diff --git a/._01Readme.txt b/._01Readme.txt new file mode 100644 index 0000000..84187d6 Binary files /dev/null and b/._01Readme.txt differ diff --git a/._02QuickInstall.txt b/._02QuickInstall.txt new file mode 100644 index 0000000..4b6842b Binary files /dev/null and b/._02QuickInstall.txt differ diff --git a/._03FAQ.txt b/._03FAQ.txt new file mode 100644 index 0000000..19b9cee Binary files /dev/null and b/._03FAQ.txt differ diff --git a/._04Windows64bit.txt b/._04Windows64bit.txt new file mode 100644 index 0000000..0987f2a Binary files /dev/null and b/._04Windows64bit.txt differ diff --git a/._05LargePage b/._05LargePage new file mode 100644 index 0000000..093c2cb Binary files /dev/null and b/._05LargePage differ diff --git a/._06WeirdPerformance b/._06WeirdPerformance new file mode 100644 index 0000000..b393adb Binary files /dev/null and b/._06WeirdPerformance differ diff --git a/._Makefile b/._Makefile new file mode 100644 index 0000000..aa495f3 Binary files /dev/null and b/._Makefile differ diff --git a/._Makefile.alpha b/._Makefile.alpha new file mode 100644 index 0000000..ac78ed7 Binary files /dev/null and b/._Makefile.alpha differ diff --git a/._Makefile.generic b/._Makefile.generic new file mode 100644 index 0000000..cdc559a Binary files /dev/null and b/._Makefile.generic differ diff --git a/._Makefile.getarch b/._Makefile.getarch new file mode 100644 index 0000000..2e56386 Binary files /dev/null and b/._Makefile.getarch differ diff --git a/._Makefile.ia64 b/._Makefile.ia64 new file mode 100644 index 0000000..737234f Binary files /dev/null and b/._Makefile.ia64 differ diff --git a/._Makefile.mips64 b/._Makefile.mips64 new file mode 100644 index 0000000..9f59505 Binary files /dev/null and b/._Makefile.mips64 differ diff --git a/._Makefile.power b/._Makefile.power new file mode 100644 index 0000000..54ba618 Binary files /dev/null and b/._Makefile.power differ diff --git a/._Makefile.rule b/._Makefile.rule new file mode 100644 index 0000000..6737618 Binary files /dev/null and b/._Makefile.rule differ diff --git a/._Makefile.sparc b/._Makefile.sparc new file mode 100644 index 0000000..b4d3ba8 Binary files /dev/null and b/._Makefile.sparc differ diff --git a/._Makefile.system b/._Makefile.system new file mode 100644 index 0000000..46f373a Binary files /dev/null and b/._Makefile.system differ diff --git a/._Makefile.tail b/._Makefile.tail new file mode 100644 index 0000000..8f34c3a Binary files /dev/null and b/._Makefile.tail differ diff --git a/._Makefile.x86 b/._Makefile.x86 new file mode 100644 index 0000000..9448428 Binary files /dev/null and b/._Makefile.x86 differ diff --git a/._Makefile.x86_64 b/._Makefile.x86_64 new file mode 100644 index 0000000..5f442d0 Binary files /dev/null and b/._Makefile.x86_64 differ diff --git a/._benchmark b/._benchmark new file mode 100755 index 0000000..b7472c5 Binary files /dev/null and b/._benchmark differ diff --git a/._c_check b/._c_check new file mode 100755 index 0000000..5c1e21d Binary files /dev/null and b/._c_check differ diff --git a/._cblas.h b/._cblas.h new file mode 100644 index 0000000..8958aa9 Binary files /dev/null and b/._cblas.h differ diff --git a/._common.h b/._common.h new file mode 100644 index 0000000..6403ec8 Binary files /dev/null and b/._common.h differ diff --git a/._common_alpha.h b/._common_alpha.h new file mode 100644 index 0000000..79c0495 Binary files /dev/null and b/._common_alpha.h differ diff --git a/._common_c.h b/._common_c.h new file mode 100644 index 0000000..931ae85 Binary files /dev/null and b/._common_c.h differ diff --git a/._common_d.h b/._common_d.h new file mode 100644 index 0000000..dd2d46c Binary files /dev/null and b/._common_d.h differ diff --git a/._common_ia64.h b/._common_ia64.h new file mode 100644 index 0000000..f25ad6c Binary files /dev/null and b/._common_ia64.h differ diff --git a/._common_interface.h b/._common_interface.h new file mode 100644 index 0000000..2b596c1 Binary files /dev/null and b/._common_interface.h differ diff --git a/._common_lapack.h b/._common_lapack.h new file mode 100644 index 0000000..7491bce Binary files /dev/null and b/._common_lapack.h differ diff --git a/._common_level1.h b/._common_level1.h new file mode 100644 index 0000000..eebc70d Binary files /dev/null and b/._common_level1.h differ diff --git a/._common_level2.h b/._common_level2.h new file mode 100644 index 0000000..651bebf Binary files /dev/null and b/._common_level2.h differ diff --git a/._common_level3.h b/._common_level3.h new file mode 100644 index 0000000..f460493 Binary files /dev/null and b/._common_level3.h differ diff --git a/._common_linux.h b/._common_linux.h new file mode 100644 index 0000000..4cb18fc Binary files /dev/null and b/._common_linux.h differ diff --git a/._common_macro.h b/._common_macro.h new file mode 100644 index 0000000..8bd18f8 Binary files /dev/null and b/._common_macro.h differ diff --git a/._common_mips64.h b/._common_mips64.h new file mode 100644 index 0000000..87ddd4b Binary files /dev/null and b/._common_mips64.h differ diff --git a/._common_param.h b/._common_param.h new file mode 100644 index 0000000..1579f0c Binary files /dev/null and b/._common_param.h differ diff --git a/._common_power.h b/._common_power.h new file mode 100644 index 0000000..536ce69 Binary files /dev/null and b/._common_power.h differ diff --git a/._common_q.h b/._common_q.h new file mode 100644 index 0000000..1df1f33 Binary files /dev/null and b/._common_q.h differ diff --git a/._common_reference.h b/._common_reference.h new file mode 100644 index 0000000..50c7f3c Binary files /dev/null and b/._common_reference.h differ diff --git a/._common_s.h b/._common_s.h new file mode 100644 index 0000000..41675df Binary files /dev/null and b/._common_s.h differ diff --git a/._common_sparc.h b/._common_sparc.h new file mode 100644 index 0000000..dfc0c68 Binary files /dev/null and b/._common_sparc.h differ diff --git a/._common_thread.h b/._common_thread.h new file mode 100644 index 0000000..9297846 Binary files /dev/null and b/._common_thread.h differ diff --git a/._common_x.h b/._common_x.h new file mode 100644 index 0000000..ef244a7 Binary files /dev/null and b/._common_x.h differ diff --git a/._common_x86.h b/._common_x86.h new file mode 100644 index 0000000..1c52699 Binary files /dev/null and b/._common_x86.h differ diff --git a/._common_x86_64.h b/._common_x86_64.h new file mode 100644 index 0000000..40b3475 Binary files /dev/null and b/._common_x86_64.h differ diff --git a/._common_z.h b/._common_z.h new file mode 100644 index 0000000..5186444 Binary files /dev/null and b/._common_z.h differ diff --git a/._cpuid.S b/._cpuid.S new file mode 100644 index 0000000..94246af Binary files /dev/null and b/._cpuid.S differ diff --git a/._cpuid.h b/._cpuid.h new file mode 100644 index 0000000..a247929 Binary files /dev/null and b/._cpuid.h differ diff --git a/._cpuid_alpha.c b/._cpuid_alpha.c new file mode 100644 index 0000000..3f7c917 Binary files /dev/null and b/._cpuid_alpha.c differ diff --git a/._cpuid_ia64.c b/._cpuid_ia64.c new file mode 100644 index 0000000..ccedaa3 Binary files /dev/null and b/._cpuid_ia64.c differ diff --git a/._cpuid_mips.c b/._cpuid_mips.c new file mode 100644 index 0000000..d9ab03d Binary files /dev/null and b/._cpuid_mips.c differ diff --git a/._cpuid_power.c b/._cpuid_power.c new file mode 100644 index 0000000..c7d9ca8 Binary files /dev/null and b/._cpuid_power.c differ diff --git a/._cpuid_sparc.c b/._cpuid_sparc.c new file mode 100644 index 0000000..057a9a7 Binary files /dev/null and b/._cpuid_sparc.c differ diff --git a/._cpuid_x86.c b/._cpuid_x86.c new file mode 100644 index 0000000..da71221 Binary files /dev/null and b/._cpuid_x86.c differ diff --git a/._ctest b/._ctest new file mode 100755 index 0000000..5bc162f Binary files /dev/null and b/._ctest differ diff --git a/._ctest.c b/._ctest.c new file mode 100644 index 0000000..a5ae166 Binary files /dev/null and b/._ctest.c differ diff --git a/._ctest1.c b/._ctest1.c new file mode 100644 index 0000000..8f31e78 Binary files /dev/null and b/._ctest1.c differ diff --git a/._ctest2.c b/._ctest2.c new file mode 100644 index 0000000..0b42eb3 Binary files /dev/null and b/._ctest2.c differ diff --git a/._driver b/._driver new file mode 100755 index 0000000..51cdbcc Binary files /dev/null and b/._driver differ diff --git a/._exports b/._exports new file mode 100755 index 0000000..a2b64e2 Binary files /dev/null and b/._exports differ diff --git a/._f_check b/._f_check new file mode 100755 index 0000000..856c981 Binary files /dev/null and b/._f_check differ diff --git a/._ftest.f b/._ftest.f new file mode 100644 index 0000000..7e43bbb Binary files /dev/null and b/._ftest.f differ diff --git a/._ftest2.f b/._ftest2.f new file mode 100644 index 0000000..01b2200 Binary files /dev/null and b/._ftest2.f differ diff --git a/._getarch.c b/._getarch.c new file mode 100644 index 0000000..e8ee3aa Binary files /dev/null and b/._getarch.c differ diff --git a/._getarch_2nd.c b/._getarch_2nd.c new file mode 100644 index 0000000..def3860 Binary files /dev/null and b/._getarch_2nd.c differ diff --git a/._interface b/._interface new file mode 100755 index 0000000..b637914 Binary files /dev/null and b/._interface differ diff --git a/._kernel b/._kernel new file mode 100755 index 0000000..dd7835e Binary files /dev/null and b/._kernel differ diff --git a/._l1param.h b/._l1param.h new file mode 100644 index 0000000..d07cf87 Binary files /dev/null and b/._l1param.h differ diff --git a/._l2param.h b/._l2param.h new file mode 100644 index 0000000..beb24fd Binary files /dev/null and b/._l2param.h differ diff --git a/._lapack b/._lapack new file mode 100755 index 0000000..444966d Binary files /dev/null and b/._lapack differ diff --git a/._make.inc b/._make.inc new file mode 100644 index 0000000..6bfd404 Binary files /dev/null and b/._make.inc differ diff --git a/._param.h b/._param.h new file mode 100644 index 0000000..662c2bd Binary files /dev/null and b/._param.h differ diff --git a/._patch.for_lapack-3.1.1 b/._patch.for_lapack-3.1.1 new file mode 100644 index 0000000..87c3fba Binary files /dev/null and b/._patch.for_lapack-3.1.1 differ diff --git a/._quickbuild.32bit b/._quickbuild.32bit new file mode 100755 index 0000000..43d7df1 Binary files /dev/null and b/._quickbuild.32bit differ diff --git a/._quickbuild.64bit b/._quickbuild.64bit new file mode 100755 index 0000000..1ca2159 Binary files /dev/null and b/._quickbuild.64bit differ diff --git a/._quickbuild.win32 b/._quickbuild.win32 new file mode 100755 index 0000000..8c55327 Binary files /dev/null and b/._quickbuild.win32 differ diff --git a/._quickbuild.win64 b/._quickbuild.win64 new file mode 100755 index 0000000..492ec7c Binary files /dev/null and b/._quickbuild.win64 differ diff --git a/._reference b/._reference new file mode 100755 index 0000000..dd5673e Binary files /dev/null and b/._reference differ diff --git a/._symcopy.h b/._symcopy.h new file mode 100644 index 0000000..db29151 Binary files /dev/null and b/._symcopy.h differ diff --git a/._test b/._test new file mode 100755 index 0000000..6a3f60d Binary files /dev/null and b/._test differ diff --git a/._version.h b/._version.h new file mode 100644 index 0000000..9f078b2 Binary files /dev/null and b/._version.h differ diff --git a/00License.txt b/00License.txt new file mode 100644 index 0000000..56a0f74 --- /dev/null +++ b/00License.txt @@ -0,0 +1,32 @@ + +Copyright 2009, 2010 The University of Texas at Austin. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + +THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT AUSTIN ``AS IS'' +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT +AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The views and conclusions contained in the software and documentation +are those of the authors and should not be interpreted as representing +official policies, either expressed or implied, of The University of +Texas at Austin. diff --git a/01Readme.txt b/01Readme.txt new file mode 100644 index 0000000..fdde1e3 --- /dev/null +++ b/01Readme.txt @@ -0,0 +1,93 @@ + Optimized GotoBLAS2 libraries version 1.13 + + By Kazushige Goto + +# This is the last update and done on 5th Feb. 2010. + +0. License + + See 00TACC_Research_License.txt. + +1. Supported OS + + Linux + FreeBSD(Also it may work on NetBSD) + OSX + Soralis + Windows 2k, XP, Server 2003 and 2008(both 32bit and 64bit) + AIX + Tru64 UNIX + +2. Supported Architecture + + X86 : Pentium3 Katmai + Coppermine + Athlon (not well optimized, though) + PentiumM Banias, Yonah + Pentium4 Northwood + Nocona (Prescott) + Core 2 Woodcrest + Core 2 Penryn + Nehalem-EP Corei{3,5,7} + Atom + AMD Opteron + AMD Barlcelona, Shanghai, Istanbul + VIA NANO + + X86_64: Pentium4 Nocona + Core 2 Woodcrest + Core 2 Penryn + Nehalem + Atom + AMD Opteron + AMD Barlcelona, Shanghai, Istanbul + VIA NANO + + IA64 : Itanium2 + + Alpha : EV4, EV5, EV6 + + POWER : POWER4 + PPC970/PPC970FX + PPC970MP + CELL (PPU only) + POWER5 + PPC440 (QCDOC) + PPC440FP2(BG/L) + POWERPC G4(PPC7450) + POWER6 + + SPARC : SPARC IV + SPARC VI, VII (Fujitsu chip) + + MIPS64/32: Sicortex + +3. Supported compiler + + C compiler : GNU CC + Cygwin, MinGW + Other commercial compiler(especially for x86/x86_64) + + Fortran Compiler : GNU G77, GFORTRAN + G95 + Open64 + Compaq + F2C + IBM + Intel + PathScale + PGI + SUN + Fujitsu + +4. Suported precision + + Now x86/x86_64 version support 80bit FP precision in addition to +normal double presicion and single precision. Currently only +gfortran supports 80bit FP with "REAL*10". + + +5. How to build library? + + Please see 02QuickInstall.txt or just type "make". + diff --git a/02QuickInstall.txt b/02QuickInstall.txt new file mode 100644 index 0000000..abf3807 --- /dev/null +++ b/02QuickInstall.txt @@ -0,0 +1,118 @@ + Quick installation for GotoBLAS2 + +*************************************************************************** +*************************************************************************** +** ** +** ** +** Just type "make" <>. ** +** ** +** If you're not satisfied with this library, ** +** please read following instruction and customize it. ** +** ** +** ** +*************************************************************************** +*************************************************************************** + + +1. REALLY REALLY quick way to build library + + Type "make" or "gmake". + + $shell> make + + The script will detect Fortran compiler, number of cores and + architecture which you're using. If default gcc binary type is + 64bit, 64 bit library will be created. Otherwise 32 bit library + will be created. + + After finishing compile, you'll find various information about + generated library. + + =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= + + GotoBLAS2 build complete. + + OS ... Linux + Architecture ... x86_64 + BINARY ... 64bit + C compiler ... GCC (command line : gcc) + Fortran compiler ... PATHSCALE (command line : pathf90) + Library Name ... libgoto_barcelonap-r1.27.a (Multi threaded; Max + num-threads is 16) + + =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= + + +2. Specifying 32bit or 64bit library + + If you need 32bit binary, + + $shell> make BINARY=32 + + If you need 64bit binary, + + $shell> make BINARY=64 + + +3. Specifying target architecture + + If you need library for different architecture, you can use TARGET + option. You can find current available options in top of getarch.c. + For example, if you need library for Intel core2 architecture, + you'll find FORCE_CORE2 option in getarch.c. Therefore you can + specify TARGET=CORE2 (get rid of FORCE_) with make. + + $shell> make TARGET=CORE2 + + Also if you want GotoBLAS2 to support multiple architecture, + + $shell> make DYNAMIC_ARCH=1 + + All kernel will be included in the library and dynamically switched + the best architecutre at run time. + + +4. Specifying for enabling multi-threaded + + Script will detect number of cores and will enable multi threaded + library if number of cores is more than two. If you still want to + create single threaded library, + + $shell> make USE_THREAD=0 + + Or if you need threaded library by force, + + $shell> make USE_THREAD=1 + + +5. Specifying target OS + + Target architecture will be determined by the CC. If you + specify cross compiler for MIPS, you can create library for + MIPS architecture. + + $shell> make CC=mips64el-linux-gcc TARGET=SICORTEX + + Or you can specify your favorite C compiler with absolute path. + + $shell> make CC=/opt/intel/cc/32/10.0.026/bin/icc TARGET=BARCELONA + + Binary type (32bit/64bit) is determined by checking CC, you + can control binary type with this option. + + $shell> make CC="pathcc -m32" + + In this case, 32bit library will be created. + + +6. Specifying Fortran compiler + + If you need to support other Fortran compiler, you can specify with + FC option. + + $shell> make FC=gfortran + + +7. Other useful options + + You'll find other useful options in Makefile.rule. diff --git a/03FAQ.txt b/03FAQ.txt new file mode 100644 index 0000000..b6033fe --- /dev/null +++ b/03FAQ.txt @@ -0,0 +1,119 @@ + GotoBLAS2 FAQ + +1. General + +1.1 Q Can I find useful paper about GotoBLAS2? + + A You may check following URL. + + http://www.cs.utexas.edu/users/flame/Publications/index.htm + + 11. Kazushige Goto and Robert A. van de Geijn, " Anatomy of + High-Performance Matrix Multiplication," ACM Transactions on + Mathematical Software, accepted. + + 15. Kazushige Goto and Robert van de Geijn, "High-Performance + Implementation of the Level-3 BLAS." ACM Transactions on + Mathematical Software, submitted. + + +1.2 Q Does GotoBLAS2 work with Hyperthread (SMT)? + + A Yes, it will work. GotoBLAS2 detects Hyperthread and + avoid scheduling on the same core. + + +1.3 Q When I type "make", following error occured. What's wrong? + + $shell> make + "./Makefile.rule", line 58: Missing dependency operator + "./Makefile.rule", line 61: Need an operator + ... + + A This error occurs because you didn't use GNU make. Some binary + packages install GNU make as "gmake" and it's worth to try. + + +1.4 Q Function "xxx" is slow. Why? + + A Generally GotoBLAS2 has many well optimized functions, but it's + far and far from perfect. Especially Level 1/2 function + performance depends on how you call BLAS. You should understand + what happends between your function and GotoBLAS2 by using profile + enabled version or hardware performance counter. Again, please + don't regard GotoBLAS2 as a black box. + + +1.5 Q I have a commercial C compiler and want to compile GotoBLAS2 with + it. Is it possible? + + A All function that affects performance is written in assembler + and C code is just used for wrapper of assembler functions or + complicated functions. Also I use many inline assembler functions, + unfortunately most of commercial compiler can't handle inline + assembler. Therefore you should use gcc. + + +1.6 Q I use OpenMP compiler. How can I use GotoBLAS2 with it? + + A Please understand that OpenMP is a compromised method to use + thread. If you want to use OpenMP based code with GotoBLAS2, you + should enable "USE_OPENMP=1" in Makefile.rule. + + +1.7 Q Could you tell me how to use profiled library? + + A You need to build and link your application with -pg + option. After executing your application, "gmon.out" is + generated in your current directory. + + $shell> gprof gmon.out + + Each sample counts as 0.01 seconds. + % cumulative self self total + time seconds seconds calls Ks/call Ks/call name + 89.86 975.02 975.02 79317 0.00 0.00 .dgemm_kernel + 4.19 1020.47 45.45 40 0.00 0.00 .dlaswp00N + 2.28 1045.16 24.69 2539 0.00 0.00 .dtrsm_kernel_LT + 1.19 1058.03 12.87 79317 0.00 0.00 .dgemm_otcopy + 1.05 1069.40 11.37 4999 0.00 0.00 .dgemm_oncopy + .... + + I think profiled BLAS library is really useful for your + research. Please find bottleneck of your application and + improve it. + +1.8 Q Is number of thread limited? + + A Basically, there is no limitation about number of threads. You + can specify number of threads as many as you want, but larger + number of threads will consume extra resource. I recommend you to + specify minimum number of threads. + + +2. Architecture Specific issue or Implementation + +2.1 Q GotoBLAS2 seems to support any combination with OS and + architecture. Is it possible? + + A Combination is limited by current OS and architecture. For + examble, the combination OSX with SPARC is impossible. But it + will be possible with slight modification if these combination + appears in front of us. + + +2.2 Q I have POWER architecture systems. Do I need extra work? + + A Although POWER architecture defined special instruction + like CPUID to detect correct architecture, it's privileged + and can't be accessed by user process. So you have to set + the architecture that you have manually in getarch.c. + + +2.3 Q I can't create DLL on Cygwin (Error 53). What's wrong? + + A You have to make sure if lib.exe and mspdb80.dll are in Microsoft + Studio PATH. The easiest way is to use 'which' command. + + $shell> which lib.exe + /cygdrive/c/Program Files/Microsoft Visual Studio/VC98/bin/lib.exe diff --git a/04Windows64bit.txt b/04Windows64bit.txt new file mode 100644 index 0000000..c9b8fc3 --- /dev/null +++ b/04Windows64bit.txt @@ -0,0 +1,13 @@ + +Quick guide to build library for Windows 64bit. + +1. What you need + + a. Windows Server 2003 or later + b. Cygwin environment(make, gcc, g77, perl, sed, wget) + c. MinGW64 compiler + d. Microsoft Visual Studio (lib.exe and mspdb80.dll are required to create dll) + +2. Do ./quickbuild.win64 + +Good luck diff --git a/05LargePage b/05LargePage new file mode 100644 index 0000000..fb7de6b --- /dev/null +++ b/05LargePage @@ -0,0 +1,53 @@ + To enhance perfomance, I'd recommend you to enable large page on + your OS (root account is required). + + A) Linux + + x86 32bit ... (number of core) * 4 pages + x86 64bit ... (number of core) * 8 pages + POWER 32/64bit ... (number of core) * 1 pages + + If you want to allocate 64 large pages, + + $shell> echo 0 > /pros/sys/vm/nr_hugepages # need to be reset + $shell> echo 65 > /pros/sys/vm/nr_hugepages # add 1 extra page + $shell> echo 3355443200 > /pros/sys/kernel/shmmax # just large number + $shell> echo 3355443200 > /pros/sys/kernel/shmall + + Also may add a few lines into /etc/security/limits.conf file. + + * hard memlock unlimited + * soft memlock unlimited + + Then restart sshd (/etc/init.d/sshd restart). + + B) Solaris + + You don't have to set up. + + C) Windows (Windows Server 2003 or later, XP 64bit) + + You have to assign memory lock operation to your account. + + Control Panel -> Administrative Tools -> Local Security Policy -> + Local Policies -> User Rights Assignment -> Lock pages in memory + + D) AIX + + Ask your administrator + + E) Tru64 UNIX + + Assign shared memory at boot time. + + F) Other aarchitecture which doesn't have Large TLB enhancement + + If you have root permission, please install device driver which + located in drivers/mapper. + + $shell> cd drivers/mapper + $shell> make + $shell> insmod mapper.ko + $shell> ./device_setup + + Then enable DEVICEDRIVER_ALLOCATION = 1 in Makefile.rule. diff --git a/06WeirdPerformance b/06WeirdPerformance new file mode 100644 index 0000000..8046267 --- /dev/null +++ b/06WeirdPerformance @@ -0,0 +1,22 @@ + Weird Performance + +1. If you see serious performance loss (extremely low performance), + probably you created too many threads or process. Basically GotoBLAS + assumes that available cores that you specify are exclusively for + BLAS computation. Even one small thread/process conflicts with BLAS + threads, performance will become worse. + + The best solution is to reduce your number of threads or insert + some synchronization mechanism and suspend your threads until BLAS + operation is finished. + + +2. Simlar problem may happen under virtual machine. If supervisor + allocates different cores for each scheduling, BLAS performnace + will be bad. This is because BLAS also utilizes all cache, + unexpected re-schedule for different core may result of heavy + performance loss. + + +Anyway, if you see any weird performance loss, it means your code or +algorithm is not optimal. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..c0cfc6b --- /dev/null +++ b/Makefile @@ -0,0 +1,230 @@ +TOPDIR = . +include ./Makefile.system + +BLASDIRS = interface driver/level2 driver/level3 driver/others + +ifndef DYNAMIC_ARCH +BLASDIRS += kernel +endif + +ifdef SANITY_CHECK +BLASDIRS += reference +endif + +SUBDIRS = $(BLASDIRS) lapack + +SUBDIRS_ALL = $(SUBDIRS) test ctest exports benchmark ../laswp ../bench + +.PHONY : all libs netlib test ctest shared +.NOTPARALLEL : all libs prof lapack-test + +all :: libs netlib tests shared + @echo + @echo " GotoBLAS build complete." + @echo + @echo " OS ... $(OSNAME) " + @echo " Architecture ... $(ARCH) " +ifndef BINARY64 + @echo " BINARY ... 32bit " +else + @echo " BINARY ... 64bit " +endif + @echo " C compiler ... $(C_COMPILER) (command line : $(CC))" + @echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))" +ifneq ($(OSNAME), AIX) + @echo -n " Library Name ... $(LIBNAME)" +else + @echo " Library Name ... $(LIBNAME)" +endif + +ifndef SMP + @echo " (Single threaded) " +else + @echo " (Multi threaded; Max num-threads is $(NUM_THREADS))" +endif + @echo + +shared : +ifeq ($(OSNAME), Linux) + $(MAKE) -C exports so + -ln -fs $(LIBSONAME) libgoto2.so +endif +ifeq ($(OSNAME), FreeBSD) + $(MAKE) -C exports so + -ln -fs $(LIBSONAME) libgoto2.so +endif +ifeq ($(OSNAME), NetBSD) + $(MAKE) -C exports so + -ln -fs $(LIBSONAME) libgoto2.so +endif +ifeq ($(OSNAME), Darwin) + $(MAKE) -C exports dyn + -ln -fs $(LIBDYNNAME) libgoto2.dylib +endif +ifeq ($(OSNAME), WINNT) + $(MAKE) -C exports dll +# -ln -fs $(LIBDLLNAME) libgoto2.dll +endif +ifeq ($(OSNAME), CYGWIN_NT) + $(MAKE) -C exports dll + -ln -fs $(LIBDLLNAME) libgoto2.dll +endif + +tests : +ifndef NOFORTRAN +ifndef TARGET +ifndef CROSS + touch $(LIBNAME) +ifndef NO_FBLAS + $(MAKE) -C test all +endif +ifndef NO_CBLAS + $(MAKE) -C ctest all +endif +endif +endif +endif + +libs : + -ln -fs $(LIBNAME) libgoto2.$(LIBSUFFIX) + for d in $(SUBDIRS) ; \ + do if test -d $$d; then \ + $(MAKE) -C $$d $(@F) || exit 1 ; \ + fi; \ + done +ifdef DYNAMIC_ARCH + $(MAKE) -C kernel commonlibs || exit 1 + for d in $(DYNAMIC_CORE) ; \ + do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ + done +endif + +prof : prof_blas prof_lapack + +prof_blas : + ln -fs $(LIBNAME_P) libgoto2_p.$(LIBSUFFIX) + for d in $(SUBDIRS) ; \ + do if test -d $$d; then \ + $(MAKE) -C $$d prof || exit 1 ; \ + fi; \ + done +ifdef DYNAMIC_ARCH + $(MAKE) -C kernel commonprof || exit 1 +endif + +blas : + ln -fs $(LIBNAME) libgoto2.$(LIBSUFFIX) + for d in $(BLASDIRS) ; \ + do if test -d $$d; then \ + $(MAKE) -C $$d libs || exit 1 ; \ + fi; \ + done + +hpl : + ln -fs $(LIBNAME) libgoto2.$(LIBSUFFIX) + for d in $(BLASDIRS) ../laswp exports ; \ + do if test -d $$d; then \ + $(MAKE) -C $$d $(@F) || exit 1 ; \ + fi; \ + done +ifdef DYNAMIC_ARCH + $(MAKE) -C kernel commonlibs || exit 1 + for d in $(DYNAMIC_CORE) ; \ + do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ + done +endif + +hpl_p : + ln -fs $(LIBNAME_P) libgoto2_p.$(LIBSUFFIX) + for d in $(SUBDIRS) ../laswp exports ; \ + do if test -d $$d; then \ + $(MAKE) -C $$d $(@F) || exit 1 ; \ + fi; \ + done + +netlib : lapack-3.1.1 patch.for_lapack-3.1.1 lapack-3.1.1/make.inc +ifndef NOFORTRAN + -@$(MAKE) -C lapack-3.1.1 lapacklib +endif + +prof_lapack : lapack-3.1.1 lapack-3.1.1/make.inc + -@$(MAKE) -C lapack-3.1.1 lapack_prof + +lapack-3.1.1/make.inc : +ifndef NOFORTRAN + -@echo "FORTRAN = $(FC)" > lapack-3.1.1/make.inc + -@echo "OPTS = $(FFLAGS)" >> lapack-3.1.1/make.inc + -@echo "POPTS = $(FPFLAGS)" >> lapack-3.1.1/make.inc + -@echo "NOOPT = $(FFLAGS) -O0" >> lapack-3.1.1/make.inc + -@echo "PNOOPT = $(FPFLAGS) -O0" >> lapack-3.1.1/make.inc + -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> lapack-3.1.1/make.inc + -@echo "ARCH = $(AR)" >> lapack-3.1.1/make.inc + -@echo "RANLIB = $(RANLIB)" >> lapack-3.1.1/make.inc + -@echo "LAPACKLIB = ../$(LIBNAME)" >> lapack-3.1.1/make.inc + -@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> lapack-3.1.1/make.inc + -@echo "SUFFIX = $(SUFFIX)" >> lapack-3.1.1/make.inc + -@echo "PSUFFIX = $(PSUFFIX)" >> lapack-3.1.1/make.inc +# -@echo "CEXTRALIB = $(CEXTRALIB)" >> lapack-3.1.1/make.inc + -@cat make.inc >> lapack-3.1.1/make.inc +endif + +lapack-3.1.1 : lapack-3.1.1.tgz +ifndef NOFORTRAN + @if test `$(MD5SUM) lapack-3.1.1.tgz | $(AWK) '{print $$1}'` = 00b21551a899bcfbaa7b8443e1faeef9; then \ + echo $(TAR) zxf $< ;\ + $(TAR) zxf $< && (cd lapack-3.1.1; $(PATCH) -p1 < ../patch.for_lapack-3.1.1) ;\ + else \ + echo " lapack-3.1.1.tgz check sum is wrong (Please use orignal)." ;\ + rm -rf lapack-3.1.1 ;\ + fi +endif + +lapack-3.1.1.tgz : +ifndef NOFORTRAN + -wget http://www.netlib.org/lapack/lapack-3.1.1.tgz +endif + +large.tgz : +ifndef NOFORTRAN + -wget http://www.netlib.org/lapack/timing/large.tgz +endif + +timing.tgz : +ifndef NOFORTRAN + -wget http://www.netlib.org/lapack/timing/timing.tgz +endif + +lapack-timing : lapack-3.1.1 large.tgz timing.tgz +ifndef NOFORTRAN + (cd lapack-3.1.1; $(TAR) zxf ../timing.tgz TIMING) + (cd lapack-3.1.1/TIMING; $(TAR) zxf ../../large.tgz ) + make -C lapack-3.1.1 tmglib + make -C lapack-3.1.1/TIMING +endif + + +lapack-test : + $(MAKE) -C lapack-3.1.1 tmglib + $(MAKE) -C lapack-3.1.1/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintsts xlintstz xlintstzc + @rm -f lapack-3.1.1/TESTING/*.out + $(MAKE) -j 1 -C lapack-3.1.1/TESTING + $(GREP) failed lapack-3.1.1/TESTING/*.out + +dummy : + +clean :: + @for d in $(SUBDIRS_ALL) ; \ + do if test -d $$d; then \ + $(MAKE) -C $$d $(@F) || exit 1 ; \ + fi; \ + done +ifdef DYNAMIC_ARCH + @$(MAKE) -C kernel clean +endif + @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf libgoto2.$(LIBSUFFIX) libgoto2_p.$(LIBSUFFIX) *.lnk myconfig.h + @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib + @if test -d lapack-3.1.1; then \ + echo deleting lapack-3.1.1; \ + rm -rf lapack-3.1.1 ;\ + fi + @echo Done. \ No newline at end of file diff --git a/Makefile.alpha b/Makefile.alpha new file mode 100644 index 0000000..2305483 --- /dev/null +++ b/Makefile.alpha @@ -0,0 +1,57 @@ +CPP = $(CC) -E +RANLIB = ranlib + +ifeq ($(LIBSUBARCH), EV4) +LIBNAME = $(LIBPREFIX)_ev4.a +LIBNAME_P = $(LIBPREFIX)_ev4_p.a +endif + +ifeq ($(LIBSUBARCH), EV5) +LIBNAME = $(LIBPREFIX)_ev5.a +LIBNAME_P = $(LIBPREFIX)_ev5_p.a +endif + +ifeq ($(LIBSUBARCH), EV6) +LIBNAME = $(LIBPREFIX)_ev6.a +LIBNAME_P = $(LIBPREFIX)_ev6_p.a +endif + +ifneq ($(COMPILER), NATIVE) +# GCC User +ifeq ($(LIBSUBARCH), EV4) +OPTION += -DEV4 -mcpu=ev4 +endif +ifeq ($(LIBSUBARCH), EV5) +OPTION += -DEV5 -mcpu=ev5 +endif +ifeq ($(LIBSUBARCH), EV6) +OPTION += -DEV6 -mcpu=ev6 +endif +else +# Compaq Compiler User +ifeq ($(LIBSUBARCH), EV4) +OPTION += -DEV4 -tune ev4 -arch ev4 +endif +ifeq ($(LIBSUBARCH), EV5) +OPTION += -DEV5 -tune ev5 -arch ev5 +endif +ifeq ($(LIBSUBARCH), EV6) +OPTION += -DEV6 -tune ev6 -arch ev6 +endif +endif + +ifeq ($(F_COMPILER), GFORTRAN) +FCOMMON_OPT += -mieee +endif + +ifeq ($(F_COMPILER), G77) +FCOMMON_OPT += -mieee +endif + +ifndef SMP +LIBCXML = -lcxml -lots -lm +LIBATLAS = -L/usr/lib/atlas3.7.8 -lf77blas -latlas -lm +else +LIBCXML = -lcxmlp -lots -lm +LIBATLAS = -L/usr/lib/atlas3.7.8p -llapack -lptcblas -lptf77blas -latlas -lpthread -lm +endif diff --git a/Makefile.generic b/Makefile.generic new file mode 100644 index 0000000..770aaf8 --- /dev/null +++ b/Makefile.generic @@ -0,0 +1,6 @@ +COPT = -Wall -O2 # -DGEMMTEST +ifdef BINARY64 +else +# LDFLAGS = -m elf32ppc +LDFLAGS = -m elf_i386 +endif diff --git a/Makefile.getarch b/Makefile.getarch new file mode 100644 index 0000000..dadfb5b --- /dev/null +++ b/Makefile.getarch @@ -0,0 +1,39 @@ +export BINARY +export USE_OPENMP + +ifdef TARGET_CORE +TARGET_MAKE = Makefile_kernel.conf +TARGET_CONF = config_kernel.h +else +TARGET_MAKE = Makefile.conf +TARGET_CONF = config.h +endif + +# CPUIDEMU = ../../cpuid/table.o + +ifdef CPUIDEMU +EXFLAGS = -DCPUIDEMU -DVENDOR=99 +endif + +all: getarch_2nd + ./getarch_2nd 0 >> $(TARGET_MAKE) + ./getarch_2nd 1 >> $(TARGET_CONF) + +config.h : c_check f_check getarch + perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) + perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC) + ./getarch 0 >> $(TARGET_MAKE) + ./getarch 1 >> $(TARGET_CONF) + + +getarch : getarch.c cpuid.S dummy $(CPUIDEMU) + $(HOSTCC) $(CFLAGS) $(EXFLAGS) -o $(@F) getarch.c cpuid.S $(CPUIDEMU) + +getarch_2nd : getarch_2nd.c config.h dummy +ifndef TARGET_CORE + $(HOSTCC) -I. $(CFLAGS) -o $(@F) getarch_2nd.c +else + $(HOSTCC) -I. $(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c +endif + +dummy: diff --git a/Makefile.ia64 b/Makefile.ia64 new file mode 100644 index 0000000..7ffcd1d --- /dev/null +++ b/Makefile.ia64 @@ -0,0 +1,22 @@ +CCOMMON_COPT += # -DUSE64BITINT # -DGEMMTEST + +# CCOMMON_OPT += -DPARAMTEST +FLAMEPATH = $(HOME)/flame/lib/ia64 + +ifndef SMP +LIBMKL = -L$(MKLPATH)/64 -Wl,-rpath,$(MKLPATH)/64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lguide -lpthread -lm +else +LIBMKL = -L$(MKLPATH)/64 -Wl,-rpath,$(MKLPATH)/64 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lguide -lpthread -lm +endif + +LIBFLAME = -L$(FLAMEPATH) -llapack2flame -lflame $(TOPDIR)/$(LIBNAME) -lgfortran -lpthread -lm + +LIBMLIB = ../../level1/others/libmisc.a -L/opt/intel/fc/ia64/9.1.040/lib -L/opt/mlib/lib \ + -llapack -lguide -lifcore -lm -lpthread +LIBSCSL = -L/opt/scsl/1.4.1.0/lib -Wl,-rpath,/opt/scsl/1.4.1.0/lib -lscs + +ifndef SMP +LIBATLAS = -L/usr/lib/atlas3.6.0 -lf77blas -latlas -lm +else +LIBATLAS = -L$(HOME)/misc/lib -L/usr/lib/atlas3.6.0p -llapack -lptcblas -lptf77blas -latlas -lpthread -lm +endif diff --git a/Makefile.mips64 b/Makefile.mips64 new file mode 100644 index 0000000..05ea9c6 --- /dev/null +++ b/Makefile.mips64 @@ -0,0 +1,3 @@ +ifdef BINARY64 +else +endif diff --git a/Makefile.power b/Makefile.power new file mode 100644 index 0000000..35eb2cb --- /dev/null +++ b/Makefile.power @@ -0,0 +1,93 @@ +# CCOMMON_OPT += -DALLOC_SHM + +FLAMEPATH = $(HOME)/flame/lib + +#ifeq ($(CORE), CELL) +#CELL_SDK_ROOT = /opt/IBM/cell-sdk-1.1/sysroot/usr +#SPU_CC = spu-gcc +#EXTRALIB += -lspe +#endif + +ifeq ($(OSNAME), Linux) +ifdef BINARY64 +# COMPILER_PREFIX = powerpc64-linux- +else +# COMPILER_PREFIX = powerpc-linux- +endif +endif + +ifdef BINARY64 +ifeq ($(OSNAME), Linux) +LDFLAGS = -m elf64ppc +endif + +ifeq ($(OSNAME), Darwin) +LDFLAGS = -arch ppc64 +endif + +ifeq ($(OSNAME), AIX) +CCOMMON_OPT += -mpowerpc64 -maix64 +ifeq ($(COMPILER_F77), g77) +FCOMMON_OPT += -mpowerpc64 -maix64 +endif +ifeq ($(COMPILER_F77), xlf) +FCOMMON_OPT += -q64 +endif +ARFLAGS = -X 64 +LDFLAGS = -b64 +ASFLAGS = -a64 +endif +else +ifeq ($(OSNAME), Linux) +LDFLAGS = -m elf32ppc +endif +ifeq ($(OSNAME), AIX) +CCOMMON_OPT += -Wa,-a32 +ARFLAGS = -X 32 +LDFLAGS = -b32 +ASFLAGS = -a32 +endif +endif + +# CCOMMON_OPT += -maltivec -mabi=altivec + +LIBFLAME = -L$(FLAMEPATH) -llapack2flame -lflame-lapack -lflame-base $(LIBS) + +ifeq ($(OSNAME), Darwin) +CCOMMON_OPT += -force_cpusubtype_ALL +endif + + +ifndef BINARY64 +ifeq ($(OSNAME), Linux) +ESSLPATH = -L/opt/ibmcmp/lib -L/opt/ibmcmp/xlf/11.1/lib -Wl,-rpath,/opt/ibmcmp/lib -Wl,-rpath,/opt/ibmcmp/xlf/11.1/lib -lxlf90_r -lxlomp_ser -lxlfmath -lxl -lpthread +else +ESSLPATH = -lxlf90_r +endif + + +LIBVECLIB = -framework VecLib +ifndef SMP +LIBATLAS = -L/usr/lib/atlas3.7.11 -lf77blas -latlas -lg2c -lm +LIBESSL = -lessl $(ESSLPATH) ../../level1/others/libmisc.a -lm +else +LIBATLAS = -L/usr/lib/atlas3.7.11p -lptf77blas -latlas -lm -lpthread +LIBESSL = -lesslsmp $(ESSLPATH) ../../level1/others/libmisc.a -lm +endif +else +ifeq ($(OSNAME), Linux) +ESSLPATH = -L/opt/ibmcmp/lib64 -Wl,-rpath,/opt/ibmcmp/lib64 -L/opt/ibmcmp/xlf/11.1/lib64 -Wl,-rpath,/opt/ibmcmp/xlf/11.1/lib64 -lxlf90_r -lxlomp_ser +else +ESSLPATH = -lxlf90_r +endif + +LIBVECLIB = /System/Library/Frameworks/vecLib.framework/Versions/Current/vecLib + +ifndef SMP +LIBATLAS = -L/usr/lib64/atlas3.7.11 -lf77blas -latlas -lg2c -lm +LIBESSL = -lessl $(ESSLPATH) -lm +else +LIBATLAS = -L/usr/lib64/atlas3.7.11p -lptf77blas -latlas -lm -lpthread +LIBESSL = -lesslsmp $(ESSLPATH) -lxlsmp -lm +endif +endif diff --git a/Makefile.rule b/Makefile.rule new file mode 100644 index 0000000..8be5515 --- /dev/null +++ b/Makefile.rule @@ -0,0 +1,95 @@ +# +# Beginning of user configuration +# + +# This library's version +VERSION = 1.13 + +# You can specify the target architecture, otherwise it's +# automatically detected. +# TARGET = PENRYN + +# If you want to support multiple architecture in one binary +# DYNAMIC_ARCH = 1 + +# C compiler including binary type(32bit / 64bit). Default is gcc. +# Don't use Intel Compiler or PGI, it won't generate right codes as I expect. +# CC = gcc + +# Fortran compiler. Default is g77. +# FC = gfortran + +# Even you can specify cross compiler +# CC = x86_64-w64-mingw32-gcc +# FC = x86_64-w64-mingw32-gfortran + +# If you need 32bit binary, define BINARY=32, otherwise define BINARY=64 +# BINARY=64 + +# About threaded BLAS. It will be automatically detected if you don't +# specify it. +# For force setting for single threaded, specify USE_THREAD = 0 +# For force setting for multi threaded, specify USE_THREAD = 1 +# USE_THREAD = 0 + +# If you're going to use this library with OpenMP, please comment it in. +# USE_OPENMP = 1 + +# You can define maximum number of threads. Basically it should be +# less than actual number of cores. If you don't specify one, it's +# automatically detected by the the script. +# NUM_THREADS = 24 + +# If you don't need CBLAS interface, please comment it in. +# NO_CBLAS = 1 + +# If you want to use legacy threaded Level 3 implementation. +# USE_SIMPLE_THREADED_LEVEL3 = 1 + +# If you want to drive whole 64bit region by BLAS. Not all Fortran +# compiler supports this. It's safe to keep comment it out if you +# are not sure(equivalent to "-i8" option). +# INTERFACE64 = 1 + +# Unfortunately most of kernel won't give us high quality buffer. +# BLAS tries to find the best region before entering main function, +# but it will consume time. If you don't like it, you can disable one. +# NO_WARMUP = 1 + +# If you want to disable CPU/Memory affinity on Linux. +# NO_AFFINITY = 1 + +# If you would like to know minute performance report of GotoBLAS. +# FUNCTION_PROFILE = 1 + +# Support for IEEE quad precision(it's *real* REAL*16)( under testing) +# QUAD_PRECISION = 1 + +# Theads are still working for a while after finishing BLAS operation +# to reduce thread activate/deactivate overhead. You can determine +# time out to improve performance. This number should be from 4 to 30 +# which corresponds to (1 << n) cycles. For example, if you set to 26, +# thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz +# system). Also you can control this mumber by GOTO_THREAD_TIMEOUT +# CCOMMON_OPT += -DTHREAD_TIMEOUT=26 + +# Using special device driver for mapping physically contigous memory +# to the user space. If bigphysarea is enabled, it will use it. +# DEVICEDRIVER_ALLOCATION = 1 + +# If you need to synchronize FP CSR between threads (for x86/x86_64 only). +# CONSISTENT_FPCSR = 1 + +# If you need santy check by comparing reference BLAS. It'll be very +# slow (Not implemented yet). +# SANITY_CHECK = 1 + +# Common Optimization Flag; -O2 is enough. +COMMON_OPT += -O2 + +# Profiling flags +COMMON_PROF = -pg + +# +# End of user configuration +# diff --git a/Makefile.sparc b/Makefile.sparc new file mode 100644 index 0000000..c2b878e --- /dev/null +++ b/Makefile.sparc @@ -0,0 +1,41 @@ +CPP = $(CC) -E +RANLIB = ranlib + +ifdef BINARY64 + +CCOMMON_OPT += -mcpu=v9 -m64 +ifeq ($(COMPILER_F77), g77) +FCOMMON_OPT += -mcpu=v9 -m64 +endif +ifeq ($(COMPILER_F77), f90) +FCOMMON_OPT += -xarch=v9 +endif +LDFLAGS = -64 +else + +CCOMMON_OPT += -mcpu=v9 + +ifeq ($(COMPILER_F77), g77) +FCOMMON_OPT += -mcpu=v9 +endif +ifeq ($(COMPILER_F77), f90) +FCOMMON_OPT += -xarch=v8plusb +endif + +endif + +LIBNAME = $(LIBPREFIX).a + +ifndef SMP +LIBCXML = -L/opt/SUNWspro/lib/v9 +LIBATLAS = -L$(HOME)/misc/lib -lf77blas -latlas -lm +else +LIBCXML = -lcxmlp -lots -lm +endif +ifdef BINARY64 +LIBSUNPERF = -L/opt/SUNWspro/lib/v9 -L/opt/SUNWspro/prod/lib/v9 \ + -Wl,-R,/opt/SUNWspro/lib/v9 -lsunperf -lompstubs -lfui -lfsu -lsunmath +else +LIBSUNPERF = -L/opt/SUNWspro/lib -L/opt/SUNWspro/prod/lib \ + -Wl,-R,/opt/SUNWspro/lib -lsunperf -lompstubs -lfui -lfsu -lsunmath +endif \ No newline at end of file diff --git a/Makefile.system b/Makefile.system new file mode 100644 index 0000000..cece535 --- /dev/null +++ b/Makefile.system @@ -0,0 +1,753 @@ +# +# Include user definition +# + +# TO suppress recursive includes +INCLUDED = 1 + +ifndef TOPDIR +TOPDIR = . +endif + +# Default C compiler +CC = gcc + +ifndef MAKEFILE_RULE +include $(TOPDIR)/Makefile.rule +else +include $(TOPDIR)/$(MAKEFILE_RULE) +endif + +# +# Beginning of system configuration +# + +ifndef HOSTCC +HOSTCC = $(CC) +endif + +ifdef TARGET +GETARCH_FLAGS += -DFORCE_$(TARGET) +endif + +# This operation is expensive, so execution should be once. +ifndef GOTOBLAS_MAKEFILE +export GOTOBLAS_MAKEFILE = 1 + +# Generating Makefile.conf and config.h +DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.getarch CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS=$(GETARCH_FLAGS) BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all) + +ifndef TARGET_CORE +include $(TOPDIR)/Makefile.conf +else +include $(TOPDIR)/Makefile_kernel.conf +endif + +endif + +ifndef NUM_THREADS +NUM_THREADS = $(NUM_CORES) +endif + +ifeq ($(NUM_THREADS), 1) +override USE_THREAD = 0 +endif + +ifdef USE_THREAD +ifeq ($(USE_THREAD), 0) +SMP = +else +SMP = 1 +endif +else +ifeq ($(NUM_THREAD), 1) +SMP = +else +SMP = 1 +endif +endif + +ifndef NEED_PIC +NEED_PIC = 1 +endif + +ARFLAGS = +CPP = $(COMPILER) -E +AR = $(CROSS_SUFFIX)ar +AS = $(CROSS_SUFFIX)as +LD = $(CROSS_SUFFIX)ld +RANLIB = $(CROSS_SUFFIX)ranlib +NM = $(CROSS_SUFFIX)nm +DLLWRAP = $(CROSS_SUFFIX)dllwrap + +# +# OS dependent settings +# + +ifeq ($(OSNAME), Darwin) +EXTRALIB += -lSystemStubs +export MACOSX_DEPLOYMENT_TARGET=10.2 +endif + +ifeq ($(OSNAME), Linux) +EXTRALIB += -lm +endif + +ifeq ($(OSNAME), AIX) +EXTRALIB += -lm +endif + +ifeq ($(OSNAME), WINNT) +NEED_PIC = 0 +NO_EXPRECISION = 1 + +EXTRALIB += -defaultlib:advapi32 + +SUFFIX = obj +PSUFFIX = pobj +LIBSUFFIX = lib +endif + +ifeq ($(OSNAME), Interix) +NEED_PIC = 0 +NO_EXPRECISION = 1 + +INTERIX_TOOL_DIR = /opt/gcc.3.3/i586-pc-interix3/bin +endif + +ifeq ($(OSNAME), CYGWIN_NT) +NEED_PIC = 0 +NO_EXPRECISION = 1 +endif + +ifneq ($(OSNAME), WINNT) +ifneq ($(OSNAME), CYGWIN_NT) +ifneq ($(OSNAME), Interix) +ifdef SMP +EXTRALIB += -lpthread +endif +endif +endif +endif + +ifdef QUAD_PRECISION +CCOMMON_OPT += -DQUAD_PRECISION +NO_EXPRECISION = 1 +endif + +ifneq ($(ARCH), x86) +ifneq ($(ARCH), x86_64) +NO_EXPRECISION = 1 +endif +endif + +ifdef SANITY_CHECK +CCOMMON_OPT += -DSANITY_CHECK -DREFNAME=$(*F)f$(BU) +endif + +# +# Architecture dependent settings +# + +ifeq ($(ARCH), x86) +ifndef BINARY +NO_BINARY_MODE = 1 +endif +ifndef NO_EXPRECISION +ifeq ($(F_COMPILER), GFORTRAN) +ifeq ($(C_COMPILER), GCC) +EXPRECISION = 1 +CCOMMON_OPT += -DEXPRECISION -m128bit-long-double +FCOMMON_OPT += -m128bit-long-double +endif +endif +endif +endif + +ifeq ($(ARCH), x86_64) +ifndef NO_EXPRECISION +ifeq ($(F_COMPILER), GFORTRAN) +ifeq ($(C_COMPILER), GCC) +EXPRECISION = 1 +CCOMMON_OPT += -DEXPRECISION -m128bit-long-double +FCOMMON_OPT += -m128bit-long-double +endif +endif +endif +endif + +ifeq ($(C_COMPILER), INTEL) +CCOMMON_OPT += -wd981 +endif + +ifdef USE_OPENMP +ifeq ($(C_COMPILER), GCC) +CCOMMON_OPT += -fopenmp +endif + +ifeq ($(C_COMPILER), INTEL) +CCOMMON_OPT += -openmp +endif + +ifeq ($(C_COMPILER), PGI) +CCOMMON_OPT += -mp +endif + +ifeq ($(C_COMPILER), OPEN64) +CCOMMON_OPT += -mp +CEXTRALIB += -lstdc++ +endif + +ifeq ($(C_COMPILER), PATHSCALE) +CCOMMON_OPT += -mp +endif +endif + + +ifdef DYNAMIC_ARCH +ifeq ($(ARCH), x86) +DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ + CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA ATOM NANO +endif + +ifeq ($(ARCH), x86_64) +DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA ATOM NANO +endif + +ifndef DYNAMIC_CORE +DYNAMIC_ARCH = +endif +endif + +ifeq ($(ARCH), ia64) +NO_BINARY_MODE = 1 +BINARY_DEFINED = 1 + +ifeq ($(F_COMPILER), GFORTRAN) +ifeq ($(C_COMPILER), GCC) +# EXPRECISION = 1 +# CCOMMON_OPT += -DEXPRECISION +endif +endif +endif + +ifeq ($(ARCH), mips64) +NO_BINARY_MODE = 1 +endif + +ifeq ($(ARCH), alpha) +NO_BINARY_MODE = 1 +BINARY_DEFINED = 1 +endif + +# +# C Compiler dependent settings +# + +ifeq ($(C_COMPILER), GCC) +CCOMMON_OPT += -Wall +COMMON_PROF += -fno-inline +NO_UNINITIALIZED_WARN = -Wno-uninitialized + +ifdef NO_BINARY_MODE + +ifeq ($(ARCH), mips64) +ifdef BINARY64 +CCOMMON_OPT += -mabi=64 +else +CCOMMON_OPT += -mabi=n32 +endif +BINARY_DEFINED = 1 +endif + +ifeq ($(OSNAME), AIX) +BINARY_DEFINED = 1 +endif + +endif + +ifndef BINARY_DEFINED +ifdef BINARY64 +CCOMMON_OPT += -m64 +else +CCOMMON_OPT += -m32 +endif +endif + +endif + +ifeq ($(C_COMPILER), PGI) +ifdef BINARY64 +CCOMMON_OPT += -tp p7-64 +else +CCOMMON_OPT += -tp p7 +endif +endif + +ifeq ($(C_COMPILER), PATHSCALE) +ifdef BINARY64 +CCOMMON_OPT += -m64 +else +CCOMMON_OPT += -m32 +endif +endif + +# +# Fortran Compiler dependent settings +# + +ifeq ($(F_COMPILER), G77) +CCOMMON_OPT += -DF_INTERFACE_G77 +FCOMMON_OPT += -Wall +ifndef NO_BINARY_MODE +ifdef BINARY64 +FCOMMON_OPT += -m64 +else +FCOMMON_OPT += -m32 +endif +endif +endif + +ifeq ($(F_COMPILER), G95) +CCOMMON_OPT += -DF_INTERFACE_G95 +FCOMMON_OPT += -Wall +ifndef NO_BINARY_MODE +ifdef BINARY64 +FCOMMON_OPT += -m64 +else +FCOMMON_OPT += -m32 +endif +endif +endif + +ifeq ($(F_COMPILER), GFORTRAN) +CCOMMON_OPT += -DF_INTERFACE_GFORT +FCOMMON_OPT += -Wall +ifdef NO_BINARY_MODE +ifeq ($(ARCH), mips64) +ifdef BINARY64 +FCOMMON_OPT += -mabi=64 +else +FCOMMON_OPT += -mabi=n32 +endif +endif +else +ifdef BINARY64 +FCOMMON_OPT += -m64 +ifdef INTERFACE64 +FCOMMON_OPT += -fdefault-integer-8 +endif +else +FCOMMON_OPT += -m32 +endif +endif +ifdef USE_OPENMP +FCOMMON_OPT += -fopenmp +endif +endif + +ifeq ($(F_COMPILER), INTEL) +CCOMMON_OPT += -DF_INTERFACE_INTEL +ifdef INTERFACE64 +FCOMMON_OPT += -i8 +endif +ifdef USE_OPENMP +FCOMMON_OPT += -openmp +endif +endif + +ifeq ($(F_COMPILER), FUJITSU) +CCOMMON_OPT += -DF_INTERFACE_FUJITSU +ifdef USE_OPENMP +FCOMMON_OPT += -openmp +endif +endif + +ifeq ($(F_COMPILER), IBM) +CCOMMON_OPT += -DF_INTERFACE_IBM +# FCOMMON_OPT += -qarch=440 +ifdef BINARY64 +FCOMMON_OPT += -q64 +ifdef INTERFACE64 +FCOMMON_OPT += -qintsize=8 +endif +else +FCOMMON_OPT += -q32 +endif +ifdef USE_OPENMP +FCOMMON_OPT += -openmp +endif +endif + +ifeq ($(F_COMPILER), PGI) +CCOMMON_OPT += -DF_INTERFACE_PGI +COMMON_PROF += -DPGICOMPILER +ifdef BINARY64 +ifdef INTERFACE64 +FCOMMON_OPT += -i8 +endif +FCOMMON_OPT += -tp p7-64 +else +FCOMMON_OPT += -tp p7 +endif +ifdef USE_OPENMP +FCOMMON_OPT += -mp +endif +endif + +ifeq ($(F_COMPILER), PATHSCALE) +CCOMMON_OPT += -DF_INTERFACE_PATHSCALE +ifdef BINARY64 +ifdef INTERFACE64 +FCOMMON_OPT += -i8 +endif +endif + +ifneq ($(ARCH), mips64) +ifndef BINARY64 +FCOMMON_OPT += -m32 +else +FCOMMON_OPT += -m64 +endif +else +ifdef BINARY64 +FCOMMON_OPT += -mabi=64 +else +FCOMMON_OPT += -mabi=n32 +endif +endif + +ifdef USE_OPENMP +FCOMMON_OPT += -mp +endif +endif + +ifeq ($(F_COMPILER), OPEN64) +CCOMMON_OPT += -DF_INTERFACE_OPEN64 +ifdef BINARY64 +ifdef INTERFACE64 +FCOMMON_OPT += -i8 +endif +endif +ifndef BINARY64 +FCOMMON_OPT += -m32 +else +FCOMMON_OPT += -m64 +endif + +ifdef USE_OPENMP +FEXTRALIB += -lstdc++ +FCOMMON_OPT += -mp +endif +endif + +ifeq ($(C_COMPILER), OPEN64) +ifndef BINARY64 +CCOMMON_OPT += -m32 +else +CCOMMON_OPT += -m64 +endif +endif + +ifeq ($(C_COMPILER), SUN) +CCOMMON_OPT += -w +ifeq ($(ARCH), x86) +CCOMMON_OPT += -m32 +else +FCOMMON_OPT += -m64 +endif +endif + +ifeq ($(F_COMPILER), SUN) +CCOMMON_OPT += -DF_INTERFACE_SUN +ifeq ($(ARCH), x86) +FCOMMON_OPT += -m32 +else +FCOMMON_OPT += -m64 +endif +ifdef USE_OPENMP +FCOMMON_OPT += -xopenmp=parallel +endif +endif + +ifeq ($(F_COMPILER), COMPAQ) +CCOMMON_OPT += -DF_INTERFACE_COMPAQ +ifdef USE_OPENMP +FCOMMON_OPT += -openmp +endif +endif + +ifdef BINARY64 +ifdef INTERFACE64 +CCOMMON_OPT += -DUSE64BITINT +endif +endif + +ifeq ($(NEED_PIC), 1) +ifeq ($(C_COMPILER), IBM) +CCOMMON_OPT += -qpic=large +else +CCOMMON_OPT += -fPIC +endif +ifeq ($(F_COMPILER), SUN) +FCOMMON_OPT += -pic +else +FCOMMON_OPT += -fPIC +endif +endif + +ifeq ($(DYNAMIC_ARCH), 1) +CCOMMON_OPT += -DDYNAMIC_ARCH +endif + +ifdef SMP +CCOMMON_OPT += -DSMP_SERVER + +ifeq ($(ARCH), mips64) +USE_SIMPLE_THREADED_LEVEL3 = 1 +endif + +ifeq ($(USE_OPENMP), 1) +# USE_SIMPLE_THREADED_LEVEL3 = 1 +# NO_AFFINITY = 1 +CCOMMON_OPT += -DUSE_OPENMP +endif + +endif + +ifeq ($(NO_WARMUP), 1) +CCOMMON_OPT += -DNO_WARMUP +endif + +ifeq ($(CONSISTENT_FPCSR), 1) +CCOMMON_OPT += -DCONSISTENT_FPCSR +endif + +# Only for development +# CCOMMON_OPT += -DPARAMTEST +# CCOMMON_OPT += -DPREFETCHTEST +# CCOMMON_OPT += -DNO_SWITCHING +# USE_PAPI = 1 + +ifdef USE_PAPI +CCOMMON_OPT += -DUSE_PAPI +EXTRALIB += -lpapi -lperfctr +endif + +ifdef DYNAMIC_THREADS +CCOMMON_OPT += -DDYNAMIC_THREADS +endif + +CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS) + +ifdef USE_SIMPLE_THREADED_LEVEL3 +CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 +endif + +LIBPREFIX = libgoto2 + +KERNELDIR = $(TOPDIR)/kernel/$(ARCH) + +include $(TOPDIR)/Makefile.$(ARCH) + +CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\" + +ifeq ($(CORE), PPC440) +CCOMMON_OPT += -DALLOC_QALLOC +endif + +ifeq ($(CORE), PPC440FP2) +STATIC_ALLOCATION = 1 +endif + +ifneq ($(OSNAME), Linux) +NO_AFFINITY = 1 +endif + +ifneq ($(ARCH), x86_64) +ifneq ($(ARCH), x86) +NO_AFFINITY = 1 +endif +endif + +ifdef NO_AFFINITY +CCOMMON_OPT += -DNO_AFFINITY +endif + +ifdef FUNCTION_PROFILE +CCOMMON_OPT += -DFUNCTION_PROFILE +endif + +ifdef HUGETLB_ALLOCATION +CCOMMON_OPT += -DALLOC_HUGETLB +endif + +ifdef HUGETLBFILE_ALLOCATION +CCOMMON_OPT += -DALLOC_HUGETLBFILE -DHUGETLB_FILE_NAME=$(HUGETLBFILE_ALLOCATION) +endif + +ifdef STATIC_ALLOCATION +CCOMMON_OPT += -DALLOC_STATIC +endif + +ifdef DEVICEDRIVER_ALLOCATION +CCOMMON_OPT += -DALLOC_DEVICEDRIVER -DDEVICEDRIVER_NAME=\"/dev/mapper\" +endif + +ifdef MIXED_MEMORY_ALLOCATION +CCOMMON_OPT += -DMIXED_MEMORY_ALLOCATION +endif + +ifeq ($(OSNAME), SunOS) +TAR = gtar +PATCH = gpatch +GREP = ggrep +else +TAR = tar +PATCH = patch +GREP = grep +endif + +MD5SUM = md5sum +AWK = awk + +REVISION = -r$(VERSION) + +CFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) +PFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) + +FFLAGS = $(COMMON_OPT) $(FCOMMON_OPT) +FPFLAGS = $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF) + +ifndef SUFFIX +SUFFIX = o +endif + +ifndef PSUFFIX +PSUFFIX = po +endif + +ifndef LIBSUFFIX +LIBSUFFIX = a +endif + +ifndef DYNAMIC_ARCH +ifndef SMP +LIBNAME = $(LIBPREFIX)_$(LIBCORE)$(REVISION).$(LIBSUFFIX) +LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)$(REVISION)_p.$(LIBSUFFIX) +else +LIBNAME = $(LIBPREFIX)_$(LIBCORE)p$(REVISION).$(LIBSUFFIX) +LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)p$(REVISION)_p.$(LIBSUFFIX) +endif +else +ifndef SMP +LIBNAME = $(LIBPREFIX)$(REVISION).$(LIBSUFFIX) +LIBNAME_P = $(LIBPREFIX)$(REVISION)_p.$(LIBSUFFIX) +else +LIBNAME = $(LIBPREFIX)p$(REVISION).$(LIBSUFFIX) +LIBNAME_P = $(LIBPREFIX)p$(REVISION)_p.$(LIBSUFFIX) +endif +endif + + +LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so) +LIBDLLNAME = $(LIBNAME:.$(LIBSUFFIX)=.dll) +LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib) +LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def) +LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp) +LIBZIPNAME = $(LIBNAME:.$(LIBSUFFIX)=.zip) + +LIBS = $(TOPDIR)/$(LIBNAME) +LIBS_P = $(TOPDIR)/$(LIBNAME_P) + +export OSNAME +export ARCH +export CORE +export LIBCORE +export PGCPATH +export CONFIG +export CC +export FC +export BU +export FU +export USE_THREAD +export NUM_THREADS +export NUM_CORES +export SMP +export MAKEFILE_RULE +export NEED_PIC +export BINARY +export BINARY32 +export BINARY64 +export F_COMPILER +export C_COMPILER +export USE_OPENMP +export CROSS +export CROSS_SUFFIX +export NOFORTRAN +export EXTRALIB +export CEXTRALIB +export FEXTRALIB +export HAVE_SSE +export HAVE_SSE2 +export HAVE_SSE3 +export HAVE_SSSE3 +export HAVE_SSE4_1 +export HAVE_SSE4_2 +export HAVE_SSE4A +export HAVE_SSE5 +export KERNELDIR +export FUNCTION_PROFILE +export TARGET_CORE + +export SGEMM_UNROLL_M +export SGEMM_UNROLL_N +export DGEMM_UNROLL_M +export DGEMM_UNROLL_N +export QGEMM_UNROLL_M +export QGEMM_UNROLL_N +export CGEMM_UNROLL_M +export CGEMM_UNROLL_N +export ZGEMM_UNROLL_M +export ZGEMM_UNROLL_N +export XGEMM_UNROLL_M +export XGEMM_UNROLL_N + +ifdef USE_CUDA +export CUDADIR +export CUCC +export CUFLAGS +export CULIB +endif + +.SUFFIXES: .$(PSUFFIX) .$(SUFFIX) .f + +.f.$(SUFFIX): + $(FC) $(FFLAGS) -c $< -o $(@F) + +.f.$(PSUFFIX): + $(FC) $(FPFLAGS) -pg -c $< -o $(@F) + + +ifdef BINARY64 +PATHSCALEPATH = /opt/pathscale/lib/3.1 +PGIPATH = /opt/pgi/linux86-64/7.1-5/lib +else +PATHSCALEPATH = /opt/pathscale/lib/3.1/32 +PGIPATH = /opt/pgi/linux86/7.1-5/lib +endif + +ACMLPATH = /opt/acml/4.3.0 +ifneq ($(OSNAME), Darwin) +MKLPATH = /opt/intel/mkl/10.2.2.025/lib +else +MKLPATH = /Library/Frameworks/Intel_MKL.framework/Versions/10.0.1.014/lib +endif +ATLASPATH = /opt/atlas/3.9.17/opteron +FLAMEPATH = $(HOME)/flame/lib +ifneq ($(OSNAME), SunOS) +SUNPATH = /opt/sunstudio12.1 +else +SUNPATH = /opt/SUNWspro +endif + diff --git a/Makefile.tail b/Makefile.tail new file mode 100644 index 0000000..64f98ab --- /dev/null +++ b/Makefile.tail @@ -0,0 +1,617 @@ +SBLASOBJS_P = $(SBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) +DBLASOBJS_P = $(DBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) +QBLASOBJS_P = $(QBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) +CBLASOBJS_P = $(CBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) +ZBLASOBJS_P = $(ZBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) +XBLASOBJS_P = $(XBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) + +COMMONOBJS_P = $(COMMONOBJS:.$(SUFFIX)=.$(PSUFFIX)) + +HPLOBJS_P = $(HPLOBJS:.$(SUFFIX)=.$(PSUFFIX)) + +BLASOBJS = $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) +BLASOBJS_P = $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) + +ifdef EXPRECISION +BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) +BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P) +endif + +ifdef QUAD_PRECISION +BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) +BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P) +endif + +$(SBLASOBJS) $(SBLASOBJS_P) : CFLAGS += -UDOUBLE -UCOMPLEX +$(DBLASOBJS) $(DBLASOBJS_P) : CFLAGS += -DDOUBLE -UCOMPLEX +$(QBLASOBJS) $(QBLASOBJS_P) : CFLAGS += -DXDOUBLE -UCOMPLEX +$(CBLASOBJS) $(CBLASOBJS_P) : CFLAGS += -UDOUBLE -DCOMPLEX +$(ZBLASOBJS) $(ZBLASOBJS_P) : CFLAGS += -DDOUBLE -DCOMPLEX +$(XBLASOBJS) $(XBLASOBJS_P) : CFLAGS += -DXDOUBLE -DCOMPLEX + +$(SBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) +$(DBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) +$(QBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) +$(CBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) +$(ZBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) +$(XBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) + +libs :: $(BLASOBJS) $(COMMONOBJS) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ + +prof :: $(BLASOBJS_P) $(COMMONOBJS_P) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME_P) $^ + +hpl :: $(HPLOBJS) $(COMMONOBJS) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ + +hpl_p :: $(HPLOBJS_P) $(COMMONOBJS_P) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME_P) $^ + +kernel :: $(BLASOBJS) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ + +commonlibs :: $(COMMONOBJS) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ + +commonprof :: $(COMMONOBJS_P) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME_P) $^ + +quick : + $(MAKE) -C $(TOPDIR) libs + +bms.$(SUFFIX):bm.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -c $< -o $(@F) + +bmd.$(SUFFIX):bm.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F) + +bmd-k.$(SUFFIX):bm-k.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F) + +ifdef QUAD_PRECISION +bmq.$(SUFFIX):bmq.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F) + +bmx.$(SUFFIX):bmx.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DXDOUBLE -DCOMPLEX -c $< -o $(@F) +else +bmq.$(SUFFIX):bm.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F) + +bmx.$(SUFFIX):bmz.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DXDOUBLE -DCOMPLEX -c $< -o $(@F) +endif + +bmc.$(SUFFIX):bmz.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -UDOUBLE -DCOMPLEX -c $< -o $(@F) + +bmz.$(SUFFIX):bmz.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DDOUBLE -DCOMPLEX -c $< -o $(@F) + +bmd_nn.$(SUFFIX):bm_special.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -DNN -c $< -o $(@F) + +bmd_nt.$(SUFFIX):bm_special.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -DNT -c $< -o $(@F) + +bmd_tn.$(SUFFIX):bm_special.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -DTN -c $< -o $(@F) + +bmd_tt.$(SUFFIX):bm_special.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -DTT -c $< -o $(@F) + +bm-phy.$(SUFFIX):bm-phy.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F) + +bms.$(PSUFFIX):bm.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(PFLAGS) -UDOUBLE -UCOMPLEX -c $< -o $(@F) + +bmd.$(PSUFFIX):bm.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(PFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F) + +ifdef QUAD_PRECISION +bmq.$(PSUFFIX):bmq.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(PFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F) + +bmx.$(PSUFFIX):bmx.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(PFLAGS) -DXDOUBLE -DCOMPLEX -c $< -o $(@F) +else +bmq.$(PSUFFIX):bm.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(PFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F) + +bmx.$(PSUFFIX):bmz.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(PFLAGS) -DXDOUBLE -DCOMPLEX -c $< -o $(@F) +endif + +bmc.$(PSUFFIX):bmz.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(PFLAGS) -UDOUBLE -DCOMPLEX -c $< -o $(@F) + +bmz.$(PSUFFIX):bmz.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(PFLAGS) -DDOUBLE -DCOMPLEX -c $< -o $(@F) + +bms : bms.$(SUFFIX) $(SBLASOBJS) $(COMMONOBJS) $(SOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bmd : bmd.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(DOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) -lm + +bmd-k : bmd-k.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(DOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) -lm + +bmq : bmq.$(SUFFIX) $(QBLASOBJS) $(COMMONOBJS) $(QOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bmc : bmc.$(SUFFIX) $(CBLASOBJS) $(COMMONOBJS) $(COBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) $(FEXTRALIB) + +bmz : bmz.$(SUFFIX) $(ZBLASOBJS) $(COMMONOBJS) $(ZOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bmx : bmx.$(SUFFIX) $(XBLASOBJS) $(COMMONOBJS) $(XOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bmd_nn : bmd_nn.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(DOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bmd_nt : bmd_nt.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(DOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bmd_tn : bmd_tn.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(DOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bmd_tt : bmd_tt.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(DOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bm-phy:bm-phy.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(DOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bmcc : bmcc.$(SUFFIX) $(CBLASOBJS) $(COMMONOBJS) $(COBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bmzc : bmzc.$(SUFFIX) $(ZBLASOBJS) $(COMMONOBJS) $(ZOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bms.prof : bms.$(PSUFFIX) $(SBLASOBJS_P) $(COMMONOBJS_P) $(SOBJS) $(OBJS) $(LIBS_P) + $(CC) -o $(@F) $(PFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bmd.prof : bmd.$(PSUFFIX) $(DBLASOBJS_P) $(COMMONOBJS_P) $(DOBJS) $(OBJS) $(LIBS_P) + $(CC) -o $(@F) $(PFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bmq.prof : bmq.$(PSUFFIX) $(QBLASOBJS_P) $(COMMONOBJS_P) $(QOBJS) $(OBJS) $(LIBS_P) + $(CC) -o $(@F) $(PFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bmc.prof : bmc.$(PSUFFIX) $(CBLASOBJS_P) $(COMMONOBJS) $(COBJS) $(OBJS) $(LIBS_P) + $(CC) -o $(@F) $(PFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bmz.prof : bmz.$(PSUFFIX) $(ZBLASOBJS_P) $(COMMONOBJS) $(ZOBJS) $(OBJS) $(LIBS_P) + $(CC) -o $(@F) $(PFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bmx.prof : bmz.$(PSUFFIX) $(XBLASOBJS_P) $(COMMONOBJS) $(XOBJS) $(OBJS) $(LIBS_P) + $(CC) -o $(@F) $(PFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bms.cxml : bms.$(SUFFIX) $(SOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBCXML) + +bmd.cxml : bmd.$(SUFFIX) $(DOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBCXML) + +bmc.cxml : bmc.$(SUFFIX) $(COBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBCXML) + +bmz.cxml : bmz.$(SUFFIX) $(ZOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBCXML) + +bms.scsl : bms.$(SUFFIX) $(SOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSCSL) + +bmd.scsl : bmd.$(SUFFIX) $(DOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSCSL) + +bmc.scsl : bmc.$(SUFFIX) $(COBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSCSL) + +bmz.scsl : bmz.$(SUFFIX) $(ZOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSCSL) + +bms.acml : bms.$(SUFFIX) $(SOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBACML) + +bmd.acml : bmd.$(SUFFIX) $(DOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBACML) + +bmc.acml : bmc.$(SUFFIX) $(COBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBACML) + +bmz.acml : bmz.$(SUFFIX) $(ZOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBACML) + +bms.sun : bms.$(SUFFIX) $(SOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSUNPERF) $(EXTRALIB) $(CEXTRALIB) + +bmd.sun : bmd.$(SUFFIX) $(DOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSUNPERF) $(EXTRALIB) $(CEXTRALIB) + +bmc.sun : bmc.$(SUFFIX) $(COBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSUNPERF) $(EXTRALIB) $(CEXTRALIB) + +bmz.sun : bmz.$(SUFFIX) $(ZOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSUNPERF) $(EXTRALIB) $(CEXTRALIB) + +bms.atlas : bms.$(SUFFIX) $(SOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBATLAS) + +bmd.atlas : bmd.$(SUFFIX) $(DOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBATLAS) + +bmc.atlas : bmc.$(SUFFIX) $(COBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBATLAS) + +bmz.atlas : bmz.$(SUFFIX) $(ZOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBATLAS) + +bms.essl : bms.$(SUFFIX) $(SOBJS) $(OBJS) + $(CC) $(FCOMMON_OPT) -o $(@F) $^ $(LIBESSL) + +bmd.essl : bmd.$(SUFFIX) $(DOBJS) $(OBJS) + $(CC) $(CCOMMON_OPT) -o $(@F) $^ $(LIBESSL) + +bmc.essl : bmc.$(SUFFIX) $(COBJS) $(OBJS) + $(F77) $(CCOMMON_OPT) -o $(@F) $^ $(LIBESSL) + +bmz.essl : bmz.$(SUFFIX) $(ZOBJS) $(OBJS) + $(CC) $(CCOMMON_OPT) -o $(@F) $^ $(LIBESSL) + +bms.flame : bms.$(SUFFIX) $(SOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) + +bmd.flame : bmd.$(SUFFIX) $(DOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) + +bmc.flame : bmc.$(SUFFIX) $(COBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) + +bmz.flame : bmz.$(SUFFIX) $(ZOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) + +bms.flame.prof : bms.$(SUFFIX) $(SOBJS) $(OBJS_P) + $(F77) -o $(@F) $(PFLAGS) $^ $(LIBFLAME) + +bmd.flame.prof : bmd.$(SUFFIX) $(DOBJS) $(OBJS_P) + $(F77) -o $(@F) $(PFLAGS) $^ $(LIBFLAME) + +bmc.flame.prof : bmc.$(SUFFIX) $(COBJS) $(OBJS_P) + $(F77) -o $(@F) $(PFLAGS) $^ $(LIBFLAME) + +bmz.flame.prof : bmz.$(SUFFIX) $(ZOBJS) $(OBJS_P) + $(F77) -o $(@F) $(PFLAGS) $^ $(LIBFLAME) + +bms.mkl : bms.$(SUFFIX) $(SOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) + +bmd.mkl : bmd.$(SUFFIX) $(DOBJS) $(OBJS) + $(CC) -static -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) + +bmc.mkl : bmc.$(SUFFIX) $(COBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) + +bmz.mkl : bmz.$(SUFFIX) $(ZOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) + +bmq.mkl : bmq.$(SUFFIX) $(QOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) + +bms.mkl.prof : bms.$(PSUFFIX) $(SOBJS) $(OBJS) + $(CC) -o $(@F) $(PFLAGS) $^ $(LIBMKL) + +bmd.mkl.prof : bmd.$(PSUFFIX) $(DOBJS) $(OBJS) + $(CC) -o $(@F) $(PFLAGS) $^ $(LIBMKL) + +bmc.mkl.prof : bmc.$(PSUFFIX) $(COBJS) $(OBJS) + $(CC) -o $(@F) $(PFLAGS) $^ $(LIBMKL) + +bmz.mkl.prof : bmz.$(PSUFFIX) $(ZOBJS) $(OBJS) + $(CC) -o $(@F) $(PFLAGS) $^ $(LIBMKL) + +bms.mlib : bms.$(SUFFIX) $(SOBJS) $(OBJS) + $(F77) -o $(@F) $(CFLAGS) $^ $(LIBMLIB) + +bmd.mlib : bmd.$(SUFFIX) $(DOBJS) $(OBJS) + $(F77) -o $(@F) $(CFLAGS) $^ $(LIBMLIB) + +bmc.mlib : bmc.$(SUFFIX) $(COBJS) $(OBJS) + $(F77) -o $(@F) $(CFLAGS) $^ $(LIBMLIB) + +bmz.mlib : bmz.$(SUFFIX) $(ZOBJS) $(OBJS) + $(F77) -o $(@F) $(CFLAGS) $^ $(LIBMLIB) + +bms.veclib : bms.$(SUFFIX) $(SOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBVECLIB) + +bmd.veclib : bmd.$(SUFFIX) $(DOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBVECLIB) + +bmc.veclib : bmc.$(SUFFIX) $(COBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBVECLIB) + +bmz.veclib : bmz.$(SUFFIX) $(ZOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBVECLIB) + +bms.fuji : bms.$(SUFFIX) $(SOBJS) +ifndef SMP + fcc -KV9FMADD -SSL2 -o $(@F) $^ +else + fcc -KV9FMADD -SSL2BLAMP -o $(@F) $^ +endif + +bmd.fuji : bmd.$(SUFFIX) $(DOBJS) +ifndef SMP + fcc -KV9FMADD -SSL2 -o $(@F) $^ +else + fcc -KV9FMADD -SSL2BLAMP -o $(@F) $^ +endif + +bmc.fuji : bmc.$(SUFFIX) $(COBJS) +ifndef SMP + fcc -KV9FMADD -SSL2 -o $(@F) $^ +else + fcc -KV9FMADD -SSL2BLAMP -o $(@F) $^ +endif + +bmz.fuji : bmz.$(SUFFIX) $(ZOBJS) +ifndef SMP + fcc -KV9FMADD -SSL2 -o $(@F) $^ +else + fcc -KV9FMADD -SSL2BLAMP -o $(@F) $^ +endif + +bench: bench.$(SUFFIX) $(BLASOBJS) $(COMMONOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bench.$(SUFFIX): bench.c + $(CC) -c -o $(@F) $(CFLAGS) $^ + +bench_old: bench_old.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +kbench: kbench.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +prebench: prebench.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +kbench_rank_k: kbench_rank_k.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +smallbench: smallbench.$(SUFFIX) $(BLASOBJS) $(COMMONOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +smallbench.mkl: smallbench.$(SUFFIX) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) + +bench.sun: bench.$(SUFFIX) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSUNPERF) $(EXTRALIB) $(CEXTRALIB) + +bench.cxml: bench.$(SUFFIX) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBCXML) + +bench.atlas: bench.$(SUFFIX) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBATLAS) + +bench.essl: bench.$(SUFFIX) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBESSL) ../../level1/others/libmisc.$(LIBSUFFIX) + +bench.scsl: bench.$(SUFFIX) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSCSL) $(EXTRALIB) $(CEXTRALIB) + +bench.acml: bench.$(SUFFIX) $(OBJS) + $(CC) -static -o $(@F) $(CFLAGS) $^ $(LIBACML) $(EXTRALIB) $(CEXTRALIB) + +bench.flame: bench.$(SUFFIX) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) $(EXTRALIB) $(CEXTRALIB) + +kbench.mkl: kbench.$(SUFFIX) $(OBJS) + $(CC) -static -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) + +bench.mkl: bench.$(SUFFIX) $(OBJS) + $(CC) -static -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) + +bench_old.mkl: bench_old.$(SUFFIX) $(OBJS) + $(CC) -static -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) + +bench.mlib: bench.$(SUFFIX) $(OBJS) + $(F77) -o $(@F) $(CFLAGS) $^ $(LIBMLIB) + +bench.veclib: bench.$(SUFFIX) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBVECLIB) + +params : params.$(SUFFIX) $(SBLASOBJS) $(COMMONOBJS) $(SOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +paramd : paramd.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(DOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +paramq : paramq.$(SUFFIX) $(QBLASOBJS) $(COMMONOBJS) $(QOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +paramc : paramc.$(SUFFIX) $(CBLASOBJS) $(COMMONOBJS) $(COBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +paramz : paramz.$(SUFFIX) $(ZBLASOBJS) $(COMMONOBJS) $(ZOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +paramx : paramx.$(SUFFIX) $(XBLASOBJS) $(COMMONOBJS) $(XOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +params-ex : params-ex.$(SUFFIX) $(SBLASOBJS) $(COMMONOBJS) $(SOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +paramd-ex : paramd-ex.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(DOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +paramq-ex : paramq-ex.$(SUFFIX) $(QBLASOBJS) $(COMMONOBJS) $(QOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +paramc-ex : paramc-ex.$(SUFFIX) $(CBLASOBJS) $(COMMONOBJS) $(COBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +paramz-ex : paramz-ex.$(SUFFIX) $(ZBLASOBJS) $(COMMONOBJS) $(ZOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +paramx-ex : paramx-ex.$(SUFFIX) $(XBLASOBJS) $(COMMONOBJS) $(XOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +params.atlas : params.$(SUFFIX) $(OBJS) $(SOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBATLAS) + +paramd.atlas : paramd.$(SUFFIX) $(OBJS) $(DOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBATLAS) + +paramc.atlas : paramc.$(SUFFIX) $(OBJS) $(COBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBATLAS) + +paramz.atlas : paramz.$(SUFFIX) $(OBJS) $(ZOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBATLAS) + +params.sun : params.$(SUFFIX) $(OBJS) $(SOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSUNPERF) + +paramd.sun : paramd.$(SUFFIX) $(OBJS) $(DOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSUNPERF) + +paramc.sun : paramc.$(SUFFIX) $(OBJS) $(COBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSUNPERF) + +paramz.sun : paramz.$(SUFFIX) $(OBJS) $(ZOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSUNPERF) + +params.essl : params.$(SUFFIX) $(OBJS) $(SOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBESSL) + +paramd.essl : paramd.$(SUFFIX) $(OBJS) $(DOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBESSL) + +paramc.essl : paramc.$(SUFFIX) $(OBJS) $(COBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBESSL) + +paramz.essl : paramz.$(SUFFIX) $(OBJS) $(ZOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBESSL) + +params.mkl : params.$(SUFFIX) $(OBJS) $(SOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) + +paramd.mkl : paramd.$(SUFFIX) $(OBJS) $(DOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) + +paramc.mkl : paramc.$(SUFFIX) $(OBJS) $(COBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) + +paramz.mkl : paramz.$(SUFFIX) $(OBJS) $(ZOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) + +params.acml : params.$(SUFFIX) $(OBJS) $(SOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBACML) + +paramd.acml : paramd.$(SUFFIX) $(OBJS) $(DOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBACML) + +paramc.acml : paramc.$(SUFFIX) $(OBJS) $(COBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBACML) + +paramz.acml : paramz.$(SUFFIX) $(OBJS) $(ZOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBACML) + +params.flame : params.$(SUFFIX) $(OBJS) $(SOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) $(EXTRALIB) $(CEXTRALIB) + +paramd.flame : paramd.$(SUFFIX) $(OBJS) $(DOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) $(EXTRALIB) $(CEXTRALIB) + +paramc.flame : paramc.$(SUFFIX) $(OBJS) $(COBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) $(EXTRALIB) $(CEXTRALIB) + +paramz.flame : paramz.$(SUFFIX) $(OBJS) $(ZOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) $(EXTRALIB) $(CEXTRALIB) + +params.$(SUFFIX):param.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -c $< -o $(@F) + +paramd.$(SUFFIX):param.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F) + +paramq.$(SUFFIX):param.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F) + +paramc.$(SUFFIX):paramz.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -UDOUBLE -DCOMPLEX -c $< -o $(@F) + +paramz.$(SUFFIX):paramz.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DDOUBLE -DCOMPLEX -c $< -o $(@F) + +paramx.$(SUFFIX):paramz.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DXDOUBLE -DCOMPLEX -c $< -o $(@F) + +params-ex.$(SUFFIX):param-ex.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -c $< -o $(@F) + +paramd-ex.$(SUFFIX):param-ex.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F) + +paramq-ex.$(SUFFIX):param-ex.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F) + +paramc-ex.$(SUFFIX):paramz-ex.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -UDOUBLE -DCOMPLEX -c $< -o $(@F) + +paramz-ex.$(SUFFIX):paramz-ex.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DDOUBLE -DCOMPLEX -c $< -o $(@F) + +paramx-ex.$(SUFFIX):paramz-ex.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DXDOUBLE -DCOMPLEX -c $< -o $(@F) + +gen_insn_flash.c : + echo '#include ' > gen_insn_flash.c + echo '#include ' >> gen_insn_flash.c + echo '#define ICACHE_SIZE ( 256 << 10)' >> gen_insn_flash.c + echo 'int main(void){' >> gen_insn_flash.c + echo 'int i;' >> gen_insn_flash.c + echo '#ifdef __alpha' >> gen_insn_flash.c + echo 'printf(".set noat;.set noreorder;\n");' >> gen_insn_flash.c + echo 'printf(".arch ev6;.text;.align 5\n");' >> gen_insn_flash.c + echo 'printf(".globl insn_flash\n");' >> gen_insn_flash.c + echo 'printf(".ent insn_flash\n");' >> gen_insn_flash.c + echo 'printf("insn_flash:\n");' >> gen_insn_flash.c + echo 'for (i = 0; i < ICACHE_SIZE / 4; i++)' >> gen_insn_flash.c + echo 'printf("br 1f\n 1:\n");' >> gen_insn_flash.c + echo 'printf(".align 5;ret;.end insn_flash\n");'>> gen_insn_flash.c + echo '#else' >> gen_insn_flash.c + echo 'printf(".text;.align 32\n");' >> gen_insn_flash.c + echo 'printf(".globl insn_flash\n");' >> gen_insn_flash.c + echo 'printf("insn_flash:\n");' >> gen_insn_flash.c + echo 'for (i = 0; i < ICACHE_SIZE / 2; i++)' >> gen_insn_flash.c + echo 'printf("jmp 1f\n 1:\n");' >> gen_insn_flash.c + echo 'printf(".align 32;ret\n");' >> gen_insn_flash.c + echo '#endif' >> gen_insn_flash.c + echo 'return 0;' >> gen_insn_flash.c + echo '}' >> gen_insn_flash.c + +insn_flash.$(SUFFIX) : gen_insn_flash + ./gen_insn_flash > temp.s + $(AS) -o $(@F) temp.s + rm -f temp.s + +dummy : + +clean :: + @if test -d $(ARCH); then \ + (cd $(ARCH) && $(MAKE) clean) \ + fi + @rm -rf *.a *.s *.o *.po *.obj *.i *.so core core.* gmon.out *.cso \ + *.csx *.is *~ *.exe *.flame *.pdb *.dwf \ + gen_insn_flash.c gen_insn_flash *.stackdump *.dll *.exp *.lib \ + *.pc *.pcl *.def *.i *.prof linktest.c \ + bms bmd bmc bmz bmq bmx \ + params paramd paramc paramz paramq paramx \ + params-ex paramd-ex paramc-ex paramz-ex paramq-ex paramx-ex \ + bench tpp kbench kbench2 \ + *.mkl *.sun *.acml *.cxml *.essl *.atlas *.scsl *.mlib *.veclib *.fuji diff --git a/Makefile.x86 b/Makefile.x86 new file mode 100644 index 0000000..94ca7c4 --- /dev/null +++ b/Makefile.x86 @@ -0,0 +1,59 @@ +# COMPILER_PREFIX = mingw32- + +ifeq ($(OSNAME), Linux) +LDFLAGS = -melf_i386 +endif + +ifeq ($(OSNAME), Interix) +ARFLAGS = -m x86 +endif + +ifndef SMP +LIBMKL = -L$(MKLPATH)/32 -Wl,-rpath,$(MKLPATH)/32 -lmkl_intel -lmkl_sequential -lmkl_core -lguide -lpthread -lm +else +LIBMKL = -L$(MKLPATH)/32 -Wl,-rpath,$(MKLPATH)/32 -lmkl_intel -lmkl_intel_thread -lmkl_core -lguide -lpthread -lm +endif + +# LIBMKL = -L$(MKLPATH)/32 -lmkl_lapack -lmkl_ia32 -lguide -lpthread -lm + +ifndef SMP +LIBATLAS = -L$(ATLAS) -lf77blas -latlas -lg2c -lm +else +LIBATLAS = -L$(ATLAS) -lptf77blas -latlas -lpthread -lg2c -lm +endif + +ifeq ($(COMPILER_F77), g77) +LIBACML = -L$(ACMLPATH)/gnu32/lib -Wl,-rpath,$(ACMLPATH)/gnu32/lib -lacml -lg2c +endif + +LIBFLAME = -L$(FLAMEPATH) -llapack2flame -lflame-lapack -lflame-base $(LIBS) + +ifeq ($(F_COMPILER), GFORTRAN) +ifndef SMP +LIBACML = -L$(ACMLPATH)/gfortran32/lib -Wl,-rpath,$(ACMLPATH)/gfortran32/lib -lacml -lgfortran -lm +else +LIBACML = -L$(ACMLPATH)/gfortran32_mp/lib -Wl,-rpath,$(ACMLPATH)/gfortran32_mp/lib -lacml_mp -lgfortran -lgomp -lm +endif +endif + +ifeq ($(COMPILER_F77), pgf77) +LIBACML = -L$(ACMLPATH)/pgi32/lib -lacml -L/opt/pgi/linux86-64/5.2/lib -lpgftnrtl -lnspgc -lpgc +endif + +ifeq ($(F_COMPILER), PATHSCALE) +ifndef SMP +LIBACML = -L$(ACMLPATH)/pathscale32/lib -Wl,-rpath,$(ACMLPATH)/pathscale32/lib -lacml -Wl,-rpath,$(PATHSCALEPATH) -L$(PATHSCALEPATH) -lpathfortran -lm +else +LIBACML = -L$(ACMLPATH)/pathscale32_mp/lib -Wl,-rpath,$(ACMLPATH)/pathscale32_mp/lib -lacml_mp -Wl,-rpath,$(PATHSCALEPATH) -L$(PATHSCALEPATH) -lopenmp -lpathfortran -lm +endif +endif + +LIBSUNPERF = -L/opt/SUNWspro/lib/sse2 -Wl,-R,/opt/SUNWspro/lib/sse2 -lsunperf + +LIBVECLIB = /System/Library/Frameworks/vecLib.framework/Versions/Current/vecLib + +ifndef SMP +LIBATLAS = -L$(ATLASPATH)/32 -lcblas -lf77blas -latlas -lm +else +LIBATLAS = -L$(ATLASPATH)/32 -lptf77blas -lptatlas -lpthread -lm +endif diff --git a/Makefile.x86_64 b/Makefile.x86_64 new file mode 100644 index 0000000..b939e54 --- /dev/null +++ b/Makefile.x86_64 @@ -0,0 +1,102 @@ +# CCOMMON_OPT += -DFASTCPU + +ifeq ($(OSNAME), SunOS) +ifdef BINARY64 +LDFLAGS = -64 +ifeq ($(F_COMPILER), SUN) +FCOMMON_OPT += -m64 +endif +endif +endif + +ifeq ($(OSNAME), FreeBSD) +LDFLAGS = -m elf_x86_64_fbsd +endif + +ifeq ($(OSNAME), Linux) +LDFLAGS = -m elf_x86_64 +endif + +ifeq ($(OSNAME), Darwin) +LDFLAGS = +endif + +ifeq ($(OSNAME), Interix) +ARFLAGS = -m x64 +endif + +ifeq ($(OSNAME), Darwin) +ifndef SMP +LIBMKL = -L$(MKLPATH)/em64t -Wl,-rpath,$(MKLPATH)/em64t -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lguide -lpthread -lm +else +LIBMKL = -L$(MKLPATH)/em64t -Wl,-rpath,$(MKLPATH)/em64t -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lguide -lpthread -lm +endif +else +ifndef SMP +LIBMKL = -L$(MKLPATH)/em64t -Wl,-rpath,$(MKLPATH)/em64t -Wl,--start-group -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -Wl,--end-group -lguide -lpthread -lm +else +LIBMKL = -L$(MKLPATH)/em64t -Wl,-rpath,$(MKLPATH)/em64t -Wl,--start-group -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -Wl,--end-group -lguide -lpthread -lm +endif +endif + + +ifndef SMP +LIBATLAS = -L$(ATLASPATH)64 -llapack -lcblas -lf77blas -latlas -lm +else +LIBATLAS = -L$(ATLASPATH)64 -llapack -lptcblas -lptf77blas -latlas -lpthread -lm +endif + +LIBFLAME = -L$(FLAMEPATH) -llapack2flame -lflame $(TOPDIR)/$(LIBNAME) -lgfortran -lpthread -lm + + +ifeq ($(F_COMPILER), g77) +LIBACML = -L$(ACMLPATH)/gnu64/lib -Wl,-rpath,$(ACMLPATH)/gnu64/lib -lacml -lacml_mv -lg2c -lm +endif + +ifeq ($(F_COMPILER), GFORTRAN) +ifndef SMP +LIBACML = -L$(ACMLPATH)/gfortran64/lib -Wl,-rpath,$(ACMLPATH)/gfortran64/lib -lacml -lacml_mv -lgfortran -lm +else +LIBACML = -L$(ACMLPATH)/gfortran64_mp/lib -Wl,-rpath,$(ACMLPATH)/gfortran64_mp/lib -lacml_mp -lacml_mv -lgfortran -lgomp -lm +endif +endif + +ifeq ($(F_COMPILER), INTEL) +ifndef SMP +LIBACML = -L$(ACMLPATH)/ifort64/lib -Wl,-rpath,$(ACMLPATH)/ifort64/lib -lacml -lacml_mv -lifcoremt_pic -lirc -lm -lpthread -ldl +else +LIBACML = -L$(ACMLPATH)/ifort64_mp/lib -Wl,-rpath,$(ACMLPATH)/ifort64_mp/lib -lacml_mp -lacml_mv -lifcoremt_pic -liomp5 -lirc -lm -lpthread -ldl +endif +endif + +ifeq ($(F_COMPILER), OPEN64) +ifndef SMP +LIBACML = -L$(ACMLPATH)/open64/lib -Wl,-rpath,$(ACMLPATH)/open64/lib -lacml -lacml_mv -lm +else +LIBACML = -L$(ACMLPATH)/open64_mp/lib -Wl,-rpath,$(ACMLPATH)/open64_mp/lib -lacml_mp -lacml_mv -lm -lpthread +endif +endif + +ifeq ($(F_COMPILER), pgf77) +ifndef SMP +LIBACML = -L$(ACMLPATH)/pgi64/lib -Wl,-rpath,$(ACMLPATH)/pgi64/lib -lacml -lacml_mv -L$(PGIPATH) -Wl,-rpath,$(PGIPATH) -lpgftnrtl -lnspgc -lpgmp -lpgc +else +LIBACML = -L$(ACMLPATH)/pgi64_mp/lib -Wl,-rpath,$(ACMLPATH)/pgi64_mp/lib -lacml -lacml_mv -L$(PGIPATH) -Wl,-rpath,$(PGIPATH) -lpgftnrtl -lnspgc -lpgmp -lpgc +endif +endif + +ifeq ($(F_COMPILER), PATHSCALE) +ifndef SMP +LIBACML = -L$(ACMLPATH)/pathscale64/lib -Wl,-rpath,$(ACMLPATH)/pathscale64/lib -lacml -lacml_mv -Wl,-rpath,$(PATHSCALEPATH) -L$(PATHSCALEPATH) -lpathfortran -lm +else +LIBACML = -L$(ACMLPATH)/pathscale64_mp/lib -Wl,-rpath,$(ACMLPATH)/pathscale64_mp/lib -lacml_mp -lacml_mv -Wl,-rpath,$(PATHSCALEPATH) -L$(PATHSCALEPATH) -lopenmp -lpathfortran -lm +endif +endif + +ifeq ($(F_COMPILER), f90) +LIBACML = -L$(ACMLPATH)/sun64/lib -Wl,-R,$(ACMLPATH)/sun64/lib -L$(SUNPATH)/lib/amd64 -Wl,-R,$(SUNPATH)/lib/amd64 -lacml -lacml_mv -lfsu +endif + +LIBSUNPERF = -L$(SUNPATH)/lib/amd64 -L$(SUNPATH)/rtlibs/amd64 -Wl,-R,$(SUNPATH)/lib/amd64 -Wl,-R,$(SUNPATH)/rtlibs/amd64 -lsunperf -lfui -lfsu -lmtsk + +LIBVECLIB = /System/Library/Frameworks/vecLib.framework/Versions/Current/vecLib diff --git a/benchmark/._Makefile b/benchmark/._Makefile new file mode 100644 index 0000000..932ad06 Binary files /dev/null and b/benchmark/._Makefile differ diff --git a/benchmark/._cholesky.c b/benchmark/._cholesky.c new file mode 100644 index 0000000..6b8cef2 Binary files /dev/null and b/benchmark/._cholesky.c differ diff --git a/benchmark/._cula_wrapper.c b/benchmark/._cula_wrapper.c new file mode 100644 index 0000000..c9a8ec8 Binary files /dev/null and b/benchmark/._cula_wrapper.c differ diff --git a/benchmark/._linpack.c b/benchmark/._linpack.c new file mode 100644 index 0000000..33e03ab Binary files /dev/null and b/benchmark/._linpack.c differ diff --git a/benchmark/Makefile b/benchmark/Makefile new file mode 100644 index 0000000..0c37570 --- /dev/null +++ b/benchmark/Makefile @@ -0,0 +1,195 @@ +TOPDIR = .. +include $(TOPDIR)/Makefile.system + +CULA_INC = -I/usr/local/cula/include +CULA_LIB = -L/usr/local/cula/lib64 -Wl,-rpath,/usr/local/cula/lib64 -lcula_fortran -lcula -lcublas + +all :: dlinpack.goto dlinpack.mkl dlinpack.acml dcholesky.goto dcholesky.mkl dcholesky.acml + ./dlinpack.goto 4000 4000 1 + -./dlinpack.mkl 4000 4000 1 + -./dlinpack.acml 4000 4000 1 + ./dcholesky.goto 4000 4000 1 + -./dcholesky.mkl 4000 4000 1 + -./dcholesky.acml 4000 4000 1 + +slinpack.goto : slinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +qlinpack.goto : qlinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +xlinpack.goto : xlinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +dcholesky.goto : dcholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +qcholesky.goto : qcholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +ccholesky.goto : ccholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +zcholesky.goto : zcholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +xcholesky.goto : xcholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +slinpack.mkl : slinpack.$(SUFFIX) + -$(CC) -static $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dlinpack.mkl : dlinpack.$(SUFFIX) + -$(CC) -static $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.mkl : clinpack.$(SUFFIX) + -$(CC) -static $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zlinpack.mkl : zlinpack.$(SUFFIX) + -$(CC) -static $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scholesky.mkl : scholesky.$(SUFFIX) + -$(CC) -static $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcholesky.mkl : dcholesky.$(SUFFIX) + -$(CC) -static $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccholesky.mkl : ccholesky.$(SUFFIX) + -$(CC) -static $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcholesky.mkl : zcholesky.$(SUFFIX) + -$(CC) -static $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +slinpack.acml : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dlinpack.acml : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.acml : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zlinpack.acml : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scholesky.acml : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcholesky.acml : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccholesky.acml : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcholesky.acml : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +slinpack.flame : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBFLAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dlinpack.flame : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBFLAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.flame : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBFLAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zlinpack.flame : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBFLAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scholesky.flame : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBFLAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcholesky.flame : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBFLAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccholesky.flame : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBFLAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcholesky.flame : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBFLAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +slinpack.sun : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBSUNPERF) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dlinpack.sun : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBSUNPERF) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.sun : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBSUNPERF) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zlinpack.sun : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBSUNPERF) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scholesky.sun : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBSUNPERF) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcholesky.sun : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBSUNPERF) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccholesky.sun : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBSUNPERF) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcholesky.sun : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBSUNPERF) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +slinpack.cula : slinpack.$(SUFFIX) cula_wrapper.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(CULA_LIB) ../$(LIBNAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.cula : clinpack.$(SUFFIX) cula_wrapper.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(CULA_LIB) ../$(LIBNAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cula_wrapper.$(SUFFIX) : cula_wrapper.c + $(CC) $(CFLAGS) -c $(CULA_INC) -o $(@F) $^ + +slinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dlinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +qlinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DXDOUBLE -o $(@F) $^ + +clinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zlinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +xlinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DXDOUBLE -o $(@F) $^ + +scholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dcholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +qcholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DXDOUBLE -o $(@F) $^ + +ccholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zcholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +xcholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DXDOUBLE -o $(@F) $^ + +clean :: + @rm -f *.goto *.mkl *.acml *.sun *.cula + +include $(TOPDIR)/Makefile.tail diff --git a/benchmark/cholesky.c b/benchmark/cholesky.c new file mode 100644 index 0000000..a40cdd2 --- /dev/null +++ b/benchmark/cholesky.c @@ -0,0 +1,272 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +double fabs(double); + +#undef POTRF + +#ifndef COMPLEX +#ifdef XDOUBLE +#define POTRF BLASFUNC(qpotrf) +#define SYRK BLASFUNC(qsyrk) +#elif defined(DOUBLE) +#define POTRF BLASFUNC(dpotrf) +#define SYRK BLASFUNC(dsyrk) +#else +#define POTRF BLASFUNC(spotrf) +#define SYRK BLASFUNC(ssyrk) +#endif +#else +#ifdef XDOUBLE +#define POTRF BLASFUNC(xpotrf) +#define SYRK BLASFUNC(xherk) +#elif defined(DOUBLE) +#define POTRF BLASFUNC(zpotrf) +#define SYRK BLASFUNC(zherk) +#else +#define POTRF BLASFUNC(cpotrf) +#define SYRK BLASFUNC(cherk) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +static __inline double getmflops(int ratio, int m, double secs){ + + double mm = (double)m; + double mulflops, addflops; + + if (secs==0.) return 0.; + + mulflops = mm * (1./3. + mm * (1./2. + mm * 1./6.)); + addflops = 1./6. * mm * (mm * mm - 1); + + if (ratio == 1) { + return (mulflops + addflops) / secs * 1.e-6; + } else { + return (2. * mulflops + 6. * addflops) / secs * 1.e-6; + } +} + + +int MAIN__(int argc, char *argv[]){ + + char *trans[] = {"T", "N"}; + char *uplo[] = {"U", "L"}; + FLOAT alpha[] = {1.0, 0.0}; + FLOAT beta [] = {0.0, 0.0}; + + FLOAT *a, *b; + + blasint m, i, j, info, uplos; + + int from = 1; + int to = 200; + int step = 1; + + FLOAT maxerr; + + struct timeval start, stop; + double time1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + for(m = from; m <= to; m += step){ + + fprintf(stderr, "M = %6d : ", (int)m); + + for (uplos = 0; uplos < 2; uplos ++) { + +#ifndef COMPLEX + if (uplos & 1) { + for (j = 0; j < m; j++) { + for(i = 0; i < j; i++) a[i + j * m] = 0.; + a[j + j * m] = ((double) rand() / (double) RAND_MAX) + 8.; + for(i = j + 1; i < m; i++) a[i + j * m] = ((double) rand() / (double) RAND_MAX) - 0.5; + } + } else { + for (j = 0; j < m; j++) { + for(i = 0; i < j; i++) a[i + j * m] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[j + j * m] = ((double) rand() / (double) RAND_MAX) + 8.; + for(i = j + 1; i < m; i++) a[i + j * m] = 0.; + } + } +#else + if (uplos & 1) { + for (j = 0; j < m; j++) { + for(i = 0; i < j; i++) { + a[(i + j * m) * 2 + 0] = 0.; + a[(i + j * m) * 2 + 1] = 0.; + } + + a[(j + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.; + a[(j + j * m) * 2 + 1] = 0.; + + for(i = j + 1; i < m; i++) { + a[(i + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[(i + j * m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; + } + } + } else { + for (j = 0; j < m; j++) { + for(i = 0; i < j; i++) { + a[(i + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[(i + j * m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; + } + + a[(j + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.; + a[(j + j * m) * 2 + 1] = 0.; + + for(i = j + 1; i < m; i++) { + a[(i + j * m) * 2 + 0] = 0.; + a[(i + j * m) * 2 + 1] = 0.; + } + } + } +#endif + + SYRK(uplo[uplos], trans[uplos], &m, &m, alpha, a, &m, beta, b, &m); + + gettimeofday( &start, (struct timezone *)0); + + POTRF(uplo[uplos], &m, b, &m, &info); + + gettimeofday( &stop, (struct timezone *)0); + + if (info != 0) { + fprintf(stderr, "Info = %d\n", info); + exit(1); + } + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + + maxerr = 0.; + + if (!(uplos & 1)) { + for (j = 0; j < m; j++) { + for(i = 0; i <= j; i++) { +#ifndef COMPLEX + if (maxerr < fabs(a[i + j * m] - b[i + j * m])) maxerr = fabs(a[i + j * m] - b[i + j * m]); +#else + if (maxerr < fabs(a[(i + j * m) * 2 + 0] - b[(i + j * m) * 2 + 0])) maxerr = fabs(a[(i + j * m) * 2 + 0] - b[(i + j * m) * 2 + 0]); + if (maxerr < fabs(a[(i + j * m) * 2 + 1] - b[(i + j * m) * 2 + 1])) maxerr = fabs(a[(i + j * m) * 2 + 1] - b[(i + j * m) * 2 + 1]); +#endif + } + } + } else { + for (j = 0; j < m; j++) { + for(i = j; i < m; i++) { +#ifndef COMPLEX + if (maxerr < fabs(a[i + j * m] - b[i + j * m])) maxerr = fabs(a[i + j * m] - b[i + j * m]); +#else + if (maxerr < fabs(a[(i + j * m) * 2 + 0] - b[(i + j * m) * 2 + 0])) maxerr = fabs(a[(i + j * m) * 2 + 0] - b[(i + j * m) * 2 + 0]); + if (maxerr < fabs(a[(i + j * m) * 2 + 1] - b[(i + j * m) * 2 + 1])) maxerr = fabs(a[(i + j * m) * 2 + 1] - b[(i + j * m) * 2 + 1]); +#endif + } + } + } + + fprintf(stderr, +#ifdef XDOUBLE + " %Le %10.3f MFlops", maxerr, +#else + " %e %10.3f MFlops", maxerr, +#endif + getmflops(COMPSIZE * COMPSIZE, m, time1)); + + if (maxerr > 1.e-3) { + fprintf(stderr, "Hmm, probably it has bug.\n"); + exit(1); + } + + } + fprintf(stderr, "\n"); + + } + + return 0; +} + +void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/cula_wrapper.c b/benchmark/cula_wrapper.c new file mode 100644 index 0000000..05dbcc2 --- /dev/null +++ b/benchmark/cula_wrapper.c @@ -0,0 +1,28 @@ +#include +#include "culapack.h" + +static int initialized = 0; + +int sgetrf_(int *m, int *n, float *a, int *lda, int *ipiv, int *info) { + + if (!initialized) { + culaInitialize(); + initialized = 1; + } + + *info = culaSgetrf(*m, *m, a, *lda, ipiv); + + return 0; +} + +int cgetrf_(int *m, int *n, float *a, int *lda, int *ipiv, int *info) { + + if (!initialized) { + culaInitialize(); + initialized = 1; + } + + *info = culaCgetrf(*m, *m, (culaFloatComplex *)a, *lda, ipiv); + + return 0; +} diff --git a/benchmark/linpack.c b/benchmark/linpack.c new file mode 100644 index 0000000..0261859 --- /dev/null +++ b/benchmark/linpack.c @@ -0,0 +1,273 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +double fabs(double); + +#undef GETRF +#undef GETRS + +#ifndef COMPLEX +#ifdef XDOUBLE +#define GETRF BLASFUNC(qgetrf) +#define GETRS BLASFUNC(qgetrs) +#elif defined(DOUBLE) +#define GETRF BLASFUNC(dgetrf) +#define GETRS BLASFUNC(dgetrs) +#else +#define GETRF BLASFUNC(sgetrf) +#define GETRS BLASFUNC(sgetrs) +#endif +#else +#ifdef XDOUBLE +#define GETRF BLASFUNC(xgetrf) +#define GETRS BLASFUNC(xgetrs) +#elif defined(DOUBLE) +#define GETRF BLASFUNC(zgetrf) +#define GETRS BLASFUNC(zgetrs) +#else +#define GETRF BLASFUNC(cgetrf) +#define GETRS BLASFUNC(cgetrs) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int MAIN__(int argc, char *argv[]){ + + FLOAT *a, *b; + blasint *ipiv; + + blasint m, i, j, info; + blasint unit = 1; + + int from = 1; + int to = 200; + int step = 1; + + FLOAT maxerr; + + struct timeval start, stop; + double time1, time2; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( ipiv = (blasint *)malloc(sizeof(blasint) * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Residual Decompose Solve Total\n"); + + for(m = from; m <= to; m += step){ + + fprintf(stderr, " %6d : ", (int)m); + + for(j = 0; j < m; j++){ + for(i = 0; i < m * COMPSIZE; i++){ + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for (i = 0; i < m * COMPSIZE; ++i) b[i] = 0.; + + for (j = 0; j < m; ++j) { + for (i = 0; i < m * COMPSIZE; ++i) { + b[i] += a[i + j * m * COMPSIZE]; + } + } + + gettimeofday( &start, (struct timezone *)0); + + GETRF (&m, &m, a, &m, ipiv, &info); + + gettimeofday( &stop, (struct timezone *)0); + + if (info) { + fprintf(stderr, "Matrix is not singular .. %d\n", info); + exit(1); + } + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + + gettimeofday( &start, (struct timezone *)0); + + GETRS("N", &m, &unit, a, &m, ipiv, b, &m, &info); + + gettimeofday( &stop, (struct timezone *)0); + + if (info) { + fprintf(stderr, "Matrix is not singular .. %d\n", info); + exit(1); + } + + time2 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + + maxerr = 0.; + + for(i = 0; i < m; i++){ +#ifndef XDOUBLE + if (maxerr < fabs(b[i * COMPSIZE] - 1.0)) maxerr = fabs(b[i * COMPSIZE] - 1.0); +#ifdef COMPLEX + if (maxerr < fabs(b[i * COMPSIZE] + 1)) maxerr = fabs(b[i * COMPSIZE + 1]); +#endif +#else + if (maxerr < fabsl(b[i * COMPSIZE] - 1.0L)) maxerr = fabsl(b[i * COMPSIZE] - 1.0L); +#ifdef COMPLEX + if (maxerr < fabsl(b[i * COMPSIZE] + 1)) maxerr = fabsl(b[i * COMPSIZE + 1]); +#endif +#endif + } + +#ifdef XDOUBLE + fprintf(stderr," %Le ", maxerr); +#else + fprintf(stderr," %e ", maxerr); +#endif + + fprintf(stderr, + " %10.2f MFlops %10.2f MFlops %10.2f MFlops\n", + COMPSIZE * COMPSIZE * 2. / 3. * (double)m * (double)m * (double)m / time1 * 1.e-6, + COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / time2 * 1.e-6, + COMPSIZE * COMPSIZE * (2. / 3. * (double)m * (double)m * (double)m + 2. * (double)m * (double)m) / (time1 + time2) * 1.e-6); + +#if 0 + if ( +#ifdef DOUBLE + maxerr > 1.e-8 +#else + maxerr > 1.e-1 +#endif + ) { + fprintf(stderr, "Error is too large.\n"); + exit(1); + } +#endif + + } + + return 0; +} + +void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/c_check b/c_check new file mode 100755 index 0000000..d8025f9 --- /dev/null +++ b/c_check @@ -0,0 +1,254 @@ +#!/usr/bin/perl + +# Checking cross compile +$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); +$hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch); + +$binary = $ENV{"BINARY"}; + +$makefile = shift(@ARGV); +$config = shift(@ARGV); + +$compiler_name = join(" ", @ARGV); + +# First, we need to know the target OS and compiler name + +$data = `$compiler_name -E ctest.c`; + +if ($?) { + printf STDERR "C Compiler ($compiler_name) is something wrong.\n"; + die 1; +} + +$cross_suffix = ""; + +if ($ARGV[0] =~ /(.*)(-[.\d]+)/) { + if ($1 =~ /(.*-)(.*)/) { + $cross_suffix = $1; + } +} else { + if ($ARGV[0] =~ /(.*-)(.*)/) { + $cross_suffix = $1; + } +} + +$compiler = ""; +$compiler = PGI if ($data =~ /COMPILER_PGI/); +$compiler = PATHSCALE if ($data =~ /COMPILER_PATHSCALE/); +$compiler = INTEL if ($data =~ /COMPILER_INTEL/); +$compiler = OPEN64 if ($data =~ /COMPILER_OPEN64/); +$compiler = SUN if ($data =~ /COMPILER_SUN/); +$compiler = IBM if ($data =~ /COMPILER_IBM/); +$compiler = DEC if ($data =~ /COMPILER_DEC/); +$compiler = GCC if ($compiler eq ""); + +$os = Linux if ($data =~ /OS_LINUX/); +$os = FreeBSD if ($data =~ /OS_FreeBSD/); +$os = NetBSD if ($data =~ /OS_NetBSD/); +$os = Darwin if ($data =~ /OS_Darwin/); +$os = SunOS if ($data =~ /OS_SunOS/); +$os = AIX if ($data =~ /OS_AIX/); +$os = osf if ($data =~ /OS_OSF/); +$os = WINNT if ($data =~ /OS_WINNT/); +$os = CYGWIN_NT if ($data =~ /OS_CYGWIN/); +$os = Interix if ($data =~ /OS_INTERIX/); + +$architecture = x86 if ($data =~ /ARCH_X86/); +$architecture = x86_64 if ($data =~ /ARCH_X86_64/); +$architecture = power if ($data =~ /ARCH_POWER/); +$architecture = mips32 if ($data =~ /ARCH_MIPS32/); +$architecture = mips64 if ($data =~ /ARCH_MIPS64/); +$architecture = alpha if ($data =~ /ARCH_ALPHA/); +$architecture = sparc if ($data =~ /ARCH_SPARC/); +$architecture = ia64 if ($data =~ /ARCH_IA64/); + +$defined = 0; + +if ($os eq "AIX") { + $compiler_name .= " -maix32" if ($binary eq "32"); + $compiler_name .= " -maix64" if ($binary eq "64"); + $defined = 1; +} + +if (($architecture eq "mips32") || ($architecture eq "mips64")) { + $compiler_name .= " -mabi=n32" if ($binary eq "32"); + $compiler_name .= " -mabi=64" if ($binary eq "64"); + $defined = 1; +} + +if ($architecture eq "alpha") { + $defined = 1; + $binary = 64; +} + +if ($architecture eq "ia64") { + $defined = 1; + $binary = 64; +} + +if (($architecture eq "x86") && ($os ne Darwin) && ($os ne SunOS)) { + $defined = 1; + $binary =32; +} + +if ($compiler eq "PGI") { + $compiler_name .= " -tp p7" if ($binary eq "32"); + $compiler_name .= " -tp p7-64" if ($binary eq "64"); + $openmp = "-mp"; + $defined = 1; +} + +if ($compiler eq "IBM") { + $compiler_name .= " -q32" if ($binary eq "32"); + $compiler_name .= " -q64" if ($binary eq "64"); + $openmp = "-qsmp=omp"; + $defined = 1; +} + +if ($compiler eq "INTEL") { + $openmp = "-openmp"; +} + +if ($compiler eq "PATHSCALE") { + $openmp = "-mp"; +} + +if ($compiler eq "OPEN64") { + $openmp = "-mp"; +} + +if ($compiler eq "GCC") { + $openmp = "-fopenmp"; +} + +if ($defined == 0) { + $compiler_name .= " -m32" if ($binary eq "32"); + $compiler_name .= " -m64" if ($binary eq "64"); +} + +# Do again + +$data = `$compiler_name -E ctest.c`; + +if ($?) { + printf STDERR "C Compiler ($compiler_name) is something wrong.\n"; + die 1; +} + +$architecture = x86 if ($data =~ /ARCH_X86/); +$architecture = x86_64 if ($data =~ /ARCH_X86_64/); +$architecture = power if ($data =~ /ARCH_POWER/); +$architecture = mips32 if ($data =~ /ARCH_MIPS32/); +$architecture = mips64 if ($data =~ /ARCH_MIPS64/); +$architecture = alpha if ($data =~ /ARCH_ALPHA/); +$architecture = sparc if ($data =~ /ARCH_SPARC/); +$architecture = ia64 if ($data =~ /ARCH_IA64/); + +$binformat = bin32; +$binformat = bin64 if ($data =~ /BINARY_64/); + +$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; + +$data =~ /globl\ ([_\.]*)(.*)/; + +$need_fu = $1; + +$cross = 0; +$cross = 1 if ($os ne $hostos); + +if ($architecture ne $hostarch) { + $cross = 1; + $cross = 0 if (($hostarch eq "x86_64") && ($architecture eq "x86")); + $cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips")); +} + +$openmp = "" if $ENV{USE_OPENMP} != 1; + +$linker_L = ""; +$linker_l = ""; +$linker_a = ""; + +{ + $link = `$compiler_name -c ctest2.c -o ctest2.o 2>&1 && $compiler_name $openmp -v ctest2.o -o ctest2 2>&1 && rm -f ctest2.o ctest2 ctest2.exe`; + + $link =~ s/\-Y\sP\,/\-Y/g; + + @flags = split(/[\s\,\n]/, $link); + + foreach $flags (@flags) { + if ( + ($flags =~ /^\-L/) + && ($flags !~ /^-LIST:/) + && ($flags !~ /^-LANG:/) + ) { + $linker_L .= $flags . " " + } + + if ($flags =~ /^\-Y/) { + $linker_L .= "-Wl,". $flags . " " + } + + if ( + ($flags =~ /^\-l/) + && ($flags !~ /gfortranbegin/) + && ($flags !~ /frtbegin/) + && ($flags !~ /pathfstart/) + && ($flags !~ /numa/) + && ($flags !~ /crt[0-9]/) + && ($flags !~ /gcc/) + && ($flags !~ /user32/) + && ($flags !~ /kernel32/) + && ($flags !~ /advapi32/) + && ($flags !~ /shell32/) + ) { + $linker_l .= $flags . " " + } + + $linker_a .= $flags . " " if $flags =~ /\.a$/; + } + +} + +open(MAKEFILE, "> $makefile") || die "Can't create $makefile"; +open(CONFFILE, "> $config" ) || die "Can't create $config"; + +# print $data, "\n"; + +print MAKEFILE "OSNAME=$os\n"; +print MAKEFILE "ARCH=$architecture\n"; +print MAKEFILE "C_COMPILER=$compiler\n"; +print MAKEFILE "BINARY32=\n" if $binformat ne bin32; +print MAKEFILE "BINARY64=\n" if $binformat ne bin64; +print MAKEFILE "BINARY32=1\n" if $binformat eq bin32; +print MAKEFILE "BINARY64=1\n" if $binformat eq bin64; +print MAKEFILE "FU=$need_fu\n" if $need_fu ne ""; +print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross_suffix ne ""; +print MAKEFILE "CROSS=1\n" if $cross != 0; +print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; + +$os =~ tr/[a-z]/[A-Z]/; +$architecture =~ tr/[a-z]/[A-Z]/; +$compiler =~ tr/[a-z]/[A-Z]/; + +print CONFFILE "#define OS_$os\t1\n"; +print CONFFILE "#define ARCH_$architecture\t1\n"; +print CONFFILE "#define C_$compiler\t1\n"; +print CONFFILE "#define __32BIT__\t1\n" if $binformat eq bin32; +print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64; +print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; + +if ($os eq "LINUX") { + + @pthread = split(/\s+/, `nm /lib/libpthread.so* | grep _pthread_create`); + + if ($pthread[2] ne "") { + print CONFFILE "#define PTHREAD_CREATE_FUNC $pthread[2]\n"; + } else { + print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n"; + } +} else { + print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n"; +} + +close(MAKEFILE); +close(CONFFILE); diff --git a/cblas.h b/cblas.h new file mode 100644 index 0000000..ea0fbb6 --- /dev/null +++ b/cblas.h @@ -0,0 +1,273 @@ +#ifndef CBLAS_H +#define CBLAS_H + +#define CBLAS_INDEX size_t + +enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; +enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114}; +enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; +enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; +enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; + +float cblas_sdsdot(blasint n, float, float *x, blasint incx, float *y, blasint incy); +double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy); +float cblas_sdot(blasint n, float *x, blasint incx, float *y, blasint incy); +double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy); + +float _Complex cblas_cdotu(blasint n, float *x, blasint incx, float *y, blasint incy); +float _Complex cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy); +double _Complex cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy); +double _Complex cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy); + +void cblas_cdotu_sub(blasint n, float *x, blasint incx, float *y, blasint incy, float _Complex *ret); +void cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, float _Complex *ret); +void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, double _Complex *ret); +void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, double _Complex *ret); + +float cblas_sasum (blasint n, float *x, blasint incx); +double cblas_dasum (blasint n, double *x, blasint incx); +float cblas_scasum(blasint n, float *x, blasint incx); +double cblas_dzasum(blasint n, double *x, blasint incx); + +float cblas_snrm2 (blasint N, float *X, blasint incX); +double cblas_dnrm2 (blasint N, double *X, blasint incX); +float cblas_scnrm2(blasint N, float *X, blasint incX); +double cblas_dznrm2(blasint N, double *X, blasint incX); + +CBLAS_INDEX cblas_isamax(blasint n, float *x, blasint incx); +CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx); +CBLAS_INDEX cblas_icamax(blasint n, float *x, blasint incx); +CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx); + +void cblas_saxpy(blasint n, float, float *x, blasint incx, float *y, blasint incy); +void cblas_daxpy(blasint n, double, double *x, blasint incx, double *y, blasint incy); +void cblas_caxpy(blasint n, float *, float *x, blasint incx, float *y, blasint incy); +void cblas_zaxpy(blasint n, double *, double *x, blasint incx, double *y, blasint incy); + +void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy); +void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy); +void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy); +void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy); + +void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy); +void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy); +void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy); +void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy); + +void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s); +void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double s); + +void cblas_srotg(float *a, float *b, float *c, float *s); +void cblas_drotg(double *a, double *b, double *c, double *s); + +void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P); +void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P); + +void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P); +void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P); + +void cblas_sscal(blasint N, float alpha, float *X, blasint incX); +void cblas_dscal(blasint N, double alpha, double *X, blasint incX); +void cblas_cscal(blasint N, float *alpha, float *X, blasint incX); +void cblas_zscal(blasint N, double *alpha, double *X, blasint incX); +void cblas_csscal(blasint N, float alpha, float *X, blasint incX); +void cblas_zdscal(blasint N, double alpha, double *X, blasint incX); + +void cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, + float alpha, float *a, blasint lda, float *x, blasint incx, float beta, float *y, blasint incy); +void cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, + double alpha, double *a, blasint lda, double *x, blasint incx, double beta, double *y, blasint incy); +void cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, + float *alpha, float *a, blasint lda, float *x, blasint incx, float *beta, float *y, blasint incy); +void cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, + double *alpha, double *a, blasint lda, double *x, blasint incx, double *beta, double *y, blasint incy); + +void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); +void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); +void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); +void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); +void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); +void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); + +void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); +void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); +void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); +void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); + +void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); +void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); +void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); +void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); + +void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); +void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); +void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); +void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); + +void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X, + blasint incX, float *Y, blasint incY, float *A, blasint lda); +void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, + blasint incX, double *Y, blasint incY, double *A, blasint lda); +void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, + float *Y, blasint incY, float *A, blasint lda); +void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, + double *Y, blasint incY, double *A, blasint lda); + +void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, + blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); +void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, + blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); +void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, + blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); +void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, + blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); + +void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A, + blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); +void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A, + blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); + + +void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); +void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); +void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); +void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); + +void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); +void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); +void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); +void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); + +void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, float *Ap, float *X, blasint incX); +void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, double *Ap, double *X, blasint incX); +void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, float *Ap, float *X, blasint incX); +void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, double *Ap, double *X, blasint incX); + +void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, float *Ap, float *X, blasint incX); +void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, double *Ap, double *X, blasint incX); +void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, float *Ap, float *X, blasint incX); +void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, double *Ap, double *X, blasint incX); + +void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A, + blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); +void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A, + blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); +void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A, + blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); +void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A, + blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); + + +void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap, + float *X, blasint incX, float beta, float *Y, blasint incY); +void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap, + double *X, blasint incX, double beta, double *Y, blasint incY); + +void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap); +void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap); + +void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A); +void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A); + +void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A); +void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A); +void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap); +void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap); + +void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, + float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); +void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, + double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); + +void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, + float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY); +void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, + double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY); + +void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, + float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); +void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, + double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); +void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, + float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); +void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, + double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); + +void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); +void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); +void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); +void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); + +void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); +void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); +void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc); +void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc); + +void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); +void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); +void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); +void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); + +void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); +void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); +void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); +void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); + +void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); +void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); +void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); +void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); + +void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); +void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); + +void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, + float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); +void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, + double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); + +void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, + float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); +void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, + double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); + +void cblas_xerbla(blasint p, char *rout, char *form, ...); +#endif diff --git a/common.h b/common.h new file mode 100644 index 0000000..a481b2a --- /dev/null +++ b/common.h @@ -0,0 +1,610 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_H +#define COMMON_H + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#ifndef __USE_XOPEN +#define __USE_XOPEN +#endif + +#ifndef __USE_SVID +#define __USE_SVID +#endif + +#ifdef BUILD_KERNEL +#include "config_kernel.h" +#else +#include "config.h" +#endif + +#undef ENABLE_SSE_EXCEPTION + +#if defined(SMP_SERVER) || defined(SMP_ONDEMAND) +#define SMP +#endif + +#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_Interix) +#define WINDOWS_ABI +#define OS_WINDOWS + +#ifdef DOUBLE +#define DOUBLE_DEFINED DOUBLE +#undef DOUBLE +#endif +#endif + +#if !defined(NOINCLUDE) && !defined(ASSEMBLER) +#include +#include +#include +#include + +#ifdef OS_LINUX +#include +#include +#endif + +#ifdef OS_WINDOWS +#ifdef ATOM +#define GOTO_ATOM ATOM +#undef ATOM +#endif +#include +#include +#ifdef GOTO_ATOM +#define ATOM GOTO_ATOM +#undef GOTO_ATOM +#endif +#else +#include +#include +#include +#include +#include +#ifdef SMP +#include +#endif +#endif + +#if defined(OS_SUNOS) +#include +#endif + +#ifdef __DECC +#include +#include +#endif + +#if defined(ARCH_IA64) && defined(ENABLE_SSE_EXCEPTION) +#include +#endif + +#endif + +#if defined(OS_WINDOWS) && defined(DOUBLE_DEFINED) +#define DOUBLE DOUBLE_DEFINED +#undef DOUBLE_DEFINED +#endif + +#undef DEBUG_INFO +#define SMP_DEBUG +#undef MALLOC_DEBUG +#undef SMP_ALLOC_DEBUG + +#ifndef ZERO +#ifdef XDOUBLE +#define ZERO 0.e0L +#elif defined DOUBLE +#define ZERO 0.e0 +#else +#define ZERO 0.e0f +#endif +#endif + +#ifndef ONE +#ifdef XDOUBLE +#define ONE 1.e0L +#elif defined DOUBLE +#define ONE 1.e0 +#else +#define ONE 1.e0f +#endif +#endif + +#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) + +#define ALLOCA_ALIGN 63UL + +#define NUM_BUFFERS (MAX_CPU_NUMBER * 2) + +#ifdef NEEDBUNDERSCORE +#define BLASFUNC(FUNC) FUNC##_ +#else +#define BLASFUNC(FUNC) FUNC +#endif + +#undef USE_PTHREAD_LOCK +#undef USE_PTHREAD_SPINLOCK + +#if defined(USE_PTHREAD_LOCK) && defined(USE_PTHREAD_SPINLOCK) +#error "You can't specify both LOCK operation!" +#endif + +#ifdef SMP +#define USE_PTHREAD_LOCK +#undef USE_PTHREAD_SPINLOCK +#endif + +#ifdef OS_WINDOWS +#undef USE_PTHREAD_LOCK +#undef USE_PTHREAD_SPINLOCK +#endif + +#if defined(USE_PTHREAD_LOCK) +#define LOCK_COMMAND(x) pthread_mutex_lock(x) +#define UNLOCK_COMMAND(x) pthread_mutex_unlock(x) +#elif defined(USE_PTHREAD_SPINLOCK) +#ifndef ASSEMBLER +typedef volatile int pthread_spinlock_t; +int pthread_spin_lock (pthread_spinlock_t *__lock); +int pthread_spin_unlock (pthread_spinlock_t *__lock); +#endif +#define LOCK_COMMAND(x) pthread_spin_lock(x) +#define UNLOCK_COMMAND(x) pthread_spin_unlock(x) +#else +#define LOCK_COMMAND(x) blas_lock(x) +#define UNLOCK_COMMAND(x) blas_unlock(x) +#endif + +#define GOTO_SHMID 0x510510 + +#if 0 +#ifndef __CUDACC__ +#define __global__ +#define __device__ +#define __host__ +#define __shared__ +#endif +#endif + +#ifndef ASSEMBLER + +#ifdef QUAD_PRECISION +typedef struct { + unsigned long x[2]; +} xdouble; +#elif defined EXPRECISION +#define xdouble long double +#else +#define xdouble double +#endif + +#if defined(OS_WINDOWS) && defined(__64BIT__) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef USE64BITINT +typedef BLASLONG blasint; +#else +typedef int blasint; +#endif +#else +#ifdef USE64BITINT +#define INTSHIFT 3 +#define INTSIZE 8 +#else +#define INTSHIFT 2 +#define INTSIZE 4 +#endif +#endif + +#ifdef XDOUBLE +#define FLOAT xdouble +#ifdef QUAD_PRECISION +#define XFLOAT xidouble +#endif +#ifdef QUAD_PRECISION +#define SIZE 32 +#define BASE_SHIFT 5 +#define ZBASE_SHIFT 6 +#else +#define SIZE 16 +#define BASE_SHIFT 4 +#define ZBASE_SHIFT 5 +#endif +#elif defined(DOUBLE) +#define FLOAT double +#define SIZE 8 +#define BASE_SHIFT 3 +#define ZBASE_SHIFT 4 +#else +#define FLOAT float +#define SIZE 4 +#define BASE_SHIFT 2 +#define ZBASE_SHIFT 3 +#endif + +#ifndef XFLOAT +#define XFLOAT FLOAT +#endif + +#ifndef COMPLEX +#define COMPSIZE 1 +#else +#define COMPSIZE 2 +#endif + +#if defined(C_PGI) || defined(C_SUN) +#define CREAL(X) (*((FLOAT *)&X + 0)) +#define CIMAG(X) (*((FLOAT *)&X + 1)) +#else +#define CREAL __real__ +#define CIMAG __imag__ +#endif + +#define Address_H(x) (((x)+(1<<15))>>16) +#define Address_L(x) ((x)-((Address_H(x))<<16)) + +#ifndef MAX_CPU_NUMBER +#define MAX_CPU_NUMBER 2 +#endif + +#if defined(OS_SUNOS) +#define YIELDING thr_yield() +#endif + +#if defined(OS_WINDOWS) +#define YIELDING SwitchToThread() +#endif + +#ifndef YIELDING +#define YIELDING sched_yield() +#endif + +#ifdef QUAD_PRECISION +#include "common_quad.h" +#endif + +#ifdef ARCH_ALPHA +#include "common_alpha.h" +#endif + +#ifdef ARCH_X86 +#include "common_x86.h" +#endif + +#ifdef ARCH_X86_64 +#include "common_x86_64.h" +#endif + +#ifdef ARCH_IA64 +#include "common_ia64.h" +#endif + +#ifdef ARCH_POWER +#include "common_power.h" +#endif + +#ifdef sparc +#include "common_sparc.h" +#endif + +#ifdef ARCH_MIPS64 +#include "common_mips64.h" +#endif + +#ifdef OS_LINUX +#include "common_linux.h" +#endif + +#define MMAP_ACCESS (PROT_READ | PROT_WRITE) +#define MMAP_POLICY (MAP_PRIVATE | MAP_ANONYMOUS) + +#include "param.h" +#include "common_param.h" + +#ifndef STDERR +#define STDERR stderr +#endif + +#ifndef MASK +#define MASK(a, b) (((a) + ((b) - 1)) & ~((b) - 1)) +#endif + +#if defined(XDOUBLE) || defined(DOUBLE) +#define FLOATRET FLOAT +#else +#ifdef NEED_F2CCONV +#define FLOATRET double +#else +#define FLOATRET float +#endif +#endif + +#ifndef IFLUSH +#define IFLUSH +#endif + +#ifndef IFLUSH_HALF +#define IFLUSH_HALF +#endif + +#if defined(C_GCC) && (( __GNUC__ <= 3) || ((__GNUC__ == 4) && (__GNUC_MINOR__ < 2))) +#ifdef USE_OPENMP +#undef USE_OPENMP +#endif +#endif + +#ifndef ASSEMBLER + +#ifndef MIN +#define MIN(a,b) (a>b? b:a) +#endif + +#ifndef MAX +#define MAX(a,b) (a 0x60) (a) -= 0x20;} + +#if defined(__FreeBSD__) || defined(__APPLE__) +#define MAP_ANONYMOUS MAP_ANON +#endif + +/* Common Memory Management Routine */ +void blas_set_parameter(void); +int blas_get_cpu_number(void); +void *blas_memory_alloc (int); +void blas_memory_free (void *); + +int get_num_procs (void); + +#if defined(OS_LINUX) && defined(SMP) && !defined(NO_AFFINITY) +int get_num_nodes (void); +int get_num_proc (int); +int get_node_equal (void); +#endif + +void goto_set_num_threads(int); + +void gotoblas_affinity_init(void); +void gotoblas_affinity_quit(void); +void gotoblas_dynamic_init(void); +void gotoblas_dynamic_quit(void); +void gotoblas_profile_init(void); +void gotoblas_profile_quit(void); + +#ifdef USE_OPENMP +int omp_in_parallel(void); +int omp_get_num_procs(void); +#else +#ifdef __ELF__ +int omp_in_parallel (void) __attribute__ ((weak)); +int omp_get_num_procs(void) __attribute__ ((weak)); +#endif +#endif + +static __inline void blas_unlock(volatile BLASULONG *address){ + MB; + *address = 0; +} + +static __inline int readenv(char *env) { + + char *p; + + p = getenv(env); + + if (p == NULL) return 0; else return atoi(p); +} + + +#if !defined(XDOUBLE) || !defined(QUAD_PRECISION) + +static __inline void compinv(FLOAT *b, FLOAT ar, FLOAT ai){ + +#ifndef UNIT + FLOAT ratio, den; + + if ( +#ifdef XDOUBLE + (fabsl(ar)) >= (fabsl(ai)) +#elif defined DOUBLE + (fabs (ar)) >= (fabs (ai)) +#else + (fabsf(ar)) >= (fabsf(ai)) +#endif + ) { + ratio = ai / ar; + den = (FLOAT)(ONE / (ar * (ONE + ratio * ratio))); + ar = den; + ai = -ratio * den; + } else { + ratio = ar / ai; + den = (FLOAT)(ONE /(ai * (ONE + ratio * ratio))); + ar = ratio * den; + ai = -den; + } + b[0] = ar; + b[1] = ai; +#else + b[0] = ONE; + b[1] = ZERO; +#endif + +} +#endif + +#ifdef MALLOC_DEBUG +void *blas_debug_alloc(int); +void *blas_debug_free(void *); +#undef malloc +#undef free +#define malloc(a) blas_debug_alloc(a) +#define free(a) blas_debug_free (a) +#endif + +#ifndef COPYOVERHEAD +#define GEMMRETTYPE int +#else + +typedef struct { + double outercopy; + double innercopy; + double kernel; + double mflops; +} copyoverhead_t; + +#define GEMMRETTYPE copyoverhead_t +#endif +#endif + +#ifndef BUILD_KERNEL +#define KNAME(A, B) A +#else +#define KNAME(A, B) A##B +#endif + +#include "common_interface.h" +#ifdef SANITY_CHECK +#include "common_reference.h" +#endif +#include "common_macro.h" +#include "common_level1.h" +#include "common_level2.h" +#include "common_level3.h" +#include "common_lapack.h" +#ifdef CBLAS +#include "cblas.h" +#endif + +#ifndef ASSEMBLER +#if 0 +#include "symcopy.h" +#endif + +#if defined(SMP_SERVER) && defined(SMP_ONDEMAND) +#error Both SMP_SERVER and SMP_ONDEMAND are specified. +#endif + +#if defined(SMP_SERVER) || defined(SMP_ONDEMAND) +#include "common_thread.h" +#endif + +#endif + +#define INFO_NUM 99 + +#ifndef DEFAULT_CPU_NUMBER +#define DEFAULT_CPU_NUMBER 4 +#endif + +#ifndef IDEBUG_START +#define IDEBUG_START +#endif + +#ifndef IDEBUG_END +#define IDEBUG_END +#endif + +#if !defined(ASSEMBLER) && defined(FUNCTION_PROFILE) + +typedef struct { + int func; + unsigned long long calls, fops, area, cycles, tcycles; +} func_profile_t; + +extern func_profile_t function_profile_table[]; +extern int gotoblas_profile; + +#ifdef XDOUBLE +#define NUMOPT QNUMOPT +#elif defined DOUBLE +#define NUMOPT DNUMOPT +#else +#define NUMOPT SNUMOPT +#endif + +#define FUNCTION_PROFILE_START() { unsigned long long profile_start = rpcc(), profile_end; +#ifdef SMP +#define FUNCTION_PROFILE_END(COMP, AREA, OPS) \ + if (gotoblas_profile) { \ + profile_end = rpcc(); \ + function_profile_table[PROFILE_FUNC_NAME].calls ++; \ + function_profile_table[PROFILE_FUNC_NAME].area += SIZE * COMPSIZE * (AREA); \ + function_profile_table[PROFILE_FUNC_NAME].fops += (COMP) * (OPS) / NUMOPT; \ + function_profile_table[PROFILE_FUNC_NAME].cycles += (profile_end - profile_start); \ + function_profile_table[PROFILE_FUNC_NAME].tcycles += blas_cpu_number * (profile_end - profile_start); \ + } \ + } +#else +#define FUNCTION_PROFILE_END(COMP, AREA, OPS) \ + if (gotoblas_profile) { \ + profile_end = rpcc(); \ + function_profile_table[PROFILE_FUNC_NAME].calls ++; \ + function_profile_table[PROFILE_FUNC_NAME].area += SIZE * COMPSIZE * (AREA); \ + function_profile_table[PROFILE_FUNC_NAME].fops += (COMP) * (OPS) / NUMOPT; \ + function_profile_table[PROFILE_FUNC_NAME].cycles += (profile_end - profile_start); \ + function_profile_table[PROFILE_FUNC_NAME].tcycles += (profile_end - profile_start); \ + } \ + } +#endif + +#else +#define FUNCTION_PROFILE_START() +#define FUNCTION_PROFILE_END(COMP, AREA, OPS) +#endif + +#if 1 +#define PRINT_DEBUG_CNAME +#define PRINT_DEBUG_NAME +#else +#define PRINT_DEBUG_CNAME if (readenv("GOTO_DEBUG")) fprintf(stderr, "GotoBLAS : %s\n", CHAR_CNAME) +#define PRINT_DEBUG_NAME if (readenv("GOTO_DEBUG")) fprintf(stderr, "GotoBLAS : %s\n", CHAR_NAME) +#endif + +#endif diff --git a/common_alpha.h b/common_alpha.h new file mode 100644 index 0000000..cf79473 --- /dev/null +++ b/common_alpha.h @@ -0,0 +1,179 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_ALPHA +#define COMMON_ALPHA + +#ifndef ASSEMBLER + +#define MB asm("mb") +#define WMB asm("wmb") + +static void __inline blas_lock(unsigned long *address){ +#ifndef __DECC + unsigned long tmp1, tmp2; + asm volatile( + "1: ldq %1, %0\n" + " bne %1, 2f\n" + " ldq_l %1, %0\n" + " bne %1, 2f\n" + " or %1, 1, %2\n" + " stq_c %2, %0\n" + " beq %2, 2f\n" + " mb\n " + " br $31, 3f\n" + "2: br $31, 1b\n" + "3:\n" : "=m"(*address), "=&r"(tmp1), "=&r"(tmp2) : : "memory"); +#else + asm ( + "10:" + " ldq %t0, 0(%a0); " + " bne %t0, 20f; " + " ldq_l %t0, 0(%a0); " + " bne %t0, 20f; " + " or %t0, 1, %t1;" + " stq_c %t1, 0(%a0); " + " beq %t1, 20f; " + " mb; " + " br %r31,30f; " + "20: " + " br %r31,10b; " + "30:", address); +#endif +} + +static __inline unsigned int rpcc(void){ + + unsigned int r0; + +#ifndef __DECC + asm __volatile__("rpcc %0" : "=r"(r0) : : "memory"); +#else + r0 = asm("rpcc %v0"); +#endif + + return r0; +} + + +#define HALT ldq $0, 0($0) + +#ifndef __DECC +#define GET_IMAGE(res) asm __volatile__("fmov $f1, %0" : "=f"(res) : : "memory") +#else +#define GET_IMAGE(res) res = dasm("fmov $f1, %f0") +#endif + +#ifdef SMP +#ifdef USE64BITINT +static __inline long blas_quickdivide(long x, long y){ + return x/y; +} +#else +extern unsigned int blas_quick_divide_table[]; + +static __inline int blas_quickdivide(unsigned int x, unsigned int y){ + if (y <= 1) return x; + return (int)((x * (unsigned long)blas_quick_divide_table[y]) >> 32); +} +#endif +#endif + +#define BASE_ADDRESS ((0x1b0UL << 33) | (0x1c0UL << 23) | (0x000UL << 13)) + +#ifndef PAGESIZE +#define PAGESIZE ( 8UL << 10) +#define HUGE_PAGESIZE ( 4 << 20) +#endif +#define BUFFER_SIZE (32UL << 20) + +#else + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif + +#define PROLOGUE \ + .arch ev6; \ + .set noat; \ + .set noreorder; \ +.text; \ + .align 5; \ + .globl REALNAME; \ + .ent REALNAME; \ +REALNAME: + +#ifdef PROFILE +#define PROFCODE \ + ldgp $gp, 0($27); \ + lda $28, _mcount; \ + jsr $28, ($28), _mcount; \ + .prologue 1 +#else +#define PROFCODE .prologue 0 +#endif + +#define EPILOGUE \ + .end REALNAME; \ + .ident VERSION +#endif + +#ifdef DOUBLE +#define SXADDQ s8addq +#define SXSUBL s8subl +#define LD ldt +#define ST stt +#define STQ stq +#define ADD addt/su +#define SUB subt/su +#define MUL mult/su +#define DIV divt/su +#else +#define SXADDQ s4addq +#define SXSUBL s4subl +#define LD lds +#define ST sts +#define STQ stl +#define ADD adds/su +#define SUB subs/su +#define MUL muls/su +#define DIV divs/su +#endif +#endif diff --git a/common_c.h b/common_c.h new file mode 100644 index 0000000..f78f172 --- /dev/null +++ b/common_c.h @@ -0,0 +1,611 @@ +#ifndef COMMON_C_H +#define COMMON_C_H + +#ifndef DYNAMIC_ARCH + +#define CAMAX_K camax_k +#define CAMIN_K camin_k +#define CMAX_K cmax_k +#define CMIN_K cmin_k +#define ICAMAX_K icamax_k +#define ICAMIN_K icamin_k +#define ICMAX_K icmax_k +#define ICMIN_K icmin_k +#define CASUM_K casum_k +#define CAXPYU_K caxpy_k +#define CAXPYC_K caxpyc_k +#define CCOPY_K ccopy_k +#define CDOTU_K cdotu_k +#define CDOTC_K cdotc_k +#define CNRM2_K cnrm2_k +#define CSCAL_K cscal_k +#define CSWAP_K cswap_k +#define CROT_K csrot_k + +#define CGEMV_N cgemv_n +#define CGEMV_T cgemv_t +#define CGEMV_R cgemv_r +#define CGEMV_C cgemv_c +#define CGEMV_O cgemv_o +#define CGEMV_U cgemv_u +#define CGEMV_S cgemv_s +#define CGEMV_D cgemv_d + +#define CGERU_K cgeru_k +#define CGERC_K cgerc_k +#define CGERV_K cgerv_k +#define CGERD_K cgerd_k + +#define CSYMV_U csymv_U +#define CSYMV_L csymv_L +#define CHEMV_U chemv_U +#define CHEMV_L chemv_L +#define CHEMV_V chemv_V +#define CHEMV_M chemv_M + +#define CSYMV_THREAD_U csymv_thread_U +#define CSYMV_THREAD_L csymv_thread_L +#define CHEMV_THREAD_U chemv_thread_U +#define CHEMV_THREAD_L chemv_thread_L +#define CHEMV_THREAD_V chemv_thread_V +#define CHEMV_THREAD_M chemv_thread_M + +#define CGEMM_ONCOPY cgemm_oncopy +#define CGEMM_OTCOPY cgemm_otcopy + +#if CGEMM_DEFAULT_UNROLL_M == CGEMM_DEFAULT_UNROLL_N +#define CGEMM_INCOPY cgemm_oncopy +#define CGEMM_ITCOPY cgemm_otcopy +#else +#define CGEMM_INCOPY cgemm_incopy +#define CGEMM_ITCOPY cgemm_itcopy +#endif + +#define CTRMM_OUNUCOPY ctrmm_ounucopy +#define CTRMM_OUNNCOPY ctrmm_ounncopy +#define CTRMM_OUTUCOPY ctrmm_outucopy +#define CTRMM_OUTNCOPY ctrmm_outncopy +#define CTRMM_OLNUCOPY ctrmm_olnucopy +#define CTRMM_OLNNCOPY ctrmm_olnncopy +#define CTRMM_OLTUCOPY ctrmm_oltucopy +#define CTRMM_OLTNCOPY ctrmm_oltncopy + +#define CTRSM_OUNUCOPY ctrsm_ounucopy +#define CTRSM_OUNNCOPY ctrsm_ounncopy +#define CTRSM_OUTUCOPY ctrsm_outucopy +#define CTRSM_OUTNCOPY ctrsm_outncopy +#define CTRSM_OLNUCOPY ctrsm_olnucopy +#define CTRSM_OLNNCOPY ctrsm_olnncopy +#define CTRSM_OLTUCOPY ctrsm_oltucopy +#define CTRSM_OLTNCOPY ctrsm_oltncopy + +#if CGEMM_DEFAULT_UNROLL_M == CGEMM_DEFAULT_UNROLL_N +#define CTRMM_IUNUCOPY ctrmm_ounucopy +#define CTRMM_IUNNCOPY ctrmm_ounncopy +#define CTRMM_IUTUCOPY ctrmm_outucopy +#define CTRMM_IUTNCOPY ctrmm_outncopy +#define CTRMM_ILNUCOPY ctrmm_olnucopy +#define CTRMM_ILNNCOPY ctrmm_olnncopy +#define CTRMM_ILTUCOPY ctrmm_oltucopy +#define CTRMM_ILTNCOPY ctrmm_oltncopy + +#define CTRSM_IUNUCOPY ctrsm_ounucopy +#define CTRSM_IUNNCOPY ctrsm_ounncopy +#define CTRSM_IUTUCOPY ctrsm_outucopy +#define CTRSM_IUTNCOPY ctrsm_outncopy +#define CTRSM_ILNUCOPY ctrsm_olnucopy +#define CTRSM_ILNNCOPY ctrsm_olnncopy +#define CTRSM_ILTUCOPY ctrsm_oltucopy +#define CTRSM_ILTNCOPY ctrsm_oltncopy +#else +#define CTRMM_IUNUCOPY ctrmm_iunucopy +#define CTRMM_IUNNCOPY ctrmm_iunncopy +#define CTRMM_IUTUCOPY ctrmm_iutucopy +#define CTRMM_IUTNCOPY ctrmm_iutncopy +#define CTRMM_ILNUCOPY ctrmm_ilnucopy +#define CTRMM_ILNNCOPY ctrmm_ilnncopy +#define CTRMM_ILTUCOPY ctrmm_iltucopy +#define CTRMM_ILTNCOPY ctrmm_iltncopy + +#define CTRSM_IUNUCOPY ctrsm_iunucopy +#define CTRSM_IUNNCOPY ctrsm_iunncopy +#define CTRSM_IUTUCOPY ctrsm_iutucopy +#define CTRSM_IUTNCOPY ctrsm_iutncopy +#define CTRSM_ILNUCOPY ctrsm_ilnucopy +#define CTRSM_ILNNCOPY ctrsm_ilnncopy +#define CTRSM_ILTUCOPY ctrsm_iltucopy +#define CTRSM_ILTNCOPY ctrsm_iltncopy +#endif + +#define CGEMM_BETA cgemm_beta + +#define CGEMM_KERNEL_N cgemm_kernel_n +#define CGEMM_KERNEL_L cgemm_kernel_l +#define CGEMM_KERNEL_R cgemm_kernel_r +#define CGEMM_KERNEL_B cgemm_kernel_b + +#define CTRMM_KERNEL_LN ctrmm_kernel_LN +#define CTRMM_KERNEL_LT ctrmm_kernel_LT +#define CTRMM_KERNEL_LR ctrmm_kernel_LR +#define CTRMM_KERNEL_LC ctrmm_kernel_LC +#define CTRMM_KERNEL_RN ctrmm_kernel_RN +#define CTRMM_KERNEL_RT ctrmm_kernel_RT +#define CTRMM_KERNEL_RR ctrmm_kernel_RR +#define CTRMM_KERNEL_RC ctrmm_kernel_RC + +#define CTRSM_KERNEL_LN ctrsm_kernel_LN +#define CTRSM_KERNEL_LT ctrsm_kernel_LT +#define CTRSM_KERNEL_LR ctrsm_kernel_LR +#define CTRSM_KERNEL_LC ctrsm_kernel_LC +#define CTRSM_KERNEL_RN ctrsm_kernel_RN +#define CTRSM_KERNEL_RT ctrsm_kernel_RT +#define CTRSM_KERNEL_RR ctrsm_kernel_RR +#define CTRSM_KERNEL_RC ctrsm_kernel_RC + +#define CSYMM_OUTCOPY csymm_outcopy +#define CSYMM_OLTCOPY csymm_oltcopy +#if CGEMM_DEFAULT_UNROLL_M == CGEMM_DEFAULT_UNROLL_N +#define CSYMM_IUTCOPY csymm_outcopy +#define CSYMM_ILTCOPY csymm_oltcopy +#else +#define CSYMM_IUTCOPY csymm_iutcopy +#define CSYMM_ILTCOPY csymm_iltcopy +#endif + +#define CHEMM_OUTCOPY chemm_outcopy +#define CHEMM_OLTCOPY chemm_oltcopy +#if CGEMM_DEFAULT_UNROLL_M == CGEMM_DEFAULT_UNROLL_N +#define CHEMM_IUTCOPY chemm_outcopy +#define CHEMM_ILTCOPY chemm_oltcopy +#else +#define CHEMM_IUTCOPY chemm_iutcopy +#define CHEMM_ILTCOPY chemm_iltcopy +#endif + +#define CGEMM3M_ONCOPYB cgemm3m_oncopyb +#define CGEMM3M_ONCOPYR cgemm3m_oncopyr +#define CGEMM3M_ONCOPYI cgemm3m_oncopyi +#define CGEMM3M_OTCOPYB cgemm3m_otcopyb +#define CGEMM3M_OTCOPYR cgemm3m_otcopyr +#define CGEMM3M_OTCOPYI cgemm3m_otcopyi + +#define CGEMM3M_INCOPYB cgemm3m_incopyb +#define CGEMM3M_INCOPYR cgemm3m_incopyr +#define CGEMM3M_INCOPYI cgemm3m_incopyi +#define CGEMM3M_ITCOPYB cgemm3m_itcopyb +#define CGEMM3M_ITCOPYR cgemm3m_itcopyr +#define CGEMM3M_ITCOPYI cgemm3m_itcopyi + +#define CSYMM3M_ILCOPYB csymm3m_ilcopyb +#define CSYMM3M_IUCOPYB csymm3m_iucopyb +#define CSYMM3M_ILCOPYR csymm3m_ilcopyr +#define CSYMM3M_IUCOPYR csymm3m_iucopyr +#define CSYMM3M_ILCOPYI csymm3m_ilcopyi +#define CSYMM3M_IUCOPYI csymm3m_iucopyi + +#define CSYMM3M_OLCOPYB csymm3m_olcopyb +#define CSYMM3M_OUCOPYB csymm3m_oucopyb +#define CSYMM3M_OLCOPYR csymm3m_olcopyr +#define CSYMM3M_OUCOPYR csymm3m_oucopyr +#define CSYMM3M_OLCOPYI csymm3m_olcopyi +#define CSYMM3M_OUCOPYI csymm3m_oucopyi + +#define CHEMM3M_ILCOPYB chemm3m_ilcopyb +#define CHEMM3M_IUCOPYB chemm3m_iucopyb +#define CHEMM3M_ILCOPYR chemm3m_ilcopyr +#define CHEMM3M_IUCOPYR chemm3m_iucopyr +#define CHEMM3M_ILCOPYI chemm3m_ilcopyi +#define CHEMM3M_IUCOPYI chemm3m_iucopyi + +#define CHEMM3M_OLCOPYB chemm3m_olcopyb +#define CHEMM3M_OUCOPYB chemm3m_oucopyb +#define CHEMM3M_OLCOPYR chemm3m_olcopyr +#define CHEMM3M_OUCOPYR chemm3m_oucopyr +#define CHEMM3M_OLCOPYI chemm3m_olcopyi +#define CHEMM3M_OUCOPYI chemm3m_oucopyi + +#define CGEMM3M_KERNEL cgemm3m_kernel + +#define CNEG_TCOPY cneg_tcopy +#define CLASWP_NCOPY claswp_ncopy + +#else + +#define CAMAX_K gotoblas -> camax_k +#define CAMIN_K gotoblas -> camin_k +#define CMAX_K gotoblas -> cmax_k +#define CMIN_K gotoblas -> cmin_k +#define ICAMAX_K gotoblas -> icamax_k +#define ICAMIN_K gotoblas -> icamin_k +#define ICMAX_K gotoblas -> icmax_k +#define ICMIN_K gotoblas -> icmin_k +#define CASUM_K gotoblas -> casum_k +#define CAXPYU_K gotoblas -> caxpy_k +#define CAXPYC_K gotoblas -> caxpyc_k +#define CCOPY_K gotoblas -> ccopy_k +#define CDOTU_K gotoblas -> cdotu_k +#define CDOTC_K gotoblas -> cdotc_k +#define CNRM2_K gotoblas -> cnrm2_k +#define CSCAL_K gotoblas -> cscal_k +#define CSWAP_K gotoblas -> cswap_k +#define CROT_K gotoblas -> csrot_k + +#define CGEMV_N gotoblas -> cgemv_n +#define CGEMV_T gotoblas -> cgemv_t +#define CGEMV_R gotoblas -> cgemv_r +#define CGEMV_C gotoblas -> cgemv_c +#define CGEMV_O gotoblas -> cgemv_o +#define CGEMV_U gotoblas -> cgemv_u +#define CGEMV_S gotoblas -> cgemv_s +#define CGEMV_D gotoblas -> cgemv_d + +#define CGERU_K gotoblas -> cgeru_k +#define CGERC_K gotoblas -> cgerc_k +#define CGERV_K gotoblas -> cgerv_k +#define CGERD_K gotoblas -> cgerd_k + +#define CSYMV_U gotoblas -> csymv_U +#define CSYMV_L gotoblas -> csymv_L +#define CHEMV_U gotoblas -> chemv_U +#define CHEMV_L gotoblas -> chemv_L +#define CHEMV_V gotoblas -> chemv_V +#define CHEMV_M gotoblas -> chemv_M + +#define CSYMV_THREAD_U csymv_thread_U +#define CSYMV_THREAD_L csymv_thread_L +#define CHEMV_THREAD_U chemv_thread_U +#define CHEMV_THREAD_L chemv_thread_L +#define CHEMV_THREAD_V chemv_thread_V +#define CHEMV_THREAD_M chemv_thread_M + +#define CGEMM_ONCOPY gotoblas -> cgemm_oncopy +#define CGEMM_OTCOPY gotoblas -> cgemm_otcopy +#define CGEMM_INCOPY gotoblas -> cgemm_incopy +#define CGEMM_ITCOPY gotoblas -> cgemm_itcopy + +#define CTRMM_OUNUCOPY gotoblas -> ctrmm_ounucopy +#define CTRMM_OUTUCOPY gotoblas -> ctrmm_outucopy +#define CTRMM_OLNUCOPY gotoblas -> ctrmm_olnucopy +#define CTRMM_OLTUCOPY gotoblas -> ctrmm_oltucopy +#define CTRSM_OUNUCOPY gotoblas -> ctrsm_ounucopy +#define CTRSM_OUTUCOPY gotoblas -> ctrsm_outucopy +#define CTRSM_OLNUCOPY gotoblas -> ctrsm_olnucopy +#define CTRSM_OLTUCOPY gotoblas -> ctrsm_oltucopy + +#define CTRMM_IUNUCOPY gotoblas -> ctrmm_iunucopy +#define CTRMM_IUTUCOPY gotoblas -> ctrmm_iutucopy +#define CTRMM_ILNUCOPY gotoblas -> ctrmm_ilnucopy +#define CTRMM_ILTUCOPY gotoblas -> ctrmm_iltucopy +#define CTRSM_IUNUCOPY gotoblas -> ctrsm_iunucopy +#define CTRSM_IUTUCOPY gotoblas -> ctrsm_iutucopy +#define CTRSM_ILNUCOPY gotoblas -> ctrsm_ilnucopy +#define CTRSM_ILTUCOPY gotoblas -> ctrsm_iltucopy + +#define CTRMM_OUNNCOPY gotoblas -> ctrmm_ounncopy +#define CTRMM_OUTNCOPY gotoblas -> ctrmm_outncopy +#define CTRMM_OLNNCOPY gotoblas -> ctrmm_olnncopy +#define CTRMM_OLTNCOPY gotoblas -> ctrmm_oltncopy +#define CTRSM_OUNNCOPY gotoblas -> ctrsm_ounncopy +#define CTRSM_OUTNCOPY gotoblas -> ctrsm_outncopy +#define CTRSM_OLNNCOPY gotoblas -> ctrsm_olnncopy +#define CTRSM_OLTNCOPY gotoblas -> ctrsm_oltncopy + +#define CTRMM_IUNNCOPY gotoblas -> ctrmm_iunncopy +#define CTRMM_IUTNCOPY gotoblas -> ctrmm_iutncopy +#define CTRMM_ILNNCOPY gotoblas -> ctrmm_ilnncopy +#define CTRMM_ILTNCOPY gotoblas -> ctrmm_iltncopy +#define CTRSM_IUNNCOPY gotoblas -> ctrsm_iunncopy +#define CTRSM_IUTNCOPY gotoblas -> ctrsm_iutncopy +#define CTRSM_ILNNCOPY gotoblas -> ctrsm_ilnncopy +#define CTRSM_ILTNCOPY gotoblas -> ctrsm_iltncopy + +#define CGEMM_BETA gotoblas -> cgemm_beta +#define CGEMM_KERNEL_N gotoblas -> cgemm_kernel_n +#define CGEMM_KERNEL_L gotoblas -> cgemm_kernel_l +#define CGEMM_KERNEL_R gotoblas -> cgemm_kernel_r +#define CGEMM_KERNEL_B gotoblas -> cgemm_kernel_b + +#define CTRMM_KERNEL_LN gotoblas -> ctrmm_kernel_LN +#define CTRMM_KERNEL_LT gotoblas -> ctrmm_kernel_LT +#define CTRMM_KERNEL_LR gotoblas -> ctrmm_kernel_LR +#define CTRMM_KERNEL_LC gotoblas -> ctrmm_kernel_LC +#define CTRMM_KERNEL_RN gotoblas -> ctrmm_kernel_RN +#define CTRMM_KERNEL_RT gotoblas -> ctrmm_kernel_RT +#define CTRMM_KERNEL_RR gotoblas -> ctrmm_kernel_RR +#define CTRMM_KERNEL_RC gotoblas -> ctrmm_kernel_RC + +#define CTRSM_KERNEL_LN gotoblas -> ctrsm_kernel_LN +#define CTRSM_KERNEL_LT gotoblas -> ctrsm_kernel_LT +#define CTRSM_KERNEL_LR gotoblas -> ctrsm_kernel_LR +#define CTRSM_KERNEL_LC gotoblas -> ctrsm_kernel_LC +#define CTRSM_KERNEL_RN gotoblas -> ctrsm_kernel_RN +#define CTRSM_KERNEL_RT gotoblas -> ctrsm_kernel_RT +#define CTRSM_KERNEL_RR gotoblas -> ctrsm_kernel_RR +#define CTRSM_KERNEL_RC gotoblas -> ctrsm_kernel_RC + +#define CSYMM_IUTCOPY gotoblas -> csymm_iutcopy +#define CSYMM_ILTCOPY gotoblas -> csymm_iltcopy +#define CSYMM_OUTCOPY gotoblas -> csymm_outcopy +#define CSYMM_OLTCOPY gotoblas -> csymm_oltcopy + +#define CHEMM_OUTCOPY gotoblas -> chemm_outcopy +#define CHEMM_OLTCOPY gotoblas -> chemm_oltcopy +#define CHEMM_IUTCOPY gotoblas -> chemm_iutcopy +#define CHEMM_ILTCOPY gotoblas -> chemm_iltcopy + +#define CGEMM3M_ONCOPYB gotoblas -> cgemm3m_oncopyb +#define CGEMM3M_ONCOPYR gotoblas -> cgemm3m_oncopyr +#define CGEMM3M_ONCOPYI gotoblas -> cgemm3m_oncopyi +#define CGEMM3M_OTCOPYB gotoblas -> cgemm3m_otcopyb +#define CGEMM3M_OTCOPYR gotoblas -> cgemm3m_otcopyr +#define CGEMM3M_OTCOPYI gotoblas -> cgemm3m_otcopyi + +#define CGEMM3M_INCOPYB gotoblas -> cgemm3m_incopyb +#define CGEMM3M_INCOPYR gotoblas -> cgemm3m_incopyr +#define CGEMM3M_INCOPYI gotoblas -> cgemm3m_incopyi +#define CGEMM3M_ITCOPYB gotoblas -> cgemm3m_itcopyb +#define CGEMM3M_ITCOPYR gotoblas -> cgemm3m_itcopyr +#define CGEMM3M_ITCOPYI gotoblas -> cgemm3m_itcopyi + +#define CSYMM3M_ILCOPYB gotoblas -> csymm3m_ilcopyb +#define CSYMM3M_IUCOPYB gotoblas -> csymm3m_iucopyb +#define CSYMM3M_ILCOPYR gotoblas -> csymm3m_ilcopyr +#define CSYMM3M_IUCOPYR gotoblas -> csymm3m_iucopyr +#define CSYMM3M_ILCOPYI gotoblas -> csymm3m_ilcopyi +#define CSYMM3M_IUCOPYI gotoblas -> csymm3m_iucopyi + +#define CSYMM3M_OLCOPYB gotoblas -> csymm3m_olcopyb +#define CSYMM3M_OUCOPYB gotoblas -> csymm3m_oucopyb +#define CSYMM3M_OLCOPYR gotoblas -> csymm3m_olcopyr +#define CSYMM3M_OUCOPYR gotoblas -> csymm3m_oucopyr +#define CSYMM3M_OLCOPYI gotoblas -> csymm3m_olcopyi +#define CSYMM3M_OUCOPYI gotoblas -> csymm3m_oucopyi + +#define CHEMM3M_ILCOPYB gotoblas -> chemm3m_ilcopyb +#define CHEMM3M_IUCOPYB gotoblas -> chemm3m_iucopyb +#define CHEMM3M_ILCOPYR gotoblas -> chemm3m_ilcopyr +#define CHEMM3M_IUCOPYR gotoblas -> chemm3m_iucopyr +#define CHEMM3M_ILCOPYI gotoblas -> chemm3m_ilcopyi +#define CHEMM3M_IUCOPYI gotoblas -> chemm3m_iucopyi + +#define CHEMM3M_OLCOPYB gotoblas -> chemm3m_olcopyb +#define CHEMM3M_OUCOPYB gotoblas -> chemm3m_oucopyb +#define CHEMM3M_OLCOPYR gotoblas -> chemm3m_olcopyr +#define CHEMM3M_OUCOPYR gotoblas -> chemm3m_oucopyr +#define CHEMM3M_OLCOPYI gotoblas -> chemm3m_olcopyi +#define CHEMM3M_OUCOPYI gotoblas -> chemm3m_oucopyi + +#define CGEMM3M_KERNEL gotoblas -> cgemm3m_kernel + +#define CNEG_TCOPY gotoblas -> cneg_tcopy +#define CLASWP_NCOPY gotoblas -> claswp_ncopy + +#endif + +#define CGEMM_NN cgemm_nn +#define CGEMM_CN cgemm_cn +#define CGEMM_TN cgemm_tn +#define CGEMM_NC cgemm_nc +#define CGEMM_NT cgemm_nt +#define CGEMM_CC cgemm_cc +#define CGEMM_CT cgemm_ct +#define CGEMM_TC cgemm_tc +#define CGEMM_TT cgemm_tt +#define CGEMM_NR cgemm_nr +#define CGEMM_TR cgemm_tr +#define CGEMM_CR cgemm_cr +#define CGEMM_RN cgemm_rn +#define CGEMM_RT cgemm_rt +#define CGEMM_RC cgemm_rc +#define CGEMM_RR cgemm_rr + +#define CSYMM_LU csymm_LU +#define CSYMM_LL csymm_LL +#define CSYMM_RU csymm_RU +#define CSYMM_RL csymm_RL + +#define CHEMM_LU chemm_LU +#define CHEMM_LL chemm_LL +#define CHEMM_RU chemm_RU +#define CHEMM_RL chemm_RL + +#define CSYRK_UN csyrk_UN +#define CSYRK_UT csyrk_UT +#define CSYRK_LN csyrk_LN +#define CSYRK_LT csyrk_LT +#define CSYRK_UR csyrk_UN +#define CSYRK_UC csyrk_UT +#define CSYRK_LR csyrk_LN +#define CSYRK_LC csyrk_LT + +#define CSYRK_KERNEL_U csyrk_kernel_U +#define CSYRK_KERNEL_L csyrk_kernel_L + +#define CHERK_UN cherk_UN +#define CHERK_LN cherk_LN +#define CHERK_UC cherk_UC +#define CHERK_LC cherk_LC + +#define CHER2K_UN cher2k_UN +#define CHER2K_LN cher2k_LN +#define CHER2K_UC cher2k_UC +#define CHER2K_LC cher2k_LC + +#define CSYR2K_UN csyr2k_UN +#define CSYR2K_UT csyr2k_UT +#define CSYR2K_LN csyr2k_LN +#define CSYR2K_LT csyr2k_LT +#define CSYR2K_UR csyr2k_UN +#define CSYR2K_UC csyr2k_UT +#define CSYR2K_LR csyr2k_LN +#define CSYR2K_LC csyr2k_LT + +#define CSYR2K_KERNEL_U csyr2k_kernel_U +#define CSYR2K_KERNEL_L csyr2k_kernel_L + +#define CTRMM_LNUU ctrmm_LNUU +#define CTRMM_LNUN ctrmm_LNUN +#define CTRMM_LNLU ctrmm_LNLU +#define CTRMM_LNLN ctrmm_LNLN +#define CTRMM_LTUU ctrmm_LTUU +#define CTRMM_LTUN ctrmm_LTUN +#define CTRMM_LTLU ctrmm_LTLU +#define CTRMM_LTLN ctrmm_LTLN +#define CTRMM_LRUU ctrmm_LRUU +#define CTRMM_LRUN ctrmm_LRUN +#define CTRMM_LRLU ctrmm_LRLU +#define CTRMM_LRLN ctrmm_LRLN +#define CTRMM_LCUU ctrmm_LCUU +#define CTRMM_LCUN ctrmm_LCUN +#define CTRMM_LCLU ctrmm_LCLU +#define CTRMM_LCLN ctrmm_LCLN +#define CTRMM_RNUU ctrmm_RNUU +#define CTRMM_RNUN ctrmm_RNUN +#define CTRMM_RNLU ctrmm_RNLU +#define CTRMM_RNLN ctrmm_RNLN +#define CTRMM_RTUU ctrmm_RTUU +#define CTRMM_RTUN ctrmm_RTUN +#define CTRMM_RTLU ctrmm_RTLU +#define CTRMM_RTLN ctrmm_RTLN +#define CTRMM_RRUU ctrmm_RRUU +#define CTRMM_RRUN ctrmm_RRUN +#define CTRMM_RRLU ctrmm_RRLU +#define CTRMM_RRLN ctrmm_RRLN +#define CTRMM_RCUU ctrmm_RCUU +#define CTRMM_RCUN ctrmm_RCUN +#define CTRMM_RCLU ctrmm_RCLU +#define CTRMM_RCLN ctrmm_RCLN + +#define CTRSM_LNUU ctrsm_LNUU +#define CTRSM_LNUN ctrsm_LNUN +#define CTRSM_LNLU ctrsm_LNLU +#define CTRSM_LNLN ctrsm_LNLN +#define CTRSM_LTUU ctrsm_LTUU +#define CTRSM_LTUN ctrsm_LTUN +#define CTRSM_LTLU ctrsm_LTLU +#define CTRSM_LTLN ctrsm_LTLN +#define CTRSM_LRUU ctrsm_LRUU +#define CTRSM_LRUN ctrsm_LRUN +#define CTRSM_LRLU ctrsm_LRLU +#define CTRSM_LRLN ctrsm_LRLN +#define CTRSM_LCUU ctrsm_LCUU +#define CTRSM_LCUN ctrsm_LCUN +#define CTRSM_LCLU ctrsm_LCLU +#define CTRSM_LCLN ctrsm_LCLN +#define CTRSM_RNUU ctrsm_RNUU +#define CTRSM_RNUN ctrsm_RNUN +#define CTRSM_RNLU ctrsm_RNLU +#define CTRSM_RNLN ctrsm_RNLN +#define CTRSM_RTUU ctrsm_RTUU +#define CTRSM_RTUN ctrsm_RTUN +#define CTRSM_RTLU ctrsm_RTLU +#define CTRSM_RTLN ctrsm_RTLN +#define CTRSM_RRUU ctrsm_RRUU +#define CTRSM_RRUN ctrsm_RRUN +#define CTRSM_RRLU ctrsm_RRLU +#define CTRSM_RRLN ctrsm_RRLN +#define CTRSM_RCUU ctrsm_RCUU +#define CTRSM_RCUN ctrsm_RCUN +#define CTRSM_RCLU ctrsm_RCLU +#define CTRSM_RCLN ctrsm_RCLN + +#define CGEMM_THREAD_NN cgemm_thread_nn +#define CGEMM_THREAD_CN cgemm_thread_cn +#define CGEMM_THREAD_TN cgemm_thread_tn +#define CGEMM_THREAD_NC cgemm_thread_nc +#define CGEMM_THREAD_NT cgemm_thread_nt +#define CGEMM_THREAD_CC cgemm_thread_cc +#define CGEMM_THREAD_CT cgemm_thread_ct +#define CGEMM_THREAD_TC cgemm_thread_tc +#define CGEMM_THREAD_TT cgemm_thread_tt +#define CGEMM_THREAD_NR cgemm_thread_nr +#define CGEMM_THREAD_TR cgemm_thread_tr +#define CGEMM_THREAD_CR cgemm_thread_cr +#define CGEMM_THREAD_RN cgemm_thread_rn +#define CGEMM_THREAD_RT cgemm_thread_rt +#define CGEMM_THREAD_RC cgemm_thread_rc +#define CGEMM_THREAD_RR cgemm_thread_rr + +#define CSYMM_THREAD_LU csymm_thread_LU +#define CSYMM_THREAD_LL csymm_thread_LL +#define CSYMM_THREAD_RU csymm_thread_RU +#define CSYMM_THREAD_RL csymm_thread_RL + +#define CHEMM_THREAD_LU chemm_thread_LU +#define CHEMM_THREAD_LL chemm_thread_LL +#define CHEMM_THREAD_RU chemm_thread_RU +#define CHEMM_THREAD_RL chemm_thread_RL + +#define CSYRK_THREAD_UN csyrk_thread_UN +#define CSYRK_THREAD_UT csyrk_thread_UT +#define CSYRK_THREAD_LN csyrk_thread_LN +#define CSYRK_THREAD_LT csyrk_thread_LT +#define CSYRK_THREAD_UR csyrk_thread_UN +#define CSYRK_THREAD_UC csyrk_thread_UT +#define CSYRK_THREAD_LR csyrk_thread_LN +#define CSYRK_THREAD_LC csyrk_thread_LT + +#define CHERK_THREAD_UN cherk_thread_UN +#define CHERK_THREAD_UT cherk_thread_UT +#define CHERK_THREAD_LN cherk_thread_LN +#define CHERK_THREAD_LT cherk_thread_LT +#define CHERK_THREAD_UR cherk_thread_UR +#define CHERK_THREAD_UC cherk_thread_UC +#define CHERK_THREAD_LR cherk_thread_LR +#define CHERK_THREAD_LC cherk_thread_LC + +#define CGEMM3M_NN cgemm3m_nn +#define CGEMM3M_CN cgemm3m_cn +#define CGEMM3M_TN cgemm3m_tn +#define CGEMM3M_NC cgemm3m_nc +#define CGEMM3M_NT cgemm3m_nt +#define CGEMM3M_CC cgemm3m_cc +#define CGEMM3M_CT cgemm3m_ct +#define CGEMM3M_TC cgemm3m_tc +#define CGEMM3M_TT cgemm3m_tt +#define CGEMM3M_NR cgemm3m_nr +#define CGEMM3M_TR cgemm3m_tr +#define CGEMM3M_CR cgemm3m_cr +#define CGEMM3M_RN cgemm3m_rn +#define CGEMM3M_RT cgemm3m_rt +#define CGEMM3M_RC cgemm3m_rc +#define CGEMM3M_RR cgemm3m_rr + +#define CGEMM3M_THREAD_NN cgemm3m_thread_nn +#define CGEMM3M_THREAD_CN cgemm3m_thread_cn +#define CGEMM3M_THREAD_TN cgemm3m_thread_tn +#define CGEMM3M_THREAD_NC cgemm3m_thread_nc +#define CGEMM3M_THREAD_NT cgemm3m_thread_nt +#define CGEMM3M_THREAD_CC cgemm3m_thread_cc +#define CGEMM3M_THREAD_CT cgemm3m_thread_ct +#define CGEMM3M_THREAD_TC cgemm3m_thread_tc +#define CGEMM3M_THREAD_TT cgemm3m_thread_tt +#define CGEMM3M_THREAD_NR cgemm3m_thread_nr +#define CGEMM3M_THREAD_TR cgemm3m_thread_tr +#define CGEMM3M_THREAD_CR cgemm3m_thread_cr +#define CGEMM3M_THREAD_RN cgemm3m_thread_rn +#define CGEMM3M_THREAD_RT cgemm3m_thread_rt +#define CGEMM3M_THREAD_RC cgemm3m_thread_rc +#define CGEMM3M_THREAD_RR cgemm3m_thread_rr + +#define CSYMM3M_LU csymm3m_LU +#define CSYMM3M_LL csymm3m_LL +#define CSYMM3M_RU csymm3m_RU +#define CSYMM3M_RL csymm3m_RL + +#define CSYMM3M_THREAD_LU csymm3m_thread_LU +#define CSYMM3M_THREAD_LL csymm3m_thread_LL +#define CSYMM3M_THREAD_RU csymm3m_thread_RU +#define CSYMM3M_THREAD_RL csymm3m_thread_RL + +#define CHEMM3M_LU chemm3m_LU +#define CHEMM3M_LL chemm3m_LL +#define CHEMM3M_RU chemm3m_RU +#define CHEMM3M_RL chemm3m_RL + +#define CHEMM3M_THREAD_LU chemm3m_thread_LU +#define CHEMM3M_THREAD_LL chemm3m_thread_LL +#define CHEMM3M_THREAD_RU chemm3m_thread_RU +#define CHEMM3M_THREAD_RL chemm3m_thread_RL + +#endif diff --git a/common_d.h b/common_d.h new file mode 100644 index 0000000..4c9a53f --- /dev/null +++ b/common_d.h @@ -0,0 +1,432 @@ +#ifndef COMMON_D_H +#define COMMON_D_H + +#ifndef DYNAMIC_ARCH + +#define DAMAX_K damax_k +#define DAMIN_K damin_k +#define DMAX_K dmax_k +#define DMIN_K dmin_k +#define IDAMAX_K idamax_k +#define IDAMIN_K idamin_k +#define IDMAX_K idmax_k +#define IDMIN_K idmin_k +#define DASUM_K dasum_k +#define DAXPYU_K daxpy_k +#define DAXPYC_K daxpy_k +#define DCOPY_K dcopy_k +#define DDOTU_K ddot_k +#define DDOTC_K ddot_k +#define DNRM2_K dnrm2_k +#define DSCAL_K dscal_k +#define DSWAP_K dswap_k +#define DROT_K drot_k + +#define DGEMV_N dgemv_n +#define DGEMV_T dgemv_t +#define DGEMV_R dgemv_n +#define DGEMV_C dgemv_t +#define DGEMV_O dgemv_n +#define DGEMV_U dgemv_t +#define DGEMV_S dgemv_n +#define DGEMV_D dgemv_t + +#define DGERU_K dger_k +#define DGERC_K dger_k +#define DGERV_K dger_k +#define DGERD_K dger_k + +#define DSYMV_U dsymv_U +#define DSYMV_L dsymv_L + +#define DSYMV_THREAD_U dsymv_thread_U +#define DSYMV_THREAD_L dsymv_thread_L + +#define DGEMM_ONCOPY dgemm_oncopy +#define DGEMM_OTCOPY dgemm_otcopy + +#if DGEMM_DEFAULT_UNROLL_M == DGEMM_DEFAULT_UNROLL_N +#define DGEMM_INCOPY dgemm_oncopy +#define DGEMM_ITCOPY dgemm_otcopy +#else +#define DGEMM_INCOPY dgemm_incopy +#define DGEMM_ITCOPY dgemm_itcopy +#endif + +#define DTRMM_OUNUCOPY dtrmm_ounucopy +#define DTRMM_OUNNCOPY dtrmm_ounncopy +#define DTRMM_OUTUCOPY dtrmm_outucopy +#define DTRMM_OUTNCOPY dtrmm_outncopy +#define DTRMM_OLNUCOPY dtrmm_olnucopy +#define DTRMM_OLNNCOPY dtrmm_olnncopy +#define DTRMM_OLTUCOPY dtrmm_oltucopy +#define DTRMM_OLTNCOPY dtrmm_oltncopy + +#define DTRSM_OUNUCOPY dtrsm_ounucopy +#define DTRSM_OUNNCOPY dtrsm_ounncopy +#define DTRSM_OUTUCOPY dtrsm_outucopy +#define DTRSM_OUTNCOPY dtrsm_outncopy +#define DTRSM_OLNUCOPY dtrsm_olnucopy +#define DTRSM_OLNNCOPY dtrsm_olnncopy +#define DTRSM_OLTUCOPY dtrsm_oltucopy +#define DTRSM_OLTNCOPY dtrsm_oltncopy + +#if DGEMM_DEFAULT_UNROLL_M == DGEMM_DEFAULT_UNROLL_N +#define DTRMM_IUNUCOPY dtrmm_ounucopy +#define DTRMM_IUNNCOPY dtrmm_ounncopy +#define DTRMM_IUTUCOPY dtrmm_outucopy +#define DTRMM_IUTNCOPY dtrmm_outncopy +#define DTRMM_ILNUCOPY dtrmm_olnucopy +#define DTRMM_ILNNCOPY dtrmm_olnncopy +#define DTRMM_ILTUCOPY dtrmm_oltucopy +#define DTRMM_ILTNCOPY dtrmm_oltncopy + +#define DTRSM_IUNUCOPY dtrsm_ounucopy +#define DTRSM_IUNNCOPY dtrsm_ounncopy +#define DTRSM_IUTUCOPY dtrsm_outucopy +#define DTRSM_IUTNCOPY dtrsm_outncopy +#define DTRSM_ILNUCOPY dtrsm_olnucopy +#define DTRSM_ILNNCOPY dtrsm_olnncopy +#define DTRSM_ILTUCOPY dtrsm_oltucopy +#define DTRSM_ILTNCOPY dtrsm_oltncopy +#else +#define DTRMM_IUNUCOPY dtrmm_iunucopy +#define DTRMM_IUNNCOPY dtrmm_iunncopy +#define DTRMM_IUTUCOPY dtrmm_iutucopy +#define DTRMM_IUTNCOPY dtrmm_iutncopy +#define DTRMM_ILNUCOPY dtrmm_ilnucopy +#define DTRMM_ILNNCOPY dtrmm_ilnncopy +#define DTRMM_ILTUCOPY dtrmm_iltucopy +#define DTRMM_ILTNCOPY dtrmm_iltncopy + +#define DTRSM_IUNUCOPY dtrsm_iunucopy +#define DTRSM_IUNNCOPY dtrsm_iunncopy +#define DTRSM_IUTUCOPY dtrsm_iutucopy +#define DTRSM_IUTNCOPY dtrsm_iutncopy +#define DTRSM_ILNUCOPY dtrsm_ilnucopy +#define DTRSM_ILNNCOPY dtrsm_ilnncopy +#define DTRSM_ILTUCOPY dtrsm_iltucopy +#define DTRSM_ILTNCOPY dtrsm_iltncopy +#endif + +#define DGEMM_BETA dgemm_beta + +#define DGEMM_KERNEL dgemm_kernel + +#define DTRMM_KERNEL_LN dtrmm_kernel_LN +#define DTRMM_KERNEL_LT dtrmm_kernel_LT +#define DTRMM_KERNEL_LR dtrmm_kernel_LN +#define DTRMM_KERNEL_LC dtrmm_kernel_LT +#define DTRMM_KERNEL_RN dtrmm_kernel_RN +#define DTRMM_KERNEL_RT dtrmm_kernel_RT +#define DTRMM_KERNEL_RR dtrmm_kernel_RN +#define DTRMM_KERNEL_RC dtrmm_kernel_RT + +#define DTRSM_KERNEL_LN dtrsm_kernel_LN +#define DTRSM_KERNEL_LT dtrsm_kernel_LT +#define DTRSM_KERNEL_LR dtrsm_kernel_LN +#define DTRSM_KERNEL_LC dtrsm_kernel_LT +#define DTRSM_KERNEL_RN dtrsm_kernel_RN +#define DTRSM_KERNEL_RT dtrsm_kernel_RT +#define DTRSM_KERNEL_RR dtrsm_kernel_RN +#define DTRSM_KERNEL_RC dtrsm_kernel_RT + +#define DSYMM_OUTCOPY dsymm_outcopy +#define DSYMM_OLTCOPY dsymm_oltcopy +#if DGEMM_DEFAULT_UNROLL_M == DGEMM_DEFAULT_UNROLL_N +#define DSYMM_IUTCOPY dsymm_outcopy +#define DSYMM_ILTCOPY dsymm_oltcopy +#else +#define DSYMM_IUTCOPY dsymm_iutcopy +#define DSYMM_ILTCOPY dsymm_iltcopy +#endif + +#define DNEG_TCOPY dneg_tcopy +#define DLASWP_NCOPY dlaswp_ncopy + +#else + +#define DAMAX_K gotoblas -> damax_k +#define DAMIN_K gotoblas -> damin_k +#define DMAX_K gotoblas -> dmax_k +#define DMIN_K gotoblas -> dmin_k +#define IDAMAX_K gotoblas -> idamax_k +#define IDAMIN_K gotoblas -> idamin_k +#define IDMAX_K gotoblas -> idmax_k +#define IDMIN_K gotoblas -> idmin_k +#define DASUM_K gotoblas -> dasum_k +#define DAXPYU_K gotoblas -> daxpy_k +#define DAXPYC_K gotoblas -> daxpy_k +#define DCOPY_K gotoblas -> dcopy_k +#define DDOTU_K gotoblas -> ddot_k +#define DDOTC_K gotoblas -> ddot_k +#define DNRM2_K gotoblas -> dnrm2_k +#define DSCAL_K gotoblas -> dscal_k +#define DSWAP_K gotoblas -> dswap_k +#define DROT_K gotoblas -> drot_k + +#define DGEMV_N gotoblas -> dgemv_n +#define DGEMV_T gotoblas -> dgemv_t +#define DGEMV_R gotoblas -> dgemv_n +#define DGEMV_C gotoblas -> dgemv_t +#define DGEMV_O gotoblas -> dgemv_n +#define DGEMV_U gotoblas -> dgemv_t +#define DGEMV_S gotoblas -> dgemv_n +#define DGEMV_D gotoblas -> dgemv_t + +#define DGERU_K gotoblas -> dger_k +#define DGERC_K gotoblas -> dger_k +#define DGERV_K gotoblas -> dger_k +#define DGERD_K gotoblas -> dger_k + +#define DSYMV_U gotoblas -> dsymv_U +#define DSYMV_L gotoblas -> dsymv_L + +#define DSYMV_THREAD_U dsymv_thread_U +#define DSYMV_THREAD_L dsymv_thread_L + +#define DGEMM_ONCOPY gotoblas -> dgemm_oncopy +#define DGEMM_OTCOPY gotoblas -> dgemm_otcopy +#define DGEMM_INCOPY gotoblas -> dgemm_incopy +#define DGEMM_ITCOPY gotoblas -> dgemm_itcopy + +#define DTRMM_OUNUCOPY gotoblas -> dtrmm_ounucopy +#define DTRMM_OUTUCOPY gotoblas -> dtrmm_outucopy +#define DTRMM_OLNUCOPY gotoblas -> dtrmm_olnucopy +#define DTRMM_OLTUCOPY gotoblas -> dtrmm_oltucopy +#define DTRSM_OUNUCOPY gotoblas -> dtrsm_ounucopy +#define DTRSM_OUTUCOPY gotoblas -> dtrsm_outucopy +#define DTRSM_OLNUCOPY gotoblas -> dtrsm_olnucopy +#define DTRSM_OLTUCOPY gotoblas -> dtrsm_oltucopy + +#define DTRMM_IUNUCOPY gotoblas -> dtrmm_iunucopy +#define DTRMM_IUTUCOPY gotoblas -> dtrmm_iutucopy +#define DTRMM_ILNUCOPY gotoblas -> dtrmm_ilnucopy +#define DTRMM_ILTUCOPY gotoblas -> dtrmm_iltucopy +#define DTRSM_IUNUCOPY gotoblas -> dtrsm_iunucopy +#define DTRSM_IUTUCOPY gotoblas -> dtrsm_iutucopy +#define DTRSM_ILNUCOPY gotoblas -> dtrsm_ilnucopy +#define DTRSM_ILTUCOPY gotoblas -> dtrsm_iltucopy + +#define DTRMM_OUNNCOPY gotoblas -> dtrmm_ounncopy +#define DTRMM_OUTNCOPY gotoblas -> dtrmm_outncopy +#define DTRMM_OLNNCOPY gotoblas -> dtrmm_olnncopy +#define DTRMM_OLTNCOPY gotoblas -> dtrmm_oltncopy +#define DTRSM_OUNNCOPY gotoblas -> dtrsm_ounncopy +#define DTRSM_OUTNCOPY gotoblas -> dtrsm_outncopy +#define DTRSM_OLNNCOPY gotoblas -> dtrsm_olnncopy +#define DTRSM_OLTNCOPY gotoblas -> dtrsm_oltncopy + +#define DTRMM_IUNNCOPY gotoblas -> dtrmm_iunncopy +#define DTRMM_IUTNCOPY gotoblas -> dtrmm_iutncopy +#define DTRMM_ILNNCOPY gotoblas -> dtrmm_ilnncopy +#define DTRMM_ILTNCOPY gotoblas -> dtrmm_iltncopy +#define DTRSM_IUNNCOPY gotoblas -> dtrsm_iunncopy +#define DTRSM_IUTNCOPY gotoblas -> dtrsm_iutncopy +#define DTRSM_ILNNCOPY gotoblas -> dtrsm_ilnncopy +#define DTRSM_ILTNCOPY gotoblas -> dtrsm_iltncopy + +#define DGEMM_BETA gotoblas -> dgemm_beta +#define DGEMM_KERNEL gotoblas -> dgemm_kernel + +#define DTRMM_KERNEL_LN gotoblas -> dtrmm_kernel_LN +#define DTRMM_KERNEL_LT gotoblas -> dtrmm_kernel_LT +#define DTRMM_KERNEL_LR gotoblas -> dtrmm_kernel_LN +#define DTRMM_KERNEL_LC gotoblas -> dtrmm_kernel_LT +#define DTRMM_KERNEL_RN gotoblas -> dtrmm_kernel_RN +#define DTRMM_KERNEL_RT gotoblas -> dtrmm_kernel_RT +#define DTRMM_KERNEL_RR gotoblas -> dtrmm_kernel_RN +#define DTRMM_KERNEL_RC gotoblas -> dtrmm_kernel_RT + +#define DTRSM_KERNEL_LN gotoblas -> dtrsm_kernel_LN +#define DTRSM_KERNEL_LT gotoblas -> dtrsm_kernel_LT +#define DTRSM_KERNEL_LR gotoblas -> dtrsm_kernel_LN +#define DTRSM_KERNEL_LC gotoblas -> dtrsm_kernel_LT +#define DTRSM_KERNEL_RN gotoblas -> dtrsm_kernel_RN +#define DTRSM_KERNEL_RT gotoblas -> dtrsm_kernel_RT +#define DTRSM_KERNEL_RR gotoblas -> dtrsm_kernel_RN +#define DTRSM_KERNEL_RC gotoblas -> dtrsm_kernel_RT + +#define DSYMM_IUTCOPY gotoblas -> dsymm_iutcopy +#define DSYMM_ILTCOPY gotoblas -> dsymm_iltcopy +#define DSYMM_OUTCOPY gotoblas -> dsymm_outcopy +#define DSYMM_OLTCOPY gotoblas -> dsymm_oltcopy + +#define DNEG_TCOPY gotoblas -> dneg_tcopy +#define DLASWP_NCOPY gotoblas -> dlaswp_ncopy + +#endif + +#define DGEMM_NN dgemm_nn +#define DGEMM_CN dgemm_tn +#define DGEMM_TN dgemm_tn +#define DGEMM_NC dgemm_nt +#define DGEMM_NT dgemm_nt +#define DGEMM_CC dgemm_tt +#define DGEMM_CT dgemm_tt +#define DGEMM_TC dgemm_tt +#define DGEMM_TT dgemm_tt +#define DGEMM_NR dgemm_nn +#define DGEMM_TR dgemm_tn +#define DGEMM_CR dgemm_tn +#define DGEMM_RN dgemm_nn +#define DGEMM_RT dgemm_nt +#define DGEMM_RC dgemm_nt +#define DGEMM_RR dgemm_nn + +#define DSYMM_LU dsymm_LU +#define DSYMM_LL dsymm_LL +#define DSYMM_RU dsymm_RU +#define DSYMM_RL dsymm_RL + +#define DHEMM_LU dhemm_LU +#define DHEMM_LL dhemm_LL +#define DHEMM_RU dhemm_RU +#define DHEMM_RL dhemm_RL + +#define DSYRK_UN dsyrk_UN +#define DSYRK_UT dsyrk_UT +#define DSYRK_LN dsyrk_LN +#define DSYRK_LT dsyrk_LT +#define DSYRK_UR dsyrk_UN +#define DSYRK_UC dsyrk_UT +#define DSYRK_LR dsyrk_LN +#define DSYRK_LC dsyrk_LT + +#define DSYRK_KERNEL_U dsyrk_kernel_U +#define DSYRK_KERNEL_L dsyrk_kernel_L + +#define DHERK_UN dsyrk_UN +#define DHERK_LN dsyrk_LN +#define DHERK_UC dsyrk_UT +#define DHERK_LC dsyrk_LT + +#define DHER2K_UN dsyr2k_UN +#define DHER2K_LN dsyr2k_LN +#define DHER2K_UC dsyr2k_UT +#define DHER2K_LC dsyr2k_LT + +#define DSYR2K_UN dsyr2k_UN +#define DSYR2K_UT dsyr2k_UT +#define DSYR2K_LN dsyr2k_LN +#define DSYR2K_LT dsyr2k_LT +#define DSYR2K_UR dsyr2k_UN +#define DSYR2K_UC dsyr2k_UT +#define DSYR2K_LR dsyr2k_LN +#define DSYR2K_LC dsyr2k_LT + +#define DSYR2K_KERNEL_U dsyr2k_kernel_U +#define DSYR2K_KERNEL_L dsyr2k_kernel_L + +#define DTRMM_LNUU dtrmm_LNUU +#define DTRMM_LNUN dtrmm_LNUN +#define DTRMM_LNLU dtrmm_LNLU +#define DTRMM_LNLN dtrmm_LNLN +#define DTRMM_LTUU dtrmm_LTUU +#define DTRMM_LTUN dtrmm_LTUN +#define DTRMM_LTLU dtrmm_LTLU +#define DTRMM_LTLN dtrmm_LTLN +#define DTRMM_LRUU dtrmm_LNUU +#define DTRMM_LRUN dtrmm_LNUN +#define DTRMM_LRLU dtrmm_LNLU +#define DTRMM_LRLN dtrmm_LNLN +#define DTRMM_LCUU dtrmm_LTUU +#define DTRMM_LCUN dtrmm_LTUN +#define DTRMM_LCLU dtrmm_LTLU +#define DTRMM_LCLN dtrmm_LTLN +#define DTRMM_RNUU dtrmm_RNUU +#define DTRMM_RNUN dtrmm_RNUN +#define DTRMM_RNLU dtrmm_RNLU +#define DTRMM_RNLN dtrmm_RNLN +#define DTRMM_RTUU dtrmm_RTUU +#define DTRMM_RTUN dtrmm_RTUN +#define DTRMM_RTLU dtrmm_RTLU +#define DTRMM_RTLN dtrmm_RTLN +#define DTRMM_RRUU dtrmm_RNUU +#define DTRMM_RRUN dtrmm_RNUN +#define DTRMM_RRLU dtrmm_RNLU +#define DTRMM_RRLN dtrmm_RNLN +#define DTRMM_RCUU dtrmm_RTUU +#define DTRMM_RCUN dtrmm_RTUN +#define DTRMM_RCLU dtrmm_RTLU +#define DTRMM_RCLN dtrmm_RTLN + +#define DTRSM_LNUU dtrsm_LNUU +#define DTRSM_LNUN dtrsm_LNUN +#define DTRSM_LNLU dtrsm_LNLU +#define DTRSM_LNLN dtrsm_LNLN +#define DTRSM_LTUU dtrsm_LTUU +#define DTRSM_LTUN dtrsm_LTUN +#define DTRSM_LTLU dtrsm_LTLU +#define DTRSM_LTLN dtrsm_LTLN +#define DTRSM_LRUU dtrsm_LNUU +#define DTRSM_LRUN dtrsm_LNUN +#define DTRSM_LRLU dtrsm_LNLU +#define DTRSM_LRLN dtrsm_LNLN +#define DTRSM_LCUU dtrsm_LTUU +#define DTRSM_LCUN dtrsm_LTUN +#define DTRSM_LCLU dtrsm_LTLU +#define DTRSM_LCLN dtrsm_LTLN +#define DTRSM_RNUU dtrsm_RNUU +#define DTRSM_RNUN dtrsm_RNUN +#define DTRSM_RNLU dtrsm_RNLU +#define DTRSM_RNLN dtrsm_RNLN +#define DTRSM_RTUU dtrsm_RTUU +#define DTRSM_RTUN dtrsm_RTUN +#define DTRSM_RTLU dtrsm_RTLU +#define DTRSM_RTLN dtrsm_RTLN +#define DTRSM_RRUU dtrsm_RNUU +#define DTRSM_RRUN dtrsm_RNUN +#define DTRSM_RRLU dtrsm_RNLU +#define DTRSM_RRLN dtrsm_RNLN +#define DTRSM_RCUU dtrsm_RTUU +#define DTRSM_RCUN dtrsm_RTUN +#define DTRSM_RCLU dtrsm_RTLU +#define DTRSM_RCLN dtrsm_RTLN + +#define DGEMM_THREAD_NN dgemm_thread_nn +#define DGEMM_THREAD_CN dgemm_thread_tn +#define DGEMM_THREAD_TN dgemm_thread_tn +#define DGEMM_THREAD_NC dgemm_thread_nt +#define DGEMM_THREAD_NT dgemm_thread_nt +#define DGEMM_THREAD_CC dgemm_thread_tt +#define DGEMM_THREAD_CT dgemm_thread_tt +#define DGEMM_THREAD_TC dgemm_thread_tt +#define DGEMM_THREAD_TT dgemm_thread_tt +#define DGEMM_THREAD_NR dgemm_thread_nn +#define DGEMM_THREAD_TR dgemm_thread_tn +#define DGEMM_THREAD_CR dgemm_thread_tn +#define DGEMM_THREAD_RN dgemm_thread_nn +#define DGEMM_THREAD_RT dgemm_thread_nt +#define DGEMM_THREAD_RC dgemm_thread_nt +#define DGEMM_THREAD_RR dgemm_thread_nn + +#define DSYMM_THREAD_LU dsymm_thread_LU +#define DSYMM_THREAD_LL dsymm_thread_LL +#define DSYMM_THREAD_RU dsymm_thread_RU +#define DSYMM_THREAD_RL dsymm_thread_RL + +#define DHEMM_THREAD_LU dhemm_thread_LU +#define DHEMM_THREAD_LL dhemm_thread_LL +#define DHEMM_THREAD_RU dhemm_thread_RU +#define DHEMM_THREAD_RL dhemm_thread_RL + +#define DSYRK_THREAD_UN dsyrk_thread_UN +#define DSYRK_THREAD_UT dsyrk_thread_UT +#define DSYRK_THREAD_LN dsyrk_thread_LN +#define DSYRK_THREAD_LT dsyrk_thread_LT +#define DSYRK_THREAD_UR dsyrk_thread_UN +#define DSYRK_THREAD_UC dsyrk_thread_UT +#define DSYRK_THREAD_LR dsyrk_thread_LN +#define DSYRK_THREAD_LC dsyrk_thread_LT + +#define DHERK_THREAD_UN dsyrk_thread_UN +#define DHERK_THREAD_UT dsyrk_thread_UT +#define DHERK_THREAD_LN dsyrk_thread_LN +#define DHERK_THREAD_LT dsyrk_thread_LT +#define DHERK_THREAD_UR dsyrk_thread_UN +#define DHERK_THREAD_UC dsyrk_thread_UT +#define DHERK_THREAD_LR dsyrk_thread_LN +#define DHERK_THREAD_LC dsyrk_thread_LT + +#endif diff --git a/common_ia64.h b/common_ia64.h new file mode 100644 index 0000000..81939cc --- /dev/null +++ b/common_ia64.h @@ -0,0 +1,408 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_IA64 +#define COMMON_IA64 + +#ifndef ASSEMBLER + +#ifndef MAP_WRITECOMBINED +#define MAP_WRITECOMBINED 0x10000 +#endif + +#define MB +#define WMB + +#ifdef __ECC +#include +#endif + +#define RPCC64BIT + +#ifndef __ECC +static __inline void blas_lock(volatile unsigned long *address){ + + unsigned long ret; + + do { + while (*address) {YIELDING;}; + + __asm__ __volatile__ ("mov ar.ccv=r0\n;;\n" + "cmpxchg4.acq %0=[%2],%1,ar.ccv\n" + : "=r"(ret) : "r"(1), "r"(address) + : "ar.ccv", "memory"); + } while (ret); +} + +static __inline unsigned long rpcc(void) { + unsigned long clocks; + + __asm__ __volatile__ ("mov %0=ar.itc" : "=r"(clocks)); + return clocks; +} + + +static __inline unsigned long stmxcsr(void){ + unsigned long fp; + + __asm__ __volatile__ ("mov.m %0=ar.fpsr" : "=r" (fp)); + + return fp; +} + +static __inline void ldmxcsr(unsigned long fp) { + + __asm__ __volatile__ ("mov.m ar.fpsr=%0" :: "r" (fp)); + +} + +#define GET_IMAGE(res) asm __volatile__("mov %0 = f9" : "=f"(res) : : "memory") + +#else + +static __inline void blas_lock(volatile unsigned long *address){ + while (*address || _InterlockedCompareExchange((volatile int *) address,1,0)) + ; +} + +static __inline unsigned int rpcc(void) { + return __getReg(_IA64_REG_AR_ITC); +} + +static __inline unsigned int stmxcsr(void) { + return __getReg(_IA64_REG_AR_FPSR); +} + +static __inline void ldmxcsr(unsigned long fp) { + + return __setReg(_IA64_REG_AR_FPSR, fp); + +} + +#ifdef DOUBLE +#define GET_IMAGE(res) __stfd(&res, 9) +#else +#define GET_IMAGE(res) __stfs(&res, 9) +#endif + +#endif + +#define GET_IMAGE_CANCEL + +#ifdef ENABLE_SSE_EXCEPTION + +#define IDEBUG_START \ + { \ + unsigned long fp_sse_mode, new_fp_mode; \ + fp_sse_mode = stmxcsr();\ + new_fp_mode = (fp_sse_mode & ~(FE_UNDERFLOW | FE_OVERFLOW | FE_UNNORMAL | FE_INVALID));\ + ldmxcsr(new_fp_mode); + +#define IDEBUG_END \ + ldmxcsr(fp_sse_mode); \ + } + +#endif + +#ifdef SMP + +#ifdef USE64BITINT + +/* 64bit version */ + +extern unsigned long blas_quick_divide_table[]; + +#ifndef __ECC +static __inline long blas_quickdivide(unsigned long int x, unsigned long int y){ + unsigned long ret; + + if (y <= 1) return x; + + __asm__ __volatile__("setf.sig f6 = %1\n\t" + "ldf8 f7 = [%2];;\n\t" + "xmpy.hu f6= f6, f7;;\n\t" + "getf.sig %0 = f6;;\n" + : "=r"(ret) + : "r"(x), "r"(&blas_quick_divide_table[y]) : "f6", "f7" + ); + + return ret; +} +#else +/* Using Intel Compiler */ +static __inline long blas_quickdivide(unsigned long int x, unsigned long int y){ + if (y <= 1) return x; + return _m64_xmahu(x, blas_quick_divide_table[y], 0); +} +#endif + +#else + /* 32bit version */ +extern unsigned int blas_quick_divide_table[]; + +static __inline int blas_quickdivide(unsigned int x, unsigned int y){ + if (y <= 1) return x; + return (int)((x * (unsigned long)blas_quick_divide_table[y]) >> 32); +} +#endif +#endif + +#endif + +#if 0 +#ifdef DOUBLE +#define GEMM_NCOPY dgemm_ncopy +#define GEMM_TCOPY dgemm_tcopy +#define ZGEMM_NCOPY zgemm_ncopy +#define ZGEMM_TCOPY zgemm_tcopy +#define GEMM_KERNEL dgemm_kernel + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ZGEMM_KERNEL zgemm_kernel_n +#endif +#if defined(CN) || defined(CT) || defined(RN) || defined(RT) +#define ZGEMM_KERNEL zgemm_kernel_l +#endif +#if defined(NC) || defined(TC) || defined(NR) || defined(TR) +#define ZGEMM_KERNEL zgemm_kernel_r +#endif +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +#define ZGEMM_KERNEL zgemm_kernel_b +#endif + +#else +#define GEMM_NCOPY sgemm_ncopy +#define GEMM_TCOPY sgemm_tcopy +#define ZGEMM_NCOPY cgemm_ncopy +#define ZGEMM_TCOPY cgemm_tcopy +#define GEMM_KERNEL sgemm_kernel + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ZGEMM_KERNEL cgemm_kernel_n +#endif +#if defined(CN) || defined(CT) || defined(RN) || defined(RT) +#define ZGEMM_KERNEL cgemm_kernel_l +#endif +#if defined(NC) || defined(TC) || defined(NR) || defined(TR) +#define ZGEMM_KERNEL cgemm_kernel_r +#endif +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +#define ZGEMM_KERNEL cgemm_kernel_b +#endif + +#endif +#endif + +#ifdef USE64BITINT +#define LDINT ld8 +#define INTSIZE 8 +#define CMP4GE cmp.ge +#define CMP4NE cmp.ge +#define CMP4EQ cmp.eq +#else +#define LDINT ld4 +#define INTSIZE 4 +#define CMP4GE cmp4.ge +#define CMP4NE cmp4.ne +#define CMP4EQ cmp4.eq +#endif + +#define HALT mov r0 = 0 + +#ifdef XDOUBLE +#define LD8 ld8 +#define ST8 st8 +#define LDFD ldfe +#define LDFPD ldfpe +#define LDFD_T1 ldfe.t1 +#define LDFD_NT1 ldfe.nt1 +#define LDFD_NT2 ldfe.nt2 +#define LDFD_NTA ldfe.nta +#define LDFPD_NT1 ldfpe.nt1 +#define LDFPD_NT2 ldfpe.nt2 +#define LDFPD_NTA ldfpe.nta +#define STFD stfe +#define STFD_NTA stfe.nta +#define FADD fadd +#define FSUB fsub +#define FMPY fmpy +#define FMA fma +#define FMS fms +#define FNMA fnma +#define FPMA fpma +#define SETF setf.d +#elif defined(DOUBLE) +#define LD8 ld8 +#define ST8 st8 +#define LDF8 ldf8 +#define LDF8_NT1 ldf8.nt1 +#define LDF8_NTA ldf8.nta +#define STF8 stf8 +#define STF8_NTA stf8.nta +#define LDFD ldfd +#define LDFPD ldfpd +#define LDFD_T1 ldfd.t1 +#define LDFD_NT1 ldfd.nt1 +#define LDFD_NT2 ldfd.nt2 +#define LDFD_NTA ldfd.nta +#define LDFPD_NT1 ldfpd.nt1 +#define LDFPD_NT2 ldfpd.nt2 +#define LDFPD_NTA ldfpd.nta +#define STFD stfd +#define STFD_NTA stfd.nta +#define FADD fadd.d +#define FSUB fsub.d +#define FMPY fmpy.d +#define FMA fma.d +#define FMS fms.d +#define FNMA fnma.d +#define FPMA fpma.d +#define SETF setf.d +#else +#define LD8 ld4 +#define ST8 st4 +#define LDF8 ldfs +#define LDF8_NT1 ldfs.nt1 +#define LDF8_NTA ldfs.nta +#define STF8 stfs +#define STF8_NTA stfs.nta +#define LDFD ldfs +#define LDFPD ldfps +#define LDFD_T1 ldfs.t1 +#define LDFD_NT1 ldfs.nt1 +#define LDFD_NT2 ldfs.nt2 +#define LDFD_NTA ldfs.nta +#define LDFPD_NT1 ldfps.nt1 +#define LDFPD_NT2 ldfps.nt2 +#define LDFPD_NTA ldfps.nta +#define STFD stfs +#define STFD_NTA stfs.nta +#if 0 +#define FADD fadd.s +#define FSUB fsub.s +#define FMPY fmpy.s +#define FMA fma.s +#define FMS fms.s +#define FNMA fnma.s +#define FPMA fpma.s +#else +#define FADD fadd +#define FSUB fsub +#define FMPY fmpy +#define FMA fma +#define FMS fms +#define FNMA fnma +#define FPMA fpma +#endif +#define SETF setf.s +#endif + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif + +#ifdef F_INTERFACE_G77 +#define RETURN_BY_STACK +#endif + +#ifdef F_INTERFACE_G95 +#define RETURN_BY_STACK +#endif + +#ifdef F_INTERFACE_GFORT +#define RETURN_BY_REGS +#endif + +#ifdef F_INTERFACE_INTEL +#define RETURN_BY_STACK +#endif + +#define PROLOGUE \ + .explicit; \ + .text; \ + .align 128; \ + .global REALNAME; \ + .proc REALNAME; \ +REALNAME: + + +#ifdef PROFILE +#define PROFCODE \ + .data; \ + .align 8; \ +.LP0:; \ + data8 0; \ + .text; \ + alloc out0 = ar.pfs, 8, 0, 4, 0; \ + mov out1 = r1; \ + mov out2 = b0; \ + addl out3 = @ltoff(.LP0), r1;;; \ + br.call.sptk.many b0 = _mcount;; +#else +#define PROFCODE +#endif + +#define EPILOGUE \ + .endp REALNAME + +#define START_ADDRESS 0x20000fc800000000UL + +#undef SEEK_ADDRESS + +#if 0 +#ifdef CONFIG_IA64_PAGE_SIZE_4KB +#define SEEK_ADDRESS +#endif + +#ifdef CONFIG_IA64_PAGE_SIZE_8KB +#define SEEK_ADDRESS +#endif +#endif + +#define BUFFER_SIZE (128 << 20) + +#ifndef PAGESIZE +#define PAGESIZE (16UL << 10) +#endif +#define HUGE_PAGESIZE ( 4 << 20) + +#define BASE_ADDRESS (START_ADDRESS - (BLASULONG)BUFFER_SIZE * MAX_CPU_NUMBER) + +#endif diff --git a/common_interface.h b/common_interface.h new file mode 100644 index 0000000..36bf5aa --- /dev/null +++ b/common_interface.h @@ -0,0 +1,736 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef ASSEMBLER + +int BLASFUNC(xerbla)(char *, blasint *info, blasint); + +FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *); +FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *); + +double BLASFUNC(dsdot) (blasint *, float *, blasint *, float *, blasint *); +double BLASFUNC(ddot) (blasint *, double *, blasint *, double *, blasint *); +xdouble BLASFUNC(qdot) (blasint *, xdouble *, blasint *, xdouble *, blasint *); + + +#ifdef RETURN_BY_STRUCT +typedef struct { + float r, i; +} myccomplex_t; + +typedef struct { + double r, i; +} myzcomplex_t; + +typedef struct { + xdouble r, i; +} myxcomplex_t; + +myccomplex_t BLASFUNC(cdotu) (blasint *, float *, blasint *, float *, blasint *); +myccomplex_t BLASFUNC(cdotc) (blasint *, float *, blasint *, float *, blasint *); +myzcomplex_t BLASFUNC(zdotu) (blasint *, double *, blasint *, double *, blasint *); +myzcomplex_t BLASFUNC(zdotc) (blasint *, double *, blasint *, double *, blasint *); +myxcomplex_t BLASFUNC(xdotu) (blasint *, xdouble *, blasint *, xdouble *, blasint *); +myxcomplex_t BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdouble *, blasint *); + +#elif defined RETURN_BY_STACK +void BLASFUNC(cdotu) (float _Complex *, blasint *, float * , blasint *, float *, blasint *); +void BLASFUNC(cdotc) (float _Complex *, blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC(zdotu) (double _Complex *, blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC(zdotc) (double _Complex *, blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC(xdotu) (xdouble _Complex *, blasint *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(xdotc) (xdouble _Complex *, blasint *, xdouble *, blasint *, xdouble *, blasint *); +#else +float _Complex BLASFUNC(cdotu) (blasint *, float *, blasint *, float *, blasint *); +float _Complex BLASFUNC(cdotc) (blasint *, float *, blasint *, float *, blasint *); +double _Complex BLASFUNC(zdotu) (blasint *, double *, blasint *, double *, blasint *); +double _Complex BLASFUNC(zdotc) (blasint *, double *, blasint *, double *, blasint *); +xdouble _Complex BLASFUNC(xdotu) (blasint *, xdouble *, blasint *, xdouble *, blasint *); +xdouble _Complex BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdouble *, blasint *); +#endif + +void BLASFUNC(saxpy) (blasint *, float *, float *, blasint *, float *, blasint *); +void BLASFUNC(daxpy) (blasint *, double *, double *, blasint *, double *, blasint *); +void BLASFUNC(qaxpy) (blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(caxpy) (blasint *, float *, float *, blasint *, float *, blasint *); +void BLASFUNC(zaxpy) (blasint *, double *, double *, blasint *, double *, blasint *); +void BLASFUNC(xaxpy) (blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(caxpyc)(blasint *, float *, float *, blasint *, float *, blasint *); +void BLASFUNC(zaxpyc)(blasint *, double *, double *, blasint *, double *, blasint *); +void BLASFUNC(xaxpyc)(blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *); + +void BLASFUNC(scopy) (blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC(dcopy) (blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC(qcopy) (blasint *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(ccopy) (blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC(zcopy) (blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC(xcopy) (blasint *, xdouble *, blasint *, xdouble *, blasint *); + +void BLASFUNC(sswap) (blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC(dswap) (blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC(qswap) (blasint *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(cswap) (blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC(zswap) (blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC(xswap) (blasint *, xdouble *, blasint *, xdouble *, blasint *); + +FLOATRET BLASFUNC(sasum) (blasint *, float *, blasint *); +FLOATRET BLASFUNC(scasum)(blasint *, float *, blasint *); +double BLASFUNC(dasum) (blasint *, double *, blasint *); +xdouble BLASFUNC(qasum) (blasint *, xdouble *, blasint *); +double BLASFUNC(dzasum)(blasint *, double *, blasint *); +xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *); + +blasint BLASFUNC(isamax)(blasint *, float *, blasint *); +blasint BLASFUNC(idamax)(blasint *, double *, blasint *); +blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *); +blasint BLASFUNC(icamax)(blasint *, float *, blasint *); +blasint BLASFUNC(izamax)(blasint *, double *, blasint *); +blasint BLASFUNC(ixamax)(blasint *, xdouble *, blasint *); + +blasint BLASFUNC(ismax) (blasint *, float *, blasint *); +blasint BLASFUNC(idmax) (blasint *, double *, blasint *); +blasint BLASFUNC(iqmax) (blasint *, xdouble *, blasint *); +blasint BLASFUNC(icmax) (blasint *, float *, blasint *); +blasint BLASFUNC(izmax) (blasint *, double *, blasint *); +blasint BLASFUNC(ixmax) (blasint *, xdouble *, blasint *); + +blasint BLASFUNC(isamin)(blasint *, float *, blasint *); +blasint BLASFUNC(idamin)(blasint *, double *, blasint *); +blasint BLASFUNC(iqamin)(blasint *, xdouble *, blasint *); +blasint BLASFUNC(icamin)(blasint *, float *, blasint *); +blasint BLASFUNC(izamin)(blasint *, double *, blasint *); +blasint BLASFUNC(ixamin)(blasint *, xdouble *, blasint *); + +blasint BLASFUNC(ismin)(blasint *, float *, blasint *); +blasint BLASFUNC(idmin)(blasint *, double *, blasint *); +blasint BLASFUNC(iqmin)(blasint *, xdouble *, blasint *); +blasint BLASFUNC(icmin)(blasint *, float *, blasint *); +blasint BLASFUNC(izmin)(blasint *, double *, blasint *); +blasint BLASFUNC(ixmin)(blasint *, xdouble *, blasint *); + +FLOATRET BLASFUNC(samax) (blasint *, float *, blasint *); +double BLASFUNC(damax) (blasint *, double *, blasint *); +xdouble BLASFUNC(qamax) (blasint *, xdouble *, blasint *); +FLOATRET BLASFUNC(scamax)(blasint *, float *, blasint *); +double BLASFUNC(dzamax)(blasint *, double *, blasint *); +xdouble BLASFUNC(qxamax)(blasint *, xdouble *, blasint *); + +FLOATRET BLASFUNC(samin) (blasint *, float *, blasint *); +double BLASFUNC(damin) (blasint *, double *, blasint *); +xdouble BLASFUNC(qamin) (blasint *, xdouble *, blasint *); +FLOATRET BLASFUNC(scamin)(blasint *, float *, blasint *); +double BLASFUNC(dzamin)(blasint *, double *, blasint *); +xdouble BLASFUNC(qxamin)(blasint *, xdouble *, blasint *); + +FLOATRET BLASFUNC(smax) (blasint *, float *, blasint *); +double BLASFUNC(dmax) (blasint *, double *, blasint *); +xdouble BLASFUNC(qmax) (blasint *, xdouble *, blasint *); +FLOATRET BLASFUNC(scmax) (blasint *, float *, blasint *); +double BLASFUNC(dzmax) (blasint *, double *, blasint *); +xdouble BLASFUNC(qxmax) (blasint *, xdouble *, blasint *); + +FLOATRET BLASFUNC(smin) (blasint *, float *, blasint *); +double BLASFUNC(dmin) (blasint *, double *, blasint *); +xdouble BLASFUNC(qmin) (blasint *, xdouble *, blasint *); +FLOATRET BLASFUNC(scmin) (blasint *, float *, blasint *); +double BLASFUNC(dzmin) (blasint *, double *, blasint *); +xdouble BLASFUNC(qxmin) (blasint *, xdouble *, blasint *); + +void BLASFUNC(sscal) (blasint *, float *, float *, blasint *); +void BLASFUNC(dscal) (blasint *, double *, double *, blasint *); +void BLASFUNC(qscal) (blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(cscal) (blasint *, float *, float *, blasint *); +void BLASFUNC(zscal) (blasint *, double *, double *, blasint *); +void BLASFUNC(xscal) (blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(csscal)(blasint *, float *, float *, blasint *); +void BLASFUNC(zdscal)(blasint *, double *, double *, blasint *); +void BLASFUNC(xqscal)(blasint *, xdouble *, xdouble *, blasint *); + +FLOATRET BLASFUNC(snrm2) (blasint *, float *, blasint *); +FLOATRET BLASFUNC(scnrm2)(blasint *, float *, blasint *); + +double BLASFUNC(dnrm2) (blasint *, double *, blasint *); +xdouble BLASFUNC(qnrm2) (blasint *, xdouble *, blasint *); +double BLASFUNC(dznrm2)(blasint *, double *, blasint *); +xdouble BLASFUNC(qxnrm2)(blasint *, xdouble *, blasint *); + +void BLASFUNC(srot) (blasint *, float *, blasint *, float *, blasint *, float *, float *); +void BLASFUNC(drot) (blasint *, double *, blasint *, double *, blasint *, double *, double *); +void BLASFUNC(qrot) (blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *); +void BLASFUNC(csrot) (blasint *, float *, blasint *, float *, blasint *, float *, float *); +void BLASFUNC(zdrot) (blasint *, double *, blasint *, double *, blasint *, double *, double *); +void BLASFUNC(xqrot) (blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *); + +void BLASFUNC(srotg) (float *, float *, float *, float *); +void BLASFUNC(drotg) (double *, double *, double *, double *); +void BLASFUNC(qrotg) (xdouble *, xdouble *, xdouble *, xdouble *); +void BLASFUNC(crotg) (float *, float *, float *, float *); +void BLASFUNC(zrotg) (double *, double *, double *, double *); +void BLASFUNC(xrotg) (xdouble *, xdouble *, xdouble *, xdouble *); + +void BLASFUNC(srotmg)(float *, float *, float *, float *, float *); +void BLASFUNC(drotmg)(double *, double *, double *, double *, double *); + +void BLASFUNC(srotm) (blasint *, float *, blasint *, float *, blasint *, float *); +void BLASFUNC(drotm) (blasint *, double *, blasint *, double *, blasint *, double *); +void BLASFUNC(qrotm) (blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *); + +/* Level 2 routines */ + +void BLASFUNC(sger)(blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, blasint *); +void BLASFUNC(dger)(blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, blasint *); +void BLASFUNC(qger)(blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(cgeru)(blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, blasint *); +void BLASFUNC(cgerc)(blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, blasint *); +void BLASFUNC(zgeru)(blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, blasint *); +void BLASFUNC(zgerc)(blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, blasint *); +void BLASFUNC(xgeru)(blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(xgerc)(blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, blasint *); + +void BLASFUNC(sgemv)(char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(dgemv)(char *, blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(qgemv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(cgemv)(char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zgemv)(char *, blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(xgemv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +void BLASFUNC(strsv) (char *, char *, char *, blasint *, float *, blasint *, + float *, blasint *); +void BLASFUNC(dtrsv) (char *, char *, char *, blasint *, double *, blasint *, + double *, blasint *); +void BLASFUNC(qtrsv) (char *, char *, char *, blasint *, xdouble *, blasint *, + xdouble *, blasint *); +void BLASFUNC(ctrsv) (char *, char *, char *, blasint *, float *, blasint *, + float *, blasint *); +void BLASFUNC(ztrsv) (char *, char *, char *, blasint *, double *, blasint *, + double *, blasint *); +void BLASFUNC(xtrsv) (char *, char *, char *, blasint *, xdouble *, blasint *, + xdouble *, blasint *); + +void BLASFUNC(strmv) (char *, char *, char *, blasint *, float *, blasint *, + float *, blasint *); +void BLASFUNC(dtrmv) (char *, char *, char *, blasint *, double *, blasint *, + double *, blasint *); +void BLASFUNC(qtrmv) (char *, char *, char *, blasint *, xdouble *, blasint *, + xdouble *, blasint *); +void BLASFUNC(ctrmv) (char *, char *, char *, blasint *, float *, blasint *, + float *, blasint *); +void BLASFUNC(ztrmv) (char *, char *, char *, blasint *, double *, blasint *, + double *, blasint *); +void BLASFUNC(xtrmv) (char *, char *, char *, blasint *, xdouble *, blasint *, + xdouble *, blasint *); + +void BLASFUNC(stpsv) (char *, char *, char *, blasint *, float *, float *, blasint *); +void BLASFUNC(dtpsv) (char *, char *, char *, blasint *, double *, double *, blasint *); +void BLASFUNC(qtpsv) (char *, char *, char *, blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(ctpsv) (char *, char *, char *, blasint *, float *, float *, blasint *); +void BLASFUNC(ztpsv) (char *, char *, char *, blasint *, double *, double *, blasint *); +void BLASFUNC(xtpsv) (char *, char *, char *, blasint *, xdouble *, xdouble *, blasint *); + +void BLASFUNC(stpmv) (char *, char *, char *, blasint *, float *, float *, blasint *); +void BLASFUNC(dtpmv) (char *, char *, char *, blasint *, double *, double *, blasint *); +void BLASFUNC(qtpmv) (char *, char *, char *, blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(ctpmv) (char *, char *, char *, blasint *, float *, float *, blasint *); +void BLASFUNC(ztpmv) (char *, char *, char *, blasint *, double *, double *, blasint *); +void BLASFUNC(xtpmv) (char *, char *, char *, blasint *, xdouble *, xdouble *, blasint *); + +void BLASFUNC(stbmv) (char *, char *, char *, blasint *, blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC(dtbmv) (char *, char *, char *, blasint *, blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC(qtbmv) (char *, char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(ctbmv) (char *, char *, char *, blasint *, blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC(ztbmv) (char *, char *, char *, blasint *, blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC(xtbmv) (char *, char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *); + +void BLASFUNC(stbsv) (char *, char *, char *, blasint *, blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC(dtbsv) (char *, char *, char *, blasint *, blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC(qtbsv) (char *, char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(ctbsv) (char *, char *, char *, blasint *, blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC(ztbsv) (char *, char *, char *, blasint *, blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC(xtbsv) (char *, char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *); + +void BLASFUNC(ssymv) (char *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(dsymv) (char *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(qsymv) (char *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(csymv) (char *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zsymv) (char *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(xsymv) (char *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +void BLASFUNC(sspmv) (char *, blasint *, float *, float *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(dspmv) (char *, blasint *, double *, double *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(qspmv) (char *, blasint *, xdouble *, xdouble *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(cspmv) (char *, blasint *, float *, float *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zspmv) (char *, blasint *, double *, double *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(xspmv) (char *, blasint *, xdouble *, xdouble *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +void BLASFUNC(ssyr) (char *, blasint *, float *, float *, blasint *, + float *, blasint *); +void BLASFUNC(dsyr) (char *, blasint *, double *, double *, blasint *, + double *, blasint *); +void BLASFUNC(qsyr) (char *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *); +void BLASFUNC(csyr) (char *, blasint *, float *, float *, blasint *, + float *, blasint *); +void BLASFUNC(zsyr) (char *, blasint *, double *, double *, blasint *, + double *, blasint *); +void BLASFUNC(xsyr) (char *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *); + +void BLASFUNC(ssyr2) (char *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC(dsyr2) (char *, blasint *, double *, + double *, blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC(qsyr2) (char *, blasint *, xdouble *, + xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(csyr2) (char *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC(zsyr2) (char *, blasint *, double *, + double *, blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC(xsyr2) (char *, blasint *, xdouble *, + xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); + +void BLASFUNC(sspr) (char *, blasint *, float *, float *, blasint *, + float *); +void BLASFUNC(dspr) (char *, blasint *, double *, double *, blasint *, + double *); +void BLASFUNC(qspr) (char *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *); +void BLASFUNC(cspr) (char *, blasint *, float *, float *, blasint *, + float *); +void BLASFUNC(zspr) (char *, blasint *, double *, double *, blasint *, + double *); +void BLASFUNC(xspr) (char *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *); + +void BLASFUNC(sspr2) (char *, blasint *, float *, + float *, blasint *, float *, blasint *, float *); +void BLASFUNC(dspr2) (char *, blasint *, double *, + double *, blasint *, double *, blasint *, double *); +void BLASFUNC(qspr2) (char *, blasint *, xdouble *, + xdouble *, blasint *, xdouble *, blasint *, xdouble *); +void BLASFUNC(cspr2) (char *, blasint *, float *, + float *, blasint *, float *, blasint *, float *); +void BLASFUNC(zspr2) (char *, blasint *, double *, + double *, blasint *, double *, blasint *, double *); +void BLASFUNC(xspr2) (char *, blasint *, xdouble *, + xdouble *, blasint *, xdouble *, blasint *, xdouble *); + +void BLASFUNC(cher) (char *, blasint *, float *, float *, blasint *, + float *, blasint *); +void BLASFUNC(zher) (char *, blasint *, double *, double *, blasint *, + double *, blasint *); +void BLASFUNC(xher) (char *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *); + +void BLASFUNC(chpr) (char *, blasint *, float *, float *, blasint *, float *); +void BLASFUNC(zhpr) (char *, blasint *, double *, double *, blasint *, double *); +void BLASFUNC(xhpr) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *); + +void BLASFUNC(cher2) (char *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC(zher2) (char *, blasint *, double *, + double *, blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC(xher2) (char *, blasint *, xdouble *, + xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); + +void BLASFUNC(chpr2) (char *, blasint *, float *, + float *, blasint *, float *, blasint *, float *); +void BLASFUNC(zhpr2) (char *, blasint *, double *, + double *, blasint *, double *, blasint *, double *); +void BLASFUNC(xhpr2) (char *, blasint *, xdouble *, + xdouble *, blasint *, xdouble *, blasint *, xdouble *); + +void BLASFUNC(chemv) (char *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zhemv) (char *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(xhemv) (char *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +void BLASFUNC(chpmv) (char *, blasint *, float *, float *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zhpmv) (char *, blasint *, double *, double *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(xhpmv) (char *, blasint *, xdouble *, xdouble *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +int BLASFUNC(snorm)(char *, blasint *, blasint *, float *, blasint *); +int BLASFUNC(dnorm)(char *, blasint *, blasint *, double *, blasint *); +int BLASFUNC(cnorm)(char *, blasint *, blasint *, float *, blasint *); +int BLASFUNC(znorm)(char *, blasint *, blasint *, double *, blasint *); + +void BLASFUNC(sgbmv)(char *, blasint *, blasint *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(dgbmv)(char *, blasint *, blasint *, blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(qgbmv)(char *, blasint *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(cgbmv)(char *, blasint *, blasint *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zgbmv)(char *, blasint *, blasint *, blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(xgbmv)(char *, blasint *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +void BLASFUNC(ssbmv)(char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(dsbmv)(char *, blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(qsbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(csbmv)(char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zsbmv)(char *, blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(xsbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +void BLASFUNC(chbmv)(char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zhbmv)(char *, blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(xhbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +/* Level 3 routines */ + +void BLASFUNC(sgemm)(char *, char *, blasint *, blasint *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, float *, blasint *); +void BLASFUNC(dgemm)(char *, char *, blasint *, blasint *, blasint *, double *, + double *, blasint *, double *, blasint *, double *, double *, blasint *); +void BLASFUNC(qgemm)(char *, char *, blasint *, blasint *, blasint *, xdouble *, + xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(cgemm)(char *, char *, blasint *, blasint *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zgemm)(char *, char *, blasint *, blasint *, blasint *, double *, + double *, blasint *, double *, blasint *, double *, double *, blasint *); +void BLASFUNC(xgemm)(char *, char *, blasint *, blasint *, blasint *, xdouble *, + xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +void BLASFUNC(cgemm3m)(char *, char *, blasint *, blasint *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zgemm3m)(char *, char *, blasint *, blasint *, blasint *, double *, + double *, blasint *, double *, blasint *, double *, double *, blasint *); +void BLASFUNC(xgemm3m)(char *, char *, blasint *, blasint *, blasint *, xdouble *, + xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +int BLASFUNC(sge2mm)(char *, char *, char *, blasint *, blasint *, + float *, float *, blasint *, float *, blasint *, + float *, float *, blasint *); +int BLASFUNC(dge2mm)(char *, char *, char *, blasint *, blasint *, + double *, double *, blasint *, double *, blasint *, + double *, double *, blasint *); +int BLASFUNC(cge2mm)(char *, char *, char *, blasint *, blasint *, + float *, float *, blasint *, float *, blasint *, + float *, float *, blasint *); +int BLASFUNC(zge2mm)(char *, char *, char *, blasint *, blasint *, + double *, double *, blasint *, double *, blasint *, + double *, double *, blasint *); + +void BLASFUNC(strsm)(char *, char *, char *, char *, blasint *, blasint *, + float *, float *, blasint *, float *, blasint *); +void BLASFUNC(dtrsm)(char *, char *, char *, char *, blasint *, blasint *, + double *, double *, blasint *, double *, blasint *); +void BLASFUNC(qtrsm)(char *, char *, char *, char *, blasint *, blasint *, + xdouble *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(ctrsm)(char *, char *, char *, char *, blasint *, blasint *, + float *, float *, blasint *, float *, blasint *); +void BLASFUNC(ztrsm)(char *, char *, char *, char *, blasint *, blasint *, + double *, double *, blasint *, double *, blasint *); +void BLASFUNC(xtrsm)(char *, char *, char *, char *, blasint *, blasint *, + xdouble *, xdouble *, blasint *, xdouble *, blasint *); + +void BLASFUNC(strmm)(char *, char *, char *, char *, blasint *, blasint *, + float *, float *, blasint *, float *, blasint *); +void BLASFUNC(dtrmm)(char *, char *, char *, char *, blasint *, blasint *, + double *, double *, blasint *, double *, blasint *); +void BLASFUNC(qtrmm)(char *, char *, char *, char *, blasint *, blasint *, + xdouble *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(ctrmm)(char *, char *, char *, char *, blasint *, blasint *, + float *, float *, blasint *, float *, blasint *); +void BLASFUNC(ztrmm)(char *, char *, char *, char *, blasint *, blasint *, + double *, double *, blasint *, double *, blasint *); +void BLASFUNC(xtrmm)(char *, char *, char *, char *, blasint *, blasint *, + xdouble *, xdouble *, blasint *, xdouble *, blasint *); + +void BLASFUNC(ssymm)(char *, char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(dsymm)(char *, char *, blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(qsymm)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(csymm)(char *, char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zsymm)(char *, char *, blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(xsymm)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +void BLASFUNC(csymm3m)(char *, char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zsymm3m)(char *, char *, blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(xsymm3m)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +void BLASFUNC(ssyrk)(char *, char *, blasint *, blasint *, float *, float *, blasint *, + float *, float *, blasint *); +void BLASFUNC(dsyrk)(char *, char *, blasint *, blasint *, double *, double *, blasint *, + double *, double *, blasint *); +void BLASFUNC(qsyrk)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, xdouble *, blasint *); +void BLASFUNC(csyrk)(char *, char *, blasint *, blasint *, float *, float *, blasint *, + float *, float *, blasint *); +void BLASFUNC(zsyrk)(char *, char *, blasint *, blasint *, double *, double *, blasint *, + double *, double *, blasint *); +void BLASFUNC(xsyrk)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, xdouble *, blasint *); + +void BLASFUNC(ssyr2k)(char *, char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(dsyr2k)(char *, char *, blasint *, blasint *, double *, double *, blasint *, + double*, blasint *, double *, double *, blasint *); +void BLASFUNC(qsyr2k)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble*, blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(csyr2k)(char *, char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zsyr2k)(char *, char *, blasint *, blasint *, double *, double *, blasint *, + double*, blasint *, double *, double *, blasint *); +void BLASFUNC(xsyr2k)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble*, blasint *, xdouble *, xdouble *, blasint *); + +void BLASFUNC(chemm)(char *, char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zhemm)(char *, char *, blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(xhemm)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +void BLASFUNC(chemm3m)(char *, char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zhemm3m)(char *, char *, blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(xhemm3m)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +void BLASFUNC(cherk)(char *, char *, blasint *, blasint *, float *, float *, blasint *, + float *, float *, blasint *); +void BLASFUNC(zherk)(char *, char *, blasint *, blasint *, double *, double *, blasint *, + double *, double *, blasint *); +void BLASFUNC(xherk)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, xdouble *, blasint *); + +void BLASFUNC(cher2k)(char *, char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zher2k)(char *, char *, blasint *, blasint *, double *, double *, blasint *, + double*, blasint *, double *, double *, blasint *); +void BLASFUNC(xher2k)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble*, blasint *, xdouble *, xdouble *, blasint *); + +int BLASFUNC(cher2m)(char *, char *, char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +int BLASFUNC(zher2m)(char *, char *, char *, blasint *, blasint *, double *, double *, blasint *, + double*, blasint *, double *, double *, blasint *); +int BLASFUNC(xher2m)(char *, char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble*, blasint *, xdouble *, xdouble *, blasint *); + +int BLASFUNC(sgemt)(char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *); +int BLASFUNC(dgemt)(char *, blasint *, blasint *, double *, double *, blasint *, + double *, blasint *); +int BLASFUNC(cgemt)(char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *); +int BLASFUNC(zgemt)(char *, blasint *, blasint *, double *, double *, blasint *, + double *, blasint *); + +int BLASFUNC(sgema)(char *, char *, blasint *, blasint *, float *, + float *, blasint *, float *, float *, blasint *, float *, blasint *); +int BLASFUNC(dgema)(char *, char *, blasint *, blasint *, double *, + double *, blasint *, double*, double *, blasint *, double*, blasint *); +int BLASFUNC(cgema)(char *, char *, blasint *, blasint *, float *, + float *, blasint *, float *, float *, blasint *, float *, blasint *); +int BLASFUNC(zgema)(char *, char *, blasint *, blasint *, double *, + double *, blasint *, double*, double *, blasint *, double*, blasint *); + +int BLASFUNC(sgems)(char *, char *, blasint *, blasint *, float *, + float *, blasint *, float *, float *, blasint *, float *, blasint *); +int BLASFUNC(dgems)(char *, char *, blasint *, blasint *, double *, + double *, blasint *, double*, double *, blasint *, double*, blasint *); +int BLASFUNC(cgems)(char *, char *, blasint *, blasint *, float *, + float *, blasint *, float *, float *, blasint *, float *, blasint *); +int BLASFUNC(zgems)(char *, char *, blasint *, blasint *, double *, + double *, blasint *, double*, double *, blasint *, double*, blasint *); + +int BLASFUNC(sgemc)(char *, char *, blasint *, blasint *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, blasint *, float *, float *, blasint *); +int BLASFUNC(dgemc)(char *, char *, blasint *, blasint *, blasint *, double *, + double *, blasint *, double *, blasint *, double *, blasint *, double *, double *, blasint *); +int BLASFUNC(qgemc)(char *, char *, blasint *, blasint *, blasint *, xdouble *, + xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); +int BLASFUNC(cgemc)(char *, char *, blasint *, blasint *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, blasint *, float *, float *, blasint *); +int BLASFUNC(zgemc)(char *, char *, blasint *, blasint *, blasint *, double *, + double *, blasint *, double *, blasint *, double *, blasint *, double *, double *, blasint *); +int BLASFUNC(xgemc)(char *, char *, blasint *, blasint *, blasint *, xdouble *, + xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +int BLASFUNC(sgetf2)(blasint *, blasint *, float *, blasint *, blasint *, blasint *); +int BLASFUNC(dgetf2)(blasint *, blasint *, double *, blasint *, blasint *, blasint *); +int BLASFUNC(qgetf2)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *); +int BLASFUNC(cgetf2)(blasint *, blasint *, float *, blasint *, blasint *, blasint *); +int BLASFUNC(zgetf2)(blasint *, blasint *, double *, blasint *, blasint *, blasint *); +int BLASFUNC(xgetf2)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *); + +int BLASFUNC(sgetrf)(blasint *, blasint *, float *, blasint *, blasint *, blasint *); +int BLASFUNC(dgetrf)(blasint *, blasint *, double *, blasint *, blasint *, blasint *); +int BLASFUNC(qgetrf)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *); +int BLASFUNC(cgetrf)(blasint *, blasint *, float *, blasint *, blasint *, blasint *); +int BLASFUNC(zgetrf)(blasint *, blasint *, double *, blasint *, blasint *, blasint *); +int BLASFUNC(xgetrf)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *); + +int BLASFUNC(slaswp)(blasint *, float *, blasint *, blasint *, blasint *, blasint *, blasint *); +int BLASFUNC(dlaswp)(blasint *, double *, blasint *, blasint *, blasint *, blasint *, blasint *); +int BLASFUNC(qlaswp)(blasint *, xdouble *, blasint *, blasint *, blasint *, blasint *, blasint *); +int BLASFUNC(claswp)(blasint *, float *, blasint *, blasint *, blasint *, blasint *, blasint *); +int BLASFUNC(zlaswp)(blasint *, double *, blasint *, blasint *, blasint *, blasint *, blasint *); +int BLASFUNC(xlaswp)(blasint *, xdouble *, blasint *, blasint *, blasint *, blasint *, blasint *); + +int BLASFUNC(sgetrs)(char *, blasint *, blasint *, float *, blasint *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(dgetrs)(char *, blasint *, blasint *, double *, blasint *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(qgetrs)(char *, blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble *, blasint *, blasint *); +int BLASFUNC(cgetrs)(char *, blasint *, blasint *, float *, blasint *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(zgetrs)(char *, blasint *, blasint *, double *, blasint *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(xgetrs)(char *, blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble *, blasint *, blasint *); + +int BLASFUNC(sgesv)(blasint *, blasint *, float *, blasint *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(dgesv)(blasint *, blasint *, double *, blasint *, blasint *, double*, blasint *, blasint *); +int BLASFUNC(qgesv)(blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble*, blasint *, blasint *); +int BLASFUNC(cgesv)(blasint *, blasint *, float *, blasint *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(zgesv)(blasint *, blasint *, double *, blasint *, blasint *, double*, blasint *, blasint *); +int BLASFUNC(xgesv)(blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble*, blasint *, blasint *); + +int BLASFUNC(spotf2)(char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(dpotf2)(char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(qpotf2)(char *, blasint *, xdouble *, blasint *, blasint *); +int BLASFUNC(cpotf2)(char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(zpotf2)(char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(xpotf2)(char *, blasint *, xdouble *, blasint *, blasint *); + +int BLASFUNC(spotrf)(char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(dpotrf)(char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(qpotrf)(char *, blasint *, xdouble *, blasint *, blasint *); +int BLASFUNC(cpotrf)(char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(zpotrf)(char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(xpotrf)(char *, blasint *, xdouble *, blasint *, blasint *); + +int BLASFUNC(slauu2)(char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(dlauu2)(char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(qlauu2)(char *, blasint *, xdouble *, blasint *, blasint *); +int BLASFUNC(clauu2)(char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(zlauu2)(char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(xlauu2)(char *, blasint *, xdouble *, blasint *, blasint *); + +int BLASFUNC(slauum)(char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(dlauum)(char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(qlauum)(char *, blasint *, xdouble *, blasint *, blasint *); +int BLASFUNC(clauum)(char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(zlauum)(char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(xlauum)(char *, blasint *, xdouble *, blasint *, blasint *); + +int BLASFUNC(strti2)(char *, char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(dtrti2)(char *, char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(qtrti2)(char *, char *, blasint *, xdouble *, blasint *, blasint *); +int BLASFUNC(ctrti2)(char *, char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(ztrti2)(char *, char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(xtrti2)(char *, char *, blasint *, xdouble *, blasint *, blasint *); + +int BLASFUNC(strtri)(char *, char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(dtrtri)(char *, char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(qtrtri)(char *, char *, blasint *, xdouble *, blasint *, blasint *); +int BLASFUNC(ctrtri)(char *, char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(ztrtri)(char *, char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(xtrtri)(char *, char *, blasint *, xdouble *, blasint *, blasint *); + +int BLASFUNC(spotri)(char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(dpotri)(char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(qpotri)(char *, blasint *, xdouble *, blasint *, blasint *); +int BLASFUNC(cpotri)(char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(zpotri)(char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(xpotri)(char *, blasint *, xdouble *, blasint *, blasint *); + +int BLASFUNC(slarf)(char *, blasint *, blasint *, float *, blasint *, float *, float *, blasint *, float *); +int BLASFUNC(dlarf)(char *, blasint *, blasint *, double *, blasint *, double *, double *, blasint *, double *); +int BLASFUNC(qlarf)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *, xdouble *); +int BLASFUNC(clarf)(char *, blasint *, blasint *, float *, blasint *, float *, float *, blasint *, float *); +int BLASFUNC(zlarf)(char *, blasint *, blasint *, double *, blasint *, double *, double *, blasint *, double *); +int BLASFUNC(xlarf)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *, xdouble *); + +FLOATRET BLASFUNC(slamch)(char *); +double BLASFUNC(dlamch)(char *); +xdouble BLASFUNC(qlamch)(char *); + +FLOATRET BLASFUNC(slamc3)(float *, float *); +double BLASFUNC(dlamc3)(double *, double *); +xdouble BLASFUNC(qlamc3)(xdouble *, xdouble *); +#endif diff --git a/common_lapack.h b/common_lapack.h new file mode 100644 index 0000000..f6d1956 --- /dev/null +++ b/common_lapack.h @@ -0,0 +1,296 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef ASSEMBLER + +/* Lapack Library */ + +blasint sgetf2_k(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dgetf2_k(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qgetf2_k(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint cgetf2_k(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint zgetf2_k(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xgetf2_k(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +blasint sgetrf_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dgetrf_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qgetrf_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint cgetrf_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint zgetrf_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xgetrf_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +blasint sgetrf_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dgetrf_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qgetrf_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint cgetrf_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint zgetrf_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xgetrf_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int slaswp_plus (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, blasint *, BLASLONG); +int slaswp_minus(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, blasint *, BLASLONG); +int dlaswp_plus (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, blasint *, BLASLONG); +int dlaswp_minus(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, blasint *, BLASLONG); +int qlaswp_plus (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, blasint *, BLASLONG); +int qlaswp_minus(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, blasint *, BLASLONG); + +int claswp_plus (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, blasint *, BLASLONG); +int claswp_minus(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, blasint *, BLASLONG); +int zlaswp_plus (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, blasint *, BLASLONG); +int zlaswp_minus(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, blasint *, BLASLONG); +int xlaswp_plus (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, blasint *, BLASLONG); +int xlaswp_minus(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, blasint *, BLASLONG); + +int slaswp_ncopy(BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); +int dlaswp_ncopy(BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); +int qlaswp_ncopy(BLASLONG, BLASLONG, BLASLONG, xdouble *, BLASLONG, blasint *, xdouble *); +int claswp_ncopy(BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); +int zlaswp_ncopy(BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); +int xlaswp_ncopy(BLASLONG, BLASLONG, BLASLONG, xdouble *, BLASLONG, blasint *, xdouble *); + +blasint sgetrs_N_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint sgetrs_T_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dgetrs_N_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dgetrs_T_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qgetrs_N_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qgetrs_T_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint cgetrs_N_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint cgetrs_T_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint cgetrs_R_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint cgetrs_C_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint zgetrs_N_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint zgetrs_T_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint zgetrs_R_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint zgetrs_C_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xgetrs_N_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xgetrs_T_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xgetrs_R_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xgetrs_C_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +blasint sgetrs_N_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint sgetrs_T_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dgetrs_N_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dgetrs_T_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qgetrs_N_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qgetrs_T_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint cgetrs_N_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint cgetrs_T_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint cgetrs_R_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint cgetrs_C_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint zgetrs_N_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint zgetrs_T_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint zgetrs_R_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint zgetrs_C_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xgetrs_N_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xgetrs_T_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xgetrs_R_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xgetrs_C_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +blasint spotf2_U(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint spotf2_L(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dpotf2_U(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dpotf2_L(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qpotf2_U(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qpotf2_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint cpotf2_U(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint cpotf2_L(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint zpotf2_U(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint zpotf2_L(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xpotf2_U(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xpotf2_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +blasint spotrf_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint spotrf_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dpotrf_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dpotrf_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qpotrf_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qpotrf_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint cpotrf_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint cpotrf_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint zpotrf_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint zpotrf_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xpotrf_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xpotrf_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +blasint spotrf_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint spotrf_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dpotrf_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dpotrf_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qpotrf_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qpotrf_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint cpotrf_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint cpotrf_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint zpotrf_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint zpotrf_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xpotrf_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xpotrf_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +blasint slauu2_U(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint slauu2_L(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dlauu2_U(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dlauu2_L(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qlauu2_U(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qlauu2_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint clauu2_U(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint clauu2_L(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint zlauu2_U(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint zlauu2_L(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xlauu2_U(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xlauu2_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +blasint slauum_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint slauum_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dlauum_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dlauum_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qlauum_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qlauum_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint clauum_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint clauum_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint zlauum_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint zlauum_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xlauum_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xlauum_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +blasint slauum_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint slauum_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dlauum_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dlauum_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qlauum_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qlauum_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint clauum_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint clauum_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint zlauum_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint zlauum_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xlauum_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xlauum_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +blasint strti2_UU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strti2_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strti2_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strti2_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dtrti2_UU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrti2_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrti2_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrti2_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qtrti2_UU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrti2_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrti2_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrti2_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint ctrti2_UU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrti2_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrti2_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrti2_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ztrti2_UU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrti2_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrti2_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrti2_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xtrti2_UU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrti2_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrti2_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrti2_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +blasint strtri_UU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strtri_UN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strtri_LU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strtri_LN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dtrtri_UU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrtri_UN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrtri_LU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrtri_LN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qtrtri_UU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrtri_UN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrtri_LU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrtri_LN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint ctrtri_UU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtri_UN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtri_LU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtri_LN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ztrtri_UU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtri_UN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtri_LU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtri_LN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xtrtri_UU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtri_UN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtri_LU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtri_LN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +blasint strtri_UU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strtri_UN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strtri_LU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strtri_LN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dtrtri_UU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrtri_UN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrtri_LU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrtri_LN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qtrtri_UU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrtri_UN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrtri_LU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrtri_LN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint ctrtri_UU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtri_UN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtri_LU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtri_LN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ztrtri_UU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtri_UN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtri_LU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtri_LN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xtrtri_UU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtri_UN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtri_LU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtri_LN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int sneg_tcopy(BLASLONG, BLASLONG, float *, BLASLONG, float *); +int dneg_tcopy(BLASLONG, BLASLONG, double *, BLASLONG, double *); +int qneg_tcopy(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); +int cneg_tcopy(BLASLONG, BLASLONG, float *, BLASLONG, float *); +int zneg_tcopy(BLASLONG, BLASLONG, double *, BLASLONG, double *); +int xneg_tcopy(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + +blasint slarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint slarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dlarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qlarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint clarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint clarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint zlarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint zlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xlarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +#endif diff --git a/common_level1.h b/common_level1.h new file mode 100644 index 0000000..f51ced6 --- /dev/null +++ b/common_level1.h @@ -0,0 +1,212 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef ASSEMBLER + +#ifdef __CUDACC__ +extern "C" { +#endif + +float sdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); +double dsdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); +double ddot_k(BLASLONG, double *, BLASLONG, double *, BLASLONG); +xdouble qdot_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + +float _Complex cdotc_k (BLASLONG, float *, BLASLONG, float *, BLASLONG); +float _Complex cdotu_k (BLASLONG, float *, BLASLONG, float *, BLASLONG); +double _Complex zdotc_k (BLASLONG, double *, BLASLONG, double *, BLASLONG); +double _Complex zdotu_k (BLASLONG, double *, BLASLONG, double *, BLASLONG); +xdouble _Complex xdotc_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); +xdouble _Complex xdotu_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + +int saxpy_k (BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); +int daxpy_k (BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); +int qaxpy_k (BLASLONG, BLASLONG, BLASLONG, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); +int caxpy_k (BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); +int zaxpy_k (BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); +int xaxpy_k (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); +int caxpyc_k (BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); +int zaxpyc_k (BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); +int xaxpyc_k (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + +int scopy_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); +int dcopy_k(BLASLONG, double *, BLASLONG, double *, BLASLONG); +int qcopy_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); +int ccopy_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); +int zcopy_k(BLASLONG, double *, BLASLONG, double *, BLASLONG); +int xcopy_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + +int sswap_k (BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); +int dswap_k (BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, double *, BLASLONG, double*, BLASLONG); +int qswap_k (BLASLONG, BLASLONG, BLASLONG, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble*, BLASLONG); +int cswap_k (BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); +int zswap_k (BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, double*, BLASLONG); +int xswap_k (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble*, BLASLONG); + +float sasum_k (BLASLONG, float *, BLASLONG); +double dasum_k (BLASLONG, double *, BLASLONG); +xdouble qasum_k (BLASLONG, xdouble *, BLASLONG); +float casum_k (BLASLONG, float *, BLASLONG); +double zasum_k (BLASLONG, double *, BLASLONG); +xdouble xasum_k (BLASLONG, xdouble *, BLASLONG); + +float samax_k (BLASLONG, float *, BLASLONG); +double damax_k (BLASLONG, double *, BLASLONG); +xdouble qamax_k (BLASLONG, xdouble *, BLASLONG); +float camax_k (BLASLONG, float *, BLASLONG); +double zamax_k (BLASLONG, double *, BLASLONG); +xdouble xamax_k (BLASLONG, xdouble *, BLASLONG); + +float samin_k (BLASLONG, float *, BLASLONG); +double damin_k (BLASLONG, double *, BLASLONG); +xdouble qamin_k (BLASLONG, xdouble *, BLASLONG); +float camin_k (BLASLONG, float *, BLASLONG); +double zamin_k (BLASLONG, double *, BLASLONG); +xdouble xamin_k (BLASLONG, xdouble *, BLASLONG); + +BLASLONG isamax_k(BLASLONG, float *, BLASLONG); +BLASLONG idamax_k(BLASLONG, double *, BLASLONG); +BLASLONG iqamax_k(BLASLONG, xdouble *, BLASLONG); +BLASLONG icamax_k(BLASLONG, float *, BLASLONG); +BLASLONG izamax_k(BLASLONG, double *, BLASLONG); +BLASLONG ixamax_k(BLASLONG, xdouble *, BLASLONG); + +BLASLONG isamin_k(BLASLONG, float *, BLASLONG); +BLASLONG idamin_k(BLASLONG, double *, BLASLONG); +BLASLONG iqamin_k(BLASLONG, xdouble *, BLASLONG); +BLASLONG icamin_k(BLASLONG, float *, BLASLONG); +BLASLONG izamin_k(BLASLONG, double *, BLASLONG); +BLASLONG ixamin_k(BLASLONG, xdouble *, BLASLONG); + +float smax_k (BLASLONG, float *, BLASLONG); +double dmax_k (BLASLONG, double *, BLASLONG); +xdouble qmax_k (BLASLONG, xdouble *, BLASLONG); +float cmax_k (BLASLONG, float *, BLASLONG); +double zmax_k (BLASLONG, double *, BLASLONG); +xdouble xmax_k (BLASLONG, xdouble *, BLASLONG); + +float smin_k (BLASLONG, float *, BLASLONG); +double dmin_k (BLASLONG, double *, BLASLONG); +xdouble qmin_k (BLASLONG, xdouble *, BLASLONG); +float cmin_k (BLASLONG, float *, BLASLONG); +double zmin_k (BLASLONG, double *, BLASLONG); +xdouble xmin_k (BLASLONG, xdouble *, BLASLONG); + +BLASLONG ismax_k(BLASLONG, float *, BLASLONG); +BLASLONG idmax_k(BLASLONG, double *, BLASLONG); +BLASLONG iqmax_k(BLASLONG, xdouble *, BLASLONG); +BLASLONG icmax_k(BLASLONG, float *, BLASLONG); +BLASLONG izmax_k(BLASLONG, double *, BLASLONG); +BLASLONG ixmax_k(BLASLONG, xdouble *, BLASLONG); + +BLASLONG ismin_k(BLASLONG, float *, BLASLONG); +BLASLONG idmin_k(BLASLONG, double *, BLASLONG); +BLASLONG iqmin_k(BLASLONG, xdouble *, BLASLONG); +BLASLONG icmin_k(BLASLONG, float *, BLASLONG); +BLASLONG izmin_k(BLASLONG, double *, BLASLONG); +BLASLONG ixmin_k(BLASLONG, xdouble *, BLASLONG); + +int sscal_k(BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); +int dscal_k(BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); +int qscal_k(BLASLONG, BLASLONG, BLASLONG, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); +int cscal_k(BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); +int zscal_k(BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); +int xscal_k(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); +int csscal_k(BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); +int zdscal_k(BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); +int xqscal_k(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + +float snrm2_k(BLASLONG, float *, BLASLONG); +double dnrm2_k(BLASLONG, double *, BLASLONG); +xdouble qnrm2_k(BLASLONG, xdouble *, BLASLONG); +float cnrm2_k(BLASLONG, float *, BLASLONG); +double znrm2_k(BLASLONG, double *, BLASLONG); +xdouble xnrm2_k(BLASLONG, xdouble *, BLASLONG); + +int srot_k (BLASLONG, float *, BLASLONG, float *, BLASLONG, float , float ); +int drot_k (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); +int qrot_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); +int csrot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG, float , float ); +int zdrot_k(BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); +int xqrot_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); + +int srotg_k(float *, float *, float *, float *); +int drotg_k(double *, double *, double *, double *); +int qrotg_k(xdouble *, xdouble *, xdouble *, xdouble *); +int csrotg_k(float *, float *, float *, float *); +int zdrotg_k(double *, double *, double *, double *); +int xqrotg_k(xdouble *, xdouble *, xdouble *, xdouble *); + +int srotmg_k(float *, float *, float *, float *, float *); +int drotmg_k(double *, double *, double *, double *, double *); +int qrotmg_k(xdouble *, xdouble *, xdouble *, xdouble *, xdouble *); + +int srotm_k (BLASLONG, float, BLASLONG, float, BLASLONG, float); +int drotm_k (BLASLONG, double, BLASLONG, double, BLASLONG, double); +int qrotm_k (BLASLONG, xdouble, BLASLONG, xdouble, BLASLONG, xdouble); + +#ifdef __CUDACC__ +} +#endif + +#endif + diff --git a/common_level2.h b/common_level2.h new file mode 100644 index 0000000..2ab682a --- /dev/null +++ b/common_level2.h @@ -0,0 +1,1359 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef ASSEMBLER + +/* Level 2 Blas routines */ + +#ifdef __CUDACC__ +extern "C" { +#endif + +int sger_k (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int dger_k (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int qger_k (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int cgeru_k(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int cgerc_k(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int cgerv_k(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int cgerd_k(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int zgeru_k(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int zgerc_k(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int zgerv_k(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int zgerd_k(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int xgeru_k(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xgerc_k(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xgerv_k(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xgerd_k(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + +int sger_thread (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int dger_thread (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int qger_thread (BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int cger_thread_U(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int cger_thread_C(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int cger_thread_V(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int cger_thread_D(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int zger_thread_U(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zger_thread_C(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zger_thread_V(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zger_thread_D(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int xger_thread_U(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xger_thread_C(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xger_thread_V(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xger_thread_D(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); + +int sgemv_n(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); +int sgemv_t(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); +int dgemv_n(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); +int dgemv_t(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); +int qgemv_n(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); +int qgemv_t(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); + +int cgemv_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); +int cgemv_t(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); +int cgemv_r(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); +int cgemv_c(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); +int cgemv_o(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); +int cgemv_u(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); +int cgemv_s(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); +int cgemv_d(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); + +int zgemv_n(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); +int zgemv_t(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); +int zgemv_r(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); +int zgemv_c(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); +int zgemv_o(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); +int zgemv_u(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); +int zgemv_s(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); +int zgemv_d(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); + +int xgemv_n(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); +int xgemv_t(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); +int xgemv_r(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); +int xgemv_c(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); +int xgemv_o(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); +int xgemv_u(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); +int xgemv_s(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); +int xgemv_d(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); + +int sgemv_thread_n(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int sgemv_thread_t(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int dgemv_thread_n(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int dgemv_thread_t(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int qgemv_thread_n(BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int qgemv_thread_t(BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); + +int cgemv_thread_n(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int cgemv_thread_t(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int cgemv_thread_r(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int cgemv_thread_c(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int cgemv_thread_o(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int cgemv_thread_u(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int cgemv_thread_s(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int cgemv_thread_d(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); + +int zgemv_thread_n(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int zgemv_thread_t(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int zgemv_thread_r(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int zgemv_thread_c(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int zgemv_thread_o(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int zgemv_thread_u(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int zgemv_thread_s(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int zgemv_thread_d(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); + +int xgemv_thread_n(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int xgemv_thread_t(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int xgemv_thread_r(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int xgemv_thread_c(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int xgemv_thread_o(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int xgemv_thread_u(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int xgemv_thread_s(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int xgemv_thread_d(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); + +int strsv_NUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int strsv_NUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int strsv_NLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int strsv_NLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int strsv_TUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int strsv_TUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int strsv_TLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int strsv_TLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); + +int dtrsv_NUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtrsv_NUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtrsv_NLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtrsv_NLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtrsv_TUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtrsv_TUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtrsv_TLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtrsv_TLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); + +int qtrsv_NUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtrsv_NUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtrsv_NLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtrsv_NLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtrsv_TUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtrsv_TUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtrsv_TLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtrsv_TLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); + +int ctrsv_NUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_NUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_NLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_NLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_TUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_TUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_TLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_TLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_RUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_RUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_RLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_RLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_CUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_CUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_CLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_CLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); + +int ztrsv_NUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_NUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_NLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_NLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_TUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_TUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_TLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_TLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_RUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_RUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_RLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_RLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_CUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_CUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_CLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_CLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); + +int xtrsv_NUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_NUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_NLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_NLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_TUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_TUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_TLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_TLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_RUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_RUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_RLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_RLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_CUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_CUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_CLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_CLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); + +int strmv_NUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int strmv_NUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int strmv_NLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int strmv_NLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int strmv_TUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int strmv_TUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int strmv_TLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int strmv_TLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + +int dtrmv_NUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int dtrmv_NUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int dtrmv_NLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int dtrmv_NLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int dtrmv_TUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int dtrmv_TUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int dtrmv_TLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int dtrmv_TLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + +int qtrmv_NUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int qtrmv_NUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int qtrmv_NLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int qtrmv_NLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int qtrmv_TUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int qtrmv_TUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int qtrmv_TLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int qtrmv_TLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + +int ctrmv_NUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_NUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_NLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_NLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_TUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_TUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_TLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_TLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_RUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_RUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_RLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_RLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_CUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_CUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_CLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_CLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + +int ztrmv_NUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_NUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_NLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_NLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_TUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_TUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_TLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_TLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_RUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_RUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_RLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_RLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_CUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_CUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_CLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_CLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + +int xtrmv_NUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_NUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_NLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_NLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_TUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_TUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_TLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_TLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_RUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_RUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_RLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_RLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_CUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_CUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_CLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_CLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + +int strmv_thread_NUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int strmv_thread_NUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int strmv_thread_NLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int strmv_thread_NLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int strmv_thread_TUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int strmv_thread_TUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int strmv_thread_TLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int strmv_thread_TLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); + +int dtrmv_thread_NUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dtrmv_thread_NUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dtrmv_thread_NLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dtrmv_thread_NLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dtrmv_thread_TUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dtrmv_thread_TUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dtrmv_thread_TLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dtrmv_thread_TLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); + +int qtrmv_thread_NUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qtrmv_thread_NUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qtrmv_thread_NLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qtrmv_thread_NLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qtrmv_thread_TUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qtrmv_thread_TUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qtrmv_thread_TLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qtrmv_thread_TLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); + +int ctrmv_thread_NUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctrmv_thread_NUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctrmv_thread_NLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctrmv_thread_NLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctrmv_thread_TUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctrmv_thread_TUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctrmv_thread_TLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctrmv_thread_TLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctrmv_thread_RUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctrmv_thread_RUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctrmv_thread_RLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctrmv_thread_RLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctrmv_thread_CUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctrmv_thread_CUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctrmv_thread_CLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctrmv_thread_CLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); + +int ztrmv_thread_NUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztrmv_thread_NUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztrmv_thread_NLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztrmv_thread_NLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztrmv_thread_TUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztrmv_thread_TUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztrmv_thread_TLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztrmv_thread_TLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztrmv_thread_RUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztrmv_thread_RUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztrmv_thread_RLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztrmv_thread_RLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztrmv_thread_CUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztrmv_thread_CUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztrmv_thread_CLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztrmv_thread_CLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); + +int xtrmv_thread_NUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtrmv_thread_NUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtrmv_thread_NLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtrmv_thread_NLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtrmv_thread_TUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtrmv_thread_TUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtrmv_thread_TLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtrmv_thread_TLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtrmv_thread_RUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtrmv_thread_RUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtrmv_thread_RLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtrmv_thread_RLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtrmv_thread_CUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtrmv_thread_CUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtrmv_thread_CLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtrmv_thread_CLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); + +int stpsv_NUU(BLASLONG, float *, float *, BLASLONG, void *); +int stpsv_NUN(BLASLONG, float *, float *, BLASLONG, void *); +int stpsv_NLU(BLASLONG, float *, float *, BLASLONG, void *); +int stpsv_NLN(BLASLONG, float *, float *, BLASLONG, void *); +int stpsv_TUU(BLASLONG, float *, float *, BLASLONG, void *); +int stpsv_TUN(BLASLONG, float *, float *, BLASLONG, void *); +int stpsv_TLU(BLASLONG, float *, float *, BLASLONG, void *); +int stpsv_TLN(BLASLONG, float *, float *, BLASLONG, void *); + +int dtpsv_NUU(BLASLONG, double *, double *, BLASLONG, void *); +int dtpsv_NUN(BLASLONG, double *, double *, BLASLONG, void *); +int dtpsv_NLU(BLASLONG, double *, double *, BLASLONG, void *); +int dtpsv_NLN(BLASLONG, double *, double *, BLASLONG, void *); +int dtpsv_TUU(BLASLONG, double *, double *, BLASLONG, void *); +int dtpsv_TUN(BLASLONG, double *, double *, BLASLONG, void *); +int dtpsv_TLU(BLASLONG, double *, double *, BLASLONG, void *); +int dtpsv_TLN(BLASLONG, double *, double *, BLASLONG, void *); + +int qtpsv_NUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int qtpsv_NUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int qtpsv_NLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int qtpsv_NLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int qtpsv_TUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int qtpsv_TUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int qtpsv_TLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int qtpsv_TLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); + +int ctpsv_NUU(BLASLONG, float *, float *, BLASLONG, void *); +int ctpsv_NUN(BLASLONG, float *, float *, BLASLONG, void *); +int ctpsv_NLU(BLASLONG, float *, float *, BLASLONG, void *); +int ctpsv_NLN(BLASLONG, float *, float *, BLASLONG, void *); +int ctpsv_TUU(BLASLONG, float *, float *, BLASLONG, void *); +int ctpsv_TUN(BLASLONG, float *, float *, BLASLONG, void *); +int ctpsv_TLU(BLASLONG, float *, float *, BLASLONG, void *); +int ctpsv_TLN(BLASLONG, float *, float *, BLASLONG, void *); +int ctpsv_RUU(BLASLONG, float *, float *, BLASLONG, void *); +int ctpsv_RUN(BLASLONG, float *, float *, BLASLONG, void *); +int ctpsv_RLU(BLASLONG, float *, float *, BLASLONG, void *); +int ctpsv_RLN(BLASLONG, float *, float *, BLASLONG, void *); +int ctpsv_CUU(BLASLONG, float *, float *, BLASLONG, void *); +int ctpsv_CUN(BLASLONG, float *, float *, BLASLONG, void *); +int ctpsv_CLU(BLASLONG, float *, float *, BLASLONG, void *); +int ctpsv_CLN(BLASLONG, float *, float *, BLASLONG, void *); + +int ztpsv_NUU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpsv_NUN(BLASLONG, double *, double *, BLASLONG, void *); +int ztpsv_NLU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpsv_NLN(BLASLONG, double *, double *, BLASLONG, void *); +int ztpsv_TUU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpsv_TUN(BLASLONG, double *, double *, BLASLONG, void *); +int ztpsv_TLU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpsv_TLN(BLASLONG, double *, double *, BLASLONG, void *); +int ztpsv_RUU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpsv_RUN(BLASLONG, double *, double *, BLASLONG, void *); +int ztpsv_RLU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpsv_RLN(BLASLONG, double *, double *, BLASLONG, void *); +int ztpsv_CUU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpsv_CUN(BLASLONG, double *, double *, BLASLONG, void *); +int ztpsv_CLU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpsv_CLN(BLASLONG, double *, double *, BLASLONG, void *); + +int xtpsv_NUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_NUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_NLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_NLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_TUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_TUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_TLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_TLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_RUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_RUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_RLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_RLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_CUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_CUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_CLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_CLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); + +int stpmv_NUU(BLASLONG, float *, float *, BLASLONG, void *); +int stpmv_NUN(BLASLONG, float *, float *, BLASLONG, void *); +int stpmv_NLU(BLASLONG, float *, float *, BLASLONG, void *); +int stpmv_NLN(BLASLONG, float *, float *, BLASLONG, void *); +int stpmv_TUU(BLASLONG, float *, float *, BLASLONG, void *); +int stpmv_TUN(BLASLONG, float *, float *, BLASLONG, void *); +int stpmv_TLU(BLASLONG, float *, float *, BLASLONG, void *); +int stpmv_TLN(BLASLONG, float *, float *, BLASLONG, void *); + +int dtpmv_NUU(BLASLONG, double *, double *, BLASLONG, void *); +int dtpmv_NUN(BLASLONG, double *, double *, BLASLONG, void *); +int dtpmv_NLU(BLASLONG, double *, double *, BLASLONG, void *); +int dtpmv_NLN(BLASLONG, double *, double *, BLASLONG, void *); +int dtpmv_TUU(BLASLONG, double *, double *, BLASLONG, void *); +int dtpmv_TUN(BLASLONG, double *, double *, BLASLONG, void *); +int dtpmv_TLU(BLASLONG, double *, double *, BLASLONG, void *); +int dtpmv_TLN(BLASLONG, double *, double *, BLASLONG, void *); + +int qtpmv_NUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int qtpmv_NUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int qtpmv_NLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int qtpmv_NLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int qtpmv_TUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int qtpmv_TUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int qtpmv_TLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int qtpmv_TLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); + +int ctpmv_NUU(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_NUN(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_NLU(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_NLN(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_TUU(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_TUN(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_TLU(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_TLN(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_RUU(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_RUN(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_RLU(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_RLN(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_CUU(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_CUN(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_CLU(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_CLN(BLASLONG, float *, float *, BLASLONG, void *); + +int ztpmv_NUU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_NUN(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_NLU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_NLN(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_TUU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_TUN(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_TLU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_TLN(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_RUU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_RUN(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_RLU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_RLN(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_CUU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_CUN(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_CLU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_CLN(BLASLONG, double *, double *, BLASLONG, void *); + +int xtpmv_NUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_NUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_NLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_NLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_TUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_TUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_TLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_TLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_RUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_RUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_RLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_RLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_CUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_CUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_CLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_CLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); + +int stpmv_thread_NUU(BLASLONG, float *, float *, BLASLONG, float *, int); +int stpmv_thread_NUN(BLASLONG, float *, float *, BLASLONG, float *, int); +int stpmv_thread_NLU(BLASLONG, float *, float *, BLASLONG, float *, int); +int stpmv_thread_NLN(BLASLONG, float *, float *, BLASLONG, float *, int); +int stpmv_thread_TUU(BLASLONG, float *, float *, BLASLONG, float *, int); +int stpmv_thread_TUN(BLASLONG, float *, float *, BLASLONG, float *, int); +int stpmv_thread_TLU(BLASLONG, float *, float *, BLASLONG, float *, int); +int stpmv_thread_TLN(BLASLONG, float *, float *, BLASLONG, float *, int); + +int dtpmv_thread_NUU(BLASLONG, double *, double *, BLASLONG, double *, int); +int dtpmv_thread_NUN(BLASLONG, double *, double *, BLASLONG, double *, int); +int dtpmv_thread_NLU(BLASLONG, double *, double *, BLASLONG, double *, int); +int dtpmv_thread_NLN(BLASLONG, double *, double *, BLASLONG, double *, int); +int dtpmv_thread_TUU(BLASLONG, double *, double *, BLASLONG, double *, int); +int dtpmv_thread_TUN(BLASLONG, double *, double *, BLASLONG, double *, int); +int dtpmv_thread_TLU(BLASLONG, double *, double *, BLASLONG, double *, int); +int dtpmv_thread_TLN(BLASLONG, double *, double *, BLASLONG, double *, int); + +int qtpmv_thread_NUU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int qtpmv_thread_NUN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int qtpmv_thread_NLU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int qtpmv_thread_NLN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int qtpmv_thread_TUU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int qtpmv_thread_TUN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int qtpmv_thread_TLU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int qtpmv_thread_TLN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); + +int ctpmv_thread_NUU(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_NUN(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_NLU(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_NLN(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_TUU(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_TUN(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_TLU(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_TLN(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_RUU(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_RUN(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_RLU(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_RLN(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_CUU(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_CUN(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_CLU(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_CLN(BLASLONG, float *, float *, BLASLONG, float *, int); + +int ztpmv_thread_NUU(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_NUN(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_NLU(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_NLN(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_TUU(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_TUN(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_TLU(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_TLN(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_RUU(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_RUN(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_RLU(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_RLN(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_CUU(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_CUN(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_CLU(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_CLN(BLASLONG, double *, double *, BLASLONG, double *, int); + +int xtpmv_thread_NUU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_NUN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_NLU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_NLN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_TUU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_TUN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_TLU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_TLN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_RUU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_RUN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_RLU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_RLN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_CUU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_CUN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_CLU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_CLN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); + +int ssymv_L(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ssymv_U(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int dsymv_L(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int dsymv_U(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int qsymv_L(BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int qsymv_U(BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int csymv_L(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int csymv_U(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int zsymv_L(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int zsymv_U(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int xsymv_L(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xsymv_U(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + +int ssymv_thread_L(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ssymv_thread_U(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int dsymv_thread_L(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dsymv_thread_U(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int qsymv_thread_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qsymv_thread_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int csymv_thread_L(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int csymv_thread_U(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int zsymv_thread_L(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zsymv_thread_U(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int xsymv_thread_L(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xsymv_thread_U(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); + +int chemv_thread_L(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int chemv_thread_U(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int chemv_thread_M(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int chemv_thread_V(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int zhemv_thread_L(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zhemv_thread_U(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zhemv_thread_M(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zhemv_thread_V(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int xhemv_thread_L(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xhemv_thread_U(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xhemv_thread_M(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xhemv_thread_V(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); + +int sspmv_L(BLASLONG, float, float *, float *, BLASLONG, float *, BLASLONG, void *); +int sspmv_U(BLASLONG, float, float *, float *, BLASLONG, float *, BLASLONG, void *); +int dspmv_L(BLASLONG, double, double *, double *, BLASLONG, double *, BLASLONG, void *); +int dspmv_U(BLASLONG, double, double *, double *, BLASLONG, double *, BLASLONG, void *); +int qspmv_L(BLASLONG, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qspmv_U(BLASLONG, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int cspmv_L(BLASLONG, float, float, float *, float *, BLASLONG, float *, BLASLONG, void *); +int cspmv_U(BLASLONG, float, float, float *, float *, BLASLONG, float *, BLASLONG, void *); +int zspmv_L(BLASLONG, double, double, double *, double *, BLASLONG, double *, BLASLONG, void *); +int zspmv_U(BLASLONG, double, double, double *, double *, BLASLONG, double *, BLASLONG, void *); +int xspmv_L(BLASLONG, xdouble, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xspmv_U(BLASLONG, xdouble, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); + +int sspmv_thread_L(BLASLONG, float, float *, float *, BLASLONG, float *, BLASLONG, float *, int); +int sspmv_thread_U(BLASLONG, float, float *, float *, BLASLONG, float *, BLASLONG, float *, int); +int dspmv_thread_L(BLASLONG, double, double *, double *, BLASLONG, double *, BLASLONG, double *, int); +int dspmv_thread_U(BLASLONG, double, double *, double *, BLASLONG, double *, BLASLONG, double *, int); +int qspmv_thread_L(BLASLONG, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qspmv_thread_U(BLASLONG, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int cspmv_thread_L(BLASLONG, float *, float *, float *, BLASLONG, float *, BLASLONG, float *, int); +int cspmv_thread_U(BLASLONG, float *, float *, float *, BLASLONG, float *, BLASLONG, float *, int); +int zspmv_thread_L(BLASLONG, double *, double *, double *, BLASLONG, double *, BLASLONG, double *, int); +int zspmv_thread_U(BLASLONG, double *, double *, double *, BLASLONG, double *, BLASLONG, double *, int); +int xspmv_thread_L(BLASLONG, xdouble *, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xspmv_thread_U(BLASLONG, xdouble *, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); + +int ssyr_L(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *); +int ssyr_U(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *); +int dsyr_L(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *); +int dsyr_U(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *); +int qsyr_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int qsyr_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int csyr_L(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *); +int csyr_U(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *); +int zsyr_L(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *); +int zsyr_U(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *); +int xsyr_L(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xsyr_U(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + +int ssyr_thread_L(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, int); +int ssyr_thread_U(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, int); +int dsyr_thread_L(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, int); +int dsyr_thread_U(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, int); +int qsyr_thread_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qsyr_thread_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int csyr_thread_L(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, int); +int csyr_thread_U(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, int); +int zsyr_thread_L(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, int); +int zsyr_thread_U(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, int); +int xsyr_thread_L(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xsyr_thread_U(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); + +int ssyr2_L(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ssyr2_U(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int dsyr2_L(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int dsyr2_U(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int qsyr2_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int qsyr2_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int csyr2_L(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int csyr2_U(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int zsyr2_L(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int zsyr2_U(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int xsyr2_L(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xsyr2_U(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + +int ssyr2_thread_L(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ssyr2_thread_U(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int dsyr2_thread_L(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dsyr2_thread_U(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int qsyr2_thread_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qsyr2_thread_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int csyr2_thread_L(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int csyr2_thread_U(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int zsyr2_thread_L(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zsyr2_thread_U(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int xsyr2_thread_L(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xsyr2_thread_U(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); + +int sspr_L(BLASLONG, float, float *, BLASLONG, float *, float *); +int sspr_U(BLASLONG, float, float *, BLASLONG, float *, float *); +int dspr_L(BLASLONG, double, double *, BLASLONG, double *, double *); +int dspr_U(BLASLONG, double, double *, BLASLONG, double *, double *); +int qspr_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *); +int qspr_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *); +int cspr_L(BLASLONG, float, float, float *, BLASLONG, float *, float *); +int cspr_U(BLASLONG, float, float, float *, BLASLONG, float *, float *); +int zspr_L(BLASLONG, double, double, double *, BLASLONG, double *, double *); +int zspr_U(BLASLONG, double, double, double *, BLASLONG, double *, double *); +int xspr_L(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *); +int xspr_U(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *); + +int sspr_thread_L(BLASLONG, float, float *, BLASLONG, float *, float *, int); +int sspr_thread_U(BLASLONG, float, float *, BLASLONG, float *, float *, int); +int dspr_thread_L(BLASLONG, double, double *, BLASLONG, double *, double *, int); +int dspr_thread_U(BLASLONG, double, double *, BLASLONG, double *, double *, int); +int qspr_thread_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *, int); +int qspr_thread_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *, int); +int cspr_thread_L(BLASLONG, float *, float *, BLASLONG, float *, float *, int); +int cspr_thread_U(BLASLONG, float *, float *, BLASLONG, float *, float *, int); +int zspr_thread_L(BLASLONG, double *, double *, BLASLONG, double *, double *, int); +int zspr_thread_U(BLASLONG, double *, double *, BLASLONG, double *, double *, int); +int xspr_thread_L(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, xdouble *, int); +int xspr_thread_U(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, xdouble *, int); + +int sspr2_L(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, float *); +int sspr2_U(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, float *); +int dspr2_L(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, double *); +int dspr2_U(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, double *); +int qspr2_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *); +int qspr2_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *); +int cspr2_L(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, float *); +int cspr2_U(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, float *); +int zspr2_L(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, double *); +int zspr2_U(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, double *); +int xspr2_L(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *); +int xspr2_U(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *); + +int sspr2_thread_L(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, float *, int); +int sspr2_thread_U(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, float *, int); +int dspr2_thread_L(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, double *, int); +int dspr2_thread_U(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, double *, int); +int qspr2_thread_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *, int); +int qspr2_thread_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *, int); +int cspr2_thread_L(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, float *, int); +int cspr2_thread_U(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, float *, int); +int zspr2_thread_L(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, double *, int); +int zspr2_thread_U(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, double *, int); +int xspr2_thread_L(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *, int); +int xspr2_thread_U(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *, int); + +int cher_L(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *); +int cher_U(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *); +int cher_V(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *); +int cher_M(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *); +int zher_L(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *); +int zher_U(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *); +int zher_V(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *); +int zher_M(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *); +int xher_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xher_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xher_V(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xher_M(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + +int cher_thread_L(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, int); +int cher_thread_U(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, int); +int cher_thread_V(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, int); +int cher_thread_M(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, int); +int zher_thread_L(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, int); +int zher_thread_U(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, int); +int zher_thread_V(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, int); +int zher_thread_M(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, int); +int xher_thread_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xher_thread_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xher_thread_V(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xher_thread_M(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); + +int cher2_L(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int cher2_U(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int cher2_M(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int cher2_V(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int zher2_L(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int zher2_U(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int zher2_M(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int zher2_V(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int xher2_L(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xher2_U(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xher2_M(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xher2_V(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + +int cher2_thread_L(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int cher2_thread_U(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int cher2_thread_M(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int cher2_thread_V(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int zher2_thread_L(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zher2_thread_U(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zher2_thread_M(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zher2_thread_V(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int xher2_thread_L(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xher2_thread_U(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xher2_thread_M(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xher2_thread_V(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); + +int chpr_L(BLASLONG, float, float *, BLASLONG, float *, float *); +int chpr_U(BLASLONG, float, float *, BLASLONG, float *, float *); +int chpr_M(BLASLONG, float, float *, BLASLONG, float *, float *); +int chpr_V(BLASLONG, float, float *, BLASLONG, float *, float *); +int zhpr_L(BLASLONG, double, double *, BLASLONG, double *, double *); +int zhpr_U(BLASLONG, double, double *, BLASLONG, double *, double *); +int zhpr_M(BLASLONG, double, double *, BLASLONG, double *, double *); +int zhpr_V(BLASLONG, double, double *, BLASLONG, double *, double *); +int xhpr_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *); +int xhpr_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *); +int xhpr_M(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *); +int xhpr_V(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *); + +int chpr_thread_L(BLASLONG, float, float *, BLASLONG, float *, float *, int); +int chpr_thread_U(BLASLONG, float, float *, BLASLONG, float *, float *, int); +int chpr_thread_M(BLASLONG, float, float *, BLASLONG, float *, float *, int); +int chpr_thread_V(BLASLONG, float, float *, BLASLONG, float *, float *, int); +int zhpr_thread_L(BLASLONG, double, double *, BLASLONG, double *, double *, int); +int zhpr_thread_U(BLASLONG, double, double *, BLASLONG, double *, double *, int); +int zhpr_thread_M(BLASLONG, double, double *, BLASLONG, double *, double *, int); +int zhpr_thread_V(BLASLONG, double, double *, BLASLONG, double *, double *, int); +int xhpr_thread_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *, int); +int xhpr_thread_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *, int); +int xhpr_thread_M(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *, int); +int xhpr_thread_V(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *, int); + +int chpr2_L(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, float *); +int chpr2_U(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, float *); +int chpr2_M(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, float *); +int chpr2_V(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, float *); +int zhpr2_L(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, double *); +int zhpr2_U(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, double *); +int zhpr2_M(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, double *); +int zhpr2_V(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, double *); +int xhpr2_L(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *); +int xhpr2_U(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *); +int xhpr2_M(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *); +int xhpr2_V(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *); + +int chpr2_thread_L(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, float *, int); +int chpr2_thread_U(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, float *, int); +int chpr2_thread_M(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, float *, int); +int chpr2_thread_V(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, float *, int); +int zhpr2_thread_L(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, double *, int); +int zhpr2_thread_U(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, double *, int); +int zhpr2_thread_M(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, double *, int); +int zhpr2_thread_V(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, double *, int); +int xhpr2_thread_L(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *, int); +int xhpr2_thread_U(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *, int); +int xhpr2_thread_M(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *, int); +int xhpr2_thread_V(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *, int); + +int chemv_L(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int chemv_U(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int chemv_M(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int chemv_V(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int zhemv_L(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int zhemv_U(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int zhemv_M(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int zhemv_V(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int xhemv_L(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xhemv_U(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xhemv_M(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xhemv_V(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + +int chpmv_L(BLASLONG, float, float, float *, float *, BLASLONG, float *, BLASLONG, void *); +int chpmv_U(BLASLONG, float, float, float *, float *, BLASLONG, float *, BLASLONG, void *); +int chpmv_M(BLASLONG, float, float, float *, float *, BLASLONG, float *, BLASLONG, void *); +int chpmv_V(BLASLONG, float, float, float *, float *, BLASLONG, float *, BLASLONG, void *); +int zhpmv_L(BLASLONG, double, double, double *, double *, BLASLONG, double *, BLASLONG, void *); +int zhpmv_U(BLASLONG, double, double, double *, double *, BLASLONG, double *, BLASLONG, void *); +int zhpmv_M(BLASLONG, double, double, double *, double *, BLASLONG, double *, BLASLONG, void *); +int zhpmv_V(BLASLONG, double, double, double *, double *, BLASLONG, double *, BLASLONG, void *); +int xhpmv_L(BLASLONG, xdouble, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xhpmv_U(BLASLONG, xdouble, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xhpmv_M(BLASLONG, xdouble, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xhpmv_V(BLASLONG, xdouble, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); + +int chpmv_thread_L(BLASLONG, float *, float *, float *, BLASLONG, float *, BLASLONG, float *, int); +int chpmv_thread_U(BLASLONG, float *, float *, float *, BLASLONG, float *, BLASLONG, float *, int); +int chpmv_thread_M(BLASLONG, float *, float *, float *, BLASLONG, float *, BLASLONG, float *, int); +int chpmv_thread_V(BLASLONG, float *, float *, float *, BLASLONG, float *, BLASLONG, float *, int); +int zhpmv_thread_L(BLASLONG, double *, double *, double *, BLASLONG, double *, BLASLONG, double *, int); +int zhpmv_thread_U(BLASLONG, double *, double *, double *, BLASLONG, double *, BLASLONG, double *, int); +int zhpmv_thread_M(BLASLONG, double *, double *, double *, BLASLONG, double *, BLASLONG, double *, int); +int zhpmv_thread_V(BLASLONG, double *, double *, double *, BLASLONG, double *, BLASLONG, double *, int); +int xhpmv_thread_L(BLASLONG, xdouble *, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xhpmv_thread_U(BLASLONG, xdouble *, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xhpmv_thread_M(BLASLONG, xdouble *, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xhpmv_thread_V(BLASLONG, xdouble *, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); + +int ssbmv_L(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ssbmv_U(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int dsbmv_L(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dsbmv_U(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int qsbmv_L(BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qsbmv_U(BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int csbmv_L(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int csbmv_U(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int zsbmv_L(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int zsbmv_U(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int xsbmv_L(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xsbmv_U(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); + +int chbmv_L(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int chbmv_U(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int chbmv_M(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int chbmv_V(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int zhbmv_L(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int zhbmv_U(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int zhbmv_M(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int zhbmv_V(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int xhbmv_L(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xhbmv_U(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xhbmv_M(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xhbmv_V(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); + + +int ssbmv_thread_L(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ssbmv_thread_U(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int dsbmv_thread_L(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dsbmv_thread_U(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int qsbmv_thread_L(BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qsbmv_thread_U(BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int csbmv_thread_L(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int csbmv_thread_U(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int zsbmv_thread_L(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zsbmv_thread_U(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int xsbmv_thread_L(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xsbmv_thread_U(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); + +int chbmv_thread_L(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int chbmv_thread_U(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int chbmv_thread_M(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int chbmv_thread_V(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int zhbmv_thread_L(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zhbmv_thread_U(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zhbmv_thread_M(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zhbmv_thread_V(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int xhbmv_thread_L(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xhbmv_thread_U(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xhbmv_thread_M(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xhbmv_thread_V(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); + +int snorm_n(BLASLONG, BLASLONG, float *a, BLASLONG); +int snorm_t(BLASLONG, BLASLONG, float *a, BLASLONG); +int dnorm_n(BLASLONG, BLASLONG, double *a, BLASLONG); +int dnorm_t(BLASLONG, BLASLONG, double *a, BLASLONG); +int cnorm_n(BLASLONG, BLASLONG, float *a, BLASLONG); +int cnorm_t(BLASLONG, BLASLONG, float *a, BLASLONG); +int znorm_n(BLASLONG, BLASLONG, double *a, BLASLONG); +int znorm_t(BLASLONG, BLASLONG, double *a, BLASLONG); + +void sgbmv_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); +void sgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); + +void dgbmv_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); +void dgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); + +void qgbmv_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); +void qgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); + +void cgbmv_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); +void cgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); +void cgbmv_r(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); +void cgbmv_c(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); +void cgbmv_o(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); +void cgbmv_u(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); +void cgbmv_s(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); +void cgbmv_d(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); + +void zgbmv_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); +void zgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); +void zgbmv_r(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); +void zgbmv_c(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); +void zgbmv_o(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); +void zgbmv_u(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); +void zgbmv_s(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); +void zgbmv_d(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); + +void xgbmv_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); +void xgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); +void xgbmv_r(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); +void xgbmv_c(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); +void xgbmv_o(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); +void xgbmv_u(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); +void xgbmv_s(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); +void xgbmv_d(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); + +int sgbmv_thread_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int sgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); + +int dgbmv_thread_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int dgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); + +int qgbmv_thread_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int qgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); + +int cgbmv_thread_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int cgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int cgbmv_thread_r(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int cgbmv_thread_c(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int cgbmv_thread_o(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int cgbmv_thread_u(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int cgbmv_thread_s(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int cgbmv_thread_d(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); + +int zgbmv_thread_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double *, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int zgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double *, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int zgbmv_thread_r(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double *, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int zgbmv_thread_c(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double *, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int zgbmv_thread_o(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double *, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int zgbmv_thread_u(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double *, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int zgbmv_thread_s(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double *, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int zgbmv_thread_d(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double *, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); + +int xgbmv_thread_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble *, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int xgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble *, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int xgbmv_thread_r(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble *, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int xgbmv_thread_c(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble *, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int xgbmv_thread_o(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble *, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int xgbmv_thread_u(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble *, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int xgbmv_thread_s(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble *, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int xgbmv_thread_d(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble *, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); + +int stbmv_NUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int stbmv_NUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int stbmv_NLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int stbmv_NLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int stbmv_TUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int stbmv_TUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int stbmv_TLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int stbmv_TLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); + +int dtbmv_NUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtbmv_NUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtbmv_NLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtbmv_NLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtbmv_TUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtbmv_TUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtbmv_TLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtbmv_TLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); + +int qtbmv_NUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtbmv_NUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtbmv_NLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtbmv_NLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtbmv_TUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtbmv_TUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtbmv_TLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtbmv_TLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); + +int ctbmv_NUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_NUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_NLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_NLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_TUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_TUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_TLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_TLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_RUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_RUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_RLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_RLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_CUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_CUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_CLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_CLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); + +int ztbmv_NUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_NUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_NLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_NLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_TUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_TUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_TLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_TLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_RUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_RUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_RLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_RLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_CUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_CUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_CLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_CLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); + +int xtbmv_NUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_NUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_NLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_NLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_TUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_TUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_TLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_TLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_RUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_RUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_RLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_RLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_CUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_CUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_CLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_CLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); + +int stbmv_thread_NUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int stbmv_thread_NUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int stbmv_thread_NLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int stbmv_thread_NLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int stbmv_thread_TUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int stbmv_thread_TUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int stbmv_thread_TLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int stbmv_thread_TLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); + +int dtbmv_thread_NUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dtbmv_thread_NUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dtbmv_thread_NLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dtbmv_thread_NLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dtbmv_thread_TUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dtbmv_thread_TUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dtbmv_thread_TLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dtbmv_thread_TLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); + +int qtbmv_thread_NUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qtbmv_thread_NUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qtbmv_thread_NLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qtbmv_thread_NLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qtbmv_thread_TUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qtbmv_thread_TUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qtbmv_thread_TLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qtbmv_thread_TLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); + +int ctbmv_thread_NUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_NUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_NLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_NLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_TUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_TUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_TLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_TLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_RUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_RUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_RLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_RLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_CUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_CUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_CLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_CLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); + +int ztbmv_thread_NUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_NUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_NLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_NLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_TUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_TUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_TLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_TLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_RUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_RUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_RLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_RLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_CUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_CUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_CLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_CLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); + +int xtbmv_thread_NUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_NUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_NLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_NLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_TUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_TUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_TLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_TLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_RUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_RUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_RLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_RLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_CUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_CUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_CLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_CLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); + +int stbsv_NUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int stbsv_NUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int stbsv_NLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int stbsv_NLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int stbsv_TUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int stbsv_TUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int stbsv_TLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int stbsv_TLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); + +int dtbsv_NUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtbsv_NUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtbsv_NLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtbsv_NLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtbsv_TUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtbsv_TUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtbsv_TLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtbsv_TLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); + +int qtbsv_NUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtbsv_NUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtbsv_NLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtbsv_NLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtbsv_TUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtbsv_TUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtbsv_TLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtbsv_TLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); + +int ctbsv_NUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_NUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_NLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_NLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_TUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_TUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_TLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_TLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_RUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_RUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_RLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_RLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_CUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_CUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_CLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_CLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); + +int ztbsv_NUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_NUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_NLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_NLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_TUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_TUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_TLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_TLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_RUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_RUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_RLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_RLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_CUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_CUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_CLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_CLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); + +int xtbsv_NUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_NUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_NLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_NLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_TUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_TUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_TLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_TLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_RUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_RUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_RLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_RLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_CUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_CUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_CLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_CLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); + +#ifdef __CUDACC__ +} +#endif + +#endif diff --git a/common_level3.h b/common_level3.h new file mode 100644 index 0000000..cbc67a6 --- /dev/null +++ b/common_level3.h @@ -0,0 +1,1739 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef ASSEMBLER + +#ifdef __CUDACC__ +__global__ void cuda_sgemm_kernel(int, int, int, float *, float *, float *); +__global__ void cuda_dgemm_kernel(int, int, int, double *, double *, double *); +#endif + +#ifdef __CUDACC__ +extern "C" { +#endif + +int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); +int dgemm_beta(BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); +int cgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); +int zgemm_beta(BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + +#ifdef EXPRECISION +int qgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); +int xgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); +#else +int qgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble *, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); +int xgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble *, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); +#endif + +int sgemm_incopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); +int sgemm_itcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); +int sgemm_oncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); +int sgemm_otcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); +int dgemm_incopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); +int dgemm_itcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); +int dgemm_oncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); +int dgemm_otcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); +int cgemm_incopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); +int cgemm_itcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); +int cgemm_oncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); +int cgemm_otcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); +int zgemm_incopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); +int zgemm_itcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); +int zgemm_oncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); +int zgemm_otcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); + +#ifdef QUAD_PRECISION +int qgemm_incopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xidouble *b); +int qgemm_itcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xidouble *b); +int qgemm_oncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xidouble *b); +int qgemm_otcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xidouble *b); +int xgemm_incopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xidouble *b); +int xgemm_itcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xidouble *b); +int xgemm_oncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xidouble *b); +int xgemm_otcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xidouble *b); +#else +int qgemm_incopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); +int qgemm_itcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); +int qgemm_oncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); +int qgemm_otcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); +int xgemm_incopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); +int xgemm_itcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); +int xgemm_oncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); +int xgemm_otcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); +#endif + + +int strsm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); +int strsm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); +int strsm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); +int strsm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); +int dtrsm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); +int dtrsm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); +int dtrsm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); +int dtrsm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); + +int qtrsm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int qtrsm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int qtrsm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int qtrsm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + +int ctrsm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); +int ctrsm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); +int ctrsm_kernel_LR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); +int ctrsm_kernel_LC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); +int ctrsm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); +int ctrsm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); +int ctrsm_kernel_RR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); +int ctrsm_kernel_RC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + +int ztrsm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); +int ztrsm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); +int ztrsm_kernel_LR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); +int ztrsm_kernel_LC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); +int ztrsm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); +int ztrsm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); +int ztrsm_kernel_RR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); +int ztrsm_kernel_RC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + +int xtrsm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int xtrsm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int xtrsm_kernel_LR(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int xtrsm_kernel_LC(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int xtrsm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int xtrsm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int xtrsm_kernel_RR(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int xtrsm_kernel_RC(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + +int strmm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); +int strmm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); +int strmm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); +int strmm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + +int dtrmm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); +int dtrmm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); +int dtrmm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); +int dtrmm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); + +int qtrmm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int qtrmm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int qtrmm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int qtrmm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + +int ctrmm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); +int ctrmm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); +int ctrmm_kernel_RR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); +int ctrmm_kernel_RC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); +int ctrmm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); +int ctrmm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); +int ctrmm_kernel_LR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); +int ctrmm_kernel_LC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + +int ztrmm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); +int ztrmm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); +int ztrmm_kernel_RR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); +int ztrmm_kernel_RC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); +int ztrmm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); +int ztrmm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); +int ztrmm_kernel_LR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); +int ztrmm_kernel_LC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + +int xtrmm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int xtrmm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int xtrmm_kernel_RR(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int xtrmm_kernel_RC(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int xtrmm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int xtrmm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int xtrmm_kernel_LR(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int xtrmm_kernel_LC(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + +int strmm_iunucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_iunncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_iutucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_iutncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_ounucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_ounncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_outucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_outncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_ilnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_ilnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_iltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_iltncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_olnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_olnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_oltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_oltncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); + +int dtrmm_iunucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_iunncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_iutucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_iutncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_ounucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_ounncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_outucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_outncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_ilnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_ilnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_iltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_iltncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_olnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_olnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_oltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_oltncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); + +int qtrmm_iunucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_iunncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_iutucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_iutncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_ounucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_ounncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_outucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_outncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_ilnucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_ilnncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_iltucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_iltncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_olnucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_olnncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_oltucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_oltncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); + +int ctrmm_iunucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_iunncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_iutucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_iutncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_ounucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_ounncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_outucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_outncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_ilnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_ilnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_iltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_iltncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_olnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_olnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_oltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_oltncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); + +int ztrmm_iunucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_iunncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_iutucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_iutncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_ounucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_ounncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_outucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_outncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_ilnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_ilnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_iltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_iltncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_olnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_olnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_oltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_oltncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); + +int xtrmm_iunucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_iunncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_iutucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_iutncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_ounucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_ounncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_outucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_outncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_ilnucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_ilnncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_iltucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_iltncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_olnucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_olnncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_oltucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_oltncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); + +int strsm_iunucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_iunncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_iutucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_iutncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_ounucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_ounncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_outucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_outncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_ilnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_ilnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_iltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_iltncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_olnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_olnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_oltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_oltncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); + +int dtrsm_iunucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_iunncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_iutucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_iutncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_ounucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_ounncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_outucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_outncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_ilnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_ilnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_iltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_iltncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_olnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_olnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_oltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_oltncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); + +int qtrsm_iunucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_iunncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_iutucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_iutncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_ounucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_ounncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_outucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_outncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_ilnucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_ilnncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_iltucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_iltncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_olnucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_olnncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_oltucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_oltncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); + +int ctrsm_iunucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_iunncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_iutucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_iutncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_ounucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_ounncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_outucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_outncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_ilnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_ilnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_iltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_iltncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_olnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_olnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_oltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_oltncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); + +int ztrsm_iunucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_iunncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_iutucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_iutncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_ounucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_ounncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_outucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_outncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_ilnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_ilnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_iltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_iltncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_olnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_olnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_oltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_oltncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); + +int xtrsm_iunucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_iunncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_iutucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_iutncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_ounucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_ounncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_outucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_outncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_ilnucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_ilnncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_iltucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_iltncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_olnucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_olnncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_oltucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_oltncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); + +int ssymm_iutcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ssymm_outcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ssymm_iltcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ssymm_oltcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int dsymm_iutcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dsymm_outcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dsymm_iltcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dsymm_oltcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int qsymm_iutcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qsymm_outcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qsymm_iltcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qsymm_oltcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int csymm_iutcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int csymm_outcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int csymm_iltcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int csymm_oltcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int zsymm_iutcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int zsymm_outcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int zsymm_iltcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int zsymm_oltcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int xsymm_iutcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xsymm_outcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xsymm_iltcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xsymm_oltcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); + +int chemm_iutcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int chemm_outcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int chemm_iltcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int chemm_oltcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int zhemm_iutcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int zhemm_outcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int zhemm_iltcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int zhemm_oltcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int xhemm_iutcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xhemm_outcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xhemm_iltcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xhemm_oltcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); + +int ssyrk_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset); +int ssyrk_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset); + +int dsyrk_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset); +int dsyrk_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset); + +int qsyrk_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset); +int qsyrk_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset); + +int csyrk_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float alpha_i, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset); +int csyrk_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float alpha_i, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset); +int zsyrk_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double alpha_i, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset); +int zsyrk_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double alpha_i, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset); +int xsyrk_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset); +int xsyrk_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset); + +int ssyr2k_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset, int flag); +int ssyr2k_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset, int flag); +int dsyr2k_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset, int flag); +int dsyr2k_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset, int flag); +int qsyr2k_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); +int qsyr2k_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); + +int csyr2k_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float alpha_i, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset, int flag); +int csyr2k_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float alpha_i, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset, int flag); +int zsyr2k_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double alpha_i, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset, int flag); +int zsyr2k_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double alpha_i, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset, int flag); +int xsyr2k_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); +int xsyr2k_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); + +int cherk_kernel_UN(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset); +int cherk_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset); +int cherk_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset); +int cherk_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset); + +int zherk_kernel_UN(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset); +int zherk_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset); +int zherk_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset); +int zherk_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset); + +int xherk_kernel_UN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset); +int xherk_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset); +int xherk_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset); +int xherk_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset); + +int cher2k_kernel_UN(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float alpha_i, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset, int flag); +int cher2k_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float alpha_i, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset, int flag); +int cher2k_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float alpha_i, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset, int flag); +int cher2k_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float alpha_i, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset, int flag); + +int zher2k_kernel_UN(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double alpha_i, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset, int flag); +int zher2k_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double alpha_i, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset, int flag); +int zher2k_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double alpha_i, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset, int flag); +int zher2k_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double alpha_i, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset, int flag); + +int xher2k_kernel_UN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); +int xher2k_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); +int xher2k_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); +int xher2k_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); + +int sgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); +int dgemm_kernel(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); + +#ifdef QUAD_PRECISION +int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xidouble *, xidouble *, xidouble *, xdouble *, BLASLONG); +#else +int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); +#endif + +int cgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); +int cgemm_kernel_l(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); +int cgemm_kernel_r(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); +int cgemm_kernel_b(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); + +int zgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); +int zgemm_kernel_l(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); +int zgemm_kernel_r(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); +int zgemm_kernel_b(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); + +int xgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); +int xgemm_kernel_l(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); +int xgemm_kernel_r(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); +int xgemm_kernel_b(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); + +int cgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); +int zgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); +int xgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); + +int sgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int sgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int sgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int sgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int dgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +#ifdef QUAD_PRECISION +int qgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int qgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int qgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int qgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +#else +int qgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +#endif + +int cgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_nr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_nc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_tr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_tc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_rn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_rt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_rr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_rc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_cn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_ct(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_cr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_cc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_nr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_nc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_tr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_tc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_rn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_rt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_rr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_rc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_cn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_ct(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_cr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_cc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +#ifdef QUAD_PRECISION +int xgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_nr(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_nc(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_tr(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_tc(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_rn(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_rt(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_rr(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_rc(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_cn(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_ct(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +#else +int xgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_nr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_nc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_tr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_tc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_rn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_rt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_rr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_rc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_cn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_ct(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +#endif + +int sgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int sgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int sgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int sgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int dgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +#ifdef QUAD_PRECISION +int qgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int qgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int qgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int qgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +#else +int qgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +#endif + +int cgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_nr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_nc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_tr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_tc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_rn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_rt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_rr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_rc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_cn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_ct(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_cr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_cc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_nr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_nc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_tr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_tc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_rn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_rt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_rr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_rc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_cn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_ct(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_cr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_cc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_nr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_nc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_tr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_tc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_rn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_rt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_rr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_rc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_cn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_ct(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int cgemm3m_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_nr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_nc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_tt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_tr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_tc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_rn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_rt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_rr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_rc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_cn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_ct(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_cr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_cc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int zgemm3m_nn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_nt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_nr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_nc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_tn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_tt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_tr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_tc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_rn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_rt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_rr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_rc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_cn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_ct(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_cr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_cc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int xgemm3m_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_nr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_nc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_tr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_tc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_rn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_rt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_rr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_rc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_cn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_ct(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int cgemm3m_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_nr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_nc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_tr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_tc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_rn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_rt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_rr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_rc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_cn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_ct(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_cr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_cc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zgemm3m_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_nr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_nc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_tr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_tc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_rn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_rt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_rr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_rc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_cn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_ct(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_cr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_cc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xgemm3m_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_nr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_nc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_tr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_tc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_rn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_rt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_rr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_rc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_cn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_ct(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int cher2m_LNN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LNT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LNR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LNC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LTN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LTT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LTR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LTC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LRN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LRT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LRR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LRC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LCN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LCT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LCR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LCC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_UNN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_UNT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_UNR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_UNC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_UTN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_UTT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_UTR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_UTC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_URN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_URT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_URR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_URC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_UCN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_UCT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_UCR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_UCC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); + +int zher2m_LNN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LNT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LNR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LNC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LTN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LTT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LTR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LTC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LRN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LRT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LRR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LRC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LCN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LCT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LCR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LCC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_UNN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_UNT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_UNR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_UNC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_UTN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_UTT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_UTR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_UTC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_URN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_URT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_URR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_URC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_UCN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_UCT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_UCR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_UCC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); + +int strsm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int dtrsm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int qtrsm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int ctrsm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LRUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LRUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LRLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LRLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LCUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LCUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LCLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LCLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RRUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RRUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RRLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RRLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RCUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RCUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RCLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RCLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int ztrsm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LRUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LRUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LRLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LRLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LCUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LCUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LCLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LCLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RRUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RRUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RRLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RRLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RCUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RCUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RCLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RCLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xtrsm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LRUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LRUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LRLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LRLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LCUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LCUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LCLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LCLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RRUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RRUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RRLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RRLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RCUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RCUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RCLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RCLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int strmm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int dtrmm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int qtrmm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int ctrmm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LRUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LRUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LRLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LRLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LCUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LCUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LCLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LCLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RRUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RRUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RRLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RRLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RCUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RCUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RCLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RCLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int ztrmm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LRUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LRUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LRLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LRLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LCUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LCUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LCLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LCLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RRUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RRUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RRLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RRLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RCUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RCUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RCLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RCLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xtrmm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LRUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LRUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LRLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LRLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LCUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LCUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LCLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LCLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RRUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RRUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RRLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RRLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RCUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RCUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RCLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RCLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int ssymm_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssymm_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssymm_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssymm_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int dsymm_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsymm_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsymm_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsymm_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int qsymm_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsymm_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsymm_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsymm_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int csymm_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csymm_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csymm_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csymm_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zsymm_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsymm_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsymm_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsymm_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xsymm_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsymm_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsymm_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsymm_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int csymm3m_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csymm3m_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csymm3m_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csymm3m_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zsymm3m_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsymm3m_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsymm3m_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsymm3m_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xsymm3m_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsymm3m_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsymm3m_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsymm3m_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int csymm3m_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csymm3m_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csymm3m_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csymm3m_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zsymm3m_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsymm3m_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsymm3m_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsymm3m_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xsymm3m_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsymm3m_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsymm3m_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsymm3m_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int chemm_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int chemm_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int chemm_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int chemm_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zhemm_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zhemm_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zhemm_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zhemm_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xhemm_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xhemm_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xhemm_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xhemm_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int chemm3m_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int chemm3m_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int chemm3m_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int chemm3m_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zhemm3m_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zhemm3m_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zhemm3m_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zhemm3m_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xhemm3m_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xhemm3m_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xhemm3m_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xhemm3m_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int chemm3m_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int chemm3m_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int chemm3m_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int chemm3m_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zhemm3m_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zhemm3m_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zhemm3m_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zhemm3m_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xhemm3m_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xhemm3m_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xhemm3m_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xhemm3m_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int ssymm_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssymm_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssymm_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssymm_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int dsymm_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsymm_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsymm_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsymm_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int qsymm_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsymm_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsymm_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsymm_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int csymm_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csymm_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csymm_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csymm_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zsymm_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsymm_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsymm_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsymm_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xsymm_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsymm_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsymm_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsymm_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int chemm_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int chemm_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int chemm_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int chemm_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zhemm_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zhemm_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zhemm_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zhemm_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xhemm_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xhemm_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xhemm_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xhemm_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int ssyrk_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssyrk_UT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssyrk_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssyrk_LT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int dsyrk_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsyrk_UT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsyrk_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsyrk_LT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int qsyrk_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsyrk_UT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsyrk_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsyrk_LT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int csyrk_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csyrk_UT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csyrk_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csyrk_LT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zsyrk_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsyrk_UT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsyrk_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsyrk_LT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xsyrk_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsyrk_UT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsyrk_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsyrk_LT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int ssyrk_thread_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssyrk_thread_UT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssyrk_thread_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssyrk_thread_LT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int dsyrk_thread_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsyrk_thread_UT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsyrk_thread_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsyrk_thread_LT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int qsyrk_thread_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsyrk_thread_UT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsyrk_thread_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsyrk_thread_LT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int csyrk_thread_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csyrk_thread_UT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csyrk_thread_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csyrk_thread_LT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zsyrk_thread_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsyrk_thread_UT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsyrk_thread_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsyrk_thread_LT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xsyrk_thread_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsyrk_thread_UT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsyrk_thread_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsyrk_thread_LT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int ssyr2k_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssyr2k_UT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssyr2k_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssyr2k_LT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int dsyr2k_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsyr2k_UT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsyr2k_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsyr2k_LT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int qsyr2k_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsyr2k_UT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsyr2k_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsyr2k_LT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int csyr2k_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csyr2k_UT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csyr2k_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csyr2k_LT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zsyr2k_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsyr2k_UT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsyr2k_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsyr2k_LT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xsyr2k_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsyr2k_UT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsyr2k_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsyr2k_LT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int cherk_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cherk_UC(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cherk_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cherk_LC(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zherk_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zherk_UC(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zherk_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zherk_LC(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xherk_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xherk_UC(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xherk_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xherk_LC(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int cherk_thread_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cherk_thread_UC(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cherk_thread_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cherk_thread_LC(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zherk_thread_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zherk_thread_UC(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zherk_thread_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zherk_thread_LC(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xherk_thread_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xherk_thread_UC(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xherk_thread_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xherk_thread_LC(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int cher2k_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cher2k_UC(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cher2k_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cher2k_LC(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zher2k_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zher2k_UC(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zher2k_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zher2k_LC(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xher2k_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xher2k_UC(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xher2k_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xher2k_LC(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int sgemt_n(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, int); +int sgemt_t(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, int); +int dgemt_n(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, int); +int dgemt_t(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, int); + +int cgemt_n(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, int); +int cgemt_t(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, int); +int cgemt_r(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, int); +int cgemt_c(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, int); +int zgemt_n(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, int); +int zgemt_t(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, int); +int zgemt_r(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, int); +int zgemt_c(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, int); + +int sgema_n(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG); +int sgema_t(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG); +int dgema_n(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG); +int dgema_t(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG); + +int cgema_n(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); +int cgema_t(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); +int cgema_r(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); +int cgema_c(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); +int zgema_n(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); +int zgema_t(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); +int zgema_r(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); +int zgema_c(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); + +int cgemm3m_incopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); +int cgemm3m_incopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); +int cgemm3m_incopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); +int cgemm3m_itcopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); +int cgemm3m_itcopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); +int cgemm3m_itcopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); + +int cgemm3m_oncopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float alpha_r, float alpha_i, float *b); +int cgemm3m_oncopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float alpha_r, float alpha_i, float *b); +int cgemm3m_oncopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float alpha_r, float alpha_i, float *b); +int cgemm3m_otcopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float alpha_r, float alpha_i, float *b); +int cgemm3m_otcopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float alpha_r, float alpha_i, float *b); +int cgemm3m_otcopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float alpha_r, float alpha_i, float *b); + +int zgemm3m_incopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); +int zgemm3m_incopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); +int zgemm3m_incopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); +int zgemm3m_itcopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); +int zgemm3m_itcopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); +int zgemm3m_itcopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); + +int zgemm3m_oncopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double alpha_r, double alpha_i, double *b); +int zgemm3m_oncopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double alpha_r, double alpha_i, double *b); +int zgemm3m_oncopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double alpha_r, double alpha_i, double *b); +int zgemm3m_otcopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double alpha_r, double alpha_i, double *b); +int zgemm3m_otcopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double alpha_r, double alpha_i, double *b); +int zgemm3m_otcopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double alpha_r, double alpha_i, double *b); + +int xgemm3m_incopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); +int xgemm3m_incopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); +int xgemm3m_incopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); +int xgemm3m_itcopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); +int xgemm3m_itcopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); +int xgemm3m_itcopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); + +int xgemm3m_oncopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xgemm3m_oncopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xgemm3m_oncopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xgemm3m_otcopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xgemm3m_otcopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xgemm3m_otcopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble alpha_r, xdouble alpha_i, xdouble *b); + +int csymm3m_iucopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); +int csymm3m_ilcopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); +int csymm3m_iucopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); +int csymm3m_ilcopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); +int csymm3m_iucopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); +int csymm3m_ilcopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); + +int csymm3m_oucopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); +int csymm3m_olcopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); +int csymm3m_oucopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); +int csymm3m_olcopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); +int csymm3m_oucopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); +int csymm3m_olcopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); + +int zsymm3m_iucopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); +int zsymm3m_ilcopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); +int zsymm3m_iucopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); +int zsymm3m_ilcopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); +int zsymm3m_iucopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); +int zsymm3m_ilcopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); + +int zsymm3m_oucopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); +int zsymm3m_olcopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); +int zsymm3m_oucopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); +int zsymm3m_olcopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); +int zsymm3m_oucopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); +int zsymm3m_olcopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); + +int xsymm3m_iucopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); +int xsymm3m_ilcopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); +int xsymm3m_iucopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); +int xsymm3m_ilcopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); +int xsymm3m_iucopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); +int xsymm3m_ilcopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); + +int xsymm3m_oucopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xsymm3m_olcopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xsymm3m_oucopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xsymm3m_olcopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xsymm3m_oucopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xsymm3m_olcopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); + +int chemm3m_iucopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); +int chemm3m_ilcopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); +int chemm3m_iucopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); +int chemm3m_ilcopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); +int chemm3m_iucopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); +int chemm3m_ilcopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); + +int chemm3m_oucopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); +int chemm3m_olcopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); +int chemm3m_oucopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); +int chemm3m_olcopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); +int chemm3m_oucopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); +int chemm3m_olcopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); + +int zhemm3m_iucopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); +int zhemm3m_ilcopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); +int zhemm3m_iucopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); +int zhemm3m_ilcopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); +int zhemm3m_iucopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); +int zhemm3m_ilcopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); + +int zhemm3m_oucopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); +int zhemm3m_olcopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); +int zhemm3m_oucopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); +int zhemm3m_olcopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); +int zhemm3m_oucopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); +int zhemm3m_olcopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); + +int xhemm3m_iucopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); +int xhemm3m_ilcopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); +int xhemm3m_iucopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); +int xhemm3m_ilcopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); +int xhemm3m_iucopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); +int xhemm3m_ilcopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); + +int xhemm3m_oucopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xhemm3m_olcopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xhemm3m_oucopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xhemm3m_olcopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xhemm3m_oucopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xhemm3m_olcopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); + +int sgemc_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int sgemc_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int sgemc_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int sgemc_tt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int dgemc_nn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dgemc_nt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dgemc_tn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dgemc_tt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int qgemc_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qgemc_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qgemc_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qgemc_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int cgemc_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_nr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_nc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_tt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_tr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_tc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_rn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_rt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_rr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_rc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_cn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_ct(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_cr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_cc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zgemc_nn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_nt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_nr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_nc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_tn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_tt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_tr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_tc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_rn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_rt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_rr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_rc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_cn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_ct(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_cr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_cc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xgemc_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_nr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_nc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_tr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_tc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_rn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_rt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_rr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_rc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_cn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_ct(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int sgemc_oncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b, BLASLONG ldb, float *c); +int sgemc_otcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b, BLASLONG ldb, float *c); +int dgemc_oncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b, BLASLONG ldb, double *c); +int dgemc_otcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b, BLASLONG ldb, double *c); +int qgemc_oncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b, BLASLONG ldb, xdouble *c); +int qgemc_otcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b, BLASLONG ldb, xdouble *c); +int cgemc_oncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b, BLASLONG ldb, float *c); +int cgemc_otcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b, BLASLONG ldb, float *c); +int zgemc_oncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b, BLASLONG ldb, double *c); +int zgemc_otcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b, BLASLONG ldb, double *c); +int xgemc_oncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b, BLASLONG ldb, xdouble *c); +int xgemc_otcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b, BLASLONG ldb, xdouble *c); + +#ifdef __CUDACC__ +} +#endif + +#endif diff --git a/common_linux.h b/common_linux.h new file mode 100644 index 0000000..d18cd2b --- /dev/null +++ b/common_linux.h @@ -0,0 +1,83 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_LINUX_H +#define COMMON_LINUX_H + +#ifndef ASSEMBLER + +#include + +extern long int syscall (long int __sysno, ...); + +#ifndef MPOL_PREFERRED +#define MPOL_PREFERRED 1 +#endif + +#ifndef MPOL_INTERLEAVE +#define MPOL_INTERLEAVE 3 +#endif + +#if defined(ARCH_IA64) && defined(__ECC) +#ifndef __NR_mbind +#define __NR_mbind 1259 +#endif +#ifndef __NR_get_mempolicy +#define __NR_get_mempolicy 1260 +#endif +#ifndef __NR_set_mempolicy +#define __NR_set_mempolicy 1261 +#endif +#endif + +static inline int my_mbind(void *addr, unsigned long len, int mode, + unsigned long *nodemask, unsigned long maxnode, + unsigned flags) { + + return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); +} + +static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { + + return syscall(SYS_set_mempolicy, mode, addr, flag); +} + +static inline int my_gettid(void) { return syscall(SYS_gettid); } + +#endif +#endif diff --git a/common_macro.h b/common_macro.h new file mode 100644 index 0000000..bcaa9f3 --- /dev/null +++ b/common_macro.h @@ -0,0 +1,2734 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_MACRO +#define COMMON_MACRO + +#include "common_s.h" +#include "common_d.h" +#include "common_q.h" + +#include "common_c.h" +#include "common_z.h" +#include "common_x.h" + +#ifndef COMPLEX +#ifdef XDOUBLE + +#define AMAX_K QAMAX_K +#define AMIN_K QAMIN_K +#define MAX_K QMAX_K +#define MIN_K QMIN_K +#define IAMAX_K IQAMAX_K +#define IAMIN_K IQAMIN_K +#define IMAX_K IQMAX_K +#define IMIN_K IQMIN_K +#define ASUM_K QASUM_K +#define AXPYU_K QAXPYU_K +#define AXPYC_K QAXPYC_K +#define COPY_K QCOPY_K +#define DOTU_K QDOTU_K +#define DOTC_K QDOTC_K +#define NRM2_K QNRM2_K +#define SCAL_K QSCAL_K +#define SWAP_K QSWAP_K +#define ROT_K QROT_K + +#define GEMV_N QGEMV_N +#define GEMV_T QGEMV_T +#define GEMV_R QGEMV_R +#define GEMV_C QGEMV_C +#define GEMV_O QGEMV_O +#define GEMV_U QGEMV_U +#define GEMV_S QGEMV_S +#define GEMV_D QGEMV_D + +#define GERU_K QGERU_K +#define GERC_K QGERC_K +#define GERV_K QGERV_K +#define GERD_K QGERD_K + +#define SYMV_U QSYMV_U +#define SYMV_L QSYMV_L + +#define SYMV_THREAD_U QSYMV_THREAD_U +#define SYMV_THREAD_L QSYMV_THREAD_L + +#define GEMM_ONCOPY QGEMM_ONCOPY +#define GEMM_OTCOPY QGEMM_OTCOPY +#define GEMM_INCOPY QGEMM_INCOPY +#define GEMM_ITCOPY QGEMM_ITCOPY + +#ifdef UNIT + +#define TRMM_OUNCOPY QTRMM_OUNUCOPY +#define TRMM_OUTCOPY QTRMM_OUTUCOPY +#define TRMM_OLNCOPY QTRMM_OLNUCOPY +#define TRMM_OLTCOPY QTRMM_OLTUCOPY +#define TRSM_OUNCOPY QTRSM_OUNUCOPY +#define TRSM_OUTCOPY QTRSM_OUTUCOPY +#define TRSM_OLNCOPY QTRSM_OLNUCOPY +#define TRSM_OLTCOPY QTRSM_OLTUCOPY + +#define TRMM_IUNCOPY QTRMM_IUNUCOPY +#define TRMM_IUTCOPY QTRMM_IUTUCOPY +#define TRMM_ILNCOPY QTRMM_ILNUCOPY +#define TRMM_ILTCOPY QTRMM_ILTUCOPY +#define TRSM_IUNCOPY QTRSM_IUNUCOPY +#define TRSM_IUTCOPY QTRSM_IUTUCOPY +#define TRSM_ILNCOPY QTRSM_ILNUCOPY +#define TRSM_ILTCOPY QTRSM_ILTUCOPY + +#else + +#define TRMM_OUNCOPY QTRMM_OUNNCOPY +#define TRMM_OUTCOPY QTRMM_OUTNCOPY +#define TRMM_OLNCOPY QTRMM_OLNNCOPY +#define TRMM_OLTCOPY QTRMM_OLTNCOPY +#define TRSM_OUNCOPY QTRSM_OUNNCOPY +#define TRSM_OUTCOPY QTRSM_OUTNCOPY +#define TRSM_OLNCOPY QTRSM_OLNNCOPY +#define TRSM_OLTCOPY QTRSM_OLTNCOPY + +#define TRMM_IUNCOPY QTRMM_IUNNCOPY +#define TRMM_IUTCOPY QTRMM_IUTNCOPY +#define TRMM_ILNCOPY QTRMM_ILNNCOPY +#define TRMM_ILTCOPY QTRMM_ILTNCOPY +#define TRSM_IUNCOPY QTRSM_IUNNCOPY +#define TRSM_IUTCOPY QTRSM_IUTNCOPY +#define TRSM_ILNCOPY QTRSM_ILNNCOPY +#define TRSM_ILTCOPY QTRSM_ILTNCOPY + +#endif + +#define GEMM_BETA QGEMM_BETA + +#define GEMM_KERNEL_N QGEMM_KERNEL +#define GEMM_KERNEL_L QGEMM_KERNEL +#define GEMM_KERNEL_R QGEMM_KERNEL +#define GEMM_KERNEL_B QGEMM_KERNEL + +#define TRMM_KERNEL_LN QTRMM_KERNEL_LN +#define TRMM_KERNEL_LT QTRMM_KERNEL_LT +#define TRMM_KERNEL_LR QTRMM_KERNEL_LN +#define TRMM_KERNEL_LC QTRMM_KERNEL_LT +#define TRMM_KERNEL_RN QTRMM_KERNEL_RN +#define TRMM_KERNEL_RT QTRMM_KERNEL_RT +#define TRMM_KERNEL_RR QTRMM_KERNEL_RN +#define TRMM_KERNEL_RC QTRMM_KERNEL_RT + +#define TRSM_KERNEL_LN QTRSM_KERNEL_LN +#define TRSM_KERNEL_LT QTRSM_KERNEL_LT +#define TRSM_KERNEL_LR QTRSM_KERNEL_LN +#define TRSM_KERNEL_LC QTRSM_KERNEL_LT +#define TRSM_KERNEL_RN QTRSM_KERNEL_RN +#define TRSM_KERNEL_RT QTRSM_KERNEL_RT +#define TRSM_KERNEL_RR QTRSM_KERNEL_RN +#define TRSM_KERNEL_RC QTRSM_KERNEL_RT + +#define SYMM_IUTCOPY QSYMM_IUTCOPY +#define SYMM_ILTCOPY QSYMM_ILTCOPY +#define SYMM_OUTCOPY QSYMM_OUTCOPY +#define SYMM_OLTCOPY QSYMM_OLTCOPY + +#define GEMM_NN QGEMM_NN +#define GEMM_CN QGEMM_TN +#define GEMM_TN QGEMM_TN +#define GEMM_NC QGEMM_NT +#define GEMM_NT QGEMM_NT +#define GEMM_CC QGEMM_TT +#define GEMM_CT QGEMM_TT +#define GEMM_TC QGEMM_TT +#define GEMM_TT QGEMM_TT +#define GEMM_NR QGEMM_NN +#define GEMM_TR QGEMM_TN +#define GEMM_CR QGEMM_TN +#define GEMM_RN QGEMM_NN +#define GEMM_RT QGEMM_NT +#define GEMM_RC QGEMM_NT +#define GEMM_RR QGEMM_NN + +#define SYMM_LU QSYMM_LU +#define SYMM_LL QSYMM_LL +#define SYMM_RU QSYMM_RU +#define SYMM_RL QSYMM_RL + +#define HEMM_LU QHEMM_LU +#define HEMM_LL QHEMM_LL +#define HEMM_RU QHEMM_RU +#define HEMM_RL QHEMM_RL + +#define SYRK_UN QSYRK_UN +#define SYRK_UT QSYRK_UT +#define SYRK_LN QSYRK_LN +#define SYRK_LT QSYRK_LT +#define SYRK_UR QSYRK_UN +#define SYRK_UC QSYRK_UT +#define SYRK_LR QSYRK_LN +#define SYRK_LC QSYRK_LT + +#define SYRK_KERNEL_U QSYRK_KERNEL_U +#define SYRK_KERNEL_L QSYRK_KERNEL_L + +#define HERK_UN QSYRK_UN +#define HERK_LN QSYRK_LN +#define HERK_UC QSYRK_UT +#define HERK_LC QSYRK_LT + +#define HER2K_UN QSYR2K_UN +#define HER2K_LN QSYR2K_LN +#define HER2K_UC QSYR2K_UT +#define HER2K_LC QSYR2K_LT + +#define SYR2K_UN QSYR2K_UN +#define SYR2K_UT QSYR2K_UT +#define SYR2K_LN QSYR2K_LN +#define SYR2K_LT QSYR2K_LT +#define SYR2K_UR QSYR2K_UN +#define SYR2K_UC QSYR2K_UT +#define SYR2K_LR QSYR2K_LN +#define SYR2K_LC QSYR2K_LT + +#define SYR2K_KERNEL_U QSYR2K_KERNEL_U +#define SYR2K_KERNEL_L QSYR2K_KERNEL_L + +#define TRMM_LNUU QTRMM_LNUU +#define TRMM_LNUN QTRMM_LNUN +#define TRMM_LNLU QTRMM_LNLU +#define TRMM_LNLN QTRMM_LNLN +#define TRMM_LTUU QTRMM_LTUU +#define TRMM_LTUN QTRMM_LTUN +#define TRMM_LTLU QTRMM_LTLU +#define TRMM_LTLN QTRMM_LTLN +#define TRMM_LRUU QTRMM_LNUU +#define TRMM_LRUN QTRMM_LNUN +#define TRMM_LRLU QTRMM_LNLU +#define TRMM_LRLN QTRMM_LNLN +#define TRMM_LCUU QTRMM_LTUU +#define TRMM_LCUN QTRMM_LTUN +#define TRMM_LCLU QTRMM_LTLU +#define TRMM_LCLN QTRMM_LTLN +#define TRMM_RNUU QTRMM_RNUU +#define TRMM_RNUN QTRMM_RNUN +#define TRMM_RNLU QTRMM_RNLU +#define TRMM_RNLN QTRMM_RNLN +#define TRMM_RTUU QTRMM_RTUU +#define TRMM_RTUN QTRMM_RTUN +#define TRMM_RTLU QTRMM_RTLU +#define TRMM_RTLN QTRMM_RTLN +#define TRMM_RRUU QTRMM_RNUU +#define TRMM_RRUN QTRMM_RNUN +#define TRMM_RRLU QTRMM_RNLU +#define TRMM_RRLN QTRMM_RNLN +#define TRMM_RCUU QTRMM_RTUU +#define TRMM_RCUN QTRMM_RTUN +#define TRMM_RCLU QTRMM_RTLU +#define TRMM_RCLN QTRMM_RTLN + +#define TRSM_LNUU QTRSM_LNUU +#define TRSM_LNUN QTRSM_LNUN +#define TRSM_LNLU QTRSM_LNLU +#define TRSM_LNLN QTRSM_LNLN +#define TRSM_LTUU QTRSM_LTUU +#define TRSM_LTUN QTRSM_LTUN +#define TRSM_LTLU QTRSM_LTLU +#define TRSM_LTLN QTRSM_LTLN +#define TRSM_LRUU QTRSM_LNUU +#define TRSM_LRUN QTRSM_LNUN +#define TRSM_LRLU QTRSM_LNLU +#define TRSM_LRLN QTRSM_LNLN +#define TRSM_LCUU QTRSM_LTUU +#define TRSM_LCUN QTRSM_LTUN +#define TRSM_LCLU QTRSM_LTLU +#define TRSM_LCLN QTRSM_LTLN +#define TRSM_RNUU QTRSM_RNUU +#define TRSM_RNUN QTRSM_RNUN +#define TRSM_RNLU QTRSM_RNLU +#define TRSM_RNLN QTRSM_RNLN +#define TRSM_RTUU QTRSM_RTUU +#define TRSM_RTUN QTRSM_RTUN +#define TRSM_RTLU QTRSM_RTLU +#define TRSM_RTLN QTRSM_RTLN +#define TRSM_RRUU QTRSM_RNUU +#define TRSM_RRUN QTRSM_RNUN +#define TRSM_RRLU QTRSM_RNLU +#define TRSM_RRLN QTRSM_RNLN +#define TRSM_RCUU QTRSM_RTUU +#define TRSM_RCUN QTRSM_RTUN +#define TRSM_RCLU QTRSM_RTLU +#define TRSM_RCLN QTRSM_RTLN + +#define GEMM_THREAD_NN QGEMM_THREAD_NN +#define GEMM_THREAD_CN QGEMM_THREAD_TN +#define GEMM_THREAD_TN QGEMM_THREAD_TN +#define GEMM_THREAD_NC QGEMM_THREAD_NT +#define GEMM_THREAD_NT QGEMM_THREAD_NT +#define GEMM_THREAD_CC QGEMM_THREAD_TT +#define GEMM_THREAD_CT QGEMM_THREAD_TT +#define GEMM_THREAD_TC QGEMM_THREAD_TT +#define GEMM_THREAD_TT QGEMM_THREAD_TT +#define GEMM_THREAD_NR QGEMM_THREAD_NN +#define GEMM_THREAD_TR QGEMM_THREAD_TN +#define GEMM_THREAD_CR QGEMM_THREAD_TN +#define GEMM_THREAD_RN QGEMM_THREAD_NN +#define GEMM_THREAD_RT QGEMM_THREAD_NT +#define GEMM_THREAD_RC QGEMM_THREAD_NT +#define GEMM_THREAD_RR QGEMM_THREAD_NN + +#define SYMM_THREAD_LU QSYMM_THREAD_LU +#define SYMM_THREAD_LL QSYMM_THREAD_LL +#define SYMM_THREAD_RU QSYMM_THREAD_RU +#define SYMM_THREAD_RL QSYMM_THREAD_RL + +#define HEMM_THREAD_LU QHEMM_THREAD_LU +#define HEMM_THREAD_LL QHEMM_THREAD_LL +#define HEMM_THREAD_RU QHEMM_THREAD_RU +#define HEMM_THREAD_RL QHEMM_THREAD_RL + +#define SYRK_THREAD_UN QSYRK_THREAD_UN +#define SYRK_THREAD_UT QSYRK_THREAD_UT +#define SYRK_THREAD_LN QSYRK_THREAD_LN +#define SYRK_THREAD_LT QSYRK_THREAD_LT +#define SYRK_THREAD_UR QSYRK_THREAD_UR +#define SYRK_THREAD_UC QSYRK_THREAD_UC +#define SYRK_THREAD_LR QSYRK_THREAD_LN +#define SYRK_THREAD_LC QSYRK_THREAD_LT + +#define HERK_THREAD_UN QSYRK_THREAD_UN +#define HERK_THREAD_UT QSYRK_THREAD_UT +#define HERK_THREAD_LN QSYRK_THREAD_LN +#define HERK_THREAD_LT QSYRK_THREAD_LT +#define HERK_THREAD_UR QSYRK_THREAD_UR +#define HERK_THREAD_UC QSYRK_THREAD_UC +#define HERK_THREAD_LR QSYRK_THREAD_LN +#define HERK_THREAD_LC QSYRK_THREAD_LT + +#elif defined(DOUBLE) + +#define AMAX_K DAMAX_K +#define AMIN_K DAMIN_K +#define MAX_K DMAX_K +#define MIN_K DMIN_K +#define IAMAX_K IDAMAX_K +#define IAMIN_K IDAMIN_K +#define IMAX_K IDMAX_K +#define IMIN_K IDMIN_K +#define ASUM_K DASUM_K +#define AXPYU_K DAXPYU_K +#define AXPYC_K DAXPYC_K +#define COPY_K DCOPY_K +#define DOTU_K DDOTU_K +#define DOTC_K DDOTC_K +#define NRM2_K DNRM2_K +#define SCAL_K DSCAL_K +#define SWAP_K DSWAP_K +#define ROT_K DROT_K + +#define GEMV_N DGEMV_N +#define GEMV_T DGEMV_T +#define GEMV_R DGEMV_R +#define GEMV_C DGEMV_C +#define GEMV_O DGEMV_O +#define GEMV_U DGEMV_U +#define GEMV_S DGEMV_S +#define GEMV_D DGEMV_D + +#define GERU_K DGERU_K +#define GERC_K DGERC_K +#define GERV_K DGERV_K +#define GERD_K DGERD_K + +#define SYMV_U DSYMV_U +#define SYMV_L DSYMV_L + +#define SYMV_THREAD_U DSYMV_THREAD_U +#define SYMV_THREAD_L DSYMV_THREAD_L + +#define GEMM_ONCOPY DGEMM_ONCOPY +#define GEMM_OTCOPY DGEMM_OTCOPY +#define GEMM_INCOPY DGEMM_INCOPY +#define GEMM_ITCOPY DGEMM_ITCOPY + +#ifdef UNIT + +#define TRMM_OUNCOPY DTRMM_OUNUCOPY +#define TRMM_OUTCOPY DTRMM_OUTUCOPY +#define TRMM_OLNCOPY DTRMM_OLNUCOPY +#define TRMM_OLTCOPY DTRMM_OLTUCOPY +#define TRSM_OUNCOPY DTRSM_OUNUCOPY +#define TRSM_OUTCOPY DTRSM_OUTUCOPY +#define TRSM_OLNCOPY DTRSM_OLNUCOPY +#define TRSM_OLTCOPY DTRSM_OLTUCOPY + +#define TRMM_IUNCOPY DTRMM_IUNUCOPY +#define TRMM_IUTCOPY DTRMM_IUTUCOPY +#define TRMM_ILNCOPY DTRMM_ILNUCOPY +#define TRMM_ILTCOPY DTRMM_ILTUCOPY +#define TRSM_IUNCOPY DTRSM_IUNUCOPY +#define TRSM_IUTCOPY DTRSM_IUTUCOPY +#define TRSM_ILNCOPY DTRSM_ILNUCOPY +#define TRSM_ILTCOPY DTRSM_ILTUCOPY + +#else + +#define TRMM_OUNCOPY DTRMM_OUNNCOPY +#define TRMM_OUTCOPY DTRMM_OUTNCOPY +#define TRMM_OLNCOPY DTRMM_OLNNCOPY +#define TRMM_OLTCOPY DTRMM_OLTNCOPY +#define TRSM_OUNCOPY DTRSM_OUNNCOPY +#define TRSM_OUTCOPY DTRSM_OUTNCOPY +#define TRSM_OLNCOPY DTRSM_OLNNCOPY +#define TRSM_OLTCOPY DTRSM_OLTNCOPY + +#define TRMM_IUNCOPY DTRMM_IUNNCOPY +#define TRMM_IUTCOPY DTRMM_IUTNCOPY +#define TRMM_ILNCOPY DTRMM_ILNNCOPY +#define TRMM_ILTCOPY DTRMM_ILTNCOPY +#define TRSM_IUNCOPY DTRSM_IUNNCOPY +#define TRSM_IUTCOPY DTRSM_IUTNCOPY +#define TRSM_ILNCOPY DTRSM_ILNNCOPY +#define TRSM_ILTCOPY DTRSM_ILTNCOPY + +#endif + +#define GEMM_BETA DGEMM_BETA + +#define GEMM_KERNEL_N DGEMM_KERNEL +#define GEMM_KERNEL_L DGEMM_KERNEL +#define GEMM_KERNEL_R DGEMM_KERNEL +#define GEMM_KERNEL_B DGEMM_KERNEL + +#define TRMM_KERNEL_LN DTRMM_KERNEL_LN +#define TRMM_KERNEL_LT DTRMM_KERNEL_LT +#define TRMM_KERNEL_LR DTRMM_KERNEL_LN +#define TRMM_KERNEL_LC DTRMM_KERNEL_LT +#define TRMM_KERNEL_RN DTRMM_KERNEL_RN +#define TRMM_KERNEL_RT DTRMM_KERNEL_RT +#define TRMM_KERNEL_RR DTRMM_KERNEL_RN +#define TRMM_KERNEL_RC DTRMM_KERNEL_RT + +#define TRSM_KERNEL_LN DTRSM_KERNEL_LN +#define TRSM_KERNEL_LT DTRSM_KERNEL_LT +#define TRSM_KERNEL_LR DTRSM_KERNEL_LN +#define TRSM_KERNEL_LC DTRSM_KERNEL_LT +#define TRSM_KERNEL_RN DTRSM_KERNEL_RN +#define TRSM_KERNEL_RT DTRSM_KERNEL_RT +#define TRSM_KERNEL_RR DTRSM_KERNEL_RN +#define TRSM_KERNEL_RC DTRSM_KERNEL_RT + +#define SYMM_IUTCOPY DSYMM_IUTCOPY +#define SYMM_ILTCOPY DSYMM_ILTCOPY +#define SYMM_OUTCOPY DSYMM_OUTCOPY +#define SYMM_OLTCOPY DSYMM_OLTCOPY + +#define GEMM_NN DGEMM_NN +#define GEMM_CN DGEMM_TN +#define GEMM_TN DGEMM_TN +#define GEMM_NC DGEMM_NT +#define GEMM_NT DGEMM_NT +#define GEMM_CC DGEMM_TT +#define GEMM_CT DGEMM_TT +#define GEMM_TC DGEMM_TT +#define GEMM_TT DGEMM_TT +#define GEMM_NR DGEMM_NN +#define GEMM_TR DGEMM_TN +#define GEMM_CR DGEMM_TN +#define GEMM_RN DGEMM_NN +#define GEMM_RT DGEMM_NT +#define GEMM_RC DGEMM_NT +#define GEMM_RR DGEMM_NN + +#define SYMM_LU DSYMM_LU +#define SYMM_LL DSYMM_LL +#define SYMM_RU DSYMM_RU +#define SYMM_RL DSYMM_RL + +#define HEMM_LU DHEMM_LU +#define HEMM_LL DHEMM_LL +#define HEMM_RU DHEMM_RU +#define HEMM_RL DHEMM_RL + +#define SYRK_UN DSYRK_UN +#define SYRK_UT DSYRK_UT +#define SYRK_LN DSYRK_LN +#define SYRK_LT DSYRK_LT +#define SYRK_UR DSYRK_UN +#define SYRK_UC DSYRK_UT +#define SYRK_LR DSYRK_LN +#define SYRK_LC DSYRK_LT + +#define SYRK_KERNEL_U DSYRK_KERNEL_U +#define SYRK_KERNEL_L DSYRK_KERNEL_L + +#define HERK_UN DSYRK_UN +#define HERK_LN DSYRK_LN +#define HERK_UC DSYRK_UT +#define HERK_LC DSYRK_LT + +#define HER2K_UN DSYR2K_UN +#define HER2K_LN DSYR2K_LN +#define HER2K_UC DSYR2K_UT +#define HER2K_LC DSYR2K_LT + +#define SYR2K_UN DSYR2K_UN +#define SYR2K_UT DSYR2K_UT +#define SYR2K_LN DSYR2K_LN +#define SYR2K_LT DSYR2K_LT +#define SYR2K_UR DSYR2K_UN +#define SYR2K_UC DSYR2K_UT +#define SYR2K_LR DSYR2K_LN +#define SYR2K_LC DSYR2K_LT + +#define SYR2K_KERNEL_U DSYR2K_KERNEL_U +#define SYR2K_KERNEL_L DSYR2K_KERNEL_L + +#define TRMM_LNUU DTRMM_LNUU +#define TRMM_LNUN DTRMM_LNUN +#define TRMM_LNLU DTRMM_LNLU +#define TRMM_LNLN DTRMM_LNLN +#define TRMM_LTUU DTRMM_LTUU +#define TRMM_LTUN DTRMM_LTUN +#define TRMM_LTLU DTRMM_LTLU +#define TRMM_LTLN DTRMM_LTLN +#define TRMM_LRUU DTRMM_LNUU +#define TRMM_LRUN DTRMM_LNUN +#define TRMM_LRLU DTRMM_LNLU +#define TRMM_LRLN DTRMM_LNLN +#define TRMM_LCUU DTRMM_LTUU +#define TRMM_LCUN DTRMM_LTUN +#define TRMM_LCLU DTRMM_LTLU +#define TRMM_LCLN DTRMM_LTLN +#define TRMM_RNUU DTRMM_RNUU +#define TRMM_RNUN DTRMM_RNUN +#define TRMM_RNLU DTRMM_RNLU +#define TRMM_RNLN DTRMM_RNLN +#define TRMM_RTUU DTRMM_RTUU +#define TRMM_RTUN DTRMM_RTUN +#define TRMM_RTLU DTRMM_RTLU +#define TRMM_RTLN DTRMM_RTLN +#define TRMM_RRUU DTRMM_RNUU +#define TRMM_RRUN DTRMM_RNUN +#define TRMM_RRLU DTRMM_RNLU +#define TRMM_RRLN DTRMM_RNLN +#define TRMM_RCUU DTRMM_RTUU +#define TRMM_RCUN DTRMM_RTUN +#define TRMM_RCLU DTRMM_RTLU +#define TRMM_RCLN DTRMM_RTLN + +#define TRSM_LNUU DTRSM_LNUU +#define TRSM_LNUN DTRSM_LNUN +#define TRSM_LNLU DTRSM_LNLU +#define TRSM_LNLN DTRSM_LNLN +#define TRSM_LTUU DTRSM_LTUU +#define TRSM_LTUN DTRSM_LTUN +#define TRSM_LTLU DTRSM_LTLU +#define TRSM_LTLN DTRSM_LTLN +#define TRSM_LRUU DTRSM_LNUU +#define TRSM_LRUN DTRSM_LNUN +#define TRSM_LRLU DTRSM_LNLU +#define TRSM_LRLN DTRSM_LNLN +#define TRSM_LCUU DTRSM_LTUU +#define TRSM_LCUN DTRSM_LTUN +#define TRSM_LCLU DTRSM_LTLU +#define TRSM_LCLN DTRSM_LTLN +#define TRSM_RNUU DTRSM_RNUU +#define TRSM_RNUN DTRSM_RNUN +#define TRSM_RNLU DTRSM_RNLU +#define TRSM_RNLN DTRSM_RNLN +#define TRSM_RTUU DTRSM_RTUU +#define TRSM_RTUN DTRSM_RTUN +#define TRSM_RTLU DTRSM_RTLU +#define TRSM_RTLN DTRSM_RTLN +#define TRSM_RRUU DTRSM_RNUU +#define TRSM_RRUN DTRSM_RNUN +#define TRSM_RRLU DTRSM_RNLU +#define TRSM_RRLN DTRSM_RNLN +#define TRSM_RCUU DTRSM_RTUU +#define TRSM_RCUN DTRSM_RTUN +#define TRSM_RCLU DTRSM_RTLU +#define TRSM_RCLN DTRSM_RTLN + +#define GEMM_THREAD_NN DGEMM_THREAD_NN +#define GEMM_THREAD_CN DGEMM_THREAD_TN +#define GEMM_THREAD_TN DGEMM_THREAD_TN +#define GEMM_THREAD_NC DGEMM_THREAD_NT +#define GEMM_THREAD_NT DGEMM_THREAD_NT +#define GEMM_THREAD_CC DGEMM_THREAD_TT +#define GEMM_THREAD_CT DGEMM_THREAD_TT +#define GEMM_THREAD_TC DGEMM_THREAD_TT +#define GEMM_THREAD_TT DGEMM_THREAD_TT +#define GEMM_THREAD_NR DGEMM_THREAD_NN +#define GEMM_THREAD_TR DGEMM_THREAD_TN +#define GEMM_THREAD_CR DGEMM_THREAD_TN +#define GEMM_THREAD_RN DGEMM_THREAD_NN +#define GEMM_THREAD_RT DGEMM_THREAD_NT +#define GEMM_THREAD_RC DGEMM_THREAD_NT +#define GEMM_THREAD_RR DGEMM_THREAD_NN + +#define SYMM_THREAD_LU DSYMM_THREAD_LU +#define SYMM_THREAD_LL DSYMM_THREAD_LL +#define SYMM_THREAD_RU DSYMM_THREAD_RU +#define SYMM_THREAD_RL DSYMM_THREAD_RL + +#define HEMM_THREAD_LU DHEMM_THREAD_LU +#define HEMM_THREAD_LL DHEMM_THREAD_LL +#define HEMM_THREAD_RU DHEMM_THREAD_RU +#define HEMM_THREAD_RL DHEMM_THREAD_RL + +#define SYRK_THREAD_UN DSYRK_THREAD_UN +#define SYRK_THREAD_UT DSYRK_THREAD_UT +#define SYRK_THREAD_LN DSYRK_THREAD_LN +#define SYRK_THREAD_LT DSYRK_THREAD_LT +#define SYRK_THREAD_UR DSYRK_THREAD_UR +#define SYRK_THREAD_UC DSYRK_THREAD_UC +#define SYRK_THREAD_LR DSYRK_THREAD_LN +#define SYRK_THREAD_LC DSYRK_THREAD_LT + +#define HERK_THREAD_UN DSYRK_THREAD_UN +#define HERK_THREAD_UT DSYRK_THREAD_UT +#define HERK_THREAD_LN DSYRK_THREAD_LN +#define HERK_THREAD_LT DSYRK_THREAD_LT +#define HERK_THREAD_UR DSYRK_THREAD_UR +#define HERK_THREAD_UC DSYRK_THREAD_UC +#define HERK_THREAD_LR DSYRK_THREAD_LN +#define HERK_THREAD_LC DSYRK_THREAD_LT + +#else + +#define AMAX_K SAMAX_K +#define AMIN_K SAMIN_K +#define MAX_K SMAX_K +#define MIN_K SMIN_K +#define IAMAX_K ISAMAX_K +#define IAMIN_K ISAMIN_K +#define IMAX_K ISMAX_K +#define IMIN_K ISMIN_K +#define ASUM_K SASUM_K +#define AXPYU_K SAXPYU_K +#define AXPYC_K SAXPYU_K +#define COPY_K SCOPY_K +#define DOTU_K SDOTU_K +#define DOTC_K SDOTC_K +#define NRM2_K SNRM2_K +#define SCAL_K SSCAL_K +#define SWAP_K SSWAP_K +#define ROT_K SROT_K + +#define GEMV_N SGEMV_N +#define GEMV_T SGEMV_T +#define GEMV_R SGEMV_R +#define GEMV_C SGEMV_C +#define GEMV_O SGEMV_O +#define GEMV_U SGEMV_U +#define GEMV_S SGEMV_S +#define GEMV_D SGEMV_D + +#define GERU_K SGERU_K +#define GERC_K SGERC_K +#define GERV_K SGERV_K +#define GERD_K SGERD_K + +#define SYMV_U SSYMV_U +#define SYMV_L SSYMV_L + +#define SYMV_THREAD_U SSYMV_THREAD_U +#define SYMV_THREAD_L SSYMV_THREAD_L + +#define GEMM_ONCOPY SGEMM_ONCOPY +#define GEMM_OTCOPY SGEMM_OTCOPY +#define GEMM_INCOPY SGEMM_INCOPY +#define GEMM_ITCOPY SGEMM_ITCOPY + +#ifdef UNIT + +#define TRMM_OUNCOPY STRMM_OUNUCOPY +#define TRMM_OUTCOPY STRMM_OUTUCOPY +#define TRMM_OLNCOPY STRMM_OLNUCOPY +#define TRMM_OLTCOPY STRMM_OLTUCOPY +#define TRSM_OUNCOPY STRSM_OUNUCOPY +#define TRSM_OUTCOPY STRSM_OUTUCOPY +#define TRSM_OLNCOPY STRSM_OLNUCOPY +#define TRSM_OLTCOPY STRSM_OLTUCOPY + +#define TRMM_IUNCOPY STRMM_IUNUCOPY +#define TRMM_IUTCOPY STRMM_IUTUCOPY +#define TRMM_ILNCOPY STRMM_ILNUCOPY +#define TRMM_ILTCOPY STRMM_ILTUCOPY +#define TRSM_IUNCOPY STRSM_IUNUCOPY +#define TRSM_IUTCOPY STRSM_IUTUCOPY +#define TRSM_ILNCOPY STRSM_ILNUCOPY +#define TRSM_ILTCOPY STRSM_ILTUCOPY + +#else + +#define TRMM_OUNCOPY STRMM_OUNNCOPY +#define TRMM_OUTCOPY STRMM_OUTNCOPY +#define TRMM_OLNCOPY STRMM_OLNNCOPY +#define TRMM_OLTCOPY STRMM_OLTNCOPY +#define TRSM_OUNCOPY STRSM_OUNNCOPY +#define TRSM_OUTCOPY STRSM_OUTNCOPY +#define TRSM_OLNCOPY STRSM_OLNNCOPY +#define TRSM_OLTCOPY STRSM_OLTNCOPY + +#define TRMM_IUNCOPY STRMM_IUNNCOPY +#define TRMM_IUTCOPY STRMM_IUTNCOPY +#define TRMM_ILNCOPY STRMM_ILNNCOPY +#define TRMM_ILTCOPY STRMM_ILTNCOPY +#define TRSM_IUNCOPY STRSM_IUNNCOPY +#define TRSM_IUTCOPY STRSM_IUTNCOPY +#define TRSM_ILNCOPY STRSM_ILNNCOPY +#define TRSM_ILTCOPY STRSM_ILTNCOPY + +#endif + +#define GEMM_BETA SGEMM_BETA + +#define GEMM_KERNEL_N SGEMM_KERNEL +#define GEMM_KERNEL_L SGEMM_KERNEL +#define GEMM_KERNEL_R SGEMM_KERNEL +#define GEMM_KERNEL_B SGEMM_KERNEL + +#define TRMM_KERNEL_LN STRMM_KERNEL_LN +#define TRMM_KERNEL_LT STRMM_KERNEL_LT +#define TRMM_KERNEL_LR STRMM_KERNEL_LN +#define TRMM_KERNEL_LC STRMM_KERNEL_LT +#define TRMM_KERNEL_RN STRMM_KERNEL_RN +#define TRMM_KERNEL_RT STRMM_KERNEL_RT +#define TRMM_KERNEL_RR STRMM_KERNEL_RN +#define TRMM_KERNEL_RC STRMM_KERNEL_RT + +#define TRSM_KERNEL_LN STRSM_KERNEL_LN +#define TRSM_KERNEL_LT STRSM_KERNEL_LT +#define TRSM_KERNEL_LR STRSM_KERNEL_LN +#define TRSM_KERNEL_LC STRSM_KERNEL_LT +#define TRSM_KERNEL_RN STRSM_KERNEL_RN +#define TRSM_KERNEL_RT STRSM_KERNEL_RT +#define TRSM_KERNEL_RR STRSM_KERNEL_RN +#define TRSM_KERNEL_RC STRSM_KERNEL_RT + +#define SYMM_IUTCOPY SSYMM_IUTCOPY +#define SYMM_ILTCOPY SSYMM_ILTCOPY +#define SYMM_OUTCOPY SSYMM_OUTCOPY +#define SYMM_OLTCOPY SSYMM_OLTCOPY + +#define GEMM_NN SGEMM_NN +#define GEMM_CN SGEMM_TN +#define GEMM_TN SGEMM_TN +#define GEMM_NC SGEMM_NT +#define GEMM_NT SGEMM_NT +#define GEMM_CC SGEMM_TT +#define GEMM_CT SGEMM_TT +#define GEMM_TC SGEMM_TT +#define GEMM_TT SGEMM_TT +#define GEMM_NR SGEMM_NN +#define GEMM_TR SGEMM_TN +#define GEMM_CR SGEMM_TN +#define GEMM_RN SGEMM_NN +#define GEMM_RT SGEMM_NT +#define GEMM_RC SGEMM_NT +#define GEMM_RR SGEMM_NN + +#define SYMM_LU SSYMM_LU +#define SYMM_LL SSYMM_LL +#define SYMM_RU SSYMM_RU +#define SYMM_RL SSYMM_RL + +#define HEMM_LU SHEMM_LU +#define HEMM_LL SHEMM_LL +#define HEMM_RU SHEMM_RU +#define HEMM_RL SHEMM_RL + +#define SYRK_UN SSYRK_UN +#define SYRK_UT SSYRK_UT +#define SYRK_LN SSYRK_LN +#define SYRK_LT SSYRK_LT +#define SYRK_UR SSYRK_UN +#define SYRK_UC SSYRK_UT +#define SYRK_LR SSYRK_LN +#define SYRK_LC SSYRK_LT + +#define SYRK_KERNEL_U SSYRK_KERNEL_U +#define SYRK_KERNEL_L SSYRK_KERNEL_L + +#define HERK_UN SSYRK_UN +#define HERK_LN SSYRK_LN +#define HERK_UC SSYRK_UT +#define HERK_LC SSYRK_LT + +#define HER2K_UN SSYR2K_UN +#define HER2K_LN SSYR2K_LN +#define HER2K_UC SSYR2K_UT +#define HER2K_LC SSYR2K_LT + +#define SYR2K_UN SSYR2K_UN +#define SYR2K_UT SSYR2K_UT +#define SYR2K_LN SSYR2K_LN +#define SYR2K_LT SSYR2K_LT +#define SYR2K_UR SSYR2K_UN +#define SYR2K_UC SSYR2K_UT +#define SYR2K_LR SSYR2K_LN +#define SYR2K_LC SSYR2K_LT + +#define SYR2K_KERNEL_U SSYR2K_KERNEL_U +#define SYR2K_KERNEL_L SSYR2K_KERNEL_L + +#define TRMM_LNUU STRMM_LNUU +#define TRMM_LNUN STRMM_LNUN +#define TRMM_LNLU STRMM_LNLU +#define TRMM_LNLN STRMM_LNLN +#define TRMM_LTUU STRMM_LTUU +#define TRMM_LTUN STRMM_LTUN +#define TRMM_LTLU STRMM_LTLU +#define TRMM_LTLN STRMM_LTLN +#define TRMM_LRUU STRMM_LNUU +#define TRMM_LRUN STRMM_LNUN +#define TRMM_LRLU STRMM_LNLU +#define TRMM_LRLN STRMM_LNLN +#define TRMM_LCUU STRMM_LTUU +#define TRMM_LCUN STRMM_LTUN +#define TRMM_LCLU STRMM_LTLU +#define TRMM_LCLN STRMM_LTLN +#define TRMM_RNUU STRMM_RNUU +#define TRMM_RNUN STRMM_RNUN +#define TRMM_RNLU STRMM_RNLU +#define TRMM_RNLN STRMM_RNLN +#define TRMM_RTUU STRMM_RTUU +#define TRMM_RTUN STRMM_RTUN +#define TRMM_RTLU STRMM_RTLU +#define TRMM_RTLN STRMM_RTLN +#define TRMM_RRUU STRMM_RNUU +#define TRMM_RRUN STRMM_RNUN +#define TRMM_RRLU STRMM_RNLU +#define TRMM_RRLN STRMM_RNLN +#define TRMM_RCUU STRMM_RTUU +#define TRMM_RCUN STRMM_RTUN +#define TRMM_RCLU STRMM_RTLU +#define TRMM_RCLN STRMM_RTLN + +#define TRSM_LNUU STRSM_LNUU +#define TRSM_LNUN STRSM_LNUN +#define TRSM_LNLU STRSM_LNLU +#define TRSM_LNLN STRSM_LNLN +#define TRSM_LTUU STRSM_LTUU +#define TRSM_LTUN STRSM_LTUN +#define TRSM_LTLU STRSM_LTLU +#define TRSM_LTLN STRSM_LTLN +#define TRSM_LRUU STRSM_LNUU +#define TRSM_LRUN STRSM_LNUN +#define TRSM_LRLU STRSM_LNLU +#define TRSM_LRLN STRSM_LNLN +#define TRSM_LCUU STRSM_LTUU +#define TRSM_LCUN STRSM_LTUN +#define TRSM_LCLU STRSM_LTLU +#define TRSM_LCLN STRSM_LTLN +#define TRSM_RNUU STRSM_RNUU +#define TRSM_RNUN STRSM_RNUN +#define TRSM_RNLU STRSM_RNLU +#define TRSM_RNLN STRSM_RNLN +#define TRSM_RTUU STRSM_RTUU +#define TRSM_RTUN STRSM_RTUN +#define TRSM_RTLU STRSM_RTLU +#define TRSM_RTLN STRSM_RTLN +#define TRSM_RRUU STRSM_RNUU +#define TRSM_RRUN STRSM_RNUN +#define TRSM_RRLU STRSM_RNLU +#define TRSM_RRLN STRSM_RNLN +#define TRSM_RCUU STRSM_RTUU +#define TRSM_RCUN STRSM_RTUN +#define TRSM_RCLU STRSM_RTLU +#define TRSM_RCLN STRSM_RTLN + +#define GEMM_THREAD_NN SGEMM_THREAD_NN +#define GEMM_THREAD_CN SGEMM_THREAD_TN +#define GEMM_THREAD_TN SGEMM_THREAD_TN +#define GEMM_THREAD_NC SGEMM_THREAD_NT +#define GEMM_THREAD_NT SGEMM_THREAD_NT +#define GEMM_THREAD_CC SGEMM_THREAD_TT +#define GEMM_THREAD_CT SGEMM_THREAD_TT +#define GEMM_THREAD_TC SGEMM_THREAD_TT +#define GEMM_THREAD_TT SGEMM_THREAD_TT +#define GEMM_THREAD_NR SGEMM_THREAD_NN +#define GEMM_THREAD_TR SGEMM_THREAD_TN +#define GEMM_THREAD_CR SGEMM_THREAD_TN +#define GEMM_THREAD_RN SGEMM_THREAD_NN +#define GEMM_THREAD_RT SGEMM_THREAD_NT +#define GEMM_THREAD_RC SGEMM_THREAD_NT +#define GEMM_THREAD_RR SGEMM_THREAD_NN + +#define SYMM_THREAD_LU SSYMM_THREAD_LU +#define SYMM_THREAD_LL SSYMM_THREAD_LL +#define SYMM_THREAD_RU SSYMM_THREAD_RU +#define SYMM_THREAD_RL SSYMM_THREAD_RL + +#define HEMM_THREAD_LU SHEMM_THREAD_LU +#define HEMM_THREAD_LL SHEMM_THREAD_LL +#define HEMM_THREAD_RU SHEMM_THREAD_RU +#define HEMM_THREAD_RL SHEMM_THREAD_RL + +#define SYRK_THREAD_UN SSYRK_THREAD_UN +#define SYRK_THREAD_UT SSYRK_THREAD_UT +#define SYRK_THREAD_LN SSYRK_THREAD_LN +#define SYRK_THREAD_LT SSYRK_THREAD_LT +#define SYRK_THREAD_UR SSYRK_THREAD_UR +#define SYRK_THREAD_UC SSYRK_THREAD_UC +#define SYRK_THREAD_LR SSYRK_THREAD_LN +#define SYRK_THREAD_LC SSYRK_THREAD_LT + +#define HERK_THREAD_UN SSYRK_THREAD_UN +#define HERK_THREAD_UT SSYRK_THREAD_UT +#define HERK_THREAD_LN SSYRK_THREAD_LN +#define HERK_THREAD_LT SSYRK_THREAD_LT +#define HERK_THREAD_UR SSYRK_THREAD_UR +#define HERK_THREAD_UC SSYRK_THREAD_UC +#define HERK_THREAD_LR SSYRK_THREAD_LN +#define HERK_THREAD_LC SSYRK_THREAD_LT + +#endif +#else +#ifdef XDOUBLE + +#define AMAX_K XAMAX_K +#define AMIN_K XAMIN_K +#define MAX_K XMAX_K +#define MIN_K XMIN_K +#define IAMAX_K IXAMAX_K +#define IAMIN_K IXAMIN_K +#define IMAX_K IXMAX_K +#define IMIN_K IXMIN_K +#define ASUM_K XASUM_K +#define AXPYU_K XAXPYU_K +#define AXPYC_K XAXPYC_K +#define COPY_K XCOPY_K +#define DOTU_K XDOTU_K +#define DOTC_K XDOTC_K +#define NRM2_K XNRM2_K +#define SCAL_K XSCAL_K +#define SWAP_K XSWAP_K +#define ROT_K XROT_K + +#define GEMV_N XGEMV_N +#define GEMV_T XGEMV_T +#define GEMV_R XGEMV_R +#define GEMV_C XGEMV_C +#define GEMV_O XGEMV_O +#define GEMV_U XGEMV_U +#define GEMV_S XGEMV_S +#define GEMV_D XGEMV_D + +#define GERU_K XGERU_K +#define GERC_K XGERC_K +#define GERV_K XGERV_K +#define GERD_K XGERD_K + +#define SYMV_U XSYMV_U +#define SYMV_L XSYMV_L +#define HEMV_U XHEMV_U +#define HEMV_L XHEMV_L +#define HEMV_V XHEMV_V +#define HEMV_M XHEMV_M + +#define SYMV_THREAD_U XSYMV_THREAD_U +#define SYMV_THREAD_L XSYMV_THREAD_L +#define HEMV_THREAD_U XHEMV_THREAD_U +#define HEMV_THREAD_L XHEMV_THREAD_L +#define HEMV_THREAD_V XHEMV_THREAD_V +#define HEMV_THREAD_M XHEMV_THREAD_M + +#define GEMM_ONCOPY XGEMM_ONCOPY +#define GEMM_OTCOPY XGEMM_OTCOPY +#define GEMM_INCOPY XGEMM_INCOPY +#define GEMM_ITCOPY XGEMM_ITCOPY + +#define GEMM3M_ONCOPYB XGEMM3M_ONCOPYB +#define GEMM3M_ONCOPYR XGEMM3M_ONCOPYR +#define GEMM3M_ONCOPYI XGEMM3M_ONCOPYI +#define GEMM3M_OTCOPYB XGEMM3M_OTCOPYB +#define GEMM3M_OTCOPYR XGEMM3M_OTCOPYR +#define GEMM3M_OTCOPYI XGEMM3M_OTCOPYI +#define GEMM3M_INCOPYB XGEMM3M_INCOPYB +#define GEMM3M_INCOPYR XGEMM3M_INCOPYR +#define GEMM3M_INCOPYI XGEMM3M_INCOPYI +#define GEMM3M_ITCOPYB XGEMM3M_ITCOPYB +#define GEMM3M_ITCOPYR XGEMM3M_ITCOPYR +#define GEMM3M_ITCOPYI XGEMM3M_ITCOPYI + +#ifdef UNIT + +#define TRMM_OUNCOPY XTRMM_OUNUCOPY +#define TRMM_OUTCOPY XTRMM_OUTUCOPY +#define TRMM_OLNCOPY XTRMM_OLNUCOPY +#define TRMM_OLTCOPY XTRMM_OLTUCOPY +#define TRSM_OUNCOPY XTRSM_OUNUCOPY +#define TRSM_OUTCOPY XTRSM_OUTUCOPY +#define TRSM_OLNCOPY XTRSM_OLNUCOPY +#define TRSM_OLTCOPY XTRSM_OLTUCOPY + +#define TRMM_IUNCOPY XTRMM_IUNUCOPY +#define TRMM_IUTCOPY XTRMM_IUTUCOPY +#define TRMM_ILNCOPY XTRMM_ILNUCOPY +#define TRMM_ILTCOPY XTRMM_ILTUCOPY +#define TRSM_IUNCOPY XTRSM_IUNUCOPY +#define TRSM_IUTCOPY XTRSM_IUTUCOPY +#define TRSM_ILNCOPY XTRSM_ILNUCOPY +#define TRSM_ILTCOPY XTRSM_ILTUCOPY + +#else + +#define TRMM_OUNCOPY XTRMM_OUNNCOPY +#define TRMM_OUTCOPY XTRMM_OUTNCOPY +#define TRMM_OLNCOPY XTRMM_OLNNCOPY +#define TRMM_OLTCOPY XTRMM_OLTNCOPY +#define TRSM_OUNCOPY XTRSM_OUNNCOPY +#define TRSM_OUTCOPY XTRSM_OUTNCOPY +#define TRSM_OLNCOPY XTRSM_OLNNCOPY +#define TRSM_OLTCOPY XTRSM_OLTNCOPY + +#define TRMM_IUNCOPY XTRMM_IUNNCOPY +#define TRMM_IUTCOPY XTRMM_IUTNCOPY +#define TRMM_ILNCOPY XTRMM_ILNNCOPY +#define TRMM_ILTCOPY XTRMM_ILTNCOPY +#define TRSM_IUNCOPY XTRSM_IUNNCOPY +#define TRSM_IUTCOPY XTRSM_IUTNCOPY +#define TRSM_ILNCOPY XTRSM_ILNNCOPY +#define TRSM_ILTCOPY XTRSM_ILTNCOPY + +#endif + +#define SYMM3M_ILCOPYB XSYMM3M_ILCOPYB +#define SYMM3M_IUCOPYB XSYMM3M_IUCOPYB +#define SYMM3M_ILCOPYR XSYMM3M_ILCOPYR +#define SYMM3M_IUCOPYR XSYMM3M_IUCOPYR +#define SYMM3M_ILCOPYI XSYMM3M_ILCOPYI +#define SYMM3M_IUCOPYI XSYMM3M_IUCOPYI + +#define SYMM3M_OLCOPYB XSYMM3M_OLCOPYB +#define SYMM3M_OUCOPYB XSYMM3M_OUCOPYB +#define SYMM3M_OLCOPYR XSYMM3M_OLCOPYR +#define SYMM3M_OUCOPYR XSYMM3M_OUCOPYR +#define SYMM3M_OLCOPYI XSYMM3M_OLCOPYI +#define SYMM3M_OUCOPYI XSYMM3M_OUCOPYI + +#define HEMM3M_ILCOPYB XHEMM3M_ILCOPYB +#define HEMM3M_IUCOPYB XHEMM3M_IUCOPYB +#define HEMM3M_ILCOPYR XHEMM3M_ILCOPYR +#define HEMM3M_IUCOPYR XHEMM3M_IUCOPYR +#define HEMM3M_ILCOPYI XHEMM3M_ILCOPYI +#define HEMM3M_IUCOPYI XHEMM3M_IUCOPYI + +#define HEMM3M_OLCOPYB XHEMM3M_OLCOPYB +#define HEMM3M_OUCOPYB XHEMM3M_OUCOPYB +#define HEMM3M_OLCOPYR XHEMM3M_OLCOPYR +#define HEMM3M_OUCOPYR XHEMM3M_OUCOPYR +#define HEMM3M_OLCOPYI XHEMM3M_OLCOPYI +#define HEMM3M_OUCOPYI XHEMM3M_OUCOPYI + +#define GEMM_BETA XGEMM_BETA + +#define GEMM_KERNEL_N XGEMM_KERNEL_N +#define GEMM_KERNEL_L XGEMM_KERNEL_L +#define GEMM_KERNEL_R XGEMM_KERNEL_R +#define GEMM_KERNEL_B XGEMM_KERNEL_B + +#define GEMM3M_KERNEL XGEMM3M_KERNEL + +#define TRMM_KERNEL_LN XTRMM_KERNEL_LN +#define TRMM_KERNEL_LT XTRMM_KERNEL_LT +#define TRMM_KERNEL_LR XTRMM_KERNEL_LR +#define TRMM_KERNEL_LC XTRMM_KERNEL_LC +#define TRMM_KERNEL_RN XTRMM_KERNEL_RN +#define TRMM_KERNEL_RT XTRMM_KERNEL_RT +#define TRMM_KERNEL_RR XTRMM_KERNEL_RR +#define TRMM_KERNEL_RC XTRMM_KERNEL_RC + +#define TRSM_KERNEL_LN XTRSM_KERNEL_LN +#define TRSM_KERNEL_LT XTRSM_KERNEL_LT +#define TRSM_KERNEL_LR XTRSM_KERNEL_LR +#define TRSM_KERNEL_LC XTRSM_KERNEL_LC +#define TRSM_KERNEL_RN XTRSM_KERNEL_RN +#define TRSM_KERNEL_RT XTRSM_KERNEL_RT +#define TRSM_KERNEL_RR XTRSM_KERNEL_RR +#define TRSM_KERNEL_RC XTRSM_KERNEL_RC + +#define GEMM_NN XGEMM_NN +#define GEMM_CN XGEMM_CN +#define GEMM_TN XGEMM_TN +#define GEMM_NC XGEMM_NC +#define GEMM_NT XGEMM_NT +#define GEMM_CC XGEMM_CC +#define GEMM_CT XGEMM_CT +#define GEMM_TC XGEMM_TC +#define GEMM_TT XGEMM_TT +#define GEMM_NR XGEMM_NR +#define GEMM_TR XGEMM_TR +#define GEMM_CR XGEMM_CR +#define GEMM_RN XGEMM_RN +#define GEMM_RT XGEMM_RT +#define GEMM_RC XGEMM_RC +#define GEMM_RR XGEMM_RR + +#define SYMM_LU XSYMM_LU +#define SYMM_LL XSYMM_LL +#define SYMM_RU XSYMM_RU +#define SYMM_RL XSYMM_RL + +#define HEMM_LU XHEMM_LU +#define HEMM_LL XHEMM_LL +#define HEMM_RU XHEMM_RU +#define HEMM_RL XHEMM_RL + +#define HEMM_IUTCOPY XHEMM_IUTCOPY +#define HEMM_ILTCOPY XHEMM_ILTCOPY +#define HEMM_OUTCOPY XHEMM_OUTCOPY +#define HEMM_OLTCOPY XHEMM_OLTCOPY + +#define SYRK_UN XSYRK_UN +#define SYRK_UT XSYRK_UT +#define SYRK_LN XSYRK_LN +#define SYRK_LT XSYRK_LT +#define SYRK_UR XSYRK_UN +#define SYRK_UC XSYRK_UT +#define SYRK_LR XSYRK_LN +#define SYRK_LC XSYRK_LT + +#define SYRK_KERNEL_U XSYRK_KERNEL_U +#define SYRK_KERNEL_L XSYRK_KERNEL_L + +#define HERK_UN XHERK_UN +#define HERK_LN XHERK_LN +#define HERK_UC XHERK_UC +#define HERK_LC XHERK_LC + +#define HER2K_UN XHER2K_UN +#define HER2K_LN XHER2K_LN +#define HER2K_UC XHER2K_UC +#define HER2K_LC XHER2K_LC + +#define SYR2K_UN XSYR2K_UN +#define SYR2K_UT XSYR2K_UT +#define SYR2K_LN XSYR2K_LN +#define SYR2K_LT XSYR2K_LT +#define SYR2K_UR XSYR2K_UN +#define SYR2K_UC XSYR2K_UT +#define SYR2K_LR XSYR2K_LN +#define SYR2K_LC XSYR2K_LT + +#define SYR2K_KERNEL_U XSYR2K_KERNEL_U +#define SYR2K_KERNEL_L XSYR2K_KERNEL_L + +#define TRMM_LNUU XTRMM_LNUU +#define TRMM_LNUN XTRMM_LNUN +#define TRMM_LNLU XTRMM_LNLU +#define TRMM_LNLN XTRMM_LNLN +#define TRMM_LTUU XTRMM_LTUU +#define TRMM_LTUN XTRMM_LTUN +#define TRMM_LTLU XTRMM_LTLU +#define TRMM_LTLN XTRMM_LTLN +#define TRMM_LRUU XTRMM_LRUU +#define TRMM_LRUN XTRMM_LRUN +#define TRMM_LRLU XTRMM_LRLU +#define TRMM_LRLN XTRMM_LRLN +#define TRMM_LCUU XTRMM_LCUU +#define TRMM_LCUN XTRMM_LCUN +#define TRMM_LCLU XTRMM_LCLU +#define TRMM_LCLN XTRMM_LCLN +#define TRMM_RNUU XTRMM_RNUU +#define TRMM_RNUN XTRMM_RNUN +#define TRMM_RNLU XTRMM_RNLU +#define TRMM_RNLN XTRMM_RNLN +#define TRMM_RTUU XTRMM_RTUU +#define TRMM_RTUN XTRMM_RTUN +#define TRMM_RTLU XTRMM_RTLU +#define TRMM_RTLN XTRMM_RTLN +#define TRMM_RRUU XTRMM_RRUU +#define TRMM_RRUN XTRMM_RRUN +#define TRMM_RRLU XTRMM_RRLU +#define TRMM_RRLN XTRMM_RRLN +#define TRMM_RCUU XTRMM_RCUU +#define TRMM_RCUN XTRMM_RCUN +#define TRMM_RCLU XTRMM_RCLU +#define TRMM_RCLN XTRMM_RCLN + +#define TRSM_LNUU XTRSM_LNUU +#define TRSM_LNUN XTRSM_LNUN +#define TRSM_LNLU XTRSM_LNLU +#define TRSM_LNLN XTRSM_LNLN +#define TRSM_LTUU XTRSM_LTUU +#define TRSM_LTUN XTRSM_LTUN +#define TRSM_LTLU XTRSM_LTLU +#define TRSM_LTLN XTRSM_LTLN +#define TRSM_LRUU XTRSM_LRUU +#define TRSM_LRUN XTRSM_LRUN +#define TRSM_LRLU XTRSM_LRLU +#define TRSM_LRLN XTRSM_LRLN +#define TRSM_LCUU XTRSM_LCUU +#define TRSM_LCUN XTRSM_LCUN +#define TRSM_LCLU XTRSM_LCLU +#define TRSM_LCLN XTRSM_LCLN +#define TRSM_RNUU XTRSM_RNUU +#define TRSM_RNUN XTRSM_RNUN +#define TRSM_RNLU XTRSM_RNLU +#define TRSM_RNLN XTRSM_RNLN +#define TRSM_RTUU XTRSM_RTUU +#define TRSM_RTUN XTRSM_RTUN +#define TRSM_RTLU XTRSM_RTLU +#define TRSM_RTLN XTRSM_RTLN +#define TRSM_RRUU XTRSM_RRUU +#define TRSM_RRUN XTRSM_RRUN +#define TRSM_RRLU XTRSM_RRLU +#define TRSM_RRLN XTRSM_RRLN +#define TRSM_RCUU XTRSM_RCUU +#define TRSM_RCUN XTRSM_RCUN +#define TRSM_RCLU XTRSM_RCLU +#define TRSM_RCLN XTRSM_RCLN + + +#define GEMM_THREAD_NN XGEMM_THREAD_NN +#define GEMM_THREAD_CN XGEMM_THREAD_CN +#define GEMM_THREAD_TN XGEMM_THREAD_TN +#define GEMM_THREAD_NC XGEMM_THREAD_NC +#define GEMM_THREAD_NT XGEMM_THREAD_NT +#define GEMM_THREAD_CC XGEMM_THREAD_CC +#define GEMM_THREAD_CT XGEMM_THREAD_CT +#define GEMM_THREAD_TC XGEMM_THREAD_TC +#define GEMM_THREAD_TT XGEMM_THREAD_TT +#define GEMM_THREAD_NR XGEMM_THREAD_NR +#define GEMM_THREAD_TR XGEMM_THREAD_TR +#define GEMM_THREAD_CR XGEMM_THREAD_CR +#define GEMM_THREAD_RN XGEMM_THREAD_RN +#define GEMM_THREAD_RT XGEMM_THREAD_RT +#define GEMM_THREAD_RC XGEMM_THREAD_RC +#define GEMM_THREAD_RR XGEMM_THREAD_RR + +#define SYMM_THREAD_LU XSYMM_THREAD_LU +#define SYMM_THREAD_LL XSYMM_THREAD_LL +#define SYMM_THREAD_RU XSYMM_THREAD_RU +#define SYMM_THREAD_RL XSYMM_THREAD_RL + +#define HEMM_THREAD_LU XHEMM_THREAD_LU +#define HEMM_THREAD_LL XHEMM_THREAD_LL +#define HEMM_THREAD_RU XHEMM_THREAD_RU +#define HEMM_THREAD_RL XHEMM_THREAD_RL + +#define SYRK_THREAD_UN XSYRK_THREAD_UN +#define SYRK_THREAD_UT XSYRK_THREAD_UT +#define SYRK_THREAD_LN XSYRK_THREAD_LN +#define SYRK_THREAD_LT XSYRK_THREAD_LT +#define SYRK_THREAD_UR XSYRK_THREAD_UR +#define SYRK_THREAD_UC XSYRK_THREAD_UC +#define SYRK_THREAD_LR XSYRK_THREAD_LR +#define SYRK_THREAD_LC XSYRK_THREAD_LC + +#define HERK_THREAD_UN XHERK_THREAD_UN +#define HERK_THREAD_UT XHERK_THREAD_UT +#define HERK_THREAD_LN XHERK_THREAD_LN +#define HERK_THREAD_LT XHERK_THREAD_LT +#define HERK_THREAD_UR XHERK_THREAD_UR +#define HERK_THREAD_UC XHERK_THREAD_UC +#define HERK_THREAD_LR XHERK_THREAD_LR +#define HERK_THREAD_LC XHERK_THREAD_LC + +#define GEMM3M_NN XGEMM3M_NN +#define GEMM3M_CN XGEMM3M_CN +#define GEMM3M_TN XGEMM3M_TN +#define GEMM3M_NC XGEMM3M_NC +#define GEMM3M_NT XGEMM3M_NT +#define GEMM3M_CC XGEMM3M_CC +#define GEMM3M_CT XGEMM3M_CT +#define GEMM3M_TC XGEMM3M_TC +#define GEMM3M_TT XGEMM3M_TT +#define GEMM3M_NR XGEMM3M_NR +#define GEMM3M_TR XGEMM3M_TR +#define GEMM3M_CR XGEMM3M_CR +#define GEMM3M_RN XGEMM3M_RN +#define GEMM3M_RT XGEMM3M_RT +#define GEMM3M_RC XGEMM3M_RC +#define GEMM3M_RR XGEMM3M_RR + +#define GEMM3M_THREAD_NN XGEMM3M_THREAD_NN +#define GEMM3M_THREAD_CN XGEMM3M_THREAD_CN +#define GEMM3M_THREAD_TN XGEMM3M_THREAD_TN +#define GEMM3M_THREAD_NC XGEMM3M_THREAD_NC +#define GEMM3M_THREAD_NT XGEMM3M_THREAD_NT +#define GEMM3M_THREAD_CC XGEMM3M_THREAD_CC +#define GEMM3M_THREAD_CT XGEMM3M_THREAD_CT +#define GEMM3M_THREAD_TC XGEMM3M_THREAD_TC +#define GEMM3M_THREAD_TT XGEMM3M_THREAD_TT +#define GEMM3M_THREAD_NR XGEMM3M_THREAD_NR +#define GEMM3M_THREAD_TR XGEMM3M_THREAD_TR +#define GEMM3M_THREAD_CR XGEMM3M_THREAD_CR +#define GEMM3M_THREAD_RN XGEMM3M_THREAD_RN +#define GEMM3M_THREAD_RT XGEMM3M_THREAD_RT +#define GEMM3M_THREAD_RC XGEMM3M_THREAD_RC +#define GEMM3M_THREAD_RR XGEMM3M_THREAD_RR + +#define SYMM3M_LU XSYMM3M_LU +#define SYMM3M_LL XSYMM3M_LL +#define SYMM3M_RU XSYMM3M_RU +#define SYMM3M_RL XSYMM3M_RL + +#define SYMM3M_THREAD_LU XSYMM3M_THREAD_LU +#define SYMM3M_THREAD_LL XSYMM3M_THREAD_LL +#define SYMM3M_THREAD_RU XSYMM3M_THREAD_RU +#define SYMM3M_THREAD_RL XSYMM3M_THREAD_RL + +#define HEMM3M_LU XHEMM3M_LU +#define HEMM3M_LL XHEMM3M_LL +#define HEMM3M_RU XHEMM3M_RU +#define HEMM3M_RL XHEMM3M_RL + +#define HEMM3M_THREAD_LU XHEMM3M_THREAD_LU +#define HEMM3M_THREAD_LL XHEMM3M_THREAD_LL +#define HEMM3M_THREAD_RU XHEMM3M_THREAD_RU +#define HEMM3M_THREAD_RL XHEMM3M_THREAD_RL + +#define SYMM_IUTCOPY XSYMM_IUTCOPY +#define SYMM_ILTCOPY XSYMM_ILTCOPY +#define SYMM_OUTCOPY XSYMM_OUTCOPY +#define SYMM_OLTCOPY XSYMM_OLTCOPY + +#elif defined(DOUBLE) + +#define AMAX_K ZAMAX_K +#define AMIN_K ZAMIN_K +#define MAX_K ZMAX_K +#define MIN_K ZMIN_K +#define IAMAX_K IZAMAX_K +#define IAMIN_K IZAMIN_K +#define IMAX_K IZMAX_K +#define IMIN_K IZMIN_K +#define ASUM_K ZASUM_K +#define AXPYU_K ZAXPYU_K +#define AXPYC_K ZAXPYC_K +#define COPY_K ZCOPY_K +#define DOTU_K ZDOTU_K +#define DOTC_K ZDOTC_K +#define NRM2_K ZNRM2_K +#define SCAL_K ZSCAL_K +#define SWAP_K ZSWAP_K +#define ROT_K ZROT_K + +#define GEMV_N ZGEMV_N +#define GEMV_T ZGEMV_T +#define GEMV_R ZGEMV_R +#define GEMV_C ZGEMV_C +#define GEMV_O ZGEMV_O +#define GEMV_U ZGEMV_U +#define GEMV_S ZGEMV_S +#define GEMV_D ZGEMV_D + +#define GERU_K ZGERU_K +#define GERC_K ZGERC_K +#define GERV_K ZGERV_K +#define GERD_K ZGERD_K + +#define SYMV_U ZSYMV_U +#define SYMV_L ZSYMV_L +#define HEMV_U ZHEMV_U +#define HEMV_L ZHEMV_L +#define HEMV_V ZHEMV_V +#define HEMV_M ZHEMV_M + +#define SYMV_THREAD_U ZSYMV_THREAD_U +#define SYMV_THREAD_L ZSYMV_THREAD_L +#define HEMV_THREAD_U ZHEMV_THREAD_U +#define HEMV_THREAD_L ZHEMV_THREAD_L +#define HEMV_THREAD_V ZHEMV_THREAD_V +#define HEMV_THREAD_M ZHEMV_THREAD_M + +#define GEMM_ONCOPY ZGEMM_ONCOPY +#define GEMM_OTCOPY ZGEMM_OTCOPY +#define GEMM_INCOPY ZGEMM_INCOPY +#define GEMM_ITCOPY ZGEMM_ITCOPY + +#define GEMM3M_ONCOPYB ZGEMM3M_ONCOPYB +#define GEMM3M_ONCOPYR ZGEMM3M_ONCOPYR +#define GEMM3M_ONCOPYI ZGEMM3M_ONCOPYI +#define GEMM3M_OTCOPYB ZGEMM3M_OTCOPYB +#define GEMM3M_OTCOPYR ZGEMM3M_OTCOPYR +#define GEMM3M_OTCOPYI ZGEMM3M_OTCOPYI +#define GEMM3M_INCOPYB ZGEMM3M_INCOPYB +#define GEMM3M_INCOPYR ZGEMM3M_INCOPYR +#define GEMM3M_INCOPYI ZGEMM3M_INCOPYI +#define GEMM3M_ITCOPYB ZGEMM3M_ITCOPYB +#define GEMM3M_ITCOPYR ZGEMM3M_ITCOPYR +#define GEMM3M_ITCOPYI ZGEMM3M_ITCOPYI + +#ifdef UNIT + +#define TRMM_OUNCOPY ZTRMM_OUNUCOPY +#define TRMM_OUTCOPY ZTRMM_OUTUCOPY +#define TRMM_OLNCOPY ZTRMM_OLNUCOPY +#define TRMM_OLTCOPY ZTRMM_OLTUCOPY +#define TRSM_OUNCOPY ZTRSM_OUNUCOPY +#define TRSM_OUTCOPY ZTRSM_OUTUCOPY +#define TRSM_OLNCOPY ZTRSM_OLNUCOPY +#define TRSM_OLTCOPY ZTRSM_OLTUCOPY + +#define TRMM_IUNCOPY ZTRMM_IUNUCOPY +#define TRMM_IUTCOPY ZTRMM_IUTUCOPY +#define TRMM_ILNCOPY ZTRMM_ILNUCOPY +#define TRMM_ILTCOPY ZTRMM_ILTUCOPY +#define TRSM_IUNCOPY ZTRSM_IUNUCOPY +#define TRSM_IUTCOPY ZTRSM_IUTUCOPY +#define TRSM_ILNCOPY ZTRSM_ILNUCOPY +#define TRSM_ILTCOPY ZTRSM_ILTUCOPY + +#else + +#define TRMM_OUNCOPY ZTRMM_OUNNCOPY +#define TRMM_OUTCOPY ZTRMM_OUTNCOPY +#define TRMM_OLNCOPY ZTRMM_OLNNCOPY +#define TRMM_OLTCOPY ZTRMM_OLTNCOPY +#define TRSM_OUNCOPY ZTRSM_OUNNCOPY +#define TRSM_OUTCOPY ZTRSM_OUTNCOPY +#define TRSM_OLNCOPY ZTRSM_OLNNCOPY +#define TRSM_OLTCOPY ZTRSM_OLTNCOPY + +#define TRMM_IUNCOPY ZTRMM_IUNNCOPY +#define TRMM_IUTCOPY ZTRMM_IUTNCOPY +#define TRMM_ILNCOPY ZTRMM_ILNNCOPY +#define TRMM_ILTCOPY ZTRMM_ILTNCOPY +#define TRSM_IUNCOPY ZTRSM_IUNNCOPY +#define TRSM_IUTCOPY ZTRSM_IUTNCOPY +#define TRSM_ILNCOPY ZTRSM_ILNNCOPY +#define TRSM_ILTCOPY ZTRSM_ILTNCOPY + +#endif + +#define SYMM3M_ILCOPYB ZSYMM3M_ILCOPYB +#define SYMM3M_IUCOPYB ZSYMM3M_IUCOPYB +#define SYMM3M_ILCOPYR ZSYMM3M_ILCOPYR +#define SYMM3M_IUCOPYR ZSYMM3M_IUCOPYR +#define SYMM3M_ILCOPYI ZSYMM3M_ILCOPYI +#define SYMM3M_IUCOPYI ZSYMM3M_IUCOPYI + +#define SYMM3M_OLCOPYB ZSYMM3M_OLCOPYB +#define SYMM3M_OUCOPYB ZSYMM3M_OUCOPYB +#define SYMM3M_OLCOPYR ZSYMM3M_OLCOPYR +#define SYMM3M_OUCOPYR ZSYMM3M_OUCOPYR +#define SYMM3M_OLCOPYI ZSYMM3M_OLCOPYI +#define SYMM3M_OUCOPYI ZSYMM3M_OUCOPYI + +#define HEMM3M_ILCOPYB ZHEMM3M_ILCOPYB +#define HEMM3M_IUCOPYB ZHEMM3M_IUCOPYB +#define HEMM3M_ILCOPYR ZHEMM3M_ILCOPYR +#define HEMM3M_IUCOPYR ZHEMM3M_IUCOPYR +#define HEMM3M_ILCOPYI ZHEMM3M_ILCOPYI +#define HEMM3M_IUCOPYI ZHEMM3M_IUCOPYI + +#define HEMM3M_OLCOPYB ZHEMM3M_OLCOPYB +#define HEMM3M_OUCOPYB ZHEMM3M_OUCOPYB +#define HEMM3M_OLCOPYR ZHEMM3M_OLCOPYR +#define HEMM3M_OUCOPYR ZHEMM3M_OUCOPYR +#define HEMM3M_OLCOPYI ZHEMM3M_OLCOPYI +#define HEMM3M_OUCOPYI ZHEMM3M_OUCOPYI + +#define GEMM_BETA ZGEMM_BETA + +#define GEMM_KERNEL_N ZGEMM_KERNEL_N +#define GEMM_KERNEL_L ZGEMM_KERNEL_L +#define GEMM_KERNEL_R ZGEMM_KERNEL_R +#define GEMM_KERNEL_B ZGEMM_KERNEL_B + +#define GEMM3M_KERNEL ZGEMM3M_KERNEL + +#define TRMM_KERNEL_LN ZTRMM_KERNEL_LN +#define TRMM_KERNEL_LT ZTRMM_KERNEL_LT +#define TRMM_KERNEL_LR ZTRMM_KERNEL_LR +#define TRMM_KERNEL_LC ZTRMM_KERNEL_LC +#define TRMM_KERNEL_RN ZTRMM_KERNEL_RN +#define TRMM_KERNEL_RT ZTRMM_KERNEL_RT +#define TRMM_KERNEL_RR ZTRMM_KERNEL_RR +#define TRMM_KERNEL_RC ZTRMM_KERNEL_RC + +#define TRSM_KERNEL_LN ZTRSM_KERNEL_LN +#define TRSM_KERNEL_LT ZTRSM_KERNEL_LT +#define TRSM_KERNEL_LR ZTRSM_KERNEL_LR +#define TRSM_KERNEL_LC ZTRSM_KERNEL_LC +#define TRSM_KERNEL_RN ZTRSM_KERNEL_RN +#define TRSM_KERNEL_RT ZTRSM_KERNEL_RT +#define TRSM_KERNEL_RR ZTRSM_KERNEL_RR +#define TRSM_KERNEL_RC ZTRSM_KERNEL_RC + +#define GEMM_NN ZGEMM_NN +#define GEMM_CN ZGEMM_CN +#define GEMM_TN ZGEMM_TN +#define GEMM_NC ZGEMM_NC +#define GEMM_NT ZGEMM_NT +#define GEMM_CC ZGEMM_CC +#define GEMM_CT ZGEMM_CT +#define GEMM_TC ZGEMM_TC +#define GEMM_TT ZGEMM_TT +#define GEMM_NR ZGEMM_NR +#define GEMM_TR ZGEMM_TR +#define GEMM_CR ZGEMM_CR +#define GEMM_RN ZGEMM_RN +#define GEMM_RT ZGEMM_RT +#define GEMM_RC ZGEMM_RC +#define GEMM_RR ZGEMM_RR + +#define SYMM_LU ZSYMM_LU +#define SYMM_LL ZSYMM_LL +#define SYMM_RU ZSYMM_RU +#define SYMM_RL ZSYMM_RL + +#define HEMM_LU ZHEMM_LU +#define HEMM_LL ZHEMM_LL +#define HEMM_RU ZHEMM_RU +#define HEMM_RL ZHEMM_RL + +#define HEMM_IUTCOPY ZHEMM_IUTCOPY +#define HEMM_ILTCOPY ZHEMM_ILTCOPY +#define HEMM_OUTCOPY ZHEMM_OUTCOPY +#define HEMM_OLTCOPY ZHEMM_OLTCOPY + +#define SYRK_UN ZSYRK_UN +#define SYRK_UT ZSYRK_UT +#define SYRK_LN ZSYRK_LN +#define SYRK_LT ZSYRK_LT +#define SYRK_UR ZSYRK_UN +#define SYRK_UC ZSYRK_UT +#define SYRK_LR ZSYRK_LN +#define SYRK_LC ZSYRK_LT + +#define SYRK_KERNEL_U ZSYRK_KERNEL_U +#define SYRK_KERNEL_L ZSYRK_KERNEL_L + +#define HERK_UN ZHERK_UN +#define HERK_LN ZHERK_LN +#define HERK_UC ZHERK_UC +#define HERK_LC ZHERK_LC + +#define HER2K_UN ZHER2K_UN +#define HER2K_LN ZHER2K_LN +#define HER2K_UC ZHER2K_UC +#define HER2K_LC ZHER2K_LC + +#define SYR2K_UN ZSYR2K_UN +#define SYR2K_UT ZSYR2K_UT +#define SYR2K_LN ZSYR2K_LN +#define SYR2K_LT ZSYR2K_LT +#define SYR2K_UR ZSYR2K_UN +#define SYR2K_UC ZSYR2K_UT +#define SYR2K_LR ZSYR2K_LN +#define SYR2K_LC ZSYR2K_LT + +#define SYR2K_KERNEL_U ZSYR2K_KERNEL_U +#define SYR2K_KERNEL_L ZSYR2K_KERNEL_L + +#define TRMM_LNUU ZTRMM_LNUU +#define TRMM_LNUN ZTRMM_LNUN +#define TRMM_LNLU ZTRMM_LNLU +#define TRMM_LNLN ZTRMM_LNLN +#define TRMM_LTUU ZTRMM_LTUU +#define TRMM_LTUN ZTRMM_LTUN +#define TRMM_LTLU ZTRMM_LTLU +#define TRMM_LTLN ZTRMM_LTLN +#define TRMM_LRUU ZTRMM_LRUU +#define TRMM_LRUN ZTRMM_LRUN +#define TRMM_LRLU ZTRMM_LRLU +#define TRMM_LRLN ZTRMM_LRLN +#define TRMM_LCUU ZTRMM_LCUU +#define TRMM_LCUN ZTRMM_LCUN +#define TRMM_LCLU ZTRMM_LCLU +#define TRMM_LCLN ZTRMM_LCLN +#define TRMM_RNUU ZTRMM_RNUU +#define TRMM_RNUN ZTRMM_RNUN +#define TRMM_RNLU ZTRMM_RNLU +#define TRMM_RNLN ZTRMM_RNLN +#define TRMM_RTUU ZTRMM_RTUU +#define TRMM_RTUN ZTRMM_RTUN +#define TRMM_RTLU ZTRMM_RTLU +#define TRMM_RTLN ZTRMM_RTLN +#define TRMM_RRUU ZTRMM_RRUU +#define TRMM_RRUN ZTRMM_RRUN +#define TRMM_RRLU ZTRMM_RRLU +#define TRMM_RRLN ZTRMM_RRLN +#define TRMM_RCUU ZTRMM_RCUU +#define TRMM_RCUN ZTRMM_RCUN +#define TRMM_RCLU ZTRMM_RCLU +#define TRMM_RCLN ZTRMM_RCLN + +#define TRSM_LNUU ZTRSM_LNUU +#define TRSM_LNUN ZTRSM_LNUN +#define TRSM_LNLU ZTRSM_LNLU +#define TRSM_LNLN ZTRSM_LNLN +#define TRSM_LTUU ZTRSM_LTUU +#define TRSM_LTUN ZTRSM_LTUN +#define TRSM_LTLU ZTRSM_LTLU +#define TRSM_LTLN ZTRSM_LTLN +#define TRSM_LRUU ZTRSM_LRUU +#define TRSM_LRUN ZTRSM_LRUN +#define TRSM_LRLU ZTRSM_LRLU +#define TRSM_LRLN ZTRSM_LRLN +#define TRSM_LCUU ZTRSM_LCUU +#define TRSM_LCUN ZTRSM_LCUN +#define TRSM_LCLU ZTRSM_LCLU +#define TRSM_LCLN ZTRSM_LCLN +#define TRSM_RNUU ZTRSM_RNUU +#define TRSM_RNUN ZTRSM_RNUN +#define TRSM_RNLU ZTRSM_RNLU +#define TRSM_RNLN ZTRSM_RNLN +#define TRSM_RTUU ZTRSM_RTUU +#define TRSM_RTUN ZTRSM_RTUN +#define TRSM_RTLU ZTRSM_RTLU +#define TRSM_RTLN ZTRSM_RTLN +#define TRSM_RRUU ZTRSM_RRUU +#define TRSM_RRUN ZTRSM_RRUN +#define TRSM_RRLU ZTRSM_RRLU +#define TRSM_RRLN ZTRSM_RRLN +#define TRSM_RCUU ZTRSM_RCUU +#define TRSM_RCUN ZTRSM_RCUN +#define TRSM_RCLU ZTRSM_RCLU +#define TRSM_RCLN ZTRSM_RCLN + + +#define GEMM_THREAD_NN ZGEMM_THREAD_NN +#define GEMM_THREAD_CN ZGEMM_THREAD_CN +#define GEMM_THREAD_TN ZGEMM_THREAD_TN +#define GEMM_THREAD_NC ZGEMM_THREAD_NC +#define GEMM_THREAD_NT ZGEMM_THREAD_NT +#define GEMM_THREAD_CC ZGEMM_THREAD_CC +#define GEMM_THREAD_CT ZGEMM_THREAD_CT +#define GEMM_THREAD_TC ZGEMM_THREAD_TC +#define GEMM_THREAD_TT ZGEMM_THREAD_TT +#define GEMM_THREAD_NR ZGEMM_THREAD_NR +#define GEMM_THREAD_TR ZGEMM_THREAD_TR +#define GEMM_THREAD_CR ZGEMM_THREAD_CR +#define GEMM_THREAD_RN ZGEMM_THREAD_RN +#define GEMM_THREAD_RT ZGEMM_THREAD_RT +#define GEMM_THREAD_RC ZGEMM_THREAD_RC +#define GEMM_THREAD_RR ZGEMM_THREAD_RR + +#define SYMM_THREAD_LU ZSYMM_THREAD_LU +#define SYMM_THREAD_LL ZSYMM_THREAD_LL +#define SYMM_THREAD_RU ZSYMM_THREAD_RU +#define SYMM_THREAD_RL ZSYMM_THREAD_RL + +#define HEMM_THREAD_LU ZHEMM_THREAD_LU +#define HEMM_THREAD_LL ZHEMM_THREAD_LL +#define HEMM_THREAD_RU ZHEMM_THREAD_RU +#define HEMM_THREAD_RL ZHEMM_THREAD_RL + +#define SYRK_THREAD_UN ZSYRK_THREAD_UN +#define SYRK_THREAD_UT ZSYRK_THREAD_UT +#define SYRK_THREAD_LN ZSYRK_THREAD_LN +#define SYRK_THREAD_LT ZSYRK_THREAD_LT +#define SYRK_THREAD_UR ZSYRK_THREAD_UR +#define SYRK_THREAD_UC ZSYRK_THREAD_UC +#define SYRK_THREAD_LR ZSYRK_THREAD_LR +#define SYRK_THREAD_LC ZSYRK_THREAD_LC + +#define HERK_THREAD_UN ZHERK_THREAD_UN +#define HERK_THREAD_UT ZHERK_THREAD_UT +#define HERK_THREAD_LN ZHERK_THREAD_LN +#define HERK_THREAD_LT ZHERK_THREAD_LT +#define HERK_THREAD_UR ZHERK_THREAD_UR +#define HERK_THREAD_UC ZHERK_THREAD_UC +#define HERK_THREAD_LR ZHERK_THREAD_LR +#define HERK_THREAD_LC ZHERK_THREAD_LC + +#define GEMM3M_NN ZGEMM3M_NN +#define GEMM3M_CN ZGEMM3M_CN +#define GEMM3M_TN ZGEMM3M_TN +#define GEMM3M_NC ZGEMM3M_NC +#define GEMM3M_NT ZGEMM3M_NT +#define GEMM3M_CC ZGEMM3M_CC +#define GEMM3M_CT ZGEMM3M_CT +#define GEMM3M_TC ZGEMM3M_TC +#define GEMM3M_TT ZGEMM3M_TT +#define GEMM3M_NR ZGEMM3M_NR +#define GEMM3M_TR ZGEMM3M_TR +#define GEMM3M_CR ZGEMM3M_CR +#define GEMM3M_RN ZGEMM3M_RN +#define GEMM3M_RT ZGEMM3M_RT +#define GEMM3M_RC ZGEMM3M_RC +#define GEMM3M_RR ZGEMM3M_RR + +#define GEMM3M_THREAD_NN ZGEMM3M_THREAD_NN +#define GEMM3M_THREAD_CN ZGEMM3M_THREAD_CN +#define GEMM3M_THREAD_TN ZGEMM3M_THREAD_TN +#define GEMM3M_THREAD_NC ZGEMM3M_THREAD_NC +#define GEMM3M_THREAD_NT ZGEMM3M_THREAD_NT +#define GEMM3M_THREAD_CC ZGEMM3M_THREAD_CC +#define GEMM3M_THREAD_CT ZGEMM3M_THREAD_CT +#define GEMM3M_THREAD_TC ZGEMM3M_THREAD_TC +#define GEMM3M_THREAD_TT ZGEMM3M_THREAD_TT +#define GEMM3M_THREAD_NR ZGEMM3M_THREAD_NR +#define GEMM3M_THREAD_TR ZGEMM3M_THREAD_TR +#define GEMM3M_THREAD_CR ZGEMM3M_THREAD_CR +#define GEMM3M_THREAD_RN ZGEMM3M_THREAD_RN +#define GEMM3M_THREAD_RT ZGEMM3M_THREAD_RT +#define GEMM3M_THREAD_RC ZGEMM3M_THREAD_RC +#define GEMM3M_THREAD_RR ZGEMM3M_THREAD_RR + +#define SYMM3M_LU ZSYMM3M_LU +#define SYMM3M_LL ZSYMM3M_LL +#define SYMM3M_RU ZSYMM3M_RU +#define SYMM3M_RL ZSYMM3M_RL + +#define SYMM3M_THREAD_LU ZSYMM3M_THREAD_LU +#define SYMM3M_THREAD_LL ZSYMM3M_THREAD_LL +#define SYMM3M_THREAD_RU ZSYMM3M_THREAD_RU +#define SYMM3M_THREAD_RL ZSYMM3M_THREAD_RL + +#define HEMM3M_LU ZHEMM3M_LU +#define HEMM3M_LL ZHEMM3M_LL +#define HEMM3M_RU ZHEMM3M_RU +#define HEMM3M_RL ZHEMM3M_RL + +#define HEMM3M_THREAD_LU ZHEMM3M_THREAD_LU +#define HEMM3M_THREAD_LL ZHEMM3M_THREAD_LL +#define HEMM3M_THREAD_RU ZHEMM3M_THREAD_RU +#define HEMM3M_THREAD_RL ZHEMM3M_THREAD_RL + +#define SYMM_IUTCOPY ZSYMM_IUTCOPY +#define SYMM_ILTCOPY ZSYMM_ILTCOPY +#define SYMM_OUTCOPY ZSYMM_OUTCOPY +#define SYMM_OLTCOPY ZSYMM_OLTCOPY + +#else + +#define AMAX_K CAMAX_K +#define AMIN_K CAMIN_K +#define MAX_K CMAX_K +#define MIN_K CMIN_K +#define IAMAX_K ICAMAX_K +#define IAMIN_K ICAMIN_K +#define IMAX_K ICMAX_K +#define IMIN_K ICMIN_K +#define ASUM_K CASUM_K +#define AXPYU_K CAXPYU_K +#define AXPYC_K CAXPYC_K +#define COPY_K CCOPY_K +#define DOTU_K CDOTU_K +#define DOTC_K CDOTC_K +#define NRM2_K CNRM2_K +#define SCAL_K CSCAL_K +#define SWAP_K CSWAP_K +#define ROT_K CROT_K + +#define GEMV_N CGEMV_N +#define GEMV_T CGEMV_T +#define GEMV_R CGEMV_R +#define GEMV_C CGEMV_C +#define GEMV_O CGEMV_O +#define GEMV_U CGEMV_U +#define GEMV_S CGEMV_S +#define GEMV_D CGEMV_D + +#define GERU_K CGERU_K +#define GERC_K CGERC_K +#define GERV_K CGERV_K +#define GERD_K CGERD_K + +#define SYMV_U CSYMV_U +#define SYMV_L CSYMV_L +#define HEMV_U CHEMV_U +#define HEMV_L CHEMV_L +#define HEMV_V CHEMV_V +#define HEMV_M CHEMV_M + +#define SYMV_THREAD_U CSYMV_THREAD_U +#define SYMV_THREAD_L CSYMV_THREAD_L +#define HEMV_THREAD_U CHEMV_THREAD_U +#define HEMV_THREAD_L CHEMV_THREAD_L +#define HEMV_THREAD_V CHEMV_THREAD_V +#define HEMV_THREAD_M CHEMV_THREAD_M + +#define GEMM_ONCOPY CGEMM_ONCOPY +#define GEMM_OTCOPY CGEMM_OTCOPY +#define GEMM_INCOPY CGEMM_INCOPY +#define GEMM_ITCOPY CGEMM_ITCOPY + +#define GEMM3M_ONCOPYB CGEMM3M_ONCOPYB +#define GEMM3M_ONCOPYR CGEMM3M_ONCOPYR +#define GEMM3M_ONCOPYI CGEMM3M_ONCOPYI +#define GEMM3M_OTCOPYB CGEMM3M_OTCOPYB +#define GEMM3M_OTCOPYR CGEMM3M_OTCOPYR +#define GEMM3M_OTCOPYI CGEMM3M_OTCOPYI +#define GEMM3M_INCOPYB CGEMM3M_INCOPYB +#define GEMM3M_INCOPYR CGEMM3M_INCOPYR +#define GEMM3M_INCOPYI CGEMM3M_INCOPYI +#define GEMM3M_ITCOPYB CGEMM3M_ITCOPYB +#define GEMM3M_ITCOPYR CGEMM3M_ITCOPYR +#define GEMM3M_ITCOPYI CGEMM3M_ITCOPYI + +#ifdef UNIT + +#define TRMM_OUNCOPY CTRMM_OUNUCOPY +#define TRMM_OUTCOPY CTRMM_OUTUCOPY +#define TRMM_OLNCOPY CTRMM_OLNUCOPY +#define TRMM_OLTCOPY CTRMM_OLTUCOPY +#define TRSM_OUNCOPY CTRSM_OUNUCOPY +#define TRSM_OUTCOPY CTRSM_OUTUCOPY +#define TRSM_OLNCOPY CTRSM_OLNUCOPY +#define TRSM_OLTCOPY CTRSM_OLTUCOPY + +#define TRMM_IUNCOPY CTRMM_IUNUCOPY +#define TRMM_IUTCOPY CTRMM_IUTUCOPY +#define TRMM_ILNCOPY CTRMM_ILNUCOPY +#define TRMM_ILTCOPY CTRMM_ILTUCOPY +#define TRSM_IUNCOPY CTRSM_IUNUCOPY +#define TRSM_IUTCOPY CTRSM_IUTUCOPY +#define TRSM_ILNCOPY CTRSM_ILNUCOPY +#define TRSM_ILTCOPY CTRSM_ILTUCOPY + +#else + +#define TRMM_OUNCOPY CTRMM_OUNNCOPY +#define TRMM_OUTCOPY CTRMM_OUTNCOPY +#define TRMM_OLNCOPY CTRMM_OLNNCOPY +#define TRMM_OLTCOPY CTRMM_OLTNCOPY +#define TRSM_OUNCOPY CTRSM_OUNNCOPY +#define TRSM_OUTCOPY CTRSM_OUTNCOPY +#define TRSM_OLNCOPY CTRSM_OLNNCOPY +#define TRSM_OLTCOPY CTRSM_OLTNCOPY + +#define TRMM_IUNCOPY CTRMM_IUNNCOPY +#define TRMM_IUTCOPY CTRMM_IUTNCOPY +#define TRMM_ILNCOPY CTRMM_ILNNCOPY +#define TRMM_ILTCOPY CTRMM_ILTNCOPY +#define TRSM_IUNCOPY CTRSM_IUNNCOPY +#define TRSM_IUTCOPY CTRSM_IUTNCOPY +#define TRSM_ILNCOPY CTRSM_ILNNCOPY +#define TRSM_ILTCOPY CTRSM_ILTNCOPY + +#endif + +#define SYMM3M_ILCOPYB CSYMM3M_ILCOPYB +#define SYMM3M_IUCOPYB CSYMM3M_IUCOPYB +#define SYMM3M_ILCOPYR CSYMM3M_ILCOPYR +#define SYMM3M_IUCOPYR CSYMM3M_IUCOPYR +#define SYMM3M_ILCOPYI CSYMM3M_ILCOPYI +#define SYMM3M_IUCOPYI CSYMM3M_IUCOPYI + +#define SYMM3M_OLCOPYB CSYMM3M_OLCOPYB +#define SYMM3M_OUCOPYB CSYMM3M_OUCOPYB +#define SYMM3M_OLCOPYR CSYMM3M_OLCOPYR +#define SYMM3M_OUCOPYR CSYMM3M_OUCOPYR +#define SYMM3M_OLCOPYI CSYMM3M_OLCOPYI +#define SYMM3M_OUCOPYI CSYMM3M_OUCOPYI + +#define HEMM3M_ILCOPYB CHEMM3M_ILCOPYB +#define HEMM3M_IUCOPYB CHEMM3M_IUCOPYB +#define HEMM3M_ILCOPYR CHEMM3M_ILCOPYR +#define HEMM3M_IUCOPYR CHEMM3M_IUCOPYR +#define HEMM3M_ILCOPYI CHEMM3M_ILCOPYI +#define HEMM3M_IUCOPYI CHEMM3M_IUCOPYI + +#define HEMM3M_OLCOPYB CHEMM3M_OLCOPYB +#define HEMM3M_OUCOPYB CHEMM3M_OUCOPYB +#define HEMM3M_OLCOPYR CHEMM3M_OLCOPYR +#define HEMM3M_OUCOPYR CHEMM3M_OUCOPYR +#define HEMM3M_OLCOPYI CHEMM3M_OLCOPYI +#define HEMM3M_OUCOPYI CHEMM3M_OUCOPYI + +#define GEMM_BETA CGEMM_BETA + +#define GEMM_KERNEL_N CGEMM_KERNEL_N +#define GEMM_KERNEL_L CGEMM_KERNEL_L +#define GEMM_KERNEL_R CGEMM_KERNEL_R +#define GEMM_KERNEL_B CGEMM_KERNEL_B + +#define GEMM3M_KERNEL CGEMM3M_KERNEL + +#define TRMM_KERNEL_LN CTRMM_KERNEL_LN +#define TRMM_KERNEL_LT CTRMM_KERNEL_LT +#define TRMM_KERNEL_LR CTRMM_KERNEL_LR +#define TRMM_KERNEL_LC CTRMM_KERNEL_LC +#define TRMM_KERNEL_RN CTRMM_KERNEL_RN +#define TRMM_KERNEL_RT CTRMM_KERNEL_RT +#define TRMM_KERNEL_RR CTRMM_KERNEL_RR +#define TRMM_KERNEL_RC CTRMM_KERNEL_RC + +#define TRSM_KERNEL_LN CTRSM_KERNEL_LN +#define TRSM_KERNEL_LT CTRSM_KERNEL_LT +#define TRSM_KERNEL_LR CTRSM_KERNEL_LR +#define TRSM_KERNEL_LC CTRSM_KERNEL_LC +#define TRSM_KERNEL_RN CTRSM_KERNEL_RN +#define TRSM_KERNEL_RT CTRSM_KERNEL_RT +#define TRSM_KERNEL_RR CTRSM_KERNEL_RR +#define TRSM_KERNEL_RC CTRSM_KERNEL_RC + +#define GEMM_NN CGEMM_NN +#define GEMM_CN CGEMM_CN +#define GEMM_TN CGEMM_TN +#define GEMM_NC CGEMM_NC +#define GEMM_NT CGEMM_NT +#define GEMM_CC CGEMM_CC +#define GEMM_CT CGEMM_CT +#define GEMM_TC CGEMM_TC +#define GEMM_TT CGEMM_TT +#define GEMM_NR CGEMM_NR +#define GEMM_TR CGEMM_TR +#define GEMM_CR CGEMM_CR +#define GEMM_RN CGEMM_RN +#define GEMM_RT CGEMM_RT +#define GEMM_RC CGEMM_RC +#define GEMM_RR CGEMM_RR + +#define SYMM_LU CSYMM_LU +#define SYMM_LL CSYMM_LL +#define SYMM_RU CSYMM_RU +#define SYMM_RL CSYMM_RL + +#define HEMM_LU CHEMM_LU +#define HEMM_LL CHEMM_LL +#define HEMM_RU CHEMM_RU +#define HEMM_RL CHEMM_RL + +#define HEMM_IUTCOPY CHEMM_IUTCOPY +#define HEMM_ILTCOPY CHEMM_ILTCOPY +#define HEMM_OUTCOPY CHEMM_OUTCOPY +#define HEMM_OLTCOPY CHEMM_OLTCOPY + +#define SYRK_UN CSYRK_UN +#define SYRK_UT CSYRK_UT +#define SYRK_LN CSYRK_LN +#define SYRK_LT CSYRK_LT +#define SYRK_UR CSYRK_UN +#define SYRK_UC CSYRK_UT +#define SYRK_LR CSYRK_LN +#define SYRK_LC CSYRK_LT + +#define SYRK_KERNEL_U CSYRK_KERNEL_U +#define SYRK_KERNEL_L CSYRK_KERNEL_L + +#define HERK_UN CHERK_UN +#define HERK_LN CHERK_LN +#define HERK_UC CHERK_UC +#define HERK_LC CHERK_LC + +#define HER2K_UN CHER2K_UN +#define HER2K_LN CHER2K_LN +#define HER2K_UC CHER2K_UC +#define HER2K_LC CHER2K_LC + +#define SYR2K_UN CSYR2K_UN +#define SYR2K_UT CSYR2K_UT +#define SYR2K_LN CSYR2K_LN +#define SYR2K_LT CSYR2K_LT +#define SYR2K_UR CSYR2K_UN +#define SYR2K_UC CSYR2K_UT +#define SYR2K_LR CSYR2K_LN +#define SYR2K_LC CSYR2K_LT + +#define SYR2K_KERNEL_U CSYR2K_KERNEL_U +#define SYR2K_KERNEL_L CSYR2K_KERNEL_L + +#define TRMM_LNUU CTRMM_LNUU +#define TRMM_LNUN CTRMM_LNUN +#define TRMM_LNLU CTRMM_LNLU +#define TRMM_LNLN CTRMM_LNLN +#define TRMM_LTUU CTRMM_LTUU +#define TRMM_LTUN CTRMM_LTUN +#define TRMM_LTLU CTRMM_LTLU +#define TRMM_LTLN CTRMM_LTLN +#define TRMM_LRUU CTRMM_LRUU +#define TRMM_LRUN CTRMM_LRUN +#define TRMM_LRLU CTRMM_LRLU +#define TRMM_LRLN CTRMM_LRLN +#define TRMM_LCUU CTRMM_LCUU +#define TRMM_LCUN CTRMM_LCUN +#define TRMM_LCLU CTRMM_LCLU +#define TRMM_LCLN CTRMM_LCLN +#define TRMM_RNUU CTRMM_RNUU +#define TRMM_RNUN CTRMM_RNUN +#define TRMM_RNLU CTRMM_RNLU +#define TRMM_RNLN CTRMM_RNLN +#define TRMM_RTUU CTRMM_RTUU +#define TRMM_RTUN CTRMM_RTUN +#define TRMM_RTLU CTRMM_RTLU +#define TRMM_RTLN CTRMM_RTLN +#define TRMM_RRUU CTRMM_RRUU +#define TRMM_RRUN CTRMM_RRUN +#define TRMM_RRLU CTRMM_RRLU +#define TRMM_RRLN CTRMM_RRLN +#define TRMM_RCUU CTRMM_RCUU +#define TRMM_RCUN CTRMM_RCUN +#define TRMM_RCLU CTRMM_RCLU +#define TRMM_RCLN CTRMM_RCLN + +#define TRSM_LNUU CTRSM_LNUU +#define TRSM_LNUN CTRSM_LNUN +#define TRSM_LNLU CTRSM_LNLU +#define TRSM_LNLN CTRSM_LNLN +#define TRSM_LTUU CTRSM_LTUU +#define TRSM_LTUN CTRSM_LTUN +#define TRSM_LTLU CTRSM_LTLU +#define TRSM_LTLN CTRSM_LTLN +#define TRSM_LRUU CTRSM_LRUU +#define TRSM_LRUN CTRSM_LRUN +#define TRSM_LRLU CTRSM_LRLU +#define TRSM_LRLN CTRSM_LRLN +#define TRSM_LCUU CTRSM_LCUU +#define TRSM_LCUN CTRSM_LCUN +#define TRSM_LCLU CTRSM_LCLU +#define TRSM_LCLN CTRSM_LCLN +#define TRSM_RNUU CTRSM_RNUU +#define TRSM_RNUN CTRSM_RNUN +#define TRSM_RNLU CTRSM_RNLU +#define TRSM_RNLN CTRSM_RNLN +#define TRSM_RTUU CTRSM_RTUU +#define TRSM_RTUN CTRSM_RTUN +#define TRSM_RTLU CTRSM_RTLU +#define TRSM_RTLN CTRSM_RTLN +#define TRSM_RRUU CTRSM_RRUU +#define TRSM_RRUN CTRSM_RRUN +#define TRSM_RRLU CTRSM_RRLU +#define TRSM_RRLN CTRSM_RRLN +#define TRSM_RCUU CTRSM_RCUU +#define TRSM_RCUN CTRSM_RCUN +#define TRSM_RCLU CTRSM_RCLU +#define TRSM_RCLN CTRSM_RCLN + + +#define GEMM_THREAD_NN CGEMM_THREAD_NN +#define GEMM_THREAD_CN CGEMM_THREAD_CN +#define GEMM_THREAD_TN CGEMM_THREAD_TN +#define GEMM_THREAD_NC CGEMM_THREAD_NC +#define GEMM_THREAD_NT CGEMM_THREAD_NT +#define GEMM_THREAD_CC CGEMM_THREAD_CC +#define GEMM_THREAD_CT CGEMM_THREAD_CT +#define GEMM_THREAD_TC CGEMM_THREAD_TC +#define GEMM_THREAD_TT CGEMM_THREAD_TT +#define GEMM_THREAD_NR CGEMM_THREAD_NR +#define GEMM_THREAD_TR CGEMM_THREAD_TR +#define GEMM_THREAD_CR CGEMM_THREAD_CR +#define GEMM_THREAD_RN CGEMM_THREAD_RN +#define GEMM_THREAD_RT CGEMM_THREAD_RT +#define GEMM_THREAD_RC CGEMM_THREAD_RC +#define GEMM_THREAD_RR CGEMM_THREAD_RR + +#define SYMM_THREAD_LU CSYMM_THREAD_LU +#define SYMM_THREAD_LL CSYMM_THREAD_LL +#define SYMM_THREAD_RU CSYMM_THREAD_RU +#define SYMM_THREAD_RL CSYMM_THREAD_RL + +#define HEMM_THREAD_LU CHEMM_THREAD_LU +#define HEMM_THREAD_LL CHEMM_THREAD_LL +#define HEMM_THREAD_RU CHEMM_THREAD_RU +#define HEMM_THREAD_RL CHEMM_THREAD_RL + +#define SYRK_THREAD_UN CSYRK_THREAD_UN +#define SYRK_THREAD_UT CSYRK_THREAD_UT +#define SYRK_THREAD_LN CSYRK_THREAD_LN +#define SYRK_THREAD_LT CSYRK_THREAD_LT +#define SYRK_THREAD_UR CSYRK_THREAD_UR +#define SYRK_THREAD_UC CSYRK_THREAD_UC +#define SYRK_THREAD_LR CSYRK_THREAD_LR +#define SYRK_THREAD_LC CSYRK_THREAD_LC + +#define HERK_THREAD_UN CHERK_THREAD_UN +#define HERK_THREAD_UT CHERK_THREAD_UT +#define HERK_THREAD_LN CHERK_THREAD_LN +#define HERK_THREAD_LT CHERK_THREAD_LT +#define HERK_THREAD_UR CHERK_THREAD_UR +#define HERK_THREAD_UC CHERK_THREAD_UC +#define HERK_THREAD_LR CHERK_THREAD_LR +#define HERK_THREAD_LC CHERK_THREAD_LC + +#define GEMM3M_NN CGEMM3M_NN +#define GEMM3M_CN CGEMM3M_CN +#define GEMM3M_TN CGEMM3M_TN +#define GEMM3M_NC CGEMM3M_NC +#define GEMM3M_NT CGEMM3M_NT +#define GEMM3M_CC CGEMM3M_CC +#define GEMM3M_CT CGEMM3M_CT +#define GEMM3M_TC CGEMM3M_TC +#define GEMM3M_TT CGEMM3M_TT +#define GEMM3M_NR CGEMM3M_NR +#define GEMM3M_TR CGEMM3M_TR +#define GEMM3M_CR CGEMM3M_CR +#define GEMM3M_RN CGEMM3M_RN +#define GEMM3M_RT CGEMM3M_RT +#define GEMM3M_RC CGEMM3M_RC +#define GEMM3M_RR CGEMM3M_RR + +#define GEMM3M_THREAD_NN CGEMM3M_THREAD_NN +#define GEMM3M_THREAD_CN CGEMM3M_THREAD_CN +#define GEMM3M_THREAD_TN CGEMM3M_THREAD_TN +#define GEMM3M_THREAD_NC CGEMM3M_THREAD_NC +#define GEMM3M_THREAD_NT CGEMM3M_THREAD_NT +#define GEMM3M_THREAD_CC CGEMM3M_THREAD_CC +#define GEMM3M_THREAD_CT CGEMM3M_THREAD_CT +#define GEMM3M_THREAD_TC CGEMM3M_THREAD_TC +#define GEMM3M_THREAD_TT CGEMM3M_THREAD_TT +#define GEMM3M_THREAD_NR CGEMM3M_THREAD_NR +#define GEMM3M_THREAD_TR CGEMM3M_THREAD_TR +#define GEMM3M_THREAD_CR CGEMM3M_THREAD_CR +#define GEMM3M_THREAD_RN CGEMM3M_THREAD_RN +#define GEMM3M_THREAD_RT CGEMM3M_THREAD_RT +#define GEMM3M_THREAD_RC CGEMM3M_THREAD_RC +#define GEMM3M_THREAD_RR CGEMM3M_THREAD_RR + +#define SYMM3M_LU CSYMM3M_LU +#define SYMM3M_LL CSYMM3M_LL +#define SYMM3M_RU CSYMM3M_RU +#define SYMM3M_RL CSYMM3M_RL + +#define SYMM3M_THREAD_LU CSYMM3M_THREAD_LU +#define SYMM3M_THREAD_LL CSYMM3M_THREAD_LL +#define SYMM3M_THREAD_RU CSYMM3M_THREAD_RU +#define SYMM3M_THREAD_RL CSYMM3M_THREAD_RL + +#define HEMM3M_LU CHEMM3M_LU +#define HEMM3M_LL CHEMM3M_LL +#define HEMM3M_RU CHEMM3M_RU +#define HEMM3M_RL CHEMM3M_RL + +#define HEMM3M_THREAD_LU CHEMM3M_THREAD_LU +#define HEMM3M_THREAD_LL CHEMM3M_THREAD_LL +#define HEMM3M_THREAD_RU CHEMM3M_THREAD_RU +#define HEMM3M_THREAD_RL CHEMM3M_THREAD_RL + +#define SYMM_IUTCOPY CSYMM_IUTCOPY +#define SYMM_ILTCOPY CSYMM_ILTCOPY +#define SYMM_OUTCOPY CSYMM_OUTCOPY +#define SYMM_OLTCOPY CSYMM_OLTCOPY + +#endif +#endif + +#ifndef ASSEMBLER +#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) +extern BLASLONG sgemm_p; +extern BLASLONG sgemm_q; +extern BLASLONG sgemm_r; +extern BLASLONG dgemm_p; +extern BLASLONG dgemm_q; +extern BLASLONG dgemm_r; +extern BLASLONG qgemm_p; +extern BLASLONG qgemm_q; +extern BLASLONG qgemm_r; +extern BLASLONG cgemm_p; +extern BLASLONG cgemm_q; +extern BLASLONG cgemm_r; +extern BLASLONG zgemm_p; +extern BLASLONG zgemm_q; +extern BLASLONG zgemm_r; +extern BLASLONG xgemm_p; +extern BLASLONG xgemm_q; +extern BLASLONG xgemm_r; +#endif + +typedef struct { + void *a, *b, *c, *d, *alpha, *beta; + BLASLONG m, n, k, lda, ldb, ldc, ldd; + +#ifdef SMP + void *common; + BLASLONG nthreads; +#endif + +#ifdef PARAMTEST + BLASLONG gemm_p, gemm_q, gemm_r; +#endif + +#ifdef PREFETCHTEST + BLASLONG prea, preb, prec, pred; +#endif + +} blas_arg_t; +#endif + +#ifdef XDOUBLE + +#define TRSV_NUU qtrsv_NUU +#define TRSV_NUN qtrsv_NUN +#define TRSV_NLU qtrsv_NLU +#define TRSV_NLN qtrsv_NLN +#define TRSV_TUU qtrsv_TUU +#define TRSV_TUN qtrsv_TUN +#define TRSV_TLU qtrsv_TLU +#define TRSV_TLN qtrsv_TLN + +#define ZTRSV_NUU xtrsv_NUU +#define ZTRSV_NUN xtrsv_NUN +#define ZTRSV_NLU xtrsv_NLU +#define ZTRSV_NLN xtrsv_NLN +#define ZTRSV_TUU xtrsv_TUU +#define ZTRSV_TUN xtrsv_TUN +#define ZTRSV_TLU xtrsv_TLU +#define ZTRSV_TLN xtrsv_TLN +#define ZTRSV_RUU xtrsv_RUU +#define ZTRSV_RUN xtrsv_RUN +#define ZTRSV_RLU xtrsv_RLU +#define ZTRSV_RLN xtrsv_RLN +#define ZTRSV_CUU xtrsv_CUU +#define ZTRSV_CUN xtrsv_CUN +#define ZTRSV_CLU xtrsv_CLU +#define ZTRSV_CLN xtrsv_CLN + +#define TRMV_NUU qtrmv_NUU +#define TRMV_NUN qtrmv_NUN +#define TRMV_NLU qtrmv_NLU +#define TRMV_NLN qtrmv_NLN +#define TRMV_TUU qtrmv_TUU +#define TRMV_TUN qtrmv_TUN +#define TRMV_TLU qtrmv_TLU +#define TRMV_TLN qtrmv_TLN + +#define TRMV_THREAD_NUU qtrmv_thread_NUU +#define TRMV_THREAD_NUN qtrmv_thread_NUN +#define TRMV_THREAD_NLU qtrmv_thread_NLU +#define TRMV_THREAD_NLN qtrmv_thread_NLN +#define TRMV_THREAD_TUU qtrmv_thread_TUU +#define TRMV_THREAD_TUN qtrmv_thread_TUN +#define TRMV_THREAD_TLU qtrmv_thread_TLU +#define TRMV_THREAD_TLN qtrmv_thread_TLN + +#define ZTRMV_NUU xtrmv_NUU +#define ZTRMV_NUN xtrmv_NUN +#define ZTRMV_NLU xtrmv_NLU +#define ZTRMV_NLN xtrmv_NLN +#define ZTRMV_TUU xtrmv_TUU +#define ZTRMV_TUN xtrmv_TUN +#define ZTRMV_TLU xtrmv_TLU +#define ZTRMV_TLN xtrmv_TLN +#define ZTRMV_RUU xtrmv_RUU +#define ZTRMV_RUN xtrmv_RUN +#define ZTRMV_RLU xtrmv_RLU +#define ZTRMV_RLN xtrmv_RLN +#define ZTRMV_CUU xtrmv_CUU +#define ZTRMV_CUN xtrmv_CUN +#define ZTRMV_CLU xtrmv_CLU +#define ZTRMV_CLN xtrmv_CLN + +#define ZTRMV_THREAD_NUU xtrmv_thread_NUU +#define ZTRMV_THREAD_NUN xtrmv_thread_NUN +#define ZTRMV_THREAD_NLU xtrmv_thread_NLU +#define ZTRMV_THREAD_NLN xtrmv_thread_NLN +#define ZTRMV_THREAD_TUU xtrmv_thread_TUU +#define ZTRMV_THREAD_TUN xtrmv_thread_TUN +#define ZTRMV_THREAD_TLU xtrmv_thread_TLU +#define ZTRMV_THREAD_TLN xtrmv_thread_TLN +#define ZTRMV_THREAD_RUU xtrmv_thread_RUU +#define ZTRMV_THREAD_RUN xtrmv_thread_RUN +#define ZTRMV_THREAD_RLU xtrmv_thread_RLU +#define ZTRMV_THREAD_RLN xtrmv_thread_RLN +#define ZTRMV_THREAD_CUU xtrmv_thread_CUU +#define ZTRMV_THREAD_CUN xtrmv_thread_CUN +#define ZTRMV_THREAD_CLU xtrmv_thread_CLU +#define ZTRMV_THREAD_CLN xtrmv_thread_CLN + +#elif defined(DOUBLE) + +#define TRSV_NUU dtrsv_NUU +#define TRSV_NUN dtrsv_NUN +#define TRSV_NLU dtrsv_NLU +#define TRSV_NLN dtrsv_NLN +#define TRSV_TUU dtrsv_TUU +#define TRSV_TUN dtrsv_TUN +#define TRSV_TLU dtrsv_TLU +#define TRSV_TLN dtrsv_TLN + +#define ZTRSV_NUU ztrsv_NUU +#define ZTRSV_NUN ztrsv_NUN +#define ZTRSV_NLU ztrsv_NLU +#define ZTRSV_NLN ztrsv_NLN +#define ZTRSV_TUU ztrsv_TUU +#define ZTRSV_TUN ztrsv_TUN +#define ZTRSV_TLU ztrsv_TLU +#define ZTRSV_TLN ztrsv_TLN +#define ZTRSV_RUU ztrsv_RUU +#define ZTRSV_RUN ztrsv_RUN +#define ZTRSV_RLU ztrsv_RLU +#define ZTRSV_RLN ztrsv_RLN +#define ZTRSV_CUU ztrsv_CUU +#define ZTRSV_CUN ztrsv_CUN +#define ZTRSV_CLU ztrsv_CLU +#define ZTRSV_CLN ztrsv_CLN + +#define TRMV_NUU dtrmv_NUU +#define TRMV_NUN dtrmv_NUN +#define TRMV_NLU dtrmv_NLU +#define TRMV_NLN dtrmv_NLN +#define TRMV_TUU dtrmv_TUU +#define TRMV_TUN dtrmv_TUN +#define TRMV_TLU dtrmv_TLU +#define TRMV_TLN dtrmv_TLN + +#define TRMV_THREAD_NUU dtrmv_thread_NUU +#define TRMV_THREAD_NUN dtrmv_thread_NUN +#define TRMV_THREAD_NLU dtrmv_thread_NLU +#define TRMV_THREAD_NLN dtrmv_thread_NLN +#define TRMV_THREAD_TUU dtrmv_thread_TUU +#define TRMV_THREAD_TUN dtrmv_thread_TUN +#define TRMV_THREAD_TLU dtrmv_thread_TLU +#define TRMV_THREAD_TLN dtrmv_thread_TLN + +#define ZTRMV_NUU ztrmv_NUU +#define ZTRMV_NUN ztrmv_NUN +#define ZTRMV_NLU ztrmv_NLU +#define ZTRMV_NLN ztrmv_NLN +#define ZTRMV_TUU ztrmv_TUU +#define ZTRMV_TUN ztrmv_TUN +#define ZTRMV_TLU ztrmv_TLU +#define ZTRMV_TLN ztrmv_TLN +#define ZTRMV_RUU ztrmv_RUU +#define ZTRMV_RUN ztrmv_RUN +#define ZTRMV_RLU ztrmv_RLU +#define ZTRMV_RLN ztrmv_RLN +#define ZTRMV_CUU ztrmv_CUU +#define ZTRMV_CUN ztrmv_CUN +#define ZTRMV_CLU ztrmv_CLU +#define ZTRMV_CLN ztrmv_CLN + +#define ZTRMV_THREAD_NUU ztrmv_thread_NUU +#define ZTRMV_THREAD_NUN ztrmv_thread_NUN +#define ZTRMV_THREAD_NLU ztrmv_thread_NLU +#define ZTRMV_THREAD_NLN ztrmv_thread_NLN +#define ZTRMV_THREAD_TUU ztrmv_thread_TUU +#define ZTRMV_THREAD_TUN ztrmv_thread_TUN +#define ZTRMV_THREAD_TLU ztrmv_thread_TLU +#define ZTRMV_THREAD_TLN ztrmv_thread_TLN +#define ZTRMV_THREAD_RUU ztrmv_thread_RUU +#define ZTRMV_THREAD_RUN ztrmv_thread_RUN +#define ZTRMV_THREAD_RLU ztrmv_thread_RLU +#define ZTRMV_THREAD_RLN ztrmv_thread_RLN +#define ZTRMV_THREAD_CUU ztrmv_thread_CUU +#define ZTRMV_THREAD_CUN ztrmv_thread_CUN +#define ZTRMV_THREAD_CLU ztrmv_thread_CLU +#define ZTRMV_THREAD_CLN ztrmv_thread_CLN + +#else + +#define TRSV_NUU strsv_NUU +#define TRSV_NUN strsv_NUN +#define TRSV_NLU strsv_NLU +#define TRSV_NLN strsv_NLN +#define TRSV_TUU strsv_TUU +#define TRSV_TUN strsv_TUN +#define TRSV_TLU strsv_TLU +#define TRSV_TLN strsv_TLN + +#define ZTRSV_NUU ctrsv_NUU +#define ZTRSV_NUN ctrsv_NUN +#define ZTRSV_NLU ctrsv_NLU +#define ZTRSV_NLN ctrsv_NLN +#define ZTRSV_TUU ctrsv_TUU +#define ZTRSV_TUN ctrsv_TUN +#define ZTRSV_TLU ctrsv_TLU +#define ZTRSV_TLN ctrsv_TLN +#define ZTRSV_RUU ctrsv_RUU +#define ZTRSV_RUN ctrsv_RUN +#define ZTRSV_RLU ctrsv_RLU +#define ZTRSV_RLN ctrsv_RLN +#define ZTRSV_CUU ctrsv_CUU +#define ZTRSV_CUN ctrsv_CUN +#define ZTRSV_CLU ctrsv_CLU +#define ZTRSV_CLN ctrsv_CLN + +#define TRMV_NUU strmv_NUU +#define TRMV_NUN strmv_NUN +#define TRMV_NLU strmv_NLU +#define TRMV_NLN strmv_NLN +#define TRMV_TUU strmv_TUU +#define TRMV_TUN strmv_TUN +#define TRMV_TLU strmv_TLU +#define TRMV_TLN strmv_TLN + +#define TRMV_THREAD_NUU strmv_thread_NUU +#define TRMV_THREAD_NUN strmv_thread_NUN +#define TRMV_THREAD_NLU strmv_thread_NLU +#define TRMV_THREAD_NLN strmv_thread_NLN +#define TRMV_THREAD_TUU strmv_thread_TUU +#define TRMV_THREAD_TUN strmv_thread_TUN +#define TRMV_THREAD_TLU strmv_thread_TLU +#define TRMV_THREAD_TLN strmv_thread_TLN + +#define ZTRMV_NUU ctrmv_NUU +#define ZTRMV_NUN ctrmv_NUN +#define ZTRMV_NLU ctrmv_NLU +#define ZTRMV_NLN ctrmv_NLN +#define ZTRMV_TUU ctrmv_TUU +#define ZTRMV_TUN ctrmv_TUN +#define ZTRMV_TLU ctrmv_TLU +#define ZTRMV_TLN ctrmv_TLN +#define ZTRMV_RUU ctrmv_RUU +#define ZTRMV_RUN ctrmv_RUN +#define ZTRMV_RLU ctrmv_RLU +#define ZTRMV_RLN ctrmv_RLN +#define ZTRMV_CUU ctrmv_CUU +#define ZTRMV_CUN ctrmv_CUN +#define ZTRMV_CLU ctrmv_CLU +#define ZTRMV_CLN ctrmv_CLN + +#define ZTRMV_THREAD_NUU ctrmv_thread_NUU +#define ZTRMV_THREAD_NUN ctrmv_thread_NUN +#define ZTRMV_THREAD_NLU ctrmv_thread_NLU +#define ZTRMV_THREAD_NLN ctrmv_thread_NLN +#define ZTRMV_THREAD_TUU ctrmv_thread_TUU +#define ZTRMV_THREAD_TUN ctrmv_thread_TUN +#define ZTRMV_THREAD_TLU ctrmv_thread_TLU +#define ZTRMV_THREAD_TLN ctrmv_thread_TLN +#define ZTRMV_THREAD_RUU ctrmv_thread_RUU +#define ZTRMV_THREAD_RUN ctrmv_thread_RUN +#define ZTRMV_THREAD_RLU ctrmv_thread_RLU +#define ZTRMV_THREAD_RLN ctrmv_thread_RLN +#define ZTRMV_THREAD_CUU ctrmv_thread_CUU +#define ZTRMV_THREAD_CUN ctrmv_thread_CUN +#define ZTRMV_THREAD_CLU ctrmv_thread_CLU +#define ZTRMV_THREAD_CLN ctrmv_thread_CLN + +#endif + +#define SGETF2 sgetf2_k +#define DGETF2 dgetf2_k +#define QGETF2 qgetf2_k +#define CGETF2 cgetf2_k +#define ZGETF2 zgetf2_k +#define XGETF2 xgetf2_k + +#define SLASWP_PLUS slaswp_plus +#define SLASWP_MINUS slaswp_minus +#define DLASWP_PLUS dlaswp_plus +#define DLASWP_MINUS dlaswp_minus +#define QLASWP_PLUS qlaswp_plus +#define QLASWP_MINUS qlaswp_minus +#define CLASWP_PLUS claswp_plus +#define CLASWP_MINUS claswp_minus +#define ZLASWP_PLUS zlaswp_plus +#define ZLASWP_MINUS zlaswp_minus +#define XLASWP_PLUS xlaswp_plus +#define XLASWP_MINUS xlaswp_minus + +#define SLARF_L slarf_L +#define SLARF_R slarf_R +#define DLARF_L dlarf_L +#define DLARF_R dlarf_R +#define QLARF_L qlarf_L +#define QLARF_R qlarf_R +#define CLARF_L clarf_L +#define CLARF_R clarf_R +#define ZLARF_L zlarf_L +#define ZLARF_R zlarf_R +#define XLARF_L xlarf_L +#define XLARF_R xlarf_R + +#ifndef COMPLEX +#ifdef XDOUBLE +#define GETF2 QGETF2 +#define GETRF QGETRF +#define GETRS_N_SINGLE qgetrs_N_single +#define GETRS_T_SINGLE qgetrs_T_single +#define GETRS_R_SINGLE qgetrs_N_single +#define GETRS_C_SINGLE qgetrs_T_single +#define GETRS_N_PARALLEL qgetrs_N_parallel +#define GETRS_T_PARALLEL qgetrs_T_parallel +#define GETRS_R_PARALLEL qgetrs_N_parallel +#define GETRS_C_PARALLEL qgetrs_T_parallel +#define LASWP_PLUS QLASWP_PLUS +#define LASWP_MINUS QLASWP_MINUS +#define LASWP_NCOPY QLASWP_NCOPY +#define GETRS_N QGETRS_N +#define GETRS_T QGETRS_T +#define GETRF_SINGLE qgetrf_single +#define GETRF_PARALLEL qgetrf_parallel +#define NEG_TCOPY QNEG_TCOPY +#define LARF_L QLARF_L +#define LARF_R QLARF_R +#elif defined(DOUBLE) +#define GETF2 DGETF2 +#define GETRF DGETRF +#define GETRS_N_SINGLE dgetrs_N_single +#define GETRS_T_SINGLE dgetrs_T_single +#define GETRS_R_SINGLE dgetrs_N_single +#define GETRS_C_SINGLE dgetrs_T_single +#define GETRS_N_PARALLEL dgetrs_N_parallel +#define GETRS_T_PARALLEL dgetrs_T_parallel +#define GETRS_R_PARALLEL dgetrs_N_parallel +#define GETRS_C_PARALLEL dgetrs_T_parallel +#define LASWP_PLUS DLASWP_PLUS +#define LASWP_MINUS DLASWP_MINUS +#define LASWP_NCOPY DLASWP_NCOPY +#define GETRS_N DGETRS_N +#define GETRS_T DGETRS_T +#define GETRF_SINGLE dgetrf_single +#define GETRF_PARALLEL dgetrf_parallel +#define NEG_TCOPY DNEG_TCOPY +#define LARF_L DLARF_L +#define LARF_R DLARF_R +#else +#define GETF2 SGETF2 +#define GETRF SGETRF +#define GETRS_N_SINGLE sgetrs_N_single +#define GETRS_T_SINGLE sgetrs_T_single +#define GETRS_R_SINGLE sgetrs_N_single +#define GETRS_C_SINGLE sgetrs_T_single +#define GETRS_N_PARALLEL sgetrs_N_parallel +#define GETRS_T_PARALLEL sgetrs_T_parallel +#define GETRS_R_PARALLEL sgetrs_N_parallel +#define GETRS_C_PARALLEL sgetrs_T_parallel +#define LASWP_PLUS SLASWP_PLUS +#define LASWP_MINUS SLASWP_MINUS +#define LASWP_NCOPY SLASWP_NCOPY +#define GETRS_N SGETRS_N +#define GETRS_T SGETRS_T +#define GETRF_SINGLE sgetrf_single +#define GETRF_PARALLEL sgetrf_parallel +#define NEG_TCOPY SNEG_TCOPY +#define LARF_L SLARF_L +#define LARF_R SLARF_R +#endif +#else +#ifdef XDOUBLE +#define GETF2 XGETF2 +#define GETRF XGETRF +#define GETRS_N_SINGLE xgetrs_N_single +#define GETRS_T_SINGLE xgetrs_T_single +#define GETRS_R_SINGLE xgetrs_R_single +#define GETRS_C_SINGLE xgetrs_C_single +#define GETRS_N_PARALLEL xgetrs_N_parallel +#define GETRS_T_PARALLEL xgetrs_T_parallel +#define GETRS_R_PARALLEL xgetrs_R_parallel +#define GETRS_C_PARALLEL xgetrs_C_parallel +#define LASWP_PLUS XLASWP_PLUS +#define LASWP_MINUS XLASWP_MINUS +#define LASWP_NCOPY XLASWP_NCOPY +#define GETRS_N XGETRS_N +#define GETRS_T XGETRS_T +#define GETRF_SINGLE xgetrf_single +#define GETRF_PARALLEL xgetrf_parallel +#define NEG_TCOPY XNEG_TCOPY +#define LARF_L XLARF_L +#define LARF_R XLARF_R +#elif defined(DOUBLE) +#define GETF2 ZGETF2 +#define GETRF ZGETRF +#define GETRS_N_SINGLE zgetrs_N_single +#define GETRS_T_SINGLE zgetrs_T_single +#define GETRS_R_SINGLE zgetrs_R_single +#define GETRS_C_SINGLE zgetrs_C_single +#define GETRS_N_PARALLEL zgetrs_N_parallel +#define GETRS_T_PARALLEL zgetrs_T_parallel +#define GETRS_R_PARALLEL zgetrs_R_parallel +#define GETRS_C_PARALLEL zgetrs_C_parallel +#define LASWP_PLUS ZLASWP_PLUS +#define LASWP_MINUS ZLASWP_MINUS +#define LASWP_NCOPY ZLASWP_NCOPY +#define GETRS_N ZGETRS_N +#define GETRS_T ZGETRS_T +#define GETRF_SINGLE zgetrf_single +#define GETRF_PARALLEL zgetrf_parallel +#define NEG_TCOPY ZNEG_TCOPY +#define LARF_L ZLARF_L +#define LARF_R ZLARF_R +#else +#define GETF2 CGETF2 +#define GETRF CGETRF +#define GETRS_N_SINGLE cgetrs_N_single +#define GETRS_T_SINGLE cgetrs_T_single +#define GETRS_R_SINGLE cgetrs_R_single +#define GETRS_C_SINGLE cgetrs_C_single +#define GETRS_N_PARALLEL cgetrs_N_parallel +#define GETRS_T_PARALLEL cgetrs_T_parallel +#define GETRS_R_PARALLEL cgetrs_R_parallel +#define GETRS_C_PARALLEL cgetrs_C_parallel +#define LASWP_PLUS CLASWP_PLUS +#define LASWP_MINUS CLASWP_MINUS +#define LASWP_NCOPY CLASWP_NCOPY +#define GETRS_N CGETRS_N +#define GETRS_T CGETRS_T +#define GETRF_SINGLE cgetrf_single +#define GETRF_PARALLEL cgetrf_parallel +#define NEG_TCOPY CNEG_TCOPY +#define LARF_L CLARF_L +#define LARF_R CLARF_R +#endif +#endif + +#ifndef COMPLEX +#ifdef XDOUBLE +#define POTF2_U qpotf2_U +#define POTF2_L qpotf2_L +#define LAUU2_U qlauu2_U +#define LAUU2_L qlauu2_L +#define POTRF_U_SINGLE qpotrf_U_single +#define POTRF_L_SINGLE qpotrf_L_single +#define POTRF_U_PARALLEL qpotrf_U_parallel +#define POTRF_L_PARALLEL qpotrf_L_parallel +#define LAUUM_U_SINGLE qlauum_U_single +#define LAUUM_L_SINGLE qlauum_L_single +#define LAUUM_U_PARALLEL qlauum_U_parallel +#define LAUUM_L_PARALLEL qlauum_L_parallel +#define TRTI2_UU qtrti2_UU +#define TRTI2_UN qtrti2_UN +#define TRTI2_LU qtrti2_LU +#define TRTI2_LN qtrti2_LN +#define TRTRI_UU_SINGLE qtrtri_UU_single +#define TRTRI_UN_SINGLE qtrtri_UN_single +#define TRTRI_LU_SINGLE qtrtri_LU_single +#define TRTRI_LN_SINGLE qtrtri_LN_single +#define TRTRI_UU_PARALLEL qtrtri_UU_parallel +#define TRTRI_UN_PARALLEL qtrtri_UN_parallel +#define TRTRI_LU_PARALLEL qtrtri_LU_parallel +#define TRTRI_LN_PARALLEL qtrtri_LN_parallel +#elif defined(DOUBLE) +#define POTF2_U dpotf2_U +#define POTF2_L dpotf2_L +#define LAUU2_U dlauu2_U +#define LAUU2_L dlauu2_L +#define POTRF_U_SINGLE dpotrf_U_single +#define POTRF_L_SINGLE dpotrf_L_single +#define POTRF_U_PARALLEL dpotrf_U_parallel +#define POTRF_L_PARALLEL dpotrf_L_parallel +#define LAUUM_U_SINGLE dlauum_U_single +#define LAUUM_L_SINGLE dlauum_L_single +#define LAUUM_U_PARALLEL dlauum_U_parallel +#define LAUUM_L_PARALLEL dlauum_L_parallel +#define TRTI2_UU dtrti2_UU +#define TRTI2_UN dtrti2_UN +#define TRTI2_LU dtrti2_LU +#define TRTI2_LN dtrti2_LN +#define TRTRI_UU_SINGLE dtrtri_UU_single +#define TRTRI_UN_SINGLE dtrtri_UN_single +#define TRTRI_LU_SINGLE dtrtri_LU_single +#define TRTRI_LN_SINGLE dtrtri_LN_single +#define TRTRI_UU_PARALLEL dtrtri_UU_parallel +#define TRTRI_UN_PARALLEL dtrtri_UN_parallel +#define TRTRI_LU_PARALLEL dtrtri_LU_parallel +#define TRTRI_LN_PARALLEL dtrtri_LN_parallel +#else +#define POTF2_U spotf2_U +#define POTF2_L spotf2_L +#define LAUU2_U slauu2_U +#define LAUU2_L slauu2_L +#define POTRF_U_SINGLE spotrf_U_single +#define POTRF_L_SINGLE spotrf_L_single +#define POTRF_U_PARALLEL spotrf_U_parallel +#define POTRF_L_PARALLEL spotrf_L_parallel +#define LAUUM_U_SINGLE slauum_U_single +#define LAUUM_L_SINGLE slauum_L_single +#define LAUUM_U_PARALLEL slauum_U_parallel +#define LAUUM_L_PARALLEL slauum_L_parallel +#define TRTI2_UU strti2_UU +#define TRTI2_UN strti2_UN +#define TRTI2_LU strti2_LU +#define TRTI2_LN strti2_LN +#define TRTRI_UU_SINGLE strtri_UU_single +#define TRTRI_UN_SINGLE strtri_UN_single +#define TRTRI_LU_SINGLE strtri_LU_single +#define TRTRI_LN_SINGLE strtri_LN_single +#define TRTRI_UU_PARALLEL strtri_UU_parallel +#define TRTRI_UN_PARALLEL strtri_UN_parallel +#define TRTRI_LU_PARALLEL strtri_LU_parallel +#define TRTRI_LN_PARALLEL strtri_LN_parallel +#endif +#else +#ifdef XDOUBLE +#define POTF2_U xpotf2_U +#define POTF2_L xpotf2_L +#define LAUU2_U xlauu2_U +#define LAUU2_L xlauu2_L +#define POTRF_U_SINGLE xpotrf_U_single +#define POTRF_L_SINGLE xpotrf_L_single +#define POTRF_U_PARALLEL xpotrf_U_parallel +#define POTRF_L_PARALLEL xpotrf_L_parallel +#define LAUUM_U_SINGLE xlauum_U_single +#define LAUUM_L_SINGLE xlauum_L_single +#define LAUUM_U_PARALLEL xlauum_U_parallel +#define LAUUM_L_PARALLEL xlauum_L_parallel +#define TRTI2_UU xtrti2_UU +#define TRTI2_UN xtrti2_UN +#define TRTI2_LU xtrti2_LU +#define TRTI2_LN xtrti2_LN +#define TRTRI_UU_SINGLE xtrtri_UU_single +#define TRTRI_UN_SINGLE xtrtri_UN_single +#define TRTRI_LU_SINGLE xtrtri_LU_single +#define TRTRI_LN_SINGLE xtrtri_LN_single +#define TRTRI_UU_PARALLEL xtrtri_UU_parallel +#define TRTRI_UN_PARALLEL xtrtri_UN_parallel +#define TRTRI_LU_PARALLEL xtrtri_LU_parallel +#define TRTRI_LN_PARALLEL xtrtri_LN_parallel +#elif defined(DOUBLE) +#define POTF2_U zpotf2_U +#define POTF2_L zpotf2_L +#define LAUU2_U zlauu2_U +#define LAUU2_L zlauu2_L +#define POTRF_U_SINGLE zpotrf_U_single +#define POTRF_L_SINGLE zpotrf_L_single +#define POTRF_U_PARALLEL zpotrf_U_parallel +#define POTRF_L_PARALLEL zpotrf_L_parallel +#define LAUUM_U_SINGLE zlauum_U_single +#define LAUUM_L_SINGLE zlauum_L_single +#define LAUUM_U_PARALLEL zlauum_U_parallel +#define LAUUM_L_PARALLEL zlauum_L_parallel +#define TRTI2_UU ztrti2_UU +#define TRTI2_UN ztrti2_UN +#define TRTI2_LU ztrti2_LU +#define TRTI2_LN ztrti2_LN +#define TRTRI_UU_SINGLE ztrtri_UU_single +#define TRTRI_UN_SINGLE ztrtri_UN_single +#define TRTRI_LU_SINGLE ztrtri_LU_single +#define TRTRI_LN_SINGLE ztrtri_LN_single +#define TRTRI_UU_PARALLEL ztrtri_UU_parallel +#define TRTRI_UN_PARALLEL ztrtri_UN_parallel +#define TRTRI_LU_PARALLEL ztrtri_LU_parallel +#define TRTRI_LN_PARALLEL ztrtri_LN_parallel +#else +#define POTF2_U cpotf2_U +#define POTF2_L cpotf2_L +#define LAUU2_U clauu2_U +#define LAUU2_L clauu2_L +#define POTRF_U_SINGLE cpotrf_U_single +#define POTRF_L_SINGLE cpotrf_L_single +#define POTRF_U_PARALLEL cpotrf_U_parallel +#define POTRF_L_PARALLEL cpotrf_L_parallel +#define LAUUM_U_SINGLE clauum_U_single +#define LAUUM_L_SINGLE clauum_L_single +#define LAUUM_U_PARALLEL clauum_U_parallel +#define LAUUM_L_PARALLEL clauum_L_parallel +#define TRTI2_UU ctrti2_UU +#define TRTI2_UN ctrti2_UN +#define TRTI2_LU ctrti2_LU +#define TRTI2_LN ctrti2_LN +#define TRTRI_UU_SINGLE ctrtri_UU_single +#define TRTRI_UN_SINGLE ctrtri_UN_single +#define TRTRI_LU_SINGLE ctrtri_LU_single +#define TRTRI_LN_SINGLE ctrtri_LN_single +#define TRTRI_UU_PARALLEL ctrtri_UU_parallel +#define TRTRI_UN_PARALLEL ctrtri_UN_parallel +#define TRTRI_LU_PARALLEL ctrtri_LU_parallel +#define TRTRI_LN_PARALLEL ctrtri_LN_parallel +#endif +#endif + +#endif diff --git a/common_mips64.h b/common_mips64.h new file mode 100644 index 0000000..332af3e --- /dev/null +++ b/common_mips64.h @@ -0,0 +1,197 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_MIPS64 +#define COMMON_MIPS64 + +#define MB +#define WMB + +#define INLINE inline + +#ifndef ASSEMBLER + +static void INLINE blas_lock(volatile unsigned long *address){ + + long int ret, val = 1; + + do { + while (*address) {YIELDING;}; + + __asm__ __volatile__( + "1: ll %0, %3\n" + " ori %2, %0, 1\n" + " sc %2, %1\n" + " beqz %2, 1b\n" + " andi %2, %0, 1\n" + " sync\n" + : "=&r" (val), "=m" (address), "=&r" (ret) + : "m" (address) + : "memory"); + + } while (ret); +} + +static inline unsigned int rpcc(void){ + unsigned long ret; + + __asm__ __volatile__(".set push \n" + ".set mips32r2\n" + "rdhwr %0, $30 \n" + ".set pop" : "=r"(ret) : : "memory"); + + return ret; +} + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#ifdef DOUBLE +#define GET_IMAGE(res) __asm__ __volatile__("mov.d %0, $f2" : "=f"(res) : : "memory") +#else +#define GET_IMAGE(res) __asm__ __volatile__("mov.s %0, $f2" : "=f"(res) : : "memory") +#endif + +#define GET_IMAGE_CANCEL + +#endif + + +#ifdef ASSEMBLER + +#define HALT teq $0, $0 +#define NOP move $0, $0 + +#ifdef DOUBLE +#define LD ldc1 +#define ST sdc1 +#define MADD madd.d +#define NMADD nmadd.d +#define MSUB msub.d +#define NMSUB nmsub.d +#define ADD add.d +#define SUB sub.d +#define MUL mul.d +#define MOV mov.d +#define CMOVF movf.d +#define CMOVT movt.d +#define MTC dmtc1 +#define FABS abs.d +#define CMPEQ c.eq.d +#define CMPLE c.le.d +#define CMPLT c.lt.d +#else +#define LD lwc1 +#define ST swc1 +#define MADD madd.s +#define NMADD nmadd.s +#define MSUB msub.s +#define NMSUB nmsub.s +#define ADD add.s +#define SUB sub.s +#define MUL mul.s +#define MOV mov.s +#define CMOVF movf.s +#define CMOVT movt.s +#define MTC mtc1 +#define FABS abs.s +#define CMPEQ c.eq.s +#define CMPLE c.le.s +#define CMPLT c.lt.s +#endif + +#if defined(__64BIT__) && defined(USE64BITINT) +#define LDINT ld +#define LDARG ld +#define SDARG sd +#elif defined(__64BIT__) && !defined(USE64BITINT) +#define LDINT lw +#define LDARG ld +#define SDARG sd +#else +#define LDINT lw +#define LDARG lw +#define SDARG sw +#endif + + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif + +#if defined(ASSEMBLER) && !defined(NEEDPARAM) + +#define PROLOGUE \ + .text ;\ + .set mips64 ;\ + .align 5 ;\ + .globl REALNAME ;\ + .ent REALNAME ;\ + .type REALNAME, @function ;\ +REALNAME: ;\ + .set noreorder ;\ + .set nomacro + +#define EPILOGUE \ + .set macro ;\ + .set reorder ;\ + .end REALNAME + +#define PROFCODE +#endif + +#endif + +#define SEEK_ADDRESS + +#define BUFFER_SIZE ( 8 << 20) + +#ifndef PAGESIZE +#define PAGESIZE (64UL << 10) +#endif +#define HUGE_PAGESIZE ( 2 << 20) + +#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif +#endif diff --git a/common_param.h b/common_param.h new file mode 100644 index 0000000..c4580cc --- /dev/null +++ b/common_param.h @@ -0,0 +1,1098 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_PARAM_H +#define COMMON_PARAM_H + +#ifndef ASSEMBLER + +#ifdef DYNAMIC_ARCH + +typedef struct { + int offsetA, offsetB, align; + + int sgemm_p, sgemm_q, sgemm_r; + int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn; + + int exclusive_cache; + + float (*samax_k) (BLASLONG, float *, BLASLONG); + float (*samin_k) (BLASLONG, float *, BLASLONG); + float (*smax_k) (BLASLONG, float *, BLASLONG); + float (*smin_k) (BLASLONG, float *, BLASLONG); +BLASLONG (*isamax_k)(BLASLONG, float *, BLASLONG); +BLASLONG (*isamin_k)(BLASLONG, float *, BLASLONG); +BLASLONG (*ismax_k) (BLASLONG, float *, BLASLONG); +BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); + + float (*snrm2_k) (BLASLONG, float *, BLASLONG); + float (*sasum_k) (BLASLONG, float *, BLASLONG); + int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + + int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); + + int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*sscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*sswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + + int (*sgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*sgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*sger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + + int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + + int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); + int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + + int (*sgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*sgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); + + int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*strsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + + int (*strsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + + int (*strmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*strmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*strmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*strmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + + int (*strmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*ssymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ssymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ssymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ssymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*sneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*slaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); + + int dgemm_p, dgemm_q, dgemm_r; + int dgemm_unroll_m, dgemm_unroll_n, dgemm_unroll_mn; + + double (*damax_k) (BLASLONG, double *, BLASLONG); + double (*damin_k) (BLASLONG, double *, BLASLONG); + double (*dmax_k) (BLASLONG, double *, BLASLONG); + double (*dmin_k) (BLASLONG, double *, BLASLONG); +BLASLONG (*idamax_k)(BLASLONG, double *, BLASLONG); +BLASLONG (*idamin_k)(BLASLONG, double *, BLASLONG); +BLASLONG (*idmax_k) (BLASLONG, double *, BLASLONG); +BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); + + double (*dnrm2_k) (BLASLONG, double *, BLASLONG); + double (*dasum_k) (BLASLONG, double *, BLASLONG); + int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); + double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); + int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); + + int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + int (*dswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + + int (*dgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*dgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*dger_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + + int (*dsymv_L) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*dsymv_U) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + + int (*dgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); + int (*dgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + + int (*dgemm_incopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*dgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*dgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); + + int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*dtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); + + int (*dtrsm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + + int (*dtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*dtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*dtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*dtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); + + int (*dtrmm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + + int (*dsymm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dsymm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dsymm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dsymm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + + int (*dneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*dlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); + +#ifdef EXPRECISION + + int qgemm_p, qgemm_q, qgemm_r; + int qgemm_unroll_m, qgemm_unroll_n, qgemm_unroll_mn; + + xdouble (*qamax_k) (BLASLONG, xdouble *, BLASLONG); + xdouble (*qamin_k) (BLASLONG, xdouble *, BLASLONG); + xdouble (*qmax_k) (BLASLONG, xdouble *, BLASLONG); + xdouble (*qmin_k) (BLASLONG, xdouble *, BLASLONG); +BLASLONG (*iqamax_k)(BLASLONG, xdouble *, BLASLONG); +BLASLONG (*iqamin_k)(BLASLONG, xdouble *, BLASLONG); +BLASLONG (*iqmax_k) (BLASLONG, xdouble *, BLASLONG); +BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); + + xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG); + xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG); + int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); + + int (*qaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + int (*qscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + int (*qswap_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + + int (*qgemv_n) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*qgemv_t) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*qger_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + + int (*qsymv_L) (BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*qsymv_U) (BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + + int (*qgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); + int (*qgemm_beta )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + + int (*qgemm_incopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*qgemm_itcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*qgemm_oncopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*qgemm_otcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + + int (*qtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*qtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*qtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*qtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + + int (*qtrsm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + + int (*qtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*qtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*qtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*qtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + + int (*qtrmm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + + int (*qsymm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qsymm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qsymm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qsymm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + + int (*qneg_tcopy) (BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*qlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, xdouble *, BLASLONG, blasint *, xdouble *); + +#endif + + int cgemm_p, cgemm_q, cgemm_r; + int cgemm_unroll_m, cgemm_unroll_n, cgemm_unroll_mn; + + float (*camax_k) (BLASLONG, float *, BLASLONG); + float (*camin_k) (BLASLONG, float *, BLASLONG); +BLASLONG (*icamax_k)(BLASLONG, float *, BLASLONG); +BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); + + float (*cnrm2_k) (BLASLONG, float *, BLASLONG); + float (*casum_k) (BLASLONG, float *, BLASLONG); + int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + float _Complex (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + float _Complex (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*csrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); + + int (*caxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*caxpyc_k)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*cscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*cswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + + int (*cgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgemv_r) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgemv_c) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgemv_o) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgemv_u) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgemv_s) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgemv_d) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgeru_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgerc_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgerv_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgerd_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + + int (*csymv_L) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*csymv_U) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*chemv_L) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*chemv_U) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*chemv_M) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*chemv_V) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + + int (*cgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); + int (*cgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); + int (*cgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); + int (*cgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); + int (*cgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + + int (*cgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*cgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*cgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); + + int (*ctrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrsm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + + int (*ctrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + + int (*ctrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrmm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrmm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + + int (*ctrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*csymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*csymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*csymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*csymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*chemm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*chemm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*chemm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*chemm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*cgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); + + int (*cgemm3m_incopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*cgemm3m_incopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*cgemm3m_incopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*cgemm3m_itcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*cgemm3m_itcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*cgemm3m_itcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float *); + + int (*cgemm3m_oncopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); + int (*cgemm3m_oncopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); + int (*cgemm3m_oncopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); + int (*cgemm3m_otcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); + int (*cgemm3m_otcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); + int (*cgemm3m_otcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); + + int (*csymm3m_iucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*csymm3m_ilcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*csymm3m_iucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*csymm3m_ilcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*csymm3m_iucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*csymm3m_ilcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*csymm3m_oucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*csymm3m_olcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*csymm3m_oucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*csymm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*csymm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*csymm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + + int (*chemm3m_iucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*chemm3m_ilcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*chemm3m_iucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*chemm3m_ilcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*chemm3m_iucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*chemm3m_ilcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*chemm3m_oucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*chemm3m_olcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*chemm3m_oucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*chemm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*chemm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*chemm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + + int (*cneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*claswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); + + int zgemm_p, zgemm_q, zgemm_r; + int zgemm_unroll_m, zgemm_unroll_n, zgemm_unroll_mn; + + double (*zamax_k) (BLASLONG, double *, BLASLONG); + double (*zamin_k) (BLASLONG, double *, BLASLONG); +BLASLONG (*izamax_k)(BLASLONG, double *, BLASLONG); +BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); + + double (*znrm2_k) (BLASLONG, double *, BLASLONG); + double (*zasum_k) (BLASLONG, double *, BLASLONG); + int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); + double _Complex (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); + double _Complex (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); + int (*zdrot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); + + int (*zaxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + int (*zaxpyc_k)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + int (*zscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + int (*zswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + + int (*zgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgemv_r) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgemv_c) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgemv_o) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgemv_u) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgemv_s) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgemv_d) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgeru_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgerc_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgerv_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgerd_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + + int (*zsymv_L) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zsymv_U) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zhemv_L) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zhemv_U) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zhemv_M) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zhemv_V) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + + int (*zgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); + int (*zgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); + int (*zgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); + int (*zgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); + int (*zgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + + int (*zgemm_incopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*zgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*zgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*zgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); + + int (*ztrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrsm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + + int (*ztrsm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + + int (*ztrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrmm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrmm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + + int (*ztrmm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + + int (*zsymm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zsymm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zsymm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zsymm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + + int (*zhemm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zhemm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zhemm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zhemm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + + int (*zgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); + + int (*zgemm3m_incopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*zgemm3m_incopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*zgemm3m_incopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*zgemm3m_itcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*zgemm3m_itcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*zgemm3m_itcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double *); + + int (*zgemm3m_oncopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); + int (*zgemm3m_oncopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); + int (*zgemm3m_oncopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); + int (*zgemm3m_otcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); + int (*zgemm3m_otcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); + int (*zgemm3m_otcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); + + int (*zsymm3m_iucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zsymm3m_ilcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zsymm3m_iucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zsymm3m_ilcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zsymm3m_iucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zsymm3m_ilcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + + int (*zsymm3m_oucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zsymm3m_olcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zsymm3m_oucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zsymm3m_olcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zsymm3m_oucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zsymm3m_olcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + + int (*zhemm3m_iucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zhemm3m_ilcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zhemm3m_iucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zhemm3m_ilcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zhemm3m_iucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zhemm3m_ilcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + + int (*zhemm3m_oucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zhemm3m_olcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zhemm3m_oucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zhemm3m_olcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zhemm3m_oucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zhemm3m_olcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + + int (*zneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*zlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); + +#ifdef EXPRECISION + + int xgemm_p, xgemm_q, xgemm_r; + int xgemm_unroll_m, xgemm_unroll_n, xgemm_unroll_mn; + + xdouble (*xamax_k) (BLASLONG, xdouble *, BLASLONG); + xdouble (*xamin_k) (BLASLONG, xdouble *, BLASLONG); +BLASLONG (*ixamax_k)(BLASLONG, xdouble *, BLASLONG); +BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); + + xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG); + xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG); + int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + xdouble _Complex (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + xdouble _Complex (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + int (*xqrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); + + int (*xaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + int (*xaxpyc_k)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + int (*xscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + int (*xswap_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + + int (*xgemv_n) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemv_t) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemv_r) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemv_c) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemv_o) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemv_u) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemv_s) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemv_d) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgeru_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgerc_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgerv_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgerd_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + + int (*xsymv_L) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xsymv_U) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xhemv_L) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xhemv_U) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xhemv_M) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xhemv_V) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + + int (*xgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); + int (*xgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); + int (*xgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); + int (*xgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); + int (*xgemm_beta )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + + int (*xgemm_incopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemm_itcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemm_oncopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemm_otcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + + int (*xtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrsm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + + int (*xtrsm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + + int (*xtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrmm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrmm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + + int (*xtrmm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + + int (*xsymm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xsymm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xsymm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xsymm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + + int (*xhemm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xhemm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xhemm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xhemm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + + int (*xgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); + + int (*xgemm3m_incopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemm3m_incopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemm3m_incopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemm3m_itcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemm3m_itcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemm3m_itcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + + int (*xgemm3m_oncopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); + int (*xgemm3m_oncopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); + int (*xgemm3m_oncopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); + int (*xgemm3m_otcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); + int (*xgemm3m_otcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); + int (*xgemm3m_otcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); + + int (*xsymm3m_iucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xsymm3m_ilcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xsymm3m_iucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xsymm3m_ilcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xsymm3m_iucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xsymm3m_ilcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + + int (*xsymm3m_oucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int (*xsymm3m_olcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int (*xsymm3m_oucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int (*xsymm3m_olcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int (*xsymm3m_oucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int (*xsymm3m_olcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + + int (*xhemm3m_iucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xhemm3m_ilcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xhemm3m_iucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xhemm3m_ilcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xhemm3m_iucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xhemm3m_ilcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + + int (*xhemm3m_oucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int (*xhemm3m_olcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int (*xhemm3m_oucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int (*xhemm3m_olcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int (*xhemm3m_oucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int (*xhemm3m_olcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + + int (*xneg_tcopy) (BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, xdouble *, BLASLONG, blasint *, xdouble *); + +#endif + + void (*init)(void); + + int snum_opt, dnum_opt, qnum_opt; + +} gotoblas_t; + +extern gotoblas_t *gotoblas; + +#define GEMM_OFFSET_A gotoblas -> offsetA +#define GEMM_OFFSET_B gotoblas -> offsetB +#define GEMM_ALIGN gotoblas -> align + +#define HAVE_EX_L2 gotoblas -> exclusive_cache + +#define SGEMM_P gotoblas -> sgemm_p +#define SGEMM_Q gotoblas -> sgemm_q +#define SGEMM_R gotoblas -> sgemm_r +#define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m +#define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n +#define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn + +#define DGEMM_P gotoblas -> dgemm_p +#define DGEMM_Q gotoblas -> dgemm_q +#define DGEMM_R gotoblas -> dgemm_r +#define DGEMM_UNROLL_M gotoblas -> dgemm_unroll_m +#define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n +#define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn + +#define QGEMM_P gotoblas -> qgemm_p +#define QGEMM_Q gotoblas -> qgemm_q +#define QGEMM_R gotoblas -> qgemm_r +#define QGEMM_UNROLL_M gotoblas -> qgemm_unroll_m +#define QGEMM_UNROLL_N gotoblas -> qgemm_unroll_n +#define QGEMM_UNROLL_MN gotoblas -> qgemm_unroll_mn + +#define CGEMM_P gotoblas -> cgemm_p +#define CGEMM_Q gotoblas -> cgemm_q +#define CGEMM_R gotoblas -> cgemm_r +#define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m +#define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n +#define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn + +#define ZGEMM_P gotoblas -> zgemm_p +#define ZGEMM_Q gotoblas -> zgemm_q +#define ZGEMM_R gotoblas -> zgemm_r +#define ZGEMM_UNROLL_M gotoblas -> zgemm_unroll_m +#define ZGEMM_UNROLL_N gotoblas -> zgemm_unroll_n +#define ZGEMM_UNROLL_MN gotoblas -> zgemm_unroll_mn + +#define XGEMM_P gotoblas -> xgemm_p +#define XGEMM_Q gotoblas -> xgemm_q +#define XGEMM_R gotoblas -> xgemm_r +#define XGEMM_UNROLL_M gotoblas -> xgemm_unroll_m +#define XGEMM_UNROLL_N gotoblas -> xgemm_unroll_n +#define XGEMM_UNROLL_MN gotoblas -> xgemm_unroll_mn + +#else + +#define GEMM_OFFSET_A GEMM_DEFAULT_OFFSET_A +#define GEMM_OFFSET_B GEMM_DEFAULT_OFFSET_B +#define GEMM_ALIGN GEMM_DEFAULT_ALIGN + +#ifdef HAVE_EXCLUSIVE_CACHE +#define HAVE_EX_L2 1 +#else +#define HAVE_EX_L2 0 +#endif + +#define SGEMM_P SGEMM_DEFAULT_P +#define SGEMM_Q SGEMM_DEFAULT_Q +#define SGEMM_R SGEMM_DEFAULT_R +#define SGEMM_UNROLL_M SGEMM_DEFAULT_UNROLL_M +#define SGEMM_UNROLL_N SGEMM_DEFAULT_UNROLL_N +#define SGEMM_UNROLL_MN MAX((SGEMM_UNROLL_M), (SGEMM_UNROLL_N)) + +#define DGEMM_P DGEMM_DEFAULT_P +#define DGEMM_Q DGEMM_DEFAULT_Q +#define DGEMM_R DGEMM_DEFAULT_R +#define DGEMM_UNROLL_M DGEMM_DEFAULT_UNROLL_M +#define DGEMM_UNROLL_N DGEMM_DEFAULT_UNROLL_N +#define DGEMM_UNROLL_MN MAX((DGEMM_UNROLL_M), (DGEMM_UNROLL_N)) + +#define QGEMM_P QGEMM_DEFAULT_P +#define QGEMM_Q QGEMM_DEFAULT_Q +#define QGEMM_R QGEMM_DEFAULT_R +#define QGEMM_UNROLL_M QGEMM_DEFAULT_UNROLL_M +#define QGEMM_UNROLL_N QGEMM_DEFAULT_UNROLL_N +#define QGEMM_UNROLL_MN MAX((QGEMM_UNROLL_M), (QGEMM_UNROLL_N)) + +#define CGEMM_P CGEMM_DEFAULT_P +#define CGEMM_Q CGEMM_DEFAULT_Q +#define CGEMM_R CGEMM_DEFAULT_R +#define CGEMM_UNROLL_M CGEMM_DEFAULT_UNROLL_M +#define CGEMM_UNROLL_N CGEMM_DEFAULT_UNROLL_N +#define CGEMM_UNROLL_MN MAX((CGEMM_UNROLL_M), (CGEMM_UNROLL_N)) + +#define ZGEMM_P ZGEMM_DEFAULT_P +#define ZGEMM_Q ZGEMM_DEFAULT_Q +#define ZGEMM_R ZGEMM_DEFAULT_R +#define ZGEMM_UNROLL_M ZGEMM_DEFAULT_UNROLL_M +#define ZGEMM_UNROLL_N ZGEMM_DEFAULT_UNROLL_N +#define ZGEMM_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N)) + +#define XGEMM_P XGEMM_DEFAULT_P +#define XGEMM_Q XGEMM_DEFAULT_Q +#define XGEMM_R XGEMM_DEFAULT_R +#define XGEMM_UNROLL_M XGEMM_DEFAULT_UNROLL_M +#define XGEMM_UNROLL_N XGEMM_DEFAULT_UNROLL_N +#define XGEMM_UNROLL_MN MAX((XGEMM_UNROLL_M), (XGEMM_UNROLL_N)) + +#endif +#endif + +#ifndef COMPLEX +#if defined(XDOUBLE) +#define GEMM_P QGEMM_P +#define GEMM_Q QGEMM_Q +#define GEMM_R QGEMM_R +#define GEMM_UNROLL_M QGEMM_UNROLL_M +#define GEMM_UNROLL_N QGEMM_UNROLL_N +#define GEMM_UNROLL_MN QGEMM_UNROLL_MN +#define GEMM_DEFAULT_P QGEMM_DEFAULT_P +#define GEMM_DEFAULT_Q QGEMM_DEFAULT_Q +#define GEMM_DEFAULT_R QGEMM_DEFAULT_R +#define GEMM_DEFAULT_UNROLL_M QGEMM_DEFAULT_UNROLL_M +#define GEMM_DEFAULT_UNROLL_N QGEMM_DEFAULT_UNROLL_N +#elif defined(DOUBLE) +#define GEMM_P DGEMM_P +#define GEMM_Q DGEMM_Q +#define GEMM_R DGEMM_R +#define GEMM_UNROLL_M DGEMM_UNROLL_M +#define GEMM_UNROLL_N DGEMM_UNROLL_N +#define GEMM_UNROLL_MN DGEMM_UNROLL_MN +#define GEMM_DEFAULT_P DGEMM_DEFAULT_P +#define GEMM_DEFAULT_Q DGEMM_DEFAULT_Q +#define GEMM_DEFAULT_R DGEMM_DEFAULT_R +#define GEMM_DEFAULT_UNROLL_M DGEMM_DEFAULT_UNROLL_M +#define GEMM_DEFAULT_UNROLL_N DGEMM_DEFAULT_UNROLL_N +#else +#define GEMM_P SGEMM_P +#define GEMM_Q SGEMM_Q +#define GEMM_R SGEMM_R +#define GEMM_UNROLL_M SGEMM_UNROLL_M +#define GEMM_UNROLL_N SGEMM_UNROLL_N +#define GEMM_UNROLL_MN SGEMM_UNROLL_MN +#define GEMM_DEFAULT_P SGEMM_DEFAULT_P +#define GEMM_DEFAULT_Q SGEMM_DEFAULT_Q +#define GEMM_DEFAULT_R SGEMM_DEFAULT_R +#define GEMM_DEFAULT_UNROLL_M SGEMM_DEFAULT_UNROLL_M +#define GEMM_DEFAULT_UNROLL_N SGEMM_DEFAULT_UNROLL_N +#endif +#else +#if defined(XDOUBLE) +#define GEMM_P XGEMM_P +#define GEMM_Q XGEMM_Q +#define GEMM_R XGEMM_R +#define GEMM_UNROLL_M XGEMM_UNROLL_M +#define GEMM_UNROLL_N XGEMM_UNROLL_N +#define GEMM_UNROLL_MN XGEMM_UNROLL_MN +#define GEMM_DEFAULT_P XGEMM_DEFAULT_P +#define GEMM_DEFAULT_Q XGEMM_DEFAULT_Q +#define GEMM_DEFAULT_R XGEMM_DEFAULT_R +#define GEMM_DEFAULT_UNROLL_M XGEMM_DEFAULT_UNROLL_M +#define GEMM_DEFAULT_UNROLL_N XGEMM_DEFAULT_UNROLL_N +#elif defined(DOUBLE) +#define GEMM_P ZGEMM_P +#define GEMM_Q ZGEMM_Q +#define GEMM_R ZGEMM_R +#define GEMM_UNROLL_M ZGEMM_UNROLL_M +#define GEMM_UNROLL_N ZGEMM_UNROLL_N +#define GEMM_UNROLL_MN ZGEMM_UNROLL_MN +#define GEMM_DEFAULT_P ZGEMM_DEFAULT_P +#define GEMM_DEFAULT_Q ZGEMM_DEFAULT_Q +#define GEMM_DEFAULT_R ZGEMM_DEFAULT_R +#define GEMM_DEFAULT_UNROLL_M ZGEMM_DEFAULT_UNROLL_M +#define GEMM_DEFAULT_UNROLL_N ZGEMM_DEFAULT_UNROLL_N +#else +#define GEMM_P CGEMM_P +#define GEMM_Q CGEMM_Q +#define GEMM_R CGEMM_R +#define GEMM_UNROLL_M CGEMM_UNROLL_M +#define GEMM_UNROLL_N CGEMM_UNROLL_N +#define GEMM_UNROLL_MN CGEMM_UNROLL_MN +#define GEMM_DEFAULT_P CGEMM_DEFAULT_P +#define GEMM_DEFAULT_Q CGEMM_DEFAULT_Q +#define GEMM_DEFAULT_R CGEMM_DEFAULT_R +#define GEMM_DEFAULT_UNROLL_M CGEMM_DEFAULT_UNROLL_M +#define GEMM_DEFAULT_UNROLL_N CGEMM_DEFAULT_UNROLL_N +#endif +#endif + +#ifdef XDOUBLE +#define GEMM3M_UNROLL_M QGEMM_DEFAULT_UNROLL_M +#define GEMM3M_UNROLL_N QGEMM_DEFAULT_UNROLL_N +#elif defined(DOUBLE) +#define GEMM3M_UNROLL_M DGEMM_DEFAULT_UNROLL_M +#define GEMM3M_UNROLL_N DGEMM_DEFAULT_UNROLL_N +#else +#define GEMM3M_UNROLL_M SGEMM_DEFAULT_UNROLL_M +#define GEMM3M_UNROLL_N SGEMM_DEFAULT_UNROLL_N +#endif + + +#ifndef QGEMM_DEFAULT_UNROLL_M +#define QGEMM_DEFAULT_UNROLL_M 2 +#endif + +#ifndef QGEMM_DEFAULT_UNROLL_N +#define QGEMM_DEFAULT_UNROLL_N 2 +#endif + +#ifndef XGEMM_DEFAULT_UNROLL_M +#define XGEMM_DEFAULT_UNROLL_M 2 +#endif + +#ifndef XGEMM_DEFAULT_UNROLL_N +#define XGEMM_DEFAULT_UNROLL_N 2 +#endif + +#ifndef GEMM_THREAD +#define GEMM_THREAD gemm_thread_n +#endif + +#ifndef SGEMM_DEFAULT_R +#define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15) +#endif + +#ifndef DGEMM_DEFAULT_R +#define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15) +#endif + +#ifndef QGEMM_DEFAULT_R +#define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15) +#endif + +#ifndef CGEMM_DEFAULT_R +#define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15) +#endif + +#ifndef ZGEMM_DEFAULT_R +#define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15) +#endif + +#ifndef XGEMM_DEFAULT_R +#define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15) +#endif + +#ifndef SNUMOPT +#define SNUMOPT 2 +#endif + +#ifndef DNUMOPT +#define DNUMOPT 2 +#endif + +#ifndef QNUMOPT +#define QNUMOPT 1 +#endif + +#ifndef GEMM3M_P +#ifdef XDOUBLE +#define GEMM3M_P QGEMM_P +#elif defined(DOUBLE) +#define GEMM3M_P DGEMM_P +#else +#define GEMM3M_P SGEMM_P +#endif +#endif + +#ifndef GEMM3M_Q +#ifdef XDOUBLE +#define GEMM3M_Q QGEMM_Q +#elif defined(DOUBLE) +#define GEMM3M_Q DGEMM_Q +#else +#define GEMM3M_Q SGEMM_Q +#endif +#endif + +#ifndef GEMM3M_R +#ifdef XDOUBLE +#define GEMM3M_R QGEMM_R +#elif defined(DOUBLE) +#define GEMM3M_R DGEMM_R +#else +#define GEMM3M_R SGEMM_R +#endif +#endif + + +#endif diff --git a/common_power.h b/common_power.h new file mode 100644 index 0000000..34a6153 --- /dev/null +++ b/common_power.h @@ -0,0 +1,795 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_POWER +#define COMMON_POWER + +#define MB __asm__ __volatile__ ("sync") +#define WMB __asm__ __volatile__ ("sync") + +#define INLINE inline + +#ifdef PPC440 +#define STDERR stdout +#define QNONCACHE 0x1 +#define QCOMMS 0x2 +#define QFAST 0x4 +#endif + +#ifndef ASSEMBLER + +void *qalloc(int flags, size_t bytes); + +static void INLINE blas_lock(volatile unsigned long *address){ + + long int ret, val = 1; + + do { + while (*address) {YIELDING;}; + +#if defined(OS_LINUX) || defined(OS_DARWIN) + __asm__ __volatile__ ( + "0: lwarx %0, 0, %1\n" + " cmpwi %0, 0\n" + " bne- 1f\n" + " stwcx. %2,0, %1\n" + " bne- 0b\n" + "1: " + : "=&r"(ret) + : "r"(address), "r" (val) + : "cr0", "memory"); +#else + __asm__ __volatile__ ( + ".machine \"any\"\n" + " lwarx %0, 0, %1\n" + " cmpwi %0, 0\n" + " bne- $+12\n" + " stwcx. %2,0, %1\n" + " bne- $-16\n" + : "=&r"(ret) + : "r"(address), "r" (val) + : "cr0", "memory"); +#endif + } while (ret); +} + +static inline unsigned long rpcc(void){ + unsigned long ret; + +#ifdef OS_AIX + __asm__ __volatile__(".machine \"any\" ;"); +#endif + __asm__ __volatile__ ("mftb %0" : "=r" (ret) : ); + +#if defined(POWER5) || defined(PPC970) + return (ret << 6); +#else + return (ret << 3); +#endif + +} + +#ifdef __64BIT__ +#define RPCC64BIT +#endif + +static inline unsigned long getstackaddr(void){ + unsigned long addr; + + __asm__ __volatile__ ("mr %0, 1" + : "=r"(addr) : : "memory"); + + return addr; +}; + +#if defined(OS_LINUX) || defined(OS_AIX) +#define GET_IMAGE(res) __asm__ __volatile__("fmr %0, 2" : "=f"(res) : : "memory") +#else +#define GET_IMAGE(res) __asm__ __volatile__("fmr %0, f2" : "=f"(res) : : "memory") + +#define GET_IMAGE_CANCEL + +#endif + +#ifdef SMP +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} +#endif + +#endif + + +#ifdef ASSEMBLER + +#ifdef DOUBLE +#define LFD lfd +#define LFDX lfdx +#define LFPDX lfpdx +#define LFSDX lfsdx +#define LFXDX lfxdx +#define LFDU lfdu +#define LFDUX lfdux +#define LFPDUX lfpdux +#define LFSDUX lfsdux +#define LFXDUX lfxdux +#define STFD stfd +#define STFDX stfdx +#define STFPDX stfpdx +#define STFSDX stfsdx +#define STFXDX stfxdx +#define STFDU stfdu +#define STFDUX stfdux +#define STFPDUX stfpdux +#define STFSDUX stfsdux +#define STFXDUX stfxdux +#define FMADD fmadd +#define FMSUB fmsub +#define FNMADD fnmadd +#define FNMSUB fnmsub +#define FMUL fmul +#define FADD fadd +#define FSUB fsub +#else +#define LFD lfs +#define LFDX lfsx +#define LFPDX lfpsx +#define LFSDX lfssx +#define LFXDX lfxsx +#define LFDU lfsu +#define LFDUX lfsux +#define LFPDUX lfpsux +#define LFSDUX lfssux +#define LFXDUX lfxsux +#define STFD stfs +#define STFDX stfsx +#define STFPDX stfpsx +#define STFSDX stfssx +#define STFXDX stfxsx +#define STFDU stfsu +#define STFDUX stfsux +#define STFPDUX stfpsux +#define STFSDUX stfssux +#define STFXDUX stfxsux +#define FMADD fmadds +#define FMSUB fmsubs +#define FNMADD fnmadds +#define FNMSUB fnmsubs +#define FMUL fmuls +#define FADD fadds +#define FSUB fsubs +#endif + +#ifdef __64BIT__ +#define LDLONG ld +#else +#define LDLONG lwz +#endif + +#ifdef OS_DARWIN +#define LL(x) L##x +#endif + +#ifdef OS_LINUX +#define LL(x) .L##x +#endif + +#ifndef LL +#define LL(x) __L##x +#endif + + +#if defined(__64BIT__) && defined(USE64BITINT) +#define LDINT ld +#elif defined(__64BIT__) && !defined(USE64BITINT) +#define LDINT lwa +#else +#define LDINT lwz +#endif + +/* +#define DCBT(REGA, REGB, NUM) .long (0x7c00022c | (REGA << 16) | (REGB << 11) | ((NUM) << 21)) +#define DCBTST(REGA, REGB, NUM) .long (0x7c0001ec | (REGA << 16) | (REGB << 11) | ((NUM) << 21)) +*/ + +#define DSTATTR_H(SIZE, COUNT, STRIDE) ((SIZE << 8) | (COUNT)) +#define DSTATTR_L(SIZE, COUNT, STRIDE) (STRIDE) + +#if defined(PPC970) || defined(POWER3) || defined(POWER4) || defined(POWER5) || defined(PPCG4) +#define HAVE_PREFETCH +#endif + +#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) +#define DCBT_ARG 0 +#else +#define DCBT_ARG 8 +#endif + +#ifdef CELL +#define L1_DUALFETCH +#define L1_PREFETCHSIZE (64 + 128 * 13) +#endif + +#if defined(POWER3) || defined(POWER4) || defined(POWER5) +#define L1_DUALFETCH +#define L1_PREFETCHSIZE (96 + 128 * 12) +#endif + +#if defined(POWER6) +#define L1_DUALFETCH +#define L1_PREFETCHSIZE (16 + 128 * 100) +#define L1_PREFETCH dcbtst +#endif + +#ifndef L1_PREFETCH +#define L1_PREFETCH dcbt +#endif + +#ifndef L1_PREFETCHW +#define L1_PREFETCHW dcbtst +#endif + +#if DCBT_ARG == 0 +#define DCBT(REGA, REGB) L1_PREFETCH REGB, REGA +#define DCBTST(REGA, REGB) L1_PREFETCHW REGB, REGA +#else +#define DCBT(REGA, REGB) L1_PREFETCH DCBT_ARG, REGB, REGA +#define DCBTST(REGA, REGB) L1_PREFETCHW DCBT_ARG, REGB, REGA +#endif + + +#ifndef L1_PREFETCHSIZE +#define L1_PREFETCHSIZE (96 + 128 * 12) +#endif + +#if !defined(OS_DARWIN) || defined(NEEDPARAM) +#define f0 0 +#define f1 1 +#define f2 2 +#define f3 3 +#define f4 4 +#define f5 5 +#define f6 6 +#define f7 7 +#define f8 8 +#define f9 9 +#define f10 10 +#define f11 11 +#define f12 12 +#define f13 13 +#define f14 14 +#define f15 15 +#define f16 16 +#define f17 17 +#define f18 18 +#define f19 19 +#define f20 20 +#define f21 21 +#define f22 22 +#define f23 23 +#define f24 24 +#define f25 25 +#define f26 26 +#define f27 27 +#define f28 28 +#define f29 29 +#define f30 30 +#define f31 31 + +#define r0 0 +#define r1 1 +#define r2 2 +#define r3 3 +#define r4 4 +#define r5 5 +#define r6 6 +#define r7 7 +#define r8 8 +#define r9 9 +#define r10 10 +#define r11 11 +#define r12 12 +#define r13 13 +#define r14 14 +#define r15 15 +#define r16 16 +#define r17 17 +#define r18 18 +#define r19 19 +#define r20 20 +#define r21 21 +#define r22 22 +#define r23 23 +#define r24 24 +#define r25 25 +#define r26 26 +#define r27 27 +#define r28 28 +#define r29 29 +#define r30 30 +#define r31 31 + +#define v0 0 +#define v1 1 +#define v2 2 +#define v3 3 +#define v4 4 +#define v5 5 +#define v6 6 +#define v7 7 +#define v8 8 +#define v9 9 +#define v10 10 +#define v11 11 +#define v12 12 +#define v13 13 +#define v14 14 +#define v15 15 +#define v16 16 +#define v17 17 +#define v18 18 +#define v19 19 +#define v20 20 +#define v21 21 +#define v22 22 +#define v23 23 +#define v24 24 +#define v25 25 +#define v26 26 +#define v27 27 +#define v28 28 +#define v29 29 +#define v30 30 +#define v31 31 + +#define BO_dCTR_NZERO_AND_NOT 0 +#define BO_dCTR_NZERO_AND_NOT_1 1 +#define BO_dCTR_ZERO_AND_NOT 2 +#define BO_dCTR_ZERO_AND_NOT_1 3 +#define BO_IF_NOT 4 +#define BO_IF_NOT_1 5 +#define BO_IF_NOT_2 6 +#define BO_IF_NOT_3 7 +#define BO_dCTR_NZERO_AND 8 +#define BO_dCTR_NZERO_AND_1 9 +#define BO_dCTR_ZERO_AND 10 +#define BO_dCTR_ZERO_AND_1 11 +#define BO_IF 12 +#define BO_IF_1 13 +#define BO_IF_2 14 +#define BO_IF_3 15 +#define BO_dCTR_NZERO 16 +#define BO_dCTR_NZERO_1 17 +#define BO_dCTR_ZERO 18 +#define BO_dCTR_ZERO_1 19 +#define BO_ALWAYS 20 +#define BO_ALWAYS_1 21 +#define BO_ALWAYS_2 22 +#define BO_ALWAYS_3 23 +#define BO_dCTR_NZERO_8 24 +#define BO_dCTR_NZERO_9 25 +#define BO_dCTR_ZERO_8 26 +#define BO_dCTR_ZERO_9 27 +#define BO_ALWAYS_8 28 +#define BO_ALWAYS_9 29 +#define BO_ALWAYS_10 30 +#define BO_ALWAYS_11 31 + +#define CR0_LT 0 +#define CR0_GT 1 +#define CR0_EQ 2 +#define CR0_SO 3 +#define CR1_FX 4 +#define CR1_FEX 5 +#define CR1_VX 6 +#define CR1_OX 7 +#define CR2_LT 8 +#define CR2_GT 9 +#define CR2_EQ 10 +#define CR2_SO 11 +#define CR3_LT 12 +#define CR3_GT 13 +#define CR3_EQ 14 +#define CR3_SO 15 +#define CR4_LT 16 +#define CR4_GT 17 +#define CR4_EQ 18 +#define CR4_SO 19 +#define CR5_LT 20 +#define CR5_GT 21 +#define CR5_EQ 22 +#define CR5_SO 23 +#define CR6_LT 24 +#define CR6_GT 25 +#define CR6_EQ 26 +#define CR6_SO 27 +#define CR7_LT 28 +#define CR7_GT 29 +#define CR7_EQ 30 +#define CR7_SO 31 +#define TO_LT 16 +#define TO_GT 8 +#define TO_EQ 4 +#define TO_LLT 2 +#define TO_LGT 1 +#define CR0 0 +#define CR1 1 +#define CR2 2 +#define CR3 3 +#define CR4 4 +#define CR5 5 +#define CR6 6 +#define CR7 7 +#define cr0 0 +#define cr1 1 +#define cr2 2 +#define cr3 3 +#define cr4 4 +#define cr5 5 +#define cr6 6 +#define cr7 7 +#define VRsave 256 + +#endif + +#define CTR 9 +#define SP r1 + +#ifdef __64BIT__ +#define slwi sldi +#define cmpwi cmpdi +#define srawi sradi +#define mullw mulld +#endif + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif + +#if defined(ASSEMBLER) && !defined(NEEDPARAM) + +#ifdef OS_LINUX +#ifndef __64BIT__ +#define PROLOGUE \ + .section .text;\ + .align 6;\ + .globl REALNAME;\ + .type REALNAME, @function;\ +REALNAME: +#define EPILOGUE .size REALNAME, .-REALNAME +#else +#define PROLOGUE \ + .section .text;\ + .align 5;\ + .globl REALNAME;\ + .section ".opd","aw";\ + .align 3;\ +REALNAME:;\ + .quad .REALNAME, .TOC.@tocbase, 0;\ + .previous;\ + .size REALNAME, 24;\ + .type .REALNAME, @function;\ + .globl .REALNAME;\ +.REALNAME: +#define EPILOGUE \ + .long 0 ; \ + .byte 0,0,0,1,128,0,0,0 ; \ + .size .REALNAME, .-.REALNAME; \ + .section .note.GNU-stack,"",@progbits +#endif + +#ifdef PROFILE +#ifndef __64BIT__ +#define PROFCODE ;\ + .section ".data";\ + .align 2;\ +.LP3:;\ + .long 0;\ + .section ".text";\ + mflr r0;\ + stw r0, 4(SP);\ + lis r12, .LP3@ha;\ + la r0, .LP3@l(r12);\ + bl _mcount;\ + lwz r0, 4(SP);\ + mtlr r0 +#else +#define PROFCODE \ + .globl _mcount; \ + mflr r0; \ + std r0, 16(SP); \ + mr r11, SP; \ + addi SP, SP, -256; \ + std r11, 0(SP); \ + std r3, 128(SP); \ + std r4, 136(SP); \ + std r5, 144(SP); \ + std r6, 152(SP); \ + std r7, 160(SP); \ + std r8, 168(SP); \ + std r9, 176(SP); \ + std r10, 184(SP); \ + stfd f3, 192(SP); \ + stfd f4, 200(SP); \ + bl ._mcount; \ + nop; \ + ld r3, 128(SP);\ + ld r4, 136(SP);\ + ld r5, 144(SP);\ + ld r6, 152(SP);\ + ld r7, 160(SP);\ + ld r8, 168(SP);\ + ld r9, 176(SP);\ + ld r10, 184(SP);\ + lfd f3, 192(SP);\ + lfd f4, 200(SP);\ + addi SP, SP, 256;\ + ld r0, 16(SP);\ + mtlr r0 +#endif +#else +#define PROFCODE +#endif + +#endif + +#if OS_AIX +#ifndef __64BIT__ +#define PROLOGUE \ + .machine "any";\ + .globl .REALNAME;\ + .csect .text[PR],5;\ +.REALNAME:; + +#define EPILOGUE \ +_section_.text:;\ + .csect .data[RW],4;\ + .long _section_.text; + +#else + +#define PROLOGUE \ + .machine "any";\ + .globl .REALNAME;\ + .csect .text[PR], 5;\ +.REALNAME:; + +#define EPILOGUE \ +_section_.text:;\ + .csect .data[RW],4;\ + .llong _section_.text; +#endif + +#define PROFCODE + +#endif + +#ifdef OS_DARWIN +#ifndef __64BIT__ + .macro PROLOGUE + .section __TEXT,__text,regular,pure_instructions + .section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32 + .machine ppc + .text + .align 4 + .globl REALNAME +REALNAME: + .endmacro +#else + .macro PROLOGUE + .section __TEXT,__text,regular,pure_instructions + .section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32 + .machine ppc64 + .text + .align 4 + .globl REALNAME +REALNAME: + .endmacro +#endif + +#ifndef PROFILE +#define PROFCODE +#define EPILOGUE .subsections_via_symbols +#else +#ifndef __64BIT__ + + .macro PROFCODE + mflr r0 + stw r0, 8(SP) + addi SP, SP, -64 + stw SP, 0(SP) + stw r3, 12(SP) + stw r4, 16(SP) + stw r5, 20(SP) + stw r6, 24(SP) + stw r7, 28(SP) + stw r8, 32(SP) + stw r9, 36(SP) + stw r10, 40(SP) + stfd f1, 48(SP) + stfd f2, 56(SP) + mr r3, r0 + bl Lmcount$stub + nop + lwz r3, 12(SP) + lwz r4, 16(SP) + lwz r5, 20(SP) + lwz r6, 24(SP) + lwz r7, 28(SP) + lwz r8, 32(SP) + lwz r9, 36(SP) + lwz r10, 40(SP) + lfd f1, 48(SP) + lfd f2, 56(SP) + addi SP, SP, 64 + lwz r0, 8(SP) + mtlr r0 + .endmacro + + .macro EPILOGUE + .section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32 + .align 5 +Lmcount$stub: + .indirect_symbol mcount + mflr r0 + bcl 20,31,L00000000001$spb +L00000000001$spb: + mflr r11 + addis r11,r11,ha16(Lmcount$lazy_ptr-L00000000001$spb) + mtlr r0 + lwzu r12,lo16(Lmcount$lazy_ptr-L00000000001$spb)(r11) + mtctr r12 + bctr + .lazy_symbol_pointer +Lmcount$lazy_ptr: + .indirect_symbol mcount + .long dyld_stub_binding_helper + .subsections_via_symbols + .endmacro + +#else + .macro PROFCODE + mflr r0 + std r0, 16(SP) + addi SP, SP, -128 + std SP, 0(SP) + std r3, 24(SP) + std r4, 32(SP) + std r5, 40(SP) + std r6, 48(SP) + std r7, 56(SP) + std r8, 64(SP) + std r9, 72(SP) + std r10, 80(SP) + stfd f1, 88(SP) + stfd f2, 96(SP) + mr r3, r0 + bl Lmcount$stub + nop + ld r3, 24(SP) + ld r4, 32(SP) + ld r5, 40(SP) + ld r6, 48(SP) + ld r7, 56(SP) + ld r8, 64(SP) + ld r9, 72(SP) + ld r10, 80(SP) + lfd f1, 88(SP) + lfd f2, 86(SP) + addi SP, SP, 128 + ld r0, 16(SP) + mtlr r0 + .endmacro + + .macro EPILOGUE + .data + .section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32 + .align 5 +Lmcount$stub: + .indirect_symbol mcount + mflr r0 + bcl 20,31,L00000000001$spb +L00000000001$spb: + mflr r11 + addis r11,r11,ha16(Lmcount$lazy_ptr-L00000000001$spb) + mtlr r0 + ld r12,lo16(Lmcount$lazy_ptr-L00000000001$spb)(r11) + mtctr r12 + bctr + .lazy_symbol_pointer +Lmcount$lazy_ptr: + .indirect_symbol mcount + .quad dyld_stub_binding_helper + .subsections_via_symbols + .endmacro +#endif + +#endif + +#endif +#endif + +#endif + +#define HALT mfspr r0, 1023 + +#ifdef OS_LINUX +#if defined(PPC440) || defined(PPC440FP2) +#undef MAX_CPU_NUMBER +#define MAX_CPU_NUMBER 1 +#endif +#if !defined(__64BIT__) && !defined(PROFILE) && !defined(PPC440) && !defined(PPC440FP2) +#define START_ADDRESS (0x0b000000UL) +#else +#define SEEK_ADDRESS +#endif +#endif + +#ifdef OS_AIX +#ifndef __64BIT__ +#define START_ADDRESS (0xf0000000UL) +#else +#define SEEK_ADDRESS +#endif +#endif + +#ifdef OS_DARWIN +#define SEEK_ADDRESS +#endif + +#if defined(PPC440) +#define BUFFER_SIZE ( 2 << 20) +#elif defined(PPC440FP2) +#define BUFFER_SIZE ( 16 << 20) +#else +#define BUFFER_SIZE ( 16 << 20) +#endif + +#ifndef PAGESIZE +#define PAGESIZE ( 4 << 10) +#endif +#define HUGE_PAGESIZE (16 << 20) + +#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif +#endif diff --git a/common_q.h b/common_q.h new file mode 100644 index 0000000..30ad372 --- /dev/null +++ b/common_q.h @@ -0,0 +1,431 @@ +#ifndef COMMON_Q_H +#define COMMON_Q_H + +#ifndef DYNAMIC_ARCH + +#define QAMAX_K qamax_k +#define QAMIN_K qamin_k +#define QMAX_K qmax_k +#define QMIN_K qmin_k +#define IQAMAX_K iqamax_k +#define IQAMIN_K iqamin_k +#define IQMAX_K iqmax_k +#define IQMIN_K iqmin_k +#define QASUM_K qasum_k +#define QAXPYU_K qaxpy_k +#define QAXPYC_K qaxpy_k +#define QCOPY_K qcopy_k +#define QDOTU_K qdot_k +#define QDOTC_K qdot_k +#define QNRM2_K qnrm2_k +#define QSCAL_K qscal_k +#define QSWAP_K qswap_k +#define QROT_K qrot_k + +#define QGEMV_N qgemv_n +#define QGEMV_T qgemv_t +#define QGEMV_R qgemv_n +#define QGEMV_C qgemv_t +#define QGEMV_O qgemv_n +#define QGEMV_U qgemv_t +#define QGEMV_S qgemv_n +#define QGEMV_D qgemv_t + +#define QGERU_K qger_k +#define QGERC_K qger_k +#define QGERV_K qger_k +#define QGERD_K qger_k + +#define QSYMV_U qsymv_U +#define QSYMV_L qsymv_L +#define QSYMV_THREAD_U qsymv_thread_U +#define QSYMV_THREAD_L qsymv_thread_L + +#define QGEMM_ONCOPY qgemm_oncopy +#define QGEMM_OTCOPY qgemm_otcopy + +#if QGEMM_DEFAULT_UNROLL_M == QGEMM_DEFAULT_UNROLL_N +#define QGEMM_INCOPY qgemm_oncopy +#define QGEMM_ITCOPY qgemm_otcopy +#else +#define QGEMM_INCOPY qgemm_incopy +#define QGEMM_ITCOPY qgemm_itcopy +#endif + +#define QTRMM_OUNUCOPY qtrmm_ounucopy +#define QTRMM_OUNNCOPY qtrmm_ounncopy +#define QTRMM_OUTUCOPY qtrmm_outucopy +#define QTRMM_OUTNCOPY qtrmm_outncopy +#define QTRMM_OLNUCOPY qtrmm_olnucopy +#define QTRMM_OLNNCOPY qtrmm_olnncopy +#define QTRMM_OLTUCOPY qtrmm_oltucopy +#define QTRMM_OLTNCOPY qtrmm_oltncopy + +#define QTRSM_OUNUCOPY qtrsm_ounucopy +#define QTRSM_OUNNCOPY qtrsm_ounncopy +#define QTRSM_OUTUCOPY qtrsm_outucopy +#define QTRSM_OUTNCOPY qtrsm_outncopy +#define QTRSM_OLNUCOPY qtrsm_olnucopy +#define QTRSM_OLNNCOPY qtrsm_olnncopy +#define QTRSM_OLTUCOPY qtrsm_oltucopy +#define QTRSM_OLTNCOPY qtrsm_oltncopy + +#if QGEMM_DEFAULT_UNROLL_M == QGEMM_DEFAULT_UNROLL_N +#define QTRMM_IUNUCOPY qtrmm_ounucopy +#define QTRMM_IUNNCOPY qtrmm_ounncopy +#define QTRMM_IUTUCOPY qtrmm_outucopy +#define QTRMM_IUTNCOPY qtrmm_outncopy +#define QTRMM_ILNUCOPY qtrmm_olnucopy +#define QTRMM_ILNNCOPY qtrmm_olnncopy +#define QTRMM_ILTUCOPY qtrmm_oltucopy +#define QTRMM_ILTNCOPY qtrmm_oltncopy + +#define QTRSM_IUNUCOPY qtrsm_ounucopy +#define QTRSM_IUNNCOPY qtrsm_ounncopy +#define QTRSM_IUTUCOPY qtrsm_outucopy +#define QTRSM_IUTNCOPY qtrsm_outncopy +#define QTRSM_ILNUCOPY qtrsm_olnucopy +#define QTRSM_ILNNCOPY qtrsm_olnncopy +#define QTRSM_ILTUCOPY qtrsm_oltucopy +#define QTRSM_ILTNCOPY qtrsm_oltncopy +#else +#define QTRMM_IUNUCOPY qtrmm_iunucopy +#define QTRMM_IUNNCOPY qtrmm_iunncopy +#define QTRMM_IUTUCOPY qtrmm_iutucopy +#define QTRMM_IUTNCOPY qtrmm_iutncopy +#define QTRMM_ILNUCOPY qtrmm_ilnucopy +#define QTRMM_ILNNCOPY qtrmm_ilnncopy +#define QTRMM_ILTUCOPY qtrmm_iltucopy +#define QTRMM_ILTNCOPY qtrmm_iltncopy + +#define QTRSM_IUNUCOPY qtrsm_iunucopy +#define QTRSM_IUNNCOPY qtrsm_iunncopy +#define QTRSM_IUTUCOPY qtrsm_iutucopy +#define QTRSM_IUTNCOPY qtrsm_iutncopy +#define QTRSM_ILNUCOPY qtrsm_ilnucopy +#define QTRSM_ILNNCOPY qtrsm_ilnncopy +#define QTRSM_ILTUCOPY qtrsm_iltucopy +#define QTRSM_ILTNCOPY qtrsm_iltncopy +#endif + +#define QGEMM_BETA qgemm_beta + +#define QGEMM_KERNEL qgemm_kernel + +#define QTRMM_KERNEL_LN qtrmm_kernel_LN +#define QTRMM_KERNEL_LT qtrmm_kernel_LT +#define QTRMM_KERNEL_LR qtrmm_kernel_LN +#define QTRMM_KERNEL_LC qtrmm_kernel_LT +#define QTRMM_KERNEL_RN qtrmm_kernel_RN +#define QTRMM_KERNEL_RT qtrmm_kernel_RT +#define QTRMM_KERNEL_RR qtrmm_kernel_RN +#define QTRMM_KERNEL_RC qtrmm_kernel_RT + +#define QTRSM_KERNEL_LN qtrsm_kernel_LN +#define QTRSM_KERNEL_LT qtrsm_kernel_LT +#define QTRSM_KERNEL_LR qtrsm_kernel_LN +#define QTRSM_KERNEL_LC qtrsm_kernel_LT +#define QTRSM_KERNEL_RN qtrsm_kernel_RN +#define QTRSM_KERNEL_RT qtrsm_kernel_RT +#define QTRSM_KERNEL_RR qtrsm_kernel_RN +#define QTRSM_KERNEL_RC qtrsm_kernel_RT + +#define QSYMM_OUTCOPY qsymm_outcopy +#define QSYMM_OLTCOPY qsymm_oltcopy +#if QGEMM_DEFAULT_UNROLL_M == QGEMM_DEFAULT_UNROLL_N +#define QSYMM_IUTCOPY qsymm_outcopy +#define QSYMM_ILTCOPY qsymm_oltcopy +#else +#define QSYMM_IUTCOPY qsymm_iutcopy +#define QSYMM_ILTCOPY qsymm_iltcopy +#endif + +#define QNEG_TCOPY qneg_tcopy +#define QLASWP_NCOPY qlaswp_ncopy + +#else + +#define QAMAX_K gotoblas -> qamax_k +#define QAMIN_K gotoblas -> qamin_k +#define QMAX_K gotoblas -> qmax_k +#define QMIN_K gotoblas -> qmin_k +#define IQAMAX_K gotoblas -> iqamax_k +#define IQAMIN_K gotoblas -> iqamin_k +#define IQMAX_K gotoblas -> iqmax_k +#define IQMIN_K gotoblas -> iqmin_k +#define QASUM_K gotoblas -> qasum_k +#define QAXPYU_K gotoblas -> qaxpy_k +#define QAXPYC_K gotoblas -> qaxpy_k +#define QCOPY_K gotoblas -> qcopy_k +#define QDOTU_K gotoblas -> qdot_k +#define QDOTC_K gotoblas -> qdot_k +#define QNRM2_K gotoblas -> qnrm2_k +#define QSCAL_K gotoblas -> qscal_k +#define QSWAP_K gotoblas -> qswap_k +#define QROT_K gotoblas -> qrot_k + +#define QGEMV_N gotoblas -> qgemv_n +#define QGEMV_T gotoblas -> qgemv_t +#define QGEMV_R gotoblas -> qgemv_n +#define QGEMV_C gotoblas -> qgemv_t +#define QGEMV_O gotoblas -> qgemv_n +#define QGEMV_U gotoblas -> qgemv_t +#define QGEMV_S gotoblas -> qgemv_n +#define QGEMV_D gotoblas -> qgemv_t + +#define QGERU_K gotoblas -> qger_k +#define QGERC_K gotoblas -> qger_k +#define QGERV_K gotoblas -> qger_k +#define QGERD_K gotoblas -> qger_k + +#define QSYMV_U gotoblas -> qsymv_U +#define QSYMV_L gotoblas -> qsymv_L + +#define QSYMV_THREAD_U qsymv_thread_U +#define QSYMV_THREAD_L qsymv_thread_L + +#define QGEMM_ONCOPY gotoblas -> qgemm_oncopy +#define QGEMM_OTCOPY gotoblas -> qgemm_otcopy +#define QGEMM_INCOPY gotoblas -> qgemm_incopy +#define QGEMM_ITCOPY gotoblas -> qgemm_itcopy + +#define QTRMM_OUNUCOPY gotoblas -> qtrmm_ounucopy +#define QTRMM_OUTUCOPY gotoblas -> qtrmm_outucopy +#define QTRMM_OLNUCOPY gotoblas -> qtrmm_olnucopy +#define QTRMM_OLTUCOPY gotoblas -> qtrmm_oltucopy +#define QTRSM_OUNUCOPY gotoblas -> qtrsm_ounucopy +#define QTRSM_OUTUCOPY gotoblas -> qtrsm_outucopy +#define QTRSM_OLNUCOPY gotoblas -> qtrsm_olnucopy +#define QTRSM_OLTUCOPY gotoblas -> qtrsm_oltucopy + +#define QTRMM_IUNUCOPY gotoblas -> qtrmm_iunucopy +#define QTRMM_IUTUCOPY gotoblas -> qtrmm_iutucopy +#define QTRMM_ILNUCOPY gotoblas -> qtrmm_ilnucopy +#define QTRMM_ILTUCOPY gotoblas -> qtrmm_iltucopy +#define QTRSM_IUNUCOPY gotoblas -> qtrsm_iunucopy +#define QTRSM_IUTUCOPY gotoblas -> qtrsm_iutucopy +#define QTRSM_ILNUCOPY gotoblas -> qtrsm_ilnucopy +#define QTRSM_ILTUCOPY gotoblas -> qtrsm_iltucopy + +#define QTRMM_OUNNCOPY gotoblas -> qtrmm_ounncopy +#define QTRMM_OUTNCOPY gotoblas -> qtrmm_outncopy +#define QTRMM_OLNNCOPY gotoblas -> qtrmm_olnncopy +#define QTRMM_OLTNCOPY gotoblas -> qtrmm_oltncopy +#define QTRSM_OUNNCOPY gotoblas -> qtrsm_ounncopy +#define QTRSM_OUTNCOPY gotoblas -> qtrsm_outncopy +#define QTRSM_OLNNCOPY gotoblas -> qtrsm_olnncopy +#define QTRSM_OLTNCOPY gotoblas -> qtrsm_oltncopy + +#define QTRMM_IUNNCOPY gotoblas -> qtrmm_iunncopy +#define QTRMM_IUTNCOPY gotoblas -> qtrmm_iutncopy +#define QTRMM_ILNNCOPY gotoblas -> qtrmm_ilnncopy +#define QTRMM_ILTNCOPY gotoblas -> qtrmm_iltncopy +#define QTRSM_IUNNCOPY gotoblas -> qtrsm_iunncopy +#define QTRSM_IUTNCOPY gotoblas -> qtrsm_iutncopy +#define QTRSM_ILNNCOPY gotoblas -> qtrsm_ilnncopy +#define QTRSM_ILTNCOPY gotoblas -> qtrsm_iltncopy + +#define QGEMM_BETA gotoblas -> qgemm_beta +#define QGEMM_KERNEL gotoblas -> qgemm_kernel + +#define QTRMM_KERNEL_LN gotoblas -> qtrmm_kernel_LN +#define QTRMM_KERNEL_LT gotoblas -> qtrmm_kernel_LT +#define QTRMM_KERNEL_LR gotoblas -> qtrmm_kernel_LN +#define QTRMM_KERNEL_LC gotoblas -> qtrmm_kernel_LT +#define QTRMM_KERNEL_RN gotoblas -> qtrmm_kernel_RN +#define QTRMM_KERNEL_RT gotoblas -> qtrmm_kernel_RT +#define QTRMM_KERNEL_RR gotoblas -> qtrmm_kernel_RN +#define QTRMM_KERNEL_RC gotoblas -> qtrmm_kernel_RT + +#define QTRSM_KERNEL_LN gotoblas -> qtrsm_kernel_LN +#define QTRSM_KERNEL_LT gotoblas -> qtrsm_kernel_LT +#define QTRSM_KERNEL_LR gotoblas -> qtrsm_kernel_LN +#define QTRSM_KERNEL_LC gotoblas -> qtrsm_kernel_LT +#define QTRSM_KERNEL_RN gotoblas -> qtrsm_kernel_RN +#define QTRSM_KERNEL_RT gotoblas -> qtrsm_kernel_RT +#define QTRSM_KERNEL_RR gotoblas -> qtrsm_kernel_RN +#define QTRSM_KERNEL_RC gotoblas -> qtrsm_kernel_RT + +#define QSYMM_IUTCOPY gotoblas -> qsymm_iutcopy +#define QSYMM_ILTCOPY gotoblas -> qsymm_iltcopy +#define QSYMM_OUTCOPY gotoblas -> qsymm_outcopy +#define QSYMM_OLTCOPY gotoblas -> qsymm_oltcopy + +#define QNEG_TCOPY gotoblas -> qneg_tcopy +#define QLASWP_NCOPY gotoblas -> qlaswp_ncopy + +#endif + +#define QGEMM_NN qgemm_nn +#define QGEMM_CN qgemm_tn +#define QGEMM_TN qgemm_tn +#define QGEMM_NC qgemm_nt +#define QGEMM_NT qgemm_nt +#define QGEMM_CC qgemm_tt +#define QGEMM_CT qgemm_tt +#define QGEMM_TC qgemm_tt +#define QGEMM_TT qgemm_tt +#define QGEMM_NR qgemm_nn +#define QGEMM_TR qgemm_tn +#define QGEMM_CR qgemm_tn +#define QGEMM_RN qgemm_nn +#define QGEMM_RT qgemm_nt +#define QGEMM_RC qgemm_nt +#define QGEMM_RR qgemm_nn + +#define QSYMM_LU qsymm_LU +#define QSYMM_LL qsymm_LL +#define QSYMM_RU qsymm_RU +#define QSYMM_RL qsymm_RL + +#define QHEMM_LU qhemm_LU +#define QHEMM_LL qhemm_LL +#define QHEMM_RU qhemm_RU +#define QHEMM_RL qhemm_RL + +#define QSYRK_UN qsyrk_UN +#define QSYRK_UT qsyrk_UT +#define QSYRK_LN qsyrk_LN +#define QSYRK_LT qsyrk_LT +#define QSYRK_UR qsyrk_UN +#define QSYRK_UC qsyrk_UT +#define QSYRK_LR qsyrk_LN +#define QSYRK_LC qsyrk_LT + +#define QSYRK_KERNEL_U qsyrk_kernel_U +#define QSYRK_KERNEL_L qsyrk_kernel_L + +#define QHERK_UN qsyrk_UN +#define QHERK_LN qsyrk_LN +#define QHERK_UC qsyrk_UT +#define QHERK_LC qsyrk_LT + +#define QHER2K_UN qsyr2k_UN +#define QHER2K_LN qsyr2k_LN +#define QHER2K_UC qsyr2k_UT +#define QHER2K_LC qsyr2k_LT + +#define QSYR2K_UN qsyr2k_UN +#define QSYR2K_UT qsyr2k_UT +#define QSYR2K_LN qsyr2k_LN +#define QSYR2K_LT qsyr2k_LT +#define QSYR2K_UR qsyr2k_UN +#define QSYR2K_UC qsyr2k_UT +#define QSYR2K_LR qsyr2k_LN +#define QSYR2K_LC qsyr2k_LT + +#define QSYR2K_KERNEL_U qsyr2k_kernel_U +#define QSYR2K_KERNEL_L qsyr2k_kernel_L + +#define QTRMM_LNUU qtrmm_LNUU +#define QTRMM_LNUN qtrmm_LNUN +#define QTRMM_LNLU qtrmm_LNLU +#define QTRMM_LNLN qtrmm_LNLN +#define QTRMM_LTUU qtrmm_LTUU +#define QTRMM_LTUN qtrmm_LTUN +#define QTRMM_LTLU qtrmm_LTLU +#define QTRMM_LTLN qtrmm_LTLN +#define QTRMM_LRUU qtrmm_LNUU +#define QTRMM_LRUN qtrmm_LNUN +#define QTRMM_LRLU qtrmm_LNLU +#define QTRMM_LRLN qtrmm_LNLN +#define QTRMM_LCUU qtrmm_LTUU +#define QTRMM_LCUN qtrmm_LTUN +#define QTRMM_LCLU qtrmm_LTLU +#define QTRMM_LCLN qtrmm_LTLN +#define QTRMM_RNUU qtrmm_RNUU +#define QTRMM_RNUN qtrmm_RNUN +#define QTRMM_RNLU qtrmm_RNLU +#define QTRMM_RNLN qtrmm_RNLN +#define QTRMM_RTUU qtrmm_RTUU +#define QTRMM_RTUN qtrmm_RTUN +#define QTRMM_RTLU qtrmm_RTLU +#define QTRMM_RTLN qtrmm_RTLN +#define QTRMM_RRUU qtrmm_RNUU +#define QTRMM_RRUN qtrmm_RNUN +#define QTRMM_RRLU qtrmm_RNLU +#define QTRMM_RRLN qtrmm_RNLN +#define QTRMM_RCUU qtrmm_RTUU +#define QTRMM_RCUN qtrmm_RTUN +#define QTRMM_RCLU qtrmm_RTLU +#define QTRMM_RCLN qtrmm_RTLN + +#define QTRSM_LNUU qtrsm_LNUU +#define QTRSM_LNUN qtrsm_LNUN +#define QTRSM_LNLU qtrsm_LNLU +#define QTRSM_LNLN qtrsm_LNLN +#define QTRSM_LTUU qtrsm_LTUU +#define QTRSM_LTUN qtrsm_LTUN +#define QTRSM_LTLU qtrsm_LTLU +#define QTRSM_LTLN qtrsm_LTLN +#define QTRSM_LRUU qtrsm_LNUU +#define QTRSM_LRUN qtrsm_LNUN +#define QTRSM_LRLU qtrsm_LNLU +#define QTRSM_LRLN qtrsm_LNLN +#define QTRSM_LCUU qtrsm_LTUU +#define QTRSM_LCUN qtrsm_LTUN +#define QTRSM_LCLU qtrsm_LTLU +#define QTRSM_LCLN qtrsm_LTLN +#define QTRSM_RNUU qtrsm_RNUU +#define QTRSM_RNUN qtrsm_RNUN +#define QTRSM_RNLU qtrsm_RNLU +#define QTRSM_RNLN qtrsm_RNLN +#define QTRSM_RTUU qtrsm_RTUU +#define QTRSM_RTUN qtrsm_RTUN +#define QTRSM_RTLU qtrsm_RTLU +#define QTRSM_RTLN qtrsm_RTLN +#define QTRSM_RRUU qtrsm_RNUU +#define QTRSM_RRUN qtrsm_RNUN +#define QTRSM_RRLU qtrsm_RNLU +#define QTRSM_RRLN qtrsm_RNLN +#define QTRSM_RCUU qtrsm_RTUU +#define QTRSM_RCUN qtrsm_RTUN +#define QTRSM_RCLU qtrsm_RTLU +#define QTRSM_RCLN qtrsm_RTLN + +#define QGEMM_THREAD_NN qgemm_thread_nn +#define QGEMM_THREAD_CN qgemm_thread_tn +#define QGEMM_THREAD_TN qgemm_thread_tn +#define QGEMM_THREAD_NC qgemm_thread_nt +#define QGEMM_THREAD_NT qgemm_thread_nt +#define QGEMM_THREAD_CC qgemm_thread_tt +#define QGEMM_THREAD_CT qgemm_thread_tt +#define QGEMM_THREAD_TC qgemm_thread_tt +#define QGEMM_THREAD_TT qgemm_thread_tt +#define QGEMM_THREAD_NR qgemm_thread_nn +#define QGEMM_THREAD_TR qgemm_thread_tn +#define QGEMM_THREAD_CR qgemm_thread_tn +#define QGEMM_THREAD_RN qgemm_thread_nn +#define QGEMM_THREAD_RT qgemm_thread_nt +#define QGEMM_THREAD_RC qgemm_thread_nt +#define QGEMM_THREAD_RR qgemm_thread_nn + +#define QSYMM_THREAD_LU qsymm_thread_LU +#define QSYMM_THREAD_LL qsymm_thread_LL +#define QSYMM_THREAD_RU qsymm_thread_RU +#define QSYMM_THREAD_RL qsymm_thread_RL + +#define QHEMM_THREAD_LU qhemm_thread_LU +#define QHEMM_THREAD_LL qhemm_thread_LL +#define QHEMM_THREAD_RU qhemm_thread_RU +#define QHEMM_THREAD_RL qhemm_thread_RL + +#define QSYRK_THREAD_UN qsyrk_thread_UN +#define QSYRK_THREAD_UT qsyrk_thread_UT +#define QSYRK_THREAD_LN qsyrk_thread_LN +#define QSYRK_THREAD_LT qsyrk_thread_LT +#define QSYRK_THREAD_UR qsyrk_thread_UN +#define QSYRK_THREAD_UC qsyrk_thread_UT +#define QSYRK_THREAD_LR qsyrk_thread_LN +#define QSYRK_THREAD_LC qsyrk_thread_LT + +#define QHERK_THREAD_UN qsyrk_thread_UN +#define QHERK_THREAD_UT qsyrk_thread_UT +#define QHERK_THREAD_LN qsyrk_thread_LN +#define QHERK_THREAD_LT qsyrk_thread_LT +#define QHERK_THREAD_UR qsyrk_thread_UN +#define QHERK_THREAD_UC qsyrk_thread_UT +#define QHERK_THREAD_LR qsyrk_thread_LN +#define QHERK_THREAD_LC qsyrk_thread_LT + +#endif diff --git a/common_reference.h b/common_reference.h new file mode 100644 index 0000000..e69de29 diff --git a/common_s.h b/common_s.h new file mode 100644 index 0000000..db8d69a --- /dev/null +++ b/common_s.h @@ -0,0 +1,436 @@ +#ifndef COMMON_S_H +#define COMMON_S_H + +#ifndef DYNAMIC_ARCH + +#define SAMAX_K samax_k +#define SAMIN_K samin_k +#define SMAX_K smax_k +#define SMIN_K smin_k +#define ISAMAX_K isamax_k +#define ISAMIN_K isamin_k +#define ISMAX_K ismax_k +#define ISMIN_K ismin_k +#define SASUM_K sasum_k +#define SAXPYU_K saxpy_k +#define SAXPYC_K saxpy_k +#define SCOPY_K scopy_k +#define SDOTU_K sdot_k +#define SDOTC_K sdot_k +#define SDSDOT_K sdot_k +#define DSDOT_K dsdot_k +#define SNRM2_K snrm2_k +#define SSCAL_K sscal_k +#define SSWAP_K sswap_k +#define SROT_K srot_k + +#define SGEMV_N sgemv_n +#define SGEMV_T sgemv_t +#define SGEMV_R sgemv_n +#define SGEMV_C sgemv_t +#define SGEMV_O sgemv_n +#define SGEMV_U sgemv_t +#define SGEMV_S sgemv_n +#define SGEMV_D sgemv_t + +#define SGERU_K sger_k +#define SGERC_K sger_k +#define SGERV_K sger_k +#define SGERD_K sger_k + +#define SSYMV_U ssymv_U +#define SSYMV_L ssymv_L + +#define SSYMV_THREAD_U ssymv_thread_U +#define SSYMV_THREAD_L ssymv_thread_L + +#define SGEMM_ONCOPY sgemm_oncopy +#define SGEMM_OTCOPY sgemm_otcopy + +#if SGEMM_DEFAULT_UNROLL_M == SGEMM_DEFAULT_UNROLL_N +#define SGEMM_INCOPY sgemm_oncopy +#define SGEMM_ITCOPY sgemm_otcopy +#else +#define SGEMM_INCOPY sgemm_incopy +#define SGEMM_ITCOPY sgemm_itcopy +#endif + +#define STRMM_OUNUCOPY strmm_ounucopy +#define STRMM_OUNNCOPY strmm_ounncopy +#define STRMM_OUTUCOPY strmm_outucopy +#define STRMM_OUTNCOPY strmm_outncopy +#define STRMM_OLNUCOPY strmm_olnucopy +#define STRMM_OLNNCOPY strmm_olnncopy +#define STRMM_OLTUCOPY strmm_oltucopy +#define STRMM_OLTNCOPY strmm_oltncopy + +#define STRSM_OUNUCOPY strsm_ounucopy +#define STRSM_OUNNCOPY strsm_ounncopy +#define STRSM_OUTUCOPY strsm_outucopy +#define STRSM_OUTNCOPY strsm_outncopy +#define STRSM_OLNUCOPY strsm_olnucopy +#define STRSM_OLNNCOPY strsm_olnncopy +#define STRSM_OLTUCOPY strsm_oltucopy +#define STRSM_OLTNCOPY strsm_oltncopy + +#if SGEMM_DEFAULT_UNROLL_M == SGEMM_DEFAULT_UNROLL_N +#define STRMM_IUNUCOPY strmm_ounucopy +#define STRMM_IUNNCOPY strmm_ounncopy +#define STRMM_IUTUCOPY strmm_outucopy +#define STRMM_IUTNCOPY strmm_outncopy +#define STRMM_ILNUCOPY strmm_olnucopy +#define STRMM_ILNNCOPY strmm_olnncopy +#define STRMM_ILTUCOPY strmm_oltucopy +#define STRMM_ILTNCOPY strmm_oltncopy + +#define STRSM_IUNUCOPY strsm_ounucopy +#define STRSM_IUNNCOPY strsm_ounncopy +#define STRSM_IUTUCOPY strsm_outucopy +#define STRSM_IUTNCOPY strsm_outncopy +#define STRSM_ILNUCOPY strsm_olnucopy +#define STRSM_ILNNCOPY strsm_olnncopy +#define STRSM_ILTUCOPY strsm_oltucopy +#define STRSM_ILTNCOPY strsm_oltncopy +#else +#define STRMM_IUNUCOPY strmm_iunucopy +#define STRMM_IUNNCOPY strmm_iunncopy +#define STRMM_IUTUCOPY strmm_iutucopy +#define STRMM_IUTNCOPY strmm_iutncopy +#define STRMM_ILNUCOPY strmm_ilnucopy +#define STRMM_ILNNCOPY strmm_ilnncopy +#define STRMM_ILTUCOPY strmm_iltucopy +#define STRMM_ILTNCOPY strmm_iltncopy + +#define STRSM_IUNUCOPY strsm_iunucopy +#define STRSM_IUNNCOPY strsm_iunncopy +#define STRSM_IUTUCOPY strsm_iutucopy +#define STRSM_IUTNCOPY strsm_iutncopy +#define STRSM_ILNUCOPY strsm_ilnucopy +#define STRSM_ILNNCOPY strsm_ilnncopy +#define STRSM_ILTUCOPY strsm_iltucopy +#define STRSM_ILTNCOPY strsm_iltncopy +#endif + +#define SGEMM_BETA sgemm_beta + +#define SGEMM_KERNEL sgemm_kernel + +#define STRMM_KERNEL_LN strmm_kernel_LN +#define STRMM_KERNEL_LT strmm_kernel_LT +#define STRMM_KERNEL_LR strmm_kernel_LN +#define STRMM_KERNEL_LC strmm_kernel_LT +#define STRMM_KERNEL_RN strmm_kernel_RN +#define STRMM_KERNEL_RT strmm_kernel_RT +#define STRMM_KERNEL_RR strmm_kernel_RN +#define STRMM_KERNEL_RC strmm_kernel_RT + +#define STRSM_KERNEL_LN strsm_kernel_LN +#define STRSM_KERNEL_LT strsm_kernel_LT +#define STRSM_KERNEL_LR strsm_kernel_LN +#define STRSM_KERNEL_LC strsm_kernel_LT +#define STRSM_KERNEL_RN strsm_kernel_RN +#define STRSM_KERNEL_RT strsm_kernel_RT +#define STRSM_KERNEL_RR strsm_kernel_RN +#define STRSM_KERNEL_RC strsm_kernel_RT + +#define SSYMM_OUTCOPY ssymm_outcopy +#define SSYMM_OLTCOPY ssymm_oltcopy +#if SGEMM_DEFAULT_UNROLL_M == SGEMM_DEFAULT_UNROLL_N +#define SSYMM_IUTCOPY ssymm_outcopy +#define SSYMM_ILTCOPY ssymm_oltcopy +#else +#define SSYMM_IUTCOPY ssymm_iutcopy +#define SSYMM_ILTCOPY ssymm_iltcopy +#endif + +#define SNEG_TCOPY sneg_tcopy +#define SLASWP_NCOPY slaswp_ncopy + +#else + +#define SAMAX_K gotoblas -> samax_k +#define SAMIN_K gotoblas -> samin_k +#define SMAX_K gotoblas -> smax_k +#define SMIN_K gotoblas -> smin_k +#define ISAMAX_K gotoblas -> isamax_k +#define ISAMIN_K gotoblas -> isamin_k +#define ISMAX_K gotoblas -> ismax_k +#define ISMIN_K gotoblas -> ismin_k +#define SASUM_K gotoblas -> sasum_k +#define SAXPYU_K gotoblas -> saxpy_k +#define SAXPYC_K gotoblas -> saxpy_k +#define SCOPY_K gotoblas -> scopy_k +#define SDOTU_K gotoblas -> sdot_k +#define SDOTC_K gotoblas -> sdot_k +#define SDSDOT_K gotoblas -> sdot_k +#define DSDOT_K gotoblas -> dsdot_k +#define SNRM2_K gotoblas -> snrm2_k +#define SSCAL_K gotoblas -> sscal_k +#define SSWAP_K gotoblas -> sswap_k +#define SROT_K gotoblas -> srot_k + +#define SGEMV_N gotoblas -> sgemv_n +#define SGEMV_T gotoblas -> sgemv_t +#define SGEMV_R gotoblas -> sgemv_n +#define SGEMV_C gotoblas -> sgemv_t +#define SGEMV_O gotoblas -> sgemv_n +#define SGEMV_U gotoblas -> sgemv_t +#define SGEMV_S gotoblas -> sgemv_n +#define SGEMV_D gotoblas -> sgemv_t + +#define SGERU_K gotoblas -> sger_k +#define SGERC_K gotoblas -> sger_k +#define SGERV_K gotoblas -> sger_k +#define SGERD_K gotoblas -> sger_k + +#define SSYMV_U gotoblas -> ssymv_U +#define SSYMV_L gotoblas -> ssymv_L + +#define SSYMV_THREAD_U ssymv_thread_U +#define SSYMV_THREAD_L ssymv_thread_L + +#define SGEMM_ONCOPY gotoblas -> sgemm_oncopy +#define SGEMM_OTCOPY gotoblas -> sgemm_otcopy +#define SGEMM_INCOPY gotoblas -> sgemm_incopy +#define SGEMM_ITCOPY gotoblas -> sgemm_itcopy + +#define STRMM_OUNUCOPY gotoblas -> strmm_ounucopy +#define STRMM_OUTUCOPY gotoblas -> strmm_outucopy +#define STRMM_OLNUCOPY gotoblas -> strmm_olnucopy +#define STRMM_OLTUCOPY gotoblas -> strmm_oltucopy +#define STRSM_OUNUCOPY gotoblas -> strsm_ounucopy +#define STRSM_OUTUCOPY gotoblas -> strsm_outucopy +#define STRSM_OLNUCOPY gotoblas -> strsm_olnucopy +#define STRSM_OLTUCOPY gotoblas -> strsm_oltucopy + +#define STRMM_IUNUCOPY gotoblas -> strmm_iunucopy +#define STRMM_IUTUCOPY gotoblas -> strmm_iutucopy +#define STRMM_ILNUCOPY gotoblas -> strmm_ilnucopy +#define STRMM_ILTUCOPY gotoblas -> strmm_iltucopy +#define STRSM_IUNUCOPY gotoblas -> strsm_iunucopy +#define STRSM_IUTUCOPY gotoblas -> strsm_iutucopy +#define STRSM_ILNUCOPY gotoblas -> strsm_ilnucopy +#define STRSM_ILTUCOPY gotoblas -> strsm_iltucopy + +#define STRMM_OUNNCOPY gotoblas -> strmm_ounncopy +#define STRMM_OUTNCOPY gotoblas -> strmm_outncopy +#define STRMM_OLNNCOPY gotoblas -> strmm_olnncopy +#define STRMM_OLTNCOPY gotoblas -> strmm_oltncopy +#define STRSM_OUNNCOPY gotoblas -> strsm_ounncopy +#define STRSM_OUTNCOPY gotoblas -> strsm_outncopy +#define STRSM_OLNNCOPY gotoblas -> strsm_olnncopy +#define STRSM_OLTNCOPY gotoblas -> strsm_oltncopy + +#define STRMM_IUNNCOPY gotoblas -> strmm_iunncopy +#define STRMM_IUTNCOPY gotoblas -> strmm_iutncopy +#define STRMM_ILNNCOPY gotoblas -> strmm_ilnncopy +#define STRMM_ILTNCOPY gotoblas -> strmm_iltncopy +#define STRSM_IUNNCOPY gotoblas -> strsm_iunncopy +#define STRSM_IUTNCOPY gotoblas -> strsm_iutncopy +#define STRSM_ILNNCOPY gotoblas -> strsm_ilnncopy +#define STRSM_ILTNCOPY gotoblas -> strsm_iltncopy + +#define SGEMM_BETA gotoblas -> sgemm_beta +#define SGEMM_KERNEL gotoblas -> sgemm_kernel + +#define STRMM_KERNEL_LN gotoblas -> strmm_kernel_LN +#define STRMM_KERNEL_LT gotoblas -> strmm_kernel_LT +#define STRMM_KERNEL_LR gotoblas -> strmm_kernel_LN +#define STRMM_KERNEL_LC gotoblas -> strmm_kernel_LT +#define STRMM_KERNEL_RN gotoblas -> strmm_kernel_RN +#define STRMM_KERNEL_RT gotoblas -> strmm_kernel_RT +#define STRMM_KERNEL_RR gotoblas -> strmm_kernel_RN +#define STRMM_KERNEL_RC gotoblas -> strmm_kernel_RT + +#define STRSM_KERNEL_LN gotoblas -> strsm_kernel_LN +#define STRSM_KERNEL_LT gotoblas -> strsm_kernel_LT +#define STRSM_KERNEL_LR gotoblas -> strsm_kernel_LN +#define STRSM_KERNEL_LC gotoblas -> strsm_kernel_LT +#define STRSM_KERNEL_RN gotoblas -> strsm_kernel_RN +#define STRSM_KERNEL_RT gotoblas -> strsm_kernel_RT +#define STRSM_KERNEL_RR gotoblas -> strsm_kernel_RN +#define STRSM_KERNEL_RC gotoblas -> strsm_kernel_RT + +#define SSYMM_IUTCOPY gotoblas -> ssymm_iutcopy +#define SSYMM_ILTCOPY gotoblas -> ssymm_iltcopy +#define SSYMM_OUTCOPY gotoblas -> ssymm_outcopy +#define SSYMM_OLTCOPY gotoblas -> ssymm_oltcopy + +#define SNEG_TCOPY gotoblas -> sneg_tcopy +#define SLASWP_NCOPY gotoblas -> slaswp_ncopy + +#endif + +#define SGEMM_NN sgemm_nn +#define SGEMM_CN sgemm_tn +#define SGEMM_TN sgemm_tn +#define SGEMM_NC sgemm_nt +#define SGEMM_NT sgemm_nt +#define SGEMM_CC sgemm_tt +#define SGEMM_CT sgemm_tt +#define SGEMM_TC sgemm_tt +#define SGEMM_TT sgemm_tt +#define SGEMM_NR sgemm_nn +#define SGEMM_TR sgemm_tn +#define SGEMM_CR sgemm_tn +#define SGEMM_RN sgemm_nn +#define SGEMM_RT sgemm_nt +#define SGEMM_RC sgemm_nt +#define SGEMM_RR sgemm_nn + +#define SSYMM_LU ssymm_LU +#define SSYMM_LL ssymm_LL +#define SSYMM_RU ssymm_RU +#define SSYMM_RL ssymm_RL + +#define SHEMM_LU shemm_LU +#define SHEMM_LL shemm_LL +#define SHEMM_RU shemm_RU +#define SHEMM_RL shemm_RL + +#define SSYRK_UN ssyrk_UN +#define SSYRK_UT ssyrk_UT +#define SSYRK_LN ssyrk_LN +#define SSYRK_LT ssyrk_LT +#define SSYRK_UR ssyrk_UN +#define SSYRK_UC ssyrk_UT +#define SSYRK_LR ssyrk_LN +#define SSYRK_LC ssyrk_LT + +#define SSYRK_KERNEL_U ssyrk_kernel_U +#define SSYRK_KERNEL_L ssyrk_kernel_L + +#define SHERK_UN ssyrk_UN +#define SHERK_LN ssyrk_LN +#define SHERK_UC ssyrk_UT +#define SHERK_LC ssyrk_LT + +#define SHER2K_UN ssyr2k_UN +#define SHER2K_LN ssyr2k_LN +#define SHER2K_UC ssyr2k_UT +#define SHER2K_LC ssyr2k_LT + +#define SSYR2K_UN ssyr2k_UN +#define SSYR2K_UT ssyr2k_UT +#define SSYR2K_LN ssyr2k_LN +#define SSYR2K_LT ssyr2k_LT +#define SSYR2K_UR ssyr2k_UN +#define SSYR2K_UC ssyr2k_UT +#define SSYR2K_LR ssyr2k_LN +#define SSYR2K_LC ssyr2k_LT + +#define SSYR2K_KERNEL_U ssyr2k_kernel_U +#define SSYR2K_KERNEL_L ssyr2k_kernel_L + +#define STRMM_LNUU strmm_LNUU +#define STRMM_LNUN strmm_LNUN +#define STRMM_LNLU strmm_LNLU +#define STRMM_LNLN strmm_LNLN +#define STRMM_LTUU strmm_LTUU +#define STRMM_LTUN strmm_LTUN +#define STRMM_LTLU strmm_LTLU +#define STRMM_LTLN strmm_LTLN +#define STRMM_LRUU strmm_LNUU +#define STRMM_LRUN strmm_LNUN +#define STRMM_LRLU strmm_LNLU +#define STRMM_LRLN strmm_LNLN +#define STRMM_LCUU strmm_LTUU +#define STRMM_LCUN strmm_LTUN +#define STRMM_LCLU strmm_LTLU +#define STRMM_LCLN strmm_LTLN +#define STRMM_RNUU strmm_RNUU +#define STRMM_RNUN strmm_RNUN +#define STRMM_RNLU strmm_RNLU +#define STRMM_RNLN strmm_RNLN +#define STRMM_RTUU strmm_RTUU +#define STRMM_RTUN strmm_RTUN +#define STRMM_RTLU strmm_RTLU +#define STRMM_RTLN strmm_RTLN +#define STRMM_RRUU strmm_RNUU +#define STRMM_RRUN strmm_RNUN +#define STRMM_RRLU strmm_RNLU +#define STRMM_RRLN strmm_RNLN +#define STRMM_RCUU strmm_RTUU +#define STRMM_RCUN strmm_RTUN +#define STRMM_RCLU strmm_RTLU +#define STRMM_RCLN strmm_RTLN + +#define STRSM_LNUU strsm_LNUU +#define STRSM_LNUN strsm_LNUN +#define STRSM_LNLU strsm_LNLU +#define STRSM_LNLN strsm_LNLN +#define STRSM_LTUU strsm_LTUU +#define STRSM_LTUN strsm_LTUN +#define STRSM_LTLU strsm_LTLU +#define STRSM_LTLN strsm_LTLN +#define STRSM_LRUU strsm_LNUU +#define STRSM_LRUN strsm_LNUN +#define STRSM_LRLU strsm_LNLU +#define STRSM_LRLN strsm_LNLN +#define STRSM_LCUU strsm_LTUU +#define STRSM_LCUN strsm_LTUN +#define STRSM_LCLU strsm_LTLU +#define STRSM_LCLN strsm_LTLN +#define STRSM_RNUU strsm_RNUU +#define STRSM_RNUN strsm_RNUN +#define STRSM_RNLU strsm_RNLU +#define STRSM_RNLN strsm_RNLN +#define STRSM_RTUU strsm_RTUU +#define STRSM_RTUN strsm_RTUN +#define STRSM_RTLU strsm_RTLU +#define STRSM_RTLN strsm_RTLN +#define STRSM_RRUU strsm_RNUU +#define STRSM_RRUN strsm_RNUN +#define STRSM_RRLU strsm_RNLU +#define STRSM_RRLN strsm_RNLN +#define STRSM_RCUU strsm_RTUU +#define STRSM_RCUN strsm_RTUN +#define STRSM_RCLU strsm_RTLU +#define STRSM_RCLN strsm_RTLN + +#define SGEMM_THREAD_NN sgemm_thread_nn +#define SGEMM_THREAD_CN sgemm_thread_tn +#define SGEMM_THREAD_TN sgemm_thread_tn +#define SGEMM_THREAD_NC sgemm_thread_nt +#define SGEMM_THREAD_NT sgemm_thread_nt +#define SGEMM_THREAD_CC sgemm_thread_tt +#define SGEMM_THREAD_CT sgemm_thread_tt +#define SGEMM_THREAD_TC sgemm_thread_tt +#define SGEMM_THREAD_TT sgemm_thread_tt +#define SGEMM_THREAD_NR sgemm_thread_nn +#define SGEMM_THREAD_TR sgemm_thread_tn +#define SGEMM_THREAD_CR sgemm_thread_tn +#define SGEMM_THREAD_RN sgemm_thread_nn +#define SGEMM_THREAD_RT sgemm_thread_nt +#define SGEMM_THREAD_RC sgemm_thread_nt +#define SGEMM_THREAD_RR sgemm_thread_nn + +#define SSYMM_THREAD_LU ssymm_thread_LU +#define SSYMM_THREAD_LL ssymm_thread_LL +#define SSYMM_THREAD_RU ssymm_thread_RU +#define SSYMM_THREAD_RL ssymm_thread_RL + +#define SHEMM_THREAD_LU shemm_thread_LU +#define SHEMM_THREAD_LL shemm_thread_LL +#define SHEMM_THREAD_RU shemm_thread_RU +#define SHEMM_THREAD_RL shemm_thread_RL + +#define SSYRK_THREAD_UN ssyrk_thread_UN +#define SSYRK_THREAD_UT ssyrk_thread_UT +#define SSYRK_THREAD_LN ssyrk_thread_LN +#define SSYRK_THREAD_LT ssyrk_thread_LT +#define SSYRK_THREAD_UR ssyrk_thread_UN +#define SSYRK_THREAD_UC ssyrk_thread_UT +#define SSYRK_THREAD_LR ssyrk_thread_LN +#define SSYRK_THREAD_LC ssyrk_thread_LT + +#define SHERK_THREAD_UN ssyrk_thread_UN +#define SHERK_THREAD_UT ssyrk_thread_UT +#define SHERK_THREAD_LN ssyrk_thread_LN +#define SHERK_THREAD_LT ssyrk_thread_LT +#define SHERK_THREAD_UR ssyrk_thread_UN +#define SHERK_THREAD_UC ssyrk_thread_UT +#define SHERK_THREAD_LR ssyrk_thread_LN +#define SHERK_THREAD_LC ssyrk_thread_LT + +#endif diff --git a/common_sparc.h b/common_sparc.h new file mode 100644 index 0000000..35d8bdb --- /dev/null +++ b/common_sparc.h @@ -0,0 +1,224 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_POWER +#define COMMON_POWER + +#define MB __asm__ __volatile__ ("nop") +#define WMB __asm__ __volatile__ ("nop") + +#ifndef ASSEMBLER + +static void __inline blas_lock(volatile unsigned long *address){ + + long int ret = 1; + + do { + while (*address) {YIELDING;}; + + __asm__ __volatile__( + "ldstub [%1], %0" + : "=&r"(ret) + : "r" (address) + : "memory"); + } while (ret); +} + +static __inline unsigned long rpcc(void){ + unsigned long clocks; + + __asm__ __volatile__ ("rd %%tick, %0" : "=r" (clocks)); + + return clocks; +}; + +#ifdef __64BIT__ +#define RPCC64BIT +#endif + +#ifndef __BIG_ENDIAN__ +#define __BIG_ENDIAN__ +#endif + +#ifdef DOUBLE +#define GET_IMAGE(res) __asm__ __volatile__("fmovd %%f2, %0" : "=f"(res) : : "memory") +#else +#define GET_IMAGE(res) __asm__ __volatile__("fmovs %%f1, %0" : "=f"(res) : : "memory") +#endif + +#define GET_IMAGE_CANCEL + +#ifdef SMP +static __inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} +#endif +#endif + + +#ifdef ASSEMBLER + +#ifndef __64BIT__ +#define STACK_START 128 +#define SAVESP save %sp, -64, %sp +#else +#define STACK_START 2423 +#define SAVESP save %sp, -256, %sp +#endif + +#define NOP or %g1, %g1, %g1 + +#ifdef DOUBLE +#define LDF ldd +#define STF std +#define FADD faddd +#define FMUL fmuld +#define FMOV fmovd +#define FABS fabsd +#define FSUB fsubd +#define FCMP fcmpd +#define FMOVG fmovdg +#define FMOVL fmovdl +#define FSQRT fsqrtd +#define FDIV fdivd +#else +#define LDF ld +#define STF st +#define FADD fadds +#define FMUL fmuls +#define FMOV fmovs +#define FABS fabss +#define FSUB fsubs +#define FCMP fcmps +#define FMOVG fmovsg +#define FMOVL fmovsl +#define FSQRT fsqrts +#define FDIV fdivs +#endif + +#define HALT prefetch [%g0], 5 + +#define FMADDS(rs1, rs2, rs3, rd) \ + .word ((2 << 30) | ((rd) << 25) | ( 0x37 << 19) | ((rs1) << 14) | ((rs3) << 9) | ( 1 << 5) | (rs2)) + +#define FMADDD(rs1, rs2, rs3, rd) \ + .word ((2 << 30) | ((rd) << 25) | ( 0x37 << 19) | ((rs1) << 14) | ((rs3) << 9) | ( 2 << 5) | (rs2)) + +#define FMSUBS(rs1, rs2, rs3, rd) \ + .word ((2 << 30) | ((rd) << 25) | ( 0x37 << 19) | ((rs1) << 14) | ((rs3) << 9) | ( 5 << 5) | (rs2)) + +#define FMSUBD(rs1, rs2, rs3, rd) \ + .word ((2 << 30) | ((rd) << 25) | ( 0x37 << 19) | ((rs1) << 14) | ((rs3) << 9) | ( 6 << 5) | (rs2)) + +#define FNMSUBS(rs1, rs2, rs3, rd) \ + .word ((2 << 30) | ((rd) << 25) | ( 0x37 << 19) | ((rs1) << 14) | ((rs3) << 9) | ( 9 << 5) | (rs2)) + +#define FNMSUBD(rs1, rs2, rs3, rd) \ + .word ((2 << 30) | ((rd) << 25) | ( 0x37 << 19) | ((rs1) << 14) | ((rs3) << 9) | (10 << 5) | (rs2)) + +#define FNMADDS(rs1, rs2, rs3, rd) \ + .word ((2 << 30) | ((rd) << 25) | ( 0x37 << 19) | ((rs1) << 14) | ((rs3) << 9) | (13 << 5) | (rs2)) + +#define FNMADDD(rs1, rs2, rs3, rd) \ + .word ((2 << 30) | ((rd) << 25) | ( 0x37 << 19) | ((rs1) << 14) | ((rs3) << 9) | (14 << 5) | (rs2)) + +#define FCLRS(rd) \ + .word ((2 << 30) | ((rd) << 25) | ( 0x36 << 19) | ( 0x61 << 5)) + +#define FCLRD(rd) \ + .word ((2 << 30) | ((rd) << 25) | ( 0x36 << 19) | ( 0x60 << 5)) + +#define FONES(rd) \ + .word ((2 << 30) | ((rd) << 25) | ( 0x36 << 19) | ( 0x7f << 5)) + +#define FONED(rd) \ + .word ((2 << 30) | ((rd) << 25) | ( 0x36 << 19) | ( 0x7e << 5)) + +#ifndef DOUBLE +#define FCLR(a) FCLRS(a) +#define FONE(a) FONES(a) +#define FMADD(a, b, c, d) FMADDS(a, b, c, d) +#define FMSUB(a, b, c, d) FMSUBS(a, b, c, d) +#define FNMADD(a, b, c, d) FNMADDS(a, b, c, d) +#define FNMSUB(a, b, c, d) FNMSUBS(a, b, c, d) +#else +#define FCLR(a) FCLRD(a) +#define FONE(a) FONED(a) +#define FMADD(a, b, c, d) FMADDD(a, b, c, d) +#define FMSUB(a, b, c, d) FMSUBD(a, b, c, d) +#define FNMADD(a, b, c, d) FNMADDD(a, b, c, d) +#define FNMSUB(a, b, c, d) FNMSUBD(a, b, c, d) +#endif + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif + +#ifdef sparc +#define PROLOGUE \ + .section ".text"; \ + .align 32; \ + .global REALNAME;\ + .type REALNAME, #function; \ + .proc 07; \ +REALNAME:; +#define EPILOGUE \ + .size REALNAME, .-REALNAME +#endif + +#endif + +#ifdef sparc +#define SEEK_ADDRESS +#endif + +#define BUFFER_SIZE (32 << 20) + +#ifndef PAGESIZE +#define PAGESIZE ( 8 << 10) +#endif +#define HUGE_PAGESIZE ( 4 << 20) + +#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif +#endif diff --git a/common_thread.h b/common_thread.h new file mode 100644 index 0000000..d74af32 --- /dev/null +++ b/common_thread.h @@ -0,0 +1,192 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_THREAD +#define COMMON_THREAD + +/* Basic Thread Debugging */ +#undef SMP_DEBUG + +/* Thread Timing Debugging */ +#undef TIMING_DEBUG + +/* Global Parameter */ +extern int blas_cpu_number; +extern int blas_num_threads; +extern int blas_omp_linked; + +#define BLAS_LEGACY 0x8000U +#define BLAS_PTHREAD 0x4000U +#define BLAS_NODE 0x2000U + +#define BLAS_PREC 0x0003U +#define BLAS_SINGLE 0x0000U +#define BLAS_DOUBLE 0x0001U +#define BLAS_XDOUBLE 0x0002U +#define BLAS_REAL 0x0000U +#define BLAS_COMPLEX 0x0004U + +#define BLAS_TRANSA 0x0030U /* 2bit */ +#define BLAS_TRANSA_N 0x0000U +#define BLAS_TRANSA_T 0x0010U +#define BLAS_TRANSA_R 0x0020U +#define BLAS_TRANSA_C 0x0030U +#define BLAS_TRANSA_SHIFT 4 + +#define BLAS_TRANSB 0x0300U /* 2bit */ +#define BLAS_TRANSB_N 0x0000U +#define BLAS_TRANSB_T 0x0100U +#define BLAS_TRANSB_R 0x0200U +#define BLAS_TRANSB_C 0x0300U +#define BLAS_TRANSB_SHIFT 8 + +#define BLAS_RSIDE 0x0400U +#define BLAS_RSIDE_SHIFT 10 +#define BLAS_UPLO 0x0800U +#define BLAS_UPLO_SHIFT 11 + +#define BLAS_STATUS_NOTYET 0 +#define BLAS_STATUS_QUEUED 1 +#define BLAS_STATUS_RUNNING 2 +#define BLAS_STATUS_FINISHED 4 + +typedef struct blas_queue { + + void *routine; + BLASLONG position; + BLASLONG assigned; + + blas_arg_t *args; + void *range_m; + void *range_n; + void *sa, *sb; + + struct blas_queue *next; + +#if defined( __WIN32__) || defined(__CYGWIN32__) + CRITICAL_SECTION lock; + HANDLE finish; +#else + pthread_mutex_t lock; + pthread_cond_t finished; +#endif + + int mode, status; + +#ifdef CONSISTENT_FPCSR + unsigned int sse_mode, x87_mode; +#endif + +#ifdef SMP_DEBUG + int num; +#endif +#ifdef TIMING_DEBUG + unsigned int clocks; +#endif +} blas_queue_t; + +#ifdef SMP_SERVER + +extern int blas_server_avail; + +static __inline int num_cpu_avail(int level) { + + if ((blas_cpu_number == 1) + +#ifdef USE_OPENMP + || omp_in_parallel() +#endif + ) return 1; + + return blas_cpu_number; + +} + +static __inline void blas_queue_init(blas_queue_t *queue){ + + queue -> sa = NULL; + queue -> sb = NULL; + queue-> next = NULL; +} + +int blas_thread_init(void); +int BLASFUNC(blas_thread_shutdown)(void); +int exec_blas(BLASLONG, blas_queue_t *); +int exec_blas_async(BLASLONG, blas_queue_t *); +int exec_blas_async_wait(BLASLONG, blas_queue_t *); + +#else +int exec_blas_async(BLASLONG num_cpu, blas_param_t *param, pthread_t *); +int exec_blas_async_wait(BLASLONG num_cpu, pthread_t *blas_threads); +int exec_blas(BLASLONG num_cpu, blas_param_t *param, void *buffer); +#endif + +#ifndef ASSEMBLER + +int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, + void *a, BLASLONG lda, + void *b, BLASLONG ldb, + void *c, BLASLONG ldc, int (*function)(), int threads); + +int gemm_thread_m (int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG); + +int gemm_thread_n (int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG); + +int gemm_thread_mn(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG); + +int gemm_thread_variable(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG, BLASLONG); + +int trsm_thread(int mode, BLASLONG m, BLASLONG n, + double alpha_r, double alpha_i, + void *a, BLASLONG lda, + void *c, BLASLONG ldc, int (*function)(), void *buffer); + +int syrk_thread(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG); + +int beta_thread(int mode, BLASLONG m, BLASLONG n, + double alpha_r, double alpha_i, + void *c, BLASLONG ldc, int (*fuction)()); + +int getrf_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, + void *offsetA, BLASLONG lda, + void *offsetB, BLASLONG jb, + void *ipiv, BLASLONG offset, int (*function)(), void *buffer); + +#endif /* ENDIF ASSEMBLER */ + +#endif diff --git a/common_x.h b/common_x.h new file mode 100644 index 0000000..03b98db --- /dev/null +++ b/common_x.h @@ -0,0 +1,611 @@ +#ifndef COMMON_X_H +#define COMMON_X_H + +#ifndef DYNAMIC_ARCH + +#define XAMAX_K xamax_k +#define XAMIN_K xamin_k +#define XMAX_K xmax_k +#define XMIN_K xmin_k +#define IXAMAX_K ixamax_k +#define IXAMIN_K ixamin_k +#define IXMAX_K ixmax_k +#define IXMIN_K ixmin_k +#define XASUM_K xasum_k +#define XAXPYU_K xaxpy_k +#define XAXPYC_K xaxpyc_k +#define XCOPY_K xcopy_k +#define XDOTU_K xdotu_k +#define XDOTC_K xdotc_k +#define XNRM2_K xnrm2_k +#define XSCAL_K xscal_k +#define XSWAP_K xswap_k +#define XROT_K xqrot_k + +#define XGEMV_N xgemv_n +#define XGEMV_T xgemv_t +#define XGEMV_R xgemv_r +#define XGEMV_C xgemv_c +#define XGEMV_O xgemv_o +#define XGEMV_U xgemv_u +#define XGEMV_S xgemv_s +#define XGEMV_D xgemv_d + +#define XGERU_K xgeru_k +#define XGERC_K xgerc_k +#define XGERV_K xgerv_k +#define XGERD_K xgerd_k + +#define XSYMV_U xsymv_U +#define XSYMV_L xsymv_L +#define XHEMV_U xhemv_U +#define XHEMV_L xhemv_L +#define XHEMV_V xhemv_V +#define XHEMV_M xhemv_M + +#define XSYMV_THREAD_U xsymv_thread_U +#define XSYMV_THREAD_L xsymv_thread_L +#define XHEMV_THREAD_U xhemv_thread_U +#define XHEMV_THREAD_L xhemv_thread_L +#define XHEMV_THREAD_V xhemv_thread_V +#define XHEMV_THREAD_M xhemv_thread_M + +#define XGEMM_ONCOPY xgemm_oncopy +#define XGEMM_OTCOPY xgemm_otcopy + +#if XGEMM_DEFAULT_UNROLL_M == XGEMM_DEFAULT_UNROLL_N +#define XGEMM_INCOPY xgemm_oncopy +#define XGEMM_ITCOPY xgemm_otcopy +#else +#define XGEMM_INCOPY xgemm_incopy +#define XGEMM_ITCOPY xgemm_itcopy +#endif + +#define XTRMM_OUNUCOPY xtrmm_ounucopy +#define XTRMM_OUNNCOPY xtrmm_ounncopy +#define XTRMM_OUTUCOPY xtrmm_outucopy +#define XTRMM_OUTNCOPY xtrmm_outncopy +#define XTRMM_OLNUCOPY xtrmm_olnucopy +#define XTRMM_OLNNCOPY xtrmm_olnncopy +#define XTRMM_OLTUCOPY xtrmm_oltucopy +#define XTRMM_OLTNCOPY xtrmm_oltncopy + +#define XTRSM_OUNUCOPY xtrsm_ounucopy +#define XTRSM_OUNNCOPY xtrsm_ounncopy +#define XTRSM_OUTUCOPY xtrsm_outucopy +#define XTRSM_OUTNCOPY xtrsm_outncopy +#define XTRSM_OLNUCOPY xtrsm_olnucopy +#define XTRSM_OLNNCOPY xtrsm_olnncopy +#define XTRSM_OLTUCOPY xtrsm_oltucopy +#define XTRSM_OLTNCOPY xtrsm_oltncopy + +#if XGEMM_DEFAULT_UNROLL_M == XGEMM_DEFAULT_UNROLL_N +#define XTRMM_IUNUCOPY xtrmm_ounucopy +#define XTRMM_IUNNCOPY xtrmm_ounncopy +#define XTRMM_IUTUCOPY xtrmm_outucopy +#define XTRMM_IUTNCOPY xtrmm_outncopy +#define XTRMM_ILNUCOPY xtrmm_olnucopy +#define XTRMM_ILNNCOPY xtrmm_olnncopy +#define XTRMM_ILTUCOPY xtrmm_oltucopy +#define XTRMM_ILTNCOPY xtrmm_oltncopy + +#define XTRSM_IUNUCOPY xtrsm_ounucopy +#define XTRSM_IUNNCOPY xtrsm_ounncopy +#define XTRSM_IUTUCOPY xtrsm_outucopy +#define XTRSM_IUTNCOPY xtrsm_outncopy +#define XTRSM_ILNUCOPY xtrsm_olnucopy +#define XTRSM_ILNNCOPY xtrsm_olnncopy +#define XTRSM_ILTUCOPY xtrsm_oltucopy +#define XTRSM_ILTNCOPY xtrsm_oltncopy +#else +#define XTRMM_IUNUCOPY xtrmm_iunucopy +#define XTRMM_IUNNCOPY xtrmm_iunncopy +#define XTRMM_IUTUCOPY xtrmm_iutucopy +#define XTRMM_IUTNCOPY xtrmm_iutncopy +#define XTRMM_ILNUCOPY xtrmm_ilnucopy +#define XTRMM_ILNNCOPY xtrmm_ilnncopy +#define XTRMM_ILTUCOPY xtrmm_iltucopy +#define XTRMM_ILTNCOPY xtrmm_iltncopy + +#define XTRSM_IUNUCOPY xtrsm_iunucopy +#define XTRSM_IUNNCOPY xtrsm_iunncopy +#define XTRSM_IUTUCOPY xtrsm_iutucopy +#define XTRSM_IUTNCOPY xtrsm_iutncopy +#define XTRSM_ILNUCOPY xtrsm_ilnucopy +#define XTRSM_ILNNCOPY xtrsm_ilnncopy +#define XTRSM_ILTUCOPY xtrsm_iltucopy +#define XTRSM_ILTNCOPY xtrsm_iltncopy +#endif + +#define XGEMM_BETA xgemm_beta + +#define XGEMM_KERNEL_N xgemm_kernel_n +#define XGEMM_KERNEL_L xgemm_kernel_l +#define XGEMM_KERNEL_R xgemm_kernel_r +#define XGEMM_KERNEL_B xgemm_kernel_b + +#define XTRMM_KERNEL_LN xtrmm_kernel_LN +#define XTRMM_KERNEL_LT xtrmm_kernel_LT +#define XTRMM_KERNEL_LR xtrmm_kernel_LR +#define XTRMM_KERNEL_LC xtrmm_kernel_LC +#define XTRMM_KERNEL_RN xtrmm_kernel_RN +#define XTRMM_KERNEL_RT xtrmm_kernel_RT +#define XTRMM_KERNEL_RR xtrmm_kernel_RR +#define XTRMM_KERNEL_RC xtrmm_kernel_RC + +#define XTRSM_KERNEL_LN xtrsm_kernel_LN +#define XTRSM_KERNEL_LT xtrsm_kernel_LT +#define XTRSM_KERNEL_LR xtrsm_kernel_LR +#define XTRSM_KERNEL_LC xtrsm_kernel_LC +#define XTRSM_KERNEL_RN xtrsm_kernel_RN +#define XTRSM_KERNEL_RT xtrsm_kernel_RT +#define XTRSM_KERNEL_RR xtrsm_kernel_RR +#define XTRSM_KERNEL_RC xtrsm_kernel_RC + +#define XSYMM_OUTCOPY xsymm_outcopy +#define XSYMM_OLTCOPY xsymm_oltcopy +#if XGEMM_DEFAULT_UNROLL_M == XGEMM_DEFAULT_UNROLL_N +#define XSYMM_IUTCOPY xsymm_outcopy +#define XSYMM_ILTCOPY xsymm_oltcopy +#else +#define XSYMM_IUTCOPY xsymm_iutcopy +#define XSYMM_ILTCOPY xsymm_iltcopy +#endif + +#define XHEMM_OUTCOPY xhemm_outcopy +#define XHEMM_OLTCOPY xhemm_oltcopy +#if XGEMM_DEFAULT_UNROLL_M == XGEMM_DEFAULT_UNROLL_N +#define XHEMM_IUTCOPY xhemm_outcopy +#define XHEMM_ILTCOPY xhemm_oltcopy +#else +#define XHEMM_IUTCOPY xhemm_iutcopy +#define XHEMM_ILTCOPY xhemm_iltcopy +#endif + +#define XGEMM3M_ONCOPYB xgemm3m_oncopyb +#define XGEMM3M_ONCOPYR xgemm3m_oncopyr +#define XGEMM3M_ONCOPYI xgemm3m_oncopyi +#define XGEMM3M_OTCOPYB xgemm3m_otcopyb +#define XGEMM3M_OTCOPYR xgemm3m_otcopyr +#define XGEMM3M_OTCOPYI xgemm3m_otcopyi + +#define XGEMM3M_INCOPYB xgemm3m_incopyb +#define XGEMM3M_INCOPYR xgemm3m_incopyr +#define XGEMM3M_INCOPYI xgemm3m_incopyi +#define XGEMM3M_ITCOPYB xgemm3m_itcopyb +#define XGEMM3M_ITCOPYR xgemm3m_itcopyr +#define XGEMM3M_ITCOPYI xgemm3m_itcopyi + +#define XSYMM3M_ILCOPYB xsymm3m_ilcopyb +#define XSYMM3M_IUCOPYB xsymm3m_iucopyb +#define XSYMM3M_ILCOPYR xsymm3m_ilcopyr +#define XSYMM3M_IUCOPYR xsymm3m_iucopyr +#define XSYMM3M_ILCOPYI xsymm3m_ilcopyi +#define XSYMM3M_IUCOPYI xsymm3m_iucopyi + +#define XSYMM3M_OLCOPYB xsymm3m_olcopyb +#define XSYMM3M_OUCOPYB xsymm3m_oucopyb +#define XSYMM3M_OLCOPYR xsymm3m_olcopyr +#define XSYMM3M_OUCOPYR xsymm3m_oucopyr +#define XSYMM3M_OLCOPYI xsymm3m_olcopyi +#define XSYMM3M_OUCOPYI xsymm3m_oucopyi + +#define XHEMM3M_ILCOPYB xhemm3m_ilcopyb +#define XHEMM3M_IUCOPYB xhemm3m_iucopyb +#define XHEMM3M_ILCOPYR xhemm3m_ilcopyr +#define XHEMM3M_IUCOPYR xhemm3m_iucopyr +#define XHEMM3M_ILCOPYI xhemm3m_ilcopyi +#define XHEMM3M_IUCOPYI xhemm3m_iucopyi + +#define XHEMM3M_OLCOPYB xhemm3m_olcopyb +#define XHEMM3M_OUCOPYB xhemm3m_oucopyb +#define XHEMM3M_OLCOPYR xhemm3m_olcopyr +#define XHEMM3M_OUCOPYR xhemm3m_oucopyr +#define XHEMM3M_OLCOPYI xhemm3m_olcopyi +#define XHEMM3M_OUCOPYI xhemm3m_oucopyi + +#define XGEMM3M_KERNEL xgemm3m_kernel + +#define XNEG_TCOPY xneg_tcopy +#define XLASWP_NCOPY xlaswp_ncopy + +#else + +#define XAMAX_K gotoblas -> xamax_k +#define XAMIN_K gotoblas -> xamin_k +#define XMAX_K gotoblas -> xmax_k +#define XMIN_K gotoblas -> xmin_k +#define IXAMAX_K gotoblas -> ixamax_k +#define IXAMIN_K gotoblas -> ixamin_k +#define IXMAX_K gotoblas -> ixmax_k +#define IXMIN_K gotoblas -> ixmin_k +#define XASUM_K gotoblas -> xasum_k +#define XAXPYU_K gotoblas -> xaxpy_k +#define XAXPYC_K gotoblas -> xaxpyc_k +#define XCOPY_K gotoblas -> xcopy_k +#define XDOTU_K gotoblas -> xdotu_k +#define XDOTC_K gotoblas -> xdotc_k +#define XNRM2_K gotoblas -> xnrm2_k +#define XSCAL_K gotoblas -> xscal_k +#define XSWAP_K gotoblas -> xswap_k +#define XROT_K gotoblas -> xqrot_k + +#define XGEMV_N gotoblas -> xgemv_n +#define XGEMV_T gotoblas -> xgemv_t +#define XGEMV_R gotoblas -> xgemv_r +#define XGEMV_C gotoblas -> xgemv_c +#define XGEMV_O gotoblas -> xgemv_o +#define XGEMV_U gotoblas -> xgemv_u +#define XGEMV_S gotoblas -> xgemv_s +#define XGEMV_D gotoblas -> xgemv_d + +#define XGERU_K gotoblas -> xgeru_k +#define XGERC_K gotoblas -> xgerc_k +#define XGERV_K gotoblas -> xgerv_k +#define XGERD_K gotoblas -> xgerd_k + +#define XSYMV_U gotoblas -> xsymv_U +#define XSYMV_L gotoblas -> xsymv_L +#define XHEMV_U gotoblas -> xhemv_U +#define XHEMV_L gotoblas -> xhemv_L +#define XHEMV_V gotoblas -> xhemv_V +#define XHEMV_M gotoblas -> xhemv_M + +#define XSYMV_THREAD_U xsymv_thread_U +#define XSYMV_THREAD_L xsymv_thread_L +#define XHEMV_THREAD_U xhemv_thread_U +#define XHEMV_THREAD_L xhemv_thread_L +#define XHEMV_THREAD_V xhemv_thread_V +#define XHEMV_THREAD_M xhemv_thread_M + +#define XGEMM_ONCOPY gotoblas -> xgemm_oncopy +#define XGEMM_OTCOPY gotoblas -> xgemm_otcopy +#define XGEMM_INCOPY gotoblas -> xgemm_incopy +#define XGEMM_ITCOPY gotoblas -> xgemm_itcopy + +#define XTRMM_OUNUCOPY gotoblas -> xtrmm_ounucopy +#define XTRMM_OUTUCOPY gotoblas -> xtrmm_outucopy +#define XTRMM_OLNUCOPY gotoblas -> xtrmm_olnucopy +#define XTRMM_OLTUCOPY gotoblas -> xtrmm_oltucopy +#define XTRSM_OUNUCOPY gotoblas -> xtrsm_ounucopy +#define XTRSM_OUTUCOPY gotoblas -> xtrsm_outucopy +#define XTRSM_OLNUCOPY gotoblas -> xtrsm_olnucopy +#define XTRSM_OLTUCOPY gotoblas -> xtrsm_oltucopy + +#define XTRMM_IUNUCOPY gotoblas -> xtrmm_iunucopy +#define XTRMM_IUTUCOPY gotoblas -> xtrmm_iutucopy +#define XTRMM_ILNUCOPY gotoblas -> xtrmm_ilnucopy +#define XTRMM_ILTUCOPY gotoblas -> xtrmm_iltucopy +#define XTRSM_IUNUCOPY gotoblas -> xtrsm_iunucopy +#define XTRSM_IUTUCOPY gotoblas -> xtrsm_iutucopy +#define XTRSM_ILNUCOPY gotoblas -> xtrsm_ilnucopy +#define XTRSM_ILTUCOPY gotoblas -> xtrsm_iltucopy + +#define XTRMM_OUNNCOPY gotoblas -> xtrmm_ounncopy +#define XTRMM_OUTNCOPY gotoblas -> xtrmm_outncopy +#define XTRMM_OLNNCOPY gotoblas -> xtrmm_olnncopy +#define XTRMM_OLTNCOPY gotoblas -> xtrmm_oltncopy +#define XTRSM_OUNNCOPY gotoblas -> xtrsm_ounncopy +#define XTRSM_OUTNCOPY gotoblas -> xtrsm_outncopy +#define XTRSM_OLNNCOPY gotoblas -> xtrsm_olnncopy +#define XTRSM_OLTNCOPY gotoblas -> xtrsm_oltncopy + +#define XTRMM_IUNNCOPY gotoblas -> xtrmm_iunncopy +#define XTRMM_IUTNCOPY gotoblas -> xtrmm_iutncopy +#define XTRMM_ILNNCOPY gotoblas -> xtrmm_ilnncopy +#define XTRMM_ILTNCOPY gotoblas -> xtrmm_iltncopy +#define XTRSM_IUNNCOPY gotoblas -> xtrsm_iunncopy +#define XTRSM_IUTNCOPY gotoblas -> xtrsm_iutncopy +#define XTRSM_ILNNCOPY gotoblas -> xtrsm_ilnncopy +#define XTRSM_ILTNCOPY gotoblas -> xtrsm_iltncopy + +#define XGEMM_BETA gotoblas -> xgemm_beta +#define XGEMM_KERNEL_N gotoblas -> xgemm_kernel_n +#define XGEMM_KERNEL_L gotoblas -> xgemm_kernel_l +#define XGEMM_KERNEL_R gotoblas -> xgemm_kernel_r +#define XGEMM_KERNEL_B gotoblas -> xgemm_kernel_b + +#define XTRMM_KERNEL_LN gotoblas -> xtrmm_kernel_LN +#define XTRMM_KERNEL_LT gotoblas -> xtrmm_kernel_LT +#define XTRMM_KERNEL_LR gotoblas -> xtrmm_kernel_LR +#define XTRMM_KERNEL_LC gotoblas -> xtrmm_kernel_LC +#define XTRMM_KERNEL_RN gotoblas -> xtrmm_kernel_RN +#define XTRMM_KERNEL_RT gotoblas -> xtrmm_kernel_RT +#define XTRMM_KERNEL_RR gotoblas -> xtrmm_kernel_RR +#define XTRMM_KERNEL_RC gotoblas -> xtrmm_kernel_RC + +#define XTRSM_KERNEL_LN gotoblas -> xtrsm_kernel_LN +#define XTRSM_KERNEL_LT gotoblas -> xtrsm_kernel_LT +#define XTRSM_KERNEL_LR gotoblas -> xtrsm_kernel_LR +#define XTRSM_KERNEL_LC gotoblas -> xtrsm_kernel_LC +#define XTRSM_KERNEL_RN gotoblas -> xtrsm_kernel_RN +#define XTRSM_KERNEL_RT gotoblas -> xtrsm_kernel_RT +#define XTRSM_KERNEL_RR gotoblas -> xtrsm_kernel_RR +#define XTRSM_KERNEL_RC gotoblas -> xtrsm_kernel_RC + +#define XSYMM_IUTCOPY gotoblas -> xsymm_iutcopy +#define XSYMM_ILTCOPY gotoblas -> xsymm_iltcopy +#define XSYMM_OUTCOPY gotoblas -> xsymm_outcopy +#define XSYMM_OLTCOPY gotoblas -> xsymm_oltcopy + +#define XHEMM_OUTCOPY gotoblas -> xhemm_outcopy +#define XHEMM_OLTCOPY gotoblas -> xhemm_oltcopy +#define XHEMM_IUTCOPY gotoblas -> xhemm_iutcopy +#define XHEMM_ILTCOPY gotoblas -> xhemm_iltcopy + +#define XGEMM3M_ONCOPYB gotoblas -> xgemm3m_oncopyb +#define XGEMM3M_ONCOPYR gotoblas -> xgemm3m_oncopyr +#define XGEMM3M_ONCOPYI gotoblas -> xgemm3m_oncopyi +#define XGEMM3M_OTCOPYB gotoblas -> xgemm3m_otcopyb +#define XGEMM3M_OTCOPYR gotoblas -> xgemm3m_otcopyr +#define XGEMM3M_OTCOPYI gotoblas -> xgemm3m_otcopyi + +#define XGEMM3M_INCOPYB gotoblas -> xgemm3m_incopyb +#define XGEMM3M_INCOPYR gotoblas -> xgemm3m_incopyr +#define XGEMM3M_INCOPYI gotoblas -> xgemm3m_incopyi +#define XGEMM3M_ITCOPYB gotoblas -> xgemm3m_itcopyb +#define XGEMM3M_ITCOPYR gotoblas -> xgemm3m_itcopyr +#define XGEMM3M_ITCOPYI gotoblas -> xgemm3m_itcopyi + +#define XSYMM3M_ILCOPYB gotoblas -> xsymm3m_ilcopyb +#define XSYMM3M_IUCOPYB gotoblas -> xsymm3m_iucopyb +#define XSYMM3M_ILCOPYR gotoblas -> xsymm3m_ilcopyr +#define XSYMM3M_IUCOPYR gotoblas -> xsymm3m_iucopyr +#define XSYMM3M_ILCOPYI gotoblas -> xsymm3m_ilcopyi +#define XSYMM3M_IUCOPYI gotoblas -> xsymm3m_iucopyi + +#define XSYMM3M_OLCOPYB gotoblas -> xsymm3m_olcopyb +#define XSYMM3M_OUCOPYB gotoblas -> xsymm3m_oucopyb +#define XSYMM3M_OLCOPYR gotoblas -> xsymm3m_olcopyr +#define XSYMM3M_OUCOPYR gotoblas -> xsymm3m_oucopyr +#define XSYMM3M_OLCOPYI gotoblas -> xsymm3m_olcopyi +#define XSYMM3M_OUCOPYI gotoblas -> xsymm3m_oucopyi + +#define XHEMM3M_ILCOPYB gotoblas -> xhemm3m_ilcopyb +#define XHEMM3M_IUCOPYB gotoblas -> xhemm3m_iucopyb +#define XHEMM3M_ILCOPYR gotoblas -> xhemm3m_ilcopyr +#define XHEMM3M_IUCOPYR gotoblas -> xhemm3m_iucopyr +#define XHEMM3M_ILCOPYI gotoblas -> xhemm3m_ilcopyi +#define XHEMM3M_IUCOPYI gotoblas -> xhemm3m_iucopyi + +#define XHEMM3M_OLCOPYB gotoblas -> xhemm3m_olcopyb +#define XHEMM3M_OUCOPYB gotoblas -> xhemm3m_oucopyb +#define XHEMM3M_OLCOPYR gotoblas -> xhemm3m_olcopyr +#define XHEMM3M_OUCOPYR gotoblas -> xhemm3m_oucopyr +#define XHEMM3M_OLCOPYI gotoblas -> xhemm3m_olcopyi +#define XHEMM3M_OUCOPYI gotoblas -> xhemm3m_oucopyi + +#define XGEMM3M_KERNEL gotoblas -> xgemm3m_kernel + +#define XNEG_TCOPY gotoblas -> xneg_tcopy +#define XLASWP_NCOPY gotoblas -> xlaswp_ncopy + +#endif + +#define XGEMM_NN xgemm_nn +#define XGEMM_CN xgemm_cn +#define XGEMM_TN xgemm_tn +#define XGEMM_NC xgemm_nc +#define XGEMM_NT xgemm_nt +#define XGEMM_CC xgemm_cc +#define XGEMM_CT xgemm_ct +#define XGEMM_TC xgemm_tc +#define XGEMM_TT xgemm_tt +#define XGEMM_NR xgemm_nr +#define XGEMM_TR xgemm_tr +#define XGEMM_CR xgemm_cr +#define XGEMM_RN xgemm_rn +#define XGEMM_RT xgemm_rt +#define XGEMM_RC xgemm_rc +#define XGEMM_RR xgemm_rr + +#define XSYMM_LU xsymm_LU +#define XSYMM_LL xsymm_LL +#define XSYMM_RU xsymm_RU +#define XSYMM_RL xsymm_RL + +#define XHEMM_LU xhemm_LU +#define XHEMM_LL xhemm_LL +#define XHEMM_RU xhemm_RU +#define XHEMM_RL xhemm_RL + +#define XSYRK_UN xsyrk_UN +#define XSYRK_UT xsyrk_UT +#define XSYRK_LN xsyrk_LN +#define XSYRK_LT xsyrk_LT +#define XSYRK_UR xsyrk_UN +#define XSYRK_UC xsyrk_UT +#define XSYRK_LR xsyrk_LN +#define XSYRK_LC xsyrk_LT + +#define XSYRK_KERNEL_U xsyrk_kernel_U +#define XSYRK_KERNEL_L xsyrk_kernel_L + +#define XHERK_UN xherk_UN +#define XHERK_LN xherk_LN +#define XHERK_UC xherk_UC +#define XHERK_LC xherk_LC + +#define XHER2K_UN xher2k_UN +#define XHER2K_LN xher2k_LN +#define XHER2K_UC xher2k_UC +#define XHER2K_LC xher2k_LC + +#define XSYR2K_UN xsyr2k_UN +#define XSYR2K_UT xsyr2k_UT +#define XSYR2K_LN xsyr2k_LN +#define XSYR2K_LT xsyr2k_LT +#define XSYR2K_UR xsyr2k_UN +#define XSYR2K_UC xsyr2k_UT +#define XSYR2K_LR xsyr2k_LN +#define XSYR2K_LC xsyr2k_LT + +#define XSYR2K_KERNEL_U xsyr2k_kernel_U +#define XSYR2K_KERNEL_L xsyr2k_kernel_L + +#define XTRMM_LNUU xtrmm_LNUU +#define XTRMM_LNUN xtrmm_LNUN +#define XTRMM_LNLU xtrmm_LNLU +#define XTRMM_LNLN xtrmm_LNLN +#define XTRMM_LTUU xtrmm_LTUU +#define XTRMM_LTUN xtrmm_LTUN +#define XTRMM_LTLU xtrmm_LTLU +#define XTRMM_LTLN xtrmm_LTLN +#define XTRMM_LRUU xtrmm_LRUU +#define XTRMM_LRUN xtrmm_LRUN +#define XTRMM_LRLU xtrmm_LRLU +#define XTRMM_LRLN xtrmm_LRLN +#define XTRMM_LCUU xtrmm_LCUU +#define XTRMM_LCUN xtrmm_LCUN +#define XTRMM_LCLU xtrmm_LCLU +#define XTRMM_LCLN xtrmm_LCLN +#define XTRMM_RNUU xtrmm_RNUU +#define XTRMM_RNUN xtrmm_RNUN +#define XTRMM_RNLU xtrmm_RNLU +#define XTRMM_RNLN xtrmm_RNLN +#define XTRMM_RTUU xtrmm_RTUU +#define XTRMM_RTUN xtrmm_RTUN +#define XTRMM_RTLU xtrmm_RTLU +#define XTRMM_RTLN xtrmm_RTLN +#define XTRMM_RRUU xtrmm_RRUU +#define XTRMM_RRUN xtrmm_RRUN +#define XTRMM_RRLU xtrmm_RRLU +#define XTRMM_RRLN xtrmm_RRLN +#define XTRMM_RCUU xtrmm_RCUU +#define XTRMM_RCUN xtrmm_RCUN +#define XTRMM_RCLU xtrmm_RCLU +#define XTRMM_RCLN xtrmm_RCLN + +#define XTRSM_LNUU xtrsm_LNUU +#define XTRSM_LNUN xtrsm_LNUN +#define XTRSM_LNLU xtrsm_LNLU +#define XTRSM_LNLN xtrsm_LNLN +#define XTRSM_LTUU xtrsm_LTUU +#define XTRSM_LTUN xtrsm_LTUN +#define XTRSM_LTLU xtrsm_LTLU +#define XTRSM_LTLN xtrsm_LTLN +#define XTRSM_LRUU xtrsm_LRUU +#define XTRSM_LRUN xtrsm_LRUN +#define XTRSM_LRLU xtrsm_LRLU +#define XTRSM_LRLN xtrsm_LRLN +#define XTRSM_LCUU xtrsm_LCUU +#define XTRSM_LCUN xtrsm_LCUN +#define XTRSM_LCLU xtrsm_LCLU +#define XTRSM_LCLN xtrsm_LCLN +#define XTRSM_RNUU xtrsm_RNUU +#define XTRSM_RNUN xtrsm_RNUN +#define XTRSM_RNLU xtrsm_RNLU +#define XTRSM_RNLN xtrsm_RNLN +#define XTRSM_RTUU xtrsm_RTUU +#define XTRSM_RTUN xtrsm_RTUN +#define XTRSM_RTLU xtrsm_RTLU +#define XTRSM_RTLN xtrsm_RTLN +#define XTRSM_RRUU xtrsm_RRUU +#define XTRSM_RRUN xtrsm_RRUN +#define XTRSM_RRLU xtrsm_RRLU +#define XTRSM_RRLN xtrsm_RRLN +#define XTRSM_RCUU xtrsm_RCUU +#define XTRSM_RCUN xtrsm_RCUN +#define XTRSM_RCLU xtrsm_RCLU +#define XTRSM_RCLN xtrsm_RCLN + +#define XGEMM_THREAD_NN xgemm_thread_nn +#define XGEMM_THREAD_CN xgemm_thread_cn +#define XGEMM_THREAD_TN xgemm_thread_tn +#define XGEMM_THREAD_NC xgemm_thread_nc +#define XGEMM_THREAD_NT xgemm_thread_nt +#define XGEMM_THREAD_CC xgemm_thread_cc +#define XGEMM_THREAD_CT xgemm_thread_ct +#define XGEMM_THREAD_TC xgemm_thread_tc +#define XGEMM_THREAD_TT xgemm_thread_tt +#define XGEMM_THREAD_NR xgemm_thread_nr +#define XGEMM_THREAD_TR xgemm_thread_tr +#define XGEMM_THREAD_CR xgemm_thread_cr +#define XGEMM_THREAD_RN xgemm_thread_rn +#define XGEMM_THREAD_RT xgemm_thread_rt +#define XGEMM_THREAD_RC xgemm_thread_rc +#define XGEMM_THREAD_RR xgemm_thread_rr + +#define XSYMM_THREAD_LU xsymm_thread_LU +#define XSYMM_THREAD_LL xsymm_thread_LL +#define XSYMM_THREAD_RU xsymm_thread_RU +#define XSYMM_THREAD_RL xsymm_thread_RL + +#define XHEMM_THREAD_LU xhemm_thread_LU +#define XHEMM_THREAD_LL xhemm_thread_LL +#define XHEMM_THREAD_RU xhemm_thread_RU +#define XHEMM_THREAD_RL xhemm_thread_RL + +#define XSYRK_THREAD_UN xsyrk_thread_UN +#define XSYRK_THREAD_UT xsyrk_thread_UT +#define XSYRK_THREAD_LN xsyrk_thread_LN +#define XSYRK_THREAD_LT xsyrk_thread_LT +#define XSYRK_THREAD_UR xsyrk_thread_UN +#define XSYRK_THREAD_UC xsyrk_thread_UT +#define XSYRK_THREAD_LR xsyrk_thread_LN +#define XSYRK_THREAD_LC xsyrk_thread_LT + +#define XHERK_THREAD_UN xherk_thread_UN +#define XHERK_THREAD_UT xherk_thread_UT +#define XHERK_THREAD_LN xherk_thread_LN +#define XHERK_THREAD_LT xherk_thread_LT +#define XHERK_THREAD_UR xherk_thread_UR +#define XHERK_THREAD_UC xherk_thread_UC +#define XHERK_THREAD_LR xherk_thread_LR +#define XHERK_THREAD_LC xherk_thread_LC + +#define XGEMM3M_NN xgemm3m_nn +#define XGEMM3M_CN xgemm3m_cn +#define XGEMM3M_TN xgemm3m_tn +#define XGEMM3M_NC xgemm3m_nc +#define XGEMM3M_NT xgemm3m_nt +#define XGEMM3M_CC xgemm3m_cc +#define XGEMM3M_CT xgemm3m_ct +#define XGEMM3M_TC xgemm3m_tc +#define XGEMM3M_TT xgemm3m_tt +#define XGEMM3M_NR xgemm3m_nr +#define XGEMM3M_TR xgemm3m_tr +#define XGEMM3M_CR xgemm3m_cr +#define XGEMM3M_RN xgemm3m_rn +#define XGEMM3M_RT xgemm3m_rt +#define XGEMM3M_RC xgemm3m_rc +#define XGEMM3M_RR xgemm3m_rr + +#define XGEMM3M_THREAD_NN xgemm3m_thread_nn +#define XGEMM3M_THREAD_CN xgemm3m_thread_cn +#define XGEMM3M_THREAD_TN xgemm3m_thread_tn +#define XGEMM3M_THREAD_NC xgemm3m_thread_nc +#define XGEMM3M_THREAD_NT xgemm3m_thread_nt +#define XGEMM3M_THREAD_CC xgemm3m_thread_cc +#define XGEMM3M_THREAD_CT xgemm3m_thread_ct +#define XGEMM3M_THREAD_TC xgemm3m_thread_tc +#define XGEMM3M_THREAD_TT xgemm3m_thread_tt +#define XGEMM3M_THREAD_NR xgemm3m_thread_nr +#define XGEMM3M_THREAD_TR xgemm3m_thread_tr +#define XGEMM3M_THREAD_CR xgemm3m_thread_cr +#define XGEMM3M_THREAD_RN xgemm3m_thread_rn +#define XGEMM3M_THREAD_RT xgemm3m_thread_rt +#define XGEMM3M_THREAD_RC xgemm3m_thread_rc +#define XGEMM3M_THREAD_RR xgemm3m_thread_rr + +#define XSYMM3M_LU xsymm3m_LU +#define XSYMM3M_LL xsymm3m_LL +#define XSYMM3M_RU xsymm3m_RU +#define XSYMM3M_RL xsymm3m_RL + +#define XSYMM3M_THREAD_LU xsymm3m_thread_LU +#define XSYMM3M_THREAD_LL xsymm3m_thread_LL +#define XSYMM3M_THREAD_RU xsymm3m_thread_RU +#define XSYMM3M_THREAD_RL xsymm3m_thread_RL + +#define XHEMM3M_LU xhemm3m_LU +#define XHEMM3M_LL xhemm3m_LL +#define XHEMM3M_RU xhemm3m_RU +#define XHEMM3M_RL xhemm3m_RL + +#define XHEMM3M_THREAD_LU xhemm3m_thread_LU +#define XHEMM3M_THREAD_LL xhemm3m_thread_LL +#define XHEMM3M_THREAD_RU xhemm3m_thread_RU +#define XHEMM3M_THREAD_RL xhemm3m_thread_RL + +#endif diff --git a/common_x86.h b/common_x86.h new file mode 100644 index 0000000..fbb91f8 --- /dev/null +++ b/common_x86.h @@ -0,0 +1,359 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_X86 +#define COMMON_X86 + +#ifndef ASSEMBLER + +#define MB +#define WMB + +#ifdef C_SUN +#define __asm__ __asm +#define __volatile__ +#endif + +static void __inline blas_lock(volatile BLASULONG *address){ + + int ret; + + do { + while (*address) {YIELDING;}; + + __asm__ __volatile__( + "xchgl %0, %1\n" + : "=r"(ret), "=m"(*address) + : "0"(1), "m"(*address) + : "memory"); + + } while (ret); + +} + +static __inline unsigned long long rpcc(void){ + unsigned int a, d; + + __asm__ __volatile__ ("rdtsc" : "=a" (a), "=d" (d)); + + return ((unsigned long long)a + ((unsigned long long)d << 32)); +}; + +static __inline unsigned long getstackaddr(void){ + unsigned long addr; + + __asm__ __volatile__ ("mov %%esp, %0" + : "=r"(addr) : : "memory"); + + return addr; +}; + + +static __inline long double sqrt_long(long double val) { + long double result; + + __asm__ __volatile__ ("fldt %1\n" + "fsqrt\n" + "fstpt %0\n" : "=m" (result) : "m"(val)); + return result; +} + +#define SQRT(a) sqrt_long(a) + +/* This is due to gcc's bug */ +void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx); + +#define WHEREAMI + +static inline int WhereAmI(void){ + int eax, ebx, ecx, edx; + int apicid; + + cpuid(1, &eax, &ebx, &ecx, &edx); + apicid = BITMASK(ebx, 24, 0xff); + + return apicid; +} + +#ifdef ENABLE_SSE_EXCEPTION + +#define IDEBUG_START \ +{ \ + unsigned int fp_sse_mode, new_fp_mode; \ + __asm__ __volatile__ ("stmxcsr %0" : "=m" (fp_sse_mode) : ); \ + new_fp_mode = fp_sse_mode & ~0xd00; \ + __asm__ __volatile__ ("ldmxcsr %0" : : "m" (new_fp_mode) ); + +#define IDEBUG_END \ + __asm__ __volatile__ ("ldmxcsr %0" : : "m" (fp_sse_mode) ); \ +} + +#endif + +#ifdef XDOUBLE +#define GET_IMAGE(res) __asm__ __volatile__("fstpt %0" : "=m"(res) : : "memory") +#elif defined(DOUBLE) +#define GET_IMAGE(res) __asm__ __volatile__("fstpl %0" : "=m"(res) : : "memory") +#else +#define GET_IMAGE(res) __asm__ __volatile__("fstps %0" : "=m"(res) : : "memory"); +#endif + +#define GET_IMAGE_CANCEL __asm__ __volatile__ ("ffree %st") + +#ifdef SMP +extern unsigned int blas_quick_divide_table[]; + +static __inline int blas_quickdivide(unsigned int x, unsigned int y){ + + unsigned int result; + + if (y <= 1) return x; + + y = blas_quick_divide_table[y]; + + __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); + + return result; +} +#endif + +#endif + +#ifndef PAGESIZE +#define PAGESIZE ( 4 << 10) +#endif +#define HUGE_PAGESIZE ( 4 << 20) + +#define BUFFER_SIZE (16 << 20) + +#define SEEK_ADDRESS + +#if defined(DOUBLE) || defined(XDOUBLE) +#define MMXLOAD movq +#define MMXSTORE movq +#else +#define MMXLOAD movd +#define MMXSTORE movd +#endif + +#if defined(HAVE_3DNOW) +#define EMMS femms +#elif defined(HAVE_MMX) +#define EMMS emms +#endif + +#ifndef EMMS +#define EMMS +#endif + +#if defined(CORE2) || defined(PENTIUM4) +#define movapd movaps +#endif + +#define BRANCH .byte 0x3e +#define NOBRANCH .byte 0x2e +#define PADDING .byte 0x66; +#define HALT hlt + +#ifndef COMPLEX +#ifdef XDOUBLE +#define LOCAL_BUFFER_SIZE QLOCAL_BUFFER_SIZE +#elif defined DOUBLE +#define LOCAL_BUFFER_SIZE DLOCAL_BUFFER_SIZE +#else +#define LOCAL_BUFFER_SIZE SLOCAL_BUFFER_SIZE +#endif +#else +#ifdef XDOUBLE +#define LOCAL_BUFFER_SIZE XLOCAL_BUFFER_SIZE +#elif defined DOUBLE +#define LOCAL_BUFFER_SIZE ZLOCAL_BUFFER_SIZE +#else +#define LOCAL_BUFFER_SIZE CLOCAL_BUFFER_SIZE +#endif +#endif + +#if defined(OS_WINDOWS) +#if LOCAL_BUFFER_SIZE > 16384 +#define STACK_TOUCHING \ + movl $0, 4096 * 4(%esp);\ + movl $0, 4096 * 3(%esp);\ + movl $0, 4096 * 2(%esp);\ + movl $0, 4096 * 1(%esp); +#elif LOCAL_BUFFER_SIZE > 12288 +#define STACK_TOUCHING \ + movl $0, 4096 * 3(%esp);\ + movl $0, 4096 * 2(%esp);\ + movl $0, 4096 * 1(%esp); +#elif LOCAL_BUFFER_SIZE > 8192 +#define STACK_TOUCHING \ + movl $0, 4096 * 2(%esp);\ + movl $0, 4096 * 1(%esp); +#elif LOCAL_BUFFER_SIZE > 4096 +#define STACK_TOUCHING \ + movl $0, 4096 * 1(%esp); +#else +#define STACK_TOUCHING +#endif +#else +#define STACK_TOUCHING +#endif + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif + +#if defined(F_INTERFACE_PATHSCALE) || defined(F_INTERFACE_OPEN64) +#define RETURN_BY_STRUCT +#elif defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) +#define RETURN_BY_COMPLEX +#else +#define RETURN_BY_STACK +#endif + +#ifdef OS_DARWIN +#define PROLOGUE .text;.align 5; .globl REALNAME; REALNAME: +#define EPILOGUE .subsections_via_symbols +#define PROFCODE +#endif + +#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INERIX) +#define SAVEREGISTERS \ + subl $32, %esp;\ + movups %xmm6, 0(%esp);\ + movups %xmm7, 16(%esp) + +#define RESTOREREGISTERS \ + movups 0(%esp), %xmm6;\ + movups 16(%esp), %xmm7;\ + addl $32, %esp +#else +#define SAVEREGISTERS +#define RESTOREREGISTERS +#endif + +#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INERIX) +#define PROLOGUE \ + .text; \ + .align 16; \ + .globl REALNAME ;\ + .def REALNAME;.scl 2;.type 32;.endef; \ +REALNAME: + +#define PROFCODE + +#define EPILOGUE .end REALNAME +#endif + +#if defined(OS_LINUX) || defined(OS_FreeBSD) || defined(OS_NetBSD) || defined(__ELF__) +#define PROLOGUE \ + .text; \ + .align 16; \ + .globl REALNAME ;\ + .type REALNAME, @function; \ +REALNAME: + +#ifdef PROFILE +#define PROFCODE call mcount +#else +#define PROFCODE +#endif + +#define EPILOGUE .size REALNAME, .-REALNAME + +#endif + +#ifdef XDOUBLE +#define FLD fldt +#define FST fstpt +#define FSTU fstt +#define FMUL fmult +#define FADD faddt +#define FSUB fsubt +#define FSUBR fsubrt +#elif defined(DOUBLE) +#define FLD fldl +#define FST fstpl +#define FSTU fstl +#define FMUL fmull +#define FADD faddl +#define FSUB fsubl +#define FSUBR fsubrl +#else +#define FLD flds +#define FST fstps +#define FSTU fsts +#define FMUL fmuls +#define FADD fadds +#define FSUB fsubs +#define FSUBR fsubrs +#endif +#endif + +#ifdef C_SUN +#define ffreep fstp +#endif + +#ifdef __APPLE__ +#define ALIGN_2 .align 2 +#define ALIGN_3 .align 3 +#define ALIGN_4 .align 4 +#define ffreep fstp +#endif + +#ifndef ALIGN_2 +#define ALIGN_2 .align 4 +#endif + +#ifndef ALIGN_3 +#define ALIGN_3 .align 8 +#endif + +#ifndef ALIGN_4 +#define ALIGN_4 .align 16 +#endif + +#ifndef ALIGN_5 +#define ALIGN_5 .align 32 +#endif + +#ifndef ALIGN_6 +#define ALIGN_6 .align 64 +#endif diff --git a/common_x86_64.h b/common_x86_64.h new file mode 100644 index 0000000..53b7021 --- /dev/null +++ b/common_x86_64.h @@ -0,0 +1,451 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_X86 +#define COMMON_X86 + +#ifndef ASSEMBLER + +#ifdef C_SUN +#define __asm__ __asm +#define __volatile__ +#endif + +#ifdef HAVE_SSE2 +#define MB __asm__ __volatile__ ("mfence"); +#define WMB __asm__ __volatile__ ("sfence"); +#else +#define MB +#define WMB +#endif + +static void __inline blas_lock(volatile BLASULONG *address){ + + int ret; + + do { + while (*address) {YIELDING;}; + + __asm__ __volatile__( + "xchgl %0, %1\n" + : "=r"(ret), "=m"(*address) + : "0"(1), "m"(*address) + : "memory"); + + } while (ret); +} + +static __inline BLASULONG rpcc(void){ + BLASULONG a, d; + + __asm__ __volatile__ ("rdtsc" : "=a" (a), "=d" (d)); + + return ((BLASULONG)a + ((BLASULONG)d << 32)); +} + +#define RPCC64BIT + +static __inline BLASULONG getstackaddr(void){ + BLASULONG addr; + + __asm__ __volatile__ ("movq %%rsp, %0" + : "=r"(addr) : : "memory"); + + return addr; +} + +static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ + + __asm__ __volatile__("cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (op)); +} + +#define WHEREAMI + +static inline int WhereAmI(void){ + int eax, ebx, ecx, edx; + int apicid; + + cpuid(1, &eax, &ebx, &ecx, &edx); + apicid = BITMASK(ebx, 24, 0xff); + + return apicid; +} + +#ifdef CORE_BARCELONA +#define IFLUSH gotoblas_iflush() +#define IFLUSH_HALF gotoblas_iflush_half() +#endif + +#ifdef ENABLE_SSE_EXCEPTION + +#define IDEBUG_START \ +{ \ + unsigned int fp_sse_mode, new_fp_mode; \ + __asm__ __volatile__ ("stmxcsr %0" : "=m" (fp_sse_mode) : ); \ + new_fp_mode = fp_sse_mode & ~0xd00; \ + __asm__ __volatile__ ("ldmxcsr %0" : : "m" (new_fp_mode) ); + +#define IDEBUG_END \ + __asm__ __volatile__ ("ldmxcsr %0" : : "m" (fp_sse_mode) ); \ +} + +#endif + +#ifdef XDOUBLE +#define GET_IMAGE(res) __asm__ __volatile__("fstpt %0" : "=m"(res) : : "memory") +#elif defined(DOUBLE) +#define GET_IMAGE(res) __asm__ __volatile__("movsd %%xmm1, %0" : "=m"(res) : : "memory") +#else +#define GET_IMAGE(res) __asm__ __volatile__("movss %%xmm1, %0" : "=m"(res) : : "memory") +#endif + +#define GET_IMAGE_CANCEL + +#ifdef SMP +#ifdef USE64BITINT +static __inline blasint blas_quickdivide(blasint x, blasint y){ + return x / y; +} +#else +extern unsigned int blas_quick_divide_table[]; + +static __inline int blas_quickdivide(unsigned int x, unsigned int y){ + + unsigned int result; + + if (y <= 1) return x; + + y = blas_quick_divide_table[y]; + + __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); + + return result; +} +#endif +#endif + +#endif + +#ifndef PAGESIZE +#define PAGESIZE ( 4 << 10) +#endif +#define HUGE_PAGESIZE ( 2 << 20) + +#define BUFFER_SIZE (32 << 20) + +#define SEEK_ADDRESS + +#ifdef F_INTERFACE_G77 +#define RETURN_BY_STACK +#define NEED_F2CCONV +#endif + +#ifdef F_INTERFACE_G95 +#define RETURN_BY_PACKED +#endif + +#ifdef F_INTERFACE_GFORT +#ifdef OS_WINDOWS +#ifndef DOUBLE +#define RETURN_BY_REGS +#else +#define RETURN_BY_STACK +#endif +#else +#define RETURN_BY_PACKED +#endif +#endif + +#ifdef F_INTERFACE_INTEL +#define RETURN_BY_STACK +#endif + +#ifdef F_INTERFACE_FUJITSU +#define RETURN_BY_STACK +#endif + +#ifdef F_INTERFACE_PGI +#define RETURN_BY_STACK +#endif + +#ifdef F_INTERFACE_PATHSCALE +#define RETURN_BY_PACKED +#endif + +#ifdef F_INTERFACE_SUN +#define RETURN_BY_PACKED +#endif + +#ifdef ASSEMBLER + +#if defined(HAVE_3DNOW) +#define EMMS femms +#elif defined(HAVE_MMX) +#define EMMS emms +#endif + +#ifndef EMMS +#define EMMS +#endif + +#define BRANCH .byte 0x3e +#define NOBRANCH .byte 0x2e +#define PADDING .byte 0x66 + +#ifdef OS_WINDOWS +#define ARG1 %rcx +#define ARG2 %rdx +#define ARG3 %r8 +#define ARG4 %r9 +#else +#define ARG1 %rdi +#define ARG2 %rsi +#define ARG3 %rdx +#define ARG4 %rcx +#define ARG5 %r8 +#define ARG6 %r9 +#endif + +#ifndef COMPLEX +#ifdef XDOUBLE +#define LOCAL_BUFFER_SIZE QLOCAL_BUFFER_SIZE +#elif defined DOUBLE +#define LOCAL_BUFFER_SIZE DLOCAL_BUFFER_SIZE +#else +#define LOCAL_BUFFER_SIZE SLOCAL_BUFFER_SIZE +#endif +#else +#ifdef XDOUBLE +#define LOCAL_BUFFER_SIZE XLOCAL_BUFFER_SIZE +#elif defined DOUBLE +#define LOCAL_BUFFER_SIZE ZLOCAL_BUFFER_SIZE +#else +#define LOCAL_BUFFER_SIZE CLOCAL_BUFFER_SIZE +#endif +#endif + +#if defined(OS_WINDOWS) +#if LOCAL_BUFFER_SIZE > 16384 +#define STACK_TOUCHING \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif LOCAL_BUFFER_SIZE > 12288 +#define STACK_TOUCHING \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif LOCAL_BUFFER_SIZE > 8192 +#define STACK_TOUCHING \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif LOCAL_BUFFER_SIZE > 4096 +#define STACK_TOUCHING \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCHING +#endif +#else +#define STACK_TOUCHING +#endif + +#if defined(CORE2) +#define movapd movaps +#define andpd andps +#define movlpd movlps +#define movhpd movhps +#endif + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif + +#ifdef OS_DARWIN +#define PROLOGUE .text;.align 5; .globl REALNAME; REALNAME: +#define EPILOGUE .subsections_via_symbols +#define PROFCODE +#endif + +#ifdef OS_WINDOWS +#define SAVEREGISTERS \ + subq $256, %rsp;\ + movups %xmm6, 0(%rsp);\ + movups %xmm7, 16(%rsp);\ + movups %xmm8, 32(%rsp);\ + movups %xmm9, 48(%rsp);\ + movups %xmm10, 64(%rsp);\ + movups %xmm11, 80(%rsp);\ + movups %xmm12, 96(%rsp);\ + movups %xmm13, 112(%rsp);\ + movups %xmm14, 128(%rsp);\ + movups %xmm15, 144(%rsp) + +#define RESTOREREGISTERS \ + movups 0(%rsp), %xmm6;\ + movups 16(%rsp), %xmm7;\ + movups 32(%rsp), %xmm8;\ + movups 48(%rsp), %xmm9;\ + movups 64(%rsp), %xmm10;\ + movups 80(%rsp), %xmm11;\ + movups 96(%rsp), %xmm12;\ + movups 112(%rsp), %xmm13;\ + movups 128(%rsp), %xmm14;\ + movups 144(%rsp), %xmm15;\ + addq $256, %rsp +#else +#define SAVEREGISTERS +#define RESTOREREGISTERS +#endif + +#if defined(OS_WINDOWS) && !defined(C_PGI) +#define PROLOGUE \ + .text; \ + .align 16; \ + .globl REALNAME ;\ + .def REALNAME;.scl 2;.type 32;.endef; \ +REALNAME: + +#define PROFCODE + +#define EPILOGUE .end REALNAME +#endif + +#if defined(OS_LINUX) || defined(OS_FreeBSD) || defined(OS_NetBSD) || defined(__ELF__) || defined(C_PGI) +#define PROLOGUE \ + .text; \ + .align 512; \ + .globl REALNAME ;\ + .type REALNAME, @function; \ +REALNAME: + +#ifdef PROFILE +#define PROFCODE call *mcount@GOTPCREL(%rip) +#else +#define PROFCODE +#endif + +#define EPILOGUE .size REALNAME, .-REALNAME + +#endif + +#endif + +#ifdef XDOUBLE +#define FLD fldt +#define FST fstpt +#define MOVQ movq +#elif defined(DOUBLE) +#define FLD fldl +#define FST fstpl +#define FSTU fstl +#define FMUL fmull +#define FADD faddl +#define MOVSD movsd +#define MULSD mulsd +#define MULPD mulpd +#define CMPEQPD cmpeqpd +#define COMISD comisd +#define PSRLQ psrlq +#define ANDPD andpd +#define ADDPD addpd +#define ADDSD addsd +#define SUBPD subpd +#define SUBSD subsd +#define MOVQ movq +#define MOVUPD movupd +#define XORPD xorpd +#else +#define FLD flds +#define FST fstps +#define FSTU fsts +#define FMUL fmuls +#define FADD fadds +#define MOVSD movss +#define MULSD mulss +#define MULPD mulps +#define CMPEQPD cmpeqps +#define COMISD comiss +#define PSRLQ psrld +#define ANDPD andps +#define ADDPD addps +#define ADDSD addss +#define SUBPD subps +#define SUBSD subss +#define MOVQ movd +#define MOVUPD movups +#define XORPD xorps +#endif + +#define HALT hlt + +#ifdef OS_DARWIN +#define ALIGN_2 .align 2 +#define ALIGN_3 .align 3 +#define ALIGN_4 .align 4 +#define ffreep fstp +#endif + +#ifndef ALIGN_2 +#define ALIGN_2 .align 4 +#endif + +#ifndef ALIGN_3 +#define ALIGN_3 .align 8 +#endif + +#ifndef ALIGN_4 +#define ALIGN_4 .align 16 +#endif + +#ifndef ALIGN_5 +#define ALIGN_5 .align 32 +#endif + +#ifndef ALIGN_6 +#define ALIGN_6 .align 64 +#endif + +#endif diff --git a/common_z.h b/common_z.h new file mode 100644 index 0000000..8832cac --- /dev/null +++ b/common_z.h @@ -0,0 +1,611 @@ +#ifndef COMMON_Z_H +#define COMMON_Z_H + +#ifndef DYNAMIC_ARCH + +#define ZAMAX_K zamax_k +#define ZAMIN_K zamin_k +#define ZMAX_K zmax_k +#define ZMIN_K zmin_k +#define IZAMAX_K izamax_k +#define IZAMIN_K izamin_k +#define IZMAX_K izmax_k +#define IZMIN_K izmin_k +#define ZASUM_K zasum_k +#define ZAXPYU_K zaxpy_k +#define ZAXPYC_K zaxpyc_k +#define ZCOPY_K zcopy_k +#define ZDOTU_K zdotu_k +#define ZDOTC_K zdotc_k +#define ZNRM2_K znrm2_k +#define ZSCAL_K zscal_k +#define ZSWAP_K zswap_k +#define ZROT_K zdrot_k + +#define ZGEMV_N zgemv_n +#define ZGEMV_T zgemv_t +#define ZGEMV_R zgemv_r +#define ZGEMV_C zgemv_c +#define ZGEMV_O zgemv_o +#define ZGEMV_U zgemv_u +#define ZGEMV_S zgemv_s +#define ZGEMV_D zgemv_d + +#define ZGERU_K zgeru_k +#define ZGERC_K zgerc_k +#define ZGERV_K zgerv_k +#define ZGERD_K zgerd_k + +#define ZSYMV_U zsymv_U +#define ZSYMV_L zsymv_L +#define ZHEMV_U zhemv_U +#define ZHEMV_L zhemv_L +#define ZHEMV_V zhemv_V +#define ZHEMV_M zhemv_M + +#define ZSYMV_THREAD_U zsymv_thread_U +#define ZSYMV_THREAD_L zsymv_thread_L +#define ZHEMV_THREAD_U zhemv_thread_U +#define ZHEMV_THREAD_L zhemv_thread_L +#define ZHEMV_THREAD_V zhemv_thread_V +#define ZHEMV_THREAD_M zhemv_thread_M + +#define ZGEMM_ONCOPY zgemm_oncopy +#define ZGEMM_OTCOPY zgemm_otcopy + +#if ZGEMM_DEFAULT_UNROLL_M == ZGEMM_DEFAULT_UNROLL_N +#define ZGEMM_INCOPY zgemm_oncopy +#define ZGEMM_ITCOPY zgemm_otcopy +#else +#define ZGEMM_INCOPY zgemm_incopy +#define ZGEMM_ITCOPY zgemm_itcopy +#endif + +#define ZTRMM_OUNUCOPY ztrmm_ounucopy +#define ZTRMM_OUNNCOPY ztrmm_ounncopy +#define ZTRMM_OUTUCOPY ztrmm_outucopy +#define ZTRMM_OUTNCOPY ztrmm_outncopy +#define ZTRMM_OLNUCOPY ztrmm_olnucopy +#define ZTRMM_OLNNCOPY ztrmm_olnncopy +#define ZTRMM_OLTUCOPY ztrmm_oltucopy +#define ZTRMM_OLTNCOPY ztrmm_oltncopy + +#define ZTRSM_OUNUCOPY ztrsm_ounucopy +#define ZTRSM_OUNNCOPY ztrsm_ounncopy +#define ZTRSM_OUTUCOPY ztrsm_outucopy +#define ZTRSM_OUTNCOPY ztrsm_outncopy +#define ZTRSM_OLNUCOPY ztrsm_olnucopy +#define ZTRSM_OLNNCOPY ztrsm_olnncopy +#define ZTRSM_OLTUCOPY ztrsm_oltucopy +#define ZTRSM_OLTNCOPY ztrsm_oltncopy + +#if ZGEMM_DEFAULT_UNROLL_M == ZGEMM_DEFAULT_UNROLL_N +#define ZTRMM_IUNUCOPY ztrmm_ounucopy +#define ZTRMM_IUNNCOPY ztrmm_ounncopy +#define ZTRMM_IUTUCOPY ztrmm_outucopy +#define ZTRMM_IUTNCOPY ztrmm_outncopy +#define ZTRMM_ILNUCOPY ztrmm_olnucopy +#define ZTRMM_ILNNCOPY ztrmm_olnncopy +#define ZTRMM_ILTUCOPY ztrmm_oltucopy +#define ZTRMM_ILTNCOPY ztrmm_oltncopy + +#define ZTRSM_IUNUCOPY ztrsm_ounucopy +#define ZTRSM_IUNNCOPY ztrsm_ounncopy +#define ZTRSM_IUTUCOPY ztrsm_outucopy +#define ZTRSM_IUTNCOPY ztrsm_outncopy +#define ZTRSM_ILNUCOPY ztrsm_olnucopy +#define ZTRSM_ILNNCOPY ztrsm_olnncopy +#define ZTRSM_ILTUCOPY ztrsm_oltucopy +#define ZTRSM_ILTNCOPY ztrsm_oltncopy +#else +#define ZTRMM_IUNUCOPY ztrmm_iunucopy +#define ZTRMM_IUNNCOPY ztrmm_iunncopy +#define ZTRMM_IUTUCOPY ztrmm_iutucopy +#define ZTRMM_IUTNCOPY ztrmm_iutncopy +#define ZTRMM_ILNUCOPY ztrmm_ilnucopy +#define ZTRMM_ILNNCOPY ztrmm_ilnncopy +#define ZTRMM_ILTUCOPY ztrmm_iltucopy +#define ZTRMM_ILTNCOPY ztrmm_iltncopy + +#define ZTRSM_IUNUCOPY ztrsm_iunucopy +#define ZTRSM_IUNNCOPY ztrsm_iunncopy +#define ZTRSM_IUTUCOPY ztrsm_iutucopy +#define ZTRSM_IUTNCOPY ztrsm_iutncopy +#define ZTRSM_ILNUCOPY ztrsm_ilnucopy +#define ZTRSM_ILNNCOPY ztrsm_ilnncopy +#define ZTRSM_ILTUCOPY ztrsm_iltucopy +#define ZTRSM_ILTNCOPY ztrsm_iltncopy +#endif + +#define ZGEMM_BETA zgemm_beta + +#define ZGEMM_KERNEL_N zgemm_kernel_n +#define ZGEMM_KERNEL_L zgemm_kernel_l +#define ZGEMM_KERNEL_R zgemm_kernel_r +#define ZGEMM_KERNEL_B zgemm_kernel_b + +#define ZTRMM_KERNEL_LN ztrmm_kernel_LN +#define ZTRMM_KERNEL_LT ztrmm_kernel_LT +#define ZTRMM_KERNEL_LR ztrmm_kernel_LR +#define ZTRMM_KERNEL_LC ztrmm_kernel_LC +#define ZTRMM_KERNEL_RN ztrmm_kernel_RN +#define ZTRMM_KERNEL_RT ztrmm_kernel_RT +#define ZTRMM_KERNEL_RR ztrmm_kernel_RR +#define ZTRMM_KERNEL_RC ztrmm_kernel_RC + +#define ZTRSM_KERNEL_LN ztrsm_kernel_LN +#define ZTRSM_KERNEL_LT ztrsm_kernel_LT +#define ZTRSM_KERNEL_LR ztrsm_kernel_LR +#define ZTRSM_KERNEL_LC ztrsm_kernel_LC +#define ZTRSM_KERNEL_RN ztrsm_kernel_RN +#define ZTRSM_KERNEL_RT ztrsm_kernel_RT +#define ZTRSM_KERNEL_RR ztrsm_kernel_RR +#define ZTRSM_KERNEL_RC ztrsm_kernel_RC + +#define ZSYMM_OUTCOPY zsymm_outcopy +#define ZSYMM_OLTCOPY zsymm_oltcopy +#if ZGEMM_DEFAULT_UNROLL_M == ZGEMM_DEFAULT_UNROLL_N +#define ZSYMM_IUTCOPY zsymm_outcopy +#define ZSYMM_ILTCOPY zsymm_oltcopy +#else +#define ZSYMM_IUTCOPY zsymm_iutcopy +#define ZSYMM_ILTCOPY zsymm_iltcopy +#endif + +#define ZHEMM_OUTCOPY zhemm_outcopy +#define ZHEMM_OLTCOPY zhemm_oltcopy +#if ZGEMM_DEFAULT_UNROLL_M == ZGEMM_DEFAULT_UNROLL_N +#define ZHEMM_IUTCOPY zhemm_outcopy +#define ZHEMM_ILTCOPY zhemm_oltcopy +#else +#define ZHEMM_IUTCOPY zhemm_iutcopy +#define ZHEMM_ILTCOPY zhemm_iltcopy +#endif + +#define ZGEMM3M_ONCOPYB zgemm3m_oncopyb +#define ZGEMM3M_ONCOPYR zgemm3m_oncopyr +#define ZGEMM3M_ONCOPYI zgemm3m_oncopyi +#define ZGEMM3M_OTCOPYB zgemm3m_otcopyb +#define ZGEMM3M_OTCOPYR zgemm3m_otcopyr +#define ZGEMM3M_OTCOPYI zgemm3m_otcopyi + +#define ZGEMM3M_INCOPYB zgemm3m_incopyb +#define ZGEMM3M_INCOPYR zgemm3m_incopyr +#define ZGEMM3M_INCOPYI zgemm3m_incopyi +#define ZGEMM3M_ITCOPYB zgemm3m_itcopyb +#define ZGEMM3M_ITCOPYR zgemm3m_itcopyr +#define ZGEMM3M_ITCOPYI zgemm3m_itcopyi + +#define ZSYMM3M_ILCOPYB zsymm3m_ilcopyb +#define ZSYMM3M_IUCOPYB zsymm3m_iucopyb +#define ZSYMM3M_ILCOPYR zsymm3m_ilcopyr +#define ZSYMM3M_IUCOPYR zsymm3m_iucopyr +#define ZSYMM3M_ILCOPYI zsymm3m_ilcopyi +#define ZSYMM3M_IUCOPYI zsymm3m_iucopyi + +#define ZSYMM3M_OLCOPYB zsymm3m_olcopyb +#define ZSYMM3M_OUCOPYB zsymm3m_oucopyb +#define ZSYMM3M_OLCOPYR zsymm3m_olcopyr +#define ZSYMM3M_OUCOPYR zsymm3m_oucopyr +#define ZSYMM3M_OLCOPYI zsymm3m_olcopyi +#define ZSYMM3M_OUCOPYI zsymm3m_oucopyi + +#define ZHEMM3M_ILCOPYB zhemm3m_ilcopyb +#define ZHEMM3M_IUCOPYB zhemm3m_iucopyb +#define ZHEMM3M_ILCOPYR zhemm3m_ilcopyr +#define ZHEMM3M_IUCOPYR zhemm3m_iucopyr +#define ZHEMM3M_ILCOPYI zhemm3m_ilcopyi +#define ZHEMM3M_IUCOPYI zhemm3m_iucopyi + +#define ZHEMM3M_OLCOPYB zhemm3m_olcopyb +#define ZHEMM3M_OUCOPYB zhemm3m_oucopyb +#define ZHEMM3M_OLCOPYR zhemm3m_olcopyr +#define ZHEMM3M_OUCOPYR zhemm3m_oucopyr +#define ZHEMM3M_OLCOPYI zhemm3m_olcopyi +#define ZHEMM3M_OUCOPYI zhemm3m_oucopyi + +#define ZGEMM3M_KERNEL zgemm3m_kernel + +#define ZNEG_TCOPY zneg_tcopy +#define ZLASWP_NCOPY zlaswp_ncopy + +#else + +#define ZAMAX_K gotoblas -> zamax_k +#define ZAMIN_K gotoblas -> zamin_k +#define ZMAX_K gotoblas -> zmax_k +#define ZMIN_K gotoblas -> zmin_k +#define IZAMAX_K gotoblas -> izamax_k +#define IZAMIN_K gotoblas -> izamin_k +#define IZMAX_K gotoblas -> izmax_k +#define IZMIN_K gotoblas -> izmin_k +#define ZASUM_K gotoblas -> zasum_k +#define ZAXPYU_K gotoblas -> zaxpy_k +#define ZAXPYC_K gotoblas -> zaxpyc_k +#define ZCOPY_K gotoblas -> zcopy_k +#define ZDOTU_K gotoblas -> zdotu_k +#define ZDOTC_K gotoblas -> zdotc_k +#define ZNRM2_K gotoblas -> znrm2_k +#define ZSCAL_K gotoblas -> zscal_k +#define ZSWAP_K gotoblas -> zswap_k +#define ZROT_K gotoblas -> zdrot_k + +#define ZGEMV_N gotoblas -> zgemv_n +#define ZGEMV_T gotoblas -> zgemv_t +#define ZGEMV_R gotoblas -> zgemv_r +#define ZGEMV_C gotoblas -> zgemv_c +#define ZGEMV_O gotoblas -> zgemv_o +#define ZGEMV_U gotoblas -> zgemv_u +#define ZGEMV_S gotoblas -> zgemv_s +#define ZGEMV_D gotoblas -> zgemv_d + +#define ZGERU_K gotoblas -> zgeru_k +#define ZGERC_K gotoblas -> zgerc_k +#define ZGERV_K gotoblas -> zgerv_k +#define ZGERD_K gotoblas -> zgerd_k + +#define ZSYMV_U gotoblas -> zsymv_U +#define ZSYMV_L gotoblas -> zsymv_L +#define ZHEMV_U gotoblas -> zhemv_U +#define ZHEMV_L gotoblas -> zhemv_L +#define ZHEMV_V gotoblas -> zhemv_V +#define ZHEMV_M gotoblas -> zhemv_M + +#define ZSYMV_THREAD_U zsymv_thread_U +#define ZSYMV_THREAD_L zsymv_thread_L +#define ZHEMV_THREAD_U zhemv_thread_U +#define ZHEMV_THREAD_L zhemv_thread_L +#define ZHEMV_THREAD_V zhemv_thread_V +#define ZHEMV_THREAD_M zhemv_thread_M + +#define ZGEMM_ONCOPY gotoblas -> zgemm_oncopy +#define ZGEMM_OTCOPY gotoblas -> zgemm_otcopy +#define ZGEMM_INCOPY gotoblas -> zgemm_incopy +#define ZGEMM_ITCOPY gotoblas -> zgemm_itcopy + +#define ZTRMM_OUNUCOPY gotoblas -> ztrmm_ounucopy +#define ZTRMM_OUTUCOPY gotoblas -> ztrmm_outucopy +#define ZTRMM_OLNUCOPY gotoblas -> ztrmm_olnucopy +#define ZTRMM_OLTUCOPY gotoblas -> ztrmm_oltucopy +#define ZTRSM_OUNUCOPY gotoblas -> ztrsm_ounucopy +#define ZTRSM_OUTUCOPY gotoblas -> ztrsm_outucopy +#define ZTRSM_OLNUCOPY gotoblas -> ztrsm_olnucopy +#define ZTRSM_OLTUCOPY gotoblas -> ztrsm_oltucopy + +#define ZTRMM_IUNUCOPY gotoblas -> ztrmm_iunucopy +#define ZTRMM_IUTUCOPY gotoblas -> ztrmm_iutucopy +#define ZTRMM_ILNUCOPY gotoblas -> ztrmm_ilnucopy +#define ZTRMM_ILTUCOPY gotoblas -> ztrmm_iltucopy +#define ZTRSM_IUNUCOPY gotoblas -> ztrsm_iunucopy +#define ZTRSM_IUTUCOPY gotoblas -> ztrsm_iutucopy +#define ZTRSM_ILNUCOPY gotoblas -> ztrsm_ilnucopy +#define ZTRSM_ILTUCOPY gotoblas -> ztrsm_iltucopy + +#define ZTRMM_OUNNCOPY gotoblas -> ztrmm_ounncopy +#define ZTRMM_OUTNCOPY gotoblas -> ztrmm_outncopy +#define ZTRMM_OLNNCOPY gotoblas -> ztrmm_olnncopy +#define ZTRMM_OLTNCOPY gotoblas -> ztrmm_oltncopy +#define ZTRSM_OUNNCOPY gotoblas -> ztrsm_ounncopy +#define ZTRSM_OUTNCOPY gotoblas -> ztrsm_outncopy +#define ZTRSM_OLNNCOPY gotoblas -> ztrsm_olnncopy +#define ZTRSM_OLTNCOPY gotoblas -> ztrsm_oltncopy + +#define ZTRMM_IUNNCOPY gotoblas -> ztrmm_iunncopy +#define ZTRMM_IUTNCOPY gotoblas -> ztrmm_iutncopy +#define ZTRMM_ILNNCOPY gotoblas -> ztrmm_ilnncopy +#define ZTRMM_ILTNCOPY gotoblas -> ztrmm_iltncopy +#define ZTRSM_IUNNCOPY gotoblas -> ztrsm_iunncopy +#define ZTRSM_IUTNCOPY gotoblas -> ztrsm_iutncopy +#define ZTRSM_ILNNCOPY gotoblas -> ztrsm_ilnncopy +#define ZTRSM_ILTNCOPY gotoblas -> ztrsm_iltncopy + +#define ZGEMM_BETA gotoblas -> zgemm_beta +#define ZGEMM_KERNEL_N gotoblas -> zgemm_kernel_n +#define ZGEMM_KERNEL_L gotoblas -> zgemm_kernel_l +#define ZGEMM_KERNEL_R gotoblas -> zgemm_kernel_r +#define ZGEMM_KERNEL_B gotoblas -> zgemm_kernel_b + +#define ZTRMM_KERNEL_LN gotoblas -> ztrmm_kernel_LN +#define ZTRMM_KERNEL_LT gotoblas -> ztrmm_kernel_LT +#define ZTRMM_KERNEL_LR gotoblas -> ztrmm_kernel_LR +#define ZTRMM_KERNEL_LC gotoblas -> ztrmm_kernel_LC +#define ZTRMM_KERNEL_RN gotoblas -> ztrmm_kernel_RN +#define ZTRMM_KERNEL_RT gotoblas -> ztrmm_kernel_RT +#define ZTRMM_KERNEL_RR gotoblas -> ztrmm_kernel_RR +#define ZTRMM_KERNEL_RC gotoblas -> ztrmm_kernel_RC + +#define ZTRSM_KERNEL_LN gotoblas -> ztrsm_kernel_LN +#define ZTRSM_KERNEL_LT gotoblas -> ztrsm_kernel_LT +#define ZTRSM_KERNEL_LR gotoblas -> ztrsm_kernel_LR +#define ZTRSM_KERNEL_LC gotoblas -> ztrsm_kernel_LC +#define ZTRSM_KERNEL_RN gotoblas -> ztrsm_kernel_RN +#define ZTRSM_KERNEL_RT gotoblas -> ztrsm_kernel_RT +#define ZTRSM_KERNEL_RR gotoblas -> ztrsm_kernel_RR +#define ZTRSM_KERNEL_RC gotoblas -> ztrsm_kernel_RC + +#define ZSYMM_IUTCOPY gotoblas -> zsymm_iutcopy +#define ZSYMM_ILTCOPY gotoblas -> zsymm_iltcopy +#define ZSYMM_OUTCOPY gotoblas -> zsymm_outcopy +#define ZSYMM_OLTCOPY gotoblas -> zsymm_oltcopy + +#define ZHEMM_OUTCOPY gotoblas -> zhemm_outcopy +#define ZHEMM_OLTCOPY gotoblas -> zhemm_oltcopy +#define ZHEMM_IUTCOPY gotoblas -> zhemm_iutcopy +#define ZHEMM_ILTCOPY gotoblas -> zhemm_iltcopy + +#define ZGEMM3M_ONCOPYB gotoblas -> zgemm3m_oncopyb +#define ZGEMM3M_ONCOPYR gotoblas -> zgemm3m_oncopyr +#define ZGEMM3M_ONCOPYI gotoblas -> zgemm3m_oncopyi +#define ZGEMM3M_OTCOPYB gotoblas -> zgemm3m_otcopyb +#define ZGEMM3M_OTCOPYR gotoblas -> zgemm3m_otcopyr +#define ZGEMM3M_OTCOPYI gotoblas -> zgemm3m_otcopyi + +#define ZGEMM3M_INCOPYB gotoblas -> zgemm3m_incopyb +#define ZGEMM3M_INCOPYR gotoblas -> zgemm3m_incopyr +#define ZGEMM3M_INCOPYI gotoblas -> zgemm3m_incopyi +#define ZGEMM3M_ITCOPYB gotoblas -> zgemm3m_itcopyb +#define ZGEMM3M_ITCOPYR gotoblas -> zgemm3m_itcopyr +#define ZGEMM3M_ITCOPYI gotoblas -> zgemm3m_itcopyi + +#define ZSYMM3M_ILCOPYB gotoblas -> zsymm3m_ilcopyb +#define ZSYMM3M_IUCOPYB gotoblas -> zsymm3m_iucopyb +#define ZSYMM3M_ILCOPYR gotoblas -> zsymm3m_ilcopyr +#define ZSYMM3M_IUCOPYR gotoblas -> zsymm3m_iucopyr +#define ZSYMM3M_ILCOPYI gotoblas -> zsymm3m_ilcopyi +#define ZSYMM3M_IUCOPYI gotoblas -> zsymm3m_iucopyi + +#define ZSYMM3M_OLCOPYB gotoblas -> zsymm3m_olcopyb +#define ZSYMM3M_OUCOPYB gotoblas -> zsymm3m_oucopyb +#define ZSYMM3M_OLCOPYR gotoblas -> zsymm3m_olcopyr +#define ZSYMM3M_OUCOPYR gotoblas -> zsymm3m_oucopyr +#define ZSYMM3M_OLCOPYI gotoblas -> zsymm3m_olcopyi +#define ZSYMM3M_OUCOPYI gotoblas -> zsymm3m_oucopyi + +#define ZHEMM3M_ILCOPYB gotoblas -> zhemm3m_ilcopyb +#define ZHEMM3M_IUCOPYB gotoblas -> zhemm3m_iucopyb +#define ZHEMM3M_ILCOPYR gotoblas -> zhemm3m_ilcopyr +#define ZHEMM3M_IUCOPYR gotoblas -> zhemm3m_iucopyr +#define ZHEMM3M_ILCOPYI gotoblas -> zhemm3m_ilcopyi +#define ZHEMM3M_IUCOPYI gotoblas -> zhemm3m_iucopyi + +#define ZHEMM3M_OLCOPYB gotoblas -> zhemm3m_olcopyb +#define ZHEMM3M_OUCOPYB gotoblas -> zhemm3m_oucopyb +#define ZHEMM3M_OLCOPYR gotoblas -> zhemm3m_olcopyr +#define ZHEMM3M_OUCOPYR gotoblas -> zhemm3m_oucopyr +#define ZHEMM3M_OLCOPYI gotoblas -> zhemm3m_olcopyi +#define ZHEMM3M_OUCOPYI gotoblas -> zhemm3m_oucopyi + +#define ZGEMM3M_KERNEL gotoblas -> zgemm3m_kernel + +#define ZNEG_TCOPY gotoblas -> zneg_tcopy +#define ZLASWP_NCOPY gotoblas -> zlaswp_ncopy + +#endif + +#define ZGEMM_NN zgemm_nn +#define ZGEMM_CN zgemm_cn +#define ZGEMM_TN zgemm_tn +#define ZGEMM_NC zgemm_nc +#define ZGEMM_NT zgemm_nt +#define ZGEMM_CC zgemm_cc +#define ZGEMM_CT zgemm_ct +#define ZGEMM_TC zgemm_tc +#define ZGEMM_TT zgemm_tt +#define ZGEMM_NR zgemm_nr +#define ZGEMM_TR zgemm_tr +#define ZGEMM_CR zgemm_cr +#define ZGEMM_RN zgemm_rn +#define ZGEMM_RT zgemm_rt +#define ZGEMM_RC zgemm_rc +#define ZGEMM_RR zgemm_rr + +#define ZSYMM_LU zsymm_LU +#define ZSYMM_LL zsymm_LL +#define ZSYMM_RU zsymm_RU +#define ZSYMM_RL zsymm_RL + +#define ZHEMM_LU zhemm_LU +#define ZHEMM_LL zhemm_LL +#define ZHEMM_RU zhemm_RU +#define ZHEMM_RL zhemm_RL + +#define ZSYRK_UN zsyrk_UN +#define ZSYRK_UT zsyrk_UT +#define ZSYRK_LN zsyrk_LN +#define ZSYRK_LT zsyrk_LT +#define ZSYRK_UR zsyrk_UN +#define ZSYRK_UC zsyrk_UT +#define ZSYRK_LR zsyrk_LN +#define ZSYRK_LC zsyrk_LT + +#define ZSYRK_KERNEL_U zsyrk_kernel_U +#define ZSYRK_KERNEL_L zsyrk_kernel_L + +#define ZHERK_UN zherk_UN +#define ZHERK_LN zherk_LN +#define ZHERK_UC zherk_UC +#define ZHERK_LC zherk_LC + +#define ZHER2K_UN zher2k_UN +#define ZHER2K_LN zher2k_LN +#define ZHER2K_UC zher2k_UC +#define ZHER2K_LC zher2k_LC + +#define ZSYR2K_UN zsyr2k_UN +#define ZSYR2K_UT zsyr2k_UT +#define ZSYR2K_LN zsyr2k_LN +#define ZSYR2K_LT zsyr2k_LT +#define ZSYR2K_UR zsyr2k_UN +#define ZSYR2K_UC zsyr2k_UT +#define ZSYR2K_LR zsyr2k_LN +#define ZSYR2K_LC zsyr2k_LT + +#define ZSYR2K_KERNEL_U zsyr2k_kernel_U +#define ZSYR2K_KERNEL_L zsyr2k_kernel_L + +#define ZTRMM_LNUU ztrmm_LNUU +#define ZTRMM_LNUN ztrmm_LNUN +#define ZTRMM_LNLU ztrmm_LNLU +#define ZTRMM_LNLN ztrmm_LNLN +#define ZTRMM_LTUU ztrmm_LTUU +#define ZTRMM_LTUN ztrmm_LTUN +#define ZTRMM_LTLU ztrmm_LTLU +#define ZTRMM_LTLN ztrmm_LTLN +#define ZTRMM_LRUU ztrmm_LRUU +#define ZTRMM_LRUN ztrmm_LRUN +#define ZTRMM_LRLU ztrmm_LRLU +#define ZTRMM_LRLN ztrmm_LRLN +#define ZTRMM_LCUU ztrmm_LCUU +#define ZTRMM_LCUN ztrmm_LCUN +#define ZTRMM_LCLU ztrmm_LCLU +#define ZTRMM_LCLN ztrmm_LCLN +#define ZTRMM_RNUU ztrmm_RNUU +#define ZTRMM_RNUN ztrmm_RNUN +#define ZTRMM_RNLU ztrmm_RNLU +#define ZTRMM_RNLN ztrmm_RNLN +#define ZTRMM_RTUU ztrmm_RTUU +#define ZTRMM_RTUN ztrmm_RTUN +#define ZTRMM_RTLU ztrmm_RTLU +#define ZTRMM_RTLN ztrmm_RTLN +#define ZTRMM_RRUU ztrmm_RRUU +#define ZTRMM_RRUN ztrmm_RRUN +#define ZTRMM_RRLU ztrmm_RRLU +#define ZTRMM_RRLN ztrmm_RRLN +#define ZTRMM_RCUU ztrmm_RCUU +#define ZTRMM_RCUN ztrmm_RCUN +#define ZTRMM_RCLU ztrmm_RCLU +#define ZTRMM_RCLN ztrmm_RCLN + +#define ZTRSM_LNUU ztrsm_LNUU +#define ZTRSM_LNUN ztrsm_LNUN +#define ZTRSM_LNLU ztrsm_LNLU +#define ZTRSM_LNLN ztrsm_LNLN +#define ZTRSM_LTUU ztrsm_LTUU +#define ZTRSM_LTUN ztrsm_LTUN +#define ZTRSM_LTLU ztrsm_LTLU +#define ZTRSM_LTLN ztrsm_LTLN +#define ZTRSM_LRUU ztrsm_LRUU +#define ZTRSM_LRUN ztrsm_LRUN +#define ZTRSM_LRLU ztrsm_LRLU +#define ZTRSM_LRLN ztrsm_LRLN +#define ZTRSM_LCUU ztrsm_LCUU +#define ZTRSM_LCUN ztrsm_LCUN +#define ZTRSM_LCLU ztrsm_LCLU +#define ZTRSM_LCLN ztrsm_LCLN +#define ZTRSM_RNUU ztrsm_RNUU +#define ZTRSM_RNUN ztrsm_RNUN +#define ZTRSM_RNLU ztrsm_RNLU +#define ZTRSM_RNLN ztrsm_RNLN +#define ZTRSM_RTUU ztrsm_RTUU +#define ZTRSM_RTUN ztrsm_RTUN +#define ZTRSM_RTLU ztrsm_RTLU +#define ZTRSM_RTLN ztrsm_RTLN +#define ZTRSM_RRUU ztrsm_RRUU +#define ZTRSM_RRUN ztrsm_RRUN +#define ZTRSM_RRLU ztrsm_RRLU +#define ZTRSM_RRLN ztrsm_RRLN +#define ZTRSM_RCUU ztrsm_RCUU +#define ZTRSM_RCUN ztrsm_RCUN +#define ZTRSM_RCLU ztrsm_RCLU +#define ZTRSM_RCLN ztrsm_RCLN + +#define ZGEMM_THREAD_NN zgemm_thread_nn +#define ZGEMM_THREAD_CN zgemm_thread_cn +#define ZGEMM_THREAD_TN zgemm_thread_tn +#define ZGEMM_THREAD_NC zgemm_thread_nc +#define ZGEMM_THREAD_NT zgemm_thread_nt +#define ZGEMM_THREAD_CC zgemm_thread_cc +#define ZGEMM_THREAD_CT zgemm_thread_ct +#define ZGEMM_THREAD_TC zgemm_thread_tc +#define ZGEMM_THREAD_TT zgemm_thread_tt +#define ZGEMM_THREAD_NR zgemm_thread_nr +#define ZGEMM_THREAD_TR zgemm_thread_tr +#define ZGEMM_THREAD_CR zgemm_thread_cr +#define ZGEMM_THREAD_RN zgemm_thread_rn +#define ZGEMM_THREAD_RT zgemm_thread_rt +#define ZGEMM_THREAD_RC zgemm_thread_rc +#define ZGEMM_THREAD_RR zgemm_thread_rr + +#define ZSYMM_THREAD_LU zsymm_thread_LU +#define ZSYMM_THREAD_LL zsymm_thread_LL +#define ZSYMM_THREAD_RU zsymm_thread_RU +#define ZSYMM_THREAD_RL zsymm_thread_RL + +#define ZHEMM_THREAD_LU zhemm_thread_LU +#define ZHEMM_THREAD_LL zhemm_thread_LL +#define ZHEMM_THREAD_RU zhemm_thread_RU +#define ZHEMM_THREAD_RL zhemm_thread_RL + +#define ZSYRK_THREAD_UN zsyrk_thread_UN +#define ZSYRK_THREAD_UT zsyrk_thread_UT +#define ZSYRK_THREAD_LN zsyrk_thread_LN +#define ZSYRK_THREAD_LT zsyrk_thread_LT +#define ZSYRK_THREAD_UR zsyrk_thread_UN +#define ZSYRK_THREAD_UC zsyrk_thread_UT +#define ZSYRK_THREAD_LR zsyrk_thread_LN +#define ZSYRK_THREAD_LC zsyrk_thread_LT + +#define ZHERK_THREAD_UN zherk_thread_UN +#define ZHERK_THREAD_UT zherk_thread_UT +#define ZHERK_THREAD_LN zherk_thread_LN +#define ZHERK_THREAD_LT zherk_thread_LT +#define ZHERK_THREAD_UR zherk_thread_UR +#define ZHERK_THREAD_UC zherk_thread_UC +#define ZHERK_THREAD_LR zherk_thread_LR +#define ZHERK_THREAD_LC zherk_thread_LC + +#define ZGEMM3M_NN zgemm3m_nn +#define ZGEMM3M_CN zgemm3m_cn +#define ZGEMM3M_TN zgemm3m_tn +#define ZGEMM3M_NC zgemm3m_nc +#define ZGEMM3M_NT zgemm3m_nt +#define ZGEMM3M_CC zgemm3m_cc +#define ZGEMM3M_CT zgemm3m_ct +#define ZGEMM3M_TC zgemm3m_tc +#define ZGEMM3M_TT zgemm3m_tt +#define ZGEMM3M_NR zgemm3m_nr +#define ZGEMM3M_TR zgemm3m_tr +#define ZGEMM3M_CR zgemm3m_cr +#define ZGEMM3M_RN zgemm3m_rn +#define ZGEMM3M_RT zgemm3m_rt +#define ZGEMM3M_RC zgemm3m_rc +#define ZGEMM3M_RR zgemm3m_rr + +#define ZGEMM3M_THREAD_NN zgemm3m_thread_nn +#define ZGEMM3M_THREAD_CN zgemm3m_thread_cn +#define ZGEMM3M_THREAD_TN zgemm3m_thread_tn +#define ZGEMM3M_THREAD_NC zgemm3m_thread_nc +#define ZGEMM3M_THREAD_NT zgemm3m_thread_nt +#define ZGEMM3M_THREAD_CC zgemm3m_thread_cc +#define ZGEMM3M_THREAD_CT zgemm3m_thread_ct +#define ZGEMM3M_THREAD_TC zgemm3m_thread_tc +#define ZGEMM3M_THREAD_TT zgemm3m_thread_tt +#define ZGEMM3M_THREAD_NR zgemm3m_thread_nr +#define ZGEMM3M_THREAD_TR zgemm3m_thread_tr +#define ZGEMM3M_THREAD_CR zgemm3m_thread_cr +#define ZGEMM3M_THREAD_RN zgemm3m_thread_rn +#define ZGEMM3M_THREAD_RT zgemm3m_thread_rt +#define ZGEMM3M_THREAD_RC zgemm3m_thread_rc +#define ZGEMM3M_THREAD_RR zgemm3m_thread_rr + +#define ZSYMM3M_LU zsymm3m_LU +#define ZSYMM3M_LL zsymm3m_LL +#define ZSYMM3M_RU zsymm3m_RU +#define ZSYMM3M_RL zsymm3m_RL + +#define ZSYMM3M_THREAD_LU zsymm3m_thread_LU +#define ZSYMM3M_THREAD_LL zsymm3m_thread_LL +#define ZSYMM3M_THREAD_RU zsymm3m_thread_RU +#define ZSYMM3M_THREAD_RL zsymm3m_thread_RL + +#define ZHEMM3M_LU zhemm3m_LU +#define ZHEMM3M_LL zhemm3m_LL +#define ZHEMM3M_RU zhemm3m_RU +#define ZHEMM3M_RL zhemm3m_RL + +#define ZHEMM3M_THREAD_LU zhemm3m_thread_LU +#define ZHEMM3M_THREAD_LL zhemm3m_thread_LL +#define ZHEMM3M_THREAD_RU zhemm3m_thread_RU +#define ZHEMM3M_THREAD_RL zhemm3m_thread_RL + +#endif diff --git a/cpuid.S b/cpuid.S new file mode 100644 index 0000000..3f7bf5f --- /dev/null +++ b/cpuid.S @@ -0,0 +1,67 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#if defined(__APPLE__) && defined(__i386__) + +/* Quick hack for Darwin/x86 */ + + .text + .globl _cpuid +_cpuid: + pushl %esi + pushl %ebx + + movl 12(%esp), %eax + cpuid + + movl 16(%esp), %esi + movl %eax, (%esi) + movl 20(%esp), %esi + movl %ebx, (%esi) + movl 24(%esp), %esi + movl %ecx, (%esi) + movl 28(%esp), %esi + movl %edx, (%esi) + + popl %ebx + popl %esi + ret + + .subsections_via_symbols + +#endif diff --git a/cpuid.h b/cpuid.h new file mode 100644 index 0000000..665ede0 --- /dev/null +++ b/cpuid.h @@ -0,0 +1,191 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef CPUID_H +#define CPUID_H + +#define VENDOR_INTEL 1 +#define VENDOR_UMC 2 +#define VENDOR_AMD 3 +#define VENDOR_CYRIX 4 +#define VENDOR_NEXGEN 5 +#define VENDOR_CENTAUR 6 +#define VENDOR_RISE 7 +#define VENDOR_SIS 8 +#define VENDOR_TRANSMETA 9 +#define VENDOR_NSC 10 +#define VENDOR_UNKNOWN 99 + +#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) + +#define FAMILY_80486 4 +#define FAMILY_P5 5 +#define FAMILY_P6 6 +#define FAMILY_PM 7 +#define FAMILY_IA64 8 + +#if defined(__i386__) || defined(__x86_64__) +#define GET_EXFAMILY 1 +#define GET_EXMODEL 2 +#define GET_TYPE 3 +#define GET_FAMILY 4 +#define GET_MODEL 5 +#define GET_APICID 6 +#define GET_LCOUNT 7 +#define GET_CHUNKS 8 +#define GET_STEPPING 9 +#define GET_BLANDID 10 +#define GET_FEATURE 11 +#define GET_NUMSHARE 12 +#define GET_NUMCORES 13 +#endif + +#ifdef __ia64__ +#define GET_ARCHREV 1 +#define GET_FAMILY 2 +#define GET_MODEL 3 +#define GET_REVISION 4 +#define GET_NUMBER 5 +#endif + +#define CORE_UNKNOWN 0 +#define CORE_80486 1 +#define CORE_P5 2 +#define CORE_P6 3 +#define CORE_KATMAI 4 +#define CORE_COPPERMINE 5 +#define CORE_NORTHWOOD 6 +#define CORE_PRESCOTT 7 +#define CORE_BANIAS 8 +#define CORE_ATHLON 9 +#define CORE_OPTERON 10 +#define CORE_BARCELONA 11 +#define CORE_VIAC3 12 +#define CORE_YONAH 13 +#define CORE_CORE2 14 +#define CORE_PENRYN 15 +#define CORE_DUNNINGTON 16 +#define CORE_NEHALEM 17 +#define CORE_ATOM 18 +#define CORE_NANO 19 + +#define HAVE_SSE (1 << 0) +#define HAVE_SSE2 (1 << 1) +#define HAVE_SSE3 (1 << 2) +#define HAVE_SSSE3 (1 << 3) +#define HAVE_SSE4_1 (1 << 4) +#define HAVE_SSE4_2 (1 << 5) +#define HAVE_SSE4A (1 << 6) +#define HAVE_SSE5 (1 << 7) +#define HAVE_MMX (1 << 8) +#define HAVE_3DNOW (1 << 9) +#define HAVE_3DNOWEX (1 << 10) +#define HAVE_CMOV (1 << 11) +#define HAVE_PSE (1 << 12) +#define HAVE_CFLUSH (1 << 13) +#define HAVE_HIT (1 << 14) +#define HAVE_MISALIGNSSE (1 << 15) +#define HAVE_128BITFPU (1 << 16) +#define HAVE_FASTMOVU (1 << 17) + +#define CACHE_INFO_L1_I 1 +#define CACHE_INFO_L1_D 2 +#define CACHE_INFO_L2 3 +#define CACHE_INFO_L3 4 +#define CACHE_INFO_L1_ITB 5 +#define CACHE_INFO_L1_DTB 6 +#define CACHE_INFO_L1_LITB 7 +#define CACHE_INFO_L1_LDTB 8 +#define CACHE_INFO_L2_ITB 9 +#define CACHE_INFO_L2_DTB 10 +#define CACHE_INFO_L2_LITB 11 +#define CACHE_INFO_L2_LDTB 12 + +typedef struct { + int size; + int associative; + int linesize; + int shared; +} cache_info_t; + +#define CPUTYPE_UNKNOWN 0 +#define CPUTYPE_INTEL_UNKNOWN 1 +#define CPUTYPE_UMC_UNKNOWN 2 +#define CPUTYPE_AMD_UNKNOWN 3 +#define CPUTYPE_CYRIX_UNKNOWN 4 +#define CPUTYPE_NEXGEN_UNKNOWN 5 +#define CPUTYPE_CENTAUR_UNKNOWN 6 +#define CPUTYPE_RISE_UNKNOWN 7 +#define CPUTYPE_SIS_UNKNOWN 8 +#define CPUTYPE_TRANSMETA_UNKNOWN 9 +#define CPUTYPE_NSC_UNKNOWN 10 + +#define CPUTYPE_80386 11 +#define CPUTYPE_80486 12 +#define CPUTYPE_PENTIUM 13 +#define CPUTYPE_PENTIUM2 14 +#define CPUTYPE_PENTIUM3 15 +#define CPUTYPE_PENTIUMM 16 +#define CPUTYPE_PENTIUM4 17 +#define CPUTYPE_CORE2 18 +#define CPUTYPE_PENRYN 19 +#define CPUTYPE_DUNNINGTON 20 +#define CPUTYPE_NEHALEM 21 +#define CPUTYPE_ATOM 22 +#define CPUTYPE_ITANIUM 23 +#define CPUTYPE_ITANIUM2 24 +#define CPUTYPE_AMD5X86 25 +#define CPUTYPE_AMDK6 26 +#define CPUTYPE_ATHLON 27 +#define CPUTYPE_DURON 28 +#define CPUTYPE_OPTERON 29 +#define CPUTYPE_BARCELONA 30 +#define CPUTYPE_SHANGHAI 31 +#define CPUTYPE_ISTANBUL 32 +#define CPUTYPE_CYRIX5X86 33 +#define CPUTYPE_CYRIXM1 34 +#define CPUTYPE_CYRIXM2 35 +#define CPUTYPE_NEXGENNX586 36 +#define CPUTYPE_CENTAURC6 37 +#define CPUTYPE_RISEMP6 38 +#define CPUTYPE_SYS55X 39 +#define CPUTYPE_CRUSOETM3X 40 +#define CPUTYPE_NSGEODE 41 +#define CPUTYPE_VIAC3 42 +#define CPUTYPE_NANO 43 +#endif diff --git a/cpuid_alpha.c b/cpuid_alpha.c new file mode 100644 index 0000000..ca786d5 --- /dev/null +++ b/cpuid_alpha.c @@ -0,0 +1,101 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#if defined(__alpha) && defined(__DECC) +#include +#endif + +int implver(void){ + int arch; + +#ifndef __DECC + asm __volatile__("implver %0" : "=r"(arch) : : "memory"); +#else + arch = asm("implver %v0"); +#endif + return arch; +} + +void get_architecture(void){ + printf("ALPHA"); +} + +void get_subarchitecture(void){ + printf("ev%d", implver() + 4); +} + +void get_subdirname(void){ + printf("alpha"); +} + +void get_cpuconfig(void){ + printf("#define EV%d\n", implver() + 4); + + switch (implver()){ + case 0: + printf("#define L1_DATA_SIZE 16384\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 2097152\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_ENTRIES 32\n"); + printf("#define DTB_SIZE 8192\n"); + break; + + case 1: + printf("#define L1_DATA_SIZE 16384\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 2097152\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_ENTRIES 64\n"); + printf("#define DTB_SIZE 8192\n"); + break; + + case 2: + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 4194304\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_ENTRIES 64\n"); + printf("#define DTB_SIZE 8192\n"); + break; + } +} + +void get_libname(void){ + printf("ev%d\n", implver() + 4); +} diff --git a/cpuid_ia64.c b/cpuid_ia64.c new file mode 100644 index 0000000..7f0fa6d --- /dev/null +++ b/cpuid_ia64.c @@ -0,0 +1,138 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include +#include "cpuid.h" + +#ifdef __ECC +#include +#endif + +static inline unsigned long cpuid(unsigned long regnum){ + unsigned long value; + +#ifdef __ECC + value = __getIndReg(_IA64_REG_INDR_CPUID, regnum); +#else + asm ("mov %0=cpuid[%r1]" : "=r"(value) : "rO"(regnum)); +#endif + + return value; +} + +int have_cpuid(void){ return 1;} + +int get_vendor(void){ + unsigned long cpuid0, cpuid1; + char vendor[18]; + + cpuid0 = cpuid(0); + cpuid1 = cpuid(1); + + *(unsigned long *)(&vendor[0]) = cpuid0; + *(unsigned long *)(&vendor[8]) = cpuid1; + vendor[17] = (char)0; + + if (!strcmp(vendor, "GenuineIntel")) return VENDOR_INTEL; + + return VENDOR_UNKNOWN; +} + +int get_cputype(int gettype){ + unsigned long cpuid3; + + cpuid3 = cpuid(3); + + switch (gettype) { + case GET_ARCHREV : + return BITMASK(cpuid3, 32, 0xff); + case GET_FAMILY : + return BITMASK(cpuid3, 24, 0xff); + case GET_MODEL : + return BITMASK(cpuid3, 16, 0xff); + case GET_REVISION : + return BITMASK(cpuid3, 8, 0xff); + case GET_NUMBER : + return BITMASK(cpuid3, 0, 0xff); + } + + return 0; +} + +char *get_cpunamechar(void){ + if (get_cputype(GET_FAMILY) == 0x07) return "ITANIUM"; + if (get_cputype(GET_FAMILY) == 0x1f) return "ITANIUM2"; + if (get_cputype(GET_FAMILY) == 0x20) return "ITANIUM2"; + + return "UNKNOWN"; +} + +char *get_libname(void){ + if (get_cputype(GET_FAMILY) == 0x07) { printf("itanium"); return NULL;} + if (get_cputype(GET_FAMILY) == 0x1f) { printf("itanium2"); return NULL;} + if (get_cputype(GET_FAMILY) == 0x20) { printf("itanium2"); return NULL;} + + printf("UNKNOWN"); + + return NULL; +} + +void get_architecture(void){ + printf("IA64"); +} + +void get_subarchitecture(void){ + printf("%s", get_cpunamechar()); +} + +void get_subdirname(void){ + printf("ia64"); +} + +void get_cpuconfig(void){ + printf("#define %s\n", get_cpunamechar()); + printf("#define L1_DATA_SIZE 262144\n"); + printf("#define L1_DATA_LINESIZE 128\n"); + printf("#define L2_SIZE 1572864\n"); + printf("#define L2_LINESIZE 128\n"); + printf("#define DTB_SIZE 16384\n"); + printf("#define DTB_ENTRIES 128\n"); +} + diff --git a/cpuid_mips.c b/cpuid_mips.c new file mode 100644 index 0000000..99e4bcc --- /dev/null +++ b/cpuid_mips.c @@ -0,0 +1,68 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +void get_architecture(void){ + printf("MIPS64"); +} + +void get_subarchitecture(void){ + printf("SICORTEX"); +} + +void get_subdirname(void){ + printf("mips64"); +} + +void get_cpuconfig(void){ + printf("#define SICORTEX\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 512488\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_ENTRIES 32\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 8\n"); +} + +void get_libname(void){ +#ifdef __mips64 + printf("mips64\n"); +#else + printf("mips32\n"); +#endif +} diff --git a/cpuid_power.c b/cpuid_power.c new file mode 100644 index 0000000..46ff30a --- /dev/null +++ b/cpuid_power.c @@ -0,0 +1,190 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#ifdef _AIX +#include +#endif +#ifdef __APPLE__ +#include +#include +#include +#include +#endif + +#define CPUTYPE_UNKNOWN 0 +#define CPUTYPE_POWER3 1 +#define CPUTYPE_POWER4 2 +#define CPUTYPE_PPC970 3 +#define CPUTYPE_POWER5 4 +#define CPUTYPE_POWER6 5 +#define CPUTYPE_CELL 6 +#define CPUTYPE_PPCG4 7 + +char *cpuname[] = { + "UNKNOWN", + "POWER3", + "POWER4", + "PPC970", + "POWER5", + "POWER6", + "CELL", + "PPCG4", +}; + +char *lowercpuname[] = { + "unknown", + "power3", + "power4", + "ppc970", + "power5", + "power6", + "cell", + "ppcg4", +}; + +char *corename[] = { + "UNKNOWN", + "POWER3", + "POWER4", + "POWER4", + "POWER4", + "POWER6", + "CELL", + "PPCG4", +}; + +int detect(void){ + +#ifdef linux + FILE *infile; + char buffer[512], *p; + + p = (char *)NULL; + infile = fopen("/proc/cpuinfo", "r"); + while (fgets(buffer, sizeof(buffer), infile)){ + if (!strncmp("cpu", buffer, 3)){ + p = strchr(buffer, ':') + 2; +#if 0 + fprintf(stderr, "%s\n", p); +#endif + break; + } + } + + fclose(infile); + + if (!strncasecmp(p, "POWER3", 6)) return CPUTYPE_POWER3; + if (!strncasecmp(p, "POWER4", 6)) return CPUTYPE_POWER4; + if (!strncasecmp(p, "PPC970", 6)) return CPUTYPE_PPC970; + if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5; + if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; + if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; + if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; + + return CPUTYPE_UNKNOWN; +#endif + +#ifdef _AIX + return CPUTYPE_POWER5; +#endif + +#ifdef __APPLE__ + host_basic_info_data_t hostInfo; + mach_msg_type_number_t infoCount; + + infoCount = HOST_BASIC_INFO_COUNT; + host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo, &infoCount); + + if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7450) return CPUTYPE_PPCG4; + if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_970) return CPUTYPE_PPC970; + + return CPUTYPE_PPC970; +#endif +} + +void get_architecture(void){ + printf("POWER"); +} + +void get_subdirname(void){ + printf("power"); +} + + +void get_subarchitecture(void){ + printf("%s", cpuname[detect()]); +} + +void get_cpuconfig(void){ +#if 0 +#ifdef _AIX + struct vminfo info; +#endif +#endif + + printf("#define %s\n", cpuname[detect()]); + printf("#define CORE_%s\n", corename[detect()]); + + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 128\n"); + printf("#define L2_SIZE 524288\n"); + printf("#define L2_LINESIZE 128 \n"); + printf("#define DTB_ENTRIES 128\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 8\n"); + +#if 0 +#ifdef _AIX + if (vmgetinfo(&info, VMINFO, 0) == 0) { + if ((info.lgpg_size >> 20) >= 1024) { + printf("#define ALLOC_HUGETLB\n"); + } + } +#endif +#endif + +} + +void get_libname(void){ + printf("%s", lowercpuname[detect()]); +} + +char *get_corename(void){ + return cpuname[detect()]; +} diff --git a/cpuid_sparc.c b/cpuid_sparc.c new file mode 100644 index 0000000..b65c69d --- /dev/null +++ b/cpuid_sparc.c @@ -0,0 +1,58 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +void get_architecture(void){ + printf("SPARC"); +} + +void get_subarchitecture(void){ + printf("v9"); +} + +void get_subdirname(void){ + printf("sparc"); +} + +void get_cpuconfig(void){ + printf("#define V9\n"); + printf("#define DTB_ENTRIES 32\n"); +} + +void get_libname(void){ + printf("v9\n"); +} diff --git a/cpuid_x86.c b/cpuid_x86.c new file mode 100644 index 0000000..2887544 --- /dev/null +++ b/cpuid_x86.c @@ -0,0 +1,1453 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "cpuid.h" + +#ifndef CPUIDEMU + +#if defined(__APPLE__) && defined(__i386__) +void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx); +#else +static inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ + __asm__ __volatile__ + ("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); + +} +#endif + +#else + +typedef struct { + unsigned int id, a, b, c, d; +} idlist_t; + +typedef struct { + char *vendor; + char *name; + int start, stop; +} vendor_t; + +extern idlist_t idlist[]; +extern vendor_t vendor[]; + +static int cv = VENDOR; + +void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx){ + + static int current = 0; + + int start = vendor[cv].start; + int stop = vendor[cv].stop; + int count = stop - start; + + if ((current < start) || (current > stop)) current = start; + + while ((count > 0) && (idlist[current].id != op)) { + + current ++; + if (current > stop) current = start; + count --; + + } + + *eax = idlist[current].a; + *ebx = idlist[current].b; + *ecx = idlist[current].c; + *edx = idlist[current].d; +} + +#endif + +static inline int have_cpuid(void){ + int eax, ebx, ecx, edx; + + cpuid(0, &eax, &ebx, &ecx, &edx); + return eax; +} + +static inline int have_excpuid(void){ + int eax, ebx, ecx, edx; + + cpuid(0x80000000, &eax, &ebx, &ecx, &edx); + return eax & 0xffff; +} + +int get_vendor(void){ + int eax, ebx, ecx, edx; + char vendor[13]; + + cpuid(0, &eax, &ebx, &ecx, &edx); + + *(int *)(&vendor[0]) = ebx; + *(int *)(&vendor[4]) = edx; + *(int *)(&vendor[8]) = ecx; + vendor[12] = (char)0; + + if (!strcmp(vendor, "GenuineIntel")) return VENDOR_INTEL; + if (!strcmp(vendor, " UMC UMC UMC")) return VENDOR_UMC; + if (!strcmp(vendor, "AuthenticAMD")) return VENDOR_AMD; + if (!strcmp(vendor, "CyrixInstead")) return VENDOR_CYRIX; + if (!strcmp(vendor, "NexGenDriven")) return VENDOR_NEXGEN; + if (!strcmp(vendor, "CentaurHauls")) return VENDOR_CENTAUR; + if (!strcmp(vendor, "RiseRiseRise")) return VENDOR_RISE; + if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS; + if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA; + if (!strcmp(vendor, "Geode by NSC")) return VENDOR_NSC; + + if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; + + return VENDOR_UNKNOWN; +} + +int get_cputype(int gettype){ + int eax, ebx, ecx, edx; + int extend_family, family; + int extend_model, model; + int type, stepping; + int feature = 0; + + cpuid(1, &eax, &ebx, &ecx, &edx); + + switch (gettype) { + case GET_EXFAMILY : + return BITMASK(eax, 20, 0xff); + case GET_EXMODEL : + return BITMASK(eax, 16, 0x0f); + case GET_TYPE : + return BITMASK(eax, 12, 0x03); + case GET_FAMILY : + return BITMASK(eax, 8, 0x0f); + case GET_MODEL : + return BITMASK(eax, 4, 0x0f); + case GET_APICID : + return BITMASK(ebx, 24, 0x0f); + case GET_LCOUNT : + return BITMASK(ebx, 16, 0x0f); + case GET_CHUNKS : + return BITMASK(ebx, 8, 0x0f); + case GET_STEPPING : + return BITMASK(eax, 0, 0x0f); + case GET_BLANDID : + return BITMASK(ebx, 0, 0xff); + case GET_NUMSHARE : + if (have_cpuid() < 4) return 0; + cpuid(4, &eax, &ebx, &ecx, &edx); + return BITMASK(eax, 14, 0xfff); + case GET_NUMCORES : + if (have_cpuid() < 4) return 0; + cpuid(4, &eax, &ebx, &ecx, &edx); + return BITMASK(eax, 26, 0x3f); + + case GET_FEATURE : + if ((edx & (1 << 3)) != 0) feature |= HAVE_PSE; + if ((edx & (1 << 15)) != 0) feature |= HAVE_CMOV; + if ((edx & (1 << 19)) != 0) feature |= HAVE_CFLUSH; + if ((edx & (1 << 23)) != 0) feature |= HAVE_MMX; + if ((edx & (1 << 25)) != 0) feature |= HAVE_SSE; + if ((edx & (1 << 26)) != 0) feature |= HAVE_SSE2; + if ((edx & (1 << 27)) != 0) { + if (BITMASK(ebx, 16, 0x0f) > 0) feature |= HAVE_HIT; + } + if ((ecx & (1 << 0)) != 0) feature |= HAVE_SSE3; + if ((ecx & (1 << 9)) != 0) feature |= HAVE_SSSE3; + if ((ecx & (1 << 19)) != 0) feature |= HAVE_SSE4_1; + if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; + + if (have_excpuid() >= 0x01) { + cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + if ((ecx & (1 << 6)) != 0) feature |= HAVE_SSE4A; + if ((ecx & (1 << 7)) != 0) feature |= HAVE_MISALIGNSSE; + if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX; + if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW; + } + + if (have_excpuid() >= 0x1a) { + cpuid(0x8000001a, &eax, &ebx, &ecx, &edx); + if ((eax & (1 << 0)) != 0) feature |= HAVE_128BITFPU; + if ((eax & (1 << 1)) != 0) feature |= HAVE_FASTMOVU; + } + + } + return feature; +} + +int get_cacheinfo(int type, cache_info_t *cacheinfo){ + int eax, ebx, ecx, edx, cpuid_level; + int info[15]; + int i; + cache_info_t LC1, LD1, L2, L3, + ITB, DTB, LITB, LDTB, + L2ITB, L2DTB, L2LITB, L2LDTB; + + LC1.size = 0; LC1.associative = 0; LC1.linesize = 0; LC1.shared = 0; + LD1.size = 0; LD1.associative = 0; LD1.linesize = 0; LD1.shared = 0; + L2.size = 0; L2.associative = 0; L2.linesize = 0; L2.shared = 0; + L3.size = 0; L3.associative = 0; L3.linesize = 0; L3.shared = 0; + ITB.size = 0; ITB.associative = 0; ITB.linesize = 0; ITB.shared = 0; + DTB.size = 0; DTB.associative = 0; DTB.linesize = 0; DTB.shared = 0; + LITB.size = 0; LITB.associative = 0; LITB.linesize = 0; LITB.shared = 0; + LDTB.size = 0; LDTB.associative = 0; LDTB.linesize = 0; LDTB.shared = 0; + L2ITB.size = 0; L2ITB.associative = 0; L2ITB.linesize = 0; L2ITB.shared = 0; + L2DTB.size = 0; L2DTB.associative = 0; L2DTB.linesize = 0; L2DTB.shared = 0; + L2LITB.size = 0; L2LITB.associative = 0; L2LITB.linesize = 0; L2LITB.shared = 0; + L2LDTB.size = 0; L2LDTB.associative = 0; L2LDTB.linesize = 0; L2LDTB.shared = 0; + + cpuid(0, &cpuid_level, &ebx, &ecx, &edx); + + if (cpuid_level > 1) { + + cpuid(2, &eax, &ebx, &ecx, &edx); + + info[ 0] = BITMASK(eax, 8, 0xff); + info[ 1] = BITMASK(eax, 16, 0xff); + info[ 2] = BITMASK(eax, 24, 0xff); + + info[ 3] = BITMASK(ebx, 0, 0xff); + info[ 4] = BITMASK(ebx, 8, 0xff); + info[ 5] = BITMASK(ebx, 16, 0xff); + info[ 6] = BITMASK(ebx, 24, 0xff); + + info[ 7] = BITMASK(ecx, 0, 0xff); + info[ 8] = BITMASK(ecx, 8, 0xff); + info[ 9] = BITMASK(ecx, 16, 0xff); + info[10] = BITMASK(ecx, 24, 0xff); + + info[11] = BITMASK(edx, 0, 0xff); + info[12] = BITMASK(edx, 8, 0xff); + info[13] = BITMASK(edx, 16, 0xff); + info[14] = BITMASK(edx, 24, 0xff); + + for (i = 0; i < 15; i++){ + + switch (info[i]){ + + /* This table is from http://www.sandpile.org/ia32/cpuid.htm */ + + case 0x01 : + ITB.size = 4; + ITB.associative = 4; + ITB.linesize = 32; + break; + case 0x02 : + LITB.size = 4096; + LITB.associative = 0; + LITB.linesize = 2; + break; + case 0x03 : + DTB.size = 4; + DTB.associative = 4; + DTB.linesize = 64; + break; + case 0x04 : + LDTB.size = 4096; + LDTB.associative = 4; + LDTB.linesize = 8; + break; + case 0x05 : + LDTB.size = 4096; + LDTB.associative = 4; + LDTB.linesize = 32; + break; + case 0x06 : + LC1.size = 8; + LC1.associative = 4; + LC1.linesize = 32; + break; + case 0x08 : + LC1.size = 16; + LC1.associative = 4; + LC1.linesize = 32; + break; + case 0x09 : + LC1.size = 32; + LC1.associative = 4; + LC1.linesize = 64; + break; + case 0x0a : + LD1.size = 8; + LD1.associative = 2; + LD1.linesize = 32; + break; + case 0x0c : + LD1.size = 16; + LD1.associative = 4; + LD1.linesize = 32; + break; + case 0x0d : + LD1.size = 16; + LD1.associative = 4; + LD1.linesize = 64; + break; + case 0x0e : + LD1.size = 24; + LD1.associative = 6; + LD1.linesize = 64; + break; + case 0x10 : + LD1.size = 16; + LD1.associative = 4; + LD1.linesize = 32; + break; + case 0x15 : + LC1.size = 16; + LC1.associative = 4; + LC1.linesize = 32; + break; + case 0x1a : + L2.size = 96; + L2.associative = 6; + L2.linesize = 64; + break; + case 0x21 : + L2.size = 256; + L2.associative = 8; + L2.linesize = 64; + break; + case 0x22 : + L3.size = 512; + L3.associative = 4; + L3.linesize = 64; + break; + case 0x23 : + L3.size = 1024; + L3.associative = 8; + L3.linesize = 64; + break; + case 0x25 : + L3.size = 2048; + L3.associative = 8; + L3.linesize = 64; + break; + case 0x29 : + L3.size = 4096; + L3.associative = 8; + L3.linesize = 64; + break; + case 0x2c : + LD1.size = 32; + LD1.associative = 8; + LD1.linesize = 64; + break; + case 0x30 : + LC1.size = 32; + LC1.associative = 8; + LC1.linesize = 64; + break; + case 0x39 : + L2.size = 128; + L2.associative = 4; + L2.linesize = 64; + break; + case 0x3a : + L2.size = 192; + L2.associative = 6; + L2.linesize = 64; + break; + case 0x3b : + L2.size = 128; + L2.associative = 2; + L2.linesize = 64; + break; + case 0x3c : + L2.size = 256; + L2.associative = 4; + L2.linesize = 64; + break; + case 0x3d : + L2.size = 384; + L2.associative = 6; + L2.linesize = 64; + break; + case 0x3e : + L2.size = 512; + L2.associative = 4; + L2.linesize = 64; + break; + case 0x41 : + L2.size = 128; + L2.associative = 4; + L2.linesize = 32; + break; + case 0x42 : + L2.size = 256; + L2.associative = 4; + L2.linesize = 32; + break; + case 0x43 : + L2.size = 512; + L2.associative = 4; + L2.linesize = 32; + break; + case 0x44 : + L2.size = 1024; + L2.associative = 4; + L2.linesize = 32; + break; + case 0x45 : + L2.size = 2048; + L2.associative = 4; + L2.linesize = 32; + break; + case 0x46 : + L3.size = 4096; + L3.associative = 4; + L3.linesize = 64; + break; + case 0x47 : + L3.size = 8192; + L3.associative = 8; + L3.linesize = 64; + break; + case 0x48 : + L2.size = 3184; + L2.associative = 12; + L2.linesize = 64; + break; + case 0x49 : + if ((get_cputype(GET_FAMILY) == 0x0f) && (get_cputype(GET_MODEL) == 0x06)) { + L3.size = 4096; + L3.associative = 16; + L3.linesize = 64; + } else { + L2.size = 4096; + L2.associative = 16; + L2.linesize = 64; + } + break; + case 0x4a : + L3.size = 6144; + L3.associative = 12; + L3.linesize = 64; + break; + case 0x4b : + L3.size = 8192; + L3.associative = 16; + L3.linesize = 64; + break; + case 0x4c : + L3.size = 12280; + L3.associative = 12; + L3.linesize = 64; + break; + case 0x4d : + L3.size = 16384; + L3.associative = 16; + L3.linesize = 64; + break; + case 0x4e : + L2.size = 6144; + L2.associative = 24; + L2.linesize = 64; + break; + case 0x4f : + ITB.size = 4; + ITB.associative = 0; + ITB.linesize = 32; + break; + case 0x50 : + ITB.size = 4; + ITB.associative = 0; + ITB.linesize = 64; + LITB.size = 4096; + LITB.associative = 0; + LITB.linesize = 64; + LITB.shared = 1; + break; + case 0x51 : + ITB.size = 4; + ITB.associative = 0; + ITB.linesize = 128; + LITB.size = 4096; + LITB.associative = 0; + LITB.linesize = 128; + LITB.shared = 1; + break; + case 0x52 : + ITB.size = 4; + ITB.associative = 0; + ITB.linesize = 256; + LITB.size = 4096; + LITB.associative = 0; + LITB.linesize = 256; + LITB.shared = 1; + break; + case 0x55 : + LITB.size = 4096; + LITB.associative = 0; + LITB.linesize = 7; + LITB.shared = 1; + break; + case 0x56 : + LDTB.size = 4096; + LDTB.associative = 4; + LDTB.linesize = 16; + break; + case 0x57 : + LDTB.size = 4096; + LDTB.associative = 4; + LDTB.linesize = 16; + break; + case 0x5b : + DTB.size = 4; + DTB.associative = 0; + DTB.linesize = 64; + LDTB.size = 4096; + LDTB.associative = 0; + LDTB.linesize = 64; + LDTB.shared = 1; + break; + case 0x5c : + DTB.size = 4; + DTB.associative = 0; + DTB.linesize = 128; + LDTB.size = 4096; + LDTB.associative = 0; + LDTB.linesize = 128; + LDTB.shared = 1; + break; + case 0x5d : + DTB.size = 4; + DTB.associative = 0; + DTB.linesize = 256; + LDTB.size = 4096; + LDTB.associative = 0; + LDTB.linesize = 256; + LDTB.shared = 1; + break; + case 0x60 : + LD1.size = 16; + LD1.associative = 8; + LD1.linesize = 64; + break; + case 0x66 : + LD1.size = 8; + LD1.associative = 4; + LD1.linesize = 64; + break; + case 0x67 : + LD1.size = 16; + LD1.associative = 4; + LD1.linesize = 64; + break; + case 0x68 : + LD1.size = 32; + LD1.associative = 4; + LD1.linesize = 64; + break; + case 0x70 : + LC1.size = 12; + LC1.associative = 8; + break; + case 0x71 : + LC1.size = 16; + LC1.associative = 8; + break; + case 0x72 : + LC1.size = 32; + LC1.associative = 8; + break; + case 0x73 : + LC1.size = 64; + LC1.associative = 8; + break; + case 0x77 : + LC1.size = 16; + LC1.associative = 4; + LC1.linesize = 64; + break; + case 0x78 : + L2.size = 1024; + L2.associative = 4; + L2.linesize = 64; + break; + case 0x79 : + L2.size = 128; + L2.associative = 8; + L2.linesize = 64; + break; + case 0x7a : + L2.size = 256; + L2.associative = 8; + L2.linesize = 64; + break; + case 0x7b : + L2.size = 512; + L2.associative = 8; + L2.linesize = 64; + break; + case 0x7c : + L2.size = 1024; + L2.associative = 8; + L2.linesize = 64; + break; + case 0x7d : + L2.size = 2048; + L2.associative = 8; + L2.linesize = 64; + break; + case 0x7e : + L2.size = 256; + L2.associative = 8; + L2.linesize = 128; + break; + case 0x7f : + L2.size = 512; + L2.associative = 2; + L2.linesize = 64; + break; + case 0x81 : + L2.size = 128; + L2.associative = 8; + L2.linesize = 32; + break; + case 0x82 : + L2.size = 256; + L2.associative = 8; + L2.linesize = 32; + break; + case 0x83 : + L2.size = 512; + L2.associative = 8; + L2.linesize = 32; + break; + case 0x84 : + L2.size = 1024; + L2.associative = 8; + L2.linesize = 32; + break; + case 0x85 : + L2.size = 2048; + L2.associative = 8; + L2.linesize = 32; + break; + case 0x86 : + L2.size = 512; + L2.associative = 4; + L2.linesize = 64; + break; + case 0x87 : + L2.size = 1024; + L2.associative = 8; + L2.linesize = 64; + break; + case 0x88 : + L3.size = 2048; + L3.associative = 4; + L3.linesize = 64; + break; + case 0x89 : + L3.size = 4096; + L3.associative = 4; + L3.linesize = 64; + break; + case 0x8a : + L3.size = 8192; + L3.associative = 4; + L3.linesize = 64; + break; + case 0x8d : + L3.size = 3096; + L3.associative = 12; + L3.linesize = 128; + break; + case 0x90 : + ITB.size = 4; + ITB.associative = 0; + ITB.linesize = 64; + break; + case 0x96 : + DTB.size = 4; + DTB.associative = 0; + DTB.linesize = 32; + break; + case 0x9b : + L2DTB.size = 4; + L2DTB.associative = 0; + L2DTB.linesize = 96; + break; + case 0xb0 : + ITB.size = 4; + ITB.associative = 4; + ITB.linesize = 128; + break; + case 0xb1 : + LITB.size = 4096; + LITB.associative = 4; + LITB.linesize = 4; + break; + case 0xb2 : + ITB.size = 4; + ITB.associative = 4; + ITB.linesize = 64; + break; + case 0xb3 : + DTB.size = 4; + DTB.associative = 4; + DTB.linesize = 128; + break; + case 0xb4 : + DTB.size = 4; + DTB.associative = 4; + DTB.linesize = 256; + break; + case 0xba : + DTB.size = 4; + DTB.associative = 4; + DTB.linesize = 64; + break; + case 0xd0 : + L3.size = 512; + L3.associative = 4; + L3.linesize = 64; + break; + case 0xd1 : + L3.size = 1024; + L3.associative = 4; + L3.linesize = 64; + break; + case 0xd2 : + L3.size = 2048; + L3.associative = 4; + L3.linesize = 64; + break; + case 0xd6 : + L3.size = 1024; + L3.associative = 8; + L3.linesize = 64; + break; + case 0xd7 : + L3.size = 2048; + L3.associative = 8; + L3.linesize = 64; + break; + case 0xd8 : + L3.size = 4096; + L3.associative = 8; + L3.linesize = 64; + break; + case 0xdc : + L3.size = 2048; + L3.associative = 12; + L3.linesize = 64; + break; + case 0xdd : + L3.size = 4096; + L3.associative = 12; + L3.linesize = 64; + break; + case 0xde : + L3.size = 8192; + L3.associative = 12; + L3.linesize = 64; + break; + case 0xe2 : + L3.size = 2048; + L3.associative = 16; + L3.linesize = 64; + break; + case 0xe3 : + L3.size = 4096; + L3.associative = 16; + L3.linesize = 64; + break; + case 0xe4 : + L3.size = 8192; + L3.associative = 16; + L3.linesize = 64; + break; + } + } + } + + if (get_vendor() == VENDOR_INTEL) { + cpuid(0x80000000, &cpuid_level, &ebx, &ecx, &edx); + if (cpuid_level >= 0x80000006) { + cpuid(0x80000006, &eax, &ebx, &ecx, &edx); + + L2.size = BITMASK(ecx, 16, 0xffff); + L2.associative = BITMASK(ecx, 12, 0x0f); + L2.linesize = BITMASK(ecx, 0, 0xff); + } + } + + if ((get_vendor() == VENDOR_AMD) || (get_vendor() == VENDOR_CENTAUR)) { + cpuid(0x80000005, &eax, &ebx, &ecx, &edx); + + LDTB.size = 4096; + LDTB.associative = BITMASK(eax, 24, 0xff); + if (LDTB.associative == 0xff) LDTB.associative = 0; + LDTB.linesize = BITMASK(eax, 16, 0xff); + + LITB.size = 4096; + LITB.associative = BITMASK(eax, 8, 0xff); + if (LITB.associative == 0xff) LITB.associative = 0; + LITB.linesize = BITMASK(eax, 0, 0xff); + + DTB.size = 4; + DTB.associative = BITMASK(ebx, 24, 0xff); + if (DTB.associative == 0xff) DTB.associative = 0; + DTB.linesize = BITMASK(ebx, 16, 0xff); + + ITB.size = 4; + ITB.associative = BITMASK(ebx, 8, 0xff); + if (ITB.associative == 0xff) ITB.associative = 0; + ITB.linesize = BITMASK(ebx, 0, 0xff); + + LD1.size = BITMASK(ecx, 24, 0xff); + LD1.associative = BITMASK(ecx, 16, 0xff); + if (LD1.associative == 0xff) LD1.associative = 0; + LD1.linesize = BITMASK(ecx, 0, 0xff); + + LC1.size = BITMASK(ecx, 24, 0xff); + LC1.associative = BITMASK(ecx, 16, 0xff); + if (LC1.associative == 0xff) LC1.associative = 0; + LC1.linesize = BITMASK(ecx, 0, 0xff); + + cpuid(0x80000006, &eax, &ebx, &ecx, &edx); + + L2LDTB.size = 4096; + L2LDTB.associative = BITMASK(eax, 24, 0xff); + if (L2LDTB.associative == 0xff) L2LDTB.associative = 0; + L2LDTB.linesize = BITMASK(eax, 16, 0xff); + + L2LITB.size = 4096; + L2LITB.associative = BITMASK(eax, 8, 0xff); + if (L2LITB.associative == 0xff) L2LITB.associative = 0; + L2LITB.linesize = BITMASK(eax, 0, 0xff); + + L2DTB.size = 4; + L2DTB.associative = BITMASK(ebx, 24, 0xff); + if (L2DTB.associative == 0xff) L2DTB.associative = 0; + L2DTB.linesize = BITMASK(ebx, 16, 0xff); + + L2ITB.size = 4; + L2ITB.associative = BITMASK(ebx, 8, 0xff); + if (L2ITB.associative == 0xff) L2ITB.associative = 0; + L2ITB.linesize = BITMASK(ebx, 0, 0xff); + + L2.size = BITMASK(ecx, 16, 0xffff); + L2.associative = BITMASK(ecx, 12, 0xf); + if (L2.associative == 0xff) L2.associative = 0; + L2.linesize = BITMASK(ecx, 0, 0xff); + + L3.size = BITMASK(edx, 18, 0x3fff) * 512; + L3.associative = BITMASK(edx, 12, 0xf); + if (L3.associative == 0xff) L2.associative = 0; + L3.linesize = BITMASK(edx, 0, 0xff); + + } + + switch (type) { + + case CACHE_INFO_L1_I : + *cacheinfo = LC1; + break; + case CACHE_INFO_L1_D : + *cacheinfo = LD1; + break; + case CACHE_INFO_L2 : + *cacheinfo = L2; + break; + case CACHE_INFO_L3 : + *cacheinfo = L3; + break; + case CACHE_INFO_L1_DTB : + *cacheinfo = DTB; + break; + case CACHE_INFO_L1_ITB : + *cacheinfo = ITB; + break; + case CACHE_INFO_L1_LDTB : + *cacheinfo = LDTB; + break; + case CACHE_INFO_L1_LITB : + *cacheinfo = LITB; + break; + case CACHE_INFO_L2_DTB : + *cacheinfo = L2DTB; + break; + case CACHE_INFO_L2_ITB : + *cacheinfo = L2ITB; + break; + case CACHE_INFO_L2_LDTB : + *cacheinfo = L2LDTB; + break; + case CACHE_INFO_L2_LITB : + *cacheinfo = L2LITB; + break; + } + return 0; +} + +int get_cpuname(void){ + + int family, exfamily, model, vendor, exmodel; + + if (!have_cpuid()) return CPUTYPE_80386; + + family = get_cputype(GET_FAMILY); + exfamily = get_cputype(GET_EXFAMILY); + model = get_cputype(GET_MODEL); + exmodel = get_cputype(GET_EXMODEL); + + vendor = get_vendor(); + + if (vendor == VENDOR_INTEL){ + switch (family) { + case 0x4: + return CPUTYPE_80486; + case 0x5: + return CPUTYPE_PENTIUM; + case 0x6: + switch (exmodel) { + case 0: + switch (model) { + case 1: + case 3: + case 5: + case 6: + return CPUTYPE_PENTIUM2; + case 7: + case 8: + case 10: + case 11: + return CPUTYPE_PENTIUM3; + case 9: + case 13: + case 14: + return CPUTYPE_PENTIUMM; + case 15: + return CPUTYPE_CORE2; + } + break; + case 1: + switch (model) { + case 6: + return CPUTYPE_CORE2; + case 7: + return CPUTYPE_PENRYN; + case 10: + case 11: + case 14: + case 15: + return CPUTYPE_NEHALEM; + case 12: + return CPUTYPE_ATOM; + case 13: + return CPUTYPE_DUNNINGTON; + break; + } + } + break; + case 0x7: + return CPUTYPE_ITANIUM; + case 0xf: + switch (exfamily) { + case 0 : + return CPUTYPE_PENTIUM4; + case 1 : + return CPUTYPE_ITANIUM; + } + break; + } + return CPUTYPE_INTEL_UNKNOWN; + } + + if (vendor == VENDOR_AMD){ + switch (family) { + case 0x4: + return CPUTYPE_AMD5X86; + case 0x5: + return CPUTYPE_AMDK6; + case 0x6: + return CPUTYPE_ATHLON; + case 0xf: + switch (exfamily) { + case 0: + case 2: + return CPUTYPE_OPTERON; + case 1: + case 10: + return CPUTYPE_BARCELONA; + } + break; + } + return CPUTYPE_AMD_UNKNOWN; + } + + if (vendor == VENDOR_CYRIX){ + switch (family) { + case 0x4: + return CPUTYPE_CYRIX5X86; + case 0x5: + return CPUTYPE_CYRIXM1; + case 0x6: + return CPUTYPE_CYRIXM2; + } + return CPUTYPE_CYRIX_UNKNOWN; + } + + if (vendor == VENDOR_NEXGEN){ + switch (family) { + case 0x5: + return CPUTYPE_NEXGENNX586; + } + return CPUTYPE_NEXGEN_UNKNOWN; + } + + if (vendor == VENDOR_CENTAUR){ + switch (family) { + case 0x5: + return CPUTYPE_CENTAURC6; + break; + case 0x6: + return CPUTYPE_NANO; + break; + + } + return CPUTYPE_VIAC3; + } + + if (vendor == VENDOR_RISE){ + switch (family) { + case 0x5: + return CPUTYPE_RISEMP6; + } + return CPUTYPE_RISE_UNKNOWN; + } + + if (vendor == VENDOR_SIS){ + switch (family) { + case 0x5: + return CPUTYPE_SYS55X; + } + return CPUTYPE_SIS_UNKNOWN; + } + + if (vendor == VENDOR_TRANSMETA){ + switch (family) { + case 0x5: + return CPUTYPE_CRUSOETM3X; + } + return CPUTYPE_TRANSMETA_UNKNOWN; + } + + if (vendor == VENDOR_NSC){ + switch (family) { + case 0x5: + return CPUTYPE_NSGEODE; + } + return CPUTYPE_NSC_UNKNOWN; + } + + return CPUTYPE_UNKNOWN; +} + +static char *cpuname[] = { + "UNKNOWN", + "INTEL_UNKNOWN", + "UMC_UNKNOWN", + "AMD_UNKNOWN", + "CYRIX_UNKNOWN", + "NEXGEN_UNKNOWN", + "CENTAUR_UNKNOWN", + "RISE_UNKNOWN", + "SIS_UNKNOWN", + "TRANSMETA_UNKNOWN", + "NSC_UNKNOWN", + "80386", + "80486", + "PENTIUM", + "PENTIUM2", + "PENTIUM3", + "PENTIUMM", + "PENTIUM4", + "CORE2", + "PENRYN", + "DUNNINGTON", + "NEHALEM", + "ATOM", + "ITANIUM", + "ITANIUM2", + "5X86", + "K6", + "ATHLON", + "DURON", + "OPTERON", + "BARCELONA", + "SHANGHAI", + "ISTANBUL", + "CYRIX5X86", + "CYRIXM1", + "CYRIXM2", + "NEXGENNX586", + "CENTAURC6", + "RISEMP6", + "SYS55X", + "TM3X00", + "NSGEODE", + "VIAC3", + "NANO", +}; + +static char *lowercpuname[] = { + "unknown", + "intel_unknown", + "umc_unknown", + "amd_unknown", + "cyrix_unknown", + "nexgen_unknown", + "centaur_unknown", + "rise_unknown", + "sis_unknown", + "transmeta_unknown", + "nsc_unknown", + "80386", + "80486", + "pentium", + "pentium2", + "pentium3", + "pentiumm", + "pentium4", + "core2", + "penryn", + "dunnington", + "nehalem", + "atom", + "itanium", + "itanium2", + "5x86", + "k6", + "athlon", + "duron", + "opteron", + "barcelona", + "shanghai", + "istanbul", + "cyrix5x86", + "cyrixm1", + "cyrixm2", + "nexgennx586", + "centaurc6", + "risemp6", + "sys55x", + "tms3x00", + "nsgeode", + "nano", +}; + +static char *corename[] = { + "UNKOWN", + "80486", + "P5", + "P6", + "KATMAI", + "COPPERMINE", + "NORTHWOOD", + "PRESCOTT", + "BANIAS", + "ATHLON", + "OPTERON", + "BARCELONA", + "VIAC3", + "YONAH", + "CORE2", + "PENRYN", + "DUNNINGTON", + "NEHALEM", + "ATOM", + "NANO", +}; + +static char *corename_lower[] = { + "unknown", + "80486", + "p5", + "p6", + "katmai", + "coppermine", + "northwood", + "prescott", + "banias", + "athlon", + "opteron", + "barcelona", + "viac3", + "yonah", + "core2", + "penryn", + "dunnington", + "nehalem", + "atom", + "nano", +}; + + +char *get_cpunamechar(void){ + return cpuname[get_cpuname()]; +} + +char *get_lower_cpunamechar(void){ + return lowercpuname[get_cpuname()]; +} + + +int get_coretype(void){ + + int family, exfamily, model, exmodel, vendor; + + if (!have_cpuid()) return CORE_80486; + + family = get_cputype(GET_FAMILY); + exfamily = get_cputype(GET_EXFAMILY); + model = get_cputype(GET_MODEL); + exmodel = get_cputype(GET_EXMODEL); + + vendor = get_vendor(); + + if (vendor == VENDOR_INTEL){ + switch (family) { + case 4: + return CORE_80486; + case 5: + return CORE_P5; + case 6: + switch (exmodel) { + case 0: + switch (model) { + case 0: + case 1: + case 2: + case 3: + case 4: + case 5: + case 6: + return CORE_P6; + case 7: + return CORE_KATMAI; + case 8: + case 10: + case 11: + return CORE_COPPERMINE; + case 9: + case 13: + case 14: + return CORE_BANIAS; + case 15: + return CORE_CORE2; + } + break; + case 1: + switch (model) { + case 6: + return CORE_CORE2; + case 7: + return CORE_PENRYN; + case 10: + case 11: + case 14: + case 15: + return CORE_NEHALEM; + case 12: + return CORE_ATOM; + case 13: + return CORE_DUNNINGTON; + break; + } + } + case 15: + if (model <= 0x2) return CORE_NORTHWOOD; + return CORE_PRESCOTT; + } + } + + if (vendor == VENDOR_AMD){ + if (family <= 0x5) return CORE_80486; + if (family <= 0xe) return CORE_ATHLON; + if (family == 0xf){ + if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; else return CORE_BARCELONA; + } + } + + if (vendor == VENDOR_CENTAUR) { + switch (family) { + case 0x6: + return CORE_NANO; + break; + } + return CORE_VIAC3; + } + + return CORE_UNKNOWN; +} + +void get_cpuconfig(void){ + + cache_info_t info; + int features; + + printf("#define %s\n", cpuname[get_cpuname()]); + + + if (get_coretype() != CORE_P5) { + + get_cacheinfo(CACHE_INFO_L1_I, &info); + if (info.size > 0) { + printf("#define L1_CODE_SIZE %d\n", info.size * 1024); + printf("#define L1_CODE_ASSOCIATIVE %d\n", info.associative); + printf("#define L1_CODE_LINESIZE %d\n", info.linesize); + } + + get_cacheinfo(CACHE_INFO_L1_D, &info); + if (info.size > 0) { + printf("#define L1_DATA_SIZE %d\n", info.size * 1024); + printf("#define L1_DATA_ASSOCIATIVE %d\n", info.associative); + printf("#define L1_DATA_LINESIZE %d\n", info.linesize); + } + + get_cacheinfo(CACHE_INFO_L2, &info); + if (info.size > 0) { + printf("#define L2_SIZE %d\n", info.size * 1024); + printf("#define L2_ASSOCIATIVE %d\n", info.associative); + printf("#define L2_LINESIZE %d\n", info.linesize); + } + + get_cacheinfo(CACHE_INFO_L3, &info); + if (info.size > 0) { + printf("#define L3_SIZE %d\n", info.size * 1024); + printf("#define L3_ASSOCIATIVE %d\n", info.associative); + printf("#define L3_LINESIZE %d\n", info.linesize); + } + + get_cacheinfo(CACHE_INFO_L1_ITB, &info); + if (info.size > 0) { + printf("#define ITB_SIZE %d\n", info.size * 1024); + printf("#define ITB_ASSOCIATIVE %d\n", info.associative); + printf("#define ITB_ENTRIES %d\n", info.linesize); + } + + get_cacheinfo(CACHE_INFO_L1_DTB, &info); + if (info.size > 0) { + printf("#define DTB_SIZE %d\n", info.size * 1024); + printf("#define DTB_ASSOCIATIVE %d\n", info.associative); + printf("#define DTB_ENTRIES %d\n", info.linesize); + } + + features = get_cputype(GET_FEATURE); + + if (features & HAVE_CMOV ) printf("#define HAVE_CMOV\n"); + if (features & HAVE_MMX ) printf("#define HAVE_MMX\n"); + if (features & HAVE_SSE ) printf("#define HAVE_SSE\n"); + if (features & HAVE_SSE2 ) printf("#define HAVE_SSE2\n"); + if (features & HAVE_SSE3 ) printf("#define HAVE_SSE3\n"); + if (features & HAVE_SSSE3) printf("#define HAVE_SSSE3\n"); + if (features & HAVE_SSE4_1) printf("#define HAVE_SSE4_1\n"); + if (features & HAVE_SSE4_2) printf("#define HAVE_SSE4_2\n"); + if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); + if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); + if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); + if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); + if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); + if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n"); + if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n"); + if (features & HAVE_128BITFPU) printf("#define HAVE_128BITFPU\n"); + if (features & HAVE_FASTMOVU) printf("#define HAVE_FASTMOVU\n"); + + printf("#define NUM_SHAREDCACHE %d\n", get_cputype(GET_NUMSHARE) + 1); + printf("#define NUM_CORES %d\n", get_cputype(GET_NUMCORES) + 1); + + features = get_coretype(); + if (features > 0) printf("#define CORE_%s\n", corename[features]); + } else { + printf("#define DTB_ENTRIES 16\n"); + printf("#define L1_CODE_SIZE 8192\n"); + printf("#define L1_DATA_SIZE 8192\n"); + printf("#define L2_SIZE 0\n"); + } +} + +void get_architecture(void){ +#ifndef __64BIT__ + printf("X86"); +#else + printf("X86_64"); +#endif +} + +void get_subarchitecture(void){ + printf("%s", get_cpunamechar()); +} + +void get_subdirname(void){ +#ifndef __64BIT__ + printf("x86"); +#else + printf("x86_64"); +#endif +} + +char *get_corename(void){ + return corename[get_coretype()]; +} + +void get_libname(void){ + printf("%s", corename_lower[get_coretype()]); +} + +/* This if for Makefile */ +void get_sse(void){ + + int features; + + features = get_cputype(GET_FEATURE); + + if (features & HAVE_MMX ) printf("HAVE_MMX=1\n"); + if (features & HAVE_SSE ) printf("HAVE_SSE=1\n"); + if (features & HAVE_SSE2 ) printf("HAVE_SSE2=1\n"); + if (features & HAVE_SSE3 ) printf("HAVE_SSE3=1\n"); + if (features & HAVE_SSSE3) printf("HAVE_SSSE3=1\n"); + if (features & HAVE_SSE4_1) printf("HAVE_SSE4_1=1\n"); + if (features & HAVE_SSE4_2) printf("HAVE_SSE4_2=1\n"); + if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); + if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); + if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); + if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); + +} diff --git a/ctest.c b/ctest.c new file mode 100644 index 0000000..0c373bf --- /dev/null +++ b/ctest.c @@ -0,0 +1,107 @@ +#if defined(__PGI) || defined(__PGIC__) +COMPILER_PGI +#endif + +#if defined(__PATHSCALE__) || defined(__PATHCC__) +COMPILER_PATHSCALE +#endif + +#if defined(__INTEL_COMPILER) || defined(__ICC) || defined(__ECC) +COMPILER_INTEL +#endif + +#if defined(__OPENCC__) +COMPILER_OPEN64 +#endif + +#if defined(__SUNPRO_C) +COMPILER_SUN +#endif + +#if defined(__IBMC__) || defined(__xlc__) +COMPILER_IBM +#endif + +#if defined(__DECCC__) +COMPILER_DEC +#endif + +#if defined(__GNUC__) +COMPILER_GNU +#endif + +#if defined(__linux__) +OS_LINUX +#endif + +#if defined(__FreeBSD__) +OS_FreeBSD +#endif + +#if defined(__NetBSD__) +OS_NetBSD +#endif + +#if defined(__sun) +OS_SunOS +#endif + +#if defined(__APPLE__) +OS_Darwin +#endif + +#if defined(_AIX) +OS_AIX +#endif + +#if defined(__OSF) +OS_OSF +#endif + +#if defined(__WIN32) || defined(__WIN64) || defined(__WINNT) +OS_WINNT +#endif + +#if defined(__CYGWIN__) +OS_CYGWIN +#endif + +#if defined(__INTERIX) +OS_INTERIX +#endif + +#if defined(__i386) || defined(_X86) +ARCH_X86 +#endif + +#if defined(__x86_64__) || defined(__amd64__) +ARCH_X86_64 +#endif + +#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) +ARCH_POWER +#endif + +#ifdef __mips64 +ARCH_MIPS64 +#endif + +#if defined(__mips32) || defined(__mips) +ARCH_MIPS32 +#endif + +#ifdef __alpha +ARCH_ALPHA +#endif + +#if defined(__sparc) || defined(__sparc__) +ARCH_SPARC +#endif + +#if defined(__ia64__) || defined(__ia64) +ARCH_IA64 +#endif + +#if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__) +BINARY_64 +#endif diff --git a/ctest/._Makefile b/ctest/._Makefile new file mode 100644 index 0000000..b95f7ae Binary files /dev/null and b/ctest/._Makefile differ diff --git a/ctest/._auxiliary.c b/ctest/._auxiliary.c new file mode 100644 index 0000000..23bed03 Binary files /dev/null and b/ctest/._auxiliary.c differ diff --git a/ctest/._c_c2chke.c b/ctest/._c_c2chke.c new file mode 100644 index 0000000..bf2f65a Binary files /dev/null and b/ctest/._c_c2chke.c differ diff --git a/ctest/._c_c3chke.c b/ctest/._c_c3chke.c new file mode 100644 index 0000000..6e9a3c7 Binary files /dev/null and b/ctest/._c_c3chke.c differ diff --git a/ctest/._c_cblas1.c b/ctest/._c_cblas1.c new file mode 100644 index 0000000..d637537 Binary files /dev/null and b/ctest/._c_cblas1.c differ diff --git a/ctest/._c_cblas2.c b/ctest/._c_cblas2.c new file mode 100644 index 0000000..951119c Binary files /dev/null and b/ctest/._c_cblas2.c differ diff --git a/ctest/._c_cblas3.c b/ctest/._c_cblas3.c new file mode 100644 index 0000000..0f0affc Binary files /dev/null and b/ctest/._c_cblas3.c differ diff --git a/ctest/._c_cblat1.f b/ctest/._c_cblat1.f new file mode 100644 index 0000000..d3ba142 Binary files /dev/null and b/ctest/._c_cblat1.f differ diff --git a/ctest/._c_cblat2.f b/ctest/._c_cblat2.f new file mode 100644 index 0000000..6cd3a62 Binary files /dev/null and b/ctest/._c_cblat2.f differ diff --git a/ctest/._c_cblat3.f b/ctest/._c_cblat3.f new file mode 100644 index 0000000..ea52a19 Binary files /dev/null and b/ctest/._c_cblat3.f differ diff --git a/ctest/._c_d2chke.c b/ctest/._c_d2chke.c new file mode 100644 index 0000000..21af210 Binary files /dev/null and b/ctest/._c_d2chke.c differ diff --git a/ctest/._c_d3chke.c b/ctest/._c_d3chke.c new file mode 100644 index 0000000..c289aa4 Binary files /dev/null and b/ctest/._c_d3chke.c differ diff --git a/ctest/._c_dblas1.c b/ctest/._c_dblas1.c new file mode 100644 index 0000000..9862fb0 Binary files /dev/null and b/ctest/._c_dblas1.c differ diff --git a/ctest/._c_dblas2.c b/ctest/._c_dblas2.c new file mode 100644 index 0000000..3e0941b Binary files /dev/null and b/ctest/._c_dblas2.c differ diff --git a/ctest/._c_dblas3.c b/ctest/._c_dblas3.c new file mode 100644 index 0000000..21cf676 Binary files /dev/null and b/ctest/._c_dblas3.c differ diff --git a/ctest/._c_dblat1.f b/ctest/._c_dblat1.f new file mode 100644 index 0000000..c895704 Binary files /dev/null and b/ctest/._c_dblat1.f differ diff --git a/ctest/._c_dblat2.f b/ctest/._c_dblat2.f new file mode 100644 index 0000000..20d4aec Binary files /dev/null and b/ctest/._c_dblat2.f differ diff --git a/ctest/._c_dblat3.f b/ctest/._c_dblat3.f new file mode 100644 index 0000000..bbb1e6f Binary files /dev/null and b/ctest/._c_dblat3.f differ diff --git a/ctest/._c_s2chke.c b/ctest/._c_s2chke.c new file mode 100644 index 0000000..9fe2f7d Binary files /dev/null and b/ctest/._c_s2chke.c differ diff --git a/ctest/._c_s3chke.c b/ctest/._c_s3chke.c new file mode 100644 index 0000000..a35d77a Binary files /dev/null and b/ctest/._c_s3chke.c differ diff --git a/ctest/._c_sblas1.c b/ctest/._c_sblas1.c new file mode 100644 index 0000000..f8ff84b Binary files /dev/null and b/ctest/._c_sblas1.c differ diff --git a/ctest/._c_sblas2.c b/ctest/._c_sblas2.c new file mode 100644 index 0000000..ea81095 Binary files /dev/null and b/ctest/._c_sblas2.c differ diff --git a/ctest/._c_sblas3.c b/ctest/._c_sblas3.c new file mode 100644 index 0000000..0688df2 Binary files /dev/null and b/ctest/._c_sblas3.c differ diff --git a/ctest/._c_sblat1.f b/ctest/._c_sblat1.f new file mode 100644 index 0000000..8912dce Binary files /dev/null and b/ctest/._c_sblat1.f differ diff --git a/ctest/._c_sblat2.f b/ctest/._c_sblat2.f new file mode 100644 index 0000000..a7455c1 Binary files /dev/null and b/ctest/._c_sblat2.f differ diff --git a/ctest/._c_sblat3.f b/ctest/._c_sblat3.f new file mode 100644 index 0000000..6a35c69 Binary files /dev/null and b/ctest/._c_sblat3.f differ diff --git a/ctest/._c_xerbla.c b/ctest/._c_xerbla.c new file mode 100644 index 0000000..0127f15 Binary files /dev/null and b/ctest/._c_xerbla.c differ diff --git a/ctest/._c_z2chke.c b/ctest/._c_z2chke.c new file mode 100644 index 0000000..b2f8632 Binary files /dev/null and b/ctest/._c_z2chke.c differ diff --git a/ctest/._c_z3chke.c b/ctest/._c_z3chke.c new file mode 100644 index 0000000..998845e Binary files /dev/null and b/ctest/._c_z3chke.c differ diff --git a/ctest/._c_zblas1.c b/ctest/._c_zblas1.c new file mode 100644 index 0000000..b7260cf Binary files /dev/null and b/ctest/._c_zblas1.c differ diff --git a/ctest/._c_zblas2.c b/ctest/._c_zblas2.c new file mode 100644 index 0000000..1c360bd Binary files /dev/null and b/ctest/._c_zblas2.c differ diff --git a/ctest/._c_zblas3.c b/ctest/._c_zblas3.c new file mode 100644 index 0000000..c10082a Binary files /dev/null and b/ctest/._c_zblas3.c differ diff --git a/ctest/._c_zblat1.f b/ctest/._c_zblat1.f new file mode 100644 index 0000000..2359822 Binary files /dev/null and b/ctest/._c_zblat1.f differ diff --git a/ctest/._c_zblat2.f b/ctest/._c_zblat2.f new file mode 100644 index 0000000..a1dae91 Binary files /dev/null and b/ctest/._c_zblat2.f differ diff --git a/ctest/._c_zblat3.f b/ctest/._c_zblat3.f new file mode 100644 index 0000000..3d6eaa8 Binary files /dev/null and b/ctest/._c_zblat3.f differ diff --git a/ctest/._cblas_test.h b/ctest/._cblas_test.h new file mode 100644 index 0000000..7431bbc Binary files /dev/null and b/ctest/._cblas_test.h differ diff --git a/ctest/._cin2 b/ctest/._cin2 new file mode 100644 index 0000000..ccfb3a7 Binary files /dev/null and b/ctest/._cin2 differ diff --git a/ctest/._cin3 b/ctest/._cin3 new file mode 100644 index 0000000..bf05a0e Binary files /dev/null and b/ctest/._cin3 differ diff --git a/ctest/._constant.c b/ctest/._constant.c new file mode 100644 index 0000000..fd5de25 Binary files /dev/null and b/ctest/._constant.c differ diff --git a/ctest/._din2 b/ctest/._din2 new file mode 100644 index 0000000..ef3bb4e Binary files /dev/null and b/ctest/._din2 differ diff --git a/ctest/._din3 b/ctest/._din3 new file mode 100644 index 0000000..70741ca Binary files /dev/null and b/ctest/._din3 differ diff --git a/ctest/._sin2 b/ctest/._sin2 new file mode 100644 index 0000000..bc27bb9 Binary files /dev/null and b/ctest/._sin2 differ diff --git a/ctest/._sin3 b/ctest/._sin3 new file mode 100644 index 0000000..11df605 Binary files /dev/null and b/ctest/._sin3 differ diff --git a/ctest/._zin2 b/ctest/._zin2 new file mode 100644 index 0000000..4eefae9 Binary files /dev/null and b/ctest/._zin2 differ diff --git a/ctest/._zin3 b/ctest/._zin3 new file mode 100644 index 0000000..a1f9448 Binary files /dev/null and b/ctest/._zin3 differ diff --git a/ctest/LICENSE b/ctest/LICENSE new file mode 100644 index 0000000..85061f2 --- /dev/null +++ b/ctest/LICENSE @@ -0,0 +1,23 @@ +This directory contains the reference implementation of BLAS +which is obtainable at: http://netlib.org/blas/ + +The license, obtained from http://netlib.org/blas/faq.html#2 on November 3, +2010, is as follows: + +2) Are there legal restrictions on the use of BLAS reference implementation +software? + +The reference BLAS is a freely-available software package. It is available from +netlib via anonymous ftp and the World Wide Web. Thus, it can be included in +commercial software packages (and has been). We only ask that proper credit be +given to the authors. + +Like all software, it is copyrighted. It is not trademarked, but we do ask the +following: + +If you modify the source for these routines we ask that you change the name of +the routine and comment the changes made to the original. + +We will gladly answer any questions regarding the software. If a modification +is done, however, it is the responsibility of the person who modified the +routine to provide support. diff --git a/ctest/Makefile b/ctest/Makefile new file mode 100644 index 0000000..3cd6cc8 --- /dev/null +++ b/ctest/Makefile @@ -0,0 +1,93 @@ +# +# The Makefile compiles c wrappers and testers for CBLAS. +# + +TOPDIR = .. +include $(TOPDIR)/Makefile.system + +CFLAGS += -DADD$(BU) -DCBLAS + +LIB = $(TOPDIR)/$(LIBNAME) + +stestl1o = c_sblas1.o + +stestl2o = c_sblas2.o c_s2chke.o auxiliary.o c_xerbla.o constant.o + +stestl3o = c_sblas3.o c_s3chke.o auxiliary.o c_xerbla.o constant.o + +dtestl1o = c_dblas1.o + +dtestl2o = c_dblas2.o c_d2chke.o auxiliary.o c_xerbla.o constant.o + +dtestl3o = c_dblas3.o c_d3chke.o auxiliary.o c_xerbla.o constant.o + +ctestl1o = c_cblas1.o + +ctestl2o = c_cblas2.o c_c2chke.o auxiliary.o c_xerbla.o constant.o + +ctestl3o = c_cblas3.o c_c3chke.o auxiliary.o c_xerbla.o constant.o + +ztestl1o = c_zblas1.o + +ztestl2o = c_zblas2.o c_z2chke.o auxiliary.o c_xerbla.o constant.o + +ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o constant.o + +all :: all1 all2 all3 + +all1: xscblat1 xdcblat1 xccblat1 xzcblat1 + GOTO_NUM_THREADS=2 ./xscblat1 + GOTO_NUM_THREADS=2 ./xdcblat1 + GOTO_NUM_THREADS=2 ./xccblat1 + GOTO_NUM_THREADS=2 ./xzcblat1 + +all2: xscblat2 xdcblat2 xccblat2 xzcblat2 + GOTO_NUM_THREADS=2 ./xscblat2 < sin2 + GOTO_NUM_THREADS=2 ./xdcblat2 < din2 + GOTO_NUM_THREADS=2 ./xccblat2 < cin2 + GOTO_NUM_THREADS=2 ./xzcblat2 < zin2 + +all3: xscblat3 xdcblat3 xccblat3 xzcblat3 + GOTO_NUM_THREADS=2 ./xscblat3 < sin3 + GOTO_NUM_THREADS=2 ./xdcblat3 < din3 + GOTO_NUM_THREADS=2 ./xccblat3 < cin3 + GOTO_NUM_THREADS=2 ./xzcblat3 < zin3 + +clean :: + rm -f x* + +FLDFLAGS = $(FFLAGS:-fPIC=) +CEXTRALIB = + +# Single real +xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xscblat1 c_sblat1.o $(stestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xscblat2: $(stestl2o) c_sblat2.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xscblat2 c_sblat2.o $(stestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xscblat3: $(stestl3o) c_sblat3.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xscblat3 c_sblat3.o $(stestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +# Double real +xdcblat1: $(dtestl1o) c_dblat1.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xdcblat1 c_dblat1.o $(dtestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xdcblat2: $(dtestl2o) c_dblat2.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xdcblat2 c_dblat2.o $(dtestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xdcblat3: $(dtestl3o) c_dblat3.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xdcblat3 c_dblat3.o $(dtestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) + +# Single complex +xccblat1: $(ctestl1o) c_cblat1.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xccblat1 c_cblat1.o $(ctestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) + +# Double complex +xzcblat1: $(ztestl1o) c_zblat1.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xzcblat1 c_zblat1.o $(ztestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) + +include $(TOPDIR)/Makefile.tail diff --git a/ctest/auxiliary.c b/ctest/auxiliary.c new file mode 100644 index 0000000..1f47acf --- /dev/null +++ b/ctest/auxiliary.c @@ -0,0 +1,38 @@ +/* + * Written by T. H. Do, 1/23/98, SGI/CRAY Research. + */ +#include +#include "common.h" +#include "cblas_test.h" + +void get_transpose_type(char *type, enum CBLAS_TRANSPOSE *trans) { + if( (strncmp( type,"n",1 )==0)||(strncmp( type,"N",1 )==0) ) + *trans = CblasNoTrans; + else if( (strncmp( type,"t",1 )==0)||(strncmp( type,"T",1 )==0) ) + *trans = CblasTrans; + else if( (strncmp( type,"c",1 )==0)||(strncmp( type,"C",1 )==0) ) + *trans = CblasConjTrans; + else *trans = UNDEFINED; +} + +void get_uplo_type(char *type, enum CBLAS_UPLO *uplo) { + if( (strncmp( type,"u",1 )==0)||(strncmp( type,"U",1 )==0) ) + *uplo = CblasUpper; + else if( (strncmp( type,"l",1 )==0)||(strncmp( type,"L",1 )==0) ) + *uplo = CblasLower; + else *uplo = UNDEFINED; +} +void get_diag_type(char *type, enum CBLAS_DIAG *diag) { + if( (strncmp( type,"u",1 )==0)||(strncmp( type,"U",1 )==0) ) + *diag = CblasUnit; + else if( (strncmp( type,"n",1 )==0)||(strncmp( type,"N",1 )==0) ) + *diag = CblasNonUnit; + else *diag = UNDEFINED; +} +void get_side_type(char *type, enum CBLAS_SIDE *side) { + if( (strncmp( type,"l",1 )==0)||(strncmp( type,"L",1 )==0) ) + *side = CblasLeft; + else if( (strncmp( type,"r",1 )==0)||(strncmp( type,"R",1 )==0) ) + *side = CblasRight; + else *side = UNDEFINED; +} diff --git a/ctest/c_c2chke.c b/ctest/c_c2chke.c new file mode 100644 index 0000000..611cc21 --- /dev/null +++ b/ctest/c_c2chke.c @@ -0,0 +1,826 @@ +#include +#include +#include "common.h" +#include "cblas_test.h" + +int cblas_ok, cblas_lerr, cblas_info; +int link_xerbla=TRUE; +char *cblas_rout; + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo); +#else +void F77_xerbla(char *srname, void *vinfo); +#endif + +void chkxer(void) { + extern int cblas_ok, cblas_lerr, cblas_info; + extern int link_xerbla; + extern char *cblas_rout; + if (cblas_lerr == 1 ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); + cblas_ok = 0 ; + } + cblas_lerr = 1 ; +} + +void F77_c2chke(char *rout) { + char *sf = ( rout ) ; + float A[2] = {0.0,0.0}, + X[2] = {0.0,0.0}, + Y[2] = {0.0,0.0}, + ALPHA[2] = {0.0,0.0}, + BETA[2] = {0.0,0.0}, + RALPHA = 0.0; + extern int cblas_info, cblas_lerr, cblas_ok; + extern int RowMajorStrg; + extern char *cblas_rout; + + if (link_xerbla) /* call these first to link */ + { + cblas_xerbla(cblas_info,cblas_rout,""); + F77_xerbla(cblas_rout,&cblas_info); + } + + cblas_ok = TRUE ; + cblas_lerr = PASSED ; + + if (strncmp( sf,"cblas_cgemv",11)==0) { + cblas_rout = "cblas_cgemv"; + cblas_info = 1; + cblas_cgemv(INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cgemv(CblasColMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cgemv(CblasColMajor, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemv(CblasColMajor, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_cgemv(CblasColMajor, CblasNoTrans, 2, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemv(CblasColMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_cgemv(CblasColMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + + cblas_info = 2; RowMajorStrg = TRUE; RowMajorStrg = TRUE; + cblas_cgemv(CblasRowMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_cgemv(CblasRowMajor, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_cgbmv",11)==0) { + cblas_rout = "cblas_cgbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_cgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cgbmv(CblasColMajor, INVALID, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_cgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_cgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_chemv",11)==0) { + cblas_rout = "cblas_chemv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_chemv(INVALID, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_chemv(CblasColMajor, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_chemv(CblasColMajor, CblasUpper, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_chemv(CblasColMajor, CblasUpper, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_chemv(CblasColMajor, CblasUpper, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_chemv(CblasColMajor, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_chemv(CblasRowMajor, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_chemv(CblasRowMajor, CblasUpper, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_chemv(CblasRowMajor, CblasUpper, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_chemv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_chemv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_chbmv",11)==0) { + cblas_rout = "cblas_chbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_chbmv(INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_chbmv(CblasColMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_chbmv(CblasColMajor, CblasUpper, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_chbmv(CblasColMajor, CblasUpper, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_chbmv(CblasColMajor, CblasUpper, 0, 1, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_chbmv(CblasColMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_chbmv(CblasColMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_chbmv(CblasRowMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_chbmv(CblasRowMajor, CblasUpper, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_chbmv(CblasRowMajor, CblasUpper, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_chbmv(CblasRowMajor, CblasUpper, 0, 1, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_chbmv(CblasRowMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_chbmv(CblasRowMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_chpmv",11)==0) { + cblas_rout = "cblas_chpmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_chpmv(INVALID, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_chpmv(CblasColMajor, INVALID, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_chpmv(CblasColMajor, CblasUpper, INVALID, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_chpmv(CblasColMajor, CblasUpper, 0, + ALPHA, A, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_chpmv(CblasColMajor, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_chpmv(CblasRowMajor, INVALID, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_chpmv(CblasRowMajor, CblasUpper, INVALID, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_chpmv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_chpmv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ctrmv",11)==0) { + cblas_rout = "cblas_ctrmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ctrmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ctrmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ctrmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ctrmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ctrmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ctbmv",11)==0) { + cblas_rout = "cblas_ctbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ctbmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ctbmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ctbmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ctbmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ctbmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ctpmv",11)==0) { + cblas_rout = "cblas_ctpmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ctpmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ctpmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ctpmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ctpmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ctpmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ctpmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ctpmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ctpmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ctpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ctpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ctpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ctrsv",11)==0) { + cblas_rout = "cblas_ctrsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ctrsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ctrsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ctrsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ctrsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ctrsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ctbsv",11)==0) { + cblas_rout = "cblas_ctbsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ctbsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ctbsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ctbsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ctbsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ctbsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ctpsv",11)==0) { + cblas_rout = "cblas_ctpsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ctpsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ctpsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ctpsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ctpsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ctpsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ctpsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ctpsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ctpsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ctpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ctpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ctpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_cgeru",10)==0) { + cblas_rout = "cblas_cgeru"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_cgeru(INVALID, 0, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cgeru(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cgeru(CblasColMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgeru(CblasColMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cgeru(CblasColMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_cgeru(CblasColMajor, 2, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_cgeru(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_cgeru(CblasRowMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgeru(CblasRowMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cgeru(CblasRowMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cgeru(CblasRowMajor, 0, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_cgerc",10)==0) { + cblas_rout = "cblas_cgerc"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_cgerc(INVALID, 0, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cgerc(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cgerc(CblasColMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgerc(CblasColMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cgerc(CblasColMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_cgerc(CblasColMajor, 2, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_cgerc(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_cgerc(CblasRowMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgerc(CblasRowMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cgerc(CblasRowMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cgerc(CblasRowMajor, 0, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_cher2",11)==0) { + cblas_rout = "cblas_cher2"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_cher2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cher2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cher2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cher2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cher2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_cher2(CblasColMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_cher2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_cher2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cher2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cher2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cher2(CblasRowMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_chpr2",11)==0) { + cblas_rout = "cblas_chpr2"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_chpr2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_chpr2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_chpr2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_chpr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_chpr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_chpr2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_chpr2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_chpr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_chpr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); + chkxer(); + } else if (strncmp( sf,"cblas_cher",10)==0) { + cblas_rout = "cblas_cher"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_cher(INVALID, CblasUpper, 0, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cher(CblasColMajor, INVALID, 0, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cher(CblasColMajor, CblasUpper, INVALID, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cher(CblasColMajor, CblasUpper, 0, RALPHA, X, 0, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cher(CblasColMajor, CblasUpper, 2, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_cher(CblasRowMajor, INVALID, 0, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_cher(CblasRowMajor, CblasUpper, INVALID, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cher(CblasRowMajor, CblasUpper, 0, RALPHA, X, 0, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cher(CblasRowMajor, CblasUpper, 2, RALPHA, X, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_chpr",10)==0) { + cblas_rout = "cblas_chpr"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_chpr(INVALID, CblasUpper, 0, RALPHA, X, 1, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_chpr(CblasColMajor, INVALID, 0, RALPHA, X, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_chpr(CblasColMajor, CblasUpper, INVALID, RALPHA, X, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_chpr(CblasColMajor, CblasUpper, 0, RALPHA, X, 0, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_chpr(CblasColMajor, INVALID, 0, RALPHA, X, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_chpr(CblasColMajor, CblasUpper, INVALID, RALPHA, X, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_chpr(CblasColMajor, CblasUpper, 0, RALPHA, X, 0, A ); + chkxer(); + } + if (cblas_ok == TRUE) + printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); + else + printf("******* %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); +} diff --git a/ctest/c_c3chke.c b/ctest/c_c3chke.c new file mode 100644 index 0000000..2951552 --- /dev/null +++ b/ctest/c_c3chke.c @@ -0,0 +1,1706 @@ +#include +#include +#include "common.h" +#include "cblas_test.h" + +int cblas_ok, cblas_lerr, cblas_info; +int link_xerbla=TRUE; +char *cblas_rout; + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo); +#else +void F77_xerbla(char *srname, void *vinfo); +#endif + +void chkxer(void) { + extern int cblas_ok, cblas_lerr, cblas_info; + extern int link_xerbla; + extern char *cblas_rout; + if (cblas_lerr == 1 ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); + cblas_ok = 0 ; + } + cblas_lerr = 1 ; +} + +void F77_c3chke(char * rout) { + char *sf = ( rout ) ; + float A[4] = {0.0,0.0,0.0,0.0}, + B[4] = {0.0,0.0,0.0,0.0}, + C[4] = {0.0,0.0,0.0,0.0}, + ALPHA[2] = {0.0,0.0}, + BETA[2] = {0.0,0.0}, + RALPHA = 0.0, RBETA = 0.0; + extern int cblas_info, cblas_lerr, cblas_ok; + extern int RowMajorStrg; + extern char *cblas_rout; + + cblas_ok = TRUE ; + cblas_lerr = PASSED ; + + if (link_xerbla) /* call these first to link */ + { + cblas_xerbla(cblas_info,cblas_rout,""); + F77_xerbla(cblas_rout,&cblas_info); + } + + if (strncmp( sf,"cblas_cgemm" ,11)==0) { + cblas_rout = "cblas_cgemm" ; + + cblas_info = 1; + cblas_cgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_cgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_cgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_cgemm( INVALID, CblasTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_chemm" ,11)==0) { + cblas_rout = "cblas_chemm" ; + + cblas_info = 1; + cblas_chemm( INVALID, CblasRight, CblasLower, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_csymm" ,11)==0) { + cblas_rout = "cblas_csymm" ; + + cblas_info = 1; + cblas_csymm( INVALID, CblasRight, CblasLower, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_ctrmm" ,11)==0) { + cblas_rout = "cblas_ctrmm" ; + + cblas_info = 1; + cblas_ctrmm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_ctrsm" ,11)==0) { + cblas_rout = "cblas_ctrsm" ; + + cblas_info = 1; + cblas_ctrsm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_cherk" ,11)==0) { + cblas_rout = "cblas_cherk" ; + + cblas_info = 1; + cblas_cherk(INVALID, CblasUpper, CblasNoTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasConjTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasConjTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasConjTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasConjTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_csyrk" ,11)==0) { + cblas_rout = "cblas_csyrk" ; + + cblas_info = 1; + cblas_csyrk(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasConjTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_cher2k" ,12)==0) { + cblas_rout = "cblas_cher2k" ; + + cblas_info = 1; + cblas_cher2k(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_csyr2k" ,12)==0) { + cblas_rout = "cblas_csyr2k" ; + + cblas_info = 1; + cblas_csyr2k(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } + + if (cblas_ok == 1 ) + printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); + else + printf("***** %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); +} diff --git a/ctest/c_cblas1.c b/ctest/c_cblas1.c new file mode 100644 index 0000000..f5ffc14 --- /dev/null +++ b/ctest/c_cblas1.c @@ -0,0 +1,75 @@ +/* + * c_cblas1.c + * + * The program is a C wrapper for ccblat1. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "common.h" +#include "cblas_test.h" + +void F77_caxpy(const int *N, const void *alpha, void *X, + const int *incX, void *Y, const int *incY) +{ + cblas_caxpy(*N, alpha, X, *incX, Y, *incY); + return; +} + +void F77_ccopy(const int *N, void *X, const int *incX, + void *Y, const int *incY) +{ + cblas_ccopy(*N, X, *incX, Y, *incY); + return; +} + +void F77_cdotc(const int *N, void *X, const int *incX, + void *Y, const int *incY, void *dotc) +{ + cblas_cdotc_sub(*N, X, *incX, Y, *incY, dotc); + return; +} + +void F77_cdotu(const int *N, void *X, const int *incX, + void *Y, const int *incY,void *dotu) +{ + cblas_cdotu_sub(*N, X, *incX, Y, *incY, dotu); + return; +} + +void F77_cscal(const int *N, const void * *alpha, void *X, + const int *incX) +{ + cblas_cscal(*N, alpha, X, *incX); + return; +} + +void F77_csscal(const int *N, const float *alpha, void *X, + const int *incX) +{ + cblas_csscal(*N, *alpha, X, *incX); + return; +} + +void F77_cswap( const int *N, void *X, const int *incX, + void *Y, const int *incY) +{ + cblas_cswap(*N,X,*incX,Y,*incY); + return; +} + +int F77_icamax(const int *N, const void *X, const int *incX) +{ + if (*N < 1 || *incX < 1) return(0); + return (cblas_icamax(*N, X, *incX)+1); +} + +float F77_scnrm2(const int *N, const void *X, const int *incX) +{ + return cblas_scnrm2(*N, X, *incX); +} + +float F77_scasum(const int *N, void *X, const int *incX) +{ + return cblas_scasum(*N, X, *incX); +} diff --git a/ctest/c_cblas2.c b/ctest/c_cblas2.c new file mode 100644 index 0000000..7a886ac --- /dev/null +++ b/ctest/c_cblas2.c @@ -0,0 +1,807 @@ +/* + * Written by D.P. Manley, Digital Equipment Corporation. + * Prefixed "C_" to BLAS routines and their declarations. + * + * Modified by T. H. Do, 4/08/98, SGI/CRAY Research. + */ +#include +#include "common.h" +#include "cblas_test.h" + +void F77_cgemv(int *order, char *transp, int *m, int *n, + const void *alpha, + CBLAS_TEST_COMPLEX *a, int *lda, const void *x, int *incx, + const void *beta, void *y, int *incy) { + + CBLAS_TEST_COMPLEX *A; + int i,j,LDA; + enum CBLAS_TRANSPOSE trans; + + get_transpose_type(transp, &trans); + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = (CBLAS_TEST_COMPLEX *)malloc( (*m)*LDA*sizeof( CBLAS_TEST_COMPLEX) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; + A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; + } + cblas_cgemv( CblasRowMajor, trans, *m, *n, alpha, A, LDA, x, *incx, + beta, y, *incy ); + free(A); + } + else if (*order == TEST_COL_MJR) + cblas_cgemv( CblasColMajor, trans, + *m, *n, alpha, a, *lda, x, *incx, beta, y, *incy ); + else + cblas_cgemv( UNDEFINED, trans, + *m, *n, alpha, a, *lda, x, *incx, beta, y, *incy ); +} + +void F77_cgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *x, int *incx, + CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *y, int *incy) { + + CBLAS_TEST_COMPLEX *A; + int i,j,irow,jcol,LDA; + enum CBLAS_TRANSPOSE trans; + + get_transpose_type(transp, &trans); + if (*order == TEST_ROW_MJR) { + LDA = *ku+*kl+2; + A=( CBLAS_TEST_COMPLEX* )malloc((*n+*kl)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*ku; i++ ){ + irow=*ku+*kl-i; + jcol=(*ku)-i; + for( j=jcol; j<*n; j++ ){ + A[ LDA*(j-jcol)+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*(j-jcol)+irow ].imag=a[ (*lda)*j+i ].imag; + } + } + i=*ku; + irow=*ku+*kl-i; + for( j=0; j<*n; j++ ){ + A[ LDA*j+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*j+i ].imag; + } + for( i=*ku+1; i<*ku+*kl+1; i++ ){ + irow=*ku+*kl-i; + jcol=i-(*ku); + for( j=jcol; j<(*n+*kl); j++ ){ + A[ LDA*j+irow ].real=a[ (*lda)*(j-jcol)+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*(j-jcol)+i ].imag; + } + } + cblas_cgbmv( CblasRowMajor, trans, *m, *n, *kl, *ku, alpha, A, LDA, x, + *incx, beta, y, *incy ); + free(A); + } + else if (*order == TEST_COL_MJR) + cblas_cgbmv( CblasColMajor, trans, *m, *n, *kl, *ku, alpha, a, *lda, x, + *incx, beta, y, *incy ); + else + cblas_cgbmv( UNDEFINED, trans, *m, *n, *kl, *ku, alpha, a, *lda, x, + *incx, beta, y, *incy ); +} + +void F77_cgeru(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, + CBLAS_TEST_COMPLEX *x, int *incx, CBLAS_TEST_COMPLEX *y, int *incy, + CBLAS_TEST_COMPLEX *a, int *lda){ + + CBLAS_TEST_COMPLEX *A; + int i,j,LDA; + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; + A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; + } + cblas_cgeru( CblasRowMajor, *m, *n, alpha, x, *incx, y, *incy, A, LDA ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + a[ (*lda)*j+i ].real=A[ LDA*i+j ].real; + a[ (*lda)*j+i ].imag=A[ LDA*i+j ].imag; + } + free(A); + } + else if (*order == TEST_COL_MJR) + cblas_cgeru( CblasColMajor, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); + else + cblas_cgeru( UNDEFINED, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); +} + +void F77_cgerc(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, + CBLAS_TEST_COMPLEX *x, int *incx, CBLAS_TEST_COMPLEX *y, int *incy, + CBLAS_TEST_COMPLEX *a, int *lda) { + CBLAS_TEST_COMPLEX *A; + int i,j,LDA; + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; + A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; + } + cblas_cgerc( CblasRowMajor, *m, *n, alpha, x, *incx, y, *incy, A, LDA ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + a[ (*lda)*j+i ].real=A[ LDA*i+j ].real; + a[ (*lda)*j+i ].imag=A[ LDA*i+j ].imag; + } + free(A); + } + else if (*order == TEST_COL_MJR) + cblas_cgerc( CblasColMajor, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); + else + cblas_cgerc( UNDEFINED, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); +} + +void F77_chemv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, + CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *x, + int *incx, CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *y, int *incy){ + + CBLAS_TEST_COMPLEX *A; + int i,j,LDA; + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = (CBLAS_TEST_COMPLEX *)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ){ + A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; + A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; + } + cblas_chemv( CblasRowMajor, uplo, *n, alpha, A, LDA, x, *incx, + beta, y, *incy ); + free(A); + } + else if (*order == TEST_COL_MJR) + cblas_chemv( CblasColMajor, uplo, *n, alpha, a, *lda, x, *incx, + beta, y, *incy ); + else + cblas_chemv( UNDEFINED, uplo, *n, alpha, a, *lda, x, *incx, + beta, y, *incy ); +} + +void F77_chbmv(int *order, char *uplow, int *n, int *k, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *x, int *incx, CBLAS_TEST_COMPLEX *beta, + CBLAS_TEST_COMPLEX *y, int *incy){ + +CBLAS_TEST_COMPLEX *A; +int i,irow,j,jcol,LDA; + + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*order == TEST_ROW_MJR) { + if (uplo != CblasUpper && uplo != CblasLower ) + cblas_chbmv(CblasRowMajor, UNDEFINED, *n, *k, alpha, a, *lda, x, + *incx, beta, y, *incy ); + else { + LDA = *k+2; + A =(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + if (uplo == CblasUpper) { + for( i=0; i<*k; i++ ){ + irow=*k-i; + jcol=(*k)-i; + for( j=jcol; j<*n; j++ ) { + A[ LDA*(j-jcol)+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*(j-jcol)+irow ].imag=a[ (*lda)*j+i ].imag; + } + } + i=*k; + irow=*k-i; + for( j=0; j<*n; j++ ) { + A[ LDA*j+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*j+i ].imag; + } + } + else { + i=0; + irow=*k-i; + for( j=0; j<*n; j++ ) { + A[ LDA*j+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*j+i ].imag; + } + for( i=1; i<*k+1; i++ ){ + irow=*k-i; + jcol=i; + for( j=jcol; j<(*n+*k); j++ ) { + A[ LDA*j+irow ].real=a[ (*lda)*(j-jcol)+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*(j-jcol)+i ].imag; + } + } + } + cblas_chbmv( CblasRowMajor, uplo, *n, *k, alpha, A, LDA, x, *incx, + beta, y, *incy ); + free(A); + } + } + else if (*order == TEST_COL_MJR) + cblas_chbmv(CblasColMajor, uplo, *n, *k, alpha, a, *lda, x, *incx, + beta, y, *incy ); + else + cblas_chbmv(UNDEFINED, uplo, *n, *k, alpha, a, *lda, x, *incx, + beta, y, *incy ); +} + +void F77_chpmv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, + CBLAS_TEST_COMPLEX *ap, CBLAS_TEST_COMPLEX *x, int *incx, + CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *y, int *incy){ + + CBLAS_TEST_COMPLEX *A, *AP; + int i,j,k,LDA; + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + if (*order == TEST_ROW_MJR) { + if (uplo != CblasUpper && uplo != CblasLower ) + cblas_chpmv(CblasRowMajor, UNDEFINED, *n, alpha, ap, x, *incx, + beta, y, *incy); + else { + LDA = *n; + A = (CBLAS_TEST_COMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX )); + AP = (CBLAS_TEST_COMPLEX* )malloc( (((LDA+1)*LDA)/2)* + sizeof( CBLAS_TEST_COMPLEX )); + if (uplo == CblasUpper) { + for( j=0, k=0; j<*n; j++ ) + for( i=0; i +#include "common.h" +#include "cblas_test.h" + +#define TEST_COL_MJR 0 +#define TEST_ROW_MJR 1 +#define UNDEFINED -1 + +void F77_cgemm(int *order, char *transpa, char *transpb, int *m, int *n, + int *k, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, + CBLAS_TEST_COMPLEX *c, int *ldc ) { + + CBLAS_TEST_COMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_TRANSPOSE transa, transb; + + get_transpose_type(transpa, &transa); + get_transpose_type(transpb, &transb); + + if (*order == TEST_ROW_MJR) { + if (transa == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else { + LDA = *m+1; + A=(CBLAS_TEST_COMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*k; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + + if (transb == CblasNoTrans) { + LDB = *n+1; + B=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_COMPLEX) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDB = *k+1; + B=(CBLAS_TEST_COMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX)); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_cgemm( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA, + B, LDB, beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_cgemm( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); + else + cblas_cgemm( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); +} +void F77_chemm(int *order, char *rtlf, char *uplow, int *m, int *n, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, + CBLAS_TEST_COMPLEX *c, int *ldc ) { + + CBLAS_TEST_COMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_UPLO uplo; + enum CBLAS_SIDE side; + + get_uplo_type(uplow,&uplo); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A= (CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_COMPLEX* )malloc( (*m)*LDB*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX ) ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_chemm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, + beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_chemm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); + else + cblas_chemm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); +} +void F77_csymm(int *order, char *rtlf, char *uplow, int *m, int *n, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, + CBLAS_TEST_COMPLEX *c, int *ldc ) { + + CBLAS_TEST_COMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_UPLO uplo; + enum CBLAS_SIDE side; + + get_uplo_type(uplow,&uplo); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDB = *n+1; + B=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_COMPLEX )); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX)); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_csymm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, + beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_csymm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); + else + cblas_csymm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); +} + +void F77_cherk(int *order, char *uplow, char *transp, int *n, int *k, + float *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + float *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { + + int i,j,LDA,LDC; + CBLAS_TEST_COMPLEX *A, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_cherk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta, + C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_cherk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); + else + cblas_cherk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); +} + +void F77_csyrk(int *order, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { + + int i,j,LDA,LDC; + CBLAS_TEST_COMPLEX *A, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_csyrk(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, beta, + C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_csyrk(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, beta, + c, *ldc ); + else + cblas_csyrk(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, beta, + c, *ldc ); +} +void F77_cher2k(int *order, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *b, int *ldb, float *beta, + CBLAS_TEST_COMPLEX *c, int *ldc ) { + int i,j,LDA,LDB,LDC; + CBLAS_TEST_COMPLEX *A, *B, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + LDB = *k+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX )); + B=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_COMPLEX )); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDA = *n+1; + LDB = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc( LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX ) ); + B=(CBLAS_TEST_COMPLEX* )malloc( LDB*(*k)*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ){ + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_cher2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, + B, LDB, *beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_cher2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); + else + cblas_cher2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); +} +void F77_csyr2k(int *order, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, + CBLAS_TEST_COMPLEX *c, int *ldc ) { + int i,j,LDA,LDB,LDC; + CBLAS_TEST_COMPLEX *A, *B, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + LDB = *k+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + B=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDA = *n+1; + LDB = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX)); + B=(CBLAS_TEST_COMPLEX* )malloc(LDB*(*k)*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ){ + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_csyr2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, + B, LDB, beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_csyr2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); + else + cblas_csyr2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); +} +void F77_ctrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, + int *lda, CBLAS_TEST_COMPLEX *b, int *ldb) { + int i,j,LDA,LDB; + CBLAS_TEST_COMPLEX *A, *B; + enum CBLAS_SIDE side; + enum CBLAS_DIAG diag; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + cblas_ctrmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + b[j*(*ldb)+i].real=B[i*LDB+j].real; + b[j*(*ldb)+i].imag=B[i*LDB+j].imag; + } + free(A); + free(B); + } + else if (*order == TEST_COL_MJR) + cblas_ctrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); + else + cblas_ctrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); +} + +void F77_ctrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, + int *lda, CBLAS_TEST_COMPLEX *b, int *ldb) { + int i,j,LDA,LDB; + CBLAS_TEST_COMPLEX *A, *B; + enum CBLAS_SIDE side; + enum CBLAS_DIAG diag; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_COMPLEX* )malloc( (*m)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + cblas_ctrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + b[j*(*ldb)+i].real=B[i*LDB+j].real; + b[j*(*ldb)+i].imag=B[i*LDB+j].imag; + } + free(A); + free(B); + } + else if (*order == TEST_COL_MJR) + cblas_ctrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); + else + cblas_ctrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); +} diff --git a/ctest/c_cblat1.f b/ctest/c_cblat1.f new file mode 100644 index 0000000..c741ce5 --- /dev/null +++ b/ctest/c_cblat1.f @@ -0,0 +1,682 @@ + PROGRAM CCBLAT1 +* Test program for the COMPLEX Level 1 CBLAS. +* Based upon the original CBLAS test routine together with: +* F06GAF Example Program Text +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SFAC + INTEGER IC +* .. External Subroutines .. + EXTERNAL CHECK1, CHECK2, HEADER +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SFAC/9.765625E-4/ +* .. Executable Statements .. + WRITE (NOUT,99999) + DO 20 IC = 1, 10 + ICASE = IC + CALL HEADER +* +* Initialize PASS, INCX, INCY, and MODE for a new case. +* The value 9999 for INCX, INCY or MODE will appear in the +* detailed output, if any, for cases that do not involve +* these parameters. +* + PASS = .TRUE. + INCX = 9999 + INCY = 9999 + MODE = 9999 + IF (ICASE.LE.5) THEN + CALL CHECK2(SFAC) + ELSE IF (ICASE.GE.6) THEN + CALL CHECK1(SFAC) + END IF +* -- Print + IF (PASS) WRITE (NOUT,99998) + 20 CONTINUE + STOP +* +99999 FORMAT (' Complex CBLAS Test Program Results',/1X) +99998 FORMAT (' ----- PASS -----') + END + SUBROUTINE HEADER +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Arrays .. + CHARACTER*15 L(10) +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA L(1)/'CBLAS_CDOTC'/ + DATA L(2)/'CBLAS_CDOTU'/ + DATA L(3)/'CBLAS_CAXPY'/ + DATA L(4)/'CBLAS_CCOPY'/ + DATA L(5)/'CBLAS_CSWAP'/ + DATA L(6)/'CBLAS_SCNRM2'/ + DATA L(7)/'CBLAS_SCASUM'/ + DATA L(8)/'CBLAS_CSCAL'/ + DATA L(9)/'CBLAS_CSSCAL'/ + DATA L(10)/'CBLAS_ICAMAX'/ +* .. Executable Statements .. + WRITE (NOUT,99999) ICASE, L(ICASE) + RETURN +* +99999 FORMAT (/' Test of subprogram number',I3,9X,A15) + END + SUBROUTINE CHECK1(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + COMPLEX CA + REAL SA + INTEGER I, J, LEN, NP1 +* .. Local Arrays .. + COMPLEX CTRUE5(8,5,2), CTRUE6(8,5,2), CV(8,5,2), CX(8), + + MWPCS(5), MWPCT(5) + REAL STRUE2(5), STRUE4(5) + INTEGER ITRUE3(5) +* .. External Functions .. + REAL SCASUMTEST, SCNRM2TEST + INTEGER ICAMAXTEST + EXTERNAL SCASUMTEST, SCNRM2TEST, ICAMAXTEST +* .. External Subroutines .. + EXTERNAL CSCAL, CSSCALTEST, CTEST, ITEST1, STEST1 +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA, CA/0.3E0, (0.4E0,-0.7E0)/ + DATA ((CV(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (0.3E0,-0.4E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (0.1E0,-0.3E0), (0.5E0,-0.1E0), (5.0E0,6.0E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (0.1E0,0.1E0), + + (-0.6E0,0.1E0), (0.1E0,-0.3E0), (7.0E0,8.0E0), + + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + + (7.0E0,8.0E0), (0.3E0,0.1E0), (0.1E0,0.4E0), + + (0.4E0,0.1E0), (0.1E0,0.2E0), (2.0E0,3.0E0), + + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0)/ + DATA ((CV(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (0.3E0,-0.4E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (0.1E0,-0.3E0), (8.0E0,9.0E0), (0.5E0,-0.1E0), + + (2.0E0,5.0E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + + (2.0E0,5.0E0), (2.0E0,5.0E0), (0.1E0,0.1E0), + + (3.0E0,6.0E0), (-0.6E0,0.1E0), (4.0E0,7.0E0), + + (0.1E0,-0.3E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + + (7.0E0,2.0E0), (0.3E0,0.1E0), (5.0E0,8.0E0), + + (0.1E0,0.4E0), (6.0E0,9.0E0), (0.4E0,0.1E0), + + (8.0E0,3.0E0), (0.1E0,0.2E0), (9.0E0,4.0E0)/ + DATA STRUE2/0.0E0, 0.5E0, 0.6E0, 0.7E0, 0.7E0/ + DATA STRUE4/0.0E0, 0.7E0, 1.0E0, 1.3E0, 1.7E0/ + DATA ((CTRUE5(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (-0.16E0,-0.37E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (-0.17E0,-0.19E0), (0.13E0,-0.39E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + + (0.11E0,-0.03E0), (-0.17E0,0.46E0), + + (-0.17E0,-0.19E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + + (0.19E0,-0.17E0), (0.32E0,0.09E0), + + (0.23E0,-0.24E0), (0.18E0,0.01E0), + + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0), + + (2.0E0,3.0E0)/ + DATA ((CTRUE5(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (-0.16E0,-0.37E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (-0.17E0,-0.19E0), (8.0E0,9.0E0), + + (0.13E0,-0.39E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + + (2.0E0,5.0E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + + (0.11E0,-0.03E0), (3.0E0,6.0E0), + + (-0.17E0,0.46E0), (4.0E0,7.0E0), + + (-0.17E0,-0.19E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + + (7.0E0,2.0E0), (0.19E0,-0.17E0), (5.0E0,8.0E0), + + (0.32E0,0.09E0), (6.0E0,9.0E0), + + (0.23E0,-0.24E0), (8.0E0,3.0E0), + + (0.18E0,0.01E0), (9.0E0,4.0E0)/ + DATA ((CTRUE6(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (0.09E0,-0.12E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (0.03E0,-0.09E0), (0.15E0,-0.03E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + + (0.03E0,0.03E0), (-0.18E0,0.03E0), + + (0.03E0,-0.09E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + + (0.09E0,0.03E0), (0.03E0,0.12E0), + + (0.12E0,0.03E0), (0.03E0,0.06E0), (2.0E0,3.0E0), + + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0)/ + DATA ((CTRUE6(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (0.09E0,-0.12E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (0.03E0,-0.09E0), (8.0E0,9.0E0), + + (0.15E0,-0.03E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + + (2.0E0,5.0E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + + (0.03E0,0.03E0), (3.0E0,6.0E0), + + (-0.18E0,0.03E0), (4.0E0,7.0E0), + + (0.03E0,-0.09E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + + (7.0E0,2.0E0), (0.09E0,0.03E0), (5.0E0,8.0E0), + + (0.03E0,0.12E0), (6.0E0,9.0E0), (0.12E0,0.03E0), + + (8.0E0,3.0E0), (0.03E0,0.06E0), (9.0E0,4.0E0)/ + DATA ITRUE3/0, 1, 2, 2, 2/ +* .. Executable Statements .. + DO 60 INCX = 1, 2 + DO 40 NP1 = 1, 5 + N = NP1 - 1 + LEN = 2*MAX(N,1) +* .. Set vector arguments .. + DO 20 I = 1, LEN + CX(I) = CV(I,NP1,INCX) + 20 CONTINUE + IF (ICASE.EQ.6) THEN +* .. SCNRM2TEST .. + CALL STEST1(SCNRM2TEST(N,CX,INCX),STRUE2(NP1), + + STRUE2(NP1), SFAC) + ELSE IF (ICASE.EQ.7) THEN +* .. SCASUMTEST .. + CALL STEST1(SCASUMTEST(N,CX,INCX),STRUE4(NP1), + + STRUE4(NP1),SFAC) + ELSE IF (ICASE.EQ.8) THEN +* .. CSCAL .. + CALL CSCAL(N,CA,CX,INCX) + CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX), + + SFAC) + ELSE IF (ICASE.EQ.9) THEN +* .. CSSCALTEST .. + CALL CSSCALTEST(N,SA,CX,INCX) + CALL CTEST(LEN,CX,CTRUE6(1,NP1,INCX),CTRUE6(1,NP1,INCX), + + SFAC) + ELSE IF (ICASE.EQ.10) THEN +* .. ICAMAXTEST .. + CALL ITEST1(ICAMAXTEST(N,CX,INCX),ITRUE3(NP1)) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' + STOP + END IF +* + 40 CONTINUE + 60 CONTINUE +* + INCX = 1 + IF (ICASE.EQ.8) THEN +* CSCAL +* Add a test for alpha equal to zero. + CA = (0.0E0,0.0E0) + DO 80 I = 1, 5 + MWPCT(I) = (0.0E0,0.0E0) + MWPCS(I) = (1.0E0,1.0E0) + 80 CONTINUE + CALL CSCAL(5,CA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) + ELSE IF (ICASE.EQ.9) THEN +* CSSCALTEST +* Add a test for alpha equal to zero. + SA = 0.0E0 + DO 100 I = 1, 5 + MWPCT(I) = (0.0E0,0.0E0) + MWPCS(I) = (1.0E0,1.0E0) + 100 CONTINUE + CALL CSSCALTEST(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) +* Add a test for alpha equal to one. + SA = 1.0E0 + DO 120 I = 1, 5 + MWPCT(I) = CX(I) + MWPCS(I) = CX(I) + 120 CONTINUE + CALL CSSCALTEST(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) +* Add a test for alpha equal to minus one. + SA = -1.0E0 + DO 140 I = 1, 5 + MWPCT(I) = -CX(I) + MWPCS(I) = -CX(I) + 140 CONTINUE + CALL CSSCALTEST(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) + END IF + RETURN + END + SUBROUTINE CHECK2(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + COMPLEX CA,CTEMP + INTEGER I, J, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + COMPLEX CDOT(1), CSIZE1(4), CSIZE2(7,2), CSIZE3(14), + + CT10X(7,4,4), CT10Y(7,4,4), CT6(4,4), CT7(4,4), + + CT8(7,4,4), CX(7), CX1(7), CY(7), CY1(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), NS(4) +* .. External Functions .. + EXTERNAL CDOTCTEST, CDOTUTEST +* .. External Subroutines .. + EXTERNAL CAXPYTEST, CCOPYTEST, CSWAPTEST, CTEST +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA CA/(0.4E0,-0.7E0)/ + DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA CX1/(0.7E0,-0.8E0), (-0.4E0,-0.7E0), + + (-0.1E0,-0.9E0), (0.2E0,-0.8E0), + + (-0.9E0,-0.4E0), (0.1E0,0.4E0), (-0.6E0,0.6E0)/ + DATA CY1/(0.6E0,-0.6E0), (-0.9E0,0.5E0), + + (0.7E0,-0.6E0), (0.1E0,-0.5E0), (-0.1E0,-0.2E0), + + (-0.5E0,-0.3E0), (0.8E0,-0.7E0)/ + DATA ((CT8(I,J,1),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.32E0,-1.41E0), + + (-1.55E0,0.5E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.32E0,-1.41E0), (-1.55E0,0.5E0), + + (0.03E0,-0.89E0), (-0.38E0,-0.96E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/ + DATA ((CT8(I,J,2),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (-0.07E0,-0.89E0), + + (-0.9E0,0.5E0), (0.42E0,-1.41E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.78E0,0.06E0), (-0.9E0,0.5E0), + + (0.06E0,-0.13E0), (0.1E0,-0.5E0), + + (-0.77E0,-0.49E0), (-0.5E0,-0.3E0), + + (0.52E0,-1.51E0)/ + DATA ((CT8(I,J,3),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (-0.07E0,-0.89E0), + + (-1.18E0,-0.31E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.78E0,0.06E0), (-1.54E0,0.97E0), + + (0.03E0,-0.89E0), (-0.18E0,-1.31E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/ + DATA ((CT8(I,J,4),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.32E0,-1.41E0), (-0.9E0,0.5E0), + + (0.05E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.32E0,-1.41E0), + + (-0.9E0,0.5E0), (0.05E0,-0.6E0), (0.1E0,-0.5E0), + + (-0.77E0,-0.49E0), (-0.5E0,-0.3E0), + + (0.32E0,-1.16E0)/ + DATA CT7/(0.0E0,0.0E0), (-0.06E0,-0.90E0), + + (0.65E0,-0.47E0), (-0.34E0,-1.22E0), + + (0.0E0,0.0E0), (-0.06E0,-0.90E0), + + (-0.59E0,-1.46E0), (-1.04E0,-0.04E0), + + (0.0E0,0.0E0), (-0.06E0,-0.90E0), + + (-0.83E0,0.59E0), (0.07E0,-0.37E0), + + (0.0E0,0.0E0), (-0.06E0,-0.90E0), + + (-0.76E0,-1.15E0), (-1.33E0,-1.82E0)/ + DATA CT6/(0.0E0,0.0E0), (0.90E0,0.06E0), + + (0.91E0,-0.77E0), (1.80E0,-0.10E0), + + (0.0E0,0.0E0), (0.90E0,0.06E0), (1.45E0,0.74E0), + + (0.20E0,0.90E0), (0.0E0,0.0E0), (0.90E0,0.06E0), + + (-0.55E0,0.23E0), (0.83E0,-0.39E0), + + (0.0E0,0.0E0), (0.90E0,0.06E0), (1.04E0,0.79E0), + + (1.95E0,1.22E0)/ + DATA ((CT10X(I,J,1),I=1,7),J=1,4)/(0.7E0,-0.8E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.6E0,-0.6E0), (-0.9E0,0.5E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.6E0,-0.6E0), + + (-0.9E0,0.5E0), (0.7E0,-0.6E0), (0.1E0,-0.5E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/ + DATA ((CT10X(I,J,2),I=1,7),J=1,4)/(0.7E0,-0.8E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.7E0,-0.6E0), (-0.4E0,-0.7E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.8E0,-0.7E0), + + (-0.4E0,-0.7E0), (-0.1E0,-0.2E0), + + (0.2E0,-0.8E0), (0.7E0,-0.6E0), (0.1E0,0.4E0), + + (0.6E0,-0.6E0)/ + DATA ((CT10X(I,J,3),I=1,7),J=1,4)/(0.7E0,-0.8E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (-0.9E0,0.5E0), (-0.4E0,-0.7E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.1E0,-0.5E0), + + (-0.4E0,-0.7E0), (0.7E0,-0.6E0), (0.2E0,-0.8E0), + + (-0.9E0,0.5E0), (0.1E0,0.4E0), (0.6E0,-0.6E0)/ + DATA ((CT10X(I,J,4),I=1,7),J=1,4)/(0.7E0,-0.8E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.6E0,-0.6E0), (0.7E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.6E0,-0.6E0), + + (0.7E0,-0.6E0), (-0.1E0,-0.2E0), (0.8E0,-0.7E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/ + DATA ((CT10Y(I,J,1),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.7E0,-0.8E0), (-0.4E0,-0.7E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.7E0,-0.8E0), + + (-0.4E0,-0.7E0), (-0.1E0,-0.9E0), + + (0.2E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0)/ + DATA ((CT10Y(I,J,2),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (-0.1E0,-0.9E0), (-0.9E0,0.5E0), + + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (-0.6E0,0.6E0), + + (-0.9E0,0.5E0), (-0.9E0,-0.4E0), (0.1E0,-0.5E0), + + (-0.1E0,-0.9E0), (-0.5E0,-0.3E0), + + (0.7E0,-0.8E0)/ + DATA ((CT10Y(I,J,3),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (-0.1E0,-0.9E0), (0.7E0,-0.8E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (-0.6E0,0.6E0), + + (-0.9E0,-0.4E0), (-0.1E0,-0.9E0), + + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0)/ + DATA ((CT10Y(I,J,4),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.7E0,-0.8E0), (-0.9E0,0.5E0), + + (-0.4E0,-0.7E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.7E0,-0.8E0), + + (-0.9E0,0.5E0), (-0.4E0,-0.7E0), (0.1E0,-0.5E0), + + (-0.1E0,-0.9E0), (-0.5E0,-0.3E0), + + (0.2E0,-0.8E0)/ + DATA CSIZE1/(0.0E0,0.0E0), (0.9E0,0.9E0), + + (1.63E0,1.73E0), (2.90E0,2.78E0)/ + DATA CSIZE3/(0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (1.17E0,1.17E0), + + (1.17E0,1.17E0), (1.17E0,1.17E0), + + (1.17E0,1.17E0), (1.17E0,1.17E0), + + (1.17E0,1.17E0), (1.17E0,1.17E0)/ + DATA CSIZE2/(0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (1.54E0,1.54E0), + + (1.54E0,1.54E0), (1.54E0,1.54E0), + + (1.54E0,1.54E0), (1.54E0,1.54E0), + + (1.54E0,1.54E0), (1.54E0,1.54E0)/ +* .. Executable Statements .. + DO 60 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 40 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* .. initialize all argument arrays .. + DO 20 I = 1, 7 + CX(I) = CX1(I) + CY(I) = CY1(I) + 20 CONTINUE + IF (ICASE.EQ.1) THEN +* .. CDOTCTEST .. + CALL CDOTCTEST(N,CX,INCX,CY,INCY,CTEMP) + CDOT(1) = CTEMP + CALL CTEST(1,CDOT,CT6(KN,KI),CSIZE1(KN),SFAC) + ELSE IF (ICASE.EQ.2) THEN +* .. CDOTUTEST .. + CALL CDOTUTEST(N,CX,INCX,CY,INCY,CTEMP) + CDOT(1) = CTEMP + CALL CTEST(1,CDOT,CT7(KN,KI),CSIZE1(KN),SFAC) + ELSE IF (ICASE.EQ.3) THEN +* .. CAXPYTEST .. + CALL CAXPYTEST(N,CA,CX,INCX,CY,INCY) + CALL CTEST(LENY,CY,CT8(1,KN,KI),CSIZE2(1,KSIZE),SFAC) + ELSE IF (ICASE.EQ.4) THEN +* .. CCOPYTEST .. + CALL CCOPYTEST(N,CX,INCX,CY,INCY) + CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0E0) + ELSE IF (ICASE.EQ.5) THEN +* .. CSWAPTEST .. + CALL CSWAPTEST(N,CX,INCX,CY,INCY) + CALL CTEST(LENX,CX,CT10X(1,KN,KI),CSIZE3,1.0E0) + CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0E0) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' + STOP + END IF +* + 40 CONTINUE + 60 CONTINUE + RETURN + END + SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) +* ********************************* STEST ************************** +* +* THIS SUBR COMPARES ARRAYS SCOMP() AND STRUE() OF LENGTH LEN TO +* SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE +* NEGLIGIBLE. +* +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC + INTEGER LEN +* .. Array Arguments .. + REAL SCOMP(LEN), SSIZE(LEN), STRUE(LEN) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SD + INTEGER I +* .. External Functions .. + REAL SDIFF + EXTERNAL SDIFF +* .. Intrinsic Functions .. + INTRINSIC ABS +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + DO 40 I = 1, LEN + SD = SCOMP(I) - STRUE(I) + IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0E0) + + GO TO 40 +* +* HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I), + + STRUE(I), SD, SSIZE(I) + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE I ', + + ' COMP(I) TRUE(I) DIFFERENCE', + + ' SIZE(I)',/1X) +99997 FORMAT (1X,I4,I3,3I5,I3,2E36.8,2E12.4) + END + SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) +* ************************* STEST1 ***************************** +* +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE +* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + REAL SCOMP1, SFAC, STRUE1 +* .. Array Arguments .. + REAL SSIZE(*) +* .. Local Arrays .. + REAL SCOMP(1), STRUE(1) +* .. External Subroutines .. + EXTERNAL STEST +* .. Executable Statements .. +* + SCOMP(1) = SCOMP1 + STRUE(1) = STRUE1 + CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC) +* + RETURN + END + REAL FUNCTION SDIFF(SA,SB) +* ********************************* SDIFF ************************** +* COMPUTES DIFFERENCE OF TWO NUMBERS. C. L. LAWSON, JPL 1974 FEB 15 +* +* .. Scalar Arguments .. + REAL SA, SB +* .. Executable Statements .. + SDIFF = SA - SB + RETURN + END + SUBROUTINE CTEST(LEN,CCOMP,CTRUE,CSIZE,SFAC) +* **************************** CTEST ***************************** +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + REAL SFAC + INTEGER LEN +* .. Array Arguments .. + COMPLEX CCOMP(LEN), CSIZE(LEN), CTRUE(LEN) +* .. Local Scalars .. + INTEGER I +* .. Local Arrays .. + REAL SCOMP(20), SSIZE(20), STRUE(20) +* .. External Subroutines .. + EXTERNAL STEST +* .. Intrinsic Functions .. + INTRINSIC AIMAG, REAL +* .. Executable Statements .. + DO 20 I = 1, LEN + SCOMP(2*I-1) = REAL(CCOMP(I)) + SCOMP(2*I) = AIMAG(CCOMP(I)) + STRUE(2*I-1) = REAL(CTRUE(I)) + STRUE(2*I) = AIMAG(CTRUE(I)) + SSIZE(2*I-1) = REAL(CSIZE(I)) + SSIZE(2*I) = AIMAG(CSIZE(I)) + 20 CONTINUE +* + CALL STEST(2*LEN,SCOMP,STRUE,SSIZE,SFAC) + RETURN + END + SUBROUTINE ITEST1(ICOMP,ITRUE) +* ********************************* ITEST1 ************************* +* +* THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR +* EQUALITY. +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + INTEGER ICOMP, ITRUE +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER ID +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. + IF (ICOMP.EQ.ITRUE) GO TO 40 +* +* HERE ICOMP IS NOT EQUAL TO ITRUE. +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 ID = ICOMP - ITRUE + WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE ', + + ' COMP TRUE DIFFERENCE', + + /1X) +99997 FORMAT (1X,I4,I3,3I5,2I36,I12) + END diff --git a/ctest/c_cblat2.f b/ctest/c_cblat2.f new file mode 100644 index 0000000..545ba4b --- /dev/null +++ b/ctest/c_cblat2.f @@ -0,0 +1,2932 @@ + PROGRAM CBLAT2 +* +* Test program for the COMPLEX Level 2 Blas. +* +* The program must be driven by a short data file. The first 17 records +* of the file are read using list-directed input, the last 17 records +* are read using the format ( A12, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 34 lines: +* 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 4 NUMBER OF VALUES OF K +* 0 1 2 4 VALUES OF K +* 4 NUMBER OF VALUES OF INCX AND INCY +* 1 2 -1 -2 VALUES OF INCX AND INCY +* 3 NUMBER OF VALUES OF ALPHA +* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +* cblas_cgemv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_cgbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_chemv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_chbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_chpmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ctrmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ctbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ctpmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ctrsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ctbsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ctpsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_cgerc T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_cgeru T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_cher T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_chpr T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_cher2 T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_chpr2 T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +* An extended set of Fortran Basic Linear Algebra Subprograms. +* +* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +* and Computer Science Division, Argonne National Laboratory, +* 9700 South Cass Avenue, Argonne, Illinois 60439, US. +* +* Or +* +* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +* +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 17 ) + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + REAL RZERO, RHALF, RONE + PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 ) + INTEGER NMAX, INCMAX + PARAMETER ( NMAX = 65, INCMAX = 2 ) + INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX + PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, + $ NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + REAL EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB, + $ NTRA, LAYOUT + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR, CORDER, RORDER + CHARACTER*1 TRANS + CHARACTER*12 SNAMET + CHARACTER*32 SNAPS +* .. Local Arrays .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( 2*NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*12 SNAMES( NSUBS ) +* .. External Functions .. + REAL SDIFF + LOGICAL LCE + EXTERNAL SDIFF, LCE +* .. External Subroutines .. + EXTERNAL CCHK1, CCHK2, CCHK3, CCHK4, CCHK5, CCHK6, + $ CC2CHKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK + CHARACTER*12 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'cblas_cgemv ', 'cblas_cgbmv ', + $ 'cblas_chemv ','cblas_chbmv ','cblas_chpmv ', + $ 'cblas_ctrmv ','cblas_ctbmv ','cblas_ctpmv ', + $ 'cblas_ctrsv ','cblas_ctbsv ','cblas_ctpsv ', + $ 'cblas_cgerc ','cblas_cgeru ','cblas_cher ', + $ 'cblas_chpr ','cblas_cher2 ','cblas_chpr2 '/ +* .. Executable Statements .. +* + NOUTC = NOUT +* +* Read name and unit number for summary output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the flag that indicates whether row-major data layout to be tested. + READ( NIN, FMT = * )LAYOUT +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 230 + END IF + 10 CONTINUE +* Values of K + READ( NIN, FMT = * )NKB + IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN + WRITE( NOUT, FMT = 9997 )'K', NKBMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( KB( I ), I = 1, NKB ) + DO 20 I = 1, NKB + IF( KB( I ).LT.0 )THEN + WRITE( NOUT, FMT = 9995 ) + GO TO 230 + END IF + 20 CONTINUE +* Values of INCX and INCY + READ( NIN, FMT = * )NINC + IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN + WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( INC( I ), I = 1, NINC ) + DO 30 I = 1, NINC + IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN + WRITE( NOUT, FMT = 9994 )INCMAX + GO TO 230 + END IF + 30 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9993 ) + WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB ) + WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC ) + WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9980 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) + + RORDER = .FALSE. + CORDER = .FALSE. + IF (LAYOUT.EQ.2) THEN + RORDER = .TRUE. + CORDER = .TRUE. + WRITE( *, FMT = 10002 ) + ELSE IF (LAYOUT.EQ.1) THEN + RORDER = .TRUE. + WRITE( *, FMT = 10001 ) + ELSE IF (LAYOUT.EQ.0) THEN + CORDER = .TRUE. + WRITE( *, FMT = 10000 ) + END IF + WRITE( *, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 40 I = 1, NSUBS + LTEST( I ) = .FALSE. + 40 CONTINUE + 50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT + DO 60 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 70 + 60 CONTINUE + WRITE( NOUT, FMT = 9986 )SNAMET + STOP + 70 LTEST( I ) = LTESTT + GO TO 50 +* + 80 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = RONE + 90 CONTINUE + IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO ) + $ GO TO 100 + EPS = RHALF*EPS + GO TO 90 + 100 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of CMVCH using exact data. +* + N = MIN( 32, NMAX ) + DO 120 J = 1, N + DO 110 I = 1, N + A( I, J ) = MAX( I - J + 1, 0 ) + 110 CONTINUE + X( J ) = J + Y( J ) = ZERO + 120 CONTINUE + DO 130 J = 1, N + YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE +* YY holds the exact result. On exit from CMVCH YT holds +* the result computed by CMVCH. + TRANS = 'N' + CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF + TRANS = 'T' + CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 210 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL CC2CHKE( SNAMES( ISNUM ) ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 140, 150, 150, 150, 160, 160, + $ 160, 160, 160, 160, 170, 170, 180, + $ 180, 190, 190 )ISNUM +* Test CGEMV, 01, and CGBMV, 02. + 140 IF (CORDER) THEN + CALL CCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 0 ) + END IF + IF (RORDER) THEN + CALL CCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 1 ) + END IF + GO TO 200 +* Test CHEMV, 03, CHBMV, 04, and CHPMV, 05. + 150 IF (CORDER) THEN + CALL CCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 0 ) + END IF + IF (RORDER) THEN + CALL CCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 1 ) + END IF + GO TO 200 +* Test CTRMV, 06, CTBMV, 07, CTPMV, 08, +* CTRSV, 09, CTBSV, 10, and CTPSV, 11. + 160 IF (CORDER) THEN + CALL CCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, + $ 0 ) + END IF + IF (RORDER) THEN + CALL CCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, + $ 1 ) + END IF + GO TO 200 +* Test CGERC, 12, CGERU, 13. + 170 IF (CORDER) THEN + CALL CCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL CCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF + GO TO 200 +* Test CHER, 14, and CHPR, 15. + 180 IF (CORDER) THEN + CALL CCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL CCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF + GO TO 200 +* Test CHER2, 16, and CHPR2, 17. + 190 IF (CORDER) THEN + CALL CCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL CCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF +* + 200 IF( FATAL.AND.SFATAL ) + $ GO TO 220 + END IF + 210 CONTINUE + WRITE( NOUT, FMT = 9982 ) + GO TO 240 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9981 ) + GO TO 240 +* + 230 CONTINUE + WRITE( NOUT, FMT = 9987 ) +* + 240 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* +10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) +10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) +10000 FORMAT( ' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) + 9999 FORMAT(' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) + 9997 FORMAT(' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' ) + 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ', + $ I2 ) + 9993 FORMAT(' TESTS OF THE COMPLEX LEVEL 2 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9992 FORMAT( ' FOR N ', 9I6 ) + 9991 FORMAT( ' FOR K ', 7I6 ) + 9990 FORMAT( ' FOR INCX AND INCY ', 7I6 ) + 9989 FORMAT( ' FOR ALPHA ', + $ 7('(', F4.1, ',', F4.1, ') ', : ) ) + 9988 FORMAT( ' FOR BETA ', + $ 7('(', F4.1, ',', F4.1, ') ', : ) ) + 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9986 FORMAT(' SUBPROGRAM NAME ',A12, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9985 FORMAT(' ERROR IN CMVCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' CMVCH WAS CALLED WITH TRANS = ', A1, + $ ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', / + $ ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.' + $ , /' ******* TESTS ABANDONED *******' ) + 9984 FORMAT(A12, L2 ) + 9983 FORMAT( 1X,A12, ' WAS NOT TESTED' ) + 9982 FORMAT( /' END OF TESTS' ) + 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of CBLAT2. +* + END + SUBROUTINE CCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G, IORDER ) +* +* Tests CGEMV and CGBMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BLS, TRANSL + REAL ERR, ERRMAX + INTEGER I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA, + $ LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK, + $ NL, NS + LOGICAL BANDED, FULL, NULL, RESET, SAME, TRAN + CHARACTER*1 TRANS, TRANSS + CHARACTER*14 CTRANS + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCGBMV, CCGEMV, CMAKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + BANDED = SNAME( 9: 9 ).EQ.'b' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 11 + ELSE IF( BANDED )THEN + NARGS = 13 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IKU = 1, NK + IF( BANDED )THEN + KU = KB( IKU ) + KL = MAX( KU - 1, 0 ) + ELSE + KU = N - 1 + KL = M - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = KL + KU + 1 + ELSE + LDA = M + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE( SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, AA, + $ LDA, KL, KU, RESET, TRANSL ) +* + DO 90 IC = 1, 3 + TRANS = ICH( IC: IC ) + IF (TRANS.EQ.'N')THEN + CTRANS = ' CblasNoTrans' + ELSE IF (TRANS.EQ.'T')THEN + CTRANS = ' CblasTrans' + ELSE + CTRANS = 'CblasConjTrans' + END IF + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' +* + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*NL +* +* Generate the vector X. +* + TRANSL = HALF + CALL CMAKE( 'ge', ' ', ' ', 1, NL, X, 1, XX, + $ ABS( INCX ), 0, NL - 1, RESET, TRANSL ) + IF( NL.GT.1 )THEN + X( NL/2 ) = ZERO + XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*ML +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL CMAKE( 'ge', ' ', ' ', 1, ML, Y, 1, + $ YY, ABS( INCY ), 0, ML - 1, + $ RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANSS = TRANS + MS = M + NS = N + KLS = KL + KUS = KU + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CTRANS, M, N, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL CCGEMV( IORDER, TRANS, M, N, + $ ALPHA, AA, LDA, XX, INCX, + $ BETA, YY, INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CTRANS, M, N, KL, KU, ALPHA, LDA, + $ INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CCGBMV( IORDER, TRANS, M, N, KL, + $ KU, ALPHA, AA, LDA, XX, + $ INCX, BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 130 + END IF +* +* See what data changed inside subroutines. +* +* IF(TRANS .NE. 'C' .OR. (INCX .GT. 0 .AND. INCY .GT. 0)) THEN + ISAME( 1 ) = TRANS.EQ.TRANSS + ISAME( 2 ) = MS.EQ.M + ISAME( 3 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LCE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LCE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LCE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LCERES( 'ge', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 4 ) = KLS.EQ.KL + ISAME( 5 ) = KUS.EQ.KU + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LCE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LCE( XS, XX, LX ) + ISAME( 10 ) = INCXS.EQ.INCX + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LCE( YS, YY, LY ) + ELSE + ISAME( 12 ) = LCERES( 'ge', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 13 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 130 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL CMVCH( TRANS, M, N, ALPHA, A, + $ NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 130 + ELSE +* Avoid repeating tests with M.le.0 or +* N.le.0. + GO TO 110 + END IF +* END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 140 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CTRANS, M, N, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CTRANS, M, N, KL, KU, + $ ALPHA, LDA, INCX, BETA, INCY + END IF +* + 140 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 4( I3, ',' ), '(', + $ F4.1, ',', F4.1, '), A,',/ 10x, I3, ', X,', I2, ',(', + $ F4.1, ',', F4.1, '), Y,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), '(', + $ F4.1, ',', F4.1, '), A,',/ 10x, I3, ', X,', I2, ',(', + $ F4.1, ',', F4.1, '), Y,', I2, ') .' ) + 9993 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK1. +* + END + SUBROUTINE CCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G, IORDER ) +* +* Tests CHEMV, CHBMV and CHPMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BLS, TRANSL + REAL ERR, ERRMAX + INTEGER I, IA, IB, IC, IK, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY, + $ N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCHBMV, CCHEMV, CCHPMV, CMAKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + BANDED = SNAME( 9: 9 ).EQ.'b' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 10 + ELSE IF( BANDED )THEN + NARGS = 11 + ELSE IF( PACKED )THEN + NARGS = 9 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, AA, + $ LDA, K, K, RESET, TRANSL ) +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL CMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL CMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + UPLOS = UPLO + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, N, ALPHA, LDA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CCHEMV( IORDER, UPLO, N, ALPHA, AA, + $ LDA, XX, INCX, BETA, YY, + $ INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, N, K, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL CCHBMV( IORDER, UPLO, N, K, ALPHA, + $ AA, LDA, XX, INCX, BETA, + $ YY, INCY ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, N, ALPHA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CCHPMV( IORDER, UPLO, N, ALPHA, AA, + $ XX, INCX, BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LCE( AS, AA, LAA ) + ISAME( 5 ) = LDAS.EQ.LDA + ISAME( 6 ) = LCE( XS, XX, LX ) + ISAME( 7 ) = INCXS.EQ.INCX + ISAME( 8 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LCE( YS, YY, LY ) + ELSE + ISAME( 9 ) = LCERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 10 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 3 ) = KS.EQ.K + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LCE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LCE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LCE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LCERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( PACKED )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LCE( AS, AA, LAA ) + ISAME( 5 ) = LCE( XS, XX, LX ) + ISAME( 6 ) = INCXS.EQ.INCX + ISAME( 7 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 8 ) = LCE( YS, YY, LY ) + ELSE + ISAME( 8 ) = LCERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 9 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL CMVCH( 'N', N, N, ALPHA, A, NMAX, X, + $ INCX, BETA, Y, INCY, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0 + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, LDA, INCX, + $ BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, K, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, N, ALPHA, INCX, + $ BETA, INCY + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', + $ F4.1, '), AP, X,',/ 10x, I2, ',(', F4.1, ',', F4.1, + $ '), Y,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), '(', + $ F4.1, ',', F4.1, '), A,', I3, ', X,',/ 10x, I2, ',(', + $ F4.1, ',', F4.1, '), Y,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', + $ F4.1, '), A,', I3, ', X,',/ 10x, I2, ',(', F4.1, ',', + $ F4.1, '), ', 'Y,', I2, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK2. +* + END + SUBROUTINE CCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, XT, G, Z, IORDER ) +* +* Tests CTRMV, CTBMV, CTPMV, CTRSV, CTBSV and CTPSV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ), + $ ONE = ( 1.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XT( NMAX ), XX( NMAX*INCMAX ), Z( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + COMPLEX TRANSL + REAL ERR, ERRMAX + INTEGER I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K, + $ KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS + CHARACTER*14 CUPLO,CTRANS,CDIAG + CHARACTER*2 ICHD, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CMAKE, CMVCH, CCTBMV, CCTBSV, CCTPMV, + $ CCTPSV, CCTRMV, CCTRSV +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'r' + BANDED = SNAME( 9: 9 ).EQ.'b' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 8 + ELSE IF( BANDED )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 7 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* Set up zero vector for CMVCH. + DO 10 I = 1, NMAX + Z( I ) = ZERO + 10 CONTINUE +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + IF (TRANS.EQ.'N')THEN + CTRANS = ' CblasNoTrans' + ELSE IF (TRANS.EQ.'T')THEN + CTRANS = ' CblasTrans' + ELSE + CTRANS = 'CblasConjTrans' + END IF +* + DO 70 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) + IF (DIAG.EQ.'N')THEN + CDIAG = ' CblasNonUnit' + ELSE + CDIAG = ' CblasUnit' + END IF +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE( SNAME( 8: 9 ), UPLO, DIAG, N, N, A, + $ NMAX, AA, LDA, K, K, RESET, TRANSL ) +* + DO 60 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL CMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, + $ TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + DIAGS = DIAG + NS = N + KS = K + DO 20 I = 1, LAA + AS( I ) = AA( I ) + 20 CONTINUE + LDAS = LDA + DO 30 I = 1, LX + XS( I ) = XX( I ) + 30 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( SNAME( 10: 11 ).EQ.'mv' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CCTRMV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, LDA, XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CCTBMV( IORDER, UPLO, TRANS, DIAG, + $ N, K, AA, LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL CCTPMV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, XX, INCX ) + END IF + ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CCTRSV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, LDA, XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CCTBSV( IORDER, UPLO, TRANS, DIAG, + $ N, K, AA, LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL CCTPSV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, XX, INCX ) + END IF + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = TRANS.EQ.TRANSS + ISAME( 3 ) = DIAG.EQ.DIAGS + ISAME( 4 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 5 ) = LCE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 7 ) = LCE( XS, XX, LX ) + ELSE + ISAME( 7 ) = LCERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 8 ) = INCXS.EQ.INCX + ELSE IF( BANDED )THEN + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = LCE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 8 ) = LCE( XS, XX, LX ) + ELSE + ISAME( 8 ) = LCERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 9 ) = INCXS.EQ.INCX + ELSE IF( PACKED )THEN + ISAME( 5 ) = LCE( AS, AA, LAA ) + IF( NULL )THEN + ISAME( 6 ) = LCE( XS, XX, LX ) + ELSE + ISAME( 6 ) = LCERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 7 ) = INCXS.EQ.INCX + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 10: 11 ).EQ.'mv' )THEN +* +* Check the result. +* + CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, + $ INCX, ZERO, Z, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN +* +* Compute approximation to original vector. +* + DO 50 I = 1, N + Z( I ) = XX( 1 + ( I - 1 )* + $ ABS( INCX ) ) + XX( 1 + ( I - 1 )*ABS( INCX ) ) + $ = X( I ) + 50 CONTINUE + CALL CMVCH( TRANS, N, N, ONE, A, NMAX, Z, + $ INCX, ZERO, X, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .FALSE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0. + GO TO 110 + END IF +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, + $ LDA, INCX + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, K, + $ LDA, INCX + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, + $ INCX + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT(1X, I6, ': ',A12, '(', 3( A14, ',' ),/ 10x, I3, ', AP, ', + $ 'X,', I2, ') .' ) + 9994 FORMAT(1X, I6, ': ',A12, '(', 3( A14, ',' ),/ 10x, 2( I3, ',' ), + $ ' A,', I3, ', X,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', 3( A14, ',' ),/ 10x, I3, ', A,', + $ I3, ', X,', I2, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK3. +* + END + SUBROUTINE CCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests CGERC and CGERU. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ), + $ ONE = ( 1.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, TRANSL + REAL ERR, ERRMAX + INTEGER I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS, + $ NC, ND, NS + LOGICAL CONJ, NULL, RESET, SAME +* .. Local Arrays .. + COMPLEX W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCGERC, CCGERU, CMAKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, CONJG, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Executable Statements .. + CONJ = SNAME( 11: 11 ).EQ.'c' +* Define the number of arguments. + NARGS = 9 +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* +* Set LDA to 1 more than minimum value if room. + LDA = M + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 100 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*M +* +* Generate the vector X. +* + TRANSL = HALF + CALL CMAKE( 'ge', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ), + $ 0, M - 1, RESET, TRANSL ) + IF( M.GT.1 )THEN + X( M/2 ) = ZERO + XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO + END IF +* + DO 90 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL CMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE(SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, + $ AA, LDA, M - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N, + $ ALPHA, INCX, INCY, LDA + IF( CONJ )THEN + IF( REWI ) + $ REWIND NTRA + CALL CCGERC( IORDER, M, N, ALPHA, XX, INCX, + $ YY, INCY, AA, LDA ) + ELSE + IF( REWI ) + $ REWIND NTRA + CALL CCGERU( IORDER, M, N, ALPHA, XX, INCX, + $ YY, INCY, AA, LDA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 140 + END IF +* +* See what data changed inside subroutine. +* + ISAME( 1 ) = MS.EQ.M + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LCE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LCE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LCE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LCERES( 'ge', ' ', M, N, AS, AA, + $ LDA ) + END IF + ISAME( 9 ) = LDAS.EQ.LDA +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 140 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, M + Z( I ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, M + Z( I ) = X( M - I + 1 ) + 60 CONTINUE + END IF + DO 70 J = 1, N + IF( INCY.GT.0 )THEN + W( 1 ) = Y( J ) + ELSE + W( 1 ) = Y( N - J + 1 ) + END IF + IF( CONJ ) + $ W( 1 ) = CONJG( W( 1 ) ) + CALL CMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1, + $ ONE, A( 1, J ), 1, YT, G, + $ AA( 1 + ( J - 1 )*LDA ), EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 130 + 70 CONTINUE + ELSE +* Avoid repeating tests with M.le.0 or N.le.0. + GO TO 110 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 150 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 140 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA +* + 150 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ',A12, '(', 2( I3, ',' ), '(', F4.1, ',', F4.1, + $ '), X,', I2, ', Y,', I2, ', A,', I3, ') .' ) + 9993 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK4. +* + END + SUBROUTINE CCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests CHER and CHPR. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ), + $ ONE = ( 1.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX ALPHA, TRANSL + REAL ERR, ERRMAX, RALPHA, RALS + INTEGER I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA, + $ LDA, LDAS, LJ, LX, N, NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + COMPLEX W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCHER, CCHPR, CMAKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, CMPLX, CONJG, MAX, REAL +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 7 + ELSE IF( PACKED )THEN + NARGS = 6 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF + UPPER = UPLO.EQ.'U' +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL CMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IA = 1, NALF + RALPHA = REAL( ALF( IA ) ) + ALPHA = CMPLX( RALPHA, RZERO ) + NULL = N.LE.0.OR.RALPHA.EQ.RZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, + $ AA, LDA, N - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + RALS = RALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, + $ RALPHA, INCX, LDA + IF( REWI ) + $ REWIND NTRA + CALL CCHER( IORDER, UPLO, N, RALPHA, XX, + $ INCX, AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, + $ RALPHA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CCHPR( IORDER, UPLO, N, RALPHA, + $ XX, INCX, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = RALS.EQ.RALPHA + ISAME( 4 ) = LCE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + IF( NULL )THEN + ISAME( 6 ) = LCE( AS, AA, LAA ) + ELSE + ISAME( 6 ) = LCERES( SNAME( 8: 9 ), UPLO, N, N, AS, + $ AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 7 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 40 I = 1, N + Z( I ) = X( I ) + 40 CONTINUE + ELSE + DO 50 I = 1, N + Z( I ) = X( N - I + 1 ) + 50 CONTINUE + END IF + JA = 1 + DO 60 J = 1, N + W( 1 ) = CONJG( Z( J ) ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL CMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W, + $ 1, ONE, A( JJ, J ), 1, YT, G, + $ AA( JA ), EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 110 + 60 CONTINUE + ELSE +* Avoid repeating tests if N.le.0. + IF( N.LE.0 ) + $ GO TO 100 + END IF +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, RALPHA, INCX, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, RALPHA, INCX + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', AP) .' ) + 9993 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', A,', I3, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK5. +* + END + SUBROUTINE CCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests CHER2 and CHPR2. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ), + $ ONE = ( 1.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX, 2 ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, TRANSL + REAL ERR, ERRMAX + INTEGER I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N, + $ NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + COMPLEX W( 2 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCHER2, CCHPR2, CMAKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, CONJG, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 8 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 140 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 140 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 130 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF + UPPER = UPLO.EQ.'U' +* + DO 120 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL CMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 110 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL CMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 100 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, + $ NMAX, AA, LDA, N - 1, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL CCHER2( IORDER, UPLO, N, ALPHA, XX, INCX, + $ YY, INCY, AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, INCY + IF( REWI ) + $ REWIND NTRA + CALL CCHPR2( IORDER, UPLO, N, ALPHA, XX, INCX, + $ YY, INCY, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 160 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LCE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LCE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LCE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LCERES( SNAME( 8: 9 ), UPLO, N, N, + $ AS, AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 9 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 160 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, N + Z( I, 1 ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, N + Z( I, 1 ) = X( N - I + 1 ) + 60 CONTINUE + END IF + IF( INCY.GT.0 )THEN + DO 70 I = 1, N + Z( I, 2 ) = Y( I ) + 70 CONTINUE + ELSE + DO 80 I = 1, N + Z( I, 2 ) = Y( N - I + 1 ) + 80 CONTINUE + END IF + JA = 1 + DO 90 J = 1, N + W( 1 ) = ALPHA*CONJG( Z( J, 2 ) ) + W( 2 ) = CONJG( ALPHA )*CONJG( Z( J, 1 ) ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL CMVCH( 'N', LJ, 2, ONE, Z( JJ, 1 ), + $ NMAX, W, 1, ONE, A( JJ, J ), 1, + $ YT, G, AA( JA ), EPS, ERR, FATAL, + $ NOUT, .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 150 + 90 CONTINUE + ELSE +* Avoid repeating tests with N.le.0. + IF( N.LE.0 ) + $ GO TO 140 + END IF +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 170 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 160 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, INCX, + $ INCY, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, ALPHA, INCX, INCY + END IF +* + 170 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', + $ F4.1, '), X,', I2, ', Y,', I2, ', AP) .' ) + 9993 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', + $ F4.1, '), X,', I2, ', Y,', I2, ', A,', I3, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK6. +* + END + SUBROUTINE CMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RZERO, RONE + PARAMETER ( RZERO = 0.0, RONE = 1.0 ) +* .. Scalar Arguments .. + COMPLEX ALPHA, BETA + REAL EPS, ERR + INTEGER INCX, INCY, M, N, NMAX, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANS +* .. Array Arguments .. + COMPLEX A( NMAX, * ), X( * ), Y( * ), YT( * ), YY( * ) + REAL G( * ) +* .. Local Scalars .. + COMPLEX C + REAL ERRI + INTEGER I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL + LOGICAL CTRAN, TRAN +* .. Intrinsic Functions .. + INTRINSIC ABS, AIMAG, CONJG, MAX, REAL, SQRT +* .. Statement Functions .. + REAL ABS1 +* .. Statement Function definitions .. + ABS1( C ) = ABS( REAL( C ) ) + ABS( AIMAG( C ) ) +* .. Executable Statements .. + TRAN = TRANS.EQ.'T' + CTRAN = TRANS.EQ.'C' + IF( TRAN.OR.CTRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF + IF( INCX.LT.0 )THEN + KX = NL + INCXL = -1 + ELSE + KX = 1 + INCXL = 1 + END IF + IF( INCY.LT.0 )THEN + KY = ML + INCYL = -1 + ELSE + KY = 1 + INCYL = 1 + END IF +* +* Compute expected result in YT using data in A, X and Y. +* Compute gauges in G. +* + IY = KY + DO 40 I = 1, ML + YT( IY ) = ZERO + G( IY ) = RZERO + JX = KX + IF( TRAN )THEN + DO 10 J = 1, NL + YT( IY ) = YT( IY ) + A( J, I )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 10 CONTINUE + ELSE IF( CTRAN )THEN + DO 20 J = 1, NL + YT( IY ) = YT( IY ) + CONJG( A( J, I ) )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 20 CONTINUE + ELSE + DO 30 J = 1, NL + YT( IY ) = YT( IY ) + A( I, J )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( I, J ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 30 CONTINUE + END IF + YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY ) + G( IY ) = ABS1( ALPHA )*G( IY ) + ABS1( BETA )*ABS1( Y( IY ) ) + IY = IY + INCYL + 40 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 50 I = 1, ML + ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS + IF( G( I ).NE.RZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.RONE ) + $ GO TO 60 + 50 CONTINUE +* If the loop completes, all results are at least half accurate. + GO TO 80 +* +* Report fatal error. +* + 60 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 70 I = 1, ML + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, YT( I ), + $ YY( 1 + ( I - 1 )*ABS( INCY ) ) + ELSE + WRITE( NOUT, FMT = 9998 )I, + $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT( I ) + END IF + 70 CONTINUE +* + 80 CONTINUE + RETURN +* + 9999 FORMAT(' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RE', + $ 'SULT COMPUTED RESULT' ) + 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) +* +* End of CMVCH. +* + END + LOGICAL FUNCTION LCE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + COMPLEX RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LCE = .TRUE. + GO TO 30 + 20 CONTINUE + LCE = .FALSE. + 30 RETURN +* +* End of LCE. +* + END + LOGICAL FUNCTION LCERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'ge', 'he' or 'hp'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'ge' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'he' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LCERES = .TRUE. + GO TO 80 + 70 CONTINUE + LCERES = .FALSE. + 80 RETURN +* +* End of LCERES. +* + END + COMPLEX FUNCTION CBEG( RESET ) +* +* Generates complex numbers as pairs of random numbers uniformly +* distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, J, MI, MJ +* .. Save statement .. + SAVE I, IC, J, MI, MJ +* .. Intrinsic Functions .. + INTRINSIC CMPLX +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + MJ = 457 + I = 7 + J = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I or J is bounded between 1 and 999. +* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. +* If initial I or J = 4 or 8, the period will be 25. +* If initial I or J = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I or J +* in 6. +* + IC = IC + 1 + 10 I = I*MI + J = J*MJ + I = I - 1000*( I/1000 ) + J = J - 1000*( J/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + CBEG = CMPLX( ( I - 500 )/1001.0, ( J - 500 )/1001.0 ) + RETURN +* +* End of CBEG. +* + END + REAL FUNCTION SDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* +* .. Scalar Arguments .. + REAL X, Y +* .. Executable Statements .. + SDIFF = X - Y + RETURN +* +* End of SDIFF. +* + END + SUBROUTINE CMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL, + $ KU, RESET, TRANSL ) +* +* Generates values for an M by N matrix A within the bandwidth +* defined by KL and KU. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'ge', 'gb', 'he', 'hb', 'hp', 'tr', 'tb' OR 'tp'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + COMPLEX ROGUE + PARAMETER ( ROGUE = ( -1.0E10, 1.0E10 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) + REAL RROGUE + PARAMETER ( RROGUE = -1.0E10 ) +* .. Scalar Arguments .. + COMPLEX TRANSL + INTEGER KL, KU, LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, I1, I2, I3, IBEG, IEND, IOFF, J, JJ, KK + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + COMPLEX CBEG + EXTERNAL CBEG +* .. Intrinsic Functions .. + INTRINSIC CMPLX, CONJG, MAX, MIN, REAL +* .. Executable Statements .. + GEN = TYPE( 1: 1 ).EQ.'g' + SYM = TYPE( 1: 1 ).EQ.'h' + TRI = TYPE( 1: 1 ).EQ.'t' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + IF( ( I.LE.J.AND.J - I.LE.KU ).OR. + $ ( I.GE.J.AND.I - J.LE.KL ) )THEN + A( I, J ) = CBEG( RESET ) + TRANSL + ELSE + A( I, J ) = ZERO + END IF + IF( I.NE.J )THEN + IF( SYM )THEN + A( J, I ) = CONJG( A( I, J ) ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( SYM ) + $ A( J, J ) = CMPLX( REAL( A( J, J ) ), RZERO ) + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'ge' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'gb' )THEN + DO 90 J = 1, N + DO 60 I1 = 1, KU + 1 - J + AA( I1 + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J ) + AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J ) + 70 CONTINUE + DO 80 I3 = I2, LDA + AA( I3 + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'tr' )THEN + DO 130 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 100 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 100 CONTINUE + DO 110 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 110 CONTINUE + DO 120 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 120 CONTINUE + IF( SYM )THEN + JJ = J + ( J - 1 )*LDA + AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE ) + END IF + 130 CONTINUE + ELSE IF( TYPE.EQ.'hb'.OR.TYPE.EQ.'tb' )THEN + DO 170 J = 1, N + IF( UPPER )THEN + KK = KL + 1 + IBEG = MAX( 1, KL + 2 - J ) + IF( UNIT )THEN + IEND = KL + ELSE + IEND = KL + 1 + END IF + ELSE + KK = 1 + IF( UNIT )THEN + IBEG = 2 + ELSE + IBEG = 1 + END IF + IEND = MIN( KL + 1, 1 + M - J ) + END IF + DO 140 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 140 CONTINUE + DO 150 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J ) + 150 CONTINUE + DO 160 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 160 CONTINUE + IF( SYM )THEN + JJ = KK + ( J - 1 )*LDA + AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE ) + END IF + 170 CONTINUE + ELSE IF( TYPE.EQ.'hp'.OR.TYPE.EQ.'tp' )THEN + IOFF = 0 + DO 190 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 180 I = IBEG, IEND + IOFF = IOFF + 1 + AA( IOFF ) = A( I, J ) + IF( I.EQ.J )THEN + IF( UNIT ) + $ AA( IOFF ) = ROGUE + IF( SYM ) + $ AA( IOFF ) = CMPLX( REAL( AA( IOFF ) ), RROGUE ) + END IF + 180 CONTINUE + 190 CONTINUE + END IF + RETURN +* +* End of CMAKE. +* + END diff --git a/ctest/c_cblat3.f b/ctest/c_cblat3.f new file mode 100644 index 0000000..b03d479 --- /dev/null +++ b/ctest/c_cblat3.f @@ -0,0 +1,2786 @@ + PROGRAM CBLAT3 +* +* Test program for the COMPLEX Level 3 Blas. +* +* The program must be driven by a short data file. The first 13 records +* of the file are read using list-directed input, the last 9 records +* are read using the format ( A12, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 22 lines: +* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 3 NUMBER OF VALUES OF ALPHA +* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +* cblas_cgemm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_chemm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_csymm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ctrmm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ctrsm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_cherk T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_csyrk T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_cher2k T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_csyr2k T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +* A Set of Level 3 Basic Linear Algebra Subprograms. +* +* Technical Memorandum No.88 (Revision 1), Mathematics and +* Computer Science Division, Argonne National Laboratory, 9700 +* South Cass Avenue, Argonne, Illinois 60439, US. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 9 ) + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + REAL RZERO, RHALF, RONE + PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 ) + INTEGER NMAX + PARAMETER ( NMAX = 65 ) + INTEGER NIDMAX, NALMAX, NBEMAX + PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + REAL EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NTRA, + $ LAYOUT + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR, CORDER, RORDER + CHARACTER*1 TRANSA, TRANSB + CHARACTER*12 SNAMET + CHARACTER*32 SNAPS +* .. Local Arrays .. + COMPLEX AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), + $ BB( NMAX*NMAX ), BET( NBEMAX ), + $ BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ W( 2*NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*12 SNAMES( NSUBS ) +* .. External Functions .. + REAL SDIFF + LOGICAL LCE + EXTERNAL SDIFF, LCE +* .. External Subroutines .. + EXTERNAL CCHK1, CCHK2, CCHK3, CCHK4, CCHK5, CMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK + CHARACTER*12 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'cblas_cgemm ', 'cblas_chemm ', + $ 'cblas_csymm ', 'cblas_ctrmm ', 'cblas_ctrsm ', + $ 'cblas_cherk ', 'cblas_csyrk ', 'cblas_cher2k', + $ 'cblas_csyr2k'/ +* .. Executable Statements .. +* + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the flag that indicates whether row-major data layout to be tested. + READ( NIN, FMT = * )LAYOUT +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 220 + END IF + 10 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9995 ) + WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9984 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) + + RORDER = .FALSE. + CORDER = .FALSE. + IF (LAYOUT.EQ.2) THEN + RORDER = .TRUE. + CORDER = .TRUE. + WRITE( *, FMT = 10002 ) + ELSE IF (LAYOUT.EQ.1) THEN + RORDER = .TRUE. + WRITE( *, FMT = 10001 ) + ELSE IF (LAYOUT.EQ.0) THEN + CORDER = .TRUE. + WRITE( *, FMT = 10000 ) + END IF + WRITE( *, FMT = * ) + +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 20 I = 1, NSUBS + LTEST( I ) = .FALSE. + 20 CONTINUE + 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT + DO 40 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 50 + 40 CONTINUE + WRITE( NOUT, FMT = 9990 )SNAMET + STOP + 50 LTEST( I ) = LTESTT + GO TO 30 +* + 60 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = RONE + 70 CONTINUE + IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO ) + $ GO TO 80 + EPS = RHALF*EPS + GO TO 70 + 80 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of CMMCH using exact data. +* + N = MIN( 32, NMAX ) + DO 100 J = 1, N + DO 90 I = 1, N + AB( I, J ) = MAX( I - J + 1, 0 ) + 90 CONTINUE + AB( J, NMAX + 1 ) = J + AB( 1, NMAX + J ) = J + C( J, 1 ) = ZERO + 100 CONTINUE + DO 110 J = 1, N + CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 110 CONTINUE +* CC holds the exact result. On exit from CMMCH CT holds +* the result computed by CMMCH. + TRANSA = 'N' + TRANSB = 'N' + CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'C' + CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + DO 120 J = 1, N + AB( J, NMAX + 1 ) = N - J + 1 + AB( 1, NMAX + J ) = N - J + 1 + 120 CONTINUE + DO 130 J = 1, N + CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - + $ ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE + TRANSA = 'C' + TRANSB = 'N' + CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'C' + CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 200 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL CC3CHKE( SNAMES( ISNUM ) ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 150, 150, 160, 160, 170, 170, + $ 180, 180 )ISNUM +* Test CGEMM, 01. + 140 IF (CORDER) THEN + CALL CCHK1(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL CCHK1(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test CHEMM, 02, CSYMM, 03. + 150 IF (CORDER) THEN + CALL CCHK2(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL CCHK2(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test CTRMM, 04, CTRSM, 05. + 160 IF (CORDER) THEN + CALL CCHK3(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, + $ 0 ) + END IF + IF (RORDER) THEN + CALL CCHK3(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, + $ 1 ) + END IF + GO TO 190 +* Test CHERK, 06, CSYRK, 07. + 170 IF (CORDER) THEN + CALL CCHK4(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL CCHK4(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test CHER2K, 08, CSYR2K, 09. + 180 IF (CORDER) THEN + CALL CCHK5(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ 0 ) + END IF + IF (RORDER) THEN + CALL CCHK5(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ 1 ) + END IF + GO TO 190 +* + 190 IF( FATAL.AND.SFATAL ) + $ GO TO 210 + END IF + 200 CONTINUE + WRITE( NOUT, FMT = 9986 ) + GO TO 230 +* + 210 CONTINUE + WRITE( NOUT, FMT = 9985 ) + GO TO 230 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9991 ) +* + 230 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* +10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) +10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' ) +10000 FORMAT(' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) + 9999 FORMAT(' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT(' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) + 9997 FORMAT(' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT(' TESTS OF THE COMPLEX LEVEL 3 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9994 FORMAT( ' FOR N ', 9I6 ) + 9993 FORMAT( ' FOR ALPHA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9992 FORMAT( ' FOR BETA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9990 FORMAT(' SUBPROGRAM NAME ', A12,' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9989 FORMAT(' ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' CMMCH WAS CALLED WITH TRANSA = ', A1, + $ 'AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', + $ ' ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', + $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', + $ '*******' ) + 9988 FORMAT( A12,L2 ) + 9987 FORMAT( 1X, A12,' WAS NOT TESTED' ) + 9986 FORMAT( /' END OF TESTS' ) + 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of CBLAT3. +* + END + SUBROUTINE CCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests CGEMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BLS + REAL ERR, ERRMAX + INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, + $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, + $ MA, MB, MS, N, NA, NARGS, NB, NC, NS + LOGICAL NULL, RESET, SAME, TRANA, TRANB + CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCGEMM, CMAKE, CMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. +* + NARGS = 13 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 110 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICA = 1, 3 + TRANSA = ICH( ICA: ICA ) + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' +* + IF( TRANA )THEN + MA = K + NA = M + ELSE + MA = M + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL CMAKE( 'ge', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICB = 1, 3 + TRANSB = ICH( ICB: ICB ) + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* + IF( TRANB )THEN + MB = N + NB = K + ELSE + MB = K + NB = N + END IF +* Set LDB to 1 more than minimum value if room. + LDB = MB + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 70 + LBB = LDB*NB +* +* Generate the matrix B. +* + CALL CMAKE( 'ge', ' ', ' ', MB, NB, B, NMAX, BB, + $ LDB, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL CMAKE( 'ge', ' ', ' ', M, N, C, NMAX, + $ CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANAS = TRANSA + TRANBS = TRANSB + MS = M + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL CPRCN1(NTRA, NC, SNAME, IORDER, + $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, + $ LDB, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CCGEMM( IORDER, TRANSA, TRANSB, M, N, + $ K, ALPHA, AA, LDA, BB, LDB, + $ BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = TRANSA.EQ.TRANAS + ISAME( 2 ) = TRANSB.EQ.TRANBS + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LCE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LCE( BS, BB, LBB ) + ISAME( 10 ) = LDBS.EQ.LDB + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LCE( CS, CC, LCC ) + ELSE + ISAME( 12 ) = LCERES( 'ge', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 13 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL CMMCH( TRANSA, TRANSB, M, N, K, + $ ALPHA, A, NMAX, B, NMAX, BETA, + $ C, NMAX, CT, G, CC, LDC, EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL CPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, + $ M, N, K, ALPHA, LDA, LDB, BETA, LDC) +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A12,'(''', A1, ''',''', A1, ''',', + $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, + $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) + 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK1. +* + END +* + SUBROUTINE CPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, M, N, + $ K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, M, N, K, LDA, LDB, LDC + COMPLEX ALPHA, BETA + CHARACTER*1 TRANSA, TRANSB + CHARACTER*12 SNAME + CHARACTER*14 CRC, CTA,CTB + + IF (TRANSA.EQ.'N')THEN + CTA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CTA = ' CblasTrans' + ELSE + CTA = 'CblasConjTrans' + END IF + IF (TRANSB.EQ.'N')THEN + CTB = ' CblasNoTrans' + ELSE IF (TRANSB.EQ.'T')THEN + CTB = ' CblasTrans' + ELSE + CTB = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CTA,CTB + WRITE(NOUT, FMT = 9994)M, N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 10X, 3( I3, ',' ) ,' (', F4.1,',',F4.1,') , A,', + $ I3, ', B,', I3, ', (', F4.1,',',F4.1,') , C,', I3, ').' ) + END +* + SUBROUTINE CCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests CHEMM and CSYMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BLS + REAL ERR, ERRMAX + INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, + $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, + $ NARGS, NC, NS + LOGICAL CONJ, LEFT, NULL, RESET, SAME + CHARACTER*1 SIDE, SIDES, UPLO, UPLOS + CHARACTER*2 ICHS, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCHEMM, CMAKE, CMMCH, CCSYMM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHS/'LR'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 8: 9 ).EQ.'he' +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 90 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 90 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 90 + LBB = LDB*N +* +* Generate the matrix B. +* + CALL CMAKE( 'ge', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, + $ ZERO ) +* + DO 80 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' +* + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* +* Generate the hermitian or symmetric matrix A. +* + CALL CMAKE(SNAME( 8: 9 ), UPLO, ' ', NA, NA, A, NMAX, + $ AA, LDA, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL CMAKE( 'ge', ' ', ' ', M, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL CPRCN2(NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, M, N, ALPHA, LDA, LDB, + $ BETA, LDC) + IF( REWI ) + $ REWIND NTRA + IF( CONJ )THEN + CALL CCHEMM( IORDER, SIDE, UPLO, M, N, + $ ALPHA, AA, LDA, BB, LDB, BETA, + $ CC, LDC ) + ELSE + CALL CCSYMM( IORDER, SIDE, UPLO, M, N, + $ ALPHA, AA, LDA, BB, LDB, BETA, + $ CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 110 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LCE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LCE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LCE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LCERES( 'ge', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 110 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL CMMCH( 'N', 'N', M, N, M, ALPHA, A, + $ NMAX, B, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL CMMCH( 'N', 'N', M, N, N, ALPHA, B, + $ NMAX, A, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 120 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL CPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, ALPHA, LDA, + $ LDB, BETA, LDC) +* + 120 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, + $ ',', F4.1, '), C,', I3, ') .' ) + 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK2. +* + END +* + SUBROUTINE CPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, + $ ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, M, N, LDA, LDB, LDC + COMPLEX ALPHA, BETA + CHARACTER*1 SIDE, UPLO + CHARACTER*12 SNAME + CHARACTER*14 CRC, CS,CU + + IF (SIDE.EQ.'L')THEN + CS = ' CblasLeft' + ELSE + CS = ' CblasRight' + END IF + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU + WRITE(NOUT, FMT = 9994)M, N, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 10X, 2( I3, ',' ),' (',F4.1,',',F4.1, '), A,', I3, + $ ', B,', I3, ', (',F4.1,',',F4.1, '), ', 'C,', I3, ').' ) + END +* + SUBROUTINE CCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, + $ B, BB, BS, CT, G, C, IORDER ) +* +* Tests CTRMM and CTRSM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CT( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS + REAL ERR, ERRMAX + INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, + $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, + $ NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, + $ UPLOS + CHARACTER*2 ICHD, ICHS, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CMAKE, CMMCH, CCTRMM, CCTRSM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ +* .. Executable Statements .. +* + NARGS = 11 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* Set up zero matrix for CMMCH. + DO 20 J = 1, NMAX + DO 10 I = 1, NMAX + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE +* + DO 140 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 130 + LBB = LDB*N + NULL = M.LE.0.OR.N.LE.0 +* + DO 120 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 130 + LAA = LDA*NA +* + DO 110 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 100 ICT = 1, 3 + TRANSA = ICHT( ICT: ICT ) +* + DO 90 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + CALL CMAKE( 'tr', UPLO, DIAG, NA, NA, A, + $ NMAX, AA, LDA, RESET, ZERO ) +* +* Generate the matrix B. +* + CALL CMAKE( 'ge', ' ', ' ', M, N, B, NMAX, + $ BB, LDB, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + TRANAS = TRANSA + DIAGS = DIAG + MS = M + NS = N + ALS = ALPHA + DO 30 I = 1, LAA + AS( I ) = AA( I ) + 30 CONTINUE + LDAS = LDA + DO 40 I = 1, LBB + BS( I ) = BB( I ) + 40 CONTINUE + LDBS = LDB +* +* Call the subroutine. +* + IF( SNAME( 10: 11 ).EQ.'mm' )THEN + IF( TRACE ) + $ CALL CPRCN3( NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB) + IF( REWI ) + $ REWIND NTRA + CALL CCTRMM(IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, AA, LDA, + $ BB, LDB ) + ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN + IF( TRACE ) + $ CALL CPRCN3( NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB) + IF( REWI ) + $ REWIND NTRA + CALL CCTRSM(IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, AA, LDA, + $ BB, LDB ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = TRANAS.EQ.TRANSA + ISAME( 4 ) = DIAGS.EQ.DIAG + ISAME( 5 ) = MS.EQ.M + ISAME( 6 ) = NS.EQ.N + ISAME( 7 ) = ALS.EQ.ALPHA + ISAME( 8 ) = LCE( AS, AA, LAA ) + ISAME( 9 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 10 ) = LCE( BS, BB, LBB ) + ELSE + ISAME( 10 ) = LCERES( 'ge', ' ', M, N, BS, + $ BB, LDB ) + END IF + ISAME( 11 ) = LDBS.EQ.LDB +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 50 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 50 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 10: 11 ).EQ.'mm' )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL CMMCH( TRANSA, 'N', M, N, M, + $ ALPHA, A, NMAX, B, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL CMMCH( 'N', TRANSA, M, N, N, + $ ALPHA, B, NMAX, A, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN +* +* Compute approximation to original +* matrix. +* + DO 70 J = 1, N + DO 60 I = 1, M + C( I, J ) = BB( I + ( J - 1 )* + $ LDB ) + BB( I + ( J - 1 )*LDB ) = ALPHA* + $ B( I, J ) + 60 CONTINUE + 70 CONTINUE +* + IF( LEFT )THEN + CALL CMMCH( TRANSA, 'N', M, N, M, + $ ONE, A, NMAX, C, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + ELSE + CALL CMMCH( 'N', TRANSA, M, N, N, + $ ONE, C, NMAX, A, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + END IF + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 150 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, + $ M, N, ALPHA, LDA, LDB) +* + 160 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT(' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT(1X, I6, ': ', A12,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', + $ ' .' ) + 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK3. +* + END +* + SUBROUTINE CPRCN3(NOUT, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, LDA, LDB) + INTEGER NOUT, NC, IORDER, M, N, LDA, LDB + COMPLEX ALPHA + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + CHARACTER*12 SNAME + CHARACTER*14 CRC, CS, CU, CA, CD + + IF (SIDE.EQ.'L')THEN + CS = ' CblasLeft' + ELSE + CS = ' CblasRight' + END IF + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (DIAG.EQ.'N')THEN + CD = ' CblasNonUnit' + ELSE + CD = ' CblasUnit' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU + WRITE(NOUT, FMT = 9994)CA, CD, M, N, ALPHA, LDA, LDB + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 10X, 2( A14, ',') , 2( I3, ',' ), ' (', F4.1, ',', + $ F4.1, '), A,', I3, ', B,', I3, ').' ) + END +* + SUBROUTINE CCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests CHERK and CSYRK. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RONE, RZERO + PARAMETER ( RONE = 1.0, RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BETS + REAL ERR, ERRMAX, RALPHA, RALS, RBETA, RBETS + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, + $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, + $ NARGS, NC, NS + LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS + CHARACTER*2 ICHT, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCHERK, CMAKE, CMMCH, CCSYRK +* .. Intrinsic Functions .. + INTRINSIC CMPLX, MAX, REAL +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NC'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 8: 9 ).EQ.'he' +* + NARGS = 10 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICT = 1, 2 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'C' + IF( TRAN.AND..NOT.CONJ ) + $ TRANS = 'T' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL CMAKE( 'ge', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) + IF( CONJ )THEN + RALPHA = REAL( ALPHA ) + ALPHA = CMPLX( RALPHA, RZERO ) + END IF +* + DO 50 IB = 1, NBET + BETA = BET( IB ) + IF( CONJ )THEN + RBETA = REAL( BETA ) + BETA = CMPLX( RBETA, RZERO ) + END IF + NULL = N.LE.0 + IF( CONJ ) + $ NULL = NULL.OR.( ( K.LE.0.OR.RALPHA.EQ. + $ RZERO ).AND.RBETA.EQ.RONE ) +* +* Generate the matrix C. +* + CALL CMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, C, + $ NMAX, CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + IF( CONJ )THEN + RALS = RALPHA + ELSE + ALS = ALPHA + END IF + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + IF( CONJ )THEN + RBETS = RBETA + ELSE + BETS = BETA + END IF + DO 20 I = 1, LCC + CS( I ) = CC( I ) + 20 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( CONJ )THEN + IF( TRACE ) + $ CALL CPRCN6( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, RALPHA, LDA, RBETA, + $ LDC) + IF( REWI ) + $ REWIND NTRA + CALL CCHERK( IORDER, UPLO, TRANS, N, K, + $ RALPHA, AA, LDA, RBETA, CC, + $ LDC ) + ELSE + IF( TRACE ) + $ CALL CPRCN4( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, ALPHA, LDA, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CCSYRK( IORDER, UPLO, TRANS, N, K, + $ ALPHA, AA, LDA, BETA, CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + IF( CONJ )THEN + ISAME( 5 ) = RALS.EQ.RALPHA + ELSE + ISAME( 5 ) = ALS.EQ.ALPHA + END IF + ISAME( 6 ) = LCE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( CONJ )THEN + ISAME( 8 ) = RBETS.EQ.RBETA + ELSE + ISAME( 8 ) = BETS.EQ.BETA + END IF + IF( NULL )THEN + ISAME( 9 ) = LCE( CS, CC, LCC ) + ELSE + ISAME( 9 ) = LCERES( SNAME( 8: 9 ), UPLO, N, + $ N, CS, CC, LDC ) + END IF + ISAME( 10 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( CONJ )THEN + TRANST = 'C' + ELSE + TRANST = 'T' + END IF + JC = 1 + DO 40 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + CALL CMMCH( TRANST, 'N', LJ, 1, K, + $ ALPHA, A( 1, JJ ), NMAX, + $ A( 1, J ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL CMMCH( 'N', TRANST, LJ, 1, K, + $ ALPHA, A( JJ, 1 ), NMAX, + $ A( J, 1 ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + 40 CONTINUE + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( CONJ )THEN + CALL CPRCN6( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, RALPHA, + $ LDA, rBETA, LDC) + ELSE + CALL CPRCN4( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA, + $ LDA, BETA, LDC) + END IF +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, + $ '), C,', I3, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK4. +* + END +* + SUBROUTINE CPRCN4(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDC + COMPLEX ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1 ,'), A,', + $ I3, ', (', F4.1,',', F4.1, '), C,', I3, ').' ) + END +* +* + SUBROUTINE CPRCN6(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDC + REAL ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ').' ) + END +* + SUBROUTINE CCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ IORDER ) +* +* Tests CHER2K and CSYR2K. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + REAL RONE, RZERO + PARAMETER ( RONE = 1.0, RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), + $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), + $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ W( 2*NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BETS + REAL ERR, ERRMAX, RBETA, RBETS + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, + $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, + $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS + LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS + CHARACTER*2 ICHT, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCHER2K, CMAKE, CMMCH, CCSYR2K +* .. Intrinsic Functions .. + INTRINSIC CMPLX, CONJG, MAX, REAL +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NC'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 8: 9 ).EQ.'he' +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 130 + LCC = LDC*N +* + DO 120 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 110 ICT = 1, 2 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'C' + IF( TRAN.AND..NOT.CONJ ) + $ TRANS = 'T' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*NA +* +* Generate the matrix A. +* + IF( TRAN )THEN + CALL CMAKE( 'ge', ' ', ' ', MA, NA, AB, 2*NMAX, AA, + $ LDA, RESET, ZERO ) + ELSE + CALL CMAKE( 'ge', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, + $ RESET, ZERO ) + END IF +* +* Generate the matrix B. +* + LDB = LDA + LBB = LAA + IF( TRAN )THEN + CALL CMAKE( 'ge', ' ', ' ', MA, NA, AB( K + 1 ), + $ 2*NMAX, BB, LDB, RESET, ZERO ) + ELSE + CALL CMAKE( 'ge', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), + $ NMAX, BB, LDB, RESET, ZERO ) + END IF +* + DO 100 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 90 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 80 IB = 1, NBET + BETA = BET( IB ) + IF( CONJ )THEN + RBETA = REAL( BETA ) + BETA = CMPLX( RBETA, RZERO ) + END IF + NULL = N.LE.0 + IF( CONJ ) + $ NULL = NULL.OR.( ( K.LE.0.OR.ALPHA.EQ. + $ ZERO ).AND.RBETA.EQ.RONE ) +* +* Generate the matrix C. +* + CALL CMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, C, + $ NMAX, CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + IF( CONJ )THEN + RBETS = RBETA + ELSE + BETS = BETA + END IF + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( CONJ )THEN + IF( TRACE ) + $ CALL CPRCN7( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, ALPHA, LDA, LDB, + $ RBETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CCHER2K( IORDER, UPLO, TRANS, N, K, + $ ALPHA, AA, LDA, BB, LDB, RBETA, + $ CC, LDC ) + ELSE + IF( TRACE ) + $ CALL CPRCN5( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, ALPHA, LDA, LDB, + $ BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CCSYR2K( IORDER, UPLO, TRANS, N, K, + $ ALPHA, AA, LDA, BB, LDB, BETA, + $ CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LCE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LCE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + IF( CONJ )THEN + ISAME( 10 ) = RBETS.EQ.RBETA + ELSE + ISAME( 10 ) = BETS.EQ.BETA + END IF + IF( NULL )THEN + ISAME( 11 ) = LCE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LCERES( 'he', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( CONJ )THEN + TRANST = 'C' + ELSE + TRANST = 'T' + END IF + JJAB = 1 + JC = 1 + DO 70 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + DO 50 I = 1, K + W( I ) = ALPHA*AB( ( J - 1 )*2* + $ NMAX + K + I ) + IF( CONJ )THEN + W( K + I ) = CONJG( ALPHA )* + $ AB( ( J - 1 )*2* + $ NMAX + I ) + ELSE + W( K + I ) = ALPHA* + $ AB( ( J - 1 )*2* + $ NMAX + I ) + END IF + 50 CONTINUE + CALL CMMCH( TRANST, 'N', LJ, 1, 2*K, + $ ONE, AB( JJAB ), 2*NMAX, W, + $ 2*NMAX, BETA, C( JJ, J ), + $ NMAX, CT, G, CC( JC ), LDC, + $ EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE + DO 60 I = 1, K + IF( CONJ )THEN + W( I ) = ALPHA*CONJG( AB( ( K + + $ I - 1 )*NMAX + J ) ) + W( K + I ) = CONJG( ALPHA* + $ AB( ( I - 1 )*NMAX + + $ J ) ) + ELSE + W( I ) = ALPHA*AB( ( K + I - 1 )* + $ NMAX + J ) + W( K + I ) = ALPHA* + $ AB( ( I - 1 )*NMAX + + $ J ) + END IF + 60 CONTINUE + CALL CMMCH( 'N', 'N', LJ, 1, 2*K, ONE, + $ AB( JJ ), NMAX, W, 2*NMAX, + $ BETA, C( JJ, J ), NMAX, CT, + $ G, CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + IF( TRAN ) + $ JJAB = JJAB + 2*NMAX + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 140 + 70 CONTINUE + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( CONJ )THEN + CALL CPRCN7( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, + $ ALPHA, LDA, LDB, RBETA, LDC) + ELSE + CALL CPRCN5( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, + $ ALPHA, LDA, LDB, BETA, LDC) + END IF +* + 160 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, + $ ', C,', I3, ') .' ) + 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, + $ ',', F4.1, '), C,', I3, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK5. +* + END +* + SUBROUTINE CPRCN5(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC + COMPLEX ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1, '), A,', + $ I3, ', B', I3, ', (', F4.1, ',', F4.1, '), C,', I3, ').' ) + END +* +* + SUBROUTINE CPRCN7(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC + COMPLEX ALPHA + REAL BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1, '), A,', + $ I3, ', B', I3, ',', F4.1, ', C,', I3, ').' ) + END +* + SUBROUTINE CMAKE(TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, + $ TRANSL ) +* +* Generates values for an M by N matrix A. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'ge', 'he', 'sy' or 'tr'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + COMPLEX ROGUE + PARAMETER ( ROGUE = ( -1.0E10, 1.0E10 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) + REAL RROGUE + PARAMETER ( RROGUE = -1.0E10 ) +* .. Scalar Arguments .. + COMPLEX TRANSL + INTEGER LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J, JJ + LOGICAL GEN, HER, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + COMPLEX CBEG + EXTERNAL CBEG +* .. Intrinsic Functions .. + INTRINSIC CMPLX, CONJG, REAL +* .. Executable Statements .. + GEN = TYPE.EQ.'ge' + HER = TYPE.EQ.'he' + SYM = TYPE.EQ.'sy' + TRI = TYPE.EQ.'tr' + UPPER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + A( I, J ) = CBEG( RESET ) + TRANSL + IF( I.NE.J )THEN +* Set some elements to zero + IF( N.GT.3.AND.J.EQ.N/2 ) + $ A( I, J ) = ZERO + IF( HER )THEN + A( J, I ) = CONJG( A( I, J ) ) + ELSE IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( HER ) + $ A( J, J ) = CMPLX( REAL( A( J, J ) ), RZERO ) + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'ge' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'sy'.OR.TYPE.EQ.'tr' )THEN + DO 90 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 60 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 70 CONTINUE + DO 80 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + IF( HER )THEN + JJ = J + ( J - 1 )*LDA + AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE ) + END IF + 90 CONTINUE + END IF + RETURN +* +* End of CMAKE. +* + END + SUBROUTINE CMMCH(TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, + $ NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RZERO, RONE + PARAMETER ( RZERO = 0.0, RONE = 1.0 ) +* .. Scalar Arguments .. + COMPLEX ALPHA, BETA + REAL EPS, ERR + INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANSA, TRANSB +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ CC( LDCC, * ), CT( * ) + REAL G( * ) +* .. Local Scalars .. + COMPLEX CL + REAL ERRI + INTEGER I, J, K + LOGICAL CTRANA, CTRANB, TRANA, TRANB +* .. Intrinsic Functions .. + INTRINSIC ABS, AIMAG, CONJG, MAX, REAL, SQRT +* .. Statement Functions .. + REAL ABS1 +* .. Statement Function definitions .. + ABS1( CL ) = ABS( REAL( CL ) ) + ABS( AIMAG( CL ) ) +* .. Executable Statements .. + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' + CTRANA = TRANSA.EQ.'C' + CTRANB = TRANSB.EQ.'C' +* +* Compute expected result, one column at a time, in CT using data +* in A, B and C. +* Compute gauges in G. +* + DO 220 J = 1, N +* + DO 10 I = 1, M + CT( I ) = ZERO + G( I ) = RZERO + 10 CONTINUE + IF( .NOT.TRANA.AND..NOT.TRANB )THEN + DO 30 K = 1, KK + DO 20 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( K, J ) + G( I ) = G( I ) + ABS1( A( I, K ) )*ABS1( B( K, J ) ) + 20 CONTINUE + 30 CONTINUE + ELSE IF( TRANA.AND..NOT.TRANB )THEN + IF( CTRANA )THEN + DO 50 K = 1, KK + DO 40 I = 1, M + CT( I ) = CT( I ) + CONJG( A( K, I ) )*B( K, J ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( K, J ) ) + 40 CONTINUE + 50 CONTINUE + ELSE + DO 70 K = 1, KK + DO 60 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( K, J ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( K, J ) ) + 60 CONTINUE + 70 CONTINUE + END IF + ELSE IF( .NOT.TRANA.AND.TRANB )THEN + IF( CTRANB )THEN + DO 90 K = 1, KK + DO 80 I = 1, M + CT( I ) = CT( I ) + A( I, K )*CONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( I, K ) )* + $ ABS1( B( J, K ) ) + 80 CONTINUE + 90 CONTINUE + ELSE + DO 110 K = 1, KK + DO 100 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( J, K ) + G( I ) = G( I ) + ABS1( A( I, K ) )* + $ ABS1( B( J, K ) ) + 100 CONTINUE + 110 CONTINUE + END IF + ELSE IF( TRANA.AND.TRANB )THEN + IF( CTRANA )THEN + IF( CTRANB )THEN + DO 130 K = 1, KK + DO 120 I = 1, M + CT( I ) = CT( I ) + CONJG( A( K, I ) )* + $ CONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 120 CONTINUE + 130 CONTINUE + ELSE + DO 150 K = 1, KK + DO 140 I = 1, M + CT( I ) = CT( I ) + CONJG( A( K, I ) )*B( J, K ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 140 CONTINUE + 150 CONTINUE + END IF + ELSE + IF( CTRANB )THEN + DO 170 K = 1, KK + DO 160 I = 1, M + CT( I ) = CT( I ) + A( K, I )*CONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 160 CONTINUE + 170 CONTINUE + ELSE + DO 190 K = 1, KK + DO 180 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( J, K ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 180 CONTINUE + 190 CONTINUE + END IF + END IF + END IF + DO 200 I = 1, M + CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) + G( I ) = ABS1( ALPHA )*G( I ) + + $ ABS1( BETA )*ABS1( C( I, J ) ) + 200 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 210 I = 1, M + ERRI = ABS1( CT( I ) - CC( I, J ) )/EPS + IF( G( I ).NE.RZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.RONE ) + $ GO TO 230 + 210 CONTINUE +* + 220 CONTINUE +* +* If the loop completes, all results are at least half accurate. + GO TO 250 +* +* Report fatal error. +* + 230 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 240 I = 1, M + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) + ELSE + WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) + END IF + 240 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9997 )J +* + 250 CONTINUE + RETURN +* + 9999 FORMAT(' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RE', + $ 'SULT COMPUTED RESULT' ) + 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) + 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) +* +* End of CMMCH. +* + END + LOGICAL FUNCTION LCE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + COMPLEX RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LCE = .TRUE. + GO TO 30 + 20 CONTINUE + LCE = .FALSE. + 30 RETURN +* +* End of LCE. +* + END + LOGICAL FUNCTION LCERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'ge' or 'he' or 'sy'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'ge' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'sy' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LCERES = .TRUE. + GO TO 80 + 70 CONTINUE + LCERES = .FALSE. + 80 RETURN +* +* End of LCERES. +* + END + COMPLEX FUNCTION CBEG( RESET ) +* +* Generates complex numbers as pairs of random numbers uniformly +* distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, J, MI, MJ +* .. Save statement .. + SAVE I, IC, J, MI, MJ +* .. Intrinsic Functions .. + INTRINSIC CMPLX +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + MJ = 457 + I = 7 + J = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I or J is bounded between 1 and 999. +* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. +* If initial I or J = 4 or 8, the period will be 25. +* If initial I or J = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I or J +* in 6. +* + IC = IC + 1 + 10 I = I*MI + J = J*MJ + I = I - 1000*( I/1000 ) + J = J - 1000*( J/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + CBEG = CMPLX( ( I - 500 )/1001.0, ( J - 500 )/1001.0 ) + RETURN +* +* End of CBEG. +* + END + REAL FUNCTION SDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + REAL X, Y +* .. Executable Statements .. + SDIFF = X - Y + RETURN +* +* End of SDIFF. +* + END diff --git a/ctest/c_d2chke.c b/ctest/c_d2chke.c new file mode 100644 index 0000000..23de9a4 --- /dev/null +++ b/ctest/c_d2chke.c @@ -0,0 +1,789 @@ +#include +#include +#include "common.h" +#include "cblas_test.h" + +int cblas_ok, cblas_lerr, cblas_info; +int link_xerbla=TRUE; +char *cblas_rout; + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo); +#else +void F77_xerbla(char *srname, void *vinfo); +#endif + +void chkxer(void) { + extern int cblas_ok, cblas_lerr, cblas_info; + extern int link_xerbla; + extern char *cblas_rout; + if (cblas_lerr == 1 ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); + cblas_ok = 0 ; + } + cblas_lerr = 1 ; +} + +void F77_d2chke(char *rout) { + char *sf = ( rout ) ; + double A[2] = {0.0,0.0}, + X[2] = {0.0,0.0}, + Y[2] = {0.0,0.0}, + ALPHA=0.0, BETA=0.0; + extern int cblas_info, cblas_lerr, cblas_ok; + extern int RowMajorStrg; + extern char *cblas_rout; + + if (link_xerbla) /* call these first to link */ + { + cblas_xerbla(cblas_info,cblas_rout,""); + F77_xerbla(cblas_rout,&cblas_info); + } + + cblas_ok = TRUE ; + cblas_lerr = PASSED ; + + if (strncmp( sf,"cblas_dgemv",11)==0) { + cblas_rout = "cblas_dgemv"; + cblas_info = 1; + cblas_dgemv(INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dgemv(CblasColMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dgemv(CblasColMajor, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dgemv(CblasColMajor, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dgemv(CblasColMajor, CblasNoTrans, 2, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_dgemv(CblasColMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dgemv(CblasColMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + + cblas_info = 2; RowMajorStrg = TRUE; RowMajorStrg = TRUE; + cblas_dgemv(CblasRowMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dgemv(CblasRowMajor, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dgbmv",11)==0) { + cblas_rout = "cblas_dgbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dgbmv(CblasColMajor, INVALID, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dsymv",11)==0) { + cblas_rout = "cblas_dsymv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dsymv(INVALID, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dsymv(CblasColMajor, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dsymv(CblasColMajor, CblasUpper, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dsymv(CblasColMajor, CblasUpper, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsymv(CblasColMajor, CblasUpper, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dsymv(CblasColMajor, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dsymv(CblasRowMajor, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dsymv(CblasRowMajor, CblasUpper, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dsymv(CblasRowMajor, CblasUpper, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsymv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dsymv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dsbmv",11)==0) { + cblas_rout = "cblas_dsbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dsbmv(INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dsbmv(CblasColMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dsbmv(CblasColMajor, CblasUpper, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsbmv(CblasColMajor, CblasUpper, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dsbmv(CblasColMajor, CblasUpper, 0, 1, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_dsbmv(CblasColMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dsbmv(CblasColMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dsbmv(CblasRowMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dsbmv(CblasRowMajor, CblasUpper, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dsbmv(CblasRowMajor, CblasUpper, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dsbmv(CblasRowMajor, CblasUpper, 0, 1, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_dsbmv(CblasRowMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dsbmv(CblasRowMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dspmv",11)==0) { + cblas_rout = "cblas_dspmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dspmv(INVALID, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dspmv(CblasColMajor, INVALID, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dspmv(CblasColMajor, CblasUpper, INVALID, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dspmv(CblasColMajor, CblasUpper, 0, + ALPHA, A, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dspmv(CblasColMajor, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dspmv(CblasRowMajor, INVALID, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dspmv(CblasRowMajor, CblasUpper, INVALID, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dspmv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dspmv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dtrmv",11)==0) { + cblas_rout = "cblas_dtrmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dtrmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dtrmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dtrmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dtrmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dtrmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dtbmv",11)==0) { + cblas_rout = "cblas_dtbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dtbmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dtbmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dtbmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dtbmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dtbmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dtpmv",11)==0) { + cblas_rout = "cblas_dtpmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dtpmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dtpmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dtpmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dtpmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dtpmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dtpmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dtpmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dtpmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dtpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dtpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dtpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dtrsv",11)==0) { + cblas_rout = "cblas_dtrsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dtrsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dtrsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dtrsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dtrsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dtrsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dtbsv",11)==0) { + cblas_rout = "cblas_dtbsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dtbsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dtbsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dtbsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dtbsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dtbsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dtpsv",11)==0) { + cblas_rout = "cblas_dtpsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dtpsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dtpsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dtpsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dtpsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dtpsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dtpsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dtpsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dtpsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dtpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dtpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dtpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dger",10)==0) { + cblas_rout = "cblas_dger"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dger(INVALID, 0, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dger(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dger(CblasColMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dger(CblasColMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dger(CblasColMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dger(CblasColMajor, 2, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dger(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dger(CblasRowMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dger(CblasRowMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dger(CblasRowMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dger(CblasRowMajor, 0, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_dsyr2",11)==0) { + cblas_rout = "cblas_dsyr2"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dsyr2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dsyr2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dsyr2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dsyr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dsyr2(CblasColMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dsyr2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dsyr2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dsyr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dsyr2(CblasRowMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_dspr2",11)==0) { + cblas_rout = "cblas_dspr2"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dspr2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dspr2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dspr2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dspr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dspr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dspr2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dspr2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dspr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dspr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); + chkxer(); + } else if (strncmp( sf,"cblas_dsyr",10)==0) { + cblas_rout = "cblas_dsyr"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dsyr(INVALID, CblasUpper, 0, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dsyr(CblasColMajor, INVALID, 0, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dsyr(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dsyr(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyr(CblasColMajor, CblasUpper, 2, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dsyr(CblasRowMajor, INVALID, 0, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dsyr(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dsyr(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyr(CblasRowMajor, CblasUpper, 2, ALPHA, X, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_dspr",10)==0) { + cblas_rout = "cblas_dspr"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dspr(INVALID, CblasUpper, 0, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dspr(CblasColMajor, INVALID, 0, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dspr(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dspr(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dspr(CblasColMajor, INVALID, 0, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dspr(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dspr(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, A ); + chkxer(); + } + if (cblas_ok == TRUE) + printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); + else + printf("******* %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); +} diff --git a/ctest/c_d3chke.c b/ctest/c_d3chke.c new file mode 100644 index 0000000..1149475 --- /dev/null +++ b/ctest/c_d3chke.c @@ -0,0 +1,1271 @@ +#include +#include +#include "common.h" +#include "cblas_test.h" + +int cblas_ok, cblas_lerr, cblas_info; +int link_xerbla=TRUE; +char *cblas_rout; + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo); +#else +void F77_xerbla(char *srname, void *vinfo); +#endif + +void chkxer(void) { + extern int cblas_ok, cblas_lerr, cblas_info; + extern int link_xerbla; + extern char *cblas_rout; + if (cblas_lerr == 1 ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); + cblas_ok = 0 ; + } + cblas_lerr = 1 ; +} + +void F77_d3chke(char *rout) { + char *sf = ( rout ) ; + double A[2] = {0.0,0.0}, + B[2] = {0.0,0.0}, + C[2] = {0.0,0.0}, + ALPHA=0.0, BETA=0.0; + extern int cblas_info, cblas_lerr, cblas_ok; + extern int RowMajorStrg; + extern char *cblas_rout; + + if (link_xerbla) /* call these first to link */ + { + cblas_xerbla(cblas_info,cblas_rout,""); + F77_xerbla(cblas_rout,&cblas_info); + } + + cblas_ok = TRUE ; + cblas_lerr = PASSED ; + + if (strncmp( sf,"cblas_dgemm" ,11)==0) { + cblas_rout = "cblas_dgemm" ; + + cblas_info = 1; + cblas_dgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_dgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_dgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_dgemm( INVALID, CblasTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_dsymm" ,11)==0) { + cblas_rout = "cblas_dsymm" ; + + cblas_info = 1; + cblas_dsymm( INVALID, CblasRight, CblasLower, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_dtrmm" ,11)==0) { + cblas_rout = "cblas_dtrmm" ; + + cblas_info = 1; + cblas_dtrmm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_dtrsm" ,11)==0) { + cblas_rout = "cblas_dtrsm" ; + + cblas_info = 1; + cblas_dtrsm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_dsyrk" ,11)==0) { + cblas_rout = "cblas_dsyrk" ; + + cblas_info = 1; + cblas_dsyrk( INVALID, CblasUpper, CblasNoTrans, + 0, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, INVALID, CblasNoTrans, + 0, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasUpper, INVALID, + 0, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasUpper, CblasTrans, + INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasLower, CblasNoTrans, + INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasLower, CblasTrans, + INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasUpper, CblasNoTrans, + 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasUpper, CblasTrans, + 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasLower, CblasNoTrans, + 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasLower, CblasTrans, + 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyrk( CblasRowMajor, CblasUpper, CblasNoTrans, + 0, 2, ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyrk( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyrk( CblasRowMajor, CblasLower, CblasNoTrans, + 0, 2, ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyrk( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasUpper, CblasTrans, + 0, 2, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasLower, CblasTrans, + 0, 2, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dsyrk( CblasRowMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dsyrk( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dsyrk( CblasRowMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dsyrk( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_dsyr2k" ,12)==0) { + cblas_rout = "cblas_dsyr2k" ; + + cblas_info = 1; + cblas_dsyr2k( INVALID, CblasUpper, CblasNoTrans, + 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, INVALID, CblasNoTrans, + 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, INVALID, + 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasTrans, + INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasNoTrans, + INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasTrans, + INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasNoTrans, + 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasTrans, + 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasNoTrans, + 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasTrans, + 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasUpper, CblasNoTrans, + 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasLower, CblasNoTrans, + 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasTrans, + 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasTrans, + 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasUpper, CblasNoTrans, + 0, 2, ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasLower, CblasNoTrans, + 0, 2, ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasTrans, + 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasTrans, + 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + } + if (cblas_ok == TRUE ) + printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); + else + printf("***** %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); +} diff --git a/ctest/c_dblas1.c b/ctest/c_dblas1.c new file mode 100644 index 0000000..2371d33 --- /dev/null +++ b/ctest/c_dblas1.c @@ -0,0 +1,84 @@ +/* + * c_dblas1.c + * + * The program is a C wrapper for dcblat1. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "common.h" +#include "cblas_test.h" + +double F77_dasum(const int *N, double *X, const int *incX) +{ + return cblas_dasum(*N, X, *incX); +} + +void F77_daxpy(const int *N, const double *alpha, const double *X, + const int *incX, double *Y, const int *incY) +{ + cblas_daxpy(*N, *alpha, X, *incX, Y, *incY); + return; +} + +void F77_dcopy(const int *N, double *X, const int *incX, + double *Y, const int *incY) +{ + cblas_dcopy(*N, X, *incX, Y, *incY); + return; +} + +double F77_ddot(const int *N, const double *X, const int *incX, + const double *Y, const int *incY) +{ + return cblas_ddot(*N, X, *incX, Y, *incY); +} + +double F77_dnrm2(const int *N, const double *X, const int *incX) +{ + return cblas_dnrm2(*N, X, *incX); +} + +void F77_drotg( double *a, double *b, double *c, double *s) +{ + cblas_drotg(a,b,c,s); + return; +} + +void F77_drot( const int *N, double *X, const int *incX, double *Y, + const int *incY, const double *c, const double *s) +{ + + cblas_drot(*N,X,*incX,Y,*incY,*c,*s); + return; +} + +void F77_dscal(const int *N, const double *alpha, double *X, + const int *incX) +{ + cblas_dscal(*N, *alpha, X, *incX); + return; +} + +void F77_dswap( const int *N, double *X, const int *incX, + double *Y, const int *incY) +{ + cblas_dswap(*N,X,*incX,Y,*incY); + return; +} + +double F77_dzasum(const int *N, void *X, const int *incX) +{ + return cblas_dzasum(*N, X, *incX); +} + +double F77_dznrm2(const int *N, const void *X, const int *incX) +{ + return cblas_dznrm2(*N, X, *incX); +} + +int F77_idamax(const int *N, const double *X, const int *incX) +{ + if (*N < 1 || *incX < 1) return(0); + return (cblas_idamax(*N, X, *incX)+1); +} diff --git a/ctest/c_dblas2.c b/ctest/c_dblas2.c new file mode 100644 index 0000000..ed68402 --- /dev/null +++ b/ctest/c_dblas2.c @@ -0,0 +1,583 @@ +/* + * Written by D.P. Manley, Digital Equipment Corporation. + * Prefixed "C_" to BLAS routines and their declarations. + * + * Modified by T. H. Do, 1/23/98, SGI/CRAY Research. + */ +#include +#include "common.h" +#include "cblas_test.h" + +void F77_dgemv(int *order, char *transp, int *m, int *n, double *alpha, + double *a, int *lda, double *x, int *incx, double *beta, + double *y, int *incy ) { + + double *A; + int i,j,LDA; + enum CBLAS_TRANSPOSE trans; + + get_transpose_type(transp, &trans); + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_dgemv( CblasRowMajor, trans, + *m, *n, *alpha, A, LDA, x, *incx, *beta, y, *incy ); + free(A); + } + else if (*order == TEST_COL_MJR) + cblas_dgemv( CblasColMajor, trans, + *m, *n, *alpha, a, *lda, x, *incx, *beta, y, *incy ); + else + cblas_dgemv( UNDEFINED, trans, + *m, *n, *alpha, a, *lda, x, *incx, *beta, y, *incy ); +} + +void F77_dger(int *order, int *m, int *n, double *alpha, double *x, int *incx, + double *y, int *incy, double *a, int *lda ) { + + double *A; + int i,j,LDA; + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); + + for( i=0; i<*m; i++ ) { + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + } + + cblas_dger(CblasRowMajor, *m, *n, *alpha, x, *incx, y, *incy, A, LDA ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + a[ (*lda)*j+i ]=A[ LDA*i+j ]; + free(A); + } + else + cblas_dger( CblasColMajor, *m, *n, *alpha, x, *incx, y, *incy, a, *lda ); +} + +void F77_dtrmv(int *order, char *uplow, char *transp, char *diagn, + int *n, double *a, int *lda, double *x, int *incx) { + double *A; + int i,j,LDA; + enum CBLAS_TRANSPOSE trans; + enum CBLAS_UPLO uplo; + enum CBLAS_DIAG diag; + + get_transpose_type(transp,&trans); + get_uplo_type(uplow,&uplo); + get_diag_type(diagn,&diag); + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_dtrmv(CblasRowMajor, uplo, trans, diag, *n, A, LDA, x, *incx); + free(A); + } + else if (*order == TEST_COL_MJR) + cblas_dtrmv(CblasColMajor, uplo, trans, diag, *n, a, *lda, x, *incx); + else { + cblas_dtrmv(UNDEFINED, uplo, trans, diag, *n, a, *lda, x, *incx); + } +} + +void F77_dtrsv(int *order, char *uplow, char *transp, char *diagn, + int *n, double *a, int *lda, double *x, int *incx ) { + double *A; + int i,j,LDA; + enum CBLAS_TRANSPOSE trans; + enum CBLAS_UPLO uplo; + enum CBLAS_DIAG diag; + + get_transpose_type(transp,&trans); + get_uplo_type(uplow,&uplo); + get_diag_type(diagn,&diag); + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_dtrsv(CblasRowMajor, uplo, trans, diag, *n, A, LDA, x, *incx ); + free(A); + } + else + cblas_dtrsv(CblasColMajor, uplo, trans, diag, *n, a, *lda, x, *incx ); +} +void F77_dsymv(int *order, char *uplow, int *n, double *alpha, double *a, + int *lda, double *x, int *incx, double *beta, double *y, + int *incy) { + double *A; + int i,j,LDA; + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_dsymv(CblasRowMajor, uplo, *n, *alpha, A, LDA, x, *incx, + *beta, y, *incy ); + free(A); + } + else + cblas_dsymv(CblasColMajor, uplo, *n, *alpha, a, *lda, x, *incx, + *beta, y, *incy ); +} + +void F77_dsyr(int *order, char *uplow, int *n, double *alpha, double *x, + int *incx, double *a, int *lda) { + double *A; + int i,j,LDA; + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_dsyr(CblasRowMajor, uplo, *n, *alpha, x, *incx, A, LDA); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + a[ (*lda)*j+i ]=A[ LDA*i+j ]; + free(A); + } + else + cblas_dsyr(CblasColMajor, uplo, *n, *alpha, x, *incx, a, *lda); +} + +void F77_dsyr2(int *order, char *uplow, int *n, double *alpha, double *x, + int *incx, double *y, int *incy, double *a, int *lda) { + double *A; + int i,j,LDA; + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_dsyr2(CblasRowMajor, uplo, *n, *alpha, x, *incx, y, *incy, A, LDA); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + a[ (*lda)*j+i ]=A[ LDA*i+j ]; + free(A); + } + else + cblas_dsyr2(CblasColMajor, uplo, *n, *alpha, x, *incx, y, *incy, a, *lda); +} + +void F77_dgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, + double *alpha, double *a, int *lda, double *x, int *incx, + double *beta, double *y, int *incy ) { + + double *A; + int i,irow,j,jcol,LDA; + enum CBLAS_TRANSPOSE trans; + + get_transpose_type(transp, &trans); + + if (*order == TEST_ROW_MJR) { + LDA = *ku+*kl+2; + A = ( double* )malloc( (*n+*kl)*LDA*sizeof( double ) ); + for( i=0; i<*ku; i++ ){ + irow=*ku+*kl-i; + jcol=(*ku)-i; + for( j=jcol; j<*n; j++ ) + A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; + } + i=*ku; + irow=*ku+*kl-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + for( i=*ku+1; i<*ku+*kl+1; i++ ){ + irow=*ku+*kl-i; + jcol=i-(*ku); + for( j=jcol; j<(*n+*kl); j++ ) + A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; + } + cblas_dgbmv( CblasRowMajor, trans, *m, *n, *kl, *ku, *alpha, + A, LDA, x, *incx, *beta, y, *incy ); + free(A); + } + else + cblas_dgbmv( CblasColMajor, trans, *m, *n, *kl, *ku, *alpha, + a, *lda, x, *incx, *beta, y, *incy ); +} + +void F77_dtbmv(int *order, char *uplow, char *transp, char *diagn, + int *n, int *k, double *a, int *lda, double *x, int *incx) { + double *A; + int irow, jcol, i, j, LDA; + enum CBLAS_TRANSPOSE trans; + enum CBLAS_UPLO uplo; + enum CBLAS_DIAG diag; + + get_transpose_type(transp,&trans); + get_uplo_type(uplow,&uplo); + get_diag_type(diagn,&diag); + + if (*order == TEST_ROW_MJR) { + LDA = *k+1; + A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); + if (uplo == CblasUpper) { + for( i=0; i<*k; i++ ){ + irow=*k-i; + jcol=(*k)-i; + for( j=jcol; j<*n; j++ ) + A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; + } + i=*k; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + } + else { + i=0; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + for( i=1; i<*k+1; i++ ){ + irow=*k-i; + jcol=i; + for( j=jcol; j<(*n+*k); j++ ) + A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; + } + } + cblas_dtbmv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA, x, *incx); + free(A); + } + else + cblas_dtbmv(CblasColMajor, uplo, trans, diag, *n, *k, a, *lda, x, *incx); +} + +void F77_dtbsv(int *order, char *uplow, char *transp, char *diagn, + int *n, int *k, double *a, int *lda, double *x, int *incx) { + double *A; + int irow, jcol, i, j, LDA; + enum CBLAS_TRANSPOSE trans; + enum CBLAS_UPLO uplo; + enum CBLAS_DIAG diag; + + get_transpose_type(transp,&trans); + get_uplo_type(uplow,&uplo); + get_diag_type(diagn,&diag); + + if (*order == TEST_ROW_MJR) { + LDA = *k+1; + A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); + if (uplo == CblasUpper) { + for( i=0; i<*k; i++ ){ + irow=*k-i; + jcol=(*k)-i; + for( j=jcol; j<*n; j++ ) + A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; + } + i=*k; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + } + else { + i=0; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + for( i=1; i<*k+1; i++ ){ + irow=*k-i; + jcol=i; + for( j=jcol; j<(*n+*k); j++ ) + A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; + } + } + cblas_dtbsv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA, x, *incx); + free(A); + } + else + cblas_dtbsv(CblasColMajor, uplo, trans, diag, *n, *k, a, *lda, x, *incx); +} + +void F77_dsbmv(int *order, char *uplow, int *n, int *k, double *alpha, + double *a, int *lda, double *x, int *incx, double *beta, + double *y, int *incy) { + double *A; + int i,j,irow,jcol,LDA; + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*order == TEST_ROW_MJR) { + LDA = *k+1; + A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); + if (uplo == CblasUpper) { + for( i=0; i<*k; i++ ){ + irow=*k-i; + jcol=(*k)-i; + for( j=jcol; j<*n; j++ ) + A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; + } + i=*k; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + } + else { + i=0; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + for( i=1; i<*k+1; i++ ){ + irow=*k-i; + jcol=i; + for( j=jcol; j<(*n+*k); j++ ) + A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; + } + } + cblas_dsbmv(CblasRowMajor, uplo, *n, *k, *alpha, A, LDA, x, *incx, + *beta, y, *incy ); + free(A); + } + else + cblas_dsbmv(CblasColMajor, uplo, *n, *k, *alpha, a, *lda, x, *incx, + *beta, y, *incy ); +} + +void F77_dspmv(int *order, char *uplow, int *n, double *alpha, double *ap, + double *x, int *incx, double *beta, double *y, int *incy) { + double *A,*AP; + int i,j,k,LDA; + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*order == TEST_ROW_MJR) { + LDA = *n; + A = ( double* )malloc( LDA*LDA*sizeof( double ) ); + AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) ); + if (uplo == CblasUpper) { + for( j=0, k=0; j<*n; j++ ) + for( i=0; i +#include "common.h" +#include "cblas_test.h" + +#define TEST_COL_MJR 0 +#define TEST_ROW_MJR 1 +#define UNDEFINED -1 + +void F77_dgemm(int *order, char *transpa, char *transpb, int *m, int *n, + int *k, double *alpha, double *a, int *lda, double *b, int *ldb, + double *beta, double *c, int *ldc ) { + + double *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_TRANSPOSE transa, transb; + + get_transpose_type(transpa, &transa); + get_transpose_type(transpb, &transb); + + if (*order == TEST_ROW_MJR) { + if (transa == CblasNoTrans) { + LDA = *k+1; + A = (double *)malloc( (*m)*LDA*sizeof( double ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*k; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else { + LDA = *m+1; + A = ( double* )malloc( LDA*(*k)*sizeof( double ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + if (transb == CblasNoTrans) { + LDB = *n+1; + B = ( double* )malloc( (*k)*LDB*sizeof( double ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + } + else { + LDB = *k+1; + B = ( double* )malloc( LDB*(*n)*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + } + LDC = *n+1; + C = ( double* )malloc( (*m)*LDC*sizeof( double ) ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + + cblas_dgemm( CblasRowMajor, transa, transb, *m, *n, *k, *alpha, A, LDA, + B, LDB, *beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_dgemm( CblasColMajor, transa, transb, *m, *n, *k, *alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); + else + cblas_dgemm( UNDEFINED, transa, transb, *m, *n, *k, *alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); +} +void F77_dsymm(int *order, char *rtlf, char *uplow, int *m, int *n, + double *alpha, double *a, int *lda, double *b, int *ldb, + double *beta, double *c, int *ldc ) { + + double *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_UPLO uplo; + enum CBLAS_SIDE side; + + get_uplo_type(uplow,&uplo); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDB = *n+1; + B = ( double* )malloc( (*m)*LDB*sizeof( double ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + LDC = *n+1; + C = ( double* )malloc( (*m)*LDC*sizeof( double ) ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_dsymm( CblasRowMajor, side, uplo, *m, *n, *alpha, A, LDA, B, LDB, + *beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_dsymm( CblasColMajor, side, uplo, *m, *n, *alpha, a, *lda, b, *ldb, + *beta, c, *ldc ); + else + cblas_dsymm( UNDEFINED, side, uplo, *m, *n, *alpha, a, *lda, b, *ldb, + *beta, c, *ldc ); +} + +void F77_dsyrk(int *order, char *uplow, char *transp, int *n, int *k, + double *alpha, double *a, int *lda, + double *beta, double *c, int *ldc ) { + + int i,j,LDA,LDC; + double *A, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A = ( double* )malloc( (*k)*LDA*sizeof( double ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDC = *n+1; + C = ( double* )malloc( (*n)*LDC*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_dsyrk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta, + C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_dsyrk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); + else + cblas_dsyrk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); +} + +void F77_dsyr2k(int *order, char *uplow, char *transp, int *n, int *k, + double *alpha, double *a, int *lda, double *b, int *ldb, + double *beta, double *c, int *ldc ) { + int i,j,LDA,LDB,LDC; + double *A, *B, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + LDB = *k+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + B = ( double* )malloc( (*n)*LDB*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j]=a[j*(*lda)+i]; + B[i*LDB+j]=b[j*(*ldb)+i]; + } + } + else { + LDA = *n+1; + LDB = *n+1; + A = ( double* )malloc( LDA*(*k)*sizeof( double ) ); + B = ( double* )malloc( LDB*(*k)*sizeof( double ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ){ + A[i*LDA+j]=a[j*(*lda)+i]; + B[i*LDB+j]=b[j*(*ldb)+i]; + } + } + LDC = *n+1; + C = ( double* )malloc( (*n)*LDC*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_dsyr2k(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, + B, LDB, *beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_dsyr2k(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); + else + cblas_dsyr2k(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); +} +void F77_dtrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, double *alpha, double *a, int *lda, double *b, + int *ldb) { + int i,j,LDA,LDB; + double *A, *B; + enum CBLAS_SIDE side; + enum CBLAS_DIAG diag; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDB = *n+1; + B = ( double* )malloc( (*m)*LDB*sizeof( double ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + cblas_dtrmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, *alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + b[j*(*ldb)+i]=B[i*LDB+j]; + free(A); + free(B); + } + else if (*order == TEST_COL_MJR) + cblas_dtrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, *alpha, + a, *lda, b, *ldb); + else + cblas_dtrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, *alpha, + a, *lda, b, *ldb); +} + +void F77_dtrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, double *alpha, double *a, int *lda, double *b, + int *ldb) { + int i,j,LDA,LDB; + double *A, *B; + enum CBLAS_SIDE side; + enum CBLAS_DIAG diag; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDB = *n+1; + B = ( double* )malloc( (*m)*LDB*sizeof( double ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + cblas_dtrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, *alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + b[j*(*ldb)+i]=B[i*LDB+j]; + free(A); + free(B); + } + else if (*order == TEST_COL_MJR) + cblas_dtrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, *alpha, + a, *lda, b, *ldb); + else + cblas_dtrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, *alpha, + a, *lda, b, *ldb); +} diff --git a/ctest/c_dblat1.f b/ctest/c_dblat1.f new file mode 100644 index 0000000..63e1ed8 --- /dev/null +++ b/ctest/c_dblat1.f @@ -0,0 +1,728 @@ + PROGRAM DCBLAT1 +* Test program for the DOUBLE PRECISION Level 1 CBLAS. +* Based upon the original CBLAS test routine together with: +* F06EAF Example Program Text +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SFAC + INTEGER IC +* .. External Subroutines .. + EXTERNAL CHECK0, CHECK1, CHECK2, CHECK3, HEADER +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SFAC/9.765625D-4/ +* .. Executable Statements .. + WRITE (NOUT,99999) + DO 20 IC = 1, 10 + ICASE = IC + CALL HEADER +* +* .. Initialize PASS, INCX, INCY, and MODE for a new case. .. +* .. the value 9999 for INCX, INCY or MODE will appear in the .. +* .. detailed output, if any, for cases that do not involve .. +* .. these parameters .. +* + PASS = .TRUE. + INCX = 9999 + INCY = 9999 + MODE = 9999 + IF (ICASE.EQ.3) THEN + CALL CHECK0(SFAC) + ELSE IF (ICASE.EQ.7 .OR. ICASE.EQ.8 .OR. ICASE.EQ.9 .OR. + + ICASE.EQ.10) THEN + CALL CHECK1(SFAC) + ELSE IF (ICASE.EQ.1 .OR. ICASE.EQ.2 .OR. ICASE.EQ.5 .OR. + + ICASE.EQ.6) THEN + CALL CHECK2(SFAC) + ELSE IF (ICASE.EQ.4) THEN + CALL CHECK3(SFAC) + END IF +* -- Print + IF (PASS) WRITE (NOUT,99998) + 20 CONTINUE + STOP +* +99999 FORMAT (' Real CBLAS Test Program Results',/1X) +99998 FORMAT (' ----- PASS -----') + END + SUBROUTINE HEADER +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Arrays .. + CHARACTER*15 L(10) +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA L(1)/'CBLAS_DDOT'/ + DATA L(2)/'CBLAS_DAXPY '/ + DATA L(3)/'CBLAS_DROTG '/ + DATA L(4)/'CBLAS_DROT '/ + DATA L(5)/'CBLAS_DCOPY '/ + DATA L(6)/'CBLAS_DSWAP '/ + DATA L(7)/'CBLAS_DNRM2 '/ + DATA L(8)/'CBLAS_DASUM '/ + DATA L(9)/'CBLAS_DSCAL '/ + DATA L(10)/'CBLAS_IDAMAX'/ +* .. Executable Statements .. + WRITE (NOUT,99999) ICASE, L(ICASE) + RETURN +* +99999 FORMAT (/' Test of subprogram number',I3,9X,A15) + END + SUBROUTINE CHECK0(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SA, SB, SC, SS + INTEGER K +* .. Local Arrays .. + DOUBLE PRECISION DA1(8), DATRUE(8), DB1(8), DBTRUE(8), DC1(8), + + DS1(8) +* .. External Subroutines .. + EXTERNAL DROTGTEST, STEST1 +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA DA1/0.3D0, 0.4D0, -0.3D0, -0.4D0, -0.3D0, 0.0D0, + + 0.0D0, 1.0D0/ + DATA DB1/0.4D0, 0.3D0, 0.4D0, 0.3D0, -0.4D0, 0.0D0, + + 1.0D0, 0.0D0/ + DATA DC1/0.6D0, 0.8D0, -0.6D0, 0.8D0, 0.6D0, 1.0D0, + + 0.0D0, 1.0D0/ + DATA DS1/0.8D0, 0.6D0, 0.8D0, -0.6D0, 0.8D0, 0.0D0, + + 1.0D0, 0.0D0/ + DATA DATRUE/0.5D0, 0.5D0, 0.5D0, -0.5D0, -0.5D0, + + 0.0D0, 1.0D0, 1.0D0/ + DATA DBTRUE/0.0D0, 0.6D0, 0.0D0, -0.6D0, 0.0D0, + + 0.0D0, 1.0D0, 0.0D0/ +* .. Executable Statements .. +* +* Compute true values which cannot be prestored +* in decimal notation +* + DBTRUE(1) = 1.0D0/0.6D0 + DBTRUE(3) = -1.0D0/0.6D0 + DBTRUE(5) = 1.0D0/0.6D0 +* + DO 20 K = 1, 8 +* .. Set N=K for identification in output if any .. + N = K + IF (ICASE.EQ.3) THEN +* .. DROTGTEST .. + IF (K.GT.8) GO TO 40 + SA = DA1(K) + SB = DB1(K) + CALL DROTGTEST(SA,SB,SC,SS) + CALL STEST1(SA,DATRUE(K),DATRUE(K),SFAC) + CALL STEST1(SB,DBTRUE(K),DBTRUE(K),SFAC) + CALL STEST1(SC,DC1(K),DC1(K),SFAC) + CALL STEST1(SS,DS1(K),DS1(K),SFAC) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK0' + STOP + END IF + 20 CONTINUE + 40 RETURN + END + SUBROUTINE CHECK1(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER I, LEN, NP1 +* .. Local Arrays .. + DOUBLE PRECISION DTRUE1(5), DTRUE3(5), DTRUE5(8,5,2), DV(8,5,2), + + SA(10), STEMP(1), STRUE(8), SX(8) + INTEGER ITRUE2(5) +* .. External Functions .. + DOUBLE PRECISION DASUMTEST, DNRM2TEST + INTEGER IDAMAXTEST + EXTERNAL DASUMTEST, DNRM2TEST, IDAMAXTEST +* .. External Subroutines .. + EXTERNAL ITEST1, DSCALTEST, STEST, STEST1 +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA/0.3D0, -1.0D0, 0.0D0, 1.0D0, 0.3D0, 0.3D0, + + 0.3D0, 0.3D0, 0.3D0, 0.3D0/ + DATA DV/0.1D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, + + 2.0D0, 2.0D0, 0.3D0, 3.0D0, 3.0D0, 3.0D0, 3.0D0, + + 3.0D0, 3.0D0, 3.0D0, 0.3D0, -0.4D0, 4.0D0, + + 4.0D0, 4.0D0, 4.0D0, 4.0D0, 4.0D0, 0.2D0, + + -0.6D0, 0.3D0, 5.0D0, 5.0D0, 5.0D0, 5.0D0, + + 5.0D0, 0.1D0, -0.3D0, 0.5D0, -0.1D0, 6.0D0, + + 6.0D0, 6.0D0, 6.0D0, 0.1D0, 8.0D0, 8.0D0, 8.0D0, + + 8.0D0, 8.0D0, 8.0D0, 8.0D0, 0.3D0, 9.0D0, 9.0D0, + + 9.0D0, 9.0D0, 9.0D0, 9.0D0, 9.0D0, 0.3D0, 2.0D0, + + -0.4D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, + + 0.2D0, 3.0D0, -0.6D0, 5.0D0, 0.3D0, 2.0D0, + + 2.0D0, 2.0D0, 0.1D0, 4.0D0, -0.3D0, 6.0D0, + + -0.5D0, 7.0D0, -0.1D0, 3.0D0/ + DATA DTRUE1/0.0D0, 0.3D0, 0.5D0, 0.7D0, 0.6D0/ + DATA DTRUE3/0.0D0, 0.3D0, 0.7D0, 1.1D0, 1.0D0/ + DATA DTRUE5/0.10D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, + + 2.0D0, 2.0D0, 2.0D0, -0.3D0, 3.0D0, 3.0D0, + + 3.0D0, 3.0D0, 3.0D0, 3.0D0, 3.0D0, 0.0D0, 0.0D0, + + 4.0D0, 4.0D0, 4.0D0, 4.0D0, 4.0D0, 4.0D0, + + 0.20D0, -0.60D0, 0.30D0, 5.0D0, 5.0D0, 5.0D0, + + 5.0D0, 5.0D0, 0.03D0, -0.09D0, 0.15D0, -0.03D0, + + 6.0D0, 6.0D0, 6.0D0, 6.0D0, 0.10D0, 8.0D0, + + 8.0D0, 8.0D0, 8.0D0, 8.0D0, 8.0D0, 8.0D0, + + 0.09D0, 9.0D0, 9.0D0, 9.0D0, 9.0D0, 9.0D0, + + 9.0D0, 9.0D0, 0.09D0, 2.0D0, -0.12D0, 2.0D0, + + 2.0D0, 2.0D0, 2.0D0, 2.0D0, 0.06D0, 3.0D0, + + -0.18D0, 5.0D0, 0.09D0, 2.0D0, 2.0D0, 2.0D0, + + 0.03D0, 4.0D0, -0.09D0, 6.0D0, -0.15D0, 7.0D0, + + -0.03D0, 3.0D0/ + DATA ITRUE2/0, 1, 2, 2, 3/ +* .. Executable Statements .. + DO 80 INCX = 1, 2 + DO 60 NP1 = 1, 5 + N = NP1 - 1 + LEN = 2*MAX(N,1) +* .. Set vector arguments .. + DO 20 I = 1, LEN + SX(I) = DV(I,NP1,INCX) + 20 CONTINUE +* + IF (ICASE.EQ.7) THEN +* .. DNRM2TEST .. + STEMP(1) = DTRUE1(NP1) + CALL STEST1(DNRM2TEST(N,SX,INCX),STEMP,STEMP,SFAC) + ELSE IF (ICASE.EQ.8) THEN +* .. DASUMTEST .. + STEMP(1) = DTRUE3(NP1) + CALL STEST1(DASUMTEST(N,SX,INCX),STEMP,STEMP,SFAC) + ELSE IF (ICASE.EQ.9) THEN +* .. DSCALTEST .. + CALL DSCALTEST(N,SA((INCX-1)*5+NP1),SX,INCX) + DO 40 I = 1, LEN + STRUE(I) = DTRUE5(I,NP1,INCX) + 40 CONTINUE + CALL STEST(LEN,SX,STRUE,STRUE,SFAC) + ELSE IF (ICASE.EQ.10) THEN +* .. IDAMAXTEST .. + CALL ITEST1(IDAMAXTEST(N,SX,INCX),ITRUE2(NP1)) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' + STOP + END IF + 60 CONTINUE + 80 CONTINUE + RETURN + END + SUBROUTINE CHECK2(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SA + INTEGER I, J, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + DOUBLE PRECISION DT10X(7,4,4), DT10Y(7,4,4), DT7(4,4), + + DT8(7,4,4), DX1(7), + + DY1(7), SSIZE1(4), SSIZE2(14,2), STX(7), STY(7), + + SX(7), SY(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), NS(4) +* .. External Functions .. + EXTERNAL DDOTTEST + DOUBLE PRECISION DDOTTEST +* .. External Subroutines .. + EXTERNAL DAXPYTEST, DCOPYTEST, DSWAPTEST, STEST, STEST1 +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA/0.3D0/ + DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA DX1/0.6D0, 0.1D0, -0.5D0, 0.8D0, 0.9D0, -0.3D0, + + -0.4D0/ + DATA DY1/0.5D0, -0.9D0, 0.3D0, 0.7D0, -0.6D0, 0.2D0, + + 0.8D0/ + DATA DT7/0.0D0, 0.30D0, 0.21D0, 0.62D0, 0.0D0, + + 0.30D0, -0.07D0, 0.85D0, 0.0D0, 0.30D0, -0.79D0, + + -0.74D0, 0.0D0, 0.30D0, 0.33D0, 1.27D0/ + DATA DT8/0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.68D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.68D0, -0.87D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.68D0, -0.87D0, 0.15D0, + + 0.94D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.68D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.35D0, -0.9D0, 0.48D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.38D0, -0.9D0, 0.57D0, 0.7D0, -0.75D0, + + 0.2D0, 0.98D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.68D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.35D0, -0.72D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.38D0, + + -0.63D0, 0.15D0, 0.88D0, 0.0D0, 0.0D0, 0.0D0, + + 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.68D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.68D0, -0.9D0, 0.33D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.68D0, -0.9D0, 0.33D0, 0.7D0, + + -0.75D0, 0.2D0, 1.04D0/ + DATA DT10X/0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.5D0, -0.9D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.5D0, -0.9D0, 0.3D0, 0.7D0, + + 0.0D0, 0.0D0, 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.3D0, 0.1D0, 0.5D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.8D0, 0.1D0, -0.6D0, + + 0.8D0, 0.3D0, -0.3D0, 0.5D0, 0.6D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, -0.9D0, + + 0.1D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.7D0, + + 0.1D0, 0.3D0, 0.8D0, -0.9D0, -0.3D0, 0.5D0, + + 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.5D0, 0.3D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.5D0, 0.3D0, -0.6D0, 0.8D0, 0.0D0, 0.0D0, + + 0.0D0/ + DATA DT10Y/0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.6D0, 0.1D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.6D0, 0.1D0, -0.5D0, 0.8D0, 0.0D0, + + 0.0D0, 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, -0.5D0, -0.9D0, 0.6D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, -0.4D0, -0.9D0, 0.9D0, + + 0.7D0, -0.5D0, 0.2D0, 0.6D0, 0.5D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.6D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, -0.5D0, + + 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + -0.4D0, 0.9D0, -0.5D0, 0.6D0, 0.0D0, 0.0D0, + + 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.6D0, -0.9D0, 0.1D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.6D0, -0.9D0, 0.1D0, 0.7D0, + + -0.5D0, 0.2D0, 0.8D0/ + DATA SSIZE1/0.0D0, 0.3D0, 1.6D0, 3.2D0/ + DATA SSIZE2/0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, + + 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, + + 1.17D0, 1.17D0, 1.17D0/ +* .. Executable Statements .. +* + DO 120 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 100 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* .. Initialize all argument arrays .. + DO 20 I = 1, 7 + SX(I) = DX1(I) + SY(I) = DY1(I) + 20 CONTINUE +* + IF (ICASE.EQ.1) THEN +* .. DDOTTEST .. + CALL STEST1(DDOTTEST(N,SX,INCX,SY,INCY),DT7(KN,KI), + + SSIZE1(KN),SFAC) + ELSE IF (ICASE.EQ.2) THEN +* .. DAXPYTEST .. + CALL DAXPYTEST(N,SA,SX,INCX,SY,INCY) + DO 40 J = 1, LENY + STY(J) = DT8(J,KN,KI) + 40 CONTINUE + CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) + ELSE IF (ICASE.EQ.5) THEN +* .. DCOPYTEST .. + DO 60 I = 1, 7 + STY(I) = DT10Y(I,KN,KI) + 60 CONTINUE + CALL DCOPYTEST(N,SX,INCX,SY,INCY) + CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0D0) + ELSE IF (ICASE.EQ.6) THEN +* .. DSWAPTEST .. + CALL DSWAPTEST(N,SX,INCX,SY,INCY) + DO 80 I = 1, 7 + STX(I) = DT10X(I,KN,KI) + STY(I) = DT10Y(I,KN,KI) + 80 CONTINUE + CALL STEST(LENX,SX,STX,SSIZE2(1,1),1.0D0) + CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0D0) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' + STOP + END IF + 100 CONTINUE + 120 CONTINUE + RETURN + END + SUBROUTINE CHECK3(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SC, SS + INTEGER I, K, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + DOUBLE PRECISION COPYX(5), COPYY(5), DT9X(7,4,4), DT9Y(7,4,4), + + DX1(7), DY1(7), MWPC(11), MWPS(11), MWPSTX(5), + + MWPSTY(5), MWPTX(11,5), MWPTY(11,5), MWPX(5), + + MWPY(5), SSIZE2(14,2), STX(7), STY(7), SX(7), + + SY(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), MWPINX(11), + + MWPINY(11), MWPN(11), NS(4) +* .. External Subroutines .. + EXTERNAL STEST,DROTTEST +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA DX1/0.6D0, 0.1D0, -0.5D0, 0.8D0, 0.9D0, -0.3D0, + + -0.4D0/ + DATA DY1/0.5D0, -0.9D0, 0.3D0, 0.7D0, -0.6D0, 0.2D0, + + 0.8D0/ + DATA SC, SS/0.8D0, 0.6D0/ + DATA DT9X/0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.78D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.78D0, -0.46D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.78D0, -0.46D0, -0.22D0, + + 1.06D0, 0.0D0, 0.0D0, 0.0D0, 0.6D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.78D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.66D0, 0.1D0, -0.1D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.96D0, 0.1D0, -0.76D0, 0.8D0, 0.90D0, + + -0.3D0, -0.02D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.78D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, -0.06D0, 0.1D0, + + -0.1D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.90D0, + + 0.1D0, -0.22D0, 0.8D0, 0.18D0, -0.3D0, -0.02D0, + + 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.78D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.78D0, 0.26D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.78D0, 0.26D0, -0.76D0, 1.12D0, + + 0.0D0, 0.0D0, 0.0D0/ + DATA DT9Y/0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.04D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.04D0, -0.78D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.04D0, -0.78D0, 0.54D0, + + 0.08D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.04D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.7D0, + + -0.9D0, -0.12D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.64D0, -0.9D0, -0.30D0, 0.7D0, -0.18D0, 0.2D0, + + 0.28D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.04D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.7D0, -1.08D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.64D0, -1.26D0, + + 0.54D0, 0.20D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.04D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.04D0, -0.9D0, 0.18D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.04D0, -0.9D0, 0.18D0, 0.7D0, + + -0.18D0, 0.2D0, 0.16D0/ + DATA SSIZE2/0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, + + 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, + + 1.17D0, 1.17D0, 1.17D0/ +* .. Executable Statements .. +* + DO 60 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 40 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* + IF (ICASE.EQ.4) THEN +* .. DROTTEST .. + DO 20 I = 1, 7 + SX(I) = DX1(I) + SY(I) = DY1(I) + STX(I) = DT9X(I,KN,KI) + STY(I) = DT9Y(I,KN,KI) + 20 CONTINUE + CALL DROTTEST(N,SX,INCX,SY,INCY,SC,SS) + CALL STEST(LENX,SX,STX,SSIZE2(1,KSIZE),SFAC) + CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' + STOP + END IF + 40 CONTINUE + 60 CONTINUE +* + MWPC(1) = 1 + DO 80 I = 2, 11 + MWPC(I) = 0 + 80 CONTINUE + MWPS(1) = 0.0 + DO 100 I = 2, 6 + MWPS(I) = 1.0 + 100 CONTINUE + DO 120 I = 7, 11 + MWPS(I) = -1.0 + 120 CONTINUE + MWPINX(1) = 1 + MWPINX(2) = 1 + MWPINX(3) = 1 + MWPINX(4) = -1 + MWPINX(5) = 1 + MWPINX(6) = -1 + MWPINX(7) = 1 + MWPINX(8) = 1 + MWPINX(9) = -1 + MWPINX(10) = 1 + MWPINX(11) = -1 + MWPINY(1) = 1 + MWPINY(2) = 1 + MWPINY(3) = -1 + MWPINY(4) = -1 + MWPINY(5) = 2 + MWPINY(6) = 1 + MWPINY(7) = 1 + MWPINY(8) = -1 + MWPINY(9) = -1 + MWPINY(10) = 2 + MWPINY(11) = 1 + DO 140 I = 1, 11 + MWPN(I) = 5 + 140 CONTINUE + MWPN(5) = 3 + MWPN(10) = 3 + DO 160 I = 1, 5 + MWPX(I) = I + MWPY(I) = I + MWPTX(1,I) = I + MWPTY(1,I) = I + MWPTX(2,I) = I + MWPTY(2,I) = -I + MWPTX(3,I) = 6 - I + MWPTY(3,I) = I - 6 + MWPTX(4,I) = I + MWPTY(4,I) = -I + MWPTX(6,I) = 6 - I + MWPTY(6,I) = I - 6 + MWPTX(7,I) = -I + MWPTY(7,I) = I + MWPTX(8,I) = I - 6 + MWPTY(8,I) = 6 - I + MWPTX(9,I) = -I + MWPTY(9,I) = I + MWPTX(11,I) = I - 6 + MWPTY(11,I) = 6 - I + 160 CONTINUE + MWPTX(5,1) = 1 + MWPTX(5,2) = 3 + MWPTX(5,3) = 5 + MWPTX(5,4) = 4 + MWPTX(5,5) = 5 + MWPTY(5,1) = -1 + MWPTY(5,2) = 2 + MWPTY(5,3) = -2 + MWPTY(5,4) = 4 + MWPTY(5,5) = -3 + MWPTX(10,1) = -1 + MWPTX(10,2) = -3 + MWPTX(10,3) = -5 + MWPTX(10,4) = 4 + MWPTX(10,5) = 5 + MWPTY(10,1) = 1 + MWPTY(10,2) = 2 + MWPTY(10,3) = 2 + MWPTY(10,4) = 4 + MWPTY(10,5) = 3 + DO 200 I = 1, 11 + INCX = MWPINX(I) + INCY = MWPINY(I) + DO 180 K = 1, 5 + COPYX(K) = MWPX(K) + COPYY(K) = MWPY(K) + MWPSTX(K) = MWPTX(I,K) + MWPSTY(K) = MWPTY(I,K) + 180 CONTINUE + CALL DROTTEST(MWPN(I),COPYX,INCX,COPYY,INCY,MWPC(I),MWPS(I)) + CALL STEST(5,COPYX,MWPSTX,MWPSTX,SFAC) + CALL STEST(5,COPYY,MWPSTY,MWPSTY,SFAC) + 200 CONTINUE + RETURN + END + SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) +* ********************************* STEST ************************** +* +* THIS SUBR COMPARES ARRAYS SCOMP() AND STRUE() OF LENGTH LEN TO +* SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE +* NEGLIGIBLE. +* +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC + INTEGER LEN +* .. Array Arguments .. + DOUBLE PRECISION SCOMP(LEN), SSIZE(LEN), STRUE(LEN) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SD + INTEGER I +* .. External Functions .. + DOUBLE PRECISION SDIFF + EXTERNAL SDIFF +* .. Intrinsic Functions .. + INTRINSIC ABS +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + DO 40 I = 1, LEN + SD = SCOMP(I) - STRUE(I) + IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0D0) + + GO TO 40 +* +* HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I), + + STRUE(I), SD, SSIZE(I) + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE I ', + + ' COMP(I) TRUE(I) DIFFERENCE', + + ' SIZE(I)',/1X) +99997 FORMAT (1X,I4,I3,3I5,I3,2D36.8,2D12.4) + END + SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) +* ************************* STEST1 ***************************** +* +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE +* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + DOUBLE PRECISION SCOMP1, SFAC, STRUE1 +* .. Array Arguments .. + DOUBLE PRECISION SSIZE(*) +* .. Local Arrays .. + DOUBLE PRECISION SCOMP(1), STRUE(1) +* .. External Subroutines .. + EXTERNAL STEST +* .. Executable Statements .. +* + SCOMP(1) = SCOMP1 + STRUE(1) = STRUE1 + CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC) +* + RETURN + END + DOUBLE PRECISION FUNCTION SDIFF(SA,SB) +* ********************************* SDIFF ************************** +* COMPUTES DIFFERENCE OF TWO NUMBERS. C. L. LAWSON, JPL 1974 FEB 15 +* +* .. Scalar Arguments .. + DOUBLE PRECISION SA, SB +* .. Executable Statements .. + SDIFF = SA - SB + RETURN + END + SUBROUTINE ITEST1(ICOMP,ITRUE) +* ********************************* ITEST1 ************************* +* +* THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR +* EQUALITY. +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + INTEGER ICOMP, ITRUE +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER ID +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + IF (ICOMP.EQ.ITRUE) GO TO 40 +* +* HERE ICOMP IS NOT EQUAL TO ITRUE. +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 ID = ICOMP - ITRUE + WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE ', + + ' COMP TRUE DIFFERENCE', + + /1X) +99997 FORMAT (1X,I4,I3,3I5,2I36,I12) + END diff --git a/ctest/c_dblat2.f b/ctest/c_dblat2.f new file mode 100644 index 0000000..357816b --- /dev/null +++ b/ctest/c_dblat2.f @@ -0,0 +1,2907 @@ + PROGRAM DBLAT2 +* +* Test program for the DOUBLE PRECISION Level 2 Blas. +* +* The program must be driven by a short data file. The first 17 records +* of the file are read using list-directed input, the last 16 records +* are read using the format ( A12, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 33 lines: +* 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 4 NUMBER OF VALUES OF K +* 0 1 2 4 VALUES OF K +* 4 NUMBER OF VALUES OF INCX AND INCY +* 1 2 -1 -2 VALUES OF INCX AND INCY +* 3 NUMBER OF VALUES OF ALPHA +* 0.0 1.0 0.7 VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* 0.0 1.0 0.9 VALUES OF BETA +* cblas_dgemv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dgbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dsymv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dsbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dspmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dtrmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dtbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dtpmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dtrsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dtbsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dtpsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dger T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dsyr T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dspr T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dsyr2 T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dspr2 T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +* An extended set of Fortran Basic Linear Algebra Subprograms. +* +* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +* and Computer Science Division, Argonne National Laboratory, +* 9700 South Cass Avenue, Argonne, Illinois 60439, US. +* +* Or +* +* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +* +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 16 ) + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) + INTEGER NMAX, INCMAX + PARAMETER ( NMAX = 65, INCMAX = 2 ) + INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX + PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, + $ NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + DOUBLE PRECISION EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB, + $ NTRA, LAYOUT + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR, CORDER, RORDER + CHARACTER*1 TRANS + CHARACTER*12 SNAMET + CHARACTER*32 SNAPS +* .. Local Arrays .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ), + $ G( NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( 2*NMAX ) + INTEGER IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*12 SNAMES( NSUBS ) +* .. External Functions .. + DOUBLE PRECISION DDIFF + LOGICAL LDE + EXTERNAL DDIFF, LDE +* .. External Subroutines .. + EXTERNAL DCHK1, DCHK2, DCHK3, DCHK4, DCHK5, DCHK6, + $ CD2CHKE, DMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK + CHARACTER*12 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'cblas_dgemv ', 'cblas_dgbmv ', + $ 'cblas_dsymv ','cblas_dsbmv ','cblas_dspmv ', + $ 'cblas_dtrmv ','cblas_dtbmv ','cblas_dtpmv ', + $ 'cblas_dtrsv ','cblas_dtbsv ','cblas_dtpsv ', + $ 'cblas_dger ','cblas_dsyr ','cblas_dspr ', + $ 'cblas_dsyr2 ','cblas_dspr2 '/ +* .. Executable Statements .. +* + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the flag that indicates whether row-major data layout to be tested. + READ( NIN, FMT = * )LAYOUT +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 230 + END IF + 10 CONTINUE +* Values of K + READ( NIN, FMT = * )NKB + IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN + WRITE( NOUT, FMT = 9997 )'K', NKBMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( KB( I ), I = 1, NKB ) + DO 20 I = 1, NKB + IF( KB( I ).LT.0 )THEN + WRITE( NOUT, FMT = 9995 ) + GO TO 230 + END IF + 20 CONTINUE +* Values of INCX and INCY + READ( NIN, FMT = * )NINC + IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN + WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( INC( I ), I = 1, NINC ) + DO 30 I = 1, NINC + IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN + WRITE( NOUT, FMT = 9994 )INCMAX + GO TO 230 + END IF + 30 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9993 ) + WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB ) + WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC ) + WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9980 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) + + RORDER = .FALSE. + CORDER = .FALSE. + IF (LAYOUT.EQ.2) THEN + RORDER = .TRUE. + CORDER = .TRUE. + WRITE( *, FMT = 10002 ) + ELSE IF (LAYOUT.EQ.1) THEN + RORDER = .TRUE. + WRITE( *, FMT = 10001 ) + ELSE IF (LAYOUT.EQ.0) THEN + CORDER = .TRUE. + WRITE( *, FMT = 10000 ) + END IF + WRITE( *, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 40 I = 1, NSUBS + LTEST( I ) = .FALSE. + 40 CONTINUE + 50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT + DO 60 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 70 + 60 CONTINUE + WRITE( NOUT, FMT = 9986 )SNAMET + STOP + 70 LTEST( I ) = LTESTT + GO TO 50 +* + 80 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = ONE + 90 CONTINUE + IF( DDIFF( ONE + EPS, ONE ).EQ.ZERO ) + $ GO TO 100 + EPS = HALF*EPS + GO TO 90 + 100 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of DMVCH using exact data. +* + N = MIN( 32, NMAX ) + DO 120 J = 1, N + DO 110 I = 1, N + A( I, J ) = MAX( I - J + 1, 0 ) + 110 CONTINUE + X( J ) = J + Y( J ) = ZERO + 120 CONTINUE + DO 130 J = 1, N + YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE +* YY holds the exact result. On exit from DMVCH YT holds +* the result computed by DMVCH. + TRANS = 'N' + CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF + TRANS = 'T' + CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 210 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL CD2CHKE( SNAMES( ISNUM ) ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 140, 150, 150, 150, 160, 160, + $ 160, 160, 160, 160, 170, 180, 180, + $ 190, 190 )ISNUM +* Test DGEMV, 01, and DGBMV, 02. + 140 IF (CORDER) THEN + CALL DCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 0 ) + END IF + IF (RORDER) THEN + CALL DCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 1 ) + END IF + GO TO 200 +* Test DSYMV, 03, DSBMV, 04, and DSPMV, 05. + 150 IF (CORDER) THEN + CALL DCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 0 ) + END IF + IF (RORDER) THEN + CALL DCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 1 ) + END IF + GO TO 200 +* Test DTRMV, 06, DTBMV, 07, DTPMV, 08, +* DTRSV, 09, DTBSV, 10, and DTPSV, 11. + 160 IF (CORDER) THEN + CALL DCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, + $ 0 ) + END IF + IF (RORDER) THEN + CALL DCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, + $ 1 ) + END IF + GO TO 200 +* Test DGER, 12. + 170 IF (CORDER) THEN + CALL DCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL DCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF + GO TO 200 +* Test DSYR, 13, and DSPR, 14. + 180 IF (CORDER) THEN + CALL DCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL DCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF + GO TO 200 +* Test DSYR2, 15, and DSPR2, 16. + 190 IF (CORDER) THEN + CALL DCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL DCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF +* + 200 IF( FATAL.AND.SFATAL ) + $ GO TO 220 + END IF + 210 CONTINUE + WRITE( NOUT, FMT = 9982 ) + GO TO 240 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9981 ) + GO TO 240 +* + 230 CONTINUE + WRITE( NOUT, FMT = 9987 ) +* + 240 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* +10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) +10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) +10000 FORMAT( ' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, D9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' ) + 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ', + $ I2 ) + 9993 FORMAT( ' TESTS OF THE DOUBLE PRECISION LEVEL 2 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9992 FORMAT( ' FOR N ', 9I6 ) + 9991 FORMAT( ' FOR K ', 7I6 ) + 9990 FORMAT( ' FOR INCX AND INCY ', 7I6 ) + 9989 FORMAT( ' FOR ALPHA ', 7F6.1 ) + 9988 FORMAT( ' FOR BETA ', 7F6.1 ) + 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9986 FORMAT( ' SUBPROGRAM NAME ',A12, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9985 FORMAT( ' ERROR IN DMVCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' DMVCH WAS CALLED WITH TRANS = ', A1, + $ ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', / + $ ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.' + $ , /' ******* TESTS ABANDONED *******' ) + 9984 FORMAT(A12, L2 ) + 9983 FORMAT( 1X,A12, ' WAS NOT TESTED' ) + 9982 FORMAT( /' END OF TESTS' ) + 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of DBLAT2. +* + END + SUBROUTINE DCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G, IORDER ) +* +* Tests DGEMV and DGBMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), G( NMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA, + $ LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK, + $ NL, NS + LOGICAL BANDED, FULL, NULL, RESET, SAME, TRAN + CHARACTER*1 TRANS, TRANSS + CHARACTER*14 CTRANS + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL CDGBMV, CDGEMV, DMAKE, DMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + BANDED = SNAME( 9: 9 ).EQ.'b' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 11 + ELSE IF( BANDED )THEN + NARGS = 13 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IKU = 1, NK + IF( BANDED )THEN + KU = KB( IKU ) + KL = MAX( KU - 1, 0 ) + ELSE + KU = N - 1 + KL = M - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = KL + KU + 1 + ELSE + LDA = M + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, AA, + $ LDA, KL, KU, RESET, TRANSL ) +* + DO 90 IC = 1, 3 + TRANS = ICH( IC: IC ) + IF (TRANS.EQ.'N')THEN + CTRANS = ' CblasNoTrans' + ELSE IF (TRANS.EQ.'T')THEN + CTRANS = ' CblasTrans' + ELSE + CTRANS = 'CblasConjTrans' + END IF + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' +* + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*NL +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'ge', ' ', ' ', 1, NL, X, 1, XX, + $ ABS( INCX ), 0, NL - 1, RESET, TRANSL ) + IF( NL.GT.1 )THEN + X( NL/2 ) = ZERO + XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*ML +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL DMAKE( 'ge', ' ', ' ', 1, ML, Y, 1, + $ YY, ABS( INCY ), 0, ML - 1, + $ RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANSS = TRANS + MS = M + NS = N + KLS = KL + KUS = KU + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CTRANS, M, N, ALPHA, LDA, INCX, + $ BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CDGEMV( IORDER, TRANS, M, N, + $ ALPHA, AA, LDA, XX, INCX, + $ BETA, YY, INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CTRANS, M, N, KL, KU, ALPHA, LDA, + $ INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CDGBMV( IORDER, TRANS, M, N, KL, + $ KU, ALPHA, AA, LDA, XX, + $ INCX, BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 130 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = TRANS.EQ.TRANSS + ISAME( 2 ) = MS.EQ.M + ISAME( 3 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LDE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LDE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LDE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LDERES( 'ge', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 4 ) = KLS.EQ.KL + ISAME( 5 ) = KUS.EQ.KU + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LDE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LDE( XS, XX, LX ) + ISAME( 10 ) = INCXS.EQ.INCX + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LDE( YS, YY, LY ) + ELSE + ISAME( 12 ) = LDERES( 'ge', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 13 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 130 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL DMVCH( TRANS, M, N, ALPHA, A, + $ NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 130 + ELSE +* Avoid repeating tests with M.le.0 or +* N.le.0. + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 140 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CTRANS, M, N, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CTRANS, M, N, KL, KU, + $ ALPHA, LDA, INCX, BETA, INCY + END IF +* + 140 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 4( I3, ',' ), F4.1, + $ ', A,', I3, ',',/ 10x,'X,', I2, ',', F4.1, ', Y,', + $ I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), F4.1, + $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, + $ ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK1. +* + END + SUBROUTINE DCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G, IORDER ) +* +* Tests DSYMV, DSBMV and DSPMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), G( NMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IB, IC, IK, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY, + $ N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMVCH, CDSBMV, CDSPMV, CDSYMV +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'y' + BANDED = SNAME( 9: 9 ).EQ.'b' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 10 + ELSE IF( BANDED )THEN + NARGS = 11 + ELSE IF( PACKED )THEN + NARGS = 9 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, AA, + $ LDA, K, K, RESET, TRANSL ) +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL DMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + UPLOS = UPLO + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, N, ALPHA, LDA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CDSYMV( IORDER, UPLO, N, ALPHA, AA, + $ LDA, XX, INCX, BETA, YY, INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, N, K, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL CDSBMV( IORDER, UPLO, N, K, ALPHA, + $ AA, LDA, XX, INCX, BETA, YY, + $ INCY ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, N, ALPHA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CDSPMV( IORDER, UPLO, N, ALPHA, AA, + $ XX, INCX, BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LDE( AS, AA, LAA ) + ISAME( 5 ) = LDAS.EQ.LDA + ISAME( 6 ) = LDE( XS, XX, LX ) + ISAME( 7 ) = INCXS.EQ.INCX + ISAME( 8 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LDE( YS, YY, LY ) + ELSE + ISAME( 9 ) = LDERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 10 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 3 ) = KS.EQ.K + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LDE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LDE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LDE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LDERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( PACKED )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LDE( AS, AA, LAA ) + ISAME( 5 ) = LDE( XS, XX, LX ) + ISAME( 6 ) = INCXS.EQ.INCX + ISAME( 7 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 8 ) = LDE( YS, YY, LY ) + ELSE + ISAME( 8 ) = LDERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 9 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL DMVCH( 'N', N, N, ALPHA, A, NMAX, X, + $ INCX, BETA, Y, INCY, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0 + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, LDA, INCX, + $ BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, K, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, N, ALPHA, INCX, + $ BETA, INCY + END IF +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', AP', + $ ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), F4.1, + $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, + $ ') .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', A,', + $ I3, ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK2. +* + END + SUBROUTINE DCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, XT, G, Z, IORDER ) +* +* Tests DTRMV, DTBMV, DTPMV, DTRSV, DTBSV and DTPSV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XT( NMAX ), + $ XX( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + DOUBLE PRECISION ERR, ERRMAX, TRANSL + INTEGER I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K, + $ KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS + CHARACTER*14 CUPLO,CTRANS,CDIAG + CHARACTER*2 ICHD, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMVCH, CDTBMV, CDTBSV, CDTPMV, + $ CDTPSV, CDTRMV, CDTRSV +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'r' + BANDED = SNAME( 9: 9 ).EQ.'b' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 8 + ELSE IF( BANDED )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 7 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* Set up zero vector for DMVCH. + DO 10 I = 1, NMAX + Z( I ) = ZERO + 10 CONTINUE +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + IF (TRANS.EQ.'N')THEN + CTRANS = ' CblasNoTrans' + ELSE IF (TRANS.EQ.'T')THEN + CTRANS = ' CblasTrans' + ELSE + CTRANS = 'CblasConjTrans' + END IF +* + DO 70 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) + IF (DIAG.EQ.'N')THEN + CDIAG = ' CblasNonUnit' + ELSE + CDIAG = ' CblasUnit' + END IF +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 8: 9 ), UPLO, DIAG, N, N, A, + $ NMAX, AA, LDA, K, K, RESET, TRANSL ) +* + DO 60 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, + $ TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + DIAGS = DIAG + NS = N + KS = K + DO 20 I = 1, LAA + AS( I ) = AA( I ) + 20 CONTINUE + LDAS = LDA + DO 30 I = 1, LX + XS( I ) = XX( I ) + 30 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( SNAME( 10: 11 ).EQ.'mv' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CDTRMV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, LDA, XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CDTBMV( IORDER, UPLO, TRANS, DIAG, + $ N, K, AA, LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL CDTPMV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, XX, INCX ) + END IF + ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CDTRSV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, LDA, XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CDTBSV( IORDER, UPLO, TRANS, DIAG, + $ N, K, AA, LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL CDTPSV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, XX, INCX ) + END IF + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = TRANS.EQ.TRANSS + ISAME( 3 ) = DIAG.EQ.DIAGS + ISAME( 4 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 5 ) = LDE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 7 ) = LDE( XS, XX, LX ) + ELSE + ISAME( 7 ) = LDERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 8 ) = INCXS.EQ.INCX + ELSE IF( BANDED )THEN + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = LDE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 8 ) = LDE( XS, XX, LX ) + ELSE + ISAME( 8 ) = LDERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 9 ) = INCXS.EQ.INCX + ELSE IF( PACKED )THEN + ISAME( 5 ) = LDE( AS, AA, LAA ) + IF( NULL )THEN + ISAME( 6 ) = LDE( XS, XX, LX ) + ELSE + ISAME( 6 ) = LDERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 7 ) = INCXS.EQ.INCX + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 10: 11 ).EQ.'mv' )THEN +* +* Check the result. +* + CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, + $ INCX, ZERO, Z, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN +* +* Compute approximation to original vector. +* + DO 50 I = 1, N + Z( I ) = XX( 1 + ( I - 1 )* + $ ABS( INCX ) ) + XX( 1 + ( I - 1 )*ABS( INCX ) ) + $ = X( I ) + 50 CONTINUE + CALL DMVCH( TRANS, N, N, ONE, A, NMAX, Z, + $ INCX, ZERO, X, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .FALSE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0. + GO TO 110 + END IF +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, + $ LDA, INCX + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, K, + $ LDA, INCX + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, + $ INCX + END IF +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, '(', 3( A14,',' ),/ 10x, I3, ', AP, ', + $ 'X,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', 3( A14,',' ),/ 10x, 2( I3, ',' ), + $ ' A,', I3, ', X,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', 3( A14,',' ),/ 10x, I3, ', A,', + $ I3, ', X,', I2, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK3. +* + END + SUBROUTINE DCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests DGER. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS, + $ NC, ND, NS + LOGICAL NULL, RESET, SAME +* .. Local Arrays .. + DOUBLE PRECISION W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DGER, DMAKE, DMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Executable Statements .. +* Define the number of arguments. + NARGS = 9 +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* +* Set LDA to 1 more than minimum value if room. + LDA = M + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 100 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*M +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'ge', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ), + $ 0, M - 1, RESET, TRANSL ) + IF( M.GT.1 )THEN + X( M/2 ) = ZERO + XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO + END IF +* + DO 90 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL DMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, + $ AA, LDA, M - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL CDGER( IORDER, M, N, ALPHA, XX, INCX, YY, + $ INCY, AA, LDA ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 140 + END IF +* +* See what data changed inside subroutine. +* + ISAME( 1 ) = MS.EQ.M + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LDE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LDE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LDE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LDERES( 'ge', ' ', M, N, AS, AA, + $ LDA ) + END IF + ISAME( 9 ) = LDAS.EQ.LDA +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 140 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, M + Z( I ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, M + Z( I ) = X( M - I + 1 ) + 60 CONTINUE + END IF + DO 70 J = 1, N + IF( INCY.GT.0 )THEN + W( 1 ) = Y( J ) + ELSE + W( 1 ) = Y( N - J + 1 ) + END IF + CALL DMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1, + $ ONE, A( 1, J ), 1, YT, G, + $ AA( 1 + ( J - 1 )*LDA ), EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 130 + 70 CONTINUE + ELSE +* Avoid repeating tests with M.le.0 or N.le.0. + GO TO 110 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 150 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 140 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA +* + 150 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', 2( I3, ',' ), F4.1, ', X,', I2, + $ ', Y,', I2, ', A,', I3, ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK4. +* + END + SUBROUTINE DCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests DSYR and DSPR. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA, + $ LDA, LDAS, LJ, LX, N, NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + DOUBLE PRECISION W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMVCH, CDSPR, CDSYR +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'y' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 7 + ELSE IF( PACKED )THEN + NARGS = 6 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF + UPPER = UPLO.EQ.'U' +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, + $ AA, LDA, N - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, LDA + IF( REWI ) + $ REWIND NTRA + CALL CDSYR( IORDER, UPLO, N, ALPHA, XX, INCX, + $ AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CDSPR( IORDER, UPLO, N, ALPHA, XX, INCX, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LDE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + IF( NULL )THEN + ISAME( 6 ) = LDE( AS, AA, LAA ) + ELSE + ISAME( 6 ) = LDERES( SNAME( 8: 9 ), UPLO, N, N, AS, + $ AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 7 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 40 I = 1, N + Z( I ) = X( I ) + 40 CONTINUE + ELSE + DO 50 I = 1, N + Z( I ) = X( N - I + 1 ) + 50 CONTINUE + END IF + JA = 1 + DO 60 J = 1, N + W( 1 ) = Z( J ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL DMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W, + $ 1, ONE, A( JJ, J ), 1, YT, G, + $ AA( JA ), EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 110 + 60 CONTINUE + ELSE +* Avoid repeating tests if N.le.0. + IF( N.LE.0 ) + $ GO TO 100 + END IF +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, INCX, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, ALPHA, INCX + END IF +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', AP) .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', A,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK5. +* + END + SUBROUTINE DCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests DSYR2 and DSPR2. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX, 2 ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N, + $ NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + DOUBLE PRECISION W( 2 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMVCH, CDSPR2, CDSYR2 +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'y' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 8 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 140 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 140 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 130 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF + UPPER = UPLO.EQ.'U' +* + DO 120 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 110 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL DMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 100 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, + $ NMAX, AA, LDA, N - 1, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL CDSYR2( IORDER, UPLO, N, ALPHA, XX, INCX, + $ YY, INCY, AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, INCY + IF( REWI ) + $ REWIND NTRA + CALL CDSPR2( IORDER, UPLO, N, ALPHA, XX, INCX, + $ YY, INCY, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 160 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LDE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LDE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LDE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LDERES( SNAME( 8: 9 ), UPLO, N, N, + $ AS, AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 9 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 160 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, N + Z( I, 1 ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, N + Z( I, 1 ) = X( N - I + 1 ) + 60 CONTINUE + END IF + IF( INCY.GT.0 )THEN + DO 70 I = 1, N + Z( I, 2 ) = Y( I ) + 70 CONTINUE + ELSE + DO 80 I = 1, N + Z( I, 2 ) = Y( N - I + 1 ) + 80 CONTINUE + END IF + JA = 1 + DO 90 J = 1, N + W( 1 ) = Z( J, 2 ) + W( 2 ) = Z( J, 1 ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL DMVCH( 'N', LJ, 2, ALPHA, Z( JJ, 1 ), + $ NMAX, W, 1, ONE, A( JJ, J ), 1, + $ YT, G, AA( JA ), EPS, ERR, FATAL, + $ NOUT, .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 150 + 90 CONTINUE + ELSE +* Avoid repeating tests with N.le.0. + IF( N.LE.0 ) + $ GO TO 140 + END IF +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 170 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 160 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, INCX, + $ INCY, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, ALPHA, INCX, INCY + END IF +* + 170 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', Y,', I2, ', AP) .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', Y,', I2, ', A,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK6. +* + END + SUBROUTINE DMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL, + $ KU, RESET, TRANSL ) +* +* Generates values for an M by N matrix A within the bandwidth +* defined by KL and KU. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'ge', 'gb', 'sy', 'sb', 'sp', 'tr', 'tb' OR 'tp'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + DOUBLE PRECISION ROGUE + PARAMETER ( ROGUE = -1.0D10 ) +* .. Scalar Arguments .. + DOUBLE PRECISION TRANSL + INTEGER KL, KU, LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, I1, I2, I3, IBEG, IEND, IOFF, J, KK + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + DOUBLE PRECISION DBEG + EXTERNAL DBEG +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Executable Statements .. + GEN = TYPE( 1: 1 ).EQ.'g' + SYM = TYPE( 1: 1 ).EQ.'s' + TRI = TYPE( 1: 1 ).EQ.'t' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + IF( ( I.LE.J.AND.J - I.LE.KU ).OR. + $ ( I.GE.J.AND.I - J.LE.KL ) )THEN + A( I, J ) = DBEG( RESET ) + TRANSL + ELSE + A( I, J ) = ZERO + END IF + IF( I.NE.J )THEN + IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'ge' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'gb' )THEN + DO 90 J = 1, N + DO 60 I1 = 1, KU + 1 - J + AA( I1 + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J ) + AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J ) + 70 CONTINUE + DO 80 I3 = I2, LDA + AA( I3 + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + ELSE IF( TYPE.EQ.'sy'.OR.TYPE.EQ.'tr' )THEN + DO 130 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 100 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 100 CONTINUE + DO 110 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 110 CONTINUE + DO 120 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 120 CONTINUE + 130 CONTINUE + ELSE IF( TYPE.EQ.'sb'.OR.TYPE.EQ.'tb' )THEN + DO 170 J = 1, N + IF( UPPER )THEN + KK = KL + 1 + IBEG = MAX( 1, KL + 2 - J ) + IF( UNIT )THEN + IEND = KL + ELSE + IEND = KL + 1 + END IF + ELSE + KK = 1 + IF( UNIT )THEN + IBEG = 2 + ELSE + IBEG = 1 + END IF + IEND = MIN( KL + 1, 1 + M - J ) + END IF + DO 140 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 140 CONTINUE + DO 150 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J ) + 150 CONTINUE + DO 160 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 160 CONTINUE + 170 CONTINUE + ELSE IF( TYPE.EQ.'sp'.OR.TYPE.EQ.'tp' )THEN + IOFF = 0 + DO 190 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 180 I = IBEG, IEND + IOFF = IOFF + 1 + AA( IOFF ) = A( I, J ) + IF( I.EQ.J )THEN + IF( UNIT ) + $ AA( IOFF ) = ROGUE + END IF + 180 CONTINUE + 190 CONTINUE + END IF + RETURN +* +* End of DMAKE. +* + END + SUBROUTINE DMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA, BETA, EPS, ERR + INTEGER INCX, INCY, M, N, NMAX, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANS +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, * ), G( * ), X( * ), Y( * ), YT( * ), + $ YY( * ) +* .. Local Scalars .. + DOUBLE PRECISION ERRI + INTEGER I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL + LOGICAL TRAN +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, SQRT +* .. Executable Statements .. + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF + IF( INCX.LT.0 )THEN + KX = NL + INCXL = -1 + ELSE + KX = 1 + INCXL = 1 + END IF + IF( INCY.LT.0 )THEN + KY = ML + INCYL = -1 + ELSE + KY = 1 + INCYL = 1 + END IF +* +* Compute expected result in YT using data in A, X and Y. +* Compute gauges in G. +* + IY = KY + DO 30 I = 1, ML + YT( IY ) = ZERO + G( IY ) = ZERO + JX = KX + IF( TRAN )THEN + DO 10 J = 1, NL + YT( IY ) = YT( IY ) + A( J, I )*X( JX ) + G( IY ) = G( IY ) + ABS( A( J, I )*X( JX ) ) + JX = JX + INCXL + 10 CONTINUE + ELSE + DO 20 J = 1, NL + YT( IY ) = YT( IY ) + A( I, J )*X( JX ) + G( IY ) = G( IY ) + ABS( A( I, J )*X( JX ) ) + JX = JX + INCXL + 20 CONTINUE + END IF + YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY ) + G( IY ) = ABS( ALPHA )*G( IY ) + ABS( BETA*Y( IY ) ) + IY = IY + INCYL + 30 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 40 I = 1, ML + ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS + IF( G( I ).NE.ZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.ONE ) + $ GO TO 50 + 40 CONTINUE +* If the loop completes, all results are at least half accurate. + GO TO 70 +* +* Report fatal error. +* + 50 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 60 I = 1, ML + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, YT( I ), + $ YY( 1 + ( I - 1 )*ABS( INCY ) ) + ELSE + WRITE( NOUT, FMT = 9998 )I, + $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT(I) + END IF + 60 CONTINUE +* + 70 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RESULT COMPU', + $ 'TED RESULT' ) + 9998 FORMAT( 1X, I7, 2G18.6 ) +* +* End of DMVCH. +* + END + LOGICAL FUNCTION LDE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + DOUBLE PRECISION RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LDE = .TRUE. + GO TO 30 + 20 CONTINUE + LDE = .FALSE. + 30 RETURN +* +* End of LDE. +* + END + LOGICAL FUNCTION LDERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'ge', 'sy' or 'sp'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + DOUBLE PRECISION AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'ge' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'sy' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LDERES = .TRUE. + GO TO 80 + 70 CONTINUE + LDERES = .FALSE. + 80 RETURN +* +* End of LDERES. +* + END + DOUBLE PRECISION FUNCTION DBEG( RESET ) +* +* Generates random numbers uniformly distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, MI +* .. Save statement .. + SAVE I, IC, MI +* .. Intrinsic Functions .. + INTRINSIC DBLE +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + I = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I is bounded between 1 and 999. +* If initial I = 1,2,3,6,7 or 9, the period will be 50. +* If initial I = 4 or 8, the period will be 25. +* If initial I = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I in 6. +* + IC = IC + 1 + 10 I = I*MI + I = I - 1000*( I/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + DBEG = DBLE( I - 500 )/1001.0D0 + RETURN +* +* End of DBEG. +* + END + DOUBLE PRECISION FUNCTION DDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* +* .. Scalar Arguments .. + DOUBLE PRECISION X, Y +* .. Executable Statements .. + DDIFF = X - Y + RETURN +* +* End of DDIFF. +* + END diff --git a/ctest/c_dblat3.f b/ctest/c_dblat3.f new file mode 100644 index 0000000..fb9acbb --- /dev/null +++ b/ctest/c_dblat3.f @@ -0,0 +1,2475 @@ + PROGRAM DBLAT3 +* +* Test program for the DOUBLE PRECISION Level 3 Blas. +* +* The program must be driven by a short data file. The first 13 records +* of the file are read using list-directed input, the last 6 records +* are read using the format ( A12, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 19 lines: +* 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 3 NUMBER OF VALUES OF ALPHA +* 0.0 1.0 0.7 VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* 0.0 1.0 1.3 VALUES OF BETA +* cblas_dgemm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dsymm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dtrmm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dtrsm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dsyrk T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dsyr2k T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +* A Set of Level 3 Basic Linear Algebra Subprograms. +* +* Technical Memorandum No.88 (Revision 1), Mathematics and +* Computer Science Division, Argonne National Laboratory, 9700 +* South Cass Avenue, Argonne, Illinois 60439, US. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 6 ) + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) + INTEGER NMAX + PARAMETER ( NMAX = 65 ) + INTEGER NIDMAX, NALMAX, NBEMAX + PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + DOUBLE PRECISION EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NTRA, + $ LAYOUT + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR, CORDER, RORDER + CHARACTER*1 TRANSA, TRANSB + CHARACTER*12 SNAMET + CHARACTER*32 SNAPS +* .. Local Arrays .. + DOUBLE PRECISION AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), + $ BB( NMAX*NMAX ), BET( NBEMAX ), + $ BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ G( NMAX ), W( 2*NMAX ) + INTEGER IDIM( NIDMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*12 SNAMES( NSUBS ) +* .. External Functions .. + DOUBLE PRECISION DDIFF + LOGICAL LDE + EXTERNAL DDIFF, LDE +* .. External Subroutines .. + EXTERNAL DCHK1, DCHK2, DCHK3, DCHK4, DCHK5, CD3CHKE, + $ DMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK + CHARACTER*12 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'cblas_dgemm ', 'cblas_dsymm ', + $ 'cblas_dtrmm ', 'cblas_dtrsm ','cblas_dsyrk ', + $ 'cblas_dsyr2k'/ +* .. Executable Statements .. +* +* Read name and unit number for summary output file and open file. +* + NOUTC = NOUT +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the flag that indicates whether row-major data layout to be tested. + READ( NIN, FMT = * )LAYOUT +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 220 + END IF + 10 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9995 ) + WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9984 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) + + RORDER = .FALSE. + CORDER = .FALSE. + IF (LAYOUT.EQ.2) THEN + RORDER = .TRUE. + CORDER = .TRUE. + WRITE( *, FMT = 10002 ) + ELSE IF (LAYOUT.EQ.1) THEN + RORDER = .TRUE. + WRITE( *, FMT = 10001 ) + ELSE IF (LAYOUT.EQ.0) THEN + CORDER = .TRUE. + WRITE( *, FMT = 10000 ) + END IF + WRITE( *, FMT = * ) + +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 20 I = 1, NSUBS + LTEST( I ) = .FALSE. + 20 CONTINUE + 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT + DO 40 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 50 + 40 CONTINUE + WRITE( NOUT, FMT = 9990 )SNAMET + STOP + 50 LTEST( I ) = LTESTT + GO TO 30 +* + 60 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = ONE + 70 CONTINUE + IF( DDIFF( ONE + EPS, ONE ).EQ.ZERO ) + $ GO TO 80 + EPS = HALF*EPS + GO TO 70 + 80 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of DMMCH using exact data. +* + N = MIN( 32, NMAX ) + DO 100 J = 1, N + DO 90 I = 1, N + AB( I, J ) = MAX( I - J + 1, 0 ) + 90 CONTINUE + AB( J, NMAX + 1 ) = J + AB( 1, NMAX + J ) = J + C( J, 1 ) = ZERO + 100 CONTINUE + DO 110 J = 1, N + CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 110 CONTINUE +* CC holds the exact result. On exit from DMMCH CT holds +* the result computed by DMMCH. + TRANSA = 'N' + TRANSB = 'N' + CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'T' + CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + DO 120 J = 1, N + AB( J, NMAX + 1 ) = N - J + 1 + AB( 1, NMAX + J ) = N - J + 1 + 120 CONTINUE + DO 130 J = 1, N + CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - + $ ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE + TRANSA = 'T' + TRANSB = 'N' + CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'T' + CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 200 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL CD3CHKE( SNAMES( ISNUM ) ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 150, 160, 160, 170, 180 )ISNUM +* Test DGEMM, 01. + 140 IF (CORDER) THEN + CALL DCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL DCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test DSYMM, 02. + 150 IF (CORDER) THEN + CALL DCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL DCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test DTRMM, 03, DTRSM, 04. + 160 IF (CORDER) THEN + CALL DCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, + $ 0 ) + END IF + IF (RORDER) THEN + CALL DCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, + $ 1 ) + END IF + GO TO 190 +* Test DSYRK, 05. + 170 IF (CORDER) THEN + CALL DCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL DCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test DSYR2K, 06. + 180 IF (CORDER) THEN + CALL DCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ 0 ) + END IF + IF (RORDER) THEN + CALL DCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ 1 ) + END IF + GO TO 190 +* + 190 IF( FATAL.AND.SFATAL ) + $ GO TO 210 + END IF + 200 CONTINUE + WRITE( NOUT, FMT = 9986 ) + GO TO 230 +* + 210 CONTINUE + WRITE( NOUT, FMT = 9985 ) + GO TO 230 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9991 ) +* + 230 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* +10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) +10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) +10000 FORMAT( ' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, D9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' TESTS OF THE DOUBLE PRECISION LEVEL 3 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9994 FORMAT( ' FOR N ', 9I6 ) + 9993 FORMAT( ' FOR ALPHA ', 7F6.1 ) + 9992 FORMAT( ' FOR BETA ', 7F6.1 ) + 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9990 FORMAT( ' SUBPROGRAM NAME ', A12,' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9989 FORMAT( ' ERROR IN DMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' DMMCH WAS CALLED WITH TRANSA = ', A1, + $ ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', + $ 'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', + $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', + $ '*******' ) + 9988 FORMAT( A12,L2 ) + 9987 FORMAT( 1X, A12,' WAS NOT TESTED' ) + 9986 FORMAT( /' END OF TESTS' ) + 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of DBLAT3. +* + END + SUBROUTINE DCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, IORDER) +* +* Tests DGEMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, BETA, BLS, ERR, ERRMAX + INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, + $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, + $ MA, MB, MS, N, NA, NARGS, NB, NC, NS + LOGICAL NULL, RESET, SAME, TRANA, TRANB + CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL CDGEMM, DMAKE, DMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. +* + NARGS = 13 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 110 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICA = 1, 3 + TRANSA = ICH( ICA: ICA ) + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' +* + IF( TRANA )THEN + MA = K + NA = M + ELSE + MA = M + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL DMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICB = 1, 3 + TRANSB = ICH( ICB: ICB ) + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* + IF( TRANB )THEN + MB = N + NB = K + ELSE + MB = K + NB = N + END IF +* Set LDB to 1 more than minimum value if room. + LDB = MB + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 70 + LBB = LDB*NB +* +* Generate the matrix B. +* + CALL DMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB, + $ LDB, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL DMAKE( 'GE', ' ', ' ', M, N, C, NMAX, + $ CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANAS = TRANSA + TRANBS = TRANSB + MS = M + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL DPRCN1(NTRA, NC, SNAME, IORDER, + $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, + $ LDB, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CDGEMM( IORDER, TRANSA, TRANSB, M, N, + $ K, ALPHA, AA, LDA, BB, LDB, + $ BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = TRANSA.EQ.TRANAS + ISAME( 2 ) = TRANSB.EQ.TRANBS + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LDE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LDE( BS, BB, LBB ) + ISAME( 10 ) = LDBS.EQ.LDB + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LDE( CS, CC, LCC ) + ELSE + ISAME( 12 ) = LDERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 13 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL DMMCH( TRANSA, TRANSB, M, N, K, + $ ALPHA, A, NMAX, B, NMAX, BETA, + $ C, NMAX, CT, G, CC, LDC, EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL DPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, + $ M, N, K, ALPHA, LDA, LDB, BETA, LDC) +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A12,'(''', A1, ''',''', A1, ''',', + $ 3( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', ', + $ 'C,', I3, ').' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK1. +* + END + SUBROUTINE DPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, M, N, + $ K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, M, N, K, LDA, LDB, LDC + DOUBLE PRECISION ALPHA, BETA + CHARACTER*1 TRANSA, TRANSB + CHARACTER*12 SNAME + CHARACTER*14 CRC, CTA,CTB + + IF (TRANSA.EQ.'N')THEN + CTA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CTA = ' CblasTrans' + ELSE + CTA = 'CblasConjTrans' + END IF + IF (TRANSB.EQ.'N')THEN + CTB = ' CblasNoTrans' + ELSE IF (TRANSB.EQ.'T')THEN + CTB = ' CblasTrans' + ELSE + CTB = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CTA,CTB + WRITE(NOUT, FMT = 9994)M, N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 20X, 3( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', + $ F4.1, ', ', 'C,', I3, ').' ) + END +* + SUBROUTINE DCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, IORDER) +* +* Tests DSYMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, BETA, BLS, ERR, ERRMAX + INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, + $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, + $ NARGS, NC, NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 SIDE, SIDES, UPLO, UPLOS + CHARACTER*2 ICHS, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMMCH, CDSYMM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHS/'LR'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 90 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 90 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 90 + LBB = LDB*N +* +* Generate the matrix B. +* + CALL DMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, + $ ZERO ) +* + DO 80 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' +* + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* +* Generate the symmetric matrix A. +* + CALL DMAKE( 'SY', UPLO, ' ', NA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL DMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL DPRCN2(NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, M, N, ALPHA, LDA, LDB, + $ BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CDSYMM( IORDER, SIDE, UPLO, M, N, ALPHA, + $ AA, LDA, BB, LDB, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 110 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LDE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LDE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LDE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LDERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 110 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL DMMCH( 'N', 'N', M, N, M, ALPHA, A, + $ NMAX, B, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL DMMCH( 'N', 'N', M, N, N, ALPHA, B, + $ NMAX, A, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 120 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL DPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, ALPHA, LDA, + $ LDB, BETA, LDC) +* + 120 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK2. +* + END +* + SUBROUTINE DPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, + $ ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, M, N, LDA, LDB, LDC + DOUBLE PRECISION ALPHA, BETA + CHARACTER*1 SIDE, UPLO + CHARACTER*12 SNAME + CHARACTER*14 CRC, CS,CU + + IF (SIDE.EQ.'L')THEN + CS = ' CblasLeft' + ELSE + CS = ' CblasRight' + END IF + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU + WRITE(NOUT, FMT = 9994)M, N, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 20X, 2( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', + $ F4.1, ', ', 'C,', I3, ').' ) + END +* + SUBROUTINE DCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, + $ B, BB, BS, CT, G, C, IORDER ) +* +* Tests DTRMM and DTRSM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, ERR, ERRMAX + INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, + $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, + $ NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, + $ UPLOS + CHARACTER*2 ICHD, ICHS, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMMCH, CDTRMM, CDTRSM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ +* .. Executable Statements .. +* + NARGS = 11 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* Set up zero matrix for DMMCH. + DO 20 J = 1, NMAX + DO 10 I = 1, NMAX + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE +* + DO 140 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 130 + LBB = LDB*N + NULL = M.LE.0.OR.N.LE.0 +* + DO 120 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 130 + LAA = LDA*NA +* + DO 110 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 100 ICT = 1, 3 + TRANSA = ICHT( ICT: ICT ) +* + DO 90 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + CALL DMAKE( 'TR', UPLO, DIAG, NA, NA, A, + $ NMAX, AA, LDA, RESET, ZERO ) +* +* Generate the matrix B. +* + CALL DMAKE( 'GE', ' ', ' ', M, N, B, NMAX, + $ BB, LDB, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + TRANAS = TRANSA + DIAGS = DIAG + MS = M + NS = N + ALS = ALPHA + DO 30 I = 1, LAA + AS( I ) = AA( I ) + 30 CONTINUE + LDAS = LDA + DO 40 I = 1, LBB + BS( I ) = BB( I ) + 40 CONTINUE + LDBS = LDB +* +* Call the subroutine. +* + IF( SNAME( 10: 11 ).EQ.'mm' )THEN + IF( TRACE ) + $ CALL DPRCN3( NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB) + IF( REWI ) + $ REWIND NTRA + CALL CDTRMM( IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, AA, LDA, + $ BB, LDB ) + ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN + IF( TRACE ) + $ CALL DPRCN3( NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB) + IF( REWI ) + $ REWIND NTRA + CALL CDTRSM( IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, AA, LDA, + $ BB, LDB ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = TRANAS.EQ.TRANSA + ISAME( 4 ) = DIAGS.EQ.DIAG + ISAME( 5 ) = MS.EQ.M + ISAME( 6 ) = NS.EQ.N + ISAME( 7 ) = ALS.EQ.ALPHA + ISAME( 8 ) = LDE( AS, AA, LAA ) + ISAME( 9 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 10 ) = LDE( BS, BB, LBB ) + ELSE + ISAME( 10 ) = LDERES( 'GE', ' ', M, N, BS, + $ BB, LDB ) + END IF + ISAME( 11 ) = LDBS.EQ.LDB +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 50 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 50 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 10: 11 ).EQ.'mm' )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL DMMCH( TRANSA, 'N', M, N, M, + $ ALPHA, A, NMAX, B, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL DMMCH( 'N', TRANSA, M, N, N, + $ ALPHA, B, NMAX, A, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN +* +* Compute approximation to original +* matrix. +* + DO 70 J = 1, N + DO 60 I = 1, M + C( I, J ) = BB( I + ( J - 1 )* + $ LDB ) + BB( I + ( J - 1 )*LDB ) = ALPHA* + $ B( I, J ) + 60 CONTINUE + 70 CONTINUE +* + IF( LEFT )THEN + CALL DMMCH( TRANSA, 'N', M, N, M, + $ ONE, A, NMAX, C, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + ELSE + CALL DMMCH( 'N', TRANSA, M, N, N, + $ ONE, C, NMAX, A, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + END IF + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 150 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL DPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, + $ M, N, ALPHA, LDA, LDB) +* + 160 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A12,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ') .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK3. +* + END +* + SUBROUTINE DPRCN3(NOUT, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, LDA, LDB) + INTEGER NOUT, NC, IORDER, M, N, LDA, LDB + DOUBLE PRECISION ALPHA + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + CHARACTER*12 SNAME + CHARACTER*14 CRC, CS, CU, CA, CD + + IF (SIDE.EQ.'L')THEN + CS = ' CblasLeft' + ELSE + CS = ' CblasRight' + END IF + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (DIAG.EQ.'N')THEN + CD = ' CblasNonUnit' + ELSE + CD = ' CblasUnit' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU + WRITE(NOUT, FMT = 9994)CA, CD, M, N, ALPHA, LDA, LDB + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 22X, 2( A14, ',') , 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ').' ) + END +* + SUBROUTINE DCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, IORDER) +* +* Tests DSYRK. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, BETA, BETS, ERR, ERRMAX + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, + $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, + $ NARGS, NC, NS + LOGICAL NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, UPLO, UPLOS + CHARACTER*2 ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMMCH, CDSYRK +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHT/'NTC'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 10 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL DMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL DMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + BETS = BETA + DO 20 I = 1, LCC + CS( I ) = CC( I ) + 20 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL DPRCN4( NTRA, NC, SNAME, IORDER, UPLO, + $ TRANS, N, K, ALPHA, LDA, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CDSYRK( IORDER, UPLO, TRANS, N, K, ALPHA, + $ AA, LDA, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LDE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = BETS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LDE( CS, CC, LCC ) + ELSE + ISAME( 9 ) = LDERES( 'SY', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 10 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + JC = 1 + DO 40 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + CALL DMMCH( 'T', 'N', LJ, 1, K, ALPHA, + $ A( 1, JJ ), NMAX, + $ A( 1, J ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL DMMCH( 'N', 'T', LJ, 1, K, ALPHA, + $ A( JJ, 1 ), NMAX, + $ A( J, 1 ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + 40 CONTINUE + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL DPRCN4( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA, + $ LDA, BETA, LDC) +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK4. +* + END +* + SUBROUTINE DPRCN4(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDC + DOUBLE PRECISION ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 20X, 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ').' ) + END +* + SUBROUTINE DCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ IORDER ) +* +* Tests DSYR2K. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), + $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), + $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ G( NMAX ), W( 2*NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, BETA, BETS, ERR, ERRMAX + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, + $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, + $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS + LOGICAL NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, UPLO, UPLOS + CHARACTER*2 ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMMCH, CDSYR2K +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHT/'NTC'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 130 + LCC = LDC*N + NULL = N.LE.0 +* + DO 120 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 110 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*NA +* +* Generate the matrix A. +* + IF( TRAN )THEN + CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA, + $ LDA, RESET, ZERO ) + ELSE + CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, + $ RESET, ZERO ) + END IF +* +* Generate the matrix B. +* + LDB = LDA + LBB = LAA + IF( TRAN )THEN + CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ), + $ 2*NMAX, BB, LDB, RESET, ZERO ) + ELSE + CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), + $ NMAX, BB, LDB, RESET, ZERO ) + END IF +* + DO 100 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 90 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 80 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL DMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BETS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL DPRCN5( NTRA, NC, SNAME, IORDER, UPLO, + $ TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CDSYR2K( IORDER, UPLO, TRANS, N, K, + $ ALPHA, AA, LDA, BB, LDB, BETA, + $ CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LDE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LDE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BETS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LDE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LDERES( 'SY', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + JJAB = 1 + JC = 1 + DO 70 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + DO 50 I = 1, K + W( I ) = AB( ( J - 1 )*2*NMAX + K + + $ I ) + W( K + I ) = AB( ( J - 1 )*2*NMAX + + $ I ) + 50 CONTINUE + CALL DMMCH( 'T', 'N', LJ, 1, 2*K, + $ ALPHA, AB( JJAB ), 2*NMAX, + $ W, 2*NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + DO 60 I = 1, K + W( I ) = AB( ( K + I - 1 )*NMAX + + $ J ) + W( K + I ) = AB( ( I - 1 )*NMAX + + $ J ) + 60 CONTINUE + CALL DMMCH( 'N', 'N', LJ, 1, 2*K, + $ ALPHA, AB( JJ ), NMAX, W, + $ 2*NMAX, BETA, C( JJ, J ), + $ NMAX, CT, G, CC( JC ), LDC, + $ EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + IF( TRAN ) + $ JJAB = JJAB + 2*NMAX + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 140 + 70 CONTINUE + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL DPRCN5( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA, + $ LDA, LDB, BETA, LDC) +* + 160 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK5. +* + END +* + SUBROUTINE DPRCN5(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC + DOUBLE PRECISION ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 20X, 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B', I3, ',', F4.1, ', C,', I3, ').' ) + END +* + SUBROUTINE DMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, + $ TRANSL ) +* +* Generates values for an M by N matrix A. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'GE', 'SY' or 'TR'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + DOUBLE PRECISION ROGUE + PARAMETER ( ROGUE = -1.0D10 ) +* .. Scalar Arguments .. + DOUBLE PRECISION TRANSL + INTEGER LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + DOUBLE PRECISION DBEG + EXTERNAL DBEG +* .. Executable Statements .. + GEN = TYPE.EQ.'GE' + SYM = TYPE.EQ.'SY' + TRI = TYPE.EQ.'TR' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + A( I, J ) = DBEG( RESET ) + TRANSL + IF( I.NE.J )THEN +* Set some elements to zero + IF( N.GT.3.AND.J.EQ.N/2 ) + $ A( I, J ) = ZERO + IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'GE' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN + DO 90 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 60 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 70 CONTINUE + DO 80 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + END IF + RETURN +* +* End of DMAKE. +* + END + SUBROUTINE DMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, + $ NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA, BETA, EPS, ERR + INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANSA, TRANSB +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ CC( LDCC, * ), CT( * ), G( * ) +* .. Local Scalars .. + DOUBLE PRECISION ERRI + INTEGER I, J, K + LOGICAL TRANA, TRANB +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, SQRT +* .. Executable Statements .. + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* +* Compute expected result, one column at a time, in CT using data +* in A, B and C. +* Compute gauges in G. +* + DO 120 J = 1, N +* + DO 10 I = 1, M + CT( I ) = ZERO + G( I ) = ZERO + 10 CONTINUE + IF( .NOT.TRANA.AND..NOT.TRANB )THEN + DO 30 K = 1, KK + DO 20 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( K, J ) + G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( K, J ) ) + 20 CONTINUE + 30 CONTINUE + ELSE IF( TRANA.AND..NOT.TRANB )THEN + DO 50 K = 1, KK + DO 40 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( K, J ) + G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( K, J ) ) + 40 CONTINUE + 50 CONTINUE + ELSE IF( .NOT.TRANA.AND.TRANB )THEN + DO 70 K = 1, KK + DO 60 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( J, K ) + G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( J, K ) ) + 60 CONTINUE + 70 CONTINUE + ELSE IF( TRANA.AND.TRANB )THEN + DO 90 K = 1, KK + DO 80 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( J, K ) + G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( J, K ) ) + 80 CONTINUE + 90 CONTINUE + END IF + DO 100 I = 1, M + CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) + G( I ) = ABS( ALPHA )*G( I ) + ABS( BETA )*ABS( C( I, J ) ) + 100 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 110 I = 1, M + ERRI = ABS( CT( I ) - CC( I, J ) )/EPS + IF( G( I ).NE.ZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.ONE ) + $ GO TO 130 + 110 CONTINUE +* + 120 CONTINUE +* +* If the loop completes, all results are at least half accurate. + GO TO 150 +* +* Report fatal error. +* + 130 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 140 I = 1, M + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) + ELSE + WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) + END IF + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9997 )J +* + 150 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RESULT COMPU', + $ 'TED RESULT' ) + 9998 FORMAT( 1X, I7, 2G18.6 ) + 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) +* +* End of DMMCH. +* + END + LOGICAL FUNCTION LDE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + DOUBLE PRECISION RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LDE = .TRUE. + GO TO 30 + 20 CONTINUE + LDE = .FALSE. + 30 RETURN +* +* End of LDE. +* + END + LOGICAL FUNCTION LDERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'GE' or 'SY'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + DOUBLE PRECISION AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'GE' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'SY' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LDERES = .TRUE. + GO TO 80 + 70 CONTINUE + LDERES = .FALSE. + 80 RETURN +* +* End of LDERES. +* + END + DOUBLE PRECISION FUNCTION DBEG( RESET ) +* +* Generates random numbers uniformly distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, MI +* .. Save statement .. + SAVE I, IC, MI +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + I = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I is bounded between 1 and 999. +* If initial I = 1,2,3,6,7 or 9, the period will be 50. +* If initial I = 4 or 8, the period will be 25. +* If initial I = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I in 6. +* + IC = IC + 1 + 10 I = I*MI + I = I - 1000*( I/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + DBEG = ( I - 500 )/1001.0D0 + RETURN +* +* End of DBEG. +* + END + DOUBLE PRECISION FUNCTION DDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + DOUBLE PRECISION X, Y +* .. Executable Statements .. + DDIFF = X - Y + RETURN +* +* End of DDIFF. +* + END diff --git a/ctest/c_s2chke.c b/ctest/c_s2chke.c new file mode 100644 index 0000000..b0a48a6 --- /dev/null +++ b/ctest/c_s2chke.c @@ -0,0 +1,789 @@ +#include +#include +#include "common.h" +#include "cblas_test.h" + +int cblas_ok, cblas_lerr, cblas_info; +int link_xerbla=TRUE; +char *cblas_rout; + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo); +#else +void F77_xerbla(char *srname, void *vinfo); +#endif + +void chkxer(void) { + extern int cblas_ok, cblas_lerr, cblas_info; + extern int link_xerbla; + extern char *cblas_rout; + if (cblas_lerr == 1 ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); + cblas_ok = 0 ; + } + cblas_lerr = 1 ; +} + +void F77_s2chke(char *rout) { + char *sf = ( rout ) ; + float A[2] = {0.0,0.0}, + X[2] = {0.0,0.0}, + Y[2] = {0.0,0.0}, + ALPHA=0.0, BETA=0.0; + extern int cblas_info, cblas_lerr, cblas_ok; + extern int RowMajorStrg; + extern char *cblas_rout; + + if (link_xerbla) /* call these first to link */ + { + cblas_xerbla(cblas_info,cblas_rout,""); + F77_xerbla(cblas_rout,&cblas_info); + } + + cblas_ok = TRUE ; + cblas_lerr = PASSED ; + + if (strncmp( sf,"cblas_sgemv",11)==0) { + cblas_rout = "cblas_sgemv"; + cblas_info = 1; + cblas_sgemv(INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_sgemv(CblasColMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_sgemv(CblasColMajor, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_sgemv(CblasColMajor, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_sgemv(CblasColMajor, CblasNoTrans, 2, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_sgemv(CblasColMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_sgemv(CblasColMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + + cblas_info = 2; RowMajorStrg = TRUE; RowMajorStrg = TRUE; + cblas_sgemv(CblasRowMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_sgemv(CblasRowMajor, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_sgemv(CblasRowMajor, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_sgemv(CblasRowMajor, CblasNoTrans, 0, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_sgemv(CblasRowMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_sgemv(CblasRowMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_sgbmv",11)==0) { + cblas_rout = "cblas_sgbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_sgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_sgbmv(CblasColMajor, INVALID, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_sgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_sgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_sgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_sgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_sgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ssymv",11)==0) { + cblas_rout = "cblas_ssymv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ssymv(INVALID, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ssymv(CblasColMajor, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ssymv(CblasColMajor, CblasUpper, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ssymv(CblasColMajor, CblasUpper, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssymv(CblasColMajor, CblasUpper, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_ssymv(CblasColMajor, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ssymv(CblasRowMajor, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ssymv(CblasRowMajor, CblasUpper, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ssymv(CblasRowMajor, CblasUpper, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssymv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_ssymv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ssbmv",11)==0) { + cblas_rout = "cblas_ssbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ssbmv(INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ssbmv(CblasColMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ssbmv(CblasColMajor, CblasUpper, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssbmv(CblasColMajor, CblasUpper, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ssbmv(CblasColMajor, CblasUpper, 0, 1, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_ssbmv(CblasColMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ssbmv(CblasColMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ssbmv(CblasRowMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ssbmv(CblasRowMajor, CblasUpper, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ssbmv(CblasRowMajor, CblasUpper, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ssbmv(CblasRowMajor, CblasUpper, 0, 1, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_ssbmv(CblasRowMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ssbmv(CblasRowMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_sspmv",11)==0) { + cblas_rout = "cblas_sspmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_sspmv(INVALID, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_sspmv(CblasColMajor, INVALID, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_sspmv(CblasColMajor, CblasUpper, INVALID, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_sspmv(CblasColMajor, CblasUpper, 0, + ALPHA, A, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_sspmv(CblasColMajor, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_sspmv(CblasRowMajor, INVALID, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_sspmv(CblasRowMajor, CblasUpper, INVALID, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_sspmv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_sspmv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_strmv",11)==0) { + cblas_rout = "cblas_strmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_strmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_strmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_strmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_strmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_strmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_strmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_strmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_strmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_strmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_strmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_strmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_stbmv",11)==0) { + cblas_rout = "cblas_stbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_stbmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_stbmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_stbmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_stbmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_stbmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_stpmv",11)==0) { + cblas_rout = "cblas_stpmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_stpmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_stpmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_stpmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_stpmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_stpmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_stpmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_stpmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_stpmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_stpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_stpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_stpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_strsv",11)==0) { + cblas_rout = "cblas_strsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_strsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_strsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_strsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_strsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_strsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_strsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_strsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_strsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_strsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_strsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_strsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_stbsv",11)==0) { + cblas_rout = "cblas_stbsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_stbsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_stbsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_stbsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_stbsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_stbsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_stpsv",11)==0) { + cblas_rout = "cblas_stpsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_stpsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_stpsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_stpsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_stpsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_stpsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_stpsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_stpsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_stpsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_stpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_stpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_stpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_sger",10)==0) { + cblas_rout = "cblas_sger"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_sger(INVALID, 0, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_sger(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_sger(CblasColMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_sger(CblasColMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_sger(CblasColMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_sger(CblasColMajor, 2, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_sger(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_sger(CblasRowMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_sger(CblasRowMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_sger(CblasRowMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_sger(CblasRowMajor, 0, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_ssyr2",11)==0) { + cblas_rout = "cblas_ssyr2"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ssyr2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ssyr2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ssyr2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ssyr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ssyr2(CblasColMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ssyr2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ssyr2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ssyr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ssyr2(CblasRowMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_sspr2",11)==0) { + cblas_rout = "cblas_sspr2"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_sspr2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_sspr2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_sspr2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_sspr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_sspr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_sspr2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_sspr2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_sspr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_sspr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); + chkxer(); + } else if (strncmp( sf,"cblas_ssyr",10)==0) { + cblas_rout = "cblas_ssyr"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ssyr(INVALID, CblasUpper, 0, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ssyr(CblasColMajor, INVALID, 0, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ssyr(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ssyr(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyr(CblasColMajor, CblasUpper, 2, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ssyr(CblasRowMajor, INVALID, 0, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ssyr(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ssyr(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyr(CblasRowMajor, CblasUpper, 2, ALPHA, X, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_sspr",10)==0) { + cblas_rout = "cblas_sspr"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_sspr(INVALID, CblasUpper, 0, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_sspr(CblasColMajor, INVALID, 0, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_sspr(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_sspr(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_sspr(CblasColMajor, INVALID, 0, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_sspr(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_sspr(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, A ); + chkxer(); + } + if (cblas_ok == TRUE) + printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); + else + printf("******* %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); +} diff --git a/ctest/c_s3chke.c b/ctest/c_s3chke.c new file mode 100644 index 0000000..7c832c1 --- /dev/null +++ b/ctest/c_s3chke.c @@ -0,0 +1,1273 @@ +#include +#include +#include "common.h" +#include "cblas_test.h" + +int cblas_ok, cblas_lerr, cblas_info; +int link_xerbla=TRUE; +char *cblas_rout; + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo); +#else +void F77_xerbla(char *srname, void *vinfo); +#endif + +void chkxer(void) { + extern int cblas_ok, cblas_lerr, cblas_info; + extern int link_xerbla; + extern char *cblas_rout; + if (cblas_lerr == 1 ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); + cblas_ok = 0 ; + } + cblas_lerr = 1 ; +} + +void F77_s3chke(char *rout) { + char *sf = ( rout ) ; + float A[2] = {0.0,0.0}, + B[2] = {0.0,0.0}, + C[2] = {0.0,0.0}, + ALPHA=0.0, BETA=0.0; + extern int cblas_info, cblas_lerr, cblas_ok; + extern int RowMajorStrg; + extern char *cblas_rout; + + if (link_xerbla) /* call these first to link */ + { + cblas_xerbla(cblas_info,cblas_rout,""); + F77_xerbla(cblas_rout,&cblas_info); + } + + cblas_ok = TRUE ; + cblas_lerr = PASSED ; + + if (strncmp( sf,"cblas_sgemm" ,11)==0) { + cblas_rout = "cblas_sgemm" ; + cblas_info = 1; + cblas_sgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_sgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_sgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_sgemm( INVALID, CblasTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + cblas_info = 4; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_ssymm" ,11)==0) { + cblas_rout = "cblas_ssymm" ; + + cblas_info = 1; + cblas_ssymm( INVALID, CblasRight, CblasLower, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_strmm" ,11)==0) { + cblas_rout = "cblas_strmm" ; + + cblas_info = 1; + cblas_strmm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_strsm" ,11)==0) { + cblas_rout = "cblas_strsm" ; + + cblas_info = 1; + cblas_strsm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_ssyrk" ,11)==0) { + cblas_rout = "cblas_ssyrk" ; + + cblas_info = 1; + cblas_ssyrk( INVALID, CblasUpper, CblasNoTrans, + 0, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, INVALID, CblasNoTrans, + 0, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasUpper, INVALID, + 0, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasUpper, CblasTrans, + INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasLower, CblasNoTrans, + INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasLower, CblasTrans, + INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasUpper, CblasNoTrans, + 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasUpper, CblasTrans, + 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasLower, CblasNoTrans, + 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasLower, CblasTrans, + 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyrk( CblasRowMajor, CblasUpper, CblasNoTrans, + 0, 2, ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyrk( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyrk( CblasRowMajor, CblasLower, CblasNoTrans, + 0, 2, ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyrk( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasUpper, CblasTrans, + 0, 2, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasLower, CblasTrans, + 0, 2, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_ssyrk( CblasRowMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_ssyrk( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_ssyrk( CblasRowMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_ssyrk( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_ssyr2k" ,12)==0) { + cblas_rout = "cblas_ssyr2k" ; + + cblas_info = 1; + cblas_ssyr2k( INVALID, CblasUpper, CblasNoTrans, + 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, INVALID, CblasNoTrans, + 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, INVALID, + 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasTrans, + INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasNoTrans, + INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasTrans, + INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasNoTrans, + 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasTrans, + 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasNoTrans, + 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasTrans, + 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasUpper, CblasNoTrans, + 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasLower, CblasNoTrans, + 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasTrans, + 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasTrans, + 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasUpper, CblasNoTrans, + 0, 2, ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasLower, CblasNoTrans, + 0, 2, ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasTrans, + 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasTrans, + 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + } + if (cblas_ok == TRUE ) + printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); + else + printf("***** %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); +} diff --git a/ctest/c_sblas1.c b/ctest/c_sblas1.c new file mode 100644 index 0000000..5ccb2d3 --- /dev/null +++ b/ctest/c_sblas1.c @@ -0,0 +1,83 @@ +/* + * c_sblas1.c + * + * The program is a C wrapper for scblat1. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "common.h" +#include "cblas_test.h" + +float F77_sasum(blasint *N, float *X, blasint *incX) +{ + return cblas_sasum(*N, X, *incX); +} + +void F77_saxpy(blasint *N, const float *alpha, const float *X, + blasint *incX, float *Y, blasint *incY) +{ + cblas_saxpy(*N, *alpha, X, *incX, Y, *incY); + return; +} + +float F77_scasum(blasint *N, float *X, blasint *incX) +{ + return cblas_scasum(*N, X, *incX); +} + +float F77_scnrm2(blasint *N, const float *X, blasint *incX) +{ + return cblas_scnrm2(*N, X, *incX); +} + +void F77_scopy(blasint *N, const float *X, blasint *incX, + float *Y, blasint *incY) +{ + cblas_scopy(*N, X, *incX, Y, *incY); + return; +} + +float F77_sdot(blasint *N, const float *X, blasint *incX, + const float *Y, blasint *incY) +{ + return cblas_sdot(*N, X, *incX, Y, *incY); +} + +float F77_snrm2(blasint *N, const float *X, blasint *incX) +{ + return cblas_snrm2(*N, X, *incX); +} + +void F77_srotg( float *a, float *b, float *c, float *s) +{ + cblas_srotg(a,b,c,s); + return; +} + +void F77_srot( blasint *N, float *X, blasint *incX, float *Y, + blasint *incY, const float *c, const float *s) +{ + cblas_srot(*N,X,*incX,Y,*incY,*c,*s); + return; +} + +void F77_sscal(blasint *N, const float *alpha, float *X, + blasint *incX) +{ + cblas_sscal(*N, *alpha, X, *incX); + return; +} + +void F77_sswap( blasint *N, float *X, blasint *incX, + float *Y, blasint *incY) +{ + cblas_sswap(*N,X,*incX,Y,*incY); + return; +} + +int F77_isamax(blasint *N, const float *X, blasint *incX) +{ + if (*N < 1 || *incX < 1) return(0); + return (cblas_isamax(*N, X, *incX)+1); +} diff --git a/ctest/c_sblas2.c b/ctest/c_sblas2.c new file mode 100644 index 0000000..3059525 --- /dev/null +++ b/ctest/c_sblas2.c @@ -0,0 +1,579 @@ +/* + * Written by D.P. Manley, Digital Equipment Corporation. + * Prefixed "C_" to BLAS routines and their declarations. + * + * Modified by T. H. Do, 1/23/98, SGI/CRAY Research. + */ +#include +#include "common.h" +#include "cblas_test.h" + +void F77_sgemv(int *order, char *transp, int *m, int *n, float *alpha, + float *a, int *lda, float *x, int *incx, float *beta, + float *y, int *incy ) { + + float *A; + int i,j,LDA; + enum CBLAS_TRANSPOSE trans; + + get_transpose_type(transp, &trans); + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = ( float* )malloc( (*m)*LDA*sizeof( float ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_sgemv( CblasRowMajor, trans, + *m, *n, *alpha, A, LDA, x, *incx, *beta, y, *incy ); + free(A); + } + else if (*order == TEST_COL_MJR) + cblas_sgemv( CblasColMajor, trans, + *m, *n, *alpha, a, *lda, x, *incx, *beta, y, *incy ); + else + cblas_sgemv( UNDEFINED, trans, + *m, *n, *alpha, a, *lda, x, *incx, *beta, y, *incy ); +} + +void F77_sger(int *order, int *m, int *n, float *alpha, float *x, int *incx, + float *y, int *incy, float *a, int *lda ) { + + float *A; + int i,j,LDA; + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = ( float* )malloc( (*m)*LDA*sizeof( float ) ); + + for( i=0; i<*m; i++ ) { + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + } + + cblas_sger(CblasRowMajor, *m, *n, *alpha, x, *incx, y, *incy, A, LDA ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + a[ (*lda)*j+i ]=A[ LDA*i+j ]; + free(A); + } + else + cblas_sger( CblasColMajor, *m, *n, *alpha, x, *incx, y, *incy, a, *lda ); +} + +void F77_strmv(int *order, char *uplow, char *transp, char *diagn, + int *n, float *a, int *lda, float *x, int *incx) { + float *A; + int i,j,LDA; + enum CBLAS_TRANSPOSE trans; + enum CBLAS_UPLO uplo; + enum CBLAS_DIAG diag; + + get_transpose_type(transp,&trans); + get_uplo_type(uplow,&uplo); + get_diag_type(diagn,&diag); + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_strmv(CblasRowMajor, uplo, trans, diag, *n, A, LDA, x, *incx); + free(A); + } + else if (*order == TEST_COL_MJR) + cblas_strmv(CblasColMajor, uplo, trans, diag, *n, a, *lda, x, *incx); + else { + cblas_strmv(UNDEFINED, uplo, trans, diag, *n, a, *lda, x, *incx); + } +} + +void F77_strsv(int *order, char *uplow, char *transp, char *diagn, + int *n, float *a, int *lda, float *x, int *incx ) { + float *A; + int i,j,LDA; + enum CBLAS_TRANSPOSE trans; + enum CBLAS_UPLO uplo; + enum CBLAS_DIAG diag; + + get_transpose_type(transp,&trans); + get_uplo_type(uplow,&uplo); + get_diag_type(diagn,&diag); + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_strsv(CblasRowMajor, uplo, trans, diag, *n, A, LDA, x, *incx ); + free(A); + } + else + cblas_strsv(CblasColMajor, uplo, trans, diag, *n, a, *lda, x, *incx ); +} +void F77_ssymv(int *order, char *uplow, int *n, float *alpha, float *a, + int *lda, float *x, int *incx, float *beta, float *y, + int *incy) { + float *A; + int i,j,LDA; + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_ssymv(CblasRowMajor, uplo, *n, *alpha, A, LDA, x, *incx, + *beta, y, *incy ); + free(A); + } + else + cblas_ssymv(CblasColMajor, uplo, *n, *alpha, a, *lda, x, *incx, + *beta, y, *incy ); +} + +void F77_ssyr(int *order, char *uplow, int *n, float *alpha, float *x, + int *incx, float *a, int *lda) { + float *A; + int i,j,LDA; + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_ssyr(CblasRowMajor, uplo, *n, *alpha, x, *incx, A, LDA); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + a[ (*lda)*j+i ]=A[ LDA*i+j ]; + free(A); + } + else + cblas_ssyr(CblasColMajor, uplo, *n, *alpha, x, *incx, a, *lda); +} + +void F77_ssyr2(int *order, char *uplow, int *n, float *alpha, float *x, + int *incx, float *y, int *incy, float *a, int *lda) { + float *A; + int i,j,LDA; + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_ssyr2(CblasRowMajor, uplo, *n, *alpha, x, *incx, y, *incy, A, LDA); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + a[ (*lda)*j+i ]=A[ LDA*i+j ]; + free(A); + } + else + cblas_ssyr2(CblasColMajor, uplo, *n, *alpha, x, *incx, y, *incy, a, *lda); +} + +void F77_sgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, + float *alpha, float *a, int *lda, float *x, int *incx, + float *beta, float *y, int *incy ) { + + float *A; + int i,irow,j,jcol,LDA; + enum CBLAS_TRANSPOSE trans; + + get_transpose_type(transp, &trans); + + if (*order == TEST_ROW_MJR) { + LDA = *ku+*kl+2; + A = ( float* )malloc( (*n+*kl)*LDA*sizeof( float ) ); + for( i=0; i<*ku; i++ ){ + irow=*ku+*kl-i; + jcol=(*ku)-i; + for( j=jcol; j<*n; j++ ) + A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; + } + i=*ku; + irow=*ku+*kl-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + for( i=*ku+1; i<*ku+*kl+1; i++ ){ + irow=*ku+*kl-i; + jcol=i-(*ku); + for( j=jcol; j<(*n+*kl); j++ ) + A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; + } + cblas_sgbmv( CblasRowMajor, trans, *m, *n, *kl, *ku, *alpha, + A, LDA, x, *incx, *beta, y, *incy ); + free(A); + } + else + cblas_sgbmv( CblasColMajor, trans, *m, *n, *kl, *ku, *alpha, + a, *lda, x, *incx, *beta, y, *incy ); +} + +void F77_stbmv(int *order, char *uplow, char *transp, char *diagn, + int *n, int *k, float *a, int *lda, float *x, int *incx) { + float *A; + int irow, jcol, i, j, LDA; + enum CBLAS_TRANSPOSE trans; + enum CBLAS_UPLO uplo; + enum CBLAS_DIAG diag; + + get_transpose_type(transp,&trans); + get_uplo_type(uplow,&uplo); + get_diag_type(diagn,&diag); + + if (*order == TEST_ROW_MJR) { + LDA = *k+1; + A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) ); + if (uplo == CblasUpper) { + for( i=0; i<*k; i++ ){ + irow=*k-i; + jcol=(*k)-i; + for( j=jcol; j<*n; j++ ) + A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; + } + i=*k; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + } + else { + i=0; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + for( i=1; i<*k+1; i++ ){ + irow=*k-i; + jcol=i; + for( j=jcol; j<(*n+*k); j++ ) + A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; + } + } + cblas_stbmv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA, x, *incx); + free(A); + } + else + cblas_stbmv(CblasColMajor, uplo, trans, diag, *n, *k, a, *lda, x, *incx); +} + +void F77_stbsv(int *order, char *uplow, char *transp, char *diagn, + int *n, int *k, float *a, int *lda, float *x, int *incx) { + float *A; + int irow, jcol, i, j, LDA; + enum CBLAS_TRANSPOSE trans; + enum CBLAS_UPLO uplo; + enum CBLAS_DIAG diag; + + get_transpose_type(transp,&trans); + get_uplo_type(uplow,&uplo); + get_diag_type(diagn,&diag); + + if (*order == TEST_ROW_MJR) { + LDA = *k+1; + A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) ); + if (uplo == CblasUpper) { + for( i=0; i<*k; i++ ){ + irow=*k-i; + jcol=(*k)-i; + for( j=jcol; j<*n; j++ ) + A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; + } + i=*k; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + } + else { + i=0; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + for( i=1; i<*k+1; i++ ){ + irow=*k-i; + jcol=i; + for( j=jcol; j<(*n+*k); j++ ) + A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; + } + } + cblas_stbsv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA, x, *incx); + free(A); + } + else + cblas_stbsv(CblasColMajor, uplo, trans, diag, *n, *k, a, *lda, x, *incx); +} + +void F77_ssbmv(int *order, char *uplow, int *n, int *k, float *alpha, + float *a, int *lda, float *x, int *incx, float *beta, + float *y, int *incy) { + float *A; + int i,j,irow,jcol,LDA; + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*order == TEST_ROW_MJR) { + LDA = *k+1; + A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) ); + if (uplo == CblasUpper) { + for( i=0; i<*k; i++ ){ + irow=*k-i; + jcol=(*k)-i; + for( j=jcol; j<*n; j++ ) + A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; + } + i=*k; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + } + else { + i=0; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + for( i=1; i<*k+1; i++ ){ + irow=*k-i; + jcol=i; + for( j=jcol; j<(*n+*k); j++ ) + A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; + } + } + cblas_ssbmv(CblasRowMajor, uplo, *n, *k, *alpha, A, LDA, x, *incx, + *beta, y, *incy ); + free(A); + } + else + cblas_ssbmv(CblasColMajor, uplo, *n, *k, *alpha, a, *lda, x, *incx, + *beta, y, *incy ); +} + +void F77_sspmv(int *order, char *uplow, int *n, float *alpha, float *ap, + float *x, int *incx, float *beta, float *y, int *incy) { + float *A,*AP; + int i,j,k,LDA; + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*order == TEST_ROW_MJR) { + LDA = *n; + A = ( float* )malloc( LDA*LDA*sizeof( float ) ); + AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) ); + if (uplo == CblasUpper) { + for( j=0, k=0; j<*n; j++ ) + for( i=0; i +#include +#include "common.h" +#include "cblas_test.h" + +void F77_sgemm(int *order, char *transpa, char *transpb, int *m, int *n, + int *k, float *alpha, float *a, int *lda, float *b, int *ldb, + float *beta, float *c, int *ldc ) { + + float *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_TRANSPOSE transa, transb; + + get_transpose_type(transpa, &transa); + get_transpose_type(transpb, &transb); + + if (*order == TEST_ROW_MJR) { + if (transa == CblasNoTrans) { + LDA = *k+1; + A = (float *)malloc( (*m)*LDA*sizeof( float ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*k; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else { + LDA = *m+1; + A = ( float* )malloc( LDA*(*k)*sizeof( float ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + if (transb == CblasNoTrans) { + LDB = *n+1; + B = ( float* )malloc( (*k)*LDB*sizeof( float ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + } + else { + LDB = *k+1; + B = ( float* )malloc( LDB*(*n)*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + } + LDC = *n+1; + C = ( float* )malloc( (*m)*LDC*sizeof( float ) ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_sgemm( CblasRowMajor, transa, transb, *m, *n, *k, *alpha, A, LDA, + B, LDB, *beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_sgemm( CblasColMajor, transa, transb, *m, *n, *k, *alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); + else + cblas_sgemm( UNDEFINED, transa, transb, *m, *n, *k, *alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); +} +void F77_ssymm(int *order, char *rtlf, char *uplow, int *m, int *n, + float *alpha, float *a, int *lda, float *b, int *ldb, + float *beta, float *c, int *ldc ) { + + float *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_UPLO uplo; + enum CBLAS_SIDE side; + + get_uplo_type(uplow,&uplo); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A = ( float* )malloc( (*m)*LDA*sizeof( float ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDB = *n+1; + B = ( float* )malloc( (*m)*LDB*sizeof( float ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + LDC = *n+1; + C = ( float* )malloc( (*m)*LDC*sizeof( float ) ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_ssymm( CblasRowMajor, side, uplo, *m, *n, *alpha, A, LDA, B, LDB, + *beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_ssymm( CblasColMajor, side, uplo, *m, *n, *alpha, a, *lda, b, *ldb, + *beta, c, *ldc ); + else + cblas_ssymm( UNDEFINED, side, uplo, *m, *n, *alpha, a, *lda, b, *ldb, + *beta, c, *ldc ); +} + +void F77_ssyrk(int *order, char *uplow, char *transp, int *n, int *k, + float *alpha, float *a, int *lda, + float *beta, float *c, int *ldc ) { + + int i,j,LDA,LDC; + float *A, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A = ( float* )malloc( (*k)*LDA*sizeof( float ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDC = *n+1; + C = ( float* )malloc( (*n)*LDC*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_ssyrk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta, + C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_ssyrk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); + else + cblas_ssyrk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); +} + +void F77_ssyr2k(int *order, char *uplow, char *transp, int *n, int *k, + float *alpha, float *a, int *lda, float *b, int *ldb, + float *beta, float *c, int *ldc ) { + int i,j,LDA,LDB,LDC; + float *A, *B, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + LDB = *k+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + B = ( float* )malloc( (*n)*LDB*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j]=a[j*(*lda)+i]; + B[i*LDB+j]=b[j*(*ldb)+i]; + } + } + else { + LDA = *n+1; + LDB = *n+1; + A = ( float* )malloc( LDA*(*k)*sizeof( float ) ); + B = ( float* )malloc( LDB*(*k)*sizeof( float ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ){ + A[i*LDA+j]=a[j*(*lda)+i]; + B[i*LDB+j]=b[j*(*ldb)+i]; + } + } + LDC = *n+1; + C = ( float* )malloc( (*n)*LDC*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_ssyr2k(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, + B, LDB, *beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_ssyr2k(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); + else + cblas_ssyr2k(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); +} +void F77_strmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, float *alpha, float *a, int *lda, float *b, + int *ldb) { + int i,j,LDA,LDB; + float *A, *B; + enum CBLAS_SIDE side; + enum CBLAS_DIAG diag; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A = ( float* )malloc( (*m)*LDA*sizeof( float ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDB = *n+1; + B = ( float* )malloc( (*m)*LDB*sizeof( float ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + cblas_strmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, *alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + b[j*(*ldb)+i]=B[i*LDB+j]; + free(A); + free(B); + } + else if (*order == TEST_COL_MJR) + cblas_strmm(CblasColMajor, side, uplo, trans, diag, *m, *n, *alpha, + a, *lda, b, *ldb); + else + cblas_strmm(UNDEFINED, side, uplo, trans, diag, *m, *n, *alpha, + a, *lda, b, *ldb); +} + +void F77_strsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, float *alpha, float *a, int *lda, float *b, + int *ldb) { + int i,j,LDA,LDB; + float *A, *B; + enum CBLAS_SIDE side; + enum CBLAS_DIAG diag; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A = ( float* )malloc( (*m)*LDA*sizeof( float ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDB = *n+1; + B = ( float* )malloc( (*m)*LDB*sizeof( float ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + cblas_strsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, *alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + b[j*(*ldb)+i]=B[i*LDB+j]; + free(A); + free(B); + } + else if (*order == TEST_COL_MJR) + cblas_strsm(CblasColMajor, side, uplo, trans, diag, *m, *n, *alpha, + a, *lda, b, *ldb); + else + cblas_strsm(UNDEFINED, side, uplo, trans, diag, *m, *n, *alpha, + a, *lda, b, *ldb); +} diff --git a/ctest/c_sblat1.f b/ctest/c_sblat1.f new file mode 100644 index 0000000..de2b038 --- /dev/null +++ b/ctest/c_sblat1.f @@ -0,0 +1,728 @@ + PROGRAM SCBLAT1 +* Test program for the REAL Level 1 CBLAS. +* Based upon the original CBLAS test routine together with: +* F06EAF Example Program Text +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SFAC + INTEGER IC +* .. External Subroutines .. + EXTERNAL CHECK0, CHECK1, CHECK2, CHECK3, HEADER +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SFAC/9.765625E-4/ +* .. Executable Statements .. + WRITE (NOUT,99999) + DO 20 IC = 1, 10 + ICASE = IC + CALL HEADER +* +* .. Initialize PASS, INCX, INCY, and MODE for a new case. .. +* .. the value 9999 for INCX, INCY or MODE will appear in the .. +* .. detailed output, if any, for cases that do not involve .. +* .. these parameters .. +* + PASS = .TRUE. + INCX = 9999 + INCY = 9999 + MODE = 9999 + IF (ICASE.EQ.3) THEN + CALL CHECK0(SFAC) + ELSE IF (ICASE.EQ.7 .OR. ICASE.EQ.8 .OR. ICASE.EQ.9 .OR. + + ICASE.EQ.10) THEN + CALL CHECK1(SFAC) + ELSE IF (ICASE.EQ.1 .OR. ICASE.EQ.2 .OR. ICASE.EQ.5 .OR. + + ICASE.EQ.6) THEN + CALL CHECK2(SFAC) + ELSE IF (ICASE.EQ.4) THEN + CALL CHECK3(SFAC) + END IF +* -- Print + IF (PASS) WRITE (NOUT,99998) + 20 CONTINUE + STOP +* +99999 FORMAT (' Real CBLAS Test Program Results',/1X) +99998 FORMAT (' ----- PASS -----') + END + SUBROUTINE HEADER +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Arrays .. + CHARACTER*15 L(10) +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA L(1)/'CBLAS_SDOT '/ + DATA L(2)/'CBLAS_SAXPY '/ + DATA L(3)/'CBLAS_SROTG '/ + DATA L(4)/'CBLAS_SROT '/ + DATA L(5)/'CBLAS_SCOPY '/ + DATA L(6)/'CBLAS_SSWAP '/ + DATA L(7)/'CBLAS_SNRM2 '/ + DATA L(8)/'CBLAS_SASUM '/ + DATA L(9)/'CBLAS_SSCAL '/ + DATA L(10)/'CBLAS_ISAMAX'/ +* .. Executable Statements .. + WRITE (NOUT,99999) ICASE, L(ICASE) + RETURN +* +99999 FORMAT (/' Test of subprogram number',I3,9X,A15) + END + SUBROUTINE CHECK0(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SA, SB, SC, SS + INTEGER K +* .. Local Arrays .. + REAL DA1(8), DATRUE(8), DB1(8), DBTRUE(8), DC1(8), + + DS1(8) +* .. External Subroutines .. + EXTERNAL SROTGTEST, STEST1 +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA DA1/0.3E0, 0.4E0, -0.3E0, -0.4E0, -0.3E0, 0.0E0, + + 0.0E0, 1.0E0/ + DATA DB1/0.4E0, 0.3E0, 0.4E0, 0.3E0, -0.4E0, 0.0E0, + + 1.0E0, 0.0E0/ + DATA DC1/0.6E0, 0.8E0, -0.6E0, 0.8E0, 0.6E0, 1.0E0, + + 0.0E0, 1.0E0/ + DATA DS1/0.8E0, 0.6E0, 0.8E0, -0.6E0, 0.8E0, 0.0E0, + + 1.0E0, 0.0E0/ + DATA DATRUE/0.5E0, 0.5E0, 0.5E0, -0.5E0, -0.5E0, + + 0.0E0, 1.0E0, 1.0E0/ + DATA DBTRUE/0.0E0, 0.6E0, 0.0E0, -0.6E0, 0.0E0, + + 0.0E0, 1.0E0, 0.0E0/ +* .. Executable Statements .. +* +* Compute true values which cannot be prestored +* in decimal notation +* + DBTRUE(1) = 1.0E0/0.6E0 + DBTRUE(3) = -1.0E0/0.6E0 + DBTRUE(5) = 1.0E0/0.6E0 +* + DO 20 K = 1, 8 +* .. Set N=K for identification in output if any .. + N = K + IF (ICASE.EQ.3) THEN +* .. SROTGTEST .. + IF (K.GT.8) GO TO 40 + SA = DA1(K) + SB = DB1(K) + CALL SROTGTEST(SA,SB,SC,SS) + CALL STEST1(SA,DATRUE(K),DATRUE(K),SFAC) + CALL STEST1(SB,DBTRUE(K),DBTRUE(K),SFAC) + CALL STEST1(SC,DC1(K),DC1(K),SFAC) + CALL STEST1(SS,DS1(K),DS1(K),SFAC) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK0' + STOP + END IF + 20 CONTINUE + 40 RETURN + END + SUBROUTINE CHECK1(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER I, LEN, NP1 +* .. Local Arrays .. + REAL DTRUE1(5), DTRUE3(5), DTRUE5(8,5,2), DV(8,5,2), + + SA(10), STEMP(1), STRUE(8), SX(8) + INTEGER ITRUE2(5) +* .. External Functions .. + REAL SASUMTEST, SNRM2TEST + INTEGER ISAMAXTEST + EXTERNAL SASUMTEST, SNRM2TEST, ISAMAXTEST +* .. External Subroutines .. + EXTERNAL ITEST1, SSCALTEST, STEST, STEST1 +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA/0.3E0, -1.0E0, 0.0E0, 1.0E0, 0.3E0, 0.3E0, + + 0.3E0, 0.3E0, 0.3E0, 0.3E0/ + DATA DV/0.1E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, + + 2.0E0, 2.0E0, 0.3E0, 3.0E0, 3.0E0, 3.0E0, 3.0E0, + + 3.0E0, 3.0E0, 3.0E0, 0.3E0, -0.4E0, 4.0E0, + + 4.0E0, 4.0E0, 4.0E0, 4.0E0, 4.0E0, 0.2E0, + + -0.6E0, 0.3E0, 5.0E0, 5.0E0, 5.0E0, 5.0E0, + + 5.0E0, 0.1E0, -0.3E0, 0.5E0, -0.1E0, 6.0E0, + + 6.0E0, 6.0E0, 6.0E0, 0.1E0, 8.0E0, 8.0E0, 8.0E0, + + 8.0E0, 8.0E0, 8.0E0, 8.0E0, 0.3E0, 9.0E0, 9.0E0, + + 9.0E0, 9.0E0, 9.0E0, 9.0E0, 9.0E0, 0.3E0, 2.0E0, + + -0.4E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, + + 0.2E0, 3.0E0, -0.6E0, 5.0E0, 0.3E0, 2.0E0, + + 2.0E0, 2.0E0, 0.1E0, 4.0E0, -0.3E0, 6.0E0, + + -0.5E0, 7.0E0, -0.1E0, 3.0E0/ + DATA DTRUE1/0.0E0, 0.3E0, 0.5E0, 0.7E0, 0.6E0/ + DATA DTRUE3/0.0E0, 0.3E0, 0.7E0, 1.1E0, 1.0E0/ + DATA DTRUE5/0.10E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, + + 2.0E0, 2.0E0, 2.0E0, -0.3E0, 3.0E0, 3.0E0, + + 3.0E0, 3.0E0, 3.0E0, 3.0E0, 3.0E0, 0.0E0, 0.0E0, + + 4.0E0, 4.0E0, 4.0E0, 4.0E0, 4.0E0, 4.0E0, + + 0.20E0, -0.60E0, 0.30E0, 5.0E0, 5.0E0, 5.0E0, + + 5.0E0, 5.0E0, 0.03E0, -0.09E0, 0.15E0, -0.03E0, + + 6.0E0, 6.0E0, 6.0E0, 6.0E0, 0.10E0, 8.0E0, + + 8.0E0, 8.0E0, 8.0E0, 8.0E0, 8.0E0, 8.0E0, + + 0.09E0, 9.0E0, 9.0E0, 9.0E0, 9.0E0, 9.0E0, + + 9.0E0, 9.0E0, 0.09E0, 2.0E0, -0.12E0, 2.0E0, + + 2.0E0, 2.0E0, 2.0E0, 2.0E0, 0.06E0, 3.0E0, + + -0.18E0, 5.0E0, 0.09E0, 2.0E0, 2.0E0, 2.0E0, + + 0.03E0, 4.0E0, -0.09E0, 6.0E0, -0.15E0, 7.0E0, + + -0.03E0, 3.0E0/ + DATA ITRUE2/0, 1, 2, 2, 3/ +* .. Executable Statements .. + DO 80 INCX = 1, 2 + DO 60 NP1 = 1, 5 + N = NP1 - 1 + LEN = 2*MAX(N,1) +* .. Set vector arguments .. + DO 20 I = 1, LEN + SX(I) = DV(I,NP1,INCX) + 20 CONTINUE +* + IF (ICASE.EQ.7) THEN +* .. SNRM2TEST .. + STEMP(1) = DTRUE1(NP1) + CALL STEST1(SNRM2TEST(N,SX,INCX),STEMP,STEMP,SFAC) + ELSE IF (ICASE.EQ.8) THEN +* .. SASUMTEST .. + STEMP(1) = DTRUE3(NP1) + CALL STEST1(SASUMTEST(N,SX,INCX),STEMP,STEMP,SFAC) + ELSE IF (ICASE.EQ.9) THEN +* .. SSCALTEST .. + CALL SSCALTEST(N,SA((INCX-1)*5+NP1),SX,INCX) + DO 40 I = 1, LEN + STRUE(I) = DTRUE5(I,NP1,INCX) + 40 CONTINUE + CALL STEST(LEN,SX,STRUE,STRUE,SFAC) + ELSE IF (ICASE.EQ.10) THEN +* .. ISAMAXTEST .. + CALL ITEST1(ISAMAXTEST(N,SX,INCX),ITRUE2(NP1)) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' + STOP + END IF + 60 CONTINUE + 80 CONTINUE + RETURN + END + SUBROUTINE CHECK2(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SA + INTEGER I, J, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + REAL DT10X(7,4,4), DT10Y(7,4,4), DT7(4,4), + + DT8(7,4,4), DX1(7), + + DY1(7), SSIZE1(4), SSIZE2(14,2), STX(7), STY(7), + + SX(7), SY(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), NS(4) +* .. External Functions .. + REAL SDOTTEST + EXTERNAL SDOTTEST +* .. External Subroutines .. + EXTERNAL SAXPYTEST, SCOPYTEST, SSWAPTEST, STEST, STEST1 +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA/0.3E0/ + DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA DX1/0.6E0, 0.1E0, -0.5E0, 0.8E0, 0.9E0, -0.3E0, + + -0.4E0/ + DATA DY1/0.5E0, -0.9E0, 0.3E0, 0.7E0, -0.6E0, 0.2E0, + + 0.8E0/ + DATA DT7/0.0E0, 0.30E0, 0.21E0, 0.62E0, 0.0E0, + + 0.30E0, -0.07E0, 0.85E0, 0.0E0, 0.30E0, -0.79E0, + + -0.74E0, 0.0E0, 0.30E0, 0.33E0, 1.27E0/ + DATA DT8/0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.68E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.68E0, -0.87E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.68E0, -0.87E0, 0.15E0, + + 0.94E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.68E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.35E0, -0.9E0, 0.48E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.38E0, -0.9E0, 0.57E0, 0.7E0, -0.75E0, + + 0.2E0, 0.98E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.68E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.35E0, -0.72E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.38E0, + + -0.63E0, 0.15E0, 0.88E0, 0.0E0, 0.0E0, 0.0E0, + + 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.68E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.68E0, -0.9E0, 0.33E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.68E0, -0.9E0, 0.33E0, 0.7E0, + + -0.75E0, 0.2E0, 1.04E0/ + DATA DT10X/0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.5E0, -0.9E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.5E0, -0.9E0, 0.3E0, 0.7E0, + + 0.0E0, 0.0E0, 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.3E0, 0.1E0, 0.5E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.8E0, 0.1E0, -0.6E0, + + 0.8E0, 0.3E0, -0.3E0, 0.5E0, 0.6E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, -0.9E0, + + 0.1E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.7E0, + + 0.1E0, 0.3E0, 0.8E0, -0.9E0, -0.3E0, 0.5E0, + + 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.5E0, 0.3E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.5E0, 0.3E0, -0.6E0, 0.8E0, 0.0E0, 0.0E0, + + 0.0E0/ + DATA DT10Y/0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.6E0, 0.1E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.6E0, 0.1E0, -0.5E0, 0.8E0, 0.0E0, + + 0.0E0, 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, -0.5E0, -0.9E0, 0.6E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, -0.4E0, -0.9E0, 0.9E0, + + 0.7E0, -0.5E0, 0.2E0, 0.6E0, 0.5E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.6E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, -0.5E0, + + 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + -0.4E0, 0.9E0, -0.5E0, 0.6E0, 0.0E0, 0.0E0, + + 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.6E0, -0.9E0, 0.1E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.6E0, -0.9E0, 0.1E0, 0.7E0, + + -0.5E0, 0.2E0, 0.8E0/ + DATA SSIZE1/0.0E0, 0.3E0, 1.6E0, 3.2E0/ + DATA SSIZE2/0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + + 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + + 1.17E0, 1.17E0, 1.17E0/ +* .. Executable Statements .. +* + DO 120 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 100 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* .. Initialize all argument arrays .. + DO 20 I = 1, 7 + SX(I) = DX1(I) + SY(I) = DY1(I) + 20 CONTINUE +* + IF (ICASE.EQ.1) THEN +* .. SDOTTEST .. + CALL STEST1(SDOTTEST(N,SX,INCX,SY,INCY),DT7(KN,KI), + + SSIZE1(KN),SFAC) + ELSE IF (ICASE.EQ.2) THEN +* .. SAXPYTEST .. + CALL SAXPYTEST(N,SA,SX,INCX,SY,INCY) + DO 40 J = 1, LENY + STY(J) = DT8(J,KN,KI) + 40 CONTINUE + CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) + ELSE IF (ICASE.EQ.5) THEN +* .. SCOPYTEST .. + DO 60 I = 1, 7 + STY(I) = DT10Y(I,KN,KI) + 60 CONTINUE + CALL SCOPYTEST(N,SX,INCX,SY,INCY) + CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0E0) + ELSE IF (ICASE.EQ.6) THEN +* .. SSWAPTEST .. + CALL SSWAPTEST(N,SX,INCX,SY,INCY) + DO 80 I = 1, 7 + STX(I) = DT10X(I,KN,KI) + STY(I) = DT10Y(I,KN,KI) + 80 CONTINUE + CALL STEST(LENX,SX,STX,SSIZE2(1,1),1.0E0) + CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0E0) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' + STOP + END IF + 100 CONTINUE + 120 CONTINUE + RETURN + END + SUBROUTINE CHECK3(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SC, SS + INTEGER I, K, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + REAL COPYX(5), COPYY(5), DT9X(7,4,4), DT9Y(7,4,4), + + DX1(7), DY1(7), MWPC(11), MWPS(11), MWPSTX(5), + + MWPSTY(5), MWPTX(11,5), MWPTY(11,5), MWPX(5), + + MWPY(5), SSIZE2(14,2), STX(7), STY(7), SX(7), + + SY(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), MWPINX(11), + + MWPINY(11), MWPN(11), NS(4) +* .. External Subroutines .. + EXTERNAL SROTTEST, STEST +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA DX1/0.6E0, 0.1E0, -0.5E0, 0.8E0, 0.9E0, -0.3E0, + + -0.4E0/ + DATA DY1/0.5E0, -0.9E0, 0.3E0, 0.7E0, -0.6E0, 0.2E0, + + 0.8E0/ + DATA SC, SS/0.8E0, 0.6E0/ + DATA DT9X/0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.78E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.78E0, -0.46E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.78E0, -0.46E0, -0.22E0, + + 1.06E0, 0.0E0, 0.0E0, 0.0E0, 0.6E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.78E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.66E0, 0.1E0, -0.1E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.96E0, 0.1E0, -0.76E0, 0.8E0, 0.90E0, + + -0.3E0, -0.02E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.78E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, -0.06E0, 0.1E0, + + -0.1E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.90E0, + + 0.1E0, -0.22E0, 0.8E0, 0.18E0, -0.3E0, -0.02E0, + + 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.78E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.78E0, 0.26E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.78E0, 0.26E0, -0.76E0, 1.12E0, + + 0.0E0, 0.0E0, 0.0E0/ + DATA DT9Y/0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.04E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.04E0, -0.78E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.04E0, -0.78E0, 0.54E0, + + 0.08E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.04E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.7E0, + + -0.9E0, -0.12E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.64E0, -0.9E0, -0.30E0, 0.7E0, -0.18E0, 0.2E0, + + 0.28E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.04E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.7E0, -1.08E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.64E0, -1.26E0, + + 0.54E0, 0.20E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.04E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.04E0, -0.9E0, 0.18E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.04E0, -0.9E0, 0.18E0, 0.7E0, + + -0.18E0, 0.2E0, 0.16E0/ + DATA SSIZE2/0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + + 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + + 1.17E0, 1.17E0, 1.17E0/ +* .. Executable Statements .. +* + DO 60 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 40 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* + IF (ICASE.EQ.4) THEN +* .. SROTTEST .. + DO 20 I = 1, 7 + SX(I) = DX1(I) + SY(I) = DY1(I) + STX(I) = DT9X(I,KN,KI) + STY(I) = DT9Y(I,KN,KI) + 20 CONTINUE + CALL SROTTEST(N,SX,INCX,SY,INCY,SC,SS) + CALL STEST(LENX,SX,STX,SSIZE2(1,KSIZE),SFAC) + CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' + STOP + END IF + 40 CONTINUE + 60 CONTINUE +* + MWPC(1) = 1 + DO 80 I = 2, 11 + MWPC(I) = 0 + 80 CONTINUE + MWPS(1) = 0 + DO 100 I = 2, 6 + MWPS(I) = 1 + 100 CONTINUE + DO 120 I = 7, 11 + MWPS(I) = -1 + 120 CONTINUE + MWPINX(1) = 1 + MWPINX(2) = 1 + MWPINX(3) = 1 + MWPINX(4) = -1 + MWPINX(5) = 1 + MWPINX(6) = -1 + MWPINX(7) = 1 + MWPINX(8) = 1 + MWPINX(9) = -1 + MWPINX(10) = 1 + MWPINX(11) = -1 + MWPINY(1) = 1 + MWPINY(2) = 1 + MWPINY(3) = -1 + MWPINY(4) = -1 + MWPINY(5) = 2 + MWPINY(6) = 1 + MWPINY(7) = 1 + MWPINY(8) = -1 + MWPINY(9) = -1 + MWPINY(10) = 2 + MWPINY(11) = 1 + DO 140 I = 1, 11 + MWPN(I) = 5 + 140 CONTINUE + MWPN(5) = 3 + MWPN(10) = 3 + DO 160 I = 1, 5 + MWPX(I) = I + MWPY(I) = I + MWPTX(1,I) = I + MWPTY(1,I) = I + MWPTX(2,I) = I + MWPTY(2,I) = -I + MWPTX(3,I) = 6 - I + MWPTY(3,I) = I - 6 + MWPTX(4,I) = I + MWPTY(4,I) = -I + MWPTX(6,I) = 6 - I + MWPTY(6,I) = I - 6 + MWPTX(7,I) = -I + MWPTY(7,I) = I + MWPTX(8,I) = I - 6 + MWPTY(8,I) = 6 - I + MWPTX(9,I) = -I + MWPTY(9,I) = I + MWPTX(11,I) = I - 6 + MWPTY(11,I) = 6 - I + 160 CONTINUE + MWPTX(5,1) = 1 + MWPTX(5,2) = 3 + MWPTX(5,3) = 5 + MWPTX(5,4) = 4 + MWPTX(5,5) = 5 + MWPTY(5,1) = -1 + MWPTY(5,2) = 2 + MWPTY(5,3) = -2 + MWPTY(5,4) = 4 + MWPTY(5,5) = -3 + MWPTX(10,1) = -1 + MWPTX(10,2) = -3 + MWPTX(10,3) = -5 + MWPTX(10,4) = 4 + MWPTX(10,5) = 5 + MWPTY(10,1) = 1 + MWPTY(10,2) = 2 + MWPTY(10,3) = 2 + MWPTY(10,4) = 4 + MWPTY(10,5) = 3 + DO 200 I = 1, 11 + INCX = MWPINX(I) + INCY = MWPINY(I) + DO 180 K = 1, 5 + COPYX(K) = MWPX(K) + COPYY(K) = MWPY(K) + MWPSTX(K) = MWPTX(I,K) + MWPSTY(K) = MWPTY(I,K) + 180 CONTINUE + CALL SROTTEST(MWPN(I),COPYX,INCX,COPYY,INCY,MWPC(I),MWPS(I)) + CALL STEST(5,COPYX,MWPSTX,MWPSTX,SFAC) + CALL STEST(5,COPYY,MWPSTY,MWPSTY,SFAC) + 200 CONTINUE + RETURN + END + SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) +* ********************************* STEST ************************** +* +* THIS SUBR COMPARES ARRAYS SCOMP() AND STRUE() OF LENGTH LEN TO +* SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE +* NEGLIGIBLE. +* +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC + INTEGER LEN +* .. Array Arguments .. + REAL SCOMP(LEN), SSIZE(LEN), STRUE(LEN) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SD + INTEGER I +* .. External Functions .. + REAL SDIFF + EXTERNAL SDIFF +* .. Intrinsic Functions .. + INTRINSIC ABS +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + DO 40 I = 1, LEN + SD = SCOMP(I) - STRUE(I) + IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0E0) + + GO TO 40 +* +* HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I), + + STRUE(I), SD, SSIZE(I) + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE I ', + + ' COMP(I) TRUE(I) DIFFERENCE', + + ' SIZE(I)',/1X) +99997 FORMAT (1X,I4,I3,3I5,I3,2E36.8,2E12.4) + END + SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) +* ************************* STEST1 ***************************** +* +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE +* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + REAL SCOMP1, SFAC, STRUE1 +* .. Array Arguments .. + REAL SSIZE(*) +* .. Local Arrays .. + REAL SCOMP(1), STRUE(1) +* .. External Subroutines .. + EXTERNAL STEST +* .. Executable Statements .. +* + SCOMP(1) = SCOMP1 + STRUE(1) = STRUE1 + CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC) +* + RETURN + END + REAL FUNCTION SDIFF(SA,SB) +* ********************************* SDIFF ************************** +* COMPUTES DIFFERENCE OF TWO NUMBERS. C. L. LAWSON, JPL 1974 FEB 15 +* +* .. Scalar Arguments .. + REAL SA, SB +* .. Executable Statements .. + SDIFF = SA - SB + RETURN + END + SUBROUTINE ITEST1(ICOMP,ITRUE) +* ********************************* ITEST1 ************************* +* +* THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR +* EQUALITY. +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + INTEGER ICOMP, ITRUE +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER ID +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + IF (ICOMP.EQ.ITRUE) GO TO 40 +* +* HERE ICOMP IS NOT EQUAL TO ITRUE. +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 ID = ICOMP - ITRUE + WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE ', + + ' COMP TRUE DIFFERENCE', + + /1X) +99997 FORMAT (1X,I4,I3,3I5,2I36,I12) + END diff --git a/ctest/c_sblat2.f b/ctest/c_sblat2.f new file mode 100644 index 0000000..bf6f3e4 --- /dev/null +++ b/ctest/c_sblat2.f @@ -0,0 +1,2907 @@ + PROGRAM SBLAT2 +* +* Test program for the REAL Level 2 Blas. +* +* The program must be driven by a short data file. The first 17 records +* of the file are read using list-directed input, the last 16 records +* are read using the format ( A12, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 33 lines: +* 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 4 NUMBER OF VALUES OF K +* 0 1 2 4 VALUES OF K +* 4 NUMBER OF VALUES OF INCX AND INCY +* 1 2 -1 -2 VALUES OF INCX AND INCY +* 3 NUMBER OF VALUES OF ALPHA +* 0.0 1.0 0.7 VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* 0.0 1.0 0.9 VALUES OF BETA +* cblas_sgemv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_sgbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ssymv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ssbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_sspmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_strmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_stbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_stpmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_strsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_stbsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_stpsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_sger T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ssyr T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_sspr T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ssyr2 T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_sspr2 T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +* An extended set of Fortran Basic Linear Algebra Subprograms. +* +* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +* and Computer Science Division, Argonne National Laboratory, +* 9700 South Cass Avenue, Argonne, Illinois 60439, US. +* +* Or +* +* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +* +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 16 ) + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) + INTEGER NMAX, INCMAX + PARAMETER ( NMAX = 65, INCMAX = 2 ) + INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX + PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, + $ NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + REAL EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB, + $ NTRA, LAYOUT + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR, CORDER, RORDER + CHARACTER*1 TRANS + CHARACTER*12 SNAMET + CHARACTER*32 SNAPS +* .. Local Arrays .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ), + $ G( NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( 2*NMAX ) + INTEGER IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*12 SNAMES( NSUBS ) +* .. External Functions .. + REAL SDIFF + LOGICAL LSE + EXTERNAL SDIFF, LSE +* .. External Subroutines .. + EXTERNAL SCHK1, SCHK2, SCHK3, SCHK4, SCHK5, SCHK6, + $ CS2CHKE, SMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK + CHARACTER*12 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'cblas_sgemv ', 'cblas_sgbmv ', + $ 'cblas_ssymv ','cblas_ssbmv ','cblas_sspmv ', + $ 'cblas_strmv ','cblas_stbmv ','cblas_stpmv ', + $ 'cblas_strsv ','cblas_stbsv ','cblas_stpsv ', + $ 'cblas_sger ','cblas_ssyr ','cblas_sspr ', + $ 'cblas_ssyr2 ','cblas_sspr2 '/ +* .. Executable Statements .. +* + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the flag that indicates whether row-major data layout to be tested. + READ( NIN, FMT = * )LAYOUT +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 230 + END IF + 10 CONTINUE +* Values of K + READ( NIN, FMT = * )NKB + IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN + WRITE( NOUT, FMT = 9997 )'K', NKBMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( KB( I ), I = 1, NKB ) + DO 20 I = 1, NKB + IF( KB( I ).LT.0 )THEN + WRITE( NOUT, FMT = 9995 ) + GO TO 230 + END IF + 20 CONTINUE +* Values of INCX and INCY + READ( NIN, FMT = * )NINC + IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN + WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( INC( I ), I = 1, NINC ) + DO 30 I = 1, NINC + IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN + WRITE( NOUT, FMT = 9994 )INCMAX + GO TO 230 + END IF + 30 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9993 ) + WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB ) + WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC ) + WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9980 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) + + RORDER = .FALSE. + CORDER = .FALSE. + IF (LAYOUT.EQ.2) THEN + RORDER = .TRUE. + CORDER = .TRUE. + WRITE( *, FMT = 10002 ) + ELSE IF (LAYOUT.EQ.1) THEN + RORDER = .TRUE. + WRITE( *, FMT = 10001 ) + ELSE IF (LAYOUT.EQ.0) THEN + CORDER = .TRUE. + WRITE( *, FMT = 10000 ) + END IF + WRITE( *, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 40 I = 1, NSUBS + LTEST( I ) = .FALSE. + 40 CONTINUE + 50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT + DO 60 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 70 + 60 CONTINUE + WRITE( NOUT, FMT = 9986 )SNAMET + STOP + 70 LTEST( I ) = LTESTT + GO TO 50 +* + 80 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = ONE + 90 CONTINUE + IF( SDIFF( ONE + EPS, ONE ).EQ.ZERO ) + $ GO TO 100 + EPS = HALF*EPS + GO TO 90 + 100 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of SMVCH using exact data. +* + N = MIN( 32, NMAX ) + DO 120 J = 1, N + DO 110 I = 1, N + A( I, J ) = MAX( I - J + 1, 0 ) + 110 CONTINUE + X( J ) = J + Y( J ) = ZERO + 120 CONTINUE + DO 130 J = 1, N + YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE +* YY holds the exact result. On exit from SMVCH YT holds +* the result computed by SMVCH. + TRANS = 'N' + CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF + TRANS = 'T' + CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 210 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL CS2CHKE( SNAMES( ISNUM ) ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 140, 150, 150, 150, 160, 160, + $ 160, 160, 160, 160, 170, 180, 180, + $ 190, 190 )ISNUM +* Test SGEMV, 01, and SGBMV, 02. + 140 IF (CORDER) THEN + CALL SCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 0 ) + END IF + IF (RORDER) THEN + CALL SCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 1 ) + END IF + GO TO 200 +* Test SSYMV, 03, SSBMV, 04, and SSPMV, 05. + 150 IF (CORDER) THEN + CALL SCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 0 ) + END IF + IF (RORDER) THEN + CALL SCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 1 ) + END IF + GO TO 200 +* Test STRMV, 06, STBMV, 07, STPMV, 08, +* STRSV, 09, STBSV, 10, and STPSV, 11. + 160 IF (CORDER) THEN + CALL SCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, + $ 0 ) + END IF + IF (RORDER) THEN + CALL SCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, + $ 1 ) + END IF + GO TO 200 +* Test SGER, 12. + 170 IF (CORDER) THEN + CALL SCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL SCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF + GO TO 200 +* Test SSYR, 13, and SSPR, 14. + 180 IF (CORDER) THEN + CALL SCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL SCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF + GO TO 200 +* Test SSYR2, 15, and SSPR2, 16. + 190 IF (CORDER) THEN + CALL SCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL SCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF +* + 200 IF( FATAL.AND.SFATAL ) + $ GO TO 220 + END IF + 210 CONTINUE + WRITE( NOUT, FMT = 9982 ) + GO TO 240 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9981 ) + GO TO 240 +* + 230 CONTINUE + WRITE( NOUT, FMT = 9987 ) +* + 240 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* +10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) +10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) +10000 FORMAT( ' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' ) + 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ', + $ I2 ) + 9993 FORMAT( ' TESTS OF THE REAL LEVEL 2 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9992 FORMAT( ' FOR N ', 9I6 ) + 9991 FORMAT( ' FOR K ', 7I6 ) + 9990 FORMAT( ' FOR INCX AND INCY ', 7I6 ) + 9989 FORMAT( ' FOR ALPHA ', 7F6.1 ) + 9988 FORMAT( ' FOR BETA ', 7F6.1 ) + 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9986 FORMAT( ' SUBPROGRAM NAME ',A12, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9985 FORMAT( ' ERROR IN SMVCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' SMVCH WAS CALLED WITH TRANS = ', A1, + $ ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', / + $ ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.' + $ , /' ******* TESTS ABANDONED *******' ) + 9984 FORMAT(A12, L2 ) + 9983 FORMAT( 1X,A12, ' WAS NOT TESTED' ) + 9982 FORMAT( /' END OF TESTS' ) + 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of SBLAT2. +* + END + SUBROUTINE SCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G, IORDER ) +* +* Tests SGEMV and SGBMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF + PARAMETER ( ZERO = 0.0, HALF = 0.5 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), G( NMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + REAL ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA, + $ LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK, + $ NL, NS + LOGICAL BANDED, FULL, NULL, RESET, SAME, TRAN + CHARACTER*1 TRANS, TRANSS + CHARACTER*14 CTRANS + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL CSGBMV, CSGEMV, SMAKE, SMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + BANDED = SNAME( 9: 9 ).EQ.'b' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 11 + ELSE IF( BANDED )THEN + NARGS = 13 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IKU = 1, NK + IF( BANDED )THEN + KU = KB( IKU ) + KL = MAX( KU - 1, 0 ) + ELSE + KU = N - 1 + KL = M - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = KL + KU + 1 + ELSE + LDA = M + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, AA, + $ LDA, KL, KU, RESET, TRANSL ) +* + DO 90 IC = 1, 3 + TRANS = ICH( IC: IC ) + IF (TRANS.EQ.'N')THEN + CTRANS = ' CblasNoTrans' + ELSE IF (TRANS.EQ.'T')THEN + CTRANS = ' CblasTrans' + ELSE + CTRANS = 'CblasConjTrans' + END IF + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' +* + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*NL +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'ge', ' ', ' ', 1, NL, X, 1, XX, + $ ABS( INCX ), 0, NL - 1, RESET, TRANSL ) + IF( NL.GT.1 )THEN + X( NL/2 ) = ZERO + XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*ML +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL SMAKE( 'ge', ' ', ' ', 1, ML, Y, 1, + $ YY, ABS( INCY ), 0, ML - 1, + $ RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANSS = TRANS + MS = M + NS = N + KLS = KL + KUS = KU + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CTRANS, M, N, ALPHA, LDA, INCX, + $ BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CSGEMV( IORDER, TRANS, M, N, + $ ALPHA, AA, LDA, XX, INCX, + $ BETA, YY, INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CTRANS, M, N, KL, KU, ALPHA, LDA, + $ INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CSGBMV( IORDER, TRANS, M, N, KL, + $ KU, ALPHA, AA, LDA, XX, + $ INCX, BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 130 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = TRANS.EQ.TRANSS + ISAME( 2 ) = MS.EQ.M + ISAME( 3 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LSE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LSE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LSE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LSERES( 'ge', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 4 ) = KLS.EQ.KL + ISAME( 5 ) = KUS.EQ.KU + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LSE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LSE( XS, XX, LX ) + ISAME( 10 ) = INCXS.EQ.INCX + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LSE( YS, YY, LY ) + ELSE + ISAME( 12 ) = LSERES( 'ge', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 13 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 130 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL SMVCH( TRANS, M, N, ALPHA, A, + $ NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 130 + ELSE +* Avoid repeating tests with M.le.0 or +* N.le.0. + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 140 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CTRANS, M, N, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CTRANS, M, N, KL, KU, + $ ALPHA, LDA, INCX, BETA, INCY + END IF +* + 140 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 4( I3, ',' ), F4.1, + $ ', A,', I3, ',',/ 10x, 'X,', I2, ',', F4.1, ', Y,', + $ I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), F4.1, + $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, + $ ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK1. +* + END + SUBROUTINE SCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G, IORDER ) +* +* Tests SSYMV, SSBMV and SSPMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF + PARAMETER ( ZERO = 0.0, HALF = 0.5 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), G( NMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + REAL ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IB, IC, IK, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY, + $ N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMVCH, CSSBMV, CSSPMV, CSSYMV +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'y' + BANDED = SNAME( 9: 9 ).EQ.'b' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 10 + ELSE IF( BANDED )THEN + NARGS = 11 + ELSE IF( PACKED )THEN + NARGS = 9 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, AA, + $ LDA, K, K, RESET, TRANSL ) +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL SMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + UPLOS = UPLO + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, N, ALPHA, LDA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CSSYMV( IORDER, UPLO, N, ALPHA, AA, + $ LDA, XX, INCX, BETA, YY, INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, N, K, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL CSSBMV( IORDER, UPLO, N, K, ALPHA, + $ AA, LDA, XX, INCX, BETA, YY, + $ INCY ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, N, ALPHA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CSSPMV( IORDER, UPLO, N, ALPHA, AA, + $ XX, INCX, BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LSE( AS, AA, LAA ) + ISAME( 5 ) = LDAS.EQ.LDA + ISAME( 6 ) = LSE( XS, XX, LX ) + ISAME( 7 ) = INCXS.EQ.INCX + ISAME( 8 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LSE( YS, YY, LY ) + ELSE + ISAME( 9 ) = LSERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 10 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 3 ) = KS.EQ.K + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LSE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LSE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LSE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LSERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( PACKED )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LSE( AS, AA, LAA ) + ISAME( 5 ) = LSE( XS, XX, LX ) + ISAME( 6 ) = INCXS.EQ.INCX + ISAME( 7 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 8 ) = LSE( YS, YY, LY ) + ELSE + ISAME( 8 ) = LSERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 9 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL SMVCH( 'N', N, N, ALPHA, A, NMAX, X, + $ INCX, BETA, Y, INCY, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0 + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, K, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, N, ALPHA, INCX, + $ BETA, INCY + END IF +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', AP', + $ ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), F4.1, + $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, + $ ') .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', A,', + $ I3, ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK2. +* + END + SUBROUTINE SCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, XT, G, Z, IORDER ) +* +* Tests STRMV, STBMV, STPMV, STRSV, STBSV and STPSV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XT( NMAX ), + $ XX( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + REAL ERR, ERRMAX, TRANSL + INTEGER I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K, + $ KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS + CHARACTER*14 CUPLO,CTRANS,CDIAG + CHARACTER*2 ICHD, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMVCH, CSTBMV, CSTBSV, CSTPMV, + $ CSTPSV, CSTRMV, CSTRSV +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'r' + BANDED = SNAME( 9: 9 ).EQ.'b' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 8 + ELSE IF( BANDED )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 7 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* Set up zero vector for SMVCH. + DO 10 I = 1, NMAX + Z( I ) = ZERO + 10 CONTINUE +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + IF (TRANS.EQ.'N')THEN + CTRANS = ' CblasNoTrans' + ELSE IF (TRANS.EQ.'T')THEN + CTRANS = ' CblasTrans' + ELSE + CTRANS = 'CblasConjTrans' + END IF +* + DO 70 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) + IF (DIAG.EQ.'N')THEN + CDIAG = ' CblasNonUnit' + ELSE + CDIAG = ' CblasUnit' + END IF +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 8: 9 ), UPLO, DIAG, N, N, A, + $ NMAX, AA, LDA, K, K, RESET, TRANSL ) +* + DO 60 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, + $ TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + DIAGS = DIAG + NS = N + KS = K + DO 20 I = 1, LAA + AS( I ) = AA( I ) + 20 CONTINUE + LDAS = LDA + DO 30 I = 1, LX + XS( I ) = XX( I ) + 30 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( SNAME( 10: 11 ).EQ.'mv' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CSTRMV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, LDA, XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CSTBMV( IORDER, UPLO, TRANS, DIAG, + $ N, K, AA, LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL CSTPMV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, XX, INCX ) + END IF + ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CSTRSV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, LDA, XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CSTBSV( IORDER, UPLO, TRANS, DIAG, + $ N, K, AA, LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL CSTPSV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, XX, INCX ) + END IF + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = TRANS.EQ.TRANSS + ISAME( 3 ) = DIAG.EQ.DIAGS + ISAME( 4 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 5 ) = LSE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 7 ) = LSE( XS, XX, LX ) + ELSE + ISAME( 7 ) = LSERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 8 ) = INCXS.EQ.INCX + ELSE IF( BANDED )THEN + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = LSE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 8 ) = LSE( XS, XX, LX ) + ELSE + ISAME( 8 ) = LSERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 9 ) = INCXS.EQ.INCX + ELSE IF( PACKED )THEN + ISAME( 5 ) = LSE( AS, AA, LAA ) + IF( NULL )THEN + ISAME( 6 ) = LSE( XS, XX, LX ) + ELSE + ISAME( 6 ) = LSERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 7 ) = INCXS.EQ.INCX + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 10: 11 ).EQ.'mv' )THEN +* +* Check the result. +* + CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, + $ INCX, ZERO, Z, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN +* +* Compute approximation to original vector. +* + DO 50 I = 1, N + Z( I ) = XX( 1 + ( I - 1 )* + $ ABS( INCX ) ) + XX( 1 + ( I - 1 )*ABS( INCX ) ) + $ = X( I ) + 50 CONTINUE + CALL SMVCH( TRANS, N, N, ONE, A, NMAX, Z, + $ INCX, ZERO, X, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .FALSE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0. + GO TO 110 + END IF +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, + $ LDA, INCX + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, + $ K, LDA, INCX + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, + $ INCX + END IF +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, '(', 3( A14,',' ),/ 10x, I3, ', AP, ', + $ 'X,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', 3( A14,',' ),/ 10x, 2( I3, ',' ), + $ ' A,', I3, ', X,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', 3( A14,',' ),/ 10x, I3, ', A,', + $ I3, ', X,', I2, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK3. +* + END + SUBROUTINE SCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests SGER. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + REAL ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS, + $ NC, ND, NS + LOGICAL NULL, RESET, SAME +* .. Local Arrays .. + REAL W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL CSGER, SMAKE, SMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Executable Statements .. +* Define the number of arguments. + NARGS = 9 +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* +* Set LDA to 1 more than minimum value if room. + LDA = M + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 100 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*M +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'ge', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ), + $ 0, M - 1, RESET, TRANSL ) + IF( M.GT.1 )THEN + X( M/2 ) = ZERO + XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO + END IF +* + DO 90 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL SMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, + $ AA, LDA, M - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL CSGER( IORDER, M, N, ALPHA, XX, INCX, YY, + $ INCY, AA, LDA ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 140 + END IF +* +* See what data changed inside subroutine. +* + ISAME( 1 ) = MS.EQ.M + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LSE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LSE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LSE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LSERES( 'ge', ' ', M, N, AS, AA, + $ LDA ) + END IF + ISAME( 9 ) = LDAS.EQ.LDA +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 140 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, M + Z( I ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, M + Z( I ) = X( M - I + 1 ) + 60 CONTINUE + END IF + DO 70 J = 1, N + IF( INCY.GT.0 )THEN + W( 1 ) = Y( J ) + ELSE + W( 1 ) = Y( N - J + 1 ) + END IF + CALL SMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1, + $ ONE, A( 1, J ), 1, YT, G, + $ AA( 1 + ( J - 1 )*LDA ), EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 130 + 70 CONTINUE + ELSE +* Avoid repeating tests with M.le.0 or N.le.0. + GO TO 110 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 150 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 140 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA +* + 150 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', 2( I3, ',' ), F4.1, ', X,', I2, + $ ', Y,', I2, ', A,', I3, ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK4. +* + END + SUBROUTINE SCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests SSYR and SSPR. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + REAL ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA, + $ LDA, LDAS, LJ, LX, N, NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + REAL W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMVCH, CSSPR, CSSYR +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'y' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 7 + ELSE IF( PACKED )THEN + NARGS = 6 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF + UPPER = UPLO.EQ.'U' +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, + $ AA, LDA, N - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, LDA + IF( REWI ) + $ REWIND NTRA + CALL CSSYR( IORDER, UPLO, N, ALPHA, XX, INCX, + $ AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CSSPR( IORDER, UPLO, N, ALPHA, XX, INCX, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LSE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + IF( NULL )THEN + ISAME( 6 ) = LSE( AS, AA, LAA ) + ELSE + ISAME( 6 ) = LSERES( SNAME( 8: 9 ), UPLO, N, N, AS, + $ AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 7 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 40 I = 1, N + Z( I ) = X( I ) + 40 CONTINUE + ELSE + DO 50 I = 1, N + Z( I ) = X( N - I + 1 ) + 50 CONTINUE + END IF + JA = 1 + DO 60 J = 1, N + W( 1 ) = Z( J ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL SMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W, + $ 1, ONE, A( JJ, J ), 1, YT, G, + $ AA( JA ), EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 110 + 60 CONTINUE + ELSE +* Avoid repeating tests if N.le.0. + IF( N.LE.0 ) + $ GO TO 100 + END IF +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, INCX, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, ALPHA, INCX + END IF +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', AP) .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', A,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK5. +* + END + SUBROUTINE SCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests SSYR2 and SSPR2. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX, 2 ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + REAL ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N, + $ NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + REAL W( 2 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMVCH, CSSPR2, CSSYR2 +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'y' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 8 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 140 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 140 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 130 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF + UPPER = UPLO.EQ.'U' +* + DO 120 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 110 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL SMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 100 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, + $ NMAX, AA, LDA, N - 1, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL CSSYR2( IORDER, UPLO, N, ALPHA, XX, INCX, + $ YY, INCY, AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, INCY + IF( REWI ) + $ REWIND NTRA + CALL CSSPR2( IORDER, UPLO, N, ALPHA, XX, INCX, + $ YY, INCY, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 160 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LSE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LSE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LSE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LSERES( SNAME( 8: 9 ), UPLO, N, N, + $ AS, AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 9 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 160 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, N + Z( I, 1 ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, N + Z( I, 1 ) = X( N - I + 1 ) + 60 CONTINUE + END IF + IF( INCY.GT.0 )THEN + DO 70 I = 1, N + Z( I, 2 ) = Y( I ) + 70 CONTINUE + ELSE + DO 80 I = 1, N + Z( I, 2 ) = Y( N - I + 1 ) + 80 CONTINUE + END IF + JA = 1 + DO 90 J = 1, N + W( 1 ) = Z( J, 2 ) + W( 2 ) = Z( J, 1 ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL SMVCH( 'N', LJ, 2, ALPHA, Z( JJ, 1 ), + $ NMAX, W, 1, ONE, A( JJ, J ), 1, + $ YT, G, AA( JA ), EPS, ERR, FATAL, + $ NOUT, .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 150 + 90 CONTINUE + ELSE +* Avoid repeating tests with N.le.0. + IF( N.LE.0 ) + $ GO TO 140 + END IF +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 170 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 160 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, INCX, + $ INCY, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, ALPHA, INCX, INCY + END IF +* + 170 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', Y,', I2, ', AP) .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', Y,', I2, ', A,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK6. +* + END + SUBROUTINE SMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL, + $ KU, RESET, TRANSL ) +* +* Generates values for an M by N matrix A within the bandwidth +* defined by KL and KU. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'ge', 'gb', 'sy', 'sb', 'sp', 'tr', 'tb' OR 'tp'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) + REAL ROGUE + PARAMETER ( ROGUE = -1.0E10 ) +* .. Scalar Arguments .. + REAL TRANSL + INTEGER KL, KU, LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + REAL A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, I1, I2, I3, IBEG, IEND, IOFF, J, KK + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + REAL SBEG + EXTERNAL SBEG +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Executable Statements .. + GEN = TYPE( 1: 1 ).EQ.'g' + SYM = TYPE( 1: 1 ).EQ.'s' + TRI = TYPE( 1: 1 ).EQ.'t' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + IF( ( I.LE.J.AND.J - I.LE.KU ).OR. + $ ( I.GE.J.AND.I - J.LE.KL ) )THEN + A( I, J ) = SBEG( RESET ) + TRANSL + ELSE + A( I, J ) = ZERO + END IF + IF( I.NE.J )THEN + IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'ge' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'gb' )THEN + DO 90 J = 1, N + DO 60 I1 = 1, KU + 1 - J + AA( I1 + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J ) + AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J ) + 70 CONTINUE + DO 80 I3 = I2, LDA + AA( I3 + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + ELSE IF( TYPE.EQ.'sy'.OR.TYPE.EQ.'tr' )THEN + DO 130 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 100 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 100 CONTINUE + DO 110 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 110 CONTINUE + DO 120 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 120 CONTINUE + 130 CONTINUE + ELSE IF( TYPE.EQ.'sb'.OR.TYPE.EQ.'tb' )THEN + DO 170 J = 1, N + IF( UPPER )THEN + KK = KL + 1 + IBEG = MAX( 1, KL + 2 - J ) + IF( UNIT )THEN + IEND = KL + ELSE + IEND = KL + 1 + END IF + ELSE + KK = 1 + IF( UNIT )THEN + IBEG = 2 + ELSE + IBEG = 1 + END IF + IEND = MIN( KL + 1, 1 + M - J ) + END IF + DO 140 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 140 CONTINUE + DO 150 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J ) + 150 CONTINUE + DO 160 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 160 CONTINUE + 170 CONTINUE + ELSE IF( TYPE.EQ.'sp'.OR.TYPE.EQ.'tp' )THEN + IOFF = 0 + DO 190 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 180 I = IBEG, IEND + IOFF = IOFF + 1 + AA( IOFF ) = A( I, J ) + IF( I.EQ.J )THEN + IF( UNIT ) + $ AA( IOFF ) = ROGUE + END IF + 180 CONTINUE + 190 CONTINUE + END IF + RETURN +* +* End of SMAKE. +* + END + SUBROUTINE SMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL ALPHA, BETA, EPS, ERR + INTEGER INCX, INCY, M, N, NMAX, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANS +* .. Array Arguments .. + REAL A( NMAX, * ), G( * ), X( * ), Y( * ), YT( * ), + $ YY( * ) +* .. Local Scalars .. + REAL ERRI + INTEGER I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL + LOGICAL TRAN +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, SQRT +* .. Executable Statements .. + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF + IF( INCX.LT.0 )THEN + KX = NL + INCXL = -1 + ELSE + KX = 1 + INCXL = 1 + END IF + IF( INCY.LT.0 )THEN + KY = ML + INCYL = -1 + ELSE + KY = 1 + INCYL = 1 + END IF +* +* Compute expected result in YT using data in A, X and Y. +* Compute gauges in G. +* + IY = KY + DO 30 I = 1, ML + YT( IY ) = ZERO + G( IY ) = ZERO + JX = KX + IF( TRAN )THEN + DO 10 J = 1, NL + YT( IY ) = YT( IY ) + A( J, I )*X( JX ) + G( IY ) = G( IY ) + ABS( A( J, I )*X( JX ) ) + JX = JX + INCXL + 10 CONTINUE + ELSE + DO 20 J = 1, NL + YT( IY ) = YT( IY ) + A( I, J )*X( JX ) + G( IY ) = G( IY ) + ABS( A( I, J )*X( JX ) ) + JX = JX + INCXL + 20 CONTINUE + END IF + YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY ) + G( IY ) = ABS( ALPHA )*G( IY ) + ABS( BETA*Y( IY ) ) + IY = IY + INCYL + 30 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 40 I = 1, ML + ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS + IF( G( I ).NE.ZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.ONE ) + $ GO TO 50 + 40 CONTINUE +* If the loop completes, all results are at least half accurate. + GO TO 70 +* +* Report fatal error. +* + 50 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 60 I = 1, ML + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, YT( I ), + $ YY( 1 + ( I - 1 )*ABS( INCY ) ) + ELSE + WRITE( NOUT, FMT = 9998 )I, + $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT(I) + END IF + 60 CONTINUE +* + 70 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RESULT COMPU', + $ 'TED RESULT' ) + 9998 FORMAT( 1X, I7, 2G18.6 ) +* +* End of SMVCH. +* + END + LOGICAL FUNCTION LSE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + REAL RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LSE = .TRUE. + GO TO 30 + 20 CONTINUE + LSE = .FALSE. + 30 RETURN +* +* End of LSE. +* + END + LOGICAL FUNCTION LSERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'ge', 'sy' or 'sp'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + REAL AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'ge' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'sy' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LSERES = .TRUE. + GO TO 80 + 70 CONTINUE + LSERES = .FALSE. + 80 RETURN +* +* End of LSERES. +* + END + REAL FUNCTION SBEG( RESET ) +* +* Generates random numbers uniformly distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, MI +* .. Save statement .. + SAVE I, IC, MI +* .. Intrinsic Functions .. + INTRINSIC REAL +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + I = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I is bounded between 1 and 999. +* If initial I = 1,2,3,6,7 or 9, the period will be 50. +* If initial I = 4 or 8, the period will be 25. +* If initial I = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I in 6. +* + IC = IC + 1 + 10 I = I*MI + I = I - 1000*( I/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + SBEG = REAL( I - 500 )/1001.0 + RETURN +* +* End of SBEG. +* + END + REAL FUNCTION SDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* +* .. Scalar Arguments .. + REAL X, Y +* .. Executable Statements .. + SDIFF = X - Y + RETURN +* +* End of SDIFF. +* + END diff --git a/ctest/c_sblat3.f b/ctest/c_sblat3.f new file mode 100644 index 0000000..948fd6e --- /dev/null +++ b/ctest/c_sblat3.f @@ -0,0 +1,2479 @@ + PROGRAM SBLAT3 +* +* Test program for the REAL Level 3 Blas. +* +* The program must be driven by a short data file. The first 13 records +* of the file are read using list-directed input, the last 6 records +* are read using the format ( A12, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 19 lines: +* 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 3 NUMBER OF VALUES OF ALPHA +* 0.0 1.0 0.7 VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* 0.0 1.0 1.3 VALUES OF BETA +* cblas_sgemm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ssymm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_strmm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_strsm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ssyrk T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ssyr2k T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +* A Set of Level 3 Basic Linear Algebra Subprograms. +* +* Technical Memorandum No.88 (Revision 1), Mathematics and +* Computer Science Division, Argonne National Laboratory, 9700 +* South Cass Avenue, Argonne, Illinois 60439, US. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 6 ) + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) + INTEGER NMAX + PARAMETER ( NMAX = 65 ) + INTEGER NIDMAX, NALMAX, NBEMAX + PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + REAL EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NTRA, + $ LAYOUT + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR, CORDER, RORDER + CHARACTER*1 TRANSA, TRANSB + CHARACTER*12 SNAMET + CHARACTER*32 SNAPS +* .. Local Arrays .. + REAL AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), + $ BB( NMAX*NMAX ), BET( NBEMAX ), + $ BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ G( NMAX ), W( 2*NMAX ) + INTEGER IDIM( NIDMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*12 SNAMES( NSUBS ) +* .. External Functions .. + REAL SDIFF + LOGICAL LSE + EXTERNAL SDIFF, LSE +* .. External Subroutines .. + EXTERNAL SCHK1, SCHK2, SCHK3, SCHK4, SCHK5, CS3CHKE, + $ SMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK + CHARACTER*12 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'cblas_sgemm ', 'cblas_ssymm ', + $ 'cblas_strmm ', 'cblas_strsm ','cblas_ssyrk ', + $ 'cblas_ssyr2k'/ +* .. Executable Statements .. +* + NOUTC = NOUT +* Read name and unit number for summary output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN +* OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the flag that indicates whether row-major data layout to be tested. + READ( NIN, FMT = * )LAYOUT +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 220 + END IF + 10 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9995 ) + WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9984 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) + + RORDER = .FALSE. + CORDER = .FALSE. + IF (LAYOUT.EQ.2) THEN + RORDER = .TRUE. + CORDER = .TRUE. + WRITE( *, FMT = 10002 ) + ELSE IF (LAYOUT.EQ.1) THEN + RORDER = .TRUE. + WRITE( *, FMT = 10001 ) + ELSE IF (LAYOUT.EQ.0) THEN + CORDER = .TRUE. + WRITE( *, FMT = 10000 ) + END IF + WRITE( *, FMT = * ) + +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 20 I = 1, NSUBS + LTEST( I ) = .FALSE. + 20 CONTINUE + 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT + DO 40 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 50 + 40 CONTINUE + WRITE( NOUT, FMT = 9990 )SNAMET + STOP + 50 LTEST( I ) = LTESTT + GO TO 30 +* + 60 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = ONE + 70 CONTINUE + IF( SDIFF( ONE + EPS, ONE ).EQ.ZERO ) + $ GO TO 80 + EPS = HALF*EPS + GO TO 70 + 80 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of SMMCH using exact data. +* + N = MIN( 32, NMAX ) + DO 100 J = 1, N + DO 90 I = 1, N + AB( I, J ) = MAX( I - J + 1, 0 ) + 90 CONTINUE + AB( J, NMAX + 1 ) = J + AB( 1, NMAX + J ) = J + C( J, 1 ) = ZERO + 100 CONTINUE + DO 110 J = 1, N + CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 110 CONTINUE +* CC holds the exact result. On exit from SMMCH CT holds +* the result computed by SMMCH. + TRANSA = 'N' + TRANSB = 'N' + CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'T' + CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + DO 120 J = 1, N + AB( J, NMAX + 1 ) = N - J + 1 + AB( 1, NMAX + J ) = N - J + 1 + 120 CONTINUE + DO 130 J = 1, N + CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - + $ ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE + TRANSA = 'T' + TRANSB = 'N' + CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'T' + CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 200 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL CS3CHKE( SNAMES( ISNUM ) ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 150, 160, 160, 170, 180 )ISNUM +* Test SGEMM, 01. + 140 IF (CORDER) THEN + CALL SCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL SCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test SSYMM, 02. + 150 IF (CORDER) THEN + CALL SCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL SCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test STRMM, 03, STRSM, 04. + 160 IF (CORDER) THEN + CALL SCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, + $ 0 ) + END IF + IF (RORDER) THEN + CALL SCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, + $ 1 ) + END IF + GO TO 190 +* Test SSYRK, 05. + 170 IF (CORDER) THEN + CALL SCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL SCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test SSYR2K, 06. + 180 IF (CORDER) THEN + CALL SCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ 0 ) + END IF + IF (RORDER) THEN + CALL SCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ 1 ) + END IF + GO TO 190 +* + 190 IF( FATAL.AND.SFATAL ) + $ GO TO 210 + END IF + 200 CONTINUE + WRITE( NOUT, FMT = 9986 ) + GO TO 230 +* + 210 CONTINUE + WRITE( NOUT, FMT = 9985 ) + GO TO 230 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9991 ) +* + 230 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* +10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) +10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) +10000 FORMAT( ' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' TESTS OF THE REAL LEVEL 3 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9994 FORMAT( ' FOR N ', 9I6 ) + 9993 FORMAT( ' FOR ALPHA ', 7F6.1 ) + 9992 FORMAT( ' FOR BETA ', 7F6.1 ) + 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9990 FORMAT( ' SUBPROGRAM NAME ', A12,' NOT RECOGNIZED', /' ******* ', + $ 'TESTS ABANDONED *******' ) + 9989 FORMAT( ' ERROR IN SMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' SMMCH WAS CALLED WITH TRANSA = ', A1, + $ ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', + $ 'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', + $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', + $ '*******' ) + 9988 FORMAT( A12,L2 ) + 9987 FORMAT( 1X, A12,' WAS NOT TESTED' ) + 9986 FORMAT( /' END OF TESTS' ) + 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of SBLAT3. +* + END + SUBROUTINE SCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests SGEMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + REAL ALPHA, ALS, BETA, BLS, ERR, ERRMAX + INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, + $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, + $ MA, MB, MS, N, NA, NARGS, NB, NC, NS + LOGICAL NULL, RESET, SAME, TRANA, TRANB + CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL CSGEMM, SMAKE, SMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. +* + NARGS = 13 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 110 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICA = 1, 3 + TRANSA = ICH( ICA: ICA ) + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' +* + IF( TRANA )THEN + MA = K + NA = M + ELSE + MA = M + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL SMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICB = 1, 3 + TRANSB = ICH( ICB: ICB ) + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* + IF( TRANB )THEN + MB = N + NB = K + ELSE + MB = K + NB = N + END IF +* Set LDB to 1 more than minimum value if room. + LDB = MB + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 70 + LBB = LDB*NB +* +* Generate the matrix B. +* + CALL SMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB, + $ LDB, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL SMAKE( 'GE', ' ', ' ', M, N, C, NMAX, + $ CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANAS = TRANSA + TRANBS = TRANSB + MS = M + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL SPRCN1(NTRA, NC, SNAME, IORDER, + $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, + $ LDB, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CSGEMM( IORDER, TRANSA, TRANSB, M, N, + $ K, ALPHA, AA, LDA, BB, LDB, + $ BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = TRANSA.EQ.TRANAS + ISAME( 2 ) = TRANSB.EQ.TRANBS + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LSE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LSE( BS, BB, LBB ) + ISAME( 10 ) = LDBS.EQ.LDB + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LSE( CS, CC, LCC ) + ELSE + ISAME( 12 ) = LSERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 13 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I+1 + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL SMMCH( TRANSA, TRANSB, M, N, K, + $ ALPHA, A, NMAX, B, NMAX, BETA, + $ C, NMAX, CT, G, CC, LDC, EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL SPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, + $ M, N, K, ALPHA, LDA, LDB, BETA, LDC) +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A12,'(''', A1, ''',''', A1, ''',', + $ 3( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', ', + $ 'C,', I3, ').' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK1. +* + END +* +* +* + SUBROUTINE SPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, M, N, + $ K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, M, N, K, LDA, LDB, LDC + REAL ALPHA, BETA + CHARACTER*1 TRANSA, TRANSB + CHARACTER*12 SNAME + CHARACTER*14 CRC, CTA,CTB + + IF (TRANSA.EQ.'N')THEN + CTA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CTA = ' CblasTrans' + ELSE + CTA = 'CblasConjTrans' + END IF + IF (TRANSB.EQ.'N')THEN + CTB = ' CblasNoTrans' + ELSE IF (TRANSB.EQ.'T')THEN + CTB = ' CblasTrans' + ELSE + CTB = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CTA,CTB + WRITE(NOUT, FMT = 9994)M, N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 20X, 3( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', + $ F4.1, ', ', 'C,', I3, ').' ) + END +* + SUBROUTINE SCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests SSYMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + REAL ALPHA, ALS, BETA, BLS, ERR, ERRMAX + INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, + $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, + $ NARGS, NC, NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 SIDE, SIDES, UPLO, UPLOS + CHARACTER*2 ICHS, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMMCH, CSSYMM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHS/'LR'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 90 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 90 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 90 + LBB = LDB*N +* +* Generate the matrix B. +* + CALL SMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, + $ ZERO ) +* + DO 80 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' +* + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* +* Generate the symmetric matrix A. +* + CALL SMAKE( 'SY', UPLO, ' ', NA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL SMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL SPRCN2(NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, M, N, ALPHA, LDA, LDB, + $ BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CSSYMM( IORDER, SIDE, UPLO, M, N, ALPHA, + $ AA, LDA, BB, LDB, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 110 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LSE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LSE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LSE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LSERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I+1 + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 110 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL SMMCH( 'N', 'N', M, N, M, ALPHA, A, + $ NMAX, B, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL SMMCH( 'N', 'N', M, N, N, ALPHA, B, + $ NMAX, A, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 120 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL SPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, ALPHA, LDA, + $ LDB, BETA, LDC) +* + 120 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK2. +* + END +* + SUBROUTINE SPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, + $ ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, M, N, LDA, LDB, LDC + REAL ALPHA, BETA + CHARACTER*1 SIDE, UPLO + CHARACTER*12 SNAME + CHARACTER*14 CRC, CS,CU + + IF (SIDE.EQ.'L')THEN + CS = ' CblasLeft' + ELSE + CS = ' CblasRight' + END IF + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU + WRITE(NOUT, FMT = 9994)M, N, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 20X, 2( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', + $ F4.1, ', ', 'C,', I3, ').' ) + END +* + SUBROUTINE SCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, + $ B, BB, BS, CT, G, C, IORDER ) +* +* Tests STRMM and STRSM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + REAL ALPHA, ALS, ERR, ERRMAX + INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, + $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, + $ NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, + $ UPLOS + CHARACTER*2 ICHD, ICHS, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMMCH, CSTRMM, CSTRSM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ +* .. Executable Statements .. +* + NARGS = 11 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* Set up zero matrix for SMMCH. + DO 20 J = 1, NMAX + DO 10 I = 1, NMAX + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE +* + DO 140 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 130 + LBB = LDB*N + NULL = M.LE.0.OR.N.LE.0 +* + DO 120 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 130 + LAA = LDA*NA +* + DO 110 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 100 ICT = 1, 3 + TRANSA = ICHT( ICT: ICT ) +* + DO 90 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + CALL SMAKE( 'TR', UPLO, DIAG, NA, NA, A, + $ NMAX, AA, LDA, RESET, ZERO ) +* +* Generate the matrix B. +* + CALL SMAKE( 'GE', ' ', ' ', M, N, B, NMAX, + $ BB, LDB, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + TRANAS = TRANSA + DIAGS = DIAG + MS = M + NS = N + ALS = ALPHA + DO 30 I = 1, LAA + AS( I ) = AA( I ) + 30 CONTINUE + LDAS = LDA + DO 40 I = 1, LBB + BS( I ) = BB( I ) + 40 CONTINUE + LDBS = LDB +* +* Call the subroutine. +* + IF( SNAME( 10: 11 ).EQ.'mm' )THEN + IF( TRACE ) + $ CALL SPRCN3( NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB) + IF( REWI ) + $ REWIND NTRA + CALL CSTRMM( IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, AA, LDA, + $ BB, LDB ) + ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN + IF( TRACE ) + $ CALL SPRCN3( NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB) + IF( REWI ) + $ REWIND NTRA + CALL CSTRSM( IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, AA, LDA, + $ BB, LDB ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = TRANAS.EQ.TRANSA + ISAME( 4 ) = DIAGS.EQ.DIAG + ISAME( 5 ) = MS.EQ.M + ISAME( 6 ) = NS.EQ.N + ISAME( 7 ) = ALS.EQ.ALPHA + ISAME( 8 ) = LSE( AS, AA, LAA ) + ISAME( 9 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 10 ) = LSE( BS, BB, LBB ) + ELSE + ISAME( 10 ) = LSERES( 'GE', ' ', M, N, BS, + $ BB, LDB ) + END IF + ISAME( 11 ) = LDBS.EQ.LDB +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 50 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I+1 + 50 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 10: 11 ).EQ.'mm' )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL SMMCH( TRANSA, 'N', M, N, M, + $ ALPHA, A, NMAX, B, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL SMMCH( 'N', TRANSA, M, N, N, + $ ALPHA, B, NMAX, A, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN +* +* Compute approximation to original +* matrix. +* + DO 70 J = 1, N + DO 60 I = 1, M + C( I, J ) = BB( I + ( J - 1 )* + $ LDB ) + BB( I + ( J - 1 )*LDB ) = ALPHA* + $ B( I, J ) + 60 CONTINUE + 70 CONTINUE +* + IF( LEFT )THEN + CALL SMMCH( TRANSA, 'N', M, N, M, + $ ONE, A, NMAX, C, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + ELSE + CALL SMMCH( 'N', TRANSA, M, N, N, + $ ONE, C, NMAX, A, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + END IF + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 150 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL SPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, + $ M, N, ALPHA, LDA, LDB) +* + 160 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A12,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ') .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK3. +* + END +* + SUBROUTINE SPRCN3(NOUT, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, LDA, LDB) + INTEGER NOUT, NC, IORDER, M, N, LDA, LDB + REAL ALPHA + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + CHARACTER*12 SNAME + CHARACTER*14 CRC, CS, CU, CA, CD + + IF (SIDE.EQ.'L')THEN + CS = ' CblasLeft' + ELSE + CS = ' CblasRight' + END IF + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (DIAG.EQ.'N')THEN + CD = ' CblasNonUnit' + ELSE + CD = ' CblasUnit' + END IF + IF (IORDER.EQ.1)THEN + CRC = 'CblasRowMajor' + ELSE + CRC = 'CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU + WRITE(NOUT, FMT = 9994)CA, CD, M, N, ALPHA, LDA, LDB + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 22X, 2( A14, ',') , 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ').' ) + END +* + SUBROUTINE SCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests SSYRK. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + REAL ALPHA, ALS, BETA, BETS, ERR, ERRMAX + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, + $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, + $ NARGS, NC, NS + LOGICAL NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, UPLO, UPLOS + CHARACTER*2 ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMMCH, CSSYRK +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHT/'NTC'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 10 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL SMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL SMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + BETS = BETA + DO 20 I = 1, LCC + CS( I ) = CC( I ) + 20 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL SPRCN4( NTRA, NC, SNAME, IORDER, UPLO, + $ TRANS, N, K, ALPHA, LDA, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CSSYRK( IORDER, UPLO, TRANS, N, K, ALPHA, + $ AA, LDA, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LSE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = BETS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LSE( CS, CC, LCC ) + ELSE + ISAME( 9 ) = LSERES( 'SY', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 10 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I+1 + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + JC = 1 + DO 40 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + CALL SMMCH( 'T', 'N', LJ, 1, K, ALPHA, + $ A( 1, JJ ), NMAX, + $ A( 1, J ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL SMMCH( 'N', 'T', LJ, 1, K, ALPHA, + $ A( JJ, 1 ), NMAX, + $ A( J, 1 ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + 40 CONTINUE + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL SPRCN4( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA, + $ LDA, BETA, LDC) +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK4. +* + END +* + SUBROUTINE SPRCN4(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDC + REAL ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 20X, 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ').' ) + END +* + SUBROUTINE SCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ IORDER ) +* +* Tests SSYR2K. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), + $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), + $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ G( NMAX ), W( 2*NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + REAL ALPHA, ALS, BETA, BETS, ERR, ERRMAX + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, + $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, + $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS + LOGICAL NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, UPLO, UPLOS + CHARACTER*2 ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMMCH, CSSYR2K +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHT/'NTC'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 130 + LCC = LDC*N + NULL = N.LE.0 +* + DO 120 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 110 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*NA +* +* Generate the matrix A. +* + IF( TRAN )THEN + CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA, + $ LDA, RESET, ZERO ) + ELSE + CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, + $ RESET, ZERO ) + END IF +* +* Generate the matrix B. +* + LDB = LDA + LBB = LAA + IF( TRAN )THEN + CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ), + $ 2*NMAX, BB, LDB, RESET, ZERO ) + ELSE + CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), + $ NMAX, BB, LDB, RESET, ZERO ) + END IF +* + DO 100 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 90 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 80 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL SMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BETS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL SPRCN5( NTRA, NC, SNAME, IORDER, UPLO, + $ TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CSSYR2K( IORDER, UPLO, TRANS, N, K, ALPHA, + $ AA, LDA, BB, LDB, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LSE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LSE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BETS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LSE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LSERES( 'SY', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I+1 + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + JJAB = 1 + JC = 1 + DO 70 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + DO 50 I = 1, K + W( I ) = AB( ( J - 1 )*2*NMAX + K + + $ I ) + W( K + I ) = AB( ( J - 1 )*2*NMAX + + $ I ) + 50 CONTINUE + CALL SMMCH( 'T', 'N', LJ, 1, 2*K, + $ ALPHA, AB( JJAB ), 2*NMAX, + $ W, 2*NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + DO 60 I = 1, K + W( I ) = AB( ( K + I - 1 )*NMAX + + $ J ) + W( K + I ) = AB( ( I - 1 )*NMAX + + $ J ) + 60 CONTINUE + CALL SMMCH( 'N', 'N', LJ, 1, 2*K, + $ ALPHA, AB( JJ ), NMAX, W, + $ 2*NMAX, BETA, C( JJ, J ), + $ NMAX, CT, G, CC( JC ), LDC, + $ EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + IF( TRAN ) + $ JJAB = JJAB + 2*NMAX + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 140 + 70 CONTINUE + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL SPRCN5( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA, + $ LDA, LDB, BETA, LDC) +* + 160 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK5. +* + END +* + SUBROUTINE SPRCN5(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC + REAL ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 20X, 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B', I3, ',', F4.1, ', C,', I3, ').' ) + END +* + SUBROUTINE SMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, + $ TRANSL ) +* +* Generates values for an M by N matrix A. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'GE', 'SY' or 'TR'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) + REAL ROGUE + PARAMETER ( ROGUE = -1.0E10 ) +* .. Scalar Arguments .. + REAL TRANSL + INTEGER LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + REAL A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + REAL SBEG + EXTERNAL SBEG +* .. Executable Statements .. + GEN = TYPE.EQ.'GE' + SYM = TYPE.EQ.'SY' + TRI = TYPE.EQ.'TR' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + A( I, J ) = SBEG( RESET ) + TRANSL + IF( I.NE.J )THEN +* Set some elements to zero + IF( N.GT.3.AND.J.EQ.N/2 ) + $ A( I, J ) = ZERO + IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'GE' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN + DO 90 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 60 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 70 CONTINUE + DO 80 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + END IF + RETURN +* +* End of SMAKE. +* + END + SUBROUTINE SMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, + $ NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL ALPHA, BETA, EPS, ERR + INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANSA, TRANSB +* .. Array Arguments .. + REAL A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ CC( LDCC, * ), CT( * ), G( * ) +* .. Local Scalars .. + REAL ERRI + INTEGER I, J, K + LOGICAL TRANA, TRANB +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, SQRT +* .. Executable Statements .. + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* +* Compute expected result, one column at a time, in CT using data +* in A, B and C. +* Compute gauges in G. +* + DO 120 J = 1, N +* + DO 10 I = 1, M + CT( I ) = ZERO + G( I ) = ZERO + 10 CONTINUE + IF( .NOT.TRANA.AND..NOT.TRANB )THEN + DO 30 K = 1, KK + DO 20 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( K, J ) + G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( K, J ) ) + 20 CONTINUE + 30 CONTINUE + ELSE IF( TRANA.AND..NOT.TRANB )THEN + DO 50 K = 1, KK + DO 40 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( K, J ) + G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( K, J ) ) + 40 CONTINUE + 50 CONTINUE + ELSE IF( .NOT.TRANA.AND.TRANB )THEN + DO 70 K = 1, KK + DO 60 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( J, K ) + G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( J, K ) ) + 60 CONTINUE + 70 CONTINUE + ELSE IF( TRANA.AND.TRANB )THEN + DO 90 K = 1, KK + DO 80 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( J, K ) + G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( J, K ) ) + 80 CONTINUE + 90 CONTINUE + END IF + DO 100 I = 1, M + CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) + G( I ) = ABS( ALPHA )*G( I ) + ABS( BETA )*ABS( C( I, J ) ) + 100 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 110 I = 1, M + ERRI = ABS( CT( I ) - CC( I, J ) )/EPS + IF( G( I ).NE.ZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.ONE ) + $ GO TO 130 + 110 CONTINUE +* + 120 CONTINUE +* +* If the loop completes, all results are at least half accurate. + GO TO 150 +* +* Report fatal error. +* + 130 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 140 I = 1, M + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) + ELSE + WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) + END IF + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9997 )J +* + 150 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RESULT COMPU', + $ 'TED RESULT' ) + 9998 FORMAT( 1X, I7, 2G18.6 ) + 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) +* +* End of SMMCH. +* + END + LOGICAL FUNCTION LSE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + REAL RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LSE = .TRUE. + GO TO 30 + 20 CONTINUE + LSE = .FALSE. + 30 RETURN +* +* End of LSE. +* + END + LOGICAL FUNCTION LSERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'GE' or 'SY'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + REAL AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'GE' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'SY' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LSERES = .TRUE. + GO TO 80 + 70 CONTINUE + LSERES = .FALSE. + 80 RETURN +* +* End of LSERES. +* + END + REAL FUNCTION SBEG( RESET ) +* +* Generates random numbers uniformly distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, MI +* .. Save statement .. + SAVE I, IC, MI +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + I = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I is bounded between 1 and 999. +* If initial I = 1,2,3,6,7 or 9, the period will be 50. +* If initial I = 4 or 8, the period will be 25. +* If initial I = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I in 6. +* + IC = IC + 1 + 10 I = I*MI + I = I - 1000*( I/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + SBEG = ( I - 500 )/1001.0 + RETURN +* +* End of SBEG. +* + END + REAL FUNCTION SDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + REAL X, Y +* .. Executable Statements .. + SDIFF = X - Y + RETURN +* +* End of SDIFF. +* + END diff --git a/ctest/c_xerbla.c b/ctest/c_xerbla.c new file mode 100644 index 0000000..3402460 --- /dev/null +++ b/ctest/c_xerbla.c @@ -0,0 +1,137 @@ +#include +#include +#include +#include +#include "common.h" +#include "cblas_test.h" + +void cblas_xerbla(blasint info, char *rout, char *form, ...) +{ + extern int cblas_lerr, cblas_info, cblas_ok; + extern int link_xerbla; + extern int RowMajorStrg; + extern char *cblas_rout; + + /* Initially, c__3chke will call this routine with + * global variable link_xerbla=1, and F77_xerbla will set link_xerbla=0. + * This is done to fool the linker into loading these subroutines first + * instead of ones in the CBLAS or the legacy BLAS library. + */ + if (link_xerbla) return; + + if (cblas_rout != NULL && strcmp(cblas_rout, rout) != 0){ + printf("***** XERBLA WAS CALLED WITH SRNAME = <%s> INSTEAD OF <%s> *******\n", rout, cblas_rout); + cblas_ok = FALSE; + } + + if (RowMajorStrg) + { + /* To properly check leading dimension problems in cblas__gemm, we + * need to do the following trick. When cblas__gemm is called with + * CblasRowMajor, the arguments A and B switch places in the call to + * f77__gemm. Thus when we test for bad leading dimension problems + * for A and B, lda is in position 11 instead of 9, and ldb is in + * position 9 instead of 11. + */ + if (strstr(rout,"gemm") != 0) + { + if (info == 5 ) info = 4; + else if (info == 4 ) info = 5; + else if (info == 11) info = 9; + else if (info == 9 ) info = 11; + } + else if (strstr(rout,"symm") != 0 || strstr(rout,"hemm") != 0) + { + if (info == 5 ) info = 4; + else if (info == 4 ) info = 5; + } + else if (strstr(rout,"trmm") != 0 || strstr(rout,"trsm") != 0) + { + if (info == 7 ) info = 6; + else if (info == 6 ) info = 7; + } + else if (strstr(rout,"gemv") != 0) + { + if (info == 4) info = 3; + else if (info == 3) info = 4; + } + else if (strstr(rout,"gbmv") != 0) + { + if (info == 4) info = 3; + else if (info == 3) info = 4; + else if (info == 6) info = 5; + else if (info == 5) info = 6; + } + else if (strstr(rout,"ger") != 0) + { + if (info == 3) info = 2; + else if (info == 2) info = 3; + else if (info == 8) info = 6; + else if (info == 6) info = 8; + } + else if ( ( strstr(rout,"her2") != 0 || strstr(rout,"hpr2") != 0 ) + && strstr(rout,"her2k") == 0 ) + { + if (info == 8) info = 6; + else if (info == 6) info = 8; + } + } + + if (info != cblas_info){ + printf("***** XERBLA WAS CALLED WITH INFO = %d INSTEAD OF %d in %s *******\n",info, cblas_info, rout); + cblas_lerr = PASSED; + cblas_ok = FALSE; + } else cblas_lerr = FAILED; +} + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo) +#else +void F77_xerbla(char *srname, void *vinfo) +#endif +{ +#ifdef F77_Char + char *srname; +#endif + + char rout[] = {'c','b','l','a','s','_','\0','\0','\0','\0','\0','\0','\0'}; + +#ifdef F77_Integer + F77_Integer *info=vinfo; + F77_Integer i; + extern F77_Integer link_xerbla; +#else + int *info=vinfo; + int i; + extern int link_xerbla; +#endif +#ifdef F77_Char + srname = F2C_STR(F77_srname, XerblaStrLen); +#endif + + /* See the comment in cblas_xerbla() above */ + if (link_xerbla) + { + link_xerbla = 0; + return; + } + for(i=0; i < 6; i++) rout[i+6] = tolower(srname[i]); + for(i=11; i >= 9; i--) if (rout[i] == ' ') rout[i] = '\0'; + + /* We increment *info by 1 since the CBLAS interface adds one more + * argument to all level 2 and 3 routines. + */ + cblas_xerbla(*info+1,rout,""); +} + +#ifdef USE64BITINT +#undef int +#endif + +int BLASFUNC(xerbla)(char *name, blasint *info, blasint length) { + + F77_xerbla(name, info); + +}; + + diff --git a/ctest/c_z2chke.c b/ctest/c_z2chke.c new file mode 100644 index 0000000..ac60971 --- /dev/null +++ b/ctest/c_z2chke.c @@ -0,0 +1,826 @@ +#include +#include +#include "common.h" +#include "cblas_test.h" + +int cblas_ok, cblas_lerr, cblas_info; +int link_xerbla=TRUE; +char *cblas_rout; + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo); +#else +void F77_xerbla(char *srname, void *vinfo); +#endif + +void chkxer(void) { + extern int cblas_ok, cblas_lerr, cblas_info; + extern int link_xerbla; + extern char *cblas_rout; + if (cblas_lerr == 1 ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); + cblas_ok = 0 ; + } + cblas_lerr = 1 ; +} + +void F77_z2chke(char *rout) { + char *sf = ( rout ) ; + double A[2] = {0.0,0.0}, + X[2] = {0.0,0.0}, + Y[2] = {0.0,0.0}, + ALPHA[2] = {0.0,0.0}, + BETA[2] = {0.0,0.0}, + RALPHA = 0.0; + extern int cblas_info, cblas_lerr, cblas_ok; + extern int RowMajorStrg; + extern char *cblas_rout; + + if (link_xerbla) /* call these first to link */ + { + cblas_xerbla(cblas_info,cblas_rout,""); + F77_xerbla(cblas_rout,&cblas_info); + } + + cblas_ok = TRUE ; + cblas_lerr = PASSED ; + + if (strncmp( sf,"cblas_zgemv",11)==0) { + cblas_rout = "cblas_zgemv"; + cblas_info = 1; + cblas_zgemv(INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zgemv(CblasColMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zgemv(CblasColMajor, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemv(CblasColMajor, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_zgemv(CblasColMajor, CblasNoTrans, 2, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemv(CblasColMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_zgemv(CblasColMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + + cblas_info = 2; RowMajorStrg = TRUE; RowMajorStrg = TRUE; + cblas_zgemv(CblasRowMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zgemv(CblasRowMajor, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_zgbmv",11)==0) { + cblas_rout = "cblas_zgbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zgbmv(CblasColMajor, INVALID, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_zgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_zhemv",11)==0) { + cblas_rout = "cblas_zhemv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zhemv(INVALID, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zhemv(CblasColMajor, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zhemv(CblasColMajor, CblasUpper, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zhemv(CblasColMajor, CblasUpper, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zhemv(CblasColMajor, CblasUpper, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zhemv(CblasColMajor, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_zhemv(CblasRowMajor, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zhemv(CblasRowMajor, CblasUpper, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zhemv(CblasRowMajor, CblasUpper, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zhemv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zhemv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_zhbmv",11)==0) { + cblas_rout = "cblas_zhbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zhbmv(INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zhbmv(CblasColMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zhbmv(CblasColMajor, CblasUpper, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zhbmv(CblasColMajor, CblasUpper, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_zhbmv(CblasColMajor, CblasUpper, 0, 1, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zhbmv(CblasColMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_zhbmv(CblasColMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_zhbmv(CblasRowMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zhbmv(CblasRowMajor, CblasUpper, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zhbmv(CblasRowMajor, CblasUpper, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_zhbmv(CblasRowMajor, CblasUpper, 0, 1, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zhbmv(CblasRowMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_zhbmv(CblasRowMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_zhpmv",11)==0) { + cblas_rout = "cblas_zhpmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zhpmv(INVALID, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zhpmv(CblasColMajor, INVALID, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zhpmv(CblasColMajor, CblasUpper, INVALID, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_zhpmv(CblasColMajor, CblasUpper, 0, + ALPHA, A, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zhpmv(CblasColMajor, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_zhpmv(CblasRowMajor, INVALID, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zhpmv(CblasRowMajor, CblasUpper, INVALID, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_zhpmv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zhpmv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ztrmv",11)==0) { + cblas_rout = "cblas_ztrmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ztrmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ztrmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ztrmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ztrmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ztrmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ztbmv",11)==0) { + cblas_rout = "cblas_ztbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ztbmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ztbmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ztbmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ztbmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ztbmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ztpmv",11)==0) { + cblas_rout = "cblas_ztpmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ztpmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ztpmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ztpmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ztpmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ztpmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ztpmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ztpmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ztpmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ztpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ztpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ztpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ztrsv",11)==0) { + cblas_rout = "cblas_ztrsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ztrsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ztrsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ztrsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ztrsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ztrsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ztbsv",11)==0) { + cblas_rout = "cblas_ztbsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ztbsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ztbsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ztbsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ztbsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ztbsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ztpsv",11)==0) { + cblas_rout = "cblas_ztpsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ztpsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ztpsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ztpsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ztpsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ztpsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ztpsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ztpsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ztpsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ztpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ztpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ztpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_zgeru",10)==0) { + cblas_rout = "cblas_zgeru"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zgeru(INVALID, 0, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zgeru(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zgeru(CblasColMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgeru(CblasColMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zgeru(CblasColMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zgeru(CblasColMajor, 2, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_zgeru(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zgeru(CblasRowMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgeru(CblasRowMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zgeru(CblasRowMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zgeru(CblasRowMajor, 0, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_zgerc",10)==0) { + cblas_rout = "cblas_zgerc"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zgerc(INVALID, 0, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zgerc(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zgerc(CblasColMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgerc(CblasColMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zgerc(CblasColMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zgerc(CblasColMajor, 2, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_zgerc(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zgerc(CblasRowMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgerc(CblasRowMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zgerc(CblasRowMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zgerc(CblasRowMajor, 0, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_zher2",11)==0) { + cblas_rout = "cblas_zher2"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zher2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zher2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zher2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zher2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zher2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zher2(CblasColMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_zher2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zher2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zher2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zher2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zher2(CblasRowMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_zhpr2",11)==0) { + cblas_rout = "cblas_zhpr2"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zhpr2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zhpr2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zhpr2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zhpr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zhpr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_zhpr2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zhpr2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zhpr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zhpr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); + chkxer(); + } else if (strncmp( sf,"cblas_zher",10)==0) { + cblas_rout = "cblas_zher"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zher(INVALID, CblasUpper, 0, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zher(CblasColMajor, INVALID, 0, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zher(CblasColMajor, CblasUpper, INVALID, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zher(CblasColMajor, CblasUpper, 0, RALPHA, X, 0, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zher(CblasColMajor, CblasUpper, 2, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_zher(CblasRowMajor, INVALID, 0, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zher(CblasRowMajor, CblasUpper, INVALID, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zher(CblasRowMajor, CblasUpper, 0, RALPHA, X, 0, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zher(CblasRowMajor, CblasUpper, 2, RALPHA, X, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_zhpr",10)==0) { + cblas_rout = "cblas_zhpr"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zhpr(INVALID, CblasUpper, 0, RALPHA, X, 1, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zhpr(CblasColMajor, INVALID, 0, RALPHA, X, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zhpr(CblasColMajor, CblasUpper, INVALID, RALPHA, X, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zhpr(CblasColMajor, CblasUpper, 0, RALPHA, X, 0, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zhpr(CblasColMajor, INVALID, 0, RALPHA, X, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zhpr(CblasColMajor, CblasUpper, INVALID, RALPHA, X, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zhpr(CblasColMajor, CblasUpper, 0, RALPHA, X, 0, A ); + chkxer(); + } + if (cblas_ok == TRUE) + printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); + else + printf("******* %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); +} diff --git a/ctest/c_z3chke.c b/ctest/c_z3chke.c new file mode 100644 index 0000000..b58cb62 --- /dev/null +++ b/ctest/c_z3chke.c @@ -0,0 +1,1706 @@ +#include +#include +#include "common.h" +#include "cblas_test.h" + +int cblas_ok, cblas_lerr, cblas_info; +int link_xerbla=TRUE; +char *cblas_rout; + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo); +#else +void F77_xerbla(char *srname, void *vinfo); +#endif + +void chkxer(void) { + extern int cblas_ok, cblas_lerr, cblas_info; + extern int link_xerbla; + extern char *cblas_rout; + if (cblas_lerr == 1 ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); + cblas_ok = 0 ; + } + cblas_lerr = 1 ; +} + +void F77_z3chke(char * rout) { + char *sf = ( rout ) ; + double A[4] = {0.0,0.0,0.0,0.0}, + B[4] = {0.0,0.0,0.0,0.0}, + C[4] = {0.0,0.0,0.0,0.0}, + ALPHA[2] = {0.0,0.0}, + BETA[2] = {0.0,0.0}, + RALPHA = 0.0, RBETA = 0.0; + extern int cblas_info, cblas_lerr, cblas_ok; + extern int RowMajorStrg; + extern char *cblas_rout; + + cblas_ok = TRUE ; + cblas_lerr = PASSED ; + + if (link_xerbla) /* call these first to link */ + { + cblas_xerbla(cblas_info,cblas_rout,""); + F77_xerbla(cblas_rout,&cblas_info); + } + + if (strncmp( sf,"cblas_zgemm" ,11)==0) { + cblas_rout = "cblas_zgemm" ; + + cblas_info = 1; + cblas_zgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_zgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_zgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_zgemm( INVALID, CblasTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zhemm" ,11)==0) { + cblas_rout = "cblas_zhemm" ; + + cblas_info = 1; + cblas_zhemm( INVALID, CblasRight, CblasLower, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zsymm" ,11)==0) { + cblas_rout = "cblas_zsymm" ; + + cblas_info = 1; + cblas_zsymm( INVALID, CblasRight, CblasLower, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_ztrmm" ,11)==0) { + cblas_rout = "cblas_ztrmm" ; + + cblas_info = 1; + cblas_ztrmm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_ztrsm" ,11)==0) { + cblas_rout = "cblas_ztrsm" ; + + cblas_info = 1; + cblas_ztrsm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zherk" ,11)==0) { + cblas_rout = "cblas_zherk" ; + + cblas_info = 1; + cblas_zherk(INVALID, CblasUpper, CblasNoTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasConjTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasConjTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zsyrk" ,11)==0) { + cblas_rout = "cblas_zsyrk" ; + + cblas_info = 1; + cblas_zsyrk(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasConjTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zher2k" ,12)==0) { + cblas_rout = "cblas_zher2k" ; + + cblas_info = 1; + cblas_zher2k(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zsyr2k" ,12)==0) { + cblas_rout = "cblas_zsyr2k" ; + + cblas_info = 1; + cblas_zsyr2k(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } + + if (cblas_ok == 1 ) + printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); + else + printf("***** %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); +} diff --git a/ctest/c_zblas1.c b/ctest/c_zblas1.c new file mode 100644 index 0000000..0a36f33 --- /dev/null +++ b/ctest/c_zblas1.c @@ -0,0 +1,75 @@ +/* + * c_zblas1.c + * + * The program is a C wrapper for zcblat1. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "common.h" +#include "cblas_test.h" + +void F77_zaxpy(const int *N, const void *alpha, void *X, + const int *incX, void *Y, const int *incY) +{ + cblas_zaxpy(*N, alpha, X, *incX, Y, *incY); + return; +} + +void F77_zcopy(const int *N, void *X, const int *incX, + void *Y, const int *incY) +{ + cblas_zcopy(*N, X, *incX, Y, *incY); + return; +} + +void F77_zdotc(const int *N, const void *X, const int *incX, + const void *Y, const int *incY,void *dotc) +{ + cblas_zdotc_sub(*N, X, *incX, Y, *incY, dotc); + return; +} + +void F77_zdotu(const int *N, void *X, const int *incX, + void *Y, const int *incY,void *dotu) +{ + cblas_zdotu_sub(*N, X, *incX, Y, *incY, dotu); + return; +} + +void F77_zdscal(const int *N, const double *alpha, void *X, + const int *incX) +{ + cblas_zdscal(*N, *alpha, X, *incX); + return; +} + +void F77_zscal(const int *N, const void * *alpha, void *X, + const int *incX) +{ + cblas_zscal(*N, alpha, X, *incX); + return; +} + +void F77_zswap( const int *N, void *X, const int *incX, + void *Y, const int *incY) +{ + cblas_zswap(*N,X,*incX,Y,*incY); + return; +} + +int F77_izamax(const int *N, const void *X, const int *incX) +{ + if (*N < 1 || *incX < 1) return(0); + return(cblas_izamax(*N, X, *incX)+1); +} + +double F77_dznrm2(const int *N, const void *X, const int *incX) +{ + return cblas_dznrm2(*N, X, *incX); +} + +double F77_dzasum(const int *N, void *X, const int *incX) +{ + return cblas_dzasum(*N, X, *incX); +} diff --git a/ctest/c_zblas2.c b/ctest/c_zblas2.c new file mode 100644 index 0000000..6291abe --- /dev/null +++ b/ctest/c_zblas2.c @@ -0,0 +1,807 @@ +/* + * Written by D.P. Manley, Digital Equipment Corporation. + * Prefixed "C_" to BLAS routines and their declarations. + * + * Modified by T. H. Do, 4/08/98, SGI/CRAY Research. + */ +#include +#include "common.h" +#include "cblas_test.h" + +void F77_zgemv(int *order, char *transp, int *m, int *n, + const void *alpha, + CBLAS_TEST_ZOMPLEX *a, int *lda, const void *x, int *incx, + const void *beta, void *y, int *incy) { + + CBLAS_TEST_ZOMPLEX *A; + int i,j,LDA; + enum CBLAS_TRANSPOSE trans; + + get_transpose_type(transp, &trans); + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = (CBLAS_TEST_ZOMPLEX *)malloc( (*m)*LDA*sizeof( CBLAS_TEST_ZOMPLEX) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; + A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; + } + cblas_zgemv( CblasRowMajor, trans, *m, *n, alpha, A, LDA, x, *incx, + beta, y, *incy ); + free(A); + } + else if (*order == TEST_COL_MJR) + cblas_zgemv( CblasColMajor, trans, + *m, *n, alpha, a, *lda, x, *incx, beta, y, *incy ); + else + cblas_zgemv( UNDEFINED, trans, + *m, *n, alpha, a, *lda, x, *incx, beta, y, *incy ); +} + +void F77_zgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *x, int *incx, + CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *y, int *incy) { + + CBLAS_TEST_ZOMPLEX *A; + int i,j,irow,jcol,LDA; + enum CBLAS_TRANSPOSE trans; + + get_transpose_type(transp, &trans); + if (*order == TEST_ROW_MJR) { + LDA = *ku+*kl+2; + A=( CBLAS_TEST_ZOMPLEX* )malloc((*n+*kl)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*ku; i++ ){ + irow=*ku+*kl-i; + jcol=(*ku)-i; + for( j=jcol; j<*n; j++ ){ + A[ LDA*(j-jcol)+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*(j-jcol)+irow ].imag=a[ (*lda)*j+i ].imag; + } + } + i=*ku; + irow=*ku+*kl-i; + for( j=0; j<*n; j++ ){ + A[ LDA*j+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*j+i ].imag; + } + for( i=*ku+1; i<*ku+*kl+1; i++ ){ + irow=*ku+*kl-i; + jcol=i-(*ku); + for( j=jcol; j<(*n+*kl); j++ ){ + A[ LDA*j+irow ].real=a[ (*lda)*(j-jcol)+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*(j-jcol)+i ].imag; + } + } + cblas_zgbmv( CblasRowMajor, trans, *m, *n, *kl, *ku, alpha, A, LDA, x, + *incx, beta, y, *incy ); + free(A); + } + else if (*order == TEST_COL_MJR) + cblas_zgbmv( CblasColMajor, trans, *m, *n, *kl, *ku, alpha, a, *lda, x, + *incx, beta, y, *incy ); + else + cblas_zgbmv( UNDEFINED, trans, *m, *n, *kl, *ku, alpha, a, *lda, x, + *incx, beta, y, *incy ); +} + +void F77_zgeru(int *order, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, + CBLAS_TEST_ZOMPLEX *x, int *incx, CBLAS_TEST_ZOMPLEX *y, int *incy, + CBLAS_TEST_ZOMPLEX *a, int *lda){ + + CBLAS_TEST_ZOMPLEX *A; + int i,j,LDA; + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; + A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; + } + cblas_zgeru( CblasRowMajor, *m, *n, alpha, x, *incx, y, *incy, A, LDA ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + a[ (*lda)*j+i ].real=A[ LDA*i+j ].real; + a[ (*lda)*j+i ].imag=A[ LDA*i+j ].imag; + } + free(A); + } + else if (*order == TEST_COL_MJR) + cblas_zgeru( CblasColMajor, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); + else + cblas_zgeru( UNDEFINED, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); +} + +void F77_zgerc(int *order, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, + CBLAS_TEST_ZOMPLEX *x, int *incx, CBLAS_TEST_ZOMPLEX *y, int *incy, + CBLAS_TEST_ZOMPLEX *a, int *lda) { + CBLAS_TEST_ZOMPLEX *A; + int i,j,LDA; + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; + A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; + } + cblas_zgerc( CblasRowMajor, *m, *n, alpha, x, *incx, y, *incy, A, LDA ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + a[ (*lda)*j+i ].real=A[ LDA*i+j ].real; + a[ (*lda)*j+i ].imag=A[ LDA*i+j ].imag; + } + free(A); + } + else if (*order == TEST_COL_MJR) + cblas_zgerc( CblasColMajor, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); + else + cblas_zgerc( UNDEFINED, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); +} + +void F77_zhemv(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha, + CBLAS_TEST_ZOMPLEX *a, int *lda, CBLAS_TEST_ZOMPLEX *x, + int *incx, CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *y, int *incy){ + + CBLAS_TEST_ZOMPLEX *A; + int i,j,LDA; + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = (CBLAS_TEST_ZOMPLEX *)malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ){ + A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; + A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; + } + cblas_zhemv( CblasRowMajor, uplo, *n, alpha, A, LDA, x, *incx, + beta, y, *incy ); + free(A); + } + else if (*order == TEST_COL_MJR) + cblas_zhemv( CblasColMajor, uplo, *n, alpha, a, *lda, x, *incx, + beta, y, *incy ); + else + cblas_zhemv( UNDEFINED, uplo, *n, alpha, a, *lda, x, *incx, + beta, y, *incy ); +} + +void F77_zhbmv(int *order, char *uplow, int *n, int *k, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *x, int *incx, CBLAS_TEST_ZOMPLEX *beta, + CBLAS_TEST_ZOMPLEX *y, int *incy){ + +CBLAS_TEST_ZOMPLEX *A; +int i,irow,j,jcol,LDA; + + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*order == TEST_ROW_MJR) { + if (uplo != CblasUpper && uplo != CblasLower ) + cblas_zhbmv(CblasRowMajor, UNDEFINED, *n, *k, alpha, a, *lda, x, + *incx, beta, y, *incy ); + else { + LDA = *k+2; + A =(CBLAS_TEST_ZOMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + if (uplo == CblasUpper) { + for( i=0; i<*k; i++ ){ + irow=*k-i; + jcol=(*k)-i; + for( j=jcol; j<*n; j++ ) { + A[ LDA*(j-jcol)+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*(j-jcol)+irow ].imag=a[ (*lda)*j+i ].imag; + } + } + i=*k; + irow=*k-i; + for( j=0; j<*n; j++ ) { + A[ LDA*j+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*j+i ].imag; + } + } + else { + i=0; + irow=*k-i; + for( j=0; j<*n; j++ ) { + A[ LDA*j+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*j+i ].imag; + } + for( i=1; i<*k+1; i++ ){ + irow=*k-i; + jcol=i; + for( j=jcol; j<(*n+*k); j++ ) { + A[ LDA*j+irow ].real=a[ (*lda)*(j-jcol)+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*(j-jcol)+i ].imag; + } + } + } + cblas_zhbmv( CblasRowMajor, uplo, *n, *k, alpha, A, LDA, x, *incx, + beta, y, *incy ); + free(A); + } + } + else if (*order == TEST_COL_MJR) + cblas_zhbmv(CblasColMajor, uplo, *n, *k, alpha, a, *lda, x, *incx, + beta, y, *incy ); + else + cblas_zhbmv(UNDEFINED, uplo, *n, *k, alpha, a, *lda, x, *incx, + beta, y, *incy ); +} + +void F77_zhpmv(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha, + CBLAS_TEST_ZOMPLEX *ap, CBLAS_TEST_ZOMPLEX *x, int *incx, + CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *y, int *incy){ + + CBLAS_TEST_ZOMPLEX *A, *AP; + int i,j,k,LDA; + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + if (*order == TEST_ROW_MJR) { + if (uplo != CblasUpper && uplo != CblasLower ) + cblas_zhpmv(CblasRowMajor, UNDEFINED, *n, alpha, ap, x, *incx, + beta, y, *incy); + else { + LDA = *n; + A = (CBLAS_TEST_ZOMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX )); + AP = (CBLAS_TEST_ZOMPLEX* )malloc( (((LDA+1)*LDA)/2)* + sizeof( CBLAS_TEST_ZOMPLEX )); + if (uplo == CblasUpper) { + for( j=0, k=0; j<*n; j++ ) + for( i=0; i +#include "common.h" +#include "cblas_test.h" +#define TEST_COL_MJR 0 +#define TEST_ROW_MJR 1 +#define UNDEFINED -1 + +void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n, + int *k, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, + CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + + CBLAS_TEST_ZOMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_TRANSPOSE transa, transb; + + get_transpose_type(transpa, &transa); + get_transpose_type(transpb, &transb); + + if (*order == TEST_ROW_MJR) { + if (transa == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else { + LDA = *m+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*k; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + + if (transb == CblasNoTrans) { + LDB = *n+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_ZOMPLEX) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDB = *k+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zgemm( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA, + B, LDB, beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zgemm( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); + else + cblas_zgemm( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); +} +void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, + CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + + CBLAS_TEST_ZOMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_UPLO uplo; + enum CBLAS_SIDE side; + + get_uplo_type(uplow,&uplo); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A= (CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zhemm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, + beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zhemm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); + else + cblas_zhemm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); +} +void F77_zsymm(int *order, char *rtlf, char *uplow, int *m, int *n, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, + CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + + CBLAS_TEST_ZOMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_UPLO uplo; + enum CBLAS_SIDE side; + + get_uplo_type(uplow,&uplo); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDB = *n+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX )); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_zsymm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, + beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zsymm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); + else + cblas_zsymm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); +} + +void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k, + double *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + double *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + + int i,j,LDA,LDC; + CBLAS_TEST_ZOMPLEX *A, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zherk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta, + C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zherk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); + else + cblas_zherk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); +} + +void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + + int i,j,LDA,LDC; + CBLAS_TEST_ZOMPLEX *A, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zsyrk(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, beta, + C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zsyrk(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, beta, + c, *ldc ); + else + cblas_zsyrk(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, beta, + c, *ldc ); +} +void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *b, int *ldb, double *beta, + CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + int i,j,LDA,LDB,LDC; + CBLAS_TEST_ZOMPLEX *A, *B, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + LDB = *k+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX )); + B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_ZOMPLEX )); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDA = *n+1; + LDB = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc( LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) ); + B=(CBLAS_TEST_ZOMPLEX* )malloc( LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ){ + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zher2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, + B, LDB, *beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zher2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); + else + cblas_zher2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); +} +void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, + CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + int i,j,LDA,LDB,LDC; + CBLAS_TEST_ZOMPLEX *A, *B, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + LDB = *k+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDA = *n+1; + LDB = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); + B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ){ + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zsyr2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, + B, LDB, beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zsyr2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); + else + cblas_zsyr2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); +} +void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, + int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb) { + int i,j,LDA,LDB; + CBLAS_TEST_ZOMPLEX *A, *B; + enum CBLAS_SIDE side; + enum CBLAS_DIAG diag; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + cblas_ztrmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + b[j*(*ldb)+i].real=B[i*LDB+j].real; + b[j*(*ldb)+i].imag=B[i*LDB+j].imag; + } + free(A); + free(B); + } + else if (*order == TEST_COL_MJR) + cblas_ztrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); + else + cblas_ztrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); +} + +void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, + int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb) { + int i,j,LDA,LDB; + CBLAS_TEST_ZOMPLEX *A, *B; + enum CBLAS_SIDE side; + enum CBLAS_DIAG diag; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + cblas_ztrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + b[j*(*ldb)+i].real=B[i*LDB+j].real; + b[j*(*ldb)+i].imag=B[i*LDB+j].imag; + } + free(A); + free(B); + } + else if (*order == TEST_COL_MJR) + cblas_ztrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); + else + cblas_ztrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); +} diff --git a/ctest/c_zblat1.f b/ctest/c_zblat1.f new file mode 100644 index 0000000..03753e7 --- /dev/null +++ b/ctest/c_zblat1.f @@ -0,0 +1,682 @@ + PROGRAM ZCBLAT1 +* Test program for the COMPLEX*16 Level 1 CBLAS. +* Based upon the original CBLAS test routine together with: +* F06GAF Example Program Text +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SFAC + INTEGER IC +* .. External Subroutines .. + EXTERNAL CHECK1, CHECK2, HEADER +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SFAC/9.765625D-4/ +* .. Executable Statements .. + WRITE (NOUT,99999) + DO 20 IC = 1, 10 + ICASE = IC + CALL HEADER +* +* Initialize PASS, INCX, INCY, and MODE for a new case. +* The value 9999 for INCX, INCY or MODE will appear in the +* detailed output, if any, for cases that do not involve +* these parameters. +* + PASS = .TRUE. + INCX = 9999 + INCY = 9999 + MODE = 9999 + IF (ICASE.LE.5) THEN + CALL CHECK2(SFAC) + ELSE IF (ICASE.GE.6) THEN + CALL CHECK1(SFAC) + END IF +* -- Print + IF (PASS) WRITE (NOUT,99998) + 20 CONTINUE + STOP +* +99999 FORMAT (' Complex CBLAS Test Program Results',/1X) +99998 FORMAT (' ----- PASS -----') + END + SUBROUTINE HEADER +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Arrays .. + CHARACTER*15 L(10) +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA L(1)/'CBLAS_ZDOTC'/ + DATA L(2)/'CBLAS_ZDOTU'/ + DATA L(3)/'CBLAS_ZAXPY'/ + DATA L(4)/'CBLAS_ZCOPY'/ + DATA L(5)/'CBLAS_ZSWAP'/ + DATA L(6)/'CBLAS_DZNRM2'/ + DATA L(7)/'CBLAS_DZASUM'/ + DATA L(8)/'CBLAS_ZSCAL'/ + DATA L(9)/'CBLAS_ZDSCAL'/ + DATA L(10)/'CBLAS_IZAMAX'/ +* .. Executable Statements .. + WRITE (NOUT,99999) ICASE, L(ICASE) + RETURN +* +99999 FORMAT (/' Test of subprogram number',I3,9X,A15) + END + SUBROUTINE CHECK1(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + COMPLEX*16 CA + DOUBLE PRECISION SA + INTEGER I, J, LEN, NP1 +* .. Local Arrays .. + COMPLEX*16 CTRUE5(8,5,2), CTRUE6(8,5,2), CV(8,5,2), CX(8), + + MWPCS(5), MWPCT(5) + DOUBLE PRECISION STRUE2(5), STRUE4(5) + INTEGER ITRUE3(5) +* .. External Functions .. + DOUBLE PRECISION DZASUMTEST, DZNRM2TEST + INTEGER IZAMAXTEST + EXTERNAL DZASUMTEST, DZNRM2TEST, IZAMAXTEST +* .. External Subroutines .. + EXTERNAL ZSCALTEST, ZDSCALTEST, CTEST, ITEST1, STEST1 +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA, CA/0.3D0, (0.4D0,-0.7D0)/ + DATA ((CV(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (0.3D0,-0.4D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (0.1D0,-0.3D0), (0.5D0,-0.1D0), (5.0D0,6.0D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (0.1D0,0.1D0), + + (-0.6D0,0.1D0), (0.1D0,-0.3D0), (7.0D0,8.0D0), + + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + + (7.0D0,8.0D0), (0.3D0,0.1D0), (0.1D0,0.4D0), + + (0.4D0,0.1D0), (0.1D0,0.2D0), (2.0D0,3.0D0), + + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0)/ + DATA ((CV(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (0.3D0,-0.4D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (0.1D0,-0.3D0), (8.0D0,9.0D0), (0.5D0,-0.1D0), + + (2.0D0,5.0D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + + (2.0D0,5.0D0), (2.0D0,5.0D0), (0.1D0,0.1D0), + + (3.0D0,6.0D0), (-0.6D0,0.1D0), (4.0D0,7.0D0), + + (0.1D0,-0.3D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + + (7.0D0,2.0D0), (0.3D0,0.1D0), (5.0D0,8.0D0), + + (0.1D0,0.4D0), (6.0D0,9.0D0), (0.4D0,0.1D0), + + (8.0D0,3.0D0), (0.1D0,0.2D0), (9.0D0,4.0D0)/ + DATA STRUE2/0.0D0, 0.5D0, 0.6D0, 0.7D0, 0.7D0/ + DATA STRUE4/0.0D0, 0.7D0, 1.0D0, 1.3D0, 1.7D0/ + DATA ((CTRUE5(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (-0.16D0,-0.37D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (-0.17D0,-0.19D0), (0.13D0,-0.39D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + + (0.11D0,-0.03D0), (-0.17D0,0.46D0), + + (-0.17D0,-0.19D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + + (0.19D0,-0.17D0), (0.32D0,0.09D0), + + (0.23D0,-0.24D0), (0.18D0,0.01D0), + + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0), + + (2.0D0,3.0D0)/ + DATA ((CTRUE5(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (-0.16D0,-0.37D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (-0.17D0,-0.19D0), (8.0D0,9.0D0), + + (0.13D0,-0.39D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + + (2.0D0,5.0D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + + (0.11D0,-0.03D0), (3.0D0,6.0D0), + + (-0.17D0,0.46D0), (4.0D0,7.0D0), + + (-0.17D0,-0.19D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + + (7.0D0,2.0D0), (0.19D0,-0.17D0), (5.0D0,8.0D0), + + (0.32D0,0.09D0), (6.0D0,9.0D0), + + (0.23D0,-0.24D0), (8.0D0,3.0D0), + + (0.18D0,0.01D0), (9.0D0,4.0D0)/ + DATA ((CTRUE6(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (0.09D0,-0.12D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (0.03D0,-0.09D0), (0.15D0,-0.03D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + + (0.03D0,0.03D0), (-0.18D0,0.03D0), + + (0.03D0,-0.09D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + + (0.09D0,0.03D0), (0.03D0,0.12D0), + + (0.12D0,0.03D0), (0.03D0,0.06D0), (2.0D0,3.0D0), + + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0)/ + DATA ((CTRUE6(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (0.09D0,-0.12D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (0.03D0,-0.09D0), (8.0D0,9.0D0), + + (0.15D0,-0.03D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + + (2.0D0,5.0D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + + (0.03D0,0.03D0), (3.0D0,6.0D0), + + (-0.18D0,0.03D0), (4.0D0,7.0D0), + + (0.03D0,-0.09D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + + (7.0D0,2.0D0), (0.09D0,0.03D0), (5.0D0,8.0D0), + + (0.03D0,0.12D0), (6.0D0,9.0D0), (0.12D0,0.03D0), + + (8.0D0,3.0D0), (0.03D0,0.06D0), (9.0D0,4.0D0)/ + DATA ITRUE3/0, 1, 2, 2, 2/ +* .. Executable Statements .. + DO 60 INCX = 1, 2 + DO 40 NP1 = 1, 5 + N = NP1 - 1 + LEN = 2*MAX(N,1) +* .. Set vector arguments .. + DO 20 I = 1, LEN + CX(I) = CV(I,NP1,INCX) + 20 CONTINUE + IF (ICASE.EQ.6) THEN +* .. DZNRM2TEST .. + CALL STEST1(DZNRM2TEST(N,CX,INCX),STRUE2(NP1), + + STRUE2(NP1),SFAC) + ELSE IF (ICASE.EQ.7) THEN +* .. DZASUMTEST .. + CALL STEST1(DZASUMTEST(N,CX,INCX),STRUE4(NP1), + + STRUE4(NP1),SFAC) + ELSE IF (ICASE.EQ.8) THEN +* .. ZSCALTEST .. + CALL ZSCALTEST(N,CA,CX,INCX) + CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX), + + SFAC) + ELSE IF (ICASE.EQ.9) THEN +* .. ZDSCALTEST .. + CALL ZDSCALTEST(N,SA,CX,INCX) + CALL CTEST(LEN,CX,CTRUE6(1,NP1,INCX),CTRUE6(1,NP1,INCX), + + SFAC) + ELSE IF (ICASE.EQ.10) THEN +* .. IZAMAXTEST .. + CALL ITEST1(IZAMAXTEST(N,CX,INCX),ITRUE3(NP1)) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' + STOP + END IF +* + 40 CONTINUE + 60 CONTINUE +* + INCX = 1 + IF (ICASE.EQ.8) THEN +* ZSCALTEST +* Add a test for alpha equal to zero. + CA = (0.0D0,0.0D0) + DO 80 I = 1, 5 + MWPCT(I) = (0.0D0,0.0D0) + MWPCS(I) = (1.0D0,1.0D0) + 80 CONTINUE + CALL ZSCALTEST(5,CA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) + ELSE IF (ICASE.EQ.9) THEN +* ZDSCALTEST +* Add a test for alpha equal to zero. + SA = 0.0D0 + DO 100 I = 1, 5 + MWPCT(I) = (0.0D0,0.0D0) + MWPCS(I) = (1.0D0,1.0D0) + 100 CONTINUE + CALL ZDSCALTEST(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) +* Add a test for alpha equal to one. + SA = 1.0D0 + DO 120 I = 1, 5 + MWPCT(I) = CX(I) + MWPCS(I) = CX(I) + 120 CONTINUE + CALL ZDSCALTEST(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) +* Add a test for alpha equal to minus one. + SA = -1.0D0 + DO 140 I = 1, 5 + MWPCT(I) = -CX(I) + MWPCS(I) = -CX(I) + 140 CONTINUE + CALL ZDSCALTEST(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) + END IF + RETURN + END + SUBROUTINE CHECK2(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + COMPLEX*16 CA,ZTEMP + INTEGER I, J, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + COMPLEX*16 CDOT(1), CSIZE1(4), CSIZE2(7,2), CSIZE3(14), + + CT10X(7,4,4), CT10Y(7,4,4), CT6(4,4), CT7(4,4), + + CT8(7,4,4), CX(7), CX1(7), CY(7), CY1(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), NS(4) +* .. External Functions .. + EXTERNAL ZDOTCTEST, ZDOTUTEST +* .. External Subroutines .. + EXTERNAL ZAXPYTEST, ZCOPYTEST, ZSWAPTEST, CTEST +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA CA/(0.4D0,-0.7D0)/ + DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA CX1/(0.7D0,-0.8D0), (-0.4D0,-0.7D0), + + (-0.1D0,-0.9D0), (0.2D0,-0.8D0), + + (-0.9D0,-0.4D0), (0.1D0,0.4D0), (-0.6D0,0.6D0)/ + DATA CY1/(0.6D0,-0.6D0), (-0.9D0,0.5D0), + + (0.7D0,-0.6D0), (0.1D0,-0.5D0), (-0.1D0,-0.2D0), + + (-0.5D0,-0.3D0), (0.8D0,-0.7D0)/ + DATA ((CT8(I,J,1),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.32D0,-1.41D0), + + (-1.55D0,0.5D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.32D0,-1.41D0), (-1.55D0,0.5D0), + + (0.03D0,-0.89D0), (-0.38D0,-0.96D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/ + DATA ((CT8(I,J,2),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (-0.07D0,-0.89D0), + + (-0.9D0,0.5D0), (0.42D0,-1.41D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.78D0,0.06D0), (-0.9D0,0.5D0), + + (0.06D0,-0.13D0), (0.1D0,-0.5D0), + + (-0.77D0,-0.49D0), (-0.5D0,-0.3D0), + + (0.52D0,-1.51D0)/ + DATA ((CT8(I,J,3),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (-0.07D0,-0.89D0), + + (-1.18D0,-0.31D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.78D0,0.06D0), (-1.54D0,0.97D0), + + (0.03D0,-0.89D0), (-0.18D0,-1.31D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/ + DATA ((CT8(I,J,4),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.32D0,-1.41D0), (-0.9D0,0.5D0), + + (0.05D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.32D0,-1.41D0), + + (-0.9D0,0.5D0), (0.05D0,-0.6D0), (0.1D0,-0.5D0), + + (-0.77D0,-0.49D0), (-0.5D0,-0.3D0), + + (0.32D0,-1.16D0)/ + DATA CT7/(0.0D0,0.0D0), (-0.06D0,-0.90D0), + + (0.65D0,-0.47D0), (-0.34D0,-1.22D0), + + (0.0D0,0.0D0), (-0.06D0,-0.90D0), + + (-0.59D0,-1.46D0), (-1.04D0,-0.04D0), + + (0.0D0,0.0D0), (-0.06D0,-0.90D0), + + (-0.83D0,0.59D0), (0.07D0,-0.37D0), + + (0.0D0,0.0D0), (-0.06D0,-0.90D0), + + (-0.76D0,-1.15D0), (-1.33D0,-1.82D0)/ + DATA CT6/(0.0D0,0.0D0), (0.90D0,0.06D0), + + (0.91D0,-0.77D0), (1.80D0,-0.10D0), + + (0.0D0,0.0D0), (0.90D0,0.06D0), (1.45D0,0.74D0), + + (0.20D0,0.90D0), (0.0D0,0.0D0), (0.90D0,0.06D0), + + (-0.55D0,0.23D0), (0.83D0,-0.39D0), + + (0.0D0,0.0D0), (0.90D0,0.06D0), (1.04D0,0.79D0), + + (1.95D0,1.22D0)/ + DATA ((CT10X(I,J,1),I=1,7),J=1,4)/(0.7D0,-0.8D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.6D0,-0.6D0), (-0.9D0,0.5D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.6D0,-0.6D0), + + (-0.9D0,0.5D0), (0.7D0,-0.6D0), (0.1D0,-0.5D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/ + DATA ((CT10X(I,J,2),I=1,7),J=1,4)/(0.7D0,-0.8D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.7D0,-0.6D0), (-0.4D0,-0.7D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.8D0,-0.7D0), + + (-0.4D0,-0.7D0), (-0.1D0,-0.2D0), + + (0.2D0,-0.8D0), (0.7D0,-0.6D0), (0.1D0,0.4D0), + + (0.6D0,-0.6D0)/ + DATA ((CT10X(I,J,3),I=1,7),J=1,4)/(0.7D0,-0.8D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (-0.9D0,0.5D0), (-0.4D0,-0.7D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.1D0,-0.5D0), + + (-0.4D0,-0.7D0), (0.7D0,-0.6D0), (0.2D0,-0.8D0), + + (-0.9D0,0.5D0), (0.1D0,0.4D0), (0.6D0,-0.6D0)/ + DATA ((CT10X(I,J,4),I=1,7),J=1,4)/(0.7D0,-0.8D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.6D0,-0.6D0), (0.7D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.6D0,-0.6D0), + + (0.7D0,-0.6D0), (-0.1D0,-0.2D0), (0.8D0,-0.7D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/ + DATA ((CT10Y(I,J,1),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.7D0,-0.8D0), (-0.4D0,-0.7D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.7D0,-0.8D0), + + (-0.4D0,-0.7D0), (-0.1D0,-0.9D0), + + (0.2D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0)/ + DATA ((CT10Y(I,J,2),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (-0.1D0,-0.9D0), (-0.9D0,0.5D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (-0.6D0,0.6D0), + + (-0.9D0,0.5D0), (-0.9D0,-0.4D0), (0.1D0,-0.5D0), + + (-0.1D0,-0.9D0), (-0.5D0,-0.3D0), + + (0.7D0,-0.8D0)/ + DATA ((CT10Y(I,J,3),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (-0.1D0,-0.9D0), (0.7D0,-0.8D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (-0.6D0,0.6D0), + + (-0.9D0,-0.4D0), (-0.1D0,-0.9D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0)/ + DATA ((CT10Y(I,J,4),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.7D0,-0.8D0), (-0.9D0,0.5D0), + + (-0.4D0,-0.7D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.7D0,-0.8D0), + + (-0.9D0,0.5D0), (-0.4D0,-0.7D0), (0.1D0,-0.5D0), + + (-0.1D0,-0.9D0), (-0.5D0,-0.3D0), + + (0.2D0,-0.8D0)/ + DATA CSIZE1/(0.0D0,0.0D0), (0.9D0,0.9D0), + + (1.63D0,1.73D0), (2.90D0,2.78D0)/ + DATA CSIZE3/(0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (1.17D0,1.17D0), + + (1.17D0,1.17D0), (1.17D0,1.17D0), + + (1.17D0,1.17D0), (1.17D0,1.17D0), + + (1.17D0,1.17D0), (1.17D0,1.17D0)/ + DATA CSIZE2/(0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (1.54D0,1.54D0), + + (1.54D0,1.54D0), (1.54D0,1.54D0), + + (1.54D0,1.54D0), (1.54D0,1.54D0), + + (1.54D0,1.54D0), (1.54D0,1.54D0)/ +* .. Executable Statements .. + DO 60 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 40 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* .. initialize all argument arrays .. + DO 20 I = 1, 7 + CX(I) = CX1(I) + CY(I) = CY1(I) + 20 CONTINUE + IF (ICASE.EQ.1) THEN +* .. ZDOTCTEST .. + CALL ZDOTCTEST(N,CX,INCX,CY,INCY,ZTEMP) + CDOT(1) = ZTEMP + CALL CTEST(1,CDOT,CT6(KN,KI),CSIZE1(KN),SFAC) + ELSE IF (ICASE.EQ.2) THEN +* .. ZDOTUTEST .. + CALL ZDOTUTEST(N,CX,INCX,CY,INCY,ZTEMP) + CDOT(1) = ZTEMP + CALL CTEST(1,CDOT,CT7(KN,KI),CSIZE1(KN),SFAC) + ELSE IF (ICASE.EQ.3) THEN +* .. ZAXPYTEST .. + CALL ZAXPYTEST(N,CA,CX,INCX,CY,INCY) + CALL CTEST(LENY,CY,CT8(1,KN,KI),CSIZE2(1,KSIZE),SFAC) + ELSE IF (ICASE.EQ.4) THEN +* .. ZCOPYTEST .. + CALL ZCOPYTEST(N,CX,INCX,CY,INCY) + CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0D0) + ELSE IF (ICASE.EQ.5) THEN +* .. ZSWAPTEST .. + CALL ZSWAPTEST(N,CX,INCX,CY,INCY) + CALL CTEST(LENX,CX,CT10X(1,KN,KI),CSIZE3,1.0D0) + CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0D0) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' + STOP + END IF +* + 40 CONTINUE + 60 CONTINUE + RETURN + END + SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) +* ********************************* STEST ************************** +* +* THIS SUBR COMPARES ARRAYS SCOMP() AND STRUE() OF LENGTH LEN TO +* SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE +* NEGLIGIBLE. +* +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC + INTEGER LEN +* .. Array Arguments .. + DOUBLE PRECISION SCOMP(LEN), SSIZE(LEN), STRUE(LEN) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SD + INTEGER I +* .. External Functions .. + DOUBLE PRECISION SDIFF + EXTERNAL SDIFF +* .. Intrinsic Functions .. + INTRINSIC ABS +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + DO 40 I = 1, LEN + SD = SCOMP(I) - STRUE(I) + IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0D0) + + GO TO 40 +* +* HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I), + + STRUE(I), SD, SSIZE(I) + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE I ', + + ' COMP(I) TRUE(I) DIFFERENCE', + + ' SIZE(I)',/1X) +99997 FORMAT (1X,I4,I3,3I5,I3,2D36.8,2D12.4) + END + SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) +* ************************* STEST1 ***************************** +* +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE +* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + DOUBLE PRECISION SCOMP1, SFAC, STRUE1 +* .. Array Arguments .. + DOUBLE PRECISION SSIZE(*) +* .. Local Arrays .. + DOUBLE PRECISION SCOMP(1), STRUE(1) +* .. External Subroutines .. + EXTERNAL STEST +* .. Executable Statements .. +* + SCOMP(1) = SCOMP1 + STRUE(1) = STRUE1 + CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC) +* + RETURN + END + DOUBLE PRECISION FUNCTION SDIFF(SA,SB) +* ********************************* SDIFF ************************** +* COMPUTES DIFFERENCE OF TWO NUMBERS. C. L. LAWSON, JPL 1974 FEB 15 +* +* .. Scalar Arguments .. + DOUBLE PRECISION SA, SB +* .. Executable Statements .. + SDIFF = SA - SB + RETURN + END + SUBROUTINE CTEST(LEN,CCOMP,CTRUE,CSIZE,SFAC) +* **************************** CTEST ***************************** +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC + INTEGER LEN +* .. Array Arguments .. + COMPLEX*16 CCOMP(LEN), CSIZE(LEN), CTRUE(LEN) +* .. Local Scalars .. + INTEGER I +* .. Local Arrays .. + DOUBLE PRECISION SCOMP(20), SSIZE(20), STRUE(20) +* .. External Subroutines .. + EXTERNAL STEST +* .. Intrinsic Functions .. + INTRINSIC DIMAG, DBLE +* .. Executable Statements .. + DO 20 I = 1, LEN + SCOMP(2*I-1) = DBLE(CCOMP(I)) + SCOMP(2*I) = DIMAG(CCOMP(I)) + STRUE(2*I-1) = DBLE(CTRUE(I)) + STRUE(2*I) = DIMAG(CTRUE(I)) + SSIZE(2*I-1) = DBLE(CSIZE(I)) + SSIZE(2*I) = DIMAG(CSIZE(I)) + 20 CONTINUE +* + CALL STEST(2*LEN,SCOMP,STRUE,SSIZE,SFAC) + RETURN + END + SUBROUTINE ITEST1(ICOMP,ITRUE) +* ********************************* ITEST1 ************************* +* +* THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR +* EQUALITY. +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + INTEGER ICOMP, ITRUE +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER ID +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. + IF (ICOMP.EQ.ITRUE) GO TO 40 +* +* HERE ICOMP IS NOT EQUAL TO ITRUE. +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 ID = ICOMP - ITRUE + WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE ', + + ' COMP TRUE DIFFERENCE', + + /1X) +99997 FORMAT (1X,I4,I3,3I5,2I36,I12) + END diff --git a/ctest/c_zblat2.f b/ctest/c_zblat2.f new file mode 100644 index 0000000..236088f --- /dev/null +++ b/ctest/c_zblat2.f @@ -0,0 +1,2939 @@ + PROGRAM ZBLAT2 +* +* Test program for the COMPLEX*16 Level 2 Blas. +* +* The program must be driven by a short data file. The first 17 records +* of the file are read using list-directed input, the last 17 records +* are read using the format ( A12, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 34 lines: +* 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 4 NUMBER OF VALUES OF K +* 0 1 2 4 VALUES OF K +* 4 NUMBER OF VALUES OF INCX AND INCY +* 1 2 -1 -2 VALUES OF INCX AND INCY +* 3 NUMBER OF VALUES OF ALPHA +* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +* cblas_zgemv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zgbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zhemv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zhbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zhpmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ztrmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ztbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ztpmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ztrsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ztbsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ztpsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zgerc T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zgeru T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zher T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zhpr T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zher2 T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zhpr2 T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +* An extended set of Fortran Basic Linear Algebra Subprograms. +* +* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +* and Computer Science Division, Argonne National Laboratory, +* 9700 South Cass Avenue, Argonne, Illinois 60439, US. +* +* Or +* +* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +* +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 17 ) + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO, RHALF, RONE + PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 ) + INTEGER NMAX, INCMAX + PARAMETER ( NMAX = 65, INCMAX = 2 ) + INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX + PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, + $ NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + DOUBLE PRECISION EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB, + $ NTRA, LAYOUT + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR, CORDER, RORDER + CHARACTER*1 TRANS + CHARACTER*12 SNAMET + CHARACTER*32 SNAPS +* .. Local Arrays .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( 2*NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*12 SNAMES( NSUBS ) +* .. External Functions .. + DOUBLE PRECISION DDIFF + LOGICAL LZE + EXTERNAL DDIFF, LZE +* .. External Subroutines .. + EXTERNAL ZCHK1, ZCHK2, ZCHK3, ZCHK4, ZCHK5, ZCHK6, + $ CZ2CHKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK + CHARACTER*12 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'cblas_zgemv ', 'cblas_zgbmv ', + $ 'cblas_zhemv ','cblas_zhbmv ','cblas_zhpmv ', + $ 'cblas_ztrmv ','cblas_ztbmv ','cblas_ztpmv ', + $ 'cblas_ztrsv ','cblas_ztbsv ','cblas_ztpsv ', + $ 'cblas_zgerc ','cblas_zgeru ','cblas_zher ', + $ 'cblas_zhpr ','cblas_zher2 ','cblas_zhpr2 '/ +* .. Executable Statements .. +* + NOUTC = NOUT +* +* Read name and unit number for summary output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the flag that indicates whether row-major data layout to be tested. + READ( NIN, FMT = * )LAYOUT +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 230 + END IF + 10 CONTINUE +* Values of K + READ( NIN, FMT = * )NKB + IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN + WRITE( NOUT, FMT = 9997 )'K', NKBMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( KB( I ), I = 1, NKB ) + DO 20 I = 1, NKB + IF( KB( I ).LT.0 )THEN + WRITE( NOUT, FMT = 9995 ) + GO TO 230 + END IF + 20 CONTINUE +* Values of INCX and INCY + READ( NIN, FMT = * )NINC + IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN + WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( INC( I ), I = 1, NINC ) + DO 30 I = 1, NINC + IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN + WRITE( NOUT, FMT = 9994 )INCMAX + GO TO 230 + END IF + 30 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9993 ) + WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB ) + WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC ) + WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9980 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) + RORDER = .FALSE. + CORDER = .FALSE. + IF (LAYOUT.EQ.2) THEN + RORDER = .TRUE. + CORDER = .TRUE. + WRITE( *, FMT = 10002 ) + ELSE IF (LAYOUT.EQ.1) THEN + RORDER = .TRUE. + WRITE( *, FMT = 10001 ) + ELSE IF (LAYOUT.EQ.0) THEN + CORDER = .TRUE. + WRITE( *, FMT = 10000 ) + END IF + WRITE( *, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 40 I = 1, NSUBS + LTEST( I ) = .FALSE. + 40 CONTINUE + 50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT + DO 60 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 70 + 60 CONTINUE + WRITE( NOUT, FMT = 9986 )SNAMET + STOP + 70 LTEST( I ) = LTESTT + GO TO 50 +* + 80 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = RONE + 90 CONTINUE + IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO ) + $ GO TO 100 + EPS = RHALF*EPS + GO TO 90 + 100 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of ZMVCH using exact data. +* + N = MIN( 32, NMAX ) + DO 120 J = 1, N + DO 110 I = 1, N + A( I, J ) = MAX( I - J + 1, 0 ) + 110 CONTINUE + X( J ) = J + Y( J ) = ZERO + 120 CONTINUE + DO 130 J = 1, N + YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE +* YY holds the exact result. On exit from CMVCH YT holds +* the result computed by CMVCH. + TRANS = 'N' + CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF + TRANS = 'T' + CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 210 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL CZ2CHKE( SNAMES( ISNUM ) ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 140, 150, 150, 150, 160, 160, + $ 160, 160, 160, 160, 170, 170, 180, + $ 180, 190, 190 )ISNUM +* Test ZGEMV, 01, and ZGBMV, 02. + 140 IF (CORDER) THEN + CALL ZCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 1 ) + END IF + GO TO 200 +* Test ZHEMV, 03, ZHBMV, 04, and ZHPMV, 05. + 150 IF (CORDER) THEN + CALL ZCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 1 ) + END IF + GO TO 200 +* Test ZTRMV, 06, ZTBMV, 07, ZTPMV, 08, +* ZTRSV, 09, ZTBSV, 10, and ZTPSV, 11. + 160 IF (CORDER) THEN + CALL ZCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, + $ 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, + $ 1 ) + END IF + GO TO 200 +* Test ZGERC, 12, ZGERU, 13. + 170 IF (CORDER) THEN + CALL ZCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF + GO TO 200 +* Test ZHER, 14, and ZHPR, 15. + 180 IF (CORDER) THEN + CALL ZCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF + GO TO 200 +* Test ZHER2, 16, and ZHPR2, 17. + 190 IF (CORDER) THEN + CALL ZCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF +* + 200 IF( FATAL.AND.SFATAL ) + $ GO TO 220 + END IF + 210 CONTINUE + WRITE( NOUT, FMT = 9982 ) + GO TO 240 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9981 ) + GO TO 240 +* + 230 CONTINUE + WRITE( NOUT, FMT = 9987 ) +* + 240 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* +10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) +10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) +10000 FORMAT( ' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) + 9999 FORMAT(' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) + 9997 FORMAT(' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' ) + 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ', + $ I2 ) + 9993 FORMAT(' TESTS OF THE COMPLEX*16 LEVEL 2 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9992 FORMAT( ' FOR N ', 9I6 ) + 9991 FORMAT( ' FOR K ', 7I6 ) + 9990 FORMAT( ' FOR INCX AND INCY ', 7I6 ) + 9989 FORMAT( ' FOR ALPHA ', + $ 7('(', F4.1, ',', F4.1, ') ', : ) ) + 9988 FORMAT( ' FOR BETA ', + $ 7('(', F4.1, ',', F4.1, ') ', : ) ) + 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9986 FORMAT(' SUBPROGRAM NAME ',A12, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9985 FORMAT(' ERROR IN CMVCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' CMVCH WAS CALLED WITH TRANS = ', A1, + $ ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', / + $ ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.' + $ , /' ******* TESTS ABANDONED *******' ) + 9984 FORMAT( A12, L2 ) + 9983 FORMAT( 1X,A12, ' WAS NOT TESTED' ) + 9982 FORMAT( /' END OF TESTS' ) + 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of ZBLAT2. +* + END + SUBROUTINE ZCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G, IORDER ) +* +* Tests CGEMV and CGBMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, HALF + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ HALF = ( 0.5D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BLS, TRANSL + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA, + $ LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK, + $ NL, NS + LOGICAL BANDED, FULL, NULL, RESET, SAME, TRAN + CHARACTER*1 TRANS, TRANSS + CHARACTER*14 CTRANS + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZGBMV, CZGEMV, ZMAKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + BANDED = SNAME( 9: 9 ).EQ.'b' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 11 + ELSE IF( BANDED )THEN + NARGS = 13 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IKU = 1, NK + IF( BANDED )THEN + KU = KB( IKU ) + KL = MAX( KU - 1, 0 ) + ELSE + KU = N - 1 + KL = M - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = KL + KU + 1 + ELSE + LDA = M + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE( SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, AA, + $ LDA, KL, KU, RESET, TRANSL ) +* + DO 90 IC = 1, 3 + TRANS = ICH( IC: IC ) + IF (TRANS.EQ.'N')THEN + CTRANS = ' CblasNoTrans' + ELSE IF (TRANS.EQ.'T')THEN + CTRANS = ' CblasTrans' + ELSE + CTRANS = 'CblasConjTrans' + END IF + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' +* + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*NL +* +* Generate the vector X. +* + TRANSL = HALF + CALL ZMAKE( 'ge', ' ', ' ', 1, NL, X, 1, XX, + $ ABS( INCX ), 0, NL - 1, RESET, TRANSL ) + IF( NL.GT.1 )THEN + X( NL/2 ) = ZERO + XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*ML +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL ZMAKE( 'ge', ' ', ' ', 1, ML, Y, 1, + $ YY, ABS( INCY ), 0, ML - 1, + $ RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANSS = TRANS + MS = M + NS = N + KLS = KL + KUS = KU + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CTRANS, M, N, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL CZGEMV( IORDER, TRANS, M, N, + $ ALPHA, AA, LDA, XX, INCX, + $ BETA, YY, INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CTRANS, M, N, KL, KU, ALPHA, LDA, + $ INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CZGBMV( IORDER, TRANS, M, N, KL, + $ KU, ALPHA, AA, LDA, XX, + $ INCX, BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 130 + END IF +* +* See what data changed inside subroutines. +* +* IF(TRANS .NE. 'C' .OR. (INCX .GT. 0 .AND. INCY .GT. 0)) THEN + ISAME( 1 ) = TRANS.EQ.TRANSS + ISAME( 2 ) = MS.EQ.M + ISAME( 3 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LZE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LZE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LZE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LZERES( 'ge', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 4 ) = KLS.EQ.KL + ISAME( 5 ) = KUS.EQ.KU + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LZE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LZE( XS, XX, LX ) + ISAME( 10 ) = INCXS.EQ.INCX + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LZE( YS, YY, LY ) + ELSE + ISAME( 12 ) = LZERES( 'ge', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 13 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 130 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL ZMVCH( TRANS, M, N, ALPHA, A, + $ NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 130 + ELSE +* Avoid repeating tests with M.le.0 or +* N.le.0. + GO TO 110 + END IF +* END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 140 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CTRANS, M, N, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CTRANS, M, N, KL, KU, + $ ALPHA, LDA, INCX, BETA, INCY + END IF +* + 140 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 4( I3, ',' ), '(', + $ F4.1, ',', F4.1, '), A,',/ 10x, I3, ', X,', I2, ',(', + $ F4.1, ',', F4.1, '), Y,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), '(', + $ F4.1, ',', F4.1, '), A,',/ 10x, I3, ', X,', I2, ',(', + $ F4.1, ',', F4.1, '), Y,', I2, ') .' ) + 9993 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK1. +* + END + SUBROUTINE ZCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G, IORDER ) +* +* Tests CHEMV, CHBMV and CHPMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, HALF + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ HALF = ( 0.5D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BLS, TRANSL + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IB, IC, IK, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY, + $ N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZHBMV, CZHEMV, CZHPMV, ZMAKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + BANDED = SNAME( 9: 9 ).EQ.'b' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 10 + ELSE IF( BANDED )THEN + NARGS = 11 + ELSE IF( PACKED )THEN + NARGS = 9 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, AA, + $ LDA, K, K, RESET, TRANSL ) +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL ZMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL ZMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + UPLOS = UPLO + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, N, ALPHA, LDA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CZHEMV( IORDER, UPLO, N, ALPHA, AA, + $ LDA, XX, INCX, BETA, YY, + $ INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, N, K, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL CZHBMV( IORDER, UPLO, N, K, ALPHA, + $ AA, LDA, XX, INCX, BETA, + $ YY, INCY ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, N, ALPHA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CZHPMV( IORDER, UPLO, N, ALPHA, AA, + $ XX, INCX, BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LZE( AS, AA, LAA ) + ISAME( 5 ) = LDAS.EQ.LDA + ISAME( 6 ) = LZE( XS, XX, LX ) + ISAME( 7 ) = INCXS.EQ.INCX + ISAME( 8 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LZE( YS, YY, LY ) + ELSE + ISAME( 9 ) = LZERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 10 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 3 ) = KS.EQ.K + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LZE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LZE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LZE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LZERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( PACKED )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LZE( AS, AA, LAA ) + ISAME( 5 ) = LZE( XS, XX, LX ) + ISAME( 6 ) = INCXS.EQ.INCX + ISAME( 7 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 8 ) = LZE( YS, YY, LY ) + ELSE + ISAME( 8 ) = LZERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 9 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL ZMVCH( 'N', N, N, ALPHA, A, NMAX, X, + $ INCX, BETA, Y, INCY, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0 + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, LDA, INCX, + $ BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, K, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, N, ALPHA, INCX, + $ BETA, INCY + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', + $ F4.1, '), AP, X,',/ 10x, I2, ',(', F4.1, ',', F4.1, + $ '), Y,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), '(', + $ F4.1, ',', F4.1, '), A,', I3, ', X,',/ 10x, I2, ',(', + $ F4.1, ',', F4.1, '), Y,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', + $ F4.1, '), A,', I3, ', X,',/ 10x, I2, ',(', F4.1, ',', + $ F4.1, '), ', 'Y,', I2, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CZHK2. +* + END + SUBROUTINE ZCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, XT, G, Z, IORDER ) +* +* Tests ZTRMV, ZTBMV, ZTPMV, ZTRSV, ZTBSV and ZTPSV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ HALF = ( 0.5D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XT( NMAX ), XX( NMAX*INCMAX ), Z( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + COMPLEX*16 TRANSL + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K, + $ KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS + CHARACTER*14 CUPLO,CTRANS,CDIAG + CHARACTER*2 ICHD, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZMAKE, ZMVCH, CZTBMV, CZTBSV, CZTPMV, + $ CZTPSV, CZTRMV, CZTRSV +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'r' + BANDED = SNAME( 9: 9 ).EQ.'b' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 8 + ELSE IF( BANDED )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 7 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* Set up zero vector for ZMVCH. + DO 10 I = 1, NMAX + Z( I ) = ZERO + 10 CONTINUE +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + IF (TRANS.EQ.'N')THEN + CTRANS = ' CblasNoTrans' + ELSE IF (TRANS.EQ.'T')THEN + CTRANS = ' CblasTrans' + ELSE + CTRANS = 'CblasConjTrans' + END IF +* + DO 70 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) + IF (DIAG.EQ.'N')THEN + CDIAG = ' CblasNonUnit' + ELSE + CDIAG = ' CblasUnit' + END IF +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE( SNAME( 8: 9 ), UPLO, DIAG, N, N, A, + $ NMAX, AA, LDA, K, K, RESET, TRANSL ) +* + DO 60 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL ZMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, + $ TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + DIAGS = DIAG + NS = N + KS = K + DO 20 I = 1, LAA + AS( I ) = AA( I ) + 20 CONTINUE + LDAS = LDA + DO 30 I = 1, LX + XS( I ) = XX( I ) + 30 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( SNAME( 4: 5 ).EQ.'mv' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CZTRMV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, LDA, XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CZTBMV( IORDER, UPLO, TRANS, DIAG, + $ N, K, AA, LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL CZTPMV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, XX, INCX ) + END IF + ELSE IF( SNAME( 4: 5 ).EQ.'sv' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CZTRSV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, LDA, XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CZTBSV( IORDER, UPLO, TRANS, DIAG, + $ N, K, AA, LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL CZTPSV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, XX, INCX ) + END IF + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = TRANS.EQ.TRANSS + ISAME( 3 ) = DIAG.EQ.DIAGS + ISAME( 4 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 5 ) = LZE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 7 ) = LZE( XS, XX, LX ) + ELSE + ISAME( 7 ) = LZERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 8 ) = INCXS.EQ.INCX + ELSE IF( BANDED )THEN + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = LZE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 8 ) = LZE( XS, XX, LX ) + ELSE + ISAME( 8 ) = LZERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 9 ) = INCXS.EQ.INCX + ELSE IF( PACKED )THEN + ISAME( 5 ) = LZE( AS, AA, LAA ) + IF( NULL )THEN + ISAME( 6 ) = LZE( XS, XX, LX ) + ELSE + ISAME( 6 ) = LZERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 7 ) = INCXS.EQ.INCX + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 4: 5 ).EQ.'mv' )THEN +* +* Check the result. +* + CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, + $ INCX, ZERO, Z, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE IF( SNAME( 4: 5 ).EQ.'sv' )THEN +* +* Compute approximation to original vector. +* + DO 50 I = 1, N + Z( I ) = XX( 1 + ( I - 1 )* + $ ABS( INCX ) ) + XX( 1 + ( I - 1 )*ABS( INCX ) ) + $ = X( I ) + 50 CONTINUE + CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, Z, + $ INCX, ZERO, X, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .FALSE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0. + GO TO 110 + END IF +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, + $ LDA, INCX + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, K, + $ LDA, INCX + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, + $ INCX + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT(1X, I6, ': ',A12, '(', 3( A14, ',' ),/ 10x, I3, ', AP, ', + $ 'X,', I2, ') .' ) + 9994 FORMAT(1X, I6, ': ',A12, '(', 3( A14, ',' ),/ 10x, 2( I3, ',' ), + $ ' A,', I3, ', X,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', 3( A14, ',' ),/ 10x, I3, ', A,', + $ I3, ', X,', I2, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK3. +* + END + SUBROUTINE ZCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests ZGERC and ZGERU. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ HALF = ( 0.5D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, TRANSL + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS, + $ NC, ND, NS + LOGICAL CONJ, NULL, RESET, SAME +* .. Local Arrays .. + COMPLEX*16 W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZGERC, CZGERU, ZMAKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, DCONJG, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Executable Statements .. + CONJ = SNAME( 5: 5 ).EQ.'c' +* Define the number of arguments. + NARGS = 9 +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* +* Set LDA to 1 more than minimum value if room. + LDA = M + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 100 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*M +* +* Generate the vector X. +* + TRANSL = HALF + CALL ZMAKE( 'ge', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ), + $ 0, M - 1, RESET, TRANSL ) + IF( M.GT.1 )THEN + X( M/2 ) = ZERO + XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO + END IF +* + DO 90 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL ZMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE(SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, + $ AA, LDA, M - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N, + $ ALPHA, INCX, INCY, LDA + IF( CONJ )THEN + IF( REWI ) + $ REWIND NTRA + CALL CZGERC( IORDER, M, N, ALPHA, XX, INCX, + $ YY, INCY, AA, LDA ) + ELSE + IF( REWI ) + $ REWIND NTRA + CALL CZGERU( IORDER, M, N, ALPHA, XX, INCX, + $ YY, INCY, AA, LDA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 140 + END IF +* +* See what data changed inside subroutine. +* + ISAME( 1 ) = MS.EQ.M + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LZE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LZE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LZE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LZERES( 'ge', ' ', M, N, AS, AA, + $ LDA ) + END IF + ISAME( 9 ) = LDAS.EQ.LDA +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 140 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, M + Z( I ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, M + Z( I ) = X( M - I + 1 ) + 60 CONTINUE + END IF + DO 70 J = 1, N + IF( INCY.GT.0 )THEN + W( 1 ) = Y( J ) + ELSE + W( 1 ) = Y( N - J + 1 ) + END IF + IF( CONJ ) + $ W( 1 ) = DCONJG( W( 1 ) ) + CALL ZMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1, + $ ONE, A( 1, J ), 1, YT, G, + $ AA( 1 + ( J - 1 )*LDA ), EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 130 + 70 CONTINUE + ELSE +* Avoid repeating tests with M.le.0 or N.le.0. + GO TO 110 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 150 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 140 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA +* + 150 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ',A12, '(', 2( I3, ',' ), '(', F4.1, ',', F4.1, + $ '), X,', I2, ', Y,', I2, ', A,', I3, ') .' ) + 9993 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK4. +* + END + SUBROUTINE ZCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests ZHER and ZHPR. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ HALF = ( 0.5D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, TRANSL + DOUBLE PRECISION ERR, ERRMAX, RALPHA, RALS + INTEGER I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA, + $ LDA, LDAS, LJ, LX, N, NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + COMPLEX*16 W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZHER, CZHPR, ZMAKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, DCMPLX, DCONJG, MAX, DBLE +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 7 + ELSE IF( PACKED )THEN + NARGS = 6 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF + UPPER = UPLO.EQ.'U' +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL ZMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IA = 1, NALF + RALPHA = DBLE( ALF( IA ) ) + ALPHA = DCMPLX( RALPHA, RZERO ) + NULL = N.LE.0.OR.RALPHA.EQ.RZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, + $ AA, LDA, N - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + RALS = RALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, + $ RALPHA, INCX, LDA + IF( REWI ) + $ REWIND NTRA + CALL CZHER( IORDER, UPLO, N, RALPHA, XX, + $ INCX, AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, + $ RALPHA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CZHPR( IORDER, UPLO, N, RALPHA, + $ XX, INCX, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = RALS.EQ.RALPHA + ISAME( 4 ) = LZE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + IF( NULL )THEN + ISAME( 6 ) = LZE( AS, AA, LAA ) + ELSE + ISAME( 6 ) = LZERES( SNAME( 8: 9 ), UPLO, N, N, AS, + $ AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 7 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 40 I = 1, N + Z( I ) = X( I ) + 40 CONTINUE + ELSE + DO 50 I = 1, N + Z( I ) = X( N - I + 1 ) + 50 CONTINUE + END IF + JA = 1 + DO 60 J = 1, N + W( 1 ) = DCONJG( Z( J ) ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL ZMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W, + $ 1, ONE, A( JJ, J ), 1, YT, G, + $ AA( JA ), EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 110 + 60 CONTINUE + ELSE +* Avoid repeating tests if N.le.0. + IF( N.LE.0 ) + $ GO TO 100 + END IF +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, RALPHA, INCX, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, RALPHA, INCX + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', AP) .' ) + 9993 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', A,', I3, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CZHK5. +* + END + SUBROUTINE ZCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests ZHER2 and ZHPR2. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ HALF = ( 0.5D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX, 2 ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, TRANSL + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N, + $ NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + COMPLEX*16 W( 2 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZHER2, CZHPR2, ZMAKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, DCONJG, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 8 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 140 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 140 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 130 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF + UPPER = UPLO.EQ.'U' +* + DO 120 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL ZMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 110 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL ZMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 100 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, + $ NMAX, AA, LDA, N - 1, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL CZHER2( IORDER, UPLO, N, ALPHA, XX, INCX, + $ YY, INCY, AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, INCY + IF( REWI ) + $ REWIND NTRA + CALL CZHPR2( IORDER, UPLO, N, ALPHA, XX, INCX, + $ YY, INCY, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 160 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LZE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LZE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LZE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LZERES( SNAME( 8: 9 ), UPLO, N, N, + $ AS, AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 9 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 160 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, N + Z( I, 1 ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, N + Z( I, 1 ) = X( N - I + 1 ) + 60 CONTINUE + END IF + IF( INCY.GT.0 )THEN + DO 70 I = 1, N + Z( I, 2 ) = Y( I ) + 70 CONTINUE + ELSE + DO 80 I = 1, N + Z( I, 2 ) = Y( N - I + 1 ) + 80 CONTINUE + END IF + JA = 1 + DO 90 J = 1, N + W( 1 ) = ALPHA*DCONJG( Z( J, 2 ) ) + W( 2 ) = DCONJG( ALPHA )*DCONJG( Z( J, 1 ) ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL ZMVCH( 'N', LJ, 2, ONE, Z( JJ, 1 ), + $ NMAX, W, 1, ONE, A( JJ, J ), 1, + $ YT, G, AA( JA ), EPS, ERR, FATAL, + $ NOUT, .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 150 + 90 CONTINUE + ELSE +* Avoid repeating tests with N.le.0. + IF( N.LE.0 ) + $ GO TO 140 + END IF +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 170 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 160 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, INCX, + $ INCY, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, ALPHA, INCX, INCY + END IF +* + 170 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', + $ F4.1, '), X,', I2, ', Y,', I2, ', AP) .' ) + 9993 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', + $ F4.1, '), X,', I2, ', Y,', I2, ', A,', I3, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK6. +* + END + SUBROUTINE ZMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO, RONE + PARAMETER ( RZERO = 0.0D0, RONE = 1.0D0 ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA, BETA + DOUBLE PRECISION EPS, ERR + INTEGER INCX, INCY, M, N, NMAX, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANS +* .. Array Arguments .. + COMPLEX*16 A( NMAX, * ), X( * ), Y( * ), YT( * ), YY( * ) + DOUBLE PRECISION G( * ) +* .. Local Scalars .. + COMPLEX*16 C + DOUBLE PRECISION ERRI + INTEGER I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL + LOGICAL CTRAN, TRAN +* .. Intrinsic Functions .. + INTRINSIC ABS, DIMAG, DCONJG, MAX, DBLE, SQRT +* .. Statement Functions .. + DOUBLE PRECISION ABS1 +* .. Statement Function definitions .. + ABS1( C ) = ABS( DBLE( C ) ) + ABS( DIMAG( C ) ) +* .. Executable Statements .. + TRAN = TRANS.EQ.'T' + CTRAN = TRANS.EQ.'C' + IF( TRAN.OR.CTRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF + IF( INCX.LT.0 )THEN + KX = NL + INCXL = -1 + ELSE + KX = 1 + INCXL = 1 + END IF + IF( INCY.LT.0 )THEN + KY = ML + INCYL = -1 + ELSE + KY = 1 + INCYL = 1 + END IF +* +* Compute expected result in YT using data in A, X and Y. +* Compute gauges in G. +* + IY = KY + DO 40 I = 1, ML + YT( IY ) = ZERO + G( IY ) = RZERO + JX = KX + IF( TRAN )THEN + DO 10 J = 1, NL + YT( IY ) = YT( IY ) + A( J, I )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 10 CONTINUE + ELSE IF( CTRAN )THEN + DO 20 J = 1, NL + YT( IY ) = YT( IY ) + DCONJG( A( J, I ) )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 20 CONTINUE + ELSE + DO 30 J = 1, NL + YT( IY ) = YT( IY ) + A( I, J )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( I, J ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 30 CONTINUE + END IF + YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY ) + G( IY ) = ABS1( ALPHA )*G( IY ) + ABS1( BETA )*ABS1( Y( IY ) ) + IY = IY + INCYL + 40 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 50 I = 1, ML + ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS + IF( G( I ).NE.RZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.RONE ) + $ GO TO 60 + 50 CONTINUE +* If the loop completes, all results are at least half accurate. + GO TO 80 +* +* Report fatal error. +* + 60 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 70 I = 1, ML + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, YT( I ), + $ YY( 1 + ( I - 1 )*ABS( INCY ) ) + ELSE + WRITE( NOUT, FMT = 9998 )I, + $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT( I ) + END IF + 70 CONTINUE +* + 80 CONTINUE + RETURN +* + 9999 FORMAT(' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RE', + $ 'SULT COMPUTED RESULT' ) + 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) +* +* End of ZMVCH. +* + END + LOGICAL FUNCTION LZE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + COMPLEX*16 RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LZE = .TRUE. + GO TO 30 + 20 CONTINUE + LZE = .FALSE. + 30 RETURN +* +* End of LZE. +* + END + LOGICAL FUNCTION LZERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'ge', 'he' or 'hp'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX*16 AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'ge' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'he' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LZERES = .TRUE. + GO TO 80 + 70 CONTINUE + LZERES = .FALSE. + 80 RETURN +* +* End of LZERES. +* + END + COMPLEX*16 FUNCTION ZBEG( RESET ) +* +* Generates complex numbers as pairs of random numbers uniformly +* distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, J, MI, MJ +* .. Save statement .. + SAVE I, IC, J, MI, MJ +* .. Intrinsic Functions .. + INTRINSIC DCMPLX +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + MJ = 457 + I = 7 + J = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I or J is bounded between 1 and 999. +* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. +* If initial I or J = 4 or 8, the period will be 25. +* If initial I or J = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I or J +* in 6. +* + IC = IC + 1 + 10 I = I*MI + J = J*MJ + I = I - 1000*( I/1000 ) + J = J - 1000*( J/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + ZBEG = DCMPLX( ( I - 500 )/1001.0, ( J - 500 )/1001.0 ) + RETURN +* +* End of ZBEG. +* + END + DOUBLE PRECISION FUNCTION DDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* +* .. Scalar Arguments .. + DOUBLE PRECISION X, Y +* .. Executable Statements .. + DDIFF = X - Y + RETURN +* +* End of DDIFF. +* + END + SUBROUTINE ZMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL, + $ KU, RESET, TRANSL ) +* +* Generates values for an M by N matrix A within the bandwidth +* defined by KL and KU. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'ge', 'gb', 'he', 'hb', 'hp', 'tr', 'tb' OR 'tp'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + COMPLEX*16 ROGUE + PARAMETER ( ROGUE = ( -1.0D10, 1.0D10 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) + DOUBLE PRECISION RROGUE + PARAMETER ( RROGUE = -1.0D10 ) +* .. Scalar Arguments .. + COMPLEX*16 TRANSL + INTEGER KL, KU, LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX*16 A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, I1, I2, I3, IBEG, IEND, IOFF, J, JJ, KK + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + COMPLEX*16 ZBEG + EXTERNAL ZBEG +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, DCONJG, MAX, MIN, DBLE +* .. Executable Statements .. + GEN = TYPE( 1: 1 ).EQ.'g' + SYM = TYPE( 1: 1 ).EQ.'h' + TRI = TYPE( 1: 1 ).EQ.'t' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + IF( ( I.LE.J.AND.J - I.LE.KU ).OR. + $ ( I.GE.J.AND.I - J.LE.KL ) )THEN + A( I, J ) = ZBEG( RESET ) + TRANSL + ELSE + A( I, J ) = ZERO + END IF + IF( I.NE.J )THEN + IF( SYM )THEN + A( J, I ) = DCONJG( A( I, J ) ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( SYM ) + $ A( J, J ) = DCMPLX( DBLE( A( J, J ) ), RZERO ) + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'ge' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'gb' )THEN + DO 90 J = 1, N + DO 60 I1 = 1, KU + 1 - J + AA( I1 + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J ) + AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J ) + 70 CONTINUE + DO 80 I3 = I2, LDA + AA( I3 + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'tr' )THEN + DO 130 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 100 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 100 CONTINUE + DO 110 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 110 CONTINUE + DO 120 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 120 CONTINUE + IF( SYM )THEN + JJ = J + ( J - 1 )*LDA + AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE ) + END IF + 130 CONTINUE + ELSE IF( TYPE.EQ.'hb'.OR.TYPE.EQ.'tb' )THEN + DO 170 J = 1, N + IF( UPPER )THEN + KK = KL + 1 + IBEG = MAX( 1, KL + 2 - J ) + IF( UNIT )THEN + IEND = KL + ELSE + IEND = KL + 1 + END IF + ELSE + KK = 1 + IF( UNIT )THEN + IBEG = 2 + ELSE + IBEG = 1 + END IF + IEND = MIN( KL + 1, 1 + M - J ) + END IF + DO 140 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 140 CONTINUE + DO 150 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J ) + 150 CONTINUE + DO 160 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 160 CONTINUE + IF( SYM )THEN + JJ = KK + ( J - 1 )*LDA + AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE ) + END IF + 170 CONTINUE + ELSE IF( TYPE.EQ.'hp'.OR.TYPE.EQ.'tp' )THEN + IOFF = 0 + DO 190 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 180 I = IBEG, IEND + IOFF = IOFF + 1 + AA( IOFF ) = A( I, J ) + IF( I.EQ.J )THEN + IF( UNIT ) + $ AA( IOFF ) = ROGUE + IF( SYM ) + $ AA( IOFF ) = DCMPLX( DBLE( AA( IOFF ) ), RROGUE ) + END IF + 180 CONTINUE + 190 CONTINUE + END IF + RETURN +* +* End of ZMAKE. +* + END diff --git a/ctest/c_zblat3.f b/ctest/c_zblat3.f new file mode 100644 index 0000000..6e9dbbd --- /dev/null +++ b/ctest/c_zblat3.f @@ -0,0 +1,2791 @@ + PROGRAM ZBLAT3 +* +* Test program for the COMPLEX*16 Level 3 Blas. +* +* The program must be driven by a short data file. The first 13 records +* of the file are read using list-directed input, the last 9 records +* are read using the format ( A12,L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 22 lines: +* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 3 NUMBER OF VALUES OF ALPHA +* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +* ZGEMM T PUT F FOR NO TEST. SAME COLUMNS. +* ZHEMM T PUT F FOR NO TEST. SAME COLUMNS. +* ZSYMM T PUT F FOR NO TEST. SAME COLUMNS. +* ZTRMM T PUT F FOR NO TEST. SAME COLUMNS. +* ZTRSM T PUT F FOR NO TEST. SAME COLUMNS. +* ZHERK T PUT F FOR NO TEST. SAME COLUMNS. +* ZSYRK T PUT F FOR NO TEST. SAME COLUMNS. +* ZHER2K T PUT F FOR NO TEST. SAME COLUMNS. +* ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +* A Set of Level 3 Basic Linear Algebra Subprograms. +* +* Technical Memorandum No.88 (Revision 1), Mathematics and +* Computer Science Division, Argonne National Laboratory, 9700 +* South Cass Avenue, Argonne, Illinois 60439, US. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 9 ) + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO, RHALF, RONE + PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 ) + INTEGER NMAX + PARAMETER ( NMAX = 65 ) + INTEGER NIDMAX, NALMAX, NBEMAX + PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + DOUBLE PRECISION EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NTRA, + $ LAYOUT + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR, CORDER, RORDER + CHARACTER*1 TRANSA, TRANSB + CHARACTER*12 SNAMET + CHARACTER*32 SNAPS +* .. Local Arrays .. + COMPLEX*16 AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), + $ BB( NMAX*NMAX ), BET( NBEMAX ), + $ BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ W( 2*NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*12 SNAMES( NSUBS ) +* .. External Functions .. + DOUBLE PRECISION DDIFF + LOGICAL LZE + EXTERNAL DDIFF, LZE +* .. External Subroutines .. + EXTERNAL ZCHK1, ZCHK2, ZCHK3, ZCHK4, ZCHK5,ZMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK + CHARACTER*12 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'cblas_zgemm ', 'cblas_zhemm ', + $ 'cblas_zsymm ', 'cblas_ztrmm ', 'cblas_ztrsm ', + $ 'cblas_zherk ', 'cblas_zsyrk ', 'cblas_zher2k', + $ 'cblas_zsyr2k'/ +* .. Executable Statements .. +* + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the flag that indicates whether row-major data layout to be tested. + READ( NIN, FMT = * )LAYOUT +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 220 + END IF + 10 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9995 ) + WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9984 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) + + RORDER = .FALSE. + CORDER = .FALSE. + IF (LAYOUT.EQ.2) THEN + RORDER = .TRUE. + CORDER = .TRUE. + WRITE( *, FMT = 10002 ) + ELSE IF (LAYOUT.EQ.1) THEN + RORDER = .TRUE. + WRITE( *, FMT = 10001 ) + ELSE IF (LAYOUT.EQ.0) THEN + CORDER = .TRUE. + WRITE( *, FMT = 10000 ) + END IF + WRITE( *, FMT = * ) + +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 20 I = 1, NSUBS + LTEST( I ) = .FALSE. + 20 CONTINUE + 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT + DO 40 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 50 + 40 CONTINUE + WRITE( NOUT, FMT = 9990 )SNAMET + STOP + 50 LTEST( I ) = LTESTT + GO TO 30 +* + 60 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = RONE + 70 CONTINUE + IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO ) + $ GO TO 80 + EPS = RHALF*EPS + GO TO 70 + 80 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of ZMMCH using exact data. +* + N = MIN( 32, NMAX ) + DO 100 J = 1, N + DO 90 I = 1, N + AB( I, J ) = MAX( I - J + 1, 0 ) + 90 CONTINUE + AB( J, NMAX + 1 ) = J + AB( 1, NMAX + J ) = J + C( J, 1 ) = ZERO + 100 CONTINUE + DO 110 J = 1, N + CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 110 CONTINUE +* CC holds the exact result. On exit from ZMMCH CT holds +* the result computed by ZMMCH. + TRANSA = 'N' + TRANSB = 'N' + CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'C' + CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + DO 120 J = 1, N + AB( J, NMAX + 1 ) = N - J + 1 + AB( 1, NMAX + J ) = N - J + 1 + 120 CONTINUE + DO 130 J = 1, N + CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - + $ ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE + TRANSA = 'C' + TRANSB = 'N' + CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'C' + CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 200 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL CZ3CHKE( SNAMES( ISNUM ) ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 150, 150, 160, 160, 170, 170, + $ 180, 180 )ISNUM +* Test ZGEMM, 01. + 140 IF (CORDER) THEN + CALL ZCHK1(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK1(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test ZHEMM, 02, ZSYMM, 03. + 150 IF (CORDER) THEN + CALL ZCHK2(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK2(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test ZTRMM, 04, ZTRSM, 05. + 160 IF (CORDER) THEN + CALL ZCHK3(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, + $ 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK3(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, + $ 1 ) + END IF + GO TO 190 +* Test ZHERK, 06, ZSYRK, 07. + 170 IF (CORDER) THEN + CALL ZCHK4(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK4(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test ZHER2K, 08, ZSYR2K, 09. + 180 IF (CORDER) THEN + CALL ZCHK5(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK5(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ 1 ) + END IF + GO TO 190 +* + 190 IF( FATAL.AND.SFATAL ) + $ GO TO 210 + END IF + 200 CONTINUE + WRITE( NOUT, FMT = 9986 ) + GO TO 230 +* + 210 CONTINUE + WRITE( NOUT, FMT = 9985 ) + GO TO 230 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9991 ) +* + 230 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* +10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) +10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' ) +10000 FORMAT(' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) + 9999 FORMAT(' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT(' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) + 9997 FORMAT(' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT('TESTS OF THE COMPLEX*16 LEVEL 3 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9994 FORMAT( ' FOR N ', 9I6 ) + 9993 FORMAT( ' FOR ALPHA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9992 FORMAT( ' FOR BETA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9990 FORMAT(' SUBPROGRAM NAME ', A12,' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9989 FORMAT(' ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' ZMMCH WAS CALLED WITH TRANSA = ', A1, + $ 'AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', + $ ' ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', + $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', + $ '*******' ) + 9988 FORMAT( A12,L2 ) + 9987 FORMAT( 1X, A12,' WAS NOT TESTED' ) + 9986 FORMAT( /' END OF TESTS' ) + 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of ZBLAT3. +* + END + SUBROUTINE ZCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests ZGEMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BLS + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, + $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, + $ MA, MB, MS, N, NA, NARGS, NB, NC, NS + LOGICAL NULL, RESET, SAME, TRANA, TRANB + CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZGEMM, ZMAKE, ZMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. +* + NARGS = 13 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 110 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICA = 1, 3 + TRANSA = ICH( ICA: ICA ) + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' +* + IF( TRANA )THEN + MA = K + NA = M + ELSE + MA = M + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL ZMAKE( 'ge', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICB = 1, 3 + TRANSB = ICH( ICB: ICB ) + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* + IF( TRANB )THEN + MB = N + NB = K + ELSE + MB = K + NB = N + END IF +* Set LDB to 1 more than minimum value if room. + LDB = MB + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 70 + LBB = LDB*NB +* +* Generate the matrix B. +* + CALL ZMAKE( 'ge', ' ', ' ', MB, NB, B, NMAX, BB, + $ LDB, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL ZMAKE( 'ge', ' ', ' ', M, N, C, NMAX, + $ CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANAS = TRANSA + TRANBS = TRANSB + MS = M + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL ZPRCN1(NTRA, NC, SNAME, IORDER, + $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, + $ LDB, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CZGEMM( IORDER, TRANSA, TRANSB, M, N, + $ K, ALPHA, AA, LDA, BB, LDB, + $ BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = TRANSA.EQ.TRANAS + ISAME( 2 ) = TRANSB.EQ.TRANBS + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LZE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LZE( BS, BB, LBB ) + ISAME( 10 ) = LDBS.EQ.LDB + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LZE( CS, CC, LCC ) + ELSE + ISAME( 12 ) = LZERES( 'ge', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 13 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL ZMMCH( TRANSA, TRANSB, M, N, K, + $ ALPHA, A, NMAX, B, NMAX, BETA, + $ C, NMAX, CT, G, CC, LDC, EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL ZPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, + $ M, N, K, ALPHA, LDA, LDB, BETA, LDC) +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A12,'(''', A1, ''',''', A1, ''',', + $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, + $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) + 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK1. +* + END +* + SUBROUTINE ZPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, M, N, + $ K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, M, N, K, LDA, LDB, LDC + DOUBLE COMPLEX ALPHA, BETA + CHARACTER*1 TRANSA, TRANSB + CHARACTER*12 SNAME + CHARACTER*14 CRC, CTA,CTB + + IF (TRANSA.EQ.'N')THEN + CTA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CTA = ' CblasTrans' + ELSE + CTA = 'CblasConjTrans' + END IF + IF (TRANSB.EQ.'N')THEN + CTB = ' CblasNoTrans' + ELSE IF (TRANSB.EQ.'T')THEN + CTB = ' CblasTrans' + ELSE + CTB = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CTA,CTB + WRITE(NOUT, FMT = 9994)M, N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 10X, 3( I3, ',' ) ,' (', F4.1,',',F4.1,') , A,', + $ I3, ', B,', I3, ', (', F4.1,',',F4.1,') , C,', I3, ').' ) + END +* + SUBROUTINE ZCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests ZHEMM and ZSYMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BLS + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, + $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, + $ NARGS, NC, NS + LOGICAL CONJ, LEFT, NULL, RESET, SAME + CHARACTER*1 SIDE, SIDES, UPLO, UPLOS + CHARACTER*2 ICHS, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZHEMM, ZMAKE, ZMMCH, CZSYMM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHS/'LR'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 8: 9 ).EQ.'he' +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 90 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 90 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 90 + LBB = LDB*N +* +* Generate the matrix B. +* + CALL ZMAKE( 'ge', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, + $ ZERO ) +* + DO 80 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' +* + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* +* Generate the hermitian or symmetric matrix A. +* + CALL ZMAKE(SNAME( 8: 9 ), UPLO, ' ', NA, NA, A, NMAX, + $ AA, LDA, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL ZMAKE( 'ge', ' ', ' ', M, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL ZPRCN2(NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, M, N, ALPHA, LDA, LDB, + $ BETA, LDC) + IF( REWI ) + $ REWIND NTRA + IF( CONJ )THEN + CALL CZHEMM( IORDER, SIDE, UPLO, M, N, + $ ALPHA, AA, LDA, BB, LDB, BETA, + $ CC, LDC ) + ELSE + CALL CZSYMM( IORDER, SIDE, UPLO, M, N, + $ ALPHA, AA, LDA, BB, LDB, BETA, + $ CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 110 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LZE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LZE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LZE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LZERES( 'ge', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 110 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL ZMMCH( 'N', 'N', M, N, M, ALPHA, A, + $ NMAX, B, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL ZMMCH( 'N', 'N', M, N, N, ALPHA, B, + $ NMAX, A, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 120 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL ZPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, ALPHA, LDA, + $ LDB, BETA, LDC) +* + 120 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, + $ ',', F4.1, '), C,', I3, ') .' ) + 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK2. +* + END +* + SUBROUTINE ZPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, + $ ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, M, N, LDA, LDB, LDC + DOUBLE COMPLEX ALPHA, BETA + CHARACTER*1 SIDE, UPLO + CHARACTER*12 SNAME + CHARACTER*14 CRC, CS,CU + + IF (SIDE.EQ.'L')THEN + CS = ' CblasLeft' + ELSE + CS = ' CblasRight' + END IF + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU + WRITE(NOUT, FMT = 9994)M, N, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 10X, 2( I3, ',' ),' (',F4.1,',',F4.1, '), A,', I3, + $ ', B,', I3, ', (',F4.1,',',F4.1, '), ', 'C,', I3, ').' ) + END +* + SUBROUTINE ZCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, + $ B, BB, BS, CT, G, C, IORDER ) +* +* Tests ZTRMM and ZTRSM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CT( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, + $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, + $ NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, + $ UPLOS + CHARACTER*2 ICHD, ICHS, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZMAKE, ZMMCH, CZTRMM, CZTRSM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ +* .. Executable Statements .. +* + NARGS = 11 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* Set up zero matrix for ZMMCH. + DO 20 J = 1, NMAX + DO 10 I = 1, NMAX + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE +* + DO 140 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 130 + LBB = LDB*N + NULL = M.LE.0.OR.N.LE.0 +* + DO 120 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 130 + LAA = LDA*NA +* + DO 110 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 100 ICT = 1, 3 + TRANSA = ICHT( ICT: ICT ) +* + DO 90 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + CALL ZMAKE( 'tr', UPLO, DIAG, NA, NA, A, + $ NMAX, AA, LDA, RESET, ZERO ) +* +* Generate the matrix B. +* + CALL ZMAKE( 'ge', ' ', ' ', M, N, B, NMAX, + $ BB, LDB, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + TRANAS = TRANSA + DIAGS = DIAG + MS = M + NS = N + ALS = ALPHA + DO 30 I = 1, LAA + AS( I ) = AA( I ) + 30 CONTINUE + LDAS = LDA + DO 40 I = 1, LBB + BS( I ) = BB( I ) + 40 CONTINUE + LDBS = LDB +* +* Call the subroutine. +* + IF( SNAME( 10: 11 ).EQ.'mm' )THEN + IF( TRACE ) + $ CALL ZPRCN3( NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB) + IF( REWI ) + $ REWIND NTRA + CALL CZTRMM(IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, AA, LDA, + $ BB, LDB ) + ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN + IF( TRACE ) + $ CALL ZPRCN3( NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB) + IF( REWI ) + $ REWIND NTRA + CALL CZTRSM(IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, AA, LDA, + $ BB, LDB ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = TRANAS.EQ.TRANSA + ISAME( 4 ) = DIAGS.EQ.DIAG + ISAME( 5 ) = MS.EQ.M + ISAME( 6 ) = NS.EQ.N + ISAME( 7 ) = ALS.EQ.ALPHA + ISAME( 8 ) = LZE( AS, AA, LAA ) + ISAME( 9 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 10 ) = LZE( BS, BB, LBB ) + ELSE + ISAME( 10 ) = LZERES( 'ge', ' ', M, N, BS, + $ BB, LDB ) + END IF + ISAME( 11 ) = LDBS.EQ.LDB +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 50 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 50 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 10: 11 ).EQ.'mm' )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL ZMMCH( TRANSA, 'N', M, N, M, + $ ALPHA, A, NMAX, B, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL ZMMCH( 'N', TRANSA, M, N, N, + $ ALPHA, B, NMAX, A, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN +* +* Compute approximation to original +* matrix. +* + DO 70 J = 1, N + DO 60 I = 1, M + C( I, J ) = BB( I + ( J - 1 )* + $ LDB ) + BB( I + ( J - 1 )*LDB ) = ALPHA* + $ B( I, J ) + 60 CONTINUE + 70 CONTINUE +* + IF( LEFT )THEN + CALL ZMMCH( TRANSA, 'N', M, N, M, + $ ONE, A, NMAX, C, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + ELSE + CALL ZMMCH( 'N', TRANSA, M, N, N, + $ ONE, C, NMAX, A, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + END IF + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 150 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, + $ M, N, ALPHA, LDA, LDB) +* + 160 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT(' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT(1X, I6, ': ', A12,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', + $ ' .' ) + 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK3. +* + END +* + SUBROUTINE ZPRCN3(NOUT, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, LDA, LDB) + INTEGER NOUT, NC, IORDER, M, N, LDA, LDB + DOUBLE COMPLEX ALPHA + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + CHARACTER*12 SNAME + CHARACTER*14 CRC, CS, CU, CA, CD + + IF (SIDE.EQ.'L')THEN + CS = ' CblasLeft' + ELSE + CS = ' CblasRight' + END IF + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (DIAG.EQ.'N')THEN + CD = ' CblasNonUnit' + ELSE + CD = ' CblasUnit' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU + WRITE(NOUT, FMT = 9994)CA, CD, M, N, ALPHA, LDA, LDB + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 10X, 2( A14, ',') , 2( I3, ',' ), ' (', F4.1, ',', + $ F4.1, '), A,', I3, ', B,', I3, ').' ) + END +* + SUBROUTINE ZCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests ZHERK and ZSYRK. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RONE, RZERO + PARAMETER ( RONE = 1.0D0, RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BETS + DOUBLE PRECISION ERR, ERRMAX, RALPHA, RALS, RBETA, RBETS + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, + $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, + $ NARGS, NC, NS + LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS + CHARACTER*2 ICHT, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZHERK, ZMAKE, ZMMCH, CZSYRK +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, MAX, DBLE +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NC'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 8: 9 ).EQ.'he' +* + NARGS = 10 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICT = 1, 2 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'C' + IF( TRAN.AND..NOT.CONJ ) + $ TRANS = 'T' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL ZMAKE( 'ge', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) + IF( CONJ )THEN + RALPHA = DBLE( ALPHA ) + ALPHA = DCMPLX( RALPHA, RZERO ) + END IF +* + DO 50 IB = 1, NBET + BETA = BET( IB ) + IF( CONJ )THEN + RBETA = DBLE( BETA ) + BETA = DCMPLX( RBETA, RZERO ) + END IF + NULL = N.LE.0 + IF( CONJ ) + $ NULL = NULL.OR.( ( K.LE.0.OR.RALPHA.EQ. + $ RZERO ).AND.RBETA.EQ.RONE ) +* +* Generate the matrix C. +* + CALL ZMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, C, + $ NMAX, CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + IF( CONJ )THEN + RALS = RALPHA + ELSE + ALS = ALPHA + END IF + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + IF( CONJ )THEN + RBETS = RBETA + ELSE + BETS = BETA + END IF + DO 20 I = 1, LCC + CS( I ) = CC( I ) + 20 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( CONJ )THEN + IF( TRACE ) + $ CALL ZPRCN6( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, RALPHA, LDA, RBETA, + $ LDC) + IF( REWI ) + $ REWIND NTRA + CALL CZHERK( IORDER, UPLO, TRANS, N, K, + $ RALPHA, AA, LDA, RBETA, CC, + $ LDC ) + ELSE + IF( TRACE ) + $ CALL ZPRCN4( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, ALPHA, LDA, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CZSYRK( IORDER, UPLO, TRANS, N, K, + $ ALPHA, AA, LDA, BETA, CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + IF( CONJ )THEN + ISAME( 5 ) = RALS.EQ.RALPHA + ELSE + ISAME( 5 ) = ALS.EQ.ALPHA + END IF + ISAME( 6 ) = LZE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( CONJ )THEN + ISAME( 8 ) = RBETS.EQ.RBETA + ELSE + ISAME( 8 ) = BETS.EQ.BETA + END IF + IF( NULL )THEN + ISAME( 9 ) = LZE( CS, CC, LCC ) + ELSE + ISAME( 9 ) = LZERES( SNAME( 8: 9 ), UPLO, N, + $ N, CS, CC, LDC ) + END IF + ISAME( 10 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( CONJ )THEN + TRANST = 'C' + ELSE + TRANST = 'T' + END IF + JC = 1 + DO 40 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + CALL ZMMCH( TRANST, 'N', LJ, 1, K, + $ ALPHA, A( 1, JJ ), NMAX, + $ A( 1, J ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL ZMMCH( 'N', TRANST, LJ, 1, K, + $ ALPHA, A( JJ, 1 ), NMAX, + $ A( J, 1 ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + 40 CONTINUE + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( CONJ )THEN + CALL ZPRCN6( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, RALPHA, + $ LDA, rBETA, LDC) + ELSE + CALL ZPRCN4( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA, + $ LDA, BETA, LDC) + END IF +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, + $ '), C,', I3, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK4. +* + END +* + SUBROUTINE ZPRCN4(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDC + DOUBLE COMPLEX ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1 ,'), A,', + $ I3, ', (', F4.1,',', F4.1, '), C,', I3, ').' ) + END +* +* + SUBROUTINE ZPRCN6(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDC + DOUBLE PRECISION ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ').' ) + END +* + SUBROUTINE ZCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ IORDER ) +* +* Tests ZHER2K and ZSYR2K. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RONE, RZERO + PARAMETER ( RONE = 1.0D0, RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX*16 AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), + $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), + $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ W( 2*NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BETS + DOUBLE PRECISION ERR, ERRMAX, RBETA, RBETS + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, + $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, + $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS + LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS + CHARACTER*2 ICHT, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZHER2K, ZMAKE, ZMMCH, CZSYR2K +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, DCONJG, MAX, DBLE +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NC'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 8: 9 ).EQ.'he' +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 130 + LCC = LDC*N +* + DO 120 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 110 ICT = 1, 2 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'C' + IF( TRAN.AND..NOT.CONJ ) + $ TRANS = 'T' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*NA +* +* Generate the matrix A. +* + IF( TRAN )THEN + CALL ZMAKE( 'ge', ' ', ' ', MA, NA, AB, 2*NMAX, AA, + $ LDA, RESET, ZERO ) + ELSE + CALL ZMAKE( 'ge', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, + $ RESET, ZERO ) + END IF +* +* Generate the matrix B. +* + LDB = LDA + LBB = LAA + IF( TRAN )THEN + CALL ZMAKE( 'ge', ' ', ' ', MA, NA, AB( K + 1 ), + $ 2*NMAX, BB, LDB, RESET, ZERO ) + ELSE + CALL ZMAKE( 'ge', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), + $ NMAX, BB, LDB, RESET, ZERO ) + END IF +* + DO 100 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 90 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 80 IB = 1, NBET + BETA = BET( IB ) + IF( CONJ )THEN + RBETA = DBLE( BETA ) + BETA = DCMPLX( RBETA, RZERO ) + END IF + NULL = N.LE.0 + IF( CONJ ) + $ NULL = NULL.OR.( ( K.LE.0.OR.ALPHA.EQ. + $ ZERO ).AND.RBETA.EQ.RONE ) +* +* Generate the matrix C. +* + CALL ZMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, C, + $ NMAX, CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + IF( CONJ )THEN + RBETS = RBETA + ELSE + BETS = BETA + END IF + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( CONJ )THEN + IF( TRACE ) + $ CALL ZPRCN7( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, ALPHA, LDA, LDB, + $ RBETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CZHER2K( IORDER, UPLO, TRANS, N, K, + $ ALPHA, AA, LDA, BB, LDB, RBETA, + $ CC, LDC ) + ELSE + IF( TRACE ) + $ CALL ZPRCN5( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, ALPHA, LDA, LDB, + $ BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CZSYR2K( IORDER, UPLO, TRANS, N, K, + $ ALPHA, AA, LDA, BB, LDB, BETA, + $ CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LZE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LZE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + IF( CONJ )THEN + ISAME( 10 ) = RBETS.EQ.RBETA + ELSE + ISAME( 10 ) = BETS.EQ.BETA + END IF + IF( NULL )THEN + ISAME( 11 ) = LZE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LZERES( 'he', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( CONJ )THEN + TRANST = 'C' + ELSE + TRANST = 'T' + END IF + JJAB = 1 + JC = 1 + DO 70 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + DO 50 I = 1, K + W( I ) = ALPHA*AB( ( J - 1 )*2* + $ NMAX + K + I ) + IF( CONJ )THEN + W( K + I ) = DCONJG( ALPHA )* + $ AB( ( J - 1 )*2* + $ NMAX + I ) + ELSE + W( K + I ) = ALPHA* + $ AB( ( J - 1 )*2* + $ NMAX + I ) + END IF + 50 CONTINUE + CALL ZMMCH( TRANST, 'N', LJ, 1, 2*K, + $ ONE, AB( JJAB ), 2*NMAX, W, + $ 2*NMAX, BETA, C( JJ, J ), + $ NMAX, CT, G, CC( JC ), LDC, + $ EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE + DO 60 I = 1, K + IF( CONJ )THEN + W( I ) = ALPHA*DCONJG( AB( ( K + + $ I - 1 )*NMAX + J ) ) + W( K + I ) = DCONJG( ALPHA* + $ AB( ( I - 1 )*NMAX + + $ J ) ) + ELSE + W( I ) = ALPHA*AB( ( K + I - 1 )* + $ NMAX + J ) + W( K + I ) = ALPHA* + $ AB( ( I - 1 )*NMAX + + $ J ) + END IF + 60 CONTINUE + CALL ZMMCH( 'N', 'N', LJ, 1, 2*K, ONE, + $ AB( JJ ), NMAX, W, 2*NMAX, + $ BETA, C( JJ, J ), NMAX, CT, + $ G, CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + IF( TRAN ) + $ JJAB = JJAB + 2*NMAX + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 140 + 70 CONTINUE + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( CONJ )THEN + CALL ZPRCN7( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, + $ ALPHA, LDA, LDB, RBETA, LDC) + ELSE + CALL ZPRCN5( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, + $ ALPHA, LDA, LDB, BETA, LDC) + END IF +* + 160 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, + $ ', C,', I3, ') .' ) + 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, + $ ',', F4.1, '), C,', I3, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK5. +* + END +* + SUBROUTINE ZPRCN5(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC + DOUBLE COMPLEX ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1, '), A,', + $ I3, ', B', I3, ', (', F4.1, ',', F4.1, '), C,', I3, ').' ) + END +* +* + SUBROUTINE ZPRCN7(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC + DOUBLE COMPLEX ALPHA + DOUBLE PRECISION BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1, '), A,', + $ I3, ', B', I3, ',', F4.1, ', C,', I3, ').' ) + END +* + SUBROUTINE ZMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, + $ TRANSL ) +* +* Generates values for an M by N matrix A. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'ge', 'he', 'sy' or 'tr'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + COMPLEX*16 ROGUE + PARAMETER ( ROGUE = ( -1.0D10, 1.0D10 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) + DOUBLE PRECISION RROGUE + PARAMETER ( RROGUE = -1.0D10 ) +* .. Scalar Arguments .. + COMPLEX*16 TRANSL + INTEGER LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX*16 A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J, JJ + LOGICAL GEN, HER, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + COMPLEX*16 ZBEG + EXTERNAL ZBEG +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, DCONJG, DBLE +* .. Executable Statements .. + GEN = TYPE.EQ.'ge' + HER = TYPE.EQ.'he' + SYM = TYPE.EQ.'sy' + TRI = TYPE.EQ.'tr' + UPPER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + A( I, J ) = ZBEG( RESET ) + TRANSL + IF( I.NE.J )THEN +* Set some elements to zero + IF( N.GT.3.AND.J.EQ.N/2 ) + $ A( I, J ) = ZERO + IF( HER )THEN + A( J, I ) = DCONJG( A( I, J ) ) + ELSE IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( HER ) + $ A( J, J ) = DCMPLX( DBLE( A( J, J ) ), RZERO ) + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'ge' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'sy'.OR.TYPE.EQ.'tr' )THEN + DO 90 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 60 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 70 CONTINUE + DO 80 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + IF( HER )THEN + JJ = J + ( J - 1 )*LDA + AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE ) + END IF + 90 CONTINUE + END IF + RETURN +* +* End of ZMAKE. +* + END + SUBROUTINE ZMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, + $ NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO, RONE + PARAMETER ( RZERO = 0.0D0, RONE = 1.0D0 ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA, BETA + DOUBLE PRECISION EPS, ERR + INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANSA, TRANSB +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ CC( LDCC, * ), CT( * ) + DOUBLE PRECISION G( * ) +* .. Local Scalars .. + COMPLEX*16 CL + DOUBLE PRECISION ERRI + INTEGER I, J, K + LOGICAL CTRANA, CTRANB, TRANA, TRANB +* .. Intrinsic Functions .. + INTRINSIC ABS, DIMAG, DCONJG, MAX, DBLE, SQRT +* .. Statement Functions .. + DOUBLE PRECISION ABS1 +* .. Statement Function definitions .. + ABS1( CL ) = ABS( DBLE( CL ) ) + ABS( DIMAG( CL ) ) +* .. Executable Statements .. + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' + CTRANA = TRANSA.EQ.'C' + CTRANB = TRANSB.EQ.'C' +* +* Compute expected result, one column at a time, in CT using data +* in A, B and C. +* Compute gauges in G. +* + DO 220 J = 1, N +* + DO 10 I = 1, M + CT( I ) = ZERO + G( I ) = RZERO + 10 CONTINUE + IF( .NOT.TRANA.AND..NOT.TRANB )THEN + DO 30 K = 1, KK + DO 20 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( K, J ) + G( I ) = G( I ) + ABS1( A( I, K ) )*ABS1( B( K, J ) ) + 20 CONTINUE + 30 CONTINUE + ELSE IF( TRANA.AND..NOT.TRANB )THEN + IF( CTRANA )THEN + DO 50 K = 1, KK + DO 40 I = 1, M + CT( I ) = CT( I ) + DCONJG( A( K, I ) )*B( K, J ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( K, J ) ) + 40 CONTINUE + 50 CONTINUE + ELSE + DO 70 K = 1, KK + DO 60 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( K, J ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( K, J ) ) + 60 CONTINUE + 70 CONTINUE + END IF + ELSE IF( .NOT.TRANA.AND.TRANB )THEN + IF( CTRANB )THEN + DO 90 K = 1, KK + DO 80 I = 1, M + CT( I ) = CT( I ) + A( I, K )*DCONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( I, K ) )* + $ ABS1( B( J, K ) ) + 80 CONTINUE + 90 CONTINUE + ELSE + DO 110 K = 1, KK + DO 100 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( J, K ) + G( I ) = G( I ) + ABS1( A( I, K ) )* + $ ABS1( B( J, K ) ) + 100 CONTINUE + 110 CONTINUE + END IF + ELSE IF( TRANA.AND.TRANB )THEN + IF( CTRANA )THEN + IF( CTRANB )THEN + DO 130 K = 1, KK + DO 120 I = 1, M + CT( I ) = CT( I ) + DCONJG( A( K, I ) )* + $ DCONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 120 CONTINUE + 130 CONTINUE + ELSE + DO 150 K = 1, KK + DO 140 I = 1, M + CT( I ) = CT( I ) + DCONJG( A( K, I ) )* + $ B( J, K ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 140 CONTINUE + 150 CONTINUE + END IF + ELSE + IF( CTRANB )THEN + DO 170 K = 1, KK + DO 160 I = 1, M + CT( I ) = CT( I ) + A( K, I )* + $ DCONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 160 CONTINUE + 170 CONTINUE + ELSE + DO 190 K = 1, KK + DO 180 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( J, K ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 180 CONTINUE + 190 CONTINUE + END IF + END IF + END IF + DO 200 I = 1, M + CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) + G( I ) = ABS1( ALPHA )*G( I ) + + $ ABS1( BETA )*ABS1( C( I, J ) ) + 200 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 210 I = 1, M + ERRI = ABS1( CT( I ) - CC( I, J ) )/EPS + IF( G( I ).NE.RZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.RONE ) + $ GO TO 230 + 210 CONTINUE +* + 220 CONTINUE +* +* If the loop completes, all results are at least half accurate. + GO TO 250 +* +* Report fatal error. +* + 230 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 240 I = 1, M + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) + ELSE + WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) + END IF + 240 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9997 )J +* + 250 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RE', + $ 'SULT COMPUTED RESULT' ) + 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) + 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) +* +* End of ZMMCH. +* + END + LOGICAL FUNCTION LZE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + COMPLEX*16 RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LZE = .TRUE. + GO TO 30 + 20 CONTINUE + LZE = .FALSE. + 30 RETURN +* +* End of LZE. +* + END + LOGICAL FUNCTION LZERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'ge' or 'he' or 'sy'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX*16 AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'ge' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'sy' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LZERES = .TRUE. + GO TO 80 + 70 CONTINUE + LZERES = .FALSE. + 80 RETURN +* +* End of LZERES. +* + END + COMPLEX*16 FUNCTION ZBEG( RESET ) +* +* Generates complex numbers as pairs of random numbers uniformly +* distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, J, MI, MJ +* .. Save statement .. + SAVE I, IC, J, MI, MJ +* .. Intrinsic Functions .. + INTRINSIC DCMPLX +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + MJ = 457 + I = 7 + J = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I or J is bounded between 1 and 999. +* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. +* If initial I or J = 4 or 8, the period will be 25. +* If initial I or J = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I or J +* in 6. +* + IC = IC + 1 + 10 I = I*MI + J = J*MJ + I = I - 1000*( I/1000 ) + J = J - 1000*( J/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + ZBEG = DCMPLX( ( I - 500 )/1001.0D0, ( J - 500 )/1001.0D0 ) + RETURN +* +* End of ZBEG. +* + END + DOUBLE PRECISION FUNCTION DDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + DOUBLE PRECISION X, Y +* .. Executable Statements .. + DDIFF = X - Y + RETURN +* +* End of DDIFF. +* + END + diff --git a/ctest/cblas_test.h b/ctest/cblas_test.h new file mode 100644 index 0000000..53cb99f --- /dev/null +++ b/ctest/cblas_test.h @@ -0,0 +1,514 @@ +/* + * cblas_test.h + * Written by Keita Teranishi + */ +#ifndef CBLAS_TEST_H +#define CBLAS_TEST_H +#include "cblas.h" + +#ifdef USE64BITINT +#define int long +#endif + +#define TRUE 1 +#define PASSED 1 +#define TEST_ROW_MJR 1 + +#define FALSE 0 +#define FAILED 0 +#define TEST_COL_MJR 0 + +#define INVALID -1 +#define UNDEFINED -1 + +typedef struct { float real; float imag; } CBLAS_TEST_COMPLEX; +typedef struct { double real; double imag; } CBLAS_TEST_ZOMPLEX; + +#if defined(ADD_) +/* + * Level 1 BLAS + */ + #define F77_srotg srotgtest_ + #define F77_srotmg srotmgtest_ + #define F77_srot srottest_ + #define F77_srotm srotmtest_ + #define F77_drotg drotgtest_ + #define F77_drotmg drotmgtest_ + #define F77_drot drottest_ + #define F77_drotm drotmtest_ + #define F77_sswap sswaptest_ + #define F77_scopy scopytest_ + #define F77_saxpy saxpytest_ + #define F77_isamax isamaxtest_ + #define F77_dswap dswaptest_ + #define F77_dcopy dcopytest_ + #define F77_daxpy daxpytest_ + #define F77_idamax idamaxtest_ + #define F77_cswap cswaptest_ + #define F77_ccopy ccopytest_ + #define F77_caxpy caxpytest_ + #define F77_icamax icamaxtest_ + #define F77_zswap zswaptest_ + #define F77_zcopy zcopytest_ + #define F77_zaxpy zaxpytest_ + #define F77_izamax izamaxtest_ + #define F77_sdot sdottest_ + #define F77_ddot ddottest_ + #define F77_dsdot dsdottest_ + #define F77_sscal sscaltest_ + #define F77_dscal dscaltest_ + #define F77_cscal cscaltest_ + #define F77_zscal zscaltest_ + #define F77_csscal csscaltest_ + #define F77_zdscal zdscaltest_ + #define F77_cdotu cdotutest_ + #define F77_cdotc cdotctest_ + #define F77_zdotu zdotutest_ + #define F77_zdotc zdotctest_ + #define F77_snrm2 snrm2test_ + #define F77_sasum sasumtest_ + #define F77_dnrm2 dnrm2test_ + #define F77_dasum dasumtest_ + #define F77_scnrm2 scnrm2test_ + #define F77_scasum scasumtest_ + #define F77_dznrm2 dznrm2test_ + #define F77_dzasum dzasumtest_ + #define F77_sdsdot sdsdottest_ +/* + * Level 2 BLAS + */ + #define F77_s2chke cs2chke_ + #define F77_d2chke cd2chke_ + #define F77_c2chke cc2chke_ + #define F77_z2chke cz2chke_ + #define F77_ssymv cssymv_ + #define F77_ssbmv cssbmv_ + #define F77_sspmv csspmv_ + #define F77_sger csger_ + #define F77_ssyr cssyr_ + #define F77_sspr csspr_ + #define F77_ssyr2 cssyr2_ + #define F77_sspr2 csspr2_ + #define F77_dsymv cdsymv_ + #define F77_dsbmv cdsbmv_ + #define F77_dspmv cdspmv_ + #define F77_dger cdger_ + #define F77_dsyr cdsyr_ + #define F77_dspr cdspr_ + #define F77_dsyr2 cdsyr2_ + #define F77_dspr2 cdspr2_ + #define F77_chemv cchemv_ + #define F77_chbmv cchbmv_ + #define F77_chpmv cchpmv_ + #define F77_cgeru ccgeru_ + #define F77_cgerc ccgerc_ + #define F77_cher ccher_ + #define F77_chpr cchpr_ + #define F77_cher2 ccher2_ + #define F77_chpr2 cchpr2_ + #define F77_zhemv czhemv_ + #define F77_zhbmv czhbmv_ + #define F77_zhpmv czhpmv_ + #define F77_zgeru czgeru_ + #define F77_zgerc czgerc_ + #define F77_zher czher_ + #define F77_zhpr czhpr_ + #define F77_zher2 czher2_ + #define F77_zhpr2 czhpr2_ + #define F77_sgemv csgemv_ + #define F77_sgbmv csgbmv_ + #define F77_strmv cstrmv_ + #define F77_stbmv cstbmv_ + #define F77_stpmv cstpmv_ + #define F77_strsv cstrsv_ + #define F77_stbsv cstbsv_ + #define F77_stpsv cstpsv_ + #define F77_dgemv cdgemv_ + #define F77_dgbmv cdgbmv_ + #define F77_dtrmv cdtrmv_ + #define F77_dtbmv cdtbmv_ + #define F77_dtpmv cdtpmv_ + #define F77_dtrsv cdtrsv_ + #define F77_dtbsv cdtbsv_ + #define F77_dtpsv cdtpsv_ + #define F77_cgemv ccgemv_ + #define F77_cgbmv ccgbmv_ + #define F77_ctrmv cctrmv_ + #define F77_ctbmv cctbmv_ + #define F77_ctpmv cctpmv_ + #define F77_ctrsv cctrsv_ + #define F77_ctbsv cctbsv_ + #define F77_ctpsv cctpsv_ + #define F77_zgemv czgemv_ + #define F77_zgbmv czgbmv_ + #define F77_ztrmv cztrmv_ + #define F77_ztbmv cztbmv_ + #define F77_ztpmv cztpmv_ + #define F77_ztrsv cztrsv_ + #define F77_ztbsv cztbsv_ + #define F77_ztpsv cztpsv_ +/* + * Level 3 BLAS + */ + #define F77_s3chke cs3chke_ + #define F77_d3chke cd3chke_ + #define F77_c3chke cc3chke_ + #define F77_z3chke cz3chke_ + #define F77_chemm cchemm_ + #define F77_cherk ccherk_ + #define F77_cher2k ccher2k_ + #define F77_zhemm czhemm_ + #define F77_zherk czherk_ + #define F77_zher2k czher2k_ + #define F77_sgemm csgemm_ + #define F77_ssymm cssymm_ + #define F77_ssyrk cssyrk_ + #define F77_ssyr2k cssyr2k_ + #define F77_strmm cstrmm_ + #define F77_strsm cstrsm_ + #define F77_dgemm cdgemm_ + #define F77_dsymm cdsymm_ + #define F77_dsyrk cdsyrk_ + #define F77_dsyr2k cdsyr2k_ + #define F77_dtrmm cdtrmm_ + #define F77_dtrsm cdtrsm_ + #define F77_cgemm ccgemm_ + #define F77_csymm ccsymm_ + #define F77_csyrk ccsyrk_ + #define F77_csyr2k ccsyr2k_ + #define F77_ctrmm cctrmm_ + #define F77_ctrsm cctrsm_ + #define F77_zgemm czgemm_ + #define F77_zsymm czsymm_ + #define F77_zsyrk czsyrk_ + #define F77_zsyr2k czsyr2k_ + #define F77_ztrmm cztrmm_ + #define F77_ztrsm cztrsm_ +#elif defined(UPCASE) +/* + * Level 1 BLAS + */ + #define F77_srotg SROTGTEST + #define F77_srotmg SROTMGTEST + #define F77_srot SROTCTEST + #define F77_srotm SROTMTEST + #define F77_drotg DROTGTEST + #define F77_drotmg DROTMGTEST + #define F77_drot DROTTEST + #define F77_drotm DROTMTEST + #define F77_sswap SSWAPTEST + #define F77_scopy SCOPYTEST + #define F77_saxpy SAXPYTEST + #define F77_isamax ISAMAXTEST + #define F77_dswap DSWAPTEST + #define F77_dcopy DCOPYTEST + #define F77_daxpy DAXPYTEST + #define F77_idamax IDAMAXTEST + #define F77_cswap CSWAPTEST + #define F77_ccopy CCOPYTEST + #define F77_caxpy CAXPYTEST + #define F77_icamax ICAMAXTEST + #define F77_zswap ZSWAPTEST + #define F77_zcopy ZCOPYTEST + #define F77_zaxpy ZAXPYTEST + #define F77_izamax IZAMAXTEST + #define F77_sdot SDOTTEST + #define F77_ddot DDOTTEST + #define F77_dsdot DSDOTTEST + #define F77_sscal SSCALTEST + #define F77_dscal DSCALTEST + #define F77_cscal CSCALTEST + #define F77_zscal ZSCALTEST + #define F77_csscal CSSCALTEST + #define F77_zdscal ZDSCALTEST + #define F77_cdotu CDOTUTEST + #define F77_cdotc CDOTCTEST + #define F77_zdotu ZDOTUTEST + #define F77_zdotc ZDOTCTEST + #define F77_snrm2 SNRM2TEST + #define F77_sasum SASUMTEST + #define F77_dnrm2 DNRM2TEST + #define F77_dasum DASUMTEST + #define F77_scnrm2 SCNRM2TEST + #define F77_scasum SCASUMTEST + #define F77_dznrm2 DZNRM2TEST + #define F77_dzasum DZASUMTEST + #define F77_sdsdot SDSDOTTEST +/* + * Level 2 BLAS + */ + #define F77_s2chke CS2CHKE + #define F77_d2chke CD2CHKE + #define F77_c2chke CC2CHKE + #define F77_z2chke CZ2CHKE + #define F77_ssymv CSSYMV + #define F77_ssbmv CSSBMV + #define F77_sspmv CSSPMV + #define F77_sger CSGER + #define F77_ssyr CSSYR + #define F77_sspr CSSPR + #define F77_ssyr2 CSSYR2 + #define F77_sspr2 CSSPR2 + #define F77_dsymv CDSYMV + #define F77_dsbmv CDSBMV + #define F77_dspmv CDSPMV + #define F77_dger CDGER + #define F77_dsyr CDSYR + #define F77_dspr CDSPR + #define F77_dsyr2 CDSYR2 + #define F77_dspr2 CDSPR2 + #define F77_chemv CCHEMV + #define F77_chbmv CCHBMV + #define F77_chpmv CCHPMV + #define F77_cgeru CCGERU + #define F77_cgerc CCGERC + #define F77_cher CCHER + #define F77_chpr CCHPR + #define F77_cher2 CCHER2 + #define F77_chpr2 CCHPR2 + #define F77_zhemv CZHEMV + #define F77_zhbmv CZHBMV + #define F77_zhpmv CZHPMV + #define F77_zgeru CZGERU + #define F77_zgerc CZGERC + #define F77_zher CZHER + #define F77_zhpr CZHPR + #define F77_zher2 CZHER2 + #define F77_zhpr2 CZHPR2 + #define F77_sgemv CSGEMV + #define F77_sgbmv CSGBMV + #define F77_strmv CSTRMV + #define F77_stbmv CSTBMV + #define F77_stpmv CSTPMV + #define F77_strsv CSTRSV + #define F77_stbsv CSTBSV + #define F77_stpsv CSTPSV + #define F77_dgemv CDGEMV + #define F77_dgbmv CDGBMV + #define F77_dtrmv CDTRMV + #define F77_dtbmv CDTBMV + #define F77_dtpmv CDTPMV + #define F77_dtrsv CDTRSV + #define F77_dtbsv CDTBSV + #define F77_dtpsv CDTPSV + #define F77_cgemv CCGEMV + #define F77_cgbmv CCGBMV + #define F77_ctrmv CCTRMV + #define F77_ctbmv CCTBMV + #define F77_ctpmv CCTPMV + #define F77_ctrsv CCTRSV + #define F77_ctbsv CCTBSV + #define F77_ctpsv CCTPSV + #define F77_zgemv CZGEMV + #define F77_zgbmv CZGBMV + #define F77_ztrmv CZTRMV + #define F77_ztbmv CZTBMV + #define F77_ztpmv CZTPMV + #define F77_ztrsv CZTRSV + #define F77_ztbsv CZTBSV + #define F77_ztpsv CZTPSV +/* + * Level 3 BLAS + */ + #define F77_s3chke CS3CHKE + #define F77_d3chke CD3CHKE + #define F77_c3chke CC3CHKE + #define F77_z3chke CZ3CHKE + #define F77_chemm CCHEMM + #define F77_cherk CCHERK + #define F77_cher2k CCHER2K + #define F77_zhemm CZHEMM + #define F77_zherk CZHERK + #define F77_zher2k CZHER2K + #define F77_sgemm CSGEMM + #define F77_ssymm CSSYMM + #define F77_ssyrk CSSYRK + #define F77_ssyr2k CSSYR2K + #define F77_strmm CSTRMM + #define F77_strsm CSTRSM + #define F77_dgemm CDGEMM + #define F77_dsymm CDSYMM + #define F77_dsyrk CDSYRK + #define F77_dsyr2k CDSYR2K + #define F77_dtrmm CDTRMM + #define F77_dtrsm CDTRSM + #define F77_cgemm CCGEMM + #define F77_csymm CCSYMM + #define F77_csyrk CCSYRK + #define F77_csyr2k CCSYR2K + #define F77_ctrmm CCTRMM + #define F77_ctrsm CCTRSM + #define F77_zgemm CZGEMM + #define F77_zsymm CZSYMM + #define F77_zsyrk CZSYRK + #define F77_zsyr2k CZSYR2K + #define F77_ztrmm CZTRMM + #define F77_ztrsm CZTRSM +#elif defined(NOCHANGE) +/* + * Level 1 BLAS + */ + #define F77_srotg srotgtest + #define F77_srotmg srotmgtest + #define F77_srot srottest + #define F77_srotm srotmtest + #define F77_drotg drotgtest + #define F77_drotmg drotmgtest + #define F77_drot drottest + #define F77_drotm drotmtest + #define F77_sswap sswaptest + #define F77_scopy scopytest + #define F77_saxpy saxpytest + #define F77_isamax isamaxtest + #define F77_dswap dswaptest + #define F77_dcopy dcopytest + #define F77_daxpy daxpytest + #define F77_idamax idamaxtest + #define F77_cswap cswaptest + #define F77_ccopy ccopytest + #define F77_caxpy caxpytest + #define F77_icamax icamaxtest + #define F77_zswap zswaptest + #define F77_zcopy zcopytest + #define F77_zaxpy zaxpytest + #define F77_izamax izamaxtest + #define F77_sdot sdottest + #define F77_ddot ddottest + #define F77_dsdot dsdottest + #define F77_sscal sscaltest + #define F77_dscal dscaltest + #define F77_cscal cscaltest + #define F77_zscal zscaltest + #define F77_csscal csscaltest + #define F77_zdscal zdscaltest + #define F77_cdotu cdotutest + #define F77_cdotc cdotctest + #define F77_zdotu zdotutest + #define F77_zdotc zdotctest + #define F77_snrm2 snrm2test + #define F77_sasum sasumtest + #define F77_dnrm2 dnrm2test + #define F77_dasum dasumtest + #define F77_scnrm2 scnrm2test + #define F77_scasum scasumtest + #define F77_dznrm2 dznrm2test + #define F77_dzasum dzasumtest + #define F77_sdsdot sdsdottest +/* + * Level 2 BLAS + */ + #define F77_s2chke cs2chke + #define F77_d2chke cd2chke + #define F77_c2chke cc2chke + #define F77_z2chke cz2chke + #define F77_ssymv cssymv + #define F77_ssbmv cssbmv + #define F77_sspmv csspmv + #define F77_sger csger + #define F77_ssyr cssyr + #define F77_sspr csspr + #define F77_ssyr2 cssyr2 + #define F77_sspr2 csspr2 + #define F77_dsymv cdsymv + #define F77_dsbmv cdsbmv + #define F77_dspmv cdspmv + #define F77_dger cdger + #define F77_dsyr cdsyr + #define F77_dspr cdspr + #define F77_dsyr2 cdsyr2 + #define F77_dspr2 cdspr2 + #define F77_chemv cchemv + #define F77_chbmv cchbmv + #define F77_chpmv cchpmv + #define F77_cgeru ccgeru + #define F77_cgerc ccgerc + #define F77_cher ccher + #define F77_chpr cchpr + #define F77_cher2 ccher2 + #define F77_chpr2 cchpr2 + #define F77_zhemv czhemv + #define F77_zhbmv czhbmv + #define F77_zhpmv czhpmv + #define F77_zgeru czgeru + #define F77_zgerc czgerc + #define F77_zher czher + #define F77_zhpr czhpr + #define F77_zher2 czher2 + #define F77_zhpr2 czhpr2 + #define F77_sgemv csgemv + #define F77_sgbmv csgbmv + #define F77_strmv cstrmv + #define F77_stbmv cstbmv + #define F77_stpmv cstpmv + #define F77_strsv cstrsv + #define F77_stbsv cstbsv + #define F77_stpsv cstpsv + #define F77_dgemv cdgemv + #define F77_dgbmv cdgbmv + #define F77_dtrmv cdtrmv + #define F77_dtbmv cdtbmv + #define F77_dtpmv cdtpmv + #define F77_dtrsv cdtrsv + #define F77_dtbsv cdtbsv + #define F77_dtpsv cdtpsv + #define F77_cgemv ccgemv + #define F77_cgbmv ccgbmv + #define F77_ctrmv cctrmv + #define F77_ctbmv cctbmv + #define F77_ctpmv cctpmv + #define F77_ctrsv cctrsv + #define F77_ctbsv cctbsv + #define F77_ctpsv cctpsv + #define F77_zgemv czgemv + #define F77_zgbmv czgbmv + #define F77_ztrmv cztrmv + #define F77_ztbmv cztbmv + #define F77_ztpmv cztpmv + #define F77_ztrsv cztrsv + #define F77_ztbsv cztbsv + #define F77_ztpsv cztpsv +/* + * Level 3 BLAS + */ + #define F77_s3chke cs3chke + #define F77_d3chke cd3chke + #define F77_c3chke cc3chke + #define F77_z3chke cz3chke + #define F77_chemm cchemm + #define F77_cherk ccherk + #define F77_cher2k ccher2k + #define F77_zhemm czhemm + #define F77_zherk czherk + #define F77_zher2k czher2k + #define F77_sgemm csgemm + #define F77_ssymm cssymm + #define F77_ssyrk cssyrk + #define F77_ssyr2k cssyr2k + #define F77_strmm cstrmm + #define F77_strsm cstrsm + #define F77_dgemm cdgemm + #define F77_dsymm cdsymm + #define F77_dsyrk cdsyrk + #define F77_dsyr2k cdsyr2k + #define F77_dtrmm cdtrmm + #define F77_dtrsm cdtrsm + #define F77_cgemm ccgemm + #define F77_csymm ccsymm + #define F77_csyrk ccsyrk + #define F77_csyr2k ccsyr2k + #define F77_ctrmm cctrmm + #define F77_ctrsm cctrsm + #define F77_zgemm czgemm + #define F77_zsymm czsymm + #define F77_zsyrk czsyrk + #define F77_zsyr2k czsyr2k + #define F77_ztrmm cztrmm + #define F77_ztrsm cztrsm +#endif + +void get_transpose_type(char *type, enum CBLAS_TRANSPOSE *trans); +void get_uplo_type(char *type, enum CBLAS_UPLO *uplo); +void get_diag_type(char *type, enum CBLAS_DIAG *diag); +void get_side_type(char *type, enum CBLAS_SIDE *side); + +#endif /* CBLAS_TEST_H */ diff --git a/ctest/cin2 b/ctest/cin2 new file mode 100644 index 0000000..032fcbb --- /dev/null +++ b/ctest/cin2 @@ -0,0 +1,34 @@ +'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED) +16.0 THRESHOLD VALUE OF TEST RATIO +7 NUMBER OF VALUES OF N +0 1 2 3 5 9 63 VALUES OF N +4 NUMBER OF VALUES OF K +0 1 2 4 VALUES OF K +4 NUMBER OF VALUES OF INCX AND INCY +1 2 -1 -2 VALUES OF INCX AND INCY +3 NUMBER OF VALUES OF ALPHA +(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +cblas_cgemv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_cgbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_chemv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_chbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_chpmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ctrmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ctbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ctpmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ctrsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ctbsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ctpsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_cgerc T PUT F FOR NO TEST. SAME COLUMNS. +cblas_cgeru T PUT F FOR NO TEST. SAME COLUMNS. +cblas_cher T PUT F FOR NO TEST. SAME COLUMNS. +cblas_chpr T PUT F FOR NO TEST. SAME COLUMNS. +cblas_cher2 T PUT F FOR NO TEST. SAME COLUMNS. +cblas_chpr2 T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/ctest/cin3 b/ctest/cin3 new file mode 100644 index 0000000..223d165 --- /dev/null +++ b/ctest/cin3 @@ -0,0 +1,22 @@ +'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +16.0 THRESHOLD VALUE OF TEST RATIO +6 NUMBER OF VALUES OF N +0 1 2 3 5 9 35 VALUES OF N +3 NUMBER OF VALUES OF ALPHA +(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +cblas_cgemm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_chemm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_csymm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ctrmm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ctrsm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_cherk T PUT F FOR NO TEST. SAME COLUMNS. +cblas_csyrk T PUT F FOR NO TEST. SAME COLUMNS. +cblas_cher2k T PUT F FOR NO TEST. SAME COLUMNS. +cblas_csyr2k T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/ctest/constant.c b/ctest/constant.c new file mode 100644 index 0000000..861d70b --- /dev/null +++ b/ctest/constant.c @@ -0,0 +1,3 @@ +int CBLAS_CallFromC; +int RowMajorStrg; + diff --git a/ctest/din2 b/ctest/din2 new file mode 100644 index 0000000..6f42b27 --- /dev/null +++ b/ctest/din2 @@ -0,0 +1,33 @@ +'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +16.0 THRESHOLD VALUE OF TEST RATIO +7 NUMBER OF VALUES OF N +0 1 2 3 5 9 63 VALUES OF N +4 NUMBER OF VALUES OF K +0 1 2 4 VALUES OF K +4 NUMBER OF VALUES OF INCX AND INCY +1 2 -1 -2 VALUES OF INCX AND INCY +3 NUMBER OF VALUES OF ALPHA +0.0 1.0 0.7 VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +0.0 1.0 0.9 VALUES OF BETA +cblas_dgemv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dgbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dsymv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dsbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dspmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dtrmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dtbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dtpmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dtrsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dtbsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dtpsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dger T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dsyr T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dspr T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dsyr2 T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dspr2 T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/ctest/din3 b/ctest/din3 new file mode 100644 index 0000000..cbbcc22 --- /dev/null +++ b/ctest/din3 @@ -0,0 +1,19 @@ +'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +16.0 THRESHOLD VALUE OF TEST RATIO +6 NUMBER OF VALUES OF N +1 2 3 5 7 9 35 VALUES OF N +3 NUMBER OF VALUES OF ALPHA +0.0 1.0 0.7 VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +0.0 1.0 1.3 VALUES OF BETA +cblas_dgemm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dsymm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dtrmm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dtrsm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dsyrk T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dsyr2k T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/ctest/sin2 b/ctest/sin2 new file mode 100644 index 0000000..3eee5c2 --- /dev/null +++ b/ctest/sin2 @@ -0,0 +1,33 @@ +'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED) +16.0 THRESHOLD VALUE OF TEST RATIO +7 NUMBER OF VALUES OF N +0 1 2 3 5 9 63 VALUES OF N +4 NUMBER OF VALUES OF K +0 1 2 4 VALUES OF K +4 NUMBER OF VALUES OF INCX AND INCY +1 2 -1 -2 VALUES OF INCX AND INCY +3 NUMBER OF VALUES OF ALPHA +0.0 1.0 0.7 VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +0.0 1.0 0.9 VALUES OF BETA +cblas_sgemv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_sgbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ssymv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ssbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_sspmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_strmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_stbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_stpmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_strsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_stbsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_stpsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_sger T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ssyr T PUT F FOR NO TEST. SAME COLUMNS. +cblas_sspr T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ssyr2 T PUT F FOR NO TEST. SAME COLUMNS. +cblas_sspr2 T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/ctest/sin3 b/ctest/sin3 new file mode 100644 index 0000000..01e32d6 --- /dev/null +++ b/ctest/sin3 @@ -0,0 +1,19 @@ +'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +16.0 THRESHOLD VALUE OF TEST RATIO +6 NUMBER OF VALUES OF N +0 1 2 3 5 9 35 VALUES OF N +3 NUMBER OF VALUES OF ALPHA +0.0 1.0 0.7 VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +0.0 1.0 1.3 VALUES OF BETA +cblas_sgemm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ssymm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_strmm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_strsm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ssyrk T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ssyr2k T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/ctest/zin2 b/ctest/zin2 new file mode 100644 index 0000000..4c0affe --- /dev/null +++ b/ctest/zin2 @@ -0,0 +1,34 @@ +'ZBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED) +16.0 THRESHOLD VALUE OF TEST RATIO +7 NUMBER OF VALUES OF N +0 1 2 3 5 9 63 VALUES OF N +4 NUMBER OF VALUES OF K +0 1 2 4 VALUES OF K +4 NUMBER OF VALUES OF INCX AND INCY +1 2 -1 -2 VALUES OF INCX AND INCY +3 NUMBER OF VALUES OF ALPHA +(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +cblas_zgemv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zgbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zhemv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zhbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zhpmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ztrmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ztbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ztpmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ztrsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ztbsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ztpsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zgerc T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zgeru T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zher T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zhpr T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zher2 T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zhpr2 T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/ctest/zin3 b/ctest/zin3 new file mode 100644 index 0000000..70050b6 --- /dev/null +++ b/ctest/zin3 @@ -0,0 +1,22 @@ +'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +16.0 THRESHOLD VALUE OF TEST RATIO +7 NUMBER OF VALUES OF N +0 1 2 3 5 9 35 VALUES OF N +3 NUMBER OF VALUES OF ALPHA +(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +cblas_zgemm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zhemm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zsymm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ztrmm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ztrsm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zherk T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zsyrk T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zher2k T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zsyr2k T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/ctest1.c b/ctest1.c new file mode 100644 index 0000000..5ab6338 --- /dev/null +++ b/ctest1.c @@ -0,0 +1 @@ +int hogehoge(void){return 0;} diff --git a/ctest2.c b/ctest2.c new file mode 100644 index 0000000..f7e582f --- /dev/null +++ b/ctest2.c @@ -0,0 +1 @@ +int main(void){return 0;} diff --git a/driver/._level2 b/driver/._level2 new file mode 100755 index 0000000..d4728a1 Binary files /dev/null and b/driver/._level2 differ diff --git a/driver/._level3 b/driver/._level3 new file mode 100755 index 0000000..0e94994 Binary files /dev/null and b/driver/._level3 differ diff --git a/driver/._mapper b/driver/._mapper new file mode 100755 index 0000000..2d46d9c Binary files /dev/null and b/driver/._mapper differ diff --git a/driver/._others b/driver/._others new file mode 100755 index 0000000..747dcad Binary files /dev/null and b/driver/._others differ diff --git a/driver/level2/._Makefile b/driver/level2/._Makefile new file mode 100644 index 0000000..5a3d304 Binary files /dev/null and b/driver/level2/._Makefile differ diff --git a/driver/level2/._gbmv_k.c b/driver/level2/._gbmv_k.c new file mode 100644 index 0000000..e2f54de Binary files /dev/null and b/driver/level2/._gbmv_k.c differ diff --git a/driver/level2/._gbmv_thread.c b/driver/level2/._gbmv_thread.c new file mode 100644 index 0000000..5fa2dc1 Binary files /dev/null and b/driver/level2/._gbmv_thread.c differ diff --git a/driver/level2/._gemv_thread.c b/driver/level2/._gemv_thread.c new file mode 100644 index 0000000..f327437 Binary files /dev/null and b/driver/level2/._gemv_thread.c differ diff --git a/driver/level2/._ger_thread.c b/driver/level2/._ger_thread.c new file mode 100644 index 0000000..9d16441 Binary files /dev/null and b/driver/level2/._ger_thread.c differ diff --git a/driver/level2/._sbmv_k.c b/driver/level2/._sbmv_k.c new file mode 100644 index 0000000..59aa8a3 Binary files /dev/null and b/driver/level2/._sbmv_k.c differ diff --git a/driver/level2/._sbmv_thread.c b/driver/level2/._sbmv_thread.c new file mode 100644 index 0000000..58bf447 Binary files /dev/null and b/driver/level2/._sbmv_thread.c differ diff --git a/driver/level2/._spmv_k.c b/driver/level2/._spmv_k.c new file mode 100644 index 0000000..621c397 Binary files /dev/null and b/driver/level2/._spmv_k.c differ diff --git a/driver/level2/._spmv_thread.c b/driver/level2/._spmv_thread.c new file mode 100644 index 0000000..21ad0ba Binary files /dev/null and b/driver/level2/._spmv_thread.c differ diff --git a/driver/level2/._spr2_k.c b/driver/level2/._spr2_k.c new file mode 100644 index 0000000..22043cf Binary files /dev/null and b/driver/level2/._spr2_k.c differ diff --git a/driver/level2/._spr2_thread.c b/driver/level2/._spr2_thread.c new file mode 100644 index 0000000..42f9d76 Binary files /dev/null and b/driver/level2/._spr2_thread.c differ diff --git a/driver/level2/._spr_k.c b/driver/level2/._spr_k.c new file mode 100644 index 0000000..8cdf8a0 Binary files /dev/null and b/driver/level2/._spr_k.c differ diff --git a/driver/level2/._spr_thread.c b/driver/level2/._spr_thread.c new file mode 100644 index 0000000..3bf48fb Binary files /dev/null and b/driver/level2/._spr_thread.c differ diff --git a/driver/level2/._symv_thread.c b/driver/level2/._symv_thread.c new file mode 100644 index 0000000..3294c8c Binary files /dev/null and b/driver/level2/._symv_thread.c differ diff --git a/driver/level2/._syr2_k.c b/driver/level2/._syr2_k.c new file mode 100644 index 0000000..c03eabd Binary files /dev/null and b/driver/level2/._syr2_k.c differ diff --git a/driver/level2/._syr2_thread.c b/driver/level2/._syr2_thread.c new file mode 100644 index 0000000..5f8dd9f Binary files /dev/null and b/driver/level2/._syr2_thread.c differ diff --git a/driver/level2/._syr_k.c b/driver/level2/._syr_k.c new file mode 100644 index 0000000..ab50d90 Binary files /dev/null and b/driver/level2/._syr_k.c differ diff --git a/driver/level2/._syr_thread.c b/driver/level2/._syr_thread.c new file mode 100644 index 0000000..dccfc7a Binary files /dev/null and b/driver/level2/._syr_thread.c differ diff --git a/driver/level2/._tbmv_L.c b/driver/level2/._tbmv_L.c new file mode 100644 index 0000000..2356511 Binary files /dev/null and b/driver/level2/._tbmv_L.c differ diff --git a/driver/level2/._tbmv_U.c b/driver/level2/._tbmv_U.c new file mode 100644 index 0000000..d847610 Binary files /dev/null and b/driver/level2/._tbmv_U.c differ diff --git a/driver/level2/._tbmv_thread.c b/driver/level2/._tbmv_thread.c new file mode 100644 index 0000000..6f23121 Binary files /dev/null and b/driver/level2/._tbmv_thread.c differ diff --git a/driver/level2/._tbsv_L.c b/driver/level2/._tbsv_L.c new file mode 100644 index 0000000..d967e79 Binary files /dev/null and b/driver/level2/._tbsv_L.c differ diff --git a/driver/level2/._tbsv_U.c b/driver/level2/._tbsv_U.c new file mode 100644 index 0000000..d717d3d Binary files /dev/null and b/driver/level2/._tbsv_U.c differ diff --git a/driver/level2/._tpmv_L.c b/driver/level2/._tpmv_L.c new file mode 100644 index 0000000..0f757d5 Binary files /dev/null and b/driver/level2/._tpmv_L.c differ diff --git a/driver/level2/._tpmv_U.c b/driver/level2/._tpmv_U.c new file mode 100644 index 0000000..6d34015 Binary files /dev/null and b/driver/level2/._tpmv_U.c differ diff --git a/driver/level2/._tpmv_thread.c b/driver/level2/._tpmv_thread.c new file mode 100644 index 0000000..8713fbe Binary files /dev/null and b/driver/level2/._tpmv_thread.c differ diff --git a/driver/level2/._tpsv_L.c b/driver/level2/._tpsv_L.c new file mode 100644 index 0000000..28c1205 Binary files /dev/null and b/driver/level2/._tpsv_L.c differ diff --git a/driver/level2/._tpsv_U.c b/driver/level2/._tpsv_U.c new file mode 100644 index 0000000..c8cae80 Binary files /dev/null and b/driver/level2/._tpsv_U.c differ diff --git a/driver/level2/._trmv_L.c b/driver/level2/._trmv_L.c new file mode 100644 index 0000000..2f400e4 Binary files /dev/null and b/driver/level2/._trmv_L.c differ diff --git a/driver/level2/._trmv_U.c b/driver/level2/._trmv_U.c new file mode 100644 index 0000000..eff3a7c Binary files /dev/null and b/driver/level2/._trmv_U.c differ diff --git a/driver/level2/._trmv_thread.c b/driver/level2/._trmv_thread.c new file mode 100644 index 0000000..866af02 Binary files /dev/null and b/driver/level2/._trmv_thread.c differ diff --git a/driver/level2/._trsv_L.c b/driver/level2/._trsv_L.c new file mode 100644 index 0000000..37588f3 Binary files /dev/null and b/driver/level2/._trsv_L.c differ diff --git a/driver/level2/._trsv_U.c b/driver/level2/._trsv_U.c new file mode 100644 index 0000000..8caa1bb Binary files /dev/null and b/driver/level2/._trsv_U.c differ diff --git a/driver/level2/._zgbmv_k.c b/driver/level2/._zgbmv_k.c new file mode 100644 index 0000000..07a658d Binary files /dev/null and b/driver/level2/._zgbmv_k.c differ diff --git a/driver/level2/._zhbmv_k.c b/driver/level2/._zhbmv_k.c new file mode 100644 index 0000000..2d084cd Binary files /dev/null and b/driver/level2/._zhbmv_k.c differ diff --git a/driver/level2/._zher2_k.c b/driver/level2/._zher2_k.c new file mode 100644 index 0000000..dd08fdd Binary files /dev/null and b/driver/level2/._zher2_k.c differ diff --git a/driver/level2/._zher_k.c b/driver/level2/._zher_k.c new file mode 100644 index 0000000..bb175e8 Binary files /dev/null and b/driver/level2/._zher_k.c differ diff --git a/driver/level2/._zhpmv_k.c b/driver/level2/._zhpmv_k.c new file mode 100644 index 0000000..dce9757 Binary files /dev/null and b/driver/level2/._zhpmv_k.c differ diff --git a/driver/level2/._zhpr2_k.c b/driver/level2/._zhpr2_k.c new file mode 100644 index 0000000..4ae362f Binary files /dev/null and b/driver/level2/._zhpr2_k.c differ diff --git a/driver/level2/._zhpr_k.c b/driver/level2/._zhpr_k.c new file mode 100644 index 0000000..57ddd7d Binary files /dev/null and b/driver/level2/._zhpr_k.c differ diff --git a/driver/level2/._zsbmv_k.c b/driver/level2/._zsbmv_k.c new file mode 100644 index 0000000..eede00b Binary files /dev/null and b/driver/level2/._zsbmv_k.c differ diff --git a/driver/level2/._zspmv_k.c b/driver/level2/._zspmv_k.c new file mode 100644 index 0000000..2e77442 Binary files /dev/null and b/driver/level2/._zspmv_k.c differ diff --git a/driver/level2/._zspr2_k.c b/driver/level2/._zspr2_k.c new file mode 100644 index 0000000..95162c3 Binary files /dev/null and b/driver/level2/._zspr2_k.c differ diff --git a/driver/level2/._zspr_k.c b/driver/level2/._zspr_k.c new file mode 100644 index 0000000..cfbade7 Binary files /dev/null and b/driver/level2/._zspr_k.c differ diff --git a/driver/level2/._zsyr2_k.c b/driver/level2/._zsyr2_k.c new file mode 100644 index 0000000..755e261 Binary files /dev/null and b/driver/level2/._zsyr2_k.c differ diff --git a/driver/level2/._zsyr_k.c b/driver/level2/._zsyr_k.c new file mode 100644 index 0000000..f14a1de Binary files /dev/null and b/driver/level2/._zsyr_k.c differ diff --git a/driver/level2/._ztbmv_L.c b/driver/level2/._ztbmv_L.c new file mode 100644 index 0000000..24d885b Binary files /dev/null and b/driver/level2/._ztbmv_L.c differ diff --git a/driver/level2/._ztbmv_U.c b/driver/level2/._ztbmv_U.c new file mode 100644 index 0000000..63fc37b Binary files /dev/null and b/driver/level2/._ztbmv_U.c differ diff --git a/driver/level2/._ztbsv_L.c b/driver/level2/._ztbsv_L.c new file mode 100644 index 0000000..db00f6d Binary files /dev/null and b/driver/level2/._ztbsv_L.c differ diff --git a/driver/level2/._ztbsv_U.c b/driver/level2/._ztbsv_U.c new file mode 100644 index 0000000..c03de13 Binary files /dev/null and b/driver/level2/._ztbsv_U.c differ diff --git a/driver/level2/._ztpmv_L.c b/driver/level2/._ztpmv_L.c new file mode 100644 index 0000000..f87f908 Binary files /dev/null and b/driver/level2/._ztpmv_L.c differ diff --git a/driver/level2/._ztpmv_U.c b/driver/level2/._ztpmv_U.c new file mode 100644 index 0000000..a15d18f Binary files /dev/null and b/driver/level2/._ztpmv_U.c differ diff --git a/driver/level2/._ztpsv_L.c b/driver/level2/._ztpsv_L.c new file mode 100644 index 0000000..92c455a Binary files /dev/null and b/driver/level2/._ztpsv_L.c differ diff --git a/driver/level2/._ztpsv_U.c b/driver/level2/._ztpsv_U.c new file mode 100644 index 0000000..fccde67 Binary files /dev/null and b/driver/level2/._ztpsv_U.c differ diff --git a/driver/level2/._ztrmv_L.c b/driver/level2/._ztrmv_L.c new file mode 100644 index 0000000..ed733a1 Binary files /dev/null and b/driver/level2/._ztrmv_L.c differ diff --git a/driver/level2/._ztrmv_U.c b/driver/level2/._ztrmv_U.c new file mode 100644 index 0000000..f446f21 Binary files /dev/null and b/driver/level2/._ztrmv_U.c differ diff --git a/driver/level2/._ztrsv_L.c b/driver/level2/._ztrsv_L.c new file mode 100644 index 0000000..b9334bc Binary files /dev/null and b/driver/level2/._ztrsv_L.c differ diff --git a/driver/level2/._ztrsv_U.c b/driver/level2/._ztrsv_U.c new file mode 100644 index 0000000..9458dfe Binary files /dev/null and b/driver/level2/._ztrsv_U.c differ diff --git a/driver/level2/Makefile b/driver/level2/Makefile new file mode 100644 index 0000000..7043e52 --- /dev/null +++ b/driver/level2/Makefile @@ -0,0 +1,3618 @@ +TOPDIR = ../.. +include ../../Makefile.system + +SBLASOBJS = \ + sgbmv_n.$(SUFFIX) sgbmv_t.$(SUFFIX) \ + ssbmv_U.$(SUFFIX) ssbmv_L.$(SUFFIX) sspmv_U.$(SUFFIX) sspmv_L.$(SUFFIX) \ + sspr_U.$(SUFFIX) sspr_L.$(SUFFIX) sspr2_U.$(SUFFIX) sspr2_L.$(SUFFIX) \ + ssyr_U.$(SUFFIX) ssyr_L.$(SUFFIX) ssyr2_U.$(SUFFIX) ssyr2_L.$(SUFFIX) \ + stbmv_NUU.$(SUFFIX) stbmv_NUN.$(SUFFIX) stbmv_NLU.$(SUFFIX) stbmv_NLN.$(SUFFIX) \ + stbmv_TUU.$(SUFFIX) stbmv_TUN.$(SUFFIX) stbmv_TLU.$(SUFFIX) stbmv_TLN.$(SUFFIX) \ + stbsv_NUU.$(SUFFIX) stbsv_NUN.$(SUFFIX) stbsv_NLU.$(SUFFIX) stbsv_NLN.$(SUFFIX) \ + stbsv_TUU.$(SUFFIX) stbsv_TUN.$(SUFFIX) stbsv_TLU.$(SUFFIX) stbsv_TLN.$(SUFFIX) \ + stpmv_NUU.$(SUFFIX) stpmv_NUN.$(SUFFIX) stpmv_NLU.$(SUFFIX) stpmv_NLN.$(SUFFIX) \ + stpmv_TUU.$(SUFFIX) stpmv_TUN.$(SUFFIX) stpmv_TLU.$(SUFFIX) stpmv_TLN.$(SUFFIX) \ + stpsv_NUU.$(SUFFIX) stpsv_NUN.$(SUFFIX) stpsv_NLU.$(SUFFIX) stpsv_NLN.$(SUFFIX) \ + stpsv_TUU.$(SUFFIX) stpsv_TUN.$(SUFFIX) stpsv_TLU.$(SUFFIX) stpsv_TLN.$(SUFFIX) \ + strmv_NUU.$(SUFFIX) strmv_NUN.$(SUFFIX) strmv_NLU.$(SUFFIX) strmv_NLN.$(SUFFIX) \ + strmv_TUU.$(SUFFIX) strmv_TUN.$(SUFFIX) strmv_TLU.$(SUFFIX) strmv_TLN.$(SUFFIX) \ + strsv_NUU.$(SUFFIX) strsv_NUN.$(SUFFIX) strsv_NLU.$(SUFFIX) strsv_NLN.$(SUFFIX) \ + strsv_TUU.$(SUFFIX) strsv_TUN.$(SUFFIX) strsv_TLU.$(SUFFIX) strsv_TLN.$(SUFFIX) + +DBLASOBJS = \ + dgbmv_n.$(SUFFIX) dgbmv_t.$(SUFFIX) \ + dsbmv_U.$(SUFFIX) dsbmv_L.$(SUFFIX) dspmv_U.$(SUFFIX) dspmv_L.$(SUFFIX) \ + dspr_U.$(SUFFIX) dspr_L.$(SUFFIX) dspr2_U.$(SUFFIX) dspr2_L.$(SUFFIX) \ + dsyr_U.$(SUFFIX) dsyr_L.$(SUFFIX) dsyr2_U.$(SUFFIX) dsyr2_L.$(SUFFIX) \ + dtbmv_NUU.$(SUFFIX) dtbmv_NUN.$(SUFFIX) dtbmv_NLU.$(SUFFIX) dtbmv_NLN.$(SUFFIX) \ + dtbmv_TUU.$(SUFFIX) dtbmv_TUN.$(SUFFIX) dtbmv_TLU.$(SUFFIX) dtbmv_TLN.$(SUFFIX) \ + dtbsv_NUU.$(SUFFIX) dtbsv_NUN.$(SUFFIX) dtbsv_NLU.$(SUFFIX) dtbsv_NLN.$(SUFFIX) \ + dtbsv_TUU.$(SUFFIX) dtbsv_TUN.$(SUFFIX) dtbsv_TLU.$(SUFFIX) dtbsv_TLN.$(SUFFIX) \ + dtpmv_NUU.$(SUFFIX) dtpmv_NUN.$(SUFFIX) dtpmv_NLU.$(SUFFIX) dtpmv_NLN.$(SUFFIX) \ + dtpmv_TUU.$(SUFFIX) dtpmv_TUN.$(SUFFIX) dtpmv_TLU.$(SUFFIX) dtpmv_TLN.$(SUFFIX) \ + dtpsv_NUU.$(SUFFIX) dtpsv_NUN.$(SUFFIX) dtpsv_NLU.$(SUFFIX) dtpsv_NLN.$(SUFFIX) \ + dtpsv_TUU.$(SUFFIX) dtpsv_TUN.$(SUFFIX) dtpsv_TLU.$(SUFFIX) dtpsv_TLN.$(SUFFIX) \ + dtrmv_NUU.$(SUFFIX) dtrmv_NUN.$(SUFFIX) dtrmv_NLU.$(SUFFIX) dtrmv_NLN.$(SUFFIX) \ + dtrmv_TUU.$(SUFFIX) dtrmv_TUN.$(SUFFIX) dtrmv_TLU.$(SUFFIX) dtrmv_TLN.$(SUFFIX) \ + dtrsv_NUU.$(SUFFIX) dtrsv_NUN.$(SUFFIX) dtrsv_NLU.$(SUFFIX) dtrsv_NLN.$(SUFFIX) \ + dtrsv_TUU.$(SUFFIX) dtrsv_TUN.$(SUFFIX) dtrsv_TLU.$(SUFFIX) dtrsv_TLN.$(SUFFIX) + +QBLASOBJS = \ + qgbmv_n.$(SUFFIX) qgbmv_t.$(SUFFIX) \ + qsbmv_U.$(SUFFIX) qsbmv_L.$(SUFFIX) qspmv_U.$(SUFFIX) qspmv_L.$(SUFFIX) \ + qspr_U.$(SUFFIX) qspr_L.$(SUFFIX) qspr2_U.$(SUFFIX) qspr2_L.$(SUFFIX) \ + qsyr_U.$(SUFFIX) qsyr_L.$(SUFFIX) qsyr2_U.$(SUFFIX) qsyr2_L.$(SUFFIX) \ + qtbmv_NUU.$(SUFFIX) qtbmv_NUN.$(SUFFIX) qtbmv_NLU.$(SUFFIX) qtbmv_NLN.$(SUFFIX) \ + qtbmv_TUU.$(SUFFIX) qtbmv_TUN.$(SUFFIX) qtbmv_TLU.$(SUFFIX) qtbmv_TLN.$(SUFFIX) \ + qtbsv_NUU.$(SUFFIX) qtbsv_NUN.$(SUFFIX) qtbsv_NLU.$(SUFFIX) qtbsv_NLN.$(SUFFIX) \ + qtbsv_TUU.$(SUFFIX) qtbsv_TUN.$(SUFFIX) qtbsv_TLU.$(SUFFIX) qtbsv_TLN.$(SUFFIX) \ + qtpmv_NUU.$(SUFFIX) qtpmv_NUN.$(SUFFIX) qtpmv_NLU.$(SUFFIX) qtpmv_NLN.$(SUFFIX) \ + qtpmv_TUU.$(SUFFIX) qtpmv_TUN.$(SUFFIX) qtpmv_TLU.$(SUFFIX) qtpmv_TLN.$(SUFFIX) \ + qtpsv_NUU.$(SUFFIX) qtpsv_NUN.$(SUFFIX) qtpsv_NLU.$(SUFFIX) qtpsv_NLN.$(SUFFIX) \ + qtpsv_TUU.$(SUFFIX) qtpsv_TUN.$(SUFFIX) qtpsv_TLU.$(SUFFIX) qtpsv_TLN.$(SUFFIX) \ + qtrmv_NUU.$(SUFFIX) qtrmv_NUN.$(SUFFIX) qtrmv_NLU.$(SUFFIX) qtrmv_NLN.$(SUFFIX) \ + qtrmv_TUU.$(SUFFIX) qtrmv_TUN.$(SUFFIX) qtrmv_TLU.$(SUFFIX) qtrmv_TLN.$(SUFFIX) \ + qtrsv_NUU.$(SUFFIX) qtrsv_NUN.$(SUFFIX) qtrsv_NLU.$(SUFFIX) qtrsv_NLN.$(SUFFIX) \ + qtrsv_TUU.$(SUFFIX) qtrsv_TUN.$(SUFFIX) qtrsv_TLU.$(SUFFIX) qtrsv_TLN.$(SUFFIX) + +CBLASOBJS += \ + cgbmv_n.$(SUFFIX) cgbmv_t.$(SUFFIX) cgbmv_r.$(SUFFIX) cgbmv_c.$(SUFFIX) \ + cgbmv_o.$(SUFFIX) cgbmv_u.$(SUFFIX) cgbmv_s.$(SUFFIX) cgbmv_d.$(SUFFIX) \ + chbmv_U.$(SUFFIX) chbmv_L.$(SUFFIX) chbmv_V.$(SUFFIX) chbmv_M.$(SUFFIX) \ + cher_U.$(SUFFIX) cher_L.$(SUFFIX) cher_V.$(SUFFIX) cher_M.$(SUFFIX) \ + cher2_U.$(SUFFIX) cher2_L.$(SUFFIX) cher2_V.$(SUFFIX) cher2_M.$(SUFFIX) \ + chpmv_U.$(SUFFIX) chpmv_L.$(SUFFIX) chpmv_V.$(SUFFIX) chpmv_M.$(SUFFIX) \ + chpr_U.$(SUFFIX) chpr_L.$(SUFFIX) chpr_V.$(SUFFIX) chpr_M.$(SUFFIX) \ + chpr2_U.$(SUFFIX) chpr2_L.$(SUFFIX) chpr2_V.$(SUFFIX) chpr2_M.$(SUFFIX) \ + csbmv_U.$(SUFFIX) csbmv_L.$(SUFFIX) cspmv_U.$(SUFFIX) cspmv_L.$(SUFFIX) \ + cspr_U.$(SUFFIX) cspr_L.$(SUFFIX) cspr2_U.$(SUFFIX) cspr2_L.$(SUFFIX) \ + csyr_U.$(SUFFIX) csyr_L.$(SUFFIX) csyr2_U.$(SUFFIX) csyr2_L.$(SUFFIX) \ + ctbmv_NUU.$(SUFFIX) ctbmv_NUN.$(SUFFIX) ctbmv_NLU.$(SUFFIX) ctbmv_NLN.$(SUFFIX) \ + ctbmv_TUU.$(SUFFIX) ctbmv_TUN.$(SUFFIX) ctbmv_TLU.$(SUFFIX) ctbmv_TLN.$(SUFFIX) \ + ctbmv_RUU.$(SUFFIX) ctbmv_RUN.$(SUFFIX) ctbmv_RLU.$(SUFFIX) ctbmv_RLN.$(SUFFIX) \ + ctbmv_CUU.$(SUFFIX) ctbmv_CUN.$(SUFFIX) ctbmv_CLU.$(SUFFIX) ctbmv_CLN.$(SUFFIX) \ + ctbsv_NUU.$(SUFFIX) ctbsv_NUN.$(SUFFIX) ctbsv_NLU.$(SUFFIX) ctbsv_NLN.$(SUFFIX) \ + ctbsv_TUU.$(SUFFIX) ctbsv_TUN.$(SUFFIX) ctbsv_TLU.$(SUFFIX) ctbsv_TLN.$(SUFFIX) \ + ctbsv_RUU.$(SUFFIX) ctbsv_RUN.$(SUFFIX) ctbsv_RLU.$(SUFFIX) ctbsv_RLN.$(SUFFIX) \ + ctbsv_CUU.$(SUFFIX) ctbsv_CUN.$(SUFFIX) ctbsv_CLU.$(SUFFIX) ctbsv_CLN.$(SUFFIX) \ + ctpmv_NUU.$(SUFFIX) ctpmv_NUN.$(SUFFIX) ctpmv_NLU.$(SUFFIX) ctpmv_NLN.$(SUFFIX) \ + ctpmv_TUU.$(SUFFIX) ctpmv_TUN.$(SUFFIX) ctpmv_TLU.$(SUFFIX) ctpmv_TLN.$(SUFFIX) \ + ctpmv_RUU.$(SUFFIX) ctpmv_RUN.$(SUFFIX) ctpmv_RLU.$(SUFFIX) ctpmv_RLN.$(SUFFIX) \ + ctpmv_CUU.$(SUFFIX) ctpmv_CUN.$(SUFFIX) ctpmv_CLU.$(SUFFIX) ctpmv_CLN.$(SUFFIX) \ + ctpsv_NUU.$(SUFFIX) ctpsv_NUN.$(SUFFIX) ctpsv_NLU.$(SUFFIX) ctpsv_NLN.$(SUFFIX) \ + ctpsv_TUU.$(SUFFIX) ctpsv_TUN.$(SUFFIX) ctpsv_TLU.$(SUFFIX) ctpsv_TLN.$(SUFFIX) \ + ctpsv_RUU.$(SUFFIX) ctpsv_RUN.$(SUFFIX) ctpsv_RLU.$(SUFFIX) ctpsv_RLN.$(SUFFIX) \ + ctpsv_CUU.$(SUFFIX) ctpsv_CUN.$(SUFFIX) ctpsv_CLU.$(SUFFIX) ctpsv_CLN.$(SUFFIX) \ + ctrmv_NUU.$(SUFFIX) ctrmv_NUN.$(SUFFIX) ctrmv_NLU.$(SUFFIX) ctrmv_NLN.$(SUFFIX) \ + ctrmv_TUU.$(SUFFIX) ctrmv_TUN.$(SUFFIX) ctrmv_TLU.$(SUFFIX) ctrmv_TLN.$(SUFFIX) \ + ctrmv_RUU.$(SUFFIX) ctrmv_RUN.$(SUFFIX) ctrmv_RLU.$(SUFFIX) ctrmv_RLN.$(SUFFIX) \ + ctrmv_CUU.$(SUFFIX) ctrmv_CUN.$(SUFFIX) ctrmv_CLU.$(SUFFIX) ctrmv_CLN.$(SUFFIX) \ + ctrsv_NUU.$(SUFFIX) ctrsv_NUN.$(SUFFIX) ctrsv_NLU.$(SUFFIX) ctrsv_NLN.$(SUFFIX) \ + ctrsv_TUU.$(SUFFIX) ctrsv_TUN.$(SUFFIX) ctrsv_TLU.$(SUFFIX) ctrsv_TLN.$(SUFFIX) \ + ctrsv_RUU.$(SUFFIX) ctrsv_RUN.$(SUFFIX) ctrsv_RLU.$(SUFFIX) ctrsv_RLN.$(SUFFIX) \ + ctrsv_CUU.$(SUFFIX) ctrsv_CUN.$(SUFFIX) ctrsv_CLU.$(SUFFIX) ctrsv_CLN.$(SUFFIX) + +ZBLASOBJS += \ + zgbmv_n.$(SUFFIX) zgbmv_t.$(SUFFIX) zgbmv_r.$(SUFFIX) zgbmv_c.$(SUFFIX) \ + zgbmv_o.$(SUFFIX) zgbmv_u.$(SUFFIX) zgbmv_s.$(SUFFIX) zgbmv_d.$(SUFFIX) \ + zhbmv_U.$(SUFFIX) zhbmv_L.$(SUFFIX) zhbmv_V.$(SUFFIX) zhbmv_M.$(SUFFIX) \ + zher_U.$(SUFFIX) zher_L.$(SUFFIX) zher_V.$(SUFFIX) zher_M.$(SUFFIX) \ + zher2_U.$(SUFFIX) zher2_L.$(SUFFIX) zher2_V.$(SUFFIX) zher2_M.$(SUFFIX) \ + zhpmv_U.$(SUFFIX) zhpmv_L.$(SUFFIX) zhpmv_V.$(SUFFIX) zhpmv_M.$(SUFFIX) \ + zhpr_U.$(SUFFIX) zhpr_L.$(SUFFIX) zhpr_V.$(SUFFIX) zhpr_M.$(SUFFIX) \ + zhpr2_U.$(SUFFIX) zhpr2_L.$(SUFFIX) zhpr2_V.$(SUFFIX) zhpr2_M.$(SUFFIX) \ + zsbmv_U.$(SUFFIX) zsbmv_L.$(SUFFIX) zspmv_U.$(SUFFIX) zspmv_L.$(SUFFIX) \ + zspr_U.$(SUFFIX) zspr_L.$(SUFFIX) zspr2_U.$(SUFFIX) zspr2_L.$(SUFFIX) \ + zsyr_U.$(SUFFIX) zsyr_L.$(SUFFIX) zsyr2_U.$(SUFFIX) zsyr2_L.$(SUFFIX) \ + ztbmv_NUU.$(SUFFIX) ztbmv_NUN.$(SUFFIX) ztbmv_NLU.$(SUFFIX) ztbmv_NLN.$(SUFFIX) \ + ztbmv_TUU.$(SUFFIX) ztbmv_TUN.$(SUFFIX) ztbmv_TLU.$(SUFFIX) ztbmv_TLN.$(SUFFIX) \ + ztbmv_RUU.$(SUFFIX) ztbmv_RUN.$(SUFFIX) ztbmv_RLU.$(SUFFIX) ztbmv_RLN.$(SUFFIX) \ + ztbmv_CUU.$(SUFFIX) ztbmv_CUN.$(SUFFIX) ztbmv_CLU.$(SUFFIX) ztbmv_CLN.$(SUFFIX) \ + ztbsv_NUU.$(SUFFIX) ztbsv_NUN.$(SUFFIX) ztbsv_NLU.$(SUFFIX) ztbsv_NLN.$(SUFFIX) \ + ztbsv_TUU.$(SUFFIX) ztbsv_TUN.$(SUFFIX) ztbsv_TLU.$(SUFFIX) ztbsv_TLN.$(SUFFIX) \ + ztbsv_RUU.$(SUFFIX) ztbsv_RUN.$(SUFFIX) ztbsv_RLU.$(SUFFIX) ztbsv_RLN.$(SUFFIX) \ + ztbsv_CUU.$(SUFFIX) ztbsv_CUN.$(SUFFIX) ztbsv_CLU.$(SUFFIX) ztbsv_CLN.$(SUFFIX) \ + ztpmv_NUU.$(SUFFIX) ztpmv_NUN.$(SUFFIX) ztpmv_NLU.$(SUFFIX) ztpmv_NLN.$(SUFFIX) \ + ztpmv_TUU.$(SUFFIX) ztpmv_TUN.$(SUFFIX) ztpmv_TLU.$(SUFFIX) ztpmv_TLN.$(SUFFIX) \ + ztpmv_RUU.$(SUFFIX) ztpmv_RUN.$(SUFFIX) ztpmv_RLU.$(SUFFIX) ztpmv_RLN.$(SUFFIX) \ + ztpmv_CUU.$(SUFFIX) ztpmv_CUN.$(SUFFIX) ztpmv_CLU.$(SUFFIX) ztpmv_CLN.$(SUFFIX) \ + ztpsv_NUU.$(SUFFIX) ztpsv_NUN.$(SUFFIX) ztpsv_NLU.$(SUFFIX) ztpsv_NLN.$(SUFFIX) \ + ztpsv_TUU.$(SUFFIX) ztpsv_TUN.$(SUFFIX) ztpsv_TLU.$(SUFFIX) ztpsv_TLN.$(SUFFIX) \ + ztpsv_RUU.$(SUFFIX) ztpsv_RUN.$(SUFFIX) ztpsv_RLU.$(SUFFIX) ztpsv_RLN.$(SUFFIX) \ + ztpsv_CUU.$(SUFFIX) ztpsv_CUN.$(SUFFIX) ztpsv_CLU.$(SUFFIX) ztpsv_CLN.$(SUFFIX) \ + ztrmv_NUU.$(SUFFIX) ztrmv_NUN.$(SUFFIX) ztrmv_NLU.$(SUFFIX) ztrmv_NLN.$(SUFFIX) \ + ztrmv_TUU.$(SUFFIX) ztrmv_TUN.$(SUFFIX) ztrmv_TLU.$(SUFFIX) ztrmv_TLN.$(SUFFIX) \ + ztrmv_RUU.$(SUFFIX) ztrmv_RUN.$(SUFFIX) ztrmv_RLU.$(SUFFIX) ztrmv_RLN.$(SUFFIX) \ + ztrmv_CUU.$(SUFFIX) ztrmv_CUN.$(SUFFIX) ztrmv_CLU.$(SUFFIX) ztrmv_CLN.$(SUFFIX) \ + ztrsv_NUU.$(SUFFIX) ztrsv_NUN.$(SUFFIX) ztrsv_NLU.$(SUFFIX) ztrsv_NLN.$(SUFFIX) \ + ztrsv_TUU.$(SUFFIX) ztrsv_TUN.$(SUFFIX) ztrsv_TLU.$(SUFFIX) ztrsv_TLN.$(SUFFIX) \ + ztrsv_RUU.$(SUFFIX) ztrsv_RUN.$(SUFFIX) ztrsv_RLU.$(SUFFIX) ztrsv_RLN.$(SUFFIX) \ + ztrsv_CUU.$(SUFFIX) ztrsv_CUN.$(SUFFIX) ztrsv_CLU.$(SUFFIX) ztrsv_CLN.$(SUFFIX) + +XBLASOBJS += \ + xgbmv_n.$(SUFFIX) xgbmv_t.$(SUFFIX) xgbmv_r.$(SUFFIX) xgbmv_c.$(SUFFIX) \ + xgbmv_o.$(SUFFIX) xgbmv_u.$(SUFFIX) xgbmv_s.$(SUFFIX) xgbmv_d.$(SUFFIX) \ + xhbmv_U.$(SUFFIX) xhbmv_L.$(SUFFIX) xhbmv_V.$(SUFFIX) xhbmv_M.$(SUFFIX) \ + xher_U.$(SUFFIX) xher_L.$(SUFFIX) xher_V.$(SUFFIX) xher_M.$(SUFFIX) \ + xher2_U.$(SUFFIX) xher2_L.$(SUFFIX) xher2_V.$(SUFFIX) xher2_M.$(SUFFIX) \ + xhpmv_U.$(SUFFIX) xhpmv_L.$(SUFFIX) xhpmv_V.$(SUFFIX) xhpmv_M.$(SUFFIX) \ + xhpr_U.$(SUFFIX) xhpr_L.$(SUFFIX) xhpr_V.$(SUFFIX) xhpr_M.$(SUFFIX) \ + xhpr2_U.$(SUFFIX) xhpr2_L.$(SUFFIX) xhpr2_V.$(SUFFIX) xhpr2_M.$(SUFFIX) \ + xsbmv_U.$(SUFFIX) xsbmv_L.$(SUFFIX) xspmv_U.$(SUFFIX) xspmv_L.$(SUFFIX) \ + xspr_U.$(SUFFIX) xspr_L.$(SUFFIX) xspr2_U.$(SUFFIX) xspr2_L.$(SUFFIX) \ + xsyr_U.$(SUFFIX) xsyr_L.$(SUFFIX) xsyr2_U.$(SUFFIX) xsyr2_L.$(SUFFIX) \ + xtbmv_NUU.$(SUFFIX) xtbmv_NUN.$(SUFFIX) xtbmv_NLU.$(SUFFIX) xtbmv_NLN.$(SUFFIX) \ + xtbmv_TUU.$(SUFFIX) xtbmv_TUN.$(SUFFIX) xtbmv_TLU.$(SUFFIX) xtbmv_TLN.$(SUFFIX) \ + xtbmv_RUU.$(SUFFIX) xtbmv_RUN.$(SUFFIX) xtbmv_RLU.$(SUFFIX) xtbmv_RLN.$(SUFFIX) \ + xtbmv_CUU.$(SUFFIX) xtbmv_CUN.$(SUFFIX) xtbmv_CLU.$(SUFFIX) xtbmv_CLN.$(SUFFIX) \ + xtbsv_NUU.$(SUFFIX) xtbsv_NUN.$(SUFFIX) xtbsv_NLU.$(SUFFIX) xtbsv_NLN.$(SUFFIX) \ + xtbsv_TUU.$(SUFFIX) xtbsv_TUN.$(SUFFIX) xtbsv_TLU.$(SUFFIX) xtbsv_TLN.$(SUFFIX) \ + xtbsv_RUU.$(SUFFIX) xtbsv_RUN.$(SUFFIX) xtbsv_RLU.$(SUFFIX) xtbsv_RLN.$(SUFFIX) \ + xtbsv_CUU.$(SUFFIX) xtbsv_CUN.$(SUFFIX) xtbsv_CLU.$(SUFFIX) xtbsv_CLN.$(SUFFIX) \ + xtpmv_NUU.$(SUFFIX) xtpmv_NUN.$(SUFFIX) xtpmv_NLU.$(SUFFIX) xtpmv_NLN.$(SUFFIX) \ + xtpmv_TUU.$(SUFFIX) xtpmv_TUN.$(SUFFIX) xtpmv_TLU.$(SUFFIX) xtpmv_TLN.$(SUFFIX) \ + xtpmv_RUU.$(SUFFIX) xtpmv_RUN.$(SUFFIX) xtpmv_RLU.$(SUFFIX) xtpmv_RLN.$(SUFFIX) \ + xtpmv_CUU.$(SUFFIX) xtpmv_CUN.$(SUFFIX) xtpmv_CLU.$(SUFFIX) xtpmv_CLN.$(SUFFIX) \ + xtpsv_NUU.$(SUFFIX) xtpsv_NUN.$(SUFFIX) xtpsv_NLU.$(SUFFIX) xtpsv_NLN.$(SUFFIX) \ + xtpsv_TUU.$(SUFFIX) xtpsv_TUN.$(SUFFIX) xtpsv_TLU.$(SUFFIX) xtpsv_TLN.$(SUFFIX) \ + xtpsv_RUU.$(SUFFIX) xtpsv_RUN.$(SUFFIX) xtpsv_RLU.$(SUFFIX) xtpsv_RLN.$(SUFFIX) \ + xtpsv_CUU.$(SUFFIX) xtpsv_CUN.$(SUFFIX) xtpsv_CLU.$(SUFFIX) xtpsv_CLN.$(SUFFIX) \ + xtrmv_NUU.$(SUFFIX) xtrmv_NUN.$(SUFFIX) xtrmv_NLU.$(SUFFIX) xtrmv_NLN.$(SUFFIX) \ + xtrmv_TUU.$(SUFFIX) xtrmv_TUN.$(SUFFIX) xtrmv_TLU.$(SUFFIX) xtrmv_TLN.$(SUFFIX) \ + xtrmv_RUU.$(SUFFIX) xtrmv_RUN.$(SUFFIX) xtrmv_RLU.$(SUFFIX) xtrmv_RLN.$(SUFFIX) \ + xtrmv_CUU.$(SUFFIX) xtrmv_CUN.$(SUFFIX) xtrmv_CLU.$(SUFFIX) xtrmv_CLN.$(SUFFIX) \ + xtrsv_NUU.$(SUFFIX) xtrsv_NUN.$(SUFFIX) xtrsv_NLU.$(SUFFIX) xtrsv_NLN.$(SUFFIX) \ + xtrsv_TUU.$(SUFFIX) xtrsv_TUN.$(SUFFIX) xtrsv_TLU.$(SUFFIX) xtrsv_TLN.$(SUFFIX) \ + xtrsv_RUU.$(SUFFIX) xtrsv_RUN.$(SUFFIX) xtrsv_RLU.$(SUFFIX) xtrsv_RLN.$(SUFFIX) \ + xtrsv_CUU.$(SUFFIX) xtrsv_CUN.$(SUFFIX) xtrsv_CLU.$(SUFFIX) xtrsv_CLN.$(SUFFIX) + +HPLOBJS = \ + dtrsv_NLU.$(SUFFIX) dtrsv_NUN.$(SUFFIX) dtrsv_NUU.$(SUFFIX) dtrsv_NLN.$(SUFFIX) \ + dtrsv_TLN.$(SUFFIX) dtrsv_TLU.$(SUFFIX) dtrsv_TUN.$(SUFFIX) dtrsv_TUU.$(SUFFIX) + +ifdef SMP +SBLASOBJS += \ + sgemv_thread_n.$(SUFFIX) sgemv_thread_t.$(SUFFIX) \ + sger_thread.$(SUFFIX) \ + ssymv_thread_U.$(SUFFIX) ssymv_thread_L.$(SUFFIX) \ + ssyr_thread_U.$(SUFFIX) ssyr_thread_L.$(SUFFIX) \ + ssyr2_thread_U.$(SUFFIX) ssyr2_thread_L.$(SUFFIX) \ + sspr_thread_U.$(SUFFIX) sspr_thread_L.$(SUFFIX) \ + sspr2_thread_U.$(SUFFIX) sspr2_thread_L.$(SUFFIX) \ + strmv_thread_NUU.$(SUFFIX) strmv_thread_NUN.$(SUFFIX) \ + strmv_thread_NLU.$(SUFFIX) strmv_thread_NLN.$(SUFFIX) \ + strmv_thread_TUU.$(SUFFIX) strmv_thread_TUN.$(SUFFIX) \ + strmv_thread_TLU.$(SUFFIX) strmv_thread_TLN.$(SUFFIX) \ + sspmv_thread_U.$(SUFFIX) sspmv_thread_L.$(SUFFIX) \ + stpmv_thread_NUU.$(SUFFIX) stpmv_thread_NUN.$(SUFFIX) \ + stpmv_thread_NLU.$(SUFFIX) stpmv_thread_NLN.$(SUFFIX) \ + stpmv_thread_TUU.$(SUFFIX) stpmv_thread_TUN.$(SUFFIX) \ + stpmv_thread_TLU.$(SUFFIX) stpmv_thread_TLN.$(SUFFIX) \ + sgbmv_thread_n.$(SUFFIX) sgbmv_thread_t.$(SUFFIX) \ + ssbmv_thread_U.$(SUFFIX) ssbmv_thread_L.$(SUFFIX) \ + stbmv_thread_NUU.$(SUFFIX) stbmv_thread_NUN.$(SUFFIX) \ + stbmv_thread_NLU.$(SUFFIX) stbmv_thread_NLN.$(SUFFIX) \ + stbmv_thread_TUU.$(SUFFIX) stbmv_thread_TUN.$(SUFFIX) \ + stbmv_thread_TLU.$(SUFFIX) stbmv_thread_TLN.$(SUFFIX) \ + +DBLASOBJS += \ + dgemv_thread_n.$(SUFFIX) dgemv_thread_t.$(SUFFIX) \ + dger_thread.$(SUFFIX) \ + dsymv_thread_U.$(SUFFIX) dsymv_thread_L.$(SUFFIX) \ + dsyr_thread_U.$(SUFFIX) dsyr_thread_L.$(SUFFIX) \ + dsyr2_thread_U.$(SUFFIX) dsyr2_thread_L.$(SUFFIX) \ + dspr_thread_U.$(SUFFIX) dspr_thread_L.$(SUFFIX) \ + dspr2_thread_U.$(SUFFIX) dspr2_thread_L.$(SUFFIX) \ + dtrmv_thread_NUU.$(SUFFIX) dtrmv_thread_NUN.$(SUFFIX) \ + dtrmv_thread_NLU.$(SUFFIX) dtrmv_thread_NLN.$(SUFFIX) \ + dtrmv_thread_TUU.$(SUFFIX) dtrmv_thread_TUN.$(SUFFIX) \ + dtrmv_thread_TLU.$(SUFFIX) dtrmv_thread_TLN.$(SUFFIX) \ + dspmv_thread_U.$(SUFFIX) dspmv_thread_L.$(SUFFIX) \ + dtpmv_thread_NUU.$(SUFFIX) dtpmv_thread_NUN.$(SUFFIX) \ + dtpmv_thread_NLU.$(SUFFIX) dtpmv_thread_NLN.$(SUFFIX) \ + dtpmv_thread_TUU.$(SUFFIX) dtpmv_thread_TUN.$(SUFFIX) \ + dtpmv_thread_TLU.$(SUFFIX) dtpmv_thread_TLN.$(SUFFIX) \ + dgbmv_thread_n.$(SUFFIX) dgbmv_thread_t.$(SUFFIX) \ + dsbmv_thread_U.$(SUFFIX) dsbmv_thread_L.$(SUFFIX) \ + dtbmv_thread_NUU.$(SUFFIX) dtbmv_thread_NUN.$(SUFFIX) \ + dtbmv_thread_NLU.$(SUFFIX) dtbmv_thread_NLN.$(SUFFIX) \ + dtbmv_thread_TUU.$(SUFFIX) dtbmv_thread_TUN.$(SUFFIX) \ + dtbmv_thread_TLU.$(SUFFIX) dtbmv_thread_TLN.$(SUFFIX) \ + +QBLASOBJS += \ + qgemv_thread_n.$(SUFFIX) qgemv_thread_t.$(SUFFIX) \ + qger_thread.$(SUFFIX) \ + qsymv_thread_U.$(SUFFIX) qsymv_thread_L.$(SUFFIX) \ + qsyr_thread_U.$(SUFFIX) qsyr_thread_L.$(SUFFIX) \ + qsyr2_thread_U.$(SUFFIX) qsyr2_thread_L.$(SUFFIX) \ + qspr_thread_U.$(SUFFIX) qspr_thread_L.$(SUFFIX) \ + qspr2_thread_U.$(SUFFIX) qspr2_thread_L.$(SUFFIX) \ + qtrmv_thread_NUU.$(SUFFIX) qtrmv_thread_NUN.$(SUFFIX) \ + qtrmv_thread_NLU.$(SUFFIX) qtrmv_thread_NLN.$(SUFFIX) \ + qtrmv_thread_TUU.$(SUFFIX) qtrmv_thread_TUN.$(SUFFIX) \ + qtrmv_thread_TLU.$(SUFFIX) qtrmv_thread_TLN.$(SUFFIX) \ + qspmv_thread_U.$(SUFFIX) qspmv_thread_L.$(SUFFIX) \ + qtpmv_thread_NUU.$(SUFFIX) qtpmv_thread_NUN.$(SUFFIX) \ + qtpmv_thread_NLU.$(SUFFIX) qtpmv_thread_NLN.$(SUFFIX) \ + qtpmv_thread_TUU.$(SUFFIX) qtpmv_thread_TUN.$(SUFFIX) \ + qtpmv_thread_TLU.$(SUFFIX) qtpmv_thread_TLN.$(SUFFIX) \ + qgbmv_thread_n.$(SUFFIX) qgbmv_thread_t.$(SUFFIX) \ + qsbmv_thread_U.$(SUFFIX) qsbmv_thread_L.$(SUFFIX) \ + qtbmv_thread_NUU.$(SUFFIX) qtbmv_thread_NUN.$(SUFFIX) \ + qtbmv_thread_NLU.$(SUFFIX) qtbmv_thread_NLN.$(SUFFIX) \ + qtbmv_thread_TUU.$(SUFFIX) qtbmv_thread_TUN.$(SUFFIX) \ + qtbmv_thread_TLU.$(SUFFIX) qtbmv_thread_TLN.$(SUFFIX) \ + +CBLASOBJS += \ + cgemv_thread_n.$(SUFFIX) cgemv_thread_t.$(SUFFIX) \ + cgemv_thread_r.$(SUFFIX) cgemv_thread_c.$(SUFFIX) \ + cgemv_thread_o.$(SUFFIX) cgemv_thread_u.$(SUFFIX) \ + cgemv_thread_s.$(SUFFIX) cgemv_thread_d.$(SUFFIX) \ + cger_thread_U.$(SUFFIX) cger_thread_C.$(SUFFIX) \ + cger_thread_V.$(SUFFIX) cger_thread_D.$(SUFFIX) \ + csymv_thread_U.$(SUFFIX) csymv_thread_L.$(SUFFIX) \ + chemv_thread_U.$(SUFFIX) chemv_thread_L.$(SUFFIX) \ + chemv_thread_V.$(SUFFIX) chemv_thread_M.$(SUFFIX) \ + csyr_thread_U.$(SUFFIX) csyr_thread_L.$(SUFFIX) \ + cher_thread_U.$(SUFFIX) cher_thread_L.$(SUFFIX) \ + cher_thread_V.$(SUFFIX) cher_thread_M.$(SUFFIX) \ + csyr2_thread_U.$(SUFFIX) csyr2_thread_L.$(SUFFIX) \ + cher2_thread_U.$(SUFFIX) cher2_thread_L.$(SUFFIX) \ + cher2_thread_V.$(SUFFIX) cher2_thread_M.$(SUFFIX) \ + cspr_thread_U.$(SUFFIX) cspr_thread_L.$(SUFFIX) \ + chpr_thread_U.$(SUFFIX) chpr_thread_L.$(SUFFIX) \ + chpr_thread_V.$(SUFFIX) chpr_thread_M.$(SUFFIX) \ + cspr2_thread_U.$(SUFFIX) cspr2_thread_L.$(SUFFIX) \ + chpr2_thread_U.$(SUFFIX) chpr2_thread_L.$(SUFFIX) \ + chpr2_thread_V.$(SUFFIX) chpr2_thread_M.$(SUFFIX) \ + ctrmv_thread_NUU.$(SUFFIX) ctrmv_thread_NUN.$(SUFFIX) \ + ctrmv_thread_NLU.$(SUFFIX) ctrmv_thread_NLN.$(SUFFIX) \ + ctrmv_thread_TUU.$(SUFFIX) ctrmv_thread_TUN.$(SUFFIX) \ + ctrmv_thread_TLU.$(SUFFIX) ctrmv_thread_TLN.$(SUFFIX) \ + ctrmv_thread_RUU.$(SUFFIX) ctrmv_thread_RUN.$(SUFFIX) \ + ctrmv_thread_RLU.$(SUFFIX) ctrmv_thread_RLN.$(SUFFIX) \ + ctrmv_thread_CUU.$(SUFFIX) ctrmv_thread_CUN.$(SUFFIX) \ + ctrmv_thread_CLU.$(SUFFIX) ctrmv_thread_CLN.$(SUFFIX) \ + cspmv_thread_U.$(SUFFIX) cspmv_thread_L.$(SUFFIX) \ + chpmv_thread_U.$(SUFFIX) chpmv_thread_L.$(SUFFIX) \ + chpmv_thread_V.$(SUFFIX) chpmv_thread_M.$(SUFFIX) \ + ctpmv_thread_NUU.$(SUFFIX) ctpmv_thread_NUN.$(SUFFIX) \ + ctpmv_thread_NLU.$(SUFFIX) ctpmv_thread_NLN.$(SUFFIX) \ + ctpmv_thread_TUU.$(SUFFIX) ctpmv_thread_TUN.$(SUFFIX) \ + ctpmv_thread_TLU.$(SUFFIX) ctpmv_thread_TLN.$(SUFFIX) \ + ctpmv_thread_RUU.$(SUFFIX) ctpmv_thread_RUN.$(SUFFIX) \ + ctpmv_thread_RLU.$(SUFFIX) ctpmv_thread_RLN.$(SUFFIX) \ + ctpmv_thread_CUU.$(SUFFIX) ctpmv_thread_CUN.$(SUFFIX) \ + ctpmv_thread_CLU.$(SUFFIX) ctpmv_thread_CLN.$(SUFFIX) \ + cgbmv_thread_n.$(SUFFIX) cgbmv_thread_t.$(SUFFIX) \ + cgbmv_thread_r.$(SUFFIX) cgbmv_thread_c.$(SUFFIX) \ + cgbmv_thread_o.$(SUFFIX) cgbmv_thread_u.$(SUFFIX) \ + cgbmv_thread_s.$(SUFFIX) cgbmv_thread_d.$(SUFFIX) \ + csbmv_thread_U.$(SUFFIX) csbmv_thread_L.$(SUFFIX) \ + chbmv_thread_U.$(SUFFIX) chbmv_thread_L.$(SUFFIX) \ + chbmv_thread_V.$(SUFFIX) chbmv_thread_M.$(SUFFIX) \ + ctbmv_thread_NUU.$(SUFFIX) ctbmv_thread_NUN.$(SUFFIX) \ + ctbmv_thread_NLU.$(SUFFIX) ctbmv_thread_NLN.$(SUFFIX) \ + ctbmv_thread_TUU.$(SUFFIX) ctbmv_thread_TUN.$(SUFFIX) \ + ctbmv_thread_TLU.$(SUFFIX) ctbmv_thread_TLN.$(SUFFIX) \ + ctbmv_thread_RUU.$(SUFFIX) ctbmv_thread_RUN.$(SUFFIX) \ + ctbmv_thread_RLU.$(SUFFIX) ctbmv_thread_RLN.$(SUFFIX) \ + ctbmv_thread_CUU.$(SUFFIX) ctbmv_thread_CUN.$(SUFFIX) \ + ctbmv_thread_CLU.$(SUFFIX) ctbmv_thread_CLN.$(SUFFIX) \ + + +ZBLASOBJS += \ + zgemv_thread_n.$(SUFFIX) zgemv_thread_t.$(SUFFIX) \ + zgemv_thread_r.$(SUFFIX) zgemv_thread_c.$(SUFFIX) \ + zgemv_thread_o.$(SUFFIX) zgemv_thread_u.$(SUFFIX) \ + zgemv_thread_s.$(SUFFIX) zgemv_thread_d.$(SUFFIX) \ + zger_thread_U.$(SUFFIX) zger_thread_C.$(SUFFIX) \ + zger_thread_V.$(SUFFIX) zger_thread_D.$(SUFFIX) \ + zsymv_thread_U.$(SUFFIX) zsymv_thread_L.$(SUFFIX) \ + zhemv_thread_U.$(SUFFIX) zhemv_thread_L.$(SUFFIX) \ + zhemv_thread_V.$(SUFFIX) zhemv_thread_M.$(SUFFIX) \ + zsyr_thread_U.$(SUFFIX) zsyr_thread_L.$(SUFFIX) \ + zher_thread_U.$(SUFFIX) zher_thread_L.$(SUFFIX) \ + zher_thread_V.$(SUFFIX) zher_thread_M.$(SUFFIX) \ + zsyr2_thread_U.$(SUFFIX) zsyr2_thread_L.$(SUFFIX) \ + zher2_thread_U.$(SUFFIX) zher2_thread_L.$(SUFFIX) \ + zher2_thread_V.$(SUFFIX) zher2_thread_M.$(SUFFIX) \ + zspr_thread_U.$(SUFFIX) zspr_thread_L.$(SUFFIX) \ + zhpr_thread_U.$(SUFFIX) zhpr_thread_L.$(SUFFIX) \ + zhpr_thread_V.$(SUFFIX) zhpr_thread_M.$(SUFFIX) \ + zspr2_thread_U.$(SUFFIX) zspr2_thread_L.$(SUFFIX) \ + zhpr2_thread_U.$(SUFFIX) zhpr2_thread_L.$(SUFFIX) \ + zhpr2_thread_V.$(SUFFIX) zhpr2_thread_M.$(SUFFIX) \ + ztrmv_thread_NUU.$(SUFFIX) ztrmv_thread_NUN.$(SUFFIX) \ + ztrmv_thread_NLU.$(SUFFIX) ztrmv_thread_NLN.$(SUFFIX) \ + ztrmv_thread_TUU.$(SUFFIX) ztrmv_thread_TUN.$(SUFFIX) \ + ztrmv_thread_TLU.$(SUFFIX) ztrmv_thread_TLN.$(SUFFIX) \ + ztrmv_thread_RUU.$(SUFFIX) ztrmv_thread_RUN.$(SUFFIX) \ + ztrmv_thread_RLU.$(SUFFIX) ztrmv_thread_RLN.$(SUFFIX) \ + ztrmv_thread_CUU.$(SUFFIX) ztrmv_thread_CUN.$(SUFFIX) \ + ztrmv_thread_CLU.$(SUFFIX) ztrmv_thread_CLN.$(SUFFIX) \ + zspmv_thread_U.$(SUFFIX) zspmv_thread_L.$(SUFFIX) \ + zhpmv_thread_U.$(SUFFIX) zhpmv_thread_L.$(SUFFIX) \ + zhpmv_thread_V.$(SUFFIX) zhpmv_thread_M.$(SUFFIX) \ + ztpmv_thread_NUU.$(SUFFIX) ztpmv_thread_NUN.$(SUFFIX) \ + ztpmv_thread_NLU.$(SUFFIX) ztpmv_thread_NLN.$(SUFFIX) \ + ztpmv_thread_TUU.$(SUFFIX) ztpmv_thread_TUN.$(SUFFIX) \ + ztpmv_thread_TLU.$(SUFFIX) ztpmv_thread_TLN.$(SUFFIX) \ + ztpmv_thread_RUU.$(SUFFIX) ztpmv_thread_RUN.$(SUFFIX) \ + ztpmv_thread_RLU.$(SUFFIX) ztpmv_thread_RLN.$(SUFFIX) \ + ztpmv_thread_CUU.$(SUFFIX) ztpmv_thread_CUN.$(SUFFIX) \ + ztpmv_thread_CLU.$(SUFFIX) ztpmv_thread_CLN.$(SUFFIX) \ + zgbmv_thread_n.$(SUFFIX) zgbmv_thread_t.$(SUFFIX) \ + zgbmv_thread_r.$(SUFFIX) zgbmv_thread_c.$(SUFFIX) \ + zgbmv_thread_o.$(SUFFIX) zgbmv_thread_u.$(SUFFIX) \ + zgbmv_thread_s.$(SUFFIX) zgbmv_thread_d.$(SUFFIX) \ + zsbmv_thread_U.$(SUFFIX) zsbmv_thread_L.$(SUFFIX) \ + zhbmv_thread_U.$(SUFFIX) zhbmv_thread_L.$(SUFFIX) \ + zhbmv_thread_V.$(SUFFIX) zhbmv_thread_M.$(SUFFIX) \ + ztbmv_thread_NUU.$(SUFFIX) ztbmv_thread_NUN.$(SUFFIX) \ + ztbmv_thread_NLU.$(SUFFIX) ztbmv_thread_NLN.$(SUFFIX) \ + ztbmv_thread_TUU.$(SUFFIX) ztbmv_thread_TUN.$(SUFFIX) \ + ztbmv_thread_TLU.$(SUFFIX) ztbmv_thread_TLN.$(SUFFIX) \ + ztbmv_thread_RUU.$(SUFFIX) ztbmv_thread_RUN.$(SUFFIX) \ + ztbmv_thread_RLU.$(SUFFIX) ztbmv_thread_RLN.$(SUFFIX) \ + ztbmv_thread_CUU.$(SUFFIX) ztbmv_thread_CUN.$(SUFFIX) \ + ztbmv_thread_CLU.$(SUFFIX) ztbmv_thread_CLN.$(SUFFIX) \ + +XBLASOBJS += \ + xgemv_thread_n.$(SUFFIX) xgemv_thread_t.$(SUFFIX) \ + xgemv_thread_r.$(SUFFIX) xgemv_thread_c.$(SUFFIX) \ + xgemv_thread_o.$(SUFFIX) xgemv_thread_u.$(SUFFIX) \ + xgemv_thread_s.$(SUFFIX) xgemv_thread_d.$(SUFFIX) \ + xger_thread_U.$(SUFFIX) xger_thread_C.$(SUFFIX) \ + xger_thread_V.$(SUFFIX) xger_thread_D.$(SUFFIX) \ + xsymv_thread_U.$(SUFFIX) xsymv_thread_L.$(SUFFIX) \ + xhemv_thread_U.$(SUFFIX) xhemv_thread_L.$(SUFFIX) \ + xhemv_thread_V.$(SUFFIX) xhemv_thread_M.$(SUFFIX) \ + xsyr_thread_U.$(SUFFIX) xsyr_thread_L.$(SUFFIX) \ + xher_thread_U.$(SUFFIX) xher_thread_L.$(SUFFIX) \ + xher_thread_V.$(SUFFIX) xher_thread_M.$(SUFFIX) \ + xsyr2_thread_U.$(SUFFIX) xsyr2_thread_L.$(SUFFIX) \ + xher2_thread_U.$(SUFFIX) xher2_thread_L.$(SUFFIX) \ + xher2_thread_V.$(SUFFIX) xher2_thread_M.$(SUFFIX) \ + xspr_thread_U.$(SUFFIX) xspr_thread_L.$(SUFFIX) \ + xhpr_thread_U.$(SUFFIX) xhpr_thread_L.$(SUFFIX) \ + xhpr_thread_V.$(SUFFIX) xhpr_thread_M.$(SUFFIX) \ + xspr2_thread_U.$(SUFFIX) xspr2_thread_L.$(SUFFIX) \ + xhpr2_thread_U.$(SUFFIX) xhpr2_thread_L.$(SUFFIX) \ + xhpr2_thread_V.$(SUFFIX) xhpr2_thread_M.$(SUFFIX) \ + xtrmv_thread_NUU.$(SUFFIX) xtrmv_thread_NUN.$(SUFFIX) \ + xtrmv_thread_NLU.$(SUFFIX) xtrmv_thread_NLN.$(SUFFIX) \ + xtrmv_thread_TUU.$(SUFFIX) xtrmv_thread_TUN.$(SUFFIX) \ + xtrmv_thread_TLU.$(SUFFIX) xtrmv_thread_TLN.$(SUFFIX) \ + xtrmv_thread_RUU.$(SUFFIX) xtrmv_thread_RUN.$(SUFFIX) \ + xtrmv_thread_RLU.$(SUFFIX) xtrmv_thread_RLN.$(SUFFIX) \ + xtrmv_thread_CUU.$(SUFFIX) xtrmv_thread_CUN.$(SUFFIX) \ + xtrmv_thread_CLU.$(SUFFIX) xtrmv_thread_CLN.$(SUFFIX) \ + xspmv_thread_U.$(SUFFIX) xspmv_thread_L.$(SUFFIX) \ + xhpmv_thread_U.$(SUFFIX) xhpmv_thread_L.$(SUFFIX) \ + xhpmv_thread_V.$(SUFFIX) xhpmv_thread_M.$(SUFFIX) \ + xtpmv_thread_NUU.$(SUFFIX) xtpmv_thread_NUN.$(SUFFIX) \ + xtpmv_thread_NLU.$(SUFFIX) xtpmv_thread_NLN.$(SUFFIX) \ + xtpmv_thread_TUU.$(SUFFIX) xtpmv_thread_TUN.$(SUFFIX) \ + xtpmv_thread_TLU.$(SUFFIX) xtpmv_thread_TLN.$(SUFFIX) \ + xtpmv_thread_RUU.$(SUFFIX) xtpmv_thread_RUN.$(SUFFIX) \ + xtpmv_thread_RLU.$(SUFFIX) xtpmv_thread_RLN.$(SUFFIX) \ + xtpmv_thread_CUU.$(SUFFIX) xtpmv_thread_CUN.$(SUFFIX) \ + xtpmv_thread_CLU.$(SUFFIX) xtpmv_thread_CLN.$(SUFFIX) \ + xgbmv_thread_n.$(SUFFIX) xgbmv_thread_t.$(SUFFIX) \ + xgbmv_thread_r.$(SUFFIX) xgbmv_thread_c.$(SUFFIX) \ + xgbmv_thread_o.$(SUFFIX) xgbmv_thread_u.$(SUFFIX) \ + xgbmv_thread_s.$(SUFFIX) xgbmv_thread_d.$(SUFFIX) \ + xsbmv_thread_U.$(SUFFIX) xsbmv_thread_L.$(SUFFIX) \ + xhbmv_thread_U.$(SUFFIX) xhbmv_thread_L.$(SUFFIX) \ + xhbmv_thread_V.$(SUFFIX) xhbmv_thread_M.$(SUFFIX) \ + xtbmv_thread_NUU.$(SUFFIX) xtbmv_thread_NUN.$(SUFFIX) \ + xtbmv_thread_NLU.$(SUFFIX) xtbmv_thread_NLN.$(SUFFIX) \ + xtbmv_thread_TUU.$(SUFFIX) xtbmv_thread_TUN.$(SUFFIX) \ + xtbmv_thread_TLU.$(SUFFIX) xtbmv_thread_TLN.$(SUFFIX) \ + xtbmv_thread_RUU.$(SUFFIX) xtbmv_thread_RUN.$(SUFFIX) \ + xtbmv_thread_RLU.$(SUFFIX) xtbmv_thread_RLN.$(SUFFIX) \ + xtbmv_thread_CUU.$(SUFFIX) xtbmv_thread_CUN.$(SUFFIX) \ + xtbmv_thread_CLU.$(SUFFIX) xtbmv_thread_CLN.$(SUFFIX) \ + +endif + +all :: + +sgbmv_n.$(SUFFIX) sgbmv_n.$(PSUFFIX) : gbmv_k.c + $(CC) -c -UCOMPLEX -UDOUBLE -UTRANS $(CFLAGS) -o $(@F) $< + +sgbmv_t.$(SUFFIX) sgbmv_t.$(PSUFFIX) : gbmv_k.c + $(CC) -c -UCOMPLEX -UDOUBLE -DTRANS $(CFLAGS) -o $(@F) $< + +dgbmv_n.$(SUFFIX) dgbmv_n.$(PSUFFIX) : gbmv_k.c + $(CC) -c -UCOMPLEX -DDOUBLE -UTRANS $(CFLAGS) -o $(@F) $< + +dgbmv_t.$(SUFFIX) dgbmv_t.$(PSUFFIX) : gbmv_k.c + $(CC) -c -UCOMPLEX -DDOUBLE -DTRANS $(CFLAGS) -o $(@F) $< + +qgbmv_n.$(SUFFIX) qgbmv_n.$(PSUFFIX) : gbmv_k.c + $(CC) -c -UCOMPLEX -DXDOUBLE -UTRANS $(CFLAGS) -o $(@F) $< + +qgbmv_t.$(SUFFIX) qgbmv_t.$(PSUFFIX) : gbmv_k.c + $(CC) -c -UCOMPLEX -DXDOUBLE -DTRANS $(CFLAGS) -o $(@F) $< + +cgbmv_n.$(SUFFIX) cgbmv_n.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -UDOUBLE -UTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +cgbmv_t.$(SUFFIX) cgbmv_t.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -UDOUBLE -DTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +cgbmv_r.$(SUFFIX) cgbmv_r.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -UDOUBLE -UTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +cgbmv_c.$(SUFFIX) cgbmv_c.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -UDOUBLE -DTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +cgbmv_o.$(SUFFIX) cgbmv_o.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -UDOUBLE -UTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +cgbmv_u.$(SUFFIX) cgbmv_u.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -UDOUBLE -DTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +cgbmv_s.$(SUFFIX) cgbmv_s.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -UDOUBLE -UTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +cgbmv_d.$(SUFFIX) cgbmv_d.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -UDOUBLE -DTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_n.$(SUFFIX) zgbmv_n.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DDOUBLE -UTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_t.$(SUFFIX) zgbmv_t.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DDOUBLE -DTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_r.$(SUFFIX) zgbmv_r.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DDOUBLE -UTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_c.$(SUFFIX) zgbmv_c.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DDOUBLE -DTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_o.$(SUFFIX) zgbmv_o.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DDOUBLE -UTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_u.$(SUFFIX) zgbmv_u.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DDOUBLE -DTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_s.$(SUFFIX) zgbmv_s.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DDOUBLE -UTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_d.$(SUFFIX) zgbmv_d.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DDOUBLE -DTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_n.$(SUFFIX) xgbmv_n.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_t.$(SUFFIX) xgbmv_t.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_r.$(SUFFIX) xgbmv_r.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_c.$(SUFFIX) xgbmv_c.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_o.$(SUFFIX) xgbmv_o.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_u.$(SUFFIX) xgbmv_u.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_s.$(SUFFIX) xgbmv_s.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_d.$(SUFFIX) xgbmv_d.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +sgbmv_thread_n.$(SUFFIX) sgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -UCOMPLEX -UDOUBLE -UTRANSA $(CFLAGS) -o $(@F) $< + +sgbmv_thread_t.$(SUFFIX) sgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -UCOMPLEX -UDOUBLE -DTRANSA $(CFLAGS) -o $(@F) $< + +dgbmv_thread_n.$(SUFFIX) dgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -UCOMPLEX -DDOUBLE -UTRANSA $(CFLAGS) -o $(@F) $< + +dgbmv_thread_t.$(SUFFIX) dgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -UCOMPLEX -DDOUBLE -DTRANSA $(CFLAGS) -o $(@F) $< + +qgbmv_thread_n.$(SUFFIX) qgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -UCOMPLEX -DXDOUBLE -UTRANSA $(CFLAGS) -o $(@F) $< + +qgbmv_thread_t.$(SUFFIX) qgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -UCOMPLEX -DXDOUBLE -DTRANSA $(CFLAGS) -o $(@F) $< + +cgbmv_thread_n.$(SUFFIX) cgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +cgbmv_thread_t.$(SUFFIX) cgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -UDOUBLE -DTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +cgbmv_thread_r.$(SUFFIX) cgbmv_thread_r.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -UDOUBLE -UTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +cgbmv_thread_c.$(SUFFIX) cgbmv_thread_c.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -UDOUBLE -DTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +cgbmv_thread_o.$(SUFFIX) cgbmv_thread_o.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -UDOUBLE -UTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +cgbmv_thread_u.$(SUFFIX) cgbmv_thread_u.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -UDOUBLE -DTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +cgbmv_thread_s.$(SUFFIX) cgbmv_thread_s.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -UDOUBLE -UTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +cgbmv_thread_d.$(SUFFIX) cgbmv_thread_d.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -UDOUBLE -DTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_thread_n.$(SUFFIX) zgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DDOUBLE -UTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_thread_t.$(SUFFIX) zgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DDOUBLE -DTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_thread_r.$(SUFFIX) zgbmv_thread_r.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DDOUBLE -UTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_thread_c.$(SUFFIX) zgbmv_thread_c.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DDOUBLE -DTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_thread_o.$(SUFFIX) zgbmv_thread_o.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DDOUBLE -UTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_thread_u.$(SUFFIX) zgbmv_thread_u.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DDOUBLE -DTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_thread_s.$(SUFFIX) zgbmv_thread_s.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DDOUBLE -UTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_thread_d.$(SUFFIX) zgbmv_thread_d.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DDOUBLE -DTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_thread_n.$(SUFFIX) xgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_thread_t.$(SUFFIX) xgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_thread_r.$(SUFFIX) xgbmv_thread_r.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_thread_c.$(SUFFIX) xgbmv_thread_c.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_thread_o.$(SUFFIX) xgbmv_thread_o.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_thread_u.$(SUFFIX) xgbmv_thread_u.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_thread_s.$(SUFFIX) xgbmv_thread_s.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_thread_d.$(SUFFIX) xgbmv_thread_d.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +sgemv_thread_n.$(SUFFIX) sgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) + +sgemv_thread_t.$(SUFFIX) sgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) + +dgemv_thread_n.$(SUFFIX) dgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) + +dgemv_thread_t.$(SUFFIX) dgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) + +qgemv_thread_n.$(SUFFIX) qgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) + +qgemv_thread_t.$(SUFFIX) qgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) + +cgemv_thread_n.$(SUFFIX) cgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) + +cgemv_thread_t.$(SUFFIX) cgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) + +cgemv_thread_r.$(SUFFIX) cgemv_thread_r.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DCONJ -UXCONJ $< -o $(@F) + +cgemv_thread_c.$(SUFFIX) cgemv_thread_c.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DCONJ -UXCONJ $< -o $(@F) + +cgemv_thread_o.$(SUFFIX) cgemv_thread_o.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UCONJ -DXCONJ $< -o $(@F) + +cgemv_thread_u.$(SUFFIX) cgemv_thread_u.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UCONJ -DXCONJ $< -o $(@F) + +cgemv_thread_s.$(SUFFIX) cgemv_thread_s.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DCONJ -DXCONJ $< -o $(@F) + +cgemv_thread_d.$(SUFFIX) cgemv_thread_d.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DCONJ -DXCONJ $< -o $(@F) + +zgemv_thread_n.$(SUFFIX) zgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) + +zgemv_thread_t.$(SUFFIX) zgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) + +zgemv_thread_r.$(SUFFIX) zgemv_thread_r.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DCONJ -UXCONJ $< -o $(@F) + +zgemv_thread_c.$(SUFFIX) zgemv_thread_c.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DCONJ -UXCONJ $< -o $(@F) + +zgemv_thread_o.$(SUFFIX) zgemv_thread_o.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UCONJ -DXCONJ $< -o $(@F) + +zgemv_thread_u.$(SUFFIX) zgemv_thread_u.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UCONJ -DXCONJ $< -o $(@F) + +zgemv_thread_s.$(SUFFIX) zgemv_thread_s.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DCONJ -DXCONJ $< -o $(@F) + +zgemv_thread_d.$(SUFFIX) zgemv_thread_d.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DCONJ -DXCONJ $< -o $(@F) + +xgemv_thread_n.$(SUFFIX) xgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) + +xgemv_thread_t.$(SUFFIX) xgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) + +xgemv_thread_r.$(SUFFIX) xgemv_thread_r.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DCONJ -UXCONJ $< -o $(@F) + +xgemv_thread_c.$(SUFFIX) xgemv_thread_c.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DCONJ -UXCONJ $< -o $(@F) + +xgemv_thread_o.$(SUFFIX) xgemv_thread_o.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -DXCONJ $< -o $(@F) + +xgemv_thread_u.$(SUFFIX) xgemv_thread_u.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UCONJ -DXCONJ $< -o $(@F) + +xgemv_thread_s.$(SUFFIX) xgemv_thread_s.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DCONJ -DXCONJ $< -o $(@F) + +xgemv_thread_d.$(SUFFIX) xgemv_thread_d.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DCONJ -DXCONJ $< -o $(@F) + +sger_thread.$(SUFFIX) sger_thread.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UCONJ -UXCONJ $< -o $(@F) + +dger_thread.$(SUFFIX) dger_thread.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UCONJ -UXCONJ $< -o $(@F) + +qger_thread.$(SUFFIX) qger_thread.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UCONJ -UXCONJ $< -o $(@F) + +cger_thread_U.$(SUFFIX) cger_thread_U.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UCONJ -UXCONJ $< -o $(@F) + +cger_thread_C.$(SUFFIX) cger_thread_C.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DCONJ -UXCONJ $< -o $(@F) + +cger_thread_V.$(SUFFIX) cger_thread_V.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UCONJ -DXCONJ $< -o $(@F) + +cger_thread_D.$(SUFFIX) cger_thread_D.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DCONJ -DXCONJ $< -o $(@F) + +zger_thread_U.$(SUFFIX) zger_thread_U.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UCONJ -UXCONJ $< -o $(@F) + +zger_thread_C.$(SUFFIX) zger_thread_C.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DCONJ -UXCONJ $< -o $(@F) + +zger_thread_V.$(SUFFIX) zger_thread_V.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UCONJ -DXCONJ $< -o $(@F) + +zger_thread_D.$(SUFFIX) zger_thread_D.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DCONJ -DXCONJ $< -o $(@F) + +xger_thread_U.$(SUFFIX) xger_thread_U.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UCONJ -UXCONJ $< -o $(@F) + +xger_thread_C.$(SUFFIX) xger_thread_C.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DCONJ -UXCONJ $< -o $(@F) + +xger_thread_V.$(SUFFIX) xger_thread_V.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UCONJ -DXCONJ $< -o $(@F) + +xger_thread_D.$(SUFFIX) xger_thread_D.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DCONJ -DXCONJ $< -o $(@F) + +ssymv_thread_U.$(SUFFIX) ssymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +ssymv_thread_L.$(SUFFIX) ssymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +dsymv_thread_U.$(SUFFIX) dsymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +dsymv_thread_L.$(SUFFIX) dsymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +qsymv_thread_U.$(SUFFIX) qsymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +qsymv_thread_L.$(SUFFIX) qsymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +csymv_thread_U.$(SUFFIX) csymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +csymv_thread_L.$(SUFFIX) csymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +zsymv_thread_U.$(SUFFIX) zsymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zsymv_thread_L.$(SUFFIX) zsymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +xsymv_thread_U.$(SUFFIX) xsymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xsymv_thread_L.$(SUFFIX) xsymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +chemv_thread_U.$(SUFFIX) chemv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $(@F) + +chemv_thread_L.$(SUFFIX) chemv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMV $< -o $(@F) + +chemv_thread_V.$(SUFFIX) chemv_thread_V.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +chemv_thread_M.$(SUFFIX) chemv_thread_M.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +zhemv_thread_U.$(SUFFIX) zhemv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMV $< -o $(@F) + +zhemv_thread_L.$(SUFFIX) zhemv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMV $< -o $(@F) + +zhemv_thread_V.$(SUFFIX) zhemv_thread_V.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +zhemv_thread_M.$(SUFFIX) zhemv_thread_M.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +xhemv_thread_U.$(SUFFIX) xhemv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMV $< -o $(@F) + +xhemv_thread_L.$(SUFFIX) xhemv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV $< -o $(@F) + +xhemv_thread_V.$(SUFFIX) xhemv_thread_V.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +xhemv_thread_M.$(SUFFIX) xhemv_thread_M.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +ssyr_thread_U.$(SUFFIX) ssyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +ssyr_thread_L.$(SUFFIX) ssyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +dsyr_thread_U.$(SUFFIX) dsyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +dsyr_thread_L.$(SUFFIX) dsyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +qsyr_thread_U.$(SUFFIX) qsyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +qsyr_thread_L.$(SUFFIX) qsyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +csyr_thread_U.$(SUFFIX) csyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +csyr_thread_L.$(SUFFIX) csyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +zsyr_thread_U.$(SUFFIX) zsyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zsyr_thread_L.$(SUFFIX) zsyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +xsyr_thread_U.$(SUFFIX) xsyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xsyr_thread_L.$(SUFFIX) xsyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +cher_thread_U.$(SUFFIX) cher_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHER $< -o $(@F) + +cher_thread_L.$(SUFFIX) cher_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHER $< -o $(@F) + +cher_thread_V.$(SUFFIX) cher_thread_V.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHERREV $< -o $(@F) + +cher_thread_M.$(SUFFIX) cher_thread_M.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHERREV $< -o $(@F) + +zher_thread_U.$(SUFFIX) zher_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHER $< -o $(@F) + +zher_thread_L.$(SUFFIX) zher_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHER $< -o $(@F) + +zher_thread_V.$(SUFFIX) zher_thread_V.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHERREV $< -o $(@F) + +zher_thread_M.$(SUFFIX) zher_thread_M.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHERREV $< -o $(@F) + +xher_thread_U.$(SUFFIX) xher_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHER $< -o $(@F) + +xher_thread_L.$(SUFFIX) xher_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHER $< -o $(@F) + +xher_thread_V.$(SUFFIX) xher_thread_V.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHERREV $< -o $(@F) + +xher_thread_M.$(SUFFIX) xher_thread_M.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHERREV $< -o $(@F) + +ssyr2_thread_U.$(SUFFIX) ssyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +ssyr2_thread_L.$(SUFFIX) ssyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +dsyr2_thread_U.$(SUFFIX) dsyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +dsyr2_thread_L.$(SUFFIX) dsyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +qsyr2_thread_U.$(SUFFIX) qsyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +qsyr2_thread_L.$(SUFFIX) qsyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +csyr2_thread_U.$(SUFFIX) csyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +csyr2_thread_L.$(SUFFIX) csyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +zsyr2_thread_U.$(SUFFIX) zsyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zsyr2_thread_L.$(SUFFIX) zsyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +xsyr2_thread_U.$(SUFFIX) xsyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xsyr2_thread_L.$(SUFFIX) xsyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +cher2_thread_U.$(SUFFIX) cher2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHER $< -o $(@F) + +cher2_thread_L.$(SUFFIX) cher2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHER $< -o $(@F) + +cher2_thread_V.$(SUFFIX) cher2_thread_V.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHERREV $< -o $(@F) + +cher2_thread_M.$(SUFFIX) cher2_thread_M.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHERREV $< -o $(@F) + +zher2_thread_U.$(SUFFIX) zher2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHER $< -o $(@F) + +zher2_thread_L.$(SUFFIX) zher2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHER $< -o $(@F) + +zher2_thread_V.$(SUFFIX) zher2_thread_V.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHERREV $< -o $(@F) + +zher2_thread_M.$(SUFFIX) zher2_thread_M.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHERREV $< -o $(@F) + +xher2_thread_U.$(SUFFIX) xher2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHER $< -o $(@F) + +xher2_thread_L.$(SUFFIX) xher2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHER $< -o $(@F) + +xher2_thread_V.$(SUFFIX) xher2_thread_V.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHERREV $< -o $(@F) + +xher2_thread_M.$(SUFFIX) xher2_thread_M.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHERREV $< -o $(@F) + +chbmv_U.$(SUFFIX) chbmv_U.$(PSUFFIX) : zhbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +chbmv_L.$(SUFFIX) chbmv_L.$(PSUFFIX) : zhbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +chbmv_V.$(SUFFIX) chbmv_V.$(PSUFFIX) : zhbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +chbmv_M.$(SUFFIX) chbmv_M.$(PSUFFIX) : zhbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +zhbmv_U.$(SUFFIX) zhbmv_U.$(PSUFFIX) : zhbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zhbmv_L.$(SUFFIX) zhbmv_L.$(PSUFFIX) : zhbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +zhbmv_V.$(SUFFIX) zhbmv_V.$(PSUFFIX) : zhbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +zhbmv_M.$(SUFFIX) zhbmv_M.$(PSUFFIX) : zhbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +xhbmv_U.$(SUFFIX) xhbmv_U.$(PSUFFIX) : zhbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xhbmv_L.$(SUFFIX) xhbmv_L.$(PSUFFIX) : zhbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +xhbmv_V.$(SUFFIX) xhbmv_V.$(PSUFFIX) : zhbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +xhbmv_M.$(SUFFIX) xhbmv_M.$(PSUFFIX) : zhbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +chbmv_thread_U.$(SUFFIX) chbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $(@F) + +chbmv_thread_L.$(SUFFIX) chbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMV $< -o $(@F) + +chbmv_thread_V.$(SUFFIX) chbmv_thread_V.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +chbmv_thread_M.$(SUFFIX) chbmv_thread_M.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +zhbmv_thread_U.$(SUFFIX) zhbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMV $< -o $(@F) + +zhbmv_thread_L.$(SUFFIX) zhbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMV $< -o $(@F) + +zhbmv_thread_V.$(SUFFIX) zhbmv_thread_V.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +zhbmv_thread_M.$(SUFFIX) zhbmv_thread_M.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +xhbmv_thread_U.$(SUFFIX) xhbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMV $< -o $(@F) + +xhbmv_thread_L.$(SUFFIX) xhbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV $< -o $(@F) + +xhbmv_thread_V.$(SUFFIX) xhbmv_thread_V.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +xhbmv_thread_M.$(SUFFIX) xhbmv_thread_M.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +cher_U.$(SUFFIX) cher_U.$(PSUFFIX) : zher_k.c ../../common.h + $(CC) -c $(CFLAGS) -UDOUBLE -ULOWER $< -o $(@F) + +cher_L.$(SUFFIX) cher_L.$(PSUFFIX) : zher_k.c ../../common.h + $(CC) -c $(CFLAGS) -UDOUBLE -DLOWER $< -o $(@F) + +cher_V.$(SUFFIX) cher_V.$(PSUFFIX) : zher_k.c ../../common.h + $(CC) -c $(CFLAGS) -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +cher_M.$(SUFFIX) cher_M.$(PSUFFIX) : zher_k.c ../../common.h + $(CC) -c $(CFLAGS) -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +zher_U.$(SUFFIX) zher_U.$(PSUFFIX) : zher_k.c ../../common.h + $(CC) -c $(CFLAGS) -DDOUBLE -ULOWER $< -o $(@F) + +zher_L.$(SUFFIX) zher_L.$(PSUFFIX) : zher_k.c ../../common.h + $(CC) -c $(CFLAGS) -DDOUBLE -DLOWER $< -o $(@F) + +zher_V.$(SUFFIX) zher_V.$(PSUFFIX) : zher_k.c ../../common.h + $(CC) -c $(CFLAGS) -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +zher_M.$(SUFFIX) zher_M.$(PSUFFIX) : zher_k.c ../../common.h + $(CC) -c $(CFLAGS) -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +xher_U.$(SUFFIX) xher_U.$(PSUFFIX) : zher_k.c ../../common.h + $(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER $< -o $(@F) + +xher_L.$(SUFFIX) xher_L.$(PSUFFIX) : zher_k.c ../../common.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER $< -o $(@F) + +xher_V.$(SUFFIX) xher_V.$(PSUFFIX) : zher_k.c ../../common.h + $(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +xher_M.$(SUFFIX) xher_M.$(PSUFFIX) : zher_k.c ../../common.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +cher2_U.$(SUFFIX) cher2_U.$(PSUFFIX) : zher2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +cher2_L.$(SUFFIX) cher2_L.$(PSUFFIX) : zher2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +cher2_V.$(SUFFIX) cher2_V.$(PSUFFIX) : zher2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) + +cher2_M.$(SUFFIX) cher2_M.$(PSUFFIX) : zher2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F) + +zher2_U.$(SUFFIX) zher2_U.$(PSUFFIX) : zher2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +zher2_L.$(SUFFIX) zher2_L.$(PSUFFIX) : zher2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +zher2_V.$(SUFFIX) zher2_V.$(PSUFFIX) : zher2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) + +zher2_M.$(SUFFIX) zher2_M.$(PSUFFIX) : zher2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F) + +xher2_U.$(SUFFIX) xher2_U.$(PSUFFIX) : zher2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +xher2_L.$(SUFFIX) xher2_L.$(PSUFFIX) : zher2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +xher2_V.$(SUFFIX) xher2_V.$(PSUFFIX) : zher2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER $< -DHEMVREV -o $(@F) + +xher2_M.$(SUFFIX) xher2_M.$(PSUFFIX) : zher2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -DHEMVREV -o $(@F) + +chpmv_U.$(SUFFIX) chpmv_U.$(PSUFFIX) : zhpmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +chpmv_L.$(SUFFIX) chpmv_L.$(PSUFFIX) : zhpmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +chpmv_V.$(SUFFIX) chpmv_V.$(PSUFFIX) : zhpmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +chpmv_M.$(SUFFIX) chpmv_M.$(PSUFFIX) : zhpmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +zhpmv_U.$(SUFFIX) zhpmv_U.$(PSUFFIX) : zhpmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zhpmv_L.$(SUFFIX) zhpmv_L.$(PSUFFIX) : zhpmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +zhpmv_V.$(SUFFIX) zhpmv_V.$(PSUFFIX) : zhpmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +zhpmv_M.$(SUFFIX) zhpmv_M.$(PSUFFIX) : zhpmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +xhpmv_U.$(SUFFIX) xhpmv_U.$(PSUFFIX) : zhpmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xhpmv_L.$(SUFFIX) xhpmv_L.$(PSUFFIX) : zhpmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +xhpmv_V.$(SUFFIX) xhpmv_V.$(PSUFFIX) : zhpmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +xhpmv_M.$(SUFFIX) xhpmv_M.$(PSUFFIX) : zhpmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +chpmv_thread_U.$(SUFFIX) chpmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $(@F) + +chpmv_thread_L.$(SUFFIX) chpmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMV $< -o $(@F) + +chpmv_thread_V.$(SUFFIX) chpmv_thread_V.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +chpmv_thread_M.$(SUFFIX) chpmv_thread_M.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +zhpmv_thread_U.$(SUFFIX) zhpmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMV $< -o $(@F) + +zhpmv_thread_L.$(SUFFIX) zhpmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMV $< -o $(@F) + +zhpmv_thread_V.$(SUFFIX) zhpmv_thread_V.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +zhpmv_thread_M.$(SUFFIX) zhpmv_thread_M.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +xhpmv_thread_U.$(SUFFIX) xhpmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMV $< -o $(@F) + +xhpmv_thread_L.$(SUFFIX) xhpmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV $< -o $(@F) + +xhpmv_thread_V.$(SUFFIX) xhpmv_thread_V.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +xhpmv_thread_M.$(SUFFIX) xhpmv_thread_M.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +chpr_U.$(SUFFIX) chpr_U.$(PSUFFIX) : zhpr_k.c ../../common.h + $(CC) -c $(CFLAGS) -UDOUBLE -ULOWER $< -o $(@F) + +chpr_L.$(SUFFIX) chpr_L.$(PSUFFIX) : zhpr_k.c ../../common.h + $(CC) -c $(CFLAGS) -UDOUBLE -DLOWER $< -o $(@F) + +chpr_V.$(SUFFIX) chpr_V.$(PSUFFIX) : zhpr_k.c ../../common.h + $(CC) -c $(CFLAGS) -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +chpr_M.$(SUFFIX) chpr_M.$(PSUFFIX) : zhpr_k.c ../../common.h + $(CC) -c $(CFLAGS) -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +zhpr_U.$(SUFFIX) zhpr_U.$(PSUFFIX) : zhpr_k.c ../../common.h + $(CC) -c $(CFLAGS) -DDOUBLE -ULOWER $< -o $(@F) + +zhpr_L.$(SUFFIX) zhpr_L.$(PSUFFIX) : zhpr_k.c ../../common.h + $(CC) -c $(CFLAGS) -DDOUBLE -DLOWER $< -o $(@F) + +zhpr_V.$(SUFFIX) zhpr_V.$(PSUFFIX) : zhpr_k.c ../../common.h + $(CC) -c $(CFLAGS) -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +zhpr_M.$(SUFFIX) zhpr_M.$(PSUFFIX) : zhpr_k.c ../../common.h + $(CC) -c $(CFLAGS) -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +xhpr_U.$(SUFFIX) xhpr_U.$(PSUFFIX) : zhpr_k.c ../../common.h + $(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER $< -o $(@F) + +xhpr_L.$(SUFFIX) xhpr_L.$(PSUFFIX) : zhpr_k.c ../../common.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER $< -o $(@F) + +xhpr_V.$(SUFFIX) xhpr_V.$(PSUFFIX) : zhpr_k.c ../../common.h + $(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +xhpr_M.$(SUFFIX) xhpr_M.$(PSUFFIX) : zhpr_k.c ../../common.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +chpr_thread_U.$(SUFFIX) chpr_thread_U.$(PSUFFIX) : spr_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UDOUBLE -ULOWER -DHEMV $< -o $(@F) + +chpr_thread_L.$(SUFFIX) chpr_thread_L.$(PSUFFIX) : spr_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UDOUBLE -DLOWER -DHEMV $< -o $(@F) + +chpr_thread_V.$(SUFFIX) chpr_thread_V.$(PSUFFIX) : spr_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +chpr_thread_M.$(SUFFIX) chpr_thread_M.$(PSUFFIX) : spr_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +zhpr_thread_U.$(SUFFIX) zhpr_thread_U.$(PSUFFIX) : spr_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DDOUBLE -ULOWER -DHEMV $< -o $(@F) + +zhpr_thread_L.$(SUFFIX) zhpr_thread_L.$(PSUFFIX) : spr_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DDOUBLE -DLOWER -DHEMV $< -o $(@F) + +zhpr_thread_V.$(SUFFIX) zhpr_thread_V.$(PSUFFIX) : spr_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +zhpr_thread_M.$(SUFFIX) zhpr_thread_M.$(PSUFFIX) : spr_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +xhpr_thread_U.$(SUFFIX) xhpr_thread_U.$(PSUFFIX) : spr_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER -DHEMV $< -o $(@F) + +xhpr_thread_L.$(SUFFIX) xhpr_thread_L.$(PSUFFIX) : spr_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER -DHEMV $< -o $(@F) + +xhpr_thread_V.$(SUFFIX) xhpr_thread_V.$(PSUFFIX) : spr_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +xhpr_thread_M.$(SUFFIX) xhpr_thread_M.$(PSUFFIX) : spr_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +chpr2_U.$(SUFFIX) chpr2_U.$(PSUFFIX) : zhpr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +chpr2_L.$(SUFFIX) chpr2_L.$(PSUFFIX) : zhpr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +chpr2_V.$(SUFFIX) chpr2_V.$(PSUFFIX) : zhpr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) + +chpr2_M.$(SUFFIX) chpr2_M.$(PSUFFIX) : zhpr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F) + +zhpr2_U.$(SUFFIX) zhpr2_U.$(PSUFFIX) : zhpr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +zhpr2_L.$(SUFFIX) zhpr2_L.$(PSUFFIX) : zhpr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +zhpr2_V.$(SUFFIX) zhpr2_V.$(PSUFFIX) : zhpr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) + +zhpr2_M.$(SUFFIX) zhpr2_M.$(PSUFFIX) : zhpr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F) + +xhpr2_U.$(SUFFIX) xhpr2_U.$(PSUFFIX) : zhpr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +xhpr2_L.$(SUFFIX) xhpr2_L.$(PSUFFIX) : zhpr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +xhpr2_V.$(SUFFIX) xhpr2_V.$(PSUFFIX) : zhpr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) + +xhpr2_M.$(SUFFIX) xhpr2_M.$(PSUFFIX) : zhpr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -DHEMVREV -o $(@F) + +chpr2_thread_U.$(SUFFIX) chpr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DHEMV $< -o $(@F) + +chpr2_thread_L.$(SUFFIX) chpr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DHEMV $< -o $(@F) + +chpr2_thread_V.$(SUFFIX) chpr2_thread_V.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) + +chpr2_thread_M.$(SUFFIX) chpr2_thread_M.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F) + +zhpr2_thread_U.$(SUFFIX) zhpr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DHEMV $< -o $(@F) + +zhpr2_thread_L.$(SUFFIX) zhpr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DHEMV $< -o $(@F) + +zhpr2_thread_V.$(SUFFIX) zhpr2_thread_V.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) + +zhpr2_thread_M.$(SUFFIX) zhpr2_thread_M.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F) + +xhpr2_thread_U.$(SUFFIX) xhpr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DHEMV $< -o $(@F) + +xhpr2_thread_L.$(SUFFIX) xhpr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DHEMV $< -o $(@F) + +xhpr2_thread_V.$(SUFFIX) xhpr2_thread_V.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) + +xhpr2_thread_M.$(SUFFIX) xhpr2_thread_M.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -DHEMVREV -o $(@F) + +ssbmv_U.$(SUFFIX) ssbmv_U.$(PSUFFIX) : sbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +ssbmv_L.$(SUFFIX) ssbmv_L.$(PSUFFIX) : sbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +dsbmv_U.$(SUFFIX) dsbmv_U.$(PSUFFIX) : sbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +dsbmv_L.$(SUFFIX) dsbmv_L.$(PSUFFIX) : sbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +qsbmv_U.$(SUFFIX) qsbmv_U.$(PSUFFIX) : sbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +qsbmv_L.$(SUFFIX) qsbmv_L.$(PSUFFIX) : sbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +csbmv_U.$(SUFFIX) csbmv_U.$(PSUFFIX) : zsbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +csbmv_L.$(SUFFIX) csbmv_L.$(PSUFFIX) : zsbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +zsbmv_U.$(SUFFIX) zsbmv_U.$(PSUFFIX) : zsbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zsbmv_L.$(SUFFIX) zsbmv_L.$(PSUFFIX) : zsbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +xsbmv_U.$(SUFFIX) xsbmv_U.$(PSUFFIX) : zsbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xsbmv_L.$(SUFFIX) xsbmv_L.$(PSUFFIX) : zsbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +ssbmv_thread_U.$(SUFFIX) ssbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +ssbmv_thread_L.$(SUFFIX) ssbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +dsbmv_thread_U.$(SUFFIX) dsbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +dsbmv_thread_L.$(SUFFIX) dsbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +qsbmv_thread_U.$(SUFFIX) qsbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +qsbmv_thread_L.$(SUFFIX) qsbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +csbmv_thread_U.$(SUFFIX) csbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +csbmv_thread_L.$(SUFFIX) csbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +zsbmv_thread_U.$(SUFFIX) zsbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zsbmv_thread_L.$(SUFFIX) zsbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +xsbmv_thread_U.$(SUFFIX) xsbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xsbmv_thread_L.$(SUFFIX) xsbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +sspmv_U.$(SUFFIX) sspmv_U.$(PSUFFIX) : spmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +sspmv_L.$(SUFFIX) sspmv_L.$(PSUFFIX) : spmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +dspmv_U.$(SUFFIX) dspmv_U.$(PSUFFIX) : spmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +dspmv_L.$(SUFFIX) dspmv_L.$(PSUFFIX) : spmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +qspmv_U.$(SUFFIX) qspmv_U.$(PSUFFIX) : spmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +qspmv_L.$(SUFFIX) qspmv_L.$(PSUFFIX) : spmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +cspmv_U.$(SUFFIX) cspmv_U.$(PSUFFIX) : zspmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +cspmv_L.$(SUFFIX) cspmv_L.$(PSUFFIX) : zspmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +zspmv_U.$(SUFFIX) zspmv_U.$(PSUFFIX) : zspmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zspmv_L.$(SUFFIX) zspmv_L.$(PSUFFIX) : zspmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +xspmv_U.$(SUFFIX) xspmv_U.$(PSUFFIX) : zspmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xspmv_L.$(SUFFIX) xspmv_L.$(PSUFFIX) : zspmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +sspmv_thread_U.$(SUFFIX) sspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +sspmv_thread_L.$(SUFFIX) sspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +dspmv_thread_U.$(SUFFIX) dspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +dspmv_thread_L.$(SUFFIX) dspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +qspmv_thread_U.$(SUFFIX) qspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +qspmv_thread_L.$(SUFFIX) qspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +cspmv_thread_U.$(SUFFIX) cspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +cspmv_thread_L.$(SUFFIX) cspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +zspmv_thread_U.$(SUFFIX) zspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zspmv_thread_L.$(SUFFIX) zspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +xspmv_thread_U.$(SUFFIX) xspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xspmv_thread_L.$(SUFFIX) xspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +sspr_U.$(SUFFIX) sspr_U.$(PSUFFIX) : spr_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +sspr_L.$(SUFFIX) sspr_L.$(PSUFFIX) : spr_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +dspr_U.$(SUFFIX) dspr_U.$(PSUFFIX) : spr_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +dspr_L.$(SUFFIX) dspr_L.$(PSUFFIX) : spr_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +qspr_U.$(SUFFIX) qspr_U.$(PSUFFIX) : spr_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +qspr_L.$(SUFFIX) qspr_L.$(PSUFFIX) : spr_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +cspr_U.$(SUFFIX) cspr_U.$(PSUFFIX) : zspr_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +cspr_L.$(SUFFIX) cspr_L.$(PSUFFIX) : zspr_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +zspr_U.$(SUFFIX) zspr_U.$(PSUFFIX) : zspr_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zspr_L.$(SUFFIX) zspr_L.$(PSUFFIX) : zspr_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +xspr_U.$(SUFFIX) xspr_U.$(PSUFFIX) : zspr_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xspr_L.$(SUFFIX) xspr_L.$(PSUFFIX) : zspr_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +sspr_thread_U.$(SUFFIX) sspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +sspr_thread_L.$(SUFFIX) sspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +dspr_thread_U.$(SUFFIX) dspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +dspr_thread_L.$(SUFFIX) dspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +qspr_thread_U.$(SUFFIX) qspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +qspr_thread_L.$(SUFFIX) qspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +cspr_thread_U.$(SUFFIX) cspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +cspr_thread_L.$(SUFFIX) cspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +zspr_thread_U.$(SUFFIX) zspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zspr_thread_L.$(SUFFIX) zspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +xspr_thread_U.$(SUFFIX) xspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xspr_thread_L.$(SUFFIX) xspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +sspr2_U.$(SUFFIX) sspr2_U.$(PSUFFIX) : spr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +sspr2_L.$(SUFFIX) sspr2_L.$(PSUFFIX) : spr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +dspr2_U.$(SUFFIX) dspr2_U.$(PSUFFIX) : spr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +dspr2_L.$(SUFFIX) dspr2_L.$(PSUFFIX) : spr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +qspr2_U.$(SUFFIX) qspr2_U.$(PSUFFIX) : spr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +qspr2_L.$(SUFFIX) qspr2_L.$(PSUFFIX) : spr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +cspr2_U.$(SUFFIX) cspr2_U.$(PSUFFIX) : zspr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +cspr2_L.$(SUFFIX) cspr2_L.$(PSUFFIX) : zspr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +zspr2_U.$(SUFFIX) zspr2_U.$(PSUFFIX) : zspr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zspr2_L.$(SUFFIX) zspr2_L.$(PSUFFIX) : zspr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +xspr2_U.$(SUFFIX) xspr2_U.$(PSUFFIX) : zspr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xspr2_L.$(SUFFIX) xspr2_L.$(PSUFFIX) : zspr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +sspr2_thread_U.$(SUFFIX) sspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +sspr2_thread_L.$(SUFFIX) sspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +dspr2_thread_U.$(SUFFIX) dspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +dspr2_thread_L.$(SUFFIX) dspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +qspr2_thread_U.$(SUFFIX) qspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +qspr2_thread_L.$(SUFFIX) qspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +cspr2_thread_U.$(SUFFIX) cspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +cspr2_thread_L.$(SUFFIX) cspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +zspr2_thread_U.$(SUFFIX) zspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zspr2_thread_L.$(SUFFIX) zspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +xspr2_thread_U.$(SUFFIX) xspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xspr2_thread_L.$(SUFFIX) xspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +ssyr_U.$(SUFFIX) ssyr_U.$(PSUFFIX) : syr_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +ssyr_L.$(SUFFIX) ssyr_L.$(PSUFFIX) : syr_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +dsyr_U.$(SUFFIX) dsyr_U.$(PSUFFIX) : syr_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +dsyr_L.$(SUFFIX) dsyr_L.$(PSUFFIX) : syr_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +qsyr_U.$(SUFFIX) qsyr_U.$(PSUFFIX) : syr_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +qsyr_L.$(SUFFIX) qsyr_L.$(PSUFFIX) : syr_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +csyr_U.$(SUFFIX) csyr_U.$(PSUFFIX) : zsyr_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +csyr_L.$(SUFFIX) csyr_L.$(PSUFFIX) : zsyr_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +zsyr_U.$(SUFFIX) zsyr_U.$(PSUFFIX) : zsyr_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zsyr_L.$(SUFFIX) zsyr_L.$(PSUFFIX) : zsyr_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +xsyr_U.$(SUFFIX) xsyr_U.$(PSUFFIX) : zsyr_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xsyr_L.$(SUFFIX) xsyr_L.$(PSUFFIX) : zsyr_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +ssyr2_U.$(SUFFIX) ssyr2_U.$(PSUFFIX) : syr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +ssyr2_L.$(SUFFIX) ssyr2_L.$(PSUFFIX) : syr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +dsyr2_U.$(SUFFIX) dsyr2_U.$(PSUFFIX) : syr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +dsyr2_L.$(SUFFIX) dsyr2_L.$(PSUFFIX) : syr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +qsyr2_U.$(SUFFIX) qsyr2_U.$(PSUFFIX) : syr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +qsyr2_L.$(SUFFIX) qsyr2_L.$(PSUFFIX) : syr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +csyr2_U.$(SUFFIX) csyr2_U.$(PSUFFIX) : zsyr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +csyr2_L.$(SUFFIX) csyr2_L.$(PSUFFIX) : zsyr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +zsyr2_U.$(SUFFIX) zsyr2_U.$(PSUFFIX) : zsyr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zsyr2_L.$(SUFFIX) zsyr2_L.$(PSUFFIX) : zsyr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +xsyr2_U.$(SUFFIX) xsyr2_U.$(PSUFFIX) : zsyr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xsyr2_L.$(SUFFIX) xsyr2_L.$(PSUFFIX) : zsyr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +stbmv_NUU.$(SUFFIX) stbmv_NUU.$(PSUFFIX) : tbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +stbmv_NUN.$(SUFFIX) stbmv_NUN.$(PSUFFIX) : tbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +stbmv_TLU.$(SUFFIX) stbmv_TLU.$(PSUFFIX) : tbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +stbmv_TLN.$(SUFFIX) stbmv_TLN.$(PSUFFIX) : tbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +stbmv_NLU.$(SUFFIX) stbmv_NLU.$(PSUFFIX) : tbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +stbmv_NLN.$(SUFFIX) stbmv_NLN.$(PSUFFIX) : tbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +stbmv_TUU.$(SUFFIX) stbmv_TUU.$(PSUFFIX) : tbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +stbmv_TUN.$(SUFFIX) stbmv_TUN.$(PSUFFIX) : tbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +dtbmv_NUU.$(SUFFIX) dtbmv_NUU.$(PSUFFIX) : tbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +dtbmv_NUN.$(SUFFIX) dtbmv_NUN.$(PSUFFIX) : tbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +dtbmv_TLU.$(SUFFIX) dtbmv_TLU.$(PSUFFIX) : tbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +dtbmv_TLN.$(SUFFIX) dtbmv_TLN.$(PSUFFIX) : tbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +dtbmv_NLU.$(SUFFIX) dtbmv_NLU.$(PSUFFIX) : tbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +dtbmv_NLN.$(SUFFIX) dtbmv_NLN.$(PSUFFIX) : tbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +dtbmv_TUU.$(SUFFIX) dtbmv_TUU.$(PSUFFIX) : tbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +dtbmv_TUN.$(SUFFIX) dtbmv_TUN.$(PSUFFIX) : tbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +qtbmv_NUU.$(SUFFIX) qtbmv_NUU.$(PSUFFIX) : tbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +qtbmv_NUN.$(SUFFIX) qtbmv_NUN.$(PSUFFIX) : tbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +qtbmv_TLU.$(SUFFIX) qtbmv_TLU.$(PSUFFIX) : tbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +qtbmv_TLN.$(SUFFIX) qtbmv_TLN.$(PSUFFIX) : tbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +qtbmv_NLU.$(SUFFIX) qtbmv_NLU.$(PSUFFIX) : tbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +qtbmv_NLN.$(SUFFIX) qtbmv_NLN.$(PSUFFIX) : tbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +qtbmv_TUU.$(SUFFIX) qtbmv_TUU.$(PSUFFIX) : tbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +qtbmv_TUN.$(SUFFIX) qtbmv_TUN.$(PSUFFIX) : tbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +ctbmv_NUU.$(SUFFIX) ctbmv_NUU.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ctbmv_NUN.$(SUFFIX) ctbmv_NUN.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ctbmv_TLU.$(SUFFIX) ctbmv_TLU.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ctbmv_TLN.$(SUFFIX) ctbmv_TLN.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ctbmv_RLU.$(SUFFIX) ctbmv_RLU.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ctbmv_RLN.$(SUFFIX) ctbmv_RLN.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ctbmv_CLU.$(SUFFIX) ctbmv_CLU.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ctbmv_CLN.$(SUFFIX) ctbmv_CLN.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +ctbmv_NLU.$(SUFFIX) ctbmv_NLU.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ctbmv_NLN.$(SUFFIX) ctbmv_NLN.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ctbmv_TUU.$(SUFFIX) ctbmv_TUU.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ctbmv_TUN.$(SUFFIX) ctbmv_TUN.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ctbmv_RUU.$(SUFFIX) ctbmv_RUU.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ctbmv_RUN.$(SUFFIX) ctbmv_RUN.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ctbmv_CUU.$(SUFFIX) ctbmv_CUU.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ctbmv_CUN.$(SUFFIX) ctbmv_CUN.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +ztbmv_NUU.$(SUFFIX) ztbmv_NUU.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ztbmv_NUN.$(SUFFIX) ztbmv_NUN.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ztbmv_TLU.$(SUFFIX) ztbmv_TLU.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ztbmv_TLN.$(SUFFIX) ztbmv_TLN.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ztbmv_RLU.$(SUFFIX) ztbmv_RLU.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ztbmv_RLN.$(SUFFIX) ztbmv_RLN.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ztbmv_CLU.$(SUFFIX) ztbmv_CLU.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ztbmv_CLN.$(SUFFIX) ztbmv_CLN.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +ztbmv_NLU.$(SUFFIX) ztbmv_NLU.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ztbmv_NLN.$(SUFFIX) ztbmv_NLN.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ztbmv_TUU.$(SUFFIX) ztbmv_TUU.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ztbmv_TUN.$(SUFFIX) ztbmv_TUN.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ztbmv_RUU.$(SUFFIX) ztbmv_RUU.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ztbmv_RUN.$(SUFFIX) ztbmv_RUN.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ztbmv_CUU.$(SUFFIX) ztbmv_CUU.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ztbmv_CUN.$(SUFFIX) ztbmv_CUN.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +xtbmv_NUU.$(SUFFIX) xtbmv_NUU.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +xtbmv_NUN.$(SUFFIX) xtbmv_NUN.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +xtbmv_TLU.$(SUFFIX) xtbmv_TLU.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +xtbmv_TLN.$(SUFFIX) xtbmv_TLN.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +xtbmv_RLU.$(SUFFIX) xtbmv_RLU.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +xtbmv_RLN.$(SUFFIX) xtbmv_RLN.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +xtbmv_CLU.$(SUFFIX) xtbmv_CLU.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +xtbmv_CLN.$(SUFFIX) xtbmv_CLN.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +xtbmv_NLU.$(SUFFIX) xtbmv_NLU.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +xtbmv_NLN.$(SUFFIX) xtbmv_NLN.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +xtbmv_TUU.$(SUFFIX) xtbmv_TUU.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +xtbmv_TUN.$(SUFFIX) xtbmv_TUN.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +xtbmv_RUU.$(SUFFIX) xtbmv_RUU.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +xtbmv_RUN.$(SUFFIX) xtbmv_RUN.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +xtbmv_CUU.$(SUFFIX) xtbmv_CUU.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +xtbmv_CUN.$(SUFFIX) xtbmv_CUN.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +stbmv_thread_NUU.$(SUFFIX) stbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) + +stbmv_thread_NUN.$(SUFFIX) stbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) + +stbmv_thread_TLU.$(SUFFIX) stbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) + +stbmv_thread_TLN.$(SUFFIX) stbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) + +stbmv_thread_NLU.$(SUFFIX) stbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) + +stbmv_thread_NLN.$(SUFFIX) stbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) + +stbmv_thread_TUU.$(SUFFIX) stbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) + +stbmv_thread_TUN.$(SUFFIX) stbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) + +dtbmv_thread_NUU.$(SUFFIX) dtbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) + +dtbmv_thread_NUN.$(SUFFIX) dtbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) + +dtbmv_thread_TLU.$(SUFFIX) dtbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) + +dtbmv_thread_TLN.$(SUFFIX) dtbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) + +dtbmv_thread_NLU.$(SUFFIX) dtbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) + +dtbmv_thread_NLN.$(SUFFIX) dtbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) + +dtbmv_thread_TUU.$(SUFFIX) dtbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) + +dtbmv_thread_TUN.$(SUFFIX) dtbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) + +qtbmv_thread_NUU.$(SUFFIX) qtbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) + +qtbmv_thread_NUN.$(SUFFIX) qtbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) + +qtbmv_thread_TLU.$(SUFFIX) qtbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) + +qtbmv_thread_TLN.$(SUFFIX) qtbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) + +qtbmv_thread_NLU.$(SUFFIX) qtbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) + +qtbmv_thread_NLN.$(SUFFIX) qtbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) + +qtbmv_thread_TUU.$(SUFFIX) qtbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) + +qtbmv_thread_TUN.$(SUFFIX) qtbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) + +ctbmv_thread_NUU.$(SUFFIX) ctbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +ctbmv_thread_NUN.$(SUFFIX) ctbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +ctbmv_thread_TLU.$(SUFFIX) ctbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +ctbmv_thread_TLN.$(SUFFIX) ctbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +ctbmv_thread_RLU.$(SUFFIX) ctbmv_thread_RLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +ctbmv_thread_RLN.$(SUFFIX) ctbmv_thread_RLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +ctbmv_thread_CLU.$(SUFFIX) ctbmv_thread_CLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +ctbmv_thread_CLN.$(SUFFIX) ctbmv_thread_CLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +ctbmv_thread_NLU.$(SUFFIX) ctbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +ctbmv_thread_NLN.$(SUFFIX) ctbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +ctbmv_thread_TUU.$(SUFFIX) ctbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +ctbmv_thread_TUN.$(SUFFIX) ctbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +ctbmv_thread_RUU.$(SUFFIX) ctbmv_thread_RUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +ctbmv_thread_RUN.$(SUFFIX) ctbmv_thread_RUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +ctbmv_thread_CUU.$(SUFFIX) ctbmv_thread_CUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +ctbmv_thread_CUN.$(SUFFIX) ctbmv_thread_CUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +ztbmv_thread_NUU.$(SUFFIX) ztbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +ztbmv_thread_NUN.$(SUFFIX) ztbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +ztbmv_thread_TLU.$(SUFFIX) ztbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +ztbmv_thread_TLN.$(SUFFIX) ztbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +ztbmv_thread_RLU.$(SUFFIX) ztbmv_thread_RLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +ztbmv_thread_RLN.$(SUFFIX) ztbmv_thread_RLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +ztbmv_thread_CLU.$(SUFFIX) ztbmv_thread_CLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +ztbmv_thread_CLN.$(SUFFIX) ztbmv_thread_CLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +ztbmv_thread_NLU.$(SUFFIX) ztbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +ztbmv_thread_NLN.$(SUFFIX) ztbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +ztbmv_thread_TUU.$(SUFFIX) ztbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +ztbmv_thread_TUN.$(SUFFIX) ztbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +ztbmv_thread_RUU.$(SUFFIX) ztbmv_thread_RUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +ztbmv_thread_RUN.$(SUFFIX) ztbmv_thread_RUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +ztbmv_thread_CUU.$(SUFFIX) ztbmv_thread_CUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +ztbmv_thread_CUN.$(SUFFIX) ztbmv_thread_CUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +xtbmv_thread_NUU.$(SUFFIX) xtbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +xtbmv_thread_NUN.$(SUFFIX) xtbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +xtbmv_thread_TLU.$(SUFFIX) xtbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +xtbmv_thread_TLN.$(SUFFIX) xtbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +xtbmv_thread_RLU.$(SUFFIX) xtbmv_thread_RLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +xtbmv_thread_RLN.$(SUFFIX) xtbmv_thread_RLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +xtbmv_thread_CLU.$(SUFFIX) xtbmv_thread_CLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +xtbmv_thread_CLN.$(SUFFIX) xtbmv_thread_CLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +xtbmv_thread_NLU.$(SUFFIX) xtbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +xtbmv_thread_NLN.$(SUFFIX) xtbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +xtbmv_thread_TUU.$(SUFFIX) xtbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +xtbmv_thread_TUN.$(SUFFIX) xtbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +xtbmv_thread_RUU.$(SUFFIX) xtbmv_thread_RUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +xtbmv_thread_RUN.$(SUFFIX) xtbmv_thread_RUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +xtbmv_thread_CUU.$(SUFFIX) xtbmv_thread_CUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +xtbmv_thread_CUN.$(SUFFIX) xtbmv_thread_CUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +stbsv_NUU.$(SUFFIX) stbsv_NUU.$(PSUFFIX) : tbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +stbsv_NUN.$(SUFFIX) stbsv_NUN.$(PSUFFIX) : tbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +stbsv_TLU.$(SUFFIX) stbsv_TLU.$(PSUFFIX) : tbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +stbsv_TLN.$(SUFFIX) stbsv_TLN.$(PSUFFIX) : tbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +stbsv_NLU.$(SUFFIX) stbsv_NLU.$(PSUFFIX) : tbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +stbsv_NLN.$(SUFFIX) stbsv_NLN.$(PSUFFIX) : tbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +stbsv_TUU.$(SUFFIX) stbsv_TUU.$(PSUFFIX) : tbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +stbsv_TUN.$(SUFFIX) stbsv_TUN.$(PSUFFIX) : tbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +dtbsv_NUU.$(SUFFIX) dtbsv_NUU.$(PSUFFIX) : tbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +dtbsv_NUN.$(SUFFIX) dtbsv_NUN.$(PSUFFIX) : tbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +dtbsv_TLU.$(SUFFIX) dtbsv_TLU.$(PSUFFIX) : tbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +dtbsv_TLN.$(SUFFIX) dtbsv_TLN.$(PSUFFIX) : tbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +dtbsv_NLU.$(SUFFIX) dtbsv_NLU.$(PSUFFIX) : tbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +dtbsv_NLN.$(SUFFIX) dtbsv_NLN.$(PSUFFIX) : tbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +dtbsv_TUU.$(SUFFIX) dtbsv_TUU.$(PSUFFIX) : tbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +dtbsv_TUN.$(SUFFIX) dtbsv_TUN.$(PSUFFIX) : tbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +qtbsv_NUU.$(SUFFIX) qtbsv_NUU.$(PSUFFIX) : tbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +qtbsv_NUN.$(SUFFIX) qtbsv_NUN.$(PSUFFIX) : tbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +qtbsv_TLU.$(SUFFIX) qtbsv_TLU.$(PSUFFIX) : tbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +qtbsv_TLN.$(SUFFIX) qtbsv_TLN.$(PSUFFIX) : tbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +qtbsv_NLU.$(SUFFIX) qtbsv_NLU.$(PSUFFIX) : tbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +qtbsv_NLN.$(SUFFIX) qtbsv_NLN.$(PSUFFIX) : tbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +qtbsv_TUU.$(SUFFIX) qtbsv_TUU.$(PSUFFIX) : tbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +qtbsv_TUN.$(SUFFIX) qtbsv_TUN.$(PSUFFIX) : tbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +ctbsv_NUU.$(SUFFIX) ctbsv_NUU.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ctbsv_NUN.$(SUFFIX) ctbsv_NUN.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ctbsv_TLU.$(SUFFIX) ctbsv_TLU.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ctbsv_TLN.$(SUFFIX) ctbsv_TLN.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ctbsv_RLU.$(SUFFIX) ctbsv_RLU.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ctbsv_RLN.$(SUFFIX) ctbsv_RLN.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ctbsv_CLU.$(SUFFIX) ctbsv_CLU.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ctbsv_CLN.$(SUFFIX) ctbsv_CLN.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +ctbsv_NLU.$(SUFFIX) ctbsv_NLU.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ctbsv_NLN.$(SUFFIX) ctbsv_NLN.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ctbsv_TUU.$(SUFFIX) ctbsv_TUU.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ctbsv_TUN.$(SUFFIX) ctbsv_TUN.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ctbsv_RUU.$(SUFFIX) ctbsv_RUU.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ctbsv_RUN.$(SUFFIX) ctbsv_RUN.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ctbsv_CUU.$(SUFFIX) ctbsv_CUU.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ctbsv_CUN.$(SUFFIX) ctbsv_CUN.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +ztbsv_NUU.$(SUFFIX) ztbsv_NUU.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ztbsv_NUN.$(SUFFIX) ztbsv_NUN.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ztbsv_TLU.$(SUFFIX) ztbsv_TLU.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ztbsv_TLN.$(SUFFIX) ztbsv_TLN.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ztbsv_RLU.$(SUFFIX) ztbsv_RLU.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ztbsv_RLN.$(SUFFIX) ztbsv_RLN.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ztbsv_CLU.$(SUFFIX) ztbsv_CLU.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ztbsv_CLN.$(SUFFIX) ztbsv_CLN.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +ztbsv_NLU.$(SUFFIX) ztbsv_NLU.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ztbsv_NLN.$(SUFFIX) ztbsv_NLN.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ztbsv_TUU.$(SUFFIX) ztbsv_TUU.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ztbsv_TUN.$(SUFFIX) ztbsv_TUN.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ztbsv_RUU.$(SUFFIX) ztbsv_RUU.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ztbsv_RUN.$(SUFFIX) ztbsv_RUN.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ztbsv_CUU.$(SUFFIX) ztbsv_CUU.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ztbsv_CUN.$(SUFFIX) ztbsv_CUN.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +xtbsv_NUU.$(SUFFIX) xtbsv_NUU.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +xtbsv_NUN.$(SUFFIX) xtbsv_NUN.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +xtbsv_TLU.$(SUFFIX) xtbsv_TLU.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +xtbsv_TLN.$(SUFFIX) xtbsv_TLN.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +xtbsv_RLU.$(SUFFIX) xtbsv_RLU.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +xtbsv_RLN.$(SUFFIX) xtbsv_RLN.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +xtbsv_CLU.$(SUFFIX) xtbsv_CLU.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +xtbsv_CLN.$(SUFFIX) xtbsv_CLN.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +xtbsv_NLU.$(SUFFIX) xtbsv_NLU.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +xtbsv_NLN.$(SUFFIX) xtbsv_NLN.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +xtbsv_TUU.$(SUFFIX) xtbsv_TUU.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +xtbsv_TUN.$(SUFFIX) xtbsv_TUN.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +xtbsv_RUU.$(SUFFIX) xtbsv_RUU.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +xtbsv_RUN.$(SUFFIX) xtbsv_RUN.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +xtbsv_CUU.$(SUFFIX) xtbsv_CUU.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +xtbsv_CUN.$(SUFFIX) xtbsv_CUN.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +stpmv_NUU.$(SUFFIX) stpmv_NUU.$(PSUFFIX) : tpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +stpmv_NUN.$(SUFFIX) stpmv_NUN.$(PSUFFIX) : tpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +stpmv_TLU.$(SUFFIX) stpmv_TLU.$(PSUFFIX) : tpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +stpmv_TLN.$(SUFFIX) stpmv_TLN.$(PSUFFIX) : tpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +stpmv_NLU.$(SUFFIX) stpmv_NLU.$(PSUFFIX) : tpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +stpmv_NLN.$(SUFFIX) stpmv_NLN.$(PSUFFIX) : tpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +stpmv_TUU.$(SUFFIX) stpmv_TUU.$(PSUFFIX) : tpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +stpmv_TUN.$(SUFFIX) stpmv_TUN.$(PSUFFIX) : tpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +dtpmv_NUU.$(SUFFIX) dtpmv_NUU.$(PSUFFIX) : tpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +dtpmv_NUN.$(SUFFIX) dtpmv_NUN.$(PSUFFIX) : tpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +dtpmv_TLU.$(SUFFIX) dtpmv_TLU.$(PSUFFIX) : tpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +dtpmv_TLN.$(SUFFIX) dtpmv_TLN.$(PSUFFIX) : tpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +dtpmv_NLU.$(SUFFIX) dtpmv_NLU.$(PSUFFIX) : tpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +dtpmv_NLN.$(SUFFIX) dtpmv_NLN.$(PSUFFIX) : tpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +dtpmv_TUU.$(SUFFIX) dtpmv_TUU.$(PSUFFIX) : tpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +dtpmv_TUN.$(SUFFIX) dtpmv_TUN.$(PSUFFIX) : tpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +qtpmv_NUU.$(SUFFIX) qtpmv_NUU.$(PSUFFIX) : tpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +qtpmv_NUN.$(SUFFIX) qtpmv_NUN.$(PSUFFIX) : tpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +qtpmv_TLU.$(SUFFIX) qtpmv_TLU.$(PSUFFIX) : tpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +qtpmv_TLN.$(SUFFIX) qtpmv_TLN.$(PSUFFIX) : tpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +qtpmv_NLU.$(SUFFIX) qtpmv_NLU.$(PSUFFIX) : tpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +qtpmv_NLN.$(SUFFIX) qtpmv_NLN.$(PSUFFIX) : tpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +qtpmv_TUU.$(SUFFIX) qtpmv_TUU.$(PSUFFIX) : tpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +qtpmv_TUN.$(SUFFIX) qtpmv_TUN.$(PSUFFIX) : tpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +ctpmv_NUU.$(SUFFIX) ctpmv_NUU.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ctpmv_NUN.$(SUFFIX) ctpmv_NUN.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ctpmv_TLU.$(SUFFIX) ctpmv_TLU.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ctpmv_TLN.$(SUFFIX) ctpmv_TLN.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ctpmv_RLU.$(SUFFIX) ctpmv_RLU.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ctpmv_RLN.$(SUFFIX) ctpmv_RLN.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ctpmv_CLU.$(SUFFIX) ctpmv_CLU.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ctpmv_CLN.$(SUFFIX) ctpmv_CLN.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +ctpmv_NLU.$(SUFFIX) ctpmv_NLU.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ctpmv_NLN.$(SUFFIX) ctpmv_NLN.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ctpmv_TUU.$(SUFFIX) ctpmv_TUU.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ctpmv_TUN.$(SUFFIX) ctpmv_TUN.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ctpmv_RUU.$(SUFFIX) ctpmv_RUU.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ctpmv_RUN.$(SUFFIX) ctpmv_RUN.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ctpmv_CUU.$(SUFFIX) ctpmv_CUU.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ctpmv_CUN.$(SUFFIX) ctpmv_CUN.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +ztpmv_NUU.$(SUFFIX) ztpmv_NUU.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ztpmv_NUN.$(SUFFIX) ztpmv_NUN.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ztpmv_TLU.$(SUFFIX) ztpmv_TLU.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ztpmv_TLN.$(SUFFIX) ztpmv_TLN.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ztpmv_RLU.$(SUFFIX) ztpmv_RLU.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ztpmv_RLN.$(SUFFIX) ztpmv_RLN.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ztpmv_CLU.$(SUFFIX) ztpmv_CLU.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ztpmv_CLN.$(SUFFIX) ztpmv_CLN.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +ztpmv_NLU.$(SUFFIX) ztpmv_NLU.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ztpmv_NLN.$(SUFFIX) ztpmv_NLN.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ztpmv_TUU.$(SUFFIX) ztpmv_TUU.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ztpmv_TUN.$(SUFFIX) ztpmv_TUN.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ztpmv_RUU.$(SUFFIX) ztpmv_RUU.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ztpmv_RUN.$(SUFFIX) ztpmv_RUN.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ztpmv_CUU.$(SUFFIX) ztpmv_CUU.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ztpmv_CUN.$(SUFFIX) ztpmv_CUN.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +xtpmv_NUU.$(SUFFIX) xtpmv_NUU.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +xtpmv_NUN.$(SUFFIX) xtpmv_NUN.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +xtpmv_TLU.$(SUFFIX) xtpmv_TLU.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +xtpmv_TLN.$(SUFFIX) xtpmv_TLN.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +xtpmv_RLU.$(SUFFIX) xtpmv_RLU.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +xtpmv_RLN.$(SUFFIX) xtpmv_RLN.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +xtpmv_CLU.$(SUFFIX) xtpmv_CLU.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +xtpmv_CLN.$(SUFFIX) xtpmv_CLN.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +xtpmv_NLU.$(SUFFIX) xtpmv_NLU.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +xtpmv_NLN.$(SUFFIX) xtpmv_NLN.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +xtpmv_TUU.$(SUFFIX) xtpmv_TUU.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +xtpmv_TUN.$(SUFFIX) xtpmv_TUN.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +xtpmv_RUU.$(SUFFIX) xtpmv_RUU.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +xtpmv_RUN.$(SUFFIX) xtpmv_RUN.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +xtpmv_CUU.$(SUFFIX) xtpmv_CUU.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +xtpmv_CUN.$(SUFFIX) xtpmv_CUN.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + + +stpmv_thread_NUU.$(SUFFIX) stpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) + +stpmv_thread_NUN.$(SUFFIX) stpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) + +stpmv_thread_TLU.$(SUFFIX) stpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) + +stpmv_thread_TLN.$(SUFFIX) stpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) + +stpmv_thread_NLU.$(SUFFIX) stpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) + +stpmv_thread_NLN.$(SUFFIX) stpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) + +stpmv_thread_TUU.$(SUFFIX) stpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) + +stpmv_thread_TUN.$(SUFFIX) stpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) + +dtpmv_thread_NUU.$(SUFFIX) dtpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) + +dtpmv_thread_NUN.$(SUFFIX) dtpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) + +dtpmv_thread_TLU.$(SUFFIX) dtpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) + +dtpmv_thread_TLN.$(SUFFIX) dtpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) + +dtpmv_thread_NLU.$(SUFFIX) dtpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) + +dtpmv_thread_NLN.$(SUFFIX) dtpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) + +dtpmv_thread_TUU.$(SUFFIX) dtpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) + +dtpmv_thread_TUN.$(SUFFIX) dtpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) + +qtpmv_thread_NUU.$(SUFFIX) qtpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) + +qtpmv_thread_NUN.$(SUFFIX) qtpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) + +qtpmv_thread_TLU.$(SUFFIX) qtpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) + +qtpmv_thread_TLN.$(SUFFIX) qtpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) + +qtpmv_thread_NLU.$(SUFFIX) qtpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) + +qtpmv_thread_NLN.$(SUFFIX) qtpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) + +qtpmv_thread_TUU.$(SUFFIX) qtpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) + +qtpmv_thread_TUN.$(SUFFIX) qtpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) + +ctpmv_thread_NUU.$(SUFFIX) ctpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +ctpmv_thread_NUN.$(SUFFIX) ctpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +ctpmv_thread_TLU.$(SUFFIX) ctpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +ctpmv_thread_TLN.$(SUFFIX) ctpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +ctpmv_thread_RLU.$(SUFFIX) ctpmv_thread_RLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +ctpmv_thread_RLN.$(SUFFIX) ctpmv_thread_RLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +ctpmv_thread_CLU.$(SUFFIX) ctpmv_thread_CLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +ctpmv_thread_CLN.$(SUFFIX) ctpmv_thread_CLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +ctpmv_thread_NLU.$(SUFFIX) ctpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +ctpmv_thread_NLN.$(SUFFIX) ctpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +ctpmv_thread_TUU.$(SUFFIX) ctpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +ctpmv_thread_TUN.$(SUFFIX) ctpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +ctpmv_thread_RUU.$(SUFFIX) ctpmv_thread_RUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +ctpmv_thread_RUN.$(SUFFIX) ctpmv_thread_RUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +ctpmv_thread_CUU.$(SUFFIX) ctpmv_thread_CUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +ctpmv_thread_CUN.$(SUFFIX) ctpmv_thread_CUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +ztpmv_thread_NUU.$(SUFFIX) ztpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +ztpmv_thread_NUN.$(SUFFIX) ztpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +ztpmv_thread_TLU.$(SUFFIX) ztpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +ztpmv_thread_TLN.$(SUFFIX) ztpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +ztpmv_thread_RLU.$(SUFFIX) ztpmv_thread_RLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +ztpmv_thread_RLN.$(SUFFIX) ztpmv_thread_RLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +ztpmv_thread_CLU.$(SUFFIX) ztpmv_thread_CLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +ztpmv_thread_CLN.$(SUFFIX) ztpmv_thread_CLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +ztpmv_thread_NLU.$(SUFFIX) ztpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +ztpmv_thread_NLN.$(SUFFIX) ztpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +ztpmv_thread_TUU.$(SUFFIX) ztpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +ztpmv_thread_TUN.$(SUFFIX) ztpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +ztpmv_thread_RUU.$(SUFFIX) ztpmv_thread_RUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +ztpmv_thread_RUN.$(SUFFIX) ztpmv_thread_RUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +ztpmv_thread_CUU.$(SUFFIX) ztpmv_thread_CUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +ztpmv_thread_CUN.$(SUFFIX) ztpmv_thread_CUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +xtpmv_thread_NUU.$(SUFFIX) xtpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +xtpmv_thread_NUN.$(SUFFIX) xtpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +xtpmv_thread_TLU.$(SUFFIX) xtpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +xtpmv_thread_TLN.$(SUFFIX) xtpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +xtpmv_thread_RLU.$(SUFFIX) xtpmv_thread_RLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +xtpmv_thread_RLN.$(SUFFIX) xtpmv_thread_RLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +xtpmv_thread_CLU.$(SUFFIX) xtpmv_thread_CLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +xtpmv_thread_CLN.$(SUFFIX) xtpmv_thread_CLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +xtpmv_thread_NLU.$(SUFFIX) xtpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +xtpmv_thread_NLN.$(SUFFIX) xtpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +xtpmv_thread_TUU.$(SUFFIX) xtpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +xtpmv_thread_TUN.$(SUFFIX) xtpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +xtpmv_thread_RUU.$(SUFFIX) xtpmv_thread_RUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +xtpmv_thread_RUN.$(SUFFIX) xtpmv_thread_RUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +xtpmv_thread_CUU.$(SUFFIX) xtpmv_thread_CUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +xtpmv_thread_CUN.$(SUFFIX) xtpmv_thread_CUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +stpsv_NUU.$(SUFFIX) stpsv_NUU.$(PSUFFIX) : tpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +stpsv_NUN.$(SUFFIX) stpsv_NUN.$(PSUFFIX) : tpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +stpsv_TLU.$(SUFFIX) stpsv_TLU.$(PSUFFIX) : tpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +stpsv_TLN.$(SUFFIX) stpsv_TLN.$(PSUFFIX) : tpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +stpsv_NLU.$(SUFFIX) stpsv_NLU.$(PSUFFIX) : tpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +stpsv_NLN.$(SUFFIX) stpsv_NLN.$(PSUFFIX) : tpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +stpsv_TUU.$(SUFFIX) stpsv_TUU.$(PSUFFIX) : tpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +stpsv_TUN.$(SUFFIX) stpsv_TUN.$(PSUFFIX) : tpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +dtpsv_NUU.$(SUFFIX) dtpsv_NUU.$(PSUFFIX) : tpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +dtpsv_NUN.$(SUFFIX) dtpsv_NUN.$(PSUFFIX) : tpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +dtpsv_TLU.$(SUFFIX) dtpsv_TLU.$(PSUFFIX) : tpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +dtpsv_TLN.$(SUFFIX) dtpsv_TLN.$(PSUFFIX) : tpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +dtpsv_NLU.$(SUFFIX) dtpsv_NLU.$(PSUFFIX) : tpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +dtpsv_NLN.$(SUFFIX) dtpsv_NLN.$(PSUFFIX) : tpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +dtpsv_TUU.$(SUFFIX) dtpsv_TUU.$(PSUFFIX) : tpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +dtpsv_TUN.$(SUFFIX) dtpsv_TUN.$(PSUFFIX) : tpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +qtpsv_NUU.$(SUFFIX) qtpsv_NUU.$(PSUFFIX) : tpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +qtpsv_NUN.$(SUFFIX) qtpsv_NUN.$(PSUFFIX) : tpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +qtpsv_TLU.$(SUFFIX) qtpsv_TLU.$(PSUFFIX) : tpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +qtpsv_TLN.$(SUFFIX) qtpsv_TLN.$(PSUFFIX) : tpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +qtpsv_NLU.$(SUFFIX) qtpsv_NLU.$(PSUFFIX) : tpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +qtpsv_NLN.$(SUFFIX) qtpsv_NLN.$(PSUFFIX) : tpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +qtpsv_TUU.$(SUFFIX) qtpsv_TUU.$(PSUFFIX) : tpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +qtpsv_TUN.$(SUFFIX) qtpsv_TUN.$(PSUFFIX) : tpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +ctpsv_NUU.$(SUFFIX) ctpsv_NUU.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) + +ctpsv_NUN.$(SUFFIX) ctpsv_NUN.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) + +ctpsv_TLU.$(SUFFIX) ctpsv_TLU.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) + +ctpsv_TLN.$(SUFFIX) ctpsv_TLN.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) + +ctpsv_RLU.$(SUFFIX) ctpsv_RLU.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) + +ctpsv_RLN.$(SUFFIX) ctpsv_RLN.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) + +ctpsv_CLU.$(SUFFIX) ctpsv_CLU.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) + +ctpsv_CLN.$(SUFFIX) ctpsv_CLN.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) + +ctpsv_NLU.$(SUFFIX) ctpsv_NLU.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) + +ctpsv_NLN.$(SUFFIX) ctpsv_NLN.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) + +ctpsv_TUU.$(SUFFIX) ctpsv_TUU.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) + +ctpsv_TUN.$(SUFFIX) ctpsv_TUN.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) + +ctpsv_RUU.$(SUFFIX) ctpsv_RUU.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) + +ctpsv_RUN.$(SUFFIX) ctpsv_RUN.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) + +ctpsv_CUU.$(SUFFIX) ctpsv_CUU.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) + +ctpsv_CUN.$(SUFFIX) ctpsv_CUN.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) + +ztpsv_NUU.$(SUFFIX) ztpsv_NUU.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) + +ztpsv_NUN.$(SUFFIX) ztpsv_NUN.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) + +ztpsv_TLU.$(SUFFIX) ztpsv_TLU.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) + +ztpsv_TLN.$(SUFFIX) ztpsv_TLN.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) + +ztpsv_RLU.$(SUFFIX) ztpsv_RLU.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) + +ztpsv_RLN.$(SUFFIX) ztpsv_RLN.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) + +ztpsv_CLU.$(SUFFIX) ztpsv_CLU.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) + +ztpsv_CLN.$(SUFFIX) ztpsv_CLN.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) + +ztpsv_NLU.$(SUFFIX) ztpsv_NLU.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) + +ztpsv_NLN.$(SUFFIX) ztpsv_NLN.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) + +ztpsv_TUU.$(SUFFIX) ztpsv_TUU.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) + +ztpsv_TUN.$(SUFFIX) ztpsv_TUN.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) + +ztpsv_RUU.$(SUFFIX) ztpsv_RUU.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) + +ztpsv_RUN.$(SUFFIX) ztpsv_RUN.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) + +ztpsv_CUU.$(SUFFIX) ztpsv_CUU.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) + +ztpsv_CUN.$(SUFFIX) ztpsv_CUN.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) + +xtpsv_NUU.$(SUFFIX) xtpsv_NUU.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) + +xtpsv_NUN.$(SUFFIX) xtpsv_NUN.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) + +xtpsv_TLU.$(SUFFIX) xtpsv_TLU.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) + +xtpsv_TLN.$(SUFFIX) xtpsv_TLN.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) + +xtpsv_RLU.$(SUFFIX) xtpsv_RLU.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) + +xtpsv_RLN.$(SUFFIX) xtpsv_RLN.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) + +xtpsv_CLU.$(SUFFIX) xtpsv_CLU.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) + +xtpsv_CLN.$(SUFFIX) xtpsv_CLN.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) + +xtpsv_NLU.$(SUFFIX) xtpsv_NLU.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) + +xtpsv_NLN.$(SUFFIX) xtpsv_NLN.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) + +xtpsv_TUU.$(SUFFIX) xtpsv_TUU.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) + +xtpsv_TUN.$(SUFFIX) xtpsv_TUN.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) + +xtpsv_RUU.$(SUFFIX) xtpsv_RUU.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) + +xtpsv_RUN.$(SUFFIX) xtpsv_RUN.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) + +xtpsv_CUU.$(SUFFIX) xtpsv_CUU.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) + +xtpsv_CUN.$(SUFFIX) xtpsv_CUN.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) + +strmv_NUU.$(SUFFIX) strmv_NUU.$(PSUFFIX) : trmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +strmv_NUN.$(SUFFIX) strmv_NUN.$(PSUFFIX) : trmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +strmv_TLU.$(SUFFIX) strmv_TLU.$(PSUFFIX) : trmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +strmv_TLN.$(SUFFIX) strmv_TLN.$(PSUFFIX) : trmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +strmv_NLU.$(SUFFIX) strmv_NLU.$(PSUFFIX) : trmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +strmv_NLN.$(SUFFIX) strmv_NLN.$(PSUFFIX) : trmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +strmv_TUU.$(SUFFIX) strmv_TUU.$(PSUFFIX) : trmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +strmv_TUN.$(SUFFIX) strmv_TUN.$(PSUFFIX) : trmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +dtrmv_NUU.$(SUFFIX) dtrmv_NUU.$(PSUFFIX) : trmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +dtrmv_NUN.$(SUFFIX) dtrmv_NUN.$(PSUFFIX) : trmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +dtrmv_TLU.$(SUFFIX) dtrmv_TLU.$(PSUFFIX) : trmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +dtrmv_TLN.$(SUFFIX) dtrmv_TLN.$(PSUFFIX) : trmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +dtrmv_NLU.$(SUFFIX) dtrmv_NLU.$(PSUFFIX) : trmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +dtrmv_NLN.$(SUFFIX) dtrmv_NLN.$(PSUFFIX) : trmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +dtrmv_TUU.$(SUFFIX) dtrmv_TUU.$(PSUFFIX) : trmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +dtrmv_TUN.$(SUFFIX) dtrmv_TUN.$(PSUFFIX) : trmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +qtrmv_NUU.$(SUFFIX) qtrmv_NUU.$(PSUFFIX) : trmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +qtrmv_NUN.$(SUFFIX) qtrmv_NUN.$(PSUFFIX) : trmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +qtrmv_TLU.$(SUFFIX) qtrmv_TLU.$(PSUFFIX) : trmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +qtrmv_TLN.$(SUFFIX) qtrmv_TLN.$(PSUFFIX) : trmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +qtrmv_NLU.$(SUFFIX) qtrmv_NLU.$(PSUFFIX) : trmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +qtrmv_NLN.$(SUFFIX) qtrmv_NLN.$(PSUFFIX) : trmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +qtrmv_TUU.$(SUFFIX) qtrmv_TUU.$(PSUFFIX) : trmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +qtrmv_TUN.$(SUFFIX) qtrmv_TUN.$(PSUFFIX) : trmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +ctrmv_NUU.$(SUFFIX) ctrmv_NUU.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ctrmv_NUN.$(SUFFIX) ctrmv_NUN.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ctrmv_TLU.$(SUFFIX) ctrmv_TLU.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ctrmv_TLN.$(SUFFIX) ctrmv_TLN.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ctrmv_RLU.$(SUFFIX) ctrmv_RLU.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ctrmv_RLN.$(SUFFIX) ctrmv_RLN.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ctrmv_CLU.$(SUFFIX) ctrmv_CLU.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ctrmv_CLN.$(SUFFIX) ctrmv_CLN.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +ctrmv_NLU.$(SUFFIX) ctrmv_NLU.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ctrmv_NLN.$(SUFFIX) ctrmv_NLN.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ctrmv_TUU.$(SUFFIX) ctrmv_TUU.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ctrmv_TUN.$(SUFFIX) ctrmv_TUN.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ctrmv_RUU.$(SUFFIX) ctrmv_RUU.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ctrmv_RUN.$(SUFFIX) ctrmv_RUN.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ctrmv_CUU.$(SUFFIX) ctrmv_CUU.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ctrmv_CUN.$(SUFFIX) ctrmv_CUN.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +ztrmv_NUU.$(SUFFIX) ztrmv_NUU.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ztrmv_NUN.$(SUFFIX) ztrmv_NUN.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ztrmv_TLU.$(SUFFIX) ztrmv_TLU.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ztrmv_TLN.$(SUFFIX) ztrmv_TLN.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ztrmv_RLU.$(SUFFIX) ztrmv_RLU.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ztrmv_RLN.$(SUFFIX) ztrmv_RLN.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ztrmv_CLU.$(SUFFIX) ztrmv_CLU.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ztrmv_CLN.$(SUFFIX) ztrmv_CLN.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +ztrmv_NLU.$(SUFFIX) ztrmv_NLU.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ztrmv_NLN.$(SUFFIX) ztrmv_NLN.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ztrmv_TUU.$(SUFFIX) ztrmv_TUU.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ztrmv_TUN.$(SUFFIX) ztrmv_TUN.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ztrmv_RUU.$(SUFFIX) ztrmv_RUU.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ztrmv_RUN.$(SUFFIX) ztrmv_RUN.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ztrmv_CUU.$(SUFFIX) ztrmv_CUU.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ztrmv_CUN.$(SUFFIX) ztrmv_CUN.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +xtrmv_NUU.$(SUFFIX) xtrmv_NUU.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +xtrmv_NUN.$(SUFFIX) xtrmv_NUN.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +xtrmv_TLU.$(SUFFIX) xtrmv_TLU.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +xtrmv_TLN.$(SUFFIX) xtrmv_TLN.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +xtrmv_RLU.$(SUFFIX) xtrmv_RLU.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +xtrmv_RLN.$(SUFFIX) xtrmv_RLN.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +xtrmv_CLU.$(SUFFIX) xtrmv_CLU.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +xtrmv_CLN.$(SUFFIX) xtrmv_CLN.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +xtrmv_NLU.$(SUFFIX) xtrmv_NLU.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +xtrmv_NLN.$(SUFFIX) xtrmv_NLN.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +xtrmv_TUU.$(SUFFIX) xtrmv_TUU.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +xtrmv_TUN.$(SUFFIX) xtrmv_TUN.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +xtrmv_RUU.$(SUFFIX) xtrmv_RUU.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +xtrmv_RUN.$(SUFFIX) xtrmv_RUN.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +xtrmv_CUU.$(SUFFIX) xtrmv_CUU.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +xtrmv_CUN.$(SUFFIX) xtrmv_CUN.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +strmv_thread_NUU.$(SUFFIX) strmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) + +strmv_thread_NUN.$(SUFFIX) strmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) + +strmv_thread_TLU.$(SUFFIX) strmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) + +strmv_thread_TLN.$(SUFFIX) strmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) + +strmv_thread_NLU.$(SUFFIX) strmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) + +strmv_thread_NLN.$(SUFFIX) strmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) + +strmv_thread_TUU.$(SUFFIX) strmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) + +strmv_thread_TUN.$(SUFFIX) strmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) + +dtrmv_thread_NUU.$(SUFFIX) dtrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) + +dtrmv_thread_NUN.$(SUFFIX) dtrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) + +dtrmv_thread_TLU.$(SUFFIX) dtrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) + +dtrmv_thread_TLN.$(SUFFIX) dtrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) + +dtrmv_thread_NLU.$(SUFFIX) dtrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) + +dtrmv_thread_NLN.$(SUFFIX) dtrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) + +dtrmv_thread_TUU.$(SUFFIX) dtrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) + +dtrmv_thread_TUN.$(SUFFIX) dtrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) + +qtrmv_thread_NUU.$(SUFFIX) qtrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) + +qtrmv_thread_NUN.$(SUFFIX) qtrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) + +qtrmv_thread_TLU.$(SUFFIX) qtrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) + +qtrmv_thread_TLN.$(SUFFIX) qtrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) + +qtrmv_thread_NLU.$(SUFFIX) qtrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) + +qtrmv_thread_NLN.$(SUFFIX) qtrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) + +qtrmv_thread_TUU.$(SUFFIX) qtrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) + +qtrmv_thread_TUN.$(SUFFIX) qtrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) + +ctrmv_thread_NUU.$(SUFFIX) ctrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +ctrmv_thread_NUN.$(SUFFIX) ctrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +ctrmv_thread_TLU.$(SUFFIX) ctrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +ctrmv_thread_TLN.$(SUFFIX) ctrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +ctrmv_thread_RLU.$(SUFFIX) ctrmv_thread_RLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +ctrmv_thread_RLN.$(SUFFIX) ctrmv_thread_RLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +ctrmv_thread_CLU.$(SUFFIX) ctrmv_thread_CLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +ctrmv_thread_CLN.$(SUFFIX) ctrmv_thread_CLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +ctrmv_thread_NLU.$(SUFFIX) ctrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +ctrmv_thread_NLN.$(SUFFIX) ctrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +ctrmv_thread_TUU.$(SUFFIX) ctrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +ctrmv_thread_TUN.$(SUFFIX) ctrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +ctrmv_thread_RUU.$(SUFFIX) ctrmv_thread_RUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +ctrmv_thread_RUN.$(SUFFIX) ctrmv_thread_RUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +ctrmv_thread_CUU.$(SUFFIX) ctrmv_thread_CUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +ctrmv_thread_CUN.$(SUFFIX) ctrmv_thread_CUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +ztrmv_thread_NUU.$(SUFFIX) ztrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +ztrmv_thread_NUN.$(SUFFIX) ztrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +ztrmv_thread_TLU.$(SUFFIX) ztrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +ztrmv_thread_TLN.$(SUFFIX) ztrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +ztrmv_thread_RLU.$(SUFFIX) ztrmv_thread_RLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +ztrmv_thread_RLN.$(SUFFIX) ztrmv_thread_RLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +ztrmv_thread_CLU.$(SUFFIX) ztrmv_thread_CLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +ztrmv_thread_CLN.$(SUFFIX) ztrmv_thread_CLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +ztrmv_thread_NLU.$(SUFFIX) ztrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +ztrmv_thread_NLN.$(SUFFIX) ztrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +ztrmv_thread_TUU.$(SUFFIX) ztrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +ztrmv_thread_TUN.$(SUFFIX) ztrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +ztrmv_thread_RUU.$(SUFFIX) ztrmv_thread_RUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +ztrmv_thread_RUN.$(SUFFIX) ztrmv_thread_RUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +ztrmv_thread_CUU.$(SUFFIX) ztrmv_thread_CUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +ztrmv_thread_CUN.$(SUFFIX) ztrmv_thread_CUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +xtrmv_thread_NUU.$(SUFFIX) xtrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +xtrmv_thread_NUN.$(SUFFIX) xtrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +xtrmv_thread_TLU.$(SUFFIX) xtrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +xtrmv_thread_TLN.$(SUFFIX) xtrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +xtrmv_thread_RLU.$(SUFFIX) xtrmv_thread_RLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +xtrmv_thread_RLN.$(SUFFIX) xtrmv_thread_RLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +xtrmv_thread_CLU.$(SUFFIX) xtrmv_thread_CLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +xtrmv_thread_CLN.$(SUFFIX) xtrmv_thread_CLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +xtrmv_thread_NLU.$(SUFFIX) xtrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +xtrmv_thread_NLN.$(SUFFIX) xtrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +xtrmv_thread_TUU.$(SUFFIX) xtrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +xtrmv_thread_TUN.$(SUFFIX) xtrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +xtrmv_thread_RUU.$(SUFFIX) xtrmv_thread_RUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +xtrmv_thread_RUN.$(SUFFIX) xtrmv_thread_RUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +xtrmv_thread_CUU.$(SUFFIX) xtrmv_thread_CUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +xtrmv_thread_CUN.$(SUFFIX) xtrmv_thread_CUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +strsv_NUU.$(SUFFIX) strsv_NUU.$(PSUFFIX) : trsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +strsv_NUN.$(SUFFIX) strsv_NUN.$(PSUFFIX) : trsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +strsv_TLU.$(SUFFIX) strsv_TLU.$(PSUFFIX) : trsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +strsv_TLN.$(SUFFIX) strsv_TLN.$(PSUFFIX) : trsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +strsv_NLU.$(SUFFIX) strsv_NLU.$(PSUFFIX) : trsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +strsv_NLN.$(SUFFIX) strsv_NLN.$(PSUFFIX) : trsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +strsv_TUU.$(SUFFIX) strsv_TUU.$(PSUFFIX) : trsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +strsv_TUN.$(SUFFIX) strsv_TUN.$(PSUFFIX) : trsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +dtrsv_NUU.$(SUFFIX) dtrsv_NUU.$(PSUFFIX) : trsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +dtrsv_NUN.$(SUFFIX) dtrsv_NUN.$(PSUFFIX) : trsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +dtrsv_TLU.$(SUFFIX) dtrsv_TLU.$(PSUFFIX) : trsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +dtrsv_TLN.$(SUFFIX) dtrsv_TLN.$(PSUFFIX) : trsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +dtrsv_NLU.$(SUFFIX) dtrsv_NLU.$(PSUFFIX) : trsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +dtrsv_NLN.$(SUFFIX) dtrsv_NLN.$(PSUFFIX) : trsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +dtrsv_TUU.$(SUFFIX) dtrsv_TUU.$(PSUFFIX) : trsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +dtrsv_TUN.$(SUFFIX) dtrsv_TUN.$(PSUFFIX) : trsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +qtrsv_NUU.$(SUFFIX) qtrsv_NUU.$(PSUFFIX) : trsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +qtrsv_NUN.$(SUFFIX) qtrsv_NUN.$(PSUFFIX) : trsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +qtrsv_TLU.$(SUFFIX) qtrsv_TLU.$(PSUFFIX) : trsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +qtrsv_TLN.$(SUFFIX) qtrsv_TLN.$(PSUFFIX) : trsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +qtrsv_NLU.$(SUFFIX) qtrsv_NLU.$(PSUFFIX) : trsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +qtrsv_NLN.$(SUFFIX) qtrsv_NLN.$(PSUFFIX) : trsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +qtrsv_TUU.$(SUFFIX) qtrsv_TUU.$(PSUFFIX) : trsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +qtrsv_TUN.$(SUFFIX) qtrsv_TUN.$(PSUFFIX) : trsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +ctrsv_NUU.$(SUFFIX) ctrsv_NUU.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) + +ctrsv_NUN.$(SUFFIX) ctrsv_NUN.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) + +ctrsv_TLU.$(SUFFIX) ctrsv_TLU.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) + +ctrsv_TLN.$(SUFFIX) ctrsv_TLN.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) + +ctrsv_RLU.$(SUFFIX) ctrsv_RLU.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) + +ctrsv_RLN.$(SUFFIX) ctrsv_RLN.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) + +ctrsv_CLU.$(SUFFIX) ctrsv_CLU.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) + +ctrsv_CLN.$(SUFFIX) ctrsv_CLN.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) + +ctrsv_NLU.$(SUFFIX) ctrsv_NLU.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) + +ctrsv_NLN.$(SUFFIX) ctrsv_NLN.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) + +ctrsv_TUU.$(SUFFIX) ctrsv_TUU.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) + +ctrsv_TUN.$(SUFFIX) ctrsv_TUN.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) + +ctrsv_RUU.$(SUFFIX) ctrsv_RUU.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) + +ctrsv_RUN.$(SUFFIX) ctrsv_RUN.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) + +ctrsv_CUU.$(SUFFIX) ctrsv_CUU.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) + +ctrsv_CUN.$(SUFFIX) ctrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) + +ztrsv_NUU.$(SUFFIX) ztrsv_NUU.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) + +ztrsv_NUN.$(SUFFIX) ztrsv_NUN.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) + +ztrsv_TLU.$(SUFFIX) ztrsv_TLU.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) + +ztrsv_TLN.$(SUFFIX) ztrsv_TLN.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) + +ztrsv_RLU.$(SUFFIX) ztrsv_RLU.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) + +ztrsv_RLN.$(SUFFIX) ztrsv_RLN.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) + +ztrsv_CLU.$(SUFFIX) ztrsv_CLU.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) + +ztrsv_CLN.$(SUFFIX) ztrsv_CLN.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) + +ztrsv_NLU.$(SUFFIX) ztrsv_NLU.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) + +ztrsv_NLN.$(SUFFIX) ztrsv_NLN.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) + +ztrsv_TUU.$(SUFFIX) ztrsv_TUU.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) + +ztrsv_TUN.$(SUFFIX) ztrsv_TUN.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) + +ztrsv_RUU.$(SUFFIX) ztrsv_RUU.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) + +ztrsv_RUN.$(SUFFIX) ztrsv_RUN.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) + +ztrsv_CUU.$(SUFFIX) ztrsv_CUU.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) + +ztrsv_CUN.$(SUFFIX) ztrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) + +xtrsv_NUU.$(SUFFIX) xtrsv_NUU.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) + +xtrsv_NUN.$(SUFFIX) xtrsv_NUN.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) + +xtrsv_TLU.$(SUFFIX) xtrsv_TLU.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) + +xtrsv_TLN.$(SUFFIX) xtrsv_TLN.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) + +xtrsv_RLU.$(SUFFIX) xtrsv_RLU.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) + +xtrsv_RLN.$(SUFFIX) xtrsv_RLN.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) + +xtrsv_CLU.$(SUFFIX) xtrsv_CLU.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) + +xtrsv_CLN.$(SUFFIX) xtrsv_CLN.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) + +xtrsv_NLU.$(SUFFIX) xtrsv_NLU.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) + +xtrsv_NLN.$(SUFFIX) xtrsv_NLN.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) + +xtrsv_TUU.$(SUFFIX) xtrsv_TUU.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) + +xtrsv_TUN.$(SUFFIX) xtrsv_TUN.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) + +xtrsv_RUU.$(SUFFIX) xtrsv_RUU.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) + +xtrsv_RUN.$(SUFFIX) xtrsv_RUN.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) + +xtrsv_CUU.$(SUFFIX) xtrsv_CUU.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) + +xtrsv_CUN.$(SUFFIX) xtrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) + +include ../../Makefile.tail diff --git a/driver/level2/gbmv_k.c b/driver/level2/gbmv_k.c new file mode 100644 index 0000000..317d420 --- /dev/null +++ b/driver/level2/gbmv_k.c @@ -0,0 +1,105 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +#ifndef TRANS +#define M m +#define N n +#else +#define N m +#define M n +#endif + +void CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT alpha, + FLOAT *a, BLASLONG lda, + FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){ + + BLASLONG i, offset_u, offset_l, start, end, length; + FLOAT *X = x; + FLOAT *Y = y; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *bufferY = gemvbuffer; + FLOAT *bufferX = gemvbuffer; + + if (incy != 1) { + Y = bufferY; + bufferX = (FLOAT *)(((BLASLONG)bufferY + M * sizeof(FLOAT) + 4095) & ~4095); + gemvbuffer = bufferX; + COPY_K(M, y, incy, Y, 1); + } + + if (incx != 1) { + X = bufferX; + gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + N * sizeof(FLOAT) + 4095) & ~4095); + COPY_K(N, x, incx, X, 1); + } + + offset_u = ku; + offset_l = ku + m; + + for (i = 0; i < MIN(n, m + ku); i++) { + + start = MAX(offset_u, 0); + end = MIN(offset_l, ku + kl + 1); + + length = end - start; + +#ifndef TRANS + AXPYU_K(length, 0, 0, + alpha * X[i], + a + start, 1, Y + start - offset_u, 1, NULL, 0); +#else + Y[i] += alpha * DOTU_K(length, a + start, 1, X + start - offset_u, 1); +#endif + + offset_u --; + offset_l --; + + a += lda; + } + + if (incy != 1) { + COPY_K(M, Y, 1, y, incy); + } + + return; +} + diff --git a/driver/level2/gbmv_thread.c b/driver/level2/gbmv_thread.c new file mode 100644 index 0000000..18aae26 --- /dev/null +++ b/driver/level2/gbmv_thread.c @@ -0,0 +1,294 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +#if !defined(CONJ) && !defined(XCONJ) +#define MYAXPY AXPYU_K +#define MYDOT DOTU_K +#elif defined(CONJ) && !defined(XCONJ) +#define MYAXPY AXPYC_K +#define MYDOT DOTC_K +#elif !defined(CONJ) && defined(XCONJ) +#define MYAXPY AXPYU_K +#define MYDOT DOTC_K +#else +#define MYAXPY AXPYC_K +#define MYDOT DOTU_K +#endif + +static int gbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ + + FLOAT *a, *x, *y; + BLASLONG lda, incx; + BLASLONG n_from, n_to; + BLASLONG i, offset_l, offset_u, uu, ll, ku, kl; +#ifdef TRANSA +#ifndef COMPLEX + FLOAT result; +#else + FLOAT _Complex result; +#endif +#endif + + a = (FLOAT *)args -> a; + x = (FLOAT *)args -> b; + y = (FLOAT *)args -> c; + + lda = args -> lda; + incx = args -> ldb; + ku = args -> ldc; + kl = args -> ldd; + + n_from = 0; + n_to = args -> n; + + if (range_m) y += *range_m * COMPSIZE; + + if (range_n) { + n_from = *(range_n + 0); + n_to = *(range_n + 1); + + a += n_from * lda * COMPSIZE; + } + + n_to = MIN(n_to, args -> m + ku); + +#ifdef TRANSA + if (incx != 1) { + COPY_K(args -> m, x, incx, buffer, 1); + + x = buffer; + buffer += ((COMPSIZE * args -> m + 1023) & ~1023); + } +#endif + + SCAL_K( +#ifndef TRANSA + args -> m, +#else + args -> n, +#endif + 0, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + y, 1, NULL, 0, NULL, 0); + + offset_u = ku - n_from; + offset_l = ku - n_from + args -> m; + +#ifndef TRANSA + x += n_from * incx * COMPSIZE; + y -= offset_u * COMPSIZE; +#else + x -= offset_u * COMPSIZE; + y += n_from * COMPSIZE; +#endif + + for (i = n_from; i < n_to; i++) { + + uu = MAX(offset_u, 0); + ll = MIN(offset_l, ku + kl + 1); + +#ifndef TRANSA + MYAXPY(ll - uu, 0, 0, + *(x + 0), +#ifdef COMPLEX +#ifndef XCONJ + *(x + 1), +#else + -*(x + 1), +#endif +#endif + a + uu * COMPSIZE, 1, y + uu * COMPSIZE, 1, NULL, 0); + + x += incx * COMPSIZE; +#else + result = MYDOT(ll - uu, a + uu * COMPSIZE, 1, x + uu * COMPSIZE, 1); + +#ifndef COMPLEX + *y = result; +#else + *(y + 0) += CREAL(result); +#ifndef XCONJ + *(y + 1) += CIMAG(result); +#else + *(y + 1) -= CIMAG(result); +#endif +#endif + + x += COMPSIZE; +#endif + + y += COMPSIZE; + + offset_u --; + offset_l --; + + a += lda * COMPSIZE; + } + + return 0; +} + +#ifndef COMPLEX +int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ +#else +int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ +#endif + + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range_m[MAX_CPU_NUMBER]; + BLASLONG range_n[MAX_CPU_NUMBER + 1]; + + BLASLONG width, i, num_cpu; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + args.m = m; + args.n = n; + + args.a = (void *)a; + args.b = (void *)x; + args.c = (void *)buffer; + + args.lda = lda; + args.ldb = incx; + args.ldc = ku; + args.ldd = kl; + + num_cpu = 0; + + range_n[0] = 0; + i = n; + + while (i > 0){ + + width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); + + if (width < 4) width = 4; + if (i < width) width = i; + + range_n[num_cpu + 1] = range_n[num_cpu] + width; + +#ifndef TRANSA + range_m[num_cpu] = num_cpu * ((m + 15) & ~15); +#else + range_m[num_cpu] = num_cpu * ((n + 15) & ~15); +#endif + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = gbmv_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[num_cpu]; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i -= width; + } + + if (num_cpu) { + queue[0].sa = NULL; +#ifndef TRANSA + queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE; +#else + queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE; +#endif + + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + for (i = 1; i < num_cpu; i ++) { + AXPYU_K( +#ifndef TRANSA + m, +#else + n, +#endif + 0, 0, +#ifndef COMPLEX + ONE, +#else + ONE, ZERO, +#endif + buffer + range_m[i] * COMPSIZE, 1, buffer, 1, NULL, 0); + } + + AXPYU_K( +#ifndef TRANSA + m, +#else + n, +#endif + 0, 0, +#ifndef COMPLEX + alpha, +#else + alpha[0], alpha[1], +#endif + buffer, 1, y, incy, NULL, 0); + + return 0; +} diff --git a/driver/level2/gemv_thread.c b/driver/level2/gemv_thread.c new file mode 100644 index 0000000..5f8abf2 --- /dev/null +++ b/driver/level2/gemv_thread.c @@ -0,0 +1,210 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +#ifndef TRANSA +#if !defined(CONJ) && !defined(XCONJ) +#define GEMV GEMV_N +#elif defined(CONJ) && !defined(XCONJ) +#define GEMV GEMV_R +#elif !defined(CONJ) && defined(XCONJ) +#define GEMV GEMV_O +#else +#define GEMV GEMV_S +#endif +#else +#if !defined(CONJ) && !defined(XCONJ) +#define GEMV GEMV_T +#elif defined(CONJ) && !defined(XCONJ) +#define GEMV GEMV_C +#elif !defined(CONJ) && defined(XCONJ) +#define GEMV GEMV_U +#else +#define GEMV GEMV_D +#endif +#endif + +static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ + + FLOAT *a, *x, *y; + BLASLONG lda, incx, incy; + BLASLONG m_from, m_to, n_from, n_to; + + a = (FLOAT *)args -> a; + x = (FLOAT *)args -> b; + y = (FLOAT *)args -> c; + + lda = args -> lda; + incx = args -> ldb; + incy = args -> ldc; + + m_from = 0; + m_to = args -> m; + + if (range_m) { + m_from = *(range_m + 0); + m_to = *(range_m + 1); + + a += m_from * COMPSIZE; +#ifndef TRANSA + y += m_from * incy * COMPSIZE; +#endif + } + + n_from = 0; + n_to = args -> n; + + if (range_n) { + n_from = *(range_n + 0); + n_to = *(range_n + 1); + + a += n_from * lda * COMPSIZE; +#ifdef TRANSA + y += n_from * incy * COMPSIZE; +#endif + } + + // fprintf(stderr, "M_From = %d M_To = %d N_From = %d N_To = %d\n", m_from, m_to, n_from, n_to); + + GEMV(m_to - m_from, n_to - n_from, 0, + *((FLOAT *)args -> alpha + 0), +#ifdef COMPLEX + *((FLOAT *)args -> alpha + 1), +#endif + a, lda, x, incx, y, incy, buffer); + + return 0; +} + +#ifndef COMPLEX +int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ +#else +int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ +#endif + + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range[MAX_CPU_NUMBER + 1]; + + BLASLONG width, i, num_cpu; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + args.m = m; + args.n = n; + + args.a = (void *)a; + args.b = (void *)x; + args.c = (void *)y; + + args.lda = lda; + args.ldb = incx; + args.ldc = incy; + +#ifndef COMPLEX + args.alpha = (void *)α +#else + args.alpha = (void *) alpha; +#endif + + num_cpu = 0; + + range[0] = 0; +#ifndef TRANSA + i = m; +#else + i = n; +#endif + + while (i > 0){ + + width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); + if (width < 4) width = 4; + if (i < width) width = i; + + range[num_cpu + 1] = range[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = gemv_kernel; + queue[num_cpu].args = &args; +#ifndef TRANSA + queue[num_cpu].range_m = &range[num_cpu]; + queue[num_cpu].range_n = NULL; +#else + queue[num_cpu].range_m = NULL; + queue[num_cpu].range_n = &range[num_cpu]; +#endif + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i -= width; + } + + if (num_cpu) { + queue[0].sa = NULL; + queue[0].sb = buffer; + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + return 0; +} diff --git a/driver/level2/ger_thread.c b/driver/level2/ger_thread.c new file mode 100644 index 0000000..9e2f520 --- /dev/null +++ b/driver/level2/ger_thread.c @@ -0,0 +1,197 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#include "symcopy.h" + +#ifndef XCONJ +#define AXPY AXPYU_K +#else +#define AXPY AXPYC_K +#endif + +static int ger_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ + + FLOAT *a, *x, *y; + FLOAT alpha_r; +#ifdef COMPLEX + FLOAT alpha_i; +#endif + BLASLONG lda, incx, incy; + BLASLONG m, n_from, n_to; + BLASLONG i; + + x = (FLOAT *)args -> a; + y = (FLOAT *)args -> b; + a = (FLOAT *)args -> c; + + incx = args -> lda; + incy = args -> ldb; + lda = args -> ldc; + + m = args -> m; + + alpha_r = *((FLOAT *)args -> alpha + 0); +#ifdef COMPLEX + alpha_i = *((FLOAT *)args -> alpha + 1); +#endif + + n_from = 0; + n_to = args -> n; + + if (range_n) { + n_from = *(range_n + 0); + n_to = *(range_n + 1); + + y += n_from * incy * COMPSIZE; + a += n_from * lda * COMPSIZE; + } + + if (incx != 1) { + COPY_K(m, x, incx, buffer, 1); + x = buffer; + } + + for (i = n_from; i < n_to; i ++) { + + AXPY(m, 0, 0, +#ifndef COMPLEX + alpha_r * *y, +#else +#ifndef CONJ + alpha_r * *(y + 0) - alpha_i * *(y + 1), alpha_r * *(y + 1) + alpha_i * *(y + 0), +#else + alpha_r * *(y + 0) + alpha_i * *(y + 1), - alpha_r * *(y + 1) + alpha_i * *(y + 0), +#endif +#endif + x, 1, a, 1, NULL, 0); + + y += incy * COMPSIZE; + a += lda * COMPSIZE; + } + + return 0; +} + +#ifndef COMPLEX +int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer, int nthreads){ +#else +int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer, int nthreads){ +#endif + + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range_n[MAX_CPU_NUMBER + 1]; + + BLASLONG width, i, num_cpu; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + args.m = m; + args.n = n; + + args.a = (void *)x; + args.b = (void *)y; + args.c = (void *)a; + + args.lda = incx; + args.ldb = incy; + args.ldc = lda; + +#ifndef COMPLEX + args.alpha = (void *)α +#else + args.alpha = (void *) alpha; +#endif + + num_cpu = 0; + + range_n[0] = 0; + i = n; + + while (i > 0){ + + width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); + if (width < 4) width = 4; + if (i < width) width = i; + + range_n[num_cpu + 1] = range_n[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = ger_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i -= width; + } + + if (num_cpu) { + queue[0].sa = NULL; + queue[0].sb = buffer; + + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + return 0; +} diff --git a/driver/level2/sbmv_k.c b/driver/level2/sbmv_k.c new file mode 100644 index 0000000..d0adc67 --- /dev/null +++ b/driver/level2/sbmv_k.c @@ -0,0 +1,97 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha, + FLOAT *a, BLASLONG lda, + FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){ + + BLASLONG i, length; + + FLOAT *X = x; + FLOAT *Y = y; + FLOAT *sbmvbuffer = (FLOAT *)buffer; + FLOAT *bufferY = sbmvbuffer; + FLOAT *bufferX = sbmvbuffer; + + if (incy != 1) { + Y = bufferY; + bufferX = (FLOAT *)(((BLASLONG)bufferY + n * sizeof(FLOAT) + 4095) & ~4095); + sbmvbuffer = bufferX; + COPY_K(n, y, incy, Y, 1); + } + + if (incx != 1) { + X = bufferX; + sbmvbuffer = (FLOAT *)(((BLASLONG)bufferX + n * sizeof(FLOAT) + 4095) & ~4095); + COPY_K(n, x, incx, X, 1); + } + + for (i = 0; i < n; i++) { + +#ifndef LOWER + length = i; + if (length > k) length = k; + + AXPYU_K(length + 1, 0, 0, + alpha * X[i], + a + k - length, 1, Y + i - length, 1, NULL, 0); + Y[i] += alpha * DOTU_K(length, a + k - length, 1, X + i - length, 1); +#else + length = k; + if (n - i - 1 < k) length = n - i - 1; + + AXPYU_K(length + 1, 0, 0, + alpha * X[i], + a, 1, Y + i, 1, NULL, 0); + Y[i] += alpha * DOTU_K(length, a + 1, 1, X + i + 1, 1); +#endif + + a += lda; + } + + if (incy != 1) { + COPY_K(n, Y, 1, y, incy); + } + + return 0; +} + diff --git a/driver/level2/sbmv_thread.c b/driver/level2/sbmv_thread.c new file mode 100644 index 0000000..222734d --- /dev/null +++ b/driver/level2/sbmv_thread.c @@ -0,0 +1,359 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +#if !defined(HEMV) && !defined(HEMVREV) +#define MYAXPY AXPYU_K +#define MYDOT DOTU_K +#elif defined HEMV +#define MYAXPY AXPYU_K +#define MYDOT DOTC_K +#else +#define MYAXPY AXPYC_K +#define MYDOT DOTU_K +#endif + +static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ + + FLOAT *a, *x, *y; + BLASLONG lda, incx; + BLASLONG n, k, n_from, n_to; + BLASLONG i, length; +#ifndef COMPLEX + FLOAT result; +#else + FLOAT _Complex result; +#endif + + a = (FLOAT *)args -> a; + x = (FLOAT *)args -> b; + y = (FLOAT *)args -> c; + + lda = args -> lda; + incx = args -> ldb; + + n = args -> n; + k = args -> k; + + n_from = 0; + n_to = n; + + if (range_m) { + n_from = *(range_m + 0); + n_to = *(range_m + 1); + + a += n_from * lda * COMPSIZE; + } + + if (range_n) y += *range_n * COMPSIZE; + + if (incx != 1) { + COPY_K(n, x, incx, buffer, 1); + + x = buffer; + buffer += ((COMPSIZE * n + 1023) & ~1023); + } + + SCAL_K(n, 0, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + y, 1, NULL, 0, NULL, 0); + + for (i = n_from; i < n_to; i++) { + +#ifndef LOWER + + length = i; + if (length > k) length = k; + + MYAXPY(length, 0, 0, + *(x + i * COMPSIZE + 0), +#ifdef COMPLEX + *(x + i * COMPSIZE + 1), +#endif + a + (k - length) * COMPSIZE, 1, y + (i - length) * COMPSIZE, 1, NULL, 0); + +#if !defined(HEMV) && !defined(HEMVREV) + result = MYDOT(length + 1, a + (k - length) * COMPSIZE, 1, x + (i - length) * COMPSIZE, 1); +#else + result = MYDOT(length , a + (k - length) * COMPSIZE, 1, x + (i - length) * COMPSIZE, 1); +#endif + +#ifndef COMPLEX + *(y + i * COMPSIZE + 0) += result; +#else +#if !defined(HEMV) && !defined(HEMVREV) + *(y + i * COMPSIZE + 0) += CREAL(result); + *(y + i * COMPSIZE + 1) += CIMAG(result); +#else + *(y + i * COMPSIZE + 0) += CREAL(result) + *(a + k * COMPSIZE) * *(x + i * COMPSIZE + 0); + *(y + i * COMPSIZE + 1) += CIMAG(result) + *(a + k * COMPSIZE) * *(x + i * COMPSIZE + 1); +#endif +#endif + +#else + + length = k; + if (n - i - 1 < k) length = n - i - 1; + + MYAXPY(length, 0, 0, + *(x + i * COMPSIZE + 0), +#ifdef COMPLEX + *(x + i * COMPSIZE + 1), +#endif + a + COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0); + +#if !defined(HEMV) && !defined(HEMVREV) + result = MYDOT(length + 1, a, 1, x + i * COMPSIZE, 1); +#else + result = MYDOT(length , a + COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1) ; +#endif + +#ifndef COMPLEX + *(y + i * COMPSIZE + 0) += result; +#else +#if !defined(HEMV) && !defined(HEMVREV) + *(y + i * COMPSIZE + 0) += CREAL(result); + *(y + i * COMPSIZE + 1) += CIMAG(result); +#else + *(y + i * COMPSIZE + 0) += CREAL(result) + *a * *(x + i * COMPSIZE + 0); + *(y + i * COMPSIZE + 1) += CIMAG(result) + *a * *(x + i * COMPSIZE + 1); +#endif +#endif + +#endif + + a += lda * COMPSIZE; + } + + return 0; +} + +#ifndef COMPLEX +int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ +#else +int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ +#endif + + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range_m[MAX_CPU_NUMBER + 1]; + BLASLONG range_n[MAX_CPU_NUMBER]; + + BLASLONG width, i, num_cpu; + double dnum; + int mask = 7; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + args.n = n; + args.k = k; + + args.a = (void *)a; + args.b = (void *)x; + args.c = (void *)buffer; + + args.lda = lda; + args.ldb = incx; + args.ldc = incy; + + dnum = (double)n * (double)n / (double)nthreads; + num_cpu = 0; + + if (n < 2 * k) { + +#ifndef LOWER + + range_m[MAX_CPU_NUMBER] = n; + i = 0; + + while (i < n){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(n - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = n - i; + } + + if (width < 16) width = 16; + if (width > n - i) width = n - i; + + } else { + width = n - i; + } + + range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; + range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = sbmv_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#else + + range_m[0] = 0; + i = 0; + + while (i < n){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(n - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = n - i; + } + + if (width < 16) width = 16; + if (width > n - i) width = n - i; + + } else { + width = n - i; + } + + range_m[num_cpu + 1] = range_m[num_cpu] + width; + range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = sbmv_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[num_cpu]; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#endif + + } else { + + range_m[0] = 0; + i = n; + + while (i > 0){ + + width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); + + if (width < 4) width = 4; + if (i < width) width = i; + + range_m[num_cpu + 1] = range_m[num_cpu] + width; + + range_n[num_cpu] = num_cpu * ((n + 15) & ~15); + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = sbmv_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[num_cpu]; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i -= width; + } + } + + if (num_cpu) { + queue[0].sa = NULL; + queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE; + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + for (i = 1; i < num_cpu; i ++) { + AXPYU_K(n, 0, 0, +#ifndef COMPLEX + ONE, +#else + ONE, ZERO, +#endif + buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); + } + + AXPYU_K(n, 0, 0, +#ifndef COMPLEX + alpha, +#else + alpha[0], alpha[1], +#endif + buffer, 1, y, incy, NULL, 0); + + return 0; +} diff --git a/driver/level2/spmv_k.c b/driver/level2/spmv_k.c new file mode 100644 index 0000000..07ec660 --- /dev/null +++ b/driver/level2/spmv_k.c @@ -0,0 +1,86 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha, FLOAT *a, + FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){ + + BLASLONG i; + FLOAT *X = x; + FLOAT *Y = y; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *bufferY = gemvbuffer; + FLOAT *bufferX = gemvbuffer; + + if (incy != 1) { + Y = bufferY; + bufferX = (FLOAT *)(((BLASLONG)bufferY + m * sizeof(FLOAT) + 4095) & ~4095); + gemvbuffer = bufferX; + COPY_K(m, y, incy, Y, 1); + } + + if (incx != 1) { + X = bufferX; + gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + m * sizeof(FLOAT) + 4095) & ~4095); + COPY_K(m, x, incx, X, 1); + } + + for (i = 0; i < m; i++) { +#ifndef LOWER + if (i > 0) Y[i] += alpha * DOTU_K(i, a, 1, X, 1); + AXPYU_K(i + 1, 0, 0, alpha * X[i], a, 1, Y, 1, NULL, 0); + a += i + 1; + +#else + Y[i] += alpha * DOTU_K(m - i, a + i, 1, X + i, 1); + if (m - i > 1) AXPYU_K(m - i - 1, 0, 0, alpha * X[i], + a + i + 1, 1, Y + i + 1, 1, NULL, 0); + a += m - i - 1; +#endif + } + + if (incy != 1) { + COPY_K(m, Y, 1, y, incy); + } + + return 0; +} + diff --git a/driver/level2/spmv_thread.c b/driver/level2/spmv_thread.c new file mode 100644 index 0000000..7717bbf --- /dev/null +++ b/driver/level2/spmv_thread.c @@ -0,0 +1,345 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#include "symcopy.h" + +#if! defined(HEMV) && !defined(HEMVREV) +#define MYDOT DOTU_K +#define MYAXPY AXPYU_K +#elif defined HEMV +#define MYDOT DOTC_K +#define MYAXPY AXPYU_K +#else +#define MYDOT DOTU_K +#define MYAXPY AXPYC_K +#endif + +static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ + + FLOAT *a, *x, *y; + BLASLONG incx, incy; + BLASLONG m_from, m_to, i; +#ifndef COMPLEX + FLOAT result; +#else + FLOAT _Complex result; +#endif + + a = (FLOAT *)args -> a; + x = (FLOAT *)args -> b; + y = (FLOAT *)args -> c; + + incx = args -> ldb; + incy = args -> ldc; + + m_from = 0; + m_to = args -> m; + + if (range_m) { + m_from = *(range_m + 0); + m_to = *(range_m + 1); + } + + if (range_n) y += *range_n * COMPSIZE; + + if (incx != 1) { +#ifndef LOWER + COPY_K(m_to, x, incx, buffer, 1); +#else + COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1); +#endif + + x = buffer; + } + +#ifndef LOWER + SCAL_K(m_to, 0, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + y, 1, NULL, 0, NULL, 0); +#else + SCAL_K(args -> m - m_from, 0, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); +#endif + +#ifndef LOWER + a += (m_from + 1) * m_from / 2 * COMPSIZE; +#else + a += (2 * args -> m - m_from - 1) * m_from / 2 * COMPSIZE; +#endif + + for (i = m_from; i < m_to; i++) { +#ifndef LOWER + +#if !defined(HEMV) && !defined(HEMVREV) + result = MYDOT(i + 1, a, 1, x, 1); +#else + result = MYDOT(i , a, 1, x, 1); +#endif + +#ifndef COMPLEX + *(y + i * COMPSIZE) += result; +#else +#if !defined(HEMV) && !defined(HEMVREV) + *(y + i * COMPSIZE + 0) += CREAL(result); + *(y + i * COMPSIZE + 1) += CIMAG(result); +#else + *(y + i * COMPSIZE + 0) += CREAL(result) + *(a + i * COMPSIZE) * *(x + i * COMPSIZE + 0); + *(y + i * COMPSIZE + 1) += CIMAG(result) + *(a + i * COMPSIZE) * *(x + i * COMPSIZE + 1); +#endif +#endif + + MYAXPY(i, 0, 0, + *(x + i * COMPSIZE + 0), +#ifdef COMPLEX + *(x + i * COMPSIZE + 1), +#endif + a, 1, y, 1, NULL, 0); + + a += (i + 1) * COMPSIZE; + +#else +#if !defined(HEMV) && !defined(HEMVREV) + result = MYDOT(args -> m - i , a + i * COMPSIZE, 1, x + i * COMPSIZE, 1); +#else + result = MYDOT(args -> m - i - 1, a + (i + 1) * COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1); +#endif + +#ifndef COMPLEX + *(y + i * COMPSIZE) += result; +#else +#if !defined(HEMV) && !defined(HEMVREV) + *(y + i * COMPSIZE + 0) += CREAL(result); + *(y + i * COMPSIZE + 1) += CIMAG(result); +#else + *(y + i * COMPSIZE + 0) += CREAL(result) + *(a + i * COMPSIZE) * *(x + i * COMPSIZE + 0); + *(y + i * COMPSIZE + 1) += CIMAG(result) + *(a + i * COMPSIZE) * *(x + i * COMPSIZE + 1); +#endif +#endif + + MYAXPY(args -> m - i - 1, 0, 0, + *(x + i * COMPSIZE + 0), +#ifdef COMPLEX + *(x + i * COMPSIZE + 1), +#endif + a + (i + 1) * COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0); + + a += (args -> m - i - 1) * COMPSIZE; + +#endif + } + + return 0; +} + +#ifndef COMPLEX +int CNAME(BLASLONG m, FLOAT alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ +#else +int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ +#endif + + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range_m[MAX_CPU_NUMBER + 1]; + BLASLONG range_n[MAX_CPU_NUMBER]; + + BLASLONG width, i, num_cpu; + + double dnum; + int mask = 7; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + args.m = m; + + args.a = (void *)a; + args.b = (void *)x; + args.c = (void *)buffer; + + args.ldb = incx; + args.ldc = incy; + + dnum = (double)m * (double)m / (double)nthreads; + num_cpu = 0; + +#ifndef LOWER + + range_m[MAX_CPU_NUMBER] = m; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 16) width = 16; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; + range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = spmv_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#else + + range_m[0] = 0; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 16) width = 16; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[num_cpu + 1] = range_m[num_cpu] + width; + range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = spmv_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[num_cpu]; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#endif + + if (num_cpu) { + queue[0].sa = NULL; + queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE; + + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + for (i = 1; i < num_cpu; i ++) { + +#ifndef LOWER + + AXPYU_K(range_m[MAX_CPU_NUMBER - i], 0, 0, ONE, +#ifdef COMPLEX + ZERO, +#endif + buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); + +#else + + AXPYU_K(m - range_m[i], 0, 0, ONE, +#ifdef COMPLEX + ZERO, +#endif + buffer + (range_n[i] + range_m[i]) * COMPSIZE, 1, buffer + range_m[i] * COMPSIZE, 1, NULL, 0); + +#endif + + } + + AXPYU_K(m, 0, 0, +#ifndef COMPLEX + alpha, +#else + alpha[0], alpha[1], +#endif + buffer, 1, y, incy, NULL, 0); + + return 0; +} diff --git a/driver/level2/spr2_k.c b/driver/level2/spr2_k.c new file mode 100644 index 0000000..58e14eb --- /dev/null +++ b/driver/level2/spr2_k.c @@ -0,0 +1,75 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT *x, BLASLONG incx, + FLOAT *y, BLASLONG incy, FLOAT *a, FLOAT *buffer){ + + BLASLONG i; + FLOAT *X, *Y; + + X = x; + Y = y; + + if (incx != 1) { + COPY_K(m, x, incx, buffer, 1); + X = buffer; + } + + if (incy != 1) { + COPY_K(m, y, incy, (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)), 1); + Y = (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)); + } + + for (i = 0; i < m; i++){ +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, alpha_r * X[i], Y, 1, a, 1, NULL, 0); + AXPYU_K(i + 1, 0, 0, alpha_r * Y[i], X, 1, a, 1, NULL, 0); + a += i + 1; +#else + AXPYU_K(m - i, 0, 0, alpha_r * X[i], Y + i, 1, a, 1, NULL, 0); + AXPYU_K(m - i, 0, 0, alpha_r * Y[i], X + i, 1, a, 1, NULL, 0); + a += m - i; +#endif + } + + return 0; +} diff --git a/driver/level2/spr2_thread.c b/driver/level2/spr2_thread.c new file mode 100644 index 0000000..b20eb05 --- /dev/null +++ b/driver/level2/spr2_thread.c @@ -0,0 +1,356 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ + + FLOAT *a, *x, *y; + BLASLONG lda, incx, incy; + BLASLONG i, m_from, m_to; + FLOAT alpha_r; +#ifdef COMPLEX + FLOAT alpha_i; +#endif + + x = (FLOAT *)args -> a; + y = (FLOAT *)args -> b; + a = (FLOAT *)args -> c; + + incx = args -> lda; + incy = args -> ldb; + lda = args -> ldc; + + alpha_r = *((FLOAT *)args -> alpha + 0); +#ifdef COMPLEX + alpha_i = *((FLOAT *)args -> alpha + 1); +#endif + + m_from = 0; + m_to = args -> m; + + if (range_m) { + m_from = *(range_m + 0); + m_to = *(range_m + 1); + } + + if (incx != 1) { +#ifndef LOWER + COPY_K(m_to, x, incx, buffer, 1); +#else + COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1); +#endif + x = buffer; + buffer += ((COMPSIZE * args -> m + 1023) & ~1023); + } + + if (incy != 1) { +#ifndef LOWER + COPY_K(m_to, y, incy, buffer, 1); +#else + COPY_K(args -> m - m_from, y + m_from * incy * COMPSIZE, incy, buffer + m_from * COMPSIZE, 1); +#endif + y = buffer; + } + +#ifndef LOWER + a += (m_from + 1) * m_from / 2 * COMPSIZE; +#else + a += (2 * args -> m - m_from + 1) * m_from / 2 * COMPSIZE; +#endif + + for (i = m_from; i < m_to; i++){ +#if !defined(HEMV) && !defined(HEMVREV) +#ifndef COMPLEX + if (x[i] != ZERO) { +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, alpha_r * x[i], y, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, alpha_r * x[i], y + i, 1, a, 1, NULL, 0); +#endif + } + if (y[i] != ZERO) { +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, alpha_r * y[i], x, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, alpha_r * y[i], x + i, 1, a, 1, NULL, 0); +#endif + } +#else + if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], + y, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], + y + i * COMPSIZE, 1, a, 1, NULL, 0); +#endif + } + if ((y[i * COMPSIZE + 0] != ZERO) || (y[i * COMPSIZE + 1] != ZERO)) { +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * y[i * COMPSIZE + 0] - alpha_i * y[i * COMPSIZE + 1], + alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1], + x, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, + alpha_r * y[i * COMPSIZE + 0] - alpha_i * y[i * COMPSIZE + 1], + alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1], + x + i * COMPSIZE, 1, a, 1, NULL, 0); +#endif + } +#endif +#else + if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { +#ifndef HEMVREV +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + - alpha_i * x[i * COMPSIZE + 0] - alpha_r * x[i * COMPSIZE + 1], + y, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + - alpha_i * x[i * COMPSIZE + 0] - alpha_r * x[i * COMPSIZE + 1], + y + i * COMPSIZE, 1, a, 1, NULL, 0); +#endif +#else +#ifndef LOWER + AXPYC_K(i + 1, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], + y, 1, a, 1, NULL, 0); +#else + AXPYC_K(args -> m - i, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], + y + i * COMPSIZE, 1, a, 1, NULL, 0); +#endif +#endif + } + if ((y[i * COMPSIZE + 0] != ZERO) || (y[i * COMPSIZE + 1] != ZERO)) { +#ifndef HEMVREV +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1], + alpha_i * y[i * COMPSIZE + 0] - alpha_r * y[i * COMPSIZE + 1], + x, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, + alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1], + alpha_i * y[i * COMPSIZE + 0] - alpha_r * y[i * COMPSIZE + 1], + x + i * COMPSIZE, 1, a, 1, NULL, 0); +#endif +#else +#ifndef LOWER + AXPYC_K(i + 1, 0, 0, + alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1], + - alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1], + x, 1, a, 1, NULL, 0); +#else + AXPYC_K(args -> m - i, 0, 0, + alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1], + - alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1], + x + i * COMPSIZE, 1, a, 1, NULL, 0); +#endif +#endif + } +#ifndef LOWER + a[i * COMPSIZE + 1] = ZERO; +#else + a[ 1] = ZERO; +#endif +#endif + +#ifndef LOWER + a += (i + 1) * COMPSIZE; +#else + a += (args -> m - i) * COMPSIZE; +#endif + } + + return 0; +} + +#ifndef COMPLEX +int CNAME(BLASLONG m, FLOAT alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, FLOAT *buffer, int nthreads){ +#else +int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, FLOAT *buffer, int nthreads){ +#endif + + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range_m[MAX_CPU_NUMBER + 1]; + + BLASLONG width, i, num_cpu; + + double dnum; + int mask = 7; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + args.m = m; + + args.a = (void *)x; + args.b = (void *)y; + args.c = (void *)a; + + args.lda = incx; + args.ldb = incy; +#ifndef COMPLEX + args.alpha = (void *)α +#else + args.alpha = (void *)alpha; +#endif + + dnum = (double)m * (double)m / (double)nthreads; + num_cpu = 0; + +#ifndef LOWER + + range_m[MAX_CPU_NUMBER] = m; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 16) width = 16; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = syr_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; + queue[num_cpu].range_n = NULL; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#else + + range_m[0] = 0; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 16) width = 16; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[num_cpu + 1] = range_m[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = syr_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[num_cpu]; + queue[num_cpu].range_n = NULL; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#endif + + if (num_cpu) { + queue[0].sa = NULL; + queue[0].sb = buffer; + + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + return 0; +} diff --git a/driver/level2/spr_k.c b/driver/level2/spr_k.c new file mode 100644 index 0000000..996d925 --- /dev/null +++ b/driver/level2/spr_k.c @@ -0,0 +1,69 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha_r, + FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *buffer){ + + BLASLONG i; + FLOAT *X; + + X = x; + + if (incx != 1) { + COPY_K(m, x, incx, buffer, 1); + X = buffer; + } + + for (i = 0; i < m; i++){ +#ifndef LOWER + if (X[i] != ZERO) { + AXPYU_K(i + 1, 0, 0, alpha_r * X[i], X, 1, a, 1, NULL, 0); + } + a += i + 1; +#else + if (X[i] != ZERO) { + AXPYU_K(m - i, 0, 0, alpha_r * X[i], X + i, 1, a, 1, NULL, 0); + } + a += m - i; +#endif + } + + return 0; +} diff --git a/driver/level2/spr_thread.c b/driver/level2/spr_thread.c new file mode 100644 index 0000000..f889506 --- /dev/null +++ b/driver/level2/spr_thread.c @@ -0,0 +1,291 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ + + FLOAT *a, *x; + BLASLONG incx; + BLASLONG i, m_from, m_to; + FLOAT alpha_r; +#if defined(COMPLEX) && !defined(HER) && !defined(HERREV) + FLOAT alpha_i; +#endif + + x = (FLOAT *)args -> a; + a = (FLOAT *)args -> b; + + incx = args -> lda; + + alpha_r = *((FLOAT *)args -> alpha + 0); +#if defined(COMPLEX) && !defined(HER) && !defined(HERREV) + alpha_i = *((FLOAT *)args -> alpha + 1); +#endif + + m_from = 0; + m_to = args -> m; + + if (range_m) { + m_from = *(range_m + 0); + m_to = *(range_m + 1); + } + + if (incx != 1) { +#ifndef LOWER + COPY_K(m_to, x, incx, buffer, 1); +#else + COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1); +#endif + x = buffer; + } + +#ifndef LOWER + a += (m_from + 1) * m_from / 2 * COMPSIZE; +#else + a += (2 * args -> m - m_from + 1) * m_from / 2 * COMPSIZE; +#endif + + for (i = m_from; i < m_to; i++){ +#if !defined(HEMV) && !defined(HEMVREV) +#ifndef COMPLEX + if (x[i] != ZERO) { +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, alpha_r * x[i], x, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, alpha_r * x[i], x + i, 1, a, 1, NULL, 0); +#endif + } +#else + if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], + x, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], + x + i * COMPSIZE, 1, a, 1, NULL, 0); +#endif + } +#endif +#else + if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { +#ifndef HEMVREV +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * x[i * COMPSIZE + 0], - alpha_r * x[i * COMPSIZE + 1], + x, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, + alpha_r * x[i * COMPSIZE + 0], - alpha_r * x[i * COMPSIZE + 1], + x + i * COMPSIZE, 1, a, 1, NULL, 0); +#endif +#else +#ifndef LOWER + AXPYC_K(i + 1, 0, 0, + alpha_r * x[i * COMPSIZE + 0], alpha_r * x[i * COMPSIZE + 1], + x, 1, a, 1, NULL, 0); +#else + AXPYC_K(args -> m - i, 0, 0, + alpha_r * x[i * COMPSIZE + 0], alpha_r * x[i * COMPSIZE + 1], + x + i * COMPSIZE, 1, a, 1, NULL, 0); +#endif +#endif + } +#ifndef LOWER + a[i * COMPSIZE + 1] = ZERO; +#else + a[ 1] = ZERO; +#endif +#endif + +#ifndef LOWER + a += (i + 1) * COMPSIZE; +#else + a += (args -> m - i) * COMPSIZE; +#endif + } + + return 0; +} + +#if !defined(COMPLEX) || defined(HEMV) || defined(HEMVREV) +int CNAME(BLASLONG m, FLOAT alpha, FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *buffer, int nthreads){ +#else +int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *buffer, int nthreads){ +#endif + + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range_m[MAX_CPU_NUMBER + 1]; + + BLASLONG width, i, num_cpu; + + double dnum; + int mask = 7; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + args.m = m; + + args.a = (void *)x; + args.b = (void *)a; + + args.lda = incx; + +#if !defined(COMPLEX) || defined(HEMV) || defined(HEMVREV) + args.alpha = (void *)α +#else + args.alpha = (void *)alpha; +#endif + + dnum = (double)m * (double)m / (double)nthreads; + num_cpu = 0; + +#ifndef LOWER + + range_m[MAX_CPU_NUMBER] = m; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 16) width = 16; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = syr_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; + queue[num_cpu].range_n = NULL; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#else + + range_m[0] = 0; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 16) width = 16; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[num_cpu + 1] = range_m[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = syr_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[num_cpu]; + queue[num_cpu].range_n = NULL; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#endif + + if (num_cpu) { + queue[0].sa = NULL; + queue[0].sb = buffer; + + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + return 0; +} diff --git a/driver/level2/symv_thread.c b/driver/level2/symv_thread.c new file mode 100644 index 0000000..cf0e2d0 --- /dev/null +++ b/driver/level2/symv_thread.c @@ -0,0 +1,295 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#include "symcopy.h" + +#if! defined(HEMV) && !defined(HEMVREV) +#define MYSYMV_U SYMV_U +#define MYSYMV_L SYMV_L +#elif defined HEMV +#define MYSYMV_U HEMV_U +#define MYSYMV_L HEMV_L +#else +#define MYSYMV_U HEMV_V +#define MYSYMV_L HEMV_M +#endif + +static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ + + FLOAT *a, *x, *y; + BLASLONG lda, incx, incy; + BLASLONG m_from, m_to; + + a = (FLOAT *)args -> a; + x = (FLOAT *)args -> b; + y = (FLOAT *)args -> c; + + lda = args -> lda; + incx = args -> ldb; + incy = args -> ldc; + + m_from = 0; + m_to = args -> m; + + if (range_m) { + m_from = *(range_m + 0); + m_to = *(range_m + 1); + } + + if (range_n) y += *range_n * COMPSIZE; + +#ifndef LOWER + + SCAL_K(m_to, 0, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + y, 1, NULL, 0, NULL, 0); + + MYSYMV_U (m_to, m_to - m_from, ONE, +#ifdef COMPLEX + ZERO, +#endif + a, lda, x, incx, y, 1, buffer); + +#else + + SCAL_K(args -> m - m_from, 0, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); + + MYSYMV_L (args -> m - m_from, m_to - m_from, ONE, +#ifdef COMPLEX + ZERO, +#endif + a + m_from * (lda + 1) * COMPSIZE, lda, x + m_from * incx * COMPSIZE, incx, y + m_from * COMPSIZE, 1, buffer); +#endif + + return 0; +} + +#ifndef COMPLEX +int CNAME(BLASLONG m, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ +#else +int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ +#endif + + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range_m[MAX_CPU_NUMBER + 1]; + BLASLONG range_n[MAX_CPU_NUMBER]; + + BLASLONG width, i, num_cpu; + + double dnum; + int mask = 3; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + args.m = m; + + args.a = (void *)a; + args.b = (void *)x; + args.c = (void *)buffer; + + args.lda = lda; + args.ldb = incx; + args.ldc = incy; + + dnum = (double)m * (double)m / (double)nthreads; + num_cpu = 0; + +#ifndef LOWER + + range_m[0] = 0; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)i; + width = ((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask; + + if (width < 4) width = 4; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[num_cpu + 1] = range_m[num_cpu] + width; + range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); + + queue[MAX_CPU_NUMBER - num_cpu - 1].mode = mode; + queue[MAX_CPU_NUMBER - num_cpu - 1].routine = symv_kernel; + queue[MAX_CPU_NUMBER - num_cpu - 1].args = &args; + queue[MAX_CPU_NUMBER - num_cpu - 1].range_m = &range_m[num_cpu]; + queue[MAX_CPU_NUMBER - num_cpu - 1].range_n = &range_n[num_cpu]; + queue[MAX_CPU_NUMBER - num_cpu - 1].sa = NULL; + queue[MAX_CPU_NUMBER - num_cpu - 1].sb = NULL; + queue[MAX_CPU_NUMBER - num_cpu - 1].next = &queue[MAX_CPU_NUMBER - num_cpu]; + + num_cpu ++; + i += width; + } + + if (num_cpu) { + queue[MAX_CPU_NUMBER - num_cpu].sa = NULL; + queue[MAX_CPU_NUMBER - num_cpu].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE; + + queue[MAX_CPU_NUMBER - 1].next = NULL; + + exec_blas(num_cpu, &queue[MAX_CPU_NUMBER - num_cpu]); + } + +#else + + range_m[0] = 0; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 4) width = 4; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[num_cpu + 1] = range_m[num_cpu] + width; + range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = symv_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[num_cpu]; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + + if (num_cpu) { + queue[0].sa = NULL; + queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE; + + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + +#endif + +#ifndef LOWER + + for (i = 0; i < num_cpu - 1; i ++) { + + AXPYU_K(range_m[i + 1], 0, 0, ONE, +#ifdef COMPLEX + ZERO, +#endif + buffer + range_n[i] * COMPSIZE, 1, buffer + range_n[num_cpu - 1] * COMPSIZE, 1, NULL, 0); + } + + AXPYU_K(m, 0, 0, +#ifndef COMPLEX + alpha, +#else + alpha[0], alpha[1], +#endif + buffer + range_n[num_cpu - 1] * COMPSIZE, 1, y, incy, NULL, 0); + +#else + + for (i = 1; i < num_cpu; i ++) { + + AXPYU_K(m - range_m[i], 0, 0, ONE, +#ifdef COMPLEX + ZERO, +#endif + buffer + (range_n[i] + range_m[i]) * COMPSIZE, 1, buffer + range_m[i] * COMPSIZE, 1, NULL, 0); + } + + AXPYU_K(m, 0, 0, +#ifndef COMPLEX + alpha, +#else + alpha[0], alpha[1], +#endif + buffer, 1, y, incy, NULL, 0); + +#endif + + return 0; +} diff --git a/driver/level2/syr2_k.c b/driver/level2/syr2_k.c new file mode 100644 index 0000000..bca8b3b --- /dev/null +++ b/driver/level2/syr2_k.c @@ -0,0 +1,75 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT *x, BLASLONG incx, + FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer){ + + BLASLONG i; + FLOAT *X, *Y; + + X = x; + Y = y; + + if (incx != 1) { + COPY_K(m, x, incx, buffer, 1); + X = buffer; + } + + if (incy != 1) { + COPY_K(m, y, incy, (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)), 1); + Y = (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)); + } + + for (i = 0; i < m; i++){ +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, alpha_r * X[i], Y, 1, a, 1, NULL, 0); + AXPYU_K(i + 1, 0, 0, alpha_r * Y[i], X, 1, a, 1, NULL, 0); + a += lda; +#else + AXPYU_K(m - i, 0, 0, alpha_r * X[i], Y + i, 1, a, 1, NULL, 0); + AXPYU_K(m - i, 0, 0, alpha_r * Y[i], X + i, 1, a, 1, NULL, 0); + a += 1 + lda; +#endif + } + + return 0; +} diff --git a/driver/level2/syr2_thread.c b/driver/level2/syr2_thread.c new file mode 100644 index 0000000..130a62d --- /dev/null +++ b/driver/level2/syr2_thread.c @@ -0,0 +1,345 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ + + FLOAT *a, *x, *y; + BLASLONG lda, incx, incy; + BLASLONG i, m_from, m_to; + FLOAT alpha_r; +#ifdef COMPLEX + FLOAT alpha_i; +#endif + + x = (FLOAT *)args -> a; + y = (FLOAT *)args -> b; + a = (FLOAT *)args -> c; + + incx = args -> lda; + incy = args -> ldb; + lda = args -> ldc; + + alpha_r = *((FLOAT *)args -> alpha + 0); +#ifdef COMPLEX + alpha_i = *((FLOAT *)args -> alpha + 1); +#endif + + m_from = 0; + m_to = args -> m; + + if (range_m) { + m_from = *(range_m + 0); + m_to = *(range_m + 1); + } + + if (incx != 1) { +#ifndef LOWER + COPY_K(m_to, x, incx, buffer, 1); +#else + COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1); +#endif + x = buffer; + buffer += ((COMPSIZE * args -> m + 1023) & ~1023); + } + + if (incy != 1) { +#ifndef LOWER + COPY_K(m_to, y, incy, buffer, 1); +#else + COPY_K(args -> m - m_from, y + m_from * incy * COMPSIZE, incy, buffer + m_from * COMPSIZE, 1); +#endif + y = buffer; + } + + a += m_from * lda * COMPSIZE; + + for (i = m_from; i < m_to; i++){ +#if !defined(HER) && !defined(HERREV) +#ifndef COMPLEX + if (x[i] != ZERO) { +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, alpha_r * x[i], y, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, alpha_r * x[i], y + i, 1, a + i, 1, NULL, 0); +#endif + } + if (y[i] != ZERO) { +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, alpha_r * y[i], x, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, alpha_r * y[i], x + i, 1, a + i, 1, NULL, 0); +#endif + } +#else + if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], + y, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], + y + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0); +#endif + } + if ((y[i * COMPSIZE + 0] != ZERO) || (y[i * COMPSIZE + 1] != ZERO)) { +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * y[i * COMPSIZE + 0] - alpha_i * y[i * COMPSIZE + 1], + alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1], + x, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, + alpha_r * y[i * COMPSIZE + 0] - alpha_i * y[i * COMPSIZE + 1], + alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1], + x + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0); +#endif + } +#endif +#else + if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { +#ifndef HERREV +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + - alpha_i * x[i * COMPSIZE + 0] - alpha_r * x[i * COMPSIZE + 1], + y, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + - alpha_i * x[i * COMPSIZE + 0] - alpha_r * x[i * COMPSIZE + 1], + y + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0); +#endif +#else +#ifndef LOWER + AXPYC_K(i + 1, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], + y, 1, a, 1, NULL, 0); +#else + AXPYC_K(args -> m - i, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], + y + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0); +#endif +#endif + } + if ((y[i * COMPSIZE + 0] != ZERO) || (y[i * COMPSIZE + 1] != ZERO)) { +#ifndef HERREV +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1], + alpha_i * y[i * COMPSIZE + 0] - alpha_r * y[i * COMPSIZE + 1], + x, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, + alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1], + alpha_i * y[i * COMPSIZE + 0] - alpha_r * y[i * COMPSIZE + 1], + x + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0); +#endif +#else +#ifndef LOWER + AXPYC_K(i + 1, 0, 0, + alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1], + - alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1], + x, 1, a, 1, NULL, 0); +#else + AXPYC_K(args -> m - i, 0, 0, + alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1], + - alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1], + x + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0); +#endif +#endif + } + a[i * COMPSIZE + 1] = ZERO; +#endif + a += lda * COMPSIZE; + + } + + return 0; +} + +#ifndef COMPLEX +int CNAME(BLASLONG m, FLOAT alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer, int nthreads){ +#else +int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer, int nthreads){ +#endif + + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range_m[MAX_CPU_NUMBER + 1]; + + BLASLONG width, i, num_cpu; + + double dnum; + int mask = 7; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + args.m = m; + + args.a = (void *)x; + args.b = (void *)y; + args.c = (void *)a; + + args.lda = incx; + args.ldb = incy; + args.ldc = lda; +#ifndef COMPLEX + args.alpha = (void *)α +#else + args.alpha = (void *)alpha; +#endif + + dnum = (double)m * (double)m / (double)nthreads; + num_cpu = 0; + +#ifndef LOWER + + range_m[MAX_CPU_NUMBER] = m; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 16) width = 16; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = syr_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; + queue[num_cpu].range_n = NULL; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#else + + range_m[0] = 0; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 16) width = 16; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[num_cpu + 1] = range_m[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = syr_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[num_cpu]; + queue[num_cpu].range_n = NULL; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#endif + + if (num_cpu) { + queue[0].sa = NULL; + queue[0].sb = buffer; + + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + return 0; +} diff --git a/driver/level2/syr_k.c b/driver/level2/syr_k.c new file mode 100644 index 0000000..a0d9a2f --- /dev/null +++ b/driver/level2/syr_k.c @@ -0,0 +1,69 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha_r, + FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG lda, FLOAT *buffer){ + + BLASLONG i; + FLOAT *X; + + X = x; + + if (incx != 1) { + COPY_K(m, x, incx, buffer, 1); + X = buffer; + } + + for (i = 0; i < m; i++){ +#ifndef LOWER + if (X[i] != ZERO) { + AXPYU_K(i + 1, 0, 0, alpha_r * X[i], X, 1, a, 1, NULL, 0); + } + a += lda; +#else + if (X[i] != ZERO) { + AXPYU_K(m - i, 0, 0, alpha_r * X[i], X + i, 1, a, 1, NULL, 0); + } + a += 1 + lda; +#endif + } + + return 0; +} diff --git a/driver/level2/syr_thread.c b/driver/level2/syr_thread.c new file mode 100644 index 0000000..250e8c0 --- /dev/null +++ b/driver/level2/syr_thread.c @@ -0,0 +1,283 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ + + FLOAT *a, *x; + BLASLONG lda, incx; + BLASLONG i, m_from, m_to; + FLOAT alpha_r; +#if defined(COMPLEX) && !defined(HER) && !defined(HERREV) + FLOAT alpha_i; +#endif + + + x = (FLOAT *)args -> a; + a = (FLOAT *)args -> b; + + incx = args -> lda; + lda = args -> ldb; + + alpha_r = *((FLOAT *)args -> alpha + 0); +#if defined(COMPLEX) && !defined(HER) && !defined(HERREV) + alpha_i = *((FLOAT *)args -> alpha + 1); +#endif + + m_from = 0; + m_to = args -> m; + + if (range_m) { + m_from = *(range_m + 0); + m_to = *(range_m + 1); + } + + if (incx != 1) { +#ifndef LOWER + COPY_K(m_to, x, incx, buffer, 1); +#else + COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1); +#endif + + x = buffer; + } + + a += m_from * lda * COMPSIZE; + + for (i = m_from; i < m_to; i++){ +#if !defined(HER) && !defined(HERREV) +#ifndef COMPLEX + if (x[i * COMPSIZE] != ZERO) { +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, alpha_r * x[i], x, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, alpha_r * x[i], x + i, 1, a + i, 1, NULL, 0); +#endif + } +#else + if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], + x, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], + x + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0); +#endif + } +#endif +#else + if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { +#ifndef HERREV +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * x[i * COMPSIZE + 0], -alpha_r * x[i * COMPSIZE + 1], + x, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, + alpha_r * x[i * COMPSIZE + 0], -alpha_r * x[i * COMPSIZE + 1], + x + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0); +#endif +#else +#ifndef LOWER + AXPYC_K(i + 1, 0, 0, + alpha_r * x[i * COMPSIZE + 0], alpha_r * x[i * COMPSIZE + 1], + x, 1, a, 1, NULL, 0); +#else + AXPYC_K(args -> m - i, 0, 0, + alpha_r * x[i * COMPSIZE + 0], alpha_r * x[i * COMPSIZE + 1], + x + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0); +#endif +#endif + + } + a[i * COMPSIZE + 1] = ZERO; +#endif + a += lda * COMPSIZE; + + } + + return 0; +} + +#if !defined(COMPLEX) || defined(HER) || defined(HERREV) +int CNAME(BLASLONG m, FLOAT alpha, FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG lda, FLOAT *buffer, int nthreads){ +#else +int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG lda, FLOAT *buffer, int nthreads){ +#endif + + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range_m[MAX_CPU_NUMBER + 1]; + + BLASLONG width, i, num_cpu; + + double dnum; + int mask = 7; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + args.m = m; + + args.a = (void *)x; + args.b = (void *)a; + + args.lda = incx; + args.ldb = lda; +#if !defined(COMPLEX) || defined(HER) || defined(HERREV) + args.alpha = (void *)α +#else + args.alpha = (void *)alpha; +#endif + + dnum = (double)m * (double)m / (double)nthreads; + num_cpu = 0; + +#ifndef LOWER + + range_m[MAX_CPU_NUMBER] = m; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 16) width = 16; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = syr_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; + queue[num_cpu].range_n = NULL; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#else + + range_m[0] = 0; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 16) width = 16; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[num_cpu + 1] = range_m[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = syr_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[num_cpu]; + queue[num_cpu].range_n = NULL; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#endif + + if (num_cpu) { + queue[0].sa = NULL; + queue[0].sb = buffer; + + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + return 0; +} diff --git a/driver/level2/tbmv_L.c b/driver/level2/tbmv_L.c new file mode 100644 index 0000000..05e7cf8 --- /dev/null +++ b/driver/level2/tbmv_L.c @@ -0,0 +1,99 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dp1 = 1.; + +int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + BLASLONG length; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095); + COPY_K(n, b, incb, buffer, 1); + } + + a += (n - 1) * lda; + + for (i = n - 1; i >= 0; i--) { + +#ifndef TRANSA + length = n - i - 1; + if (length > k) length = k; + + if (length > 0) { + AXPYU_K(length, 0, 0, + B[i], + a + 1, 1, B + i + 1, 1, NULL, 0); + } +#endif + +#ifndef UNIT +#ifndef TRANSA + B[i] *= a[0]; +#else + B[i] *= a[k]; +#endif +#endif + +#ifdef TRANSA + length = i; + if (length > k) length = k; + + if (length > 0) { + B[i] += DOTU_K(length, a + k - length, 1, B + i - length, 1); + } +#endif + + a -= lda; + } + + if (incb != 1) { + COPY_K(n, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/tbmv_U.c b/driver/level2/tbmv_U.c new file mode 100644 index 0000000..49d28dc --- /dev/null +++ b/driver/level2/tbmv_U.c @@ -0,0 +1,97 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dp1 = 1.; + +int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + BLASLONG length; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095); + COPY_K(n, b, incb, buffer, 1); + } + + for (i = 0; i < n; i++) { + +#ifndef TRANSA + length = i; + if (length > k) length = k; + + if (length > 0) { + AXPYU_K(length, 0, 0, + B[i], + a + k - length, 1, B + i - length, 1, NULL, 0); + } +#endif + +#ifndef UNIT +#ifndef TRANSA + B[i] *= a[k]; +#else + B[i] *= a[0]; +#endif +#endif + +#ifdef TRANSA + length = n - i - 1; + if (length > k) length = k; + + if (length > 0) { + B[i] += DOTU_K(length, a + 1, 1, B + i + 1, 1); + } +#endif + + a += lda; + } + + if (incb != 1) { + COPY_K(n, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/tbmv_thread.c b/driver/level2/tbmv_thread.c new file mode 100644 index 0000000..e3d0588 --- /dev/null +++ b/driver/level2/tbmv_thread.c @@ -0,0 +1,396 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#include "symcopy.h" + +#ifndef COMPLEX +#ifndef TRANSA +#undef TRANS +#else +#define TRANS +#endif +#define MYDOT DOTU_K +#define MYAXPY AXPYU_K +#else +#if (TRANSA == 1) || (TRANSA == 3) +#undef TRANS +#else +#define TRANS +#endif +#if (TRANSA == 1) || (TRANSA == 2) +#define MYAXPY AXPYU_K +#define MYDOT DOTU_K +#else +#define MYAXPY AXPYC_K +#define MYDOT DOTC_K +#endif +#endif + +static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ + + FLOAT *a, *x, *y; + + BLASLONG k, lda, incx; + BLASLONG n_from, n_to; + BLASLONG i, length; + +#ifdef TRANS +#ifndef COMPLEX + FLOAT result; +#else + FLOAT _Complex result; +#endif +#endif + +#if defined(COMPLEX) && !defined(UNIT) + FLOAT ar, ai, xr, xi; +#endif + + a = (FLOAT *)args -> a; + x = (FLOAT *)args -> b; + y = (FLOAT *)args -> c; + + k = args -> k; + n_from = 0; + n_to = args -> n; + + lda = args -> lda; + incx = args -> ldb; + + if (range_m) { + n_from = *(range_m + 0); + n_to = *(range_m + 1); + + a += n_from * lda * COMPSIZE; + } + + if (incx != 1) { + + COPY_K(args -> n, x, incx, buffer, 1); + + x = buffer; + buffer += ((args -> n * COMPSIZE + 1023) & ~1023); + } + + if (range_n) y += *range_n * COMPSIZE; + + SCAL_K(args -> n, 0, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + y, 1, NULL, 0, NULL, 0); + + for (i = n_from; i < n_to; i++) { + +#ifndef LOWER + length = i; +#else + length = args -> n - i - 1; +#endif + if (length > k) length = k; + +#ifndef LOWER + if (length > 0) { +#ifndef TRANS + MYAXPY(length, 0, 0, + *(x + i * COMPSIZE + 0), +#ifdef COMPLEX + *(x + i * COMPSIZE + 1), +#endif + a + (k - length) * COMPSIZE, 1, y + (i - length) * COMPSIZE, 1, NULL, 0); +#else + result = MYDOT(length, a + (k - length) * COMPSIZE, 1, x + (i - length) * COMPSIZE, 1); + +#ifndef COMPLEX + *(y + i * COMPSIZE + 0) += result; +#else + *(y + i * COMPSIZE + 0) += CREAL(result); + *(y + i * COMPSIZE + 1) += CIMAG(result); +#endif +#endif + } +#endif + +#ifndef COMPLEX +#ifdef UNIT + *(y + i * COMPSIZE) += *(x + i * COMPSIZE); +#else +#ifndef LOWER + *(y + i * COMPSIZE) += *(a + k * COMPSIZE) * *(x + i * COMPSIZE); +#else + *(y + i * COMPSIZE) += *(a + 0 * COMPSIZE) * *(x + i * COMPSIZE); +#endif +#endif +#else +#ifdef UNIT + *(y + i * COMPSIZE + 0) += *(x + i * COMPSIZE + 0); + *(y + i * COMPSIZE + 1) += *(x + i * COMPSIZE + 1); +#else +#ifndef LOWER + ar = *(a + k * COMPSIZE + 0); + ai = *(a + k * COMPSIZE + 1); +#else + ar = *(a + 0); + ai = *(a + 1); +#endif + xr = *(x + i * COMPSIZE + 0); + xi = *(x + i * COMPSIZE + 1); + +#if (TRANSA == 1) || (TRANSA == 2) + *(y + i * COMPSIZE + 0) += ar * xr - ai * xi; + *(y + i * COMPSIZE + 1) += ar * xi + ai * xr; +#else + *(y + i * COMPSIZE + 0) += ar * xr + ai * xi; + *(y + i * COMPSIZE + 1) += ar * xi - ai * xr; +#endif +#endif +#endif + +#ifdef LOWER + if (length > 0) { +#ifndef TRANS + MYAXPY(length, 0, 0, + *(x + i * COMPSIZE + 0), +#ifdef COMPLEX + *(x + i * COMPSIZE + 1), +#endif + a + COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0); +#else + result = MYDOT(length, a + COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1); + +#ifndef COMPLEX + *(y + i * COMPSIZE + 0) += result; +#else + *(y + i * COMPSIZE + 0) += CREAL(result); + *(y + i * COMPSIZE + 1) += CIMAG(result); +#endif +#endif + } +#endif + + a += lda * COMPSIZE; + } + + return 0; +} + +#ifndef COMPLEX +int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthreads){ +#else +int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthreads){ +#endif + + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range_m[MAX_CPU_NUMBER + 1]; + BLASLONG range_n[MAX_CPU_NUMBER]; + + BLASLONG width, i, num_cpu; + + double dnum; + int mask = 7; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + args.n = n; + args.k = k; + + args.a = (void *)a; + args.b = (void *)x; + args.c = (void *)(buffer); + + args.lda = lda; + args.ldb = incx; + + dnum = (double)n * (double)n / (double)nthreads; + num_cpu = 0; + + if (n < 2 * k) { + +#ifndef LOWER + + range_m[MAX_CPU_NUMBER] = n; + i = 0; + + while (i < n){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(n - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = n - i; + } + + if (width < 16) width = 16; + if (width > n - i) width = n - i; + + } else { + width = n - i; + } + + range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; + range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = trmv_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#else + + range_m[0] = 0; + i = 0; + + while (i < n){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(n - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = n - i; + } + + if (width < 16) width = 16; + if (width > n - i) width = n - i; + + } else { + width = n - i; + } + + range_m[num_cpu + 1] = range_m[num_cpu] + width; + range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = trmv_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[num_cpu]; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#endif + } else { + + range_m[0] = 0; + i = n; + + while (i > 0){ + + width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); + + if (width < 4) width = 4; + if (i < width) width = i; + + range_m[num_cpu + 1] = range_m[num_cpu] + width; + range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = trmv_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[num_cpu]; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i -= width; + } + } + + + + if (num_cpu) { + queue[0].sa = NULL; + queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE; + + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + for (i = 1; i < num_cpu; i ++) { + AXPYU_K(n, 0, 0, ONE, +#ifdef COMPLEX + ZERO, +#endif + buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); + } + + COPY_K(n, buffer, 1, x, incx); + + return 0; +} diff --git a/driver/level2/tbsv_L.c b/driver/level2/tbsv_L.c new file mode 100644 index 0000000..e9c9158 --- /dev/null +++ b/driver/level2/tbsv_L.c @@ -0,0 +1,97 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dp1 = 1.; + +int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + BLASLONG length; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095); + COPY_K(n, b, incb, buffer, 1); + } + + for (i = 0; i < n; i++) { + +#ifdef TRANSA + length = i; + if (length > k) length = k; + + if (length > 0) { + B[i] -= DOTU_K(length, a + k - length, 1, B + i - length, 1); + } +#endif + +#ifndef UNIT +#ifdef TRANSA + B[i] /= a[k]; +#else + B[i] /= a[0]; +#endif +#endif + +#ifndef TRANSA + length = n - i - 1; + if (length > k) length = k; + + if (length > 0) { + AXPYU_K(length, 0, 0, + -B[i], + a + 1, 1, B + i + 1, 1, NULL, 0); + } +#endif + + a += lda; + } + + if (incb != 1) { + COPY_K(n, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/tbsv_U.c b/driver/level2/tbsv_U.c new file mode 100644 index 0000000..0b1fca8 --- /dev/null +++ b/driver/level2/tbsv_U.c @@ -0,0 +1,99 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dp1 = 1.; + +int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + BLASLONG length; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095); + COPY_K(n, b, incb, buffer, 1); + } + + a += (n - 1) * lda; + + for (i = n - 1; i >= 0; i--) { + +#ifdef TRANSA + length = n - i - 1; + if (length > k) length = k; + + if (length > 0) { + B[i] -= DOTU_K(length, a + 1, 1, B + i + 1, 1); + } +#endif + +#ifndef UNIT +#ifdef TRANSA + B[i] /= a[0]; +#else + B[i] /= a[k]; +#endif +#endif + +#ifndef TRANSA + length = i; + if (length > k) length = k; + + if (length > 0) { + AXPYU_K(length, 0, 0, + - B[i], + a + k - length, 1, B + i - length, 1, NULL, 0); + } +#endif + + a -= lda; + } + + if (incb != 1) { + COPY_K(n, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/tpmv_L.c b/driver/level2/tpmv_L.c new file mode 100644 index 0000000..c139eb7 --- /dev/null +++ b/driver/level2/tpmv_L.c @@ -0,0 +1,83 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dp1 = 1.; + +int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + COPY_K(m, b, incb, buffer, 1); + } + + a += (m + 1) * m / 2 - 1; + + for (i = 0; i < m; i++) { +#ifndef TRANSA + if (i > 0) AXPYU_K(i, 0, 0, B[m - i - 1], a + 1, 1, B + m - i, 1, NULL, 0); +#endif + +#ifndef UNIT + B[m - i - 1] *= a[0]; +#endif + +#ifdef TRANSA + if (i < m - 1) B[m - i - 1] += DOTU_K(m - i - 1, a - (m - i - 1), 1, B, 1); +#endif + +#ifndef TRANSA + a -= (i + 2); +#else + a -= (m - i); +#endif + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/tpmv_U.c b/driver/level2/tpmv_U.c new file mode 100644 index 0000000..6d69df6 --- /dev/null +++ b/driver/level2/tpmv_U.c @@ -0,0 +1,86 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dp1 = 1.; + +int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + COPY_K(m, b, incb, buffer, 1); + } + + for (i = 0; i < m; i++) { + +#ifndef TRANSA + if (i > 0) AXPYU_K(i, 0, 0, B[i], a, 1, B, 1, NULL, 0); +#endif + +#ifndef UNIT +#ifndef TRANSA + B[i] *= a[i]; +#else + B[i] *= a[0]; +#endif +#endif + +#ifdef TRANSA + if (i < m - 1) B[i] += DOTU_K(m - i - 1, a + 1, 1, B + i + 1, 1); +#endif + +#ifndef TRANSA + a += (i + 1); +#else + a += (m - i); +#endif + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/tpmv_thread.c b/driver/level2/tpmv_thread.c new file mode 100644 index 0000000..64b725f --- /dev/null +++ b/driver/level2/tpmv_thread.c @@ -0,0 +1,401 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#include "symcopy.h" + +#ifndef COMPLEX +#ifndef TRANSA +#undef TRANS +#else +#define TRANS +#endif +#define MYDOT DOTU_K +#define MYAXPY AXPYU_K +#else +#if TRANSA == 1 +#undef TRANS +#define MYDOT DOTU_K +#define MYAXPY AXPYU_K +#elif TRANSA == 2 +#define TRANS +#define MYDOT DOTU_K +#define MYAXPY AXPYU_K +#elif TRANSA == 3 +#undef TRANS +#define MYDOT DOTC_K +#define MYAXPY AXPYC_K +#else +#define TRANS +#define MYDOT DOTC_K +#define MYAXPY AXPYC_K +#endif +#endif + +static int tpmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ + + FLOAT *a, *x, *y; + + BLASLONG incx; + BLASLONG m_from, m_to; + BLASLONG i; + +#ifdef TRANS +#ifndef COMPLEX + FLOAT result; +#else + FLOAT _Complex result; +#endif +#endif + +#if defined(COMPLEX) && !defined(UNIT) + FLOAT ar, ai, xr, xi; +#endif + + a = (FLOAT *)args -> a; + x = (FLOAT *)args -> b; + y = (FLOAT *)args -> c; + + incx = args -> ldb; + + m_from = 0; + m_to = args -> m; + + if (range_m) { + m_from = *(range_m + 0); + m_to = *(range_m + 1); + } + + if (incx != 1) { + +#ifndef LOWER + COPY_K(m_to, x, incx, buffer, 1); +#else + COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1); +#endif + + x = buffer; + buffer += ((COMPSIZE * args -> m + 1023) & ~1023); + } + +#ifndef TRANS + if (range_n) y += *range_n * COMPSIZE; + +#ifndef LOWER + SCAL_K(m_to, 0, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + y, 1, NULL, 0, NULL, 0); +#else + SCAL_K(args -> m - m_from, 0, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); +#endif + +#else + + SCAL_K(m_to - m_from, 0, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); + +#endif + +#ifndef LOWER + a += (m_from + 1) * m_from / 2 * COMPSIZE; +#else + a += (2 * args -> m - m_from - 1) * m_from / 2 * COMPSIZE; +#endif + + for (i = m_from; i < m_to; i++) { + +#ifndef LOWER + if (i > 0) { +#ifndef TRANS + MYAXPY(i, 0, 0, + *(x + i * COMPSIZE + 0), +#ifdef COMPLEX + *(x + i * COMPSIZE + 1), +#endif + a, 1, y, 1, NULL, 0); +#else + result = MYDOT(i, a, 1, x, 1); + +#ifndef COMPLEX + *(y + i * COMPSIZE + 0) += result; +#else + *(y + i * COMPSIZE + 0) += CREAL(result); + *(y + i * COMPSIZE + 1) += CIMAG(result); +#endif + +#endif + } +#endif + +#ifndef COMPLEX +#ifdef UNIT + *(y + i * COMPSIZE) += *(x + i * COMPSIZE); +#else + *(y + i * COMPSIZE) += *(a + i * COMPSIZE) * *(x + i * COMPSIZE); +#endif +#else +#ifdef UNIT + *(y + i * COMPSIZE + 0) += *(x + i * COMPSIZE + 0); + *(y + i * COMPSIZE + 1) += *(x + i * COMPSIZE + 1); +#else + ar = *(a + i * COMPSIZE + 0); + ai = *(a + i * COMPSIZE + 1); + xr = *(x + i * COMPSIZE + 0); + xi = *(x + i * COMPSIZE + 1); + +#if (TRANSA == 1) || (TRANSA == 2) + *(y + i * COMPSIZE + 0) += ar * xr - ai * xi; + *(y + i * COMPSIZE + 1) += ar * xi + ai * xr; +#else + *(y + i * COMPSIZE + 0) += ar * xr + ai * xi; + *(y + i * COMPSIZE + 1) += ar * xi - ai * xr; +#endif +#endif +#endif + +#ifdef LOWER + if (args -> m > i + 1) { +#ifndef TRANS + MYAXPY(args -> m - i - 1, 0, 0, + *(x + i * COMPSIZE + 0), +#ifdef COMPLEX + *(x + i * COMPSIZE + 1), +#endif + a + (i + 1 ) * COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0); +#else + + result = MYDOT(args -> m - i - 1, a + (i + 1) * COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1); + +#ifndef COMPLEX + *(y + i * COMPSIZE + 0) += result; +#else + *(y + i * COMPSIZE + 0) += CREAL(result); + *(y + i * COMPSIZE + 1) += CIMAG(result); +#endif + +#endif + } +#endif + +#ifndef LOWER + a += (i + 1) * COMPSIZE; +#else + a += (args -> m - i - 1) * COMPSIZE; +#endif + + } + + return 0; +} + +#ifndef COMPLEX +int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthreads){ +#else +int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthreads){ +#endif + + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range_m[MAX_CPU_NUMBER + 1]; + BLASLONG range_n[MAX_CPU_NUMBER]; + + BLASLONG width, i, num_cpu; + + double dnum; + int mask = 7; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + args.m = m; + + args.a = (void *)a; + args.b = (void *)x; + args.c = (void *)(buffer); + + args.ldb = incx; + args.ldc = incx; + + dnum = (double)m * (double)m / (double)nthreads; + num_cpu = 0; + +#ifndef LOWER + + range_m[MAX_CPU_NUMBER] = m; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 16) width = 16; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; + range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = tpmv_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#else + + range_m[0] = 0; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 16) width = 16; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[num_cpu + 1] = range_m[num_cpu] + width; + range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = tpmv_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[num_cpu]; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#endif + + if (num_cpu) { + queue[0].sa = NULL; + queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE; + + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + +#ifndef TRANS + for (i = 1; i < num_cpu; i ++) { + +#ifndef LOWER + + AXPYU_K(range_m[MAX_CPU_NUMBER - i], 0, 0, ONE, +#ifdef COMPLEX + ZERO, +#endif + buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); + +#else + + AXPYU_K(m - range_m[i], 0, 0, ONE, +#ifdef COMPLEX + ZERO, +#endif + buffer + (range_n[i] + range_m[i]) * COMPSIZE, 1, buffer + range_m[i] * COMPSIZE, 1, NULL, 0); + +#endif + + } +#endif + + COPY_K(m, buffer, 1, x, incx); + + return 0; +} diff --git a/driver/level2/tpsv_L.c b/driver/level2/tpsv_L.c new file mode 100644 index 0000000..9f76181 --- /dev/null +++ b/driver/level2/tpsv_L.c @@ -0,0 +1,87 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095); + COPY_K(m, b, incb, buffer, 1); + } + + for (i = 0; i < m; i++) { +#ifdef TRANSA + if (i > 0) B[i] -= DOTU_K(i, a, 1, B, 1); +#endif + +#ifndef UNIT +#ifndef TRANSA + B[i] /= a[0]; +#else + B[i] /= a[i]; +#endif +#endif + +#ifndef TRANSA + if (i < m - 1) { + AXPYU_K(m - i - 1 , 0, 0, - B[i], + a + 1, 1, B + i + 1, 1, NULL, 0); + } +#endif + +#ifndef TRANSA + a += (m - i); +#else + a += (i + 1); +#endif + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} diff --git a/driver/level2/tpsv_U.c b/driver/level2/tpsv_U.c new file mode 100644 index 0000000..7a09580 --- /dev/null +++ b/driver/level2/tpsv_U.c @@ -0,0 +1,83 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095); + COPY_K(m, b, incb, buffer, 1); + } + + a += (m + 1) * m / 2 - 1; + + for (i = 0; i < m; i++) { +#ifdef TRANSA + if (i > 0) B[m - i - 1] -= DOTU_K(i, a + 1, 1, B + m - i, 1); +#endif + +#ifndef UNIT + B[m - i - 1] /= a[0]; +#endif + +#ifndef TRANSA + if (i < m - 1) AXPYU_K(m - i - 1, 0, 0, -B[m - i - 1], a - (m - i - 1), 1, B, 1, NULL, 0); +#endif + +#ifndef TRANSA + a -= (m - i); +#else + a -= (i + 2); +#endif + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/trmv_L.c b/driver/level2/trmv_L.c new file mode 100644 index 0000000..e515ba6 --- /dev/null +++ b/driver/level2/trmv_L.c @@ -0,0 +1,103 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dp1 = 1.; + +int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *buffer){ + + BLASLONG i, is, min_i; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095); + COPY_K(m, b, incb, buffer, 1); + } + + for (is = m; is > 0; is -= DTB_ENTRIES){ + + min_i = MIN(is, DTB_ENTRIES); + +#ifndef TRANSA + if (m - is > 0){ + GEMV_N(m - is, min_i, 0, dp1, + a + is + (is - min_i) * lda, lda, + B + is - min_i, 1, + B + is, 1, gemvbuffer); + } +#endif + + for (i = 0; i < min_i; i++) { + FLOAT *AA = a + (is - i - 1) + (is - i - 1) * lda; + FLOAT *BB = B + (is - i - 1); + +#ifndef TRANSA + if (i > 0) AXPYU_K(i, 0, 0, BB[0], AA + 1, 1, BB + 1, 1, NULL, 0); +#endif + +#ifndef UNIT + BB[0] *= AA[0]; +#endif + +#ifdef TRANSA + if (i < min_i - 1) BB[0] += DOTU_K(min_i - i - 1, AA - (min_i - i - 1), 1, BB - (min_i - i - 1), 1); +#endif + } + +#ifdef TRANSA + if (is - min_i > 0){ + GEMV_T(is - min_i, min_i, 0, dp1, + a + (is - min_i) * lda, lda, + B, 1, + B + is - min_i, 1, gemvbuffer); + } +#endif + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/trmv_U.c b/driver/level2/trmv_U.c new file mode 100644 index 0000000..3c36f77 --- /dev/null +++ b/driver/level2/trmv_U.c @@ -0,0 +1,104 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dp1 = 1.; + +int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *buffer){ + + BLASLONG i, is, min_i; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095); + COPY_K(m, b, incb, buffer, 1); + } + + for (is = 0; is < m; is += DTB_ENTRIES){ + + min_i = MIN(m - is, DTB_ENTRIES); + +#ifndef TRANSA + if (is > 0){ + GEMV_N(is, min_i, 0, dp1, + a + is * lda, lda, + B + is, 1, + B, 1, gemvbuffer); + } +#endif + + for (i = 0; i < min_i; i++) { + FLOAT *AA = a + is + (i + is) * lda; + FLOAT *BB = B + is; + +#ifndef TRANSA + if (i > 0) AXPYU_K(i, 0, 0, BB[i], AA, 1, BB, 1, NULL, 0); +#endif + +#ifndef UNIT + BB[i] *= AA[i]; +#endif + +#ifdef TRANSA + if (i < min_i - 1) BB[i] += DOTU_K(min_i - i - 1, AA + i + 1, 1, BB + i + 1, 1); +#endif + } + +#ifdef TRANSA + if (m - is > min_i){ + GEMV_T(m - is - min_i, min_i, 0, dp1, + a + is + min_i + is * lda, lda, + B + is + min_i, 1, + B + is, 1, gemvbuffer); + } +#endif + + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/trmv_thread.c b/driver/level2/trmv_thread.c new file mode 100644 index 0000000..4f5b27c --- /dev/null +++ b/driver/level2/trmv_thread.c @@ -0,0 +1,440 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#include "symcopy.h" + +#ifndef COMPLEX +#ifndef TRANSA +#define MYGEMV GEMV_N +#undef TRANS +#else +#define MYGEMV GEMV_T +#define TRANS +#endif +#define MYDOT DOTU_K +#define MYAXPY AXPYU_K +#else +#if TRANSA == 1 +#define MYGEMV GEMV_N +#undef TRANS +#define MYDOT DOTU_K +#define MYAXPY AXPYU_K +#elif TRANSA == 2 +#define MYGEMV GEMV_T +#define TRANS +#define MYDOT DOTU_K +#define MYAXPY AXPYU_K +#elif TRANSA == 3 +#define MYGEMV GEMV_R +#undef TRANS +#define MYDOT DOTC_K +#define MYAXPY AXPYC_K +#else +#define MYGEMV GEMV_C +#define TRANS +#define MYDOT DOTC_K +#define MYAXPY AXPYC_K +#endif +#endif + +static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ + + FLOAT *a, *x, *y; + + BLASLONG lda, incx; + BLASLONG m_from, m_to; + BLASLONG i, is, min_i; + +#ifdef TRANS +#ifndef COMPLEX + FLOAT result; +#else + FLOAT _Complex result; +#endif +#endif + +#if defined(COMPLEX) && !defined(UNIT) + FLOAT ar, ai, xr, xi; +#endif + + a = (FLOAT *)args -> a; + x = (FLOAT *)args -> b; + y = (FLOAT *)args -> c; + + lda = args -> lda; + incx = args -> ldb; + + m_from = 0; + m_to = args -> m; + + if (range_m) { + m_from = *(range_m + 0); + m_to = *(range_m + 1); + } + + if (incx != 1) { + +#ifndef LOWER + COPY_K(m_to, x, incx, buffer, 1); +#else + COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1); +#endif + + x = buffer; + buffer += ((COMPSIZE * args -> m + 1023) & ~1023); + } + +#ifndef TRANS + if (range_n) y += *range_n * COMPSIZE; + +#ifndef LOWER + SCAL_K(m_to, 0, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + y, 1, NULL, 0, NULL, 0); +#else + SCAL_K(args -> m - m_from, 0, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); +#endif + +#else + + SCAL_K(m_to - m_from, 0, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); + +#endif + + for (is = m_from; is < m_to; is += DTB_ENTRIES){ + + min_i = MIN(m_to - is, DTB_ENTRIES); + +#ifndef LOWER + if (is > 0){ + MYGEMV(is, min_i, 0, + ONE, +#ifdef COMPLEX + ZERO, +#endif + a + is * lda * COMPSIZE, lda, +#ifndef TRANS + x + is * COMPSIZE, 1, + y, 1, +#else + x, 1, + y + is * COMPSIZE, 1, +#endif + buffer); + } +#endif + + for (i = is; i < is + min_i; i++) { + +#ifndef LOWER + if (i - is > 0) { +#ifndef TRANS + MYAXPY(i - is, 0, 0, + *(x + i * COMPSIZE + 0), +#ifdef COMPLEX + *(x + i * COMPSIZE + 1), +#endif + a + (is + i * lda) * COMPSIZE, 1, y + is * COMPSIZE, 1, NULL, 0); +#else + + result = MYDOT(i - is, a + (is + i * lda) * COMPSIZE, 1, x + is * COMPSIZE, 1); + +#ifndef COMPLEX + *(y + i * COMPSIZE + 0) += result; +#else + *(y + i * COMPSIZE + 0) += CREAL(result); + *(y + i * COMPSIZE + 1) += CIMAG(result); +#endif + +#endif + } +#endif + +#ifndef COMPLEX +#ifdef UNIT + *(y + i * COMPSIZE) += *(x + i * COMPSIZE); +#else + *(y + i * COMPSIZE) += *(a + (i + i * lda) * COMPSIZE) * *(x + i * COMPSIZE); +#endif +#else +#ifdef UNIT + *(y + i * COMPSIZE + 0) += *(x + i * COMPSIZE + 0); + *(y + i * COMPSIZE + 1) += *(x + i * COMPSIZE + 1); +#else + ar = *(a + (i + i * lda) * COMPSIZE + 0); + ai = *(a + (i + i * lda) * COMPSIZE + 1); + xr = *(x + i * COMPSIZE + 0); + xi = *(x + i * COMPSIZE + 1); + +#if (TRANSA == 1) || (TRANSA == 2) + *(y + i * COMPSIZE + 0) += ar * xr - ai * xi; + *(y + i * COMPSIZE + 1) += ar * xi + ai * xr; +#else + *(y + i * COMPSIZE + 0) += ar * xr + ai * xi; + *(y + i * COMPSIZE + 1) += ar * xi - ai * xr; +#endif +#endif +#endif + +#ifdef LOWER + if (is + min_i > i + 1) { +#ifndef TRANS + MYAXPY(is + min_i - i - 1, 0, 0, + *(x + i * COMPSIZE + 0), +#ifdef COMPLEX + *(x + i * COMPSIZE + 1), +#endif + a + (i + 1 + i * lda) * COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0); +#else + + result = MYDOT(is + min_i - i - 1, a + (i + 1 + i * lda) * COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1); + +#ifndef COMPLEX + *(y + i * COMPSIZE + 0) += result; +#else + *(y + i * COMPSIZE + 0) += CREAL(result); + *(y + i * COMPSIZE + 1) += CIMAG(result); +#endif + +#endif + } +#endif + } + +#ifdef LOWER + if (args -> m > is + min_i){ + MYGEMV(args -> m - is - min_i, min_i, 0, + ONE, +#ifdef COMPLEX + ZERO, +#endif + a + (is + min_i + is * lda) * COMPSIZE, lda, +#ifndef TRANS + x + is * COMPSIZE, 1, + y + (is + min_i) * COMPSIZE, 1, +#else + x + (is + min_i) * COMPSIZE, 1, + y + is * COMPSIZE, 1, +#endif + buffer); + } +#endif + } + + return 0; +} + +#ifndef COMPLEX +int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthreads){ +#else +int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthreads){ +#endif + + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range_m[MAX_CPU_NUMBER + 1]; + BLASLONG range_n[MAX_CPU_NUMBER]; + + BLASLONG width, i, num_cpu; + + double dnum; + int mask = 7; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + args.m = m; + + args.a = (void *)a; + args.b = (void *)x; + args.c = (void *)(buffer); + + args.lda = lda; + args.ldb = incx; + args.ldc = incx; + + dnum = (double)m * (double)m / (double)nthreads; + num_cpu = 0; + +#ifndef LOWER + + range_m[MAX_CPU_NUMBER] = m; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 16) width = 16; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; + range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = trmv_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#else + + range_m[0] = 0; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 16) width = 16; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[num_cpu + 1] = range_m[num_cpu] + width; + range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = trmv_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[num_cpu]; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#endif + + if (num_cpu) { + queue[0].sa = NULL; + queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE; + + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + +#ifndef TRANS + for (i = 1; i < num_cpu; i ++) { + +#ifndef LOWER + + AXPYU_K(range_m[MAX_CPU_NUMBER - i], 0, 0, ONE, +#ifdef COMPLEX + ZERO, +#endif + buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); + +#else + + AXPYU_K(m - range_m[i], 0, 0, ONE, +#ifdef COMPLEX + ZERO, +#endif + buffer + (range_n[i] + range_m[i]) * COMPSIZE, 1, buffer + range_m[i] * COMPSIZE, 1, NULL, 0); + +#endif + + } +#endif + + COPY_K(m, buffer, 1, x, incx); + + return 0; +} diff --git a/driver/level2/trsv_L.c b/driver/level2/trsv_L.c new file mode 100644 index 0000000..44bcfe3 --- /dev/null +++ b/driver/level2/trsv_L.c @@ -0,0 +1,109 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dm1 = -1.; + +#undef GEMV_UNROLL +#define GEMV_UNROLL DTB_ENTRIES + +int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i, is, min_i; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095); + COPY_K(m, b, incb, buffer, 1); + } + + for (is = 0; is < m; is += GEMV_UNROLL){ + + min_i = MIN(m - is, GEMV_UNROLL); + +#ifdef TRANSA + if (is > 0){ + GEMV_T(is, min_i, 0, dm1, + a + is * lda , lda, + B, 1, + B + is, 1, gemvbuffer); + } +#endif + + for (i = 0; i < min_i; i++) { + FLOAT *AA = a + is + (i + is) * lda; + FLOAT *BB = B + is; + +#ifdef TRANSA + if (i > 0) BB[i] -= DOTU_K(i, AA, 1, BB, 1); +#endif + +#ifndef UNIT + BB[i] /= AA[i]; +#endif + +#ifndef TRANSA + if (i < min_i - 1) { + AXPYU_K(min_i - i - 1 , 0, 0, - BB[i], + AA + i + 1, 1, BB + i + 1, 1, NULL, 0); + } +#endif + } + +#ifndef TRANSA + if (m - is > min_i){ + GEMV_N(m - is - min_i, min_i, 0, dm1, + a + is + min_i + is * lda, lda, + B + is, 1, + B + (is + min_i), 1, gemvbuffer); + } +#endif + + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} diff --git a/driver/level2/trsv_U.c b/driver/level2/trsv_U.c new file mode 100644 index 0000000..f02512b --- /dev/null +++ b/driver/level2/trsv_U.c @@ -0,0 +1,104 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dm1 = -1.; + +int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i, is, min_i; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095); + COPY_K(m, b, incb, buffer, 1); + } + + for (is = m; is > 0; is -= DTB_ENTRIES){ + + min_i = MIN(is, DTB_ENTRIES); + +#ifdef TRANSA + if (m - is > 0){ + GEMV_T(m - is, min_i, 0, dm1, + a + is + (is - min_i) * lda, lda, + B + is, 1, + B + is - min_i, 1, gemvbuffer); + } +#endif + + for (i = 0; i < min_i; i++) { + FLOAT *AA = a + (is - i - 1) + (is - i - 1) * lda; + FLOAT *BB = B + (is - i - 1); + +#ifdef TRANSA + if (i > 0) BB[0] -= DOTU_K(i, AA + 1, 1, BB + 1, 1); +#endif + +#ifndef UNIT + BB[0] /= AA[0]; +#endif + +#ifndef TRANSA + if (i < min_i - 1) AXPYU_K(min_i - i - 1, 0, 0, -BB[0], AA - (min_i - i - 1), 1, BB - (min_i - i - 1), 1, NULL, 0); +#endif + } + +#ifndef TRANSA + if (is - min_i > 0){ + GEMV_N(is - min_i, min_i, 0, dm1, + a + (is - min_i) * lda, lda, + B + is - min_i, 1, + B, 1, gemvbuffer); + } +#endif + + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/zgbmv_k.c b/driver/level2/zgbmv_k.c new file mode 100644 index 0000000..7832a7e --- /dev/null +++ b/driver/level2/zgbmv_k.c @@ -0,0 +1,145 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +#ifndef XCONJ +#ifndef CONJ +#define ZAXPY AXPYU_K +#define ZDOT DOTU_K +#else +#define ZAXPY AXPYC_K +#define ZDOT DOTC_K +#endif +#else +#ifndef CONJ +#define ZAXPY AXPYU_K +#define ZDOT DOTC_K +#else +#define ZAXPY AXPYC_K +#define ZDOT DOTU_K +#endif +#endif + +#ifndef TRANS +#define M m +#define N n +#else +#define N m +#define M n +#endif + +void CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, + FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){ + + BLASLONG i, offset_u, offset_l, start, end, length; + FLOAT *X = x; + FLOAT *Y = y; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *bufferY = gemvbuffer; + FLOAT *bufferX = gemvbuffer; +#ifdef TRANS + FLOAT _Complex temp; +#endif + + if (incy != 1) { + Y = bufferY; + bufferX = (FLOAT *)(((BLASLONG)bufferY + M * sizeof(FLOAT) * 2 + 4095) & ~4095); + gemvbuffer = bufferX; + COPY_K(M, y, incy, Y, 1); + } + + if (incx != 1) { + X = bufferX; + gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + N * sizeof(FLOAT) * 2 + 4095) & ~4095); + COPY_K(N, x, incx, X, 1); + } + + offset_u = ku; + offset_l = ku + m; + + for (i = 0; i < MIN(n, m + ku); i++) { + + start = MAX(offset_u, 0); + end = MIN(offset_l, ku + kl + 1); + + length = end - start; + +#ifndef TRANS + ZAXPY(length, 0, 0, +#ifndef XCONJ + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], +#else + alpha_r * X[i * 2 + 0] + alpha_i * X[i * 2 + 1], + alpha_i * X[i * 2 + 0] - alpha_r * X[i * 2 + 1], +#endif + a + start * 2, 1, Y + (start - offset_u) * 2, 1, NULL, 0); +#else + +#ifndef XCONJ + temp = ZDOT(length, a + start * 2, 1, X + (start - offset_u) * 2, 1); +#else + temp = ZDOT(length, X + (start - offset_u) * 2, 1, a + start * 2, 1); +#endif + +#if !defined(XCONJ) || !defined(CONJ) + Y[i * 2 + 0] += alpha_r * CREAL(temp) - alpha_i * CIMAG(temp); + Y[i * 2 + 1] += alpha_i * CREAL(temp) + alpha_r * CIMAG(temp); +#else + Y[i * 2 + 0] += alpha_r * CREAL(temp) + alpha_i * CIMAG(temp); + Y[i * 2 + 1] += alpha_i * CREAL(temp) - alpha_r * CIMAG(temp); +#endif +#endif + + offset_u --; + offset_l --; + + a += lda * 2; + } + + if (incy != 1) { + COPY_K(M, Y, 1, y, incy); + } + + return; +} + diff --git a/driver/level2/zhbmv_k.c b/driver/level2/zhbmv_k.c new file mode 100644 index 0000000..8771942 --- /dev/null +++ b/driver/level2/zhbmv_k.c @@ -0,0 +1,189 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, + FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){ + + BLASLONG i, length; +#ifndef LOWER + BLASLONG offset; +#endif + + FLOAT *X = x; + FLOAT *Y = y; + FLOAT *sbmvbuffer = (FLOAT *)buffer; + FLOAT *bufferY = sbmvbuffer; + FLOAT *bufferX = sbmvbuffer; + FLOAT temp[2]; + + if (incy != 1) { + Y = bufferY; + bufferX = (FLOAT *)(((BLASLONG)bufferY + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095); + sbmvbuffer = bufferX; + COPY_K(n, y, incy, Y, 1); + } + + if (incx != 1) { + X = bufferX; + sbmvbuffer = (FLOAT *)(((BLASLONG)bufferX + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095); + COPY_K(n, x, incx, X, 1); + } + +#ifndef LOWER + offset = k; +#endif + + for (i = 0; i < n; i++) { + +#ifndef HEMVREV +#ifndef LOWER + length = k - offset; + + if (length > 0) { + AXPYU_K(length, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + a + offset * COMPSIZE, 1, Y + (i - length) * COMPSIZE, 1, NULL, 0); + } + + temp[0] = a[k * 2 + 0] * X[i * 2 + 0]; + temp[1] = a[k * 2 + 0] * X[i * 2 + 1]; + + Y[i * 2 + 0] += alpha_r * temp[0] - alpha_i * temp[1]; + Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; + + if (length > 0) { + FLOAT _Complex result = DOTC_K(length, a + offset * COMPSIZE, 1, X + (i - length) * COMPSIZE, 1); + + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); + Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); + } + + if (offset > 0) offset --; +#else + length = k; + if (n - i - 1 < k) length = n - i - 1; + + if (length > 0) { + AXPYU_K(length, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + a + COMPSIZE, 1, Y + (i + 1) * COMPSIZE, 1, NULL, 0); + } + + temp[0] = a[0] * X[i * 2 + 0]; + temp[1] = a[0] * X[i * 2 + 1]; + + Y[i * 2 + 0] += alpha_r * temp[0] - alpha_i * temp[1]; + Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; + + if (length > 0) { + FLOAT _Complex result = DOTC_K(length, a + COMPSIZE, 1, X + (i + 1) * COMPSIZE, 1); + + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); + Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); + } +#endif +#else +#ifndef LOWER + + length = k - offset; + + if (length > 0) { + AXPYC_K(length, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + a + offset * COMPSIZE, 1, Y + (i - length) * COMPSIZE, 1, NULL, 0); + } + + temp[0] = a[k * 2 + 0] * X[i * 2 + 0]; + temp[1] = a[k * 2 + 0] * X[i * 2 + 1]; + + Y[i * 2 + 0] += alpha_r * temp[0] - alpha_i * temp[1]; + Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; + + if (length > 0) { + FLOAT _Complex result = DOTU_K(length, a + offset * COMPSIZE, 1, X + (i - length) * COMPSIZE, 1); + + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); + Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); + } + + if (offset > 0) offset --; +#else + length = k; + if (n - i - 1 < k) length = n - i - 1; + + if (length > 0) { + AXPYC_K(length, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + a + COMPSIZE, 1, Y + (i + 1) * COMPSIZE, 1, NULL, 0); + } + + temp[0] = a[0] * X[i * 2 + 0]; + temp[1] = a[0] * X[i * 2 + 1]; + + Y[i * 2 + 0] += alpha_r * temp[0] - alpha_i * temp[1]; + Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; + + if (length > 0) { + FLOAT _Complex result = DOTU_K(length, a + COMPSIZE, 1, X + (i + 1) * COMPSIZE, 1); + + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); + Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); + } +#endif +#endif + + + a += lda * 2; + } + + if (incy != 1) { + COPY_K(n, Y, 1, y, incy); + } + + return 0; +} + diff --git a/driver/level2/zher2_k.c b/driver/level2/zher2_k.c new file mode 100644 index 0000000..3e92458 --- /dev/null +++ b/driver/level2/zher2_k.c @@ -0,0 +1,120 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *x, BLASLONG incx, + FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer){ + + BLASLONG i; + FLOAT *X, *Y; + + X = x; + Y = y; + + lda *= 2; + + if (incx != 1) { + COPY_K(m, x, incx, buffer, 1); + X = buffer; + } + + if (incy != 1) { + COPY_K(m, y, incy, (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)), 1); + Y = (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)); + } + + for (i = 0; i < m; i++){ +#ifndef HEMVREV +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + - alpha_i * X[i * 2 + 0] - alpha_r * X[i * 2 + 1], + Y, 1, a, 1, NULL, 0); + AXPYU_K(i + 1, 0, 0, + alpha_r * Y[i * 2 + 0] + alpha_i * Y[i * 2 + 1], + alpha_i * Y[i * 2 + 0] - alpha_r * Y[i * 2 + 1], + X, 1, a, 1, NULL, 0); + a[i * 2 + 1] = ZERO; + a += lda; +#else + AXPYU_K(m - i, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + - alpha_i * X[i * 2 + 0] - alpha_r * X[i * 2 + 1], + Y + i * 2, 1, a, 1, NULL, 0); + AXPYU_K(m - i, 0, 0, + alpha_r * Y[i * 2 + 0] + alpha_i * Y[i * 2 + 1], + alpha_i * Y[i * 2 + 0] - alpha_r * Y[i * 2 + 1], + X + i * 2, 1, a, 1, NULL, 0); + a[1] = ZERO; + a += 2 + lda; +#endif +#else +#ifndef LOWER + AXPYC_K(i + 1, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], + Y, 1, a, 1, NULL, 0); + AXPYC_K(i + 1, 0, 0, + alpha_r * Y[i * 2 + 0] + alpha_i * Y[i * 2 + 1], + - alpha_i * Y[i * 2 + 0] + alpha_r * Y[i * 2 + 1], + X, 1, a, 1, NULL, 0); + a[i * 2 + 1] = ZERO; + a += lda; +#else + AXPYC_K(m - i, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], + Y + i * 2, 1, a, 1, NULL, 0); + AXPYC_K(m - i, 0, 0, + alpha_r * Y[i * 2 + 0] + alpha_i * Y[i * 2 + 1], + - alpha_i * Y[i * 2 + 0] + alpha_r * Y[i * 2 + 1], + X + i * 2, 1, a, 1, NULL, 0); + a[1] = ZERO; + a += 2 + lda; +#endif +#endif + + } + + + return 0; +} diff --git a/driver/level2/zher_k.c b/driver/level2/zher_k.c new file mode 100644 index 0000000..772034f --- /dev/null +++ b/driver/level2/zher_k.c @@ -0,0 +1,80 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha, FLOAT *x, + BLASLONG incx, FLOAT *a, BLASLONG lda, FLOAT *buffer){ + + BLASLONG i; + FLOAT *X; + + X = x; + lda *= 2; + + if (incx != 1) { + COPY_K(m, x, incx, buffer, 1); + X = buffer; + } + + for (i = 0; i < m; i++){ +#ifndef HEMVREV +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, alpha * X[i * 2 + 0], -alpha * X[i * 2 + 1], X, 1, a, 1, NULL, 0); + a[i * 2 + 1] = ZERO; + a += lda; +#else + AXPYU_K(m - i, 0, 0, alpha * X[i * 2 + 0], -alpha * X[i * 2 + 1], X + i * 2, 1, a, 1, NULL, 0); + a[1] = ZERO; + a += 2 + lda; +#endif +#else +#ifndef LOWER + AXPYC_K(i + 1, 0, 0, alpha * X[i * 2 + 0], alpha * X[i * 2 + 1], X, 1, a, 1, NULL, 0); + a[i * 2 + 1] = ZERO; + a += lda; +#else + AXPYC_K(m - i, 0, 0, alpha * X[i * 2 + 0], alpha * X[i * 2 + 1], X + i * 2, 1, a, 1, NULL, 0); + a[1] = ZERO; + a += 2 + lda; +#endif +#endif + } + + return 0; +} diff --git a/driver/level2/zhpmv_k.c b/driver/level2/zhpmv_k.c new file mode 100644 index 0000000..5f95ce7 --- /dev/null +++ b/driver/level2/zhpmv_k.c @@ -0,0 +1,177 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){ + + BLASLONG i; + FLOAT *X = x; + FLOAT *Y = y; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *bufferY = gemvbuffer; + FLOAT *bufferX = gemvbuffer; + FLOAT temp[2]; + + if (incy != 1) { + Y = bufferY; + bufferX = (FLOAT *)(((BLASLONG)bufferY + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + gemvbuffer = bufferX; + COPY_K(m, y, incy, Y, 1); + } + + if (incx != 1) { + X = bufferX; + gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + COPY_K(m, x, incx, X, 1); + } + + for (i = 0; i < m; i++) { + +#ifndef HEMVREV +#ifndef LOWER + if (i > 0) { + FLOAT _Complex result = DOTC_K(i, a, 1, X, 1); + + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); + Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); + } + + temp[0] = a[i * 2 + 0] * X[i * 2 + 0]; + temp[1] = a[i * 2 + 0] * X[i * 2 + 1]; + + Y[i * 2 + 0] += alpha_r * temp[0] - alpha_i * temp[1]; + Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; + + if (i > 0) { + AXPYU_K(i, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + a, 1, Y, 1, NULL, 0); + } + + a += (i + 1) * 2; + +#else + + if (m - i > 1) { + FLOAT _Complex result = DOTC_K(m - i - 1, a + (i + 1) * 2, 1, X + (i + 1) * 2, 1); + + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); + Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); + } + + temp[0] = a[i * 2 + 0] * X[i * 2 + 0]; + temp[1] = a[i * 2 + 0] * X[i * 2 + 1]; + + Y[i * 2 + 0] += alpha_r * temp[0] - alpha_i * temp[1]; + Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; + + if (m - i > 1) { + AXPYU_K(m - i - 1, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + a + (i + 1) * 2, 1, Y + (i + 1) * 2, 1, NULL, 0); + } + + a += (m - i - 1) * 2; + +#endif +#else +#ifndef LOWER + if (i > 0) { + FLOAT _Complex result = DOTU_K(i, a, 1, X, 1); + + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); + Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); + } + + temp[0] = a[i * 2 + 0] * X[i * 2 + 0]; + temp[1] = a[i * 2 + 0] * X[i * 2 + 1]; + + Y[i * 2 + 0] += alpha_r * temp[0] - alpha_i * temp[1]; + Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; + + if (i > 0) { + AXPYC_K(i, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + a, 1, Y, 1, NULL, 0); + } + + a += (i + 1) * 2; + +#else + + if (m - i > 1) { + FLOAT _Complex result = DOTU_K(m - i - 1, a + (i + 1) * 2, 1, X + (i + 1) * 2, 1); + + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); + Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); + } + + temp[0] = a[i * 2 + 0] * X[i * 2 + 0]; + temp[1] = a[i * 2 + 0] * X[i * 2 + 1]; + + Y[i * 2 + 0] += alpha_r * temp[0] - alpha_i * temp[1]; + Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; + + if (m - i > 1) { + AXPYC_K(m - i - 1, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + a + (i + 1) * 2, 1, Y + (i + 1) * 2, 1, NULL, 0); + } + + a += (m - i - 1) * 2; + +#endif +#endif + + } + + if (incy != 1) { + COPY_K(m, Y, 1, y, incy); + } + + return 0; +} + diff --git a/driver/level2/zhpr2_k.c b/driver/level2/zhpr2_k.c new file mode 100644 index 0000000..f4608ff --- /dev/null +++ b/driver/level2/zhpr2_k.c @@ -0,0 +1,117 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *x, BLASLONG incx, + FLOAT *y, BLASLONG incy, FLOAT *a, FLOAT *buffer){ + + BLASLONG i; + FLOAT *X, *Y; + + X = x; + Y = y; + + if (incx != 1) { + COPY_K(m, x, incx, buffer, 1); + X = buffer; + } + + if (incy != 1) { + COPY_K(m, y, incy, (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)), 1); + Y = (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)); + } + + for (i = 0; i < m; i++){ +#ifndef HEMVREV +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + - alpha_i * X[i * 2 + 0] - alpha_r * X[i * 2 + 1], + Y, 1, a, 1, NULL, 0); + AXPYU_K(i + 1, 0, 0, + alpha_r * Y[i * 2 + 0] + alpha_i * Y[i * 2 + 1], + alpha_i * Y[i * 2 + 0] - alpha_r * Y[i * 2 + 1], + X, 1, a, 1, NULL, 0); + a[i * 2 + 1] = ZERO; + a += (i + 1) * 2; +#else + AXPYU_K(m - i, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + - alpha_i * X[i * 2 + 0] - alpha_r * X[i * 2 + 1], + Y + i * 2, 1, a, 1, NULL, 0); + AXPYU_K(m - i, 0, 0, + alpha_r * Y[i * 2 + 0] + alpha_i * Y[i * 2 + 1], + alpha_i * Y[i * 2 + 0] - alpha_r * Y[i * 2 + 1], + X + i * 2, 1, a, 1, NULL, 0); + a[1] = ZERO; + a += (m - i) * 2; +#endif +#else +#ifndef LOWER + AXPYC_K(i + 1, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], + Y, 1, a, 1, NULL, 0); + AXPYC_K(i + 1, 0, 0, + alpha_r * Y[i * 2 + 0] + alpha_i * Y[i * 2 + 1], + - alpha_i * Y[i * 2 + 0] + alpha_r * Y[i * 2 + 1], + X, 1, a, 1, NULL, 0); + a[i * 2 + 1] = ZERO; + a += (i + 1) * 2; +#else + AXPYC_K(m - i, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], + Y + i * 2, 1, a, 1, NULL, 0); + AXPYC_K(m - i, 0, 0, + alpha_r * Y[i * 2 + 0] + alpha_i * Y[i * 2 + 1], + - alpha_i * Y[i * 2 + 0] + alpha_r * Y[i * 2 + 1], + X + i * 2, 1, a, 1, NULL, 0); + a[1] = ZERO; + a += (m - i) * 2; +#endif +#endif + } + + + return 0; +} diff --git a/driver/level2/zhpr_k.c b/driver/level2/zhpr_k.c new file mode 100644 index 0000000..c564d49 --- /dev/null +++ b/driver/level2/zhpr_k.c @@ -0,0 +1,79 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha, FLOAT *x, + BLASLONG incx, FLOAT *a, FLOAT *buffer){ + + BLASLONG i; + FLOAT *X; + + X = x; + + if (incx != 1) { + COPY_K(m, x, incx, buffer, 1); + X = buffer; + } + + for (i = 0; i < m; i++){ +#ifndef HEMVREV +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, alpha * X[i * 2 + 0], -alpha * X[i * 2 + 1], X, 1, a, 1, NULL, 0); + a[i * 2 + 1] = ZERO; + a += (i + 1) * 2; +#else + AXPYU_K(m - i, 0, 0, alpha * X[i * 2 + 0], -alpha * X[i * 2 + 1], X + i * 2, 1, a, 1, NULL, 0); + a[1] = ZERO; + a += (m - i) * 2; +#endif +#else +#ifndef LOWER + AXPYC_K(i + 1, 0, 0, alpha * X[i * 2 + 0], alpha * X[i * 2 + 1], X, 1, a, 1, NULL, 0); + a[i * 2 + 1] = ZERO; + a += (i + 1) * 2; +#else + AXPYC_K(m - i, 0, 0, alpha * X[i * 2 + 0], alpha * X[i * 2 + 1], X + i * 2, 1, a, 1, NULL, 0); + a[1] = ZERO; + a += (m - i) * 2; +#endif +#endif + } + + return 0; +} diff --git a/driver/level2/zsbmv_k.c b/driver/level2/zsbmv_k.c new file mode 100644 index 0000000..de5dfdd --- /dev/null +++ b/driver/level2/zsbmv_k.c @@ -0,0 +1,119 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, + FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){ + + BLASLONG i, length; +#ifndef LOWER + BLASLONG offset; +#endif + + FLOAT *X = x; + FLOAT *Y = y; + FLOAT *sbmvbuffer = (FLOAT *)buffer; + FLOAT *bufferY = sbmvbuffer; + FLOAT *bufferX = sbmvbuffer; + + if (incy != 1) { + Y = bufferY; + bufferX = (FLOAT *)(((BLASLONG)bufferY + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095); + sbmvbuffer = bufferX; + COPY_K(n, y, incy, Y, 1); + } + + if (incx != 1) { + X = bufferX; + sbmvbuffer = (FLOAT *)(((BLASLONG)bufferX + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095); + COPY_K(n, x, incx, X, 1); + } + +#ifndef LOWER + offset = k; +#endif + + for (i = 0; i < n; i++) { + +#ifndef LOWER + length = k - offset; + + AXPYU_K(length + 1, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + a + offset * COMPSIZE, 1, Y + (i - length) * COMPSIZE, 1, NULL, 0); + + if (length > 0) { + FLOAT _Complex result = DOTU_K(length, a + offset * COMPSIZE, 1, X + (i - length) * COMPSIZE, 1); + + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); + Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); + } + + if (offset > 0) offset --; +#else + length = k; + if (n - i - 1 < k) length = n - i - 1; + + AXPYU_K(length + 1, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + a, 1, Y + i * COMPSIZE, 1, NULL, 0); + + if (length > 0) { + FLOAT _Complex result = DOTU_K(length, a + COMPSIZE, 1, X + (i + 1) * COMPSIZE, 1); + + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); + Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); + } +#endif + + a += lda * 2; + } + + if (incy != 1) { + COPY_K(n, Y, 1, y, incy); + } + + return 0; +} + diff --git a/driver/level2/zspmv_k.c b/driver/level2/zspmv_k.c new file mode 100644 index 0000000..c93b1e1 --- /dev/null +++ b/driver/level2/zspmv_k.c @@ -0,0 +1,108 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){ + + BLASLONG i; + FLOAT *X = x; + FLOAT *Y = y; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *bufferY = gemvbuffer; + FLOAT *bufferX = gemvbuffer; + FLOAT _Complex result; + + if (incy != 1) { + Y = bufferY; + bufferX = (FLOAT *)(((BLASLONG)bufferY + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + gemvbuffer = bufferX; + COPY_K(m, y, incy, Y, 1); + } + + if (incx != 1) { + X = bufferX; + gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + COPY_K(m, x, incx, X, 1); + } + + for (i = 0; i < m; i++) { +#ifndef LOWER + + if (i > 0) { + result = DOTU_K(i, a, 1, X, 1); + + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); + Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); + } + + AXPYU_K(i + 1, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + a, 1, Y, 1, NULL, 0); + + a += (i + 1) * 2; + +#else + + result = DOTU_K(m - i, a + i * 2, 1, X + i * 2, 1); + + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); + Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); + + if (m - i > 1) + AXPYU_K(m - i - 1, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + a + (i + 1) * 2, 1, Y + (i + 1) * 2, 1, NULL, 0); + + a += (m - i - 1) * 2; + +#endif + } + + if (incy != 1) { + COPY_K(m, Y, 1, y, incy); + } + + return 0; +} + diff --git a/driver/level2/zspr2_k.c b/driver/level2/zspr2_k.c new file mode 100644 index 0000000..48c81a3 --- /dev/null +++ b/driver/level2/zspr2_k.c @@ -0,0 +1,87 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG incx, + FLOAT *y, BLASLONG incy, FLOAT *a, FLOAT *buffer){ + + BLASLONG i; + FLOAT *X, *Y; + + X = x; + Y = y; + + if (incx != 1) { + COPY_K(m, x, incx, buffer, 1); + X = buffer; + } + + if (incy != 1) { + COPY_K(m, y, incy, (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)), 1); + Y = (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)); + } + + for (i = 0; i < m; i++){ +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], + Y, 1, a, 1, NULL, 0); + AXPYU_K(i + 1, 0, 0, + alpha_r * Y[i * 2 + 0] - alpha_i * Y[i * 2 + 1], + alpha_i * Y[i * 2 + 0] + alpha_r * Y[i * 2 + 1], + X, 1, a, 1, NULL, 0); + a += (i + 1) * 2; +#else + AXPYU_K(m - i, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], + Y + i * 2, 1, a, 1, NULL, 0); + AXPYU_K(m - i, 0, 0, + alpha_r * Y[i * 2 + 0] - alpha_i * Y[i * 2 + 1], + alpha_i * Y[i * 2 + 0] + alpha_r * Y[i * 2 + 1], + X + i * 2, 1, a, 1, NULL, 0); + a += (m - i) * 2; +#endif + } + + return 0; +} diff --git a/driver/level2/zspr_k.c b/driver/level2/zspr_k.c new file mode 100644 index 0000000..a187bdb --- /dev/null +++ b/driver/level2/zspr_k.c @@ -0,0 +1,75 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *buffer){ + + BLASLONG i; + FLOAT *X; + + X = x; + + if (incx != 1) { + COPY_K(m, x, incx, buffer, 1); + X = buffer; + } + + for (i = 0; i < m; i++){ +#ifndef LOWER + if ((X[i * 2 + 0] != ZERO) && (X[i * 2 + 1] != ZERO)) { + AXPYU_K(i + 1, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], + X, 1, a, 1, NULL, 0); + } + a += (i + 1) * 2; +#else + if ((X[i * 2 + 0] != ZERO) && (X[i * 2 + 1] != ZERO)) { + AXPYU_K(m - i, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], + X + i * 2, 1, a, 1, NULL, 0); + } + a += (m - i) * 2; +#endif + } + + return 0; +} diff --git a/driver/level2/zsyr2_k.c b/driver/level2/zsyr2_k.c new file mode 100644 index 0000000..f7bbbb2 --- /dev/null +++ b/driver/level2/zsyr2_k.c @@ -0,0 +1,89 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG incx, + FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer){ + + BLASLONG i; + FLOAT *X, *Y; + + X = x; + Y = y; + + lda *= 2; + + if (incx != 1) { + COPY_K(m, x, incx, buffer, 1); + X = buffer; + } + + if (incy != 1) { + COPY_K(m, y, incy, (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)), 1); + Y = (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)); + } + + for (i = 0; i < m; i++){ +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], + Y, 1, a, 1, NULL, 0); + AXPYU_K(i + 1, 0, 0, + alpha_r * Y[i * 2 + 0] - alpha_i * Y[i * 2 + 1], + alpha_i * Y[i * 2 + 0] + alpha_r * Y[i * 2 + 1], + X, 1, a, 1, NULL, 0); + a += lda; +#else + AXPYU_K(m - i, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], + Y + i * 2, 1, a, 1, NULL, 0); + AXPYU_K(m - i, 0, 0, + alpha_r * Y[i * 2 + 0] - alpha_i * Y[i * 2 + 1], + alpha_i * Y[i * 2 + 0] + alpha_r * Y[i * 2 + 1], + X + i * 2, 1, a, 1, NULL, 0); + a += 2 + lda; +#endif + } + + return 0; +} diff --git a/driver/level2/zsyr_k.c b/driver/level2/zsyr_k.c new file mode 100644 index 0000000..9d800d3 --- /dev/null +++ b/driver/level2/zsyr_k.c @@ -0,0 +1,76 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG lda, FLOAT *buffer){ + + BLASLONG i; + FLOAT *X; + + X = x; + lda *= 2; + + if (incx != 1) { + COPY_K(m, x, incx, buffer, 1); + X = buffer; + } + + for (i = 0; i < m; i++){ +#ifndef LOWER + if ((X[i * 2 + 0] != ZERO) || (X[i * 2 + 1] != ZERO)) { + AXPYU_K(i + 1, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], + X, 1, a, 1, NULL, 0); + } + a += lda; +#else + if ((X[i * 2 + 0] != ZERO) || (X[i * 2 + 1] != ZERO)) { + AXPYU_K(m - i, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], + X + i * 2, 1, a, 1, NULL, 0); + } + a += 2 + lda; +#endif + } + + return 0; +} diff --git a/driver/level2/ztbmv_L.c b/driver/level2/ztbmv_L.c new file mode 100644 index 0000000..9b604c0 --- /dev/null +++ b/driver/level2/ztbmv_L.c @@ -0,0 +1,131 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dp1 = 1.; + +int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + BLASLONG length; +#if (TRANSA == 2) || (TRANSA == 4) + FLOAT _Complex temp; +#endif +#ifndef UNIT + FLOAT atemp1, atemp2, btemp1, btemp2; +#endif + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE+ 4095) & ~4095); + COPY_K(n, b, incb, buffer, 1); + } + + a += (n - 1) * lda * COMPSIZE; + + for (i = n - 1; i >= 0; i--) { + +#if (TRANSA == 1) || (TRANSA == 3) + length = n - i - 1; + if (length > k) length = k; + + if (length > 0) { +#if TRANSA == 1 + AXPYU_K(length, 0, 0, + B[i * 2 + 0], B[i * 2 + 1], + a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1, NULL, 0); +#else + AXPYC_K(length, 0, 0, + B[i * 2 + 0], B[i * 2 + 1], + a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1, NULL, 0); +#endif + } +#endif + +#ifndef UNIT +#if (TRANSA == 1) || (TRANSA == 3) + atemp1 = a[0]; + atemp2 = a[1]; +#else + atemp1 = a[k * 2 + 0]; + atemp2 = a[k * 2 + 1]; +#endif + + btemp1 = B[i * 2 + 0]; + btemp2 = B[i * 2 + 1]; + +#if (TRANSA == 1) || (TRANSA == 2) + B[i * 2 + 0] = atemp1 * btemp1 - atemp2 * btemp2; + B[i * 2 + 1] = atemp1 * btemp2 + atemp2 * btemp1; +#else + B[i * 2 + 0] = atemp1 * btemp1 + atemp2 * btemp2; + B[i * 2 + 1] = atemp1 * btemp2 - atemp2 * btemp1; +#endif +#endif + +#if (TRANSA == 2) || (TRANSA == 4) + length = i; + if (length > k) length = k; + + if (length > 0) { +#if TRANSA == 2 + temp = DOTU_K(length, a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1); +#else + temp = DOTC_K(length, a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1); +#endif + + B[i * 2 + 0] += CREAL(temp); + B[i * 2 + 1] += CIMAG(temp); + } +#endif + + a -= lda * COMPSIZE; + } + + if (incb != 1) { + COPY_K(n, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/ztbmv_U.c b/driver/level2/ztbmv_U.c new file mode 100644 index 0000000..4e86f4f --- /dev/null +++ b/driver/level2/ztbmv_U.c @@ -0,0 +1,130 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dp1 = 1.; + +int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + BLASLONG length; +#if (TRANSA == 2) || (TRANSA == 4) + FLOAT _Complex temp; +#endif +#ifndef UNIT + FLOAT atemp1, atemp2, btemp1, btemp2; +#endif + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095); + COPY_K(n, b, incb, buffer, 1); + } + + for (i = 0; i < n; i++) { + +#if (TRANSA == 1) || (TRANSA == 3) + length = i; + if (length > k) length = k; + + if (length > 0) { +#if TRANSA == 1 + AXPYU_K(length, 0, 0, + B[i * 2 + 0], B[i * 2 + 1], + a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1, NULL, 0); +#else + AXPYC_K(length, 0, 0, + B[i * 2 + 0], B[i * 2 + 1], + a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1, NULL, 0); +#endif + + } +#endif + +#ifndef UNIT +#if (TRANSA == 1) || (TRANSA == 3) + atemp1 = a[k * 2 + 0]; + atemp2 = a[k * 2 + 1]; +#else + atemp1 = a[0]; + atemp2 = a[1]; +#endif + + btemp1 = B[i * 2 + 0]; + btemp2 = B[i * 2 + 1]; + +#if (TRANSA == 1) || (TRANSA == 2) + B[i * 2 + 0] = atemp1 * btemp1 - atemp2 * btemp2; + B[i * 2 + 1] = atemp1 * btemp2 + atemp2 * btemp1; +#else + B[i * 2 + 0] = atemp1 * btemp1 + atemp2 * btemp2; + B[i * 2 + 1] = atemp1 * btemp2 - atemp2 * btemp1; +#endif +#endif + +#if (TRANSA == 2) || (TRANSA == 4) + length = n - i - 1; + if (length > k) length = k; + + if (length > 0) { +#if TRANSA == 2 + temp = DOTU_K(length, a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1); +#else + temp = DOTC_K(length, a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1); +#endif + + B[i * 2 + 0] += CREAL(temp); + B[i * 2 + 1] += CIMAG(temp); + } +#endif + + a += lda * COMPSIZE; + } + + if (incb != 1) { + COPY_K(n, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/ztbsv_L.c b/driver/level2/ztbsv_L.c new file mode 100644 index 0000000..f32ddff --- /dev/null +++ b/driver/level2/ztbsv_L.c @@ -0,0 +1,145 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dp1 = 1.; + +int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + BLASLONG length; +#if (TRANSA == 2) || (TRANSA == 4) + FLOAT _Complex temp; +#endif +#ifndef UNIT + FLOAT ar, ai, br, bi, ratio, den; +#endif + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095); + COPY_K(n, b, incb, buffer, 1); + } + + for (i = 0; i < n; i++) { + +#if (TRANSA == 2) || (TRANSA == 4) + length = i; + if (length > k) length = k; + + if (length > 0) { +#if TRANSA == 2 + temp = DOTU_K(length, a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1); +#else + temp = DOTC_K(length, a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1); +#endif + + B[i * 2 + 0] -= CREAL(temp); + B[i * 2 + 1] -= CIMAG(temp); + } +#endif + +#ifndef UNIT +#if (TRANSA == 1) || (TRANSA == 3) + ar = a[0]; + ai = a[1]; +#else + ar = a[k * 2 + 0]; + ai = a[k * 2 + 1]; +#endif + + if (fabs(ar) >= fabs(ai)){ + ratio = ai / ar; + den = 1./(ar * ( 1 + ratio * ratio)); + + ar = den; +#if TRANSA < 3 + ai = -ratio * den; +#else + ai = ratio * den; +#endif + } else { + ratio = ar / ai; + den = 1./(ai * ( 1 + ratio * ratio)); + ar = ratio * den; +#if TRANSA < 3 + ai = -den; +#else + ai = den; +#endif + } + + br = B[i * 2 + 0]; + bi = B[i * 2 + 1]; + + B[i * 2 + 0] = ar*br - ai*bi; + B[i * 2 + 1] = ar*bi + ai*br; +#endif + +#if (TRANSA == 1) || (TRANSA == 3) + length = n - i - 1; + if (length > k) length = k; + + if (length > 0) { +#if TRANSA == 1 + AXPYU_K(length, 0, 0, + -B[i * 2 + 0], -B[i * 2 + 1], + a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1, NULL, 0); +#else + AXPYC_K(length, 0, 0, + -B[i * 2 + 0], -B[i * 2 + 1], + a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1, NULL, 0); +#endif + } +#endif + + a += lda * COMPSIZE; + } + + if (incb != 1) { + COPY_K(n, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/ztbsv_U.c b/driver/level2/ztbsv_U.c new file mode 100644 index 0000000..252f3ba --- /dev/null +++ b/driver/level2/ztbsv_U.c @@ -0,0 +1,148 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dp1 = 1.; + +int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + BLASLONG length; +#if (TRANSA == 2) || (TRANSA == 4) + FLOAT _Complex temp; +#endif +#ifndef UNIT + FLOAT ar, ai, br, bi, ratio, den; +#endif + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE+ 4095) & ~4095); + COPY_K(n, b, incb, buffer, 1); + } + + a += (n - 1) * lda * COMPSIZE; + + for (i = n - 1; i >= 0; i--) { + +#if (TRANSA == 2) || (TRANSA == 4) + length = n - i - 1; + if (length > k) length = k; + + if (length > 0) { +#if TRANSA == 2 + temp = DOTU_K(length, a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1); +#else + temp = DOTC_K(length, a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1); +#endif + + B[i * 2 + 0] -= CREAL(temp); + B[i * 2 + 1] -= CIMAG(temp); + } +#endif + +#ifndef UNIT +#if (TRANSA == 1) || (TRANSA == 3) + ar = a[k * 2 + 0]; + ai = a[k * 2 + 1]; +#else + ar = a[0]; + ai = a[1]; +#endif + + if (fabs(ar) >= fabs(ai)){ + ratio = ai / ar; + den = 1./(ar * ( 1 + ratio * ratio)); + + ar = den; +#if TRANSA < 3 + ai = -ratio * den; +#else + ai = ratio * den; +#endif + } else { + ratio = ar / ai; + den = 1./(ai * ( 1 + ratio * ratio)); + ar = ratio * den; +#if TRANSA < 3 + ai = -den; +#else + ai = den; +#endif + } + + br = B[i * 2 + 0]; + bi = B[i * 2 + 1]; + + B[i * 2 + 0] = ar*br - ai*bi; + B[i * 2 + 1] = ar*bi + ai*br; +#endif + +#if (TRANSA == 1) || (TRANSA == 3) + length = i; + if (length > k) length = k; + + if (length > 0) { +#if TRANSA == 1 + AXPYU_K(length, 0, 0, + -B[i * 2 + 0], -B[i * 2 + 1], + a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1, NULL, 0); +#else + AXPYC_K(length, 0, 0, + -B[i * 2 + 0], -B[i * 2 + 1], + a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1, NULL, 0); +#endif + + } +#endif + + a -= lda * COMPSIZE; + } + + if (incb != 1) { + COPY_K(n, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/ztpmv_L.c b/driver/level2/ztpmv_L.c new file mode 100644 index 0000000..62b9dc6 --- /dev/null +++ b/driver/level2/ztpmv_L.c @@ -0,0 +1,121 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; +#if (TRANSA == 2) || (TRANSA == 4) + FLOAT _Complex temp; +#endif +#ifndef UNIT + FLOAT atemp1, atemp2, btemp1, btemp2; +#endif + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + COPY_K(m, b, incb, buffer, 1); + } + + a += (m + 1) * m - 2; + + for (i = 0; i < m; i++) { + +#if (TRANSA == 1) || (TRANSA == 3) +#if TRANSA == 1 + if (i > 0) AXPYU_K (i, 0, 0, + B[(m - i - 1) * 2 + 0], B[(m - i - 1) * 2 + 1], + a + 2, 1, B + (m - i) * 2, 1, NULL, 0); +#else + if (i > 0) AXPYC_K(i, 0, 0, + B[(m - i - 1) * 2 + 0], B[(m - i - 1) * 2 + 1], + a + 2, 1, B + (m - i) * 2, 1, NULL, 0); +#endif +#endif + +#ifndef UNIT + atemp1 = a[0]; + atemp2 = a[1]; + + btemp1 = B[(m - i - 1) * 2 + 0]; + btemp2 = B[(m - i - 1) * 2 + 1]; + +#if (TRANSA == 1) || (TRANSA == 2) + B[(m - i - 1) * 2 + 0] = atemp1 * btemp1 - atemp2 * btemp2; + B[(m - i - 1) * 2 + 1] = atemp1 * btemp2 + atemp2 * btemp1; +#else + B[(m - i - 1) * 2 + 0] = atemp1 * btemp1 + atemp2 * btemp2; + B[(m - i - 1) * 2 + 1] = atemp1 * btemp2 - atemp2 * btemp1; +#endif +#endif + +#if (TRANSA == 2) || (TRANSA == 4) + if (i < m - 1) { +#if TRANSA == 2 + temp = DOTU_K(m - i - 1, a - (m - i - 1) * 2, 1, B, 1); +#else + temp = DOTC_K(m - i - 1, a - (m - i - 1) * 2, 1, B, 1); +#endif + + B[(m - i - 1) * 2 + 0] += CREAL(temp); + B[(m - i - 1) * 2 + 1] += CIMAG(temp); + } +#endif + +#if (TRANSA == 1) || (TRANSA == 3) + a -= (i + 2) * 2; +#else + a -= (m - i) * 2; +#endif + + } + + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/ztpmv_U.c b/driver/level2/ztpmv_U.c new file mode 100644 index 0000000..2ff3bfb --- /dev/null +++ b/driver/level2/ztpmv_U.c @@ -0,0 +1,124 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; +#if (TRANSA == 2) || (TRANSA == 4) + FLOAT _Complex temp; +#endif +#ifndef UNIT + FLOAT atemp1, atemp2, btemp1, btemp2; +#endif + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + COPY_K(m, b, incb, buffer, 1); + } + + for (i = 0; i < m; i++) { + +#if (TRANSA == 1) || (TRANSA == 3) +#if TRANSA == 1 + if (i > 0) AXPYU_K (i, 0, 0, B[i * 2 + 0], B[i * 2 + 1], + a, 1, B, 1, NULL, 0); +#else + if (i > 0) AXPYC_K(i, 0, 0, B[i * 2 + 0], B[i * 2 + 1], + a, 1, B, 1, NULL, 0); +#endif +#endif + +#ifndef UNIT +#if (TRANSA == 1) || (TRANSA == 3) + atemp1 = a[i * 2 + 0]; + atemp2 = a[i * 2 + 1]; +#else + atemp1 = a[0]; + atemp2 = a[1]; +#endif + + btemp1 = B[i * 2 + 0]; + btemp2 = B[i * 2 + 1]; + +#if (TRANSA == 1) || (TRANSA == 2) + B[i * 2 + 0] = atemp1 * btemp1 - atemp2 * btemp2; + B[i * 2 + 1] = atemp1 * btemp2 + atemp2 * btemp1; +#else + B[i * 2 + 0] = atemp1 * btemp1 + atemp2 * btemp2; + B[i * 2 + 1] = atemp1 * btemp2 - atemp2 * btemp1; +#endif +#endif + +#if (TRANSA == 2) || (TRANSA == 4) + if (i < m - 1) { +#if TRANSA == 2 + temp = DOTU_K(m - i - 1, + a + 2, 1, + B + (i + 1) * 2, 1); +#else + temp = DOTC_K(m - i - 1, + a + 2, 1, + B + (i + 1) * 2, 1); +#endif + + B[i * 2 + 0] += CREAL(temp); + B[i * 2 + 1] += CIMAG(temp); + } +#endif + +#if (TRANSA == 1) || (TRANSA == 3) + a += (i + 1) * 2; +#else + a += (m - i) * 2; +#endif + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/ztpsv_L.c b/driver/level2/ztpsv_L.c new file mode 100644 index 0000000..e9317fb --- /dev/null +++ b/driver/level2/ztpsv_L.c @@ -0,0 +1,142 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dm1 = -1.; + +int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; +#if (TRANSA == 2) || (TRANSA == 4) + FLOAT _Complex result; +#endif +#ifndef UNIT + FLOAT ar, ai, br, bi, ratio, den; +#endif + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + COPY_K(m, b, incb, buffer, 1); + } + + for (i = 0; i < m; i++) { + +#if (TRANSA == 2) || (TRANSA == 4) + if (i > 0) { +#if TRANSA == 2 + result = DOTU_K(i, a, 1, B, 1); +#else + result = DOTC_K(i, a, 1, B, 1); +#endif + + B[i * COMPSIZE + 0] -= CREAL(result); + B[i * COMPSIZE + 1] -= CIMAG(result); + } +#endif + +#ifndef UNIT +#if (TRANSA == 1) || (TRANSA == 3) + ar = a[0]; + ai = a[1]; +#else + ar = a[i * COMPSIZE + 0]; + ai = a[i * COMPSIZE + 1]; +#endif + + if (fabs(ar) >= fabs(ai)){ + ratio = ai / ar; + den = 1./(ar * ( 1 + ratio * ratio)); + + ar = den; +#if TRANSA < 3 + ai = -ratio * den; +#else + ai = ratio * den; +#endif + } else { + ratio = ar / ai; + den = 1./(ai * ( 1 + ratio * ratio)); + ar = ratio * den; +#if TRANSA < 3 + ai = -den; +#else + ai = den; +#endif + } + + br = B[i * COMPSIZE + 0]; + bi = B[i * COMPSIZE + 1]; + + B[i * COMPSIZE + 0] = ar*br - ai*bi; + B[i * COMPSIZE + 1] = ar*bi + ai*br; +#endif + +#if (TRANSA == 1) || (TRANSA == 3) + if (i < m - 1) { +#if TRANSA == 1 + AXPYU_K(m - i - 1 , 0, 0, + - B[i * COMPSIZE + 0], - B[i * COMPSIZE + 1], + a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1, NULL, 0); +#else + AXPYC_K(m - i - 1 , 0, 0, + - B[i * COMPSIZE + 0], - B[i * COMPSIZE + 1], + a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1, NULL, 0); +#endif + } +#endif + +#if (TRANSA == 1) || (TRANSA == 3) + a += (m - i) * 2; +#else + a += (i + 1) * 2; +#endif + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/ztpsv_U.c b/driver/level2/ztpsv_U.c new file mode 100644 index 0000000..54903dc --- /dev/null +++ b/driver/level2/ztpsv_U.c @@ -0,0 +1,135 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; +#if (TRANSA == 2) || (TRANSA == 4) + FLOAT _Complex result; +#endif +#ifndef UNIT + FLOAT ar, ai, br, bi, ratio, den; +#endif + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + COPY_K(m, b, incb, buffer, 1); + } + + a += (m + 1) * m - 2; + + for (i = 0; i < m; i++) { + +#if (TRANSA == 2) || (TRANSA == 4) + if (i > 0) { +#if TRANSA == 2 + result = DOTU_K(i, a + 2, 1, B + (m - i) * 2, 1); +#else + result = DOTC_K(i, a + 2, 1, B + (m - i) * 2, 1); +#endif + + B[(m - i - 1) * 2 + 0] -= CREAL(result); + B[(m - i - 1) * 2 + 1] -= CIMAG(result); + } +#endif + +#ifndef UNIT + ar = a[0]; + ai = a[1]; + + if (fabs(ar) >= fabs(ai)){ + ratio = ai / ar; + den = 1./(ar * ( 1 + ratio * ratio)); + + ar = den; +#if (TRANSA == 1) || (TRANSA == 2) + ai = -ratio * den; +#else + ai = ratio * den; +#endif + } else { + ratio = ar / ai; + den = 1./(ai * ( 1 + ratio * ratio)); + ar = ratio * den; +#if (TRANSA == 1) || (TRANSA == 2) + ai = -den; +#else + ai = den; +#endif + } + + br = B[(m - i - 1) * 2 + 0]; + bi = B[(m - i - 1) * 2 + 1]; + + B[(m - i - 1) * 2 + 0] = ar*br - ai*bi; + B[(m - i - 1) * 2 + 1] = ar*bi + ai*br; +#endif + +#if (TRANSA == 1) || (TRANSA == 3) + if (i < m - 1) { +#if TRANSA == 1 + AXPYU_K (m - i - 1, 0, 0, - B[(m - i - 1) * 2 + 0], -B[(m - i - 1) * 2 + 1], + a - (m - i - 1) * COMPSIZE, 1, B, 1, NULL, 0); +#else + AXPYC_K (m - i - 1, 0, 0, - B[(m - i - 1) * 2 + 0], -B[(m - i - 1) * 2 + 1], + a - (m - i - 1) * COMPSIZE, 1, B, 1, NULL, 0); +#endif + } +#endif + +#if (TRANSA == 1) || (TRANSA == 3) + a -= (m - i) * 2; +#else + a -= (i + 2) * 2; +#endif + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/ztrmv_L.c b/driver/level2/ztrmv_L.c new file mode 100644 index 0000000..3688f58 --- /dev/null +++ b/driver/level2/ztrmv_L.c @@ -0,0 +1,149 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +static FLOAT dp1 = 1.; + +int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *buffer){ + + BLASLONG i, is, min_i; +#if (TRANSA == 2) || (TRANSA == 4) + FLOAT _Complex temp; +#endif +#ifndef UNIT + FLOAT atemp1, atemp2, btemp1, btemp2; +#endif + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + COPY_K(m, b, incb, buffer, 1); + } + + for (is = m; is > 0; is -= DTB_ENTRIES){ + + min_i = MIN(is, DTB_ENTRIES); + +#if (TRANSA == 1) || (TRANSA == 3) + if (m - is > 0){ +#if TRANSA == 1 + GEMV_N(m - is, min_i, 0, dp1, ZERO, + a + (is + (is - min_i) * lda) * 2, lda, + B + (is - min_i) * 2, 1, + B + is * 2, 1, gemvbuffer); +#else + GEMV_R(m - is, min_i, 0, dp1, ZERO, + a + (is + (is - min_i) * lda) * 2, lda, + B + (is - min_i) * 2, 1, + B + is * 2, 1, gemvbuffer); +#endif + } +#endif + + for (i = 0; i < min_i; i++) { + FLOAT *AA = a + ((is - i - 1) + (is - i - 1) * lda) * 2; + FLOAT *BB = B + (is - i - 1) * 2; + +#if (TRANSA == 1) || (TRANSA == 3) +#if TRANSA == 1 + if (i > 0) AXPYU_K (i, 0, 0, BB[0], BB[1], AA + 2, 1, BB + 2, 1, NULL, 0); +#else + if (i > 0) AXPYC_K(i, 0, 0, BB[0], BB[1], AA + 2, 1, BB + 2, 1, NULL, 0); +#endif +#endif + +#ifndef UNIT + atemp1 = AA[0]; + atemp2 = AA[1]; + + btemp1 = BB[0]; + btemp2 = BB[1]; + +#if (TRANSA == 1) || (TRANSA == 2) + BB[0] = atemp1 * btemp1 - atemp2 * btemp2; + BB[1] = atemp1 * btemp2 + atemp2 * btemp1; +#else + BB[0] = atemp1 * btemp1 + atemp2 * btemp2; + BB[1] = atemp1 * btemp2 - atemp2 * btemp1; +#endif +#endif + +#if (TRANSA == 2) || (TRANSA == 4) + if (i < min_i - 1) { +#if TRANSA == 2 + temp = DOTU_K(min_i - i - 1, AA - (min_i - i - 1) * 2, 1, BB - (min_i - i - 1) * 2, 1); +#else + temp = DOTC_K(min_i - i - 1, AA - (min_i - i - 1) * 2, 1, BB - (min_i - i - 1) * 2, 1); +#endif + + BB[0] += CREAL(temp); + BB[1] += CIMAG(temp); + } +#endif + + } + +#if (TRANSA == 2) || (TRANSA == 4) + if (is - min_i > 0){ +#if TRANSA == 2 + GEMV_T(is - min_i, min_i, 0, dp1, ZERO, + a + (is - min_i) * lda * 2, lda, + B, 1, + B + (is - min_i) * 2, 1, gemvbuffer); +#else + GEMV_C(is - min_i, min_i, 0, dp1, ZERO, + a + (is - min_i) * lda * 2, lda, + B, 1, + B + (is - min_i) * 2, 1, gemvbuffer); +#endif + } +#endif + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/ztrmv_U.c b/driver/level2/ztrmv_U.c new file mode 100644 index 0000000..a9fb6d1 --- /dev/null +++ b/driver/level2/ztrmv_U.c @@ -0,0 +1,155 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +static FLOAT dp1 = 1.; + +int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *buffer){ + + BLASLONG i, is, min_i; +#if (TRANSA == 2) || (TRANSA == 4) + FLOAT _Complex temp; +#endif +#ifndef UNIT + FLOAT atemp1, atemp2, btemp1, btemp2; +#endif + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + COPY_K(m, b, incb, buffer, 1); + } + + for (is =0; is < m; is += DTB_ENTRIES){ + + min_i = MIN(m - is, DTB_ENTRIES); + +#if (TRANSA) == 1 || (TRANSA == 3) + if (is > 0){ +#if TRANSA == 1 + GEMV_N(is, min_i, 0, dp1, ZERO, + a + is * lda * 2, lda, + B + is * 2, 1, + B, 1, gemvbuffer); +#else + GEMV_R(is, min_i, 0, dp1, ZERO, + a + is * lda * 2, lda, + B + is * 2, 1, + B, 1, gemvbuffer); +#endif + } +#endif + + for (i = 0; i < min_i; i++) { + FLOAT *AA = a + (is + (i + is) * lda) * 2; + FLOAT *BB = B + is * 2; + +#if (TRANSA == 1) || (TRANSA == 3) +#if TRANSA == 1 + if (i > 0) AXPYU_K (i, 0, 0, BB[i * 2 + 0], BB[i * 2 + 1], + AA, 1, BB, 1, NULL, 0); +#else + if (i > 0) AXPYC_K(i, 0, 0, BB[i * 2 + 0], BB[i * 2 + 1], + AA, 1, BB, 1, NULL, 0); +#endif +#endif + +#ifndef UNIT + atemp1 = AA[i * 2 + 0]; + atemp2 = AA[i * 2 + 1]; + + btemp1 = BB[i * 2 + 0]; + btemp2 = BB[i * 2 + 1]; + +#if (TRANSA == 1) || (TRANSA == 2) + BB[i * 2 + 0] = atemp1 * btemp1 - atemp2 * btemp2; + BB[i * 2 + 1] = atemp1 * btemp2 + atemp2 * btemp1; +#else + BB[i * 2 + 0] = atemp1 * btemp1 + atemp2 * btemp2; + BB[i * 2 + 1] = atemp1 * btemp2 - atemp2 * btemp1; +#endif +#endif + +#if (TRANSA == 2) || (TRANSA == 4) + if (i < min_i - 1) { +#if TRANSA == 2 + temp = DOTU_K(min_i - i - 1, + AA + (i + 1) * 2, 1, + BB + (i + 1) * 2, 1); +#else + temp = DOTC_K(min_i - i - 1, + AA + (i + 1) * 2, 1, + BB + (i + 1) * 2, 1); +#endif + + BB[i * 2 + 0] += CREAL(temp); + BB[i * 2 + 1] += CIMAG(temp); + } +#endif + + } + +#if (TRANSA) == 2 || (TRANSA == 4) + if (m - is > min_i){ +#if TRANSA == 2 + GEMV_T(m - is - min_i, min_i, 0, dp1, ZERO, + a + (is + min_i + is * lda) * 2, lda, + B + (is + min_i) * 2, 1, + B + is * 2, 1, gemvbuffer); +#else + GEMV_C(m - is - min_i, min_i, 0, dp1, ZERO, + a + (is + min_i + is * lda) * 2, lda, + B + (is + min_i) * 2, 1, + B + is * 2, 1, gemvbuffer); +#endif + } +#endif + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/ztrsv_L.c b/driver/level2/ztrsv_L.c new file mode 100644 index 0000000..f825c61 --- /dev/null +++ b/driver/level2/ztrsv_L.c @@ -0,0 +1,171 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dm1 = -1.; + +int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i, is, min_i; +#if (TRANSA == 2) || (TRANSA == 4) + FLOAT _Complex result; +#endif +#ifndef UNIT + FLOAT ar, ai, br, bi, ratio, den; +#endif + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + COPY_K(m, b, incb, buffer, 1); + } + + for (is =0; is < m; is += DTB_ENTRIES){ + + min_i = MIN(m - is, DTB_ENTRIES); + +#if (TRANSA == 2) || (TRANSA == 4) + if (is > 0){ +#if TRANSA == 2 + GEMV_T(is, min_i, 0, dm1, ZERO, + a + is * lda * COMPSIZE, lda, + B, 1, + B + is * COMPSIZE, 1, gemvbuffer); +#else + GEMV_C(is, min_i, 0, dm1, ZERO, + a + is * lda * COMPSIZE, lda, + B, 1, + B + is * COMPSIZE, 1, gemvbuffer); +#endif + } +#endif + + for (i = 0; i < min_i; i++) { + FLOAT *AA = a + (is + (i + is) * lda) * COMPSIZE; + FLOAT *BB = B + is * COMPSIZE; + +#if (TRANSA == 2) || (TRANSA == 4) + if (i > 0) { +#if TRANSA == 2 + result = DOTU_K(i, AA, 1, BB, 1); +#else + result = DOTC_K(i, AA, 1, BB, 1); +#endif + + BB[i * COMPSIZE + 0] -= CREAL(result); + BB[i * COMPSIZE + 1] -= CIMAG(result); + } +#endif + +#ifndef UNIT + ar = AA[i * COMPSIZE + 0]; + ai = AA[i * COMPSIZE + 1]; + + if (fabs(ar) >= fabs(ai)){ + ratio = ai / ar; + den = 1./(ar * ( 1 + ratio * ratio)); + + ar = den; +#if TRANSA < 3 + ai = -ratio * den; +#else + ai = ratio * den; +#endif + } else { + ratio = ar / ai; + den = 1./(ai * ( 1 + ratio * ratio)); + ar = ratio * den; +#if TRANSA < 3 + ai = -den; +#else + ai = den; +#endif + } + + br = BB[i * COMPSIZE + 0]; + bi = BB[i * COMPSIZE + 1]; + + BB[i * COMPSIZE + 0] = ar*br - ai*bi; + BB[i * COMPSIZE + 1] = ar*bi + ai*br; +#endif + + +#if (TRANSA == 1) || (TRANSA == 3) + if (i < min_i - 1) { +#if TRANSA == 1 + AXPYU_K(min_i - i - 1 , 0, 0, + - BB[i * COMPSIZE + 0], - BB[i * COMPSIZE + 1], + AA + (i + 1) * COMPSIZE, 1, BB + (i + 1) * COMPSIZE, 1, NULL, 0); +#else + AXPYC_K(min_i - i - 1 , 0, 0, + - BB[i * COMPSIZE + 0], - BB[i * COMPSIZE + 1], + AA + (i + 1) * COMPSIZE, 1, BB + (i + 1) * COMPSIZE, 1, NULL, 0); +#endif + } +#endif + } + +#if (TRANSA == 1) || (TRANSA == 3) + if (m - is > min_i){ +#if TRANSA == 1 + GEMV_N(m - is - min_i, min_i, 0, dm1, ZERO, + a + (is + min_i + is * lda) * COMPSIZE, lda, + B + is * COMPSIZE, 1, + B + (is + min_i) * COMPSIZE, 1, gemvbuffer); +#else + GEMV_R(m - is - min_i, min_i, 0, dm1, ZERO, + a + (is + min_i + is * lda) * COMPSIZE, lda, + B + is * COMPSIZE, 1, + B + (is + min_i) * COMPSIZE, 1, gemvbuffer); +#endif + } +#endif + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/ztrsv_U.c b/driver/level2/ztrsv_U.c new file mode 100644 index 0000000..3b750a2 --- /dev/null +++ b/driver/level2/ztrsv_U.c @@ -0,0 +1,168 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dm1 = -1.; + +int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i, is, min_i; +#if (TRANSA == 2) || (TRANSA == 4) + FLOAT _Complex result; +#endif +#ifndef UNIT + FLOAT ar, ai, br, bi, ratio, den; +#endif + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + COPY_K(m, b, incb, buffer, 1); + } + + for (is = m; is > 0; is -= DTB_ENTRIES){ + + min_i = MIN(is, DTB_ENTRIES); + +#if (TRANSA == 2) || (TRANSA == 4) + if (m - is > 0){ +#if TRANSA == 2 + GEMV_T(m - is, min_i, 0, dm1, ZERO, + a + (is + (is - min_i) * lda) * COMPSIZE, lda, + B + is * COMPSIZE, 1, + B + (is - min_i) * COMPSIZE, 1, gemvbuffer); +#else + GEMV_C(m - is, min_i, 0, dm1, ZERO, + a + (is + (is - min_i) * lda) * COMPSIZE, lda, + B + is * COMPSIZE, 1, + B + (is - min_i) * COMPSIZE, 1, gemvbuffer); +#endif + } +#endif + + for (i = 0; i < min_i; i++) { + FLOAT *AA = a + ((is - i - 1) + (is - i - 1) * lda) * COMPSIZE; + FLOAT *BB = B + (is - i - 1) * COMPSIZE; + +#if (TRANSA == 2) || (TRANSA == 4) + if (i > 0) { +#if TRANSA == 2 + result = DOTU_K(i, AA + 2, 1, BB + 2, 1); +#else + result = DOTC_K(i, AA + 2, 1, BB + 2, 1); +#endif + + BB[0] -= CREAL(result); + BB[1] -= CIMAG(result); + } +#endif + +#ifndef UNIT + ar = AA[0]; + ai = AA[1]; + + if (fabs(ar) >= fabs(ai)){ + ratio = ai / ar; + den = 1./(ar * ( 1 + ratio * ratio)); + + ar = den; +#if TRANSA < 3 + ai = -ratio * den; +#else + ai = ratio * den; +#endif + } else { + ratio = ar / ai; + den = 1./(ai * ( 1 + ratio * ratio)); + ar = ratio * den; +#if TRANSA < 3 + ai = -den; +#else + ai = den; +#endif + } + + br = BB[0]; + bi = BB[1]; + + BB[0] = ar*br - ai*bi; + BB[1] = ar*bi + ai*br; +#endif + +#if (TRANSA == 1) || (TRANSA == 3) + if (i < min_i - 1) { +#if TRANSA == 1 + AXPYU_K (min_i - i - 1, 0, 0, - BB[0], -BB[1], + AA - (min_i - i - 1) * COMPSIZE, 1, BB - (min_i - i - 1) * COMPSIZE, 1, NULL, 0); +#else + AXPYC_K(min_i - i - 1, 0, 0, - BB[0], -BB[1], + AA - (min_i - i - 1) * COMPSIZE, 1, BB - (min_i - i - 1) * COMPSIZE, 1, NULL, 0); +#endif + } +#endif + } + +#if (TRANSA == 1) || (TRANSA == 3) + if (is - min_i > 0){ +#if TRANSA == 1 + GEMV_N(is - min_i, min_i, 0, dm1, ZERO, + a + (is - min_i) * lda * COMPSIZE, lda, + B + (is - min_i) * COMPSIZE, 1, + B, 1, gemvbuffer); +#else + GEMV_R(is - min_i, min_i, 0, dm1, ZERO, + a + (is - min_i) * lda * COMPSIZE, lda, + B + (is - min_i) * COMPSIZE, 1, + B, 1, gemvbuffer); +#endif + } +#endif + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level3/._Makefile b/driver/level3/._Makefile new file mode 100644 index 0000000..53473c4 Binary files /dev/null and b/driver/level3/._Makefile differ diff --git a/driver/level3/._gemm.c b/driver/level3/._gemm.c new file mode 100644 index 0000000..8e34020 Binary files /dev/null and b/driver/level3/._gemm.c differ diff --git a/driver/level3/._gemm3m.c b/driver/level3/._gemm3m.c new file mode 100644 index 0000000..10825ef Binary files /dev/null and b/driver/level3/._gemm3m.c differ diff --git a/driver/level3/._gemm3m_level3.c b/driver/level3/._gemm3m_level3.c new file mode 100644 index 0000000..b9f68f7 Binary files /dev/null and b/driver/level3/._gemm3m_level3.c differ diff --git a/driver/level3/._gemm_thread_m.c b/driver/level3/._gemm_thread_m.c new file mode 100644 index 0000000..6752903 Binary files /dev/null and b/driver/level3/._gemm_thread_m.c differ diff --git a/driver/level3/._gemm_thread_mn.c b/driver/level3/._gemm_thread_mn.c new file mode 100644 index 0000000..6d82d3f Binary files /dev/null and b/driver/level3/._gemm_thread_mn.c differ diff --git a/driver/level3/._gemm_thread_n.c b/driver/level3/._gemm_thread_n.c new file mode 100644 index 0000000..a413539 Binary files /dev/null and b/driver/level3/._gemm_thread_n.c differ diff --git a/driver/level3/._gemm_thread_variable.c b/driver/level3/._gemm_thread_variable.c new file mode 100644 index 0000000..fb25c1c Binary files /dev/null and b/driver/level3/._gemm_thread_variable.c differ diff --git a/driver/level3/._hemm3m_k.c b/driver/level3/._hemm3m_k.c new file mode 100644 index 0000000..4149c54 Binary files /dev/null and b/driver/level3/._hemm3m_k.c differ diff --git a/driver/level3/._level3.c b/driver/level3/._level3.c new file mode 100644 index 0000000..6212ccb Binary files /dev/null and b/driver/level3/._level3.c differ diff --git a/driver/level3/._level3_gemm3m_thread.c b/driver/level3/._level3_gemm3m_thread.c new file mode 100644 index 0000000..66f0de6 Binary files /dev/null and b/driver/level3/._level3_gemm3m_thread.c differ diff --git a/driver/level3/._level3_syr2k.c b/driver/level3/._level3_syr2k.c new file mode 100644 index 0000000..692184d Binary files /dev/null and b/driver/level3/._level3_syr2k.c differ diff --git a/driver/level3/._level3_syrk.c b/driver/level3/._level3_syrk.c new file mode 100644 index 0000000..ce74b4e Binary files /dev/null and b/driver/level3/._level3_syrk.c differ diff --git a/driver/level3/._level3_syrk_threaded.c b/driver/level3/._level3_syrk_threaded.c new file mode 100644 index 0000000..510620e Binary files /dev/null and b/driver/level3/._level3_syrk_threaded.c differ diff --git a/driver/level3/._level3_thread.c b/driver/level3/._level3_thread.c new file mode 100644 index 0000000..375d7e3 Binary files /dev/null and b/driver/level3/._level3_thread.c differ diff --git a/driver/level3/._symm3m_k.c b/driver/level3/._symm3m_k.c new file mode 100644 index 0000000..f74fdd9 Binary files /dev/null and b/driver/level3/._symm3m_k.c differ diff --git a/driver/level3/._symm_k.c b/driver/level3/._symm_k.c new file mode 100644 index 0000000..ddd1d72 Binary files /dev/null and b/driver/level3/._symm_k.c differ diff --git a/driver/level3/._syr2k_k.c b/driver/level3/._syr2k_k.c new file mode 100644 index 0000000..5aa524c Binary files /dev/null and b/driver/level3/._syr2k_k.c differ diff --git a/driver/level3/._syr2k_kernel.c b/driver/level3/._syr2k_kernel.c new file mode 100644 index 0000000..6d7c11e Binary files /dev/null and b/driver/level3/._syr2k_kernel.c differ diff --git a/driver/level3/._syrk_k.c b/driver/level3/._syrk_k.c new file mode 100644 index 0000000..3e1c552 Binary files /dev/null and b/driver/level3/._syrk_k.c differ diff --git a/driver/level3/._syrk_kernel.c b/driver/level3/._syrk_kernel.c new file mode 100644 index 0000000..13bbd61 Binary files /dev/null and b/driver/level3/._syrk_kernel.c differ diff --git a/driver/level3/._syrk_thread.c b/driver/level3/._syrk_thread.c new file mode 100644 index 0000000..513c69f Binary files /dev/null and b/driver/level3/._syrk_thread.c differ diff --git a/driver/level3/._trmm_L.c b/driver/level3/._trmm_L.c new file mode 100644 index 0000000..f95d11d Binary files /dev/null and b/driver/level3/._trmm_L.c differ diff --git a/driver/level3/._trmm_R.c b/driver/level3/._trmm_R.c new file mode 100644 index 0000000..44328bd Binary files /dev/null and b/driver/level3/._trmm_R.c differ diff --git a/driver/level3/._trsm_L.c b/driver/level3/._trsm_L.c new file mode 100644 index 0000000..e62e7d2 Binary files /dev/null and b/driver/level3/._trsm_L.c differ diff --git a/driver/level3/._trsm_R.c b/driver/level3/._trsm_R.c new file mode 100644 index 0000000..db2da0e Binary files /dev/null and b/driver/level3/._trsm_R.c differ diff --git a/driver/level3/._zhemm_k.c b/driver/level3/._zhemm_k.c new file mode 100644 index 0000000..d2e2587 Binary files /dev/null and b/driver/level3/._zhemm_k.c differ diff --git a/driver/level3/._zher2k_k.c b/driver/level3/._zher2k_k.c new file mode 100644 index 0000000..43041c9 Binary files /dev/null and b/driver/level3/._zher2k_k.c differ diff --git a/driver/level3/._zher2k_kernel.c b/driver/level3/._zher2k_kernel.c new file mode 100644 index 0000000..3023e7f Binary files /dev/null and b/driver/level3/._zher2k_kernel.c differ diff --git a/driver/level3/._zherk_beta.c b/driver/level3/._zherk_beta.c new file mode 100644 index 0000000..ae48e67 Binary files /dev/null and b/driver/level3/._zherk_beta.c differ diff --git a/driver/level3/._zherk_k.c b/driver/level3/._zherk_k.c new file mode 100644 index 0000000..931003e Binary files /dev/null and b/driver/level3/._zherk_k.c differ diff --git a/driver/level3/._zherk_kernel.c b/driver/level3/._zherk_kernel.c new file mode 100644 index 0000000..6dfb73f Binary files /dev/null and b/driver/level3/._zherk_kernel.c differ diff --git a/driver/level3/._zsyrk_beta.c b/driver/level3/._zsyrk_beta.c new file mode 100644 index 0000000..797e64b Binary files /dev/null and b/driver/level3/._zsyrk_beta.c differ diff --git a/driver/level3/Makefile b/driver/level3/Makefile new file mode 100644 index 0000000..7d7d723 --- /dev/null +++ b/driver/level3/Makefile @@ -0,0 +1,5022 @@ +TOPDIR = ../.. +include ../../Makefile.system + +ifeq ($(ARCH), x86) +USE_GEMM3M = 1 +endif + +ifeq ($(ARCH), x86_64) +USE_GEMM3M = 1 +endif + +ifeq ($(ARCH), ia64) +USE_GEMM3M = 1 +endif + +ifeq ($(ARCH), MIPS) +USE_GEMM3M = 1 +endif + +SBLASOBJS += \ + sgemm_nn.$(SUFFIX) sgemm_nt.$(SUFFIX) sgemm_tn.$(SUFFIX) sgemm_tt.$(SUFFIX) \ + strmm_LNUU.$(SUFFIX) strmm_LNUN.$(SUFFIX) strmm_LNLU.$(SUFFIX) strmm_LNLN.$(SUFFIX) \ + strmm_LTUU.$(SUFFIX) strmm_LTUN.$(SUFFIX) strmm_LTLU.$(SUFFIX) strmm_LTLN.$(SUFFIX) \ + strmm_RNUU.$(SUFFIX) strmm_RNUN.$(SUFFIX) strmm_RNLU.$(SUFFIX) strmm_RNLN.$(SUFFIX) \ + strmm_RTUU.$(SUFFIX) strmm_RTUN.$(SUFFIX) strmm_RTLU.$(SUFFIX) strmm_RTLN.$(SUFFIX) \ + strsm_LNUU.$(SUFFIX) strsm_LNUN.$(SUFFIX) strsm_LNLU.$(SUFFIX) strsm_LNLN.$(SUFFIX) \ + strsm_LTUU.$(SUFFIX) strsm_LTUN.$(SUFFIX) strsm_LTLU.$(SUFFIX) strsm_LTLN.$(SUFFIX) \ + strsm_RNUU.$(SUFFIX) strsm_RNUN.$(SUFFIX) strsm_RNLU.$(SUFFIX) strsm_RNLN.$(SUFFIX) \ + strsm_RTUU.$(SUFFIX) strsm_RTUN.$(SUFFIX) strsm_RTLU.$(SUFFIX) strsm_RTLN.$(SUFFIX) \ + ssymm_LU.$(SUFFIX) ssymm_LL.$(SUFFIX) ssymm_RU.$(SUFFIX) ssymm_RL.$(SUFFIX) \ + ssyrk_UN.$(SUFFIX) ssyrk_UT.$(SUFFIX) ssyrk_LN.$(SUFFIX) ssyrk_LT.$(SUFFIX) \ + ssyr2k_UN.$(SUFFIX) ssyr2k_UT.$(SUFFIX) ssyr2k_LN.$(SUFFIX) ssyr2k_LT.$(SUFFIX) \ + ssyrk_kernel_U.$(SUFFIX) ssyrk_kernel_L.$(SUFFIX) \ + ssyr2k_kernel_U.$(SUFFIX) ssyr2k_kernel_L.$(SUFFIX) + +DBLASOBJS += \ + dgemm_nn.$(SUFFIX) dgemm_nt.$(SUFFIX) dgemm_tn.$(SUFFIX) dgemm_tt.$(SUFFIX) \ + dtrmm_LNUU.$(SUFFIX) dtrmm_LNUN.$(SUFFIX) dtrmm_LNLU.$(SUFFIX) dtrmm_LNLN.$(SUFFIX) \ + dtrmm_LTUU.$(SUFFIX) dtrmm_LTUN.$(SUFFIX) dtrmm_LTLU.$(SUFFIX) dtrmm_LTLN.$(SUFFIX) \ + dtrmm_RNUU.$(SUFFIX) dtrmm_RNUN.$(SUFFIX) dtrmm_RNLU.$(SUFFIX) dtrmm_RNLN.$(SUFFIX) \ + dtrmm_RTUU.$(SUFFIX) dtrmm_RTUN.$(SUFFIX) dtrmm_RTLU.$(SUFFIX) dtrmm_RTLN.$(SUFFIX) \ + dtrsm_LNUU.$(SUFFIX) dtrsm_LNUN.$(SUFFIX) dtrsm_LNLU.$(SUFFIX) dtrsm_LNLN.$(SUFFIX) \ + dtrsm_LTUU.$(SUFFIX) dtrsm_LTUN.$(SUFFIX) dtrsm_LTLU.$(SUFFIX) dtrsm_LTLN.$(SUFFIX) \ + dtrsm_RNUU.$(SUFFIX) dtrsm_RNUN.$(SUFFIX) dtrsm_RNLU.$(SUFFIX) dtrsm_RNLN.$(SUFFIX) \ + dtrsm_RTUU.$(SUFFIX) dtrsm_RTUN.$(SUFFIX) dtrsm_RTLU.$(SUFFIX) dtrsm_RTLN.$(SUFFIX) \ + dsymm_LU.$(SUFFIX) dsymm_LL.$(SUFFIX) dsymm_RU.$(SUFFIX) dsymm_RL.$(SUFFIX) \ + dsyrk_UN.$(SUFFIX) dsyrk_UT.$(SUFFIX) dsyrk_LN.$(SUFFIX) dsyrk_LT.$(SUFFIX) \ + dsyr2k_UN.$(SUFFIX) dsyr2k_UT.$(SUFFIX) dsyr2k_LN.$(SUFFIX) dsyr2k_LT.$(SUFFIX) \ + dsyrk_kernel_U.$(SUFFIX) dsyrk_kernel_L.$(SUFFIX) \ + dsyr2k_kernel_U.$(SUFFIX) dsyr2k_kernel_L.$(SUFFIX) + +QBLASOBJS += \ + qgemm_nn.$(SUFFIX) qgemm_nt.$(SUFFIX) qgemm_tn.$(SUFFIX) qgemm_tt.$(SUFFIX) \ + qtrmm_LNUU.$(SUFFIX) qtrmm_LNUN.$(SUFFIX) qtrmm_LNLU.$(SUFFIX) qtrmm_LNLN.$(SUFFIX) \ + qtrmm_LTUU.$(SUFFIX) qtrmm_LTUN.$(SUFFIX) qtrmm_LTLU.$(SUFFIX) qtrmm_LTLN.$(SUFFIX) \ + qtrmm_RNUU.$(SUFFIX) qtrmm_RNUN.$(SUFFIX) qtrmm_RNLU.$(SUFFIX) qtrmm_RNLN.$(SUFFIX) \ + qtrmm_RTUU.$(SUFFIX) qtrmm_RTUN.$(SUFFIX) qtrmm_RTLU.$(SUFFIX) qtrmm_RTLN.$(SUFFIX) \ + qtrsm_LNUU.$(SUFFIX) qtrsm_LNUN.$(SUFFIX) qtrsm_LNLU.$(SUFFIX) qtrsm_LNLN.$(SUFFIX) \ + qtrsm_LTUU.$(SUFFIX) qtrsm_LTUN.$(SUFFIX) qtrsm_LTLU.$(SUFFIX) qtrsm_LTLN.$(SUFFIX) \ + qtrsm_RNUU.$(SUFFIX) qtrsm_RNUN.$(SUFFIX) qtrsm_RNLU.$(SUFFIX) qtrsm_RNLN.$(SUFFIX) \ + qtrsm_RTUU.$(SUFFIX) qtrsm_RTUN.$(SUFFIX) qtrsm_RTLU.$(SUFFIX) qtrsm_RTLN.$(SUFFIX) \ + qsymm_LU.$(SUFFIX) qsymm_LL.$(SUFFIX) qsymm_RU.$(SUFFIX) qsymm_RL.$(SUFFIX) \ + qsyrk_UN.$(SUFFIX) qsyrk_UT.$(SUFFIX) qsyrk_LN.$(SUFFIX) qsyrk_LT.$(SUFFIX) \ + qsyr2k_UN.$(SUFFIX) qsyr2k_UT.$(SUFFIX) qsyr2k_LN.$(SUFFIX) qsyr2k_LT.$(SUFFIX) \ + qsyrk_kernel_U.$(SUFFIX) qsyrk_kernel_L.$(SUFFIX) \ + qsyr2k_kernel_U.$(SUFFIX) qsyr2k_kernel_L.$(SUFFIX) + +CBLASOBJS += \ + cgemm_nn.$(SUFFIX) cgemm_cn.$(SUFFIX) cgemm_tn.$(SUFFIX) cgemm_nc.$(SUFFIX) \ + cgemm_nt.$(SUFFIX) cgemm_cc.$(SUFFIX) cgemm_ct.$(SUFFIX) cgemm_tc.$(SUFFIX) \ + cgemm_tt.$(SUFFIX) cgemm_nr.$(SUFFIX) cgemm_tr.$(SUFFIX) cgemm_cr.$(SUFFIX) \ + cgemm_rn.$(SUFFIX) cgemm_rt.$(SUFFIX) cgemm_rc.$(SUFFIX) cgemm_rr.$(SUFFIX) \ + ctrmm_LNUU.$(SUFFIX) ctrmm_LNUN.$(SUFFIX) ctrmm_LNLU.$(SUFFIX) ctrmm_LNLN.$(SUFFIX) \ + ctrmm_LTUU.$(SUFFIX) ctrmm_LTUN.$(SUFFIX) ctrmm_LTLU.$(SUFFIX) ctrmm_LTLN.$(SUFFIX) \ + ctrmm_LRUU.$(SUFFIX) ctrmm_LRUN.$(SUFFIX) ctrmm_LRLU.$(SUFFIX) ctrmm_LRLN.$(SUFFIX) \ + ctrmm_LCUU.$(SUFFIX) ctrmm_LCUN.$(SUFFIX) ctrmm_LCLU.$(SUFFIX) ctrmm_LCLN.$(SUFFIX) \ + ctrmm_RNUU.$(SUFFIX) ctrmm_RNUN.$(SUFFIX) ctrmm_RNLU.$(SUFFIX) ctrmm_RNLN.$(SUFFIX) \ + ctrmm_RTUU.$(SUFFIX) ctrmm_RTUN.$(SUFFIX) ctrmm_RTLU.$(SUFFIX) ctrmm_RTLN.$(SUFFIX) \ + ctrmm_RRUU.$(SUFFIX) ctrmm_RRUN.$(SUFFIX) ctrmm_RRLU.$(SUFFIX) ctrmm_RRLN.$(SUFFIX) \ + ctrmm_RCUU.$(SUFFIX) ctrmm_RCUN.$(SUFFIX) ctrmm_RCLU.$(SUFFIX) ctrmm_RCLN.$(SUFFIX) \ + ctrsm_LNUU.$(SUFFIX) ctrsm_LNUN.$(SUFFIX) ctrsm_LNLU.$(SUFFIX) ctrsm_LNLN.$(SUFFIX) \ + ctrsm_LTUU.$(SUFFIX) ctrsm_LTUN.$(SUFFIX) ctrsm_LTLU.$(SUFFIX) ctrsm_LTLN.$(SUFFIX) \ + ctrsm_LRUU.$(SUFFIX) ctrsm_LRUN.$(SUFFIX) ctrsm_LRLU.$(SUFFIX) ctrsm_LRLN.$(SUFFIX) \ + ctrsm_LCUU.$(SUFFIX) ctrsm_LCUN.$(SUFFIX) ctrsm_LCLU.$(SUFFIX) ctrsm_LCLN.$(SUFFIX) \ + ctrsm_RNUU.$(SUFFIX) ctrsm_RNUN.$(SUFFIX) ctrsm_RNLU.$(SUFFIX) ctrsm_RNLN.$(SUFFIX) \ + ctrsm_RTUU.$(SUFFIX) ctrsm_RTUN.$(SUFFIX) ctrsm_RTLU.$(SUFFIX) ctrsm_RTLN.$(SUFFIX) \ + ctrsm_RRUU.$(SUFFIX) ctrsm_RRUN.$(SUFFIX) ctrsm_RRLU.$(SUFFIX) ctrsm_RRLN.$(SUFFIX) \ + ctrsm_RCUU.$(SUFFIX) ctrsm_RCUN.$(SUFFIX) ctrsm_RCLU.$(SUFFIX) ctrsm_RCLN.$(SUFFIX) \ + csymm_LU.$(SUFFIX) csymm_LL.$(SUFFIX) csymm_RU.$(SUFFIX) csymm_RL.$(SUFFIX) \ + chemm_LU.$(SUFFIX) chemm_LL.$(SUFFIX) chemm_RU.$(SUFFIX) chemm_RL.$(SUFFIX) \ + csyrk_UN.$(SUFFIX) csyrk_UT.$(SUFFIX) csyrk_LN.$(SUFFIX) csyrk_LT.$(SUFFIX) \ + cherk_UN.$(SUFFIX) cherk_UC.$(SUFFIX) cherk_LN.$(SUFFIX) cherk_LC.$(SUFFIX) \ + csyr2k_UN.$(SUFFIX) csyr2k_UT.$(SUFFIX) csyr2k_LN.$(SUFFIX) csyr2k_LT.$(SUFFIX) \ + cher2k_UN.$(SUFFIX) cher2k_UC.$(SUFFIX) cher2k_LN.$(SUFFIX) cher2k_LC.$(SUFFIX) \ + csyrk_kernel_U.$(SUFFIX) csyrk_kernel_L.$(SUFFIX) \ + cherk_kernel_UN.$(SUFFIX) cherk_kernel_UC.$(SUFFIX) \ + cherk_kernel_LN.$(SUFFIX) cherk_kernel_LC.$(SUFFIX) \ + csyr2k_kernel_U.$(SUFFIX) csyr2k_kernel_L.$(SUFFIX) \ + cher2k_kernel_UN.$(SUFFIX) cher2k_kernel_UC.$(SUFFIX) \ + cher2k_kernel_LN.$(SUFFIX) cher2k_kernel_LC.$(SUFFIX) + +ZBLASOBJS += \ + zgemm_nn.$(SUFFIX) zgemm_cn.$(SUFFIX) zgemm_tn.$(SUFFIX) zgemm_nc.$(SUFFIX) \ + zgemm_nt.$(SUFFIX) zgemm_cc.$(SUFFIX) zgemm_ct.$(SUFFIX) zgemm_tc.$(SUFFIX) \ + zgemm_tt.$(SUFFIX) zgemm_nr.$(SUFFIX) zgemm_tr.$(SUFFIX) zgemm_cr.$(SUFFIX) \ + zgemm_rn.$(SUFFIX) zgemm_rt.$(SUFFIX) zgemm_rc.$(SUFFIX) zgemm_rr.$(SUFFIX) \ + ztrmm_LNUU.$(SUFFIX) ztrmm_LNUN.$(SUFFIX) ztrmm_LNLU.$(SUFFIX) ztrmm_LNLN.$(SUFFIX) \ + ztrmm_LTUU.$(SUFFIX) ztrmm_LTUN.$(SUFFIX) ztrmm_LTLU.$(SUFFIX) ztrmm_LTLN.$(SUFFIX) \ + ztrmm_LRUU.$(SUFFIX) ztrmm_LRUN.$(SUFFIX) ztrmm_LRLU.$(SUFFIX) ztrmm_LRLN.$(SUFFIX) \ + ztrmm_LCUU.$(SUFFIX) ztrmm_LCUN.$(SUFFIX) ztrmm_LCLU.$(SUFFIX) ztrmm_LCLN.$(SUFFIX) \ + ztrmm_RNUU.$(SUFFIX) ztrmm_RNUN.$(SUFFIX) ztrmm_RNLU.$(SUFFIX) ztrmm_RNLN.$(SUFFIX) \ + ztrmm_RTUU.$(SUFFIX) ztrmm_RTUN.$(SUFFIX) ztrmm_RTLU.$(SUFFIX) ztrmm_RTLN.$(SUFFIX) \ + ztrmm_RRUU.$(SUFFIX) ztrmm_RRUN.$(SUFFIX) ztrmm_RRLU.$(SUFFIX) ztrmm_RRLN.$(SUFFIX) \ + ztrmm_RCUU.$(SUFFIX) ztrmm_RCUN.$(SUFFIX) ztrmm_RCLU.$(SUFFIX) ztrmm_RCLN.$(SUFFIX) \ + ztrsm_LNUU.$(SUFFIX) ztrsm_LNUN.$(SUFFIX) ztrsm_LNLU.$(SUFFIX) ztrsm_LNLN.$(SUFFIX) \ + ztrsm_LTUU.$(SUFFIX) ztrsm_LTUN.$(SUFFIX) ztrsm_LTLU.$(SUFFIX) ztrsm_LTLN.$(SUFFIX) \ + ztrsm_LRUU.$(SUFFIX) ztrsm_LRUN.$(SUFFIX) ztrsm_LRLU.$(SUFFIX) ztrsm_LRLN.$(SUFFIX) \ + ztrsm_LCUU.$(SUFFIX) ztrsm_LCUN.$(SUFFIX) ztrsm_LCLU.$(SUFFIX) ztrsm_LCLN.$(SUFFIX) \ + ztrsm_RNUU.$(SUFFIX) ztrsm_RNUN.$(SUFFIX) ztrsm_RNLU.$(SUFFIX) ztrsm_RNLN.$(SUFFIX) \ + ztrsm_RTUU.$(SUFFIX) ztrsm_RTUN.$(SUFFIX) ztrsm_RTLU.$(SUFFIX) ztrsm_RTLN.$(SUFFIX) \ + ztrsm_RRUU.$(SUFFIX) ztrsm_RRUN.$(SUFFIX) ztrsm_RRLU.$(SUFFIX) ztrsm_RRLN.$(SUFFIX) \ + ztrsm_RCUU.$(SUFFIX) ztrsm_RCUN.$(SUFFIX) ztrsm_RCLU.$(SUFFIX) ztrsm_RCLN.$(SUFFIX) \ + zsymm_LU.$(SUFFIX) zsymm_LL.$(SUFFIX) zsymm_RU.$(SUFFIX) zsymm_RL.$(SUFFIX) \ + zhemm_LU.$(SUFFIX) zhemm_LL.$(SUFFIX) zhemm_RU.$(SUFFIX) zhemm_RL.$(SUFFIX) \ + zsyrk_UN.$(SUFFIX) zsyrk_UT.$(SUFFIX) zsyrk_LN.$(SUFFIX) zsyrk_LT.$(SUFFIX) \ + zherk_UN.$(SUFFIX) zherk_UC.$(SUFFIX) zherk_LN.$(SUFFIX) zherk_LC.$(SUFFIX) \ + zsyr2k_UN.$(SUFFIX) zsyr2k_UT.$(SUFFIX) zsyr2k_LN.$(SUFFIX) zsyr2k_LT.$(SUFFIX) \ + zher2k_UN.$(SUFFIX) zher2k_UC.$(SUFFIX) zher2k_LN.$(SUFFIX) zher2k_LC.$(SUFFIX) \ + zsyrk_kernel_U.$(SUFFIX) zsyrk_kernel_L.$(SUFFIX) \ + zherk_kernel_UN.$(SUFFIX) zherk_kernel_UC.$(SUFFIX) \ + zherk_kernel_LN.$(SUFFIX) zherk_kernel_LC.$(SUFFIX) \ + zsyr2k_kernel_U.$(SUFFIX) zsyr2k_kernel_L.$(SUFFIX) \ + zher2k_kernel_UN.$(SUFFIX) zher2k_kernel_UC.$(SUFFIX) \ + zher2k_kernel_LN.$(SUFFIX) zher2k_kernel_LC.$(SUFFIX) + + +XBLASOBJS += \ + xgemm_nn.$(SUFFIX) xgemm_cn.$(SUFFIX) xgemm_tn.$(SUFFIX) xgemm_nc.$(SUFFIX) \ + xgemm_nt.$(SUFFIX) xgemm_cc.$(SUFFIX) xgemm_ct.$(SUFFIX) xgemm_tc.$(SUFFIX) \ + xgemm_tt.$(SUFFIX) xgemm_nr.$(SUFFIX) xgemm_tr.$(SUFFIX) xgemm_cr.$(SUFFIX) \ + xgemm_rn.$(SUFFIX) xgemm_rt.$(SUFFIX) xgemm_rc.$(SUFFIX) xgemm_rr.$(SUFFIX) \ + xtrmm_LNUU.$(SUFFIX) xtrmm_LNUN.$(SUFFIX) xtrmm_LNLU.$(SUFFIX) xtrmm_LNLN.$(SUFFIX) \ + xtrmm_LTUU.$(SUFFIX) xtrmm_LTUN.$(SUFFIX) xtrmm_LTLU.$(SUFFIX) xtrmm_LTLN.$(SUFFIX) \ + xtrmm_LRUU.$(SUFFIX) xtrmm_LRUN.$(SUFFIX) xtrmm_LRLU.$(SUFFIX) xtrmm_LRLN.$(SUFFIX) \ + xtrmm_LCUU.$(SUFFIX) xtrmm_LCUN.$(SUFFIX) xtrmm_LCLU.$(SUFFIX) xtrmm_LCLN.$(SUFFIX) \ + xtrmm_RNUU.$(SUFFIX) xtrmm_RNUN.$(SUFFIX) xtrmm_RNLU.$(SUFFIX) xtrmm_RNLN.$(SUFFIX) \ + xtrmm_RTUU.$(SUFFIX) xtrmm_RTUN.$(SUFFIX) xtrmm_RTLU.$(SUFFIX) xtrmm_RTLN.$(SUFFIX) \ + xtrmm_RRUU.$(SUFFIX) xtrmm_RRUN.$(SUFFIX) xtrmm_RRLU.$(SUFFIX) xtrmm_RRLN.$(SUFFIX) \ + xtrmm_RCUU.$(SUFFIX) xtrmm_RCUN.$(SUFFIX) xtrmm_RCLU.$(SUFFIX) xtrmm_RCLN.$(SUFFIX) \ + xtrsm_LNUU.$(SUFFIX) xtrsm_LNUN.$(SUFFIX) xtrsm_LNLU.$(SUFFIX) xtrsm_LNLN.$(SUFFIX) \ + xtrsm_LTUU.$(SUFFIX) xtrsm_LTUN.$(SUFFIX) xtrsm_LTLU.$(SUFFIX) xtrsm_LTLN.$(SUFFIX) \ + xtrsm_LRUU.$(SUFFIX) xtrsm_LRUN.$(SUFFIX) xtrsm_LRLU.$(SUFFIX) xtrsm_LRLN.$(SUFFIX) \ + xtrsm_LCUU.$(SUFFIX) xtrsm_LCUN.$(SUFFIX) xtrsm_LCLU.$(SUFFIX) xtrsm_LCLN.$(SUFFIX) \ + xtrsm_RNUU.$(SUFFIX) xtrsm_RNUN.$(SUFFIX) xtrsm_RNLU.$(SUFFIX) xtrsm_RNLN.$(SUFFIX) \ + xtrsm_RTUU.$(SUFFIX) xtrsm_RTUN.$(SUFFIX) xtrsm_RTLU.$(SUFFIX) xtrsm_RTLN.$(SUFFIX) \ + xtrsm_RRUU.$(SUFFIX) xtrsm_RRUN.$(SUFFIX) xtrsm_RRLU.$(SUFFIX) xtrsm_RRLN.$(SUFFIX) \ + xtrsm_RCUU.$(SUFFIX) xtrsm_RCUN.$(SUFFIX) xtrsm_RCLU.$(SUFFIX) xtrsm_RCLN.$(SUFFIX) \ + xsymm_LU.$(SUFFIX) xsymm_LL.$(SUFFIX) xsymm_RU.$(SUFFIX) xsymm_RL.$(SUFFIX) \ + xhemm_LU.$(SUFFIX) xhemm_LL.$(SUFFIX) xhemm_RU.$(SUFFIX) xhemm_RL.$(SUFFIX) \ + xsyrk_UN.$(SUFFIX) xsyrk_UT.$(SUFFIX) xsyrk_LN.$(SUFFIX) xsyrk_LT.$(SUFFIX) \ + xherk_UN.$(SUFFIX) xherk_UC.$(SUFFIX) xherk_LN.$(SUFFIX) xherk_LC.$(SUFFIX) \ + xsyr2k_UN.$(SUFFIX) xsyr2k_UT.$(SUFFIX) xsyr2k_LN.$(SUFFIX) xsyr2k_LT.$(SUFFIX) \ + xher2k_UN.$(SUFFIX) xher2k_UC.$(SUFFIX) xher2k_LN.$(SUFFIX) xher2k_LC.$(SUFFIX) \ + xsyrk_kernel_U.$(SUFFIX) xsyrk_kernel_L.$(SUFFIX) \ + xherk_kernel_UN.$(SUFFIX) xherk_kernel_UC.$(SUFFIX) \ + xherk_kernel_LN.$(SUFFIX) xherk_kernel_LC.$(SUFFIX) \ + xsyr2k_kernel_U.$(SUFFIX) xsyr2k_kernel_L.$(SUFFIX) \ + xher2k_kernel_UN.$(SUFFIX) xher2k_kernel_UC.$(SUFFIX) \ + xher2k_kernel_LN.$(SUFFIX) xher2k_kernel_LC.$(SUFFIX) + +ifdef USE_GEMM3M + +CBLASOBJS += \ + cgemm3m_nn.$(SUFFIX) cgemm3m_cn.$(SUFFIX) cgemm3m_tn.$(SUFFIX) cgemm3m_nc.$(SUFFIX) \ + cgemm3m_nt.$(SUFFIX) cgemm3m_cc.$(SUFFIX) cgemm3m_ct.$(SUFFIX) cgemm3m_tc.$(SUFFIX) \ + cgemm3m_tt.$(SUFFIX) cgemm3m_nr.$(SUFFIX) cgemm3m_tr.$(SUFFIX) cgemm3m_cr.$(SUFFIX) \ + cgemm3m_rn.$(SUFFIX) cgemm3m_rt.$(SUFFIX) cgemm3m_rc.$(SUFFIX) cgemm3m_rr.$(SUFFIX) \ + csymm3m_LU.$(SUFFIX) csymm3m_LL.$(SUFFIX) csymm3m_RU.$(SUFFIX) csymm3m_RL.$(SUFFIX) \ + chemm3m_LU.$(SUFFIX) chemm3m_LL.$(SUFFIX) chemm3m_RU.$(SUFFIX) chemm3m_RL.$(SUFFIX) + +ZBLASOBJS += \ + zgemm3m_nn.$(SUFFIX) zgemm3m_cn.$(SUFFIX) zgemm3m_tn.$(SUFFIX) zgemm3m_nc.$(SUFFIX) \ + zgemm3m_nt.$(SUFFIX) zgemm3m_cc.$(SUFFIX) zgemm3m_ct.$(SUFFIX) zgemm3m_tc.$(SUFFIX) \ + zgemm3m_tt.$(SUFFIX) zgemm3m_nr.$(SUFFIX) zgemm3m_tr.$(SUFFIX) zgemm3m_cr.$(SUFFIX) \ + zgemm3m_rn.$(SUFFIX) zgemm3m_rt.$(SUFFIX) zgemm3m_rc.$(SUFFIX) zgemm3m_rr.$(SUFFIX) \ + zsymm3m_LU.$(SUFFIX) zsymm3m_LL.$(SUFFIX) zsymm3m_RU.$(SUFFIX) zsymm3m_RL.$(SUFFIX) \ + zhemm3m_LU.$(SUFFIX) zhemm3m_LL.$(SUFFIX) zhemm3m_RU.$(SUFFIX) zhemm3m_RL.$(SUFFIX) + +XBLASOBJS += \ + xgemm3m_nn.$(SUFFIX) xgemm3m_cn.$(SUFFIX) xgemm3m_tn.$(SUFFIX) xgemm3m_nc.$(SUFFIX) \ + xgemm3m_nt.$(SUFFIX) xgemm3m_cc.$(SUFFIX) xgemm3m_ct.$(SUFFIX) xgemm3m_tc.$(SUFFIX) \ + xgemm3m_tt.$(SUFFIX) xgemm3m_nr.$(SUFFIX) xgemm3m_tr.$(SUFFIX) xgemm3m_cr.$(SUFFIX) \ + xgemm3m_rn.$(SUFFIX) xgemm3m_rt.$(SUFFIX) xgemm3m_rc.$(SUFFIX) xgemm3m_rr.$(SUFFIX) \ + xsymm3m_LU.$(SUFFIX) xsymm3m_LL.$(SUFFIX) xsymm3m_RU.$(SUFFIX) xsymm3m_RL.$(SUFFIX) \ + xhemm3m_LU.$(SUFFIX) xhemm3m_LL.$(SUFFIX) xhemm3m_RU.$(SUFFIX) xhemm3m_RL.$(SUFFIX) + +endif + +ifdef SMP +COMMONOBJS += gemm_thread_m.$(SUFFIX) gemm_thread_n.$(SUFFIX) gemm_thread_mn.$(SUFFIX) gemm_thread_variable.$(SUFFIX) +COMMONOBJS += syrk_thread.$(SUFFIX) + +ifndef USE_SIMPLE_THREADED_LEVEL3 + +SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX) +DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) +QBLASOBJS += qgemm_thread_nn.$(SUFFIX) qgemm_thread_nt.$(SUFFIX) qgemm_thread_tn.$(SUFFIX) qgemm_thread_tt.$(SUFFIX) +CBLASOBJS += cgemm_thread_nn.$(SUFFIX) cgemm_thread_nt.$(SUFFIX) cgemm_thread_nr.$(SUFFIX) cgemm_thread_nc.$(SUFFIX) +CBLASOBJS += cgemm_thread_tn.$(SUFFIX) cgemm_thread_tt.$(SUFFIX) cgemm_thread_tr.$(SUFFIX) cgemm_thread_tc.$(SUFFIX) +CBLASOBJS += cgemm_thread_rn.$(SUFFIX) cgemm_thread_rt.$(SUFFIX) cgemm_thread_rr.$(SUFFIX) cgemm_thread_rc.$(SUFFIX) +CBLASOBJS += cgemm_thread_cn.$(SUFFIX) cgemm_thread_ct.$(SUFFIX) cgemm_thread_cr.$(SUFFIX) cgemm_thread_cc.$(SUFFIX) +ZBLASOBJS += zgemm_thread_nn.$(SUFFIX) zgemm_thread_nt.$(SUFFIX) zgemm_thread_nr.$(SUFFIX) zgemm_thread_nc.$(SUFFIX) +ZBLASOBJS += zgemm_thread_tn.$(SUFFIX) zgemm_thread_tt.$(SUFFIX) zgemm_thread_tr.$(SUFFIX) zgemm_thread_tc.$(SUFFIX) +ZBLASOBJS += zgemm_thread_rn.$(SUFFIX) zgemm_thread_rt.$(SUFFIX) zgemm_thread_rr.$(SUFFIX) zgemm_thread_rc.$(SUFFIX) +ZBLASOBJS += zgemm_thread_cn.$(SUFFIX) zgemm_thread_ct.$(SUFFIX) zgemm_thread_cr.$(SUFFIX) zgemm_thread_cc.$(SUFFIX) +XBLASOBJS += xgemm_thread_nn.$(SUFFIX) xgemm_thread_nt.$(SUFFIX) xgemm_thread_nr.$(SUFFIX) xgemm_thread_nc.$(SUFFIX) +XBLASOBJS += xgemm_thread_tn.$(SUFFIX) xgemm_thread_tt.$(SUFFIX) xgemm_thread_tr.$(SUFFIX) xgemm_thread_tc.$(SUFFIX) +XBLASOBJS += xgemm_thread_rn.$(SUFFIX) xgemm_thread_rt.$(SUFFIX) xgemm_thread_rr.$(SUFFIX) xgemm_thread_rc.$(SUFFIX) +XBLASOBJS += xgemm_thread_cn.$(SUFFIX) xgemm_thread_ct.$(SUFFIX) xgemm_thread_cr.$(SUFFIX) xgemm_thread_cc.$(SUFFIX) + +SBLASOBJS += ssymm_thread_LU.$(SUFFIX) ssymm_thread_LL.$(SUFFIX) ssymm_thread_RU.$(SUFFIX) ssymm_thread_RL.$(SUFFIX) +DBLASOBJS += dsymm_thread_LU.$(SUFFIX) dsymm_thread_LL.$(SUFFIX) dsymm_thread_RU.$(SUFFIX) dsymm_thread_RL.$(SUFFIX) +QBLASOBJS += qsymm_thread_LU.$(SUFFIX) qsymm_thread_LL.$(SUFFIX) qsymm_thread_RU.$(SUFFIX) qsymm_thread_RL.$(SUFFIX) +CBLASOBJS += csymm_thread_LU.$(SUFFIX) csymm_thread_LL.$(SUFFIX) csymm_thread_RU.$(SUFFIX) csymm_thread_RL.$(SUFFIX) +ZBLASOBJS += zsymm_thread_LU.$(SUFFIX) zsymm_thread_LL.$(SUFFIX) zsymm_thread_RU.$(SUFFIX) zsymm_thread_RL.$(SUFFIX) +XBLASOBJS += xsymm_thread_LU.$(SUFFIX) xsymm_thread_LL.$(SUFFIX) xsymm_thread_RU.$(SUFFIX) xsymm_thread_RL.$(SUFFIX) + +CBLASOBJS += chemm_thread_LU.$(SUFFIX) chemm_thread_LL.$(SUFFIX) chemm_thread_RU.$(SUFFIX) chemm_thread_RL.$(SUFFIX) +ZBLASOBJS += zhemm_thread_LU.$(SUFFIX) zhemm_thread_LL.$(SUFFIX) zhemm_thread_RU.$(SUFFIX) zhemm_thread_RL.$(SUFFIX) +XBLASOBJS += xhemm_thread_LU.$(SUFFIX) xhemm_thread_LL.$(SUFFIX) xhemm_thread_RU.$(SUFFIX) xhemm_thread_RL.$(SUFFIX) + +SBLASOBJS += ssyrk_thread_UN.$(SUFFIX) ssyrk_thread_UT.$(SUFFIX) ssyrk_thread_LN.$(SUFFIX) ssyrk_thread_LT.$(SUFFIX) +DBLASOBJS += dsyrk_thread_UN.$(SUFFIX) dsyrk_thread_UT.$(SUFFIX) dsyrk_thread_LN.$(SUFFIX) dsyrk_thread_LT.$(SUFFIX) +QBLASOBJS += qsyrk_thread_UN.$(SUFFIX) qsyrk_thread_UT.$(SUFFIX) qsyrk_thread_LN.$(SUFFIX) qsyrk_thread_LT.$(SUFFIX) +CBLASOBJS += csyrk_thread_UN.$(SUFFIX) csyrk_thread_UT.$(SUFFIX) csyrk_thread_LN.$(SUFFIX) csyrk_thread_LT.$(SUFFIX) +ZBLASOBJS += zsyrk_thread_UN.$(SUFFIX) zsyrk_thread_UT.$(SUFFIX) zsyrk_thread_LN.$(SUFFIX) zsyrk_thread_LT.$(SUFFIX) +XBLASOBJS += xsyrk_thread_UN.$(SUFFIX) xsyrk_thread_UT.$(SUFFIX) xsyrk_thread_LN.$(SUFFIX) xsyrk_thread_LT.$(SUFFIX) +CBLASOBJS += cherk_thread_UN.$(SUFFIX) cherk_thread_UC.$(SUFFIX) cherk_thread_LN.$(SUFFIX) cherk_thread_LC.$(SUFFIX) +ZBLASOBJS += zherk_thread_UN.$(SUFFIX) zherk_thread_UC.$(SUFFIX) zherk_thread_LN.$(SUFFIX) zherk_thread_LC.$(SUFFIX) +XBLASOBJS += xherk_thread_UN.$(SUFFIX) xherk_thread_UC.$(SUFFIX) xherk_thread_LN.$(SUFFIX) xherk_thread_LC.$(SUFFIX) + +ifdef USE_GEMM3M + +CBLASOBJS += cgemm3m_thread_nn.$(SUFFIX) cgemm3m_thread_nt.$(SUFFIX) cgemm3m_thread_nr.$(SUFFIX) cgemm3m_thread_nc.$(SUFFIX) +CBLASOBJS += cgemm3m_thread_tn.$(SUFFIX) cgemm3m_thread_tt.$(SUFFIX) cgemm3m_thread_tr.$(SUFFIX) cgemm3m_thread_tc.$(SUFFIX) +CBLASOBJS += cgemm3m_thread_rn.$(SUFFIX) cgemm3m_thread_rt.$(SUFFIX) cgemm3m_thread_rr.$(SUFFIX) cgemm3m_thread_rc.$(SUFFIX) +CBLASOBJS += cgemm3m_thread_cn.$(SUFFIX) cgemm3m_thread_ct.$(SUFFIX) cgemm3m_thread_cr.$(SUFFIX) cgemm3m_thread_cc.$(SUFFIX) +ZBLASOBJS += zgemm3m_thread_nn.$(SUFFIX) zgemm3m_thread_nt.$(SUFFIX) zgemm3m_thread_nr.$(SUFFIX) zgemm3m_thread_nc.$(SUFFIX) +ZBLASOBJS += zgemm3m_thread_tn.$(SUFFIX) zgemm3m_thread_tt.$(SUFFIX) zgemm3m_thread_tr.$(SUFFIX) zgemm3m_thread_tc.$(SUFFIX) +ZBLASOBJS += zgemm3m_thread_rn.$(SUFFIX) zgemm3m_thread_rt.$(SUFFIX) zgemm3m_thread_rr.$(SUFFIX) zgemm3m_thread_rc.$(SUFFIX) +ZBLASOBJS += zgemm3m_thread_cn.$(SUFFIX) zgemm3m_thread_ct.$(SUFFIX) zgemm3m_thread_cr.$(SUFFIX) zgemm3m_thread_cc.$(SUFFIX) +XBLASOBJS += xgemm3m_thread_nn.$(SUFFIX) xgemm3m_thread_nt.$(SUFFIX) xgemm3m_thread_nr.$(SUFFIX) xgemm3m_thread_nc.$(SUFFIX) +XBLASOBJS += xgemm3m_thread_tn.$(SUFFIX) xgemm3m_thread_tt.$(SUFFIX) xgemm3m_thread_tr.$(SUFFIX) xgemm3m_thread_tc.$(SUFFIX) +XBLASOBJS += xgemm3m_thread_rn.$(SUFFIX) xgemm3m_thread_rt.$(SUFFIX) xgemm3m_thread_rr.$(SUFFIX) xgemm3m_thread_rc.$(SUFFIX) +XBLASOBJS += xgemm3m_thread_cn.$(SUFFIX) xgemm3m_thread_ct.$(SUFFIX) xgemm3m_thread_cr.$(SUFFIX) xgemm3m_thread_cc.$(SUFFIX) + +CBLASOBJS += csymm3m_thread_LU.$(SUFFIX) csymm3m_thread_LL.$(SUFFIX) csymm3m_thread_RU.$(SUFFIX) csymm3m_thread_RL.$(SUFFIX) +ZBLASOBJS += zsymm3m_thread_LU.$(SUFFIX) zsymm3m_thread_LL.$(SUFFIX) zsymm3m_thread_RU.$(SUFFIX) zsymm3m_thread_RL.$(SUFFIX) +XBLASOBJS += xsymm3m_thread_LU.$(SUFFIX) xsymm3m_thread_LL.$(SUFFIX) xsymm3m_thread_RU.$(SUFFIX) xsymm3m_thread_RL.$(SUFFIX) + +CBLASOBJS += chemm3m_thread_LU.$(SUFFIX) chemm3m_thread_LL.$(SUFFIX) chemm3m_thread_RU.$(SUFFIX) chemm3m_thread_RL.$(SUFFIX) +ZBLASOBJS += zhemm3m_thread_LU.$(SUFFIX) zhemm3m_thread_LL.$(SUFFIX) zhemm3m_thread_RU.$(SUFFIX) zhemm3m_thread_RL.$(SUFFIX) +XBLASOBJS += xhemm3m_thread_LU.$(SUFFIX) xhemm3m_thread_LL.$(SUFFIX) xhemm3m_thread_RU.$(SUFFIX) xhemm3m_thread_RL.$(SUFFIX) + +endif + +endif +endif + +HPLOBJS = \ + dgemm_nn.$(SUFFIX) dgemm_nt.$(SUFFIX) dgemm_tn.$(SUFFIX) dgemm_tt.$(SUFFIX) \ + dtrsm_LNUU.$(SUFFIX) dtrsm_LNUN.$(SUFFIX) dtrsm_LNLU.$(SUFFIX) dtrsm_LNLN.$(SUFFIX) \ + dtrsm_LTUU.$(SUFFIX) dtrsm_LTUN.$(SUFFIX) dtrsm_LTLU.$(SUFFIX) dtrsm_LTLN.$(SUFFIX) \ + dtrsm_RNUU.$(SUFFIX) dtrsm_RNUN.$(SUFFIX) dtrsm_RNLU.$(SUFFIX) dtrsm_RNLN.$(SUFFIX) \ + dtrsm_RTUU.$(SUFFIX) dtrsm_RTUN.$(SUFFIX) dtrsm_RTLU.$(SUFFIX) dtrsm_RTLN.$(SUFFIX) + +ifndef USE_SIMPLE_THREADED_LEVEL3 +HPLOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) \ + dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) +endif + +all :: + +sgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +sgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +sgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +sgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) + +dgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +dgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +dgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +dgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -UCOMPLEX -DTT $< -o $(@F) + +qgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +qgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +qgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +qgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -UCOMPLEX -DTT $< -o $(@F) + +cgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +cgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +cgemm_nr.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +cgemm_nc.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +cgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +cgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +cgemm_tr.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +cgemm_tc.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +cgemm_rn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +cgemm_rt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +cgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +cgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +cgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +cgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +cgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +cgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +zgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +zgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +zgemm_nr.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +zgemm_nc.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +zgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +zgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +zgemm_tr.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +zgemm_tc.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +zgemm_rn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +zgemm_rt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +zgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +zgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +zgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +zgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +zgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +zgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +xgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +xgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +xgemm_nr.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +xgemm_nc.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +xgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +xgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +xgemm_tr.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +xgemm_tc.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +xgemm_rn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +xgemm_rt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +xgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +xgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +xgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +xgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +xgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +xgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +gemm_thread_m.$(SUFFIX) : gemm_thread_m.c ../../common.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +gemm_thread_n.$(SUFFIX) : gemm_thread_n.c ../../common.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +gemm_thread_mn.$(SUFFIX) : gemm_thread_mn.c ../../common.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +gemm_thread_variable.$(SUFFIX) : gemm_thread_variable.c ../../common.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +beta_thread.$(SUFFIX) : beta_thread.c ../../common.h + $(CC) -c $(CFLAGS) $< -o $(@F) + + +sgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +sgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +sgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +sgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) + +dgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +dgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +dgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +dgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -UCOMPLEX -DTT $< -o $(@F) + +qgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +qgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +qgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +qgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -UCOMPLEX -DTT $< -o $(@F) + +cgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +cgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +cgemm_thread_nr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +cgemm_thread_nc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +cgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +cgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +cgemm_thread_tr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +cgemm_thread_tc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +cgemm_thread_rn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +cgemm_thread_rt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +cgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +cgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +cgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +cgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +cgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +cgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +zgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +zgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +zgemm_thread_nr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +zgemm_thread_nc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +zgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +zgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +zgemm_thread_tr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +zgemm_thread_tc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +zgemm_thread_rn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +zgemm_thread_rt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +zgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +zgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +zgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +zgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +zgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +zgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +xgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +xgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +xgemm_thread_nr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +xgemm_thread_nc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +xgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +xgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +xgemm_thread_tr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +xgemm_thread_tc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +xgemm_thread_rn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +xgemm_thread_rt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +xgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +xgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +xgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +xgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +xgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +xgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +strmm_LNUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +strmm_LNUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +strmm_LNLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +strmm_LNLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +strmm_LTUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +strmm_LTUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +strmm_LTLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +strmm_LTLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +strmm_RNUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +strmm_RNUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +strmm_RNLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +strmm_RNLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +strmm_RTUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +strmm_RTUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +strmm_RTLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +strmm_RTLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrmm_LNUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrmm_LNUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrmm_LNLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrmm_LNLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrmm_LTUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrmm_LTUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrmm_LTLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrmm_LTLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrmm_RNUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrmm_RNUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrmm_RNLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrmm_RNLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrmm_RTUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrmm_RTUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrmm_RTLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrmm_RTLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrmm_LNUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrmm_LNUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrmm_LNLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrmm_LNLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrmm_LTUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrmm_LTUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrmm_LTLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrmm_LTLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrmm_RNUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrmm_RNUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrmm_RNLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrmm_RNLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrmm_RTUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrmm_RTUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrmm_RTLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrmm_RTLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +ctrmm_LNUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_LNUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_LNLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_LNLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_LTUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_LTUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_LTLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_LTLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_LRUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_LRUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrmm_LRLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_LRLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrmm_LCUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_LCUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrmm_LCLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_LCLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrmm_RNUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_RNUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_RNLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_RNLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_RTUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_RTUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_RTLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_RTLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_RRUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_RRUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrmm_RRLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_RRLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrmm_RCUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_RCUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrmm_RCLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_RCLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_LNUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_LNUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_LNLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_LNLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_LTUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_LTUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_LTLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_LTLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_LRUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_LRUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_LRLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_LRLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_LCUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_LCUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_LCLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_LCLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_RNUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_RNUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_RNLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_RNLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_RTUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_RTUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_RTLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_RTLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_RRUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_RRUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_RRLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_RRLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_RCUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_RCUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_RCLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_RCLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_LNUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_LNUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_LNLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_LNLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_LTUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_LTUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_LTLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_LTLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_LRUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_LRUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_LRLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_LRLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_LCUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_LCUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_LCLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_LCLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_RNUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_RNUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_RNLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_RNLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_RTUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_RTUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_RTLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_RTLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_RRUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_RRUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_RRLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_RRLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_RCUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_RCUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_RCLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_RCLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ssymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +ssymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +ssymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +ssymm_RL.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +dsymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +dsymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +dsymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +dsymm_RL.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +qsymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +qsymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +qsymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +qsymm_RL.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +csymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +csymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +csymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +csymm_RL.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +zsymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zsymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zsymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +zsymm_RL.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +xsymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xsymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xsymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +xsymm_RL.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +ssymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +ssymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +ssymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +ssymm_thread_RL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +dsymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +dsymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +dsymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +dsymm_thread_RL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +qsymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +qsymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +qsymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +qsymm_thread_RL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +csymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +csymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +csymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +csymm_thread_RL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +zsymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zsymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zsymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +zsymm_thread_RL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +xsymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xsymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xsymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +xsymm_thread_RL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +ssyrk_UN.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +ssyrk_UT.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +ssyrk_LN.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +ssyrk_LT.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +dsyrk_UN.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +dsyrk_UT.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +dsyrk_LN.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +dsyrk_LT.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +qsyrk_UN.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +qsyrk_UT.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +qsyrk_LN.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +qsyrk_LT.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +csyrk_UN.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +csyrk_UT.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +csyrk_LN.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +csyrk_LT.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +zsyrk_UN.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +zsyrk_UT.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +zsyrk_LN.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +zsyrk_LT.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +xsyrk_UN.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +xsyrk_UT.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +xsyrk_LN.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +xsyrk_LT.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +ssyrk_thread_UN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +ssyrk_thread_UT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +ssyrk_thread_LN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +ssyrk_thread_LT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +dsyrk_thread_UN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +dsyrk_thread_UT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +dsyrk_thread_LN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +dsyrk_thread_LT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +qsyrk_thread_UN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +qsyrk_thread_UT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +qsyrk_thread_LN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +qsyrk_thread_LT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +csyrk_thread_UN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +csyrk_thread_UT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +csyrk_thread_LN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +csyrk_thread_LT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +zsyrk_thread_UN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +zsyrk_thread_UT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +zsyrk_thread_LN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +zsyrk_thread_LT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +xsyrk_thread_UN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +xsyrk_thread_UT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +xsyrk_thread_LN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +xsyrk_thread_LT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +ssyrk_kernel_U.$(SUFFIX) : syrk_kernel.c + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) + +ssyrk_kernel_L.$(SUFFIX) : syrk_kernel.c + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) + +dsyrk_kernel_U.$(SUFFIX) : syrk_kernel.c + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) + +dsyrk_kernel_L.$(SUFFIX) : syrk_kernel.c + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) + +qsyrk_kernel_U.$(SUFFIX) : syrk_kernel.c + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) + +qsyrk_kernel_L.$(SUFFIX) : syrk_kernel.c + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) + +csyrk_kernel_U.$(SUFFIX) : syrk_kernel.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +csyrk_kernel_L.$(SUFFIX) : syrk_kernel.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +zsyrk_kernel_U.$(SUFFIX) : syrk_kernel.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +zsyrk_kernel_L.$(SUFFIX) : syrk_kernel.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +xsyrk_kernel_U.$(SUFFIX) : syrk_kernel.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +xsyrk_kernel_L.$(SUFFIX) : syrk_kernel.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +syrk_thread.$(SUFFIX) : syrk_thread.c ../../common.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +ssyr2k_UN.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +ssyr2k_UT.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +ssyr2k_LN.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +ssyr2k_LT.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +dsyr2k_UN.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +dsyr2k_UT.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +dsyr2k_LN.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +dsyr2k_LT.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +qsyr2k_UN.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +qsyr2k_UT.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +qsyr2k_LN.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +qsyr2k_LT.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +csyr2k_UN.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +csyr2k_UT.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +csyr2k_LN.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +csyr2k_LT.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +zsyr2k_UN.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +zsyr2k_UT.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +zsyr2k_LN.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +zsyr2k_LT.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +xsyr2k_UN.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +xsyr2k_UT.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +xsyr2k_LN.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +xsyr2k_LT.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +ssyr2k_kernel_U.$(SUFFIX) : syr2k_kernel.c + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) + +ssyr2k_kernel_L.$(SUFFIX) : syr2k_kernel.c + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) + +dsyr2k_kernel_U.$(SUFFIX) : syr2k_kernel.c + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) + +dsyr2k_kernel_L.$(SUFFIX) : syr2k_kernel.c + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) + +qsyr2k_kernel_U.$(SUFFIX) : syr2k_kernel.c + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) + +qsyr2k_kernel_L.$(SUFFIX) : syr2k_kernel.c + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) + +csyr2k_kernel_U.$(SUFFIX) : syr2k_kernel.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +csyr2k_kernel_L.$(SUFFIX) : syr2k_kernel.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +zsyr2k_kernel_U.$(SUFFIX) : syr2k_kernel.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +zsyr2k_kernel_L.$(SUFFIX) : syr2k_kernel.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +xsyr2k_kernel_U.$(SUFFIX) : syr2k_kernel.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +xsyr2k_kernel_L.$(SUFFIX) : syr2k_kernel.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +chemm_LU.$(SUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +chemm_LL.$(SUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +chemm_RU.$(SUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) + +chemm_RL.$(SUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) + +zhemm_LU.$(SUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zhemm_LL.$(SUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zhemm_RU.$(SUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) + +zhemm_RL.$(SUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) + +xhemm_LU.$(SUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xhemm_LL.$(SUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xhemm_RU.$(SUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) + +xhemm_RL.$(SUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) + +chemm_thread_LU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +chemm_thread_LL.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +chemm_thread_RU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) + +chemm_thread_RL.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) + +zhemm_thread_LU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zhemm_thread_LL.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zhemm_thread_RU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) + +zhemm_thread_RL.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) + +xhemm_thread_LU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xhemm_thread_LL.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xhemm_thread_RU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) + +xhemm_thread_RL.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) + +cherk_UN.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(CFLAGS) -DHERK -UDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +cherk_UC.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(CFLAGS) -DHERK -UDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +cherk_LN.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(CFLAGS) -DHERK -UDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +cherk_LC.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(CFLAGS) -DHERK -UDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +zherk_UN.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(CFLAGS) -DHERK -DDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +zherk_UC.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(CFLAGS) -DHERK -DDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +zherk_LN.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(CFLAGS) -DHERK -DDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +zherk_LC.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(CFLAGS) -DHERK -DDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +xherk_UN.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(CFLAGS) -DHERK -DXDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +xherk_UC.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(CFLAGS) -DHERK -DXDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +xherk_LN.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(CFLAGS) -DHERK -DXDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +xherk_LC.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(CFLAGS) -DHERK -DXDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +cherk_kernel_UN.$(SUFFIX) : zherk_kernel.c + $(CC) -c $(CFLAGS) -DHERK -UDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) + +cherk_kernel_UC.$(SUFFIX) : zherk_kernel.c + $(CC) -c $(CFLAGS) -DHERK -UDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) + +cherk_kernel_LN.$(SUFFIX) : zherk_kernel.c + $(CC) -c $(CFLAGS) -DHERK -UDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) + +cherk_kernel_LC.$(SUFFIX) : zherk_kernel.c + $(CC) -c $(CFLAGS) -DHERK -UDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) + +zherk_kernel_UN.$(SUFFIX) : zherk_kernel.c + $(CC) -c $(CFLAGS) -DHERK -DDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) + +zherk_kernel_UC.$(SUFFIX) : zherk_kernel.c + $(CC) -c $(CFLAGS) -DHERK -DDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) + +zherk_kernel_LN.$(SUFFIX) : zherk_kernel.c + $(CC) -c $(CFLAGS) -DHERK -DDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) + +zherk_kernel_LC.$(SUFFIX) : zherk_kernel.c + $(CC) -c $(CFLAGS) -DHERK -DDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) + +xherk_kernel_UN.$(SUFFIX) : zherk_kernel.c + $(CC) -c $(CFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) + +xherk_kernel_UC.$(SUFFIX) : zherk_kernel.c + $(CC) -c $(CFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) + +xherk_kernel_LN.$(SUFFIX) : zherk_kernel.c + $(CC) -c $(CFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) + +xherk_kernel_LC.$(SUFFIX) : zherk_kernel.c + $(CC) -c $(CFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) + +cherk_thread_UN.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -UDOUBLE -DCOMPLEX -ULOWER -UTRANS -UCONJ $< -o $(@F) + +cherk_thread_UC.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -UDOUBLE -DCOMPLEX -ULOWER -DTRANS -DCONJ $< -o $(@F) + +cherk_thread_LN.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -UDOUBLE -DCOMPLEX -DLOWER -UTRANS -UCONJ $< -o $(@F) + +cherk_thread_LC.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -UDOUBLE -DCOMPLEX -DLOWER -DTRANS -DCONJ $< -o $(@F) + +zherk_thread_UN.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -DDOUBLE -DCOMPLEX -ULOWER -UTRANS -UCONJ $< -o $(@F) + +zherk_thread_UC.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -DDOUBLE -DCOMPLEX -ULOWER -DTRANS -DCONJ $< -o $(@F) + +zherk_thread_LN.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -DDOUBLE -DCOMPLEX -DLOWER -UTRANS -UCONJ $< -o $(@F) + +zherk_thread_LC.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -DDOUBLE -DCOMPLEX -DLOWER -DTRANS -DCONJ $< -o $(@F) + +xherk_thread_UN.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -ULOWER -UTRANS -UCONJ $< -o $(@F) + +xherk_thread_UC.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -ULOWER -DTRANS -DCONJ $< -o $(@F) + +xherk_thread_LN.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -DLOWER -UTRANS -UCONJ $< -o $(@F) + +xherk_thread_LC.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -DLOWER -DTRANS -DCONJ $< -o $(@F) + +cher2k_UN.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(CFLAGS) -DHER2K -UDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +cher2k_UC.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(CFLAGS) -DHER2K -UDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +cher2k_LN.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(CFLAGS) -DHER2K -UDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +cher2k_LC.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(CFLAGS) -DHER2K -UDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +zher2k_UN.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(CFLAGS) -DHER2K -DDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +zher2k_UC.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(CFLAGS) -DHER2K -DDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +zher2k_LN.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(CFLAGS) -DHER2K -DDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +zher2k_LC.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(CFLAGS) -DHER2K -DDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +xher2k_UN.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(CFLAGS) -DHER2K -DXDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +xher2k_UC.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(CFLAGS) -DHER2K -DXDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +xher2k_LN.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(CFLAGS) -DHER2K -DXDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +xher2k_LC.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(CFLAGS) -DHER2K -DXDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +cher2k_kernel_UN.$(SUFFIX) : zher2k_kernel.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) + +cher2k_kernel_UC.$(SUFFIX) : zher2k_kernel.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) + +cher2k_kernel_LN.$(SUFFIX) : zher2k_kernel.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) + +cher2k_kernel_LC.$(SUFFIX) : zher2k_kernel.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) + +zher2k_kernel_UN.$(SUFFIX) : zher2k_kernel.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) + +zher2k_kernel_UC.$(SUFFIX) : zher2k_kernel.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) + +zher2k_kernel_LN.$(SUFFIX) : zher2k_kernel.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) + +zher2k_kernel_LC.$(SUFFIX) : zher2k_kernel.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) + +xher2k_kernel_UN.$(SUFFIX) : zher2k_kernel.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) + +xher2k_kernel_UC.$(SUFFIX) : zher2k_kernel.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) + +xher2k_kernel_LN.$(SUFFIX) : zher2k_kernel.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) + +xher2k_kernel_LC.$(SUFFIX) : zher2k_kernel.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) + +cgemm3m_nn.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +cgemm3m_nt.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +cgemm3m_nr.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +cgemm3m_nc.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +cgemm3m_tn.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +cgemm3m_tt.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +cgemm3m_tr.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +cgemm3m_tc.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +cgemm3m_rn.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +cgemm3m_rt.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +cgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +cgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +cgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +cgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +cgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +cgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +zgemm3m_nn.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +zgemm3m_nt.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +zgemm3m_nr.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +zgemm3m_nc.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +zgemm3m_tn.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +zgemm3m_tt.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +zgemm3m_tr.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +zgemm3m_tc.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +zgemm3m_rn.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +zgemm3m_rt.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +zgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +zgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +zgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +zgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +zgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +zgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +xgemm3m_nn.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +xgemm3m_nt.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +xgemm3m_nr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +xgemm3m_nc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +xgemm3m_tn.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +xgemm3m_tt.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +xgemm3m_tr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +xgemm3m_tc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +xgemm3m_rn.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +xgemm3m_rt.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +xgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +xgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +xgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +xgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +xgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +xgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +cgemmf.$(SUFFIX) : zgemmf.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX $< -o $(@F) + +zgemmf.$(SUFFIX) : zgemmf.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX $< -o $(@F) + +xgemmf.$(SUFFIX) : zgemmf.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX $< -o $(@F) + +cgemm3m_thread_nn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +cgemm3m_thread_nt.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +cgemm3m_thread_nr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +cgemm3m_thread_nc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +cgemm3m_thread_tn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +cgemm3m_thread_tt.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +cgemm3m_thread_tr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +cgemm3m_thread_tc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +cgemm3m_thread_rn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +cgemm3m_thread_rt.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +cgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +cgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +cgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +cgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +cgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +cgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +zgemm3m_thread_nn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +zgemm3m_thread_nt.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +zgemm3m_thread_nr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +zgemm3m_thread_nc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +zgemm3m_thread_tn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +zgemm3m_thread_tt.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +zgemm3m_thread_tr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +zgemm3m_thread_tc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +zgemm3m_thread_rn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +zgemm3m_thread_rt.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +zgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +zgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +zgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +zgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +zgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +zgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +xgemm3m_thread_nn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +xgemm3m_thread_nt.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +xgemm3m_thread_nr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +xgemm3m_thread_nc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +xgemm3m_thread_tn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +xgemm3m_thread_tt.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +xgemm3m_thread_tr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +xgemm3m_thread_tc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +xgemm3m_thread_rn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +xgemm3m_thread_rt.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +xgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +xgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +xgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +xgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +xgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +xgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +csymm3m_LU.$(SUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +csymm3m_LL.$(SUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +csymm3m_RU.$(SUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +csymm3m_RL.$(SUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +zsymm3m_LU.$(SUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zsymm3m_LL.$(SUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zsymm3m_RU.$(SUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +zsymm3m_RL.$(SUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +xsymm3m_LU.$(SUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xsymm3m_LL.$(SUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xsymm3m_RU.$(SUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +xsymm3m_RL.$(SUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +csymm3m_thread_LU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +csymm3m_thread_LL.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +csymm3m_thread_RU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +csymm3m_thread_RL.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +zsymm3m_thread_LU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zsymm3m_thread_LL.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zsymm3m_thread_RU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +zsymm3m_thread_RL.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +xsymm3m_thread_LU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xsymm3m_thread_LL.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xsymm3m_thread_RU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +xsymm3m_thread_RL.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +chemm3m_LU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +chemm3m_LL.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +chemm3m_RU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +chemm3m_RL.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +zhemm3m_LU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zhemm3m_LL.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zhemm3m_RU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +zhemm3m_RL.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +xhemm3m_LU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xhemm3m_LL.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xhemm3m_RU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +xhemm3m_RL.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +chemm3m_thread_LU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +chemm3m_thread_LL.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +chemm3m_thread_RU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +chemm3m_thread_RL.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +zhemm3m_thread_LU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zhemm3m_thread_LL.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zhemm3m_thread_RU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +zhemm3m_thread_RL.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +xhemm3m_thread_LU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xhemm3m_thread_LL.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xhemm3m_thread_RU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +xhemm3m_thread_RL.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +strsm_LNUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +strsm_LNUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +strsm_LNLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +strsm_LNLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +strsm_LTUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +strsm_LTUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +strsm_LTLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +strsm_LTLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +strsm_RNUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +strsm_RNUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +strsm_RNLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +strsm_RNLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +strsm_RTUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +strsm_RTUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +strsm_RTLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +strsm_RTLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrsm_LNUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrsm_LNUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrsm_LNLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrsm_LNLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrsm_LTUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrsm_LTUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrsm_LTLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrsm_LTLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrsm_RNUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrsm_RNUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrsm_RNLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrsm_RNLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrsm_RTUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrsm_RTUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrsm_RTLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrsm_RTLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrsm_LNUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrsm_LNUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrsm_LNLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrsm_LNLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrsm_LTUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrsm_LTUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrsm_LTLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrsm_LTLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrsm_RNUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrsm_RNUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrsm_RNLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrsm_RNLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrsm_RTUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrsm_RTUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrsm_RTLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrsm_RTLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +ctrsm_LNUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_LNUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_LNLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_LNLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_LTUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_LTUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_LTLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_LTLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_LRUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_LRUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrsm_LRLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_LRLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrsm_LCUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_LCUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrsm_LCLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_LCLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrsm_RNUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_RNUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_RNLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_RNLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_RTUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_RTUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_RTLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_RTLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_RRUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_RRUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrsm_RRLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_RRLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrsm_RCUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_RCUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrsm_RCLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_RCLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + + +ztrsm_LNUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_LNUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_LNLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_LNLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_LTUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_LTUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_LTLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_LTLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_LRUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_LRUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrsm_LRLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_LRLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrsm_LCUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_LCUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrsm_LCLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_LCLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrsm_RNUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_RNUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_RNLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_RNLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_RTUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_RTUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_RTLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_RTLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_RRUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_RRUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrsm_RRLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_RRLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrsm_RCUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_RCUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrsm_RCLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_RCLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_LNUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_LNUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_LNLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_LNLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_LTUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_LTUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_LTLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_LTLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_LRUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_LRUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_LRLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_LRLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_LCUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_LCUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_LCLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_LCLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_RNUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_RNUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_RNLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_RNLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_RTUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_RTUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_RTLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_RTLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_RRUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_RRUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_RRLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_RRLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_RCUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_RCUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_RCLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_RCLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +sgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +sgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +sgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +sgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) + +dgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +dgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +dgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +dgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -UCOMPLEX -DTT $< -o $(@F) + +qgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +qgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +qgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +qgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -UCOMPLEX -DTT $< -o $(@F) + +cgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +cgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +cgemm_nr.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +cgemm_nc.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +cgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +cgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +cgemm_tr.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +cgemm_tc.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +cgemm_rn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +cgemm_rt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +cgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +cgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +cgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +cgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +cgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +cgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +zgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +zgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +zgemm_nr.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +zgemm_nc.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +zgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +zgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +zgemm_tr.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +zgemm_tc.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +zgemm_rn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +zgemm_rt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +zgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +zgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +zgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +zgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +zgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +zgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +xgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +xgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +xgemm_nr.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +xgemm_nc.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +xgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +xgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +xgemm_tr.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +xgemm_tc.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +xgemm_rn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +xgemm_rt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +xgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +xgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +xgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +xgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +xgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +xgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +gemm_thread_m.$(PSUFFIX) : gemm_thread_m.c ../../common.h + $(CC) -c $(PFLAGS) $< -o $(@F) + +gemm_thread_n.$(PSUFFIX) : gemm_thread_n.c ../../common.h + $(CC) -c $(PFLAGS) $< -o $(@F) + +gemm_thread_mn.$(PSUFFIX) : gemm_thread_mn.c ../../common.h + $(CC) -c $(PFLAGS) $< -o $(@F) + +gemm_thread_variable.$(PSUFFIX) : gemm_thread_variable.c ../../common.h + $(CC) -c $(PFLAGS) $< -o $(@F) + +beta_thread.$(PSUFFIX) : beta_thread.c ../../common.h + $(CC) -c $(PFLAGS) $< -o $(@F) + + +sgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +sgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +sgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +sgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) + +dgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +dgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +dgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +dgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -UCOMPLEX -DTT $< -o $(@F) + +qgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +qgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +qgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +qgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -UCOMPLEX -DTT $< -o $(@F) + +cgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +cgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +cgemm_thread_nr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +cgemm_thread_nc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +cgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +cgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +cgemm_thread_tr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +cgemm_thread_tc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +cgemm_thread_rn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +cgemm_thread_rt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +cgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +cgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +cgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +cgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +cgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +cgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +zgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +zgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +zgemm_thread_nr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +zgemm_thread_nc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +zgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +zgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +zgemm_thread_tr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +zgemm_thread_tc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +zgemm_thread_rn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +zgemm_thread_rt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +zgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +zgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +zgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +zgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +zgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +zgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +xgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +xgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +xgemm_thread_nr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +xgemm_thread_nc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +xgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +xgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +xgemm_thread_tr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +xgemm_thread_tc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +xgemm_thread_rn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +xgemm_thread_rt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +xgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +xgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +xgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +xgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +xgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +xgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +strmm_LNUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +strmm_LNUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +strmm_LNLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +strmm_LNLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +strmm_LTUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +strmm_LTUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +strmm_LTLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +strmm_LTLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +strmm_RNUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +strmm_RNUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +strmm_RNLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +strmm_RNLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +strmm_RTUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +strmm_RTUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +strmm_RTLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +strmm_RTLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrmm_LNUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrmm_LNUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrmm_LNLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrmm_LNLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrmm_LTUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrmm_LTUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrmm_LTLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrmm_LTLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrmm_RNUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrmm_RNUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrmm_RNLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrmm_RNLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrmm_RTUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrmm_RTUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrmm_RTLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrmm_RTLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrmm_LNUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrmm_LNUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrmm_LNLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrmm_LNLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrmm_LTUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrmm_LTUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrmm_LTLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrmm_LTLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrmm_RNUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrmm_RNUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrmm_RNLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrmm_RNLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrmm_RTUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrmm_RTUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrmm_RTLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrmm_RTLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +ctrmm_LNUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_LNUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_LNLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_LNLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_LTUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_LTUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_LTLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_LTLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_LRUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_LRUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrmm_LRLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_LRLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrmm_LCUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_LCUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrmm_LCLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_LCLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrmm_RNUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_RNUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_RNLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_RNLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_RTUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_RTUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_RTLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_RTLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_RRUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_RRUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrmm_RRLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_RRLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrmm_RCUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_RCUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrmm_RCLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_RCLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_LNUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_LNUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_LNLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_LNLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_LTUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_LTUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_LTLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_LTLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_LRUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_LRUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_LRLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_LRLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_LCUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_LCUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_LCLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_LCLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_RNUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_RNUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_RNLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_RNLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_RTUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_RTUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_RTLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_RTLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_RRUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_RRUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_RRLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_RRLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_RCUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_RCUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_RCLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_RCLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_LNUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_LNUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_LNLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_LNLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_LTUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_LTUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_LTLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_LTLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_LRUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_LRUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_LRLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_LRLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_LCUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_LCUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_LCLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_LCLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_RNUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_RNUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_RNLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_RNLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_RTUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_RTUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_RTLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_RTLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_RRUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_RRUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_RRLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_RRLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_RCUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_RCUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_RCLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_RCLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ssymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +ssymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +ssymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +ssymm_RL.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +dsymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +dsymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +dsymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +dsymm_RL.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +qsymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +qsymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +qsymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +qsymm_RL.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +csymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +csymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +csymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +csymm_RL.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +zsymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zsymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zsymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +zsymm_RL.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +xsymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xsymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xsymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +xsymm_RL.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +ssymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +ssymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +ssymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +ssymm_thread_RL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +dsymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +dsymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +dsymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +dsymm_thread_RL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +qsymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +qsymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +qsymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +qsymm_thread_RL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +csymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +csymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +csymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +csymm_thread_RL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +zsymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zsymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zsymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +zsymm_thread_RL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +xsymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xsymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xsymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +xsymm_thread_RL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +ssyrk_UN.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +ssyrk_UT.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +ssyrk_LN.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +ssyrk_LT.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +dsyrk_UN.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +dsyrk_UT.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +dsyrk_LN.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +dsyrk_LT.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +qsyrk_UN.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +qsyrk_UT.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +qsyrk_LN.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +qsyrk_LT.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +csyrk_UN.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +csyrk_UT.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +csyrk_LN.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +csyrk_LT.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +zsyrk_UN.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +zsyrk_UT.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +zsyrk_LN.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +zsyrk_LT.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +xsyrk_UN.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +xsyrk_UT.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +xsyrk_LN.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +xsyrk_LT.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +ssyrk_thread_UN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +ssyrk_thread_UT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +ssyrk_thread_LN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +ssyrk_thread_LT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +dsyrk_thread_UN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +dsyrk_thread_UT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +dsyrk_thread_LN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +dsyrk_thread_LT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +qsyrk_thread_UN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +qsyrk_thread_UT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +qsyrk_thread_LN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +qsyrk_thread_LT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +csyrk_thread_UN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +csyrk_thread_UT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +csyrk_thread_LN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +csyrk_thread_LT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +zsyrk_thread_UN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +zsyrk_thread_UT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +zsyrk_thread_LN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +zsyrk_thread_LT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +xsyrk_thread_UN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +xsyrk_thread_UT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +xsyrk_thread_LN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +xsyrk_thread_LT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +ssyrk_kernel_U.$(PSUFFIX) : syrk_kernel.c + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) + +ssyrk_kernel_L.$(PSUFFIX) : syrk_kernel.c + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) + +dsyrk_kernel_U.$(PSUFFIX) : syrk_kernel.c + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) + +dsyrk_kernel_L.$(PSUFFIX) : syrk_kernel.c + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) + +qsyrk_kernel_U.$(PSUFFIX) : syrk_kernel.c + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) + +qsyrk_kernel_L.$(PSUFFIX) : syrk_kernel.c + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) + +csyrk_kernel_U.$(PSUFFIX) : syrk_kernel.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +csyrk_kernel_L.$(PSUFFIX) : syrk_kernel.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +zsyrk_kernel_U.$(PSUFFIX) : syrk_kernel.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +zsyrk_kernel_L.$(PSUFFIX) : syrk_kernel.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +xsyrk_kernel_U.$(PSUFFIX) : syrk_kernel.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +xsyrk_kernel_L.$(PSUFFIX) : syrk_kernel.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +syrk_thread.$(PSUFFIX) : syrk_thread.c ../../common.h + $(CC) -c $(PFLAGS) $< -o $(@F) + +ssyr2k_UN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +ssyr2k_UT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +ssyr2k_LN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +ssyr2k_LT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +dsyr2k_UN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +dsyr2k_UT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +dsyr2k_LN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +dsyr2k_LT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +qsyr2k_UN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +qsyr2k_UT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +qsyr2k_LN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +qsyr2k_LT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +csyr2k_UN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +csyr2k_UT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +csyr2k_LN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +csyr2k_LT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +zsyr2k_UN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +zsyr2k_UT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +zsyr2k_LN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +zsyr2k_LT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +xsyr2k_UN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +xsyr2k_UT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +xsyr2k_LN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +xsyr2k_LT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +ssyr2k_kernel_U.$(PSUFFIX) : syr2k_kernel.c + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) + +ssyr2k_kernel_L.$(PSUFFIX) : syr2k_kernel.c + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) + +dsyr2k_kernel_U.$(PSUFFIX) : syr2k_kernel.c + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) + +dsyr2k_kernel_L.$(PSUFFIX) : syr2k_kernel.c + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) + +qsyr2k_kernel_U.$(PSUFFIX) : syr2k_kernel.c + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) + +qsyr2k_kernel_L.$(PSUFFIX) : syr2k_kernel.c + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) + +csyr2k_kernel_U.$(PSUFFIX) : syr2k_kernel.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +csyr2k_kernel_L.$(PSUFFIX) : syr2k_kernel.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +zsyr2k_kernel_U.$(PSUFFIX) : syr2k_kernel.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +zsyr2k_kernel_L.$(PSUFFIX) : syr2k_kernel.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +xsyr2k_kernel_U.$(PSUFFIX) : syr2k_kernel.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +xsyr2k_kernel_L.$(PSUFFIX) : syr2k_kernel.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +chemm_LU.$(PSUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +chemm_LL.$(PSUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +chemm_RU.$(PSUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) + +chemm_RL.$(PSUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) + +zhemm_LU.$(PSUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zhemm_LL.$(PSUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zhemm_RU.$(PSUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) + +zhemm_RL.$(PSUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) + +xhemm_LU.$(PSUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xhemm_LL.$(PSUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xhemm_RU.$(PSUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) + +xhemm_RL.$(PSUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) + +chemm_thread_LU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +chemm_thread_LL.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +chemm_thread_RU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) + +chemm_thread_RL.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) + +zhemm_thread_LU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zhemm_thread_LL.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zhemm_thread_RU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) + +zhemm_thread_RL.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) + +xhemm_thread_LU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xhemm_thread_LL.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xhemm_thread_RU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) + +xhemm_thread_RL.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) + +cherk_UN.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(PFLAGS) -DHERK -UDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +cherk_UC.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(PFLAGS) -DHERK -UDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +cherk_LN.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(PFLAGS) -DHERK -UDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +cherk_LC.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(PFLAGS) -DHERK -UDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +zherk_UN.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(PFLAGS) -DHERK -DDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +zherk_UC.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(PFLAGS) -DHERK -DDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +zherk_LN.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(PFLAGS) -DHERK -DDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +zherk_LC.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(PFLAGS) -DHERK -DDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +xherk_UN.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(PFLAGS) -DHERK -DXDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +xherk_UC.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(PFLAGS) -DHERK -DXDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +xherk_LN.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(PFLAGS) -DHERK -DXDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +xherk_LC.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(PFLAGS) -DHERK -DXDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +cherk_kernel_UN.$(PSUFFIX) : zherk_kernel.c + $(CC) -c $(PFLAGS) -DHERK -UDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) + +cherk_kernel_UC.$(PSUFFIX) : zherk_kernel.c + $(CC) -c $(PFLAGS) -DHERK -UDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) + +cherk_kernel_LN.$(PSUFFIX) : zherk_kernel.c + $(CC) -c $(PFLAGS) -DHERK -UDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) + +cherk_kernel_LC.$(PSUFFIX) : zherk_kernel.c + $(CC) -c $(PFLAGS) -DHERK -UDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) + +zherk_kernel_UN.$(PSUFFIX) : zherk_kernel.c + $(CC) -c $(PFLAGS) -DHERK -DDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) + +zherk_kernel_UC.$(PSUFFIX) : zherk_kernel.c + $(CC) -c $(PFLAGS) -DHERK -DDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) + +zherk_kernel_LN.$(PSUFFIX) : zherk_kernel.c + $(CC) -c $(PFLAGS) -DHERK -DDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) + +zherk_kernel_LC.$(PSUFFIX) : zherk_kernel.c + $(CC) -c $(PFLAGS) -DHERK -DDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) + +xherk_kernel_UN.$(PSUFFIX) : zherk_kernel.c + $(CC) -c $(PFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) + +xherk_kernel_UC.$(PSUFFIX) : zherk_kernel.c + $(CC) -c $(PFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) + +xherk_kernel_LN.$(PSUFFIX) : zherk_kernel.c + $(CC) -c $(PFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) + +xherk_kernel_LC.$(PSUFFIX) : zherk_kernel.c + $(CC) -c $(PFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) + +cherk_thread_UN.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -UDOUBLE -DCOMPLEX -ULOWER -UTRANS -UCONJ $< -o $(@F) + +cherk_thread_UC.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -UDOUBLE -DCOMPLEX -ULOWER -DTRANS -DCONJ $< -o $(@F) + +cherk_thread_LN.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -UDOUBLE -DCOMPLEX -DLOWER -UTRANS -UCONJ $< -o $(@F) + +cherk_thread_LC.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -UDOUBLE -DCOMPLEX -DLOWER -DTRANS -DCONJ $< -o $(@F) + +zherk_thread_UN.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -DDOUBLE -DCOMPLEX -ULOWER -UTRANS -UCONJ $< -o $(@F) + +zherk_thread_UC.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -DDOUBLE -DCOMPLEX -ULOWER -DTRANS -DCONJ $< -o $(@F) + +zherk_thread_LN.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -DDOUBLE -DCOMPLEX -DLOWER -UTRANS -UCONJ $< -o $(@F) + +zherk_thread_LC.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -DDOUBLE -DCOMPLEX -DLOWER -DTRANS -DCONJ $< -o $(@F) + +xherk_thread_UN.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -ULOWER -UTRANS -UCONJ $< -o $(@F) + +xherk_thread_UC.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -ULOWER -DTRANS -DCONJ $< -o $(@F) + +xherk_thread_LN.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -DLOWER -UTRANS -UCONJ $< -o $(@F) + +xherk_thread_LC.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -DLOWER -DTRANS -DCONJ $< -o $(@F) + +cher2k_UN.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(PFLAGS) -DHER2K -UDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +cher2k_UC.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(PFLAGS) -DHER2K -UDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +cher2k_LN.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(PFLAGS) -DHER2K -UDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +cher2k_LC.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(PFLAGS) -DHER2K -UDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +zher2k_UN.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(PFLAGS) -DHER2K -DDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +zher2k_UC.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(PFLAGS) -DHER2K -DDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +zher2k_LN.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(PFLAGS) -DHER2K -DDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +zher2k_LC.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(PFLAGS) -DHER2K -DDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +xher2k_UN.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(PFLAGS) -DHER2K -DXDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +xher2k_UC.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(PFLAGS) -DHER2K -DXDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +xher2k_LN.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(PFLAGS) -DHER2K -DXDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +xher2k_LC.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(PFLAGS) -DHER2K -DXDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +cher2k_kernel_UN.$(PSUFFIX) : zher2k_kernel.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) + +cher2k_kernel_UC.$(PSUFFIX) : zher2k_kernel.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) + +cher2k_kernel_LN.$(PSUFFIX) : zher2k_kernel.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) + +cher2k_kernel_LC.$(PSUFFIX) : zher2k_kernel.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) + +zher2k_kernel_UN.$(PSUFFIX) : zher2k_kernel.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) + +zher2k_kernel_UC.$(PSUFFIX) : zher2k_kernel.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) + +zher2k_kernel_LN.$(PSUFFIX) : zher2k_kernel.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) + +zher2k_kernel_LC.$(PSUFFIX) : zher2k_kernel.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) + +xher2k_kernel_UN.$(PSUFFIX) : zher2k_kernel.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) + +xher2k_kernel_UC.$(PSUFFIX) : zher2k_kernel.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) + +xher2k_kernel_LN.$(PSUFFIX) : zher2k_kernel.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) + +xher2k_kernel_LC.$(PSUFFIX) : zher2k_kernel.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) + +cgemm3m_nn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +cgemm3m_nt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +cgemm3m_nr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +cgemm3m_nc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +cgemm3m_tn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +cgemm3m_tt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +cgemm3m_tr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +cgemm3m_tc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +cgemm3m_rn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +cgemm3m_rt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +cgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +cgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +cgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +cgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +cgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +cgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +zgemm3m_nn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +zgemm3m_nt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +zgemm3m_nr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +zgemm3m_nc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +zgemm3m_tn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +zgemm3m_tt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +zgemm3m_tr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +zgemm3m_tc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +zgemm3m_rn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +zgemm3m_rt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +zgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +zgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +zgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +zgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +zgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +zgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +xgemm3m_nn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +xgemm3m_nt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +xgemm3m_nr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +xgemm3m_nc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +xgemm3m_tn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +xgemm3m_tt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +xgemm3m_tr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +xgemm3m_tc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +xgemm3m_rn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +xgemm3m_rt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +xgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +xgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +xgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +xgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +xgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +xgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +cgemmf.$(PSUFFIX) : zgemmf.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX $< -o $(@F) + +zgemmf.$(PSUFFIX) : zgemmf.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX $< -o $(@F) + +xgemmf.$(PSUFFIX) : zgemmf.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX $< -o $(@F) + +cgemm3m_thread_nn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +cgemm3m_thread_nt.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +cgemm3m_thread_nr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +cgemm3m_thread_nc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +cgemm3m_thread_tn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +cgemm3m_thread_tt.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +cgemm3m_thread_tr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +cgemm3m_thread_tc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +cgemm3m_thread_rn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +cgemm3m_thread_rt.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +cgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +cgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +cgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +cgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +cgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +cgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +zgemm3m_thread_nn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +zgemm3m_thread_nt.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +zgemm3m_thread_nr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +zgemm3m_thread_nc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +zgemm3m_thread_tn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +zgemm3m_thread_tt.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +zgemm3m_thread_tr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +zgemm3m_thread_tc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +zgemm3m_thread_rn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +zgemm3m_thread_rt.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +zgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +zgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +zgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +zgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +zgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +zgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +xgemm3m_thread_nn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +xgemm3m_thread_nt.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +xgemm3m_thread_nr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +xgemm3m_thread_nc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +xgemm3m_thread_tn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +xgemm3m_thread_tt.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +xgemm3m_thread_tr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +xgemm3m_thread_tc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +xgemm3m_thread_rn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +xgemm3m_thread_rt.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +xgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +xgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +xgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +xgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +xgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +xgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +csymm3m_LU.$(PSUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +csymm3m_LL.$(PSUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +csymm3m_RU.$(PSUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +csymm3m_RL.$(PSUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +zsymm3m_LU.$(PSUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zsymm3m_LL.$(PSUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zsymm3m_RU.$(PSUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +zsymm3m_RL.$(PSUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +xsymm3m_LU.$(PSUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xsymm3m_LL.$(PSUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xsymm3m_RU.$(PSUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +xsymm3m_RL.$(PSUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +csymm3m_thread_LU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +csymm3m_thread_LL.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +csymm3m_thread_RU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +csymm3m_thread_RL.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +zsymm3m_thread_LU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zsymm3m_thread_LL.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zsymm3m_thread_RU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +zsymm3m_thread_RL.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +xsymm3m_thread_LU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xsymm3m_thread_LL.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xsymm3m_thread_RU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +xsymm3m_thread_RL.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +chemm3m_LU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +chemm3m_LL.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +chemm3m_RU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +chemm3m_RL.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +zhemm3m_LU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zhemm3m_LL.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zhemm3m_RU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +zhemm3m_RL.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +xhemm3m_LU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xhemm3m_LL.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xhemm3m_RU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +xhemm3m_RL.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +chemm3m_thread_LU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +chemm3m_thread_LL.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +chemm3m_thread_RU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +chemm3m_thread_RL.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +zhemm3m_thread_LU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zhemm3m_thread_LL.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zhemm3m_thread_RU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +zhemm3m_thread_RL.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +xhemm3m_thread_LU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xhemm3m_thread_LL.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xhemm3m_thread_RU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +xhemm3m_thread_RL.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +strsm_LNUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +strsm_LNUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +strsm_LNLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +strsm_LNLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +strsm_LTUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +strsm_LTUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +strsm_LTLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +strsm_LTLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +strsm_RNUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +strsm_RNUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +strsm_RNLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +strsm_RNLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +strsm_RTUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +strsm_RTUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +strsm_RTLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +strsm_RTLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrsm_LNUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrsm_LNUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrsm_LNLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrsm_LNLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrsm_LTUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrsm_LTUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrsm_LTLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrsm_LTLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrsm_RNUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrsm_RNUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrsm_RNLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrsm_RNLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrsm_RTUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrsm_RTUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrsm_RTLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrsm_RTLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrsm_LNUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrsm_LNUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrsm_LNLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrsm_LNLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrsm_LTUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrsm_LTUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrsm_LTLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrsm_LTLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrsm_RNUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrsm_RNUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrsm_RNLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrsm_RNLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrsm_RTUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrsm_RTUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrsm_RTLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrsm_RTLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +ctrsm_LNUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_LNUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_LNLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_LNLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_LTUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_LTUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_LTLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_LTLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_LRUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_LRUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrsm_LRLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_LRLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrsm_LCUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_LCUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrsm_LCLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_LCLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrsm_RNUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_RNUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_RNLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_RNLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_RTUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_RTUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_RTLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_RTLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_RRUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_RRUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrsm_RRLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_RRLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrsm_RCUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_RCUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrsm_RCLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_RCLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + + +ztrsm_LNUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_LNUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_LNLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_LNLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_LTUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_LTUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_LTLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_LTLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_LRUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_LRUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrsm_LRLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_LRLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrsm_LCUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_LCUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrsm_LCLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_LCLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrsm_RNUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_RNUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_RNLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_RNLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_RTUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_RTUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_RTLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_RTLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_RRUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_RRUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrsm_RRLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_RRLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrsm_RCUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_RCUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrsm_RCLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_RCLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_LNUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_LNUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_LNLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_LNLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_LTUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_LTUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_LTLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_LTLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_LRUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_LRUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_LRLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_LRLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_LCUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_LCUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_LCLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_LCLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_RNUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_RNUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_RNLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_RNLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_RTUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_RTUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_RTLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_RTLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_RRUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_RRUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_RRLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_RRLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_RCUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_RCUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_RCLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_RCLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +include ../../Makefile.tail diff --git a/driver/level3/gemm.c b/driver/level3/gemm.c new file mode 100644 index 0000000..2b13da7 --- /dev/null +++ b/driver/level3/gemm.c @@ -0,0 +1,66 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#undef TIMING + +#ifdef PARAMTEST +#undef GEMM_P +#undef GEMM_Q +#undef GEMM_R + +#define GEMM_P (args -> gemm_p) +#define GEMM_Q (args -> gemm_q) +#define GEMM_R (args -> gemm_r) +#endif + +#if 0 +#undef GEMM_P +#undef GEMM_Q + +#define GEMM_P 504 +#define GEMM_Q 128 +#endif + +#ifdef THREADED_LEVEL3 +#include "level3_thread.c" +#else +#include "level3.c" +#endif diff --git a/driver/level3/gemm3m.c b/driver/level3/gemm3m.c new file mode 100644 index 0000000..8f31cf5 --- /dev/null +++ b/driver/level3/gemm3m.c @@ -0,0 +1,58 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#undef TIMING + +#ifdef PARAMTEST +#undef GEMM_P +#undef GEMM_Q +#undef GEMM_R + +#define GEMM_P (args -> gemm_p) +#define GEMM_Q (args -> gemm_q) +#define GEMM_R (args -> gemm_r) +#endif + +#ifdef THREADED_LEVEL3 +#include "level3_gemm3m_thread.c" +#else +#include "gemm3m_level3.c" +#endif diff --git a/driver/level3/gemm3m_level3.c b/driver/level3/gemm3m_level3.c new file mode 100644 index 0000000..8c5473c --- /dev/null +++ b/driver/level3/gemm3m_level3.c @@ -0,0 +1,531 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef BETA_OPERATION +#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ + GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ + BETA[0], BETA[1], NULL, 0, NULL, 0, \ + (FLOAT *)(C) + (M_FROM) + (N_FROM) * (LDC) * COMPSIZE, LDC) +#endif + +#ifndef ICOPYB_OPERATION +#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ + defined(RN) || defined(RT) || defined(RC) || defined(RR) +#define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ + GEMM3M_ITCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER) +#else +#define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ + GEMM3M_INCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER) +#endif +#endif + +#ifndef ICOPYR_OPERATION +#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ + defined(RN) || defined(RT) || defined(RC) || defined(RR) +#define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ + GEMM3M_ITCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER) +#else +#define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ + GEMM3M_INCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER) +#endif +#endif + +#ifndef ICOPYI_OPERATION +#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ + defined(RN) || defined(RT) || defined(RC) || defined(RR) +#define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ + GEMM3M_ITCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER) +#else +#define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ + GEMM3M_INCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER) +#endif +#endif + + +#ifndef OCOPYB_OPERATION +#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ + defined(NR) || defined(TR) || defined(CR) || defined(RR) +#define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + GEMM3M_ONCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER) +#else +#define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + GEMM3M_OTCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER) +#endif +#endif + +#ifndef OCOPYR_OPERATION +#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ + defined(NR) || defined(TR) || defined(CR) || defined(RR) +#define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + GEMM3M_ONCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER) +#else +#define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + GEMM3M_OTCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER) +#endif +#endif + + +#ifndef OCOPYI_OPERATION +#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ + defined(NR) || defined(TR) || defined(CR) || defined(RR) +#define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + GEMM3M_ONCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER) +#else +#define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + GEMM3M_OTCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER) +#endif +#endif + +#ifndef KERNEL_FUNC +#define KERNEL_FUNC GEMM3M_KERNEL +#endif + +#ifndef KERNEL_OPERATION +#define KERNEL_OPERATION(M, N, K, ALPHA_R, ALPHA_I, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA_R, ALPHA_I, SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) +#endif + +#ifndef A +#define A args -> a +#endif +#ifndef LDA +#define LDA args -> lda +#endif +#ifndef B +#define B args -> b +#endif +#ifndef LDB +#define LDB args -> ldb +#endif +#ifndef C +#define C args -> c +#endif +#ifndef LDC +#define LDC args -> ldc +#endif +#ifndef M +#define M args -> m +#endif +#ifndef N +#define N args -> n +#endif +#ifndef K +#define K args -> k +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ALPHA1 ONE +#define ALPHA2 ONE +#define ALPHA5 ZERO +#define ALPHA6 ONE + +#define ALPHA7 ONE +#define ALPHA8 ZERO +#define ALPHA11 ONE +#define ALPHA12 -ONE + +#define ALPHA13 ZERO +#define ALPHA14 ONE +#define ALPHA17 -ONE +#define ALPHA18 -ONE +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ALPHA1 ONE +#define ALPHA2 ONE +#define ALPHA5 ONE +#define ALPHA6 ZERO + +#define ALPHA7 ZERO +#define ALPHA8 ONE +#define ALPHA11 -ONE +#define ALPHA12 -ONE + +#define ALPHA13 ONE +#define ALPHA14 ZERO +#define ALPHA17 -ONE +#define ALPHA18 ONE +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ALPHA1 ONE +#define ALPHA2 ONE +#define ALPHA5 ONE +#define ALPHA6 ZERO + +#define ALPHA7 ZERO +#define ALPHA8 ONE +#define ALPHA11 -ONE +#define ALPHA12 ONE + +#define ALPHA13 ONE +#define ALPHA14 ZERO +#define ALPHA17 -ONE +#define ALPHA18 -ONE +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define ALPHA1 ONE +#define ALPHA2 ONE +#define ALPHA5 ZERO +#define ALPHA6 -ONE + +#define ALPHA7 ONE +#define ALPHA8 ZERO +#define ALPHA11 ONE +#define ALPHA12 ONE + +#define ALPHA13 ZERO +#define ALPHA14 ONE +#define ALPHA17 -ONE +#define ALPHA18 ONE +#endif + +#ifdef TIMING +#define START_RPCC() rpcc_counter = rpcc() +#define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter +#else +#define START_RPCC() +#define STOP_RPCC(COUNTER) +#endif + +int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, + FLOAT *sa, FLOAT *sb, BLASLONG dummy){ + BLASLONG k, lda, ldb, ldc; + FLOAT *alpha, *beta; + FLOAT *a, *b, *c; + BLASLONG m_from, m_to, n_from, n_to; + + BLASLONG ls, is, js, jjs; + BLASLONG min_l, min_i, min_j, min_jj; + +#ifdef TIMING + BLASULONG rpcc_counter; + BLASULONG BLASLONG innercost = 0; + BLASULONG BLASLONG outercost = 0; + BLASULONG BLASLONG kernelcost = 0; + double total; +#endif + + k = K; + + a = (FLOAT *)A; + b = (FLOAT *)B; + c = (FLOAT *)C; + + lda = LDA; + ldb = LDB; + ldc = LDC; + + alpha = (FLOAT *)args -> alpha; + beta = (FLOAT *)args -> beta; + + m_from = 0; + m_to = M; + + if (range_m) { + m_from = *(((BLASLONG *)range_m) + 0); + m_to = *(((BLASLONG *)range_m) + 1); + } + + n_from = 0; + n_to = N; + + if (range_n) { + n_from = *(((BLASLONG *)range_n) + 0); + n_to = *(((BLASLONG *)range_n) + 1); + } + + if (beta) { +#ifndef COMPLEX + if (beta[0] != ONE) +#else + if ((beta[0] != ONE) || (beta[1] != ZERO)) +#endif + BETA_OPERATION(m_from, m_to, n_from, n_to, beta, c, ldc); + } + + if ((k == 0) || (alpha == NULL)) return 0; + + if ((alpha[0] == ZERO) +#ifdef COMPLEX + && (alpha[1] == ZERO) +#endif + ) return 0; + +#if 0 + printf("GEMM: M_from : %ld M_to : %ld N_from : %ld N_to : %ld k : %ld\n", m_from, m_to, n_from, n_to, k); + printf("GEMM: P = %4ld Q = %4ld R = %4ld\n", (BLASLONG)GEMM3M_P, (BLASLONG)GEMM3M_Q, (BLASLONG)GEMM3M_R); + printf("GEMM: SA .. %p SB .. %p\n", sa, sb); +#endif + +#ifdef DEBUG + innercost = 0; + outercost = 0; + kernelcost = 0; +#endif + + for(js = n_from; js < n_to; js += GEMM3M_R){ + min_j = n_to - js; + if (min_j > GEMM3M_R) min_j = GEMM3M_R; + + for(ls = 0; ls < k; ls += min_l){ + min_l = k - ls; + + if (min_l >= GEMM3M_Q * 2) { + min_l = GEMM3M_Q; + } else { + if (min_l > GEMM3M_Q) { + min_l = (min_l + 1) / 2; +#ifdef UNROLL_X + min_l = (min_l + UNROLL_X - 1) & ~(UNROLL_X - 1); +#endif + } + } + + min_i = m_to - m_from; + if (min_i >= GEMM3M_P * 2) { + min_i = GEMM3M_P; + } else { + if (min_i > GEMM3M_P) { + min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); + } + } + + START_RPCC(); + + ICOPYB_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); + + STOP_RPCC(innercost); + + for(jjs = js; jjs < js + min_j; jjs += min_jj){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; + + START_RPCC(); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || defined(RN) || defined(RT) || defined(CN) || defined(CT) + OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js)); +#else + OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js)); +#endif + + STOP_RPCC(outercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA5, ALPHA6, + sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs); + + STOP_RPCC(kernelcost); + + } + + for(is = m_from + min_i; is < m_to; is += min_i){ + min_i = m_to - is; + if (min_i >= GEMM3M_P * 2) { + min_i = GEMM3M_P; + } else + if (min_i > GEMM3M_P) { + min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); + } + + START_RPCC(); + + ICOPYB_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + STOP_RPCC(innercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_j, min_l, ALPHA5, ALPHA6, sa, sb, c, ldc, is, js); + + STOP_RPCC(kernelcost); + } + + min_i = m_to - m_from; + if (min_i >= GEMM3M_P * 2) { + min_i = GEMM3M_P; + } else { + if (min_i > GEMM3M_P) { + min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); + } + } + + START_RPCC(); + + ICOPYR_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); + + STOP_RPCC(innercost); + + for(jjs = js; jjs < js + min_j; jjs += min_jj){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; + + START_RPCC(); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js)); +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) + OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js)); +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js)); +#else + OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js)); +#endif + + STOP_RPCC(outercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA11, ALPHA12, + sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs); + + STOP_RPCC(kernelcost); + + } + + for(is = m_from + min_i; is < m_to; is += min_i){ + min_i = m_to - is; + if (min_i >= GEMM3M_P * 2) { + min_i = GEMM3M_P; + } else + if (min_i > GEMM3M_P) { + min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); + } + + START_RPCC(); + + ICOPYR_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + STOP_RPCC(innercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_j, min_l, ALPHA11, ALPHA12, sa, sb, c, ldc, is, js); + + STOP_RPCC(kernelcost); + + } + + min_i = m_to - m_from; + if (min_i >= GEMM3M_P * 2) { + min_i = GEMM3M_P; + } else { + if (min_i > GEMM3M_P) { + min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); + } + } + + START_RPCC(); + + ICOPYI_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); + + STOP_RPCC(innercost); + + for(jjs = js; jjs < js + min_j; jjs += min_jj){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; + + START_RPCC(); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js)); +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) + OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js)); +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js)); +#else + OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js)); +#endif + + STOP_RPCC(outercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA17, ALPHA18, + sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs); + + STOP_RPCC(kernelcost); + + } + + for(is = m_from + min_i; is < m_to; is += min_i){ + min_i = m_to - is; + if (min_i >= GEMM3M_P * 2) { + min_i = GEMM3M_P; + } else + if (min_i > GEMM3M_P) { + min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); + } + + START_RPCC(); + + ICOPYI_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + STOP_RPCC(innercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_j, min_l, ALPHA17, ALPHA18, sa, sb, c, ldc, is, js); + + STOP_RPCC(kernelcost); + + } + + } /* end of js */ + } /* end of ls */ + + +#ifdef TIMING + total = (double)outercost + (double)innercost + (double)kernelcost; + + printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f\n", + innercost / total * 100., outercost / total * 100., + kernelcost / total * 100.); + + printf( " Total %10.3f%% %10.3f MFlops\n", + ((double)(m_to - m_from) * (double)(n_to - n_from) * (double)k) / (double)kernelcost / 2 * 100, + 2400. * (2. * (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k) / (double)kernelcost); +#endif + + return 0; +} diff --git a/driver/level3/gemm_thread_m.c b/driver/level3/gemm_thread_m.c new file mode 100644 index 0000000..52c9b2d --- /dev/null +++ b/driver/level3/gemm_thread_m.c @@ -0,0 +1,90 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) { + + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range[MAX_CPU_NUMBER + 1]; + + BLASLONG width, i, num_cpu; + + if (!range_m) { + range[0] = 0; + i = arg -> m; + } else { + range[0] = range_m[0]; + i = range_m[1] - range_m[0]; + } + + num_cpu = 0; + + while (i > 0){ + + width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); + + i -= width; + if (i < 0) width = width + i; + + range[num_cpu + 1] = range[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = function; + queue[num_cpu].args = arg; + queue[num_cpu].range_m = &range[num_cpu]; + queue[num_cpu].range_n = range_n; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + num_cpu ++; + } + + if (num_cpu) { + queue[0].sa = sa; + queue[0].sb = sb; + + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + return 0; +} diff --git a/driver/level3/gemm_thread_mn.c b/driver/level3/gemm_thread_mn.c new file mode 100644 index 0000000..321e88f --- /dev/null +++ b/driver/level3/gemm_thread_mn.c @@ -0,0 +1,148 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +static const int divide_rule[][2] = + {{ 0, 0}, + { 1, 1}, { 1, 2}, { 1, 3}, { 2, 2}, + { 1, 5}, { 2, 3}, { 1, 7}, { 2, 4}, + { 3, 3}, { 2, 5}, { 1, 11}, { 2, 6}, + { 1, 13}, { 2, 7}, { 3, 5}, { 4, 4}, + { 1, 17}, { 3, 6}, { 1, 19}, { 4, 5}, + { 3, 7}, { 2, 11}, { 1, 23}, { 4, 6}, + { 5, 5}, { 2, 13}, { 3, 9}, { 4, 7}, + { 1, 29}, { 5, 6}, { 1, 31}, { 4, 8}, + { 3, 11}, { 2, 17}, { 5, 7}, { 6, 6}, + { 1, 37}, { 2, 19}, { 3, 13}, { 5, 8}, + { 1, 41}, { 6, 7}, { 1, 43}, { 4, 11}, + { 5, 9}, { 2, 23}, { 1, 47}, { 6, 8}, + { 7, 7}, { 5, 10}, { 3, 17}, { 4, 13}, + { 1, 53}, { 6, 9}, { 5, 11}, { 7, 8}, + { 3, 19}, { 2, 29}, { 1, 59}, { 6, 10}, + { 1, 61}, { 2, 31}, { 7, 9}, { 8, 8}, +}; + +int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) { + + blas_queue_t queue[MAX_CPU_NUMBER]; + + BLASLONG range_M[MAX_CPU_NUMBER + 1], range_N[MAX_CPU_NUMBER + 1]; + BLASLONG procs, total_procs, num_cpu_m, num_cpu_n; + + BLASLONG width, i, j; + BLASLONG divM, divN; + + divM = divide_rule[nthreads][0]; + divN = divide_rule[nthreads][1]; + + if (!range_m) { + range_M[0] = 0; + i = arg -> m; + } else { + range_M[0] = range_M[0]; + i = range_M[1] - range_M[0]; + } + + num_cpu_m = 0; + + while (i > 0){ + + width = blas_quickdivide(i + divM - num_cpu_m - 1, divM - num_cpu_m); + + i -= width; + if (i < 0) width = width + i; + + range_M[num_cpu_m + 1] = range_M[num_cpu_m] + width; + + num_cpu_m ++; + } + + if (!range_n) { + range_N[0] = 0; + i = arg -> n; + } else { + range_N[0] = range_n[0]; + i = range_n[1] - range_n[0]; + } + + num_cpu_n = 0; + + while (i > 0){ + + width = blas_quickdivide(i + divN - num_cpu_n - 1, divN - num_cpu_n); + + i -= width; + if (i < 0) width = width + i; + + range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width; + + num_cpu_n ++; + } + + procs = 0; + + for (j = 0; j < num_cpu_n; j++) { + for (i = 0; i < num_cpu_m; i++) { + + queue[procs].mode = mode; + queue[procs].routine = function; + queue[procs].args = arg; + queue[procs].range_m = &range_M[i]; + queue[procs].range_n = &range_N[j]; + queue[procs].sa = NULL; + queue[procs].sb = NULL; + queue[procs].next = &queue[procs + 1]; + + procs ++; + } + } + + if (procs) { + queue[0].sa = sa; + queue[0].sb = sb; + + queue[procs - 1].next = NULL; + + exec_blas(procs, queue); + } + + return 0; +} diff --git a/driver/level3/gemm_thread_n.c b/driver/level3/gemm_thread_n.c new file mode 100644 index 0000000..ba54612 --- /dev/null +++ b/driver/level3/gemm_thread_n.c @@ -0,0 +1,91 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) { + + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range[MAX_CPU_NUMBER + 1]; + + BLASLONG width, i, num_cpu; + + if (!range_n) { + range[0] = 0; + i = arg -> n; + } else { + range[0] = range_n[0]; + i = range_n[1] - range_n[0]; + } + + num_cpu = 0; + + while (i > 0){ + + width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); + + i -= width; + if (i < 0) width = width + i; + + range[num_cpu + 1] = range[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = function; + queue[num_cpu].args = arg; + queue[num_cpu].range_m = range_m; + queue[num_cpu].range_n = &range[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + num_cpu ++; + } + + if (num_cpu) { + queue[0].sa = sa; + queue[0].sb = sb; + + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, + queue); + } + + return 0; +} diff --git a/driver/level3/gemm_thread_variable.c b/driver/level3/gemm_thread_variable.c new file mode 100644 index 0000000..9d83e95 --- /dev/null +++ b/driver/level3/gemm_thread_variable.c @@ -0,0 +1,127 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(int mode, + blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, + int (*function)(), void *sa, void *sb, BLASLONG divM, BLASLONG divN) { + + blas_queue_t queue[MAX_CPU_NUMBER]; + + BLASLONG range_M[MAX_CPU_NUMBER + 1], range_N[MAX_CPU_NUMBER + 1]; + BLASLONG procs, num_cpu_m, num_cpu_n; + + BLASLONG width, i, j; + + if (!range_m) { + range_M[0] = 0; + i = arg -> m; + } else { + range_M[0] = range_M[0]; + i = range_M[1] - range_M[0]; + } + + num_cpu_m = 0; + + while (i > 0){ + + width = blas_quickdivide(i + divM - num_cpu_m - 1, divM - num_cpu_m); + + i -= width; + if (i < 0) width = width + i; + + range_M[num_cpu_m + 1] = range_M[num_cpu_m] + width; + + num_cpu_m ++; + } + + if (!range_n) { + range_N[0] = 0; + i = arg -> n; + } else { + range_N[0] = range_n[0]; + i = range_n[1] - range_n[0]; + } + + num_cpu_n = 0; + + while (i > 0){ + + width = blas_quickdivide(i + divN - num_cpu_n - 1, divN - num_cpu_n); + + i -= width; + if (i < 0) width = width + i; + + range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width; + + num_cpu_n ++; + } + + procs = 0; + + for (j = 0; j < num_cpu_n; j++) { + for (i = 0; i < num_cpu_m; i++) { + + queue[procs].mode = mode; + queue[procs].routine = function; + queue[procs].args = arg; + queue[procs].range_m = &range_M[i]; + queue[procs].range_n = &range_N[j]; + queue[procs].sa = NULL; + queue[procs].sb = NULL; + queue[procs].next = &queue[procs + 1]; + + procs ++; + } + } + + if (procs) { + queue[0].sa = sa; + queue[0].sb = sb; + + queue[procs - 1].next = NULL; + + exec_blas(procs, queue); + } + + return 0; +} + diff --git a/driver/level3/hemm3m_k.c b/driver/level3/hemm3m_k.c new file mode 100644 index 0000000..2f3cf82 --- /dev/null +++ b/driver/level3/hemm3m_k.c @@ -0,0 +1,99 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#undef TIMING + +#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ + GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ + BETA[0], BETA[1], NULL, 0, NULL, 0, \ + (FLOAT *)(C) + (M_FROM) + (N_FROM) * (LDC) * COMPSIZE, LDC) + +#ifndef RSIDE +#ifndef LOWER +#define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM3M_IUCOPYB(M, N, A, LDA, Y, X, BUFFER) +#define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM3M_IUCOPYR(M, N, A, LDA, Y, X, BUFFER) +#define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM3M_IUCOPYI(M, N, A, LDA, Y, X, BUFFER) +#else +#define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM3M_ILCOPYB(M, N, A, LDA, Y, X, BUFFER) +#define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM3M_ILCOPYR(M, N, A, LDA, Y, X, BUFFER) +#define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM3M_ILCOPYI(M, N, A, LDA, Y, X, BUFFER) +#endif +#endif + +#ifdef RSIDE +#ifndef LOWER +#define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + HEMM3M_OUCOPYB(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) +#define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + HEMM3M_OUCOPYR(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) +#define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + HEMM3M_OUCOPYI(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) +#else +#define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + HEMM3M_OLCOPYB(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) +#define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + HEMM3M_OLCOPYR(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) +#define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + HEMM3M_OLCOPYI(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) +#endif +#endif + +#ifndef RSIDE +#define K args -> m +#ifndef LOWER +#define GEMM3M_LOCAL HEMM3M_LU +#else +#define GEMM3M_LOCAL HEMM3M_LL +#endif +#else +#define K args -> n +#ifndef LOWER +#define GEMM3M_LOCAL HEMM3M_RU +#else +#define GEMM3M_LOCAL HEMM3M_RL +#endif +#endif + +#ifdef THREADED_LEVEL3 +#include "level3_gemm3m_thread.c" +#else +#include "gemm3m_level3.c" +#endif diff --git a/driver/level3/level3.c b/driver/level3/level3.c new file mode 100644 index 0000000..62b310a --- /dev/null +++ b/driver/level3/level3.c @@ -0,0 +1,401 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +/* This file is a template for level 3 operation */ + +#ifndef BETA_OPERATION +#if !defined(XDOUBLE) || !defined(QUAD_PRECISION) +#ifndef COMPLEX +#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ + GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ + BETA[0], NULL, 0, NULL, 0, \ + (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC) +#else +#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ + GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ + BETA[0], BETA[1], NULL, 0, NULL, 0, \ + (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC) +#endif +#else +#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ + GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ + BETA, NULL, 0, NULL, 0, \ + (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC) +#endif +#endif + +#ifndef ICOPY_OPERATION +#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ + defined(RN) || defined(RT) || defined(RC) || defined(RR) +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef OCOPY_OPERATION +#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ + defined(NR) || defined(TR) || defined(CR) || defined(RR) +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef KERNEL_FUNC +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define KERNEL_FUNC GEMM_KERNEL_N +#endif +#if defined(CN) || defined(CT) || defined(RN) || defined(RT) +#define KERNEL_FUNC GEMM_KERNEL_L +#endif +#if defined(NC) || defined(TC) || defined(NR) || defined(TR) +#define KERNEL_FUNC GEMM_KERNEL_R +#endif +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +#define KERNEL_FUNC GEMM_KERNEL_B +#endif +#endif + +#ifndef KERNEL_OPERATION +#if !defined(XDOUBLE) || !defined(QUAD_PRECISION) +#ifndef COMPLEX +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) +#else +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) +#endif +#else +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA, SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) +#endif +#endif + +#ifndef FUSED_KERNEL_OPERATION +#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ + defined(NR) || defined(TR) || defined(CR) || defined(RR) +#ifndef COMPLEX +#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \ + FUSED_GEMM_KERNEL_N(M, N, K, ALPHA[0], SA, SB, \ + (FLOAT *)(B) + ((L) + (J) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) +#else +#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \ + FUSED_GEMM_KERNEL_N(M, N, K, ALPHA[0], ALPHA[1], SA, SB, \ + (FLOAT *)(B) + ((L) + (J) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) + +#endif +#else +#ifndef COMPLEX +#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \ + FUSED_GEMM_KERNEL_T(M, N, K, ALPHA[0], SA, SB, \ + (FLOAT *)(B) + ((J) + (L) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) +#else +#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \ + FUSED_GEMM_KERNEL_T(M, N, K, ALPHA[0], ALPHA[1], SA, SB, \ + (FLOAT *)(B) + ((J) + (L) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) +#endif +#endif +#endif + +#ifndef A +#define A args -> a +#endif +#ifndef LDA +#define LDA args -> lda +#endif +#ifndef B +#define B args -> b +#endif +#ifndef LDB +#define LDB args -> ldb +#endif +#ifndef C +#define C args -> c +#endif +#ifndef LDC +#define LDC args -> ldc +#endif +#ifndef M +#define M args -> m +#endif +#ifndef N +#define N args -> n +#endif +#ifndef K +#define K args -> k +#endif + +#ifdef TIMING +#define START_RPCC() rpcc_counter = rpcc() +#define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter +#else +#define START_RPCC() +#define STOP_RPCC(COUNTER) +#endif + +int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, + XFLOAT *sa, XFLOAT *sb, BLASLONG dummy){ + BLASLONG k, lda, ldb, ldc; + FLOAT *alpha, *beta; + FLOAT *a, *b, *c; + BLASLONG m_from, m_to, n_from, n_to; + + BLASLONG ls, is, js; + BLASLONG min_l, min_i, min_j; +#if !defined(FUSED_GEMM) || defined(TIMING) + BLASLONG jjs, min_jj; +#endif + + BLASLONG l1stride, gemm_p, l2size; + +#if defined(XDOUBLE) && defined(QUAD_PRECISION) + xidouble xalpha; +#endif + +#ifdef TIMING + unsigned long long rpcc_counter; + unsigned long long innercost = 0; + unsigned long long outercost = 0; + unsigned long long kernelcost = 0; + double total; +#endif + + k = K; + + a = (FLOAT *)A; + b = (FLOAT *)B; + c = (FLOAT *)C; + + lda = LDA; + ldb = LDB; + ldc = LDC; + + alpha = (FLOAT *)args -> alpha; + beta = (FLOAT *)args -> beta; + + m_from = 0; + m_to = M; + + if (range_m) { + m_from = *(((BLASLONG *)range_m) + 0); + m_to = *(((BLASLONG *)range_m) + 1); + } + + n_from = 0; + n_to = N; + + if (range_n) { + n_from = *(((BLASLONG *)range_n) + 0); + n_to = *(((BLASLONG *)range_n) + 1); + } + + if (beta) { +#if !defined(XDOUBLE) || !defined(QUAD_PRECISION) +#ifndef COMPLEX + if (beta[0] != ONE +#else + if ((beta[0] != ONE) || (beta[1] != ZERO) +#endif +#else + if (((beta[0].x[1] != 0x3fff000000000000UL) || beta[0].x[0] != 0) +#ifdef COMPLEX + &&(((beta[1].x[0] | beta[1].x[1]) << 1) != 0) +#endif +#endif + ) { +#if defined(XDOUBLE) && defined(QUAD_PRECISION) + xidouble xbeta; + + qtox(&xbeta, beta); +#endif + BETA_OPERATION(m_from, m_to, n_from, n_to, beta, c, ldc); + } + } + + if ((k == 0) || (alpha == NULL)) return 0; + +#if !defined(XDOUBLE) || !defined(QUAD_PRECISION) + if ((alpha[0] == ZERO) +#ifdef COMPLEX + && (alpha[1] == ZERO) +#endif + ) return 0; +#else + if (((alpha[0].x[0] | alpha[0].x[1] +#ifdef COMPLEX + | alpha[1].x[0] | alpha[1].x[1] +#endif + ) << 1) == 0) return 0; +#endif + +#if defined(XDOUBLE) && defined(QUAD_PRECISION) + qtox(&xalpha, alpha); +#endif + + l2size = GEMM_P * GEMM_Q; + +#if 0 + fprintf(stderr, "GEMM(Single): M_from : %ld M_to : %ld N_from : %ld N_to : %ld k : %ld\n", m_from, m_to, n_from, n_to, k); + fprintf(stderr, "GEMM(Single):: P = %4ld Q = %4ld R = %4ld\n", (BLASLONG)GEMM_P, (BLASLONG)GEMM_Q, (BLASLONG)GEMM_R); + // fprintf(stderr, "GEMM: SA .. %p SB .. %p\n", sa, sb); + + // fprintf(stderr, "A = %p B = %p C = %p\n\tlda = %ld ldb = %ld ldc = %ld\n", a, b, c, lda, ldb, ldc); +#endif + +#ifdef DEBUG + innercost = 0; + outercost = 0; + kernelcost = 0; +#endif + + for(js = n_from; js < n_to; js += GEMM_R){ + min_j = n_to - js; + if (min_j > GEMM_R) min_j = GEMM_R; + + for(ls = 0; ls < k; ls += min_l){ + + min_l = k - ls; + + if (min_l >= GEMM_Q * 2) { + gemm_p = GEMM_P; + min_l = GEMM_Q; + } else { + if (min_l > GEMM_Q) { + min_l = (min_l / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1); + } + gemm_p = ((l2size / min_l + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1)); + while (gemm_p * min_l > l2size) gemm_p -= GEMM_UNROLL_M; + } + + /* First, we have to move data A to L2 cache */ + min_i = m_to - m_from; + l1stride = 1; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else { + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1); + } else { + l1stride = 0; + } + } + + START_RPCC(); + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); + + STOP_RPCC(innercost); + +#if defined(FUSED_GEMM) && !defined(TIMING) + + FUSED_KERNEL_OPERATION(min_i, min_j, min_l, alpha, + sa, sb, b, ldb, c, ldc, m_from, js, ls); + + +#else + for(jjs = js; jjs < js + min_j; jjs += min_jj){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + START_RPCC(); + + OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, + sb + min_l * (jjs - js) * COMPSIZE * l1stride); + + STOP_RPCC(outercost); + + START_RPCC(); + +#if !defined(XDOUBLE) || !defined(QUAD_PRECISION) + KERNEL_OPERATION(min_i, min_jj, min_l, alpha, + sa, sb + min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); +#else + KERNEL_OPERATION(min_i, min_jj, min_l, (void *)&xalpha, + sa, sb + min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); +#endif + + STOP_RPCC(kernelcost); + } +#endif + + for(is = m_from + min_i; is < m_to; is += min_i){ + min_i = m_to - is; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1); + } + + START_RPCC(); + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + STOP_RPCC(innercost); + + START_RPCC(); + +#if !defined(XDOUBLE) || !defined(QUAD_PRECISION) + KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js); +#else + KERNEL_OPERATION(min_i, min_j, min_l, (void *)&xalpha, sa, sb, c, ldc, is, js); +#endif + + STOP_RPCC(kernelcost); + + } /* end of is */ + } /* end of js */ + } /* end of ls */ + + +#ifdef TIMING + total = (double)outercost + (double)innercost + (double)kernelcost; + + printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f kernel Effi. : %5.2f Total Effi. : %5.2f\n", + innercost / total * 100., outercost / total * 100., + kernelcost / total * 100., + (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / (double)kernelcost * 100. * (double)COMPSIZE / 2., + (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / total * 100. * (double)COMPSIZE / 2.); + +#endif + + return 0; +} diff --git a/driver/level3/level3_gemm3m_thread.c b/driver/level3/level3_gemm3m_thread.c new file mode 100644 index 0000000..bddb5eb --- /dev/null +++ b/driver/level3/level3_gemm3m_thread.c @@ -0,0 +1,1015 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef CACHE_LINE_SIZE +#define CACHE_LINE_SIZE 8 +#endif + +#ifndef DIVIDE_RATE +#define DIVIDE_RATE 2 +#endif + +#ifndef SWITCH_RATIO +#define SWITCH_RATIO 2 +#endif + +#ifndef GEMM3M_LOCAL +#if defined(NN) +#define GEMM3M_LOCAL GEMM3M_NN +#elif defined(NT) +#define GEMM3M_LOCAL GEMM3M_NT +#elif defined(NR) +#define GEMM3M_LOCAL GEMM3M_NR +#elif defined(NC) +#define GEMM3M_LOCAL GEMM3M_NC +#elif defined(TN) +#define GEMM3M_LOCAL GEMM3M_TN +#elif defined(TT) +#define GEMM3M_LOCAL GEMM3M_TT +#elif defined(TR) +#define GEMM3M_LOCAL GEMM3M_TR +#elif defined(TC) +#define GEMM3M_LOCAL GEMM3M_TC +#elif defined(RN) +#define GEMM3M_LOCAL GEMM3M_RN +#elif defined(RT) +#define GEMM3M_LOCAL GEMM3M_RT +#elif defined(RR) +#define GEMM3M_LOCAL GEMM3M_RR +#elif defined(RC) +#define GEMM3M_LOCAL GEMM3M_RC +#elif defined(CN) +#define GEMM3M_LOCAL GEMM3M_CN +#elif defined(CT) +#define GEMM3M_LOCAL GEMM3M_CT +#elif defined(CR) +#define GEMM3M_LOCAL GEMM3M_CR +#elif defined(CC) +#define GEMM3M_LOCAL GEMM3M_CC +#endif +#endif + +typedef struct { + volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; +} job_t; + + +#ifndef BETA_OPERATION +#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ + GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ + BETA[0], BETA[1], NULL, 0, NULL, 0, \ + (FLOAT *)(C) + (M_FROM) + (N_FROM) * (LDC) * COMPSIZE, LDC) +#endif + +#ifndef ICOPYB_OPERATION +#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ + defined(RN) || defined(RT) || defined(RC) || defined(RR) +#define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ + GEMM3M_ITCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ + GEMM3M_INCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef ICOPYR_OPERATION +#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ + defined(RN) || defined(RT) || defined(RC) || defined(RR) +#define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ + GEMM3M_ITCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ + GEMM3M_INCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef ICOPYI_OPERATION +#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ + defined(RN) || defined(RT) || defined(RC) || defined(RR) +#define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ + GEMM3M_ITCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ + GEMM3M_INCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + + +#ifndef OCOPYB_OPERATION +#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ + defined(NR) || defined(TR) || defined(CR) || defined(RR) +#define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + GEMM3M_ONCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER); +#else +#define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + GEMM3M_OTCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER); +#endif +#endif + +#ifndef OCOPYR_OPERATION +#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ + defined(NR) || defined(TR) || defined(CR) || defined(RR) +#define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + GEMM3M_ONCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER); +#else +#define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + GEMM3M_OTCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER); +#endif +#endif + + +#ifndef OCOPYI_OPERATION +#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ + defined(NR) || defined(TR) || defined(CR) || defined(RR) +#define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + GEMM3M_ONCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER); +#else +#define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + GEMM3M_OTCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER); +#endif +#endif + +#ifndef KERNEL_FUNC +#define KERNEL_FUNC GEMM3M_KERNEL +#endif + +#ifndef KERNEL_OPERATION +#define KERNEL_OPERATION(M, N, K, ALPHA_R, ALPHA_I, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA_R, ALPHA_I, SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) +#endif + +#ifndef A +#define A args -> a +#endif +#ifndef LDA +#define LDA args -> lda +#endif +#ifndef B +#define B args -> b +#endif +#ifndef LDB +#define LDB args -> ldb +#endif +#ifndef C +#define C args -> c +#endif +#ifndef LDC +#define LDC args -> ldc +#endif +#ifndef M +#define M args -> m +#endif +#ifndef N +#define N args -> n +#endif +#ifndef K +#define K args -> k +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ALPHA1 ONE +#define ALPHA2 ONE +#define ALPHA5 ZERO +#define ALPHA6 ONE + +#define ALPHA7 ONE +#define ALPHA8 ZERO +#define ALPHA11 ONE +#define ALPHA12 -ONE + +#define ALPHA13 ZERO +#define ALPHA14 ONE +#define ALPHA17 -ONE +#define ALPHA18 -ONE +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ALPHA1 ONE +#define ALPHA2 ONE +#define ALPHA5 ONE +#define ALPHA6 ZERO + +#define ALPHA7 ZERO +#define ALPHA8 ONE +#define ALPHA11 -ONE +#define ALPHA12 -ONE + +#define ALPHA13 ONE +#define ALPHA14 ZERO +#define ALPHA17 -ONE +#define ALPHA18 ONE +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ALPHA1 ONE +#define ALPHA2 ONE +#define ALPHA5 ONE +#define ALPHA6 ZERO + +#define ALPHA7 ZERO +#define ALPHA8 ONE +#define ALPHA11 -ONE +#define ALPHA12 ONE + +#define ALPHA13 ONE +#define ALPHA14 ZERO +#define ALPHA17 -ONE +#define ALPHA18 -ONE +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define ALPHA1 ONE +#define ALPHA2 ONE +#define ALPHA5 ZERO +#define ALPHA6 -ONE + +#define ALPHA7 ONE +#define ALPHA8 ZERO +#define ALPHA11 ONE +#define ALPHA12 ONE + +#define ALPHA13 ZERO +#define ALPHA14 ONE +#define ALPHA17 -ONE +#define ALPHA18 ONE +#endif + +#ifdef TIMING +#define START_RPCC() rpcc_counter = rpcc() +#define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter +#else +#define START_RPCC() +#define STOP_RPCC(COUNTER) +#endif + +static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + + BLASLONG k, lda, ldb, ldc; + BLASLONG m_from, m_to, n_from, n_to, N_from, N_to; + + FLOAT *alpha, *beta; + FLOAT *a, *b, *c; + job_t *job = (job_t *)args -> common; + BLASLONG xxx, bufferside; + FLOAT *buffer[DIVIDE_RATE]; + + BLASLONG ls, min_l, jjs, min_jj; + BLASLONG is, min_i, div_n; + BLASLONG i, current; + +#ifdef TIMING + BLASLONG rpcc_counter; + BLASLONG copy_A = 0; + BLASLONG copy_B = 0; + BLASLONG kernel = 0; + BLASLONG waiting1 = 0; + BLASLONG waiting2 = 0; + BLASLONG waiting3 = 0; + BLASLONG waiting6[MAX_CPU_NUMBER]; + BLASLONG ops = 0; + + for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0; +#endif + + k = K; + + a = (FLOAT *)A; + b = (FLOAT *)B; + c = (FLOAT *)C; + + lda = LDA; + ldb = LDB; + ldc = LDC; + + alpha = (FLOAT *)args -> alpha; + beta = (FLOAT *)args -> beta; + + m_from = 0; + m_to = M; + + if (range_m) { + m_from = range_m[0]; + m_to = range_m[1]; + } + + n_from = 0; + n_to = N; + + N_from = 0; + N_to = N; + + if (range_n) { + n_from = range_n[mypos + 0]; + n_to = range_n[mypos + 1]; + + N_from = range_n[0]; + N_to = range_n[args -> nthreads]; + } + + if (beta) { + if ((beta[0] != ONE) || (beta[1] != ZERO)) + BETA_OPERATION(m_from, m_to, N_from, N_to, beta, c, ldc); + } + + if ((k == 0) || (alpha == NULL)) return 0; + + if ((alpha[0] == ZERO) && (alpha[1] == ZERO)) return 0; + +#if 0 + fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld N_from : %ld N_to : %ld\n", + mypos, m_from, m_to, n_from, n_to, N_from, N_to); +#endif + + div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; + + buffer[0] = sb; + for (i = 1; i < DIVIDE_RATE; i++) { + buffer[i] = buffer[i - 1] + GEMM3M_Q * ((div_n + GEMM3M_UNROLL_N - 1) & ~(GEMM3M_UNROLL_N - 1)); + } + + for(ls = 0; ls < k; ls += min_l){ + min_l = k - ls; + if (min_l >= GEMM3M_Q * 2) { + min_l = GEMM3M_Q; + } else { + if (min_l > GEMM3M_Q) { + min_l = (min_l + 1) / 2; + } + } + + min_i = m_to - m_from; + + if (min_i >= GEMM3M_P * 2) { + min_i = GEMM3M_P; + } else { + if (min_i > GEMM3M_P) { + min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); + } + } + + + START_RPCC(); + + ICOPYB_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); + + STOP_RPCC(copy_A); + + div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; + + for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) { + + START_RPCC(); + + /* Make sure if no one is using another buffer */ + for (i = 0; i < args -> nthreads; i++) + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; + + STOP_RPCC(waiting1); + + for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ + min_jj = MIN(n_to, xxx + div_n) - jjs; + if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; + + START_RPCC(); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || defined(RN) || defined(RT) || defined(CN) || defined(CT) + OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); +#else + OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); +#endif + + STOP_RPCC(copy_B); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA5, ALPHA6, + sa, buffer[bufferside] + min_l * (jjs - xxx), + c, ldc, m_from, jjs); + + STOP_RPCC(kernel); +#ifdef TIMING + ops += 2 * min_i * min_jj * min_l; +#endif + + } + + for (i = 0; i < args -> nthreads; i++) + job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; + } + + current = mypos; + + do { + current ++; + if (current >= args -> nthreads) current = 0; + + div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + if (current != mypos) { + + START_RPCC(); + + /* thread has to wait */ + while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; + + STOP_RPCC(waiting2); + + START_RPCC(); + + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA5, ALPHA6, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, ldc, m_from, xxx); + + STOP_RPCC(kernel); +#ifdef TIMING + ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; +#endif + } + + if (m_to - m_from == min_i) { + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + } + } + } while (current != mypos); + + for(is = m_from + min_i; is < m_to; is += min_i){ + min_i = m_to - is; + if (min_i >= GEMM3M_P * 2) { + min_i = GEMM3M_P; + } else + if (min_i > GEMM3M_P) { + min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); + } + + START_RPCC(); + + ICOPYB_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + STOP_RPCC(copy_A); + + current = mypos; + do { + + div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + START_RPCC(); + + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA5, ALPHA6, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, ldc, is, xxx); + + STOP_RPCC(kernel); +#ifdef TIMING + ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l; +#endif + if (is + min_i >= m_to) { + /* Thread doesn't need this buffer any more */ + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + } + } + + current ++; + if (current >= args -> nthreads) current = 0; + + } while (current != mypos); + + } /* end of is */ + + START_RPCC(); + + ICOPYR_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); + + STOP_RPCC(copy_A); + + div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; + + for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) { + + START_RPCC(); + + /* Make sure if no one is using another buffer */ + for (i = 0; i < args -> nthreads; i++) + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; + + STOP_RPCC(waiting1); + + for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ + min_jj = MIN(n_to, xxx + div_n) - jjs; + if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; + + START_RPCC(); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) + OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); +#else + OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); +#endif + + STOP_RPCC(copy_B); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA11, ALPHA12, + sa, buffer[bufferside] + min_l * (jjs - xxx), + c, ldc, m_from, jjs); + + STOP_RPCC(kernel); +#ifdef TIMING + ops += 2 * min_i * min_jj * min_l; +#endif + + } + + for (i = 0; i < args -> nthreads; i++) + job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; + } + + current = mypos; + + do { + current ++; + if (current >= args -> nthreads) current = 0; + + div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + if (current != mypos) { + + START_RPCC(); + + /* thread has to wait */ + while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; + + STOP_RPCC(waiting2); + + START_RPCC(); + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA11, ALPHA12, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, ldc, m_from, xxx); + + STOP_RPCC(kernel); +#ifdef TIMING + ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; +#endif + } + + if (m_to - m_from == min_i) { + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + } + } + } while (current != mypos); + + for(is = m_from + min_i; is < m_to; is += min_i){ + min_i = m_to - is; + if (min_i >= GEMM3M_P * 2) { + min_i = GEMM3M_P; + } else + if (min_i > GEMM3M_P) { + min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); + } + + START_RPCC(); + + ICOPYR_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + STOP_RPCC(copy_A); + + current = mypos; + do { + + div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + START_RPCC(); + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA11, ALPHA12, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, ldc, is, xxx); + + STOP_RPCC(kernel); +#ifdef TIMING + ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l; +#endif + if (is + min_i >= m_to) { + /* Thread doesn't need this buffer any more */ + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + } + } + + current ++; + if (current >= args -> nthreads) current = 0; + + } while (current != mypos); + + } /* end of is */ + + + START_RPCC(); + + ICOPYI_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); + + STOP_RPCC(copy_A); + + div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; + + for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) { + + START_RPCC(); + + /* Make sure if no one is using another buffer */ + for (i = 0; i < args -> nthreads; i++) + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; + + STOP_RPCC(waiting1); + + for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ + min_jj = MIN(n_to, xxx + div_n) - jjs; + if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; + + START_RPCC(); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) + OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); +#else + OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); +#endif + + STOP_RPCC(copy_B); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA17, ALPHA18, + sa, buffer[bufferside] + min_l * (jjs - xxx), + c, ldc, m_from, jjs); + + STOP_RPCC(kernel); +#ifdef TIMING + ops += 2 * min_i * min_jj * min_l; +#endif + + } + + for (i = 0; i < args -> nthreads; i++) + job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; + } + + current = mypos; + + do { + current ++; + if (current >= args -> nthreads) current = 0; + + div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + if (current != mypos) { + + START_RPCC(); + + /* thread has to wait */ + while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; + + STOP_RPCC(waiting2); + + START_RPCC(); + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA17, ALPHA18, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, ldc, m_from, xxx); + + STOP_RPCC(kernel); +#ifdef TIMING + ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; +#endif + } + + if (m_to - m_from == min_i) { + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + } + } + } while (current != mypos); + + for(is = m_from + min_i; is < m_to; is += min_i){ + min_i = m_to - is; + if (min_i >= GEMM3M_P * 2) { + min_i = GEMM3M_P; + } else + if (min_i > GEMM3M_P) { + min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); + } + + START_RPCC(); + + ICOPYI_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + STOP_RPCC(copy_A); + + current = mypos; + do { + + div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + START_RPCC(); + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA17, ALPHA18, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, ldc, is, xxx); + + STOP_RPCC(kernel); +#ifdef TIMING + ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l; +#endif + if (is + min_i >= m_to) { + /* Thread doesn't need this buffer any more */ + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + } + } + + current ++; + if (current >= args -> nthreads) current = 0; + + } while (current != mypos); + + } /* end of is */ + + } + + START_RPCC(); + + for (i = 0; i < args -> nthreads; i++) { + for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { + while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; + } + } + + STOP_RPCC(waiting3); + +#ifdef TIMING + BLASLONG waiting = waiting1 + waiting2 + waiting3; + BLASLONG total = copy_A + copy_B + kernel + waiting; + + fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2f Copy_B : %6.2f Wait : %6.2f Kernel : %6.2f\n", + mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100., + (double)waiting /(double)total * 100., + (double)ops/(double)kernel / 2. * 100.); + + fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2ld Copy_B : %6.2ld Wait : %6.2ld\n", + mypos, copy_A, copy_B, waiting); + +#if 0 + fprintf(stderr, "Waiting[%2ld] %6.2f %6.2f %6.2f\n", + mypos, + (double)waiting1/(double)waiting * 100., + (double)waiting2/(double)waiting * 100., + (double)waiting3/(double)waiting * 100.); +#endif + fprintf(stderr, "\n"); +#endif + + + + return 0; +} + +static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG + *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + + blas_arg_t newarg; + + blas_queue_t queue[MAX_CPU_NUMBER]; + + BLASLONG range_M[MAX_CPU_NUMBER + 1]; + BLASLONG range_N[MAX_CPU_NUMBER + 1]; + + job_t job[MAX_CPU_NUMBER]; + + BLASLONG num_cpu_m, num_cpu_n; + + BLASLONG nthreads = args -> nthreads; + + BLASLONG width, i, j, k, js; + BLASLONG m, n, n_from, n_to; + int mode; + +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL | BLAS_NODE; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL | BLAS_NODE; +#else + mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE; +#endif + + newarg.m = args -> m; + newarg.n = args -> n; + newarg.k = args -> k; + newarg.a = args -> a; + newarg.b = args -> b; + newarg.c = args -> c; + newarg.lda = args -> lda; + newarg.ldb = args -> ldb; + newarg.ldc = args -> ldc; + newarg.alpha = args -> alpha; + newarg.beta = args -> beta; + newarg.nthreads = args -> nthreads; + newarg.common = (void *)job; + + if (!range_m) { + range_M[0] = 0; + m = args -> m; + } else { + range_M[0] = range_m[0]; + m = range_m[1] - range_m[0]; + } + + num_cpu_m = 0; + + while (m > 0){ + + width = blas_quickdivide(m + nthreads - num_cpu_m - 1, nthreads - num_cpu_m); + + m -= width; + if (m < 0) width = width + m; + + range_M[num_cpu_m + 1] = range_M[num_cpu_m] + width; + + num_cpu_m ++; + } + + for (i = 0; i < num_cpu_m; i++) { + queue[i].mode = mode; + queue[i].routine = inner_thread; + queue[i].args = &newarg; + queue[i].range_m = &range_M[i]; + queue[i].range_n = &range_N[0]; + queue[i].sa = NULL; + queue[i].sb = NULL; + queue[i].next = &queue[i + 1]; + } + + queue[0].sa = sa; + queue[0].sb = sb; + + if (!range_n) { + n_from = 0; + n_to = args -> n; + } else { + n_from = range_n[0]; + n_to = range_n[1]; + } + + for(js = n_from; js < n_to; js += GEMM_R * nthreads){ + n = n_to - js; + if (n > GEMM_R * nthreads) n = GEMM_R * nthreads; + + range_N[0] = js; + + num_cpu_n = 0; + + while (n > 0){ + + width = blas_quickdivide(n + nthreads - num_cpu_n - 1, nthreads - num_cpu_n); + + n -= width; + if (n < 0) width = width + n; + + range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width; + + num_cpu_n ++; + } + + for (j = 0; j < num_cpu_m; j++) { + for (i = 0; i < num_cpu_m; i++) { + for (k = 0; k < DIVIDE_RATE; k++) { + job[j].working[i][CACHE_LINE_SIZE * k] = 0; + } + } + } + + queue[num_cpu_m - 1].next = NULL; + + exec_blas(num_cpu_m, queue); + } + + return 0; +} + +int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + + BLASLONG m = args -> m; + BLASLONG n = args -> n; + BLASLONG nthreads = args -> nthreads; + BLASLONG divN, divT; + int mode; + + if (range_m) { + BLASLONG m_from = *(((BLASLONG *)range_m) + 0); + BLASLONG m_to = *(((BLASLONG *)range_m) + 1); + + m = m_to - m_from; + } + + if (range_n) { + BLASLONG n_from = *(((BLASLONG *)range_n) + 0); + BLASLONG n_to = *(((BLASLONG *)range_n) + 1); + + n = n_to - n_from; + } + + if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) { + GEMM3M_LOCAL(args, range_m, range_n, sa, sb, 0); + return 0; + } + + divT = nthreads; + divN = 1; + + while ((GEMM3M_P * divT > m * SWITCH_RATIO) && (divT > 1)) { + do { + divT --; + divN = 1; + while (divT * divN < nthreads) divN ++; + } while ((divT * divN != nthreads) && (divT > 1)); + } + + args -> nthreads = divT; + + if (divN == 1){ + gemm_driver(args, range_m, range_n, sa, sb, 0); + } else { +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif + +#if defined(TN) || defined(TT) || defined(TR) || defined(TC) || \ + defined(CN) || defined(CT) || defined(CR) || defined(CC) + mode |= (BLAS_TRANSA_T); +#endif +#if defined(NT) || defined(TT) || defined(RT) || defined(CT) || \ + defined(NC) || defined(TC) || defined(RC) || defined(CC) + mode |= (BLAS_TRANSB_T); +#endif + + gemm_thread_n(mode, args, range_m, range_n, gemm_driver, sa, sb, divN); + } + + return 0; +} diff --git a/driver/level3/level3_syr2k.c b/driver/level3/level3_syr2k.c new file mode 100644 index 0000000..2db1857 --- /dev/null +++ b/driver/level3/level3_syr2k.c @@ -0,0 +1,418 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef KERNEL_OPERATION +#ifndef COMPLEX +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y, FLAG) \ + KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y), FLAG) +#else +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y, FLAG) \ + KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y), FLAG) +#endif +#endif + +#ifndef KERNEL_OPERATION_C +#define KERNEL_OPERATION_C KERNEL_OPERATION +#endif + +#ifndef ICOPY_OPERATION +#ifndef TRANS +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef OCOPY_OPERATION +#ifdef TRANS +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef M +#define M args -> n +#endif + +#ifndef N +#define N args -> n +#endif + +#ifndef K +#define K args -> k +#endif + +#ifndef A +#define A args -> a +#endif + +#ifndef B +#define B args -> b +#endif + +#ifndef C +#define C args -> c +#endif + +#ifndef LDA +#define LDA args -> lda +#endif + +#ifndef LDB +#define LDB args -> ldb +#endif + +#ifndef LDC +#define LDC args -> ldc +#endif + +int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) { + + BLASLONG m_from, m_to, n_from, n_to, k, lda, ldb, ldc; + FLOAT *a, *b, *c, *alpha, *beta; + + BLASLONG ls, is, js; + BLASLONG min_l, min_i, min_j; + BLASLONG jjs, min_jj; + BLASLONG m_start, m_end; + + FLOAT *aa; + + k = K; + + a = (FLOAT *)A; + b = (FLOAT *)B; + c = (FLOAT *)C; + + lda = LDA; + ldb = LDB; + ldc = LDC; + + alpha = (FLOAT *)args -> alpha; + beta = (FLOAT *)args -> beta; + + m_from = 0; + m_to = M; + + if (range_m) { + m_from = *(((BLASLONG *)range_m) + 0); + m_to = *(((BLASLONG *)range_m) + 1); + } + + n_from = 0; + n_to = N; + + if (range_n) { + n_from = *(((BLASLONG *)range_n) + 0); + n_to = *(((BLASLONG *)range_n) + 1); + } + + if (beta) { +#if !defined(COMPLEX) || defined(HER2K) + if (beta[0] != ONE) +#else + if ((beta[0] != ONE) || (beta[1] != ZERO)) +#endif + syrk_beta(m_from, m_to, n_from, n_to, beta, c, ldc); + } + + if ((k == 0) || (alpha == NULL)) return 0; + + if ((alpha[0] == ZERO) +#ifdef COMPLEX + && (alpha[1] == ZERO) +#endif + ) return 0; + + for(js = n_from; js < n_to; js += GEMM_R){ + min_j = n_to - js; + if (min_j > GEMM_R) min_j = GEMM_R; + +#ifndef LOWER + m_start = m_from; + m_end = js + min_j; + if (m_end > m_to) m_end = m_to; +#else + m_start = m_from; + m_end = m_to; + if (m_start < js) m_start = js; +#endif + + for(ls = 0; ls < k; ls += min_l){ + min_l = k - ls; + if (min_l >= GEMM_Q * 2) { + min_l = GEMM_Q; + } else + if (min_l > GEMM_Q) { + min_l = (min_l + 1) / 2; + } + + min_i = m_end - m_start; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + +#ifndef LOWER + + if (m_start >= js) { + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa); + + aa = sb + min_l * (m_start - js) * COMPSIZE; + + OCOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, aa); + + KERNEL_OPERATION(min_i, min_i, min_l, alpha, sa, aa, c, ldc, m_start, m_start, 1); + + jjs = m_start + min_i; + + } else { + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa); + + jjs = js; + } + + for(; jjs < js + min_j; jjs += GEMM_UNROLL_MN){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; + + OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); + + KERNEL_OPERATION(min_i, min_jj, min_l, alpha, + sa, sb + min_l * (jjs - js) * COMPSIZE, + c, ldc, m_start, jjs, 1); + } + + for(is = m_start + min_i; is < m_end; is += min_i){ + min_i = m_end - is; + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 1); + + } + + min_i = m_end - m_start; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + + if (m_start >= js) { + + ICOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, sa); + + aa = sb + min_l * (m_start - js) * COMPSIZE; + + OCOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, aa); + + KERNEL_OPERATION_C(min_i, min_i, min_l, alpha, sa, aa, c, ldc, m_start, m_start, 0); + + jjs = m_start + min_i; + + } else { + + ICOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, sa); + + jjs = js; + } + + for(; jjs < js + min_j; jjs += GEMM_UNROLL_MN){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; + + OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); + + KERNEL_OPERATION_C(min_i, min_jj, min_l, alpha, + sa, sb + min_l * (jjs - js) * COMPSIZE, + c, ldc, m_start, jjs, 0); + } + + for(is = m_start + min_i; is < m_end; is += min_i){ + min_i = m_end - is; + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + + ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa); + + KERNEL_OPERATION_C(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 0); + + } + +#else + + aa = sb + min_l * (m_start - js) * COMPSIZE; + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa); + + OCOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, aa); + + KERNEL_OPERATION(min_i, MIN(min_i, min_j + js - m_start), min_l, alpha, + sa, aa, c, ldc, m_start, m_start, 1); + + for(jjs = js; jjs < m_start; jjs += GEMM_UNROLL_MN){ + min_jj = m_start - jjs; + if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; + + OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); + + KERNEL_OPERATION(min_i, min_jj, min_l, alpha, + sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs, 1); + } + + for(is = m_start + min_i; is < m_end; is += min_i){ + + min_i = m_end - is; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + + aa = sb + min_l * (is - js) * COMPSIZE; + + if (is < js + min_j) { + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + OCOPY_OPERATION(min_l, min_i, b, ldb, ls, is, aa); + + KERNEL_OPERATION(min_i, MIN(min_i, min_j - is + js), min_l, alpha, sa, aa, c, ldc, is, is, 1); + + KERNEL_OPERATION(min_i, is - js, min_l, alpha, sa, sb, c, ldc, is, js, 1); + + } else { + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 1); + + } + + } + + min_i = m_end - m_start; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + + aa = sb + min_l * (m_start - js) * COMPSIZE; + + ICOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, sa); + + OCOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, aa); + + KERNEL_OPERATION_C(min_i, MIN(min_i, min_j + js - m_start), min_l, alpha, + sa, aa, c, ldc, m_start, m_start, 0); + + for(jjs = js; jjs < m_start; jjs += GEMM_UNROLL_MN){ + min_jj = m_start - jjs; + if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; + + OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); + + KERNEL_OPERATION_C(min_i, min_jj, min_l, alpha, + sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs, 0); + } + + for(is = m_start + min_i; is < m_end; is += min_i){ + + min_i = m_end - is; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + + aa = sb + min_l * (is - js) * COMPSIZE; + + if (is < js + min_j) { + + ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa); + + OCOPY_OPERATION(min_l, min_i, a, lda, ls, is, aa); + + KERNEL_OPERATION_C(min_i, MIN(min_i, min_j - is + js), min_l, alpha, sa, aa, c, ldc, is, is, 0); + + KERNEL_OPERATION_C(min_i, is - js, min_l, alpha, sa, sb, c, ldc, is, js, 0); + + } else { + + ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa); + + KERNEL_OPERATION_C(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 0); + + } + + } + + + +#endif + } + } + + return 0; +} diff --git a/driver/level3/level3_syrk.c b/driver/level3/level3_syrk.c new file mode 100644 index 0000000..249c140 --- /dev/null +++ b/driver/level3/level3_syrk.c @@ -0,0 +1,495 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef KERNEL_OPERATION +#ifndef COMPLEX +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) +#else +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) +#endif +#endif + +#ifndef ICOPY_OPERATION +#ifndef TRANS +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef OCOPY_OPERATION +#ifdef TRANS +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef M +#define M args -> n +#endif + +#ifndef N +#define N args -> n +#endif + +#ifndef K +#define K args -> k +#endif + +#ifndef A +#define A args -> a +#endif + +#ifndef C +#define C args -> c +#endif + +#ifndef LDA +#define LDA args -> lda +#endif + +#ifndef LDC +#define LDC args -> ldc +#endif + +#ifdef TIMING +#define START_RPCC() rpcc_counter = rpcc() +#define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter +#else +#define START_RPCC() +#define STOP_RPCC(COUNTER) +#endif + +int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) { + + BLASLONG m_from, m_to, n_from, n_to, k, lda, ldc; + FLOAT *a, *c, *alpha, *beta; + + BLASLONG ls, is, js; + BLASLONG min_l, min_i, min_j; + BLASLONG jjs, min_jj; + BLASLONG m_start, m_end; + + int shared = ((GEMM_UNROLL_M == GEMM_UNROLL_N) && !HAVE_EX_L2); + + FLOAT *aa; + +#ifdef TIMING + unsigned long long rpcc_counter; + unsigned long long innercost = 0; + unsigned long long outercost = 0; + unsigned long long kernelcost = 0; + double total; +#endif + + k = K; + + a = (FLOAT *)A; + c = (FLOAT *)C; + + lda = LDA; + ldc = LDC; + + alpha = (FLOAT *)args -> alpha; + beta = (FLOAT *)args -> beta; + + m_from = 0; + m_to = M; + + if (range_m) { + m_from = *(((BLASLONG *)range_m) + 0); + m_to = *(((BLASLONG *)range_m) + 1); + } + + n_from = 0; + n_to = N; + + if (range_n) { + n_from = *(((BLASLONG *)range_n) + 0); + n_to = *(((BLASLONG *)range_n) + 1); + } + + if (beta) { +#if !defined(COMPLEX) || defined(HERK) + if (beta[0] != ONE) +#else + if ((beta[0] != ONE) || (beta[1] != ZERO)) +#endif + syrk_beta(m_from, m_to, n_from, n_to, beta, c, ldc); + } + + if ((k == 0) || (alpha == NULL)) return 0; + + if ((alpha[0] == ZERO) +#if defined(COMPLEX) && !defined(HERK) + && (alpha[1] == ZERO) +#endif + ) return 0; + +#if 0 + fprintf(stderr, "m_from : %ld m_to : %ld n_from : %ld n_to : %ld\n", + m_from, m_to, n_from, n_to); +#endif + + for(js = n_from; js < n_to; js += GEMM_R){ + min_j = n_to - js; + if (min_j > GEMM_R) min_j = GEMM_R; + +#ifndef LOWER + m_start = m_from; + m_end = js + min_j; + if (m_end > m_to) m_end = m_to; +#else + m_start = m_from; + m_end = m_to; + if (m_start < js) m_start = js; +#endif + + for(ls = 0; ls < k; ls += min_l){ + min_l = k - ls; + if (min_l >= GEMM_Q * 2) { + min_l = GEMM_Q; + } else + if (min_l > GEMM_Q) { + min_l = (min_l + 1) / 2; + } + + min_i = m_end - m_start; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + +#ifndef LOWER + + if (m_end >= js) { + + aa = sb + min_l * MAX(m_start - js, 0) * COMPSIZE; + if (!shared) aa = sa; + + for(jjs = MAX(m_start, js); jjs < js + min_j; jjs += min_jj){ + min_jj = js + min_j - jjs; + if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; + + if (!shared && (jjs - MAX(m_start, js) < min_i)) { + START_RPCC(); + + ICOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sa + min_l * (jjs - js) * COMPSIZE); + + STOP_RPCC(innercost); + } + + START_RPCC(); + + OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); + + STOP_RPCC(outercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_jj, min_l, alpha, aa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, MAX(m_start, js), jjs); + + STOP_RPCC(kernelcost); + } + + for(is = MAX(m_start, js) + min_i; is < m_end; is += min_i){ + min_i = m_end - is; + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + + aa = sb + min_l * (is - js) * COMPSIZE; + + if (!shared) { + + START_RPCC(); + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + STOP_RPCC(innercost); + + aa = sa; + } + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_j, min_l, alpha, aa, sb, c, ldc, is, js); + + STOP_RPCC(kernelcost); + + } + + } + + if (m_start < js) { + + if (m_end < js) { + + START_RPCC(); + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa); + + STOP_RPCC(innercost); + + for(jjs = js; jjs < js + min_j; jjs += GEMM_UNROLL_MN){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; + + START_RPCC(); + + OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); + + STOP_RPCC(outercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs); + + STOP_RPCC(kernelcost); + + } + } else { + min_i = 0; + } + + for(is = m_start + min_i; is < MIN(m_end, js); is += min_i){ + + min_i = MIN(m_end, js)- is; + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + + START_RPCC(); + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + STOP_RPCC(innercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js); + + STOP_RPCC(kernelcost); + + } + } + +#else + + if (m_start < js + min_j) { + + aa = sb + min_l * (m_start - js) * COMPSIZE; + + if (!shared) { + + START_RPCC(); + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa); + + STOP_RPCC(innercost); + + } + + START_RPCC(); + + OCOPY_OPERATION(min_l, (shared? (min_i) : MIN(min_i, min_j + js - m_start)), a, lda, ls, m_start, aa); + + STOP_RPCC(outercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, MIN(min_i, min_j + js - m_start), min_l, alpha, (shared? (aa) : (sa)), aa, c, ldc, m_start, m_start); + + STOP_RPCC(kernelcost); + + for(jjs = js; jjs < m_start; jjs += GEMM_UNROLL_N){ + min_jj = m_start - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + START_RPCC(); + + OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); + + STOP_RPCC(outercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_jj, min_l, alpha, (shared? (aa) : (sa)), sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs); + + STOP_RPCC(kernelcost); + + } + + for(is = m_start + min_i; is < m_end; is += min_i){ + + min_i = m_end - is; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + + if (is < js + min_j) { + + if (!shared) { + START_RPCC(); + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + STOP_RPCC(innercost); + } + + aa = sb + min_l * (is - js) * COMPSIZE; + + START_RPCC(); + + OCOPY_OPERATION(min_l, (shared? (min_i) : MIN(min_i, min_j - is + js)), a, lda, ls, is, aa); + + STOP_RPCC(outercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, MIN(min_i, min_j - is + js), min_l, alpha, (shared? (aa) : (sa)), aa, c, ldc, is, is); + + STOP_RPCC(kernelcost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, is - js, min_l, alpha, (shared? (aa) : (sa)), sb, c, ldc, is, js); + + STOP_RPCC(kernelcost); + + } else { + + START_RPCC(); + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + STOP_RPCC(innercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js); + + STOP_RPCC(kernelcost); + + } + + } + + } else { + + START_RPCC(); + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa); + + STOP_RPCC(innercost); + + for(jjs = js; jjs < min_j; jjs += GEMM_UNROLL_N){ + min_jj = min_j - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + START_RPCC(); + + OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); + + STOP_RPCC(outercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs); + + STOP_RPCC(kernelcost); + + } + + for(is = m_start + min_i; is < m_end; is += min_i){ + + min_i = m_end - is; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + + START_RPCC(); + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + STOP_RPCC(innercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js); + + STOP_RPCC(kernelcost); + + } + } +#endif + } + } + +#ifdef TIMING + total = (double)outercost + (double)innercost + (double)kernelcost; + + printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f kernel Effi. : %5.2f Total Effi. : %5.2f\n", + innercost / total * 100., outercost / total * 100., kernelcost / total * 100., + (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / (double)kernelcost * 100. * (double)COMPSIZE / (double)DNUMOPT, + (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / total * 100. * (double)COMPSIZE / (double)DNUMOPT); + +#endif + + return 0; +} diff --git a/driver/level3/level3_syrk_threaded.c b/driver/level3/level3_syrk_threaded.c new file mode 100644 index 0000000..9d1f4d2 --- /dev/null +++ b/driver/level3/level3_syrk_threaded.c @@ -0,0 +1,673 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef CACHE_LINE_SIZE +#define CACHE_LINE_SIZE 8 +#endif + +#ifndef DIVIDE_RATE +#define DIVIDE_RATE 2 +#endif + +#ifndef SWITCH_RATIO +#define SWITCH_RATIO 2 +#endif + +#ifndef SYRK_LOCAL +#if !defined(LOWER) && !defined(TRANS) +#define SYRK_LOCAL SYRK_UN +#elif !defined(LOWER) && defined(TRANS) +#define SYRK_LOCAL SYRK_UT +#elif defined(LOWER) && !defined(TRANS) +#define SYRK_LOCAL SYRK_LN +#else +#define SYRK_LOCAL SYRK_LT +#endif +#endif + +typedef struct { + volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; +} job_t; + + +#ifndef KERNEL_OPERATION +#ifndef COMPLEX +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) +#else +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) +#endif +#endif + +#ifndef ICOPY_OPERATION +#ifndef TRANS +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef OCOPY_OPERATION +#ifdef TRANS +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef A +#define A args -> a +#endif +#ifndef LDA +#define LDA args -> lda +#endif +#ifndef C +#define C args -> c +#endif +#ifndef LDC +#define LDC args -> ldc +#endif +#ifndef M +#define M args -> m +#endif +#ifndef N +#define N args -> n +#endif +#ifndef K +#define K args -> k +#endif + +#undef TIMING + +#ifdef TIMING +#define START_RPCC() rpcc_counter = rpcc() +#define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter +#else +#define START_RPCC() +#define STOP_RPCC(COUNTER) +#endif + +static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + + FLOAT *buffer[DIVIDE_RATE]; + + BLASLONG k, lda, ldc; + BLASLONG m_from, m_to, n_from, n_to; + + FLOAT *alpha, *beta; + FLOAT *a, *c; + job_t *job = (job_t *)args -> common; + BLASLONG xxx, bufferside; + + BLASLONG ls, min_l, jjs, min_jj; + BLASLONG is, min_i, div_n; + + BLASLONG i, current; +#ifdef LOWER + BLASLONG start_i; +#endif + +#ifdef TIMING + BLASLONG rpcc_counter; + BLASLONG copy_A = 0; + BLASLONG copy_B = 0; + BLASLONG kernel = 0; + BLASLONG waiting1 = 0; + BLASLONG waiting2 = 0; + BLASLONG waiting3 = 0; + BLASLONG waiting6[MAX_CPU_NUMBER]; + BLASLONG ops = 0; + + for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0; +#endif + + k = K; + + a = (FLOAT *)A; + c = (FLOAT *)C; + + lda = LDA; + ldc = LDC; + + alpha = (FLOAT *)args -> alpha; + beta = (FLOAT *)args -> beta; + + m_from = 0; + m_to = N; + + /* Global Range */ + n_from = 0; + n_to = N; + + if (range_n) { + m_from = range_n[mypos + 0]; + m_to = range_n[mypos + 1]; + + n_from = range_n[0]; + n_to = range_n[args -> nthreads]; + } + + if (beta) { +#if !defined(COMPLEX) || defined(HERK) + if (beta[0] != ONE) +#else + if ((beta[0] != ONE) || (beta[1] != ZERO)) +#endif + syrk_beta(m_from, m_to, n_from, n_to, beta, c, ldc); + } + + if ((k == 0) || (alpha == NULL)) return 0; + + if ((alpha[0] == ZERO) +#if defined(COMPLEX) && !defined(HERK) + && (alpha[1] == ZERO) +#endif + ) return 0; + +#if 0 + fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld\n", mypos, m_from, m_to, n_from, n_to); +#endif + + div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + + buffer[0] = sb; + for (i = 1; i < DIVIDE_RATE; i++) { + buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE; + } + + for(ls = 0; ls < k; ls += min_l){ + + min_l = k - ls; + if (min_l >= GEMM_Q * 2) { + min_l = GEMM_Q; + } else { + if (min_l > GEMM_Q) min_l = (min_l + 1) / 2; + } + + min_i = m_to - m_from; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else { + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + } + +#ifdef LOWER + xxx = (m_to - m_from - min_i) % GEMM_P; + + if (xxx) min_i -= GEMM_P - xxx; +#endif + + START_RPCC(); + +#ifndef LOWER + ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); +#else + ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_to - min_i, sa); +#endif + + STOP_RPCC(copy_A); + + div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + + for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) { + + START_RPCC(); + + /* Make sure if no one is using buffer */ +#ifndef LOWER + for (i = 0; i < mypos; i++) +#else + for (i = mypos + 1; i < args -> nthreads; i++) +#endif + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; + + STOP_RPCC(waiting1); + +#ifndef LOWER + + for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){ + + min_jj = MIN(m_to, xxx + div_n) - jjs; + + if (xxx == m_from) { + if (min_jj > min_i) min_jj = min_i; + } else { + if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; + } + + START_RPCC(); + + OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, + buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE); + + STOP_RPCC(copy_B); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_jj, min_l, alpha, + sa, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE, + c, ldc, m_from, jjs); + + STOP_RPCC(kernel); + +#ifdef TIMING + ops += 2 * min_i * min_jj * min_l; +#endif + + } + +#else + + for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){ + + min_jj = MIN(m_to, xxx + div_n) - jjs; + + if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; + + START_RPCC(); + + OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, + buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE); + + STOP_RPCC(copy_B); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_jj, min_l, alpha, + sa, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE, + c, ldc, m_to - min_i, jjs); + + STOP_RPCC(kernel); + +#ifdef TIMING + ops += 2 * min_i * min_jj * min_l; +#endif + + } + +#endif + +#ifndef LOWER + for (i = 0; i <= mypos; i++) +#else + for (i = mypos; i < args -> nthreads; i++) +#endif + job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; + + WMB; + } + + +#ifndef LOWER + current = mypos + 1; + while (current < args -> nthreads) { +#else + current = mypos - 1; + while (current >= 0) { +#endif + + div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + START_RPCC(); + + /* thread has to wait */ + while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; + + STOP_RPCC(waiting2); + + START_RPCC(); + +#ifndef LOWER + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, ldc, + m_from, + xxx); +#else + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, ldc, + m_to - min_i, + xxx); +#endif + + STOP_RPCC(kernel); +#ifdef TIMING + ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; +#endif + + if (m_to - m_from == min_i) { + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + } + } + +#ifndef LOWER + current ++; +#else + current --; +#endif + } + +#ifndef LOWER + for(is = m_from + min_i; is < m_to; is += min_i){ + min_i = m_to - is; +#else + start_i = min_i; + + for(is = m_from; is < m_to - start_i; is += min_i){ + min_i = m_to - start_i - is; +#endif + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = ((min_i + 1) / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + + START_RPCC(); + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + STOP_RPCC(copy_A); + + current = mypos; + + do { + + div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + START_RPCC(); + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, ldc, is, xxx); + + STOP_RPCC(kernel); + +#ifdef TIMING + ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; +#endif + +#ifndef LOWER + if (is + min_i >= m_to) { +#else + if (is + min_i >= m_to - start_i) { +#endif + /* Thread doesn't need this buffer any more */ + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + WMB; + } + } + +#ifndef LOWER + current ++; + } while (current != args -> nthreads); +#else + current --; + } while (current >= 0); +#endif + + + } + } + + START_RPCC(); + + for (i = 0; i < args -> nthreads; i++) { + if (i != mypos) { + for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { + while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; + } + } + } + + STOP_RPCC(waiting3); + +#ifdef TIMING + BLASLONG waiting = waiting1 + waiting2 + waiting3; + BLASLONG total = copy_A + copy_B + kernel + waiting; + + fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2f Copy_B : %6.2f Wait1 : %6.2f Wait2 : %6.2f Wait3 : %6.2f Kernel : %6.2f", + mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100., + (double)waiting1 /(double)total * 100., + (double)waiting2 /(double)total * 100., + (double)waiting3 /(double)total * 100., + (double)ops/(double)kernel / 4. * 100.); + +#if 0 + fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2ld Copy_B : %6.2ld Wait : %6.2ld\n", + mypos, copy_A, copy_B, waiting); + + fprintf(stderr, "Waiting[%2ld] %6.2f %6.2f %6.2f\n", + mypos, + (double)waiting1/(double)waiting * 100., + (double)waiting2/(double)waiting * 100., + (double)waiting3/(double)waiting * 100.); +#endif + fprintf(stderr, "\n"); +#endif + + return 0; +} + +int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + + blas_arg_t newarg; + + job_t job[MAX_CPU_NUMBER]; + blas_queue_t queue[MAX_CPU_NUMBER]; + + BLASLONG range[MAX_CPU_NUMBER + 100]; + + BLASLONG num_cpu; + + BLASLONG nthreads = args -> nthreads; + + BLASLONG width, i, j, k; + BLASLONG n, n_from, n_to; + int mode, mask; + double dnum; + + if ((nthreads == 1) || (args -> n < nthreads * SWITCH_RATIO)) { + SYRK_LOCAL(args, range_m, range_n, sa, sb, 0); + return 0; + } + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; + mask = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; + mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; +#else + mode = BLAS_SINGLE | BLAS_REAL; + mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; + mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; + mask = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; + mask = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1; +#endif +#endif + + newarg.m = args -> m; + newarg.n = args -> n; + newarg.k = args -> k; + newarg.a = args -> a; + newarg.b = args -> b; + newarg.c = args -> c; + newarg.lda = args -> lda; + newarg.ldb = args -> ldb; + newarg.ldc = args -> ldc; + newarg.alpha = args -> alpha; + newarg.beta = args -> beta; + newarg.common = (void *)job; + + if (!range_n) { + n_from = 0; + n_to = args -> n; + } else { + n_from = range_n[0]; + n_to = range_n[1] - range_n[0]; + } + +#ifndef LOWER + + range[MAX_CPU_NUMBER] = n_to - n_from; + range[0] = 0; + num_cpu = 0; + i = 0; + n = n_to - n_from; + + dnum = (double)n * (double)n /(double)nthreads; + + while (i < n){ + + if (nthreads - num_cpu > 1) { + + double di = (double)i; + + width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask); + + if (num_cpu == 0) width = n - ((n - width) & ~mask); + + if ((width > n - i) || (width < mask)) width = n - i; + + } else { + width = n - i; + } + + range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = inner_thread; + queue[num_cpu].args = &newarg; + queue[num_cpu].range_m = range_m; + + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + + for (i = 0; i < num_cpu; i ++) queue[i].range_n = &range[MAX_CPU_NUMBER - num_cpu]; + +#else + + range[0] = 0; + num_cpu = 0; + i = 0; + n = n_to - n_from; + + dnum = (double)n * (double)n /(double)nthreads; + + while (i < n){ + + if (nthreads - num_cpu > 1) { + + double di = (double)i; + + width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask); + + if ((width > n - i) || (width < mask)) width = n - i; + + } else { + width = n - i; + } + + range[num_cpu + 1] = range[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = inner_thread; + queue[num_cpu].args = &newarg; + queue[num_cpu].range_m = range_m; + queue[num_cpu].range_n = range; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#endif + + newarg.nthreads = num_cpu; + + if (num_cpu) { + + for (j = 0; j < num_cpu; j++) { + for (i = 0; i < num_cpu; i++) { + for (k = 0; k < DIVIDE_RATE; k++) { + job[j].working[i][CACHE_LINE_SIZE * k] = 0; + } + } + } + + queue[0].sa = sa; + queue[0].sb = sb; + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + + return 0; +} diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c new file mode 100644 index 0000000..000d423 --- /dev/null +++ b/driver/level3/level3_thread.c @@ -0,0 +1,743 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef CACHE_LINE_SIZE +#define CACHE_LINE_SIZE 8 +#endif + +#ifndef DIVIDE_RATE +#define DIVIDE_RATE 2 +#endif + +#ifndef SWITCH_RATIO +#define SWITCH_RATIO 2 +#endif + +#ifndef GEMM_LOCAL +#if defined(NN) +#define GEMM_LOCAL GEMM_NN +#elif defined(NT) +#define GEMM_LOCAL GEMM_NT +#elif defined(NR) +#define GEMM_LOCAL GEMM_NR +#elif defined(NC) +#define GEMM_LOCAL GEMM_NC +#elif defined(TN) +#define GEMM_LOCAL GEMM_TN +#elif defined(TT) +#define GEMM_LOCAL GEMM_TT +#elif defined(TR) +#define GEMM_LOCAL GEMM_TR +#elif defined(TC) +#define GEMM_LOCAL GEMM_TC +#elif defined(RN) +#define GEMM_LOCAL GEMM_RN +#elif defined(RT) +#define GEMM_LOCAL GEMM_RT +#elif defined(RR) +#define GEMM_LOCAL GEMM_RR +#elif defined(RC) +#define GEMM_LOCAL GEMM_RC +#elif defined(CN) +#define GEMM_LOCAL GEMM_CN +#elif defined(CT) +#define GEMM_LOCAL GEMM_CT +#elif defined(CR) +#define GEMM_LOCAL GEMM_CR +#elif defined(CC) +#define GEMM_LOCAL GEMM_CC +#endif +#endif + +typedef struct { + volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; +} job_t; + + +#ifndef BETA_OPERATION +#ifndef COMPLEX +#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ + GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ + BETA[0], NULL, 0, NULL, 0, \ + (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC) +#else +#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ + GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ + BETA[0], BETA[1], NULL, 0, NULL, 0, \ + (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC) +#endif +#endif + +#ifndef ICOPY_OPERATION +#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ + defined(RN) || defined(RT) || defined(RC) || defined(RR) +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef OCOPY_OPERATION +#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ + defined(NR) || defined(TR) || defined(CR) || defined(RR) +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef KERNEL_FUNC +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define KERNEL_FUNC GEMM_KERNEL_N +#endif +#if defined(CN) || defined(CT) || defined(RN) || defined(RT) +#define KERNEL_FUNC GEMM_KERNEL_L +#endif +#if defined(NC) || defined(TC) || defined(NR) || defined(TR) +#define KERNEL_FUNC GEMM_KERNEL_R +#endif +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +#define KERNEL_FUNC GEMM_KERNEL_B +#endif +#endif + +#ifndef KERNEL_OPERATION +#ifndef COMPLEX +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) +#else +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) +#endif +#endif + +#ifndef FUSED_KERNEL_OPERATION +#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ + defined(NR) || defined(TR) || defined(CR) || defined(RR) +#ifndef COMPLEX +#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \ + FUSED_GEMM_KERNEL_N(M, N, K, ALPHA[0], SA, SB, \ + (FLOAT *)(B) + ((L) + (J) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) +#else +#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \ + FUSED_GEMM_KERNEL_N(M, N, K, ALPHA[0], ALPHA[1], SA, SB, \ + (FLOAT *)(B) + ((L) + (J) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) + +#endif +#else +#ifndef COMPLEX +#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \ + FUSED_GEMM_KERNEL_T(M, N, K, ALPHA[0], SA, SB, \ + (FLOAT *)(B) + ((J) + (L) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) +#else +#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \ + FUSED_GEMM_KERNEL_T(M, N, K, ALPHA[0], ALPHA[1], SA, SB, \ + (FLOAT *)(B) + ((J) + (L) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) +#endif +#endif +#endif + +#ifndef A +#define A args -> a +#endif +#ifndef LDA +#define LDA args -> lda +#endif +#ifndef B +#define B args -> b +#endif +#ifndef LDB +#define LDB args -> ldb +#endif +#ifndef C +#define C args -> c +#endif +#ifndef LDC +#define LDC args -> ldc +#endif +#ifndef M +#define M args -> m +#endif +#ifndef N +#define N args -> n +#endif +#ifndef K +#define K args -> k +#endif + +#ifdef TIMING +#define START_RPCC() rpcc_counter = rpcc() +#define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter +#else +#define START_RPCC() +#define STOP_RPCC(COUNTER) +#endif + +static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + + FLOAT *buffer[DIVIDE_RATE]; + + BLASLONG k, lda, ldb, ldc; + BLASLONG m_from, m_to, n_from, n_to, N_from, N_to; + + FLOAT *alpha, *beta; + FLOAT *a, *b, *c; + job_t *job = (job_t *)args -> common; + BLASLONG xxx, bufferside; + + BLASLONG ls, min_l, jjs, min_jj; + BLASLONG is, min_i, div_n; + + BLASLONG i, current; + BLASLONG l1stride, l2size; + +#ifdef TIMING + BLASULONG rpcc_counter; + BLASULONG copy_A = 0; + BLASULONG copy_B = 0; + BLASULONG kernel = 0; + BLASULONG waiting1 = 0; + BLASULONG waiting2 = 0; + BLASULONG waiting3 = 0; + BLASULONG waiting6[MAX_CPU_NUMBER]; + BLASULONG ops = 0; + + for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0; +#endif + + k = K; + + a = (FLOAT *)A; + b = (FLOAT *)B; + c = (FLOAT *)C; + + lda = LDA; + ldb = LDB; + ldc = LDC; + + alpha = (FLOAT *)args -> alpha; + beta = (FLOAT *)args -> beta; + + m_from = 0; + m_to = M; + + if (range_m) { + m_from = range_m[0]; + m_to = range_m[1]; + } + + n_from = 0; + n_to = N; + + N_from = 0; + N_to = N; + + if (range_n) { + n_from = range_n[mypos + 0]; + n_to = range_n[mypos + 1]; + + N_from = range_n[0]; + N_to = range_n[args -> nthreads]; + } + + if (beta) { +#ifndef COMPLEX + if (beta[0] != ONE) +#else + if ((beta[0] != ONE) || (beta[1] != ZERO)) +#endif + BETA_OPERATION(m_from, m_to, N_from, N_to, beta, c, ldc); + } + + if ((k == 0) || (alpha == NULL)) return 0; + + if ((alpha[0] == ZERO) +#ifdef COMPLEX + && (alpha[1] == ZERO) +#endif + ) return 0; + + l2size = GEMM_P * GEMM_Q; + +#if 0 + fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld N_from : %ld N_to : %ld\n", + mypos, m_from, m_to, n_from, n_to, N_from, N_to); + + fprintf(stderr, "GEMM: P = %4ld Q = %4ld R = %4ld\n", (BLASLONG)GEMM_P, (BLASLONG)GEMM_Q, (BLASLONG)GEMM_R); + +#endif + + div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; + + buffer[0] = sb; + for (i = 1; i < DIVIDE_RATE; i++) { + buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1)) * COMPSIZE; + } + + + for(ls = 0; ls < k; ls += min_l){ + + min_l = k - ls; + + if (min_l >= GEMM_Q * 2) { + min_l = GEMM_Q; + } else { + if (min_l > GEMM_Q) min_l = (min_l + 1) / 2; + } + + l1stride = 1; + min_i = m_to - m_from; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else { + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1); + } else { + if (args -> nthreads == 1) l1stride = 0; + } + } + + START_RPCC(); + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); + + STOP_RPCC(copy_A); + + div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; + + for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) { + + START_RPCC(); + + /* Make sure if no one is using buffer */ + for (i = 0; i < args -> nthreads; i++) + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; + + STOP_RPCC(waiting1); + +#if defined(FUSED_GEMM) && !defined(TIMING) + + FUSED_KERNEL_OPERATION(min_i, MIN(n_to, xxx + div_n) - xxx, min_l, alpha, + sa, buffer[bufferside], b, ldb, c, ldc, m_from, xxx, ls); + +#else + + for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ + min_jj = MIN(n_to, xxx + div_n) - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + START_RPCC(); + + OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, + buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE * l1stride); + + STOP_RPCC(copy_B); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_jj, min_l, alpha, + sa, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE * l1stride, + c, ldc, m_from, jjs); + + STOP_RPCC(kernel); + +#ifdef TIMING + ops += 2 * min_i * min_jj * min_l; +#endif + + } +#endif + + for (i = 0; i < args -> nthreads; i++) job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; + WMB; + } + + current = mypos; + + do { + current ++; + if (current >= args -> nthreads) current = 0; + + div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + if (current != mypos) { + + START_RPCC(); + + /* thread has to wait */ + while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; + + STOP_RPCC(waiting2); + + START_RPCC(); + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, ldc, m_from, xxx); + + STOP_RPCC(kernel); +#ifdef TIMING + ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; +#endif + } + + if (m_to - m_from == min_i) { + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + } + } + } while (current != mypos); + + + for(is = m_from + min_i; is < m_to; is += min_i){ + min_i = m_to - is; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = ((min_i + 1) / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1); + } + + START_RPCC(); + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + STOP_RPCC(copy_A); + + current = mypos; + do { + + div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + START_RPCC(); + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, ldc, is, xxx); + + STOP_RPCC(kernel); + +#ifdef TIMING + ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; +#endif + + if (is + min_i >= m_to) { + /* Thread doesn't need this buffer any more */ + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + WMB; + } + } + + current ++; + if (current >= args -> nthreads) current = 0; + + } while (current != mypos); + + } + + } + + START_RPCC(); + + for (i = 0; i < args -> nthreads; i++) { + for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { + while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; + } + } + + STOP_RPCC(waiting3); + +#ifdef TIMING + BLASLONG waiting = waiting1 + waiting2 + waiting3; + BLASLONG total = copy_A + copy_B + kernel + waiting; + + fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2f Copy_B : %6.2f Wait1 : %6.2f Wait2 : %6.2f Wait3 : %6.2f Kernel : %6.2f", + mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100., + (double)waiting1 /(double)total * 100., + (double)waiting2 /(double)total * 100., + (double)waiting3 /(double)total * 100., + (double)ops/(double)kernel / 4. * 100.); + +#if 0 + fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2ld Copy_B : %6.2ld Wait : %6.2ld\n", + mypos, copy_A, copy_B, waiting); + + fprintf(stderr, "Waiting[%2ld] %6.2f %6.2f %6.2f\n", + mypos, + (double)waiting1/(double)waiting * 100., + (double)waiting2/(double)waiting * 100., + (double)waiting3/(double)waiting * 100.); +#endif + fprintf(stderr, "\n"); +#endif + + return 0; +} + +static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG + *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + + blas_arg_t newarg; + + job_t job[MAX_CPU_NUMBER]; + blas_queue_t queue[MAX_CPU_NUMBER]; + + BLASLONG range_M[MAX_CPU_NUMBER + 1]; + BLASLONG range_N[MAX_CPU_NUMBER + 1]; + + BLASLONG num_cpu_m, num_cpu_n; + + BLASLONG nthreads = args -> nthreads; + + BLASLONG width, i, j, k, js; + BLASLONG m, n, n_from, n_to; + int mode; + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL | BLAS_NODE; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL | BLAS_NODE; +#else + mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX | BLAS_NODE; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX | BLAS_NODE; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX | BLAS_NODE; +#endif +#endif + + newarg.m = args -> m; + newarg.n = args -> n; + newarg.k = args -> k; + newarg.a = args -> a; + newarg.b = args -> b; + newarg.c = args -> c; + newarg.lda = args -> lda; + newarg.ldb = args -> ldb; + newarg.ldc = args -> ldc; + newarg.alpha = args -> alpha; + newarg.beta = args -> beta; + newarg.nthreads = args -> nthreads; + newarg.common = (void *)job; + +#ifdef PARAMTEST + newarg.gemm_p = args -> gemm_p; + newarg.gemm_q = args -> gemm_q; + newarg.gemm_r = args -> gemm_r; +#endif + + if (!range_m) { + range_M[0] = 0; + m = args -> m; + } else { + range_M[0] = range_m[0]; + m = range_m[1] - range_m[0]; + } + + num_cpu_m = 0; + + while (m > 0){ + + width = blas_quickdivide(m + nthreads - num_cpu_m - 1, nthreads - num_cpu_m); + + m -= width; + if (m < 0) width = width + m; + + range_M[num_cpu_m + 1] = range_M[num_cpu_m] + width; + + num_cpu_m ++; + } + + for (i = 0; i < num_cpu_m; i++) { + queue[i].mode = mode; + queue[i].routine = inner_thread; + queue[i].args = &newarg; + queue[i].range_m = &range_M[i]; + queue[i].range_n = &range_N[0]; + queue[i].sa = NULL; + queue[i].sb = NULL; + queue[i].next = &queue[i + 1]; + } + + queue[0].sa = sa; + queue[0].sb = sb; + + if (!range_n) { + n_from = 0; + n_to = args -> n; + } else { + n_from = range_n[0]; + n_to = range_n[1]; + } + + for(js = n_from; js < n_to; js += GEMM_R * nthreads){ + n = n_to - js; + if (n > GEMM_R * nthreads) n = GEMM_R * nthreads; + + range_N[0] = js; + + num_cpu_n = 0; + + while (n > 0){ + + width = blas_quickdivide(n + nthreads - num_cpu_n - 1, nthreads - num_cpu_n); + + n -= width; + if (n < 0) width = width + n; + + range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width; + + num_cpu_n ++; + } + + for (j = 0; j < num_cpu_m; j++) { + for (i = 0; i < num_cpu_m; i++) { + for (k = 0; k < DIVIDE_RATE; k++) { + job[j].working[i][CACHE_LINE_SIZE * k] = 0; + } + } + } + + queue[num_cpu_m - 1].next = NULL; + + exec_blas(num_cpu_m, queue); + } + + return 0; +} + +int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + + BLASLONG m = args -> m; + BLASLONG n = args -> n; + BLASLONG nthreads = args -> nthreads; + BLASLONG divN, divT; + int mode; + + if (nthreads == 1) { + GEMM_LOCAL(args, range_m, range_n, sa, sb, 0); + return 0; + } + + if (range_m) { + BLASLONG m_from = *(((BLASLONG *)range_m) + 0); + BLASLONG m_to = *(((BLASLONG *)range_m) + 1); + + m = m_to - m_from; + } + + if (range_n) { + BLASLONG n_from = *(((BLASLONG *)range_n) + 0); + BLASLONG n_to = *(((BLASLONG *)range_n) + 1); + + n = n_to - n_from; + } + + if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) { + GEMM_LOCAL(args, range_m, range_n, sa, sb, 0); + return 0; + } + + divT = nthreads; + divN = 1; + +#if 0 + while ((GEMM_P * divT > m * SWITCH_RATIO) && (divT > 1)) { + do { + divT --; + divN = 1; + while (divT * divN < nthreads) divN ++; + } while ((divT * divN != nthreads) && (divT > 1)); + } +#endif + + // fprintf(stderr, "divN = %4ld divT = %4ld\n", divN, divT); + + args -> nthreads = divT; + + if (divN == 1){ + + gemm_driver(args, range_m, range_n, sa, sb, 0); + } else { +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif + +#if defined(TN) || defined(TT) || defined(TR) || defined(TC) || \ + defined(CN) || defined(CT) || defined(CR) || defined(CC) + mode |= (BLAS_TRANSA_T); +#endif +#if defined(NT) || defined(TT) || defined(RT) || defined(CT) || \ + defined(NC) || defined(TC) || defined(RC) || defined(CC) + mode |= (BLAS_TRANSB_T); +#endif + +#ifdef OS_WINDOWS + gemm_thread_n(mode, args, range_m, range_n, GEMM_LOCAL, sa, sb, divN); +#else + gemm_thread_n(mode, args, range_m, range_n, gemm_driver, sa, sb, divN); +#endif + + } + + return 0; +} diff --git a/driver/level3/symm3m_k.c b/driver/level3/symm3m_k.c new file mode 100644 index 0000000..764c2ff --- /dev/null +++ b/driver/level3/symm3m_k.c @@ -0,0 +1,100 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#undef TIMING + +#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ + GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ + BETA[0], BETA[1], NULL, 0, NULL, 0, \ + (FLOAT *)(C) + (M_FROM) + (N_FROM) * (LDC) * COMPSIZE, LDC) + +#ifndef RSIDE +#ifndef LOWER +#define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM3M_IUCOPYB(M, N, A, LDA, Y, X, BUFFER) +#define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM3M_IUCOPYR(M, N, A, LDA, Y, X, BUFFER) +#define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM3M_IUCOPYI(M, N, A, LDA, Y, X, BUFFER) +#else +#define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM3M_ILCOPYB(M, N, A, LDA, Y, X, BUFFER) +#define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM3M_ILCOPYR(M, N, A, LDA, Y, X, BUFFER) +#define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM3M_ILCOPYI(M, N, A, LDA, Y, X, BUFFER) +#endif +#endif + +#ifdef RSIDE +#ifndef LOWER +#define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + SYMM3M_OUCOPYB(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) +#define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + SYMM3M_OUCOPYR(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) +#define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + SYMM3M_OUCOPYI(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) +#else +#define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + SYMM3M_OLCOPYB(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) +#define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + SYMM3M_OLCOPYR(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) +#define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + SYMM3M_OLCOPYI(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) +#endif +#endif + +#ifndef RSIDE +#define K args -> m +#ifndef LOWER +#define GEMM3M_LOCAL SYMM3M_LU +#else +#define GEMM3M_LOCAL SYMM3M_LL +#endif +#else +#define K args -> n +#ifndef LOWER +#define GEMM3M_LOCAL SYMM3M_RU +#else +#define GEMM3M_LOCAL SYMM3M_RL +#endif +#endif + +#ifdef THREADED_LEVEL3 +#include "level3_gemm3m_thread.c" +#else +#include "gemm3m_level3.c" +#endif + diff --git a/driver/level3/symm_k.c b/driver/level3/symm_k.c new file mode 100644 index 0000000..567896a --- /dev/null +++ b/driver/level3/symm_k.c @@ -0,0 +1,80 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#undef TIMING + +#ifndef RSIDE +#ifndef LOWER +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM_IUTCOPY(M, N, A, LDA, Y, X, BUFFER); +#else +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM_ILTCOPY(M, N, A, LDA, Y, X, BUFFER); +#endif +#endif + +#ifdef RSIDE +#ifndef LOWER +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM_OUTCOPY(M, N, A, LDA, Y, X, BUFFER); +#else +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM_OLTCOPY(M, N, A, LDA, Y, X, BUFFER); +#endif +#endif + +#ifndef RSIDE +#define K args -> m +#ifndef LOWER +#define GEMM_LOCAL SYMM_LU +#else +#define GEMM_LOCAL SYMM_LL +#endif +#else +#define K args -> n +#ifndef LOWER +#define GEMM_LOCAL SYMM_RU +#else +#define GEMM_LOCAL SYMM_RL +#endif +#endif + +#ifdef THREADED_LEVEL3 +#include "level3_thread.c" +#else +#include "level3.c" +#endif diff --git a/driver/level3/syr2k_k.c b/driver/level3/syr2k_k.c new file mode 100644 index 0000000..01251d4 --- /dev/null +++ b/driver/level3/syr2k_k.c @@ -0,0 +1,103 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef KERNEL_FUNC +#ifndef LOWER +#define KERNEL_FUNC SYR2K_KERNEL_U +#else +#define KERNEL_FUNC SYR2K_KERNEL_L +#endif +#endif + +static inline int syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLASLONG n_to, FLOAT *alpha, FLOAT *c, BLASLONG ldc) { + + BLASLONG i; + +#ifndef LOWER + if (m_from > n_from) n_from = m_from; + if (m_to > n_to ) m_to = n_to; +#else + if (m_from < n_from) m_from = n_from; + if (m_to < n_to ) n_to = m_to; +#endif + + c += (m_from + n_from * ldc) * COMPSIZE; + + m_to -= m_from; + n_to -= n_from; + + for (i = 0; i < n_to; i++){ + +#ifndef LOWER + + SCAL_K(MIN(i + n_from - m_from + 1, m_to), 0, 0, alpha[0], +#ifdef COMPLEX + alpha[1], +#endif + c, 1, NULL, 0, NULL, 0); + + c += ldc * COMPSIZE; + +#else + + SCAL_K(MIN(m_to - i + m_from - n_from, m_to), 0, 0, alpha[0], +#ifdef COMPLEX + alpha[1], +#endif + c, 1, NULL, 0, NULL, 0); + + if (i < m_from - n_from) { + c += ldc * COMPSIZE; + } else { + c += (1 + ldc) * COMPSIZE; + } +#endif + + } + + return 0; +} + +#ifdef THREADED_LEVEL3 +#include "level3_syr2k_threaded.c" +#else +#include "level3_syr2k.c" +#endif diff --git a/driver/level3/syr2k_kernel.c b/driver/level3/syr2k_kernel.c new file mode 100644 index 0000000..8c476f5 --- /dev/null +++ b/driver/level3/syr2k_kernel.c @@ -0,0 +1,217 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, +#ifdef COMPLEX + FLOAT alpha_i, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset, int flag){ + + BLASLONG i, j; + BLASLONG loop; + FLOAT subbuffer[GEMM_UNROLL_MN * GEMM_UNROLL_MN * COMPSIZE]; + + if (m + offset < 0) { +#ifndef LOWER + GEMM_KERNEL_N(m, n, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b, c, ldc); +#endif + return 0; + } + + if (n < offset) { +#ifdef LOWER + GEMM_KERNEL_N(m, n, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b, c, ldc); +#endif + return 0; + } + + + if (offset > 0) { +#ifdef LOWER + GEMM_KERNEL_N(m, offset, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b, c, ldc); +#endif + b += offset * k * COMPSIZE; + c += offset * ldc * COMPSIZE; + n -= offset; + offset = 0; + + if (n <= 0) return 0; + } + + if (n > m + offset) { +#ifndef LOWER + GEMM_KERNEL_N(m, n - m - offset, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, + b + (m + offset) * k * COMPSIZE, + c + (m + offset) * ldc * COMPSIZE, ldc); +#endif + + n = m + offset; + if (n <= 0) return 0; + } + + + if (offset < 0) { +#ifndef LOWER + GEMM_KERNEL_N(-offset, n, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b, c, ldc); +#endif + a -= offset * k * COMPSIZE; + c -= offset * COMPSIZE; + m += offset; + offset = 0; + + if (m <= 0) return 0; + } + + if (m > n - offset) { +#ifdef LOWER + GEMM_KERNEL_N(m - n + offset, n, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a + (n - offset) * k * COMPSIZE, + b, + c + (n - offset) * COMPSIZE, ldc); +#endif + m = n + offset; + if (m <= 0) return 0; + } + + for (loop = 0; loop < n; loop += GEMM_UNROLL_MN) { + + int mm, nn; + + mm = (loop & ~(GEMM_UNROLL_MN - 1)); + nn = MIN(GEMM_UNROLL_MN, n - loop); + +#ifndef LOWER + GEMM_KERNEL_N(mm, nn, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc); +#endif + + if (flag) { + GEMM_BETA(nn, nn, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + NULL, 0, NULL, 0, subbuffer, nn); + + GEMM_KERNEL_N(nn, nn, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn); + +#ifndef LOWER + + for (j = 0; j < nn; j ++) { + for (i = 0; i <= j; i ++) { +#ifndef COMPLEX + c[i + loop + (j + loop) * ldc] += + subbuffer[i + j * nn] + subbuffer[j + i * nn]; +#else + c[(i + loop + (j + loop) * ldc) * 2 + 0] += + subbuffer[(i + j * nn) * 2 + 0] + subbuffer[(j + i * nn) * 2 + 0]; + c[(i + loop + (j + loop) * ldc) * 2 + 1] += + subbuffer[(i + j * nn) * 2 + 1] + subbuffer[(j + i * nn) * 2 + 1]; +#endif + } + } +#else + for (j = 0; j < nn; j ++) { + for (i = j; i < nn; i ++) { +#ifndef COMPLEX + c[i + loop + (j + loop) * ldc] += + subbuffer[i + j * nn] + subbuffer[j + i * nn]; +#else + c[(i + loop + (j + loop) * ldc) * 2 + 0] += + subbuffer[(i + j * nn) * 2 + 0] + subbuffer[(j + i * nn) * 2 + 0]; + c[(i + loop + (j + loop) * ldc) * 2 + 1] += + subbuffer[(i + j * nn) * 2 + 1] + subbuffer[(j + i * nn) * 2 + 1]; +#endif + } + } +#endif + } + +#ifdef LOWER + GEMM_KERNEL_N(m - mm - nn, nn, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE, + c + (mm + nn + loop * ldc) * COMPSIZE, ldc); +#endif + } + + return 0; +} diff --git a/driver/level3/syrk_k.c b/driver/level3/syrk_k.c new file mode 100644 index 0000000..9c9700e --- /dev/null +++ b/driver/level3/syrk_k.c @@ -0,0 +1,105 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#undef TIMING + +#ifndef KERNEL_FUNC +#ifndef LOWER +#define KERNEL_FUNC SYRK_KERNEL_U +#else +#define KERNEL_FUNC SYRK_KERNEL_L +#endif +#endif + +static inline int syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLASLONG n_to, FLOAT *alpha, FLOAT *c, BLASLONG ldc) { + + BLASLONG i; + +#ifndef LOWER + if (m_from > n_from) n_from = m_from; + if (m_to > n_to ) m_to = n_to; +#else + if (m_from < n_from) m_from = n_from; + if (m_to < n_to ) n_to = m_to; +#endif + + c += (m_from + n_from * ldc) * COMPSIZE; + + m_to -= m_from; + n_to -= n_from; + + for (i = 0; i < n_to; i++){ + +#ifndef LOWER + + SCAL_K(MIN(i + n_from - m_from + 1, m_to), 0, 0, alpha[0], +#ifdef COMPLEX + alpha[1], +#endif + c, 1, NULL, 0, NULL, 0); + + c += ldc * COMPSIZE; + +#else + + SCAL_K(MIN(m_to - i + m_from - n_from, m_to), 0, 0, alpha[0], +#ifdef COMPLEX + alpha[1], +#endif + c, 1, NULL, 0, NULL, 0); + + if (i < m_from - n_from) { + c += ldc * COMPSIZE; + } else { + c += (1 + ldc) * COMPSIZE; + } +#endif + + } + + return 0; +} + +#ifdef THREADED_LEVEL3 +#include "level3_syrk_threaded.c" +#else +#include "level3_syrk.c" +#endif diff --git a/driver/level3/syrk_kernel.c b/driver/level3/syrk_kernel.c new file mode 100644 index 0000000..65d108a --- /dev/null +++ b/driver/level3/syrk_kernel.c @@ -0,0 +1,230 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef CONJA +#ifndef CONJB +#define GEMM_KERNEL GEMM_KERNEL_N +#else +#define GEMM_KERNEL GEMM_KERNEL_R +#endif +#else +#ifndef CONJB +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_B +#endif +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, +#ifdef COMPLEX + FLOAT alpha_i, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + BLASLONG loop; + FLOAT *cc, *ss; + FLOAT subbuffer[GEMM_UNROLL_MN * (GEMM_UNROLL_MN + 1) * COMPSIZE]; + + if (m + offset < 0) { +#ifndef LOWER + GEMM_KERNEL(m, n, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b, c, ldc); +#endif + return 0; + } + + if (n < offset) { +#ifdef LOWER + GEMM_KERNEL(m, n, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b, c, ldc); +#endif + return 0; + } + + if (offset > 0) { +#ifdef LOWER + GEMM_KERNEL(m, offset, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b, c, ldc); +#endif + b += offset * k * COMPSIZE; + c += offset * ldc * COMPSIZE; + n -= offset; + offset = 0; + + if (n <= 0) return 0; + } + + if (n > m + offset) { +#ifndef LOWER + GEMM_KERNEL(m, n - m - offset, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, + b + (m + offset) * k * COMPSIZE, + c + (m + offset) * ldc * COMPSIZE, ldc); +#endif + + n = m + offset; + if (n <= 0) return 0; + } + + if (offset < 0) { +#ifndef LOWER + GEMM_KERNEL(-offset, n, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b, c, ldc); +#endif + a -= offset * k * COMPSIZE; + c -= offset * COMPSIZE; + m += offset; + offset = 0; + + if (m <= 0) return 0; + } + + if (m > n - offset) { +#ifdef LOWER + GEMM_KERNEL(m - n + offset, n, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a + (n - offset) * k * COMPSIZE, + b, + c + (n - offset) * COMPSIZE, ldc); +#endif + m = n + offset; + + if (m <= 0) return 0; + } + + for (loop = 0; loop < n; loop += GEMM_UNROLL_MN) { + + int mm, nn; + + mm = (loop & ~(GEMM_UNROLL_MN - 1)); + nn = MIN(GEMM_UNROLL_MN, n - loop); + +#ifndef LOWER + GEMM_KERNEL(mm, nn, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc); +#endif + + GEMM_BETA(nn, nn, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + NULL, 0, NULL, 0, subbuffer, nn); + + GEMM_KERNEL(nn, nn, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn); + + cc = c + (loop + loop * ldc) * COMPSIZE; + ss = subbuffer; + +#ifndef LOWER + for (j = 0; j < nn; j ++) { + for (i = 0; i <= j; i ++) { +#ifndef COMPLEX + cc[i] += ss[i]; +#else + cc[i * 2 + 0] += ss[i * 2 + 0]; + cc[i * 2 + 1] += ss[i * 2 + 1]; +#endif + } + ss += nn * COMPSIZE; + cc += ldc * COMPSIZE; + } +#else + for (j = 0; j < nn; j ++) { + for (i = j; i < nn; i ++) { +#ifndef COMPLEX + cc[i] += ss[i]; +#else + cc[i * 2 + 0] += ss[i * 2 + 0]; + cc[i * 2 + 1] += ss[i * 2 + 1]; +#endif + } + ss += nn * COMPSIZE; + cc += ldc * COMPSIZE; + } +#endif + +#ifdef LOWER + GEMM_KERNEL(m - mm - nn, nn, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE, + c + (mm + nn + loop * ldc) * COMPSIZE, ldc); +#endif + + } + + return 0; +} diff --git a/driver/level3/syrk_thread.c b/driver/level3/syrk_thread.c new file mode 100644 index 0000000..837670b --- /dev/null +++ b/driver/level3/syrk_thread.c @@ -0,0 +1,186 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include +#include "common.h" + +int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) { + + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range[MAX_CPU_NUMBER + 1]; + + BLASLONG width, i; + BLASLONG n_from, n_to; + double dnum, nf, nt, di; + + int num_cpu; + int mask = 0; + + if (!(mode & BLAS_COMPLEX)) { + + switch (mode & BLAS_PREC) { + case BLAS_SINGLE: + mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; + break; + case BLAS_DOUBLE: + mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; + break; +#ifdef EXPRECISION + case BLAS_XDOUBLE: + mask = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1; + break; +#endif + } + } else { + switch (mode & BLAS_PREC) { + case BLAS_SINGLE: + mask = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1; + break; + case BLAS_DOUBLE: + mask = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1; + break; +#ifdef EXPRECISION + case BLAS_XDOUBLE: + mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1; + break; +#endif + } + } + + n_from = 0; + n_to = arg -> n; + + if (range_n) { + n_from = *(range_n + 0); + n_to = *(range_n + 1); + } + + if (!(mode & BLAS_UPLO)) { + + nf = (double)(n_from); + nt = (double)(n_to); + + dnum = (nt * nt - nf * nf) / (double)nthreads; + + num_cpu = 0; + + range[0] = n_from; + i = n_from; + + while (i < n_to){ + + if (nthreads - num_cpu > 1) { + + di = (double)i; + width = ((BLASLONG)( sqrt(di * di + dnum) - di) + mask) & ~mask; + + if ((width <= 0) || (width > n_to - i)) width = n_to - i; + + } else { + width = n_to - i; + } + + range[num_cpu + 1] = range[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = function; + queue[num_cpu].args = arg; + queue[num_cpu].range_m = range_m; + queue[num_cpu].range_n = &range[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + + } else { + + nf = (double)(arg -> n - n_from); + nt = (double)(arg -> n - n_to); + + dnum = (nt * nt - nf * nf) / (double)nthreads; + + num_cpu = 0; + + range[0] = n_from; + i = n_from; + + while (i < n_to){ + + if (nthreads - num_cpu > 1) { + + di = (double)(arg -> n - i); + width = ((BLASLONG)(-sqrt(di * di + dnum) + di) + mask) & ~mask; + + if ((width <= 0) || (width > n_to - i)) width = n_to - i; + + } else { + width = n_to - i; + } + + range[num_cpu + 1] = range[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = function; + queue[num_cpu].args = arg; + queue[num_cpu].range_m = range_m; + queue[num_cpu].range_n = &range[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + + } + + if (num_cpu) { + queue[0].sa = sa; + queue[0].sb = sb; + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + return 0; +} diff --git a/driver/level3/trmm_L.c b/driver/level3/trmm_L.c new file mode 100644 index 0000000..9e46df0 --- /dev/null +++ b/driver/level3/trmm_L.c @@ -0,0 +1,444 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dp1 = 1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#define TRMM_KERNEL_N TRMM_KERNEL_LR +#define TRMM_KERNEL_T TRMM_KERNEL_LC +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#define TRMM_KERNEL_N TRMM_KERNEL_LN +#define TRMM_KERNEL_T TRMM_KERNEL_LT +#endif + +#undef TIMING + +#ifdef TIMING +#define START_RPCC() rpcc_counter = rpcc() +#define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter +#else +#define START_RPCC() +#define STOP_RPCC(COUNTER) +#endif + +int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) { + + BLASLONG m, n, lda, ldb; + FLOAT *beta, *a, *b; + + BLASLONG ls, is, js; + BLASLONG min_l, min_i, min_j; + BLASLONG jjs, min_jj; + +#ifdef TIMING + unsigned long long rpcc_counter; + unsigned long long innercost = 0; + unsigned long long outercost = 0; + unsigned long long gemmcost = 0; + unsigned long long trmmcost = 0; + double total; +#endif + + m = args -> m; + n = args -> n; + + a = (FLOAT *)args -> a; + b = (FLOAT *)args -> b; + + lda = args -> lda; + ldb = args -> ldb; + + beta = (FLOAT *)args -> beta; + + if (range_n) { + BLASLONG n_from = *(((BLASLONG *)range_n) + 0); + BLASLONG n_to = *(((BLASLONG *)range_n) + 1); + + n = n_to - n_from; + + b += n_from * ldb * COMPSIZE; + } + + if (beta) { +#ifndef COMPLEX + if (beta[0] != ONE) + GEMM_BETA(m, n, 0, beta[0], NULL, 0, NULL, 0, b, ldb); + if (beta[0] == ZERO) return 0; +#else + if ((beta[0] != ONE) || (beta[1] != ZERO)) + GEMM_BETA(m, n, 0, beta[0], beta[1], NULL, 0, NULL, 0, b, ldb); + if ((beta[0] == ZERO) && (beta[1] == ZERO)) return 0; +#endif + } + + for(js = 0; js < n; js += GEMM_R){ + min_j = n - js; + if (min_j > GEMM_R) min_j = GEMM_R; + +#if (defined(UPPER) && !defined(TRANSA)) || (!defined(UPPER) && defined(TRANSA)) + + min_l = m; + if (min_l > GEMM_Q) min_l = GEMM_Q; + min_i = min_l; + if (min_i > GEMM_P) min_i = GEMM_P; + + START_RPCC(); + +#ifndef TRANSA + TRMM_IUTCOPY(min_l, min_i, a, lda, 0, 0, sa); +#else + TRMM_ILNCOPY(min_l, min_i, a, lda, 0, 0, sa); +#endif + + STOP_RPCC(innercost); + + for(jjs = js; jjs < js + min_j; jjs += min_jj){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + START_RPCC(); + + GEMM_ONCOPY(min_l, min_jj, b + (jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); + + STOP_RPCC(outercost); + + START_RPCC(); + + TRMM_KERNEL_N(min_i, min_jj, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb + min_l * (jjs - js) * COMPSIZE, b + (jjs * ldb) * COMPSIZE, ldb, 0); + + STOP_RPCC(trmmcost); + } + + + for(is = min_i; is < min_l; is += GEMM_P){ + min_i = min_l - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + START_RPCC(); + +#ifndef TRANSA + TRMM_IUTCOPY(min_l, min_i, a, lda, 0, is, sa); +#else + TRMM_ILNCOPY(min_l, min_i, a, lda, 0, is, sa); +#endif + + STOP_RPCC(innercost); + + START_RPCC(); + + TRMM_KERNEL_N(min_i, min_j, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is); + + STOP_RPCC(trmmcost); + + } + + for(ls = min_l; ls < m; ls += GEMM_Q){ + min_l = m - ls; + if (min_l > GEMM_Q) min_l = GEMM_Q; + min_i = ls; + if (min_i > GEMM_P) min_i = GEMM_P; + + START_RPCC(); + +#ifndef TRANSA + GEMM_ITCOPY(min_l, min_i, a + (ls * lda) * COMPSIZE, lda, sa); +#else + GEMM_INCOPY(min_l, min_i, a + (ls ) * COMPSIZE, lda, sa); +#endif + + STOP_RPCC(innercost); + + for(jjs = js; jjs < js + min_j; jjs += min_jj){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + START_RPCC(); + + GEMM_ONCOPY(min_l, min_jj, b + (ls + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); + + STOP_RPCC(gemmcost); + + START_RPCC(); + + GEMM_KERNEL(min_i, min_jj, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb + min_l * (jjs - js) * COMPSIZE, + b + (jjs * ldb) * COMPSIZE, ldb); + + STOP_RPCC(gemmcost); + } + + for(is = min_i; is < ls; is += GEMM_P){ + min_i = ls - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + START_RPCC(); + +#ifndef TRANSA + GEMM_ITCOPY(min_l, min_i, a + (is + ls * lda) * COMPSIZE, lda, sa); +#else + GEMM_INCOPY(min_l, min_i, a + (ls + is * lda) * COMPSIZE, lda, sa); +#endif + + STOP_RPCC(innercost); + + START_RPCC(); + + GEMM_KERNEL(min_i, min_j, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); + + STOP_RPCC(gemmcost); + } + + for(is = ls; is < ls + min_l; is += GEMM_P){ + min_i = ls + min_l - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + START_RPCC(); + +#ifndef TRANSA + TRMM_IUTCOPY(min_l, min_i, a, lda, ls, is, sa); +#else + TRMM_ILNCOPY(min_l, min_i, a, lda, ls, is, sa); +#endif + + STOP_RPCC(innercost); + + START_RPCC(); + + TRMM_KERNEL_N(min_i, min_j, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - ls); + + STOP_RPCC(trmmcost); + } + } + +#else + min_l = m; + if (min_l > GEMM_Q) min_l = GEMM_Q; + min_i = min_l; + if (min_i > GEMM_P) min_i = GEMM_P; + + START_RPCC(); + +#ifndef TRANSA + TRMM_ILTCOPY(min_l, min_i, a, lda, m - min_l, m - min_l, sa); +#else + TRMM_IUNCOPY(min_l, min_i, a, lda, m - min_l, m - min_l, sa); +#endif + + STOP_RPCC(innercost); + + for(jjs = js; jjs < js + min_j; jjs += min_jj){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + START_RPCC(); + + GEMM_ONCOPY(min_l, min_jj, b + (m - min_l + jjs * ldb) * COMPSIZE, ldb, + sb + min_l * (jjs - js) * COMPSIZE); + + STOP_RPCC(outercost); + + START_RPCC(); + + TRMM_KERNEL_T(min_i, min_jj, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb + min_l * (jjs - js) * COMPSIZE, + b + (m - min_l + jjs * ldb) * COMPSIZE, ldb, 0); + + STOP_RPCC(trmmcost); + } + + for(is = m - min_l + min_i; is < m; is += GEMM_P){ + min_i = m - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + START_RPCC(); + +#ifndef TRANSA + TRMM_ILTCOPY(min_l, min_i, a, lda, m - min_l, is, sa); +#else + TRMM_IUNCOPY(min_l, min_i, a, lda, m - min_l, is, sa); +#endif + + STOP_RPCC(innercost); + + START_RPCC(); + + TRMM_KERNEL_T(min_i, min_j, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - m + min_l); + + STOP_RPCC(trmmcost); + } + + for(ls = m - min_l; ls > 0; ls -= GEMM_Q){ + min_l = ls; + if (min_l > GEMM_Q) min_l = GEMM_Q; + min_i = min_l; + if (min_i > GEMM_P) min_i = GEMM_P; + + START_RPCC(); + +#ifndef TRANSA + TRMM_ILTCOPY(min_l, min_i, a, lda, ls - min_l, ls - min_l, sa); +#else + TRMM_IUNCOPY(min_l, min_i, a, lda, ls - min_l, ls - min_l, sa); +#endif + + STOP_RPCC(innercost); + + for(jjs = js; jjs < js + min_j; jjs += min_jj){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + START_RPCC(); + + GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb, + sb + min_l * (jjs - js) * COMPSIZE); + + STOP_RPCC(outercost); + + START_RPCC(); + + TRMM_KERNEL_T(min_i, min_jj, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb + min_l * (jjs - js) * COMPSIZE, + b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb, 0); + + STOP_RPCC(trmmcost); + } + + for(is = ls - min_l + min_i; is < ls; is += GEMM_P){ + min_i = ls - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + START_RPCC(); + +#ifndef TRANSA + TRMM_ILTCOPY(min_l, min_i, a, lda, ls - min_l, is, sa); +#else + TRMM_IUNCOPY(min_l, min_i, a, lda, ls - min_l, is, sa); +#endif + + STOP_RPCC(innercost); + + START_RPCC(); + + TRMM_KERNEL_T(min_i, min_j, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - ls + min_l); + + STOP_RPCC(trmmcost); + } + + + for(is = ls; is < m; is += GEMM_P){ + min_i = m - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + START_RPCC(); + +#ifndef TRANSA + GEMM_ITCOPY(min_l, min_i, a + (is + (ls - min_l) * lda) * COMPSIZE, lda, sa); +#else + GEMM_INCOPY(min_l, min_i, a + ((ls - min_l) + is * lda) * COMPSIZE, lda, sa); +#endif + + STOP_RPCC(innercost); + + START_RPCC(); + + GEMM_KERNEL(min_i, min_j, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); + + STOP_RPCC(gemmcost); + } + } + +#endif + + } + +#ifdef TIMING + total = (double)outercost + (double)innercost + (double)gemmcost + (double)trmmcost; + + printf( "Copy A : %5.2f Copy B: %5.2f GEMM Kernel : %5.2f TRMM Kerlnel : %5.2f kernel Effi. : %5.2f Total Effi. : %5.2f\n", + innercost / total * 100., outercost / total * 100., + gemmcost / total * 100., trmmcost / total * 100., + (double)n * (double)n * (double)n / (double)(trmmcost + gemmcost) * 100. * (double)COMPSIZE / 2., + (double)n * (double)n * (double)n / total * 100. * (double)COMPSIZE / 2.); + +#endif + + return 0; +} diff --git a/driver/level3/trmm_R.c b/driver/level3/trmm_R.c new file mode 100644 index 0000000..e46553c --- /dev/null +++ b/driver/level3/trmm_R.c @@ -0,0 +1,350 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dp1 = 1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#define TRMM_KERNEL_N TRMM_KERNEL_RR +#define TRMM_KERNEL_T TRMM_KERNEL_RC +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#define TRMM_KERNEL_N TRMM_KERNEL_RN +#define TRMM_KERNEL_T TRMM_KERNEL_RT +#endif + +#if 0 +#undef GEMM_P +#undef GEMM_Q +#undef GEMM_R + +#define GEMM_P 8 +#define GEMM_Q 12 +#define GEMM_R 16 +#endif + +int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) { + + BLASLONG m, n, lda, ldb; + FLOAT *beta, *a, *b; + + BLASLONG ls, is, js; + BLASLONG min_l, min_i, min_j; + BLASLONG jjs, min_jj; + + m = args -> m; + n = args -> n; + + a = (FLOAT *)args -> a; + b = (FLOAT *)args -> b; + + lda = args -> lda; + ldb = args -> ldb; + + beta = (FLOAT *)args -> beta; + + if (range_m) { + BLASLONG m_from = *(((BLASLONG *)range_m) + 0); + BLASLONG m_to = *(((BLASLONG *)range_m) + 1); + + m = m_to - m_from; + + b += m_from * COMPSIZE; + } + + if (beta) { +#ifndef COMPLEX + if (beta[0] != ONE) + GEMM_BETA(m, n, 0, beta[0], NULL, 0, NULL, 0, b, ldb); + if (beta[0] == ZERO) return 0; +#else + if ((beta[0] != ONE) || (beta[1] != ZERO)) + GEMM_BETA(m, n, 0, beta[0], beta[1], NULL, 0, NULL, 0, b, ldb); + if ((beta[0] == ZERO) && (beta[1] == ZERO)) return 0; +#endif + } + +#if (!defined(UPPER) && !defined(TRANSA)) || (defined(UPPER) && defined(TRANSA)) + + for(js = 0; js < n; js += GEMM_R){ + min_j = n - js; + if (min_j > GEMM_R) min_j = GEMM_R; + + for(ls = js; ls < js + min_j; ls += GEMM_Q){ + min_l = js + min_j - ls; + if (min_l > GEMM_Q) min_l = GEMM_Q; + min_i = m; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); + + for(jjs = 0; jjs < ls - js; jjs += min_jj){ + min_jj = ls - js - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + +#ifndef TRANSA + GEMM_ONCOPY(min_l, min_jj, a + (ls + (js + jjs) * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE); +#else + GEMM_OTCOPY(min_l, min_jj, a + ((js + jjs) + ls * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE); +#endif + + GEMM_KERNEL(min_i, min_jj, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb + min_l * jjs * COMPSIZE, + b + ((js + jjs) * ldb) * COMPSIZE, ldb); + } + + for(jjs = 0; jjs < min_l; jjs += min_jj){ + min_jj = min_l - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + +#ifndef TRANSA + TRMM_OLNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * (ls - js + jjs) * COMPSIZE); +#else + TRMM_OUTCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * (ls - js + jjs) * COMPSIZE); +#endif + + TRMM_KERNEL_T(min_i, min_jj, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb + (ls - js + jjs) * min_l * COMPSIZE, + b + ((ls + jjs) * ldb) * COMPSIZE, ldb, -jjs); + } + + for(is = min_i; is < m; is += GEMM_P){ + min_i = m - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); + + GEMM_KERNEL(min_i, ls - js, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, + b + (is + js * ldb) * COMPSIZE, ldb); + + TRMM_KERNEL_T(min_i, min_l, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb + (ls - js) * min_l * COMPSIZE, + b + (is + ls * ldb) * COMPSIZE, ldb, 0); + } + } + + + for(ls = js + min_j; ls < n; ls += GEMM_Q){ + min_l = n - ls; + if (min_l > GEMM_Q) min_l = GEMM_Q; + min_i = m; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); + + for(jjs = js; jjs < js + min_j; jjs += min_jj){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + +#ifndef TRANSA + GEMM_ONCOPY(min_l, min_jj, a + (ls + jjs * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); +#else + GEMM_OTCOPY(min_l, min_jj, a + (jjs + ls * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); +#endif + + GEMM_KERNEL(min_i, min_jj, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb + min_l * (jjs - js) * COMPSIZE, + b + (jjs * ldb) * COMPSIZE, ldb); + } + + for(is = min_i; is < m; is += GEMM_P){ + min_i = m - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); + + GEMM_KERNEL(min_i, min_j, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); + } + } + } + +#else + BLASLONG start_ls; + + for(js = n; js > 0; js -= GEMM_R){ + min_j = js; + if (min_j > GEMM_R) min_j = GEMM_R; + + start_ls = js - min_j; + while (start_ls + GEMM_Q < js) start_ls += GEMM_Q; + + for(ls = start_ls; ls >= js - min_j; ls -= GEMM_Q){ + min_l = js - ls; + if (min_l > GEMM_Q) min_l = GEMM_Q; + min_i = m; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); + + for(jjs = 0; jjs < min_l; jjs += min_jj){ + min_jj = min_l - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + +#ifndef TRANSA + TRMM_OUNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * jjs * COMPSIZE); +#else + TRMM_OLTCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * jjs * COMPSIZE); +#endif + + TRMM_KERNEL_N(min_i, min_jj, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb + min_l * jjs * COMPSIZE, + b + ((ls + jjs) * ldb) * COMPSIZE, ldb, -jjs); + } + + for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){ + min_jj = js - ls - min_l - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + +#ifndef TRANSA + GEMM_ONCOPY(min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda, + sb + min_l * (min_l + jjs) * COMPSIZE); +#else + GEMM_OTCOPY(min_l, min_jj, a + ((ls + min_l + jjs) + ls * lda) * COMPSIZE, lda, + sb + min_l * (min_l + jjs) * COMPSIZE); +#endif + + GEMM_KERNEL(min_i, min_jj, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb + min_l * (min_l + jjs) * COMPSIZE, + b + ((ls + min_l + jjs) * ldb) * COMPSIZE, ldb); + } + + for(is = min_i; is < m; is += GEMM_P){ + min_i = m - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); + + TRMM_KERNEL_N(min_i, min_l, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb, + b + (is + ls * ldb) * COMPSIZE, ldb, 0); + + if (js - ls - min_l > 0) { + GEMM_KERNEL(min_i, js - ls - min_l, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb + min_l * min_l * COMPSIZE, + b + (is + (ls + min_l) * ldb) * COMPSIZE, ldb); + } + } + } + + for(ls = 0; ls < js - min_j; ls += GEMM_Q){ + min_l = js - min_j - ls; + if (min_l > GEMM_Q) min_l = GEMM_Q; + min_i = m; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); + + for(jjs = js; jjs < js + min_j; jjs += min_jj){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + +#ifndef TRANSA + GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); +#else + GEMM_OTCOPY(min_l, min_jj, a + ((jjs - min_j) + ls * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); +#endif + + GEMM_KERNEL(min_i, min_jj, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb + min_l * (jjs - js) * COMPSIZE, + b + ((jjs - min_j) * ldb) * COMPSIZE, ldb); + } + + for(is = min_i; is < m; is += GEMM_P){ + min_i = m - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); + + GEMM_KERNEL(min_i, min_j, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, b + (is + (js - min_j) * ldb) * COMPSIZE, ldb); + } + } + } + +#endif + + return 0; +} diff --git a/driver/level3/trsm_L.c b/driver/level3/trsm_L.c new file mode 100644 index 0000000..2c3006f --- /dev/null +++ b/driver/level3/trsm_L.c @@ -0,0 +1,249 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#if (!defined(TRANSA) && defined(UPPER)) || (defined(TRANSA) && !defined(UPPER)) +#define TRSM_KERNEL TRSM_KERNEL_LR +#else +#define TRSM_KERNEL TRSM_KERNEL_LC +#endif +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#if (!defined(TRANSA) && defined(UPPER)) || (defined(TRANSA) && !defined(UPPER)) +#define TRSM_KERNEL TRSM_KERNEL_LN +#else +#define TRSM_KERNEL TRSM_KERNEL_LT +#endif +#endif + +#if 0 +#undef GEMM_P +#undef GEMM_Q +#undef GEMM_R + +#define GEMM_P 8 +#define GEMM_Q 12 +#define GEMM_R 1600 +#endif + +int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) { + + BLASLONG m, n, lda, ldb; + FLOAT *beta, *a, *b; + + BLASLONG ls, is, js; + BLASLONG min_l, min_i, min_j; + BLASLONG jjs, min_jj; + + m = args -> m; + n = args -> n; + + a = (FLOAT *)args -> a; + b = (FLOAT *)args -> b; + + lda = args -> lda; + ldb = args -> ldb; + + beta = (FLOAT *)args -> beta; + + if (range_n) { + BLASLONG n_from = *(((BLASLONG *)range_n) + 0); + BLASLONG n_to = *(((BLASLONG *)range_n) + 1); + + n = n_to - n_from; + + b += n_from * ldb * COMPSIZE; + } + + if (beta) { +#ifndef COMPLEX + if (beta[0] != ONE) + GEMM_BETA(m, n, 0, beta[0], NULL, 0, NULL, 0, b, ldb); + if (beta[0] == ZERO) return 0; +#else + if ((beta[0] != ONE) || (beta[1] != ZERO)) + GEMM_BETA(m, n, 0, beta[0], beta[1], NULL, 0, NULL, 0, b, ldb); + if ((beta[0] == ZERO) && (beta[1] == ZERO)) return 0; +#endif + } + + for(js = 0; js < n; js += GEMM_R){ + min_j = n - js; + if (min_j > GEMM_R) min_j = GEMM_R; + +#if (!defined(UPPER) && !defined(TRANSA)) || (defined(UPPER) && defined(TRANSA)) + for(ls = 0; ls < m; ls += GEMM_Q){ + min_l = m - ls; + if (min_l > GEMM_Q) min_l = GEMM_Q; + min_i = min_l; + if (min_i > GEMM_P) min_i = GEMM_P; + +#ifndef TRANSA + TRSM_ILTCOPY(min_l, min_i, a + (ls + ls * lda) * COMPSIZE, lda, 0, sa); +#else + TRSM_IUNCOPY(min_l, min_i, a + (ls + ls * lda) * COMPSIZE, lda, 0, sa); +#endif + + for(jjs = js; jjs < js + min_j; jjs += min_jj){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + GEMM_ONCOPY(min_l, min_jj, b + (ls + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); + + TRSM_KERNEL(min_i, min_jj, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb + min_l * (jjs - js) * COMPSIZE, + b + (ls + jjs * ldb) * COMPSIZE, ldb, 0); + } + + for(is = ls + min_i; is < ls + min_l; is += GEMM_P){ + min_i = ls + min_l - is; + if (min_i > GEMM_P) min_i = GEMM_P; + +#ifndef TRANSA + TRSM_ILTCOPY(min_l, min_i, a + (is + ls * lda) * COMPSIZE, lda, is - ls, sa); +#else + TRSM_IUNCOPY(min_l, min_i, a + (ls + is * lda) * COMPSIZE, lda, is - ls, sa); +#endif + + TRSM_KERNEL(min_i, min_j, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - ls); + } + + + for(is = ls + min_l; is < m; is += GEMM_P){ + min_i = m - is; + if (min_i > GEMM_P) min_i = GEMM_P; + +#ifndef TRANSA + GEMM_ITCOPY(min_l, min_i, a + (is + ls * lda) * COMPSIZE, lda, sa); +#else + GEMM_INCOPY(min_l, min_i, a + (ls + is * lda) * COMPSIZE, lda, sa); +#endif + + GEMM_KERNEL(min_i, min_j, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); + } + } +#else + BLASLONG start_is; + + for(ls = m; ls > 0; ls -= GEMM_Q){ + min_l = ls; + if (min_l > GEMM_Q) min_l = GEMM_Q; + start_is = ls - min_l; + while (start_is + GEMM_P < ls) start_is += GEMM_P; + min_i = ls - start_is; + if (min_i > GEMM_P) min_i = GEMM_P; + +#ifndef TRANSA + TRSM_IUTCOPY(min_l, min_i, a + (start_is + (ls - min_l) * lda) * COMPSIZE, lda, start_is - (ls - min_l), sa); +#else + TRSM_ILNCOPY(min_l, min_i, a + ((ls - min_l) + start_is * lda) * COMPSIZE, lda, start_is - (ls - min_l), sa); +#endif + + for(jjs = js; jjs < js + min_j; jjs += min_jj){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); + + TRSM_KERNEL(min_i, min_jj, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb + min_l * (jjs - js) * COMPSIZE, + b + (start_is + jjs * ldb) * COMPSIZE, ldb, start_is - ls + min_l); + } + + for(is = start_is - GEMM_P; is >= ls - min_l; is -= GEMM_P){ + min_i = ls - is; + if (min_i > GEMM_P) min_i = GEMM_P; + +#ifndef TRANSA + TRSM_IUTCOPY(min_l, min_i, a + (is + (ls - min_l) * lda) * COMPSIZE, lda, is - (ls - min_l), sa); +#else + TRSM_ILNCOPY(min_l, min_i, a + ((ls - min_l) + is * lda) * COMPSIZE, lda, is - (ls - min_l), sa); +#endif + TRSM_KERNEL(min_i, min_j, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, + b + (is + js * ldb) * COMPSIZE, ldb, + is - (ls - min_l) ); + } + + + for(is = 0; is < ls - min_l; is += GEMM_P){ + min_i = ls - min_l - is; + if (min_i > GEMM_P) min_i = GEMM_P; + +#ifndef TRANSA + GEMM_ITCOPY(min_l, min_i, a + (is + (ls - min_l) * lda) * COMPSIZE, lda, sa); +#else + GEMM_INCOPY(min_l, min_i, a + ((ls - min_l) + is * lda) * COMPSIZE, lda, sa); +#endif + + GEMM_KERNEL(min_i, min_j, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); + } + } + +#endif + } + + return 0; +} diff --git a/driver/level3/trsm_R.c b/driver/level3/trsm_R.c new file mode 100644 index 0000000..0964d78 --- /dev/null +++ b/driver/level3/trsm_R.c @@ -0,0 +1,348 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#if (!defined(TRANSA) && defined(UPPER)) || (defined(TRANSA) && !defined(UPPER)) +#define TRSM_KERNEL TRSM_KERNEL_RR +#else +#define TRSM_KERNEL TRSM_KERNEL_RC +#endif +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#if (!defined(TRANSA) && defined(UPPER)) || (defined(TRANSA) && !defined(UPPER)) +#define TRSM_KERNEL TRSM_KERNEL_RN +#else +#define TRSM_KERNEL TRSM_KERNEL_RT +#endif +#endif + +#if 0 +#undef GEMM_P +#undef GEMM_Q +#undef GEMM_R + +#define GEMM_P 16 +#define GEMM_Q 20 +#define GEMM_R 24 +#endif + +int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) { + + BLASLONG m, n, lda, ldb; + FLOAT *beta, *a, *b; + BLASLONG ls, is, js; + BLASLONG min_l, min_i, min_j; + BLASLONG jjs, min_jj; + + m = args -> m; + n = args -> n; + + a = (FLOAT *)args -> a; + b = (FLOAT *)args -> b; + + lda = args -> lda; + ldb = args -> ldb; + + beta = (FLOAT *)args -> beta; + + if (range_m) { + BLASLONG m_from = *(((BLASLONG *)range_m) + 0); + BLASLONG m_to = *(((BLASLONG *)range_m) + 1); + + m = m_to - m_from; + + b += m_from * COMPSIZE; + } + + if (beta) { +#ifndef COMPLEX + if (beta[0] != ONE) + GEMM_BETA(m, n, 0, beta[0], NULL, 0, NULL, 0, b, ldb); + if (beta[0] == ZERO) return 0; +#else + if ((beta[0] != ONE) || (beta[1] != ZERO)) + GEMM_BETA(m, n, 0, beta[0], beta[1], NULL, 0, NULL, 0, b, ldb); + if ((beta[0] == ZERO) && (beta[1] == ZERO)) return 0; +#endif + } + +#if (defined(UPPER) && !defined(TRANSA)) || (!defined(UPPER) && defined(TRANSA)) + for(js = 0; js < n; js += GEMM_R){ + min_j = n - js; + if (min_j > GEMM_R) min_j = GEMM_R; + + for(ls = 0; ls < js; ls += GEMM_Q){ + min_l = js - ls; + if (min_l > GEMM_Q) min_l = GEMM_Q; + min_i = m; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); + + for(jjs = js; jjs < js + min_j; jjs += min_jj){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + +#ifndef TRANSA + GEMM_ONCOPY(min_l, min_jj, a + (ls + jjs * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); +#else + GEMM_OTCOPY(min_l, min_jj, a + (jjs + ls * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); +#endif + + GEMM_KERNEL(min_i, min_jj, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb + min_l * (jjs - js) * COMPSIZE, + b + (jjs * ldb) * COMPSIZE, ldb); + } + + for(is = min_i; is < m; is += GEMM_P){ + min_i = m - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); + + GEMM_KERNEL(min_i, min_j, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); + } + } + + for(ls = js; ls < js + min_j; ls += GEMM_Q){ + min_l = js + min_j - ls; + if (min_l > GEMM_Q) min_l = GEMM_Q; + min_i = m; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); + +#ifndef TRANSA + TRSM_OUNCOPY(min_l, min_l, a + (ls + ls * lda) * COMPSIZE, lda, 0, sb); +#else + TRSM_OLTCOPY(min_l, min_l, a + (ls + ls * lda) * COMPSIZE, lda, 0, sb); +#endif + + TRSM_KERNEL(min_i, min_l, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb, + b + (ls * ldb) * COMPSIZE, ldb, 0); + + for(jjs = 0; jjs < min_j - min_l - ls + js; jjs += min_jj){ + min_jj = min_j - min_l - ls + js - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + +#ifndef TRANSA + GEMM_ONCOPY (min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda, + sb + min_l * (min_l + jjs) * COMPSIZE); +#else + GEMM_OTCOPY (min_l, min_jj, a + ((ls + min_l + jjs) + ls * lda) * COMPSIZE, lda, + sb + min_l * (min_l + jjs) * COMPSIZE); +#endif + + GEMM_KERNEL(min_i, min_jj, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb + min_l * (min_l + jjs) * COMPSIZE, + b + (min_l + ls + jjs) * ldb * COMPSIZE, ldb); + } + + for(is = min_i; is < m; is += GEMM_P){ + min_i = m - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); + + TRSM_KERNEL(min_i, min_l, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb, + b + (is + ls * ldb) * COMPSIZE, ldb, 0); + + GEMM_KERNEL(min_i, min_j - min_l + js - ls, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb + min_l * min_l * COMPSIZE, + b + (is + ( min_l + ls) * ldb) * COMPSIZE, ldb); + } + } + } + +#else + BLASLONG start_ls; + + for(js = n; js > 0; js -= GEMM_R){ + min_j = js; + if (min_j > GEMM_R) min_j = GEMM_R; + + for (ls = js; ls < n; ls += GEMM_Q) { + min_l = n - ls; + if (min_l > GEMM_Q) min_l = GEMM_Q; + min_i = m; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); + + for(jjs = js; jjs < js + min_j; jjs += min_jj){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + +#ifndef TRANSA + GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); +#else + GEMM_OTCOPY(min_l, min_jj, a + ((jjs - min_j) + ls * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); +#endif + + GEMM_KERNEL(min_i, min_jj, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb + min_l * (jjs - js) * COMPSIZE, + b + (jjs - min_j) * ldb * COMPSIZE, ldb); + } + + for(is = min_i; is < m; is += GEMM_P){ + min_i = m - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); + + GEMM_KERNEL(min_i, min_j, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, b + (is + (js - min_j) * ldb) * COMPSIZE, ldb); + } + } + + start_ls = js - min_j; + while (start_ls + GEMM_Q < js) start_ls += GEMM_Q; + + for(ls = start_ls; ls >= js - min_j; ls -= GEMM_Q){ + min_l = js - ls; + if (min_l > GEMM_Q) min_l = GEMM_Q; + min_i = m; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); + +#ifndef TRANSA + TRSM_OLNCOPY(min_l, min_l, a + (ls + ls * lda) * COMPSIZE, lda, + 0, sb + min_l * (min_j - js + ls) * COMPSIZE); +#else + TRSM_OUTCOPY(min_l, min_l, a + (ls + ls * lda) * COMPSIZE, lda, + 0, sb + min_l * (min_j - js + ls) * COMPSIZE); +#endif + + TRSM_KERNEL(min_i, min_l, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb + min_l * (min_j - js + ls) * COMPSIZE, + b + (ls * ldb) * COMPSIZE, ldb, 0); + + for(jjs = 0; jjs < min_j - js + ls; jjs += min_jj){ + min_jj = min_j - js + ls - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + +#ifndef TRANSA + GEMM_ONCOPY (min_l, min_jj, a + (ls + (js - min_j + jjs) * lda) * COMPSIZE, lda, + sb + min_l * jjs * COMPSIZE); +#else + GEMM_OTCOPY (min_l, min_jj, a + ((js - min_j + jjs) + ls * lda) * COMPSIZE, lda, + sb + min_l * jjs * COMPSIZE); +#endif + + GEMM_KERNEL(min_i, min_jj, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb + min_l * jjs * COMPSIZE, + b + (js - min_j + jjs) * ldb * COMPSIZE, ldb); + } + + for(is = min_i; is < m; is += GEMM_P){ + min_i = m - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); + + TRSM_KERNEL(min_i, min_l, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb + min_l * (min_j - js + ls) * COMPSIZE, + b + (is + ls * ldb) * COMPSIZE, ldb, 0); + + GEMM_KERNEL(min_i, min_j - js + ls, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb, + b + (is + (js - min_j) * ldb) * COMPSIZE, ldb); + } + + } + } + +#endif + + return 0; +} diff --git a/driver/level3/zhemm_k.c b/driver/level3/zhemm_k.c new file mode 100644 index 0000000..50da97a --- /dev/null +++ b/driver/level3/zhemm_k.c @@ -0,0 +1,80 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#undef TIMING + +#ifndef RSIDE +#ifndef LOWER +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM_IUTCOPY(M, N, A, LDA, Y, X, BUFFER); +#else +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM_ILTCOPY(M, N, A, LDA, Y, X, BUFFER); +#endif +#endif + +#ifdef RSIDE +#ifndef LOWER +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM_OUTCOPY(M, N, A, LDA, Y, X, BUFFER); +#else +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM_OLTCOPY(M, N, A, LDA, Y, X, BUFFER); +#endif +#endif + +#ifndef RSIDE +#define K args -> m +#ifndef LOWER +#define GEMM_LOCAL HEMM_LU +#else +#define GEMM_LOCAL HEMM_LL +#endif +#else +#define K args -> n +#ifndef LOWER +#define GEMM_LOCAL HEMM_RU +#else +#define GEMM_LOCAL HEMM_RL +#endif +#endif + +#ifdef THREADED_LEVEL3 +#include "level3_thread.c" +#else +#include "level3.c" +#endif diff --git a/driver/level3/zher2k_k.c b/driver/level3/zher2k_k.c new file mode 100644 index 0000000..93bb781 --- /dev/null +++ b/driver/level3/zher2k_k.c @@ -0,0 +1,160 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef LOWER + +#ifndef CONJ +#ifdef XDOUBLE +#define KERNEL_FUNC xher2k_kernel_UN +#elif defined(DOUBLE) +#define KERNEL_FUNC zher2k_kernel_UN +#else +#define KERNEL_FUNC cher2k_kernel_UN +#endif +#else +#ifdef XDOUBLE +#define KERNEL_FUNC xher2k_kernel_UC +#elif defined(DOUBLE) +#define KERNEL_FUNC zher2k_kernel_UC +#else +#define KERNEL_FUNC cher2k_kernel_UC +#endif +#endif + +#else + +#ifndef CONJ +#ifdef XDOUBLE +#define KERNEL_FUNC xher2k_kernel_LN +#elif defined(DOUBLE) +#define KERNEL_FUNC zher2k_kernel_LN +#else +#define KERNEL_FUNC cher2k_kernel_LN +#endif +#else +#ifdef XDOUBLE +#define KERNEL_FUNC xher2k_kernel_LC +#elif defined(DOUBLE) +#define KERNEL_FUNC zher2k_kernel_LC +#else +#define KERNEL_FUNC cher2k_kernel_LC +#endif +#endif + +#endif + +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y, FLAG) \ + KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y), FLAG) + +#define KERNEL_OPERATION_C(M, N, K, ALPHA, SA, SB, C, LDC, X, Y, FLAG) \ + KERNEL_FUNC(M, N, K, ALPHA[0], -ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y), FLAG) + +#if !defined(LOWER) && !defined(TRANS) +#define SYRK_LOCAL HER2K_UN +#elif !defined(LOWER) && defined(TRANS) +#define SYRK_LOCAL HER2K_UC +#elif defined(LOWER) && !defined(TRANS) +#define SYRK_LOCAL HER2K_LN +#else +#define SYRK_LOCAL HER2K_LC +#endif + +#undef SCAL_K + +#ifdef XDOUBLE +#define SCAL_K QSCAL_K +#elif defined(DOUBLE) +#define SCAL_K DSCAL_K +#else +#define SCAL_K SSCAL_K +#endif + +static inline int syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLASLONG n_to, FLOAT *alpha, FLOAT *c, BLASLONG ldc) { + + BLASLONG i; + +#ifndef LOWER + if (m_from > n_from) n_from = m_from; + if (m_to > n_to ) m_to = n_to; +#else + if (m_from < n_from) m_from = n_from; + if (m_to < n_to ) n_to = m_to; +#endif + + c += (m_from + n_from * ldc) * COMPSIZE; + + m_to -= m_from; + n_to -= n_from; + + for (i = 0; i < n_to; i++){ + +#ifndef LOWER + + SCAL_K(MIN(i + n_from - m_from + 1, m_to) * COMPSIZE, 0, 0, alpha[0], c, 1, NULL, 0, NULL, 0); + + if (i + n_from - m_from + 1 <= m_to) + *(c + (i + n_from - m_from) * COMPSIZE + 1) = ZERO; + + c += ldc * COMPSIZE; + +#else + + SCAL_K(MIN(m_to - i + m_from - n_from, m_to) * COMPSIZE, 0, 0, alpha[0], c, 1, NULL, 0, NULL, 0); + + if (i < m_from - n_from) { + c += ldc * COMPSIZE; + } else { + *(c + 1) = ZERO; + c += (1 + ldc) * COMPSIZE; + } + +#endif + + } + + return 0; +} + +#ifdef THREADED_LEVEL3 +#include "level3_syr2k_threaded.c" +#else +#include "level3_syr2k.c" +#endif diff --git a/driver/level3/zher2k_kernel.c b/driver/level3/zher2k_kernel.c new file mode 100644 index 0000000..9b4c450 --- /dev/null +++ b/driver/level3/zher2k_kernel.c @@ -0,0 +1,221 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#define GEMM_KERNEL_B0 GEMM_KERNEL_R_B0 +#else +#define GEMM_KERNEL GEMM_KERNEL_L +#define GEMM_KERNEL_B0 GEMM_KERNEL_L_B0 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset, int flag){ + + BLASLONG i, j; + BLASLONG loop; + FLOAT subbuffer[GEMM_UNROLL_MN * GEMM_UNROLL_MN * COMPSIZE]; + + if (m + offset < 0) { +#ifndef LOWER + GEMM_KERNEL(m, n, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b, c, ldc); +#endif + return 0; + } + + if (n < offset) { +#ifdef LOWER + GEMM_KERNEL(m, n, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b, c, ldc); +#endif + return 0; + } + + + if (offset > 0) { +#ifdef LOWER + GEMM_KERNEL(m, offset, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b, c, ldc); +#endif + b += offset * k * COMPSIZE; + c += offset * ldc * COMPSIZE; + n -= offset; + offset = 0; + + if (n <= 0) return 0; + } + + if (n > m + offset) { +#ifndef LOWER + GEMM_KERNEL(m, n - m - offset, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, + b + (m + offset) * k * COMPSIZE, + c + (m + offset) * ldc * COMPSIZE, ldc); +#endif + + n = m + offset; + if (n <= 0) return 0; + } + + + if (offset < 0) { +#ifndef LOWER + GEMM_KERNEL(-offset, n, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b, c, ldc); +#endif + a -= offset * k * COMPSIZE; + c -= offset * COMPSIZE; + m += offset; + offset = 0; + + if (m <= 0) return 0; + } + + if (m > n - offset) { +#ifdef LOWER + GEMM_KERNEL(m - n + offset, n, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a + (n - offset) * k * COMPSIZE, + b, + c + (n - offset) * COMPSIZE, ldc); +#endif + m = n + offset; + if (m <= 0) return 0; + } + + for (loop = 0; loop < n; loop += GEMM_UNROLL_MN) { + + int mm, nn; + + mm = (loop & ~(GEMM_UNROLL_MN - 1)); + nn = MIN(GEMM_UNROLL_MN, n - loop); + +#ifndef LOWER + GEMM_KERNEL(mm, nn, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc); +#endif + + if (flag) { + GEMM_BETA(nn, nn, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + NULL, 0, NULL, 0, subbuffer, nn); + + GEMM_KERNEL(nn, nn, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn); + + +#ifndef LOWER + + for (j = 0; j < nn; j ++) { + for (i = 0; i <= j; i ++) { + c[(i + loop + (j + loop) * ldc) * 2 + 0] += + subbuffer[(i + j * nn) * 2 + 0] + subbuffer[(j + i * nn) * 2 + 0]; + if (i != j) { + c[(i + loop + (j + loop) * ldc) * 2 + 1] += + subbuffer[(i + j * nn) * 2 + 1] - subbuffer[(j + i * nn) * 2 + 1]; + } else { + c[(i + loop + (j + loop) * ldc) * 2 + 1] = ZERO; + } + } + } +#else + for (j = 0; j < nn; j ++) { + for (i = j; i < nn; i ++) { + c[(i + loop + (j + loop) * ldc) * 2 + 0] += + subbuffer[(i + j * nn) * 2 + 0] + subbuffer[(j + i * nn) * 2 + 0]; + if (i != j) { + c[(i + loop + (j + loop) * ldc) * 2 + 1] += + subbuffer[(i + j * nn) * 2 + 1] - subbuffer[(j + i * nn) * 2 + 1]; + } else { + c[(i + loop + (j + loop) * ldc) * 2 + 1] = ZERO; + } + } + } +#endif + } + +#ifdef LOWER + GEMM_KERNEL(m - mm - nn, nn, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE, + c + (mm + nn + loop * ldc) * COMPSIZE, ldc); +#endif + } + + return 0; +} diff --git a/driver/level3/zherk_beta.c b/driver/level3/zherk_beta.c new file mode 100644 index 0000000..6867cc0 --- /dev/null +++ b/driver/level3/zherk_beta.c @@ -0,0 +1,75 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG dummy1, BLASLONG n, BLASLONG dummy2, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *dummy3, BLASLONG dummy4, FLOAT *dummy5, BLASLONG dummy6, + FLOAT *c, BLASLONG ldc, + FLOAT *dummy7, FLOAT *dummy8, BLASLONG from, BLASLONG to){ + + BLASLONG i; + +#ifndef LOWER + for (i = from; i < to; i++){ + SCAL_K(i * 2, 0, 0, alpha_r, c + i * ldc * 2, 1, NULL, 0, NULL, 0); + if (alpha_r == ZERO ){ + c[i * 2 + 0 + i * ldc * 2] = ZERO; + c[i * 2 + 1 + i * ldc * 2] = ZERO; + } else { + c[i * 2 + 0 + i * ldc * 2] *= alpha_r; + c[i * 2 + 1 + i * ldc * 2] = ZERO; + } + } +#else + for (i = from; i < to; i++){ + if (alpha_r == ZERO) { + c[i * 2 + 0 + i * ldc * 2] = ZERO; + c[i * 2 + 1 + i * ldc * 2] = ZERO; + } else { + c[i * 2 + 0 + i * ldc * 2] *= alpha_r; + c[i * 2 + 1 + i * ldc * 2] = ZERO; + } + SCAL_K((n - i - 1) * 2, 0, 0, alpha_r, c + 2 + i * (ldc + 1) * 2, 1, NULL, 0, NULL, 0); + } +#endif + + return 0; +} diff --git a/driver/level3/zherk_k.c b/driver/level3/zherk_k.c new file mode 100644 index 0000000..d1ffbdb --- /dev/null +++ b/driver/level3/zherk_k.c @@ -0,0 +1,158 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef LOWER + +#ifndef CONJ +#ifdef XDOUBLE +#define KERNEL_FUNC xherk_kernel_UN +#elif defined(DOUBLE) +#define KERNEL_FUNC zherk_kernel_UN +#else +#define KERNEL_FUNC cherk_kernel_UN +#endif +#else +#ifdef XDOUBLE +#define KERNEL_FUNC xherk_kernel_UC +#elif defined(DOUBLE) +#define KERNEL_FUNC zherk_kernel_UC +#else +#define KERNEL_FUNC cherk_kernel_UC +#endif +#endif + +#else + +#ifndef CONJ +#ifdef XDOUBLE +#define KERNEL_FUNC xherk_kernel_LN +#elif defined(DOUBLE) +#define KERNEL_FUNC zherk_kernel_LN +#else +#define KERNEL_FUNC cherk_kernel_LN +#endif +#else +#ifdef XDOUBLE +#define KERNEL_FUNC xherk_kernel_LC +#elif defined(DOUBLE) +#define KERNEL_FUNC zherk_kernel_LC +#else +#define KERNEL_FUNC cherk_kernel_LC +#endif +#endif + +#endif + +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) + +#if !defined(LOWER) && !defined(TRANS) +#define SYRK_LOCAL HERK_UN +#elif !defined(LOWER) && defined(TRANS) +#define SYRK_LOCAL HERK_UC +#elif defined(LOWER) && !defined(TRANS) +#define SYRK_LOCAL HERK_LN +#else +#define SYRK_LOCAL HERK_LC +#endif + +#undef SCAL_K + +#ifdef XDOUBLE +#define SCAL_K QSCAL_K +#elif defined(DOUBLE) +#define SCAL_K DSCAL_K +#else +#define SCAL_K SSCAL_K +#endif + + +static inline int syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLASLONG n_to, FLOAT *alpha, FLOAT *c, BLASLONG ldc) { + + BLASLONG i; + +#ifndef LOWER + if (m_from > n_from) n_from = m_from; + if (m_to > n_to ) m_to = n_to; +#else + if (m_from < n_from) m_from = n_from; + if (m_to < n_to ) n_to = m_to; +#endif + + c += (m_from + n_from * ldc) * COMPSIZE; + + m_to -= m_from; + n_to -= n_from; + + for (i = 0; i < n_to; i++){ + +#ifndef LOWER + + SCAL_K(MIN(i + n_from - m_from + 1, m_to) * COMPSIZE, 0, 0, alpha[0], c, 1, NULL, 0, NULL, 0); + + if (i + n_from - m_from + 1 <= m_to) + *(c + (i + n_from - m_from) * COMPSIZE + 1) = ZERO; + + c += ldc * COMPSIZE; + +#else + + SCAL_K(MIN(m_to - i + m_from - n_from, m_to) * COMPSIZE, 0, 0, alpha[0], c, 1, NULL, 0, NULL, 0); + + if (i < m_from - n_from) { + c += ldc * COMPSIZE; + } else { + *(c + 1) = ZERO; + c += (1 + ldc) * COMPSIZE; + } + +#endif + + } + + return 0; +} + +#ifdef THREADED_LEVEL3 +#include "level3_syrk_threaded.c" +#else +#include "level3_syrk.c" +#endif diff --git a/driver/level3/zherk_kernel.c b/driver/level3/zherk_kernel.c new file mode 100644 index 0000000..fd8ff9c --- /dev/null +++ b/driver/level3/zherk_kernel.c @@ -0,0 +1,194 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#define GEMM_KERNEL_B0 GEMM_KERNEL_R_B0 +#else +#define GEMM_KERNEL GEMM_KERNEL_L +#define GEMM_KERNEL_B0 GEMM_KERNEL_L_B0 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + BLASLONG loop; + FLOAT *cc, *ss; + FLOAT subbuffer[GEMM_UNROLL_MN * (GEMM_UNROLL_MN + 1) * COMPSIZE]; + + if (m + offset < 0) { +#ifndef LOWER + GEMM_KERNEL(m, n, k, + alpha_r, ZERO, + a, b, c, ldc); +#endif + return 0; + } + + if (n < offset) { +#ifdef LOWER + GEMM_KERNEL(m, n, k, + alpha_r, ZERO, + a, b, c, ldc); +#endif + return 0; + } + + + if (offset > 0) { +#ifdef LOWER + GEMM_KERNEL(m, offset, k, + alpha_r, ZERO, + a, b, c, ldc); +#endif + b += offset * k * COMPSIZE; + c += offset * ldc * COMPSIZE; + n -= offset; + offset = 0; + + if (n <= 0) return 0; + } + + if (n > m + offset) { +#ifndef LOWER + GEMM_KERNEL(m, n - m - offset, k, + alpha_r, ZERO, + a, + b + (m + offset) * k * COMPSIZE, + c + (m + offset) * ldc * COMPSIZE, ldc); +#endif + + n = m + offset; + if (n <= 0) return 0; + } + + + if (offset < 0) { +#ifndef LOWER + GEMM_KERNEL(-offset, n, k, + alpha_r, ZERO, + a, b, c, ldc); +#endif + a -= offset * k * COMPSIZE; + c -= offset * COMPSIZE; + m += offset; + offset = 0; + + if (m <= 0) return 0; + } + + if (m > n - offset) { +#ifdef LOWER + GEMM_KERNEL(m - n + offset, n, k, + alpha_r, ZERO, + a + (n - offset) * k * COMPSIZE, + b, + c + (n - offset) * COMPSIZE, ldc); +#endif + m = n + offset; + if (m <= 0) return 0; + } + + for (loop = 0; loop < n; loop += GEMM_UNROLL_MN) { + + int mm, nn; + + mm = (loop & ~(GEMM_UNROLL_MN - 1)); + nn = MIN(GEMM_UNROLL_MN, n - loop); + +#ifndef LOWER + GEMM_KERNEL(mm, nn, k, + alpha_r, ZERO, + a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc); +#endif + + GEMM_BETA(nn, nn, 0, ZERO, ZERO, + NULL, 0, NULL, 0, subbuffer, nn); + + GEMM_KERNEL(nn, nn, k, + alpha_r, ZERO, + a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn); + + cc = c + (loop + loop * ldc) * COMPSIZE; + ss = subbuffer; + +#ifndef LOWER + for (j = 0; j < nn; j ++) { + + for (i = 0; i +#include +#include "common.h" + +int CNAME(BLASLONG dummy1, BLASLONG n, BLASLONG dummy2, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *dummy3, BLASLONG dummy4, FLOAT *dummy5, BLASLONG dummy6, + FLOAT *c, BLASLONG ldc, + FLOAT *dummy7, FLOAT *dummy8, BLASLONG from, BLASLONG to){ + + BLASLONG i; + +#ifndef LOWER + for (i = from; i < to; i++){ + ZSCAL_K(i + 1, 0, 0, alpha_r, alpha_i, c + i * ldc * 2, 1, NULL, 0, NULL, 0); + } +#else + for (i = from; i < to; i++){ + ZSCAL_K(n - i, 0, 0, alpha_r, alpha_i, c + i * (ldc + 1) * 2, 1, NULL, 0, NULL, 0); + } +#endif + return 0; +} diff --git a/driver/mapper/._Makefile b/driver/mapper/._Makefile new file mode 100644 index 0000000..028eae2 Binary files /dev/null and b/driver/mapper/._Makefile differ diff --git a/driver/mapper/._device_setup b/driver/mapper/._device_setup new file mode 100755 index 0000000..b489aa6 Binary files /dev/null and b/driver/mapper/._device_setup differ diff --git a/driver/mapper/._mapper.c b/driver/mapper/._mapper.c new file mode 100644 index 0000000..adbb1d2 Binary files /dev/null and b/driver/mapper/._mapper.c differ diff --git a/driver/mapper/Makefile b/driver/mapper/Makefile new file mode 100644 index 0000000..67e7e03 --- /dev/null +++ b/driver/mapper/Makefile @@ -0,0 +1,25 @@ +MODULENAME := mapper + +KDIR := /lib/modules/$(shell uname -r)/build +PWD := $(shell pwd) + +CC := gcc -Wall + +ifeq ($(KERNELRELEASE),) +all :: + $(MAKE) -C $(KDIR) SUBDIRS=$(PWD) modules +else + obj-m := $(MODULENAME).o +endif + +load: + insmod ./$(MODULENAME).ko + +unload: + rmmod $(MODULENAME) + +setup: + ./device_setup + +clean: + rm -rf *.o *.ko Module.symvers *.mod.c .tmp_versions .mapper* modules.order diff --git a/driver/mapper/device_setup b/driver/mapper/device_setup new file mode 100755 index 0000000..0afbdeb --- /dev/null +++ b/driver/mapper/device_setup @@ -0,0 +1,11 @@ +#!/bin/sh + +drivername=mapper + +devicename=/dev/$drivername +major=`cat /proc/devices | grep $drivername | awk '{print $1;}'` + +rm -f $devicename +mknod $devicename c $major 0 +chmod go+rw $devicename + diff --git a/driver/mapper/mapper.c b/driver/mapper/mapper.c new file mode 100644 index 0000000..83805fb --- /dev/null +++ b/driver/mapper/mapper.c @@ -0,0 +1,252 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_BIGPHYS_AREA +#include +#endif +#include +#ifdef MODVERSIONS +#include +#endif +#include + +typedef struct { + pid_t pid; +#ifndef CONFIG_BIGPHYS_AREA + long size; +#endif + caddr_t address; + +} buffer_t; + +#define MAX_BUFF_SIZE 1024 +#define MAX_LENGTH (4UL << 20) + +static spinlock_t lock __attribute__((aligned(64))); + +static buffer_t buffer[MAX_BUFF_SIZE]; + +static dev_t mapper_dev; +static struct cdev mapper_cdev; + +static int mapper_open (struct inode *inode, struct file *fp){ return 0;} + +static int mapper_release(struct inode *inode, struct file *fp){ + + int pos; +#ifndef CONFIG_BIGPHYS_AREA + caddr_t addr; +#endif + + // printk("Releasing memory... %d\n", current -> tgid); + + spin_lock(&lock); + + for (pos = 0; pos < MAX_BUFF_SIZE; pos ++) { + if (buffer[pos].pid == (pid_t) current -> tgid) { + +#ifdef CONFIG_BIGPHYS_AREA + bigphysarea_free_pages(buffer[pos].address); +#else + + for (addr = buffer[pos].address; addr < buffer[pos].address + buffer[pos].size; addr += PAGE_SIZE) { + ClearPageReserved(virt_to_page(addr)); + } + + kfree(buffer[pos].address); + buffer[pos].size = 0; +#endif + buffer[pos].pid = 0; + buffer[pos].address = 0; + } + } + + spin_unlock(&lock); + + return 0; +} + +int mapper_mapper(struct file *fp, struct vm_area_struct *vma){ + + int ret, pos; + caddr_t alloc_addr; +#ifndef CONFIG_BIGPHYS_AREA + caddr_t addr; +#endif + long all_length, length, current_addr; + + all_length = vma->vm_end - vma->vm_start; + current_addr = vma -> vm_start; + + spin_lock(&lock); + + while (all_length > 0) { + length = all_length; + if (length > MAX_LENGTH) length = MAX_LENGTH; + all_length -= MAX_LENGTH; + + // printk("Allocating memory... %d\n", length); + + pos = 0; + while ((pos < MAX_BUFF_SIZE) && (buffer[pos].address != 0)) pos ++; + + if (pos >= MAX_BUFF_SIZE) { + + printk("Memory Allocator : too much memory allocation requested.\n"); + + spin_unlock(&lock); + + return -EIO; + } + +#ifdef CONFIG_BIGPHYS_AREA + alloc_addr = (caddr_t)bigphysarea_alloc_pages(length >> PAGE_SHIFT, 1, GFP_KERNEL); +#else + alloc_addr = (caddr_t)kmalloc(length, GFP_KERNEL); +#endif + + if (alloc_addr == (caddr_t)NULL) { + + spin_unlock(&lock); + + return -EIO; + } + +#ifndef CONFIG_BIGPHYS_AREA + for (addr = alloc_addr; addr < alloc_addr + length; addr += PAGE_SIZE) { + clear_page(addr); + SetPageReserved(virt_to_page(addr)); + } +#endif + + if ((ret = remap_pfn_range(vma, + current_addr, + virt_to_phys((void *)alloc_addr) >> PAGE_SHIFT, + length, + PAGE_SHARED)) < 0) { + +#ifdef CONFIG_BIGPHYS_AREA + bigphysarea_free_pages((caddr_t)alloc_addr); +#else + + for (addr = alloc_addr; addr < alloc_addr + length; addr += PAGE_SIZE) ClearPageReserved(virt_to_page(addr)); + + kfree((caddr_t)alloc_addr); +#endif + + spin_unlock(&lock); + + return ret; + } + + buffer[pos].pid = current -> tgid; + buffer[pos].address = alloc_addr; +#ifndef CONFIG_BIGPHYS_AREA + buffer[pos].size = length; +#endif + + current_addr += length; + } + + spin_unlock(&lock); + + return 0; +} + +static struct file_operations mapper_fops = { + .open = mapper_open, + .release = mapper_release, + .mmap = mapper_mapper, + .owner = THIS_MODULE, +}; + +static int __init mapper_init(void){ + + int ret, i; + + ret = alloc_chrdev_region(&mapper_dev, 0, 1, "mapper"); + + cdev_init(&mapper_cdev, &mapper_fops); + + ret = cdev_add(&mapper_cdev, mapper_dev, 1); + + spin_lock_init(&lock); + + for (i = 0; i < MAX_BUFF_SIZE; i++) { + buffer[i].pid = 0; +#ifndef CONFIG_BIGPHYS_AREA + buffer[i].size = 0; +#endif + buffer[i].address = 0; + } + + return ret; +} + +static void __exit mapper_exit(void){ + + int pos; + + for (pos = 0; pos < MAX_BUFF_SIZE; pos ++) { + if (buffer[pos].address != 0) { +#ifdef CONFIG_BIGPHYS_AREA + bigphysarea_free_pages(buffer[pos].address); +#else + kfree(buffer[pos].address); +#endif + } + } + + cdev_del(&mapper_cdev); + + unregister_chrdev_region(mapper_dev, 1); +} + +module_init(mapper_init); +module_exit(mapper_exit); +MODULE_DESCRIPTION("BigPhysArea User Mapping Driver"); +MODULE_LICENSE("Unknown"); diff --git a/driver/others/._Makefile b/driver/others/._Makefile new file mode 100644 index 0000000..7e3390d Binary files /dev/null and b/driver/others/._Makefile differ diff --git a/driver/others/._abs.c b/driver/others/._abs.c new file mode 100644 index 0000000..31c8775 Binary files /dev/null and b/driver/others/._abs.c differ diff --git a/driver/others/._blas_l1_thread.c b/driver/others/._blas_l1_thread.c new file mode 100644 index 0000000..c039ed8 Binary files /dev/null and b/driver/others/._blas_l1_thread.c differ diff --git a/driver/others/._blas_server.c b/driver/others/._blas_server.c new file mode 100644 index 0000000..2865bd8 Binary files /dev/null and b/driver/others/._blas_server.c differ diff --git a/driver/others/._blas_server_omp.c b/driver/others/._blas_server_omp.c new file mode 100644 index 0000000..c23be56 Binary files /dev/null and b/driver/others/._blas_server_omp.c differ diff --git a/driver/others/._blas_server_win32.c b/driver/others/._blas_server_win32.c new file mode 100644 index 0000000..2d0a2d2 Binary files /dev/null and b/driver/others/._blas_server_win32.c differ diff --git a/driver/others/._divtable.c b/driver/others/._divtable.c new file mode 100644 index 0000000..77ccbdf Binary files /dev/null and b/driver/others/._divtable.c differ diff --git a/driver/others/._dynamic.c b/driver/others/._dynamic.c new file mode 100644 index 0000000..0a62f6e Binary files /dev/null and b/driver/others/._dynamic.c differ diff --git a/driver/others/._init.c b/driver/others/._init.c new file mode 100644 index 0000000..f95de3a Binary files /dev/null and b/driver/others/._init.c differ diff --git a/driver/others/._lamc3.c b/driver/others/._lamc3.c new file mode 100644 index 0000000..4621272 Binary files /dev/null and b/driver/others/._lamc3.c differ diff --git a/driver/others/._lamch.c b/driver/others/._lamch.c new file mode 100644 index 0000000..9b3efa9 Binary files /dev/null and b/driver/others/._lamch.c differ diff --git a/driver/others/._lsame.c b/driver/others/._lsame.c new file mode 100644 index 0000000..e1abec7 Binary files /dev/null and b/driver/others/._lsame.c differ diff --git a/driver/others/._memory.c b/driver/others/._memory.c new file mode 100644 index 0000000..6b3b283 Binary files /dev/null and b/driver/others/._memory.c differ diff --git a/driver/others/._memory_qalloc.c b/driver/others/._memory_qalloc.c new file mode 100644 index 0000000..8da371d Binary files /dev/null and b/driver/others/._memory_qalloc.c differ diff --git a/driver/others/._parameter.c b/driver/others/._parameter.c new file mode 100644 index 0000000..c31e907 Binary files /dev/null and b/driver/others/._parameter.c differ diff --git a/driver/others/._profile.c b/driver/others/._profile.c new file mode 100644 index 0000000..94c1031 Binary files /dev/null and b/driver/others/._profile.c differ diff --git a/driver/others/._xerbla.c b/driver/others/._xerbla.c new file mode 100644 index 0000000..10e4ed3 Binary files /dev/null and b/driver/others/._xerbla.c differ diff --git a/driver/others/Makefile b/driver/others/Makefile new file mode 100644 index 0000000..bc5de38 --- /dev/null +++ b/driver/others/Makefile @@ -0,0 +1,218 @@ +TOPDIR = ../.. +include ../../Makefile.system + +COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) + +COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) + +ifdef SMP +COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) +ifndef NO_AFFINITY +COMMONOBJS += init.$(SUFFIX) +endif +endif + +# COMMONOBJS += info.$(SUFFIX) + +ifdef DYNAMIC_ARCH +COMMONOBJS += dynamic.$(SUFFIX) +else +COMMONOBJS += parameter.$(SUFFIX) +endif + +ifdef EXPRECISION +COMMONOBJS += x_abs.$(SUFFIX) qlamch.$(SUFFIX) qlamc3.$(SUFFIX) +endif + +ifdef QUAD_PRECISION +COMMONOBJS += addx.$(SUFFIX) mulx.$(SUFFIX) +endif + +ifeq ($(OSNAME), CYGWIN_NT) +ifeq ($(C_COMPILER), PGI) +# COMMONOBJS += __builtin_stinit.$(SUFFIX) +endif +endif + +ifdef USE_CUDA +COMMONOBJS += cuda_init.$(SUFFIX) +endif + +ifdef FUNCTION_PROFILE +COMMONOBJS += profile.$(SUFFIX) +endif + +LIBOTHERS = libothers.$(LIBSUFFIX) + +ifeq ($(CORE), PPC440) +MEMORY = memory_qalloc.c +endif + +ifndef MEMORY +MEMORY = memory.c +endif + +ifeq ($(USE_OPENMP), 1) +BLAS_SERVER = blas_server_omp.c +else +ifeq ($(OSNAME), WINNT) +BLAS_SERVER = blas_server_win32.c +endif +ifeq ($(OSNAME), CYGWIN_NT) +BLAS_SERVER = blas_server_win32.c +endif +ifeq ($(OSNAME), Interix) +BLAS_SERVER = blas_server_win32.c +endif +endif + +ifndef BLAS_SERVER +BLAS_SERVER = blas_server.c +endif + +ifdef DYNAMIC_ARCH +HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) +else +HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) +endif + +xerbla.$(SUFFIX) : xerbla.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dynamic.$(SUFFIX) : dynamic.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dynamic.$(PSUFFIX) : dynamic.c + $(CC) $(PFLAGS) -c $< -o $(@F) + +parameter.$(SUFFIX) : parameter.c ../../param.h + $(CC) $(CFLAGS) -c $< -o $(@F) + +init.$(SUFFIX) : init.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +profile.$(SUFFIX) : profile.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +memory.$(SUFFIX) : $(MEMORY) ../../common.h ../../param.h + $(CC) $(CFLAGS) -c $< -o $(@F) + +blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../../param.h + $(CC) $(CFLAGS) -c $< -o $(@F) + +blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h + $(CC) $(CFLAGS) -c $< -o $(@F) + +cuda_init.$(SUFFIX) : cuda_init.c + $(CUCC) $(COMMON_OPT) -I$(TOPDIR) $(CUFLAGS) -DCNAME=$(*F) -c $< -o $(@F) + +c_abs.$(SUFFIX) : abs.c + $(CC) $(CFLAGS) -c -UDOUBLE $< -o $(@F) + +z_abs.$(SUFFIX) : abs.c + $(CC) $(CFLAGS) -c -DDOUBLE $< -o $(@F) + +x_abs.$(SUFFIX) : abs.c + $(CC) $(CFLAGS) -c -DXDOUBLE $< -o $(@F) + +slamch.$(SUFFIX) : lamch.c + $(CC) $(CFLAGS) -c -UDOUBLE $< -o $(@F) + +dlamch.$(SUFFIX) : lamch.c + $(CC) $(CFLAGS) -c -DDOUBLE $< -o $(@F) + +qlamch.$(SUFFIX) : lamch.c + $(CC) $(CFLAGS) -c -DXDOUBLE $< -o $(@F) + +slamc3.$(SUFFIX) : lamc3.c + $(CC) $(CFLAGS) -c -UDOUBLE $< -o $(@F) + +dlamc3.$(SUFFIX) : lamc3.c + $(CC) $(CFLAGS) -c -DDOUBLE $< -o $(@F) + +qlamc3.$(SUFFIX) : lamc3.c + $(CC) $(CFLAGS) -c -DXDOUBLE $< -o $(@F) + +divtable.$(SUFFIX) : divtable.c + $(CC) $(CFLAGS) -c -UDOUBLE $< -o $(@F) + +__builtin_stinit.$(SUFFIX) : $(ARCH)/builtin_stinit.S + $(CC) $(CFLAGS) -c -UDOUBLE $< -o $(@F) + +addx.$(SUFFIX) : $(ARCH)/addx.c + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $(@F) + +mulx.$(SUFFIX) : $(ARCH)/mulx.c + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $(@F) + +xerbla.$(PSUFFIX) : xerbla.c + $(CC) $(PFLAGS) -c $< -o $(@F) + +parameter.$(PSUFFIX) : parameter.c ../../param.h + $(CC) $(PFLAGS) -c $< -o $(@F) + +init.$(PSUFFIX) : init.c + $(CC) $(PFLAGS) -c $< -o $(@F) + +profile.$(PSUFFIX) : profile.c + $(CC) $(PFLAGS) -c $< -o $(@F) + +memory.$(PSUFFIX) : $(MEMORY) ../../common.h ../../param.h + $(CC) $(PFLAGS) -c $< -o $(@F) + +blas_server.$(PSUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h + $(CC) $(PFLAGS) -c $< -o $(@F) + +blasL1thread.$(PSUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h + $(CC) $(PFLAGS) -c $< -o $(@F) + +cuda_init.$(PSUFFIX) : cuda_init.c + $(CUCC) $(COMMON_OPT) -I$(TOPDIR) $(CUFLAGS) -DCNAME=$(*F) -c $< -o $(@F) + +c_abs.$(PSUFFIX) : abs.c + $(CC) $(PFLAGS) -c -UDOUBLE $< -o $(@F) + +z_abs.$(PSUFFIX) : abs.c + $(CC) $(PFLAGS) -c -DDOUBLE $< -o $(@F) + +x_abs.$(PSUFFIX) : abs.c + $(CC) $(PFLAGS) -c -DXDOUBLE $< -o $(@F) + +slamch.$(PUFFIX) : lamch.c + $(CC) $(PFLAGS) -c -UDOUBLE $< -o $(@F) + +dlamch.$(PUFFIX) : lamch.c + $(CC) $(PFLAGS) -c -DDOUBLE $< -o $(@F) + +qlamch.$(PUFFIX) : lamch.c + $(CC) $(PFLAGS) -c -DXDOUBLE $< -o $(@F) + +slamc3.$(PUFFIX) : lamc3.c + $(CC) $(PFLAGS) -c -UDOUBLE $< -o $(@F) + +dlamc3.$(PUFFIX) : lamc3.c + $(CC) $(PFLAGS) -c -DDOUBLE $< -o $(@F) + +qlamc3.$(PUFFIX) : lamc3.c + $(CC) $(PFLAGS) -c -DXDOUBLE $< -o $(@F) + +divtable.$(PSUFFIX) : divtable.c + $(CC) $(PFLAGS) -c -UDOUBLE $< -o $(@F) + +__builtin_stinit.$(PPSUFFIX) : $(ARCH)/builtin_stinit.S + $(CC) $(PFLAGS) -c -UDOUBLE $< -o $(@F) + +addx.$(PSUFFIX) : $(ARCH)/addx.c + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $(@F) + +mulx.$(PSUFFIX) : $(ARCH)/mulx.c + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $(@F) + +info.$(SUFFIX) : info.c info.h ../../common.h ../../param.h + $(CC) $(CFLAGS) -c $< -o $(@F) + + +hpl : CFLAGS += -DHPL +hpl_p : CFLAGS += -DHPL + +include $(TOPDIR)/Makefile.tail diff --git a/driver/others/abs.c b/driver/others/abs.c new file mode 100644 index 0000000..e3ce161 --- /dev/null +++ b/driver/others/abs.c @@ -0,0 +1,71 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +double fabs(double); +double sqrt(double); + +#ifdef NEED_F2CCONV +double +#else +FLOAT +#endif + CNAME(FLOAT *z){ + + FLOAT real = z[0]; + FLOAT imag = z[1]; + double temp; + + real = fabs(real); + imag = fabs(imag); + +if(imag > real){ + temp = real; + real = imag; + imag = temp; +} + + if (imag == 0.) return real; + + temp = imag/real; + temp = real * sqrt(1.0 + temp*temp); + + return temp; + +} diff --git a/driver/others/blas_l1_thread.c b/driver/others/blas_l1_thread.c new file mode 100644 index 0000000..851135b --- /dev/null +++ b/driver/others/blas_l1_thread.c @@ -0,0 +1,112 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, + void *a, BLASLONG lda, + void *b, BLASLONG ldb, + void *c, BLASLONG ldc, int (*function)(), int nthreads){ + + blas_queue_t queue[MAX_CPU_NUMBER]; + blas_arg_t args [MAX_CPU_NUMBER]; + + BLASLONG i, width, astride, bstride; + int num_cpu, calc_type; + + calc_type = (mode & BLAS_PREC) + ((mode & BLAS_COMPLEX) != 0) + 2; + + mode |= BLAS_LEGACY; + + for (i = 0; i < nthreads; i++) blas_queue_init(&queue[i]); + + num_cpu = 0; + i = m; + + while (i > 0){ + + /* Adjust Parameters */ + width = blas_quickdivide(i + nthreads - num_cpu - 1, + nthreads - num_cpu); + + i -= width; + if (i < 0) width = width + i; + + astride = width * lda; + + if (!(mode & BLAS_TRANSB_T)) { + bstride = width * ldb; + } else { + bstride = width; + } + + astride <<= calc_type; + bstride <<= calc_type; + + args[num_cpu].m = width; + args[num_cpu].n = n; + args[num_cpu].k = k; + args[num_cpu].a = (void *)a; + args[num_cpu].b = (void *)b; + args[num_cpu].c = (void *)c; + args[num_cpu].lda = lda; + args[num_cpu].ldb = ldb; + args[num_cpu].ldc = ldc; + args[num_cpu].alpha = alpha; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = function; + queue[num_cpu].args = &args[num_cpu]; + queue[num_cpu].next = &queue[num_cpu + 1]; + + a = (void *)((BLASULONG)a + astride); + b = (void *)((BLASULONG)b + bstride); + + num_cpu ++; + } + + if (num_cpu) { + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + return 0; +} diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c new file mode 100644 index 0000000..62aefe9 --- /dev/null +++ b/driver/others/blas_server.c @@ -0,0 +1,848 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" +#ifdef OS_LINUX +#include +#include +#endif + +#ifdef SMP_SERVER + +#undef MONITOR +#undef TIMING +#undef TIMING_DEBUG +#undef NEED_STACKATTR + +#define ATTRIBUTE_SIZE 128 + +/* This is a thread server model implementation. The threads are */ +/* spawned at first access to blas library, and still remains until */ +/* destruction routine is called. The number of threads are */ +/* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */ +/* jobs is queued. */ + +/* We need this grobal for cheking if initialization is finished. */ +int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0; + +/* Local Variables */ +#if defined(USE_PTHREAD_LOCK) +static pthread_mutex_t server_lock = PTHREAD_MUTEX_INITIALIZER; +#elif defined(USE_PTHREAD_SPINLOCK) +static pthread_spinlock_t server_lock = 0; +#else +static unsigned long server_lock = 0; +#endif + +#define THREAD_STATUS_SLEEP 2 +#define THREAD_STATUS_WAKEUP 4 + +static pthread_t blas_threads [MAX_CPU_NUMBER]; + +typedef struct { + blas_queue_t * volatile queue __attribute__((aligned(ATTRIBUTE_SIZE))); + +#if defined(OS_LINUX) && !defined(NO_AFFINITY) + int node; +#endif + + volatile long status; + + pthread_mutex_t lock; + pthread_cond_t wakeup; + +} thread_status_t; + +static thread_status_t thread_status[MAX_CPU_NUMBER] __attribute__((aligned(ATTRIBUTE_SIZE))); + +#ifndef THREAD_TIMEOUT +#define THREAD_TIMEOUT 28 +#endif + +static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT)); + +#ifdef MONITOR + +/* Monitor is a function to see thread's status for every seconds. */ +/* Usually it turns off and it's for debugging. */ + +static pthread_t monitor_thread; +static int main_status[MAX_CPU_NUMBER]; +#define MAIN_ENTER 0x01 +#define MAIN_EXIT 0x02 +#define MAIN_TRYLOCK 0x03 +#define MAIN_LOCKSUCCESS 0x04 +#define MAIN_QUEUING 0x05 +#define MAIN_RECEIVING 0x06 +#define MAIN_RUNNING1 0x07 +#define MAIN_RUNNING2 0x08 +#define MAIN_RUNNING3 0x09 +#define MAIN_WAITING 0x0a +#define MAIN_SLEEPING 0x0b +#define MAIN_FINISH 0x0c +#define MAIN_DONE 0x0d +#endif + +#define BLAS_QUEUE_FINISHED 3 +#define BLAS_QUEUE_RUNNING 4 + +#ifdef TIMING +BLASLONG exit_time[MAX_CPU_NUMBER]; +#endif + +static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ + + if (!(mode & BLAS_COMPLEX)){ +#ifdef EXPRECISION + if (mode & BLAS_XDOUBLE){ + /* REAL / Extended Double */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, + xdouble *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((xdouble *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else +#endif + if (mode & BLAS_DOUBLE){ + /* REAL / Double */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, double *, BLASLONG, + double *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((double *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else { + /* REAL / Single */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((float *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } + } else { +#ifdef EXPRECISION + if (mode & BLAS_XDOUBLE){ + /* COMPLEX / Extended Double */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, + xdouble *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((xdouble *)args -> alpha)[0], + ((xdouble *)args -> alpha)[1], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else +#endif + if (mode & BLAS_DOUBLE){ + /* COMPLEX / Double */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, + double *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((double *)args -> alpha)[0], + ((double *)args -> alpha)[1], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else { + /* COMPLEX / Single */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((float *)args -> alpha)[0], + ((float *)args -> alpha)[1], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } + } +} + +#if defined(OS_LINUX) && !defined(NO_AFFINITY) +int gotoblas_set_affinity(int); +int gotoblas_set_affinity2(int); +int get_node(void); +#endif + +static int increased_threads = 0; + +static int blas_thread_server(void *arg){ + + /* Thread identifier */ + BLASLONG cpu = (BLASLONG)arg; + unsigned int last_tick; + void *buffer, *sa, *sb; + blas_queue_t *queue; +#ifdef TIMING_DEBUG + unsigned long start, stop; +#endif + +#if defined(OS_LINUX) && !defined(NO_AFFINITY) + if (!increased_threads) + thread_status[cpu].node = gotoblas_set_affinity(cpu + 1); + else + thread_status[cpu].node = gotoblas_set_affinity(-1); +#endif + +#ifdef MONITOR + main_status[cpu] = MAIN_ENTER; +#endif + + buffer = blas_memory_alloc(2); + +#ifdef SMP_DEBUG + fprintf(STDERR, "Server[%2ld] Thread has just been spawned!\n", cpu); +#endif + + while (1){ + +#ifdef MONITOR + main_status[cpu] = MAIN_QUEUING; +#endif + +#ifdef TIMING + exit_time[cpu] = rpcc(); +#endif + + last_tick = (unsigned int)rpcc(); + + while (!thread_status[cpu].queue) { + + YIELDING; + + if ((unsigned int)rpcc() - last_tick > thread_timeout) { + + pthread_mutex_lock (&thread_status[cpu].lock); + + if (!thread_status[cpu].queue) { + thread_status[cpu].status = THREAD_STATUS_SLEEP; + while (thread_status[cpu].status == THREAD_STATUS_SLEEP) { + +#ifdef MONITOR + main_status[cpu] = MAIN_SLEEPING; +#endif + + pthread_cond_wait(&thread_status[cpu].wakeup, &thread_status[cpu].lock); + } + } + + pthread_mutex_unlock(&thread_status[cpu].lock); + + last_tick = (unsigned int)rpcc(); + } + + } + + queue = thread_status[cpu].queue; + + if ((long)queue == -1) break; + +#ifdef MONITOR + main_status[cpu] = MAIN_RECEIVING; +#endif + +#ifdef TIMING_DEBUG + start = rpcc(); +#endif + + if (queue) { + int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; + + thread_status[cpu].queue = (blas_queue_t *)1; + + sa = queue -> sa; + sb = queue -> sb; + +#ifdef SMP_DEBUG + if (queue -> args) { + fprintf(STDERR, "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", + cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k); + } +#endif + +#ifdef CONSISTENT_FPCSR + __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); + __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); +#endif + +#ifdef MONITOR + main_status[cpu] = MAIN_RUNNING1; +#endif + + if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); + + if (sb == NULL) { + if (!(queue -> mode & BLAS_COMPLEX)){ +#ifdef EXPRECISION + if (queue -> mode & BLAS_XDOUBLE){ + sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else +#endif + if (queue -> mode & BLAS_DOUBLE){ + sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + + } else { + sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } + } else { +#ifdef EXPRECISION + if (queue -> mode & BLAS_XDOUBLE){ + sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else +#endif + if (queue -> mode & BLAS_DOUBLE){ + sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else { + sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } + } + } + +#ifdef MONITOR + main_status[cpu] = MAIN_RUNNING2; +#endif + + if (queue -> mode & BLAS_LEGACY) { + legacy_exec(routine, queue -> mode, queue -> args, sb); + } else + if (queue -> mode & BLAS_PTHREAD) { + void (*pthreadcompat)(void *) = queue -> routine; + (pthreadcompat)(queue -> args); + } else + (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); + +#ifdef SMP_DEBUG + fprintf(STDERR, "Server[%2ld] Calculation finished!\n", cpu); +#endif + +#ifdef MONITOR + main_status[cpu] = MAIN_FINISH; +#endif + + thread_status[cpu].queue = (blas_queue_t * volatile) ((long)thread_status[cpu].queue & 0); /* Need a trick */ + WMB; + + } + +#ifdef MONITOR + main_status[cpu] = MAIN_DONE; +#endif + +#ifdef TIMING_DEBUG + stop = rpcc(); + + fprintf(STDERR, "Thread[%ld] : %16lu %16lu (%8lu cycles)\n", cpu + 1, + start, stop, + stop - start); +#endif + + } + + /* Shutdown procedure */ + +#ifdef SMP_DEBUG + fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu); +#endif + + blas_memory_free(buffer); + + pthread_exit(NULL); + + return 0; +} + +#ifdef MONITOR + +static BLASLONG num_suspend = 0; + +static int blas_monitor(void *arg){ + int i; + + while(1){ + for (i = 0; i < blas_num_threads - 1; i++){ + switch (main_status[i]) { + case MAIN_ENTER : + fprintf(STDERR, "THREAD[%2d] : Entering.\n", i); + break; + case MAIN_EXIT : + fprintf(STDERR, "THREAD[%2d] : Exiting.\n", i); + break; + case MAIN_TRYLOCK : + fprintf(STDERR, "THREAD[%2d] : Trying lock operation.\n", i); + break; + case MAIN_QUEUING : + fprintf(STDERR, "THREAD[%2d] : Queuing.\n", i); + break; + case MAIN_RECEIVING : + fprintf(STDERR, "THREAD[%2d] : Receiving.\n", i); + break; + case MAIN_RUNNING1 : + fprintf(STDERR, "THREAD[%2d] : Running1.\n", i); + break; + case MAIN_RUNNING2 : + fprintf(STDERR, "THREAD[%2d] : Running2.\n", i); + break; + case MAIN_RUNNING3 : + fprintf(STDERR, "THREAD[%2d] : Running3.\n", i); + break; + case MAIN_WAITING : + fprintf(STDERR, "THREAD[%2d] : Waiting.\n", i); + break; + case MAIN_SLEEPING : + fprintf(STDERR, "THREAD[%2d] : Sleeping.\n", i); + break; + case MAIN_FINISH : + fprintf(STDERR, "THREAD[%2d] : Finishing.\n", i); + break; + case MAIN_DONE : + fprintf(STDERR, "THREAD[%2d] : Job is done.\n", i); + break; + } + + fprintf(stderr, "Total number of suspended ... %ld\n", num_suspend); + } + sleep(1); + } + + return 0; +} +#endif + +/* Initializing routine */ +int blas_thread_init(void){ + BLASLONG i; +#ifdef NEED_STACKATTR + pthread_attr_t attr; +#endif + + if (blas_server_avail) return 0; + +#ifdef NEED_STACKATTR + pthread_attr_init(&attr); + pthread_attr_setguardsize(&attr, 0x1000U); + pthread_attr_setstacksize( &attr, 0x1000U); +#endif + + LOCK_COMMAND(&server_lock); + + if (!blas_server_avail){ + + char *p; + + p = getenv("GOTO_THREAD_TIMEOUT"); + + if (p) { + thread_timeout = atoi(p); + if (thread_timeout < 4) thread_timeout = 4; + if (thread_timeout > 30) thread_timeout = 30; + thread_timeout = (1 << thread_timeout); + } + + for(i = 0; i < blas_num_threads - 1; i++){ + + thread_status[i].queue = (blas_queue_t *)NULL; + thread_status[i].status = THREAD_STATUS_WAKEUP; + + pthread_mutex_init(&thread_status[i].lock, NULL); + pthread_cond_init (&thread_status[i].wakeup, NULL); + +#ifdef NEED_STACKATTR + pthread_create(&blas_threads[i], &attr, + (void *)&blas_thread_server, (void *)i); +#else + pthread_create(&blas_threads[i], NULL, + (void *)&blas_thread_server, (void *)i); +#endif + } + +#ifdef MONITOR + pthread_create(&monitor_thread, NULL, + (void *)&blas_monitor, (void *)NULL); +#endif + + blas_server_avail = 1; + } + + UNLOCK_COMMAND(&server_lock); + + return 0; +} + +/* + User can call one of two routines. + + exec_blas_async ... immediately returns after jobs are queued. + + exec_blas ... returns after jobs are finished. +*/ + +static BLASULONG exec_queue_lock = 0; + +int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ + + BLASLONG i = 0; + blas_queue_t *current = queue; +#if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST) + int node = get_node(); + int nodes = get_num_nodes(); +#endif + +#ifdef SMP_DEBUG + int exec_count = 0; + fprintf(STDERR, "Exec_blas_async is called. Position = %d\n", pos); +#endif + + blas_lock(&exec_queue_lock); + + while (queue) { + queue -> position = pos; + +#ifdef CONSISTENT_FPCSR + __asm__ __volatile__ ("fnstcw %0" : "=m" (queue -> x87_mode)); + __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue -> sse_mode)); +#endif + +#if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST) + + /* Node Mapping Mode */ + + if (queue -> mode & BLAS_NODE) { + + do { + while((thread_status[i].node != node || thread_status[i].queue) && (i < blas_num_threads - 1)) i ++; + + if (i < blas_num_threads - 1) break; + + i ++; + if (i >= blas_num_threads - 1) { + i = 0; + node ++; + if (node >= nodes) node = 0; + } + + } while (1); + + } else { + while(thread_status[i].queue) { + i ++; + if (i >= blas_num_threads - 1) i = 0; + } + } +#else + while(thread_status[i].queue) { + i ++; + if (i >= blas_num_threads - 1) i = 0; + } +#endif + + queue -> assigned = i; + WMB; + thread_status[i].queue = queue; + WMB; + + queue = queue -> next; + pos ++; +#ifdef SMP_DEBUG + exec_count ++; +#endif + + } + + blas_unlock(&exec_queue_lock); + +#ifdef SMP_DEBUG + fprintf(STDERR, "Done(Number of threads = %2ld).\n", exec_count); +#endif + + while (current) { + + pos = current -> assigned; + + if ((BLASULONG)thread_status[pos].queue > 1) { + + if (thread_status[pos].status == THREAD_STATUS_SLEEP) { + + pthread_mutex_lock (&thread_status[pos].lock); + +#ifdef MONITOR + num_suspend ++; +#endif + + if (thread_status[pos].status == THREAD_STATUS_SLEEP) { + thread_status[pos].status = THREAD_STATUS_WAKEUP; + pthread_cond_signal(&thread_status[pos].wakeup); + } + pthread_mutex_unlock(&thread_status[pos].lock); + } + } + + current = current -> next; + } + + return 0; +} + +int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ + + while ((num > 0) && queue) { + + while(thread_status[queue -> assigned].queue) { + YIELDING; + }; + + queue = queue -> next; + num --; + } + +#ifdef SMP_DEBUG + fprintf(STDERR, "Done.\n\n"); +#endif + + return 0; +} + +/* Execute Threads */ +int exec_blas(BLASLONG num, blas_queue_t *queue){ + + int (*routine)(blas_arg_t *, void *, void *, double *, double *, BLASLONG); + +#ifdef TIMING_DEBUG + BLASULONG start, stop; +#endif + + if ((num <= 0) || (queue == NULL)) return 0; + +#ifdef SMP_DEBUG + fprintf(STDERR, "Exec_blas is called. Number of executing threads : %ld\n", num); +#endif + +#ifdef __ELF__ + if (omp_in_parallel && (num > 1)) { + if (omp_in_parallel() > 0) { + fprintf(stderr, + "GotoBLAS Warning : Detect OpenMP Loop and this application may hang. " + "Please rebuild the library with USE_OPENMP=1 option.\n"); + } + } +#endif + + if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next); + +#ifdef TIMING_DEBUG + start = rpcc(); + + fprintf(STDERR, "\n"); +#endif + + routine = queue -> routine; + + if (queue -> mode & BLAS_LEGACY) { + legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); + } else + if (queue -> mode & BLAS_PTHREAD) { + void (*pthreadcompat)(void *) = queue -> routine; + (pthreadcompat)(queue -> args); + } else + (routine)(queue -> args, queue -> range_m, queue -> range_n, + queue -> sa, queue -> sb, 0); + +#ifdef TIMING_DEBUG + stop = rpcc(); +#endif + + if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next); + +#ifdef TIMING_DEBUG + fprintf(STDERR, "Thread[0] : %16lu %16lu (%8lu cycles)\n", + start, stop, + stop - start); +#endif + + return 0; +} + +void goto_set_num_threads(int num_threads) { + + long i; + + if (num_threads < 1) num_threads = blas_num_threads; + + if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; + + if (num_threads > blas_num_threads) { + + LOCK_COMMAND(&server_lock); + + increased_threads = 1; + + for(i = blas_num_threads - 1; i < num_threads - 1; i++){ + + thread_status[i].queue = (blas_queue_t *)NULL; + thread_status[i].status = THREAD_STATUS_WAKEUP; + + pthread_mutex_init(&thread_status[i].lock, NULL); + pthread_cond_init (&thread_status[i].wakeup, NULL); + +#ifdef NEED_STACKATTR + pthread_create(&blas_threads[i], &attr, + (void *)&blas_thread_server, (void *)i); +#else + pthread_create(&blas_threads[i], NULL, + (void *)&blas_thread_server, (void *)i); +#endif + } + + blas_num_threads = num_threads; + + UNLOCK_COMMAND(&server_lock); + } + + blas_cpu_number = num_threads; + +} + +/* Compatible function with pthread_create / join */ + +int gotoblas_pthread(int numthreads, void *function, void *args, int stride) { + + blas_queue_t queue[MAX_CPU_NUMBER]; + int i; + + if (numthreads <= 0) return 0; + +#ifdef SMP + if (blas_cpu_number == 0) blas_get_cpu_number(); +#ifdef SMP_SERVER + if (blas_server_avail == 0) blas_thread_init(); +#endif +#endif + + for (i = 0; i < numthreads; i ++) { + + queue[i].mode = BLAS_PTHREAD; + queue[i].routine = function; + queue[i].args = args; + queue[i].range_m = NULL; + queue[i].range_n = NULL; + queue[i].sa = args; + queue[i].sb = args; + queue[i].next = &queue[i + 1]; + + args += stride; + } + + queue[numthreads - 1].next = NULL; + + exec_blas(numthreads, queue); + + return 0; +} + +/* Shutdown procedure, but user don't have to call this routine. The */ +/* kernel automatically kill threads. */ + +int BLASFUNC(blas_thread_shutdown)(void){ + + int i; + + if (!blas_server_avail) return 0; + + LOCK_COMMAND(&server_lock); + + for (i = 0; i < blas_num_threads - 1; i++) { + + blas_lock(&exec_queue_lock); + + thread_status[i].queue = (blas_queue_t *)-1; + + blas_unlock(&exec_queue_lock); + + pthread_mutex_lock (&thread_status[i].lock); + + thread_status[i].status = THREAD_STATUS_WAKEUP; + + pthread_cond_signal (&thread_status[i].wakeup); + + pthread_mutex_unlock(&thread_status[i].lock); + + } + + for(i = 0; i < blas_num_threads - 1; i++){ + pthread_join(blas_threads[i], NULL); + } + + for(i = 0; i < blas_num_threads - 1; i++){ + pthread_mutex_destroy(&thread_status[i].lock); + pthread_cond_destroy (&thread_status[i].wakeup); + } + +#ifdef NEED_STACKATTR + pthread_attr_destory(&attr); +#endif + + blas_server_avail = 0; + + UNLOCK_COMMAND(&server_lock); + + return 0; +} + +#endif + diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c new file mode 100644 index 0000000..3e70d85 --- /dev/null +++ b/driver/others/blas_server_omp.c @@ -0,0 +1,249 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include +#include "common.h" + +#ifndef USE_OPENMP + +#include "blas_server.c" + +#else + +int blas_server_avail = 0; + +int blas_thread_init(void){ + + blas_get_cpu_number(); + + blas_server_avail = 1; + + return 0; +} + +int BLASFUNC(blas_thread_shutdown)(void){ + + blas_server_avail = 0; + + return 0; +} + +static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ + + if (!(mode & BLAS_COMPLEX)){ +#ifdef EXPRECISION + if (mode & BLAS_XDOUBLE){ + /* REAL / Extended Double */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, + xdouble *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((xdouble *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else +#endif + if (mode & BLAS_DOUBLE){ + /* REAL / Double */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, double *, BLASLONG, + double *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((double *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else { + /* REAL / Single */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((float *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } + } else { +#ifdef EXPRECISION + if (mode & BLAS_XDOUBLE){ + /* COMPLEX / Extended Double */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, + xdouble *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((xdouble *)args -> alpha)[0], + ((xdouble *)args -> alpha)[1], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else +#endif + if (mode & BLAS_DOUBLE){ + /* COMPLEX / Double */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, + double *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((double *)args -> alpha)[0], + ((double *)args -> alpha)[1], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else { + /* COMPLEX / Single */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((float *)args -> alpha)[0], + ((float *)args -> alpha)[1], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } + } +} + +static void exec_threads(blas_queue_t *queue){ + + void *buffer, *sa, *sb; + + buffer = NULL; + sa = queue -> sa; + sb = queue -> sb; + +#ifdef CONSISTENT_FPCSR + __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); + __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); +#endif + + if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { + + buffer = blas_memory_alloc(2); + + if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); + + if (sb == NULL) { + if (!(queue -> mode & BLAS_COMPLEX)){ +#ifdef EXPRECISION + if (queue -> mode & BLAS_XDOUBLE){ + sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else +#endif + if (queue -> mode & BLAS_DOUBLE){ + sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + + } else { + sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } + } else { +#ifdef EXPRECISION + if (queue -> mode & BLAS_XDOUBLE){ + sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else +#endif + if (queue -> mode & BLAS_DOUBLE){ + sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else { + sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } + } + } + } + + if (queue -> mode & BLAS_LEGACY) { + legacy_exec(queue -> routine, queue -> mode, queue -> args, sb); + } else + if (queue -> mode & BLAS_PTHREAD) { + void (*pthreadcompat)(void *) = queue -> routine; + (pthreadcompat)(queue -> args); + + } else { + int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; + + (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); + + } + + if (buffer != NULL) blas_memory_free(buffer); + +} + +int exec_blas(BLASLONG num, blas_queue_t *queue){ + + BLASLONG i; + + if ((num <= 0) || (queue == NULL)) return 0; + +#ifdef CONSISTENT_FPCSR + for (i = 0; i < num; i ++) { + __asm__ __volatile__ ("fnstcw %0" : "=m" (queue[i].x87_mode)); + __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue[i].sse_mode)); + } +#endif + +#pragma omp parallel for schedule(static) + for (i = 0; i < num; i ++) { + +#ifndef USE_SIMPLE_THREADED_LEVEL3 + queue[i].position = i; +#endif + + exec_threads(&queue[i]); + } + + return 0; +} + +#endif diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c new file mode 100644 index 0000000..6708509 --- /dev/null +++ b/driver/others/blas_server_win32.c @@ -0,0 +1,450 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +/* This is a thread implementation for Win32 lazy implementation */ + +/* Thread server common infomation */ +typedef struct{ + CRITICAL_SECTION lock; + HANDLE filled; + HANDLE killed; + + blas_queue_t *queue; /* Parameter Pointer */ + int shutdown; /* server shutdown flag */ + +} blas_pool_t; + +/* We need this grobal for cheking if initialization is finished. */ +int blas_server_avail = 0; + +/* Local Variables */ +static BLASULONG server_lock = 0; + +static blas_pool_t pool; +static HANDLE blas_threads [MAX_CPU_NUMBER]; +static DWORD blas_threads_id[MAX_CPU_NUMBER]; + +static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ + + if (!(mode & BLAS_COMPLEX)){ +#ifdef EXPRECISION + if (mode & BLAS_XDOUBLE){ + /* REAL / Extended Double */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, + xdouble *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((xdouble *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else +#endif + if (mode & BLAS_DOUBLE){ + /* REAL / Double */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, double *, BLASLONG, + double *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((double *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else { + /* REAL / Single */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((float *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } + } else { +#ifdef EXPRECISION + if (mode & BLAS_XDOUBLE){ + /* COMPLEX / Extended Double */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, + xdouble *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((xdouble *)args -> alpha)[0], + ((xdouble *)args -> alpha)[1], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else +#endif + if (mode & BLAS_DOUBLE){ + /* COMPLEX / Double */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, + double *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((double *)args -> alpha)[0], + ((double *)args -> alpha)[1], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else { + /* COMPLEX / Single */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((float *)args -> alpha)[0], + ((float *)args -> alpha)[1], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } + } +} + +/* This is a main routine of threads. Each thread waits until job is */ +/* queued. */ + +static DWORD WINAPI blas_thread_server(void *arg){ + + /* Thread identifier */ +#ifdef SMP_DEBUG + BLASLONG cpu = (BLASLONG)arg; +#endif + + void *buffer, *sa, *sb; + blas_queue_t *queue; + DWORD action; + HANDLE handles[] = {pool.filled, pool.killed}; + + /* Each server needs each buffer */ + buffer = blas_memory_alloc(2); + +#ifdef SMP_DEBUG + fprintf(STDERR, "Server[%2ld] Thread is started!\n", cpu); +#endif + + while (1){ + + /* Waiting for Queue */ + +#ifdef SMP_DEBUG + fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu); +#endif + + do { + action = WaitForMultipleObjects(2, handles, FALSE, INFINITE); + } while ((action != WAIT_OBJECT_0) && (action == WAIT_OBJECT_0 + 1)); + + if (action == WAIT_OBJECT_0 + 1) break; + +#ifdef SMP_DEBUG + fprintf(STDERR, "Server[%2ld] Got it.\n", cpu); +#endif + + EnterCriticalSection(&pool.lock); + + queue = pool.queue; + if (queue) pool.queue = queue->next; + + LeaveCriticalSection(&pool.lock); + + if (queue) { + int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; + + if (pool.queue) SetEvent(pool.filled); + + sa = queue -> sa; + sb = queue -> sb; + +#ifdef CONSISTENT_FPCSR + __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); + __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); +#endif + +#ifdef SMP_DEBUG + fprintf(STDERR, "Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", + cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k); +#endif + + // fprintf(stderr, "queue start[%ld]!!!\n", cpu); + +#ifdef MONITOR + main_status[cpu] = MAIN_RUNNING1; +#endif + + if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); + + if (sb == NULL) { + if (!(queue -> mode & BLAS_COMPLEX)){ +#ifdef EXPRECISION + if (queue -> mode & BLAS_XDOUBLE){ + sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else +#endif + if (queue -> mode & BLAS_DOUBLE){ + sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + + } else { + sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } + } else { +#ifdef EXPRECISION + if (queue -> mode & BLAS_XDOUBLE){ + sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else +#endif + if (queue -> mode & BLAS_DOUBLE){ + sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else { + sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } + } + } + +#ifdef MONITOR + main_status[cpu] = MAIN_RUNNING2; +#endif + + if (!(queue -> mode & BLAS_LEGACY)) { + + (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); + } else { + legacy_exec(routine, queue -> mode, queue -> args, sb); + } + } + +#ifdef SMP_DEBUG + fprintf(STDERR, "Server[%2ld] Finished!\n", cpu); +#endif + + EnterCriticalSection(&queue->lock); + + queue -> status = BLAS_STATUS_FINISHED; + + LeaveCriticalSection(&queue->lock); + + SetEvent(queue->finish); + } + + /* Shutdown procedure */ + +#ifdef SMP_DEBUG + fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu); +#endif + + blas_memory_free(buffer); + + return 0; + } + +/* Initializing routine */ +int blas_thread_init(void){ + BLASLONG i; + + if (blas_server_avail || (blas_cpu_number <= 1)) return 0; + + LOCK_COMMAND(&server_lock); + +#ifdef SMP_DEBUG + fprintf(STDERR, "Initializing Thread(Num. threads = %d)\n", + blas_cpu_number); +#endif + + if (!blas_server_avail){ + + InitializeCriticalSection(&pool.lock); + pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL); + pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL); + + pool.shutdown = 0; + pool.queue = NULL; + + for(i = 0; i < blas_cpu_number - 1; i++){ + blas_threads[i] = CreateThread(NULL, 0, + blas_thread_server, (void *)i, + 0, &blas_threads_id[i]); + } + + blas_server_avail = 1; + } + + UNLOCK_COMMAND(&server_lock); + + return 0; +} + +/* + User can call one of two routines. + + exec_blas_async ... immediately returns after jobs are queued. + + exec_blas ... returns after jobs are finished. +*/ + +int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ + + blas_queue_t *current; + + current = queue; + + while (current) { + InitializeCriticalSection(¤t -> lock); + current -> finish = CreateEvent(NULL, FALSE, FALSE, NULL); + current -> position = pos; + +#ifdef CONSISTENT_FPCSR + __asm__ __volatile__ ("fnstcw %0" : "=m" (current -> x87_mode)); + __asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode)); +#endif + + current = current -> next; + pos ++; + } + + EnterCriticalSection(&pool.lock); + + if (pool.queue) { + current = pool.queue; + while (current -> next) current = current -> next; + current -> next = queue; + } else { + pool.queue = queue; + } + + LeaveCriticalSection(&pool.lock); + + SetEvent(pool.filled); + + return 0; +} + +int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ + +#ifdef SMP_DEBUG + fprintf(STDERR, "Synchronization Waiting.\n"); +#endif + + while (num){ +#ifdef SMP_DEBUG + fprintf(STDERR, "Waiting Queue ..\n"); +#endif + + WaitForSingleObject(queue->finish, INFINITE); + + CloseHandle(queue->finish); + DeleteCriticalSection(&queue -> lock); + + queue = queue -> next; + num --; + } + +#ifdef SMP_DEBUG + fprintf(STDERR, "Completely Done.\n\n"); +#endif + + return 0; +} + +/* Execute Threads */ +int exec_blas(BLASLONG num, blas_queue_t *queue){ + +#ifndef ALL_THREADED + int (*routine)(blas_arg_t *, void *, void *, double *, double *, BLASLONG); +#endif + + if ((num <= 0) || (queue == NULL)) return 0; + + if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next); + + routine = queue -> routine; + + if (!(queue -> mode & BLAS_LEGACY)) { + (routine)(queue -> args, queue -> range_m, queue -> range_n, + queue -> sa, queue -> sb, 0); + } else { + legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); + } + + if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next); + + return 0; +} + +/* Shutdown procedure, but user don't have to call this routine. The */ +/* kernel automatically kill threads. */ + +int blas_thread_shutdown_(void){ + + int i; + + if (!blas_server_avail) return 0; + + LOCK_COMMAND(&server_lock); + + if (blas_server_avail){ + + SetEvent(pool.killed); + + for(i = 0; i < blas_cpu_number - 1; i++){ + WaitForSingleObject(blas_threads[i], INFINITE); + } + + blas_server_avail = 0; + } + + UNLOCK_COMMAND(&server_lock); + + return 0; +} diff --git a/driver/others/divtable.c b/driver/others/divtable.c new file mode 100644 index 0000000..7a191db --- /dev/null +++ b/driver/others/divtable.c @@ -0,0 +1,83 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +#ifdef SMP +#ifndef USE64BITINT +unsigned int blas_quick_divide_table[] = { + 0x00000000, 0x00000001, 0x80000001, 0x55555556, + 0x40000001, 0x33333334, 0x2aaaaaab, 0x24924925, + 0x20000001, 0x1c71c71d, 0x1999999a, 0x1745d175, + 0x15555556, 0x13b13b14, 0x12492493, 0x11111112, + 0x10000001, 0x0f0f0f10, 0x0e38e38f, 0x0d79435f, + 0x0ccccccd, 0x0c30c30d, 0x0ba2e8bb, 0x0b21642d, + 0x0aaaaaab, 0x0a3d70a4, 0x09d89d8a, 0x097b425f, + 0x0924924a, 0x08d3dcb1, 0x08888889, 0x08421085, + 0x08000001, 0x07c1f07d, 0x07878788, 0x07507508, + 0x071c71c8, 0x06eb3e46, 0x06bca1b0, 0x06906907, + 0x06666667, 0x063e7064, 0x06186187, 0x05f417d1, + 0x05d1745e, 0x05b05b06, 0x0590b217, 0x0572620b, + 0x05555556, 0x0539782a, 0x051eb852, 0x05050506, + 0x04ec4ec5, 0x04d4873f, 0x04bda130, 0x04a7904b, + 0x04924925, 0x047dc120, 0x0469ee59, 0x0456c798, + 0x04444445, 0x04325c54, 0x04210843, 0x04104105, + 0x04000001, +}; +#else +BLASULONG blas_quick_divide_table[] = { +0x0000000000000000, 0x0000000000000001, 0x8000000000000001, 0x5555555555555557, +0x4000000000000001, 0x3333333333333335, 0x2aaaaaaaaaaaaaac, 0x2492492492492494, +0x2000000000000001, 0x1c71c71c71c71c73, 0x199999999999999b, 0x1745d1745d1745d3, +0x1555555555555557, 0x13b13b13b13b13b3, 0x124924924924924b, 0x1111111111111113, +0x1000000000000001, 0x0f0f0f0f0f0f0f11, 0x0e38e38e38e38e3a, 0x0d79435e50d79437, +0x0cccccccccccccce, 0x0c30c30c30c30c32, 0x0ba2e8ba2e8ba2ea, 0x0b21642c8590b218, +0x0aaaaaaaaaaaaaac, 0x0a3d70a3d70a3d72, 0x09d89d89d89d89da, 0x097b425ed097b427, +0x0924924924924926, 0x08d3dcb08d3dcb0a, 0x088888888888888a, 0x0842108421084212, +0x0800000000000001, 0x07c1f07c1f07c1f2, 0x0787878787878789, 0x0750750750750752, +0x071c71c71c71c71e, 0x06eb3e45306eb3e6, 0x06bca1af286bca1c, 0x0690690690690692, +0x0666666666666668, 0x063e7063e7063e72, 0x061861861861861a, 0x05f417d05f417d07, +0x05d1745d1745d176, 0x05b05b05b05b05b2, 0x0590b21642c8590d, 0x0572620ae4c415cb, +0x0555555555555557, 0x05397829cbc14e60, 0x051eb851eb851eba, 0x0505050505050507, +0x04ec4ec4ec4ec4ee, 0x04d4873ecade304f, 0x04bda12f684bda14, 0x04a7904a7904a792, +0x0492492492492494, 0x047dc11f7047dc13, 0x0469ee58469ee586, 0x0456c797dd49c343, +0x0444444444444446, 0x04325c53ef368eb2, 0x042108421084210a, 0x0410410410410412, +0x0400000000000001, +}; +#endif +#endif diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c new file mode 100644 index 0000000..eef3db9 --- /dev/null +++ b/driver/others/dynamic.c @@ -0,0 +1,219 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +#ifdef ARCH_X86 +#define EXTERN extern +#else +#define EXTERN +#endif + +EXTERN gotoblas_t gotoblas_KATMAI; +EXTERN gotoblas_t gotoblas_COPPERMINE; +EXTERN gotoblas_t gotoblas_NORTHWOOD; +EXTERN gotoblas_t gotoblas_BANIAS; +EXTERN gotoblas_t gotoblas_ATHLON; + +extern gotoblas_t gotoblas_PRESCOTT; +extern gotoblas_t gotoblas_ATOM; +extern gotoblas_t gotoblas_NANO; +extern gotoblas_t gotoblas_CORE2; +extern gotoblas_t gotoblas_PENRYN; +extern gotoblas_t gotoblas_DUNNINGTON; +extern gotoblas_t gotoblas_NEHALEM; +extern gotoblas_t gotoblas_OPTERON; +extern gotoblas_t gotoblas_OPTERON_SSE3; +extern gotoblas_t gotoblas_BARCELONA; + +#define VENDOR_INTEL 1 +#define VENDOR_AMD 2 +#define VENDOR_CENTAUR 3 +#define VENDOR_UNKNOWN 99 + +#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) + +static int get_vendor(void){ + int eax, ebx, ecx, edx; + char vendor[13]; + + cpuid(0, &eax, &ebx, &ecx, &edx); + + *(int *)(&vendor[0]) = ebx; + *(int *)(&vendor[4]) = edx; + *(int *)(&vendor[8]) = ecx; + vendor[12] = (char)0; + + if (!strcmp(vendor, "GenuineIntel")) return VENDOR_INTEL; + if (!strcmp(vendor, "AuthenticAMD")) return VENDOR_AMD; + if (!strcmp(vendor, "CentaurHauls")) return VENDOR_CENTAUR; + + if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; + + return VENDOR_UNKNOWN; +} + +static gotoblas_t *get_coretype(void){ + + int eax, ebx, ecx, edx; + int family, exfamily, model, vendor, exmodel; + + cpuid(1, &eax, &ebx, &ecx, &edx); + + family = BITMASK(eax, 8, 0x0f); + exfamily = BITMASK(eax, 20, 0xff); + model = BITMASK(eax, 4, 0x0f); + exmodel = BITMASK(eax, 16, 0x0f); + + vendor = get_vendor(); + + if (vendor == VENDOR_INTEL){ + switch (family) { + case 0x6: + switch (exmodel) { + case 0: + if (model <= 0x7) return &gotoblas_KATMAI; + if ((model == 0x8) || (model == 0xa) || (model == 0xb)) return &gotoblas_COPPERMINE; + if ((model == 0x9) || (model == 0xd)) return &gotoblas_BANIAS; + if (model == 14) return &gotoblas_BANIAS; + if (model == 15) return &gotoblas_CORE2; + return NULL; + + case 1: + if (model == 6) return &gotoblas_CORE2; + if (model == 7) return &gotoblas_PENRYN; + if (model == 13) return &gotoblas_DUNNINGTON; + if ((model == 10) || (model == 11) || (model == 14) || (model == 15)) return &gotoblas_NEHALEM; + if (model == 12) return &gotoblas_ATOM; + return NULL; + } + case 0xf: + if (model <= 0x2) return &gotoblas_NORTHWOOD; + return &gotoblas_PRESCOTT; + } + } + + if (vendor == VENDOR_AMD){ + if (family <= 0xe) return &gotoblas_ATHLON; + if (family == 0xf){ + if ((exfamily == 0) || (exfamily == 2)) { + if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; + else return &gotoblas_OPTERON; + } else { + return &gotoblas_BARCELONA; + } + } + } + + if (vendor == VENDOR_CENTAUR) { + switch (family) { + case 0x6: + return &gotoblas_NANO; + break; + } + } + + return NULL; +} + +static char *corename[] = { + "Unknown", + "Katmai", + "Coppermine", + "Northwood", + "Prescott", + "Banias", + "Atom", + "Core2", + "Penryn", + "Dunnington", + "Nehalem", + "Athlon", + "Opteron", + "Opteron(SSE3)", + "Barcelona", + "Nano", +}; + +char *gotoblas_corename(void) { + + if (gotoblas == &gotoblas_KATMAI) return corename[ 1]; + if (gotoblas == &gotoblas_COPPERMINE) return corename[ 2]; + if (gotoblas == &gotoblas_NORTHWOOD) return corename[ 3]; + if (gotoblas == &gotoblas_PRESCOTT) return corename[ 4]; + if (gotoblas == &gotoblas_BANIAS) return corename[ 5]; + if (gotoblas == &gotoblas_ATOM) return corename[ 6]; + if (gotoblas == &gotoblas_CORE2) return corename[ 7]; + if (gotoblas == &gotoblas_PENRYN) return corename[ 8]; + if (gotoblas == &gotoblas_DUNNINGTON) return corename[ 9]; + if (gotoblas == &gotoblas_NEHALEM) return corename[10]; + if (gotoblas == &gotoblas_ATHLON) return corename[11]; + if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12]; + if (gotoblas == &gotoblas_OPTERON) return corename[13]; + if (gotoblas == &gotoblas_BARCELONA) return corename[14]; + if (gotoblas == &gotoblas_NANO) return corename[15]; + + return corename[0]; +} + +void gotoblas_dynamic_init(void) { + + if (gotoblas) return; + + gotoblas = get_coretype(); + +#ifdef ARCH_X86 + if (gotoblas == NULL) gotoblas = gotoblas_KATMAI; +#else + if (gotoblas == NULL) gotoblas = gotoblas_PRESCOTT; +#endif + + if (gotoblas && gotoblas -> init) { + gotoblas -> init(); + } else { + fprintf(stderr, "GotoBLAS : Architecture Initialization failed. No initialization function found.\n"); + exit(1); + } + +} + +void gotoblas_dynamic_quit(void) { + + gotoblas = NULL; + +} diff --git a/driver/others/init.c b/driver/others/init.c new file mode 100644 index 0000000..657e8dd --- /dev/null +++ b/driver/others/init.c @@ -0,0 +1,697 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +#if defined(OS_LINUX) && defined(SMP) + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include + +#define MAX_NODES 16 +#define MAX_CPUS 256 + +#define SH_MAGIC 0x510510 + +#define CPUMAP_NAME "/sys/devices/system/node/node%d/cpumap" +#define SHARE_NAME "/sys/devices/system/cpu/cpu%d/cache/index%d/shared_cpu_map" +#define NODE_DIR "/sys/devices/system/node" + +#undef DEBUG + +/* Private variables */ +typedef struct { + unsigned long lock; + unsigned int magic; + unsigned int shmid; + + int num_nodes; + int num_procs; + int final_num_procs; + unsigned long avail; + + unsigned long cpu_info [MAX_CPUS]; + unsigned long node_info [MAX_NODES]; + int cpu_use[MAX_CPUS]; + +} shm_t; + +static cpu_set_t cpu_orig_mask[4]; + +static int cpu_mapping[MAX_CPUS]; +static int node_mapping[MAX_CPUS * 4]; +static int cpu_sub_mapping[MAX_CPUS]; +static int disable_mapping; + +/* Number of cores per nodes */ +static int node_cpu[MAX_NODES]; +static int node_equal = 0; + +static shm_t *common = (void *)-1; +static int shmid, pshmid; +static void *paddr; + +static unsigned long lprocmask, lnodemask; +static int numprocs = 1; +static int numnodes = 1; + +#if 1 +#define READ_CPU(x) ( (x) & 0xff) +#define READ_NODE(x) (((x) >> 8) & 0xff) +#define READ_CORE(x) (((x) >> 16) & 0xff) + +#define WRITE_CPU(x) (x) +#define WRITE_NODE(x) ((x) << 8) +#define WRITE_CORE(x) ((x) << 16) +#else +#define READ_CPU(x) ( (x) & 0xff) +#define READ_CORE(x) (((x) >> 8) & 0xff) +#define READ_NODE(x) (((x) >> 16) & 0xff) + +#define WRITE_CPU(x) (x) +#define WRITE_CORE(x) ((x) << 8) +#define WRITE_NODE(x) ((x) << 16) +#endif + +static inline int popcount(unsigned long number) { + + int count = 0; + + while (number > 0) { + if (number & 1) count ++; + number >>= 1; + } + + return count; +} + +static inline int rcount(unsigned long number) { + + int count = -1; + + while ((number > 0) && ((number & 0)) == 0) { + count ++; + number >>= 1; + } + + return count; +} + +static inline unsigned long get_cpumap(int node) { + + int infile; + unsigned long affinity; + char name[160]; + char *p, *dummy; + + sprintf(name, CPUMAP_NAME, node); + + infile = open(name, O_RDONLY); + + affinity = 0; + + if (infile != -1) { + + read(infile, name, sizeof(name)); + + p = name; + + while ((*p == '0') || (*p == ',')) p++; + + affinity = strtol(p, &dummy, 16); + + close(infile); + } + + return affinity; +} + +static inline unsigned long get_share(int cpu, int level) { + + int infile; + unsigned long affinity; + char name[160]; + char *p; + + sprintf(name, SHARE_NAME, cpu, level); + + infile = open(name, O_RDONLY); + + affinity = (1UL << cpu); + + if (infile != -1) { + + read(infile, name, sizeof(name)); + + p = name; + + while ((*p == '0') || (*p == ',')) p++; + + affinity = strtol(p, &p, 16); + + close(infile); + } + + return affinity; +} + +static int numa_check(void) { + + DIR *dp; + struct dirent *dir; + int node; + + common -> num_nodes = 0; + + dp = opendir(NODE_DIR); + + if (dp == NULL) { + common -> num_nodes = 1; + return 0; + } + + for (node = 0; node < MAX_NODES; node ++) common -> node_info[node] = 0; + + while ((dir = readdir(dp)) != NULL) { + if (*(unsigned int *) dir -> d_name == 0x065646f6eU) { + + node = atoi(&dir -> d_name[4]); + + if (node > MAX_NODES) { + fprintf(stderr, "\nGotoBLAS Warining : MAX_NODES (NUMA) is too small. Terminated.\n"); + exit(1); + } + + common -> num_nodes ++; + common -> node_info[node] = get_cpumap(node); + + } + } + + closedir(dp); + + if (common -> num_nodes == 1) return 1; + +#ifdef DEBUG + fprintf(stderr, "Numa found : number of Nodes = %2d\n", common -> num_nodes); + + for (node = 0; node < common -> num_nodes; node ++) + fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node]); +#endif + + return common -> num_nodes; +} + +static void numa_mapping(void) { + + int node, cpu, core; + int i, j, h; + unsigned long work, bit; + int count = 0; + + for (node = 0; node < common -> num_nodes; node ++) { + core = 0; + for (cpu = 0; cpu < common -> num_procs; cpu ++) { + if (common -> node_info[node] & common -> avail & (1UL << cpu)) { + common -> cpu_info[count] = WRITE_CORE(core) | WRITE_NODE(node) | WRITE_CPU(cpu); + count ++; + core ++; + } + + } + } + +#ifdef DEBUG + fprintf(stderr, "\nFrom /sys ...\n\n"); + + for (cpu = 0; cpu < count; cpu++) + fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]); +#endif + + h = 1; + + while (h < count) h = 2 * h + 1; + + while (h > 1) { + h /= 2; + for (i = h; i < count; i++) { + work = common -> cpu_info[i]; + bit = CPU_ISSET(i, &cpu_orig_mask[0]); + j = i - h; + while (work < common -> cpu_info[j]) { + common -> cpu_info[j + h] = common -> cpu_info[j]; + if (CPU_ISSET(j, &cpu_orig_mask[0])) { + CPU_SET(j + h, &cpu_orig_mask[0]); + } else { + CPU_CLR(j + h, &cpu_orig_mask[0]); + } + j -= h; + if (j < 0) break; + } + common -> cpu_info[j + h] = work; + if (bit) { + CPU_SET(j + h, &cpu_orig_mask[0]); + } else { + CPU_CLR(j + h, &cpu_orig_mask[0]); + } + + } + } + +#ifdef DEBUG + fprintf(stderr, "\nSorting ...\n\n"); + + for (cpu = 0; cpu < count; cpu++) + fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]); +#endif + +} + +static void disable_hyperthread(void) { + + unsigned long share; + int cpu; + + common -> avail = (1UL << common -> num_procs) - 1; + +#ifdef DEBUG + fprintf(stderr, "\nAvail CPUs : %04lx.\n", common -> avail); +#endif + + for (cpu = 0; cpu < common -> num_procs; cpu ++) { + + share = (get_share(cpu, 1) & common -> avail); + + if (popcount(share) > 1) { + +#ifdef DEBUG + fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n", + cpu, share & ~(1UL << cpu)); +#endif + + common -> avail &= ~((share & ~(1UL << cpu))); + } + } +} + +static void disable_affinity(void) { + +#ifdef DEBUG + fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail); + fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]); +#endif + + lprocmask = (1UL << common -> final_num_procs) - 1; + +#ifndef USE_OPENMP + lprocmask &= *(unsigned long *)&cpu_orig_mask[0]; +#endif + +#ifdef DEBUG + fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask); +#endif + +} + +static void setup_mempolicy(void) { + + int cpu, mynode, maxcpu; + + for (cpu = 0; cpu < MAX_NODES; cpu ++) node_cpu[cpu] = 0; + + maxcpu = 0; + + for (cpu = 0; cpu < numprocs; cpu ++) { + mynode = READ_NODE(common -> cpu_info[cpu_sub_mapping[cpu]]); + + lnodemask |= (1UL << mynode); + + node_cpu[mynode] ++; + + if (maxcpu < node_cpu[mynode]) maxcpu = node_cpu[mynode]; + } + + node_equal = 1; + + for (cpu = 0; cpu < MAX_NODES; cpu ++) if ((node_cpu[cpu] != 0) && (node_cpu[cpu] != maxcpu)) node_equal = 0; + + if (lnodemask) { + +#ifdef DEBUG + fprintf(stderr, "Node mask = %lx\n", lnodemask); +#endif + + my_set_mempolicy(MPOL_INTERLEAVE, &lnodemask, sizeof(lnodemask) * 8); + + numnodes = popcount(lnodemask); + } +} + +static inline int is_dead(int id) { + + struct shmid_ds ds; + + return shmctl(id, IPC_STAT, &ds); +} +static void open_shmem(void) { + + int try = 0; + + do { + + shmid = shmget(SH_MAGIC, 4096, 0666); + + if (shmid == -1) { + shmid = shmget(SH_MAGIC, 4096, IPC_CREAT | 0666); + } + + try ++; + + } while ((try < 10) && (shmid == -1)); + + if (shmid == -1) { + fprintf(stderr, "GotoBLAS : Can't open shared memory. Terminated.\n"); + exit(1); + } + + if (shmid != -1) common = (shm_t *)shmat(shmid, NULL, 0); + +#ifdef DEBUG + fprintf(stderr, "Shared Memory id = %x Address = %p\n", shmid, common); +#endif + +} + +static void create_pshmem(void) { + + pshmid = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0666); + + paddr = shmat(pshmid, NULL, 0); + + shmctl(pshmid, IPC_RMID, 0); + +#ifdef DEBUG + fprintf(stderr, "Private Shared Memory id = %x Address = %p\n", pshmid, paddr); +#endif +} + +static void local_cpu_map(void) { + + int cpu, id, mapping; + + cpu = 0; + mapping = 0; + + do { + id = common -> cpu_use[cpu]; + + if (id > 0) { + if (is_dead(id)) common -> cpu_use[cpu] = 0; + } + + if ((common -> cpu_use[cpu] == 0) && (lprocmask & (1UL << cpu))) { + + common -> cpu_use[cpu] = pshmid; + cpu_mapping[mapping] = READ_CPU(common -> cpu_info[cpu]); + cpu_sub_mapping[mapping] = cpu; + + mapping ++; + } + + cpu ++; + + } while ((mapping < numprocs) && (cpu < common -> final_num_procs)); + + disable_mapping = 0; + + if ((mapping < numprocs) || (numprocs == 1)) { + for (cpu = 0; cpu < common -> final_num_procs; cpu ++) { + if (common -> cpu_use[cpu] == pshmid) common -> cpu_use[cpu] = 0; + } + disable_mapping = 1; + } + +#ifdef DEBUG + for (cpu = 0; cpu < numprocs; cpu ++) { + fprintf(stderr, "Local Mapping : %2d --> %2d (%2d)\n", cpu, cpu_mapping[cpu], cpu_sub_mapping[cpu]); + } +#endif +} + +/* Public Functions */ + +int get_num_procs(void) { return numprocs; } +int get_num_nodes(void) { return numnodes; } +int get_node_equal(void) { + + return (((blas_cpu_number % numnodes) == 0) && node_equal); + +} + +int gotoblas_set_affinity(int pos) { + + cpu_set_t cpu_mask; + + int mynode = 1; + + /* if number of threads is larger than inital condition */ + if (pos < 0) { + sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); + return 0; + } + + if (!disable_mapping) { + + mynode = READ_NODE(common -> cpu_info[cpu_sub_mapping[pos]]); + +#ifdef DEBUG + fprintf(stderr, "Giving Affinity[%4d %3d] --> %3d My node = %3d\n", getpid(), pos, cpu_mapping[pos], mynode); +#endif + + CPU_ZERO(&cpu_mask); + CPU_SET (cpu_mapping[pos], &cpu_mask); + + sched_setaffinity(0, sizeof(cpu_mask), &cpu_mask); + + node_mapping[WhereAmI()] = mynode; + + } + + return mynode; +} + +int get_node(void) { + + if (!disable_mapping) return node_mapping[WhereAmI()]; + + return 1; +} + +static int initialized = 0; + +void gotoblas_affinity_init(void) { + + int cpu, num_avail; +#ifndef USE_OPENMP + cpu_set_t cpu_mask; +#endif + + if (initialized) return; + + initialized = 1; + + sched_getaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); + +#ifdef USE_OPENMP + numprocs = 0; +#else + numprocs = readenv("GOTO_NUM_THREADS"); +#endif + + if (numprocs == 0) numprocs = readenv("OMP_NUM_THREADS"); + + numnodes = 1; + + if (numprocs == 1) { + disable_mapping = 1; + return; + } + + create_pshmem(); + + open_shmem(); + + while ((common -> lock) && (common -> magic != SH_MAGIC)) { + if (is_dead(common -> shmid)) { + common -> lock = 0; + common -> shmid = 0; + common -> magic = 0; + } else { + sched_yield(); + } + } + + blas_lock(&common -> lock); + + if ((common -> shmid) && is_dead(common -> shmid)) common -> magic = 0; + + common -> shmid = pshmid; + + if (common -> magic != SH_MAGIC) { + +#ifdef DEBUG + fprintf(stderr, "Shared Memory Initialization.\n"); +#endif + + common -> num_procs = get_nprocs(); + + for (cpu = 0; cpu < common -> num_procs; cpu++) common -> cpu_info[cpu] = cpu; + + numa_check(); + + disable_hyperthread(); + + if (common -> num_nodes > 1) numa_mapping(); + + common -> final_num_procs = popcount(common -> avail); + + for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0; + + common -> magic = SH_MAGIC; + + } + + disable_affinity(); + + num_avail = popcount(lprocmask); + + if ((numprocs <= 0) || (numprocs > num_avail)) numprocs = num_avail; + +#ifdef DEBUG + fprintf(stderr, "Number of threads = %d\n", numprocs); +#endif + + local_cpu_map(); + + blas_unlock(&common -> lock); + +#ifndef USE_OPENMP + if (!disable_mapping) { + +#ifdef DEBUG + fprintf(stderr, "Giving Affinity[%3d] --> %3d\n", 0, cpu_mapping[0]); +#endif + + CPU_ZERO(&cpu_mask); + CPU_SET (cpu_mapping[0], &cpu_mask); + + sched_setaffinity(0, sizeof(cpu_mask), &cpu_mask); + + node_mapping[WhereAmI()] = READ_NODE(common -> cpu_info[cpu_sub_mapping[0]]); + + setup_mempolicy(); + + if (readenv("GOTOBLAS_MAIN_FREE")) { + sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); + } + + } +#endif + +#ifdef DEBUG + fprintf(stderr, "Initialization is done.\n"); +#endif +} + +void gotoblas_affinity_quit(void) { + + int i; + struct shmid_ds ds; + +#ifdef DEBUG + fprintf(stderr, "Terminating ..\n"); +#endif + + if ((numprocs == 1) || (initialized == 0)) return; + + if (!disable_mapping) { + + blas_lock(&common -> lock); + + for (i = 0; i < numprocs; i ++) common -> cpu_use[cpu_mapping[i]] = -1; + + blas_unlock(&common -> lock); + + } + + shmctl(shmid, IPC_STAT, &ds); + + if (ds.shm_nattch == 1) shmctl(shmid, IPC_RMID, 0); + + shmdt(common); + + shmdt(paddr); + + initialized = 0; +} + +#else + +void gotoblas_affinity_init(void) {}; + +void gotoblas_set_affinity(int threads) {}; + +void gotoblas_set_affinity2(int threads) {}; + +void gotoblas_affinity_reschedule(void) {}; + +int get_num_procs(void) { return get_nprocs(); } + +int get_num_nodes(void) { return 1; } + +int get_node(void) { return 1;} +#endif + + diff --git a/driver/others/lamc3.c b/driver/others/lamc3.c new file mode 100644 index 0000000..439ef6e --- /dev/null +++ b/driver/others/lamc3.c @@ -0,0 +1,50 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +#ifdef NEED_F2CCONV +double +#else +FLOAT +#endif +NAME(FLOAT *a, FLOAT *b){ + + return *a + *b; + +} diff --git a/driver/others/lamch.c b/driver/others/lamch.c new file mode 100644 index 0000000..b044500 --- /dev/null +++ b/driver/others/lamch.c @@ -0,0 +1,200 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +#if 0 +static FLOAT hdata[] __attribute__((aligned(128))) = { +#ifdef XDOUBLE + +0x1.0000000000000000P-00064L, + +0x1.0000000000000000P-16382L, + +0x1.0000000000000000P+00001L, + +0x1.0000000000000000P-00063L, + +0x1.0000000000000000P+00006L, + +0x1.0000000000000000P+00000L, + -0x1.ffe8000000000000P+00013L, + +0x1.0000000000000000P-16382L, + +0x1.0004000000000000P+00014L, + +0x1.fffffffffffffffeP+16383L, +#elif defined DOUBLE + +0x1.0000000000000P-0053, + +0x1.0000000000000P-1022, + +0x1.0000000000000P+0001, + +0x1.0000000000000P-0052, + +0x1.a800000000000P+0005, + +0x1.0000000000000P+0000, + -0x1.fe80000000000P+0009, + +0x1.0000000000000P-1022, + +0x1.0000000000000P+0010, + +0x1.fffffffffffffP+1023, +#else + +0x1.000000P-024f, + +0x1.000000P-126f, + +0x1.000000P+001f, + +0x1.000000P-023f, + +0x1.800000P+004f, + +0x1.000000P+000f, + -0x1.f40000P+006f, + +0x1.000000P-126f, + +0x1.000000P+007f, + +0x1.fffffeP+127f, +#endif +}; + +#endif + +static unsigned int idata[] __attribute__((aligned(128))) = { + +#if defined XDOUBLE +#ifndef __BIG_ENDIAN__ + 0x00000000, 0x80000000, 0x00003fbf, 0x00000000, + 0x00000000, 0x80000000, 0x00000001, 0x00000000, + 0x00000000, 0x80000000, 0x00004000, 0x00000000, + 0x00000000, 0x80000000, 0x00003fc0, 0x00000000, + 0x00000000, 0x80000000, 0x00004005, 0x00000000, + 0x00000000, 0x80000000, 0x00003fff, 0x00000000, + 0x00000000, 0xff400000, 0x0000c00c, 0x00000000, + 0x00000000, 0x80000000, 0x00000001, 0x00000000, + 0x00000000, 0x80200000, 0x0000400d, 0x00000000, + 0xffffffff, 0xffffffff, 0x00007ffe, 0x00000000, +#else + 0x00000000, 0x00003fbf, 0x80000000, 0x00000000, + 0x00000000, 0x00000001, 0x80000000, 0x00000000, + 0x00000000, 0x00004000, 0x80000000, 0x00000000, + 0x00000000, 0x00003fc0, 0x80000000, 0x00000000, + 0x00000000, 0x00004005, 0x80000000, 0x00000000, + 0x00000000, 0x00003fff, 0x80000000, 0x00000000, + 0x00000000, 0x0000c00c, 0xff400000, 0x00000000, + 0x00000000, 0x00000001, 0x80000000, 0x00000000, + 0x00000000, 0x0000400d, 0x80200000, 0x00000000, + 0x00000000, 0x00007ffe, 0xffffffff, 0xffffffff, + +#endif +#elif defined DOUBLE +#ifndef __BIG_ENDIAN__ + 0x00000000, 0x3ca00000, + 0x00000000, 0x00100000, + 0x00000000, 0x40000000, + 0x00000000, 0x3cb00000, + 0x00000000, 0x404a8000, + 0x00000000, 0x3ff00000, + 0x00000000, 0xc08fe800, + 0x00000000, 0x00100000, + 0x00000000, 0x40900000, + 0xffffffff, 0x7fefffff, +#else + 0x3ca00000, 0x00000000, + 0x00100000, 0x00000000, + 0x40000000, 0x00000000, + 0x3cb00000, 0x00000000, + 0x404a8000, 0x00000000, + 0x3ff00000, 0x00000000, + 0xc08fe800, 0x00000000, + 0x00100000, 0x00000000, + 0x40900000, 0x00000000, + 0x7fefffff, 0xffffffff, +#endif +#else + + 0x33800000, + 0x00800000, + 0x40000000, + 0x34000000, + 0x41c00000, + 0x3f800000, + 0xc2fa0000, + 0x00800000, + 0x43000000, + 0x7f7fffff, + +#endif +}; + + +#ifdef NEED_F2CCONV +double +#else +FLOAT +#endif +NAME(char *P){ + + char p = *P; + int pos; + FLOAT *hdata = (FLOAT *)idata; + + TOUPPER(p); + + switch (p) { + case 'E': + pos = 0; + break; + case 'S': + pos = 1; + break; + case 'B': + pos = 2; + break; + case 'P': + pos = 3; + break; + case 'N': + pos = 4; + break; + case 'R': + pos = 5; + break; + case 'M': + pos = 6; + break; + case 'U': + pos = 7; + break; + case 'L': + pos = 8; + break; + case 'O': + pos = 9; + break; + default: + pos = 0; + break; + } + + return hdata[pos]; + +} diff --git a/driver/others/lsame.c b/driver/others/lsame.c new file mode 100644 index 0000000..cae8b4a --- /dev/null +++ b/driver/others/lsame.c @@ -0,0 +1,50 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include + +int NAME(char *A, char *B){ + + char a = *A; + char b = *B; + + if (a > 96) a -= 32; + if (b > 96) b -= 32; + + return (a == b); +} diff --git a/driver/others/memory.c b/driver/others/memory.c new file mode 100644 index 0000000..1983931 --- /dev/null +++ b/driver/others/memory.c @@ -0,0 +1,1257 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#undef DEBUG + +#include "common.h" + +#ifdef OS_WINDOWS +#define ALLOC_WINDOWS +#ifndef MEM_LARGE_PAGES +#define MEM_LARGE_PAGES 0x20000000 +#endif +#else +#define ALLOC_MMAP +#define ALLOC_MALLOC +#endif + +#include +#include +#include + +#ifndef OS_WINDOWS +#include +#include +#include +#endif + +#include + +#ifdef OS_LINUX +#include +#include +#include +#include +#include +#endif + +#if defined(OS_FreeBSD) || defined(OS_Darwin) +#include +#endif + +#if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__)) +#include +#undef printf +#define printf _cprintf +#endif + +#ifdef OS_LINUX + +#ifndef MPOL_PREFERRED +#define MPOL_PREFERRED 1 +#endif + +#endif + +#if (defined(PPC440) || !defined(OS_LINUX) || defined(HPL)) && !defined(NO_WARMUP) +#define NO_WARMUP +#endif + +#ifdef ALLOC_HUGETLB +#define SHM_HUGETLB 04000 +#endif + +#ifndef FIXED_PAGESIZE +#define FIXED_PAGESIZE 4096 +#endif + +#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) + +#define CONSTRUCTOR __attribute__ ((constructor)) +#define DESTRUCTOR __attribute__ ((destructor)) + +#ifdef DYNAMIC_ARCH +gotoblas_t *gotoblas = NULL; +#endif + +#ifndef SMP + +#define blas_cpu_number 1 +#define blas_num_threads 1 + +/* Dummy Function */ +int goto_get_num_procs (void) { return 1;}; +void goto_set_num_threads(int num_threads) {}; + +#else + +#ifdef OS_LINUX +#ifndef NO_AFFINITY +int get_num_procs(void); +#else +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = get_nprocs(); + return nums; +} +#endif +#endif + +#ifdef OS_WINDOWS + +int get_num_procs(void) { + + static int nums = 0; + + if (nums == 0) { + + SYSTEM_INFO sysinfo; + + GetSystemInfo(&sysinfo); + + nums = sysinfo.dwNumberOfProcessors; + } + + return nums; +} + +#endif + +#if defined(OS_FreeBSD) || defined(OS_Darwin) + +int get_num_procs(void) { + + static int nums = 0; + + int m[2]; + size_t len; + + if (nums == 0) { + m[0] = CTL_HW; + m[1] = HW_NCPU; + len = sizeof(int); + sysctl(m, 2, &nums, &len, NULL, 0); + } + + return nums; +} + +#endif + +int blas_cpu_number = 0; +int blas_num_threads = 0; + +int goto_get_num_procs (void) { + return blas_cpu_number; +} + +int blas_get_cpu_number(void){ + char *p; +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) + int max_num; +#endif + int blas_goto_num = 0; + int blas_omp_num = 0; + + if (blas_num_threads) return blas_num_threads; + +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) + max_num = get_num_procs(); +#endif + + blas_goto_num = 0; +#ifndef USE_OPENMP + p = getenv("GOTO_NUM_THREADS"); + if (p) blas_goto_num = atoi(p); + if (blas_goto_num < 0) blas_goto_num = 0; +#endif + + blas_omp_num = 0; + p = getenv("OMP_NUM_THREADS"); + if (p) blas_omp_num = atoi(p); + if (blas_omp_num < 0) blas_omp_num = 0; + + if (blas_goto_num > 0) blas_num_threads = blas_goto_num; + else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; + else blas_num_threads = MAX_CPU_NUMBER; + +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) + if (blas_num_threads > max_num) blas_num_threads = max_num; +#endif + + if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER; + +#ifdef DEBUG + printf( "Adjusted number of threads : %3d\n", blas_num_threads); +#endif + + blas_cpu_number = blas_num_threads; + + return blas_num_threads; +} +#endif + +struct release_t { + void *address; + void (*func)(struct release_t *); + long attr; +}; + +int hugetlb_allocated = 0; + +static struct release_t release_info[NUM_BUFFERS]; +static int release_pos = 0; + +#if defined(OS_LINUX) && !defined(NO_WARMUP) +static int hot_alloc = 0; +#endif + +#ifdef ALLOC_MMAP + +static void alloc_mmap_free(struct release_t *release){ + + if (munmap(release -> address, BUFFER_SIZE)) { + printf("GotoBLAS : munmap failed\n"); + } +} + +#ifdef NO_WARMUP + +static void *alloc_mmap(void *address){ + void *map_address; + + if (address){ + map_address = mmap(address, + BUFFER_SIZE, + MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); + } else { + map_address = mmap(address, + BUFFER_SIZE, + MMAP_ACCESS, MMAP_POLICY, -1, 0); + } + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_mmap_free; + release_pos ++; + } + +#ifdef OS_LINUX + my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); +#endif + + return map_address; +} + +#else + +#define BENCH_ITERATION 4 +#define SCALING 2 + +static inline BLASULONG run_bench(BLASULONG address, long size) { + + BLASULONG original, *p; + BLASULONG start, stop, min; + int iter, i, count; + + min = (BLASULONG)-1; + + original = *(BLASULONG *)(address + size - PAGESIZE); + + *(BLASULONG *)(address + size - PAGESIZE) = (BLASULONG)address; + + for (iter = 0; iter < BENCH_ITERATION; iter ++ ) { + + p = (BLASULONG *)address; + + count = size / PAGESIZE; + + start = rpcc(); + + for (i = 0; i < count; i ++) { + p = (BLASULONG *)(*p); + } + + stop = rpcc(); + + if (min > stop - start) min = stop - start; + } + + *(BLASULONG *)(address + size - PAGESIZE + 0) = original; + *(BLASULONG *)(address + size - PAGESIZE + 8) = (BLASULONG)p; + + return min; +} + +static void *alloc_mmap(void *address){ + void *map_address, *best_address; + BLASULONG best, start, current; + BLASULONG allocsize; + + if (address){ + /* Just give up use advanced operation */ + map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); + +#ifdef OS_LINUX + my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); +#endif + + } else { +#if defined(OS_LINUX) && !defined(NO_WARMUP) + if (hot_alloc == 0) { + map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0); + +#ifdef OS_LINUX + my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); +#endif + + } else { +#endif + + map_address = mmap(NULL, BUFFER_SIZE * SCALING, + MMAP_ACCESS, MMAP_POLICY, -1, 0); + + if (map_address != (void *)-1) { + +#ifdef OS_LINUX + my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); +#endif + + allocsize = DGEMM_P * DGEMM_Q * sizeof(double); + + start = (BLASULONG)map_address; + current = (SCALING - 1) * BUFFER_SIZE; + + while(current > 0) { + *(long *)start = (long)start + PAGESIZE; + start += PAGESIZE; + current -= PAGESIZE; + } + + *(long *)(start - PAGESIZE) = (BLASULONG)map_address; + + start = (BLASULONG)map_address; + + best = (BLASULONG)-1; + best_address = map_address; + + while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) { + + current = run_bench(start, allocsize); + + if (best > current) { + best = current; + best_address = (void *)start; + } + + start += PAGESIZE; + + } + + if ((BLASULONG)best_address > (BLASULONG)map_address) + munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address); + + munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address); + + map_address = best_address; + +#if defined(OS_LINUX) && !defined(NO_WARMUP) + hot_alloc = 2; +#endif + } + } +#if defined(OS_LINUX) && !defined(NO_WARMUP) + } +#endif + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_mmap_free; + release_pos ++; + } + + return map_address; +} + +#endif + +#endif + + +#ifdef ALLOC_MALLOC + +static void alloc_malloc_free(struct release_t *release){ + + free(release -> address); + +} + +static void *alloc_malloc(void *address){ + + void *map_address; + + map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE); + + if (map_address == (void *)NULL) map_address = (void *)-1; + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_malloc_free; + release_pos ++; + } + + return map_address; + +} + +#endif + +#ifdef ALLOC_QALLOC + +void *qalloc(int flags, size_t bytes); +void *qfree (void *address); + +#define QNONCACHE 0x1 +#define QCOMMS 0x2 +#define QFAST 0x4 + +static void alloc_qalloc_free(struct release_t *release){ + + qfree(release -> address); + +} + +static void *alloc_qalloc(void *address){ + void *map_address; + + map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE); + + if (map_address == (void *)NULL) map_address = (void *)-1; + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_qalloc_free; + release_pos ++; + } + + return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1)); +} + +#endif + +#ifdef ALLOC_WINDOWS + +static void alloc_windows_free(struct release_t *release){ + + VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT); + +} + +static void *alloc_windows(void *address){ + void *map_address; + + map_address = VirtualAlloc(address, + BUFFER_SIZE, + MEM_RESERVE | MEM_COMMIT, + PAGE_READWRITE); + + if (map_address == (void *)NULL) map_address = (void *)-1; + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_windows_free; + release_pos ++; + } + + return map_address; +} + +#endif + +#ifdef ALLOC_DEVICEDRIVER +#ifndef DEVICEDRIVER_NAME +#define DEVICEDRIVER_NAME "/dev/mapper" +#endif + +static void alloc_devicedirver_free(struct release_t *release){ + + if (munmap(release -> address, BUFFER_SIZE)) { + printf("GotoBLAS : Bugphysarea unmap failed.\n"); + } + + if (close(release -> attr)) { + printf("GotoBLAS : Bugphysarea close failed.\n"); + } + +} + +static void *alloc_devicedirver(void *address){ + + int fd; + void *map_address; + + if ((fd = open(DEVICEDRIVER_NAME, O_RDWR | O_SYNC)) < 0) { + + return (void *)-1; + + } + + map_address = mmap(address, BUFFER_SIZE, + PROT_READ | PROT_WRITE, + MAP_FILE | MAP_SHARED, + fd, 0); + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].attr = fd; + release_info[release_pos].func = alloc_devicedirver_free; + release_pos ++; + } + + return map_address; +} + +#endif + +#ifdef ALLOC_SHM + +static void alloc_shm_free(struct release_t *release){ + + if (shmdt(release -> address)) { + printf("GotoBLAS : Shared memory unmap failed.\n"); + } +} + +static void *alloc_shm(void *address){ + void *map_address; + int shmid; + + shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600); + + map_address = (void *)shmat(shmid, address, 0); + + if (map_address != (void *)-1){ + +#ifdef OS_LINUX + my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); +#endif + + shmctl(shmid, IPC_RMID, 0); + + release_info[release_pos].address = map_address; + release_info[release_pos].attr = shmid; + release_info[release_pos].func = alloc_shm_free; + release_pos ++; + } + + return map_address; +} + +#endif + +#if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS + +static void alloc_hugetlb_free(struct release_t *release){ + +#if defined(OS_LINUX) || defined(OS_AIX) + if (shmdt(release -> address)) { + printf("GotoBLAS : Hugepage unmap failed.\n"); + } +#endif + +#ifdef __sun__ + + munmap(release -> address, BUFFER_SIZE); + +#endif + +#ifdef OS_WINDOWS + + VirtualFree(release -> address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_DECOMMIT); + +#endif + +} + +static void *alloc_hugetlb(void *address){ + + void *map_address = (void *)-1; + +#if defined(OS_LINUX) || defined(OS_AIX) + int shmid; + + shmid = shmget(IPC_PRIVATE, BUFFER_SIZE, +#ifdef OS_LINUX + SHM_HUGETLB | +#endif +#ifdef OS_AIX + SHM_LGPAGE | SHM_PIN | +#endif + IPC_CREAT | SHM_R | SHM_W); + + if (shmid != -1) { + map_address = (void *)shmat(shmid, address, SHM_RND); + +#ifdef OS_LINUX + my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); +#endif + + if (map_address != (void *)-1){ + shmctl(shmid, IPC_RMID, 0); + } + } +#endif + +#ifdef __sun__ + struct memcntl_mha mha; + + mha.mha_cmd = MHA_MAPSIZE_BSSBRK; + mha.mha_flags = 0; + mha.mha_pagesize = HUGE_PAGESIZE; + memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0); + + map_address = (BLASULONG)memalign(HUGE_PAGESIZE, BUFFER_SIZE); +#endif + +#ifdef OS_WINDOWS + + HANDLE hToken; + TOKEN_PRIVILEGES tp; + + if (OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hToken) != TRUE) return (void *) -1; + + tp.PrivilegeCount = 1; + tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + + if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) return (void *) -1; + + if (AdjustTokenPrivileges(hToken, FALSE, (PTOKEN_PRIVILEGES)&tp, 0, NULL, NULL) != TRUE) return (void *) -1; + + map_address = (void *)VirtualAlloc(address, + BUFFER_SIZE, + MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT, + PAGE_READWRITE); + + AdjustTokenPrivileges(hToken, TRUE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, NULL); + + if (map_address == (void *)NULL) map_address = (void *)-1; + +#endif + + if (map_address != (void *)-1){ + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_hugetlb_free; + release_pos ++; + } + + return map_address; +} +#endif + +#ifdef ALLOC_HUGETLBFILE + +static int hugetlb_pid = 0; + +static void alloc_hugetlbfile_free(struct release_t *release){ + + if (munmap(release -> address, BUFFER_SIZE)) { + printf("GotoBLAS : HugeTLBfs unmap failed.\n"); + } + + if (close(release -> attr)) { + printf("GotoBLAS : HugeTLBfs close failed.\n"); + } +} + +static void *alloc_hugetlbfile(void *address){ + + void *map_address = (void *)-1; + int fd; + char filename[64]; + + if (!hugetlb_pid) hugetlb_pid = getpid(); + + sprintf(filename, "%s/gotoblas.%d", HUGETLB_FILE_NAME, hugetlb_pid); + + if ((fd = open(filename, O_RDWR | O_CREAT, 0700)) < 0) { + return (void *)-1; + } + + unlink(filename); + + map_address = mmap(address, BUFFER_SIZE, + PROT_READ | PROT_WRITE, + MAP_SHARED, + fd, 0); + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].attr = fd; + release_info[release_pos].func = alloc_hugetlbfile_free; + release_pos ++; + } + + return map_address; +} +#endif + +/* Global lock for memory allocation */ + +#if defined(USE_PTHREAD_LOCK) +static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER; +#elif defined(USE_PTHREAD_SPINLOCK) +static pthread_spinlock_t alloc_lock = 0; +#else +static BLASULONG alloc_lock = 0UL; +#endif + +#ifdef SEEK_ADDRESS +static BLASULONG base_address = 0UL; +#else +static BLASULONG base_address = BASE_ADDRESS; +#endif + +static volatile struct { + BLASULONG lock; + void *addr; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + int pos; +#endif + int used; +#ifndef __64BIT__ + char dummy[48]; +#else + char dummy[40]; +#endif + +} memory[NUM_BUFFERS]; + +static int memory_initialized = 0; +static void gotoblas_memory_init(void); + +/* Memory allocation routine */ +/* procpos ... indicates where it comes from */ +/* 0 : Level 3 functions */ +/* 1 : Level 2 functions */ +/* 2 : Thread */ + +void *blas_memory_alloc(int procpos){ + + int position; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + int mypos; +#endif + + void *map_address; + + void *(*memoryalloc[])(void *address) = { +#ifdef ALLOC_DEVICEDRIVER + alloc_devicedirver, +#endif +#if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS + alloc_hugetlb, +#endif +#ifdef ALLOC_SHM + alloc_shm, +#endif +#ifdef ALLOC_MMAP + alloc_mmap, +#endif +#ifdef ALLOC_QALLOC + alloc_qalloc, +#endif +#ifdef ALLOC_WINDOWS + alloc_windows, +#endif +#ifdef ALLOC_MALLOC + alloc_malloc, +#endif + NULL, + }; + void *(**func)(void *address); + + if (!memory_initialized) { + + LOCK_COMMAND(&alloc_lock); + + if (!memory_initialized) { + +#if defined(WHEREAMI) && !defined(USE_OPENMP) + for (position = 0; position < NUM_BUFFERS; position ++){ + memory[position].addr = (void *)0; + memory[position].pos = -1; + memory[position].used = 0; + memory[position].lock = 0; + } +#endif + +#ifdef DYNAMIC_ARCH + gotoblas_dynamic_init(); +#endif + +#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY) + gotoblas_affinity_init(); +#endif + +#ifdef SMP + if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); +#endif + +#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) +#ifndef DYNAMIC_ARCH + blas_set_parameter(); +#endif +#endif + + memory_initialized = 1; + } + + UNLOCK_COMMAND(&alloc_lock); + } + +#ifdef DEBUG + printf("Alloc Start ...\n"); +#endif + +#if defined(WHEREAMI) && !defined(USE_OPENMP) + + mypos = WhereAmI(); + + position = mypos; + while (position > NUM_BUFFERS) position >>= 1; + + do { + if (!memory[position].used && (memory[position].pos == mypos)) { + + blas_lock(&memory[position].lock); + + if (!memory[position].used) goto allocation; + + blas_unlock(&memory[position].lock); + } + + position ++; + + } while (position < NUM_BUFFERS); + + +#endif + + position = 0; + + do { + if (!memory[position].used) { + + blas_lock(&memory[position].lock); + + if (!memory[position].used) goto allocation; + + blas_unlock(&memory[position].lock); + } + + position ++; + + } while (position < NUM_BUFFERS); + + goto error; + + allocation : + +#ifdef DEBUG + printf(" Position -> %d\n", position); +#endif + + memory[position].used = 1; + + blas_unlock(&memory[position].lock); + + if (!memory[position].addr) { + do { +#ifdef DEBUG + printf("Allocation Start : %lx\n", base_address); +#endif + + map_address = (void *)-1; + + func = &memoryalloc[0]; + + while ((func != NULL) && (map_address == (void *) -1)) { + + map_address = (*func)((void *)base_address); + +#ifdef ALLOC_DEVICEDRIVER + if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { + fprintf(stderr, "GotoBLAS Warning ... Physically contigous allocation was failed.\n"); + } +#endif + +#ifdef ALLOC_HUGETLBFILE + if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { +#ifndef OS_WINDOWS + fprintf(stderr, "GotoBLAS Warning ... HugeTLB(File) allocation was failed.\n"); +#endif + } +#endif + +#if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS + if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1; +#endif + + func ++; + } + +#ifdef DEBUG + printf(" Success -> %08lx\n", map_address); +#endif + if (((BLASLONG) map_address) == -1) base_address = 0UL; + + if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE; + + } while ((BLASLONG)map_address == -1); + + memory[position].addr = map_address; + +#ifdef DEBUG + printf(" Mapping Succeeded. %p(%d)\n", (void *)alloc_area[position], position); +#endif + } + +#if defined(WHEREAMI) && !defined(USE_OPENMP) + + if (memory[position].pos == -1) memory[position].pos = mypos; + +#endif + +#ifdef DYNAMIC_ARCH + + if (memory_initialized == 1) { + + LOCK_COMMAND(&alloc_lock); + + if (memory_initialized == 1) { + + if (!gotoblas) gotoblas_dynamic_init(); + + memory_initialized = 2; + } + + UNLOCK_COMMAND(&alloc_lock); + + } +#endif + + +#ifdef DEBUG + printf("Mapped : %p %3d\n\n", + (void *)alloc_area[position], position); +#endif + + return (void *)memory[position].addr; + + error: + printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); + + return NULL; +} + +void blas_memory_free(void *free_area){ + + int position; + +#ifdef DEBUG + printf("Unmapped Start : %p ...\n", free_area); +#endif + + position = 0; + + while ((memory[position].addr != free_area) + && (position < NUM_BUFFERS)) position++; + + if (memory[position].addr != free_area) goto error; + +#ifdef DEBUG + printf(" Position : %d\n", position); +#endif + + memory[position].used = 0; + +#ifdef DEBUG + printf("Unmap Succeeded.\n\n"); +#endif + + return; + + error: + printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area); + +#ifdef DEBUG + for (position = 0; position < NUM_BUFFERS; position++) + printf("%4ld %p : %d\n", position, alloc_area[position], alloc_used[position]); +#endif + + return; +} + +void blas_shutdown(void){ + + int pos; + +#ifdef SMP + BLASFUNC(blas_thread_shutdown)(); +#endif + + LOCK_COMMAND(&alloc_lock); + + for (pos = 0; pos < release_pos; pos ++) { + release_info[pos].func(&release_info[pos]); + } + +#ifdef SEEK_ADDRESS + base_address = 0UL; +#else + base_address = BASE_ADDRESS; +#endif + + for (pos = 0; pos < NUM_BUFFERS; pos ++){ + memory[pos].addr = (void *)0; + memory[pos].used = 0; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + memory[pos].pos = -1; +#endif + memory[pos].lock = 0; + } + + UNLOCK_COMMAND(&alloc_lock); + + return; +} + +#if defined(OS_LINUX) && !defined(NO_WARMUP) + +#ifdef SMP +#if defined(USE_PTHREAD_LOCK) +static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER; +#elif defined(USE_PTHREAD_SPINLOCK) +static pthread_spinlock_t init_lock = 0; +#else +static BLASULONG init_lock = 0UL; +#endif +#endif + +static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, + void *sa, void *sb, BLASLONG pos) { + +#ifndef ARCH_POWER + + long size; + BLASULONG buffer; + + size = BUFFER_SIZE - PAGESIZE; + buffer = (BLASULONG)sa + GEMM_OFFSET_A; + +#if defined(OS_LINUX) && !defined(NO_WARMUP) + if (hot_alloc != 2) { +#endif + +#ifdef SMP + LOCK_COMMAND(&init_lock); +#endif + + while (size > 0) { + *(int *)buffer = size; + buffer += PAGESIZE; + size -= PAGESIZE; + } + +#ifdef SMP + UNLOCK_COMMAND(&init_lock); +#endif + + size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE); + buffer = (BLASULONG)sa + GEMM_OFFSET_A; + + while (size > 0) { + *(int *)buffer = size; + buffer += 64; + size -= 64; + } + +#if defined(OS_LINUX) && !defined(NO_WARMUP) + } +#endif + +#endif +} + +#ifdef SMP + +static void _init_thread_memory(void *buffer) { + + blas_queue_t queue[MAX_CPU_NUMBER]; + int num_cpu; + + for (num_cpu = 0; num_cpu < blas_num_threads; num_cpu++) { + + blas_queue_init(&queue[num_cpu]); + queue[num_cpu].mode = BLAS_DOUBLE | BLAS_REAL; + queue[num_cpu].routine = &_touch_memory; + queue[num_cpu].args = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + } + + queue[num_cpu - 1].next = NULL; + queue[0].sa = buffer; + + exec_blas(num_cpu, queue); + +} +#endif + +static void gotoblas_memory_init(void) { + + void *buffer; + + hot_alloc = 1; + + buffer = (void *)blas_memory_alloc(0); + +#ifdef SMP + if (blas_cpu_number == 0) blas_get_cpu_number(); +#ifdef SMP_SERVER + if (blas_server_avail == 0) blas_thread_init(); +#endif + + _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A)); + +#else + + _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0); + +#endif + + blas_memory_free(buffer); +} +#endif + +/* Initialization for all function; this function should be called before main */ + +static int gotoblas_initialized = 0; + +void CONSTRUCTOR gotoblas_init(void) { + + if (gotoblas_initialized) return; + +#ifdef PROFILE + moncontrol (0); +#endif + +#ifdef DYNAMIC_ARCH + gotoblas_dynamic_init(); +#endif + +#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY) + gotoblas_affinity_init(); +#endif + +#if defined(OS_LINUX) && !defined(NO_WARMUP) + gotoblas_memory_init(); +#endif + +#ifdef SMP + if (blas_cpu_number == 0) blas_get_cpu_number(); +#ifdef SMP_SERVER + if (blas_server_avail == 0) blas_thread_init(); +#endif +#endif + +#ifdef FUNCTION_PROFILE + gotoblas_profile_init(); +#endif + + gotoblas_initialized = 1; + +#ifdef PROFILE + moncontrol (1); +#endif + +} + +void DESTRUCTOR gotoblas_quit(void) { + + if (gotoblas_initialized == 0) return; + +#ifdef PROFILE + moncontrol (0); +#endif + +#ifdef FUNCTION_PROFILE + gotoblas_profile_quit(); +#endif + +#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY) + gotoblas_affinity_quit(); +#endif + +#ifdef DYNAMIC_ARCH + gotoblas_dynamic_quit(); +#endif + + gotoblas_initialized = 0; + +#ifdef PROFILE + moncontrol (1); +#endif + +} + +#if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64)) +/* Don't call me; this is just work around for PGI / Sun bug */ +void gotoblas_dummy_for_PGI(void) { + + gotoblas_init(); + gotoblas_quit(); + +#if 0 + asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text"); + asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text"); +#else + asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text"); + asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text"); +#endif +} +#endif diff --git a/driver/others/memory_qalloc.c b/driver/others/memory_qalloc.c new file mode 100644 index 0000000..10b35aa --- /dev/null +++ b/driver/others/memory_qalloc.c @@ -0,0 +1,77 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef SMP +#define blas_cpu_number 1 +#else + +int blas_cpu_number = 1; + +int blas_get_cpu_number(void){ + + return blas_cpu_number; +} +#endif + +#define FIXED_PAGESIZE 4096 + +void *sa = NULL; +void *sb = NULL; +static double static_buffer[BUFFER_SIZE/sizeof(double)]; + +void *blas_memory_alloc(int numproc){ + + if (sa == NULL){ +#if 1 + sa = (void *)qalloc(QFAST, BUFFER_SIZE); +#else + sa = (void *)malloc(BUFFER_SIZE); +#endif + sb = (void *)&static_buffer[0]; + } + + return sa; +} + +void blas_memory_free(void *free_area){ + return; +} + diff --git a/driver/others/parameter.c b/driver/others/parameter.c new file mode 100644 index 0000000..9e72fd2 --- /dev/null +++ b/driver/others/parameter.c @@ -0,0 +1,668 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int get_L2_size(void); + +#define DEFAULT_GEMM_P 128 +#define DEFAULT_GEMM_Q 128 +#define DEFAULT_GEMM_R 128 + +/* Global Parameter */ +#if SGEMM_P == sgemm_p +BLASLONG sgemm_p = DEFAULT_GEMM_P; +#else +BLASLONG sgemm_p = SGEMM_P; +#endif +#if DGEMM_P == dgemm_p +BLASLONG dgemm_p = DEFAULT_GEMM_P; +#else +BLASLONG dgemm_p = DGEMM_P; +#endif +#if CGEMM_P == cgemm_p +BLASLONG cgemm_p = DEFAULT_GEMM_P; +#else +BLASLONG cgemm_p = CGEMM_P; +#endif +#if ZGEMM_P == zgemm_p +BLASLONG zgemm_p = DEFAULT_GEMM_P; +#else +BLASLONG zgemm_p = ZGEMM_P; +#endif + +#if SGEMM_Q == sgemm_q +BLASLONG sgemm_q = DEFAULT_GEMM_Q; +#else +BLASLONG sgemm_q = SGEMM_Q; +#endif +#if DGEMM_Q == dgemm_q +BLASLONG dgemm_q = DEFAULT_GEMM_Q; +#else +BLASLONG dgemm_q = DGEMM_Q; +#endif +#if CGEMM_Q == cgemm_q +BLASLONG cgemm_q = DEFAULT_GEMM_Q; +#else +BLASLONG cgemm_q = CGEMM_Q; +#endif +#if ZGEMM_Q == zgemm_q +BLASLONG zgemm_q = DEFAULT_GEMM_Q; +#else +BLASLONG zgemm_q = ZGEMM_Q; +#endif + +#if SGEMM_R == sgemm_r +BLASLONG sgemm_r = DEFAULT_GEMM_R; +#else +BLASLONG sgemm_r = SGEMM_R; +#endif +#if DGEMM_R == dgemm_r +BLASLONG dgemm_r = DEFAULT_GEMM_R; +#else +BLASLONG dgemm_r = DGEMM_R; +#endif +#if CGEMM_R == cgemm_r +BLASLONG cgemm_r = DEFAULT_GEMM_R; +#else +BLASLONG cgemm_r = CGEMM_R; +#endif +#if ZGEMM_R == zgemm_r +BLASLONG zgemm_r = DEFAULT_GEMM_R; +#else +BLASLONG zgemm_r = ZGEMM_R; +#endif + +#if defined(EXPRECISION) || defined(QUAD_PRECISION) +#if QGEMM_P == qgemm_p +BLASLONG qgemm_p = DEFAULT_GEMM_P; +#else +BLASLONG qgemm_p = QGEMM_P; +#endif +#if XGEMM_P == xgemm_p +BLASLONG xgemm_p = DEFAULT_GEMM_P; +#else +BLASLONG xgemm_p = XGEMM_P; +#endif +#if QGEMM_Q == qgemm_q +BLASLONG qgemm_q = DEFAULT_GEMM_Q; +#else +BLASLONG qgemm_q = QGEMM_Q; +#endif +#if XGEMM_Q == xgemm_q +BLASLONG xgemm_q = DEFAULT_GEMM_Q; +#else +BLASLONG xgemm_q = XGEMM_Q; +#endif +#if QGEMM_R == qgemm_r +BLASLONG qgemm_r = DEFAULT_GEMM_R; +#else +BLASLONG qgemm_r = QGEMM_R; +#endif +#if XGEMM_R == xgemm_r +BLASLONG xgemm_r = DEFAULT_GEMM_R; +#else +BLASLONG xgemm_r = XGEMM_R; +#endif +#endif + +#if defined(ARCH_X86) || defined(ARCH_X86_64) + +int get_L2_size(void){ + + int eax, ebx, ecx, edx; + +#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || \ + defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ + defined(CORE_NEHALEM) || defined(ATOM) || defined(GENERIC) + + cpuid(0x80000006, &eax, &ebx, &ecx, &edx); + + return BITMASK(ecx, 16, 0xffff); + +#else + + int info[15]; + int i; + + cpuid(2, &eax, &ebx, &ecx, &edx); + + info[ 0] = BITMASK(eax, 8, 0xff); + info[ 1] = BITMASK(eax, 16, 0xff); + info[ 2] = BITMASK(eax, 24, 0xff); + + info[ 3] = BITMASK(ebx, 0, 0xff); + info[ 4] = BITMASK(ebx, 8, 0xff); + info[ 5] = BITMASK(ebx, 16, 0xff); + info[ 6] = BITMASK(ebx, 24, 0xff); + + info[ 7] = BITMASK(ecx, 0, 0xff); + info[ 8] = BITMASK(ecx, 8, 0xff); + info[ 9] = BITMASK(ecx, 16, 0xff); + info[10] = BITMASK(ecx, 24, 0xff); + + info[11] = BITMASK(edx, 0, 0xff); + info[12] = BITMASK(edx, 8, 0xff); + info[13] = BITMASK(edx, 16, 0xff); + info[14] = BITMASK(edx, 24, 0xff); + + for (i = 0; i < 15; i++){ + + switch (info[i]){ + case 0x3b : + case 0x41 : + case 0x79 : + return 128; + break; + + case 0x3c : + case 0x42 : + case 0x7a : + case 0x7e : + case 0x82 : + return 256; + break; + + case 0x43 : + case 0x7b : + case 0x7f : + case 0x83 : + case 0x86 : + return 512; + break; + + case 0x44 : + case 0x78 : + case 0x7c : + case 0x84 : + case 0x87 : + return 1024; + break; + + case 0x45 : + case 0x7d : + case 0x85 : + return 2048; + + case 0x49 : + return 4096; + break; + } + } + + /* Never reached */ + return 0; +#endif +} + +void blas_set_parameter(void){ + + char *p; + int factor; + int size = get_L2_size(); + +#if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) + size >>= 7; + +#if defined(CORE_BANIAS) && (HAVE_HIT > 1) + sgemm_p = 64 / HAVE_HIT * size; + dgemm_p = 32 / HAVE_HIT * size; + cgemm_p = 32 / HAVE_HIT * size; + zgemm_p = 16 / HAVE_HIT * size; +#ifdef EXPRECISION + qgemm_p = 16 / HAVE_HIT * size; + xgemm_p = 8 / HAVE_HIT * size; +#endif +#ifdef QUAD_PRECISION + qgemm_p = 8 / HAVE_HIT * size; + xgemm_p = 4 / HAVE_HIT * size; +#endif +#else + sgemm_p = 64 * size; + dgemm_p = 32 * size; + cgemm_p = 32 * size; + zgemm_p = 16 * size; +#ifdef EXPRECISION + qgemm_p = 16 * size; + xgemm_p = 8 * size; +#endif +#ifdef QUAD_PRECISION + qgemm_p = 8 * size; + xgemm_p = 4 * size; +#endif +#endif +#endif + +#if defined(CORE_NORTHWOOD) + size >>= 7; + +#ifdef ALLOC_HUGETLB + sgemm_p = 128 * size; + dgemm_p = 64 * size; + cgemm_p = 64 * size; + zgemm_p = 32 * size; +#ifdef EXPRECISION + qgemm_p = 32 * size; + xgemm_p = 16 * size; +#endif +#ifdef QUAD_PRECISION + qgemm_p = 16 * size; + xgemm_p = 8 * size; +#endif +#else + sgemm_p = 96 * size; + dgemm_p = 48 * size; + cgemm_p = 48 * size; + zgemm_p = 24 * size; +#ifdef EXPRECISION + qgemm_p = 24 * size; + xgemm_p = 12 * size; +#endif +#ifdef QUAD_PRECISION + qgemm_p = 12 * size; + xgemm_p = 6 * size; +#endif +#endif +#endif + +#if defined(CORE_CORE2) + + size >>= 9; + + sgemm_p = 92 * size; + dgemm_p = 46 * size; + cgemm_p = 46 * size; + zgemm_p = 23 * size; + +#ifdef EXPRECISION + qgemm_p = 23 * size; + xgemm_p = 11 * size; +#endif +#ifdef QUAD_PRECISION + qgemm_p = 11 * size; + xgemm_p = 5 * size; +#endif +#endif + +#if defined(PENRYN) + + size >>= 9; + + sgemm_p = 1024; + dgemm_p = 512; + cgemm_p = 512; + zgemm_p = 256; + +#ifdef EXPRECISION + qgemm_p = 256; + xgemm_p = 128; +#endif +#ifdef QUAD_PRECISION + qgemm_p = 21 * size + 4; + xgemm_p = 10 * size + 2; +#endif +#endif + +#if defined(DUNNINGTON) + + size >>= 9; + + sgemm_p = 384; + dgemm_p = 384; + cgemm_p = 384; + zgemm_p = 384; + +#ifdef EXPRECISION + qgemm_p = 384; + xgemm_p = 384; +#endif +#ifdef QUAD_PRECISION + qgemm_p = 21 * size + 4; + xgemm_p = 10 * size + 2; +#endif +#endif + +#if defined(NEHALEM) + sgemm_p = 1024; + dgemm_p = 512; + cgemm_p = 512; + zgemm_p = 256; +#ifdef EXPRECISION + qgemm_p = 256; + xgemm_p = 128; +#endif +#endif + +#if defined(CORE_PRESCOTT) || defined(GENERIC) + size >>= 6; + + if (size > 16) size = 16; + + sgemm_p = 56 * size; + dgemm_p = 28 * size; + cgemm_p = 28 * size; + zgemm_p = 14 * size; +#ifdef EXPRECISION + qgemm_p = 14 * size; + xgemm_p = 7 * size; +#endif +#ifdef QUAD_PRECISION + qgemm_p = 7 * size; + xgemm_p = 3 * size; +#endif +#endif + +#if defined(CORE_OPTERON) + sgemm_p = 224 + 14 * (size >> 5); + dgemm_p = 112 + 14 * (size >> 6); + cgemm_p = 116 + 14 * (size >> 6); + zgemm_p = 58 + 14 * (size >> 7); +#ifdef EXPRECISION + qgemm_p = 58 + 14 * (size >> 7); + xgemm_p = 29 + 14 * (size >> 8); +#endif +#ifdef QUAD_PRECISION + qgemm_p = 29 + 14 * (size >> 8); + xgemm_p = 15 + 14 * (size >> 9); +#endif +#endif + +#if defined(ATOM) + size >>= 8; + + sgemm_p = 256; + dgemm_p = 128; + cgemm_p = 128; + zgemm_p = 64; +#ifdef EXPRECISION + qgemm_p = 64; + xgemm_p = 32; +#endif +#ifdef QUAD_PRECISION + qgemm_p = 32; + xgemm_p = 16; +#endif +#endif + +#if defined(CORE_BARCELONA) + size >>= 8; + + sgemm_p = 232 * size; + dgemm_p = 116 * size; + cgemm_p = 116 * size; + zgemm_p = 58 * size; +#ifdef EXPRECISION + qgemm_p = 58 * size; + xgemm_p = 26 * size; +#endif +#ifdef QUAD_PRECISION + qgemm_p = 26 * size; + xgemm_p = 13 * size; +#endif +#endif + + p = getenv("GOTO_BLOCK_FACTOR"); + + if (p) { + factor = atoi(p); + if (factor < 10) factor = 10; + if (factor > 200) factor = 200; + + sgemm_p = ((long)((double)sgemm_p * (double)factor * 1.e-2)) & ~7L; + dgemm_p = ((long)((double)dgemm_p * (double)factor * 1.e-2)) & ~7L; + cgemm_p = ((long)((double)cgemm_p * (double)factor * 1.e-2)) & ~7L; + zgemm_p = ((long)((double)zgemm_p * (double)factor * 1.e-2)) & ~7L; +#ifdef EXPRECISION + qgemm_p = ((long)((double)qgemm_p * (double)factor * 1.e-2)) & ~7L; + xgemm_p = ((long)((double)xgemm_p * (double)factor * 1.e-2)) & ~7L; +#endif + } + + if (sgemm_p == 0) sgemm_p = 64; + if (dgemm_p == 0) dgemm_p = 64; + if (cgemm_p == 0) cgemm_p = 64; + if (zgemm_p == 0) zgemm_p = 64; +#ifdef EXPRECISION + if (qgemm_p == 0) qgemm_p = 64; + if (xgemm_p == 0) xgemm_p = 64; +#endif + +#ifdef QUAD_PRECISION + if (qgemm_p == 0) qgemm_p = 64; + if (xgemm_p == 0) xgemm_p = 64; +#endif + + sgemm_p = (sgemm_p + SGEMM_UNROLL_M - 1) & ~(SGEMM_UNROLL_M - 1); + dgemm_p = (dgemm_p + DGEMM_UNROLL_M - 1) & ~(DGEMM_UNROLL_M - 1); + cgemm_p = (cgemm_p + CGEMM_UNROLL_M - 1) & ~(CGEMM_UNROLL_M - 1); + zgemm_p = (zgemm_p + ZGEMM_UNROLL_M - 1) & ~(ZGEMM_UNROLL_M - 1); +#ifdef QUAD_PRECISION + qgemm_p = (qgemm_p + QGEMM_UNROLL_M - 1) & ~(QGEMM_UNROLL_M - 1); + xgemm_p = (xgemm_p + XGEMM_UNROLL_M - 1) & ~(XGEMM_UNROLL_M - 1); +#endif + + sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; + dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; + cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; + zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15; +#if defined(EXPRECISION) || defined(QUAD_PRECISION) + qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15; + xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15; +#endif + +#if 0 + fprintf(stderr, "SGEMM ... %3d, %3d, %3d\n", SGEMM_P, SGEMM_Q, SGEMM_R); + fprintf(stderr, "DGEMM ... %3d, %3d, %3d\n", DGEMM_P, DGEMM_Q, DGEMM_R); + fprintf(stderr, "CGEMM ... %3d, %3d, %3d\n", CGEMM_P, CGEMM_Q, CGEMM_R); + fprintf(stderr, "ZGEMM ... %3d, %3d, %3d\n", ZGEMM_P, ZGEMM_Q, ZGEMM_R); +#endif + + return; +} + +#if 0 + +int get_current_cpu_info(void){ + + int nlprocs, ncores, cmplegacy; + int htt = 0; + int apicid = 0; + +#if defined(CORE_PRESCOTT) || defined(CORE_OPTERON) + int eax, ebx, ecx, edx; + + cpuid(1, &eax, &ebx, &ecx, &edx); + nlprocs = BITMASK(ebx, 16, 0xff); + apicid = BITMASK(ebx, 24, 0xff); + htt = BITMASK(edx, 28, 0x01); +#endif + +#if defined(CORE_PRESCOTT) + cpuid(4, &eax, &ebx, &ecx, &edx); + ncores = BITMASK(eax, 26, 0x3f); + + if (htt == 0) nlprocs = 0; +#endif + +#if defined(CORE_OPTERON) + cpuid(0x80000008, &eax, &ebx, &ecx, &edx); + ncores = BITMASK(ecx, 0, 0xff); + + cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + cmplegacy = BITMASK(ecx, 1, 0x01); + + if (htt == 0) { + nlprocs = 0; + ncores = 0; + cmplegacy = 0; + } +#endif + + ncores ++; + + fprintf(stderr, "APICID = %d Number of core = %d\n", apicid, ncores); + + return 0; +} +#endif + +#endif + +#if defined(ARCH_IA64) + +static inline BLASULONG cpuid(BLASULONG regnum){ + BLASULONG value; + +#ifndef __ECC + asm ("mov %0=cpuid[%r1]" : "=r"(value) : "rO"(regnum)); +#else + value = __getIndReg(_IA64_REG_INDR_CPUID, regnum); +#endif + + return value; +} + +#if 1 + +void blas_set_parameter(void){ + + BLASULONG cpuid3, size; + + cpuid3 = cpuid(3); + + size = BITMASK(cpuid3, 16, 0xff); + + sgemm_p = 192 * (size + 1); + dgemm_p = 96 * (size + 1); + cgemm_p = 96 * (size + 1); + zgemm_p = 48 * (size + 1); +#ifdef EXPRECISION + qgemm_p = 64 * (size + 1); + xgemm_p = 32 * (size + 1); +#endif +#ifdef QUAD_PRECISION + qgemm_p = 32 * (size + 1); + xgemm_p = 16 * (size + 1); +#endif + + sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; + dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; + cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; + zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15; +#if defined(EXPRECISION) || defined(QUAD_PRECISION) + qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15; + xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15; +#endif + + return; +} + +#else + +#define IA64_SYS_NAME "/sys/devices/system/cpu/cpu0/cache/index3/size" +#define IA64_PROC_NAME "/proc/pal/cpu0/cache_info" + +void blas_set_parameter(void){ + + BLASULONG cpuid3; + int size = 0; + +#if 1 + char buffer[128]; + FILE *infile; + + if ((infile = fopen(IA64_SYS_NAME, "r")) != NULL) { + + fgets(buffer, sizeof(buffer), infile); + fclose(infile); + + size = atoi(buffer) / 1536; + } + + if (size <= 0) { + if ((infile = fopen(IA64_PROC_NAME, "r")) != NULL) { + + while(fgets(buffer, sizeof(buffer), infile) != NULL) { + if ((!strncmp("Data/Instruction Cache level 3", buffer, 30))) break; + } + + fgets(buffer, sizeof(buffer), infile); + + fclose(infile); + + *strstr(buffer, "bytes") = (char)NULL; + + size = atoi(strchr(buffer, ':') + 1) / 1572864; + } + } +#endif + + /* The last resort */ + + if (size <= 0) { + cpuid3 = cpuid(3); + + size = BITMASK(cpuid3, 16, 0xff) + 1; + } + + sgemm_p = 320 * size; + dgemm_p = 160 * size; + cgemm_p = 160 * size; + zgemm_p = 80 * size; +#ifdef EXPRECISION + qgemm_p = 80 * size; + xgemm_p = 40 * size; +#endif + + sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; + dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; + cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; + zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15; +#ifdef EXPRECISION + qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15; + xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15; +#endif + + return; +} + +#endif + +#endif diff --git a/driver/others/profile.c b/driver/others/profile.c new file mode 100644 index 0000000..f65550c --- /dev/null +++ b/driver/others/profile.c @@ -0,0 +1,139 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" +#include +#include +#define USE_FUNCTABLE +#include "../../interface/functable.h" + +func_profile_t function_profile_table[MAX_PROF_TABLE]; + +int gotoblas_profile = 1; + +static struct sigaction sa, ig; + +void gotoblas_profile_quit(void) { + + int i; + unsigned long long calls, fops, cycles, tcycles, area; + + sigaction(SIGPROF, &ig, NULL); + + calls = 0; + fops = 0; + cycles = 0; + tcycles = 0; + area = 0; + + for (i = 0; i < MAX_PROF_TABLE; i ++) { + if (function_profile_table[i].calls) { + calls += function_profile_table[i].calls; + cycles += function_profile_table[i].cycles; + tcycles += function_profile_table[i].tcycles; + area += function_profile_table[i].area; + fops += function_profile_table[i].fops; + } + } + + if (cycles > 0) { + + fprintf(stderr, "\n\t====== BLAS Profiling Result =======\n\n"); + fprintf(stderr, " Function No. of Calls Time Consumption Efficiency Bytes/cycle\n"); + + for (i = 0; i < MAX_PROF_TABLE; i ++) { + if (function_profile_table[i].calls) { +#ifndef OS_WINDOWS + fprintf(stderr, "%-12s : %10Ld %8.2f%% %10.3f%% %8.2f\n", +#else + fprintf(stderr, "%-12s : %10lld %8.2f%% %10.3f%% %8.2f\n", +#endif + func_table[i], + function_profile_table[i].calls, + (double)function_profile_table[i].cycles / (double)cycles * 100., + (double)function_profile_table[i].fops / (double)function_profile_table[i].tcycles * 100., + (double)function_profile_table[i].area / (double)function_profile_table[i].cycles + ); + } + } + + fprintf(stderr, " --------------------------------------------------------------------\n"); + +#ifndef OS_WINDOWS + fprintf(stderr, "%-12s : %10Ld %10.3f%% %8.2f\n", +#else + fprintf(stderr, "%-12s : %10lld %10.3f%% %8.2f\n", +#endif + "Total", + calls, + (double)fops / (double)tcycles * 100., + (double)area / (double)cycles); + } + + sigaction(SIGPROF, &sa, NULL); +} + +void gotoblas_profile_clear(void) { + + int i; + + for (i = 0; i < MAX_PROF_TABLE; i ++) { + function_profile_table[i].calls = 0; + function_profile_table[i].cycles = 0; + function_profile_table[i].tcycles = 0; + function_profile_table[i].area = 0; + function_profile_table[i].fops = 0; + } + +} + +void gotoblas_profile_init(void) { + + gotoblas_profile_clear(); + + bzero(&sa, sizeof(struct sigaction)); + sa.sa_handler = (void *)gotoblas_profile_quit; + sa.sa_flags = SA_NODEFER | SA_RESETHAND; + + bzero(&ig, sizeof(struct sigaction)); + ig.sa_handler = SIG_IGN; + ig.sa_flags |= SA_NODEFER | SA_RESETHAND; + + sigaction(SIGPROF, &sa, NULL); + +} diff --git a/driver/others/xerbla.c b/driver/others/xerbla.c new file mode 100644 index 0000000..6f5170e --- /dev/null +++ b/driver/others/xerbla.c @@ -0,0 +1,70 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +#if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__)) +#include +#undef printf +#define printf _cprintf +#endif + +#ifdef __ELF__ +int __xerbla(char *message, blasint *info, blasint length){ + + printf(" ** On entry to %6s parameter number %2d had an illegal value\n", + message, *info); + + return 0; +} + +int BLASFUNC(xerbla)(char *, blasint *, blasint) __attribute__ ((weak, alias ("__xerbla"))); + +#else + +int BLASFUNC(xerbla)(char *message, blasint *info, blasint length){ + + printf(" ** On entry to %6s parameter number %2d had an illegal value\n", + message, *info); + + return 0; +} + +#endif diff --git a/exports/._Makefile b/exports/._Makefile new file mode 100644 index 0000000..7425e12 Binary files /dev/null and b/exports/._Makefile differ diff --git a/exports/._dllinit.c b/exports/._dllinit.c new file mode 100644 index 0000000..5c2cf0f Binary files /dev/null and b/exports/._dllinit.c differ diff --git a/exports/._gensymbol b/exports/._gensymbol new file mode 100755 index 0000000..a68b401 Binary files /dev/null and b/exports/._gensymbol differ diff --git a/exports/Makefile b/exports/Makefile new file mode 100644 index 0000000..00e6fed --- /dev/null +++ b/exports/Makefile @@ -0,0 +1,188 @@ +TOPDIR = .. + +include ../Makefile.system + +ifndef EXPRECISION +EXPRECISION = 0 +endif + +ifndef NO_CBLAS +NO_CBLAS = 0 +endif + +ifeq ($(OSNAME), WINNT) +ifeq ($(F_COMPILER), GFORTRAN) +EXTRALIB += -lgfortran +endif +endif + +ifeq ($(OSNAME), CYGWIN_NT) +ifeq ($(F_COMPILER), GFORTRAN) +EXTRALIB += -lgfortran +endif +endif + +all:: + +libs:: + +prof:: + +hpl:: libgoto_hpl.$(LIBSUFFIX) + +hpl_p:: libgoto_hpl_p.$(LIBSUFFIX) + +libgoto_hpl.$(LIBSUFFIX) : ../$(LIBNAME) + rm -f $(@F) + $(LD) -r $(LDFLAGS) -o goto.$(SUFFIX) --whole-archive $< --no-whole-archive + $(AR) cq $(@F) goto.$(SUFFIX) + $(RANLIB) libgoto_hpl.$(LIBSUFFIX) + +libgoto_hpl_p.$(LIBSUFFIX) : ../$(LIBNAME_P) + rm -f $(@F) + $(LD) -r $(LDFLAGS) -o goto.$(PSUFFIX) --whole-archive $< --no-whole-archive + $(AR) cq $(@F) goto.$(PSUFFIX) + $(RANLIB) libgoto_hpl_p.$(LIBSUFFIX) + +libgoto_hpl.dll : libgoto_hpl.$(LIBSUFFIX) dllinit.$(SUFFIX) libgoto_hpl.def + $(DLLWRAP) -o $(@F) --def libgoto_hpl.def --entry _dllinit -s dllinit.$(SUFFIX) --dllname libgoto_hpl.dll libgoto_hpl.$(LIBSUFFIX) + lib /machine:X64 /def:libgoto_hpl.def + +dyn : $(LIBDYNNAME) + +zip : dll + zip $(LIBZIPNAME) $(LIBDLLNAME) $(LIBNAME) + +dll : libgoto2.dll + +dll2 : libgoto2_shared.dll + +libgoto2.dll : ../$(LIBNAME) libgoto2.def dllinit.$(SUFFIX) + $(RANLIB) ../$(LIBNAME) +ifeq ($(BINARY32), 1) + $(DLLWRAP) -o $(@F) --def libgoto2.def \ + --entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) + -lib /machine:i386 /def:libgoto2.def +else + $(DLLWRAP) -o $(@F) --def libgoto2.def \ + --entry _dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) + -lib /machine:X64 /def:libgoto2.def +endif + +libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def + $(CC) $(CFLAGS) libgoto2_shared.def -shared -o $(@F) \ + -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ + -Wl,--out-implib,libgoto2_shared.lib $(FEXTRALIB) + +libgoto2.def : gensymbol + perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) > $(@F) + +libgoto2_shared.def : gensymbol + perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) > $(@F) + +libgoto_hpl.def : gensymbol + perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) > $(@F) + +$(LIBDYNNAME) : ../$(LIBNAME) osx.def + $(PREFIX)gcc $(CFLAGS) -all_load -dynamiclib -o $(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + +symbol.$(SUFFIX) : symbol.S + $(CC) $(CFLAGS) -c -o $(@F) $^ + +dllinit.$(SUFFIX) : dllinit.c + $(CC) $(CFLAGS) -c -o $(@F) -s $< + +ifeq ($(OSNAME), Linux) + +so : ../$(LIBSONAME) + +../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c + $(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \ + -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ + -Wl,--retain-symbols-file=linux.def $(EXTRALIB) + $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. + rm -f linktest + +endif + +ifeq ($(OSNAME), FreeBSD) + +so : ../$(LIBSONAME) + +../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c + $(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \ + -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ + -Wl,--retain-symbols-file=linux.def $(EXTRALIB) + $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. + rm -f linktest + +endif + +ifeq ($(OSNAME), OSF1) + +so : ../$(LIBSONAME) + +../$(LIBSONAME) : + $(CC) -shared -o ../$(LIBSONAME) ../$(LIBNAME) +endif + +ifeq ($(OSNAME), SunOS) + +so : ../$(LIBSONAME) + $(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \ + -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(EXTRALIB) + $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. + rm -f linktest + +endif + +ifeq ($(OSNAME), AIX) + +ifeq ($(COMPILER_F77), xlf) + +goto32.$(SUFFIX) : ../$(LIBNAME) aix.def + ld -o $(@F) ../$(LIBNAME) -bE:aix.def -bM:SRE -bnoexpall -bnoentry -L$(HOME)/misc/lib -lxlf90 -lc -lm -lpthread + +goto64.$(SUFFIX) : ../$(LIBNAME) aix.def + ld -b64 -o $(@F) ../$(LIBNAME) -bE:aix.def -bM:SRE -bnoexpall -bnoentry -L$(HOME)/misc/lib/ppc64 -lxlf90 -lc -lm -lpthread +else +goto32.$(SUFFIX) : ../$(LIBNAME) aix.def + ld -o $(@F) ../$(LIBNAME) -bE:aix.def -bM:SRE -bnoexpall -bnoentry -L$(HOME)/misc/lib -lg2c -lc -lm + +goto64.$(SUFFIX) : ../$(LIBNAME) aix.def + ld -b64 -o $(@F) ../$(LIBNAME) -bE:aix.def -bM:SRE -bnoexpall -bnoentry -L$(HOME)/misc/lib/ppc64 -lg2c -lc -lm +endif +endif + +static : ../$(LIBNAME) + $(LD) $(LDFLAGS) -r -o goto.$(SUFFIX) \ + --whole-archive ../$(LIBNAME) --no-whole-archive + rm -f ../$(LIBNAME) + $(AR) -cq ../$(LIBNAME) goto.$(SUFFIX) + rm -f goto.$(SUFFIX) + +linux.def : gensymbol ../Makefile.system ../getarch.c + perl ./gensymbol linux $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) > $(@F) + +osx.def : gensymbol ../Makefile.system ../getarch.c + perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) > $(@F) + +aix.def : gensymbol ../Makefile.system ../getarch.c + perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) > $(@F) + +symbol.S : gensymbol + perl ./gensymbol win2kasm noarch dummy $(EXPRECISION) $(NO_CBLAS) > symbol.S + +test : linktest.c + $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. + rm -f linktest + +linktest.c : gensymbol ../Makefile.system ../getarch.c + perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) > linktest.c + +clean :: + @rm -f *.def *.dylib __.SYMDEF* + +include ../Makefile.tail + + diff --git a/exports/dllinit.c b/exports/dllinit.c new file mode 100644 index 0000000..54ec1c3 --- /dev/null +++ b/exports/dllinit.c @@ -0,0 +1,55 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +void gotoblas_init(void); +void gotoblas_quit(void); + +BOOL APIENTRY dllinit(HINSTANCE hInst, DWORD reason, LPVOID reserved) { + + if (reason == DLL_PROCESS_ATTACH) { + gotoblas_init(); + } + + if (reason == DLL_PROCESS_DETACH) { + gotoblas_quit(); + } + + return TRUE; +} diff --git a/exports/gensymbol b/exports/gensymbol new file mode 100755 index 0000000..8455e51 --- /dev/null +++ b/exports/gensymbol @@ -0,0 +1,462 @@ +#!/usr/bin/perl + +@blasobjs = ( + caxpy,ccopy,cdotc,cdotu,cgbmv,cgemm,cgemv,cgerc,cgeru, + chbmv,chemm,chemv,cher2,cher2k,cher,cherk, + chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap, + csymm,csyr2k,csyrk,ctbmv,ctbsv,ctpmv,ctpsv,ctrmm,ctrmv,ctrsm, + ctrsv, csymv, + damax,damin,dasum,daxpy,dcabs1,dcopy,ddot,dgbmv,dgemm, + dgemv,dger,dmax,dmin,dnrm2,drot,drotg,drotm,drotmg,dsbmv, + dscal,dsdot,dspmv,dspr2, + dspr,dswap,dsymm,dsymv,dsyr2,dsyr2k,dsyr,dsyrk,dtbmv,dtbsv, + dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv,dzamax,dzamin,dzasum,dznrm2, + icamax,icamin,idamax,idamin,idmax,idmin,isamax,isamin,ismax,ismin, + izamax,izamin,lsame,samax,samin,sasum,saxpy,scabs1,scamax, + scamin,scasum,scnrm2,scopy,sdot,sdsdot,sgbmv,sgemm,sgemv,sger, + smax,smin,snrm2, + srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap, + ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv, + strmm,strmv,strsm,strsv,zaxpy,zcopy,zdotc,zdotu,zdrot, + zdscal,zgbmv,zgemm,zgemv,zgerc,zgeru, + zhbmv,zhemm,zhemv,zher2,zher2k,zher,zherk,zhpmv,zhpr2, + zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv, + ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, zsymv, + xerbla); + +@cblasobjs = ( + cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, + cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, + cblas_cher, cblas_cherk, cblas_chpmv, cblas_chpr2, cblas_chpr, cblas_cscal, + cblas_csscal, cblas_cswap, cblas_csymm, cblas_csyr2k, cblas_csyrk, cblas_ctbmv, + cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv, + cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot, + cblas_dgbmv, cblas_dgemm, cblas_dgemv, cblas_dger, cblas_dnrm2, + cblas_drot, cblas_drotg, cblas_drotm, cblas_drotmg, cblas_dsbmv, cblas_dscal, cblas_dsdot, + cblas_dspmv, cblas_dspr2, cblas_dspr, cblas_dswap, cblas_dsymm, cblas_dsymv, cblas_dsyr2, + cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv, + cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_dzasum, + cblas_dznrm2, cblas_icamax, cblas_idamax, + cblas_isamax, cblas_izamax, + cblas_sasum, cblas_saxpy, + cblas_scasum, cblas_scnrm2, cblas_scopy, cblas_sdot, cblas_sdsdot, cblas_sgbmv, cblas_sgemm, + cblas_sgemv, cblas_sger, cblas_snrm2, cblas_srot, cblas_srotg, + cblas_srotm, cblas_srotmg, cblas_ssbmv, cblas_sscal, cblas_sspmv, cblas_sspr2, cblas_sspr, + cblas_sswap, cblas_ssymm, cblas_ssymv, cblas_ssyr2, cblas_ssyr2k, cblas_ssyr, cblas_ssyrk, + cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm, + cblas_strsv, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal, + cblas_zgbmv, cblas_zgemm, cblas_zgemv, cblas_zgerc, cblas_zgeru, cblas_zhbmv, cblas_zhemm, + cblas_zhemv, cblas_zher2, cblas_zher2k, cblas_zher, cblas_zherk, cblas_zhpmv, cblas_zhpr2, + cblas_zhpr, cblas_zscal, cblas_zswap, cblas_zsymm, cblas_zsyr2k, cblas_zsyrk, + cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm, + cblas_ztrsv); + +@exblasobjs = ( + qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, + qgemv,qger,qmax,qmin, + qnrm2, + qsbmv,qscal,qspmv,qspr2, + qspr,qswap,qsymm,qsymv,qsyr2,qsyr2k,qsyr,qsyrk,qtbmv,qtbsv, + qtpmv,qtpsv,qtrmm,qtrmv,qtrsm,qtrsv, + qxamax,qxamin,qxasum,qxnrm2, + xaxpy,xcopy,xdotc,xdotu, + xqscal,xgbmv,xgemm,xgemv,xgerc,xgeru, + xhbmv,xhemm,xhemv,xher2,xher2k,xher,xherk,xhpmv,xhpr2, + xhpr,xscal,xswap,xsymm,xsyr2k,xsyrk,xtbmv, + xtbsv,xtpmv,xtpsv,xtrmm,xtrmv,xtrsm,xtrsv, +# qrot,qrotg,qrotm,qrotmg, +# xdrot,xrotg, + ); + +@gemm3mobjs = ( + zgemm3m, cgemm3m, zsymm3m, csymm3m, zhemm3m, chemm3m, + ); + +@lapackobjs = ( + sgesv, dgesv, cgesv, zgesv, + sgetf2, dgetf2, cgetf2, zgetf2, + sgetrf, dgetrf, cgetrf, zgetrf, + slaswp, dlaswp, claswp, zlaswp, + sgetrs, dgetrs, cgetrs, zgetrs, + slauu2, dlauu2, clauu2, zlauu2, + slauum, dlauum, clauum, zlauum, + spotf2, dpotf2, cpotf2, zpotf2, + spotrf, dpotrf, cpotrf, zpotrf, + strti2, dtrti2, ctrti2, ztrti2, + strtri, dtrtri, ctrtri, ztrtri, + spotri, dpotri, cpotri, zpotri, + ); + +@lapackobjs2 = ( + sgbbrd, sgbcon, sgbequ, sgbrfs, sgbsv, + sgbsvx, sgbtf2, sgbtrf, sgbtrs, sgebak, sgebal, sgebd2, + sgebrd, sgecon, sgeequ, sgees, sgeesx, sgeev, sgeevx, + sgegs, sgegv, sgehd2, sgehrd, sgelq2, sgelqf, + sgels, sgelsd, sgelss, sgelsx, sgelsy, sgeql2, sgeqlf, + sgeqp3, sgeqpf, sgeqr2, sgeqrf, sgerfs, sgerq2, sgerqf, + sgesc2, sgesdd, sgesvd, sgesvx, sgetc2, + sgetri, + sggbak, sggbal, sgges, sggesx, sggev, sggevx, + sggglm, sgghrd, sgglse, sggqrf, + sggrqf, sggsvd, sggsvp, sgtcon, sgtrfs, sgtsv, + sgtsvx, sgttrf, sgttrs, sgtts2, shgeqz, + shsein, shseqr, slabrd, slacon, slacn2, + slaein, slaexc, slag2, slags2, slagtm, slagv2, slahqr, + slahrd, slahr2, slaic1, slaln2, slals0, slalsa, slalsd, + slangb, slange, slangt, slanhs, slansb, slansp, + slansy, slantb, slantp, slantr, slanv2, + slapll, slapmt, + slaqgb, slaqge, slaqp2, slaqps, slaqsb, slaqsp, slaqsy, + slaqr0, slaqr1, slaqr2, slaqr3, slaqr4, slaqr5, + slaqtr, slar1v, slar2v, + slarf, slarfb, slarfg, slarft, slarfx, slargv, + slarrv, slartv, + slarz, slarzb, slarzt, slasy2, slasyf, + slatbs, slatdf, slatps, slatrd, slatrs, slatrz, slatzm, + sopgtr, sopmtr, sorg2l, sorg2r, + sorgbr, sorghr, sorgl2, sorglq, sorgql, sorgqr, sorgr2, + sorgrq, sorgtr, sorm2l, sorm2r, + sormbr, sormhr, sorml2, sormlq, sormql, sormqr, sormr2, + sormr3, sormrq, sormrz, sormtr, spbcon, spbequ, spbrfs, + spbstf, spbsv, spbsvx, + spbtf2, spbtrf, spbtrs, spocon, spoequ, sporfs, sposv, + sposvx, spotrs, sppcon, sppequ, + spprfs, sppsv, sppsvx, spptrf, spptri, spptrs, sptcon, + spteqr, sptrfs, sptsv, sptsvx, spttrs, sptts2, srscl, + ssbev, ssbevd, ssbevx, ssbgst, ssbgv, ssbgvd, ssbgvx, + ssbtrd, sspcon, sspev, sspevd, sspevx, sspgst, + sspgv, sspgvd, sspgvx, ssprfs, sspsv, sspsvx, ssptrd, + ssptrf, ssptri, ssptrs, sstegr, sstein, sstev, sstevd, sstevr, + sstevx, ssycon, ssyev, ssyevd, ssyevr, ssyevx, ssygs2, + ssygst, ssygv, ssygvd, ssygvx, ssyrfs, ssysv, ssysvx, + ssytd2, ssytf2, ssytrd, ssytrf, ssytri, ssytrs, stbcon, + stbrfs, stbtrs, stgevc, stgex2, stgexc, stgsen, + stgsja, stgsna, stgsy2, stgsyl, stpcon, stprfs, stptri, + stptrs, + strcon, strevc, strexc, strrfs, strsen, strsna, strsyl, + strtrs, stzrqf, stzrzf, sstemr, + + cbdsqr, cgbbrd, cgbcon, cgbequ, cgbrfs, cgbsv, cgbsvx, + cgbtf2, cgbtrf, cgbtrs, cgebak, cgebal, cgebd2, cgebrd, + cgecon, cgeequ, cgees, cgeesx, cgeev, cgeevx, + cgegs, cgegv, cgehd2, cgehrd, cgelq2, cgelqf, + cgels, cgelsd, cgelss, cgelsx, cgelsy, cgeql2, cgeqlf, cgeqp3, + cgeqpf, cgeqr2, cgeqrf, cgerfs, cgerq2, cgerqf, + cgesc2, cgesdd, cgesvd, cgesvx, cgetc2, + cgetri, + cggbak, cggbal, cgges, cggesx, cggev, cggevx, cggglm, + cgghrd, cgglse, cggqrf, cggrqf, + cggsvd, cggsvp, + cgtcon, cgtrfs, cgtsv, cgtsvx, cgttrf, cgttrs, cgtts2, chbev, + chbevd, chbevx, chbgst, chbgv, chbgvd, chbgvx, chbtrd, + checon, cheev, cheevd, cheevr, cheevx, chegs2, chegst, + chegv, chegvd, chegvx, cherfs, chesv, chesvx, chetd2, + chetf2, chetrd, + chetrf, chetri, chetrs, chgeqz, chpcon, chpev, chpevd, + chpevx, chpgst, chpgv, chpgvd, chpgvx, chprfs, chpsv, + chpsvx, + chptrd, chptrf, chptri, chptrs, chsein, chseqr, clabrd, + clacgv, clacon, clacn2, clacp2, clacpy, clacrm, clacrt, cladiv, + claed0, claed7, claed8, + claein, claesy, claev2, clags2, clagtm, + clahef, clahqr, + clahrd, clahr2, claic1, clals0, clalsa, clalsd, clangb, clange, clangt, + clanhb, clanhe, + clanhp, clanhs, clanht, clansb, clansp, clansy, clantb, + clantp, clantr, clapll, clapmt, clarcm, claqgb, claqge, + claqhb, claqhe, claqhp, claqp2, claqps, claqsb, + claqr0, claqr1, claqr2, claqr3, claqr4, claqr5, + claqsp, claqsy, clar1v, clar2v, clarf, clarfb, clarfg, clarft, + clarfx, clargv, clarnv, clarrv, clartg, clartv, + clarz, clarzb, clarzt, clascl, claset, clasr, classq, + clasyf, clatbs, clatdf, clatps, clatrd, clatrs, clatrz, + clatzm, cpbcon, cpbequ, cpbrfs, cpbstf, cpbsv, + cpbsvx, cpbtf2, cpbtrf, cpbtrs, cpocon, cpoequ, cporfs, + cposv, cposvx, cpotrs, cppcon, + cppequ, cpprfs, cppsv, cppsvx, cpptrf, cpptri, cpptrs, + cptcon, cpteqr, cptrfs, cptsv, cptsvx, cpttrf, cpttrs, cptts2, + crot, cspcon, cspmv, cspr, csprfs, cspsv, + cspsvx, csptrf, csptri, csptrs, csrscl, cstedc, + cstegr, cstein, csteqr, csycon, + csyr, csyrfs, csysv, csysvx, csytf2, csytrf, csytri, + csytrs, ctbcon, ctbrfs, ctbtrs, ctgevc, ctgex2, + ctgexc, ctgsen, ctgsja, ctgsna, ctgsy2, ctgsyl, ctpcon, + ctprfs, ctptri, + ctptrs, ctrcon, ctrevc, ctrexc, ctrrfs, ctrsen, ctrsna, + ctrsyl, ctrtrs, ctzrqf, ctzrzf, cung2l, cung2r, + cungbr, cunghr, cungl2, cunglq, cungql, cungqr, cungr2, + cungrq, cungtr, cunm2l, cunm2r, cunmbr, cunmhr, cunml2, + cunmlq, cunmql, cunmqr, cunmr2, cunmr3, cunmrq, cunmrz, + cunmtr, cupgtr, cupmtr, icmax1, scsum1, cstemr, + + dgbbrd, dgbcon, dgbequ, dgbrfs, dgbsv, + dgbsvx, dgbtf2, dgbtrf, dgbtrs, dgebak, dgebal, dgebd2, + dgebrd, dgecon, dgeequ, dgees, dgeesx, dgeev, dgeevx, + dgegs, dgegv, dgehd2, dgehrd, dgelq2, dgelqf, + dgels, dgelsd, dgelss, dgelsx, dgelsy, dgeql2, dgeqlf, + dgeqp3, dgeqpf, dgeqr2, dgeqrf, dgerfs, dgerq2, dgerqf, + dgesc2, dgesdd, dgesvd, dgesvx, dgetc2, + dgetri, + dggbak, dggbal, dgges, dggesx, dggev, dggevx, + dggglm, dgghrd, dgglse, dggqrf, + dggrqf, dggsvd, dggsvp, dgtcon, dgtrfs, dgtsv, + dgtsvx, dgttrf, dgttrs, dgtts2, dhgeqz, + dhsein, dhseqr, dlabrd, dlacon, dlacn2, + dlaein, dlaexc, dlag2, dlags2, dlagtm, dlagv2, dlahqr, + dlahrd, dlahr2, dlaic1, dlaln2, dlals0, dlalsa, dlalsd, + dlangb, dlange, dlangt, dlanhs, dlansb, dlansp, + dlansy, dlantb, dlantp, dlantr, dlanv2, + dlapll, dlapmt, + dlaqgb, dlaqge, dlaqp2, dlaqps, dlaqsb, dlaqsp, dlaqsy, + dlaqr0, dlaqr1, dlaqr2, dlaqr3, dlaqr4, dlaqr5, + dlaqtr, dlar1v, dlar2v, + dlarf, dlarfb, dlarfg, dlarft, dlarfx, dlargv, + dlarrv, dlartv, + dlarz, dlarzb, dlarzt, dlasy2, dlasyf, + dlatbs, dlatdf, dlatps, dlatrd, dlatrs, dlatrz, dlatzm, + dopgtr, dopmtr, dorg2l, dorg2r, + dorgbr, dorghr, dorgl2, dorglq, dorgql, dorgqr, dorgr2, + dorgrq, dorgtr, dorm2l, dorm2r, + dormbr, dormhr, dorml2, dormlq, dormql, dormqr, dormr2, + dormr3, dormrq, dormrz, dormtr, dpbcon, dpbequ, dpbrfs, + dpbstf, dpbsv, dpbsvx, + dpbtf2, dpbtrf, dpbtrs, dpocon, dpoequ, dporfs, dposv, + dposvx, dpotrs, dppcon, dppequ, + dpprfs, dppsv, dppsvx, dpptrf, dpptri, dpptrs, dptcon, + dpteqr, dptrfs, dptsv, dptsvx, dpttrs, dptts2, drscl, + dsbev, dsbevd, dsbevx, dsbgst, dsbgv, dsbgvd, dsbgvx, + dsbtrd, dspcon, dspev, dspevd, dspevx, dspgst, + dspgv, dspgvd, dspgvx, dsprfs, dspsv, dspsvx, dsptrd, + dsptrf, dsptri, dsptrs, dstegr, dstein, dstev, dstevd, dstevr, + dstevx, dsycon, dsyev, dsyevd, dsyevr, + dsyevx, dsygs2, dsygst, dsygv, dsygvd, dsygvx, dsyrfs, + dsysv, dsysvx, + dsytd2, dsytf2, dsytrd, dsytrf, dsytri, dsytrs, dtbcon, + dtbrfs, dtbtrs, dtgevc, dtgex2, dtgexc, dtgsen, + dtgsja, dtgsna, dtgsy2, dtgsyl, dtpcon, dtprfs, dtptri, + dtptrs, + dtrcon, dtrevc, dtrexc, dtrrfs, dtrsen, dtrsna, dtrsyl, + dtrtrs, dtzrqf, dtzrzf, dstemr, + dsgesv, dlag2s, slag2d, + + zbdsqr, zgbbrd, zgbcon, zgbequ, zgbrfs, zgbsv, zgbsvx, + zgbtf2, zgbtrf, zgbtrs, zgebak, zgebal, zgebd2, zgebrd, + zgecon, zgeequ, zgees, zgeesx, zgeev, zgeevx, + zgegs, zgegv, zgehd2, zgehrd, zgelq2, zgelqf, + zgels, zgelsd, zgelss, zgelsx, zgelsy, zgeql2, zgeqlf, zgeqp3, + zgeqpf, zgeqr2, zgeqrf, zgerfs, zgerq2, zgerqf, + zgesc2, zgesdd, zgesvd, zgesvx, zgetc2, + zgetri, + zggbak, zggbal, zgges, zggesx, zggev, zggevx, zggglm, + zgghrd, zgglse, zggqrf, zggrqf, + zggsvd, zggsvp, + zgtcon, zgtrfs, zgtsv, zgtsvx, zgttrf, zgttrs, zgtts2, zhbev, + zhbevd, zhbevx, zhbgst, zhbgv, zhbgvd, zhbgvx, zhbtrd, + zhecon, zheev, zheevd, zheevr, zheevx, zhegs2, zhegst, + zhegv, zhegvd, zhegvx, zherfs, zhesv, zhesvx, zhetd2, + zhetf2, zhetrd, + zhetrf, zhetri, zhetrs, zhgeqz, zhpcon, zhpev, zhpevd, + zhpevx, zhpgst, zhpgv, zhpgvd, zhpgvx, zhprfs, zhpsv, + zhpsvx, + zhptrd, zhptrf, zhptri, zhptrs, zhsein, zhseqr, zlabrd, + zlacgv, zlacon, zlacn2, zlacp2, zlacpy, zlacrm, zlacrt, zladiv, + zlaed0, zlaed7, zlaed8, + zlaein, zlaesy, zlaev2, zlags2, zlagtm, + zlahef, zlahqr, + zlahrd, zlahr2, zlaic1, zlals0, zlalsa, zlalsd, zlangb, zlange, + zlangt, zlanhb, + zlanhe, + zlanhp, zlanhs, zlanht, zlansb, zlansp, zlansy, zlantb, + zlantp, zlantr, zlapll, zlapmt, zlaqgb, zlaqge, + zlaqhb, zlaqhe, zlaqhp, zlaqp2, zlaqps, zlaqsb, + zlaqr0, zlaqr1, zlaqr2, zlaqr3, zlaqr4, zlaqr5, + zlaqsp, zlaqsy, zlar1v, zlar2v, zlarcm, zlarf, zlarfb, + zlarfg, zlarft, + zlarfx, zlargv, zlarnv, zlarrv, zlartg, zlartv, + zlarz, zlarzb, zlarzt, zlascl, zlaset, zlasr, + zlassq, zlasyf, + zlatbs, zlatdf, zlatps, zlatrd, zlatrs, zlatrz, zlatzm, + zpbcon, zpbequ, zpbrfs, zpbstf, zpbsv, + zpbsvx, zpbtf2, zpbtrf, zpbtrs, zpocon, zpoequ, zporfs, + zposv, zposvx, zpotrs, zppcon, + zppequ, zpprfs, zppsv, zppsvx, zpptrf, zpptri, zpptrs, + zptcon, zpteqr, zptrfs, zptsv, zptsvx, zpttrf, zpttrs, zptts2, + zrot, zspcon, zspmv, zspr, zsprfs, zspsv, + zspsvx, zsptrf, zsptri, zsptrs, zdrscl, zstedc, + zstegr, zstein, zsteqr, zsycon, + zsyr, zsyrfs, zsysv, zsysvx, zsytf2, zsytrf, zsytri, + zsytrs, ztbcon, ztbrfs, ztbtrs, ztgevc, ztgex2, + ztgexc, ztgsen, ztgsja, ztgsna, ztgsy2, ztgsyl, ztpcon, + ztprfs, ztptri, + ztptrs, ztrcon, ztrevc, ztrexc, ztrrfs, ztrsen, ztrsna, + ztrsyl, ztrtrs, ztzrqf, ztzrzf, zung2l, + zung2r, zungbr, zunghr, zungl2, zunglq, zungql, zungqr, zungr2, + zungrq, zungtr, zunm2l, zunm2r, zunmbr, zunmhr, zunml2, + zunmlq, zunmql, zunmqr, zunmr2, zunmr3, zunmrq, zunmrz, + zunmtr, zupgtr, + zupmtr, izmax1, dzsum1, zstemr, + zcgesv, zlag2c, clag2z, + ); + +if (-d "../lapack-3.1.1") { + @objs = (@blasobjs, @lapackobjs, @lapackobjs2); +} else { + @objs = (@blasobjs, @lapackobjs); +} + +if ($ARGV[3] == 1){ @objs = (@objs, @exblasobjs); }; + +if ($ARGV[1] eq "X86_64"){ @objs = (@objs, @gemm3mobjs); }; + +if ($ARGV[1] eq "x86"){ @objs = (@objs, @gemm3mobjs); }; + +if ($ARGV[1] eq "ia64"){ @objs = (@objs, @gemm3mobjs); }; + +if ($ARGV[1] eq "MIPS"){ @objs = (@objs, @gemm3mobjs); }; + +@linuxobjs = ('__strtol_internal', 'exit', 'free', 'getenv', 'malloc', + 'mmap', 'printf', 'sqrt', + 'pthread_cond_broadcast', 'pthread_cond_destroy', + 'pthread_cond_init', 'pthread_cond_signal', 'pthread_cond_wait', + 'pthread_create', 'pthread_exit', 'pthread_join', + 'pthread_mutex_destroy', 'pthread_mutex_init', + 'pthread_mutex_lock', 'pthread_mutex_unlock'); + +@hplobjs = (daxpy, dcopy, dscal, idamax, dgemv, dtrsv, dger, dgemm, dtrsm); +@hplobjs2 = (HPL_dlaswp00N, HPL_dlaswp01N, HPL_dlaswp01T); + +$bu = $ARGV[2]; + +$bu = "" if (($bu eq "0") || ($bu eq "1")); + +if ($ARGV[0] eq "linux"){ + foreach $objs (@objs) { + print $objs, $bu, "\n"; + } + + if ($ARGV[4] == 0) { + foreach $objs (@cblasobjs) { + print $objs, "\n"; + } + } + + foreach $objs (@linuxobjs) { + print $objs, "\n"; + } + exit(0); +} + +if ($ARGV[0] eq "osx"){ + foreach $objs (@objs) { + print "_", $objs, $bu, "\n"; + } + + if ($ARGV[4] == 0) { + foreach $objs (@cblasobjs) { + print "_", $objs, "\n"; + } + } + exit(0); +} + +if ($ARGV[0] eq "aix"){ + foreach $objs (@objs) { + print $objs, $bu, "\n"; + } + + if ($ARGV[4] == 0) { + foreach $objs (@cblasobjs) { + print $objs, "\n"; + } + } + exit(0); +} + +if ($ARGV[0] eq "win2k"){ + print "EXPORTS\n"; + $count = 1; + foreach $objs (@objs) { + $uppercase = $objs; + $uppercase =~ tr/[a-z]/[A-Z]/; + print "\t$objs=$objs","_ \@", $count, "\n"; + $count ++; + print "\t",$objs, "_=$objs","_ \@", $count, "\n"; + $count ++; + print "\t$uppercase=$objs", "_ \@", $count, "\n"; + $count ++; + } + + exit(0); +} + +if ($ARGV[0] eq "win2khpl"){ + print "EXPORTS\n"; + $count = 1; + foreach $objs (@hplobjs) { + $uppercase = $objs; + $uppercase =~ tr/[a-z]/[A-Z]/; + print "\t$objs=$objs","_ \@", $count, "\n"; + $count ++; + print "\t",$objs, "_=$objs","_ \@", $count, "\n"; + $count ++; + print "\t$uppercase=$objs", "_ \@", $count, "\n"; + $count ++; + } + +# foreach $objs (@hplobjs2) { +# print "\t$objs=$objs"," \@", $count, "\n"; +# $count ++; +# } + + exit(0); +} + +if ($ARGV[0] eq "microsoft"){ + print "EXPORTS\n"; + $count = 1; + foreach $objs (@objs) { + $uppercase = $objs; + $uppercase =~ tr/[a-z]/[A-Z]/; + print "\t$objs = $objs","_\n"; + $count ++; + print "\t$objs\_ = $objs","_\n"; + $count ++; + print "\t$uppercase = $objs","_\n"; + $count ++; + print "\t$uppercase\_ = $objs","_\n"; + $count ++; + } + exit(0); +} + +if ($ARGV[0] eq "win2kasm"){ + print "\t.text\n"; + foreach $objs (@objs) { + $uppercase = $objs; + $uppercase =~ tr/[a-z]/[A-Z]/; + print "\t.align 16\n"; + print "\t.globl _", $uppercase, "_\n"; + print "_", $uppercase, "_:\n"; + print "\tjmp\t_", $objs, "_\n"; + } + exit(0); +} + +if ($ARGV[0] eq "linktest"){ + print "int main(void){\n"; + foreach $objs (@objs) { + print $objs, $bu, "();\n" if $objs ne "xerbla"; + } + if ($ARGV[4] == 0) { + foreach $objs (@cblasobjs) { + print $objs, "();\n"; + } + } + + + + + print "return 0;}\n"; + exit(0); +} + diff --git a/f_check b/f_check new file mode 100755 index 0000000..26c57bc --- /dev/null +++ b/f_check @@ -0,0 +1,302 @@ +#!/usr/bin/perl + +# +# 1. Not specified +# 1.1 Automatically detect, then check compiler +# 1.2 If no fortran compiler is detected, g77 is default with NOFORTRAN definition +# 2. Specified +# 2.1 If path is correct, check compiler +# 2.2 If path is not correct, but still valid compiler name, force setting +# 2.2.2 Path is not correct, invalid compiler name, then g77 is default with NOFORTRAN definition +# + +$makefile = shift(@ARGV); +$config = shift(@ARGV); + +$nofortran = 0; + +$compiler = join(" ", @ARGV); + +# f77 is too ambiguous +$compiler = "" if $compiler eq "f77"; + +@path = split(/:/, $ENV{"PATH"}); + +if ($compiler eq "") { + + @lists = ("f77", "g77", "g95", "gfortran", "frt", "fort", "openf90", "openf95", + "sunf77", "sunf90", "sunf95", + "xlf95", "xlf90", "xlf", + "ppuf77", "ppuf95", "ppuf90", "ppuxlf", + "pathf90", "pathf95", + "pgf95", "pgf90", "pgf77", + "ifort"); + + foreach $lists (@lists) { + foreach $path (@path) { + if (-f $path . "/" . $lists) { + $compiler = $lists; + break; + } + } + } + +} + +if ($compiler eq "") { + + $nofortran = 1; + $compiler = "g77"; + $vendor = G77; + $bu = "_"; + +} else { + + $data = `which $compiler > /dev/null 2> /dev/null`; + $vendor = ""; + + if (!$?) { + + $data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s`; + + if ($data =~ /zhoge_/) { + $bu = "_"; + } + + if ($data =~ /GNU/) { + + $data =~ /(\d)\.(\d).(\d)/; + $major = $1; + $minor = $2; + + if ($major >= 4) { + $vendor = GFORTRAN; + $openmp = "-fopenmp"; + } else { + $vendor = G77; + $openmp = ""; + } + + } + + if ($data =~ /g95/) { + $vendor = G95; + $openmp = ""; + } + + if ($data =~ /Intel/) { + $vendor = INTEL; + $openmp = "-openmp"; + } + + if ($data =~ /Sun Fortran/) { + $vendor = SUN; + $openmp = "-xopenmp=parallel"; + } + + if ($data =~ /PathScale/) { + $vendor = PATHSCALE; + $openmp = "-openmp"; + } + + if ($data =~ /Open64/) { + $vendor = OPEN64; + $openmp = "-mp"; + } + + if ($data =~ /PGF/) { + $vendor = PGI; + $openmp = "-mp"; + } + + if ($data =~ /IBM/) { + $vendor = IBM; + $openmp = "-openmp"; + } + } + + if ($vendor eq "") { + + if ($compiler =~ /g77/) { + $vendor = G77; + $bu = "_"; + $openmp = ""; + } + + if ($compiler =~ /g95/) { + $vendor = G95; + $bu = "_"; + $openmp = ""; + } + + if ($compiler =~ /gfortran/) { + $vendor = GFORTRAN; + $bu = "_"; + $openmp = "-fopenmp"; + } + + if ($compiler =~ /ifort/) { + $vendor = INTEL; + $bu = "_"; + $openmp = "-openmp"; + } + + if ($compiler =~ /pathf/) { + $vendor = PATHSCALE; + $bu = "_"; + $openmp = "-mp"; + } + + if ($compiler =~ /pgf/) { + $vendor = PGI; + $bu = "_"; + $openmp = "-mp"; + } + + if ($compiler =~ /ftn/) { + $vendor = PGI; + $bu = "_"; + $openmp = "-openmp"; + } + + if ($compiler =~ /frt/) { + $vendor = FUJITSU; + $bu = "_"; + $openmp = "-openmp"; + } + + if ($compiler =~ /sunf77|sunf90|sunf95/) { + $vendor = SUN; + $bu = "_"; + $openmp = "-xopenmp=parallel"; + } + + if ($compiler =~ /ppuf/) { + $vendor = IBM; + $openmp = "-openmp"; + } + + if ($compiler =~ /xlf/) { + $vendor = IBM; + $openmp = "-openmp"; + } + + if ($compiler =~ /open64/) { + $vendor = OPEN64; + $openmp = "-mp"; + } + + if ($vendor eq "") { + $nofortran = 1; + $compiler = "g77"; + $vendor = G77; + $bu = "_"; + $openmp = ""; + } + + } +} + +$data = `which $compiler > /dev/null 2> /dev/null`; + +if (!$?) { + + $binary = $ENV{"BINARY"}; + + $openmp = "" if $ENV{USE_OPENMP} != 1; + + if ($binary == 32) { + $link = `$compiler $openmp -m32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; + if ($?) { + $link = `$compiler $openmp -q32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; + } + $binary = "" if ($?); + } + + if ($binary == 64) { + $link = `$compiler $openmp -m64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; + if ($?) { + $link = `$compiler $openmp -q64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; + } + $binary = "" if ($?); + } + + if ($binary eq "") { + $link = `$compiler $openmp -v ftest2.f 2>&1 && rm -f a.out a.exe`; + } +} + +$linker_L = ""; +$linker_l = ""; +$linker_a = ""; + +if ($link ne "") { + + $link =~ s/\-Y\sP\,/\-Y/g; + + $link =~ s/\-rpath\s+/\-rpath\@/g; + + @flags = split(/[\s\,\n]/, $link); + + foreach $flags (@flags) { + if ( + ($flags =~ /^\-L/) + && ($flags !~ /^-LIST:/) + && ($flags !~ /^-LANG:/) + ) { + if ($vendor eq "PGI") { + $flags =~ s/lib$/libso/; + } + $linker_L .= $flags . " "; + } + + if ($flags =~ /^\-Y/) { + $linker_L .= "-Wl,". $flags . " "; + } + + if ($flags =~ /^\-rpath/) { + $flags =~ s/\@/\,/g; + if ($vendor eq "PGI") { + $flags =~ s/lib$/libso/; + } + $linker_L .= "-Wl,". $flags . " " ; + } + + if ( + ($flags =~ /^\-l/) + && ($flags !~ /gfortranbegin/) + && ($flags !~ /frtbegin/) + && ($flags !~ /pathfstart/) + && ($flags !~ /numa/) + && ($flags !~ /crt[0-9]/) + && ($flags !~ /gcc/) + && ($flags !~ /user32/) + && ($flags !~ /kernel32/) + && ($flags !~ /advapi32/) + && ($flags !~ /shell32/) + ) { + $linker_l .= $flags . " "; + } + + $linker_a .= $flags . " " if $flags =~ /\.a$/; + } + +} + +open(MAKEFILE, ">> $makefile") || die "Can't append $makefile"; +open(CONFFILE, ">> $config" ) || die "Can't append $config"; + +print MAKEFILE "F_COMPILER=$vendor\n"; +print MAKEFILE "FC=$compiler\n"; +print MAKEFILE "BU=$bu\n" if $bu ne ""; +print MAKEFILE "NOFORTRAN=1\n" if $nofortran == 1; + +print CONFFILE "#define BUNDERSCORE\t$bu\n" if $bu ne ""; +print CONFFILE "#define NEEDBUNDERSCORE\t1\n" if $bu ne ""; + +if (($linker_l ne "") || ($linker_a ne "")) { + print MAKEFILE "FEXTRALIB=$linker_L $linker_l $linker_a\n"; +} + +close(MAKEFILE); +close(CONFFILE); diff --git a/ftest.f b/ftest.f new file mode 100644 index 0000000..94ba566 --- /dev/null +++ b/ftest.f @@ -0,0 +1,6 @@ + double complex function zhoge() + + zhoge = (0.0d0,0.0d0) + + return + end diff --git a/ftest2.f b/ftest2.f new file mode 100644 index 0000000..1d9a114 --- /dev/null +++ b/ftest2.f @@ -0,0 +1,3 @@ + program main + + end diff --git a/getarch.c b/getarch.c new file mode 100644 index 0000000..347dbb1 --- /dev/null +++ b/getarch.c @@ -0,0 +1,732 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#if defined(__WIN32__) || defined(__WIN64__) || defined(__CYGWIN32__) || defined(__CYGWIN64__) +#define OS_WINDOWS +#endif + +#include +#include +#ifdef OS_WINDOWS +#include +#endif +#if defined(__FreeBSD__) || defined(__APPLE__) +#include +#include +#endif +#ifdef linux +#include +#endif + +/* #define FORCE_P2 */ +/* #define FORCE_KATMAI */ +/* #define FORCE_COPPERMINE */ +/* #define FORCE_NORTHWOOD */ +/* #define FORCE_PRESCOTT */ +/* #define FORCE_BANIAS */ +/* #define FORCE_YONAH */ +/* #define FORCE_CORE2 */ +/* #define FORCE_PENRYN */ +/* #define FORCE_DUNNINGTON */ +/* #define FORCE_NEHALEM */ +/* #define FORCE_ATHLON */ +/* #define FORCE_OPTERON */ +/* #define FORCE_OPTERON_SSE3 */ +/* #define FORCE_BARCELONA */ +/* #define FORCE_SHANGHAI */ +/* #define FORCE_ISTANBUL */ +/* #define FORCE_SSE_GENERIC */ +/* #define FORCE_VIAC3 */ +/* #define FORCE_NANO */ +/* #define FORCE_POWER3 */ +/* #define FORCE_POWER4 */ +/* #define FORCE_POWER5 */ +/* #define FORCE_POWER6 */ +/* #define FORCE_PPCG4 */ +/* #define FORCE_PPC970 */ +/* #define FORCE_PPC970MP */ +/* #define FORCE_PPC440 */ +/* #define FORCE_PPC440FP2 */ +/* #define FORCE_CELL */ +/* #define FORCE_SICORTEX */ +/* #define FORCE_ITANIUM2 */ +/* #define FORCE_GENERIC */ +/* #define FORCE_SPARC */ +/* #define FORCE_SPARCV7 */ + +#ifdef FORCE_P2 +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "PENTIUM2" +#define ARCHCONFIG "-DPENTIUM2 " \ + "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ + "-DDTB_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX" +#define LIBNAME "p2" +#define CORENAME "P5" +#endif + +#ifdef FORCE_COPPERMINE +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "PENTIUM3" +#define ARCHCONFIG "-DPENTIUM3 " \ + "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ + "-DDTB_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE " +#define LIBNAME "coppermine" +#define CORENAME "COPPERMINE" +#endif + +#ifdef FORCE_KATMAI +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "PENTIUM3" +#define ARCHCONFIG "-DPENTIUM3 " \ + "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \ + "-DDTB_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE " +#define LIBNAME "katmai" +#define CORENAME "KATMAI" +#endif + +#ifdef FORCE_NORTHWOOD +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "PENTIUM4" +#define ARCHCONFIG "-DPENTIUM4 " \ + "-DL1_DATA_SIZE=8192 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ + "-DDTB_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 " +#define LIBNAME "northwood" +#define CORENAME "NORTHWOOD" +#endif + +#ifdef FORCE_PRESCOTT +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "PENTIUM4" +#define ARCHCONFIG "-DPENTIUM4 " \ + "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ + "-DDTB_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3" +#define LIBNAME "prescott" +#define CORENAME "PRESCOTT" +#endif + +#ifdef FORCE_BANIAS +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "BANIAS" +#define ARCHCONFIG "-DPENTIUMM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ + "-DDTB_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 " +#define LIBNAME "banias" +#define CORENAME "BANIAS" +#endif + +#ifdef FORCE_YONAH +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "YONAH" +#define ARCHCONFIG "-DPENTIUMM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ + "-DDTB_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 " +#define LIBNAME "yonah" +#define CORENAME "YONAH" +#endif + +#ifdef FORCE_CORE2 +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "CONRORE" +#define ARCHCONFIG "-DCORE2 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ + "-DDTB_ENTRIES=256 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3" +#define LIBNAME "core2" +#define CORENAME "CORE2" +#endif + +#ifdef FORCE_PENRYN +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "PENRYN" +#define ARCHCONFIG "-DPENRYN " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ + "-DDTB_ENTRIES=256 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1" +#define LIBNAME "penryn" +#define CORENAME "PENRYN" +#endif + +#ifdef FORCE_DUNNINGTON +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "DUNNINGTON" +#define ARCHCONFIG "-DDUNNINGTON " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ + "-DL3_SIZE=16777216 -DL3_LINESIZE=64 " \ + "-DDTB_ENTRIES=256 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1" +#define LIBNAME "dunnington" +#define CORENAME "DUNNINGTON" +#endif + +#ifdef FORCE_NEHALEM +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#endif + +#ifdef FORCE_ATOM +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "ATOM" +#define ARCHCONFIG "-DATOM " \ + "-DL1_DATA_SIZE=24576 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ + "-DDTB_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3" +#define LIBNAME "atom" +#define CORENAME "ATOM" +#endif + +#ifdef FORCE_ATHLON +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "ATHLON" +#define ARCHCONFIG "-DATHLON " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ + "-DDTB_ENTRIES=32 -DDTB_SIZE=4096 -DHAVE_3DNOW " \ + "-DHAVE_3DNOWEX -DHAVE_MMX -DHAVE_SSE " +#define LIBNAME "athlon" +#define CORENAME "ATHLON" +#endif + +#ifdef FORCE_OPTERON +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "OPTERON" +#define ARCHCONFIG "-DOPTERON " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ + "-DDTB_ENTRIES=32 -DDTB_SIZE=4096 -DHAVE_3DNOW " \ + "-DHAVE_3DNOWEX -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 " +#define LIBNAME "opteron" +#define CORENAME "OPTERON" +#endif + +#ifdef FORCE_OPTERON_SSE3 +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "OPTERON" +#define ARCHCONFIG "-DOPTERON " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ + "-DDTB_ENTRIES=32 -DDTB_SIZE=4096 -DHAVE_3DNOW " \ + "-DHAVE_3DNOWEX -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3" +#define LIBNAME "opteron" +#define CORENAME "OPTERON" +#endif + +#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "BARCELONA" +#define ARCHCONFIG "-DBARCELONA " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL3_SIZE=2097152 " \ + "-DDTB_ENTRIES=48 -DDTB_SIZE=4096 -DHAVE_3DNOW " \ + "-DHAVE_3DNOWEX -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ + "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU" +#define LIBNAME "barcelona" +#define CORENAME "BARCELONA" +#endif + +#ifdef FORCE_SSE_GENERIC +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "GENERIC" +#define ARCHCONFIG "-DGENERIC " \ + "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ + "-DDTB_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2" +#define LIBNAME "generic" +#define CORENAME "GENERIC" +#endif + +#ifdef FORCE_VIAC3 +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "VIAC3" +#define ARCHCONFIG "-DVIAC3 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=65536 -DL2_LINESIZE=32 " \ + "-DDTB_ENTRIES=128 -DDTB_SIZE=4096 " \ + "-DHAVE_MMX -DHAVE_SSE " +#define LIBNAME "viac3" +#define CORENAME "VIAC3" +#endif + +#ifdef FORCE_NANO +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "NANO" +#define ARCHCONFIG "-DNANO " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ + "-DDTB_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3" +#define LIBNAME "nano" +#define CORENAME "NANO" +#endif + +#ifdef FORCE_POWER3 +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "POWER3" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DPOWER3 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=128 " \ + "-DL2_SIZE=2097152 -DL2_LINESIZE=128 " \ + "-DDTB_ENTRIES=256 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "power3" +#define CORENAME "POWER3" +#endif + +#ifdef FORCE_POWER4 +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "POWER4" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DPOWER4 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ + "-DL2_SIZE=1509949 -DL2_LINESIZE=128 " \ + "-DDTB_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=6 " +#define LIBNAME "power4" +#define CORENAME "POWER4" +#endif + +#ifdef FORCE_POWER5 +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "POWER5" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DPOWER5 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ + "-DL2_SIZE=1509949 -DL2_LINESIZE=128 " \ + "-DDTB_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=6 " +#define LIBNAME "power5" +#define CORENAME "POWER5" +#endif + +#ifdef FORCE_POWER6 +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "POWER6" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DPOWER6 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=128 " \ + "-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \ + "-DDTB_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "power6" +#define CORENAME "POWER6" +#endif + +#ifdef FORCE_PPCG4 +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "PPCG4" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DPPCG4 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ + "-DDTB_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "ppcg4" +#define CORENAME "PPCG4" +#endif + +#ifdef FORCE_PPC970 +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "PPC970" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DPPC970 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ + "-DL2_SIZE=512488 -DL2_LINESIZE=128 " \ + "-DDTB_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "ppc970" +#define CORENAME "PPC970" +#endif + +#ifdef FORCE_PPC970MP +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "PPC970" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DPPC970 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ + "-DL2_SIZE=1024976 -DL2_LINESIZE=128 " \ + "-DDTB_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "ppc970mp" +#define CORENAME "PPC970" +#endif + +#ifdef FORCE_PPC440 +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "PPC440" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DPPC440 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=16384 -DL2_LINESIZE=128 " \ + "-DDTB_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 " +#define LIBNAME "ppc440" +#define CORENAME "PPC440" +#endif + +#ifdef FORCE_PPC440FP2 +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "PPC440FP2" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DPPC440FP2 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=16384 -DL2_LINESIZE=128 " \ + "-DDTB_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 " +#define LIBNAME "ppc440FP2" +#define CORENAME "PPC440FP2" +#endif + +#ifdef FORCE_CELL +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "CELL" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DCELL " \ + "-DL1_DATA_SIZE=262144 -DL1_DATA_LINESIZE=128 " \ + "-DL2_SIZE=512488 -DL2_LINESIZE=128 " \ + "-DDTB_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "cell" +#define CORENAME "CELL" +#endif + +#ifdef FORCE_SICORTEX +#define FORCE +#define ARCHITECTURE "MIPS" +#define SUBARCHITECTURE "SICORTEX" +#define SUBDIRNAME "mips" +#define ARCHCONFIG "-DSICORTEX " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ + "-DDTB_ENTRIES=32 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "mips" +#define CORENAME "sicortex" +#endif + +#ifdef FORCE_ITANIUM2 +#define FORCE +#define ARCHITECTURE "IA64" +#define SUBARCHITECTURE "ITANIUM2" +#define SUBDIRNAME "ia64" +#define ARCHCONFIG "-DITANIUM2 " \ + "-DL1_DATA_SIZE=262144 -DL1_DATA_LINESIZE=128 " \ + "-DL2_SIZE=1572864 -DL2_LINESIZE=128 -DDTB_SIZE=16384 -DDTB_ENTRIES=128 " +#define LIBNAME "itanium2" +#define CORENAME "itanium2" +#endif + +#ifdef FORCE_SPARC +#define FORCE +#define ARCHITECTURE "SPARC" +#define SUBARCHITECTURE "SPARC" +#define SUBDIRNAME "sparc" +#define ARCHCONFIG "-DSPARC -DV9 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1572864 -DL2_LINESIZE=64 -DDTB_SIZE=8192 -DDTB_ENTRIES=64 " +#define LIBNAME "sparc" +#define CORENAME "sparc" +#endif + +#ifdef FORCE_SPARCV7 +#define FORCE +#define ARCHITECTURE "SPARC" +#define SUBARCHITECTURE "SPARC" +#define SUBDIRNAME "sparc" +#define ARCHCONFIG "-DSPARC -DV7 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1572864 -DL2_LINESIZE=64 -DDTB_SIZE=8192 -DDTB_ENTRIES=64 " +#define LIBNAME "sparcv7" +#define CORENAME "sparcv7" +#endif + +#ifdef FORCE_GENERIC +#define FORCE +#define ARCHITECTURE "GENERIC" +#define SUBARCHITECTURE "GENERIC" +#define SUBDIRNAME "generic" +#define ARCHCONFIG "-DGENERIC " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ + "-DL2_SIZE=512488 -DL2_LINESIZE=128 " \ + "-DDTB_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "generic" +#define CORENAME "generic" +#endif + +#ifndef FORCE + +#if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \ + defined(__PPC__) || defined(PPC) || defined(_POWER) || defined(__POWERPC__) +#ifndef POWER +#define POWER +#endif +#endif + +#if defined(__i386__) || (__x86_64__) +#include "cpuid_x86.c" +#endif + +#ifdef __ia64__ +#include "cpuid_ia64.c" +#endif + +#ifdef __alpha +#include "cpuid_alpha.c" +#endif + +#ifdef POWER +#include "cpuid_power.c" +#endif + +#ifdef sparc +#include "cpuid_sparc.c" +#endif + +#ifdef __mips__ +#include "cpuid_mips.c" +#endif + +#else + +#endif + +static int get_num_cores(void) { + +#ifdef OS_WINDOWS + SYSTEM_INFO sysinfo; +#elif defined(__FreeBSD__) || defined(__APPLE__) + int m[2], count; + size_t len; +#endif + +#ifdef linux + return get_nprocs(); + +#elif defined(OS_WINDOWS) + + GetSystemInfo(&sysinfo); + return sysinfo.dwNumberOfProcessors; + +#elif defined(__FreeBSD__) || defined(__APPLE__) + m[0] = CTL_HW; + m[1] = HW_NCPU; + len = sizeof(int); + sysctl(m, 2, &count, &len, NULL, 0); + + return count; +#else + return 2; +#endif +} + +int main(int argc, char *argv[]){ + +#ifdef FORCE + char buffer[8192], *p, *q; + int length; +#endif + + if (argc == 1) return 0; + + switch (argv[1][0]) { + + case '0' : /* for Makefile */ + +#ifdef FORCE + printf("CORE=%s\n", CORENAME); +#else +#if defined(__i386__) || defined(__x86_64__) || defined(POWER) + printf("CORE=%s\n", get_corename()); +#endif +#endif + +#ifdef FORCE + printf("LIBCORE=%s\n", LIBNAME); +#else + printf("LIBCORE="); + get_libname(); + printf("\n"); +#endif + + printf("NUM_CORES=%d\n", get_num_cores()); + +#if defined(__i386__) || defined(__x86_64__) +#ifndef FORCE + get_sse(); +#else + + sprintf(buffer, "%s", ARCHCONFIG); + + p = &buffer[0]; + + while (*p) { + if ((*p == '-') && (*(p + 1) == 'D')) { + p += 2; + + while ((*p != ' ') && (*p != '\0')) { + + if (*p == '=') { + printf("="); + p ++; + while ((*p != ' ') && (*p != '\0')) { + printf("%c", *p); + p ++; + } + } else { + printf("%c", *p); + p ++; + if ((*p == ' ') || (*p =='\0')) printf("=1"); + } + } + + printf("\n"); + } else p ++; + } +#endif +#endif + +#ifndef OS_WINDOWS + printf("MAKE += -j %d\n", get_num_cores()); +#endif + + break; + + case '1' : /* For config.h */ +#ifdef FORCE + sprintf(buffer, "%s -DCORE_%s\n", ARCHCONFIG, CORENAME); + + p = &buffer[0]; + while (*p) { + if ((*p == '-') && (*(p + 1) == 'D')) { + p += 2; + printf("#define "); + + while ((*p != ' ') && (*p != '\0')) { + + if (*p == '=') { + printf(" "); + p ++; + while ((*p != ' ') && (*p != '\0')) { + printf("%c", *p); + p ++; + } + } else { + printf("%c", *p); + p ++; + } + } + + printf("\n"); + } else p ++; + } +#else + get_cpuconfig(); +#endif + break; + + case '2' : /* SMP */ + if (get_num_cores() > 1) printf("SMP=1\n"); + break; + } + + fflush(stdout); + + return 0; +} + diff --git a/getarch_2nd.c b/getarch_2nd.c new file mode 100644 index 0000000..31babd2 --- /dev/null +++ b/getarch_2nd.c @@ -0,0 +1,36 @@ +#include +#ifndef BUILD_KERNEL +#include "config.h" +#else +#include "config_kernel.h" +#endif +#include "param.h" + +int main(int argc, char **argv) { + + if ((argc < 1) || (*argv[1] == '0')) { + printf("SGEMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M); + printf("SGEMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N); + printf("DGEMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M); + printf("DGEMM_UNROLL_N=%d\n", DGEMM_DEFAULT_UNROLL_N); + printf("QGEMM_UNROLL_M=%d\n", QGEMM_DEFAULT_UNROLL_M); + printf("QGEMM_UNROLL_N=%d\n", QGEMM_DEFAULT_UNROLL_N); + + printf("CGEMM_UNROLL_M=%d\n", CGEMM_DEFAULT_UNROLL_M); + printf("CGEMM_UNROLL_N=%d\n", CGEMM_DEFAULT_UNROLL_N); + printf("ZGEMM_UNROLL_M=%d\n", ZGEMM_DEFAULT_UNROLL_M); + printf("ZGEMM_UNROLL_N=%d\n", ZGEMM_DEFAULT_UNROLL_N); + printf("XGEMM_UNROLL_M=%d\n", XGEMM_DEFAULT_UNROLL_M); + printf("XGEMM_UNROLL_N=%d\n", XGEMM_DEFAULT_UNROLL_N); + } + + + if ((argc >= 1) && (*argv[1] == '1')) { + printf("#define SLOCAL_BUFFER_SIZE\t%ld\n", (SGEMM_DEFAULT_Q * SGEMM_DEFAULT_UNROLL_N * 4 * 1 * sizeof(float))); + printf("#define DLOCAL_BUFFER_SIZE\t%ld\n", (DGEMM_DEFAULT_Q * DGEMM_DEFAULT_UNROLL_N * 2 * 1 * sizeof(double))); + printf("#define CLOCAL_BUFFER_SIZE\t%ld\n", (CGEMM_DEFAULT_Q * CGEMM_DEFAULT_UNROLL_N * 4 * 2 * sizeof(float))); + printf("#define ZLOCAL_BUFFER_SIZE\t%ld\n", (ZGEMM_DEFAULT_Q * ZGEMM_DEFAULT_UNROLL_N * 2 * 2 * sizeof(double))); + } + + return 0; +} diff --git a/interface/._Makefile b/interface/._Makefile new file mode 100644 index 0000000..80e7eb5 Binary files /dev/null and b/interface/._Makefile differ diff --git a/interface/._asum.c b/interface/._asum.c new file mode 100644 index 0000000..99b3ab4 Binary files /dev/null and b/interface/._asum.c differ diff --git a/interface/._axpy.c b/interface/._axpy.c new file mode 100644 index 0000000..ec86e48 Binary files /dev/null and b/interface/._axpy.c differ diff --git a/interface/._copy.c b/interface/._copy.c new file mode 100644 index 0000000..2079147 Binary files /dev/null and b/interface/._copy.c differ diff --git a/interface/._create b/interface/._create new file mode 100755 index 0000000..6add011 Binary files /dev/null and b/interface/._create differ diff --git a/interface/._dot.c b/interface/._dot.c new file mode 100644 index 0000000..bdacedf Binary files /dev/null and b/interface/._dot.c differ diff --git a/interface/._dsdot.c b/interface/._dsdot.c new file mode 100644 index 0000000..05773b8 Binary files /dev/null and b/interface/._dsdot.c differ diff --git a/interface/._gbmv.c b/interface/._gbmv.c new file mode 100644 index 0000000..0d3145d Binary files /dev/null and b/interface/._gbmv.c differ diff --git a/interface/._gemm.c b/interface/._gemm.c new file mode 100644 index 0000000..43f4985 Binary files /dev/null and b/interface/._gemm.c differ diff --git a/interface/._gemv.c b/interface/._gemv.c new file mode 100644 index 0000000..a7ae65e Binary files /dev/null and b/interface/._gemv.c differ diff --git a/interface/._ger.c b/interface/._ger.c new file mode 100644 index 0000000..7eea6e5 Binary files /dev/null and b/interface/._ger.c differ diff --git a/interface/._gesv.c b/interface/._gesv.c new file mode 100644 index 0000000..f081385 Binary files /dev/null and b/interface/._gesv.c differ diff --git a/interface/._getf2.c b/interface/._getf2.c new file mode 100644 index 0000000..b432668 Binary files /dev/null and b/interface/._getf2.c differ diff --git a/interface/._getrf.c b/interface/._getrf.c new file mode 100644 index 0000000..369bb3d Binary files /dev/null and b/interface/._getrf.c differ diff --git a/interface/._getrs.c b/interface/._getrs.c new file mode 100644 index 0000000..c1e68bd Binary files /dev/null and b/interface/._getrs.c differ diff --git a/interface/._imax.c b/interface/._imax.c new file mode 100644 index 0000000..700cafe Binary files /dev/null and b/interface/._imax.c differ diff --git a/interface/._larf.c b/interface/._larf.c new file mode 100644 index 0000000..7133f00 Binary files /dev/null and b/interface/._larf.c differ diff --git a/interface/._laswp.c b/interface/._laswp.c new file mode 100644 index 0000000..224e22d Binary files /dev/null and b/interface/._laswp.c differ diff --git a/interface/._lauu2.c b/interface/._lauu2.c new file mode 100644 index 0000000..1181a50 Binary files /dev/null and b/interface/._lauu2.c differ diff --git a/interface/._lauum.c b/interface/._lauum.c new file mode 100644 index 0000000..02ba8ba Binary files /dev/null and b/interface/._lauum.c differ diff --git a/interface/._max.c b/interface/._max.c new file mode 100644 index 0000000..9f5cbd7 Binary files /dev/null and b/interface/._max.c differ diff --git a/interface/._nrm2.c b/interface/._nrm2.c new file mode 100644 index 0000000..f85ffbd Binary files /dev/null and b/interface/._nrm2.c differ diff --git a/interface/._potf2.c b/interface/._potf2.c new file mode 100644 index 0000000..9976ae3 Binary files /dev/null and b/interface/._potf2.c differ diff --git a/interface/._potrf.c b/interface/._potrf.c new file mode 100644 index 0000000..1877c80 Binary files /dev/null and b/interface/._potrf.c differ diff --git a/interface/._potri.c b/interface/._potri.c new file mode 100644 index 0000000..9c673eb Binary files /dev/null and b/interface/._potri.c differ diff --git a/interface/._rot.c b/interface/._rot.c new file mode 100644 index 0000000..d13949a Binary files /dev/null and b/interface/._rot.c differ diff --git a/interface/._rotg.c b/interface/._rotg.c new file mode 100644 index 0000000..bc1de74 Binary files /dev/null and b/interface/._rotg.c differ diff --git a/interface/._rotm.c b/interface/._rotm.c new file mode 100644 index 0000000..f9387d6 Binary files /dev/null and b/interface/._rotm.c differ diff --git a/interface/._rotmg.c b/interface/._rotmg.c new file mode 100644 index 0000000..e41e0b7 Binary files /dev/null and b/interface/._rotmg.c differ diff --git a/interface/._sbmv.c b/interface/._sbmv.c new file mode 100644 index 0000000..6ecefa0 Binary files /dev/null and b/interface/._sbmv.c differ diff --git a/interface/._scal.c b/interface/._scal.c new file mode 100644 index 0000000..b4a89c4 Binary files /dev/null and b/interface/._scal.c differ diff --git a/interface/._sdsdot.c b/interface/._sdsdot.c new file mode 100644 index 0000000..28f507d Binary files /dev/null and b/interface/._sdsdot.c differ diff --git a/interface/._spmv.c b/interface/._spmv.c new file mode 100644 index 0000000..80583d3 Binary files /dev/null and b/interface/._spmv.c differ diff --git a/interface/._spr.c b/interface/._spr.c new file mode 100644 index 0000000..beda234 Binary files /dev/null and b/interface/._spr.c differ diff --git a/interface/._spr2.c b/interface/._spr2.c new file mode 100644 index 0000000..c6d390d Binary files /dev/null and b/interface/._spr2.c differ diff --git a/interface/._swap.c b/interface/._swap.c new file mode 100644 index 0000000..48cf0d9 Binary files /dev/null and b/interface/._swap.c differ diff --git a/interface/._symm.c b/interface/._symm.c new file mode 100644 index 0000000..f15ae52 Binary files /dev/null and b/interface/._symm.c differ diff --git a/interface/._symv.c b/interface/._symv.c new file mode 100644 index 0000000..813848b Binary files /dev/null and b/interface/._symv.c differ diff --git a/interface/._syr.c b/interface/._syr.c new file mode 100644 index 0000000..f34d62c Binary files /dev/null and b/interface/._syr.c differ diff --git a/interface/._syr2.c b/interface/._syr2.c new file mode 100644 index 0000000..f7403af Binary files /dev/null and b/interface/._syr2.c differ diff --git a/interface/._syr2k.c b/interface/._syr2k.c new file mode 100644 index 0000000..54014af Binary files /dev/null and b/interface/._syr2k.c differ diff --git a/interface/._syrk.c b/interface/._syrk.c new file mode 100644 index 0000000..f47ca9b Binary files /dev/null and b/interface/._syrk.c differ diff --git a/interface/._tbmv.c b/interface/._tbmv.c new file mode 100644 index 0000000..26e41c0 Binary files /dev/null and b/interface/._tbmv.c differ diff --git a/interface/._tbsv.c b/interface/._tbsv.c new file mode 100644 index 0000000..43755a8 Binary files /dev/null and b/interface/._tbsv.c differ diff --git a/interface/._tpmv.c b/interface/._tpmv.c new file mode 100644 index 0000000..dac479b Binary files /dev/null and b/interface/._tpmv.c differ diff --git a/interface/._tpsv.c b/interface/._tpsv.c new file mode 100644 index 0000000..b6d8ae0 Binary files /dev/null and b/interface/._tpsv.c differ diff --git a/interface/._trmv.c b/interface/._trmv.c new file mode 100644 index 0000000..96aaa09 Binary files /dev/null and b/interface/._trmv.c differ diff --git a/interface/._trsm.c b/interface/._trsm.c new file mode 100644 index 0000000..c6cc6d2 Binary files /dev/null and b/interface/._trsm.c differ diff --git a/interface/._trsv.c b/interface/._trsv.c new file mode 100644 index 0000000..f31f355 Binary files /dev/null and b/interface/._trsv.c differ diff --git a/interface/._trti2.c b/interface/._trti2.c new file mode 100644 index 0000000..925bb3e Binary files /dev/null and b/interface/._trti2.c differ diff --git a/interface/._trtri.c b/interface/._trtri.c new file mode 100644 index 0000000..848c861 Binary files /dev/null and b/interface/._trtri.c differ diff --git a/interface/._zaxpy.c b/interface/._zaxpy.c new file mode 100644 index 0000000..04e67e2 Binary files /dev/null and b/interface/._zaxpy.c differ diff --git a/interface/._zdot.c b/interface/._zdot.c new file mode 100644 index 0000000..43e1b64 Binary files /dev/null and b/interface/._zdot.c differ diff --git a/interface/._zgbmv.c b/interface/._zgbmv.c new file mode 100644 index 0000000..2c80157 Binary files /dev/null and b/interface/._zgbmv.c differ diff --git a/interface/._zgemv.c b/interface/._zgemv.c new file mode 100644 index 0000000..9114b12 Binary files /dev/null and b/interface/._zgemv.c differ diff --git a/interface/._zger.c b/interface/._zger.c new file mode 100644 index 0000000..80264b5 Binary files /dev/null and b/interface/._zger.c differ diff --git a/interface/._zgetf2.c b/interface/._zgetf2.c new file mode 100644 index 0000000..4db4e06 Binary files /dev/null and b/interface/._zgetf2.c differ diff --git a/interface/._zgetrf.c b/interface/._zgetrf.c new file mode 100644 index 0000000..9d131b1 Binary files /dev/null and b/interface/._zgetrf.c differ diff --git a/interface/._zgetrs.c b/interface/._zgetrs.c new file mode 100644 index 0000000..fab8aa8 Binary files /dev/null and b/interface/._zgetrs.c differ diff --git a/interface/._zhbmv.c b/interface/._zhbmv.c new file mode 100644 index 0000000..f6dbf36 Binary files /dev/null and b/interface/._zhbmv.c differ diff --git a/interface/._zhemv.c b/interface/._zhemv.c new file mode 100644 index 0000000..a8a90bf Binary files /dev/null and b/interface/._zhemv.c differ diff --git a/interface/._zher.c b/interface/._zher.c new file mode 100644 index 0000000..1fce7e4 Binary files /dev/null and b/interface/._zher.c differ diff --git a/interface/._zher2.c b/interface/._zher2.c new file mode 100644 index 0000000..9de5c83 Binary files /dev/null and b/interface/._zher2.c differ diff --git a/interface/._zhpmv.c b/interface/._zhpmv.c new file mode 100644 index 0000000..7bbb297 Binary files /dev/null and b/interface/._zhpmv.c differ diff --git a/interface/._zhpr.c b/interface/._zhpr.c new file mode 100644 index 0000000..af3d2d3 Binary files /dev/null and b/interface/._zhpr.c differ diff --git a/interface/._zhpr2.c b/interface/._zhpr2.c new file mode 100644 index 0000000..893e2e0 Binary files /dev/null and b/interface/._zhpr2.c differ diff --git a/interface/._zlaswp.c b/interface/._zlaswp.c new file mode 100644 index 0000000..ad33210 Binary files /dev/null and b/interface/._zlaswp.c differ diff --git a/interface/._zlauu2.c b/interface/._zlauu2.c new file mode 100644 index 0000000..2845d19 Binary files /dev/null and b/interface/._zlauu2.c differ diff --git a/interface/._zlauum.c b/interface/._zlauum.c new file mode 100644 index 0000000..7d0e73f Binary files /dev/null and b/interface/._zlauum.c differ diff --git a/interface/._zpotf2.c b/interface/._zpotf2.c new file mode 100644 index 0000000..1bb6aee Binary files /dev/null and b/interface/._zpotf2.c differ diff --git a/interface/._zpotrf.c b/interface/._zpotrf.c new file mode 100644 index 0000000..3ab924b Binary files /dev/null and b/interface/._zpotrf.c differ diff --git a/interface/._zpotri.c b/interface/._zpotri.c new file mode 100644 index 0000000..646db60 Binary files /dev/null and b/interface/._zpotri.c differ diff --git a/interface/._zrot.c b/interface/._zrot.c new file mode 100644 index 0000000..2f06184 Binary files /dev/null and b/interface/._zrot.c differ diff --git a/interface/._zrotg.c b/interface/._zrotg.c new file mode 100644 index 0000000..a8b7fe0 Binary files /dev/null and b/interface/._zrotg.c differ diff --git a/interface/._zsbmv.c b/interface/._zsbmv.c new file mode 100644 index 0000000..8f27fd6 Binary files /dev/null and b/interface/._zsbmv.c differ diff --git a/interface/._zscal.c b/interface/._zscal.c new file mode 100644 index 0000000..53d63f7 Binary files /dev/null and b/interface/._zscal.c differ diff --git a/interface/._zspmv.c b/interface/._zspmv.c new file mode 100644 index 0000000..c5ef5a2 Binary files /dev/null and b/interface/._zspmv.c differ diff --git a/interface/._zspr.c b/interface/._zspr.c new file mode 100644 index 0000000..0550f29 Binary files /dev/null and b/interface/._zspr.c differ diff --git a/interface/._zspr2.c b/interface/._zspr2.c new file mode 100644 index 0000000..e088828 Binary files /dev/null and b/interface/._zspr2.c differ diff --git a/interface/._zswap.c b/interface/._zswap.c new file mode 100644 index 0000000..b8151fd Binary files /dev/null and b/interface/._zswap.c differ diff --git a/interface/._zsymv.c b/interface/._zsymv.c new file mode 100644 index 0000000..beb6acc Binary files /dev/null and b/interface/._zsymv.c differ diff --git a/interface/._zsyr.c b/interface/._zsyr.c new file mode 100644 index 0000000..cb9af33 Binary files /dev/null and b/interface/._zsyr.c differ diff --git a/interface/._zsyr2.c b/interface/._zsyr2.c new file mode 100644 index 0000000..cae04e2 Binary files /dev/null and b/interface/._zsyr2.c differ diff --git a/interface/._ztbmv.c b/interface/._ztbmv.c new file mode 100644 index 0000000..0244eb1 Binary files /dev/null and b/interface/._ztbmv.c differ diff --git a/interface/._ztbsv.c b/interface/._ztbsv.c new file mode 100644 index 0000000..d95cc85 Binary files /dev/null and b/interface/._ztbsv.c differ diff --git a/interface/._ztpmv.c b/interface/._ztpmv.c new file mode 100644 index 0000000..74192bc Binary files /dev/null and b/interface/._ztpmv.c differ diff --git a/interface/._ztpsv.c b/interface/._ztpsv.c new file mode 100644 index 0000000..72011d2 Binary files /dev/null and b/interface/._ztpsv.c differ diff --git a/interface/._ztrmv.c b/interface/._ztrmv.c new file mode 100644 index 0000000..730fa54 Binary files /dev/null and b/interface/._ztrmv.c differ diff --git a/interface/._ztrsv.c b/interface/._ztrsv.c new file mode 100644 index 0000000..5ea8a62 Binary files /dev/null and b/interface/._ztrsv.c differ diff --git a/interface/._ztrti2.c b/interface/._ztrti2.c new file mode 100644 index 0000000..acdd912 Binary files /dev/null and b/interface/._ztrti2.c differ diff --git a/interface/._ztrtri.c b/interface/._ztrtri.c new file mode 100644 index 0000000..1cc05ae Binary files /dev/null and b/interface/._ztrtri.c differ diff --git a/interface/Makefile b/interface/Makefile new file mode 100644 index 0000000..5bfc5f3 --- /dev/null +++ b/interface/Makefile @@ -0,0 +1,1942 @@ +TOPDIR = .. +include $(TOPDIR)/Makefile.system + +ifeq ($(ARCH), x86) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), x86_64) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), ia64) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), MIPS) +SUPPORT_GEMM3M = 1 +endif + +ifndef NO_FBLAS + +SBLAS1OBJS = \ + saxpy.$(SUFFIX) sswap.$(SUFFIX) \ + scopy.$(SUFFIX) sscal.$(SUFFIX) \ + sdot.$(SUFFIX) sdsdot.$(SUFFIX) dsdot.$(SUFFIX) \ + sasum.$(SUFFIX) snrm2.$(SUFFIX) \ + smax.$(SUFFIX) samax.$(SUFFIX) ismax.$(SUFFIX) isamax.$(SUFFIX) \ + smin.$(SUFFIX) samin.$(SUFFIX) ismin.$(SUFFIX) isamin.$(SUFFIX) \ + srot.$(SUFFIX) srotg.$(SUFFIX) srotm.$(SUFFIX) srotmg.$(SUFFIX) \ + +SBLAS2OBJS = \ + sgemv.$(SUFFIX) sger.$(SUFFIX) \ + strsv.$(SUFFIX) strmv.$(SUFFIX) ssymv.$(SUFFIX) \ + ssyr.$(SUFFIX) ssyr2.$(SUFFIX) sgbmv.$(SUFFIX) \ + ssbmv.$(SUFFIX) sspmv.$(SUFFIX) \ + sspr.$(SUFFIX) sspr2.$(SUFFIX) \ + stbsv.$(SUFFIX) stbmv.$(SUFFIX) \ + stpsv.$(SUFFIX) stpmv.$(SUFFIX) + +SBLAS3OBJS = \ + sgemm.$(SUFFIX) ssymm.$(SUFFIX) strmm.$(SUFFIX) \ + strsm.$(SUFFIX) ssyrk.$(SUFFIX) ssyr2k.$(SUFFIX) + +DBLAS1OBJS = \ + daxpy.$(SUFFIX) dswap.$(SUFFIX) \ + dcopy.$(SUFFIX) dscal.$(SUFFIX) \ + ddot.$(SUFFIX) \ + dasum.$(SUFFIX) dnrm2.$(SUFFIX) \ + dmax.$(SUFFIX) damax.$(SUFFIX) idmax.$(SUFFIX) idamax.$(SUFFIX) \ + dmin.$(SUFFIX) damin.$(SUFFIX) idmin.$(SUFFIX) idamin.$(SUFFIX) \ + drot.$(SUFFIX) drotg.$(SUFFIX) drotm.$(SUFFIX) drotmg.$(SUFFIX) \ + +DBLAS2OBJS = \ + dgemv.$(SUFFIX) dger.$(SUFFIX) \ + dtrsv.$(SUFFIX) dtrmv.$(SUFFIX) dsymv.$(SUFFIX) \ + dsyr.$(SUFFIX) dsyr2.$(SUFFIX) dgbmv.$(SUFFIX) \ + dsbmv.$(SUFFIX) dspmv.$(SUFFIX) \ + dspr.$(SUFFIX) dspr2.$(SUFFIX) \ + dtbsv.$(SUFFIX) dtbmv.$(SUFFIX) \ + dtpsv.$(SUFFIX) dtpmv.$(SUFFIX) + +DBLAS3OBJS = \ + dgemm.$(SUFFIX) dsymm.$(SUFFIX) dtrmm.$(SUFFIX) \ + dtrsm.$(SUFFIX) dsyrk.$(SUFFIX) dsyr2k.$(SUFFIX) + +CBLAS1OBJS = \ + caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \ + ccopy.$(SUFFIX) cscal.$(SUFFIX) csscal.$(SUFFIX) \ + cdotc.$(SUFFIX) cdotu.$(SUFFIX) \ + scasum.$(SUFFIX) scnrm2.$(SUFFIX) \ + scamax.$(SUFFIX) icamax.$(SUFFIX) \ + scamin.$(SUFFIX) icamin.$(SUFFIX) \ + csrot.$(SUFFIX) crotg.$(SUFFIX) \ + +CBLAS2OBJS = \ + cgemv.$(SUFFIX) cgeru.$(SUFFIX) cgerc.$(SUFFIX) \ + ctrsv.$(SUFFIX) ctrmv.$(SUFFIX) csymv.$(SUFFIX) \ + csyr.$(SUFFIX) csyr2.$(SUFFIX) cgbmv.$(SUFFIX) \ + csbmv.$(SUFFIX) cspmv.$(SUFFIX) \ + cspr.$(SUFFIX) cspr2.$(SUFFIX) \ + ctbsv.$(SUFFIX) ctbmv.$(SUFFIX) \ + ctpsv.$(SUFFIX) ctpmv.$(SUFFIX) \ + chemv.$(SUFFIX) chbmv.$(SUFFIX) \ + cher.$(SUFFIX) cher2.$(SUFFIX) \ + chpmv.$(SUFFIX) chpr.$(SUFFIX) chpr2.$(SUFFIX) + +CBLAS3OBJS = \ + cgemm.$(SUFFIX) csymm.$(SUFFIX) ctrmm.$(SUFFIX) \ + ctrsm.$(SUFFIX) csyrk.$(SUFFIX) csyr2k.$(SUFFIX) \ + chemm.$(SUFFIX) cherk.$(SUFFIX) cher2k.$(SUFFIX) + +ZBLAS1OBJS = \ + zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \ + zcopy.$(SUFFIX) zscal.$(SUFFIX) zdscal.$(SUFFIX) \ + zdotc.$(SUFFIX) zdotu.$(SUFFIX) \ + dzasum.$(SUFFIX) dznrm2.$(SUFFIX) \ + dzamax.$(SUFFIX) izamax.$(SUFFIX) \ + dzamin.$(SUFFIX) izamin.$(SUFFIX) \ + zdrot.$(SUFFIX) zrotg.$(SUFFIX) \ + +ZBLAS2OBJS = \ + zgemv.$(SUFFIX) zgeru.$(SUFFIX) zgerc.$(SUFFIX) \ + ztrsv.$(SUFFIX) ztrmv.$(SUFFIX) zsymv.$(SUFFIX) \ + zsyr.$(SUFFIX) zsyr2.$(SUFFIX) zgbmv.$(SUFFIX) \ + zsbmv.$(SUFFIX) zspmv.$(SUFFIX) \ + zspr.$(SUFFIX) zspr2.$(SUFFIX) \ + ztbsv.$(SUFFIX) ztbmv.$(SUFFIX) \ + ztpsv.$(SUFFIX) ztpmv.$(SUFFIX) \ + zhemv.$(SUFFIX) zhbmv.$(SUFFIX) \ + zher.$(SUFFIX) zher2.$(SUFFIX) \ + zhpmv.$(SUFFIX) zhpr.$(SUFFIX) zhpr2.$(SUFFIX) + +ZBLAS3OBJS = \ + zgemm.$(SUFFIX) zsymm.$(SUFFIX) ztrmm.$(SUFFIX) \ + ztrsm.$(SUFFIX) zsyrk.$(SUFFIX) zsyr2k.$(SUFFIX) \ + zhemm.$(SUFFIX) zherk.$(SUFFIX) zher2k.$(SUFFIX) + +ifdef SUPPORT_GEMM3M + +CBLAS3OBJS += cgemm3m.$(SUFFIX) csymm3m.$(SUFFIX) chemm3m.$(SUFFIX) + +ZBLAS3OBJS += zgemm3m.$(SUFFIX) zsymm3m.$(SUFFIX) zhemm3m.$(SUFFIX) + +endif + +ifdef EXPRECISION + +QBLAS1OBJS = \ + qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ + qcopy.$(SUFFIX) qscal.$(SUFFIX) \ + qdot.$(SUFFIX) \ + qasum.$(SUFFIX) qnrm2.$(SUFFIX) \ + qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ + qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ + qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ + +QBLAS2OBJS = \ + qgemv.$(SUFFIX) qger.$(SUFFIX) \ + qtrsv.$(SUFFIX) qtrmv.$(SUFFIX) qsymv.$(SUFFIX) \ + qsyr.$(SUFFIX) qsyr2.$(SUFFIX) qgbmv.$(SUFFIX) \ + qsbmv.$(SUFFIX) qspmv.$(SUFFIX) \ + qspr.$(SUFFIX) qspr2.$(SUFFIX) \ + qtbsv.$(SUFFIX) qtbmv.$(SUFFIX) \ + qtpsv.$(SUFFIX) qtpmv.$(SUFFIX) + +QBLAS3OBJS = \ + qgemm.$(SUFFIX) qsymm.$(SUFFIX) qtrmm.$(SUFFIX) \ + qtrsm.$(SUFFIX) qsyrk.$(SUFFIX) qsyr2k.$(SUFFIX) + +XBLAS1OBJS = \ + xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ + xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ + xdotc.$(SUFFIX) xdotu.$(SUFFIX) \ + qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \ + qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ + qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ + xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ + +XBLAS2OBJS = \ + xgemv.$(SUFFIX) xgeru.$(SUFFIX) xgerc.$(SUFFIX) \ + xtrsv.$(SUFFIX) xtrmv.$(SUFFIX) xsymv.$(SUFFIX) \ + xsyr.$(SUFFIX) xsyr2.$(SUFFIX) xgbmv.$(SUFFIX) \ + xsbmv.$(SUFFIX) xspmv.$(SUFFIX) \ + xspr.$(SUFFIX) xspr2.$(SUFFIX) \ + xtbsv.$(SUFFIX) xtbmv.$(SUFFIX) \ + xtpsv.$(SUFFIX) xtpmv.$(SUFFIX) \ + xhemv.$(SUFFIX) xhbmv.$(SUFFIX) \ + xher.$(SUFFIX) xher2.$(SUFFIX) \ + xhpmv.$(SUFFIX) xhpr.$(SUFFIX) xhpr2.$(SUFFIX) + +XBLAS3OBJS = \ + xgemm.$(SUFFIX) xsymm.$(SUFFIX) xtrmm.$(SUFFIX) \ + xtrsm.$(SUFFIX) xsyrk.$(SUFFIX) xsyr2k.$(SUFFIX) \ + xhemm.$(SUFFIX) xherk.$(SUFFIX) xher2k.$(SUFFIX) + +ifdef SUPPORT_GEMM3M + +XBLAS3OBJS += xgemm3m.$(SUFFIX) xsymm3m.$(SUFFIX) xhemm3m.$(SUFFIX) + +endif + +endif + +ifdef QUAD_PRECISION + +QBLAS1OBJS = \ + qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ + qcopy.$(SUFFIX) qscal.$(SUFFIX) \ + qasum.$(SUFFIX) qnrm2.$(SUFFIX) \ + qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ + qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ + qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ + +QBLAS2OBJS = \ + qgemv.$(SUFFIX) qger.$(SUFFIX) \ + qtrsv.$(SUFFIX) qtrmv.$(SUFFIX) qsymv.$(SUFFIX) \ + qsyr.$(SUFFIX) qsyr2.$(SUFFIX) qgbmv.$(SUFFIX) \ + qsbmv.$(SUFFIX) qspmv.$(SUFFIX) \ + qspr.$(SUFFIX) qspr2.$(SUFFIX) \ + qtbsv.$(SUFFIX) qtbmv.$(SUFFIX) \ + qtpsv.$(SUFFIX) qtpmv.$(SUFFIX) + +QBLAS3OBJS = \ + qgemm.$(SUFFIX) qsymm.$(SUFFIX) qtrmm.$(SUFFIX) \ + qtrsm.$(SUFFIX) qsyrk.$(SUFFIX) qsyr2k.$(SUFFIX) + +XBLAS1OBJS = \ + xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ + xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ + qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \ + qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ + qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ + xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ + +XBLAS2OBJS = \ + xgemv.$(SUFFIX) xgeru.$(SUFFIX) xgerc.$(SUFFIX) \ + xtrsv.$(SUFFIX) xtrmv.$(SUFFIX) xsymv.$(SUFFIX) \ + xsyr.$(SUFFIX) xsyr2.$(SUFFIX) xgbmv.$(SUFFIX) \ + xsbmv.$(SUFFIX) xspmv.$(SUFFIX) \ + xspr.$(SUFFIX) xspr2.$(SUFFIX) \ + xtbsv.$(SUFFIX) xtbmv.$(SUFFIX) \ + xtpsv.$(SUFFIX) xtpmv.$(SUFFIX) \ + xhemv.$(SUFFIX) xhbmv.$(SUFFIX) \ + xher.$(SUFFIX) xher2.$(SUFFIX) \ + xhpmv.$(SUFFIX) xhpr.$(SUFFIX) xhpr2.$(SUFFIX) + +XBLAS3OBJS = \ + xgemm.$(SUFFIX) xsymm.$(SUFFIX) xtrmm.$(SUFFIX) \ + xtrsm.$(SUFFIX) xsyrk.$(SUFFIX) xsyr2k.$(SUFFIX) \ + xhemm.$(SUFFIX) xherk.$(SUFFIX) xher2k.$(SUFFIX) + +ifdef SUPPORT_GEMM3M + +XBLAS3OBJS += xgemm3m.$(SUFFIX) xsymm3m.$(SUFFIX) xhemm3m.$(SUFFIX) + +endif +endif + +endif + +HPLOBJS = dgemm.$(SUFFIX) dtrsm.$(SUFFIX) \ + dgemv.$(SUFFIX) dtrsv.$(SUFFIX) dger.$(SUFFIX) \ + idamax.$(SUFFIX) daxpy.$(SUFFIX) dcopy.$(SUFFIX) dscal.$(SUFFIX) + +CSBLAS1OBJS = \ + cblas_isamax.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ + cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ + cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ + cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) + +CSBLAS2OBJS = \ + cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ + cblas_strsv.$(SUFFIX) cblas_ssyr.$(SUFFIX) cblas_ssyr2.$(SUFFIX) cblas_sgbmv.$(SUFFIX) \ + cblas_ssbmv.$(SUFFIX) cblas_sspmv.$(SUFFIX) cblas_sspr.$(SUFFIX) cblas_sspr2.$(SUFFIX) \ + cblas_stbmv.$(SUFFIX) cblas_stbsv.$(SUFFIX) cblas_stpmv.$(SUFFIX) cblas_stpsv.$(SUFFIX) + +CSBLAS3OBJS = \ + cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \ + cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) + +CDBLAS1OBJS = \ + cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ + cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ + cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ + cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) + +CDBLAS2OBJS = \ + cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ + cblas_dtrsv.$(SUFFIX) cblas_dsyr.$(SUFFIX) cblas_dsyr2.$(SUFFIX) cblas_dgbmv.$(SUFFIX) \ + cblas_dsbmv.$(SUFFIX) cblas_dspmv.$(SUFFIX) cblas_dspr.$(SUFFIX) cblas_dspr2.$(SUFFIX) \ + cblas_dtbmv.$(SUFFIX) cblas_dtbsv.$(SUFFIX) cblas_dtpmv.$(SUFFIX) cblas_dtpsv.$(SUFFIX) + +CDBLAS3OBJS += \ + cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \ + cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) + +CCBLAS1OBJS = \ + cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ + cblas_ccopy.$(SUFFIX) \ + cblas_cdotc.$(SUFFIX) cblas_cdotu.$(SUFFIX) \ + cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ + cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ + cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) + +CCBLAS2OBJS = \ + cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \ + cblas_cgbmv.$(SUFFIX) cblas_chbmv.$(SUFFIX) cblas_chemv.$(SUFFIX) \ + cblas_cher.$(SUFFIX) cblas_cher2.$(SUFFIX) cblas_chpmv.$(SUFFIX) \ + cblas_chpr.$(SUFFIX) cblas_chpr2.$(SUFFIX) cblas_ctbmv.$(SUFFIX) \ + cblas_ctbsv.$(SUFFIX) cblas_ctpmv.$(SUFFIX) cblas_ctpsv.$(SUFFIX) \ + cblas_ctrmv.$(SUFFIX) cblas_ctrsv.$(SUFFIX) + +CCBLAS3OBJS = \ + cblas_cgemm.$(SUFFIX) cblas_csymm.$(SUFFIX) cblas_ctrmm.$(SUFFIX) cblas_ctrsm.$(SUFFIX) \ + cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \ + cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) + +CZBLAS1OBJS = \ + cblas_izamax.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ + cblas_zcopy.$(SUFFIX) \ + cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ + cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ + cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ + cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) + +CZBLAS2OBJS = \ + cblas_zgemv.$(SUFFIX) cblas_zgerc.$(SUFFIX) cblas_zgeru.$(SUFFIX) \ + cblas_zgbmv.$(SUFFIX) cblas_zhbmv.$(SUFFIX) cblas_zhemv.$(SUFFIX) \ + cblas_zher.$(SUFFIX) cblas_zher2.$(SUFFIX) cblas_zhpmv.$(SUFFIX) \ + cblas_zhpr.$(SUFFIX) cblas_zhpr2.$(SUFFIX) cblas_ztbmv.$(SUFFIX) \ + cblas_ztbsv.$(SUFFIX) cblas_ztpmv.$(SUFFIX) cblas_ztpsv.$(SUFFIX) \ + cblas_ztrmv.$(SUFFIX) cblas_ztrsv.$(SUFFIX) + +CZBLAS3OBJS = \ + cblas_zgemm.$(SUFFIX) cblas_zsymm.$(SUFFIX) cblas_ztrmm.$(SUFFIX) cblas_ztrsm.$(SUFFIX) \ + cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \ + cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX) + +ifndef NO_CBLAS + +CFLAGS += -I. + +SBLAS1OBJS += $(CSBLAS1OBJS) +SBLAS2OBJS += $(CSBLAS2OBJS) +SBLAS3OBJS += $(CSBLAS3OBJS) +DBLAS1OBJS += $(CDBLAS1OBJS) +DBLAS2OBJS += $(CDBLAS2OBJS) +DBLAS3OBJS += $(CDBLAS3OBJS) +CBLAS1OBJS += $(CCBLAS1OBJS) +CBLAS2OBJS += $(CCBLAS2OBJS) +CBLAS3OBJS += $(CCBLAS3OBJS) +ZBLAS1OBJS += $(CZBLAS1OBJS) +ZBLAS2OBJS += $(CZBLAS2OBJS) +ZBLAS3OBJS += $(CZBLAS3OBJS) + +endif + +SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) +DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS) +QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS) +CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) +ZBLASOBJS = $(ZBLAS1OBJS) $(ZBLAS2OBJS) $(ZBLAS3OBJS) +XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS) + +SBLASOBJS += \ + sgetf2.$(SUFFIX) sgetrf.$(SUFFIX) slauu2.$(SUFFIX) slauum.$(SUFFIX) \ + spotf2.$(SUFFIX) spotrf.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) \ + slaswp.$(SUFFIX) sgetrs.$(SUFFIX) sgesv.$(SUFFIX) spotri.$(SUFFIX) \ + +DBLASOBJS += \ + dgetf2.$(SUFFIX) dgetrf.$(SUFFIX) dlauu2.$(SUFFIX) dlauum.$(SUFFIX) \ + dpotf2.$(SUFFIX) dpotrf.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) \ + dlaswp.$(SUFFIX) dgetrs.$(SUFFIX) dgesv.$(SUFFIX) dpotri.$(SUFFIX) \ + +QBLASOBJS += \ + qgetf2.$(SUFFIX) qgetrf.$(SUFFIX) qlauu2.$(SUFFIX) qlauum.$(SUFFIX) \ + qpotf2.$(SUFFIX) qpotrf.$(SUFFIX) qtrti2.$(SUFFIX) qtrtri.$(SUFFIX) \ + qlaswp.$(SUFFIX) qgetrs.$(SUFFIX) qgesv.$(SUFFIX) qpotri.$(SUFFIX) \ + +CBLASOBJS += \ + cgetf2.$(SUFFIX) cgetrf.$(SUFFIX) clauu2.$(SUFFIX) clauum.$(SUFFIX) \ + cpotf2.$(SUFFIX) cpotrf.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) \ + claswp.$(SUFFIX) cgetrs.$(SUFFIX) cgesv.$(SUFFIX) cpotri.$(SUFFIX) \ + +ZBLASOBJS += \ + zgetf2.$(SUFFIX) zgetrf.$(SUFFIX) zlauu2.$(SUFFIX) zlauum.$(SUFFIX) \ + zpotf2.$(SUFFIX) zpotrf.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) \ + zlaswp.$(SUFFIX) zgetrs.$(SUFFIX) zgesv.$(SUFFIX) zpotri.$(SUFFIX) \ + +XBLASOBJS += \ + xgetf2.$(SUFFIX) xgetrf.$(SUFFIX) xlauu2.$(SUFFIX) xlauum.$(SUFFIX) \ + xpotf2.$(SUFFIX) xpotrf.$(SUFFIX) xtrti2.$(SUFFIX) xtrtri.$(SUFFIX) \ + xlaswp.$(SUFFIX) xgetrs.$(SUFFIX) xgesv.$(SUFFIX) xpotri.$(SUFFIX) \ + + +FUNCOBJS = $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) + +ifdef EXPRECISION +FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) +endif + +ifdef QUAD_PRECISION +FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) +endif + +FUNCALLFILES = $(FUNCOBJS:.$(SUFFIX)=) + +include $(TOPDIR)/Makefile.tail + +all :: libs + +ifdef FUNCTION_PROFILE +$(BLASOBJS) $(BLASOBJS_P) : functable.h +$(BLASOBJS) $(BLASOBJS_P) : CFLAGS += -DPROFILE_FUNC_NAME=interface_$(*F) + +functable.h : Makefile + ./create $(FUNCALLFILES) > functable.h + +endif + +clean :: + @rm -f functable.h + +level1 : $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ + +level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ + +level3 : $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ + +$(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \ +$(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) : CFLAGS += -DCBLAS + +srot.$(SUFFIX) srot.$(PSUFFIX) : rot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +drot.$(SUFFIX) drot.$(PSUFFIX) : rot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qrot.$(SUFFIX) qrot.$(PSUFFIX) : rot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +csrot.$(SUFFIX) csrot.$(PSUFFIX) : zrot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +zdrot.$(SUFFIX) zdrot.$(PSUFFIX) : zrot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +xqrot.$(SUFFIX) xqrot.$(PSUFFIX) : zrot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +srotm.$(SUFFIX) srotm.$(PSUFFIX): rotm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +drotm.$(SUFFIX) drotm.$(PSUFFIX): rotm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qrotm.$(SUFFIX) qrotm.$(PSUFFIX): rotm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +srotmg.$(SUFFIX) srotmg.$(PSUFFIX): rotmg.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +drotmg.$(SUFFIX) drotmg.$(PSUFFIX): rotmg.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qrotmg.$(SUFFIX) qrotmg.$(PSUFFIX): rotmg.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +srotg.$(SUFFIX) srotg.$(PSUFFIX): rotg.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +drotg.$(SUFFIX) drotg.$(PSUFFIX): rotg.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qrotg.$(SUFFIX) qrotg.$(PSUFFIX): rotg.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +crotg.$(SUFFIX) crotg.$(PSUFFIX): zrotg.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zrotg.$(SUFFIX) zrotg.$(PSUFFIX): zrotg.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xrotg.$(SUFFIX) xrotg.$(PSUFFIX): zrotg.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +sasum.$(SUFFIX) sasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dasum.$(SUFFIX) dasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qasum.$(SUFFIX) qasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +scasum.$(SUFFIX) scasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dzasum.$(SUFFIX) dzasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qxasum.$(SUFFIX) qxasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +snrm2.$(SUFFIX) snrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dnrm2.$(SUFFIX) dnrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qnrm2.$(SUFFIX) qnrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +scnrm2.$(SUFFIX) scnrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dznrm2.$(SUFFIX) dznrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qxnrm2.$(SUFFIX) qxnrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +samax.$(SUFFIX) samax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +damax.$(SUFFIX) damax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +qamax.$(SUFFIX) qamax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +scamax.$(SUFFIX) scamax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +dzamax.$(SUFFIX) dzamax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +qxamax.$(SUFFIX) qxamax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +samin.$(SUFFIX) samin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +damin.$(SUFFIX) damin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +qamin.$(SUFFIX) qamin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +scamin.$(SUFFIX) scamin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +dzamin.$(SUFFIX) dzamin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +qxamin.$(SUFFIX) qxamin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +smax.$(SUFFIX) smax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +dmax.$(SUFFIX) dmax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +qmax.$(SUFFIX) qmax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +smin.$(SUFFIX) smin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +dmin.$(SUFFIX) dmin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +qmin.$(SUFFIX) qmin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +isamax.$(SUFFIX) isamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +idamax.$(SUFFIX) idamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +iqamax.$(SUFFIX) iqamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +icamax.$(SUFFIX) icamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +izamax.$(SUFFIX) izamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +ixamax.$(SUFFIX) ixamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +isamin.$(SUFFIX) isamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +idamin.$(SUFFIX) idamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +iqamin.$(SUFFIX) iqamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +icamin.$(SUFFIX) icamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +izamin.$(SUFFIX) izamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +ixamin.$(SUFFIX) ixamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +ismax.$(SUFFIX) ismax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +idmax.$(SUFFIX) idmax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +iqmax.$(SUFFIX) iqmax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +ismin.$(SUFFIX) ismin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +idmin.$(SUFFIX) idmin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +iqmin.$(SUFFIX) iqmin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +sdsdot.$(SUFFIX) sdsdot.$(PSUFFIX) : sdsdot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dsdot.$(SUFFIX) dsdot.$(PSUFFIX) : dsdot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +sdot.$(SUFFIX) sdot.$(PSUFFIX) : dot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +ddot.$(SUFFIX) ddot.$(PSUFFIX) : dot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qdot.$(SUFFIX) qdot.$(PSUFFIX) : dot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +cdotu.$(SUFFIX) cdotu.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -UCONJ $< -o $(@F) + +cdotc.$(SUFFIX) cdotc.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) + +zdotu.$(SUFFIX) zdotu.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -UCONJ $< -o $(@F) + +zdotc.$(SUFFIX) zdotc.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) + +xdotu.$(SUFFIX) xdotu.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -UCONJ $< -o $(@F) + +xdotc.$(SUFFIX) xdotc.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) + +saxpy.$(SUFFIX) saxpy.$(PSUFFIX) : axpy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +daxpy.$(SUFFIX) daxpy.$(PSUFFIX) : axpy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qaxpy.$(SUFFIX) qaxpy.$(PSUFFIX) : axpy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +caxpy.$(SUFFIX) caxpy.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +zaxpy.$(SUFFIX) zaxpy.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +xaxpy.$(SUFFIX) xaxpy.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +caxpyc.$(SUFFIX) caxpyc.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) + +zaxpyc.$(SUFFIX) zaxpyc.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) + +xaxpyc.$(SUFFIX) xaxpyc.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) + +sscal.$(SUFFIX) sscal.$(PSUFFIX) : scal.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dscal.$(SUFFIX) dscal.$(PSUFFIX) : scal.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qscal.$(SUFFIX) qscal.$(PSUFFIX) : scal.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +cscal.$(SUFFIX) cscal.$(PSUFFIX) : zscal.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +zscal.$(SUFFIX) zscal.$(PSUFFIX) : zscal.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +xscal.$(SUFFIX) xscal.$(PSUFFIX) : zscal.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +csscal.$(SUFFIX) csscal.$(PSUFFIX) : zscal.c + $(CC) $(CFLAGS) -c -DSSCAL $< -o $(@F) + +zdscal.$(SUFFIX) zdscal.$(PSUFFIX) : zscal.c + $(CC) $(CFLAGS) -c -DSSCAL $< -o $(@F) + +xqscal.$(SUFFIX) xqscal.$(PSUFFIX) : zscal.c + $(CC) $(CFLAGS) -c -DSSCAL $< -o $(@F) + +scopy.$(SUFFIX) scopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dcopy.$(SUFFIX) dcopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qcopy.$(SUFFIX) qcopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +ccopy.$(SUFFIX) ccopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +zcopy.$(SUFFIX) zcopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +xcopy.$(SUFFIX) xcopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +sswap.$(SUFFIX) sswap.$(PSUFFIX) : swap.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dswap.$(SUFFIX) dswap.$(PSUFFIX) : swap.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qswap.$(SUFFIX) qswap.$(PSUFFIX) : swap.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +cswap.$(SUFFIX) cswap.$(PSUFFIX) : zswap.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +zswap.$(SUFFIX) zswap.$(PSUFFIX) : zswap.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +xswap.$(SUFFIX) xswap.$(PSUFFIX) : zswap.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +sger.$(SUFFIX) sger.$(PSUFFIX) : ger.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dger.$(SUFFIX) dger.$(PSUFFIX) : ger.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qger.$(SUFFIX) qger.$(PSUFFIX) : ger.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgeru.$(SUFFIX) cgeru.$(PSUFFIX) : zger.c + $(CC) -c $(CFLAGS) -UCONJ $< -o $(@F) + +cgerc.$(SUFFIX) cgerc.$(PSUFFIX) : zger.c + $(CC) -c $(CFLAGS) -DCONJ $< -o $(@F) + +zgeru.$(SUFFIX) zgeru.$(PSUFFIX) : zger.c + $(CC) -c $(CFLAGS) -UCONJ $< -o $(@F) + +zgerc.$(SUFFIX) zgerc.$(PSUFFIX) : zger.c + $(CC) -c $(CFLAGS) -DCONJ $< -o $(@F) + +xgeru.$(SUFFIX) xgeru.$(PSUFFIX) : zger.c + $(CC) -c $(CFLAGS) -UCONJ $< -o $(@F) + +xgerc.$(SUFFIX) xgerc.$(PSUFFIX) : zger.c + $(CC) -c $(CFLAGS) -DCONJ $< -o $(@F) + +sgemv.$(SUFFIX) sgemv.$(PSUFFIX): gemv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +dgemv.$(SUFFIX) dgemv.$(PSUFFIX): gemv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +qgemv.$(SUFFIX) qgemv.$(PSUFFIX): gemv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +cgemv.$(SUFFIX) cgemv.$(PSUFFIX): zgemv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +zgemv.$(SUFFIX) zgemv.$(PSUFFIX): zgemv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +xgemv.$(SUFFIX) xgemv.$(PSUFFIX): zgemv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +strsv.$(SUFFIX) strsv.$(PSUFFIX) : trsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dtrsv.$(SUFFIX) dtrsv.$(PSUFFIX) : trsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qtrsv.$(SUFFIX) qtrsv.$(PSUFFIX) : trsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ctrsv.$(SUFFIX) ctrsv.$(PSUFFIX) : ztrsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ztrsv.$(SUFFIX) ztrsv.$(PSUFFIX) : ztrsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xtrsv.$(SUFFIX) xtrsv.$(PSUFFIX) : ztrsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +strmv.$(SUFFIX) strmv.$(PSUFFIX) : trmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dtrmv.$(SUFFIX) dtrmv.$(PSUFFIX) : trmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qtrmv.$(SUFFIX) qtrmv.$(PSUFFIX) : trmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ctrmv.$(SUFFIX) ctrmv.$(PSUFFIX) : ztrmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ztrmv.$(SUFFIX) ztrmv.$(PSUFFIX) : ztrmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xtrmv.$(SUFFIX) xtrmv.$(PSUFFIX) : ztrmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ssymv.$(SUFFIX) ssymv.$(PSUFFIX) : symv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dsymv.$(SUFFIX) dsymv.$(PSUFFIX) : symv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qsymv.$(SUFFIX) qsymv.$(PSUFFIX) : symv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +csymv.$(SUFFIX) csymv.$(PSUFFIX) : zsymv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zsymv.$(SUFFIX) zsymv.$(PSUFFIX) : zsymv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xsymv.$(SUFFIX) xsymv.$(PSUFFIX) : zsymv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ssyr.$(SUFFIX) ssyr.$(PSUFFIX) : syr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dsyr.$(SUFFIX) dsyr.$(PSUFFIX) : syr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qsyr.$(SUFFIX) qsyr.$(PSUFFIX) : syr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +csyr.$(SUFFIX) csyr.$(PSUFFIX) : zsyr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zsyr.$(SUFFIX) zsyr.$(PSUFFIX) : zsyr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xsyr.$(SUFFIX) xsyr.$(PSUFFIX) : zsyr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ssyr2.$(SUFFIX) ssyr2.$(PSUFFIX) : syr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dsyr2.$(SUFFIX) dsyr2.$(PSUFFIX) : syr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qsyr2.$(SUFFIX) qsyr2.$(PSUFFIX) : syr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +csyr2.$(SUFFIX) csyr2.$(PSUFFIX) : zsyr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zsyr2.$(SUFFIX) zsyr2.$(PSUFFIX) : zsyr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xsyr2.$(SUFFIX) xsyr2.$(PSUFFIX) : zsyr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +sgbmv.$(SUFFIX) sgbmv.$(PSUFFIX): gbmv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +dgbmv.$(SUFFIX) dgbmv.$(PSUFFIX): gbmv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +qgbmv.$(SUFFIX) qgbmv.$(PSUFFIX): gbmv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +cgbmv.$(SUFFIX) cgbmv.$(PSUFFIX): zgbmv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +zgbmv.$(SUFFIX) zgbmv.$(PSUFFIX): zgbmv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +xgbmv.$(SUFFIX) xgbmv.$(PSUFFIX): zgbmv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +ssbmv.$(SUFFIX) ssbmv.$(PSUFFIX) : sbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dsbmv.$(SUFFIX) dsbmv.$(PSUFFIX) : sbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qsbmv.$(SUFFIX) qsbmv.$(PSUFFIX) : sbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +csbmv.$(SUFFIX) csbmv.$(PSUFFIX) : zsbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zsbmv.$(SUFFIX) zsbmv.$(PSUFFIX) : zsbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xsbmv.$(SUFFIX) xsbmv.$(PSUFFIX) : zsbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +sspmv.$(SUFFIX) sspmv.$(PSUFFIX) : spmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dspmv.$(SUFFIX) dspmv.$(PSUFFIX) : spmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qspmv.$(SUFFIX) qspmv.$(PSUFFIX) : spmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cspmv.$(SUFFIX) cspmv.$(PSUFFIX) : zspmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zspmv.$(SUFFIX) zspmv.$(PSUFFIX) : zspmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xspmv.$(SUFFIX) xspmv.$(PSUFFIX) : zspmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +sspr.$(SUFFIX) sspr.$(PSUFFIX) : spr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dspr.$(SUFFIX) dspr.$(PSUFFIX) : spr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qspr.$(SUFFIX) qspr.$(PSUFFIX) : spr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cspr.$(SUFFIX) cspr.$(PSUFFIX) : zspr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zspr.$(SUFFIX) zspr.$(PSUFFIX) : zspr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xspr.$(SUFFIX) xspr.$(PSUFFIX) : zspr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +sspr2.$(SUFFIX) sspr2.$(PSUFFIX) : spr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dspr2.$(SUFFIX) dspr2.$(PSUFFIX) : spr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qspr2.$(SUFFIX) qspr2.$(PSUFFIX) : spr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cspr2.$(SUFFIX) cspr2.$(PSUFFIX) : zspr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zspr2.$(SUFFIX) zspr2.$(PSUFFIX) : zspr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xspr2.$(SUFFIX) xspr2.$(PSUFFIX) : zspr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +stbmv.$(SUFFIX) stbmv.$(PSUFFIX) : tbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dtbmv.$(SUFFIX) dtbmv.$(PSUFFIX) : tbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qtbmv.$(SUFFIX) qtbmv.$(PSUFFIX) : tbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ctbmv.$(SUFFIX) ctbmv.$(PSUFFIX) : ztbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ztbmv.$(SUFFIX) ztbmv.$(PSUFFIX) : ztbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xtbmv.$(SUFFIX) xtbmv.$(PSUFFIX) : ztbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +stbsv.$(SUFFIX) stbsv.$(PSUFFIX) : tbsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dtbsv.$(SUFFIX) dtbsv.$(PSUFFIX) : tbsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qtbsv.$(SUFFIX) qtbsv.$(PSUFFIX) : tbsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ctbsv.$(SUFFIX) ctbsv.$(PSUFFIX) : ztbsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ztbsv.$(SUFFIX) ztbsv.$(PSUFFIX) : ztbsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xtbsv.$(SUFFIX) xtbsv.$(PSUFFIX) : ztbsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +stpsv.$(SUFFIX) stpsv.$(PSUFFIX) : tpsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dtpsv.$(SUFFIX) dtpsv.$(PSUFFIX) : tpsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qtpsv.$(SUFFIX) qtpsv.$(PSUFFIX) : tpsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ctpsv.$(SUFFIX) ctpsv.$(PSUFFIX) : ztpsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ztpsv.$(SUFFIX) ztpsv.$(PSUFFIX) : ztpsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xtpsv.$(SUFFIX) xtpsv.$(PSUFFIX) : ztpsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +stpmv.$(SUFFIX) stpmv.$(PSUFFIX) : tpmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dtpmv.$(SUFFIX) dtpmv.$(PSUFFIX) : tpmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qtpmv.$(SUFFIX) qtpmv.$(PSUFFIX) : tpmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ctpmv.$(SUFFIX) ctpmv.$(PSUFFIX) : ztpmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ztpmv.$(SUFFIX) ztpmv.$(PSUFFIX) : ztpmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xtpmv.$(SUFFIX) xtpmv.$(PSUFFIX) : ztpmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +chemv.$(SUFFIX) chemv.$(PSUFFIX) : zhemv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zhemv.$(SUFFIX) zhemv.$(PSUFFIX) : zhemv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xhemv.$(SUFFIX) xhemv.$(PSUFFIX) : zhemv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +chbmv.$(SUFFIX) chbmv.$(PSUFFIX) : zhbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zhbmv.$(SUFFIX) zhbmv.$(PSUFFIX) : zhbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xhbmv.$(SUFFIX) xhbmv.$(PSUFFIX) : zhbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cher.$(SUFFIX) cher.$(PSUFFIX) : zher.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zher.$(SUFFIX) zher.$(PSUFFIX) : zher.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xher.$(SUFFIX) xher.$(PSUFFIX) : zher.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cher2.$(SUFFIX) cher2.$(PSUFFIX) : zher2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zher2.$(SUFFIX) zher2.$(PSUFFIX) : zher2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xher2.$(SUFFIX) xher2.$(PSUFFIX) : zher2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +chpmv.$(SUFFIX) chpmv.$(PSUFFIX) : zhpmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zhpmv.$(SUFFIX) zhpmv.$(PSUFFIX) : zhpmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xhpmv.$(SUFFIX) xhpmv.$(PSUFFIX) : zhpmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +chpr.$(SUFFIX) chpr.$(PSUFFIX) : zhpr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zhpr.$(SUFFIX) zhpr.$(PSUFFIX) : zhpr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xhpr.$(SUFFIX) xhpr.$(PSUFFIX) : zhpr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +chpr2.$(SUFFIX) chpr2.$(PSUFFIX) : zhpr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zhpr2.$(SUFFIX) zhpr2.$(PSUFFIX) : zhpr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +dgemm.$(SUFFIX) dgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +qgemm.$(SUFFIX) qgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgemm.$(SUFFIX) cgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +zgemm.$(SUFFIX) zgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +xgemm.$(SUFFIX) xgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +ssymm.$(SUFFIX) ssymm.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dsymm.$(SUFFIX) dsymm.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qsymm.$(SUFFIX) qsymm.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +csymm.$(SUFFIX) csymm.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zsymm.$(SUFFIX) zsymm.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xsymm.$(SUFFIX) xsymm.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +strmm.$(SUFFIX) strmm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) + +dtrmm.$(SUFFIX) dtrmm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) + +qtrmm.$(SUFFIX) qtrmm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) + +ctrmm.$(SUFFIX) ctrmm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) + +ztrmm.$(SUFFIX) ztrmm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) + +xtrmm.$(SUFFIX) xtrmm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) + +strsm.$(SUFFIX) strsm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dtrsm.$(SUFFIX) dtrsm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qtrsm.$(SUFFIX) qtrsm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ctrsm.$(SUFFIX) ctrsm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ztrsm.$(SUFFIX) ztrsm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xtrsm.$(SUFFIX) xtrsm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ssyrk.$(SUFFIX) ssyrk.$(PSUFFIX) : syrk.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dsyrk.$(SUFFIX) dsyrk.$(PSUFFIX) : syrk.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qsyrk.$(SUFFIX) qsyrk.$(PSUFFIX) : syrk.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +csyrk.$(SUFFIX) csyrk.$(PSUFFIX) : syrk.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zsyrk.$(SUFFIX) zsyrk.$(PSUFFIX) : syrk.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xsyrk.$(SUFFIX) xsyrk.$(PSUFFIX) : syrk.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ssyr2k.$(SUFFIX) ssyr2k.$(PSUFFIX) : syr2k.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dsyr2k.$(SUFFIX) dsyr2k.$(PSUFFIX) : syr2k.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qsyr2k.$(SUFFIX) qsyr2k.$(PSUFFIX) : syr2k.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +csyr2k.$(SUFFIX) csyr2k.$(PSUFFIX) : syr2k.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zsyr2k.$(SUFFIX) zsyr2k.$(PSUFFIX) : syr2k.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xsyr2k.$(SUFFIX) xsyr2k.$(PSUFFIX) : syr2k.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +chemm.$(SUFFIX) chemm.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) + +zhemm.$(SUFFIX) zhemm.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) + +xhemm.$(SUFFIX) xhemm.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) + +cherk.$(SUFFIX) cherk.$(PSUFFIX) : syrk.c + $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) + +zherk.$(SUFFIX) zherk.$(PSUFFIX) : syrk.c + $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) + +xherk.$(SUFFIX) xherk.$(PSUFFIX) : syrk.c + $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) + +cher2k.$(SUFFIX) cher2k.$(PSUFFIX) : syr2k.c + $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) + +zher2k.$(SUFFIX) zher2k.$(PSUFFIX) : syr2k.c + $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) + +xher2k.$(SUFFIX) xher2k.$(PSUFFIX) : syr2k.c + $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) + +cgemm3m.$(SUFFIX) cgemm3m.$(PSUFFIX) : gemm.c + $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) + +zgemm3m.$(SUFFIX) zgemm3m.$(PSUFFIX) : gemm.c + $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) + +xgemm3m.$(SUFFIX) xgemm3m.$(PSUFFIX) : gemm.c + $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) + +csymm3m.$(SUFFIX) csymm3m.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) + +zsymm3m.$(SUFFIX) zsymm3m.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) + +xsymm3m.$(SUFFIX) xsymm3m.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) + +chemm3m.$(SUFFIX) chemm3m.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) -DGEMM3M -DHEMM $< -o $(@F) + +zhemm3m.$(SUFFIX) zhemm3m.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) -DGEMM3M -DHEMM $< -o $(@F) + +xhemm3m.$(SUFFIX) xhemm3m.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) -DGEMM3M -DHEMM $< -o $(@F) + +cblas_isamax.$(SUFFIX) cblas_isamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_idamax.$(SUFFIX) cblas_idamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_icamax.$(SUFFIX) cblas_icamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_izamax.$(SUFFIX) cblas_izamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_ismax.$(SUFFIX) cblas_ismax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_idmax.$(SUFFIX) cblas_idmax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_ismin.$(SUFFIX) cblas_ismin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_idmin.$(SUFFIX) cblas_idmin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dasum.$(SUFFIX) cblas_dasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_scasum.$(SUFFIX) cblas_scasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dzasum.$(SUFFIX) cblas_dzasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_ddot.$(SUFFIX) cblas_ddot.$(PSUFFIX) : dot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_cdotu.$(SUFFIX) cblas_cdotu.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -DCBLAS -c -UCONJ $< -o $(@F) + +cblas_cdotc.$(SUFFIX) cblas_cdotc.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) + +cblas_zdotu.$(SUFFIX) cblas_zdotu.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -DCBLAS -c -UCONJ $< -o $(@F) + +cblas_zdotc.$(SUFFIX) cblas_zdotc.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) + +cblas_cdotu_sub.$(SUFFIX) cblas_cdotu_sub.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -DCBLAS -DFORCE_USE_STACK -c -UCONJ $< -o $(@F) + +cblas_cdotc_sub.$(SUFFIX) cblas_cdotc_sub.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -DCBLAS -DFORCE_USE_STACK -c -DCONJ $< -o $(@F) + +cblas_zdotu_sub.$(SUFFIX) cblas_zdotu_sub.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -DCBLAS -DFORCE_USE_STACK -c -UCONJ $< -o $(@F) + +cblas_zdotc_sub.$(SUFFIX) cblas_zdotc_sub.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -DCBLAS -DFORCE_USE_STACK -c -DCONJ $< -o $(@F) + +cblas_snrm2.$(SUFFIX) cblas_snrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dnrm2.$(SUFFIX) cblas_dnrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_scnrm2.$(SUFFIX) cblas_scnrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dznrm2.$(SUFFIX) cblas_dznrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_saxpy.$(SUFFIX) cblas_saxpy.$(PSUFFIX) : axpy.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_daxpy.$(SUFFIX) cblas_daxpy.$(PSUFFIX) : axpy.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_scopy.$(SUFFIX) cblas_scopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dcopy.$(SUFFIX) cblas_dcopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_ccopy.$(SUFFIX) cblas_ccopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_zcopy.$(SUFFIX) cblas_zcopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_sswap.$(SUFFIX) cblas_sswap.$(PSUFFIX) : swap.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dswap.$(SUFFIX) cblas_dswap.$(PSUFFIX) : swap.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_cswap.$(SUFFIX) cblas_cswap.$(PSUFFIX) : zswap.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_zswap.$(SUFFIX) cblas_zswap.$(PSUFFIX) : zswap.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_srot.$(SUFFIX) cblas_srot.$(PSUFFIX) : rot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_drot.$(SUFFIX) cblas_drot.$(PSUFFIX) : rot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_srotg.$(SUFFIX) cblas_srotg.$(PSUFFIX): rotg.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_drotg.$(SUFFIX) cblas_drotg.$(PSUFFIX): rotg.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_srotm.$(SUFFIX) cblas_srotm.$(PSUFFIX): rotm.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_drotm.$(SUFFIX) cblas_drotm.$(PSUFFIX): rotm.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_srotmg.$(SUFFIX) cblas_srotmg.$(PSUFFIX): rotmg.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_drotmg.$(SUFFIX) cblas_drotmg.$(PSUFFIX): rotmg.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_sscal.$(SUFFIX) cblas_sscal.$(PSUFFIX) : scal.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dscal.$(SUFFIX) cblas_dscal.$(PSUFFIX) : scal.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_cscal.$(SUFFIX) cblas_cscal.$(PSUFFIX) : zscal.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_zscal.$(SUFFIX) cblas_zscal.$(PSUFFIX) : zscal.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_csscal.$(SUFFIX) cblas_csscal.$(PSUFFIX) : zscal.c + $(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F) + +cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c + $(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F) + +cblas_sgemv.$(SUFFIX) cblas_sgemv.$(PSUFFIX): gemv.c + $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< + +cblas_dgemv.$(SUFFIX) cblas_dgemv.$(PSUFFIX): gemv.c + $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< + +cblas_cgemv.$(SUFFIX) cblas_cgemv.$(PSUFFIX): zgemv.c + $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< + +cblas_zgemv.$(SUFFIX) cblas_zgemv.$(PSUFFIX): zgemv.c + $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< + +cblas_sger.$(SUFFIX) cblas_sger.$(PSUFFIX) : ger.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dger.$(SUFFIX) cblas_dger.$(PSUFFIX) : ger.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_cgeru.$(SUFFIX) cblas_cgeru.$(PSUFFIX) : zger.c + $(CC) -DCBLAS -c $(CFLAGS) -UCONJ $< -o $(@F) + +cblas_cgerc.$(SUFFIX) cblas_cgerc.$(PSUFFIX) : zger.c + $(CC) -DCBLAS -c $(CFLAGS) -DCONJ $< -o $(@F) + +cblas_zgeru.$(SUFFIX) cblas_zgeru.$(PSUFFIX) : zger.c + $(CC) -DCBLAS -c $(CFLAGS) -UCONJ $< -o $(@F) + +cblas_zgerc.$(SUFFIX) cblas_zgerc.$(PSUFFIX) : zger.c + $(CC) -DCBLAS -c $(CFLAGS) -DCONJ $< -o $(@F) + +cblas_strsv.$(SUFFIX) cblas_strsv.$(PSUFFIX) : trsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dtrsv.$(SUFFIX) cblas_dtrsv.$(PSUFFIX) : trsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ctrsv.$(SUFFIX) cblas_ctrsv.$(PSUFFIX) : ztrsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ztrsv.$(SUFFIX) cblas_ztrsv.$(PSUFFIX) : ztrsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_strmv.$(SUFFIX) cblas_strmv.$(PSUFFIX) : trmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dtrmv.$(SUFFIX) cblas_dtrmv.$(PSUFFIX) : trmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ctrmv.$(SUFFIX) cblas_ctrmv.$(PSUFFIX) : ztrmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ztrmv.$(SUFFIX) cblas_ztrmv.$(PSUFFIX) : ztrmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ssyr.$(SUFFIX) cblas_ssyr.$(PSUFFIX) : syr.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dsyr.$(SUFFIX) cblas_dsyr.$(PSUFFIX) : syr.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_cher.$(SUFFIX) cblas_cher.$(PSUFFIX) : zher.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zher.$(SUFFIX) cblas_zher.$(PSUFFIX) : zher.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ssyr2.$(SUFFIX) cblas_ssyr2.$(PSUFFIX) : syr2.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dsyr2.$(SUFFIX) cblas_dsyr2.$(PSUFFIX) : syr2.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_cher2.$(SUFFIX) cblas_cher2.$(PSUFFIX) : zher2.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zher2.$(SUFFIX) cblas_zher2.$(PSUFFIX) : zher2.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_sgbmv.$(SUFFIX) cblas_sgbmv.$(PSUFFIX): gbmv.c + $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< + +cblas_dgbmv.$(SUFFIX) cblas_dgbmv.$(PSUFFIX): gbmv.c + $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< + +cblas_cgbmv.$(SUFFIX) cblas_cgbmv.$(PSUFFIX): zgbmv.c + $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< + +cblas_zgbmv.$(SUFFIX) cblas_zgbmv.$(PSUFFIX): zgbmv.c + $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< + +cblas_ssbmv.$(SUFFIX) cblas_ssbmv.$(PSUFFIX) : sbmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dsbmv.$(SUFFIX) cblas_dsbmv.$(PSUFFIX) : sbmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_chbmv.$(SUFFIX) cblas_chbmv.$(PSUFFIX) : zhbmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zhbmv.$(SUFFIX) cblas_zhbmv.$(PSUFFIX) : zhbmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_sspmv.$(SUFFIX) cblas_sspmv.$(PSUFFIX) : spmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dspmv.$(SUFFIX) cblas_dspmv.$(PSUFFIX) : spmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_sspr.$(SUFFIX) cblas_sspr.$(PSUFFIX) : spr.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dspr.$(SUFFIX) cblas_dspr.$(PSUFFIX) : spr.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_chpr.$(SUFFIX) cblas_chpr.$(PSUFFIX) : zhpr.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zhpr.$(SUFFIX) cblas_zhpr.$(PSUFFIX) : zhpr.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_sspr2.$(SUFFIX) cblas_sspr2.$(PSUFFIX) : spr2.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dspr2.$(SUFFIX) cblas_dspr2.$(PSUFFIX) : spr2.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_chpr2.$(SUFFIX) cblas_chpr2.$(PSUFFIX) : zhpr2.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zhpr2.$(SUFFIX) cblas_zhpr2.$(PSUFFIX) : zhpr2.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_stbmv.$(SUFFIX) cblas_stbmv.$(PSUFFIX) : tbmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dtbmv.$(SUFFIX) cblas_dtbmv.$(PSUFFIX) : tbmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ctbmv.$(SUFFIX) cblas_ctbmv.$(PSUFFIX) : ztbmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ztbmv.$(SUFFIX) cblas_ztbmv.$(PSUFFIX) : ztbmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_stbsv.$(SUFFIX) cblas_stbsv.$(PSUFFIX) : tbsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dtbsv.$(SUFFIX) cblas_dtbsv.$(PSUFFIX) : tbsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ctbsv.$(SUFFIX) cblas_ctbsv.$(PSUFFIX) : ztbsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ztbsv.$(SUFFIX) cblas_ztbsv.$(PSUFFIX) : ztbsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_stpmv.$(SUFFIX) cblas_stpmv.$(PSUFFIX) : tpmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dtpmv.$(SUFFIX) cblas_dtpmv.$(PSUFFIX) : tpmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ctpmv.$(SUFFIX) cblas_ctpmv.$(PSUFFIX) : ztpmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ztpmv.$(SUFFIX) cblas_ztpmv.$(PSUFFIX) : ztpmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_chpmv.$(SUFFIX) cblas_chpmv.$(PSUFFIX) : zhpmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zhpmv.$(SUFFIX) cblas_zhpmv.$(PSUFFIX) : zhpmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_stpsv.$(SUFFIX) cblas_stpsv.$(PSUFFIX) : tpsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dtpsv.$(SUFFIX) cblas_dtpsv.$(PSUFFIX) : tpsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ctpsv.$(SUFFIX) cblas_ctpsv.$(PSUFFIX) : ztpsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ztpsv.$(SUFFIX) cblas_ztpsv.$(PSUFFIX) : ztpsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ssymv.$(SUFFIX) cblas_ssymv.$(PSUFFIX) : symv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dsymv.$(SUFFIX) cblas_dsymv.$(PSUFFIX) : symv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_chemv.$(SUFFIX) cblas_chemv.$(PSUFFIX) : zhemv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zhemv.$(SUFFIX) cblas_zhemv.$(PSUFFIX) : zhemv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_sgemm.$(SUFFIX) cblas_sgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dgemm.$(SUFFIX) cblas_dgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_cgemm.$(SUFFIX) cblas_cgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zgemm.$(SUFFIX) cblas_zgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ssymm.$(SUFFIX) cblas_ssymm.$(PSUFFIX) : symm.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dsymm.$(SUFFIX) cblas_dsymm.$(PSUFFIX) : symm.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_csymm.$(SUFFIX) cblas_csymm.$(PSUFFIX) : symm.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zsymm.$(SUFFIX) cblas_zsymm.$(PSUFFIX) : symm.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ssyrk.$(SUFFIX) cblas_ssyrk.$(PSUFFIX) : syrk.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dsyrk.$(SUFFIX) cblas_dsyrk.$(PSUFFIX) : syrk.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_csyrk.$(SUFFIX) cblas_csyrk.$(PSUFFIX) : syrk.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zsyrk.$(SUFFIX) cblas_zsyrk.$(PSUFFIX) : syrk.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ssyr2k.$(SUFFIX) cblas_ssyr2k.$(PSUFFIX) : syr2k.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dsyr2k.$(SUFFIX) cblas_dsyr2k.$(PSUFFIX) : syr2k.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_csyr2k.$(SUFFIX) cblas_csyr2k.$(PSUFFIX) : syr2k.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zsyr2k.$(SUFFIX) cblas_zsyr2k.$(PSUFFIX) : syr2k.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_strmm.$(SUFFIX) cblas_strmm.$(PSUFFIX) : trsm.c + $(CC) -DCBLAS -c $(CFLAGS) -DTRMM $< -o $(@F) + +cblas_dtrmm.$(SUFFIX) cblas_dtrmm.$(PSUFFIX) : trsm.c + $(CC) -DCBLAS -c $(CFLAGS) -DTRMM $< -o $(@F) + +cblas_ctrmm.$(SUFFIX) cblas_ctrmm.$(PSUFFIX) : trsm.c + $(CC) -DCBLAS -c $(CFLAGS) -DTRMM $< -o $(@F) + +cblas_ztrmm.$(SUFFIX) cblas_ztrmm.$(PSUFFIX) : trsm.c + $(CC) -DCBLAS -c $(CFLAGS) -DTRMM $< -o $(@F) + +cblas_strsm.$(SUFFIX) cblas_strsm.$(PSUFFIX) : trsm.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dtrsm.$(SUFFIX) cblas_dtrsm.$(PSUFFIX) : trsm.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ctrsm.$(SUFFIX) cblas_ctrsm.$(PSUFFIX) : trsm.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ztrsm.$(SUFFIX) cblas_ztrsm.$(PSUFFIX) : trsm.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_chemm.$(SUFFIX) cblas_chemm.$(PSUFFIX) : symm.c + $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) + +cblas_zhemm.$(SUFFIX) cblas_zhemm.$(PSUFFIX) : symm.c + $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) + +cblas_cherk.$(SUFFIX) cblas_cherk.$(PSUFFIX) : syrk.c + $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) + +cblas_zherk.$(SUFFIX) cblas_zherk.$(PSUFFIX) : syrk.c + $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) + +cblas_cher2k.$(SUFFIX) cblas_cher2k.$(PSUFFIX) : syr2k.c + $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) + +cblas_zher2k.$(SUFFIX) cblas_zher2k.$(PSUFFIX) : syr2k.c + $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) + +sgetf2.$(SUFFIX) sgetf2.$(PSUFFIX) : getf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dgetf2.$(SUFFIX) dgetf2.$(PSUFFIX) : getf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qgetf2.$(SUFFIX) qgetf2.$(PSUFFIX) : getf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgetf2.$(SUFFIX) cgetf2.$(PSUFFIX) : zgetf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zgetf2.$(SUFFIX) zgetf2.$(PSUFFIX) : zgetf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xgetf2.$(SUFFIX) xgetf2.$(PSUFFIX) : zgetf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +sgetrf.$(SUFFIX) sgetrf.$(PSUFFIX) : getrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dgetrf.$(SUFFIX) dgetrf.$(PSUFFIX) : getrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qgetrf.$(SUFFIX) qgetrf.$(PSUFFIX) : getrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgetrf.$(SUFFIX) cgetrf.$(PSUFFIX) : zgetrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zgetrf.$(SUFFIX) zgetrf.$(PSUFFIX) : zgetrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xgetrf.$(SUFFIX) xgetrf.$(PSUFFIX) : zgetrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +slauu2.$(SUFFIX) slauu2.$(PSUFFIX) : lauu2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dlauu2.$(SUFFIX) dlauu2.$(PSUFFIX) : lauu2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qlauu2.$(SUFFIX) qlauu2.$(PSUFFIX) : lauu2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +clauu2.$(SUFFIX) clauu2.$(PSUFFIX) : zlauu2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zlauu2.$(SUFFIX) zlauu2.$(PSUFFIX) : zlauu2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xlauu2.$(SUFFIX) xlauu2.$(PSUFFIX) : zlauu2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +slauum.$(SUFFIX) slauum.$(PSUFFIX) : lauum.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dlauum.$(SUFFIX) dlauum.$(PSUFFIX) : lauum.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qlauum.$(SUFFIX) qlauum.$(PSUFFIX) : lauum.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +clauum.$(SUFFIX) clauum.$(PSUFFIX) : zlauum.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zlauum.$(SUFFIX) zlauum.$(PSUFFIX) : zlauum.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xlauum.$(SUFFIX) xlauum.$(PSUFFIX) : zlauum.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +spotf2.$(SUFFIX) spotf2.$(PSUFFIX) : potf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dpotf2.$(SUFFIX) dpotf2.$(PSUFFIX) : potf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qpotf2.$(SUFFIX) qpotf2.$(PSUFFIX) : potf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cpotf2.$(SUFFIX) cpotf2.$(PSUFFIX) : zpotf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zpotf2.$(SUFFIX) zpotf2.$(PSUFFIX) : zpotf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xpotf2.$(SUFFIX) xpotf2.$(PSUFFIX) : zpotf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +spotrf.$(SUFFIX) spotrf.$(PSUFFIX) : potrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dpotrf.$(SUFFIX) dpotrf.$(PSUFFIX) : potrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qpotrf.$(SUFFIX) qpotrf.$(PSUFFIX) : potrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cpotrf.$(SUFFIX) cpotrf.$(PSUFFIX) : zpotrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zpotrf.$(SUFFIX) zpotrf.$(PSUFFIX) : zpotrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xpotrf.$(SUFFIX) xpotrf.$(PSUFFIX) : zpotrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +strti2.$(SUFFIX) strti2.$(PSUFFIX) : trti2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dtrti2.$(SUFFIX) dtrti2.$(PSUFFIX) : trti2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qtrti2.$(SUFFIX) qtrti2.$(PSUFFIX) : trti2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ctrti2.$(SUFFIX) ctrti2.$(PSUFFIX) : ztrti2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ztrti2.$(SUFFIX) ztrti2.$(PSUFFIX) : ztrti2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xtrti2.$(SUFFIX) xtrti2.$(PSUFFIX) : ztrti2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +strtri.$(SUFFIX) strtri.$(PSUFFIX) : trtri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dtrtri.$(SUFFIX) dtrtri.$(PSUFFIX) : trtri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qtrtri.$(SUFFIX) qtrtri.$(PSUFFIX) : trtri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ctrtri.$(SUFFIX) ctrtri.$(PSUFFIX) : ztrtri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ztrtri.$(SUFFIX) ztrtri.$(PSUFFIX) : ztrtri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xtrtri.$(SUFFIX) xtrtri.$(PSUFFIX) : ztrtri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +slaswp.$(SUFFIX) slaswp.$(PSUFFIX) : laswp.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dlaswp.$(SUFFIX) dlaswp.$(PSUFFIX) : laswp.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qlaswp.$(SUFFIX) qlaswp.$(PSUFFIX) : laswp.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +claswp.$(SUFFIX) claswp.$(PSUFFIX) : zlaswp.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zlaswp.$(SUFFIX) zlaswp.$(PSUFFIX) : zlaswp.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xlaswp.$(SUFFIX) xlaswp.$(PSUFFIX) : zlaswp.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +sgetrs.$(SUFFIX) sgetrs.$(PSUFFIX) : getrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dgetrs.$(SUFFIX) dgetrs.$(PSUFFIX) : getrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qgetrs.$(SUFFIX) qgetrs.$(PSUFFIX) : getrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgetrs.$(SUFFIX) cgetrs.$(PSUFFIX) : zgetrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zgetrs.$(SUFFIX) zgetrs.$(PSUFFIX) : zgetrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xgetrs.$(SUFFIX) xgetrs.$(PSUFFIX) : zgetrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +sgesv.$(SUFFIX) sgesv.$(PSUFFIX) : gesv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dgesv.$(SUFFIX) dgesv.$(PSUFFIX) : gesv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qgesv.$(SUFFIX) qgesv.$(PSUFFIX) : gesv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgesv.$(SUFFIX) cgesv.$(PSUFFIX) : gesv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zgesv.$(SUFFIX) zgesv.$(PSUFFIX) : gesv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xgesv.$(SUFFIX) xgesv.$(PSUFFIX) : gesv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +spotri.$(SUFFIX) spotri.$(PSUFFIX) : potri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dpotri.$(SUFFIX) dpotri.$(PSUFFIX) : potri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qpotri.$(SUFFIX) qpotri.$(PSUFFIX) : potri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cpotri.$(SUFFIX) cpotri.$(PSUFFIX) : zpotri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zpotri.$(SUFFIX) zpotri.$(PSUFFIX) : zpotri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xpotri.$(SUFFIX) xpotri.$(PSUFFIX) : zpotri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +slarf.$(SUFFIX) slarf.$(PSUFFIX) : larf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dlarf.$(SUFFIX) dlarf.$(PSUFFIX) : larf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qlarf.$(SUFFIX) qlarf.$(PSUFFIX) : larf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +clarf.$(SUFFIX) clarf.$(PSUFFIX) : larf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zlarf.$(SUFFIX) zlarf.$(PSUFFIX) : larf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xlarf.$(SUFFIX) xlarf.$(PSUFFIX) : larf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + diff --git a/interface/asum.c b/interface/asum.c new file mode 100644 index 0000000..634836e --- /dev/null +++ b/interface/asum.c @@ -0,0 +1,93 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ + + BLASLONG n = *N; + BLASLONG incx = *INCX; + FLOATRET ret; + + PRINT_DEBUG_NAME; + + if (n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + ret = (FLOATRET)ASUM_K(n, x, incx); + + FUNCTION_PROFILE_END(COMPSIZE, n, n); + + IDEBUG_END; + + return ret; +} + +#else + +FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ + + FLOAT ret; + + PRINT_DEBUG_CNAME; + + if (n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + ret = ASUM_K(n, x, incx); + + FUNCTION_PROFILE_END(COMPSIZE, n, n); + + IDEBUG_END; + + return ret; +} + +#endif diff --git a/interface/axpy.c b/interface/axpy.c new file mode 100644 index 0000000..03b9819 --- /dev/null +++ b/interface/axpy.c @@ -0,0 +1,112 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ + + BLASLONG n = *N; + BLASLONG incx = *INCX; + BLASLONG incy = *INCY; + FLOAT alpha = *ALPHA; + +#else + +void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ + +#endif + +#ifdef SMP + int mode, nthreads; +#endif + +#ifndef CBLAS + PRINT_DEBUG_NAME; +#else + PRINT_DEBUG_CNAME; +#endif + + if (n <= 0) return; + + if (alpha == ZERO) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx; + if (incy < 0) y -= (n - 1) * incy; + +#ifdef SMP + nthreads = num_cpu_avail(1); + + if (nthreads == 1) { +#endif + + AXPYU_K(n, 0, 0, alpha, x, incx, y, incy, NULL, 0); + +#ifdef SMP + } else { + +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif + + blas_level1_thread(mode, n, 0, 0, &alpha, + x, incx, y, incy, NULL, 0, (void *)AXPYU_K, nthreads); + + } +#endif + + FUNCTION_PROFILE_END(1, 2 * n, 2 * n); + + IDEBUG_END; + + return; + +} diff --git a/interface/copy.c b/interface/copy.c new file mode 100644 index 0000000..6965682 --- /dev/null +++ b/interface/copy.c @@ -0,0 +1,80 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ + + BLASLONG n = *N; + BLASLONG incx = *INCX; + BLASLONG incy = *INCY; + + PRINT_DEBUG_NAME; + +#else + +void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ + + PRINT_DEBUG_CNAME; + +#endif + + if (n <= 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx * COMPSIZE; + if (incy < 0) y -= (n - 1) * incy * COMPSIZE; + + COPY_K(n, x, incx, y, incy); + + FUNCTION_PROFILE_END(COMPSIZE, COMPSIZE * n, 0); + + IDEBUG_END; + + return; + +} diff --git a/interface/create b/interface/create new file mode 100755 index 0000000..b7be8ab --- /dev/null +++ b/interface/create @@ -0,0 +1,22 @@ +#!/usr/bin/perl + +$count = 0; + +foreach (@ARGV) { + print "#define\tinterface_", $_, "\t\t", $count, "\n"; + $count ++; +} + +print "#ifdef USE_FUNCTABLE\n"; + +print "#define MAX_PROF_TABLE ", $count, "\n"; + +print "static char *func_table[] = {\n"; + +foreach (@ARGV) { + print "\"", $_, "\",\n"; +} + +print "};\n"; +print "#endif\n"; + diff --git a/interface/dot.c b/interface/dot.c new file mode 100644 index 0000000..3744db5 --- /dev/null +++ b/interface/dot.c @@ -0,0 +1,101 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ + + BLASLONG n = *N; + BLASLONG incx = *INCX; + BLASLONG incy = *INCY; + FLOATRET ret; + + PRINT_DEBUG_NAME; + + if (n <= 0) return 0.; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx; + if (incy < 0) y -= (n - 1) * incy; + + ret = (FLOATRET)DOTU_K(n, x, incx, y, incy); + + FUNCTION_PROFILE_END(1, 2 * n, 2 * n); + + IDEBUG_END; + + return ret; +} + +#else + +FLOAT CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ + + FLOAT ret; + + PRINT_DEBUG_CNAME; + + if (n <= 0) return 0.; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx; + if (incy < 0) y -= (n - 1) * incy; + + ret = DOTU_K(n, x, incx, y, incy); + + FUNCTION_PROFILE_END(1, 2 * n, 2 * n); + + IDEBUG_END; + + return ret; + +} + +#endif diff --git a/interface/dsdot.c b/interface/dsdot.c new file mode 100644 index 0000000..66f7917 --- /dev/null +++ b/interface/dsdot.c @@ -0,0 +1,99 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +double NAME(blasint *N, float *x, blasint *INCX, float *y, blasint *INCY){ + + BLASLONG n = *N; + BLASLONG incx = *INCX; + BLASLONG incy = *INCY; + + PRINT_DEBUG_NAME; + + if (n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx; + if (incy < 0) y -= (n - 1) * incy; + + return DSDOT_K(n, x, incx, y, incy); + + FUNCTION_PROFILE_END(1, n, n); + + IDEBUG_END; + + return 0; + +} + +#else + +double CNAME(blasint n, float *x, blasint incx, float *y, blasint incy){ + + PRINT_DEBUG_CNAME; + + if (n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx; + if (incy < 0) y -= (n - 1) * incy; + + return DSDOT_K(n, x, incx, y, incy); + + FUNCTION_PROFILE_END(1, n, n); + + IDEBUG_END; + + return 0; + +} + +#endif diff --git a/interface/gbmv.c b/interface/gbmv.c new file mode 100644 index 0000000..a76c48d --- /dev/null +++ b/interface/gbmv.c @@ -0,0 +1,252 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QGBMV " +#elif defined(DOUBLE) +#define ERROR_NAME "DGBMV " +#else +#define ERROR_NAME "SGBMV " +#endif + +static void (*gbmv[])(BLASLONG, BLASLONG, BLASLONG, BLASLONG, FLOAT, + FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + qgbmv_n, qgbmv_t, +#elif defined(DOUBLE) + dgbmv_n, dgbmv_t, +#else + sgbmv_n, sgbmv_t, +#endif +}; + +#ifdef SMP +static int (*gbmv_thread[])(BLASLONG, BLASLONG, BLASLONG, BLASLONG, FLOAT, + FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + qgbmv_thread_n, qgbmv_thread_t, +#elif defined(DOUBLE) + dgbmv_thread_n, dgbmv_thread_t, +#else + sgbmv_thread_n, sgbmv_thread_t, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *TRANS, blasint *M, blasint *N, + blasint *KU, blasint *KL, + FLOAT *ALPHA, FLOAT *a, blasint *LDA, + FLOAT *x, blasint *INCX, + FLOAT *BETA, FLOAT *y, blasint *INCY){ + + char trans = *TRANS; + blasint m = *M; + blasint n = *N; + blasint ku = *KU; + blasint kl = *KL; + blasint lda = *LDA; + blasint incx = *INCX; + blasint incy = *INCY; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + FLOAT alpha = *ALPHA; + FLOAT beta = *BETA; + + blasint info; + blasint lenx, leny; + blasint i; + + PRINT_DEBUG_NAME; + + TOUPPER(trans); + + info = 0; + + i = -1; + + if (trans == 'N') i = 0; + if (trans == 'T') i = 1; + if (trans == 'R') i = 0; + if (trans == 'C') i = 1; + + if (incy == 0) info = 13; + if (incx == 0) info = 10; + if (lda < kl + ku + 1) info = 8; + if (kl < 0) info = 5; + if (ku < 0) info = 4; + if (n < 0) info = 3; + if (m < 0) info = 2; + if (i < 0) info = 1; + + trans = i; + + if (info != 0){ + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + enum CBLAS_TRANSPOSE TransA, + blasint m, blasint n, + blasint ku, blasint kl, + FLOAT alpha, + FLOAT *a, blasint lda, + FLOAT *x, blasint incx, + FLOAT beta, + FLOAT *y, blasint incy){ + + FLOAT *buffer; + blasint lenx, leny, info, t; + int trans; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 0; + if (TransA == CblasConjTrans) trans = 1; + + info = -1; + + if (incy == 0) info = 13; + if (incx == 0) info = 10; + if (lda < kl + ku + 1) info = 8; + if (kl < 0) info = 5; + if (ku < 0) info = 4; + if (n < 0) info = 3; + if (m < 0) info = 2; + if (trans < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 1; + if (TransA == CblasConjTrans) trans = 0; + + info = -1; + + t = n; + n = m; + m = t; + + t = ku; + ku = kl; + kl = t; + + if (incy == 0) info = 13; + if (incx == 0) info = 10; + if (lda < kl + ku + 1) info = 8; + if (kl < 0) info = 5; + if (ku < 0) info = 4; + if (n < 0) info = 3; + if (m < 0) info = 2; + if (trans < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if ((m==0) || (n==0)) return; + + lenx = n; + leny = m; + if (trans) lenx = m; + if (trans) leny = n; + + if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); + + if (alpha == ZERO) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (lenx-1)*incx; + if (incy < 0) y -= (leny-1)*incy; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (gbmv[(int)trans])(m, n, kl, ku, alpha, a, lda, x, incx, y, incy, buffer); + +#ifdef SMP + } else { + + (gbmv_thread[(int)trans])(m, n, kl, ku, alpha, a, lda, x, incx, y, incy, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, m * n / 2 + n, m * n); + + IDEBUG_END; + + return; +} diff --git a/interface/gemm.c b/interface/gemm.c new file mode 100644 index 0000000..7919f82 --- /dev/null +++ b/interface/gemm.c @@ -0,0 +1,452 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef COMPLEX +#ifdef XDOUBLE +#define ERROR_NAME "QGEMM " +#elif defined(DOUBLE) +#define ERROR_NAME "DGEMM " +#else +#define ERROR_NAME "SGEMM " +#endif +#else +#ifndef GEMM3M +#ifdef XDOUBLE +#define ERROR_NAME "XGEMM " +#elif defined(DOUBLE) +#define ERROR_NAME "ZGEMM " +#else +#define ERROR_NAME "CGEMM " +#endif +#else +#ifdef XDOUBLE +#define ERROR_NAME "XGEMM3M " +#elif defined(DOUBLE) +#define ERROR_NAME "ZGEMM3M " +#else +#define ERROR_NAME "CGEMM3M " +#endif +#endif +#endif + +static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { +#ifndef GEMM3M + GEMM_NN, GEMM_TN, GEMM_RN, GEMM_CN, + GEMM_NT, GEMM_TT, GEMM_RT, GEMM_CT, + GEMM_NR, GEMM_TR, GEMM_RR, GEMM_CR, + GEMM_NC, GEMM_TC, GEMM_RC, GEMM_CC, +#if defined(SMP) && !defined(USE_SIMPLE_THREADED_LEVEL3) + GEMM_THREAD_NN, GEMM_THREAD_TN, GEMM_THREAD_RN, GEMM_THREAD_CN, + GEMM_THREAD_NT, GEMM_THREAD_TT, GEMM_THREAD_RT, GEMM_THREAD_CT, + GEMM_THREAD_NR, GEMM_THREAD_TR, GEMM_THREAD_RR, GEMM_THREAD_CR, + GEMM_THREAD_NC, GEMM_THREAD_TC, GEMM_THREAD_RC, GEMM_THREAD_CC, +#endif +#else + GEMM3M_NN, GEMM3M_TN, GEMM3M_RN, GEMM3M_CN, + GEMM3M_NT, GEMM3M_TT, GEMM3M_RT, GEMM3M_CT, + GEMM3M_NR, GEMM3M_TR, GEMM3M_RR, GEMM3M_CR, + GEMM3M_NC, GEMM3M_TC, GEMM3M_RC, GEMM3M_CC, +#if defined(SMP) && !defined(USE_SIMPLE_THREADED_LEVEL3) + GEMM3M_THREAD_NN, GEMM3M_THREAD_TN, GEMM3M_THREAD_RN, GEMM3M_THREAD_CN, + GEMM3M_THREAD_NT, GEMM3M_THREAD_TT, GEMM3M_THREAD_RT, GEMM3M_THREAD_CT, + GEMM3M_THREAD_NR, GEMM3M_THREAD_TR, GEMM3M_THREAD_RR, GEMM3M_THREAD_CR, + GEMM3M_THREAD_NC, GEMM3M_THREAD_TC, GEMM3M_THREAD_RC, GEMM3M_THREAD_CC, +#endif +#endif +}; + +#ifndef CBLAS + +void NAME(char *TRANSA, char *TRANSB, + blasint *M, blasint *N, blasint *K, + FLOAT *alpha, + FLOAT *a, blasint *ldA, + FLOAT *b, blasint *ldB, + FLOAT *beta, + FLOAT *c, blasint *ldC){ + + blas_arg_t args; + + int transa, transb, nrowa, nrowb; + blasint info; + + char transA, transB; + FLOAT *buffer; + FLOAT *sa, *sb; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + +#if defined(SMP) && !defined(NO_AFFINITY) && !defined(USE_SIMPLE_THREADED_LEVEL3) + int nodes; +#endif + + PRINT_DEBUG_NAME; + + args.m = *M; + args.n = *N; + args.k = *K; + + args.a = (void *)a; + args.b = (void *)b; + args.c = (void *)c; + + args.lda = *ldA; + args.ldb = *ldB; + args.ldc = *ldC; + + args.alpha = (void *)alpha; + args.beta = (void *)beta; + + transA = *TRANSA; + transB = *TRANSB; + + TOUPPER(transA); + TOUPPER(transB); + + transa = -1; + transb = -1; + + if (transA == 'N') transa = 0; + if (transA == 'T') transa = 1; +#ifndef COMPLEX + if (transA == 'R') transa = 0; + if (transA == 'C') transa = 1; +#else + if (transA == 'R') transa = 2; + if (transA == 'C') transa = 3; +#endif + + if (transB == 'N') transb = 0; + if (transB == 'T') transb = 1; +#ifndef COMPLEX + if (transB == 'R') transb = 0; + if (transB == 'C') transb = 1; +#else + if (transB == 'R') transb = 2; + if (transB == 'C') transb = 3; +#endif + + nrowa = args.m; + if (transa & 1) nrowa = args.k; + nrowb = args.k; + if (transb & 1) nrowb = args.n; + + info = 0; + + if (args.ldc < args.m) info = 13; + if (args.ldb < nrowb) info = 10; + if (args.lda < nrowa) info = 8; + if (args.k < 0) info = 5; + if (args.n < 0) info = 4; + if (args.m < 0) info = 3; + if (transb < 0) info = 2; + if (transa < 0) info = 1; + + if (info){ + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, + blasint m, blasint n, blasint k, +#ifndef COMPLEX + FLOAT alpha, +#else + FLOAT *alpha, +#endif + FLOAT *a, blasint lda, + FLOAT *b, blasint ldb, +#ifndef COMPLEX + FLOAT beta, +#else + FLOAT *beta, +#endif + FLOAT *c, blasint ldc) { + + blas_arg_t args; + int transa, transb; + blasint nrowa, nrowb, info; + + XFLOAT *buffer; + XFLOAT *sa, *sb; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + +#if defined(SMP) && !defined(NO_AFFINITY) && !defined(USE_SIMPLE_THREADED_LEVEL3) + int nodes; +#endif + + PRINT_DEBUG_CNAME; + +#ifndef COMPLEX + args.alpha = (void *)α + args.beta = (void *)β +#else + args.alpha = (void *)alpha; + args.beta = (void *)beta; +#endif + + transa = -1; + transb = -1; + info = 0; + + if (order == CblasColMajor) { + args.m = m; + args.n = n; + args.k = k; + + args.a = (void *)a; + args.b = (void *)b; + args.c = (void *)c; + + args.lda = lda; + args.ldb = ldb; + args.ldc = ldc; + + if (TransA == CblasNoTrans) transa = 0; + if (TransA == CblasTrans) transa = 1; +#ifndef COMPLEX + if (TransA == CblasConjNoTrans) transa = 0; + if (TransA == CblasConjTrans) transa = 1; +#else + if (TransA == CblasConjNoTrans) transa = 2; + if (TransA == CblasConjTrans) transa = 3; +#endif + if (TransB == CblasNoTrans) transb = 0; + if (TransB == CblasTrans) transb = 1; +#ifndef COMPLEX + if (TransB == CblasConjNoTrans) transb = 0; + if (TransB == CblasConjTrans) transb = 1; +#else + if (TransB == CblasConjNoTrans) transb = 2; + if (TransB == CblasConjTrans) transb = 3; +#endif + + nrowa = args.m; + if (transa & 1) nrowa = args.k; + nrowb = args.k; + if (transb & 1) nrowb = args.n; + + info = -1; + + if (args.ldc < args.m) info = 13; + if (args.ldb < nrowb) info = 10; + if (args.lda < nrowa) info = 8; + if (args.k < 0) info = 5; + if (args.n < 0) info = 4; + if (args.m < 0) info = 3; + if (transb < 0) info = 2; + if (transa < 0) info = 1; + } + + if (order == CblasRowMajor) { + args.m = n; + args.n = m; + args.k = k; + + args.a = (void *)b; + args.b = (void *)a; + args.c = (void *)c; + + args.lda = ldb; + args.ldb = lda; + args.ldc = ldc; + + if (TransB == CblasNoTrans) transa = 0; + if (TransB == CblasTrans) transa = 1; +#ifndef COMPLEX + if (TransB == CblasConjNoTrans) transa = 0; + if (TransB == CblasConjTrans) transa = 1; +#else + if (TransB == CblasConjNoTrans) transa = 2; + if (TransB == CblasConjTrans) transa = 3; +#endif + if (TransA == CblasNoTrans) transb = 0; + if (TransA == CblasTrans) transb = 1; +#ifndef COMPLEX + if (TransA == CblasConjNoTrans) transb = 0; + if (TransA == CblasConjTrans) transb = 1; +#else + if (TransA == CblasConjNoTrans) transb = 2; + if (TransA == CblasConjTrans) transb = 3; +#endif + + nrowa = args.m; + if (transa & 1) nrowa = args.k; + nrowb = args.k; + if (transb & 1) nrowb = args.n; + + info = -1; + + if (args.ldc < args.m) info = 13; + if (args.ldb < nrowb) info = 10; + if (args.lda < nrowa) info = 8; + if (args.k < 0) info = 5; + if (args.n < 0) info = 4; + if (args.m < 0) info = 3; + if (transb < 0) info = 2; + if (transa < 0) info = 1; + + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if ((args.m == 0) || (args.n == 0)) return; + +#if 0 + fprintf(stderr, "m = %4d n = %d k = %d lda = %4d ldb = %4d ldc = %4d\n", + args.m, args.n, args.k, args.lda, args.ldb, args.ldc); +#endif + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + buffer = (XFLOAT *)blas_memory_alloc(0); + + sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A); + sb = (XFLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + +#ifdef SMP + mode |= (transa << BLAS_TRANSA_SHIFT); + mode |= (transb << BLAS_TRANSB_SHIFT); + + args.common = NULL; + args.nthreads = num_cpu_avail(3); + + if (args.nthreads == 1) { +#endif + + (gemm[(transb << 2) | transa])(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + + } else { + +#ifndef USE_SIMPLE_THREADED_LEVEL3 + +#ifndef NO_AFFINITY + nodes = get_num_nodes(); + + if ((nodes > 1) && get_node_equal()) { + + args.nthreads /= nodes; + + gemm_thread_mn(mode, &args, NULL, NULL, gemm[16 | (transb << 2) | transa], sa, sb, nodes); + + } else { +#endif + + (gemm[16 | (transb << 2) | transa])(&args, NULL, NULL, sa, sb, 0); + +#else + + GEMM_THREAD(mode, &args, NULL, NULL, gemm[(transb << 2) | transa], sa, sb, args.nthreads); + +#endif + +#ifndef USE_SIMPLE_THREADED_LEVEL3 +#ifndef NO_AFFINITY + } +#endif +#endif + +#endif + +#ifdef SMP + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.k + args.k * args.n + args.m * args.n, 2 * args.m * args.n * args.k); + + IDEBUG_END; + + return; +} diff --git a/interface/gemv.c b/interface/gemv.c new file mode 100644 index 0000000..9ea8aa8 --- /dev/null +++ b/interface/gemv.c @@ -0,0 +1,237 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QGEMV " +#elif defined(DOUBLE) +#define ERROR_NAME "DGEMV " +#else +#define ERROR_NAME "SGEMV " +#endif + +#ifdef SMP +static int (*gemv_thread[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + qgemv_thread_n, qgemv_thread_t, +#elif defined DOUBLE + dgemv_thread_n, dgemv_thread_t, +#else + sgemv_thread_n, sgemv_thread_t, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *TRANS, blasint *M, blasint *N, + FLOAT *ALPHA, FLOAT *a, blasint *LDA, + FLOAT *x, blasint *INCX, + FLOAT *BETA, FLOAT *y, blasint *INCY){ + + char trans = *TRANS; + blasint m = *M; + blasint n = *N; + blasint lda = *LDA; + blasint incx = *INCX; + blasint incy = *INCY; + FLOAT alpha = *ALPHA; + FLOAT beta = *BETA; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { + GEMV_N, GEMV_T, + }; + + blasint info; + blasint lenx, leny; + blasint i; + + PRINT_DEBUG_NAME; + + TOUPPER(trans); + + info = 0; + + i = -1; + + if (trans == 'N') i = 0; + if (trans == 'T') i = 1; + if (trans == 'R') i = 0; + if (trans == 'C') i = 1; + + if (incy == 0) info = 11; + if (incx == 0) info = 8; + if (lda < MAX(1, m)) info = 6; + if (n < 0) info = 3; + if (m < 0) info = 2; + if (i < 0) info = 1; + + trans = i; + + if (info != 0){ + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + enum CBLAS_TRANSPOSE TransA, + blasint m, blasint n, + FLOAT alpha, + FLOAT *a, blasint lda, + FLOAT *x, blasint incx, + FLOAT beta, + FLOAT *y, blasint incy){ + + FLOAT *buffer; + blasint lenx, leny; + int trans; + blasint info, t; +#ifdef SMP + int nthreads; +#endif + + int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { + GEMV_N, GEMV_T, + }; + + PRINT_DEBUG_CNAME; + + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 0; + if (TransA == CblasConjTrans) trans = 1; + + info = -1; + + if (incy == 0) info = 11; + if (incx == 0) info = 8; + if (lda < MAX(1, m)) info = 6; + if (n < 0) info = 3; + if (m < 0) info = 2; + if (trans < 0) info = 1; + + } + + if (order == CblasRowMajor) { + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 1; + if (TransA == CblasConjTrans) trans = 0; + + info = -1; + + t = n; + n = m; + m = t; + + if (incy == 0) info = 11; + if (incx == 0) info = 8; + if (lda < MAX(1, m)) info = 6; + if (n < 0) info = 3; + if (m < 0) info = 2; + if (trans < 0) info = 1; + + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if ((m==0) || (n==0)) return; + + lenx = n; + leny = m; + if (trans) lenx = m; + if (trans) leny = n; + + if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); + + if (alpha == ZERO) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (lenx - 1) * incx; + if (incy < 0) y -= (leny - 1) * incy; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (gemv[(int)trans])(m, n, 0, alpha, a, lda, x, incx, y, incy, buffer); + +#ifdef SMP + } else { + + (gemv_thread[(int)trans])(m, n, alpha, a, lda, x, incx, y, incy, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); + + IDEBUG_END; + + return; + +} diff --git a/interface/ger.c b/interface/ger.c new file mode 100644 index 0000000..0218d94 --- /dev/null +++ b/interface/ger.c @@ -0,0 +1,193 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QGER " +#elif defined DOUBLE +#define ERROR_NAME "DGER " +#else +#define ERROR_NAME "SGER " +#endif + +#define GER GERU_K + +#if defined XDOUBLE +#define GER_THREAD qger_thread +#elif defined DOUBLE +#define GER_THREAD dger_thread +#else +#define GER_THREAD sger_thread +#endif + + +#ifndef CBLAS + +void NAME(blasint *M, blasint *N, FLOAT *Alpha, + FLOAT *x, blasint *INCX, + FLOAT *y, blasint *INCY, + FLOAT *a, blasint *LDA){ + + blasint m = *M; + blasint n = *N; + FLOAT alpha = *Alpha; + blasint incx = *INCX; + blasint incy = *INCY; + blasint lda = *LDA; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + blasint info; + + PRINT_DEBUG_NAME; + + info = 0; + + if (lda < MAX(1,m)) info = 9; + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (m < 0) info = 1; + + if (info){ + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + blasint m, blasint n, + FLOAT alpha, + FLOAT *x, blasint incx, + FLOAT *y, blasint incy, + FLOAT *a, blasint lda) { + + FLOAT *buffer; + blasint info, t; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + info = 0; + + if (order == CblasColMajor) { + info = -1; + + if (lda < MAX(1,m)) info = 9; + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (m < 0) info = 1; + } + + if (order == CblasRowMajor) { + info = -1; + + t = n; + n = m; + m = t; + + t = incx; + incx = incy; + incy = t; + + buffer = x; + x = y; + y = buffer; + + if (lda < MAX(1,m)) info = 9; + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (m < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + /* Quick return if possible. */ + if (m == 0 || n == 0) return; + if (alpha == 0.) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incy < 0) y -= (n - 1) * incy; + if (incx < 0) x -= (m - 1) * incx; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + GER(m, n, 0, alpha, x, incx, y, incy, a, lda, buffer); + +#ifdef SMP + } else { + + GER_THREAD(m, n, alpha, x, incx, y, incy, a, lda, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); + + IDEBUG_END; + + return; +} diff --git a/interface/gesv.c b/interface/gesv.c new file mode 100644 index 0000000..ce6bcbd --- /dev/null +++ b/interface/gesv.c @@ -0,0 +1,154 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef COMPLEX +#ifdef XDOUBLE +#define ERROR_NAME "QGESV " +#elif defined(DOUBLE) +#define ERROR_NAME "DGESV " +#else +#define ERROR_NAME "SGESV " +#endif +#else +#ifdef XDOUBLE +#define ERROR_NAME "XGESV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZGESV " +#else +#define ERROR_NAME "CGESV " +#endif +#endif + +int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv, + FLOAT *b, blasint *ldB, blasint *Info){ + + blas_arg_t args; + + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.m = *N; + args.n = *NRHS; + args.a = (void *)a; + args.lda = *ldA; + args.b = (void *)b; + args.ldb = *ldB; + args.c = (void *)ipiv; + + info = 0; + if (args.ldb < MAX(1,args.m)) info = 7; + if (args.lda < MAX(1,args.m)) info = 4; + if (args.n < 0) info = 2; + if (args.m < 0) info = 1; + + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + args.alpha = NULL; + args.beta = NULL; + + *Info = 0; + + if (args.m == 0 || args.n == 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + +#ifdef SMP + args.common = NULL; + args.nthreads = num_cpu_avail(4); + + if (args.nthreads == 1) { +#endif + + args.n = *N; + info = GETRF_SINGLE(&args, NULL, NULL, sa, sb, 0); + + if (info == 0){ + args.n = *NRHS; + GETRS_N_SINGLE(&args, NULL, NULL, sa, sb, 0); + } + +#ifdef SMP + } else { + + args.n = *N; + info = GETRF_PARALLEL(&args, NULL, NULL, sa, sb, 0); + + if (info == 0){ + args.n = *NRHS; + GETRS_N_PARALLEL(&args, NULL, NULL, sa, sb, 0); + } + } +#endif + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + *Info = info; + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, *N * *N, 2. / 3. * *N * *N * *N + *N * *N); + + IDEBUG_END; + + return 0; +} diff --git a/interface/getf2.c b/interface/getf2.c new file mode 100644 index 0000000..cae1595 --- /dev/null +++ b/interface/getf2.c @@ -0,0 +1,109 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QGETF2" +#elif defined(DOUBLE) +#define ERROR_NAME "DGETF2" +#else +#define ERROR_NAME "SGETF2" +#endif + +int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint *Info){ + + blas_arg_t args; + + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.m = *M; + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + args.c = (void *)ipiv; + + info = 0; + if (args.lda < MAX(1,args.m)) info = 4; + if (args.n < 0) info = 2; + if (args.m < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + if (args.m == 0 || args.n == 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + + info = GETF2(&args, NULL, NULL, sa, sb, 0); + + *Info = info; + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2. / 3. * args.m * args.n * args.n); + + IDEBUG_END; + + return 0; +} diff --git a/interface/getrf.c b/interface/getrf.c new file mode 100644 index 0000000..aa799e8 --- /dev/null +++ b/interface/getrf.c @@ -0,0 +1,121 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QGETRF" +#elif defined(DOUBLE) +#define ERROR_NAME "DGETRF" +#else +#define ERROR_NAME "SGETRF" +#endif + +int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint *Info){ + + blas_arg_t args; + + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.m = *M; + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + args.c = (void *)ipiv; + + info = 0; + if (args.lda < MAX(1,args.m)) info = 4; + if (args.n < 0) info = 2; + if (args.m < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + if (args.m == 0 || args.n == 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + +#ifdef SMP + args.common = NULL; + args.nthreads = num_cpu_avail(4); + + if (args.nthreads == 1) { +#endif + + *Info = GETRF_SINGLE(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + } else { + + *Info = GETRF_PARALLEL(&args, NULL, NULL, sa, sb, 0); + } +#endif + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2. / 3. * args.m * args.n * args.n); + + IDEBUG_END; + + return 0; +} diff --git a/interface/getrs.c b/interface/getrs.c new file mode 100644 index 0000000..761a001 --- /dev/null +++ b/interface/getrs.c @@ -0,0 +1,152 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QGETRS" +#elif defined(DOUBLE) +#define ERROR_NAME "DGETRS" +#else +#define ERROR_NAME "SGETRS" +#endif + +static blasint (*getrs_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { + GETRS_N_SINGLE, GETRS_T_SINGLE, +}; + +#ifdef SMP +static blasint (*getrs_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { + GETRS_N_PARALLEL, GETRS_T_PARALLEL, +}; +#endif + +int NAME(char *TRANS, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, + blasint *ipiv, FLOAT *b, blasint *ldB, blasint *Info){ + + char trans_arg = *TRANS; + + blas_arg_t args; + + blasint info; + int trans; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.m = *N; + args.n = *NRHS; + args.a = (void *)a; + args.lda = *ldA; + args.b = (void *)b; + args.ldb = *ldB; + args.c = (void *)ipiv; + + info = 0; + + TOUPPER(trans_arg); + trans = -1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 0; + if (trans_arg == 'C') trans = 1; + + if (args.ldb < MAX(1, args.m)) info = 8; + if (args.lda < MAX(1, args.m)) info = 5; + if (args.n < 0) info = 3; + if (args.m < 0) info = 2; + if (trans < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return 0; + } + + args.alpha = NULL; + args.beta = NULL; + + *Info = info; + + if (args.m == 0 || args.n == 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + +#ifdef SMP + args.common = NULL; + args.nthreads = num_cpu_avail(4); + + if (args.nthreads == 1) { +#endif + + (getrs_single[trans])(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + } else { + (getrs_parallel[trans])(&args, NULL, NULL, sa, sb, 0); + } +#endif + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2 * args.m * args.m * args.n); + + IDEBUG_END; + + return 0; + +} diff --git a/interface/imax.c b/interface/imax.c new file mode 100644 index 0000000..37396c7 --- /dev/null +++ b/interface/imax.c @@ -0,0 +1,171 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#undef MAX_K + +#ifdef USE_ABS + +#ifndef USE_MIN + +/* ABS & MAX */ +#ifndef COMPLEX +#ifdef XDOUBLE +#define MAX_K IQAMAX_K +#elif defined(DOUBLE) +#define MAX_K IDAMAX_K +#else +#define MAX_K ISAMAX_K +#endif +#else +#ifdef XDOUBLE +#define MAX_K IXAMAX_K +#elif defined(DOUBLE) +#define MAX_K IZAMAX_K +#else +#define MAX_K ICAMAX_K +#endif +#endif + +#else + +/* ABS & MIN */ +#ifndef COMPLEX +#ifdef XDOUBLE +#define MAX_K IQAMIN_K +#elif defined(DOUBLE) +#define MAX_K IDAMIN_K +#else +#define MAX_K ISAMIN_K +#endif +#else +#ifdef XDOUBLE +#define MAX_K IXAMIN_K +#elif defined(DOUBLE) +#define MAX_K IZAMIN_K +#else +#define MAX_K ICAMIN_K +#endif +#endif + +#endif + +#else + +#ifndef USE_MIN + +/* MAX */ +#ifdef XDOUBLE +#define MAX_K IQMAX_K +#elif defined(DOUBLE) +#define MAX_K IDMAX_K +#else +#define MAX_K ISMAX_K +#endif + +#else + +/* MIN */ +#ifdef XDOUBLE +#define MAX_K IQMIN_K +#elif defined(DOUBLE) +#define MAX_K IDMIN_K +#else +#define MAX_K ISMIN_K +#endif + +#endif + +#endif + +#ifndef CBLAS + +blasint NAME(blasint *N, FLOAT *x, blasint *INCX){ + + BLASLONG n = *N; + BLASLONG incx = *INCX; + blasint ret; + + PRINT_DEBUG_NAME; + + if (n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + ret = (blasint)MAX_K(n, x, incx); + + FUNCTION_PROFILE_END(COMPSIZE, n, 0); + + IDEBUG_END; + + return ret; +} + +#else + +CBLAS_INDEX CNAME(blasint n, FLOAT *x, blasint incx){ + + CBLAS_INDEX ret; + + PRINT_DEBUG_CNAME; + + if (n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + ret = MAX_K(n, x, incx); + + if (ret) ret --; + + FUNCTION_PROFILE_END(COMPSIZE, n, 0); + + IDEBUG_END; + + return ret; +} + +#endif diff --git a/interface/larf.c b/interface/larf.c new file mode 100644 index 0000000..3b538c4 --- /dev/null +++ b/interface/larf.c @@ -0,0 +1,109 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +static int (*larf[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { + LARF_L, LARF_R, +}; + +int NAME(char *SIDE, blasint *M, blasint *N, FLOAT *v, blasint *incV, FLOAT *tau, FLOAT *c, blasint *ldC, FLOAT *work){ + + blas_arg_t args; + + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + char side_arg = *SIDE; + int side; + + PRINT_DEBUG_NAME; + + TOUPPER(side_arg); + + args.m = *M; + args.n = *N; + args.a = (void *)v; + args.lda = *incV; + args.c = (void *)c; + args.ldc = *ldC; + + args.alpha = (void *)tau; + + side = -1; + if (side_arg == 'L') side = 0; + if (side_arg == 'R') side = 1; + + if (args.m == 0 || args.n == 0) return 0; + +#ifndef COMPLEX + if (*tau == ZERO) return 0; +#else + if ((*(tau + 0) == ZERO) && (*(tau + 1) == ZERO)) return 0; +#endif + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + + larf[side](&args, NULL, NULL, sa, sb, 0); + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2. / 3. * args.m * args.n * args.n); + + IDEBUG_END; + + return 0; +} diff --git a/interface/laswp.c b/interface/laswp.c new file mode 100644 index 0000000..026b515 --- /dev/null +++ b/interface/laswp.c @@ -0,0 +1,110 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +static int (*laswp[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, blasint *, BLASLONG) = { +#ifdef XDOUBLE + qlaswp_plus, qlaswp_minus, +#elif defined(DOUBLE) + dlaswp_plus, dlaswp_minus, +#else + slaswp_plus, slaswp_minus, +#endif +}; + +int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint *ipiv, blasint *INCX){ + + blasint n = *N; + blasint lda = *LDA; + blasint k1 = *K1; + blasint k2 = *K2; + blasint incx = *INCX; + int flag; + +#ifdef SMP + int mode, nthreads; + FLOAT dummyalpha[2] = {ZERO, ZERO}; +#endif + + PRINT_DEBUG_NAME; + + if (incx == 0 || n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + flag = (incx < 0); + +#ifdef SMP + nthreads = num_cpu_avail(1); + + if (nthreads == 1) { +#endif + + (laswp[flag])(n, k1, k2, ZERO, a, lda, NULL, 0, ipiv, incx); + +#ifdef SMP + } else { + +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif + + blas_level1_thread(mode, n, k1, k2, dummyalpha, + a, lda, NULL, 0, ipiv, incx, + laswp[flag], nthreads); + } +#endif + + FUNCTION_PROFILE_END(COMPSIZE, n * (k2 - k1), 0); + + IDEBUG_END; + + return 0; + +} diff --git a/interface/lauu2.c b/interface/lauu2.c new file mode 100644 index 0000000..14417e9 --- /dev/null +++ b/interface/lauu2.c @@ -0,0 +1,128 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QLAUU2" +#elif defined(DOUBLE) +#define ERROR_NAME "DLAUU2" +#else +#define ERROR_NAME "SLAUU2" +#endif + +static blasint (*lauu2[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { +#ifdef XDOUBLE + qlauu2_U, qlauu2_L, +#elif defined(DOUBLE) + dlauu2_U, dlauu2_L, +#else + slauu2_U, slauu2_L, +#endif + }; + +int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ + + blas_arg_t args; + + blasint uplo_arg = *UPLO; + blasint uplo; + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + + TOUPPER(uplo_arg); + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + if (args.lda < MAX(1,args.n)) info = 4; + if (args.n < 0) info = 2; + if (uplo < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + + if (args.n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + + info = (lauu2[uplo])(&args, NULL, NULL, sa, sb, 0); + + *Info = info; + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(1, .5 * args.n * args.n, + args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + + 1./6. * args.n * (args.n * args.n - 1)); + + IDEBUG_END; + + return 0; +} diff --git a/interface/lauum.c b/interface/lauum.c new file mode 100644 index 0000000..e5b593f --- /dev/null +++ b/interface/lauum.c @@ -0,0 +1,139 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QLAUUM" +#elif defined(DOUBLE) +#define ERROR_NAME "DLAUUM" +#else +#define ERROR_NAME "SLAUUM" +#endif + +static blasint (*lauum_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { + LAUUM_U_SINGLE, LAUUM_L_SINGLE, +}; + +#ifdef SMP +static blasint (*lauum_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { + LAUUM_U_PARALLEL, LAUUM_L_PARALLEL, +}; +#endif + +int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ + + blas_arg_t args; + + blasint uplo_arg = *UPLO; + blasint uplo; + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + + TOUPPER(uplo_arg); + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + if (args.lda < MAX(1,args.n)) info = 4; + if (args.n < 0) info = 2; + if (uplo < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + + if (args.n == 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + +#ifdef SMP + args.common = NULL; + args.nthreads = num_cpu_avail(4); + + if (args.nthreads == 1) { +#endif + + *Info = (lauum_single[uplo])(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + } else { + *Info = (lauum_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); + } +#endif + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(1, .5 * args.n * args.n, + args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + + 1./6. * args.n * (args.n * args.n - 1)); + + IDEBUG_END; + + return 0; +} diff --git a/interface/max.c b/interface/max.c new file mode 100644 index 0000000..9bedadd --- /dev/null +++ b/interface/max.c @@ -0,0 +1,169 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#undef MAX_K + +#ifdef USE_ABS + +#ifndef USE_MIN + +/* ABS & MAX */ +#ifndef COMPLEX +#ifdef XDOUBLE +#define MAX_K QAMAX_K +#elif defined(DOUBLE) +#define MAX_K DAMAX_K +#else +#define MAX_K SAMAX_K +#endif +#else +#ifdef XDOUBLE +#define MAX_K XAMAX_K +#elif defined(DOUBLE) +#define MAX_K ZAMAX_K +#else +#define MAX_K CAMAX_K +#endif +#endif + +#else + +/* ABS & MIN */ +#ifndef COMPLEX +#ifdef XDOUBLE +#define MAX_K QAMIN_K +#elif defined(DOUBLE) +#define MAX_K DAMIN_K +#else +#define MAX_K SAMIN_K +#endif +#else +#ifdef XDOUBLE +#define MAX_K XAMIN_K +#elif defined(DOUBLE) +#define MAX_K ZAMIN_K +#else +#define MAX_K CAMIN_K +#endif +#endif + +#endif + +#else + +#ifndef USE_MIN + +/* MAX */ +#ifdef XDOUBLE +#define MAX_K QMAX_K +#elif defined(DOUBLE) +#define MAX_K DMAX_K +#else +#define MAX_K SMAX_K +#endif + +#else + +/* MIN */ +#ifdef XDOUBLE +#define MAX_K QMIN_K +#elif defined(DOUBLE) +#define MAX_K DMIN_K +#else +#define MAX_K SMIN_K +#endif + +#endif + +#endif + +#ifndef CBLAS + +FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ + + BLASLONG n = *N; + BLASLONG incx = *INCX; + FLOATRET ret; + + PRINT_DEBUG_NAME; + + if (n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + ret = (FLOATRET)MAX_K(n, x, incx); + + FUNCTION_PROFILE_END(COMPSIZE, n, 0); + + IDEBUG_END; + + return ret; +} + +#else + +FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ + + FLOAT ret; + + PRINT_DEBUG_CNAME; + + if (n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + ret = MAX_K(n, x, incx); + + FUNCTION_PROFILE_END(COMPSIZE, n, 0); + + IDEBUG_END; + + return ret; +} + +#endif diff --git a/interface/nrm2.c b/interface/nrm2.c new file mode 100644 index 0000000..ff8ef6d --- /dev/null +++ b/interface/nrm2.c @@ -0,0 +1,93 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ + + BLASLONG n = *N; + BLASLONG incx = *INCX; + FLOATRET ret; + + PRINT_DEBUG_NAME; + + if (n <= 0) return 0.; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + ret = (FLOATRET)NRM2_K(n, x, incx); + + FUNCTION_PROFILE_END(COMPSIZE, n, 2 * n); + + IDEBUG_END; + + return ret; +} + +#else + +FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ + + FLOAT ret; + + PRINT_DEBUG_CNAME; + + if (n <= 0) return 0.; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + ret = NRM2_K(n, x, incx); + + FUNCTION_PROFILE_END(COMPSIZE, n, 2 * n); + + IDEBUG_END; + + return ret; +} + +#endif diff --git a/interface/potf2.c b/interface/potf2.c new file mode 100644 index 0000000..76822a4 --- /dev/null +++ b/interface/potf2.c @@ -0,0 +1,128 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QPOTF2" +#elif defined(DOUBLE) +#define ERROR_NAME "DPOTF2" +#else +#define ERROR_NAME "SPOTF2" +#endif + +static blasint (*potf2[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { +#ifdef XDOUBLE + qpotf2_U, qpotf2_L, +#elif defined(DOUBLE) + dpotf2_U, dpotf2_L, +#else + spotf2_U, spotf2_L, +#endif + }; + +int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ + + blas_arg_t args; + + blasint uplo_arg = *UPLO; + blasint uplo; + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + + TOUPPER(uplo_arg); + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + if (args.lda < MAX(1,args.n)) info = 4; + if (args.n < 0) info = 2; + if (uplo < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + + if (args.n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + + info = (potf2[uplo])(&args, NULL, NULL, sa, sb, 0); + + *Info = info; + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(1, .5 * args.n * args.n, + args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + + 1./6. * args.n * (args.n * args.n - 1)); + + IDEBUG_END; + + return 0; +} diff --git a/interface/potrf.c b/interface/potrf.c new file mode 100644 index 0000000..9a15012 --- /dev/null +++ b/interface/potrf.c @@ -0,0 +1,139 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QPOTRF" +#elif defined(DOUBLE) +#define ERROR_NAME "DPOTRF" +#else +#define ERROR_NAME "SPOTRF" +#endif + +static blasint (*potrf_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { + POTRF_U_SINGLE, POTRF_L_SINGLE, +}; + +#ifdef SMP +static blasint (*potrf_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { + POTRF_U_PARALLEL, POTRF_L_PARALLEL, +}; +#endif + +int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ + + blas_arg_t args; + + blasint uplo_arg = *UPLO; + blasint uplo; + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + + TOUPPER(uplo_arg); + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + if (args.lda < MAX(1,args.n)) info = 4; + if (args.n < 0) info = 2; + if (uplo < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + + if (args.n == 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + +#ifdef SMP + args.common = NULL; + args.nthreads = num_cpu_avail(4); + + if (args.nthreads == 1) { +#endif + + *Info = (potrf_single[uplo])(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + } else { + *Info = (potrf_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); + } +#endif + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(1, .5 * args.n * args.n, + args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + + 1./6. * args.n * (args.n * args.n - 1)); + + IDEBUG_END; + + return 0; +} diff --git a/interface/potri.c b/interface/potri.c new file mode 100644 index 0000000..a4f3322 --- /dev/null +++ b/interface/potri.c @@ -0,0 +1,160 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QPOTRI" +#elif defined(DOUBLE) +#define ERROR_NAME "DPOTRI" +#else +#define ERROR_NAME "SPOTRI" +#endif + +static blasint (*trtri_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + TRTRI_UN_SINGLE, TRTRI_LN_SINGLE, +}; + +static blasint (*lauum_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + LAUUM_U_SINGLE, LAUUM_L_SINGLE, +}; + +#ifdef SMP +static blasint (*trtri_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + TRTRI_UN_PARALLEL, TRTRI_LN_PARALLEL, +}; + +static blasint (*lauum_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + LAUUM_U_PARALLEL, LAUUM_L_PARALLEL, +}; +#endif + +int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ + + blas_arg_t args; + + blasint uplo_arg = *UPLO; + blasint uplo; + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + + TOUPPER(uplo_arg); + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + if (args.lda < MAX(1,args.n)) info = 4; + if (args.n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + + if (args.n == 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + +#ifdef SMP + args.common = NULL; + args.nthreads = num_cpu_avail(4); + + if (args.nthreads == 1) { +#endif + + info = (trtri_single[uplo])(&args, NULL, NULL, sa, sb, 0); + + if (!info) { + info = (lauum_single[uplo])(&args, NULL, NULL, sa, sb, 0); + } + + *Info = info; + +#ifdef SMP + } else { + info = (trtri_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); + + if (!info) { + info = (lauum_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); + } + + *Info = info; + } +#endif + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, .5 * args.n * args.n, + args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + + args.n * (1./3. + args.n * (-1./2. + args.n * 1./6.))); + + IDEBUG_END; + + return 0; +} diff --git a/interface/rot.c b/interface/rot.c new file mode 100644 index 0000000..2e458b1 --- /dev/null +++ b/interface/rot.c @@ -0,0 +1,82 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *C, FLOAT *S){ + + BLASLONG n = *N; + BLASLONG incx = *INCX; + BLASLONG incy = *INCY; + FLOAT c = *C; + FLOAT s = *S; + + PRINT_DEBUG_NAME; + +#else + +void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT c, FLOAT s){ + + PRINT_DEBUG_CNAME; + +#endif + + if (n <= 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx; + if (incy < 0) y -= (n - 1) * incy; + + ROT_K(n, x, incx, y, incy, c, s); + + FUNCTION_PROFILE_END(1, n, n); + + IDEBUG_END; + + return; + +} diff --git a/interface/rotg.c b/interface/rotg.c new file mode 100644 index 0000000..49088ab --- /dev/null +++ b/interface/rotg.c @@ -0,0 +1,109 @@ +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ + +#else + +void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ + +#endif + + +#if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) + + long double da = *DA; + long double db = *DB; + long double c; + long double s; + long double r, roe, z; + + long double ada = fabs(da); + long double adb = fabs(db); + long double scale = ada + adb; + +#ifndef CBLAS + PRINT_DEBUG_NAME; +#else + PRINT_DEBUG_CNAME; +#endif + + roe = db; + if (ada > adb) roe = da; + + if (scale == ZERO) { + *C = ONE; + *S = ZERO; + *DA = ZERO; + *DB = ZERO; + } else { + r = sqrt(da * da + db * db); + if (roe < 0) r = -r; + c = da / r; + s = db / r; + z = ONE; + if (da != ZERO) { + if (ada > adb){ + z = s; + } else { + z = ONE / c; + } + } + + *C = c; + *S = s; + *DA = r; + *DB = z; + } + +#else + FLOAT da = *DA; + FLOAT db = *DB; + FLOAT c = *C; + FLOAT s = *S; + FLOAT r, roe, z; + + FLOAT ada = fabs(da); + FLOAT adb = fabs(db); + FLOAT scale = ada + adb; + +#ifndef CBLAS + PRINT_DEBUG_NAME; +#else + PRINT_DEBUG_CNAME; +#endif + + roe = db; + if (ada > adb) roe = da; + + if (scale == ZERO) { + *C = ONE; + *S = ZERO; + *DA = ZERO; + *DB = ZERO; + } else { + FLOAT aa = da / scale; + FLOAT bb = db / scale; + + r = scale * sqrt(aa * aa + bb * bb); + if (roe < 0) r = -r; + c = da / r; + s = db / r; + z = ONE; + if (ada > adb) z = s; + if ((ada < adb) && (c != ZERO)) z = ONE / c; + + *C = c; + *S = s; + *DA = r; + *DB = z; + } +#endif + + return; +} diff --git a/interface/rotm.c b/interface/rotm.c new file mode 100644 index 0000000..4f026c7 --- /dev/null +++ b/interface/rotm.c @@ -0,0 +1,155 @@ +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +void NAME(blasint *N, FLOAT *dx, blasint *INCX, FLOAT *dy, blasint *INCY, FLOAT *dparam){ + + blasint n = *N; + blasint incx = *INCX; + blasint incy = *INCY; + +#else + +void CNAME(blasint n, FLOAT *dx, blasint incx, FLOAT *dy, blasint incy, FLOAT *dparam){ + +#endif + + blasint i__1, i__2; + + blasint i__; + FLOAT w, z__; + blasint kx, ky; + FLOAT dh11, dh12, dh22, dh21, dflag; + blasint nsteps; + +#ifndef CBLAS + PRINT_DEBUG_CNAME; +#else + PRINT_DEBUG_CNAME; +#endif + + --dparam; + --dy; + --dx; + + dflag = dparam[1]; + if (n <= 0 || dflag == - 2.0) goto L140; + + if (! (incx == incy && incx > 0)) goto L70; + + nsteps = n * incx; + if (dflag < 0.) { + goto L50; + } else if (dflag == 0) { + goto L10; + } else { + goto L30; + } +L10: + dh12 = dparam[4]; + dh21 = dparam[3]; + i__1 = nsteps; + i__2 = incx; + for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { + w = dx[i__]; + z__ = dy[i__]; + dx[i__] = w + z__ * dh12; + dy[i__] = w * dh21 + z__; +/* L20: */ + } + goto L140; +L30: + dh11 = dparam[2]; + dh22 = dparam[5]; + i__2 = nsteps; + i__1 = incx; + for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) { + w = dx[i__]; + z__ = dy[i__]; + dx[i__] = w * dh11 + z__; + dy[i__] = -w + dh22 * z__; +/* L40: */ + } + goto L140; +L50: + dh11 = dparam[2]; + dh12 = dparam[4]; + dh21 = dparam[3]; + dh22 = dparam[5]; + i__1 = nsteps; + i__2 = incx; + for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { + w = dx[i__]; + z__ = dy[i__]; + dx[i__] = w * dh11 + z__ * dh12; + dy[i__] = w * dh21 + z__ * dh22; +/* L60: */ + } + goto L140; +L70: + kx = 1; + ky = 1; + if (incx < 0) { + kx = (1 - n) * incx + 1; + } + if (incy < 0) { + ky = (1 - n) * incy + 1; + } + + if (dflag < 0.) { + goto L120; + } else if (dflag == 0) { + goto L80; + } else { + goto L100; + } +L80: + dh12 = dparam[4]; + dh21 = dparam[3]; + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = dx[kx]; + z__ = dy[ky]; + dx[kx] = w + z__ * dh12; + dy[ky] = w * dh21 + z__; + kx += incx; + ky += incy; +/* L90: */ + } + goto L140; +L100: + dh11 = dparam[2]; + dh22 = dparam[5]; + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = dx[kx]; + z__ = dy[ky]; + dx[kx] = w * dh11 + z__; + dy[ky] = -w + dh22 * z__; + kx += incx; + ky += incy; +/* L110: */ + } + goto L140; +L120: + dh11 = dparam[2]; + dh12 = dparam[4]; + dh21 = dparam[3]; + dh22 = dparam[5]; + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = dx[kx]; + z__ = dy[ky]; + dx[kx] = w * dh11 + z__ * dh12; + dy[ky] = w * dh21 + z__ * dh22; + kx += incx; + ky += incy; +/* L130: */ + } +L140: + return; +} + diff --git a/interface/rotmg.c b/interface/rotmg.c new file mode 100644 index 0000000..c37c099 --- /dev/null +++ b/interface/rotmg.c @@ -0,0 +1,199 @@ +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#define GAM 4096.e0 +#define GAMSQ 16777216.e0 +#define RGAMSQ 5.9604645e-8 + +#ifndef CBLAS + +void NAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT *DY1, FLOAT *dparam){ + + FLOAT dy1 = *DY1; + +#else + +void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ + +#endif + + FLOAT du, dp1, dp2, dq2, dq1, dh11, dh21, dh12, dh22; + int igo, flag; + FLOAT dtemp; + +#ifndef CBLAS + PRINT_DEBUG_NAME; +#else + PRINT_DEBUG_CNAME; +#endif + + dh11 = ZERO; + dh12 = ZERO; + dh21 = ZERO; + dh22 = ZERO; + + if (*dd1 < ZERO) goto L60; + + dp2 = *dd2 * dy1; + + if (dp2 == ZERO) { + flag = -2; + goto L260; + } + + dp1 = *dd1 * *dx1; + dq2 = dp2 * dy1; + dq1 = dp1 * *dx1; + + if (! (abs(dq1) > abs(dq2))) goto L40; + + dh21 = -(dy1) / *dx1; + dh12 = dp2 / dp1; + + du = ONE - dh12 * dh21; + + if (du <= ZERO) goto L60; + + flag = 0; + *dd1 /= du; + *dd2 /= du; + *dx1 *= du; + + goto L100; + +L40: + if (dq2 < ZERO) goto L60; + + flag = 1; + dh11 = dp1 / dp2; + dh22 = *dx1 / dy1; + du = ONE + dh11 * dh22; + dtemp = *dd2 / du; + *dd2 = *dd1 / du; + *dd1 = dtemp; + *dx1 = dy1 * du; + goto L100; + +L60: + flag = -1; + dh11 = ZERO; + dh12 = ZERO; + dh21 = ZERO; + dh22 = ZERO; + + *dd1 = ZERO; + *dd2 = ZERO; + *dx1 = ZERO; + goto L220; + + +L70: + if (flag < 0) goto L90; + + if (flag > 0) goto L80; + + dh11 = ONE; + dh22 = ONE; + flag = -1; + goto L90; + +L80: + dh21 = -ONE; + dh12 = ONE; + flag = -1; + +L90: + switch (igo) { + case 0: goto L120; + case 1: goto L150; + case 2: goto L180; + case 3: goto L210; + } + +L100: + if (!(*dd1 <= RGAMSQ)) goto L130; + if (*dd1 == ZERO) goto L160; + igo = 0; + goto L70; + +L120: + *dd1 *= GAM * GAM; + *dx1 /= GAM; + dh11 /= GAM; + dh12 /= GAM; + goto L100; + +L130: + if (! (*dd1 >= GAMSQ)) { + goto L160; + } + igo = 1; + goto L70; + +L150: + *dd1 /= GAM * GAM; + *dx1 *= GAM; + dh11 *= GAM; + dh12 *= GAM; + goto L130; + +L160: + if (! (abs(*dd2) <= RGAMSQ)) { + goto L190; + } + if (*dd2 == ZERO) { + goto L220; + } + igo = 2; + goto L70; + +L180: +/* Computing 2nd power */ + *dd2 *= GAM * GAM; + dh21 /= GAM; + dh22 /= GAM; + goto L160; + +L190: + if (! (abs(*dd2) >= GAMSQ)) { + goto L220; + } + igo = 3; + goto L70; + +L210: +/* Computing 2nd power */ + *dd2 /= GAM * GAM; + dh21 *= GAM; + dh22 *= GAM; + goto L190; + +L220: + if (flag < 0) { + goto L250; + } else if (flag == 0) { + goto L230; + } else { + goto L240; + } +L230: + dparam[2] = dh21; + dparam[3] = dh12; + goto L260; +L240: + dparam[2] = dh11; + dparam[4] = dh22; + goto L260; +L250: + dparam[1] = dh11; + dparam[2] = dh21; + dparam[3] = dh12; + dparam[4] = dh22; +L260: + dparam[0] = (FLOAT) flag; + return; +} + + diff --git a/interface/sbmv.c b/interface/sbmv.c new file mode 100644 index 0000000..2ffe7f1 --- /dev/null +++ b/interface/sbmv.c @@ -0,0 +1,215 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QSBMV " +#elif defined(DOUBLE) +#define ERROR_NAME "DSBMV " +#else +#define ERROR_NAME "SSBMV " +#endif + +static int (*sbmv[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + qsbmv_U, qsbmv_L, +#elif defined(DOUBLE) + dsbmv_U, dsbmv_L, +#else + ssbmv_U, ssbmv_L, +#endif +}; + +#ifdef SMP +static int (*sbmv_thread[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + qsbmv_thread_U, qsbmv_thread_L, +#elif defined(DOUBLE) + dsbmv_thread_U, dsbmv_thread_L, +#else + ssbmv_thread_U, ssbmv_thread_L, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *LDA, + FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ + + char uplo_arg = *UPLO; + blasint n = *N; + blasint k = *K; + FLOAT alpha = *ALPHA; + blasint lda = *LDA; + blasint incx = *INCX; + FLOAT beta = *BETA; + blasint incy = *INCY; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incy == 0) info = 11; + if (incx == 0) info = 8; + if (lda < k + 1) info = 6; + if (k < 0) info = 3; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + enum CBLAS_UPLO Uplo, + blasint n, blasint k, + FLOAT alpha, + FLOAT *a, blasint lda, + FLOAT *x, blasint incx, + FLOAT beta, + FLOAT *y, blasint incy){ + + FLOAT *buffer; + int uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (incy == 0) info = 11; + if (incx == 0) info = 8; + if (lda < k + 1) info = 6; + if (k < 0) info = 3; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + info = -1; + + if (incy == 0) info = 11; + if (incx == 0) info = 8; + if (lda < k + 1) info = 6; + if (k < 0) info = 3; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); + + if (alpha == ZERO) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + if (incy < 0 ) y -= (n - 1) * incy; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (sbmv[uplo])(n, k, alpha, a, lda, x, incx, y, incy, buffer); + +#ifdef SMP + } else { + + (sbmv_thread[uplo])(n, k, alpha, a, lda, x, incx, y, incy, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, n * k / 2 + n, n * k); + + IDEBUG_END; + + return; +} diff --git a/interface/scal.c b/interface/scal.c new file mode 100644 index 0000000..7b72ca0 --- /dev/null +++ b/interface/scal.c @@ -0,0 +1,112 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX){ + + blasint n = *N; + blasint incx = *INCX; + FLOAT alpha = *ALPHA; + +#else + +void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){ + +#endif + +#ifdef SMP + int mode, nthreads; +#endif + +#ifndef CBLAS + PRINT_DEBUG_NAME; +#else + PRINT_DEBUG_CNAME; +#endif + + if (incx <= 0 || n <= 0) return; + + if (alpha == ONE) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + +#ifdef SMP + nthreads = num_cpu_avail(1); + + if (nthreads == 1) { +#endif + + SCAL_K(n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0); + +#ifdef SMP + } else { + +#ifdef DOUBLE + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif + + blas_level1_thread(mode, n, 0, 0, +#ifndef CBLAS + ALPHA, +#else + &alpha, +#endif + x, incx, NULL, 0, NULL, 0, (void *)SCAL_K, nthreads); + + } +#endif + + FUNCTION_PROFILE_END(1, n, n); + + IDEBUG_END; + + return; + +} diff --git a/interface/sdsdot.c b/interface/sdsdot.c new file mode 100644 index 0000000..8540be6 --- /dev/null +++ b/interface/sdsdot.c @@ -0,0 +1,101 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +FLOATRET NAME(blasint *N, FLOAT *a, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ + + BLASLONG n = *N; + BLASLONG incx = *INCX; + BLASLONG incy = *INCY; + FLOATRET ret; + + PRINT_DEBUG_NAME; + + if (n <= 0) return 0.; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx; + if (incy < 0) y -= (n - 1) * incy; + + ret = (FLOATRET)(SDSDOT_K(n, x, incx, y, incy) + *a); + + FUNCTION_PROFILE_END(1, 2 * n, 2 * n); + + IDEBUG_END; + + return ret; + +} + +#else + +FLOAT CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ + + FLOAT ret; + + PRINT_DEBUG_CNAME; + + if (n <= 0) return 0.; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx; + if (incy < 0) y -= (n - 1) * incy; + + ret = SDSDOT_K(n, x, incx, y, incy) + alpha; + + FUNCTION_PROFILE_END(1, 2 * n, 2 * n); + + IDEBUG_END; + + return ret; +} + +#endif diff --git a/interface/spmv.c b/interface/spmv.c new file mode 100644 index 0000000..8d89027 --- /dev/null +++ b/interface/spmv.c @@ -0,0 +1,207 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QSPMV " +#elif defined(DOUBLE) +#define ERROR_NAME "DSPMV " +#else +#define ERROR_NAME "SSPMV " +#endif + +static int (*spmv[])(BLASLONG, FLOAT, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + qspmv_U, qspmv_L, +#elif defined(DOUBLE) + dspmv_U, dspmv_L, +#else + sspmv_U, sspmv_L, +#endif +}; + +#ifdef SMP +static int (*spmv_thread[])(BLASLONG, FLOAT, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + qspmv_thread_U, qspmv_thread_L, +#elif defined(DOUBLE) + dspmv_thread_U, dspmv_thread_L, +#else + sspmv_thread_U, sspmv_thread_L, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, + FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha = *ALPHA; + blasint incx = *INCX; + FLOAT beta = *BETA; + blasint incy = *INCY; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incy == 0) info = 9; + if (incx == 0) info = 6; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + enum CBLAS_UPLO Uplo, + blasint n, + FLOAT alpha, + FLOAT *a, + FLOAT *x, blasint incx, + FLOAT beta, + FLOAT *y, blasint incy){ + + FLOAT *buffer; + int uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (incy == 0) info = 9; + if (incx == 0) info = 6; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + info = -1; + + if (incy == 0) info = 9; + if (incx == 0) info = 6; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); + + if (alpha == ZERO) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + if (incy < 0 ) y -= (n - 1) * incy; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (spmv[uplo])(n, alpha, a, x, incx, y, incy, buffer); + +#ifdef SMP + } else { + + (spmv_thread[uplo])(n, alpha, a, x, incx, y, incy, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/spr.c b/interface/spr.c new file mode 100644 index 0000000..aa2ff8f --- /dev/null +++ b/interface/spr.c @@ -0,0 +1,197 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QSPR " +#elif defined(DOUBLE) +#define ERROR_NAME "DSPR " +#else +#define ERROR_NAME "SSPR " +#endif + +static int (*spr[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, FLOAT *) = { +#ifdef XDOUBLE + qspr_U, qspr_L, +#elif defined(DOUBLE) + dspr_U, dspr_L, +#else + sspr_U, sspr_L, +#endif +}; + +#ifdef SMP +static int (*spr_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, FLOAT *, int) = { +#ifdef XDOUBLE + qspr_thread_U, qspr_thread_L, +#elif defined(DOUBLE) + dspr_thread_U, dspr_thread_L, +#else + sspr_thread_U, sspr_thread_L, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, + FLOAT *x, blasint *INCX, FLOAT *a){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha = *ALPHA; + blasint incx = *INCX; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + enum CBLAS_UPLO Uplo, + blasint n, + FLOAT alpha, + FLOAT *x, blasint incx, + FLOAT *a) { + + FLOAT *buffer; + int uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + info = -1; + + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if (alpha == ZERO) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (spr[uplo])(n, alpha, x, incx, a, buffer); + +#ifdef SMP + } else { + + (spr_thread[uplo])(n, alpha, x, incx, a, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/spr2.c b/interface/spr2.c new file mode 100644 index 0000000..e556d3f --- /dev/null +++ b/interface/spr2.c @@ -0,0 +1,203 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QSPR2 " +#elif defined(DOUBLE) +#define ERROR_NAME "DSPR2 " +#else +#define ERROR_NAME "SSPR2 " +#endif + +static int (*spr2[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, FLOAT *) = { +#ifdef XDOUBLE + qspr2_U, qspr2_L, +#elif defined(DOUBLE) + dspr2_U, dspr2_L, +#else + sspr2_U, sspr2_L, +#endif +}; + +#ifdef SMP +static int (*spr2_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, FLOAT *, int) = { +#ifdef XDOUBLE + qspr2_thread_U, qspr2_thread_L, +#elif defined(DOUBLE) + dspr2_thread_U, dspr2_thread_L, +#else + sspr2_thread_U, sspr2_thread_L, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, + FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha = *ALPHA; + blasint incx = *INCX; + blasint incy = *INCY; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + enum CBLAS_UPLO Uplo, + blasint n, + FLOAT alpha, + FLOAT *x, blasint incx, + FLOAT *y, blasint incy, + FLOAT *a) { + + FLOAT *buffer; + int uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + info = -1; + + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if (alpha == ZERO) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + if (incy < 0 ) y -= (n - 1) * incy; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (spr2[uplo])(n, alpha, x, incx, y, incy, a, buffer); + +#ifdef SMP + } else { + + (spr2_thread[uplo])(n, alpha, x, incx, y, incy, a, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, n * n / 2 + 2 * n, 2 * n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/swap.c b/interface/swap.c new file mode 100644 index 0000000..7676246 --- /dev/null +++ b/interface/swap.c @@ -0,0 +1,110 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ + + blasint n = *N; + blasint incx = *INCX; + blasint incy = *INCY; + +#else + +void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ + +#endif + +#ifdef SMP + int mode, nthreads; + FLOAT dummyalpha[2] = {ZERO, ZERO}; +#endif + +#ifndef CBLAS + PRINT_DEBUG_NAME; +#else + PRINT_DEBUG_CNAME; +#endif + + if (n <= 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx; + if (incy < 0) y -= (n - 1) * incy; + +#ifdef SMP + nthreads = num_cpu_avail(1); + + if (nthreads == 1) { +#endif + + SWAP_K(n, 0, 0, ZERO, x, incx, y, incy, NULL, 0); + +#ifdef SMP + } else { + +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif + + blas_level1_thread(mode, n, 0, 0, dummyalpha, + x, incx, y, incy, NULL, 0, (void *)SWAP_K, nthreads); + } + +#endif + + FUNCTION_PROFILE_END(1, 2 * n, 0); + + IDEBUG_END; + + return; + +} diff --git a/interface/symm.c b/interface/symm.c new file mode 100644 index 0000000..a0d52c4 --- /dev/null +++ b/interface/symm.c @@ -0,0 +1,422 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef COMPLEX +#ifdef XDOUBLE +#define ERROR_NAME "QSYMM " +#elif defined(DOUBLE) +#define ERROR_NAME "DSYMM " +#else +#define ERROR_NAME "SSYMM " +#endif +#else +#ifndef GEMM3M +#ifndef HEMM +#ifdef XDOUBLE +#define ERROR_NAME "XSYMM " +#elif defined(DOUBLE) +#define ERROR_NAME "ZSYMM " +#else +#define ERROR_NAME "CSYMM " +#endif +#else +#ifdef XDOUBLE +#define ERROR_NAME "XHEMM " +#elif defined(DOUBLE) +#define ERROR_NAME "ZHEMM " +#else +#define ERROR_NAME "CHEMM " +#endif +#endif +#else +#ifndef HEMM +#ifdef XDOUBLE +#define ERROR_NAME "XSYMM3M " +#elif defined(DOUBLE) +#define ERROR_NAME "ZSYMM3M " +#else +#define ERROR_NAME "CSYMM3M " +#endif +#else +#ifdef XDOUBLE +#define ERROR_NAME "XHEMM3M " +#elif defined(DOUBLE) +#define ERROR_NAME "ZHEMM3M " +#else +#define ERROR_NAME "CHEMM3M " +#endif +#endif +#endif +#endif + +static int (*symm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { +#ifndef GEMM3M +#ifndef HEMM + SYMM_LU, SYMM_LL, SYMM_RU, SYMM_RL, +#if defined(SMP) && !defined(USE_SIMPLE_THREADED_LEVEL3) + SYMM_THREAD_LU, SYMM_THREAD_LL, SYMM_THREAD_RU, SYMM_THREAD_RL, +#endif +#else + HEMM_LU, HEMM_LL, HEMM_RU, HEMM_RL, +#if defined(SMP) && !defined(USE_SIMPLE_THREADED_LEVEL3) + HEMM_THREAD_LU, HEMM_THREAD_LL, HEMM_THREAD_RU, HEMM_THREAD_RL, +#endif +#endif +#else +#ifndef HEMM + SYMM3M_LU, SYMM3M_LL, SYMM3M_RU, SYMM3M_RL, +#if defined(SMP) && !defined(USE_SIMPLE_THREADED_LEVEL3) + SYMM3M_THREAD_LU, SYMM3M_THREAD_LL, SYMM3M_THREAD_RU, SYMM3M_THREAD_RL, +#endif +#else + HEMM3M_LU, HEMM3M_LL, HEMM3M_RU, HEMM3M_RL, +#if defined(SMP) && !defined(USE_SIMPLE_THREADED_LEVEL3) + HEMM3M_THREAD_LU, HEMM3M_THREAD_LL, HEMM3M_THREAD_RU, HEMM3M_THREAD_RL, +#endif +#endif +#endif +}; + +#ifndef CBLAS + +void NAME(char *SIDE, char *UPLO, + blasint *M, blasint *N, + FLOAT *alpha, FLOAT *a, blasint *ldA, + FLOAT *b, blasint *ldB, + FLOAT *beta, FLOAT *c, blasint *ldC){ + + char side_arg = *SIDE; + char uplo_arg = *UPLO; + + blas_arg_t args; + + FLOAT *buffer; + FLOAT *sa, *sb; + +#ifdef SMP +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#endif + +#if defined(SMP) && !defined(NO_AFFINITY) + int nodes; +#endif + + blasint info; + int side; + int uplo; + + PRINT_DEBUG_NAME; + + args.alpha = (void *)alpha; + args.beta = (void *)beta; + + TOUPPER(side_arg); + TOUPPER(uplo_arg); + + side = -1; + uplo = -1; + + if (side_arg == 'L') side = 0; + if (side_arg == 'R') side = 1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + args.m = *M; + args.n = *N; + + args.c = (void *)c; + args.ldc = *ldC; + + info = 0; + + if (args.ldc < MAX(1, args.m)) info = 12; + + if (!side) { + args.a = (void *)a; + args.b = (void *)b; + + args.lda = *ldA; + args.ldb = *ldB; + + if (args.ldb < MAX(1, args.m)) info = 9; + if (args.lda < MAX(1, args.m)) info = 7; + + } else { + args.a = (void *)b; + args.b = (void *)a; + + args.lda = *ldB; + args.ldb = *ldA; + + if (args.lda < MAX(1, args.m)) info = 9; + if (args.ldb < MAX(1, args.n)) info = 7; + } + + if (args.n < 0) info = 4; + if (args.m < 0) info = 3; + if (uplo < 0) info = 2; + if (side < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, + blasint m, blasint n, +#ifndef COMPLEX + FLOAT alpha, +#else + FLOAT *alpha, +#endif + FLOAT *a, blasint lda, + FLOAT *b, blasint ldb, +#ifndef COMPLEX + FLOAT beta, +#else + FLOAT *beta, +#endif + FLOAT *c, blasint ldc) { + + blas_arg_t args; + int side, uplo; + blasint info; + + FLOAT *buffer; + FLOAT *sa, *sb; + +#ifdef SMP +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#endif + +#if defined(SMP) && !defined(NO_AFFINITY) + int nodes; +#endif + + PRINT_DEBUG_CNAME; + +#ifndef COMPLEX + args.alpha = (void *)α + args.beta = (void *)β +#else + args.alpha = (void *)alpha; + args.beta = (void *)beta; +#endif + + args.c = (void *)c; + args.ldc = ldc; + + side = -1; + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + if (Side == CblasLeft) side = 0; + if (Side == CblasRight) side = 1; + + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + args.m = m; + args.n = n; + + if (args.ldc < MAX(1, args.m)) info = 12; + + if (!side) { + args.a = (void *)a; + args.b = (void *)b; + + args.lda = lda; + args.ldb = ldb; + + if (args.ldb < MAX(1, args.m)) info = 9; + if (args.lda < MAX(1, args.m)) info = 7; + + } else { + args.a = (void *)b; + args.b = (void *)a; + + args.lda = ldb; + args.ldb = lda; + + if (args.lda < MAX(1, args.m)) info = 9; + if (args.ldb < MAX(1, args.n)) info = 7; + } + + if (args.n < 0) info = 4; + if (args.m < 0) info = 3; + if (uplo < 0) info = 2; + if (side < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Side == CblasLeft) side = 1; + if (Side == CblasRight) side = 0; + + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + info = -1; + + args.m = n; + args.n = m; + + if (args.ldc < MAX(1, args.m)) info = 12; + + if (!side) { + args.a = (void *)a; + args.b = (void *)b; + + args.lda = lda; + args.ldb = ldb; + + if (args.ldb < MAX(1, args.m)) info = 9; + if (args.lda < MAX(1, args.m)) info = 7; + + } else { + args.a = (void *)b; + args.b = (void *)a; + + args.lda = ldb; + args.ldb = lda; + + if (args.lda < MAX(1, args.m)) info = 9; + if (args.ldb < MAX(1, args.n)) info = 7; + } + + if (args.n < 0) info = 4; + if (args.m < 0) info = 3; + if (uplo < 0) info = 2; + if (side < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (args.m == 0 || args.n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + buffer = (FLOAT *)blas_memory_alloc(0); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + +#ifdef SMP + args.common = NULL; + args.nthreads = num_cpu_avail(3); + + if (args.nthreads == 1) { +#endif + + (symm[(side << 1) | uplo ])(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + + } else { + +#ifndef NO_AFFINITY + nodes = get_num_nodes(); + + if (nodes > 1) { + + args.nthreads /= nodes; + + gemm_thread_mn(mode, &args, NULL, NULL, + symm[4 | (side << 1) | uplo ], sa, sb, nodes); + + } else { +#endif + +#ifndef USE_SIMPLE_THREADED_LEVEL3 + + (symm[4 | (side << 1) | uplo ])(&args, NULL, NULL, sa, sb, 0); + +#else + + GEMM_THREAD(mode, &args, NULL, NULL, symm[(side << 1) | uplo ], sa, sb, args.nthreads); + +#endif + +#ifndef NO_AFFINITY + } +#endif + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, + (!side)? args.m * (args.m / 2 + args.n) : args.n * (args.m + args.n / 2), + (!side)? 2 * args.m * args.m * args.n : 2 * args.m * args.n * args.n); + + IDEBUG_END; + + return; +} diff --git a/interface/symv.c b/interface/symv.c new file mode 100644 index 0000000..e8c24df --- /dev/null +++ b/interface/symv.c @@ -0,0 +1,205 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QSYMV " +#elif defined(DOUBLE) +#define ERROR_NAME "DSYMV " +#else +#define ERROR_NAME "SSYMV " +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, + FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha = *ALPHA; + blasint lda = *LDA; + blasint incx = *INCX; + FLOAT beta = *BETA; + blasint incy = *INCY; + + int (*symv[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { + SYMV_U, SYMV_L, + }; + +#ifdef SMP + int (*symv_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { + SYMV_THREAD_U, SYMV_THREAD_L, + }; +#endif + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incy == 0) info = 10; + if (incx == 0) info = 7; + if (lda < MAX(1, n)) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, + FLOAT *a, blasint lda, FLOAT *x, blasint incx, FLOAT beta, FLOAT *y, blasint incy) { + + FLOAT *buffer; + int uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + int (*symv[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { + SYMV_U, SYMV_L, + }; + +#ifdef SMP + int (*symv_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { + SYMV_THREAD_U, SYMV_THREAD_L, + }; +#endif + + PRINT_DEBUG_CNAME; + + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (incy == 0) info = 10; + if (incx == 0) info = 7; + if (lda < MAX(1, n)) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + info = -1; + + if (incy == 0) info = 10; + if (incx == 0) info = 7; + if (lda < MAX(1, n)) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); + + if (alpha == ZERO) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + if (incy < 0 ) y -= (n - 1) * incy; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (symv[uplo])(n, n, alpha, a, lda, x, incx, y, incy, buffer); + +#ifdef SMP + } else { + + (symv_thread[uplo])(n, alpha, a, lda, x, incx, y, incy, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, n * n / 2 + 2 * n, 2 * n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/syr.c b/interface/syr.c new file mode 100644 index 0000000..2b2d3d1 --- /dev/null +++ b/interface/syr.c @@ -0,0 +1,200 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QSYR " +#elif defined(DOUBLE) +#define ERROR_NAME "DSYR " +#else +#define ERROR_NAME "SSYR " +#endif + +static int (*syr[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { +#ifdef XDOUBLE + qsyr_U, qsyr_L, +#elif defined(DOUBLE) + dsyr_U, dsyr_L, +#else + ssyr_U, ssyr_L, +#endif +}; + +#ifdef SMP +static int (*syr_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + qsyr_thread_U, qsyr_thread_L, +#elif defined(DOUBLE) + dsyr_thread_U, dsyr_thread_L, +#else + ssyr_thread_U, ssyr_thread_L, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, + FLOAT *x, blasint *INCX, FLOAT *a, blasint *LDA){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha = *ALPHA; + blasint lda = *LDA; + blasint incx = *INCX; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (lda < MAX(1, n)) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) { + + FLOAT *buffer; + int trans, uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + trans = -1; + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (lda < MAX(1, n)) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + } + + if (order == CblasRowMajor) { + + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + info = -1; + + if (lda < MAX(1, n)) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if (alpha == ZERO) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (syr[uplo])(n, alpha, x, incx, a, lda, buffer); + +#ifdef SMP + } else { + + (syr_thread[uplo])(n, alpha, x, incx, a, lda, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/syr2.c b/interface/syr2.c new file mode 100644 index 0000000..15dbae4 --- /dev/null +++ b/interface/syr2.c @@ -0,0 +1,204 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QSYR2 " +#elif defined(DOUBLE) +#define ERROR_NAME "DSYR2 " +#else +#define ERROR_NAME "SSYR2 " +#endif + +static int (*syr2[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { +#ifdef XDOUBLE + qsyr2_U, qsyr2_L, +#elif defined(DOUBLE) + dsyr2_U, dsyr2_L, +#else + ssyr2_U, ssyr2_L, +#endif +}; + +#ifdef SMP +static int (*syr2_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + qsyr2_thread_U, qsyr2_thread_L, +#elif defined(DOUBLE) + dsyr2_thread_U, dsyr2_thread_L, +#else + ssyr2_thread_U, ssyr2_thread_L, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, + FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a, blasint *LDA){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha = *ALPHA; + blasint lda = *LDA; + blasint incx = *INCX; + blasint incy = *INCY; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (lda < MAX(1, n)) info = 9; + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT *a, blasint lda) { + + FLOAT *buffer; + int trans, uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + trans = -1; + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (lda < MAX(1, n)) info = 9; + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + info = -1; + + if (lda < MAX(1, n)) info = 9; + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if (alpha == ZERO) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + if (incy < 0 ) y -= (n - 1) * incy; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (syr2[uplo])(n, alpha, x, incx, y, incy, a, lda, buffer); + +#ifdef SMP + } else { + + (syr2_thread[uplo])(n, alpha, x, incx, y, incy, a, lda, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, n * n / 2 + 2 * n, 2 * n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/syr2k.c b/interface/syr2k.c new file mode 100644 index 0000000..70b8409 --- /dev/null +++ b/interface/syr2k.c @@ -0,0 +1,366 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef COMPLEX +#ifdef XDOUBLE +#define ERROR_NAME "QSYR2K" +#elif defined(DOUBLE) +#define ERROR_NAME "DSYR2K" +#else +#define ERROR_NAME "SSYR2K" +#endif +#else +#ifndef HEMM +#ifdef XDOUBLE +#define ERROR_NAME "XSYR2K" +#elif defined(DOUBLE) +#define ERROR_NAME "ZSYR2K" +#else +#define ERROR_NAME "CSYR2K" +#endif +#else +#ifdef XDOUBLE +#define ERROR_NAME "XHER2K" +#elif defined(DOUBLE) +#define ERROR_NAME "ZHER2K" +#else +#define ERROR_NAME "CHER2K" +#endif +#endif +#endif + +static int (*syr2k[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { +#ifndef HEMM + SYR2K_UN, SYR2K_UC, SYR2K_LN, SYR2K_LC, +#else + HER2K_UN, HER2K_UC, HER2K_LN, HER2K_LC, +#endif +}; + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANS, + blasint *N, blasint *K, + FLOAT *alpha, FLOAT *a, blasint *ldA, + FLOAT *b, blasint *ldB, + FLOAT *beta, FLOAT *c, blasint *ldC){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + + blas_arg_t args; + + FLOAT *buffer; + FLOAT *sa, *sb; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + blasint info; + int uplo; + int trans; + int nrowa; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.k = *K; + + args.a = (void *)a; + args.b = (void *)b; + args.c = (void *)c; + + args.lda = *ldA; + args.ldb = *ldB; + args.ldc = *ldC; + + args.alpha = (void *)alpha; + args.beta = (void *)beta; + + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + + uplo = -1; + trans = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 0; + if (trans_arg == 'C') trans = 1; + + nrowa = args.n; + if (trans & 1) nrowa = args.k; + + info = 0; + + if (args.ldc < MAX(1,args.n)) info = 12; + if (args.ldb < MAX(1,nrowa)) info = 9; + if (args.lda < MAX(1,nrowa)) info = 7; + if (args.k < 0) info = 4; + if (args.n < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint n, blasint k, +#ifndef COMPLEX + FLOAT alpha, +#else + FLOAT *alpha, +#endif + FLOAT *a, blasint lda, + FLOAT *b, blasint ldb, +#if !defined(COMPLEX) || defined(HEMM) + FLOAT beta, +#else + FLOAT *beta, +#endif + FLOAT *c, blasint ldc) { + + blas_arg_t args; + int uplo, trans; + blasint info, nrowa; + + FLOAT *buffer; + FLOAT *sa, *sb; + +#ifdef HEMM + FLOAT CAlpha[2]; +#endif + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + PRINT_DEBUG_CNAME; + + args.n = n; + args.k = k; + + args.a = (void *)a; + args.b = (void *)b; + args.c = (void *)c; + + args.lda = lda; + args.ldb = ldb; + args.ldc = ldc; + +#ifndef COMPLEX + args.alpha = (void *)α +#else + args.alpha = (void *)alpha; +#endif + +#if !defined(COMPLEX) || defined(HEMM) + args.beta = (void *)β +#else + args.beta = (void *)beta; +#endif + + trans = -1; + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (Trans == CblasNoTrans) trans = 0; +#ifndef COMPLEX + if (Trans == CblasTrans) trans = 1; + if (Trans == CblasConjNoTrans) trans = 0; + if (Trans == CblasConjTrans) trans = 1; +#elif !defined(HEMM) + if (Trans == CblasTrans) trans = 1; +#else + if (Trans == CblasConjTrans) trans = 1; +#endif + + info = -1; + + nrowa = args.n; + if (trans & 1) nrowa = args.k; + + if (args.ldc < MAX(1,args.n)) info = 12; + if (args.ldb < MAX(1,nrowa)) info = 9; + if (args.lda < MAX(1,nrowa)) info = 7; + if (args.k < 0) info = 4; + if (args.n < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + +#ifdef HEMM + CAlpha[0] = alpha[0]; + CAlpha[1] = -alpha[1]; + + args.alpha = (void *)CAlpha; +#endif + + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (Trans == CblasNoTrans) trans = 1; +#ifndef COMPLEX + if (Trans == CblasTrans) trans = 0; + if (Trans == CblasConjNoTrans) trans = 1; + if (Trans == CblasConjTrans) trans = 0; +#elif !defined(HEMM) + if (Trans == CblasTrans) trans = 0; +#else + if (Trans == CblasConjTrans) trans = 0; +#endif + + info = -1; + + nrowa = args.n; + if (trans & 1) nrowa = args.k; + + if (args.ldc < MAX(1,args.n)) info = 12; + if (args.ldb < MAX(1,nrowa)) info = 9; + if (args.lda < MAX(1,nrowa)) info = 7; + if (args.k < 0) info = 4; + if (args.n < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (args.n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + buffer = (FLOAT *)blas_memory_alloc(0); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + +#ifdef SMP + if (!trans){ + mode |= (BLAS_TRANSA_N | BLAS_TRANSB_T); + } else { + mode |= (BLAS_TRANSA_T | BLAS_TRANSB_N); + } + + mode |= (uplo << BLAS_UPLO_SHIFT); + + args.common = NULL; + args.nthreads = num_cpu_avail(3); + + if (args.nthreads == 1) { +#endif + + (syr2k[(uplo << 1) | trans ])(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + + } else { + + syrk_thread(mode, &args, NULL, NULL, syr2k[(uplo << 1) | trans ], sa, sb, args.nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, 2 * args.n * args.k + args.n * args.n, 2 * args.n * args.n * args.k); + + IDEBUG_END; + + return; +} diff --git a/interface/syrk.c b/interface/syrk.c new file mode 100644 index 0000000..a0cc641 --- /dev/null +++ b/interface/syrk.c @@ -0,0 +1,355 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef COMPLEX +#ifdef XDOUBLE +#define ERROR_NAME "QSYRK " +#elif defined(DOUBLE) +#define ERROR_NAME "DSYRK " +#else +#define ERROR_NAME "SSYRK " +#endif +#else +#ifndef HEMM +#ifdef XDOUBLE +#define ERROR_NAME "XSYRK " +#elif defined(DOUBLE) +#define ERROR_NAME "ZSYRK " +#else +#define ERROR_NAME "CSYRK " +#endif +#else +#ifdef XDOUBLE +#define ERROR_NAME "XHERK " +#elif defined(DOUBLE) +#define ERROR_NAME "ZHERK " +#else +#define ERROR_NAME "CHERK " +#endif +#endif +#endif + +static int (*syrk[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { +#ifndef HEMM + SYRK_UN, SYRK_UC, SYRK_LN, SYRK_LC, +#if defined(SMP) && !defined(USE_SIMPLE_THREADED_LEVEL3) + SYRK_THREAD_UN, SYRK_THREAD_UC, SYRK_THREAD_LN, SYRK_THREAD_LC, +#endif +#else + HERK_UN, HERK_UC, HERK_LN, HERK_LC, +#if defined(SMP) && !defined(USE_SIMPLE_THREADED_LEVEL3) + HERK_THREAD_UN, HERK_THREAD_UC, HERK_THREAD_LN, HERK_THREAD_LC, +#endif +#endif +}; + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANS, + blasint *N, blasint *K, + FLOAT *alpha, FLOAT *a, blasint *ldA, + FLOAT *beta, FLOAT *c, blasint *ldC){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + + blas_arg_t args; + + FLOAT *buffer; + FLOAT *sa, *sb; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + blasint info; + int uplo; + int trans; + int nrowa; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.k = *K; + + args.a = (void *)a; + args.c = (void *)c; + + args.lda = *ldA; + args.ldc = *ldC; + + args.alpha = (void *)alpha; + args.beta = (void *)beta; + + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + + uplo = -1; + trans = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 0; + if (trans_arg == 'C') trans = 1; + + nrowa = args.n; + if (trans & 1) nrowa = args.k; + + info = 0; + + if (args.ldc < MAX(1,args.n)) info = 10; + if (args.lda < MAX(1,nrowa)) info = 7; + if (args.k < 0) info = 4; + if (args.n < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint n, blasint k, +#if !defined(COMPLEX) || defined(HEMM) + FLOAT alpha, +#else + FLOAT *alpha, +#endif + FLOAT *a, blasint lda, +#if !defined(COMPLEX) || defined(HEMM) + FLOAT beta, +#else + FLOAT *beta, +#endif + FLOAT *c, blasint ldc) { + + blas_arg_t args; + int uplo, trans; + blasint info, nrowa; + + FLOAT *buffer; + FLOAT *sa, *sb; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + PRINT_DEBUG_CNAME; + + args.n = n; + args.k = k; + + args.a = (void *)a; + args.c = (void *)c; + + args.lda = lda; + args.ldc = ldc; + +#if !defined(COMPLEX) || defined(HEMM) + args.alpha = (void *)α + args.beta = (void *)β +#else + args.alpha = (void *)alpha; + args.beta = (void *)beta; +#endif + + trans = -1; + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (Trans == CblasNoTrans) trans = 0; +#ifndef COMPLEX + if (Trans == CblasTrans) trans = 1; + if (Trans == CblasConjNoTrans) trans = 0; + if (Trans == CblasConjTrans) trans = 1; +#elif !defined(HEMM) + if (Trans == CblasTrans) trans = 1; +#else + if (Trans == CblasConjTrans) trans = 1; +#endif + + info = -1; + + nrowa = args.n; + if (trans & 1) nrowa = args.k; + + if (args.ldc < MAX(1,args.n)) info = 10; + if (args.lda < MAX(1,nrowa)) info = 7; + if (args.k < 0) info = 4; + if (args.n < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (Trans == CblasNoTrans) trans = 1; +#ifndef COMPLEX + if (Trans == CblasTrans) trans = 0; + if (Trans == CblasConjNoTrans) trans = 1; + if (Trans == CblasConjTrans) trans = 0; +#elif !defined(HEMM) + if (Trans == CblasTrans) trans = 0; +#else + if (Trans == CblasConjTrans) trans = 0; +#endif + + info = -1; + + nrowa = args.n; + if (trans & 1) nrowa = args.k; + + if (args.ldc < MAX(1,args.n)) info = 10; + if (args.lda < MAX(1,nrowa)) info = 7; + if (args.k < 0) info = 4; + if (args.n < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (args.n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + buffer = (FLOAT *)blas_memory_alloc(0); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + +#ifdef SMP + if (!trans){ + mode |= (BLAS_TRANSA_N | BLAS_TRANSB_T); + } else { + mode |= (BLAS_TRANSA_T | BLAS_TRANSB_N); + } + + mode |= (uplo << BLAS_UPLO_SHIFT); + + args.common = NULL; + args.nthreads = num_cpu_avail(3); + + if (args.nthreads == 1) { +#endif + + (syrk[(uplo << 1) | trans ])(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + + } else { + +#ifndef USE_SIMPLE_THREADED_LEVEL3 + + (syrk[4 | (uplo << 1) | trans ])(&args, NULL, NULL, sa, sb, 0); + +#else + + syrk_thread(mode, &args, NULL, NULL, syrk[(uplo << 1) | trans ], sa, sb, args.nthreads); + +#endif + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.n * args.k + args.n * args.n / 2, args.n * args.n * args.k); + + IDEBUG_END; + + return; +} diff --git a/interface/tbmv.c b/interface/tbmv.c new file mode 100644 index 0000000..cec2be4 --- /dev/null +++ b/interface/tbmv.c @@ -0,0 +1,248 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QTBMV " +#elif defined(DOUBLE) +#define ERROR_NAME "DTBMV " +#else +#define ERROR_NAME "STBMV " +#endif + +static int (*tbmv[])(BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + qtbmv_NUU, qtbmv_NUN, qtbmv_NLU, qtbmv_NLN, + qtbmv_TUU, qtbmv_TUN, qtbmv_TLU, qtbmv_TLN, +#elif defined(DOUBLE) + dtbmv_NUU, dtbmv_NUN, dtbmv_NLU, dtbmv_NLN, + dtbmv_TUU, dtbmv_TUN, dtbmv_TLU, dtbmv_TLN, +#else + stbmv_NUU, stbmv_NUN, stbmv_NLU, stbmv_NLN, + stbmv_TUU, stbmv_TUN, stbmv_TLU, stbmv_TLN, +#endif +}; + +#ifdef SMP +static int (*tbmv_thread[])(BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + qtbmv_thread_NUU, qtbmv_thread_NUN, qtbmv_thread_NLU, qtbmv_thread_NLN, + qtbmv_thread_TUU, qtbmv_thread_TUN, qtbmv_thread_TLU, qtbmv_thread_TLN, +#elif defined(DOUBLE) + dtbmv_thread_NUU, dtbmv_thread_NUN, dtbmv_thread_NLU, dtbmv_thread_NLN, + dtbmv_thread_TUU, dtbmv_thread_TUN, dtbmv_thread_TLU, dtbmv_thread_TLN, +#else + stbmv_thread_NUU, stbmv_thread_NUN, stbmv_thread_NLU, stbmv_thread_NLN, + stbmv_thread_TUU, stbmv_thread_TUN, stbmv_thread_TLU, stbmv_thread_TLN, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANS, char *DIAG, + blasint *N, blasint *K, + FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + char diag_arg = *DIAG; + + blasint n = *N; + blasint k = *K; + blasint lda = *LDA; + blasint incx = *INCX; + + blasint info; + int uplo; + int unit; + int trans; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + TOUPPER(diag_arg); + + trans = -1; + unit = -1; + uplo = -1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 0; + if (trans_arg == 'C') trans = 1; + + if (diag_arg == 'U') unit = 0; + if (diag_arg == 'N') unit = 1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incx == 0) info = 9; + if (lda < k + 1) info = 7; + if (k < 0) info = 5; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint n, blasint k, FLOAT *a, blasint lda, FLOAT *x, blasint incx) { + + int trans, uplo, unit; + blasint info; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + unit = -1; + uplo = -1; + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 0; + if (TransA == CblasConjTrans) trans = 1; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 9; + if (lda < k + 1) info = 7; + if (k < 0) info = 5; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 1; + if (TransA == CblasConjTrans) trans = 0; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 9; + if (lda < k + 1) info = 7; + if (k < 0) info = 5; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (tbmv[(trans<<2) | (uplo<<1) | unit])(n, k, a, lda, x, incx, buffer); + +#ifdef SMP + } else { + + (tbmv_thread[(trans<<2) | (uplo<<1) | unit])(n, k, a, lda, x, incx, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, n * k / 2 + n, n * k); + + IDEBUG_END; + + return; +} diff --git a/interface/tbsv.c b/interface/tbsv.c new file mode 100644 index 0000000..a07c4c5 --- /dev/null +++ b/interface/tbsv.c @@ -0,0 +1,213 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QTBSV " +#elif defined(DOUBLE) +#define ERROR_NAME "DTBSV " +#else +#define ERROR_NAME "STBSV " +#endif + +static int (*tbsv[])(BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + qtbsv_NUU, qtbsv_NUN, qtbsv_NLU, qtbsv_NLN, + qtbsv_TUU, qtbsv_TUN, qtbsv_TLU, qtbsv_TLN, +#elif defined(DOUBLE) + dtbsv_NUU, dtbsv_NUN, dtbsv_NLU, dtbsv_NLN, + dtbsv_TUU, dtbsv_TUN, dtbsv_TLU, dtbsv_TLN, +#else + stbsv_NUU, stbsv_NUN, stbsv_NLU, stbsv_NLN, + stbsv_TUU, stbsv_TUN, stbsv_TLU, stbsv_TLN, +#endif +}; + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANS, char *DIAG, + blasint *N, blasint *K, + FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + char diag_arg = *DIAG; + + blasint n = *N; + blasint k = *K; + blasint lda = *LDA; + blasint incx = *INCX; + + blasint info; + int uplo; + int unit; + int trans; + FLOAT *buffer; + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + TOUPPER(diag_arg); + + trans = -1; + unit = -1; + uplo = -1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 0; + if (trans_arg == 'C') trans = 1; + + if (diag_arg == 'U') unit = 0; + if (diag_arg == 'N') unit = 1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incx == 0) info = 9; + if (lda < k + 1) info = 7; + if (k < 0) info = 5; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint n, blasint k, FLOAT *a, blasint lda, FLOAT *x, blasint incx) { + + int trans, uplo, unit; + blasint info; + FLOAT *buffer; + + PRINT_DEBUG_CNAME; + + unit = -1; + uplo = -1; + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 0; + if (TransA == CblasConjTrans) trans = 1; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 9; + if (lda < k + 1) info = 7; + if (k < 0) info = 5; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 1; + if (TransA == CblasConjTrans) trans = 0; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 9; + if (lda < k + 1) info = 7; + if (k < 0) info = 5; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + + buffer = (FLOAT *)blas_memory_alloc(1); + + (tbsv[(trans<<2) | (uplo<<1) | unit])(n, k, a, lda, x, incx, buffer); + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, n * k / 2 + n, n * k); + + IDEBUG_END; + + return; +} diff --git a/interface/tpmv.c b/interface/tpmv.c new file mode 100644 index 0000000..f0fc4f7 --- /dev/null +++ b/interface/tpmv.c @@ -0,0 +1,239 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QTPMV " +#elif defined(DOUBLE) +#define ERROR_NAME "DTPMV " +#else +#define ERROR_NAME "STPMV " +#endif + +static int (*tpmv[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + qtpmv_NUU, qtpmv_NUN, qtpmv_NLU, qtpmv_NLN, + qtpmv_TUU, qtpmv_TUN, qtpmv_TLU, qtpmv_TLN, +#elif defined(DOUBLE) + dtpmv_NUU, dtpmv_NUN, dtpmv_NLU, dtpmv_NLN, + dtpmv_TUU, dtpmv_TUN, dtpmv_TLU, dtpmv_TLN, +#else + stpmv_NUU, stpmv_NUN, stpmv_NLU, stpmv_NLN, + stpmv_TUU, stpmv_TUN, stpmv_TLU, stpmv_TLN, +#endif +}; + +#ifdef SMP +static int (*tpmv_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + qtpmv_thread_NUU, qtpmv_thread_NUN, qtpmv_thread_NLU, qtpmv_thread_NLN, + qtpmv_thread_TUU, qtpmv_thread_TUN, qtpmv_thread_TLU, qtpmv_thread_TLN, +#elif defined(DOUBLE) + dtpmv_thread_NUU, dtpmv_thread_NUN, dtpmv_thread_NLU, dtpmv_thread_NLN, + dtpmv_thread_TUU, dtpmv_thread_TUN, dtpmv_thread_TLU, dtpmv_thread_TLN, +#else + stpmv_thread_NUU, stpmv_thread_NUN, stpmv_thread_NLU, stpmv_thread_NLN, + stpmv_thread_TUU, stpmv_thread_TUN, stpmv_thread_TLU, stpmv_thread_TLN, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANS, char *DIAG, + blasint *N, FLOAT *a, FLOAT *x, blasint *INCX){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + char diag_arg = *DIAG; + + blasint n = *N; + blasint incx = *INCX; + + blasint info; + int uplo; + int unit; + int trans; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + TOUPPER(diag_arg); + + trans = -1; + unit = -1; + uplo = -1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 0; + if (trans_arg == 'C') trans = 1; + + if (diag_arg == 'U') unit = 0; + if (diag_arg == 'N') unit = 1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incx == 0) info = 7; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint n, FLOAT *a, FLOAT *x, blasint incx) { + + int trans, uplo, unit; + blasint info; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + unit = -1; + uplo = -1; + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 0; + if (TransA == CblasConjTrans) trans = 1; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 7; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 1; + if (TransA == CblasConjTrans) trans = 0; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 7; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (tpmv[(trans<<2) | (uplo<<1) | unit])(n, a, x, incx, buffer); + +#ifdef SMP + } else { + + (tpmv_thread[(trans<<2) | (uplo<<1) | unit])(n, a, x, incx, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/tpsv.c b/interface/tpsv.c new file mode 100644 index 0000000..9dafd0b --- /dev/null +++ b/interface/tpsv.c @@ -0,0 +1,204 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QTPSV " +#elif defined(DOUBLE) +#define ERROR_NAME "DTPSV " +#else +#define ERROR_NAME "STPSV " +#endif + +static int (*tpsv[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + qtpsv_NUU, qtpsv_NUN, qtpsv_NLU, qtpsv_NLN, + qtpsv_TUU, qtpsv_TUN, qtpsv_TLU, qtpsv_TLN, +#elif defined(DOUBLE) + dtpsv_NUU, dtpsv_NUN, dtpsv_NLU, dtpsv_NLN, + dtpsv_TUU, dtpsv_TUN, dtpsv_TLU, dtpsv_TLN, +#else + stpsv_NUU, stpsv_NUN, stpsv_NLU, stpsv_NLN, + stpsv_TUU, stpsv_TUN, stpsv_TLU, stpsv_TLN, +#endif +}; + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANS, char *DIAG, + blasint *N, FLOAT *a, FLOAT *x, blasint *INCX){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + char diag_arg = *DIAG; + + blasint n = *N; + blasint incx = *INCX; + + blasint info; + int uplo; + int unit; + int trans; + FLOAT *buffer; + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + TOUPPER(diag_arg); + + trans = -1; + unit = -1; + uplo = -1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 0; + if (trans_arg == 'C') trans = 1; + + if (diag_arg == 'U') unit = 0; + if (diag_arg == 'N') unit = 1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incx == 0) info = 7; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint n, FLOAT *a, FLOAT *x, blasint incx) { + + int trans, uplo, unit; + blasint info; + FLOAT *buffer; + + PRINT_DEBUG_CNAME; + + unit = -1; + uplo = -1; + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 0; + if (TransA == CblasConjTrans) trans = 1; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 7; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 1; + if (TransA == CblasConjTrans) trans = 0; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 7; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + + buffer = (FLOAT *)blas_memory_alloc(1); + + (tpsv[(trans<<2) | (uplo<<1) | unit])(n, a, x, incx, buffer); + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/trmv.c b/interface/trmv.c new file mode 100644 index 0000000..ed23ced --- /dev/null +++ b/interface/trmv.c @@ -0,0 +1,243 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QTRMV " +#elif defined(DOUBLE) +#define ERROR_NAME "DTRMV " +#else +#define ERROR_NAME "STRMV " +#endif + +static int (*trmv[])(BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { +#ifdef XDOUBLE + qtrmv_NUU, qtrmv_NUN, qtrmv_NLU, qtrmv_NLN, + qtrmv_TUU, qtrmv_TUN, qtrmv_TLU, qtrmv_TLN, +#elif defined(DOUBLE) + dtrmv_NUU, dtrmv_NUN, dtrmv_NLU, dtrmv_NLN, + dtrmv_TUU, dtrmv_TUN, dtrmv_TLU, dtrmv_TLN, +#else + strmv_NUU, strmv_NUN, strmv_NLU, strmv_NLN, + strmv_TUU, strmv_TUN, strmv_TLU, strmv_TLN, +#endif +}; + +#ifdef SMP +static int (*trmv_thread[])(BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + qtrmv_thread_NUU, qtrmv_thread_NUN, qtrmv_thread_NLU, qtrmv_thread_NLN, + qtrmv_thread_TUU, qtrmv_thread_TUN, qtrmv_thread_TLU, qtrmv_thread_TLN, +#elif defined(DOUBLE) + dtrmv_thread_NUU, dtrmv_thread_NUN, dtrmv_thread_NLU, dtrmv_thread_NLN, + dtrmv_thread_TUU, dtrmv_thread_TUN, dtrmv_thread_TLU, dtrmv_thread_TLN, +#else + strmv_thread_NUU, strmv_thread_NUN, strmv_thread_NLU, strmv_thread_NLN, + strmv_thread_TUU, strmv_thread_TUN, strmv_thread_TLU, strmv_thread_TLN, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANS, char *DIAG, + blasint *N, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + char diag_arg = *DIAG; + + blasint n = *N; + blasint lda = *LDA; + blasint incx = *INCX; + + blasint info; + int uplo; + int unit; + int trans; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + TOUPPER(diag_arg); + + trans = -1; + unit = -1; + uplo = -1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 0; + if (trans_arg == 'C') trans = 1; + + if (diag_arg == 'U') unit = 0; + if (diag_arg == 'N') unit = 1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incx == 0) info = 8; + if (lda < MAX(1, n)) info = 6; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) { + + int trans, uplo, unit; + blasint info; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + unit = -1; + uplo = -1; + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 0; + if (TransA == CblasConjTrans) trans = 1; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 8; + if (lda < MAX(1, n)) info = 6; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 1; + if (TransA == CblasConjTrans) trans = 0; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 8; + if (lda < MAX(1, n)) info = 6; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (trmv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer); + +#ifdef SMP + } else { + + (trmv_thread[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/trsm.c b/interface/trsm.c new file mode 100644 index 0000000..5836ce2 --- /dev/null +++ b/interface/trsm.c @@ -0,0 +1,391 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef TRMM +#ifndef COMPLEX +#ifdef XDOUBLE +#define ERROR_NAME "QTRSM " +#elif defined(DOUBLE) +#define ERROR_NAME "DTRSM " +#else +#define ERROR_NAME "STRSM " +#endif +#else +#ifdef XDOUBLE +#define ERROR_NAME "XTRSM " +#elif defined(DOUBLE) +#define ERROR_NAME "ZTRSM " +#else +#define ERROR_NAME "CTRSM " +#endif +#endif +#else +#ifndef COMPLEX +#ifdef XDOUBLE +#define ERROR_NAME "QTRMM " +#elif defined(DOUBLE) +#define ERROR_NAME "DTRMM " +#else +#define ERROR_NAME "STRMM " +#endif +#else +#ifdef XDOUBLE +#define ERROR_NAME "XTRMM " +#elif defined(DOUBLE) +#define ERROR_NAME "ZTRMM " +#else +#define ERROR_NAME "CTRMM " +#endif +#endif +#endif + +static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { +#ifndef TRMM + TRSM_LNUU, TRSM_LNUN, TRSM_LNLU, TRSM_LNLN, + TRSM_LTUU, TRSM_LTUN, TRSM_LTLU, TRSM_LTLN, + TRSM_LRUU, TRSM_LRUN, TRSM_LRLU, TRSM_LRLN, + TRSM_LCUU, TRSM_LCUN, TRSM_LCLU, TRSM_LCLN, + TRSM_RNUU, TRSM_RNUN, TRSM_RNLU, TRSM_RNLN, + TRSM_RTUU, TRSM_RTUN, TRSM_RTLU, TRSM_RTLN, + TRSM_RRUU, TRSM_RRUN, TRSM_RRLU, TRSM_RRLN, + TRSM_RCUU, TRSM_RCUN, TRSM_RCLU, TRSM_RCLN, +#else + TRMM_LNUU, TRMM_LNUN, TRMM_LNLU, TRMM_LNLN, + TRMM_LTUU, TRMM_LTUN, TRMM_LTLU, TRMM_LTLN, + TRMM_LRUU, TRMM_LRUN, TRMM_LRLU, TRMM_LRLN, + TRMM_LCUU, TRMM_LCUN, TRMM_LCLU, TRMM_LCLN, + TRMM_RNUU, TRMM_RNUN, TRMM_RNLU, TRMM_RNLN, + TRMM_RTUU, TRMM_RTUN, TRMM_RTLU, TRMM_RTLN, + TRMM_RRUU, TRMM_RRUN, TRMM_RRLU, TRMM_RRLN, + TRMM_RCUU, TRMM_RCUN, TRMM_RCLU, TRMM_RCLN, +#endif +}; + +#ifndef CBLAS + +void NAME(char *SIDE, char *UPLO, char *TRANS, char *DIAG, + blasint *M, blasint *N, FLOAT *alpha, + FLOAT *a, blasint *ldA, FLOAT *b, blasint *ldB){ + + char side_arg = *SIDE; + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + char diag_arg = *DIAG; + + blas_arg_t args; + + FLOAT *buffer; + FLOAT *sa, *sb; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + blasint info; + int side; + int uplo; + int unit; + int trans; + int nrowa; + + PRINT_DEBUG_NAME; + + args.m = *M; + args.n = *N; + + args.a = (void *)a; + args.b = (void *)b; + + args.lda = *ldA; + args.ldb = *ldB; + + args.beta = (void *)alpha; + + TOUPPER(side_arg); + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + TOUPPER(diag_arg); + + side = -1; + trans = -1; + unit = -1; + uplo = -1; + + if (side_arg == 'L') side = 0; + if (side_arg == 'R') side = 1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 2; + if (trans_arg == 'C') trans = 3; + + if (diag_arg == 'U') unit = 0; + if (diag_arg == 'N') unit = 1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + nrowa = args.m; + if (side & 1) nrowa = args.n; + + info = 0; + + if (args.ldb < MAX(1,args.m)) info = 11; + if (args.lda < MAX(1,nrowa)) info = 9; + if (args.n < 0) info = 6; + if (args.m < 0) info = 5; + if (unit < 0) info = 4; + if (trans < 0) info = 3; + if (uplo < 0) info = 2; + if (side < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE Trans, enum CBLAS_DIAG Diag, + blasint m, blasint n, +#ifndef COMPLEX + FLOAT alpha, +#else + FLOAT *alpha, +#endif + FLOAT *a, blasint lda, + FLOAT *b, blasint ldb) { + + blas_arg_t args; + int side, uplo, trans, unit; + blasint info, nrowa; + + XFLOAT *buffer; + XFLOAT *sa, *sb; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + PRINT_DEBUG_CNAME; + + args.a = (void *)a; + args.b = (void *)b; + + args.lda = lda; + args.ldb = ldb; + +#ifndef COMPLEX + args.beta = (void *)α +#else + args.beta = (void *)alpha; +#endif + + side = -1; + uplo = -1; + trans = -1; + unit = -1; + info = 0; + + if (order == CblasColMajor) { + args.m = m; + args.n = n; + + if (Side == CblasLeft) side = 0; + if (Side == CblasRight) side = 1; + + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (Trans == CblasNoTrans) trans = 0; + if (Trans == CblasTrans) trans = 1; +#ifndef COMPLEX + if (Trans == CblasConjNoTrans) trans = 0; + if (Trans == CblasConjTrans) trans = 1; +#else + if (Trans == CblasConjNoTrans) trans = 2; + if (Trans == CblasConjTrans) trans = 3; +#endif + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + nrowa = args.m; + if (side & 1) nrowa = args.n; + + if (args.ldb < MAX(1,args.m)) info = 11; + if (args.lda < MAX(1,nrowa)) info = 9; + if (args.n < 0) info = 6; + if (args.m < 0) info = 5; + if (unit < 0) info = 4; + if (trans < 0) info = 3; + if (uplo < 0) info = 2; + if (side < 0) info = 1; + } + + if (order == CblasRowMajor) { + args.m = n; + args.n = m; + + if (Side == CblasLeft) side = 1; + if (Side == CblasRight) side = 0; + + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (Trans == CblasNoTrans) trans = 0; + if (Trans == CblasTrans) trans = 1; +#ifndef COMPLEX + if (Trans == CblasConjNoTrans) trans = 0; + if (Trans == CblasConjTrans) trans = 1; +#else + if (Trans == CblasConjNoTrans) trans = 2; + if (Trans == CblasConjTrans) trans = 3; +#endif + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + nrowa = args.m; + if (side & 1) nrowa = args.n; + + if (args.ldb < MAX(1,args.m)) info = 11; + if (args.lda < MAX(1,nrowa)) info = 9; + if (args.n < 0) info = 6; + if (args.m < 0) info = 5; + if (unit < 0) info = 4; + if (trans < 0) info = 3; + if (uplo < 0) info = 2; + if (side < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if ((args.m == 0) || (args.n == 0)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + buffer = (FLOAT *)blas_memory_alloc(0); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + +#ifdef SMP + mode |= (trans << BLAS_TRANSA_SHIFT); + mode |= (side << BLAS_RSIDE_SHIFT); + + args.nthreads = num_cpu_avail(3); + + if (args.nthreads == 1) { +#endif + + (trsm[(side<<4) | (trans<<2) | (uplo<<1) | unit])(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + } else { + if (!side) { + gemm_thread_n(mode, &args, NULL, NULL, trsm[(side<<4) | (trans<<2) | (uplo<<1) | unit], sa, sb, args.nthreads); + } else { + gemm_thread_m(mode, &args, NULL, NULL, trsm[(side<<4) | (trans<<2) | (uplo<<1) | unit], sa, sb, args.nthreads); + } + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, + (!side) ? args.m * (args.m + args.n) : args.n * (args.m + args.n), + (!side) ? args.m * args.m * args.n : args.m * args.n * args.n); + + IDEBUG_END; + + return; +} + diff --git a/interface/trsv.c b/interface/trsv.c new file mode 100644 index 0000000..8ef6998 --- /dev/null +++ b/interface/trsv.c @@ -0,0 +1,208 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QTRSV " +#elif defined(DOUBLE) +#define ERROR_NAME "DTRSV " +#else +#define ERROR_NAME "STRSV " +#endif + +static int (*trsv[])(BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + qtrsv_NUU, qtrsv_NUN, qtrsv_NLU, qtrsv_NLN, + qtrsv_TUU, qtrsv_TUN, qtrsv_TLU, qtrsv_TLN, +#elif defined(DOUBLE) + dtrsv_NUU, dtrsv_NUN, dtrsv_NLU, dtrsv_NLN, + dtrsv_TUU, dtrsv_TUN, dtrsv_TLU, dtrsv_TLN, +#else + strsv_NUU, strsv_NUN, strsv_NLU, strsv_NLN, + strsv_TUU, strsv_TUN, strsv_TLU, strsv_TLN, +#endif +}; + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANS, char *DIAG, + blasint *N, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + char diag_arg = *DIAG; + + blasint n = *N; + blasint lda = *LDA; + blasint incx = *INCX; + + blasint info; + int uplo; + int unit; + int trans; + FLOAT *buffer; + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + TOUPPER(diag_arg); + + trans = -1; + unit = -1; + uplo = -1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 0; + if (trans_arg == 'C') trans = 1; + + if (diag_arg == 'U') unit = 0; + if (diag_arg == 'N') unit = 1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incx == 0) info = 8; + if (lda < MAX(1, n)) info = 6; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) { + + int trans, uplo, unit; + blasint info; + FLOAT *buffer; + + PRINT_DEBUG_CNAME; + + unit = -1; + uplo = -1; + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 0; + if (TransA == CblasConjTrans) trans = 1; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 8; + if (lda < MAX(1, n)) info = 6; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 1; + if (TransA == CblasConjTrans) trans = 0; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 8; + if (lda < MAX(1, n)) info = 6; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + + buffer = (FLOAT *)blas_memory_alloc(1); + + (trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer); + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/trti2.c b/interface/trti2.c new file mode 100644 index 0000000..e119b45 --- /dev/null +++ b/interface/trti2.c @@ -0,0 +1,134 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QTRTI2" +#elif defined(DOUBLE) +#define ERROR_NAME "DTRTI2" +#else +#define ERROR_NAME "STRTI2" +#endif + +static blasint (*trti2[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { +#ifdef XDOUBLE + qtrti2_UU, qtrti2_UN, qtrti2_LU, qtrti2_LN, +#elif defined(DOUBLE) + dtrti2_UU, dtrti2_UN, dtrti2_LU, dtrti2_LN, +#else + strti2_UU, strti2_UN, strti2_LU, strti2_LN, +#endif + }; + +int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ + + blas_arg_t args; + + blasint uplo_arg = *UPLO; + blasint diag_arg = *DIAG; + blasint uplo, diag; + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + + TOUPPER(uplo_arg); + TOUPPER(diag_arg); + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + diag = -1; + if (diag_arg == 'U') diag = 0; + if (diag_arg == 'N') diag = 1; + + info = 0; + if (args.lda < MAX(1,args.n)) info = 5; + if (args.n < 0) info = 3; + if (diag < 0) info = 2; + if (uplo < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + + if (args.n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + + info = (trti2[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0); + + *Info = info; + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, .5 * args.n * args.n, + args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + + args.n * (1./3. + args.n * (-1./2. + args.n * 1./6.))); + + IDEBUG_END; + + return 0; +} diff --git a/interface/trtri.c b/interface/trtri.c new file mode 100644 index 0000000..9e31905 --- /dev/null +++ b/interface/trtri.c @@ -0,0 +1,153 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QTRTRI" +#elif defined(DOUBLE) +#define ERROR_NAME "DTRTRI" +#else +#define ERROR_NAME "STRTRI" +#endif + +static blasint (*trtri_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + TRTRI_UU_SINGLE, TRTRI_UN_SINGLE, TRTRI_LU_SINGLE, TRTRI_LN_SINGLE, +}; + +#ifdef SMP +static blasint (*trtri_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + TRTRI_UU_PARALLEL, TRTRI_UN_PARALLEL, TRTRI_LU_PARALLEL, TRTRI_LN_PARALLEL, +}; +#endif + +int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ + + blas_arg_t args; + + blasint uplo_arg = *UPLO; + blasint diag_arg = *DIAG; + blasint uplo, diag; + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + + TOUPPER(uplo_arg); + TOUPPER(diag_arg); + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + diag = -1; + if (diag_arg == 'U') diag = 0; + if (diag_arg == 'N') diag = 1; + + info = 0; + if (args.lda < MAX(1,args.n)) info = 5; + if (args.n < 0) info = 3; + if (diag < 0) info = 2; + if (uplo < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + + if (args.n == 0) return 0; + + if (diag) { + if (AMIN_K(args.n, args.a, args.lda + 1) == ZERO) { + *Info = IAMIN_K(args.n, args.a, args.lda + 1); + return 0; + } + } + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + +#ifdef SMP + args.nthreads = num_cpu_avail(4); + + if (args.nthreads == 1) { +#endif + + *Info = (trtri_single[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + } else { + + *Info = (trtri_parallel[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0); + + } +#endif + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, .5 * args.n * args.n, + args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + + args.n * (1./3. + args.n * (-1./2. + args.n * 1./6.))); + + IDEBUG_END; + + return 0; +} diff --git a/interface/zaxpy.c b/interface/zaxpy.c new file mode 100644 index 0000000..d3355ea --- /dev/null +++ b/interface/zaxpy.c @@ -0,0 +1,122 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ + + blasint n = *N; + blasint incx = *INCX; + blasint incy = *INCY; + +#else + +void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ + +#endif + + FLOAT alpha_r = *(ALPHA + 0); + FLOAT alpha_i = *(ALPHA + 1); + +#ifdef SMP + int mode, nthreads; +#endif + +#ifndef CBLAS + PRINT_DEBUG_CNAME; +#else + PRINT_DEBUG_CNAME; +#endif + + if (n <= 0) return; + + if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx * 2; + if (incy < 0) y -= (n - 1) * incy * 2; + +#ifdef SMP + nthreads = num_cpu_avail(1); + + if (nthreads == 1) { +#endif + +#ifndef CONJ + AXPYU_K (n, 0, 0, alpha_r, alpha_i, x, incx, y, incy, NULL, 0); +#else + AXPYC_K(n, 0, 0, alpha_r, alpha_i, x, incx, y, incy, NULL, 0); +#endif + +#ifdef SMP + } else { + +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif + + blas_level1_thread(mode, n, 0, 0, ALPHA, x, incx, y, incy, NULL, 0, +#ifndef CONJ + (void *)AXPYU_K, +#else + (void *)AXPYC_K, +#endif + nthreads); + } +#endif + + FUNCTION_PROFILE_END(4, 2 * n, 2 * n); + + IDEBUG_END; + + return; + +} diff --git a/interface/zdot.c b/interface/zdot.c new file mode 100644 index 0000000..1380ce2 --- /dev/null +++ b/interface/zdot.c @@ -0,0 +1,202 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef RETURN_BY_STRUCT +#ifdef XDOUBLE +#define MYTYPE myxcomplex_t +#elif defined DOUBLE +#define MYTYPE myzcomplex_t +#else +#define MYTYPE myccomplex_t +#endif +#endif + +#ifndef CBLAS + +#ifdef RETURN_BY_STRUCT +MYTYPE NAME( blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY) { +#elif defined RETURN_BY_STACK +void NAME(FLOAT _Complex *result, blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY) { +#else +FLOAT _Complex NAME( blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY) { +#endif + + BLASLONG n = *N; + BLASLONG incx = *INCX; + BLASLONG incy = *INCY; +#ifndef RETURN_BY_STACK + FLOAT _Complex ret; +#endif +#ifdef RETURN_BY_STRUCT + MYTYPE myret; +#endif + + PRINT_DEBUG_NAME; + + if (n <= 0) { +#ifdef RETURN_BY_STRUCT + myret.r = 0.; + myret.i = 0.; + return myret; +#elif defined RETURN_BY_STACK + *result = ZERO; + return; +#else + return ZERO; +#endif + } + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx * 2; + if (incy < 0) y -= (n - 1) * incy * 2; + +#ifdef RETURN_BY_STRUCT + +#ifndef CONJ + ret = DOTU_K(n, x, incx, y, incy); +#else + ret = DOTC_K(n, x, incx, y, incy); +#endif + + myret.r = CREAL ret; + myret.i = CIMAG ret; + + FUNCTION_PROFILE_END(4, 2 * n, 2 * n); + + IDEBUG_END; + + return myret; + +#elif defined RETURN_BY_STACK + +#ifndef CONJ + *result = DOTU_K(n, x, incx, y, incy); +#else + *result = DOTC_K(n, x, incx, y, incy); +#endif + + FUNCTION_PROFILE_END(4, 2 * n, 2 * n); + + IDEBUG_END; + +#else + +#ifndef CONJ + ret = DOTU_K(n, x, incx, y, incy); +#else + ret = DOTC_K(n, x, incx, y, incy); +#endif + + FUNCTION_PROFILE_END(4, 2 * n, 2 * n); + + IDEBUG_END; + + return ret; + +#endif + +} + +#else + +#ifdef FORCE_USE_STACK +void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT _Complex *result){ +#else +FLOAT _Complex CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ + + FLOAT _Complex ret; +#endif + + PRINT_DEBUG_CNAME; + + if (n <= 0) { +#ifdef FORCE_USE_STACK + *result = ZERO; + return; +#else + return ZERO; +#endif + } + + if (incx < 0) x -= (n - 1) * incx * 2; + if (incy < 0) y -= (n - 1) * incy * 2; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifdef FORCE_USE_STACK + +#ifndef CONJ + *result = DOTU_K(n, x, incx, y, incy); +#else + *result = DOTC_K(n, x, incx, y, incy); +#endif + + FUNCTION_PROFILE_END(4, 2 * n, 2 * n); + + IDEBUG_END; + +#else + +#ifndef CONJ + ret = DOTU_K(n, x, incx, y, incy); +#else + ret = DOTC_K(n, x, incx, y, incy); +#endif + + FUNCTION_PROFILE_END(4, 2 * n, 2 * n); + + IDEBUG_END; + + return ret; + +#endif + +} + +#endif diff --git a/interface/zgbmv.c b/interface/zgbmv.c new file mode 100644 index 0000000..ae1fd24 --- /dev/null +++ b/interface/zgbmv.c @@ -0,0 +1,271 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XGBMV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZGBMV " +#else +#define ERROR_NAME "CGBMV " +#endif + +static void (*gbmv[])(BLASLONG, BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, + FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + xgbmv_n, xgbmv_t, xgbmv_r, xgbmv_c, + xgbmv_o, xgbmv_u, xgbmv_s, xgbmv_d, +#elif defined(DOUBLE) + zgbmv_n, zgbmv_t, zgbmv_r, zgbmv_c, + zgbmv_o, zgbmv_u, zgbmv_s, zgbmv_d, +#else + cgbmv_n, cgbmv_t, cgbmv_r, cgbmv_c, + cgbmv_o, cgbmv_u, cgbmv_s, cgbmv_d, +#endif +}; + +#ifdef SMP +static int (*gbmv_thread[])(BLASLONG, BLASLONG, BLASLONG, BLASLONG, FLOAT *, + FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xgbmv_thread_n, xgbmv_thread_t, xgbmv_thread_r, xgbmv_thread_c, + xgbmv_thread_o, xgbmv_thread_u, xgbmv_thread_s, xgbmv_thread_d, +#elif defined(DOUBLE) + zgbmv_thread_n, zgbmv_thread_t, zgbmv_thread_r, zgbmv_thread_c, + zgbmv_thread_o, zgbmv_thread_u, zgbmv_thread_s, zgbmv_thread_d, +#else + cgbmv_thread_n, cgbmv_thread_t, cgbmv_thread_r, cgbmv_thread_c, + cgbmv_thread_o, cgbmv_thread_u, cgbmv_thread_s, cgbmv_thread_d, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *TRANS, blasint *M, blasint *N, + blasint *KU, blasint *KL, + FLOAT *ALPHA, FLOAT *a, blasint *LDA, + FLOAT *x, blasint *INCX, + FLOAT *BETA, FLOAT *y, blasint *INCY){ + + char trans = *TRANS; + blasint m = *M; + blasint n = *N; + blasint ku = *KU; + blasint kl = *KL; + blasint lda = *LDA; + blasint incx = *INCX; + blasint incy = *INCY; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + FLOAT beta_r = BETA[0]; + FLOAT beta_i = BETA[1]; + + blasint info; + blasint lenx, leny; + blasint i; + + PRINT_DEBUG_NAME; + + TOUPPER(trans); + + info = 0; + + i = -1; + + if (trans == 'N') i = 0; + if (trans == 'T') i = 1; + if (trans == 'R') i = 2; + if (trans == 'C') i = 3; + if (trans == 'O') i = 4; + if (trans == 'U') i = 5; + if (trans == 'S') i = 6; + if (trans == 'D') i = 7; + + if (incy == 0) info = 13; + if (incx == 0) info = 10; + if (lda < kl + ku + 1) info = 8; + if (kl < 0) info = 5; + if (ku < 0) info = 4; + if (n < 0) info = 3; + if (m < 0) info = 2; + if (i < 0) info = 1; + + trans = i; + + if (info != 0){ + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + enum CBLAS_TRANSPOSE TransA, + blasint m, blasint n, + blasint ku, blasint kl, + FLOAT *ALPHA, + FLOAT *a, blasint lda, + FLOAT *x, blasint incx, + FLOAT *BETA, + FLOAT *y, blasint incy){ + + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + FLOAT beta_r = BETA[0]; + FLOAT beta_i = BETA[1]; + + FLOAT *buffer; + blasint lenx, leny; + int trans; + blasint info, t; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 2; + if (TransA == CblasConjTrans) trans = 3; + + info = -1; + + if (incy == 0) info = 13; + if (incx == 0) info = 10; + if (lda < kl + ku + 1) info = 8; + if (kl < 0) info = 5; + if (ku < 0) info = 4; + if (n < 0) info = 3; + if (m < 0) info = 2; + if (trans < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 3; + if (TransA == CblasConjTrans) trans = 2; + + info = -1; + + t = n; + n = m; + m = t; + + t = ku; + ku = kl; + kl = t; + + if (incy == 0) info = 13; + if (incx == 0) info = 10; + if (lda < kl + ku + 1) info = 8; + if (kl < 0) info = 5; + if (ku < 0) info = 4; + if (n < 0) info = 3; + if (m < 0) info = 2; + if (trans < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if ((m==0) || (n==0)) return; + + lenx = n; + leny = m; + if (trans & 1) lenx = m; + if (trans & 1) leny = n; + + if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); + + if (alpha_r == ZERO && alpha_i == ZERO) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (lenx - 1) * incx * 2; + if (incy < 0) y -= (leny - 1) * incy * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (gbmv[(int)trans])(m, n, kl, ku, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer); + +#ifdef SMP + + } else { + + (gbmv_thread[(int)trans])(m, n, kl, ku, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, m * n / 2 + n, m * n); + + IDEBUG_END; + + return; +} diff --git a/interface/zgemv.c b/interface/zgemv.c new file mode 100644 index 0000000..fb47842 --- /dev/null +++ b/interface/zgemv.c @@ -0,0 +1,259 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XGEMV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZGEMV " +#else +#define ERROR_NAME "CGEMV " +#endif + +#ifdef SMP +static int (*gemv_thread[])(BLASLONG, BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xgemv_thread_n, xgemv_thread_t, xgemv_thread_r, xgemv_thread_c, xgemv_thread_o, xgemv_thread_u, xgemv_thread_s, xgemv_thread_d, +#elif defined DOUBLE + zgemv_thread_n, zgemv_thread_t, zgemv_thread_r, zgemv_thread_c, zgemv_thread_o, zgemv_thread_u, zgemv_thread_s, zgemv_thread_d, +#else + cgemv_thread_n, cgemv_thread_t, cgemv_thread_r, cgemv_thread_c, cgemv_thread_o, cgemv_thread_u, cgemv_thread_s, cgemv_thread_d, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *TRANS, blasint *M, blasint *N, + FLOAT *ALPHA, FLOAT *a, blasint *LDA, + FLOAT *x, blasint *INCX, + FLOAT *BETA, FLOAT *y, blasint *INCY){ + + char trans = *TRANS; + blasint m = *M; + blasint n = *N; + blasint lda = *LDA; + blasint incx = *INCX; + blasint incy = *INCY; + + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, + FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { + GEMV_N, GEMV_T, GEMV_R, GEMV_C, + GEMV_O, GEMV_U, GEMV_S, GEMV_D, + }; + + blasint info; + blasint lenx, leny; + blasint i; + + PRINT_DEBUG_NAME; + + FLOAT alpha_r = *(ALPHA + 0); + FLOAT alpha_i = *(ALPHA + 1); + + FLOAT beta_r = *(BETA + 0); + FLOAT beta_i = *(BETA + 1); + + TOUPPER(trans); + + info = 0; + + i = -1; + + if (trans == 'N') i = 0; + if (trans == 'T') i = 1; + if (trans == 'R') i = 2; + if (trans == 'C') i = 3; + if (trans == 'O') i = 4; + if (trans == 'U') i = 5; + if (trans == 'S') i = 6; + if (trans == 'D') i = 7; + + if (incy == 0) info = 11; + if (incx == 0) info = 8; + if (lda < MAX(1,m)) info = 6; + if (n < 0) info = 3; + if (m < 0) info = 2; + if (i < 0) info = 1; + + trans = i; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + enum CBLAS_TRANSPOSE TransA, + blasint m, blasint n, + FLOAT *ALPHA, + FLOAT *a, blasint lda, + FLOAT *x, blasint incx, + FLOAT *BETA, + FLOAT *y, blasint incy){ + + FLOAT *buffer; + blasint lenx, leny; + int trans; + blasint info, t; +#ifdef SMP + int nthreads; +#endif + + int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, + FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { + GEMV_N, GEMV_T, GEMV_R, GEMV_C, + GEMV_O, GEMV_U, GEMV_S, GEMV_D, + }; + + PRINT_DEBUG_CNAME; + + FLOAT alpha_r = *(ALPHA + 0); + FLOAT alpha_i = *(ALPHA + 1); + + FLOAT beta_r = *(BETA + 0); + FLOAT beta_i = *(BETA + 1); + + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 2; + if (TransA == CblasConjTrans) trans = 3; + + info = -1; + + if (incy == 0) info = 11; + if (incx == 0) info = 8; + if (lda < MAX(1, m)) info = 6; + if (n < 0) info = 3; + if (m < 0) info = 2; + if (trans < 0) info = 1; + + } + + if (order == CblasRowMajor) { + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 3; + if (TransA == CblasConjTrans) trans = 2; + + info = -1; + + t = n; + n = m; + m = t; + + if (incy == 0) info = 11; + if (incx == 0) info = 8; + if (lda < MAX(1, m)) info = 6; + if (n < 0) info = 3; + if (m < 0) info = 2; + if (trans < 0) info = 1; + + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + /* Quick return if possible. */ + + if (m == 0 || n == 0) return; + + lenx = n; + leny = m; + + if (trans & 1) lenx = m; + if (trans & 1) leny = n; + + if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); + + if (alpha_r == ZERO && alpha_i == ZERO) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (lenx - 1) * incx * 2; + if (incy < 0) y -= (leny - 1) * incy * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (gemv[(int)trans])(m, n, 0, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer); + +#ifdef SMP + + } else { + + (gemv_thread[(int)trans])(m, n, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n); + + IDEBUG_END; + + return; +} diff --git a/interface/zger.c b/interface/zger.c new file mode 100644 index 0000000..ad52f40 --- /dev/null +++ b/interface/zger.c @@ -0,0 +1,249 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#ifndef CONJ +#define ERROR_NAME "XGERU " +#else +#define ERROR_NAME "XGERC " +#endif +#elif defined DOUBLE +#ifndef CONJ +#define ERROR_NAME "ZGERU " +#else +#define ERROR_NAME "ZGERC " +#endif +#else +#ifndef CONJ +#define ERROR_NAME "CGERU " +#else +#define ERROR_NAME "CGERC " +#endif +#endif + +#if defined XDOUBLE +#ifndef CONJ +#define GER GERU_K +#define GER_THREAD xger_thread_U +#else +#define GER GERC_K +#define GER_THREAD xger_thread_C +#define GERV GERV_K +#define GERV_THREAD xger_thread_V +#endif +#elif defined DOUBLE +#ifndef CONJ +#define GER GERU_K +#define GER_THREAD zger_thread_U +#else +#define GER GERC_K +#define GER_THREAD zger_thread_C +#define GERV GERV_K +#define GERV_THREAD zger_thread_V +#endif +#else +#ifndef CONJ +#define GER GERU_K +#define GER_THREAD cger_thread_U +#else +#define GER GERC_K +#define GER_THREAD cger_thread_C +#define GERV GERV_K +#define GERV_THREAD cger_thread_V +#endif +#endif + +#ifndef CBLAS + +void NAME(blasint *M, blasint *N, FLOAT *Alpha, + FLOAT *x, blasint *INCX, + FLOAT *y, blasint *INCY, + FLOAT *a, blasint *LDA){ + + blasint m = *M; + blasint n = *N; + FLOAT alpha_r = Alpha[0]; + FLOAT alpha_i = Alpha[1]; + blasint incx = *INCX; + blasint incy = *INCY; + blasint lda = *LDA; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + blasint info; + + PRINT_DEBUG_NAME; + + info = 0; + + if (lda < MAX(1,m)) info = 9; + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (m < 0) info = 1; + + if (info){ + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + blasint m, blasint n, + FLOAT *Alpha, + FLOAT *x, blasint incx, + FLOAT *y, blasint incy, + FLOAT *a, blasint lda) { + + FLOAT alpha_r = Alpha[0]; + FLOAT alpha_i = Alpha[1]; + + FLOAT *buffer; + blasint info, t; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + info = 0; + + if (order == CblasColMajor) { + info = -1; + + if (lda < MAX(1,m)) info = 9; + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (m < 0) info = 1; + } + + if (order == CblasRowMajor) { + info = -1; + + t = n; + n = m; + m = t; + + t = incx; + incx = incy; + incy = t; + + buffer = x; + x = y; + y = buffer; + + if (lda < MAX(1,m)) info = 9; + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (m < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + /* Quick return if possible. */ + if (m == 0 || n == 0) return; + + if ((alpha_r == 0.) && (alpha_i == 0.)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incy < 0) y -= (n - 1) * incy * 2; + if (incx < 0) x -= (m - 1) * incx * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + +#if !defined(CBLAS) || !defined(CONJ) + GER(m, n, 0, alpha_r, alpha_i, x, incx, y, incy, a, lda, buffer); +#else + if (order == CblasColMajor) { + GER(m, n, 0, alpha_r, alpha_i, x, incx, y, incy, a, lda, buffer); + } else { + GERV(m, n, 0, alpha_r, alpha_i, x, incx, y, incy, a, lda, buffer); + } +#endif + +#ifdef SMP + + } else { + +#if !defined(CBLAS) || !defined(CONJ) + GER_THREAD(m, n, Alpha, x, incx, y, incy, a, lda, buffer, nthreads); +#else + if (order == CblasColMajor) { + GER_THREAD(m, n, Alpha, x, incx, y, incy, a, lda, buffer, nthreads); + } else { + GERV_THREAD(m, n, Alpha, x, incx, y, incy, a, lda, buffer, nthreads); + } +#endif + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n); + + IDEBUG_END; + + return; + +} diff --git a/interface/zgetf2.c b/interface/zgetf2.c new file mode 100644 index 0000000..950ef46 --- /dev/null +++ b/interface/zgetf2.c @@ -0,0 +1,109 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XGETF2" +#elif defined(DOUBLE) +#define ERROR_NAME "ZGETF2" +#else +#define ERROR_NAME "CGETF2" +#endif + +int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint *Info){ + + blas_arg_t args; + + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.m = *M; + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + args.c = (void *)ipiv; + + info = 0; + if (args.lda < MAX(1,args.m)) info = 4; + if (args.n < 0) info = 2; + if (args.m < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + if (args.m == 0 || args.n == 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + + info = GETF2(&args, NULL, NULL, sa, sb, 0); + + *Info = info; + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2. / 3. * args.m * args.n * args.n); + + IDEBUG_END; + + return 0; +} diff --git a/interface/zgetrf.c b/interface/zgetrf.c new file mode 100644 index 0000000..9f041d9 --- /dev/null +++ b/interface/zgetrf.c @@ -0,0 +1,122 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XGETRF" +#elif defined(DOUBLE) +#define ERROR_NAME "ZGETRF" +#else +#define ERROR_NAME "CGETRF" +#endif + +int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint *Info){ + + blas_arg_t args; + + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.m = *M; + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + args.c = (void *)ipiv; + + info = 0; + if (args.lda < MAX(1,args.m)) info = 4; + if (args.n < 0) info = 2; + if (args.m < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + if (args.m == 0 || args.n == 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + +#ifdef SMP + args.common = NULL; + args.nthreads = num_cpu_avail(4); + + if (args.nthreads == 1) { +#endif + + *Info = GETRF_SINGLE(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + } else { + + *Info = GETRF_PARALLEL(&args, NULL, NULL, sa, sb, 0); + + } +#endif + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2. / 3. * args.m * args.n * args.n); + + IDEBUG_END; + + return 0; +} diff --git a/interface/zgetrs.c b/interface/zgetrs.c new file mode 100644 index 0000000..81d50e3 --- /dev/null +++ b/interface/zgetrs.c @@ -0,0 +1,153 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XGETRS" +#elif defined(DOUBLE) +#define ERROR_NAME "ZGETRS" +#else +#define ERROR_NAME "CGETRS" +#endif + +static blasint (*getrs_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + GETRS_N_SINGLE, GETRS_T_SINGLE, GETRS_R_SINGLE, GETRS_C_SINGLE, +}; + +#ifdef SMP +static blasint (*getrs_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + GETRS_N_PARALLEL, GETRS_T_PARALLEL, GETRS_R_PARALLEL, GETRS_C_PARALLEL, +}; +#endif + +int NAME(char *TRANS, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, + blasint *ipiv, FLOAT *b, blasint *ldB, blasint *Info){ + + char trans_arg = *TRANS; + + blas_arg_t args; + + blasint info; + int trans; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.m = *N; + args.n = *NRHS; + args.a = (void *)a; + args.lda = *ldA; + args.b = (void *)b; + args.ldb = *ldB; + args.c = (void *)ipiv; + + info = 0; + + TOUPPER(trans_arg); + trans = -1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 2; + if (trans_arg == 'C') trans = 3; + + if (args.ldb < MAX(1, args.m)) info = 8; + if (args.lda < MAX(1, args.m)) info = 5; + if (args.n < 0) info = 3; + if (args.m < 0) info = 2; + if (trans < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return 0; + } + + args.alpha = NULL; + args.beta = NULL; + + *Info = info; + + if (args.m == 0 || args.n == 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + +#ifdef SMP + args.nthreads = num_cpu_avail(4); + + if (args.nthreads == 1) { +#endif + + (getrs_single[trans])(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + } else { + + (getrs_parallel[trans])(&args, NULL, NULL, sa, sb, 0); + + } +#endif + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2 * args.m * args.m * args.n); + + IDEBUG_END; + + return 0; + +} diff --git a/interface/zhbmv.c b/interface/zhbmv.c new file mode 100644 index 0000000..c14ad98 --- /dev/null +++ b/interface/zhbmv.c @@ -0,0 +1,223 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XHBMV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZHBMV " +#else +#define ERROR_NAME "CHBMV " +#endif + +static int (*hbmv[])(BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + xhbmv_U, xhbmv_L, xhbmv_V, xhbmv_M, +#elif defined(DOUBLE) + zhbmv_U, zhbmv_L, zhbmv_V, zhbmv_M, +#else + chbmv_U, chbmv_L, chbmv_V, chbmv_M, +#endif +}; + +#ifdef SMP +static int (*hbmv_thread[])(BLASLONG, BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xhbmv_thread_U, xhbmv_thread_L, xhbmv_thread_V, xhbmv_thread_M, +#elif defined(DOUBLE) + zhbmv_thread_U, zhbmv_thread_L, zhbmv_thread_V, zhbmv_thread_M, +#else + chbmv_thread_U, chbmv_thread_L, chbmv_thread_V, chbmv_thread_M, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *LDA, + FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ + + char uplo_arg = *UPLO; + blasint n = *N; + blasint k = *K; + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + blasint lda = *LDA; + blasint incx = *INCX; + FLOAT beta_r = BETA[0]; + FLOAT beta_i = BETA[1]; + blasint incy = *INCY; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + if (uplo_arg == 'V') uplo = 2; + if (uplo_arg == 'M') uplo = 3; + + info = 0; + + if (incy == 0) info = 11; + if (incx == 0) info = 8; + if (lda < k + 1) info = 6; + if (k < 0) info = 3; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + enum CBLAS_UPLO Uplo, + blasint n, blasint k, + FLOAT *ALPHA, + FLOAT *a, blasint lda, + FLOAT *x, blasint incx, + FLOAT *BETA, + FLOAT *y, blasint incy){ + + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + FLOAT beta_r = BETA[0]; + FLOAT beta_i = BETA[1]; + FLOAT *buffer; + int uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (incy == 0) info = 11; + if (incx == 0) info = 8; + if (lda < k + 1) info = 6; + if (k < 0) info = 3; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 3; + if (Uplo == CblasLower) uplo = 2; + + info = -1; + + if (incy == 0) info = 11; + if (incx == 0) info = 8; + if (lda < k + 1) info = 6; + if (k < 0) info = 3; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); + + if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx * COMPSIZE; + if (incy < 0 ) y -= (n - 1) * incy * COMPSIZE; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (hbmv[uplo])(n, k, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer); + +#ifdef SMP + } else { + + (hbmv_thread[uplo])(n, k, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * k / 2 + n, n * k); + + IDEBUG_END; + + return; +} diff --git a/interface/zhemv.c b/interface/zhemv.c new file mode 100644 index 0000000..3cba445 --- /dev/null +++ b/interface/zhemv.c @@ -0,0 +1,215 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XHEMV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZHEMV " +#else +#define ERROR_NAME "CHEMV " +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, + FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + blasint lda = *LDA; + blasint incx = *INCX; + FLOAT beta_r = BETA[0]; + FLOAT beta_i = BETA[1]; + blasint incy = *INCY; +#ifdef SMP + int nthreads; +#endif + + int (*hemv[])(BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { + HEMV_U, HEMV_L, HEMV_V, HEMV_M, + }; + +#ifdef SMP + int (*hemv_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { + HEMV_THREAD_U, HEMV_THREAD_L, HEMV_THREAD_V, HEMV_THREAD_M, + }; +#endif + + blasint info; + int uplo; + FLOAT *buffer; + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + if (uplo_arg == 'V') uplo = 2; + if (uplo_arg == 'M') uplo = 3; + + info = 0; + + if (incy == 0) info = 10; + if (incx == 0) info = 7; + if (lda < MAX(1, n)) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA, + FLOAT *a, blasint lda, FLOAT *x, blasint incx, FLOAT *BETA, FLOAT *y, blasint incy) { + + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + FLOAT beta_r = BETA[0]; + FLOAT beta_i = BETA[1]; + + FLOAT *buffer; + int trans, uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + int (*hemv[])(BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { + HEMV_U, HEMV_L, HEMV_V, HEMV_M, + }; + +#ifdef SMP + int (*hemv_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { + HEMV_THREAD_U, HEMV_THREAD_L, HEMV_THREAD_V, HEMV_THREAD_M, + }; +#endif + + PRINT_DEBUG_CNAME; + + trans = -1; + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (incy == 0) info = 10; + if (incx == 0) info = 7; + if (lda < MAX(1, n)) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + + if (Uplo == CblasUpper) uplo = 3; + if (Uplo == CblasLower) uplo = 2; + + info = -1; + + if (incy == 0) info = 10; + if (incx == 0) info = 7; + if (lda < MAX(1, n)) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); + + if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx * 2; + if (incy < 0 ) y -= (n - 1) * incy * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (hemv[uplo])(n, n, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer); + +#ifdef SMP + } else { + + (hemv_thread[uplo])(n, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + n, 2 * n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/zher.c b/interface/zher.c new file mode 100644 index 0000000..ad982dd --- /dev/null +++ b/interface/zher.c @@ -0,0 +1,200 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XHER " +#elif defined(DOUBLE) +#define ERROR_NAME "ZHER " +#else +#define ERROR_NAME "CHER " +#endif + +static int (*her[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { +#ifdef XDOUBLE + xher_U, xher_L, xher_V, xher_M, +#elif defined(DOUBLE) + zher_U, zher_L, zher_V, zher_M, +#else + cher_U, cher_L, cher_V, cher_M, +#endif +}; + +#ifdef SMP +static int (*her_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xher_thread_U, xher_thread_L, xher_thread_V, xher_thread_M, +#elif defined(DOUBLE) + zher_thread_U, zher_thread_L, zher_thread_V, zher_thread_M, +#else + cher_thread_U, cher_thread_L, cher_thread_V, cher_thread_M, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, + FLOAT *x, blasint *INCX, FLOAT *a, blasint *LDA){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha = *ALPHA; + blasint lda = *LDA; + blasint incx = *INCX; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (lda < MAX(1, n)) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) { + + FLOAT *buffer; + int trans, uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + trans = -1; + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (lda < MAX(1, n)) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + } + + if (order == CblasRowMajor) { + + if (Uplo == CblasUpper) uplo = 3; + if (Uplo == CblasLower) uplo = 2; + + info = -1; + + if (lda < MAX(1, n)) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if (alpha == ZERO) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (her[uplo])(n, alpha, x, incx, a, lda, buffer); + +#ifdef SMP + } else { + + (her_thread[uplo])(n, alpha, x, incx, a, lda, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/zher2.c b/interface/zher2.c new file mode 100644 index 0000000..88fecec --- /dev/null +++ b/interface/zher2.c @@ -0,0 +1,207 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XHER2 " +#elif defined(DOUBLE) +#define ERROR_NAME "ZHER2 " +#else +#define ERROR_NAME "CHER2 " +#endif + +static int (*her2[])(BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { +#ifdef XDOUBLE + xher2_U, xher2_L, xher2_V, xher2_M, +#elif defined(DOUBLE) + zher2_U, zher2_L, zher2_V, zher2_M, +#else + cher2_U, cher2_L, cher2_V, cher2_M, +#endif +}; + +#ifdef SMP +static int (*her2_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xher2_thread_U, xher2_thread_L, xher2_thread_V, xher2_thread_M, +#elif defined(DOUBLE) + zher2_thread_U, zher2_thread_L, zher2_thread_V, zher2_thread_M, +#else + cher2_thread_U, cher2_thread_L, cher2_thread_V, cher2_thread_M, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, + FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a, blasint *LDA){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + blasint lda = *LDA; + blasint incx = *INCX; + blasint incy = *INCY; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (lda < MAX(1, n)) info = 9; + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT *a, blasint lda) { + + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + FLOAT *buffer; + int trans, uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + trans = -1; + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (lda < MAX(1, n)) info = 9; + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + + if (Uplo == CblasUpper) uplo = 3; + if (Uplo == CblasLower) uplo = 2; + + info = -1; + + if (lda < MAX(1, n)) info = 9; + if (incx == 0) info = 7; + if (incy == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx * 2; + if (incy < 0 ) y -= (n - 1) * incy * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (her2[uplo])(n, alpha_r, alpha_i, x, incx, y, incy, a, lda, buffer); + +#ifdef SMP + } else { + + (her2_thread[uplo])(n, ALPHA, x, incx, y, incy, a, lda, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + 2 * n, 2 * n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/zhpmv.c b/interface/zhpmv.c new file mode 100644 index 0000000..d7013e6 --- /dev/null +++ b/interface/zhpmv.c @@ -0,0 +1,213 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XHPMV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZHPMV " +#else +#define ERROR_NAME "CHPMV " +#endif + +static int (*hpmv[])(BLASLONG, FLOAT, FLOAT, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + xhpmv_U, xhpmv_L, xhpmv_V, xhpmv_M, +#elif defined(DOUBLE) + zhpmv_U, zhpmv_L, zhpmv_V, zhpmv_M, +#else + chpmv_U, chpmv_L, chpmv_V, chpmv_M, +#endif +}; + +#ifdef SMP +static int (*hpmv_thread[])(BLASLONG, FLOAT *, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xhpmv_thread_U, xhpmv_thread_L, xhpmv_thread_V, xhpmv_thread_M, +#elif defined(DOUBLE) + zhpmv_thread_U, zhpmv_thread_L, zhpmv_thread_V, zhpmv_thread_M, +#else + chpmv_thread_U, chpmv_thread_L, chpmv_thread_V, chpmv_thread_M, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, + FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + blasint incx = *INCX; + FLOAT beta_r = BETA[0]; + FLOAT beta_i = BETA[1]; + blasint incy = *INCY; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incy == 0) info = 9; + if (incx == 0) info = 6; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + enum CBLAS_UPLO Uplo, + blasint n, + FLOAT *ALPHA, + FLOAT *a, + FLOAT *x, blasint incx, + FLOAT *BETA, + FLOAT *y, blasint incy){ + + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + FLOAT beta_r = BETA[0]; + FLOAT beta_i = BETA[1]; + FLOAT *buffer; + int uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (incy == 0) info = 9; + if (incx == 0) info = 6; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 3; + if (Uplo == CblasLower) uplo = 2; + + info = -1; + + if (incy == 0) info = 9; + if (incx == 0) info = 6; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); + + if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx * 2; + if (incy < 0 ) y -= (n - 1) * incy * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (hpmv[uplo])(n, alpha_r, alpha_i, a, x, incx, y, incy, buffer); + +#ifdef SMP + } else { + + (hpmv_thread[uplo])(n, ALPHA, a, x, incx, y, incy, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/zhpr.c b/interface/zhpr.c new file mode 100644 index 0000000..c48e352 --- /dev/null +++ b/interface/zhpr.c @@ -0,0 +1,198 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XHPR " +#elif defined(DOUBLE) +#define ERROR_NAME "ZHPR " +#else +#define ERROR_NAME "CHPR " +#endif + +static int (*hpr[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, FLOAT *) = { +#ifdef XDOUBLE + xhpr_U, xhpr_L, xhpr_V, xhpr_M, +#elif defined(DOUBLE) + zhpr_U, zhpr_L, zhpr_V, zhpr_M, +#else + chpr_U, chpr_L, chpr_V, chpr_M, +#endif +}; + +#ifdef SMP +static int (*hpr_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, FLOAT *, int) = { +#ifdef XDOUBLE + xhpr_thread_U, xhpr_thread_L, xhpr_thread_V, xhpr_thread_M, +#elif defined(DOUBLE) + zhpr_thread_U, zhpr_thread_L, zhpr_thread_V, zhpr_thread_M, +#else + chpr_thread_U, chpr_thread_L, chpr_thread_V, chpr_thread_M, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, + FLOAT *x, blasint *INCX, FLOAT *a){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha = *ALPHA; + blasint incx = *INCX; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + enum CBLAS_UPLO Uplo, + blasint n, + FLOAT alpha, + FLOAT *x, blasint incx, + FLOAT *a) { + + FLOAT *buffer; + int uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 3; + if (Uplo == CblasLower) uplo = 2; + + info = -1; + + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if (alpha == ZERO) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (hpr[uplo])(n, alpha, x, incx, a, buffer); + +#ifdef SMP + + } else { + + (hpr_thread[uplo])(n, alpha, x, incx, a, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/zhpr2.c b/interface/zhpr2.c new file mode 100644 index 0000000..cf1d5f9 --- /dev/null +++ b/interface/zhpr2.c @@ -0,0 +1,207 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XHPR2 " +#elif defined(DOUBLE) +#define ERROR_NAME "ZHPR2 " +#else +#define ERROR_NAME "CHPR2 " +#endif + +static int (*hpr2[])(BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, FLOAT *) = { +#ifdef XDOUBLE + xhpr2_U, xhpr2_L, xhpr2_V, xhpr2_M, +#elif defined(DOUBLE) + zhpr2_U, zhpr2_L, zhpr2_V, zhpr2_M, +#else + chpr2_U, chpr2_L, chpr2_V, chpr2_M, +#endif +}; + +#ifdef SMP +static int (*hpr2_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, FLOAT *, int) = { +#ifdef XDOUBLE + xhpr2_thread_U, xhpr2_thread_L, xhpr2_thread_V, xhpr2_thread_M, +#elif defined(DOUBLE) + zhpr2_thread_U, zhpr2_thread_L, zhpr2_thread_V, zhpr2_thread_M, +#else + chpr2_thread_U, chpr2_thread_L, chpr2_thread_V, chpr2_thread_M, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, + FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + blasint incx = *INCX; + blasint incy = *INCY; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + enum CBLAS_UPLO Uplo, + blasint n, + FLOAT *ALPHA, + FLOAT *x, blasint incx, + FLOAT *y, blasint incy, + FLOAT *a) { + + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + FLOAT *buffer; + int uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 3; + if (Uplo == CblasLower) uplo = 2; + + info = -1; + + if (incx == 0) info = 7; + if (incy == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx * 2; + if (incy < 0 ) y -= (n - 1) * incy * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (hpr2[uplo])(n, alpha_r, alpha_i, x, incx, y, incy, a, buffer); + +#ifdef SMP + } else { + + (hpr2_thread[uplo])(n, ALPHA, x, incx, y, incy, a, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + 2 * n, 2 * n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/zlaswp.c b/interface/zlaswp.c new file mode 100644 index 0000000..85ead2c --- /dev/null +++ b/interface/zlaswp.c @@ -0,0 +1,108 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +static int (*laswp[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, blasint *, BLASLONG) = { +#ifdef XDOUBLE + xlaswp_plus, xlaswp_minus, +#elif defined(DOUBLE) + zlaswp_plus, zlaswp_minus, +#else + claswp_plus, claswp_minus, +#endif +}; + +int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint *ipiv, blasint *INCX){ + + blasint n = *N; + blasint lda = *LDA; + blasint k1 = *K1; + blasint k2 = *K2; + blasint incx = *INCX; + int flag; + +#ifdef SMP + int mode; + FLOAT dummyalpha[2] = {ZERO, ZERO}; + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + if (incx == 0 || n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + flag = (incx < 0); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (laswp[flag])(n, k1, k2, ZERO, ZERO, a, lda, NULL, 0, ipiv, incx); + +#ifdef SMP + } else { + +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif + + blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, laswp[flag], nthreads); + } +#endif + + FUNCTION_PROFILE_END(COMPSIZE, n * (k2 - k1), 0); + + IDEBUG_END; + + return 0; +} diff --git a/interface/zlauu2.c b/interface/zlauu2.c new file mode 100644 index 0000000..05603fe --- /dev/null +++ b/interface/zlauu2.c @@ -0,0 +1,129 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QLAUU2" +#elif defined(DOUBLE) +#define ERROR_NAME "ZLAUU2" +#else +#define ERROR_NAME "CLAUU2" +#endif + +static blasint (*lauu2[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { + +#ifdef XDOUBLE + xlauu2_U, xlauu2_L, +#elif defined(DOUBLE) + zlauu2_U, zlauu2_L, +#else + clauu2_U, clauu2_L, +#endif + }; + +int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ + + blas_arg_t args; + + blasint uplo_arg = *UPLO; + blasint uplo; + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + + TOUPPER(uplo_arg); + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + if (args.lda < MAX(1,args.n)) info = 4; + if (args.n < 0) info = 2; + if (uplo < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + + if (args.n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + + info = (lauu2[uplo])(&args, NULL, NULL, sa, sb, 0); + + *Info = info; + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(1, .5 * args.n * args.n, + 2. * args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + + 6. * 1./6. * args.n * (args.n * args.n - 1)); + + IDEBUG_END; + + return 0; +} diff --git a/interface/zlauum.c b/interface/zlauum.c new file mode 100644 index 0000000..23990e8 --- /dev/null +++ b/interface/zlauum.c @@ -0,0 +1,141 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XLAUUM" +#elif defined(DOUBLE) +#define ERROR_NAME "ZLAUUM" +#else +#define ERROR_NAME "CLAUUM" +#endif + +static blasint (*lauum_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { + LAUUM_U_SINGLE, LAUUM_L_SINGLE, +}; + +#ifdef SMP +static blasint (*lauum_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { + LAUUM_U_PARALLEL, LAUUM_L_PARALLEL, +}; +#endif + +int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ + + blas_arg_t args; + + blasint uplo_arg = *UPLO; + blasint uplo; + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + + TOUPPER(uplo_arg); + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + if (args.lda < MAX(1,args.n)) info = 4; + if (args.n < 0) info = 2; + if (uplo < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + + if (args.n == 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + +#ifdef SMP + args.common = NULL; + args.nthreads = num_cpu_avail(4); + + if (args.nthreads == 1) { +#endif + + *Info = (lauum_single[uplo])(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + } else { + + *Info = (lauum_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); + + } +#endif + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(1, .5 * args.n * args.n, + 2. * args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + + args.n * (args.n * args.n - 1)); + + IDEBUG_END; + + return 0; +} diff --git a/interface/zpotf2.c b/interface/zpotf2.c new file mode 100644 index 0000000..f8f81e2 --- /dev/null +++ b/interface/zpotf2.c @@ -0,0 +1,129 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XPOTF2" +#elif defined(DOUBLE) +#define ERROR_NAME "ZPOTF2" +#else +#define ERROR_NAME "CPOTF2" +#endif + +static blasint (*potf2[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { + +#ifdef XDOUBLE + xpotf2_U, xpotf2_L, +#elif defined(DOUBLE) + zpotf2_U, zpotf2_L, +#else + cpotf2_U, cpotf2_L, +#endif + }; + +int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ + + blas_arg_t args; + + blasint uplo_arg = *UPLO; + blasint uplo; + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + + TOUPPER(uplo_arg); + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + if (args.lda < MAX(1,args.n)) info = 4; + if (args.n < 0) info = 2; + if (uplo < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + + if (args.n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + + info = (potf2[uplo])(&args, NULL, NULL, sa, sb, 0); + + *Info = info; + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(1, .5 * args.n * args.n, + 2. * args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + + 6. * 1./6. * args.n * (args.n * args.n - 1)); + + IDEBUG_END; + + return 0; +} diff --git a/interface/zpotrf.c b/interface/zpotrf.c new file mode 100644 index 0000000..e2004d7 --- /dev/null +++ b/interface/zpotrf.c @@ -0,0 +1,141 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XPOTRF" +#elif defined(DOUBLE) +#define ERROR_NAME "ZPOTRF" +#else +#define ERROR_NAME "CPOTRF" +#endif + +static blasint (*potrf_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + POTRF_U_SINGLE, POTRF_L_SINGLE, +}; + +#ifdef SMP +static blasint (*potrf_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + POTRF_U_PARALLEL, POTRF_L_PARALLEL, +}; +#endif + +int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ + + blas_arg_t args; + + blasint uplo_arg = *UPLO; + blasint uplo; + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + + TOUPPER(uplo_arg); + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + if (args.lda < MAX(1,args.n)) info = 4; + if (args.n < 0) info = 2; + if (uplo < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + + if (args.n == 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + +#ifdef SMP + args.common = NULL; + args.nthreads = num_cpu_avail(4); + + if (args.nthreads == 1) { +#endif + + *Info = (potrf_single[uplo])(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + } else { + + *Info = (potrf_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); + + } +#endif + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(1, .5 * args.n * args.n, + 2. * args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + + 6. * 1./6. * args.n * (args.n * args.n - 1)); + + IDEBUG_END; + + return 0; +} diff --git a/interface/zpotri.c b/interface/zpotri.c new file mode 100644 index 0000000..df32542 --- /dev/null +++ b/interface/zpotri.c @@ -0,0 +1,157 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XPOTRI" +#elif defined(DOUBLE) +#define ERROR_NAME "ZPOTRI" +#else +#define ERROR_NAME "CPOTRI" +#endif + +static blasint (*trtri_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + TRTRI_UN_SINGLE, TRTRI_LN_SINGLE, +}; + +static blasint (*lauum_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + LAUUM_U_SINGLE, LAUUM_L_SINGLE, +}; + +#ifdef SMP +static blasint (*trtri_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + TRTRI_UN_PARALLEL, TRTRI_LN_PARALLEL, +}; + +static blasint (*lauum_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + LAUUM_U_PARALLEL, LAUUM_L_PARALLEL, +}; +#endif + +int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ + + blas_arg_t args; + + blasint uplo_arg = *UPLO; + blasint uplo; + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + + TOUPPER(uplo_arg); + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + if (args.lda < MAX(1,args.n)) info = 4; + if (args.n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + + if (args.n == 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + +#ifdef SMP + args.nthreads = num_cpu_avail(4); + + if (args.nthreads == 1) { +#endif + + info = (trtri_single[uplo])(&args, NULL, NULL, sa, sb, 0); + + if (!info) { + info = (lauum_single[uplo])(&args, NULL, NULL, sa, sb, 0); + } + + *Info = info; + +#ifdef SMP + } else { + info = (trtri_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); + + if (!info) { + info = (lauum_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); + } + + *Info = info; + } +#endif + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2. / 3. * args.m * args.n * args.n); + + IDEBUG_END; + + return 0; +} diff --git a/interface/zrot.c b/interface/zrot.c new file mode 100644 index 0000000..f18bbc6 --- /dev/null +++ b/interface/zrot.c @@ -0,0 +1,72 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *C, FLOAT *S){ + + BLASLONG n = *N; + BLASLONG incx = *INCX; + BLASLONG incy = *INCY; + FLOAT c = *C; + FLOAT s = *S; + + PRINT_DEBUG_NAME; + + if (n <= 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * 2 * incx; + if (incy < 0) y -= (n - 1) * 2 * incy; + + ROT_K(n, x, incx, y, incy, c, s); + + FUNCTION_PROFILE_END(4, n, n); + + IDEBUG_END; + + return; + +} diff --git a/interface/zrotg.c b/interface/zrotg.c new file mode 100644 index 0000000..e9e8a11 --- /dev/null +++ b/interface/zrotg.c @@ -0,0 +1,115 @@ +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ + + PRINT_DEBUG_NAME; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) + + long double da_r = *(DA + 0); + long double da_i = *(DA + 1); + long double db_r = *(DB + 0); + long double db_i = *(DB + 1); + long double r; + + long double ada = fabs(da_r) + fabs(da_i); + + if (ada == ZERO) { + *C = ZERO; + *(S + 0) = ONE; + *(S + 1) = ZERO; + *(DA + 0) = db_r; + *(DA + 1) = db_i; + } else { + long double alpha_r, alpha_i; + + ada = sqrt(da_r * da_r + da_i * da_i); + + r = sqrt(da_r * da_r + da_i * da_i + db_r * db_r + db_i * db_i); + + alpha_r = da_r / ada; + alpha_i = da_i / ada; + + *(C + 0) = ada / r; + *(S + 0) = (alpha_r * db_r + alpha_i *db_i) / r; + *(S + 1) = (alpha_i * db_r - alpha_r *db_i) / r; + *(DA + 0) = alpha_r * r; + *(DA + 1) = alpha_i * r; + } +#else + FLOAT da_r = *(DA + 0); + FLOAT da_i = *(DA + 1); + FLOAT db_r = *(DB + 0); + FLOAT db_i = *(DB + 1); + FLOAT r; + + FLOAT ada = fabs(da_r) + fabs(da_i); + FLOAT adb; + + if (ada == ZERO) { + *C = ZERO; + *(S + 0) = ONE; + *(S + 1) = ZERO; + *(DA + 0) = db_r; + *(DA + 1) = db_i; + } else { + FLOAT scale; + FLOAT aa_r, aa_i, bb_r, bb_i; + FLOAT alpha_r, alpha_i; + + aa_r = fabs(da_r); + aa_i = fabs(da_i); + + if (aa_i > aa_r) { + aa_r = fabs(da_i); + aa_i = fabs(da_r); + } + + scale = (aa_i / aa_r); + ada = aa_r * sqrt(ONE + scale * scale); + + bb_r = fabs(db_r); + bb_i = fabs(db_i); + + if (bb_i > bb_r) { + bb_r = fabs(bb_i); + bb_i = fabs(bb_r); + } + + scale = (bb_i / bb_r); + adb = bb_r * sqrt(ONE + scale * scale); + + scale = ada + adb; + + aa_r = da_r / scale; + aa_i = da_i / scale; + bb_r = db_r / scale; + bb_i = db_i / scale; + + r = scale * sqrt(aa_r * aa_r + aa_i * aa_i + bb_r * bb_r + bb_i * bb_i); + + alpha_r = da_r / ada; + alpha_i = da_i / ada; + + *(C + 0) = ada / r; + *(S + 0) = (alpha_r * db_r + alpha_i *db_i) / r; + *(S + 1) = (alpha_i * db_r - alpha_r *db_i) / r; + *(DA + 0) = alpha_r * r; + *(DA + 1) = alpha_i * r; + } +#endif + + FUNCTION_PROFILE_END(4, 4, 4); + + IDEBUG_END; + + return; +} diff --git a/interface/zsbmv.c b/interface/zsbmv.c new file mode 100644 index 0000000..71c03a6 --- /dev/null +++ b/interface/zsbmv.c @@ -0,0 +1,157 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XSBMV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZSBMV " +#else +#define ERROR_NAME "CSBMV " +#endif + +static int (*sbmv[])(BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + xsbmv_U, xsbmv_L, +#elif defined(DOUBLE) + zsbmv_U, zsbmv_L, +#else + csbmv_U, csbmv_L, +#endif +}; + +#ifdef SMP +static int (*sbmv_thread[])(BLASLONG, BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xsbmv_thread_U, xsbmv_thread_L, +#elif defined(DOUBLE) + zsbmv_thread_U, zsbmv_thread_L, +#else + csbmv_thread_U, csbmv_thread_L, +#endif +}; +#endif + +void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *LDA, + FLOAT *b, blasint *INCX, FLOAT *BETA, FLOAT *c, blasint *INCY){ + + char uplo_arg = *UPLO; + blasint n = *N; + blasint k = *K; + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + blasint lda = *LDA; + blasint incx = *INCX; + FLOAT beta_r = BETA[0]; + FLOAT beta_i = BETA[1]; + blasint incy = *INCY; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incy == 0) info = 11; + if (incx == 0) info = 8; + if (lda < k + 1) info = 6; + if (k < 0) info = 3; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + + if (n == 0) return; + + if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0); + + if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) b -= (n - 1) * incx * COMPSIZE; + if (incy < 0 ) c -= (n - 1) * incy * COMPSIZE; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (sbmv[uplo])(n, k, alpha_r, alpha_i, a, lda, b, incx, c, incy, buffer); + +#ifdef SMP + } else { + + (sbmv_thread[uplo])(n, k, ALPHA, a, lda, b, incx, c, incy, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * k / 2 + n, n * k); + + IDEBUG_END; + + return; +} diff --git a/interface/zscal.c b/interface/zscal.c new file mode 100644 index 0000000..ad99874 --- /dev/null +++ b/interface/zscal.c @@ -0,0 +1,117 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX){ + + blasint n = *N; + blasint incx = *INCX; + +#ifndef SSCAL + FLOAT *alpha=ALPHA; +#else + FLOAT alpha[2] = {ALPHA[0], ZERO}; +#endif + +#else + +#ifndef SSCAL +void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx){ + + FLOAT *alpha=ALPHA; +#else +void CNAME(blasint n, FLOAT alpha_r, FLOAT *x, blasint incx){ + + FLOAT alpha[2] = {alpha_r, ZERO}; +#endif +#endif + +#ifdef SMP + int mode; + int nthreads; +#endif + +#ifndef CBLAS + PRINT_DEBUG_NAME; +#else + PRINT_DEBUG_CNAME; +#endif + + if (incx <= 0 || n <= 0) return; + + if ((alpha[0] == ONE) && (alpha[1] == ZERO)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifdef SMP + nthreads = num_cpu_avail(1); + + if (nthreads == 1) { +#endif + + SCAL_K(n, 0, 0, alpha[0], alpha[1], x, incx, NULL, 0, NULL, 0); + +#ifdef SMP + } else { +#ifdef DOUBLE + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif + + blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0, (void *)SCAL_K, nthreads); + + } +#endif + + FUNCTION_PROFILE_END(4, n, n); + + IDEBUG_END; + + return; + +} diff --git a/interface/zspmv.c b/interface/zspmv.c new file mode 100644 index 0000000..ecf1af5 --- /dev/null +++ b/interface/zspmv.c @@ -0,0 +1,154 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "ZSPMV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZSPMV " +#else +#define ERROR_NAME "CSPMV " +#endif + +static int (*spmv[])(BLASLONG, FLOAT, FLOAT, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + xspmv_U, xspmv_L, +#elif defined(DOUBLE) + zspmv_U, zspmv_L, +#else + cspmv_U, cspmv_L, +#endif +}; + +#ifdef SMP +static int (*spmv_thread[])(BLASLONG, FLOAT *, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xspmv_thread_U, xspmv_thread_L, +#elif defined(DOUBLE) + zspmv_thread_U, zspmv_thread_L, +#else + cspmv_thread_U, cspmv_thread_L, +#endif +}; +#endif + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, + FLOAT *b, blasint *INCX, FLOAT *BETA, FLOAT *c, blasint *INCY){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + blasint incx = *INCX; + FLOAT beta_r = BETA[0]; + FLOAT beta_i = BETA[1]; + blasint incy = *INCY; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incy == 0) info = 9; + if (incx == 0) info = 6; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + + if (n == 0) return; + + if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0); + + if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) b -= (n - 1) * incx * COMPSIZE; + if (incy < 0 ) c -= (n - 1) * incy * COMPSIZE; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (spmv[uplo])(n, alpha_r, alpha_i, a, b, incx, c, incy, buffer); + +#ifdef SMP + + } else { + + (spmv_thread[uplo])(n, ALPHA, a, b, incx, c, incy, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/zspr.c b/interface/zspr.c new file mode 100644 index 0000000..0021bcd --- /dev/null +++ b/interface/zspr.c @@ -0,0 +1,146 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XSPR " +#elif defined(DOUBLE) +#define ERROR_NAME "ZSPR " +#else +#define ERROR_NAME "CSPR " +#endif + +static int (*spr[])(BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, FLOAT *) = { +#ifdef XDOUBLE + xspr_U, xspr_L, +#elif defined(DOUBLE) + zspr_U, zspr_L, +#else + cspr_U, cspr_L, +#endif +}; + +#ifdef SMP +static int (*spr_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, FLOAT *, int) = { +#ifdef XDOUBLE + xspr_thread_U, xspr_thread_L, +#elif defined(DOUBLE) + zspr_thread_U, zspr_thread_L, +#else + cspr_thread_U, cspr_thread_L, +#endif +}; +#endif + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, + FLOAT *x, blasint *INCX, FLOAT *a){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + blasint incx = *INCX; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + + if (n == 0) return; + + if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (spr[uplo])(n, alpha_r, alpha_i, x, incx, a, buffer); + +#ifdef SMP + } else { + + (spr_thread[uplo])(n, ALPHA, x, incx, a, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/zspr2.c b/interface/zspr2.c new file mode 100644 index 0000000..b54e165 --- /dev/null +++ b/interface/zspr2.c @@ -0,0 +1,149 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XSPR2 " +#elif defined(DOUBLE) +#define ERROR_NAME "ZSPR2 " +#else +#define ERROR_NAME "CSPR2 " +#endif + +static int (*spr2[])(BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, FLOAT *) = { +#ifdef XDOUBLE + xspr2_U, xspr2_L, +#elif defined(DOUBLE) + zspr2_U, zspr2_L, +#else + cspr2_U, cspr2_L, +#endif +}; + +#ifdef SMP +static int (*spr2_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, FLOAT *, int) = { +#ifdef XDOUBLE + xspr2_thread_U, xspr2_thread_L, +#elif defined(DOUBLE) + zspr2_thread_U, zspr2_thread_L, +#else + cspr2_thread_U, cspr2_thread_L, +#endif +}; +#endif + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, + FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + blasint incx = *INCX; + blasint incy = *INCY; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + + if (n == 0) return; + + if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + if (incy < 0 ) y -= (n - 1) * incy; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (spr2[uplo])(n, alpha_r, alpha_i, x, incx, y, incy, a, buffer); + +#ifdef SMP + } else { + + (spr2_thread[uplo])(n, ALPHA, x, incx, y, incy, a, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + 2 * n, 2 * n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/zswap.c b/interface/zswap.c new file mode 100644 index 0000000..f4a03a5 --- /dev/null +++ b/interface/zswap.c @@ -0,0 +1,111 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ + + blasint n = *N; + blasint incx = *INCX; + blasint incy = *INCY; + +#else + +void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ + +#endif + +#ifdef SMP + int mode; + FLOAT dummyalpha[2] = {ZERO, ZERO}; + int nthreads; +#endif + +#ifndef CBLAS + PRINT_DEBUG_NAME; +#else + PRINT_DEBUG_CNAME; +#endif + + if (n <= 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx * 2; + if (incy < 0) y -= (n - 1) * incy * 2; + +#ifdef SMP + nthreads = num_cpu_avail(1); + + if (nthreads == 1) { +#endif + + SWAP_K(n, 0, 0, ZERO, ZERO, x, incx, y, incy, NULL, 0); + +#ifdef SMP + } else { + +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif + + blas_level1_thread(mode, n, 0, 0, dummyalpha, + x, incx, y, incy, NULL, 0, (void *)SWAP_K, nthreads); + + } +#endif + + FUNCTION_PROFILE_END(2, 2 * n, 0); + + IDEBUG_END; + + return; + +} diff --git a/interface/zsymv.c b/interface/zsymv.c new file mode 100644 index 0000000..afb2c17 --- /dev/null +++ b/interface/zsymv.c @@ -0,0 +1,143 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XSYMV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZSYMV " +#else +#define ERROR_NAME "CSYMV " +#endif + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, + FLOAT *b, blasint *INCX, FLOAT *BETA, FLOAT *c, blasint *INCY){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + blasint lda = *LDA; + blasint incx = *INCX; + FLOAT beta_r = BETA[0]; + FLOAT beta_i = BETA[1]; + blasint incy = *INCY; + + int (*symv[])(BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { + SYMV_U, SYMV_L, + }; + +#ifdef SMP + int (*symv_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { + SYMV_THREAD_U, SYMV_THREAD_L, + }; +#endif + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incy == 0) info = 10; + if (incx == 0) info = 7; + if (lda < MAX(1, n)) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + + if (n == 0) return; + + if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0); + + if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) b -= (n - 1) * incx * COMPSIZE; + if (incy < 0 ) c -= (n - 1) * incy * COMPSIZE; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (symv[uplo])(n, n, alpha_r, alpha_i, a, lda, b, incx, c, incy, buffer); + +#ifdef SMP + } else { + + (symv_thread[uplo])(n, ALPHA, a, lda, b, incx, c, incy, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + 2 * n, 2 * n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/zsyr.c b/interface/zsyr.c new file mode 100644 index 0000000..b6b5202 --- /dev/null +++ b/interface/zsyr.c @@ -0,0 +1,203 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XSYR " +#elif defined(DOUBLE) +#define ERROR_NAME "ZSYR " +#else +#define ERROR_NAME "CSYR " +#endif + +static int (*syr[])(BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { +#ifdef XDOUBLE + xsyr_U, xsyr_L, +#elif defined(DOUBLE) + zsyr_U, zsyr_L, +#else + csyr_U, csyr_L, +#endif +}; + +#ifdef SMP +static int (*syr_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xsyr_thread_U, xsyr_thread_L, +#elif defined(DOUBLE) + zsyr_thread_U, zsyr_thread_L, +#else + csyr_thread_U, csyr_thread_L, +#endif +}; +#endif + + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, + FLOAT *x, blasint *INCX, FLOAT *a, blasint *LDA){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + blasint lda = *LDA; + blasint incx = *INCX; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (lda < MAX(1, n)) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLOAT *x, int incx, FLOAT *a, int lda) { + + FLOAT *buffer; + int trans, uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + trans = -1; + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (lda < MAX(1, n)) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + } + + if (order == CblasRowMajor) { + + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + info = -1; + + if (lda < MAX(1, n)) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (syr[uplo])(n, alpha_r, alpha_i, x, incx, a, lda, buffer); + +#ifdef SMP + } else { + + (syr_thread[uplo])(n, ALPHA, x, incx, a, lda, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/zsyr2.c b/interface/zsyr2.c new file mode 100644 index 0000000..0c705cb --- /dev/null +++ b/interface/zsyr2.c @@ -0,0 +1,151 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QSYR2 " +#elif defined(DOUBLE) +#define ERROR_NAME "ZSYR2 " +#else +#define ERROR_NAME "CSYR2 " +#endif + +static int (*syr2[])(BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { +#ifdef XDOUBLE + xsyr2_U, xsyr2_L, +#elif defined(DOUBLE) + zsyr2_U, zsyr2_L, +#else + csyr2_U, csyr2_L, +#endif +}; + +#ifdef SMP +static int (*syr2_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xsyr2_thread_U, xsyr2_thread_L, +#elif defined(DOUBLE) + zsyr2_thread_U, zsyr2_thread_L, +#else + csyr2_thread_U, csyr2_thread_L, +#endif +}; +#endif + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, + FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a, blasint *LDA){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + blasint lda = *LDA; + blasint incx = *INCX; + blasint incy = *INCY; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (lda < MAX(1, n)) info = 9; + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + + if (n == 0) return; + + if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + if (incy < 0 ) y -= (n - 1) * incy; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (syr2[uplo])(n, alpha_r, alpha_i, x, incx, y, incy, a, lda, buffer); + +#ifdef SMP + } else { + + (syr2_thread[uplo])(n, ALPHA, x, incx, y, incy, a, lda, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + 2 * n, 2 * n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/ztbmv.c b/interface/ztbmv.c new file mode 100644 index 0000000..85f53c4 --- /dev/null +++ b/interface/ztbmv.c @@ -0,0 +1,260 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XTBMV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZTBMV " +#else +#define ERROR_NAME "CTBMV " +#endif + +static int (*tbmv[])(BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + xtbmv_NUU, xtbmv_NUN, xtbmv_NLU, xtbmv_NLN, + xtbmv_TUU, xtbmv_TUN, xtbmv_TLU, xtbmv_TLN, + xtbmv_RUU, xtbmv_RUN, xtbmv_RLU, xtbmv_RLN, + xtbmv_CUU, xtbmv_CUN, xtbmv_CLU, xtbmv_CLN, +#elif defined(DOUBLE) + ztbmv_NUU, ztbmv_NUN, ztbmv_NLU, ztbmv_NLN, + ztbmv_TUU, ztbmv_TUN, ztbmv_TLU, ztbmv_TLN, + ztbmv_RUU, ztbmv_RUN, ztbmv_RLU, ztbmv_RLN, + ztbmv_CUU, ztbmv_CUN, ztbmv_CLU, ztbmv_CLN, +#else + ctbmv_NUU, ctbmv_NUN, ctbmv_NLU, ctbmv_NLN, + ctbmv_TUU, ctbmv_TUN, ctbmv_TLU, ctbmv_TLN, + ctbmv_RUU, ctbmv_RUN, ctbmv_RLU, ctbmv_RLN, + ctbmv_CUU, ctbmv_CUN, ctbmv_CLU, ctbmv_CLN, +#endif +}; + +#ifdef SMP +static int (*tbmv_thread[])(BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xtbmv_thread_NUU, xtbmv_thread_NUN, xtbmv_thread_NLU, xtbmv_thread_NLN, + xtbmv_thread_TUU, xtbmv_thread_TUN, xtbmv_thread_TLU, xtbmv_thread_TLN, + xtbmv_thread_RUU, xtbmv_thread_RUN, xtbmv_thread_RLU, xtbmv_thread_RLN, + xtbmv_thread_CUU, xtbmv_thread_CUN, xtbmv_thread_CLU, xtbmv_thread_CLN, +#elif defined(DOUBLE) + ztbmv_thread_NUU, ztbmv_thread_NUN, ztbmv_thread_NLU, ztbmv_thread_NLN, + ztbmv_thread_TUU, ztbmv_thread_TUN, ztbmv_thread_TLU, ztbmv_thread_TLN, + ztbmv_thread_RUU, ztbmv_thread_RUN, ztbmv_thread_RLU, ztbmv_thread_RLN, + ztbmv_thread_CUU, ztbmv_thread_CUN, ztbmv_thread_CLU, ztbmv_thread_CLN, +#else + ctbmv_thread_NUU, ctbmv_thread_NUN, ctbmv_thread_NLU, ctbmv_thread_NLN, + ctbmv_thread_TUU, ctbmv_thread_TUN, ctbmv_thread_TLU, ctbmv_thread_TLN, + ctbmv_thread_RUU, ctbmv_thread_RUN, ctbmv_thread_RLU, ctbmv_thread_RLN, + ctbmv_thread_CUU, ctbmv_thread_CUN, ctbmv_thread_CLU, ctbmv_thread_CLN, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANS, char *DIAG, + blasint *N, blasint *K, + FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + char diag_arg = *DIAG; + + blasint n = *N; + blasint k = *K; + blasint lda = *LDA; + blasint incx = *INCX; + + blasint info; + int uplo; + int unit; + int trans; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + TOUPPER(diag_arg); + + trans = -1; + unit = -1; + uplo = -1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 2; + if (trans_arg == 'C') trans = 3; + + if (diag_arg == 'U') unit = 0; + if (diag_arg == 'N') unit = 1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incx == 0) info = 9; + if (lda < k + 1) info = 7; + if (k < 0) info = 5; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint n, blasint k, FLOAT *a, blasint lda, FLOAT *x, blasint incx) { + + int trans, uplo, unit; + blasint info; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + unit = -1; + uplo = -1; + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 2; + if (TransA == CblasConjTrans) trans = 3; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 9; + if (lda < k + 1) info = 7; + if (k < 0) info = 5; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 3; + if (TransA == CblasConjTrans) trans = 2; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 9; + if (lda < k + 1) info = 7; + if (k < 0) info = 5; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (tbmv[(trans<<2) | (uplo<<1) | unit])(n, k, a, lda, x, incx, buffer); + +#ifdef SMP + } else { + + (tbmv_thread[(trans<<2) | (uplo<<1) | unit])(n, k, a, lda, x, incx, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * k / 2 + n, n * k); + + IDEBUG_END; + + return; +} diff --git a/interface/ztbsv.c b/interface/ztbsv.c new file mode 100644 index 0000000..3846a4b --- /dev/null +++ b/interface/ztbsv.c @@ -0,0 +1,219 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XTBSV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZTBSV " +#else +#define ERROR_NAME "CTBSV " +#endif + +static int (*tbsv[])(BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + xtbsv_NUU, xtbsv_NUN, xtbsv_NLU, xtbsv_NLN, + xtbsv_TUU, xtbsv_TUN, xtbsv_TLU, xtbsv_TLN, + xtbsv_RUU, xtbsv_RUN, xtbsv_RLU, xtbsv_RLN, + xtbsv_CUU, xtbsv_CUN, xtbsv_CLU, xtbsv_CLN, +#elif defined(DOUBLE) + ztbsv_NUU, ztbsv_NUN, ztbsv_NLU, ztbsv_NLN, + ztbsv_TUU, ztbsv_TUN, ztbsv_TLU, ztbsv_TLN, + ztbsv_RUU, ztbsv_RUN, ztbsv_RLU, ztbsv_RLN, + ztbsv_CUU, ztbsv_CUN, ztbsv_CLU, ztbsv_CLN, +#else + ctbsv_NUU, ctbsv_NUN, ctbsv_NLU, ctbsv_NLN, + ctbsv_TUU, ctbsv_TUN, ctbsv_TLU, ctbsv_TLN, + ctbsv_RUU, ctbsv_RUN, ctbsv_RLU, ctbsv_RLN, + ctbsv_CUU, ctbsv_CUN, ctbsv_CLU, ctbsv_CLN, +#endif +}; + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANS, char *DIAG, + blasint *N, blasint *K, + FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + char diag_arg = *DIAG; + + blasint n = *N; + blasint k = *K; + blasint lda = *LDA; + blasint incx = *INCX; + + blasint info; + int uplo; + int unit; + int trans; + FLOAT *buffer; + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + TOUPPER(diag_arg); + + trans = -1; + unit = -1; + uplo = -1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 2; + if (trans_arg == 'C') trans = 3; + + if (diag_arg == 'U') unit = 0; + if (diag_arg == 'N') unit = 1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incx == 0) info = 9; + if (lda < k + 1) info = 7; + if (k < 0) info = 5; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint n, blasint k, FLOAT *a, blasint lda, FLOAT *x, blasint incx) { + + int trans, uplo, unit; + blasint info; + FLOAT *buffer; + + PRINT_DEBUG_CNAME; + + unit = -1; + uplo = -1; + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 2; + if (TransA == CblasConjTrans) trans = 3; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 9; + if (lda < k + 1) info = 7; + if (k < 0) info = 5; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 3; + if (TransA == CblasConjTrans) trans = 2; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 9; + if (lda < k + 1) info = 7; + if (k < 0) info = 5; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + + (tbsv[(trans<<2) | (uplo<<1) | unit])(n, k, a, lda, x, incx, buffer); + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * k / 2 + n, n * k); + + IDEBUG_END; + + return; +} diff --git a/interface/ztpmv.c b/interface/ztpmv.c new file mode 100644 index 0000000..2f9c48f --- /dev/null +++ b/interface/ztpmv.c @@ -0,0 +1,252 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XTPMV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZTPMV " +#else +#define ERROR_NAME "CTPMV " +#endif + +static int (*tpmv[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + xtpmv_NUU, xtpmv_NUN, xtpmv_NLU, xtpmv_NLN, + xtpmv_TUU, xtpmv_TUN, xtpmv_TLU, xtpmv_TLN, + xtpmv_RUU, xtpmv_RUN, xtpmv_RLU, xtpmv_RLN, + xtpmv_CUU, xtpmv_CUN, xtpmv_CLU, xtpmv_CLN, +#elif defined(DOUBLE) + ztpmv_NUU, ztpmv_NUN, ztpmv_NLU, ztpmv_NLN, + ztpmv_TUU, ztpmv_TUN, ztpmv_TLU, ztpmv_TLN, + ztpmv_RUU, ztpmv_RUN, ztpmv_RLU, ztpmv_RLN, + ztpmv_CUU, ztpmv_CUN, ztpmv_CLU, ztpmv_CLN, +#else + ctpmv_NUU, ctpmv_NUN, ctpmv_NLU, ctpmv_NLN, + ctpmv_TUU, ctpmv_TUN, ctpmv_TLU, ctpmv_TLN, + ctpmv_RUU, ctpmv_RUN, ctpmv_RLU, ctpmv_RLN, + ctpmv_CUU, ctpmv_CUN, ctpmv_CLU, ctpmv_CLN, +#endif +}; + +#ifdef SMP +static int (*tpmv_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xtpmv_thread_NUU, xtpmv_thread_NUN, xtpmv_thread_NLU, xtpmv_thread_NLN, + xtpmv_thread_TUU, xtpmv_thread_TUN, xtpmv_thread_TLU, xtpmv_thread_TLN, + xtpmv_thread_RUU, xtpmv_thread_RUN, xtpmv_thread_RLU, xtpmv_thread_RLN, + xtpmv_thread_CUU, xtpmv_thread_CUN, xtpmv_thread_CLU, xtpmv_thread_CLN, +#elif defined(DOUBLE) + ztpmv_thread_NUU, ztpmv_thread_NUN, ztpmv_thread_NLU, ztpmv_thread_NLN, + ztpmv_thread_TUU, ztpmv_thread_TUN, ztpmv_thread_TLU, ztpmv_thread_TLN, + ztpmv_thread_RUU, ztpmv_thread_RUN, ztpmv_thread_RLU, ztpmv_thread_RLN, + ztpmv_thread_CUU, ztpmv_thread_CUN, ztpmv_thread_CLU, ztpmv_thread_CLN, +#else + ctpmv_thread_NUU, ctpmv_thread_NUN, ctpmv_thread_NLU, ctpmv_thread_NLN, + ctpmv_thread_TUU, ctpmv_thread_TUN, ctpmv_thread_TLU, ctpmv_thread_TLN, + ctpmv_thread_RUU, ctpmv_thread_RUN, ctpmv_thread_RLU, ctpmv_thread_RLN, + ctpmv_thread_CUU, ctpmv_thread_CUN, ctpmv_thread_CLU, ctpmv_thread_CLN, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANS, char *DIAG, + blasint *N, FLOAT *a, FLOAT *x, blasint *INCX){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + char diag_arg = *DIAG; + + blasint n = *N; + blasint incx = *INCX; + + blasint info; + int uplo; + int unit; + int trans; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + TOUPPER(diag_arg); + + trans = -1; + unit = -1; + uplo = -1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 2; + if (trans_arg == 'C') trans = 3; + + if (diag_arg == 'U') unit = 0; + if (diag_arg == 'N') unit = 1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incx == 0) info = 7; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint n, FLOAT *a, FLOAT *x, blasint incx) { + + int trans, uplo, unit; + blasint info; + FLOAT *buffer; + + PRINT_DEBUG_CNAME; + + unit = -1; + uplo = -1; + trans = -1; + info = 0; +#ifdef SMP + int nthreads; +#endif + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 2; + if (TransA == CblasConjTrans) trans = 3; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 7; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 3; + if (TransA == CblasConjTrans) trans = 2; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 7; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (tpmv[(trans<<2) | (uplo<<1) | unit])(n, a, x, incx, buffer); + +#ifdef SMP + + } else { + + (tpmv_thread[(trans<<2) | (uplo<<1) | unit])(n, a, x, incx, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/ztpsv.c b/interface/ztpsv.c new file mode 100644 index 0000000..fde500e --- /dev/null +++ b/interface/ztpsv.c @@ -0,0 +1,210 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XTPSV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZTPSV " +#else +#define ERROR_NAME "CTPSV " +#endif + +static int (*tpsv[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + xtpsv_NUU, xtpsv_NUN, xtpsv_NLU, xtpsv_NLN, + xtpsv_TUU, xtpsv_TUN, xtpsv_TLU, xtpsv_TLN, + xtpsv_RUU, xtpsv_RUN, xtpsv_RLU, xtpsv_RLN, + xtpsv_CUU, xtpsv_CUN, xtpsv_CLU, xtpsv_CLN, +#elif defined(DOUBLE) + ztpsv_NUU, ztpsv_NUN, ztpsv_NLU, ztpsv_NLN, + ztpsv_TUU, ztpsv_TUN, ztpsv_TLU, ztpsv_TLN, + ztpsv_RUU, ztpsv_RUN, ztpsv_RLU, ztpsv_RLN, + ztpsv_CUU, ztpsv_CUN, ztpsv_CLU, ztpsv_CLN, +#else + ctpsv_NUU, ctpsv_NUN, ctpsv_NLU, ctpsv_NLN, + ctpsv_TUU, ctpsv_TUN, ctpsv_TLU, ctpsv_TLN, + ctpsv_RUU, ctpsv_RUN, ctpsv_RLU, ctpsv_RLN, + ctpsv_CUU, ctpsv_CUN, ctpsv_CLU, ctpsv_CLN, +#endif +}; + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANS, char *DIAG, + blasint *N, FLOAT *a, FLOAT *x, blasint *INCX){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + char diag_arg = *DIAG; + + blasint n = *N; + blasint incx = *INCX; + + blasint info; + int uplo; + int unit; + int trans; + FLOAT *buffer; + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + TOUPPER(diag_arg); + + trans = -1; + unit = -1; + uplo = -1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 2; + if (trans_arg == 'C') trans = 3; + + if (diag_arg == 'U') unit = 0; + if (diag_arg == 'N') unit = 1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incx == 0) info = 7; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint n, FLOAT *a, FLOAT *x, blasint incx) { + + int trans, uplo, unit; + blasint info; + FLOAT *buffer; + + PRINT_DEBUG_CNAME; + + unit = -1; + uplo = -1; + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 2; + if (TransA == CblasConjTrans) trans = 3; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 7; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 3; + if (TransA == CblasConjTrans) trans = 2; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 7; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + + (tpsv[(trans<<2) | (uplo<<1) | unit])(n, a, x, incx, buffer); + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/ztrmv.c b/interface/ztrmv.c new file mode 100644 index 0000000..5a18a85 --- /dev/null +++ b/interface/ztrmv.c @@ -0,0 +1,255 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XTRMV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZTRMV " +#else +#define ERROR_NAME "CTRMV " +#endif + +static int (*trmv[])(BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { +#ifdef XDOUBLE + xtrmv_NUU, xtrmv_NUN, xtrmv_NLU, xtrmv_NLN, + xtrmv_TUU, xtrmv_TUN, xtrmv_TLU, xtrmv_TLN, + xtrmv_RUU, xtrmv_RUN, xtrmv_RLU, xtrmv_RLN, + xtrmv_CUU, xtrmv_CUN, xtrmv_CLU, xtrmv_CLN, +#elif defined(DOUBLE) + ztrmv_NUU, ztrmv_NUN, ztrmv_NLU, ztrmv_NLN, + ztrmv_TUU, ztrmv_TUN, ztrmv_TLU, ztrmv_TLN, + ztrmv_RUU, ztrmv_RUN, ztrmv_RLU, ztrmv_RLN, + ztrmv_CUU, ztrmv_CUN, ztrmv_CLU, ztrmv_CLN, +#else + ctrmv_NUU, ctrmv_NUN, ctrmv_NLU, ctrmv_NLN, + ctrmv_TUU, ctrmv_TUN, ctrmv_TLU, ctrmv_TLN, + ctrmv_RUU, ctrmv_RUN, ctrmv_RLU, ctrmv_RLN, + ctrmv_CUU, ctrmv_CUN, ctrmv_CLU, ctrmv_CLN, +#endif +}; + +#ifdef SMP +static int (*trmv_thread[])(BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xtrmv_thread_NUU, xtrmv_thread_NUN, xtrmv_thread_NLU, xtrmv_thread_NLN, + xtrmv_thread_TUU, xtrmv_thread_TUN, xtrmv_thread_TLU, xtrmv_thread_TLN, + xtrmv_thread_RUU, xtrmv_thread_RUN, xtrmv_thread_RLU, xtrmv_thread_RLN, + xtrmv_thread_CUU, xtrmv_thread_CUN, xtrmv_thread_CLU, xtrmv_thread_CLN, +#elif defined(DOUBLE) + ztrmv_thread_NUU, ztrmv_thread_NUN, ztrmv_thread_NLU, ztrmv_thread_NLN, + ztrmv_thread_TUU, ztrmv_thread_TUN, ztrmv_thread_TLU, ztrmv_thread_TLN, + ztrmv_thread_RUU, ztrmv_thread_RUN, ztrmv_thread_RLU, ztrmv_thread_RLN, + ztrmv_thread_CUU, ztrmv_thread_CUN, ztrmv_thread_CLU, ztrmv_thread_CLN, +#else + ctrmv_thread_NUU, ctrmv_thread_NUN, ctrmv_thread_NLU, ctrmv_thread_NLN, + ctrmv_thread_TUU, ctrmv_thread_TUN, ctrmv_thread_TLU, ctrmv_thread_TLN, + ctrmv_thread_RUU, ctrmv_thread_RUN, ctrmv_thread_RLU, ctrmv_thread_RLN, + ctrmv_thread_CUU, ctrmv_thread_CUN, ctrmv_thread_CLU, ctrmv_thread_CLN, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANS, char *DIAG, + blasint *N, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + char diag_arg = *DIAG; + + blasint n = *N; + blasint lda = *LDA; + blasint incx = *INCX; + + blasint info; + int uplo; + int unit; + int trans; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + TOUPPER(diag_arg); + + trans = -1; + unit = -1; + uplo = -1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 2; + if (trans_arg == 'C') trans = 3; + + if (diag_arg == 'U') unit = 0; + if (diag_arg == 'N') unit = 1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incx == 0) info = 8; + if (lda < MAX(1, n)) info = 6; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) { + + int trans, uplo, unit; + blasint info; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + unit = -1; + uplo = -1; + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 2; + if (TransA == CblasConjTrans) trans = 3; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 8; + if (lda < MAX(1, n)) info = 6; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 3; + if (TransA == CblasConjTrans) trans = 2; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 8; + if (lda < MAX(1, n)) info = 6; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (trmv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer); + +#ifdef SMP + } else { + + (trmv_thread[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/ztrsv.c b/interface/ztrsv.c new file mode 100644 index 0000000..08f7dc6 --- /dev/null +++ b/interface/ztrsv.c @@ -0,0 +1,216 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XTRSV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZTRSV " +#else +#define ERROR_NAME "CTRSV " +#endif + +static int (*trsv[])(BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + xtrsv_NUU, xtrsv_NUN, xtrsv_NLU, xtrsv_NLN, + xtrsv_TUU, xtrsv_TUN, xtrsv_TLU, xtrsv_TLN, + xtrsv_RUU, xtrsv_RUN, xtrsv_RLU, xtrsv_RLN, + xtrsv_CUU, xtrsv_CUN, xtrsv_CLU, xtrsv_CLN, +#elif defined(DOUBLE) + ztrsv_NUU, ztrsv_NUN, ztrsv_NLU, ztrsv_NLN, + ztrsv_TUU, ztrsv_TUN, ztrsv_TLU, ztrsv_TLN, + ztrsv_RUU, ztrsv_RUN, ztrsv_RLU, ztrsv_RLN, + ztrsv_CUU, ztrsv_CUN, ztrsv_CLU, ztrsv_CLN, +#else + ctrsv_NUU, ctrsv_NUN, ctrsv_NLU, ctrsv_NLN, + ctrsv_TUU, ctrsv_TUN, ctrsv_TLU, ctrsv_TLN, + ctrsv_RUU, ctrsv_RUN, ctrsv_RLU, ctrsv_RLN, + ctrsv_CUU, ctrsv_CUN, ctrsv_CLU, ctrsv_CLN, +#endif +}; + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANS, char *DIAG, + blasint *N, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + char diag_arg = *DIAG; + + blasint n = *N; + blasint lda = *LDA; + blasint incx = *INCX; + + blasint info; + int uplo; + int unit; + int trans; + FLOAT *buffer; + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + TOUPPER(diag_arg); + + trans = -1; + unit = -1; + uplo = -1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 2; + if (trans_arg == 'C') trans = 3; + + if (diag_arg == 'U') unit = 0; + if (diag_arg == 'N') unit = 1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + + info = 0; + + if (incx == 0) info = 8; + if (lda < MAX(1, n)) info = 6; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) { + + int trans, uplo, unit; + blasint info; + FLOAT *buffer; + + PRINT_DEBUG_CNAME; + + unit = -1; + uplo = -1; + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 2; + if (TransA == CblasConjTrans) trans = 3; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 8; + if (lda < MAX(1, n)) info = 6; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 3; + if (TransA == CblasConjTrans) trans = 2; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 8; + if (lda < MAX(1, n)) info = 6; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + + (trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer); + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/ztrti2.c b/interface/ztrti2.c new file mode 100644 index 0000000..017374c --- /dev/null +++ b/interface/ztrti2.c @@ -0,0 +1,134 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XTRTI2" +#elif defined(DOUBLE) +#define ERROR_NAME "ZTRTI2" +#else +#define ERROR_NAME "CTRTI2" +#endif + +static blasint (*trti2[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { +#ifdef XDOUBLE + xtrti2_UU, xtrti2_UN, xtrti2_LU, xtrti2_LN, +#elif defined(DOUBLE) + ztrti2_UU, ztrti2_UN, ztrti2_LU, ztrti2_LN, +#else + ctrti2_UU, ctrti2_UN, ctrti2_LU, ctrti2_LN, +#endif + }; + +int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ + + blas_arg_t args; + + blasint uplo_arg = *UPLO; + blasint diag_arg = *DIAG; + blasint uplo, diag; + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + + TOUPPER(uplo_arg); + TOUPPER(diag_arg); + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + diag = -1; + if (diag_arg == 'U') diag = 0; + if (diag_arg == 'N') diag = 1; + + info = 0; + if (args.lda < MAX(1,args.n)) info = 5; + if (args.n < 0) info = 3; + if (diag < 0) info = 2; + if (uplo < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + + if (args.n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + + info = (trti2[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0); + + *Info = info; + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(1, .5 * args.n * args.n, + 2. * args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + + 6. * args.n * (1./3. + args.n * (-1./2. + args.n * 1./6.))); + + IDEBUG_END; + + return 0; +} diff --git a/interface/ztrtri.c b/interface/ztrtri.c new file mode 100644 index 0000000..89caf80 --- /dev/null +++ b/interface/ztrtri.c @@ -0,0 +1,154 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XTRTRI" +#elif defined(DOUBLE) +#define ERROR_NAME "ZTRTRI" +#else +#define ERROR_NAME "CTRTRI" +#endif + +static blasint (*trtri_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + TRTRI_UU_SINGLE, TRTRI_UN_SINGLE, TRTRI_LU_SINGLE, TRTRI_LN_SINGLE, +}; + +#ifdef SMP +static blasint (*trtri_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + TRTRI_UU_PARALLEL, TRTRI_UN_PARALLEL, TRTRI_LU_PARALLEL, TRTRI_LN_PARALLEL, +}; +#endif + +int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ + + blas_arg_t args; + + blasint uplo_arg = *UPLO; + blasint diag_arg = *DIAG; + blasint uplo, diag; + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + + TOUPPER(uplo_arg); + TOUPPER(diag_arg); + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + diag = -1; + if (diag_arg == 'U') diag = 0; + if (diag_arg == 'N') diag = 1; + + info = 0; + if (args.lda < MAX(1,args.n)) info = 5; + if (args.n < 0) info = 3; + if (diag < 0) info = 2; + if (uplo < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + + if (args.n == 0) return 0; + + if (diag) { + if (AMIN_K(args.n, args.a, args.lda + 1) == ZERO) { + *Info = IAMIN_K(args.n, args.a, args.lda + 1); + return 0; + } + } + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + +#ifdef SMP + args.common = NULL; + args.nthreads = num_cpu_avail(4); + + if (args.nthreads == 1) { +#endif + + *Info = (trtri_single[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + } else { + + *Info = (trtri_parallel[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0); + + } +#endif + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(1, .5 * args.n * args.n, + 2. * args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + + 6. * args.n * (1./3. + args.n * (-1./2. + args.n * 1./6.))); + + IDEBUG_END; + + return 0; +} diff --git a/kernel/._Makefile b/kernel/._Makefile new file mode 100644 index 0000000..129c4bd Binary files /dev/null and b/kernel/._Makefile differ diff --git a/kernel/._Makefile.L1 b/kernel/._Makefile.L1 new file mode 100644 index 0000000..ae8182f Binary files /dev/null and b/kernel/._Makefile.L1 differ diff --git a/kernel/._Makefile.L2 b/kernel/._Makefile.L2 new file mode 100644 index 0000000..b12fa3d Binary files /dev/null and b/kernel/._Makefile.L2 differ diff --git a/kernel/._Makefile.L3 b/kernel/._Makefile.L3 new file mode 100644 index 0000000..c4365b7 Binary files /dev/null and b/kernel/._Makefile.L3 differ diff --git a/kernel/._Makefile.LA b/kernel/._Makefile.LA new file mode 100644 index 0000000..d716e54 Binary files /dev/null and b/kernel/._Makefile.LA differ diff --git a/kernel/._alpha b/kernel/._alpha new file mode 100755 index 0000000..9bc1c55 Binary files /dev/null and b/kernel/._alpha differ diff --git a/kernel/._generic b/kernel/._generic new file mode 100755 index 0000000..915ed38 Binary files /dev/null and b/kernel/._generic differ diff --git a/kernel/._ia64 b/kernel/._ia64 new file mode 100755 index 0000000..83b196c Binary files /dev/null and b/kernel/._ia64 differ diff --git a/kernel/._mips64 b/kernel/._mips64 new file mode 100755 index 0000000..1f02042 Binary files /dev/null and b/kernel/._mips64 differ diff --git a/kernel/._power b/kernel/._power new file mode 100755 index 0000000..5e9eab0 Binary files /dev/null and b/kernel/._power differ diff --git a/kernel/._setparam-ref.c b/kernel/._setparam-ref.c new file mode 100644 index 0000000..7037b2d Binary files /dev/null and b/kernel/._setparam-ref.c differ diff --git a/kernel/._sparc b/kernel/._sparc new file mode 100755 index 0000000..f0b1866 Binary files /dev/null and b/kernel/._sparc differ diff --git a/kernel/._x86 b/kernel/._x86 new file mode 100755 index 0000000..a6a22e7 Binary files /dev/null and b/kernel/._x86 differ diff --git a/kernel/._x86_64 b/kernel/._x86_64 new file mode 100755 index 0000000..d9b8fd2 Binary files /dev/null and b/kernel/._x86_64 differ diff --git a/kernel/Makefile b/kernel/Makefile new file mode 100644 index 0000000..6084cbc --- /dev/null +++ b/kernel/Makefile @@ -0,0 +1,121 @@ +ifdef TARGET_CORE +TARGET = $(TARGET_CORE) +endif + +TOPDIR = .. +include $(TOPDIR)/Makefile.system + +ifdef TARGET_CORE +CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) +BUILD_KERNEL = 1 +KDIR = +TSUFFIX = _$(TARGET_CORE) +else +TARGET_CORE = $(CORE) +KDIR = +TSUFFIX = +endif + +-include $(KERNELDIR)/KERNEL.$(TARGET_CORE) + +include $(KERNELDIR)/KERNEL + +include Makefile.L1 + +include Makefile.L2 + +include Makefile.L3 + +include Makefile.LA + +HPLOBJS = \ + dgemm_kernel.$(SUFFIX) \ + $(DGEMMINCOPYOBJ) $(DGEMMITCOPYOBJ) \ + $(DGEMMONCOPYOBJ) $(DGEMMOTCOPYOBJ) \ + dtrsm_kernel_LN.$(SUFFIX) dtrsm_kernel_LT.$(SUFFIX) \ + dtrsm_kernel_RN.$(SUFFIX) dtrsm_kernel_RT.$(SUFFIX) \ + daxpy_k.$(SUFFIX) dcopy_k.$(SUFFIX) ddot_k.$(SUFFIX) \ + dger_k.$(SUFFIX) dscal_k.$(SUFFIX) idamax_k.$(SUFFIX) \ + dgemv_n.$(SUFFIX) dgemv_t.$(SUFFIX) dgemm_beta.$(SUFFIX) \ + dtrsm_iunucopy.$(SUFFIX) dtrsm_iunncopy.$(SUFFIX) \ + dtrsm_ilnucopy.$(SUFFIX) dtrsm_ilnncopy.$(SUFFIX) \ + dtrsm_iutucopy.$(SUFFIX) dtrsm_iutncopy.$(SUFFIX) \ + dtrsm_iltucopy.$(SUFFIX) dtrsm_iltncopy.$(SUFFIX) \ + dtrsm_ounucopy.$(SUFFIX) dtrsm_ounncopy.$(SUFFIX) \ + dtrsm_olnucopy.$(SUFFIX) dtrsm_olnncopy.$(SUFFIX) \ + dtrsm_outucopy.$(SUFFIX) dtrsm_outncopy.$(SUFFIX) \ + dtrsm_oltucopy.$(SUFFIX) dtrsm_oltncopy.$(SUFFIX) + +COMMONOBJS += lsame.$(SUFFIX) scabs1.$(SUFFIX) dcabs1.$(SUFFIX) + +ifdef DYNAMIC_ARCH +SBLASOBJS += setparam$(TSUFFIX).$(SUFFIX) +CCOMMON_OPT += -DTS=$(TSUFFIX) +endif + +ifeq ($(ARCH), x86) +COMMONOBJS += cpuid.$(SUFFIX) +endif + +ifdef EXPRECISION +COMMONOBJS += qconjg.$(SUFFIX) qcabs1.$(SUFFIX) +endif + +ifdef QUAD_PRECISION +COMMONOBJS += qconjg.$(SUFFIX) qcabs1.$(SUFFIX) +endif + +all : libs + +scabs1.$(SUFFIX): $(KERNELDIR)/$(SCABS_KERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DF_INTERFACE $< -o $(@F) + +dcabs1.$(SUFFIX): $(KERNELDIR)/$(DCABS_KERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DF_INTERFACE $< -o $(@F) + +qcabs1.$(SUFFIX): $(KERNELDIR)/$(QCABS_KERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DF_INTERFACE $< -o $(@F) + +qconjg.$(SUFFIX): $(KERNELDIR)/qconjg.S + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DF_INTERFACE $< -o $(@F) + +lsame.$(SUFFIX): $(KERNELDIR)/$(LSAME_KERNEL) + $(CC) -c $(CFLAGS) -DF_INTERFACE $< -o $(@F) + +setparam$(TSUFFIX).$(SUFFIX): setparam$(TSUFFIX).c kernel$(TSUFFIX).h + $(CC) -c $(CFLAGS) $< -o $@ + +setparam$(TSUFFIX).c : setparam-ref.c + sed 's/TS/$(TSUFFIX)/g' $< > $(@F) + +kernel$(TSUFFIX).h : ../common_level1.h ../common_level2.h ../common_level3.h ../common_lapack.h + sed 's/\ *(/$(TSUFFIX)(/g' $^ > $(@F) + +cpuid.$(SUFFIX): $(KERNELDIR)/cpuid.S + $(CC) -c $(CFLAGS) $< -o $(@F) + +scabs1.$(PSUFFIX): $(KERNELDIR)/$(SCABS_KERNEL) + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DF_INTERFACE $< -o $(@F) + +dcabs1.$(PSUFFIX): $(KERNELDIR)/$(DCABS_KERNEL) + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DF_INTERFACE $< -o $(@F) + +qcabs1.$(PSUFFIX): $(KERNELDIR)/$(QCABS_KERNEL) + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DF_INTERFACE $< -o $(@F) + +qconjg.$(PSUFFIX): $(KERNELDIR)/qconjg.S + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DF_INTERFACE $< -o $(@F) + +lsame.$(PSUFFIX): $(KERNELDIR)/$(LSAME_KERNEL) + $(CC) -c $(PFLAGS) -DF_INTERFACE $< -o $(@F) + +cpuid.$(PSUFFIX): $(KERNELDIR)/cpuid.S + $(CC) -c $(PFLAGS) $< -o $(@F) + +ifdef DYNAMIC_ARCH +clean :: + @rm -f setparam_*.c kernel_*.h setparam.h kernel.h + +endif + +include $(TOPDIR)/Makefile.tail diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1 new file mode 100644 index 0000000..317f143 --- /dev/null +++ b/kernel/Makefile.L1 @@ -0,0 +1,767 @@ +### AMAX ### + +ifndef SAMAXKERNEL +SAMAXKERNEL = amax.S +endif + +ifndef DAMAXKERNEL +DAMAXKERNEL = amax.S +endif + +ifndef QAMAXKERNEL +QAMAXKERNEL = amax.S +endif + +ifndef CAMAXKERNEL +CAMAXKERNEL = zamax.S +endif + +ifndef ZAMAXKERNEL +ZAMAXKERNEL = zamax.S +endif + +ifndef XAMAXKERNEL +XAMAXKERNEL = zamax.S +endif + +### AMIN ### + +ifndef SAMINKERNEL +SAMINKERNEL = amin.S +endif + +ifndef DAMINKERNEL +DAMINKERNEL = amin.S +endif + +ifndef QAMINKERNEL +QAMINKERNEL = amin.S +endif + +ifndef CAMINKERNEL +CAMINKERNEL = zamin.S +endif + +ifndef ZAMINKERNEL +ZAMINKERNEL = zamin.S +endif + +ifndef XAMINKERNEL +XAMINKERNEL = zamin.S +endif + +### MAX ### + +ifndef SMAXKERNEL +SMAXKERNEL = max.S +endif + +ifndef DMAXKERNEL +DMAXKERNEL = max.S +endif + +ifndef QMAXKERNEL +QMAXKERNEL = max.S +endif + +### MIN ### + +ifndef SMINKERNEL +SMINKERNEL = min.S +endif + +ifndef DMINKERNEL +DMINKERNEL = min.S +endif + +ifndef QMINKERNEL +QMINKERNEL = min.S +endif + +### IAMAX ### + +ifndef ISAMAXKERNEL +ISAMAXKERNEL = iamax.S +endif + +ifndef IDAMAXKERNEL +IDAMAXKERNEL = iamax.S +endif + +ifndef IQAMAXKERNEL +IQAMAXKERNEL = iamax.S +endif + +ifndef ICAMAXKERNEL +ICAMAXKERNEL = izamax.S +endif + +ifndef IZAMAXKERNEL +IZAMAXKERNEL = izamax.S +endif + +ifndef IXAMAXKERNEL +IXAMAXKERNEL = izamax.S +endif + +### IAMIN ### + +ifndef ISAMINKERNEL +ISAMINKERNEL = iamin.S +endif + +ifndef IDAMINKERNEL +IDAMINKERNEL = iamin.S +endif + +ifndef IQAMINKERNEL +IQAMINKERNEL = iamin.S +endif + +ifndef ICAMINKERNEL +ICAMINKERNEL = izamin.S +endif + +ifndef IZAMINKERNEL +IZAMINKERNEL = izamin.S +endif + +ifndef IXAMINKERNEL +IXAMINKERNEL = izamin.S +endif + +### IMAX ### + +ifndef ISMAXKERNEL +ISMAXKERNEL = iamax.S +endif + +ifndef IDMAXKERNEL +IDMAXKERNEL = iamax.S +endif + +ifndef IQMAXKERNEL +IQMAXKERNEL = iamax.S +endif + +### IMIN ### + +ifndef ISMINKERNEL +ISMINKERNEL = iamin.S +endif + +ifndef IDMINKERNEL +IDMINKERNEL = iamin.S +endif + +ifndef IQMINKERNEL +IQMINKERNEL = iamin.S +endif + +### ASUM ### + +ifndef SASUMKERNEL +SASUMKERNEL = asum.S +endif + +ifndef DASUMKERNEL +DASUMKERNEL = asum.S +endif + +ifndef CASUMKERNEL +CASUMKERNEL = zasum.S +endif + +ifndef ZASUMKERNEL +ZASUMKERNEL = zasum.S +endif + +ifndef QASUMKERNEL +QASUMKERNEL = asum.S +endif + +ifndef XASUMKERNEL +XASUMKERNEL = zasum.S +endif + +### AXPY ### + +ifndef SAXPYKERNEL +SAXPYKERNEL = axpy.S +endif + +ifndef DAXPYKERNEL +DAXPYKERNEL = axpy.S +endif + +ifndef CAXPYKERNEL +CAXPYKERNEL = zaxpy.S +endif + +ifndef ZAXPYKERNEL +ZAXPYKERNEL = zaxpy.S +endif + +ifndef QAXPYKERNEL +QAXPYKERNEL = axpy.S +endif + +ifndef XAXPYKERNEL +XAXPYKERNEL = zaxpy.S +endif + +### COPY ### + +ifndef SCOPYKERNEL +SCOPYKERNEL = copy.S +endif + +ifndef DCOPYKERNEL +DCOPYKERNEL = copy.S +endif + +ifndef CCOPYKERNEL +CCOPYKERNEL = zcopy.S +endif + +ifndef ZCOPYKERNEL +ZCOPYKERNEL = zcopy.S +endif + +ifndef QCOPYKERNEL +QCOPYKERNEL = copy.S +endif + +ifndef XCOPYKERNEL +XCOPYKERNEL = zcopy.S +endif + +### DOT ### + +ifndef SDOTKERNEL +SDOTKERNEL = dot.S +endif + +ifndef DDOTKERNEL +DDOTKERNEL = dot.S +endif + +ifndef CDOTKERNEL +CDOTKERNEL = zdot.S +endif + +ifndef ZDOTKERNEL +ZDOTKERNEL = zdot.S +endif + +ifndef QDOTKERNEL +QDOTKERNEL = dot.S +endif + +ifndef XDOTKERNEL +XDOTKERNEL = zdot.S +endif + +### NRM2 ### + +ifndef SNRM2KERNEL +SNRM2KERNEL = nrm2.S +endif + +ifndef DNRM2KERNEL +DNRM2KERNEL = nrm2.S +endif + +ifndef QNRM2KERNEL +QNRM2KERNEL = nrm2.S +endif + +ifndef CNRM2KERNEL +CNRM2KERNEL = znrm2.S +endif + +ifndef ZNRM2KERNEL +ZNRM2KERNEL = znrm2.S +endif + +ifndef XNRM2KERNEL +XNRM2KERNEL = znrm2.S +endif + +### ROT ### + +ifndef SROTKERNEL +SROTKERNEL = rot.S +endif + +ifndef DROTKERNEL +DROTKERNEL = rot.S +endif + +ifndef QROTKERNEL +QROTKERNEL = rot.S +endif + +ifndef CROTKERNEL +CROTKERNEL = zrot.S +endif + +ifndef ZROTKERNEL +ZROTKERNEL = zrot.S +endif + +ifndef XROTKERNEL +XROTKERNEL = zrot.S +endif + +### SCAL ### + +ifndef SSCALKERNEL +SSCALKERNEL = scal.S +endif + +ifndef DSCALKERNEL +DSCALKERNEL = scal.S +endif + +ifndef CSCALKERNEL +CSCALKERNEL = zscal.S +endif + +ifndef ZSCALKERNEL +ZSCALKERNEL = zscal.S +endif + +ifndef QSCALKERNEL +QSCALKERNEL = scal.S +endif + +ifndef XSCALKERNEL +XSCALKERNEL = zscal.S +endif + +### SWAP ### + +ifndef SSWAPKERNEL +SSWAPKERNEL = swap.S +endif + +ifndef DSWAPKERNEL +DSWAPKERNEL = swap.S +endif + +ifndef CSWAPKERNEL +CSWAPKERNEL = zswap.S +endif + +ifndef ZSWAPKERNEL +ZSWAPKERNEL = zswap.S +endif + +ifndef QSWAPKERNEL +QSWAPKERNEL = swap.S +endif + +ifndef XSWAPKERNEL +XSWAPKERNEL = zswap.S +endif + +### GEMV ### + +ifndef SGEMVNKERNEL +SGEMVNKERNEL = gemv_n.S +endif + +ifndef SGEMVTKERNEL +SGEMVTKERNEL = gemv_t.S +endif + +ifndef DGEMVNKERNEL +DGEMVNKERNEL = gemv_n.S +endif + +ifndef DGEMVTKERNEL +DGEMVTKERNEL = gemv_t.S +endif + +ifndef CGEMVNKERNEL +CGEMVNKERNEL = zgemv_n.S +endif + +ifndef CGEMVTKERNEL +CGEMVTKERNEL = zgemv_t.S +endif + +ifndef ZGEMVNKERNEL +ZGEMVNKERNEL = zgemv_n.S +endif + +ifndef ZGEMVTKERNEL +ZGEMVTKERNEL = zgemv_t.S +endif + +ifndef QGEMVNKERNEL +QGEMVNKERNEL = gemv_n.S +endif + +ifndef QGEMVTKERNEL +QGEMVTKERNEL = gemv_t.S +endif + +ifndef XGEMVNKERNEL +XGEMVNKERNEL = zgemv_n.S +endif + +ifndef XGEMVTKERNEL +XGEMVTKERNEL = zgemv_t.S +endif + +ifndef SCABS_KERNEL +SCABS_KERNEL = cabs.S +endif + +ifndef DCABS_KERNEL +DCABS_KERNEL = cabs.S +endif + +ifndef QCABS_KERNEL +QCABS_KERNEL = cabs.S +endif + +ifndef LSAME_KERNEL +LSAME_KERNEL = lsame.S +endif + +SBLASOBJS += \ + samax_k$(TSUFFIX).$(SUFFIX) samin_k$(TSUFFIX).$(SUFFIX) smax_k$(TSUFFIX).$(SUFFIX) smin_k$(TSUFFIX).$(SUFFIX) \ + isamax_k$(TSUFFIX).$(SUFFIX) isamin_k$(TSUFFIX).$(SUFFIX) ismax_k$(TSUFFIX).$(SUFFIX) ismin_k$(TSUFFIX).$(SUFFIX) \ + sasum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ + sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \ + snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) + +DBLASOBJS += \ + damax_k$(TSUFFIX).$(SUFFIX) damin_k$(TSUFFIX).$(SUFFIX) dmax_k$(TSUFFIX).$(SUFFIX) dmin_k$(TSUFFIX).$(SUFFIX) \ + idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \ + dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \ + dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) + +QBLASOBJS += \ + qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \ + iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \ + qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \ + qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) + +CBLASOBJS += \ + camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \ + casum_k$(TSUFFIX).$(SUFFIX) caxpy_k$(TSUFFIX).$(SUFFIX) caxpyc_k$(TSUFFIX).$(SUFFIX) ccopy_k$(TSUFFIX).$(SUFFIX) \ + cdotc_k$(TSUFFIX).$(SUFFIX) cdotu_k$(TSUFFIX).$(SUFFIX) cnrm2_k$(TSUFFIX).$(SUFFIX) csrot_k$(TSUFFIX).$(SUFFIX) \ + cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) + +ZBLASOBJS += \ + zamax_k$(TSUFFIX).$(SUFFIX) zamin_k$(TSUFFIX).$(SUFFIX) izamax_k$(TSUFFIX).$(SUFFIX) izamin_k$(TSUFFIX).$(SUFFIX) \ + zasum_k$(TSUFFIX).$(SUFFIX) zaxpy_k$(TSUFFIX).$(SUFFIX) zaxpyc_k$(TSUFFIX).$(SUFFIX) zcopy_k$(TSUFFIX).$(SUFFIX) \ + zdotc_k$(TSUFFIX).$(SUFFIX) zdotu_k$(TSUFFIX).$(SUFFIX) znrm2_k$(TSUFFIX).$(SUFFIX) zdrot_k$(TSUFFIX).$(SUFFIX) \ + zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) + +XBLASOBJS += \ + xamax_k$(TSUFFIX).$(SUFFIX) xamin_k$(TSUFFIX).$(SUFFIX) ixamax_k$(TSUFFIX).$(SUFFIX) ixamin_k$(TSUFFIX).$(SUFFIX) \ + xasum_k$(TSUFFIX).$(SUFFIX) xaxpy_k$(TSUFFIX).$(SUFFIX) xaxpyc_k$(TSUFFIX).$(SUFFIX) xcopy_k$(TSUFFIX).$(SUFFIX) \ + xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \ + xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) + +### AMAX ### + + + + +$(KDIR)samax_k$(TSUFFIX).$(SUFFIX) $(KDIR)samax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAMAXKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ + +$(KDIR)damax_k$(TSUFFIX).$(SUFFIX) $(KDIR)damax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAMAXKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ + +$(KDIR)qamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)qamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAMAXKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ + +$(KDIR)camax_k$(TSUFFIX).$(SUFFIX) $(KDIR)camax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAMAXKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ + +$(KDIR)zamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)zamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAMAXKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ + +$(KDIR)xamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)xamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAMAXKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ + +### AMIN ### + +$(KDIR)samin_k$(TSUFFIX).$(SUFFIX) $(KDIR)samin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAMINKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ + +$(KDIR)damin_k$(TSUFFIX).$(SUFFIX) $(KDIR)damin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAMINKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ + +$(KDIR)qamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)qamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAMINKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ + +$(KDIR)camin_k$(TSUFFIX).$(SUFFIX) $(KDIR)camin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAMINKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ + +$(KDIR)zamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)zamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAMINKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ + +$(KDIR)xamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)xamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAMINKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ + +### MAX ### + +$(KDIR)smax_k$(TSUFFIX).$(SUFFIX) $(KDIR)smax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SMAXKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@ + +$(KDIR)dmax_k$(TSUFFIX).$(SUFFIX) $(KDIR)dmax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DMAXKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@ + +$(KDIR)qmax_k$(TSUFFIX).$(SUFFIX) $(KDIR)qmax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QMAXKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@ + +### MIN ### + +$(KDIR)smin_k$(TSUFFIX).$(SUFFIX) $(KDIR)smin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SMINKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ + +$(KDIR)dmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)dmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DMINKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ + +$(KDIR)qmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)qmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QMINKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ + + +### IAMAX ### + +$(KDIR)isamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)isamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ISAMAXKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ + +$(KDIR)idamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)idamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IDAMAXKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ + +$(KDIR)iqamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQAMAXKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ + +$(KDIR)icamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)icamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ICAMAXKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ + +$(KDIR)izamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)izamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IZAMAXKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ + +$(KDIR)ixamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)ixamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IXAMAXKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ + +### IAMIN ### + +$(KDIR)isamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)isamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ISAMINKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ + +$(KDIR)idamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IDAMINKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ + +$(KDIR)iqamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQAMINKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ + +$(KDIR)icamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)icamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ICAMINKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ + +$(KDIR)izamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)izamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IZAMINKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ + +$(KDIR)ixamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)ixamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IXAMINKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ + +### IMAX ### + +$(KDIR)ismax_k$(TSUFFIX).$(SUFFIX) $(KDIR)ismax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ISMAXKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@ + +$(KDIR)idmax_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IDMAXKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@ + +$(KDIR)iqmax_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMAXKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@ + +### IMIN ### + +$(KDIR)ismin_k$(TSUFFIX).$(SUFFIX) $(KDIR)ismin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ISMINKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ + +$(KDIR)idmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IDMINKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ + +$(KDIR)iqmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMINKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ + + +$(KDIR)sasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)sasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SASUMKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)dasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)dasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DASUMKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ + +$(KDIR)qasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)qasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QASUMKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ + +$(KDIR)casum_k$(TSUFFIX).$(SUFFIX) $(KDIR)casum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CASUMKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)zasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZASUMKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@ + +$(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XASUMKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ + +$(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)daxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPYKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ + +$(KDIR)qaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAXPYKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ + +$(KDIR)caxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -UDOUBLE $< -o $@ + +$(KDIR)zaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DDOUBLE $< -o $@ + +$(KDIR)xaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DXDOUBLE $< -o $@ + +$(KDIR)caxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -UDOUBLE $< -o $@ + +$(KDIR)zaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -DDOUBLE $< -o $@ + +$(KDIR)xaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -DXDOUBLE $< -o $@ + +$(KDIR)scopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)scopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SCOPYKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@ + +$(KDIR)dcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)dcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DCOPYKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@ + +$(KDIR)qcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QCOPYKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@ + +$(KDIR)ccopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)ccopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOPYKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@ + +$(KDIR)zcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOPYKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@ + +$(KDIR)xcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XCOPYKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@ + +$(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DDOTKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ + +$(KDIR)qdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QDOTKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ + +$(KDIR)dsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)sdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)sdsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)zdotu_k$(TSUFFIX).$(SUFFIX) $(KDIR)zdotu_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZDOTKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UCONJ $< -o $@ + +$(KDIR)zdotc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zdotc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZDOTKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DCONJ $< -o $@ + +$(KDIR)xdotu_k$(TSUFFIX).$(SUFFIX) $(KDIR)xdotu_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XDOTKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UCONJ $< -o $@ + +$(KDIR)xdotc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xdotc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XDOTKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DCONJ $< -o $@ + +$(KDIR)cdotu_k$(TSUFFIX).$(SUFFIX) $(KDIR)cdotu_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CDOTKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UCONJ $< -o $@ + +$(KDIR)cdotc_k$(TSUFFIX).$(SUFFIX) $(KDIR)cdotc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CDOTKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DCONJ $< -o $@ + +$(KDIR)snrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)snrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SNRM2KERNEL) + $(CC) $(CFLAGS) -UCOMPLEX -c -UDOUBLE $< -o $@ + +$(KDIR)dnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)dnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DNRM2KERNEL) + $(CC) $(CFLAGS) -UCOMPLEX -c -DDOUBLE $< -o $@ + +$(KDIR)qnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)qnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QNRM2KERNEL) + $(CC) $(CFLAGS) -UCOMPLEX -c -DXDOUBLE $< -o $@ + +$(KDIR)cnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)cnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CNRM2KERNEL) + $(CC) $(CFLAGS) -DCOMPLEX -c -UDOUBLE $< -o $@ + +$(KDIR)znrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)znrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZNRM2KERNEL) + $(CC) $(CFLAGS) -DCOMPLEX -c -DDOUBLE $< -o $@ + +$(KDIR)xnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)xnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XNRM2KERNEL) + $(CC) $(CFLAGS) -DCOMPLEX -c -DXDOUBLE $< -o $@ + +$(KDIR)srot_k$(TSUFFIX).$(SUFFIX) $(KDIR)srot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ + +$(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ + +$(KDIR)csrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)csrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CROTKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)zdrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)zdrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZROTKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DDOUBLE $< -o $@ + +$(KDIR)xqrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)xqrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XROTKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DXDOUBLE $< -o $@ + +$(KDIR)sscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)sscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSCALKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)dscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)dscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSCALKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ + +$(KDIR)qscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)qscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSCALKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ + +$(KDIR)cscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)cscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSCALKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)zscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)zscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSCALKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@ + +$(KDIR)xscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)xscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSCALKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ + +$(KDIR)sswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)sswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSWAPKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)dswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)dswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSWAPKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ + +$(KDIR)qswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)qswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSWAPKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ + +$(KDIR)cswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)cswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSWAPKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)zswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)zswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSWAPKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@ + +$(KDIR)xswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)xswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSWAPKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ diff --git a/kernel/Makefile.L2 b/kernel/Makefile.L2 new file mode 100644 index 0000000..f26292d --- /dev/null +++ b/kernel/Makefile.L2 @@ -0,0 +1,428 @@ +### GEMV ### + +ifndef SGEMVNKERNEL +SGEMVNKERNEL = gemv_n.S +endif + +ifndef SGEMVTKERNEL +SGEMVTKERNEL = gemv_t.S +endif + +ifndef DGEMVNKERNEL +DGEMVNKERNEL = gemv_n.S +endif + +ifndef DGEMVTKERNEL +DGEMVTKERNEL = gemv_t.S +endif + +ifndef CGEMVNKERNEL +CGEMVNKERNEL = zgemv_n.S +endif + +ifndef CGEMVTKERNEL +CGEMVTKERNEL = zgemv_t.S +endif + +ifndef ZGEMVNKERNEL +ZGEMVNKERNEL = zgemv_n.S +endif + +ifndef ZGEMVTKERNEL +ZGEMVTKERNEL = zgemv_t.S +endif + +ifndef QGEMVNKERNEL +QGEMVNKERNEL = gemv_n.S +endif + +ifndef QGEMVTKERNEL +QGEMVTKERNEL = gemv_t.S +endif + +ifndef XGEMVNKERNEL +XGEMVNKERNEL = zgemv_n.S +endif + +ifndef XGEMVTKERNEL +XGEMVTKERNEL = zgemv_t.S +endif + +### GER ### + +ifndef SGERKERNEL +SGERKERNEL = ../generic/ger.c +endif + +ifndef DGERKERNEL +DGERKERNEL = ../generic/ger.c +endif + +ifndef QGERKERNEL +QGERKERNEL = ../generic/ger.c +endif + +ifndef CGERUKERNEL +CGERUKERNEL = ../generic/zger.c +endif + +ifndef CGERCKERNEL +CGERCKERNEL = ../generic/zger.c +endif + +ifndef ZGERUKERNEL +ZGERUKERNEL = ../generic/zger.c +endif + +ifndef ZGERCKERNEL +ZGERCKERNEL = ../generic/zger.c +endif + +ifndef XGERUKERNEL +XGERUKERNEL = ../generic/zger.c +endif + +ifndef XGERCKERNEL +XGERCKERNEL = ../generic/zger.c +endif + +### SYMV ### + +ifndef SSYMV_U_KERNEL +SSYMV_U_KERNEL = ../generic/symv_k.c +endif + +ifndef SSYMV_L_KERNEL +SSYMV_L_KERNEL = ../generic/symv_k.c +endif + +ifndef DSYMV_U_KERNEL +DSYMV_U_KERNEL = ../generic/symv_k.c +endif + +ifndef DSYMV_L_KERNEL +DSYMV_L_KERNEL = ../generic/symv_k.c +endif + +ifndef QSYMV_U_KERNEL +QSYMV_U_KERNEL = ../generic/symv_k.c +endif + +ifndef QSYMV_L_KERNEL +QSYMV_L_KERNEL = ../generic/symv_k.c +endif + +ifndef CSYMV_U_KERNEL +CSYMV_U_KERNEL = ../generic/zsymv_k.c +endif + +ifndef CSYMV_L_KERNEL +CSYMV_L_KERNEL = ../generic/zsymv_k.c +endif + +ifndef ZSYMV_U_KERNEL +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +endif + +ifndef ZSYMV_L_KERNEL +ZSYMV_L_KERNEL = ../generic/zsymv_k.c +endif + +ifndef XSYMV_U_KERNEL +XSYMV_U_KERNEL = ../generic/zsymv_k.c +endif + +ifndef XSYMV_L_KERNEL +XSYMV_L_KERNEL = ../generic/zsymv_k.c +endif + +### HEMV ### + +ifndef CHEMV_U_KERNEL +CHEMV_U_KERNEL = ../generic/zhemv_k.c +endif + +ifndef CHEMV_L_KERNEL +CHEMV_L_KERNEL = ../generic/zhemv_k.c +endif + +ifndef CHEMV_V_KERNEL +CHEMV_V_KERNEL = ../generic/zhemv_k.c +endif + +ifndef CHEMV_M_KERNEL +CHEMV_M_KERNEL = ../generic/zhemv_k.c +endif + +ifndef ZHEMV_U_KERNEL +ZHEMV_U_KERNEL = ../generic/zhemv_k.c +endif + +ifndef ZHEMV_L_KERNEL +ZHEMV_L_KERNEL = ../generic/zhemv_k.c +endif + +ifndef ZHEMV_V_KERNEL +ZHEMV_V_KERNEL = ../generic/zhemv_k.c +endif + +ifndef ZHEMV_M_KERNEL +ZHEMV_M_KERNEL = ../generic/zhemv_k.c +endif + +ifndef XHEMV_U_KERNEL +XHEMV_U_KERNEL = ../generic/zhemv_k.c +endif + +ifndef XHEMV_L_KERNEL +XHEMV_L_KERNEL = ../generic/zhemv_k.c +endif + +ifndef XHEMV_V_KERNEL +XHEMV_V_KERNEL = ../generic/zhemv_k.c +endif + +ifndef XHEMV_M_KERNEL +XHEMV_M_KERNEL = ../generic/zhemv_k.c +endif + +SBLASOBJS += \ + sgemv_n$(TSUFFIX).$(SUFFIX) sgemv_t$(TSUFFIX).$(SUFFIX) ssymv_U$(TSUFFIX).$(SUFFIX) ssymv_L$(TSUFFIX).$(SUFFIX) \ + sger_k$(TSUFFIX).$(SUFFIX) + +DBLASOBJS += \ + dgemv_n$(TSUFFIX).$(SUFFIX) dgemv_t$(TSUFFIX).$(SUFFIX) dsymv_U$(TSUFFIX).$(SUFFIX) dsymv_L$(TSUFFIX).$(SUFFIX) \ + dger_k$(TSUFFIX).$(SUFFIX) + +QBLASOBJS += \ + qgemv_n$(TSUFFIX).$(SUFFIX) qgemv_t$(TSUFFIX).$(SUFFIX) qsymv_U$(TSUFFIX).$(SUFFIX) qsymv_L$(TSUFFIX).$(SUFFIX) \ + qger_k$(TSUFFIX).$(SUFFIX) + +CBLASOBJS += \ + cgemv_n$(TSUFFIX).$(SUFFIX) cgemv_t$(TSUFFIX).$(SUFFIX) cgemv_r$(TSUFFIX).$(SUFFIX) cgemv_c$(TSUFFIX).$(SUFFIX) \ + cgemv_o$(TSUFFIX).$(SUFFIX) cgemv_u$(TSUFFIX).$(SUFFIX) cgemv_s$(TSUFFIX).$(SUFFIX) cgemv_d$(TSUFFIX).$(SUFFIX) \ + csymv_U$(TSUFFIX).$(SUFFIX) csymv_L$(TSUFFIX).$(SUFFIX) \ + chemv_U$(TSUFFIX).$(SUFFIX) chemv_L$(TSUFFIX).$(SUFFIX) chemv_V$(TSUFFIX).$(SUFFIX) chemv_M$(TSUFFIX).$(SUFFIX) \ + cgeru_k$(TSUFFIX).$(SUFFIX) cgerc_k$(TSUFFIX).$(SUFFIX) cgerv_k$(TSUFFIX).$(SUFFIX) cgerd_k$(TSUFFIX).$(SUFFIX) + +ZBLASOBJS += \ + zgemv_n$(TSUFFIX).$(SUFFIX) zgemv_t$(TSUFFIX).$(SUFFIX) zgemv_r$(TSUFFIX).$(SUFFIX) zgemv_c$(TSUFFIX).$(SUFFIX) \ + zgemv_o$(TSUFFIX).$(SUFFIX) zgemv_u$(TSUFFIX).$(SUFFIX) zgemv_s$(TSUFFIX).$(SUFFIX) zgemv_d$(TSUFFIX).$(SUFFIX) \ + zsymv_U$(TSUFFIX).$(SUFFIX) zsymv_L$(TSUFFIX).$(SUFFIX) \ + zhemv_U$(TSUFFIX).$(SUFFIX) zhemv_L$(TSUFFIX).$(SUFFIX) zhemv_V$(TSUFFIX).$(SUFFIX) zhemv_M$(TSUFFIX).$(SUFFIX) \ + zgeru_k$(TSUFFIX).$(SUFFIX) zgerc_k$(TSUFFIX).$(SUFFIX) zgerv_k$(TSUFFIX).$(SUFFIX) zgerd_k$(TSUFFIX).$(SUFFIX) + +XBLASOBJS += \ + xgemv_n$(TSUFFIX).$(SUFFIX) xgemv_t$(TSUFFIX).$(SUFFIX) xgemv_r$(TSUFFIX).$(SUFFIX) xgemv_c$(TSUFFIX).$(SUFFIX) \ + xgemv_o$(TSUFFIX).$(SUFFIX) xgemv_u$(TSUFFIX).$(SUFFIX) xgemv_s$(TSUFFIX).$(SUFFIX) xgemv_d$(TSUFFIX).$(SUFFIX) \ + xsymv_U$(TSUFFIX).$(SUFFIX) xsymv_L$(TSUFFIX).$(SUFFIX) \ + xhemv_U$(TSUFFIX).$(SUFFIX) xhemv_L$(TSUFFIX).$(SUFFIX) xhemv_V$(TSUFFIX).$(SUFFIX) xhemv_M$(TSUFFIX).$(SUFFIX) \ + xgeru_k$(TSUFFIX).$(SUFFIX) xgerc_k$(TSUFFIX).$(SUFFIX) xgerv_k$(TSUFFIX).$(SUFFIX) xgerd_k$(TSUFFIX).$(SUFFIX) + +$(KDIR)sgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -UTRANS $< -o $@ + +$(KDIR)sgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DTRANS $< -o $@ + +$(KDIR)dgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -UTRANS $< -o $@ + +$(KDIR)dgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@ + +$(KDIR)qgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVNKERNEL) + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -UTRANS $< -o $@ + +$(KDIR)qgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVTKERNEL) + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DTRANS $< -o $@ + +$(KDIR)cgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -UTRANS -UCONJ -UXCONJ $< -o $@ + +$(KDIR)cgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANS -UCONJ -UXCONJ $< -o $@ + +$(KDIR)cgemv_r$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -UTRANS -DCONJ -UXCONJ $< -o $@ + +$(KDIR)cgemv_c$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_c$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANS -DCONJ -UXCONJ $< -o $@ + +$(KDIR)cgemv_o$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_o$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -UTRANS -UCONJ -DXCONJ $< -o $@ + +$(KDIR)cgemv_u$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_u$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANS -UCONJ -DXCONJ $< -o $@ + +$(KDIR)cgemv_s$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -UTRANS -DCONJ -DXCONJ $< -o $@ + +$(KDIR)cgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANS -DCONJ -DXCONJ $< -o $@ + +$(KDIR)zgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -UTRANS -UCONJ -UXCONJ $< -o $@ + +$(KDIR)zgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANS -UCONJ -UXCONJ $< -o $@ + +$(KDIR)zgemv_r$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -UTRANS -DCONJ -UXCONJ $< -o $@ + +$(KDIR)zgemv_c$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_c$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANS -DCONJ -UXCONJ $< -o $@ + +$(KDIR)zgemv_o$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_o$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -UTRANS -UCONJ -DXCONJ $< -o $@ + +$(KDIR)zgemv_u$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_u$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANS -UCONJ -DXCONJ $< -o $@ + +$(KDIR)zgemv_s$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -UTRANS -DCONJ -DXCONJ $< -o $@ + +$(KDIR)zgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANS -DCONJ -DXCONJ $< -o $@ + +$(KDIR)xgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL) + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -UTRANS -UCONJ -UXCONJ $< -o $@ + +$(KDIR)xgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL) + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANS -UCONJ -UXCONJ $< -o $@ + +$(KDIR)xgemv_r$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL) + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -UTRANS -DCONJ -UXCONJ $< -o $@ + +$(KDIR)xgemv_c$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_c$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL) + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANS -DCONJ -UXCONJ $< -o $@ + +$(KDIR)xgemv_o$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_o$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL) + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -UTRANS -UCONJ -DXCONJ $< -o $@ + +$(KDIR)xgemv_u$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_u$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL) + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANS -UCONJ -DXCONJ $< -o $@ + +$(KDIR)xgemv_s$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL) + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -UTRANS -DCONJ -DXCONJ $< -o $@ + +$(KDIR)xgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL) + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANS -DCONJ -DXCONJ $< -o $@ + +$(KDIR)ssymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)ssymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSYMV_U_KERNEL) $(SSYMV_U_PARAM) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $@ + +$(KDIR)ssymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)ssymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSYMV_L_KERNEL) $(SSYMV_L_PARAM) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $@ + +$(KDIR)dsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)dsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSYMV_U_KERNEL) $(DSYMV_U_PARAM) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $@ + +$(KDIR)dsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)dsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSYMV_L_KERNEL) $(DSYMV_L_PARAM) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $@ + +$(KDIR)qsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)qsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSYMV_U_KERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $@ + +$(KDIR)qsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)qsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSYMV_L_KERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $@ + +$(KDIR)csymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)csymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSYMV_U_KERNEL) $(CSYMV_U_PARAM) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $@ + +$(KDIR)csymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)csymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSYMV_L_KERNEL) $(CSYMV_L_PARAM) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $@ + +$(KDIR)zsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)zsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSYMV_U_KERNEL) $(ZSYMV_U_PARAM) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $@ + +$(KDIR)zsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)zsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSYMV_L_KERNEL) $(ZSYMV_L_PARAM) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $@ + +$(KDIR)xsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)xsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSYMV_U_KERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $@ + +$(KDIR)xsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)xsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSYMV_L_KERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $@ + +$(KDIR)sger_k$(TSUFFIX).$(SUFFIX) $(KDIR)sger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGERKERNEL) $(SGERPARAM) + $(CC) -c $(CFLAGS) -UDOUBLE $< -o $@ + +$(KDIR)dger_k$(TSUFFIX).$(SUFFIX) $(KDIR)dger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGERKERNEL) $(DGERPARAM) + $(CC) -c $(CFLAGS) -DDOUBLE $< -o $@ + +$(KDIR)qger_k$(TSUFFIX).$(SUFFIX) $(KDIR)qger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGERKERNEL) $(QGERPARAM) + $(CC) -c $(CFLAGS) -DXDOUBLE $< -o $@ + +$(KDIR)cgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERUKERNEL) $(CGERPARAM) + $(CC) -c $(CFLAGS) -UDOUBLE -UCONJ $< -o $@ + +$(KDIR)cgerc_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgerc_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERCKERNEL) $(CGERPARAM) + $(CC) -c $(CFLAGS) -UDOUBLE -DCONJ $< -o $@ + +$(KDIR)cgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgerv_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERUKERNEL) $(CGERPARAM) + $(CC) -c $(CFLAGS) -UDOUBLE -UCONJ -DXCONJ $< -o $@ + +$(KDIR)cgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERCKERNEL) $(CGERPARAM) + $(CC) -c $(CFLAGS) -UDOUBLE -DCONJ -DXCONJ $< -o $@ + +$(KDIR)zgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERUKERNEL) $(ZGERPARAM) + $(CC) -c $(CFLAGS) -DDOUBLE -UCONJ $< -o $@ + +$(KDIR)zgerc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgerc_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERCKERNEL) $(ZGERPARAM) + $(CC) -c $(CFLAGS) -DDOUBLE -DCONJ $< -o $@ + +$(KDIR)zgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgerv_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERUKERNEL) $(ZGERPARAM) + $(CC) -c $(CFLAGS) -DDOUBLE -UCONJ -DXCONJ $< -o $@ + +$(KDIR)zgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERCKERNEL) $(ZGERPARAM) + $(CC) -c $(CFLAGS) -DDOUBLE -DCONJ -DXCONJ $< -o $@ + +$(KDIR)xgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERUKERNEL) $(XGERPARAM) + $(CC) -c $(CFLAGS) -DXDOUBLE -UCONJ $< -o $@ + +$(KDIR)xgerc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerc_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERCKERNEL) $(XGERPARAM) + $(CC) -c $(CFLAGS) -DXDOUBLE -DCONJ $< -o $@ + +$(KDIR)xgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerv_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERUKERNEL) $(XGERPARAM) + $(CC) -c $(CFLAGS) -DXDOUBLE -UCONJ -DXCONJ $< -o $@ + +$(KDIR)xgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERCKERNEL) $(XGERPARAM) + $(CC) -c $(CFLAGS) -DXDOUBLE -DCONJ-DXCONJ $< -o $@ + +$(KDIR)chemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_U_KERNEL) $(CHEMV_U_PARAM) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $@ + +$(KDIR)chemv_L$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_L_KERNEL) $(CHEMV_L_PARAM) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMV $< -o $@ + +$(KDIR)chemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_V$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_V_KERNEL) $(CHEMV_U_PARAM) ../symcopy.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV -DHEMVREV $< -o $@ + +$(KDIR)chemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_M_KERNEL) $(CHEMV_L_PARAM) ../symcopy.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@ + +$(KDIR)zhemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_U_KERNEL) $(ZHEMV_U_PARAM) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMV $< -o $@ + +$(KDIR)zhemv_L$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_L_KERNEL) $(ZHEMV_L_PARAM) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMV $< -o $@ + +$(KDIR)zhemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_V$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_V_KERNEL) $(ZHEMV_U_PARAM) ../symcopy.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMV -DHEMVREV $< -o $@ + +$(KDIR)zhemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_M_KERNEL) $(ZHEMV_L_PARAM) ../symcopy.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@ + +$(KDIR)xhemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_U_KERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMV $< -o $@ + +$(KDIR)xhemv_L$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_L_KERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV $< -o $@ + +$(KDIR)xhemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_V$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_V_KERNEL) ../symcopy.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMV -DHEMVREV $< -o $@ + +$(KDIR)xhemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_M_KERNEL) ../symcopy.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@ + diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 new file mode 100644 index 0000000..4e331a4 --- /dev/null +++ b/kernel/Makefile.L3 @@ -0,0 +1,3135 @@ +ifeq ($(ARCH), x86) +USE_GEMM3M = 1 +endif + +ifeq ($(ARCH), x86_64) +USE_GEMM3M = 1 +endif + +ifeq ($(ARCH), ia64) +USE_GEMM3M = 1 +endif + +ifeq ($(ARCH), MIPS) +USE_GEMM3M = 1 +endif + +SKERNELOBJS += \ + sgemm_kernel$(TSUFFIX).$(SUFFIX) \ + $(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \ + $(SGEMMONCOPYOBJ) $(SGEMMOTCOPYOBJ) + +DKERNELOBJS += \ + dgemm_kernel$(TSUFFIX).$(SUFFIX) \ + $(DGEMMINCOPYOBJ) $(DGEMMITCOPYOBJ) \ + $(DGEMMONCOPYOBJ) $(DGEMMOTCOPYOBJ) + +QKERNELOBJS += \ + qgemm_kernel$(TSUFFIX).$(SUFFIX) \ + $(QGEMMINCOPYOBJ) $(QGEMMITCOPYOBJ) \ + $(QGEMMONCOPYOBJ) $(QGEMMOTCOPYOBJ) + +CKERNELOBJS += \ + cgemm_kernel_n$(TSUFFIX).$(SUFFIX) cgemm_kernel_r$(TSUFFIX).$(SUFFIX) \ + cgemm_kernel_l$(TSUFFIX).$(SUFFIX) cgemm_kernel_b$(TSUFFIX).$(SUFFIX) \ + $(CGEMMINCOPYOBJ) $(CGEMMITCOPYOBJ) \ + $(CGEMMONCOPYOBJ) $(CGEMMOTCOPYOBJ) + +ZKERNELOBJS += \ + zgemm_kernel_n$(TSUFFIX).$(SUFFIX) zgemm_kernel_r$(TSUFFIX).$(SUFFIX) \ + zgemm_kernel_l$(TSUFFIX).$(SUFFIX) zgemm_kernel_b$(TSUFFIX).$(SUFFIX) \ + $(ZGEMMINCOPYOBJ) $(ZGEMMITCOPYOBJ) \ + $(ZGEMMONCOPYOBJ) $(ZGEMMOTCOPYOBJ) + +XKERNELOBJS += \ + xgemm_kernel_n$(TSUFFIX).$(SUFFIX) xgemm_kernel_r$(TSUFFIX).$(SUFFIX) \ + xgemm_kernel_l$(TSUFFIX).$(SUFFIX) xgemm_kernel_b$(TSUFFIX).$(SUFFIX) \ + $(XGEMMINCOPYOBJ) $(XGEMMITCOPYOBJ) \ + $(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ) + +SBLASOBJS += $(SKERNELOBJS) +DBLASOBJS += $(DKERNELOBJS) +QBLASOBJS += $(QKERNELOBJS) +CBLASOBJS += $(CKERNELOBJS) +ZBLASOBJS += $(ZKERNELOBJS) +XBLASOBJS += $(XKERNELOBJS) + +SBLASOBJS += \ + sgemm_beta$(TSUFFIX).$(SUFFIX) \ + strmm_kernel_LN$(TSUFFIX).$(SUFFIX) strmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + strmm_kernel_RN$(TSUFFIX).$(SUFFIX) strmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + strsm_kernel_LN$(TSUFFIX).$(SUFFIX) strsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + strsm_kernel_RN$(TSUFFIX).$(SUFFIX) strsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + +DBLASOBJS += \ + dgemm_beta$(TSUFFIX).$(SUFFIX) \ + dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + +QBLASOBJS += \ + qgemm_beta$(TSUFFIX).$(SUFFIX) \ + qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + qtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + qtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + +CBLASOBJS += \ + cgemm_beta$(TSUFFIX).$(SUFFIX) \ + ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \ + ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) \ + ctrsm_kernel_LN$(TSUFFIX).$(SUFFIX) ctrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + ctrsm_kernel_LR$(TSUFFIX).$(SUFFIX) ctrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \ + ctrsm_kernel_RN$(TSUFFIX).$(SUFFIX) ctrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + ctrsm_kernel_RR$(TSUFFIX).$(SUFFIX) ctrsm_kernel_RC$(TSUFFIX).$(SUFFIX) \ + +ZBLASOBJS += \ + zgemm_beta$(TSUFFIX).$(SUFFIX) \ + ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \ + ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) \ + ztrsm_kernel_LN$(TSUFFIX).$(SUFFIX) ztrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + ztrsm_kernel_LR$(TSUFFIX).$(SUFFIX) ztrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \ + ztrsm_kernel_RN$(TSUFFIX).$(SUFFIX) ztrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + ztrsm_kernel_RR$(TSUFFIX).$(SUFFIX) ztrsm_kernel_RC$(TSUFFIX).$(SUFFIX) \ + +XBLASOBJS += \ + xgemm_beta$(TSUFFIX).$(SUFFIX) \ + xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) xtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + xtrmm_kernel_LR$(TSUFFIX).$(SUFFIX) xtrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \ + xtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) xtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) \ + xtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) xtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + xtrsm_kernel_LR$(TSUFFIX).$(SUFFIX) xtrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \ + xtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) xtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + xtrsm_kernel_RR$(TSUFFIX).$(SUFFIX) xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) \ + +ifdef USE_GEMM3M + +CBLASOBJS += cgemm3m_kernel$(TSUFFIX).$(SUFFIX) +ZBLASOBJS += zgemm3m_kernel$(TSUFFIX).$(SUFFIX) +XBLASOBJS += xgemm3m_kernel$(TSUFFIX).$(SUFFIX) + +endif + +SBLASOBJS += \ + strmm_iunucopy$(TSUFFIX).$(SUFFIX) strmm_iunncopy$(TSUFFIX).$(SUFFIX) \ + strmm_ilnucopy$(TSUFFIX).$(SUFFIX) strmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + strmm_iutucopy$(TSUFFIX).$(SUFFIX) strmm_iutncopy$(TSUFFIX).$(SUFFIX) \ + strmm_iltucopy$(TSUFFIX).$(SUFFIX) strmm_iltncopy$(TSUFFIX).$(SUFFIX) \ + strmm_ounucopy$(TSUFFIX).$(SUFFIX) strmm_ounncopy$(TSUFFIX).$(SUFFIX) \ + strmm_olnucopy$(TSUFFIX).$(SUFFIX) strmm_olnncopy$(TSUFFIX).$(SUFFIX) \ + strmm_outucopy$(TSUFFIX).$(SUFFIX) strmm_outncopy$(TSUFFIX).$(SUFFIX) \ + strmm_oltucopy$(TSUFFIX).$(SUFFIX) strmm_oltncopy$(TSUFFIX).$(SUFFIX) \ + strsm_iunucopy$(TSUFFIX).$(SUFFIX) strsm_iunncopy$(TSUFFIX).$(SUFFIX) \ + strsm_ilnucopy$(TSUFFIX).$(SUFFIX) strsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + strsm_iutucopy$(TSUFFIX).$(SUFFIX) strsm_iutncopy$(TSUFFIX).$(SUFFIX) \ + strsm_iltucopy$(TSUFFIX).$(SUFFIX) strsm_iltncopy$(TSUFFIX).$(SUFFIX) \ + strsm_ounucopy$(TSUFFIX).$(SUFFIX) strsm_ounncopy$(TSUFFIX).$(SUFFIX) \ + strsm_olnucopy$(TSUFFIX).$(SUFFIX) strsm_olnncopy$(TSUFFIX).$(SUFFIX) \ + strsm_outucopy$(TSUFFIX).$(SUFFIX) strsm_outncopy$(TSUFFIX).$(SUFFIX) \ + strsm_oltucopy$(TSUFFIX).$(SUFFIX) strsm_oltncopy$(TSUFFIX).$(SUFFIX) \ + ssymm_iutcopy$(TSUFFIX).$(SUFFIX) ssymm_iltcopy$(TSUFFIX).$(SUFFIX) \ + ssymm_outcopy$(TSUFFIX).$(SUFFIX) ssymm_oltcopy$(TSUFFIX).$(SUFFIX) + +DBLASOBJS += \ + dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ + dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ + dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ + dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) dtrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ + dtrmm_olnucopy$(TSUFFIX).$(SUFFIX) dtrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ + dtrmm_outucopy$(TSUFFIX).$(SUFFIX) dtrmm_outncopy$(TSUFFIX).$(SUFFIX) \ + dtrmm_oltucopy$(TSUFFIX).$(SUFFIX) dtrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ + dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ + dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ + dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ + dtrsm_ounucopy$(TSUFFIX).$(SUFFIX) dtrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ + dtrsm_olnucopy$(TSUFFIX).$(SUFFIX) dtrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ + dtrsm_outucopy$(TSUFFIX).$(SUFFIX) dtrsm_outncopy$(TSUFFIX).$(SUFFIX) \ + dtrsm_oltucopy$(TSUFFIX).$(SUFFIX) dtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ + dsymm_iutcopy$(TSUFFIX).$(SUFFIX) dsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ + dsymm_outcopy$(TSUFFIX).$(SUFFIX) dsymm_oltcopy$(TSUFFIX).$(SUFFIX) + +QBLASOBJS += \ + qtrmm_iunucopy$(TSUFFIX).$(SUFFIX) qtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ + qtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) qtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + qtrmm_iutucopy$(TSUFFIX).$(SUFFIX) qtrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ + qtrmm_iltucopy$(TSUFFIX).$(SUFFIX) qtrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ + qtrmm_ounucopy$(TSUFFIX).$(SUFFIX) qtrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ + qtrmm_olnucopy$(TSUFFIX).$(SUFFIX) qtrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ + qtrmm_outucopy$(TSUFFIX).$(SUFFIX) qtrmm_outncopy$(TSUFFIX).$(SUFFIX) \ + qtrmm_oltucopy$(TSUFFIX).$(SUFFIX) qtrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ + qtrsm_iunucopy$(TSUFFIX).$(SUFFIX) qtrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ + qtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) qtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + qtrsm_iutucopy$(TSUFFIX).$(SUFFIX) qtrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ + qtrsm_iltucopy$(TSUFFIX).$(SUFFIX) qtrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ + qtrsm_ounucopy$(TSUFFIX).$(SUFFIX) qtrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ + qtrsm_olnucopy$(TSUFFIX).$(SUFFIX) qtrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ + qtrsm_outucopy$(TSUFFIX).$(SUFFIX) qtrsm_outncopy$(TSUFFIX).$(SUFFIX) \ + qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ + qsymm_iutcopy$(TSUFFIX).$(SUFFIX) qsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ + qsymm_outcopy$(TSUFFIX).$(SUFFIX) qsymm_oltcopy$(TSUFFIX).$(SUFFIX) \ + +CBLASOBJS += \ + ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ + ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ + ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ + ctrmm_ounucopy$(TSUFFIX).$(SUFFIX) ctrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ + ctrmm_olnucopy$(TSUFFIX).$(SUFFIX) ctrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ + ctrmm_outucopy$(TSUFFIX).$(SUFFIX) ctrmm_outncopy$(TSUFFIX).$(SUFFIX) \ + ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ + ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ + ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ + ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ + ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) ctrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ + ctrsm_olnucopy$(TSUFFIX).$(SUFFIX) ctrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ + ctrsm_outucopy$(TSUFFIX).$(SUFFIX) ctrsm_outncopy$(TSUFFIX).$(SUFFIX) \ + ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ + csymm_iutcopy$(TSUFFIX).$(SUFFIX) csymm_iltcopy$(TSUFFIX).$(SUFFIX) \ + csymm_outcopy$(TSUFFIX).$(SUFFIX) csymm_oltcopy$(TSUFFIX).$(SUFFIX) \ + chemm_iutcopy$(TSUFFIX).$(SUFFIX) chemm_iltcopy$(TSUFFIX).$(SUFFIX) \ + chemm_outcopy$(TSUFFIX).$(SUFFIX) chemm_oltcopy$(TSUFFIX).$(SUFFIX) + +ZBLASOBJS += \ + ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ + ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ + ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ + ztrmm_ounucopy$(TSUFFIX).$(SUFFIX) ztrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ + ztrmm_olnucopy$(TSUFFIX).$(SUFFIX) ztrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ + ztrmm_outucopy$(TSUFFIX).$(SUFFIX) ztrmm_outncopy$(TSUFFIX).$(SUFFIX) \ + ztrmm_oltucopy$(TSUFFIX).$(SUFFIX) ztrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ + ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ + ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ + ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ + ztrsm_ounucopy$(TSUFFIX).$(SUFFIX) ztrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ + ztrsm_olnucopy$(TSUFFIX).$(SUFFIX) ztrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ + ztrsm_outucopy$(TSUFFIX).$(SUFFIX) ztrsm_outncopy$(TSUFFIX).$(SUFFIX) \ + ztrsm_oltucopy$(TSUFFIX).$(SUFFIX) ztrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ + zsymm_iutcopy$(TSUFFIX).$(SUFFIX) zsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ + zsymm_outcopy$(TSUFFIX).$(SUFFIX) zsymm_oltcopy$(TSUFFIX).$(SUFFIX) \ + zhemm_iutcopy$(TSUFFIX).$(SUFFIX) zhemm_iltcopy$(TSUFFIX).$(SUFFIX) \ + zhemm_outcopy$(TSUFFIX).$(SUFFIX) zhemm_oltcopy$(TSUFFIX).$(SUFFIX) + +XBLASOBJS += \ + xtrmm_iunucopy$(TSUFFIX).$(SUFFIX) xtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ + xtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) xtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + xtrmm_iutucopy$(TSUFFIX).$(SUFFIX) xtrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ + xtrmm_iltucopy$(TSUFFIX).$(SUFFIX) xtrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ + xtrmm_ounucopy$(TSUFFIX).$(SUFFIX) xtrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ + xtrmm_olnucopy$(TSUFFIX).$(SUFFIX) xtrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ + xtrmm_outucopy$(TSUFFIX).$(SUFFIX) xtrmm_outncopy$(TSUFFIX).$(SUFFIX) \ + xtrmm_oltucopy$(TSUFFIX).$(SUFFIX) xtrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ + xtrsm_iunucopy$(TSUFFIX).$(SUFFIX) xtrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ + xtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) xtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + xtrsm_iutucopy$(TSUFFIX).$(SUFFIX) xtrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ + xtrsm_iltucopy$(TSUFFIX).$(SUFFIX) xtrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ + xtrsm_ounucopy$(TSUFFIX).$(SUFFIX) xtrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ + xtrsm_olnucopy$(TSUFFIX).$(SUFFIX) xtrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ + xtrsm_outucopy$(TSUFFIX).$(SUFFIX) xtrsm_outncopy$(TSUFFIX).$(SUFFIX) \ + xtrsm_oltucopy$(TSUFFIX).$(SUFFIX) xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ + xsymm_iutcopy$(TSUFFIX).$(SUFFIX) xsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ + xsymm_outcopy$(TSUFFIX).$(SUFFIX) xsymm_oltcopy$(TSUFFIX).$(SUFFIX) \ + xhemm_iutcopy$(TSUFFIX).$(SUFFIX) xhemm_iltcopy$(TSUFFIX).$(SUFFIX) \ + xhemm_outcopy$(TSUFFIX).$(SUFFIX) xhemm_oltcopy$(TSUFFIX).$(SUFFIX) + +ifdef USE_GEMM3M + +CBLASOBJS += \ + cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \ + cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \ + cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) \ + cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) \ + cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) \ + cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) \ + csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ + csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ + csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ + csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ + csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ + csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) \ + chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ + chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ + chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ + chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ + chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ + chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) + +ZBLASOBJS += \ + zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \ + zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \ + zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) \ + zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) \ + zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) \ + zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) \ + zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ + zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ + zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ + zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ + zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ + zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) \ + zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ + zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ + zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ + zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ + zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ + zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) + +XBLASOBJS += \ + xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \ + xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \ + xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) \ + xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) \ + xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) \ + xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) \ + xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ + xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ + xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ + xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ + xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ + xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) \ + xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ + xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ + xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ + xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ + xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ + xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) + +endif + +SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SGEMMITCOPYOBJ_P = $(SGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SGEMMONCOPYOBJ_P = $(SGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SGEMMOTCOPYOBJ_P = $(SGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +DGEMMINCOPYOBJ_P = $(DGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +DGEMMITCOPYOBJ_P = $(DGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +DGEMMONCOPYOBJ_P = $(DGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +DGEMMOTCOPYOBJ_P = $(DGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +QGEMMINCOPYOBJ_P = $(QGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +QGEMMITCOPYOBJ_P = $(QGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +QGEMMONCOPYOBJ_P = $(QGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +QGEMMOTCOPYOBJ_P = $(QGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +CGEMMINCOPYOBJ_P = $(CGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +CGEMMITCOPYOBJ_P = $(CGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +CGEMMONCOPYOBJ_P = $(CGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +CGEMMOTCOPYOBJ_P = $(CGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +ZGEMMINCOPYOBJ_P = $(ZGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +ZGEMMITCOPYOBJ_P = $(ZGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +ZGEMMONCOPYOBJ_P = $(ZGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +ZGEMMOTCOPYOBJ_P = $(ZGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +XGEMMINCOPYOBJ_P = $(XGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +XGEMMITCOPYOBJ_P = $(XGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) + +$(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_BETA) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)qgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMM_BETA) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)cgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_BETA) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@ + +$(KDIR)zgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@ + +$(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ + +$(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) + +$(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +endif + +$(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(DGEMMOTCOPYOBJ) : $(KERNELDIR)/$(DGEMMOTCOPY) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +$(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +endif + +ifdef EXPRECISION + +$(KDIR)$(QGEMMONCOPYOBJ) : $(KERNELDIR)/$(QGEMMONCOPY) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(QGEMMOTCOPYOBJ) : $(KERNELDIR)/$(QGEMMOTCOPY) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(QGEMM_UNROLL_M), $(QGEMM_UNROLL_N)) + +$(KDIR)$(QGEMMINCOPYOBJ) : $(KERNELDIR)/$(QGEMMINCOPY) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(QGEMMITCOPYOBJ) : $(KERNELDIR)/$(QGEMMITCOPY) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +endif + +endif + +$(KDIR)$(CGEMMONCOPYOBJ) : $(KERNELDIR)/$(CGEMMONCOPY) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(CGEMMOTCOPYOBJ) : $(KERNELDIR)/$(CGEMMOTCOPY) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) + +$(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +endif + +$(KDIR)$(ZGEMMONCOPYOBJ) : $(KERNELDIR)/$(ZGEMMONCOPY) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(ZGEMMOTCOPYOBJ) : $(KERNELDIR)/$(ZGEMMOTCOPY) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) + +$(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +endif + +ifdef EXPRECISION + +$(KDIR)$(XGEMMONCOPYOBJ) : $(KERNELDIR)/$(XGEMMONCOPY) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(XGEMMOTCOPYOBJ) : $(KERNELDIR)/$(XGEMMOTCOPY) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(XGEMM_UNROLL_M), $(XGEMM_UNROLL_N)) + +$(KDIR)$(XGEMMINCOPYOBJ) : $(KERNELDIR)/$(XGEMMINCOPY) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(XGEMMITCOPYOBJ) : $(KERNELDIR)/$(XGEMMITCOPY) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +endif + +endif + +$(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ + +$(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ + +$(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)xgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ + +$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)xtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)xtrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)xtrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)xtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)xtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)cgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)zgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM3MKERNEL) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)xgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM3MKERNEL) + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)strsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LN) $(STRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)strsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LT) $(STRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)strsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RN) $(STRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)strsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RT) $(STRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)dtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RT) $(DTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)qtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LN) $(QTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)qtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LT) $(QTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)qtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RN) $(QTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)qtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RT) $(QTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -DCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -DCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -DCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -DCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -DCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -DCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -DCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -DCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -DCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -DCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -DCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -DCONJ $< -o $@ + + +$(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ssymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)ssymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)dsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)qsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)qsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)qsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)qsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)csymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)csymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)zsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)zsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)xsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)xsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)xsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)xsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)chemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ + +$(KDIR)chemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ + +$(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ + +$(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ + +$(KDIR)zhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ + +$(KDIR)zhemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ + +$(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ + +$(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ + +$(KDIR)xhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ + +$(KDIR)xhemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ + +$(KDIR)xhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ + +$(KDIR)xhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ + +$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + + +$(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)qgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMM_BETA) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)cgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMM_BETA) + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@ + +$(KDIR)zgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@ + +$(KDIR)xgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ + +$(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(SGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMOTCOPY) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) + +$(SGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMINCOPY) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(SGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMITCOPY) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +endif + +$(DGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(DGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMOTCOPY) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +$(DGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMINCOPY) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(DGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMITCOPY) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +endif + +ifdef EXPRECISION + +$(QGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMONCOPY) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(QGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMOTCOPY) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(QGEMM_UNROLL_M), $(QGEMM_UNROLL_N)) + +$(QGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMINCOPY) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(QGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMITCOPY) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +endif + +endif + +$(CGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMONCOPY) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(CGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMOTCOPY) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) + +$(CGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMINCOPY) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(CGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMITCOPY) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +endif + +$(ZGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMONCOPY) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(ZGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMOTCOPY) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) + +$(ZGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMINCOPY) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(ZGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMITCOPY) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +endif + +ifdef EXPRECISION + +$(XGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMONCOPY) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(XGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMOTCOPY) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(XGEMM_UNROLL_M), $(XGEMM_UNROLL_N)) + +$(XGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMINCOPY) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(XGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMITCOPY) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +endif + +endif + +$(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)qgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)cgemm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)cgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ + +$(KDIR)zgemm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)zgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)zgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)zgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ + +$(KDIR)xgemm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)xgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)xgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)xgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ + +$(KDIR)strmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)strmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)xtrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)xtrmm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)xtrmm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)xtrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)xtrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)cgemm3m_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL) + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)zgemm3m_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMM3MKERNEL) + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)xgemm3m_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM3MKERNEL) + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)strsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LN) $(STRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)strsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LT) $(STRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)strsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RN) $(STRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)strsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RT) $(STRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)dtrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RT) $(DTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)qtrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LN) $(QTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)qtrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LT) $(QTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)qtrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RN) $(QTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)qtrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RT) $(QTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -DCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -DCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -DCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -DCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -DCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -DCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -DCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -DCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -DCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -DCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -DCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -DCONJ $< -o $@ + + +$(KDIR)strmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ssymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)ssymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)ssymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)ssymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)dsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)dsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)dsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)dsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)qsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)qsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)qsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)qsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)csymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)csymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)csymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)csymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)zsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)zsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)zsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)zsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)xsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)xsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)xsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)xsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)chemm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ + +$(KDIR)chemm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ + +$(KDIR)chemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ + +$(KDIR)chemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ + +$(KDIR)zhemm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ + +$(KDIR)zhemm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ + +$(KDIR)zhemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ + +$(KDIR)zhemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ + +$(KDIR)xhemm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ + +$(KDIR)xhemm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ + +$(KDIR)xhemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ + +$(KDIR)xhemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ + +$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)strsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ diff --git a/kernel/Makefile.LA b/kernel/Makefile.LA new file mode 100644 index 0000000..496d05c --- /dev/null +++ b/kernel/Makefile.LA @@ -0,0 +1,48 @@ +SBLASOBJS += sneg_tcopy$(TSUFFIX).$(SUFFIX) slaswp_ncopy$(TSUFFIX).$(SUFFIX) + +DBLASOBJS += dneg_tcopy$(TSUFFIX).$(SUFFIX) dlaswp_ncopy$(TSUFFIX).$(SUFFIX) + +QBLASOBJS += qneg_tcopy$(TSUFFIX).$(SUFFIX) qlaswp_ncopy$(TSUFFIX).$(SUFFIX) + +CBLASOBJS += cneg_tcopy$(TSUFFIX).$(SUFFIX) claswp_ncopy$(TSUFFIX).$(SUFFIX) + +ZBLASOBJS += zneg_tcopy$(TSUFFIX).$(SUFFIX) zlaswp_ncopy$(TSUFFIX).$(SUFFIX) + +XBLASOBJS += xneg_tcopy$(TSUFFIX).$(SUFFIX) xlaswp_ncopy$(TSUFFIX).$(SUFFIX) + +$(KDIR)sneg_tcopy$(TSUFFIX).$(SUFFIX) $(KDIR)sneg_tcopy$(TSUFFIX).$(PSUFFIX) : generic/neg_tcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $< -o $@ + +$(KDIR)dneg_tcopy$(TSUFFIX).$(SUFFIX) $(KDIR)dneg_tcopy$(TSUFFIX).$(PSUFFIX) : generic/neg_tcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $< -o $@ + +$(KDIR)qneg_tcopy$(TSUFFIX).$(SUFFIX) $(KDIR)qneg_tcopy$(TSUFFIX).$(PSUFFIX) : generic/neg_tcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $< -o $@ + +$(KDIR)cneg_tcopy$(TSUFFIX).$(SUFFIX) $(KDIR)cneg_tcopy$(TSUFFIX).$(PSUFFIX) : generic/zneg_tcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $< -o $@ + +$(KDIR)zneg_tcopy$(TSUFFIX).$(SUFFIX) $(KDIR)zneg_tcopy$(TSUFFIX).$(PSUFFIX) : generic/zneg_tcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $< -o $@ + +$(KDIR)xneg_tcopy$(TSUFFIX).$(SUFFIX) $(KDIR)xneg_tcopy$(TSUFFIX).$(PSUFFIX) : generic/zneg_tcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $< -o $@ + +$(KDIR)slaswp_ncopy$(TSUFFIX).$(SUFFIX) $(KDIR)slaswp_ncopy$(TSUFFIX).$(PSUFFIX) : generic/laswp_ncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $< -o $@ + +$(KDIR)dlaswp_ncopy$(TSUFFIX).$(SUFFIX) $(KDIR)dlaswp_ncopy$(TSUFFIX).$(PSUFFIX) : generic/laswp_ncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $< -o $@ + +$(KDIR)qlaswp_ncopy$(TSUFFIX).$(SUFFIX) $(KDIR)qlaswp_ncopy$(TSUFFIX).$(PSUFFIX) : generic/laswp_ncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $< -o $@ + +$(KDIR)claswp_ncopy$(TSUFFIX).$(SUFFIX) $(KDIR)claswp_ncopy$(TSUFFIX).$(PSUFFIX) : generic/zlaswp_ncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $< -o $@ + +$(KDIR)zlaswp_ncopy$(TSUFFIX).$(SUFFIX) $(KDIR)zlaswp_ncopy$(TSUFFIX).$(PSUFFIX) : generic/zlaswp_ncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $< -o $@ + +$(KDIR)xlaswp_ncopy$(TSUFFIX).$(SUFFIX) $(KDIR)xlaswp_ncopy$(TSUFFIX).$(PSUFFIX) : generic/zlaswp_ncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $< -o $@ + diff --git a/kernel/alpha/._KERNEL b/kernel/alpha/._KERNEL new file mode 100644 index 0000000..5bbee6c Binary files /dev/null and b/kernel/alpha/._KERNEL differ diff --git a/kernel/alpha/._Makefile b/kernel/alpha/._Makefile new file mode 100644 index 0000000..6e07b98 Binary files /dev/null and b/kernel/alpha/._Makefile differ diff --git a/kernel/alpha/._amax.S b/kernel/alpha/._amax.S new file mode 100644 index 0000000..90c4626 Binary files /dev/null and b/kernel/alpha/._amax.S differ diff --git a/kernel/alpha/._asum.S b/kernel/alpha/._asum.S new file mode 100644 index 0000000..58a4bae Binary files /dev/null and b/kernel/alpha/._asum.S differ diff --git a/kernel/alpha/._axpy.S b/kernel/alpha/._axpy.S new file mode 100644 index 0000000..06d3c1a Binary files /dev/null and b/kernel/alpha/._axpy.S differ diff --git a/kernel/alpha/._cabs.S b/kernel/alpha/._cabs.S new file mode 100644 index 0000000..431f29e Binary files /dev/null and b/kernel/alpha/._cabs.S differ diff --git a/kernel/alpha/._cnrm2.S b/kernel/alpha/._cnrm2.S new file mode 100644 index 0000000..3921dd5 Binary files /dev/null and b/kernel/alpha/._cnrm2.S differ diff --git a/kernel/alpha/._copy.S b/kernel/alpha/._copy.S new file mode 100644 index 0000000..04930c9 Binary files /dev/null and b/kernel/alpha/._copy.S differ diff --git a/kernel/alpha/._cscal.S b/kernel/alpha/._cscal.S new file mode 100644 index 0000000..bacdef4 Binary files /dev/null and b/kernel/alpha/._cscal.S differ diff --git a/kernel/alpha/._dnrm2.S b/kernel/alpha/._dnrm2.S new file mode 100644 index 0000000..27093c5 Binary files /dev/null and b/kernel/alpha/._dnrm2.S differ diff --git a/kernel/alpha/._dot.S b/kernel/alpha/._dot.S new file mode 100644 index 0000000..6614855 Binary files /dev/null and b/kernel/alpha/._dot.S differ diff --git a/kernel/alpha/._gemm_beta.S b/kernel/alpha/._gemm_beta.S new file mode 100644 index 0000000..941241f Binary files /dev/null and b/kernel/alpha/._gemm_beta.S differ diff --git a/kernel/alpha/._gemm_kernel_4x4.S b/kernel/alpha/._gemm_kernel_4x4.S new file mode 100644 index 0000000..c9f4580 Binary files /dev/null and b/kernel/alpha/._gemm_kernel_4x4.S differ diff --git a/kernel/alpha/._gemv_n.S b/kernel/alpha/._gemv_n.S new file mode 100644 index 0000000..8fd1905 Binary files /dev/null and b/kernel/alpha/._gemv_n.S differ diff --git a/kernel/alpha/._gemv_t.S b/kernel/alpha/._gemv_t.S new file mode 100644 index 0000000..6058420 Binary files /dev/null and b/kernel/alpha/._gemv_t.S differ diff --git a/kernel/alpha/._iamax.S b/kernel/alpha/._iamax.S new file mode 100644 index 0000000..41b2709 Binary files /dev/null and b/kernel/alpha/._iamax.S differ diff --git a/kernel/alpha/._imax.S b/kernel/alpha/._imax.S new file mode 100644 index 0000000..d9585ff Binary files /dev/null and b/kernel/alpha/._imax.S differ diff --git a/kernel/alpha/._izamax.S b/kernel/alpha/._izamax.S new file mode 100644 index 0000000..26f0557 Binary files /dev/null and b/kernel/alpha/._izamax.S differ diff --git a/kernel/alpha/._lsame.S b/kernel/alpha/._lsame.S new file mode 100644 index 0000000..6190631 Binary files /dev/null and b/kernel/alpha/._lsame.S differ diff --git a/kernel/alpha/._max.S b/kernel/alpha/._max.S new file mode 100644 index 0000000..760125f Binary files /dev/null and b/kernel/alpha/._max.S differ diff --git a/kernel/alpha/._rot.S b/kernel/alpha/._rot.S new file mode 100644 index 0000000..cf775a1 Binary files /dev/null and b/kernel/alpha/._rot.S differ diff --git a/kernel/alpha/._scal.S b/kernel/alpha/._scal.S new file mode 100644 index 0000000..7c67c2e Binary files /dev/null and b/kernel/alpha/._scal.S differ diff --git a/kernel/alpha/._snrm2.S b/kernel/alpha/._snrm2.S new file mode 100644 index 0000000..cda9f14 Binary files /dev/null and b/kernel/alpha/._snrm2.S differ diff --git a/kernel/alpha/._staticbuffer.S b/kernel/alpha/._staticbuffer.S new file mode 100644 index 0000000..4bd2e59 Binary files /dev/null and b/kernel/alpha/._staticbuffer.S differ diff --git a/kernel/alpha/._swap.S b/kernel/alpha/._swap.S new file mode 100644 index 0000000..34ee518 Binary files /dev/null and b/kernel/alpha/._swap.S differ diff --git a/kernel/alpha/._trsm_kernel_4x4_LN.S b/kernel/alpha/._trsm_kernel_4x4_LN.S new file mode 100644 index 0000000..ede769c Binary files /dev/null and b/kernel/alpha/._trsm_kernel_4x4_LN.S differ diff --git a/kernel/alpha/._trsm_kernel_4x4_LT.S b/kernel/alpha/._trsm_kernel_4x4_LT.S new file mode 100644 index 0000000..c5ecdfc Binary files /dev/null and b/kernel/alpha/._trsm_kernel_4x4_LT.S differ diff --git a/kernel/alpha/._trsm_kernel_4x4_RT.S b/kernel/alpha/._trsm_kernel_4x4_RT.S new file mode 100644 index 0000000..5e2c84f Binary files /dev/null and b/kernel/alpha/._trsm_kernel_4x4_RT.S differ diff --git a/kernel/alpha/._zamax.S b/kernel/alpha/._zamax.S new file mode 100644 index 0000000..c0d9aa3 Binary files /dev/null and b/kernel/alpha/._zamax.S differ diff --git a/kernel/alpha/._zasum.S b/kernel/alpha/._zasum.S new file mode 100644 index 0000000..2e7fecb Binary files /dev/null and b/kernel/alpha/._zasum.S differ diff --git a/kernel/alpha/._zaxpy.S b/kernel/alpha/._zaxpy.S new file mode 100644 index 0000000..e15d644 Binary files /dev/null and b/kernel/alpha/._zaxpy.S differ diff --git a/kernel/alpha/._zdot.S b/kernel/alpha/._zdot.S new file mode 100644 index 0000000..c4dc411 Binary files /dev/null and b/kernel/alpha/._zdot.S differ diff --git a/kernel/alpha/._zgemm_beta.S b/kernel/alpha/._zgemm_beta.S new file mode 100644 index 0000000..0709091 Binary files /dev/null and b/kernel/alpha/._zgemm_beta.S differ diff --git a/kernel/alpha/._zgemm_kernel_2x2.S b/kernel/alpha/._zgemm_kernel_2x2.S new file mode 100644 index 0000000..3eac92e Binary files /dev/null and b/kernel/alpha/._zgemm_kernel_2x2.S differ diff --git a/kernel/alpha/._zgemv_n.S b/kernel/alpha/._zgemv_n.S new file mode 100644 index 0000000..e69175e Binary files /dev/null and b/kernel/alpha/._zgemv_n.S differ diff --git a/kernel/alpha/._zgemv_t.S b/kernel/alpha/._zgemv_t.S new file mode 100644 index 0000000..ce328f6 Binary files /dev/null and b/kernel/alpha/._zgemv_t.S differ diff --git a/kernel/alpha/._znrm2.S b/kernel/alpha/._znrm2.S new file mode 100644 index 0000000..b283119 Binary files /dev/null and b/kernel/alpha/._znrm2.S differ diff --git a/kernel/alpha/._zrot.S b/kernel/alpha/._zrot.S new file mode 100644 index 0000000..979abcd Binary files /dev/null and b/kernel/alpha/._zrot.S differ diff --git a/kernel/alpha/._zscal.S b/kernel/alpha/._zscal.S new file mode 100644 index 0000000..5c61519 Binary files /dev/null and b/kernel/alpha/._zscal.S differ diff --git a/kernel/alpha/._zswap.S b/kernel/alpha/._zswap.S new file mode 100644 index 0000000..1a90614 Binary files /dev/null and b/kernel/alpha/._zswap.S differ diff --git a/kernel/alpha/._ztrsm_kernel_2x2_LN.S b/kernel/alpha/._ztrsm_kernel_2x2_LN.S new file mode 100644 index 0000000..98d37d2 Binary files /dev/null and b/kernel/alpha/._ztrsm_kernel_2x2_LN.S differ diff --git a/kernel/alpha/._ztrsm_kernel_2x2_LT.S b/kernel/alpha/._ztrsm_kernel_2x2_LT.S new file mode 100644 index 0000000..08b1d6c Binary files /dev/null and b/kernel/alpha/._ztrsm_kernel_2x2_LT.S differ diff --git a/kernel/alpha/._ztrsm_kernel_2x2_RT.S b/kernel/alpha/._ztrsm_kernel_2x2_RT.S new file mode 100644 index 0000000..eaebe33 Binary files /dev/null and b/kernel/alpha/._ztrsm_kernel_2x2_RT.S differ diff --git a/kernel/alpha/KERNEL b/kernel/alpha/KERNEL new file mode 100644 index 0000000..a39ccd5 --- /dev/null +++ b/kernel/alpha/KERNEL @@ -0,0 +1,124 @@ +ifndef SAMINKERNEL +SAMINKERNEL = amax.S +endif + +ifndef DAMINKERNEL +DAMINKERNEL = amax.S +endif + +ifndef CAMINKERNEL +CAMINKERNEL = zamax.S +endif + +ifndef ZAMINKERNEL +ZAMINKERNEL = zamax.S +endif + +ifndef SMINKERNEL +SMINKERNEL = max.S +endif + +ifndef DMINKERNEL +DMINKERNEL = max.S +endif + +ifndef ISAMINKERNEL +ISAMINKERNEL = iamax.S +endif + +ifndef IDAMINKERNEL +IDAMINKERNEL = iamax.S +endif + +ifndef ICAMINKERNEL +ICAMINKERNEL = izamax.S +endif + +ifndef IZAMINKERNEL +IZAMINKERNEL = izamax.S +endif + +ifndef ISMINKERNEL +ISMINKERNEL = iamax.S +endif + +ifndef IDMINKERNEL +IDMINKERNEL = iamax.S +endif + +ifndef CCOPYKERNEL +CCOPYKERNEL = copy.S +endif + +ifndef ZCOPYKERNEL +ZCOPYKERNEL = copy.S +endif + +ifndef SNRM2KERNEL +SNRM2KERNEL = snrm2.S +endif + +ifndef DNRM2KERNEL +DNRM2KERNEL = dnrm2.S +endif + +ifndef CNRM2KERNEL +CNRM2KERNEL = cnrm2.S +endif + +ifndef ZNRM2KERNEL +ZNRM2KERNEL = znrm2.S +endif + +SGEMMKERNEL = gemm_kernel_4x4.S +SGEMM_BETA = gemm_beta.S +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMONCOPYOBJ = sgemm_oncopy.$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy.$(SUFFIX) + +DGEMMKERNEL = gemm_kernel_4x4.S +DGEMM_BETA = gemm_beta.S +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMONCOPYOBJ = dgemm_oncopy.$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy.$(SUFFIX) + +CGEMMKERNEL = zgemm_kernel_2x2.S +CGEMM_BETA = zgemm_beta.S +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy.$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_2x2.S +ZGEMM_BETA = zgemm_beta.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy.$(SUFFIX) + +SGEMM_BETA = gemm_beta.S +DGEMM_BETA = gemm_beta.S +CGEMM_BETA = zgemm_beta.S +ZGEMM_BETA = zgemm_beta.S + +STRSMKERNEL_LN = trsm_kernel_4x4_LN.S +STRSMKERNEL_LT = trsm_kernel_4x4_LT.S +STRSMKERNEL_RN = trsm_kernel_4x4_LT.S +STRSMKERNEL_RT = trsm_kernel_4x4_RT.S + +DTRSMKERNEL_LN = trsm_kernel_4x4_LN.S +DTRSMKERNEL_LT = trsm_kernel_4x4_LT.S +DTRSMKERNEL_RN = trsm_kernel_4x4_LT.S +DTRSMKERNEL_RT = trsm_kernel_4x4_RT.S + +CTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S +CTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S +CTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S +CTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S + +ZTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S +ZTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S +ZTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S +ZTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S diff --git a/kernel/alpha/Makefile b/kernel/alpha/Makefile new file mode 100644 index 0000000..efae70d --- /dev/null +++ b/kernel/alpha/Makefile @@ -0,0 +1,2 @@ +clean :: + diff --git a/kernel/alpha/amax.S b/kernel/alpha/amax.S new file mode 100644 index 0000000..e528adc --- /dev/null +++ b/kernel/alpha/amax.S @@ -0,0 +1,283 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 + +#ifndef USE_MIN +#define CMPLT(a, b) cmptlt a, b +#else +#define CMPLT(a, b) cmptlt b, a +#endif + +#define STACKSIZE 6 * 8 + + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + + lda $sp, -STACKSIZE($sp) + nop + .align 4 + + stt $f2, 0($sp) + fclr $f16 + cmplt $31, N, $2 + unop + + stt $f3, 8($sp) + fclr $f17 + cmplt $31, INCX, $3 + unop + + stt $f4, 16($sp) + fclr $f18 + SXADDQ INCX, $31, INCX + unop + + stt $f5, 24($sp) + fclr $f19 + and $2, $3, $0 + unop + + stt $f6, 32($sp) + fclr $f0 + sra N, 3, $1 + beq $0, $End # if (n <= 0) or (incx <= 0) return + .align 4 + + LD $f20, 0 * SIZE(X) + unop + fabs $f20, $f0 + ble $1, $L15 + .align 4 + + fabs $f20, $f1 + unop + addq X, INCX, X + unop + + LD $f21, 0 * SIZE(X) + fabs $f20, $f2 + addq X, INCX, X + unop + + LD $f22, 0 * SIZE(X) + fabs $f20, $f3 + addq X, INCX, X + unop + + LD $f23, 0 * SIZE(X) + fabs $f20, $f4 + addq X, INCX, X + unop + + LD $f24, 0 * SIZE(X) + addq X, INCX, X + fabs $f20, $f5 + unop + + LD $f25, 0 * SIZE(X) + fabs $f20, $f6 + addq X, INCX, X + unop + + LD $f26, 0 * SIZE(X) + fabs $f20, $f28 + addq X, INCX, X + lda $1, -1($1) + + LD $f27, 0 * SIZE(X) + unop + addq X, INCX, X + ble $1, $L13 + .align 4 + +$L12: + fcmovne $f16, $f12, $f4 + unop + fabs $f20, $f29 + ldl $31, 56 * SIZE(X) + + fcmovne $f17, $f13, $f5 + LD $f20, 0 * SIZE(X) + fabs $f21, $f30 + addq X, INCX, X + + fcmovne $f18, $f14, $f6 + LD $f21, 0 * SIZE(X) + fabs $f22, $f10 + addq X, INCX, X + + fcmovne $f19, $f15, $f28 + LD $f22, 0 * SIZE(X) + fabs $f23, $f11 + addq X, INCX, X + + fabs $f24, $f12 + LD $f23, 0 * SIZE(X) + CMPLT($f0, $f29), $f16 + addq X, INCX, X + + fabs $f25, $f13 + LD $f24, 0 * SIZE(X) + CMPLT($f1, $f30), $f17 + addq X, INCX, X + + fabs $f26, $f14 + LD $f25, 0 * SIZE(X) + CMPLT($f2, $f10), $f18 + addq X, INCX, X + + fabs $f27, $f15 + LD $f26, 0 * SIZE(X) + CMPLT($f3, $f11), $f19 + addq X, INCX, X + + fcmovne $f16, $f29, $f0 + LD $f27, 0 * SIZE(X) + CMPLT($f4, $f12), $f16 + addq X, INCX, X + + fcmovne $f17, $f30, $f1 + unop + CMPLT($f5, $f13), $f17 + lda $1, -1($1) # i -- + + fcmovne $f18, $f10, $f2 + unop + CMPLT($f6, $f14), $f18 + unop + + fcmovne $f19, $f11, $f3 + unop + CMPLT($f28, $f15), $f19 + bgt $1,$L12 + .align 4 + +$L13: + fcmovne $f16, $f12, $f4 + fabs $f20, $f29 + fcmovne $f17, $f13, $f5 + fabs $f21, $f30 + + fcmovne $f18, $f14, $f6 + fabs $f22, $f10 + fcmovne $f19, $f15, $f28 + fabs $f23, $f11 + + fabs $f24, $f12 + CMPLT($f0, $f29), $f16 + fabs $f25, $f13 + CMPLT($f1, $f30), $f17 + + fabs $f26, $f14 + CMPLT($f2, $f10), $f18 + fabs $f27, $f15 + CMPLT($f3, $f11), $f19 + + fcmovne $f16, $f29, $f0 + CMPLT($f4, $f12), $f16 + fcmovne $f17, $f30, $f1 + CMPLT($f5, $f13), $f17 + + fcmovne $f18, $f10, $f2 + CMPLT($f6, $f14), $f18 + fcmovne $f19, $f11, $f3 + CMPLT($f28, $f15), $f19 + + fcmovne $f16, $f12, $f4 + CMPLT($f0, $f1), $f16 + fcmovne $f17, $f13, $f5 + CMPLT($f2, $f3), $f17 + + fcmovne $f18, $f14, $f6 + CMPLT($f4, $f5), $f18 + fcmovne $f19, $f15, $f28 + CMPLT($f6, $f28), $f19 + + fcmovne $f16, $f1, $f0 + fcmovne $f17, $f3, $f2 + fcmovne $f18, $f5, $f4 + fcmovne $f19, $f28, $f6 + + CMPLT($f0, $f2), $f16 + CMPLT($f4, $f6), $f17 + + fcmovne $f16, $f2, $f0 + fcmovne $f17, $f6, $f4 + + CMPLT($f0, $f4), $f16 + fcmovne $f16, $f4, $f0 + .align 4 + +$L15: + and N, 7, $1 + unop + unop + ble $1, $End + .align 4 + +$L16: + LD $f20, 0 * SIZE(X) + addq X, INCX, X + + fabs $f20, $f29 + CMPLT($f0, $f29), $f16 + fcmovne $f16, $f29, $f0 + + lda $1, -1($1) # i -- + bgt $1, $L16 + .align 4 + +$End: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + + ldt $f6, 32($sp) + lda $sp, STACKSIZE($sp) + ret + + EPILOGUE diff --git a/kernel/alpha/asum.S b/kernel/alpha/asum.S new file mode 100644 index 0000000..b312d06 --- /dev/null +++ b/kernel/alpha/asum.S @@ -0,0 +1,206 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 +#define INCX $18 +#define I $19 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define a0 $f12 +#define a1 $f13 +#define a2 $f14 +#define a3 $f15 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 +#define a7 $f19 + +#define t0 $f20 +#define t1 $f21 +#define t2 $f22 +#define t3 $f23 + + PROLOGUE + PROFCODE + + fclr s0 + unop + fclr t0 + ble N, $L999 + + sra N, 3, I + fclr s1 + fclr s2 + ble I, $L15 + + LD a0, 0 * SIZE(X) + fclr t1 + SXADDQ INCX, X, X + fclr t2 + + LD a1, 0 * SIZE(X) + fclr t3 + SXADDQ INCX, X, X + fclr s3 + + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + + LD a4, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a5, 0 * SIZE(X) + SXADDQ INCX, X, X + + lda I, -1(I) + ble I, $L13 + .align 4 + +$L12: + ADD s0, t0, s0 + ldl $31, PREFETCHSIZE * 2 * SIZE(X) + fabs a0, t0 + lda I, -1(I) + + ADD s1, t1, s1 + LD a6, 0 * SIZE(X) + fabs a1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + LD a7, 0 * SIZE(X) + fabs a2, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s3 + LD a0, 0 * SIZE(X) + fabs a3, t3 + SXADDQ INCX, X, X + + ADD s0, t0, s0 + LD a1, 0 * SIZE(X) + fabs a4, t0 + SXADDQ INCX, X, X + + ADD s1, t1, s1 + LD a2, 0 * SIZE(X) + fabs a5, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + LD a3, 0 * SIZE(X) + fabs a6, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s3 + LD a4, 0 * SIZE(X) + fabs a7, t3 + SXADDQ INCX, X, X + + LD a5, 0 * SIZE(X) + unop + SXADDQ INCX, X, X + bne I, $L12 + .align 4 + +$L13: + ADD s0, t0, s0 + LD a6, 0 * SIZE(X) + fabs a0, t0 + SXADDQ INCX, X, X + + ADD s1, t1, s1 + LD a7, 0 * SIZE(X) + fabs a1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + fabs a2, t2 + ADD s3, t3, s3 + fabs a3, t3 + + ADD s0, t0, s0 + fabs a4, t0 + ADD s1, t1, s1 + fabs a5, t1 + ADD s2, t2, s2 + fabs a6, t2 + ADD s3, t3, s3 + fabs a7, t3 + + ADD s1, t1, s1 + ADD s2, t2, s2 + ADD s3, t3, s3 + + ADD s0, s1, s0 + ADD s2, s3, s2 + .align 4 + +$L15: + and N, 7, I + ADD s0, s2, s0 + unop + ble I, $L999 + .align 4 + +$L17: + ADD s0, t0, s0 + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + fabs a0, t0 + + lda I, -1(I) + bne I, $L17 + .align 4 + +$L999: + ADD s0, t0, s0 + ret + EPILOGUE diff --git a/kernel/alpha/axpy.S b/kernel/alpha/axpy.S new file mode 100644 index 0000000..1007b06 --- /dev/null +++ b/kernel/alpha/axpy.S @@ -0,0 +1,428 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 40 + + PROLOGUE + PROFCODE + .frame $sp, 16, $26, 0 + + ldq $24, 0($sp) + fmov $f19, $f30 + ldl $23, 8($sp) + lda $sp, -16($sp) +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + nop + sra $16, 3, $1 + stt $f2, 0($sp) + cmpeq $21, 1, $3 + + stt $f3, 8($sp) + cmpeq $23, 1, $4 + and $16, 7, $2 + ble $16, $End + + and $3, $4, $3 + fbeq $f30, $End + + beq $3, $Sub + ble $1, $Remain + .align 4 + + LD $f10, 0*SIZE($20) + LD $f11, 1*SIZE($20) + LD $f12, 2*SIZE($20) + LD $f13, 3*SIZE($20) + + LD $f18, 0*SIZE($24) + LD $f19, 1*SIZE($24) + LD $f20, 2*SIZE($24) + LD $f21, 3*SIZE($24) + + LD $f14, 4*SIZE($20) + LD $f15, 5*SIZE($20) + LD $f16, 6*SIZE($20) + LD $f17, 7*SIZE($20) + + LD $f22, 4*SIZE($24) + LD $f23, 5*SIZE($24) + LD $f24, 6*SIZE($24) + LD $f25, 7*SIZE($24) + + subq $1, 1, $1 + addq $20, 8*SIZE, $20 + unop + ble $1, $LoopEnd + .align 4 + +$Loop: + ldt $f31, PREFETCHSIZE * SIZE($24) + ldl $31, PREFETCHSIZE * SIZE($20) + + MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 + LD $f10, 0*SIZE($20) + MUL $f30, $f11, $f27 + LD $f11, 1*SIZE($20) + + MUL $f30, $f12, $f28 + LD $f12, 2*SIZE($20) + MUL $f30, $f13, $f29 + LD $f13, 3*SIZE($20) + + ADD $f18, $f26, $f0 + LD $f18, 8*SIZE($24) + MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 + LD $f14, 4*SIZE($20) + + ADD $f19, $f27, $f1 + LD $f19, 9*SIZE($24) + MUL $f30, $f15, $f27 + LD $f15, 5*SIZE($20) + + ADD $f20, $f28, $f2 + LD $f20, 10*SIZE($24) + MUL $f30, $f16, $f28 + LD $f16, 6*SIZE($20) + + ADD $f21, $f29, $f3 + LD $f21, 11*SIZE($24) + MUL $f30, $f17, $f29 + LD $f17, 7*SIZE($20) + + ST $f0, 0*SIZE($24) + ADD $f22, $f26, $f0 + ST $f1, 1*SIZE($24) + ADD $f23, $f27, $f1 + + ST $f2, 2*SIZE($24) + ADD $f24, $f28, $f2 + ST $f3, 3*SIZE($24) + ADD $f25, $f29, $f3 + + LD $f22, 12*SIZE($24) + LD $f23, 13*SIZE($24) + LD $f24, 14*SIZE($24) + LD $f25, 15*SIZE($24) + + ST $f0, 4*SIZE($24) + ST $f1, 5*SIZE($24) + ST $f2, 6*SIZE($24) + ST $f3, 7*SIZE($24) + + subq $1, 1, $1 + addq $24, 8*SIZE, $24 + addq $20, 8*SIZE, $20 + bgt $1, $Loop + .align 4 + +$LoopEnd: + MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 + MUL $f30, $f11, $f27 + MUL $f30, $f12, $f28 + MUL $f30, $f13, $f29 + + ADD $f18, $f26, $f0 + MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 + ADD $f19, $f27, $f1 + MUL $f30, $f15, $f27 + + ADD $f20, $f28, $f2 + MUL $f30, $f16, $f28 + ADD $f21, $f29, $f3 + MUL $f30, $f17, $f29 + + ST $f0, 0*SIZE($24) + ADD $f22, $f26, $f0 + ST $f1, 1*SIZE($24) + ADD $f23, $f27, $f1 + + ST $f2, 2*SIZE($24) + ADD $f24, $f28, $f2 + ST $f3, 3*SIZE($24) + ADD $f25, $f29, $f3 + + ST $f0, 4*SIZE($24) + ST $f1, 5*SIZE($24) + ST $f2, 6*SIZE($24) + ST $f3, 7*SIZE($24) + addq $24, 8*SIZE, $24 + .align 4 + +$Remain: + ble $2, $End + .align 4 + +$RemainLoop: + LD $f10, 0*SIZE($20) + LD $f11, 0*SIZE($24) + addq $20, SIZE, $20 + addq $24, SIZE, $24 + + MUL $f30, $f10, $f12 + subq $2, 1, $2 + ADD $f11, $f12, $f13 + ST $f13, -1*SIZE($24) + bgt $2, $RemainLoop + .align 4 + +$End: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + lda $sp, 16($sp) + ret + .align 4 + +$Sub: + SXSUBL $16, SIZE, $22 + subq $1, 1, $4 + ble $1, $SubRemain + .align 4 + + LD $f10, 0($20) + SXADDQ $21, $20, $20 + + LD $f11, 0($20) + SXADDQ $21, $20, $20 + LD $f12, 0($20) + SXADDQ $21, $20, $20 + + LD $f13, 0($20) + SXADDQ $21, $20, $20 + LD $f18, 0($24) + SXADDQ $23, $24, $22 + + LD $f19, 0($22) + SXADDQ $23, $22, $22 + LD $f20, 0($22) + SXADDQ $23, $22, $22 + + LD $f21, 0($22) + SXADDQ $23, $22, $22 + LD $f14, 0($20) + SXADDQ $21, $20, $20 + + LD $f15, 0($20) + SXADDQ $21, $20, $20 + LD $f16, 0($20) + SXADDQ $21, $20, $20 + + LD $f17, 0($20) + SXADDQ $21, $20, $20 + LD $f22, 0($22) + SXADDQ $23, $22, $22 + + LD $f23, 0($22) + SXADDQ $23, $22, $22 + LD $f24, 0($22) + SXADDQ $23, $22, $22 + + LD $f25, 0($22) + SXADDQ $23, $22, $22 + unop + ble $4, $SubLoopEnd + .align 4 + +$SubLoop: + MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 + LD $f10, 0($20) + unop + SXADDQ $21, $20, $20 + + MUL $f30, $f11, $f27 + LD $f11, 0($20) + unop + SXADDQ $21, $20, $20 + + MUL $f30, $f12, $f28 + LD $f12, 0($20) + unop + SXADDQ $21, $20, $20 + + MUL $f30, $f13, $f29 + LD $f13, 0($20) + unop + SXADDQ $21, $20, $20 + + ADD $f18, $f26, $f0 + MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 + LD $f14, 0($20) + SXADDQ $21, $20, $20 + + ADD $f19, $f27, $f1 + MUL $f30, $f15, $f27 + LD $f15, 0($20) + SXADDQ $21, $20, $20 + + ADD $f20, $f28, $f2 + MUL $f30, $f16, $f28 + LD $f16, 0($20) + SXADDQ $21, $20, $20 + + ADD $f21, $f29, $f3 + MUL $f30, $f17, $f29 + LD $f17, 0($20) + SXADDQ $21, $20, $20 + + ST $f0, 0($24) + SXADDQ $23, $24, $24 + ADD $f22, $f26, $f0 + unop + + ST $f1, 0($24) + SXADDQ $23, $24, $24 + ADD $f23, $f27, $f1 + unop + + ST $f2, 0($24) + SXADDQ $23, $24, $24 + ADD $f24, $f28, $f2 + unop + + ST $f3, 0($24) + SXADDQ $23, $24, $24 + ADD $f25, $f29, $f3 + unop + + LD $f18, 0($22) + SXADDQ $23, $22, $22 + LD $f19, 0($22) + SXADDQ $23, $22, $22 + + LD $f20, 0($22) + SXADDQ $23, $22, $22 + LD $f21, 0($22) + SXADDQ $23, $22, $22 + + LD $f22, 0($22) + SXADDQ $23, $22, $22 + LD $f23, 0($22) + SXADDQ $23, $22, $22 + + LD $f24, 0($22) + SXADDQ $23, $22, $22 + LD $f25, 0($22) + SXADDQ $23, $22, $22 + + ST $f0, 0($24) + SXADDQ $23, $24, $24 + ST $f1, 0($24) + SXADDQ $23, $24, $24 + ST $f2, 0($24) + SXADDQ $23, $24, $24 + ST $f3, 0($24) + SXADDQ $23, $24, $24 + + subq $4, 1, $4 + bgt $4, $SubLoop + .align 4 + +$SubLoopEnd: + MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 + MUL $f30, $f11, $f27 + MUL $f30, $f12, $f28 + MUL $f30, $f13, $f29 + + ADD $f18, $f26, $f0 + MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 + ADD $f19, $f27, $f1 + MUL $f30, $f15, $f27 + + ADD $f20, $f28, $f2 + MUL $f30, $f16, $f28 + ADD $f21, $f29, $f3 + MUL $f30, $f17, $f29 + + ST $f0, 0($24) + SXADDQ $23, $24, $24 + ST $f1, 0($24) + SXADDQ $23, $24, $24 + + ST $f2, 0($24) + SXADDQ $23, $24, $24 + ST $f3, 0($24) + SXADDQ $23, $24, $24 + + ADD $f22, $f26, $f0 + ADD $f23, $f27, $f1 + ADD $f24, $f28, $f2 + ADD $f25, $f29, $f3 + + ST $f0, 0($24) + SXADDQ $23, $24, $24 + ST $f1, 0($24) + SXADDQ $23, $24, $24 + + ST $f2, 0($24) + SXADDQ $23, $24, $24 + ST $f3, 0($24) + SXADDQ $23, $24, $24 + .align 4 + +$SubRemain: + ble $2, $SubEnd + .align 4 + +$SubRemainLoop: + LD $f10, 0($20) + LD $f11, 0($24) + SXADDQ $21, $20, $20 + + MUL $f30, $f10, $f12 + subq $2, 1, $2 + ADD $f11, $f12, $f13 + ST $f13, 0($24) + SXADDQ $23, $24, $24 + + bgt $2, $SubRemainLoop + .align 4 + +$SubEnd: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + lda $sp, 16($sp) + ret + EPILOGUE diff --git a/kernel/alpha/cabs.S b/kernel/alpha/cabs.S new file mode 100644 index 0000000..5fa27af --- /dev/null +++ b/kernel/alpha/cabs.S @@ -0,0 +1,71 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + + .set noat + .set noreorder +.text + .align 5 + .globl NAME + .ent NAME +NAME: + .frame $sp, 0, $26, 0 + +#ifdef PROFILE + ldgp $gp, 0($27) + lda $28, _mcount + jsr $28, ($28), _mcount +#endif + + LD $f10, 0($16) + LD $f11, SIZE($16) +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + fabs $f10, $f12 + fabs $f11, $f0 + ADD $f12, $f0, $f0 + ret + .end NAME + .ident VERSION diff --git a/kernel/alpha/cnrm2.S b/kernel/alpha/cnrm2.S new file mode 100644 index 0000000..03343b2 --- /dev/null +++ b/kernel/alpha/cnrm2.S @@ -0,0 +1,426 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#include "version.h" + +#define PREFETCH_SIZE 80 + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#define I $0 + +#define a0 $f0 +#define a1 $f1 +#define a2 $f10 +#define a3 $f11 +#define t0 $f12 +#define t1 $f13 +#define t2 $f14 +#define t3 $f15 + +#define x0 $f16 +#define x1 $f17 +#define x2 $f18 +#define x3 $f19 +#define x4 $f20 +#define x5 $f21 +#define x6 $f22 +#define x7 $f23 + + PROLOGUE + +#if defined(EV4) || defined(EV5) + .frame $30,16,$26,0 + .mask 0x4000000,-16 + ldah $29, 0($27) !gpdisp!1 + lda $29, 0($29) !gpdisp!1 + + lda $sp, -16($sp) + ldq $27, sqrt($29) !literal!2 + stq $26, 0($sp) + + PROFCODE + .prologue 1 +#else + PROFCODE +#endif + + fclr a0 + sll INCX, ZBASE_SHIFT, INCX + fclr a1 + ble N, $L999 + + fclr a2 + cmpeq INCX, 2 * SIZE, $0 + fclr a3 + beq $0, $L20 + + fclr t0 + sra N, 3, I + fclr t1 + ble I, $L15 + + fclr t2 + LD x0, 0 * SIZE(X) + fclr t3 + LD x1, 1 * SIZE(X) + + LD x2, 2 * SIZE(X) + LD x3, 3 * SIZE(X) + LD x4, 4 * SIZE(X) + LD x5, 5 * SIZE(X) + LD x6, 6 * SIZE(X) + LD x7, 7 * SIZE(X) + + lda I, -1(I) + ble I, $L12 + .align 4 + +$L11: + addt a0, t0, a0 + ldl $31, (PREFETCH_SIZE) * SIZE(X) + mult x0, x0, t0 + LD x0, 8 * SIZE(X) + + addt a1, t1, a1 + mov X, XX + mult x1, x1, t1 + LD x1, 9 * SIZE(X) + + addt a2, t2, a2 + unop + mult x2, x2, t2 + LD x2, 10 * SIZE(X) + + addt a3, t3, a3 + unop + mult x3, x3, t3 + LD x3, 11 * SIZE(X) + + addt a0, t0, a0 + unop + mult x4, x4, t0 + LD x4, 12 * SIZE(X) + + addt a1, t1, a1 + unop + mult x5, x5, t1 + LD x5, 13 * SIZE(X) + + addt a2, t2, a2 + unop + mult x6, x6, t2 + LD x6, 14 * SIZE(X) + + addt a3, t3, a3 + unop + mult x7, x7, t3 + LD x7, 15 * SIZE(X) + + addt a0, t0, a0 + unop + mult x0, x0, t0 + LD x0, 16 * SIZE(X) + + addt a1, t1, a1 + lda X, 16 * SIZE(X) + mult x1, x1, t1 + LD x1, 17 * SIZE(XX) + + addt a2, t2, a2 + unop + mult x2, x2, t2 + LD x2, 18 * SIZE(XX) + + addt a3, t3, a3 + unop + mult x3, x3, t3 + LD x3, 19 * SIZE(XX) + + addt a0, t0, a0 + unop + mult x4, x4, t0 + LD x4, 20 * SIZE(XX) + + addt a1, t1, a1 + lda I, -1(I) + mult x5, x5, t1 + LD x5, 21 * SIZE(XX) + + addt a2, t2, a2 + unop + mult x6, x6, t2 + LD x6, 22 * SIZE(XX) + + addt a3, t3, a3 + mult x7, x7, t3 + LD x7, 23 * SIZE(XX) + bgt I, $L11 + .align 4 + +$L12: + addt a0, t0, a0 + mov X, XX + mult x0, x0, t0 + LD x0, 8 * SIZE(X) + + addt a1, t1, a1 + unop + mult x1, x1, t1 + LD x1, 9 * SIZE(X) + + addt a2, t2, a2 + unop + mult x2, x2, t2 + LD x2, 10 * SIZE(X) + + addt a3, t3, a3 + unop + mult x3, x3, t3 + LD x3, 11 * SIZE(X) + + addt a0, t0, a0 + unop + mult x4, x4, t0 + LD x4, 12 * SIZE(XX) + + addt a1, t1, a1 + unop + mult x5, x5, t1 + LD x5, 13 * SIZE(XX) + + addt a2, t2, a2 + unop + mult x6, x6, t2 + LD x6, 14 * SIZE(XX) + + addt a3, t3, a3 + lda X, 16 * SIZE(X) + mult x7, x7, t3 + LD x7, 15 * SIZE(XX) + + addt a0, t0, a0 + mult x0, x0, t0 + addt a1, t1, a1 + mult x1, x1, t1 + + addt a2, t2, a2 + mult x2, x2, t2 + addt a3, t3, a3 + mult x3, x3, t3 + + addt a0, t0, a0 + mult x4, x4, t0 + addt a1, t1, a1 + mult x5, x5, t1 + + addt a2, t2, a2 + mult x6, x6, t2 + addt a3, t3, a3 + mult x7, x7, t3 + + addt a2, t2, a2 + addt a3, t3, a3 + .align 4 + +$L15: + and N, 7, I + ble I, $L998 + .align 4 + +$L16: + LD x0, 0 * SIZE(X) + LD x1, 1 * SIZE(X) + + lda X, 2 * SIZE(X) + + addt a0, t0, a0 + mult x0, x0, t0 + addt a1, t1, a1 + mult x1, x1, t1 + + lda I, -1(I) + bgt I, $L16 + bsr $31, $L998 + .align 4 + +$L20: + fclr t0 + sra N, 2, I + fclr t1 + ble I, $L25 + + LD x0, 0 * SIZE(X) + fclr t2 + LD x1, 1 * SIZE(X) + addq X, INCX, X + LD x2, 0 * SIZE(X) + fclr t3 + LD x3, 1 * SIZE(X) + addq X, INCX, X + + LD x4, 0 * SIZE(X) + lda I, -1(I) + LD x5, 1 * SIZE(X) + addq X, INCX, X + + LD x6, 0 * SIZE(X) + ble I, $L22 + .align 4 + +$L21: + addt a0, t0, a0 + LD x7, 1 * SIZE(X) + mult x0, x0, t0 + addq X, INCX, X + + addt a1, t1, a1 + LD x0, 0 * SIZE(X) + mult x1, x1, t1 + unop + + addt a2, t2, a2 + LD x1, 1 * SIZE(X) + mult x2, x2, t2 + addq X, INCX, X + + addt a3, t3, a3 + LD x2, 0 * SIZE(X) + mult x3, x3, t3 + unop + + addt a0, t0, a0 + LD x3, 1 * SIZE(X) + mult x4, x4, t0 + addq X, INCX, X + + addt a1, t1, a1 + LD x4, 0 * SIZE(X) + mult x5, x5, t1 + lda I, -1(I) + + addt a2, t2, a2 + LD x5, 1 * SIZE(X) + mult x6, x6, t2 + addq X, INCX, X + + addt a3, t3, a3 + LD x6, 0 * SIZE(X) + mult x7, x7, t3 + bgt I, $L21 + .align 4 + +$L22: + addt a0, t0, a0 + LD x7, 1 * SIZE(X) + mult x0, x0, t0 + addq X, INCX, X + + addt a1, t1, a1 + mult x1, x1, t1 + addt a2, t2, a2 + mult x2, x2, t2 + + addt a3, t3, a3 + mult x3, x3, t3 + addt a0, t0, a0 + mult x4, x4, t0 + + addt a1, t1, a1 + mult x5, x5, t1 + addt a2, t2, a2 + mult x6, x6, t2 + + addt a3, t3, a3 + mult x7, x7, t3 + addt a2, t2, a2 + addt a3, t3, a3 + .align 4 + +$L25: + and N, 3, I + ble I, $L998 + .align 4 + +$L26: + LD x0, 0 * SIZE(X) + lda I, -1(I) + LD x1, 1 * SIZE(X) + addq X, INCX, X + + addt a0, t0, a0 + mult x0, x0, t0 + addt a1, t1, a1 + mult x1, x1, t1 + + bgt I, $L26 + .align 4 + + +$L998: + addt a0, t0, a0 + addt a1, t1, a1 + + addt a0, a1, a0 + addt a2, a3, a2 + +#if defined(EV4) || defined(EV5) + addt a0, a2, $f16 + jsr $26, ($27), sqrt !lituse_jsr!2 + + ldah $29, 0($26) !gpdisp!3 + lda $29, 0($29) !gpdisp!3 +#else + addt a0, a2, a0 + sqrtt a0, a0 +#endif + .align 4 + +$L999: +#if defined(EV4) || defined(EV5) + ldq $26, 0($sp) + lda $sp, 16($sp) +#endif + ret + EPILOGUE diff --git a/kernel/alpha/copy.S b/kernel/alpha/copy.S new file mode 100644 index 0000000..749039c --- /dev/null +++ b/kernel/alpha/copy.S @@ -0,0 +1,379 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 +#define Y $19 +#define INCY $20 + + PROLOGUE + PROFCODE + .frame $sp, 0, $26, 0 + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + cmpeq INCX, 1, $0 + ble N, $End +#ifndef COMPLEX + sra N, 4, $4 +#else + sra N, 3, $4 +#endif + cmpeq INCY, 1, $1 + + and $0, $1, $0 + beq $0, $Sub +#ifndef COMPLEX + and N, 15, $5 +#else + and N, 7, $5 +#endif + ble $4, $Remain + + LD $f10, 0*SIZE(X) + LD $f11, 1*SIZE(X) + LD $f12, 2*SIZE(X) + LD $f13, 3*SIZE(X) + LD $f14, 4*SIZE(X) + LD $f15, 5*SIZE(X) + LD $f16, 6*SIZE(X) + LD $f17, 7*SIZE(X) + + LD $f18, 8*SIZE(X) + LD $f19, 9*SIZE(X) + LD $f20, 10*SIZE(X) + LD $f21, 11*SIZE(X) + LD $f22, 12*SIZE(X) + LD $f23, 13*SIZE(X) + LD $f24, 14*SIZE(X) + LD $f25, 15*SIZE(X) + + subq $4, 1, $4 + lda X, 16*SIZE(X) + ble $4, $MainLoopEnd + .align 4 + +$MainLoop: + ST $f10, 0*SIZE(Y) + ST $f11, 1*SIZE(Y) + ST $f12, 2*SIZE(Y) + ST $f13, 3*SIZE(Y) + + LD $f10, 0*SIZE(X) + LD $f11, 1*SIZE(X) + LD $f12, 2*SIZE(X) + LD $f13, 3*SIZE(X) + + ST $f14, 4*SIZE(Y) + ST $f15, 5*SIZE(Y) + ST $f16, 6*SIZE(Y) + ST $f17, 7*SIZE(Y) + + LD $f14, 4*SIZE(X) + LD $f15, 5*SIZE(X) + LD $f16, 6*SIZE(X) + LD $f17, 7*SIZE(X) + + ST $f18, 8*SIZE(Y) + ST $f19, 9*SIZE(Y) + ST $f20, 10*SIZE(Y) + ST $f21, 11*SIZE(Y) + + LD $f18, 8*SIZE(X) + LD $f19, 9*SIZE(X) + LD $f20, 10*SIZE(X) + LD $f21, 11*SIZE(X) + + ST $f22, 12*SIZE(Y) + ST $f23, 13*SIZE(Y) + ST $f24, 14*SIZE(Y) + ST $f25, 15*SIZE(Y) + + LD $f22, 12*SIZE(X) + LD $f23, 13*SIZE(X) + LD $f24, 14*SIZE(X) + LD $f25, 15*SIZE(X) + + subq $4, 1, $4 + lda Y, 16*SIZE(Y) + lda X, 16*SIZE(X) + bgt $4, $MainLoop + .align 4 + +$MainLoopEnd: + ST $f10, 0*SIZE(Y) + ST $f11, 1*SIZE(Y) + ST $f12, 2*SIZE(Y) + ST $f13, 3*SIZE(Y) + ST $f14, 4*SIZE(Y) + ST $f15, 5*SIZE(Y) + ST $f16, 6*SIZE(Y) + ST $f17, 7*SIZE(Y) + + ST $f18, 8*SIZE(Y) + ST $f19, 9*SIZE(Y) + ST $f20, 10*SIZE(Y) + ST $f21, 11*SIZE(Y) + ST $f22, 12*SIZE(Y) + ST $f23, 13*SIZE(Y) + ST $f24, 14*SIZE(Y) + ST $f25, 15*SIZE(Y) + + lda Y, 16*SIZE(Y) + .align 4 + +$Remain: + ble $5, $End + .align 4 + +$RemainLoop: +#ifndef COMPLEX + LD $f10, 0*SIZE(X) + lda X, 1*SIZE(X) + ST $f10, 0*SIZE(Y) + lda Y, 1*SIZE(Y) +#else + LD $f10, 0*SIZE(X) + LD $f11, 1*SIZE(X) + lda X, 2*SIZE(X) + ST $f10, 0*SIZE(Y) + ST $f11, 1*SIZE(Y) + lda Y, 2*SIZE(Y) +#endif + subq $5, 1, $5 + bgt $5, $RemainLoop + .align 4 +$End: + ret + .align 4 + +$Sub: +#ifdef COMPLEX + addq INCX, INCX, INCX + addq INCY, INCY, INCY + and N, 7, $5 +#else + and N, 15, $5 +#endif + ble $4, $SubRemain + .align 4 + +$SubMainLoop: +#ifndef COMPLEX + LD $f10, 0(X) + SXADDQ INCX, X, X + LD $f11, 0(X) + SXADDQ INCX, X, X + + LD $f12, 0(X) + SXADDQ INCX, X, X + LD $f13, 0(X) + SXADDQ INCX, X, X + + LD $f14, 0(X) + SXADDQ INCX, X, X + LD $f15, 0(X) + SXADDQ INCX, X, X + + LD $f16, 0(X) + SXADDQ INCX, X, X + LD $f17, 0(X) + SXADDQ INCX, X, X + + LD $f18, 0(X) + SXADDQ INCX, X, X + LD $f19, 0(X) + SXADDQ INCX, X, X + + LD $f20, 0(X) + SXADDQ INCX, X, X + LD $f21, 0(X) + SXADDQ INCX, X, X + + LD $f22, 0(X) + SXADDQ INCX, X, X + LD $f23, 0(X) + SXADDQ INCX, X, X + + LD $f24, 0(X) + SXADDQ INCX, X, X + LD $f25, 0(X) + SXADDQ INCX, X, X + + ST $f10, 0(Y) + SXADDQ INCY, Y, Y + ST $f11, 0(Y) + SXADDQ INCY, Y, Y + + ST $f12, 0(Y) + SXADDQ INCY, Y, Y + ST $f13, 0(Y) + SXADDQ INCY, Y, Y + + ST $f14, 0(Y) + SXADDQ INCY, Y, Y + ST $f15, 0(Y) + SXADDQ INCY, Y, Y + + ST $f16, 0(Y) + SXADDQ INCY, Y, Y + ST $f17, 0(Y) + SXADDQ INCY, Y, Y + + ST $f18, 0(Y) + SXADDQ INCY, Y, Y + ST $f19, 0(Y) + SXADDQ INCY, Y, Y + + ST $f20, 0(Y) + SXADDQ INCY, Y, Y + ST $f21, 0(Y) + SXADDQ INCY, Y, Y + + ST $f22, 0(Y) + SXADDQ INCY, Y, Y + ST $f23, 0(Y) + SXADDQ INCY, Y, Y + + ST $f24, 0(Y) + SXADDQ INCY, Y, Y + ST $f25, 0(Y) + SXADDQ INCY, Y, Y +#else + LD $f10, 0(X) + LD $f11, SIZE(X) + SXADDQ INCX, X, X + + LD $f12, 0(X) + LD $f13, SIZE(X) + SXADDQ INCX, X, X + + LD $f14, 0(X) + LD $f15, SIZE(X) + SXADDQ INCX, X, X + + LD $f16, 0(X) + LD $f17, SIZE(X) + SXADDQ INCX, X, X + + LD $f18, 0(X) + LD $f19, SIZE(X) + SXADDQ INCX, X, X + + LD $f20, 0(X) + LD $f21, SIZE(X) + SXADDQ INCX, X, X + + LD $f22, 0(X) + LD $f23, SIZE(X) + SXADDQ INCX, X, X + + LD $f24, 0(X) + LD $f25, SIZE(X) + SXADDQ INCX, X, X + + ST $f10, 0(Y) + ST $f11, SIZE(Y) + SXADDQ INCY, Y, Y + + ST $f12, 0(Y) + ST $f13, SIZE(Y) + SXADDQ INCY, Y, Y + + ST $f14, 0(Y) + ST $f15, SIZE(Y) + SXADDQ INCY, Y, Y + + ST $f16, 0(Y) + ST $f17, SIZE(Y) + SXADDQ INCY, Y, Y + + ST $f18, 0(Y) + ST $f19, SIZE(Y) + SXADDQ INCY, Y, Y + + ST $f20, 0(Y) + ST $f21, SIZE(Y) + SXADDQ INCY, Y, Y + + ST $f22, 0(Y) + ST $f23, SIZE(Y) + SXADDQ INCY, Y, Y + + ST $f24, 0(Y) + ST $f25, SIZE(Y) + SXADDQ INCY, Y, Y +#endif + subq $4, 1, $4 + bgt $4, $SubMainLoop + .align 4 + +$SubRemain: + ble $5, $SubEnd + .align 4 + + $SubRemainLoop: +#ifndef COMPLEX + LD $f10, 0(X) + SXADDQ INCX, X, X + ST $f10, 0(Y) + SXADDQ INCY, Y, Y +#else + LD $f10, 0(X) + LD $f11, SIZE(X) + SXADDQ INCX, X, X + ST $f10, 0(Y) + ST $f11, SIZE(Y) + SXADDQ INCY, Y, Y +#endif + subq $5, 1, $5 + bgt $5, $SubRemainLoop + .align 4 + +$SubEnd: + ret + EPILOGUE diff --git a/kernel/alpha/cscal.S b/kernel/alpha/cscal.S new file mode 100644 index 0000000..bba3137 --- /dev/null +++ b/kernel/alpha/cscal.S @@ -0,0 +1,217 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + + .set noat + .set noreorder + +#define ASSEMBLER + +#include "common.h" +#include "version.h" + + .globl NAME + .ent NAME + +NAME: +#ifdef PROFILE + ldgp $gp, 0($27) + lda $28, _mcount + jsr $28, ($28), _mcount +#endif + +#ifndef C_INTERFACE + ldl $16, 0($16) # n + mov $18, $20 # Store Address + ldl $19, 0($19) # incx + nop + + LD $f1, 0($17) # alpha +#else + mov $18, $20 # Store Address + fmov $f17, $f1 # alpha +#endif + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + sra $16, 1, $21 # 4-unrolling + ble $16, $End + + lda $23, -1($19) + ble $19, $End + + bgt $23, $INC_NOT_1 + .align 4 + + ble $21, $Sub + lda $21, -1($21) + LD $f10, 0*SIZE($18) + LD $f11, 1*SIZE($18) + + LD $f12, 2*SIZE($18) + LD $f13, 3*SIZE($18) + lda $18, 4*SIZE($18) + ble $21, $MainRemain + .align 4 + +$MainLoop: + MUL $f10, $f1, $f20 + LD $f10, 0*SIZE($18) + MUL $f11, $f1, $f21 + LD $f11, 1*SIZE($18) + + MUL $f12, $f1, $f22 + LD $f12, 2*SIZE($18) + MUL $f13, $f1, $f23 + LD $f13, 3*SIZE($18) + + lda $18, 4*SIZE($18) + lda $21, -1($21) + + ST $f20, 0*SIZE($20) + ST $f21, 1*SIZE($20) + ST $f22, 2*SIZE($20) + ST $f23, 3*SIZE($20) + lda $20, 4*SIZE($20) + + bgt $21, $MainLoop + .align 4 + +$MainRemain: + MUL $f10, $f1, $f20 + MUL $f11, $f1, $f21 + MUL $f12, $f1, $f22 + MUL $f13, $f1, $f23 + + ST $f20, 0*SIZE($20) + ST $f21, 1*SIZE($20) + ST $f22, 2*SIZE($20) + ST $f23, 3*SIZE($20) + lda $20, 4*SIZE($20) + .align 4 + +$Sub: + blbc $16, $End + LD $f10, 0*SIZE($18) + LD $f11, 1*SIZE($18) + MUL $f10, $f1, $f20 + MUL $f11, $f1, $f21 + ST $f20, 0*SIZE($20) + ST $f21, 1*SIZE($20) + .align 4 + +$End: + ret + .align 4 + +$INC_NOT_1: + addl $19, $19, $19 + ble $21, $INC_Sub + lda $21, -1($21) + + LD $f10, 0*SIZE($18) + LD $f11, 1*SIZE($18) + SXADDQ $19, $18, $18 + + LD $f12, 0*SIZE($18) + LD $f13, 1*SIZE($18) + SXADDQ $19, $18, $18 + ble $21, $INC_MainRemain + .align 4 + +$INC_MainLoop: + MUL $f10, $f1, $f20 + LD $f10, 0*SIZE($18) + MUL $f11, $f1, $f21 + LD $f11, 1*SIZE($18) + + SXADDQ $19, $18, $18 + + MUL $f12, $f1, $f22 + LD $f12, 0*SIZE($18) + MUL $f13, $f1, $f23 + LD $f13, 1*SIZE($18) + + SXADDQ $19, $18, $18 + + ST $f20, 0*SIZE($20) + lda $21, -1($21) + ST $f21, 1*SIZE($20) + SXADDQ $19, $20, $20 + + ST $f22, 0*SIZE($20) + ST $f23, 1*SIZE($20) + SXADDQ $19, $20, $20 + unop + bgt $21, $INC_MainLoop + .align 4 + +$INC_MainRemain: + MUL $f10, $f1, $f20 + MUL $f11, $f1, $f21 + MUL $f12, $f1, $f22 + MUL $f13, $f1, $f23 + + ST $f20, 0*SIZE($20) + ST $f21, 1*SIZE($20) + SXADDQ $19, $20, $20 + + ST $f22, 0*SIZE($20) + ST $f23, 1*SIZE($20) + SXADDQ $19, $20, $20 + .align 4 + +$INC_Sub: + blbc $16, $INC_End + + LD $f10, 0*SIZE($18) + LD $f11, 1*SIZE($18) + MUL $f10, $f1, $f20 + MUL $f11, $f1, $f21 + + ST $f20, 0*SIZE($20) + ST $f21, 1*SIZE($20) + .align 4 + +$INC_End: + ret + .end NAME + .ident VERSION diff --git a/kernel/alpha/dnrm2.S b/kernel/alpha/dnrm2.S new file mode 100644 index 0000000..b8ccc75 --- /dev/null +++ b/kernel/alpha/dnrm2.S @@ -0,0 +1,431 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#include "version.h" + +#define PREFETCH_SIZE 80 + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#define I $0 + +#define a0 $f0 +#define a1 $f1 +#define a2 $f10 +#define a3 $f11 +#define t0 $f12 +#define t1 $f13 +#define t2 $f14 +#define t3 $f15 + +#define x0 $f16 +#define x1 $f17 +#define x2 $f18 +#define x3 $f19 +#define x4 $f20 +#define x5 $f21 +#define x6 $f22 +#define x7 $f23 + + PROLOGUE + +#if defined(EV4) || defined(EV5) + .frame $30,16,$26,0 + .mask 0x4000000,-16 + ldah $29, 0($27) !gpdisp!1 + lda $29, 0($29) !gpdisp!1 + + lda $sp, -16($sp) + ldq $27, sqrt($29) !literal!2 + stq $26, 0($sp) + + PROFCODE + .prologue 1 +#else + PROFCODE +#endif + + fclr a0 + SXADDQ INCX, 0, INCX + fclr a1 + ble N, $L999 + + fclr a2 + cmpeq INCX, SIZE, $0 + fclr a3 + beq $0, $L20 + + fclr t0 + sra N, 4, I + fclr t1 + ble I, $L15 + + fclr t2 + LD x0, 0 * SIZE(X) + fclr t3 + LD x1, 1 * SIZE(X) + + LD x2, 2 * SIZE(X) + LD x3, 3 * SIZE(X) + LD x4, 4 * SIZE(X) + LD x5, 5 * SIZE(X) + LD x6, 6 * SIZE(X) + LD x7, 7 * SIZE(X) + + lda I, -1(I) + ble I, $L12 + .align 4 + +$L11: + addt a0, t0, a0 + ldl $31, (PREFETCH_SIZE) * SIZE(X) + mult x0, x0, t0 + LD x0, 8 * SIZE(X) + + addt a1, t1, a1 + mov X, XX + mult x1, x1, t1 + LD x1, 9 * SIZE(X) + + addt a2, t2, a2 + unop + mult x2, x2, t2 + LD x2, 10 * SIZE(X) + + addt a3, t3, a3 + unop + mult x3, x3, t3 + LD x3, 11 * SIZE(X) + + addt a0, t0, a0 + unop + mult x4, x4, t0 + LD x4, 12 * SIZE(X) + + addt a1, t1, a1 + unop + mult x5, x5, t1 + LD x5, 13 * SIZE(X) + + addt a2, t2, a2 + unop + mult x6, x6, t2 + LD x6, 14 * SIZE(X) + + addt a3, t3, a3 + unop + mult x7, x7, t3 + LD x7, 15 * SIZE(X) + + addt a0, t0, a0 + unop + mult x0, x0, t0 + LD x0, 16 * SIZE(X) + + addt a1, t1, a1 + lda X, 16 * SIZE(X) + mult x1, x1, t1 + LD x1, 17 * SIZE(XX) + + addt a2, t2, a2 + unop + mult x2, x2, t2 + LD x2, 18 * SIZE(XX) + + addt a3, t3, a3 + unop + mult x3, x3, t3 + LD x3, 19 * SIZE(XX) + + addt a0, t0, a0 + unop + mult x4, x4, t0 + LD x4, 20 * SIZE(XX) + + addt a1, t1, a1 + lda I, -1(I) + mult x5, x5, t1 + LD x5, 21 * SIZE(XX) + + addt a2, t2, a2 + unop + mult x6, x6, t2 + LD x6, 22 * SIZE(XX) + + addt a3, t3, a3 + mult x7, x7, t3 + LD x7, 23 * SIZE(XX) + bgt I, $L11 + .align 4 + +$L12: + addt a0, t0, a0 + mov X, XX + mult x0, x0, t0 + LD x0, 8 * SIZE(X) + + addt a1, t1, a1 + unop + mult x1, x1, t1 + LD x1, 9 * SIZE(X) + + addt a2, t2, a2 + unop + mult x2, x2, t2 + LD x2, 10 * SIZE(X) + + addt a3, t3, a3 + unop + mult x3, x3, t3 + LD x3, 11 * SIZE(X) + + addt a0, t0, a0 + unop + mult x4, x4, t0 + LD x4, 12 * SIZE(XX) + + addt a1, t1, a1 + unop + mult x5, x5, t1 + LD x5, 13 * SIZE(XX) + + addt a2, t2, a2 + unop + mult x6, x6, t2 + LD x6, 14 * SIZE(XX) + + addt a3, t3, a3 + lda X, 16 * SIZE(X) + mult x7, x7, t3 + LD x7, 15 * SIZE(XX) + + addt a0, t0, a0 + mult x0, x0, t0 + addt a1, t1, a1 + mult x1, x1, t1 + + addt a2, t2, a2 + mult x2, x2, t2 + addt a3, t3, a3 + mult x3, x3, t3 + + addt a0, t0, a0 + mult x4, x4, t0 + addt a1, t1, a1 + mult x5, x5, t1 + + addt a2, t2, a2 + mult x6, x6, t2 + addt a3, t3, a3 + mult x7, x7, t3 + + addt a1, t1, a1 + addt a2, t2, a2 + addt a3, t3, a3 + .align 4 + +$L15: + and N, 15, I + ble I, $L998 + .align 4 + +$L16: + LD x0, 0 * SIZE(X) + lda X, 1 * SIZE(X) + + addt a0, t0, a0 + mult x0, x0, t0 + + lda I, -1(I) + bgt I, $L16 + bsr $31, $L998 + .align 4 + +$L20: + fclr t0 + sra N, 3, I + fclr t1 + ble I, $L25 + + fclr t2 + fclr t3 + + LD x0, 0 * SIZE(X) + addq X, INCX, X + LD x1, 0 * SIZE(X) + addq X, INCX, X + LD x2, 0 * SIZE(X) + addq X, INCX, X + LD x3, 0 * SIZE(X) + addq X, INCX, X + + LD x4, 0 * SIZE(X) + addq X, INCX, X + LD x5, 0 * SIZE(X) + addq X, INCX, X + LD x6, 0 * SIZE(X) + addq X, INCX, X + + lda I, -1(I) + ble I, $L22 + .align 4 + +$L21: + addt a0, t0, a0 + LD x7, 0 * SIZE(X) + mult x0, x0, t0 + addq X, INCX, X + + addt a1, t1, a1 + LD x0, 0 * SIZE(X) + mult x1, x1, t1 + addq X, INCX, X + + addt a2, t2, a2 + LD x1, 0 * SIZE(X) + mult x2, x2, t2 + addq X, INCX, X + + addt a3, t3, a3 + LD x2, 0 * SIZE(X) + mult x3, x3, t3 + addq X, INCX, X + + addt a0, t0, a0 + LD x3, 0 * SIZE(X) + mult x4, x4, t0 + addq X, INCX, X + + addt a1, t1, a1 + LD x4, 0 * SIZE(X) + mult x5, x5, t1 + addq X, INCX, X + + addt a2, t2, a2 + LD x5, 0 * SIZE(X) + mult x6, x6, t2 + addq X, INCX, X + + addt a3, t3, a3 + LD x6, 0 * SIZE(X) + mult x7, x7, t3 + addq X, INCX, X + + lda I, -1(I) + bgt I, $L21 + .align 4 + +$L22: + addt a0, t0, a0 + LD x7, 0 * SIZE(X) + mult x0, x0, t0 + addq X, INCX, X + + addt a1, t1, a1 + unop + mult x1, x1, t1 + unop + + addt a2, t2, a2 + mult x2, x2, t2 + addt a3, t3, a3 + mult x3, x3, t3 + + addt a0, t0, a0 + mult x4, x4, t0 + addt a1, t1, a1 + mult x5, x5, t1 + + addt a2, t2, a2 + mult x6, x6, t2 + addt a3, t3, a3 + mult x7, x7, t3 + + addt a1, t1, a1 + addt a2, t2, a2 + addt a3, t3, a3 + .align 4 + +$L25: + and N, 7, I + ble I, $L998 + .align 4 + +$L26: + LD x0, 0 * SIZE(X) + addq X, INCX, X + + addt a0, t0, a0 + mult x0, x0, t0 + + lda I, -1(I) + bgt I, $L26 + .align 4 + + +$L998: + addt a0, t0, a0 + + addt a0, a1, a0 + addt a2, a3, a2 + +#if defined(EV4) || defined(EV5) + addt a0, a2, $f16 + jsr $26, ($27), sqrt !lituse_jsr!2 + + ldah $29, 0($26) !gpdisp!3 + lda $29, 0($29) !gpdisp!3 +#else + addt a0, a2, a0 + sqrtt a0, a0 +#endif + .align 4 + +$L999: +#if defined(EV4) || defined(EV5) + ldq $26, 0($sp) + lda $sp, 16($sp) +#endif + ret + EPILOGUE diff --git a/kernel/alpha/dot.S b/kernel/alpha/dot.S new file mode 100644 index 0000000..330196c --- /dev/null +++ b/kernel/alpha/dot.S @@ -0,0 +1,530 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 +#define INCX $18 +#define Y $19 +#define INCY $20 + +#define I $5 + +#define s0 $f0 +#define s1 $f30 +#define s2 $f1 +#define s3 $f2 + +#define a0 $f10 +#define a1 $f11 +#define a2 $f12 +#define a3 $f13 +#define a4 $f14 +#define a5 $f15 +#define a6 $f16 +#define a7 $f17 + +#define b0 $f18 +#define b1 $f19 +#define b2 $f20 +#define b3 $f21 +#define b4 $f22 +#define b5 $f23 +#define b6 $f24 +#define b7 $f25 + +#define t0 $f26 +#define t1 $f27 +#define t2 $f28 +#define t3 $f29 + + + PROLOGUE + PROFCODE + .frame $sp, 16, $26, 0 + + lda $sp, -16($sp) + fclr s0 + stt $f2, 0($sp) + fclr s1 + + fclr s2 + nop + fclr s3 + ble N, $L999 + + fclr t0 + cmpeq INCX, 1, $21 + fclr t1 + cmpeq INCY, 1, $22 + fclr t2 + and $21, $22, $22 + fclr t3 + beq $22, $L20 + +#ifndef DOUBLE + srl N, 4, I + ble I, $L15 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD b0, 0 * SIZE(Y) + LD b1, 1 * SIZE(Y) + + LD a2, 2 * SIZE(X) + LD a3, 3 * SIZE(X) + LD b2, 2 * SIZE(Y) + LD b3, 3 * SIZE(Y) + + LD a4, 4 * SIZE(X) + LD a5, 5 * SIZE(X) + LD b4, 4 * SIZE(Y) + LD b5, 5 * SIZE(Y) + + LD a6, 6 * SIZE(X) + LD a7, 7 * SIZE(X) + addq X, 16 * SIZE, X + subq I, 1, I + + addq Y, 16 * SIZE, Y + ble I, $L13 + .align 4 + +$L12: + ldl $31, PREFETCHSIZE * 2 * SIZE(X) + subq I, 1, I + ldl $31, PREFETCHSIZE * 2 * SIZE(Y) + addq X, 16 * SIZE, X + + ADD s0, t0, s0 + LD b6, -10 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -9 * SIZE(Y) + + ADD s1, t1, s1 + LD a0, -24 * SIZE(X) + MUL a1, b1, t1 + LD a1, -23 * SIZE(X) + + ADD s2, t2, s2 + LD b0, -8 * SIZE(Y) + MUL a2, b2, t2 + LD b1, -7 * SIZE(Y) + + ADD s3, t3, s3 + LD a2, -22 * SIZE(X) + MUL a3, b3, t3 + LD a3, -21 * SIZE(X) + + ADD s0, t0, s0 + LD b2, -6 * SIZE(Y) + MUL a4, b4, t0 + LD b3, -5 * SIZE(Y) + + ADD s1, t1, s1 + LD a4, -20 * SIZE(X) + MUL a5, b5, t1 + LD a5, -19 * SIZE(X) + + ADD s2, t2, s2 + LD b4, -4 * SIZE(Y) + MUL a6, b6, t2 + LD b5, -3 * SIZE(Y) + + ADD s3, t3, s3 + LD a6, -18 * SIZE(X) + MUL a7, b7, t3 + LD a7, -17 * SIZE(X) + + ADD s0, t0, s0 + LD b6, -2 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -1 * SIZE(Y) + + ADD s1, t1, s1 + LD a0, -16 * SIZE(X) + MUL a1, b1, t1 + LD a1, -15 * SIZE(X) + + ADD s2, t2, s2 + LD b0, 0 * SIZE(Y) + MUL a2, b2, t2 + LD b1, 1 * SIZE(Y) + + ADD s3, t3, s3 + LD a2, -14 * SIZE(X) + MUL a3, b3, t3 + LD a3, -13 * SIZE(X) + + ADD s0, t0, s0 + LD b2, 2 * SIZE(Y) + MUL a4, b4, t0 + LD b3, 3 * SIZE(Y) + + ADD s1, t1, s1 + LD a4, -12 * SIZE(X) + MUL a5, b5, t1 + LD a5, -11 * SIZE(X) + + ADD s2, t2, s2 + LD b4, 4 * SIZE(Y) + MUL a6, b6, t2 + LD b5, 5 * SIZE(Y) + + ADD s3, t3, s3 + LD a6, -10 * SIZE(X) + MUL a7, b7, t3 + LD a7, -9 * SIZE(X) + + addq Y, 16 * SIZE, Y + bgt I, $L12 + nop + fnop + .align 4 + +$L13: + ADD s0, t0, s0 + LD b6,-10 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -9 * SIZE(Y) + + ADD s1, t1, s1 + LD a0, -8 * SIZE(X) + MUL a1, b1, t1 + LD a1, -7 * SIZE(X) + + ADD s2, t2, s2 + LD b0, -8 * SIZE(Y) + MUL a2, b2, t2 + LD b1, -7 * SIZE(Y) + + ADD s3, t3, s3 + LD a2, -6 * SIZE(X) + MUL a3, b3, t3 + LD a3, -5 * SIZE(X) + + ADD s0, t0, s0 + LD b2, -6 * SIZE(Y) + MUL a4, b4, t0 + LD b3, -5 * SIZE(Y) + + ADD s1, t1, s1 + LD a4, -4 * SIZE(X) + MUL a5, b5, t1 + LD a5, -3 * SIZE(X) + + ADD s2, t2, s2 + LD b4, -4 * SIZE(Y) + MUL a6, b6, t2 + LD b5, -3 * SIZE(Y) + + ADD s3, t3, s3 + LD a6, -2 * SIZE(X) + MUL a7, b7, t3 + LD a7, -1 * SIZE(X) + + ADD s0, t0, s0 + LD b6, -2 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -1 * SIZE(Y) + ADD s1, t1, s1 + MUL a1, b1, t1 + + ADD s2, t2, s2 + MUL a2, b2, t2 + ADD s3, t3, s3 + MUL a3, b3, t3 + + ADD s0, t0, s0 + MUL a4, b4, t0 + ADD s1, t1, s1 + MUL a5, b5, t1 + ADD s2, t2, s2 + MUL a6, b6, t2 + ADD s3, t3, s3 + MUL a7, b7, t3 + .align 4 + +$L15: + ADD s0, t0, s0 + and N, 15, I + ADD s1, t1, s1 + ble I, $L18 + .align 4 + +#else + + srl N, 3, I + ble I, $L15 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD b0, 0 * SIZE(Y) + LD b1, 1 * SIZE(Y) + + LD a2, 2 * SIZE(X) + LD a3, 3 * SIZE(X) + LD b2, 2 * SIZE(Y) + LD b3, 3 * SIZE(Y) + + LD a4, 4 * SIZE(X) + LD a5, 5 * SIZE(X) + LD b4, 4 * SIZE(Y) + LD b5, 5 * SIZE(Y) + + LD a6, 6 * SIZE(X) + LD a7, 7 * SIZE(X) + addq X, 8 * SIZE, X + subq I, 1, I + + addq Y, 8 * SIZE, Y + ble I, $L13 + .align 4 + +$L12: + ldl $31, PREFETCHSIZE * SIZE(X) + subq I, 1, I + ldl $31, PREFETCHSIZE * SIZE(Y) + addq X, 8 * SIZE, X + + ADD s0, t0, s0 + LD b6, -2 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -1 * SIZE(Y) + + ADD s1, t1, s1 + LD a0, -8 * SIZE(X) + MUL a1, b1, t1 + LD a1, -7 * SIZE(X) + + ADD s2, t2, s2 + LD b0, 0 * SIZE(Y) + MUL a2, b2, t2 + LD b1, 1 * SIZE(Y) + + ADD s3, t3, s3 + LD a2, -6 * SIZE(X) + MUL a3, b3, t3 + LD a3, -5 * SIZE(X) + + ADD s0, t0, s0 + LD b2, 2 * SIZE(Y) + MUL a4, b4, t0 + LD b3, 3 * SIZE(Y) + + ADD s1, t1, s1 + LD a4, -4 * SIZE(X) + MUL a5, b5, t1 + LD a5, -3 * SIZE(X) + + ADD s2, t2, s2 + LD b4, 4 * SIZE(Y) + MUL a6, b6, t2 + LD b5, 5 * SIZE(Y) + + ADD s3, t3, s3 + LD a6, -2 * SIZE(X) + MUL a7, b7, t3 + LD a7, -1 * SIZE(X) + + addq Y, 8 * SIZE, Y + bgt I, $L12 + nop + fnop + .align 4 + +$L13: + ADD s0, t0, s0 + LD b6, -2 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -1 * SIZE(Y) + ADD s1, t1, s1 + MUL a1, b1, t1 + + ADD s2, t2, s2 + MUL a2, b2, t2 + ADD s3, t3, s3 + MUL a3, b3, t3 + + ADD s0, t0, s0 + MUL a4, b4, t0 + ADD s1, t1, s1 + MUL a5, b5, t1 + ADD s2, t2, s2 + MUL a6, b6, t2 + ADD s3, t3, s3 + MUL a7, b7, t3 + .align 4 + +$L15: + ADD s0, t0, s0 + and N, 7, I + ADD s1, t1, s1 + ble I, $L18 + .align 4 + +#endif + +$L16: + LD a0, 0 * SIZE(X) + addq X, SIZE, X + LD b0, 0 * SIZE(Y) + addq Y, SIZE, Y + + ADD s2, t2, s2 + MUL a0, b0, t2 + subq I, 1, I + bgt I, $L16 + .align 4 + +$L18: + ADD s2, t2, s2 + ADD s3, t3, s3 + br $L999 + .align 4 + +$L20: + srl N, 2, I + ble I, $L25 + + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b0, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + LD a1, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b1, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b2, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b3, 0 * SIZE(Y) + subq I, 1, I + + SXADDQ INCY, Y, Y + ble I, $L23 + .align 4 + +$L22: + ADD s0, t0, s0 + MUL a0, b0, t0 + ADD s1, t1, s1 + MUL a1, b1, t1 + ADD s2, t2, s2 + MUL a2, b2, t2 + ADD s3, t3, s3 + MUL a3, b3, t3 + + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b0, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + LD a1, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b1, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b2, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b3, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + + subq I, 1, I + bgt I, $L22 + nop + fnop + .align 4 + +$L23: + ADD s0, t0, s0 + MUL a0, b0, t0 + ADD s1, t1, s1 + MUL a1, b1, t1 + ADD s2, t2, s2 + MUL a2, b2, t2 + ADD s3, t3, s3 + MUL a3, b3, t3 + .align 4 + +$L25: + ADD s0, t0, s0 + and N, 3, I + ADD s1, t1, s1 + ble I, $L28 + .align 4 + +$L26: + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b0, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + MUL a0, b0, t2 + subq I, 1, I + bgt I, $L26 + .align 4 + +$L28: + ADD s2, t2, s2 + ADD s3, t3, s3 + .align 4 + +$L999: + ADD s2, s3, s2 + ldt $f2, 0($sp) + ADD s0, s1, s0 + lda $sp, 16($sp) + + ADD s0, s2, s0 + ret + + EPILOGUE diff --git a/kernel/alpha/gemm_beta.S b/kernel/alpha/gemm_beta.S new file mode 100644 index 0000000..44b2fad --- /dev/null +++ b/kernel/alpha/gemm_beta.S @@ -0,0 +1,179 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + + .set noat + .set noreorder +.text + .align 5 + .globl CNAME + .ent CNAME +CNAME: + .frame $sp, 0, $26, 0 + +#ifdef PROFILE + ldgp $gp, 0($27) + lda $28, _mcount + jsr $28, ($28), _mcount +#endif + + ldq $18, 16($sp) + ble $16, $End + ldl $19, 24($sp) + ble $17, $End +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + fbeq $f19, $BETA_EQ_ZERO # if (beta == ZERO) + .align 4 + +$BETA_NE_ZERO: + sra $16, 3, $2 # i = (m >> 3) + mov $18, $1 # c_offset = c + lda $17, -1($17) # j -- + ble $2,$L52 + .align 4 + +$L51: + lds $f31, 64($1) + lda $2, -1($2) + + LD $f14, 0*SIZE($1) + LD $f15, 1*SIZE($1) + LD $f16, 2*SIZE($1) + LD $f17, 3*SIZE($1) + LD $f18, 4*SIZE($1) + LD $f11, 5*SIZE($1) + LD $f21, 6*SIZE($1) + LD $f22, 7*SIZE($1) + + MUL $f19, $f14, $f23 + MUL $f19, $f15, $f24 + MUL $f19, $f16, $f25 + MUL $f19, $f17, $f26 + MUL $f19, $f18, $f27 + MUL $f19, $f11, $f28 + MUL $f19, $f21, $f29 + MUL $f19, $f22, $f30 + + ST $f23, 0*SIZE($1) + ST $f24, 1*SIZE($1) + ST $f25, 2*SIZE($1) + ST $f26, 3*SIZE($1) + ST $f27, 4*SIZE($1) + ST $f28, 5*SIZE($1) + ST $f29, 6*SIZE($1) + ST $f30, 7*SIZE($1) + + lda $1,8*SIZE($1) + bgt $2,$L51 + .align 4 + +$L52: + and $16, 7, $2 + ble $2,$L54 + .align 4 + +$L53: + LD $f12, 0($1) + lda $2, -1($2) + MUL $f19, $f12, $f23 + ST $f23, 0($1) + lda $1, SIZE($1) + bgt $2,$L53 + .align 4 + +$L54: + SXADDQ $19, $18, $18 # c += ldc + bgt $17,$BETA_NE_ZERO + clr $0 + ret + .align 4 + +$BETA_EQ_ZERO: + sra $16, 3, $2 # i = (m >> 3) + lda $4, 8*SIZE($18) + mov $18, $1 # c_offset = c + lda $17, -1($17) # j -- + ble $2,$L42 + .align 4 + +$L41: + ST $f31, 0*SIZE($1) + ST $f31, 1*SIZE($1) + ST $f31, 2*SIZE($1) + ST $f31, 3*SIZE($1) + ST $f31, 4*SIZE($1) + ST $f31, 5*SIZE($1) + ST $f31, 6*SIZE($1) + ST $f31, 7*SIZE($1) + lda $2, -1($2) + + lda $4, 8*SIZE($4) + lda $1, 8*SIZE($1) + bgt $2,$L41 + .align 4 + +$L42: + and $16, 7, $2 + ble $2,$L44 + .align 4 + +$L43: + lda $2, -1($2) + ST $f31, 0($1) + lda $1, SIZE($1) + bgt $2, $L43 + .align 4 + +$L44: + SXADDQ $19, $18, $18 # c += ldc + bgt $17,$BETA_EQ_ZERO + clr $0 + .align 4 + +$End: + ret + .ident VERSION + .end CNAME diff --git a/kernel/alpha/gemm_kernel_4x4.S b/kernel/alpha/gemm_kernel_4x4.S new file mode 100644 index 0000000..4e92534 --- /dev/null +++ b/kernel/alpha/gemm_kernel_4x4.S @@ -0,0 +1,2852 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#if !defined(EV4) && !defined(EV5) && !defined(EV6) +#error "Architecture is not specified." +#endif + +#ifdef EV6 +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + +#ifdef EV5 +#define PREFETCHSIZE 56 +#define UNOP +#endif + +#ifdef EV4 +#define UNOP +#endif + +#define STACKSIZE 80 + +#define M $16 +#define N $17 +#define K $18 +#define A $20 +#define B $21 +#define C $22 +#define LDC $23 + +#define C1 $19 +#define C2 $24 +#define C3 $25 +#define C4 $27 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define BB $3 +#define OFFSET $4 + +#define ALPHA 64($sp) + + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + + lda $sp, -STACKSIZE($sp) + + ldq C, 0 + STACKSIZE($sp) + ldq LDC, 8 + STACKSIZE($sp) +#ifdef TRMMKERNEL + ldq OFFSET, 16 + STACKSIZE($sp) +#endif + + SXADDQ LDC, 0, LDC + + stt $f2, 0($sp) + stt $f3, 8($sp) + stt $f4, 16($sp) + stt $f5, 24($sp) + stt $f6, 32($sp) + stt $f7, 40($sp) + stt $f8, 48($sp) + stt $f9, 56($sp) + stt $f19, ALPHA + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#if defined(TRMMKERNEL) && !defined(LEFT) + subq $31, OFFSET, KK +#endif + + sra N, 2, J + ble J, $L40 + .align 4 + +$L01: + mov C, C1 + addq C, LDC, C2 + mov A, AO + s4addq K, 0, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + addq C2, LDC, C3 + s4addq LDC, C, C + + SXADDQ BB, B, BB + fclr t1 + addq C3, LDC, C4 + fclr t2 + + sra M, 2, I + fclr t3 + fclr t4 + ble I, $L20 + .align 4 + +$L11: +#if defined(EV5) || defined(EV6) + ldl $31, 0 * SIZE(BB) + ldl $31, 8 * SIZE(BB) + unop + lda BB, 16 * SIZE(BB) +#endif + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addq KK, 4, TMP1 +#else + addq KK, 4, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + + LD b3, 2 * SIZE(B) + fclr c06 + LD b4, 3 * SIZE(B) + fclr c05 + + lds $f31, 4 * SIZE(C1) + fclr c03 +#ifndef TRMMKERNEL + lda L, -2(K) +#else + lda L, -2(TMP1) +#endif + fclr c04 + + lds $f31, 7 * SIZE(C2) + fclr c08 + lda BO, 4 * SIZE(B) + fclr c13 + + lds $f31, 4 * SIZE(C3) + fclr c09 + lda AO, 4 * SIZE(AO) + fclr c10 + +#else + sll KK, BASE_SHIFT + 2, TMP1 + addq AO, TMP1, AO + addq B, TMP1, BO + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + + LD b3, 2 * SIZE(BO) + fclr c06 + LD b4, 3 * SIZE(BO) + fclr c05 + + lds $f31, 4 * SIZE(C1) + fclr c03 + lda L, -2(TMP1) + fclr c04 + + lds $f31, 7 * SIZE(C2) + fclr c08 + lda BO, 4 * SIZE(BO) + fclr c13 + + lds $f31, 4 * SIZE(C3) + fclr c09 + lda AO, 4 * SIZE(AO) + fclr c10 +#endif + + lds $f31, 7 * SIZE(C4) + fclr c14 + fclr c07 + ble L, $L15 + .align 5 + +$L12: +/* 1 */ + ADD c11, t1, c11 +#ifndef EV4 + ldq $31, PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + ldl $31, PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD c12, t2, c12 + unop + MUL b1, a2, t2 + unop + + ADD c16, t3, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD c15, t4, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + +/* 2 */ + ADD c01, t1, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD c02, t2, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD c03, t1, c03 + unop + MUL b3, a1, t1 + unop + + ADD c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD c09, t1, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD c07, t4, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD c11, t1, c11 + unop + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD c12, t2, c12 + lda L, -2(L) + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD c16, t3, c16 + unop + MUL b2, a2, t3 + unop + + ADD c15, t4, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD c01, t1, c01 + unop + MUL b5, a6, t1 + unop + + ADD c02, t2, c02 + unop + MUL b5, a4, t2 + unop + + ADD c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD c03, t1, c03 + lda AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD c04, t2, c04 + lda BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD c09, t1, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD c11, t1, c11 + ldt alpha, ALPHA + MUL b1, a1, t1 +#ifndef TRMMKERNEL + blbs K, $L18 +#else + blbs TMP1, $L18 +#endif + .align 4 + + ADD c12, t2, c12 + MUL b1, a2, t2 + ADD c16, t3, c16 + MUL b2, a2, t3 + + ADD c15, t4, c15 + MUL b2, a1, t4 + ADD c01, t1, c01 + MUL b1, a3, t1 + + ADD c02, t2, c02 + unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD c06, t3, c06 + MUL b2, a4, t3 + ADD c05, t4, c05 + MUL b4, a1, t4 + + ADD c03, t1, c03 + unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + + ADD c09, t1, c09 + unop + MUL b3, a3, t1 + lda AO, 4 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, c07 + unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + + ADD c11, t1, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L18: + ADD c12, t2, c12 + unop + MUL b1, a2, t2 +#ifndef TRMMKERNEL + LD a5, 0 * SIZE(C1) +#else + unop +#endif + + ADD c16, t3, c16 + unop + MUL b2, a2, t3 + unop + + ADD c15, t4, c15 + unop + MUL b2, a1, t4 +#ifndef TRMMKERNEL + LD b5, 1 * SIZE(C1) +#else + unop +#endif + + ADD c01, t1, c01 + unop + MUL b1, a3, t1 + unop + + ADD c02, t2, c02 + unop + MUL b1, a4, t2 +#ifndef TRMMKERNEL + LD b1, 0 * SIZE(C2) +#else + unop +#endif + + ADD c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, c05 + unop + MUL b4, a1, t4 + unop + + ADD c03, t1, c03 + unop + MUL b3, a1, t1 + unop + + ADD c04, t2, c04 + unop + MUL b3, a2, t2 +#ifndef TRMMKERNEL + LD a1, 0 * SIZE(C3) +#else + unop +#endif + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 +#ifndef TRMMKERNEL + LD a2, 2 * SIZE(C1) +#else + unop +#endif + + ADD c13, t4, c13 + unop + MUL b2, a3, t4 +#ifndef TRMMKERNEL + LD b2, 3 * SIZE(C1) +#else + unop +#endif + + ADD c09, t1, c09 + lda I, -1(I) + MUL b3, a3, t1 + unop + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 +#ifndef TRMMKERNEL + LD b3, 0 * SIZE(C4) +#else + unop +#endif + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 +#ifndef TRMMKERNEL + LD a4, 1 * SIZE(C2) +#else + unop +#endif + + ADD c07, t4, c07 + unop + MUL b4, a3, t4 +#ifndef TRMMKERNEL + LD a3, 2 * SIZE(C2) +#else + unop +#endif + + ADD c11, t1, c11 + unop + MUL alpha, c01, c01 +#ifndef TRMMKERNEL + LD b4, 3 * SIZE(C2) +#else + unop +#endif + + ADD c12, t2, c12 + unop + MUL alpha, c02, c02 +#ifndef TRMMKERNEL + LD t1, 1 * SIZE(C3) +#else + unop +#endif + + ADD c16, t3, c16 + unop + MUL alpha, c03, c03 +#ifndef TRMMKERNEL + LD t2, 2 * SIZE(C3) +#else + unop +#endif + + ADD c15, t4, c15 + unop + MUL alpha, c04, c04 +#ifndef TRMMKERNEL + LD t3, 3 * SIZE(C3) +#else + unop +#endif + + MUL alpha, c05, c05 + unop +#ifndef TRMMKERNEL + ADD c01, a5, c01 + LD t4, 1 * SIZE(C4) +#else + unop + unop +#endif + + MUL alpha, c06, c06 +#ifndef TRMMKERNEL + unop + ADD c02, b5, c02 + LD a5, 2 * SIZE(C4) +#endif + + MUL alpha, c07, c07 +#ifndef TRMMKERNEL + unop + ADD c03, a2, c03 + LD b5, 3 * SIZE(C4) +#endif + + MUL alpha, c08, c08 +#ifndef TRMMKERNEL + unop + ADD c04, b2, c04 + unop +#endif + + MUL alpha, c09, c09 + ST c01, 0 * SIZE(C1) +#ifndef TRMMKERNEL + ADD c05, b1, c05 + unop +#endif + + MUL alpha, c10, c10 + ST c02, 1 * SIZE(C1) +#ifndef TRMMKERNEL + ADD c06, a4, c06 + unop +#endif + + MUL alpha, c11, c11 + ST c03, 2 * SIZE(C1) +#ifndef TRMMKERNEL + ADD c07, a3, c07 + unop +#endif + + MUL alpha, c12, c12 + ST c04, 3 * SIZE(C1) +#ifndef TRMMKERNEL + ADD c08, b4, c08 +#else + unop +#endif + lda C1, 4 * SIZE(C1) + + MUL alpha, c13, c13 + ST c05, 0 * SIZE(C2) +#ifndef TRMMKERNEL + ADD c09, a1, c09 + unop +#endif + + MUL alpha, c14, c14 + ST c06, 1 * SIZE(C2) +#ifndef TRMMKERNEL + ADD c10, t1, c10 + unop +#endif + + MUL alpha, c15, c15 + ST c07, 2 * SIZE(C2) +#ifndef TRMMKERNEL + ADD c11, t2, c11 + unop +#endif + + MUL alpha, c16, c16 + ST c08, 3 * SIZE(C2) +#ifndef TRMMKERNEL + ADD c12, t3, c12 +#else + unop +#endif + lda C2, 4 * SIZE(C2) + +#ifndef TRMMKERNEL + ADD c13, b3, c13 +#else + unop +#endif + ST c09, 0 * SIZE(C3) + fclr t1 + lda C4, 4 * SIZE(C4) + +#ifndef TRMMKERNEL + ADD c14, t4, c14 +#else + unop +#endif + ST c10, 1 * SIZE(C3) + fclr t2 + unop + +#ifndef TRMMKERNEL + ADD c15, a5, c15 +#else + unop +#endif + ST c11, 2 * SIZE(C3) + fclr t3 + unop + +#ifndef TRMMKERNEL + ADD c16, b5, c16 +#else + unop +#endif + ST c12, 3 * SIZE(C3) + fclr t4 + lda C3, 4 * SIZE(C3) + + ST c13, -4 * SIZE(C4) + ST c14, -3 * SIZE(C4) + ST c15, -2 * SIZE(C4) + ST c16, -1 * SIZE(C4) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subq K, KK, TMP1 +#ifdef LEFT + subq TMP1, 4, TMP1 +#else + subq TMP1, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP1 + addq AO, TMP1, AO + addq BO, TMP1, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq KK, 4, KK +#endif + + bgt I, $L11 + .align 4 + +$L20: + and M, 2, I + ble I, $L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addq KK, 2, TMP1 +#else + addq KK, 4, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(B) +#ifndef TRMMKERNEL + lda L, -2(K) +#else + lda L, -2(TMP1) +#endif + LD b2, 1 * SIZE(B) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c01 + LD b4, 3 * SIZE(B) + fclr c05 + + lda BO, 4 * SIZE(B) + fclr c02 + fclr c06 + ble L, $L25 + +#else + sll KK, BASE_SHIFT + 1, TMP1 + addq AO, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(BO) + lda L, -2(TMP1) + LD b2, 1 * SIZE(BO) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c01 + LD b4, 3 * SIZE(BO) + fclr c05 + + lda BO, 4 * SIZE(BO) + fclr c02 + fclr c06 + ble L, $L25 +#endif + .align 4 + +$L22: + ADD c09, t1, c09 + unop + MUL a1, b1, t1 + unop + + ADD c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a1, b2, t3 + lda BO, 8 * SIZE(BO) + + ADD c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + unop + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD c06, t4, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + + ADD c09, t1, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a3, b2, t3 + lda AO, 4 * SIZE(AO) + + ADD c14, t4, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD c01, t1, c01 + lda L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, c06 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD c09, t1, c09 + ldt alpha, ALPHA + MUL a1, b1, t1 +#ifndef TRMMKERNEL + blbs K, $L28 +#else + blbs TMP1, $L28 +#endif + + ADD c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a1, b2, t3 + unop + + ADD c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + lda AO, 2 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD c09, t1, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L28: + ADD c10, t2, c10 + unop + MUL a2, b1, t2 +#ifndef TRMMKERNEL + LD a3, 0 * SIZE(C1) +#else + unop +#endif + + ADD c13, t3, c13 + unop + MUL a1, b2, t3 +#ifndef TRMMKERNEL + LD a4, 1 * SIZE(C1) +#else + unop +#endif + + ADD c14, t4, c14 + unop + MUL a2, b2, t4 +#ifndef TRMMKERNEL + LD a5, 0 * SIZE(C2) +#else + unop +#endif + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 +#ifndef TRMMKERNEL + LD b5, 1 * SIZE(C2) +#else + unop +#endif + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 +#ifndef TRMMKERNEL + LD b1, 0 * SIZE(C3) +#else + unop +#endif + + ADD c05, t3, c05 + unop + MUL a1, b4, t3 +#ifndef TRMMKERNEL + LD b2, 1 * SIZE(C3) +#else + unop +#endif + + ADD c06, t4, c06 + unop + MUL a2, b4, t4 +#ifndef TRMMKERNEL + LD b3, 0 * SIZE(C4) +#else + unop +#endif + + ADD c09, t1, c09 + unop + MUL alpha, c01, c01 +#ifndef TRMMKERNEL + LD b4, 1 * SIZE(C4) +#else + unop +#endif + + ADD c10, t2, c10 + unop + MUL alpha, c02, c02 + unop + + ADD c13, t3, c13 + MUL alpha, c05, c05 + ADD c14, t4, c14 + MUL alpha, c06, c06 + + MUL alpha, c09, c09 +#ifndef TRMMKERNEL + ADD c01, a3, c01 +#endif + MUL alpha, c10, c10 +#ifndef TRMMKERNEL + ADD c02, a4, c02 +#endif + + MUL alpha, c13, c13 +#ifndef TRMMKERNEL + ADD c05, a5, c05 +#endif + MUL alpha, c14, c14 +#ifndef TRMMKERNEL + ADD c06, b5, c06 +#endif + +#ifndef TRMMKERNEL + ADD c09, b1, c09 + unop +#endif + ST c01, 0 * SIZE(C1) + fclr t1 + +#ifndef TRMMKERNEL + ADD c10, b2, c10 + unop +#endif + ST c02, 1 * SIZE(C1) + fclr t2 + +#ifndef TRMMKERNEL + ADD c13, b3, c13 + unop +#endif + ST c05, 0 * SIZE(C2) + fclr t3 + +#ifndef TRMMKERNEL + ADD c14, b4, c14 + unop +#endif + ST c06, 1 * SIZE(C2) + fclr t4 + + ST c09, 0 * SIZE(C3) + lda C1, 2 * SIZE(C1) + ST c10, 1 * SIZE(C3) + lda C2, 2 * SIZE(C2) + + ST c13, 0 * SIZE(C4) + lda C3, 2 * SIZE(C3) + ST c14, 1 * SIZE(C4) + lda C4, 2 * SIZE(C4) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subq K, KK, TMP1 +#ifdef LEFT + subq TMP1, 2, TMP1 +#else + subq TMP1, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq KK, 2, KK +#endif + .align 4 + +$L30: + and M, 1, I + ble I, $L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addq KK, 1, TMP1 +#else + addq KK, 4, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) +#ifndef TRMMKERNEL + lda L, -2(K) +#else + lda L, -2(TMP1) +#endif + LD b2, 1 * SIZE(B) + lda AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c09 + LD b4, 3 * SIZE(B) + fclr c13 + + lda BO, 4 * SIZE(B) + ble L, $L35 +#else + sll KK, BASE_SHIFT + 0, TMP1 + addq AO, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + lda L, -2(TMP1) + LD b2, 1 * SIZE(BO) + lda AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c09 + LD b4, 3 * SIZE(BO) + fclr c13 + + lda BO, 4 * SIZE(BO) + ble L, $L35 +#endif + .align 4 + +$L32: + ADD c01, t1, c01 + lda L, -2(L) + MUL a1, b1, t1 + LD b1, 0 * SIZE(BO) + + ADD c05, t2, c05 + lda AO, 2 * SIZE(AO) + MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, c09 + LD b5, 3 * SIZE(BO) + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a1, b4, t4 + LD a1, -1 * SIZE(AO) + + ADD c01, t1, c01 + MUL a2, b1, t1 + LD b1, 4 * SIZE(BO) + lda BO, 8 * SIZE(BO) + + ADD c05, t2, c05 + MUL a2, b2, t2 + LD b2, -3 * SIZE(BO) + + ADD c09, t3, c09 + LD b4, -1 * SIZE(BO) + MUL a2, b3, t3 + LD b3, -2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a2, b5, t4 + LD a2, 0 * SIZE(AO) + bgt L, $L32 + .align 4 + +$L35: + ADD c01, t1, c01 + ldt alpha, ALPHA + MUL a1, b1, t1 +#ifndef TRMMKERNEL + blbs K, $L38 +#else + blbs TMP1, $L38 +#endif + .align 4 + + ADD c05, t2, c05 + LD b1, 0 * SIZE(BO) + MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, c09 + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a1, b4, t4 + LD a1, 0 * SIZE(AO) + lda AO, 1 * SIZE(AO) + + ADD c01, t1, c01 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L38: + ADD c05, t2, c05 + unop + MUL a1, b2, t2 +#ifndef TRMMKERNEL + LD a5, 0 * SIZE(C1) +#else + unop +#endif + + ADD c09, t3, c09 + unop + MUL a1, b3, t3 +#ifndef TRMMKERNEL + LD b5, 0 * SIZE(C2) +#else + unop +#endif + + ADD c13, t4, c13 + unop + MUL a1, b4, t4 +#ifndef TRMMKERNEL + LD a2, 0 * SIZE(C3) +#else + unop +#endif + + ADD c01, t1, c01 + unop + MUL alpha, c01, c01 +#ifndef TRMMKERNEL + LD a3, 0 * SIZE(C4) +#else + unop +#endif + + ADD c05, t2, c05 + unop + MUL alpha, c05, c05 + unop + + ADD c09, t3, c09 + MUL alpha, c09, c09 + ADD c13, t4, c13 + MUL alpha, c13, c13 + +#ifndef TRMMKERNEL + ADD c01, a5, c01 + ADD c05, b5, c05 + ADD c09, a2, c09 + ADD c13, a3, c13 +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c09, 0 * SIZE(C3) + ST c13, 0 * SIZE(C4) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subq K, KK, TMP1 +#ifdef LEFT + subq TMP1, 1, TMP1 +#else + subq TMP1, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq KK, 1, KK +#endif + .align 4 + +$L39: + mov BO, B + lda J, -1(J) +#if defined(TRMMKERNEL) && !defined(LEFT) + addq KK, 4, KK +#else + unop +#endif + bgt J, $L01 + .align 4 + +$L40: + and N, 2, J + ble J, $L80 + + mov C, C1 + addq C, LDC, C2 + mov A, AO + fclr t1 + addq C2, LDC, C + fclr t2 + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + sra M, 2, I + fclr t3 + fclr t4 + ble I, $L60 + .align 4 + +$L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addq KK, 4, TMP1 +#else + addq KK, 2, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + +#ifndef TRMMKERNEL + lda L, -2(K) +#else + lda L, -2(TMP1) +#endif + lda BO, 2 * SIZE(B) + lda AO, 4 * SIZE(AO) + ble L, $L55 +#else + sll KK, BASE_SHIFT + 2, TMP1 + addq AO, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP2 + addq B, TMP2, BO + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + lda L, -2(TMP1) + lda BO, 2 * SIZE(BO) + lda AO, 4 * SIZE(AO) + ble L, $L55 +#endif + .align 4 + +$L52: + ADD c05, t1, c05 + unop + MUL a1, b1, t1 + unop + + ADD c06, t2, c06 + lda L, -2(L) + MUL a2, b1, t2 + unop + + ADD c07, t3, c07 + unop + MUL a3, b1, t3 + unop + + ADD c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, c02 + lda BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + + ADD c05, t1, c05 + unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD c06, t2, c06 + unop + MUL a2, b3, t2 + unop + + ADD c07, t3, c07 + unop + MUL a3, b3, t3 + lda AO, 8 * SIZE(AO) + + ADD c08, t4, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L52 + .align 4 + +$L55: + ADD c05, t1, c05 + ldt alpha, ALPHA + MUL a1, b1, t1 +#ifndef TRMMKERNEL + blbs K, $L58 +#else + blbs TMP1, $L58 +#endif + .align 4 + + ADD c06, t2, c06 + MUL a2, b1, t2 + ADD c07, t3, c07 + MUL a3, b1, t3 + + ADD c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + lda AO, 4 * SIZE(AO) + + ADD c05, t1, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 2 * SIZE(BO) + .align 4 + +$L58: + ADD c06, t2, c06 + unop + MUL a2, b1, t2 +#ifndef TRMMKERNEL + LD c09, 0 * SIZE(C1) +#else + unop +#endif + + ADD c07, t3, c07 + unop + MUL a3, b1, t3 +#ifndef TRMMKERNEL + LD c10, 1 * SIZE(C1) +#else + unop +#endif + + ADD c08, t4, c08 + unop + MUL a4, b1, t4 +#ifndef TRMMKERNEL + LD c11, 2 * SIZE(C1) +#else + unop +#endif + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 +#ifndef TRMMKERNEL + LD c12, 3 * SIZE(C1) +#else + unop +#endif + + ADD c02, t2, c02 + unop + MUL a2, b2, t2 +#ifndef TRMMKERNEL + LD c13, 0 * SIZE(C2) + unop +#endif + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 +#ifndef TRMMKERNEL + LD c14, 1 * SIZE(C2) +#else + unop +#endif + + ADD c04, t4, c04 + unop + MUL a4, b2, t4 +#ifndef TRMMKERNEL + LD c15, 2 * SIZE(C2) +#else + unop +#endif + + ADD c05, t1, c05 + unop + MUL alpha, c01, c01 +#ifndef TRMMKERNEL + LD c16, 3 * SIZE(C2) +#else + unop +#endif + + ADD c06, t2, c06 + lda I, -1(I) + MUL alpha, c02, c02 + unop + + ADD c07, t3, c07 + MUL alpha, c03, c03 + ADD c08, t4, c08 + MUL alpha, c04, c04 + + MUL alpha, c05, c05 +#ifndef TRMMKERNEL + ADD c01, c09, c01 +#endif + MUL alpha, c06, c06 +#ifndef TRMMKERNEL + ADD c02, c10, c02 +#endif + + MUL alpha, c07, c07 +#ifndef TRMMKERNEL + ADD c03, c11, c03 +#endif + MUL alpha, c08, c08 +#ifndef TRMMKERNEL + ADD c04, c12, c04 +#endif + +#ifndef TRMMKERNEL + ADD c05, c13, c05 +#endif + ST c01, 0 * SIZE(C1) +#ifndef TRMMKERNEL + ADD c06, c14, c06 +#endif + ST c02, 1 * SIZE(C1) + +#ifndef TRMMKERNEL + ADD c07, c15, c07 +#endif + ST c03, 2 * SIZE(C1) +#ifndef TRMMKERNEL + ADD c08, c16, c08 +#endif + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) + fclr t1 + ST c06, 1 * SIZE(C2) + fclr t2 + ST c07, 2 * SIZE(C2) + fclr t3 + ST c08, 3 * SIZE(C2) + fclr t4 + + lda C1, 4 * SIZE(C1) + lda C2, 4 * SIZE(C2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subq K, KK, TMP1 +#ifdef LEFT + subq TMP1, 4, TMP1 +#else + subq TMP1, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq KK, 4, KK +#endif + bgt I, $L51 + .align 4 + +$L60: + and M, 2, I + ble I, $L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addq KK, 2, TMP1 +#else + addq KK, 2, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) +#ifndef TRMMKERNEL + lda L, -2(K) +#else + lda L, -2(TMP1) +#endif + LD b2, 1 * SIZE(B) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + lda BO, 2 * SIZE(B) + ble L, $L65 +#else + sll KK, BASE_SHIFT + 1, TMP1 + addq AO, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP2 + addq B, TMP2, BO + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + lda L, -2(TMP1) + LD b2, 1 * SIZE(BO) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + lda BO, 2 * SIZE(BO) + ble L, $L65 +#endif + .align 4 + +$L62: + ADD c01, t1, c01 + unop + MUL a1, b1, t1 + unop + + ADD c02, t2, c02 + lda AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD b1, 2 * SIZE(BO) + + ADD c05, t3, c05 + lda L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD c01, t1, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + lda BO, 4 * SIZE(BO) + + ADD c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L62 + .align 4 + +$L65: + ADD c01, t1, c01 + ldt alpha, ALPHA + MUL a1, b1, t1 +#ifndef TRMMKERNEL + blbs K, $L68 +#else + blbs TMP1, $L68 +#endif + .align 4 + + ADD c02, t2, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c05, t3, c05 + lda BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD c01, t1, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + lda AO, 2 * SIZE(AO) + .align 4 + +$L68: + ADD c02, t2, c02 + unop + MUL a2, b1, t2 +#ifndef TRMMKERNEL + LD c09, 0 * SIZE(C1) +#else + unop +#endif + + ADD c05, t3, c05 + unop + MUL a1, b2, t3 +#ifndef TRMMKERNEL + LD c10, 1 * SIZE(C1) +#else + unop +#endif + + ADD c06, t4, c06 + unop + MUL a2, b2, t4 +#ifndef TRMMKERNEL + LD c11, 0 * SIZE(C2) +#else + unop +#endif + + ADD c01, t1, c01 + unop + MUL alpha, c01, c01 +#ifndef TRMMKERNEL + LD c12, 1 * SIZE(C2) +#else + unop +#endif + + ADD c02, t2, c02 + lda C1, 2 * SIZE(C1) + MUL alpha, c02, c02 + lda C2, 2 * SIZE(C2) + + ADD c05, t3, c05 + MUL alpha, c05, c05 + ADD c06, t4, c06 + MUL alpha, c06, c06 + +#ifndef TRMMKERNEL + ADD c01, c09, c01 + ADD c02, c10, c02 + ADD c05, c11, c05 + ADD c06, c12, c06 +#endif + + ST c01, -2 * SIZE(C1) + fclr t1 + ST c02, -1 * SIZE(C1) + fclr t2 + ST c05, -2 * SIZE(C2) + fclr t3 + ST c06, -1 * SIZE(C2) + fclr t4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subq K, KK, TMP1 +#ifdef LEFT + subq TMP1, 2, TMP1 +#else + subq TMP1, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq KK, 2, KK +#endif + .align 4 + +$L70: + and M, 1, I + ble I, $L79 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addq KK, 1, TMP1 +#else + addq KK, 2, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) + fclr c02 + LD b2, 1 * SIZE(B) + fclr c06 + +#ifndef TRMMKERNEL + lda L, -2(K) +#else + lda L, -2(TMP1) +#endif + + LD b3, 2 * SIZE(B) + lda AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(B) + lda BO, 2 * SIZE(B) + ble L, $L75 +#else + sll KK, BASE_SHIFT + 0, TMP1 + addq AO, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP2 + addq B, TMP2, BO + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + fclr c02 + LD b2, 1 * SIZE(BO) + fclr c06 + +#ifndef TRMMKERNEL + lda L, -2(K) +#else + lda L, -2(TMP1) +#endif + + LD b3, 2 * SIZE(BO) + lda AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(BO) + lda BO, 2 * SIZE(BO) + ble L, $L75 +#endif + .align 4 + +$L72: + ADD c01, t1, c01 + lda L, -2(L) + MUL a1, b1, t1 + LD b1, 2 * SIZE(BO) + + ADD c05, t2, c05 + MUL a1, b2, t2 + LD a1, 1 * SIZE(AO) + LD b2, 3 * SIZE(BO) + + ADD c02, t3, c02 + lda AO, 2 * SIZE(AO) + MUL a2, b3, t3 + LD b3, 4 * SIZE(BO) + + ADD c06, t4, c06 + MUL a2, b4, t4 + LD a2, 0 * SIZE(AO) + LD b4, 5 * SIZE(BO) + + lda BO, 4 * SIZE(BO) + unop + unop + bgt L, $L72 + .align 4 + +$L75: + ADD c01, t1, c01 + ldt alpha, ALPHA + MUL a1, b1, t1 +#ifndef TRMMKERNEL + blbs K, $L78 +#else + blbs TMP1, $L78 +#endif + .align 4 + + ADD c05, t2, c05 + MUL a1, b2, t2 + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + ADD c01, t1, c01 + LD b2, 1 * SIZE(BO) + lda AO, 1 * SIZE(AO) + MUL a1, b1, t1 + lda BO, 2 * SIZE(BO) + .align 4 + +$L78: + ADD c05, t2, c05 + MUL a1, b2, t2 +#ifndef TRMMKERNEL + LD a5, 0 * SIZE(C1) +#else + unop +#endif + + ADD c02, t3, c02 + ADD c06, t4, c06 +#ifndef TRMMKERNEL + LD b5, 0 * SIZE(C2) +#else + unop +#endif + + ADD c01, c02, c01 + ADD c05, c06, c05 + + ADD c01, t1, c01 + ADD c05, t2, c05 + + MUL alpha, c01, c01 + MUL alpha, c05, c05 + +#ifndef TRMMKERNEL + ADD c01, a5, c01 + ADD c05, b5, c05 +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subq K, KK, TMP1 +#ifdef LEFT + subq TMP1, 1, TMP1 +#else + subq TMP1, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq KK, 1, KK +#endif + .align 4 + +$L79: + mov BO, B +#if defined(TRMMKERNEL) && !defined(LEFT) + addq KK, 2, KK +#else + unop +#endif + unop + unop + .align 4 + +$L80: + and N, 1, J + ble J, $L999 + + mov C, C1 + mov A, AO + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + sra M, 2, I + ble I, $L100 + .align 4 + +$L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addq KK, 4, TMP1 +#else + addq KK, 1, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + +#ifndef TRMMKERNEL + sra K, 2, L +#else + sra TMP1, 2, L +#endif + mov B, BO + unop + ble L, $L95 +#else + sll KK, BASE_SHIFT + 2, TMP1 + addq AO, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP2 + addq B, TMP2, BO + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + +#ifndef TRMMKERNEL + sra K, 2, L +#else + sra TMP1, 2, L +#endif + unop + ble L, $L95 +#endif + .align 5 + +$L92: + ADD c01, t1, c01 + unop + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + lda L, -1(L) + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b1, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b1, t4 + LD a4, 7 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 8 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 9 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 10 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a4, 11 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + LD a1, 12 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD a2, 13 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b3, t3 + LD a3, 14 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b3, t4 + LD a5, 15 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c01, t1, c01 + MUL a1, b4, t1 + LD a1, 16 * SIZE(AO) + lda AO, 16 * SIZE(AO) + + ADD c02, t2, c02 + lda BO, 4 * SIZE(BO) + MUL a2, b4, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L92 + .align 4 + +$L95: +#ifndef TRMMKERNEL + and K, 3, L +#else + and TMP1, 3, L +#endif + ldt alpha, ALPHA + unop + ble L, $L98 + .align 4 + +$L96: + ADD c01, t1, c01 + lda L, -1(L) + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + lda BO, 1 * SIZE(BO) + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b1, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b1, t4 + LD a4, 7 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + lda AO, 4 * SIZE(AO) + bgt L, $L96 + .align 4 + +$L98: +#ifndef TRMMKERNEL + ADD c01, t1, c01 + LD c05, 0 * SIZE(C1) + ADD c02, t2, c02 + LD c06, 1 * SIZE(C1) + ADD c03, t3, c03 + LD c07, 2 * SIZE(C1) + ADD c04, t4, c04 + LD c08, 3 * SIZE(C1) +#else + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 +#endif + + MUL alpha, c01, c01 + MUL alpha, c02, c02 + MUL alpha, c03, c03 + MUL alpha, c04, c04 + +#ifndef TRMMKERNEL + ADD c01, c05, c01 + ADD c02, c06, c02 + ADD c03, c07, c03 + ADD c04, c08, c04 +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + lda C1, 4 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subq K, KK, TMP1 +#ifdef LEFT + subq TMP1, 4, TMP1 +#else + subq TMP1, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addq BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq KK, 4, KK +#endif + + lda I, -1(I) + bgt I, $L91 + .align 4 + +$L100: + and M, 2, I + unop + unop + ble I, $L110 + .align 4 + +$L101: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addq KK, 2, TMP1 +#else + addq KK, 1, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + +#ifndef TRMMKERNEL + sra K, 2, L +#else + sra TMP1, 2, L +#endif + mov B, BO + unop + ble L, $L105 +#else + sll KK, BASE_SHIFT + 1, TMP1 + addq AO, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP2 + addq B, TMP2, BO + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + +#ifndef TRMMKERNEL + sra K, 2, L +#else + sra TMP1, 2, L +#endif + unop + ble L, $L105 +#endif + .align 5 + +$L102: + ADD c01, t1, c01 + lda L, -1(L) + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c03, t3, c03 + lda BO, 4 * SIZE(BO) + MUL a3, b2, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a5, 7 * SIZE(AO) + LD b2, 1 * SIZE(BO) + + ADD c01, t1, c01 + MUL a1, b3, t1 + LD a1, 8 * SIZE(AO) + lda AO, 8 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L102 + .align 4 + +$L105: +#ifndef TRMMKERNEL + and K, 3, L +#else + and TMP1, 3, L +#endif + ldt alpha, ALPHA +#ifndef TRMMKERNEL + LD a3, 0 * SIZE(C1) + LD a4, 1 * SIZE(C1) +#endif + ble L, $L108 + .align 4 + +$L106: + ADD c01, t1, c01 + lda L, -1(L) + MUL a1, b1, t1 + LD a1, 2 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b1, t2 + LD a2, 3 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + lda AO, 2 * SIZE(AO) + unop + lda BO, 1 * SIZE(BO) + bgt L, $L106 + .align 4 + +$L108: + ADD c01, t1, c01 + fclr t1 + ADD c02, t2, c02 + fclr t2 + ADD c03, t3, c03 + fclr t3 + ADD c04, t4, c04 + fclr t4 + + ADD c01, c03, c01 + ADD c02, c04, c02 + + MUL alpha, c01, c01 + MUL alpha, c02, c02 + +#ifndef TRMMKERNEL + ADD c01, a3, c01 + ADD c02, a4, c02 +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + lda C1, 2 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subq K, KK, TMP1 +#ifdef LEFT + subq TMP1, 2, TMP1 +#else + subq TMP1, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addq BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq KK, 2, KK +#endif + .align 4 + +$L110: + and M, 1, I + ble I, $L999 + .align 4 + +$L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addq KK, 1, TMP1 +#else + addq KK, 1, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + +#ifndef TRMMKERNEL + sra K, 2, L +#else + sra TMP1, 2, L +#endif + mov B, BO + unop + ble L, $L115 +#else + sll KK, BASE_SHIFT + 0, TMP1 + addq AO, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP2 + addq B, TMP2, BO + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + +#ifndef TRMMKERNEL + sra K, 2, L +#else + sra TMP1, 2, L +#endif + unop + ble L, $L115 +#endif + .align 4 + +$L112: + ADD c01, t1, c01 + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c02, t2, c02 + MUL a2, b2, t2 + LD a2, 5 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c03, t3, c03 + MUL a3, b3, t3 + LD a3, 6 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c04, t4, c04 + MUL a4, b4, t4 + LD a4, 7 * SIZE(AO) + LD b4, 7 * SIZE(BO) + + lda L, -1(L) + lda AO, 4 * SIZE(AO) + lda BO, 4 * SIZE(BO) + bgt L, $L112 + .align 4 + +$L115: +#ifndef TRMMKERNEL + and K, 3, L +#else + and TMP1, 3, L +#endif + ldt alpha, ALPHA +#ifndef TRMMKERNEL + LD a2, 0 * SIZE(C1) +#endif + ble L, $L118 + .align 4 + +$L116: + ADD c01, t1, c01 + MUL a1, b1, t1 + LD a1, 1 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + lda L, -1(L) + lda AO, 1 * SIZE(AO) + lda BO, 1 * SIZE(BO) + bgt L, $L116 + .align 4 + +$L118: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + + ADD c01, c02, c01 + ADD c03, c04, c03 + ADD c01, c03, c01 + + MUL alpha, c01, c01 +#ifndef TRMMKERNEL + ADD c01, a2, c01 +#endif + ST c01, 0 * SIZE(C1) + .align 4 + +$L999: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + ldt $f9, 56($sp) + clr $0 + lda $sp, STACKSIZE($sp) + ret + EPILOGUE diff --git a/kernel/alpha/gemv_n.S b/kernel/alpha/gemv_n.S new file mode 100644 index 0000000..665b217 --- /dev/null +++ b/kernel/alpha/gemv_n.S @@ -0,0 +1,1307 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define STACKSIZE 64 +#define PREFETCHSIZE 32 + +#define M $16 +#define N $17 +#define A $20 +#define LDA $21 + +#define X $18 +#define INCX $19 +#define Y $22 +#define INCY $23 + +#define BUFFER $24 + +#define I $25 +#define J $27 + +#define Y1 $4 + +#define A1 $5 +#define A2 $6 +#define A3 $7 +#define A4 $8 + +#define alpha $f19 + +#define alpha1 $f0 +#define alpha2 $f1 +#define alpha3 $f10 +#define alpha4 $f11 + +#define y0 $f12 +#define y1 $f13 +#define y2 $f14 +#define y3 $f15 + +#define y4 $f16 +#define y5 $f17 +#define y6 $f18 +#define y7 $f21 + +#define a0 $f22 +#define a1 $f23 +#define a2 $f24 +#define a3 $f25 +#define a4 $f26 +#define a5 $f27 +#define a6 $f28 +#define a7 $f29 + +#define a8 $f2 +#define a9 $f3 +#define a10 $f4 +#define a11 $f5 +#define a12 $f6 +#define a13 $f7 +#define a14 $f8 +#define a15 $f9 + + PROLOGUE + + lda $sp, -STACKSIZE($sp) + ldq X, 0 + STACKSIZE($sp) + ldq INCX, 8 + STACKSIZE($sp) + ldq Y, 16 + STACKSIZE($sp) + ldq INCY, 24 + STACKSIZE($sp) + ldq BUFFER, 32 + STACKSIZE($sp) + + stt $f2, 0($sp) + stt $f3, 8($sp) + stt $f4, 16($sp) + stt $f5, 24($sp) + stt $f6, 32($sp) + stt $f7, 40($sp) + stt $f8, 48($sp) + stt $f9, 56($sp) + + PROFCODE + + cmple M, 0, $0 + SXADDQ INCX, 0, INCX + cmple N, 0, $1 + SXADDQ INCY, 0, INCY + + or $0, $1, $0 + bne $0, $L999 + + SXADDQ LDA, 0, LDA + + cmpeq INCY, SIZE, $0 + bne $0, $L10 + + mov BUFFER, Y1 + + mov Y, BUFFER + mov Y1, Y + + sra M, 3, I + ble I, $L05 + .align 4 + +$L02: + ST $f31, 0 * SIZE(Y1) + ST $f31, 1 * SIZE(Y1) + ST $f31, 2 * SIZE(Y1) + ST $f31, 3 * SIZE(Y1) + ST $f31, 4 * SIZE(Y1) + ST $f31, 5 * SIZE(Y1) + ST $f31, 6 * SIZE(Y1) + ST $f31, 7 * SIZE(Y1) + + lda Y1, 8 * SIZE(Y1) + lda I, -1(I) + bgt I, $L02 + .align 4 + +$L05: + and M, 7, I + ble I, $L10 + .align 4 + +$L06: + ST $f31, 0 * SIZE(Y1) + addq Y1, SIZE, Y1 + + lda I, -1(I) + bgt I, $L06 + .align 4 + +$L10: + sra N, 2, J + ble J, $L20 + .align 4 + +$L11: + LD alpha1, 0 * SIZE(X) + addq X, INCX, X + LD alpha2, 0 * SIZE(X) + addq X, INCX, X + LD alpha3, 0 * SIZE(X) + addq X, INCX, X + LD alpha4, 0 * SIZE(X) + addq X, INCX, X + + MUL alpha, alpha1, alpha1 + MUL alpha, alpha2, alpha2 + MUL alpha, alpha3, alpha3 + MUL alpha, alpha4, alpha4 + + mov A, A1 + addq A, LDA, A2 + addq A2, LDA, A3 + addq A3, LDA, A4 + s4addq LDA, A, A + + mov Y, Y1 + ldl $31, 4 * SIZE(X) + + sra M, 3, I + ble I, $L15 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + LD a4, 0 * SIZE(A2) + LD a5, 1 * SIZE(A2) + LD a6, 2 * SIZE(A2) + LD a7, 3 * SIZE(A2) + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + LD y2, 2 * SIZE(Y1) + LD y3, 3 * SIZE(Y1) + + LD a8, 0 * SIZE(A3) + LD a9, 1 * SIZE(A3) + LD a10, 2 * SIZE(A3) + LD a11, 3 * SIZE(A3) + + LD y4, 4 * SIZE(Y1) + LD y5, 5 * SIZE(Y1) + LD y6, 6 * SIZE(Y1) + LD y7, 7 * SIZE(Y1) + + MUL alpha1, a0, a0 + LD a12, 0 * SIZE(A4) + MUL alpha1, a1, a1 + LD a13, 1 * SIZE(A4) + MUL alpha1, a2, a2 + LD a14, 2 * SIZE(A4) + MUL alpha1, a3, a3 + LD a15, 3 * SIZE(A4) + + ADD y0, a0, y0 + LD a0, 4 * SIZE(A1) + MUL alpha2, a4, a4 + unop + + ADD y1, a1, y1 + LD a1, 5 * SIZE(A1) + MUL alpha2, a5, a5 + unop + + ADD y2, a2, y2 + LD a2, 6 * SIZE(A1) + MUL alpha2, a6, a6 + unop + + ADD y3, a3, y3 + LD a3, 7 * SIZE(A1) + MUL alpha2, a7, a7 + unop + + ADD y0, a4, y0 + LD a4, 4 * SIZE(A2) + MUL alpha3, a8, a8 + unop + + ADD y1, a5, y1 + LD a5, 5 * SIZE(A2) + MUL alpha3, a9, a9 + lda I, -1(I) + + ADD y2, a6, y2 + LD a6, 6 * SIZE(A2) + MUL alpha3, a10, a10 + unop + + ADD y3, a7, y3 + LD a7, 7 * SIZE(A2) + MUL alpha3, a11, a11 + unop + + ADD y0, a8, y0 + LD a8, 4 * SIZE(A3) + MUL alpha4, a12, a12 + ble I, $L13 + .align 4 + +$L12: + ADD y1, a9, y1 + LD a9, 5 * SIZE(A3) + MUL alpha4, a13, a13 + ldl $31, (PREFETCHSIZE + 0) * SIZE(A1) + + ADD y2, a10, y2 + LD a10, 6 * SIZE(A3) + MUL alpha4, a14, a14 + unop + + ADD y3, a11, y3 + LD a11, 7 * SIZE(A3) + MUL alpha4, a15, a15 + lda I, -1(I) + + ADD y0, a12, y0 + LD a12, 4 * SIZE(A4) + MUL alpha1, a0, a0 + lds $f31, (PREFETCHSIZE + 0) * SIZE(Y1) + + ADD y1, a13, y1 + LD a13, 5 * SIZE(A4) + MUL alpha1, a1, a1 + unop + + ADD y2, a14, y2 + LD a14, 6 * SIZE(A4) + MUL alpha1, a2, a2 + unop + + ADD y3, a15, y3 + LD a15, 7 * SIZE(A4) + MUL alpha1, a3, a3 + ldl $31, (PREFETCHSIZE + 0) * SIZE(A2) + + ADD y4, a0, y4 + ST y0, 0 * SIZE(Y1) + MUL alpha2, a4, a4 + LD a0, 8 * SIZE(A1) + + ADD y5, a1, y5 + ST y1, 1 * SIZE(Y1) + MUL alpha2, a5, a5 + LD a1, 9 * SIZE(A1) + + ADD y6, a2, y6 + ST y2, 2 * SIZE(Y1) + MUL alpha2, a6, a6 + LD a2, 10 * SIZE(A1) + + ADD y7, a3, y7 + ST y3, 3 * SIZE(Y1) + MUL alpha2, a7, a7 + LD a3, 11 * SIZE(A1) + + ADD y4, a4, y4 + LD a4, 8 * SIZE(A2) + MUL alpha3, a8, a8 + LD y0, 8 * SIZE(Y1) + + ADD y5, a5, y5 + LD a5, 9 * SIZE(A2) + MUL alpha3, a9, a9 + LD y1, 9 * SIZE(Y1) + + ADD y6, a6, y6 + LD a6, 10 * SIZE(A2) + MUL alpha3, a10, a10 + LD y2, 10 * SIZE(Y1) + + ADD y7, a7, y7 + LD a7, 11 * SIZE(A2) + MUL alpha3, a11, a11 + LD y3, 11 * SIZE(Y1) + + ADD y4, a8, y4 + LD a8, 8 * SIZE(A3) + MUL alpha4, a12, a12 + ldl $31, (PREFETCHSIZE + 0) * SIZE(A3) + + ADD y5, a9, y5 + LD a9, 9 * SIZE(A3) + MUL alpha4, a13, a13 + lda A1, 8 * SIZE(A1) + + ADD y6, a10, y6 + LD a10, 10 * SIZE(A3) + MUL alpha4, a14, a14 + lda A2, 8 * SIZE(A2) + + ADD y7, a11, y7 + LD a11, 11 * SIZE(A3) + MUL alpha4, a15, a15 + lda Y1, 8 * SIZE(Y1) + + ADD y4, a12, y4 + LD a12, 8 * SIZE(A4) + MUL alpha1, a0, a0 + unop + + ADD y5, a13, y5 + LD a13, 9 * SIZE(A4) + MUL alpha1, a1, a1 + lda A3, 8 * SIZE(A3) + + ADD y6, a14, y6 + LD a14, 10 * SIZE(A4) + MUL alpha1, a2, a2 + ldl $31, (PREFETCHSIZE + 0) * SIZE(A4) + + ADD y7, a15, y7 + LD a15, 11 * SIZE(A4) + MUL alpha1, a3, a3 + lda A4, 8 * SIZE(A4) + + ADD y0, a0, y0 + LD a0, 4 * SIZE(A1) + MUL alpha2, a4, a4 + ST y4, -4 * SIZE(Y1) + + ADD y1, a1, y1 + LD a1, 5 * SIZE(A1) + MUL alpha2, a5, a5 + ST y5, -3 * SIZE(Y1) + + ADD y2, a2, y2 + LD a2, 6 * SIZE(A1) + MUL alpha2, a6, a6 + ST y6, -2 * SIZE(Y1) + + ADD y3, a3, y3 + LD a3, 7 * SIZE(A1) + MUL alpha2, a7, a7 + ST y7, -1 * SIZE(Y1) + + ADD y0, a4, y0 + LD a4, 4 * SIZE(A2) + MUL alpha3, a8, a8 + LD y4, 4 * SIZE(Y1) + + ADD y1, a5, y1 + LD a5, 5 * SIZE(A2) + MUL alpha3, a9, a9 + LD y5, 5 * SIZE(Y1) + + ADD y2, a6, y2 + LD a6, 6 * SIZE(A2) + MUL alpha3, a10, a10 + LD y6, 6 * SIZE(Y1) + + ADD y3, a7, y3 + LD a7, 7 * SIZE(A2) + MUL alpha3, a11, a11 + LD y7, 7 * SIZE(Y1) + + ADD y0, a8, y0 + LD a8, 4 * SIZE(A3) + MUL alpha4, a12, a12 + bgt I, $L12 + .align 4 + +$L13: + ADD y1, a9, y1 + LD a9, 5 * SIZE(A3) + MUL alpha4, a13, a13 + unop + + ADD y2, a10, y2 + LD a10, 6 * SIZE(A3) + MUL alpha4, a14, a14 + unop + + ADD y3, a11, y3 + LD a11, 7 * SIZE(A3) + MUL alpha4, a15, a15 + unop + + ADD y0, a12, y0 + LD a12, 4 * SIZE(A4) + MUL alpha1, a0, a0 + unop + + ADD y1, a13, y1 + LD a13, 5 * SIZE(A4) + MUL alpha1, a1, a1 + unop + + ADD y2, a14, y2 + LD a14, 6 * SIZE(A4) + MUL alpha1, a2, a2 + unop + + ADD y3, a15, y3 + LD a15, 7 * SIZE(A4) + MUL alpha1, a3, a3 + unop + + ST y0, 0 * SIZE(Y1) + ADD y4, a0, y4 + unop + MUL alpha2, a4, a4 + + ST y1, 1 * SIZE(Y1) + ADD y5, a1, y5 + unop + MUL alpha2, a5, a5 + + ST y2, 2 * SIZE(Y1) + ADD y6, a2, y6 + unop + MUL alpha2, a6, a6 + + ST y3, 3 * SIZE(Y1) + ADD y7, a3, y7 + lda Y1, 8 * SIZE(Y1) + MUL alpha2, a7, a7 + + ADD y4, a4, y4 + MUL alpha3, a8, a8 + ADD y5, a5, y5 + MUL alpha3, a9, a9 + ADD y6, a6, y6 + MUL alpha3, a10, a10 + ADD y7, a7, y7 + MUL alpha3, a11, a11 + + ADD y4, a8, y4 + MUL alpha4, a12, a12 + ADD y5, a9, y5 + MUL alpha4, a13, a13 + ADD y6, a10, y6 + MUL alpha4, a14, a14 + ADD y7, a11, y7 + MUL alpha4, a15, a15 + + ADD y4, a12, y4 + ADD y5, a13, y5 + ADD y6, a14, y6 + ADD y7, a15, y7 + + ST y4, -4 * SIZE(Y1) + lda A1, 8 * SIZE(A1) + ST y5, -3 * SIZE(Y1) + lda A2, 8 * SIZE(A2) + ST y6, -2 * SIZE(Y1) + lda A3, 8 * SIZE(A3) + ST y7, -1 * SIZE(Y1) + lda A4, 8 * SIZE(A4) + .align 4 + +$L15: + and M, 4, I + ble I, $L16 + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + LD y2, 2 * SIZE(Y1) + LD y3, 3 * SIZE(Y1) + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + LD a4, 0 * SIZE(A2) + LD a5, 1 * SIZE(A2) + LD a6, 2 * SIZE(A2) + LD a7, 3 * SIZE(A2) + + LD a8, 0 * SIZE(A3) + LD a9, 1 * SIZE(A3) + LD a10, 2 * SIZE(A3) + LD a11, 3 * SIZE(A3) + + MUL alpha1, a0, a0 + LD a12, 0 * SIZE(A4) + MUL alpha1, a1, a1 + LD a13, 1 * SIZE(A4) + MUL alpha1, a2, a2 + LD a14, 2 * SIZE(A4) + MUL alpha1, a3, a3 + LD a15, 3 * SIZE(A4) + + ADD y0, a0, y0 + MUL alpha2, a4, a4 + ADD y1, a1, y1 + MUL alpha2, a5, a5 + ADD y2, a2, y2 + MUL alpha2, a6, a6 + ADD y3, a3, y3 + MUL alpha2, a7, a7 + + ADD y0, a4, y0 + MUL alpha3, a8, a8 + ADD y1, a5, y1 + MUL alpha3, a9, a9 + ADD y2, a6, y2 + MUL alpha3, a10, a10 + ADD y3, a7, y3 + MUL alpha3, a11, a11 + + ADD y0, a8, y0 + MUL alpha4, a12, a12 + ADD y1, a9, y1 + MUL alpha4, a13, a13 + ADD y2, a10, y2 + MUL alpha4, a14, a14 + ADD y3, a11, y3 + MUL alpha4, a15, a15 + + ADD y0, a12, y0 + lda Y1, 4 * SIZE(Y1) + ADD y1, a13, y1 + unop + + ADD y2, a14, y2 + unop + ADD y3, a15, y3 + unop + + ST y0, -4 * SIZE(Y1) + lda A1, 4 * SIZE(A1) + ST y1, -3 * SIZE(Y1) + lda A2, 4 * SIZE(A2) + ST y2, -2 * SIZE(Y1) + lda A3, 4 * SIZE(A3) + ST y3, -1 * SIZE(Y1) + lda A4, 4 * SIZE(A4) + .align 4 + +$L16: + and M, 2, I + ble I, $L17 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 0 * SIZE(A2) + LD a3, 1 * SIZE(A2) + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + + LD a4, 0 * SIZE(A3) + MUL alpha1, a0, a0 + LD a5, 1 * SIZE(A3) + MUL alpha1, a1, a1 + LD a6, 0 * SIZE(A4) + MUL alpha2, a2, a2 + LD a7, 1 * SIZE(A4) + MUL alpha2, a3, a3 + + ADD y0, a0, y0 + MUL alpha3, a4, a4 + ADD y1, a1, y1 + MUL alpha3, a5, a5 + ADD y0, a2, y0 + MUL alpha4, a6, a6 + ADD y1, a3, y1 + MUL alpha4, a7, a7 + + ADD y0, a4, y0 + lda A1, 2 * SIZE(A1) + ADD y1, a5, y1 + lda A2, 2 * SIZE(A2) + ADD y0, a6, y0 + lda A3, 2 * SIZE(A3) + ADD y1, a7, y1 + lda A4, 2 * SIZE(A4) + + ST y0, 0 * SIZE(Y1) + unop + ST y1, 1 * SIZE(Y1) + lda Y1, 2 * SIZE(Y1) + .align 4 + +$L17: + blbc M, $L18 + + LD y0, 0 * SIZE(Y1) + + LD a0, 0 * SIZE(A1) + LD a1, 0 * SIZE(A2) + LD a2, 0 * SIZE(A3) + LD a3, 0 * SIZE(A4) + + MUL alpha1, a0, a0 + MUL alpha2, a1, a1 + MUL alpha3, a2, a2 + MUL alpha4, a3, a3 + + ADD y0, a0, y0 + ADD y0, a1, y0 + ADD y0, a2, y0 + ADD y0, a3, y0 + + ST y0, 0 * SIZE(Y1) + .align 4 + +$L18: + lda J, -1(J) + bgt J, $L11 + .align 4 + +$L20: + and N, 2, J + ble J, $L30 + + LD alpha1, 0 * SIZE(X) + addq X, INCX, X + LD alpha2, 0 * SIZE(X) + addq X, INCX, X + + mov A, A1 + MUL alpha, alpha1, alpha1 + addq A, LDA, A2 + MUL alpha, alpha2, alpha2 + + addq A2, LDA, A + mov Y, Y1 + + sra M, 3, I + ble I, $L25 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + LD a4, 0 * SIZE(A2) + LD a5, 1 * SIZE(A2) + LD a6, 2 * SIZE(A2) + LD a7, 3 * SIZE(A2) + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + LD y2, 2 * SIZE(Y1) + LD y3, 3 * SIZE(Y1) + + MUL alpha1, a0, a0 + LD y4, 4 * SIZE(Y1) + MUL alpha1, a1, a1 + LD y5, 5 * SIZE(Y1) + MUL alpha1, a2, a2 + LD y6, 6 * SIZE(Y1) + MUL alpha1, a3, a3 + LD y7, 7 * SIZE(Y1) + + ADD y0, a0, y0 + LD a0, 4 * SIZE(A1) + MUL alpha2, a4, a4 + + ADD y1, a1, y1 + LD a1, 5 * SIZE(A1) + MUL alpha2, a5, a5 + + ADD y2, a2, y2 + LD a2, 6 * SIZE(A1) + MUL alpha2, a6, a6 + + ADD y3, a3, y3 + LD a3, 7 * SIZE(A1) + MUL alpha2, a7, a7 + + ADD y0, a4, y0 + LD a4, 4 * SIZE(A2) + MUL alpha1, a0, a0 + + ADD y1, a5, y1 + LD a5, 5 * SIZE(A2) + MUL alpha1, a1, a1 + + ADD y2, a6, y2 + LD a6, 6 * SIZE(A2) + MUL alpha1, a2, a2 + + ADD y3, a7, y3 + LD a7, 7 * SIZE(A2) + MUL alpha1, a3, a3 + + lda I, -1(I) + ble I, $L23 + .align 4 + +$L22: + ldl $31, (PREFETCHSIZE + 0) * SIZE(A1) + lda I, -1(I) + ldl $31, (PREFETCHSIZE + 0) * SIZE(A2) + lda A2, 8 * SIZE(A2) + + ADD y4, a0, y4 + ST y0, 0 * SIZE(Y1) + MUL alpha2, a4, a4 + LD a0, 8 * SIZE(A1) + + ADD y5, a1, y5 + ST y1, 1 * SIZE(Y1) + MUL alpha2, a5, a5 + LD a1, 9 * SIZE(A1) + + ADD y6, a2, y6 + ST y2, 2 * SIZE(Y1) + MUL alpha2, a6, a6 + LD a2, 10 * SIZE(A1) + + ADD y7, a3, y7 + ST y3, 3 * SIZE(Y1) + MUL alpha2, a7, a7 + LD a3, 11 * SIZE(A1) + + ADD y4, a4, y4 + LD a4, 0 * SIZE(A2) + MUL alpha1, a0, a0 + LD y0, 8 * SIZE(Y1) + + ADD y5, a5, y5 + LD a5, 1 * SIZE(A2) + MUL alpha1, a1, a1 + LD y1, 9 * SIZE(Y1) + + ADD y6, a6, y6 + LD a6, 2 * SIZE(A2) + MUL alpha1, a2, a2 + LD y2, 10 * SIZE(Y1) + + ADD y7, a7, y7 + LD a7, 3 * SIZE(A2) + MUL alpha1, a3, a3 + LD y3, 11 * SIZE(Y1) + + ADD y0, a0, y0 + ST y4, 4 * SIZE(Y1) + MUL alpha2, a4, a4 + LD a0, 12 * SIZE(A1) + + ADD y1, a1, y1 + ST y5, 5 * SIZE(Y1) + MUL alpha2, a5, a5 + LD a1, 13 * SIZE(A1) + + ADD y2, a2, y2 + ST y6, 6 * SIZE(Y1) + MUL alpha2, a6, a6 + LD a2, 14 * SIZE(A1) + + ADD y3, a3, y3 + ST y7, 7 * SIZE(Y1) + MUL alpha2, a7, a7 + LD a3, 15 * SIZE(A1) + + ADD y0, a4, y0 + LD a4, 4 * SIZE(A2) + MUL alpha1, a0, a0 + LD y4, 12 * SIZE(Y1) + + ADD y1, a5, y1 + LD a5, 5 * SIZE(A2) + MUL alpha1, a1, a1 + LD y5, 13 * SIZE(Y1) + + ADD y2, a6, y2 + LD a6, 6 * SIZE(A2) + MUL alpha1, a2, a2 + LD y6, 14 * SIZE(Y1) + + ADD y3, a7, y3 + LD a7, 7 * SIZE(A2) + MUL alpha1, a3, a3 + LD y7, 15 * SIZE(Y1) + + lds $f31, (PREFETCHSIZE + 0) * SIZE(Y1) + lda A1, 8 * SIZE(A1) + lda Y1, 8 * SIZE(Y1) + bgt I, $L22 + .align 4 + +$L23: + ADD y4, a0, y4 + ST y0, 0 * SIZE(Y1) + MUL alpha2, a4, a4 + unop + + ADD y5, a1, y5 + ST y1, 1 * SIZE(Y1) + MUL alpha2, a5, a5 + unop + + ADD y6, a2, y6 + ST y2, 2 * SIZE(Y1) + MUL alpha2, a6, a6 + unop + + ADD y7, a3, y7 + ST y3, 3 * SIZE(Y1) + MUL alpha2, a7, a7 + unop + + ADD y4, a4, y4 + ADD y5, a5, y5 + ADD y6, a6, y6 + ADD y7, a7, y7 + + ST y4, 4 * SIZE(Y1) + lda A1, 8 * SIZE(A1) + ST y5, 5 * SIZE(Y1) + lda A2, 8 * SIZE(A2) + + ST y6, 6 * SIZE(Y1) + unop + ST y7, 7 * SIZE(Y1) + lda Y1, 8 * SIZE(Y1) + .align 4 + +$L25: + and M, 4, I + ble I, $L26 + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + LD y2, 2 * SIZE(Y1) + LD y3, 3 * SIZE(Y1) + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + MUL alpha1, a0, a0 + LD a4, 0 * SIZE(A2) + MUL alpha1, a1, a1 + LD a5, 1 * SIZE(A2) + MUL alpha1, a2, a2 + LD a6, 2 * SIZE(A2) + MUL alpha1, a3, a3 + LD a7, 3 * SIZE(A2) + + ADD y0, a0, y0 + MUL alpha2, a4, a4 + ADD y1, a1, y1 + MUL alpha2, a5, a5 + ADD y2, a2, y2 + MUL alpha2, a6, a6 + ADD y3, a3, y3 + MUL alpha2, a7, a7 + + ADD y0, a4, y0 + lda Y1, 4 * SIZE(Y1) + ADD y1, a5, y1 + unop + ADD y2, a6, y2 + unop + ADD y3, a7, y3 + unop + + ST y0, -4 * SIZE(Y1) + lda A1, 4 * SIZE(A1) + ST y1, -3 * SIZE(Y1) + lda A2, 4 * SIZE(A2) + ST y2, -2 * SIZE(Y1) + lda A3, 4 * SIZE(A3) + ST y3, -1 * SIZE(Y1) + lda A4, 4 * SIZE(A4) + .align 4 + +$L26: + and M, 2, I + ble I, $L27 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 0 * SIZE(A2) + LD a3, 1 * SIZE(A2) + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + + MUL alpha1, a0, a0 + MUL alpha1, a1, a1 + MUL alpha2, a2, a2 + MUL alpha2, a3, a3 + + ADD y0, a0, y0 + lda A1, 2 * SIZE(A1) + ADD y1, a1, y1 + lda A2, 2 * SIZE(A2) + ADD y0, a2, y0 + unop + ADD y1, a3, y1 + unop + + ST y0, 0 * SIZE(Y1) + unop + ST y1, 1 * SIZE(Y1) + lda Y1, 2 * SIZE(Y1) + .align 4 + +$L27: + blbc M, $L30 + + LD y0, 0 * SIZE(Y1) + + LD a0, 0 * SIZE(A1) + LD a1, 0 * SIZE(A2) + + MUL alpha1, a0, a0 + MUL alpha2, a1, a1 + + ADD y0, a0, y0 + ADD y0, a1, y0 + + ST y0, 0 * SIZE(Y1) + .align 4 + +$L30: + blbc N, $L990 + + LD alpha1, 0 * SIZE(X) + mov A, A1 + MUL alpha, alpha1, alpha1 + mov Y, Y1 + + sra M, 3, I + ble I, $L35 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + LD a4, 4 * SIZE(A1) + LD a5, 5 * SIZE(A1) + LD a6, 6 * SIZE(A1) + LD a7, 7 * SIZE(A1) + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + LD y2, 2 * SIZE(Y1) + LD y3, 3 * SIZE(Y1) + LD y4, 4 * SIZE(Y1) + LD y5, 5 * SIZE(Y1) + LD y6, 6 * SIZE(Y1) + LD y7, 7 * SIZE(Y1) + + MUL alpha1, a0, a0 + MUL alpha1, a1, a1 + MUL alpha1, a2, a2 + MUL alpha1, a3, a3 + + lda I, -1(I) + ble I, $L33 + .align 4 + +$L32: + ADD y0, a0, y0 + LD y4, 4 * SIZE(Y1) + MUL alpha1, a4, a4 + LD a0, 8 * SIZE(A1) + + ADD y1, a1, y1 + LD y5, 5 * SIZE(Y1) + MUL alpha1, a5, a5 + LD a1, 9 * SIZE(A1) + + ADD y2, a2, y2 + LD y6, 6 * SIZE(Y1) + MUL alpha1, a6, a6 + LD a2, 10 * SIZE(A1) + + ADD y3, a3, y3 + LD y7, 7 * SIZE(Y1) + MUL alpha1, a7, a7 + LD a3, 11 * SIZE(A1) + + ST y0, 0 * SIZE(Y1) + ST y1, 1 * SIZE(Y1) + ST y2, 2 * SIZE(Y1) + ST y3, 3 * SIZE(Y1) + + ADD y4, a4, y4 + LD y0, 8 * SIZE(Y1) + MUL alpha1, a0, a0 + LD a4, 12 * SIZE(A1) + + ADD y5, a5, y5 + LD y1, 9 * SIZE(Y1) + MUL alpha1, a1, a1 + LD a5, 13 * SIZE(A1) + + ADD y6, a6, y6 + LD y2, 10 * SIZE(Y1) + MUL alpha1, a2, a2 + LD a6, 14 * SIZE(A1) + + ADD y7, a7, y7 + LD y3, 11 * SIZE(Y1) + MUL alpha1, a3, a3 + LD a7, 15 * SIZE(A1) + + ST y4, 4 * SIZE(Y1) + lda I, -1(I) + ST y5, 5 * SIZE(Y1) + lda A1, 8 * SIZE(A1) + + ST y6, 6 * SIZE(Y1) + ldl $31, (PREFETCHSIZE + 0) * SIZE(A1) + ST y7, 7 * SIZE(Y1) + lds $f31, (PREFETCHSIZE + 0) * SIZE(Y1) + + lda Y1, 8 * SIZE(Y1) + bgt I, $L32 + .align 4 + +$L33: + ADD y0, a0, y0 + LD y4, 4 * SIZE(Y1) + MUL alpha1, a4, a4 + unop + + ADD y1, a1, y1 + LD y5, 5 * SIZE(Y1) + MUL alpha1, a5, a5 + unop + + ADD y2, a2, y2 + LD y6, 6 * SIZE(Y1) + MUL alpha1, a6, a6 + unop + + ADD y3, a3, y3 + LD y7, 7 * SIZE(Y1) + MUL alpha1, a7, a7 + unop + + ADD y4, a4, y4 + ST y0, 0 * SIZE(Y1) + ADD y5, a5, y5 + ST y1, 1 * SIZE(Y1) + ADD y6, a6, y6 + ST y2, 2 * SIZE(Y1) + ADD y7, a7, y7 + ST y3, 3 * SIZE(Y1) + + ST y4, 4 * SIZE(Y1) + unop + ST y5, 5 * SIZE(Y1) + unop + + ST y6, 6 * SIZE(Y1) + lda A1, 8 * SIZE(A1) + ST y7, 7 * SIZE(Y1) + lda Y1, 8 * SIZE(Y1) + .align 4 + +$L35: + and M, 4, I + ble I, $L36 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + MUL alpha1, a0, a0 + LD y0, 0 * SIZE(Y1) + MUL alpha1, a1, a1 + LD y1, 1 * SIZE(Y1) + MUL alpha1, a2, a2 + LD y2, 2 * SIZE(Y1) + MUL alpha1, a3, a3 + LD y3, 3 * SIZE(Y1) + + ADD y0, a0, y0 + ADD y1, a1, y1 + ADD y2, a2, y2 + ADD y3, a3, y3 + + ST y0, 0 * SIZE(Y1) + lda A1, 4 * SIZE(A1) + ST y1, 1 * SIZE(Y1) + lda A2, 4 * SIZE(A2) + ST y2, 2 * SIZE(Y1) + unop + ST y3, 3 * SIZE(Y1) + lda Y1, 4 * SIZE(Y1) + .align 4 + +$L36: + and M, 2, I + ble I, $L37 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + + LD y0, 0 * SIZE(Y1) + MUL alpha1, a0, a0 + LD y1, 1 * SIZE(Y1) + MUL alpha1, a1, a1 + + ADD y0, a0, y0 + ADD y1, a1, y1 + + ST y0, 0 * SIZE(Y1) + lda A1, 2 * SIZE(A1) + ST y1, 1 * SIZE(Y1) + lda Y1, 2 * SIZE(Y1) + .align 4 + +$L37: + blbc M, $L990 + + LD y0, 0 * SIZE(Y1) + LD a0, 0 * SIZE(A1) + + MUL alpha1, a0, a0 + + ADD y0, a0, y0 + ST y0, 0 * SIZE(Y1) + .align 4 + +$L990: + cmpeq INCY, SIZE, $0 + bne $0, $L999 + + mov BUFFER, Y1 + + sra M, 3, I + ble I, $L995 + .align 4 + +$L992: + LD a0, 0 * SIZE(BUFFER) + addq BUFFER, INCY, BUFFER + LD a1, 0 * SIZE(BUFFER) + addq BUFFER, INCY, BUFFER + LD a2, 0 * SIZE(BUFFER) + addq BUFFER, INCY, BUFFER + LD a3, 0 * SIZE(BUFFER) + addq BUFFER, INCY, BUFFER + + LD y0, 0 * SIZE(Y) + LD y1, 1 * SIZE(Y) + LD y2, 2 * SIZE(Y) + LD y3, 3 * SIZE(Y) + + LD a4, 0 * SIZE(BUFFER) + addq BUFFER, INCY, BUFFER + LD a5, 0 * SIZE(BUFFER) + addq BUFFER, INCY, BUFFER + LD a6, 0 * SIZE(BUFFER) + addq BUFFER, INCY, BUFFER + LD a7, 0 * SIZE(BUFFER) + addq BUFFER, INCY, BUFFER + + LD y4, 4 * SIZE(Y) + LD y5, 5 * SIZE(Y) + LD y6, 6 * SIZE(Y) + LD y7, 7 * SIZE(Y) + + ADD a0, y0, a0 + ADD a1, y1, a1 + ADD a2, y2, a2 + ADD a3, y3, a3 + ADD a4, y4, a4 + ADD a5, y5, a5 + ADD a6, y6, a6 + ADD a7, y7, a7 + + ST a0, 0 * SIZE(Y1) + addq Y1, INCY, Y1 + ST a1, 0 * SIZE(Y1) + addq Y1, INCY, Y1 + ST a2, 0 * SIZE(Y1) + addq Y1, INCY, Y1 + ST a3, 0 * SIZE(Y1) + addq Y1, INCY, Y1 + + ST a4, 0 * SIZE(Y1) + addq Y1, INCY, Y1 + ST a5, 0 * SIZE(Y1) + addq Y1, INCY, Y1 + ST a6, 0 * SIZE(Y1) + addq Y1, INCY, Y1 + ST a7, 0 * SIZE(Y1) + addq Y1, INCY, Y1 + + lda I, -1(I) + lda Y, 8 * SIZE(Y) + bgt I, $L992 + .align 4 + +$L995: + and M, 7, I + ble I, $L999 + .align 4 + +$L996: + LD a0, 0 * SIZE(BUFFER) + addq BUFFER, INCY, BUFFER + + LD y0, 0 * SIZE(Y) + lda Y, 1 * SIZE(Y) + + ADD a0, y0, a0 + + ST a0, 0 * SIZE(Y1) + addq Y1, INCY, Y1 + + lda I, -1(I) + bgt I, $L996 + .align 4 + +$L999: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + ldt $f9, 56($sp) + + lda $sp, STACKSIZE($sp) + ret + EPILOGUE diff --git a/kernel/alpha/gemv_t.S b/kernel/alpha/gemv_t.S new file mode 100644 index 0000000..ea95546 --- /dev/null +++ b/kernel/alpha/gemv_t.S @@ -0,0 +1,1061 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define STACKSIZE 64 +#define PREFETCHSIZE 32 + +#define M $16 +#define N $17 +#define A $20 +#define LDA $21 + +#define X $18 +#define INCX $19 +#define Y $22 +#define INCY $23 + +#define BUFFER $24 + +#define I $25 +#define J $27 + +#define X1 $3 +#define Y1 $4 + +#define A1 $5 +#define A2 $6 +#define A3 $7 +#define A4 $8 + +#define alpha $f19 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define t0 $f12 +#define t1 $f13 +#define t2 $f14 +#define t3 $f15 + +#define x0 $f16 +#define x1 $f17 +#define x2 $f18 +#define x3 $f21 + +#define a0 $f22 +#define a1 $f23 +#define a2 $f24 +#define a3 $f25 +#define a4 $f26 +#define a5 $f27 +#define a6 $f28 +#define a7 $f29 + +#define a8 $f2 +#define a9 $f3 +#define a10 $f4 +#define a11 $f5 +#define a12 $f6 +#define a13 $f7 +#define a14 $f8 +#define a15 $f9 + + PROLOGUE + + lda $sp, -STACKSIZE($sp) + ldq X, 0 + STACKSIZE($sp) + ldq INCX, 8 + STACKSIZE($sp) + ldq Y, 16 + STACKSIZE($sp) + ldq INCY, 24 + STACKSIZE($sp) + ldq BUFFER, 32 + STACKSIZE($sp) + + stt $f2, 0($sp) + stt $f3, 8($sp) + stt $f4, 16($sp) + stt $f5, 24($sp) + stt $f6, 32($sp) + stt $f7, 40($sp) + stt $f8, 48($sp) + stt $f9, 56($sp) + + PROFCODE + + cmple M, 0, $0 + SXADDQ INCX, 0, INCX + cmple N, 0, $1 + SXADDQ INCY, 0, INCY + + or $0, $1, $0 + bne $0, $L999 + + cmpeq INCX, SIZE, $0 + mov X, X1 + SXADDQ LDA, 0, LDA + bne $0, $L10 + + sra M, 3, I + mov BUFFER, Y1 + mov BUFFER, X + ble I, $L05 + .align 4 + +$L02: + ldl $31, (PREFETCHSIZE + 0) * SIZE(X1) + lda I, -1(I) + + LD a0, 0 * SIZE(X1) + addq X1, INCX, X1 + LD a1, 0 * SIZE(X1) + addq X1, INCX, X1 + LD a2, 0 * SIZE(X1) + addq X1, INCX, X1 + LD a3, 0 * SIZE(X1) + addq X1, INCX, X1 + + ST a0, 0 * SIZE(Y1) + ST a1, 1 * SIZE(Y1) + ST a2, 2 * SIZE(Y1) + ST a3, 3 * SIZE(Y1) + + LD a4, 0 * SIZE(X1) + addq X1, INCX, X1 + LD a5, 0 * SIZE(X1) + addq X1, INCX, X1 + LD a6, 0 * SIZE(X1) + addq X1, INCX, X1 + LD a7, 0 * SIZE(X1) + addq X1, INCX, X1 + + ST a4, 4 * SIZE(Y1) + ST a5, 5 * SIZE(Y1) + ST a6, 6 * SIZE(Y1) + ST a7, 7 * SIZE(Y1) + + lda Y1, 8 * SIZE(Y1) + bgt I, $L02 + .align 4 + +$L05: + and M, 7, I + ble I, $L10 + .align 4 + +$L06: + LD a0, 0 * SIZE(X1) + addq X1, INCX, X1 + ST a0, 0 * SIZE(Y1) + addq Y1, SIZE, Y1 + + lda I, -1(I) + bgt I, $L06 + .align 4 + +$L10: + mov Y, Y1 + fclr t0 + unop + fclr t1 + + sra N, 2, J + fclr t2 + fclr t3 + ble J, $L20 + .align 4 + +$L11: + mov A, A1 + fclr s0 + addq A, LDA, A2 + fclr s1 + + addq A2, LDA, A3 + fclr s2 + addq A3, LDA, A4 + fclr s3 + + s4addq LDA, A, A + unop + mov X, X1 + lds $f31, 3 * SIZE(Y) + + sra M, 3, I + ble I, $L15 + + LD x0, 0 * SIZE(X1) + LD x1, 1 * SIZE(X1) + LD x2, 2 * SIZE(X1) + + LD a0, 0 * SIZE(A1) + LD a1, 0 * SIZE(A2) + LD a2, 0 * SIZE(A3) + LD a3, 0 * SIZE(A4) + LD a4, 1 * SIZE(A1) + LD a5, 1 * SIZE(A2) + LD a6, 1 * SIZE(A3) + LD a7, 1 * SIZE(A4) + LD a8, 2 * SIZE(A1) + LD a9, 2 * SIZE(A2) + LD a10, 2 * SIZE(A3) + LD a11, 2 * SIZE(A4) + LD a12, 3 * SIZE(A1) + LD a13, 3 * SIZE(A2) + LD a14, 3 * SIZE(A3) + LD a15, 3 * SIZE(A4) + + lda I, -1(I) + ble I, $L13 + .align 4 + +$L12: + ADD s0, t0, s0 + LD x3, 3 * SIZE(X1) + MUL x0, a0, t0 + LD a0, 4 * SIZE(A1) + + ADD s1, t1, s1 + ldl $31, (PREFETCHSIZE + 0) * SIZE(A1) + MUL x0, a1, t1 + LD a1, 4 * SIZE(A2) + + ADD s2, t2, s2 + unop + MUL x0, a2, t2 + LD a2, 4 * SIZE(A3) + + ADD s3, t3, s3 + unop + MUL x0, a3, t3 + LD a3, 4 * SIZE(A4) + + ADD s0, t0, s0 + LD x0, 4 * SIZE(X1) + MUL x1, a4, t0 + LD a4, 5 * SIZE(A1) + + ADD s1, t1, s1 + lda A1, 8 * SIZE(A1) + MUL x1, a5, t1 + LD a5, 5 * SIZE(A2) + + ADD s2, t2, s2 + unop + MUL x1, a6, t2 + LD a6, 5 * SIZE(A3) + + ADD s3, t3, s3 + unop + MUL x1, a7, t3 + LD a7, 5 * SIZE(A4) + + ADD s0, t0, s0 + LD x1, 5 * SIZE(X1) + MUL x2, a8, t0 + LD a8, -2 * SIZE(A1) + + ADD s1, t1, s1 + ldl $31, (PREFETCHSIZE + 0) * SIZE(A2) + MUL x2, a9, t1 + LD a9, 6 * SIZE(A2) + + ADD s2, t2, s2 + lda A2, 8 * SIZE(A2) + MUL x2, a10, t2 + LD a10, 6 * SIZE(A3) + + ADD s3, t3, s3 + lda A3, 8 * SIZE(A3) + MUL x2, a11, t3 + LD a11, 6 * SIZE(A4) + + ADD s0, t0, s0 + LD x2, 6 * SIZE(X1) + MUL x3, a12, t0 + LD a12, -1 * SIZE(A1) + + ADD s1, t1, s1 + lda A4, 8 * SIZE(A4) + MUL x3, a13, t1 + LD a13, -1 * SIZE(A2) + + ADD s2, t2, s2 + unop + MUL x3, a14, t2 + LD a14, -1 * SIZE(A3) + + ADD s3, t3, s3 + unop + MUL x3, a15, t3 + LD a15, -1 * SIZE(A4) + + ADD s0, t0, s0 + LD x3, 7 * SIZE(X1) + MUL x0, a0, t0 + LD a0, 0 * SIZE(A1) + + ADD s1, t1, s1 + ldl $31, (PREFETCHSIZE - 8) * SIZE(A3) + MUL x0, a1, t1 + LD a1, 0 * SIZE(A2) + + ADD s2, t2, s2 + unop + MUL x0, a2, t2 + LD a2, 0 * SIZE(A3) + + ADD s3, t3, s3 + unop + MUL x0, a3, t3 + LD a3, 0 * SIZE(A4) + + ADD s0, t0, s0 + LD x0, 8 * SIZE(X1) + MUL x1, a4, t0 + LD a4, 1 * SIZE(A1) + + ADD s1, t1, s1 + unop + MUL x1, a5, t1 + LD a5, 1 * SIZE(A2) + + ADD s2, t2, s2 + unop + MUL x1, a6, t2 + LD a6, 1 * SIZE(A3) + + ADD s3, t3, s3 + unop + MUL x1, a7, t3 + LD a7, 1 * SIZE(A4) + + ADD s0, t0, s0 + LD x1, 9 * SIZE(X1) + MUL x2, a8, t0 + LD a8, 2 * SIZE(A1) + + ADD s1, t1, s1 + ldl $31, (PREFETCHSIZE - 8) * SIZE(A4) + MUL x2, a9, t1 + LD a9, 2 * SIZE(A2) + + ADD s2, t2, s2 + lda X1, 8 * SIZE(X1) + MUL x2, a10, t2 + LD a10, 2 * SIZE(A3) + + ADD s3, t3, s3 + lda I, -1(I) + MUL x2, a11, t3 + LD a11, 2 * SIZE(A4) + + ADD s0, t0, s0 + LD x2, 2 * SIZE(X1) + MUL x3, a12, t0 + LD a12, 3 * SIZE(A1) + + ADD s1, t1, s1 + ldl $31, (PREFETCHSIZE - 8) * SIZE(X1) + MUL x3, a13, t1 + LD a13, 3 * SIZE(A2) + + ADD s2, t2, s2 + unop + MUL x3, a14, t2 + LD a14, 3 * SIZE(A3) + + ADD s3, t3, s3 + MUL x3, a15, t3 + LD a15, 3 * SIZE(A4) + bgt I, $L12 + .align 4 + +$L13: + ADD s0, t0, s0 + LD x3, 3 * SIZE(X1) + MUL x0, a0, t0 + LD a0, 4 * SIZE(A1) + + ADD s1, t1, s1 + unop + MUL x0, a1, t1 + LD a1, 4 * SIZE(A2) + + ADD s2, t2, s2 + unop + MUL x0, a2, t2 + LD a2, 4 * SIZE(A3) + + ADD s3, t3, s3 + unop + MUL x0, a3, t3 + LD a3, 4 * SIZE(A4) + + ADD s0, t0, s0 + LD x0, 4 * SIZE(X1) + MUL x1, a4, t0 + LD a4, 5 * SIZE(A1) + + ADD s1, t1, s1 + unop + MUL x1, a5, t1 + LD a5, 5 * SIZE(A2) + + ADD s2, t2, s2 + unop + MUL x1, a6, t2 + LD a6, 5 * SIZE(A3) + + ADD s3, t3, s3 + unop + MUL x1, a7, t3 + LD a7, 5 * SIZE(A4) + + ADD s0, t0, s0 + LD x1, 5 * SIZE(X1) + MUL x2, a8, t0 + LD a8, 6 * SIZE(A1) + + ADD s1, t1, s1 + unop + MUL x2, a9, t1 + LD a9, 6 * SIZE(A2) + + ADD s2, t2, s2 + unop + MUL x2, a10, t2 + LD a10, 6 * SIZE(A3) + + ADD s3, t3, s3 + unop + MUL x2, a11, t3 + LD a11, 6 * SIZE(A4) + + ADD s0, t0, s0 + LD x2, 6 * SIZE(X1) + MUL x3, a12, t0 + LD a12, 7 * SIZE(A1) + + ADD s1, t1, s1 + lda A1, 8 * SIZE(A1) + MUL x3, a13, t1 + LD a13, 7 * SIZE(A2) + + ADD s2, t2, s2 + lda A2, 8 * SIZE(A2) + MUL x3, a14, t2 + LD a14, 7 * SIZE(A3) + + ADD s3, t3, s3 + lda A3, 8 * SIZE(A3) + MUL x3, a15, t3 + LD a15, 7 * SIZE(A4) + + ADD s0, t0, s0 + LD x3, 7 * SIZE(X1) + MUL x0, a0, t0 + unop + + ADD s1, t1, s1 + lda X1, 8 * SIZE(X1) + MUL x0, a1, t1 + lda A4, 8 * SIZE(A4) + + ADD s2, t2, s2 + MUL x0, a2, t2 + ADD s3, t3, s3 + MUL x0, a3, t3 + + ADD s0, t0, s0 + MUL x1, a4, t0 + ADD s1, t1, s1 + MUL x1, a5, t1 + + ADD s2, t2, s2 + MUL x1, a6, t2 + ADD s3, t3, s3 + MUL x1, a7, t3 + + ADD s0, t0, s0 + MUL x2, a8, t0 + ADD s1, t1, s1 + MUL x2, a9, t1 + + ADD s2, t2, s2 + MUL x2, a10, t2 + ADD s3, t3, s3 + MUL x2, a11, t3 + + ADD s0, t0, s0 + MUL x3, a12, t0 + ADD s1, t1, s1 + MUL x3, a13, t1 + + ADD s2, t2, s2 + MUL x3, a14, t2 + ADD s3, t3, s3 + MUL x3, a15, t3 + .align 4 + +$L15: + and M, 7, I + ble I, $L18 + + LD x0, 0 * SIZE(X1) + + LD a0, 0 * SIZE(A1) + LD a1, 0 * SIZE(A2) + LD a2, 0 * SIZE(A3) + LD a3, 0 * SIZE(A4) + + lda I, -1(I) + ble I, $L17 + .align 4 + +$L16: + ADD s0, t0, s0 + lda A4, 1 * SIZE(A4) + MUL x0, a0, t0 + LD a0, 1 * SIZE(A1) + + ADD s1, t1, s1 + lda A1, 1 * SIZE(A1) + MUL x0, a1, t1 + LD a1, 1 * SIZE(A2) + + ADD s2, t2, s2 + lda A2, 1 * SIZE(A2) + MUL x0, a2, t2 + LD a2, 1 * SIZE(A3) + + ADD s3, t3, s3 + lda A3, 1 * SIZE(A3) + MUL x0, a3, t3 + LD a3, 0 * SIZE(A4) + + LD x0, 1 * SIZE(X1) + lda X1, 1 * SIZE(X1) + lda I, -1(I) + bgt I, $L16 + .align 4 + +$L17: + ADD s0, t0, s0 + MUL x0, a0, t0 + ADD s1, t1, s1 + MUL x0, a1, t1 + + ADD s2, t2, s2 + MUL x0, a2, t2 + ADD s3, t3, s3 + MUL x0, a3, t3 + .align 4 + +$L18: + LD a0, 0 * SIZE(Y) + addq Y, INCY, Y + LD a1, 0 * SIZE(Y) + addq Y, INCY, Y + LD a2, 0 * SIZE(Y) + addq Y, INCY, Y + LD a3, 0 * SIZE(Y) + addq Y, INCY, Y + + ADD s0, t0, s0 + ADD s1, t1, s1 + ADD s2, t2, s2 + ADD s3, t3, s3 + + MUL alpha, s0, s0 + MUL alpha, s1, s1 + MUL alpha, s2, s2 + MUL alpha, s3, s3 + + ADD a0, s0, a0 + fclr t0 + ADD a1, s1, a1 + fclr t1 + ADD a2, s2, a2 + fclr t2 + ADD a3, s3, a3 + fclr t3 + + ST a0, 0 * SIZE(Y1) + addq Y1, INCY, Y1 + ST a1, 0 * SIZE(Y1) + addq Y1, INCY, Y1 + ST a2, 0 * SIZE(Y1) + addq Y1, INCY, Y1 + ST a3, 0 * SIZE(Y1) + addq Y1, INCY, Y1 + + lda J, -1(J) + bgt J, $L11 + .align 4 + +$L20: + and N, 2, J + ble J, $L30 + mov A, A1 + addq A, LDA, A2 + + addq A2, LDA, A + fclr s0 + mov X, X1 + fclr s1 + + sra M, 3, I + fclr s2 + fclr s3 + ble I, $L25 + + LD a0, 0 * SIZE(A1) + LD a1, 0 * SIZE(A2) + LD a2, 1 * SIZE(A1) + LD a3, 1 * SIZE(A2) + LD a4, 2 * SIZE(A1) + LD a5, 2 * SIZE(A2) + LD a6, 3 * SIZE(A1) + LD a7, 3 * SIZE(A2) + + LD a8, 4 * SIZE(A1) + LD a9, 4 * SIZE(A2) + LD a10, 5 * SIZE(A1) + LD a11, 5 * SIZE(A2) + LD a12, 6 * SIZE(A1) + LD a13, 6 * SIZE(A2) + LD a14, 7 * SIZE(A1) + LD a15, 7 * SIZE(A2) + + LD x0, 0 * SIZE(X1) + LD x1, 1 * SIZE(X1) + LD x2, 2 * SIZE(X1) + + lda I, -1(I) + ble I, $L23 + .align 4 + +$L22: + ADD s0, t0, s0 + LD x3, 3 * SIZE(X1) + MUL x0, a0, t0 + LD a0, 8 * SIZE(A1) + + ADD s1, t1, s1 + ldl $31, (PREFETCHSIZE + 0) * SIZE(A1) + MUL x0, a1, t1 + LD a1, 8 * SIZE(A2) + + ADD s0, t2, s0 + LD x0, 4 * SIZE(X1) + MUL x1, a2, t2 + LD a2, 9 * SIZE(A1) + + ADD s1, t3, s1 + unop + MUL x1, a3, t3 + LD a3, 9 * SIZE(A2) + + ADD s0, t0, s0 + LD x1, 5 * SIZE(X1) + MUL x2, a4, t0 + LD a4, 10 * SIZE(A1) + + ADD s1, t1, s1 + lda I, -1(I) + MUL x2, a5, t1 + LD a5, 10 * SIZE(A2) + + ADD s0, t2, s0 + LD x2, 6 * SIZE(X1) + MUL x3, a6, t2 + LD a6, 11 * SIZE(A1) + + ADD s1, t3, s1 + lda X1, 8 * SIZE(X1) + MUL x3, a7, t3 + LD a7, 11 * SIZE(A2) + + ADD s0, t0, s0 + LD x3, -1 * SIZE(X1) + MUL x0, a8, t0 + LD a8, 12 * SIZE(A1) + + ADD s1, t1, s1 + ldl $31, (PREFETCHSIZE + 0) * SIZE(A2) + MUL x0, a9, t1 + LD a9, 12 * SIZE(A2) + + ADD s0, t0, s0 + LD x0, 0 * SIZE(X1) + MUL x1, a10, t0 + LD a10, 13 * SIZE(A1) + + ADD s1, t1, s1 + lda A1, 8 * SIZE(A1) + MUL x1, a11, t1 + LD a11, 13 * SIZE(A2) + + ADD s0, t0, s0 + LD x1, 1 * SIZE(X1) + MUL x2, a12, t0 + LD a12, 6 * SIZE(A1) + + ADD s1, t1, s1 + MUL x2, a13, t1 + LD a13, 14 * SIZE(A2) + lda A2, 8 * SIZE(A2) + + ADD s0, t0, s0 + LD x2, 2 * SIZE(X1) + MUL x3, a14, t0 + LD a14, 7 * SIZE(A1) + + ADD s1, t1, s1 + MUL x3, a15, t1 + LD a15, 7 * SIZE(A2) + bgt I, $L22 + .align 4 + +$L23: + ADD s0, t0, s0 + LD x3, 3 * SIZE(X1) + MUL x0, a0, t0 + lda A1, 8 * SIZE(A1) + + ADD s1, t1, s1 + unop + MUL x0, a1, t1 + unop + + ADD s0, t2, s0 + LD x0, 4 * SIZE(X1) + MUL x1, a2, t2 + lda A2, 8 * SIZE(A2) + + ADD s1, t3, s1 + unop + MUL x1, a3, t3 + unop + + ADD s0, t0, s0 + LD x1, 5 * SIZE(X1) + MUL x2, a4, t0 + unop + + ADD s1, t1, s1 + unop + MUL x2, a5, t1 + unop + + ADD s0, t2, s0 + LD x2, 6 * SIZE(X1) + MUL x3, a6, t2 + unop + + ADD s1, t3, s1 + unop + MUL x3, a7, t3 + unop + + ADD s0, t0, s0 + LD x3, 7 * SIZE(X1) + MUL x0, a8, t0 + lda X1, 8 * SIZE(X1) + + ADD s1, t1, s1 + unop + MUL x0, a9, t1 + unop + + ADD s0, t0, s0 + MUL x1, a10, t0 + ADD s1, t1, s1 + MUL x1, a11, t1 + + ADD s0, t0, s0 + MUL x2, a12, t0 + ADD s1, t1, s1 + MUL x2, a13, t1 + + ADD s0, t0, s0 + MUL x3, a14, t0 + ADD s1, t1, s1 + MUL x3, a15, t1 + .align 4 + +$L25: + and M, 7, I + ble I, $L28 + + LD a0, 0 * SIZE(A1) + LD a1, 0 * SIZE(A2) + LD x0, 0 * SIZE(X1) + + lda I, -1(I) + ble I, $L27 + .align 4 + +$L26: + ADD s0, t0, s0 + lda A2, 1 * SIZE(A2) + MUL x0, a0, t0 + LD a0, 1 * SIZE(A1) + + ADD s1, t1, s1 + lda A1, 1 * SIZE(A1) + MUL x0, a1, t1 + LD a1, 0 * SIZE(A2) + + LD x0, 1 * SIZE(X1) + lda X1, 1 * SIZE(X1) + lda I, -1(I) + bgt I, $L26 + .align 4 + +$L27: + ADD s0, t0, s0 + MUL x0, a0, t0 + ADD s1, t1, s1 + MUL x0, a1, t1 + .align 4 + +$L28: + LD a0, 0 * SIZE(Y) + addq Y, INCY, Y + LD a1, 0 * SIZE(Y) + addq Y, INCY, Y + + ADD s0, t0, s0 + ADD s1, t1, s1 + ADD s2, t2, s2 + ADD s3, t3, s3 + + ADD s0, s2, s0 + ADD s1, s3, s1 + + MUL alpha, s0, s0 + MUL alpha, s1, s1 + + ADD a0, s0, a0 + ADD a1, s1, a1 + + ST a0, 0 * SIZE(Y1) + fclr t0 + addq Y1, INCY, Y1 + fclr t1 + + ST a1, 0 * SIZE(Y1) + fclr t2 + addq Y1, INCY, Y1 + fclr t3 + .align 4 + +$L30: + blbc N, $L999 + + mov A, A1 + fclr s0 + mov X, X1 + fclr s1 + + sra M, 3, I + fclr s2 + fclr s3 + ble I, $L35 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a8, 0 * SIZE(X1) + LD a9, 1 * SIZE(X1) + + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + LD a10, 2 * SIZE(X1) + LD a11, 3 * SIZE(X1) + + LD a4, 4 * SIZE(A1) + LD a5, 5 * SIZE(A1) + LD a12, 4 * SIZE(X1) + LD a13, 5 * SIZE(X1) + + LD a6, 6 * SIZE(A1) + LD a7, 7 * SIZE(A1) + LD a14, 6 * SIZE(X1) + + lda I, -1(I) + ble I, $L33 + .align 4 + +$L32: + ADD s0, t0, s0 + LD a15, 7 * SIZE(X1) + MUL a0, a8, t0 + LD a0, 8 * SIZE(A1) + + ADD s1, t1, s1 + LD a8, 8 * SIZE(X1) + MUL a1, a9, t1 + LD a1, 9 * SIZE(A1) + + ADD s2, t2, s2 + LD a9, 9 * SIZE(X1) + MUL a2, a10, t2 + LD a2, 10 * SIZE(A1) + + ADD s3, t3, s3 + LD a10, 10 * SIZE(X1) + MUL a3, a11, t3 + LD a3, 11 * SIZE(A1) + + ADD s0, t0, s0 + LD a11, 11 * SIZE(X1) + MUL a4, a12, t0 + LD a4, 12 * SIZE(A1) + + ADD s1, t1, s1 + LD a12, 12 * SIZE(X1) + MUL a5, a13, t1 + LD a5, 13 * SIZE(A1) + + ADD s2, t2, s2 + LD a13, 13 * SIZE(X1) + MUL a6, a14, t2 + LD a6, 14 * SIZE(A1) + + ADD s3, t3, s3 + LD a14, 14 * SIZE(X1) + MUL a7, a15, t3 + LD a7, 15 * SIZE(A1) + + lda A1, 8 * SIZE(A1) + lda I, -1(I) + lda X1, 8 * SIZE(X1) + bgt I, $L32 + .align 4 + +$L33: + ADD s0, t0, s0 + LD a15, 7 * SIZE(X1) + MUL a0, a8, t0 + lda A1, 8 * SIZE(A1) + + ADD s1, t1, s1 + unop + MUL a1, a9, t1 + lda X1, 8 * SIZE(X1) + + ADD s2, t2, s2 + MUL a2, a10, t2 + ADD s3, t3, s3 + MUL a3, a11, t3 + + ADD s0, t0, s0 + MUL a4, a12, t0 + ADD s1, t1, s1 + MUL a5, a13, t1 + + ADD s2, t2, s2 + MUL a6, a14, t2 + ADD s3, t3, s3 + MUL a7, a15, t3 + .align 4 + +$L35: + and M, 7, I + ble I, $L38 + + LD a0, 0 * SIZE(A1) + LD x0, 0 * SIZE(X1) + + lda I, -1(I) + ble I, $L37 + .align 4 + +$L36: + ADD s0, t0, s0 + MUL x0, a0, t0 + LD a0, 1 * SIZE(A1) + LD x0, 1 * SIZE(X1) + + lda A1, 1 * SIZE(A1) + lda X1, 1 * SIZE(X1) + lda I, -1(I) + bgt I, $L36 + .align 4 + +$L37: + ADD s0, t0, s0 + MUL x0, a0, t0 + .align 4 + +$L38: + LD a0, 0 * SIZE(Y) + + ADD s0, t0, s0 + ADD s1, t1, s1 + ADD s2, t2, s2 + ADD s3, t3, s3 + + ADD s0, s2, s0 + ADD s1, s3, s1 + ADD s0, s1, s0 + + MUL alpha, s0, s0 + ADD a0, s0, a0 + + ST a0, 0 * SIZE(Y1) + .align 4 + +$L999: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + ldt $f9, 56($sp) + + lda $sp, STACKSIZE($sp) + ret + EPILOGUE diff --git a/kernel/alpha/iamax.S b/kernel/alpha/iamax.S new file mode 100644 index 0000000..cb87632 --- /dev/null +++ b/kernel/alpha/iamax.S @@ -0,0 +1,440 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#ifndef USE_MIN +#define CMPLT(a, b) cmptlt a, b +#else +#define CMPLT(a, b) cmptlt b, a +#endif + +#define STACKSIZE 6 * 8 + + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + +#ifdef F_INTERFACE + ldl N, 0(N) # n + ldl INCX, 0(INCX) # incx +#endif + lda $sp, -STACKSIZE($sp) + mov X, XX + .align 4 + + stt $f2, 0($sp) + fclr $f16 + cmplt $31, N, $2 + unop + + stt $f3, 8($sp) + fclr $f17 + cmplt $31, INCX, $3 + unop + + stt $f4, 16($sp) + fclr $f18 + SXADDQ INCX, $31, INCX + unop + + stt $f5, 24($sp) + fclr $f19 + and $2, $3, $2 + clr $0 + + stt $f6, 32($sp) + fclr $f0 + sra N, 3, $1 + beq $2, $End # if (n <= 0) or (incx <= 0) return + .align 4 + + LD $f20, 0 * SIZE(X) + unop + fabs $f20, $f0 + ble $1, $L15 + .align 4 + + fabs $f20, $f1 + unop + addq X, INCX, X + unop + + LD $f21, 0 * SIZE(X) + fabs $f20, $f2 + addq X, INCX, X + unop + + LD $f22, 0 * SIZE(X) + fabs $f20, $f3 + addq X, INCX, X + unop + + LD $f23, 0 * SIZE(X) + fabs $f20, $f4 + addq X, INCX, X + unop + + LD $f24, 0 * SIZE(X) + addq X, INCX, X + fabs $f20, $f5 + unop + + LD $f25, 0 * SIZE(X) + fabs $f20, $f6 + addq X, INCX, X + unop + + LD $f26, 0 * SIZE(X) + fabs $f20, $f28 + addq X, INCX, X + lda $1, -1($1) + + LD $f27, 0 * SIZE(X) + unop + addq X, INCX, X + ble $1, $L13 + .align 4 + +$L12: + fcmovne $f16, $f12, $f4 + unop + fabs $f20, $f29 + ldl $31, 56 * SIZE(X) + + fcmovne $f17, $f13, $f5 + LD $f20, 0 * SIZE(X) + fabs $f21, $f30 + addq X, INCX, X + + fcmovne $f18, $f14, $f6 + LD $f21, 0 * SIZE(X) + fabs $f22, $f10 + addq X, INCX, X + + fcmovne $f19, $f15, $f28 + LD $f22, 0 * SIZE(X) + fabs $f23, $f11 + addq X, INCX, X + + fabs $f24, $f12 + LD $f23, 0 * SIZE(X) + CMPLT($f0, $f29), $f16 + addq X, INCX, X + + fabs $f25, $f13 + LD $f24, 0 * SIZE(X) + CMPLT($f1, $f30), $f17 + addq X, INCX, X + + fabs $f26, $f14 + LD $f25, 0 * SIZE(X) + CMPLT($f2, $f10), $f18 + addq X, INCX, X + + fabs $f27, $f15 + LD $f26, 0 * SIZE(X) + CMPLT($f3, $f11), $f19 + addq X, INCX, X + + fcmovne $f16, $f29, $f0 + LD $f27, 0 * SIZE(X) + CMPLT($f4, $f12), $f16 + addq X, INCX, X + + fcmovne $f17, $f30, $f1 + unop + CMPLT($f5, $f13), $f17 + lda $1, -1($1) # i -- + + fcmovne $f18, $f10, $f2 + unop + CMPLT($f6, $f14), $f18 + unop + + fcmovne $f19, $f11, $f3 + unop + CMPLT($f28, $f15), $f19 + bgt $1,$L12 + .align 4 + +$L13: + fcmovne $f16, $f12, $f4 + fabs $f20, $f29 + fcmovne $f17, $f13, $f5 + fabs $f21, $f30 + + fcmovne $f18, $f14, $f6 + fabs $f22, $f10 + fcmovne $f19, $f15, $f28 + fabs $f23, $f11 + + fabs $f24, $f12 + CMPLT($f0, $f29), $f16 + fabs $f25, $f13 + CMPLT($f1, $f30), $f17 + + fabs $f26, $f14 + CMPLT($f2, $f10), $f18 + fabs $f27, $f15 + CMPLT($f3, $f11), $f19 + + fcmovne $f16, $f29, $f0 + CMPLT($f4, $f12), $f16 + fcmovne $f17, $f30, $f1 + CMPLT($f5, $f13), $f17 + + fcmovne $f18, $f10, $f2 + CMPLT($f6, $f14), $f18 + fcmovne $f19, $f11, $f3 + CMPLT($f28, $f15), $f19 + + fcmovne $f16, $f12, $f4 + CMPLT($f0, $f1), $f16 + fcmovne $f17, $f13, $f5 + CMPLT($f2, $f3), $f17 + + fcmovne $f18, $f14, $f6 + CMPLT($f4, $f5), $f18 + fcmovne $f19, $f15, $f28 + CMPLT($f6, $f28), $f19 + + fcmovne $f16, $f1, $f0 + fcmovne $f17, $f3, $f2 + fcmovne $f18, $f5, $f4 + fcmovne $f19, $f28, $f6 + + CMPLT($f0, $f2), $f16 + CMPLT($f4, $f6), $f17 + + fcmovne $f16, $f2, $f0 + fcmovne $f17, $f6, $f4 + + CMPLT($f0, $f4), $f16 + fcmovne $f16, $f4, $f0 + .align 4 + +$L15: + and N, 7, $1 + unop + unop + ble $1, $L20 + .align 4 + +$L16: + LD $f20, 0 * SIZE(X) + addq X, INCX, X + + fabs $f20, $f29 + CMPLT($f0, $f29), $f16 + fcmovne $f16, $f29, $f0 + + lda $1, -1($1) # i -- + bgt $1, $L16 + .align 4 + +$L20: + sra N, 3, $1 + ble $1, $L40 + .align 4 + + LD $f10, 0 * SIZE(XX) + addq XX, INCX, XX + LD $f11, 0 * SIZE(XX) + addq XX, INCX, XX + + LD $f12, 0 * SIZE(XX) + addq XX, INCX, XX + LD $f13, 0 * SIZE(XX) + addq XX, INCX, XX + + LD $f14, 0 * SIZE(XX) + addq XX, INCX, XX + LD $f15, 0 * SIZE(XX) + addq XX, INCX, XX + + LD $f16, 0 * SIZE(XX) + addq XX, INCX, XX + LD $f17, 0 * SIZE(XX) + addq XX, INCX, XX + + fabs $f10, $f18 + fabs $f11, $f19 + fabs $f12, $f20 + fabs $f13, $f21 + + lda $1, -1($1) + ble $1, $L23 + .align 4 + +$L22: + LD $f10, 0 * SIZE(XX) + fabs $f14, $f22 + addq XX, INCX, XX + cmpteq $f0, $f18, $f2 + + LD $f11, 0 * SIZE(XX) + fabs $f15, $f23 + addq XX, INCX, XX + cmpteq $f0, $f19, $f3 + + LD $f12, 0 * SIZE(XX) + fabs $f16, $f24 + addq XX, INCX, XX + cmpteq $f0, $f20, $f4 + + LD $f13, 0 * SIZE(XX) + fabs $f17, $f25 + addq XX, INCX, XX + cmpteq $f0, $f21, $f5 + + LD $f14, 0 * SIZE(XX) + lda $1, -1($1) # i -- + cmpteq $f0, $f22, $f26 + addq XX, INCX, XX + + lda $0, 1($0) + fbne $f2, $End + + LD $f15, 0 * SIZE(XX) + cmpteq $f0, $f23, $f27 + lda $0, 1($0) + fbne $f3, $End + + addq XX, INCX, XX + cmpteq $f0, $f24, $f28 + lda $0, 1($0) + fbne $f4, $End + + LD $f16, 0 * SIZE(XX) + cmpteq $f0, $f25, $f29 + lda $0, 1($0) + fbne $f5, $End + + addq XX, INCX, XX + lda $0, 1($0) + fabs $f10, $f18 + fbne $f26, $End + + LD $f17, 0 * SIZE(XX) + lda $0, 1($0) + fabs $f11, $f19 + fbne $f27, $End + + addq XX, INCX, XX + lda $0, 1($0) + fabs $f12, $f20 + fbne $f28, $End + + lda $0, 1($0) + fabs $f13, $f21 + fbne $f29, $End + bgt $1, $L22 + .align 4 + +$L23: + fabs $f14, $f22 + cmpteq $f0, $f18, $f2 + fabs $f15, $f23 + cmpteq $f0, $f19, $f3 + + fabs $f16, $f24 + cmpteq $f0, $f20, $f4 + fabs $f17, $f25 + cmpteq $f0, $f21, $f5 + + cmpteq $f0, $f22, $f26 + lda $0, 1($0) + unop + fbne $f2, $End + + cmpteq $f0, $f23, $f27 + lda $0, 1($0) + unop + fbne $f3, $End + + cmpteq $f0, $f24, $f28 + lda $0, 1($0) + unop + fbne $f4, $End + + cmpteq $f0, $f25, $f29 + lda $0, 1($0) + unop + fbne $f5, $End + + lda $0, 1($0) + fbne $f26, $End + lda $0, 1($0) + fbne $f27, $End + lda $0, 1($0) + fbne $f28, $End + lda $0, 1($0) + fbne $f29, $End + .align 4 + +$L40: + LD $f20, 0 * SIZE(XX) + addq XX, INCX, XX + + fabs $f20, $f25 + cmpteq $f0, $f25, $f29 + + lda $0, 1($0) + fbne $f29, $End + br $31, $L40 + .align 4 + +$End: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + + ldt $f6, 32($sp) + lda $sp, STACKSIZE($sp) + ret + + EPILOGUE diff --git a/kernel/alpha/imax.S b/kernel/alpha/imax.S new file mode 100644 index 0000000..b0cf5c8 --- /dev/null +++ b/kernel/alpha/imax.S @@ -0,0 +1,351 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#ifndef USE_MIN +#define CMPLT(a, b) cmptlt a, b +#else +#define CMPLT(a, b) cmptlt b, a +#endif + +#define STACKSIZE 8 * 8 + + PROLOGUE + PROFCODE + + clr $0 + mov X, XX + .align 4 + + cmplt $31, N, $2 + cmplt $31, INCX, $3 + SXADDQ INCX, $31, INCX + and $2, $3, $2 + + sra N, 3, $1 + fclr $f0 + unop + beq $2, $End # if (n <= 0) or (incx <= 0) return + .align 4 + + LD $f0, 0 * SIZE(X) + unop + unop + ble $1, $L15 + .align 4 + + fmov $f0, $f1 + addq X, INCX, X + fmov $f0, $f10 + lda $1, -1($1) + + LD $f21, 0 * SIZE(X) + fmov $f0, $f11 + addq X, INCX, X + fmov $f0, $f12 + + LD $f22, 0 * SIZE(X) + fmov $f0, $f13 + addq X, INCX, X + fmov $f0, $f14 + + LD $f23, 0 * SIZE(X) + fmov $f0, $f15 + addq X, INCX, X + fmov $f0, $f20 + + LD $f24, 0 * SIZE(X) + addq X, INCX, X + LD $f25, 0 * SIZE(X) + addq X, INCX, X + LD $f26, 0 * SIZE(X) + addq X, INCX, X + LD $f27, 0 * SIZE(X) + addq X, INCX, X + + CMPLT($f0, $f20), $f16 + CMPLT($f1, $f21), $f17 + CMPLT($f10, $f22), $f18 + CMPLT($f11, $f23), $f19 + + ble $1, $L13 + .align 4 + +$L12: + fcmovne $f16, $f20, $f0 + LD $f20, 0 * SIZE(X) + CMPLT($f12, $f24), $f16 + addq X, INCX, X + + fcmovne $f17, $f21, $f1 + LD $f21, 0 * SIZE(X) + CMPLT($f13, $f25), $f17 + addq X, INCX, X + + fcmovne $f18, $f22, $f10 + LD $f22, 0 * SIZE(X) + CMPLT($f14, $f26), $f18 + addq X, INCX, X + + fcmovne $f19, $f23, $f11 + LD $f23, 0 * SIZE(X) + CMPLT($f15, $f27), $f19 + addq X, INCX, X + + fcmovne $f16, $f24, $f12 + LD $f24, 0 * SIZE(X) + CMPLT($f0, $f20), $f16 + addq X, INCX, X + + fcmovne $f17, $f25, $f13 + LD $f25, 0 * SIZE(X) + CMPLT($f1, $f21), $f17 + addq X, INCX, X + + fcmovne $f18, $f26, $f14 + LD $f26, 0 * SIZE(X) + CMPLT($f10, $f22), $f18 + addq X, INCX, X + + fcmovne $f19, $f27, $f15 + LD $f27, 0 * SIZE(X) + CMPLT($f11, $f23), $f19 + lda $1, -1($1) # i -- + + addq X, INCX, X + unop + unop + bgt $1,$L12 + .align 4 + +$L13: + fcmovne $f16, $f20, $f0 + CMPLT($f12, $f24), $f16 + + fcmovne $f17, $f21, $f1 + CMPLT($f13, $f25), $f17 + + fcmovne $f18, $f22, $f10 + CMPLT($f14, $f26), $f18 + + fcmovne $f19, $f23, $f11 + CMPLT($f15, $f27), $f19 + + fcmovne $f16, $f24, $f12 + CMPLT($f0, $f1), $f16 + fcmovne $f17, $f25, $f13 + CMPLT($f10, $f11), $f17 + + fcmovne $f18, $f26, $f14 + CMPLT($f12, $f13), $f18 + fcmovne $f19, $f27, $f15 + CMPLT($f14, $f15), $f19 + + fcmovne $f16, $f1, $f0 + fcmovne $f17, $f11, $f10 + fcmovne $f18, $f13, $f12 + fcmovne $f19, $f15, $f14 + + CMPLT($f0, $f10), $f16 + CMPLT($f12, $f14), $f17 + + fcmovne $f16, $f10, $f0 + fcmovne $f17, $f14, $f12 + + CMPLT($f0, $f12), $f16 + fcmovne $f16, $f12, $f0 + .align 4 + +$L15: + and N, 7, $1 + unop + unop + ble $1, $L20 + .align 4 + +$L16: + LD $f20, 0 * SIZE(X) + addq X, INCX, X + + CMPLT($f0, $f20), $f16 + fcmovne $f16, $f20, $f0 + lda $1, -1($1) # i -- + bgt $1, $L16 + .align 4 + +$L20: + sra N, 3, $1 + ble $1, $L40 + .align 4 + + LD $f10, 0 * SIZE(XX) + addq XX, INCX, XX + LD $f11, 0 * SIZE(XX) + addq XX, INCX, XX + + LD $f12, 0 * SIZE(XX) + addq XX, INCX, XX + LD $f13, 0 * SIZE(XX) + addq XX, INCX, XX + + LD $f14, 0 * SIZE(XX) + addq XX, INCX, XX + LD $f15, 0 * SIZE(XX) + addq XX, INCX, XX + + LD $f16, 0 * SIZE(XX) + addq XX, INCX, XX + LD $f17, 0 * SIZE(XX) + addq XX, INCX, XX + + cmpteq $f0, $f10, $f20 + cmpteq $f0, $f11, $f21 + cmpteq $f0, $f12, $f22 + cmpteq $f0, $f13, $f23 + + lda $1, -1($1) + ble $1, $L23 + .align 4 + +$L22: + LD $f10, 0 * SIZE(XX) + cmpteq $f0, $f14, $f24 + lda $0, 1($0) + addq XX, INCX, XX + fbne $f20, $End + + LD $f11, 0 * SIZE(XX) + cmpteq $f0, $f15, $f25 + lda $0, 1($0) + addq XX, INCX, XX + fbne $f21, $End + + LD $f12, 0 * SIZE(XX) + cmpteq $f0, $f16, $f26 + lda $0, 1($0) + addq XX, INCX, XX + fbne $f22, $End + + LD $f13, 0 * SIZE(XX) + cmpteq $f0, $f17, $f27 + lda $0, 1($0) + addq XX, INCX, XX + fbne $f23, $End + + LD $f14, 0 * SIZE(XX) + cmpteq $f0, $f10, $f20 + lda $0, 1($0) + addq XX, INCX, XX + fbne $f24, $End + + LD $f15, 0 * SIZE(XX) + cmpteq $f0, $f11, $f21 + lda $0, 1($0) + addq XX, INCX, XX + fbne $f25, $End + + LD $f16, 0 * SIZE(XX) + lda $1, -1($1) # i -- + cmpteq $f0, $f12, $f22 + lda $0, 1($0) + addq XX, INCX, XX + fbne $f26, $End + + LD $f17, 0 * SIZE(XX) + cmpteq $f0, $f13, $f23 + lda $0, 1($0) + addq XX, INCX, XX + fbne $f27, $End + + bgt $1, $L22 + .align 4 + +$L23: + lda $0, 1($0) + cmpteq $f0, $f14, $f24 + unop + fbne $f20, $End + + lda $0, 1($0) + cmpteq $f0, $f15, $f25 + unop + fbne $f21, $End + + lda $0, 1($0) + cmpteq $f0, $f16, $f26 + unop + fbne $f22, $End + + lda $0, 1($0) + cmpteq $f0, $f17, $f27 + unop + fbne $f23, $End + + lda $0, 1($0) + fbne $f24, $End + lda $0, 1($0) + fbne $f25, $End + lda $0, 1($0) + fbne $f26, $End + lda $0, 1($0) + fbne $f27, $End + .align 4 + +$L40: + LD $f20, 0 * SIZE(XX) + addq XX, INCX, XX + + cmpteq $f0, $f20, $f29 + + lda $0, 1($0) + fbne $f29, $End + br $31, $L40 + .align 4 + +$End: + ret + + EPILOGUE diff --git a/kernel/alpha/izamax.S b/kernel/alpha/izamax.S new file mode 100644 index 0000000..2269b12 --- /dev/null +++ b/kernel/alpha/izamax.S @@ -0,0 +1,427 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#ifndef USE_MIN +#define CMPLT(a, b) cmptlt a, b +#else +#define CMPLT(a, b) cmptlt b, a +#endif + +#define STACKSIZE 8 * 8 + + PROLOGUE + PROFCODE + + lda $sp, -STACKSIZE($sp) + + stt $f2, 0($sp) + fclr $f16 + cmplt $31, N, $2 + unop + + stt $f3, 8($sp) + fclr $f17 + cmplt $31, INCX, $3 + unop + + stt $f4, 16($sp) + fclr $f18 + SXADDQ INCX, $31, INCX + unop + + stt $f5, 24($sp) + fclr $f19 + and $2, $3, $2 + clr $0 + + stt $f6, 32($sp) + mov X, XX + + stt $f7, 40($sp) + stt $f8, 48($sp) + stt $f9, 56($sp) + + fclr $f0 + beq $2, $End # if (n <= 0) or (incx <= 0) return + .align 4 + + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + sra N, 2, $1 + addq INCX, INCX, INCX + + fabs $f20, $f20 + fabs $f21, $f21 + addt $f20, $f21, $f0 + ble $1, $L15 + .align 4 + + lda $1, -1($1) + unop + addq X, INCX, X + unop + + LD $f22, 0 * SIZE(X) + fmov $f0, $f1 + LD $f23, 1 * SIZE(X) + addq X, INCX, X + + LD $f24, 0 * SIZE(X) + fmov $f0, $f2 + LD $f25, 1 * SIZE(X) + addq X, INCX, X + + LD $f26, 0 * SIZE(X) + fmov $f0, $f3 + LD $f27, 1 * SIZE(X) + addq X, INCX, X + + fabs $f20, $f8 + fabs $f21, $f9 + fabs $f22, $f10 + fabs $f23, $f11 + + fabs $f24, $f12 + fabs $f25, $f13 + fabs $f26, $f14 + fabs $f27, $f15 + + ble $1, $L14 + .align 4 + + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + lda $1, -1($1) + addq X, INCX, X + + LD $f22, 0 * SIZE(X) + LD $f23, 1 * SIZE(X) + unop + addq X, INCX, X + + LD $f24, 0 * SIZE(X) + LD $f25, 1 * SIZE(X) + unop + addq X, INCX, X + + LD $f26, 0 * SIZE(X) + LD $f27, 1 * SIZE(X) + addq X, INCX, X + ble $1, $L13 + .align 4 + +$L12: + addt $f8, $f9, $f16 + unop + fabs $f20, $f8 + ldl $31, 64 * SIZE(X) + + addt $f10, $f11, $f17 + unop + fabs $f21, $f9 + LD $f20, 0 * SIZE(X) + + addt $f12, $f13, $f18 + LD $f21, 1 * SIZE(X) + fabs $f22, $f10 + addq X, INCX, X + + addt $f14, $f15, $f19 + LD $f22, 0 * SIZE(X) + fabs $f23, $f11 + unop + + CMPLT($f0, $f16), $f4 + LD $f23, 1 * SIZE(X) + fabs $f24, $f12 + addq X, INCX, X + + CMPLT($f1, $f17), $f5 + LD $f24, 0 * SIZE(X) + fabs $f25, $f13 + unop + + CMPLT($f2, $f18), $f6 + LD $f25, 1 * SIZE(X) + fabs $f26, $f14 + addq X, INCX, X + + CMPLT($f3, $f19), $f7 + LD $f26, 0 * SIZE(X) + fabs $f27, $f15 + unop + + fcmovne $f4, $f16, $f0 + LD $f27, 1 * SIZE(X) + addq X, INCX, X + lda $1, -1($1) # i -- + + fcmovne $f5, $f17, $f1 + fcmovne $f6, $f18, $f2 + fcmovne $f7, $f19, $f3 + bgt $1,$L12 + .align 4 + +$L13: + addt $f8, $f9, $f16 + fabs $f20, $f8 + + addt $f10, $f11, $f17 + fabs $f21, $f9 + + addt $f12, $f13, $f18 + fabs $f22, $f10 + + addt $f14, $f15, $f19 + fabs $f23, $f11 + + CMPLT($f0, $f16), $f4 + fabs $f24, $f12 + + CMPLT($f1, $f17), $f5 + fabs $f25, $f13 + + CMPLT($f2, $f18), $f6 + fabs $f26, $f14 + CMPLT($f3, $f19), $f7 + fabs $f27, $f15 + + fcmovne $f4, $f16, $f0 + fcmovne $f5, $f17, $f1 + fcmovne $f6, $f18, $f2 + fcmovne $f7, $f19, $f3 + .align 4 + +$L14: + addt $f8, $f9, $f16 + addt $f10, $f11, $f17 + addt $f12, $f13, $f18 + addt $f14, $f15, $f19 + + CMPLT($f0, $f16), $f4 + CMPLT($f1, $f17), $f5 + CMPLT($f2, $f18), $f6 + CMPLT($f3, $f19), $f7 + + fcmovne $f4, $f16, $f0 + fcmovne $f5, $f17, $f1 + fcmovne $f6, $f18, $f2 + fcmovne $f7, $f19, $f3 + + CMPLT($f0, $f1), $f16 + CMPLT($f2, $f3), $f17 + + fcmovne $f16, $f1, $f0 + fcmovne $f17, $f3, $f2 + + CMPLT($f0, $f2), $f16 + fcmovne $f16, $f2, $f0 + .align 4 + +$L15: + and N, 3, $1 + unop + unop + ble $1, $L20 + .align 4 + +$L16: + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + unop + addq X, INCX, X + + fabs $f20, $f29 + fabs $f21, $f30 + addt $f29, $f30, $f29 + + CMPLT($f0, $f29), $f16 + fcmovne $f16, $f29, $f0 + + lda $1, -1($1) # i -- + bgt $1, $L16 + .align 4 + +$L20: + sra N, 2, $1 + ble $1, $L40 + .align 4 + + LD $f10, 0 * SIZE(XX) + LD $f11, 1 * SIZE(XX) + addq XX, INCX, XX + + LD $f12, 0 * SIZE(XX) + LD $f13, 1 * SIZE(XX) + addq XX, INCX, XX + + LD $f14, 0 * SIZE(XX) + LD $f15, 1 * SIZE(XX) + addq XX, INCX, XX + + LD $f16, 0 * SIZE(XX) + LD $f17, 1 * SIZE(XX) + addq XX, INCX, XX + + fabs $f10, $f18 + fabs $f11, $f19 + fabs $f12, $f20 + fabs $f13, $f21 + + lda $1, -1($1) + ble $1, $L23 + .align 4 + +$L22: + LD $f10, 0 * SIZE(XX) + fabs $f14, $f22 + LD $f11, 1 * SIZE(XX) + addq XX, INCX, XX + + LD $f12, 0 * SIZE(XX) + fabs $f15, $f23 + LD $f13, 1 * SIZE(XX) + addq XX, INCX, XX + + LD $f14, 0 * SIZE(XX) + fabs $f16, $f24 + LD $f15, 1 * SIZE(XX) + addq XX, INCX, XX + + LD $f16, 0 * SIZE(XX) + fabs $f17, $f25 + LD $f17, 1 * SIZE(XX) + addq XX, INCX, XX + + addt $f18, $f19, $f4 + addt $f20, $f21, $f5 + addt $f22, $f23, $f6 + addt $f24, $f25, $f7 + + cmpteq $f0, $f4, $f26 + cmpteq $f0, $f5, $f27 + cmpteq $f0, $f6, $f28 + cmpteq $f0, $f7, $f29 + + fabs $f10, $f18 + lda $0, 1($0) + lda $1, -1($1) # i -- + fbne $f26, $End + + fabs $f11, $f19 + lda $0, 1($0) + unop + fbne $f27, $End + + fabs $f12, $f20 + lda $0, 1($0) + unop + fbne $f28, $End + + fabs $f13, $f21 + lda $0, 1($0) + fbne $f29, $End + bgt $1, $L22 + .align 4 + +$L23: + fabs $f14, $f22 + fabs $f15, $f23 + fabs $f16, $f24 + fabs $f17, $f25 + + addt $f18, $f19, $f4 + addt $f20, $f21, $f5 + addt $f22, $f23, $f6 + addt $f24, $f25, $f7 + + cmpteq $f0, $f4, $f26 + cmpteq $f0, $f5, $f27 + cmpteq $f0, $f6, $f28 + cmpteq $f0, $f7, $f29 + + lda $0, 1($0) + fbne $f26, $End + lda $0, 1($0) + fbne $f27, $End + lda $0, 1($0) + fbne $f28, $End + lda $0, 1($0) + fbne $f29, $End + .align 4 + +$L40: + LD $f10, 0 * SIZE(XX) + LD $f11, 1 * SIZE(XX) + + addq XX, INCX, XX + + fabs $f10, $f18 + fabs $f11, $f19 + + addt $f18, $f19, $f18 + cmpteq $f0, $f18, $f2 + + lda $0, 1($0) + fbne $f2, $End + br $31, $L40 + .align 4 + +$End: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + ldt $f9, 56($sp) + lda $sp, STACKSIZE($sp) + ret + + EPILOGUE diff --git a/kernel/alpha/lsame.S b/kernel/alpha/lsame.S new file mode 100644 index 0000000..082f790 --- /dev/null +++ b/kernel/alpha/lsame.S @@ -0,0 +1,76 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "version.h" + + .set noat + .set noreorder +.text + .align 5 + .globl lsame_ + .ent lsame_ +lsame_: + .frame $sp,0,$26,0 +#ifdef PROFILE + ldgp $gp, 0($27) + lda $28, _mcount + jsr $28, ($28), _mcount + .prologue 1 +#else + .prologue 0 +#endif + + ldq_u $5, 0($16) + ldq_u $6, 0($17) + extbl $5, $16, $5 + extbl $6, $17, $6 + + subl $5, 96, $1 + subl $6, 96, $2 + subl $5, 32, $3 + subl $6, 32, $4 + + cmovgt $1, $3, $5 + cmovgt $2, $4, $6 + cmpeq $5, $6, $0 + .align 4 + +$End: + ret + .end lsame_ + .ident VERSION diff --git a/kernel/alpha/max.S b/kernel/alpha/max.S new file mode 100644 index 0000000..af1b8fb --- /dev/null +++ b/kernel/alpha/max.S @@ -0,0 +1,227 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 + +#ifndef USE_MIN +#define CMPLT(a, b) cmptlt a, b +#else +#define CMPLT(a, b) cmptlt b, a +#endif + +#define STACKSIZE 8 * 8 + + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + +#ifdef F_INTERFACE + ldl N, 0(N) # n + ldl INCX, 0(INCX) # incx +#endif + lda $sp, -STACKSIZE($sp) + nop + .align 4 + + cmplt $31, N, $2 + cmplt $31, INCX, $3 + SXADDQ INCX, $31, INCX + and $2, $3, $0 + + sra N, 3, $1 + fclr $f0 + unop + beq $0, $End # if (n <= 0) or (incx <= 0) return + .align 4 + + LD $f0, 0 * SIZE(X) + unop + unop + ble $1, $L15 + .align 4 + + fmov $f0, $f1 + addq X, INCX, X + fmov $f0, $f10 + lda $1, -1($1) + + LD $f21, 0 * SIZE(X) + fmov $f0, $f11 + addq X, INCX, X + fmov $f0, $f12 + + LD $f22, 0 * SIZE(X) + fmov $f0, $f13 + addq X, INCX, X + fmov $f0, $f14 + + LD $f23, 0 * SIZE(X) + fmov $f0, $f15 + addq X, INCX, X + fmov $f0, $f20 + + LD $f24, 0 * SIZE(X) + addq X, INCX, X + LD $f25, 0 * SIZE(X) + addq X, INCX, X + LD $f26, 0 * SIZE(X) + addq X, INCX, X + LD $f27, 0 * SIZE(X) + addq X, INCX, X + + CMPLT($f0, $f20), $f16 + CMPLT($f1, $f21), $f17 + CMPLT($f10, $f22), $f18 + CMPLT($f11, $f23), $f19 + + ble $1, $L13 + .align 4 + +$L12: + fcmovne $f16, $f20, $f0 + LD $f20, 0 * SIZE(X) + CMPLT($f12, $f24), $f16 + addq X, INCX, X + + fcmovne $f17, $f21, $f1 + LD $f21, 0 * SIZE(X) + CMPLT($f13, $f25), $f17 + addq X, INCX, X + + fcmovne $f18, $f22, $f10 + LD $f22, 0 * SIZE(X) + CMPLT($f14, $f26), $f18 + addq X, INCX, X + + fcmovne $f19, $f23, $f11 + LD $f23, 0 * SIZE(X) + CMPLT($f15, $f27), $f19 + addq X, INCX, X + + fcmovne $f16, $f24, $f12 + LD $f24, 0 * SIZE(X) + CMPLT($f0, $f20), $f16 + addq X, INCX, X + + fcmovne $f17, $f25, $f13 + LD $f25, 0 * SIZE(X) + CMPLT($f1, $f21), $f17 + addq X, INCX, X + + fcmovne $f18, $f26, $f14 + LD $f26, 0 * SIZE(X) + CMPLT($f10, $f22), $f18 + addq X, INCX, X + + fcmovne $f19, $f27, $f15 + LD $f27, 0 * SIZE(X) + CMPLT($f11, $f23), $f19 + lda $1, -1($1) # i -- + + addq X, INCX, X + unop + unop + bgt $1,$L12 + .align 4 + +$L13: + fcmovne $f16, $f20, $f0 + CMPLT($f12, $f24), $f16 + + fcmovne $f17, $f21, $f1 + CMPLT($f13, $f25), $f17 + + fcmovne $f18, $f22, $f10 + CMPLT($f14, $f26), $f18 + + fcmovne $f19, $f23, $f11 + CMPLT($f15, $f27), $f19 + + fcmovne $f16, $f24, $f12 + CMPLT($f0, $f1), $f16 + fcmovne $f17, $f25, $f13 + CMPLT($f10, $f11), $f17 + + fcmovne $f18, $f26, $f14 + CMPLT($f12, $f13), $f18 + fcmovne $f19, $f27, $f15 + CMPLT($f14, $f15), $f19 + + fcmovne $f16, $f1, $f0 + fcmovne $f17, $f11, $f10 + fcmovne $f18, $f13, $f12 + fcmovne $f19, $f15, $f14 + + CMPLT($f0, $f10), $f16 + CMPLT($f12, $f14), $f17 + + fcmovne $f16, $f10, $f0 + fcmovne $f17, $f14, $f12 + + CMPLT($f0, $f12), $f16 + fcmovne $f16, $f12, $f0 + .align 4 + +$L15: + and N, 7, $1 + unop + unop + ble $1, $End + .align 4 + +$L16: + LD $f20, 0 * SIZE(X) + addq X, INCX, X + + CMPLT($f0, $f20), $f16 + fcmovne $f16, $f20, $f0 + lda $1, -1($1) # i -- + bgt $1, $L16 + .align 4 + +$End: + lda $sp, STACKSIZE($sp) + ret + + EPILOGUE diff --git a/kernel/alpha/rot.S b/kernel/alpha/rot.S new file mode 100644 index 0000000..d1656d7 --- /dev/null +++ b/kernel/alpha/rot.S @@ -0,0 +1,624 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 +#define Y $19 +#define INCY $20 +#define I $21 +#define XX $23 +#define YY $24 + +#define C $f10 +#define S $f11 + +#define PREFETCH_SIZE 80 + + PROLOGUE + PROFCODE + .frame $sp, 0, $26, 0 + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + fmov $f21, C + LD S, 0($sp) + + cmpeq INCX, 1, $23 + cmpeq INCY, 1, $24 + ble N, $L998 + + and $23, $24, $23 + beq $23, $L50 + + sra N, 3, I + ble I, $L15 + + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + LD $f15, 1*SIZE(Y) + + LD $f16, 2*SIZE(X) + LD $f17, 2*SIZE(Y) + LD $f18, 3*SIZE(X) + LD $f19, 3*SIZE(Y) + + MUL C, $f12, $f21 + unop + MUL S, $f13, $f22 + MUL C, $f13, $f23 + + LD $f13, 4*SIZE(Y) + MUL S, $f12, $f24 + LD $f12, 4*SIZE(X) + MUL C, $f14, $f25 + + lda I, -1(I) + MUL S, $f15, $f26 + ADD $f21, $f22, $f22 + MUL C, $f15, $f27 + + LD $f15, 5*SIZE(Y) + MUL S, $f14, $f28 + SUB $f23, $f24, $f24 + ble I, $L13 + .align 4 + +$L12: + MUL C, $f16, $f21 + lds $f31, (PREFETCH_SIZE) * SIZE(X) + unop + LD $f14, 5*SIZE(X) + + ST $f22, 0*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + lds $f31, (PREFETCH_SIZE) * SIZE(Y) + unop + LD $f17, 6*SIZE(Y) + + ST $f24, 0*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + LD $f16, 6*SIZE(X) + unop + unop + + ST $f26, 1*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + unop + unop + LD $f19, 7*SIZE(Y) + + ST $f28, 1*SIZE(Y) + MUL S, $f18, $f28 + unop + SUB $f23, $f24, $f24 + + MUL C, $f12, $f21 + LD $f18, 7*SIZE(X) + unop + unop + + ST $f22, 2*SIZE(X) + unop + MUL S, $f13, $f22 + ADD $f25, $f26, $f26 + + MUL C, $f13, $f23 + LD $f13, 8*SIZE(Y) + unop + unop + + ST $f24, 2*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f14, $f25 + LD $f12, 8*SIZE(X) + unop + unop + + ST $f26, 3*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f15, $f27 + LD $f15, 9*SIZE(Y) + unop + unop + + ST $f28, 3*SIZE(Y) + MUL S, $f14, $f28 + unop + SUB $f23, $f24, $f24 + + MUL C, $f16, $f21 + LD $f14, 9*SIZE(X) + unop + unop + + ST $f22, 4*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + LD $f17, 10*SIZE(Y) + unop + unop + + ST $f24, 4*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + LD $f16, 10*SIZE(X) + unop + unop + + ST $f26, 5*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + LD $f19, 11*SIZE(Y) + unop + unop + + ST $f28, 5*SIZE(Y) + MUL S, $f18, $f28 + lda I, -1(I) + SUB $f23, $f24, $f24 + + MUL C, $f12, $f21 + LD $f18, 11*SIZE(X) + unop + unop + + ST $f22, 6*SIZE(X) + MUL S, $f13, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f13, $f23 + LD $f13, 12*SIZE(Y) + lda X, 8*SIZE(X) + unop + + ST $f24, 6*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f14, $f25 + LD $f12, 4*SIZE(X) + lda Y, 8*SIZE(Y) + unop + + ST $f26, -1*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f15, $f27 + LD $f15, 5*SIZE(Y) + unop + unop + + ST $f28, -1*SIZE(Y) + MUL S, $f14, $f28 + SUB $f23, $f24, $f24 + bgt I, $L12 + .align 4 + +$L13: + MUL C, $f16, $f21 + LD $f14, 5*SIZE(X) + unop + unop + + ST $f22, 0*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + unop + unop + LD $f17, 6*SIZE(Y) + + ST $f24, 0*SIZE(Y) + MUL S, $f16, $f24 + LD $f16, 6*SIZE(X) + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + unop + unop + unop + + ST $f26, 1*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + unop + unop + LD $f19, 7*SIZE(Y) + + ST $f28, 1*SIZE(Y) + MUL S, $f18, $f28 + LD $f18, 7*SIZE(X) + SUB $f23, $f24, $f24 + + MUL C, $f12, $f21 + unop + unop + unop + + ST $f22, 2*SIZE(X) + unop + MUL S, $f13, $f22 + ADD $f25, $f26, $f26 + + MUL C, $f13, $f23 + unop + unop + unop + + ST $f24, 2*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f14, $f25 + unop + unop + unop + + ST $f26, 3*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f15, $f27 + unop + unop + unop + + ST $f28, 3*SIZE(Y) + MUL S, $f14, $f28 + unop + SUB $f23, $f24, $f24 + + MUL C, $f16, $f21 + unop + unop + unop + + ST $f22, 4*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + unop + unop + unop + + ST $f24, 4*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + unop + unop + unop + + ST $f26, 5*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + unop + unop + unop + + ST $f28, 5*SIZE(Y) + MUL S, $f18, $f28 + unop + SUB $f23, $f24, $f24 + + ST $f22, 6*SIZE(X) + ADD $f25, $f26, $f26 + ST $f24, 6*SIZE(Y) + SUB $f27, $f28, $f28 + + ST $f26, 7*SIZE(X) + lda X, 8*SIZE(X) + ST $f28, 7*SIZE(Y) + lda Y, 8*SIZE(Y) + .align 4 + + +$L15: + and N, 7, I + ble I, $L998 + .align 4 + +$L16: + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f25 + SUB $f23, $f24, $f26 + lda I, -1(I) + + ST $f25, 0*SIZE(X) + lda X, 1 * SIZE(X) + ST $f26, 0*SIZE(Y) + lda Y, 1 * SIZE(Y) + + bgt I, $L16 + .align 4 + +$L998: + clr $0 + ret + .align 4 + +$L50: + mov X, XX + mov Y, YY + + sra N, 3, I + ble I, $L55 + .align 4 + +$L51: + LD $f12, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f13, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f14, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f16, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f17, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f18, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f19, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + ST $f22, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f24, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f26, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f16, $f21 + MUL S, $f17, $f22 + MUL C, $f17, $f23 + MUL S, $f16, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + ST $f22, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f24, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f18, $f25 + MUL S, $f19, $f26 + MUL C, $f19, $f27 + MUL S, $f18, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f26, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + + LD $f12, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f13, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f14, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f16, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f17, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f18, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f19, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + ST $f22, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f24, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f26, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f16, $f21 + MUL S, $f17, $f22 + MUL C, $f17, $f23 + MUL S, $f16, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + ST $f22, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f24, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f18, $f25 + MUL S, $f19, $f26 + MUL C, $f19, $f27 + MUL S, $f18, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f26, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + lda I, -1(I) + bgt I, $L51 + .align 4 + +$L55: + and N, 7, I + ble I, $L999 + .align 4 + +$L56: + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f25 + SUB $f23, $f24, $f26 + lda I, -1(I) + + ST $f25, 0*SIZE(X) + SXADDQ INCX, X, X + ST $f26, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + bgt I, $L56 + .align 4 + +$L999: + clr $0 + ret + EPILOGUE diff --git a/kernel/alpha/scal.S b/kernel/alpha/scal.S new file mode 100644 index 0000000..2d95801 --- /dev/null +++ b/kernel/alpha/scal.S @@ -0,0 +1,480 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $20 +#define INCX $21 + +#define XX $18 +#define I $19 + +#define ALPHA $f19 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define a0 $f12 +#define a1 $f13 +#define a2 $f14 +#define a3 $f15 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 +#define a7 $f21 + +#define t0 $f22 +#define t1 $f23 +#define t2 $f24 +#define t3 $f25 + + PROLOGUE + PROFCODE + + mov X, XX + ble N, $L999 + + cmpeq INCX, 1, $0 + beq $0, $L20 + +#ifndef DOUBLE + sra N, 4, I + ble I, $L15 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD a2, 2 * SIZE(X) + LD a3, 3 * SIZE(X) + + LD a4, 4 * SIZE(X) + MUL a0, ALPHA, t0 + LD a5, 5 * SIZE(X) + MUL a1, ALPHA, t1 + LD a6, 6 * SIZE(X) + MUL a2, ALPHA, t2 + LD a7, 7 * SIZE(X) + MUL a3, ALPHA, t3 + + ST t0, 0 * SIZE(X) + MUL a4, ALPHA, t0 + ST t1, 1 * SIZE(X) + MUL a5, ALPHA, t1 + + ST t2, 2 * SIZE(X) + MUL a6, ALPHA, t2 + ST t3, 3 * SIZE(X) + MUL a7, ALPHA, t3 + + LD a0, 8 * SIZE(X) + LD a1, 9 * SIZE(X) + LD a2, 10 * SIZE(X) + LD a3, 11 * SIZE(X) + + ST t0, 4 * SIZE(X) + MUL a0, ALPHA, t0 + ST t1, 5 * SIZE(X) + MUL a1, ALPHA, t1 + + ST t2, 6 * SIZE(X) + MUL a2, ALPHA, t2 + ST t3, 7 * SIZE(X) + MUL a3, ALPHA, t3 + + LD a4, 12 * SIZE(X) + LD a5, 13 * SIZE(X) + LD a6, 14 * SIZE(X) + LD a7, 15 * SIZE(X) + + lda I, -1(I) + ble I, $L13 + .align 4 + +$L12: + ST t0, 8 * SIZE(X) + MUL a4, ALPHA, t0 + ST t1, 9 * SIZE(X) + MUL a5, ALPHA, t1 + + ST t2, 10 * SIZE(X) + MUL a6, ALPHA, t2 + ST t3, 11 * SIZE(X) + MUL a7, ALPHA, t3 + + LD a0, 16 * SIZE(X) + LD a1, 17 * SIZE(X) + LD a2, 18 * SIZE(X) + LD a3, 19 * SIZE(X) + + ST t0, 12 * SIZE(X) + MUL a0, ALPHA, t0 + ST t1, 13 * SIZE(X) + MUL a1, ALPHA, t1 + + ST t2, 14 * SIZE(X) + MUL a2, ALPHA, t2 + ST t3, 15 * SIZE(X) + MUL a3, ALPHA, t3 + + LD a4, 20 * SIZE(X) + LD a5, 21 * SIZE(X) + LD a6, 22 * SIZE(X) + LD a7, 23 * SIZE(X) + + ST t0, 16 * SIZE(X) + MUL a4, ALPHA, t0 + ST t1, 17 * SIZE(X) + MUL a5, ALPHA, t1 + + ST t2, 18 * SIZE(X) + MUL a6, ALPHA, t2 + ST t3, 19 * SIZE(X) + MUL a7, ALPHA, t3 + + LD a0, 24 * SIZE(X) + LD a1, 25 * SIZE(X) + LD a2, 26 * SIZE(X) + LD a3, 27 * SIZE(X) + + ST t0, 20 * SIZE(X) + MUL a0, ALPHA, t0 + ST t1, 21 * SIZE(X) + MUL a1, ALPHA, t1 + + ST t2, 22 * SIZE(X) + MUL a2, ALPHA, t2 + ST t3, 23 * SIZE(X) + MUL a3, ALPHA, t3 + + LD a4, 28 * SIZE(X) + LD a5, 29 * SIZE(X) + LD a6, 30 * SIZE(X) + LD a7, 31 * SIZE(X) + + lds $f31, PREFETCHSIZE * SIZE(X) + lda I, -1(I) + addq X, 16 * SIZE, X + bne I, $L12 + .align 4 + +$L13: + ST t0, 8 * SIZE(X) + MUL a4, ALPHA, t0 + ST t1, 9 * SIZE(X) + MUL a5, ALPHA, t1 + + ST t2, 10 * SIZE(X) + MUL a6, ALPHA, t2 + ST t3, 11 * SIZE(X) + MUL a7, ALPHA, t3 + + ST t0, 12 * SIZE(X) + ST t1, 13 * SIZE(X) + ST t2, 14 * SIZE(X) + ST t3, 15 * SIZE(X) + addq X, 16 * SIZE, X + .align 4 + +$L15: + and N, 15, I + +#else + + sra N, 3, I + ble I, $L15 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD a2, 2 * SIZE(X) + LD a3, 3 * SIZE(X) + + LD a4, 4 * SIZE(X) + MUL a0, ALPHA, t0 + LD a5, 5 * SIZE(X) + MUL a1, ALPHA, t1 + + LD a6, 6 * SIZE(X) + MUL a2, ALPHA, t2 + LD a7, 7 * SIZE(X) + MUL a3, ALPHA, t3 + + lda I, -1(I) + ble I, $L13 + .align 4 + +$L12: + ST t0, 0 * SIZE(X) + MUL a4, ALPHA, t0 + ST t1, 1 * SIZE(X) + MUL a5, ALPHA, t1 + + ST t2, 2 * SIZE(X) + MUL a6, ALPHA, t2 + ST t3, 3 * SIZE(X) + MUL a7, ALPHA, t3 + + LD a0, 8 * SIZE(X) + lda I, -1(I) + LD a1, 9 * SIZE(X) + addq X, 8 * SIZE, X + + LD a2, 2 * SIZE(X) + LD a3, 3 * SIZE(X) + + ST t0, -4 * SIZE(X) + MUL a0, ALPHA, t0 + ST t1, -3 * SIZE(X) + MUL a1, ALPHA, t1 + + ST t2, -2 * SIZE(X) + MUL a2, ALPHA, t2 + ST t3, -1 * SIZE(X) + MUL a3, ALPHA, t3 + + LD a4, 4 * SIZE(X) + LD a5, 5 * SIZE(X) + + LD a6, 6 * SIZE(X) + LD a7, 7 * SIZE(X) + lds $f31, PREFETCHSIZE * SIZE(X) + bne I, $L12 + .align 4 + +$L13: + ST t0, 0 * SIZE(X) + MUL a4, ALPHA, t0 + ST t1, 1 * SIZE(X) + MUL a5, ALPHA, t1 + + ST t2, 2 * SIZE(X) + MUL a6, ALPHA, t2 + ST t3, 3 * SIZE(X) + MUL a7, ALPHA, t3 + + ST t0, 4 * SIZE(X) + ST t1, 5 * SIZE(X) + ST t2, 6 * SIZE(X) + ST t3, 7 * SIZE(X) + addq X, 8 * SIZE, X + .align 4 + +$L15: + and N, 7, I + +#endif + + unop + unop + ble I, $L999 + .align 4 + +$L17: + LD a0, 0 * SIZE(X) + + MUL a0, ALPHA, t0 + + ST t0, 0 * SIZE(X) + + addq X, SIZE, X + + lda I, -1(I) + bne I, $L17 + ret + .align 4 + +$L20: + sra N, 3, I + ble I, $L25 + + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a1, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + + LD a4, 0 * SIZE(X) + MUL a0, ALPHA, t0 + lda I, -1(I) + SXADDQ INCX, X, X + + LD a5, 0 * SIZE(X) + MUL a1, ALPHA, t1 + SXADDQ INCX, X, X + unop + + LD a6, 0 * SIZE(X) + MUL a2, ALPHA, t2 + SXADDQ INCX, X, X + unop + + LD a7, 0 * SIZE(X) + MUL a3, ALPHA, t3 + SXADDQ INCX, X, X + ble I, $L23 + .align 4 + +$L22: + ST t0, 0 * SIZE(XX) + MUL a4, ALPHA, t0 + lds $f31, PREFETCHSIZE * SIZE(X) + SXADDQ INCX, XX, XX + + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + lda I, -1(I) + unop + + ST t1, 0 * SIZE(XX) + MUL a5, ALPHA, t1 + SXADDQ INCX, XX, XX + unop + + LD a1, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t2, 0 * SIZE(XX) + MUL a6, ALPHA, t2 + SXADDQ INCX, XX, XX + unop + + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t3, 0 * SIZE(XX) + MUL a7, ALPHA, t3 + SXADDQ INCX, XX, XX + unop + + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t0, 0 * SIZE(XX) + MUL a0, ALPHA, t0 + SXADDQ INCX, XX, XX + unop + + LD a4, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t1, 0 * SIZE(XX) + MUL a1, ALPHA, t1 + SXADDQ INCX, XX, XX + unop + + LD a5, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t2, 0 * SIZE(XX) + MUL a2, ALPHA, t2 + SXADDQ INCX, XX, XX + unop + + LD a6, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t3, 0 * SIZE(XX) + MUL a3, ALPHA, t3 + SXADDQ INCX, XX, XX + unop + + LD a7, 0 * SIZE(X) + SXADDQ INCX, X, X + unop + bne I, $L22 + .align 4 + +$L23: + ST t0, 0 * SIZE(XX) + MUL a4, ALPHA, t0 + SXADDQ INCX, XX, XX + + ST t1, 0 * SIZE(XX) + MUL a5, ALPHA, t1 + SXADDQ INCX, XX, XX + + ST t2, 0 * SIZE(XX) + MUL a6, ALPHA, t2 + SXADDQ INCX, XX, XX + + ST t3, 0 * SIZE(XX) + MUL a7, ALPHA, t3 + SXADDQ INCX, XX, XX + + ST t0, 0 * SIZE(XX) + SXADDQ INCX, XX, XX + ST t1, 0 * SIZE(XX) + SXADDQ INCX, XX, XX + ST t2, 0 * SIZE(XX) + SXADDQ INCX, XX, XX + ST t3, 0 * SIZE(XX) + SXADDQ INCX, XX, XX + .align 4 + +$L25: + and N, 7, I + unop + unop + ble I, $L999 + .align 4 + +$L27: + LD a0, 0 * SIZE(X) + + MUL a0, ALPHA, t0 + + ST t0, 0 * SIZE(XX) + + SXADDQ INCX, X, X + SXADDQ INCX, XX, XX + + lda I, -1(I) + bne I, $L27 + .align 4 + +$L999: + ret + EPILOGUE diff --git a/kernel/alpha/snrm2.S b/kernel/alpha/snrm2.S new file mode 100644 index 0000000..b8ccc75 --- /dev/null +++ b/kernel/alpha/snrm2.S @@ -0,0 +1,431 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#include "version.h" + +#define PREFETCH_SIZE 80 + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#define I $0 + +#define a0 $f0 +#define a1 $f1 +#define a2 $f10 +#define a3 $f11 +#define t0 $f12 +#define t1 $f13 +#define t2 $f14 +#define t3 $f15 + +#define x0 $f16 +#define x1 $f17 +#define x2 $f18 +#define x3 $f19 +#define x4 $f20 +#define x5 $f21 +#define x6 $f22 +#define x7 $f23 + + PROLOGUE + +#if defined(EV4) || defined(EV5) + .frame $30,16,$26,0 + .mask 0x4000000,-16 + ldah $29, 0($27) !gpdisp!1 + lda $29, 0($29) !gpdisp!1 + + lda $sp, -16($sp) + ldq $27, sqrt($29) !literal!2 + stq $26, 0($sp) + + PROFCODE + .prologue 1 +#else + PROFCODE +#endif + + fclr a0 + SXADDQ INCX, 0, INCX + fclr a1 + ble N, $L999 + + fclr a2 + cmpeq INCX, SIZE, $0 + fclr a3 + beq $0, $L20 + + fclr t0 + sra N, 4, I + fclr t1 + ble I, $L15 + + fclr t2 + LD x0, 0 * SIZE(X) + fclr t3 + LD x1, 1 * SIZE(X) + + LD x2, 2 * SIZE(X) + LD x3, 3 * SIZE(X) + LD x4, 4 * SIZE(X) + LD x5, 5 * SIZE(X) + LD x6, 6 * SIZE(X) + LD x7, 7 * SIZE(X) + + lda I, -1(I) + ble I, $L12 + .align 4 + +$L11: + addt a0, t0, a0 + ldl $31, (PREFETCH_SIZE) * SIZE(X) + mult x0, x0, t0 + LD x0, 8 * SIZE(X) + + addt a1, t1, a1 + mov X, XX + mult x1, x1, t1 + LD x1, 9 * SIZE(X) + + addt a2, t2, a2 + unop + mult x2, x2, t2 + LD x2, 10 * SIZE(X) + + addt a3, t3, a3 + unop + mult x3, x3, t3 + LD x3, 11 * SIZE(X) + + addt a0, t0, a0 + unop + mult x4, x4, t0 + LD x4, 12 * SIZE(X) + + addt a1, t1, a1 + unop + mult x5, x5, t1 + LD x5, 13 * SIZE(X) + + addt a2, t2, a2 + unop + mult x6, x6, t2 + LD x6, 14 * SIZE(X) + + addt a3, t3, a3 + unop + mult x7, x7, t3 + LD x7, 15 * SIZE(X) + + addt a0, t0, a0 + unop + mult x0, x0, t0 + LD x0, 16 * SIZE(X) + + addt a1, t1, a1 + lda X, 16 * SIZE(X) + mult x1, x1, t1 + LD x1, 17 * SIZE(XX) + + addt a2, t2, a2 + unop + mult x2, x2, t2 + LD x2, 18 * SIZE(XX) + + addt a3, t3, a3 + unop + mult x3, x3, t3 + LD x3, 19 * SIZE(XX) + + addt a0, t0, a0 + unop + mult x4, x4, t0 + LD x4, 20 * SIZE(XX) + + addt a1, t1, a1 + lda I, -1(I) + mult x5, x5, t1 + LD x5, 21 * SIZE(XX) + + addt a2, t2, a2 + unop + mult x6, x6, t2 + LD x6, 22 * SIZE(XX) + + addt a3, t3, a3 + mult x7, x7, t3 + LD x7, 23 * SIZE(XX) + bgt I, $L11 + .align 4 + +$L12: + addt a0, t0, a0 + mov X, XX + mult x0, x0, t0 + LD x0, 8 * SIZE(X) + + addt a1, t1, a1 + unop + mult x1, x1, t1 + LD x1, 9 * SIZE(X) + + addt a2, t2, a2 + unop + mult x2, x2, t2 + LD x2, 10 * SIZE(X) + + addt a3, t3, a3 + unop + mult x3, x3, t3 + LD x3, 11 * SIZE(X) + + addt a0, t0, a0 + unop + mult x4, x4, t0 + LD x4, 12 * SIZE(XX) + + addt a1, t1, a1 + unop + mult x5, x5, t1 + LD x5, 13 * SIZE(XX) + + addt a2, t2, a2 + unop + mult x6, x6, t2 + LD x6, 14 * SIZE(XX) + + addt a3, t3, a3 + lda X, 16 * SIZE(X) + mult x7, x7, t3 + LD x7, 15 * SIZE(XX) + + addt a0, t0, a0 + mult x0, x0, t0 + addt a1, t1, a1 + mult x1, x1, t1 + + addt a2, t2, a2 + mult x2, x2, t2 + addt a3, t3, a3 + mult x3, x3, t3 + + addt a0, t0, a0 + mult x4, x4, t0 + addt a1, t1, a1 + mult x5, x5, t1 + + addt a2, t2, a2 + mult x6, x6, t2 + addt a3, t3, a3 + mult x7, x7, t3 + + addt a1, t1, a1 + addt a2, t2, a2 + addt a3, t3, a3 + .align 4 + +$L15: + and N, 15, I + ble I, $L998 + .align 4 + +$L16: + LD x0, 0 * SIZE(X) + lda X, 1 * SIZE(X) + + addt a0, t0, a0 + mult x0, x0, t0 + + lda I, -1(I) + bgt I, $L16 + bsr $31, $L998 + .align 4 + +$L20: + fclr t0 + sra N, 3, I + fclr t1 + ble I, $L25 + + fclr t2 + fclr t3 + + LD x0, 0 * SIZE(X) + addq X, INCX, X + LD x1, 0 * SIZE(X) + addq X, INCX, X + LD x2, 0 * SIZE(X) + addq X, INCX, X + LD x3, 0 * SIZE(X) + addq X, INCX, X + + LD x4, 0 * SIZE(X) + addq X, INCX, X + LD x5, 0 * SIZE(X) + addq X, INCX, X + LD x6, 0 * SIZE(X) + addq X, INCX, X + + lda I, -1(I) + ble I, $L22 + .align 4 + +$L21: + addt a0, t0, a0 + LD x7, 0 * SIZE(X) + mult x0, x0, t0 + addq X, INCX, X + + addt a1, t1, a1 + LD x0, 0 * SIZE(X) + mult x1, x1, t1 + addq X, INCX, X + + addt a2, t2, a2 + LD x1, 0 * SIZE(X) + mult x2, x2, t2 + addq X, INCX, X + + addt a3, t3, a3 + LD x2, 0 * SIZE(X) + mult x3, x3, t3 + addq X, INCX, X + + addt a0, t0, a0 + LD x3, 0 * SIZE(X) + mult x4, x4, t0 + addq X, INCX, X + + addt a1, t1, a1 + LD x4, 0 * SIZE(X) + mult x5, x5, t1 + addq X, INCX, X + + addt a2, t2, a2 + LD x5, 0 * SIZE(X) + mult x6, x6, t2 + addq X, INCX, X + + addt a3, t3, a3 + LD x6, 0 * SIZE(X) + mult x7, x7, t3 + addq X, INCX, X + + lda I, -1(I) + bgt I, $L21 + .align 4 + +$L22: + addt a0, t0, a0 + LD x7, 0 * SIZE(X) + mult x0, x0, t0 + addq X, INCX, X + + addt a1, t1, a1 + unop + mult x1, x1, t1 + unop + + addt a2, t2, a2 + mult x2, x2, t2 + addt a3, t3, a3 + mult x3, x3, t3 + + addt a0, t0, a0 + mult x4, x4, t0 + addt a1, t1, a1 + mult x5, x5, t1 + + addt a2, t2, a2 + mult x6, x6, t2 + addt a3, t3, a3 + mult x7, x7, t3 + + addt a1, t1, a1 + addt a2, t2, a2 + addt a3, t3, a3 + .align 4 + +$L25: + and N, 7, I + ble I, $L998 + .align 4 + +$L26: + LD x0, 0 * SIZE(X) + addq X, INCX, X + + addt a0, t0, a0 + mult x0, x0, t0 + + lda I, -1(I) + bgt I, $L26 + .align 4 + + +$L998: + addt a0, t0, a0 + + addt a0, a1, a0 + addt a2, a3, a2 + +#if defined(EV4) || defined(EV5) + addt a0, a2, $f16 + jsr $26, ($27), sqrt !lituse_jsr!2 + + ldah $29, 0($26) !gpdisp!3 + lda $29, 0($29) !gpdisp!3 +#else + addt a0, a2, a0 + sqrtt a0, a0 +#endif + .align 4 + +$L999: +#if defined(EV4) || defined(EV5) + ldq $26, 0($sp) + lda $sp, 16($sp) +#endif + ret + EPILOGUE diff --git a/kernel/alpha/staticbuffer.S b/kernel/alpha/staticbuffer.S new file mode 100644 index 0000000..7bbd23d --- /dev/null +++ b/kernel/alpha/staticbuffer.S @@ -0,0 +1,45 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ALLOC_STATIC + .align 8 + .comm alloc_area, (NUM_BUFFERS * BUFFER_SIZE), 16384 +#endif diff --git a/kernel/alpha/swap.S b/kernel/alpha/swap.S new file mode 100644 index 0000000..9e21990 --- /dev/null +++ b/kernel/alpha/swap.S @@ -0,0 +1,249 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + + PROLOGUE + PROFCODE + .frame $sp, 0, $26, 0 + + mov $20, $17 + mov $21, $18 + ldq $19, 0($sp) + ldl $20, 8($sp) +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + subl $18, 1, $1 + subl $20, 1, $2 + ble $16, $SubEnd # if n <= 0 goto $End + or $1, $2, $1 + + sra $16, 3, $21 + + and $16, 7, $22 + bne $1, $Sub + ble $21, $MainRemain + .align 4 + +$MainLoop: + LD $f10, 0*SIZE($19) + LD $f11, 1*SIZE($19) + LD $f12, 2*SIZE($19) + LD $f13, 3*SIZE($19) + LD $f14, 4*SIZE($19) + LD $f15, 5*SIZE($19) + LD $f16, 6*SIZE($19) + LD $f17, 7*SIZE($19) + + LD $f20, 0*SIZE($17) + LD $f21, 1*SIZE($17) + LD $f22, 2*SIZE($17) + LD $f23, 3*SIZE($17) + LD $f24, 4*SIZE($17) + LD $f25, 5*SIZE($17) + LD $f26, 6*SIZE($17) + LD $f27, 7*SIZE($17) + + lds $f31, 32*SIZE($17) + unop + lds $f31, 32*SIZE($19) + subl $21, 1, $21 + + ST $f10, 0*SIZE($17) + ST $f11, 1*SIZE($17) + ST $f12, 2*SIZE($17) + ST $f13, 3*SIZE($17) + ST $f14, 4*SIZE($17) + ST $f15, 5*SIZE($17) + ST $f16, 6*SIZE($17) + ST $f17, 7*SIZE($17) + + ST $f20, 0*SIZE($19) + ST $f21, 1*SIZE($19) + ST $f22, 2*SIZE($19) + ST $f23, 3*SIZE($19) + ST $f24, 4*SIZE($19) + ST $f25, 5*SIZE($19) + ST $f26, 6*SIZE($19) + ST $f27, 7*SIZE($19) + + lda $17, 8*SIZE($17) + lda $19, 8*SIZE($19) + bgt $21, $MainLoop + .align 4 + +$MainRemain: + ble $22, $MainEnd + .align 4 + +$MainRemainLoop: + LD $f10, 0*SIZE($19) + LD $f20, 0*SIZE($17) + lda $17, 1*SIZE($17) + lda $19, 1*SIZE($19) + subl $22, 1, $22 + ST $f10, -1*SIZE($17) + ST $f20, -1*SIZE($19) + bgt $22, $MainRemainLoop + .align 4 + +$MainEnd: + clr $0 + ret + .align 4 + +$Sub: + mov $17, $23 + mov $19, $24 + + ble $21, $SubRemain + .align 4 + +$SubLoop: + LD $f10, 0*SIZE($19) + SXADDQ $20, $19, $19 + LD $f11, 0*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f12, 0*SIZE($19) + SXADDQ $20, $19, $19 + LD $f13, 0*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f14, 0*SIZE($19) + SXADDQ $20, $19, $19 + LD $f15, 0*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f16, 0*SIZE($19) + SXADDQ $20, $19, $19 + LD $f17, 0*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f20, 0*SIZE($17) + SXADDQ $18, $17, $17 + LD $f21, 0*SIZE($17) + SXADDQ $18, $17, $17 + + LD $f22, 0*SIZE($17) + SXADDQ $18, $17, $17 + LD $f23, 0*SIZE($17) + SXADDQ $18, $17, $17 + + LD $f24, 0*SIZE($17) + SXADDQ $18, $17, $17 + LD $f25, 0*SIZE($17) + SXADDQ $18, $17, $17 + + LD $f26, 0*SIZE($17) + SXADDQ $18, $17, $17 + LD $f27, 0*SIZE($17) + SXADDQ $18, $17, $17 + + ST $f10, 0*SIZE($23) + SXADDQ $18, $23, $23 + ST $f11, 0*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f12, 0*SIZE($23) + SXADDQ $18, $23, $23 + ST $f13, 0*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f14, 0*SIZE($23) + SXADDQ $18, $23, $23 + ST $f15, 0*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f16, 0*SIZE($23) + SXADDQ $18, $23, $23 + ST $f17, 0*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f20, 0*SIZE($24) + SXADDQ $20, $24, $24 + ST $f21, 0*SIZE($24) + SXADDQ $20, $24, $24 + + ST $f22, 0*SIZE($24) + SXADDQ $20, $24, $24 + ST $f23, 0*SIZE($24) + SXADDQ $20, $24, $24 + + ST $f24, 0*SIZE($24) + SXADDQ $20, $24, $24 + ST $f25, 0*SIZE($24) + SXADDQ $20, $24, $24 + + ST $f26, 0*SIZE($24) + SXADDQ $20, $24, $24 + ST $f27, 0*SIZE($24) + SXADDQ $20, $24, $24 + + subl $21, 1, $21 + bgt $21, $SubLoop + .align 4 + +$SubRemain: + ble $22, $SubEnd + .align 4 + +$SubRemainLoop: + LD $f10, 0*SIZE($19) + LD $f20, 0*SIZE($17) + + subl $22, 1, $22 + + ST $f10, 0*SIZE($17) + ST $f20, 0*SIZE($19) + + SXADDQ $18, $17, $17 + SXADDQ $20, $19, $19 + bgt $22, $SubRemainLoop + .align 4 + +$SubEnd: + clr $0 + ret + EPILOGUE diff --git a/kernel/alpha/trsm_kernel_4x4_LN.S b/kernel/alpha/trsm_kernel_4x4_LN.S new file mode 100644 index 0000000..a1760c6 --- /dev/null +++ b/kernel/alpha/trsm_kernel_4x4_LN.S @@ -0,0 +1,4068 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#if !defined(EV4) && !defined(EV5) && !defined(EV6) +#error "Architecture is not specified." +#endif + +#ifdef EV6 +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + +#ifdef EV5 +#define PREFETCHSIZE 56 +#define UNOP +#endif + +#ifdef EV4 +#define UNOP +#endif + +#define STACKSIZE 80 + +#define M $16 +#define N $17 +#define K $18 +#define A $20 +#define B $21 +#define C $22 +#define LDC $23 + +#define C1 $19 +#define C2 $24 +#define C3 $25 +#define C4 $27 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define AORIG $3 +#define OFFSET $4 + + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + + lda $sp, -STACKSIZE($sp) + + ldq C, 0 + STACKSIZE($sp) + ldq LDC, 8 + STACKSIZE($sp) + ldq OFFSET, 16 + STACKSIZE($sp) + + SXADDQ LDC, 0, LDC + + stt $f2, 0($sp) + stt $f3, 8($sp) + stt $f4, 16($sp) + stt $f5, 24($sp) + stt $f6, 32($sp) + stt $f7, 40($sp) + stt $f8, 48($sp) + stt $f9, 56($sp) + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#ifdef LN + mulq M, K, TMP1 + SXADDQ TMP1, A, A + SXADDQ M, C, C +#endif + +#ifdef RN + negq OFFSET, KK +#endif + +#ifdef RT + mulq N, K, TMP1 + SXADDQ TMP1, B, B + + mulq N, LDC, TMP1 + addq TMP1, C, C + + subq N, OFFSET, KK +#endif + + sra N, 2, J + ble J, $L40 + .align 4 + +$L01: +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + subq B, TMP1, B + + s4addq LDC, 0, TMP1 + subq C, TMP1, C +#endif + + mov C, C1 + addq C, LDC, C2 + addq C2, LDC, C3 +#ifndef RT + s4addq LDC, C, C +#endif + + fclr t1 + addq C3, LDC, C4 + fclr t2 + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + fclr t3 + fclr t4 + + and M, 1, I + ble I, $L20 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) + lda L, -2(KK) + LD b2, 1 * SIZE(B) + lda AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c09 + LD b4, 3 * SIZE(B) + fclr c13 + + lda BO, 4 * SIZE(B) + ble KK, $L38 + + ble L, $L35 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + lda L, -2(TMP1) + LD b2, 1 * SIZE(BO) + lda AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c09 + LD b4, 3 * SIZE(BO) + fclr c13 + + lda BO, 4 * SIZE(BO) + ble TMP1, $L38 + + ble L, $L35 +#endif + .align 4 + +$L32: + ADD c01, t1, c01 + lda L, -2(L) + MUL a1, b1, t1 + LD b1, 0 * SIZE(BO) + + ADD c05, t2, c05 + lda AO, 2 * SIZE(AO) + MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, c09 + LD b5, 3 * SIZE(BO) + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a1, b4, t4 + LD a1, -1 * SIZE(AO) + + ADD c01, t1, c01 + MUL a2, b1, t1 + LD b1, 4 * SIZE(BO) + lda BO, 8 * SIZE(BO) + + ADD c05, t2, c05 + MUL a2, b2, t2 + LD b2, -3 * SIZE(BO) + + ADD c09, t3, c09 + LD b4, -1 * SIZE(BO) + MUL a2, b3, t3 + LD b3, -2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a2, b5, t4 + LD a2, 0 * SIZE(AO) + bgt L, $L32 + .align 4 + +$L35: + ADD c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L37 +#else + blbs TMP1, $L37 +#endif + .align 4 + + ADD c05, t2, c05 + LD b1, 0 * SIZE(BO) + MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, c09 + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a1, b4, t4 + LD a1, 0 * SIZE(AO) + lda AO, 1 * SIZE(AO) + + ADD c01, t1, c01 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L37: + ADD c05, t2, c05 + MUL a1, b2, t2 + ADD c09, t3, c09 + MUL a1, b3, t3 + + ADD c13, t4, c13 + lda AO, 1 * SIZE(AO) + MUL a1, b4, t4 + lda BO, 4 * SIZE(BO) + + ADD c01, t1, c01 + ADD c05, t2, c05 + ADD c09, t3, c09 + ADD c13, t4, c13 + +$L38: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 1, TMP1 +#else + subq KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO +#else + lda AO, -1 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + MUL a1, c09, c09 + MUL a1, c13, c13 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c05, t1, c05 + MUL a3, c01, t1 + SUB c09, t1, c09 + MUL a4, c01, t1 + SUB c13, t1, c13 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, c05 + MUL b2, c05, t1 + SUB c09, t1, c09 + MUL b3, c05, t1 + SUB c13, t1, c13 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, c09 + MUL a2, c09, t1 + SUB c13, t1, c13 + MUL a3, c13, c13 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, c13 + MUL a2, c13, t1 + SUB c09, t1, c09 + MUL a3, c13, t1 + SUB c05, t1, c05 + MUL a4, c13, t1 + SUB c01, t1, c01 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, c09 + MUL b2, c09, t1 + SUB c05, t1, c05 + MUL b3, c09, t1 + SUB c01, t1, c01 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a2, c05, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c05, 1 * SIZE(AO) + ST c09, 2 * SIZE(AO) + ST c13, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -1 * SIZE(C1) + lda C2, -1 * SIZE(C2) + lda C3, -1 * SIZE(C3) + lda C4, -1 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c09, 0 * SIZE(C3) + ST c13, 0 * SIZE(C4) + +#ifdef RT + sll K, 0 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L20: + and M, 2, I + ble I, $L30 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(B) + lda L, -2(KK) + LD b2, 1 * SIZE(B) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c01 + LD b4, 3 * SIZE(B) + fclr c05 + + lda BO, 4 * SIZE(B) + fclr c02 + fclr c06 + ble KK, $L28 + + ble L, $L25 + +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(BO) + lda L, -2(TMP1) + LD b2, 1 * SIZE(BO) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c01 + LD b4, 3 * SIZE(BO) + fclr c05 + + lda BO, 4 * SIZE(BO) + fclr c02 + fclr c06 + ble TMP1, $L28 + + ble L, $L25 +#endif + .align 4 + +$L22: + ADD c09, t1, c09 + unop + MUL a1, b1, t1 + unop + + ADD c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a1, b2, t3 + lda BO, 8 * SIZE(BO) + + ADD c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + unop + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD c06, t4, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + + ADD c09, t1, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a3, b2, t3 + lda AO, 4 * SIZE(AO) + + ADD c14, t4, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD c01, t1, c01 + lda L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, c06 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD c09, t1, c09 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L27 +#else + blbs TMP1, $L27 +#endif + + ADD c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a1, b2, t3 + unop + + ADD c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + lda AO, 2 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD c09, t1, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L27: + ADD c10, t2, c10 + MUL a2, b1, t2 + ADD c13, t3, c13 + MUL a1, b2, t3 + + ADD c14, t4, c14 + MUL a2, b2, t4 + ADD c01, t1, c01 + MUL a1, b3, t1 + + ADD c02, t2, c02 + MUL a2, b3, t2 + ADD c05, t3, c05 + MUL a1, b4, t3 + + ADD c06, t4, c06 + lda AO, 2 * SIZE(AO) + MUL a2, b4, t4 + lda BO, 4 * SIZE(BO) + + ADD c09, t1, c09 + ADD c10, t2, c10 + ADD c13, t3, c13 + ADD c14, t4, c14 + .align 4 + +$L28: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO +#else + lda AO, -2 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 + + SUB b1, c02, c02 + SUB b2, c06, c06 + SUB b3, c10, c10 + SUB b4, c14, c14 + +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c05, c05 + SUB a4, c06, c06 + + SUB b1, c09, c09 + SUB b2, c10, c10 + SUB b3, c13, c13 + SUB b4, c14, c14 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + MUL a1, c10, c10 + MUL a1, c14, c14 + + MUL a2, c02, t1 + MUL a2, c06, t2 + MUL a2, c10, t3 + MUL a2, c14, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + MUL a3, c01, c01 + MUL a3, c05, c05 + MUL a3, c09, c09 + MUL a3, c13, c13 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + MUL a1, c09, c09 + MUL a1, c13, c13 + + MUL a2, c01, t1 + MUL a2, c05, t2 + MUL a2, c09, t3 + MUL a2, c13, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL a3, c02, c02 + MUL a3, c06, c06 + MUL a3, c10, c10 + MUL a3, c14, c14 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + + MUL a2, c01, t1 + MUL a2, c02, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL a3, c01, t1 + MUL a3, c02, t2 + + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL a4, c01, t1 + MUL a4, c02, t2 + + SUB c13, t1, c13 + SUB c14, t2, c14 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, c05 + MUL b1, c06, c06 + + MUL b2, c05, t1 + MUL b2, c06, t2 + + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL b3, c05, t1 + MUL b3, c06, t2 + + SUB c13, t1, c13 + SUB c14, t2, c14 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, c09 + MUL a1, c10, c10 + + MUL a2, c09, t1 + MUL a2, c10, t2 + + SUB c13, t1, c13 + SUB c14, t2, c14 + + MUL a3, c13, c13 + MUL a3, c14, c14 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, c13 + MUL a1, c14, c14 + + MUL a2, c13, t1 + MUL a2, c14, t2 + + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL a3, c13, t1 + MUL a3, c14, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL a4, c13, t1 + MUL a4, c14, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, c09 + MUL b1, c10, c10 + + MUL b2, c09, t1 + MUL b2, c10, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL b3, c09, t1 + MUL b3, c10, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + + MUL a2, c05, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + MUL a3, c01, c01 + MUL a3, c02, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) + + ST c02, 4 * SIZE(BO) + ST c06, 5 * SIZE(BO) + ST c10, 6 * SIZE(BO) + ST c14, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c05, 2 * SIZE(AO) + ST c06, 3 * SIZE(AO) + + ST c09, 4 * SIZE(AO) + ST c10, 5 * SIZE(AO) + ST c13, 6 * SIZE(AO) + ST c14, 7 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) + lda C2, -2 * SIZE(C2) + lda C3, -2 * SIZE(C3) + lda C4, -2 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + + ST c09, 0 * SIZE(C3) + ST c10, 1 * SIZE(C3) + ST c13, 0 * SIZE(C4) + ST c14, 1 * SIZE(C4) + +#ifndef LN + lda C1, 2 * SIZE(C1) + lda C2, 2 * SIZE(C2) + lda C3, 2 * SIZE(C3) + lda C4, 2 * SIZE(C4) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + .align 4 + +$L30: + sra M, 2, I + ble I, $L39 + .align 4 + +$L11: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + + LD b3, 2 * SIZE(B) + fclr c06 + LD b4, 3 * SIZE(B) + fclr c05 + + lds $f31, 4 * SIZE(C1) + fclr c03 + lda L, -2(KK) + fclr c04 + + lds $f31, 7 * SIZE(C2) + fclr c08 + lda BO, 4 * SIZE(B) + fclr c13 + + lds $f31, 4 * SIZE(C3) + fclr c09 + lda AO, 4 * SIZE(AO) + fclr c10 + + lds $f31, 7 * SIZE(C4) + fclr c14 + fclr c07 + ble KK, $L18 +#else + +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addq AORIG, TMP1, AO + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + + LD b3, 2 * SIZE(BO) + fclr c06 + LD b4, 3 * SIZE(BO) + fclr c05 + + lds $f31, 4 * SIZE(C1) + fclr c03 + lda L, -2(TMP1) + fclr c04 + + lds $f31, 7 * SIZE(C2) + fclr c08 + lda BO, 4 * SIZE(BO) + fclr c13 + + lds $f31, 4 * SIZE(C3) + fclr c09 + lda AO, 4 * SIZE(AO) + fclr c10 + + lds $f31, 7 * SIZE(C4) + fclr c14 + fclr c07 + ble TMP1, $L18 +#endif + + ble L, $L15 + .align 5 + +$L12: +/* 1 */ + ADD c11, t1, c11 +#ifndef EV4 + ldq $31, PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + ldl $31, PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD c12, t2, c12 + unop + MUL b1, a2, t2 + unop + + ADD c16, t3, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD c15, t4, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + +/* 2 */ + ADD c01, t1, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD c02, t2, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD c03, t1, c03 + unop + MUL b3, a1, t1 + unop + + ADD c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD c09, t1, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD c07, t4, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD c11, t1, c11 + unop + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD c12, t2, c12 + lda L, -2(L) + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD c16, t3, c16 + unop + MUL b2, a2, t3 + unop + + ADD c15, t4, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD c01, t1, c01 + unop + MUL b5, a6, t1 + unop + + ADD c02, t2, c02 + unop + MUL b5, a4, t2 + unop + + ADD c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD c03, t1, c03 + lda AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD c04, t2, c04 + lda BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD c09, t1, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD c11, t1, c11 + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L17 +#else + blbs TMP1, $L17 +#endif + .align 4 + + ADD c12, t2, c12 + MUL b1, a2, t2 + ADD c16, t3, c16 + MUL b2, a2, t3 + + ADD c15, t4, c15 + MUL b2, a1, t4 + ADD c01, t1, c01 + MUL b1, a3, t1 + + ADD c02, t2, c02 + unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD c06, t3, c06 + MUL b2, a4, t3 + ADD c05, t4, c05 + MUL b4, a1, t4 + + ADD c03, t1, c03 + unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + + ADD c09, t1, c09 + unop + MUL b3, a3, t1 + lda AO, 4 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, c07 + unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + + ADD c11, t1, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L17: + ADD c12, t2, c12 + MUL b1, a2, t2 + ADD c16, t3, c16 + MUL b2, a2, t3 + + ADD c15, t4, c15 + MUL b2, a1, t4 + ADD c01, t1, c01 + MUL b1, a3, t1 + + ADD c02, t2, c02 + MUL b1, a4, t2 + ADD c06, t3, c06 + MUL b2, a4, t3 + + ADD c05, t4, c05 + MUL b4, a1, t4 + ADD c03, t1, c03 + MUL b3, a1, t1 + + ADD c04, t2, c04 + MUL b3, a2, t2 + ADD c08, t3, c08 + MUL b4, a2, t3 + + ADD c13, t4, c13 + MUL b2, a3, t4 + ADD c09, t1, c09 + MUL b3, a3, t1 + + ADD c10, t2, c10 + MUL b3, a4, t2 + ADD c14, t3, c14 + MUL b4, a4, t3 + + ADD c07, t4, c07 + lda AO, 4 * SIZE(AO) + MUL b4, a3, t4 + lda BO, 4 * SIZE(BO) + + ADD c11, t1, c11 + ADD c12, t2, c12 + ADD c16, t3, c16 + ADD c15, t4, c15 + .align 4 + +$L18: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 4, TMP1 +#else + subq KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO +#else + lda AO, -4 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 + + SUB b1, c02, c02 + SUB b2, c06, c06 + SUB b3, c10, c10 + SUB b4, c14, c14 + + LD a1, 8 * SIZE(BO) + LD a2, 9 * SIZE(BO) + LD a3, 10 * SIZE(BO) + LD a4, 11 * SIZE(BO) + + LD b1, 12 * SIZE(BO) + LD b2, 13 * SIZE(BO) + LD b3, 14 * SIZE(BO) + LD b4, 15 * SIZE(BO) + + SUB a1, c03, c03 + SUB a2, c07, c07 + SUB a3, c11, c11 + SUB a4, c15, c15 + + SUB b1, c04, c04 + SUB b2, c08, c08 + SUB b3, c12, c12 + SUB b4, c16, c16 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 + + SUB b1, c05, c05 + SUB b2, c06, c06 + SUB b3, c07, c07 + SUB b4, c08, c08 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + + LD b1, 12 * SIZE(AO) + LD b2, 13 * SIZE(AO) + LD b3, 14 * SIZE(AO) + LD b4, 15 * SIZE(AO) + + SUB a1, c09, c09 + SUB a2, c10, c10 + SUB a3, c11, c11 + SUB a4, c12, c12 + + SUB b1, c13, c13 + SUB b2, c14, c14 + SUB b3, c15, c15 + SUB b4, c16, c16 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, c04 + MUL a1, c08, c08 + MUL a1, c12, c12 + MUL a1, c16, c16 + + MUL a2, c04, t1 + MUL a2, c08, t2 + MUL a2, c12, t3 + MUL a2, c16, t4 + + SUB c03, t1, c03 + SUB c07, t2, c07 + SUB c11, t3, c11 + SUB c15, t4, c15 + + MUL a3, c04, t1 + MUL a3, c08, t2 + MUL a3, c12, t3 + MUL a3, c16, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL a4, c04, t1 + MUL a4, c08, t2 + MUL a4, c12, t3 + MUL a4, c16, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, c03 + MUL b1, c07, c07 + MUL b1, c11, c11 + MUL b1, c15, c15 + + MUL b2, c03, t1 + MUL b2, c07, t2 + MUL b2, c11, t3 + MUL b2, c15, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL b3, c03, t1 + MUL b3, c07, t2 + MUL b3, c11, t3 + MUL b3, c15, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + MUL a1, c10, c10 + MUL a1, c14, c14 + + MUL a2, c02, t1 + MUL a2, c06, t2 + MUL a2, c10, t3 + MUL a2, c14, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + MUL a3, c01, c01 + MUL a3, c05, c05 + MUL a3, c09, c09 + MUL a3, c13, c13 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + MUL a1, c09, c09 + MUL a1, c13, c13 + + MUL a2, c01, t1 + MUL a2, c05, t2 + MUL a2, c09, t3 + MUL a2, c13, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL a3, c01, t1 + MUL a3, c05, t2 + MUL a3, c09, t3 + MUL a3, c13, t4 + + SUB c03, t1, c03 + SUB c07, t2, c07 + SUB c11, t3, c11 + SUB c15, t4, c15 + + MUL a4, c01, t1 + MUL a4, c05, t2 + MUL a4, c09, t3 + MUL a4, c13, t4 + + SUB c04, t1, c04 + SUB c08, t2, c08 + SUB c12, t3, c12 + SUB c16, t4, c16 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, c02 + MUL b1, c06, c06 + MUL b1, c10, c10 + MUL b1, c14, c14 + + MUL b2, c02, t1 + MUL b2, c06, t2 + MUL b2, c10, t3 + MUL b2, c14, t4 + + SUB c03, t1, c03 + SUB c07, t2, c07 + SUB c11, t3, c11 + SUB c15, t4, c15 + + MUL b3, c02, t1 + MUL b3, c06, t2 + MUL b3, c10, t3 + MUL b3, c14, t4 + + SUB c04, t1, c04 + SUB c08, t2, c08 + SUB c12, t3, c12 + SUB c16, t4, c16 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, c03 + MUL a1, c07, c07 + MUL a1, c11, c11 + MUL a1, c15, c15 + + MUL a2, c03, t1 + MUL a2, c07, t2 + MUL a2, c11, t3 + MUL a2, c15, t4 + + SUB c04, t1, c04 + SUB c08, t2, c08 + SUB c12, t3, c12 + SUB c16, t4, c16 + + MUL a3, c04, c04 + MUL a3, c08, c08 + MUL a3, c12, c12 + MUL a3, c16, c16 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + MUL a2, c01, t1 + MUL a2, c02, t2 + MUL a2, c03, t3 + MUL a2, c04, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c03, t3 + MUL a3, c04, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a4, c01, t1 + MUL a4, c02, t2 + MUL a4, c03, t3 + MUL a4, c04, t4 + + SUB c13, t1, c13 + SUB c14, t2, c14 + SUB c15, t3, c15 + SUB c16, t4, c16 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, c05 + MUL b1, c06, c06 + MUL b1, c07, c07 + MUL b1, c08, c08 + + MUL b2, c05, t1 + MUL b2, c06, t2 + MUL b2, c07, t3 + MUL b2, c08, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL b3, c05, t1 + MUL b3, c06, t2 + MUL b3, c07, t3 + MUL b3, c08, t4 + + SUB c13, t1, c13 + SUB c14, t2, c14 + SUB c15, t3, c15 + SUB c16, t4, c16 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + MUL a2, c09, t1 + MUL a2, c10, t2 + MUL a2, c11, t3 + MUL a2, c12, t4 + + SUB c13, t1, c13 + SUB c14, t2, c14 + SUB c15, t3, c15 + SUB c16, t4, c16 + + MUL a3, c13, c13 + MUL a3, c14, c14 + MUL a3, c15, c15 + MUL a3, c16, c16 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, c13 + MUL a1, c14, c14 + MUL a1, c15, c15 + MUL a1, c16, c16 + + MUL a2, c13, t1 + MUL a2, c14, t2 + MUL a2, c15, t3 + MUL a2, c16, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a3, c13, t1 + MUL a3, c14, t2 + MUL a3, c15, t3 + MUL a3, c16, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL a4, c13, t1 + MUL a4, c14, t2 + MUL a4, c15, t3 + MUL a4, c16, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, c09 + MUL b1, c10, c10 + MUL b1, c11, c11 + MUL b1, c12, c12 + + MUL b2, c09, t1 + MUL b2, c10, t2 + MUL b2, c11, t3 + MUL b2, c12, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL b3, c09, t1 + MUL b3, c10, t2 + MUL b3, c11, t3 + MUL b3, c12, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + MUL a1, c07, c07 + MUL a1, c08, c08 + + MUL a2, c05, t1 + MUL a2, c06, t2 + MUL a2, c07, t3 + MUL a2, c08, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + MUL a3, c01, c01 + MUL a3, c02, c02 + MUL a3, c03, c03 + MUL a3, c04, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) + + ST c02, 4 * SIZE(BO) + ST c06, 5 * SIZE(BO) + ST c10, 6 * SIZE(BO) + ST c14, 7 * SIZE(BO) + + ST c03, 8 * SIZE(BO) + ST c07, 9 * SIZE(BO) + ST c11, 10 * SIZE(BO) + ST c15, 11 * SIZE(BO) + + ST c04, 12 * SIZE(BO) + ST c08, 13 * SIZE(BO) + ST c12, 14 * SIZE(BO) + ST c16, 15 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c05, 4 * SIZE(AO) + ST c06, 5 * SIZE(AO) + ST c07, 6 * SIZE(AO) + ST c08, 7 * SIZE(AO) + + ST c09, 8 * SIZE(AO) + ST c10, 9 * SIZE(AO) + ST c11, 10 * SIZE(AO) + ST c12, 11 * SIZE(AO) + + ST c13, 12 * SIZE(AO) + ST c14, 13 * SIZE(AO) + ST c15, 14 * SIZE(AO) + ST c16, 15 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) + lda C2, -4 * SIZE(C2) + lda C3, -4 * SIZE(C3) + lda C4, -4 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + ST c07, 2 * SIZE(C2) + ST c08, 3 * SIZE(C2) + + ST c09, 0 * SIZE(C3) + ST c10, 1 * SIZE(C3) + ST c11, 2 * SIZE(C3) + ST c12, 3 * SIZE(C3) + + ST c13, 0 * SIZE(C4) + ST c14, 1 * SIZE(C4) + ST c15, 2 * SIZE(C4) + ST c16, 3 * SIZE(C4) + +#ifndef LN + lda C1, 4 * SIZE(C1) + lda C2, 4 * SIZE(C2) + lda C3, 4 * SIZE(C3) + lda C4, 4 * SIZE(C4) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP1 + addq AO, TMP1, AO + addq BO, TMP1, BO +#endif + +#ifdef LT + addq KK, 4, KK +#endif + +#ifdef LN + subq KK, 4, KK +#endif + + lda I, -1(I) + + bgt I, $L11 + .align 4 + +$L39: +#ifdef LN + sll K, 2 + BASE_SHIFT, TMP1 + addq B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 4, KK +#endif + +#ifdef RT + subq KK, 4, KK +#endif + lda J, -1(J) + bgt J, $L01 + .align 4 + +$L40: + and N, 2, J + ble J, $L80 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + subq B, TMP1, B + + addq LDC, LDC, TMP1 + subq C, TMP1, C +#endif + + mov C, C1 + addq C, LDC, C2 + fclr t1 +#ifndef RT + addq C2, LDC, C +#endif + fclr t2 + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + fclr t3 + fclr t4 + + and M, 1, I + ble I, $L60 + +#if defined(LT) || defined(RN) + + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) + fclr c02 + LD b2, 1 * SIZE(B) + fclr c06 + + lda L, -2(KK) + + LD b3, 2 * SIZE(B) + lda AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(B) + lda BO, 2 * SIZE(B) + + ble KK, $L78 + + ble L, $L75 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + fclr c02 + LD b2, 1 * SIZE(BO) + fclr c06 + + lda L, -2(TMP1) + + LD b3, 2 * SIZE(BO) + lda AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(BO) + lda BO, 2 * SIZE(BO) + + ble TMP1, $L78 + + ble L, $L75 +#endif + .align 4 + +$L72: + ADD c01, t1, c01 + lda L, -2(L) + MUL a1, b1, t1 + LD b1, 2 * SIZE(BO) + + ADD c05, t2, c05 + MUL a1, b2, t2 + LD a1, 1 * SIZE(AO) + LD b2, 3 * SIZE(BO) + + ADD c02, t3, c02 + lda AO, 2 * SIZE(AO) + MUL a2, b3, t3 + LD b3, 4 * SIZE(BO) + + ADD c06, t4, c06 + MUL a2, b4, t4 + LD a2, 0 * SIZE(AO) + LD b4, 5 * SIZE(BO) + + lda BO, 4 * SIZE(BO) + unop + unop + bgt L, $L72 + .align 4 + +$L75: + ADD c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L77 +#else + blbs TMP1, $L77 +#endif + .align 4 + + ADD c05, t2, c05 + MUL a1, b2, t2 + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + ADD c01, t1, c01 + LD b2, 1 * SIZE(BO) + lda AO, 1 * SIZE(AO) + MUL a1, b1, t1 + lda BO, 2 * SIZE(BO) + .align 4 + +$L77: + ADD c05, t2, c05 + MUL a1, b2, t2 + ADD c02, t3, c02 + ADD c06, t4, c06 + + ADD c01, c02, c01 + lda AO, 1 * SIZE(AO) + ADD c05, c06, c05 + lda BO, 2 * SIZE(BO) + + ADD c01, t1, c01 + ADD c05, t2, c05 + + .align 4 + +$L78: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 1, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -1 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c05, c05 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c05, t1, c05 + MUL a3, c05, c05 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a2, c05, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c05, 1 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -1 * SIZE(C1) + lda C2, -1 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 0 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L60: + and M, 2, I + ble I, $L70 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + lda L, -2(KK) + LD b2, 1 * SIZE(B) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + lda BO, 2 * SIZE(B) + + ble KK, $L68 + + ble L, $L65 +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + lda L, -2(TMP1) + LD b2, 1 * SIZE(BO) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + lda BO, 2 * SIZE(BO) + + ble TMP1, $L68 + + ble L, $L65 +#endif + .align 4 + +$L62: + ADD c01, t1, c01 + unop + MUL a1, b1, t1 + unop + + ADD c02, t2, c02 + lda AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD b1, 2 * SIZE(BO) + + ADD c05, t3, c05 + lda L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD c01, t1, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + lda BO, 4 * SIZE(BO) + + ADD c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L62 + .align 4 + +$L65: + ADD c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L67 +#else + blbs TMP1, $L67 +#endif + .align 4 + + ADD c02, t2, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c05, t3, c05 + lda BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD c01, t1, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + lda AO, 2 * SIZE(AO) + .align 4 + +$L67: + ADD c02, t2, c02 + MUL a2, b1, t2 + ADD c05, t3, c05 + MUL a1, b2, t3 + + ADD c06, t4, c06 + lda AO, 2 * SIZE(AO) + MUL a2, b2, t4 + lda BO, 2 * SIZE(BO) + + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c05, t3, c05 + ADD c06, t4, c06 + .align 4 + +$L68: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -2 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c02, c02 + SUB a4, c06, c06 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c05, c05 + SUB a4, c06, c06 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + + MUL a2, c02, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + MUL a3, c01, c01 + MUL a3, c05, c05 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + + MUL a2, c01, t1 + MUL a2, c05, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL a3, c02, c02 + MUL a3, c06, c06 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + + MUL a2, c01, t1 + MUL a2, c02, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL a3, c05, c05 + MUL a3, c06, c06 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + + MUL a2, c05, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + MUL a3, c01, c01 + MUL a3, c02, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c02, 2 * SIZE(BO) + ST c06, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c05, 2 * SIZE(AO) + ST c06, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) + lda C2, -2 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + +#ifndef LN + lda C1, 2 * SIZE(C1) + lda C2, 2 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + .align 4 + +$L70: + sra M, 2, I + ble I, $L79 + .align 4 + +$L51: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + lda L, -2(KK) + + lda BO, 2 * SIZE(B) + lda AO, 4 * SIZE(AO) + + ble KK, $L58 + + ble L, $L55 +#else +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + lda L, -2(TMP1) + lda BO, 2 * SIZE(BO) + lda AO, 4 * SIZE(AO) + + ble TMP1, $L58 + + ble L, $L55 +#endif + .align 4 + +$L52: + ADD c05, t1, c05 + unop + MUL a1, b1, t1 + unop + + ADD c06, t2, c06 + lda L, -2(L) + MUL a2, b1, t2 + unop + + ADD c07, t3, c07 + unop + MUL a3, b1, t3 + unop + + ADD c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, c02 + lda BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + + ADD c05, t1, c05 + unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD c06, t2, c06 + unop + MUL a2, b3, t2 + unop + + ADD c07, t3, c07 + unop + MUL a3, b3, t3 + lda AO, 8 * SIZE(AO) + + ADD c08, t4, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L52 + .align 4 + +$L55: + ADD c05, t1, c05 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L57 +#else + blbs TMP1, $L57 +#endif + .align 4 + + ADD c06, t2, c06 + MUL a2, b1, t2 + ADD c07, t3, c07 + MUL a3, b1, t3 + + ADD c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + lda AO, 4 * SIZE(AO) + + ADD c05, t1, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 2 * SIZE(BO) + .align 4 + +$L57: + ADD c06, t2, c06 + MUL a2, b1, t2 + ADD c07, t3, c07 + MUL a3, b1, t3 + + ADD c08, t4, c08 + MUL a4, b1, t4 + ADD c01, t1, c01 + MUL a1, b2, t1 + + ADD c02, t2, c02 + MUL a2, b2, t2 + ADD c03, t3, c03 + MUL a3, b2, t3 + + ADD c04, t4, c04 + lda AO, 4 * SIZE(AO) + MUL a4, b2, t4 + lda BO, 2 * SIZE(BO) + + ADD c05, t1, c05 + ADD c06, t2, c06 + ADD c07, t3, c07 + ADD c08, t4, c08 + .align 4 + +$L58: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 4, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -4 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c02, c02 + SUB a4, c06, c06 + + SUB b1, c03, c03 + SUB b2, c07, c07 + SUB b3, c04, c04 + SUB b4, c08, c08 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 + + SUB b1, c05, c05 + SUB b2, c06, c06 + SUB b3, c07, c07 + SUB b4, c08, c08 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, c04 + MUL a1, c08, c08 + + MUL a2, c04, t1 + MUL a2, c08, t2 + + SUB c03, t1, c03 + SUB c07, t2, c07 + + MUL a3, c04, t1 + MUL a3, c08, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL a4, c04, t1 + MUL a4, c08, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, c03 + MUL b1, c07, c07 + + MUL b2, c03, t1 + MUL b2, c07, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL b3, c03, t1 + MUL b3, c07, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + + MUL a2, c02, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + MUL a3, c01, c01 + MUL a3, c05, c05 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + + MUL a2, c01, t1 + MUL a2, c05, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL a3, c01, t1 + MUL a3, c05, t2 + + SUB c03, t1, c03 + SUB c07, t2, c07 + + MUL a4, c01, t1 + MUL a4, c05, t2 + + SUB c04, t1, c04 + SUB c08, t2, c08 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, c02 + MUL b1, c06, c06 + + MUL b2, c02, t1 + MUL b2, c06, t2 + + SUB c03, t1, c03 + SUB c07, t2, c07 + + MUL b3, c02, t1 + MUL b3, c06, t2 + + SUB c04, t1, c04 + SUB c08, t2, c08 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, c03 + MUL a1, c07, c07 + + MUL a2, c03, t1 + MUL a2, c07, t2 + + SUB c04, t1, c04 + SUB c08, t2, c08 + + MUL a3, c04, c04 + MUL a3, c08, c08 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + MUL a2, c01, t1 + MUL a2, c02, t2 + MUL a2, c03, t3 + MUL a2, c04, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL a3, c05, c05 + MUL a3, c06, c06 + MUL a3, c07, c07 + MUL a3, c08, c08 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + MUL a1, c07, c07 + MUL a1, c08, c08 + + MUL a2, c05, t1 + MUL a2, c06, t2 + MUL a2, c07, t3 + MUL a2, c08, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + MUL a3, c01, c01 + MUL a3, c02, c02 + MUL a3, c03, c03 + MUL a3, c04, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c02, 2 * SIZE(BO) + ST c06, 3 * SIZE(BO) + + ST c03, 4 * SIZE(BO) + ST c07, 5 * SIZE(BO) + ST c04, 6 * SIZE(BO) + ST c08, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c05, 4 * SIZE(AO) + ST c06, 5 * SIZE(AO) + ST c07, 6 * SIZE(AO) + ST c08, 7 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) + lda C2, -4 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + ST c07, 2 * SIZE(C2) + ST c08, 3 * SIZE(C2) + +#ifndef LN + lda C1, 4 * SIZE(C1) + lda C2, 4 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 4, KK +#endif + +#ifdef LN + subq KK, 4, KK +#endif + + lda I, -1(I) + + bgt I, $L51 + .align 4 + +$L79: +#ifdef LN + sll K, 1 + BASE_SHIFT, TMP1 + addq B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 2, KK +#endif + +#ifdef RT + subq KK, 2, KK +#endif + .align 4 + +$L80: + and N, 1, J + ble J, $L999 + +#ifdef RT + sll K, BASE_SHIFT, TMP1 + subq B, TMP1, B + + subq C, LDC, C +#endif + + mov C, C1 +#ifndef RT + addq C, LDC, C +#endif + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + and M, 1, I + ble I, $L100 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + unop + ble L, $L115 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + unop + ble L, $L115 +#endif + .align 4 + +$L112: + ADD c01, t1, c01 + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c02, t2, c02 + MUL a2, b2, t2 + LD a2, 5 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c03, t3, c03 + MUL a3, b3, t3 + LD a3, 6 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c04, t4, c04 + MUL a4, b4, t4 + LD a4, 7 * SIZE(AO) + LD b4, 7 * SIZE(BO) + + lda L, -1(L) + lda AO, 4 * SIZE(AO) + lda BO, 4 * SIZE(BO) + bgt L, $L112 + .align 4 + +$L115: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + ble L, $L118 + .align 4 + +$L116: + ADD c01, t1, c01 + MUL a1, b1, t1 + LD a1, 1 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + lda L, -1(L) + lda AO, 1 * SIZE(AO) + lda BO, 1 * SIZE(BO) + bgt L, $L116 + .align 4 + +$L118: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + + ADD c01, c02, c01 + ADD c03, c04, c03 + ADD c01, c03, c01 + +#if defined(LN) || defined(RT) + subq KK, 1, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AORIG, TMP2, AO + addq B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + + SUB a1, c01, c01 +#else + LD a1, 0 * SIZE(AO) + + SUB a1, c01, c01 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, c01 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -1 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + +#ifndef LN + lda C1, 1 * SIZE(C1) +#endif + +#ifdef RT + SXADDQ K, AORIG, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L100: + and M, 2, I + ble I, $L110 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + ble L, $L105 +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + ble L, $L105 +#endif + .align 5 + +$L102: + ADD c01, t1, c01 + lda L, -1(L) + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c03, t3, c03 + lda BO, 4 * SIZE(BO) + MUL a3, b2, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a5, 7 * SIZE(AO) + LD b2, 1 * SIZE(BO) + + ADD c01, t1, c01 + MUL a1, b3, t1 + LD a1, 8 * SIZE(AO) + lda AO, 8 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L102 + .align 4 + +$L105: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + ble L, $L108 + .align 4 + +$L106: + ADD c01, t1, c01 + lda L, -1(L) + MUL a1, b1, t1 + LD a1, 2 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b1, t2 + LD a2, 3 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + lda AO, 2 * SIZE(AO) + unop + lda BO, 1 * SIZE(BO) + bgt L, $L106 + .align 4 + +$L108: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + + ADD c01, c03, c01 + ADD c02, c04, c02 + +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addq B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a2, c02, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c02, t1, c02 + MUL a3, c02, c02 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + +#ifndef LN + lda C1, 2 * SIZE(C1) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + .align 4 + +$L110: + sra M, 2, I + ble I, $L119 + .align 4 + +$L91: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + ble L, $L95 + +#else +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + unop + ble L, $L95 +#endif + .align 5 + +$L92: + ADD c01, t1, c01 + unop + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + lda L, -1(L) + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b1, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b1, t4 + LD a4, 7 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 8 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 9 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 10 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a4, 11 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + LD a1, 12 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD a2, 13 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b3, t3 + LD a3, 14 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b3, t4 + LD a5, 15 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c01, t1, c01 + MUL a1, b4, t1 + LD a1, 16 * SIZE(AO) + lda AO, 16 * SIZE(AO) + + ADD c02, t2, c02 + lda BO, 4 * SIZE(BO) + MUL a2, b4, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L92 + .align 4 + +$L95: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + unop + ble L, $L98 + .align 4 + +$L96: + ADD c01, t1, c01 + lda L, -1(L) + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + lda BO, 1 * SIZE(BO) + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b1, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b1, t4 + LD a4, 7 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + lda AO, 4 * SIZE(AO) + bgt L, $L96 + .align 4 + +$L98: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 4, TMP1 +#else + subq KK, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addq B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, c04 + MUL a2, c04, t1 + SUB c03, t1, c03 + MUL a3, c04, t1 + SUB c02, t1, c02 + MUL a4, c04, t1 + SUB c01, t1, c01 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, c03 + MUL b2, c03, t1 + SUB c02, t1, c02 + MUL b3, c03, t1 + SUB c01, t1, c01 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a2, c02, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c02, t1, c02 + MUL a3, c01, t1 + SUB c03, t1, c03 + MUL a4, c01, t1 + SUB c04, t1, c04 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, c02 + MUL b2, c02, t1 + SUB c03, t1, c03 + MUL b3, c02, t1 + SUB c04, t1, c04 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, c03 + MUL a2, c03, t1 + SUB c04, t1, c04 + MUL a3, c04, c04 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c03, 2 * SIZE(BO) + ST c04, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + +#ifndef LN + lda C1, 4 * SIZE(C1) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 4, KK +#endif + +#ifdef LN + subq KK, 4, KK +#endif + + lda I, -1(I) + bgt I, $L91 + .align 4 + +$L119: +#ifdef LN + SXADDQ K, B, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 1, KK +#endif + +#ifdef RT + subq KK, 1, KK +#endif + .align 4 + +$L999: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + ldt $f9, 56($sp) + clr $0 + lda $sp, STACKSIZE($sp) + ret + EPILOGUE diff --git a/kernel/alpha/trsm_kernel_4x4_LT.S b/kernel/alpha/trsm_kernel_4x4_LT.S new file mode 100644 index 0000000..2848d26 --- /dev/null +++ b/kernel/alpha/trsm_kernel_4x4_LT.S @@ -0,0 +1,4066 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#if !defined(EV4) && !defined(EV5) && !defined(EV6) +#error "Architecture is not specified." +#endif + +#ifdef EV6 +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + +#ifdef EV5 +#define PREFETCHSIZE 56 +#define UNOP +#endif + +#ifdef EV4 +#define UNOP +#endif + +#define STACKSIZE 80 + +#define M $16 +#define N $17 +#define K $18 +#define A $20 +#define B $21 +#define C $22 +#define LDC $23 + +#define C1 $19 +#define C2 $24 +#define C3 $25 +#define C4 $27 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define AORIG $3 +#define OFFSET $4 + + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + + lda $sp, -STACKSIZE($sp) + + ldq C, 0 + STACKSIZE($sp) + ldq LDC, 8 + STACKSIZE($sp) + ldq OFFSET, 16 + STACKSIZE($sp) + + SXADDQ LDC, 0, LDC + + stt $f2, 0($sp) + stt $f3, 8($sp) + stt $f4, 16($sp) + stt $f5, 24($sp) + stt $f6, 32($sp) + stt $f7, 40($sp) + stt $f8, 48($sp) + stt $f9, 56($sp) + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#ifdef LN + mulq M, K, TMP1 + SXADDQ TMP1, A, A + SXADDQ M, C, C +#endif + +#ifdef RN + negq OFFSET, KK +#endif + +#ifdef RT + mulq N, K, TMP1 + SXADDQ TMP1, B, B + + mulq N, LDC, TMP1 + addq TMP1, C, C + + subq N, OFFSET, KK +#endif + + sra N, 2, J + ble J, $L40 + .align 4 + +$L01: +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + subq B, TMP1, B + + s4addq LDC, 0, TMP1 + subq C, TMP1, C +#endif + + mov C, C1 + addq C, LDC, C2 + addq C2, LDC, C3 +#ifndef RT + s4addq LDC, C, C +#endif + + fclr t1 + addq C3, LDC, C4 + fclr t2 + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 2, I + fclr t3 + fclr t4 + ble I, $L20 + .align 4 + +$L11: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + + LD b3, 2 * SIZE(B) + fclr c06 + LD b4, 3 * SIZE(B) + fclr c05 + + lds $f31, 4 * SIZE(C1) + fclr c03 + lda L, -2(KK) + fclr c04 + + lds $f31, 7 * SIZE(C2) + fclr c08 + lda BO, 4 * SIZE(B) + fclr c13 + + lds $f31, 4 * SIZE(C3) + fclr c09 + lda AO, 4 * SIZE(AO) + fclr c10 + + lds $f31, 7 * SIZE(C4) + fclr c14 + fclr c07 + ble KK, $L18 +#else + +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addq AORIG, TMP1, AO + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + + LD b3, 2 * SIZE(BO) + fclr c06 + LD b4, 3 * SIZE(BO) + fclr c05 + + lds $f31, 4 * SIZE(C1) + fclr c03 + lda L, -2(TMP1) + fclr c04 + + lds $f31, 7 * SIZE(C2) + fclr c08 + lda BO, 4 * SIZE(BO) + fclr c13 + + lds $f31, 4 * SIZE(C3) + fclr c09 + lda AO, 4 * SIZE(AO) + fclr c10 + + lds $f31, 7 * SIZE(C4) + fclr c14 + fclr c07 + ble TMP1, $L18 +#endif + + ble L, $L15 + .align 5 + +$L12: +/* 1 */ + ADD c11, t1, c11 +#ifndef EV4 + ldq $31, PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + ldl $31, PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD c12, t2, c12 + unop + MUL b1, a2, t2 + unop + + ADD c16, t3, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD c15, t4, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + +/* 2 */ + ADD c01, t1, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD c02, t2, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD c03, t1, c03 + unop + MUL b3, a1, t1 + unop + + ADD c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD c09, t1, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD c07, t4, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD c11, t1, c11 + unop + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD c12, t2, c12 + lda L, -2(L) + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD c16, t3, c16 + unop + MUL b2, a2, t3 + unop + + ADD c15, t4, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD c01, t1, c01 + unop + MUL b5, a6, t1 + unop + + ADD c02, t2, c02 + unop + MUL b5, a4, t2 + unop + + ADD c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD c03, t1, c03 + lda AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD c04, t2, c04 + lda BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD c09, t1, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD c11, t1, c11 + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L17 +#else + blbs TMP1, $L17 +#endif + .align 4 + + ADD c12, t2, c12 + MUL b1, a2, t2 + ADD c16, t3, c16 + MUL b2, a2, t3 + + ADD c15, t4, c15 + MUL b2, a1, t4 + ADD c01, t1, c01 + MUL b1, a3, t1 + + ADD c02, t2, c02 + unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD c06, t3, c06 + MUL b2, a4, t3 + ADD c05, t4, c05 + MUL b4, a1, t4 + + ADD c03, t1, c03 + unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + + ADD c09, t1, c09 + unop + MUL b3, a3, t1 + lda AO, 4 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, c07 + unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + + ADD c11, t1, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L17: + ADD c12, t2, c12 + MUL b1, a2, t2 + ADD c16, t3, c16 + MUL b2, a2, t3 + + ADD c15, t4, c15 + MUL b2, a1, t4 + ADD c01, t1, c01 + MUL b1, a3, t1 + + ADD c02, t2, c02 + MUL b1, a4, t2 + ADD c06, t3, c06 + MUL b2, a4, t3 + + ADD c05, t4, c05 + MUL b4, a1, t4 + ADD c03, t1, c03 + MUL b3, a1, t1 + + ADD c04, t2, c04 + MUL b3, a2, t2 + ADD c08, t3, c08 + MUL b4, a2, t3 + + ADD c13, t4, c13 + MUL b2, a3, t4 + ADD c09, t1, c09 + MUL b3, a3, t1 + + ADD c10, t2, c10 + MUL b3, a4, t2 + ADD c14, t3, c14 + MUL b4, a4, t3 + + ADD c07, t4, c07 + lda AO, 4 * SIZE(AO) + MUL b4, a3, t4 + lda BO, 4 * SIZE(BO) + + ADD c11, t1, c11 + ADD c12, t2, c12 + ADD c16, t3, c16 + ADD c15, t4, c15 + .align 4 + +$L18: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 4, TMP1 +#else + subq KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO +#else + lda AO, -4 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 + + SUB b1, c02, c02 + SUB b2, c06, c06 + SUB b3, c10, c10 + SUB b4, c14, c14 + + LD a1, 8 * SIZE(BO) + LD a2, 9 * SIZE(BO) + LD a3, 10 * SIZE(BO) + LD a4, 11 * SIZE(BO) + + LD b1, 12 * SIZE(BO) + LD b2, 13 * SIZE(BO) + LD b3, 14 * SIZE(BO) + LD b4, 15 * SIZE(BO) + + SUB a1, c03, c03 + SUB a2, c07, c07 + SUB a3, c11, c11 + SUB a4, c15, c15 + + SUB b1, c04, c04 + SUB b2, c08, c08 + SUB b3, c12, c12 + SUB b4, c16, c16 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 + + SUB b1, c05, c05 + SUB b2, c06, c06 + SUB b3, c07, c07 + SUB b4, c08, c08 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + + LD b1, 12 * SIZE(AO) + LD b2, 13 * SIZE(AO) + LD b3, 14 * SIZE(AO) + LD b4, 15 * SIZE(AO) + + SUB a1, c09, c09 + SUB a2, c10, c10 + SUB a3, c11, c11 + SUB a4, c12, c12 + + SUB b1, c13, c13 + SUB b2, c14, c14 + SUB b3, c15, c15 + SUB b4, c16, c16 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, c04 + MUL a1, c08, c08 + MUL a1, c12, c12 + MUL a1, c16, c16 + + MUL a2, c04, t1 + MUL a2, c08, t2 + MUL a2, c12, t3 + MUL a2, c16, t4 + + SUB c03, t1, c03 + SUB c07, t2, c07 + SUB c11, t3, c11 + SUB c15, t4, c15 + + MUL a3, c04, t1 + MUL a3, c08, t2 + MUL a3, c12, t3 + MUL a3, c16, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL a4, c04, t1 + MUL a4, c08, t2 + MUL a4, c12, t3 + MUL a4, c16, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, c03 + MUL b1, c07, c07 + MUL b1, c11, c11 + MUL b1, c15, c15 + + MUL b2, c03, t1 + MUL b2, c07, t2 + MUL b2, c11, t3 + MUL b2, c15, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL b3, c03, t1 + MUL b3, c07, t2 + MUL b3, c11, t3 + MUL b3, c15, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + MUL a1, c10, c10 + MUL a1, c14, c14 + + MUL a2, c02, t1 + MUL a2, c06, t2 + MUL a2, c10, t3 + MUL a2, c14, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + MUL a3, c01, c01 + MUL a3, c05, c05 + MUL a3, c09, c09 + MUL a3, c13, c13 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + MUL a1, c09, c09 + MUL a1, c13, c13 + + MUL a2, c01, t1 + MUL a2, c05, t2 + MUL a2, c09, t3 + MUL a2, c13, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL a3, c01, t1 + MUL a3, c05, t2 + MUL a3, c09, t3 + MUL a3, c13, t4 + + SUB c03, t1, c03 + SUB c07, t2, c07 + SUB c11, t3, c11 + SUB c15, t4, c15 + + MUL a4, c01, t1 + MUL a4, c05, t2 + MUL a4, c09, t3 + MUL a4, c13, t4 + + SUB c04, t1, c04 + SUB c08, t2, c08 + SUB c12, t3, c12 + SUB c16, t4, c16 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, c02 + MUL b1, c06, c06 + MUL b1, c10, c10 + MUL b1, c14, c14 + + MUL b2, c02, t1 + MUL b2, c06, t2 + MUL b2, c10, t3 + MUL b2, c14, t4 + + SUB c03, t1, c03 + SUB c07, t2, c07 + SUB c11, t3, c11 + SUB c15, t4, c15 + + MUL b3, c02, t1 + MUL b3, c06, t2 + MUL b3, c10, t3 + MUL b3, c14, t4 + + SUB c04, t1, c04 + SUB c08, t2, c08 + SUB c12, t3, c12 + SUB c16, t4, c16 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, c03 + MUL a1, c07, c07 + MUL a1, c11, c11 + MUL a1, c15, c15 + + MUL a2, c03, t1 + MUL a2, c07, t2 + MUL a2, c11, t3 + MUL a2, c15, t4 + + SUB c04, t1, c04 + SUB c08, t2, c08 + SUB c12, t3, c12 + SUB c16, t4, c16 + + MUL a3, c04, c04 + MUL a3, c08, c08 + MUL a3, c12, c12 + MUL a3, c16, c16 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + MUL a2, c01, t1 + MUL a2, c02, t2 + MUL a2, c03, t3 + MUL a2, c04, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c03, t3 + MUL a3, c04, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a4, c01, t1 + MUL a4, c02, t2 + MUL a4, c03, t3 + MUL a4, c04, t4 + + SUB c13, t1, c13 + SUB c14, t2, c14 + SUB c15, t3, c15 + SUB c16, t4, c16 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, c05 + MUL b1, c06, c06 + MUL b1, c07, c07 + MUL b1, c08, c08 + + MUL b2, c05, t1 + MUL b2, c06, t2 + MUL b2, c07, t3 + MUL b2, c08, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL b3, c05, t1 + MUL b3, c06, t2 + MUL b3, c07, t3 + MUL b3, c08, t4 + + SUB c13, t1, c13 + SUB c14, t2, c14 + SUB c15, t3, c15 + SUB c16, t4, c16 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + MUL a2, c09, t1 + MUL a2, c10, t2 + MUL a2, c11, t3 + MUL a2, c12, t4 + + SUB c13, t1, c13 + SUB c14, t2, c14 + SUB c15, t3, c15 + SUB c16, t4, c16 + + MUL a3, c13, c13 + MUL a3, c14, c14 + MUL a3, c15, c15 + MUL a3, c16, c16 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, c13 + MUL a1, c14, c14 + MUL a1, c15, c15 + MUL a1, c16, c16 + + MUL a2, c13, t1 + MUL a2, c14, t2 + MUL a2, c15, t3 + MUL a2, c16, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a3, c13, t1 + MUL a3, c14, t2 + MUL a3, c15, t3 + MUL a3, c16, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL a4, c13, t1 + MUL a4, c14, t2 + MUL a4, c15, t3 + MUL a4, c16, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, c09 + MUL b1, c10, c10 + MUL b1, c11, c11 + MUL b1, c12, c12 + + MUL b2, c09, t1 + MUL b2, c10, t2 + MUL b2, c11, t3 + MUL b2, c12, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL b3, c09, t1 + MUL b3, c10, t2 + MUL b3, c11, t3 + MUL b3, c12, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + MUL a1, c07, c07 + MUL a1, c08, c08 + + MUL a2, c05, t1 + MUL a2, c06, t2 + MUL a2, c07, t3 + MUL a2, c08, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + MUL a3, c01, c01 + MUL a3, c02, c02 + MUL a3, c03, c03 + MUL a3, c04, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) + + ST c02, 4 * SIZE(BO) + ST c06, 5 * SIZE(BO) + ST c10, 6 * SIZE(BO) + ST c14, 7 * SIZE(BO) + + ST c03, 8 * SIZE(BO) + ST c07, 9 * SIZE(BO) + ST c11, 10 * SIZE(BO) + ST c15, 11 * SIZE(BO) + + ST c04, 12 * SIZE(BO) + ST c08, 13 * SIZE(BO) + ST c12, 14 * SIZE(BO) + ST c16, 15 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c05, 4 * SIZE(AO) + ST c06, 5 * SIZE(AO) + ST c07, 6 * SIZE(AO) + ST c08, 7 * SIZE(AO) + + ST c09, 8 * SIZE(AO) + ST c10, 9 * SIZE(AO) + ST c11, 10 * SIZE(AO) + ST c12, 11 * SIZE(AO) + + ST c13, 12 * SIZE(AO) + ST c14, 13 * SIZE(AO) + ST c15, 14 * SIZE(AO) + ST c16, 15 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) + lda C2, -4 * SIZE(C2) + lda C3, -4 * SIZE(C3) + lda C4, -4 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + ST c07, 2 * SIZE(C2) + ST c08, 3 * SIZE(C2) + + ST c09, 0 * SIZE(C3) + ST c10, 1 * SIZE(C3) + ST c11, 2 * SIZE(C3) + ST c12, 3 * SIZE(C3) + + ST c13, 0 * SIZE(C4) + ST c14, 1 * SIZE(C4) + ST c15, 2 * SIZE(C4) + ST c16, 3 * SIZE(C4) + +#ifndef LN + lda C1, 4 * SIZE(C1) + lda C2, 4 * SIZE(C2) + lda C3, 4 * SIZE(C3) + lda C4, 4 * SIZE(C4) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP1 + addq AO, TMP1, AO + addq BO, TMP1, BO +#endif + +#ifdef LT + addq KK, 4, KK +#endif + +#ifdef LN + subq KK, 4, KK +#endif + + lda I, -1(I) + + bgt I, $L11 + .align 4 + +$L20: + and M, 2, I + ble I, $L30 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(B) + lda L, -2(KK) + LD b2, 1 * SIZE(B) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c01 + LD b4, 3 * SIZE(B) + fclr c05 + + lda BO, 4 * SIZE(B) + fclr c02 + fclr c06 + ble KK, $L28 + + ble L, $L25 + +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(BO) + lda L, -2(TMP1) + LD b2, 1 * SIZE(BO) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c01 + LD b4, 3 * SIZE(BO) + fclr c05 + + lda BO, 4 * SIZE(BO) + fclr c02 + fclr c06 + ble TMP1, $L28 + + ble L, $L25 +#endif + .align 4 + +$L22: + ADD c09, t1, c09 + unop + MUL a1, b1, t1 + unop + + ADD c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a1, b2, t3 + lda BO, 8 * SIZE(BO) + + ADD c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + unop + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD c06, t4, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + + ADD c09, t1, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a3, b2, t3 + lda AO, 4 * SIZE(AO) + + ADD c14, t4, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD c01, t1, c01 + lda L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, c06 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD c09, t1, c09 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L27 +#else + blbs TMP1, $L27 +#endif + + ADD c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a1, b2, t3 + unop + + ADD c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + lda AO, 2 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD c09, t1, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L27: + ADD c10, t2, c10 + MUL a2, b1, t2 + ADD c13, t3, c13 + MUL a1, b2, t3 + + ADD c14, t4, c14 + MUL a2, b2, t4 + ADD c01, t1, c01 + MUL a1, b3, t1 + + ADD c02, t2, c02 + MUL a2, b3, t2 + ADD c05, t3, c05 + MUL a1, b4, t3 + + ADD c06, t4, c06 + lda AO, 2 * SIZE(AO) + MUL a2, b4, t4 + lda BO, 4 * SIZE(BO) + + ADD c09, t1, c09 + ADD c10, t2, c10 + ADD c13, t3, c13 + ADD c14, t4, c14 + .align 4 + +$L28: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO +#else + lda AO, -2 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 + + SUB b1, c02, c02 + SUB b2, c06, c06 + SUB b3, c10, c10 + SUB b4, c14, c14 + +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c05, c05 + SUB a4, c06, c06 + + SUB b1, c09, c09 + SUB b2, c10, c10 + SUB b3, c13, c13 + SUB b4, c14, c14 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + MUL a1, c10, c10 + MUL a1, c14, c14 + + MUL a2, c02, t1 + MUL a2, c06, t2 + MUL a2, c10, t3 + MUL a2, c14, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + MUL a3, c01, c01 + MUL a3, c05, c05 + MUL a3, c09, c09 + MUL a3, c13, c13 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + MUL a1, c09, c09 + MUL a1, c13, c13 + + MUL a2, c01, t1 + MUL a2, c05, t2 + MUL a2, c09, t3 + MUL a2, c13, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL a3, c02, c02 + MUL a3, c06, c06 + MUL a3, c10, c10 + MUL a3, c14, c14 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + + MUL a2, c01, t1 + MUL a2, c02, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL a3, c01, t1 + MUL a3, c02, t2 + + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL a4, c01, t1 + MUL a4, c02, t2 + + SUB c13, t1, c13 + SUB c14, t2, c14 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, c05 + MUL b1, c06, c06 + + MUL b2, c05, t1 + MUL b2, c06, t2 + + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL b3, c05, t1 + MUL b3, c06, t2 + + SUB c13, t1, c13 + SUB c14, t2, c14 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, c09 + MUL a1, c10, c10 + + MUL a2, c09, t1 + MUL a2, c10, t2 + + SUB c13, t1, c13 + SUB c14, t2, c14 + + MUL a3, c13, c13 + MUL a3, c14, c14 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, c13 + MUL a1, c14, c14 + + MUL a2, c13, t1 + MUL a2, c14, t2 + + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL a3, c13, t1 + MUL a3, c14, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL a4, c13, t1 + MUL a4, c14, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, c09 + MUL b1, c10, c10 + + MUL b2, c09, t1 + MUL b2, c10, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL b3, c09, t1 + MUL b3, c10, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + + MUL a2, c05, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + MUL a3, c01, c01 + MUL a3, c02, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) + + ST c02, 4 * SIZE(BO) + ST c06, 5 * SIZE(BO) + ST c10, 6 * SIZE(BO) + ST c14, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c05, 2 * SIZE(AO) + ST c06, 3 * SIZE(AO) + + ST c09, 4 * SIZE(AO) + ST c10, 5 * SIZE(AO) + ST c13, 6 * SIZE(AO) + ST c14, 7 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) + lda C2, -2 * SIZE(C2) + lda C3, -2 * SIZE(C3) + lda C4, -2 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + + ST c09, 0 * SIZE(C3) + ST c10, 1 * SIZE(C3) + ST c13, 0 * SIZE(C4) + ST c14, 1 * SIZE(C4) + +#ifndef LN + lda C1, 2 * SIZE(C1) + lda C2, 2 * SIZE(C2) + lda C3, 2 * SIZE(C3) + lda C4, 2 * SIZE(C4) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + .align 4 + +$L30: + and M, 1, I + ble I, $L39 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) + lda L, -2(KK) + LD b2, 1 * SIZE(B) + lda AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c09 + LD b4, 3 * SIZE(B) + fclr c13 + + lda BO, 4 * SIZE(B) + ble KK, $L38 + + ble L, $L35 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + lda L, -2(TMP1) + LD b2, 1 * SIZE(BO) + lda AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c09 + LD b4, 3 * SIZE(BO) + fclr c13 + + lda BO, 4 * SIZE(BO) + ble TMP1, $L38 + + ble L, $L35 +#endif + .align 4 + +$L32: + ADD c01, t1, c01 + lda L, -2(L) + MUL a1, b1, t1 + LD b1, 0 * SIZE(BO) + + ADD c05, t2, c05 + lda AO, 2 * SIZE(AO) + MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, c09 + LD b5, 3 * SIZE(BO) + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a1, b4, t4 + LD a1, -1 * SIZE(AO) + + ADD c01, t1, c01 + MUL a2, b1, t1 + LD b1, 4 * SIZE(BO) + lda BO, 8 * SIZE(BO) + + ADD c05, t2, c05 + MUL a2, b2, t2 + LD b2, -3 * SIZE(BO) + + ADD c09, t3, c09 + LD b4, -1 * SIZE(BO) + MUL a2, b3, t3 + LD b3, -2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a2, b5, t4 + LD a2, 0 * SIZE(AO) + bgt L, $L32 + .align 4 + +$L35: + ADD c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L37 +#else + blbs TMP1, $L37 +#endif + .align 4 + + ADD c05, t2, c05 + LD b1, 0 * SIZE(BO) + MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, c09 + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a1, b4, t4 + LD a1, 0 * SIZE(AO) + lda AO, 1 * SIZE(AO) + + ADD c01, t1, c01 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L37: + ADD c05, t2, c05 + MUL a1, b2, t2 + ADD c09, t3, c09 + MUL a1, b3, t3 + + ADD c13, t4, c13 + lda AO, 1 * SIZE(AO) + MUL a1, b4, t4 + lda BO, 4 * SIZE(BO) + + ADD c01, t1, c01 + ADD c05, t2, c05 + ADD c09, t3, c09 + ADD c13, t4, c13 + +$L38: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 1, TMP1 +#else + subq KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO +#else + lda AO, -1 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + MUL a1, c09, c09 + MUL a1, c13, c13 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c05, t1, c05 + MUL a3, c01, t1 + SUB c09, t1, c09 + MUL a4, c01, t1 + SUB c13, t1, c13 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, c05 + MUL b2, c05, t1 + SUB c09, t1, c09 + MUL b3, c05, t1 + SUB c13, t1, c13 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, c09 + MUL a2, c09, t1 + SUB c13, t1, c13 + MUL a3, c13, c13 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, c13 + MUL a2, c13, t1 + SUB c09, t1, c09 + MUL a3, c13, t1 + SUB c05, t1, c05 + MUL a4, c13, t1 + SUB c01, t1, c01 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, c09 + MUL b2, c09, t1 + SUB c05, t1, c05 + MUL b3, c09, t1 + SUB c01, t1, c01 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a2, c05, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c05, 1 * SIZE(AO) + ST c09, 2 * SIZE(AO) + ST c13, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -1 * SIZE(C1) + lda C2, -1 * SIZE(C2) + lda C3, -1 * SIZE(C3) + lda C4, -1 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c09, 0 * SIZE(C3) + ST c13, 0 * SIZE(C4) + +#ifdef RT + sll K, 0 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L39: +#ifdef LN + sll K, 2 + BASE_SHIFT, TMP1 + addq B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 4, KK +#endif + +#ifdef RT + subq KK, 4, KK +#endif + lda J, -1(J) + bgt J, $L01 + .align 4 + +$L40: + and N, 2, J + ble J, $L80 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + subq B, TMP1, B + + addq LDC, LDC, TMP1 + subq C, TMP1, C +#endif + + mov C, C1 + addq C, LDC, C2 + fclr t1 +#ifndef RT + addq C2, LDC, C +#endif + fclr t2 + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 2, I + fclr t3 + fclr t4 + ble I, $L60 + .align 4 + +$L51: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + lda L, -2(KK) + + lda BO, 2 * SIZE(B) + lda AO, 4 * SIZE(AO) + + ble KK, $L58 + + ble L, $L55 +#else +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + lda L, -2(TMP1) + lda BO, 2 * SIZE(BO) + lda AO, 4 * SIZE(AO) + + ble TMP1, $L58 + + ble L, $L55 +#endif + .align 4 + +$L52: + ADD c05, t1, c05 + unop + MUL a1, b1, t1 + unop + + ADD c06, t2, c06 + lda L, -2(L) + MUL a2, b1, t2 + unop + + ADD c07, t3, c07 + unop + MUL a3, b1, t3 + unop + + ADD c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, c02 + lda BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + + ADD c05, t1, c05 + unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD c06, t2, c06 + unop + MUL a2, b3, t2 + unop + + ADD c07, t3, c07 + unop + MUL a3, b3, t3 + lda AO, 8 * SIZE(AO) + + ADD c08, t4, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L52 + .align 4 + +$L55: + ADD c05, t1, c05 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L57 +#else + blbs TMP1, $L57 +#endif + .align 4 + + ADD c06, t2, c06 + MUL a2, b1, t2 + ADD c07, t3, c07 + MUL a3, b1, t3 + + ADD c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + lda AO, 4 * SIZE(AO) + + ADD c05, t1, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 2 * SIZE(BO) + .align 4 + +$L57: + ADD c06, t2, c06 + MUL a2, b1, t2 + ADD c07, t3, c07 + MUL a3, b1, t3 + + ADD c08, t4, c08 + MUL a4, b1, t4 + ADD c01, t1, c01 + MUL a1, b2, t1 + + ADD c02, t2, c02 + MUL a2, b2, t2 + ADD c03, t3, c03 + MUL a3, b2, t3 + + ADD c04, t4, c04 + lda AO, 4 * SIZE(AO) + MUL a4, b2, t4 + lda BO, 2 * SIZE(BO) + + ADD c05, t1, c05 + ADD c06, t2, c06 + ADD c07, t3, c07 + ADD c08, t4, c08 + .align 4 + +$L58: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 4, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -4 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c02, c02 + SUB a4, c06, c06 + + SUB b1, c03, c03 + SUB b2, c07, c07 + SUB b3, c04, c04 + SUB b4, c08, c08 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 + + SUB b1, c05, c05 + SUB b2, c06, c06 + SUB b3, c07, c07 + SUB b4, c08, c08 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, c04 + MUL a1, c08, c08 + + MUL a2, c04, t1 + MUL a2, c08, t2 + + SUB c03, t1, c03 + SUB c07, t2, c07 + + MUL a3, c04, t1 + MUL a3, c08, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL a4, c04, t1 + MUL a4, c08, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, c03 + MUL b1, c07, c07 + + MUL b2, c03, t1 + MUL b2, c07, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL b3, c03, t1 + MUL b3, c07, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + + MUL a2, c02, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + MUL a3, c01, c01 + MUL a3, c05, c05 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + + MUL a2, c01, t1 + MUL a2, c05, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL a3, c01, t1 + MUL a3, c05, t2 + + SUB c03, t1, c03 + SUB c07, t2, c07 + + MUL a4, c01, t1 + MUL a4, c05, t2 + + SUB c04, t1, c04 + SUB c08, t2, c08 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, c02 + MUL b1, c06, c06 + + MUL b2, c02, t1 + MUL b2, c06, t2 + + SUB c03, t1, c03 + SUB c07, t2, c07 + + MUL b3, c02, t1 + MUL b3, c06, t2 + + SUB c04, t1, c04 + SUB c08, t2, c08 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, c03 + MUL a1, c07, c07 + + MUL a2, c03, t1 + MUL a2, c07, t2 + + SUB c04, t1, c04 + SUB c08, t2, c08 + + MUL a3, c04, c04 + MUL a3, c08, c08 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + MUL a2, c01, t1 + MUL a2, c02, t2 + MUL a2, c03, t3 + MUL a2, c04, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL a3, c05, c05 + MUL a3, c06, c06 + MUL a3, c07, c07 + MUL a3, c08, c08 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + MUL a1, c07, c07 + MUL a1, c08, c08 + + MUL a2, c05, t1 + MUL a2, c06, t2 + MUL a2, c07, t3 + MUL a2, c08, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + MUL a3, c01, c01 + MUL a3, c02, c02 + MUL a3, c03, c03 + MUL a3, c04, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c02, 2 * SIZE(BO) + ST c06, 3 * SIZE(BO) + + ST c03, 4 * SIZE(BO) + ST c07, 5 * SIZE(BO) + ST c04, 6 * SIZE(BO) + ST c08, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c05, 4 * SIZE(AO) + ST c06, 5 * SIZE(AO) + ST c07, 6 * SIZE(AO) + ST c08, 7 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) + lda C2, -4 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + ST c07, 2 * SIZE(C2) + ST c08, 3 * SIZE(C2) + +#ifndef LN + lda C1, 4 * SIZE(C1) + lda C2, 4 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 4, KK +#endif + +#ifdef LN + subq KK, 4, KK +#endif + + lda I, -1(I) + + bgt I, $L51 + .align 4 + +$L60: + and M, 2, I + ble I, $L70 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + lda L, -2(KK) + LD b2, 1 * SIZE(B) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + lda BO, 2 * SIZE(B) + + ble KK, $L68 + + ble L, $L65 +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + lda L, -2(TMP1) + LD b2, 1 * SIZE(BO) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + lda BO, 2 * SIZE(BO) + + ble TMP1, $L68 + + ble L, $L65 +#endif + .align 4 + +$L62: + ADD c01, t1, c01 + unop + MUL a1, b1, t1 + unop + + ADD c02, t2, c02 + lda AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD b1, 2 * SIZE(BO) + + ADD c05, t3, c05 + lda L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD c01, t1, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + lda BO, 4 * SIZE(BO) + + ADD c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L62 + .align 4 + +$L65: + ADD c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L67 +#else + blbs TMP1, $L67 +#endif + .align 4 + + ADD c02, t2, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c05, t3, c05 + lda BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD c01, t1, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + lda AO, 2 * SIZE(AO) + .align 4 + +$L67: + ADD c02, t2, c02 + MUL a2, b1, t2 + ADD c05, t3, c05 + MUL a1, b2, t3 + + ADD c06, t4, c06 + lda AO, 2 * SIZE(AO) + MUL a2, b2, t4 + lda BO, 2 * SIZE(BO) + + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c05, t3, c05 + ADD c06, t4, c06 + .align 4 + +$L68: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -2 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c02, c02 + SUB a4, c06, c06 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c05, c05 + SUB a4, c06, c06 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + + MUL a2, c02, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + MUL a3, c01, c01 + MUL a3, c05, c05 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + + MUL a2, c01, t1 + MUL a2, c05, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL a3, c02, c02 + MUL a3, c06, c06 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + + MUL a2, c01, t1 + MUL a2, c02, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL a3, c05, c05 + MUL a3, c06, c06 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + + MUL a2, c05, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + MUL a3, c01, c01 + MUL a3, c02, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c02, 2 * SIZE(BO) + ST c06, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c05, 2 * SIZE(AO) + ST c06, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) + lda C2, -2 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + +#ifndef LN + lda C1, 2 * SIZE(C1) + lda C2, 2 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + .align 4 + +$L70: + and M, 1, I + ble I, $L79 + +#if defined(LT) || defined(RN) + + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) + fclr c02 + LD b2, 1 * SIZE(B) + fclr c06 + + lda L, -2(KK) + + LD b3, 2 * SIZE(B) + lda AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(B) + lda BO, 2 * SIZE(B) + + ble KK, $L78 + + ble L, $L75 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + fclr c02 + LD b2, 1 * SIZE(BO) + fclr c06 + + lda L, -2(TMP1) + + LD b3, 2 * SIZE(BO) + lda AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(BO) + lda BO, 2 * SIZE(BO) + + ble TMP1, $L78 + + ble L, $L75 +#endif + .align 4 + +$L72: + ADD c01, t1, c01 + lda L, -2(L) + MUL a1, b1, t1 + LD b1, 2 * SIZE(BO) + + ADD c05, t2, c05 + MUL a1, b2, t2 + LD a1, 1 * SIZE(AO) + LD b2, 3 * SIZE(BO) + + ADD c02, t3, c02 + lda AO, 2 * SIZE(AO) + MUL a2, b3, t3 + LD b3, 4 * SIZE(BO) + + ADD c06, t4, c06 + MUL a2, b4, t4 + LD a2, 0 * SIZE(AO) + LD b4, 5 * SIZE(BO) + + lda BO, 4 * SIZE(BO) + unop + unop + bgt L, $L72 + .align 4 + +$L75: + ADD c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L77 +#else + blbs TMP1, $L77 +#endif + .align 4 + + ADD c05, t2, c05 + MUL a1, b2, t2 + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + ADD c01, t1, c01 + LD b2, 1 * SIZE(BO) + lda AO, 1 * SIZE(AO) + MUL a1, b1, t1 + lda BO, 2 * SIZE(BO) + .align 4 + +$L77: + ADD c05, t2, c05 + MUL a1, b2, t2 + ADD c02, t3, c02 + ADD c06, t4, c06 + + ADD c01, c02, c01 + lda AO, 1 * SIZE(AO) + ADD c05, c06, c05 + lda BO, 2 * SIZE(BO) + + ADD c01, t1, c01 + ADD c05, t2, c05 + + .align 4 + +$L78: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 1, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -1 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c05, c05 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c05, t1, c05 + MUL a3, c05, c05 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a2, c05, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c05, 1 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -1 * SIZE(C1) + lda C2, -1 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 0 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L79: +#ifdef LN + sll K, 1 + BASE_SHIFT, TMP1 + addq B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 2, KK +#endif + +#ifdef RT + subq KK, 2, KK +#endif + .align 4 + +$L80: + and N, 1, J + ble J, $L999 + +#ifdef RT + sll K, BASE_SHIFT, TMP1 + subq B, TMP1, B + + subq C, LDC, C +#endif + + mov C, C1 +#ifndef RT + addq C, LDC, C +#endif + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 2, I + ble I, $L100 + .align 4 + +$L91: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + ble L, $L95 + +#else +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + unop + ble L, $L95 +#endif + .align 5 + +$L92: + ADD c01, t1, c01 + unop + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + lda L, -1(L) + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b1, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b1, t4 + LD a4, 7 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 8 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 9 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 10 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a4, 11 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + LD a1, 12 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD a2, 13 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b3, t3 + LD a3, 14 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b3, t4 + LD a5, 15 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c01, t1, c01 + MUL a1, b4, t1 + LD a1, 16 * SIZE(AO) + lda AO, 16 * SIZE(AO) + + ADD c02, t2, c02 + lda BO, 4 * SIZE(BO) + MUL a2, b4, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L92 + .align 4 + +$L95: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + unop + ble L, $L98 + .align 4 + +$L96: + ADD c01, t1, c01 + lda L, -1(L) + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + lda BO, 1 * SIZE(BO) + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b1, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b1, t4 + LD a4, 7 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + lda AO, 4 * SIZE(AO) + bgt L, $L96 + .align 4 + +$L98: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 4, TMP1 +#else + subq KK, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addq B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, c04 + MUL a2, c04, t1 + SUB c03, t1, c03 + MUL a3, c04, t1 + SUB c02, t1, c02 + MUL a4, c04, t1 + SUB c01, t1, c01 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, c03 + MUL b2, c03, t1 + SUB c02, t1, c02 + MUL b3, c03, t1 + SUB c01, t1, c01 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a2, c02, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c02, t1, c02 + MUL a3, c01, t1 + SUB c03, t1, c03 + MUL a4, c01, t1 + SUB c04, t1, c04 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, c02 + MUL b2, c02, t1 + SUB c03, t1, c03 + MUL b3, c02, t1 + SUB c04, t1, c04 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, c03 + MUL a2, c03, t1 + SUB c04, t1, c04 + MUL a3, c04, c04 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c03, 2 * SIZE(BO) + ST c04, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + +#ifndef LN + lda C1, 4 * SIZE(C1) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 4, KK +#endif + +#ifdef LN + subq KK, 4, KK +#endif + + lda I, -1(I) + bgt I, $L91 + .align 4 + +$L100: + and M, 2, I + ble I, $L110 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + ble L, $L105 +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + ble L, $L105 +#endif + .align 5 + +$L102: + ADD c01, t1, c01 + lda L, -1(L) + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c03, t3, c03 + lda BO, 4 * SIZE(BO) + MUL a3, b2, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a5, 7 * SIZE(AO) + LD b2, 1 * SIZE(BO) + + ADD c01, t1, c01 + MUL a1, b3, t1 + LD a1, 8 * SIZE(AO) + lda AO, 8 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L102 + .align 4 + +$L105: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + ble L, $L108 + .align 4 + +$L106: + ADD c01, t1, c01 + lda L, -1(L) + MUL a1, b1, t1 + LD a1, 2 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b1, t2 + LD a2, 3 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + lda AO, 2 * SIZE(AO) + unop + lda BO, 1 * SIZE(BO) + bgt L, $L106 + .align 4 + +$L108: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + + ADD c01, c03, c01 + ADD c02, c04, c02 + +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addq B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a2, c02, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c02, t1, c02 + MUL a3, c02, c02 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + +#ifndef LN + lda C1, 2 * SIZE(C1) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + .align 4 + +$L110: + and M, 1, I + ble I, $L119 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + unop + ble L, $L115 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + unop + ble L, $L115 +#endif + .align 4 + +$L112: + ADD c01, t1, c01 + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c02, t2, c02 + MUL a2, b2, t2 + LD a2, 5 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c03, t3, c03 + MUL a3, b3, t3 + LD a3, 6 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c04, t4, c04 + MUL a4, b4, t4 + LD a4, 7 * SIZE(AO) + LD b4, 7 * SIZE(BO) + + lda L, -1(L) + lda AO, 4 * SIZE(AO) + lda BO, 4 * SIZE(BO) + bgt L, $L112 + .align 4 + +$L115: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + ble L, $L118 + .align 4 + +$L116: + ADD c01, t1, c01 + MUL a1, b1, t1 + LD a1, 1 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + lda L, -1(L) + lda AO, 1 * SIZE(AO) + lda BO, 1 * SIZE(BO) + bgt L, $L116 + .align 4 + +$L118: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + + ADD c01, c02, c01 + ADD c03, c04, c03 + ADD c01, c03, c01 + +#if defined(LN) || defined(RT) + subq KK, 1, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AORIG, TMP2, AO + addq B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + + SUB a1, c01, c01 +#else + LD a1, 0 * SIZE(AO) + + SUB a1, c01, c01 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, c01 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -1 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + +#ifndef LN + lda C1, 1 * SIZE(C1) +#endif + +#ifdef RT + SXADDQ K, AORIG, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L119: +#ifdef LN + SXADDQ K, B, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 1, KK +#endif + +#ifdef RT + subq KK, 1, KK +#endif + .align 4 + +$L999: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + ldt $f9, 56($sp) + clr $0 + lda $sp, STACKSIZE($sp) + ret + EPILOGUE diff --git a/kernel/alpha/trsm_kernel_4x4_RT.S b/kernel/alpha/trsm_kernel_4x4_RT.S new file mode 100644 index 0000000..6d3d2e3 --- /dev/null +++ b/kernel/alpha/trsm_kernel_4x4_RT.S @@ -0,0 +1,4066 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#if !defined(EV4) && !defined(EV5) && !defined(EV6) +#error "Architecture is not specified." +#endif + +#ifdef EV6 +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + +#ifdef EV5 +#define PREFETCHSIZE 56 +#define UNOP +#endif + +#ifdef EV4 +#define UNOP +#endif + +#define STACKSIZE 80 + +#define M $16 +#define N $17 +#define K $18 +#define A $20 +#define B $21 +#define C $22 +#define LDC $23 + +#define C1 $19 +#define C2 $24 +#define C3 $25 +#define C4 $27 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define AORIG $3 +#define OFFSET $4 + + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + + lda $sp, -STACKSIZE($sp) + + ldq C, 0 + STACKSIZE($sp) + ldq LDC, 8 + STACKSIZE($sp) + ldq OFFSET, 16 + STACKSIZE($sp) + + SXADDQ LDC, 0, LDC + + stt $f2, 0($sp) + stt $f3, 8($sp) + stt $f4, 16($sp) + stt $f5, 24($sp) + stt $f6, 32($sp) + stt $f7, 40($sp) + stt $f8, 48($sp) + stt $f9, 56($sp) + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#ifdef LN + mulq M, K, TMP1 + SXADDQ TMP1, A, A + SXADDQ M, C, C +#endif + +#ifdef RN + negq OFFSET, KK +#endif + +#ifdef RT + mulq N, K, TMP1 + SXADDQ TMP1, B, B + + mulq N, LDC, TMP1 + addq TMP1, C, C + + subq N, OFFSET, KK +#endif + + and N, 1, J + ble J, $L40 + +#ifdef RT + sll K, BASE_SHIFT, TMP1 + subq B, TMP1, B + + subq C, LDC, C +#endif + + mov C, C1 +#ifndef RT + addq C, LDC, C +#endif + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 2, I + ble I, $L100 + .align 4 + +$L91: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + ble L, $L95 + +#else +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + unop + ble L, $L95 +#endif + .align 5 + +$L92: + ADD c01, t1, c01 + unop + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + lda L, -1(L) + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b1, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b1, t4 + LD a4, 7 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 8 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 9 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 10 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a4, 11 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + LD a1, 12 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD a2, 13 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b3, t3 + LD a3, 14 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b3, t4 + LD a5, 15 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c01, t1, c01 + MUL a1, b4, t1 + LD a1, 16 * SIZE(AO) + lda AO, 16 * SIZE(AO) + + ADD c02, t2, c02 + lda BO, 4 * SIZE(BO) + MUL a2, b4, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L92 + .align 4 + +$L95: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + unop + ble L, $L98 + .align 4 + +$L96: + ADD c01, t1, c01 + lda L, -1(L) + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + lda BO, 1 * SIZE(BO) + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b1, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b1, t4 + LD a4, 7 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + lda AO, 4 * SIZE(AO) + bgt L, $L96 + .align 4 + +$L98: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 4, TMP1 +#else + subq KK, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addq B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, c04 + MUL a2, c04, t1 + SUB c03, t1, c03 + MUL a3, c04, t1 + SUB c02, t1, c02 + MUL a4, c04, t1 + SUB c01, t1, c01 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, c03 + MUL b2, c03, t1 + SUB c02, t1, c02 + MUL b3, c03, t1 + SUB c01, t1, c01 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a2, c02, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c02, t1, c02 + MUL a3, c01, t1 + SUB c03, t1, c03 + MUL a4, c01, t1 + SUB c04, t1, c04 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, c02 + MUL b2, c02, t1 + SUB c03, t1, c03 + MUL b3, c02, t1 + SUB c04, t1, c04 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, c03 + MUL a2, c03, t1 + SUB c04, t1, c04 + MUL a3, c04, c04 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c03, 2 * SIZE(BO) + ST c04, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + +#ifndef LN + lda C1, 4 * SIZE(C1) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 4, KK +#endif + +#ifdef LN + subq KK, 4, KK +#endif + + lda I, -1(I) + bgt I, $L91 + .align 4 + +$L100: + and M, 2, I + ble I, $L110 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + ble L, $L105 +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + ble L, $L105 +#endif + .align 5 + +$L102: + ADD c01, t1, c01 + lda L, -1(L) + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c03, t3, c03 + lda BO, 4 * SIZE(BO) + MUL a3, b2, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a5, 7 * SIZE(AO) + LD b2, 1 * SIZE(BO) + + ADD c01, t1, c01 + MUL a1, b3, t1 + LD a1, 8 * SIZE(AO) + lda AO, 8 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L102 + .align 4 + +$L105: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + ble L, $L108 + .align 4 + +$L106: + ADD c01, t1, c01 + lda L, -1(L) + MUL a1, b1, t1 + LD a1, 2 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b1, t2 + LD a2, 3 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + lda AO, 2 * SIZE(AO) + unop + lda BO, 1 * SIZE(BO) + bgt L, $L106 + .align 4 + +$L108: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + + ADD c01, c03, c01 + ADD c02, c04, c02 + +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addq B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a2, c02, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c02, t1, c02 + MUL a3, c02, c02 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + +#ifndef LN + lda C1, 2 * SIZE(C1) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + .align 4 + +$L110: + and M, 1, I + ble I, $L119 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + unop + ble L, $L115 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + unop + ble L, $L115 +#endif + .align 4 + +$L112: + ADD c01, t1, c01 + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c02, t2, c02 + MUL a2, b2, t2 + LD a2, 5 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c03, t3, c03 + MUL a3, b3, t3 + LD a3, 6 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c04, t4, c04 + MUL a4, b4, t4 + LD a4, 7 * SIZE(AO) + LD b4, 7 * SIZE(BO) + + lda L, -1(L) + lda AO, 4 * SIZE(AO) + lda BO, 4 * SIZE(BO) + bgt L, $L112 + .align 4 + +$L115: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + ble L, $L118 + .align 4 + +$L116: + ADD c01, t1, c01 + MUL a1, b1, t1 + LD a1, 1 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + lda L, -1(L) + lda AO, 1 * SIZE(AO) + lda BO, 1 * SIZE(BO) + bgt L, $L116 + .align 4 + +$L118: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + + ADD c01, c02, c01 + ADD c03, c04, c03 + ADD c01, c03, c01 + +#if defined(LN) || defined(RT) + subq KK, 1, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AORIG, TMP2, AO + addq B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + + SUB a1, c01, c01 +#else + LD a1, 0 * SIZE(AO) + + SUB a1, c01, c01 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, c01 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -1 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + +#ifndef LN + lda C1, 1 * SIZE(C1) +#endif + +#ifdef RT + SXADDQ K, AORIG, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L119: +#ifdef LN + SXADDQ K, B, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 1, KK +#endif + +#ifdef RT + subq KK, 1, KK +#endif + .align 4 + +$L40: + and N, 2, J + ble J, $L80 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + subq B, TMP1, B + + addq LDC, LDC, TMP1 + subq C, TMP1, C +#endif + + mov C, C1 + addq C, LDC, C2 + fclr t1 +#ifndef RT + addq C2, LDC, C +#endif + fclr t2 + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 2, I + fclr t3 + fclr t4 + ble I, $L60 + .align 4 + +$L51: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + lda L, -2(KK) + + lda BO, 2 * SIZE(B) + lda AO, 4 * SIZE(AO) + + ble KK, $L58 + + ble L, $L55 +#else +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + lda L, -2(TMP1) + lda BO, 2 * SIZE(BO) + lda AO, 4 * SIZE(AO) + + ble TMP1, $L58 + + ble L, $L55 +#endif + .align 4 + +$L52: + ADD c05, t1, c05 + unop + MUL a1, b1, t1 + unop + + ADD c06, t2, c06 + lda L, -2(L) + MUL a2, b1, t2 + unop + + ADD c07, t3, c07 + unop + MUL a3, b1, t3 + unop + + ADD c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, c02 + lda BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + + ADD c05, t1, c05 + unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD c06, t2, c06 + unop + MUL a2, b3, t2 + unop + + ADD c07, t3, c07 + unop + MUL a3, b3, t3 + lda AO, 8 * SIZE(AO) + + ADD c08, t4, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L52 + .align 4 + +$L55: + ADD c05, t1, c05 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L57 +#else + blbs TMP1, $L57 +#endif + .align 4 + + ADD c06, t2, c06 + MUL a2, b1, t2 + ADD c07, t3, c07 + MUL a3, b1, t3 + + ADD c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + lda AO, 4 * SIZE(AO) + + ADD c05, t1, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 2 * SIZE(BO) + .align 4 + +$L57: + ADD c06, t2, c06 + MUL a2, b1, t2 + ADD c07, t3, c07 + MUL a3, b1, t3 + + ADD c08, t4, c08 + MUL a4, b1, t4 + ADD c01, t1, c01 + MUL a1, b2, t1 + + ADD c02, t2, c02 + MUL a2, b2, t2 + ADD c03, t3, c03 + MUL a3, b2, t3 + + ADD c04, t4, c04 + lda AO, 4 * SIZE(AO) + MUL a4, b2, t4 + lda BO, 2 * SIZE(BO) + + ADD c05, t1, c05 + ADD c06, t2, c06 + ADD c07, t3, c07 + ADD c08, t4, c08 + .align 4 + +$L58: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 4, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -4 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c02, c02 + SUB a4, c06, c06 + + SUB b1, c03, c03 + SUB b2, c07, c07 + SUB b3, c04, c04 + SUB b4, c08, c08 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 + + SUB b1, c05, c05 + SUB b2, c06, c06 + SUB b3, c07, c07 + SUB b4, c08, c08 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, c04 + MUL a1, c08, c08 + + MUL a2, c04, t1 + MUL a2, c08, t2 + + SUB c03, t1, c03 + SUB c07, t2, c07 + + MUL a3, c04, t1 + MUL a3, c08, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL a4, c04, t1 + MUL a4, c08, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, c03 + MUL b1, c07, c07 + + MUL b2, c03, t1 + MUL b2, c07, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL b3, c03, t1 + MUL b3, c07, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + + MUL a2, c02, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + MUL a3, c01, c01 + MUL a3, c05, c05 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + + MUL a2, c01, t1 + MUL a2, c05, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL a3, c01, t1 + MUL a3, c05, t2 + + SUB c03, t1, c03 + SUB c07, t2, c07 + + MUL a4, c01, t1 + MUL a4, c05, t2 + + SUB c04, t1, c04 + SUB c08, t2, c08 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, c02 + MUL b1, c06, c06 + + MUL b2, c02, t1 + MUL b2, c06, t2 + + SUB c03, t1, c03 + SUB c07, t2, c07 + + MUL b3, c02, t1 + MUL b3, c06, t2 + + SUB c04, t1, c04 + SUB c08, t2, c08 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, c03 + MUL a1, c07, c07 + + MUL a2, c03, t1 + MUL a2, c07, t2 + + SUB c04, t1, c04 + SUB c08, t2, c08 + + MUL a3, c04, c04 + MUL a3, c08, c08 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + MUL a2, c01, t1 + MUL a2, c02, t2 + MUL a2, c03, t3 + MUL a2, c04, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL a3, c05, c05 + MUL a3, c06, c06 + MUL a3, c07, c07 + MUL a3, c08, c08 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + MUL a1, c07, c07 + MUL a1, c08, c08 + + MUL a2, c05, t1 + MUL a2, c06, t2 + MUL a2, c07, t3 + MUL a2, c08, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + MUL a3, c01, c01 + MUL a3, c02, c02 + MUL a3, c03, c03 + MUL a3, c04, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c02, 2 * SIZE(BO) + ST c06, 3 * SIZE(BO) + + ST c03, 4 * SIZE(BO) + ST c07, 5 * SIZE(BO) + ST c04, 6 * SIZE(BO) + ST c08, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c05, 4 * SIZE(AO) + ST c06, 5 * SIZE(AO) + ST c07, 6 * SIZE(AO) + ST c08, 7 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) + lda C2, -4 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + ST c07, 2 * SIZE(C2) + ST c08, 3 * SIZE(C2) + +#ifndef LN + lda C1, 4 * SIZE(C1) + lda C2, 4 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 4, KK +#endif + +#ifdef LN + subq KK, 4, KK +#endif + + lda I, -1(I) + + bgt I, $L51 + .align 4 + +$L60: + and M, 2, I + ble I, $L70 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + lda L, -2(KK) + LD b2, 1 * SIZE(B) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + lda BO, 2 * SIZE(B) + + ble KK, $L68 + + ble L, $L65 +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + lda L, -2(TMP1) + LD b2, 1 * SIZE(BO) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + lda BO, 2 * SIZE(BO) + + ble TMP1, $L68 + + ble L, $L65 +#endif + .align 4 + +$L62: + ADD c01, t1, c01 + unop + MUL a1, b1, t1 + unop + + ADD c02, t2, c02 + lda AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD b1, 2 * SIZE(BO) + + ADD c05, t3, c05 + lda L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD c01, t1, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + lda BO, 4 * SIZE(BO) + + ADD c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L62 + .align 4 + +$L65: + ADD c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L67 +#else + blbs TMP1, $L67 +#endif + .align 4 + + ADD c02, t2, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c05, t3, c05 + lda BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD c01, t1, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + lda AO, 2 * SIZE(AO) + .align 4 + +$L67: + ADD c02, t2, c02 + MUL a2, b1, t2 + ADD c05, t3, c05 + MUL a1, b2, t3 + + ADD c06, t4, c06 + lda AO, 2 * SIZE(AO) + MUL a2, b2, t4 + lda BO, 2 * SIZE(BO) + + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c05, t3, c05 + ADD c06, t4, c06 + .align 4 + +$L68: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -2 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c02, c02 + SUB a4, c06, c06 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c05, c05 + SUB a4, c06, c06 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + + MUL a2, c02, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + MUL a3, c01, c01 + MUL a3, c05, c05 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + + MUL a2, c01, t1 + MUL a2, c05, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL a3, c02, c02 + MUL a3, c06, c06 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + + MUL a2, c01, t1 + MUL a2, c02, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL a3, c05, c05 + MUL a3, c06, c06 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + + MUL a2, c05, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + MUL a3, c01, c01 + MUL a3, c02, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c02, 2 * SIZE(BO) + ST c06, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c05, 2 * SIZE(AO) + ST c06, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) + lda C2, -2 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + +#ifndef LN + lda C1, 2 * SIZE(C1) + lda C2, 2 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + .align 4 + +$L70: + and M, 1, I + ble I, $L79 + +#if defined(LT) || defined(RN) + + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) + fclr c02 + LD b2, 1 * SIZE(B) + fclr c06 + + lda L, -2(KK) + + LD b3, 2 * SIZE(B) + lda AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(B) + lda BO, 2 * SIZE(B) + + ble KK, $L78 + + ble L, $L75 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + fclr c02 + LD b2, 1 * SIZE(BO) + fclr c06 + + lda L, -2(TMP1) + + LD b3, 2 * SIZE(BO) + lda AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(BO) + lda BO, 2 * SIZE(BO) + + ble TMP1, $L78 + + ble L, $L75 +#endif + .align 4 + +$L72: + ADD c01, t1, c01 + lda L, -2(L) + MUL a1, b1, t1 + LD b1, 2 * SIZE(BO) + + ADD c05, t2, c05 + MUL a1, b2, t2 + LD a1, 1 * SIZE(AO) + LD b2, 3 * SIZE(BO) + + ADD c02, t3, c02 + lda AO, 2 * SIZE(AO) + MUL a2, b3, t3 + LD b3, 4 * SIZE(BO) + + ADD c06, t4, c06 + MUL a2, b4, t4 + LD a2, 0 * SIZE(AO) + LD b4, 5 * SIZE(BO) + + lda BO, 4 * SIZE(BO) + unop + unop + bgt L, $L72 + .align 4 + +$L75: + ADD c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L77 +#else + blbs TMP1, $L77 +#endif + .align 4 + + ADD c05, t2, c05 + MUL a1, b2, t2 + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + ADD c01, t1, c01 + LD b2, 1 * SIZE(BO) + lda AO, 1 * SIZE(AO) + MUL a1, b1, t1 + lda BO, 2 * SIZE(BO) + .align 4 + +$L77: + ADD c05, t2, c05 + MUL a1, b2, t2 + ADD c02, t3, c02 + ADD c06, t4, c06 + + ADD c01, c02, c01 + lda AO, 1 * SIZE(AO) + ADD c05, c06, c05 + lda BO, 2 * SIZE(BO) + + ADD c01, t1, c01 + ADD c05, t2, c05 + + .align 4 + +$L78: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 1, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -1 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c05, c05 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c05, t1, c05 + MUL a3, c05, c05 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a2, c05, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c05, 1 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -1 * SIZE(C1) + lda C2, -1 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 0 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L79: +#ifdef LN + sll K, 1 + BASE_SHIFT, TMP1 + addq B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 2, KK +#endif + +#ifdef RT + subq KK, 2, KK +#endif + .align 4 + +$L80: + sra N, 2, J + ble J, $L999 + .align 4 + +$L01: +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + subq B, TMP1, B + + s4addq LDC, 0, TMP1 + subq C, TMP1, C +#endif + + mov C, C1 + addq C, LDC, C2 + addq C2, LDC, C3 +#ifndef RT + s4addq LDC, C, C +#endif + + fclr t1 + addq C3, LDC, C4 + fclr t2 + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 2, I + fclr t3 + fclr t4 + ble I, $L20 + .align 4 + +$L11: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + + LD b3, 2 * SIZE(B) + fclr c06 + LD b4, 3 * SIZE(B) + fclr c05 + + lds $f31, 4 * SIZE(C1) + fclr c03 + lda L, -2(KK) + fclr c04 + + lds $f31, 7 * SIZE(C2) + fclr c08 + lda BO, 4 * SIZE(B) + fclr c13 + + lds $f31, 4 * SIZE(C3) + fclr c09 + lda AO, 4 * SIZE(AO) + fclr c10 + + lds $f31, 7 * SIZE(C4) + fclr c14 + fclr c07 + ble KK, $L18 +#else + +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addq AORIG, TMP1, AO + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + + LD b3, 2 * SIZE(BO) + fclr c06 + LD b4, 3 * SIZE(BO) + fclr c05 + + lds $f31, 4 * SIZE(C1) + fclr c03 + lda L, -2(TMP1) + fclr c04 + + lds $f31, 7 * SIZE(C2) + fclr c08 + lda BO, 4 * SIZE(BO) + fclr c13 + + lds $f31, 4 * SIZE(C3) + fclr c09 + lda AO, 4 * SIZE(AO) + fclr c10 + + lds $f31, 7 * SIZE(C4) + fclr c14 + fclr c07 + ble TMP1, $L18 +#endif + + ble L, $L15 + .align 5 + +$L12: +/* 1 */ + ADD c11, t1, c11 +#ifndef EV4 + ldq $31, PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + ldl $31, PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD c12, t2, c12 + unop + MUL b1, a2, t2 + unop + + ADD c16, t3, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD c15, t4, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + +/* 2 */ + ADD c01, t1, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD c02, t2, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD c03, t1, c03 + unop + MUL b3, a1, t1 + unop + + ADD c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD c09, t1, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD c07, t4, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD c11, t1, c11 + unop + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD c12, t2, c12 + lda L, -2(L) + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD c16, t3, c16 + unop + MUL b2, a2, t3 + unop + + ADD c15, t4, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD c01, t1, c01 + unop + MUL b5, a6, t1 + unop + + ADD c02, t2, c02 + unop + MUL b5, a4, t2 + unop + + ADD c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD c03, t1, c03 + lda AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD c04, t2, c04 + lda BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD c09, t1, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD c11, t1, c11 + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L17 +#else + blbs TMP1, $L17 +#endif + .align 4 + + ADD c12, t2, c12 + MUL b1, a2, t2 + ADD c16, t3, c16 + MUL b2, a2, t3 + + ADD c15, t4, c15 + MUL b2, a1, t4 + ADD c01, t1, c01 + MUL b1, a3, t1 + + ADD c02, t2, c02 + unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD c06, t3, c06 + MUL b2, a4, t3 + ADD c05, t4, c05 + MUL b4, a1, t4 + + ADD c03, t1, c03 + unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + + ADD c09, t1, c09 + unop + MUL b3, a3, t1 + lda AO, 4 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, c07 + unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + + ADD c11, t1, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L17: + ADD c12, t2, c12 + MUL b1, a2, t2 + ADD c16, t3, c16 + MUL b2, a2, t3 + + ADD c15, t4, c15 + MUL b2, a1, t4 + ADD c01, t1, c01 + MUL b1, a3, t1 + + ADD c02, t2, c02 + MUL b1, a4, t2 + ADD c06, t3, c06 + MUL b2, a4, t3 + + ADD c05, t4, c05 + MUL b4, a1, t4 + ADD c03, t1, c03 + MUL b3, a1, t1 + + ADD c04, t2, c04 + MUL b3, a2, t2 + ADD c08, t3, c08 + MUL b4, a2, t3 + + ADD c13, t4, c13 + MUL b2, a3, t4 + ADD c09, t1, c09 + MUL b3, a3, t1 + + ADD c10, t2, c10 + MUL b3, a4, t2 + ADD c14, t3, c14 + MUL b4, a4, t3 + + ADD c07, t4, c07 + lda AO, 4 * SIZE(AO) + MUL b4, a3, t4 + lda BO, 4 * SIZE(BO) + + ADD c11, t1, c11 + ADD c12, t2, c12 + ADD c16, t3, c16 + ADD c15, t4, c15 + .align 4 + +$L18: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 4, TMP1 +#else + subq KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO +#else + lda AO, -4 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 + + SUB b1, c02, c02 + SUB b2, c06, c06 + SUB b3, c10, c10 + SUB b4, c14, c14 + + LD a1, 8 * SIZE(BO) + LD a2, 9 * SIZE(BO) + LD a3, 10 * SIZE(BO) + LD a4, 11 * SIZE(BO) + + LD b1, 12 * SIZE(BO) + LD b2, 13 * SIZE(BO) + LD b3, 14 * SIZE(BO) + LD b4, 15 * SIZE(BO) + + SUB a1, c03, c03 + SUB a2, c07, c07 + SUB a3, c11, c11 + SUB a4, c15, c15 + + SUB b1, c04, c04 + SUB b2, c08, c08 + SUB b3, c12, c12 + SUB b4, c16, c16 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 + + SUB b1, c05, c05 + SUB b2, c06, c06 + SUB b3, c07, c07 + SUB b4, c08, c08 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + + LD b1, 12 * SIZE(AO) + LD b2, 13 * SIZE(AO) + LD b3, 14 * SIZE(AO) + LD b4, 15 * SIZE(AO) + + SUB a1, c09, c09 + SUB a2, c10, c10 + SUB a3, c11, c11 + SUB a4, c12, c12 + + SUB b1, c13, c13 + SUB b2, c14, c14 + SUB b3, c15, c15 + SUB b4, c16, c16 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, c04 + MUL a1, c08, c08 + MUL a1, c12, c12 + MUL a1, c16, c16 + + MUL a2, c04, t1 + MUL a2, c08, t2 + MUL a2, c12, t3 + MUL a2, c16, t4 + + SUB c03, t1, c03 + SUB c07, t2, c07 + SUB c11, t3, c11 + SUB c15, t4, c15 + + MUL a3, c04, t1 + MUL a3, c08, t2 + MUL a3, c12, t3 + MUL a3, c16, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL a4, c04, t1 + MUL a4, c08, t2 + MUL a4, c12, t3 + MUL a4, c16, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, c03 + MUL b1, c07, c07 + MUL b1, c11, c11 + MUL b1, c15, c15 + + MUL b2, c03, t1 + MUL b2, c07, t2 + MUL b2, c11, t3 + MUL b2, c15, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL b3, c03, t1 + MUL b3, c07, t2 + MUL b3, c11, t3 + MUL b3, c15, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + MUL a1, c10, c10 + MUL a1, c14, c14 + + MUL a2, c02, t1 + MUL a2, c06, t2 + MUL a2, c10, t3 + MUL a2, c14, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + MUL a3, c01, c01 + MUL a3, c05, c05 + MUL a3, c09, c09 + MUL a3, c13, c13 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + MUL a1, c09, c09 + MUL a1, c13, c13 + + MUL a2, c01, t1 + MUL a2, c05, t2 + MUL a2, c09, t3 + MUL a2, c13, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL a3, c01, t1 + MUL a3, c05, t2 + MUL a3, c09, t3 + MUL a3, c13, t4 + + SUB c03, t1, c03 + SUB c07, t2, c07 + SUB c11, t3, c11 + SUB c15, t4, c15 + + MUL a4, c01, t1 + MUL a4, c05, t2 + MUL a4, c09, t3 + MUL a4, c13, t4 + + SUB c04, t1, c04 + SUB c08, t2, c08 + SUB c12, t3, c12 + SUB c16, t4, c16 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, c02 + MUL b1, c06, c06 + MUL b1, c10, c10 + MUL b1, c14, c14 + + MUL b2, c02, t1 + MUL b2, c06, t2 + MUL b2, c10, t3 + MUL b2, c14, t4 + + SUB c03, t1, c03 + SUB c07, t2, c07 + SUB c11, t3, c11 + SUB c15, t4, c15 + + MUL b3, c02, t1 + MUL b3, c06, t2 + MUL b3, c10, t3 + MUL b3, c14, t4 + + SUB c04, t1, c04 + SUB c08, t2, c08 + SUB c12, t3, c12 + SUB c16, t4, c16 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, c03 + MUL a1, c07, c07 + MUL a1, c11, c11 + MUL a1, c15, c15 + + MUL a2, c03, t1 + MUL a2, c07, t2 + MUL a2, c11, t3 + MUL a2, c15, t4 + + SUB c04, t1, c04 + SUB c08, t2, c08 + SUB c12, t3, c12 + SUB c16, t4, c16 + + MUL a3, c04, c04 + MUL a3, c08, c08 + MUL a3, c12, c12 + MUL a3, c16, c16 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + MUL a2, c01, t1 + MUL a2, c02, t2 + MUL a2, c03, t3 + MUL a2, c04, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c03, t3 + MUL a3, c04, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a4, c01, t1 + MUL a4, c02, t2 + MUL a4, c03, t3 + MUL a4, c04, t4 + + SUB c13, t1, c13 + SUB c14, t2, c14 + SUB c15, t3, c15 + SUB c16, t4, c16 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, c05 + MUL b1, c06, c06 + MUL b1, c07, c07 + MUL b1, c08, c08 + + MUL b2, c05, t1 + MUL b2, c06, t2 + MUL b2, c07, t3 + MUL b2, c08, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL b3, c05, t1 + MUL b3, c06, t2 + MUL b3, c07, t3 + MUL b3, c08, t4 + + SUB c13, t1, c13 + SUB c14, t2, c14 + SUB c15, t3, c15 + SUB c16, t4, c16 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + MUL a2, c09, t1 + MUL a2, c10, t2 + MUL a2, c11, t3 + MUL a2, c12, t4 + + SUB c13, t1, c13 + SUB c14, t2, c14 + SUB c15, t3, c15 + SUB c16, t4, c16 + + MUL a3, c13, c13 + MUL a3, c14, c14 + MUL a3, c15, c15 + MUL a3, c16, c16 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, c13 + MUL a1, c14, c14 + MUL a1, c15, c15 + MUL a1, c16, c16 + + MUL a2, c13, t1 + MUL a2, c14, t2 + MUL a2, c15, t3 + MUL a2, c16, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a3, c13, t1 + MUL a3, c14, t2 + MUL a3, c15, t3 + MUL a3, c16, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL a4, c13, t1 + MUL a4, c14, t2 + MUL a4, c15, t3 + MUL a4, c16, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, c09 + MUL b1, c10, c10 + MUL b1, c11, c11 + MUL b1, c12, c12 + + MUL b2, c09, t1 + MUL b2, c10, t2 + MUL b2, c11, t3 + MUL b2, c12, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL b3, c09, t1 + MUL b3, c10, t2 + MUL b3, c11, t3 + MUL b3, c12, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + MUL a1, c07, c07 + MUL a1, c08, c08 + + MUL a2, c05, t1 + MUL a2, c06, t2 + MUL a2, c07, t3 + MUL a2, c08, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + MUL a3, c01, c01 + MUL a3, c02, c02 + MUL a3, c03, c03 + MUL a3, c04, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) + + ST c02, 4 * SIZE(BO) + ST c06, 5 * SIZE(BO) + ST c10, 6 * SIZE(BO) + ST c14, 7 * SIZE(BO) + + ST c03, 8 * SIZE(BO) + ST c07, 9 * SIZE(BO) + ST c11, 10 * SIZE(BO) + ST c15, 11 * SIZE(BO) + + ST c04, 12 * SIZE(BO) + ST c08, 13 * SIZE(BO) + ST c12, 14 * SIZE(BO) + ST c16, 15 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c05, 4 * SIZE(AO) + ST c06, 5 * SIZE(AO) + ST c07, 6 * SIZE(AO) + ST c08, 7 * SIZE(AO) + + ST c09, 8 * SIZE(AO) + ST c10, 9 * SIZE(AO) + ST c11, 10 * SIZE(AO) + ST c12, 11 * SIZE(AO) + + ST c13, 12 * SIZE(AO) + ST c14, 13 * SIZE(AO) + ST c15, 14 * SIZE(AO) + ST c16, 15 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) + lda C2, -4 * SIZE(C2) + lda C3, -4 * SIZE(C3) + lda C4, -4 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + ST c07, 2 * SIZE(C2) + ST c08, 3 * SIZE(C2) + + ST c09, 0 * SIZE(C3) + ST c10, 1 * SIZE(C3) + ST c11, 2 * SIZE(C3) + ST c12, 3 * SIZE(C3) + + ST c13, 0 * SIZE(C4) + ST c14, 1 * SIZE(C4) + ST c15, 2 * SIZE(C4) + ST c16, 3 * SIZE(C4) + +#ifndef LN + lda C1, 4 * SIZE(C1) + lda C2, 4 * SIZE(C2) + lda C3, 4 * SIZE(C3) + lda C4, 4 * SIZE(C4) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP1 + addq AO, TMP1, AO + addq BO, TMP1, BO +#endif + +#ifdef LT + addq KK, 4, KK +#endif + +#ifdef LN + subq KK, 4, KK +#endif + + lda I, -1(I) + + bgt I, $L11 + .align 4 + +$L20: + and M, 2, I + ble I, $L30 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(B) + lda L, -2(KK) + LD b2, 1 * SIZE(B) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c01 + LD b4, 3 * SIZE(B) + fclr c05 + + lda BO, 4 * SIZE(B) + fclr c02 + fclr c06 + ble KK, $L28 + + ble L, $L25 + +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(BO) + lda L, -2(TMP1) + LD b2, 1 * SIZE(BO) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c01 + LD b4, 3 * SIZE(BO) + fclr c05 + + lda BO, 4 * SIZE(BO) + fclr c02 + fclr c06 + ble TMP1, $L28 + + ble L, $L25 +#endif + .align 4 + +$L22: + ADD c09, t1, c09 + unop + MUL a1, b1, t1 + unop + + ADD c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a1, b2, t3 + lda BO, 8 * SIZE(BO) + + ADD c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + unop + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD c06, t4, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + + ADD c09, t1, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a3, b2, t3 + lda AO, 4 * SIZE(AO) + + ADD c14, t4, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD c01, t1, c01 + lda L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, c06 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD c09, t1, c09 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L27 +#else + blbs TMP1, $L27 +#endif + + ADD c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a1, b2, t3 + unop + + ADD c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + lda AO, 2 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD c09, t1, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L27: + ADD c10, t2, c10 + MUL a2, b1, t2 + ADD c13, t3, c13 + MUL a1, b2, t3 + + ADD c14, t4, c14 + MUL a2, b2, t4 + ADD c01, t1, c01 + MUL a1, b3, t1 + + ADD c02, t2, c02 + MUL a2, b3, t2 + ADD c05, t3, c05 + MUL a1, b4, t3 + + ADD c06, t4, c06 + lda AO, 2 * SIZE(AO) + MUL a2, b4, t4 + lda BO, 4 * SIZE(BO) + + ADD c09, t1, c09 + ADD c10, t2, c10 + ADD c13, t3, c13 + ADD c14, t4, c14 + .align 4 + +$L28: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO +#else + lda AO, -2 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 + + SUB b1, c02, c02 + SUB b2, c06, c06 + SUB b3, c10, c10 + SUB b4, c14, c14 + +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c05, c05 + SUB a4, c06, c06 + + SUB b1, c09, c09 + SUB b2, c10, c10 + SUB b3, c13, c13 + SUB b4, c14, c14 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + MUL a1, c10, c10 + MUL a1, c14, c14 + + MUL a2, c02, t1 + MUL a2, c06, t2 + MUL a2, c10, t3 + MUL a2, c14, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + MUL a3, c01, c01 + MUL a3, c05, c05 + MUL a3, c09, c09 + MUL a3, c13, c13 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + MUL a1, c09, c09 + MUL a1, c13, c13 + + MUL a2, c01, t1 + MUL a2, c05, t2 + MUL a2, c09, t3 + MUL a2, c13, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL a3, c02, c02 + MUL a3, c06, c06 + MUL a3, c10, c10 + MUL a3, c14, c14 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + + MUL a2, c01, t1 + MUL a2, c02, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL a3, c01, t1 + MUL a3, c02, t2 + + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL a4, c01, t1 + MUL a4, c02, t2 + + SUB c13, t1, c13 + SUB c14, t2, c14 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, c05 + MUL b1, c06, c06 + + MUL b2, c05, t1 + MUL b2, c06, t2 + + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL b3, c05, t1 + MUL b3, c06, t2 + + SUB c13, t1, c13 + SUB c14, t2, c14 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, c09 + MUL a1, c10, c10 + + MUL a2, c09, t1 + MUL a2, c10, t2 + + SUB c13, t1, c13 + SUB c14, t2, c14 + + MUL a3, c13, c13 + MUL a3, c14, c14 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, c13 + MUL a1, c14, c14 + + MUL a2, c13, t1 + MUL a2, c14, t2 + + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL a3, c13, t1 + MUL a3, c14, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL a4, c13, t1 + MUL a4, c14, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, c09 + MUL b1, c10, c10 + + MUL b2, c09, t1 + MUL b2, c10, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL b3, c09, t1 + MUL b3, c10, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + + MUL a2, c05, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + MUL a3, c01, c01 + MUL a3, c02, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) + + ST c02, 4 * SIZE(BO) + ST c06, 5 * SIZE(BO) + ST c10, 6 * SIZE(BO) + ST c14, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c05, 2 * SIZE(AO) + ST c06, 3 * SIZE(AO) + + ST c09, 4 * SIZE(AO) + ST c10, 5 * SIZE(AO) + ST c13, 6 * SIZE(AO) + ST c14, 7 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) + lda C2, -2 * SIZE(C2) + lda C3, -2 * SIZE(C3) + lda C4, -2 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + + ST c09, 0 * SIZE(C3) + ST c10, 1 * SIZE(C3) + ST c13, 0 * SIZE(C4) + ST c14, 1 * SIZE(C4) + +#ifndef LN + lda C1, 2 * SIZE(C1) + lda C2, 2 * SIZE(C2) + lda C3, 2 * SIZE(C3) + lda C4, 2 * SIZE(C4) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + .align 4 + +$L30: + and M, 1, I + ble I, $L39 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) + lda L, -2(KK) + LD b2, 1 * SIZE(B) + lda AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c09 + LD b4, 3 * SIZE(B) + fclr c13 + + lda BO, 4 * SIZE(B) + ble KK, $L38 + + ble L, $L35 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + lda L, -2(TMP1) + LD b2, 1 * SIZE(BO) + lda AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c09 + LD b4, 3 * SIZE(BO) + fclr c13 + + lda BO, 4 * SIZE(BO) + ble TMP1, $L38 + + ble L, $L35 +#endif + .align 4 + +$L32: + ADD c01, t1, c01 + lda L, -2(L) + MUL a1, b1, t1 + LD b1, 0 * SIZE(BO) + + ADD c05, t2, c05 + lda AO, 2 * SIZE(AO) + MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, c09 + LD b5, 3 * SIZE(BO) + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a1, b4, t4 + LD a1, -1 * SIZE(AO) + + ADD c01, t1, c01 + MUL a2, b1, t1 + LD b1, 4 * SIZE(BO) + lda BO, 8 * SIZE(BO) + + ADD c05, t2, c05 + MUL a2, b2, t2 + LD b2, -3 * SIZE(BO) + + ADD c09, t3, c09 + LD b4, -1 * SIZE(BO) + MUL a2, b3, t3 + LD b3, -2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a2, b5, t4 + LD a2, 0 * SIZE(AO) + bgt L, $L32 + .align 4 + +$L35: + ADD c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L37 +#else + blbs TMP1, $L37 +#endif + .align 4 + + ADD c05, t2, c05 + LD b1, 0 * SIZE(BO) + MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, c09 + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a1, b4, t4 + LD a1, 0 * SIZE(AO) + lda AO, 1 * SIZE(AO) + + ADD c01, t1, c01 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L37: + ADD c05, t2, c05 + MUL a1, b2, t2 + ADD c09, t3, c09 + MUL a1, b3, t3 + + ADD c13, t4, c13 + lda AO, 1 * SIZE(AO) + MUL a1, b4, t4 + lda BO, 4 * SIZE(BO) + + ADD c01, t1, c01 + ADD c05, t2, c05 + ADD c09, t3, c09 + ADD c13, t4, c13 + +$L38: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 1, TMP1 +#else + subq KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO +#else + lda AO, -1 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + MUL a1, c09, c09 + MUL a1, c13, c13 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c05, t1, c05 + MUL a3, c01, t1 + SUB c09, t1, c09 + MUL a4, c01, t1 + SUB c13, t1, c13 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, c05 + MUL b2, c05, t1 + SUB c09, t1, c09 + MUL b3, c05, t1 + SUB c13, t1, c13 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, c09 + MUL a2, c09, t1 + SUB c13, t1, c13 + MUL a3, c13, c13 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, c13 + MUL a2, c13, t1 + SUB c09, t1, c09 + MUL a3, c13, t1 + SUB c05, t1, c05 + MUL a4, c13, t1 + SUB c01, t1, c01 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, c09 + MUL b2, c09, t1 + SUB c05, t1, c05 + MUL b3, c09, t1 + SUB c01, t1, c01 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a2, c05, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c05, 1 * SIZE(AO) + ST c09, 2 * SIZE(AO) + ST c13, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -1 * SIZE(C1) + lda C2, -1 * SIZE(C2) + lda C3, -1 * SIZE(C3) + lda C4, -1 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c09, 0 * SIZE(C3) + ST c13, 0 * SIZE(C4) + +#ifdef RT + sll K, 0 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L39: +#ifdef LN + sll K, 2 + BASE_SHIFT, TMP1 + addq B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 4, KK +#endif + +#ifdef RT + subq KK, 4, KK +#endif + lda J, -1(J) + bgt J, $L01 + .align 4 + +$L999: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + ldt $f9, 56($sp) + clr $0 + lda $sp, STACKSIZE($sp) + ret + EPILOGUE diff --git a/kernel/alpha/zamax.S b/kernel/alpha/zamax.S new file mode 100644 index 0000000..01fb4e1 --- /dev/null +++ b/kernel/alpha/zamax.S @@ -0,0 +1,301 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 + +#ifndef USE_MIN +#define CMPLT(a, b) cmptlt a, b +#else +#define CMPLT(a, b) cmptlt b, a +#endif + +#define STACKSIZE 8 * 8 + + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + + lda $sp, -STACKSIZE($sp) + + stt $f2, 0($sp) + fclr $f16 + cmplt $31, N, $2 + + stt $f3, 8($sp) + fclr $f17 + cmplt $31, INCX, $3 + unop + + stt $f4, 16($sp) + fclr $f18 + SXADDQ INCX, $31, INCX + unop + + stt $f5, 24($sp) + fclr $f19 + and $2, $3, $0 + unop + + stt $f6, 32($sp) + unop + + stt $f7, 40($sp) + stt $f8, 48($sp) + stt $f9, 56($sp) + + fclr $f0 + beq $0, $End # if (n <= 0) or (incx <= 0) return + .align 4 + + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + sra N, 2, $1 + addq INCX, INCX, INCX + + fabs $f20, $f20 + fabs $f21, $f21 + addt $f20, $f21, $f0 + ble $1, $L15 + .align 4 + + lda $1, -1($1) + unop + addq X, INCX, X + unop + + LD $f22, 0 * SIZE(X) + fmov $f0, $f1 + LD $f23, 1 * SIZE(X) + addq X, INCX, X + + LD $f24, 0 * SIZE(X) + fmov $f0, $f2 + LD $f25, 1 * SIZE(X) + addq X, INCX, X + + LD $f26, 0 * SIZE(X) + fmov $f0, $f3 + LD $f27, 1 * SIZE(X) + addq X, INCX, X + + fabs $f20, $f8 + fabs $f21, $f9 + fabs $f22, $f10 + fabs $f23, $f11 + + fabs $f24, $f12 + fabs $f25, $f13 + fabs $f26, $f14 + fabs $f27, $f15 + + ble $1, $L14 + .align 4 + + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + lda $1, -1($1) + addq X, INCX, X + + LD $f22, 0 * SIZE(X) + LD $f23, 1 * SIZE(X) + unop + addq X, INCX, X + + LD $f24, 0 * SIZE(X) + LD $f25, 1 * SIZE(X) + unop + addq X, INCX, X + + LD $f26, 0 * SIZE(X) + LD $f27, 1 * SIZE(X) + addq X, INCX, X + ble $1, $L13 + .align 4 + +$L12: + addt $f8, $f9, $f16 + unop + fabs $f20, $f8 + ldl $31, 64 * SIZE(X) + + addt $f10, $f11, $f17 + unop + fabs $f21, $f9 + LD $f20, 0 * SIZE(X) + + addt $f12, $f13, $f18 + LD $f21, 1 * SIZE(X) + fabs $f22, $f10 + addq X, INCX, X + + addt $f14, $f15, $f19 + LD $f22, 0 * SIZE(X) + fabs $f23, $f11 + unop + + CMPLT($f0, $f16), $f4 + LD $f23, 1 * SIZE(X) + fabs $f24, $f12 + addq X, INCX, X + + CMPLT($f1, $f17), $f5 + LD $f24, 0 * SIZE(X) + fabs $f25, $f13 + unop + + CMPLT($f2, $f18), $f6 + LD $f25, 1 * SIZE(X) + fabs $f26, $f14 + addq X, INCX, X + + CMPLT($f3, $f19), $f7 + LD $f26, 0 * SIZE(X) + fabs $f27, $f15 + unop + + fcmovne $f4, $f16, $f0 + LD $f27, 1 * SIZE(X) + addq X, INCX, X + lda $1, -1($1) # i -- + + fcmovne $f5, $f17, $f1 + fcmovne $f6, $f18, $f2 + fcmovne $f7, $f19, $f3 + bgt $1,$L12 + .align 4 + +$L13: + addt $f8, $f9, $f16 + fabs $f20, $f8 + + addt $f10, $f11, $f17 + fabs $f21, $f9 + + addt $f12, $f13, $f18 + fabs $f22, $f10 + + addt $f14, $f15, $f19 + fabs $f23, $f11 + + CMPLT($f0, $f16), $f4 + fabs $f24, $f12 + + CMPLT($f1, $f17), $f5 + fabs $f25, $f13 + + CMPLT($f2, $f18), $f6 + fabs $f26, $f14 + CMPLT($f3, $f19), $f7 + fabs $f27, $f15 + + fcmovne $f4, $f16, $f0 + fcmovne $f5, $f17, $f1 + fcmovne $f6, $f18, $f2 + fcmovne $f7, $f19, $f3 + .align 4 + +$L14: + addt $f8, $f9, $f16 + addt $f10, $f11, $f17 + addt $f12, $f13, $f18 + addt $f14, $f15, $f19 + + CMPLT($f0, $f16), $f4 + CMPLT($f1, $f17), $f5 + CMPLT($f2, $f18), $f6 + CMPLT($f3, $f19), $f7 + + fcmovne $f4, $f16, $f0 + fcmovne $f5, $f17, $f1 + fcmovne $f6, $f18, $f2 + fcmovne $f7, $f19, $f3 + + CMPLT($f0, $f1), $f16 + CMPLT($f2, $f3), $f17 + + fcmovne $f16, $f1, $f0 + fcmovne $f17, $f3, $f2 + + CMPLT($f0, $f2), $f16 + fcmovne $f16, $f2, $f0 + .align 4 + +$L15: + and N, 3, $1 + unop + unop + ble $1, $End + .align 4 + +$L16: + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + unop + addq X, INCX, X + + fabs $f20, $f29 + fabs $f21, $f30 + addt $f29, $f30, $f29 + + CMPLT($f0, $f29), $f16 + fcmovne $f16, $f29, $f0 + + lda $1, -1($1) # i -- + bgt $1, $L16 + .align 4 + +$End: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + ldt $f9, 56($sp) + lda $sp, STACKSIZE($sp) + ret + + EPILOGUE diff --git a/kernel/alpha/zasum.S b/kernel/alpha/zasum.S new file mode 100644 index 0000000..67ed785 --- /dev/null +++ b/kernel/alpha/zasum.S @@ -0,0 +1,208 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 +#define INCX $18 +#define I $19 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define a0 $f12 +#define a1 $f13 +#define a2 $f14 +#define a3 $f15 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 +#define a7 $f19 + +#define t0 $f20 +#define t1 $f21 +#define t2 $f22 +#define t3 $f23 + + PROLOGUE + PROFCODE + + fclr s0 + unop + fclr t0 + addq INCX, INCX, INCX + + fclr s1 + unop + fclr t1 + ble N, $L999 + + fclr s2 + sra N, 2, I + fclr s3 + ble I, $L15 + + LD a0, 0 * SIZE(X) + fclr t2 + LD a1, 1 * SIZE(X) + SXADDQ INCX, X, X + + LD a2, 0 * SIZE(X) + fclr t3 + LD a3, 1 * SIZE(X) + SXADDQ INCX, X, X + + LD a4, 0 * SIZE(X) + LD a5, 1 * SIZE(X) + SXADDQ INCX, X, X + lda I, -1(I) + + ble I, $L13 + .align 4 + +$L12: + ADD s0, t0, s0 + ldl $31, PREFETCHSIZE * SIZE(X) + fabs a0, t0 + lda I, -1(I) + + ADD s1, t1, s1 + LD a6, 0 * SIZE(X) + fabs a1, t1 + unop + + ADD s2, t2, s2 + LD a7, 1 * SIZE(X) + fabs a2, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s3 + LD a0, 0 * SIZE(X) + fabs a3, t3 + unop + + ADD s0, t0, s0 + LD a1, 1 * SIZE(X) + fabs a4, t0 + SXADDQ INCX, X, X + + ADD s1, t1, s1 + LD a2, 0 * SIZE(X) + fabs a5, t1 + unop + + ADD s2, t2, s2 + LD a3, 1 * SIZE(X) + fabs a6, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s3 + LD a4, 0 * SIZE(X) + fabs a7, t3 + unop + + LD a5, 1 * SIZE(X) + unop + SXADDQ INCX, X, X + bne I, $L12 + .align 4 + +$L13: + ADD s0, t0, s0 + LD a6, 0 * SIZE(X) + fabs a0, t0 + + ADD s1, t1, s1 + LD a7, 1 * SIZE(X) + fabs a1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + fabs a2, t2 + ADD s3, t3, s3 + fabs a3, t3 + + ADD s0, t0, s0 + fabs a4, t0 + ADD s1, t1, s1 + fabs a5, t1 + ADD s2, t2, s2 + fabs a6, t2 + ADD s3, t3, s3 + fabs a7, t3 + + ADD s2, t2, s2 + ADD s3, t3, s3 + + .align 4 + +$L15: + ADD s0, s2, s0 + and N, 3, I + ADD s1, s3, s1 + ble I, $L999 + .align 4 + +$L17: + ADD s0, t0, s0 + LD a0, 0 * SIZE(X) + fabs a0, t0 + lda I, -1(I) + + ADD s1, t1, s1 + LD a1, 1 * SIZE(X) + fabs a1, t1 + SXADDQ INCX, X, X + + bne I, $L17 + .align 4 + +$L999: + ADD s0, t0, s0 + ADD s1, t1, s1 + + ADD s0, s1, s0 + ret + EPILOGUE diff --git a/kernel/alpha/zaxpy.S b/kernel/alpha/zaxpy.S new file mode 100644 index 0000000..a6f3c1d --- /dev/null +++ b/kernel/alpha/zaxpy.S @@ -0,0 +1,611 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 40 + +#ifndef CONJ +#define ADD1 SUB +#define ADD2 ADD +#else +#define ADD1 ADD +#define ADD2 SUB +#endif + + + PROLOGUE + PROFCODE + .frame $sp, 16, $26, 0 + + ldl $19, 0($sp) + fmov $f19, $f29 + ldq $20, 8($sp) + fmov $f20, $f30 + + mov $21, $18 + ldl $21, 16($sp) + lda $sp, -64($sp) + nop + + stt $f2, 0($sp) + cmpeq $19, 1, $1 + stt $f3, 8($sp) + cmpeq $21, 1, $2 + + stt $f4, 16($sp) + and $16, 3, $5 + stt $f5, 24($sp) + stt $f6, 32($sp) + + stt $f7, 40($sp) + stt $f8, 48($sp) +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + and $1, $2, $1 + ble $16, $End + sra $16, 2, $4 + beq $1, $Sub + + ble $4, $Remain + subq $4, 1, $4 + + LD $f0, 0*SIZE($18) + LD $f1, 1*SIZE($18) + LD $f2, 2*SIZE($18) + LD $f3, 3*SIZE($18) + LD $f4, 4*SIZE($18) + LD $f5, 5*SIZE($18) + LD $f6, 6*SIZE($18) + LD $f7, 7*SIZE($18) + + LD $f8, 0*SIZE($20) + LD $f28, 1*SIZE($20) + LD $f10, 2*SIZE($20) + LD $f11, 3*SIZE($20) + LD $f12, 4*SIZE($20) + LD $f13, 5*SIZE($20) + LD $f14, 6*SIZE($20) + LD $f15, 7*SIZE($20) + + addq $18, 8*SIZE, $18 + ble $4, $MainLoopEnd + .align 4 + +$MainLoop: + ldt $f31, PREFETCHSIZE * SIZE($20) + ldl $31, PREFETCHSIZE * SIZE($18) + + MUL $f29, $f0, $f20 + LD $f31, 9*SIZE($18) + MUL $f30, $f1, $f21 + unop + + MUL $f30, $f0, $f22 + LD $f0, 0*SIZE($18) + MUL $f29, $f1, $f23 + LD $f1, 1*SIZE($18) + + MUL $f29, $f2, $f24 + unop + MUL $f30, $f3, $f25 + nop + + MUL $f30, $f2, $f26 + LD $f2, 2*SIZE($18) + MUL $f29, $f3, $f27 + LD $f3, 3*SIZE($18) + + ADD1 $f20, $f21, $f16 + MUL $f29, $f4, $f20 + ADD2 $f22, $f23, $f17 + MUL $f30, $f5, $f21 + + ADD1 $f24, $f25, $f18 + unop + MUL $f30, $f4, $f22 + LD $f4, 4*SIZE($18) + + ADD2 $f26, $f27, $f19 + addq $20, 8*SIZE, $20 + MUL $f29, $f5, $f23 + LD $f5, 5*SIZE($18) + + ADD $f16, $f8, $f16 + LD $f8, 0*SIZE($20) + MUL $f29, $f6, $f24 + unop + + ADD $f17, $f28, $f17 + LD $f28, 1*SIZE($20) + MUL $f30, $f7, $f25 + unop + + ADD $f18, $f10, $f18 + LD $f10, 2*SIZE($20) + MUL $f30, $f6, $f26 + LD $f6, 6*SIZE($18) + + ADD $f19, $f11, $f19 + LD $f11, 3*SIZE($20) + MUL $f29, $f7, $f27 + LD $f7, 7*SIZE($18) + + ST $f16,-8*SIZE($20) + ADD1 $f20, $f21, $f16 + ST $f17,-7*SIZE($20) + ADD2 $f22, $f23, $f17 + + ST $f18,-6*SIZE($20) + ADD1 $f24, $f25, $f18 + ST $f19,-5*SIZE($20) + ADD2 $f26, $f27, $f19 + + ADD $f16, $f12, $f16 + LD $f12, 4*SIZE($20) + ADD $f17, $f13, $f17 + LD $f13, 5*SIZE($20) + ADD $f18, $f14, $f18 + LD $f14, 6*SIZE($20) + ADD $f19, $f15, $f19 + LD $f15, 7*SIZE($20) + + ST $f16,-4*SIZE($20) + addq $18, 8*SIZE, $18 + ST $f17,-3*SIZE($20) + subq $4, 1, $4 + + ST $f18,-2*SIZE($20) + nop + ST $f19,-1*SIZE($20) + bgt $4, $MainLoop + .align 4 + +$MainLoopEnd: + MUL $f29, $f0, $f20 + MUL $f30, $f1, $f21 + MUL $f30, $f0, $f22 + MUL $f29, $f1, $f23 + + MUL $f29, $f2, $f24 + MUL $f30, $f3, $f25 + MUL $f30, $f2, $f26 + MUL $f29, $f3, $f27 + + ADD1 $f20, $f21, $f16 + MUL $f29, $f4, $f20 + ADD2 $f22, $f23, $f17 + MUL $f30, $f5, $f21 + + ADD1 $f24, $f25, $f18 + MUL $f30, $f4, $f22 + ADD2 $f26, $f27, $f19 + MUL $f29, $f5, $f23 + + ADD $f16, $f8, $f16 + MUL $f29, $f6, $f24 + ADD $f17, $f28, $f17 + MUL $f30, $f7, $f25 + + ADD $f18, $f10, $f18 + MUL $f30, $f6, $f26 + ADD $f19, $f11, $f19 + MUL $f29, $f7, $f27 + + ST $f16, 0*SIZE($20) + ADD1 $f20, $f21, $f16 + ST $f17, 1*SIZE($20) + ADD2 $f22, $f23, $f17 + + ST $f18, 2*SIZE($20) + ADD1 $f24, $f25, $f18 + ST $f19, 3*SIZE($20) + ADD2 $f26, $f27, $f19 + + ADD $f16, $f12, $f16 + ADD $f17, $f13, $f17 + ADD $f18, $f14, $f18 + ADD $f19, $f15, $f19 + + ST $f16, 4*SIZE($20) + ST $f17, 5*SIZE($20) + ST $f18, 6*SIZE($20) + ST $f19, 7*SIZE($20) + + unop + addq $20, 8*SIZE, $20 + unop + ble $5, $End + .align 4 + +$Remain: + subq $5, 1, $6 + ble $5, $End + LD $f0, 0*SIZE($18) + LD $f1, 1*SIZE($18) + + LD $f8, 0*SIZE($20) + LD $f28, 1*SIZE($20) + addq $18, 2*SIZE, $18 + ble $6, $RemainLoopEnd + .align 4 + +$RemainLoop: + MUL $f29, $f0, $f20 + subq $6, 1, $6 + MUL $f30, $f1, $f21 + addq $20, 2*SIZE, $20 + + MUL $f30, $f0, $f22 + LD $f0, 0*SIZE($18) + MUL $f29, $f1, $f23 + LD $f1, 1*SIZE($18) + + ADD1 $f20, $f21, $f16 + ADD2 $f22, $f23, $f17 + ADD $f16, $f8, $f16 + LD $f8, 0*SIZE($20) + ADD $f17, $f28, $f17 + LD $f28, 1*SIZE($20) + + ST $f16,-2*SIZE($20) + addq $18, 2*SIZE, $18 + ST $f17,-1*SIZE($20) + bgt $6, $RemainLoop + .align 4 + +$RemainLoopEnd: + MUL $f29, $f0, $f20 + MUL $f30, $f1, $f21 + MUL $f30, $f0, $f22 + MUL $f29, $f1, $f23 + + ADD1 $f20, $f21, $f16 + ADD2 $f22, $f23, $f17 + ADD $f16, $f8, $f16 + ADD $f17, $f28, $f17 + + ST $f16, 0*SIZE($20) + nop + ST $f17, 1*SIZE($20) + nop + .align 4 + +$End: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + lda $sp, 64($sp) + ret + .align 4 + +$Sub: + SXSUBL $16, SIZE, $22 + addq $22, $22, $22 # Complex + .align 4 + + addq $19, $19, $19 # Complex + addq $21, $21, $21 # Complex + + ble $4, $SubRemain + LD $f0, 0*SIZE($18) + LD $f1, 1*SIZE($18) + SXADDQ $19, $18, $18 + + LD $f2, 0*SIZE($18) + LD $f3, 1*SIZE($18) + SXADDQ $19, $18, $18 + + LD $f4, 0*SIZE($18) + LD $f5, 1*SIZE($18) + SXADDQ $19, $18, $18 + + LD $f6, 0*SIZE($18) + LD $f7, 1*SIZE($18) + SXADDQ $19, $18, $18 + + LD $f8, 0*SIZE($20) + LD $f28, 1*SIZE($20) + SXADDQ $21, $20, $24 + + LD $f10, 0*SIZE($24) + LD $f11, 1*SIZE($24) + SXADDQ $21, $24, $24 + + LD $f12, 0*SIZE($24) + LD $f13, 1*SIZE($24) + SXADDQ $21, $24, $24 + + LD $f14, 0*SIZE($24) + LD $f15, 1*SIZE($24) + SXADDQ $21, $24, $24 + + subq $4, 1, $4 + ble $4, $SubMainLoopEnd + .align 4 + +$SubMainLoop: + MUL $f29, $f0, $f20 + unop + MUL $f30, $f1, $f21 + unop + + MUL $f30, $f0, $f22 + LD $f0, 0*SIZE($18) + MUL $f29, $f1, $f23 + LD $f1, 1*SIZE($18) + + MUL $f29, $f2, $f24 + SXADDQ $19, $18, $18 + MUL $f30, $f3, $f25 + unop + + MUL $f30, $f2, $f26 + LD $f2, 0*SIZE($18) + MUL $f29, $f3, $f27 + LD $f3, 1*SIZE($18) + + ADD1 $f20, $f21, $f16 + SXADDQ $19, $18, $18 + MUL $f29, $f4, $f20 + unop + + ADD2 $f22, $f23, $f17 + unop + MUL $f30, $f5, $f21 + unop + + ADD1 $f24, $f25, $f18 + unop + MUL $f30, $f4, $f22 + LD $f4, 0*SIZE($18) + + ADD2 $f26, $f27, $f19 + unop + MUL $f29, $f5, $f23 + LD $f5, 1*SIZE($18) + + ADD $f16, $f8, $f16 + LD $f8, 0*SIZE($24) + MUL $f29, $f6, $f24 + SXADDQ $19, $18, $18 + + ADD $f17, $f28, $f17 + LD $f28, 1*SIZE($24) + MUL $f30, $f7, $f25 + SXADDQ $21, $24, $24 + + ADD $f18, $f10, $f18 + LD $f10, 0*SIZE($24) + MUL $f30, $f6, $f26 + LD $f6, 0*SIZE($18) + + ADD $f19, $f11, $f19 + LD $f11, 1*SIZE($24) + MUL $f29, $f7, $f27 + LD $f7, 1*SIZE($18) + + ST $f16, 0*SIZE($20) + SXADDQ $19, $18, $18 + ADD1 $f20, $f21, $f16 + unop + + ST $f17, 1*SIZE($20) + SXADDQ $21, $20, $20 + ADD2 $f22, $f23, $f17 + unop + + ST $f18, 0*SIZE($20) + SXADDQ $21, $24, $24 + ADD1 $f24, $f25, $f18 + unop + + ST $f19, 1*SIZE($20) + unop + ADD2 $f26, $f27, $f19 + SXADDQ $21, $20, $20 + + ADD $f16, $f12, $f16 + unop + LD $f12, 0*SIZE($24) + unop + + ADD $f17, $f13, $f17 + unop + LD $f13, 1*SIZE($24) + SXADDQ $21, $24, $24 + + ADD $f18, $f14, $f18 + subq $4, 1, $4 + LD $f14, 0*SIZE($24) + unop + + ADD $f19, $f15, $f19 + unop + LD $f15, 1*SIZE($24) + SXADDQ $21, $24, $24 + + ST $f16, 0*SIZE($20) + ST $f17, 1*SIZE($20) + SXADDQ $21, $20, $20 + unop + + ST $f18, 0*SIZE($20) + ST $f19, 1*SIZE($20) + SXADDQ $21, $20, $20 + bgt $4, $SubMainLoop + .align 4 + +$SubMainLoopEnd: + MUL $f29, $f0, $f20 + MUL $f30, $f1, $f21 + MUL $f30, $f0, $f22 + MUL $f29, $f1, $f23 + + MUL $f29, $f2, $f24 + MUL $f30, $f3, $f25 + MUL $f30, $f2, $f26 + MUL $f29, $f3, $f27 + + ADD1 $f20, $f21, $f16 + MUL $f29, $f4, $f20 + ADD2 $f22, $f23, $f17 + MUL $f30, $f5, $f21 + + ADD1 $f24, $f25, $f18 + MUL $f30, $f4, $f22 + ADD2 $f26, $f27, $f19 + MUL $f29, $f5, $f23 + + ADD $f16, $f8, $f16 + MUL $f29, $f6, $f24 + ADD $f17, $f28, $f17 + MUL $f30, $f7, $f25 + + ADD $f18, $f10, $f18 + MUL $f30, $f6, $f26 + ADD $f19, $f11, $f19 + MUL $f29, $f7, $f27 + + ST $f16, 0*SIZE($20) + ADD1 $f20, $f21, $f16 + ST $f17, 1*SIZE($20) + ADD2 $f22, $f23, $f17 + + SXADDQ $21, $20, $20 + nop + ST $f18, 0*SIZE($20) + ADD1 $f24, $f25, $f18 + + ST $f19, 1*SIZE($20) + ADD2 $f26, $f27, $f19 + SXADDQ $21, $20, $20 + ADD $f16, $f12, $f16 + + ADD $f17, $f13, $f17 + ADD $f18, $f14, $f18 + ADD $f19, $f15, $f19 + + ST $f16, 0*SIZE($20) + ST $f17, 1*SIZE($20) + SXADDQ $21, $20, $20 + + ST $f18, 0*SIZE($20) + ST $f19, 1*SIZE($20) + SXADDQ $21, $20, $20 + ble $5, $SubEnd + .align 4 + +$SubRemain: + subq $5, 1, $6 + ble $5, $SubEnd + LD $f0, 0*SIZE($18) + LD $f1, 1*SIZE($18) + + LD $f8, 0*SIZE($20) + LD $f28, 1*SIZE($20) + SXADDQ $19, $18, $18 + SXADDQ $21, $20, $24 + ble $6, $SubRemainLoopEnd + .align 4 + +$SubRemainLoop: + MUL $f29, $f0, $f20 + MUL $f30, $f1, $f21 + MUL $f30, $f0, $f22 + LD $f0, 0*SIZE($18) + + MUL $f29, $f1, $f23 + LD $f1, 1*SIZE($18) + ADD1 $f20, $f21, $f16 + SXADDQ $19, $18, $18 + + ADD2 $f22, $f23, $f17 + nop + ADD $f16, $f8, $f16 + LD $f8, 0*SIZE($24) + + ADD $f17, $f28, $f17 + LD $f28, 1*SIZE($24) + SXADDQ $21, $24, $24 + subq $6, 1, $6 + + ST $f16, 0*SIZE($20) + ST $f17, 1*SIZE($20) + SXADDQ $21, $20, $20 + bgt $6, $SubRemainLoop + .align 4 + +$SubRemainLoopEnd: + MUL $f29, $f0, $f20 + MUL $f30, $f1, $f21 + MUL $f30, $f0, $f22 + MUL $f29, $f1, $f23 + + ADD1 $f20, $f21, $f16 + ADD2 $f22, $f23, $f17 + ADD $f16, $f8, $f16 + ADD $f17, $f28, $f17 + + ST $f16, 0*SIZE($20) + nop + ST $f17, 1*SIZE($20) + nop + .align 4 + +$SubEnd: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + lda $sp, 64($sp) + ret + EPILOGUE diff --git a/kernel/alpha/zdot.S b/kernel/alpha/zdot.S new file mode 100644 index 0000000..78dcae6 --- /dev/null +++ b/kernel/alpha/zdot.S @@ -0,0 +1,500 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 +#define INCX $18 +#define Y $19 +#define INCY $20 +#define XX $21 +#define YY $23 + +#define I $5 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f2 +#define s3 $f30 + +#define a0 $f10 +#define a1 $f11 +#define a2 $f12 +#define a3 $f13 +#define a4 $f14 +#define a5 $f15 +#define a6 $f16 +#define a7 $f17 + +#define b0 $f18 +#define b1 $f19 +#define b2 $f20 +#define b3 $f21 +#define b4 $f22 +#define b5 $f23 +#define b6 $f24 +#define b7 $f25 + +#define t0 $f26 +#define t1 $f27 +#define t2 $f28 +#define t3 $f29 + + PROLOGUE + PROFCODE + .frame $sp, 16, $26, 0 + + lda $sp, -16($sp) + fclr s0 + stt $f2, 0($sp) + fclr s1 + + fclr s2 + addq INCX, INCX, INCX + fclr s3 + ble N, $L999 + + addq INCY, INCY, INCY + fclr t0 + fclr t1 + fclr t2 + fclr t3 + + srl N, 3, I + ble I, $L25 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD b0, 0 * SIZE(Y) + LD b1, 1 * SIZE(Y) + + SXADDQ INCX, X, X + SXADDQ INCY, Y, Y + + LD a2, 0 * SIZE(X) + LD a3, 1 * SIZE(X) + LD b2, 0 * SIZE(Y) + LD b3, 1 * SIZE(Y) + + SXADDQ INCX, X, X + SXADDQ INCY, Y, Y + + LD a4, 0 * SIZE(X) + LD a5, 1 * SIZE(X) + LD b4, 0 * SIZE(Y) + LD b5, 1 * SIZE(Y) + + SXADDQ INCX, X, X + SXADDQ INCY, Y, Y + + LD a6, 0 * SIZE(X) + LD b6, 0 * SIZE(Y) + + subq I, 1, I + ble I, $L23 + .align 4 + +$L22: + ADD s0, t0, s0 + LD a7, 1 * SIZE(X) + MUL a0, b0, t0 + LD b7, 1 * SIZE(Y) + + ADD s1, t1, s1 + ldl $31, PREFETCHSIZE * SIZE(X) + MUL a0, b1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + ldl $31, PREFETCHSIZE * SIZE(Y) + MUL a1, b0, t2 + SXADDQ INCY, Y, Y + + ADD s3, t3, s3 + LD a0, 0 * SIZE(X) + MUL a1, b1, t3 + LD a1, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b0, 0 * SIZE(Y) + MUL a2, b2, t0 + LD b1, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a2, b3, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a3, b2, t2 + unop + + ADD s3, t3, s3 + LD a2, 0 * SIZE(X) + MUL a3, b3, t3 + LD a3, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b2, 0 * SIZE(Y) + MUL a4, b4, t0 + LD b3, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a4, b5, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a5, b4, t2 + unop + + ADD s3, t3, s3 + LD a4, 0 * SIZE(X) + MUL a5, b5, t3 + LD a5, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b4, 0 * SIZE(Y) + MUL a6, b6, t0 + LD b5, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a6, b7, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a7, b6, t2 + unop + + ADD s3, t3, s3 + LD a6, 0 * SIZE(X) + MUL a7, b7, t3 + LD a7, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b6, 0 * SIZE(Y) + MUL a0, b0, t0 + LD b7, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a0, b1, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a1, b0, t2 + unop + + ADD s3, t3, s3 + LD a0, 0 * SIZE(X) + MUL a1, b1, t3 + LD a1, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b0, 0 * SIZE(Y) + MUL a2, b2, t0 + LD b1, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a2, b3, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a3, b2, t2 + unop + + ADD s3, t3, s3 + LD a2, 0 * SIZE(X) + MUL a3, b3, t3 + LD a3, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b2, 0 * SIZE(Y) + MUL a4, b4, t0 + LD b3, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a4, b5, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a5, b4, t2 + subq I, 1, I + + ADD s3, t3, s3 + LD a4, 0 * SIZE(X) + MUL a5, b5, t3 + LD a5, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b4, 0 * SIZE(Y) + MUL a6, b6, t0 + LD b5, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a6, b7, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + LD a6, 0 * SIZE(X) + MUL a7, b6, t2 + unop + + ADD s3, t3, s3 + LD b6, 0 * SIZE(Y) + MUL a7, b7, t3 + bgt I, $L22 + .align 4 + +$L23: + ADD s0, t0, s0 + LD a7, 1 * SIZE(X) + MUL a0, b0, t0 + LD b7, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a0, b1, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a1, b0, t2 + unop + + ADD s3, t3, s3 + LD a0, 0 * SIZE(X) + MUL a1, b1, t3 + LD a1, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b0, 0 * SIZE(Y) + MUL a2, b2, t0 + LD b1, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a2, b3, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a3, b2, t2 + unop + + ADD s3, t3, s3 + LD a2, 0 * SIZE(X) + MUL a3, b3, t3 + LD a3, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b2, 0 * SIZE(Y) + MUL a4, b4, t0 + LD b3, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a4, b5, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a5, b4, t2 + unop + + ADD s3, t3, s3 + LD a4, 0 * SIZE(X) + MUL a5, b5, t3 + LD a5, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b4, 0 * SIZE(Y) + MUL a6, b6, t0 + LD b5, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a6, b7, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a7, b6, t2 + unop + + ADD s3, t3, s3 + LD a6, 0 * SIZE(X) + MUL a7, b7, t3 + LD a7, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b6, 0 * SIZE(Y) + MUL a0, b0, t0 + LD b7, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a0, b1, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + MUL a1, b0, t2 + ADD s3, t3, s3 + MUL a1, b1, t3 + + ADD s0, t0, s0 + MUL a2, b2, t0 + ADD s1, t1, s1 + MUL a2, b3, t1 + + ADD s2, t2, s2 + MUL a3, b2, t2 + ADD s3, t3, s3 + MUL a3, b3, t3 + + ADD s0, t0, s0 + MUL a4, b4, t0 + ADD s1, t1, s1 + MUL a4, b5, t1 + + ADD s2, t2, s2 + MUL a5, b4, t2 + ADD s3, t3, s3 + MUL a5, b5, t3 + + ADD s0, t0, s0 + MUL a6, b6, t0 + ADD s1, t1, s1 + MUL a6, b7, t1 + + ADD s2, t2, s2 + MUL a7, b6, t2 + ADD s3, t3, s3 + MUL a7, b7, t3 + .align 4 + +$L25: + and N, 7, I + unop + unop + ble I, $L998 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD b0, 0 * SIZE(Y) + LD b1, 1 * SIZE(Y) + + SXADDQ INCX, X, X + subq I, 1, I + SXADDQ INCY, Y, Y + ble I, $L28 + .align 4 + +$L26: + ADD s0, t0, s0 + mov X, XX + MUL a0, b0, t0 + mov Y, YY + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a0, b1, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + LD a0, 0 * SIZE(XX) + MUL a1, b0, t2 + LD b0, 0 * SIZE(YY) + + ADD s3, t3, s3 + subq I, 1, I + MUL a1, b1, t3 + LD a1, 1 * SIZE(XX) + + LD b1, 1 * SIZE(YY) + bgt I, $L26 + .align 4 + +$L28: + ADD s0, t0, s0 + MUL a0, b0, t0 + ADD s1, t1, s1 + MUL a0, b1, t1 + + ADD s2, t2, s2 + MUL a1, b0, t2 + ADD s3, t3, s3 + MUL a1, b1, t3 + .align 4 + +$L998: + ADD s0, t0, s0 + ADD s1, t1, s1 + ADD s2, t2, s2 + ADD s3, t3, s3 + +#ifndef CONJ + SUB s0, s3, s0 + ADD s1, s2, s1 +#else + ADD s0, s3, s0 + SUB s1, s2, s1 +#endif + .align 4 + +$L999: + ldt $f2, 0($sp) + lda $sp, 16($sp) + ret + + EPILOGUE diff --git a/kernel/alpha/zgemm_beta.S b/kernel/alpha/zgemm_beta.S new file mode 100644 index 0000000..f7ca347 --- /dev/null +++ b/kernel/alpha/zgemm_beta.S @@ -0,0 +1,192 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + + .set noat + .set noreorder +.text + .align 5 + .globl CNAME + .ent CNAME +CNAME: + .frame $sp, 0, $26, 0 + +#ifdef PROFILE + ldgp $gp, 0($27) + lda $28, _mcount + jsr $28, ($28), _mcount + .prologue 1 +#else + .prologue 0 +#endif + + ldq $18, 24($sp) + ble $16, $End + ldl $19, 32($sp) + ble $17, $End + + addq $19, $19, $19 + fbne $f19,$Main + fbne $f20,$Main + .align 4 + +$L13: + mov $18, $1 + lda $17, -1($17) + SXADDQ $19, $18, $18 + mov $16, $2 + .align 4 + +$L12: + ST $f31, 0*SIZE($1) + ST $f31, 1*SIZE($1) + lda $2, -1($2) + lda $1, 2*SIZE($1) + bgt $2, $L12 + bgt $17,$L13 + clr $0 + ret + .align 4 + +/* Main Routine */ +$Main: + sra $16, 1, $2 # $2 = (m >> 1) + mov $18, $1 # c_offset = c + lda $17, -1($17) # n -- + SXADDQ $19, $18, $18 # c += ldc + beq $2, $L18 + + LD $f14, 0*SIZE($1) + LD $f15, 1*SIZE($1) + LD $f24, 2*SIZE($1) + LD $f25, 3*SIZE($1) + lda $2, -1($2) # $2 -- + ble $2, $L19 + .align 4 + + +$L23: + MUL $f19, $f14, $f10 + lds $f31, 9*SIZE($1) + MUL $f20, $f15, $f11 + lda $2, -1($2) + + MUL $f19, $f15, $f12 + LD $f15, 5*SIZE($1) + MUL $f20, $f14, $f13 + LD $f14, 4*SIZE($1) + + MUL $f19, $f24, $f16 + unop + MUL $f20, $f25, $f17 + unop + + MUL $f19, $f25, $f18 + LD $f25, 7*SIZE($1) + SUB $f10, $f11, $f22 + unop + + MUL $f20, $f24, $f21 + LD $f24, 6*SIZE($1) + ADD $f12, $f13, $f23 + lda $1, 4*SIZE($1) + + SUB $f16, $f17, $f26 + ADD $f18, $f21, $f27 + ST $f22,-4*SIZE($1) + ST $f23,-3*SIZE($1) + + ST $f26,-2*SIZE($1) + ST $f27,-1*SIZE($1) + unop + bgt $2,$L23 + .align 4 + +$L19: + MUL $f19, $f14, $f10 + MUL $f20, $f15, $f11 + MUL $f19, $f15, $f12 + MUL $f20, $f14, $f13 + + MUL $f19, $f24, $f16 + MUL $f20, $f25, $f17 + MUL $f19, $f25, $f18 + MUL $f20, $f24, $f21 + + SUB $f10, $f11, $f22 + ADD $f12, $f13, $f23 + SUB $f16, $f17, $f26 + ADD $f18, $f21, $f27 + lda $1, 4*SIZE($1) + + ST $f22, -4*SIZE($1) + ST $f23, -3*SIZE($1) + ST $f26, -2*SIZE($1) + ST $f27, -1*SIZE($1) + + blbs $16, $L18 + bgt $17, $Main + clr $0 + ret + .align 4 + +$L18: + LD $f14, 0*SIZE($1) + LD $f15, 1*SIZE($1) + MUL $f19, $f15, $f13 + MUL $f20, $f14, $f10 + + MUL $f19, $f14, $f12 + MUL $f20, $f15, $f11 + ADD $f13, $f10, $f26 + SUB $f12, $f11, $f27 + + ST $f26, 1*SIZE($1) + ST $f27, 0*SIZE($1) + lda $1, 2*SIZE($1) + bgt $17, $Main + .align 4 + +$End: + clr $0 + ret + .ident VERSION + .end CNAME diff --git a/kernel/alpha/zgemm_kernel_2x2.S b/kernel/alpha/zgemm_kernel_2x2.S new file mode 100644 index 0000000..33c50dd --- /dev/null +++ b/kernel/alpha/zgemm_kernel_2x2.S @@ -0,0 +1,1712 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#if !defined(EV4) && !defined(EV5) && !defined(EV6) +#error "Architecture is not specified." +#endif + +#ifdef EV6 +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + +#ifdef EV5 +#define PREFETCHSIZE 48 +#define UNOP +#endif + +#ifdef EV4 +#define UNOP +#endif + + .set noat + .set noreorder + .arch ev6 + +.text + .align 5 + .globl CNAME + .ent CNAME + +#define STACKSIZE 80 + +#define M $16 +#define N $17 +#define K $18 +#define A $21 +#define B $22 +#define C $20 +#define LDC $23 + +#define C1 $19 +#define C2 $24 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha_i $f29 +#define alpha_r $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define BB $3 +#define OFFSET $4 + +#define ALPHA_R 64($sp) +#define ALPHA_I 72($sp) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 SUB +#define ADD4 ADD +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 ADD +#define ADD4 SUB +#else +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 SUB +#define ADD4 SUB +#endif + +CNAME: + .frame $sp, STACKSIZE, $26, 0 + +#ifdef PROFILE + ldgp $gp, 0($27) + lda $at, _mcount + jsr $at, ($at), _mcount +#endif + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + lda $sp, -STACKSIZE($sp) + + ldq B, 0 + STACKSIZE($sp) + ldq C, 8 + STACKSIZE($sp) + ldq LDC, 16 + STACKSIZE($sp) +#ifdef TRMMKERNEL + ldq OFFSET, 24 + STACKSIZE($sp) +#endif + + sll LDC, ZBASE_SHIFT, LDC + + stt $f2, 0($sp) + stt $f3, 8($sp) + stt $f4, 16($sp) + stt $f5, 24($sp) + stt $f6, 32($sp) + stt $f7, 40($sp) + stt $f8, 48($sp) + stt $f9, 56($sp) + stt $f19, ALPHA_R + stt $f20, ALPHA_I + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#if defined(TRMMKERNEL) && !defined(LEFT) + subq $31, OFFSET, KK +#endif + + sra N, 1, J + ble J, $L30 + .align 4 + +$L01: + mov C, C1 + addq C, LDC, C2 + mov A, AO + s4addq K, 0, BB + + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + SXADDQ BB, B, BB + addq C2, LDC, C + unop + + sra M, 1, I + fclr t1 + fclr t2 + fclr t3 + fclr t4 + + fclr c01 + fclr c05 + + ble I, $L20 + .align 4 + +$L11: +#ifndef EV4 + ldl $31, 0 * SIZE(BB) + ldl $31, 8 * SIZE(BB) + unop + lda BB, 16 * SIZE(BB) +#endif + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addq KK, 2, TMP1 +#else + addq KK, 2, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c07 + + lda BO, 4 * SIZE(B) + fclr c11 + lda AO, 4 * SIZE(AO) + fclr c15 + + lds $f31, 4 * SIZE(C1) + fclr c04 +#ifndef TRMMKERNEL + lda L, -2(K) +#else + lda L, -2(TMP1) +#endif + fclr c08 + + lds $f31, 4 * SIZE(C2) + fclr c12 + fclr c16 + ble L, $L15 +#else + sll KK, ZBASE_SHIFT + 1, TMP1 + addq AO, TMP1, AO + addq B, TMP1, BO + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c07 + + lda BO, 4 * SIZE(BO) + fclr c11 + lda AO, 4 * SIZE(AO) + fclr c15 + + lds $f31, 4 * SIZE(C1) + fclr c04 + lda L, -2(TMP1) + fclr c08 + + lds $f31, 4 * SIZE(C2) + fclr c12 + fclr c16 + ble L, $L15 +#endif + .align 5 + +$L12: +/* 1 */ + ADD1 c11, t1, c11 +#ifndef EV4 + ldq $31, PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + ldl $31, PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD3 c12, t2, c12 + unop + MUL b1, a2, t2 + unop + + ADD2 c16, t3, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD4 c15, t4, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + +/* 2 */ + ADD1 c01, t1, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD3 c02, t2, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD2 c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD1 c03, t1, c03 + unop + MUL b3, a1, t1 + unop + + ADD3 c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD1 c09, t1, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD4 c07, t4, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD1 c11, t1, c11 + unop + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD3 c12, t2, c12 + lda L, -2(L) + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD2 c16, t3, c16 + unop + MUL b2, a2, t3 + unop + + ADD4 c15, t4, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD1 c01, t1, c01 + unop + MUL b5, a6, t1 + unop + + ADD3 c02, t2, c02 + unop + MUL b5, a4, t2 + unop + + ADD2 c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD1 c03, t1, c03 + lda AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD3 c04, t2, c04 + lda BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD1 c09, t1, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD1 c11, t1, c11 + ldt alpha_r, ALPHA_R + MUL b1, a1, t1 +#ifndef TRMMKERNEL + blbs K, $L18 +#else + blbs TMP1, $L18 +#endif + .align 4 + + ADD3 c12, t2, c12 + MUL b1, a2, t2 + ADD2 c16, t3, c16 + MUL b2, a2, t3 + + ADD4 c15, t4, c15 + MUL b2, a1, t4 + ADD1 c01, t1, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, c02 + unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD2 c06, t3, c06 + MUL b2, a4, t3 + ADD4 c05, t4, c05 + MUL b4, a1, t4 + + ADD1 c03, t1, c03 + unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c09, t1, c09 + unop + MUL b3, a3, t1 + lda AO, 4 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, c07 + unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + + ADD1 c11, t1, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L18: + ADD3 c12, t2, c12 + unop + MUL b1, a2, t2 + ldt alpha_i, ALPHA_I + + ADD2 c16, t3, c16 + unop + MUL b2, a2, t3 +#ifndef TRMMKERNEL + LD a5, 0 * SIZE(C1) +#else + unop +#endif + + ADD4 c15, t4, c15 + MUL b2, a1, t4 + ADD1 c01, t1, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, c02 + unop + MUL b1, a4, t2 +#ifndef TRMMKERNEL + LD b1, 1 * SIZE(C1) +#else + unop +#endif + + ADD2 c06, t3, c06 + MUL b2, a4, t3 + ADD4 c05, t4, c05 + MUL b4, a1, t4 + + ADD1 c03, t1, c03 + unop + MUL b3, a1, t1 +#ifndef TRMMKERNEL + LD a1, 2 * SIZE(C1) +#else + unop +#endif + + ADD3 c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 +#ifndef TRMMKERNEL + LD a2, 3 * SIZE(C1) +#else + unop +#endif + + ADD4 c13, t4, c13 + unop + MUL b2, a3, t4 +#ifndef TRMMKERNEL + LD b2, 0 * SIZE(C2) +#else + unop +#endif + + ADD1 c09, t1, c09 + lda I, -1(I) + MUL b3, a3, t1 + unop + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 +#ifndef TRMMKERNEL + LD b3, 1 * SIZE(C2) +#else + unop +#endif + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 +#ifndef TRMMKERNEL + LD a4, 2 * SIZE(C2) +#else + unop +#endif + + ADD4 c07, t4, c07 + unop + MUL b4, a3, t4 +#ifndef TRMMKERNEL + LD a3, 3 * SIZE(C2) +#else + unop +#endif + + ADD1 c11, t1, c11 + ADD3 c12, t2, c12 + ADD2 c16, t3, c16 + ADD4 c15, t4, c15 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c03, c08, c03 + ADD c04, c07, c04 + + ADD c09, c14, c09 + MUL alpha_r, c01, t1 + ADD c10, c13, c10 + MUL alpha_r, c02, t2 + + ADD c11, c16, c11 + MUL alpha_r, c03, t3 + ADD c12, c15, c12 + MUL alpha_r, c04, t4 + +#ifndef TRMMKERNEL + ADD a5, t1, a5 + MUL alpha_i, c02, t1 + ADD b1, t2, b1 + MUL alpha_i, c01, t2 + + ADD a1, t3, a1 + MUL alpha_i, c04, t3 + ADD a2, t4, a2 + MUL alpha_i, c03, t4 +#else + ADD $f31, t1, a5 + MUL alpha_i, c02, t1 + ADD $f31, t2, b1 + MUL alpha_i, c01, t2 + + ADD $f31, t3, a1 + MUL alpha_i, c04, t3 + ADD $f31, t4, a2 + MUL alpha_i, c03, t4 +#endif + + SUB a5, t1, a5 + MUL alpha_r, c09, t1 + ADD b1, t2, b1 + MUL alpha_r, c10, t2 + + SUB a1, t3, a1 + MUL alpha_r, c11, t3 + ADD a2, t4, a2 + MUL alpha_r, c12, t4 + +#ifndef TRMMKERNEL + ADD b2, t1, b2 + MUL alpha_i, c10, t1 + ADD b3, t2, b3 + MUL alpha_i, c09, t2 + + ADD a4, t3, a4 + MUL alpha_i, c12, t3 + ADD a3, t4, a3 + MUL alpha_i, c11, t4 +#else + ADD $f31, t1, b2 + MUL alpha_i, c10, t1 + ADD $f31, t2, b3 + MUL alpha_i, c09, t2 + + ADD $f31, t3, a4 + MUL alpha_i, c12, t3 + ADD $f31, t4, a3 + MUL alpha_i, c11, t4 +#endif + + SUB b2, t1, b2 + ST a5, 0 * SIZE(C1) + fclr t1 + unop + + ADD b3, t2, b3 + ST b1, 1 * SIZE(C1) + fclr t2 + unop + + SUB a4, t3, a4 + ST a1, 2 * SIZE(C1) + fclr t3 + unop + + ADD a3, t4, a3 + ST a2, 3 * SIZE(C1) + fclr t4 + unop + + ST b2, 0 * SIZE(C2) + fclr c01 + ST b3, 1 * SIZE(C2) + fclr c05 + + ST a4, 2 * SIZE(C2) + lda C1, 4 * SIZE(C1) + ST a3, 3 * SIZE(C2) + lda C2, 4 * SIZE(C2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subq K, KK, TMP1 +#ifdef LEFT + subq TMP1, 2, TMP1 +#else + subq TMP1, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP1 + addq AO, TMP1, AO + addq BO, TMP1, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq KK, 2, KK +#endif + bgt I, $L11 + .align 4 + +$L20: + and M, 1, I + ble I, $L29 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addq KK, 1, TMP1 +#else + addq KK, 2, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + lda AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(B) + lda BO, 4 * SIZE(B) + +#ifndef TRMMKERNEL + lda L, -2(K) +#else + lda L, -2(TMP1) +#endif + ble L, $L25 +#else + sll KK, ZBASE_SHIFT + 0, TMP1 + addq AO, TMP1, AO + sll KK, ZBASE_SHIFT + 1, TMP1 + addq B, TMP1, BO + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + lda AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(BO) + lda BO, 4 * SIZE(BO) + + lda L, -2(TMP1) + ble L, $L25 +#endif + .align 5 + +$L22: + ADD1 c09, t1, c09 + unop + MUL a1, b1, t1 + unop + + ADD3 c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a1, b2, t3 + lda BO, 8 * SIZE(BO) + + ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b3, t1 + unop + + ADD3 c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + + ADD1 c09, t1, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a3, b2, t3 + lda AO, 4 * SIZE(AO) + + ADD2 c14, t4, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD1 c01, t1, c01 + lda L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD3 c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD1 c09, t1, c09 + ldt alpha_r, ALPHA_R + MUL a1, b1, t1 +#ifndef TRMMKERNEL + blbs K, $L28 +#else + blbs TMP1, $L28 +#endif + .align 4 + + ADD3 c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a1, b2, t3 + unop + + ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b3, t1 + lda AO, 2 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c09, t1, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L28: + ADD3 c10, t2, c10 + unop + MUL a2, b1, t2 + ldt alpha_i, ALPHA_I + + ADD4 c13, t3, c13 + unop + MUL a1, b2, t3 +#ifndef TRMMKERNEL + LD c03, 0 * SIZE(C1) +#else + unop +#endif + + ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 +#ifndef TRMMKERNEL + LD c04, 1 * SIZE(C1) +#else + unop +#endif + + ADD1 c01, t1, c01 + unop + MUL a1, b3, t1 +#ifndef TRMMKERNEL + LD c11, 0 * SIZE(C2) +#else + unop +#endif + + ADD3 c02, t2, c02 + unop + MUL a2, b3, t2 +#ifndef TRMMKERNEL + LD c12, 1 * SIZE(C2) +#else + unop +#endif + + ADD4 c05, t3, c05 + MUL a1, b4, t3 + ADD2 c06, t4, c06 + MUL a2, b4, t4 + + ADD1 c09, t1, c09 + ADD3 c10, t2, c10 + ADD4 c13, t3, c13 + ADD2 c14, t4, c14 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c09, c14, c09 + ADD c10, c13, c10 + + MUL alpha_r, c01, t1 + MUL alpha_r, c02, t2 + MUL alpha_r, c09, t3 + MUL alpha_r, c10, t4 + +#ifndef TRMMKERNEL + ADD c03, t1, c03 + MUL alpha_i, c02, t1 + ADD c04, t2, c04 + MUL alpha_i, c01, t2 + + ADD c11, t3, c11 + MUL alpha_i, c10, t3 + ADD c12, t4, c12 + MUL alpha_i, c09, t4 +#else + ADD $f31, t1, c03 + MUL alpha_i, c02, t1 + ADD $f31, t2, c04 + MUL alpha_i, c01, t2 + + ADD $f31, t3, c11 + MUL alpha_i, c10, t3 + ADD $f31, t4, c12 + MUL alpha_i, c09, t4 +#endif + + SUB c03, t1, c03 + ADD c04, t2, c04 + SUB c11, t3, c11 + ADD c12, t4, c12 + + ST c03, 0 * SIZE(C1) + ST c04, 1 * SIZE(C1) + ST c11, 0 * SIZE(C2) + ST c12, 1 * SIZE(C2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subq K, KK, TMP1 +#ifdef LEFT + subq TMP1, 1, TMP1 +#else + subq TMP1, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq KK, 1, KK +#endif + .align 4 + +$L29: + mov BO, B + lda J, -1(J) +#if defined(TRMMKERNEL) && !defined(LEFT) + addq KK, 2, KK +#else + unop +#endif + bgt J, $L01 + .align 4 + +$L30: + and N, 1, J + ble J, $L999 + + mov C, C1 + mov A, AO + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + sra M, 1, I + ble I, $L50 + .align 4 + +$L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addq KK, 2, TMP1 +#else + addq KK, 1, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + lda BO, 2 * SIZE(B) + fclr c03 + lda AO, 4 * SIZE(AO) + fclr c07 + +#ifndef TRMMKERNEL + lda L, -2(K) +#else + lda L, -2(TMP1) +#endif + fclr c04 + fclr c08 + ble L, $L45 +#else + sll KK, ZBASE_SHIFT + 1, TMP1 + addq AO, TMP1, AO + sll KK, ZBASE_SHIFT + 0, TMP1 + addq B, TMP1, BO + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + lda BO, 2 * SIZE(BO) + fclr c03 + lda AO, 4 * SIZE(AO) + fclr c07 + + lda L, -2(TMP1) + fclr c04 + fclr c08 + ble L, $L45 +#endif + .align 5 + +$L42: + ADD4 c05, t1, c05 + unop + MUL a1, b1, t1 + unop + + ADD2 c06, t2, c06 + lda L, -2(L) + MUL a2, b1, t2 + unop + + ADD4 c07, t3, c07 + unop + MUL a3, b1, t3 + unop + + ADD2 c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, c02 + lda BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + + ADD4 c05, t1, c05 + unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD2 c06, t2, c06 + unop + MUL a2, b3, t2 + unop + + ADD4 c07, t3, c07 + unop + MUL a3, b3, t3 + lda AO, 8 * SIZE(AO) + + ADD2 c08, t4, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD1 c03, t3, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD3 c04, t4, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L42 + .align 4 + +$L45: + ADD4 c05, t1, c05 + ldt alpha_r, ALPHA_R + MUL b1, a1, t1 +#ifndef TRMMKERNEL + blbs K, $L48 +#else + blbs TMP1, $L48 +#endif + .align 4 + + ADD2 c06, t2, c06 + MUL a2, b1, t2 + ADD4 c07, t3, c07 + MUL a3, b1, t3 + + ADD2 c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + lda AO, 4 * SIZE(AO) + + ADD4 c05, t1, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 2 * SIZE(BO) + .align 4 + +$L48: + ADD2 c06, t2, c06 + unop + MUL a2, b1, t2 + ldt alpha_i, ALPHA_I + + ADD4 c07, t3, c07 + lda I, -1(I) + MUL a3, b1, t3 +#ifndef TRMMKERNEL + LD c09, 0 * SIZE(C1) +#else + unop +#endif + + ADD2 c08, t4, c08 + unop + MUL a4, b1, t4 +#ifndef TRMMKERNEL + LD c10, 1 * SIZE(C1) +#else + unop +#endif + + ADD1 c01, t1, c01 + unop + MUL a1, b2, t1 +#ifndef TRMMKERNEL + LD c11, 2 * SIZE(C1) +#else + unop +#endif + + ADD3 c02, t2, c02 + unop + MUL a2, b2, t2 +#ifndef TRMMKERNEL + LD c12, 3 * SIZE(C1) +#else + unop +#endif + + ADD1 c03, t3, c03 + MUL a3, b2, t3 + ADD3 c04, t4, c04 + MUL a4, b2, t4 + + ADD4 c05, t1, c05 + ADD2 c06, t2, c06 + ADD4 c07, t3, c07 + ADD2 c08, t4, c08 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c03, c08, c03 + ADD c04, c07, c04 + + MUL alpha_r, c01, t1 + MUL alpha_r, c02, t2 + MUL alpha_r, c03, t3 + MUL alpha_r, c04, t4 + +#ifndef TRMMKERNEL + ADD c09, t1, c09 + MUL alpha_i, c02, t1 + ADD c10, t2, c10 + MUL alpha_i, c01, t2 + + ADD c11, t3, c11 + MUL alpha_i, c04, t3 + ADD c12, t4, c12 + MUL alpha_i, c03, t4 +#else + ADD $f31, t1, c09 + MUL alpha_i, c02, t1 + ADD $f31, t2, c10 + MUL alpha_i, c01, t2 + + ADD $f31, t3, c11 + MUL alpha_i, c04, t3 + ADD $f31, t4, c12 + MUL alpha_i, c03, t4 +#endif + + SUB c09, t1, c09 + ADD c10, t2, c10 + SUB c11, t3, c11 + ADD c12, t4, c12 + + ST c09, 0 * SIZE(C1) + ST c10, 1 * SIZE(C1) + ST c11, 2 * SIZE(C1) + ST c12, 3 * SIZE(C1) + + lda C1, 4 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subq K, KK, TMP1 +#ifdef LEFT + subq TMP1, 2, TMP1 +#else + subq TMP1, 1, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addq BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq KK, 2, KK +#endif + + bgt I, $L41 + .align 4 + +$L50: + and M, 1, I + ble I, $L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addq KK, 1, TMP1 +#else + addq KK, 1, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + lda AO, 2 * SIZE(AO) + lda BO, 2 * SIZE(B) + +#ifndef TRMMKERNEL + lda L, -2(K) +#else + lda L, -2(TMP1) +#endif + ble L, $L55 +#else + sll KK, ZBASE_SHIFT + 0, TMP1 + addq AO, TMP1, AO + addq B, TMP1, BO + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + lda AO, 2 * SIZE(AO) + lda BO, 2 * SIZE(BO) + + lda L, -2(TMP1) + ble L, $L55 +#endif + .align 5 + +$L52: + ADD1 c01, t1, c01 + unop + MUL a1, b1, t1 + unop + + ADD3 c02, t2, c02 + lda AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD b1, 2 * SIZE(BO) + + ADD4 c05, t3, c05 + lda L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c01, t1, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + lda BO, 4 * SIZE(BO) + + ADD3 c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L52 + .align 4 + +$L55: + ADD1 c01, t1, c01 + ldt alpha_r, ALPHA_R + MUL a1, b1, t1 +#ifndef TRMMKERNEL + blbs K, $L58 +#else + blbs TMP1, $L58 +#endif + .align 4 + + ADD3 c02, t2, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c05, t3, c05 + lda BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD1 c01, t1, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + lda AO, 2 * SIZE(AO) + .align 4 + +$L58: + ADD3 c02, t2, c02 + unop + MUL a2, b1, t2 + ldt alpha_i, ALPHA_I + + ADD4 c05, t3, c05 + unop + MUL a1, b2, t3 +#ifndef TRMMKERNEL + LD c03, 0 * SIZE(C1) +#else + unop +#endif + + ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 +#ifndef TRMMKERNEL + LD c04, 1 * SIZE(C1) +#else + unop +#endif + + ADD1 c01, t1, c01 + ADD3 c02, t2, c02 + ADD4 c05, t3, c05 + ADD2 c06, t4, c06 + + ADD c01, c06, c01 + ADD c02, c05, c02 + + MUL alpha_r, c01, t1 + MUL alpha_r, c02, t2 + MUL alpha_i, c02, t3 + MUL alpha_i, c01, t4 + +#ifndef TRMMKERNEL + ADD c03, t1, c03 + ADD c04, t2, c04 +#else + ADD $f31, t1, c03 + ADD $f31, t2, c04 +#endif + + SUB c03, t3, c03 + ADD c04, t4, c04 + + ST c03, 0 * SIZE(C1) + ST c04, 1 * SIZE(C1) + .align 4 + +$L999: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + ldt $f9, 56($sp) + clr $0 + lda $sp, STACKSIZE($sp) + ret + .ident VERSION + .end CNAME diff --git a/kernel/alpha/zgemv_n.S b/kernel/alpha/zgemv_n.S new file mode 100644 index 0000000..fd602a3 --- /dev/null +++ b/kernel/alpha/zgemv_n.S @@ -0,0 +1,1027 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define STACKSIZE 64 +#define PREFETCHSIZE 32 + +#define M $16 +#define N $17 +#define A $21 +#define LDA $18 + +#define X $19 +#define INCX $20 +#define Y $22 +#define INCY $23 + +#define BUFFER $24 + +#define I $25 +#define J $27 + +#define Y1 $4 +#define A1 $5 +#define A2 $6 + +#define alpha_r $f19 +#define alpha_i $f20 + +#define alpha1 $f0 +#define alpha2 $f1 +#define alpha3 $f10 +#define alpha4 $f11 + +#define y0 $f12 +#define y1 $f13 +#define y2 $f14 +#define y3 $f15 + +#define y4 $f16 +#define y5 $f17 +#define y6 $f18 +#define y7 $f21 + +#define a0 $f22 +#define a1 $f23 +#define a2 $f24 +#define a3 $f25 +#define a4 $f26 +#define a5 $f27 +#define a6 $f28 +#define a7 $f29 + +#define t0 $f2 +#define t1 $f3 +#define t2 $f4 +#define t3 $f5 + +#if !defined(CONJ) && !defined(XCONJ) +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 SUB +#define ADD4 ADD +#elif defined(CONJ) && !defined(XCONJ) +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#elif !defined(CONJ) && defined(XCONJ) +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 ADD +#define ADD4 SUB +#else +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 SUB +#define ADD4 SUB +#endif + + PROLOGUE + + lda $sp, -STACKSIZE($sp) + ldq LDA, 0 + STACKSIZE($sp) + ldq X, 8 + STACKSIZE($sp) + ldq INCX, 16 + STACKSIZE($sp) + ldq Y, 24 + STACKSIZE($sp) + ldq INCY, 32 + STACKSIZE($sp) + ldq BUFFER, 40 + STACKSIZE($sp) + + stt $f2, 0($sp) + stt $f3, 8($sp) + stt $f4, 16($sp) + stt $f5, 24($sp) + stt $f6, 32($sp) + stt $f7, 40($sp) + stt $f8, 48($sp) + stt $f9, 56($sp) + + PROFCODE + + cmple M, 0, $0 + sll INCX, ZBASE_SHIFT, INCX + cmple N, 0, $1 + sll INCY, ZBASE_SHIFT, INCY + + or $0, $1, $0 + bne $0, $L999 + + cmpeq INCY, 2 * SIZE, $0 + sll LDA, ZBASE_SHIFT,LDA + bne $0, $L10 + + mov BUFFER, Y1 + + mov Y, BUFFER + mov Y1, Y + + sra M, 2, I + ble I, $L05 + .align 4 + +$L02: + ST $f31, 0 * SIZE(Y1) + ST $f31, 1 * SIZE(Y1) + ST $f31, 2 * SIZE(Y1) + ST $f31, 3 * SIZE(Y1) + ST $f31, 4 * SIZE(Y1) + ST $f31, 5 * SIZE(Y1) + ST $f31, 6 * SIZE(Y1) + ST $f31, 7 * SIZE(Y1) + + lda Y1, 8 * SIZE(Y1) + lda I, -1(I) + bgt I, $L02 + .align 4 + +$L05: + and M, 3, I + ble I, $L10 + .align 4 + +$L06: + ST $f31, 0 * SIZE(Y1) + ST $f31, 1 * SIZE(Y1) + addq Y1, 2 * SIZE, Y1 + + lda I, -1(I) + bgt I, $L06 + .align 4 + +$L10: + sra N, 1, J + ble J, $L20 + .align 4 + +$L11: + LD alpha1, 0 * SIZE(X) + LD alpha2, 1 * SIZE(X) + addq X, INCX, X + LD alpha3, 0 * SIZE(X) + LD alpha4, 1 * SIZE(X) + addq X, INCX, X + + MUL alpha_r, alpha1, y0 + MUL alpha_r, alpha2, y1 + MUL alpha_r, alpha3, y2 + MUL alpha_r, alpha4, y3 + + MUL alpha_i, alpha2, t0 + mov A, A1 + MUL alpha_i, alpha1, t1 + addq A, LDA, A2 + MUL alpha_i, alpha4, t2 + addq A2, LDA, A + MUL alpha_i, alpha3, t3 + mov Y, Y1 + +#ifndef XCONJ + SUB y0, t0, alpha1 + ADD y1, t1, alpha2 + SUB y2, t2, alpha3 + ADD y3, t3, alpha4 +#else + ADD y0, t0, alpha1 + SUB y1, t1, alpha2 + ADD y2, t2, alpha3 + SUB y3, t3, alpha4 +#endif + + ldl $31, 4 * SIZE(X) + + sra M, 2, I + ble I, $L15 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + LD a4, 0 * SIZE(A2) + LD a5, 1 * SIZE(A2) + LD a6, 2 * SIZE(A2) + LD a7, 3 * SIZE(A2) + + MUL alpha1, a0, t0 + LD y0, 0 * SIZE(Y1) + MUL alpha1, a1, t1 + LD y1, 1 * SIZE(Y1) + + MUL alpha1, a2, t2 + LD y2, 2 * SIZE(Y1) + MUL alpha1, a3, t3 + LD y3, 3 * SIZE(Y1) + + ADD1 y0, t0, y0 + unop + MUL alpha3, a4, t0 + LD y4, 4 * SIZE(Y1) + + ADD2 y1, t1, y1 + unop + MUL alpha3, a5, t1 + LD y5, 5 * SIZE(Y1) + + ADD1 y2, t2, y2 + unop + MUL alpha3, a6, t2 + LD y6, 6 * SIZE(Y1) + + ADD2 y3, t3, y3 + unop + MUL alpha3, a7, t3 + LD y7, 7 * SIZE(Y1) + + ADD1 y0, t0, y0 + unop + MUL alpha2, a1, t0 + LD a1, 5 * SIZE(A1) + + ADD2 y1, t1, y1 + unop + MUL alpha2, a0, t1 + LD a0, 4 * SIZE(A1) + + ADD1 y2, t2, y2 + unop + MUL alpha2, a3, t2 + LD a3, 7 * SIZE(A1) + + ADD2 y3, t3, y3 + unop + MUL alpha2, a2, t3 + LD a2, 6 * SIZE(A1) + + ADD3 y0, t0, y0 + unop + MUL alpha4, a5, t0 + LD a5, 5 * SIZE(A2) + + ADD4 y1, t1, y1 + unop + MUL alpha4, a4, t1 + LD a4, 4 * SIZE(A2) + + ADD3 y2, t2, y2 + unop + MUL alpha4, a7, t2 + LD a7, 7 * SIZE(A2) + + ADD4 y3, t3, y3 + unop + MUL alpha4, a6, t3 + LD a6, 6 * SIZE(A2) + + ADD3 y0, t0, y0 + MUL alpha1, a0, t0 + ADD4 y1, t1, y1 + MUL alpha1, a1, t1 + + ADD3 y2, t2, y2 + unop + MUL alpha1, a2, t2 + unop + + ADD4 y3, t3, y3 + lda I, -1(I) + MUL alpha1, a3, t3 + ble I, $L13 + .align 4 + +$L12: + ADD1 y4, t0, y4 + ST y0, 0 * SIZE(Y1) + MUL alpha3, a4, t0 + ldl $31, (PREFETCHSIZE + 0) * SIZE(A1) + + ADD2 y5, t1, y5 + ST y1, 1 * SIZE(Y1) + MUL alpha3, a5, t1 + lda I, -1(I) + + ADD1 y6, t2, y6 + ST y2, 2 * SIZE(Y1) + MUL alpha3, a6, t2 + unop + + ADD2 y7, t3, y7 + ST y3, 3 * SIZE(Y1) + MUL alpha3, a7, t3 + unop + + ADD1 y4, t0, y4 + unop + MUL alpha2, a1, t0 + LD a1, 9 * SIZE(A1) + + ADD2 y5, t1, y5 + unop + MUL alpha2, a0, t1 + LD a0, 8 * SIZE(A1) + + ADD1 y6, t2, y6 + unop + MUL alpha2, a3, t2 + LD a3, 11 * SIZE(A1) + + ADD2 y7, t3, y7 + unop + MUL alpha2, a2, t3 + LD a2, 10 * SIZE(A1) + + ADD3 y4, t0, y4 + lds $f31, (PREFETCHSIZE + 0) * SIZE(Y1) + MUL alpha4, a5, t0 + LD a5, 9 * SIZE(A2) + + ADD4 y5, t1, y5 + unop + MUL alpha4, a4, t1 + LD a4, 8 * SIZE(A2) + + ADD3 y6, t2, y6 + unop + MUL alpha4, a7, t2 + LD a7, 11 * SIZE(A2) + + ADD4 y7, t3, y7 + unop + MUL alpha4, a6, t3 + LD a6, 10 * SIZE(A2) + + ADD3 y4, t0, y4 + unop + MUL alpha1, a0, t0 + LD y0, 8 * SIZE(Y1) + + ADD4 y5, t1, y5 + unop + MUL alpha1, a1, t1 + LD y1, 9 * SIZE(Y1) + + ADD3 y6, t2, y6 + unop + MUL alpha1, a2, t2 + LD y2, 10 * SIZE(Y1) + + ADD4 y7, t3, y7 + unop + MUL alpha1, a3, t3 + LD y3, 11 * SIZE(Y1) + + ADD1 y0, t0, y0 + ST y4, 4 * SIZE(Y1) + MUL alpha3, a4, t0 + ldl $31, (PREFETCHSIZE + 0) * SIZE(A2) + + ADD2 y1, t1, y1 + ST y5, 5 * SIZE(Y1) + MUL alpha3, a5, t1 + unop + + ADD1 y2, t2, y2 + ST y6, 6 * SIZE(Y1) + MUL alpha3, a6, t2 + unop + + ADD2 y3, t3, y3 + ST y7, 7 * SIZE(Y1) + MUL alpha3, a7, t3 + lda Y1, 8 * SIZE(Y1) + + ADD1 y0, t0, y0 + unop + MUL alpha2, a1, t0 + LD a1, 13 * SIZE(A1) + + ADD2 y1, t1, y1 + unop + MUL alpha2, a0, t1 + LD a0, 12 * SIZE(A1) + + ADD1 y2, t2, y2 + unop + MUL alpha2, a3, t2 + LD a3, 15 * SIZE(A1) + + ADD2 y3, t3, y3 + unop + MUL alpha2, a2, t3 + LD a2, 14 * SIZE(A1) + + ADD3 y0, t0, y0 + unop + MUL alpha4, a5, t0 + LD a5, 13 * SIZE(A2) + + ADD4 y1, t1, y1 + unop + MUL alpha4, a4, t1 + LD a4, 12 * SIZE(A2) + + ADD3 y2, t2, y2 + unop + MUL alpha4, a7, t2 + LD a7, 15 * SIZE(A2) + + ADD4 y3, t3, y3 + unop + MUL alpha4, a6, t3 + LD a6, 14 * SIZE(A2) + + ADD3 y0, t0, y0 + unop + MUL alpha1, a0, t0 + LD y4, 4 * SIZE(Y1) + + ADD4 y1, t1, y1 + lda A2, 8 * SIZE(A2) + MUL alpha1, a1, t1 + LD y5, 5 * SIZE(Y1) + + ADD3 y2, t2, y2 + lda A1, 8 * SIZE(A1) + MUL alpha1, a2, t2 + LD y6, 6 * SIZE(Y1) + + ADD4 y3, t3, y3 + MUL alpha1, a3, t3 + LD y7, 7 * SIZE(Y1) + bgt I, $L12 + .align 4 + +$L13: + ADD1 y4, t0, y4 + ST y0, 0 * SIZE(Y1) + MUL alpha3, a4, t0 + unop + + ADD2 y5, t1, y5 + ST y1, 1 * SIZE(Y1) + MUL alpha3, a5, t1 + unop + + ADD1 y6, t2, y6 + ST y2, 2 * SIZE(Y1) + MUL alpha3, a6, t2 + unop + + ADD2 y7, t3, y7 + ST y3, 3 * SIZE(Y1) + MUL alpha3, a7, t3 + unop + + ADD1 y4, t0, y4 + MUL alpha2, a1, t0 + ADD2 y5, t1, y5 + MUL alpha2, a0, t1 + + ADD1 y6, t2, y6 + MUL alpha2, a3, t2 + ADD2 y7, t3, y7 + MUL alpha2, a2, t3 + + ADD3 y4, t0, y4 + MUL alpha4, a5, t0 + ADD4 y5, t1, y5 + MUL alpha4, a4, t1 + + ADD3 y6, t2, y6 + MUL alpha4, a7, t2 + ADD4 y7, t3, y7 + MUL alpha4, a6, t3 + + ADD3 y4, t0, y4 + ADD4 y5, t1, y5 + ADD3 y6, t2, y6 + ADD4 y7, t3, y7 + + ST y4, 4 * SIZE(Y1) + lda A1, 8 * SIZE(A1) + ST y5, 5 * SIZE(Y1) + lda A2, 8 * SIZE(A2) + + ST y6, 6 * SIZE(Y1) + unop + ST y7, 7 * SIZE(Y1) + lda Y1, 8 * SIZE(Y1) + .align 4 + +$L15: + and M, 2, I + ble I, $L17 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + LD a4, 0 * SIZE(A2) + LD a5, 1 * SIZE(A2) + LD a6, 2 * SIZE(A2) + LD a7, 3 * SIZE(A2) + + MUL alpha1, a0, t0 + LD y0, 0 * SIZE(Y1) + MUL alpha1, a1, t1 + LD y1, 1 * SIZE(Y1) + MUL alpha1, a2, t2 + LD y2, 2 * SIZE(Y1) + MUL alpha1, a3, t3 + LD y3, 3 * SIZE(Y1) + + ADD1 y0, t0, y0 + MUL alpha3, a4, t0 + ADD2 y1, t1, y1 + MUL alpha3, a5, t1 + ADD1 y2, t2, y2 + MUL alpha3, a6, t2 + ADD2 y3, t3, y3 + MUL alpha3, a7, t3 + + ADD1 y0, t0, y0 + MUL alpha2, a1, t0 + ADD2 y1, t1, y1 + MUL alpha2, a0, t1 + + ADD1 y2, t2, y2 + MUL alpha2, a3, t2 + ADD2 y3, t3, y3 + MUL alpha2, a2, t3 + + ADD3 y0, t0, y0 + MUL alpha4, a5, t0 + ADD4 y1, t1, y1 + MUL alpha4, a4, t1 + + ADD3 y2, t2, y2 + MUL alpha4, a7, t2 + ADD4 y3, t3, y3 + MUL alpha4, a6, t3 + + ADD3 y0, t0, y0 + ADD4 y1, t1, y1 + ADD3 y2, t2, y2 + ADD4 y3, t3, y3 + + ST y0, 0 * SIZE(Y1) + lda A1, 4 * SIZE(A1) + ST y1, 1 * SIZE(Y1) + lda A2, 4 * SIZE(A2) + + ST y2, 2 * SIZE(Y1) + unop + ST y3, 3 * SIZE(Y1) + lda Y1, 4 * SIZE(Y1) + .align 4 + +$L17: + blbc M, $L18 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 0 * SIZE(A2) + LD a3, 1 * SIZE(A2) + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + + MUL alpha1, a0, t0 + MUL alpha1, a1, t1 + + ADD1 y0, t0, y0 + MUL alpha3, a2, t0 + ADD2 y1, t1, y1 + MUL alpha3, a3, t1 + + ADD1 y0, t0, y0 + MUL alpha2, a1, t0 + ADD2 y1, t1, y1 + MUL alpha2, a0, t1 + + ADD3 y0, t0, y0 + MUL alpha4, a3, t0 + ADD4 y1, t1, y1 + MUL alpha4, a2, t1 + + ADD3 y0, t0, y0 + ADD4 y1, t1, y1 + + ST y0, 0 * SIZE(Y1) + ST y1, 1 * SIZE(Y1) + .align 4 + +$L18: + lda J, -1(J) + bgt J, $L11 + .align 4 + +$L20: + blbc N, $L990 + + LD alpha1, 0 * SIZE(X) + LD alpha2, 1 * SIZE(X) + + MUL alpha_r, alpha1, y0 + MUL alpha_r, alpha2, y1 + + MUL alpha_i, alpha2, t0 + mov A, A1 + MUL alpha_i, alpha1, t1 + mov Y, Y1 + +#ifndef XCONJ + SUB y0, t0, alpha1 + ADD y1, t1, alpha2 +#else + ADD y0, t0, alpha1 + SUB y1, t1, alpha2 +#endif + + sra M, 2, I + ble I, $L25 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + LD y2, 2 * SIZE(Y1) + LD y3, 3 * SIZE(Y1) + + MUL alpha1, a0, t0 + LD a4, 4 * SIZE(A1) + MUL alpha1, a1, t1 + LD a5, 5 * SIZE(A1) + MUL alpha1, a2, t2 + LD a6, 6 * SIZE(A1) + MUL alpha1, a3, t3 + LD a7, 7 * SIZE(A1) + + ADD1 y0, t0, y0 + unop + MUL alpha2, a1, t0 + LD a1, 9 * SIZE(A1) + + ADD2 y1, t1, y1 + unop + MUL alpha2, a0, t1 + LD a0, 8 * SIZE(A1) + + ADD1 y2, t2, y2 + unop + MUL alpha2, a3, t2 + LD a3, 11 * SIZE(A1) + + ADD2 y3, t3, y3 + unop + MUL alpha2, a2, t3 + LD a2, 10 * SIZE(A1) + + ADD3 y0, t0, y0 + unop + LD y4, 4 * SIZE(Y1) + MUL alpha1, a4, t0 + + ADD4 y1, t1, y1 + unop + LD y5, 5 * SIZE(Y1) + MUL alpha1, a5, t1 + + ADD3 y2, t2, y2 + LD y6, 6 * SIZE(Y1) + MUL alpha1, a6, t2 + lda I, -1(I) + + ADD4 y3, t3, y3 + LD y7, 7 * SIZE(Y1) + MUL alpha1, a7, t3 + ble I, $L23 + .align 4 + +$L22: + ADD1 y4, t0, y4 + ST y0, 0 * SIZE(Y1) + MUL alpha2, a5, t0 + LD a5, 13 * SIZE(A1) + + ADD2 y5, t1, y5 + ST y1, 1 * SIZE(Y1) + MUL alpha2, a4, t1 + LD a4, 12 * SIZE(A1) + + ADD1 y6, t2, y6 + ST y2, 2 * SIZE(Y1) + MUL alpha2, a7, t2 + LD a7, 15 * SIZE(A1) + + ADD2 y7, t3, y7 + ST y3, 3 * SIZE(Y1) + MUL alpha2, a6, t3 + LD a6, 14 * SIZE(A1) + + ADD3 y4, t0, y4 + LD y0, 8 * SIZE(Y1) + MUL alpha1, a0, t0 + ldl $31, (PREFETCHSIZE + 0) * SIZE(A1) + + ADD4 y5, t1, y5 + LD y1, 9 * SIZE(Y1) + MUL alpha1, a1, t1 + lda I, -1(I) + + ADD3 y6, t2, y6 + LD y2, 10 * SIZE(Y1) + MUL alpha1, a2, t2 + unop + + ADD4 y7, t3, y7 + LD y3, 11 * SIZE(Y1) + MUL alpha1, a3, t3 + unop + + ADD1 y0, t0, y0 + ST y4, 4 * SIZE(Y1) + MUL alpha2, a1, t0 + LD a1, 17 * SIZE(A1) + + ADD2 y1, t1, y1 + ST y5, 5 * SIZE(Y1) + MUL alpha2, a0, t1 + LD a0, 16 * SIZE(A1) + + ADD1 y2, t2, y2 + ST y6, 6 * SIZE(Y1) + MUL alpha2, a3, t2 + LD a3, 19 * SIZE(A1) + + ADD2 y3, t3, y3 + ST y7, 7 * SIZE(Y1) + MUL alpha2, a2, t3 + LD a2, 18 * SIZE(A1) + + ADD3 y0, t0, y0 + LD y4, 12 * SIZE(Y1) + MUL alpha1, a4, t0 + ldl $31, (PREFETCHSIZE + 0) * SIZE(Y1) + + ADD4 y1, t1, y1 + LD y5, 13 * SIZE(Y1) + MUL alpha1, a5, t1 + lda A1, 8 * SIZE(A1) + + ADD3 y2, t2, y2 + LD y6, 14 * SIZE(Y1) + MUL alpha1, a6, t2 + lda Y1, 8 * SIZE(Y1) + + ADD4 y3, t3, y3 + LD y7, 7 * SIZE(Y1) + MUL alpha1, a7, t3 + bgt I, $L22 + .align 4 + +$L23: + ADD1 y4, t0, y4 + ST y0, 0 * SIZE(Y1) + MUL alpha2, a5, t0 + unop + + ADD2 y5, t1, y5 + ST y1, 1 * SIZE(Y1) + MUL alpha2, a4, t1 + unop + + ADD1 y6, t2, y6 + ST y2, 2 * SIZE(Y1) + MUL alpha2, a7, t2 + unop + + ADD2 y7, t3, y7 + ST y3, 3 * SIZE(Y1) + MUL alpha2, a6, t3 + unop + + ADD3 y4, t0, y4 + ADD4 y5, t1, y5 + ADD3 y6, t2, y6 + ADD4 y7, t3, y7 + + ST y4, 4 * SIZE(Y1) + unop + ST y5, 5 * SIZE(Y1) + unop + + ST y6, 6 * SIZE(Y1) + lda A1, 8 * SIZE(A1) + ST y7, 7 * SIZE(Y1) + lda Y1, 8 * SIZE(Y1) + .align 4 + +$L25: + and M, 2, I + ble I, $L27 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + MUL alpha1, a0, t0 + LD y0, 0 * SIZE(Y1) + MUL alpha1, a1, t1 + LD y1, 1 * SIZE(Y1) + MUL alpha1, a2, t2 + LD y2, 2 * SIZE(Y1) + MUL alpha1, a3, t3 + LD y3, 3 * SIZE(Y1) + + ADD1 y0, t0, y0 + MUL alpha2, a1, t0 + ADD2 y1, t1, y1 + MUL alpha2, a0, t1 + ADD1 y2, t2, y2 + MUL alpha2, a3, t2 + ADD2 y3, t3, y3 + MUL alpha2, a2, t3 + + ADD3 y0, t0, y0 + ADD4 y1, t1, y1 + ADD3 y2, t2, y2 + ADD4 y3, t3, y3 + + ST y0, 0 * SIZE(Y1) + ST y1, 1 * SIZE(Y1) + + ST y2, 2 * SIZE(Y1) + lda A1, 4 * SIZE(A1) + ST y3, 3 * SIZE(Y1) + lda Y1, 4 * SIZE(Y1) + .align 4 + +$L27: + blbc M, $L990 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + + MUL alpha1, a0, t0 + LD y0, 0 * SIZE(Y1) + MUL alpha1, a1, t1 + LD y1, 1 * SIZE(Y1) + + ADD1 y0, t0, y0 + MUL alpha2, a1, t0 + ADD2 y1, t1, y1 + MUL alpha2, a0, t1 + + ADD3 y0, t0, y0 + ADD4 y1, t1, y1 + + ST y0, 0 * SIZE(Y1) + ST y1, 1 * SIZE(Y1) + .align 4 + +$L990: + cmpeq INCY, 2 * SIZE, $0 + bne $0, $L999 + + mov BUFFER, Y1 + + sra M, 2, I + ble I, $L995 + .align 4 + +$L992: + LD a0, 0 * SIZE(BUFFER) + LD a1, 1 * SIZE(BUFFER) + addq BUFFER, INCY, BUFFER + LD a2, 0 * SIZE(BUFFER) + LD a3, 1 * SIZE(BUFFER) + addq BUFFER, INCY, BUFFER + + LD y0, 0 * SIZE(Y) + LD y1, 1 * SIZE(Y) + LD y2, 2 * SIZE(Y) + LD y3, 3 * SIZE(Y) + + LD a4, 0 * SIZE(BUFFER) + LD a5, 1 * SIZE(BUFFER) + addq BUFFER, INCY, BUFFER + LD a6, 0 * SIZE(BUFFER) + LD a7, 1 * SIZE(BUFFER) + addq BUFFER, INCY, BUFFER + + LD y4, 4 * SIZE(Y) + LD y5, 5 * SIZE(Y) + LD y6, 6 * SIZE(Y) + LD y7, 7 * SIZE(Y) + + ADD a0, y0, a0 + ADD a1, y1, a1 + ADD a2, y2, a2 + ADD a3, y3, a3 + + ST a0, 0 * SIZE(Y1) + ADD a4, y4, a4 + ST a1, 1 * SIZE(Y1) + ADD a5, y5, a5 + addq Y1, INCY, Y1 + + ST a2, 0 * SIZE(Y1) + ADD a6, y6, a6 + ST a3, 1 * SIZE(Y1) + ADD a7, y7, a7 + addq Y1, INCY, Y1 + + ST a4, 0 * SIZE(Y1) + ST a5, 1 * SIZE(Y1) + addq Y1, INCY, Y1 + ST a6, 0 * SIZE(Y1) + ST a7, 1 * SIZE(Y1) + addq Y1, INCY, Y1 + + lda I, -1(I) + lda Y, 8 * SIZE(Y) + bgt I, $L992 + .align 4 + +$L995: + and M, 3, I + ble I, $L999 + .align 4 + +$L996: + LD a0, 0 * SIZE(BUFFER) + LD a1, 1 * SIZE(BUFFER) + addq BUFFER, INCY, BUFFER + + LD y0, 0 * SIZE(Y) + LD y1, 1 * SIZE(Y) + lda Y, 2 * SIZE(Y) + + ADD a0, y0, a0 + ADD a1, y1, a1 + + ST a0, 0 * SIZE(Y1) + ST a1, 1 * SIZE(Y1) + addq Y1, INCY, Y1 + + lda I, -1(I) + bgt I, $L996 + .align 4 + +$L999: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + ldt $f9, 56($sp) + + lda $sp, STACKSIZE($sp) + ret + EPILOGUE diff --git a/kernel/alpha/zgemv_t.S b/kernel/alpha/zgemv_t.S new file mode 100644 index 0000000..bac56eb --- /dev/null +++ b/kernel/alpha/zgemv_t.S @@ -0,0 +1,922 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define STACKSIZE 64 +#define PREFETCHSIZE 32 + +#define M $16 +#define N $17 +#define A $21 +#define LDA $18 + +#define X $19 +#define INCX $20 +#define Y $22 +#define INCY $23 + +#define BUFFER $24 + +#define I $25 +#define J $27 + +#define X1 $3 +#define Y1 $4 +#define A1 $5 +#define A2 $6 + +#define alpha_r $f19 +#define alpha_i $f20 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define t0 $f12 +#define t1 $f13 +#define t2 $f14 +#define t3 $f15 + +#define x0 $f16 +#define x1 $f17 +#define x2 $f18 +#define x3 $f21 + +#define a0 $f22 +#define a1 $f23 +#define a2 $f24 +#define a3 $f25 +#define a4 $f26 +#define a5 $f27 +#define a6 $f28 +#define a7 $f29 + +#define a8 $f2 +#define a9 $f3 +#define a10 $f4 +#define a11 $f5 +#define a12 $f6 +#define a13 $f7 +#define a14 $f8 +#define a15 $f9 + +#if !defined(CONJ) && !defined(XCONJ) +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 SUB +#define ADD4 ADD +#elif !defined(CONJ) && defined(XCONJ) +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 ADD +#define ADD4 SUB +#elif defined(CONJ) && !defined(XCONJ) +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#else +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 SUB +#define ADD4 SUB +#endif + + PROLOGUE + + lda $sp, -STACKSIZE($sp) + ldq LDA, 0 + STACKSIZE($sp) + ldq X, 8 + STACKSIZE($sp) + ldq INCX, 16 + STACKSIZE($sp) + ldq Y, 24 + STACKSIZE($sp) + ldq INCY, 32 + STACKSIZE($sp) + ldq BUFFER, 40 + STACKSIZE($sp) + + stt $f2, 0($sp) + stt $f3, 8($sp) + stt $f4, 16($sp) + stt $f5, 24($sp) + stt $f6, 32($sp) + stt $f7, 40($sp) + stt $f8, 48($sp) + stt $f9, 56($sp) + + PROFCODE + + cmple M, 0, $0 + sll INCX, ZBASE_SHIFT, INCX + cmple N, 0, $1 + sll INCY, ZBASE_SHIFT, INCY + + or $0, $1, $0 + bne $0, $L999 + + cmpeq INCX, 2 * SIZE, $0 + mov X, X1 + sll LDA, ZBASE_SHIFT,LDA + bne $0, $L10 + + sra M, 2, I + mov BUFFER, Y1 + mov BUFFER, X + ble I, $L05 + .align 4 + +$L02: + ldl $31, (PREFETCHSIZE + 0) * SIZE(X1) + lda I, -1(I) + + LD a0, 0 * SIZE(X1) + LD a1, 1 * SIZE(X1) + addq X1, INCX, X1 + LD a2, 0 * SIZE(X1) + LD a3, 1 * SIZE(X1) + addq X1, INCX, X1 + + ST a0, 0 * SIZE(Y1) + ST a1, 1 * SIZE(Y1) + ST a2, 2 * SIZE(Y1) + ST a3, 3 * SIZE(Y1) + + LD a4, 0 * SIZE(X1) + LD a5, 1 * SIZE(X1) + addq X1, INCX, X1 + LD a6, 0 * SIZE(X1) + LD a7, 1 * SIZE(X1) + addq X1, INCX, X1 + + ST a4, 4 * SIZE(Y1) + ST a5, 5 * SIZE(Y1) + ST a6, 6 * SIZE(Y1) + ST a7, 7 * SIZE(Y1) + + lda Y1, 8 * SIZE(Y1) + bgt I, $L02 + .align 4 + +$L05: + and M, 3, I + ble I, $L10 + .align 4 + +$L06: + LD a0, 0 * SIZE(X1) + LD a1, 1 * SIZE(X1) + addq X1, INCX, X1 + + ST a0, 0 * SIZE(Y1) + ST a1, 1 * SIZE(Y1) + lda Y1, 2 * SIZE(Y1) + + lda I, -1(I) + bgt I, $L06 + .align 4 + +$L10: + mov Y, Y1 + fclr t0 + unop + fclr t1 + + sra N, 1, J + fclr t2 + fclr t3 + ble J, $L20 + .align 4 + +$L11: + mov A, A1 + fclr s0 + addq A, LDA, A2 + fclr s1 + + addq A2, LDA, A + unop + mov X, X1 + lds $f31, 3 * SIZE(Y) + + sra M, 2, I + fclr s2 + fclr s3 + ble I, $L15 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 0 * SIZE(A2) + LD a3, 1 * SIZE(A2) + LD a4, 2 * SIZE(A1) + LD a5, 3 * SIZE(A1) + LD a6, 2 * SIZE(A2) + LD a7, 3 * SIZE(A2) + + LD a8, 4 * SIZE(A1) + LD a9, 5 * SIZE(A1) + LD a10, 4 * SIZE(A2) + LD a11, 5 * SIZE(A2) + LD a12, 6 * SIZE(A1) + LD a13, 7 * SIZE(A1) + LD a14, 6 * SIZE(A2) + LD a15, 7 * SIZE(A2) + + LD x0, 0 * SIZE(X1) + LD x1, 1 * SIZE(X1) + LD x2, 2 * SIZE(X1) + + lda I, -1(I) + ble I, $L13 + .align 4 + +$L12: + ADD3 s0, t0, s0 + unop + MUL x0, a0, t0 + LD x3, 3 * SIZE(X1) + + ADD4 s1, t1, s1 + ldl $31, (PREFETCHSIZE + 0) * SIZE(A1) + MUL x0, a1, t1 + unop + + ADD3 s2, t2, s2 + unop + MUL x0, a2, t2 + unop + + ADD4 s3, t3, s3 + unop + MUL x0, a3, t3 + LD x0, 4 * SIZE(X1) + + ADD1 s0, t0, s0 + unop + MUL x1, a1, t0 + LD a1, 9 * SIZE(A1) + + ADD2 s1, t1, s1 + unop + MUL x1, a0, t1 + LD a0, 8 * SIZE(A1) + + ADD1 s2, t2, s2 + unop + MUL x1, a3, t2 + LD a3, 9 * SIZE(A2) + + ADD2 s3, t3, s3 + unop + MUL x1, a2, t3 + LD a2, 8 * SIZE(A2) + + ADD3 s0, t0, s0 + unop + MUL x2, a4, t0 + LD x1, 5 * SIZE(X1) + + ADD4 s1, t1, s1 + MUL x2, a5, t1 + ADD3 s2, t2, s2 + MUL x2, a6, t2 + + ADD4 s3, t3, s3 + unop + MUL x2, a7, t3 + LD x2, 6 * SIZE(X1) + + ADD1 s0, t0, s0 + unop + MUL x3, a5, t0 + LD a5, 11 * SIZE(A1) + + ADD2 s1, t1, s1 + unop + MUL x3, a4, t1 + LD a4, 10 * SIZE(A1) + + ADD1 s2, t2, s2 + unop + MUL x3, a7, t2 + LD a7, 11 * SIZE(A2) + + ADD2 s3, t3, s3 + unop + MUL x3, a6, t3 + LD a6, 10 * SIZE(A2) + + ADD3 s0, t0, s0 + unop + MUL x0, a8, t0 + LD x3, 7 * SIZE(X1) + + ADD4 s1, t1, s1 + ldl $31, (PREFETCHSIZE + 0) * SIZE(A2) + MUL x0, a9, t1 + unop + + ADD3 s2, t2, s2 + lda I, -1(I) + MUL x0, a10, t2 + unop + + ADD4 s3, t3, s3 + unop + MUL x0, a11, t3 + LD x0, 8 * SIZE(X1) + + ADD1 s0, t0, s0 + unop + MUL x1, a9, t0 + LD a9, 13 * SIZE(A1) + + ADD2 s1, t1, s1 + unop + MUL x1, a8, t1 + LD a8, 12 * SIZE(A1) + + ADD1 s2, t2, s2 + lda A1, 8 * SIZE(A1) + MUL x1, a11, t2 + LD a11, 13 * SIZE(A2) + + ADD2 s3, t3, s3 + unop + MUL x1, a10, t3 + LD a10, 12 * SIZE(A2) + + ADD3 s0, t0, s0 + unop + MUL x2, a12, t0 + LD x1, 9 * SIZE(X1) + + ADD4 s1, t1, s1 + ldl $31, (PREFETCHSIZE + 0) * SIZE(X1) + MUL x2, a13, t1 + lda A2, 8 * SIZE(A2) + + ADD3 s2, t2, s2 + unop + MUL x2, a14, t2 + unop + + ADD4 s3, t3, s3 + unop + MUL x2, a15, t3 + LD x2, 10 * SIZE(X1) + + ADD1 s0, t0, s0 + unop + MUL x3, a13, t0 + LD a13, 7 * SIZE(A1) + + ADD2 s1, t1, s1 + lda X1, 8 * SIZE(X1) + MUL x3, a12, t1 + LD a12, 6 * SIZE(A1) + + ADD1 s2, t2, s2 + unop + MUL x3, a15, t2 + LD a15, 7 * SIZE(A2) + + ADD2 s3, t3, s3 + MUL x3, a14, t3 + LD a14, 6 * SIZE(A2) + bgt I, $L12 + .align 4 + +$L13: + ADD3 s0, t0, s0 + unop + MUL x0, a0, t0 + LD x3, 3 * SIZE(X1) + + ADD4 s1, t1, s1 + MUL x0, a1, t1 + ADD3 s2, t2, s2 + MUL x0, a2, t2 + + ADD4 s3, t3, s3 + unop + MUL x0, a3, t3 + LD x0, 4 * SIZE(X1) + + ADD1 s0, t0, s0 + MUL x1, a1, t0 + ADD2 s1, t1, s1 + MUL x1, a0, t1 + + ADD1 s2, t2, s2 + unop + MUL x1, a3, t2 + unop + + ADD2 s3, t3, s3 + lda A1, 8 * SIZE(A1) + MUL x1, a2, t3 + LD x1, 5 * SIZE(X1) + + ADD3 s0, t0, s0 + MUL x2, a4, t0 + ADD4 s1, t1, s1 + MUL x2, a5, t1 + + ADD3 s2, t2, s2 + unop + MUL x2, a6, t2 + unop + + ADD4 s3, t3, s3 + lda A2, 8 * SIZE(A2) + MUL x2, a7, t3 + LD x2, 6 * SIZE(X1) + + ADD1 s0, t0, s0 + MUL x3, a5, t0 + ADD2 s1, t1, s1 + MUL x3, a4, t1 + + ADD1 s2, t2, s2 + unop + MUL x3, a7, t2 + lda X1, 8 * SIZE(X1) + + ADD2 s3, t3, s3 + unop + MUL x3, a6, t3 + LD x3, -1 * SIZE(X1) + + ADD3 s0, t0, s0 + MUL x0, a8, t0 + ADD4 s1, t1, s1 + MUL x0, a9, t1 + + ADD3 s2, t2, s2 + MUL x0, a10, t2 + ADD4 s3, t3, s3 + MUL x0, a11, t3 + + ADD1 s0, t0, s0 + MUL x1, a9, t0 + ADD2 s1, t1, s1 + MUL x1, a8, t1 + + ADD1 s2, t2, s2 + MUL x1, a11, t2 + ADD2 s3, t3, s3 + MUL x1, a10, t3 + + ADD3 s0, t0, s0 + MUL x2, a12, t0 + ADD4 s1, t1, s1 + MUL x2, a13, t1 + + ADD3 s2, t2, s2 + MUL x2, a14, t2 + ADD4 s3, t3, s3 + MUL x2, a15, t3 + + ADD1 s0, t0, s0 + MUL x3, a13, t0 + ADD2 s1, t1, s1 + MUL x3, a12, t1 + + ADD1 s2, t2, s2 + MUL x3, a15, t2 + ADD2 s3, t3, s3 + MUL x3, a14, t3 + .align 4 + +$L15: + and M, 3, I + ble I, $L18 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 0 * SIZE(A2) + LD a3, 1 * SIZE(A2) + + LD x0, 0 * SIZE(X1) + + lda I, -1(I) + ble I, $L17 + .align 4 + +$L16: + ADD3 s0, t0, s0 + lda I, -1(I) + MUL x0, a0, t0 + LD x1, 1 * SIZE(X1) + + ADD4 s1, t1, s1 + MUL x0, a1, t1 + ADD3 s2, t2, s2 + MUL x0, a2, t2 + + ADD4 s3, t3, s3 + unop + MUL x0, a3, t3 + LD x0, 2 * SIZE(X1) + + ADD1 s0, t0, s0 + lda A2, 2 * SIZE(A2) + MUL x1, a1, t0 + LD a1, 3 * SIZE(A1) + + ADD2 s1, t1, s1 + lda X1, 2 * SIZE(X1) + MUL x1, a0, t1 + LD a0, 2 * SIZE(A1) + + ADD1 s2, t2, s2 + lda A1, 2 * SIZE(A1) + MUL x1, a3, t2 + LD a3, 1 * SIZE(A2) + + ADD2 s3, t3, s3 + MUL x1, a2, t3 + LD a2, 0 * SIZE(A2) + bgt I, $L16 + .align 4 + +$L17: + ADD3 s0, t0, s0 + unop + MUL x0, a0, t0 + LD x1, 1 * SIZE(X1) + + ADD4 s1, t1, s1 + unop + MUL x0, a1, t1 + unop + + ADD3 s2, t2, s2 + MUL x0, a2, t2 + ADD4 s3, t3, s3 + MUL x0, a3, t3 + + ADD1 s0, t0, s0 + MUL x1, a1, t0 + ADD2 s1, t1, s1 + MUL x1, a0, t1 + + ADD1 s2, t2, s2 + MUL x1, a3, t2 + ADD2 s3, t3, s3 + MUL x1, a2, t3 + .align 4 + +$L18: + LD a0, 0 * SIZE(Y) + unop + LD a1, 1 * SIZE(Y) + addq Y, INCY, Y + + LD a2, 0 * SIZE(Y) + unop + LD a3, 1 * SIZE(Y) + addq Y, INCY, Y + + ADD3 s0, t0, s0 + ADD4 s1, t1, s1 + ADD3 s2, t2, s2 + ADD4 s3, t3, s3 + + MUL alpha_r, s0, t0 + MUL alpha_r, s1, t1 + MUL alpha_r, s2, t2 + MUL alpha_r, s3, t3 + + ADD a0, t0, a0 + MUL alpha_i, s1, t0 + ADD a1, t1, a1 + MUL alpha_i, s0, t1 + ADD a2, t2, a2 + MUL alpha_i, s3, t2 + ADD a3, t3, a3 + MUL alpha_i, s2, t3 + + SUB a0, t0, a0 + ADD a1, t1, a1 + SUB a2, t2, a2 + ADD a3, t3, a3 + + ST a0, 0 * SIZE(Y1) + fclr t0 + ST a1, 1 * SIZE(Y1) + addq Y1, INCY, Y1 + + ST a2, 0 * SIZE(Y1) + fclr t1 + ST a3, 1 * SIZE(Y1) + addq Y1, INCY, Y1 + + fclr t2 + lda J, -1(J) + fclr t3 + bgt J, $L11 + .align 4 + +$L20: + blbc N, $L999 + + mov A, A1 + fclr s0 + fclr s1 + mov X, X1 + + sra M, 2, I + fclr s2 + fclr s3 + ble I, $L25 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a4, 2 * SIZE(A1) + LD a5, 3 * SIZE(A1) + LD a8, 4 * SIZE(A1) + LD a9, 5 * SIZE(A1) + LD a12, 6 * SIZE(A1) + LD a13, 7 * SIZE(A1) + + LD x0, 0 * SIZE(X1) + LD x1, 1 * SIZE(X1) + LD x2, 2 * SIZE(X1) + + lda I, -1(I) + ble I, $L23 + .align 4 + +$L22: + ADD3 s0, t0, s0 + ldl $31, (PREFETCHSIZE + 0) * SIZE(A1) + MUL x0, a0, t0 + LD x3, 3 * SIZE(X1) + + ADD4 s1, t1, s1 + unop + MUL x0, a1, t1 + LD x0, 4 * SIZE(X1) + + ADD1 s2, t0, s2 + lda I, -1(I) + MUL x1, a1, t0 + LD a1, 9 * SIZE(A1) + + ADD2 s3, t1, s3 + unop + MUL x1, a0, t1 + LD a0, 8 * SIZE(A1) + + ADD3 s0, t0, s0 + unop + MUL x2, a4, t0 + LD x1, 5 * SIZE(X1) + + ADD4 s1, t1, s1 + unop + MUL x2, a5, t1 + LD x2, 6 * SIZE(X1) + + ADD1 s2, t0, s2 + unop + MUL x3, a5, t0 + LD a5, 11 * SIZE(A1) + + ADD2 s3, t1, s3 + unop + MUL x3, a4, t1 + LD a4, 10 * SIZE(A1) + + ADD3 s0, t0, s0 + unop + MUL x0, a8, t0 + LD x3, 7 * SIZE(X1) + + ADD4 s1, t1, s1 + unop + MUL x0, a9, t1 + LD x0, 8 * SIZE(X1) + + ADD1 s2, t0, s2 + unop + MUL x1, a9, t0 + LD a9, 13 * SIZE(A1) + + ADD2 s3, t1, s3 + unop + MUL x1, a8, t1 + LD a8, 12 * SIZE(A1) + + ADD3 s0, t0, s0 + unop + MUL x2, a12, t0 + LD x1, 9 * SIZE(X1) + + ADD4 s1, t1, s1 + lda A1, 8 * SIZE(A1) + MUL x2, a13, t1 + LD x2, 10 * SIZE(X1) + + ADD1 s2, t0, s2 + lda X1, 8 * SIZE(X1) + MUL x3, a13, t0 + LD a13, 7 * SIZE(A1) + + ADD2 s3, t1, s3 + MUL x3, a12, t1 + LD a12, 6 * SIZE(A1) + bgt I, $L22 + .align 4 + +$L23: + ADD3 s0, t0, s0 + unop + MUL x0, a0, t0 + LD x3, 3 * SIZE(X1) + + ADD4 s1, t1, s1 + unop + MUL x0, a1, t1 + LD x0, 4 * SIZE(X1) + + ADD1 s2, t0, s2 + unop + MUL x1, a1, t0 + lda A1, 8 * SIZE(A1) + + ADD2 s3, t1, s3 + unop + MUL x1, a0, t1 + LD x1, 5 * SIZE(X1) + + ADD3 s0, t0, s0 + unop + MUL x2, a4, t0 + unop + + ADD4 s1, t1, s1 + unop + MUL x2, a5, t1 + LD x2, 6 * SIZE(X1) + + ADD1 s2, t0, s2 + unop + MUL x3, a5, t0 + lda X1, 8 * SIZE(X1) + + ADD2 s3, t1, s3 + unop + MUL x3, a4, t1 + LD x3, -1 * SIZE(X1) + + ADD3 s0, t0, s0 + MUL x0, a8, t0 + ADD4 s1, t1, s1 + MUL x0, a9, t1 + + ADD1 s2, t0, s2 + MUL x1, a9, t0 + ADD2 s3, t1, s3 + MUL x1, a8, t1 + + ADD3 s0, t0, s0 + MUL x2, a12, t0 + ADD4 s1, t1, s1 + MUL x2, a13, t1 + + ADD1 s2, t0, s2 + MUL x3, a13, t0 + ADD2 s3, t1, s3 + MUL x3, a12, t1 + .align 4 + +$L25: + and M, 3, I + ble I, $L28 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + + LD x0, 0 * SIZE(X1) + + lda I, -1(I) + ble I, $L27 + .align 4 + +$L26: + ADD3 s0, t0, s0 + lda A1, 2 * SIZE(A1) + MUL x0, a0, t0 + LD x1, 1 * SIZE(X1) + + ADD4 s1, t1, s1 + lda I, -1(I) + MUL x0, a1, t1 + LD x0, 2 * SIZE(X1) + + ADD1 s0, t0, s0 + lda X1, 2 * SIZE(X1) + MUL x1, a1, t0 + LD a1, 1 * SIZE(A1) + + ADD2 s1, t1, s1 + MUL x1, a0, t1 + LD a0, 0 * SIZE(A1) + bgt I, $L26 + .align 4 + +$L27: + ADD3 s0, t0, s0 + unop + MUL x0, a0, t0 + LD x1, 1 * SIZE(X1) + + ADD4 s1, t1, s1 + unop + MUL x0, a1, t1 + unop + + ADD1 s0, t0, s0 + MUL x1, a1, t0 + ADD2 s1, t1, s1 + MUL x1, a0, t1 + .align 4 + +$L28: + LD a0, 0 * SIZE(Y) + LD a1, 1 * SIZE(Y) + + ADD3 s0, t0, s0 + ADD4 s1, t1, s1 + ADD3 s2, t2, s2 + ADD4 s3, t3, s3 + + ADD s0, s2, s0 + ADD s1, s3, s1 + + MUL alpha_r, s0, t0 + MUL alpha_r, s1, t1 + + ADD a0, t0, a0 + MUL alpha_i, s1, t0 + ADD a1, t1, a1 + MUL alpha_i, s0, t1 + + SUB a0, t0, a0 + ADD a1, t1, a1 + + ST a0, 0 * SIZE(Y1) + ST a1, 1 * SIZE(Y1) + .align 4 + +$L999: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + ldt $f9, 56($sp) + + lda $sp, STACKSIZE($sp) + ret + EPILOGUE diff --git a/kernel/alpha/znrm2.S b/kernel/alpha/znrm2.S new file mode 100644 index 0000000..03343b2 --- /dev/null +++ b/kernel/alpha/znrm2.S @@ -0,0 +1,426 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#include "version.h" + +#define PREFETCH_SIZE 80 + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#define I $0 + +#define a0 $f0 +#define a1 $f1 +#define a2 $f10 +#define a3 $f11 +#define t0 $f12 +#define t1 $f13 +#define t2 $f14 +#define t3 $f15 + +#define x0 $f16 +#define x1 $f17 +#define x2 $f18 +#define x3 $f19 +#define x4 $f20 +#define x5 $f21 +#define x6 $f22 +#define x7 $f23 + + PROLOGUE + +#if defined(EV4) || defined(EV5) + .frame $30,16,$26,0 + .mask 0x4000000,-16 + ldah $29, 0($27) !gpdisp!1 + lda $29, 0($29) !gpdisp!1 + + lda $sp, -16($sp) + ldq $27, sqrt($29) !literal!2 + stq $26, 0($sp) + + PROFCODE + .prologue 1 +#else + PROFCODE +#endif + + fclr a0 + sll INCX, ZBASE_SHIFT, INCX + fclr a1 + ble N, $L999 + + fclr a2 + cmpeq INCX, 2 * SIZE, $0 + fclr a3 + beq $0, $L20 + + fclr t0 + sra N, 3, I + fclr t1 + ble I, $L15 + + fclr t2 + LD x0, 0 * SIZE(X) + fclr t3 + LD x1, 1 * SIZE(X) + + LD x2, 2 * SIZE(X) + LD x3, 3 * SIZE(X) + LD x4, 4 * SIZE(X) + LD x5, 5 * SIZE(X) + LD x6, 6 * SIZE(X) + LD x7, 7 * SIZE(X) + + lda I, -1(I) + ble I, $L12 + .align 4 + +$L11: + addt a0, t0, a0 + ldl $31, (PREFETCH_SIZE) * SIZE(X) + mult x0, x0, t0 + LD x0, 8 * SIZE(X) + + addt a1, t1, a1 + mov X, XX + mult x1, x1, t1 + LD x1, 9 * SIZE(X) + + addt a2, t2, a2 + unop + mult x2, x2, t2 + LD x2, 10 * SIZE(X) + + addt a3, t3, a3 + unop + mult x3, x3, t3 + LD x3, 11 * SIZE(X) + + addt a0, t0, a0 + unop + mult x4, x4, t0 + LD x4, 12 * SIZE(X) + + addt a1, t1, a1 + unop + mult x5, x5, t1 + LD x5, 13 * SIZE(X) + + addt a2, t2, a2 + unop + mult x6, x6, t2 + LD x6, 14 * SIZE(X) + + addt a3, t3, a3 + unop + mult x7, x7, t3 + LD x7, 15 * SIZE(X) + + addt a0, t0, a0 + unop + mult x0, x0, t0 + LD x0, 16 * SIZE(X) + + addt a1, t1, a1 + lda X, 16 * SIZE(X) + mult x1, x1, t1 + LD x1, 17 * SIZE(XX) + + addt a2, t2, a2 + unop + mult x2, x2, t2 + LD x2, 18 * SIZE(XX) + + addt a3, t3, a3 + unop + mult x3, x3, t3 + LD x3, 19 * SIZE(XX) + + addt a0, t0, a0 + unop + mult x4, x4, t0 + LD x4, 20 * SIZE(XX) + + addt a1, t1, a1 + lda I, -1(I) + mult x5, x5, t1 + LD x5, 21 * SIZE(XX) + + addt a2, t2, a2 + unop + mult x6, x6, t2 + LD x6, 22 * SIZE(XX) + + addt a3, t3, a3 + mult x7, x7, t3 + LD x7, 23 * SIZE(XX) + bgt I, $L11 + .align 4 + +$L12: + addt a0, t0, a0 + mov X, XX + mult x0, x0, t0 + LD x0, 8 * SIZE(X) + + addt a1, t1, a1 + unop + mult x1, x1, t1 + LD x1, 9 * SIZE(X) + + addt a2, t2, a2 + unop + mult x2, x2, t2 + LD x2, 10 * SIZE(X) + + addt a3, t3, a3 + unop + mult x3, x3, t3 + LD x3, 11 * SIZE(X) + + addt a0, t0, a0 + unop + mult x4, x4, t0 + LD x4, 12 * SIZE(XX) + + addt a1, t1, a1 + unop + mult x5, x5, t1 + LD x5, 13 * SIZE(XX) + + addt a2, t2, a2 + unop + mult x6, x6, t2 + LD x6, 14 * SIZE(XX) + + addt a3, t3, a3 + lda X, 16 * SIZE(X) + mult x7, x7, t3 + LD x7, 15 * SIZE(XX) + + addt a0, t0, a0 + mult x0, x0, t0 + addt a1, t1, a1 + mult x1, x1, t1 + + addt a2, t2, a2 + mult x2, x2, t2 + addt a3, t3, a3 + mult x3, x3, t3 + + addt a0, t0, a0 + mult x4, x4, t0 + addt a1, t1, a1 + mult x5, x5, t1 + + addt a2, t2, a2 + mult x6, x6, t2 + addt a3, t3, a3 + mult x7, x7, t3 + + addt a2, t2, a2 + addt a3, t3, a3 + .align 4 + +$L15: + and N, 7, I + ble I, $L998 + .align 4 + +$L16: + LD x0, 0 * SIZE(X) + LD x1, 1 * SIZE(X) + + lda X, 2 * SIZE(X) + + addt a0, t0, a0 + mult x0, x0, t0 + addt a1, t1, a1 + mult x1, x1, t1 + + lda I, -1(I) + bgt I, $L16 + bsr $31, $L998 + .align 4 + +$L20: + fclr t0 + sra N, 2, I + fclr t1 + ble I, $L25 + + LD x0, 0 * SIZE(X) + fclr t2 + LD x1, 1 * SIZE(X) + addq X, INCX, X + LD x2, 0 * SIZE(X) + fclr t3 + LD x3, 1 * SIZE(X) + addq X, INCX, X + + LD x4, 0 * SIZE(X) + lda I, -1(I) + LD x5, 1 * SIZE(X) + addq X, INCX, X + + LD x6, 0 * SIZE(X) + ble I, $L22 + .align 4 + +$L21: + addt a0, t0, a0 + LD x7, 1 * SIZE(X) + mult x0, x0, t0 + addq X, INCX, X + + addt a1, t1, a1 + LD x0, 0 * SIZE(X) + mult x1, x1, t1 + unop + + addt a2, t2, a2 + LD x1, 1 * SIZE(X) + mult x2, x2, t2 + addq X, INCX, X + + addt a3, t3, a3 + LD x2, 0 * SIZE(X) + mult x3, x3, t3 + unop + + addt a0, t0, a0 + LD x3, 1 * SIZE(X) + mult x4, x4, t0 + addq X, INCX, X + + addt a1, t1, a1 + LD x4, 0 * SIZE(X) + mult x5, x5, t1 + lda I, -1(I) + + addt a2, t2, a2 + LD x5, 1 * SIZE(X) + mult x6, x6, t2 + addq X, INCX, X + + addt a3, t3, a3 + LD x6, 0 * SIZE(X) + mult x7, x7, t3 + bgt I, $L21 + .align 4 + +$L22: + addt a0, t0, a0 + LD x7, 1 * SIZE(X) + mult x0, x0, t0 + addq X, INCX, X + + addt a1, t1, a1 + mult x1, x1, t1 + addt a2, t2, a2 + mult x2, x2, t2 + + addt a3, t3, a3 + mult x3, x3, t3 + addt a0, t0, a0 + mult x4, x4, t0 + + addt a1, t1, a1 + mult x5, x5, t1 + addt a2, t2, a2 + mult x6, x6, t2 + + addt a3, t3, a3 + mult x7, x7, t3 + addt a2, t2, a2 + addt a3, t3, a3 + .align 4 + +$L25: + and N, 3, I + ble I, $L998 + .align 4 + +$L26: + LD x0, 0 * SIZE(X) + lda I, -1(I) + LD x1, 1 * SIZE(X) + addq X, INCX, X + + addt a0, t0, a0 + mult x0, x0, t0 + addt a1, t1, a1 + mult x1, x1, t1 + + bgt I, $L26 + .align 4 + + +$L998: + addt a0, t0, a0 + addt a1, t1, a1 + + addt a0, a1, a0 + addt a2, a3, a2 + +#if defined(EV4) || defined(EV5) + addt a0, a2, $f16 + jsr $26, ($27), sqrt !lituse_jsr!2 + + ldah $29, 0($26) !gpdisp!3 + lda $29, 0($29) !gpdisp!3 +#else + addt a0, a2, a0 + sqrtt a0, a0 +#endif + .align 4 + +$L999: +#if defined(EV4) || defined(EV5) + ldq $26, 0($sp) + lda $sp, 16($sp) +#endif + ret + EPILOGUE diff --git a/kernel/alpha/zrot.S b/kernel/alpha/zrot.S new file mode 100644 index 0000000..afcdf12 --- /dev/null +++ b/kernel/alpha/zrot.S @@ -0,0 +1,631 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 +#define Y $19 +#define INCY $20 +#define I $21 +#define XX $23 +#define YY $24 + +#define C $f10 +#define S $f11 + +#define PREFETCH_SIZE 80 + + PROLOGUE + PROFCODE + .frame $sp, 0, $26, 0 + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + fmov $f21, C + LD S, 0($sp) + + addq INCX, INCX, INCX + addq INCY, INCY, INCY + + cmpeq INCX, 2, $23 + cmpeq INCY, 2, $24 + ble N, $L998 + + and $23, $24, $23 + beq $23, $L50 + + sra N, 2, I + ble I, $L15 + + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + LD $f15, 1*SIZE(Y) + + LD $f16, 2*SIZE(X) + LD $f17, 2*SIZE(Y) + LD $f18, 3*SIZE(X) + LD $f19, 3*SIZE(Y) + + MUL C, $f12, $f21 + unop + MUL S, $f13, $f22 + MUL C, $f13, $f23 + + LD $f13, 4*SIZE(Y) + MUL S, $f12, $f24 + LD $f12, 4*SIZE(X) + MUL C, $f14, $f25 + + lda I, -1(I) + MUL S, $f15, $f26 + ADD $f21, $f22, $f22 + MUL C, $f15, $f27 + + LD $f15, 5*SIZE(Y) + MUL S, $f14, $f28 + SUB $f23, $f24, $f24 + ble I, $L13 + .align 4 + +$L12: + MUL C, $f16, $f21 + lds $f31, (PREFETCH_SIZE) * SIZE(X) + unop + LD $f14, 5*SIZE(X) + + ST $f22, 0*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + lds $f31, (PREFETCH_SIZE) * SIZE(Y) + unop + LD $f17, 6*SIZE(Y) + + ST $f24, 0*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + LD $f16, 6*SIZE(X) + unop + unop + + ST $f26, 1*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + unop + unop + LD $f19, 7*SIZE(Y) + + ST $f28, 1*SIZE(Y) + MUL S, $f18, $f28 + unop + SUB $f23, $f24, $f24 + + MUL C, $f12, $f21 + LD $f18, 7*SIZE(X) + unop + unop + + ST $f22, 2*SIZE(X) + unop + MUL S, $f13, $f22 + ADD $f25, $f26, $f26 + + MUL C, $f13, $f23 + LD $f13, 8*SIZE(Y) + unop + unop + + ST $f24, 2*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f14, $f25 + LD $f12, 8*SIZE(X) + unop + unop + + ST $f26, 3*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f15, $f27 + LD $f15, 9*SIZE(Y) + unop + unop + + ST $f28, 3*SIZE(Y) + MUL S, $f14, $f28 + unop + SUB $f23, $f24, $f24 + + MUL C, $f16, $f21 + LD $f14, 9*SIZE(X) + unop + unop + + ST $f22, 4*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + LD $f17, 10*SIZE(Y) + unop + unop + + ST $f24, 4*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + LD $f16, 10*SIZE(X) + unop + unop + + ST $f26, 5*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + LD $f19, 11*SIZE(Y) + unop + unop + + ST $f28, 5*SIZE(Y) + MUL S, $f18, $f28 + lda I, -1(I) + SUB $f23, $f24, $f24 + + MUL C, $f12, $f21 + LD $f18, 11*SIZE(X) + unop + unop + + ST $f22, 6*SIZE(X) + MUL S, $f13, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f13, $f23 + LD $f13, 12*SIZE(Y) + lda X, 8*SIZE(X) + unop + + ST $f24, 6*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f14, $f25 + LD $f12, 4*SIZE(X) + lda Y, 8*SIZE(Y) + unop + + ST $f26, -1*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f15, $f27 + LD $f15, 5*SIZE(Y) + unop + unop + + ST $f28, -1*SIZE(Y) + MUL S, $f14, $f28 + SUB $f23, $f24, $f24 + bgt I, $L12 + .align 4 + +$L13: + MUL C, $f16, $f21 + LD $f14, 5*SIZE(X) + unop + unop + + ST $f22, 0*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + unop + unop + LD $f17, 6*SIZE(Y) + + ST $f24, 0*SIZE(Y) + MUL S, $f16, $f24 + LD $f16, 6*SIZE(X) + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + unop + unop + unop + + ST $f26, 1*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + unop + unop + LD $f19, 7*SIZE(Y) + + ST $f28, 1*SIZE(Y) + MUL S, $f18, $f28 + LD $f18, 7*SIZE(X) + SUB $f23, $f24, $f24 + + MUL C, $f12, $f21 + unop + unop + unop + + ST $f22, 2*SIZE(X) + unop + MUL S, $f13, $f22 + ADD $f25, $f26, $f26 + + MUL C, $f13, $f23 + unop + unop + unop + + ST $f24, 2*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f14, $f25 + unop + unop + unop + + ST $f26, 3*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f15, $f27 + unop + unop + unop + + ST $f28, 3*SIZE(Y) + MUL S, $f14, $f28 + unop + SUB $f23, $f24, $f24 + + MUL C, $f16, $f21 + unop + unop + unop + + ST $f22, 4*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + unop + unop + unop + + ST $f24, 4*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + unop + unop + unop + + ST $f26, 5*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + unop + unop + unop + + ST $f28, 5*SIZE(Y) + MUL S, $f18, $f28 + unop + SUB $f23, $f24, $f24 + + ST $f22, 6*SIZE(X) + ADD $f25, $f26, $f26 + ST $f24, 6*SIZE(Y) + SUB $f27, $f28, $f28 + + ST $f26, 7*SIZE(X) + lda X, 8*SIZE(X) + ST $f28, 7*SIZE(Y) + lda Y, 8*SIZE(Y) + .align 4 + + +$L15: + and N, 3, I + ble I, $L998 + .align 4 + +$L16: + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + LD $f15, 1*SIZE(Y) + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f22, 0*SIZE(X) + ST $f24, 0*SIZE(Y) + lda I, -1(I) + + ST $f26, 1*SIZE(X) + lda X, 2 * SIZE(X) + ST $f28, 1*SIZE(Y) + lda Y, 2 * SIZE(Y) + + bgt I, $L16 + .align 4 + +$L998: + clr $0 + ret + .align 4 + +$L50: + mov X, XX + mov Y, YY + + sra N, 2, I + ble I, $L55 + .align 4 + +$L51: + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 1*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f22, 0*SIZE(XX) + ST $f24, 0*SIZE(YY) + ST $f26, 1*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 1*SIZE(YY) + SXADDQ INCY, YY, YY + + + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 1*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f22, 0*SIZE(XX) + ST $f24, 0*SIZE(YY) + ST $f26, 1*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 1*SIZE(YY) + SXADDQ INCY, YY, YY + + + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 1*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f22, 0*SIZE(XX) + ST $f24, 0*SIZE(YY) + ST $f26, 1*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 1*SIZE(YY) + SXADDQ INCY, YY, YY + + + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 1*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f22, 0*SIZE(XX) + ST $f24, 0*SIZE(YY) + ST $f26, 1*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 1*SIZE(YY) + SXADDQ INCY, YY, YY + + lda I, -1(I) + bgt I, $L51 + .align 4 + +$L55: + and N, 3, I + ble I, $L999 + .align 4 + +$L56: + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + LD $f15, 1*SIZE(Y) + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f22, 0*SIZE(X) + ST $f24, 0*SIZE(Y) + lda I, -1(I) + + ST $f26, 1*SIZE(X) + ST $f28, 1*SIZE(Y) + SXADDQ INCX, X, X + SXADDQ INCY, Y, Y + + bgt I, $L56 + .align 4 + +$L999: + clr $0 + ret + EPILOGUE diff --git a/kernel/alpha/zscal.S b/kernel/alpha/zscal.S new file mode 100644 index 0000000..1a2ac10 --- /dev/null +++ b/kernel/alpha/zscal.S @@ -0,0 +1,255 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $21 +#define INCX $17 + +#define XX $18 +#define I $19 + +#define ALPHA_R $f19 +#define ALPHA_I $f20 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define a0 $f12 +#define a1 $f13 +#define a2 $f14 +#define a3 $f15 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 +#define a7 $f21 + +#define t0 $f22 +#define t1 $f23 +#define t2 $f24 +#define t3 $f25 + +#define t4 $f26 +#define t5 $f27 +#define t6 $f28 +#define t7 $f29 + + PROLOGUE + PROFCODE + + ldq INCX, 0($sp) + mov X, XX + ble N, $L999 + + addq INCX, INCX, INCX + + sra N, 2, I + ble I, $L15 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + SXADDQ INCX, X, X + LD a2, 0 * SIZE(X) + LD a3, 1 * SIZE(X) + SXADDQ INCX, X, X + LD a4, 0 * SIZE(X) + LD a5, 1 * SIZE(X) + SXADDQ INCX, X, X + LD a6, 0 * SIZE(X) + LD a7, 1 * SIZE(X) + SXADDQ INCX, X, X + + MUL a0, ALPHA_R, t0 + MUL a1, ALPHA_I, t1 + MUL a0, ALPHA_I, t2 + MUL a1, ALPHA_R, t3 + + SUB t0, t1, t4 + ADD t2, t3, t5 + + lda I, -1(I) + ble I, $L13 + .align 4 + +$L12: + ST t4, 0 * SIZE(XX) + MUL a2, ALPHA_R, t0 + ST t5, 1 * SIZE(XX) + MUL a3, ALPHA_I, t1 + + MUL a2, ALPHA_I, t2 + LD a0, 0 * SIZE(X) + MUL a3, ALPHA_R, t3 + LD a1, 1 * SIZE(X) + + SUB t0, t1, t6 + SXADDQ INCX, XX, XX + ADD t2, t3, t7 + SXADDQ INCX, X, X + + MUL a4, ALPHA_R, t0 + ST t6, 0 * SIZE(XX) + MUL a5, ALPHA_I, t1 + ST t7, 1 * SIZE(XX) + + MUL a4, ALPHA_I, t2 + LD a2, 0 * SIZE(X) + MUL a5, ALPHA_R, t3 + LD a3, 1 * SIZE(X) + + SUB t0, t1, t4 + SXADDQ INCX, XX, XX + ADD t2, t3, t5 + SXADDQ INCX, X, X + + MUL a6, ALPHA_R, t0 + ST t4, 0 * SIZE(XX) + MUL a7, ALPHA_I, t1 + ST t5, 1 * SIZE(XX) + + MUL a6, ALPHA_I, t2 + LD a4, 0 * SIZE(X) + MUL a7, ALPHA_R, t3 + LD a5, 1 * SIZE(X) + + SUB t0, t1, t6 + SXADDQ INCX, XX, XX + ADD t2, t3, t7 + SXADDQ INCX, X, X + + MUL a0, ALPHA_R, t0 + ST t6, 0 * SIZE(XX) + MUL a1, ALPHA_I, t1 + ST t7, 1 * SIZE(XX) + + MUL a0, ALPHA_I, t2 + LD a6, 0 * SIZE(X) + MUL a1, ALPHA_R, t3 + LD a7, 1 * SIZE(X) + + SUB t0, t1, t4 + lda I, -1(I) + ADD t2, t3, t5 + SXADDQ INCX, XX, XX + + lds $f31, PREFETCHSIZE * SIZE(X) + unop + SXADDQ INCX, X, X + bne I, $L12 + .align 4 + +$L13: + MUL a2, ALPHA_R, t0 + MUL a3, ALPHA_I, t1 + ST t4, 0 * SIZE(XX) + MUL a2, ALPHA_I, t2 + ST t5, 1 * SIZE(XX) + MUL a3, ALPHA_R, t3 + + SUB t0, t1, t6 + SXADDQ INCX, XX, XX + ADD t2, t3, t7 + unop + + ST t6, 0 * SIZE(XX) + MUL a4, ALPHA_R, t0 + ST t7, 1 * SIZE(XX) + MUL a5, ALPHA_I, t1 + MUL a4, ALPHA_I, t2 + MUL a5, ALPHA_R, t3 + + SUB t0, t1, t4 + SXADDQ INCX, XX, XX + ADD t2, t3, t5 + unop + + MUL a6, ALPHA_R, t0 + ST t4, 0 * SIZE(XX) + MUL a7, ALPHA_I, t1 + ST t5, 1 * SIZE(XX) + + MUL a6, ALPHA_I, t2 + MUL a7, ALPHA_R, t3 + + SUB t0, t1, t6 + SXADDQ INCX, XX, XX + ADD t2, t3, t7 + + ST t6, 0 * SIZE(XX) + ST t7, 1 * SIZE(XX) + SXADDQ INCX, XX, XX + .align 4 + +$L15: + and N, 3, I + unop + unop + ble I, $L999 + .align 4 + +$L17: + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + SXADDQ INCX, X, X + + MUL a0, ALPHA_R, t0 + MUL a1, ALPHA_I, t1 + MUL a0, ALPHA_I, t2 + MUL a1, ALPHA_R, t3 + + SUB t0, t1, t4 + ADD t2, t3, t5 + + ST t4, 0 * SIZE(XX) + ST t5, 1 * SIZE(XX) + SXADDQ INCX, XX, XX + + lda I, -1(I) + bne I, $L17 + .align 4 + +$L999: + ret + EPILOGUE diff --git a/kernel/alpha/zswap.S b/kernel/alpha/zswap.S new file mode 100644 index 0000000..a12a2c7 --- /dev/null +++ b/kernel/alpha/zswap.S @@ -0,0 +1,244 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + + PROLOGUE + PROFCODE + .frame $sp, 0, $26, 0 + + mov $21, $17 + ldl $18, 0($sp) + ldq $19, 8($sp) + ldl $20, 16($sp) +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + ble $16, $SubEnd # if n <= 0 goto $End + + cmpeq $18, 1, $1 + addq $18, $18, $18 + cmpeq $20, 1, $2 + addq $20, $20, $20 + + sra $16, 2, $21 + and $1, $2, $1 + and $16, 3, $22 + beq $1, $Sub + + ble $21, $MainRemain + .align 4 + +$MainLoop: + LD $f10, 0*SIZE($19) + LD $f11, 1*SIZE($19) + LD $f12, 2*SIZE($19) + LD $f13, 3*SIZE($19) + LD $f14, 4*SIZE($19) + LD $f15, 5*SIZE($19) + LD $f16, 6*SIZE($19) + LD $f17, 7*SIZE($19) + + LD $f20, 0*SIZE($17) + LD $f21, 1*SIZE($17) + LD $f22, 2*SIZE($17) + LD $f23, 3*SIZE($17) + LD $f24, 4*SIZE($17) + LD $f25, 5*SIZE($17) + LD $f26, 6*SIZE($17) + LD $f27, 7*SIZE($17) + + lds $f31, 16*SIZE($17) + unop + lds $f31, 16*SIZE($19) + subl $21, 1, $21 + + ST $f10, 0*SIZE($17) + ST $f11, 1*SIZE($17) + ST $f12, 2*SIZE($17) + ST $f13, 3*SIZE($17) + ST $f14, 4*SIZE($17) + ST $f15, 5*SIZE($17) + ST $f16, 6*SIZE($17) + ST $f17, 7*SIZE($17) + + ST $f20, 0*SIZE($19) + ST $f21, 1*SIZE($19) + ST $f22, 2*SIZE($19) + ST $f23, 3*SIZE($19) + ST $f24, 4*SIZE($19) + ST $f25, 5*SIZE($19) + ST $f26, 6*SIZE($19) + ST $f27, 7*SIZE($19) + + lda $17, 8*SIZE($17) + lda $19, 8*SIZE($19) + bgt $21, $MainLoop + .align 4 + +$MainRemain: + ble $22, $MainEnd + .align 4 + +$MainRemainLoop: + LD $f10, 0*SIZE($19) + LD $f11, 1*SIZE($19) + LD $f20, 0*SIZE($17) + LD $f21, 1*SIZE($17) + + lda $17, 2*SIZE($17) + lda $19, 2*SIZE($19) + subl $22, 1, $22 + ST $f10, -2*SIZE($17) + ST $f11, -1*SIZE($17) + ST $f20, -2*SIZE($19) + ST $f21, -1*SIZE($19) + bgt $22, $MainRemainLoop + .align 4 + +$MainEnd: + clr $0 + ret + .align 4 + +$Sub: + mov $17, $23 + mov $19, $24 + ble $21, $SubRemain + .align 4 + +$SubLoop: + LD $f10, 0*SIZE($19) + LD $f11, 1*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f12, 0*SIZE($19) + LD $f13, 1*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f14, 0*SIZE($19) + LD $f15, 1*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f16, 0*SIZE($19) + LD $f17, 1*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f20, 0*SIZE($17) + LD $f21, 1*SIZE($17) + SXADDQ $18, $17, $17 + + LD $f22, 0*SIZE($17) + LD $f23, 1*SIZE($17) + SXADDQ $18, $17, $17 + + LD $f24, 0*SIZE($17) + LD $f25, 1*SIZE($17) + SXADDQ $18, $17, $17 + + LD $f26, 0*SIZE($17) + LD $f27, 1*SIZE($17) + SXADDQ $18, $17, $17 + + ST $f10, 0*SIZE($23) + ST $f11, 1*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f12, 0*SIZE($23) + ST $f13, 1*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f14, 0*SIZE($23) + ST $f15, 1*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f16, 0*SIZE($23) + ST $f17, 1*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f20, 0*SIZE($24) + ST $f21, 1*SIZE($24) + SXADDQ $20, $24, $24 + + ST $f22, 0*SIZE($24) + ST $f23, 1*SIZE($24) + SXADDQ $20, $24, $24 + + ST $f24, 0*SIZE($24) + ST $f25, 1*SIZE($24) + SXADDQ $20, $24, $24 + + ST $f26, 0*SIZE($24) + ST $f27, 1*SIZE($24) + SXADDQ $20, $24, $24 + + subl $21, 1, $21 + bgt $21, $SubLoop + .align 4 + +$SubRemain: + ble $22, $SubEnd + .align 4 + +$SubRemainLoop: + LD $f10, 0*SIZE($19) + LD $f11, 1*SIZE($19) + LD $f20, 0*SIZE($17) + LD $f21, 1*SIZE($17) + + subl $22, 1, $22 + + ST $f10, 0*SIZE($17) + ST $f11, 1*SIZE($17) + ST $f20, 0*SIZE($19) + ST $f21, 1*SIZE($19) + + SXADDQ $18, $17, $17 + SXADDQ $20, $19, $19 + bgt $22, $SubRemainLoop + .align 4 + +$SubEnd: + clr $0 + ret + EPILOGUE diff --git a/kernel/alpha/ztrsm_kernel_2x2_LN.S b/kernel/alpha/ztrsm_kernel_2x2_LN.S new file mode 100644 index 0000000..2921f9e --- /dev/null +++ b/kernel/alpha/ztrsm_kernel_2x2_LN.S @@ -0,0 +1,2237 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#if !defined(EV4) && !defined(EV5) && !defined(EV6) +#error "Architecture is not specified." +#endif + +#ifdef EV6 +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + +#ifdef EV5 +#define PREFETCHSIZE 48 +#define UNOP +#endif + +#ifdef EV4 +#define UNOP +#endif + + .set noat + .set noreorder + .arch ev6 + +.text + .align 5 + .globl CNAME + .ent CNAME + +#define STACKSIZE 80 + +#define M $16 +#define N $17 +#define K $18 +#define A $21 +#define B $22 +#define C $20 +#define LDC $23 + +#define C1 $19 +#define C2 $24 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha_i $f29 +#define alpha_r $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define AORIG $3 +#define OFFSET $4 + +#if defined(LN) || defined(LT) +#ifndef CONJ +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#define ADD5 SUB +#define ADD6 ADD +#else +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 SUB +#define ADD4 ADD +#define ADD5 ADD +#define ADD6 SUB +#endif +#else +#ifndef CONJ +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#define ADD5 SUB +#define ADD6 ADD +#else +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 ADD +#define ADD4 SUB +#define ADD5 ADD +#define ADD6 SUB +#endif +#endif + + +CNAME: + .frame $sp, STACKSIZE, $26, 0 + +#ifdef PROFILE + ldgp $gp, 0($27) + lda $at, _mcount + jsr $at, ($at), _mcount +#endif + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + lda $sp, -STACKSIZE($sp) + + ldq B, 0 + STACKSIZE($sp) + ldq C, 8 + STACKSIZE($sp) + ldq LDC, 16 + STACKSIZE($sp) + ldq OFFSET, 24 + STACKSIZE($sp) + + sll LDC, ZBASE_SHIFT, LDC + + stt $f2, 0($sp) + stt $f3, 8($sp) + stt $f4, 16($sp) + stt $f5, 24($sp) + stt $f6, 32($sp) + stt $f7, 40($sp) + stt $f8, 48($sp) + stt $f9, 56($sp) + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#ifdef LN + addq M, M, TMP2 + mulq TMP2, K, TMP1 + SXADDQ TMP1, A, A + SXADDQ TMP2, C, C +#endif + +#ifdef RN + negq OFFSET, KK +#endif + +#ifdef RT + mulq N, K, TMP1 + addq TMP1, TMP1, TMP1 + SXADDQ TMP1, B, B + + mulq N, LDC, TMP1 + addq TMP1, C, C + + subq N, OFFSET, KK +#endif + + sra N, 1, J + ble J, $L30 + .align 4 + +$L01: +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + subq B, TMP1, B + + subq C, LDC, C2 + subq C2, LDC, C1 + subq C2, LDC, C +#else + mov C, C1 + addq C, LDC, C2 + addq C2, LDC, C +#endif + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + and M, 1, I + fclr t1 + fclr t2 + fclr t3 + fclr t4 + + fclr c01 + fclr c05 + ble I, $L20 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + lda AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(B) + lda BO, 4 * SIZE(B) + + lda L, -2(KK) + + ble KK, $L28 + ble L, $L25 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 0, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 0, TMP1 + addq AORIG, TMP1, AO + sll KK, ZBASE_SHIFT + 1, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + lda AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(BO) + lda BO, 4 * SIZE(BO) + + lda L, -2(TMP1) + + ble TMP1, $L28 + ble L, $L25 +#endif + .align 5 + +$L22: + ADD1 c09, t1, c09 + unop + MUL a1, b1, t1 + unop + + ADD3 c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a1, b2, t3 + lda BO, 8 * SIZE(BO) + + ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b3, t1 + unop + + ADD3 c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + + ADD1 c09, t1, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a3, b2, t3 + lda AO, 4 * SIZE(AO) + + ADD2 c14, t4, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD1 c01, t1, c01 + lda L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD3 c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD1 c09, t1, c09 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L27 +#else + blbs TMP1, $L27 +#endif + .align 4 + + ADD3 c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a1, b2, t3 + unop + + ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b3, t1 + lda AO, 2 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c09, t1, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L27: + ADD3 c10, t2, c10 + MUL a2, b1, t2 + ADD4 c13, t3, c13 + MUL a1, b2, t3 + + ADD2 c14, t4, c14 + MUL a2, b2, t4 + ADD1 c01, t1, c01 + MUL a1, b3, t1 + + ADD3 c02, t2, c02 + MUL a2, b3, t2 + ADD4 c05, t3, c05 + MUL a1, b4, t3 + + ADD2 c06, t4, c06 + lda AO, 2 * SIZE(AO) + MUL a2, b4, t4 + lda BO, 4 * SIZE(BO) + + ADD1 c09, t1, c09 + ADD3 c10, t2, c10 + ADD4 c13, t3, c13 + ADD2 c14, t4, c14 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c09, c14, c09 + ADD c10, c13, c10 + .align 4 + +$L28: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 1, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addq AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -2 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 + SUB a4, c10, c10 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 + SUB a4, c10, c10 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c09, t3, c09 + ADD6 c10, t4, c10 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + + MUL a3, c01, t1 + MUL a3, c02, t2 + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL a4, c02, t1 + MUL a4, c01, t2 + ADD6 c09, t1, c09 + ADD5 c10, t2, c10 + + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 +#endif + +#ifdef RT + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + LD a3, 4 * SIZE(BO) + LD a4, 5 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 + + MUL a3, c09, t1 + MUL a3, c10, t2 + SUB c01, t1, c01 + SUB c02, t2, c02 + + MUL a4, c10, t1 + MUL a4, c09, t2 + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c10, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c09, 2 * SIZE(AO) + ST c10, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) + lda C2, -2 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c09, 0 * SIZE(C2) + ST c10, 1 * SIZE(C2) + +#ifndef LN + lda C1, 2 * SIZE(C1) + lda C2, 2 * SIZE(C2) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L20: + sra M, 1, I + fclr t1 + fclr t2 + fclr t3 + fclr t4 + + fclr c01 + fclr c05 + + ble I, $L29 + .align 4 + +$L11: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c07 + + lda BO, 4 * SIZE(B) + fclr c11 + lda AO, 4 * SIZE(AO) + fclr c15 + + lds $f31, 4 * SIZE(C1) + fclr c04 + lda L, -2(KK) + fclr c08 + + lds $f31, 4 * SIZE(C2) + fclr c12 + fclr c16 + ble KK, $L18 + ble L, $L15 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c07 + + lda BO, 4 * SIZE(BO) + fclr c11 + lda AO, 4 * SIZE(AO) + fclr c15 + + lds $f31, 4 * SIZE(C1) + fclr c04 + lda L, -2(TMP1) + fclr c08 + + lds $f31, 4 * SIZE(C2) + fclr c12 + fclr c16 + ble TMP1, $L18 + ble L, $L15 +#endif + .align 5 + +$L12: +/* 1 */ + ADD1 c11, t1, c11 +#ifndef EV4 + ldq $31, PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + ldl $31, PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD3 c12, t2, c12 + unop + MUL b1, a2, t2 + unop + + ADD2 c16, t3, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD4 c15, t4, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + +/* 2 */ + ADD1 c01, t1, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD3 c02, t2, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD2 c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD1 c03, t1, c03 + unop + MUL b3, a1, t1 + unop + + ADD3 c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD1 c09, t1, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD4 c07, t4, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD1 c11, t1, c11 + unop + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD3 c12, t2, c12 + lda L, -2(L) + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD2 c16, t3, c16 + unop + MUL b2, a2, t3 + unop + + ADD4 c15, t4, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD1 c01, t1, c01 + unop + MUL b5, a6, t1 + unop + + ADD3 c02, t2, c02 + unop + MUL b5, a4, t2 + unop + + ADD2 c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD1 c03, t1, c03 + lda AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD3 c04, t2, c04 + lda BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD1 c09, t1, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD1 c11, t1, c11 + unop + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L17 +#else + blbs TMP1, $L17 +#endif + .align 4 + + ADD3 c12, t2, c12 + MUL b1, a2, t2 + ADD2 c16, t3, c16 + MUL b2, a2, t3 + + ADD4 c15, t4, c15 + MUL b2, a1, t4 + ADD1 c01, t1, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, c02 + unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD2 c06, t3, c06 + MUL b2, a4, t3 + ADD4 c05, t4, c05 + MUL b4, a1, t4 + + ADD1 c03, t1, c03 + unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c09, t1, c09 + unop + MUL b3, a3, t1 + lda AO, 4 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, c07 + unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + + ADD1 c11, t1, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L17: + ADD3 c12, t2, c12 + MUL b1, a2, t2 + ADD2 c16, t3, c16 + MUL b2, a2, t3 + + ADD4 c15, t4, c15 + MUL b2, a1, t4 + ADD1 c01, t1, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, c02 + MUL b1, a4, t2 + ADD2 c06, t3, c06 + MUL b2, a4, t3 + + ADD4 c05, t4, c05 + MUL b4, a1, t4 + ADD1 c03, t1, c03 + MUL b3, a1, t1 + + ADD3 c04, t2, c04 + MUL b3, a2, t2 + ADD2 c08, t3, c08 + MUL b4, a2, t3 + + ADD4 c13, t4, c13 + MUL b2, a3, t4 + ADD1 c09, t1, c09 + MUL b3, a3, t1 + + ADD3 c10, t2, c10 + MUL b3, a4, t2 + ADD2 c14, t3, c14 + MUL b4, a4, t3 + + ADD4 c07, t4, c07 + lda AO, 4 * SIZE(AO) + MUL b4, a3, t4 + lda BO, 4 * SIZE(BO) + + ADD1 c11, t1, c11 + ADD3 c12, t2, c12 + ADD2 c16, t3, c16 + ADD4 c15, t4, c15 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c03, c08, c03 + ADD c04, c07, c04 + + ADD c09, c14, c09 + ADD c10, c13, c10 + ADD c11, c16, c11 + ADD c12, c15, c12 + .align 4 + +$L18: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -4 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 + SUB a4, c10, c10 + + SUB b1, c03, c03 + SUB b2, c04, c04 + SUB b3, c11, c11 + SUB b4, c12, c12 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 + + SUB b1, c09, c09 + SUB b2, c10, c10 + SUB b3, c11, c11 + SUB b4, c12, c12 +#endif + +#ifdef LN + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c03, c03 + MUL a1, c04, c04 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 + + MUL a3, c03, t1 + MUL a3, c04, t2 + MUL a3, c11, t3 + MUL a3, c12, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c09, t3, c09 + SUB c10, t4, c10 + + MUL a4, c04, t1 + MUL a4, c03, t2 + MUL a4, c12, t3 + MUL a4, c11, t4 + + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + ADD6 c09, t3, c09 + ADD5 c10, t4, c10 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c09, t3, c09 + ADD6 c10, t4, c10 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c09, t3, c09 + ADD6 c10, t4, c10 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c09, t3 + MUL a3, c10, t4 + + SUB c03, t1, c03 + SUB c04, t2, c04 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a4, c02, t1 + MUL a4, c01, t2 + MUL a4, c10, t3 + MUL a4, c09, t4 + + ADD6 c03, t1, c03 + ADD5 c04, t2, c04 + ADD6 c11, t3, c11 + ADD5 c12, t4, c12 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c03, c03 + MUL a1, c04, c04 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c03, t3, c03 + ADD6 c04, t4, c04 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c03, t3 + MUL a3, c04, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a4, c02, t1 + MUL a4, c01, t2 + MUL a4, c04, t3 + MUL a4, c03, t4 + + ADD6 c09, t1, c09 + ADD5 c10, t2, c10 + ADD6 c11, t3, c11 + ADD5 c12, t4, c12 + + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 +#endif + +#ifdef RT + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + LD a3, 4 * SIZE(BO) + LD a4, 5 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 + + MUL a3, c09, t1 + MUL a3, c10, t2 + MUL a3, c11, t3 + MUL a3, c12, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + MUL a4, c10, t1 + MUL a4, c09, t2 + MUL a4, c12, t3 + MUL a4, c11, t4 + + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + ADD6 c03, t3, c03 + ADD5 c04, t4, c04 + + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c03, t3, c03 + ADD6 c04, t4, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c10, 3 * SIZE(BO) + + ST c03, 4 * SIZE(BO) + ST c04, 5 * SIZE(BO) + ST c11, 6 * SIZE(BO) + ST c12, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c09, 4 * SIZE(AO) + ST c10, 5 * SIZE(AO) + ST c11, 6 * SIZE(AO) + ST c12, 7 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) + lda C2, -4 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c09, 0 * SIZE(C2) + ST c10, 1 * SIZE(C2) + ST c11, 2 * SIZE(C2) + ST c12, 3 * SIZE(C2) + +#ifndef LN + lda C1, 4 * SIZE(C1) + lda C2, 4 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 1, TMP1 + addq AO, TMP1, AO + addq BO, TMP1, BO +#endif + +#ifdef LT + addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + fclr c01 + fclr c05 + + lda I, -1(I) + bgt I, $L11 + .align 4 + +$L29: +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + addq B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 2, KK +#endif + +#ifdef RT + subq KK, 2, KK +#endif + + lda J, -1(J) + bgt J, $L01 + .align 4 + +$L30: + and N, 1, J + ble J, $L999 + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + subq B, TMP1, B + + subq C, LDC, C1 + subq C, LDC, C +#else + mov C, C1 + addq C, LDC, C +#endif + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + and M, 1, I + ble I, $L50 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + lda AO, 2 * SIZE(AO) + lda BO, 2 * SIZE(B) + + lda L, -2(KK) + + ble KK, $L58 + ble L, $L55 +#else +#ifdef LN + sll K, ZBASE_SHIFT, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT, TMP1 + addq AORIG, TMP1, AO + sll KK, ZBASE_SHIFT, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + lda AO, 2 * SIZE(AO) + lda BO, 2 * SIZE(BO) + + lda L, -2(TMP1) + + ble TMP1, $L58 + ble L, $L55 +#endif + .align 5 + +$L52: + ADD1 c01, t1, c01 + unop + MUL a1, b1, t1 + unop + + ADD3 c02, t2, c02 + lda AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD b1, 2 * SIZE(BO) + + ADD4 c05, t3, c05 + lda L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c01, t1, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + lda BO, 4 * SIZE(BO) + + ADD3 c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L52 + .align 4 + +$L55: + ADD1 c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L57 +#else + blbs TMP1, $L57 +#endif + .align 4 + + ADD3 c02, t2, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c05, t3, c05 + lda BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD1 c01, t1, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + lda AO, 2 * SIZE(AO) + .align 4 + +$L57: + ADD3 c02, t2, c02 + MUL a2, b1, t2 + ADD4 c05, t3, c05 + MUL a1, b2, t3 + + ADD2 c06, t4, c06 + lda AO, 2 * SIZE(AO) + MUL a2, b2, t4 + lda BO, 2 * SIZE(BO) + + ADD1 c01, t1, c01 + ADD3 c02, t2, c02 + ADD4 c05, t3, c05 + ADD2 c06, t4, c06 + + ADD c01, c06, c01 + ADD c02, c05, c02 + +$L58: +#if defined(LN) || defined(RT) + subq KK, 1, TMP1 + + sll TMP1, ZBASE_SHIFT, TMP2 + addq AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addq B, TMP2, BO +#else + lda AO, -2 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + +#ifndef LN + lda C1, 2 * SIZE(C1) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, ZBASE_SHIFT, TMP2 + addq AO, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L50: + sra M, 1, I + ble I, $L59 + .align 4 + +$L41: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + lda BO, 2 * SIZE(B) + fclr c03 + lda AO, 4 * SIZE(AO) + fclr c07 + + lda L, -2(KK) + fclr c04 + fclr c08 + + ble KK, $L48 + ble L, $L45 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + sll KK, ZBASE_SHIFT, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + lda BO, 2 * SIZE(BO) + fclr c03 + lda AO, 4 * SIZE(AO) + fclr c07 + + lda L, -2(TMP1) + fclr c04 + fclr c08 + + ble TMP1, $L48 + ble L, $L45 +#endif + .align 5 + +$L42: + ADD4 c05, t1, c05 + unop + MUL a1, b1, t1 + unop + + ADD2 c06, t2, c06 + lda L, -2(L) + MUL a2, b1, t2 + unop + + ADD4 c07, t3, c07 + unop + MUL a3, b1, t3 + unop + + ADD2 c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, c02 + lda BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + + ADD4 c05, t1, c05 + unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD2 c06, t2, c06 + unop + MUL a2, b3, t2 + unop + + ADD4 c07, t3, c07 + unop + MUL a3, b3, t3 + lda AO, 8 * SIZE(AO) + + ADD2 c08, t4, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD1 c03, t3, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD3 c04, t4, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L42 + .align 4 + +$L45: + ADD4 c05, t1, c05 + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L47 +#else + blbs TMP1, $L47 +#endif + .align 4 + + ADD2 c06, t2, c06 + MUL a2, b1, t2 + ADD4 c07, t3, c07 + MUL a3, b1, t3 + + ADD2 c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + lda AO, 4 * SIZE(AO) + + ADD4 c05, t1, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 2 * SIZE(BO) + .align 4 + +$L47: + ADD2 c06, t2, c06 + MUL a2, b1, t2 + ADD4 c07, t3, c07 + MUL a3, b1, t3 + + ADD2 c08, t4, c08 + MUL a4, b1, t4 + ADD1 c01, t1, c01 + MUL a1, b2, t1 + + ADD3 c02, t2, c02 + MUL a2, b2, t2 + ADD1 c03, t3, c03 + MUL a3, b2, t3 + + ADD3 c04, t4, c04 + lda AO, 4 * SIZE(AO) + MUL a4, b2, t4 + lda BO, 2 * SIZE(BO) + + ADD4 c05, t1, c05 + ADD2 c06, t2, c06 + ADD4 c07, t3, c07 + ADD2 c08, t4, c08 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c03, c08, c03 + ADD c04, c07, c04 + +$L48: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 1, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addq B, TMP2, BO +#else + lda AO, -4 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#endif + +#ifdef LN + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 + MUL a3, c03, t1 + MUL a3, c04, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + MUL a4, c04, t1 + MUL a4, c03, t2 + + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + MUL a3, c01, t1 + MUL a3, c02, t2 + + SUB c03, t1, c03 + SUB c04, t2, c04 + + MUL a4, c02, t1 + MUL a4, c01, t2 + ADD6 c03, t1, c03 + ADD5 c04, t2, c04 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c03, t3, c03 + ADD6 c04, t4, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c03, 2 * SIZE(BO) + ST c04, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + +#ifndef LN + lda C1, 4 * SIZE(C1) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + + lda I, -1(I) + bgt I, $L41 + .align 4 + +$L59: +#ifdef LN + sll K, ZBASE_SHIFT, TMP1 + addq B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 1, KK +#endif + +#ifdef RT + subq KK, 1, KK +#endif + .align 4 + +$L999: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + ldt $f9, 56($sp) + clr $0 + lda $sp, STACKSIZE($sp) + ret + .ident VERSION + .end CNAME diff --git a/kernel/alpha/ztrsm_kernel_2x2_LT.S b/kernel/alpha/ztrsm_kernel_2x2_LT.S new file mode 100644 index 0000000..e6ffc0f --- /dev/null +++ b/kernel/alpha/ztrsm_kernel_2x2_LT.S @@ -0,0 +1,2230 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#if !defined(EV4) && !defined(EV5) && !defined(EV6) +#error "Architecture is not specified." +#endif + +#ifdef EV6 +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + +#ifdef EV5 +#define PREFETCHSIZE 48 +#define UNOP +#endif + +#ifdef EV4 +#define UNOP +#endif + + .set noat + .set noreorder + .arch ev6 + +.text + .align 5 + .globl CNAME + .ent CNAME + +#define STACKSIZE 80 + +#define M $16 +#define N $17 +#define K $18 +#define A $21 +#define B $22 +#define C $20 +#define LDC $23 + +#define C1 $19 +#define C2 $24 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha_i $f29 +#define alpha_r $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define AORIG $3 +#define OFFSET $4 + +#if defined(LN) || defined(LT) +#ifndef CONJ +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#define ADD5 SUB +#define ADD6 ADD +#else +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 SUB +#define ADD4 ADD +#define ADD5 ADD +#define ADD6 SUB +#endif +#else +#ifndef CONJ +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#define ADD5 SUB +#define ADD6 ADD +#else +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 ADD +#define ADD4 SUB +#define ADD5 ADD +#define ADD6 SUB +#endif +#endif + + +CNAME: + .frame $sp, STACKSIZE, $26, 0 + +#ifdef PROFILE + ldgp $gp, 0($27) + lda $at, _mcount + jsr $at, ($at), _mcount +#endif + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + lda $sp, -STACKSIZE($sp) + + ldq B, 0 + STACKSIZE($sp) + ldq C, 8 + STACKSIZE($sp) + ldq LDC, 16 + STACKSIZE($sp) + ldq OFFSET, 24 + STACKSIZE($sp) + + sll LDC, ZBASE_SHIFT, LDC + + stt $f2, 0($sp) + stt $f3, 8($sp) + stt $f4, 16($sp) + stt $f5, 24($sp) + stt $f6, 32($sp) + stt $f7, 40($sp) + stt $f8, 48($sp) + stt $f9, 56($sp) + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#ifdef LN + addq M, M, TMP2 + mulq TMP2, K, TMP1 + SXADDQ TMP1, A, A + SXADDQ TMP2, C, C +#endif + +#ifdef RN + negq OFFSET, KK +#endif + +#ifdef RT + mulq N, K, TMP1 + addq TMP1, TMP1, TMP1 + SXADDQ TMP1, B, B + + mulq N, LDC, TMP1 + addq TMP1, C, C + + subq N, OFFSET, KK +#endif + + sra N, 1, J + ble J, $L30 + .align 4 + +$L01: +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + subq B, TMP1, B + + subq C, LDC, C2 + subq C2, LDC, C1 + subq C2, LDC, C +#else + mov C, C1 + addq C, LDC, C2 + addq C2, LDC, C +#endif + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + fclr t1 + fclr t2 + fclr t3 + fclr t4 + + fclr c01 + fclr c05 + + ble I, $L20 + .align 4 + +$L11: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c07 + + lda BO, 4 * SIZE(B) + fclr c11 + lda AO, 4 * SIZE(AO) + fclr c15 + + lds $f31, 4 * SIZE(C1) + fclr c04 + lda L, -2(KK) + fclr c08 + + lds $f31, 4 * SIZE(C2) + fclr c12 + fclr c16 + ble KK, $L18 + ble L, $L15 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c07 + + lda BO, 4 * SIZE(BO) + fclr c11 + lda AO, 4 * SIZE(AO) + fclr c15 + + lds $f31, 4 * SIZE(C1) + fclr c04 + lda L, -2(TMP1) + fclr c08 + + lds $f31, 4 * SIZE(C2) + fclr c12 + fclr c16 + ble TMP1, $L18 + ble L, $L15 +#endif + .align 5 + +$L12: +/* 1 */ + ADD1 c11, t1, c11 +#ifndef EV4 + ldq $31, PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + ldl $31, PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD3 c12, t2, c12 + unop + MUL b1, a2, t2 + unop + + ADD2 c16, t3, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD4 c15, t4, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + +/* 2 */ + ADD1 c01, t1, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD3 c02, t2, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD2 c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD1 c03, t1, c03 + unop + MUL b3, a1, t1 + unop + + ADD3 c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD1 c09, t1, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD4 c07, t4, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD1 c11, t1, c11 + unop + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD3 c12, t2, c12 + lda L, -2(L) + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD2 c16, t3, c16 + unop + MUL b2, a2, t3 + unop + + ADD4 c15, t4, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD1 c01, t1, c01 + unop + MUL b5, a6, t1 + unop + + ADD3 c02, t2, c02 + unop + MUL b5, a4, t2 + unop + + ADD2 c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD1 c03, t1, c03 + lda AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD3 c04, t2, c04 + lda BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD1 c09, t1, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD1 c11, t1, c11 + unop + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L17 +#else + blbs TMP1, $L17 +#endif + .align 4 + + ADD3 c12, t2, c12 + MUL b1, a2, t2 + ADD2 c16, t3, c16 + MUL b2, a2, t3 + + ADD4 c15, t4, c15 + MUL b2, a1, t4 + ADD1 c01, t1, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, c02 + unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD2 c06, t3, c06 + MUL b2, a4, t3 + ADD4 c05, t4, c05 + MUL b4, a1, t4 + + ADD1 c03, t1, c03 + unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c09, t1, c09 + unop + MUL b3, a3, t1 + lda AO, 4 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, c07 + unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + + ADD1 c11, t1, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L17: + ADD3 c12, t2, c12 + MUL b1, a2, t2 + ADD2 c16, t3, c16 + MUL b2, a2, t3 + + ADD4 c15, t4, c15 + MUL b2, a1, t4 + ADD1 c01, t1, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, c02 + MUL b1, a4, t2 + ADD2 c06, t3, c06 + MUL b2, a4, t3 + + ADD4 c05, t4, c05 + MUL b4, a1, t4 + ADD1 c03, t1, c03 + MUL b3, a1, t1 + + ADD3 c04, t2, c04 + MUL b3, a2, t2 + ADD2 c08, t3, c08 + MUL b4, a2, t3 + + ADD4 c13, t4, c13 + MUL b2, a3, t4 + ADD1 c09, t1, c09 + MUL b3, a3, t1 + + ADD3 c10, t2, c10 + MUL b3, a4, t2 + ADD2 c14, t3, c14 + MUL b4, a4, t3 + + ADD4 c07, t4, c07 + lda AO, 4 * SIZE(AO) + MUL b4, a3, t4 + lda BO, 4 * SIZE(BO) + + ADD1 c11, t1, c11 + ADD3 c12, t2, c12 + ADD2 c16, t3, c16 + ADD4 c15, t4, c15 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c03, c08, c03 + ADD c04, c07, c04 + + ADD c09, c14, c09 + ADD c10, c13, c10 + ADD c11, c16, c11 + ADD c12, c15, c12 + .align 4 + +$L18: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -4 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 + SUB a4, c10, c10 + + SUB b1, c03, c03 + SUB b2, c04, c04 + SUB b3, c11, c11 + SUB b4, c12, c12 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 + + SUB b1, c09, c09 + SUB b2, c10, c10 + SUB b3, c11, c11 + SUB b4, c12, c12 +#endif + +#ifdef LN + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c03, c03 + MUL a1, c04, c04 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 + + MUL a3, c03, t1 + MUL a3, c04, t2 + MUL a3, c11, t3 + MUL a3, c12, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c09, t3, c09 + SUB c10, t4, c10 + + MUL a4, c04, t1 + MUL a4, c03, t2 + MUL a4, c12, t3 + MUL a4, c11, t4 + + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + ADD6 c09, t3, c09 + ADD5 c10, t4, c10 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c09, t3, c09 + ADD6 c10, t4, c10 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c09, t3, c09 + ADD6 c10, t4, c10 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c09, t3 + MUL a3, c10, t4 + + SUB c03, t1, c03 + SUB c04, t2, c04 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a4, c02, t1 + MUL a4, c01, t2 + MUL a4, c10, t3 + MUL a4, c09, t4 + + ADD6 c03, t1, c03 + ADD5 c04, t2, c04 + ADD6 c11, t3, c11 + ADD5 c12, t4, c12 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c03, c03 + MUL a1, c04, c04 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c03, t3, c03 + ADD6 c04, t4, c04 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c03, t3 + MUL a3, c04, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a4, c02, t1 + MUL a4, c01, t2 + MUL a4, c04, t3 + MUL a4, c03, t4 + + ADD6 c09, t1, c09 + ADD5 c10, t2, c10 + ADD6 c11, t3, c11 + ADD5 c12, t4, c12 + + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 +#endif + +#ifdef RT + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + LD a3, 4 * SIZE(BO) + LD a4, 5 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 + + MUL a3, c09, t1 + MUL a3, c10, t2 + MUL a3, c11, t3 + MUL a3, c12, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + MUL a4, c10, t1 + MUL a4, c09, t2 + MUL a4, c12, t3 + MUL a4, c11, t4 + + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + ADD6 c03, t3, c03 + ADD5 c04, t4, c04 + + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c03, t3, c03 + ADD6 c04, t4, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c10, 3 * SIZE(BO) + + ST c03, 4 * SIZE(BO) + ST c04, 5 * SIZE(BO) + ST c11, 6 * SIZE(BO) + ST c12, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c09, 4 * SIZE(AO) + ST c10, 5 * SIZE(AO) + ST c11, 6 * SIZE(AO) + ST c12, 7 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) + lda C2, -4 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c09, 0 * SIZE(C2) + ST c10, 1 * SIZE(C2) + ST c11, 2 * SIZE(C2) + ST c12, 3 * SIZE(C2) + +#ifndef LN + lda C1, 4 * SIZE(C1) + lda C2, 4 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 1, TMP1 + addq AO, TMP1, AO + addq BO, TMP1, BO +#endif + +#ifdef LT + addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + fclr c01 + fclr c05 + + lda I, -1(I) + bgt I, $L11 + .align 4 + +$L20: + and M, 1, I + ble I, $L29 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + lda AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(B) + lda BO, 4 * SIZE(B) + + lda L, -2(KK) + + ble KK, $L28 + ble L, $L25 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 0, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 0, TMP1 + addq AORIG, TMP1, AO + sll KK, ZBASE_SHIFT + 1, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + lda AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(BO) + lda BO, 4 * SIZE(BO) + + lda L, -2(TMP1) + + ble TMP1, $L28 + ble L, $L25 +#endif + .align 5 + +$L22: + ADD1 c09, t1, c09 + unop + MUL a1, b1, t1 + unop + + ADD3 c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a1, b2, t3 + lda BO, 8 * SIZE(BO) + + ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b3, t1 + unop + + ADD3 c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + + ADD1 c09, t1, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a3, b2, t3 + lda AO, 4 * SIZE(AO) + + ADD2 c14, t4, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD1 c01, t1, c01 + lda L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD3 c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD1 c09, t1, c09 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L27 +#else + blbs TMP1, $L27 +#endif + .align 4 + + ADD3 c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a1, b2, t3 + unop + + ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b3, t1 + lda AO, 2 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c09, t1, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L27: + ADD3 c10, t2, c10 + MUL a2, b1, t2 + ADD4 c13, t3, c13 + MUL a1, b2, t3 + + ADD2 c14, t4, c14 + MUL a2, b2, t4 + ADD1 c01, t1, c01 + MUL a1, b3, t1 + + ADD3 c02, t2, c02 + MUL a2, b3, t2 + ADD4 c05, t3, c05 + MUL a1, b4, t3 + + ADD2 c06, t4, c06 + lda AO, 2 * SIZE(AO) + MUL a2, b4, t4 + lda BO, 4 * SIZE(BO) + + ADD1 c09, t1, c09 + ADD3 c10, t2, c10 + ADD4 c13, t3, c13 + ADD2 c14, t4, c14 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c09, c14, c09 + ADD c10, c13, c10 + .align 4 + +$L28: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 1, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addq AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -2 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 + SUB a4, c10, c10 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 + SUB a4, c10, c10 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c09, t3, c09 + ADD6 c10, t4, c10 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + + MUL a3, c01, t1 + MUL a3, c02, t2 + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL a4, c02, t1 + MUL a4, c01, t2 + ADD6 c09, t1, c09 + ADD5 c10, t2, c10 + + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 +#endif + +#ifdef RT + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + LD a3, 4 * SIZE(BO) + LD a4, 5 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 + + MUL a3, c09, t1 + MUL a3, c10, t2 + SUB c01, t1, c01 + SUB c02, t2, c02 + + MUL a4, c10, t1 + MUL a4, c09, t2 + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c10, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c09, 2 * SIZE(AO) + ST c10, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) + lda C2, -2 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c09, 0 * SIZE(C2) + ST c10, 1 * SIZE(C2) + +#ifndef LN + lda C1, 2 * SIZE(C1) + lda C2, 2 * SIZE(C2) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L29: +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + addq B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 2, KK +#endif + +#ifdef RT + subq KK, 2, KK +#endif + + lda J, -1(J) + bgt J, $L01 + .align 4 + +$L30: + and N, 1, J + ble J, $L999 + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + subq B, TMP1, B + + subq C, LDC, C1 + subq C, LDC, C +#else + mov C, C1 + addq C, LDC, C +#endif + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + ble I, $L50 + .align 4 + +$L41: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + lda BO, 2 * SIZE(B) + fclr c03 + lda AO, 4 * SIZE(AO) + fclr c07 + + lda L, -2(KK) + fclr c04 + fclr c08 + + ble KK, $L48 + ble L, $L45 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + sll KK, ZBASE_SHIFT, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + lda BO, 2 * SIZE(BO) + fclr c03 + lda AO, 4 * SIZE(AO) + fclr c07 + + lda L, -2(TMP1) + fclr c04 + fclr c08 + + ble TMP1, $L48 + ble L, $L45 +#endif + .align 5 + +$L42: + ADD4 c05, t1, c05 + unop + MUL a1, b1, t1 + unop + + ADD2 c06, t2, c06 + lda L, -2(L) + MUL a2, b1, t2 + unop + + ADD4 c07, t3, c07 + unop + MUL a3, b1, t3 + unop + + ADD2 c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, c02 + lda BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + + ADD4 c05, t1, c05 + unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD2 c06, t2, c06 + unop + MUL a2, b3, t2 + unop + + ADD4 c07, t3, c07 + unop + MUL a3, b3, t3 + lda AO, 8 * SIZE(AO) + + ADD2 c08, t4, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD1 c03, t3, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD3 c04, t4, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L42 + .align 4 + +$L45: + ADD4 c05, t1, c05 + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L47 +#else + blbs TMP1, $L47 +#endif + .align 4 + + ADD2 c06, t2, c06 + MUL a2, b1, t2 + ADD4 c07, t3, c07 + MUL a3, b1, t3 + + ADD2 c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + lda AO, 4 * SIZE(AO) + + ADD4 c05, t1, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 2 * SIZE(BO) + .align 4 + +$L47: + ADD2 c06, t2, c06 + MUL a2, b1, t2 + ADD4 c07, t3, c07 + MUL a3, b1, t3 + + ADD2 c08, t4, c08 + MUL a4, b1, t4 + ADD1 c01, t1, c01 + MUL a1, b2, t1 + + ADD3 c02, t2, c02 + MUL a2, b2, t2 + ADD1 c03, t3, c03 + MUL a3, b2, t3 + + ADD3 c04, t4, c04 + lda AO, 4 * SIZE(AO) + MUL a4, b2, t4 + lda BO, 2 * SIZE(BO) + + ADD4 c05, t1, c05 + ADD2 c06, t2, c06 + ADD4 c07, t3, c07 + ADD2 c08, t4, c08 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c03, c08, c03 + ADD c04, c07, c04 + +$L48: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 1, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addq B, TMP2, BO +#else + lda AO, -4 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#endif + +#ifdef LN + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 + MUL a3, c03, t1 + MUL a3, c04, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + MUL a4, c04, t1 + MUL a4, c03, t2 + + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + MUL a3, c01, t1 + MUL a3, c02, t2 + + SUB c03, t1, c03 + SUB c04, t2, c04 + + MUL a4, c02, t1 + MUL a4, c01, t2 + ADD6 c03, t1, c03 + ADD5 c04, t2, c04 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c03, t3, c03 + ADD6 c04, t4, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c03, 2 * SIZE(BO) + ST c04, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + +#ifndef LN + lda C1, 4 * SIZE(C1) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + + lda I, -1(I) + bgt I, $L41 + .align 4 + +$L50: + and M, 1, I + ble I, $L59 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + lda AO, 2 * SIZE(AO) + lda BO, 2 * SIZE(B) + + lda L, -2(KK) + + ble KK, $L58 + ble L, $L55 +#else +#ifdef LN + sll K, ZBASE_SHIFT, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT, TMP1 + addq AORIG, TMP1, AO + sll KK, ZBASE_SHIFT, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + lda AO, 2 * SIZE(AO) + lda BO, 2 * SIZE(BO) + + lda L, -2(TMP1) + + ble TMP1, $L58 + ble L, $L55 +#endif + .align 5 + +$L52: + ADD1 c01, t1, c01 + unop + MUL a1, b1, t1 + unop + + ADD3 c02, t2, c02 + lda AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD b1, 2 * SIZE(BO) + + ADD4 c05, t3, c05 + lda L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c01, t1, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + lda BO, 4 * SIZE(BO) + + ADD3 c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L52 + .align 4 + +$L55: + ADD1 c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L57 +#else + blbs TMP1, $L57 +#endif + .align 4 + + ADD3 c02, t2, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c05, t3, c05 + lda BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD1 c01, t1, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + lda AO, 2 * SIZE(AO) + .align 4 + +$L57: + ADD3 c02, t2, c02 + MUL a2, b1, t2 + ADD4 c05, t3, c05 + MUL a1, b2, t3 + + ADD2 c06, t4, c06 + lda AO, 2 * SIZE(AO) + MUL a2, b2, t4 + lda BO, 2 * SIZE(BO) + + ADD1 c01, t1, c01 + ADD3 c02, t2, c02 + ADD4 c05, t3, c05 + ADD2 c06, t4, c06 + + ADD c01, c06, c01 + ADD c02, c05, c02 + +$L58: +#if defined(LN) || defined(RT) + subq KK, 1, TMP1 + + sll TMP1, ZBASE_SHIFT, TMP2 + addq AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addq B, TMP2, BO +#else + lda AO, -2 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + +#ifndef LN + lda C1, 2 * SIZE(C1) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, ZBASE_SHIFT, TMP2 + addq AO, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L59: +#ifdef LN + sll K, ZBASE_SHIFT, TMP1 + addq B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 1, KK +#endif + +#ifdef RT + subq KK, 1, KK +#endif + .align 4 + +$L999: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + ldt $f9, 56($sp) + clr $0 + lda $sp, STACKSIZE($sp) + ret + .ident VERSION + .end CNAME diff --git a/kernel/alpha/ztrsm_kernel_2x2_RT.S b/kernel/alpha/ztrsm_kernel_2x2_RT.S new file mode 100644 index 0000000..4c490fc --- /dev/null +++ b/kernel/alpha/ztrsm_kernel_2x2_RT.S @@ -0,0 +1,2230 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#if !defined(EV4) && !defined(EV5) && !defined(EV6) +#error "Architecture is not specified." +#endif + +#ifdef EV6 +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + +#ifdef EV5 +#define PREFETCHSIZE 48 +#define UNOP +#endif + +#ifdef EV4 +#define UNOP +#endif + + .set noat + .set noreorder + .arch ev6 + +.text + .align 5 + .globl CNAME + .ent CNAME + +#define STACKSIZE 80 + +#define M $16 +#define N $17 +#define K $18 +#define A $21 +#define B $22 +#define C $20 +#define LDC $23 + +#define C1 $19 +#define C2 $24 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha_i $f29 +#define alpha_r $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define AORIG $3 +#define OFFSET $4 + +#if defined(LN) || defined(LT) +#ifndef CONJ +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#define ADD5 SUB +#define ADD6 ADD +#else +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 SUB +#define ADD4 ADD +#define ADD5 ADD +#define ADD6 SUB +#endif +#else +#ifndef CONJ +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#define ADD5 SUB +#define ADD6 ADD +#else +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 ADD +#define ADD4 SUB +#define ADD5 ADD +#define ADD6 SUB +#endif +#endif + + +CNAME: + .frame $sp, STACKSIZE, $26, 0 + +#ifdef PROFILE + ldgp $gp, 0($27) + lda $at, _mcount + jsr $at, ($at), _mcount +#endif + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + lda $sp, -STACKSIZE($sp) + + ldq B, 0 + STACKSIZE($sp) + ldq C, 8 + STACKSIZE($sp) + ldq LDC, 16 + STACKSIZE($sp) + ldq OFFSET, 24 + STACKSIZE($sp) + + sll LDC, ZBASE_SHIFT, LDC + + stt $f2, 0($sp) + stt $f3, 8($sp) + stt $f4, 16($sp) + stt $f5, 24($sp) + stt $f6, 32($sp) + stt $f7, 40($sp) + stt $f8, 48($sp) + stt $f9, 56($sp) + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#ifdef LN + addq M, M, TMP2 + mulq TMP2, K, TMP1 + SXADDQ TMP1, A, A + SXADDQ TMP2, C, C +#endif + +#ifdef RN + negq OFFSET, KK +#endif + +#ifdef RT + mulq N, K, TMP1 + addq TMP1, TMP1, TMP1 + SXADDQ TMP1, B, B + + mulq N, LDC, TMP1 + addq TMP1, C, C + + subq N, OFFSET, KK +#endif + + and N, 1, J + ble J, $L30 + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + subq B, TMP1, B + + subq C, LDC, C1 + subq C, LDC, C +#else + mov C, C1 + addq C, LDC, C +#endif + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + ble I, $L50 + .align 4 + +$L41: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + lda BO, 2 * SIZE(B) + fclr c03 + lda AO, 4 * SIZE(AO) + fclr c07 + + lda L, -2(KK) + fclr c04 + fclr c08 + + ble KK, $L48 + ble L, $L45 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + sll KK, ZBASE_SHIFT, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + lda BO, 2 * SIZE(BO) + fclr c03 + lda AO, 4 * SIZE(AO) + fclr c07 + + lda L, -2(TMP1) + fclr c04 + fclr c08 + + ble TMP1, $L48 + ble L, $L45 +#endif + .align 5 + +$L42: + ADD4 c05, t1, c05 + unop + MUL a1, b1, t1 + unop + + ADD2 c06, t2, c06 + lda L, -2(L) + MUL a2, b1, t2 + unop + + ADD4 c07, t3, c07 + unop + MUL a3, b1, t3 + unop + + ADD2 c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, c02 + lda BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + + ADD4 c05, t1, c05 + unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD2 c06, t2, c06 + unop + MUL a2, b3, t2 + unop + + ADD4 c07, t3, c07 + unop + MUL a3, b3, t3 + lda AO, 8 * SIZE(AO) + + ADD2 c08, t4, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD1 c03, t3, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD3 c04, t4, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L42 + .align 4 + +$L45: + ADD4 c05, t1, c05 + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L47 +#else + blbs TMP1, $L47 +#endif + .align 4 + + ADD2 c06, t2, c06 + MUL a2, b1, t2 + ADD4 c07, t3, c07 + MUL a3, b1, t3 + + ADD2 c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + lda AO, 4 * SIZE(AO) + + ADD4 c05, t1, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 2 * SIZE(BO) + .align 4 + +$L47: + ADD2 c06, t2, c06 + MUL a2, b1, t2 + ADD4 c07, t3, c07 + MUL a3, b1, t3 + + ADD2 c08, t4, c08 + MUL a4, b1, t4 + ADD1 c01, t1, c01 + MUL a1, b2, t1 + + ADD3 c02, t2, c02 + MUL a2, b2, t2 + ADD1 c03, t3, c03 + MUL a3, b2, t3 + + ADD3 c04, t4, c04 + lda AO, 4 * SIZE(AO) + MUL a4, b2, t4 + lda BO, 2 * SIZE(BO) + + ADD4 c05, t1, c05 + ADD2 c06, t2, c06 + ADD4 c07, t3, c07 + ADD2 c08, t4, c08 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c03, c08, c03 + ADD c04, c07, c04 + +$L48: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 1, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addq B, TMP2, BO +#else + lda AO, -4 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#endif + +#ifdef LN + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 + MUL a3, c03, t1 + MUL a3, c04, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + MUL a4, c04, t1 + MUL a4, c03, t2 + + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + MUL a3, c01, t1 + MUL a3, c02, t2 + + SUB c03, t1, c03 + SUB c04, t2, c04 + + MUL a4, c02, t1 + MUL a4, c01, t2 + ADD6 c03, t1, c03 + ADD5 c04, t2, c04 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c03, t3, c03 + ADD6 c04, t4, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c03, 2 * SIZE(BO) + ST c04, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + +#ifndef LN + lda C1, 4 * SIZE(C1) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + + lda I, -1(I) + bgt I, $L41 + .align 4 + +$L50: + and M, 1, I + ble I, $L59 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + lda AO, 2 * SIZE(AO) + lda BO, 2 * SIZE(B) + + lda L, -2(KK) + + ble KK, $L58 + ble L, $L55 +#else +#ifdef LN + sll K, ZBASE_SHIFT, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT, TMP1 + addq AORIG, TMP1, AO + sll KK, ZBASE_SHIFT, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + lda AO, 2 * SIZE(AO) + lda BO, 2 * SIZE(BO) + + lda L, -2(TMP1) + + ble TMP1, $L58 + ble L, $L55 +#endif + .align 5 + +$L52: + ADD1 c01, t1, c01 + unop + MUL a1, b1, t1 + unop + + ADD3 c02, t2, c02 + lda AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD b1, 2 * SIZE(BO) + + ADD4 c05, t3, c05 + lda L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c01, t1, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + lda BO, 4 * SIZE(BO) + + ADD3 c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L52 + .align 4 + +$L55: + ADD1 c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L57 +#else + blbs TMP1, $L57 +#endif + .align 4 + + ADD3 c02, t2, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c05, t3, c05 + lda BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD1 c01, t1, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + lda AO, 2 * SIZE(AO) + .align 4 + +$L57: + ADD3 c02, t2, c02 + MUL a2, b1, t2 + ADD4 c05, t3, c05 + MUL a1, b2, t3 + + ADD2 c06, t4, c06 + lda AO, 2 * SIZE(AO) + MUL a2, b2, t4 + lda BO, 2 * SIZE(BO) + + ADD1 c01, t1, c01 + ADD3 c02, t2, c02 + ADD4 c05, t3, c05 + ADD2 c06, t4, c06 + + ADD c01, c06, c01 + ADD c02, c05, c02 + +$L58: +#if defined(LN) || defined(RT) + subq KK, 1, TMP1 + + sll TMP1, ZBASE_SHIFT, TMP2 + addq AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addq B, TMP2, BO +#else + lda AO, -2 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + +#ifndef LN + lda C1, 2 * SIZE(C1) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, ZBASE_SHIFT, TMP2 + addq AO, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L59: +#ifdef LN + sll K, ZBASE_SHIFT, TMP1 + addq B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 1, KK +#endif + +#ifdef RT + subq KK, 1, KK +#endif + .align 4 + +$L30: + sra N, 1, J + ble J, $L999 + .align 4 + +$L01: +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + subq B, TMP1, B + + subq C, LDC, C2 + subq C2, LDC, C1 + subq C2, LDC, C +#else + mov C, C1 + addq C, LDC, C2 + addq C2, LDC, C +#endif + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + fclr t1 + fclr t2 + fclr t3 + fclr t4 + + fclr c01 + fclr c05 + + ble I, $L20 + .align 4 + +$L11: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c07 + + lda BO, 4 * SIZE(B) + fclr c11 + lda AO, 4 * SIZE(AO) + fclr c15 + + lds $f31, 4 * SIZE(C1) + fclr c04 + lda L, -2(KK) + fclr c08 + + lds $f31, 4 * SIZE(C2) + fclr c12 + fclr c16 + ble KK, $L18 + ble L, $L15 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c07 + + lda BO, 4 * SIZE(BO) + fclr c11 + lda AO, 4 * SIZE(AO) + fclr c15 + + lds $f31, 4 * SIZE(C1) + fclr c04 + lda L, -2(TMP1) + fclr c08 + + lds $f31, 4 * SIZE(C2) + fclr c12 + fclr c16 + ble TMP1, $L18 + ble L, $L15 +#endif + .align 5 + +$L12: +/* 1 */ + ADD1 c11, t1, c11 +#ifndef EV4 + ldq $31, PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + ldl $31, PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD3 c12, t2, c12 + unop + MUL b1, a2, t2 + unop + + ADD2 c16, t3, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD4 c15, t4, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + +/* 2 */ + ADD1 c01, t1, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD3 c02, t2, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD2 c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD1 c03, t1, c03 + unop + MUL b3, a1, t1 + unop + + ADD3 c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD1 c09, t1, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD4 c07, t4, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD1 c11, t1, c11 + unop + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD3 c12, t2, c12 + lda L, -2(L) + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD2 c16, t3, c16 + unop + MUL b2, a2, t3 + unop + + ADD4 c15, t4, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD1 c01, t1, c01 + unop + MUL b5, a6, t1 + unop + + ADD3 c02, t2, c02 + unop + MUL b5, a4, t2 + unop + + ADD2 c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD1 c03, t1, c03 + lda AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD3 c04, t2, c04 + lda BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD1 c09, t1, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD1 c11, t1, c11 + unop + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L17 +#else + blbs TMP1, $L17 +#endif + .align 4 + + ADD3 c12, t2, c12 + MUL b1, a2, t2 + ADD2 c16, t3, c16 + MUL b2, a2, t3 + + ADD4 c15, t4, c15 + MUL b2, a1, t4 + ADD1 c01, t1, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, c02 + unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD2 c06, t3, c06 + MUL b2, a4, t3 + ADD4 c05, t4, c05 + MUL b4, a1, t4 + + ADD1 c03, t1, c03 + unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c09, t1, c09 + unop + MUL b3, a3, t1 + lda AO, 4 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, c07 + unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + + ADD1 c11, t1, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L17: + ADD3 c12, t2, c12 + MUL b1, a2, t2 + ADD2 c16, t3, c16 + MUL b2, a2, t3 + + ADD4 c15, t4, c15 + MUL b2, a1, t4 + ADD1 c01, t1, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, c02 + MUL b1, a4, t2 + ADD2 c06, t3, c06 + MUL b2, a4, t3 + + ADD4 c05, t4, c05 + MUL b4, a1, t4 + ADD1 c03, t1, c03 + MUL b3, a1, t1 + + ADD3 c04, t2, c04 + MUL b3, a2, t2 + ADD2 c08, t3, c08 + MUL b4, a2, t3 + + ADD4 c13, t4, c13 + MUL b2, a3, t4 + ADD1 c09, t1, c09 + MUL b3, a3, t1 + + ADD3 c10, t2, c10 + MUL b3, a4, t2 + ADD2 c14, t3, c14 + MUL b4, a4, t3 + + ADD4 c07, t4, c07 + lda AO, 4 * SIZE(AO) + MUL b4, a3, t4 + lda BO, 4 * SIZE(BO) + + ADD1 c11, t1, c11 + ADD3 c12, t2, c12 + ADD2 c16, t3, c16 + ADD4 c15, t4, c15 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c03, c08, c03 + ADD c04, c07, c04 + + ADD c09, c14, c09 + ADD c10, c13, c10 + ADD c11, c16, c11 + ADD c12, c15, c12 + .align 4 + +$L18: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -4 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 + SUB a4, c10, c10 + + SUB b1, c03, c03 + SUB b2, c04, c04 + SUB b3, c11, c11 + SUB b4, c12, c12 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 + + SUB b1, c09, c09 + SUB b2, c10, c10 + SUB b3, c11, c11 + SUB b4, c12, c12 +#endif + +#ifdef LN + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c03, c03 + MUL a1, c04, c04 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 + + MUL a3, c03, t1 + MUL a3, c04, t2 + MUL a3, c11, t3 + MUL a3, c12, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c09, t3, c09 + SUB c10, t4, c10 + + MUL a4, c04, t1 + MUL a4, c03, t2 + MUL a4, c12, t3 + MUL a4, c11, t4 + + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + ADD6 c09, t3, c09 + ADD5 c10, t4, c10 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c09, t3, c09 + ADD6 c10, t4, c10 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c09, t3, c09 + ADD6 c10, t4, c10 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c09, t3 + MUL a3, c10, t4 + + SUB c03, t1, c03 + SUB c04, t2, c04 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a4, c02, t1 + MUL a4, c01, t2 + MUL a4, c10, t3 + MUL a4, c09, t4 + + ADD6 c03, t1, c03 + ADD5 c04, t2, c04 + ADD6 c11, t3, c11 + ADD5 c12, t4, c12 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c03, c03 + MUL a1, c04, c04 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c03, t3, c03 + ADD6 c04, t4, c04 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c03, t3 + MUL a3, c04, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a4, c02, t1 + MUL a4, c01, t2 + MUL a4, c04, t3 + MUL a4, c03, t4 + + ADD6 c09, t1, c09 + ADD5 c10, t2, c10 + ADD6 c11, t3, c11 + ADD5 c12, t4, c12 + + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 +#endif + +#ifdef RT + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + LD a3, 4 * SIZE(BO) + LD a4, 5 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 + + MUL a3, c09, t1 + MUL a3, c10, t2 + MUL a3, c11, t3 + MUL a3, c12, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + MUL a4, c10, t1 + MUL a4, c09, t2 + MUL a4, c12, t3 + MUL a4, c11, t4 + + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + ADD6 c03, t3, c03 + ADD5 c04, t4, c04 + + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c03, t3, c03 + ADD6 c04, t4, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c10, 3 * SIZE(BO) + + ST c03, 4 * SIZE(BO) + ST c04, 5 * SIZE(BO) + ST c11, 6 * SIZE(BO) + ST c12, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c09, 4 * SIZE(AO) + ST c10, 5 * SIZE(AO) + ST c11, 6 * SIZE(AO) + ST c12, 7 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) + lda C2, -4 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c09, 0 * SIZE(C2) + ST c10, 1 * SIZE(C2) + ST c11, 2 * SIZE(C2) + ST c12, 3 * SIZE(C2) + +#ifndef LN + lda C1, 4 * SIZE(C1) + lda C2, 4 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 1, TMP1 + addq AO, TMP1, AO + addq BO, TMP1, BO +#endif + +#ifdef LT + addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + fclr c01 + fclr c05 + + lda I, -1(I) + bgt I, $L11 + .align 4 + +$L20: + and M, 1, I + ble I, $L29 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + lda AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(B) + lda BO, 4 * SIZE(B) + + lda L, -2(KK) + + ble KK, $L28 + ble L, $L25 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 0, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 0, TMP1 + addq AORIG, TMP1, AO + sll KK, ZBASE_SHIFT + 1, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + lda AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(BO) + lda BO, 4 * SIZE(BO) + + lda L, -2(TMP1) + + ble TMP1, $L28 + ble L, $L25 +#endif + .align 5 + +$L22: + ADD1 c09, t1, c09 + unop + MUL a1, b1, t1 + unop + + ADD3 c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a1, b2, t3 + lda BO, 8 * SIZE(BO) + + ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b3, t1 + unop + + ADD3 c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + + ADD1 c09, t1, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a3, b2, t3 + lda AO, 4 * SIZE(AO) + + ADD2 c14, t4, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD1 c01, t1, c01 + lda L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD3 c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD1 c09, t1, c09 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L27 +#else + blbs TMP1, $L27 +#endif + .align 4 + + ADD3 c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a1, b2, t3 + unop + + ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b3, t1 + lda AO, 2 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c09, t1, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L27: + ADD3 c10, t2, c10 + MUL a2, b1, t2 + ADD4 c13, t3, c13 + MUL a1, b2, t3 + + ADD2 c14, t4, c14 + MUL a2, b2, t4 + ADD1 c01, t1, c01 + MUL a1, b3, t1 + + ADD3 c02, t2, c02 + MUL a2, b3, t2 + ADD4 c05, t3, c05 + MUL a1, b4, t3 + + ADD2 c06, t4, c06 + lda AO, 2 * SIZE(AO) + MUL a2, b4, t4 + lda BO, 4 * SIZE(BO) + + ADD1 c09, t1, c09 + ADD3 c10, t2, c10 + ADD4 c13, t3, c13 + ADD2 c14, t4, c14 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c09, c14, c09 + ADD c10, c13, c10 + .align 4 + +$L28: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 1, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addq AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -2 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 + SUB a4, c10, c10 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 + SUB a4, c10, c10 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c09, t3, c09 + ADD6 c10, t4, c10 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + + MUL a3, c01, t1 + MUL a3, c02, t2 + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL a4, c02, t1 + MUL a4, c01, t2 + ADD6 c09, t1, c09 + ADD5 c10, t2, c10 + + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 +#endif + +#ifdef RT + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + LD a3, 4 * SIZE(BO) + LD a4, 5 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 + + MUL a3, c09, t1 + MUL a3, c10, t2 + SUB c01, t1, c01 + SUB c02, t2, c02 + + MUL a4, c10, t1 + MUL a4, c09, t2 + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c10, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c09, 2 * SIZE(AO) + ST c10, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) + lda C2, -2 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c09, 0 * SIZE(C2) + ST c10, 1 * SIZE(C2) + +#ifndef LN + lda C1, 2 * SIZE(C1) + lda C2, 2 * SIZE(C2) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L29: +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + addq B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 2, KK +#endif + +#ifdef RT + subq KK, 2, KK +#endif + + lda J, -1(J) + bgt J, $L01 + .align 4 + +$L999: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + ldt $f9, 56($sp) + clr $0 + lda $sp, STACKSIZE($sp) + ret + .ident VERSION + .end CNAME diff --git a/kernel/generic/._cabs.c b/kernel/generic/._cabs.c new file mode 100644 index 0000000..e8b1f4d Binary files /dev/null and b/kernel/generic/._cabs.c differ diff --git a/kernel/generic/._gemm_beta.c b/kernel/generic/._gemm_beta.c new file mode 100644 index 0000000..a9d6a91 Binary files /dev/null and b/kernel/generic/._gemm_beta.c differ diff --git a/kernel/generic/._gemm_ncopy_1.c b/kernel/generic/._gemm_ncopy_1.c new file mode 100644 index 0000000..b12e943 Binary files /dev/null and b/kernel/generic/._gemm_ncopy_1.c differ diff --git a/kernel/generic/._gemm_ncopy_16.c b/kernel/generic/._gemm_ncopy_16.c new file mode 100644 index 0000000..7b63153 Binary files /dev/null and b/kernel/generic/._gemm_ncopy_16.c differ diff --git a/kernel/generic/._gemm_ncopy_2.c b/kernel/generic/._gemm_ncopy_2.c new file mode 100644 index 0000000..e8ed4e2 Binary files /dev/null and b/kernel/generic/._gemm_ncopy_2.c differ diff --git a/kernel/generic/._gemm_ncopy_4.c b/kernel/generic/._gemm_ncopy_4.c new file mode 100644 index 0000000..7688c8c Binary files /dev/null and b/kernel/generic/._gemm_ncopy_4.c differ diff --git a/kernel/generic/._gemm_ncopy_8.c b/kernel/generic/._gemm_ncopy_8.c new file mode 100644 index 0000000..50e9488 Binary files /dev/null and b/kernel/generic/._gemm_ncopy_8.c differ diff --git a/kernel/generic/._gemm_tcopy_1.c b/kernel/generic/._gemm_tcopy_1.c new file mode 100644 index 0000000..ac59323 Binary files /dev/null and b/kernel/generic/._gemm_tcopy_1.c differ diff --git a/kernel/generic/._gemm_tcopy_16.c b/kernel/generic/._gemm_tcopy_16.c new file mode 100644 index 0000000..cf42a84 Binary files /dev/null and b/kernel/generic/._gemm_tcopy_16.c differ diff --git a/kernel/generic/._gemm_tcopy_2.c b/kernel/generic/._gemm_tcopy_2.c new file mode 100644 index 0000000..fe3ee64 Binary files /dev/null and b/kernel/generic/._gemm_tcopy_2.c differ diff --git a/kernel/generic/._gemm_tcopy_4.c b/kernel/generic/._gemm_tcopy_4.c new file mode 100644 index 0000000..3986e63 Binary files /dev/null and b/kernel/generic/._gemm_tcopy_4.c differ diff --git a/kernel/generic/._gemm_tcopy_8.c b/kernel/generic/._gemm_tcopy_8.c new file mode 100644 index 0000000..91c277b Binary files /dev/null and b/kernel/generic/._gemm_tcopy_8.c differ diff --git a/kernel/generic/._ger.c b/kernel/generic/._ger.c new file mode 100644 index 0000000..2150ccc Binary files /dev/null and b/kernel/generic/._ger.c differ diff --git a/kernel/generic/._laswp_ncopy_1.c b/kernel/generic/._laswp_ncopy_1.c new file mode 100644 index 0000000..2dee1e6 Binary files /dev/null and b/kernel/generic/._laswp_ncopy_1.c differ diff --git a/kernel/generic/._laswp_ncopy_2.c b/kernel/generic/._laswp_ncopy_2.c new file mode 100644 index 0000000..cb2b7e7 Binary files /dev/null and b/kernel/generic/._laswp_ncopy_2.c differ diff --git a/kernel/generic/._laswp_ncopy_4.c b/kernel/generic/._laswp_ncopy_4.c new file mode 100644 index 0000000..a6caaf9 Binary files /dev/null and b/kernel/generic/._laswp_ncopy_4.c differ diff --git a/kernel/generic/._laswp_ncopy_8.c b/kernel/generic/._laswp_ncopy_8.c new file mode 100644 index 0000000..0982d23 Binary files /dev/null and b/kernel/generic/._laswp_ncopy_8.c differ diff --git a/kernel/generic/._lsame.c b/kernel/generic/._lsame.c new file mode 100644 index 0000000..3ffc831 Binary files /dev/null and b/kernel/generic/._lsame.c differ diff --git a/kernel/generic/._neg_tcopy_1.c b/kernel/generic/._neg_tcopy_1.c new file mode 100644 index 0000000..a902a0a Binary files /dev/null and b/kernel/generic/._neg_tcopy_1.c differ diff --git a/kernel/generic/._neg_tcopy_16.c b/kernel/generic/._neg_tcopy_16.c new file mode 100644 index 0000000..3861ca6 Binary files /dev/null and b/kernel/generic/._neg_tcopy_16.c differ diff --git a/kernel/generic/._neg_tcopy_2.c b/kernel/generic/._neg_tcopy_2.c new file mode 100644 index 0000000..6cd6dee Binary files /dev/null and b/kernel/generic/._neg_tcopy_2.c differ diff --git a/kernel/generic/._neg_tcopy_4.c b/kernel/generic/._neg_tcopy_4.c new file mode 100644 index 0000000..2a546b5 Binary files /dev/null and b/kernel/generic/._neg_tcopy_4.c differ diff --git a/kernel/generic/._neg_tcopy_8.c b/kernel/generic/._neg_tcopy_8.c new file mode 100644 index 0000000..d4e893f Binary files /dev/null and b/kernel/generic/._neg_tcopy_8.c differ diff --git a/kernel/generic/._symm_lcopy_1.c b/kernel/generic/._symm_lcopy_1.c new file mode 100644 index 0000000..543822f Binary files /dev/null and b/kernel/generic/._symm_lcopy_1.c differ diff --git a/kernel/generic/._symm_lcopy_16.c b/kernel/generic/._symm_lcopy_16.c new file mode 100644 index 0000000..212eea2 Binary files /dev/null and b/kernel/generic/._symm_lcopy_16.c differ diff --git a/kernel/generic/._symm_lcopy_2.c b/kernel/generic/._symm_lcopy_2.c new file mode 100644 index 0000000..9402153 Binary files /dev/null and b/kernel/generic/._symm_lcopy_2.c differ diff --git a/kernel/generic/._symm_lcopy_4.c b/kernel/generic/._symm_lcopy_4.c new file mode 100644 index 0000000..adae99d Binary files /dev/null and b/kernel/generic/._symm_lcopy_4.c differ diff --git a/kernel/generic/._symm_lcopy_8.c b/kernel/generic/._symm_lcopy_8.c new file mode 100644 index 0000000..243ad4c Binary files /dev/null and b/kernel/generic/._symm_lcopy_8.c differ diff --git a/kernel/generic/._symm_ucopy_1.c b/kernel/generic/._symm_ucopy_1.c new file mode 100644 index 0000000..0690252 Binary files /dev/null and b/kernel/generic/._symm_ucopy_1.c differ diff --git a/kernel/generic/._symm_ucopy_16.c b/kernel/generic/._symm_ucopy_16.c new file mode 100644 index 0000000..c847dc5 Binary files /dev/null and b/kernel/generic/._symm_ucopy_16.c differ diff --git a/kernel/generic/._symm_ucopy_2.c b/kernel/generic/._symm_ucopy_2.c new file mode 100644 index 0000000..2b932eb Binary files /dev/null and b/kernel/generic/._symm_ucopy_2.c differ diff --git a/kernel/generic/._symm_ucopy_4.c b/kernel/generic/._symm_ucopy_4.c new file mode 100644 index 0000000..70a5832 Binary files /dev/null and b/kernel/generic/._symm_ucopy_4.c differ diff --git a/kernel/generic/._symm_ucopy_8.c b/kernel/generic/._symm_ucopy_8.c new file mode 100644 index 0000000..944a7f5 Binary files /dev/null and b/kernel/generic/._symm_ucopy_8.c differ diff --git a/kernel/generic/._symv_k.c b/kernel/generic/._symv_k.c new file mode 100644 index 0000000..6f00615 Binary files /dev/null and b/kernel/generic/._symv_k.c differ diff --git a/kernel/generic/._trmm_lncopy_1.c b/kernel/generic/._trmm_lncopy_1.c new file mode 100644 index 0000000..33839c4 Binary files /dev/null and b/kernel/generic/._trmm_lncopy_1.c differ diff --git a/kernel/generic/._trmm_lncopy_16.c b/kernel/generic/._trmm_lncopy_16.c new file mode 100644 index 0000000..b0f7131 Binary files /dev/null and b/kernel/generic/._trmm_lncopy_16.c differ diff --git a/kernel/generic/._trmm_lncopy_2.c b/kernel/generic/._trmm_lncopy_2.c new file mode 100644 index 0000000..b1c5feb Binary files /dev/null and b/kernel/generic/._trmm_lncopy_2.c differ diff --git a/kernel/generic/._trmm_lncopy_4.c b/kernel/generic/._trmm_lncopy_4.c new file mode 100644 index 0000000..d407068 Binary files /dev/null and b/kernel/generic/._trmm_lncopy_4.c differ diff --git a/kernel/generic/._trmm_lncopy_8.c b/kernel/generic/._trmm_lncopy_8.c new file mode 100644 index 0000000..30454fb Binary files /dev/null and b/kernel/generic/._trmm_lncopy_8.c differ diff --git a/kernel/generic/._trmm_ltcopy_1.c b/kernel/generic/._trmm_ltcopy_1.c new file mode 100644 index 0000000..6dc5fe2 Binary files /dev/null and b/kernel/generic/._trmm_ltcopy_1.c differ diff --git a/kernel/generic/._trmm_ltcopy_16.c b/kernel/generic/._trmm_ltcopy_16.c new file mode 100644 index 0000000..4b3e7b2 Binary files /dev/null and b/kernel/generic/._trmm_ltcopy_16.c differ diff --git a/kernel/generic/._trmm_ltcopy_2.c b/kernel/generic/._trmm_ltcopy_2.c new file mode 100644 index 0000000..b66dc44 Binary files /dev/null and b/kernel/generic/._trmm_ltcopy_2.c differ diff --git a/kernel/generic/._trmm_ltcopy_4.c b/kernel/generic/._trmm_ltcopy_4.c new file mode 100644 index 0000000..fd02ea3 Binary files /dev/null and b/kernel/generic/._trmm_ltcopy_4.c differ diff --git a/kernel/generic/._trmm_ltcopy_8.c b/kernel/generic/._trmm_ltcopy_8.c new file mode 100644 index 0000000..60bc4f3 Binary files /dev/null and b/kernel/generic/._trmm_ltcopy_8.c differ diff --git a/kernel/generic/._trmm_uncopy_1.c b/kernel/generic/._trmm_uncopy_1.c new file mode 100644 index 0000000..ed4f2b5 Binary files /dev/null and b/kernel/generic/._trmm_uncopy_1.c differ diff --git a/kernel/generic/._trmm_uncopy_16.c b/kernel/generic/._trmm_uncopy_16.c new file mode 100644 index 0000000..1c13015 Binary files /dev/null and b/kernel/generic/._trmm_uncopy_16.c differ diff --git a/kernel/generic/._trmm_uncopy_2.c b/kernel/generic/._trmm_uncopy_2.c new file mode 100644 index 0000000..7fe7739 Binary files /dev/null and b/kernel/generic/._trmm_uncopy_2.c differ diff --git a/kernel/generic/._trmm_uncopy_4.c b/kernel/generic/._trmm_uncopy_4.c new file mode 100644 index 0000000..03bfe94 Binary files /dev/null and b/kernel/generic/._trmm_uncopy_4.c differ diff --git a/kernel/generic/._trmm_uncopy_8.c b/kernel/generic/._trmm_uncopy_8.c new file mode 100644 index 0000000..d03cfe9 Binary files /dev/null and b/kernel/generic/._trmm_uncopy_8.c differ diff --git a/kernel/generic/._trmm_utcopy_1.c b/kernel/generic/._trmm_utcopy_1.c new file mode 100644 index 0000000..bdebb36 Binary files /dev/null and b/kernel/generic/._trmm_utcopy_1.c differ diff --git a/kernel/generic/._trmm_utcopy_16.c b/kernel/generic/._trmm_utcopy_16.c new file mode 100644 index 0000000..77b607c Binary files /dev/null and b/kernel/generic/._trmm_utcopy_16.c differ diff --git a/kernel/generic/._trmm_utcopy_2.c b/kernel/generic/._trmm_utcopy_2.c new file mode 100644 index 0000000..5558113 Binary files /dev/null and b/kernel/generic/._trmm_utcopy_2.c differ diff --git a/kernel/generic/._trmm_utcopy_4.c b/kernel/generic/._trmm_utcopy_4.c new file mode 100644 index 0000000..0b1b2df Binary files /dev/null and b/kernel/generic/._trmm_utcopy_4.c differ diff --git a/kernel/generic/._trmm_utcopy_8.c b/kernel/generic/._trmm_utcopy_8.c new file mode 100644 index 0000000..540a623 Binary files /dev/null and b/kernel/generic/._trmm_utcopy_8.c differ diff --git a/kernel/generic/._trsm_kernel_LN.c b/kernel/generic/._trsm_kernel_LN.c new file mode 100644 index 0000000..2f471f8 Binary files /dev/null and b/kernel/generic/._trsm_kernel_LN.c differ diff --git a/kernel/generic/._trsm_kernel_LT.c b/kernel/generic/._trsm_kernel_LT.c new file mode 100644 index 0000000..95c73b6 Binary files /dev/null and b/kernel/generic/._trsm_kernel_LT.c differ diff --git a/kernel/generic/._trsm_kernel_RN.c b/kernel/generic/._trsm_kernel_RN.c new file mode 100644 index 0000000..99f3616 Binary files /dev/null and b/kernel/generic/._trsm_kernel_RN.c differ diff --git a/kernel/generic/._trsm_kernel_RT.c b/kernel/generic/._trsm_kernel_RT.c new file mode 100644 index 0000000..8383e61 Binary files /dev/null and b/kernel/generic/._trsm_kernel_RT.c differ diff --git a/kernel/generic/._trsm_lncopy_1.c b/kernel/generic/._trsm_lncopy_1.c new file mode 100644 index 0000000..e07cb25 Binary files /dev/null and b/kernel/generic/._trsm_lncopy_1.c differ diff --git a/kernel/generic/._trsm_lncopy_16.c b/kernel/generic/._trsm_lncopy_16.c new file mode 100644 index 0000000..3f63487 Binary files /dev/null and b/kernel/generic/._trsm_lncopy_16.c differ diff --git a/kernel/generic/._trsm_lncopy_2.c b/kernel/generic/._trsm_lncopy_2.c new file mode 100644 index 0000000..173ef0a Binary files /dev/null and b/kernel/generic/._trsm_lncopy_2.c differ diff --git a/kernel/generic/._trsm_lncopy_4.c b/kernel/generic/._trsm_lncopy_4.c new file mode 100644 index 0000000..c7e99f2 Binary files /dev/null and b/kernel/generic/._trsm_lncopy_4.c differ diff --git a/kernel/generic/._trsm_lncopy_8.c b/kernel/generic/._trsm_lncopy_8.c new file mode 100644 index 0000000..fae6f45 Binary files /dev/null and b/kernel/generic/._trsm_lncopy_8.c differ diff --git a/kernel/generic/._trsm_ltcopy_1.c b/kernel/generic/._trsm_ltcopy_1.c new file mode 100644 index 0000000..9e39540 Binary files /dev/null and b/kernel/generic/._trsm_ltcopy_1.c differ diff --git a/kernel/generic/._trsm_ltcopy_16.c b/kernel/generic/._trsm_ltcopy_16.c new file mode 100644 index 0000000..4d81eb4 Binary files /dev/null and b/kernel/generic/._trsm_ltcopy_16.c differ diff --git a/kernel/generic/._trsm_ltcopy_2.c b/kernel/generic/._trsm_ltcopy_2.c new file mode 100644 index 0000000..79e06fd Binary files /dev/null and b/kernel/generic/._trsm_ltcopy_2.c differ diff --git a/kernel/generic/._trsm_ltcopy_4.c b/kernel/generic/._trsm_ltcopy_4.c new file mode 100644 index 0000000..d600c79 Binary files /dev/null and b/kernel/generic/._trsm_ltcopy_4.c differ diff --git a/kernel/generic/._trsm_ltcopy_8.c b/kernel/generic/._trsm_ltcopy_8.c new file mode 100644 index 0000000..0b66327 Binary files /dev/null and b/kernel/generic/._trsm_ltcopy_8.c differ diff --git a/kernel/generic/._trsm_uncopy_1.c b/kernel/generic/._trsm_uncopy_1.c new file mode 100644 index 0000000..2070bf8 Binary files /dev/null and b/kernel/generic/._trsm_uncopy_1.c differ diff --git a/kernel/generic/._trsm_uncopy_16.c b/kernel/generic/._trsm_uncopy_16.c new file mode 100644 index 0000000..659727d Binary files /dev/null and b/kernel/generic/._trsm_uncopy_16.c differ diff --git a/kernel/generic/._trsm_uncopy_2.c b/kernel/generic/._trsm_uncopy_2.c new file mode 100644 index 0000000..0dd6bc6 Binary files /dev/null and b/kernel/generic/._trsm_uncopy_2.c differ diff --git a/kernel/generic/._trsm_uncopy_4.c b/kernel/generic/._trsm_uncopy_4.c new file mode 100644 index 0000000..fe7c680 Binary files /dev/null and b/kernel/generic/._trsm_uncopy_4.c differ diff --git a/kernel/generic/._trsm_uncopy_8.c b/kernel/generic/._trsm_uncopy_8.c new file mode 100644 index 0000000..a8b7e19 Binary files /dev/null and b/kernel/generic/._trsm_uncopy_8.c differ diff --git a/kernel/generic/._trsm_utcopy_1.c b/kernel/generic/._trsm_utcopy_1.c new file mode 100644 index 0000000..81d5539 Binary files /dev/null and b/kernel/generic/._trsm_utcopy_1.c differ diff --git a/kernel/generic/._trsm_utcopy_16.c b/kernel/generic/._trsm_utcopy_16.c new file mode 100644 index 0000000..ae8aa6b Binary files /dev/null and b/kernel/generic/._trsm_utcopy_16.c differ diff --git a/kernel/generic/._trsm_utcopy_2.c b/kernel/generic/._trsm_utcopy_2.c new file mode 100644 index 0000000..44da0dc Binary files /dev/null and b/kernel/generic/._trsm_utcopy_2.c differ diff --git a/kernel/generic/._trsm_utcopy_4.c b/kernel/generic/._trsm_utcopy_4.c new file mode 100644 index 0000000..c75adf2 Binary files /dev/null and b/kernel/generic/._trsm_utcopy_4.c differ diff --git a/kernel/generic/._trsm_utcopy_8.c b/kernel/generic/._trsm_utcopy_8.c new file mode 100644 index 0000000..409b0dd Binary files /dev/null and b/kernel/generic/._trsm_utcopy_8.c differ diff --git a/kernel/generic/._zgemm3m_ncopy_1.c b/kernel/generic/._zgemm3m_ncopy_1.c new file mode 100644 index 0000000..b5954ae Binary files /dev/null and b/kernel/generic/._zgemm3m_ncopy_1.c differ diff --git a/kernel/generic/._zgemm3m_ncopy_2.c b/kernel/generic/._zgemm3m_ncopy_2.c new file mode 100644 index 0000000..34e0f76 Binary files /dev/null and b/kernel/generic/._zgemm3m_ncopy_2.c differ diff --git a/kernel/generic/._zgemm3m_ncopy_4.c b/kernel/generic/._zgemm3m_ncopy_4.c new file mode 100644 index 0000000..7f36108 Binary files /dev/null and b/kernel/generic/._zgemm3m_ncopy_4.c differ diff --git a/kernel/generic/._zgemm3m_ncopy_8.c b/kernel/generic/._zgemm3m_ncopy_8.c new file mode 100644 index 0000000..2a6212d Binary files /dev/null and b/kernel/generic/._zgemm3m_ncopy_8.c differ diff --git a/kernel/generic/._zgemm3m_tcopy_1.c b/kernel/generic/._zgemm3m_tcopy_1.c new file mode 100644 index 0000000..8d42646 Binary files /dev/null and b/kernel/generic/._zgemm3m_tcopy_1.c differ diff --git a/kernel/generic/._zgemm3m_tcopy_2.c b/kernel/generic/._zgemm3m_tcopy_2.c new file mode 100644 index 0000000..640a277 Binary files /dev/null and b/kernel/generic/._zgemm3m_tcopy_2.c differ diff --git a/kernel/generic/._zgemm3m_tcopy_4.c b/kernel/generic/._zgemm3m_tcopy_4.c new file mode 100644 index 0000000..14094dc Binary files /dev/null and b/kernel/generic/._zgemm3m_tcopy_4.c differ diff --git a/kernel/generic/._zgemm3m_tcopy_8.c b/kernel/generic/._zgemm3m_tcopy_8.c new file mode 100644 index 0000000..fd53453 Binary files /dev/null and b/kernel/generic/._zgemm3m_tcopy_8.c differ diff --git a/kernel/generic/._zgemm_beta.c b/kernel/generic/._zgemm_beta.c new file mode 100644 index 0000000..3cebcc9 Binary files /dev/null and b/kernel/generic/._zgemm_beta.c differ diff --git a/kernel/generic/._zgemm_ncopy_1.c b/kernel/generic/._zgemm_ncopy_1.c new file mode 100644 index 0000000..0c4ab85 Binary files /dev/null and b/kernel/generic/._zgemm_ncopy_1.c differ diff --git a/kernel/generic/._zgemm_ncopy_2.c b/kernel/generic/._zgemm_ncopy_2.c new file mode 100644 index 0000000..d1d880d Binary files /dev/null and b/kernel/generic/._zgemm_ncopy_2.c differ diff --git a/kernel/generic/._zgemm_ncopy_4.c b/kernel/generic/._zgemm_ncopy_4.c new file mode 100644 index 0000000..4c41312 Binary files /dev/null and b/kernel/generic/._zgemm_ncopy_4.c differ diff --git a/kernel/generic/._zgemm_ncopy_8.c b/kernel/generic/._zgemm_ncopy_8.c new file mode 100644 index 0000000..12b274a Binary files /dev/null and b/kernel/generic/._zgemm_ncopy_8.c differ diff --git a/kernel/generic/._zgemm_tcopy_1.c b/kernel/generic/._zgemm_tcopy_1.c new file mode 100644 index 0000000..b85f493 Binary files /dev/null and b/kernel/generic/._zgemm_tcopy_1.c differ diff --git a/kernel/generic/._zgemm_tcopy_2.c b/kernel/generic/._zgemm_tcopy_2.c new file mode 100644 index 0000000..7d7b0e0 Binary files /dev/null and b/kernel/generic/._zgemm_tcopy_2.c differ diff --git a/kernel/generic/._zgemm_tcopy_4.c b/kernel/generic/._zgemm_tcopy_4.c new file mode 100644 index 0000000..5360d09 Binary files /dev/null and b/kernel/generic/._zgemm_tcopy_4.c differ diff --git a/kernel/generic/._zgemm_tcopy_8.c b/kernel/generic/._zgemm_tcopy_8.c new file mode 100644 index 0000000..4103274 Binary files /dev/null and b/kernel/generic/._zgemm_tcopy_8.c differ diff --git a/kernel/generic/._zger.c b/kernel/generic/._zger.c new file mode 100644 index 0000000..4063bd2 Binary files /dev/null and b/kernel/generic/._zger.c differ diff --git a/kernel/generic/._zhemm3m_lcopy_1.c b/kernel/generic/._zhemm3m_lcopy_1.c new file mode 100644 index 0000000..035234a Binary files /dev/null and b/kernel/generic/._zhemm3m_lcopy_1.c differ diff --git a/kernel/generic/._zhemm3m_lcopy_2.c b/kernel/generic/._zhemm3m_lcopy_2.c new file mode 100644 index 0000000..4291ec4 Binary files /dev/null and b/kernel/generic/._zhemm3m_lcopy_2.c differ diff --git a/kernel/generic/._zhemm3m_lcopy_4.c b/kernel/generic/._zhemm3m_lcopy_4.c new file mode 100644 index 0000000..8ea4c7a Binary files /dev/null and b/kernel/generic/._zhemm3m_lcopy_4.c differ diff --git a/kernel/generic/._zhemm3m_lcopy_8.c b/kernel/generic/._zhemm3m_lcopy_8.c new file mode 100644 index 0000000..ff471cf Binary files /dev/null and b/kernel/generic/._zhemm3m_lcopy_8.c differ diff --git a/kernel/generic/._zhemm3m_ucopy_1.c b/kernel/generic/._zhemm3m_ucopy_1.c new file mode 100644 index 0000000..da4baef Binary files /dev/null and b/kernel/generic/._zhemm3m_ucopy_1.c differ diff --git a/kernel/generic/._zhemm3m_ucopy_2.c b/kernel/generic/._zhemm3m_ucopy_2.c new file mode 100644 index 0000000..b77cc2a Binary files /dev/null and b/kernel/generic/._zhemm3m_ucopy_2.c differ diff --git a/kernel/generic/._zhemm3m_ucopy_4.c b/kernel/generic/._zhemm3m_ucopy_4.c new file mode 100644 index 0000000..fa4a5e9 Binary files /dev/null and b/kernel/generic/._zhemm3m_ucopy_4.c differ diff --git a/kernel/generic/._zhemm3m_ucopy_8.c b/kernel/generic/._zhemm3m_ucopy_8.c new file mode 100644 index 0000000..9851c30 Binary files /dev/null and b/kernel/generic/._zhemm3m_ucopy_8.c differ diff --git a/kernel/generic/._zhemm_ltcopy_1.c b/kernel/generic/._zhemm_ltcopy_1.c new file mode 100644 index 0000000..2d11637 Binary files /dev/null and b/kernel/generic/._zhemm_ltcopy_1.c differ diff --git a/kernel/generic/._zhemm_ltcopy_2.c b/kernel/generic/._zhemm_ltcopy_2.c new file mode 100644 index 0000000..282bc0d Binary files /dev/null and b/kernel/generic/._zhemm_ltcopy_2.c differ diff --git a/kernel/generic/._zhemm_ltcopy_4.c b/kernel/generic/._zhemm_ltcopy_4.c new file mode 100644 index 0000000..ca2b992 Binary files /dev/null and b/kernel/generic/._zhemm_ltcopy_4.c differ diff --git a/kernel/generic/._zhemm_ltcopy_8.c b/kernel/generic/._zhemm_ltcopy_8.c new file mode 100644 index 0000000..d887cbf Binary files /dev/null and b/kernel/generic/._zhemm_ltcopy_8.c differ diff --git a/kernel/generic/._zhemm_utcopy_1.c b/kernel/generic/._zhemm_utcopy_1.c new file mode 100644 index 0000000..0b87710 Binary files /dev/null and b/kernel/generic/._zhemm_utcopy_1.c differ diff --git a/kernel/generic/._zhemm_utcopy_2.c b/kernel/generic/._zhemm_utcopy_2.c new file mode 100644 index 0000000..443de9b Binary files /dev/null and b/kernel/generic/._zhemm_utcopy_2.c differ diff --git a/kernel/generic/._zhemm_utcopy_4.c b/kernel/generic/._zhemm_utcopy_4.c new file mode 100644 index 0000000..7c3bbd5 Binary files /dev/null and b/kernel/generic/._zhemm_utcopy_4.c differ diff --git a/kernel/generic/._zhemm_utcopy_8.c b/kernel/generic/._zhemm_utcopy_8.c new file mode 100644 index 0000000..f47f4b0 Binary files /dev/null and b/kernel/generic/._zhemm_utcopy_8.c differ diff --git a/kernel/generic/._zhemv_k.c b/kernel/generic/._zhemv_k.c new file mode 100644 index 0000000..e045c4d Binary files /dev/null and b/kernel/generic/._zhemv_k.c differ diff --git a/kernel/generic/._zlaswp_ncopy_1.c b/kernel/generic/._zlaswp_ncopy_1.c new file mode 100644 index 0000000..081d08c Binary files /dev/null and b/kernel/generic/._zlaswp_ncopy_1.c differ diff --git a/kernel/generic/._zlaswp_ncopy_2.c b/kernel/generic/._zlaswp_ncopy_2.c new file mode 100644 index 0000000..f2406bc Binary files /dev/null and b/kernel/generic/._zlaswp_ncopy_2.c differ diff --git a/kernel/generic/._zlaswp_ncopy_4.c b/kernel/generic/._zlaswp_ncopy_4.c new file mode 100644 index 0000000..0f9c954 Binary files /dev/null and b/kernel/generic/._zlaswp_ncopy_4.c differ diff --git a/kernel/generic/._zneg_tcopy_1.c b/kernel/generic/._zneg_tcopy_1.c new file mode 100644 index 0000000..a70633d Binary files /dev/null and b/kernel/generic/._zneg_tcopy_1.c differ diff --git a/kernel/generic/._zneg_tcopy_2.c b/kernel/generic/._zneg_tcopy_2.c new file mode 100644 index 0000000..2edf946 Binary files /dev/null and b/kernel/generic/._zneg_tcopy_2.c differ diff --git a/kernel/generic/._zneg_tcopy_4.c b/kernel/generic/._zneg_tcopy_4.c new file mode 100644 index 0000000..0c96353 Binary files /dev/null and b/kernel/generic/._zneg_tcopy_4.c differ diff --git a/kernel/generic/._zneg_tcopy_8.c b/kernel/generic/._zneg_tcopy_8.c new file mode 100644 index 0000000..f1b9d4c Binary files /dev/null and b/kernel/generic/._zneg_tcopy_8.c differ diff --git a/kernel/generic/._zsymm3m_lcopy_1.c b/kernel/generic/._zsymm3m_lcopy_1.c new file mode 100644 index 0000000..e37f51f Binary files /dev/null and b/kernel/generic/._zsymm3m_lcopy_1.c differ diff --git a/kernel/generic/._zsymm3m_lcopy_2.c b/kernel/generic/._zsymm3m_lcopy_2.c new file mode 100644 index 0000000..d6707f0 Binary files /dev/null and b/kernel/generic/._zsymm3m_lcopy_2.c differ diff --git a/kernel/generic/._zsymm3m_lcopy_4.c b/kernel/generic/._zsymm3m_lcopy_4.c new file mode 100644 index 0000000..1185f99 Binary files /dev/null and b/kernel/generic/._zsymm3m_lcopy_4.c differ diff --git a/kernel/generic/._zsymm3m_lcopy_8.c b/kernel/generic/._zsymm3m_lcopy_8.c new file mode 100644 index 0000000..5289001 Binary files /dev/null and b/kernel/generic/._zsymm3m_lcopy_8.c differ diff --git a/kernel/generic/._zsymm3m_ucopy_1.c b/kernel/generic/._zsymm3m_ucopy_1.c new file mode 100644 index 0000000..2ddf188 Binary files /dev/null and b/kernel/generic/._zsymm3m_ucopy_1.c differ diff --git a/kernel/generic/._zsymm3m_ucopy_2.c b/kernel/generic/._zsymm3m_ucopy_2.c new file mode 100644 index 0000000..911a2c2 Binary files /dev/null and b/kernel/generic/._zsymm3m_ucopy_2.c differ diff --git a/kernel/generic/._zsymm3m_ucopy_4.c b/kernel/generic/._zsymm3m_ucopy_4.c new file mode 100644 index 0000000..e67b6b8 Binary files /dev/null and b/kernel/generic/._zsymm3m_ucopy_4.c differ diff --git a/kernel/generic/._zsymm3m_ucopy_8.c b/kernel/generic/._zsymm3m_ucopy_8.c new file mode 100644 index 0000000..0e57d59 Binary files /dev/null and b/kernel/generic/._zsymm3m_ucopy_8.c differ diff --git a/kernel/generic/._zsymm_lcopy_1.c b/kernel/generic/._zsymm_lcopy_1.c new file mode 100644 index 0000000..48bfa19 Binary files /dev/null and b/kernel/generic/._zsymm_lcopy_1.c differ diff --git a/kernel/generic/._zsymm_lcopy_2.c b/kernel/generic/._zsymm_lcopy_2.c new file mode 100644 index 0000000..1b5d5ce Binary files /dev/null and b/kernel/generic/._zsymm_lcopy_2.c differ diff --git a/kernel/generic/._zsymm_lcopy_4.c b/kernel/generic/._zsymm_lcopy_4.c new file mode 100644 index 0000000..aa173b9 Binary files /dev/null and b/kernel/generic/._zsymm_lcopy_4.c differ diff --git a/kernel/generic/._zsymm_lcopy_8.c b/kernel/generic/._zsymm_lcopy_8.c new file mode 100644 index 0000000..a295e9e Binary files /dev/null and b/kernel/generic/._zsymm_lcopy_8.c differ diff --git a/kernel/generic/._zsymm_ucopy_1.c b/kernel/generic/._zsymm_ucopy_1.c new file mode 100644 index 0000000..caa3cb7 Binary files /dev/null and b/kernel/generic/._zsymm_ucopy_1.c differ diff --git a/kernel/generic/._zsymm_ucopy_2.c b/kernel/generic/._zsymm_ucopy_2.c new file mode 100644 index 0000000..ced8a57 Binary files /dev/null and b/kernel/generic/._zsymm_ucopy_2.c differ diff --git a/kernel/generic/._zsymm_ucopy_4.c b/kernel/generic/._zsymm_ucopy_4.c new file mode 100644 index 0000000..e6bcd1b Binary files /dev/null and b/kernel/generic/._zsymm_ucopy_4.c differ diff --git a/kernel/generic/._zsymm_ucopy_8.c b/kernel/generic/._zsymm_ucopy_8.c new file mode 100644 index 0000000..dc16b6c Binary files /dev/null and b/kernel/generic/._zsymm_ucopy_8.c differ diff --git a/kernel/generic/._zsymv_k.c b/kernel/generic/._zsymv_k.c new file mode 100644 index 0000000..d5d82bf Binary files /dev/null and b/kernel/generic/._zsymv_k.c differ diff --git a/kernel/generic/._ztrmm_lncopy_1.c b/kernel/generic/._ztrmm_lncopy_1.c new file mode 100644 index 0000000..d3ec275 Binary files /dev/null and b/kernel/generic/._ztrmm_lncopy_1.c differ diff --git a/kernel/generic/._ztrmm_lncopy_2.c b/kernel/generic/._ztrmm_lncopy_2.c new file mode 100644 index 0000000..9024fdb Binary files /dev/null and b/kernel/generic/._ztrmm_lncopy_2.c differ diff --git a/kernel/generic/._ztrmm_lncopy_4.c b/kernel/generic/._ztrmm_lncopy_4.c new file mode 100644 index 0000000..9e52c55 Binary files /dev/null and b/kernel/generic/._ztrmm_lncopy_4.c differ diff --git a/kernel/generic/._ztrmm_lncopy_8.c b/kernel/generic/._ztrmm_lncopy_8.c new file mode 100644 index 0000000..1e60708 Binary files /dev/null and b/kernel/generic/._ztrmm_lncopy_8.c differ diff --git a/kernel/generic/._ztrmm_ltcopy_1.c b/kernel/generic/._ztrmm_ltcopy_1.c new file mode 100644 index 0000000..7b6e845 Binary files /dev/null and b/kernel/generic/._ztrmm_ltcopy_1.c differ diff --git a/kernel/generic/._ztrmm_ltcopy_2.c b/kernel/generic/._ztrmm_ltcopy_2.c new file mode 100644 index 0000000..d1190c0 Binary files /dev/null and b/kernel/generic/._ztrmm_ltcopy_2.c differ diff --git a/kernel/generic/._ztrmm_ltcopy_4.c b/kernel/generic/._ztrmm_ltcopy_4.c new file mode 100644 index 0000000..0be384f Binary files /dev/null and b/kernel/generic/._ztrmm_ltcopy_4.c differ diff --git a/kernel/generic/._ztrmm_ltcopy_8.c b/kernel/generic/._ztrmm_ltcopy_8.c new file mode 100644 index 0000000..4ad817c Binary files /dev/null and b/kernel/generic/._ztrmm_ltcopy_8.c differ diff --git a/kernel/generic/._ztrmm_uncopy_1.c b/kernel/generic/._ztrmm_uncopy_1.c new file mode 100644 index 0000000..73e13a4 Binary files /dev/null and b/kernel/generic/._ztrmm_uncopy_1.c differ diff --git a/kernel/generic/._ztrmm_uncopy_2.c b/kernel/generic/._ztrmm_uncopy_2.c new file mode 100644 index 0000000..5041ce9 Binary files /dev/null and b/kernel/generic/._ztrmm_uncopy_2.c differ diff --git a/kernel/generic/._ztrmm_uncopy_4.c b/kernel/generic/._ztrmm_uncopy_4.c new file mode 100644 index 0000000..448713e Binary files /dev/null and b/kernel/generic/._ztrmm_uncopy_4.c differ diff --git a/kernel/generic/._ztrmm_uncopy_8.c b/kernel/generic/._ztrmm_uncopy_8.c new file mode 100644 index 0000000..45f7686 Binary files /dev/null and b/kernel/generic/._ztrmm_uncopy_8.c differ diff --git a/kernel/generic/._ztrmm_utcopy_1.c b/kernel/generic/._ztrmm_utcopy_1.c new file mode 100644 index 0000000..0d0c650 Binary files /dev/null and b/kernel/generic/._ztrmm_utcopy_1.c differ diff --git a/kernel/generic/._ztrmm_utcopy_2.c b/kernel/generic/._ztrmm_utcopy_2.c new file mode 100644 index 0000000..efa6304 Binary files /dev/null and b/kernel/generic/._ztrmm_utcopy_2.c differ diff --git a/kernel/generic/._ztrmm_utcopy_4.c b/kernel/generic/._ztrmm_utcopy_4.c new file mode 100644 index 0000000..ea7b395 Binary files /dev/null and b/kernel/generic/._ztrmm_utcopy_4.c differ diff --git a/kernel/generic/._ztrmm_utcopy_8.c b/kernel/generic/._ztrmm_utcopy_8.c new file mode 100644 index 0000000..1239d58 Binary files /dev/null and b/kernel/generic/._ztrmm_utcopy_8.c differ diff --git a/kernel/generic/._ztrsm_lncopy_1.c b/kernel/generic/._ztrsm_lncopy_1.c new file mode 100644 index 0000000..2eacb9f Binary files /dev/null and b/kernel/generic/._ztrsm_lncopy_1.c differ diff --git a/kernel/generic/._ztrsm_lncopy_2.c b/kernel/generic/._ztrsm_lncopy_2.c new file mode 100644 index 0000000..17072f1 Binary files /dev/null and b/kernel/generic/._ztrsm_lncopy_2.c differ diff --git a/kernel/generic/._ztrsm_lncopy_4.c b/kernel/generic/._ztrsm_lncopy_4.c new file mode 100644 index 0000000..933bf02 Binary files /dev/null and b/kernel/generic/._ztrsm_lncopy_4.c differ diff --git a/kernel/generic/._ztrsm_lncopy_8.c b/kernel/generic/._ztrsm_lncopy_8.c new file mode 100644 index 0000000..e9fd266 Binary files /dev/null and b/kernel/generic/._ztrsm_lncopy_8.c differ diff --git a/kernel/generic/._ztrsm_ltcopy_1.c b/kernel/generic/._ztrsm_ltcopy_1.c new file mode 100644 index 0000000..804e44a Binary files /dev/null and b/kernel/generic/._ztrsm_ltcopy_1.c differ diff --git a/kernel/generic/._ztrsm_ltcopy_2.c b/kernel/generic/._ztrsm_ltcopy_2.c new file mode 100644 index 0000000..c8df336 Binary files /dev/null and b/kernel/generic/._ztrsm_ltcopy_2.c differ diff --git a/kernel/generic/._ztrsm_ltcopy_4.c b/kernel/generic/._ztrsm_ltcopy_4.c new file mode 100644 index 0000000..03070af Binary files /dev/null and b/kernel/generic/._ztrsm_ltcopy_4.c differ diff --git a/kernel/generic/._ztrsm_ltcopy_8.c b/kernel/generic/._ztrsm_ltcopy_8.c new file mode 100644 index 0000000..83dd7a6 Binary files /dev/null and b/kernel/generic/._ztrsm_ltcopy_8.c differ diff --git a/kernel/generic/._ztrsm_uncopy_1.c b/kernel/generic/._ztrsm_uncopy_1.c new file mode 100644 index 0000000..2f31cfb Binary files /dev/null and b/kernel/generic/._ztrsm_uncopy_1.c differ diff --git a/kernel/generic/._ztrsm_uncopy_2.c b/kernel/generic/._ztrsm_uncopy_2.c new file mode 100644 index 0000000..27e1100 Binary files /dev/null and b/kernel/generic/._ztrsm_uncopy_2.c differ diff --git a/kernel/generic/._ztrsm_uncopy_4.c b/kernel/generic/._ztrsm_uncopy_4.c new file mode 100644 index 0000000..3301c7d Binary files /dev/null and b/kernel/generic/._ztrsm_uncopy_4.c differ diff --git a/kernel/generic/._ztrsm_uncopy_8.c b/kernel/generic/._ztrsm_uncopy_8.c new file mode 100644 index 0000000..485f018 Binary files /dev/null and b/kernel/generic/._ztrsm_uncopy_8.c differ diff --git a/kernel/generic/._ztrsm_utcopy_1.c b/kernel/generic/._ztrsm_utcopy_1.c new file mode 100644 index 0000000..4a07a9f Binary files /dev/null and b/kernel/generic/._ztrsm_utcopy_1.c differ diff --git a/kernel/generic/._ztrsm_utcopy_2.c b/kernel/generic/._ztrsm_utcopy_2.c new file mode 100644 index 0000000..967f52d Binary files /dev/null and b/kernel/generic/._ztrsm_utcopy_2.c differ diff --git a/kernel/generic/._ztrsm_utcopy_4.c b/kernel/generic/._ztrsm_utcopy_4.c new file mode 100644 index 0000000..781f01a Binary files /dev/null and b/kernel/generic/._ztrsm_utcopy_4.c differ diff --git a/kernel/generic/._ztrsm_utcopy_8.c b/kernel/generic/._ztrsm_utcopy_8.c new file mode 100644 index 0000000..313d6f8 Binary files /dev/null and b/kernel/generic/._ztrsm_utcopy_8.c differ diff --git a/kernel/generic/cabs.c b/kernel/generic/cabs.c new file mode 100644 index 0000000..f76f69b --- /dev/null +++ b/kernel/generic/cabs.c @@ -0,0 +1,44 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +FLOAT NAME(FLOAT *a){ + return fabs(a[0]) + fabs(a[1]); +} diff --git a/kernel/generic/gemm_beta.c b/kernel/generic/gemm_beta.c new file mode 100644 index 0000000..525ff94 --- /dev/null +++ b/kernel/generic/gemm_beta.c @@ -0,0 +1,142 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, + FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5, + FLOAT *c, BLASLONG ldc){ + + BLASLONG i, j; + FLOAT *c_offset1, *c_offset; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + + c_offset = c; + + if (beta == ZERO){ + + j = n; + do { + c_offset1 = c_offset; + c_offset += ldc; + + i = (m >> 3); + if (i > 0){ + do { + *(c_offset1 + 0) = ZERO; + *(c_offset1 + 1) = ZERO; + *(c_offset1 + 2) = ZERO; + *(c_offset1 + 3) = ZERO; + *(c_offset1 + 4) = ZERO; + *(c_offset1 + 5) = ZERO; + *(c_offset1 + 6) = ZERO; + *(c_offset1 + 7) = ZERO; + c_offset1 += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i > 0){ + do { + *c_offset1 = ZERO; + c_offset1 ++; + i --; + } while (i > 0); + } + j --; + } while (j > 0); + + } else { + + j = n; + do { + c_offset1 = c_offset; + c_offset += ldc; + + i = (m >> 3); + if (i > 0){ + do { + ctemp1 = *(c_offset1 + 0); + ctemp2 = *(c_offset1 + 1); + ctemp3 = *(c_offset1 + 2); + ctemp4 = *(c_offset1 + 3); + ctemp5 = *(c_offset1 + 4); + ctemp6 = *(c_offset1 + 5); + ctemp7 = *(c_offset1 + 6); + ctemp8 = *(c_offset1 + 7); + + ctemp1 *= beta; + ctemp2 *= beta; + ctemp3 *= beta; + ctemp4 *= beta; + ctemp5 *= beta; + ctemp6 *= beta; + ctemp7 *= beta; + ctemp8 *= beta; + + *(c_offset1 + 0) = ctemp1; + *(c_offset1 + 1) = ctemp2; + *(c_offset1 + 2) = ctemp3; + *(c_offset1 + 3) = ctemp4; + *(c_offset1 + 4) = ctemp5; + *(c_offset1 + 5) = ctemp6; + *(c_offset1 + 6) = ctemp7; + *(c_offset1 + 7) = ctemp8; + c_offset1 += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i > 0){ + do { + ctemp1 = *c_offset1; + ctemp1 *= beta; + *c_offset1 = ctemp1; + c_offset1 ++; + i --; + } while (i > 0); + } + j --; + } while (j > 0); + + } + return 0; +}; diff --git a/kernel/generic/gemm_ncopy_1.c b/kernel/generic/gemm_ncopy_1.c new file mode 100644 index 0000000..e990de7 --- /dev/null +++ b/kernel/generic/gemm_ncopy_1.c @@ -0,0 +1,90 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + FLOAT *a_offset, *a_offset1; + FLOAT *b_offset; + + a_offset = a; + b_offset = b; + + j = n; + + if (j > 0){ + do { + a_offset1 = a_offset; + a_offset += lda; + + i = (m >> 3); + + if (i > 0){ + do { + *(b_offset + 0) = *(a_offset1 + 0); + *(b_offset + 1) = *(a_offset1 + 1); + *(b_offset + 2) = *(a_offset1 + 2); + *(b_offset + 3) = *(a_offset1 + 3); + *(b_offset + 4) = *(a_offset1 + 4); + *(b_offset + 5) = *(a_offset1 + 5); + *(b_offset + 6) = *(a_offset1 + 6); + *(b_offset + 7) = *(a_offset1 + 7); + a_offset1 += 8; + b_offset += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + + if (i > 0){ + do { + *(b_offset + 0) = *(a_offset1 + 0); + a_offset1 ++; + b_offset ++; + i --; + } while (i > 0); + } + j --; + } while (j > 0); + } + + return 0; +} diff --git a/kernel/generic/gemm_ncopy_16.c b/kernel/generic/gemm_ncopy_16.c new file mode 100644 index 0000000..4a9269e --- /dev/null +++ b/kernel/generic/gemm_ncopy_16.c @@ -0,0 +1,437 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + FLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; + FLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; + + FLOAT *boffset; + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + + aoffset = a; + boffset = b; + + j = (n >> 4); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset9 = aoffset8 + lda; + aoffset10 = aoffset9 + lda; + aoffset11 = aoffset10 + lda; + aoffset12 = aoffset11 + lda; + aoffset13 = aoffset12 + lda; + aoffset14 = aoffset13 + lda; + aoffset15 = aoffset14 + lda; + aoffset16 = aoffset15 + lda; + aoffset += 16 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + + ctemp09 = *(aoffset5 + 0); + ctemp10 = *(aoffset5 + 1); + ctemp11 = *(aoffset6 + 0); + ctemp12 = *(aoffset6 + 1); + + ctemp13 = *(aoffset7 + 0); + ctemp14 = *(aoffset7 + 1); + ctemp15 = *(aoffset8 + 0); + ctemp16 = *(aoffset8 + 1); + + ctemp17 = *(aoffset9 + 0); + ctemp18 = *(aoffset9 + 1); + ctemp19 = *(aoffset10 + 0); + ctemp20 = *(aoffset10 + 1); + + ctemp21 = *(aoffset11 + 0); + ctemp22 = *(aoffset11 + 1); + ctemp23 = *(aoffset12 + 0); + ctemp24 = *(aoffset12 + 1); + + ctemp25 = *(aoffset13 + 0); + ctemp26 = *(aoffset13 + 1); + ctemp27 = *(aoffset14 + 0); + ctemp28 = *(aoffset14 + 1); + + ctemp29 = *(aoffset15 + 0); + ctemp30 = *(aoffset15 + 1); + ctemp31 = *(aoffset16 + 0); + ctemp32 = *(aoffset16 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + *(boffset + 4) = ctemp09; + *(boffset + 5) = ctemp11; + *(boffset + 6) = ctemp13; + *(boffset + 7) = ctemp15; + + *(boffset + 8) = ctemp17; + *(boffset + 9) = ctemp19; + *(boffset + 10) = ctemp21; + *(boffset + 11) = ctemp23; + *(boffset + 12) = ctemp25; + *(boffset + 13) = ctemp27; + *(boffset + 14) = ctemp29; + *(boffset + 15) = ctemp31; + + *(boffset + 16) = ctemp02; + *(boffset + 17) = ctemp04; + *(boffset + 18) = ctemp06; + *(boffset + 19) = ctemp08; + *(boffset + 20) = ctemp10; + *(boffset + 21) = ctemp12; + *(boffset + 22) = ctemp14; + *(boffset + 23) = ctemp16; + + *(boffset + 24) = ctemp18; + *(boffset + 25) = ctemp20; + *(boffset + 26) = ctemp22; + *(boffset + 27) = ctemp24; + *(boffset + 28) = ctemp26; + *(boffset + 29) = ctemp28; + *(boffset + 30) = ctemp30; + *(boffset + 31) = ctemp32; + + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + aoffset5 += 2; + aoffset6 += 2; + aoffset7 += 2; + aoffset8 += 2; + + aoffset9 += 2; + aoffset10 += 2; + aoffset11 += 2; + aoffset12 += 2; + aoffset13 += 2; + aoffset14 += 2; + aoffset15 += 2; + aoffset16 += 2; + boffset += 32; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + ctemp09 = *(aoffset5 + 0); + ctemp11 = *(aoffset6 + 0); + ctemp13 = *(aoffset7 + 0); + ctemp15 = *(aoffset8 + 0); + + ctemp17 = *(aoffset9 + 0); + ctemp19 = *(aoffset10 + 0); + ctemp21 = *(aoffset11 + 0); + ctemp23 = *(aoffset12 + 0); + ctemp25 = *(aoffset13 + 0); + ctemp27 = *(aoffset14 + 0); + ctemp29 = *(aoffset15 + 0); + ctemp31 = *(aoffset16 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + *(boffset + 4) = ctemp09; + *(boffset + 5) = ctemp11; + *(boffset + 6) = ctemp13; + *(boffset + 7) = ctemp15; + + *(boffset + 8) = ctemp17; + *(boffset + 9) = ctemp19; + *(boffset + 10) = ctemp21; + *(boffset + 11) = ctemp23; + *(boffset + 12) = ctemp25; + *(boffset + 13) = ctemp27; + *(boffset + 14) = ctemp29; + *(boffset + 15) = ctemp31; + + boffset += 16; + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 8){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + + ctemp09 = *(aoffset5 + 0); + ctemp10 = *(aoffset5 + 1); + ctemp11 = *(aoffset6 + 0); + ctemp12 = *(aoffset6 + 1); + + ctemp13 = *(aoffset7 + 0); + ctemp14 = *(aoffset7 + 1); + ctemp15 = *(aoffset8 + 0); + ctemp16 = *(aoffset8 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + *(boffset + 4) = ctemp09; + *(boffset + 5) = ctemp11; + *(boffset + 6) = ctemp13; + *(boffset + 7) = ctemp15; + + *(boffset + 8) = ctemp02; + *(boffset + 9) = ctemp04; + *(boffset + 10) = ctemp06; + *(boffset + 11) = ctemp08; + *(boffset + 12) = ctemp10; + *(boffset + 13) = ctemp12; + *(boffset + 14) = ctemp14; + *(boffset + 15) = ctemp16; + + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + aoffset5 += 2; + aoffset6 += 2; + aoffset7 += 2; + aoffset8 += 2; + + boffset += 16; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + ctemp09 = *(aoffset5 + 0); + ctemp11 = *(aoffset6 + 0); + ctemp13 = *(aoffset7 + 0); + ctemp15 = *(aoffset8 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + *(boffset + 4) = ctemp09; + *(boffset + 5) = ctemp11; + *(boffset + 6) = ctemp13; + *(boffset + 7) = ctemp15; + + boffset += 8; + } + } + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + *(boffset + 4) = ctemp02; + *(boffset + 5) = ctemp04; + *(boffset + 6) = ctemp06; + *(boffset + 7) = ctemp08; + + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + boffset += 8; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + boffset += 4; + } + } + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp02; + *(boffset + 3) = ctemp04; + + aoffset1 += 2; + aoffset2 += 2; + boffset += 4; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + boffset += 2; + } + } + + if (n & 1){ + aoffset1 = aoffset; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 += 2; + boffset += 2; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + + *(boffset + 0) = ctemp01; + boffset += 1; + } + } + + return 0; +} diff --git a/kernel/generic/gemm_ncopy_2.c b/kernel/generic/gemm_ncopy_2.c new file mode 100644 index 0000000..0ec807c --- /dev/null +++ b/kernel/generic/gemm_ncopy_2.c @@ -0,0 +1,126 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + FLOAT *a_offset, *a_offset1, *a_offset2; + FLOAT *b_offset; + + a_offset = a; + b_offset = b; + + j = (n >> 1); + + if (j > 0){ + do { + a_offset1 = a_offset; + a_offset2 = a_offset + lda; + a_offset += 2 * lda; + + i = (m >> 2); + + if (i > 0){ + do { + *(b_offset + 0) = *(a_offset1 + 0); + *(b_offset + 1) = *(a_offset2 + 0); + *(b_offset + 2) = *(a_offset1 + 1); + *(b_offset + 3) = *(a_offset2 + 1); + *(b_offset + 4) = *(a_offset1 + 2); + *(b_offset + 5) = *(a_offset2 + 2); + *(b_offset + 6) = *(a_offset1 + 3); + *(b_offset + 7) = *(a_offset2 + 3); + a_offset1 += 4; + a_offset2 += 4; + b_offset += 8; + i --; + } while (i > 0); + } + + i = (m & 3); + + if (i > 0){ + do { + *(b_offset + 0) = *(a_offset1 + 0); + *(b_offset + 1) = *(a_offset2 + 0); + a_offset1 ++; + a_offset2 ++; + b_offset += 2; + i --; + } while (i > 0); + } + j --; + } while (j > 0); + } + + if (n & 1){ + + i = (m >> 3); + if (i > 0){ + do { + *(b_offset + 0) = *(a_offset + 0); + *(b_offset + 1) = *(a_offset + 1); + *(b_offset + 2) = *(a_offset + 2); + *(b_offset + 3) = *(a_offset + 3); + *(b_offset + 4) = *(a_offset + 4); + *(b_offset + 5) = *(a_offset + 5); + *(b_offset + 6) = *(a_offset + 6); + *(b_offset + 7) = *(a_offset + 7); + a_offset += 8; + b_offset += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + + if (i > 0){ + do { + *(b_offset + 0) = *(a_offset + 0); + a_offset ++; + b_offset ++; + i --; + } while (i > 0); + } + } + + return 0; +} + diff --git a/kernel/generic/gemm_ncopy_4.c b/kernel/generic/gemm_ncopy_4.c new file mode 100644 index 0000000..1ecb93c --- /dev/null +++ b/kernel/generic/gemm_ncopy_4.c @@ -0,0 +1,230 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + FLOAT ctemp9, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + + a_offset = a; + b_offset = b; + + j = (n >> 2); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + ctemp9 = *(a_offset3 + 0); + ctemp10 = *(a_offset3 + 1); + ctemp11 = *(a_offset3 + 2); + ctemp12 = *(a_offset3 + 3); + + ctemp13 = *(a_offset4 + 0); + ctemp14 = *(a_offset4 + 1); + ctemp15 = *(a_offset4 + 2); + ctemp16 = *(a_offset4 + 3); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp5; + *(b_offset + 2) = ctemp9; + *(b_offset + 3) = ctemp13; + + *(b_offset + 4) = ctemp2; + *(b_offset + 5) = ctemp6; + *(b_offset + 6) = ctemp10; + *(b_offset + 7) = ctemp14; + + *(b_offset + 8) = ctemp3; + *(b_offset + 9) = ctemp7; + *(b_offset + 10) = ctemp11; + *(b_offset + 11) = ctemp15; + + *(b_offset + 12) = ctemp4; + *(b_offset + 13) = ctemp8; + *(b_offset + 14) = ctemp12; + *(b_offset + 15) = ctemp16; + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + + b_offset += 16; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp5 = *(a_offset2 + 0); + ctemp9 = *(a_offset3 + 0); + ctemp13 = *(a_offset4 + 0); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp5; + *(b_offset + 2) = ctemp9; + *(b_offset + 3) = ctemp13; + + a_offset1 ++; + a_offset2 ++; + a_offset3 ++; + a_offset4 ++; + + b_offset += 4; + i --; + }while(i > 0); + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 2){ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp5; + *(b_offset + 2) = ctemp2; + *(b_offset + 3) = ctemp6; + + *(b_offset + 4) = ctemp3; + *(b_offset + 5) = ctemp7; + *(b_offset + 6) = ctemp4; + *(b_offset + 7) = ctemp8; + + a_offset1 += 4; + a_offset2 += 4; + b_offset += 8; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp5 = *(a_offset2 + 0); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp5; + + a_offset1 ++; + a_offset2 ++; + b_offset += 2; + i --; + }while(i > 0); + } + } /* end of if(j > 0) */ + + if (n & 1){ + a_offset1 = a_offset; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp2; + *(b_offset + 2) = ctemp3; + *(b_offset + 3) = ctemp4; + + a_offset1 += 4; + b_offset += 4; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + *(b_offset + 0) = ctemp1; + a_offset1 ++; + b_offset += 1; + i --; + }while(i > 0); + } + } /* end of if(j > 0) */ + + return 0; +} diff --git a/kernel/generic/gemm_ncopy_8.c b/kernel/generic/gemm_ncopy_8.c new file mode 100644 index 0000000..bdaaba1 --- /dev/null +++ b/kernel/generic/gemm_ncopy_8.c @@ -0,0 +1,422 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + + FLOAT *boffset; + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + FLOAT ctemp33, ctemp34, ctemp35, ctemp36; + FLOAT ctemp37, ctemp38, ctemp39, ctemp40; + FLOAT ctemp41, ctemp42, ctemp43, ctemp44; + FLOAT ctemp45, ctemp46, ctemp47, ctemp48; + FLOAT ctemp49, ctemp50, ctemp51, ctemp52; + FLOAT ctemp53, ctemp54, ctemp55, ctemp56; + FLOAT ctemp57, ctemp58, ctemp59, ctemp60; + FLOAT ctemp61, ctemp62, ctemp63, ctemp64; + + + aoffset = a; + boffset = b; + + j = (n >> 3); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + i = (m >> 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + ctemp17 = *(aoffset3 + 0); + ctemp18 = *(aoffset3 + 1); + ctemp19 = *(aoffset3 + 2); + ctemp20 = *(aoffset3 + 3); + ctemp21 = *(aoffset3 + 4); + ctemp22 = *(aoffset3 + 5); + ctemp23 = *(aoffset3 + 6); + ctemp24 = *(aoffset3 + 7); + + ctemp25 = *(aoffset4 + 0); + ctemp26 = *(aoffset4 + 1); + ctemp27 = *(aoffset4 + 2); + ctemp28 = *(aoffset4 + 3); + ctemp29 = *(aoffset4 + 4); + ctemp30 = *(aoffset4 + 5); + ctemp31 = *(aoffset4 + 6); + ctemp32 = *(aoffset4 + 7); + + ctemp33 = *(aoffset5 + 0); + ctemp34 = *(aoffset5 + 1); + ctemp35 = *(aoffset5 + 2); + ctemp36 = *(aoffset5 + 3); + ctemp37 = *(aoffset5 + 4); + ctemp38 = *(aoffset5 + 5); + ctemp39 = *(aoffset5 + 6); + ctemp40 = *(aoffset5 + 7); + + ctemp41 = *(aoffset6 + 0); + ctemp42 = *(aoffset6 + 1); + ctemp43 = *(aoffset6 + 2); + ctemp44 = *(aoffset6 + 3); + ctemp45 = *(aoffset6 + 4); + ctemp46 = *(aoffset6 + 5); + ctemp47 = *(aoffset6 + 6); + ctemp48 = *(aoffset6 + 7); + + ctemp49 = *(aoffset7 + 0); + ctemp50 = *(aoffset7 + 1); + ctemp51 = *(aoffset7 + 2); + ctemp52 = *(aoffset7 + 3); + ctemp53 = *(aoffset7 + 4); + ctemp54 = *(aoffset7 + 5); + ctemp55 = *(aoffset7 + 6); + ctemp56 = *(aoffset7 + 7); + + ctemp57 = *(aoffset8 + 0); + ctemp58 = *(aoffset8 + 1); + ctemp59 = *(aoffset8 + 2); + ctemp60 = *(aoffset8 + 3); + ctemp61 = *(aoffset8 + 4); + ctemp62 = *(aoffset8 + 5); + ctemp63 = *(aoffset8 + 6); + ctemp64 = *(aoffset8 + 7); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp09; + *(boffset + 2) = ctemp17; + *(boffset + 3) = ctemp25; + *(boffset + 4) = ctemp33; + *(boffset + 5) = ctemp41; + *(boffset + 6) = ctemp49; + *(boffset + 7) = ctemp57; + + *(boffset + 8) = ctemp02; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp18; + *(boffset + 11) = ctemp26; + *(boffset + 12) = ctemp34; + *(boffset + 13) = ctemp42; + *(boffset + 14) = ctemp50; + *(boffset + 15) = ctemp58; + + *(boffset + 16) = ctemp03; + *(boffset + 17) = ctemp11; + *(boffset + 18) = ctemp19; + *(boffset + 19) = ctemp27; + *(boffset + 20) = ctemp35; + *(boffset + 21) = ctemp43; + *(boffset + 22) = ctemp51; + *(boffset + 23) = ctemp59; + + *(boffset + 24) = ctemp04; + *(boffset + 25) = ctemp12; + *(boffset + 26) = ctemp20; + *(boffset + 27) = ctemp28; + *(boffset + 28) = ctemp36; + *(boffset + 29) = ctemp44; + *(boffset + 30) = ctemp52; + *(boffset + 31) = ctemp60; + + *(boffset + 32) = ctemp05; + *(boffset + 33) = ctemp13; + *(boffset + 34) = ctemp21; + *(boffset + 35) = ctemp29; + *(boffset + 36) = ctemp37; + *(boffset + 37) = ctemp45; + *(boffset + 38) = ctemp53; + *(boffset + 39) = ctemp61; + + *(boffset + 40) = ctemp06; + *(boffset + 41) = ctemp14; + *(boffset + 42) = ctemp22; + *(boffset + 43) = ctemp30; + *(boffset + 44) = ctemp38; + *(boffset + 45) = ctemp46; + *(boffset + 46) = ctemp54; + *(boffset + 47) = ctemp62; + + *(boffset + 48) = ctemp07; + *(boffset + 49) = ctemp15; + *(boffset + 50) = ctemp23; + *(boffset + 51) = ctemp31; + *(boffset + 52) = ctemp39; + *(boffset + 53) = ctemp47; + *(boffset + 54) = ctemp55; + *(boffset + 55) = ctemp63; + + *(boffset + 56) = ctemp08; + *(boffset + 57) = ctemp16; + *(boffset + 58) = ctemp24; + *(boffset + 59) = ctemp32; + *(boffset + 60) = ctemp40; + *(boffset + 61) = ctemp48; + *(boffset + 62) = ctemp56; + *(boffset + 63) = ctemp64; + + aoffset1 += 8; + aoffset2 += 8; + aoffset3 += 8; + aoffset4 += 8; + aoffset5 += 8; + aoffset6 += 8; + aoffset7 += 8; + aoffset8 += 8; + boffset += 64; + i --; + }while(i > 0); + } + + i = (m & 7); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp09 = *(aoffset2 + 0); + ctemp17 = *(aoffset3 + 0); + ctemp25 = *(aoffset4 + 0); + ctemp33 = *(aoffset5 + 0); + ctemp41 = *(aoffset6 + 0); + ctemp49 = *(aoffset7 + 0); + ctemp57 = *(aoffset8 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp09; + *(boffset + 2) = ctemp17; + *(boffset + 3) = ctemp25; + *(boffset + 4) = ctemp33; + *(boffset + 5) = ctemp41; + *(boffset + 6) = ctemp49; + *(boffset + 7) = ctemp57; + + aoffset1 ++; + aoffset2 ++; + aoffset3 ++; + aoffset4 ++; + aoffset5 ++; + aoffset6 ++; + aoffset7 ++; + aoffset8 ++; + + boffset += 8; + i --; + }while(i > 0); + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + ctemp09 = *(aoffset3 + 0); + ctemp10 = *(aoffset3 + 1); + ctemp11 = *(aoffset3 + 2); + ctemp12 = *(aoffset3 + 3); + + ctemp13 = *(aoffset4 + 0); + ctemp14 = *(aoffset4 + 1); + ctemp15 = *(aoffset4 + 2); + ctemp16 = *(aoffset4 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp05; + *(boffset + 2) = ctemp09; + *(boffset + 3) = ctemp13; + + *(boffset + 4) = ctemp02; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp10; + *(boffset + 7) = ctemp14; + + *(boffset + 8) = ctemp03; + *(boffset + 9) = ctemp07; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp15; + + *(boffset + 12) = ctemp04; + *(boffset + 13) = ctemp08; + *(boffset + 14) = ctemp12; + *(boffset + 15) = ctemp16; + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + boffset += 16; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + ctemp03 = *(aoffset3 + 0); + ctemp04 = *(aoffset4 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + aoffset1 ++; + aoffset2 ++; + aoffset3 ++; + aoffset4 ++; + + boffset += 4; + i --; + }while(i > 0); + } + } /* end of if(j > 0) */ + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp02; + *(boffset + 3) = ctemp04; + + aoffset1 += 2; + aoffset2 += 2; + boffset += 4; + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 ++; + aoffset2 ++; + boffset += 2; + } + } /* end of if(j > 0) */ + + if (n & 1){ + aoffset1 = aoffset; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + + *(boffset + 0) = ctemp01; + + aoffset1 ++; + boffset ++; + i --; + }while(i > 0); + } + + } /* end of if(j > 0) */ + + return 0; +} diff --git a/kernel/generic/gemm_tcopy_1.c b/kernel/generic/gemm_tcopy_1.c new file mode 100644 index 0000000..c0c8bd0 --- /dev/null +++ b/kernel/generic/gemm_tcopy_1.c @@ -0,0 +1,75 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1; + FLOAT *b_offset, *b_offset1; + + a_offset = a; + b_offset = b; + + i = m; + + if (i > 0) { + do { + a_offset1 = a_offset; + a_offset += lda; + + b_offset1 = b_offset; + b_offset ++; + + j = n; + if (j > 0) { + do { + *(b_offset1 + 0) = *(a_offset1 + 0); + a_offset1 ++; + b_offset1 += m; + j --; + } while (j > 0); + } + i --; + } while (i > 0); + } + + return 0; +} diff --git a/kernel/generic/gemm_tcopy_16.c b/kernel/generic/gemm_tcopy_16.c new file mode 100644 index 0000000..e573225 --- /dev/null +++ b/kernel/generic/gemm_tcopy_16.c @@ -0,0 +1,387 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2; + FLOAT *boffset; + + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + + aoffset = a; + boffset = b; + +#if 0 + fprintf(stderr, "m = %d n = %d\n", m, n); +#endif + + j = (n >> 4); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 16; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + + ctemp17 = *(aoffset2 + 0); + ctemp18 = *(aoffset2 + 1); + ctemp19 = *(aoffset2 + 2); + ctemp20 = *(aoffset2 + 3); + ctemp21 = *(aoffset2 + 4); + ctemp22 = *(aoffset2 + 5); + ctemp23 = *(aoffset2 + 6); + ctemp24 = *(aoffset2 + 7); + ctemp25 = *(aoffset2 + 8); + ctemp26 = *(aoffset2 + 9); + ctemp27 = *(aoffset2 + 10); + ctemp28 = *(aoffset2 + 11); + ctemp29 = *(aoffset2 + 12); + ctemp30 = *(aoffset2 + 13); + ctemp31 = *(aoffset2 + 14); + ctemp32 = *(aoffset2 + 15); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + *(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + *(boffset + 16) = ctemp17; + *(boffset + 17) = ctemp18; + *(boffset + 18) = ctemp19; + *(boffset + 19) = ctemp20; + *(boffset + 20) = ctemp21; + *(boffset + 21) = ctemp22; + *(boffset + 22) = ctemp23; + *(boffset + 23) = ctemp24; + + *(boffset + 24) = ctemp25; + *(boffset + 25) = ctemp26; + *(boffset + 26) = ctemp27; + *(boffset + 27) = ctemp28; + *(boffset + 28) = ctemp29; + *(boffset + 29) = ctemp30; + *(boffset + 30) = ctemp31; + *(boffset + 31) = ctemp32; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 32; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + *(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + boffset += 16; + } + + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 8){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 8; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + *(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 16; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + boffset += 8; + } + } + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 4; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 8; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + boffset += 4; + } + } + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 2; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 4; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + boffset += 2; + } + } + + if (n & 1){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 2; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + *(boffset + 0) = ctemp01; + boffset += 1; + } + } + + return 0; +} diff --git a/kernel/generic/gemm_tcopy_2.c b/kernel/generic/gemm_tcopy_2.c new file mode 100644 index 0000000..0aa9c2e --- /dev/null +++ b/kernel/generic/gemm_tcopy_2.c @@ -0,0 +1,104 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2; + FLOAT *b_offset, *b_offset1, *b_offset2; + + a_offset = a; + b_offset = b; + b_offset2 = b + m * (n & ~1); + + i = (m >> 1); + + if (i > 0) { + do { + a_offset1 = a_offset; + a_offset2 = a_offset + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset += 4; + + j = (n >> 1); + if (j > 0){ + do { + *(b_offset1 + 0) = *(a_offset1 + 0); + *(b_offset1 + 1) = *(a_offset1 + 1); + *(b_offset1 + 2) = *(a_offset2 + 0); + *(b_offset1 + 3) = *(a_offset2 + 1); + a_offset1 += 2; + a_offset2 += 2; + b_offset1 += m * 2; + j--; + } while (j > 0); + } + + if (n & 1){ + *(b_offset2 + 0) = *(a_offset1 + 0); + *(b_offset2 + 1) = *(a_offset2 + 0); + b_offset2 += 2; + } + i --; + } while (i > 0); + } + + if (m & 1) { + j = (n >> 1); + if (j > 0){ + do { + *(b_offset + 0) = *(a_offset + 0); + *(b_offset + 1) = *(a_offset + 1); + a_offset += 2; + b_offset += m * 2; + j--; + } while (j > 0); + } + + if (n & 1){ + *(b_offset2 + 0) = *(a_offset + 0); + } + } + + return 0; +} diff --git a/kernel/generic/gemm_tcopy_4.c b/kernel/generic/gemm_tcopy_4.c new file mode 100644 index 0000000..bd32090 --- /dev/null +++ b/kernel/generic/gemm_tcopy_4.c @@ -0,0 +1,281 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + FLOAT ctemp9, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + + a_offset = a; + b_offset = b; + + b_offset2 = b + m * (n & ~3); + b_offset3 = b + m * (n & ~1); + + j = (m >> 2); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + b_offset1 = b_offset; + b_offset += 16; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + ctemp9 = *(a_offset3 + 0); + ctemp10 = *(a_offset3 + 1); + ctemp11 = *(a_offset3 + 2); + ctemp12 = *(a_offset3 + 3); + + ctemp13 = *(a_offset4 + 0); + ctemp14 = *(a_offset4 + 1); + ctemp15 = *(a_offset4 + 2); + ctemp16 = *(a_offset4 + 3); + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + *(b_offset1 + 2) = ctemp3; + *(b_offset1 + 3) = ctemp4; + + *(b_offset1 + 4) = ctemp5; + *(b_offset1 + 5) = ctemp6; + *(b_offset1 + 6) = ctemp7; + *(b_offset1 + 7) = ctemp8; + + *(b_offset1 + 8) = ctemp9; + *(b_offset1 + 9) = ctemp10; + *(b_offset1 + 10) = ctemp11; + *(b_offset1 + 11) = ctemp12; + + *(b_offset1 + 12) = ctemp13; + *(b_offset1 + 13) = ctemp14; + *(b_offset1 + 14) = ctemp15; + *(b_offset1 + 15) = ctemp16; + + b_offset1 += m * 4; + i --; + }while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + + ctemp3 = *(a_offset2 + 0); + ctemp4 = *(a_offset2 + 1); + + ctemp5 = *(a_offset3 + 0); + ctemp6 = *(a_offset3 + 1); + + ctemp7 = *(a_offset4 + 0); + ctemp8 = *(a_offset4 + 1); + + a_offset1 += 2; + a_offset2 += 2; + a_offset3 += 2; + a_offset4 += 2; + + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + *(b_offset2 + 2) = ctemp3; + *(b_offset2 + 3) = ctemp4; + + *(b_offset2 + 4) = ctemp5; + *(b_offset2 + 5) = ctemp6; + *(b_offset2 + 6) = ctemp7; + *(b_offset2 + 7) = ctemp8; + + b_offset2 += 8; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset2 + 0); + ctemp3 = *(a_offset3 + 0); + ctemp4 = *(a_offset4 + 0); + + *(b_offset3 + 0) = ctemp1; + *(b_offset3 + 1) = ctemp2; + *(b_offset3 + 2) = ctemp3; + *(b_offset3 + 3) = ctemp4; + + b_offset3 += 4; + } + + j--; + }while(j > 0); + } + + if (m & 2){ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset += 8; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + a_offset1 += 4; + a_offset2 += 4; + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + *(b_offset1 + 2) = ctemp3; + *(b_offset1 + 3) = ctemp4; + + *(b_offset1 + 4) = ctemp5; + *(b_offset1 + 5) = ctemp6; + *(b_offset1 + 6) = ctemp7; + *(b_offset1 + 7) = ctemp8; + + b_offset1 += m * 4; + i --; + }while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + + ctemp3 = *(a_offset2 + 0); + ctemp4 = *(a_offset2 + 1); + + a_offset1 += 2; + a_offset2 += 2; + + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + *(b_offset2 + 2) = ctemp3; + *(b_offset2 + 3) = ctemp4; + + b_offset2 += 4; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset2 + 0); + + *(b_offset3 + 0) = ctemp1; + *(b_offset3 + 1) = ctemp2; + b_offset3 += 2; + } + } + + if (m & 1){ + a_offset1 = a_offset; + b_offset1 = b_offset; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + a_offset1 += 4; + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + *(b_offset1 + 2) = ctemp3; + *(b_offset1 + 3) = ctemp4; + + b_offset1 += 4 * m; + + i --; + }while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + a_offset1 += 2; + + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + *(b_offset3 + 0) = ctemp1; + } + } + + return 0; +} diff --git a/kernel/generic/gemm_tcopy_8.c b/kernel/generic/gemm_tcopy_8.c new file mode 100644 index 0000000..8f6e33c --- /dev/null +++ b/kernel/generic/gemm_tcopy_8.c @@ -0,0 +1,787 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + + FLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; + + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + FLOAT ctemp33, ctemp34, ctemp35, ctemp36; + FLOAT ctemp37, ctemp38, ctemp39, ctemp40; + FLOAT ctemp41, ctemp42, ctemp43, ctemp44; + FLOAT ctemp45, ctemp46, ctemp47, ctemp48; + FLOAT ctemp49, ctemp50, ctemp51, ctemp52; + FLOAT ctemp53, ctemp54, ctemp55, ctemp56; + FLOAT ctemp57, ctemp58, ctemp59, ctemp60; + FLOAT ctemp61, ctemp62, ctemp63, ctemp64; + + aoffset = a; + boffset = b; + +#if 0 + fprintf(stderr, "M = %d N = %d\n", m, n); +#endif + + boffset2 = b + m * (n & ~7); + boffset3 = b + m * (n & ~3); + boffset4 = b + m * (n & ~1); + + j = (m >> 3); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + boffset1 = boffset; + boffset += 64; + + i = (n >> 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + aoffset1 += 8; + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + aoffset2 += 8; + + ctemp17 = *(aoffset3 + 0); + ctemp18 = *(aoffset3 + 1); + ctemp19 = *(aoffset3 + 2); + ctemp20 = *(aoffset3 + 3); + ctemp21 = *(aoffset3 + 4); + ctemp22 = *(aoffset3 + 5); + ctemp23 = *(aoffset3 + 6); + ctemp24 = *(aoffset3 + 7); + aoffset3 += 8; + + ctemp25 = *(aoffset4 + 0); + ctemp26 = *(aoffset4 + 1); + ctemp27 = *(aoffset4 + 2); + ctemp28 = *(aoffset4 + 3); + ctemp29 = *(aoffset4 + 4); + ctemp30 = *(aoffset4 + 5); + ctemp31 = *(aoffset4 + 6); + ctemp32 = *(aoffset4 + 7); + aoffset4 += 8; + + ctemp33 = *(aoffset5 + 0); + ctemp34 = *(aoffset5 + 1); + ctemp35 = *(aoffset5 + 2); + ctemp36 = *(aoffset5 + 3); + ctemp37 = *(aoffset5 + 4); + ctemp38 = *(aoffset5 + 5); + ctemp39 = *(aoffset5 + 6); + ctemp40 = *(aoffset5 + 7); + aoffset5 += 8; + + ctemp41 = *(aoffset6 + 0); + ctemp42 = *(aoffset6 + 1); + ctemp43 = *(aoffset6 + 2); + ctemp44 = *(aoffset6 + 3); + ctemp45 = *(aoffset6 + 4); + ctemp46 = *(aoffset6 + 5); + ctemp47 = *(aoffset6 + 6); + ctemp48 = *(aoffset6 + 7); + aoffset6 += 8; + + ctemp49 = *(aoffset7 + 0); + ctemp50 = *(aoffset7 + 1); + ctemp51 = *(aoffset7 + 2); + ctemp52 = *(aoffset7 + 3); + ctemp53 = *(aoffset7 + 4); + ctemp54 = *(aoffset7 + 5); + ctemp55 = *(aoffset7 + 6); + ctemp56 = *(aoffset7 + 7); + aoffset7 += 8; + + ctemp57 = *(aoffset8 + 0); + ctemp58 = *(aoffset8 + 1); + ctemp59 = *(aoffset8 + 2); + ctemp60 = *(aoffset8 + 3); + ctemp61 = *(aoffset8 + 4); + ctemp62 = *(aoffset8 + 5); + ctemp63 = *(aoffset8 + 6); + ctemp64 = *(aoffset8 + 7); + aoffset8 += 8; + + *(boffset1 + 0) = ctemp01; + *(boffset1 + 1) = ctemp02; + *(boffset1 + 2) = ctemp03; + *(boffset1 + 3) = ctemp04; + *(boffset1 + 4) = ctemp05; + *(boffset1 + 5) = ctemp06; + *(boffset1 + 6) = ctemp07; + *(boffset1 + 7) = ctemp08; + + *(boffset1 + 8) = ctemp09; + *(boffset1 + 9) = ctemp10; + *(boffset1 + 10) = ctemp11; + *(boffset1 + 11) = ctemp12; + *(boffset1 + 12) = ctemp13; + *(boffset1 + 13) = ctemp14; + *(boffset1 + 14) = ctemp15; + *(boffset1 + 15) = ctemp16; + + *(boffset1 + 16) = ctemp17; + *(boffset1 + 17) = ctemp18; + *(boffset1 + 18) = ctemp19; + *(boffset1 + 19) = ctemp20; + *(boffset1 + 20) = ctemp21; + *(boffset1 + 21) = ctemp22; + *(boffset1 + 22) = ctemp23; + *(boffset1 + 23) = ctemp24; + + *(boffset1 + 24) = ctemp25; + *(boffset1 + 25) = ctemp26; + *(boffset1 + 26) = ctemp27; + *(boffset1 + 27) = ctemp28; + *(boffset1 + 28) = ctemp29; + *(boffset1 + 29) = ctemp30; + *(boffset1 + 30) = ctemp31; + *(boffset1 + 31) = ctemp32; + + *(boffset1 + 32) = ctemp33; + *(boffset1 + 33) = ctemp34; + *(boffset1 + 34) = ctemp35; + *(boffset1 + 35) = ctemp36; + *(boffset1 + 36) = ctemp37; + *(boffset1 + 37) = ctemp38; + *(boffset1 + 38) = ctemp39; + *(boffset1 + 39) = ctemp40; + + *(boffset1 + 40) = ctemp41; + *(boffset1 + 41) = ctemp42; + *(boffset1 + 42) = ctemp43; + *(boffset1 + 43) = ctemp44; + *(boffset1 + 44) = ctemp45; + *(boffset1 + 45) = ctemp46; + *(boffset1 + 46) = ctemp47; + *(boffset1 + 47) = ctemp48; + + *(boffset1 + 48) = ctemp49; + *(boffset1 + 49) = ctemp50; + *(boffset1 + 50) = ctemp51; + *(boffset1 + 51) = ctemp52; + *(boffset1 + 52) = ctemp53; + *(boffset1 + 53) = ctemp54; + *(boffset1 + 54) = ctemp55; + *(boffset1 + 55) = ctemp56; + + *(boffset1 + 56) = ctemp57; + *(boffset1 + 57) = ctemp58; + *(boffset1 + 58) = ctemp59; + *(boffset1 + 59) = ctemp60; + *(boffset1 + 60) = ctemp61; + *(boffset1 + 61) = ctemp62; + *(boffset1 + 62) = ctemp63; + *(boffset1 + 63) = ctemp64; + + boffset1 += m * 8; + i --; + }while(i > 0); + } + + if (n & 4){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + aoffset1 += 4; + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + aoffset2 += 4; + + ctemp09 = *(aoffset3 + 0); + ctemp10 = *(aoffset3 + 1); + ctemp11 = *(aoffset3 + 2); + ctemp12 = *(aoffset3 + 3); + aoffset3 += 4; + + ctemp13 = *(aoffset4 + 0); + ctemp14 = *(aoffset4 + 1); + ctemp15 = *(aoffset4 + 2); + ctemp16 = *(aoffset4 + 3); + aoffset4 += 4; + + ctemp17 = *(aoffset5 + 0); + ctemp18 = *(aoffset5 + 1); + ctemp19 = *(aoffset5 + 2); + ctemp20 = *(aoffset5 + 3); + aoffset5 += 4; + + ctemp21 = *(aoffset6 + 0); + ctemp22 = *(aoffset6 + 1); + ctemp23 = *(aoffset6 + 2); + ctemp24 = *(aoffset6 + 3); + aoffset6 += 4; + + ctemp25 = *(aoffset7 + 0); + ctemp26 = *(aoffset7 + 1); + ctemp27 = *(aoffset7 + 2); + ctemp28 = *(aoffset7 + 3); + aoffset7 += 4; + + ctemp29 = *(aoffset8 + 0); + ctemp30 = *(aoffset8 + 1); + ctemp31 = *(aoffset8 + 2); + ctemp32 = *(aoffset8 + 3); + aoffset8 += 4; + + *(boffset2 + 0) = ctemp01; + *(boffset2 + 1) = ctemp02; + *(boffset2 + 2) = ctemp03; + *(boffset2 + 3) = ctemp04; + *(boffset2 + 4) = ctemp05; + *(boffset2 + 5) = ctemp06; + *(boffset2 + 6) = ctemp07; + *(boffset2 + 7) = ctemp08; + *(boffset2 + 8) = ctemp09; + *(boffset2 + 9) = ctemp10; + *(boffset2 + 10) = ctemp11; + *(boffset2 + 11) = ctemp12; + *(boffset2 + 12) = ctemp13; + *(boffset2 + 13) = ctemp14; + *(boffset2 + 14) = ctemp15; + *(boffset2 + 15) = ctemp16; + + *(boffset2 + 16) = ctemp17; + *(boffset2 + 17) = ctemp18; + *(boffset2 + 18) = ctemp19; + *(boffset2 + 19) = ctemp20; + *(boffset2 + 20) = ctemp21; + *(boffset2 + 21) = ctemp22; + *(boffset2 + 22) = ctemp23; + *(boffset2 + 23) = ctemp24; + *(boffset2 + 24) = ctemp25; + *(boffset2 + 25) = ctemp26; + *(boffset2 + 26) = ctemp27; + *(boffset2 + 27) = ctemp28; + *(boffset2 + 28) = ctemp29; + *(boffset2 + 29) = ctemp30; + *(boffset2 + 30) = ctemp31; + *(boffset2 + 31) = ctemp32; + + boffset2 += 32; + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + aoffset1 += 2; + + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + aoffset2 += 2; + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + aoffset3 += 2; + + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + aoffset4 += 2; + + ctemp09 = *(aoffset5 + 0); + ctemp10 = *(aoffset5 + 1); + aoffset5 += 2; + + ctemp11 = *(aoffset6 + 0); + ctemp12 = *(aoffset6 + 1); + aoffset6 += 2; + + ctemp13 = *(aoffset7 + 0); + ctemp14 = *(aoffset7 + 1); + aoffset7 += 2; + + ctemp15 = *(aoffset8 + 0); + ctemp16 = *(aoffset8 + 1); + aoffset8 += 2; + + *(boffset3 + 0) = ctemp01; + *(boffset3 + 1) = ctemp02; + *(boffset3 + 2) = ctemp03; + *(boffset3 + 3) = ctemp04; + *(boffset3 + 4) = ctemp05; + *(boffset3 + 5) = ctemp06; + *(boffset3 + 6) = ctemp07; + *(boffset3 + 7) = ctemp08; + *(boffset3 + 8) = ctemp09; + *(boffset3 + 9) = ctemp10; + *(boffset3 + 10) = ctemp11; + *(boffset3 + 11) = ctemp12; + *(boffset3 + 12) = ctemp13; + *(boffset3 + 13) = ctemp14; + *(boffset3 + 14) = ctemp15; + *(boffset3 + 15) = ctemp16; + boffset3 += 16; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + ctemp02 = *(aoffset2 + 0); + aoffset2 ++; + ctemp03 = *(aoffset3 + 0); + aoffset3 ++; + ctemp04 = *(aoffset4 + 0); + aoffset4 ++; + ctemp05 = *(aoffset5 + 0); + aoffset5 ++; + ctemp06 = *(aoffset6 + 0); + aoffset6 ++; + ctemp07 = *(aoffset7 + 0); + aoffset7 ++; + ctemp08 = *(aoffset8 + 0); + aoffset8 ++; + + *(boffset4 + 0) = ctemp01; + *(boffset4 + 1) = ctemp02; + *(boffset4 + 2) = ctemp03; + *(boffset4 + 3) = ctemp04; + *(boffset4 + 4) = ctemp05; + *(boffset4 + 5) = ctemp06; + *(boffset4 + 6) = ctemp07; + *(boffset4 + 7) = ctemp08; + boffset4 += 8; + } + + j--; + }while(j > 0); + } + + if (m & 4){ + + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + boffset1 = boffset; + boffset += 32; + + i = (n >> 3); + if (i > 0){ + + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + aoffset1 += 8; + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + aoffset2 += 8; + + ctemp17 = *(aoffset3 + 0); + ctemp18 = *(aoffset3 + 1); + ctemp19 = *(aoffset3 + 2); + ctemp20 = *(aoffset3 + 3); + ctemp21 = *(aoffset3 + 4); + ctemp22 = *(aoffset3 + 5); + ctemp23 = *(aoffset3 + 6); + ctemp24 = *(aoffset3 + 7); + aoffset3 += 8; + + ctemp25 = *(aoffset4 + 0); + ctemp26 = *(aoffset4 + 1); + ctemp27 = *(aoffset4 + 2); + ctemp28 = *(aoffset4 + 3); + ctemp29 = *(aoffset4 + 4); + ctemp30 = *(aoffset4 + 5); + ctemp31 = *(aoffset4 + 6); + ctemp32 = *(aoffset4 + 7); + aoffset4 += 8; + + *(boffset1 + 0) = ctemp01; + *(boffset1 + 1) = ctemp02; + *(boffset1 + 2) = ctemp03; + *(boffset1 + 3) = ctemp04; + *(boffset1 + 4) = ctemp05; + *(boffset1 + 5) = ctemp06; + *(boffset1 + 6) = ctemp07; + *(boffset1 + 7) = ctemp08; + + *(boffset1 + 8) = ctemp09; + *(boffset1 + 9) = ctemp10; + *(boffset1 + 10) = ctemp11; + *(boffset1 + 11) = ctemp12; + *(boffset1 + 12) = ctemp13; + *(boffset1 + 13) = ctemp14; + *(boffset1 + 14) = ctemp15; + *(boffset1 + 15) = ctemp16; + + *(boffset1 + 16) = ctemp17; + *(boffset1 + 17) = ctemp18; + *(boffset1 + 18) = ctemp19; + *(boffset1 + 19) = ctemp20; + *(boffset1 + 20) = ctemp21; + *(boffset1 + 21) = ctemp22; + *(boffset1 + 22) = ctemp23; + *(boffset1 + 23) = ctemp24; + + *(boffset1 + 24) = ctemp25; + *(boffset1 + 25) = ctemp26; + *(boffset1 + 26) = ctemp27; + *(boffset1 + 27) = ctemp28; + *(boffset1 + 28) = ctemp29; + *(boffset1 + 29) = ctemp30; + *(boffset1 + 30) = ctemp31; + *(boffset1 + 31) = ctemp32; + + boffset1 += 8 * m; + i --; + }while(i > 0); + } + + if (n & 4) { + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + aoffset1 += 4; + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + aoffset2 += 4; + + ctemp09 = *(aoffset3 + 0); + ctemp10 = *(aoffset3 + 1); + ctemp11 = *(aoffset3 + 2); + ctemp12 = *(aoffset3 + 3); + aoffset3 += 4; + + ctemp13 = *(aoffset4 + 0); + ctemp14 = *(aoffset4 + 1); + ctemp15 = *(aoffset4 + 2); + ctemp16 = *(aoffset4 + 3); + aoffset4 += 4; + + *(boffset2 + 0) = ctemp01; + *(boffset2 + 1) = ctemp02; + *(boffset2 + 2) = ctemp03; + *(boffset2 + 3) = ctemp04; + *(boffset2 + 4) = ctemp05; + *(boffset2 + 5) = ctemp06; + *(boffset2 + 6) = ctemp07; + *(boffset2 + 7) = ctemp08; + + *(boffset2 + 8) = ctemp09; + *(boffset2 + 9) = ctemp10; + *(boffset2 + 10) = ctemp11; + *(boffset2 + 11) = ctemp12; + *(boffset2 + 12) = ctemp13; + *(boffset2 + 13) = ctemp14; + *(boffset2 + 14) = ctemp15; + *(boffset2 + 15) = ctemp16; + boffset2 += 16; + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + aoffset1 += 2; + + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + aoffset2 += 2; + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + aoffset3 += 2; + + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + aoffset4 += 2; + + *(boffset3 + 0) = ctemp01; + *(boffset3 + 1) = ctemp02; + *(boffset3 + 2) = ctemp03; + *(boffset3 + 3) = ctemp04; + *(boffset3 + 4) = ctemp05; + *(boffset3 + 5) = ctemp06; + *(boffset3 + 6) = ctemp07; + *(boffset3 + 7) = ctemp08; + boffset3 += 8; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + ctemp02 = *(aoffset2 + 0); + aoffset2 ++; + ctemp03 = *(aoffset3 + 0); + aoffset3 ++; + ctemp04 = *(aoffset4 + 0); + aoffset4 ++; + + *(boffset4 + 0) = ctemp01; + *(boffset4 + 1) = ctemp02; + *(boffset4 + 2) = ctemp03; + *(boffset4 + 3) = ctemp04; + boffset4 += 4; + } + } + + if (m & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + boffset1 = boffset; + boffset += 16; + + i = (n >> 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + aoffset1 += 8; + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + aoffset2 += 8; + + *(boffset1 + 0) = ctemp01; + *(boffset1 + 1) = ctemp02; + *(boffset1 + 2) = ctemp03; + *(boffset1 + 3) = ctemp04; + *(boffset1 + 4) = ctemp05; + *(boffset1 + 5) = ctemp06; + *(boffset1 + 6) = ctemp07; + *(boffset1 + 7) = ctemp08; + + *(boffset1 + 8) = ctemp09; + *(boffset1 + 9) = ctemp10; + *(boffset1 + 10) = ctemp11; + *(boffset1 + 11) = ctemp12; + *(boffset1 + 12) = ctemp13; + *(boffset1 + 13) = ctemp14; + *(boffset1 + 14) = ctemp15; + *(boffset1 + 15) = ctemp16; + + boffset1 += 8 * m; + i --; + }while(i > 0); + } + + if (n & 4){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + aoffset1 += 4; + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + aoffset2 += 4; + + *(boffset2 + 0) = ctemp01; + *(boffset2 + 1) = ctemp02; + *(boffset2 + 2) = ctemp03; + *(boffset2 + 3) = ctemp04; + *(boffset2 + 4) = ctemp05; + *(boffset2 + 5) = ctemp06; + *(boffset2 + 6) = ctemp07; + *(boffset2 + 7) = ctemp08; + boffset2 += 8; + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + aoffset1 += 2; + + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + aoffset2 += 2; + + *(boffset3 + 0) = ctemp01; + *(boffset3 + 1) = ctemp02; + *(boffset3 + 2) = ctemp03; + *(boffset3 + 3) = ctemp04; + boffset3 += 4; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + ctemp02 = *(aoffset2 + 0); + aoffset2 ++; + + *(boffset4 + 0) = ctemp01; + *(boffset4 + 1) = ctemp02; + boffset4 += 2; + } + } + + if (m & 1){ + aoffset1 = aoffset; + aoffset += lda; + + boffset1 = boffset; + boffset += 8; + + i = (n >> 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + aoffset1 += 8; + + *(boffset1 + 0) = ctemp01; + *(boffset1 + 1) = ctemp02; + *(boffset1 + 2) = ctemp03; + *(boffset1 + 3) = ctemp04; + *(boffset1 + 4) = ctemp05; + *(boffset1 + 5) = ctemp06; + *(boffset1 + 6) = ctemp07; + *(boffset1 + 7) = ctemp08; + + boffset1 += 8 * m; + i --; + }while(i > 0); + } + + if (n & 4){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + aoffset1 += 4; + + *(boffset2 + 0) = ctemp01; + *(boffset2 + 1) = ctemp02; + *(boffset2 + 2) = ctemp03; + *(boffset2 + 3) = ctemp04; + boffset2 += 4; + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + aoffset1 += 2; + + *(boffset3 + 0) = ctemp01; + *(boffset3 + 1) = ctemp02; + boffset3 += 2; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + *(boffset4 + 0) = ctemp01; + boffset4 ++; + } + } + + return 0; +} diff --git a/kernel/generic/ger.c b/kernel/generic/ger.c new file mode 100644 index 0000000..2438786 --- /dev/null +++ b/kernel/generic/ger.c @@ -0,0 +1,63 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, + FLOAT *x, BLASLONG incx, + FLOAT *y, BLASLONG incy, + FLOAT *a, BLASLONG lda, FLOAT *buffer){ + + FLOAT *X = x; + + if (incx != 1) { + X = buffer; + COPY_K(m, x, incx, X, 1); + } + + while (n > 0) { + AXPYU_K(m, 0, 0, alpha * *y, X, 1, a, 1, NULL, 0); + a += lda; + y += incy; + n --; + } + + return 0; +} + diff --git a/kernel/generic/laswp_ncopy_1.c b/kernel/generic/laswp_ncopy_1.c new file mode 100644 index 0000000..4394474 --- /dev/null +++ b/kernel/generic/laswp_ncopy_1.c @@ -0,0 +1,154 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#define a2 (a1 + 1) + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1; + FLOAT *b1, *b2; + FLOAT A1, A2, B1, B2; + + a--; + k1 --; + + ipiv += k1; + + if (n <= 0) return 0; + + + j = n; + do { + piv = ipiv; + + a1 = a + k1 + 1; + + ip1 = *(piv + 0); + ip2 = *(piv + 1); + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + + ip1 = *(piv + 0); + ip2 = *(piv + 1); + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = B2; + + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A2; + *(buffer + 1) = A1; + } else { + *(buffer + 0) = A2; + *(buffer + 1) = B2; + *b2 = A1; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = A2; + *b1 = A1; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = A1; + *b1 = A2; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *b1 = A1; + *b2 = A2; + } + } + + buffer += 2; + + b1 = a + ip1; + b2 = a + ip2; + + a1 += 2; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + + if (a1 == b1) { + *(buffer + 0) = A1; + } else { + *(buffer + 0) = B1; + *b1 = A1; + } + } + a += lda; + j --; + } while (j > 0); + + return 0; +} + diff --git a/kernel/generic/laswp_ncopy_2.c b/kernel/generic/laswp_ncopy_2.c new file mode 100644 index 0000000..806a1e1 --- /dev/null +++ b/kernel/generic/laswp_ncopy_2.c @@ -0,0 +1,293 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#define PREFETCHSIZE 12 + +#define a2 (a1 + 1) +#define a4 (a3 + 1) + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1, *a3; + FLOAT *b1, *b2, *b3, *b4; + FLOAT A1, A2, B1, B2, A3, A4, B3, B4; + + a--; + k1 --; + + ipiv += k1; + + if (n <= 0) return 0; + + j = (n >> 1); + if (j > 0) { + do { + piv = ipiv; + + a1 = a + k1 + 1; + a3 = a1 + 1 * lda; + + ip1 = *(piv + 0); + ip2 = *(piv + 1); + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + +#ifdef __GNUC__ + __builtin_prefetch(a1 + PREFETCHSIZE, 0, 0); + __builtin_prefetch(a3 + PREFETCHSIZE, 0, 0); +#endif + + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + + ip1 = *(piv + 0); + ip2 = *(piv + 1); + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A3; + *(buffer + 2) = A2; + *(buffer + 3) = A4; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A3; + *(buffer + 2) = B2; + *(buffer + 3) = B4; + + *b2 = A2; + *b4 = A4; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A2; + *(buffer + 1) = A4; + *(buffer + 2) = A1; + *(buffer + 3) = A3; + } else { + *(buffer + 0) = A2; + *(buffer + 1) = A4; + *(buffer + 2) = B2; + *(buffer + 3) = B4; + *b2 = A1; + *b4 = A3; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B3; + *(buffer + 2) = A2; + *(buffer + 3) = A4; + *b1 = A1; + *b3 = A3; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B3; + *(buffer + 2) = A1; + *(buffer + 3) = A3; + *b1 = A2; + *b3 = A4; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B3; + *(buffer + 2) = B2; + *(buffer + 3) = B4; + *b1 = A1; + *b2 = A2; + *b3 = A3; + *b4 = A4; + } + } + + buffer += 4; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + + a1 += 2; + a3 += 2; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + A3 = *a3; + B3 = *b3; + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A3; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B3; + *b1 = A1; + *b3 = A3; + } + buffer += 2; + } + + a += 2 * lda; + j --; + } while (j > 0); + } + + if (n & 1) { + piv = ipiv; + + a1 = a + k1 + 1; + + ip1 = *(piv + 0); + ip2 = *(piv + 1); + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + + ip1 = *(piv + 0); + ip2 = *(piv + 1); + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = B2; + + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A2; + *(buffer + 1) = A1; + } else { + *(buffer + 0) = A2; + *(buffer + 1) = B2; + *b2 = A1; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = A2; + *b1 = A1; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = A1; + *b1 = A2; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *b1 = A1; + *b2 = A2; + } + } + + buffer += 2; + + b1 = a + ip1; + b2 = a + ip2; + + a1 += 2; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + + if (a1 == b1) { + *(buffer + 0) = A1; + } else { + *(buffer + 0) = B1; + *b1 = A1; + } + } + } + + return 0; +} + diff --git a/kernel/generic/laswp_ncopy_4.c b/kernel/generic/laswp_ncopy_4.c new file mode 100644 index 0000000..0736f07 --- /dev/null +++ b/kernel/generic/laswp_ncopy_4.c @@ -0,0 +1,503 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#define PREFETCHSIZE 8 + +#define a2 (a1 + 1) +#define a4 (a3 + 1) +#define a6 (a5 + 1) +#define a8 (a7 + 1) + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1, *a3, *a5, *a7; + FLOAT *b1, *b2, *b3, *b4; + FLOAT *b5, *b6, *b7, *b8; + FLOAT A1, A2, B1, B2, A3, A4, B3, B4; + FLOAT A5, A6, B5, B6, A7, A8, B7, B8; + + a--; + k1 --; + + ipiv += k1; + + if (n <= 0) return 0; + + j = (n >> 2); + if (j > 0) { + do { + piv = ipiv; + + a1 = a + k1 + 1; + + a3 = a1 + 1 * lda; + a5 = a1 + 2 * lda; + a7 = a1 + 3 * lda; + + ip1 = *(piv + 0); + ip2 = *(piv + 1); + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + +#ifdef __GNUC__ + __builtin_prefetch(a1 + PREFETCHSIZE, 0, 0); + __builtin_prefetch(a3 + PREFETCHSIZE, 0, 0); + __builtin_prefetch(a5 + PREFETCHSIZE, 0, 0); + __builtin_prefetch(a7 + PREFETCHSIZE, 0, 0); +#endif + + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + A5 = *a5; + A6 = *a6; + A7 = *a7; + A8 = *a8; + + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + B5 = *b5; + B6 = *b6; + B7 = *b7; + B8 = *b8; + + ip1 = *(piv + 0); + ip2 = *(piv + 1); + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A3; + *(buffer + 2) = A5; + *(buffer + 3) = A7; + + *(buffer + 4) = A2; + *(buffer + 5) = A4; + *(buffer + 6) = A6; + *(buffer + 7) = A8; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A3; + *(buffer + 2) = A5; + *(buffer + 3) = A7; + + *(buffer + 4) = B2; + *(buffer + 5) = B4; + *(buffer + 6) = B6; + *(buffer + 7) = B8; + + *b2 = A2; + *b4 = A4; + *b6 = A6; + *b8 = A8; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A2; + *(buffer + 1) = A4; + *(buffer + 2) = A6; + *(buffer + 3) = A8; + *(buffer + 4) = A1; + *(buffer + 5) = A3; + *(buffer + 6) = A5; + *(buffer + 7) = A7; + } else { + *(buffer + 0) = A2; + *(buffer + 1) = A4; + *(buffer + 2) = A6; + *(buffer + 3) = A8; + *(buffer + 4) = B2; + *(buffer + 5) = B4; + *(buffer + 6) = B6; + *(buffer + 7) = B8; + *b2 = A1; + *b4 = A3; + *b6 = A5; + *b8 = A7; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B3; + *(buffer + 2) = B5; + *(buffer + 3) = B7; + *(buffer + 4) = A2; + *(buffer + 5) = A4; + *(buffer + 6) = A6; + *(buffer + 7) = A8; + *b1 = A1; + *b3 = A3; + *b5 = A5; + *b7 = A7; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B3; + *(buffer + 2) = B5; + *(buffer + 3) = B7; + *(buffer + 4) = A1; + *(buffer + 5) = A3; + *(buffer + 6) = A5; + *(buffer + 7) = A7; + *b1 = A2; + *b3 = A4; + *b5 = A6; + *b7 = A8; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B3; + *(buffer + 2) = B5; + *(buffer + 3) = B7; + *(buffer + 4) = B2; + *(buffer + 5) = B4; + *(buffer + 6) = B6; + *(buffer + 7) = B8; + *b1 = A1; + *b2 = A2; + *b3 = A3; + *b4 = A4; + *b5 = A5; + *b6 = A6; + *b7 = A7; + *b8 = A8; + } + } + + buffer += 8; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + a1 += 2; + a3 += 2; + a5 += 2; + a7 += 2; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + A3 = *a3; + B3 = *b3; + A5 = *a5; + B5 = *b5; + A7 = *a7; + B7 = *b7; + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A3; + *(buffer + 2) = A5; + *(buffer + 3) = A7; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B3; + *(buffer + 2) = B5; + *(buffer + 3) = B7; + *b1 = A1; + *b3 = A3; + *b5 = A5; + *b7 = A7; + } + buffer += 4; + } + + a += 4 * lda; + + j --; + } while (j > 0); + } + + if (n & 2) { + piv = ipiv; + + a1 = a + k1 + 1; + a3 = a1 + 1 * lda; + + ip1 = *(piv + 0); + ip2 = *(piv + 1); + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + + ip1 = *(piv + 0); + ip2 = *(piv + 1); + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A3; + *(buffer + 2) = A2; + *(buffer + 3) = A4; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A3; + *(buffer + 2) = B2; + *(buffer + 3) = B4; + + *b2 = A2; + *b4 = A4; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A2; + *(buffer + 1) = A4; + *(buffer + 2) = A1; + *(buffer + 3) = A3; + } else { + *(buffer + 0) = A2; + *(buffer + 1) = A4; + *(buffer + 2) = B2; + *(buffer + 3) = B4; + *b2 = A1; + *b4 = A3; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B3; + *(buffer + 2) = A2; + *(buffer + 3) = A4; + *b1 = A1; + *b3 = A3; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B3; + *(buffer + 2) = A1; + *(buffer + 3) = A3; + *b1 = A2; + *b3 = A4; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B3; + *(buffer + 2) = B2; + *(buffer + 3) = B4; + *b1 = A1; + *b2 = A2; + *b3 = A3; + *b4 = A4; + } + } + + buffer += 4; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + + a1 += 2; + a3 += 2; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + A3 = *a3; + B3 = *b3; + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A3; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B3; + *b1 = A1; + *b3 = A3; + } + buffer += 2; + } + + a += 2 * lda; + } + + if (n & 1) { + piv = ipiv; + + a1 = a + k1 + 1; + + ip1 = *(piv + 0); + ip2 = *(piv + 1); + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + + ip1 = *(piv + 0); + ip2 = *(piv + 1); + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = B2; + + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A2; + *(buffer + 1) = A1; + } else { + *(buffer + 0) = A2; + *(buffer + 1) = B2; + *b2 = A1; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = A2; + *b1 = A1; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = A1; + *b1 = A2; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *b1 = A1; + *b2 = A2; + } + } + + buffer += 2; + + b1 = a + ip1; + b2 = a + ip2; + + a1 += 2; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + + if (a1 == b1) { + *(buffer + 0) = A1; + } else { + *(buffer + 0) = B1; + *b1 = A1; + } + } + } + + return 0; +} + diff --git a/kernel/generic/laswp_ncopy_8.c b/kernel/generic/laswp_ncopy_8.c new file mode 100644 index 0000000..e08c8ce --- /dev/null +++ b/kernel/generic/laswp_ncopy_8.c @@ -0,0 +1,296 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#define PREFETCHSIZE 4 + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ + + BLASLONG i, j, ip; + blasint *piv; + FLOAT *dx1, *dy1; + FLOAT *dx2, *dy2; + FLOAT *dx3, *dy3; + FLOAT *dx4, *dy4; + FLOAT *dx5, *dy5; + FLOAT *dx6, *dy6; + FLOAT *dx7, *dy7; + FLOAT *dx8, *dy8; + FLOAT atemp1, btemp1; + FLOAT atemp2, btemp2; + FLOAT atemp3, btemp3; + FLOAT atemp4, btemp4; + FLOAT atemp5, btemp5; + FLOAT atemp6, btemp6; + FLOAT atemp7, btemp7; + FLOAT atemp8, btemp8; + + a--; + ipiv += k1 - 1; + + if (n <= 0) return 0; + if (k1 > k2) return 0; + + j = (n >> 3); + if (j > 0) { + do { + piv = ipiv; + i = k1; + + do { + ip = *piv; + piv ++; + + dx1 = a + i; + dy1 = a + ip; + dx2 = a + i + lda * 1; + dy2 = a + ip + lda * 1; + dx3 = a + i + lda * 2; + dy3 = a + ip + lda * 2; + dx4 = a + i + lda * 3; + dy4 = a + ip + lda * 3; + dx5 = a + i + lda * 4; + dy5 = a + ip + lda * 4; + dx6 = a + i + lda * 5; + dy6 = a + ip + lda * 5; + dx7 = a + i + lda * 6; + dy7 = a + ip + lda * 6; + dx8 = a + i + lda * 7; + dy8 = a + ip + lda * 7; + +#ifdef __GNUC__ + __builtin_prefetch(dx1 + PREFETCHSIZE, 0, 1); + __builtin_prefetch(dx2 + PREFETCHSIZE, 0, 1); + __builtin_prefetch(dx3 + PREFETCHSIZE, 0, 1); + __builtin_prefetch(dx4 + PREFETCHSIZE, 0, 1); + __builtin_prefetch(dx5 + PREFETCHSIZE, 0, 1); + __builtin_prefetch(dx6 + PREFETCHSIZE, 0, 1); + __builtin_prefetch(dx7 + PREFETCHSIZE, 0, 1); + __builtin_prefetch(dx8 + PREFETCHSIZE, 0, 1); +#endif + + atemp1 = *dx1; + btemp1 = *dy1; + atemp2 = *dx2; + btemp2 = *dy2; + atemp3 = *dx3; + btemp3 = *dy3; + atemp4 = *dx4; + btemp4 = *dy4; + + atemp5 = *dx5; + btemp5 = *dy5; + atemp6 = *dx6; + btemp6 = *dy6; + atemp7 = *dx7; + btemp7 = *dy7; + atemp8 = *dx8; + btemp8 = *dy8; + + if (ip != i) { + *dy1 = atemp1; + *dy2 = atemp2; + *dy3 = atemp3; + *dy4 = atemp4; + *dy5 = atemp5; + *dy6 = atemp6; + *dy7 = atemp7; + *dy8 = atemp8; + *(buffer + 0) = btemp1; + *(buffer + 1) = btemp2; + *(buffer + 2) = btemp3; + *(buffer + 3) = btemp4; + *(buffer + 4) = btemp5; + *(buffer + 5) = btemp6; + *(buffer + 6) = btemp7; + *(buffer + 7) = btemp8; + } else { + *(buffer + 0) = atemp1; + *(buffer + 1) = atemp2; + *(buffer + 2) = atemp3; + *(buffer + 3) = atemp4; + *(buffer + 4) = atemp5; + *(buffer + 5) = atemp6; + *(buffer + 6) = atemp7; + *(buffer + 7) = atemp8; + } + + buffer += 8; + + i++; + } while (i <= k2); + + a += 8 * lda; + j --; + } while (j > 0); + } + + if (n & 4) { + piv = ipiv; + + ip = *piv; + piv ++; + + dx1 = a + k1; + dy1 = a + ip; + dx2 = a + k1 + lda * 1; + dy2 = a + ip + lda * 1; + dx3 = a + k1 + lda * 2; + dy3 = a + ip + lda * 2; + dx4 = a + k1 + lda * 3; + dy4 = a + ip + lda * 3; + + i = k1; + + do { + atemp1 = *dx1; + atemp2 = *dx2; + atemp3 = *dx3; + atemp4 = *dx4; + + btemp1 = *dy1; + btemp2 = *dy2; + btemp3 = *dy3; + btemp4 = *dy4; + + if (ip != i) { + *dy1 = atemp1; + *dy2 = atemp2; + *dy3 = atemp3; + *dy4 = atemp4; + *(buffer + 0) = btemp1; + *(buffer + 1) = btemp2; + *(buffer + 2) = btemp3; + *(buffer + 3) = btemp4; + } else { + *(buffer + 0) = atemp1; + *(buffer + 1) = atemp2; + *(buffer + 2) = atemp3; + *(buffer + 3) = atemp4; + } + + ip = *piv; + piv ++; + + i++; + dx1 = a + i; + dy1 = a + ip; + dx2 = a + i + lda * 1; + dy2 = a + ip + lda * 1; + dx3 = a + i + lda * 2; + dy3 = a + ip + lda * 2; + dx4 = a + i + lda * 3; + dy4 = a + ip + lda * 3; + + buffer += 4; + + } while (i <= k2); + + a += 4 * lda; + } + + if (n & 2) { + piv = ipiv; + + i = k1; + do { + ip = *piv; + piv ++; + + dx1 = a + i; + dy1 = a + ip; + dx2 = a + i + lda; + dy2 = a + ip + lda; + + atemp1 = *dx1; + btemp1 = *dy1; + atemp2 = *dx2; + btemp2 = *dy2; + + if (ip != i) { + *dy1 = atemp1; + *dy2 = atemp2; + *(buffer + 0) = btemp1; + *(buffer + 1) = btemp2; + } else { + *(buffer + 0) = atemp1; + *(buffer + 1) = atemp2; + } + + buffer += 2; + + i++; + } while (i <= k2); + + a += 2 * lda; + } + + + if (n & 1) { + piv = ipiv; + + i = k1; + do { + ip = *piv; + piv ++; + + dx1 = a + i; + dy1 = a + ip; + atemp1 = *dx1; + btemp1 = *dy1; + + if (ip != i) { + *dy1 = atemp1; + *buffer = btemp1; + } else { + *buffer = atemp1; + } + + buffer ++; + + i++; + } while (i <= k2); + + a += lda; + } + + return 0; +} + diff --git a/kernel/generic/lsame.c b/kernel/generic/lsame.c new file mode 100644 index 0000000..cae8b4a --- /dev/null +++ b/kernel/generic/lsame.c @@ -0,0 +1,50 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include + +int NAME(char *A, char *B){ + + char a = *A; + char b = *B; + + if (a > 96) a -= 32; + if (b > 96) b -= 32; + + return (a == b); +} diff --git a/kernel/generic/neg_tcopy_1.c b/kernel/generic/neg_tcopy_1.c new file mode 100644 index 0000000..3845f04 --- /dev/null +++ b/kernel/generic/neg_tcopy_1.c @@ -0,0 +1,75 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1; + FLOAT *b_offset, *b_offset1; + + a_offset = a; + b_offset = b; + + i = m; + + if (i > 0) { + do { + a_offset1 = a_offset; + a_offset += lda; + + b_offset1 = b_offset; + b_offset ++; + + j = n; + if (j > 0) { + do { + *(b_offset1 + 0) = -*(a_offset1 + 0); + a_offset1 ++; + b_offset1 += m; + j --; + } while (j > 0); + } + i --; + } while (i > 0); + } + + return 0; +} diff --git a/kernel/generic/neg_tcopy_16.c b/kernel/generic/neg_tcopy_16.c new file mode 100644 index 0000000..2d47b27 --- /dev/null +++ b/kernel/generic/neg_tcopy_16.c @@ -0,0 +1,387 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2; + FLOAT *boffset; + + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + + aoffset = a; + boffset = b; + +#if 0 + fprintf(stderr, "m = %d n = %d\n", m, n); +#endif + + j = (n >> 4); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 16; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + + ctemp17 = *(aoffset2 + 0); + ctemp18 = *(aoffset2 + 1); + ctemp19 = *(aoffset2 + 2); + ctemp20 = *(aoffset2 + 3); + ctemp21 = *(aoffset2 + 4); + ctemp22 = *(aoffset2 + 5); + ctemp23 = *(aoffset2 + 6); + ctemp24 = *(aoffset2 + 7); + ctemp25 = *(aoffset2 + 8); + ctemp26 = *(aoffset2 + 9); + ctemp27 = *(aoffset2 + 10); + ctemp28 = *(aoffset2 + 11); + ctemp29 = *(aoffset2 + 12); + ctemp30 = *(aoffset2 + 13); + ctemp31 = *(aoffset2 + 14); + ctemp32 = *(aoffset2 + 15); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + *(boffset + 8) = -ctemp09; + *(boffset + 9) = -ctemp10; + *(boffset + 10) = -ctemp11; + *(boffset + 11) = -ctemp12; + *(boffset + 12) = -ctemp13; + *(boffset + 13) = -ctemp14; + *(boffset + 14) = -ctemp15; + *(boffset + 15) = -ctemp16; + + *(boffset + 16) = -ctemp17; + *(boffset + 17) = -ctemp18; + *(boffset + 18) = -ctemp19; + *(boffset + 19) = -ctemp20; + *(boffset + 20) = -ctemp21; + *(boffset + 21) = -ctemp22; + *(boffset + 22) = -ctemp23; + *(boffset + 23) = -ctemp24; + + *(boffset + 24) = -ctemp25; + *(boffset + 25) = -ctemp26; + *(boffset + 26) = -ctemp27; + *(boffset + 27) = -ctemp28; + *(boffset + 28) = -ctemp29; + *(boffset + 29) = -ctemp30; + *(boffset + 30) = -ctemp31; + *(boffset + 31) = -ctemp32; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 32; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + *(boffset + 8) = -ctemp09; + *(boffset + 9) = -ctemp10; + *(boffset + 10) = -ctemp11; + *(boffset + 11) = -ctemp12; + *(boffset + 12) = -ctemp13; + *(boffset + 13) = -ctemp14; + *(boffset + 14) = -ctemp15; + *(boffset + 15) = -ctemp16; + + boffset += 16; + } + + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 8){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 8; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + *(boffset + 8) = -ctemp09; + *(boffset + 9) = -ctemp10; + *(boffset + 10) = -ctemp11; + *(boffset + 11) = -ctemp12; + *(boffset + 12) = -ctemp13; + *(boffset + 13) = -ctemp14; + *(boffset + 14) = -ctemp15; + *(boffset + 15) = -ctemp16; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 16; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + boffset += 8; + } + } + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 4; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 8; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + + boffset += 4; + } + } + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 2; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 4; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + boffset += 2; + } + } + + if (n & 1){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 2; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + *(boffset + 0) = -ctemp01; + boffset += 1; + } + } + + return 0; +} diff --git a/kernel/generic/neg_tcopy_2.c b/kernel/generic/neg_tcopy_2.c new file mode 100644 index 0000000..e4dfa0b --- /dev/null +++ b/kernel/generic/neg_tcopy_2.c @@ -0,0 +1,105 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2; + FLOAT *b_offset, *b_offset1, *b_offset2; + + a_offset = a; + b_offset = b; + b_offset2 = b + m * (n & ~1); + + i = (m >> 1); + + if (i > 0) { + do { + a_offset1 = a_offset; + a_offset2 = a_offset + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset += 4; + + j = (n >> 1); + if (j > 0){ + do { + *(b_offset1 + 0) = -*(a_offset1 + 0); + *(b_offset1 + 1) = -*(a_offset1 + 1); + *(b_offset1 + 2) = -*(a_offset2 + 0); + *(b_offset1 + 3) = -*(a_offset2 + 1); + a_offset1 += 2; + a_offset2 += 2; + b_offset1 += m * 2; + j--; + } while (j > 0); + } + + if (n & 1){ + *(b_offset2 + 0) = -*(a_offset1 + 0); + *(b_offset2 + 1) = -*(a_offset2 + 0); + b_offset2 += 2; + } + i --; + } while (i > 0); + } + + if (m & 1) { + j = (n >> 1); + if (j > 0){ + do { + *(b_offset + 0) = -*(a_offset + 0); + *(b_offset + 1) = -*(a_offset + 1); + a_offset += 2; + b_offset += m * 2; + j--; + } while (j > 0); + } + + if (n & 1){ + *(b_offset2 + 0) = -*(a_offset + 0); + } + } + + return 0; +} diff --git a/kernel/generic/neg_tcopy_4.c b/kernel/generic/neg_tcopy_4.c new file mode 100644 index 0000000..9fb1dc7 --- /dev/null +++ b/kernel/generic/neg_tcopy_4.c @@ -0,0 +1,281 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + FLOAT ctemp9, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + + a_offset = a; + b_offset = b; + + b_offset2 = b + m * (n & ~3); + b_offset3 = b + m * (n & ~1); + + j = (m >> 2); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + b_offset1 = b_offset; + b_offset += 16; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + ctemp9 = *(a_offset3 + 0); + ctemp10 = *(a_offset3 + 1); + ctemp11 = *(a_offset3 + 2); + ctemp12 = *(a_offset3 + 3); + + ctemp13 = *(a_offset4 + 0); + ctemp14 = *(a_offset4 + 1); + ctemp15 = *(a_offset4 + 2); + ctemp16 = *(a_offset4 + 3); + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + + *(b_offset1 + 0) = -ctemp1; + *(b_offset1 + 1) = -ctemp2; + *(b_offset1 + 2) = -ctemp3; + *(b_offset1 + 3) = -ctemp4; + + *(b_offset1 + 4) = -ctemp5; + *(b_offset1 + 5) = -ctemp6; + *(b_offset1 + 6) = -ctemp7; + *(b_offset1 + 7) = -ctemp8; + + *(b_offset1 + 8) = -ctemp9; + *(b_offset1 + 9) = -ctemp10; + *(b_offset1 + 10) = -ctemp11; + *(b_offset1 + 11) = -ctemp12; + + *(b_offset1 + 12) = -ctemp13; + *(b_offset1 + 13) = -ctemp14; + *(b_offset1 + 14) = -ctemp15; + *(b_offset1 + 15) = -ctemp16; + + b_offset1 += m * 4; + i --; + }while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + + ctemp3 = *(a_offset2 + 0); + ctemp4 = *(a_offset2 + 1); + + ctemp5 = *(a_offset3 + 0); + ctemp6 = *(a_offset3 + 1); + + ctemp7 = *(a_offset4 + 0); + ctemp8 = *(a_offset4 + 1); + + a_offset1 += 2; + a_offset2 += 2; + a_offset3 += 2; + a_offset4 += 2; + + *(b_offset2 + 0) = -ctemp1; + *(b_offset2 + 1) = -ctemp2; + *(b_offset2 + 2) = -ctemp3; + *(b_offset2 + 3) = -ctemp4; + + *(b_offset2 + 4) = -ctemp5; + *(b_offset2 + 5) = -ctemp6; + *(b_offset2 + 6) = -ctemp7; + *(b_offset2 + 7) = -ctemp8; + + b_offset2 += 8; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset2 + 0); + ctemp3 = *(a_offset3 + 0); + ctemp4 = *(a_offset4 + 0); + + *(b_offset3 + 0) = -ctemp1; + *(b_offset3 + 1) = -ctemp2; + *(b_offset3 + 2) = -ctemp3; + *(b_offset3 + 3) = -ctemp4; + + b_offset3 += 4; + } + + j--; + }while(j > 0); + } + + if (m & 2){ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset += 8; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + a_offset1 += 4; + a_offset2 += 4; + + *(b_offset1 + 0) = -ctemp1; + *(b_offset1 + 1) = -ctemp2; + *(b_offset1 + 2) = -ctemp3; + *(b_offset1 + 3) = -ctemp4; + + *(b_offset1 + 4) = -ctemp5; + *(b_offset1 + 5) = -ctemp6; + *(b_offset1 + 6) = -ctemp7; + *(b_offset1 + 7) = -ctemp8; + + b_offset1 += m * 4; + i --; + }while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + + ctemp3 = *(a_offset2 + 0); + ctemp4 = *(a_offset2 + 1); + + a_offset1 += 2; + a_offset2 += 2; + + *(b_offset2 + 0) = -ctemp1; + *(b_offset2 + 1) = -ctemp2; + *(b_offset2 + 2) = -ctemp3; + *(b_offset2 + 3) = -ctemp4; + + b_offset2 += 4; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset2 + 0); + + *(b_offset3 + 0) = -ctemp1; + *(b_offset3 + 1) = -ctemp2; + b_offset3 += 2; + } + } + + if (m & 1){ + a_offset1 = a_offset; + b_offset1 = b_offset; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + a_offset1 += 4; + + *(b_offset1 + 0) = -ctemp1; + *(b_offset1 + 1) = -ctemp2; + *(b_offset1 + 2) = -ctemp3; + *(b_offset1 + 3) = -ctemp4; + + b_offset1 += 4 * m; + + i --; + }while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + a_offset1 += 2; + + *(b_offset2 + 0) = -ctemp1; + *(b_offset2 + 1) = -ctemp2; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + *(b_offset3 + 0) = -ctemp1; + } + } + + return 0; +} diff --git a/kernel/generic/neg_tcopy_8.c b/kernel/generic/neg_tcopy_8.c new file mode 100644 index 0000000..97fec3b --- /dev/null +++ b/kernel/generic/neg_tcopy_8.c @@ -0,0 +1,787 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + + FLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; + + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + FLOAT ctemp33, ctemp34, ctemp35, ctemp36; + FLOAT ctemp37, ctemp38, ctemp39, ctemp40; + FLOAT ctemp41, ctemp42, ctemp43, ctemp44; + FLOAT ctemp45, ctemp46, ctemp47, ctemp48; + FLOAT ctemp49, ctemp50, ctemp51, ctemp52; + FLOAT ctemp53, ctemp54, ctemp55, ctemp56; + FLOAT ctemp57, ctemp58, ctemp59, ctemp60; + FLOAT ctemp61, ctemp62, ctemp63, ctemp64; + + aoffset = a; + boffset = b; + +#if 0 + fprintf(stderr, "M = %d N = %d\n", m, n); +#endif + + boffset2 = b + m * (n & ~7); + boffset3 = b + m * (n & ~3); + boffset4 = b + m * (n & ~1); + + j = (m >> 3); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + boffset1 = boffset; + boffset += 64; + + i = (n >> 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + aoffset1 += 8; + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + aoffset2 += 8; + + ctemp17 = *(aoffset3 + 0); + ctemp18 = *(aoffset3 + 1); + ctemp19 = *(aoffset3 + 2); + ctemp20 = *(aoffset3 + 3); + ctemp21 = *(aoffset3 + 4); + ctemp22 = *(aoffset3 + 5); + ctemp23 = *(aoffset3 + 6); + ctemp24 = *(aoffset3 + 7); + aoffset3 += 8; + + ctemp25 = *(aoffset4 + 0); + ctemp26 = *(aoffset4 + 1); + ctemp27 = *(aoffset4 + 2); + ctemp28 = *(aoffset4 + 3); + ctemp29 = *(aoffset4 + 4); + ctemp30 = *(aoffset4 + 5); + ctemp31 = *(aoffset4 + 6); + ctemp32 = *(aoffset4 + 7); + aoffset4 += 8; + + ctemp33 = *(aoffset5 + 0); + ctemp34 = *(aoffset5 + 1); + ctemp35 = *(aoffset5 + 2); + ctemp36 = *(aoffset5 + 3); + ctemp37 = *(aoffset5 + 4); + ctemp38 = *(aoffset5 + 5); + ctemp39 = *(aoffset5 + 6); + ctemp40 = *(aoffset5 + 7); + aoffset5 += 8; + + ctemp41 = *(aoffset6 + 0); + ctemp42 = *(aoffset6 + 1); + ctemp43 = *(aoffset6 + 2); + ctemp44 = *(aoffset6 + 3); + ctemp45 = *(aoffset6 + 4); + ctemp46 = *(aoffset6 + 5); + ctemp47 = *(aoffset6 + 6); + ctemp48 = *(aoffset6 + 7); + aoffset6 += 8; + + ctemp49 = *(aoffset7 + 0); + ctemp50 = *(aoffset7 + 1); + ctemp51 = *(aoffset7 + 2); + ctemp52 = *(aoffset7 + 3); + ctemp53 = *(aoffset7 + 4); + ctemp54 = *(aoffset7 + 5); + ctemp55 = *(aoffset7 + 6); + ctemp56 = *(aoffset7 + 7); + aoffset7 += 8; + + ctemp57 = *(aoffset8 + 0); + ctemp58 = *(aoffset8 + 1); + ctemp59 = *(aoffset8 + 2); + ctemp60 = *(aoffset8 + 3); + ctemp61 = *(aoffset8 + 4); + ctemp62 = *(aoffset8 + 5); + ctemp63 = *(aoffset8 + 6); + ctemp64 = *(aoffset8 + 7); + aoffset8 += 8; + + *(boffset1 + 0) = -ctemp01; + *(boffset1 + 1) = -ctemp02; + *(boffset1 + 2) = -ctemp03; + *(boffset1 + 3) = -ctemp04; + *(boffset1 + 4) = -ctemp05; + *(boffset1 + 5) = -ctemp06; + *(boffset1 + 6) = -ctemp07; + *(boffset1 + 7) = -ctemp08; + + *(boffset1 + 8) = -ctemp09; + *(boffset1 + 9) = -ctemp10; + *(boffset1 + 10) = -ctemp11; + *(boffset1 + 11) = -ctemp12; + *(boffset1 + 12) = -ctemp13; + *(boffset1 + 13) = -ctemp14; + *(boffset1 + 14) = -ctemp15; + *(boffset1 + 15) = -ctemp16; + + *(boffset1 + 16) = -ctemp17; + *(boffset1 + 17) = -ctemp18; + *(boffset1 + 18) = -ctemp19; + *(boffset1 + 19) = -ctemp20; + *(boffset1 + 20) = -ctemp21; + *(boffset1 + 21) = -ctemp22; + *(boffset1 + 22) = -ctemp23; + *(boffset1 + 23) = -ctemp24; + + *(boffset1 + 24) = -ctemp25; + *(boffset1 + 25) = -ctemp26; + *(boffset1 + 26) = -ctemp27; + *(boffset1 + 27) = -ctemp28; + *(boffset1 + 28) = -ctemp29; + *(boffset1 + 29) = -ctemp30; + *(boffset1 + 30) = -ctemp31; + *(boffset1 + 31) = -ctemp32; + + *(boffset1 + 32) = -ctemp33; + *(boffset1 + 33) = -ctemp34; + *(boffset1 + 34) = -ctemp35; + *(boffset1 + 35) = -ctemp36; + *(boffset1 + 36) = -ctemp37; + *(boffset1 + 37) = -ctemp38; + *(boffset1 + 38) = -ctemp39; + *(boffset1 + 39) = -ctemp40; + + *(boffset1 + 40) = -ctemp41; + *(boffset1 + 41) = -ctemp42; + *(boffset1 + 42) = -ctemp43; + *(boffset1 + 43) = -ctemp44; + *(boffset1 + 44) = -ctemp45; + *(boffset1 + 45) = -ctemp46; + *(boffset1 + 46) = -ctemp47; + *(boffset1 + 47) = -ctemp48; + + *(boffset1 + 48) = -ctemp49; + *(boffset1 + 49) = -ctemp50; + *(boffset1 + 50) = -ctemp51; + *(boffset1 + 51) = -ctemp52; + *(boffset1 + 52) = -ctemp53; + *(boffset1 + 53) = -ctemp54; + *(boffset1 + 54) = -ctemp55; + *(boffset1 + 55) = -ctemp56; + + *(boffset1 + 56) = -ctemp57; + *(boffset1 + 57) = -ctemp58; + *(boffset1 + 58) = -ctemp59; + *(boffset1 + 59) = -ctemp60; + *(boffset1 + 60) = -ctemp61; + *(boffset1 + 61) = -ctemp62; + *(boffset1 + 62) = -ctemp63; + *(boffset1 + 63) = -ctemp64; + + boffset1 += m * 8; + i --; + }while(i > 0); + } + + if (n & 4){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + aoffset1 += 4; + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + aoffset2 += 4; + + ctemp09 = *(aoffset3 + 0); + ctemp10 = *(aoffset3 + 1); + ctemp11 = *(aoffset3 + 2); + ctemp12 = *(aoffset3 + 3); + aoffset3 += 4; + + ctemp13 = *(aoffset4 + 0); + ctemp14 = *(aoffset4 + 1); + ctemp15 = *(aoffset4 + 2); + ctemp16 = *(aoffset4 + 3); + aoffset4 += 4; + + ctemp17 = *(aoffset5 + 0); + ctemp18 = *(aoffset5 + 1); + ctemp19 = *(aoffset5 + 2); + ctemp20 = *(aoffset5 + 3); + aoffset5 += 4; + + ctemp21 = *(aoffset6 + 0); + ctemp22 = *(aoffset6 + 1); + ctemp23 = *(aoffset6 + 2); + ctemp24 = *(aoffset6 + 3); + aoffset6 += 4; + + ctemp25 = *(aoffset7 + 0); + ctemp26 = *(aoffset7 + 1); + ctemp27 = *(aoffset7 + 2); + ctemp28 = *(aoffset7 + 3); + aoffset7 += 4; + + ctemp29 = *(aoffset8 + 0); + ctemp30 = *(aoffset8 + 1); + ctemp31 = *(aoffset8 + 2); + ctemp32 = *(aoffset8 + 3); + aoffset8 += 4; + + *(boffset2 + 0) = -ctemp01; + *(boffset2 + 1) = -ctemp02; + *(boffset2 + 2) = -ctemp03; + *(boffset2 + 3) = -ctemp04; + *(boffset2 + 4) = -ctemp05; + *(boffset2 + 5) = -ctemp06; + *(boffset2 + 6) = -ctemp07; + *(boffset2 + 7) = -ctemp08; + *(boffset2 + 8) = -ctemp09; + *(boffset2 + 9) = -ctemp10; + *(boffset2 + 10) = -ctemp11; + *(boffset2 + 11) = -ctemp12; + *(boffset2 + 12) = -ctemp13; + *(boffset2 + 13) = -ctemp14; + *(boffset2 + 14) = -ctemp15; + *(boffset2 + 15) = -ctemp16; + + *(boffset2 + 16) = -ctemp17; + *(boffset2 + 17) = -ctemp18; + *(boffset2 + 18) = -ctemp19; + *(boffset2 + 19) = -ctemp20; + *(boffset2 + 20) = -ctemp21; + *(boffset2 + 21) = -ctemp22; + *(boffset2 + 22) = -ctemp23; + *(boffset2 + 23) = -ctemp24; + *(boffset2 + 24) = -ctemp25; + *(boffset2 + 25) = -ctemp26; + *(boffset2 + 26) = -ctemp27; + *(boffset2 + 27) = -ctemp28; + *(boffset2 + 28) = -ctemp29; + *(boffset2 + 29) = -ctemp30; + *(boffset2 + 30) = -ctemp31; + *(boffset2 + 31) = -ctemp32; + + boffset2 += 32; + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + aoffset1 += 2; + + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + aoffset2 += 2; + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + aoffset3 += 2; + + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + aoffset4 += 2; + + ctemp09 = *(aoffset5 + 0); + ctemp10 = *(aoffset5 + 1); + aoffset5 += 2; + + ctemp11 = *(aoffset6 + 0); + ctemp12 = *(aoffset6 + 1); + aoffset6 += 2; + + ctemp13 = *(aoffset7 + 0); + ctemp14 = *(aoffset7 + 1); + aoffset7 += 2; + + ctemp15 = *(aoffset8 + 0); + ctemp16 = *(aoffset8 + 1); + aoffset8 += 2; + + *(boffset3 + 0) = -ctemp01; + *(boffset3 + 1) = -ctemp02; + *(boffset3 + 2) = -ctemp03; + *(boffset3 + 3) = -ctemp04; + *(boffset3 + 4) = -ctemp05; + *(boffset3 + 5) = -ctemp06; + *(boffset3 + 6) = -ctemp07; + *(boffset3 + 7) = -ctemp08; + *(boffset3 + 8) = -ctemp09; + *(boffset3 + 9) = -ctemp10; + *(boffset3 + 10) = -ctemp11; + *(boffset3 + 11) = -ctemp12; + *(boffset3 + 12) = -ctemp13; + *(boffset3 + 13) = -ctemp14; + *(boffset3 + 14) = -ctemp15; + *(boffset3 + 15) = -ctemp16; + boffset3 += 16; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + ctemp02 = *(aoffset2 + 0); + aoffset2 ++; + ctemp03 = *(aoffset3 + 0); + aoffset3 ++; + ctemp04 = *(aoffset4 + 0); + aoffset4 ++; + ctemp05 = *(aoffset5 + 0); + aoffset5 ++; + ctemp06 = *(aoffset6 + 0); + aoffset6 ++; + ctemp07 = *(aoffset7 + 0); + aoffset7 ++; + ctemp08 = *(aoffset8 + 0); + aoffset8 ++; + + *(boffset4 + 0) = -ctemp01; + *(boffset4 + 1) = -ctemp02; + *(boffset4 + 2) = -ctemp03; + *(boffset4 + 3) = -ctemp04; + *(boffset4 + 4) = -ctemp05; + *(boffset4 + 5) = -ctemp06; + *(boffset4 + 6) = -ctemp07; + *(boffset4 + 7) = -ctemp08; + boffset4 += 8; + } + + j--; + }while(j > 0); + } + + if (m & 4){ + + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + boffset1 = boffset; + boffset += 32; + + i = (n >> 3); + if (i > 0){ + + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + aoffset1 += 8; + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + aoffset2 += 8; + + ctemp17 = *(aoffset3 + 0); + ctemp18 = *(aoffset3 + 1); + ctemp19 = *(aoffset3 + 2); + ctemp20 = *(aoffset3 + 3); + ctemp21 = *(aoffset3 + 4); + ctemp22 = *(aoffset3 + 5); + ctemp23 = *(aoffset3 + 6); + ctemp24 = *(aoffset3 + 7); + aoffset3 += 8; + + ctemp25 = *(aoffset4 + 0); + ctemp26 = *(aoffset4 + 1); + ctemp27 = *(aoffset4 + 2); + ctemp28 = *(aoffset4 + 3); + ctemp29 = *(aoffset4 + 4); + ctemp30 = *(aoffset4 + 5); + ctemp31 = *(aoffset4 + 6); + ctemp32 = *(aoffset4 + 7); + aoffset4 += 8; + + *(boffset1 + 0) = -ctemp01; + *(boffset1 + 1) = -ctemp02; + *(boffset1 + 2) = -ctemp03; + *(boffset1 + 3) = -ctemp04; + *(boffset1 + 4) = -ctemp05; + *(boffset1 + 5) = -ctemp06; + *(boffset1 + 6) = -ctemp07; + *(boffset1 + 7) = -ctemp08; + + *(boffset1 + 8) = -ctemp09; + *(boffset1 + 9) = -ctemp10; + *(boffset1 + 10) = -ctemp11; + *(boffset1 + 11) = -ctemp12; + *(boffset1 + 12) = -ctemp13; + *(boffset1 + 13) = -ctemp14; + *(boffset1 + 14) = -ctemp15; + *(boffset1 + 15) = -ctemp16; + + *(boffset1 + 16) = -ctemp17; + *(boffset1 + 17) = -ctemp18; + *(boffset1 + 18) = -ctemp19; + *(boffset1 + 19) = -ctemp20; + *(boffset1 + 20) = -ctemp21; + *(boffset1 + 21) = -ctemp22; + *(boffset1 + 22) = -ctemp23; + *(boffset1 + 23) = -ctemp24; + + *(boffset1 + 24) = -ctemp25; + *(boffset1 + 25) = -ctemp26; + *(boffset1 + 26) = -ctemp27; + *(boffset1 + 27) = -ctemp28; + *(boffset1 + 28) = -ctemp29; + *(boffset1 + 29) = -ctemp30; + *(boffset1 + 30) = -ctemp31; + *(boffset1 + 31) = -ctemp32; + + boffset1 += 8 * m; + i --; + }while(i > 0); + } + + if (n & 4) { + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + aoffset1 += 4; + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + aoffset2 += 4; + + ctemp09 = *(aoffset3 + 0); + ctemp10 = *(aoffset3 + 1); + ctemp11 = *(aoffset3 + 2); + ctemp12 = *(aoffset3 + 3); + aoffset3 += 4; + + ctemp13 = *(aoffset4 + 0); + ctemp14 = *(aoffset4 + 1); + ctemp15 = *(aoffset4 + 2); + ctemp16 = *(aoffset4 + 3); + aoffset4 += 4; + + *(boffset2 + 0) = -ctemp01; + *(boffset2 + 1) = -ctemp02; + *(boffset2 + 2) = -ctemp03; + *(boffset2 + 3) = -ctemp04; + *(boffset2 + 4) = -ctemp05; + *(boffset2 + 5) = -ctemp06; + *(boffset2 + 6) = -ctemp07; + *(boffset2 + 7) = -ctemp08; + + *(boffset2 + 8) = -ctemp09; + *(boffset2 + 9) = -ctemp10; + *(boffset2 + 10) = -ctemp11; + *(boffset2 + 11) = -ctemp12; + *(boffset2 + 12) = -ctemp13; + *(boffset2 + 13) = -ctemp14; + *(boffset2 + 14) = -ctemp15; + *(boffset2 + 15) = -ctemp16; + boffset2 += 16; + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + aoffset1 += 2; + + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + aoffset2 += 2; + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + aoffset3 += 2; + + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + aoffset4 += 2; + + *(boffset3 + 0) = -ctemp01; + *(boffset3 + 1) = -ctemp02; + *(boffset3 + 2) = -ctemp03; + *(boffset3 + 3) = -ctemp04; + *(boffset3 + 4) = -ctemp05; + *(boffset3 + 5) = -ctemp06; + *(boffset3 + 6) = -ctemp07; + *(boffset3 + 7) = -ctemp08; + boffset3 += 8; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + ctemp02 = *(aoffset2 + 0); + aoffset2 ++; + ctemp03 = *(aoffset3 + 0); + aoffset3 ++; + ctemp04 = *(aoffset4 + 0); + aoffset4 ++; + + *(boffset4 + 0) = -ctemp01; + *(boffset4 + 1) = -ctemp02; + *(boffset4 + 2) = -ctemp03; + *(boffset4 + 3) = -ctemp04; + boffset4 += 4; + } + } + + if (m & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + boffset1 = boffset; + boffset += 16; + + i = (n >> 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + aoffset1 += 8; + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + aoffset2 += 8; + + *(boffset1 + 0) = -ctemp01; + *(boffset1 + 1) = -ctemp02; + *(boffset1 + 2) = -ctemp03; + *(boffset1 + 3) = -ctemp04; + *(boffset1 + 4) = -ctemp05; + *(boffset1 + 5) = -ctemp06; + *(boffset1 + 6) = -ctemp07; + *(boffset1 + 7) = -ctemp08; + + *(boffset1 + 8) = -ctemp09; + *(boffset1 + 9) = -ctemp10; + *(boffset1 + 10) = -ctemp11; + *(boffset1 + 11) = -ctemp12; + *(boffset1 + 12) = -ctemp13; + *(boffset1 + 13) = -ctemp14; + *(boffset1 + 14) = -ctemp15; + *(boffset1 + 15) = -ctemp16; + + boffset1 += 8 * m; + i --; + }while(i > 0); + } + + if (n & 4){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + aoffset1 += 4; + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + aoffset2 += 4; + + *(boffset2 + 0) = -ctemp01; + *(boffset2 + 1) = -ctemp02; + *(boffset2 + 2) = -ctemp03; + *(boffset2 + 3) = -ctemp04; + *(boffset2 + 4) = -ctemp05; + *(boffset2 + 5) = -ctemp06; + *(boffset2 + 6) = -ctemp07; + *(boffset2 + 7) = -ctemp08; + boffset2 += 8; + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + aoffset1 += 2; + + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + aoffset2 += 2; + + *(boffset3 + 0) = -ctemp01; + *(boffset3 + 1) = -ctemp02; + *(boffset3 + 2) = -ctemp03; + *(boffset3 + 3) = -ctemp04; + boffset3 += 4; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + ctemp02 = *(aoffset2 + 0); + aoffset2 ++; + + *(boffset4 + 0) = -ctemp01; + *(boffset4 + 1) = -ctemp02; + boffset4 += 2; + } + } + + if (m & 1){ + aoffset1 = aoffset; + aoffset += lda; + + boffset1 = boffset; + boffset += 8; + + i = (n >> 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + aoffset1 += 8; + + *(boffset1 + 0) = -ctemp01; + *(boffset1 + 1) = -ctemp02; + *(boffset1 + 2) = -ctemp03; + *(boffset1 + 3) = -ctemp04; + *(boffset1 + 4) = -ctemp05; + *(boffset1 + 5) = -ctemp06; + *(boffset1 + 6) = -ctemp07; + *(boffset1 + 7) = -ctemp08; + + boffset1 += 8 * m; + i --; + }while(i > 0); + } + + if (n & 4){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + aoffset1 += 4; + + *(boffset2 + 0) = -ctemp01; + *(boffset2 + 1) = -ctemp02; + *(boffset2 + 2) = -ctemp03; + *(boffset2 + 3) = -ctemp04; + boffset2 += 4; + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + aoffset1 += 2; + + *(boffset3 + 0) = -ctemp01; + *(boffset3 + 1) = -ctemp02; + boffset3 += 2; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + *(boffset4 + 0) = -ctemp01; + boffset4 ++; + } + } + + return 0; +} diff --git a/kernel/generic/symm_lcopy_1.c b/kernel/generic/symm_lcopy_1.c new file mode 100644 index 0000000..7b6cfea --- /dev/null +++ b/kernel/generic/symm_lcopy_1.c @@ -0,0 +1,76 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01; + FLOAT *ao1; + + js = n; + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + + b[ 0] = data01; + + b += 1; + + offset --; + i --; + } + + posX += 1; + js --; + } + + return 0; +} diff --git a/kernel/generic/symm_lcopy_16.c b/kernel/generic/symm_lcopy_16.c new file mode 100644 index 0000000..2c8ad81 --- /dev/null +++ b/kernel/generic/symm_lcopy_16.c @@ -0,0 +1,273 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16; + + js = (n >> 4); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; + if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; + if (offset > -4) ao5 = a + posX + 4 + posY * lda; else ao5 = a + posY + (posX + 4) * lda; + if (offset > -5) ao6 = a + posX + 5 + posY * lda; else ao6 = a + posY + (posX + 5) * lda; + if (offset > -6) ao7 = a + posX + 6 + posY * lda; else ao7 = a + posY + (posX + 6) * lda; + if (offset > -7) ao8 = a + posX + 7 + posY * lda; else ao8 = a + posY + (posX + 7) * lda; + if (offset > -8) ao9 = a + posX + 8 + posY * lda; else ao9 = a + posY + (posX + 8) * lda; + if (offset > -9) ao10 = a + posX + 9 + posY * lda; else ao10 = a + posY + (posX + 9) * lda; + if (offset > -10) ao11 = a + posX + 10 + posY * lda; else ao11 = a + posY + (posX + 10) * lda; + if (offset > -11) ao12 = a + posX + 11 + posY * lda; else ao12 = a + posY + (posX + 11) * lda; + if (offset > -12) ao13 = a + posX + 12 + posY * lda; else ao13 = a + posY + (posX + 12) * lda; + if (offset > -13) ao14 = a + posX + 13 + posY * lda; else ao14 = a + posY + (posX + 13) * lda; + if (offset > -14) ao15 = a + posX + 14 + posY * lda; else ao15 = a + posY + (posX + 14) * lda; + if (offset > -15) ao16 = a + posX + 15 + posY * lda; else ao16 = a + posY + (posX + 15) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + data05 = *(ao5 + 0); + data06 = *(ao6 + 0); + data07 = *(ao7 + 0); + data08 = *(ao8 + 0); + data09 = *(ao9 + 0); + data10 = *(ao10 + 0); + data11 = *(ao11 + 0); + data12 = *(ao12 + 0); + data13 = *(ao13 + 0); + data14 = *(ao14 + 0); + data15 = *(ao15 + 0); + data16 = *(ao16 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + if (offset > -2) ao3 += lda; else ao3 ++; + if (offset > -3) ao4 += lda; else ao4 ++; + if (offset > -4) ao5 += lda; else ao5 ++; + if (offset > -5) ao6 += lda; else ao6 ++; + if (offset > -6) ao7 += lda; else ao7 ++; + if (offset > -7) ao8 += lda; else ao8 ++; + if (offset > -8) ao9 += lda; else ao9 ++; + if (offset > -9) ao10 += lda; else ao10 ++; + if (offset > -10) ao11 += lda; else ao11 ++; + if (offset > -11) ao12 += lda; else ao12 ++; + if (offset > -12) ao13 += lda; else ao13 ++; + if (offset > -13) ao14 += lda; else ao14 ++; + if (offset > -14) ao15 += lda; else ao15 ++; + if (offset > -15) ao16 += lda; else ao16 ++; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b += 16; + + offset --; + i --; + } + + posX += 16; + js --; + } + + if (n & 8) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; + if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; + if (offset > -4) ao5 = a + posX + 4 + posY * lda; else ao5 = a + posY + (posX + 4) * lda; + if (offset > -5) ao6 = a + posX + 5 + posY * lda; else ao6 = a + posY + (posX + 5) * lda; + if (offset > -6) ao7 = a + posX + 6 + posY * lda; else ao7 = a + posY + (posX + 6) * lda; + if (offset > -7) ao8 = a + posX + 7 + posY * lda; else ao8 = a + posY + (posX + 7) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + data05 = *(ao5 + 0); + data06 = *(ao6 + 0); + data07 = *(ao7 + 0); + data08 = *(ao8 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + if (offset > -2) ao3 += lda; else ao3 ++; + if (offset > -3) ao4 += lda; else ao4 ++; + if (offset > -4) ao5 += lda; else ao5 ++; + if (offset > -5) ao6 += lda; else ao6 ++; + if (offset > -6) ao7 += lda; else ao7 ++; + if (offset > -7) ao8 += lda; else ao8 ++; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 8; + } + + if (n & 4) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; + if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + if (offset > -2) ao3 += lda; else ao3 ++; + if (offset > -3) ao4 += lda; else ao4 ++; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/symm_lcopy_2.c b/kernel/generic/symm_lcopy_2.c new file mode 100644 index 0000000..e7944c4 --- /dev/null +++ b/kernel/generic/symm_lcopy_2.c @@ -0,0 +1,102 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02; + FLOAT *ao1, *ao2; + + js = (n >> 1); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + js --; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/symm_lcopy_4.c b/kernel/generic/symm_lcopy_4.c new file mode 100644 index 0000000..ac04943 --- /dev/null +++ b/kernel/generic/symm_lcopy_4.c @@ -0,0 +1,138 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; + if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + if (offset > -2) ao3 += lda; else ao3 ++; + if (offset > -3) ao4 += lda; else ao4 ++; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + js --; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/symm_lcopy_8.c b/kernel/generic/symm_lcopy_8.c new file mode 100644 index 0000000..c315574 --- /dev/null +++ b/kernel/generic/symm_lcopy_8.c @@ -0,0 +1,188 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + js = (n >> 3); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; + if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; + if (offset > -4) ao5 = a + posX + 4 + posY * lda; else ao5 = a + posY + (posX + 4) * lda; + if (offset > -5) ao6 = a + posX + 5 + posY * lda; else ao6 = a + posY + (posX + 5) * lda; + if (offset > -6) ao7 = a + posX + 6 + posY * lda; else ao7 = a + posY + (posX + 6) * lda; + if (offset > -7) ao8 = a + posX + 7 + posY * lda; else ao8 = a + posY + (posX + 7) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + data05 = *(ao5 + 0); + data06 = *(ao6 + 0); + data07 = *(ao7 + 0); + data08 = *(ao8 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + if (offset > -2) ao3 += lda; else ao3 ++; + if (offset > -3) ao4 += lda; else ao4 ++; + if (offset > -4) ao5 += lda; else ao5 ++; + if (offset > -5) ao6 += lda; else ao6 ++; + if (offset > -6) ao7 += lda; else ao7 ++; + if (offset > -7) ao8 += lda; else ao8 ++; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 8; + js --; + } + + if (n & 4) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; + if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + if (offset > -2) ao3 += lda; else ao3 ++; + if (offset > -3) ao4 += lda; else ao4 ++; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} + diff --git a/kernel/generic/symm_ucopy_1.c b/kernel/generic/symm_ucopy_1.c new file mode 100644 index 0000000..4ab9bb4 --- /dev/null +++ b/kernel/generic/symm_ucopy_1.c @@ -0,0 +1,76 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01; + FLOAT *ao1; + + js = n; + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + + posX ++; + js --; + } + + return 0; +} diff --git a/kernel/generic/symm_ucopy_16.c b/kernel/generic/symm_ucopy_16.c new file mode 100644 index 0000000..094810b --- /dev/null +++ b/kernel/generic/symm_ucopy_16.c @@ -0,0 +1,274 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16; + + js = (n >> 4); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; + if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; + if (offset > -4) ao5 = a + posY + (posX + 4) * lda; else ao5 = a + posX + 4 + posY * lda; + if (offset > -5) ao6 = a + posY + (posX + 5) * lda; else ao6 = a + posX + 5 + posY * lda; + if (offset > -6) ao7 = a + posY + (posX + 6) * lda; else ao7 = a + posX + 6 + posY * lda; + if (offset > -7) ao8 = a + posY + (posX + 7) * lda; else ao8 = a + posX + 7 + posY * lda; + if (offset > -8) ao9 = a + posY + (posX + 8) * lda; else ao9 = a + posX + 8 + posY * lda; + if (offset > -9) ao10 = a + posY + (posX + 9) * lda; else ao10 = a + posX + 9 + posY * lda; + if (offset > -10) ao11 = a + posY + (posX + 10) * lda; else ao11 = a + posX + 10 + posY * lda; + if (offset > -11) ao12 = a + posY + (posX + 11) * lda; else ao12 = a + posX + 11 + posY * lda; + if (offset > -12) ao13 = a + posY + (posX + 12) * lda; else ao13 = a + posX + 12 + posY * lda; + if (offset > -13) ao14 = a + posY + (posX + 13) * lda; else ao14 = a + posX + 13 + posY * lda; + if (offset > -14) ao15 = a + posY + (posX + 14) * lda; else ao15 = a + posX + 14 + posY * lda; + if (offset > -15) ao16 = a + posY + (posX + 15) * lda; else ao16 = a + posX + 15 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + data05 = *(ao5 + 0); + data06 = *(ao6 + 0); + data07 = *(ao7 + 0); + data08 = *(ao8 + 0); + data09 = *(ao9 + 0); + data10 = *(ao10 + 0); + data11 = *(ao11 + 0); + data12 = *(ao12 + 0); + data13 = *(ao13 + 0); + data14 = *(ao14 + 0); + data15 = *(ao15 + 0); + data16 = *(ao16 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + if (offset > -2) ao3 ++; else ao3 += lda; + if (offset > -3) ao4 ++; else ao4 += lda; + if (offset > -4) ao5 ++; else ao5 += lda; + if (offset > -5) ao6 ++; else ao6 += lda; + if (offset > -6) ao7 ++; else ao7 += lda; + if (offset > -7) ao8 ++; else ao8 += lda; + if (offset > -8) ao9 ++; else ao9 += lda; + if (offset > -9) ao10 ++; else ao10 += lda; + if (offset > -10) ao11 ++; else ao11 += lda; + if (offset > -11) ao12 ++; else ao12 += lda; + if (offset > -12) ao13 ++; else ao13 += lda; + if (offset > -13) ao14 ++; else ao14 += lda; + if (offset > -14) ao15 ++; else ao15 += lda; + if (offset > -15) ao16 ++; else ao16 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b += 16; + + offset --; + i --; + } + + posX += 16; + js --; + } + + if (n & 8) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; + if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; + if (offset > -4) ao5 = a + posY + (posX + 4) * lda; else ao5 = a + posX + 4 + posY * lda; + if (offset > -5) ao6 = a + posY + (posX + 5) * lda; else ao6 = a + posX + 5 + posY * lda; + if (offset > -6) ao7 = a + posY + (posX + 6) * lda; else ao7 = a + posX + 6 + posY * lda; + if (offset > -7) ao8 = a + posY + (posX + 7) * lda; else ao8 = a + posX + 7 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + data05 = *(ao5 + 0); + data06 = *(ao6 + 0); + data07 = *(ao7 + 0); + data08 = *(ao8 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + if (offset > -2) ao3 ++; else ao3 += lda; + if (offset > -3) ao4 ++; else ao4 += lda; + if (offset > -4) ao5 ++; else ao5 += lda; + if (offset > -5) ao6 ++; else ao6 += lda; + if (offset > -6) ao7 ++; else ao7 += lda; + if (offset > -7) ao8 ++; else ao8 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 8; + } + + if (n & 4) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; + if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + if (offset > -2) ao3 ++; else ao3 += lda; + if (offset > -3) ao4 ++; else ao4 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/symm_ucopy_2.c b/kernel/generic/symm_ucopy_2.c new file mode 100644 index 0000000..6396b74 --- /dev/null +++ b/kernel/generic/symm_ucopy_2.c @@ -0,0 +1,101 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02; + FLOAT *ao1, *ao2; + + js = (n >> 1); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + js --; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/symm_ucopy_4.c b/kernel/generic/symm_ucopy_4.c new file mode 100644 index 0000000..9b9cff8 --- /dev/null +++ b/kernel/generic/symm_ucopy_4.c @@ -0,0 +1,136 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; + if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + if (offset > -2) ao3 ++; else ao3 += lda; + if (offset > -3) ao4 ++; else ao4 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + js --; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/symm_ucopy_8.c b/kernel/generic/symm_ucopy_8.c new file mode 100644 index 0000000..411768b --- /dev/null +++ b/kernel/generic/symm_ucopy_8.c @@ -0,0 +1,188 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + js = (n >> 3); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; + if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; + if (offset > -4) ao5 = a + posY + (posX + 4) * lda; else ao5 = a + posX + 4 + posY * lda; + if (offset > -5) ao6 = a + posY + (posX + 5) * lda; else ao6 = a + posX + 5 + posY * lda; + if (offset > -6) ao7 = a + posY + (posX + 6) * lda; else ao7 = a + posX + 6 + posY * lda; + if (offset > -7) ao8 = a + posY + (posX + 7) * lda; else ao8 = a + posX + 7 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + data05 = *(ao5 + 0); + data06 = *(ao6 + 0); + data07 = *(ao7 + 0); + data08 = *(ao8 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + if (offset > -2) ao3 ++; else ao3 += lda; + if (offset > -3) ao4 ++; else ao4 += lda; + if (offset > -4) ao5 ++; else ao5 += lda; + if (offset > -5) ao6 ++; else ao6 += lda; + if (offset > -6) ao7 ++; else ao7 += lda; + if (offset > -7) ao8 ++; else ao8 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 8; + js --; + } + + if (n & 4) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; + if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + if (offset > -2) ao3 ++; else ao3 += lda; + if (offset > -3) ao4 ++; else ao4 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/symv_k.c b/kernel/generic/symv_k.c new file mode 100644 index 0000000..bd882fe --- /dev/null +++ b/kernel/generic/symv_k.c @@ -0,0 +1,123 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#include "symcopy.h" + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, + FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ + + BLASLONG is, min_i; + FLOAT *X = x; + FLOAT *Y = y; + FLOAT *symbuffer = buffer; + FLOAT *gemvbuffer = (FLOAT *)(((BLASLONG)buffer + SYMV_P * SYMV_P * sizeof(FLOAT) + 4095) & ~4095); + FLOAT *bufferY = gemvbuffer; + FLOAT *bufferX = gemvbuffer; + + if (incy != 1) { + Y = bufferY; + bufferX = (FLOAT *)(((BLASLONG)bufferY + m * sizeof(FLOAT) + 4095) & ~4095); + gemvbuffer = bufferX; + COPY_K(m, y, incy, Y, 1); + } + + if (incx != 1) { + X = bufferX; + gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + m * sizeof(FLOAT) + 4095) & ~4095); + COPY_K(m, x, incx, X, 1); + } + +#ifndef LOWER + for(is = m - offset; is < m; is += SYMV_P){ + min_i = MIN(m - is, SYMV_P); +#else + for(is = 0; is < offset; is += SYMV_P){ + min_i = MIN(offset - is, SYMV_P); +#endif + +#ifndef LOWER + if (is >0){ + GEMV_T(is, min_i, 0, alpha, + a + is * lda, lda, + X, 1, + Y + is, 1, gemvbuffer); + + GEMV_N(is, min_i, 0, alpha, + a + is * lda, lda, + X + is, 1, + Y, 1, gemvbuffer); + } +#endif + +#ifdef LOWER + SYMCOPY_L(min_i, a + is + is * lda, lda, symbuffer); +#else + SYMCOPY_U(min_i, a + is + is * lda, lda, symbuffer); +#endif + + GEMV_N(min_i, min_i, 0, alpha, + symbuffer, min_i, + X + is, 1, + Y + is, 1, gemvbuffer); + +#ifdef LOWER + if (m - is > min_i){ + GEMV_T(m - is - min_i, min_i, 0, alpha, + a + (is + min_i) + is * lda, lda, + X + (is + min_i), 1, + Y + is, 1, gemvbuffer); + + GEMV_N(m - is - min_i, min_i, 0, alpha, + a + (is + min_i) + is * lda, lda, + X + is, 1, + Y + (is + min_i), 1, gemvbuffer); + } +#endif + + } /* end of is */ + + if (incy != 1) { + COPY_K(m, Y, 1, y, incy); + } + + return 0; +} + diff --git a/kernel/generic/trmm_lncopy_1.c b/kernel/generic/trmm_lncopy_1.c new file mode 100644 index 0000000..66e407f --- /dev/null +++ b/kernel/generic/trmm_lncopy_1.c @@ -0,0 +1,92 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, X; + + FLOAT data01; + FLOAT *ao1; + + while (n > 0) { + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + + if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += 1; + b += 1; + } else + if (X < posY) { + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + b += 1; + ao1 += 1; + } + + X ++; + i --; + } while (i > 0); + } + + posY += 1; + n --; + } + + return 0; +} diff --git a/kernel/generic/trmm_lncopy_16.c b/kernel/generic/trmm_lncopy_16.c new file mode 100644 index 0000000..a183402 --- /dev/null +++ b/kernel/generic/trmm_lncopy_16.c @@ -0,0 +1,1543 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X, ii; + + FLOAT *a01, *a02, *a03 ,*a04, *a05, *a06, *a07, *a08; + FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + js = (n >> 4); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + a03 = a + posY + (posX + 2) * lda; + a04 = a + posY + (posX + 3) * lda; + a05 = a + posY + (posX + 4) * lda; + a06 = a + posY + (posX + 5) * lda; + a07 = a + posY + (posX + 6) * lda; + a08 = a + posY + (posX + 7) * lda; + a09 = a + posY + (posX + 8) * lda; + a10 = a + posY + (posX + 9) * lda; + a11 = a + posY + (posX + 10) * lda; + a12 = a + posY + (posX + 11) * lda; + a13 = a + posY + (posX + 12) * lda; + a14 = a + posY + (posX + 13) * lda; + a15 = a + posY + (posX + 14) * lda; + a16 = a + posY + (posX + 15) * lda; + } else { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + a03 = a + posX + (posY + 2) * lda; + a04 = a + posX + (posY + 3) * lda; + a05 = a + posX + (posY + 4) * lda; + a06 = a + posX + (posY + 5) * lda; + a07 = a + posX + (posY + 6) * lda; + a08 = a + posX + (posY + 7) * lda; + a09 = a + posX + (posY + 8) * lda; + a10 = a + posX + (posY + 9) * lda; + a11 = a + posX + (posY + 10) * lda; + a12 = a + posX + (posY + 11) * lda; + a13 = a + posX + (posY + 12) * lda; + a14 = a + posX + (posY + 13) * lda; + a15 = a + posX + (posY + 14) * lda; + a16 = a + posX + (posY + 15) * lda; + } + + i = (m >> 4); + if (i > 0) { + do { + if (X > posY) { + for (ii = 0; ii < 16; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + b[ 4] = *(a05 + 0); + b[ 5] = *(a06 + 0); + b[ 6] = *(a07 + 0); + b[ 7] = *(a08 + 0); + + b[ 8] = *(a09 + 0); + b[ 9] = *(a10 + 0); + b[ 10] = *(a11 + 0); + b[ 11] = *(a12 + 0); + b[ 12] = *(a13 + 0); + b[ 13] = *(a14 + 0); + b[ 14] = *(a15 + 0); + b[ 15] = *(a16 + 0); + + a01 ++; + a02 ++; + a03 ++; + a04 ++; + a05 ++; + a06 ++; + a07 ++; + a08 ++; + a09 ++; + a10 ++; + a11 ++; + a12 ++; + a13 ++; + a14 ++; + a15 ++; + a16 ++; + b += 16; + } + } else + if (X < posY) { + a01 += 16 * lda; + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + b += 256; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(a01 + 1); +#ifdef UNIT + b[ 17] = ONE; +#else + b[ 17] = *(a02 + 1); +#endif + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + + b[ 32] = *(a01 + 2); + b[ 33] = *(a02 + 2); +#ifdef UNIT + b[ 34] = ONE; +#else + b[ 34] = *(a03 + 2); +#endif + b[ 35] = ZERO; + b[ 36] = ZERO; + b[ 37] = ZERO; + b[ 38] = ZERO; + b[ 39] = ZERO; + b[ 40] = ZERO; + b[ 41] = ZERO; + b[ 42] = ZERO; + b[ 43] = ZERO; + b[ 44] = ZERO; + b[ 45] = ZERO; + b[ 46] = ZERO; + b[ 47] = ZERO; + + b[ 48] = *(a01 + 3); + b[ 49] = *(a02 + 3); + b[ 50] = *(a03 + 3); +#ifdef UNIT + b[ 51] = ONE; +#else + b[ 51] = *(a04 + 3); +#endif + b[ 52] = ZERO; + b[ 53] = ZERO; + b[ 54] = ZERO; + b[ 55] = ZERO; + b[ 56] = ZERO; + b[ 57] = ZERO; + b[ 58] = ZERO; + b[ 59] = ZERO; + b[ 60] = ZERO; + b[ 61] = ZERO; + b[ 62] = ZERO; + b[ 63] = ZERO; + + b[ 64] = *(a01 + 4); + b[ 65] = *(a02 + 4); + b[ 66] = *(a03 + 4); + b[ 67] = *(a04 + 4); +#ifdef UNIT + b[ 68] = ONE; +#else + b[ 68] = *(a05 + 4); +#endif + b[ 69] = ZERO; + b[ 70] = ZERO; + b[ 71] = ZERO; + b[ 72] = ZERO; + b[ 73] = ZERO; + b[ 74] = ZERO; + b[ 75] = ZERO; + b[ 76] = ZERO; + b[ 77] = ZERO; + b[ 78] = ZERO; + b[ 79] = ZERO; + + b[ 80] = *(a01 + 5); + b[ 81] = *(a02 + 5); + b[ 82] = *(a03 + 5); + b[ 83] = *(a04 + 5); + b[ 84] = *(a05 + 5); +#ifdef UNIT + b[ 85] = ONE; +#else + b[ 85] = *(a06 + 5); +#endif + b[ 86] = ZERO; + b[ 87] = ZERO; + b[ 88] = ZERO; + b[ 89] = ZERO; + b[ 90] = ZERO; + b[ 91] = ZERO; + b[ 92] = ZERO; + b[ 93] = ZERO; + b[ 94] = ZERO; + b[ 95] = ZERO; + + b[ 96] = *(a01 + 6); + b[ 97] = *(a02 + 6); + b[ 98] = *(a03 + 6); + b[ 99] = *(a04 + 6); + b[100] = *(a05 + 6); + b[101] = *(a06 + 6); +#ifdef UNIT + b[102] = ONE; +#else + b[102] = *(a07 + 6); +#endif + b[103] = ZERO; + b[104] = ZERO; + b[105] = ZERO; + b[106] = ZERO; + b[107] = ZERO; + b[108] = ZERO; + b[109] = ZERO; + b[110] = ZERO; + b[111] = ZERO; + + b[112] = *(a01 + 7); + b[113] = *(a02 + 7); + b[114] = *(a03 + 7); + b[115] = *(a04 + 7); + b[116] = *(a05 + 7); + b[117] = *(a06 + 7); + b[118] = *(a07 + 7); +#ifdef UNIT + b[119] = ONE; +#else + b[119] = *(a08 + 7); +#endif + b[120] = ZERO; + b[121] = ZERO; + b[122] = ZERO; + b[123] = ZERO; + b[124] = ZERO; + b[125] = ZERO; + b[126] = ZERO; + b[127] = ZERO; + + b[128] = *(a01 + 8); + b[129] = *(a02 + 8); + b[130] = *(a03 + 8); + b[131] = *(a04 + 8); + b[132] = *(a05 + 8); + b[133] = *(a06 + 8); + b[134] = *(a07 + 8); + b[135] = *(a08 + 8); +#ifdef UNIT + b[136] = ONE; +#else + b[136] = *(a09 + 8); +#endif + b[137] = ZERO; + b[138] = ZERO; + b[139] = ZERO; + b[140] = ZERO; + b[141] = ZERO; + b[142] = ZERO; + b[143] = ZERO; + + b[144] = *(a01 + 9); + b[145] = *(a02 + 9); + b[146] = *(a03 + 9); + b[147] = *(a04 + 9); + b[148] = *(a05 + 9); + b[149] = *(a06 + 9); + b[150] = *(a07 + 9); + b[151] = *(a08 + 9); + b[152] = *(a09 + 9); +#ifdef UNIT + b[153] = ONE; +#else + b[153] = *(a10 + 9); +#endif + b[154] = ZERO; + b[155] = ZERO; + b[156] = ZERO; + b[157] = ZERO; + b[158] = ZERO; + b[159] = ZERO; + + b[160] = *(a01 + 10); + b[161] = *(a02 + 10); + b[162] = *(a03 + 10); + b[163] = *(a04 + 10); + b[164] = *(a05 + 10); + b[165] = *(a06 + 10); + b[166] = *(a07 + 10); + b[167] = *(a08 + 10); + b[168] = *(a09 + 10); + b[169] = *(a10 + 10); +#ifdef UNIT + b[170] = ONE; +#else + b[170] = *(a11 + 10); +#endif + b[171] = ZERO; + b[172] = ZERO; + b[173] = ZERO; + b[174] = ZERO; + b[175] = ZERO; + + b[176] = *(a01 + 11); + b[177] = *(a02 + 11); + b[178] = *(a03 + 11); + b[179] = *(a04 + 11); + b[180] = *(a05 + 11); + b[181] = *(a06 + 11); + b[182] = *(a07 + 11); + b[183] = *(a08 + 11); + b[184] = *(a09 + 11); + b[185] = *(a10 + 11); + b[186] = *(a11 + 11); +#ifdef UNIT + b[187] = ONE; +#else + b[187] = *(a12 + 11); +#endif + b[188] = ZERO; + b[189] = ZERO; + b[190] = ZERO; + b[191] = ZERO; + + b[192] = *(a01 + 12); + b[193] = *(a02 + 12); + b[194] = *(a03 + 12); + b[195] = *(a04 + 12); + b[196] = *(a05 + 12); + b[197] = *(a06 + 12); + b[198] = *(a07 + 12); + b[199] = *(a08 + 12); + b[200] = *(a09 + 12); + b[201] = *(a10 + 12); + b[202] = *(a11 + 12); + b[203] = *(a12 + 12); +#ifdef UNIT + b[204] = ONE; +#else + b[204] = *(a13 + 12); +#endif + b[205] = ZERO; + b[206] = ZERO; + b[207] = ZERO; + + b[208] = *(a01 + 13); + b[209] = *(a02 + 13); + b[210] = *(a03 + 13); + b[211] = *(a04 + 13); + b[212] = *(a05 + 13); + b[213] = *(a06 + 13); + b[214] = *(a07 + 13); + b[215] = *(a08 + 13); + b[216] = *(a09 + 13); + b[217] = *(a10 + 13); + b[218] = *(a11 + 13); + b[219] = *(a12 + 13); + b[220] = *(a13 + 13); +#ifdef UNIT + b[221] = ONE; +#else + b[221] = *(a14 + 13); +#endif + b[222] = ZERO; + b[223] = ZERO; + + b[224] = *(a01 + 14); + b[225] = *(a02 + 14); + b[226] = *(a03 + 14); + b[227] = *(a04 + 14); + b[228] = *(a05 + 14); + b[229] = *(a06 + 14); + b[230] = *(a07 + 14); + b[231] = *(a08 + 14); + b[232] = *(a09 + 14); + b[233] = *(a10 + 14); + b[234] = *(a11 + 14); + b[235] = *(a12 + 14); + b[236] = *(a13 + 14); + b[237] = *(a14 + 14); +#ifdef UNIT + b[238] = ONE; +#else + b[238] = *(a15 + 14); +#endif + b[239] = ZERO; + + b[240] = *(a01 + 15); + b[241] = *(a02 + 15); + b[242] = *(a03 + 15); + b[243] = *(a04 + 15); + b[244] = *(a05 + 15); + b[245] = *(a06 + 15); + b[246] = *(a07 + 15); + b[247] = *(a08 + 15); + b[248] = *(a09 + 15); + b[249] = *(a10 + 15); + b[250] = *(a11 + 15); + b[251] = *(a12 + 15); + b[252] = *(a13 + 15); + b[253] = *(a14 + 15); + b[254] = *(a15 + 15); +#ifdef UNIT + b[255] = ONE; +#else + b[255] = *(a16 + 15); +#endif + + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + a09 += 16; + a10 += 16; + a11 += 16; + a12 += 16; + a13 += 16; + a14 += 16; + a15 += 16; + a16 += 16; + b += 256; + + } + + X += 16; + i --; + } while (i > 0); + } + + i = (m & 15); + if (i) { + + if (X > posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + b[ 4] = *(a05 + 0); + b[ 5] = *(a06 + 0); + b[ 6] = *(a07 + 0); + b[ 7] = *(a08 + 0); + + b[ 8] = *(a09 + 0); + b[ 9] = *(a10 + 0); + b[ 10] = *(a11 + 0); + b[ 11] = *(a12 + 0); + b[ 12] = *(a13 + 0); + b[ 13] = *(a14 + 0); + b[ 14] = *(a15 + 0); + b[ 15] = *(a16 + 0); + + a01 ++; + a02 ++; + a03 ++; + a04 ++; + a05 ++; + a06 ++; + a07 ++; + a08 ++; + a09 ++; + a10 ++; + a11 ++; + a12 ++; + a13 ++; + a14 ++; + a15 ++; + a16 ++; + b += 16; + } + } else + if (X < posY) { + a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + a08 += i * lda; + a09 += i * lda; + a10 += i * lda; + a11 += i * lda; + a12 += i * lda; + a13 += i * lda; + a14 += i * lda; + a15 += i * lda; + a16 += i * lda; + b += 16 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + + if (i >= 2) { + b[ 0] = *(a01 + 1); +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(a02 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + } + + if (i >= 3) { + b[ 0] = *(a01 + 2); + b[ 1] = *(a02 + 2); +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(a03 + 2); +#endif + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + } + + if (i >= 4) { + b[ 0] = *(a01 + 3); + b[ 1] = *(a02 + 3); + b[ 2] = *(a03 + 3); +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(a04 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 11] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + } + + if (i >= 5) { + b[ 0] = *(a01 + 4); + b[ 1] = *(a02 + 4); + b[ 2] = *(a03 + 4); + b[ 3] = *(a04 + 4); +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = *(a05 + 4); +#endif + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + } + + if (i >= 6) { + b[ 0] = *(a01 + 5); + b[ 1] = *(a02 + 5); + b[ 2] = *(a03 + 5); + b[ 3] = *(a04 + 5); + b[ 4] = *(a05 + 5); +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = *(a06 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + } + + if (i >= 7) { + b[ 0] = *(a01 + 6); + b[ 1] = *(a02 + 6); + b[ 2] = *(a03 + 6); + b[ 3] = *(a04 + 6); + b[ 4] = *(a05 + 6); + b[ 5] = *(a06 + 6); +#ifdef UNIT + b[ 6] = ONE; +#else + b[ 6] = *(a07 + 6); +#endif + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + } + + if (i >= 8) { + b[ 0] = *(a01 + 7); + b[ 1] = *(a02 + 7); + b[ 2] = *(a03 + 7); + b[ 3] = *(a04 + 7); + b[ 4] = *(a05 + 7); + b[ 5] = *(a06 + 7); + b[ 6] = *(a07 + 7); +#ifdef UNIT + b[ 7] = ONE; +#else + b[ 7] = *(a08 + 7); +#endif + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + } + + if (i >= 9) { + b[ 0] = *(a01 + 8); + b[ 1] = *(a02 + 8); + b[ 2] = *(a03 + 8); + b[ 3] = *(a04 + 8); + b[ 4] = *(a05 + 8); + b[ 5] = *(a06 + 8); + b[ 6] = *(a07 + 8); + b[ 7] = *(a08 + 8); +#ifdef UNIT + b[ 8] = ONE; +#else + b[ 8] = *(a09 + 8); +#endif + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + } + + if (i >= 10) { + b[ 0] = *(a01 + 9); + b[ 1] = *(a02 + 9); + b[ 2] = *(a03 + 9); + b[ 3] = *(a04 + 9); + b[ 4] = *(a05 + 9); + b[ 5] = *(a06 + 9); + b[ 6] = *(a07 + 9); + b[ 7] = *(a08 + 9); + b[ 8] = *(a09 + 9); +#ifdef UNIT + b[ 9] = ONE; +#else + b[ 9] = *(a10 + 9); +#endif + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + } + + if (i >= 11) { + b[ 0] = *(a01 + 10); + b[ 1] = *(a02 + 10); + b[ 2] = *(a03 + 10); + b[ 3] = *(a04 + 10); + b[ 4] = *(a05 + 10); + b[ 5] = *(a06 + 10); + b[ 6] = *(a07 + 10); + b[ 7] = *(a08 + 10); + b[ 8] = *(a09 + 10); + b[ 9] = *(a10 + 10); +#ifdef UNIT + b[ 10] = ONE; +#else + b[ 10] = *(a11 + 10); +#endif + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + } + + if (i >= 12) { + b[ 0] = *(a01 + 11); + b[ 1] = *(a02 + 11); + b[ 2] = *(a03 + 11); + b[ 3] = *(a04 + 11); + b[ 4] = *(a05 + 11); + b[ 5] = *(a06 + 11); + b[ 6] = *(a07 + 11); + b[ 7] = *(a08 + 11); + b[ 8] = *(a09 + 11); + b[ 9] = *(a10 + 11); + b[ 10] = *(a11 + 11); +#ifdef UNIT + b[ 11] = ONE; +#else + b[ 11] = *(a12 + 11); +#endif + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + } + + if (i >= 13) { + b[ 0] = *(a01 + 12); + b[ 1] = *(a02 + 12); + b[ 2] = *(a03 + 12); + b[ 3] = *(a04 + 12); + b[ 4] = *(a05 + 12); + b[ 5] = *(a06 + 12); + b[ 6] = *(a07 + 12); + b[ 7] = *(a08 + 12); + b[ 8] = *(a09 + 12); + b[ 9] = *(a10 + 12); + b[ 10] = *(a11 + 12); + b[ 11] = *(a12 + 12); +#ifdef UNIT + b[ 12] = ONE; +#else + b[ 12] = *(a13 + 12); +#endif + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + } + + if (i >= 14) { + b[ 0] = *(a01 + 13); + b[ 1] = *(a02 + 13); + b[ 2] = *(a03 + 13); + b[ 3] = *(a04 + 13); + b[ 4] = *(a05 + 13); + b[ 5] = *(a06 + 13); + b[ 6] = *(a07 + 13); + b[ 7] = *(a08 + 13); + b[ 8] = *(a09 + 13); + b[ 9] = *(a10 + 13); + b[ 10] = *(a11 + 13); + b[ 11] = *(a12 + 13); + b[ 12] = *(a13 + 13); +#ifdef UNIT + b[ 13] = ONE; +#else + b[ 13] = *(a14 + 13); +#endif + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + } + + if (i >= 15) { + b[ 0] = *(a01 + 14); + b[ 1] = *(a02 + 14); + b[ 2] = *(a03 + 14); + b[ 3] = *(a04 + 14); + b[ 4] = *(a05 + 14); + b[ 5] = *(a06 + 14); + b[ 6] = *(a07 + 14); + b[ 7] = *(a08 + 14); + b[ 8] = *(a09 + 14); + b[ 9] = *(a10 + 14); + b[ 10] = *(a11 + 14); + b[ 11] = *(a12 + 14); + b[ 12] = *(a13 + 14); + b[ 13] = *(a14 + 14); +#ifdef UNIT + b[ 14] = ONE; +#else + b[ 14] = *(a15 + 14); +#endif + b[ 15] = ZERO; + b += 16; + } + } + } + + posY += 16; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 8){ + X = posX; + + if (posX <= posY) { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + a03 = a + posY + (posX + 2) * lda; + a04 = a + posY + (posX + 3) * lda; + a05 = a + posY + (posX + 4) * lda; + a06 = a + posY + (posX + 5) * lda; + a07 = a + posY + (posX + 6) * lda; + a08 = a + posY + (posX + 7) * lda; + } else { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + a03 = a + posX + (posY + 2) * lda; + a04 = a + posX + (posY + 3) * lda; + a05 = a + posX + (posY + 4) * lda; + a06 = a + posX + (posY + 5) * lda; + a07 = a + posX + (posY + 6) * lda; + a08 = a + posX + (posY + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X > posY) { + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + b[ 4] = *(a05 + 0); + b[ 5] = *(a06 + 0); + b[ 6] = *(a07 + 0); + b[ 7] = *(a08 + 0); + + a01 ++; + a02 ++; + a03 ++; + a04 ++; + a05 ++; + a06 ++; + a07 ++; + a08 ++; + b += 8; + } + } else + if (X < posY) { + a01 += 8 * lda; + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + b += 64; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = *(a01 + 1); +#ifdef UNIT + b[ 9] = ONE; +#else + b[ 9] = *(a02 + 1); +#endif + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(a01 + 2); + b[ 17] = *(a02 + 2); +#ifdef UNIT + b[ 18] = ONE; +#else + b[ 18] = *(a03 + 2); +#endif + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + + b[ 24] = *(a01 + 3); + b[ 25] = *(a02 + 3); + b[ 26] = *(a03 + 3); +#ifdef UNIT + b[ 27] = ONE; +#else + b[ 27] = *(a04 + 3); +#endif + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + + b[ 32] = *(a01 + 4); + b[ 33] = *(a02 + 4); + b[ 34] = *(a03 + 4); + b[ 35] = *(a04 + 4); +#ifdef UNIT + b[ 36] = ONE; +#else + b[ 36] = *(a05 + 4); +#endif + b[ 37] = ZERO; + b[ 38] = ZERO; + b[ 39] = ZERO; + + b[ 40] = *(a01 + 5); + b[ 41] = *(a02 + 5); + b[ 42] = *(a03 + 5); + b[ 43] = *(a04 + 5); + b[ 44] = *(a05 + 5); +#ifdef UNIT + b[ 45] = ONE; +#else + b[ 45] = *(a06 + 5); +#endif + b[ 46] = ZERO; + b[ 47] = ZERO; + + b[ 48] = *(a01 + 6); + b[ 49] = *(a02 + 6); + b[ 50] = *(a03 + 6); + b[ 51] = *(a04 + 6); + b[ 52] = *(a05 + 6); + b[ 53] = *(a06 + 6); +#ifdef UNIT + b[ 54] = ONE; +#else + b[ 54] = *(a07 + 6); +#endif + b[ 55] = ZERO; + + b[ 56] = *(a01 + 7); + b[ 57] = *(a02 + 7); + b[ 58] = *(a03 + 7); + b[ 59] = *(a04 + 7); + b[ 60] = *(a05 + 7); + b[ 61] = *(a06 + 7); + b[ 62] = *(a07 + 7); +#ifdef UNIT + b[ 63] = ONE; +#else + b[ 63] = *(a08 + 7); +#endif + + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + a05 += 8; + a06 += 8; + a07 += 8; + a08 += 8; + b += 64; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X > posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + b[ 4] = *(a05 + 0); + b[ 5] = *(a06 + 0); + b[ 6] = *(a07 + 0); + b[ 7] = *(a08 + 0); + + a01 ++; + a02 ++; + a03 ++; + a04 ++; + a05 ++; + a06 ++; + a07 ++; + a08 ++; + b += 8; + } + } else + if (X < posY) { + a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + a08 += i * lda; + b += 8 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + + if (i >= 2) { + b[ 0] = *(a01 + 1); +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(a02 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 3) { + b[ 0] = *(a01 + 2); + b[ 1] = *(a02 + 2); +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(a03 + 2); +#endif + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 4) { + b[ 0] = *(a01 + 3); + b[ 1] = *(a02 + 3); + b[ 2] = *(a03 + 3); +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(a04 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 5) { + b[ 0] = *(a01 + 4); + b[ 1] = *(a02 + 4); + b[ 2] = *(a03 + 4); + b[ 3] = *(a04 + 4); +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = *(a05 + 4); +#endif + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 6) { + b[ 0] = *(a01 + 5); + b[ 1] = *(a02 + 5); + b[ 2] = *(a03 + 5); + b[ 3] = *(a04 + 5); + b[ 4] = *(a05 + 5); +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = *(a06 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 7) { + b[ 0] = *(a01 + 6); + b[ 1] = *(a02 + 6); + b[ 2] = *(a03 + 6); + b[ 3] = *(a04 + 6); + b[ 4] = *(a05 + 6); + b[ 5] = *(a06 + 6); +#ifdef UNIT + b[ 6] = ONE; +#else + b[ 6] = *(a07 + 6); +#endif + b[ 7] = ZERO; + b += 8; + } + } + } + + posY += 8; + } + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + a03 = a + posY + (posX + 2) * lda; + a04 = a + posY + (posX + 3) * lda; + } else { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + a03 = a + posX + (posY + 2) * lda; + a04 = a + posX + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + for (ii = 0; ii < 4; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + + a01 ++; + a02 ++; + a03 ++; + a04 ++; + b += 4; + } + } else + if (X < posY) { + a01 += 4 * lda; + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + b += 16; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = *(a01 + 1); +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = *(a02 + 1); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = *(a01 + 2); + b[ 9] = *(a02 + 2); +#ifdef UNIT + b[ 10] = ONE; +#else + b[ 10] = *(a03 + 2); +#endif + b[ 11] = ZERO; + + b[ 12] = *(a01 + 3); + b[ 13] = *(a02 + 3); + b[ 14] = *(a03 + 3); +#ifdef UNIT + b[ 15] = ONE; +#else + b[ 15] = *(a04 + 3); +#endif + + a01 += 4; + a02 += 4; + a03 += 4; + a04 += 4; + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X > posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + + a01 ++; + a02 ++; + a03 ++; + a04 ++; + b += 4; + } + } else + if (X < posY) { + a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + b += 4 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + + if (i >= 2) { + b[ 0] = *(a01 + 1); +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(a02 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + + if (i >= 3) { + b[ 0] = *(a01 + 2); + b[ 1] = *(a02 + 2); +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(a03 + 2); +#endif + b[ 3] = ZERO; + b += 4; + } + } + } + + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + } else { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + b[ 2] = *(a01 + 1); + b[ 3] = *(a02 + 1); + a01 += 2; + a02 += 2; + b += 4; + } else + if (X < posY) { + a01 += 2 * lda; + a02 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + + b[ 2] = *(a01 + 1); +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(a02 + 1); +#endif + + a01 += 2; + a02 += 2; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + + a01 ++; + a02 ++; + b += 2; + } else + if (X < posY) { + a01 += lda; + a02 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + a01 = a + posY + (posX + 0) * lda; + } else { + a01 = a + posX + (posY + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X > posY) { + b[ 0] = *(a01 + 0); + a01 += 1; + b += 1; + } else + if (X < posY) { + a01 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b += 1; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/trmm_lncopy_2.c b/kernel/generic/trmm_lncopy_2.c new file mode 100644 index 0000000..f7fefaa --- /dev/null +++ b/kernel/generic/trmm_lncopy_2.c @@ -0,0 +1,198 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2; + + js = (n >> 1); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data03; + b[ 2] = data02; + b[ 3] = data04; + + ao1 += 2; + ao2 += 2; + b += 4; + } else + if (X < posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data02; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data04 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data02; + b[ 3] = data04; +#endif + ao1 += 2; + ao2 += 2; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X > posY) { + data01 = *(ao1 + 0); + data03 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data03; + + ao1 += 1; + ao2 += 1; + b += 2; + } else + if (X < posY) { + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + data03 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = data03; +#else + data01 = *(ao1 + 0); + data03 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data03; +#endif + ao1 += 1; + ao2 += 1; + b += 2; + } + } + + posY += 2; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + + if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += 1; + b += 1; + } else + if (X < posY) { + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + b += 1; + ao1 += 1; + } + + X ++; + i --; + } while (i > 0); + } + + posY += 1; + } + + return 0; +} diff --git a/kernel/generic/trmm_lncopy_4.c b/kernel/generic/trmm_lncopy_4.c new file mode 100644 index 0000000..6cd1667 --- /dev/null +++ b/kernel/generic/trmm_lncopy_4.c @@ -0,0 +1,484 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b[ 4] = data02; + b[ 5] = data06; + b[ 6] = data10; + b[ 7] = data14; + + b[ 8] = data03; + b[ 9] = data07; + b[10] = data11; + b[11] = data15; + b[12] = data04; + b[13] = data08; + b[14] = data12; + b[15] = data16; + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + + } else + if (X < posY) { + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 16; + + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data12 = *(ao3 + 3); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data02; + b[ 5] = ONE; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data03; + b[ 9] = data07; + b[10] = ONE; + b[11] = ZERO; + b[12] = data04; + b[13] = data08; + b[14] = data12; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data02; + b[ 5] = data06; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data03; + b[ 9] = data07; + b[10] = data11; + b[11] = ZERO; + b[12] = data04; + b[13] = data08; + b[14] = data12; + b[15] = data16; +#endif + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X > posY) { + + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + b[ 0] = data01; + b[ 1] = data03; + b[ 2] = data05; + b[ 3] = data07; + b[ 4] = data02; + b[ 5] = data04; + b[ 6] = data06; + b[ 7] = data08; + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + ao1 += 1; + ao2 += 1; + ao3 += 1; + ao4 += 1; + b += 4; + } + + } else + if (X < posY) { + if (m & 2) { + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 8; + } + + if (m & 1) { + ao1 += lda; + b += 4; + } + + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data13 = *(ao4 + 0); + + if (i >= 2) { + data10 = *(ao3 + 1); + data14 = *(ao4 + 1); + } + + if (i >= 3) { + data15 = *(ao4 + 2); + } + + b[ 0] = ONE; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ONE; + b[ 2] = data10; + b[ 3] = data14; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ONE; + b[ 3] = data15; + b += 4; + } +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data13 = *(ao4 + 0); + + if (i >= 2) { + data06 = *(ao2 + 1); + data10 = *(ao3 + 1); + data14 = *(ao4 + 1); + } + + if (i >= 3) { + data11 = *(ao3 + 2); + data15 = *(ao4 + 2); + } + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = data06; + b[ 2] = data10; + b[ 3] = data14; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = data11; + b[ 3] = data15; + b += 4; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data02; + b[ 3] = data06; + + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X < posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data02; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data02; + b[ 3] = data06; +#endif + ao1 += 2; + ao2 += 2; + + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + b[ 0] = data01; + b[ 1] = data02; + + ao1 += 1; + ao2 += 1; + b += 2; + } else + if (X < posY) { + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = data05; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data05; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + b += 1; + ao1 += 1; + } else + if (X < posY) { + b += 1; + ao1 += lda; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + b += 1; + ao1 += 1; + } + + X ++; + i --; + } while (i > 0); + } + + posY += 1; + } + + return 0; +} diff --git a/kernel/generic/trmm_lncopy_8.c b/kernel/generic/trmm_lncopy_8.c new file mode 100644 index 0000000..4a1964b --- /dev/null +++ b/kernel/generic/trmm_lncopy_8.c @@ -0,0 +1,1227 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + FLOAT data33, data34, data35, data36, data37, data38, data39, data40; + FLOAT data41, data42, data43, data44, data45, data46, data47, data48; + FLOAT data49, data50, data51, data52, data53, data54, data55, data56; + FLOAT data57, data58, data59, data60, data61, data62, data63, data64; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + js = (n >> 3); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + ao5 = a + posY + (posX + 4) * lda; + ao6 = a + posY + (posX + 5) * lda; + ao7 = a + posY + (posX + 6) * lda; + ao8 = a + posY + (posX + 7) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + ao5 = a + posX + (posY + 4) * lda; + ao6 = a + posX + (posY + 5) * lda; + ao7 = a + posX + (posY + 6) * lda; + ao8 = a + posX + (posY + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + + data33 = *(ao5 + 0); + data34 = *(ao5 + 1); + data35 = *(ao5 + 2); + data36 = *(ao5 + 3); + data37 = *(ao5 + 4); + data38 = *(ao5 + 5); + data39 = *(ao5 + 6); + data40 = *(ao5 + 7); + + data41 = *(ao6 + 0); + data42 = *(ao6 + 1); + data43 = *(ao6 + 2); + data44 = *(ao6 + 3); + data45 = *(ao6 + 4); + data46 = *(ao6 + 5); + data47 = *(ao6 + 6); + data48 = *(ao6 + 7); + + data49 = *(ao7 + 0); + data50 = *(ao7 + 1); + data51 = *(ao7 + 2); + data52 = *(ao7 + 3); + data53 = *(ao7 + 4); + data54 = *(ao7 + 5); + data55 = *(ao7 + 6); + data56 = *(ao7 + 7); + + data57 = *(ao8 + 0); + data58 = *(ao8 + 1); + data59 = *(ao8 + 2); + data60 = *(ao8 + 3); + data61 = *(ao8 + 4); + data62 = *(ao8 + 5); + data63 = *(ao8 + 6); + data64 = *(ao8 + 7); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + b[ 4] = data33; + b[ 5] = data41; + b[ 6] = data49; + b[ 7] = data57; + + b[ 8] = data02; + b[ 9] = data10; + b[10] = data18; + b[11] = data26; + b[12] = data34; + b[13] = data42; + b[14] = data50; + b[15] = data58; + + b[16] = data03; + b[17] = data11; + b[18] = data19; + b[19] = data27; + b[20] = data35; + b[21] = data43; + b[22] = data51; + b[23] = data59; + + b[24] = data04; + b[25] = data12; + b[26] = data20; + b[27] = data28; + b[28] = data36; + b[29] = data44; + b[30] = data52; + b[31] = data60; + + b[32] = data05; + b[33] = data13; + b[34] = data21; + b[35] = data29; + b[36] = data37; + b[37] = data45; + b[38] = data53; + b[39] = data61; + + b[40] = data06; + b[41] = data14; + b[42] = data22; + b[43] = data30; + b[44] = data38; + b[45] = data46; + b[46] = data54; + b[47] = data62; + + b[48] = data07; + b[49] = data15; + b[50] = data23; + b[51] = data31; + b[52] = data39; + b[53] = data47; + b[54] = data55; + b[55] = data63; + + b[56] = data08; + b[57] = data16; + b[58] = data24; + b[59] = data32; + b[60] = data40; + b[61] = data48; + b[62] = data56; + b[63] = data64; + + ao1 += 8; + ao2 += 8; + ao3 += 8; + ao4 += 8; + ao5 += 8; + ao6 += 8; + ao7 += 8; + ao8 += 8; + + b += 64; + + } else + if (X < posY) { + ao1 += 8 * lda; + ao2 += 8 * lda; + ao3 += 8 * lda; + ao4 += 8 * lda; + ao5 += 8 * lda; + ao6 += 8 * lda; + ao7 += 8 * lda; + ao8 += 8 * lda; + + b += 64; + + } else { +#ifndef UNIT + data01 = *(ao1 + 0); +#endif + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + +#ifndef UNIT + data10 = *(ao2 + 1); +#endif + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + +#ifndef UNIT + data19 = *(ao3 + 2); +#endif + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + +#ifndef UNIT + data28 = *(ao4 + 3); +#endif + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + +#ifndef UNIT + data37 = *(ao5 + 4); +#endif + data38 = *(ao5 + 5); + data39 = *(ao5 + 6); + data40 = *(ao5 + 7); + +#ifndef UNIT + data46 = *(ao6 + 5); +#endif + data47 = *(ao6 + 6); + data48 = *(ao6 + 7); + +#ifndef UNIT + data55 = *(ao7 + 6); +#endif + data56 = *(ao7 + 7); + +#ifndef UNIT + data64 = *(ao8 + 7); +#endif + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = data01; +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data02; +#ifdef UNIT + b[ 9] = ONE; +#else + b[ 9] = data10; +#endif + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + + b[16] = data03; + b[17] = data11; +#ifdef UNIT + b[18] = ONE; +#else + b[18] = data19; +#endif + b[19] = ZERO; + b[20] = ZERO; + b[21] = ZERO; + b[22] = ZERO; + b[23] = ZERO; + + b[24] = data04; + b[25] = data12; + b[26] = data20; +#ifdef UNIT + b[27] = ONE; +#else + b[27] = data28; +#endif + b[28] = ZERO; + b[29] = ZERO; + b[30] = ZERO; + b[31] = ZERO; + + b[32] = data05; + b[33] = data13; + b[34] = data21; + b[35] = data29; +#ifdef UNIT + b[36] = ONE; +#else + b[36] = data37; +#endif + b[37] = ZERO; + b[38] = ZERO; + b[39] = ZERO; + + b[40] = data06; + b[41] = data14; + b[42] = data22; + b[43] = data30; + b[44] = data38; +#ifdef UNIT + b[45] = ONE; +#else + b[45] = data46; +#endif + b[46] = ZERO; + b[47] = ZERO; + + b[48] = data07; + b[49] = data15; + b[50] = data23; + b[51] = data31; + b[52] = data39; + b[53] = data47; +#ifdef UNIT + b[54] = ONE; +#else + b[54] = data55; +#endif + b[55] = ZERO; + + b[56] = data08; + b[57] = data16; + b[58] = data24; + b[59] = data32; + b[60] = data40; + b[61] = data48; + b[62] = data56; +#ifdef UNIT + b[63] = ONE; +#else + b[63] = data64; +#endif + + ao1 += 8; + ao2 += 8; + ao3 += 8; + ao4 += 8; + ao5 += 8; + ao6 += 8; + ao7 += 8; + ao8 += 8; + + b += 64; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X > posY) { + + if (m & 4) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + + data33 = *(ao5 + 0); + data34 = *(ao5 + 1); + data35 = *(ao5 + 2); + data36 = *(ao5 + 3); + + data41 = *(ao6 + 0); + data42 = *(ao6 + 1); + data43 = *(ao6 + 2); + data44 = *(ao6 + 3); + + data49 = *(ao7 + 0); + data50 = *(ao7 + 1); + data51 = *(ao7 + 2); + data52 = *(ao7 + 3); + + data57 = *(ao8 + 0); + data58 = *(ao8 + 1); + data59 = *(ao8 + 2); + data60 = *(ao8 + 3); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + b[ 4] = data33; + b[ 5] = data41; + b[ 6] = data49; + b[ 7] = data57; + + b[ 8] = data02; + b[ 9] = data10; + b[10] = data18; + b[11] = data26; + b[12] = data34; + b[13] = data42; + b[14] = data50; + b[15] = data58; + + b[16] = data03; + b[17] = data11; + b[18] = data19; + b[19] = data27; + b[20] = data35; + b[21] = data43; + b[22] = data51; + b[23] = data59; + + b[24] = data04; + b[25] = data12; + b[26] = data20; + b[27] = data28; + b[28] = data36; + b[29] = data44; + b[30] = data52; + b[31] = data60; + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + ao5 += 4; + ao6 += 4; + ao7 += 4; + ao8 += 4; + + b += 32; + } + + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + + data33 = *(ao5 + 0); + data34 = *(ao5 + 1); + + data41 = *(ao6 + 0); + data42 = *(ao6 + 1); + + data49 = *(ao7 + 0); + data50 = *(ao7 + 1); + + data57 = *(ao8 + 0); + data58 = *(ao8 + 1); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + b[ 4] = data33; + b[ 5] = data41; + b[ 6] = data49; + b[ 7] = data57; + + b[ 8] = data02; + b[ 9] = data10; + b[10] = data18; + b[11] = data26; + b[12] = data34; + b[13] = data42; + b[14] = data50; + b[15] = data58; + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + ao5 += 2; + ao6 += 2; + ao7 += 2; + ao8 += 2; + + b += 16; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data09 = *(ao2 + 0); + data17 = *(ao3 + 0); + data25 = *(ao4 + 0); + data33 = *(ao5 + 0); + data41 = *(ao6 + 0); + data49 = *(ao7 + 0); + data57 = *(ao8 + 0); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + b[ 4] = data33; + b[ 5] = data41; + b[ 6] = data49; + b[ 7] = data57; + + b += 8; + } + } else + if (X < posY) { + if (m & 4) { + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b += 32; + } + + if (m & 2) { + ao1 += 2 * lda; + b += 16; + } + + if (m & 1) { + b += 8; + } + } else { + +#ifndef UNIT + data01 = *(ao1 + 0); +#endif + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + if (i >= 2) { +#ifndef UNIT + data10 = *(ao2 + 1); +#endif + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + } + + if (i >= 3) { +#ifndef UNIT + data19 = *(ao3 + 2); +#endif + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + } + + if (i >= 4) { +#ifndef UNIT + data28 = *(ao4 + 3); +#endif + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + } + + if (i >= 5) { +#ifndef UNIT + data37 = *(ao5 + 4); +#endif + data38 = *(ao5 + 5); + data39 = *(ao5 + 6); + data40 = *(ao5 + 7); + } + + if (i >= 6) { +#ifndef UNIT + data46 = *(ao6 + 5); +#endif + data47 = *(ao6 + 6); + data48 = *(ao6 + 7); + } + + if (i >= 7) { +#ifndef UNIT + data55 = *(ao7 + 6); +#endif + data56 = *(ao7 + 7); + } + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = data01; +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + + if(i >= 2) { + b[ 0] = data02; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = data10; +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 3) { + b[ 0] = data03; + b[ 1] = data11; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = data19; +#endif + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 4) { + b[ 0] = data04; + b[ 1] = data12; + b[ 2] = data20; +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = data28; +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 5) { + b[ 0] = data05; + b[ 1] = data13; + b[ 2] = data21; + b[ 3] = data29; +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = data37; +#endif + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 6) { + b[ 0] = data06; + b[ 1] = data14; + b[ 2] = data22; + b[ 3] = data30; + b[ 4] = data38; +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = data46; +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 7) { + b[ 0] = data07; + b[ 1] = data15; + b[ 2] = data23; + b[ 3] = data31; + b[ 4] = data39; + b[ 5] = data47; +#ifdef UNIT + b[ 6] = ONE; +#else + b[ 6] = data55; +#endif + b[ 7] = ZERO; + b += 8; + } + } + } + + posY += 8; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + + b[ 4] = data02; + b[ 5] = data10; + b[ 6] = data18; + b[ 7] = data26; + + b[ 8] = data03; + b[ 9] = data11; + b[10] = data19; + b[11] = data27; + + b[12] = data04; + b[13] = data12; + b[14] = data20; + b[15] = data28; + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + + b += 16; + + } else + if (X < posY) { + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 16; + + } else { + +#ifdef UNIT + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + data20 = *(ao3 + 3); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = data02; + b[ 5] = ONE; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data03; + b[ 9] = data11; + b[10] = ONE; + b[11] = ZERO; + + b[12] = data04; + b[13] = data12; + b[14] = data20; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + + data28 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = data02; + b[ 5] = data10; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data03; + b[ 9] = data11; + b[10] = data19; + b[11] = ZERO; + + b[12] = data04; + b[13] = data12; + b[14] = data20; + b[15] = data28; +#endif + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X > posY) { + + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + + b[ 4] = data02; + b[ 5] = data10; + b[ 6] = data18; + b[ 7] = data26; + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + + b += 8; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data09 = *(ao2 + 0); + data17 = *(ao3 + 0); + data25 = *(ao4 + 0); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + + b += 4; + } + } else + if (X < posY) { + if (m & 2) { + ao1 += 2 * lda; + b += 8; + } + + if (m & 1) { + b += 4; + } + } else { + +#ifndef UNIT + data01 = *(ao1 + 0); +#endif + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + if (i >= 2) { +#ifndef UNIT + data10 = *(ao2 + 1); +#endif + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + } + + if (i >= 3) { +#ifndef UNIT + data19 = *(ao3 + 2); +#endif + data20 = *(ao3 + 3); + } + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = data01; +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + + if(i >= 2) { + b[ 0] = data02; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = data10; +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + + if (i >= 3) { + b[ 0] = data03; + b[ 1] = data11; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = data19; +#endif + b[ 3] = ZERO; + b += 4; + } + } + } + + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data02; + b[ 3] = data10; + + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X < posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { + +#ifdef UNIT + data02 = *(ao1 + 1); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data02; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + data10 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data02; + b[ 3] = data10; +#endif + ao1 += 2; + ao2 += 2; + + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X > posY) { + data01 = *(ao1 + 0); + data09 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data09; + b += 2; + } else + if (X < posY) { + b += 2; + } else { +#ifdef UNIT + data09 = *(ao2 + 0); + b[ 0] = ONE; + b[ 1] = data09; +#else + data01 = *(ao1 + 0); + data09 = *(ao2 + 0); + b[ 0] = data01; + b[ 1] = data09; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += 1; + b += 1; + } else + if (X < posY) { + ao1 += lda; + b += 1; + + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + ao1 ++; + b ++; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/trmm_ltcopy_1.c b/kernel/generic/trmm_ltcopy_1.c new file mode 100644 index 0000000..ab5e9d8 --- /dev/null +++ b/kernel/generic/trmm_ltcopy_1.c @@ -0,0 +1,92 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, X; + + FLOAT data01; + FLOAT *ao1; + + while (n > 0) { + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + + if (X > posY) { + ao1 += 1; + b += 1; + } else + if (X < posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + b += 1; + ao1 += 1; + } + + X ++; + i --; + } while (i > 0); + } + + posY += 1; + n --; + } + + return 0; +} diff --git a/kernel/generic/trmm_ltcopy_16.c b/kernel/generic/trmm_ltcopy_16.c new file mode 100644 index 0000000..0598de8 --- /dev/null +++ b/kernel/generic/trmm_ltcopy_16.c @@ -0,0 +1,1547 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, ii; + BLASLONG X; + + FLOAT *a01, *a02, *a03 ,*a04, *a05, *a06, *a07, *a08; + FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + js = (n >> 4); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + a03 = a + posY + (posX + 2) * lda; + a04 = a + posY + (posX + 3) * lda; + a05 = a + posY + (posX + 4) * lda; + a06 = a + posY + (posX + 5) * lda; + a07 = a + posY + (posX + 6) * lda; + a08 = a + posY + (posX + 7) * lda; + a09 = a + posY + (posX + 8) * lda; + a10 = a + posY + (posX + 9) * lda; + a11 = a + posY + (posX + 10) * lda; + a12 = a + posY + (posX + 11) * lda; + a13 = a + posY + (posX + 12) * lda; + a14 = a + posY + (posX + 13) * lda; + a15 = a + posY + (posX + 14) * lda; + a16 = a + posY + (posX + 15) * lda; + } else { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + a03 = a + posX + (posY + 2) * lda; + a04 = a + posX + (posY + 3) * lda; + a05 = a + posX + (posY + 4) * lda; + a06 = a + posX + (posY + 5) * lda; + a07 = a + posX + (posY + 6) * lda; + a08 = a + posX + (posY + 7) * lda; + a09 = a + posX + (posY + 8) * lda; + a10 = a + posX + (posY + 9) * lda; + a11 = a + posX + (posY + 10) * lda; + a12 = a + posX + (posY + 11) * lda; + a13 = a + posX + (posY + 12) * lda; + a14 = a + posX + (posY + 13) * lda; + a15 = a + posX + (posY + 14) * lda; + a16 = a + posX + (posY + 15) * lda; + } + + i = (m >> 4); + if (i > 0) { + do { + if (X > posY) { + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + a09 += 16; + a10 += 16; + a11 += 16; + a12 += 16; + a13 += 16; + a14 += 16; + a15 += 16; + a16 += 16; + b += 256; + } else + if (X < posY) { + + for (ii = 0; ii < 16; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + b += 16; + } + + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + b[ 16] = ZERO; +#ifdef UNIT + b[ 17] = ONE; +#else + b[ 17] = *(a02 + 1); +#endif + b[ 18] = *(a02 + 2); + b[ 19] = *(a02 + 3); + b[ 20] = *(a02 + 4); + b[ 21] = *(a02 + 5); + b[ 22] = *(a02 + 6); + b[ 23] = *(a02 + 7); + b[ 24] = *(a02 + 8); + b[ 25] = *(a02 + 9); + b[ 26] = *(a02 + 10); + b[ 27] = *(a02 + 11); + b[ 28] = *(a02 + 12); + b[ 29] = *(a02 + 13); + b[ 30] = *(a02 + 14); + b[ 31] = *(a02 + 15); + + b[ 32] = ZERO; + b[ 33] = ZERO; +#ifdef UNIT + b[ 34] = ONE; +#else + b[ 34] = *(a03 + 2); +#endif + b[ 35] = *(a03 + 3); + b[ 36] = *(a03 + 4); + b[ 37] = *(a03 + 5); + b[ 38] = *(a03 + 6); + b[ 39] = *(a03 + 7); + b[ 40] = *(a03 + 8); + b[ 41] = *(a03 + 9); + b[ 42] = *(a03 + 10); + b[ 43] = *(a03 + 11); + b[ 44] = *(a03 + 12); + b[ 45] = *(a03 + 13); + b[ 46] = *(a03 + 14); + b[ 47] = *(a03 + 15); + + b[ 48] = ZERO; + b[ 49] = ZERO; + b[ 50] = ZERO; +#ifdef UNIT + b[ 51] = ONE; +#else + b[ 51] = *(a04 + 3); +#endif + b[ 52] = *(a04 + 4); + b[ 53] = *(a04 + 5); + b[ 54] = *(a04 + 6); + b[ 55] = *(a04 + 7); + b[ 56] = *(a04 + 8); + b[ 57] = *(a04 + 9); + b[ 58] = *(a04 + 10); + b[ 59] = *(a04 + 11); + b[ 60] = *(a04 + 12); + b[ 61] = *(a04 + 13); + b[ 62] = *(a04 + 14); + b[ 63] = *(a04 + 15); + + b[ 64] = ZERO; + b[ 65] = ZERO; + b[ 66] = ZERO; + b[ 67] = ZERO; +#ifdef UNIT + b[ 68] = ONE; +#else + b[ 68] = *(a05 + 4); +#endif + b[ 69] = *(a05 + 5); + b[ 70] = *(a05 + 6); + b[ 71] = *(a05 + 7); + b[ 72] = *(a05 + 8); + b[ 73] = *(a05 + 9); + b[ 74] = *(a05 + 10); + b[ 75] = *(a05 + 11); + b[ 76] = *(a05 + 12); + b[ 77] = *(a05 + 13); + b[ 78] = *(a05 + 14); + b[ 79] = *(a05 + 15); + + b[ 80] = ZERO; + b[ 81] = ZERO; + b[ 82] = ZERO; + b[ 83] = ZERO; + b[ 84] = ZERO; +#ifdef UNIT + b[ 85] = ONE; +#else + b[ 85] = *(a06 + 5); +#endif + b[ 86] = *(a06 + 6); + b[ 87] = *(a06 + 7); + b[ 88] = *(a06 + 8); + b[ 89] = *(a06 + 9); + b[ 90] = *(a06 + 10); + b[ 91] = *(a06 + 11); + b[ 92] = *(a06 + 12); + b[ 93] = *(a06 + 13); + b[ 94] = *(a06 + 14); + b[ 95] = *(a06 + 15); + + b[ 96] = ZERO; + b[ 97] = ZERO; + b[ 98] = ZERO; + b[ 99] = ZERO; + b[100] = ZERO; + b[101] = ZERO; +#ifdef UNIT + b[102] = ONE; +#else + b[102] = *(a07 + 6); +#endif + b[103] = *(a07 + 7); + b[104] = *(a07 + 8); + b[105] = *(a07 + 9); + b[106] = *(a07 + 10); + b[107] = *(a07 + 11); + b[108] = *(a07 + 12); + b[109] = *(a07 + 13); + b[110] = *(a07 + 14); + b[111] = *(a07 + 15); + + b[112] = ZERO; + b[113] = ZERO; + b[114] = ZERO; + b[115] = ZERO; + b[116] = ZERO; + b[117] = ZERO; + b[118] = ZERO; +#ifdef UNIT + b[119] = ONE; +#else + b[119] = *(a08 + 7); +#endif + b[120] = *(a08 + 8); + b[121] = *(a08 + 9); + b[122] = *(a08 + 10); + b[123] = *(a08 + 11); + b[124] = *(a08 + 12); + b[125] = *(a08 + 13); + b[126] = *(a08 + 14); + b[127] = *(a08 + 15); + + b[128] = ZERO; + b[129] = ZERO; + b[130] = ZERO; + b[131] = ZERO; + b[132] = ZERO; + b[133] = ZERO; + b[134] = ZERO; + b[135] = ZERO; +#ifdef UNIT + b[136] = ONE; +#else + b[136] = *(a09 + 8); +#endif + b[137] = *(a09 + 9); + b[138] = *(a09 + 10); + b[139] = *(a09 + 11); + b[140] = *(a09 + 12); + b[141] = *(a09 + 13); + b[142] = *(a09 + 14); + b[143] = *(a09 + 15); + + b[144] = ZERO; + b[145] = ZERO; + b[146] = ZERO; + b[147] = ZERO; + b[148] = ZERO; + b[149] = ZERO; + b[150] = ZERO; + b[151] = ZERO; + b[152] = ZERO; +#ifdef UNIT + b[153] = ONE; +#else + b[153] = *(a10 + 9); +#endif + b[154] = *(a10 + 10); + b[155] = *(a10 + 11); + b[156] = *(a10 + 12); + b[157] = *(a10 + 13); + b[158] = *(a10 + 14); + b[159] = *(a10 + 15); + + b[160] = ZERO; + b[161] = ZERO; + b[162] = ZERO; + b[163] = ZERO; + b[164] = ZERO; + b[165] = ZERO; + b[166] = ZERO; + b[167] = ZERO; + b[168] = ZERO; + b[169] = ZERO; +#ifdef UNIT + b[170] = ONE; +#else + b[170] = *(a11 + 10); +#endif + b[171] = *(a11 + 11); + b[172] = *(a11 + 12); + b[173] = *(a11 + 13); + b[174] = *(a11 + 14); + b[175] = *(a11 + 15); + + b[176] = ZERO; + b[177] = ZERO; + b[178] = ZERO; + b[179] = ZERO; + b[180] = ZERO; + b[181] = ZERO; + b[182] = ZERO; + b[183] = ZERO; + b[184] = ZERO; + b[185] = ZERO; + b[186] = ZERO; +#ifdef UNIT + b[187] = ONE; +#else + b[187] = *(a12 + 11); +#endif + b[188] = *(a12 + 12); + b[189] = *(a12 + 13); + b[190] = *(a12 + 14); + b[191] = *(a12 + 15); + + b[192] = ZERO; + b[193] = ZERO; + b[194] = ZERO; + b[195] = ZERO; + b[196] = ZERO; + b[197] = ZERO; + b[198] = ZERO; + b[199] = ZERO; + b[200] = ZERO; + b[201] = ZERO; + b[202] = ZERO; + b[203] = ZERO; +#ifdef UNIT + b[204] = ONE; +#else + b[204] = *(a13 + 12); +#endif + b[205] = *(a13 + 13); + b[206] = *(a13 + 14); + b[207] = *(a13 + 15); + + b[208] = ZERO; + b[209] = ZERO; + b[210] = ZERO; + b[211] = ZERO; + b[212] = ZERO; + b[213] = ZERO; + b[214] = ZERO; + b[215] = ZERO; + b[216] = ZERO; + b[217] = ZERO; + b[218] = ZERO; + b[219] = ZERO; + b[220] = ZERO; +#ifdef UNIT + b[221] = ONE; +#else + b[221] = *(a14 + 13); +#endif + b[222] = *(a14 + 14); + b[223] = *(a14 + 15); + + b[224] = ZERO; + b[225] = ZERO; + b[226] = ZERO; + b[227] = ZERO; + b[228] = ZERO; + b[229] = ZERO; + b[230] = ZERO; + b[231] = ZERO; + b[232] = ZERO; + b[233] = ZERO; + b[234] = ZERO; + b[235] = ZERO; + b[236] = ZERO; + b[237] = ZERO; +#ifdef UNIT + b[238] = ONE; +#else + b[238] = *(a15 + 14); +#endif + b[239] = *(a15 + 15); + + b[240] = ZERO; + b[241] = ZERO; + b[242] = ZERO; + b[243] = ZERO; + b[244] = ZERO; + b[245] = ZERO; + b[246] = ZERO; + b[247] = ZERO; + b[248] = ZERO; + b[249] = ZERO; + b[250] = ZERO; + b[251] = ZERO; + b[252] = ZERO; + b[253] = ZERO; + b[254] = ZERO; +#ifdef UNIT + b[255] = ONE; +#else + b[255] = *(a16 + 15); +#endif + + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + a09 += 16; + a10 += 16; + a11 += 16; + a12 += 16; + a13 += 16; + a14 += 16; + a15 += 16; + a16 += 16; + + b += 256; + } + + X += 16; + i --; + } while (i > 0); + } + + i = (m & 15); + if (i > 0) { + if (X > posY) { + a01 += i; + a02 += i; + a03 += i; + a04 += i; + a05 += i; + a06 += i; + a07 += i; + a08 += i; + a09 += i; + a10 += i; + a11 += i; + a12 += i; + a13 += i; + a14 += i; + a15 += i; + a16 += i; + b += 16 * i; + } else + if (X < posY) { + + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + a05 += lda; + a06 += lda; + a07 += lda; + a08 += lda; + a09 += lda; + a10 += lda; + a11 += lda; + a12 += lda; + a13 += lda; + a14 += lda; + a15 += lda; + a16 += lda; + b += 16; + } + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + b += 16; + + if (i >= 2) { + b[ 0] = ZERO; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(a02 + 1); +#endif + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); + b[ 4] = *(a02 + 4); + b[ 5] = *(a02 + 5); + b[ 6] = *(a02 + 6); + b[ 7] = *(a02 + 7); + b[ 8] = *(a02 + 8); + b[ 9] = *(a02 + 9); + b[10] = *(a02 + 10); + b[11] = *(a02 + 11); + b[12] = *(a02 + 12); + b[13] = *(a02 + 13); + b[14] = *(a02 + 14); + b[15] = *(a02 + 15); + b += 16; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(a03 + 2); +#endif + b[ 3] = *(a03 + 3); + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); + b[ 6] = *(a03 + 6); + b[ 7] = *(a03 + 7); + b[ 8] = *(a03 + 8); + b[ 9] = *(a03 + 9); + b[10] = *(a03 + 10); + b[11] = *(a03 + 11); + b[12] = *(a03 + 12); + b[13] = *(a03 + 13); + b[14] = *(a03 + 14); + b[15] = *(a03 + 15); + b += 16; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(a04 + 3); +#endif + b[ 4] = *(a04 + 4); + b[ 5] = *(a04 + 5); + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); + b[ 8] = *(a04 + 8); + b[ 9] = *(a04 + 9); + b[10] = *(a04 + 10); + b[11] = *(a04 + 11); + b[12] = *(a04 + 12); + b[13] = *(a04 + 13); + b[14] = *(a04 + 14); + b[15] = *(a04 + 15); + b += 16; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = *(a05 + 4); +#endif + b[ 5] = *(a05 + 5); + b[ 6] = *(a05 + 6); + b[ 7] = *(a05 + 7); + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); + b[10] = *(a05 + 10); + b[11] = *(a05 + 11); + b[12] = *(a05 + 12); + b[13] = *(a05 + 13); + b[14] = *(a05 + 14); + b[15] = *(a05 + 15); + b += 16; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = *(a06 + 5); +#endif + b[ 6] = *(a06 + 6); + b[ 7] = *(a06 + 7); + b[ 8] = *(a06 + 8); + b[ 9] = *(a06 + 9); + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); + b[12] = *(a06 + 12); + b[13] = *(a06 + 13); + b[14] = *(a06 + 14); + b[15] = *(a06 + 15); + b += 16; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; +#else + b[ 6] = *(a07 + 6); +#endif + b[ 7] = *(a07 + 7); + b[ 8] = *(a07 + 8); + b[ 9] = *(a07 + 9); + b[10] = *(a07 + 10); + b[11] = *(a07 + 11); + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); + b[14] = *(a07 + 14); + b[15] = *(a07 + 15); + b += 16; + } + + if (i >= 8) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; +#ifdef UNIT + b[ 7] = ONE; +#else + b[ 7] = *(a08 + 7); +#endif + b[ 8] = *(a08 + 8); + b[ 9] = *(a08 + 9); + b[10] = *(a08 + 10); + b[11] = *(a08 + 11); + b[12] = *(a08 + 12); + b[13] = *(a08 + 13); + b[14] = *(a08 + 14); + b[15] = *(a08 + 15); + b += 16; + } + + if (i >= 9) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; +#ifdef UNIT + b[ 8] = ONE; +#else + b[ 8] = *(a09 + 8); +#endif + b[ 9] = *(a09 + 9); + b[10] = *(a09 + 10); + b[11] = *(a09 + 11); + b[12] = *(a09 + 12); + b[13] = *(a09 + 13); + b[14] = *(a09 + 14); + b[15] = *(a09 + 15); + b += 16; + } + + if (i >= 10) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; +#ifdef UNIT + b[ 9] = ONE; +#else + b[ 9] = *(a10 + 9); +#endif + b[10] = *(a10 + 10); + b[11] = *(a10 + 11); + b[12] = *(a10 + 12); + b[13] = *(a10 + 13); + b[14] = *(a10 + 14); + b[15] = *(a10 + 15); + b += 16; + } + + if (i >= 11) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[10] = ONE; +#else + b[10] = *(a11 + 10); +#endif + b[11] = *(a11 + 11); + b[12] = *(a11 + 12); + b[13] = *(a11 + 13); + b[14] = *(a11 + 14); + b[15] = *(a11 + 15); + b += 16; + } + + if (i >= 12) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; +#ifdef UNIT + b[11] = ONE; +#else + b[11] = *(a12 + 11); +#endif + b[12] = *(a12 + 12); + b[13] = *(a12 + 13); + b[14] = *(a12 + 14); + b[15] = *(a12 + 15); + b += 16; + } + + if (i >= 13) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; +#ifdef UNIT + b[12] = ONE; +#else + b[12] = *(a13 + 12); +#endif + b[13] = *(a13 + 13); + b[14] = *(a13 + 14); + b[15] = *(a13 + 15); + b += 16; + } + + if (i >= 14) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; +#ifdef UNIT + b[13] = ONE; +#else + b[13] = *(a14 + 13); +#endif + b[14] = *(a14 + 14); + b[15] = *(a14 + 15); + b += 16; + } + + if (i >= 15) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; +#ifdef UNIT + b[14] = ONE; +#else + b[14] = *(a15 + 14); +#endif + b[15] = *(a15 + 15); + b += 16; + } + } + } + + posY += 16; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 8){ + X = posX; + + if (posX <= posY) { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + a03 = a + posY + (posX + 2) * lda; + a04 = a + posY + (posX + 3) * lda; + a05 = a + posY + (posX + 4) * lda; + a06 = a + posY + (posX + 5) * lda; + a07 = a + posY + (posX + 6) * lda; + a08 = a + posY + (posX + 7) * lda; + } else { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + a03 = a + posX + (posY + 2) * lda; + a04 = a + posX + (posY + 3) * lda; + a05 = a + posX + (posY + 4) * lda; + a06 = a + posX + (posY + 5) * lda; + a07 = a + posX + (posY + 6) * lda; + a08 = a + posX + (posY + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X > posY) { + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + a05 += 8; + a06 += 8; + a07 += 8; + a08 += 8; + b += 64; + } else + if (X < posY) { + + for (ii = 0; ii < 8; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + a01 += lda; + b += 8; + } + + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + } else { + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = ZERO; +#ifdef UNIT + b[ 9] = ONE; +#else + b[ 9] = *(a02 + 1); +#endif + b[ 10] = *(a02 + 2); + b[ 11] = *(a02 + 3); + b[ 12] = *(a02 + 4); + b[ 13] = *(a02 + 5); + b[ 14] = *(a02 + 6); + b[ 15] = *(a02 + 7); + + b[ 16] = ZERO; + b[ 17] = ZERO; +#ifdef UNIT + b[ 18] = ONE; +#else + b[ 18] = *(a03 + 2); +#endif + b[ 19] = *(a03 + 3); + b[ 20] = *(a03 + 4); + b[ 21] = *(a03 + 5); + b[ 22] = *(a03 + 6); + b[ 23] = *(a03 + 7); + + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; +#ifdef UNIT + b[ 27] = ONE; +#else + b[ 27] = *(a04 + 3); +#endif + b[ 28] = *(a04 + 4); + b[ 29] = *(a04 + 5); + b[ 30] = *(a04 + 6); + b[ 31] = *(a04 + 7); + + b[ 32] = ZERO; + b[ 33] = ZERO; + b[ 34] = ZERO; + b[ 35] = ZERO; +#ifdef UNIT + b[ 36] = ONE; +#else + b[ 36] = *(a05 + 4); +#endif + b[ 37] = *(a05 + 5); + b[ 38] = *(a05 + 6); + b[ 39] = *(a05 + 7); + + b[ 40] = ZERO; + b[ 41] = ZERO; + b[ 42] = ZERO; + b[ 43] = ZERO; + b[ 44] = ZERO; +#ifdef UNIT + b[ 45] = ONE; +#else + b[ 45] = *(a06 + 5); +#endif + b[ 46] = *(a06 + 6); + b[ 47] = *(a06 + 7); + + b[ 48] = ZERO; + b[ 49] = ZERO; + b[ 50] = ZERO; + b[ 51] = ZERO; + b[ 52] = ZERO; + b[ 53] = ZERO; +#ifdef UNIT + b[ 54] = ONE; +#else + b[ 54] = *(a07 + 6); +#endif + b[ 55] = *(a07 + 7); + + b[ 56] = ZERO; + b[ 57] = ZERO; + b[ 58] = ZERO; + b[ 59] = ZERO; + b[ 60] = ZERO; + b[ 61] = ZERO; + b[ 62] = ZERO; +#ifdef UNIT + b[ 63] = ONE; +#else + b[ 63] = *(a08 + 7); +#endif + + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + a05 += 8; + a06 += 8; + a07 += 8; + a08 += 8; + b += 64; + + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i > 0) { + if (X > posY) { + a01 += i; + a02 += i; + a03 += i; + a04 += i; + a05 += i; + a06 += i; + a07 += i; + a08 += i; + b += 8 * i; + } else + if (X < posY) { + + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + a05 += lda; + a06 += lda; + a07 += lda; + a08 += lda; + b += 8; + } + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b += 8; + + if (i >= 2) { + b[ 0] = ZERO; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(a02 + 1); +#endif + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); + b[ 4] = *(a02 + 4); + b[ 5] = *(a02 + 5); + b[ 6] = *(a02 + 6); + b[ 7] = *(a02 + 7); + b += 8; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(a03 + 2); +#endif + b[ 3] = *(a03 + 3); + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); + b[ 6] = *(a03 + 6); + b[ 7] = *(a03 + 7); + b += 8; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(a04 + 3); +#endif + b[ 4] = *(a04 + 4); + b[ 5] = *(a04 + 5); + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); + b += 8; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = *(a05 + 4); +#endif + b[ 5] = *(a05 + 5); + b[ 6] = *(a05 + 6); + b[ 7] = *(a05 + 7); + b += 8; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = *(a06 + 5); +#endif + b[ 6] = *(a06 + 6); + b[ 7] = *(a06 + 7); + b += 8; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; +#else + b[ 6] = *(a07 + 6); +#endif + b[ 7] = *(a07 + 7); + b += 8; + } + } + } + posY += 8; + } + + if (n & 4){ + X = posX; + + if (posX <= posY) { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + a03 = a + posY + (posX + 2) * lda; + a04 = a + posY + (posX + 3) * lda; + } else { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + a03 = a + posX + (posY + 2) * lda; + a04 = a + posX + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + a01 += 4; + a02 += 4; + a03 += 4; + a04 += 4; + b += 16; + } else + if (X < posY) { + + for (ii = 0; ii < 4; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + a01 += lda; + b += 4; + } + + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + } else { + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + + b[ 4] = ZERO; +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = *(a02 + 1); +#endif + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); + + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[ 10] = ONE; +#else + b[ 10] = *(a03 + 2); +#endif + b[ 11] = *(a03 + 3); + + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; +#ifdef UNIT + b[ 15] = ONE; +#else + b[ 15] = *(a04 + 3); +#endif + + a01 += 4; + a02 += 4; + a03 += 4; + a04 += 4; + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i > 0) { + if (X > posY) { + a01 += i; + a02 += i; + a03 += i; + a04 += i; + b += 4 * i; + } else + if (X < posY) { + + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + b += 4; + } + } else { + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b += 4; + + if (i >= 2) { + b[ 0] = ZERO; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(a02 + 1); +#endif + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(a03 + 2); +#endif + b[ 3] = *(a03 + 3); + b += 4; + } + } + } + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + } else { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + a01 += 2; + a02 += 2; + b += 4; + } else + if (X < posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + a01 += 2 * lda; + a02 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a01 + 1); + + b[ 2] = ZERO; +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(a02 + 1); +#endif + + a01 += 2; + a02 += 2; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + if (X > posY) { + a01 ++; + a02 ++; + b += 2; + } else + if (X < posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + a01 += lda; + a02 += lda; + b += 2; + } + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a01 + 1); + b += 2; + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + a01 = a + posY + (posX + 0) * lda; + } else { + a01 = a + posX + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + + if (X > posY) { + b ++; + a01 ++; + } else + if (X < posY) { + b[ 0] = *(a01 + 0); + a01 += lda; + b ++; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + a01 ++; + b ++; + } + X += 1; + i --; + } while (i > 0); + } + posY += 1; + } + + return 0; +} diff --git a/kernel/generic/trmm_ltcopy_2.c b/kernel/generic/trmm_ltcopy_2.c new file mode 100644 index 0000000..098e16f --- /dev/null +++ b/kernel/generic/trmm_ltcopy_2.c @@ -0,0 +1,197 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2; + + js = (n >> 1); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + ao1 += 2; + ao2 += 2; + b += 4; + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + + b[ 0] = ONE; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data04 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = data04; +#endif + + ao1 += 2; + ao2 += 2; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X > posY) { + ao1 += 1; + ao2 += 1; + b += 2; + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + + b[ 0] = ONE; + b[ 1] = data02; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; +#endif + ao1 += 2; + b += 2; + } + } + + posY += 2; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + + if (X > posY) { + ao1 += 1; + b += 1; + } else + if (X < posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + b += 1; + ao1 += 1; + } + + X ++; + i --; + } while (i > 0); + } + + posY += 1; + } + + return 0; +} diff --git a/kernel/generic/trmm_ltcopy_4.c b/kernel/generic/trmm_ltcopy_4.c new file mode 100644 index 0000000..69a233b --- /dev/null +++ b/kernel/generic/trmm_ltcopy_4.c @@ -0,0 +1,488 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 16; + + } else { + +#ifdef UNIT + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data12 = *(ao3 + 3); + + b[ 0] = ONE; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b[ 4] = ZERO; + b[ 5] = ONE; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ONE; + b[11] = data12; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = ZERO; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = data11; + b[11] = data12; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = data16; +#endif + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X > posY) { + + if (m & 2) { + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + if (m & 1) { + ao1 += 1; + ao2 += 1; + ao3 += 1; + ao4 += 1; + b += 4; + } + + } else + if (X < posY) { + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 8; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + ao1 += lda; + b += 4; + } + + } else { + +#ifdef UNIT + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + if (i >= 2) { + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + } + + if (i >= 3) { + data12 = *(ao3 + 3); + } + + b[ 0] = ONE; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ONE; + b[ 2] = data07; + b[ 3] = data08; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ONE; + b[ 3] = data12; + b += 4; + } +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + if (i >= 2) { + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + } + + if (i >= 3) { + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + } + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = data06; + b[ 2] = data07; + b[ 3] = data08; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = data11; + b[ 3] = data12; + b += 4; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data05; + b[ 3] = data06; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + + b[ 0] = ONE; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = data06; +#endif + ao1 += 2; + ao2 += 2; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X > posY) { + ao1 += 1; + ao2 += 1; + + b += 2; + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + + b[ 0] = ONE; + b[ 1] = data02; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + if (X > posY) { + b += 1; + ao1 += 1; + } else + if (X < posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + ao1 += 1; + b += 1; + } + + X ++; + i --; + } while (i > 0); + } + + posY += 1; + } + + return 0; +} diff --git a/kernel/generic/trmm_ltcopy_8.c b/kernel/generic/trmm_ltcopy_8.c new file mode 100644 index 0000000..64954da --- /dev/null +++ b/kernel/generic/trmm_ltcopy_8.c @@ -0,0 +1,1219 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + FLOAT data33, data34, data35, data36, data37, data38, data39, data40; + FLOAT data41, data42, data43, data44, data45, data46, data47, data48; + FLOAT data49, data50, data51, data52, data53, data54, data55, data56; + FLOAT data57, data58, data59, data60, data61, data62, data63, data64; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + js = (n >> 3); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + ao5 = a + posY + (posX + 4) * lda; + ao6 = a + posY + (posX + 5) * lda; + ao7 = a + posY + (posX + 6) * lda; + ao8 = a + posY + (posX + 7) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + ao5 = a + posX + (posY + 4) * lda; + ao6 = a + posX + (posY + 5) * lda; + ao7 = a + posX + (posY + 6) * lda; + ao8 = a + posX + (posY + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X > posY) { + ao1 += 8; + ao2 += 8; + ao3 += 8; + ao4 += 8; + ao5 += 8; + ao6 += 8; + ao7 += 8; + ao8 += 8; + + b += 64; + + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + + data33 = *(ao5 + 0); + data34 = *(ao5 + 1); + data35 = *(ao5 + 2); + data36 = *(ao5 + 3); + data37 = *(ao5 + 4); + data38 = *(ao5 + 5); + data39 = *(ao5 + 6); + data40 = *(ao5 + 7); + + data41 = *(ao6 + 0); + data42 = *(ao6 + 1); + data43 = *(ao6 + 2); + data44 = *(ao6 + 3); + data45 = *(ao6 + 4); + data46 = *(ao6 + 5); + data47 = *(ao6 + 6); + data48 = *(ao6 + 7); + + data49 = *(ao7 + 0); + data50 = *(ao7 + 1); + data51 = *(ao7 + 2); + data52 = *(ao7 + 3); + data53 = *(ao7 + 4); + data54 = *(ao7 + 5); + data55 = *(ao7 + 6); + data56 = *(ao7 + 7); + + data57 = *(ao8 + 0); + data58 = *(ao8 + 1); + data59 = *(ao8 + 2); + data60 = *(ao8 + 3); + data61 = *(ao8 + 4); + data62 = *(ao8 + 5); + data63 = *(ao8 + 6); + data64 = *(ao8 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + + b[32] = data33; + b[33] = data34; + b[34] = data35; + b[35] = data36; + b[36] = data37; + b[37] = data38; + b[38] = data39; + b[39] = data40; + + b[40] = data41; + b[41] = data42; + b[42] = data43; + b[43] = data44; + b[44] = data45; + b[45] = data46; + b[46] = data47; + b[47] = data48; + + b[48] = data49; + b[49] = data50; + b[50] = data51; + b[51] = data52; + b[52] = data53; + b[53] = data54; + b[54] = data55; + b[55] = data56; + + b[56] = data57; + b[57] = data58; + b[58] = data59; + b[59] = data60; + b[60] = data61; + b[61] = data62; + b[62] = data63; + b[63] = data64; + + ao1 += 8 * lda; + ao2 += 8 * lda; + ao3 += 8 * lda; + ao4 += 8 * lda; + ao5 += 8 * lda; + ao6 += 8 * lda; + ao7 += 8 * lda; + ao8 += 8 * lda; + + b += 64; + + } else { + +#ifndef UNIT + data01 = *(ao1 + 0); +#endif + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + +#ifndef UNIT + data10 = *(ao2 + 1); +#endif + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + +#ifndef UNIT + data19 = *(ao3 + 2); +#endif + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + +#ifndef UNIT + data28 = *(ao4 + 3); +#endif + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + +#ifndef UNIT + data37 = *(ao5 + 4); +#endif + data38 = *(ao5 + 5); + data39 = *(ao5 + 6); + data40 = *(ao5 + 7); + +#ifndef UNIT + data46 = *(ao6 + 5); +#endif + data47 = *(ao6 + 6); + data48 = *(ao6 + 7); + +#ifndef UNIT + data55 = *(ao7 + 6); +#endif + data56 = *(ao7 + 7); + +#ifndef UNIT + data64 = *(ao8 + 7); +#endif + + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = data01; +#endif + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = ZERO; +#ifdef UNIT + b[ 9] = ONE; +#else + b[ 9] = data10; +#endif + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b[16] = ZERO; + b[17] = ZERO; +#ifdef UNIT + b[18] = ONE; +#else + b[18] = data19; +#endif + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + + b[24] = ZERO; + b[25] = ZERO; + b[26] = ZERO; +#ifdef UNIT + b[27] = ONE; +#else + b[27] = data28; +#endif + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + + b[32] = ZERO; + b[33] = ZERO; + b[34] = ZERO; + b[35] = ZERO; +#ifdef UNIT + b[36] = ONE; +#else + b[36] = data37; +#endif + b[37] = data38; + b[38] = data39; + b[39] = data40; + + b[40] = ZERO; + b[41] = ZERO; + b[42] = ZERO; + b[43] = ZERO; + b[44] = ZERO; +#ifdef UNIT + b[45] = ONE; +#else + b[45] = data46; +#endif + b[46] = data47; + b[47] = data48; + + b[48] = ZERO; + b[49] = ZERO; + b[50] = ZERO; + b[51] = ZERO; + b[52] = ZERO; + b[53] = ZERO; +#ifdef UNIT + b[54] = ONE; +#else + b[54] = data55; +#endif + b[55] = data56; + + b[56] = ZERO; + b[57] = ZERO; + b[58] = ZERO; + b[59] = ZERO; + b[60] = ZERO; + b[61] = ZERO; + b[62] = ZERO; +#ifdef UNIT + b[63] = ONE; +#else + b[63] = data64; +#endif + + ao1 += 8; + ao2 += 8; + ao3 += 8; + ao4 += 8; + ao5 += 8; + ao6 += 8; + ao7 += 8; + ao8 += 8; + + b += 64; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X > posY) { + + if (m & 4) { + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + ao5 += 4; + ao6 += 4; + ao7 += 4; + ao8 += 4; + + b += 32; + } + + if (m & 2) { + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + ao5 += 2; + ao6 += 2; + ao7 += 2; + ao8 += 2; + + b += 16; + } + + if (m & 1) { + b += 8; + } + } else + if (X < posY) { + if (m & 4) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b += 32; + } + + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + ao1 += 2 * lda; + b += 16; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + } + } else { +#ifndef UNIT + data01 = *(ao1 + 0); +#endif + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + if (i >= 2) { +#ifndef UNIT + data10 = *(ao2 + 1); +#endif + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + } + + if (i >= 3) { +#ifndef UNIT + data19 = *(ao3 + 2); +#endif + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + } + + if (i >= 4) { +#ifndef UNIT + data28 = *(ao4 + 3); +#endif + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + } + + if (i >= 5) { +#ifndef UNIT + data37 = *(ao5 + 4); +#endif + data38 = *(ao5 + 5); + data39 = *(ao5 + 6); + data40 = *(ao5 + 7); + } + + if (i >= 6) { +#ifndef UNIT + data46 = *(ao6 + 5); +#endif + data47 = *(ao6 + 6); + data48 = *(ao6 + 7); + } + + if (i >= 7) { +#ifndef UNIT + data55 = *(ao7 + 6); +#endif + data56 = *(ao7 + 7); + } + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = data01; +#endif + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b += 8; + + if(i >= 2) { + b[ 0] = ZERO; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = data10; +#endif + b[ 2] = data11; + b[ 3] = data12; + b[ 4] = data13; + b[ 5] = data14; + b[ 6] = data15; + b[ 7] = data16; + b += 8; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = data19; +#endif + b[ 3] = data20; + b[ 4] = data21; + b[ 5] = data22; + b[ 6] = data23; + b[ 7] = data24; + b += 8; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = data28; +#endif + b[ 4] = data29; + b[ 5] = data30; + b[ 6] = data31; + b[ 7] = data32; + b += 8; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = data37; +#endif + b[ 5] = data38; + b[ 6] = data39; + b[ 7] = data40; + b += 8; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = data46; +#endif + b[ 6] = data47; + b[ 7] = data48; + b += 8; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; +#else + b[ 6] = data55; +#endif + b[ 7] = data56; + b += 8; + } + } + } + + posY += 8; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + + b += 16; + + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b[ 4] = data09; + b[ 5] = data10; + b[ 6] = data11; + b[ 7] = data12; + + b[ 8] = data17; + b[ 9] = data18; + b[10] = data19; + b[11] = data20; + + b[12] = data25; + b[13] = data26; + b[14] = data27; + b[15] = data28; + + b += 16; + + } else { + +#ifdef UNIT + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + data20 = *(ao3 + 3); + + b[ 0] = ONE; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b[ 4] = ZERO; + b[ 5] = ONE; + b[ 6] = data11; + b[ 7] = data12; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ONE; + b[11] = data20; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + + data28 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b[ 4] = ZERO; + b[ 5] = data10; + b[ 6] = data11; + b[ 7] = data12; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = data19; + b[11] = data20; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = data28; +#endif + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X > posY) { + + if (m & 2) { + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + + b += 8; + } + + if (m & 1) { + b += 4; + } + } else + if (X < posY) { + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data09; + b[ 5] = data10; + b[ 6] = data11; + b[ 7] = data12; + + ao1 += 2 * lda; + b += 8; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + } + } else { + +#ifndef UNIT + data01 = *(ao1 + 0); +#endif + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + if (i >= 2) { +#ifndef UNIT + data10 = *(ao2 + 1); +#endif + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + } + + if (i >= 3) { +#ifndef UNIT + data19 = *(ao3 + 2); +#endif + data20 = *(ao3 + 3); + } + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = data01; +#endif + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = data10; +#endif + b[ 2] = data11; + b[ 3] = data12; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = data19; +#endif + b[ 3] = data20; + b += 4; + } + } + } + + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + ao1 += 2 * lda; + ao2 += 2 * lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + + b += 4; + + } else { + +#ifdef UNIT + data02 = *(ao1 + 1); + + b[ 0] = ONE; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + data10 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = data10; +#endif + + ao1 += 2; + ao2 += 2; + + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X > posY) { + b += 2; + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b += 2; + } else { +#ifdef UNIT + data09 = *(ao2 + 0); + b[ 0] = ONE; + b[ 1] = data09; +#else + data01 = *(ao1 + 0); + data09 = *(ao2 + 0); + b[ 0] = data01; + b[ 1] = data09; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X > posY) { + ao1 += 1; + b += 1; + } else + if (X < posY) { + data01 = *(ao1 + 0); + ao1 += lda; + + b[ 0] = data01; + b += 1; + + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + ao1 ++; + b ++; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/trmm_uncopy_1.c b/kernel/generic/trmm_uncopy_1.c new file mode 100644 index 0000000..6e75c2f --- /dev/null +++ b/kernel/generic/trmm_uncopy_1.c @@ -0,0 +1,91 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, X; + + FLOAT data01; + FLOAT *ao1; + + while (n > 0) { + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + + if (X < posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += 1; + b += 1; + } else + if (X > posY) { + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + b += 1; + ao1 += lda; + } + + X += 1; + i --; + } while (i > 0); + } + posY ++; + n --; + } + + return 0; +} diff --git a/kernel/generic/trmm_uncopy_16.c b/kernel/generic/trmm_uncopy_16.c new file mode 100644 index 0000000..6325a26 --- /dev/null +++ b/kernel/generic/trmm_uncopy_16.c @@ -0,0 +1,1543 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X, ii; + + FLOAT *a01, *a02, *a03 ,*a04, *a05, *a06, *a07, *a08; + FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + js = (n >> 4); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + a03 = a + posX + (posY + 2) * lda; + a04 = a + posX + (posY + 3) * lda; + a05 = a + posX + (posY + 4) * lda; + a06 = a + posX + (posY + 5) * lda; + a07 = a + posX + (posY + 6) * lda; + a08 = a + posX + (posY + 7) * lda; + a09 = a + posX + (posY + 8) * lda; + a10 = a + posX + (posY + 9) * lda; + a11 = a + posX + (posY + 10) * lda; + a12 = a + posX + (posY + 11) * lda; + a13 = a + posX + (posY + 12) * lda; + a14 = a + posX + (posY + 13) * lda; + a15 = a + posX + (posY + 14) * lda; + a16 = a + posX + (posY + 15) * lda; + } else { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + a03 = a + posY + (posX + 2) * lda; + a04 = a + posY + (posX + 3) * lda; + a05 = a + posY + (posX + 4) * lda; + a06 = a + posY + (posX + 5) * lda; + a07 = a + posY + (posX + 6) * lda; + a08 = a + posY + (posX + 7) * lda; + a09 = a + posY + (posX + 8) * lda; + a10 = a + posY + (posX + 9) * lda; + a11 = a + posY + (posX + 10) * lda; + a12 = a + posY + (posX + 11) * lda; + a13 = a + posY + (posX + 12) * lda; + a14 = a + posY + (posX + 13) * lda; + a15 = a + posY + (posX + 14) * lda; + a16 = a + posY + (posX + 15) * lda; + } + + i = (m >> 4); + if (i > 0) { + do { + if (X < posY) { + for (ii = 0; ii < 16; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + b[ 4] = *(a05 + 0); + b[ 5] = *(a06 + 0); + b[ 6] = *(a07 + 0); + b[ 7] = *(a08 + 0); + + b[ 8] = *(a09 + 0); + b[ 9] = *(a10 + 0); + b[ 10] = *(a11 + 0); + b[ 11] = *(a12 + 0); + b[ 12] = *(a13 + 0); + b[ 13] = *(a14 + 0); + b[ 14] = *(a15 + 0); + b[ 15] = *(a16 + 0); + + a01 ++; + a02 ++; + a03 ++; + a04 ++; + a05 ++; + a06 ++; + a07 ++; + a08 ++; + a09 ++; + a10 ++; + a11 ++; + a12 ++; + a13 ++; + a14 ++; + a15 ++; + a16 ++; + b += 16; + } + } else + if (X > posY) { + a01 += 16 * lda; + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + b += 256; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + b[ 4] = *(a05 + 0); + b[ 5] = *(a06 + 0); + b[ 6] = *(a07 + 0); + b[ 7] = *(a08 + 0); + b[ 8] = *(a09 + 0); + b[ 9] = *(a10 + 0); + b[ 10] = *(a11 + 0); + b[ 11] = *(a12 + 0); + b[ 12] = *(a13 + 0); + b[ 13] = *(a14 + 0); + b[ 14] = *(a15 + 0); + b[ 15] = *(a16 + 0); + + b[ 16] = ZERO; +#ifdef UNIT + b[ 17] = ONE; +#else + b[ 17] = *(a02 + 1); +#endif + b[ 18] = *(a03 + 1); + b[ 19] = *(a04 + 1); + b[ 20] = *(a05 + 1); + b[ 21] = *(a06 + 1); + b[ 22] = *(a07 + 1); + b[ 23] = *(a08 + 1); + b[ 24] = *(a09 + 1); + b[ 25] = *(a10 + 1); + b[ 26] = *(a11 + 1); + b[ 27] = *(a12 + 1); + b[ 28] = *(a13 + 1); + b[ 29] = *(a14 + 1); + b[ 30] = *(a15 + 1); + b[ 31] = *(a16 + 1); + + b[ 32] = ZERO; + b[ 33] = ZERO; +#ifdef UNIT + b[ 34] = ONE; +#else + b[ 34] = *(a03 + 2); +#endif + b[ 35] = *(a04 + 2); + b[ 36] = *(a05 + 2); + b[ 37] = *(a06 + 2); + b[ 38] = *(a07 + 2); + b[ 39] = *(a08 + 2); + b[ 40] = *(a09 + 2); + b[ 41] = *(a10 + 2); + b[ 42] = *(a11 + 2); + b[ 43] = *(a12 + 2); + b[ 44] = *(a13 + 2); + b[ 45] = *(a14 + 2); + b[ 46] = *(a15 + 2); + b[ 47] = *(a16 + 2); + + b[ 48] = ZERO; + b[ 49] = ZERO; + b[ 50] = ZERO; +#ifdef UNIT + b[ 51] = ONE; +#else + b[ 51] = *(a04 + 3); +#endif + b[ 52] = *(a05 + 3); + b[ 53] = *(a06 + 3); + b[ 54] = *(a07 + 3); + b[ 55] = *(a08 + 3); + b[ 56] = *(a09 + 3); + b[ 57] = *(a10 + 3); + b[ 58] = *(a11 + 3); + b[ 59] = *(a12 + 3); + b[ 60] = *(a13 + 3); + b[ 61] = *(a14 + 3); + b[ 62] = *(a15 + 3); + b[ 63] = *(a16 + 3); + + b[ 64] = ZERO; + b[ 65] = ZERO; + b[ 66] = ZERO; + b[ 67] = ZERO; +#ifdef UNIT + b[ 68] = ONE; +#else + b[ 68] = *(a05 + 4); +#endif + b[ 69] = *(a06 + 4); + b[ 70] = *(a07 + 4); + b[ 71] = *(a08 + 4); + b[ 72] = *(a09 + 4); + b[ 73] = *(a10 + 4); + b[ 74] = *(a11 + 4); + b[ 75] = *(a12 + 4); + b[ 76] = *(a13 + 4); + b[ 77] = *(a14 + 4); + b[ 78] = *(a15 + 4); + b[ 79] = *(a16 + 4); + + b[ 80] = ZERO; + b[ 81] = ZERO; + b[ 82] = ZERO; + b[ 83] = ZERO; + b[ 84] = ZERO; +#ifdef UNIT + b[ 85] = ONE; +#else + b[ 85] = *(a06 + 5); +#endif + b[ 86] = *(a07 + 5); + b[ 87] = *(a08 + 5); + b[ 88] = *(a09 + 5); + b[ 89] = *(a10 + 5); + b[ 90] = *(a11 + 5); + b[ 91] = *(a12 + 5); + b[ 92] = *(a13 + 5); + b[ 93] = *(a14 + 5); + b[ 94] = *(a15 + 5); + b[ 95] = *(a16 + 5); + + b[ 96] = ZERO; + b[ 97] = ZERO; + b[ 98] = ZERO; + b[ 99] = ZERO; + b[100] = ZERO; + b[101] = ZERO; +#ifdef UNIT + b[102] = ONE; +#else + b[102] = *(a07 + 6); +#endif + b[103] = *(a08 + 6); + b[104] = *(a09 + 6); + b[105] = *(a10 + 6); + b[106] = *(a11 + 6); + b[107] = *(a12 + 6); + b[108] = *(a13 + 6); + b[109] = *(a14 + 6); + b[110] = *(a15 + 6); + b[111] = *(a16 + 6); + + b[112] = ZERO; + b[113] = ZERO; + b[114] = ZERO; + b[115] = ZERO; + b[116] = ZERO; + b[117] = ZERO; + b[118] = ZERO; +#ifdef UNIT + b[119] = ONE; +#else + b[119] = *(a08 + 7); +#endif + b[120] = *(a09 + 7); + b[121] = *(a10 + 7); + b[122] = *(a11 + 7); + b[123] = *(a12 + 7); + b[124] = *(a13 + 7); + b[125] = *(a14 + 7); + b[126] = *(a15 + 7); + b[127] = *(a16 + 7); + + b[128] = ZERO; + b[129] = ZERO; + b[130] = ZERO; + b[131] = ZERO; + b[132] = ZERO; + b[133] = ZERO; + b[134] = ZERO; + b[135] = ZERO; +#ifdef UNIT + b[136] = ONE; +#else + b[136] = *(a09 + 8); +#endif + b[137] = *(a10 + 8); + b[138] = *(a11 + 8); + b[139] = *(a12 + 8); + b[140] = *(a13 + 8); + b[141] = *(a14 + 8); + b[142] = *(a15 + 8); + b[143] = *(a16 + 8); + + b[144] = ZERO; + b[145] = ZERO; + b[146] = ZERO; + b[147] = ZERO; + b[148] = ZERO; + b[149] = ZERO; + b[150] = ZERO; + b[151] = ZERO; + b[152] = ZERO; +#ifdef UNIT + b[153] = ONE; +#else + b[153] = *(a10 + 9); +#endif + b[154] = *(a11 + 9); + b[155] = *(a12 + 9); + b[156] = *(a13 + 9); + b[157] = *(a14 + 9); + b[158] = *(a15 + 9); + b[159] = *(a16 + 9); + + b[160] = ZERO; + b[161] = ZERO; + b[162] = ZERO; + b[163] = ZERO; + b[164] = ZERO; + b[165] = ZERO; + b[166] = ZERO; + b[167] = ZERO; + b[168] = ZERO; + b[169] = ZERO; +#ifdef UNIT + b[170] = ONE; +#else + b[170] = *(a11 + 10); +#endif + b[171] = *(a12 + 10); + b[172] = *(a13 + 10); + b[173] = *(a14 + 10); + b[174] = *(a15 + 10); + b[175] = *(a16 + 10); + + b[176] = ZERO; + b[177] = ZERO; + b[178] = ZERO; + b[179] = ZERO; + b[180] = ZERO; + b[181] = ZERO; + b[182] = ZERO; + b[183] = ZERO; + b[184] = ZERO; + b[185] = ZERO; + b[186] = ZERO; +#ifdef UNIT + b[187] = ONE; +#else + b[187] = *(a12 + 11); +#endif + b[188] = *(a13 + 11); + b[189] = *(a14 + 11); + b[190] = *(a15 + 11); + b[191] = *(a16 + 11); + + b[192] = ZERO; + b[193] = ZERO; + b[194] = ZERO; + b[195] = ZERO; + b[196] = ZERO; + b[197] = ZERO; + b[198] = ZERO; + b[199] = ZERO; + b[200] = ZERO; + b[201] = ZERO; + b[202] = ZERO; + b[203] = ZERO; +#ifdef UNIT + b[204] = ONE; +#else + b[204] = *(a13 + 12); +#endif + b[205] = *(a14 + 12); + b[206] = *(a15 + 12); + b[207] = *(a16 + 12); + + b[208] = ZERO; + b[209] = ZERO; + b[210] = ZERO; + b[211] = ZERO; + b[212] = ZERO; + b[213] = ZERO; + b[214] = ZERO; + b[215] = ZERO; + b[216] = ZERO; + b[217] = ZERO; + b[218] = ZERO; + b[219] = ZERO; + b[220] = ZERO; +#ifdef UNIT + b[221] = ONE; +#else + b[221] = *(a14 + 13); +#endif + b[222] = *(a15 + 13); + b[223] = *(a16 + 13); + + b[224] = ZERO; + b[225] = ZERO; + b[226] = ZERO; + b[227] = ZERO; + b[228] = ZERO; + b[229] = ZERO; + b[230] = ZERO; + b[231] = ZERO; + b[232] = ZERO; + b[233] = ZERO; + b[234] = ZERO; + b[235] = ZERO; + b[236] = ZERO; + b[237] = ZERO; +#ifdef UNIT + b[238] = ONE; +#else + b[238] = *(a15 + 14); +#endif + b[239] = *(a16 + 14); + + b[240] = ZERO; + b[241] = ZERO; + b[242] = ZERO; + b[243] = ZERO; + b[244] = ZERO; + b[245] = ZERO; + b[246] = ZERO; + b[247] = ZERO; + b[248] = ZERO; + b[249] = ZERO; + b[250] = ZERO; + b[251] = ZERO; + b[252] = ZERO; + b[253] = ZERO; + b[254] = ZERO; +#ifdef UNIT + b[255] = ONE; +#else + b[255] = *(a16 + 15); +#endif + + a01 += 16 * lda; + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + + b += 256; + } + + X += 16; + i --; + } while (i > 0); + } + + i = (m & 15); + if (i) { + + if (X < posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + b[ 4] = *(a05 + 0); + b[ 5] = *(a06 + 0); + b[ 6] = *(a07 + 0); + b[ 7] = *(a08 + 0); + + b[ 8] = *(a09 + 0); + b[ 9] = *(a10 + 0); + b[ 10] = *(a11 + 0); + b[ 11] = *(a12 + 0); + b[ 12] = *(a13 + 0); + b[ 13] = *(a14 + 0); + b[ 14] = *(a15 + 0); + b[ 15] = *(a16 + 0); + + a01 ++; + a02 ++; + a03 ++; + a04 ++; + a05 ++; + a06 ++; + a07 ++; + a08 ++; + a09 ++; + a10 ++; + a11 ++; + a12 ++; + a13 ++; + a14 ++; + a15 ++; + a16 ++; + b += 16; + } + } else + if (X > posY) { + a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + a08 += i * lda; + a09 += i * lda; + a10 += i * lda; + a11 += i * lda; + a12 += i * lda; + a13 += i * lda; + a14 += i * lda; + a15 += i * lda; + a16 += i * lda; + b += 16 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + b[ 4] = *(a05 + 0); + b[ 5] = *(a06 + 0); + b[ 6] = *(a07 + 0); + b[ 7] = *(a08 + 0); + b[ 8] = *(a09 + 0); + b[ 9] = *(a10 + 0); + b[ 10] = *(a11 + 0); + b[ 11] = *(a12 + 0); + b[ 12] = *(a13 + 0); + b[ 13] = *(a14 + 0); + b[ 14] = *(a15 + 0); + b[ 15] = *(a16 + 0); + b += 16; + + if (i >= 2) { + b[ 0] = ZERO; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(a02 + 1); +#endif + b[ 2] = *(a03 + 1); + b[ 3] = *(a04 + 1); + b[ 4] = *(a05 + 1); + b[ 5] = *(a06 + 1); + b[ 6] = *(a07 + 1); + b[ 7] = *(a08 + 1); + b[ 8] = *(a09 + 1); + b[ 9] = *(a10 + 1); + b[ 10] = *(a11 + 1); + b[ 11] = *(a12 + 1); + b[ 12] = *(a13 + 1); + b[ 13] = *(a14 + 1); + b[ 14] = *(a15 + 1); + b[ 15] = *(a16 + 1); + b += 16; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(a03 + 2); +#endif + b[ 3] = *(a04 + 2); + b[ 4] = *(a05 + 2); + b[ 5] = *(a06 + 2); + b[ 6] = *(a07 + 2); + b[ 7] = *(a08 + 2); + b[ 8] = *(a09 + 2); + b[ 9] = *(a10 + 2); + b[ 10] = *(a11 + 2); + b[ 11] = *(a12 + 2); + b[ 12] = *(a13 + 2); + b[ 13] = *(a14 + 2); + b[ 14] = *(a15 + 2); + b[ 15] = *(a16 + 2); + b += 16; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(a04 + 3); +#endif + b[ 4] = *(a05 + 3); + b[ 5] = *(a06 + 3); + b[ 6] = *(a07 + 3); + b[ 7] = *(a08 + 3); + b[ 8] = *(a09 + 3); + b[ 9] = *(a10 + 3); + b[ 10] = *(a11 + 3); + b[ 11] = *(a12 + 3); + b[ 12] = *(a13 + 3); + b[ 13] = *(a14 + 3); + b[ 14] = *(a15 + 3); + b[ 15] = *(a16 + 3); + b += 16; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = *(a05 + 4); +#endif + b[ 5] = *(a06 + 4); + b[ 6] = *(a07 + 4); + b[ 7] = *(a08 + 4); + b[ 8] = *(a09 + 4); + b[ 9] = *(a10 + 4); + b[ 10] = *(a11 + 4); + b[ 11] = *(a12 + 4); + b[ 12] = *(a13 + 4); + b[ 13] = *(a14 + 4); + b[ 14] = *(a15 + 4); + b[ 15] = *(a16 + 4); + b += 16; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = *(a06 + 5); +#endif + b[ 6] = *(a07 + 5); + b[ 7] = *(a08 + 5); + b[ 8] = *(a09 + 5); + b[ 9] = *(a10 + 5); + b[ 10] = *(a11 + 5); + b[ 11] = *(a12 + 5); + b[ 12] = *(a13 + 5); + b[ 13] = *(a14 + 5); + b[ 14] = *(a15 + 5); + b[ 15] = *(a16 + 5); + b += 16; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; +#else + b[ 6] = *(a07 + 6); +#endif + b[ 7] = *(a08 + 6); + b[ 8] = *(a09 + 6); + b[ 9] = *(a10 + 6); + b[ 10] = *(a11 + 6); + b[ 11] = *(a12 + 6); + b[ 12] = *(a13 + 6); + b[ 13] = *(a14 + 6); + b[ 14] = *(a15 + 6); + b[ 15] = *(a16 + 6); + b += 16; + } + + if (i >= 8) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; +#ifdef UNIT + b[ 7] = ONE; +#else + b[ 7] = *(a08 + 7); +#endif + b[ 8] = *(a09 + 7); + b[ 9] = *(a10 + 7); + b[ 10] = *(a11 + 7); + b[ 11] = *(a12 + 7); + b[ 12] = *(a13 + 7); + b[ 13] = *(a14 + 7); + b[ 14] = *(a15 + 7); + b[ 15] = *(a16 + 7); + b += 16; + } + + if (i >= 9) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; +#ifdef UNIT + b[ 8] = ONE; +#else + b[ 8] = *(a09 + 8); +#endif + b[ 9] = *(a10 + 8); + b[ 10] = *(a11 + 8); + b[ 11] = *(a12 + 8); + b[ 12] = *(a13 + 8); + b[ 13] = *(a14 + 8); + b[ 14] = *(a15 + 8); + b[ 15] = *(a16 + 8); + b += 16; + } + + if (i >= 10) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; +#ifdef UNIT + b[ 9] = ONE; +#else + b[ 9] = *(a10 + 9); +#endif + b[ 10] = *(a11 + 9); + b[ 11] = *(a12 + 9); + b[ 12] = *(a13 + 9); + b[ 13] = *(a14 + 9); + b[ 14] = *(a15 + 9); + b[ 15] = *(a16 + 9); + b += 16; + } + + if (i >= 11) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[ 10] = ONE; +#else + b[ 10] = *(a11 + 10); +#endif + b[ 11] = *(a12 + 10); + b[ 12] = *(a13 + 10); + b[ 13] = *(a14 + 10); + b[ 14] = *(a15 + 10); + b[ 15] = *(a16 + 10); + b += 16; + } + + if (i >= 12) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; +#ifdef UNIT + b[ 11] = ONE; +#else + b[ 11] = *(a12 + 11); +#endif + b[ 12] = *(a13 + 11); + b[ 13] = *(a14 + 11); + b[ 14] = *(a15 + 11); + b[ 15] = *(a16 + 11); + b += 16; + } + + if (i >= 13) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; +#ifdef UNIT + b[ 12] = ONE; +#else + b[ 12] = *(a13 + 12); +#endif + b[ 13] = *(a14 + 12); + b[ 14] = *(a15 + 12); + b[ 15] = *(a16 + 12); + b += 16; + } + + if (i >= 14) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; +#ifdef UNIT + b[ 13] = ONE; +#else + b[ 13] = *(a14 + 13); +#endif + b[ 14] = *(a15 + 13); + b[ 15] = *(a16 + 13); + b += 16; + } + + if (i >= 15) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; +#ifdef UNIT + b[ 14] = ONE; +#else + b[ 14] = *(a15 + 14); +#endif + b[ 15] = *(a16 + 14); + b += 16; + } + } + } + + posY += 16; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 8){ + X = posX; + + if (posX <= posY) { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + a03 = a + posX + (posY + 2) * lda; + a04 = a + posX + (posY + 3) * lda; + a05 = a + posX + (posY + 4) * lda; + a06 = a + posX + (posY + 5) * lda; + a07 = a + posX + (posY + 6) * lda; + a08 = a + posX + (posY + 7) * lda; + } else { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + a03 = a + posY + (posX + 2) * lda; + a04 = a + posY + (posX + 3) * lda; + a05 = a + posY + (posX + 4) * lda; + a06 = a + posY + (posX + 5) * lda; + a07 = a + posY + (posX + 6) * lda; + a08 = a + posY + (posX + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X < posY) { + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + b[ 4] = *(a05 + 0); + b[ 5] = *(a06 + 0); + b[ 6] = *(a07 + 0); + b[ 7] = *(a08 + 0); + + a01 ++; + a02 ++; + a03 ++; + a04 ++; + a05 ++; + a06 ++; + a07 ++; + a08 ++; + b += 8; + } + } else + if (X > posY) { + a01 += 8 * lda; + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + b += 64; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + b[ 4] = *(a05 + 0); + b[ 5] = *(a06 + 0); + b[ 6] = *(a07 + 0); + b[ 7] = *(a08 + 0); + + b[ 8] = ZERO; +#ifdef UNIT + b[ 9] = ONE; +#else + b[ 9] = *(a02 + 1); +#endif + b[ 10] = *(a03 + 1); + b[ 11] = *(a04 + 1); + b[ 12] = *(a05 + 1); + b[ 13] = *(a06 + 1); + b[ 14] = *(a07 + 1); + b[ 15] = *(a08 + 1); + + b[ 16] = ZERO; + b[ 17] = ZERO; +#ifdef UNIT + b[ 18] = ONE; +#else + b[ 18] = *(a03 + 2); +#endif + b[ 19] = *(a04 + 2); + b[ 20] = *(a05 + 2); + b[ 21] = *(a06 + 2); + b[ 22] = *(a07 + 2); + b[ 23] = *(a08 + 2); + + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; +#ifdef UNIT + b[ 27] = ONE; +#else + b[ 27] = *(a04 + 3); +#endif + b[ 28] = *(a05 + 3); + b[ 29] = *(a06 + 3); + b[ 30] = *(a07 + 3); + b[ 31] = *(a08 + 3); + + b[ 32] = ZERO; + b[ 33] = ZERO; + b[ 34] = ZERO; + b[ 35] = ZERO; +#ifdef UNIT + b[ 36] = ONE; +#else + b[ 36] = *(a05 + 4); +#endif + b[ 37] = *(a06 + 4); + b[ 38] = *(a07 + 4); + b[ 39] = *(a08 + 4); + + b[ 40] = ZERO; + b[ 41] = ZERO; + b[ 42] = ZERO; + b[ 43] = ZERO; + b[ 44] = ZERO; +#ifdef UNIT + b[ 45] = ONE; +#else + b[ 45] = *(a06 + 5); +#endif + b[ 46] = *(a07 + 5); + b[ 47] = *(a08 + 5); + + b[ 48] = ZERO; + b[ 49] = ZERO; + b[ 50] = ZERO; + b[ 51] = ZERO; + b[ 52] = ZERO; + b[ 53] = ZERO; +#ifdef UNIT + b[ 54] = ONE; +#else + b[ 54] = *(a07 + 6); +#endif + b[ 55] = *(a08 + 6); + + b[ 56] = ZERO; + b[ 57] = ZERO; + b[ 58] = ZERO; + b[ 59] = ZERO; + b[ 60] = ZERO; + b[ 61] = ZERO; + b[ 62] = ZERO; +#ifdef UNIT + b[ 63] = ONE; +#else + b[ 63] = *(a08 + 7); +#endif + + a01 += 8 * lda; + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + b += 64; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X < posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + b[ 4] = *(a05 + 0); + b[ 5] = *(a06 + 0); + b[ 6] = *(a07 + 0); + b[ 7] = *(a08 + 0); + + a01 ++; + a02 ++; + a03 ++; + a04 ++; + a05 ++; + a06 ++; + a07 ++; + a08 ++; + b += 8; + } + } else + if (X > posY) { + a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + a08 += i * lda; + b += 8 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + b[ 4] = *(a05 + 0); + b[ 5] = *(a06 + 0); + b[ 6] = *(a07 + 0); + b[ 7] = *(a08 + 0); + b += 8; + + if (i >= 2) { + b[ 0] = ZERO; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(a02 + 1); +#endif + b[ 2] = *(a03 + 1); + b[ 3] = *(a04 + 1); + b[ 4] = *(a05 + 1); + b[ 5] = *(a06 + 1); + b[ 6] = *(a07 + 1); + b[ 7] = *(a08 + 1); + b += 8; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(a03 + 2); +#endif + b[ 3] = *(a04 + 2); + b[ 4] = *(a05 + 2); + b[ 5] = *(a06 + 2); + b[ 6] = *(a07 + 2); + b[ 7] = *(a08 + 2); + b += 8; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(a04 + 3); +#endif + b[ 4] = *(a05 + 3); + b[ 5] = *(a06 + 3); + b[ 6] = *(a07 + 3); + b[ 7] = *(a08 + 3); + b += 8; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = *(a05 + 4); +#endif + b[ 5] = *(a06 + 4); + b[ 6] = *(a07 + 4); + b[ 7] = *(a08 + 4); + b += 8; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = *(a06 + 5); +#endif + b[ 6] = *(a07 + 5); + b[ 7] = *(a08 + 5); + b += 8; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; +#else + b[ 6] = *(a07 + 6); +#endif + b[ 7] = *(a08 + 6); + b += 8; + } + } + } + + posY += 8; + } + + if (n & 4){ + X = posX; + + if (posX <= posY) { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + a03 = a + posX + (posY + 2) * lda; + a04 = a + posX + (posY + 3) * lda; + } else { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + a03 = a + posY + (posX + 2) * lda; + a04 = a + posY + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + for (ii = 0; ii < 4; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + + a01 ++; + a02 ++; + a03 ++; + a04 ++; + b += 4; + } + } else + if (X > posY) { + a01 += 4 * lda; + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + b += 16; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + + b[ 4] = ZERO; +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = *(a02 + 1); +#endif + b[ 6] = *(a03 + 1); + b[ 7] = *(a04 + 1); + + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[ 10] = ONE; +#else + b[ 10] = *(a03 + 2); +#endif + b[ 11] = *(a04 + 2); + + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; +#ifdef UNIT + b[ 15] = ONE; +#else + b[ 15] = *(a04 + 3); +#endif + + a01 += 4 * lda; + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + + a01 ++; + a02 ++; + a03 ++; + a04 ++; + b += 4; + } + } else + if (X > posY) { + a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + b += 4 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + b += 4; + + if (i >= 2) { + b[ 0] = ZERO; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(a02 + 1); +#endif + b[ 2] = *(a03 + 1); + b[ 3] = *(a04 + 1); + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(a03 + 2); +#endif + b[ 3] = *(a04 + 2); + b += 4; + } + } + } + + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + } else { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + b[ 2] = *(a01 + 1); + b[ 3] = *(a02 + 1); + + a01 += 2; + a02 += 2; + b += 4; + } else + if (X > posY) { + a01 += 2 * lda; + a02 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a02 + 0); + + b[ 2] = ZERO; +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(a02 + 1); +#endif + + a01 += 2 * lda; + a02 += 2 * lda; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X < posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + + a01 ++; + a02 ++; + b += 2; + } else + if (X > posY) { + a01 += lda; + a02 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a02 + 0); + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + a01 = a + posX + (posY + 0) * lda; + } else { + a01 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + b[ 0] = *(a01 + 0); + a01 += 1; + b += 1; + } else + if (X > posY) { + a01 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b += 1; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/trmm_uncopy_2.c b/kernel/generic/trmm_uncopy_2.c new file mode 100644 index 0000000..1b6d235 --- /dev/null +++ b/kernel/generic/trmm_uncopy_2.c @@ -0,0 +1,195 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2; + + js = (n >> 1); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data03; + b[ 2] = data02; + b[ 3] = data04; + + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X > posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + data03 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = data03; + b[ 2] = ZERO; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data03; + b[ 2] = ZERO; + b[ 3] = data04; +#endif + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X < posY) { + data01 = *(ao1 + 0); + data03 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data03; + + ao1 += 1; + ao2 += 1; + b += 2; + } else + if (X > posY) { + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + data03 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = data03; +#else + data01 = *(ao1 + 0); + data03 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data03; +#endif + ao1 += lda; + b += 2; + } + } + + posY += 2; + js --; + } while (js > 0); + } /* End of main loop */ + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + + if (X < posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += 1; + b += 1; + } else + if (X > posY) { + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + b += 1; + ao1 += lda; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/trmm_uncopy_4.c b/kernel/generic/trmm_uncopy_4.c new file mode 100644 index 0000000..4ff6948 --- /dev/null +++ b/kernel/generic/trmm_uncopy_4.c @@ -0,0 +1,489 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b[ 4] = data02; + b[ 5] = data06; + b[ 6] = data10; + b[ 7] = data14; + + b[ 8] = data03; + b[ 9] = data07; + b[10] = data11; + b[11] = data15; + b[12] = data04; + b[13] = data08; + b[14] = data12; + b[15] = data16; + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } else + if (X > posY) { + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b += 16; + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + + b[ 0] = ONE; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + + b[ 4] = ZERO; + b[ 5] = ONE; + b[ 6] = data10; + b[ 7] = data14; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ONE; + b[11] = data15; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + + b[ 4] = ZERO; + b[ 5] = data06; + b[ 6] = data10; + b[ 7] = data14; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = data11; + b[11] = data15; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = data16; +#endif + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + b[ 0] = data01; + b[ 1] = data03; + b[ 2] = data05; + b[ 3] = data07; + b[ 4] = data02; + b[ 5] = data04; + b[ 6] = data06; + b[ 7] = data08; + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data03 = *(ao2 + 0); + data05 = *(ao3 + 0); + data07 = *(ao4 + 0); + + b[ 0] = data01; + b[ 1] = data03; + b[ 2] = data05; + b[ 3] = data07; + + ao1 += 1; + ao2 += 1; + ao3 += 1; + ao4 += 1; + b += 4; + } + + } else + if (X > posY) { + if (m & 2) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + } + + if (m & 1) { + ao1 += lda; + b += 4; + } + + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data13 = *(ao4 + 0); + + if (i >= 2) { + data10 = *(ao3 + 1); + data14 = *(ao4 + 1); + } + + if (i >= 3) { + data15 = *(ao4 + 2); + } + + b[ 0] = ONE; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ONE; + b[ 2] = data10; + b[ 3] = data14; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ONE; + b[ 3] = data15; + b += 4; + } +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data13 = *(ao4 + 0); + + if (i >= 2) { + data06 = *(ao2 + 1); + data10 = *(ao3 + 1); + data14 = *(ao4 + 1); + } + + if (i >= 3) { + data11 = *(ao3 + 2); + data15 = *(ao4 + 2); + } + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = data06; + b[ 2] = data10; + b[ 3] = data14; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = data11; + b[ 3] = data15; + b += 4; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data02; + b[ 3] = data06; + + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X > posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = data05; + b[ 2] = ZERO; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = ZERO; + b[ 3] = data06; +#endif + + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X < posY) { + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data05; + ao1 += 1; + ao2 += 1; + b += 2; + } else + if (X > posY) { + ao1 += lda; + ao2 += lda; + b += 2; + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + b[ 0] = ONE; + b[ 1] = data05; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data05; +#endif + ao1 += lda; + ao2 += lda; + b += 2; + } + } + + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += 1; + b += 1; + } else + if (X > posY) { + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + ao1 += lda; + b += 1; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/trmm_uncopy_8.c b/kernel/generic/trmm_uncopy_8.c new file mode 100644 index 0000000..4e23ffc --- /dev/null +++ b/kernel/generic/trmm_uncopy_8.c @@ -0,0 +1,1226 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + FLOAT data33, data34, data35, data36, data37, data38, data39, data40; + FLOAT data41, data42, data43, data44, data45, data46, data47, data48; + FLOAT data49, data50, data51, data52, data53, data54, data55, data56; + FLOAT data57, data58, data59, data60, data61, data62, data63, data64; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + js = (n >> 3); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + ao5 = a + posX + (posY + 4) * lda; + ao6 = a + posX + (posY + 5) * lda; + ao7 = a + posX + (posY + 6) * lda; + ao8 = a + posX + (posY + 7) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + ao5 = a + posY + (posX + 4) * lda; + ao6 = a + posY + (posX + 5) * lda; + ao7 = a + posY + (posX + 6) * lda; + ao8 = a + posY + (posX + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + + data33 = *(ao5 + 0); + data34 = *(ao5 + 1); + data35 = *(ao5 + 2); + data36 = *(ao5 + 3); + data37 = *(ao5 + 4); + data38 = *(ao5 + 5); + data39 = *(ao5 + 6); + data40 = *(ao5 + 7); + + data41 = *(ao6 + 0); + data42 = *(ao6 + 1); + data43 = *(ao6 + 2); + data44 = *(ao6 + 3); + data45 = *(ao6 + 4); + data46 = *(ao6 + 5); + data47 = *(ao6 + 6); + data48 = *(ao6 + 7); + + data49 = *(ao7 + 0); + data50 = *(ao7 + 1); + data51 = *(ao7 + 2); + data52 = *(ao7 + 3); + data53 = *(ao7 + 4); + data54 = *(ao7 + 5); + data55 = *(ao7 + 6); + data56 = *(ao7 + 7); + + data57 = *(ao8 + 0); + data58 = *(ao8 + 1); + data59 = *(ao8 + 2); + data60 = *(ao8 + 3); + data61 = *(ao8 + 4); + data62 = *(ao8 + 5); + data63 = *(ao8 + 6); + data64 = *(ao8 + 7); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + b[ 4] = data33; + b[ 5] = data41; + b[ 6] = data49; + b[ 7] = data57; + + b[ 8] = data02; + b[ 9] = data10; + b[10] = data18; + b[11] = data26; + b[12] = data34; + b[13] = data42; + b[14] = data50; + b[15] = data58; + + b[16] = data03; + b[17] = data11; + b[18] = data19; + b[19] = data27; + b[20] = data35; + b[21] = data43; + b[22] = data51; + b[23] = data59; + + b[24] = data04; + b[25] = data12; + b[26] = data20; + b[27] = data28; + b[28] = data36; + b[29] = data44; + b[30] = data52; + b[31] = data60; + + b[32] = data05; + b[33] = data13; + b[34] = data21; + b[35] = data29; + b[36] = data37; + b[37] = data45; + b[38] = data53; + b[39] = data61; + + b[40] = data06; + b[41] = data14; + b[42] = data22; + b[43] = data30; + b[44] = data38; + b[45] = data46; + b[46] = data54; + b[47] = data62; + + b[48] = data07; + b[49] = data15; + b[50] = data23; + b[51] = data31; + b[52] = data39; + b[53] = data47; + b[54] = data55; + b[55] = data63; + + b[56] = data08; + b[57] = data16; + b[58] = data24; + b[59] = data32; + b[60] = data40; + b[61] = data48; + b[62] = data56; + b[63] = data64; + + ao1 += 8; + ao2 += 8; + ao3 += 8; + ao4 += 8; + ao5 += 8; + ao6 += 8; + ao7 += 8; + ao8 += 8; + + b += 64; + + } else + if (X > posY) { + ao1 += 8 * lda; + ao2 += 8 * lda; + ao3 += 8 * lda; + ao4 += 8 * lda; + ao5 += 8 * lda; + ao6 += 8 * lda; + ao7 += 8 * lda; + ao8 += 8 * lda; + + b += 64; + + } else { + +#ifndef UNIT + data01 = *(ao1 + 0); +#endif + + data09 = *(ao2 + 0); +#ifndef UNIT + data10 = *(ao2 + 1); +#endif + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); +#ifndef UNIT + data19 = *(ao3 + 2); +#endif + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); +#ifndef UNIT + data28 = *(ao4 + 3); +#endif + + data33 = *(ao5 + 0); + data34 = *(ao5 + 1); + data35 = *(ao5 + 2); + data36 = *(ao5 + 3); +#ifndef UNIT + data37 = *(ao5 + 4); +#endif + + data41 = *(ao6 + 0); + data42 = *(ao6 + 1); + data43 = *(ao6 + 2); + data44 = *(ao6 + 3); + data45 = *(ao6 + 4); +#ifndef UNIT + data46 = *(ao6 + 5); +#endif + + data49 = *(ao7 + 0); + data50 = *(ao7 + 1); + data51 = *(ao7 + 2); + data52 = *(ao7 + 3); + data53 = *(ao7 + 4); + data54 = *(ao7 + 5); +#ifndef UNIT + data55 = *(ao7 + 6); +#endif + + data57 = *(ao8 + 0); + data58 = *(ao8 + 1); + data59 = *(ao8 + 2); + data60 = *(ao8 + 3); + data61 = *(ao8 + 4); + data62 = *(ao8 + 5); + data63 = *(ao8 + 6); +#ifndef UNIT + data64 = *(ao8 + 7); +#endif + + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = data01; +#endif + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + b[ 4] = data33; + b[ 5] = data41; + b[ 6] = data49; + b[ 7] = data57; + + b[ 8] = ZERO; +#ifdef UNIT + b[ 9] = ONE; +#else + b[ 9] = data10; +#endif + b[10] = data18; + b[11] = data26; + b[12] = data34; + b[13] = data42; + b[14] = data50; + b[15] = data58; + + b[16] = ZERO; + b[17] = ZERO; +#ifdef UNIT + b[18] = ONE; +#else + b[18] = data19; +#endif + b[19] = data27; + b[20] = data35; + b[21] = data43; + b[22] = data51; + b[23] = data59; + + b[24] = ZERO; + b[25] = ZERO; + b[26] = ZERO; +#ifdef UNIT + b[27] = ONE; +#else + b[27] = data28; +#endif + b[28] = data36; + b[29] = data44; + b[30] = data52; + b[31] = data60; + + b[32] = ZERO; + b[33] = ZERO; + b[34] = ZERO; + b[35] = ZERO; +#ifdef UNIT + b[36] = ONE; +#else + b[36] = data37; +#endif + b[37] = data45; + b[38] = data53; + b[39] = data61; + + b[40] = ZERO; + b[41] = ZERO; + b[42] = ZERO; + b[43] = ZERO; + b[44] = ZERO; +#ifdef UNIT + b[45] = ONE; +#else + b[45] = data46; +#endif + b[46] = data54; + b[47] = data62; + + b[48] = ZERO; + b[49] = ZERO; + b[50] = ZERO; + b[51] = ZERO; + b[52] = ZERO; + b[53] = ZERO; +#ifdef UNIT + b[54] = ONE; +#else + b[54] = data55; +#endif + b[55] = data63; + + b[56] = ZERO; + b[57] = ZERO; + b[58] = ZERO; + b[59] = ZERO; + b[60] = ZERO; + b[61] = ZERO; + b[62] = ZERO; +#ifdef UNIT + b[63] = ONE; +#else + b[63] = data64; +#endif + + ao1 += 8 * lda; + ao2 += 8 * lda; + ao3 += 8 * lda; + ao4 += 8 * lda; + ao5 += 8 * lda; + ao6 += 8 * lda; + ao7 += 8 * lda; + ao8 += 8 * lda; + + b += 64; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X < posY) { + + if (m & 4) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + + data33 = *(ao5 + 0); + data34 = *(ao5 + 1); + data35 = *(ao5 + 2); + data36 = *(ao5 + 3); + + data41 = *(ao6 + 0); + data42 = *(ao6 + 1); + data43 = *(ao6 + 2); + data44 = *(ao6 + 3); + + data49 = *(ao7 + 0); + data50 = *(ao7 + 1); + data51 = *(ao7 + 2); + data52 = *(ao7 + 3); + + data57 = *(ao8 + 0); + data58 = *(ao8 + 1); + data59 = *(ao8 + 2); + data60 = *(ao8 + 3); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + b[ 4] = data33; + b[ 5] = data41; + b[ 6] = data49; + b[ 7] = data57; + + b[ 8] = data02; + b[ 9] = data10; + b[10] = data18; + b[11] = data26; + b[12] = data34; + b[13] = data42; + b[14] = data50; + b[15] = data58; + + b[16] = data03; + b[17] = data11; + b[18] = data19; + b[19] = data27; + b[20] = data35; + b[21] = data43; + b[22] = data51; + b[23] = data59; + + b[24] = data04; + b[25] = data12; + b[26] = data20; + b[27] = data28; + b[28] = data36; + b[29] = data44; + b[30] = data52; + b[31] = data60; + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + ao5 += 4; + ao6 += 4; + ao7 += 4; + ao8 += 4; + + b += 32; + } + + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + + data33 = *(ao5 + 0); + data34 = *(ao5 + 1); + + data41 = *(ao6 + 0); + data42 = *(ao6 + 1); + + data49 = *(ao7 + 0); + data50 = *(ao7 + 1); + + data57 = *(ao8 + 0); + data58 = *(ao8 + 1); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + b[ 4] = data33; + b[ 5] = data41; + b[ 6] = data49; + b[ 7] = data57; + + b[ 8] = data02; + b[ 9] = data10; + b[10] = data18; + b[11] = data26; + b[12] = data34; + b[13] = data42; + b[14] = data50; + b[15] = data58; + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + ao5 += 2; + ao6 += 2; + ao7 += 2; + ao8 += 2; + + b += 16; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data09 = *(ao2 + 0); + data17 = *(ao3 + 0); + data25 = *(ao4 + 0); + data33 = *(ao5 + 0); + data41 = *(ao6 + 0); + data49 = *(ao7 + 0); + data57 = *(ao8 + 0); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + b[ 4] = data33; + b[ 5] = data41; + b[ 6] = data49; + b[ 7] = data57; + + b += 8; + } + } else + if (X > posY) { + if (m & 4) { + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b += 32; + } + + if (m & 2) { + ao1 += 2 * lda; + b += 16; + } + + if (m & 1) { + b += 8; + } + } else { + +#ifndef UNIT + data01 = *(ao1 + 0); +#endif + data09 = *(ao2 + 0); + data17 = *(ao3 + 0); + data25 = *(ao4 + 0); + data33 = *(ao5 + 0); + data41 = *(ao6 + 0); + data49 = *(ao7 + 0); + data57 = *(ao8 + 0); + + if (i >= 2) { +#ifndef UNIT + data10 = *(ao2 + 1); +#endif + data18 = *(ao3 + 1); + data26 = *(ao4 + 1); + data34 = *(ao5 + 1); + data42 = *(ao6 + 1); + data50 = *(ao7 + 1); + data58 = *(ao8 + 1); + } + + if (i >= 3) { +#ifndef UNIT + data19 = *(ao3 + 2); +#endif + data27 = *(ao4 + 2); + data35 = *(ao5 + 2); + data43 = *(ao6 + 2); + data51 = *(ao7 + 2); + data59 = *(ao8 + 2); + } + + if (i >= 4) { +#ifndef UNIT + data28 = *(ao4 + 3); +#endif + data36 = *(ao5 + 3); + data44 = *(ao6 + 3); + data52 = *(ao7 + 3); + data60 = *(ao8 + 3); + } + + if (i >= 5) { +#ifndef UNIT + data37 = *(ao5 + 4); +#endif + data45 = *(ao6 + 4); + data53 = *(ao7 + 4); + data61 = *(ao8 + 4); + } + + if (i >= 6) { +#ifndef UNIT + data46 = *(ao6 + 5); +#endif + data54 = *(ao7 + 5); + data62 = *(ao8 + 5); + } + + if (i >= 7) { +#ifndef UNIT + data55 = *(ao7 + 6); +#endif + data63 = *(ao8 + 6); + } + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = data01; +#endif + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + b[ 4] = data33; + b[ 5] = data41; + b[ 6] = data49; + b[ 7] = data57; + b += 8; + + if(i >= 2) { + b[ 0] = ZERO; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = data10; +#endif + b[ 2] = data18; + b[ 3] = data26; + b[ 4] = data34; + b[ 5] = data42; + b[ 6] = data50; + b[ 7] = data58; + b += 8; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = data19; +#endif + b[ 3] = data27; + b[ 4] = data35; + b[ 5] = data43; + b[ 6] = data51; + b[ 7] = data59; + b += 8; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = data28; +#endif + b[ 4] = data36; + b[ 5] = data44; + b[ 6] = data52; + b[ 7] = data60; + b += 8; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = data37; +#endif + b[ 5] = data45; + b[ 6] = data53; + b[ 7] = data61; + b += 8; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = data46; +#endif + b[ 6] = data54; + b[ 7] = data62; + b += 8; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; +#else + b[ 6] = data55; +#endif + b[ 7] = data63; + b += 8; + } + } + } + + posY += 8; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + + b[ 4] = data02; + b[ 5] = data10; + b[ 6] = data18; + b[ 7] = data26; + + b[ 8] = data03; + b[ 9] = data11; + b[10] = data19; + b[11] = data27; + + b[12] = data04; + b[13] = data12; + b[14] = data20; + b[15] = data28; + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + + b += 16; + + } else + if (X > posY) { + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 16; + + } else { + +#ifdef UNIT + data09 = *(ao2 + 0); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + + b[ 0] = ONE; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + + b[ 4] = ZERO; + b[ 5] = ONE; + b[ 6] = data18; + b[ 7] = data26; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ONE; + b[11] = data27; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + + b[ 4] = ZERO; + b[ 5] = data10; + b[ 6] = data18; + b[ 7] = data26; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = data19; + b[11] = data27; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = data28; +#endif + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + + b[ 4] = data02; + b[ 5] = data10; + b[ 6] = data18; + b[ 7] = data26; + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + + b += 8; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data09 = *(ao2 + 0); + data17 = *(ao3 + 0); + data25 = *(ao4 + 0); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + + b += 4; + } + } else + if (X > posY) { + if (m & 2) { + ao1 += 2 * lda; + b += 8; + } + + if (m & 1) { + b += 4; + } + } else { + +#ifndef UNIT + data01 = *(ao1 + 0); +#endif + data09 = *(ao2 + 0); + data17 = *(ao3 + 0); + data25 = *(ao4 + 0); + + if (i >= 2) { +#ifndef UNIT + data10 = *(ao2 + 1); +#endif + data18 = *(ao3 + 1); + data26 = *(ao4 + 1); + } + + if (i >= 3) { +#ifndef UNIT + data19 = *(ao3 + 2); +#endif + data27 = *(ao4 + 2); + } + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = data01; +#endif + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = data10; +#endif + b[ 2] = data18; + b[ 3] = data26; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = data19; +#endif + b[ 3] = data27; + b += 4; + } + } + } + + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data02; + b[ 3] = data10; + + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X > posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + + } else { + +#ifdef UNIT + data09 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = data09; + b[ 2] = ZERO; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = ZERO; + b[ 3] = data10; +#endif + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X < posY) { + data01 = *(ao1 + 0); + data09 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data09; + b += 2; + } else + if (X > posY) { + b += 2; + } else { +#ifdef UNIT + data09 = *(ao2 + 0); + b[ 0] = ONE; + b[ 1] = data09; +#else + data01 = *(ao1 + 0); + data09 = *(ao2 + 0); + b[ 0] = data01; + b[ 1] = data09; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += 1; + b += 1; + } else + if (X > posY) { + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + ao1 += lda; + b += 1; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/trmm_utcopy_1.c b/kernel/generic/trmm_utcopy_1.c new file mode 100644 index 0000000..92f2da3 --- /dev/null +++ b/kernel/generic/trmm_utcopy_1.c @@ -0,0 +1,90 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, X; + + FLOAT data01; + FLOAT *ao1; + + while (n > 0) { + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + b += 1; + ao1 += 1; + } else + if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + b += 1; + ao1 += lda; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + b += 1; + ao1 += lda; + } + + X += 1; + i --; + } while (i > 0); + } + posY ++; + n --; + } + + return 0; +} diff --git a/kernel/generic/trmm_utcopy_16.c b/kernel/generic/trmm_utcopy_16.c new file mode 100644 index 0000000..a964cd3 --- /dev/null +++ b/kernel/generic/trmm_utcopy_16.c @@ -0,0 +1,1550 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, ii; + BLASLONG X; + + FLOAT *a01, *a02, *a03 ,*a04, *a05, *a06, *a07, *a08; + FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + js = (n >> 4); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + a03 = a + posX + (posY + 2) * lda; + a04 = a + posX + (posY + 3) * lda; + a05 = a + posX + (posY + 4) * lda; + a06 = a + posX + (posY + 5) * lda; + a07 = a + posX + (posY + 6) * lda; + a08 = a + posX + (posY + 7) * lda; + a09 = a + posX + (posY + 8) * lda; + a10 = a + posX + (posY + 9) * lda; + a11 = a + posX + (posY + 10) * lda; + a12 = a + posX + (posY + 11) * lda; + a13 = a + posX + (posY + 12) * lda; + a14 = a + posX + (posY + 13) * lda; + a15 = a + posX + (posY + 14) * lda; + a16 = a + posX + (posY + 15) * lda; + } else { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + a03 = a + posY + (posX + 2) * lda; + a04 = a + posY + (posX + 3) * lda; + a05 = a + posY + (posX + 4) * lda; + a06 = a + posY + (posX + 5) * lda; + a07 = a + posY + (posX + 6) * lda; + a08 = a + posY + (posX + 7) * lda; + a09 = a + posY + (posX + 8) * lda; + a10 = a + posY + (posX + 9) * lda; + a11 = a + posY + (posX + 10) * lda; + a12 = a + posY + (posX + 11) * lda; + a13 = a + posY + (posX + 12) * lda; + a14 = a + posY + (posX + 13) * lda; + a15 = a + posY + (posX + 14) * lda; + a16 = a + posY + (posX + 15) * lda; + } + + i = (m >> 4); + if (i > 0) { + do { + if (X < posY) { + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + a09 += 16; + a10 += 16; + a11 += 16; + a12 += 16; + a13 += 16; + a14 += 16; + a15 += 16; + a16 += 16; + b += 256; + } else + if (X > posY) { + + for (ii = 0; ii < 16; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + b += 16; + } + + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(a02 + 0); +#ifdef UNIT + b[ 17] = ONE; +#else + b[ 17] = *(a02 + 1); +#endif + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + + b[ 32] = *(a03 + 0); + b[ 33] = *(a03 + 1); +#ifdef UNIT + b[ 34] = ONE; +#else + b[ 34] = *(a03 + 2); +#endif + b[ 35] = ZERO; + b[ 36] = ZERO; + b[ 37] = ZERO; + b[ 38] = ZERO; + b[ 39] = ZERO; + b[ 40] = ZERO; + b[ 41] = ZERO; + b[ 42] = ZERO; + b[ 43] = ZERO; + b[ 44] = ZERO; + b[ 45] = ZERO; + b[ 46] = ZERO; + b[ 47] = ZERO; + + b[ 48] = *(a04 + 0); + b[ 49] = *(a04 + 1); + b[ 50] = *(a04 + 2); +#ifdef UNIT + b[ 51] = ONE; +#else + b[ 51] = *(a04 + 3); +#endif + b[ 52] = ZERO; + b[ 53] = ZERO; + b[ 54] = ZERO; + b[ 55] = ZERO; + b[ 56] = ZERO; + b[ 57] = ZERO; + b[ 58] = ZERO; + b[ 59] = ZERO; + b[ 60] = ZERO; + b[ 61] = ZERO; + b[ 62] = ZERO; + b[ 63] = ZERO; + + b[ 64] = *(a05 + 0); + b[ 65] = *(a05 + 1); + b[ 66] = *(a05 + 2); + b[ 67] = *(a05 + 3); +#ifdef UNIT + b[ 68] = ONE; +#else + b[ 68] = *(a05 + 4); +#endif + b[ 69] = ZERO; + b[ 70] = ZERO; + b[ 71] = ZERO; + b[ 72] = ZERO; + b[ 73] = ZERO; + b[ 74] = ZERO; + b[ 75] = ZERO; + b[ 76] = ZERO; + b[ 77] = ZERO; + b[ 78] = ZERO; + b[ 79] = ZERO; + + b[ 80] = *(a06 + 0); + b[ 81] = *(a06 + 1); + b[ 82] = *(a06 + 2); + b[ 83] = *(a06 + 3); + b[ 84] = *(a06 + 4); +#ifdef UNIT + b[ 85] = ONE; +#else + b[ 85] = *(a06 + 5); +#endif + b[ 86] = ZERO; + b[ 87] = ZERO; + b[ 88] = ZERO; + b[ 89] = ZERO; + b[ 90] = ZERO; + b[ 91] = ZERO; + b[ 92] = ZERO; + b[ 93] = ZERO; + b[ 94] = ZERO; + b[ 95] = ZERO; + + b[ 96] = *(a07 + 0); + b[ 97] = *(a07 + 1); + b[ 98] = *(a07 + 2); + b[ 99] = *(a07 + 3); + b[100] = *(a07 + 4); + b[101] = *(a07 + 5); +#ifdef UNIT + b[102] = ONE; +#else + b[102] = *(a07 + 6); +#endif + b[103] = ZERO; + b[104] = ZERO; + b[105] = ZERO; + b[106] = ZERO; + b[107] = ZERO; + b[108] = ZERO; + b[109] = ZERO; + b[110] = ZERO; + b[111] = ZERO; + + b[112] = *(a08 + 0); + b[113] = *(a08 + 1); + b[114] = *(a08 + 2); + b[115] = *(a08 + 3); + b[116] = *(a08 + 4); + b[117] = *(a08 + 5); + b[118] = *(a08 + 6); +#ifdef UNIT + b[119] = ONE; +#else + b[119] = *(a08 + 7); +#endif + b[120] = ZERO; + b[121] = ZERO; + b[122] = ZERO; + b[123] = ZERO; + b[124] = ZERO; + b[125] = ZERO; + b[126] = ZERO; + b[127] = ZERO; + + b[128] = *(a09 + 0); + b[129] = *(a09 + 1); + b[130] = *(a09 + 2); + b[131] = *(a09 + 3); + b[132] = *(a09 + 4); + b[133] = *(a09 + 5); + b[134] = *(a09 + 6); + b[135] = *(a09 + 7); +#ifdef UNIT + b[136] = ONE; +#else + b[136] = *(a09 + 8); +#endif + b[137] = ZERO; + b[138] = ZERO; + b[139] = ZERO; + b[140] = ZERO; + b[141] = ZERO; + b[142] = ZERO; + b[143] = ZERO; + + b[144] = *(a10 + 0); + b[145] = *(a10 + 1); + b[146] = *(a10 + 2); + b[147] = *(a10 + 3); + b[148] = *(a10 + 4); + b[149] = *(a10 + 5); + b[150] = *(a10 + 6); + b[151] = *(a10 + 7); + b[152] = *(a10 + 8); +#ifdef UNIT + b[153] = ONE; +#else + b[153] = *(a10 + 9); +#endif + b[154] = ZERO; + b[155] = ZERO; + b[156] = ZERO; + b[157] = ZERO; + b[158] = ZERO; + b[159] = ZERO; + + b[160] = *(a11 + 0); + b[161] = *(a11 + 1); + b[162] = *(a11 + 2); + b[163] = *(a11 + 3); + b[164] = *(a11 + 4); + b[165] = *(a11 + 5); + b[166] = *(a11 + 6); + b[167] = *(a11 + 7); + b[168] = *(a11 + 8); + b[169] = *(a11 + 9); +#ifdef UNIT + b[170] = ONE; +#else + b[170] = *(a11 + 10); +#endif + b[171] = ZERO; + b[172] = ZERO; + b[173] = ZERO; + b[174] = ZERO; + b[175] = ZERO; + + b[176] = *(a12 + 0); + b[177] = *(a12 + 1); + b[178] = *(a12 + 2); + b[179] = *(a12 + 3); + b[180] = *(a12 + 4); + b[181] = *(a12 + 5); + b[182] = *(a12 + 6); + b[183] = *(a12 + 7); + b[184] = *(a12 + 8); + b[185] = *(a12 + 9); + b[186] = *(a12 + 10); +#ifdef UNIT + b[187] = ONE; +#else + b[187] = *(a12 + 11); +#endif + b[188] = ZERO; + b[189] = ZERO; + b[190] = ZERO; + b[191] = ZERO; + + b[192] = *(a13 + 0); + b[193] = *(a13 + 1); + b[194] = *(a13 + 2); + b[195] = *(a13 + 3); + b[196] = *(a13 + 4); + b[197] = *(a13 + 5); + b[198] = *(a13 + 6); + b[199] = *(a13 + 7); + b[200] = *(a13 + 8); + b[201] = *(a13 + 9); + b[202] = *(a13 + 10); + b[203] = *(a13 + 11); +#ifdef UNIT + b[204] = ONE; +#else + b[204] = *(a13 + 12); +#endif + b[205] = ZERO; + b[206] = ZERO; + b[207] = ZERO; + + b[208] = *(a14 + 0); + b[209] = *(a14 + 1); + b[210] = *(a14 + 2); + b[211] = *(a14 + 3); + b[212] = *(a14 + 4); + b[213] = *(a14 + 5); + b[214] = *(a14 + 6); + b[215] = *(a14 + 7); + b[216] = *(a14 + 8); + b[217] = *(a14 + 9); + b[218] = *(a14 + 10); + b[219] = *(a14 + 11); + b[220] = *(a14 + 12); +#ifdef UNIT + b[221] = ONE; +#else + b[221] = *(a14 + 13); +#endif + b[222] = ZERO; + b[223] = ZERO; + + b[224] = *(a15 + 0); + b[225] = *(a15 + 1); + b[226] = *(a15 + 2); + b[227] = *(a15 + 3); + b[228] = *(a15 + 4); + b[229] = *(a15 + 5); + b[230] = *(a15 + 6); + b[231] = *(a15 + 7); + b[232] = *(a15 + 8); + b[233] = *(a15 + 9); + b[234] = *(a15 + 10); + b[235] = *(a15 + 11); + b[236] = *(a15 + 12); + b[237] = *(a15 + 13); +#ifdef UNIT + b[238] = ONE; +#else + b[238] = *(a15 + 14); +#endif + b[239] = ZERO; + + b[240] = *(a16 + 0); + b[241] = *(a16 + 1); + b[242] = *(a16 + 2); + b[243] = *(a16 + 3); + b[244] = *(a16 + 4); + b[245] = *(a16 + 5); + b[246] = *(a16 + 6); + b[247] = *(a16 + 7); + b[248] = *(a16 + 8); + b[249] = *(a16 + 9); + b[250] = *(a16 + 10); + b[251] = *(a16 + 11); + b[252] = *(a16 + 12); + b[253] = *(a16 + 13); + b[254] = *(a16 + 14); +#ifdef UNIT + b[255] = ONE; +#else + b[255] = *(a16 + 15); +#endif + + a01 += 16 * lda; + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + + b += 256; + } + + X += 16; + i --; + } while (i > 0); + } + + i = (m & 15); + if (i > 0) { + if (X < posY) { + a01 += i; + a02 += i; + a03 += i; + a04 += i; + a05 += i; + a06 += i; + a07 += i; + a08 += i; + a09 += i; + a10 += i; + a11 += i; + a12 += i; + a13 += i; + a14 += i; + a15 += i; + a16 += i; + b += 16 * i; + } else + if (X > posY) { + + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + a05 += lda; + a06 += lda; + a07 += lda; + a08 += lda; + a09 += lda; + a10 += lda; + a11 += lda; + a12 += lda; + a13 += lda; + a14 += lda; + a15 += lda; + a16 += lda; + b += 16; + } + + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + + if (i >= 2) { + b[ 0] = *(a02 + 0); +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(a02 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 3) { + b[ 0] = *(a03 + 0); + b[ 1] = *(a03 + 1); +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(a03 + 2); +#endif + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 4) { + b[ 0] = *(a04 + 0); + b[ 1] = *(a04 + 1); + b[ 2] = *(a04 + 2); +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(a04 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b[16] = ZERO; + b += 16; + } + + + if (i >= 5) { + b[ 0] = *(a05 + 0); + b[ 1] = *(a05 + 1); + b[ 2] = *(a05 + 2); + b[ 3] = *(a05 + 3); +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = *(a05 + 4); +#endif + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 6) { + b[ 0] = *(a06 + 0); + b[ 1] = *(a06 + 1); + b[ 2] = *(a06 + 2); + b[ 3] = *(a06 + 3); + b[ 4] = *(a06 + 4); +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = *(a06 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 7) { + b[ 0] = *(a07 + 0); + b[ 1] = *(a07 + 1); + b[ 2] = *(a07 + 2); + b[ 3] = *(a07 + 3); + b[ 4] = *(a07 + 4); + b[ 5] = *(a07 + 5); +#ifdef UNIT + b[ 6] = ONE; +#else + b[ 6] = *(a07 + 6); +#endif + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 8) { + b[ 0] = *(a08 + 0); + b[ 1] = *(a08 + 1); + b[ 2] = *(a08 + 2); + b[ 3] = *(a08 + 3); + b[ 4] = *(a08 + 4); + b[ 5] = *(a08 + 5); + b[ 6] = *(a08 + 6); +#ifdef UNIT + b[ 7] = ONE; +#else + b[ 7] = *(a08 + 7); +#endif + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b[16] = ZERO; + b += 16; + } + + if (i >= 9) { + b[ 0] = *(a09 + 0); + b[ 1] = *(a09 + 1); + b[ 2] = *(a09 + 2); + b[ 3] = *(a09 + 3); + b[ 4] = *(a09 + 4); + b[ 5] = *(a09 + 5); + b[ 6] = *(a09 + 6); + b[ 7] = *(a09 + 7); +#ifdef UNIT + b[ 8] = ONE; +#else + b[ 8] = *(a09 + 8); +#endif + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 10) { + b[ 0] = *(a10 + 0); + b[ 1] = *(a10 + 1); + b[ 2] = *(a10 + 2); + b[ 3] = *(a10 + 3); + b[ 4] = *(a10 + 4); + b[ 5] = *(a10 + 5); + b[ 6] = *(a10 + 6); + b[ 7] = *(a10 + 7); + b[ 8] = *(a10 + 8); +#ifdef UNIT + b[ 9] = ONE; +#else + b[ 9] = *(a10 + 9); +#endif + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 11) { + b[ 0] = *(a11 + 0); + b[ 1] = *(a11 + 1); + b[ 2] = *(a11 + 2); + b[ 3] = *(a11 + 3); + b[ 4] = *(a11 + 4); + b[ 5] = *(a11 + 5); + b[ 6] = *(a11 + 6); + b[ 7] = *(a11 + 7); + b[ 8] = *(a11 + 8); + b[ 9] = *(a11 + 9); +#ifdef UNIT + b[10] = ONE; +#else + b[10] = *(a11 + 10); +#endif + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 12) { + b[ 0] = *(a12 + 0); + b[ 1] = *(a12 + 1); + b[ 2] = *(a12 + 2); + b[ 3] = *(a12 + 3); + b[ 4] = *(a12 + 4); + b[ 5] = *(a12 + 5); + b[ 6] = *(a12 + 6); + b[ 7] = *(a12 + 7); + b[ 8] = *(a12 + 8); + b[ 9] = *(a12 + 9); + b[10] = *(a12 + 10); +#ifdef UNIT + b[11] = ONE; +#else + b[11] = *(a12 + 11); +#endif + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 13) { + b[ 0] = *(a13 + 0); + b[ 1] = *(a13 + 1); + b[ 2] = *(a13 + 2); + b[ 3] = *(a13 + 3); + b[ 4] = *(a13 + 4); + b[ 5] = *(a13 + 5); + b[ 6] = *(a13 + 6); + b[ 7] = *(a13 + 7); + b[ 8] = *(a13 + 8); + b[ 9] = *(a13 + 9); + b[10] = *(a13 + 10); + b[11] = *(a13 + 11); +#ifdef UNIT + b[12] = ONE; +#else + b[12] = *(a13 + 12); +#endif + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 14) { + b[ 0] = *(a14 + 0); + b[ 1] = *(a14 + 1); + b[ 2] = *(a14 + 2); + b[ 3] = *(a14 + 3); + b[ 4] = *(a14 + 4); + b[ 5] = *(a14 + 5); + b[ 6] = *(a14 + 6); + b[ 7] = *(a14 + 7); + b[ 8] = *(a14 + 8); + b[ 9] = *(a14 + 9); + b[10] = *(a14 + 10); + b[11] = *(a14 + 11); + b[12] = *(a14 + 12); +#ifdef UNIT + b[13] = ONE; +#else + b[13] = *(a14 + 13); +#endif + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 15) { + b[ 0] = *(a15 + 0); + b[ 1] = *(a15 + 1); + b[ 2] = *(a15 + 2); + b[ 3] = *(a15 + 3); + b[ 4] = *(a15 + 4); + b[ 5] = *(a15 + 5); + b[ 6] = *(a15 + 6); + b[ 7] = *(a15 + 7); + b[ 8] = *(a15 + 8); + b[ 9] = *(a15 + 9); + b[10] = *(a15 + 10); + b[11] = *(a15 + 11); + b[12] = *(a15 + 12); + b[13] = *(a15 + 13); +#ifdef UNIT + b[14] = ONE; +#else + b[14] = *(a15 + 14); +#endif + b[15] = ZERO; + } + } + } + posY += 16; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 8){ + X = posX; + + if (posX <= posY) { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + a03 = a + posX + (posY + 2) * lda; + a04 = a + posX + (posY + 3) * lda; + a05 = a + posX + (posY + 4) * lda; + a06 = a + posX + (posY + 5) * lda; + a07 = a + posX + (posY + 6) * lda; + a08 = a + posX + (posY + 7) * lda; + } else { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + a03 = a + posY + (posX + 2) * lda; + a04 = a + posY + (posX + 3) * lda; + a05 = a + posY + (posX + 4) * lda; + a06 = a + posY + (posX + 5) * lda; + a07 = a + posY + (posX + 6) * lda; + a08 = a + posY + (posX + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X < posY) { + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + a05 += 8; + a06 += 8; + a07 += 8; + a08 += 8; + b += 64; + } else + if (X > posY) { + + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + b += 8; + } + + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = *(a02 + 0); +#ifdef UNIT + b[ 9] = ONE; +#else + b[ 9] = *(a02 + 1); +#endif + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(a03 + 0); + b[ 17] = *(a03 + 1); +#ifdef UNIT + b[ 18] = ONE; +#else + b[ 18] = *(a03 + 2); +#endif + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + + b[ 24] = *(a04 + 0); + b[ 25] = *(a04 + 1); + b[ 26] = *(a04 + 2); +#ifdef UNIT + b[ 27] = ONE; +#else + b[ 27] = *(a04 + 3); +#endif + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + + b[ 32] = *(a05 + 0); + b[ 33] = *(a05 + 1); + b[ 34] = *(a05 + 2); + b[ 35] = *(a05 + 3); +#ifdef UNIT + b[ 36] = ONE; +#else + b[ 36] = *(a05 + 4); +#endif + b[ 37] = ZERO; + b[ 38] = ZERO; + b[ 39] = ZERO; + + b[ 40] = *(a06 + 0); + b[ 41] = *(a06 + 1); + b[ 42] = *(a06 + 2); + b[ 43] = *(a06 + 3); + b[ 44] = *(a06 + 4); +#ifdef UNIT + b[ 45] = ONE; +#else + b[ 45] = *(a06 + 5); +#endif + b[ 46] = ZERO; + b[ 47] = ZERO; + + b[ 48] = *(a07 + 0); + b[ 49] = *(a07 + 1); + b[ 50] = *(a07 + 2); + b[ 51] = *(a07 + 3); + b[ 52] = *(a07 + 4); + b[ 53] = *(a07 + 5); +#ifdef UNIT + b[ 54] = ONE; +#else + b[ 54] = *(a07 + 6); +#endif + b[ 55] = ZERO; + + b[ 56] = *(a08 + 0); + b[ 57] = *(a08 + 1); + b[ 58] = *(a08 + 2); + b[ 59] = *(a08 + 3); + b[ 60] = *(a08 + 4); + b[ 61] = *(a08 + 5); + b[ 62] = *(a08 + 6); +#ifdef UNIT + b[ 63] = ONE; +#else + b[ 63] = *(a08 + 7); +#endif + + a01 += 8 * lda; + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + + b += 64; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i > 0) { + if (X < posY) { + a01 += i; + a02 += i; + a03 += i; + a04 += i; + a05 += i; + a06 += i; + a07 += i; + a08 += i; + b += 8 * i; + } else + if (X > posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + b += 8; + } + + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + a08 += i * lda; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + + if (i >= 2) { + b[ 0] = *(a02 + 0); +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(a02 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 3) { + b[ 0] = *(a03 + 0); + b[ 1] = *(a03 + 1); +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(a03 + 2); +#endif + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 4) { + b[ 0] = *(a04 + 0); + b[ 1] = *(a04 + 1); + b[ 2] = *(a04 + 2); +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(a04 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 5) { + b[ 0] = *(a05 + 0); + b[ 1] = *(a05 + 1); + b[ 2] = *(a05 + 2); + b[ 3] = *(a05 + 3); +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = *(a05 + 4); +#endif + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 6) { + b[ 0] = *(a06 + 0); + b[ 1] = *(a06 + 1); + b[ 2] = *(a06 + 2); + b[ 3] = *(a06 + 3); + b[ 4] = *(a06 + 4); +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = *(a06 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 7) { + b[ 0] = *(a07 + 0); + b[ 1] = *(a07 + 1); + b[ 2] = *(a07 + 2); + b[ 3] = *(a07 + 3); + b[ 4] = *(a07 + 4); + b[ 5] = *(a07 + 5); +#ifdef UNIT + b[ 6] = ONE; +#else + b[ 6] = *(a07 + 6); +#endif + b[ 7] = ZERO; + b += 8; + } + } + } + posY += 8; + } + + if (n & 4){ + X = posX; + + if (posX <= posY) { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + a03 = a + posX + (posY + 2) * lda; + a04 = a + posX + (posY + 3) * lda; + } else { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + a03 = a + posY + (posX + 2) * lda; + a04 = a + posY + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + a01 += 4; + a02 += 4; + a03 += 4; + a04 += 4; + b += 16; + } else + if (X > posY) { + + for (ii = 0; ii < 4; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + a01 += lda; + b += 4; + } + + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = *(a02 + 0); +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = *(a02 + 1); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = *(a03 + 0); + b[ 9] = *(a03 + 1); +#ifdef UNIT + b[ 10] = ONE; +#else + b[ 10] = *(a03 + 2); +#endif + b[ 11] = ZERO; + + b[ 12] = *(a04 + 0); + b[ 13] = *(a04 + 1); + b[ 14] = *(a04 + 2); +#ifdef UNIT + b[ 15] = ONE; +#else + b[ 15] = *(a04 + 3); +#endif + + a01 += 4 * lda; + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i > 0) { + if (X < posY) { + a01 += i; + a02 += i; + a03 += i; + a04 += i; + b += 4 * i; + } else + if (X > posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + a01 += lda; + b += 4; + } + a02 += lda; + a03 += lda; + a04 += lda; + } else { + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + + if (i >= 2) { + b[ 0] = *(a02 + 0); +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(a02 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + + if (i >= 3) { + b[ 0] = *(a03 + 0); + b[ 1] = *(a03 + 1); +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(a03 + 2); +#endif + b[ 3] = ZERO; + b += 4; + } + } + } + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + } else { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + a01 += 2; + a02 += 2; + b += 4; + } else + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + a01 += 2 * lda; + a02 += 2 * lda; + b += 4; + } else { + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + + b[ 2] = *(a02 + 0); +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(a02 + 1); +#endif + + a01 += 2 * lda; + a02 += 2 * lda; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + if (X < posY) { + a01 ++; + a02 ++; + b += 2; + } else + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + a01 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = *(a01 + 1); +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b += 2; + } + } + posY += 2; + } + + + if (n & 1){ + X = posX; + + if (posX <= posY) { + a01 = a + posX + (posY + 0) * lda; + } else { + a01 = a + posY + (posX + 0) * lda; + } + + i = m; + if (i > 0) { + do { + if (X < posY) { + a01 += 1; + b ++; + } else + if (X > posY) { + b[ 0] = *(a01 + 0); + a01 += lda; + b ++; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + a01 += lda; + b ++; + } + + X += 1; + i --; + } while (i > 0); + } + posY += 1; + } + + return 0; +} diff --git a/kernel/generic/trmm_utcopy_2.c b/kernel/generic/trmm_utcopy_2.c new file mode 100644 index 0000000..620b06a --- /dev/null +++ b/kernel/generic/trmm_utcopy_2.c @@ -0,0 +1,191 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2; + + js = (n >> 1); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + + } else { +#ifdef UNIT + data03 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; +#endif + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X < posY) { + ao1 += 1; + ao2 += 1; + b += 2; + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + + b[ 0] = data01; + b[ 1] = ZERO; +#endif + ao1 += lda; + b += 2; + } + } + + posY += 2; + js --; + } while (js > 0); + } /* End of main loop */ + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + b += 1; + ao1 += 1; + } else + if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + b += 1; + ao1 += lda; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + b += 1; + ao1 += lda; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/trmm_utcopy_4.c b/kernel/generic/trmm_utcopy_4.c new file mode 100644 index 0000000..7d4dba3 --- /dev/null +++ b/kernel/generic/trmm_utcopy_4.c @@ -0,0 +1,472 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 16; + + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = data05; + b[ 5] = ONE; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = ONE; + b[11] = ZERO; + + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = ZERO; + + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; +#endif + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + + if (m & 2) { + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + if (m & 1) { + ao1 += 1; + ao2 += 1; + ao3 += 1; + ao4 += 1; + b += 4; + } + + } else + if (X > posY) { + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + ao1 += lda; + b += 4; + } + + } else { + +#ifdef UNIT + if (i >= 2) { + data05 = *(ao2 + 0); + } + + if (i >= 3) { + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + } + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + + if(i >= 2) { + b[ 0] = data05; + b[ 1] = ONE; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + + if (i >= 3) { + b[ 0] = data09; + b[ 1] = data10; + b[ 2] = ONE; + b[ 3] = ZERO; + b += 4; + } +#else + data01 = *(ao1 + 0); + + if (i >= 2) { + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + } + + if (i >= 3) { + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + } + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + + if(i >= 2) { + b[ 0] = data05; + b[ 1] = data06; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + + if (i >= 3) { + b[ 0] = data09; + b[ 1] = data10; + b[ 2] = data11; + b[ 3] = ZERO; + b += 4; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data05; + b[ 3] = data06; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data05; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data05; + b[ 3] = data06; + +#endif + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X < posY) { + ao1 += 2; + b += 2; + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + + b[ 0] = data01; + b[ 1] = ZERO; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + + if (X < posY) { + b += 1; + ao1 += 1; + } else + if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + ao1 += lda; + b += 1; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/trmm_utcopy_8.c b/kernel/generic/trmm_utcopy_8.c new file mode 100644 index 0000000..6dbf8bd --- /dev/null +++ b/kernel/generic/trmm_utcopy_8.c @@ -0,0 +1,1276 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + FLOAT data33, data34, data35, data36, data37, data38, data39, data40; + FLOAT data41, data42, data43, data44, data45, data46, data47, data48; + FLOAT data49, data50, data51, data52, data53, data54, data55, data56; + FLOAT data57, data58, data59, data60, data61, data62, data63, data64; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + js = (n >> 3); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + ao5 = a + posX + (posY + 4) * lda; + ao6 = a + posX + (posY + 5) * lda; + ao7 = a + posX + (posY + 6) * lda; + ao8 = a + posX + (posY + 7) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + ao5 = a + posY + (posX + 4) * lda; + ao6 = a + posY + (posX + 5) * lda; + ao7 = a + posY + (posX + 6) * lda; + ao8 = a + posY + (posX + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X < posY) { + ao1 += 8; + ao2 += 8; + ao3 += 8; + ao4 += 8; + ao5 += 8; + ao6 += 8; + ao7 += 8; + ao8 += 8; + + b += 64; + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + + data33 = *(ao5 + 0); + data34 = *(ao5 + 1); + data35 = *(ao5 + 2); + data36 = *(ao5 + 3); + data37 = *(ao5 + 4); + data38 = *(ao5 + 5); + data39 = *(ao5 + 6); + data40 = *(ao5 + 7); + + data41 = *(ao6 + 0); + data42 = *(ao6 + 1); + data43 = *(ao6 + 2); + data44 = *(ao6 + 3); + data45 = *(ao6 + 4); + data46 = *(ao6 + 5); + data47 = *(ao6 + 6); + data48 = *(ao6 + 7); + + data49 = *(ao7 + 0); + data50 = *(ao7 + 1); + data51 = *(ao7 + 2); + data52 = *(ao7 + 3); + data53 = *(ao7 + 4); + data54 = *(ao7 + 5); + data55 = *(ao7 + 6); + data56 = *(ao7 + 7); + + data57 = *(ao8 + 0); + data58 = *(ao8 + 1); + data59 = *(ao8 + 2); + data60 = *(ao8 + 3); + data61 = *(ao8 + 4); + data62 = *(ao8 + 5); + data63 = *(ao8 + 6); + data64 = *(ao8 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + + b[32] = data33; + b[33] = data34; + b[34] = data35; + b[35] = data36; + b[36] = data37; + b[37] = data38; + b[38] = data39; + b[39] = data40; + + b[40] = data41; + b[41] = data42; + b[42] = data43; + b[43] = data44; + b[44] = data45; + b[45] = data46; + b[46] = data47; + b[47] = data48; + + b[48] = data49; + b[49] = data50; + b[50] = data51; + b[51] = data52; + b[52] = data53; + b[53] = data54; + b[54] = data55; + b[55] = data56; + + b[56] = data57; + b[57] = data58; + b[58] = data59; + b[59] = data60; + b[60] = data61; + b[61] = data62; + b[62] = data63; + b[63] = data64; + + ao1 += 8 * lda; + ao2 += 8 * lda; + ao3 += 8 * lda; + ao4 += 8 * lda; + ao5 += 8 * lda; + ao6 += 8 * lda; + ao7 += 8 * lda; + ao8 += 8 * lda; + + b += 64; + + } else { +#ifdef UNIT + data09 = *(ao2 + 0); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + + data33 = *(ao5 + 0); + data34 = *(ao5 + 1); + data35 = *(ao5 + 2); + data36 = *(ao5 + 3); + + data41 = *(ao6 + 0); + data42 = *(ao6 + 1); + data43 = *(ao6 + 2); + data44 = *(ao6 + 3); + data45 = *(ao6 + 4); + + data49 = *(ao7 + 0); + data50 = *(ao7 + 1); + data51 = *(ao7 + 2); + data52 = *(ao7 + 3); + data53 = *(ao7 + 4); + data54 = *(ao7 + 5); + + data57 = *(ao8 + 0); + data58 = *(ao8 + 1); + data59 = *(ao8 + 2); + data60 = *(ao8 + 3); + data61 = *(ao8 + 4); + data62 = *(ao8 + 5); + data63 = *(ao8 + 6); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data09; + b[ 9] = ONE; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + + b[16] = data17; + b[17] = data18; + b[18] = ONE; + b[19] = ZERO; + b[20] = ZERO; + b[21] = ZERO; + b[22] = ZERO; + b[23] = ZERO; + + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = ONE; + b[28] = ZERO; + b[29] = ZERO; + b[30] = ZERO; + b[31] = ZERO; + + b[32] = data33; + b[33] = data34; + b[34] = data35; + b[35] = data36; + b[36] = ONE; + b[37] = ZERO; + b[38] = ZERO; + b[39] = ZERO; + + b[40] = data41; + b[41] = data42; + b[42] = data43; + b[43] = data44; + b[44] = data45; + b[45] = ONE; + b[46] = ZERO; + b[47] = ZERO; + + b[48] = data49; + b[49] = data50; + b[50] = data51; + b[51] = data52; + b[52] = data53; + b[53] = data54; + b[54] = ONE; + b[55] = ZERO; + + b[56] = data57; + b[57] = data58; + b[58] = data59; + b[59] = data60; + b[60] = data61; + b[61] = data62; + b[62] = data63; + b[63] = ONE; +#else + data01 = *(ao1 + 0); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + + data33 = *(ao5 + 0); + data34 = *(ao5 + 1); + data35 = *(ao5 + 2); + data36 = *(ao5 + 3); + data37 = *(ao5 + 4); + + data41 = *(ao6 + 0); + data42 = *(ao6 + 1); + data43 = *(ao6 + 2); + data44 = *(ao6 + 3); + data45 = *(ao6 + 4); + data46 = *(ao6 + 5); + + data49 = *(ao7 + 0); + data50 = *(ao7 + 1); + data51 = *(ao7 + 2); + data52 = *(ao7 + 3); + data53 = *(ao7 + 4); + data54 = *(ao7 + 5); + data55 = *(ao7 + 6); + + data57 = *(ao8 + 0); + data58 = *(ao8 + 1); + data59 = *(ao8 + 2); + data60 = *(ao8 + 3); + data61 = *(ao8 + 4); + data62 = *(ao8 + 5); + data63 = *(ao8 + 6); + data64 = *(ao8 + 7); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = ZERO; + b[20] = ZERO; + b[21] = ZERO; + b[22] = ZERO; + b[23] = ZERO; + + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = ZERO; + b[29] = ZERO; + b[30] = ZERO; + b[31] = ZERO; + + b[32] = data33; + b[33] = data34; + b[34] = data35; + b[35] = data36; + b[36] = data37; + b[37] = ZERO; + b[38] = ZERO; + b[39] = ZERO; + + b[40] = data41; + b[41] = data42; + b[42] = data43; + b[43] = data44; + b[44] = data45; + b[45] = data46; + b[46] = ZERO; + b[47] = ZERO; + + b[48] = data49; + b[49] = data50; + b[50] = data51; + b[51] = data52; + b[52] = data53; + b[53] = data54; + b[54] = data55; + b[55] = ZERO; + + b[56] = data57; + b[57] = data58; + b[58] = data59; + b[59] = data60; + b[60] = data61; + b[61] = data62; + b[62] = data63; + b[63] = data64; + +#endif + ao1 += 8 * lda; + ao2 += 8 * lda; + ao3 += 8 * lda; + ao4 += 8 * lda; + ao5 += 8 * lda; + ao6 += 8 * lda; + ao7 += 8 * lda; + ao8 += 8 * lda; + + b += 64; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X < posY) { + + if (m & 4) { + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + ao5 += 4; + ao6 += 4; + ao7 += 4; + ao8 += 4; + + b += 32; + } + + if (m & 2) { + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + ao5 += 2; + ao6 += 2; + ao7 += 2; + ao8 += 2; + + b += 16; + } + + if (m & 1) { + b += 8; + } + } else + if (X > posY) { + if (m & 4) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b += 32; + } + + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + ao1 += 2 * lda; + b += 16; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + } + } else { + +#ifndef UNIT + data01 = *(ao1 + 0); +#endif + data09 = *(ao2 + 0); + data17 = *(ao3 + 0); + data25 = *(ao4 + 0); + data33 = *(ao5 + 0); + data41 = *(ao6 + 0); + data49 = *(ao7 + 0); + data57 = *(ao8 + 0); + + if (i >= 2) { +#ifndef UNIT + data10 = *(ao2 + 1); +#endif + data18 = *(ao3 + 1); + data26 = *(ao4 + 1); + data34 = *(ao5 + 1); + data42 = *(ao6 + 1); + data50 = *(ao7 + 1); + data58 = *(ao8 + 1); + } + + if (i >= 3) { +#ifndef UNIT + data19 = *(ao3 + 2); +#endif + data27 = *(ao4 + 2); + data35 = *(ao5 + 2); + data43 = *(ao6 + 2); + data51 = *(ao7 + 2); + data59 = *(ao8 + 2); + } + + if (i >= 4) { +#ifndef UNIT + data28 = *(ao4 + 3); +#endif + data36 = *(ao5 + 3); + data44 = *(ao6 + 3); + data52 = *(ao7 + 3); + data60 = *(ao8 + 3); + } + + if (i >= 5) { +#ifndef UNIT + data37 = *(ao5 + 4); +#endif + data45 = *(ao6 + 4); + data53 = *(ao7 + 4); + data61 = *(ao8 + 4); + } + + if (i >= 6) { +#ifndef UNIT + data46 = *(ao6 + 5); +#endif + data54 = *(ao7 + 5); + data62 = *(ao8 + 5); + } + + if (i >= 7) { +#ifndef UNIT + data55 = *(ao7 + 6); +#endif + data63 = *(ao8 + 6); + } + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = data01; +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + + if(i >= 2) { + b[ 0] = data09; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = data10; +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 3) { + b[ 0] = data17; + b[ 1] = data18; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = data19; +#endif + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 4) { + b[ 0] = data25; + b[ 1] = data26; + b[ 2] = data27; +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = data28; +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 5) { + b[ 0] = data33; + b[ 1] = data34; + b[ 2] = data35; + b[ 3] = data36; +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = data37; +#endif + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 6) { + b[ 0] = data41; + b[ 1] = data42; + b[ 2] = data43; + b[ 3] = data44; + b[ 4] = data45; +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = data46; +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 7) { + b[ 0] = data49; + b[ 1] = data50; + b[ 2] = data51; + b[ 3] = data52; + b[ 4] = data53; + b[ 5] = data54; +#ifdef UNIT + b[ 6] = ONE; +#else + b[ 6] = data55; +#endif + b[ 7] = ZERO; + b += 8; + } + } + } + + posY += 8; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + + b += 16; + + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b[ 4] = data09; + b[ 5] = data10; + b[ 6] = data11; + b[ 7] = data12; + + b[ 8] = data17; + b[ 9] = data18; + b[10] = data19; + b[11] = data20; + + b[12] = data25; + b[13] = data26; + b[14] = data27; + b[15] = data28; + + b += 16; + + } else { + +#ifdef UNIT + data09 = *(ao2 + 0); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = data09; + b[ 5] = ONE; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data17; + b[ 9] = data18; + b[10] = ONE; + b[11] = ZERO; + + b[12] = data25; + b[13] = data26; + b[14] = data27; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = data09; + b[ 5] = data10; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data17; + b[ 9] = data18; + b[10] = data19; + b[11] = ZERO; + + b[12] = data25; + b[13] = data26; + b[14] = data27; + b[15] = data28; +#endif + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + + if (m & 2) { + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + + b += 8; + } + + if (m & 1) { + b += 4; + } + } else + if (X > posY) { + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data09; + b[ 5] = data10; + b[ 6] = data11; + b[ 7] = data12; + + ao1 += 2 * lda; + b += 8; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + } + } else { + +#ifndef UNIT + data01 = *(ao1 + 0); +#endif + data09 = *(ao2 + 0); + data17 = *(ao3 + 0); + data25 = *(ao4 + 0); + + if (i >= 2) { +#ifndef UNIT + data10 = *(ao2 + 1); +#endif + data18 = *(ao3 + 1); + data26 = *(ao4 + 1); + } + + if (i >= 3) { +#ifndef UNIT + data19 = *(ao3 + 2); +#endif + data27 = *(ao4 + 2); + } + +#ifndef UNIT + b[ 0] = ONE; +#else + b[ 0] = data01; +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + + if(i >= 2) { + b[ 0] = data09; +#ifndef UNIT + b[ 1] = ONE; +#else + b[ 1] = data10; +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + + if (i >= 3) { + b[ 0] = data17; + + b[ 1] = data18; +#ifndef UNIT + b[ 2] = ONE; +#else + b[ 2] = data19; +#endif + b[ 3] = ZERO; + b += 4; + } + } + } + + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + ao1 += 2 * lda; + ao2 += 2 * lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + + b += 4; + + } else { + +#ifdef UNIT + data09 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data09; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data09; + b[ 3] = data10; +#endif + + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X < posY) { + b += 2; + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b += 2; + } else { +#ifdef UNIT + data09 = *(ao2 + 0); + b[ 0] = ONE; + b[ 1] = data09; +#else + data01 = *(ao1 + 0); + data09 = *(ao2 + 0); + b[ 0] = data01; + b[ 1] = data09; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + ao1 += 1; + b += 1; + } else + if (X > posY) { + data01 = *(ao1 + 0); + ao1 += lda; + + b[ 0] = data01; + b += 1; + + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + ao1 += lda; + b += 1; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/trsm_kernel_LN.c b/kernel/generic/trsm_kernel_LN.c new file mode 100644 index 0000000..068a202 --- /dev/null +++ b/kernel/generic/trsm_kernel_LN.c @@ -0,0 +1,333 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = 0; k < i; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a -= m; + b -= 2 * n; + } + +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; + + for (i = m - 1; i >= 0; i--) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a -= m * 2; + b -= 4 * n; + } + +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = m + offset; + + if (m & (GEMM_UNROLL_M - 1)) { + for (i = 1; i < GEMM_UNROLL_M; i *= 2){ + if (m & i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + kk -= i; + } + } + } + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa -= GEMM_UNROLL_M * k * COMPSIZE; + cc -= GEMM_UNROLL_M * COMPSIZE; + kk -= GEMM_UNROLL_M; + i --; + } while (i > 0); + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = m + offset; + + if (m & (GEMM_UNROLL_M - 1)) { + for (i = 1; i < GEMM_UNROLL_M; i *= 2){ + if (m & i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * j * COMPSIZE, + cc, ldc); + + kk -= i; + } + } + } + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * j * COMPSIZE, + cc, ldc); + + aa -= GEMM_UNROLL_M * k * COMPSIZE; + cc -= GEMM_UNROLL_M * COMPSIZE; + kk -= GEMM_UNROLL_M; + i --; + } while (i > 0); + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_kernel_LT.c b/kernel/generic/trsm_kernel_LT.c new file mode 100644 index 0000000..300fdd4 --- /dev/null +++ b/kernel/generic/trsm_kernel_LT.c @@ -0,0 +1,317 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < m; i++) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = i + 1; k < m; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a += m; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < m; i++) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = i + 1; k < m; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a += m * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += GEMM_UNROLL_M; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_kernel_RN.c b/kernel/generic/trsm_kernel_RN.c new file mode 100644 index 0000000..b85c3c1 --- /dev/null +++ b/kernel/generic/trsm_kernel_RN.c @@ -0,0 +1,315 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < n; i++) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = i + 1; k < n; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b += n; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = -aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = i + 1; k < n; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + j = (n >> GEMM_UNROLL_N_SHIFT); + kk = -offset; + + while (j > 0) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + if (i > 0) { + do { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += GEMM_UNROLL_M; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_kernel_RT.c b/kernel/generic/trsm_kernel_RT.c new file mode 100644 index 0000000..2adb3a4 --- /dev/null +++ b/kernel/generic/trsm_kernel_RT.c @@ -0,0 +1,341 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = 0; k < i; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b -= n; + a -= 2 * m; + } + +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = - aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b -= n * 2; + a -= 4 * m; + } + +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + if (n & (GEMM_UNROLL_N - 1)) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (n & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - j) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - j) * i * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + i >>= 1; + } while (i > 0); + } + kk -= j; + } + j <<= 1; + } + } + + j = (n >> GEMM_UNROLL_N_SHIFT); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } while (i > 0); + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + diff --git a/kernel/generic/trsm_lncopy_1.c b/kernel/generic/trsm_lncopy_1.c new file mode 100644 index 0000000..abad971 --- /dev/null +++ b/kernel/generic/trsm_lncopy_1.c @@ -0,0 +1,90 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + +#ifndef UNIT + FLOAT data01; +#endif + FLOAT *a1; + + jj = offset; + + j = n; + while (j > 0){ + + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) *(b + 0) = *(a1 + 0); + + a1 ++; + b ++; + + i --; + ii ++; + } + + a += lda; + jj ++; + j --; + } + + return 0; +} diff --git a/kernel/generic/trsm_lncopy_16.c b/kernel/generic/trsm_lncopy_16.c new file mode 100644 index 0000000..a7f9cb0 --- /dev/null +++ b/kernel/generic/trsm_lncopy_16.c @@ -0,0 +1,271 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; + FLOAT *a9, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + + jj = offset; + + j = (n >> 4); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + a9 = a + 8 * lda; + a10 = a + 9 * lda; + a11 = a + 10 * lda; + a12 = a + 11 * lda; + a13 = a + 12 * lda; + a14 = a + 13 * lda; + a15 = a + 14 * lda; + a16 = a + 15 * lda; + + a += 16 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 16)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k) = *(a1 + k * lda); + } + *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); + } + + if (ii - jj >= 16) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a2 + 0); + *(b + 2) = *(a3 + 0); + *(b + 3) = *(a4 + 0); + *(b + 4) = *(a5 + 0); + *(b + 5) = *(a6 + 0); + *(b + 6) = *(a7 + 0); + *(b + 7) = *(a8 + 0); + *(b + 8) = *(a9 + 0); + *(b + 9) = *(a10 + 0); + *(b + 10) = *(a11 + 0); + *(b + 11) = *(a12 + 0); + *(b + 12) = *(a13 + 0); + *(b + 13) = *(a14 + 0); + *(b + 14) = *(a15 + 0); + *(b + 15) = *(a16 + 0); + } + + a1 ++; + a2 ++; + a3 ++; + a4 ++; + a5 ++; + a6 ++; + a7 ++; + a8 ++; + a9 ++; + a10 ++; + a11 ++; + a12 ++; + a13 ++; + a14 ++; + a15 ++; + a16 ++; + b += 16; + ii ++; + } + + jj += 16; + j --; + } + + if (n & 8) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + a += 8 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k) = *(a1 + k * lda); + } + *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); + } + + if (ii - jj >= 8) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a2 + 0); + *(b + 2) = *(a3 + 0); + *(b + 3) = *(a4 + 0); + *(b + 4) = *(a5 + 0); + *(b + 5) = *(a6 + 0); + *(b + 6) = *(a7 + 0); + *(b + 7) = *(a8 + 0); + } + + a1 ++; + a2 ++; + a3 ++; + a4 ++; + a5 ++; + a6 ++; + a7 ++; + a8 ++; + b += 8; + ii ++; + } + + jj += 8; + } + + if (n & 4) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a += 4 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k) = *(a1 + k * lda); + } + *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); + } + + if (ii - jj >= 4) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a2 + 0); + *(b + 2) = *(a3 + 0); + *(b + 3) = *(a4 + 0); + } + + a1 ++; + a2 ++; + a3 ++; + a4 ++; + b += 4; + ii ++; + } + + jj += 4; + } + + if (n & 2) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a += 2 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k) = *(a1 + k * lda); + } + *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); + } + + if (ii - jj >= 2) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a2 + 0); + } + + a1 ++; + a2 ++; + b += 2; + ii ++; + } + + jj += 2; + } + + if (n & 1) { + + a1 = a + 0 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k) = *(a1 + k * lda); + } + *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); + } + + if (ii - jj >= 1) { + *(b + 0) = *(a1 + 0); + } + + a1 ++; + b += 1; + ii ++; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_lncopy_2.c b/kernel/generic/trsm_lncopy_2.c new file mode 100644 index 0000000..20cc642 --- /dev/null +++ b/kernel/generic/trsm_lncopy_2.c @@ -0,0 +1,154 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04; + FLOAT *a1, *a2; + + jj = offset; + + j = (n >> 1); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 2) = data02; + *(b + 3) = INV(data04); + } + + if (ii > jj) { + + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data03; + *(b + 2) = data02; + *(b + 3) = data04; + } + + a1 += 2; + a2 += 2; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a2 + 0); + + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + + a += 2 * lda; + jj += 2; + j --; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1+= 1; + b += 1; + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_lncopy_4.c b/kernel/generic/trsm_lncopy_4.c new file mode 100644 index 0000000..9f7bcc2 --- /dev/null +++ b/kernel/generic/trsm_lncopy_4.c @@ -0,0 +1,326 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *a1, *a2, *a3, *a4; + + jj = offset; + + j = (n >> 2); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + i = (m >> 2); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data06 = *(a2 + 1); +#endif + data07 = *(a2 + 2); + data08 = *(a2 + 3); + +#ifndef UNIT + data11 = *(a3 + 2); +#endif + data12 = *(a3 + 3); + +#ifndef UNIT + data16 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + + *(b + 4) = data02; + *(b + 5) = INV(data06); + + *(b + 8) = data03; + *(b + 9) = data07; + *(b + 10) = INV(data11); + + *(b + 12) = data04; + *(b + 13) = data08; + *(b + 14) = data12; + *(b + 15) = INV(data16); + } + + if (ii > jj) { + + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + data09 = *(a3 + 0); + data10 = *(a3 + 1); + data11 = *(a3 + 2); + data12 = *(a3 + 3); + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); + data16 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data05; + *(b + 2) = data09; + *(b + 3) = data13; + *(b + 4) = data02; + *(b + 5) = data06; + *(b + 6) = data10; + *(b + 7) = data14; + + *(b + 8) = data03; + *(b + 9) = data07; + *(b + 10) = data11; + *(b + 11) = data15; + *(b + 12) = data04; + *(b + 13) = data08; + *(b + 14) = data12; + *(b + 15) = data16; + } + + a1 += 4; + a2 += 4; + a3 += 4; + a4 += 4; + b += 16; + + i --; + ii += 4; + } + + if ((m & 2) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + +#ifndef UNIT + data06 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + + *(b + 4) = data02; + *(b + 5) = INV(data06); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + data05 = *(a3 + 0); + data06 = *(a3 + 1); + data07 = *(a4 + 0); + data08 = *(a4 + 1); + + *(b + 0) = data01; + *(b + 1) = data03; + *(b + 2) = data05; + *(b + 3) = data07; + *(b + 4) = data02; + *(b + 5) = data04; + *(b + 6) = data06; + *(b + 7) = data08; + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a2 + 0); + data03 = *(a3 + 0); + data04 = *(a4 + 0); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + + a += 4 * lda; + jj += 4; + j --; + } + + if (n & 2) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 2) = data02; + *(b + 3) = INV(data04); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data03; + *(b + 2) = data02; + *(b + 3) = data04; + } + + a1 += 2; + a2 += 2; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a2 + 0); + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + a += 2 * lda; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1+= 1; + b += 1; + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_lncopy_8.c b/kernel/generic/trsm_lncopy_8.c new file mode 100644 index 0000000..40feb81 --- /dev/null +++ b/kernel/generic/trsm_lncopy_8.c @@ -0,0 +1,841 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + FLOAT data33, data34, data35, data36, data37, data38, data39, data40; + FLOAT data41, data42, data43, data44, data45, data46, data47, data48; + FLOAT data49, data50, data51, data52, data53, data54, data55, data56; + FLOAT data57, data58, data59, data60, data61, data62, data63, data64; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; + + jj = offset; + + j = (n >> 3); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + + ii = 0; + i = (m >> 3); + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + +#ifndef UNIT + data19 = *(a3 + 2); +#endif + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + data23 = *(a3 + 6); + data24 = *(a3 + 7); + +#ifndef UNIT + data28 = *(a4 + 3); +#endif + data29 = *(a4 + 4); + data30 = *(a4 + 5); + data31 = *(a4 + 6); + data32 = *(a4 + 7); + +#ifndef UNIT + data37 = *(a5 + 4); +#endif + data38 = *(a5 + 5); + data39 = *(a5 + 6); + data40 = *(a5 + 7); + +#ifndef UNIT + data46 = *(a6 + 5); +#endif + data47 = *(a6 + 6); + data48 = *(a6 + 7); + +#ifndef UNIT + data55 = *(a7 + 6); +#endif + data56 = *(a7 + 7); + +#ifndef UNIT + data64 = *(a8 + 7); +#endif + + *(b + 0) = INV(data01); + + *(b + 8) = data02; + *(b + 9) = INV(data10); + + *(b + 16) = data03; + *(b + 17) = data11; + *(b + 18) = INV(data19); + + *(b + 24) = data04; + *(b + 25) = data12; + *(b + 26) = data20; + *(b + 27) = INV(data28); + + *(b + 32) = data05; + *(b + 33) = data13; + *(b + 34) = data21; + *(b + 35) = data29; + *(b + 36) = INV(data37); + + *(b + 40) = data06; + *(b + 41) = data14; + *(b + 42) = data22; + *(b + 43) = data30; + *(b + 44) = data38; + *(b + 45) = INV(data46); + + *(b + 48) = data07; + *(b + 49) = data15; + *(b + 50) = data23; + *(b + 51) = data31; + *(b + 52) = data39; + *(b + 53) = data47; + *(b + 54) = INV(data55); + + *(b + 56) = data08; + *(b + 57) = data16; + *(b + 58) = data24; + *(b + 59) = data32; + *(b + 60) = data40; + *(b + 61) = data48; + *(b + 62) = data56; + *(b + 63) = INV(data64); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + data23 = *(a3 + 6); + data24 = *(a3 + 7); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); + data31 = *(a4 + 6); + data32 = *(a4 + 7); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); + data37 = *(a5 + 4); + data38 = *(a5 + 5); + data39 = *(a5 + 6); + data40 = *(a5 + 7); + + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + data45 = *(a6 + 4); + data46 = *(a6 + 5); + data47 = *(a6 + 6); + data48 = *(a6 + 7); + + data49 = *(a7 + 0); + data50 = *(a7 + 1); + data51 = *(a7 + 2); + data52 = *(a7 + 3); + data53 = *(a7 + 4); + data54 = *(a7 + 5); + data55 = *(a7 + 6); + data56 = *(a7 + 7); + + data57 = *(a8 + 0); + data58 = *(a8 + 1); + data59 = *(a8 + 2); + data60 = *(a8 + 3); + data61 = *(a8 + 4); + data62 = *(a8 + 5); + data63 = *(a8 + 6); + data64 = *(a8 + 7); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + *(b + 6) = data49; + *(b + 7) = data57; + + *(b + 8) = data02; + *(b + 9) = data10; + *(b + 10) = data18; + *(b + 11) = data26; + *(b + 12) = data34; + *(b + 13) = data42; + *(b + 14) = data50; + *(b + 15) = data58; + + *(b + 16) = data03; + *(b + 17) = data11; + *(b + 18) = data19; + *(b + 19) = data27; + *(b + 20) = data35; + *(b + 21) = data43; + *(b + 22) = data51; + *(b + 23) = data59; + + *(b + 24) = data04; + *(b + 25) = data12; + *(b + 26) = data20; + *(b + 27) = data28; + *(b + 28) = data36; + *(b + 29) = data44; + *(b + 30) = data52; + *(b + 31) = data60; + + *(b + 32) = data05; + *(b + 33) = data13; + *(b + 34) = data21; + *(b + 35) = data29; + *(b + 36) = data37; + *(b + 37) = data45; + *(b + 38) = data53; + *(b + 39) = data61; + + *(b + 40) = data06; + *(b + 41) = data14; + *(b + 42) = data22; + *(b + 43) = data30; + *(b + 44) = data38; + *(b + 45) = data46; + *(b + 46) = data54; + *(b + 47) = data62; + + *(b + 48) = data07; + *(b + 49) = data15; + *(b + 50) = data23; + *(b + 51) = data31; + *(b + 52) = data39; + *(b + 53) = data47; + *(b + 54) = data55; + *(b + 55) = data63; + + *(b + 56) = data08; + *(b + 57) = data16; + *(b + 58) = data24; + *(b + 59) = data32; + *(b + 60) = data40; + *(b + 61) = data48; + *(b + 62) = data56; + *(b + 63) = data64; + } + + a1 += 8; + a2 += 8; + a3 += 8; + a4 += 8; + a5 += 8; + a6 += 8; + a7 += 8; + a8 += 8; + b += 64; + + i --; + ii += 8; + } + + if (m & 4) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + +#ifndef UNIT + data19 = *(a3 + 2); +#endif + data20 = *(a3 + 3); + +#ifndef UNIT + data28 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + + *(b + 8) = data02; + *(b + 9) = INV(data10); + + *(b + 16) = data03; + *(b + 17) = data11; + *(b + 18) = INV(data19); + + *(b + 24) = data04; + *(b + 25) = data12; + *(b + 26) = data20; + *(b + 27) = INV(data28); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + + data49 = *(a7 + 0); + data50 = *(a7 + 1); + data51 = *(a7 + 2); + data52 = *(a7 + 3); + data57 = *(a8 + 0); + data58 = *(a8 + 1); + data59 = *(a8 + 2); + data60 = *(a8 + 3); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + *(b + 6) = data49; + *(b + 7) = data57; + + *(b + 8) = data02; + *(b + 9) = data10; + *(b + 10) = data18; + *(b + 11) = data26; + *(b + 12) = data34; + *(b + 13) = data42; + *(b + 14) = data50; + *(b + 15) = data58; + + *(b + 16) = data03; + *(b + 17) = data11; + *(b + 18) = data19; + *(b + 19) = data27; + *(b + 20) = data35; + *(b + 21) = data43; + *(b + 22) = data51; + *(b + 23) = data59; + + *(b + 24) = data04; + *(b + 25) = data12; + *(b + 26) = data20; + *(b + 27) = data28; + *(b + 28) = data36; + *(b + 29) = data44; + *(b + 30) = data52; + *(b + 31) = data60; + + } + + a1 += 4; + a2 += 4; + a3 += 4; + a4 += 4; + a5 += 4; + a6 += 4; + a7 += 4; + a8 += 4; + b += 32; + ii += 4; + } + + if (m & 2) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + + *(b + 8) = data02; + *(b + 9) = INV(data10); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data49 = *(a7 + 0); + data50 = *(a7 + 1); + data57 = *(a8 + 0); + data58 = *(a8 + 1); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + *(b + 6) = data49; + *(b + 7) = data57; + + *(b + 8) = data02; + *(b + 9) = data10; + *(b + 10) = data18; + *(b + 11) = data26; + *(b + 12) = data34; + *(b + 13) = data42; + *(b + 14) = data50; + *(b + 15) = data58; + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + a5 += 2; + a6 += 2; + a7 += 2; + a8 += 2; + b += 16; + ii += 2; + } + + if (m & 1) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data09 = *(a2 + 0); + data17 = *(a3 + 0); + data25 = *(a4 + 0); + data33 = *(a5 + 0); + data41 = *(a6 + 0); + data49 = *(a7 + 0); + data57 = *(a8 + 0); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + *(b + 6) = data49; + *(b + 7) = data57; + } + b += 8; + } + + a += 8 * lda; + jj += 8; + j --; + } + + if (n & 4) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + ii = 0; + i = (m >> 2); + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + data11 = *(a2 + 2); + data12 = *(a2 + 3); + +#ifndef UNIT + data19 = *(a3 + 2); +#endif + data20 = *(a3 + 3); + +#ifndef UNIT + data28 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + + *(b + 4) = data02; + *(b + 5) = INV(data10); + + *(b + 8) = data03; + *(b + 9) = data11; + *(b + 10) = INV(data19); + + *(b + 12) = data04; + *(b + 13) = data12; + *(b + 14) = data20; + *(b + 15) = INV(data28); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + + *(b + 4) = data02; + *(b + 5) = data10; + *(b + 6) = data18; + *(b + 7) = data26; + + *(b + 8) = data03; + *(b + 9) = data11; + *(b + 10) = data19; + *(b + 11) = data27; + + *(b + 12) = data04; + *(b + 13) = data12; + *(b + 14) = data20; + *(b + 15) = data28; + } + + a1 += 4; + a2 += 4; + a3 += 4; + a4 += 4; + b += 16; + + i --; + ii += 4; + } + + if (m & 2) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + + *(b + 4) = data02; + *(b + 5) = INV(data10); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data02; + *(b + 5) = data10; + *(b + 6) = data18; + *(b + 7) = data26; + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + ii += 2; + } + + if (m & 1) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data09 = *(a2 + 0); + data17 = *(a3 + 0); + data25 = *(a4 + 0); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + } + b += 4; + } + + a += 4 * lda; + jj += 4; + } + + if (n & 2) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + ii = 0; + i = (m >> 1); + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 2) = data02; + *(b + 3) = INV(data10); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data02; + *(b + 3) = data10; + } + + a1 += 2; + a2 += 2; + b += 4; + + i --; + ii += 2; + } + + if (m & 1) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data09 = *(a2 + 0); + + *(b + 0) = data01; + *(b + 1) = data09; + } + b += 2; + } + + a += 2 * lda; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + ii = 0; + i = m; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1 += 1; + b += 1; + + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_ltcopy_1.c b/kernel/generic/trsm_ltcopy_1.c new file mode 100644 index 0000000..ea84136 --- /dev/null +++ b/kernel/generic/trsm_ltcopy_1.c @@ -0,0 +1,90 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + +#ifndef UNIT + FLOAT data01; +#endif + FLOAT *a1; + + jj = offset; + + j = n; + while (j > 0){ + + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) *(b + 0) = *(a1 + 0); + + a1 += lda; + b ++; + + i --; + ii ++; + } + + a ++; + jj ++; + j --; + } + + return 0; +} diff --git a/kernel/generic/trsm_ltcopy_16.c b/kernel/generic/trsm_ltcopy_16.c new file mode 100644 index 0000000..1203f1b --- /dev/null +++ b/kernel/generic/trsm_ltcopy_16.c @@ -0,0 +1,228 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1; + + jj = offset; + + j = (n >> 4); + while (j > 0){ + + a1 = a; + a += 16; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 16)) { + + *(b + ii - jj) = INV(*(a1 + ii - jj)); + + for (k = ii - jj + 1; k < 16; k ++) { + *(b + k) = *(a1 + k); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + *(b + 8) = *(a1 + 8); + *(b + 9) = *(a1 + 9); + *(b + 10) = *(a1 + 10); + *(b + 11) = *(a1 + 11); + *(b + 12) = *(a1 + 12); + *(b + 13) = *(a1 + 13); + *(b + 14) = *(a1 + 14); + *(b + 15) = *(a1 + 15); + } + + b += 16; + a1 += lda; + ii ++; + } + + jj += 16; + j --; + } + + j = (n & 8); + if (j > 0) { + a1 = a; + a += 8; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + + *(b + ii - jj) = INV(*(a1 + ii - jj)); + + for (k = ii - jj + 1; k < 8; k ++) { + *(b + k) = *(a1 + k); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + } + + b += 8; + a1 += lda; + ii ++; + } + + jj += 8; + } + + j = (n & 4); + if (j > 0) { + + a1 = a; + a += 4; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + *(b + ii - jj) = INV(*(a1 + ii - jj)); + + for (k = ii - jj + 1; k < 4; k ++) { + *(b + k) = *(a1 + k); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + } + + b += 4; + a1 += lda; + ii ++; + } + + jj += 4; + } + + j = (n & 2); + if (j > 0) { + + a1 = a; + a += 2; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + + *(b + ii - jj) = INV(*(a1 + ii - jj)); + + for (k = ii - jj + 1; k < 2; k ++) { + *(b + k) = *(a1 + k); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + } + + b += 2; + a1 += lda; + ii ++; + } + + jj += 2; + } + + j = (n & 1); + if (j > 0) { + + a1 = a; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + *(b + ii - jj) = INV(*(a1 + ii - jj)); + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + } + + b += 1; + a1 += lda; + ii ++; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_ltcopy_2.c b/kernel/generic/trsm_ltcopy_2.c new file mode 100644 index 0000000..4705635 --- /dev/null +++ b/kernel/generic/trsm_ltcopy_2.c @@ -0,0 +1,160 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04; + FLOAT *a1, *a2; + + jj = offset; + + j = (n >> 1); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data02; + + *(b + 3) = INV(data04); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + + *(b + 0) = INV(data01); + *(b + 1) = data02; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + + a += 2; + jj += 2; + j --; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1 += 1 * lda; + b += 1; + + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_ltcopy_4.c b/kernel/generic/trsm_ltcopy_4.c new file mode 100644 index 0000000..d891468 --- /dev/null +++ b/kernel/generic/trsm_ltcopy_4.c @@ -0,0 +1,346 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *a1, *a2, *a3, *a4; + + jj = offset; + + j = (n >> 2); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + i = (m >> 2); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data06 = *(a2 + 1); +#endif + data07 = *(a2 + 2); + data08 = *(a2 + 3); + +#ifndef UNIT + data11 = *(a3 + 2); +#endif + data12 = *(a3 + 3); + +#ifndef UNIT + data16 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + + *(b + 5) = INV(data06); + *(b + 6) = data07; + *(b + 7) = data08; + + *(b + 10) = INV(data11); + *(b + 11) = data12; + + *(b + 15) = INV(data16); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + data09 = *(a3 + 0); + data10 = *(a3 + 1); + data11 = *(a3 + 2); + data12 = *(a3 + 3); + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); + data16 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + } + + a1 += 4 * lda; + a2 += 4 * lda; + a3 += 4 * lda; + a4 += 4 * lda; + b += 16; + + i --; + ii += 4; + } + + if ((m & 2) != 0) { + + if (ii== jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data06 = *(a2 + 1); +#endif + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + + *(b + 5) = INV(data06); + *(b + 6) = data07; + *(b + 7) = data08; + + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 8; + + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + + a += 4; + jj += 4; + j --; + } + + if (n & 2) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data02; + + *(b + 3) = INV(data04); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + a += 2; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1 += 1 * lda; + b += 1; + + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_ltcopy_8.c b/kernel/generic/trsm_ltcopy_8.c new file mode 100644 index 0000000..0925dcc --- /dev/null +++ b/kernel/generic/trsm_ltcopy_8.c @@ -0,0 +1,921 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + FLOAT data33, data34, data35, data36, data37, data38, data39, data40; + FLOAT data41, data42, data43, data44, data45, data46, data47, data48; + FLOAT data49, data50, data51, data52, data53, data54, data55, data56; + FLOAT data57, data58, data59, data60, data61, data62, data63, data64; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; + + jj = offset; + + j = (n >> 3); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + + ii = 0; + i = (m >> 3); + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + +#ifndef UNIT + data19 = *(a3 + 2); +#endif + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + data23 = *(a3 + 6); + data24 = *(a3 + 7); + +#ifndef UNIT + data28 = *(a4 + 3); +#endif + data29 = *(a4 + 4); + data30 = *(a4 + 5); + data31 = *(a4 + 6); + data32 = *(a4 + 7); + +#ifndef UNIT + data37 = *(a5 + 4); +#endif + data38 = *(a5 + 5); + data39 = *(a5 + 6); + data40 = *(a5 + 7); + +#ifndef UNIT + data46 = *(a6 + 5); +#endif + data47 = *(a6 + 6); + data48 = *(a6 + 7); + +#ifndef UNIT + data55 = *(a7 + 6); +#endif + data56 = *(a7 + 7); + +#ifndef UNIT + data64 = *(a8 + 7); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + + *(b + 9) = INV(data10); + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + + *(b + 18) = INV(data19); + *(b + 19) = data20; + *(b + 20) = data21; + *(b + 21) = data22; + *(b + 22) = data23; + *(b + 23) = data24; + + *(b + 27) = INV(data28); + *(b + 28) = data29; + *(b + 29) = data30; + *(b + 30) = data31; + *(b + 31) = data32; + + *(b + 36) = INV(data37); + *(b + 37) = data38; + *(b + 38) = data39; + *(b + 39) = data40; + + *(b + 45) = INV(data46); + *(b + 46) = data47; + *(b + 47) = data48; + + *(b + 54) = INV(data55); + *(b + 55) = data56; + + *(b + 63) = INV(data64); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + data23 = *(a3 + 6); + data24 = *(a3 + 7); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); + data31 = *(a4 + 6); + data32 = *(a4 + 7); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); + data37 = *(a5 + 4); + data38 = *(a5 + 5); + data39 = *(a5 + 6); + data40 = *(a5 + 7); + + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + data45 = *(a6 + 4); + data46 = *(a6 + 5); + data47 = *(a6 + 6); + data48 = *(a6 + 7); + + data49 = *(a7 + 0); + data50 = *(a7 + 1); + data51 = *(a7 + 2); + data52 = *(a7 + 3); + data53 = *(a7 + 4); + data54 = *(a7 + 5); + data55 = *(a7 + 6); + data56 = *(a7 + 7); + + data57 = *(a8 + 0); + data58 = *(a8 + 1); + data59 = *(a8 + 2); + data60 = *(a8 + 3); + data61 = *(a8 + 4); + data62 = *(a8 + 5); + data63 = *(a8 + 6); + data64 = *(a8 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + + *(b + 16) = data17; + *(b + 17) = data18; + *(b + 18) = data19; + *(b + 19) = data20; + *(b + 20) = data21; + *(b + 21) = data22; + *(b + 22) = data23; + *(b + 23) = data24; + *(b + 24) = data25; + *(b + 25) = data26; + *(b + 26) = data27; + *(b + 27) = data28; + *(b + 28) = data29; + *(b + 29) = data30; + *(b + 30) = data31; + *(b + 31) = data32; + + *(b + 32) = data33; + *(b + 33) = data34; + *(b + 34) = data35; + *(b + 35) = data36; + *(b + 36) = data37; + *(b + 37) = data38; + *(b + 38) = data39; + *(b + 39) = data40; + *(b + 40) = data41; + *(b + 41) = data42; + *(b + 42) = data43; + *(b + 43) = data44; + *(b + 44) = data45; + *(b + 45) = data46; + *(b + 46) = data47; + *(b + 47) = data48; + + *(b + 48) = data49; + *(b + 49) = data50; + *(b + 50) = data51; + *(b + 51) = data52; + *(b + 52) = data53; + *(b + 53) = data54; + *(b + 54) = data55; + *(b + 55) = data56; + *(b + 56) = data57; + *(b + 57) = data58; + *(b + 58) = data59; + *(b + 59) = data60; + *(b + 60) = data61; + *(b + 61) = data62; + *(b + 62) = data63; + *(b + 63) = data64; + } + + a1 += 8 * lda; + a2 += 8 * lda; + a3 += 8 * lda; + a4 += 8 * lda; + a5 += 8 * lda; + a6 += 8 * lda; + a7 += 8 * lda; + a8 += 8 * lda; + b += 64; + + i --; + ii += 8; + } + + if (m & 4) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + +#ifndef UNIT + data19 = *(a3 + 2); +#endif + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + data23 = *(a3 + 6); + data24 = *(a3 + 7); + +#ifndef UNIT + data28 = *(a4 + 3); +#endif + data29 = *(a4 + 4); + data30 = *(a4 + 5); + data31 = *(a4 + 6); + data32 = *(a4 + 7); + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + + *(b + 9) = INV(data10); + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + + *(b + 18) = INV(data19); + *(b + 19) = data20; + *(b + 20) = data21; + *(b + 21) = data22; + *(b + 22) = data23; + *(b + 23) = data24; + + *(b + 27) = INV(data28); + *(b + 28) = data29; + *(b + 29) = data30; + *(b + 30) = data31; + *(b + 31) = data32; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + data23 = *(a3 + 6); + data24 = *(a3 + 7); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); + data31 = *(a4 + 6); + data32 = *(a4 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + + *(b + 16) = data17; + *(b + 17) = data18; + *(b + 18) = data19; + *(b + 19) = data20; + *(b + 20) = data21; + *(b + 21) = data22; + *(b + 22) = data23; + *(b + 23) = data24; + *(b + 24) = data25; + *(b + 25) = data26; + *(b + 26) = data27; + *(b + 27) = data28; + *(b + 28) = data29; + *(b + 29) = data30; + *(b + 30) = data31; + *(b + 31) = data32; + } + + a1 += 4 * lda; + a2 += 4 * lda; + a3 += 4 * lda; + a4 += 4 * lda; + b += 32; + + ii += 4; + } + + if (m & 2) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + + *(b + 9) = INV(data10); + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 16; + + ii += 2; + } + + if (m & 1) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + b += 8; + } + a += 8; + jj += 8; + j --; + } + + if (n & 4) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + ii = 0; + i = (m >> 2); + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + +#ifndef UNIT + data19 = *(a3 + 2); +#endif + data20 = *(a3 + 3); + + +#ifndef UNIT + data28 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + + *(b + 5) = INV(data10); + *(b + 6) = data11; + *(b + 7) = data12; + + *(b + 10) = INV(data19); + *(b + 11) = data20; + *(b + 15) = INV(data28); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data09; + *(b + 5) = data10; + *(b + 6) = data11; + *(b + 7) = data12; + + *(b + 8) = data17; + *(b + 9) = data18; + *(b + 10) = data19; + *(b + 11) = data20; + *(b + 12) = data25; + *(b + 13) = data26; + *(b + 14) = data27; + *(b + 15) = data28; + } + + a1 += 4 * lda; + a2 += 4 * lda; + a3 += 4 * lda; + a4 += 4 * lda; + b += 16; + + i --; + ii += 4; + } + + if (m & 2) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + + *(b + 6) = INV(data10); + *(b + 7) = data11; + *(b + 8) = data12; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data09; + *(b + 5) = data10; + *(b + 6) = data11; + *(b + 7) = data12; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 8; + ii += 2; + } + + if (m & 1) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + a += 4; + jj += 4; + } + + if (n & 2) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + ii = 0; + i = (m >> 1); + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 3) = INV(data10); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data09; + *(b + 3) = data10; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 4; + + i --; + ii += 2; + } + + if (m & 1) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + + *(b + 0) = INV(data01); + *(b + 1) = data02; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + a += 2; + jj += 2; + } + + if (n & 1) { + + a1 = a + 0 * lda; + + ii = 0; + i = m; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1 += lda; + b += 1; + + i --; + ii += 1; + } + + } + + return 0; +} diff --git a/kernel/generic/trsm_uncopy_1.c b/kernel/generic/trsm_uncopy_1.c new file mode 100644 index 0000000..3a25860 --- /dev/null +++ b/kernel/generic/trsm_uncopy_1.c @@ -0,0 +1,90 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + +#ifndef UNIT + FLOAT data01; +#endif + FLOAT *a1; + + jj = offset; + + j = n; + while (j > 0){ + + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) *(b + 0) = *(a1 + 0); + + a1 ++; + b ++; + i --; + ii ++; + } + + a += lda; + jj ++; + j --; + } + + return 0; +} diff --git a/kernel/generic/trsm_uncopy_16.c b/kernel/generic/trsm_uncopy_16.c new file mode 100644 index 0000000..e2b8ce4 --- /dev/null +++ b/kernel/generic/trsm_uncopy_16.c @@ -0,0 +1,271 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; + FLOAT *a9, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + + jj = offset; + + j = (n >> 4); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + a9 = a + 8 * lda; + a10 = a + 9 * lda; + a11 = a + 10 * lda; + a12 = a + 11 * lda; + a13 = a + 12 * lda; + a14 = a + 13 * lda; + a15 = a + 14 * lda; + a16 = a + 15 * lda; + + a += 16 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 16)) { + *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); + for (k = ii - jj + 1; k < 16; k ++) { + *(b + k) = *(a1 + k * lda); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a2 + 0); + *(b + 2) = *(a3 + 0); + *(b + 3) = *(a4 + 0); + *(b + 4) = *(a5 + 0); + *(b + 5) = *(a6 + 0); + *(b + 6) = *(a7 + 0); + *(b + 7) = *(a8 + 0); + *(b + 8) = *(a9 + 0); + *(b + 9) = *(a10 + 0); + *(b + 10) = *(a11 + 0); + *(b + 11) = *(a12 + 0); + *(b + 12) = *(a13 + 0); + *(b + 13) = *(a14 + 0); + *(b + 14) = *(a15 + 0); + *(b + 15) = *(a16 + 0); + } + + a1 ++; + a2 ++; + a3 ++; + a4 ++; + a5 ++; + a6 ++; + a7 ++; + a8 ++; + a9 ++; + a10 ++; + a11 ++; + a12 ++; + a13 ++; + a14 ++; + a15 ++; + a16 ++; + b += 16; + ii ++; + } + + jj += 16; + j --; + } + + if (n & 8) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + a += 8 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); + for (k = ii - jj + 1; k < 8; k ++) { + *(b + k) = *(a1 + k * lda); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a2 + 0); + *(b + 2) = *(a3 + 0); + *(b + 3) = *(a4 + 0); + *(b + 4) = *(a5 + 0); + *(b + 5) = *(a6 + 0); + *(b + 6) = *(a7 + 0); + *(b + 7) = *(a8 + 0); + } + + a1 ++; + a2 ++; + a3 ++; + a4 ++; + a5 ++; + a6 ++; + a7 ++; + a8 ++; + b += 8; + ii ++; + } + + jj += 8; + } + + if (n & 4) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a += 4 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); + for (k = ii - jj + 1; k < 4; k ++) { + *(b + k) = *(a1 + k * lda); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a2 + 0); + *(b + 2) = *(a3 + 0); + *(b + 3) = *(a4 + 0); + } + + a1 ++; + a2 ++; + a3 ++; + a4 ++; + b += 4; + ii ++; + } + + jj += 4; + } + + if (n & 2) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a += 2 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); + for (k = ii - jj + 1; k < 2; k ++) { + *(b + k) = *(a1 + k * lda); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a2 + 0); + } + + a1 ++; + a2 ++; + b += 2; + ii ++; + } + + jj += 2; + } + + if (n & 1) { + + a1 = a + 0 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); + for (k = ii - jj + 1; k < 1; k ++) { + *(b + k) = *(a1 + k * lda); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + } + + a1 ++; + b += 1; + ii ++; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_uncopy_2.c b/kernel/generic/trsm_uncopy_2.c new file mode 100644 index 0000000..f7f3435 --- /dev/null +++ b/kernel/generic/trsm_uncopy_2.c @@ -0,0 +1,160 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04; + FLOAT *a1, *a2; + + jj = offset; + + j = (n >> 1); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data03 = *(a2 + 0); +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + + *(b + 0) = INV(data01); + *(b + 1) = data03; + + *(b + 3) = INV(data04); + } + + if (ii < jj) { + + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data03; + *(b + 2) = data02; + *(b + 3) = data04; + } + + a1 += 2; + a2 += 2; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a2 + 0); + + *(b + 0) = INV(data01); + *(b + 1) = data02; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a2 + 0); + + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + + a += 2 * lda; + jj += 2; + j --; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1+= 1; + b += 1; + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_uncopy_4.c b/kernel/generic/trsm_uncopy_4.c new file mode 100644 index 0000000..837a250 --- /dev/null +++ b/kernel/generic/trsm_uncopy_4.c @@ -0,0 +1,350 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *a1, *a2, *a3, *a4; + + jj = offset; + + j = (n >> 2); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + i = (m >> 2); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data05 = *(a2 + 0); +#ifndef UNIT + data06 = *(a2 + 1); +#endif + + data09 = *(a3 + 0); + data10 = *(a3 + 1); +#ifndef UNIT + data11 = *(a3 + 2); +#endif + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); +#ifndef UNIT + data16 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data05; + *(b + 2) = data09; + *(b + 3) = data13; + + *(b + 5) = INV(data06); + *(b + 6) = data10; + *(b + 7) = data14; + + *(b + 10) = INV(data11); + *(b + 11) = data15; + + *(b + 15) = INV(data16); + } + + if (ii < jj) { + + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + data09 = *(a3 + 0); + data10 = *(a3 + 1); + data11 = *(a3 + 2); + data12 = *(a3 + 3); + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); + data16 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data05; + *(b + 2) = data09; + *(b + 3) = data13; + *(b + 4) = data02; + *(b + 5) = data06; + *(b + 6) = data10; + *(b + 7) = data14; + + *(b + 8) = data03; + *(b + 9) = data07; + *(b + 10) = data11; + *(b + 11) = data15; + *(b + 12) = data04; + *(b + 13) = data08; + *(b + 14) = data12; + *(b + 15) = data16; + } + + a1 += 4; + a2 += 4; + a3 += 4; + a4 += 4; + b += 16; + + i --; + ii += 4; + } + + if ((m & 2) != 0) { + + if (ii== jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data05 = *(a2 + 0); +#ifndef UNIT + data06 = *(a2 + 1); +#endif + + data09 = *(a3 + 0); + data10 = *(a3 + 1); + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + + *(b + 0) = INV(data01); + *(b + 1) = data05; + *(b + 2) = data09; + *(b + 3) = data13; + + *(b + 5) = INV(data06); + *(b + 6) = data10; + *(b + 7) = data14; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + data05 = *(a3 + 0); + data06 = *(a3 + 1); + data07 = *(a4 + 0); + data08 = *(a4 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + + a1 += 2; + a2 += 2; + b += 8; + + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data05 = *(a2 + 0); + data09 = *(a3 + 0); + data13 = *(a4 + 0); + + *(b + 0) = INV(data01); + *(b + 1) = data05; + *(b + 2) = data09; + *(b + 3) = data13; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a2 + 0); + data03 = *(a3 + 0); + data04 = *(a4 + 0); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + + a += 4 * lda; + jj += 4; + j --; + } + + if (n & 2) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data03 = *(a2 + 0); +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data03; + *(b + 3) = INV(data04); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data03; + *(b + 2) = data02; + *(b + 3) = data04; + } + + a1 += 2; + a2 += 2; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { + + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data03 = *(a2 + 0); + + *(b + 0) = INV(data01); + *(b + 1) = data03; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a2 + 0); + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + a += 2 * lda; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1+= 1; + b += 1; + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_uncopy_8.c b/kernel/generic/trsm_uncopy_8.c new file mode 100644 index 0000000..8c5623d --- /dev/null +++ b/kernel/generic/trsm_uncopy_8.c @@ -0,0 +1,946 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + FLOAT data33, data34, data35, data36, data37, data38, data39, data40; + FLOAT data41, data42, data43, data44, data45, data46, data47, data48; + FLOAT data49, data50, data51, data52, data53, data54, data55, data56; + FLOAT data57, data58, data59, data60, data61, data62, data63, data64; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; + + jj = offset; + + j = (n >> 3); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + + ii = 0; + + i = (m >> 3); + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + data17 = *(a3 + 0); + data18 = *(a3 + 1); +#ifndef UNIT + data19 = *(a3 + 2); +#endif + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); +#ifndef UNIT + data28 = *(a4 + 3); +#endif + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); +#ifndef UNIT + data37 = *(a5 + 4); +#endif + + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + data45 = *(a6 + 4); +#ifndef UNIT + data46 = *(a6 + 5); +#endif + + data49 = *(a7 + 0); + data50 = *(a7 + 1); + data51 = *(a7 + 2); + data52 = *(a7 + 3); + data53 = *(a7 + 4); + data54 = *(a7 + 5); +#ifndef UNIT + data55 = *(a7 + 6); +#endif + + data57 = *(a8 + 0); + data58 = *(a8 + 1); + data59 = *(a8 + 2); + data60 = *(a8 + 3); + data61 = *(a8 + 4); + data62 = *(a8 + 5); + data63 = *(a8 + 6); +#ifndef UNIT + data64 = *(a8 + 7); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + *(b + 6) = data49; + *(b + 7) = data57; + + *(b + 9) = INV(data10); + *(b + 10) = data18; + *(b + 11) = data26; + *(b + 12) = data34; + *(b + 13) = data42; + *(b + 14) = data50; + *(b + 15) = data58; + + *(b + 18) = INV(data19); + *(b + 19) = data27; + *(b + 20) = data35; + *(b + 21) = data43; + *(b + 22) = data51; + *(b + 23) = data59; + + *(b + 27) = INV(data28); + *(b + 28) = data36; + *(b + 29) = data44; + *(b + 30) = data52; + *(b + 31) = data60; + + *(b + 36) = INV(data37); + *(b + 37) = data45; + *(b + 38) = data53; + *(b + 39) = data61; + + *(b + 45) = INV(data46); + *(b + 46) = data54; + *(b + 47) = data62; + + *(b + 54) = INV(data55); + *(b + 55) = data63; + + *(b + 63) = INV(data64); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + data23 = *(a3 + 6); + data24 = *(a3 + 7); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); + data31 = *(a4 + 6); + data32 = *(a4 + 7); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); + data37 = *(a5 + 4); + data38 = *(a5 + 5); + data39 = *(a5 + 6); + data40 = *(a5 + 7); + + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + data45 = *(a6 + 4); + data46 = *(a6 + 5); + data47 = *(a6 + 6); + data48 = *(a6 + 7); + + data49 = *(a7 + 0); + data50 = *(a7 + 1); + data51 = *(a7 + 2); + data52 = *(a7 + 3); + data53 = *(a7 + 4); + data54 = *(a7 + 5); + data55 = *(a7 + 6); + data56 = *(a7 + 7); + + data57 = *(a8 + 0); + data58 = *(a8 + 1); + data59 = *(a8 + 2); + data60 = *(a8 + 3); + data61 = *(a8 + 4); + data62 = *(a8 + 5); + data63 = *(a8 + 6); + data64 = *(a8 + 7); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + *(b + 6) = data49; + *(b + 7) = data57; + + *(b + 8) = data02; + *(b + 9) = data10; + *(b + 10) = data18; + *(b + 11) = data26; + *(b + 12) = data34; + *(b + 13) = data42; + *(b + 14) = data50; + *(b + 15) = data58; + + *(b + 16) = data03; + *(b + 17) = data11; + *(b + 18) = data19; + *(b + 19) = data27; + *(b + 20) = data35; + *(b + 21) = data43; + *(b + 22) = data51; + *(b + 23) = data59; + + *(b + 24) = data04; + *(b + 25) = data12; + *(b + 26) = data20; + *(b + 27) = data28; + *(b + 28) = data36; + *(b + 29) = data44; + *(b + 30) = data52; + *(b + 31) = data60; + + *(b + 32) = data05; + *(b + 33) = data13; + *(b + 34) = data21; + *(b + 35) = data29; + *(b + 36) = data37; + *(b + 37) = data45; + *(b + 38) = data53; + *(b + 39) = data61; + + *(b + 40) = data06; + *(b + 41) = data14; + *(b + 42) = data22; + *(b + 43) = data30; + *(b + 44) = data38; + *(b + 45) = data46; + *(b + 46) = data54; + *(b + 47) = data62; + + *(b + 48) = data07; + *(b + 49) = data15; + *(b + 50) = data23; + *(b + 51) = data31; + *(b + 52) = data39; + *(b + 53) = data47; + *(b + 54) = data55; + *(b + 55) = data63; + + *(b + 56) = data08; + *(b + 57) = data16; + *(b + 58) = data24; + *(b + 59) = data32; + *(b + 60) = data40; + *(b + 61) = data48; + *(b + 62) = data56; + *(b + 63) = data64; + } + + a1 += 8; + a2 += 8; + a3 += 8; + a4 += 8; + a5 += 8; + a6 += 8; + a7 += 8; + a8 += 8; + b += 64; + + i --; + ii += 8; + } + + if (m & 4) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + data17 = *(a3 + 0); + data18 = *(a3 + 1); +#ifndef UNIT + data19 = *(a3 + 2); +#endif + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); +#ifndef UNIT + data28 = *(a4 + 3); +#endif + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); + + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + + data49 = *(a7 + 0); + data50 = *(a7 + 1); + data51 = *(a7 + 2); + data52 = *(a7 + 3); + + data57 = *(a8 + 0); + data58 = *(a8 + 1); + data59 = *(a8 + 2); + data60 = *(a8 + 3); + + *(b + 0) = INV(data01); + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + *(b + 6) = data49; + *(b + 7) = data57; + + *(b + 9) = INV(data10); + *(b + 10) = data18; + *(b + 11) = data26; + *(b + 12) = data34; + *(b + 13) = data42; + *(b + 14) = data50; + *(b + 15) = data58; + + *(b + 18) = INV(data19); + *(b + 19) = data27; + *(b + 20) = data35; + *(b + 21) = data43; + *(b + 22) = data51; + *(b + 23) = data59; + + *(b + 27) = INV(data28); + *(b + 28) = data36; + *(b + 29) = data44; + *(b + 30) = data52; + *(b + 31) = data60; + + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + + data49 = *(a7 + 0); + data50 = *(a7 + 1); + data51 = *(a7 + 2); + data52 = *(a7 + 3); + data57 = *(a8 + 0); + data58 = *(a8 + 1); + data59 = *(a8 + 2); + data60 = *(a8 + 3); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + *(b + 6) = data49; + *(b + 7) = data57; + + *(b + 8) = data02; + *(b + 9) = data10; + *(b + 10) = data18; + *(b + 11) = data26; + *(b + 12) = data34; + *(b + 13) = data42; + *(b + 14) = data50; + *(b + 15) = data58; + + *(b + 16) = data03; + *(b + 17) = data11; + *(b + 18) = data19; + *(b + 19) = data27; + *(b + 20) = data35; + *(b + 21) = data43; + *(b + 22) = data51; + *(b + 23) = data59; + + *(b + 24) = data04; + *(b + 25) = data12; + *(b + 26) = data20; + *(b + 27) = data28; + *(b + 28) = data36; + *(b + 29) = data44; + *(b + 30) = data52; + *(b + 31) = data60; + } + + a1 += 4; + a2 += 4; + a3 += 4; + a4 += 4; + a5 += 4; + a6 += 4; + a7 += 4; + a8 += 4; + b += 32; + ii += 4; + } + + if (m & 2) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data41 = *(a6 + 0); + data42 = *(a6 + 1); + + data49 = *(a7 + 0); + data50 = *(a7 + 1); + data57 = *(a8 + 0); + data58 = *(a8 + 1); + + *(b + 0) = INV(data01); + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + *(b + 6) = data49; + *(b + 7) = data57; + + *(b + 9) = INV(data10); + *(b + 10) = data18; + *(b + 11) = data26; + *(b + 12) = data34; + *(b + 13) = data42; + *(b + 14) = data50; + *(b + 15) = data58; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data49 = *(a7 + 0); + data50 = *(a7 + 1); + data57 = *(a8 + 0); + data58 = *(a8 + 1); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + *(b + 6) = data49; + *(b + 7) = data57; + + *(b + 8) = data02; + *(b + 9) = data10; + *(b + 10) = data18; + *(b + 11) = data26; + *(b + 12) = data34; + *(b + 13) = data42; + *(b + 14) = data50; + *(b + 15) = data58; + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + a5 += 2; + a6 += 2; + a7 += 2; + a8 += 2; + b += 16; + ii += 2; + } + + if (m & 1) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data09 = *(a2 + 0); + data17 = *(a3 + 0); + data25 = *(a4 + 0); + data33 = *(a5 + 0); + data41 = *(a6 + 0); + data49 = *(a7 + 0); + data57 = *(a8 + 0); + + *(b + 0) = INV(data01); + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + *(b + 6) = data49; + *(b + 7) = data57; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + *(b + 6) = data49; + *(b + 7) = data57; + } + b += 8; + ii += 1; + } + + a += 8 * lda; + jj += 8; + j --; + } + + + if (n & 4) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + ii = 0; + + i = (m >> 2); + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + data17 = *(a3 + 0); + data18 = *(a3 + 1); +#ifndef UNIT + data19 = *(a3 + 2); +#endif + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); +#ifndef UNIT + data28 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + + *(b + 5) = INV(data10); + *(b + 6) = data18; + *(b + 7) = data26; + + *(b + 10) = INV(data19); + *(b + 11) = data27; + + *(b + 15) = INV(data28); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data02; + *(b + 5) = data10; + *(b + 6) = data18; + *(b + 7) = data26; + + *(b + 8) = data03; + *(b + 9) = data11; + *(b + 10) = data19; + *(b + 11) = data27; + *(b + 12) = data04; + *(b + 13) = data12; + *(b + 14) = data20; + *(b + 15) = data28; + } + + a1 += 4; + a2 += 4; + a3 += 4; + a4 += 4; + b += 16; + + i --; + ii += 4; + } + + if (m & 2) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + + *(b + 0) = INV(data01); + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + + *(b + 5) = INV(data10); + *(b + 6) = data18; + *(b + 7) = data26; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data02; + *(b + 5) = data10; + *(b + 6) = data18; + *(b + 7) = data26; + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + ii += 2; + } + + if (m & 1) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data09 = *(a2 + 0); + data17 = *(a3 + 0); + data25 = *(a4 + 0); + + *(b + 0) = INV(data01); + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data09 = *(a2 + 0); + data17 = *(a3 + 0); + data25 = *(a4 + 0); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + } + b += 4; + ii += 1; + } + + a += 4 * lda; + jj += 4; + } + + if (n & 2) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + ii = 0; + + i = (m >> 1); + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data09; + + *(b + 3) = INV(data10); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data02; + *(b + 3) = data10; + } + + a1 += 2; + a2 += 2; + b += 4; + + i --; + ii += 2; + } + + if (m & 1) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data09 = *(a2 + 0); + + *(b + 0) = INV(data01); + *(b + 1) = data09; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data09 = *(a2 + 0); + + *(b + 0) = data01; + *(b + 1) = data09; + } + b += 2; + ii += 1; + } + + a += 2 * lda; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + ii = 0; + + i = m; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1 += 1; + b += 1; + i --; + ii ++; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_utcopy_1.c b/kernel/generic/trsm_utcopy_1.c new file mode 100644 index 0000000..ea490d5 --- /dev/null +++ b/kernel/generic/trsm_utcopy_1.c @@ -0,0 +1,89 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + +#ifndef UNIT + FLOAT data01; +#endif + FLOAT *a1; + + jj = offset; + + j = n; + while (j > 0){ + + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) *(b + 0) = *(a1 + 0); + + a1 += lda; + b ++; + i --; + ii ++; + } + + a ++; + jj ++; + j --; + } + + return 0; +} diff --git a/kernel/generic/trsm_utcopy_16.c b/kernel/generic/trsm_utcopy_16.c new file mode 100644 index 0000000..5466412 --- /dev/null +++ b/kernel/generic/trsm_utcopy_16.c @@ -0,0 +1,225 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1; + + jj = offset; + + j = (n >> 4); + while (j > 0){ + + a1 = a; + a += 16; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 16)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k) = *(a1 + k); + } + + *(b + ii - jj) = INV(*(a1 + ii - jj)); + } + + if (ii - jj >= 16) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + *(b + 8) = *(a1 + 8); + *(b + 9) = *(a1 + 9); + *(b + 10) = *(a1 + 10); + *(b + 11) = *(a1 + 11); + *(b + 12) = *(a1 + 12); + *(b + 13) = *(a1 + 13); + *(b + 14) = *(a1 + 14); + *(b + 15) = *(a1 + 15); + } + + b += 16; + a1 += lda; + ii ++; + } + + jj += 16; + j --; + } + + j = (n & 8); + if (j > 0) { + a1 = a; + a += 8; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k) = *(a1 + k); + } + + *(b + ii - jj) = INV(*(a1 + ii - jj)); + } + + if (ii - jj >= 8) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + } + + b += 8; + a1 += lda; + ii ++; + } + + jj += 8; + } + + j = (n & 4); + if (j > 0) { + + a1 = a; + a += 4; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k) = *(a1 + k); + } + + *(b + ii - jj) = INV(*(a1 + ii - jj)); + } + + if (ii - jj >= 4) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + } + + b += 4; + a1 += lda; + ii ++; + } + + jj += 4; + } + + j = (n & 2); + if (j > 0) { + + a1 = a; + a += 2; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k) = *(a1 + k); + } + + *(b + ii - jj) = INV(*(a1 + ii - jj)); + } + + if (ii - jj >= 2) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + } + + b += 2; + a1 += lda; + ii ++; + } + + jj += 2; + } + + j = (n & 1); + if (j > 0) { + + a1 = a; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k) = *(a1 + k); + } + + *(b + ii - jj) = INV(*(a1 + ii - jj)); + } + + if (ii - jj >= 1) { + *(b + 0) = *(a1 + 0); + } + + b += 1; + a1 += lda; + ii ++; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_utcopy_2.c b/kernel/generic/trsm_utcopy_2.c new file mode 100644 index 0000000..3def611 --- /dev/null +++ b/kernel/generic/trsm_utcopy_2.c @@ -0,0 +1,155 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04; + FLOAT *a1, *a2; + + jj = offset; + + j = (n >> 1); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data03 = *(a2 + 0); +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + + *(b + 2) = data03; + *(b + 3) = INV(data04); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + + a += 2; + jj += 2; + j --; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1 += 1 * lda; + b += 1; + + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_utcopy_4.c b/kernel/generic/trsm_utcopy_4.c new file mode 100644 index 0000000..bbba78d --- /dev/null +++ b/kernel/generic/trsm_utcopy_4.c @@ -0,0 +1,322 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *a1, *a2, *a3, *a4; + + jj = offset; + + j = (n >> 2); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + i = (m >> 2); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data05 = *(a2 + 0); +#ifndef UNIT + data06 = *(a2 + 1); +#endif + + data09 = *(a3 + 0); + data10 = *(a3 + 1); +#ifndef UNIT + data11 = *(a3 + 2); +#endif + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); +#ifndef UNIT + data16 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + + *(b + 4) = data05; + *(b + 5) = INV(data06); + + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = INV(data11); + + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = INV(data16); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + data09 = *(a3 + 0); + data10 = *(a3 + 1); + data11 = *(a3 + 2); + data12 = *(a3 + 3); + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); + data16 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + } + + a1 += 4 * lda; + a2 += 4 * lda; + a3 += 4 * lda; + a4 += 4 * lda; + b += 16; + + i --; + ii += 4; + } + + if ((m & 2) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data05 = *(a2 + 0); +#ifndef UNIT + data06 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + + *(b + 4) = data05; + *(b + 5) = INV(data06); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 8; + + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + + a += 4; + jj += 4; + j --; + } + + if (n & 2) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data03 = *(a2 + 0); +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 2) = data03; + *(b + 3) = INV(data04); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + a += 2; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1 += 1 * lda; + b += 1; + + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_utcopy_8.c b/kernel/generic/trsm_utcopy_8.c new file mode 100644 index 0000000..531ac59 --- /dev/null +++ b/kernel/generic/trsm_utcopy_8.c @@ -0,0 +1,803 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + FLOAT data33, data34, data35, data36, data37, data38, data39, data40; + FLOAT data41, data42, data43, data44, data45, data46, data47, data48; + FLOAT data49, data50, data51, data52, data53, data54, data55, data56; + FLOAT data57, data58, data59, data60, data61, data62, data63, data64; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; + + jj = offset; + + j = (n >> 3); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + + i = (m >> 3); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + data17 = *(a3 + 0); + data18 = *(a3 + 1); +#ifndef UNIT + data19 = *(a3 + 2); +#endif + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); +#ifndef UNIT + data28 = *(a4 + 3); +#endif + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); +#ifndef UNIT + data37 = *(a5 + 4); +#endif + + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + data45 = *(a6 + 4); +#ifndef UNIT + data46 = *(a6 + 5); +#endif + + data49 = *(a7 + 0); + data50 = *(a7 + 1); + data51 = *(a7 + 2); + data52 = *(a7 + 3); + data53 = *(a7 + 4); + data54 = *(a7 + 5); +#ifndef UNIT + data55 = *(a7 + 6); +#endif + + data57 = *(a8 + 0); + data58 = *(a8 + 1); + data59 = *(a8 + 2); + data60 = *(a8 + 3); + data61 = *(a8 + 4); + data62 = *(a8 + 5); + data63 = *(a8 + 6); +#ifndef UNIT + data64 = *(a8 + 7); +#endif + + *(b + 0) = INV(data01); + + *(b + 8) = data09; + *(b + 9) = INV(data10); + + *(b + 16) = data17; + *(b + 17) = data18; + *(b + 18) = INV(data19); + + *(b + 24) = data25; + *(b + 25) = data26; + *(b + 26) = data27; + *(b + 27) = INV(data28); + + *(b + 32) = data33; + *(b + 33) = data34; + *(b + 34) = data35; + *(b + 35) = data36; + *(b + 36) = INV(data37); + + *(b + 40) = data41; + *(b + 41) = data42; + *(b + 42) = data43; + *(b + 43) = data44; + *(b + 44) = data45; + *(b + 45) = INV(data46); + + *(b + 48) = data49; + *(b + 49) = data50; + *(b + 50) = data51; + *(b + 51) = data52; + *(b + 52) = data53; + *(b + 53) = data54; + *(b + 54) = INV(data55); + + *(b + 56) = data57; + *(b + 57) = data58; + *(b + 58) = data59; + *(b + 59) = data60; + *(b + 60) = data61; + *(b + 61) = data62; + *(b + 62) = data63; + *(b + 63) = INV(data64); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + data23 = *(a3 + 6); + data24 = *(a3 + 7); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); + data31 = *(a4 + 6); + data32 = *(a4 + 7); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); + data37 = *(a5 + 4); + data38 = *(a5 + 5); + data39 = *(a5 + 6); + data40 = *(a5 + 7); + + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + data45 = *(a6 + 4); + data46 = *(a6 + 5); + data47 = *(a6 + 6); + data48 = *(a6 + 7); + + data49 = *(a7 + 0); + data50 = *(a7 + 1); + data51 = *(a7 + 2); + data52 = *(a7 + 3); + data53 = *(a7 + 4); + data54 = *(a7 + 5); + data55 = *(a7 + 6); + data56 = *(a7 + 7); + + data57 = *(a8 + 0); + data58 = *(a8 + 1); + data59 = *(a8 + 2); + data60 = *(a8 + 3); + data61 = *(a8 + 4); + data62 = *(a8 + 5); + data63 = *(a8 + 6); + data64 = *(a8 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + + *(b + 16) = data17; + *(b + 17) = data18; + *(b + 18) = data19; + *(b + 19) = data20; + *(b + 20) = data21; + *(b + 21) = data22; + *(b + 22) = data23; + *(b + 23) = data24; + *(b + 24) = data25; + *(b + 25) = data26; + *(b + 26) = data27; + *(b + 27) = data28; + *(b + 28) = data29; + *(b + 29) = data30; + *(b + 30) = data31; + *(b + 31) = data32; + + *(b + 32) = data33; + *(b + 33) = data34; + *(b + 34) = data35; + *(b + 35) = data36; + *(b + 36) = data37; + *(b + 37) = data38; + *(b + 38) = data39; + *(b + 39) = data40; + *(b + 40) = data41; + *(b + 41) = data42; + *(b + 42) = data43; + *(b + 43) = data44; + *(b + 44) = data45; + *(b + 45) = data46; + *(b + 46) = data47; + *(b + 47) = data48; + + *(b + 48) = data49; + *(b + 49) = data50; + *(b + 50) = data51; + *(b + 51) = data52; + *(b + 52) = data53; + *(b + 53) = data54; + *(b + 54) = data55; + *(b + 55) = data56; + *(b + 56) = data57; + *(b + 57) = data58; + *(b + 58) = data59; + *(b + 59) = data60; + *(b + 60) = data61; + *(b + 61) = data62; + *(b + 62) = data63; + *(b + 63) = data64; + } + + a1 += 8 * lda; + a2 += 8 * lda; + a3 += 8 * lda; + a4 += 8 * lda; + a5 += 8 * lda; + a6 += 8 * lda; + a7 += 8 * lda; + a8 += 8 * lda; + b += 64; + + i --; + ii += 8; + } + + if (m & 4) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + data17 = *(a3 + 0); + data18 = *(a3 + 1); +#ifndef UNIT + data19 = *(a3 + 2); +#endif + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); +#ifndef UNIT + data28 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + + *(b + 8) = data09; + *(b + 9) = INV(data10); + + *(b + 16) = data17; + *(b + 17) = data18; + *(b + 18) = INV(data19); + + *(b + 24) = data25; + *(b + 25) = data26; + *(b + 26) = data27; + *(b + 27) = INV(data28); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + data23 = *(a3 + 6); + data24 = *(a3 + 7); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); + data31 = *(a4 + 6); + data32 = *(a4 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + + *(b + 16) = data17; + *(b + 17) = data18; + *(b + 18) = data19; + *(b + 19) = data20; + *(b + 20) = data21; + *(b + 21) = data22; + *(b + 22) = data23; + *(b + 23) = data24; + *(b + 24) = data25; + *(b + 25) = data26; + *(b + 26) = data27; + *(b + 27) = data28; + *(b + 28) = data29; + *(b + 29) = data30; + *(b + 30) = data31; + *(b + 31) = data32; + } + + a1 += 4 * lda; + a2 += 4 * lda; + a3 += 4 * lda; + a4 += 4 * lda; + b += 32; + ii += 4; + } + + if (m & 2) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 8) = data09; + *(b + 9) = INV(data10); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 16; + ii += 2; + } + + if (m & 1) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + b += 8; + } + + a += 8; + jj += 8; + j --; + } + + if (n & 4) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + i = (m >> 2); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + data17 = *(a3 + 0); + data18 = *(a3 + 1); +#ifndef UNIT + data19 = *(a3 + 2); +#endif + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); +#ifndef UNIT + data28 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + *(b + 4) = data09; + *(b + 5) = INV(data10); + + *(b + 8) = data17; + *(b + 9) = data18; + *(b + 10) = INV(data19); + + *(b + 12) = data25; + *(b + 13) = data26; + *(b + 14) = data27; + *(b + 15) = INV(data28); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data09; + *(b + 5) = data10; + *(b + 6) = data11; + *(b + 7) = data12; + + *(b + 8) = data17; + *(b + 9) = data18; + *(b + 10) = data19; + *(b + 11) = data20; + *(b + 12) = data25; + *(b + 13) = data26; + *(b + 14) = data27; + *(b + 15) = data28; + } + + a1 += 4 * lda; + a2 += 4 * lda; + a3 += 4 * lda; + a4 += 4 * lda; + b += 16; + i --; + ii += 4; + } + + if (m & 2) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data09; + *(b + 5) = data10; + *(b + 6) = data11; + *(b + 7) = data12; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 8; + ii += 2; + } + + if (m & 1) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + a += 4; + jj += 4; + } + + if (n & 2) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 2) = data09; + *(b + 3) = INV(data10); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data09; + *(b + 3) = data10; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 4; + i --; + ii += 2; + } + + if (m & 1) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + a += 2; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1 += lda; + b += 1; + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm3m_ncopy_1.c b/kernel/generic/zgemm3m_ncopy_1.c new file mode 100644 index 0000000..7ac734b --- /dev/null +++ b/kernel/generic/zgemm3m_ncopy_1.c @@ -0,0 +1,89 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i; + + FLOAT *a_offset, a1, a2; + + lda *= 2; + + while (n > 0) { + a_offset = a; + a += lda; + + for (i = 0; i < m; i ++) { + + a1 = *(a_offset + 0); + a2 = *(a_offset + 1); + + *(b + 0) = CMULT(a1, a2); + + a_offset += 2; + + b ++; + } + n --; + } + + return 0; +} diff --git a/kernel/generic/zgemm3m_ncopy_2.c b/kernel/generic/zgemm3m_ncopy_2.c new file mode 100644 index 0000000..702524a --- /dev/null +++ b/kernel/generic/zgemm3m_ncopy_2.c @@ -0,0 +1,120 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2; + FLOAT *b_offset; + FLOAT a1, a2, a3, a4; + + lda *= 2; + + a_offset = a; + b_offset = b; + + j = (n >> 1); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + for (i = 0; i < m; i ++) { + + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset2 + 0); + a4 = *(a_offset2 + 1); + + *(b_offset + 0) = CMULT(a1, a2); + *(b_offset + 1) = CMULT(a3, a4); + + a_offset1 += 2; + a_offset2 += 2; + + b_offset += 2; + + } + + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 1) { + a_offset1 = a_offset; + + for (i = 0; i < m; i ++) { + + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + + *(b_offset + 0) = CMULT(a1, a2); + + a_offset1 += 2; + + b_offset += 1; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm3m_ncopy_4.c b/kernel/generic/zgemm3m_ncopy_4.c new file mode 100644 index 0000000..1117d77 --- /dev/null +++ b/kernel/generic/zgemm3m_ncopy_4.c @@ -0,0 +1,153 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset; + FLOAT a1, a2, a3, a4, a5, a6, a7, a8; + + lda *= 2; + + a_offset = a; + b_offset = b; + + j = (n >> 2); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + for (i = 0; i < m; i ++) { + + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset2 + 0); + a4 = *(a_offset2 + 1); + a5 = *(a_offset3 + 0); + a6 = *(a_offset3 + 1); + a7 = *(a_offset4 + 0); + a8 = *(a_offset4 + 1); + + *(b_offset + 0) = CMULT(a1, a2); + *(b_offset + 1) = CMULT(a3, a4); + *(b_offset + 2) = CMULT(a5, a6); + *(b_offset + 3) = CMULT(a7, a8); + + a_offset1 += 2; + a_offset2 += 2; + a_offset3 += 2; + a_offset4 += 2; + + b_offset += 4; + + } + + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 2) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + for (i = 0; i < m; i ++) { + + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset2 + 0); + a4 = *(a_offset2 + 1); + + *(b_offset + 0) = CMULT(a1, a2); + *(b_offset + 1) = CMULT(a3, a4); + + a_offset1 += 2; + a_offset2 += 2; + + b_offset += 2; + + } + } + + if (n & 1) { + a_offset1 = a_offset; + + for (i = 0; i < m; i ++) { + + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + + *(b_offset + 0) = CMULT(a1, a2); + + a_offset1 += 2; + + b_offset += 1; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm3m_ncopy_8.c b/kernel/generic/zgemm3m_ncopy_8.c new file mode 100644 index 0000000..0c3cb5d --- /dev/null +++ b/kernel/generic/zgemm3m_ncopy_8.c @@ -0,0 +1,216 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *a_offset5, *a_offset6, *a_offset7, *a_offset8; + FLOAT *b_offset; + FLOAT a1, a2, a3, a4, a5, a6, a7, a8; + FLOAT a9, a10, a11, a12, a13, a14, a15, a16; + +#if 0 +#ifdef REAL_ONLY + fprintf(stderr, "NON Real "); +#elif defined(IMAGE_ONLY) + fprintf(stderr, "NON Image "); +#else + fprintf(stderr, "NON Both "); +#endif + +#ifdef ICOPY + fprintf(stderr, " ICOPY %ld x %ld\n", m, n); +#else + fprintf(stderr, " OCOPY %ld x %ld\n", m, n); +#endif +#endif + + lda *= 2; + + a_offset = a; + b_offset = b; + + j = (n >> 3); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset5 = a_offset4 + lda; + a_offset6 = a_offset5 + lda; + a_offset7 = a_offset6 + lda; + a_offset8 = a_offset7 + lda; + a_offset += 8 * lda; + + for (i = 0; i < m; i ++) { + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset2 + 0); + a4 = *(a_offset2 + 1); + a5 = *(a_offset3 + 0); + a6 = *(a_offset3 + 1); + a7 = *(a_offset4 + 0); + a8 = *(a_offset4 + 1); + a9 = *(a_offset5 + 0); + a10 = *(a_offset5 + 1); + a11 = *(a_offset6 + 0); + a12 = *(a_offset6 + 1); + a13 = *(a_offset7 + 0); + a14 = *(a_offset7 + 1); + a15 = *(a_offset8 + 0); + a16 = *(a_offset8 + 1); + + *(b_offset + 0) = CMULT(a1, a2); + *(b_offset + 1) = CMULT(a3, a4); + *(b_offset + 2) = CMULT(a5, a6); + *(b_offset + 3) = CMULT(a7, a8); + *(b_offset + 4) = CMULT(a9, a10); + *(b_offset + 5) = CMULT(a11, a12); + *(b_offset + 6) = CMULT(a13, a14); + *(b_offset + 7) = CMULT(a15, a16); + + a_offset1 += 2; + a_offset2 += 2; + a_offset3 += 2; + a_offset4 += 2; + a_offset5 += 2; + a_offset6 += 2; + a_offset7 += 2; + a_offset8 += 2; + + b_offset += 8; + } + + j--; + }while(j > 0); + } + + if (n & 4){ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + for (i = 0; i < m; i ++) { + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset2 + 0); + a4 = *(a_offset2 + 1); + a5 = *(a_offset3 + 0); + a6 = *(a_offset3 + 1); + a7 = *(a_offset4 + 0); + a8 = *(a_offset4 + 1); + + *(b_offset + 0) = CMULT(a1, a2); + *(b_offset + 1) = CMULT(a3, a4); + *(b_offset + 2) = CMULT(a5, a6); + *(b_offset + 3) = CMULT(a7, a8); + + a_offset1 += 2; + a_offset2 += 2; + a_offset3 += 2; + a_offset4 += 2; + + b_offset += 4; + } + } + + if (n & 2){ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + for (i = 0; i < m; i ++) { + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset2 + 0); + a4 = *(a_offset2 + 1); + + *(b_offset + 0) = CMULT(a1, a2); + *(b_offset + 1) = CMULT(a3, a4); + + a_offset1 += 2; + a_offset2 += 2; + + b_offset += 2; + } + } + + if (n & 1){ + a_offset1 = a_offset; + + for (i = 0; i < m; i ++) { + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + + *(b_offset + 0) = CMULT(a1, a2); + + a_offset1 += 2; + b_offset += 1; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm3m_tcopy_1.c b/kernel/generic/zgemm3m_tcopy_1.c new file mode 100644 index 0000000..47cf7e5 --- /dev/null +++ b/kernel/generic/zgemm3m_tcopy_1.c @@ -0,0 +1,89 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i; + + FLOAT *a_offset, a1, a2; + + lda *= 2; + + while (n > 0) { + a_offset = a; + a += 2; + + for (i = 0; i < m; i ++) { + + a1 = *(a_offset + 0); + a2 = *(a_offset + 1); + + *(b + 0) = CMULT(a1, a2); + + a_offset += lda; + + b ++; + } + n --; + } + + return 0; +} diff --git a/kernel/generic/zgemm3m_tcopy_2.c b/kernel/generic/zgemm3m_tcopy_2.c new file mode 100644 index 0000000..f6fe10b --- /dev/null +++ b/kernel/generic/zgemm3m_tcopy_2.c @@ -0,0 +1,162 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2; + FLOAT *b_offset, *b_offset1, *b_offset2; + FLOAT a1, a2, a3, a4, a5, a6, a7, a8; + + a_offset = a; + b_offset = b; + + lda *= 2; + + b_offset2 = b + m * (n & ~1); + + j = (m >> 1); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset += 4; + + i = (n >> 1); + if (i > 0){ + do{ + + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + a5 = *(a_offset2 + 0); + a6 = *(a_offset2 + 1); + a7 = *(a_offset2 + 2); + a8 = *(a_offset2 + 3); + + *(b_offset1 + 0) = CMULT(a1, a2); + *(b_offset1 + 1) = CMULT(a3, a4); + *(b_offset1 + 2) = CMULT(a5, a6); + *(b_offset1 + 3) = CMULT(a7, a8); + + a_offset1 += 4; + a_offset2 += 4; + + b_offset1 += m * 2; + i --; + }while(i > 0); + } + + if (n & 1) { + + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset2 + 0); + a4 = *(a_offset2 + 1); + + *(b_offset2 + 0) = CMULT(a1, a2); + *(b_offset2 + 1) = CMULT(a3, a4); + + b_offset2 += 2; + } + + j--; + }while(j > 0); + } + + if (m & 1){ + a_offset1 = a_offset; + b_offset1 = b_offset; + + i = (n >> 1); + if (i > 0){ + do{ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + + *(b_offset1 + 0) = CMULT(a1, a2); + *(b_offset1 + 1) = CMULT(a3, a4); + + a_offset1 += 4; + + b_offset1 += 2 * m; + + i --; + }while(i > 0); + } + + if (n & 1) { + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + + *(b_offset2 + 0) = CMULT(a1, a2); + } + } + + return 0; +} diff --git a/kernel/generic/zgemm3m_tcopy_4.c b/kernel/generic/zgemm3m_tcopy_4.c new file mode 100644 index 0000000..e072262 --- /dev/null +++ b/kernel/generic/zgemm3m_tcopy_4.c @@ -0,0 +1,352 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3; + FLOAT a1, a2, a3, a4, a5, a6, a7, a8; + + a_offset = a; + b_offset = b; + + lda *= 2; + + b_offset2 = b + m * (n & ~3); + b_offset3 = b + m * (n & ~1); + + j = (m >> 2); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + b_offset1 = b_offset; + b_offset += 16; + + i = (n >> 2); + if (i > 0){ + do{ + + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + a5 = *(a_offset1 + 4); + a6 = *(a_offset1 + 5); + a7 = *(a_offset1 + 6); + a8 = *(a_offset1 + 7); + + *(b_offset1 + 0) = CMULT(a1, a2); + *(b_offset1 + 1) = CMULT(a3, a4); + *(b_offset1 + 2) = CMULT(a5, a6); + *(b_offset1 + 3) = CMULT(a7, a8); + + a1 = *(a_offset2 + 0); + a2 = *(a_offset2 + 1); + a3 = *(a_offset2 + 2); + a4 = *(a_offset2 + 3); + a5 = *(a_offset2 + 4); + a6 = *(a_offset2 + 5); + a7 = *(a_offset2 + 6); + a8 = *(a_offset2 + 7); + + *(b_offset1 + 4) = CMULT(a1, a2); + *(b_offset1 + 5) = CMULT(a3, a4); + *(b_offset1 + 6) = CMULT(a5, a6); + *(b_offset1 + 7) = CMULT(a7, a8); + + a1 = *(a_offset3 + 0); + a2 = *(a_offset3 + 1); + a3 = *(a_offset3 + 2); + a4 = *(a_offset3 + 3); + a5 = *(a_offset3 + 4); + a6 = *(a_offset3 + 5); + a7 = *(a_offset3 + 6); + a8 = *(a_offset3 + 7); + + *(b_offset1 + 8) = CMULT(a1, a2); + *(b_offset1 + 9) = CMULT(a3, a4); + *(b_offset1 + 10) = CMULT(a5, a6); + *(b_offset1 + 11) = CMULT(a7, a8); + + a1 = *(a_offset4 + 0); + a2 = *(a_offset4 + 1); + a3 = *(a_offset4 + 2); + a4 = *(a_offset4 + 3); + a5 = *(a_offset4 + 4); + a6 = *(a_offset4 + 5); + a7 = *(a_offset4 + 6); + a8 = *(a_offset4 + 7); + + *(b_offset1 + 12) = CMULT(a1, a2); + *(b_offset1 + 13) = CMULT(a3, a4); + *(b_offset1 + 14) = CMULT(a5, a6); + *(b_offset1 + 15) = CMULT(a7, a8); + + a_offset1 += 8; + a_offset2 += 8; + a_offset3 += 8; + a_offset4 += 8; + + b_offset1 += m * 4; + i --; + }while(i > 0); + } + + if (n & 2) { + + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + a5 = *(a_offset2 + 0); + a6 = *(a_offset2 + 1); + a7 = *(a_offset2 + 2); + a8 = *(a_offset2 + 3); + + *(b_offset2 + 0) = CMULT(a1, a2); + *(b_offset2 + 1) = CMULT(a3, a4); + *(b_offset2 + 2) = CMULT(a5, a6); + *(b_offset2 + 3) = CMULT(a7, a8); + + a1 = *(a_offset3 + 0); + a2 = *(a_offset3 + 1); + a3 = *(a_offset3 + 2); + a4 = *(a_offset3 + 3); + a5 = *(a_offset4 + 0); + a6 = *(a_offset4 + 1); + a7 = *(a_offset4 + 2); + a8 = *(a_offset4 + 3); + + *(b_offset2 + 4) = CMULT(a1, a2); + *(b_offset2 + 5) = CMULT(a3, a4); + *(b_offset2 + 6) = CMULT(a5, a6); + *(b_offset2 + 7) = CMULT(a7, a8); + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + + b_offset2 += 8; + } + + if (n & 1) { + + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset2 + 0); + a4 = *(a_offset2 + 1); + a5 = *(a_offset3 + 0); + a6 = *(a_offset3 + 1); + a7 = *(a_offset4 + 0); + a8 = *(a_offset4 + 1); + + *(b_offset3 + 0) = CMULT(a1, a2); + *(b_offset3 + 1) = CMULT(a3, a4); + *(b_offset3 + 2) = CMULT(a5, a6); + *(b_offset3 + 3) = CMULT(a7, a8); + + b_offset3 += 4; + } + + j--; + }while(j > 0); + } + + if (m & 2){ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset += 8; + + i = (n >> 2); + if (i > 0){ + do{ + + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + a5 = *(a_offset1 + 4); + a6 = *(a_offset1 + 5); + a7 = *(a_offset1 + 6); + a8 = *(a_offset1 + 7); + + *(b_offset1 + 0) = CMULT(a1, a2); + *(b_offset1 + 1) = CMULT(a3, a4); + *(b_offset1 + 2) = CMULT(a5, a6); + *(b_offset1 + 3) = CMULT(a7, a8); + + a1 = *(a_offset2 + 0); + a2 = *(a_offset2 + 1); + a3 = *(a_offset2 + 2); + a4 = *(a_offset2 + 3); + a5 = *(a_offset2 + 4); + a6 = *(a_offset2 + 5); + a7 = *(a_offset2 + 6); + a8 = *(a_offset2 + 7); + + *(b_offset1 + 4) = CMULT(a1, a2); + *(b_offset1 + 5) = CMULT(a3, a4); + *(b_offset1 + 6) = CMULT(a5, a6); + *(b_offset1 + 7) = CMULT(a7, a8); + + a_offset1 += 8; + a_offset2 += 8; + + b_offset1 += m * 4; + i --; + }while(i > 0); + } + + if (n & 2) { + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + a5 = *(a_offset2 + 0); + a6 = *(a_offset2 + 1); + a7 = *(a_offset2 + 2); + a8 = *(a_offset2 + 3); + + *(b_offset2 + 0) = CMULT(a1, a2); + *(b_offset2 + 1) = CMULT(a3, a4); + *(b_offset2 + 2) = CMULT(a5, a6); + *(b_offset2 + 3) = CMULT(a7, a8); + + a_offset1 += 4; + a_offset2 += 4; + b_offset2 += 4; + } + + if (n & 1) { + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset2 + 0); + a4 = *(a_offset2 + 1); + + *(b_offset3 + 0) = CMULT(a1, a2); + *(b_offset3 + 1) = CMULT(a3, a4); + + b_offset3 += 2; + } + } + + if (m & 1){ + a_offset1 = a_offset; + b_offset1 = b_offset; + + i = (n >> 2); + if (i > 0){ + do{ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + a5 = *(a_offset1 + 4); + a6 = *(a_offset1 + 5); + a7 = *(a_offset1 + 6); + a8 = *(a_offset1 + 7); + + *(b_offset1 + 0) = CMULT(a1, a2); + *(b_offset1 + 1) = CMULT(a3, a4); + *(b_offset1 + 2) = CMULT(a5, a6); + *(b_offset1 + 3) = CMULT(a7, a8); + + a_offset1 += 8; + + b_offset1 += 4 * m; + + i --; + }while(i > 0); + } + + if (n & 2) { + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + + *(b_offset2 + 0) = CMULT(a1, a2); + *(b_offset2 + 1) = CMULT(a3, a4); + + a_offset1 += 4; + } + + if (n & 1) { + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + + *(b_offset3 + 0) = CMULT(a1, a2); + } + } + + return 0; +} diff --git a/kernel/generic/zgemm3m_tcopy_8.c b/kernel/generic/zgemm3m_tcopy_8.c new file mode 100644 index 0000000..e68bccf --- /dev/null +++ b/kernel/generic/zgemm3m_tcopy_8.c @@ -0,0 +1,1072 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *a_offset5, *a_offset6, *a_offset7, *a_offset8; + FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3, *b_offset4; + FLOAT a1, a2, a3, a4, a5, a6, a7, a8; + FLOAT a9, a10, a11, a12, a13, a14, a15, a16; + +#if 0 +#ifdef REAL_ONLY + fprintf(stderr, "TNS Real "); +#elif defined(IMAGE_ONLY) + fprintf(stderr, "TNS Image "); +#else + fprintf(stderr, "TNS Both "); +#endif + +#ifdef ICOPY + fprintf(stderr, " ICOPY %ld x %ld\n", m, n); +#else + fprintf(stderr, " OCOPY %ld x %ld\n", m, n); +#endif +#endif + + a_offset = a; + b_offset = b; + + lda *= 2; + + b_offset2 = b + m * (n & ~7); + b_offset3 = b + m * (n & ~3); + b_offset4 = b + m * (n & ~1); + + j = (m >> 3); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset5 = a_offset4 + lda; + a_offset6 = a_offset5 + lda; + a_offset7 = a_offset6 + lda; + a_offset8 = a_offset7 + lda; + + a_offset += 8 * lda; + + b_offset1 = b_offset; + b_offset += 64; + + i = (n >> 3); + if (i > 0){ + do{ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + a5 = *(a_offset1 + 4); + a6 = *(a_offset1 + 5); + a7 = *(a_offset1 + 6); + a8 = *(a_offset1 + 7); + a9 = *(a_offset1 + 8); + a10 = *(a_offset1 + 9); + a11 = *(a_offset1 + 10); + a12 = *(a_offset1 + 11); + a13 = *(a_offset1 + 12); + a14 = *(a_offset1 + 13); + a15 = *(a_offset1 + 14); + a16 = *(a_offset1 + 15); + + *(b_offset1 + 0) = CMULT(a1, a2); + *(b_offset1 + 1) = CMULT(a3, a4); + *(b_offset1 + 2) = CMULT(a5, a6); + *(b_offset1 + 3) = CMULT(a7, a8); + *(b_offset1 + 4) = CMULT(a9, a10); + *(b_offset1 + 5) = CMULT(a11, a12); + *(b_offset1 + 6) = CMULT(a13, a14); + *(b_offset1 + 7) = CMULT(a15, a16); + + a1 = *(a_offset2 + 0); + a2 = *(a_offset2 + 1); + a3 = *(a_offset2 + 2); + a4 = *(a_offset2 + 3); + a5 = *(a_offset2 + 4); + a6 = *(a_offset2 + 5); + a7 = *(a_offset2 + 6); + a8 = *(a_offset2 + 7); + a9 = *(a_offset2 + 8); + a10 = *(a_offset2 + 9); + a11 = *(a_offset2 + 10); + a12 = *(a_offset2 + 11); + a13 = *(a_offset2 + 12); + a14 = *(a_offset2 + 13); + a15 = *(a_offset2 + 14); + a16 = *(a_offset2 + 15); + + *(b_offset1 + 8) = CMULT(a1, a2); + *(b_offset1 + 9) = CMULT(a3, a4); + *(b_offset1 + 10) = CMULT(a5, a6); + *(b_offset1 + 11) = CMULT(a7, a8); + *(b_offset1 + 12) = CMULT(a9, a10); + *(b_offset1 + 13) = CMULT(a11, a12); + *(b_offset1 + 14) = CMULT(a13, a14); + *(b_offset1 + 15) = CMULT(a15, a16); + + a1 = *(a_offset3 + 0); + a2 = *(a_offset3 + 1); + a3 = *(a_offset3 + 2); + a4 = *(a_offset3 + 3); + a5 = *(a_offset3 + 4); + a6 = *(a_offset3 + 5); + a7 = *(a_offset3 + 6); + a8 = *(a_offset3 + 7); + a9 = *(a_offset3 + 8); + a10 = *(a_offset3 + 9); + a11 = *(a_offset3 + 10); + a12 = *(a_offset3 + 11); + a13 = *(a_offset3 + 12); + a14 = *(a_offset3 + 13); + a15 = *(a_offset3 + 14); + a16 = *(a_offset3 + 15); + + *(b_offset1 + 16) = CMULT(a1, a2); + *(b_offset1 + 17) = CMULT(a3, a4); + *(b_offset1 + 18) = CMULT(a5, a6); + *(b_offset1 + 19) = CMULT(a7, a8); + *(b_offset1 + 20) = CMULT(a9, a10); + *(b_offset1 + 21) = CMULT(a11, a12); + *(b_offset1 + 22) = CMULT(a13, a14); + *(b_offset1 + 23) = CMULT(a15, a16); + + a1 = *(a_offset4 + 0); + a2 = *(a_offset4 + 1); + a3 = *(a_offset4 + 2); + a4 = *(a_offset4 + 3); + a5 = *(a_offset4 + 4); + a6 = *(a_offset4 + 5); + a7 = *(a_offset4 + 6); + a8 = *(a_offset4 + 7); + a9 = *(a_offset4 + 8); + a10 = *(a_offset4 + 9); + a11 = *(a_offset4 + 10); + a12 = *(a_offset4 + 11); + a13 = *(a_offset4 + 12); + a14 = *(a_offset4 + 13); + a15 = *(a_offset4 + 14); + a16 = *(a_offset4 + 15); + + *(b_offset1 + 24) = CMULT(a1, a2); + *(b_offset1 + 25) = CMULT(a3, a4); + *(b_offset1 + 26) = CMULT(a5, a6); + *(b_offset1 + 27) = CMULT(a7, a8); + *(b_offset1 + 28) = CMULT(a9, a10); + *(b_offset1 + 29) = CMULT(a11, a12); + *(b_offset1 + 30) = CMULT(a13, a14); + *(b_offset1 + 31) = CMULT(a15, a16); + + a1 = *(a_offset5 + 0); + a2 = *(a_offset5 + 1); + a3 = *(a_offset5 + 2); + a4 = *(a_offset5 + 3); + a5 = *(a_offset5 + 4); + a6 = *(a_offset5 + 5); + a7 = *(a_offset5 + 6); + a8 = *(a_offset5 + 7); + a9 = *(a_offset5 + 8); + a10 = *(a_offset5 + 9); + a11 = *(a_offset5 + 10); + a12 = *(a_offset5 + 11); + a13 = *(a_offset5 + 12); + a14 = *(a_offset5 + 13); + a15 = *(a_offset5 + 14); + a16 = *(a_offset5 + 15); + + *(b_offset1 + 32) = CMULT(a1, a2); + *(b_offset1 + 33) = CMULT(a3, a4); + *(b_offset1 + 34) = CMULT(a5, a6); + *(b_offset1 + 35) = CMULT(a7, a8); + *(b_offset1 + 36) = CMULT(a9, a10); + *(b_offset1 + 37) = CMULT(a11, a12); + *(b_offset1 + 38) = CMULT(a13, a14); + *(b_offset1 + 39) = CMULT(a15, a16); + + a1 = *(a_offset6 + 0); + a2 = *(a_offset6 + 1); + a3 = *(a_offset6 + 2); + a4 = *(a_offset6 + 3); + a5 = *(a_offset6 + 4); + a6 = *(a_offset6 + 5); + a7 = *(a_offset6 + 6); + a8 = *(a_offset6 + 7); + a9 = *(a_offset6 + 8); + a10 = *(a_offset6 + 9); + a11 = *(a_offset6 + 10); + a12 = *(a_offset6 + 11); + a13 = *(a_offset6 + 12); + a14 = *(a_offset6 + 13); + a15 = *(a_offset6 + 14); + a16 = *(a_offset6 + 15); + + *(b_offset1 + 40) = CMULT(a1, a2); + *(b_offset1 + 41) = CMULT(a3, a4); + *(b_offset1 + 42) = CMULT(a5, a6); + *(b_offset1 + 43) = CMULT(a7, a8); + *(b_offset1 + 44) = CMULT(a9, a10); + *(b_offset1 + 45) = CMULT(a11, a12); + *(b_offset1 + 46) = CMULT(a13, a14); + *(b_offset1 + 47) = CMULT(a15, a16); + + a1 = *(a_offset7 + 0); + a2 = *(a_offset7 + 1); + a3 = *(a_offset7 + 2); + a4 = *(a_offset7 + 3); + a5 = *(a_offset7 + 4); + a6 = *(a_offset7 + 5); + a7 = *(a_offset7 + 6); + a8 = *(a_offset7 + 7); + a9 = *(a_offset7 + 8); + a10 = *(a_offset7 + 9); + a11 = *(a_offset7 + 10); + a12 = *(a_offset7 + 11); + a13 = *(a_offset7 + 12); + a14 = *(a_offset7 + 13); + a15 = *(a_offset7 + 14); + a16 = *(a_offset7 + 15); + + *(b_offset1 + 48) = CMULT(a1, a2); + *(b_offset1 + 49) = CMULT(a3, a4); + *(b_offset1 + 50) = CMULT(a5, a6); + *(b_offset1 + 51) = CMULT(a7, a8); + *(b_offset1 + 52) = CMULT(a9, a10); + *(b_offset1 + 53) = CMULT(a11, a12); + *(b_offset1 + 54) = CMULT(a13, a14); + *(b_offset1 + 55) = CMULT(a15, a16); + + a1 = *(a_offset8 + 0); + a2 = *(a_offset8 + 1); + a3 = *(a_offset8 + 2); + a4 = *(a_offset8 + 3); + a5 = *(a_offset8 + 4); + a6 = *(a_offset8 + 5); + a7 = *(a_offset8 + 6); + a8 = *(a_offset8 + 7); + a9 = *(a_offset8 + 8); + a10 = *(a_offset8 + 9); + a11 = *(a_offset8 + 10); + a12 = *(a_offset8 + 11); + a13 = *(a_offset8 + 12); + a14 = *(a_offset8 + 13); + a15 = *(a_offset8 + 14); + a16 = *(a_offset8 + 15); + + *(b_offset1 + 56) = CMULT(a1, a2); + *(b_offset1 + 57) = CMULT(a3, a4); + *(b_offset1 + 58) = CMULT(a5, a6); + *(b_offset1 + 59) = CMULT(a7, a8); + *(b_offset1 + 60) = CMULT(a9, a10); + *(b_offset1 + 61) = CMULT(a11, a12); + *(b_offset1 + 62) = CMULT(a13, a14); + *(b_offset1 + 63) = CMULT(a15, a16); + + a_offset1 += 16; + a_offset2 += 16; + a_offset3 += 16; + a_offset4 += 16; + a_offset5 += 16; + a_offset6 += 16; + a_offset7 += 16; + a_offset8 += 16; + + b_offset1 += m * 8; + i --; + }while(i > 0); + } + + if (n & 4){ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + a5 = *(a_offset1 + 4); + a6 = *(a_offset1 + 5); + a7 = *(a_offset1 + 6); + a8 = *(a_offset1 + 7); + + *(b_offset2 + 0) = CMULT(a1, a2); + *(b_offset2 + 1) = CMULT(a3, a4); + *(b_offset2 + 2) = CMULT(a5, a6); + *(b_offset2 + 3) = CMULT(a7, a8); + + a1 = *(a_offset2 + 0); + a2 = *(a_offset2 + 1); + a3 = *(a_offset2 + 2); + a4 = *(a_offset2 + 3); + a5 = *(a_offset2 + 4); + a6 = *(a_offset2 + 5); + a7 = *(a_offset2 + 6); + a8 = *(a_offset2 + 7); + + *(b_offset2 + 4) = CMULT(a1, a2); + *(b_offset2 + 5) = CMULT(a3, a4); + *(b_offset2 + 6) = CMULT(a5, a6); + *(b_offset2 + 7) = CMULT(a7, a8); + + a1 = *(a_offset3 + 0); + a2 = *(a_offset3 + 1); + a3 = *(a_offset3 + 2); + a4 = *(a_offset3 + 3); + a5 = *(a_offset3 + 4); + a6 = *(a_offset3 + 5); + a7 = *(a_offset3 + 6); + a8 = *(a_offset3 + 7); + + *(b_offset2 + 8) = CMULT(a1, a2); + *(b_offset2 + 9) = CMULT(a3, a4); + *(b_offset2 + 10) = CMULT(a5, a6); + *(b_offset2 + 11) = CMULT(a7, a8); + + a1 = *(a_offset4 + 0); + a2 = *(a_offset4 + 1); + a3 = *(a_offset4 + 2); + a4 = *(a_offset4 + 3); + a5 = *(a_offset4 + 4); + a6 = *(a_offset4 + 5); + a7 = *(a_offset4 + 6); + a8 = *(a_offset4 + 7); + + *(b_offset2 + 12) = CMULT(a1, a2); + *(b_offset2 + 13) = CMULT(a3, a4); + *(b_offset2 + 14) = CMULT(a5, a6); + *(b_offset2 + 15) = CMULT(a7, a8); + + a1 = *(a_offset5 + 0); + a2 = *(a_offset5 + 1); + a3 = *(a_offset5 + 2); + a4 = *(a_offset5 + 3); + a5 = *(a_offset5 + 4); + a6 = *(a_offset5 + 5); + a7 = *(a_offset5 + 6); + a8 = *(a_offset5 + 7); + + *(b_offset2 + 16) = CMULT(a1, a2); + *(b_offset2 + 17) = CMULT(a3, a4); + *(b_offset2 + 18) = CMULT(a5, a6); + *(b_offset2 + 19) = CMULT(a7, a8); + + a1 = *(a_offset6 + 0); + a2 = *(a_offset6 + 1); + a3 = *(a_offset6 + 2); + a4 = *(a_offset6 + 3); + a5 = *(a_offset6 + 4); + a6 = *(a_offset6 + 5); + a7 = *(a_offset6 + 6); + a8 = *(a_offset6 + 7); + + *(b_offset2 + 20) = CMULT(a1, a2); + *(b_offset2 + 21) = CMULT(a3, a4); + *(b_offset2 + 22) = CMULT(a5, a6); + *(b_offset2 + 23) = CMULT(a7, a8); + + a1 = *(a_offset7 + 0); + a2 = *(a_offset7 + 1); + a3 = *(a_offset7 + 2); + a4 = *(a_offset7 + 3); + a5 = *(a_offset7 + 4); + a6 = *(a_offset7 + 5); + a7 = *(a_offset7 + 6); + a8 = *(a_offset7 + 7); + + *(b_offset2 + 24) = CMULT(a1, a2); + *(b_offset2 + 25) = CMULT(a3, a4); + *(b_offset2 + 26) = CMULT(a5, a6); + *(b_offset2 + 27) = CMULT(a7, a8); + + a1 = *(a_offset8 + 0); + a2 = *(a_offset8 + 1); + a3 = *(a_offset8 + 2); + a4 = *(a_offset8 + 3); + a5 = *(a_offset8 + 4); + a6 = *(a_offset8 + 5); + a7 = *(a_offset8 + 6); + a8 = *(a_offset8 + 7); + + *(b_offset2 + 28) = CMULT(a1, a2); + *(b_offset2 + 29) = CMULT(a3, a4); + *(b_offset2 + 30) = CMULT(a5, a6); + *(b_offset2 + 31) = CMULT(a7, a8); + + a_offset1 += 8; + a_offset2 += 8; + a_offset3 += 8; + a_offset4 += 8; + a_offset5 += 8; + a_offset6 += 8; + a_offset7 += 8; + a_offset8 += 8; + + b_offset2 += 32; + } + + if (n & 2){ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + + *(b_offset3 + 0) = CMULT(a1, a2); + *(b_offset3 + 1) = CMULT(a3, a4); + + a1 = *(a_offset2 + 0); + a2 = *(a_offset2 + 1); + a3 = *(a_offset2 + 2); + a4 = *(a_offset2 + 3); + + *(b_offset3 + 2) = CMULT(a1, a2); + *(b_offset3 + 3) = CMULT(a3, a4); + + a1 = *(a_offset3 + 0); + a2 = *(a_offset3 + 1); + a3 = *(a_offset3 + 2); + a4 = *(a_offset3 + 3); + + *(b_offset3 + 4) = CMULT(a1, a2); + *(b_offset3 + 5) = CMULT(a3, a4); + + a1 = *(a_offset4 + 0); + a2 = *(a_offset4 + 1); + a3 = *(a_offset4 + 2); + a4 = *(a_offset4 + 3); + + *(b_offset3 + 6) = CMULT(a1, a2); + *(b_offset3 + 7) = CMULT(a3, a4); + + a1 = *(a_offset5 + 0); + a2 = *(a_offset5 + 1); + a3 = *(a_offset5 + 2); + a4 = *(a_offset5 + 3); + + *(b_offset3 + 8) = CMULT(a1, a2); + *(b_offset3 + 9) = CMULT(a3, a4); + + a1 = *(a_offset6 + 0); + a2 = *(a_offset6 + 1); + a3 = *(a_offset6 + 2); + a4 = *(a_offset6 + 3); + + *(b_offset3 + 10) = CMULT(a1, a2); + *(b_offset3 + 11) = CMULT(a3, a4); + + a1 = *(a_offset7 + 0); + a2 = *(a_offset7 + 1); + a3 = *(a_offset7 + 2); + a4 = *(a_offset7 + 3); + + *(b_offset3 + 12) = CMULT(a1, a2); + *(b_offset3 + 13) = CMULT(a3, a4); + + a1 = *(a_offset8 + 0); + a2 = *(a_offset8 + 1); + a3 = *(a_offset8 + 2); + a4 = *(a_offset8 + 3); + + *(b_offset3 + 14) = CMULT(a1, a2); + *(b_offset3 + 15) = CMULT(a3, a4); + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + a_offset5 += 4; + a_offset6 += 4; + a_offset7 += 4; + a_offset8 += 4; + + b_offset3 += 16; + } + + if (n & 1){ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + + *(b_offset4 + 0) = CMULT(a1, a2); + + a1 = *(a_offset2 + 0); + a2 = *(a_offset2 + 1); + + *(b_offset4 + 1) = CMULT(a1, a2); + + a1 = *(a_offset3 + 0); + a2 = *(a_offset3 + 1); + + *(b_offset4 + 2) = CMULT(a1, a2); + + a1 = *(a_offset4 + 0); + a2 = *(a_offset4 + 1); + + *(b_offset4 + 3) = CMULT(a1, a2); + + a1 = *(a_offset5 + 0); + a2 = *(a_offset5 + 1); + + *(b_offset4 + 4) = CMULT(a1, a2); + + a1 = *(a_offset6 + 0); + a2 = *(a_offset6 + 1); + + *(b_offset4 + 5) = CMULT(a1, a2); + + a1 = *(a_offset7 + 0); + a2 = *(a_offset7 + 1); + + *(b_offset4 + 6) = CMULT(a1, a2); + + a1 = *(a_offset8 + 0); + a2 = *(a_offset8 + 1); + + *(b_offset4 + 7) = CMULT(a1, a2); + + b_offset4 += 8; + } + + j--; + }while(j > 0); + } + + if (m & 4){ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + b_offset1 = b_offset; + b_offset += 32; + + i = (n >> 3); + if (i > 0){ + do{ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + a5 = *(a_offset1 + 4); + a6 = *(a_offset1 + 5); + a7 = *(a_offset1 + 6); + a8 = *(a_offset1 + 7); + a9 = *(a_offset1 + 8); + a10 = *(a_offset1 + 9); + a11 = *(a_offset1 + 10); + a12 = *(a_offset1 + 11); + a13 = *(a_offset1 + 12); + a14 = *(a_offset1 + 13); + a15 = *(a_offset1 + 14); + a16 = *(a_offset1 + 15); + + *(b_offset1 + 0) = CMULT(a1, a2); + *(b_offset1 + 1) = CMULT(a3, a4); + *(b_offset1 + 2) = CMULT(a5, a6); + *(b_offset1 + 3) = CMULT(a7, a8); + *(b_offset1 + 4) = CMULT(a9, a10); + *(b_offset1 + 5) = CMULT(a11, a12); + *(b_offset1 + 6) = CMULT(a13, a14); + *(b_offset1 + 7) = CMULT(a15, a16); + + a1 = *(a_offset2 + 0); + a2 = *(a_offset2 + 1); + a3 = *(a_offset2 + 2); + a4 = *(a_offset2 + 3); + a5 = *(a_offset2 + 4); + a6 = *(a_offset2 + 5); + a7 = *(a_offset2 + 6); + a8 = *(a_offset2 + 7); + a9 = *(a_offset2 + 8); + a10 = *(a_offset2 + 9); + a11 = *(a_offset2 + 10); + a12 = *(a_offset2 + 11); + a13 = *(a_offset2 + 12); + a14 = *(a_offset2 + 13); + a15 = *(a_offset2 + 14); + a16 = *(a_offset2 + 15); + + *(b_offset1 + 8) = CMULT(a1, a2); + *(b_offset1 + 9) = CMULT(a3, a4); + *(b_offset1 + 10) = CMULT(a5, a6); + *(b_offset1 + 11) = CMULT(a7, a8); + *(b_offset1 + 12) = CMULT(a9, a10); + *(b_offset1 + 13) = CMULT(a11, a12); + *(b_offset1 + 14) = CMULT(a13, a14); + *(b_offset1 + 15) = CMULT(a15, a16); + + a1 = *(a_offset3 + 0); + a2 = *(a_offset3 + 1); + a3 = *(a_offset3 + 2); + a4 = *(a_offset3 + 3); + a5 = *(a_offset3 + 4); + a6 = *(a_offset3 + 5); + a7 = *(a_offset3 + 6); + a8 = *(a_offset3 + 7); + a9 = *(a_offset3 + 8); + a10 = *(a_offset3 + 9); + a11 = *(a_offset3 + 10); + a12 = *(a_offset3 + 11); + a13 = *(a_offset3 + 12); + a14 = *(a_offset3 + 13); + a15 = *(a_offset3 + 14); + a16 = *(a_offset3 + 15); + + *(b_offset1 + 16) = CMULT(a1, a2); + *(b_offset1 + 17) = CMULT(a3, a4); + *(b_offset1 + 18) = CMULT(a5, a6); + *(b_offset1 + 19) = CMULT(a7, a8); + *(b_offset1 + 20) = CMULT(a9, a10); + *(b_offset1 + 21) = CMULT(a11, a12); + *(b_offset1 + 22) = CMULT(a13, a14); + *(b_offset1 + 23) = CMULT(a15, a16); + + a1 = *(a_offset4 + 0); + a2 = *(a_offset4 + 1); + a3 = *(a_offset4 + 2); + a4 = *(a_offset4 + 3); + a5 = *(a_offset4 + 4); + a6 = *(a_offset4 + 5); + a7 = *(a_offset4 + 6); + a8 = *(a_offset4 + 7); + a9 = *(a_offset4 + 8); + a10 = *(a_offset4 + 9); + a11 = *(a_offset4 + 10); + a12 = *(a_offset4 + 11); + a13 = *(a_offset4 + 12); + a14 = *(a_offset4 + 13); + a15 = *(a_offset4 + 14); + a16 = *(a_offset4 + 15); + + *(b_offset1 + 24) = CMULT(a1, a2); + *(b_offset1 + 25) = CMULT(a3, a4); + *(b_offset1 + 26) = CMULT(a5, a6); + *(b_offset1 + 27) = CMULT(a7, a8); + *(b_offset1 + 28) = CMULT(a9, a10); + *(b_offset1 + 29) = CMULT(a11, a12); + *(b_offset1 + 30) = CMULT(a13, a14); + *(b_offset1 + 31) = CMULT(a15, a16); + + a_offset1 += 16; + a_offset2 += 16; + a_offset3 += 16; + a_offset4 += 16; + + b_offset1 += m * 8; + i --; + }while(i > 0); + } + + if (n & 4){ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + a5 = *(a_offset1 + 4); + a6 = *(a_offset1 + 5); + a7 = *(a_offset1 + 6); + a8 = *(a_offset1 + 7); + + *(b_offset2 + 0) = CMULT(a1, a2); + *(b_offset2 + 1) = CMULT(a3, a4); + *(b_offset2 + 2) = CMULT(a5, a6); + *(b_offset2 + 3) = CMULT(a7, a8); + + a1 = *(a_offset2 + 0); + a2 = *(a_offset2 + 1); + a3 = *(a_offset2 + 2); + a4 = *(a_offset2 + 3); + a5 = *(a_offset2 + 4); + a6 = *(a_offset2 + 5); + a7 = *(a_offset2 + 6); + a8 = *(a_offset2 + 7); + + *(b_offset2 + 4) = CMULT(a1, a2); + *(b_offset2 + 5) = CMULT(a3, a4); + *(b_offset2 + 6) = CMULT(a5, a6); + *(b_offset2 + 7) = CMULT(a7, a8); + + a1 = *(a_offset3 + 0); + a2 = *(a_offset3 + 1); + a3 = *(a_offset3 + 2); + a4 = *(a_offset3 + 3); + a5 = *(a_offset3 + 4); + a6 = *(a_offset3 + 5); + a7 = *(a_offset3 + 6); + a8 = *(a_offset3 + 7); + + *(b_offset2 + 8) = CMULT(a1, a2); + *(b_offset2 + 9) = CMULT(a3, a4); + *(b_offset2 + 10) = CMULT(a5, a6); + *(b_offset2 + 11) = CMULT(a7, a8); + + a1 = *(a_offset4 + 0); + a2 = *(a_offset4 + 1); + a3 = *(a_offset4 + 2); + a4 = *(a_offset4 + 3); + a5 = *(a_offset4 + 4); + a6 = *(a_offset4 + 5); + a7 = *(a_offset4 + 6); + a8 = *(a_offset4 + 7); + + *(b_offset2 + 12) = CMULT(a1, a2); + *(b_offset2 + 13) = CMULT(a3, a4); + *(b_offset2 + 14) = CMULT(a5, a6); + *(b_offset2 + 15) = CMULT(a7, a8); + + a_offset1 += 8; + a_offset2 += 8; + a_offset3 += 8; + a_offset4 += 8; + + b_offset2 += 16; + } + + if (n & 2){ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + + *(b_offset3 + 0) = CMULT(a1, a2); + *(b_offset3 + 1) = CMULT(a3, a4); + + a1 = *(a_offset2 + 0); + a2 = *(a_offset2 + 1); + a3 = *(a_offset2 + 2); + a4 = *(a_offset2 + 3); + + *(b_offset3 + 2) = CMULT(a1, a2); + *(b_offset3 + 3) = CMULT(a3, a4); + + a1 = *(a_offset3 + 0); + a2 = *(a_offset3 + 1); + a3 = *(a_offset3 + 2); + a4 = *(a_offset3 + 3); + + *(b_offset3 + 4) = CMULT(a1, a2); + *(b_offset3 + 5) = CMULT(a3, a4); + + a1 = *(a_offset4 + 0); + a2 = *(a_offset4 + 1); + a3 = *(a_offset4 + 2); + a4 = *(a_offset4 + 3); + + *(b_offset3 + 6) = CMULT(a1, a2); + *(b_offset3 + 7) = CMULT(a3, a4); + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + + b_offset3 += 8; + } + + if (n & 1){ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + + *(b_offset4 + 0) = CMULT(a1, a2); + + a1 = *(a_offset2 + 0); + a2 = *(a_offset2 + 1); + + *(b_offset4 + 1) = CMULT(a1, a2); + + a1 = *(a_offset3 + 0); + a2 = *(a_offset3 + 1); + + *(b_offset4 + 2) = CMULT(a1, a2); + + a1 = *(a_offset4 + 0); + a2 = *(a_offset4 + 1); + + *(b_offset4 + 3) = CMULT(a1, a2); + + b_offset4 += 4; + } + } + + if (m & 2){ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset += 16; + + i = (n >> 3); + if (i > 0){ + do{ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + a5 = *(a_offset1 + 4); + a6 = *(a_offset1 + 5); + a7 = *(a_offset1 + 6); + a8 = *(a_offset1 + 7); + a9 = *(a_offset1 + 8); + a10 = *(a_offset1 + 9); + a11 = *(a_offset1 + 10); + a12 = *(a_offset1 + 11); + a13 = *(a_offset1 + 12); + a14 = *(a_offset1 + 13); + a15 = *(a_offset1 + 14); + a16 = *(a_offset1 + 15); + + *(b_offset1 + 0) = CMULT(a1, a2); + *(b_offset1 + 1) = CMULT(a3, a4); + *(b_offset1 + 2) = CMULT(a5, a6); + *(b_offset1 + 3) = CMULT(a7, a8); + *(b_offset1 + 4) = CMULT(a9, a10); + *(b_offset1 + 5) = CMULT(a11, a12); + *(b_offset1 + 6) = CMULT(a13, a14); + *(b_offset1 + 7) = CMULT(a15, a16); + + a1 = *(a_offset2 + 0); + a2 = *(a_offset2 + 1); + a3 = *(a_offset2 + 2); + a4 = *(a_offset2 + 3); + a5 = *(a_offset2 + 4); + a6 = *(a_offset2 + 5); + a7 = *(a_offset2 + 6); + a8 = *(a_offset2 + 7); + a9 = *(a_offset2 + 8); + a10 = *(a_offset2 + 9); + a11 = *(a_offset2 + 10); + a12 = *(a_offset2 + 11); + a13 = *(a_offset2 + 12); + a14 = *(a_offset2 + 13); + a15 = *(a_offset2 + 14); + a16 = *(a_offset2 + 15); + + *(b_offset1 + 8) = CMULT(a1, a2); + *(b_offset1 + 9) = CMULT(a3, a4); + *(b_offset1 + 10) = CMULT(a5, a6); + *(b_offset1 + 11) = CMULT(a7, a8); + *(b_offset1 + 12) = CMULT(a9, a10); + *(b_offset1 + 13) = CMULT(a11, a12); + *(b_offset1 + 14) = CMULT(a13, a14); + *(b_offset1 + 15) = CMULT(a15, a16); + + a_offset1 += 16; + a_offset2 += 16; + + b_offset1 += m * 8; + i --; + }while(i > 0); + } + + if (n & 4){ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + a5 = *(a_offset1 + 4); + a6 = *(a_offset1 + 5); + a7 = *(a_offset1 + 6); + a8 = *(a_offset1 + 7); + + *(b_offset2 + 0) = CMULT(a1, a2); + *(b_offset2 + 1) = CMULT(a3, a4); + *(b_offset2 + 2) = CMULT(a5, a6); + *(b_offset2 + 3) = CMULT(a7, a8); + + a1 = *(a_offset2 + 0); + a2 = *(a_offset2 + 1); + a3 = *(a_offset2 + 2); + a4 = *(a_offset2 + 3); + a5 = *(a_offset2 + 4); + a6 = *(a_offset2 + 5); + a7 = *(a_offset2 + 6); + a8 = *(a_offset2 + 7); + + *(b_offset2 + 4) = CMULT(a1, a2); + *(b_offset2 + 5) = CMULT(a3, a4); + *(b_offset2 + 6) = CMULT(a5, a6); + *(b_offset2 + 7) = CMULT(a7, a8); + + a_offset1 += 8; + a_offset2 += 8; + + b_offset2 += 8; + } + + if (n & 2){ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + + *(b_offset3 + 0) = CMULT(a1, a2); + *(b_offset3 + 1) = CMULT(a3, a4); + + a1 = *(a_offset2 + 0); + a2 = *(a_offset2 + 1); + a3 = *(a_offset2 + 2); + a4 = *(a_offset2 + 3); + + *(b_offset3 + 2) = CMULT(a1, a2); + *(b_offset3 + 3) = CMULT(a3, a4); + + a_offset1 += 4; + a_offset2 += 4; + + b_offset3 += 4; + } + + if (n & 1){ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + + *(b_offset4 + 0) = CMULT(a1, a2); + + a1 = *(a_offset2 + 0); + a2 = *(a_offset2 + 1); + + *(b_offset4 + 1) = CMULT(a1, a2); + + b_offset4 += 2; + } + } + + if (m & 1){ + a_offset1 = a_offset; + b_offset1 = b_offset; + + i = (n >> 3); + if (i > 0){ + do{ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + a5 = *(a_offset1 + 4); + a6 = *(a_offset1 + 5); + a7 = *(a_offset1 + 6); + a8 = *(a_offset1 + 7); + a9 = *(a_offset1 + 8); + a10 = *(a_offset1 + 9); + a11 = *(a_offset1 + 10); + a12 = *(a_offset1 + 11); + a13 = *(a_offset1 + 12); + a14 = *(a_offset1 + 13); + a15 = *(a_offset1 + 14); + a16 = *(a_offset1 + 15); + + *(b_offset1 + 0) = CMULT(a1, a2); + *(b_offset1 + 1) = CMULT(a3, a4); + *(b_offset1 + 2) = CMULT(a5, a6); + *(b_offset1 + 3) = CMULT(a7, a8); + *(b_offset1 + 4) = CMULT(a9, a10); + *(b_offset1 + 5) = CMULT(a11, a12); + *(b_offset1 + 6) = CMULT(a13, a14); + *(b_offset1 + 7) = CMULT(a15, a16); + + a_offset1 += 16; + + b_offset1 += m * 8; + i --; + }while(i > 0); + } + + if (n & 4){ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + a5 = *(a_offset1 + 4); + a6 = *(a_offset1 + 5); + a7 = *(a_offset1 + 6); + a8 = *(a_offset1 + 7); + + *(b_offset2 + 0) = CMULT(a1, a2); + *(b_offset2 + 1) = CMULT(a3, a4); + *(b_offset2 + 2) = CMULT(a5, a6); + *(b_offset2 + 3) = CMULT(a7, a8); + + a_offset1 += 8; + b_offset2 += 4; + } + + if (n & 2){ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + + *(b_offset3 + 0) = CMULT(a1, a2); + *(b_offset3 + 1) = CMULT(a3, a4); + + a_offset1 += 4; + b_offset3 += 2; + } + + if (n & 1){ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + + *(b_offset4 + 0) = CMULT(a1, a2); + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_beta.c b/kernel/generic/zgemm_beta.c new file mode 100644 index 0000000..b7a77a2 --- /dev/null +++ b/kernel/generic/zgemm_beta.c @@ -0,0 +1,158 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, + FLOAT beta_r, FLOAT beta_i, + FLOAT *dummy2, BLASLONG dummy3, + FLOAT *dummy4, BLASLONG dummy5, + FLOAT *c, BLASLONG ldc){ + BLASLONG i, j; + + FLOAT *c_offset, *c_offset1; + FLOAT atemp1, atemp2, atemp3, atemp4; + FLOAT btemp1, btemp2, btemp3, btemp4; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + + ldc *= 2; + + c_offset = c; + + if (beta_r == 0. && beta_i == 0.) { + j = n; + do { + c_offset1 = c_offset; + c_offset += ldc; + + i = (m >> 2); + if (i > 0){ + do { + *(c_offset1 + 0) = ZERO; + *(c_offset1 + 1) = ZERO; + *(c_offset1 + 2) = ZERO; + *(c_offset1 + 3) = ZERO; + *(c_offset1 + 4) = ZERO; + *(c_offset1 + 5) = ZERO; + *(c_offset1 + 6) = ZERO; + *(c_offset1 + 7) = ZERO; + c_offset1 += 8; + i--; + } while (i > 0); + } + + i = (m & 3); + if (i > 0){ + do { + *(c_offset1 + 0) = ZERO; + *(c_offset1 + 1) = ZERO; + c_offset1 += 2; + i--; + } while (i > 0); + } + j --; + } while (j > 0); + + } else { + + j = n; + do { + c_offset1 = c_offset; + c_offset += ldc; + + i = (m >> 1); + if (i > 0){ + do { + atemp1 = *(c_offset1 + 0); + atemp2 = *(c_offset1 + 1); + atemp3 = *(c_offset1 + 2); + atemp4 = *(c_offset1 + 3); + + btemp1 = beta_r * atemp1; + btemp2 = beta_i * atemp2; + btemp3 = beta_r * atemp2; + btemp4 = beta_i * atemp1; + + ctemp1 = btemp1 - btemp2; + ctemp2 = btemp3 + btemp4; + + btemp1 = beta_r * atemp3; + btemp2 = beta_i * atemp4; + btemp3 = beta_r * atemp4; + btemp4 = beta_i * atemp3; + + ctemp3 = btemp1 - btemp2; + ctemp4 = btemp3 + btemp4; + + *(c_offset1 + 0) = ctemp1; + *(c_offset1 + 1) = ctemp2; + *(c_offset1 + 2) = ctemp3; + *(c_offset1 + 3) = ctemp4; + c_offset1 += 4; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i > 0){ + do { + atemp1 = *(c_offset1 + 0); + atemp2 = *(c_offset1 + 1); + + btemp1 = beta_r * atemp1; + btemp2 = beta_i * atemp2; + btemp3 = beta_r * atemp2; + btemp4 = beta_i * atemp1; + + ctemp1 = btemp1 - btemp2; + ctemp2 = btemp3 + btemp4; + + *(c_offset1 + 0) = ctemp1; + *(c_offset1 + 1) = ctemp2; + c_offset1 += 2; + i --; + } while (i > 0); + } + j --; + } while (j > 0); + } + return 0; +} diff --git a/kernel/generic/zgemm_ncopy_1.c b/kernel/generic/zgemm_ncopy_1.c new file mode 100644 index 0000000..6679a33 --- /dev/null +++ b/kernel/generic/zgemm_ncopy_1.c @@ -0,0 +1,107 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + + FLOAT *a_offset; + FLOAT *b_offset; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + + a_offset = a; + b_offset = b; + + lda *= 2; + + i = n; + + if (i > 0){ + do { + + j = (m >> 2); + if (j > 0){ + do{ + ctemp1 = *(a_offset + 0); + ctemp2 = *(a_offset + 1); + ctemp3 = *(a_offset + 2); + ctemp4 = *(a_offset + 3); + + ctemp5 = *(a_offset + 4); + ctemp6 = *(a_offset + 5); + ctemp7 = *(a_offset + 6); + ctemp8 = *(a_offset + 7); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp2; + *(b_offset + 2) = ctemp3; + *(b_offset + 3) = ctemp4; + + *(b_offset + 4) = ctemp5; + *(b_offset + 5) = ctemp6; + *(b_offset + 6) = ctemp7; + *(b_offset + 7) = ctemp8; + + a_offset += 8; + b_offset += 8; + j --; + } while(j>0); + } + + j = (m & 3); + if (j > 0){ + do{ + ctemp1 = *(a_offset + 0); + ctemp2 = *(a_offset + 1); + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp2; + a_offset += 2; + b_offset += 2; + j --; + } while(j>0); + } + a_offset += lda - m * 2; + i--; + } while (i > 0); + } + + return 0; +} diff --git a/kernel/generic/zgemm_ncopy_2.c b/kernel/generic/zgemm_ncopy_2.c new file mode 100644 index 0000000..2d5f255 --- /dev/null +++ b/kernel/generic/zgemm_ncopy_2.c @@ -0,0 +1,183 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2; + FLOAT *b_offset; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + FLOAT ctemp9, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + + a_offset = a; + b_offset = b; + + lda *= 2; + + i = (n >> 1); + + if (i > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset + lda; + a_offset += 2 * lda; + + j = (m >> 2); + if (j > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset2 + 0); + ctemp4 = *(a_offset2 + 1); + + ctemp5 = *(a_offset1 + 2); + ctemp6 = *(a_offset1 + 3); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + ctemp9 = *(a_offset1 + 4); + ctemp10 = *(a_offset1 + 5); + ctemp11 = *(a_offset2 + 4); + ctemp12 = *(a_offset2 + 5); + + ctemp13 = *(a_offset1 + 6); + ctemp14 = *(a_offset1 + 7); + ctemp15 = *(a_offset2 + 6); + ctemp16 = *(a_offset2 + 7); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp2; + *(b_offset + 2) = ctemp3; + *(b_offset + 3) = ctemp4; + + *(b_offset + 4) = ctemp5; + *(b_offset + 5) = ctemp6; + *(b_offset + 6) = ctemp7; + *(b_offset + 7) = ctemp8; + + *(b_offset + 8) = ctemp9; + *(b_offset + 9) = ctemp10; + *(b_offset +10) = ctemp11; + *(b_offset +11) = ctemp12; + + *(b_offset +12) = ctemp13; + *(b_offset +13) = ctemp14; + *(b_offset +14) = ctemp15; + *(b_offset +15) = ctemp16; + + a_offset1 += 8; + a_offset2 += 8; + b_offset += 16; + j --; + } while(j>0); + } + + j = (m & 3); + if (j > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset2 + 0); + ctemp4 = *(a_offset2 + 1); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp2; + *(b_offset + 2) = ctemp3; + *(b_offset + 3) = ctemp4; + + a_offset1 += 2; + a_offset2 += 2; + b_offset += 4; + j --; + } while(j>0); + } + i --; + } while(i>0); + } + + if (n & 1){ + j = (m >> 2); + if (j > 0){ + do{ + ctemp1 = *(a_offset + 0); + ctemp2 = *(a_offset + 1); + ctemp5 = *(a_offset + 2); + ctemp6 = *(a_offset + 3); + + ctemp9 = *(a_offset + 4); + ctemp10 = *(a_offset + 5); + ctemp13 = *(a_offset + 6); + ctemp14 = *(a_offset + 7); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp2; + *(b_offset + 2) = ctemp5; + *(b_offset + 3) = ctemp6; + + *(b_offset + 4) = ctemp9; + *(b_offset + 5) = ctemp10; + *(b_offset + 6) = ctemp13; + *(b_offset + 7) = ctemp14; + + a_offset += 8; + b_offset += 8; + j --; + } while(j>0); + } + + j = (m & 3); + if (j > 0){ + do{ + ctemp1 = *(a_offset + 0); + ctemp2 = *(a_offset + 1); + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp2; + a_offset += 2; + b_offset += 2; + j --; + } while(j > 0); + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_ncopy_4.c b/kernel/generic/zgemm_ncopy_4.c new file mode 100644 index 0000000..abd1d57 --- /dev/null +++ b/kernel/generic/zgemm_ncopy_4.c @@ -0,0 +1,387 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + + FLOAT *boffset; + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + + + aoffset = a; + boffset = b; + lda *= 2; + +#if 0 + fprintf(stderr, "m = %d n = %d\n", m,n ); +#endif + + j = (n >> 2); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + ctemp17 = *(aoffset3 + 0); + ctemp18 = *(aoffset3 + 1); + ctemp19 = *(aoffset3 + 2); + ctemp20 = *(aoffset3 + 3); + ctemp21 = *(aoffset3 + 4); + ctemp22 = *(aoffset3 + 5); + ctemp23 = *(aoffset3 + 6); + ctemp24 = *(aoffset3 + 7); + + ctemp25 = *(aoffset4 + 0); + ctemp26 = *(aoffset4 + 1); + ctemp27 = *(aoffset4 + 2); + ctemp28 = *(aoffset4 + 3); + ctemp29 = *(aoffset4 + 4); + ctemp30 = *(aoffset4 + 5); + ctemp31 = *(aoffset4 + 6); + ctemp32 = *(aoffset4 + 7); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp09; + *(boffset + 3) = ctemp10; + *(boffset + 4) = ctemp17; + *(boffset + 5) = ctemp18; + *(boffset + 6) = ctemp25; + *(boffset + 7) = ctemp26; + + *(boffset + 8) = ctemp03; + *(boffset + 9) = ctemp04; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp19; + *(boffset + 13) = ctemp20; + *(boffset + 14) = ctemp27; + *(boffset + 15) = ctemp28; + + *(boffset + 16) = ctemp05; + *(boffset + 17) = ctemp06; + *(boffset + 18) = ctemp13; + *(boffset + 19) = ctemp14; + *(boffset + 20) = ctemp21; + *(boffset + 21) = ctemp22; + *(boffset + 22) = ctemp29; + *(boffset + 23) = ctemp30; + + *(boffset + 24) = ctemp07; + *(boffset + 25) = ctemp08; + *(boffset + 26) = ctemp15; + *(boffset + 27) = ctemp16; + *(boffset + 28) = ctemp23; + *(boffset + 29) = ctemp24; + *(boffset + 30) = ctemp31; + *(boffset + 31) = ctemp32; + + aoffset1 += 8; + aoffset2 += 8; + aoffset3 += 8; + aoffset4 += 8; + boffset += 32; + i --; + }while(i > 0); + } + + if (m & 2) { + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + ctemp09 = *(aoffset3 + 0); + ctemp10 = *(aoffset3 + 1); + ctemp11 = *(aoffset3 + 2); + ctemp12 = *(aoffset3 + 3); + + ctemp13 = *(aoffset4 + 0); + ctemp14 = *(aoffset4 + 1); + ctemp15 = *(aoffset4 + 2); + ctemp16 = *(aoffset4 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp06; + *(boffset + 4) = ctemp09; + *(boffset + 5) = ctemp10; + *(boffset + 6) = ctemp13; + *(boffset + 7) = ctemp14; + + *(boffset + 8) = ctemp03; + *(boffset + 9) = ctemp04; + *(boffset + 10) = ctemp07; + *(boffset + 11) = ctemp08; + *(boffset + 12) = ctemp11; + *(boffset + 13) = ctemp12; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + boffset += 16; + } + + if (m & 1) { + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + boffset += 8; + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp09; + *(boffset + 3) = ctemp10; + *(boffset + 4) = ctemp03; + *(boffset + 5) = ctemp04; + *(boffset + 6) = ctemp11; + *(boffset + 7) = ctemp12; + + *(boffset + 8) = ctemp05; + *(boffset + 9) = ctemp06; + *(boffset + 10) = ctemp13; + *(boffset + 11) = ctemp14; + *(boffset + 12) = ctemp07; + *(boffset + 13) = ctemp08; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + aoffset1 += 8; + aoffset2 += 8; + boffset += 16; + i --; + }while(i > 0); + } + + if (m & 2) { + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp06; + *(boffset + 4) = ctemp03; + *(boffset + 5) = ctemp04; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + aoffset1 += 4; + aoffset2 += 4; + boffset += 8; + } + + if (m & 1) { + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + aoffset1 += 2; + aoffset2 += 2; + boffset += 4; + } + } + + if (n & 1){ + aoffset1 = aoffset; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + aoffset1 += 8; + boffset += 8; + i --; + }while(i > 0); + } + + if (m & 2) { + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + aoffset1 += 4; + boffset += 4; + } + + if (m & 1) { + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_ncopy_8.c b/kernel/generic/zgemm_ncopy_8.c new file mode 100644 index 0000000..6490285 --- /dev/null +++ b/kernel/generic/zgemm_ncopy_8.c @@ -0,0 +1,213 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + + FLOAT *boffset; + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + + aoffset = a; + boffset = b; + lda *= 2; + + j = (n >> 3); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + ctemp09 = *(aoffset5 + 0); + ctemp10 = *(aoffset5 + 1); + ctemp11 = *(aoffset6 + 0); + ctemp12 = *(aoffset6 + 1); + ctemp13 = *(aoffset7 + 0); + ctemp14 = *(aoffset7 + 1); + ctemp15 = *(aoffset8 + 0); + ctemp16 = *(aoffset8 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + *(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + aoffset5 += 2; + aoffset6 += 2; + aoffset7 += 2; + aoffset8 += 2; + + boffset += 16; + i --; + }while(i > 0); + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + + boffset += 8; + i --; + }while(i > 0); + } + } /* end of if(j > 0) */ + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + aoffset1 += 2; + aoffset2 += 2; + boffset += 4; + i --; + }while(i > 0); + } + + } /* end of if(j > 0) */ + + if (n & 1){ + aoffset1 = aoffset; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 += 2; + boffset += 2; + i --; + }while(i > 0); + } + + } /* end of if(j > 0) */ + + return 0; +} diff --git a/kernel/generic/zgemm_tcopy_1.c b/kernel/generic/zgemm_tcopy_1.c new file mode 100644 index 0000000..03dfcc7 --- /dev/null +++ b/kernel/generic/zgemm_tcopy_1.c @@ -0,0 +1,121 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + FLOAT *a_offset; + FLOAT *b_offset, *b_offset1; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + + a_offset = a; + b_offset = b; + + lda *= 2; + + j = m; + + m *= 2; + + if (j > 0){ + do { + b_offset1 = b_offset; + b_offset += 2; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset + 0); + ctemp2 = *(a_offset + 1); + ctemp3 = *(a_offset + 2); + ctemp4 = *(a_offset + 3); + + ctemp5 = *(a_offset + 4); + ctemp6 = *(a_offset + 5); + ctemp7 = *(a_offset + 6); + ctemp8 = *(a_offset + 7); + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + + b_offset1 += m; + + *(b_offset1 + 0) = ctemp3; + *(b_offset1 + 1) = ctemp4; + + b_offset1 += m; + + *(b_offset1 + 0) = ctemp5; + *(b_offset1 + 1) = ctemp6; + + b_offset1 += m; + + *(b_offset1 + 0) = ctemp7; + *(b_offset1 + 1) = ctemp8; + + b_offset1 += m; + a_offset += 8; + i --; + } while(i>0); + } + + i = (n & 3); + if (i > 0){ + do { + ctemp1 = *(a_offset + 0); + ctemp2 = *(a_offset + 1); + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + + b_offset1 += m; + a_offset += 2; + i --; + } while(i > 0); + } + a_offset += lda - n * 2; + j --; + } while (j > 0); + } + + return 0; +} diff --git a/kernel/generic/zgemm_tcopy_2.c b/kernel/generic/zgemm_tcopy_2.c new file mode 100644 index 0000000..75aff7f --- /dev/null +++ b/kernel/generic/zgemm_tcopy_2.c @@ -0,0 +1,220 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + FLOAT *a_offset, *a_offset1, *a_offset2; + FLOAT *b_offset, *b_offset1, *b_offset2; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + FLOAT ctemp9, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + + a_offset = a; + b_offset = b; + + b_offset2 = b + m * (n & ~1) * 2; + + lda *= 2; + + j = (m >> 1); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset += 8; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset1 + 4); + ctemp6 = *(a_offset1 + 5); + ctemp7 = *(a_offset1 + 6); + ctemp8 = *(a_offset1 + 7); + + ctemp9 = *(a_offset2 + 0); + ctemp10 = *(a_offset2 + 1); + ctemp11 = *(a_offset2 + 2); + ctemp12 = *(a_offset2 + 3); + + ctemp13 = *(a_offset2 + 4); + ctemp14 = *(a_offset2 + 5); + ctemp15 = *(a_offset2 + 6); + ctemp16 = *(a_offset2 + 7); + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + *(b_offset1 + 2) = ctemp3; + *(b_offset1 + 3) = ctemp4; + + *(b_offset1 + 4) = ctemp9; + *(b_offset1 + 5) = ctemp10; + *(b_offset1 + 6) = ctemp11; + *(b_offset1 + 7) = ctemp12; + + b_offset1 += m * 4; + + *(b_offset1 + 0) = ctemp5; + *(b_offset1 + 1) = ctemp6; + *(b_offset1 + 2) = ctemp7; + *(b_offset1 + 3) = ctemp8; + + *(b_offset1 + 4) = ctemp13; + *(b_offset1 + 5) = ctemp14; + *(b_offset1 + 6) = ctemp15; + *(b_offset1 + 7) = ctemp16; + + b_offset1 += m * 4; + + a_offset1 += 8; + a_offset2 += 8; + i --; + } while(i>0); + } + + if (n & 2){ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp9 = *(a_offset2 + 0); + ctemp10 = *(a_offset2 + 1); + ctemp11 = *(a_offset2 + 2); + ctemp12 = *(a_offset2 + 3); + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + *(b_offset1 + 2) = ctemp3; + *(b_offset1 + 3) = ctemp4; + + *(b_offset1 + 4) = ctemp9; + *(b_offset1 + 5) = ctemp10; + *(b_offset1 + 6) = ctemp11; + *(b_offset1 + 7) = ctemp12; + + b_offset1 += m * 4; + a_offset1 += 4; + a_offset2 += 4; + } + + if (n & 1){ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp9 = *(a_offset2 + 0); + ctemp10 = *(a_offset2 + 1); + + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + *(b_offset2 + 2) = ctemp9; + *(b_offset2 + 3) = ctemp10; + b_offset2 += 4; + } + j--; + } while(j > 0); + } + + if (m & 1){ + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset + 0); + ctemp2 = *(a_offset + 1); + ctemp3 = *(a_offset + 2); + ctemp4 = *(a_offset + 3); + + ctemp5 = *(a_offset + 4); + ctemp6 = *(a_offset + 5); + ctemp7 = *(a_offset + 6); + ctemp8 = *(a_offset + 7); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp2; + *(b_offset + 2) = ctemp3; + *(b_offset + 3) = ctemp4; + + b_offset += m * 4; + + *(b_offset + 0) = ctemp5; + *(b_offset + 1) = ctemp6; + *(b_offset + 2) = ctemp7; + *(b_offset + 3) = ctemp8; + + b_offset += m * 4; + a_offset += 8; + i --; + } while(i > 0); + } + + if (n & 2){ + ctemp1 = *(a_offset + 0); + ctemp2 = *(a_offset + 1); + ctemp3 = *(a_offset + 2); + ctemp4 = *(a_offset + 3); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp2; + *(b_offset + 2) = ctemp3; + *(b_offset + 3) = ctemp4; + + b_offset += m * 4; + a_offset += 4; + } + + if (n & 1){ + ctemp1 = *(a_offset + 0); + ctemp2 = *(a_offset + 1); + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_tcopy_4.c b/kernel/generic/zgemm_tcopy_4.c new file mode 100644 index 0000000..c61d9d5 --- /dev/null +++ b/kernel/generic/zgemm_tcopy_4.c @@ -0,0 +1,403 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + FLOAT *boffset, *boffset1, *boffset2, *boffset3; + + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + + aoffset = a; + boffset = b; + lda *= 2; + + boffset2 = b + 2 * m * (n & ~3); + boffset3 = b + 2 * m * (n & ~1); + +#if 0 + fprintf(stderr, "m = %d n = %d\n", m,n ); +#endif + + j = (m >> 2); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + boffset1 = boffset; + boffset += 32; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + ctemp17 = *(aoffset3 + 0); + ctemp18 = *(aoffset3 + 1); + ctemp19 = *(aoffset3 + 2); + ctemp20 = *(aoffset3 + 3); + ctemp21 = *(aoffset3 + 4); + ctemp22 = *(aoffset3 + 5); + ctemp23 = *(aoffset3 + 6); + ctemp24 = *(aoffset3 + 7); + + ctemp25 = *(aoffset4 + 0); + ctemp26 = *(aoffset4 + 1); + ctemp27 = *(aoffset4 + 2); + ctemp28 = *(aoffset4 + 3); + ctemp29 = *(aoffset4 + 4); + ctemp30 = *(aoffset4 + 5); + ctemp31 = *(aoffset4 + 6); + ctemp32 = *(aoffset4 + 7); + + *(boffset1 + 0) = ctemp01; + *(boffset1 + 1) = ctemp02; + *(boffset1 + 2) = ctemp03; + *(boffset1 + 3) = ctemp04; + *(boffset1 + 4) = ctemp05; + *(boffset1 + 5) = ctemp06; + *(boffset1 + 6) = ctemp07; + *(boffset1 + 7) = ctemp08; + + *(boffset1 + 8) = ctemp09; + *(boffset1 + 9) = ctemp10; + *(boffset1 + 10) = ctemp11; + *(boffset1 + 11) = ctemp12; + *(boffset1 + 12) = ctemp13; + *(boffset1 + 13) = ctemp14; + *(boffset1 + 14) = ctemp15; + *(boffset1 + 15) = ctemp16; + + *(boffset1 + 16) = ctemp17; + *(boffset1 + 17) = ctemp18; + *(boffset1 + 18) = ctemp19; + *(boffset1 + 19) = ctemp20; + *(boffset1 + 20) = ctemp21; + *(boffset1 + 21) = ctemp22; + *(boffset1 + 22) = ctemp23; + *(boffset1 + 23) = ctemp24; + + *(boffset1 + 24) = ctemp25; + *(boffset1 + 25) = ctemp26; + *(boffset1 + 26) = ctemp27; + *(boffset1 + 27) = ctemp28; + *(boffset1 + 28) = ctemp29; + *(boffset1 + 29) = ctemp30; + *(boffset1 + 30) = ctemp31; + *(boffset1 + 31) = ctemp32; + + aoffset1 += 8; + aoffset2 += 8; + aoffset3 += 8; + aoffset4 += 8; + + boffset1 += m * 8; + i --; + }while(i > 0); + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + ctemp09 = *(aoffset3 + 0); + ctemp10 = *(aoffset3 + 1); + ctemp11 = *(aoffset3 + 2); + ctemp12 = *(aoffset3 + 3); + + ctemp13 = *(aoffset4 + 0); + ctemp14 = *(aoffset4 + 1); + ctemp15 = *(aoffset4 + 2); + ctemp16 = *(aoffset4 + 3); + + *(boffset2 + 0) = ctemp01; + *(boffset2 + 1) = ctemp02; + *(boffset2 + 2) = ctemp03; + *(boffset2 + 3) = ctemp04; + *(boffset2 + 4) = ctemp05; + *(boffset2 + 5) = ctemp06; + *(boffset2 + 6) = ctemp07; + *(boffset2 + 7) = ctemp08; + + *(boffset2 + 8) = ctemp09; + *(boffset2 + 9) = ctemp10; + *(boffset2 + 10) = ctemp11; + *(boffset2 + 11) = ctemp12; + *(boffset2 + 12) = ctemp13; + *(boffset2 + 13) = ctemp14; + *(boffset2 + 14) = ctemp15; + *(boffset2 + 15) = ctemp16; + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + + boffset2 += 16; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + + *(boffset3 + 0) = ctemp01; + *(boffset3 + 1) = ctemp02; + *(boffset3 + 2) = ctemp03; + *(boffset3 + 3) = ctemp04; + *(boffset3 + 4) = ctemp05; + *(boffset3 + 5) = ctemp06; + *(boffset3 + 6) = ctemp07; + *(boffset3 + 7) = ctemp08; + + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + + boffset3 += 8; + } + j--; + }while(j > 0); + } + + if (m & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + boffset1 = boffset; + boffset += 16; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + *(boffset1 + 0) = ctemp01; + *(boffset1 + 1) = ctemp02; + *(boffset1 + 2) = ctemp03; + *(boffset1 + 3) = ctemp04; + *(boffset1 + 4) = ctemp05; + *(boffset1 + 5) = ctemp06; + *(boffset1 + 6) = ctemp07; + *(boffset1 + 7) = ctemp08; + + *(boffset1 + 8) = ctemp09; + *(boffset1 + 9) = ctemp10; + *(boffset1 + 10) = ctemp11; + *(boffset1 + 11) = ctemp12; + *(boffset1 + 12) = ctemp13; + *(boffset1 + 13) = ctemp14; + *(boffset1 + 14) = ctemp15; + *(boffset1 + 15) = ctemp16; + + aoffset1 += 8; + aoffset2 += 8; + aoffset3 += 8; + aoffset4 += 8; + + boffset1 += m * 8; + i --; + }while(i > 0); + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + *(boffset2 + 0) = ctemp01; + *(boffset2 + 1) = ctemp02; + *(boffset2 + 2) = ctemp03; + *(boffset2 + 3) = ctemp04; + *(boffset2 + 4) = ctemp05; + *(boffset2 + 5) = ctemp06; + *(boffset2 + 6) = ctemp07; + *(boffset2 + 7) = ctemp08; + + aoffset1 += 4; + aoffset2 += 4; + + boffset2 += 8; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset3 + 0) = ctemp01; + *(boffset3 + 1) = ctemp02; + *(boffset3 + 2) = ctemp03; + *(boffset3 + 3) = ctemp04; + + aoffset1 += 2; + aoffset2 += 2; + boffset3 += 4; + } + } + + if (m & 1){ + aoffset1 = aoffset; + boffset1 = boffset; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + *(boffset1 + 0) = ctemp01; + *(boffset1 + 1) = ctemp02; + *(boffset1 + 2) = ctemp03; + *(boffset1 + 3) = ctemp04; + *(boffset1 + 4) = ctemp05; + *(boffset1 + 5) = ctemp06; + *(boffset1 + 6) = ctemp07; + *(boffset1 + 7) = ctemp08; + + aoffset1 += 8; + boffset1 += m * 8; + i --; + }while(i > 0); + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + *(boffset2 + 0) = ctemp01; + *(boffset2 + 1) = ctemp02; + *(boffset2 + 2) = ctemp03; + *(boffset2 + 3) = ctemp04; + + aoffset1 += 4; + boffset2 += 4; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset3 + 0) = ctemp01; + *(boffset3 + 1) = ctemp02; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_tcopy_8.c b/kernel/generic/zgemm_tcopy_8.c new file mode 100644 index 0000000..b258785 --- /dev/null +++ b/kernel/generic/zgemm_tcopy_8.c @@ -0,0 +1,361 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2; + + FLOAT *boffset; + + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + + aoffset = a; + boffset = b; + lda *= 2; + +#if 0 + fprintf(stderr, "M = %d N = %d\n", m, n); +#endif + + j = (n >> 3); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 16; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + + ctemp17 = *(aoffset2 + 0); + ctemp18 = *(aoffset2 + 1); + ctemp19 = *(aoffset2 + 2); + ctemp20 = *(aoffset2 + 3); + ctemp21 = *(aoffset2 + 4); + ctemp22 = *(aoffset2 + 5); + ctemp23 = *(aoffset2 + 6); + ctemp24 = *(aoffset2 + 7); + ctemp25 = *(aoffset2 + 8); + ctemp26 = *(aoffset2 + 9); + ctemp27 = *(aoffset2 + 10); + ctemp28 = *(aoffset2 + 11); + ctemp29 = *(aoffset2 + 12); + ctemp30 = *(aoffset2 + 13); + ctemp31 = *(aoffset2 + 14); + ctemp32 = *(aoffset2 + 15); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + *(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + *(boffset + 16) = ctemp17; + *(boffset + 17) = ctemp18; + *(boffset + 18) = ctemp19; + *(boffset + 19) = ctemp20; + *(boffset + 20) = ctemp21; + *(boffset + 21) = ctemp22; + *(boffset + 22) = ctemp23; + *(boffset + 23) = ctemp24; + + *(boffset + 24) = ctemp25; + *(boffset + 25) = ctemp26; + *(boffset + 26) = ctemp27; + *(boffset + 27) = ctemp28; + *(boffset + 28) = ctemp29; + *(boffset + 29) = ctemp30; + *(boffset + 30) = ctemp31; + *(boffset + 31) = ctemp32; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 32; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + *(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + boffset += 16; + } + + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 8; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + *(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 16; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + boffset += 8; + } + } + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 4; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 8; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + boffset += 4; + } + } + + if (n & 1){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 2; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 4; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + boffset += 2; + } + } + + return 0; +} diff --git a/kernel/generic/zger.c b/kernel/generic/zger.c new file mode 100644 index 0000000..134ff5f --- /dev/null +++ b/kernel/generic/zger.c @@ -0,0 +1,84 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *x, BLASLONG incx, + FLOAT *y, BLASLONG incy, + FLOAT *a, BLASLONG lda, FLOAT *buffer){ + + FLOAT *X = x; + + if (incx != 1) { + X = buffer; + COPY_K(m, x, incx, X, 1); + } + + lda *= 2; + incy *= 2; + + while (n > 0) { + FLOAT beta_r = y[0]; + FLOAT beta_i = y[1]; + +#ifndef XCONJ + AXPYU_K +#else + AXPYC_K +#endif + (m, 0, 0, +#ifndef CONJ + alpha_r * beta_r - alpha_i * beta_i, + alpha_r * beta_i + alpha_i * beta_r, +#else + alpha_r * beta_r + alpha_i * beta_i, + -alpha_r * beta_i + alpha_i * beta_r, +#endif + X, 1, a, 1, NULL, 0); + + a += lda; + y += incy; + n --; + } + + return 0; +} + diff --git a/kernel/generic/zhemm3m_lcopy_1.c b/kernel/generic/zhemm3m_lcopy_1.c new file mode 100644 index 0000000..72f473d --- /dev/null +++ b/kernel/generic/zhemm3m_lcopy_1.c @@ -0,0 +1,105 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) + alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) - alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01; + FLOAT *ao1; + + lda *= 2; + + js = n; + + while (js > 0){ + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + } else + if (offset < 0) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + } else { + data01 = CMULT(*(ao1 + 0), ZERO); + } + + if (offset > 0) ao1 += lda; else ao1 += 2; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + + posX ++; + js --; + } + + return 0; +} diff --git a/kernel/generic/zhemm3m_lcopy_2.c b/kernel/generic/zhemm3m_lcopy_2.c new file mode 100644 index 0000000..f0da12c --- /dev/null +++ b/kernel/generic/zhemm3m_lcopy_2.c @@ -0,0 +1,146 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) + alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) - alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02; + FLOAT *ao1, *ao2; + + lda *= 2; + + js = (n >> 1); + while (js > 0){ + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + } else + if (offset < -1) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + } else { + switch (offset) { + case 0 : + data01 = CMULT(*(ao1 + 0), ZERO); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + break; + case -1 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), ZERO); + break; + } + } + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + js --; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + } else + if (offset < 0) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + } else { + data01 = CMULT(*(ao1 + 0), ZERO); + } + + if (offset > 0) ao1 += lda; else ao1 += 2; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/zhemm3m_lcopy_4.c b/kernel/generic/zhemm3m_lcopy_4.c new file mode 100644 index 0000000..7e958f1 --- /dev/null +++ b/kernel/generic/zhemm3m_lcopy_4.c @@ -0,0 +1,217 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) + alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) - alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2, *ao3, *ao4; + + lda *= 2; + + js = (n >> 2); + while (js > 0){ + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + } else + if (offset < -3) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + } else { + switch (offset) { + case 0 : + data01 = CMULT(*(ao1 + 0), ZERO); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + break; + case -1 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), ZERO); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + break; + case -2 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), ZERO); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + break; + case -3 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), ZERO); + break; + } + } + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + js --; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + } else + if (offset < -1) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + } else { + switch (offset) { + case 0 : + data01 = CMULT(*(ao1 + 0), ZERO); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + break; + case -1 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), ZERO); + break; + } + } + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + } else + if (offset < 0) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + } else { + data01 = CMULT(*(ao1 + 0), ZERO); + } + + if (offset > 0) ao1 += lda; else ao1 += 2; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/zhemm3m_lcopy_8.c b/kernel/generic/zhemm3m_lcopy_8.c new file mode 100644 index 0000000..86600b5 --- /dev/null +++ b/kernel/generic/zhemm3m_lcopy_8.c @@ -0,0 +1,364 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) + alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) - alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + lda *= 2; + + js = (n >> 3); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda; + if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda; + if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda; + if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); + } else + if (offset < -7) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), -*(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), -*(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), -*(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), -*(ao8 + 1)); + } else { + switch (offset) { + case 0 : + data01 = CMULT(*(ao1 + 0), ZERO); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); + break; + case -1 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), ZERO); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); + break; + case -2 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), ZERO); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); + break; + case -3 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), ZERO); + data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); + break; + case -4 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), ZERO); + data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); + break; + case -5 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), -*(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), ZERO); + data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); + break; + case -6 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), -*(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), -*(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), ZERO); + data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); + break; + case -7 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), -*(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), -*(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), -*(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), ZERO); + break; + } + } + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + if (offset > -4) ao5 += lda; else ao5 += 2; + if (offset > -5) ao6 += lda; else ao6 += 2; + if (offset > -6) ao7 += lda; else ao7 += 2; + if (offset > -7) ao8 += lda; else ao8 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 8; + js --; + } + + if (n & 4) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + } else + if (offset < -3) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + } else { + switch (offset) { + case 0 : + data01 = CMULT(*(ao1 + 0), ZERO); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + break; + case -1 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), ZERO); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + break; + case -2 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), ZERO); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + break; + case -3 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), ZERO); + break; + } + } + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + } else + if (offset < -1) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + } else { + switch (offset) { + case 0 : + data01 = CMULT(*(ao1 + 0), ZERO); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + break; + case -1 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), ZERO); + break; + } + } + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + } else + if (offset < 0) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + } else { + data01 = CMULT(*(ao1 + 0), ZERO); + } + + if (offset > 0) ao1 += lda; else ao1 += 2; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zhemm3m_ucopy_1.c b/kernel/generic/zhemm3m_ucopy_1.c new file mode 100644 index 0000000..a6d4975 --- /dev/null +++ b/kernel/generic/zhemm3m_ucopy_1.c @@ -0,0 +1,106 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) + alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) - alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01; + FLOAT *ao1; + + lda *= 2; + + js = n; + + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + } else + if (offset < 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + } else { + data01 = CMULT(*(ao1 + 0), ZERO); + } + + if (offset > 0) ao1 += 2; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + + posX ++; + js --; + } + + return 0; +} diff --git a/kernel/generic/zhemm3m_ucopy_2.c b/kernel/generic/zhemm3m_ucopy_2.c new file mode 100644 index 0000000..fecbae6 --- /dev/null +++ b/kernel/generic/zhemm3m_ucopy_2.c @@ -0,0 +1,146 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) + alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) - alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02; + FLOAT *ao1, *ao2; + + lda *= 2; + + js = (n >> 1); + while (js > 0){ + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + } else + if (offset < -1) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + } else { + switch (offset) { + case 0 : + data01 = CMULT(*(ao1 + 0), ZERO); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + break; + case -1 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), ZERO); + break; + } + } + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + js --; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + } else + if (offset < 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + } else { + data01 = CMULT(*(ao1 + 0), ZERO); + } + + if (offset > 0) ao1 += 2; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/zhemm3m_ucopy_4.c b/kernel/generic/zhemm3m_ucopy_4.c new file mode 100644 index 0000000..6a45c7e --- /dev/null +++ b/kernel/generic/zhemm3m_ucopy_4.c @@ -0,0 +1,217 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) + alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) - alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2, *ao3, *ao4; + + lda *= 2; + + js = (n >> 2); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + } else + if (offset < -3) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + } else { + switch (offset) { + case 0 : + data01 = CMULT(*(ao1 + 0), ZERO); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + break; + case -1 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), ZERO); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + break; + case -2 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), ZERO); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + break; + case -3 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), ZERO); + break; + } + } + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + js --; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + } else + if (offset < -1) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + } else { + switch (offset) { + case 0 : + data01 = CMULT(*(ao1 + 0), ZERO); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + break; + case -1 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), ZERO); + break; + } + } + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + } else + if (offset < 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + } else { + data01 = CMULT(*(ao1 + 0), ZERO); + } + + if (offset > 0) ao1 += 2; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/zhemm3m_ucopy_8.c b/kernel/generic/zhemm3m_ucopy_8.c new file mode 100644 index 0000000..efed390 --- /dev/null +++ b/kernel/generic/zhemm3m_ucopy_8.c @@ -0,0 +1,364 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) + alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) - alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + lda *= 2; + + js = (n >> 3); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda; + if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda; + if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda; + if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), -*(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), -*(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), -*(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), -*(ao8 + 1)); + } else + if (offset < -7) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); + } else { + switch (offset) { + case 0 : + data01 = CMULT(*(ao1 + 0), ZERO); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), -*(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), -*(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), -*(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), -*(ao8 + 1)); + break; + case -1 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), ZERO); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), -*(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), -*(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), -*(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), -*(ao8 + 1)); + break; + case -2 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), ZERO); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), -*(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), -*(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), -*(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), -*(ao8 + 1)); + break; + case -3 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), ZERO); + data05 = CMULT(*(ao5 + 0), -*(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), -*(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), -*(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), -*(ao8 + 1)); + break; + case -4 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), ZERO); + data06 = CMULT(*(ao6 + 0), -*(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), -*(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), -*(ao8 + 1)); + break; + case -5 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), ZERO); + data07 = CMULT(*(ao7 + 0), -*(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), -*(ao8 + 1)); + break; + case -6 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), ZERO); + data08 = CMULT(*(ao8 + 0), -*(ao8 + 1)); + break; + case -7 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), ZERO); + break; + } + } + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + if (offset > -4) ao5 += 2; else ao5 += lda; + if (offset > -5) ao6 += 2; else ao6 += lda; + if (offset > -6) ao7 += 2; else ao7 += lda; + if (offset > -7) ao8 += 2; else ao8 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 8; + js --; + } + + if (n & 4) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + } else + if (offset < -3) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + } else { + switch (offset) { + case 0 : + data01 = CMULT(*(ao1 + 0), ZERO); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + break; + case -1 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), ZERO); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + break; + case -2 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), ZERO); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + break; + case -3 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), ZERO); + break; + } + } + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + } else + if (offset < -1) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + } else { + switch (offset) { + case 0 : + data01 = CMULT(*(ao1 + 0), ZERO); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + break; + case -1 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), ZERO); + break; + } + } + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + } else + if (offset < 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + } else { + data01 = CMULT(*(ao1 + 0), ZERO); + } + + if (offset > 0) ao1 += 2; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/zhemm_ltcopy_1.c b/kernel/generic/zhemm_ltcopy_1.c new file mode 100644 index 0000000..6f5615b --- /dev/null +++ b/kernel/generic/zhemm_ltcopy_1.c @@ -0,0 +1,90 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02; + FLOAT *ao1; + + lda *= 2; + + js = n; + + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + } else + if (offset < 0) { + b[ 0] = data01; + b[ 1] = -data02; + } else { + b[ 0] = data01; + b[ 1] = ZERO; + } + + b += 2; + + offset --; + i --; + } + posX ++; + js --; + + } + + return 0; +} diff --git a/kernel/generic/zhemm_ltcopy_2.c b/kernel/generic/zhemm_ltcopy_2.c new file mode 100644 index 0000000..8547b4d --- /dev/null +++ b/kernel/generic/zhemm_ltcopy_2.c @@ -0,0 +1,144 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2; + + lda *= 2; + + js = (n >> 1); + + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + } else + if (offset < -1) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + break; + case -1 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = ZERO; + break; + } + } + + b += 4; + + offset --; + i --; + } + + posX += 2; + js --; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + } else + if (offset < 0) { + b[ 0] = data01; + b[ 1] = -data02; + } else { + b[ 0] = data01; + b[ 1] = ZERO; + } + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zhemm_ltcopy_4.c b/kernel/generic/zhemm_ltcopy_4.c new file mode 100644 index 0000000..d7afc11 --- /dev/null +++ b/kernel/generic/zhemm_ltcopy_4.c @@ -0,0 +1,244 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT *ao1, *ao2, *ao3, *ao4; + + lda *= 2; + + js = (n >> 2); + + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + } else + if (offset < -3) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + break; + case -1 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + break; + case -2 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = data08; + break; + case -3 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = ZERO; + break; + } + } + + b += 8; + + offset --; + i --; + } + + posX += 4; + js --; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + } else + if (offset < -1) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + break; + case -1 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = ZERO; + break; + } + } + + b += 4; + + offset --; + i --; + } + + posX += 2; + + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + } else + if (offset < 0) { + b[ 0] = data01; + b[ 1] = -data02; + } else { + b[ 0] = data01; + b[ 1] = ZERO; + } + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zhemm_ltcopy_8.c b/kernel/generic/zhemm_ltcopy_8.c new file mode 100644 index 0000000..d5ebd1c --- /dev/null +++ b/kernel/generic/zhemm_ltcopy_8.c @@ -0,0 +1,480 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + lda *= 2; + + js = (n >> 3); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda; + if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda; + if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda; + if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + if (offset > -4) ao5 += lda; else ao5 += 2; + if (offset > -5) ao6 += lda; else ao6 += 2; + if (offset > -6) ao7 += lda; else ao7 += 2; + if (offset > -7) ao8 += lda; else ao8 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + } else + if (offset < -7) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -1 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -2 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -3 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = ZERO; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -4 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = ZERO; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -5 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = ZERO; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -6 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = ZERO; + b[14] = data15; + b[15] = data16; + break; + case -7 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = ZERO; + break; + } + } + + b += 16; + + offset --; + i --; + } + + posX += 8; + js --; + } + + if (n & 4) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + } else + if (offset < -3) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + break; + case -1 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + break; + case -2 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = data08; + break; + case -3 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = ZERO; + break; + } + } + + b += 8; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + } else + if (offset < -1) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + break; + case -1 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = ZERO; + break; + } + } + + b += 4; + + offset --; + i --; + } + + posX += 2; + + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + } else + if (offset < 0) { + b[ 0] = data01; + b[ 1] = -data02; + } else { + b[ 0] = data01; + b[ 1] = ZERO; + } + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} + diff --git a/kernel/generic/zhemm_utcopy_1.c b/kernel/generic/zhemm_utcopy_1.c new file mode 100644 index 0000000..961b849 --- /dev/null +++ b/kernel/generic/zhemm_utcopy_1.c @@ -0,0 +1,88 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02; + FLOAT *ao1; + + lda *= 2; + + js = n; + while (js > 0){ + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + } else + if (offset < 0) { + b[ 0] = data01; + b[ 1] = data02; + } else { + b[ 0] = data01; + b[ 1] = ZERO; + } + + b += 2; + + offset --; + i --; + } + + posX ++; + js --; + } + + return 0; +} diff --git a/kernel/generic/zhemm_utcopy_2.c b/kernel/generic/zhemm_utcopy_2.c new file mode 100644 index 0000000..91e7108 --- /dev/null +++ b/kernel/generic/zhemm_utcopy_2.c @@ -0,0 +1,142 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2; + + lda *= 2; + + js = (n >> 1); + while (js > 0){ + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + } else + if (offset < -1) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = -data04; + break; + case -1 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = ZERO; + break; + } + } + + b += 4; + + offset --; + i --; + } + + posX += 2; + js --; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + } else + if (offset < 0) { + b[ 0] = data01; + b[ 1] = data02; + } else { + b[ 0] = data01; + b[ 1] = ZERO; + } + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zhemm_utcopy_4.c b/kernel/generic/zhemm_utcopy_4.c new file mode 100644 index 0000000..15671b4 --- /dev/null +++ b/kernel/generic/zhemm_utcopy_4.c @@ -0,0 +1,242 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT *ao1, *ao2, *ao3, *ao4; + + lda *= 2; + + js = (n >> 2); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + } else + if (offset < -3) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + break; + case -1 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + break; + case -2 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = -data08; + break; + case -3 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = ZERO; + break; + } + } + + b += 8; + + offset --; + i --; + } + + posX += 4; + js --; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + } else + if (offset < -1) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = -data04; + break; + case -1 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = ZERO; + break; + } + } + + b += 4; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + } else + if (offset < 0) { + b[ 0] = data01; + b[ 1] = data02; + } else { + b[ 0] = data01; + b[ 1] = ZERO; + } + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zhemm_utcopy_8.c b/kernel/generic/zhemm_utcopy_8.c new file mode 100644 index 0000000..1cfd3bd --- /dev/null +++ b/kernel/generic/zhemm_utcopy_8.c @@ -0,0 +1,477 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + lda *= 2; + + js = (n >> 3); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda; + if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda; + if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda; + if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + if (offset > -4) ao5 += 2; else ao5 += lda; + if (offset > -5) ao6 += 2; else ao6 += lda; + if (offset > -6) ao7 += 2; else ao7 += lda; + if (offset > -7) ao8 += 2; else ao8 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + } else + if (offset < -7) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -1 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -2 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -3 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = ZERO; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -4 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = ZERO; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -5 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = ZERO; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -6 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = ZERO; + b[14] = data15; + b[15] = -data16; + break; + case -7 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = ZERO; + break; + } + } + + b += 16; + + offset --; + i --; + } + + posX += 8; + js --; + } + + if (n & 4) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + } else + if (offset < -3) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + break; + case -1 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + break; + case -2 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = -data08; + break; + case -3 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = ZERO; + break; + } + } + + b += 8; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + } else + if (offset < -1) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = -data04; + break; + case -1 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = ZERO; + break; + } + } + + b += 4; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + } else + if (offset < 0) { + b[ 0] = data01; + b[ 1] = data02; + } else { + b[ 0] = data01; + b[ 1] = ZERO; + } + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zhemv_k.c b/kernel/generic/zhemv_k.c new file mode 100644 index 0000000..3551938 --- /dev/null +++ b/kernel/generic/zhemv_k.c @@ -0,0 +1,157 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#include "symcopy.h" + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ + + BLASLONG is, min_i; + FLOAT *X = x; + FLOAT *Y = y; + FLOAT *symbuffer = buffer; + FLOAT *gemvbuffer = (FLOAT *)(((BLASLONG)buffer + SYMV_P * SYMV_P * sizeof(FLOAT) * 2 + 4095) & ~4095); + FLOAT *bufferY = gemvbuffer; + FLOAT *bufferX = gemvbuffer; + + if (incy != 1) { + Y = bufferY; + bufferX = (FLOAT *)(((BLASLONG)bufferY + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + gemvbuffer = bufferX; + COPY_K(m, y, incy, Y, 1); + } + + if (incx != 1) { + X = bufferX; + gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + COPY_K(m, x, incx, X, 1); + } + +#ifndef LOWER + for(is = m - offset; is < m; is += SYMV_P){ + min_i = MIN(m - is, SYMV_P); +#else + for(is = 0; is < offset; is += SYMV_P){ + min_i = MIN(offset - is, SYMV_P); +#endif + +#ifndef LOWER + if (is > 0){ +#ifndef HEMVREV + GEMV_C(is, min_i, 0, alpha_r, alpha_i, + a + is * lda * 2, lda, + X, 1, + Y + is * 2, 1, gemvbuffer); + + GEMV_N(is, min_i, 0, alpha_r, alpha_i, + a + is * lda * 2, lda, + X + is * 2, 1, + Y, 1, gemvbuffer); +#else + GEMV_T(is, min_i, 0, alpha_r, alpha_i, + a + is * lda * 2, lda, + X, 1, + Y + is * 2, 1, gemvbuffer); + + GEMV_R(is, min_i, 0, alpha_r, alpha_i, + a + is * lda * 2, lda, + X + is * 2, 1, + Y, 1, gemvbuffer); +#endif + } +#endif + +#ifndef HEMVREV +#ifdef LOWER + ZHEMCOPY_L(min_i, a + (is + is * lda) * 2, lda, symbuffer); +#else + ZHEMCOPY_U(min_i, a + (is + is * lda) * 2, lda, symbuffer); +#endif +#else +#ifdef LOWER + ZHEMCOPY_M(min_i, a + (is + is * lda) * 2, lda, symbuffer); +#else + ZHEMCOPY_V(min_i, a + (is + is * lda) * 2, lda, symbuffer); +#endif +#endif + + GEMV_N(min_i, min_i, 0, alpha_r, alpha_i, + symbuffer, min_i, + X + is * 2, 1, + Y + is * 2, 1, gemvbuffer); + +#ifdef LOWER + if (m - is - min_i > 0){ + +#ifndef HEMVREV + GEMV_C(m - is - min_i, min_i, 0, alpha_r, alpha_i, + a + ((is + min_i) + is * lda) * 2, lda, + X + (is + min_i) * 2, 1, + Y + is * 2, 1, gemvbuffer); + + GEMV_N(m - is - min_i, min_i, 0, alpha_r, alpha_i, + a + ((is + min_i) + is * lda) * 2, lda, + X + is * 2, 1, + Y + (is + min_i) * 2, 1, gemvbuffer); +#else + GEMV_T(m - is - min_i, min_i, 0, alpha_r, alpha_i, + a + ((is + min_i) + is * lda) * 2, lda, + X + (is + min_i) * 2, 1, + Y + is * 2, 1, gemvbuffer); + + GEMV_R(m - is - min_i, min_i, 0, alpha_r, alpha_i, + a + ((is + min_i) + is * lda) * 2, lda, + X + is * 2, 1, + Y + (is + min_i) * 2, 1, gemvbuffer); +#endif + + } +#endif + + } /* end of is */ + + if (incy != 1) { + COPY_K(m, Y, 1, y, incy); + } + + return 0; +} + diff --git a/kernel/generic/zlaswp_ncopy_1.c b/kernel/generic/zlaswp_ncopy_1.c new file mode 100644 index 0000000..acbda68 --- /dev/null +++ b/kernel/generic/zlaswp_ncopy_1.c @@ -0,0 +1,186 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#define a2 (a1 + 2) + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1; + FLOAT *b1, *b2; + FLOAT A1, A2, A3, A4; + FLOAT B1, B2, B3, B4; + + a -= 2; + lda *= 2; + k1 --; + + ipiv += k1; + + if (n <= 0) return 0; + + j = n; + do { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A1; + *(buffer + 3) = A2; + } else { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = A1; + *(buffer + 3) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + + buffer += 4; + + b1 = a + ip1; + b2 = a + ip2; + + a1 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } + buffer += 2; + } + + a += lda; + j --; + } while (j > 0); + + return 0; +} + diff --git a/kernel/generic/zlaswp_ncopy_2.c b/kernel/generic/zlaswp_ncopy_2.c new file mode 100644 index 0000000..7fa56be --- /dev/null +++ b/kernel/generic/zlaswp_ncopy_2.c @@ -0,0 +1,381 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#define a2 (a1 + 2) +#define a4 (a3 + 2) + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1, *a3; + FLOAT *b1, *b2, *b3, *b4; + FLOAT A1, A2, A3, A4; + FLOAT A5, A6, A7, A8; + FLOAT B1, B2, B3, B4; + FLOAT B5, B6, B7, B8; + + a -= 2; + lda *= 2; + k1 --; + + ipiv += k1; + + if (n <= 0) return 0; + + j = (n >> 1); + if (j > 0) { + do { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + a3 = a1 + lda; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + lda; + b4 = b2 + lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + A5 = *(a3 + 0); + A6 = *(a3 + 1); + A7 = *(a4 + 0); + A8 = *(a4 + 1); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + B5 = *(b3 + 0); + B6 = *(b3 + 1); + B7 = *(b4 + 0); + B8 = *(b4 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A5; + *(buffer + 3) = A6; + *(buffer + 4) = A3; + *(buffer + 5) = A4; + *(buffer + 6) = A7; + *(buffer + 7) = A8; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A5; + *(buffer + 3) = A6; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A7; + *(buffer + 3) = A8; + *(buffer + 4) = A1; + *(buffer + 5) = A2; + *(buffer + 6) = A5; + *(buffer + 7) = A6; + } else { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A7; + *(buffer + 3) = A8; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(b4 + 0) = A5; + *(b4 + 1) = A6; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = A3; + *(buffer + 5) = A4; + *(buffer + 6) = A7; + *(buffer + 7) = A8; + + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = A1; + *(buffer + 5) = A2; + *(buffer + 6) = A5; + *(buffer + 7) = A6; + + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(b3 + 0) = A7; + *(b3 + 1) = A8; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } + + buffer += 8; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + lda; + b4 = b2 + lda; + + a1 += 4; + a3 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + A3 = *(a3 + 0); + A4 = *(a3 + 1); + B3 = *(b3 + 0); + B4 = *(b3 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b3 + 0) = A3; + *(b3 + 1) = A4; + } + buffer += 4; + } + + a += 2 * lda; + j --; + } while (j > 0); + } + + if (n & 1) { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A1; + *(buffer + 3) = A2; + } else { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = A1; + *(buffer + 3) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + + buffer += 4; + + b1 = a + ip1; + b2 = a + ip2; + + a1 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } + buffer += 2; + } + } + + return 0; +} + diff --git a/kernel/generic/zlaswp_ncopy_4.c b/kernel/generic/zlaswp_ncopy_4.c new file mode 100644 index 0000000..c9c44fc --- /dev/null +++ b/kernel/generic/zlaswp_ncopy_4.c @@ -0,0 +1,711 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#define a2 (a1 + 2) +#define a4 (a3 + 2) +#define a6 (a5 + 2) +#define a8 (a7 + 2) + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1, *a3, *a5, *a7; + FLOAT *b1, *b2, *b3, *b4; + FLOAT *b5, *b6, *b7, *b8; + FLOAT A1, A2, A3, A4, A5, A6, A7, A8; + FLOAT B1, B2, B3, B4, B5, B6, B7, B8; + + FLOAT A9, A10, A11, A12, A13, A14, A15, A16; + FLOAT B9, B10, B11, B12, B13, B14, B15, B16; + + a -= 2; + lda *= 2; + k1 --; + + ipiv += k1; + + if (n <= 0) return 0; + + j = (n >> 2); + if (j > 0) { + do { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + + a3 = a1 + 1 * lda; + a5 = a1 + 2 * lda; + a7 = a1 + 3 * lda; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A9 = *(a1 + 1); + A2 = *(a2 + 0); + A10 = *(a2 + 1); + A3 = *(a3 + 0); + A11 = *(a3 + 1); + A4 = *(a4 + 0); + A12 = *(a4 + 1); + A5 = *(a5 + 0); + A13 = *(a5 + 1); + A6 = *(a6 + 0); + A14 = *(a6 + 1); + A7 = *(a7 + 0); + A15 = *(a7 + 1); + A8 = *(a8 + 0); + A16 = *(a8 + 1); + + B1 = *(b1 + 0); + B9 = *(b1 + 1); + B2 = *(b2 + 0); + B10 = *(b2 + 1); + B3 = *(b3 + 0); + B11 = *(b3 + 1); + B4 = *(b4 + 0); + B12 = *(b4 + 1); + B5 = *(b5 + 0); + B13 = *(b5 + 1); + B6 = *(b6 + 0); + B14 = *(b6 + 1); + B7 = *(b7 + 0); + B15 = *(b7 + 1); + B8 = *(b8 + 0); + B16 = *(b8 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + + *(buffer + 8) = A2; + *(buffer + 9) = A10; + *(buffer + 10) = A4; + *(buffer + 11) = A12; + *(buffer + 12) = A6; + *(buffer + 13) = A14; + *(buffer + 14) = A8; + *(buffer + 15) = A16; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b2 + 0) = A2; + *(b2 + 1) = A10; + *(b4 + 0) = A4; + *(b4 + 1) = A12; + *(b6 + 0) = A6; + *(b6 + 1) = A14; + *(b8 + 0) = A8; + *(b8 + 1) = A16; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A2; + *(buffer + 1) = A10; + *(buffer + 2) = A4; + *(buffer + 3) = A12; + *(buffer + 4) = A6; + *(buffer + 5) = A14; + *(buffer + 6) = A8; + *(buffer + 7) = A16; + *(buffer + 8) = A1; + *(buffer + 9) = A9; + *(buffer + 10) = A3; + *(buffer + 11) = A11; + *(buffer + 12) = A5; + *(buffer + 13) = A13; + *(buffer + 14) = A7; + *(buffer + 15) = A15; + + } else { + *(buffer + 0) = A2; + *(buffer + 1) = A10; + *(buffer + 2) = A4; + *(buffer + 3) = A12; + *(buffer + 4) = A6; + *(buffer + 5) = A14; + *(buffer + 6) = A8; + *(buffer + 7) = A16; + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b2 + 0) = A1; + *(b2 + 1) = A9; + *(b4 + 0) = A3; + *(b4 + 1) = A11; + *(b6 + 0) = A5; + *(b6 + 1) = A13; + *(b8 + 0) = A7; + *(b8 + 1) = A15; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = A2; + *(buffer + 9) = A10; + *(buffer + 10) = A4; + *(buffer + 11) = A12; + *(buffer + 12) = A6; + *(buffer + 13) = A14; + *(buffer + 14) = A8; + *(buffer + 15) = A16; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = A1; + *(buffer + 9) = A9; + *(buffer + 10) = A3; + *(buffer + 11) = A11; + *(buffer + 12) = A5; + *(buffer + 13) = A13; + *(buffer + 14) = A7; + *(buffer + 15) = A15; + + *(b1 + 0) = A2; + *(b1 + 1) = A10; + *(b3 + 0) = A4; + *(b3 + 1) = A12; + *(b5 + 0) = A6; + *(b5 + 1) = A14; + *(b7 + 0) = A8; + *(b7 + 1) = A16; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b2 + 0) = A2; + *(b2 + 1) = A10; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b4 + 0) = A4; + *(b4 + 1) = A12; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b6 + 0) = A6; + *(b6 + 1) = A14; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + *(b8 + 0) = A8; + *(b8 + 1) = A16; + } + } + + buffer += 16; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + a1 += 4; + a3 += 4; + a5 += 4; + a7 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A9 = *(a1 + 1); + B1 = *(b1 + 0); + B9 = *(b1 + 1); + A3 = *(a3 + 0); + A11 = *(a3 + 1); + B3 = *(b3 + 0); + B11 = *(b3 + 1); + A5 = *(a5 + 0); + A13 = *(a5 + 1); + B5 = *(b5 + 0); + B13 = *(b5 + 1); + A7 = *(a7 + 0); + A15 = *(a7 + 1); + B7 = *(b7 + 0); + B15 = *(b7 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + } + buffer += 8; + } + + a += 4 * lda; + + j --; + } while (j > 0); + } + + if (n & 2) { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + a3 = a1 + lda; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + lda; + b4 = b2 + lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + A5 = *(a3 + 0); + A6 = *(a3 + 1); + A7 = *(a4 + 0); + A8 = *(a4 + 1); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + B5 = *(b3 + 0); + B6 = *(b3 + 1); + B7 = *(b4 + 0); + B8 = *(b4 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A5; + *(buffer + 3) = A6; + *(buffer + 4) = A3; + *(buffer + 5) = A4; + *(buffer + 6) = A7; + *(buffer + 7) = A8; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A5; + *(buffer + 3) = A6; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A7; + *(buffer + 3) = A8; + *(buffer + 4) = A1; + *(buffer + 5) = A2; + *(buffer + 6) = A5; + *(buffer + 7) = A6; + } else { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A7; + *(buffer + 3) = A8; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(b4 + 0) = A5; + *(b4 + 1) = A6; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = A3; + *(buffer + 5) = A4; + *(buffer + 6) = A7; + *(buffer + 7) = A8; + + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = A1; + *(buffer + 5) = A2; + *(buffer + 6) = A5; + *(buffer + 7) = A6; + + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(b3 + 0) = A7; + *(b3 + 1) = A8; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } + + buffer += 8; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + lda; + b4 = b2 + lda; + + a1 += 4; + a3 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + A3 = *(a3 + 0); + A4 = *(a3 + 1); + B3 = *(b3 + 0); + B4 = *(b3 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b3 + 0) = A3; + *(b3 + 1) = A4; + } + buffer += 4; + } + + a += 2 * lda; + } + + if (n & 1) { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A1; + *(buffer + 3) = A2; + } else { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = A1; + *(buffer + 3) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + + buffer += 4; + + b1 = a + ip1; + b2 = a + ip2; + + a1 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } + buffer += 2; + } + } + + return 0; +} + diff --git a/kernel/generic/zneg_tcopy_1.c b/kernel/generic/zneg_tcopy_1.c new file mode 100644 index 0000000..3701c9c --- /dev/null +++ b/kernel/generic/zneg_tcopy_1.c @@ -0,0 +1,121 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + FLOAT *a_offset; + FLOAT *b_offset, *b_offset1; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + + a_offset = a; + b_offset = b; + + lda *= 2; + + j = m; + + m *= 2; + + if (j > 0){ + do { + b_offset1 = b_offset; + b_offset += 2; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset + 0); + ctemp2 = *(a_offset + 1); + ctemp3 = *(a_offset + 2); + ctemp4 = *(a_offset + 3); + + ctemp5 = *(a_offset + 4); + ctemp6 = *(a_offset + 5); + ctemp7 = *(a_offset + 6); + ctemp8 = *(a_offset + 7); + + *(b_offset1 + 0) = -ctemp1; + *(b_offset1 + 1) = -ctemp2; + + b_offset1 += m; + + *(b_offset1 + 0) = -ctemp3; + *(b_offset1 + 1) = -ctemp4; + + b_offset1 += m; + + *(b_offset1 + 0) = -ctemp5; + *(b_offset1 + 1) = -ctemp6; + + b_offset1 += m; + + *(b_offset1 + 0) = -ctemp7; + *(b_offset1 + 1) = -ctemp8; + + b_offset1 += m; + a_offset += 8; + i --; + } while(i>0); + } + + i = (n & 3); + if (i > 0){ + do { + ctemp1 = *(a_offset + 0); + ctemp2 = *(a_offset + 1); + + *(b_offset1 + 0) = -ctemp1; + *(b_offset1 + 1) = -ctemp2; + + b_offset1 += m; + a_offset += 2; + i --; + } while(i > 0); + } + a_offset += lda - n * 2; + j --; + } while (j > 0); + } + + return 0; +} diff --git a/kernel/generic/zneg_tcopy_2.c b/kernel/generic/zneg_tcopy_2.c new file mode 100644 index 0000000..40dd115 --- /dev/null +++ b/kernel/generic/zneg_tcopy_2.c @@ -0,0 +1,220 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + FLOAT *a_offset, *a_offset1, *a_offset2; + FLOAT *b_offset, *b_offset1, *b_offset2; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + FLOAT ctemp9, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + + a_offset = a; + b_offset = b; + + b_offset2 = b + m * (n & ~1) * 2; + + lda *= 2; + + j = (m >> 1); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset += 8; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset1 + 4); + ctemp6 = *(a_offset1 + 5); + ctemp7 = *(a_offset1 + 6); + ctemp8 = *(a_offset1 + 7); + + ctemp9 = *(a_offset2 + 0); + ctemp10 = *(a_offset2 + 1); + ctemp11 = *(a_offset2 + 2); + ctemp12 = *(a_offset2 + 3); + + ctemp13 = *(a_offset2 + 4); + ctemp14 = *(a_offset2 + 5); + ctemp15 = *(a_offset2 + 6); + ctemp16 = *(a_offset2 + 7); + + *(b_offset1 + 0) = -ctemp1; + *(b_offset1 + 1) = -ctemp2; + *(b_offset1 + 2) = -ctemp3; + *(b_offset1 + 3) = -ctemp4; + + *(b_offset1 + 4) = -ctemp9; + *(b_offset1 + 5) = -ctemp10; + *(b_offset1 + 6) = -ctemp11; + *(b_offset1 + 7) = -ctemp12; + + b_offset1 += m * 4; + + *(b_offset1 + 0) = -ctemp5; + *(b_offset1 + 1) = -ctemp6; + *(b_offset1 + 2) = -ctemp7; + *(b_offset1 + 3) = -ctemp8; + + *(b_offset1 + 4) = -ctemp13; + *(b_offset1 + 5) = -ctemp14; + *(b_offset1 + 6) = -ctemp15; + *(b_offset1 + 7) = -ctemp16; + + b_offset1 += m * 4; + + a_offset1 += 8; + a_offset2 += 8; + i --; + } while(i>0); + } + + if (n & 2){ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp9 = *(a_offset2 + 0); + ctemp10 = *(a_offset2 + 1); + ctemp11 = *(a_offset2 + 2); + ctemp12 = *(a_offset2 + 3); + + *(b_offset1 + 0) = -ctemp1; + *(b_offset1 + 1) = -ctemp2; + *(b_offset1 + 2) = -ctemp3; + *(b_offset1 + 3) = -ctemp4; + + *(b_offset1 + 4) = -ctemp9; + *(b_offset1 + 5) = -ctemp10; + *(b_offset1 + 6) = -ctemp11; + *(b_offset1 + 7) = -ctemp12; + + b_offset1 += m * 4; + a_offset1 += 4; + a_offset2 += 4; + } + + if (n & 1){ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp9 = *(a_offset2 + 0); + ctemp10 = *(a_offset2 + 1); + + *(b_offset2 + 0) = -ctemp1; + *(b_offset2 + 1) = -ctemp2; + *(b_offset2 + 2) = -ctemp9; + *(b_offset2 + 3) = -ctemp10; + b_offset2 += 4; + } + j--; + } while(j > 0); + } + + if (m & 1){ + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset + 0); + ctemp2 = *(a_offset + 1); + ctemp3 = *(a_offset + 2); + ctemp4 = *(a_offset + 3); + + ctemp5 = *(a_offset + 4); + ctemp6 = *(a_offset + 5); + ctemp7 = *(a_offset + 6); + ctemp8 = *(a_offset + 7); + + *(b_offset + 0) = -ctemp1; + *(b_offset + 1) = -ctemp2; + *(b_offset + 2) = -ctemp3; + *(b_offset + 3) = -ctemp4; + + b_offset += m * 4; + + *(b_offset + 0) = -ctemp5; + *(b_offset + 1) = -ctemp6; + *(b_offset + 2) = -ctemp7; + *(b_offset + 3) = -ctemp8; + + b_offset += m * 4; + a_offset += 8; + i --; + } while(i > 0); + } + + if (n & 2){ + ctemp1 = *(a_offset + 0); + ctemp2 = *(a_offset + 1); + ctemp3 = *(a_offset + 2); + ctemp4 = *(a_offset + 3); + + *(b_offset + 0) = -ctemp1; + *(b_offset + 1) = -ctemp2; + *(b_offset + 2) = -ctemp3; + *(b_offset + 3) = -ctemp4; + + b_offset += m * 4; + a_offset += 4; + } + + if (n & 1){ + ctemp1 = *(a_offset + 0); + ctemp2 = *(a_offset + 1); + *(b_offset2 + 0) = -ctemp1; + *(b_offset2 + 1) = -ctemp2; + } + } + + return 0; +} diff --git a/kernel/generic/zneg_tcopy_4.c b/kernel/generic/zneg_tcopy_4.c new file mode 100644 index 0000000..7cd9887 --- /dev/null +++ b/kernel/generic/zneg_tcopy_4.c @@ -0,0 +1,403 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + FLOAT *boffset, *boffset1, *boffset2, *boffset3; + + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + + aoffset = a; + boffset = b; + lda *= 2; + + boffset2 = b + 2 * m * (n & ~3); + boffset3 = b + 2 * m * (n & ~1); + +#if 0 + fprintf(stderr, "m = %d n = %d\n", m,n ); +#endif + + j = (m >> 2); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + boffset1 = boffset; + boffset += 32; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + ctemp17 = *(aoffset3 + 0); + ctemp18 = *(aoffset3 + 1); + ctemp19 = *(aoffset3 + 2); + ctemp20 = *(aoffset3 + 3); + ctemp21 = *(aoffset3 + 4); + ctemp22 = *(aoffset3 + 5); + ctemp23 = *(aoffset3 + 6); + ctemp24 = *(aoffset3 + 7); + + ctemp25 = *(aoffset4 + 0); + ctemp26 = *(aoffset4 + 1); + ctemp27 = *(aoffset4 + 2); + ctemp28 = *(aoffset4 + 3); + ctemp29 = *(aoffset4 + 4); + ctemp30 = *(aoffset4 + 5); + ctemp31 = *(aoffset4 + 6); + ctemp32 = *(aoffset4 + 7); + + *(boffset1 + 0) = -ctemp01; + *(boffset1 + 1) = -ctemp02; + *(boffset1 + 2) = -ctemp03; + *(boffset1 + 3) = -ctemp04; + *(boffset1 + 4) = -ctemp05; + *(boffset1 + 5) = -ctemp06; + *(boffset1 + 6) = -ctemp07; + *(boffset1 + 7) = -ctemp08; + + *(boffset1 + 8) = -ctemp09; + *(boffset1 + 9) = -ctemp10; + *(boffset1 + 10) = -ctemp11; + *(boffset1 + 11) = -ctemp12; + *(boffset1 + 12) = -ctemp13; + *(boffset1 + 13) = -ctemp14; + *(boffset1 + 14) = -ctemp15; + *(boffset1 + 15) = -ctemp16; + + *(boffset1 + 16) = -ctemp17; + *(boffset1 + 17) = -ctemp18; + *(boffset1 + 18) = -ctemp19; + *(boffset1 + 19) = -ctemp20; + *(boffset1 + 20) = -ctemp21; + *(boffset1 + 21) = -ctemp22; + *(boffset1 + 22) = -ctemp23; + *(boffset1 + 23) = -ctemp24; + + *(boffset1 + 24) = -ctemp25; + *(boffset1 + 25) = -ctemp26; + *(boffset1 + 26) = -ctemp27; + *(boffset1 + 27) = -ctemp28; + *(boffset1 + 28) = -ctemp29; + *(boffset1 + 29) = -ctemp30; + *(boffset1 + 30) = -ctemp31; + *(boffset1 + 31) = -ctemp32; + + aoffset1 += 8; + aoffset2 += 8; + aoffset3 += 8; + aoffset4 += 8; + + boffset1 += m * 8; + i --; + }while(i > 0); + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + ctemp09 = *(aoffset3 + 0); + ctemp10 = *(aoffset3 + 1); + ctemp11 = *(aoffset3 + 2); + ctemp12 = *(aoffset3 + 3); + + ctemp13 = *(aoffset4 + 0); + ctemp14 = *(aoffset4 + 1); + ctemp15 = *(aoffset4 + 2); + ctemp16 = *(aoffset4 + 3); + + *(boffset2 + 0) = -ctemp01; + *(boffset2 + 1) = -ctemp02; + *(boffset2 + 2) = -ctemp03; + *(boffset2 + 3) = -ctemp04; + *(boffset2 + 4) = -ctemp05; + *(boffset2 + 5) = -ctemp06; + *(boffset2 + 6) = -ctemp07; + *(boffset2 + 7) = -ctemp08; + + *(boffset2 + 8) = -ctemp09; + *(boffset2 + 9) = -ctemp10; + *(boffset2 + 10) = -ctemp11; + *(boffset2 + 11) = -ctemp12; + *(boffset2 + 12) = -ctemp13; + *(boffset2 + 13) = -ctemp14; + *(boffset2 + 14) = -ctemp15; + *(boffset2 + 15) = -ctemp16; + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + + boffset2 += 16; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + + *(boffset3 + 0) = -ctemp01; + *(boffset3 + 1) = -ctemp02; + *(boffset3 + 2) = -ctemp03; + *(boffset3 + 3) = -ctemp04; + *(boffset3 + 4) = -ctemp05; + *(boffset3 + 5) = -ctemp06; + *(boffset3 + 6) = -ctemp07; + *(boffset3 + 7) = -ctemp08; + + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + + boffset3 += 8; + } + j--; + }while(j > 0); + } + + if (m & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + boffset1 = boffset; + boffset += 16; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + *(boffset1 + 0) = -ctemp01; + *(boffset1 + 1) = -ctemp02; + *(boffset1 + 2) = -ctemp03; + *(boffset1 + 3) = -ctemp04; + *(boffset1 + 4) = -ctemp05; + *(boffset1 + 5) = -ctemp06; + *(boffset1 + 6) = -ctemp07; + *(boffset1 + 7) = -ctemp08; + + *(boffset1 + 8) = -ctemp09; + *(boffset1 + 9) = -ctemp10; + *(boffset1 + 10) = -ctemp11; + *(boffset1 + 11) = -ctemp12; + *(boffset1 + 12) = -ctemp13; + *(boffset1 + 13) = -ctemp14; + *(boffset1 + 14) = -ctemp15; + *(boffset1 + 15) = -ctemp16; + + aoffset1 += 8; + aoffset2 += 8; + aoffset3 += 8; + aoffset4 += 8; + + boffset1 += m * 8; + i --; + }while(i > 0); + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + *(boffset2 + 0) = -ctemp01; + *(boffset2 + 1) = -ctemp02; + *(boffset2 + 2) = -ctemp03; + *(boffset2 + 3) = -ctemp04; + *(boffset2 + 4) = -ctemp05; + *(boffset2 + 5) = -ctemp06; + *(boffset2 + 6) = -ctemp07; + *(boffset2 + 7) = -ctemp08; + + aoffset1 += 4; + aoffset2 += 4; + + boffset2 += 8; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset3 + 0) = -ctemp01; + *(boffset3 + 1) = -ctemp02; + *(boffset3 + 2) = -ctemp03; + *(boffset3 + 3) = -ctemp04; + + aoffset1 += 2; + aoffset2 += 2; + boffset3 += 4; + } + } + + if (m & 1){ + aoffset1 = aoffset; + boffset1 = boffset; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + *(boffset1 + 0) = -ctemp01; + *(boffset1 + 1) = -ctemp02; + *(boffset1 + 2) = -ctemp03; + *(boffset1 + 3) = -ctemp04; + *(boffset1 + 4) = -ctemp05; + *(boffset1 + 5) = -ctemp06; + *(boffset1 + 6) = -ctemp07; + *(boffset1 + 7) = -ctemp08; + + aoffset1 += 8; + boffset1 += m * 8; + i --; + }while(i > 0); + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + *(boffset2 + 0) = -ctemp01; + *(boffset2 + 1) = -ctemp02; + *(boffset2 + 2) = -ctemp03; + *(boffset2 + 3) = -ctemp04; + + aoffset1 += 4; + boffset2 += 4; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset3 + 0) = -ctemp01; + *(boffset3 + 1) = -ctemp02; + } + } + + return 0; +} diff --git a/kernel/generic/zneg_tcopy_8.c b/kernel/generic/zneg_tcopy_8.c new file mode 100644 index 0000000..fe8f25c --- /dev/null +++ b/kernel/generic/zneg_tcopy_8.c @@ -0,0 +1,361 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2; + + FLOAT *boffset; + + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + + aoffset = a; + boffset = b; + lda *= 2; + +#if 0 + fprintf(stderr, "M = %d N = %d\n", m, n); +#endif + + j = (n >> 3); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 16; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + + ctemp17 = *(aoffset2 + 0); + ctemp18 = *(aoffset2 + 1); + ctemp19 = *(aoffset2 + 2); + ctemp20 = *(aoffset2 + 3); + ctemp21 = *(aoffset2 + 4); + ctemp22 = *(aoffset2 + 5); + ctemp23 = *(aoffset2 + 6); + ctemp24 = *(aoffset2 + 7); + ctemp25 = *(aoffset2 + 8); + ctemp26 = *(aoffset2 + 9); + ctemp27 = *(aoffset2 + 10); + ctemp28 = *(aoffset2 + 11); + ctemp29 = *(aoffset2 + 12); + ctemp30 = *(aoffset2 + 13); + ctemp31 = *(aoffset2 + 14); + ctemp32 = *(aoffset2 + 15); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + *(boffset + 8) = -ctemp09; + *(boffset + 9) = -ctemp10; + *(boffset + 10) = -ctemp11; + *(boffset + 11) = -ctemp12; + *(boffset + 12) = -ctemp13; + *(boffset + 13) = -ctemp14; + *(boffset + 14) = -ctemp15; + *(boffset + 15) = -ctemp16; + + *(boffset + 16) = -ctemp17; + *(boffset + 17) = -ctemp18; + *(boffset + 18) = -ctemp19; + *(boffset + 19) = -ctemp20; + *(boffset + 20) = -ctemp21; + *(boffset + 21) = -ctemp22; + *(boffset + 22) = -ctemp23; + *(boffset + 23) = -ctemp24; + + *(boffset + 24) = -ctemp25; + *(boffset + 25) = -ctemp26; + *(boffset + 26) = -ctemp27; + *(boffset + 27) = -ctemp28; + *(boffset + 28) = -ctemp29; + *(boffset + 29) = -ctemp30; + *(boffset + 30) = -ctemp31; + *(boffset + 31) = -ctemp32; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 32; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + *(boffset + 8) = -ctemp09; + *(boffset + 9) = -ctemp10; + *(boffset + 10) = -ctemp11; + *(boffset + 11) = -ctemp12; + *(boffset + 12) = -ctemp13; + *(boffset + 13) = -ctemp14; + *(boffset + 14) = -ctemp15; + *(boffset + 15) = -ctemp16; + + boffset += 16; + } + + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 8; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + *(boffset + 8) = -ctemp09; + *(boffset + 9) = -ctemp10; + *(boffset + 10) = -ctemp11; + *(boffset + 11) = -ctemp12; + *(boffset + 12) = -ctemp13; + *(boffset + 13) = -ctemp14; + *(boffset + 14) = -ctemp15; + *(boffset + 15) = -ctemp16; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 16; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + boffset += 8; + } + } + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 4; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 8; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + + boffset += 4; + } + } + + if (n & 1){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 2; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 4; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + boffset += 2; + } + } + + return 0; +} diff --git a/kernel/generic/zsymm3m_lcopy_1.c b/kernel/generic/zsymm3m_lcopy_1.c new file mode 100644 index 0000000..0e0d5a3 --- /dev/null +++ b/kernel/generic/zsymm3m_lcopy_1.c @@ -0,0 +1,99 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + lda *= 2; + + BLASLONG i, js, offset; + + FLOAT data01; + FLOAT *ao1; + + js = n; + + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + + posX ++; + js --; + } + + return 0; +} diff --git a/kernel/generic/zsymm3m_lcopy_2.c b/kernel/generic/zsymm3m_lcopy_2.c new file mode 100644 index 0000000..96686c1 --- /dev/null +++ b/kernel/generic/zsymm3m_lcopy_2.c @@ -0,0 +1,124 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + lda *= 2; + + BLASLONG i, js, offset; + + FLOAT data01, data02; + FLOAT *ao1, *ao2; + + js = (n >> 1); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + js --; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/zsymm3m_lcopy_4.c b/kernel/generic/zsymm3m_lcopy_4.c new file mode 100644 index 0000000..38a58cf --- /dev/null +++ b/kernel/generic/zsymm3m_lcopy_4.c @@ -0,0 +1,157 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2, *ao3, *ao4; + + lda *= 2; + + js = (n >> 2); + while (js > 0){ + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + js --; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/zsymm3m_lcopy_8.c b/kernel/generic/zsymm3m_lcopy_8.c new file mode 100644 index 0000000..4e5cddc --- /dev/null +++ b/kernel/generic/zsymm3m_lcopy_8.c @@ -0,0 +1,209 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + lda *= 2; + + js = (n >> 3); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda; + if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda; + if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda; + if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + if (offset > -4) ao5 += lda; else ao5 += 2; + if (offset > -5) ao6 += lda; else ao6 += 2; + if (offset > -6) ao7 += lda; else ao7 += 2; + if (offset > -7) ao8 += lda; else ao8 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 8; + js --; + } + + if (n & 4) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/zsymm3m_ucopy_1.c b/kernel/generic/zsymm3m_ucopy_1.c new file mode 100644 index 0000000..14ca6e7 --- /dev/null +++ b/kernel/generic/zsymm3m_ucopy_1.c @@ -0,0 +1,98 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01; + FLOAT *ao1; + + lda *= 2; + + js = n; + + while (js > 0){ + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + + posX ++; + js --; + } + + return 0; +} diff --git a/kernel/generic/zsymm3m_ucopy_2.c b/kernel/generic/zsymm3m_ucopy_2.c new file mode 100644 index 0000000..4ba1e69 --- /dev/null +++ b/kernel/generic/zsymm3m_ucopy_2.c @@ -0,0 +1,123 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02; + FLOAT *ao1, *ao2; + + lda *= 2; + + js = (n >> 1); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + js --; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/zsymm3m_ucopy_4.c b/kernel/generic/zsymm3m_ucopy_4.c new file mode 100644 index 0000000..8de026a --- /dev/null +++ b/kernel/generic/zsymm3m_ucopy_4.c @@ -0,0 +1,158 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2, *ao3, *ao4; + + lda *= 2; + + js = (n >> 2); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + js --; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/zsymm3m_ucopy_8.c b/kernel/generic/zsymm3m_ucopy_8.c new file mode 100644 index 0000000..79ef364 --- /dev/null +++ b/kernel/generic/zsymm3m_ucopy_8.c @@ -0,0 +1,210 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + lda *= 2; + + js = (n >> 3); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda; + if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda; + if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda; + if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + if (offset > -4) ao5 += 2; else ao5 += lda; + if (offset > -5) ao6 += 2; else ao6 += lda; + if (offset > -6) ao7 += 2; else ao7 += lda; + if (offset > -7) ao8 += 2; else ao8 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 8; + js --; + } + + if (n & 4) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/zsymm_lcopy_1.c b/kernel/generic/zsymm_lcopy_1.c new file mode 100644 index 0000000..1b4f58d --- /dev/null +++ b/kernel/generic/zsymm_lcopy_1.c @@ -0,0 +1,81 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02; + FLOAT *ao1; + + lda *= 2; + + js = n; + + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX ++; + js --; + } + + return 0; +} diff --git a/kernel/generic/zsymm_lcopy_2.c b/kernel/generic/zsymm_lcopy_2.c new file mode 100644 index 0000000..ce1b16e --- /dev/null +++ b/kernel/generic/zsymm_lcopy_2.c @@ -0,0 +1,112 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2; + + lda *= 2; + + js = (n >> 1); + + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 2; + js --; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zsymm_lcopy_4.c b/kernel/generic/zsymm_lcopy_4.c new file mode 100644 index 0000000..dd2034d --- /dev/null +++ b/kernel/generic/zsymm_lcopy_4.c @@ -0,0 +1,157 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT *ao1, *ao2, *ao3, *ao4; + + lda *= 2; + + js = (n >> 2); + + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 4; + js --; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 2; + + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zsymm_lcopy_8.c b/kernel/generic/zsymm_lcopy_8.c new file mode 100644 index 0000000..3397612 --- /dev/null +++ b/kernel/generic/zsymm_lcopy_8.c @@ -0,0 +1,224 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + lda *= 2; + + js = (n >> 3); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda; + if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda; + if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda; + if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + if (offset > -4) ao5 += lda; else ao5 += 2; + if (offset > -5) ao6 += lda; else ao6 += 2; + if (offset > -6) ao7 += lda; else ao7 += 2; + if (offset > -7) ao8 += lda; else ao8 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b += 16; + + offset --; + i --; + } + + posX += 8; + js --; + } + + if (n & 4) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 2; + + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zsymm_ucopy_1.c b/kernel/generic/zsymm_ucopy_1.c new file mode 100644 index 0000000..9943a2d --- /dev/null +++ b/kernel/generic/zsymm_ucopy_1.c @@ -0,0 +1,80 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02; + FLOAT *ao1; + + lda *= 2; + + js = n; + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX ++; + js --; + } + + return 0; +} diff --git a/kernel/generic/zsymm_ucopy_2.c b/kernel/generic/zsymm_ucopy_2.c new file mode 100644 index 0000000..da64cde --- /dev/null +++ b/kernel/generic/zsymm_ucopy_2.c @@ -0,0 +1,111 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2; + + lda *= 2; + + js = (n >> 1); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 2; + js --; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zsymm_ucopy_4.c b/kernel/generic/zsymm_ucopy_4.c new file mode 100644 index 0000000..eed0bca --- /dev/null +++ b/kernel/generic/zsymm_ucopy_4.c @@ -0,0 +1,155 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT *ao1, *ao2, *ao3, *ao4; + + lda *= 2; + + js = (n >> 2); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 4; + js --; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zsymm_ucopy_8.c b/kernel/generic/zsymm_ucopy_8.c new file mode 100644 index 0000000..c81a7a8 --- /dev/null +++ b/kernel/generic/zsymm_ucopy_8.c @@ -0,0 +1,224 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + lda *= 2; + + js = (n >> 3); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda; + if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda; + if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda; + if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + if (offset > -4) ao5 += 2; else ao5 += lda; + if (offset > -5) ao6 += 2; else ao6 += lda; + if (offset > -6) ao7 += 2; else ao7 += lda; + if (offset > -7) ao8 += 2; else ao8 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b += 16; + + offset --; + i --; + } + + posX += 8; + js --; + } + + if (n & 4) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zsymv_k.c b/kernel/generic/zsymv_k.c new file mode 100644 index 0000000..211def3 --- /dev/null +++ b/kernel/generic/zsymv_k.c @@ -0,0 +1,123 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#include "symcopy.h" + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ + + BLASLONG is, min_i; + FLOAT *X = x; + FLOAT *Y = y; + FLOAT *symbuffer = buffer; + FLOAT *gemvbuffer = (FLOAT *)(((BLASLONG)buffer + SYMV_P * SYMV_P * sizeof(FLOAT) * 2 + 4095) & ~4095); + FLOAT *bufferY = gemvbuffer; + FLOAT *bufferX = gemvbuffer; + + if (incy != 1) { + Y = bufferY; + bufferX = (FLOAT *)(((BLASLONG)bufferY + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + gemvbuffer = bufferX; + COPY_K(m, y, incy, Y, 1); + } + + if (incx != 1) { + X = bufferX; + gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + COPY_K(m, x, incx, X, 1); + } + +#ifndef LOWER + for(is = m - offset; is < m; is += SYMV_P){ + min_i = MIN(m - is, SYMV_P); +#else + for(is = 0; is < offset; is += SYMV_P){ + min_i = MIN(offset - is, SYMV_P); +#endif + +#ifndef LOWER + if (is >0){ + GEMV_T(is, min_i, 0, alpha_r, alpha_i, + a + is * lda * COMPSIZE, lda, + X, 1, + Y + is * COMPSIZE, 1, gemvbuffer); + + GEMV_N(is, min_i, 0, alpha_r, alpha_i, + a + is * lda * COMPSIZE, lda, + X + is * COMPSIZE, 1, + Y, 1, gemvbuffer); + } +#endif + +#ifdef LOWER + ZSYMCOPY_L(min_i, a + (is + is * lda) * COMPSIZE, lda, symbuffer); +#else + ZSYMCOPY_U(min_i, a + (is + is * lda) * COMPSIZE, lda, symbuffer); +#endif + + GEMV_N(min_i, min_i, 0, alpha_r, alpha_i, + symbuffer, min_i, + X + is * COMPSIZE, 1, + Y + is * COMPSIZE, 1, gemvbuffer); + + +#ifdef LOWER + if (m - is > min_i){ + GEMV_T(m - is - min_i, min_i, 0, alpha_r, alpha_i, + a + ((is + min_i) + is * lda) * COMPSIZE, lda, + X + (is + min_i) * COMPSIZE, 1, + Y + is * COMPSIZE, 1, gemvbuffer); + + GEMV_N(m - is - min_i, min_i, 0, alpha_r, alpha_i, + a + ((is + min_i) + is * lda) * COMPSIZE, lda, + X + is * COMPSIZE, 1, + Y + (is + min_i) * COMPSIZE, 1, gemvbuffer); + } +#endif + + } /* end of is */ + + if (incy != 1) { + COPY_K(m, Y, 1, y, incy); + } + + return 0; +} diff --git a/kernel/generic/ztrmm_lncopy_1.c b/kernel/generic/ztrmm_lncopy_1.c new file mode 100644 index 0000000..15a0509 --- /dev/null +++ b/kernel/generic/ztrmm_lncopy_1.c @@ -0,0 +1,107 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02; + FLOAT *ao1; + + lda += lda; + + js = n; + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + + ao1 += 2; + b += 2; + } else + if (X < posY) { + ao1 += lda; + b += 2; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; +#endif + ao1 += 2; + b += 2; + } + + X ++; + i --; + } while (i > 0); + } + + + posY ++; + js --; + } while (js > 0); + } /* End of main loop */ + + return 0; +} diff --git a/kernel/generic/ztrmm_lncopy_2.c b/kernel/generic/ztrmm_lncopy_2.c new file mode 100644 index 0000000..f41ee5b --- /dev/null +++ b/kernel/generic/ztrmm_lncopy_2.c @@ -0,0 +1,230 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04; + FLOAT data05, data06, data07, data08; + FLOAT *ao1, *ao2; + + lda += lda; + + js = (n >> 1); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data05; + b[ 3] = data06; + b[ 4] = data03; + b[ 5] = data04; + b[ 6] = data07; + b[ 7] = data08; + + ao1 += 4; + ao2 += 4; + b += 8; + } else + if (X < posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + + } else { +#ifdef UNIT + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data03; + b[ 5] = data04; + b[ 6] = ONE; + b[ 7] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data03; + b[ 5] = data04; + b[ 6] = data07; + b[ 7] = data08; +#endif + ao1 += 4; + ao2 += 4; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + ao1 += 2; + ao2 += 2; + b += 4; + } else + if (X < posY) { + ao1 += lda; + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 0] = ZERO; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ZERO; +#endif + + b += 4; + } + } + + posY += 2; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + b[ 0] = data01; + b[ 1] = data02; + b += 2; + ao1 += 2; + } else + if (X < posY) { + b += 2; + ao1 += lda; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; +#endif + b += 2; + ao1 += 2; + } + + X ++; + i --; + } while (i > 0); + } + + posY += 1; + } + return 0; +} diff --git a/kernel/generic/ztrmm_lncopy_4.c b/kernel/generic/ztrmm_lncopy_4.c new file mode 100644 index 0000000..76170c7 --- /dev/null +++ b/kernel/generic/ztrmm_lncopy_4.c @@ -0,0 +1,664 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + FLOAT *ao1, *ao2, *ao3, *ao4; + + lda += lda; + + js = (n >> 2); + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + ao3 = a + posY * 2 + (posX + 2) * lda; + ao4 = a + posY * 2 + (posX + 3) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + ao3 = a + posX * 2 + (posY + 2) * lda; + ao4 = a + posX * 2 + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + b[ 4] = data17; + b[ 5] = data18; + b[ 6] = data25; + b[ 7] = data26; + + b[ 8] = data03; + b[ 9] = data04; + b[10] = data11; + b[11] = data12; + b[12] = data19; + b[13] = data20; + b[14] = data27; + b[15] = data28; + + b[16] = data05; + b[17] = data06; + b[18] = data13; + b[19] = data14; + b[20] = data21; + b[21] = data22; + b[22] = data29; + b[23] = data30; + + b[24] = data07; + b[25] = data08; + b[26] = data15; + b[27] = data16; + b[28] = data23; + b[29] = data24; + b[30] = data31; + b[31] = data32; + + ao1 += 8; + ao2 += 8; + ao3 += 8; + ao4 += 8; + b += 32; + + } else + if (X < posY) { + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 32; + + } else { +#ifdef UNIT + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data03; + b[ 9] = data04; + b[10] = ONE; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + + b[16] = data05; + b[17] = data06; + b[18] = data13; + b[19] = data14; + b[20] = ONE; + b[21] = ZERO; + b[22] = ZERO; + b[23] = ZERO; + + b[24] = data07; + b[25] = data08; + b[26] = data15; + b[27] = data16; + b[28] = data23; + b[29] = data24; + b[30] = ONE; + b[31] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data03; + b[ 9] = data04; + b[10] = data11; + b[11] = data12; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + + b[16] = data05; + b[17] = data06; + b[18] = data13; + b[19] = data14; + b[20] = data21; + b[21] = data22; + b[22] = ZERO; + b[23] = ZERO; + + b[24] = data07; + b[25] = data08; + b[26] = data15; + b[27] = data16; + b[28] = data23; + b[29] = data24; + b[30] = data31; + b[31] = data32; +#endif + ao1 += 8; + ao2 += 8; + ao3 += 8; + ao4 += 8; + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X > posY) { + + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + b[ 4] = data17; + b[ 5] = data18; + b[ 6] = data25; + b[ 7] = data26; + + b[ 8] = data03; + b[ 9] = data04; + b[10] = data11; + b[11] = data12; + b[12] = data19; + b[13] = data20; + b[14] = data27; + b[15] = data28; + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + b[ 4] = data17; + b[ 5] = data18; + b[ 6] = data25; + b[ 7] = data26; + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + } else + if (X < posY) { + if (m & 2) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 16; + } + + if (m & 1) { + ao1 += lda; + b += 8; + } + + } else { +#ifdef UNIT + + if (i >= 2) { + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + } + + if (i >= 3) { + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + } + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + + if (i >= 2) { + b[ 0] = data03; + b[ 1] = data04; + b[ 2] = ONE; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 3) { + b[ 0] = data05; + b[ 1] = data06; + b[ 2] = data13; + b[ 3] = data14; + b[ 4] = ONE; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (i >= 2) { + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + } + + if (i >= 3) { + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + } + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + + if (i >= 2) { + b[ 0] = data03; + b[ 1] = data04; + b[ 2] = data11; + b[ 3] = data12; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 3) { + b[ 0] = data05; + b[ 1] = data06; + b[ 2] = data13; + b[ 3] = data14; + b[ 4] = data21; + b[ 5] = data22; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + b[ 4] = data03; + b[ 5] = data04; + b[ 6] = data11; + b[ 7] = data12; + + ao1 += 4; + ao2 += 4; + b += 8; + + } else + if (X < posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 8; + } else { +#ifdef UNIT + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data03; + b[ 5] = data04; + b[ 6] = ONE; + b[ 7] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data03; + b[ 5] = data04; + b[ 6] = data11; + b[ 7] = data12; +#endif + ao1 += 4; + ao2 += 4; + + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + + ao1 += 2; + ao2 += 2; + b += 4; + } else + if (X < posY) { + ao1 += lda; + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ZERO; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + b[ 0] = data01; + b[ 1] = data02; + ao1 += 2; + b += 2; + } else + if (X < posY) { + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; +#endif + ao1 += lda; + b += 2; + } + + X ++; + i --; + } while (i > 0); + } + + posY += 1; + } + + return 0; +} diff --git a/kernel/generic/ztrmm_lncopy_8.c b/kernel/generic/ztrmm_lncopy_8.c new file mode 100644 index 0000000..308ddd7 --- /dev/null +++ b/kernel/generic/ztrmm_lncopy_8.c @@ -0,0 +1,871 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X, ii; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + lda += lda; + + js = (n >> 3); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + ao3 = a + posY * 2 + (posX + 2) * lda; + ao4 = a + posY * 2 + (posX + 3) * lda; + ao5 = a + posY * 2 + (posX + 4) * lda; + ao6 = a + posY * 2 + (posX + 5) * lda; + ao7 = a + posY * 2 + (posX + 6) * lda; + ao8 = a + posY * 2 + (posX + 7) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + ao3 = a + posX * 2 + (posY + 2) * lda; + ao4 = a + posX * 2 + (posY + 3) * lda; + ao5 = a + posX * 2 + (posY + 4) * lda; + ao6 = a + posX * 2 + (posY + 5) * lda; + ao7 = a + posX * 2 + (posY + 6) * lda; + ao8 = a + posX * 2 + (posY + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X > posY) { + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + b[ 4] = *(ao3 + 0); + b[ 5] = *(ao3 + 1); + b[ 6] = *(ao4 + 0); + b[ 7] = *(ao4 + 1); + + b[ 8] = *(ao5 + 0); + b[ 9] = *(ao5 + 1); + b[ 10] = *(ao6 + 0); + b[ 11] = *(ao6 + 1); + b[ 12] = *(ao7 + 0); + b[ 13] = *(ao7 + 1); + b[ 14] = *(ao8 + 0); + b[ 15] = *(ao8 + 1); + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + ao5 += 2; + ao6 += 2; + ao7 += 2; + ao8 += 2; + b += 16; + } + } else + if (X < posY) { + ao1 += 8 * lda; + ao2 += 8 * lda; + ao3 += 8 * lda; + ao4 += 8 * lda; + ao5 += 8 * lda; + ao6 += 8 * lda; + ao7 += 8 * lda; + ao8 += 8 * lda; + + b += 128; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(ao1 + 2); + b[ 17] = *(ao1 + 3); +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(ao2 + 2); + b[ 19] = *(ao2 + 3); +#endif + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + + b[ 32] = *(ao1 + 4); + b[ 33] = *(ao1 + 5); + b[ 34] = *(ao2 + 4); + b[ 35] = *(ao2 + 5); +#ifdef UNIT + b[ 36] = ONE; + b[ 37] = ZERO; +#else + b[ 36] = *(ao3 + 4); + b[ 37] = *(ao3 + 5); +#endif + b[ 38] = ZERO; + b[ 39] = ZERO; + b[ 40] = ZERO; + b[ 41] = ZERO; + b[ 42] = ZERO; + b[ 43] = ZERO; + b[ 44] = ZERO; + b[ 45] = ZERO; + b[ 46] = ZERO; + b[ 47] = ZERO; + + b[ 48] = *(ao1 + 6); + b[ 49] = *(ao1 + 7); + b[ 50] = *(ao2 + 6); + b[ 51] = *(ao2 + 7); + b[ 52] = *(ao3 + 6); + b[ 53] = *(ao3 + 7); +#ifdef UNIT + b[ 54] = ONE; + b[ 55] = ZERO; +#else + b[ 54] = *(ao4 + 6); + b[ 55] = *(ao4 + 7); +#endif + b[ 56] = ZERO; + b[ 57] = ZERO; + b[ 58] = ZERO; + b[ 59] = ZERO; + b[ 60] = ZERO; + b[ 61] = ZERO; + b[ 62] = ZERO; + b[ 63] = ZERO; + + b[ 64] = *(ao1 + 8); + b[ 65] = *(ao1 + 9); + b[ 66] = *(ao2 + 8); + b[ 67] = *(ao2 + 9); + b[ 68] = *(ao3 + 8); + b[ 69] = *(ao3 + 9); + b[ 70] = *(ao4 + 8); + b[ 71] = *(ao4 + 9); +#ifdef UNIT + b[ 72] = ONE; + b[ 73] = ZERO; +#else + b[ 72] = *(ao5 + 8); + b[ 73] = *(ao5 + 9); +#endif + b[ 74] = ZERO; + b[ 75] = ZERO; + b[ 76] = ZERO; + b[ 77] = ZERO; + b[ 78] = ZERO; + b[ 79] = ZERO; + + b[ 80] = *(ao1 + 10); + b[ 81] = *(ao1 + 11); + b[ 82] = *(ao2 + 10); + b[ 83] = *(ao2 + 11); + b[ 84] = *(ao3 + 10); + b[ 85] = *(ao3 + 11); + b[ 86] = *(ao4 + 10); + b[ 87] = *(ao4 + 11); + b[ 88] = *(ao5 + 10); + b[ 89] = *(ao5 + 11); +#ifdef UNIT + b[ 90] = ONE; + b[ 91] = ZERO; +#else + b[ 90] = *(ao6 + 10); + b[ 91] = *(ao6 + 11); +#endif + b[ 92] = ZERO; + b[ 93] = ZERO; + b[ 94] = ZERO; + b[ 95] = ZERO; + + b[ 96] = *(ao1 + 12); + b[ 97] = *(ao1 + 13); + b[ 98] = *(ao2 + 12); + b[ 99] = *(ao2 + 13); + b[100] = *(ao3 + 12); + b[101] = *(ao3 + 13); + b[102] = *(ao4 + 12); + b[103] = *(ao4 + 13); + b[104] = *(ao5 + 12); + b[105] = *(ao5 + 13); + b[106] = *(ao6 + 12); + b[107] = *(ao6 + 13); +#ifdef UNIT + b[108] = ONE; + b[109] = ZERO; +#else + b[108] = *(ao7 + 12); + b[109] = *(ao7 + 13); +#endif + b[110] = ZERO; + b[111] = ZERO; + + b[112] = *(ao1 + 14); + b[113] = *(ao1 + 15); + b[114] = *(ao2 + 14); + b[115] = *(ao2 + 15); + b[116] = *(ao3 + 14); + b[117] = *(ao3 + 15); + b[118] = *(ao4 + 14); + b[119] = *(ao4 + 15); + b[120] = *(ao5 + 14); + b[121] = *(ao5 + 15); + b[122] = *(ao6 + 14); + b[123] = *(ao6 + 15); + b[124] = *(ao7 + 14); + b[125] = *(ao7 + 15); +#ifdef UNIT + b[126] = ONE; + b[127] = ZERO; +#else + b[126] = *(ao8 + 14); + b[127] = *(ao8 + 15); +#endif + + ao1 += 16; + ao2 += 16; + ao3 += 16; + ao4 += 16; + ao5 += 16; + ao6 += 16; + ao7 += 16; + ao8 += 16; + b += 128; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X > posY) { + + for (ii = 0; ii < i; ii++){ + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + b[ 4] = *(ao3 + 0); + b[ 5] = *(ao3 + 1); + b[ 6] = *(ao4 + 0); + b[ 7] = *(ao4 + 1); + + b[ 8] = *(ao5 + 0); + b[ 9] = *(ao5 + 1); + b[ 10] = *(ao6 + 0); + b[ 11] = *(ao6 + 1); + b[ 12] = *(ao7 + 0); + b[ 13] = *(ao7 + 1); + b[ 14] = *(ao8 + 0); + b[ 15] = *(ao8 + 1); + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + ao5 += 2; + ao6 += 2; + ao7 += 2; + ao8 += 2; + b += 16; + } + } else + if (X < posY) { + ao1 += i * lda; + ao2 += i * lda; + ao3 += i * lda; + ao4 += i * lda; + ao5 += i * lda; + ao6 += i * lda; + ao7 += i * lda; + ao8 += i * lda; + b += 16 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + + if (i >= 2) { + b[ 0] = *(ao1 + 2); + b[ 1] = *(ao1 + 3); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(ao2 + 2); + b[ 3] = *(ao2 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 3) { + b[ 0] = *(ao1 + 4); + b[ 1] = *(ao1 + 5); + b[ 2] = *(ao2 + 4); + b[ 3] = *(ao2 + 5); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(ao3 + 4); + b[ 5] = *(ao3 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 4) { + b[ 0] = *(ao1 + 6); + b[ 1] = *(ao1 + 7); + b[ 2] = *(ao2 + 6); + b[ 3] = *(ao2 + 7); + b[ 4] = *(ao3 + 6); + b[ 5] = *(ao3 + 7); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(ao4 + 6); + b[ 7] = *(ao4 + 7); +#endif + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 5) { + b[ 0] = *(ao1 + 8); + b[ 1] = *(ao1 + 9); + b[ 2] = *(ao2 + 8); + b[ 3] = *(ao2 + 9); + b[ 4] = *(ao3 + 8); + b[ 5] = *(ao3 + 9); + b[ 6] = *(ao4 + 8); + b[ 7] = *(ao4 + 9); +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(ao5 + 8); + b[ 9] = *(ao5 + 9); +#endif + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 6) { + b[ 0] = *(ao1 + 10); + b[ 1] = *(ao1 + 11); + b[ 2] = *(ao2 + 10); + b[ 3] = *(ao2 + 11); + b[ 4] = *(ao3 + 10); + b[ 5] = *(ao3 + 11); + b[ 6] = *(ao4 + 10); + b[ 7] = *(ao4 + 11); + b[ 8] = *(ao5 + 10); + b[ 9] = *(ao5 + 11); +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(ao6 + 10); + b[11] = *(ao6 + 11); +#endif + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 7) { + b[ 0] = *(ao1 + 12); + b[ 1] = *(ao1 + 13); + b[ 2] = *(ao2 + 12); + b[ 3] = *(ao2 + 13); + b[ 4] = *(ao3 + 12); + b[ 5] = *(ao3 + 13); + b[ 6] = *(ao4 + 12); + b[ 7] = *(ao4 + 13); + b[ 8] = *(ao5 + 12); + b[ 9] = *(ao5 + 13); + b[10] = *(ao6 + 12); + b[11] = *(ao6 + 13); +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(ao7 + 12); + b[13] = *(ao7 + 13); +#endif + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + } + } + + posY += 8; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + ao3 = a + posY * 2 + (posX + 2) * lda; + ao4 = a + posY * 2 + (posX + 3) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + ao3 = a + posX * 2 + (posY + 2) * lda; + ao4 = a + posX * 2 + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + for (ii = 0; ii < 4; ii++){ + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + b[ 4] = *(ao3 + 0); + b[ 5] = *(ao3 + 1); + b[ 6] = *(ao4 + 0); + b[ 7] = *(ao4 + 1); + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + } else + if (X < posY) { + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 32; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = *(ao1 + 2); + b[ 9] = *(ao1 + 3); +#ifdef UNIT + b[ 10] = ONE; + b[ 11] = ZERO; +#else + b[ 10] = *(ao2 + 2); + b[ 11] = *(ao2 + 3); +#endif + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(ao1 + 4); + b[ 17] = *(ao1 + 5); + b[ 18] = *(ao2 + 4); + b[ 19] = *(ao2 + 5); +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(ao3 + 4); + b[ 21] = *(ao3 + 5); +#endif + b[ 22] = ZERO; + b[ 23] = ZERO; + + b[ 24] = *(ao1 + 6); + b[ 25] = *(ao1 + 7); + b[ 26] = *(ao2 + 6); + b[ 27] = *(ao2 + 7); + b[ 28] = *(ao3 + 6); + b[ 29] = *(ao3 + 7); +#ifdef UNIT + b[ 30] = ONE; + b[ 31] = ZERO; +#else + b[ 30] = *(ao4 + 6); + b[ 31] = *(ao4 + 7); +#endif + + ao1 += 8; + ao2 += 8; + ao3 += 8; + ao4 += 8; + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X > posY) { + + for (ii = 0; ii < i; ii++){ + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + b[ 4] = *(ao3 + 0); + b[ 5] = *(ao3 + 1); + b[ 6] = *(ao4 + 0); + b[ 7] = *(ao4 + 1); + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + } else + if (X < posY) { + ao1 += i * lda; + ao2 += i * lda; + ao3 += i * lda; + ao4 += i * lda; + b += 8 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + + if (i >= 2) { + b[ 0] = *(ao1 + 2); + b[ 1] = *(ao1 + 3); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(ao2 + 2); + b[ 3] = *(ao2 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 3) { + b[ 0] = *(ao1 + 4); + b[ 1] = *(ao1 + 5); + b[ 2] = *(ao2 + 4); + b[ 3] = *(ao2 + 5); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(ao3 + 4); + b[ 5] = *(ao3 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + } + } + + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + b[ 4] = *(ao1 + 2); + b[ 5] = *(ao1 + 3); + b[ 6] = *(ao2 + 2); + b[ 7] = *(ao2 + 3); + + ao1 += 4; + ao2 += 4; + b += 8; + } else + if (X < posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = *(ao1 + 2); + b[ 5] = *(ao1 + 3); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(ao2 + 2); + b[ 7] = *(ao2 + 3); +#endif + ao1 += 4; + ao2 += 4; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X > posY) { + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + ao1 += 2; + ao2 += 2; + b += 4; + } else + if (X < posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X > posY) { + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + ao1 += 2; + b += 2; + } else + if (X < posY) { + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); +#endif + ao1 += 2; + b += 2; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/ztrmm_ltcopy_1.c b/kernel/generic/ztrmm_ltcopy_1.c new file mode 100644 index 0000000..1229b45 --- /dev/null +++ b/kernel/generic/ztrmm_ltcopy_1.c @@ -0,0 +1,104 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02; + FLOAT *ao1; + + lda += lda; + + js = n; + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + if (X > posY) { + ao1 += 2; + b += 2; + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; +#endif + ao1 += 2; + b += 2; + } + + X ++; + i --; + } while (i > 0); + } + + posY ++; + js --; + } while (js > 0); + } /* End of main loop */ + + return 0; +} diff --git a/kernel/generic/ztrmm_ltcopy_2.c b/kernel/generic/ztrmm_ltcopy_2.c new file mode 100644 index 0000000..7bcadf3 --- /dev/null +++ b/kernel/generic/ztrmm_ltcopy_2.c @@ -0,0 +1,240 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data1, data2, data3, data4, data5, data6, data7, data8; + + FLOAT *ao1, *ao2; + + lda += lda; + + js = (n >> 1); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + ao1 += 4; + ao2 += 4; + b += 8; + + } else + if (X < posY) { + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + data3 = *(ao1 + 2); + data4 = *(ao1 + 3); + + data5 = *(ao2 + 0); + data6 = *(ao2 + 1); + data7 = *(ao2 + 2); + data8 = *(ao2 + 3); + + b[ 0] = data1; + b[ 1] = data2; + b[ 2] = data3; + b[ 3] = data4; + b[ 4] = data5; + b[ 5] = data6; + b[ 6] = data7; + b[ 7] = data8; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + + } else { +#ifdef UNIT + data3 = *(ao1 + 2); + data4 = *(ao1 + 3); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data3; + b[ 3] = data4; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ONE; + b[ 7] = ZERO; +#else + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + data3 = *(ao1 + 2); + data4 = *(ao1 + 3); + + data7 = *(ao2 + 2); + data8 = *(ao2 + 3); + + b[ 0] = data1; + b[ 1] = data2; + b[ 2] = data3; + b[ 3] = data4; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = data7; + b[ 7] = data8; +#endif + ao1 += 4; + ao2 += 4; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X > posY) { + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X < posY) { + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + data3 = *(ao1 + 2); + data4 = *(ao1 + 3); + + b[ 0] = data1; + b[ 1] = data2; + b[ 2] = data3; + b[ 3] = data4; + + ao1 += lda; + b += 4; + } else { +#ifdef UNIT + data3 = *(ao1 + 2); + data4 = *(ao1 + 3); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data3; + b[ 3] = data4; +#else + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + data3 = *(ao1 + 2); + data4 = *(ao1 + 3); + + b[ 0] = data1; + b[ 1] = data2; + b[ 2] = data3; + b[ 3] = data4; +#endif + b += 4; + } + } + + posY += 2; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + if (X > posY) { + b += 2; + ao1 += 2; + } else + if (X < posY) { + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + + b[ 0] = data1; + b[ 1] = data2; + b += 2; + ao1 += lda; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + + b[ 0] = data1; + b[ 1] = data2; +#endif + b += 2; + ao1 += 2; + } + + X ++; + i --; + } while (i > 0); + } + + posY += 1; + } + + return 0; +} diff --git a/kernel/generic/ztrmm_ltcopy_4.c b/kernel/generic/ztrmm_ltcopy_4.c new file mode 100644 index 0000000..e43ed12 --- /dev/null +++ b/kernel/generic/ztrmm_ltcopy_4.c @@ -0,0 +1,685 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + FLOAT *ao1, *ao2, *ao3, *ao4; + + lda += lda; + + js = (n >> 2); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + ao3 = a + posY * 2 + (posX + 2) * lda; + ao4 = a + posY * 2 + (posX + 3) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + ao3 = a + posX * 2 + (posY + 2) * lda; + ao4 = a + posX * 2 + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + ao1 += 8; + ao2 += 8; + ao3 += 8; + ao4 += 8; + b += 32; + + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 32; + } else { + +#ifdef UNIT + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ONE; + b[11] = ZERO; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b[16] = ZERO; + b[17] = ZERO; + b[18] = ZERO; + b[19] = ZERO; + b[20] = ONE; + b[21] = ZERO; + b[22] = data23; + b[23] = data24; + + b[24] = ZERO; + b[25] = ZERO; + b[26] = ZERO; + b[27] = ZERO; + b[28] = ZERO; + b[29] = ZERO; + b[30] = ONE; + b[31] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b[16] = ZERO; + b[17] = ZERO; + b[18] = ZERO; + b[19] = ZERO; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + + b[24] = ZERO; + b[25] = ZERO; + b[26] = ZERO; + b[27] = ZERO; + b[28] = ZERO; + b[29] = ZERO; + b[30] = data31; + b[31] = data32; +#endif + ao1 += 8; + ao2 += 8; + ao3 += 8; + ao4 += 8; + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X > posY) { + + if (m & 2) { + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } + + if (m & 1) { + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + } else + if (X < posY) { + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 16; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + ao1 += lda; + b += 8; + } + + } else { +#ifdef UNIT + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + if (i >= 2) { + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + } + + if (i >= 3) { + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + } + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b += 8; + + if (i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ONE; + b[ 3] = ZERO; + b[ 4] = data13; + b[ 5] = data14; + b[ 6] = data15; + b[ 7] = data16; + b += 8; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ONE; + b[ 5] = ZERO; + b[ 6] = data23; + b[ 7] = data24; + b += 8; + } +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + if (i >= 2) { + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + } + + if (i >= 3) { + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + } + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b += 8; + + if (i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = data11; + b[ 3] = data12; + b[ 4] = data13; + b[ 5] = data14; + b[ 6] = data15; + b[ 7] = data16; + b += 8; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data21; + b[ 5] = data22; + b[ 6] = data23; + b[ 7] = data24; + b += 8; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + ao1 += 4; + ao2 += 4; + b += 8; + + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data09; + b[ 5] = data10; + b[ 6] = data11; + b[ 7] = data12; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + + } else { +#ifdef UNIT + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ONE; + b[ 7] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = data11; + b[ 7] = data12; +#endif + ao1 += 4; + ao2 += 4; + + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X > posY) { + ao1 += 2; + ao2 += 2; + + b += 4; + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + ao1 += lda; + b += 4; + + } else { +#ifdef UNIT + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + + if (X > posY) { + b += 2; + ao1 += 2; + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + ao1 += lda; + b += 2; + } else { + +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; +#endif + b += 2; + } + + X ++; + i --; + } while (i > 0); + } + + posY += 1; + } + + return 0; +} diff --git a/kernel/generic/ztrmm_ltcopy_8.c b/kernel/generic/ztrmm_ltcopy_8.c new file mode 100644 index 0000000..e25d922 --- /dev/null +++ b/kernel/generic/ztrmm_ltcopy_8.c @@ -0,0 +1,876 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, ii; + BLASLONG X; + + FLOAT *a01, *a02, *a03 ,*a04, *a05, *a06, *a07, *a08; + + lda *= 2; + + js = (n >> 3); + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X > posY) { + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + b += 128; + } else + if (X < posY) { + + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + b += 16; + } + + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + b[ 16] = ZERO; + b[ 17] = ZERO; +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a02 + 2); + b[ 19] = *(a02 + 3); +#endif + b[ 20] = *(a02 + 4); + b[ 21] = *(a02 + 5); + b[ 22] = *(a02 + 6); + b[ 23] = *(a02 + 7); + b[ 24] = *(a02 + 8); + b[ 25] = *(a02 + 9); + b[ 26] = *(a02 + 10); + b[ 27] = *(a02 + 11); + b[ 28] = *(a02 + 12); + b[ 29] = *(a02 + 13); + b[ 30] = *(a02 + 14); + b[ 31] = *(a02 + 15); + + b[ 32] = ZERO; + b[ 33] = ZERO; + b[ 34] = ZERO; + b[ 35] = ZERO; +#ifdef UNIT + b[ 36] = ONE; + b[ 37] = ZERO; +#else + b[ 36] = *(a03 + 4); + b[ 37] = *(a03 + 5); +#endif + b[ 38] = *(a03 + 6); + b[ 39] = *(a03 + 7); + b[ 40] = *(a03 + 8); + b[ 41] = *(a03 + 9); + b[ 42] = *(a03 + 10); + b[ 43] = *(a03 + 11); + b[ 44] = *(a03 + 12); + b[ 45] = *(a03 + 13); + b[ 46] = *(a03 + 14); + b[ 47] = *(a03 + 15); + + b[ 48] = ZERO; + b[ 49] = ZERO; + b[ 50] = ZERO; + b[ 51] = ZERO; + b[ 52] = ZERO; + b[ 53] = ZERO; +#ifdef UNIT + b[ 54] = ONE; + b[ 55] = ZERO; +#else + b[ 54] = *(a04 + 6); + b[ 55] = *(a04 + 7); +#endif + b[ 56] = *(a04 + 8); + b[ 57] = *(a04 + 9); + b[ 58] = *(a04 + 10); + b[ 59] = *(a04 + 11); + b[ 60] = *(a04 + 12); + b[ 61] = *(a04 + 13); + b[ 62] = *(a04 + 14); + b[ 63] = *(a04 + 15); + + b[ 64] = ZERO; + b[ 65] = ZERO; + b[ 66] = ZERO; + b[ 67] = ZERO; + b[ 68] = ZERO; + b[ 69] = ZERO; + b[ 70] = ZERO; + b[ 71] = ZERO; +#ifdef UNIT + b[ 72] = ONE; + b[ 73] = ZERO; +#else + b[ 72] = *(a05 + 8); + b[ 73] = *(a05 + 9); +#endif + b[ 74] = *(a05 + 10); + b[ 75] = *(a05 + 11); + b[ 76] = *(a05 + 12); + b[ 77] = *(a05 + 13); + b[ 78] = *(a05 + 14); + b[ 79] = *(a05 + 15); + + b[ 80] = ZERO; + b[ 81] = ZERO; + b[ 82] = ZERO; + b[ 83] = ZERO; + b[ 84] = ZERO; + b[ 85] = ZERO; + b[ 86] = ZERO; + b[ 87] = ZERO; + b[ 88] = ZERO; + b[ 89] = ZERO; +#ifdef UNIT + b[ 90] = ONE; + b[ 91] = ZERO; +#else + b[ 90] = *(a06 + 10); + b[ 91] = *(a06 + 11); +#endif + b[ 92] = *(a06 + 12); + b[ 93] = *(a06 + 13); + b[ 94] = *(a06 + 14); + b[ 95] = *(a06 + 15); + + b[ 96] = ZERO; + b[ 97] = ZERO; + b[ 98] = ZERO; + b[ 99] = ZERO; + b[100] = ZERO; + b[101] = ZERO; + b[102] = ZERO; + b[103] = ZERO; + b[104] = ZERO; + b[105] = ZERO; + b[106] = ZERO; + b[107] = ZERO; +#ifdef UNIT + b[108] = ONE; + b[109] = ZERO; +#else + b[108] = *(a07 + 12); + b[109] = *(a07 + 13); +#endif + b[110] = *(a07 + 14); + b[111] = *(a07 + 15); + + b[112] = ZERO; + b[113] = ZERO; + b[114] = ZERO; + b[115] = ZERO; + b[116] = ZERO; + b[117] = ZERO; + b[118] = ZERO; + b[119] = ZERO; + b[120] = ZERO; + b[121] = ZERO; + b[122] = ZERO; + b[123] = ZERO; + b[124] = ZERO; + b[125] = ZERO; +#ifdef UNIT + b[126] = ONE; + b[127] = ZERO; +#else + b[126] = *(a08 + 14); + b[127] = *(a08 + 15); +#endif + + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + b += 128; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i > 0) { + if (X > posY) { + a01 += 2 * i; + a02 += 2 * i; + a03 += 2 * i; + a04 += 2 * i; + a05 += 2 * i; + a06 += 2 * i; + a07 += 2 * i; + a08 += 2 * i; + b += 16 * i; + } else + if (X < posY) { + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + a05 += lda; + a06 += lda; + a07 += lda; + a08 += lda; + b += 16; + } + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + b += 16; + + if (i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = *(a02 + 4); + b[ 5] = *(a02 + 5); + b[ 6] = *(a02 + 6); + b[ 7] = *(a02 + 7); + + b[ 8] = *(a02 + 8); + b[ 9] = *(a02 + 9); + b[10] = *(a02 + 10); + b[11] = *(a02 + 11); + b[12] = *(a02 + 12); + b[13] = *(a02 + 13); + b[14] = *(a02 + 14); + b[15] = *(a02 + 15); + b += 16; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = *(a03 + 6); + b[ 7] = *(a03 + 7); + + b[ 8] = *(a03 + 8); + b[ 9] = *(a03 + 9); + b[10] = *(a03 + 10); + b[11] = *(a03 + 11); + b[12] = *(a03 + 12); + b[13] = *(a03 + 13); + b[14] = *(a03 + 14); + b[15] = *(a03 + 15); + b += 16; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + + b[ 8] = *(a04 + 8); + b[ 9] = *(a04 + 9); + b[10] = *(a04 + 10); + b[11] = *(a04 + 11); + b[12] = *(a04 + 12); + b[13] = *(a04 + 13); + b[14] = *(a04 + 14); + b[15] = *(a04 + 15); + b += 16; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[10] = *(a05 + 10); + b[11] = *(a05 + 11); + b[12] = *(a05 + 12); + b[13] = *(a05 + 13); + b[14] = *(a05 + 14); + b[15] = *(a05 + 15); + b += 16; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[12] = *(a06 + 12); + b[13] = *(a06 + 13); + b[14] = *(a06 + 14); + b[15] = *(a06 + 15); + b += 16; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[14] = *(a07 + 14); + b[15] = *(a07 + 15); + b += 16; + } + } + } + + posY += 8; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + b += 32; + } else + if (X < posY) { + for (ii = 0; ii < 4; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + b += 8; + } + + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[ 10] = ONE; + b[ 11] = ZERO; +#else + b[ 10] = *(a02 + 2); + b[ 11] = *(a02 + 3); +#endif + b[ 12] = *(a02 + 4); + b[ 13] = *(a02 + 5); + b[ 14] = *(a02 + 6); + b[ 15] = *(a02 + 7); + + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a03 + 4); + b[ 21] = *(a03 + 5); +#endif + b[ 22] = *(a03 + 6); + b[ 23] = *(a03 + 7); + + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; +#ifdef UNIT + b[ 30] = ONE; + b[ 31] = ZERO; +#else + b[ 30] = *(a04 + 6); + b[ 31] = *(a04 + 7); +#endif + + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i > 0) { + if (X > posY) { + a01 += 2 * i; + a02 += 2 * i; + a03 += 2 * i; + a04 += 2 * i; + b += 8 * i; + } else + if (X < posY) { + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + b += 8; + } + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b += 8; + + if (i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = *(a02 + 4); + b[ 5] = *(a02 + 5); + b[ 6] = *(a02 + 6); + b[ 7] = *(a02 + 7); + b += 8; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = *(a03 + 6); + b[ 7] = *(a03 + 7); + b += 8; + } + } + } + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + a01 += 4; + a02 += 4; + b += 8; + } else + if (X < posY) { + b[0] = *(a01 + 0); + b[1] = *(a01 + 1); + b[2] = *(a01 + 2); + b[3] = *(a01 + 3); + b[4] = *(a02 + 0); + b[5] = *(a02 + 1); + b[6] = *(a02 + 2); + b[7] = *(a02 + 3); + a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } else { +#ifdef UNIT + b[0] = ONE; + b[1] = ZERO; +#else + b[0] = *(a01 + 0); + b[1] = *(a01 + 1); +#endif + b[2] = *(a01 + 2); + b[3] = *(a01 + 3); + + b[4] = ZERO; + b[5] = ZERO; +#ifdef UNIT + b[6] = ONE; + b[7] = ZERO; +#else + b[6] = *(a02 + 2); + b[7] = *(a02 + 3); +#endif + a01 += 4; + a02 += 4; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i > 0) { + if (X > posY) { + a01 += 2; + a02 += 2; + b += 4; + } else + if (X < posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + + a01 += lda; + a02 += lda; + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b += 4; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + + if (X > posY) { + a01 += 2; + b += 2; + } else + if (X < posY) { + b[0] = *(a01 + 0); + b[1] = *(a01 + 1); + a01 += lda; + b += 2; + } else { +#ifdef UNIT + b[0] = ONE; + b[1] = ZERO; +#else + b[0] = *(a01 + 0); + b[1] = *(a01 + 1); +#endif + a01 += 2; + b += 2; + } + + X += 1; + i --; + } while (i > 0); + } + posY += 1; + } + + return 0; +} diff --git a/kernel/generic/ztrmm_uncopy_1.c b/kernel/generic/ztrmm_uncopy_1.c new file mode 100644 index 0000000..595f009 --- /dev/null +++ b/kernel/generic/ztrmm_uncopy_1.c @@ -0,0 +1,109 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02; + FLOAT *ao1; + + lda += lda; + + js = n; + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + } + + i = m; + if (i > 0) { + do { + + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + + ao1 += 2; + b += 2; + + } else + if (X > posY) { + ao1 += lda; + b += 2; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; +#endif + + ao1 += lda; + b += 2; + } + + X ++; + i --; + } while (i > 0); + } + + posY ++; + js --; + } while (js > 0); + } /* End of main loop */ + + return 0; +} diff --git a/kernel/generic/ztrmm_uncopy_2.c b/kernel/generic/ztrmm_uncopy_2.c new file mode 100644 index 0000000..6beddf5 --- /dev/null +++ b/kernel/generic/ztrmm_uncopy_2.c @@ -0,0 +1,239 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04; + FLOAT data05, data06, data07, data08; + FLOAT *ao1, *ao2; + + lda += lda; + + js = (n >> 1); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data05; + b[ 3] = data06; + b[ 4] = data03; + b[ 5] = data04; + b[ 6] = data07; + b[ 7] = data08; + + ao1 += 4; + ao2 += 4; + b += 8; + + } else + if (X > posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + + } else { + +#ifdef UNIT + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data05; + b[ 3] = data06; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ONE; + b[ 7] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data05; + b[ 3] = data06; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = data08; +#endif + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + ao1 += 2; + ao2 += 2; + b += 4; + } else + if (X > posY) { + ao1 += lda; + b += 4; + } else { +#ifdef UNIT + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; +#endif + b += 4; + } + } + + posY += 2; + js --; + } while (js > 0); + } /* End of main loop */ + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + ao1 += 2; + b += 2; + } else + if (X > posY) { + b += 2; + ao1 += lda; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; +#endif + b += 2; + ao1 += lda; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/ztrmm_uncopy_4.c b/kernel/generic/ztrmm_uncopy_4.c new file mode 100644 index 0000000..f885b0d --- /dev/null +++ b/kernel/generic/ztrmm_uncopy_4.c @@ -0,0 +1,679 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + FLOAT *ao1, *ao2, *ao3, *ao4; + + lda += lda; + + js = (n >> 2); + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + ao3 = a + posX * 2 + (posY + 2) * lda; + ao4 = a + posX * 2 + (posY + 3) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + ao3 = a + posY * 2 + (posX + 2) * lda; + ao4 = a + posY * 2 + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + b[ 4] = data17; + b[ 5] = data18; + b[ 6] = data25; + b[ 7] = data26; + + b[ 8] = data03; + b[ 9] = data04; + b[10] = data11; + b[11] = data12; + b[12] = data19; + b[13] = data20; + b[14] = data27; + b[15] = data28; + + b[16] = data05; + b[17] = data06; + b[18] = data13; + b[19] = data14; + b[20] = data21; + b[21] = data22; + b[22] = data29; + b[23] = data30; + + b[24] = data07; + b[25] = data08; + b[26] = data15; + b[27] = data16; + b[28] = data23; + b[29] = data24; + b[30] = data31; + b[31] = data32; + + ao1 += 8; + ao2 += 8; + ao3 += 8; + ao4 += 8; + b += 32; + } else + if (X > posY) { + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 32; + } else { +#ifdef UNIT + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data09; + b[ 3] = data10; + b[ 4] = data17; + b[ 5] = data18; + b[ 6] = data25; + b[ 7] = data26; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ONE; + b[11] = ZERO; + b[12] = data19; + b[13] = data20; + b[14] = data27; + b[15] = data28; + + b[16] = ZERO; + b[17] = ZERO; + b[18] = ZERO; + b[19] = ZERO; + b[20] = ONE; + b[21] = ZERO; + b[22] = data29; + b[23] = data30; + + b[24] = ZERO; + b[25] = ZERO; + b[26] = ZERO; + b[27] = ZERO; + b[28] = ZERO; + b[29] = ZERO; + b[30] = ONE; + b[31] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + b[ 4] = data17; + b[ 5] = data18; + b[ 6] = data25; + b[ 7] = data26; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = data11; + b[11] = data12; + b[12] = data19; + b[13] = data20; + b[14] = data27; + b[15] = data28; + + b[16] = ZERO; + b[17] = ZERO; + b[18] = ZERO; + b[19] = ZERO; + b[20] = data21; + b[21] = data22; + b[22] = data29; + b[23] = data30; + + b[24] = ZERO; + b[25] = ZERO; + b[26] = ZERO; + b[27] = ZERO; + b[28] = ZERO; + b[29] = ZERO; + b[30] = data31; + b[31] = data32; +#endif + ao1 += 8; + ao2 += 8; + ao3 += 8; + ao4 += 8; + + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + b[ 4] = data17; + b[ 5] = data18; + b[ 6] = data25; + b[ 7] = data26; + + b[ 8] = data03; + b[ 9] = data04; + b[10] = data11; + b[11] = data12; + b[12] = data19; + b[13] = data20; + b[14] = data27; + b[15] = data28; + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + b[ 4] = data17; + b[ 5] = data18; + b[ 6] = data25; + b[ 7] = data26; + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + } else + if (X > posY) { + if (m & 2) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 16; + } + + if (m & 1) { + ao1 += lda; + b += 8; + } + + } else { + +#ifdef UNIT + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + + if (i >= 2) { + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + } + + if (i >= 3) { + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + } + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data09; + b[ 3] = data10; + b[ 4] = data17; + b[ 5] = data18; + b[ 6] = data25; + b[ 7] = data26; + b += 8; + + if (i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ONE; + b[ 3] = ZERO; + b[ 4] = data19; + b[ 5] = data20; + b[ 6] = data27; + b[ 7] = data28; + b += 8; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ONE; + b[ 5] = ZERO; + b[ 6] = data29; + b[ 7] = data30; + b += 8; + } +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + + if (i >= 2) { + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + } + + if (i >= 3) { + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + } + + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + b[ 4] = data17; + b[ 5] = data18; + b[ 6] = data25; + b[ 7] = data26; + b += 8; + + if (i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = data11; + b[ 3] = data12; + b[ 4] = data19; + b[ 5] = data20; + b[ 6] = data27; + b[ 7] = data28; + b += 8; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data21; + b[ 5] = data22; + b[ 6] = data29; + b[ 7] = data30; + b += 8; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + b[ 4] = data03; + b[ 5] = data04; + b[ 6] = data11; + b[ 7] = data12; + + ao1 += 4; + ao2 += 4; + b += 8; + + } else + if (X > posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + + } else { +#ifdef UNIT + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data09; + b[ 3] = data10; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ONE; + b[ 7] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = data11; + b[ 7] = data12; +#endif + ao1 += 4; + ao2 += 4; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + ao1 += 2; + ao2 += 2; + b += 4; + } else + if (X > posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data09; + b[ 3] = data10; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; +#endif + ao1 += 2; + ao2 += 2; + b += 4; + } + } + + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + b[ 0] = data01; + b[ 1] = data02; + ao1 += 2; + b += 2; + } else + if (X > posY) { + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + b[ 0] = data01; + b[ 1] = data02; +#endif + ao1 += 2; + b += 2; + } + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/ztrmm_uncopy_8.c b/kernel/generic/ztrmm_uncopy_8.c new file mode 100644 index 0000000..c02c1de --- /dev/null +++ b/kernel/generic/ztrmm_uncopy_8.c @@ -0,0 +1,876 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, ii; + BLASLONG X; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + lda += lda; + + js = (n >> 3); + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + ao3 = a + posX * 2 + (posY + 2) * lda; + ao4 = a + posX * 2 + (posY + 3) * lda; + ao5 = a + posX * 2 + (posY + 4) * lda; + ao6 = a + posX * 2 + (posY + 5) * lda; + ao7 = a + posX * 2 + (posY + 6) * lda; + ao8 = a + posX * 2 + (posY + 7) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + ao3 = a + posY * 2 + (posX + 2) * lda; + ao4 = a + posY * 2 + (posX + 3) * lda; + ao5 = a + posY * 2 + (posX + 4) * lda; + ao6 = a + posY * 2 + (posX + 5) * lda; + ao7 = a + posY * 2 + (posX + 6) * lda; + ao8 = a + posY * 2 + (posX + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X < posY) { + + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + b[ 4] = *(ao3 + 0); + b[ 5] = *(ao3 + 1); + b[ 6] = *(ao4 + 0); + b[ 7] = *(ao4 + 1); + + b[ 8] = *(ao5 + 0); + b[ 9] = *(ao5 + 1); + b[ 10] = *(ao6 + 0); + b[ 11] = *(ao6 + 1); + b[ 12] = *(ao7 + 0); + b[ 13] = *(ao7 + 1); + b[ 14] = *(ao8 + 0); + b[ 15] = *(ao8 + 1); + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + ao5 += 2; + ao6 += 2; + ao7 += 2; + ao8 += 2; + b += 16; + } + } else + if (X > posY) { + ao1 += 8 * lda; + ao2 += 8 * lda; + ao3 += 8 * lda; + ao4 += 8 * lda; + ao5 += 8 * lda; + ao6 += 8 * lda; + ao7 += 8 * lda; + ao8 += 8 * lda; + + b += 128; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); +#endif + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + b[ 4] = *(ao3 + 0); + b[ 5] = *(ao3 + 1); + b[ 6] = *(ao4 + 0); + b[ 7] = *(ao4 + 1); + + b[ 8] = *(ao5 + 0); + b[ 9] = *(ao5 + 1); + b[ 10] = *(ao6 + 0); + b[ 11] = *(ao6 + 1); + b[ 12] = *(ao7 + 0); + b[ 13] = *(ao7 + 1); + b[ 14] = *(ao8 + 0); + b[ 15] = *(ao8 + 1); + + b[ 16] = ZERO; + b[ 17] = ZERO; +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(ao2 + 2); + b[ 19] = *(ao2 + 3); +#endif + b[ 20] = *(ao3 + 2); + b[ 21] = *(ao3 + 3); + b[ 22] = *(ao4 + 2); + b[ 23] = *(ao4 + 3); + b[ 24] = *(ao5 + 2); + b[ 25] = *(ao5 + 3); + b[ 26] = *(ao6 + 2); + b[ 27] = *(ao6 + 3); + b[ 28] = *(ao7 + 2); + b[ 29] = *(ao7 + 3); + b[ 30] = *(ao8 + 2); + b[ 31] = *(ao8 + 3); + + b[ 32] = ZERO; + b[ 33] = ZERO; + b[ 34] = ZERO; + b[ 35] = ZERO; +#ifdef UNIT + b[ 36] = ONE; + b[ 37] = ZERO; +#else + b[ 36] = *(ao3 + 4); + b[ 37] = *(ao3 + 5); +#endif + b[ 38] = *(ao4 + 4); + b[ 39] = *(ao4 + 5); + b[ 40] = *(ao5 + 4); + b[ 41] = *(ao5 + 5); + b[ 42] = *(ao6 + 4); + b[ 43] = *(ao6 + 5); + b[ 44] = *(ao7 + 4); + b[ 45] = *(ao7 + 5); + b[ 46] = *(ao8 + 4); + b[ 47] = *(ao8 + 5); + + b[ 48] = ZERO; + b[ 49] = ZERO; + b[ 50] = ZERO; + b[ 51] = ZERO; + b[ 52] = ZERO; + b[ 53] = ZERO; +#ifdef UNIT + b[ 54] = ONE; + b[ 55] = ZERO; +#else + b[ 54] = *(ao4 + 6); + b[ 55] = *(ao4 + 7); +#endif + b[ 56] = *(ao5 + 6); + b[ 57] = *(ao5 + 7); + b[ 58] = *(ao6 + 6); + b[ 59] = *(ao6 + 7); + b[ 60] = *(ao7 + 6); + b[ 61] = *(ao7 + 7); + b[ 62] = *(ao8 + 6); + b[ 63] = *(ao8 + 7); + + b[ 64] = ZERO; + b[ 65] = ZERO; + b[ 66] = ZERO; + b[ 67] = ZERO; + b[ 68] = ZERO; + b[ 69] = ZERO; + b[ 70] = ZERO; + b[ 71] = ZERO; +#ifdef UNIT + b[ 72] = ONE; + b[ 73] = ZERO; +#else + b[ 72] = *(ao5 + 8); + b[ 73] = *(ao5 + 9); +#endif + b[ 74] = *(ao6 + 8); + b[ 75] = *(ao6 + 9); + b[ 76] = *(ao7 + 8); + b[ 77] = *(ao7 + 9); + b[ 78] = *(ao8 + 8); + b[ 79] = *(ao8 + 9); + + b[ 80] = ZERO; + b[ 81] = ZERO; + b[ 82] = ZERO; + b[ 83] = ZERO; + b[ 84] = ZERO; + b[ 85] = ZERO; + b[ 86] = ZERO; + b[ 87] = ZERO; + b[ 88] = ZERO; + b[ 89] = ZERO; +#ifdef UNIT + b[ 90] = ONE; + b[ 91] = ZERO; +#else + b[ 90] = *(ao6 + 10); + b[ 91] = *(ao6 + 11); +#endif + b[ 92] = *(ao7 + 10); + b[ 93] = *(ao7 + 11); + b[ 94] = *(ao8 + 10); + b[ 95] = *(ao8 + 11); + + b[ 96] = ZERO; + b[ 97] = ZERO; + b[ 98] = ZERO; + b[ 99] = ZERO; + b[100] = ZERO; + b[101] = ZERO; + b[102] = ZERO; + b[103] = ZERO; + b[104] = ZERO; + b[105] = ZERO; + b[106] = ZERO; + b[107] = ZERO; +#ifdef UNIT + b[108] = ONE; + b[109] = ZERO; +#else + b[108] = *(ao7 + 12); + b[109] = *(ao7 + 13); +#endif + b[110] = *(ao8 + 12); + b[111] = *(ao8 + 13); + + b[112] = ZERO; + b[113] = ZERO; + b[114] = ZERO; + b[115] = ZERO; + b[116] = ZERO; + b[117] = ZERO; + b[118] = ZERO; + b[119] = ZERO; + b[120] = ZERO; + b[121] = ZERO; + b[122] = ZERO; + b[123] = ZERO; + b[124] = ZERO; + b[125] = ZERO; +#ifdef UNIT + b[126] = ONE; + b[127] = ZERO; +#else + b[126] = *(ao8 + 14); + b[127] = *(ao8 + 15); +#endif + + ao1 += 8 * lda; + ao2 += 8 * lda; + ao3 += 8 * lda; + ao4 += 8 * lda; + ao5 += 8 * lda; + ao6 += 8 * lda; + ao7 += 8 * lda; + ao8 += 8 * lda; + b += 128; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X < posY) { + for (ii = 0; ii < i; ii++){ + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + b[ 4] = *(ao3 + 0); + b[ 5] = *(ao3 + 1); + b[ 6] = *(ao4 + 0); + b[ 7] = *(ao4 + 1); + + b[ 8] = *(ao5 + 0); + b[ 9] = *(ao5 + 1); + b[ 10] = *(ao6 + 0); + b[ 11] = *(ao6 + 1); + b[ 12] = *(ao7 + 0); + b[ 13] = *(ao7 + 1); + b[ 14] = *(ao8 + 0); + b[ 15] = *(ao8 + 1); + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + ao5 += 2; + ao6 += 2; + ao7 += 2; + ao8 += 2; + b += 16; + } + } else + if (X > posY) { + ao1 += i * lda; + ao2 += i * lda; + ao3 += i * lda; + ao4 += i * lda; + ao5 += i * lda; + ao6 += i * lda; + ao7 += i * lda; + ao8 += i * lda; + b += 16 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); +#endif + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + b[ 4] = *(ao3 + 0); + b[ 5] = *(ao3 + 1); + b[ 6] = *(ao4 + 0); + b[ 7] = *(ao4 + 1); + b[ 8] = *(ao5 + 0); + b[ 9] = *(ao5 + 1); + b[10] = *(ao6 + 0); + b[11] = *(ao6 + 1); + b[12] = *(ao7 + 0); + b[13] = *(ao7 + 1); + b[14] = *(ao8 + 0); + b[15] = *(ao8 + 1); + b += 16; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(ao2 + 2); + b[ 3] = *(ao2 + 3); +#endif + b[ 4] = *(ao3 + 2); + b[ 5] = *(ao3 + 3); + b[ 6] = *(ao4 + 2); + b[ 7] = *(ao4 + 3); + b[ 8] = *(ao5 + 2); + b[ 9] = *(ao5 + 3); + b[10] = *(ao6 + 2); + b[11] = *(ao6 + 3); + b[12] = *(ao7 + 2); + b[13] = *(ao7 + 3); + b[14] = *(ao8 + 2); + b[15] = *(ao8 + 3); + b += 16; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(ao3 + 4); + b[ 5] = *(ao3 + 5); +#endif + b[ 6] = *(ao4 + 4); + b[ 7] = *(ao4 + 5); + b[ 8] = *(ao5 + 4); + b[ 9] = *(ao5 + 5); + b[10] = *(ao6 + 4); + b[11] = *(ao6 + 5); + b[12] = *(ao7 + 4); + b[13] = *(ao7 + 5); + b[14] = *(ao8 + 4); + b[15] = *(ao8 + 5); + b += 16; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(ao4 + 6); + b[ 7] = *(ao4 + 7); +#endif + b[ 8] = *(ao5 + 6); + b[ 9] = *(ao5 + 7); + b[10] = *(ao6 + 6); + b[11] = *(ao6 + 7); + b[12] = *(ao7 + 6); + b[13] = *(ao7 + 7); + b[14] = *(ao8 + 6); + b[15] = *(ao8 + 7); + b += 16; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(ao5 + 8); + b[ 9] = *(ao5 + 9); +#endif + b[10] = *(ao6 + 8); + b[11] = *(ao6 + 9); + b[12] = *(ao7 + 8); + b[13] = *(ao7 + 9); + b[14] = *(ao8 + 8); + b[15] = *(ao8 + 9); + b += 16; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(ao6 + 10); + b[11] = *(ao6 + 11); +#endif + b[12] = *(ao7 + 10); + b[13] = *(ao7 + 11); + b[14] = *(ao8 + 10); + b[15] = *(ao8 + 11); + b += 16; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(ao7 + 12); + b[13] = *(ao7 + 13); +#endif + b[14] = *(ao8 + 12); + b[15] = *(ao8 + 13); + b += 16; + } + } + } + + posY += 8; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + ao3 = a + posX * 2 + (posY + 2) * lda; + ao4 = a + posX * 2 + (posY + 3) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + ao3 = a + posY * 2 + (posX + 2) * lda; + ao4 = a + posY * 2 + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + for (ii = 0; ii < 4; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + b[ 4] = *(ao3 + 0); + b[ 5] = *(ao3 + 1); + b[ 6] = *(ao4 + 0); + b[ 7] = *(ao4 + 1); + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + } else + if (X > posY) { + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 32; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); +#endif + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + b[ 4] = *(ao3 + 0); + b[ 5] = *(ao3 + 1); + b[ 6] = *(ao4 + 0); + b[ 7] = *(ao4 + 1); + + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[ 10] = ONE; + b[ 11] = ZERO; +#else + b[ 10] = *(ao2 + 2); + b[ 11] = *(ao2 + 3); +#endif + b[ 12] = *(ao3 + 2); + b[ 13] = *(ao3 + 3); + b[ 14] = *(ao4 + 2); + b[ 15] = *(ao4 + 3); + + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(ao3 + 4); + b[ 21] = *(ao3 + 5); +#endif + b[ 22] = *(ao4 + 4); + b[ 23] = *(ao4 + 5); + + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; +#ifdef UNIT + b[ 30] = ONE; + b[ 31] = ZERO; +#else + b[ 30] = *(ao4 + 6); + b[ 31] = *(ao4 + 7); +#endif + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + + for (ii = 0; ii < i; ii++){ + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + b[ 4] = *(ao3 + 0); + b[ 5] = *(ao3 + 1); + b[ 6] = *(ao4 + 0); + b[ 7] = *(ao4 + 1); + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + } else + if (X > posY) { + ao1 += i * lda; + ao2 += i * lda; + ao3 += i * lda; + ao4 += i * lda; + b += 8 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); +#endif + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + b[ 4] = *(ao3 + 0); + b[ 5] = *(ao3 + 1); + b[ 6] = *(ao4 + 0); + b[ 7] = *(ao4 + 1); + b += 8; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(ao2 + 2); + b[ 3] = *(ao2 + 3); +#endif + b[ 4] = *(ao3 + 2); + b[ 5] = *(ao3 + 3); + b[ 6] = *(ao4 + 2); + b[ 7] = *(ao4 + 3); + b += 8; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(ao3 + 4); + b[ 5] = *(ao3 + 5); +#endif + b[ 6] = *(ao4 + 4); + b[ 7] = *(ao4 + 5); + b += 8; + } + } + } + + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + b[ 4] = *(ao1 + 2); + b[ 5] = *(ao1 + 3); + b[ 6] = *(ao2 + 2); + b[ 7] = *(ao2 + 3); + + ao1 += 4; + ao2 += 4; + b += 8; + } else + if (X > posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); +#endif + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(ao2 + 2); + b[ 7] = *(ao2 + 3); +#endif + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X < posY) { + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + ao1 += 2; + ao2 += 2; + b += 4; + } else + if (X > posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); +#else + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + ao1 += 2; + b += 2; + } else + if (X > posY) { + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); +#endif + ao1 += lda; + b += 2; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/ztrmm_utcopy_1.c b/kernel/generic/ztrmm_utcopy_1.c new file mode 100644 index 0000000..d4406c9 --- /dev/null +++ b/kernel/generic/ztrmm_utcopy_1.c @@ -0,0 +1,103 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02; + FLOAT *ao1; + + lda += lda; + + js = n; + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + } + + i = m; + if (i > 0) { + do { + if (X < posY) { + ao1 += 2; + b += 2; + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + b[ 0] = data01; + b[ 1] = data02; + ao1 += lda; + b += 2; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + b[ 0] = data01; + b[ 1] = data02; +#endif + ao1 += lda; + b += 2; + } + + X ++; + i --; + } while (i > 0); + } + + posY ++; + js --; + } while (js > 0); + } /* End of main loop */ + + return 0; +} diff --git a/kernel/generic/ztrmm_utcopy_2.c b/kernel/generic/ztrmm_utcopy_2.c new file mode 100644 index 0000000..c71a55c --- /dev/null +++ b/kernel/generic/ztrmm_utcopy_2.c @@ -0,0 +1,239 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data1, data2, data3, data4, data5, data6, data7, data8; + + FLOAT *ao1, *ao2; + + lda += lda; + + js = (n >> 1); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + ao1 += 4; + ao2 += 4; + b += 8; + + } else + if (X > posY) { + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + data3 = *(ao1 + 2); + data4 = *(ao1 + 3); + + data5 = *(ao2 + 0); + data6 = *(ao2 + 1); + data7 = *(ao2 + 2); + data8 = *(ao2 + 3); + + b[ 0] = data1; + b[ 1] = data2; + b[ 2] = data3; + b[ 3] = data4; + b[ 4] = data5; + b[ 5] = data6; + b[ 6] = data7; + b[ 7] = data8; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + + } else { +#ifdef UNIT + data5 = *(ao2 + 0); + data6 = *(ao2 + 1); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data5; + b[ 5] = data6; + b[ 6] = ONE; + b[ 7] = ZERO; +#else + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + + data5 = *(ao2 + 0); + data6 = *(ao2 + 1); + data7 = *(ao2 + 2); + data8 = *(ao2 + 3); + + b[ 0] = data1; + b[ 1] = data2; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data5; + b[ 5] = data6; + b[ 6] = data7; + b[ 7] = data8; +#endif + + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + if (X < posY) { + ao1 += 2; + ao2 += 2; + b += 4; + } else + if (X > posY) { + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + data3 = *(ao1 + 2); + data4 = *(ao1 + 3); + + b[ 0] = data1; + b[ 1] = data2; + b[ 2] = data3; + b[ 3] = data4; + + ao1 += lda; + b += 4; + + } else { +#ifdef UNIT + data5 = *(ao2 + 0); + data6 = *(ao2 + 1); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data5; + b[ 3] = data6; +#else + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + data5 = *(ao2 + 0); + data6 = *(ao2 + 1); + + b[ 0] = data1; + b[ 1] = data2; + b[ 2] = data5; + b[ 3] = data6; +#endif + b += 4; + } + } + + posY += 2; + js --; + } while (js > 0); + } /* End of main loop */ + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + ao1 += 2; + b += 2; + } else + if (X > posY) { + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + + b[ 0] = data1; + b[ 1] = data2; + + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + + b[ 0] = data1; + b[ 1] = data2; +#endif + ao1 += lda; + b += 2; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/ztrmm_utcopy_4.c b/kernel/generic/ztrmm_utcopy_4.c new file mode 100644 index 0000000..cda62bc --- /dev/null +++ b/kernel/generic/ztrmm_utcopy_4.c @@ -0,0 +1,663 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + FLOAT *ao1, *ao2, *ao3, *ao4; + + lda += lda; + + js = (n >> 2); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + ao3 = a + posX * 2 + (posY + 2) * lda; + ao4 = a + posX * 2 + (posY + 3) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + ao3 = a + posY * 2 + (posX + 2) * lda; + ao4 = a + posY * 2 + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + ao1 += 8; + ao2 += 8; + ao3 += 8; + ao4 += 8; + b += 32; + + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 32; + + } else { + +#ifdef UNIT + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = ONE; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = ONE; + b[21] = ZERO; + b[22] = ZERO; + b[23] = ZERO; + + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = ONE; + b[31] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = ZERO; + b[23] = ZERO; + + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; +#endif + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + + if (m & 2) { + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } + + if (m & 1) { + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + } else + if (X > posY) { + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 16; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + ao1 += lda; + b += 8; + } + + } else { + +#ifdef UNIT + if (i >= 2) { + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + } + + if (i >= 3) { + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + } + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + + if (i >= 2) { + b[ 0] = data09; + b[ 1] = data10; + b[ 2] = ONE; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 3) { + b[ 0] = data17; + b[ 1] = data18; + b[ 2] = data19; + b[ 3] = data20; + b[ 4] = ONE; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (i >= 2) { + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + } + + if (i >= 3) { + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + } + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + + if (i >= 2) { + b[ 0] = data09; + b[ 1] = data10; + b[ 2] = data11; + b[ 3] = data12; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 3) { + b[ 0] = data17; + b[ 1] = data18; + b[ 2] = data19; + b[ 3] = data20; + b[ 4] = data21; + b[ 5] = data22; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + ao1 += 4; + ao2 += 4; + b += 8; + + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data09; + b[ 5] = data10; + b[ 6] = data11; + b[ 7] = data12; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + + } else { +#ifdef UNIT + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data09; + b[ 5] = data10; + b[ 6] = ONE; + b[ 7] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data09; + b[ 5] = data10; + b[ 6] = data11; + b[ 7] = data12; +#endif + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X < posY) { + b += 4; + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ZERO; +#endif + b += 4; + } + } + + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + + if (X < posY) { + b += 2; + ao1 += 2; + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + b[ 0] = data01; + b[ 1] = data02; + + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; +#endif + b += 2; + ao1 += lda; + } + + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/ztrmm_utcopy_8.c b/kernel/generic/ztrmm_utcopy_8.c new file mode 100644 index 0000000..08dd80c --- /dev/null +++ b/kernel/generic/ztrmm_utcopy_8.c @@ -0,0 +1,880 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, ii; + BLASLONG X; + + FLOAT *a01, *a02, *a03 ,*a04, *a05, *a06, *a07, *a08; + + lda *= 2; + + js = (n >> 3); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X < posY) { + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + b += 128; + } else + if (X > posY) { + + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + b += 16; + } + + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(a02 + 0); + b[ 17] = *(a02 + 1); +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a02 + 2); + b[ 19] = *(a02 + 3); +#endif + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + + b[ 32] = *(a03 + 0); + b[ 33] = *(a03 + 1); + b[ 34] = *(a03 + 2); + b[ 35] = *(a03 + 3); +#ifdef UNIT + b[ 36] = ONE; + b[ 37] = ZERO; +#else + b[ 36] = *(a03 + 4); + b[ 37] = *(a03 + 5); +#endif + b[ 38] = ZERO; + b[ 39] = ZERO; + b[ 40] = ZERO; + b[ 41] = ZERO; + b[ 42] = ZERO; + b[ 43] = ZERO; + b[ 44] = ZERO; + b[ 45] = ZERO; + b[ 46] = ZERO; + b[ 47] = ZERO; + + b[ 48] = *(a04 + 0); + b[ 49] = *(a04 + 1); + b[ 50] = *(a04 + 2); + b[ 51] = *(a04 + 3); + b[ 52] = *(a04 + 4); + b[ 53] = *(a04 + 5); +#ifdef UNIT + b[ 54] = ONE; + b[ 55] = ZERO; +#else + b[ 54] = *(a04 + 6); + b[ 55] = *(a04 + 7); +#endif + b[ 56] = ZERO; + b[ 57] = ZERO; + b[ 58] = ZERO; + b[ 59] = ZERO; + b[ 60] = ZERO; + b[ 61] = ZERO; + b[ 62] = ZERO; + b[ 63] = ZERO; + + b[ 64] = *(a05 + 0); + b[ 65] = *(a05 + 1); + b[ 66] = *(a05 + 2); + b[ 67] = *(a05 + 3); + b[ 68] = *(a05 + 4); + b[ 69] = *(a05 + 5); + b[ 70] = *(a05 + 6); + b[ 71] = *(a05 + 7); +#ifdef UNIT + b[ 72] = ONE; + b[ 73] = ZERO; +#else + b[ 72] = *(a05 + 8); + b[ 73] = *(a05 + 9); +#endif + b[ 74] = ZERO; + b[ 75] = ZERO; + b[ 76] = ZERO; + b[ 77] = ZERO; + b[ 78] = ZERO; + b[ 79] = ZERO; + + b[ 80] = *(a06 + 0); + b[ 81] = *(a06 + 1); + b[ 82] = *(a06 + 2); + b[ 83] = *(a06 + 3); + b[ 84] = *(a06 + 4); + b[ 85] = *(a06 + 5); + b[ 86] = *(a06 + 6); + b[ 87] = *(a06 + 7); + b[ 88] = *(a06 + 8); + b[ 89] = *(a06 + 9); +#ifdef UNIT + b[ 90] = ONE; + b[ 91] = ZERO; +#else + b[ 90] = *(a06 + 10); + b[ 91] = *(a06 + 11); +#endif + b[ 92] = ZERO; + b[ 93] = ZERO; + b[ 94] = ZERO; + b[ 95] = ZERO; + + b[ 96] = *(a07 + 0); + b[ 97] = *(a07 + 1); + b[ 98] = *(a07 + 2); + b[ 99] = *(a07 + 3); + b[100] = *(a07 + 4); + b[101] = *(a07 + 5); + b[102] = *(a07 + 6); + b[103] = *(a07 + 7); + b[104] = *(a07 + 8); + b[105] = *(a07 + 9); + b[106] = *(a07 + 10); + b[107] = *(a07 + 11); +#ifdef UNIT + b[108] = ONE; + b[109] = ZERO; +#else + b[108] = *(a07 + 12); + b[109] = *(a07 + 13); +#endif + b[110] = ZERO; + b[111] = ZERO; + + b[112] = *(a08 + 0); + b[113] = *(a08 + 1); + b[114] = *(a08 + 2); + b[115] = *(a08 + 3); + b[116] = *(a08 + 4); + b[117] = *(a08 + 5); + b[118] = *(a08 + 6); + b[119] = *(a08 + 7); + b[120] = *(a08 + 8); + b[121] = *(a08 + 9); + b[122] = *(a08 + 10); + b[123] = *(a08 + 11); + b[124] = *(a08 + 12); + b[125] = *(a08 + 13); +#ifdef UNIT + b[126] = ONE; + b[127] = ZERO; +#else + b[126] = *(a08 + 14); + b[127] = *(a08 + 15); +#endif + a01 += 8 * lda; + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + b += 128; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X < posY) { + + a01 += 2 * i; + a02 += 2 * i; + a03 += 2 * i; + a04 += 2 * i; + a05 += 2 * i; + a06 += 2 * i; + a07 += 2 * i; + a08 += 2 * i; + b += 16 * i; + } else + if (X > posY) { + + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + a05 += lda; + a06 += lda; + a07 += lda; + a08 += lda; + b += 16; + } + } else { + +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + + if(i >= 2) { + b[ 0] = *(a02 + 0); + b[ 1] = *(a02 + 1); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 3) { + b[ 0] = *(a03 + 0); + b[ 1] = *(a03 + 1); + b[ 2] = *(a03 + 2); + b[ 3] = *(a03 + 3); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 4) { + b[ 0] = *(a04 + 0); + b[ 1] = *(a04 + 1); + b[ 2] = *(a04 + 2); + b[ 3] = *(a04 + 3); + b[ 4] = *(a04 + 4); + b[ 5] = *(a04 + 5); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 5) { + b[ 0] = *(a05 + 0); + b[ 1] = *(a05 + 1); + b[ 2] = *(a05 + 2); + b[ 3] = *(a05 + 3); + b[ 4] = *(a05 + 4); + b[ 5] = *(a05 + 5); + b[ 6] = *(a05 + 6); + b[ 7] = *(a05 + 7); +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 6) { + b[ 0] = *(a06 + 0); + b[ 1] = *(a06 + 1); + b[ 2] = *(a06 + 2); + b[ 3] = *(a06 + 3); + b[ 4] = *(a06 + 4); + b[ 5] = *(a06 + 5); + b[ 6] = *(a06 + 6); + b[ 7] = *(a06 + 7); + b[ 8] = *(a06 + 8); + b[ 9] = *(a06 + 9); +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 7) { + b[ 0] = *(a07 + 0); + b[ 1] = *(a07 + 1); + b[ 2] = *(a07 + 2); + b[ 3] = *(a07 + 3); + b[ 4] = *(a07 + 4); + b[ 5] = *(a07 + 5); + b[ 6] = *(a07 + 6); + b[ 7] = *(a07 + 7); + b[ 8] = *(a07 + 8); + b[ 9] = *(a07 + 9); + b[10] = *(a07 + 10); + b[11] = *(a07 + 11); +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + } + } + + posY += 8; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + b += 32; + } else + if (X > posY) { + + for (ii = 0; ii < 4; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + b += 8; + } + + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = *(a02 + 0); + b[ 9] = *(a02 + 1); +#ifdef UNIT + b[ 10] = ONE; + b[ 11] = ZERO; +#else + b[ 10] = *(a02 + 2); + b[ 11] = *(a02 + 3); +#endif + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(a03 + 0); + b[ 17] = *(a03 + 1); + b[ 18] = *(a03 + 2); + b[ 19] = *(a03 + 3); +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a03 + 4); + b[ 21] = *(a03 + 5); +#endif + b[ 22] = ZERO; + b[ 23] = ZERO; + + b[ 24] = *(a04 + 0); + b[ 25] = *(a04 + 1); + b[ 26] = *(a04 + 2); + b[ 27] = *(a04 + 3); + b[ 28] = *(a04 + 4); + b[ 29] = *(a04 + 5); +#ifdef UNIT + b[ 30] = ONE; + b[ 31] = ZERO; +#else + b[ 30] = *(a04 + 6); + b[ 31] = *(a04 + 7); +#endif + + a01 += 4 * lda; + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + a01 += 2 * i; + a02 += 2 * i; + a03 += 2 * i; + a04 += 2 * i; + b += 8 * i; + } else + if (X > posY) { + + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + b += 8; + } + } else { + +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + + if(i >= 2) { + b[ 0] = *(a02 + 0); + b[ 1] = *(a02 + 1); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 3) { + b[ 0] = *(a03 + 0); + b[ 1] = *(a03 + 1); + b[ 2] = *(a03 + 2); + b[ 3] = *(a03 + 3); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + } + } + + posY += 4; + } + + + if (n & 2){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + a01 += 4; + a02 += 4; + b += 8; + } else + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a02 + 0); + b[ 5] = *(a02 + 1); + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); + + a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } else { + +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = *(a02 + 0); + b[ 5] = *(a02 + 1); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); +#endif + + a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X < posY) { + b += 4; + } else + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b += 4; + } + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b += 4; + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + a01 += 2; + b += 2; + } else + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + a01 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + a01 += lda; + b += 2; + } + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_lncopy_1.c b/kernel/generic/ztrsm_lncopy_1.c new file mode 100644 index 0000000..ec8ffbc --- /dev/null +++ b/kernel/generic/ztrsm_lncopy_1.c @@ -0,0 +1,91 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02; + FLOAT *a1; + + lda *= 2; + + jj = offset; + + j = n; + while (j > 0){ + + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b + 0, data01, data02); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + } + + a1 += 2; + b += 2; + + i --; + ii ++; + } + + a += lda; + jj ++; + j --; + } + + return 0; +} diff --git a/kernel/generic/ztrsm_lncopy_2.c b/kernel/generic/ztrsm_lncopy_2.c new file mode 100644 index 0000000..967b60c --- /dev/null +++ b/kernel/generic/ztrsm_lncopy_2.c @@ -0,0 +1,171 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04; + FLOAT data05, data06, data07, data08; + FLOAT *a1, *a2; + + lda *= 2; + + jj = offset; + + j = (n >> 1); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data07 = *(a2 + 2); + data08 = *(a2 + 3); +#endif + + compinv(b + 0, data01, data02); + *(b + 4) = data03; + *(b + 5) = data04; + compinv(b + 6, data07, data08); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data05; + *(b + 3) = data06; + *(b + 4) = data03; + *(b + 5) = data04; + *(b + 6) = data07; + *(b + 7) = data08; + } + + a1 += 4; + a2 += 4; + b += 8; + + i --; + ii += 2; + } + + if (m & 1) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b + 0, data01, data02); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data05 = *(a2 + 0); + data06 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data05; + *(b + 3) = data06; + } + b += 4; + } + + a += 2 * lda; + jj += 2; + j --; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + + compinv(b + 0, data01, data02); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + *(b + 0) = data01; + *(b + 1) = data02; + } + + a1+= 2; + b += 2; + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_lncopy_4.c b/kernel/generic/ztrsm_lncopy_4.c new file mode 100644 index 0000000..e4a3fb9 --- /dev/null +++ b/kernel/generic/ztrsm_lncopy_4.c @@ -0,0 +1,459 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04; + FLOAT data05, data06, data07, data08; + FLOAT data09, data10, data11, data12; + FLOAT data13, data14, data15, data16; + FLOAT data17, data18, data19, data20; + FLOAT data21, data22, data23, data24; + FLOAT data25, data26, data27, data28; + FLOAT data29, data30, data31, data32; + + FLOAT *a1, *a2, *a3, *a4; + + lda *= 2; + + jj = offset; + + j = (n >> 2); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + ii = 0; + + i = (m >> 2); + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + +#ifndef UNIT + data11 = *(a2 + 2); + data12 = *(a2 + 3); +#endif + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + +#ifndef UNIT + data21 = *(a3 + 4); + data22 = *(a3 + 5); +#endif + data23 = *(a3 + 6); + data24 = *(a3 + 7); + +#ifndef UNIT + data31 = *(a4 + 6); + data32 = *(a4 + 7); +#endif + + compinv(b + 0, data01, data02); + + *(b + 8) = data03; + *(b + 9) = data04; + compinv(b + 10, data11, data12); + + *(b + 16) = data05; + *(b + 17) = data06; + *(b + 18) = data13; + *(b + 19) = data14; + compinv(b + 20, data21, data22); + + *(b + 24) = data07; + *(b + 25) = data08; + *(b + 26) = data15; + *(b + 27) = data16; + *(b + 28) = data23; + *(b + 29) = data24; + compinv(b + 30, data31, data32); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + data23 = *(a3 + 6); + data24 = *(a3 + 7); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); + data31 = *(a4 + 6); + data32 = *(a4 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data09; + *(b + 3) = data10; + *(b + 4) = data17; + *(b + 5) = data18; + *(b + 6) = data25; + *(b + 7) = data26; + + *(b + 8) = data03; + *(b + 9) = data04; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data19; + *(b + 13) = data20; + *(b + 14) = data27; + *(b + 15) = data28; + + *(b + 16) = data05; + *(b + 17) = data06; + *(b + 18) = data13; + *(b + 19) = data14; + *(b + 20) = data21; + *(b + 21) = data22; + *(b + 22) = data29; + *(b + 23) = data30; + + *(b + 24) = data07; + *(b + 25) = data08; + *(b + 26) = data15; + *(b + 27) = data16; + *(b + 28) = data23; + *(b + 29) = data24; + *(b + 30) = data31; + *(b + 31) = data32; + } + + a1 += 8; + a2 += 8; + a3 += 8; + a4 += 8; + b += 32; + + i --; + ii += 4; + } + + if (m & 2) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data11 = *(a2 + 2); + data12 = *(a2 + 3); +#endif + + compinv(b + 0, data01, data02); + + *(b + 4) = data03; + *(b + 5) = data04; + compinv(b + 6, data11, data12); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data09; + *(b + 3) = data10; + *(b + 4) = data17; + *(b + 5) = data18; + *(b + 6) = data25; + *(b + 7) = data26; + + *(b + 8) = data03; + *(b + 9) = data04; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data19; + *(b + 13) = data20; + *(b + 14) = data27; + *(b + 15) = data28; + } + + a1 += 4; + a2 += 4; + a3 += 4; + a4 += 4; + b += 16; + + ii += 2; + } + + if (m & 1) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b + 0, data01, data02); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data09; + *(b + 3) = data10; + *(b + 4) = data17; + *(b + 5) = data18; + *(b + 6) = data25; + *(b + 7) = data26; + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + + ii += 1; + } + a += 4 * lda; + jj += 4; + j --; + } + + + if (n & 2) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + ii = 0; + + i = (m >> 1); + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data11 = *(a2 + 2); + data12 = *(a2 + 3); +#endif + + compinv(b + 0, data01, data02); + + *(b + 4) = data03; + *(b + 5) = data04; + compinv(b + 6, data11, data12); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data09; + *(b + 3) = data10; + + *(b + 4) = data03; + *(b + 5) = data04; + *(b + 6) = data11; + *(b + 7) = data12; + } + + a1 += 4; + a2 += 4; + b += 8; + + i --; + ii += 2; + } + + if (m & 1) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b + 0, data01, data02); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data09; + *(b + 3) = data10; + } + + a1 += 2; + a2 += 2; + b += 4; + + ii += 1; + } + a += 2 * lda; + jj += 2; + } + + if (n & 1) { + + a1 = a + 0 * lda; + + ii = 0; + + i = m; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b + 0, data01, data02); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + *(b + 0) = data01; + *(b + 1) = data02; + } + + a1 += 2; + b += 2; + + i --; + ii += 1; + } + + a += lda; + jj += 1; + } + + return 0; +} diff --git a/kernel/generic/ztrsm_lncopy_8.c b/kernel/generic/ztrsm_lncopy_8.c new file mode 100644 index 0000000..0176f91 --- /dev/null +++ b/kernel/generic/ztrsm_lncopy_8.c @@ -0,0 +1,225 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; + FLOAT data1, data2; + + lda *= 2; + jj = offset; + + j = (n >> 3); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + + a += 8 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 8) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + *(b + 8) = *(a5 + 0); + *(b + 9) = *(a5 + 1); + *(b + 10) = *(a6 + 0); + *(b + 11) = *(a6 + 1); + *(b + 12) = *(a7 + 0); + *(b + 13) = *(a7 + 1); + *(b + 14) = *(a8 + 0); + *(b + 15) = *(a8 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + a5 += 2; + a6 += 2; + a7 += 2; + a8 += 2; + b += 16; + ii ++; + } + + jj += 8; + j --; + } + + if (n & 4) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a += 4 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 4) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + ii ++; + } + + jj += 4; + } + + if (n & 2) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a += 2 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 2) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + } + + a1 += 2; + a2 += 2; + b += 4; + ii ++; + } + + jj += 2; + } + + if (n & 1) { + + a1 = a + 0 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 1) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + } + + a1 += 2; + b += 2; + ii ++; + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_ltcopy_1.c b/kernel/generic/ztrsm_ltcopy_1.c new file mode 100644 index 0000000..ef49532 --- /dev/null +++ b/kernel/generic/ztrsm_ltcopy_1.c @@ -0,0 +1,91 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02; + FLOAT *a1; + + lda *= 2; + + jj = offset; + + j = n; + while (j > 0){ + + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b + 0, data01, data02); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + } + + a1 += lda; + b += 2; + + i --; + ii ++; + } + + a += 2; + jj ++; + j --; + } + + return 0; +} diff --git a/kernel/generic/ztrsm_ltcopy_2.c b/kernel/generic/ztrsm_ltcopy_2.c new file mode 100644 index 0000000..bcc2bbc --- /dev/null +++ b/kernel/generic/ztrsm_ltcopy_2.c @@ -0,0 +1,177 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04; + FLOAT data05, data06, data07, data08; + FLOAT *a1, *a2; + + lda *= 2; + + jj = offset; + + j = (n >> 1); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data07 = *(a2 + 2); + data08 = *(a2 + 3); +#endif + + compinv(b + 0, data01, data02); + *(b + 2) = data03; + *(b + 3) = data04; + compinv(b + 6, data07, data08); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 8; + + i --; + ii += 2; + } + + if (m & 1) { + + if (ii== jj) { + +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + compinv(b + 0, data01, data02); + *(b + 2) = data03; + *(b + 3) = data04; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + + a += 4; + jj += 2; + j --; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b + 0, data01, data02); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + *(b + 0) = data01; + *(b + 1) = data02; + } + + a1 += 1 * lda; + b += 2; + + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_ltcopy_4.c b/kernel/generic/ztrsm_ltcopy_4.c new file mode 100644 index 0000000..8c4e66b --- /dev/null +++ b/kernel/generic/ztrsm_ltcopy_4.c @@ -0,0 +1,479 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04; + FLOAT data05, data06, data07, data08; + FLOAT data09, data10, data11, data12; + FLOAT data13, data14, data15, data16; + FLOAT data17, data18, data19, data20; + FLOAT data21, data22, data23, data24; + FLOAT data25, data26, data27, data28; + FLOAT data29, data30, data31, data32; + + FLOAT *a1, *a2, *a3, *a4; + + lda *= 2; + + jj = offset; + + j = (n >> 2); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + ii = 0; + + i = (m >> 2); + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + +#ifndef UNIT + data11 = *(a2 + 2); + data12 = *(a2 + 3); +#endif + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + +#ifndef UNIT + data21 = *(a3 + 4); + data22 = *(a3 + 5); +#endif + data23 = *(a3 + 6); + data24 = *(a3 + 7); + +#ifndef UNIT + data31 = *(a4 + 6); + data32 = *(a4 + 7); +#endif + + compinv(b + 0, data01, data02); + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + + compinv(b + 10, data11, data12); + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + + compinv(b + 20, data21, data22); + *(b + 22) = data23; + *(b + 23) = data24; + + compinv(b + 30, data31, data32); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + data23 = *(a3 + 6); + data24 = *(a3 + 7); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); + data31 = *(a4 + 6); + data32 = *(a4 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + + *(b + 16) = data17; + *(b + 17) = data18; + *(b + 18) = data19; + *(b + 19) = data20; + *(b + 20) = data21; + *(b + 21) = data22; + *(b + 22) = data23; + *(b + 23) = data24; + + *(b + 24) = data25; + *(b + 25) = data26; + *(b + 26) = data27; + *(b + 27) = data28; + *(b + 28) = data29; + *(b + 29) = data30; + *(b + 30) = data31; + *(b + 31) = data32; + } + + a1 += 4 * lda; + a2 += 4 * lda; + a3 += 4 * lda; + a4 += 4 * lda; + b += 32; + + i --; + ii += 4; + } + + if (m & 2) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + +#ifndef UNIT + data11 = *(a2 + 2); + data12 = *(a2 + 3); +#endif + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + compinv(b + 0, data01, data02); + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + + compinv(b + 10, data11, data12); + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 16; + + ii += 2; + } + + if (m & 1) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + compinv(b + 0, data01, data02); + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + + a1 += lda; + b += 8; + ii += 1; + } + + a += 8; + jj += 4; + j --; + } + + if (n & 2) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + ii = 0; + + i = (m >> 1); + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data11 = *(a2 + 2); + data12 = *(a2 + 3); +#endif + + compinv(b + 0, data01, data02); + *(b + 2) = data03; + *(b + 3) = data04; + compinv(b + 6, data11, data12); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data09; + *(b + 5) = data10; + *(b + 6) = data11; + *(b + 7) = data12; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 8; + + i --; + ii += 2; + } + + if (m & 1) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + compinv(b + 0, data01, data02); + *(b + 2) = data03; + *(b + 3) = data04; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + + a1 += lda; + b += 4; + ii += 1; + } + + a += 4; + jj += 2; + } + + if (n & 1) { + + a1 = a + 0 * lda; + + ii = 0; + + i = m; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b + 0, data01, data02); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + } + + a1 += lda; + b += 2; + + i --; + ii += 1; + } + + a += 2; + jj += 1; + } + + return 0; +} diff --git a/kernel/generic/ztrsm_ltcopy_8.c b/kernel/generic/ztrsm_ltcopy_8.c new file mode 100644 index 0000000..899c9ab --- /dev/null +++ b/kernel/generic/ztrsm_ltcopy_8.c @@ -0,0 +1,210 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1; + FLOAT data1, data2; + + lda *= 2; + jj = offset; + + j = (n >> 3); + while (j > 0){ + + a1 = a; + a += 16; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 8; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + *(b + 8) = *(a1 + 8); + *(b + 9) = *(a1 + 9); + *(b + 10) = *(a1 + 10); + *(b + 11) = *(a1 + 11); + *(b + 12) = *(a1 + 12); + *(b + 13) = *(a1 + 13); + *(b + 14) = *(a1 + 14); + *(b + 15) = *(a1 + 15); + } + + b += 16; + a1 += lda; + ii ++; + } + + jj += 8; + j --; + } + + j = (n & 4); + if (j > 0) { + + a1 = a; + a += 8; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 4; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + } + + b += 8; + a1 += lda; + ii ++; + } + + jj += 4; + } + + j = (n & 2); + if (j > 0) { + + a1 = a; + a += 4; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 2; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + } + + b += 4; + a1 += lda; + ii ++; + } + + jj += 2; + } + + j = (n & 1); + if (j > 0) { + + a1 = a; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + } + + b += 2; + a1 += lda; + ii ++; + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_uncopy_1.c b/kernel/generic/ztrsm_uncopy_1.c new file mode 100644 index 0000000..0891300 --- /dev/null +++ b/kernel/generic/ztrsm_uncopy_1.c @@ -0,0 +1,90 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02; + FLOAT *a1; + + lda *= 2; + + jj = offset; + + j = n; + while (j > 0){ + + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b + 0, data01, data02); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + *(b + 0) = data01; + *(b + 1) = data02; + } + + a1 += 2; + b += 2; + + i --; + ii ++; + } + + a += lda; + jj ++; + j --; + } + + return 0; +} diff --git a/kernel/generic/ztrsm_uncopy_2.c b/kernel/generic/ztrsm_uncopy_2.c new file mode 100644 index 0000000..45c2093 --- /dev/null +++ b/kernel/generic/ztrsm_uncopy_2.c @@ -0,0 +1,176 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04; + FLOAT data05, data06, data07, data08; + FLOAT *a1, *a2; + + lda *= 2; + + jj = offset; + + j = (n >> 1); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + + data05 = *(a2 + 0); + data06 = *(a2 + 1); +#ifndef UNIT + data07 = *(a2 + 2); + data08 = *(a2 + 3); +#endif + + compinv(b + 0, data01, data02); + *(b + 2) = data05; + *(b + 3) = data06; + compinv(b + 6, data07, data08); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data05; + *(b + 3) = data06; + *(b + 4) = data03; + *(b + 5) = data04; + *(b + 6) = data07; + *(b + 7) = data08; + } + + a1 += 4; + a2 += 4; + b += 8; + + i --; + ii += 2; + } + + if (m & 1) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + data05 = *(a2 + 0); + data06 = *(a2 + 1); + + compinv(b + 0, data01, data02); + *(b + 2) = data05; + *(b + 3) = data06; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + + a += 2 * lda; + jj += 2; + j --; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b + 0, data01, data02); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + *(b + 0) = data01; + *(b + 1) = data02; + } + + a1+= 2; + b += 2; + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_uncopy_4.c b/kernel/generic/ztrsm_uncopy_4.c new file mode 100644 index 0000000..9cbc6c7 --- /dev/null +++ b/kernel/generic/ztrsm_uncopy_4.c @@ -0,0 +1,496 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04; + FLOAT data05, data06, data07, data08; + FLOAT data09, data10, data11, data12; + FLOAT data13, data14, data15, data16; + FLOAT data17, data18, data19, data20; + FLOAT data21, data22, data23, data24; + FLOAT data25, data26, data27, data28; + FLOAT data29, data30, data31, data32; + + FLOAT *a1, *a2, *a3, *a4; + + lda *= 2; + + jj = offset; + + j = (n >> 2); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + ii = 0; + + i = (m >> 2); + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + + data09 = *(a2 + 0); + data10 = *(a2 + 1); +#ifndef UNIT + data11 = *(a2 + 2); + data12 = *(a2 + 3); +#endif + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); +#ifndef UNIT + data21 = *(a3 + 4); + data22 = *(a3 + 5); +#endif + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); +#ifndef UNIT + data31 = *(a4 + 6); + data32 = *(a4 + 7); +#endif + + compinv(b + 0, data01, data02); + *(b + 2) = data09; + *(b + 3) = data10; + *(b + 4) = data17; + *(b + 5) = data18; + *(b + 6) = data25; + *(b + 7) = data26; + + compinv(b + 10, data11, data12); + *(b + 12) = data19; + *(b + 13) = data20; + *(b + 14) = data27; + *(b + 15) = data28; + + compinv(b + 20, data21, data22); + *(b + 22) = data29; + *(b + 23) = data30; + compinv(b + 30, data31, data32); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + data23 = *(a3 + 6); + data24 = *(a3 + 7); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); + data31 = *(a4 + 6); + data32 = *(a4 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data09; + *(b + 3) = data10; + *(b + 4) = data17; + *(b + 5) = data18; + *(b + 6) = data25; + *(b + 7) = data26; + + *(b + 8) = data03; + *(b + 9) = data04; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data19; + *(b + 13) = data20; + *(b + 14) = data27; + *(b + 15) = data28; + + *(b + 16) = data05; + *(b + 17) = data06; + *(b + 18) = data13; + *(b + 19) = data14; + *(b + 20) = data21; + *(b + 21) = data22; + *(b + 22) = data29; + *(b + 23) = data30; + + *(b + 24) = data07; + *(b + 25) = data08; + *(b + 26) = data15; + *(b + 27) = data16; + *(b + 28) = data23; + *(b + 29) = data24; + *(b + 30) = data31; + *(b + 31) = data32; + } + + a1 += 8; + a2 += 8; + a3 += 8; + a4 += 8; + b += 32; + + i --; + ii += 4; + } + + if (m & 2) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + + data09 = *(a2 + 0); + data10 = *(a2 + 1); +#ifndef UNIT + data11 = *(a2 + 2); + data12 = *(a2 + 3); +#endif + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + + compinv(b + 0, data01, data02); + *(b + 2) = data09; + *(b + 3) = data10; + *(b + 4) = data17; + *(b + 5) = data18; + *(b + 6) = data25; + *(b + 7) = data26; + + compinv(b + 10, data11, data12); + *(b + 12) = data19; + *(b + 13) = data20; + *(b + 14) = data27; + *(b + 15) = data28; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data09; + *(b + 3) = data10; + *(b + 4) = data17; + *(b + 5) = data18; + *(b + 6) = data25; + *(b + 7) = data26; + + *(b + 8) = data03; + *(b + 9) = data04; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data19; + *(b + 13) = data20; + *(b + 14) = data27; + *(b + 15) = data28; + } + + a1 += 4; + a2 += 4; + a3 += 4; + a4 += 4; + b += 16; + + ii += 2; + } + + if (m & 1) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + + compinv(b + 0, data01, data02); + *(b + 2) = data09; + *(b + 3) = data10; + *(b + 4) = data17; + *(b + 5) = data18; + *(b + 6) = data25; + *(b + 7) = data26; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data09; + *(b + 3) = data10; + *(b + 4) = data17; + *(b + 5) = data18; + *(b + 6) = data25; + *(b + 7) = data26; + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + + ii += 1; + } + + a += 4 * lda; + jj += 4; + j --; + } + + if (n & 2) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + ii = 0; + + i = (m >> 1); + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + + data09 = *(a2 + 0); + data10 = *(a2 + 1); +#ifndef UNIT + data11 = *(a2 + 2); + data12 = *(a2 + 3); +#endif + + compinv(b + 0, data01, data02); + *(b + 2) = data09; + *(b + 3) = data10; + compinv(b + 6, data11, data12); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data09; + *(b + 3) = data10; + *(b + 4) = data03; + *(b + 5) = data04; + *(b + 6) = data11; + *(b + 7) = data12; + } + + a1 += 4; + a2 += 4; + b += 8; + + i --; + ii += 2; + } + + if (m & 1) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + + compinv(b + 0, data01, data02); + *(b + 2) = data09; + *(b + 3) = data10; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data09; + *(b + 3) = data10; + } + + a1 += 2; + a2 += 2; + b += 4; + + ii += 1; + } + + a += 2 *lda; + jj += 2; + } + + if (n & 1) { + + a1 = a + 0 * lda; + + ii = 0; + + i = m; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + + compinv(b + 0, data01, data02); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + } + + a1 += 2; + b += 2; + + i --; + ii += 1; + } + + a += lda; + jj += 1; + } + + return 0; +} diff --git a/kernel/generic/ztrsm_uncopy_8.c b/kernel/generic/ztrsm_uncopy_8.c new file mode 100644 index 0000000..2ce1c72 --- /dev/null +++ b/kernel/generic/ztrsm_uncopy_8.c @@ -0,0 +1,228 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; + FLOAT data1, data2; + + lda *= 2; + jj = offset; + + j = (n >> 3); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + + a += 8 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 8; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + *(b + 8) = *(a5 + 0); + *(b + 9) = *(a5 + 1); + *(b + 10) = *(a6 + 0); + *(b + 11) = *(a6 + 1); + *(b + 12) = *(a7 + 0); + *(b + 13) = *(a7 + 1); + *(b + 14) = *(a8 + 0); + *(b + 15) = *(a8 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + a5 += 2; + a6 += 2; + a7 += 2; + a8 += 2; + b += 16; + ii ++; + } + + jj += 8; + j --; + } + + if (n & 4) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a += 4 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 4; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + ii ++; + } + + jj += 4; + } + + if (n & 2) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a += 2 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + for (k = ii - jj + 1; k < 2; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + } + + a1 += 2; + a2 += 2; + b += 4; + ii ++; + } + + jj += 2; + } + + if (n & 1) { + + a1 = a + 0 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + for (k = ii - jj + 1; k < 1; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + } + + a1 += 2; + b += 2; + ii ++; + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_utcopy_1.c b/kernel/generic/ztrsm_utcopy_1.c new file mode 100644 index 0000000..42ecc47 --- /dev/null +++ b/kernel/generic/ztrsm_utcopy_1.c @@ -0,0 +1,90 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02; + FLOAT *a1; + + lda *= 2; + + jj = offset; + + j = (n); + while (j > 0){ + + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b + 0, data01, data02); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + *(b + 0) = data01; + *(b + 1) = data02; + } + + a1 += lda; + b += 2; + + i --; + ii ++; + } + + a += 2; + jj ++; + j --; + } + + return 0; +} diff --git a/kernel/generic/ztrsm_utcopy_2.c b/kernel/generic/ztrsm_utcopy_2.c new file mode 100644 index 0000000..fd7affb --- /dev/null +++ b/kernel/generic/ztrsm_utcopy_2.c @@ -0,0 +1,171 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04; + FLOAT data05, data06, data07, data08; + FLOAT *a1, *a2; + + lda *= 2; + + jj = offset; + + j = (n >> 1); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + + data05 = *(a2 + 0); + data06 = *(a2 + 1); +#ifndef UNIT + data07 = *(a2 + 2); + data08 = *(a2 + 3); +#endif + + compinv(b + 0, data01, data02); + *(b + 4) = data05; + *(b + 5) = data06; + compinv(b + 6, data07, data08); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 8; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b, data01, data02); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + + a += 4; + jj += 2; + j --; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b, data01, data02); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + *(b + 0) = data01; + *(b + 1) = data02; + } + + a1 += 1 * lda; + b += 2; + + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_utcopy_4.c b/kernel/generic/ztrsm_utcopy_4.c new file mode 100644 index 0000000..fd3483c --- /dev/null +++ b/kernel/generic/ztrsm_utcopy_4.c @@ -0,0 +1,444 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04; + FLOAT data05, data06, data07, data08; + FLOAT data09, data10, data11, data12; + FLOAT data13, data14, data15, data16; + FLOAT data17, data18, data19, data20; + FLOAT data21, data22, data23, data24; + FLOAT data25, data26, data27, data28; + FLOAT data29, data30, data31, data32; + + FLOAT *a1, *a2, *a3, *a4; + + lda *= 2; + + jj = offset; + + j = (n >> 2); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + ii = 0; + + i = (m >> 2); + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + + data09 = *(a2 + 0); + data10 = *(a2 + 1); +#ifndef UNIT + data11 = *(a2 + 2); + data12 = *(a2 + 3); +#endif + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); +#ifndef UNIT + data21 = *(a3 + 4); + data22 = *(a3 + 5); +#endif + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); +#ifndef UNIT + data31 = *(a4 + 6); + data32 = *(a4 + 7); +#endif + + compinv(b + 0, data01, data02); + *(b + 8) = data09; + *(b + 9) = data10; + compinv(b + 10, data11, data12); + *(b + 16) = data17; + *(b + 17) = data18; + *(b + 18) = data19; + *(b + 19) = data20; + compinv(b + 20, data21, data22); + *(b + 24) = data25; + *(b + 25) = data26; + *(b + 26) = data27; + *(b + 27) = data28; + *(b + 28) = data29; + *(b + 29) = data30; + compinv(b + 30, data31, data32); + + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + data23 = *(a3 + 6); + data24 = *(a3 + 7); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); + data31 = *(a4 + 6); + data32 = *(a4 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + + *(b + 16) = data17; + *(b + 17) = data18; + *(b + 18) = data19; + *(b + 19) = data20; + *(b + 20) = data21; + *(b + 21) = data22; + *(b + 22) = data23; + *(b + 23) = data24; + + *(b + 24) = data25; + *(b + 25) = data26; + *(b + 26) = data27; + *(b + 27) = data28; + *(b + 28) = data29; + *(b + 29) = data30; + *(b + 30) = data31; + *(b + 31) = data32; + } + + a1 += 4 * lda; + a2 += 4 * lda; + a3 += 4 * lda; + a4 += 4 * lda; + b += 32; + + i --; + ii += 4; + } + + if (m & 2) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + + data09 = *(a2 + 0); + data10 = *(a2 + 1); +#ifndef UNIT + data11 = *(a2 + 2); + data12 = *(a2 + 3); +#endif + + compinv(b + 0, data01, data02); + *(b + 8) = data09; + *(b + 9) = data10; + compinv(b + 10, data11, data12); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 16; + + ii += 2; + } + + if (m & 1) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + + compinv(b + 0, data01, data02); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + + a1 += lda; + b += 8; + + ii += 1; + } + + a += 8; + jj += 4; + j --; + } + + if (n & 2) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + ii = 0; + + i = (m >> 1); + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + + data09 = *(a2 + 0); + data10 = *(a2 + 1); +#ifndef UNIT + data11 = *(a2 + 2); + data12 = *(a2 + 3); +#endif + + compinv(b + 0, data01, data02); + *(b + 4) = data09; + *(b + 5) = data10; + compinv(b + 6, data11, data12); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data09; + *(b + 5) = data10; + *(b + 6) = data11; + *(b + 7) = data12; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 8; + + i --; + ii += 2; + } + + if (m & 1) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b + 0, data01, data02); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + + a1 += lda; + b += 4; + + ii += 1; + } + + a += 4; + jj += 2; + j --; + } + + if (n & 1) { + + a1 = a + 0 * lda; + + ii = 0; + + i = m; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b + 0, data01, data02); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + *(b + 0) = data01; + *(b + 1) = data02; + } + + a1 += lda; + b += 2; + + i --; + ii += 1; + } + + a += 2; + jj += 1; + } + + return 0; +} diff --git a/kernel/generic/ztrsm_utcopy_8.c b/kernel/generic/ztrsm_utcopy_8.c new file mode 100644 index 0000000..52c7ed5 --- /dev/null +++ b/kernel/generic/ztrsm_utcopy_8.c @@ -0,0 +1,209 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1, data1, data2; + + lda *= 2; + + jj = offset; + + j = (n >> 3); + while (j > 0){ + + a1 = a; + a += 16; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 8) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + *(b + 8) = *(a1 + 8); + *(b + 9) = *(a1 + 9); + *(b + 10) = *(a1 + 10); + *(b + 11) = *(a1 + 11); + *(b + 12) = *(a1 + 12); + *(b + 13) = *(a1 + 13); + *(b + 14) = *(a1 + 14); + *(b + 15) = *(a1 + 15); + } + + b += 16; + a1 += lda; + ii ++; + } + + jj += 8; + j --; + } + + j = (n & 4); + if (j > 0) { + + a1 = a; + a += 8; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 4) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + } + + b += 8; + a1 += lda; + ii ++; + } + + jj += 4; + } + + j = (n & 2); + if (j > 0) { + + a1 = a; + a += 4; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 2) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + } + + b += 4; + a1 += lda; + ii ++; + } + + jj += 2; + } + + j = (n & 1); + if (j > 0) { + + a1 = a; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 1) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + } + + b += 2; + a1 += lda; + ii ++; + } + } + + return 0; +} diff --git a/kernel/ia64/._KERNEL b/kernel/ia64/._KERNEL new file mode 100644 index 0000000..a2266b2 Binary files /dev/null and b/kernel/ia64/._KERNEL differ diff --git a/kernel/ia64/._Makefile b/kernel/ia64/._Makefile new file mode 100644 index 0000000..324f66d Binary files /dev/null and b/kernel/ia64/._Makefile differ diff --git a/kernel/ia64/._amax.S b/kernel/ia64/._amax.S new file mode 100644 index 0000000..40364d6 Binary files /dev/null and b/kernel/ia64/._amax.S differ diff --git a/kernel/ia64/._asum.S b/kernel/ia64/._asum.S new file mode 100644 index 0000000..eecd8f3 Binary files /dev/null and b/kernel/ia64/._asum.S differ diff --git a/kernel/ia64/._cabs.S b/kernel/ia64/._cabs.S new file mode 100644 index 0000000..81bf7f2 Binary files /dev/null and b/kernel/ia64/._cabs.S differ diff --git a/kernel/ia64/._caxpy.S b/kernel/ia64/._caxpy.S new file mode 100644 index 0000000..657fcda Binary files /dev/null and b/kernel/ia64/._caxpy.S differ diff --git a/kernel/ia64/._copy.S b/kernel/ia64/._copy.S new file mode 100644 index 0000000..9f63d9f Binary files /dev/null and b/kernel/ia64/._copy.S differ diff --git a/kernel/ia64/._daxpy.S b/kernel/ia64/._daxpy.S new file mode 100644 index 0000000..424f22f Binary files /dev/null and b/kernel/ia64/._daxpy.S differ diff --git a/kernel/ia64/._ddot.S b/kernel/ia64/._ddot.S new file mode 100644 index 0000000..59d7a5f Binary files /dev/null and b/kernel/ia64/._ddot.S differ diff --git a/kernel/ia64/._gemm_beta.S b/kernel/ia64/._gemm_beta.S new file mode 100644 index 0000000..785d45a Binary files /dev/null and b/kernel/ia64/._gemm_beta.S differ diff --git a/kernel/ia64/._gemm_kernel.S b/kernel/ia64/._gemm_kernel.S new file mode 100644 index 0000000..d5ae842 Binary files /dev/null and b/kernel/ia64/._gemm_kernel.S differ diff --git a/kernel/ia64/._gemm_ncopy.S b/kernel/ia64/._gemm_ncopy.S new file mode 100644 index 0000000..e26a5fd Binary files /dev/null and b/kernel/ia64/._gemm_ncopy.S differ diff --git a/kernel/ia64/._gemm_tcopy.S b/kernel/ia64/._gemm_tcopy.S new file mode 100644 index 0000000..f17942a Binary files /dev/null and b/kernel/ia64/._gemm_tcopy.S differ diff --git a/kernel/ia64/._gemv_n.S b/kernel/ia64/._gemv_n.S new file mode 100644 index 0000000..c06d063 Binary files /dev/null and b/kernel/ia64/._gemv_n.S differ diff --git a/kernel/ia64/._gemv_t.S b/kernel/ia64/._gemv_t.S new file mode 100644 index 0000000..78a0d18 Binary files /dev/null and b/kernel/ia64/._gemv_t.S differ diff --git a/kernel/ia64/._iamax.S b/kernel/ia64/._iamax.S new file mode 100644 index 0000000..dc093af Binary files /dev/null and b/kernel/ia64/._iamax.S differ diff --git a/kernel/ia64/._izamax.S b/kernel/ia64/._izamax.S new file mode 100644 index 0000000..f4f5f56 Binary files /dev/null and b/kernel/ia64/._izamax.S differ diff --git a/kernel/ia64/._lsame.S b/kernel/ia64/._lsame.S new file mode 100644 index 0000000..e1e37af Binary files /dev/null and b/kernel/ia64/._lsame.S differ diff --git a/kernel/ia64/._nrm2.S b/kernel/ia64/._nrm2.S new file mode 100644 index 0000000..b4c5eb9 Binary files /dev/null and b/kernel/ia64/._nrm2.S differ diff --git a/kernel/ia64/._qaxpy.S b/kernel/ia64/._qaxpy.S new file mode 100644 index 0000000..70bc17e Binary files /dev/null and b/kernel/ia64/._qaxpy.S differ diff --git a/kernel/ia64/._qcopy.S b/kernel/ia64/._qcopy.S new file mode 100644 index 0000000..28da09f Binary files /dev/null and b/kernel/ia64/._qcopy.S differ diff --git a/kernel/ia64/._qdot.S b/kernel/ia64/._qdot.S new file mode 100644 index 0000000..9111a30 Binary files /dev/null and b/kernel/ia64/._qdot.S differ diff --git a/kernel/ia64/._qgemm_kernel.S b/kernel/ia64/._qgemm_kernel.S new file mode 100644 index 0000000..e1b2021 Binary files /dev/null and b/kernel/ia64/._qgemm_kernel.S differ diff --git a/kernel/ia64/._qgemv_n.S b/kernel/ia64/._qgemv_n.S new file mode 100644 index 0000000..57e1276 Binary files /dev/null and b/kernel/ia64/._qgemv_n.S differ diff --git a/kernel/ia64/._qgemv_t.S b/kernel/ia64/._qgemv_t.S new file mode 100644 index 0000000..17241ee Binary files /dev/null and b/kernel/ia64/._qgemv_t.S differ diff --git a/kernel/ia64/._qscal.S b/kernel/ia64/._qscal.S new file mode 100644 index 0000000..1c22a42 Binary files /dev/null and b/kernel/ia64/._qscal.S differ diff --git a/kernel/ia64/._rot.S b/kernel/ia64/._rot.S new file mode 100644 index 0000000..b746f24 Binary files /dev/null and b/kernel/ia64/._rot.S differ diff --git a/kernel/ia64/._saxpy.S b/kernel/ia64/._saxpy.S new file mode 100644 index 0000000..8fc8c5e Binary files /dev/null and b/kernel/ia64/._saxpy.S differ diff --git a/kernel/ia64/._scal.S b/kernel/ia64/._scal.S new file mode 100644 index 0000000..370a4f8 Binary files /dev/null and b/kernel/ia64/._scal.S differ diff --git a/kernel/ia64/._sdot.S b/kernel/ia64/._sdot.S new file mode 100644 index 0000000..e5a7d52 Binary files /dev/null and b/kernel/ia64/._sdot.S differ diff --git a/kernel/ia64/._sgemv_n.S b/kernel/ia64/._sgemv_n.S new file mode 100644 index 0000000..3046cc0 Binary files /dev/null and b/kernel/ia64/._sgemv_n.S differ diff --git a/kernel/ia64/._staticbuffer.S b/kernel/ia64/._staticbuffer.S new file mode 100644 index 0000000..87a0cec Binary files /dev/null and b/kernel/ia64/._staticbuffer.S differ diff --git a/kernel/ia64/._swap.S b/kernel/ia64/._swap.S new file mode 100644 index 0000000..5abfc5a Binary files /dev/null and b/kernel/ia64/._swap.S differ diff --git a/kernel/ia64/._symv_U.S b/kernel/ia64/._symv_U.S new file mode 100644 index 0000000..edba412 Binary files /dev/null and b/kernel/ia64/._symv_U.S differ diff --git a/kernel/ia64/._trsm_kernel_LN.S b/kernel/ia64/._trsm_kernel_LN.S new file mode 100644 index 0000000..1b48272 Binary files /dev/null and b/kernel/ia64/._trsm_kernel_LN.S differ diff --git a/kernel/ia64/._trsm_kernel_LT.S b/kernel/ia64/._trsm_kernel_LT.S new file mode 100644 index 0000000..8b12c51 Binary files /dev/null and b/kernel/ia64/._trsm_kernel_LT.S differ diff --git a/kernel/ia64/._trsm_kernel_RT.S b/kernel/ia64/._trsm_kernel_RT.S new file mode 100644 index 0000000..3eb89b3 Binary files /dev/null and b/kernel/ia64/._trsm_kernel_RT.S differ diff --git a/kernel/ia64/._xcopy.S b/kernel/ia64/._xcopy.S new file mode 100644 index 0000000..5c2e223 Binary files /dev/null and b/kernel/ia64/._xcopy.S differ diff --git a/kernel/ia64/._xdot.S b/kernel/ia64/._xdot.S new file mode 100644 index 0000000..d0585d2 Binary files /dev/null and b/kernel/ia64/._xdot.S differ diff --git a/kernel/ia64/._zaxpy.S b/kernel/ia64/._zaxpy.S new file mode 100644 index 0000000..846ddb0 Binary files /dev/null and b/kernel/ia64/._zaxpy.S differ diff --git a/kernel/ia64/._zcopy.S b/kernel/ia64/._zcopy.S new file mode 100644 index 0000000..3b48c66 Binary files /dev/null and b/kernel/ia64/._zcopy.S differ diff --git a/kernel/ia64/._zdot.S b/kernel/ia64/._zdot.S new file mode 100644 index 0000000..b18a8a8 Binary files /dev/null and b/kernel/ia64/._zdot.S differ diff --git a/kernel/ia64/._zgemm3m_kernel.S b/kernel/ia64/._zgemm3m_kernel.S new file mode 100644 index 0000000..40aa364 Binary files /dev/null and b/kernel/ia64/._zgemm3m_kernel.S differ diff --git a/kernel/ia64/._zgemm_beta.S b/kernel/ia64/._zgemm_beta.S new file mode 100644 index 0000000..6ba650d Binary files /dev/null and b/kernel/ia64/._zgemm_beta.S differ diff --git a/kernel/ia64/._zgemm_kernel.S b/kernel/ia64/._zgemm_kernel.S new file mode 100644 index 0000000..611ecc6 Binary files /dev/null and b/kernel/ia64/._zgemm_kernel.S differ diff --git a/kernel/ia64/._zgemm_ncopy.S b/kernel/ia64/._zgemm_ncopy.S new file mode 100644 index 0000000..2601be2 Binary files /dev/null and b/kernel/ia64/._zgemm_ncopy.S differ diff --git a/kernel/ia64/._zgemm_tcopy.S b/kernel/ia64/._zgemm_tcopy.S new file mode 100644 index 0000000..e72f187 Binary files /dev/null and b/kernel/ia64/._zgemm_tcopy.S differ diff --git a/kernel/ia64/._zgemv_n.S b/kernel/ia64/._zgemv_n.S new file mode 100644 index 0000000..8febbde Binary files /dev/null and b/kernel/ia64/._zgemv_n.S differ diff --git a/kernel/ia64/._zgemv_t.S b/kernel/ia64/._zgemv_t.S new file mode 100644 index 0000000..3ecb8e8 Binary files /dev/null and b/kernel/ia64/._zgemv_t.S differ diff --git a/kernel/ia64/._zrot.S b/kernel/ia64/._zrot.S new file mode 100644 index 0000000..341c9e3 Binary files /dev/null and b/kernel/ia64/._zrot.S differ diff --git a/kernel/ia64/._zscal.S b/kernel/ia64/._zscal.S new file mode 100644 index 0000000..d0c4e9e Binary files /dev/null and b/kernel/ia64/._zscal.S differ diff --git a/kernel/ia64/._zswap.S b/kernel/ia64/._zswap.S new file mode 100644 index 0000000..b7673ee Binary files /dev/null and b/kernel/ia64/._zswap.S differ diff --git a/kernel/ia64/._ztrsm_kernel_LN.S b/kernel/ia64/._ztrsm_kernel_LN.S new file mode 100644 index 0000000..13efb09 Binary files /dev/null and b/kernel/ia64/._ztrsm_kernel_LN.S differ diff --git a/kernel/ia64/._ztrsm_kernel_LT.S b/kernel/ia64/._ztrsm_kernel_LT.S new file mode 100644 index 0000000..67a0a4d Binary files /dev/null and b/kernel/ia64/._ztrsm_kernel_LT.S differ diff --git a/kernel/ia64/._ztrsm_kernel_RT.S b/kernel/ia64/._ztrsm_kernel_RT.S new file mode 100644 index 0000000..75b9ea5 Binary files /dev/null and b/kernel/ia64/._ztrsm_kernel_RT.S differ diff --git a/kernel/ia64/KERNEL b/kernel/ia64/KERNEL new file mode 100644 index 0000000..10a7e61 --- /dev/null +++ b/kernel/ia64/KERNEL @@ -0,0 +1,140 @@ +SAXPYKERNEL = saxpy.S +DAXPYKERNEL = daxpy.S +QAXPYKERNEL = qaxpy.S +CAXPYKERNEL = caxpy.S +ZAXPYKERNEL = zaxpy.S +XAXPYKERNEL = zaxpy.S + +SDOTKERNEL = sdot.S +DDOTKERNEL = ddot.S +QDOTKERNEL = qdot.S +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +XDOTKERNEL = xdot.S + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +QAMAXKERNEL = amax.S +CAMAXKERNEL = izamax.S +ZAMAXKERNEL = izamax.S +XAMAXKERNEL = izamax.S + +SAMINKERNEL = amax.S +DAMINKERNEL = amax.S +QAMINKERNEL = amax.S +CAMINKERNEL = izamax.S +ZAMINKERNEL = izamax.S +XAMINKERNEL = izamax.S + +SMAXKERNEL = amax.S +DMAXKERNEL = amax.S +QMAXKERNEL = amax.S + +SMINKERNEL = amax.S +DMINKERNEL = amax.S +QMINKERNEL = amax.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +IQAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S +IXAMAXKERNEL = izamax.S + +ISAMINKERNEL = iamax.S +IDAMINKERNEL = iamax.S +IQAMINKERNEL = iamax.S +ICAMINKERNEL = izamax.S +IZAMINKERNEL = izamax.S +IXAMINKERNEL = izamax.S + +ISMAXKERNEL = iamax.S +IDMAXKERNEL = iamax.S +IQMAXKERNEL = iamax.S + +ISMINKERNEL = iamax.S +IDMINKERNEL = iamax.S +IQMINKERNEL = iamax.S + +CASUMKERNEL = asum.S +ZASUMKERNEL = asum.S +XASUMKERNEL = asum.S + +CNRM2KERNEL = nrm2.S +ZNRM2KERNEL = nrm2.S +XNRM2KERNEL = nrm2.S + +QCOPYKERNEL = qcopy.S +XCOPYKERNEL = xcopy.S + +QSCALKERNEL = qscal.S + +QGEMVNKERNEL = qgemv_n.S +QGEMVTKERNEL = qgemv_t.S +XGEMVNKERNEL = xgemv_n.S +XGEMVTKERNEL = xgemv_t.S + +SGEMMKERNEL = gemm_kernel.S +SGEMM_BETA = gemm_beta.S +SGEMMONCOPY = gemm_ncopy.S +SGEMMOTCOPY = gemm_tcopy.S +SGEMMONCOPYOBJ = sgemm_oncopy.$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy.$(SUFFIX) + +DGEMMKERNEL = gemm_kernel.S +DGEMM_BETA = gemm_beta.S +DGEMMONCOPY = gemm_ncopy.S +DGEMMOTCOPY = gemm_tcopy.S +DGEMMONCOPYOBJ = dgemm_oncopy.$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy.$(SUFFIX) + +QGEMMKERNEL = qgemm_kernel.S +QGEMM_BETA = ../generic/gemm_beta.c +QGEMMONCOPY = ../generic/gemm_ncopy_8.c +QGEMMOTCOPY = ../generic/gemm_tcopy_8.c +QGEMMONCOPYOBJ = qgemm_oncopy.$(SUFFIX) +QGEMMOTCOPYOBJ = qgemm_otcopy.$(SUFFIX) + +CGEMMKERNEL = zgemm_kernel.S +CGEMM_BETA = zgemm_beta.S +CGEMMONCOPY = zgemm_ncopy.S +CGEMMOTCOPY = zgemm_tcopy.S +CGEMMONCOPYOBJ = cgemm_oncopy.$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy.$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel.S +ZGEMM_BETA = zgemm_beta.S +ZGEMMONCOPY = zgemm_ncopy.S +ZGEMMOTCOPY = zgemm_tcopy.S +ZGEMMONCOPYOBJ = zgemm_oncopy.$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy.$(SUFFIX) + +XGEMMKERNEL = zgemm_kernel.S +XGEMM_BETA = ../generic/zgemm_beta.c +XGEMMONCOPY = ../generic/zgemm_ncopy_4.c +XGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +XGEMMONCOPYOBJ = xgemm_oncopy.$(SUFFIX) +XGEMMOTCOPYOBJ = xgemm_otcopy.$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN.S +STRSMKERNEL_LT = trsm_kernel_LT.S +STRSMKERNEL_RN = trsm_kernel_LT.S +STRSMKERNEL_RT = trsm_kernel_RT.S + +DTRSMKERNEL_LN = trsm_kernel_LN.S +DTRSMKERNEL_LT = trsm_kernel_LT.S +DTRSMKERNEL_RN = trsm_kernel_LT.S +DTRSMKERNEL_RT = trsm_kernel_RT.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN.S +CTRSMKERNEL_LT = ztrsm_kernel_LT.S +CTRSMKERNEL_RN = ztrsm_kernel_LT.S +CTRSMKERNEL_RT = ztrsm_kernel_RT.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT.S + +CGEMM3MKERNEL = zgemm3m_kernel.S +ZGEMM3MKERNEL = zgemm3m_kernel.S diff --git a/kernel/ia64/Makefile b/kernel/ia64/Makefile new file mode 100644 index 0000000..520349b --- /dev/null +++ b/kernel/ia64/Makefile @@ -0,0 +1 @@ +clean :: diff --git a/kernel/ia64/amax.S b/kernel/ia64/amax.S new file mode 100644 index 0000000..fae96f1 --- /dev/null +++ b/kernel/ia64/amax.S @@ -0,0 +1,396 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef XDOUBLE +#define PREFETCH_SIZE ( 8 * 16 + 4) +#elif defined(DOUBLE) +#define PREFETCH_SIZE (16 * 16 + 8) +#else +#define PREFETCH_SIZE (32 * 16 + 16) +#endif + +#if !defined(USE_MIN) && defined(USE_ABS) +#define FMAX famax +#elif !defined(USE_MIN) && !defined(USE_ABS) +#define FMAX fmax +#elif defined(USE_MIN) && defined(USE_ABS) +#define FMAX famin +#else +#define FMAX fmin +#endif + +#define RET r8 + +#define N r32 +#define DX r33 +#define INCX r34 + +#define PRE1 r2 +#define J r14 +#define K r15 +#define X2 r16 +#define X3 r17 +#define INCX5 r18 +#define INCX16 r19 + +#define DMAX1 f8 +#define DMAX2 f9 +#define DMAX3 f10 +#define DMAX4 f11 +#define DMAX5 f12 +#define DMAX6 f13 +#define DMAX7 f14 +#define DMAX8 f15 + +#define PR r30 +#define ARLC r31 + + PROLOGUE + .prologue + PROFCODE + { .mfi + mov RET = 0 + mov DMAX1 = f0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + ;; + .body + +#ifdef F_INTERFACE + { .mmi + LDINT N = [N] + LDINT INCX = [INCX] + nop.i 0 + } + ;; +#ifndef USE64BITINT + { .mii + nop.m 0 + sxt4 N = N + sxt4 INCX = INCX + } + ;; +#endif +#endif + { .mii + mov PR = pr + cmp.ge p6, p0 = 0, INCX + } + { .mbb + cmp.ge p8, p0 = 0, N + (p8) br.ret.sptk.many b0 + (p6) br.ret.sptk.many b0 + } + ;; + { .mmi + LDFD DMAX1 = [DX] + shladd INCX = INCX, BASE_SHIFT, r0 + mov pr.rot= 0 + } + ;; + { .mmf + add DX = DX, INCX + adds K = -1, N + mov DMAX2 = DMAX1 + } + ;; + { .mfi + shladd X2 = INCX, 2, DX + mov DMAX5 = DMAX1 + shr J = K, 4 + } + { .mmf + cmp.eq p16, p0 = r0, r0 + nop.m 0 + mov DMAX6 = DMAX1 + } + ;; + { .mfi + shladd INCX5 = INCX, 2, INCX + mov DMAX3 = DMAX1 + mov ar.ec= 4 + } + { .mmf +#ifdef XDOUBLE + shladd INCX16= INCX, 3, r0 +#else + shladd INCX16= INCX, 4, r0 +#endif + adds J = -1, J + mov DMAX7 = DMAX1 + } + ;; + { .mfi + adds PRE1 = PREFETCH_SIZE * SIZE, DX + mov DMAX4 = DMAX1 + mov ar.lc = J + } + { .mfb + cmp.eq p7 ,p0 = -1, J + mov DMAX8 = DMAX1 + (p7) br.cond.dpnt .L15 + } + .align 32 + ;; +.L10: + { .mmf + (p16) lfetch.nt1 [PRE1], INCX16 + (p16) LDFD f32 = [DX], INCX + (p19) FMAX DMAX1 = f35, DMAX1 + } + { .mmf + (p16) LDFD f48 = [X2], INCX + nop.m 0 + (p19) FMAX DMAX5 = f51, DMAX5 + } + ;; + { .mmf + (p16) LDFD f36 = [DX], INCX + nop.m 0 + (p19) FMAX DMAX2 = f39, DMAX2 + } + { .mmf + (p16) LDFD f52 = [X2], INCX + nop.m 0 + (p19) FMAX DMAX6 = f55, DMAX6 + } + ;; + { .mmf + (p16) LDFD f40 = [DX], INCX + nop.m 0 + (p19) FMAX DMAX3 = f43, DMAX3 + } + { .mmf + (p16) LDFD f56 = [X2], INCX + nop.m 0 + (p19) FMAX DMAX7 = f59, DMAX7 + } + ;; + { .mmf + (p16) LDFD f44 = [DX], INCX5 + nop.m 0 + (p19) FMAX DMAX4 = f47, DMAX4 + } + { .mmf + (p16) LDFD f60 = [X2], INCX5 + nop.m 0 + (p19) FMAX DMAX8 = f63, DMAX8 + } + ;; + { .mmf +#ifdef XDOUBLE + (p16) lfetch.nt1 [PRE1], INCX16 +#endif + (p16) LDFD f64 = [DX], INCX +#ifndef XDOUBLE + nop.m 0 +#endif + (p19) FMAX DMAX1 = f67, DMAX1 + } + { .mmf + (p16) LDFD f80 = [X2], INCX + nop.m 0 + (p19) FMAX DMAX5 = f83, DMAX5 + } + ;; + { .mmf + (p16) LDFD f68 = [DX], INCX + nop.m 0 + (p19) FMAX DMAX2 = f71, DMAX2 + } + { .mmf + (p16) LDFD f84 = [X2], INCX + nop.m 0 + (p19) FMAX DMAX6 = f87, DMAX6 + } + ;; + { .mmf + (p16) LDFD f72 = [DX], INCX + nop.m 0 + (p19) FMAX DMAX3 = f75, DMAX3 + } + { .mmf + (p16) LDFD f88 = [X2], INCX + nop.m 0 + (p19) FMAX DMAX7 = f91, DMAX7 + } + ;; + { .mmf + (p16) LDFD f76 = [DX], INCX5 + nop.m 0 + (p19) FMAX DMAX4 = f79, DMAX4 + } + { .mfb + (p16) LDFD f92 = [X2], INCX5 + (p19) FMAX DMAX8 = f95, DMAX8 + br.ctop.sptk.few .L10 + } + .align 32 + ;; +.L15: + and J = 15, K + tbit.z p0, p12 = K, 3 + mov X3 = DX + ;; + { .mmi + (p12) LDFD f32 = [DX], INCX + (p12) LDFD f36 = [X2], INCX + tbit.z p0, p13 = K, 2 + } + { .mib + cmp.eq p8 ,p0 = r0, J + tbit.z p0, p14 = K, 1 + (p8) br.cond.dpnt .L99 + } + ;; + { .mmi + (p12) LDFD f33 = [DX], INCX + (p12) LDFD f37 = [X2], INCX + tbit.z p0, p15 = K, 0 + } + ;; + { .mmi + (p12) LDFD f34 = [DX], INCX + (p12) LDFD f38 = [X2], INCX + (p12) shladd X3 = INCX, 3, X3 + } + ;; + { .mmi + (p12) LDFD f35 = [DX], INCX5 + (p12) LDFD f39 = [X2], INCX5 + (p13) shladd X3 = INCX, 2, X3 + } + ;; + { .mmi + (p13) LDFD f40 = [DX], INCX + (p14) LDFD f44 = [X3], INCX + nop.i 0 + } + ;; + { .mmi + (p13) LDFD f41 = [DX], INCX + (p14) LDFD f45 = [X3], INCX + nop.i 0 + } + ;; + { .mmf + (p13) LDFD f42 = [DX], INCX + nop.m 0 + (p12) FMAX DMAX1 = f32, DMAX1 + } + { .mmf + (p15) LDFD f46 = [X3], INCX + nop.m 0 + (p12) FMAX DMAX5 = f36, DMAX5 + } + ;; + { .mmf + (p13) LDFD f43 = [DX], INCX + nop.m 0 + (p12) FMAX DMAX2 = f33, DMAX2 + } + (p12) FMAX DMAX6 = f37, DMAX6 + (p12) FMAX DMAX3 = f34, DMAX3 + (p12) FMAX DMAX7 = f38, DMAX7 + (p12) FMAX DMAX4 = f35, DMAX4 + (p12) FMAX DMAX8 = f39, DMAX8 + ;; + (p13) FMAX DMAX1 = f40, DMAX1 + (p14) FMAX DMAX5 = f44, DMAX5 + (p13) FMAX DMAX2 = f41, DMAX2 + (p14) FMAX DMAX6 = f45, DMAX6 + (p13) FMAX DMAX3 = f42, DMAX3 + (p15) FMAX DMAX7 = f46, DMAX7 + (p13) FMAX DMAX4 = f43, DMAX4 + ;; + .align 32 + +.L99: + { .mfi + nop.m 0 + FMAX DMAX1 = DMAX5, DMAX1 + mov ar.lc = ARLC + } + { .mmf + nop.m 0 + nop.m 0 + FMAX DMAX2 = DMAX6, DMAX2 + } + ;; + { .mfi + nop.m 0 + FMAX DMAX3 = DMAX7, DMAX3 + mov pr = PR, -65474 + } + { .mmf + nop.m 0 + nop.m 0 + FMAX DMAX4 = DMAX8, DMAX4 + } + ;; + { .mmf + FMAX DMAX1 = DMAX2, DMAX1 + } + { .mmf + FMAX DMAX3 = DMAX4, DMAX3 + } + ;; +#ifndef USE_ABS + { .mfb + FMAX DMAX1 = DMAX3, DMAX1 + br.ret.sptk.many b0 + } +#else + { .mmf + FMAX DMAX1 = DMAX3, DMAX1 + } + ;; + { .mfb + fabs DMAX1 = DMAX1 + br.ret.sptk.many b0 + } +#endif + ;; + EPILOGUE + + + diff --git a/kernel/ia64/asum.S b/kernel/ia64/asum.S new file mode 100644 index 0000000..6114f57 --- /dev/null +++ b/kernel/ia64/asum.S @@ -0,0 +1,388 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef XDOUBLE +#define PREFETCH_SIZE ( 8 * 16 + 4) +#elif defined(DOUBLE) +#define PREFETCH_SIZE (16 * 16 + 8) +#else +#define PREFETCH_SIZE (32 * 16 + 16) +#endif + +#ifndef COMPLEX +#define COMPADD 0 +#define STRIDE INCX +#else +#define COMPADD 1 +#define STRIDE SIZE +#endif + +#define PRE1 r2 + +#define I r17 +#define J r18 +#define INCX16 r21 + +#define PR r30 +#define ARLC r31 + +#define N r32 +#define X r33 +#define INCX r34 + + + PROLOGUE + .prologue + PROFCODE + { .mfi + adds PRE1 = PREFETCH_SIZE * SIZE, X + mov f8 = f0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + ;; + .body +#ifdef F_INTERFACE + { .mmi + LDINT N = [N] + LDINT INCX = [INCX] + nop.i 0 + } + ;; +#ifndef USE64BITINT + { .mii + nop.m 0 + sxt4 N = N + sxt4 INCX = INCX + } + ;; +#endif +#endif + { .mmi + cmp.lt p0, p6 = r0, INCX + cmp.lt p0, p7 = r0, N + shr I = N, (4 - COMPADD) + } + { .mbb + and J = ((1 << (4 - COMPADD)) - 1), N + (p6) br.ret.sptk.many b0 + (p7) br.ret.sptk.many b0 + } + ;; + { .mfi + adds I = -1, I + mov f10 = f0 + mov PR = pr + } + { .mfi + cmp.eq p9, p0 = r0, J + mov f9 = f0 + tbit.z p0, p12 = N, 3 - COMPADD + } + ;; + { .mmi + cmp.eq p16, p0 = r0, r0 + cmp.ne p17, p0 = r0, r0 + mov ar.ec= 3 + } + { .mfi + cmp.ne p18, p0 = r0, r0 + mov f11 = f0 + shl INCX = INCX, BASE_SHIFT + COMPADD + } + ;; + { .mmi +#ifdef XDOUBLE + shladd INCX16 = INCX, (3 - COMPADD), r0 +#else + shladd INCX16 = INCX, (4 - COMPADD), r0 +#endif + cmp.ne p19, p0 = r0, r0 + mov ar.lc = I + } + { .mmb + cmp.gt p8 ,p0 = r0, I +#ifdef COMPLEX + adds INCX = - SIZE, INCX +#else + nop.m 0 +#endif + (p8) br.cond.dpnt .L55 + } + ;; + .align 32 + +.L52: + { .mmf + (p16) lfetch.nt1 [PRE1], INCX16 + (p16) LDFD f32 = [X], STRIDE + (p18) fabs f34 = f34 + } + { .mfb + (p19) FADD f8 = f8, f71 + } + ;; + { .mmf + (p16) LDFD f35 = [X], INCX + (p18) fabs f37 = f37 + } + { .mfb + (p19) FADD f9 = f9, f74 + } + ;; + { .mmf + (p16) LDFD f38 = [X], STRIDE + (p18) fabs f40 = f40 + } + { .mfb + (p19) FADD f10 = f10, f77 + } + ;; + { .mmf + (p16) LDFD f41 = [X], INCX + (p18) fabs f43 = f43 + } + { .mfb + (p19) FADD f11 = f11, f80 + } + ;; + { .mmf + (p16) LDFD f44 = [X], STRIDE + (p18) fabs f46 = f46 + } + { .mfb + (p18) FADD f8 = f8, f34 + } + ;; + { .mmf + (p16) LDFD f47 = [X], INCX + (p18) fabs f49 = f49 + } + { .mfb + (p18) FADD f9 = f9, f37 + } + ;; + { .mmf + (p16) LDFD f50 = [X], STRIDE + (p18) fabs f52 = f52 + } + { .mfb + (p18) FADD f10 = f10, f40 + } + ;; + { .mmf + (p16) LDFD f53 = [X], INCX + (p18) fabs f55 = f55 + } + { .mfb + (p18) FADD f11 = f11, f43 + } + ;; + { .mmf +#ifdef XDOUBLE + (p16) lfetch.nt1 [PRE1], INCX16 +#endif + (p16) LDFD f56 = [X], STRIDE + (p18) fabs f58 = f58 + } + { .mfb + (p18) FADD f8 = f8, f46 + } + ;; + { .mmf + (p16) LDFD f59 = [X], INCX + (p18) fabs f61 = f61 + } + { .mfb + (p18) FADD f9 = f9, f49 + } + ;; + { .mmf + (p16) LDFD f62 = [X], STRIDE + (p18) fabs f64 = f64 + } + { .mfb + (p18) FADD f10 = f10, f52 + } + ;; + { .mmf + (p16) LDFD f65 = [X], INCX + (p18) fabs f67 = f67 + } + { .mfb + (p18) FADD f11 = f11, f55 + } + ;; + { .mmf + (p16) LDFD f68 = [X], STRIDE + (p18) fabs f70 = f70 + } + { .mfb + (p18) FADD f8 = f8, f58 + } + ;; + { .mmf + (p16) LDFD f71 = [X], INCX + (p18) fabs f73 = f73 + } + { .mfb + (p18) FADD f9 = f9, f61 + } + ;; + { .mmf + (p16) LDFD f74 = [X], STRIDE + (p18) fabs f76 = f76 + } + { .mfb + (p18) FADD f10 = f10, f64 + } + ;; + { .mmf + (p16) LDFD f77 = [X], INCX + (p18) fabs f79 = f79 + } + { .mfb + (p18) FADD f11 = f11, f67 + br.ctop.sptk.few .L52 + } + ;; + FADD f8 = f8, f71 + FADD f9 = f9, f74 + FADD f10 = f10, f77 + FADD f11 = f11, f80 + .align 32 + ;; +.L55: + (p12) LDFD f32 = [X], STRIDE + (p9) br.cond.dptk .L998 + ;; + (p12) LDFD f33 = [X], INCX + ;; + (p12) LDFD f34 = [X], STRIDE + ;; + (p12) LDFD f35 = [X], INCX + tbit.z p0, p13 = N, (2 - COMPADD) + ;; + (p12) LDFD f36 = [X], STRIDE + tbit.z p0, p14 = N, (1 - COMPADD) + ;; + (p12) LDFD f37 = [X], INCX +#ifndef COMPLEX + tbit.z p0, p15 = N, 0 +#endif + ;; + (p12) LDFD f38 = [X], STRIDE + (p12) fabs f32 = f32 + ;; + (p12) LDFD f39 = [X], INCX + (p12) fabs f33 = f33 + ;; + (p13) LDFD f40 = [X], STRIDE + (p12) fabs f34 = f34 + ;; + (p13) LDFD f41 = [X], INCX + (p12) fabs f35 = f35 + ;; + (p13) LDFD f42 = [X], STRIDE + (p12) fabs f36 = f36 + (p12) FADD f8 = f8, f32 + ;; + (p13) LDFD f43 = [X], INCX + (p12) fabs f37 = f37 + (p12) FADD f9 = f9, f33 + ;; + (p14) LDFD f44 = [X], STRIDE + (p12) fabs f38 = f38 + (p12) FADD f10 = f10, f34 + ;; + (p14) LDFD f45 = [X], INCX + (p12) fabs f39 = f39 + (p12) FADD f11 = f11, f35 + ;; +#ifndef COMPLEX + (p15) LDFD f46 = [X] +#endif + (p13) fabs f40 = f40 + (p12) FADD f8 = f8, f36 + ;; + (p13) fabs f41 = f41 + (p12) FADD f9 = f9, f37 + (p13) fabs f42 = f42 + (p12) FADD f10 = f10, f38 + (p13) fabs f43 = f43 + (p12) FADD f11 = f11, f39 + ;; + (p14) fabs f44 = f44 + (p13) FADD f8 = f8, f40 + (p14) fabs f45 = f45 + (p13) FADD f9 = f9, f41 +#ifndef COMPLEX + (p15) fabs f46 = f46 +#endif + (p13) FADD f10 = f10, f42 + ;; + (p13) FADD f11 = f11, f43 + (p14) FADD f8 = f8, f44 + (p14) FADD f9 = f9, f45 +#ifndef COMPLEX + (p15) FADD f10 = f10, f46 +#endif + ;; + .align 32 + +.L998: + { .mfi + FADD f8 = f8, f9 + mov ar.lc = ARLC + } + { .mmf + FADD f10 = f10, f11 + } + ;; + { .mii + mov pr = PR, -65474 + } + ;; + { .mfb + FADD f8 = f8, f10 + br.ret.sptk.many b0 + } + EPILOGUE diff --git a/kernel/ia64/cabs.S b/kernel/ia64/cabs.S new file mode 100644 index 0000000..834b1bd --- /dev/null +++ b/kernel/ia64/cabs.S @@ -0,0 +1,58 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + PROFCODE + + .prologue + .body + LDFD f8 = [r32], SIZE + ;; + LDFD f6 = [r32] + ;; + fabs f8 = f8 + fabs f6 = f6 + ;; + FADD f8 = f6, f8 + br.ret.sptk.many b0 + + EPILOGUE + diff --git a/kernel/ia64/caxpy.S b/kernel/ia64/caxpy.S new file mode 100644 index 0000000..0a28ebe --- /dev/null +++ b/kernel/ia64/caxpy.S @@ -0,0 +1,519 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCH_SIZE (32 * 16) + +#ifndef CONJ +#define FMA1 FNMA +#define FMA2 FMA +#else +#define FMA1 FMA +#define FMA2 FNMA +#endif + +#define SP r12 + +#define N r32 +#define X1 r37 +#define INCX r38 +#define Y1 r39 +#define INCY r36 + +#define PREX1 r2 +#define PREY1 r3 + +#define I r33 +#define J r34 +#define Y2 r35 +#define X2 r14 +#define YY1 r15 +#define YY2 r16 +#define YY3 r17 +#define YY4 r18 + +#define INCXM1 r19 +#define INCYM1 r20 +#define INCX3M1 r21 +#define INCY3M1 r22 +#define INCX7M1 r23 +#define INCY7M1 r24 + +#define X3 r8 +#define Y3 r9 +#define X4 r10 +#define Y4 r11 +#define INCX8 r25 +#define INCY8 r26 + +#define ARLC r29 +#define PR r30 + +#define ALPHA_R f8 +#define ALPHA_I f9 + + PROLOGUE + .prologue + PROFCODE + + { .mmi + adds r14 = 16, SP + and J = 7, N + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mib + cmp.gt p15, p0 = r0, N + shr I = N, 3 + (p15) br.ret.sptk.many b0 + } + ;; + { .mmi + ld8 INCY = [r14] + nop __LINE__ + mov PR = pr + } + { .mmi + adds PREX1 = (PREFETCH_SIZE + 0) * SIZE, X1 + adds PREY1 = (PREFETCH_SIZE + 0) * SIZE, Y1 + shl INCX = INCX, ZBASE_SHIFT + } + ;; + { .mii + adds I = -1, I + mov pr.rot= 0 + shl INCY = INCY, ZBASE_SHIFT + } + ;; + { .mmi + adds INCXM1 = -SIZE, INCX + adds INCYM1 = -SIZE, INCY + mov ar.ec = 3 + } + { .mmi + shladd X2 = INCX, 1, X1 + shladd Y2 = INCY, 1, Y1 + cmp.eq p16, p0 = r0, r0 + } + ;; + { .mmi + shladd INCX3M1 = INCX, 1, INCXM1 + shladd INCY3M1 = INCY, 1, INCYM1 + shladd INCX8 = INCX, 3, r0 + } + { .mmi + shladd X3 = INCX, 1, X2 + shladd Y3 = INCY, 1, Y2 + shladd INCY8 = INCY, 3, r0 + } + ;; + { .mmi + shladd X4 = INCX, 1, X3 + shladd Y4 = INCY, 1, Y3 + shladd INCX7M1 = INCX, 2, INCX3M1 + } + { .mmi + mov YY1 = Y1 + mov YY2 = Y2 + shladd INCY7M1 = INCY, 2, INCY3M1 + } + ;; + { .mmi + mov YY3 = Y3 + mov YY4 = Y4 + mov ar.lc = I + } + { .mib + cmp.eq p11 ,p0 = -1, I + tbit.z p0, p13 = N, 2 + (p11) br.cond.dpnt .L25 + } + ;; + .align 32 + +.L22: + { .mmf + (p19) STFD [YY3] = f14 + (p19) STFD [YY4] = f15 + (p18) FMA2 f14 = ALPHA_R, f64, f112 + } + { .mmf + (p16) LDFD f80 = [Y1], 1 * SIZE + (p16) LDFD f92 = [Y2], 1 * SIZE + (p18) FMA2 f15 = ALPHA_R, f76, f124 + } + ;; + { .mmf + (p16) lfetch.excl.nt1 [PREY1], INCY8 + (p16) LDFD f104 = [Y3], 1 * SIZE + (p18) FMA1 f6 = ALPHA_I, f40, f6 + } + { .mmf + (p16) LDFD f116 = [Y4], 1 * SIZE + nop __LINE__ + (p18) FMA1 f7 = ALPHA_I, f52, f7 + } + ;; + { .mmf + (p16) LDFD f86 = [Y1], INCYM1 + (p16) LDFD f98 = [Y2], INCYM1 + (p18) FMA1 f10 = ALPHA_I, f64, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p18) FMA1 f11 = ALPHA_I, f76, f11 + } + ;; + { .mmf + (p16) LDFD f110 = [Y3], INCYM1 + (p16) LDFD f122 = [Y4], INCYM1 + (p18) FMA f12 = ALPHA_I, f34, f12 + } + { .mmf + (p19) add YY1 = YY1, INCY7M1 + (p19) add YY2 = YY2, INCY7M1 + (p18) FMA f13 = ALPHA_I, f46, f13 + } + ;; + { .mmf + (p16) LDFD f32 = [X1], 1 * SIZE + (p16) LDFD f44 = [X2], 1 * SIZE + (p18) FMA f14 = ALPHA_I, f58, f14 + } + { .mmf + (p19) add YY3 = YY3, INCY7M1 + (p19) add YY4 = YY4, INCY7M1 + (p18) FMA f15 = ALPHA_I, f70, f15 + } + ;; + { .mmf + (p18) STFD [YY1] = f6, 1 * SIZE + (p18) STFD [YY2] = f7, 1 * SIZE + (p18) FMA f6 = ALPHA_R, f37, f85 + } + { .mmf + (p16) LDFD f56 = [X3], 1 * SIZE + (p16) LDFD f68 = [X4], 1 * SIZE + (p18) FMA f7 = ALPHA_R, f49, f97 + } + ;; + { .mmf + (p18) STFD [YY3] = f10, 1 * SIZE + (p18) STFD [YY4] = f11, 1 * SIZE + (p18) FMA f10 = ALPHA_R, f61, f109 + } + { .mmf + (p16) LDFD f38 = [X1], INCXM1 + (p16) LDFD f50 = [X2], INCXM1 + (p18) FMA f11 = ALPHA_R, f73, f121 + } + ;; + { .mmf + (p18) STFD [YY1] = f12 + (p18) STFD [YY2] = f13 + (p18) FMA2 f12 = ALPHA_R, f43, f91 + } + { .mmf + (p16) LDFD f62 = [X3], INCXM1 + (p16) LDFD f74 = [X4], INCXM1 + (p18) FMA2 f13 = ALPHA_R, f55, f103 + } + ;; + { .mmf + (p18) STFD [YY3] = f14 + (p18) STFD [YY4] = f15 + (p18) FMA2 f14 = ALPHA_R, f67, f115 + } + { .mmf + (p16) LDFD f83 = [Y1], 1 * SIZE + (p16) LDFD f95 = [Y2], 1 * SIZE + (p18) FMA2 f15 = ALPHA_R, f79, f127 + } + ;; + { .mmf + (p16) LDFD f107 = [Y3], 1 * SIZE + (p16) LDFD f119 = [Y4], 1 * SIZE + (p18) FMA1 f6 = ALPHA_I, f43, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p18) FMA1 f7 = ALPHA_I, f55, f7 + } + ;; + { .mmf + (p16) LDFD f89 = [Y1], INCY7M1 + (p16) LDFD f101 = [Y2], INCY7M1 + (p18) FMA1 f10 = ALPHA_I, f67, f10 + } + { .mmf + (p18) add YY1 = YY1, INCYM1 + (p18) add YY2 = YY2, INCYM1 + (p18) FMA1 f11 = ALPHA_I, f79, f11 + } + ;; + { .mmf + (p16) LDFD f113 = [Y3], INCY7M1 + (p16) LDFD f125 = [Y4], INCY7M1 + (p18) FMA f12 = ALPHA_I, f37, f12 + } + { .mmf + (p18) add YY3 = YY3, INCYM1 + (p18) add YY4 = YY4, INCYM1 + (p18) FMA f13 = ALPHA_I, f49, f13 + } + ;; + { .mmf + (p16) LDFD f35 = [X1], 1 * SIZE + (p16) LDFD f47 = [X2], 1 * SIZE + (p18) FMA f14 = ALPHA_I, f61, f14 + } + { .mmf + (p16) LDFD f59 = [X3], 1 * SIZE + (p16) LDFD f71 = [X4], 1 * SIZE + (p18) FMA f15 = ALPHA_I, f73, f15 + } + ;; + { .mmf + (p18) STFD [YY1] = f6, 1 * SIZE + (p18) STFD [YY2] = f7, 1 * SIZE + (p17) FMA f6 = ALPHA_R, f33, f81 + } + { .mmf + (p16) LDFD f41 = [X1], INCX7M1 + (p16) LDFD f53 = [X2], INCX7M1 + (p17) FMA f7 = ALPHA_R, f45, f93 + } + ;; + { .mmf + (p18) STFD [YY3] = f10, 1 * SIZE + (p18) STFD [YY4] = f11, 1 * SIZE + (p17) FMA f10 = ALPHA_R, f57, f105 + } + { .mmf + (p16) LDFD f65 = [X3], INCX7M1 + (p16) LDFD f77 = [X4], INCX7M1 + (p17) FMA f11 = ALPHA_R, f69, f117 + } + ;; + { .mmf + (p18) STFD [YY1] = f12 + (p18) STFD [YY2] = f13 + (p17) FMA2 f12 = ALPHA_R, f39, f87 + } + { .mfb + (p16) lfetch.nt1 [PREX1], INCX8 + (p17) FMA2 f13 = ALPHA_R, f51, f99 + br.ctop.sptk.few .L22 + } + ;; + (p19) add YY1 = YY1, INCY7M1 + (p19) add YY2 = YY2, INCY7M1 + ;; + { .mmf + (p19) STFD [YY3] = f14 + (p19) STFD [YY4] = f15 + } + { .mmf + (p19) add YY3 = YY3, INCY7M1 + (p19) add YY4 = YY4, INCY7M1 + } + ;; + .align 32 + +.L25: + { .mmi + (p13) LDFD f32 = [X1], 1 * SIZE + (p13) LDFD f36 = [X2], 1 * SIZE + mov ar.lc = ARLC + } + ;; + { .mmi + (p13) LDFD f80 = [Y1], 1 * SIZE + (p13) LDFD f84 = [Y2], 1 * SIZE + mov pr = PR, -65474 + } + ;; + { .mmi + (p13) LDFD f33 = [X1], INCXM1 + (p13) LDFD f37 = [X2], INCXM1 + cmp.eq p12, p0 = r0, J + } + ;; + { .mmb + (p13) LDFD f81 = [Y1], INCYM1 + (p13) LDFD f85 = [Y2], INCYM1 + (p12) br.ret.sptk.many b0 + } + ;; + { .mmi + (p13) LDFD f34 = [X1], 1 * SIZE + (p13) LDFD f38 = [X2], 1 * SIZE + tbit.z p0, p14 = N, 1 + } + ;; + { .mmi + (p13) LDFD f82 = [Y1], 1 * SIZE + (p13) LDFD f86 = [Y2], 1 * SIZE + tbit.z p0, p15 = N, 0 + } + ;; + { .mmf + (p13) LDFD f35 = [X1], INCX3M1 + (p13) LDFD f39 = [X2], INCX3M1 + (p13) FMA f80 = ALPHA_R, f32, f80 + } + ;; + { .mmf + (p13) LDFD f83 = [Y1], INCY3M1 + (p13) LDFD f87 = [Y2], INCY3M1 + (p13) FMA f84 = ALPHA_R, f36, f84 + } + ;; + { .mmf + (p14) LDFD f40 = [X1], 1 * SIZE + (p14) LDFD f88 = [Y1], 1 * SIZE + (p13) FMA2 f81 = ALPHA_R, f33, f81 + } + ;; + { .mmf + (p14) LDFD f41 = [X1], INCXM1 + (p14) LDFD f89 = [Y1], INCYM1 + (p13) FMA2 f85 = ALPHA_R, f37, f85 + } + ;; + { .mmf + (p14) LDFD f42 = [X1], 1 * SIZE + (p14) LDFD f90 = [Y1], 1 * SIZE + (p13) FMA f82 = ALPHA_R, f34, f82 + } + ;; + { .mmf + (p14) LDFD f43 = [X1], INCXM1 + (p14) LDFD f91 = [Y1], INCYM1 + (p13) FMA f86 = ALPHA_R, f38, f86 + } + ;; + { .mmf + (p15) LDFD f44 = [X1], 1 * SIZE + (p15) LDFD f92 = [Y1], 1 * SIZE + (p13) FMA2 f83 = ALPHA_R, f35, f83 + } + ;; + { .mmf + (p15) LDFD f45 = [X1] + (p15) LDFD f93 = [Y1] + (p13) FMA2 f87 = ALPHA_R, f39, f87 + } + ;; + (p13) FMA1 f80 = ALPHA_I, f33, f80 + (p13) FMA1 f84 = ALPHA_I, f37, f84 + (p13) FMA f81 = ALPHA_I, f32, f81 + (p13) FMA f85 = ALPHA_I, f36, f85 + (p13) FMA1 f82 = ALPHA_I, f35, f82 + (p13) FMA1 f86 = ALPHA_I, f39, f86 + (p13) FMA f83 = ALPHA_I, f34, f83 + (p13) FMA f87 = ALPHA_I, f38, f87 + ;; + { .mmf + (p13) STFD [YY1] = f80, 1 * SIZE + (p13) STFD [YY2] = f84, 1 * SIZE + (p14) FMA f88 = ALPHA_R, f40, f88 + } + ;; + { .mmf + (p13) STFD [YY1] = f81 + (p13) STFD [YY2] = f85 + (p14) FMA2 f89 = ALPHA_R, f41, f89 + } + { .mmf + (p13) add YY1 = YY1, INCYM1 + (p13) add YY2 = YY2, INCYM1 + (p14) FMA f90 = ALPHA_R, f42, f90 + } + ;; + { .mmf + (p13) STFD [YY1] = f82, 1 * SIZE + (p13) STFD [YY2] = f86, 1 * SIZE + (p14) FMA2 f91 = ALPHA_R, f43, f91 + } + ;; + { .mmf + (p13) STFD [YY1] = f83 + (p13) STFD [YY2] = f87 + (p15) FMA f92 = ALPHA_R, f44, f92 + } + { .mmf + (p13) add YY1 = YY1, INCY3M1 + nop __LINE__ + (p15) FMA2 f93 = ALPHA_R, f45, f93 + } + ;; + (p14) FMA1 f88 = ALPHA_I, f41, f88 + (p14) FMA f89 = ALPHA_I, f40, f89 + (p14) FMA1 f90 = ALPHA_I, f43, f90 + (p14) FMA f91 = ALPHA_I, f42, f91 + ;; + { .mmf + (p14) STFD [YY1] = f88, 1 * SIZE + (p15) FMA1 f92 = ALPHA_I, f45, f92 + } + ;; + { .mmf + (p14) STFD [YY1] = f89 + (p14) add YY1 = YY1, INCYM1 + (p15) FMA f93 = ALPHA_I, f44, f93 + } + ;; + (p14) STFD [YY1] = f90, 1 * SIZE + ;; + (p14) STFD [YY1] = f91 + (p14) add YY1 = YY1, INCYM1 + ;; + (p15) STFD [YY1] = f92, 1 * SIZE + ;; + { .mmb + (p15) STFD [YY1] = f93 + nop __LINE__ + br.ret.sptk.many b0 + } + ;; + EPILOGUE diff --git a/kernel/ia64/copy.S b/kernel/ia64/copy.S new file mode 100644 index 0000000..b5d7f48 --- /dev/null +++ b/kernel/ia64/copy.S @@ -0,0 +1,873 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r32 +#define X1 r33 +#define INCX r34 +#define Y1 r35 +#define INCY r36 + +#define PREA r2 +#define PREB r3 + +#define I r14 +#define J r15 + +#define X2 r16 +#define Y2 r17 +#define INCX3 r18 +#define INCY3 r19 +#define INCX5 r20 +#define INCY5 r21 +#define INCX16 r22 +#define INCY16 r23 +#define XX r24 +#define YY r25 +#define XA r26 +#define YA r27 +#define PR r30 +#define ARLC r31 + +#ifdef DOUBLE +#define PREFETCH_SIZE (4 * 32) +#else +#define PREFETCH_SIZE (4 * 64) +#endif + + PROLOGUE + .prologue + PROFCODE + { .mmi + shladd INCX = INCX, BASE_SHIFT, r0 + shladd INCY = INCY, BASE_SHIFT, r0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mib + cmp.lt p0, p6 = r0, N + tbit.z p0, p7 = X1, BASE_SHIFT + (p6) br.ret.sptk.many b0 + } + ;; + .body + { .mmi + sub XA = Y1, X1 + (p7) LDFD f32 = [X1], INCX + mov PR = pr + } + { .mmi + mov YY = Y1 + (p7) adds N = -1, N + (p7) add Y1 = Y1, INCY + } + ;; + { .mmi + shladd INCX5 = INCX, 2, INCX + shladd INCY5 = INCY, 2, INCY + mov pr.rot = 0 + } + { .mmi + mov XX = X1 + nop.m 0 + shr.u XA = XA, BASE_SHIFT + } + ;; + { .mmi + and J = 15, N + cmp.eq p16, p0 = r0, r0 + shr I = N, 4 + } + { .mmb + cmp.ne p6, p0 = SIZE, INCX +#ifdef DOUBLE + adds XA = 2, XA +#else + nop.m 0 +#endif + (p6) br.cond.dpnt .L100 + } + ;; +/* INCX == 1 */ + { .mmi + shladd INCX16 = INCX, 4, r0 + shladd INCY16 = INCY, 4, r0 + tbit.z p0, p12 = N, 3 + } + { .mmi +#ifdef DOUBLE + and XA = 31, XA +#else + and XA = 63, XA +#endif + adds I = -1, I + tbit.z p0, p13 = N, 2 + } + ;; + { .mmi + shladd X2 = INCX, 2, X1 + shladd Y2 = INCY, 2, Y1 + mov ar.lc = I + } + { .mib +#ifdef DOUBLE + cmp.gt p8, p0 = 15, XA +#else + cmp.gt p8, p0 = 30, XA +#endif + cmp.eq p9, p0 = r0, J + (p8)br.cond.dpnt .L30 + } + ;; + { .mmi + (p7) STFD [YY] = f32 + cmp.gt p8 ,p0 = r0, I + mov ar.ec = 5 + } + { .mmb + adds PREA = PREFETCH_SIZE * SIZE + 32, X1 +#ifdef DOUBLE + adds PREB = PREFETCH_SIZE * SIZE + 32, Y1 +#else + adds PREB = PREFETCH_SIZE * SIZE - 40, Y1 +#endif + (p8) br.cond.dpnt .L25 + } + ;; + .align 32 + +.L22: + { .mmi + (p20) STFD [Y1] = f36 + (p20) STFD [Y2] = f56 + (p20) add Y1 = INCY, Y1 + } + { .mmi + (p16) lfetch.nt1 [PREA], INCX16 + (p16) LDFPD f32, f37 = [X1], 2 * SIZE + (p20) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p20) STFD [Y1] = f41 + (p20) STFD [Y2] = f61 + (p20) add Y1 = INCY, Y1 + } + { .mmi + (p16) lfetch.excl.nt1 [PREB], INCY16 + (p16) LDFPD f42, f47 = [X1], 2 * SIZE + (p20) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p20) STFD [Y1] = f46 + (p20) STFD [Y2] = f66 + (p20) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFPD f52, f57 = [X1], 2 * SIZE + nop.m 0 + (p20) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p20) STFD [Y1] = f51 + (p20) STFD [Y2] = f71 + (p20) add Y1 = INCY5, Y1 + } + { .mmi + (p16) LDFPD f62, f67 = [X1], 2 * SIZE + nop.m 0 + (p20) add Y2 = INCY5, Y2 + } + ;; + { .mmi + (p20) STFD [Y1] = f76 + (p20) STFD [Y2] = f96 + (p16) adds XX = 8 * SIZE, X1 + } + { .mmi + (p16) LDFPD f72, f77 = [X1], 2 * SIZE + (p20) add Y1 = INCY, Y1 + (p20) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p20) STFD [Y1] = f81 + (p20) STFD [Y2] = f101 + (p20) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFPD f82, f87 = [X1], 2 * SIZE + nop.m 0 + (p20) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p20) STFD [Y1] = f86 + (p20) STFD [Y2] = f106 + (p16) shladd X2 = INCX, 2, XX + } + { .mmi + (p16) LDFPD f92, f97 = [X1], 2 * SIZE + (p20) add Y1 = INCY, Y1 + (p20) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p20) STFD [Y1] = f91 + (p20) STFD [Y2] = f111 + (p20) add Y1 = INCY5, Y1 + } + { .mmb + (p16) LDFPD f102, f107 = [X1], 2 * SIZE + (p20) add Y2 = INCY5, Y2 + br.ctop.sptk.few .L22 + } + ;; + .align 32 +.L25: + { .mmi + (p12) LDFPD f48, f49 = [X1], 2 * SIZE + (p12) LDFPD f52, f53 = [X2], 2 * SIZE + mov ar.lc = ARLC + } + { .mmi + (p12) adds XX = 8 * SIZE, XX + nop.m 0 + tbit.z p0, p14 = N, 1 + } + ;; + { .mmi + (p12) LDFPD f50, f51 = [X1] + (p12) LDFPD f54, f55 = [X2] + mov pr = PR, -65474 + } + { .mmb + (p12) adds X1 = 6 * SIZE, X1 + (p13) adds XX = 4 * SIZE, XX + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p13) LDFPD f56, f57 = [X1], 2 * SIZE + (p14) LDFPD f60, f61 = [XX], 2 * SIZE + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p13) LDFPD f58, f59 = [X1], 2 * SIZE + (p15) LDFD f62 = [XX] + nop.i 0 + } + ;; + { .mmi + (p12) STFD [Y1] = f48 + (p12) STFD [Y2] = f52 + mov YY = Y1 + } + { .mmi + (p12) add Y1 = INCY, Y1 + (p12) add Y2 = INCY, Y2 + nop.i 0 + } + ;; + { .mmi + (p12) STFD [Y1] = f49 + (p12) STFD [Y2] = f53 + (p12) add Y1 = INCY, Y1 + } + { .mmi + (p12) add Y2 = INCY, Y2 + (p12) shladd YY = INCY, 3, YY + nop.i 0 + } + ;; + { .mmi + (p12) STFD [Y1] = f50 + (p12) STFD [Y2] = f54 + (p12) add Y1 = INCY, Y1 + } + { .mmi + (p12) add Y2 = INCY, Y2 + (p13) shladd YY = INCY, 2, YY + nop.i 0 + } + ;; + { .mmi + (p12) STFD [Y1] = f51 + (p12) STFD [Y2] = f55 + (p12) add Y1 = INCY5, Y1 + } + { .mmi + (p12) add Y2 = INCY5, Y2 + nop.m 0 + nop.i 0 + } + ;; + { .mmi + (p13) STFD [Y1] = f56 + (p14) STFD [YY] = f60 + (p13) add Y1 = INCY, Y1 + } + { .mmi + (p14) add YY = INCY, YY + nop.m 0 + nop.i 0 + } + ;; + { .mmi + (p13) STFD [Y1] = f57 + (p14) STFD [YY] = f61 + (p13) add Y1 = INCY, Y1 + } + { .mmi + (p14) add YY = INCY, YY + nop.m 0 + nop.i 0 + } + ;; + { .mmi + (p13) STFD [Y1] = f58 + (p15) STFD [YY] = f62 + (p13) add Y1 = INCY, Y1 + } + ;; + { .mmb + (p13) STFD [Y1] = f59 + nop.m 0 + br.ret.sptk.many b0 + } + .align 32 + ;; +.L30: + { .mmi + (p7) STFD [YY] = f32 + cmp.gt p8 ,p0 = r0, I + mov ar.ec = 4 + } + { .mmb + adds PREA = PREFETCH_SIZE * SIZE + 24, X1 +#ifdef DOUBLE + adds PREB = PREFETCH_SIZE * SIZE + 64, Y1 +#else + adds PREB = PREFETCH_SIZE * SIZE + 72, Y1 +#endif + (p8) br.cond.dpnt .L35 + } + ;; + .align 32 +.L32: + { .mmi + (p19) STFD [Y1] = f35 + (p19) STFD [Y2] = f55 + (p19) add Y1 = INCY, Y1 + } + { .mmi + (p16) lfetch.nt1 [PREA], INCX16 + (p16) LDFPD f32, f37 = [X1], 2 * SIZE + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f40 + (p19) STFD [Y2] = f60 + (p19) add Y1 = INCY, Y1 + } + { .mmi + (p16) lfetch.excl.nt1 [PREB], INCY16 + (p16) LDFPD f42, f47 = [X1], 2 * SIZE + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f45 + (p19) STFD [Y2] = f65 + (p19) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFPD f52, f57 = [X1], 2 * SIZE + nop.m 0 + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f50 + (p19) STFD [Y2] = f70 + (p19) add Y1 = INCY5, Y1 + } + { .mmi + (p16) LDFPD f62, f67 = [X1], 2 * SIZE + nop.m 0 + (p19) add Y2 = INCY5, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f75 + (p19) STFD [Y2] = f95 + (p16) adds XX = 8 * SIZE, X1 + } + { .mmi + (p16) LDFPD f72, f77 = [X1], 2 * SIZE + (p19) add Y1 = INCY, Y1 + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f80 + (p19) STFD [Y2] = f100 + (p19) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFPD f82, f87 = [X1], 2 * SIZE + nop.m 0 + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f85 + (p19) STFD [Y2] = f105 + (p16) shladd X2 = INCX, 2, XX + } + { .mmi + (p16) LDFPD f92, f97 = [X1], 2 * SIZE + (p19) add Y1 = INCY, Y1 + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f90 + (p19) STFD [Y2] = f110 + (p19) add Y1 = INCY5, Y1 + } + { .mmb + (p16) LDFPD f102, f107 = [X1], 2 * SIZE + (p19) add Y2 = INCY5, Y2 + br.ctop.sptk.few .L32 + } + ;; + .align 32 +.L35: + { .mmi + (p12) LDFPD f48, f49 = [X1], 2 * SIZE + (p12) LDFPD f52, f53 = [X2], 2 * SIZE + mov ar.lc = ARLC + } + { .mmi + (p12) adds XX = 8 * SIZE, XX + nop.m 0 + tbit.z p0, p14 = N, 1 + } + ;; + { .mmi + (p12) LDFPD f50, f51 = [X1] + (p12) LDFPD f54, f55 = [X2] + mov pr = PR, -65474 + } + { .mmi + (p12) adds X1 = 6 * SIZE, X1 + (p12) adds X2 = 6 * SIZE, X2 + (p13) adds XX = 4 * SIZE, XX + } + ;; + { .mmi + (p13) LDFPD f56, f57 = [X1], 2 * SIZE + (p14) LDFPD f60, f61 = [XX], 2 * SIZE + tbit.z p0, p15 = N, 0 + } + ;; + { .mmb + (p13) LDFPD f58, f59 = [X1], 2 * SIZE + (p15) LDFD f62 = [XX] + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) STFD [Y1] = f48 + (p12) STFD [Y2] = f52 + mov YY = Y1 + } + { .mmi + (p12) add Y1 = INCY, Y1 + (p12) add Y2 = INCY, Y2 + nop.i 0 + } + ;; + { .mmi + (p12) STFD [Y1] = f49 + (p12) STFD [Y2] = f53 + (p12) add Y1 = INCY, Y1 + } + { .mmi + (p12) add Y2 = INCY, Y2 + (p12) shladd YY = INCY, 3, YY + nop.i 0 + } + ;; + { .mmi + (p12) STFD [Y1] = f50 + (p12) STFD [Y2] = f54 + (p12) add Y1 = INCY, Y1 + } + { .mmi + (p12) add Y2 = INCY, Y2 + (p13) shladd YY = INCY, 2, YY + nop.i 0 + } + ;; + { .mmi + (p12) STFD [Y1] = f51 + (p12) STFD [Y2] = f55 + nop.i 0 + } + { .mmi + (p12) add Y1 = INCY5, Y1 + (p12) add Y2 = INCY5, Y2 + nop.i 0 + } + ;; + { .mmi + (p13) STFD [Y1] = f56 + (p14) STFD [YY] = f60 + nop.i 0 + } + { .mmi + (p13) add Y1 = INCY, Y1 + (p14) add YY = INCY, YY + nop.i 0 + } + ;; + { .mmi + (p13) STFD [Y1] = f57 + (p14) STFD [YY] = f61 + nop.i 0 + } + { .mmi + (p13) add Y1 = INCY, Y1 + (p14) add YY = INCY, YY + nop.i 0 + } + ;; + { .mmi + (p13) STFD [Y1] = f58 + (p15) STFD [YY] = f62 + (p13) add Y1 = INCY, Y1 + } + ;; + { .mib + (p13) STFD [Y1] = f59 + nop.i 0 + br.ret.sptk.many b0 + } + .align 32 + ;; + + /* INCX != 1 */ +.L100: + { .mmi + shladd INCX16 = INCX, 4, r0 + shladd INCY16 = INCY, 4, r0 + tbit.z p0, p12 = N, 3 + } + { .mmi + nop.m 0 + nop.m 0 + nop.i 0 + } + ;; + { .mmi + adds PREA = PREFETCH_SIZE * SIZE, X1 + adds PREB = PREFETCH_SIZE * SIZE, Y1 + mov ar.ec = 6 + } + { .mmi + cmp.eq p8 ,p0 = r0, I + cmp.eq p9, p0 = r0, J + adds I = -1, I + } + ;; + { .mmi + (p7) STFD [YY] = f32 + shladd X2 = INCX, 2, X1 + mov ar.lc = I + } + { .mib + shladd Y2 = INCY, 2, Y1 + cmp.eq p16, p0 = r0, r0 + (p8) br.cond.dpnt .L120 + } + ;; + .align 32 + +.L110: + { .mmi + (p21) STFD [Y1] = f37 + (p21) STFD [Y2] = f61 + (p21) add Y1 = INCY, Y1 + } + { .mmi + (p16) lfetch.nt1 [PREA], INCX16 + (p16) lfetch.excl.nt1 [PREB], INCY16 + (p21) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f43 + (p21) STFD [Y2] = f67 + (p21) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFD f56 = [X2], INCX + (p16) LDFD f32 = [X1], INCX + (p21) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f49 + (p21) STFD [Y2] = f73 + (p21) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFD f38 = [X1], INCX + (p16) LDFD f62 = [X2], INCX + (p21) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f55 + (p21) STFD [Y2] = f79 + (p21) add Y1 = INCY5, Y1 + } + { .mmi + (p16) LDFD f44 = [X1], INCX + (p16) LDFD f68 = [X2], INCX + (p21) add Y2 = INCY5, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f85 + (p21) STFD [Y2] = f109 + (p21) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFD f50 = [X1], INCX5 + (p16) LDFD f74 = [X2], INCX5 + (p21) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f91 + (p21) STFD [Y2] = f115 + (p21) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFD f80 = [X1], INCX + (p16) LDFD f104 = [X2], INCX + (p21) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f97 + (p21) STFD [Y2] = f121 + (p21) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFD f86 = [X1], INCX + (p16) LDFD f110 = [X2], INCX + (p21) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f103 + (p21) STFD [Y2] = f127 + (p21) add Y1 = INCY5, Y1 + } + { .mmi + (p16) LDFD f92 = [X1], INCX + (p16) LDFD f116 = [X2], INCX + (p21) add Y2 = INCY5, Y2 + } + ;; + { .mmi + nop.m 0 + (p16) add XX = INCX5, X1 + nop.i 0 + } + { .mmb + (p16) LDFD f98 = [X1], INCX5 + (p16) LDFD f122 = [X2], INCX5 + br.ctop.sptk.few .L110 + } + ;; + .align 32 + +.L120: + { .mmi + (p12) LDFD f48 = [X1], INCX + (p12) LDFD f52 = [X2], INCX + mov ar.lc = ARLC + } + ;; + { .mmi + (p12) LDFD f49 = [X1], INCX + (p12) LDFD f53 = [X2], INCX + mov pr = PR, -65474 + } + ;; + { .mmi + (p12) LDFD f50 = [X1], INCX + (p12) LDFD f54 = [X2], INCX + tbit.z p0, p13 = N, 2 + } + { .mmb + nop.m 0 + nop.m 0 + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFD f51 = [X1], INCX5 + (p12) LDFD f55 = [X2], INCX5 + (p12) shladd XX = INCX, 3, XX + } + ;; + { .mmi + (p13) LDFD f56 = [X1], INCX + (p13) shladd XX = INCX, 2, XX + tbit.z p0, p14 = N, 1 + } + ;; + { .mmi + (p13) LDFD f57 = [X1], INCX + (p14) LDFD f60 = [XX], INCX + } + ;; + { .mmi + (p13) LDFD f58 = [X1], INCX + (p14) LDFD f61 = [XX], INCX + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p13) LDFD f59 = [X1], INCX + (p15) LDFD f62 = [XX] + mov YY = Y1 + } + ;; + { .mmi + (p12) STFD [Y1] = f48 + (p12) STFD [Y2] = f52 + nop.i 0 + } + { .mmi + (p12) add Y1 = INCY, Y1 + (p12) add Y2 = INCY, Y2 + nop.i 0 + } + ;; + { .mmi + (p12) STFD [Y1] = f49 + (p12) STFD [Y2] = f53 + nop.i 0 + } + { .mmi + (p12) add Y1 = INCY, Y1 + (p12) add Y2 = INCY, Y2 + nop.i 0 + } + ;; + { .mmi + (p12) STFD [Y1] = f50 + (p12) STFD [Y2] = f54 + nop.i 0 + } + { .mmi + (p12) add Y1 = INCY, Y1 + (p12) add Y2 = INCY, Y2 + nop.i 0 + } + ;; + { .mmi + (p12) STFD [Y1] = f51 + (p12) STFD [Y2] = f55 + (p12) add Y1 = INCY5, Y1 + } + { .mmi + (p12) add Y2 = INCY5, Y2 + (p12) shladd YY = INCY, 3, YY + nop.i 0 + } + ;; + { .mmi + (p13) STFD [Y1] = f56 + (p13) add Y1 = INCY, Y1 + (p13) shladd YY =INCY, 2, YY + } + ;; + { .mmi + (p13) STFD [Y1] = f57 + (p14) STFD [YY] = f60 + nop.i 0 + } + { .mmi + (p13) add Y1 = INCY, Y1 + (p14) add YY = INCY, YY + nop.i 0 + } + ;; + { .mmi + (p13) STFD [Y1] = f58 + (p14) STFD [YY] = f61 + nop.i 0 + } + { .mmi + (p13) add Y1 = INCY, Y1 + (p14) add YY = INCY, YY + nop.i 0 + } + ;; + { .mmb + (p13) STFD [Y1] = f59 + (p15) STFD [YY] = f62 + br.ret.sptk.many b0 + } + ;; + EPILOGUE + diff --git a/kernel/ia64/daxpy.S b/kernel/ia64/daxpy.S new file mode 100644 index 0000000..b971df6 --- /dev/null +++ b/kernel/ia64/daxpy.S @@ -0,0 +1,1504 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE (16 * 16) + +#define N r32 +#define X1 r36 +#define INCX r37 +#define Y1 r38 +#define INCY r39 + +#define PREX r2 +#define PREY r3 + +#define I r14 +#define J r15 +#define X2 r16 +#define Y2 r17 +#define YY1 r18 +#define YY2 r19 +#define INCX16 r20 +#define INCY16 r21 +#define X3 r26 +#define YY r27 +#define PR r30 +#define ARLC r31 + +#define ALPHA f8 + + PROLOGUE + PROFCODE + .prologue + { .mmi + shladd INCX = INCX, BASE_SHIFT, r0 + shladd INCY = INCY, BASE_SHIFT, r0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mib + cmp.lt p0, p6 = r0, N + tbit.nz p10, p0 = Y1, BASE_SHIFT + (p6) br.ret.dpnt.many b0 + } + ;; + .body + { .mmi + (p10) LDFD f32 = [X1], INCX + (p10) LDFD f33 = [Y1] + mov PR = pr + } + { .mmi + (p10) adds N = -1, N + mov YY = Y1 + (p10) add Y1 = Y1, INCY + } + ;; + { .mmi + mov YY1 = Y1 + shladd YY2 = INCY, 1, Y1 + mov pr.rot= 0 + } + { .mmi + sub r8 = X1, Y1 + mov r9 = 0xf0 + nop __LINE__ + } + ;; + { .mmi + cmp.ne p6, p0 = SIZE, INCX + cmp.ne p7, p0 = SIZE, INCY + tbit.nz p8, p0 = X1, BASE_SHIFT + } + { .mbb + and J = 15, N + (p6) br.cond.dpnt .L100 + (p7) br.cond.dpnt .L100 + } + ;; + { .mfi + cmp.eq p16, p0 = r0, r0 + (p10) FMA f9 = ALPHA, f32, f33 + shr I = N, 4 + } + { .mmb + add X3 = X1, INCX + and r8 = r9, r8 + (p8) br.cond.dpnt.many .L30 + } + ;; + { .mmi + cmp.eq p11, p0 = r0, J + adds I = -1, I + mov ar.ec = 3 + } + { .mib + cmp.lt p9, p0 = 127, r8 + tbit.nz p12, p0 = N, 3 + (p9) br.cond.dpnt.many .L20 + } + ;; + { .mmi + (p10) STFD [YY] = f9 + cmp.eq p7 ,p0 = -1, I + mov ar.lc = I + } + { .mib + adds PREX = (PREFETCHSIZE + 2) * SIZE, X1 + adds PREY = (PREFETCHSIZE + 2) * SIZE, Y1 + (p7) br.cond.dpnt .L15 + } + ;; + .align 32 + +.L12: + { .mmf + (p18) STFD [YY1] = f6, 1 * SIZE + (p18) STFD [YY2] = f7, 1 * SIZE + (p18) FMA f6 = ALPHA, f58, f106 + } + { .mmf + (p16) lfetch.fault.nt1 [PREX], 16 * SIZE + (p16) LDFPD f32, f35 = [X1], 2 * SIZE + (p18) FMA f7 = ALPHA, f64, f112 + } + ;; + { .mmf + (p18) STFD [YY1] = f10, 3 * SIZE + (p18) STFD [YY2] = f11, 3 * SIZE + (p18) FMA f10 = ALPHA, f61, f109 + } + { .mmf + (p16) LDFPD f38, f41 = [X1], 2 * SIZE + (p16) LDFPD f80, f83 = [Y1], 2 * SIZE + (p18) FMA f11 = ALPHA, f67, f115 + } + ;; + { .mmf + (p18) STFD [YY1] = f12, 1 * SIZE + (p18) STFD [YY2] = f13, 1 * SIZE + (p18) FMA f12 = ALPHA, f70, f118 + } + { .mmf + (p16) LDFPD f44, f47 = [X1], 2 * SIZE + (p16) LDFPD f86, f89 = [Y1], 2 * SIZE + (p18) FMA f13 = ALPHA, f76, f124 + } + ;; + { .mmf + (p18) STFD [YY1] = f14, 3 * SIZE + (p18) STFD [YY2] = f15, 3 * SIZE + (p18) FMA f14 = ALPHA, f73, f121 + } + { .mmf + (p16) LDFPD f50, f53 = [X1], 2 * SIZE + (p16) LDFPD f92, f95 = [Y1], 2 * SIZE + (p18) FMA f15 = ALPHA, f79, f127 + } + ;; + { .mmf + (p18) STFD [YY1] = f6, 1 * SIZE + (p18) STFD [YY2] = f7, 1 * SIZE + (p17) FMA f6 = ALPHA, f33, f81 + } + { .mmf + (p16) LDFPD f56, f59 = [X1], 2 * SIZE + (p16) LDFPD f98, f101 = [Y1], 2 * SIZE + (p17) FMA f7 = ALPHA, f39, f87 + } + ;; + { .mmf + (p18) STFD [YY1] = f10, 3 * SIZE + (p18) STFD [YY2] = f11, 3 * SIZE + (p17) FMA f10 = ALPHA, f36, f84 + } + { .mmf + (p16) LDFPD f62, f65 = [X1], 2 * SIZE + (p16) LDFPD f104, f107 = [Y1], 2 * SIZE + (p17) FMA f11 = ALPHA, f42, f90 + } + ;; + { .mmf + (p18) STFD [YY1] = f12, 1 * SIZE + (p18) STFD [YY2] = f13, 1 * SIZE + (p17) FMA f12 = ALPHA, f45, f93 + } + { .mmf + (p16) LDFPD f68, f71 = [X1], 2 * SIZE + (p16) LDFPD f110, f113 = [Y1], 2 * SIZE + (p17) FMA f13 = ALPHA, f51, f99 + } + ;; + { .mmf + (p18) STFD [YY1] = f14, 3 * SIZE + (p18) STFD [YY2] = f15, 3 * SIZE + (p17) FMA f14 = ALPHA, f48, f96 + } + { .mmf + (p16) LDFPD f74, f77 = [X1], 2 * SIZE + (p16) LDFPD f116, f119 = [Y1], 2 * SIZE + (p17) FMA f15 = ALPHA, f54, f102 + } + ;; + { .mmi + (p16) lfetch.fault.excl.nt1 [PREY], 16 * SIZE + (p16) LDFPD f122, f125 = [Y1], 2 * SIZE + nop __LINE__ + } + { .mmb + nop __LINE__ + nop __LINE__ + br.ctop.sptk.few .L12 + } + ;; + .align 32 + +.L15: + { .mmi + (p12) LDFPD f32, f33 = [X1], 2 * SIZE + (p12) LDFPD f34, f35 = [Y1], 2 * SIZE + mov pr = PR, -65474 + } + ;; + { .mmi + (p12) LDFPD f36, f37 = [X1], 2 * SIZE + (p12) LDFPD f38, f39 = [Y1], 2 * SIZE + mov ar.lc = ARLC + } + ;; + { .mmb + (p12) LDFPD f40, f41 = [X1], 2 * SIZE + (p12) LDFPD f42, f43 = [Y1], 2 * SIZE + (p11) br.ret.dpnt.many b0 + } + ;; + { .mmi + (p12) LDFPD f44, f45 = [X1], 2 * SIZE + (p12) LDFPD f46, f47 = [Y1], 2 * SIZE + tbit.nz p13, p0 = N, 2 + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [X1], 2 * SIZE + (p13) LDFPD f50, f51 = [Y1], 2 * SIZE + tbit.nz p14, p0 = N, 1 + } + ;; + { .mmi + (p13) LDFPD f52, f53 = [X1], 2 * SIZE + (p13) LDFPD f54, f55 = [Y1], 2 * SIZE + tbit.nz p15, p0 = N, 0 + } + ;; + { .mmi + (p14) LDFPD f56, f57 = [X1], 2 * SIZE + (p14) LDFPD f58, f59 = [Y1], 2 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p15) LDFD f60 = [X1] + (p15) LDFD f61 = [Y1] + nop __LINE__ + } + ;; + (p12) FMA f6 = ALPHA, f32, f34 + (p12) FMA f7 = ALPHA, f36, f38 + (p12) FMA f10 = ALPHA, f33, f35 + (p12) FMA f11 = ALPHA, f37, f39 + (p12) FMA f12 = ALPHA, f40, f42 + (p12) FMA f13 = ALPHA, f44, f46 + (p12) FMA f14 = ALPHA, f41, f43 + (p12) FMA f15 = ALPHA, f45, f47 + ;; + { .mmf + (p12) STFD [YY1] = f6, 1 * SIZE + (p12) STFD [YY2] = f7, 1 * SIZE + (p13) FMA f6 = ALPHA, f48, f50 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f7 = ALPHA, f52, f54 + } + ;; + { .mmf + (p12) STFD [YY1] = f10, 3 * SIZE + (p12) STFD [YY2] = f11, 3 * SIZE + (p13) FMA f10 = ALPHA, f49, f51 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f11 = ALPHA, f53, f55 + } + ;; + { .mmf + (p12) STFD [YY1] = f12, 1 * SIZE + (p12) STFD [YY2] = f13, 1 * SIZE + (p14) FMA f12 = ALPHA, f56, f58 + } + ;; + { .mmf + (p12) STFD [YY1] = f14, 3 * SIZE + (p12) STFD [YY2] = f15, 3 * SIZE + (p14) FMA f13 = ALPHA, f57, f59 + } + ;; + { .mmf + (p13) STFD [YY1] = f6, 1 * SIZE + (p13) STFD [YY2] = f7, 1 * SIZE + (p15) FMA f14 = ALPHA, f60, f61 + } + ;; + { .mmi + (p13) STFD [YY1] = f10, 3 * SIZE + (p13) STFD [YY2] = f11, 3 * SIZE + } + ;; + { .mmi + (p14) STFD [YY1] = f12, 1 * SIZE + ;; + (p14) STFD [YY1] = f13, 1 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p15) STFD [YY1] = f14 + nop __LINE__ + br.ret.sptk.many b0 + } + ;; + .align 32 + +.L20: + { .mmi + (p10) STFD [YY] = f9 + cmp.eq p7 ,p0 = -1, I + mov ar.lc = I + } + { .mib + adds PREX = (PREFETCHSIZE - 4) * SIZE, X1 + adds PREY = (PREFETCHSIZE + 2) * SIZE, Y1 + (p7) br.cond.dpnt .L25 + } + ;; + .align 32 + +.L22: + { .mmf + (p18) STFD [YY1] = f6, 1 * SIZE + (p18) STFD [YY2] = f7, 1 * SIZE + (p18) FMA f6 = ALPHA, f58, f106 + } + { .mmf + (p16) lfetch.fault.nt1 [PREX], 16 * SIZE + (p17) LDFPD f57, f60 = [X1], 2 * SIZE + (p18) FMA f7 = ALPHA, f64, f112 + } + ;; + { .mmf + (p18) STFD [YY1] = f10, 3 * SIZE + (p18) STFD [YY2] = f11, 3 * SIZE + (p18) FMA f10 = ALPHA, f61, f109 + } + { .mmf + (p16) lfetch.fault.excl.nt1 [PREY], 16 * SIZE + (p16) LDFPD f80, f83 = [Y1], 2 * SIZE + (p18) FMA f11 = ALPHA, f67, f115 + } + ;; + { .mmf + (p18) STFD [YY1] = f12, 1 * SIZE + (p18) STFD [YY2] = f13, 1 * SIZE + (p18) FMA f12 = ALPHA, f70, f118 + } + { .mmf + (p17) LDFPD f63, f66 = [X1], 2 * SIZE + (p16) LDFPD f86, f89 = [Y1], 2 * SIZE + (p18) FMA f13 = ALPHA, f76, f124 + } + ;; + { .mmf + (p18) STFD [YY1] = f14, 3 * SIZE + (p18) STFD [YY2] = f15, 3 * SIZE + (p18) FMA f14 = ALPHA, f73, f121 + } + { .mmf + (p17) LDFPD f69, f72 = [X1], 2 * SIZE + (p16) LDFPD f92, f95 = [Y1], 2 * SIZE + (p18) FMA f15 = ALPHA, f79, f127 + } + ;; + { .mmf + (p18) STFD [YY1] = f6, 1 * SIZE + (p18) STFD [YY2] = f7, 1 * SIZE + (p17) FMA f6 = ALPHA, f33, f81 + } + { .mmf + (p17) LDFPD f75, f78 = [X1], 2 * SIZE + (p16) LDFPD f98, f101 = [Y1], 2 * SIZE + (p17) FMA f7 = ALPHA, f39, f87 + } + ;; + { .mmf + (p18) STFD [YY1] = f10, 3 * SIZE + (p18) STFD [YY2] = f11, 3 * SIZE + (p17) FMA f10 = ALPHA, f36, f84 + } + { .mmf + (p16) LDFPD f32, f35 = [X1], 2 * SIZE + (p16) LDFPD f104, f107 = [Y1], 2 * SIZE + (p17) FMA f11 = ALPHA, f42, f90 + } + ;; + { .mmf + (p18) STFD [YY1] = f12, 1 * SIZE + (p18) STFD [YY2] = f13, 1 * SIZE + (p17) FMA f12 = ALPHA, f45, f93 + } + { .mmf + (p16) LDFPD f38, f41 = [X1], 2 * SIZE + (p16) LDFPD f110, f113 = [Y1], 2 * SIZE + (p17) FMA f13 = ALPHA, f51, f99 + } + ;; + { .mmf + (p18) STFD [YY1] = f14, 3 * SIZE + (p18) STFD [YY2] = f15, 3 * SIZE + (p17) FMA f14 = ALPHA, f48, f96 + } + { .mmf + (p16) LDFPD f44, f47 = [X1], 2 * SIZE + (p16) LDFPD f116, f119 = [Y1], 2 * SIZE + (p17) FMA f15 = ALPHA, f54, f102 + } + ;; + { .mmi + (p16) LDFPD f50, f53 = [X1], 2 * SIZE + (p16) LDFPD f122, f125 = [Y1], 2 * SIZE + nop __LINE__ + } + { .mmb + nop __LINE__ + nop __LINE__ + br.ctop.sptk.few .L22 + } + ;; + .align 32 + +.L25: + { .mmi + (p12) LDFPD f32, f33 = [X1], 2 * SIZE + (p12) LDFPD f34, f35 = [Y1], 2 * SIZE + mov pr = PR, -65474 + } + ;; + { .mmi + (p12) LDFPD f36, f37 = [X1], 2 * SIZE + (p12) LDFPD f38, f39 = [Y1], 2 * SIZE + mov ar.lc = ARLC + } + ;; + { .mmb + (p12) LDFPD f40, f41 = [X1], 2 * SIZE + (p12) LDFPD f42, f43 = [Y1], 2 * SIZE + (p11) br.ret.dpnt.many b0 + } + ;; + { .mmi + (p12) LDFPD f44, f45 = [X1], 2 * SIZE + (p12) LDFPD f46, f47 = [Y1], 2 * SIZE + tbit.nz p13, p0 = N, 2 + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [X1], 2 * SIZE + (p13) LDFPD f50, f51 = [Y1], 2 * SIZE + tbit.nz p14, p0 = N, 1 + } + ;; + { .mmi + (p13) LDFPD f52, f53 = [X1], 2 * SIZE + (p13) LDFPD f54, f55 = [Y1], 2 * SIZE + tbit.nz p15, p0 = N, 0 + } + ;; + { .mmi + (p14) LDFPD f56, f57 = [X1], 2 * SIZE + (p14) LDFPD f58, f59 = [Y1], 2 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p15) LDFD f60 = [X1] + (p15) LDFD f61 = [Y1] + nop __LINE__ + } + ;; + (p12) FMA f6 = ALPHA, f32, f34 + (p12) FMA f7 = ALPHA, f36, f38 + (p12) FMA f10 = ALPHA, f33, f35 + (p12) FMA f11 = ALPHA, f37, f39 + (p12) FMA f12 = ALPHA, f40, f42 + (p12) FMA f13 = ALPHA, f44, f46 + (p12) FMA f14 = ALPHA, f41, f43 + (p12) FMA f15 = ALPHA, f45, f47 + ;; + { .mmf + (p12) STFD [YY1] = f6, 1 * SIZE + (p12) STFD [YY2] = f7, 1 * SIZE + (p13) FMA f6 = ALPHA, f48, f50 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f7 = ALPHA, f52, f54 + } + ;; + { .mmf + (p12) STFD [YY1] = f10, 3 * SIZE + (p12) STFD [YY2] = f11, 3 * SIZE + (p13) FMA f10 = ALPHA, f49, f51 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f11 = ALPHA, f53, f55 + } + ;; + { .mmf + (p12) STFD [YY1] = f12, 1 * SIZE + (p12) STFD [YY2] = f13, 1 * SIZE + (p14) FMA f12 = ALPHA, f56, f58 + } + ;; + { .mmf + (p12) STFD [YY1] = f14, 3 * SIZE + (p12) STFD [YY2] = f15, 3 * SIZE + (p14) FMA f13 = ALPHA, f57, f59 + } + ;; + { .mmf + (p13) STFD [YY1] = f6, 1 * SIZE + (p13) STFD [YY2] = f7, 1 * SIZE + (p15) FMA f14 = ALPHA, f60, f61 + } + ;; + { .mmi + (p13) STFD [YY1] = f10, 3 * SIZE + (p13) STFD [YY2] = f11, 3 * SIZE + } + ;; + { .mmi + (p14) STFD [YY1] = f12, 1 * SIZE + ;; + (p14) STFD [YY1] = f13, 1 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p15) STFD [YY1] = f14 + nop __LINE__ + br.ret.sptk.many b0 + } + ;; + .align 32 + +.L30: + { .mmi + cmp.eq p11, p0 = r0, J + adds I = -1, I + mov ar.ec = 3 + } + { .mib + cmp.lt p9, p0 = 127, r8 + tbit.nz p12, p0 = N, 3 + (p9) br.cond.dptk.many .L40 + } + ;; + { .mmi + (p10) STFD [YY] = f9 + cmp.eq p7 ,p0 = -1, I + mov ar.lc = I + } + { .mib + adds PREX = (PREFETCHSIZE + 2) * SIZE, X1 + adds PREY = (PREFETCHSIZE + 2) * SIZE, Y1 + (p7) br.cond.dpnt .L35 + } + ;; + .align 32 + +.L32: + { .mmf + (p18) STFD [YY1] = f6, 1 * SIZE + (p18) STFD [YY2] = f7, 1 * SIZE + (p18) FMA f6 = ALPHA, f58, f106 + } + { .mmf + (p16) lfetch.fault.nt1 [PREX], 16 * SIZE + (p16) LDFD f32 = [X1], 1 * SIZE + (p18) FMA f7 = ALPHA, f64, f112 + } + ;; + { .mmf + (p18) STFD [YY1] = f10, 3 * SIZE + (p18) STFD [YY2] = f11, 3 * SIZE + (p18) FMA f10 = ALPHA, f61, f109 + } + { .mmf + (p16) LDFPD f35, f38 = [X1], 2 * SIZE + (p16) LDFPD f80, f83 = [Y1], 2 * SIZE + (p18) FMA f11 = ALPHA, f67, f115 + } + ;; + { .mmf + (p18) STFD [YY1] = f12, 1 * SIZE + (p18) STFD [YY2] = f13, 1 * SIZE + (p18) FMA f12 = ALPHA, f70, f118 + } + { .mmf + (p16) LDFPD f41, f44 = [X1], 2 * SIZE + (p16) LDFPD f86, f89 = [Y1], 2 * SIZE + (p18) FMA f13 = ALPHA, f76, f124 + } + ;; + { .mmf + (p18) STFD [YY1] = f14, 3 * SIZE + (p18) STFD [YY2] = f15, 3 * SIZE + (p18) FMA f14 = ALPHA, f73, f121 + } + { .mmf + (p16) LDFPD f47, f50 = [X1], 2 * SIZE + (p16) LDFPD f92, f95 = [Y1], 2 * SIZE + (p18) FMA f15 = ALPHA, f79, f127 + } + ;; + { .mmf + (p18) STFD [YY1] = f6, 1 * SIZE + (p18) STFD [YY2] = f7, 1 * SIZE + (p17) FMA f6 = ALPHA, f33, f81 + } + { .mmf + (p16) LDFPD f53, f56 = [X1], 2 * SIZE + (p16) LDFPD f98, f101 = [Y1], 2 * SIZE + (p17) FMA f7 = ALPHA, f39, f87 + } + ;; + { .mmf + (p18) STFD [YY1] = f10, 3 * SIZE + (p18) STFD [YY2] = f11, 3 * SIZE + (p17) FMA f10 = ALPHA, f36, f84 + } + { .mmf + (p16) LDFPD f59, f62 = [X1], 2 * SIZE + (p16) LDFPD f104, f107 = [Y1], 2 * SIZE + (p17) FMA f11 = ALPHA, f42, f90 + } + ;; + { .mmf + (p18) STFD [YY1] = f12, 1 * SIZE + (p18) STFD [YY2] = f13, 1 * SIZE + (p17) FMA f12 = ALPHA, f45, f93 + } + { .mmf + (p16) LDFPD f65, f68 = [X1], 2 * SIZE + (p16) LDFPD f110, f113 = [Y1], 2 * SIZE + (p17) FMA f13 = ALPHA, f51, f99 + } + ;; + { .mmf + (p18) STFD [YY1] = f14, 3 * SIZE + (p18) STFD [YY2] = f15, 3 * SIZE + (p17) FMA f14 = ALPHA, f48, f96 + } + { .mmf + (p16) LDFPD f71, f74 = [X1], 2 * SIZE + (p16) LDFPD f116, f119 = [Y1], 2 * SIZE + (p17) FMA f15 = ALPHA, f54, f102 + } + ;; + { .mmi + (p16) lfetch.fault.excl.nt1 [PREY], 16 * SIZE + (p16) LDFPD f122, f125 = [Y1], 2 * SIZE + adds X3 = 1 * SIZE, X1 + } + { .mmb + (p16) LDFD f77 = [X1], 1 * SIZE + nop __LINE__ + br.ctop.sptk.few .L32 + } + ;; + .align 32 + +.L35: + { .mmi + (p12) LDFPD f33, f36 = [X3] + (p12) LDFPD f34, f35 = [Y1], 2 * SIZE + mov pr = PR, -65474 + } + { .mmi + (p12) LDFD f32 = [X1], 3 * SIZE + (p12) adds X3 = 8 * SIZE, X3 + nop __LINE__ + } + ;; + { .mmi + (p12) LDFPD f37, f40 = [X1], 2 * SIZE + (p12) LDFPD f38, f39 = [Y1], 2 * SIZE + mov ar.lc = ARLC + } + ;; + { .mmb + (p12) LDFPD f41, f44 = [X1], 2 * SIZE + (p12) LDFPD f42, f43 = [Y1], 2 * SIZE + (p11) br.ret.dpnt.many b0 + } + ;; + { .mmi + (p12) LDFD f45 = [X1], 1 * SIZE + (p12) LDFPD f46, f47 = [Y1], 2 * SIZE + tbit.nz p13, p0 = N, 2 + } + ;; + { .mmi + (p13) LDFPD f49, f52 = [X3] + (p13) LDFPD f50, f51 = [Y1], 2 * SIZE + tbit.nz p14, p0 = N, 1 + } + { .mmi + (p13) LDFD f48 = [X1], 3 * SIZE + (p13) adds X3 = 4 * SIZE, X3 + nop __LINE__ + } + ;; + { .mmi + (p13) LDFD f53 = [X1], 1 * SIZE + (p13) LDFPD f54, f55 = [Y1], 2 * SIZE + tbit.nz p15, p0 = N, 0 + } + ;; + { .mmi + (p14) LDFD f56 = [X1], 2 * SIZE + (p14) LDFPD f58, f59 = [Y1], 2 * SIZE + nop __LINE__ + } + { .mmi + (p14) LDFD f57 = [X3] + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p15) LDFD f60 = [X1] + (p15) LDFD f61 = [Y1] + nop __LINE__ + } + ;; + (p12) FMA f6 = ALPHA, f32, f34 + (p12) FMA f7 = ALPHA, f36, f38 + (p12) FMA f10 = ALPHA, f33, f35 + (p12) FMA f11 = ALPHA, f37, f39 + (p12) FMA f12 = ALPHA, f40, f42 + (p12) FMA f13 = ALPHA, f44, f46 + (p12) FMA f14 = ALPHA, f41, f43 + (p12) FMA f15 = ALPHA, f45, f47 + ;; + { .mmf + (p12) STFD [YY1] = f6, 1 * SIZE + (p12) STFD [YY2] = f7, 1 * SIZE + (p13) FMA f6 = ALPHA, f48, f50 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f7 = ALPHA, f52, f54 + } + ;; + { .mmf + (p12) STFD [YY1] = f10, 3 * SIZE + (p12) STFD [YY2] = f11, 3 * SIZE + (p13) FMA f10 = ALPHA, f49, f51 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f11 = ALPHA, f53, f55 + } + ;; + { .mmf + (p12) STFD [YY1] = f12, 1 * SIZE + (p12) STFD [YY2] = f13, 1 * SIZE + (p14) FMA f12 = ALPHA, f56, f58 + } + ;; + { .mmf + (p12) STFD [YY1] = f14, 3 * SIZE + (p12) STFD [YY2] = f15, 3 * SIZE + (p14) FMA f13 = ALPHA, f57, f59 + } + ;; + { .mmf + (p13) STFD [YY1] = f6, 1 * SIZE + (p13) STFD [YY2] = f7, 1 * SIZE + (p15) FMA f14 = ALPHA, f60, f61 + } + ;; + { .mmi + (p13) STFD [YY1] = f10, 3 * SIZE + (p13) STFD [YY2] = f11, 3 * SIZE + } + ;; + { .mmi + (p14) STFD [YY1] = f12, 1 * SIZE + ;; + (p14) STFD [YY1] = f13, 1 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p15) STFD [YY1] = f14 + nop __LINE__ + br.ret.sptk.many b0 + } + ;; + .align 32 + +.L40: + { .mmi + (p10) STFD [YY] = f9 + cmp.eq p7 ,p0 = -1, I + mov ar.lc = I + } + { .mib + adds PREX = (PREFETCHSIZE + 2) * SIZE, X1 + adds PREY = (PREFETCHSIZE + 8) * SIZE, Y1 + (p7) br.cond.dpnt .L45 + } + ;; + .align 32 + +.L42: + { .mmf + (p18) STFD [YY1] = f6, 1 * SIZE + (p18) STFD [YY2] = f7, 1 * SIZE + (p18) FMA f6 = ALPHA, f58, f106 + } + { .mmf + (p17) LDFPD f54, f57 = [X1], 2 * SIZE + (p16) LDFPD f80, f83 = [Y1], 2 * SIZE + (p18) FMA f7 = ALPHA, f64, f112 + } + ;; + { .mmf + (p18) STFD [YY1] = f10, 3 * SIZE + (p18) STFD [YY2] = f11, 3 * SIZE + (p18) FMA f64 = ALPHA, f61, f109 + } + { .mmf + (p17) LDFPD f60, f63 = [X1], 2 * SIZE + (p16) LDFPD f86, f89 = [Y1], 2 * SIZE + (p18) FMA f11 = ALPHA, f67, f115 + } + ;; + { .mmf + (p18) STFD [YY1] = f12, 1 * SIZE + (p18) STFD [YY2] = f13, 1 * SIZE + (p18) FMA f67 = ALPHA, f70, f118 + } + { .mmf + (p17) LDFPD f66, f69 = [X1], 2 * SIZE + (p16) LDFPD f92, f95 = [Y1], 2 * SIZE + (p18) FMA f13 = ALPHA, f76, f124 + } + ;; + { .mmf + (p18) STFD [YY1] = f14, 3 * SIZE + (p18) STFD [YY2] = f15, 3 * SIZE + (p18) FMA f9 = ALPHA, f73, f121 + } + { .mmf + (p17) LDFPD f72, f75 = [X1], 2 * SIZE + (p16) LDFPD f98, f101 = [Y1], 2 * SIZE + (p18) FMA f15 = ALPHA, f79, f127 + } + ;; + { .mmi + (p18) STFD [YY1] = f6, 1 * SIZE + (p18) STFD [YY2] = f7, 1 * SIZE + (p17) adds X3 = 2 * SIZE, X1 + } + { .mmf + (p16) LDFPD f104, f107 = [Y1], 2 * SIZE + (p17) LDFD f78 = [X1], 1 * SIZE + (p17) FMA f6 = ALPHA, f33, f81 + } + ;; + { .mmf + (p16) LDFPD f110, f113 = [Y1], 2 * SIZE + (p16) lfetch.fault.nt1 [PREX], 16 * SIZE + (p17) FMA f7 = ALPHA, f39, f87 + } + { .mmf + (p16) LDFD f32 = [X1], 1 * SIZE + (p17) FMA f10 = ALPHA, f36, f84 + } + ;; + { .mmf + (p18) STFD [YY1] = f64, 3 * SIZE + (p18) STFD [YY2] = f11, 3 * SIZE + (p17) FMA f11 = ALPHA, f42, f90 + } + { .mmf + (p16) LDFPD f35, f38 = [X1], 2 * SIZE + (p16) LDFPD f116, f119 = [Y1], 2 * SIZE + (p17) FMA f12 = ALPHA, f45, f93 + } + ;; + { .mmf + (p18) STFD [YY1] = f67, 1 * SIZE + (p18) STFD [YY2] = f13, 1 * SIZE + (p17) FMA f13 = ALPHA, f51, f99 + } + { .mmf + (p16) LDFPD f41, f44 = [X1], 2 * SIZE + (p16) LDFPD f122, f125 = [Y1], 2 * SIZE + (p17) FMA f14 = ALPHA, f48, f96 + } + ;; + { .mmf + (p18) STFD [YY1] = f9, 3 * SIZE + (p18) STFD [YY2] = f15, 3 * SIZE + (p17) FMA f15 = ALPHA, f54, f102 + } + { .mmb + (p16) lfetch.fault.excl.nt1 [PREY], 16 * SIZE + (p16) LDFPD f47, f50 = [X1], 2 * SIZE + br.ctop.sptk.few .L42 + } + ;; + .align 32 + +.L45: + { .mmi + (p12) LDFPD f33, f36 = [X3] + (p12) LDFPD f34, f35 = [Y1], 2 * SIZE + mov pr = PR, -65474 + } + { .mmi + (p12) LDFD f32 = [X1], 3 * SIZE + (p12) adds X3 = 8 * SIZE, X3 + nop __LINE__ + } + ;; + { .mmi + (p12) LDFPD f37, f40 = [X1], 2 * SIZE + (p12) LDFPD f38, f39 = [Y1], 2 * SIZE + mov ar.lc = ARLC + } + ;; + { .mmb + (p12) LDFPD f41, f44 = [X1], 2 * SIZE + (p12) LDFPD f42, f43 = [Y1], 2 * SIZE + (p11) br.ret.dpnt.many b0 + } + ;; + { .mmi + (p12) LDFD f45 = [X1], 1 * SIZE + (p12) LDFPD f46, f47 = [Y1], 2 * SIZE + tbit.nz p13, p0 = N, 2 + } + ;; + { .mmi + (p13) LDFPD f49, f52 = [X3] + (p13) LDFPD f50, f51 = [Y1], 2 * SIZE + tbit.nz p14, p0 = N, 1 + } + { .mmi + (p13) LDFD f48 = [X1], 3 * SIZE + (p13) adds X3 = 4 * SIZE, X3 + nop __LINE__ + } + ;; + { .mmi + (p13) LDFD f53 = [X1], 1 * SIZE + (p13) LDFPD f54, f55 = [Y1], 2 * SIZE + tbit.nz p15, p0 = N, 0 + } + ;; + { .mmi + (p14) LDFD f56 = [X1], 2 * SIZE + (p14) LDFPD f58, f59 = [Y1], 2 * SIZE + nop __LINE__ + } + { .mmi + (p14) LDFD f57 = [X3] + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p15) LDFD f60 = [X1] + (p15) LDFD f61 = [Y1] + nop __LINE__ + } + ;; + (p12) FMA f6 = ALPHA, f32, f34 + (p12) FMA f7 = ALPHA, f36, f38 + (p12) FMA f10 = ALPHA, f33, f35 + (p12) FMA f11 = ALPHA, f37, f39 + (p12) FMA f12 = ALPHA, f40, f42 + (p12) FMA f13 = ALPHA, f44, f46 + (p12) FMA f14 = ALPHA, f41, f43 + (p12) FMA f15 = ALPHA, f45, f47 + ;; + { .mmf + (p12) STFD [YY1] = f6, 1 * SIZE + (p12) STFD [YY2] = f7, 1 * SIZE + (p13) FMA f6 = ALPHA, f48, f50 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f7 = ALPHA, f52, f54 + } + ;; + { .mmf + (p12) STFD [YY1] = f10, 3 * SIZE + (p12) STFD [YY2] = f11, 3 * SIZE + (p13) FMA f10 = ALPHA, f49, f51 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f11 = ALPHA, f53, f55 + } + ;; + { .mmf + (p12) STFD [YY1] = f12, 1 * SIZE + (p12) STFD [YY2] = f13, 1 * SIZE + (p14) FMA f12 = ALPHA, f56, f58 + } + ;; + { .mmf + (p12) STFD [YY1] = f14, 3 * SIZE + (p12) STFD [YY2] = f15, 3 * SIZE + (p14) FMA f13 = ALPHA, f57, f59 + } + ;; + { .mmf + (p13) STFD [YY1] = f6, 1 * SIZE + (p13) STFD [YY2] = f7, 1 * SIZE + (p15) FMA f14 = ALPHA, f60, f61 + } + ;; + { .mmi + (p13) STFD [YY1] = f10, 3 * SIZE + (p13) STFD [YY2] = f11, 3 * SIZE + } + ;; + { .mmi + (p14) STFD [YY1] = f12, 1 * SIZE + ;; + (p14) STFD [YY1] = f13, 1 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p15) STFD [YY1] = f14 + nop __LINE__ + br.ret.sptk.many b0 + } + ;; + .align 32 + +.L100: + { .mfi + cmp.eq p16, p0 = r0, r0 + (p10) FMA f9 = ALPHA, f32, f33 + shr I = N, 4 + } + ;; + { .mmi + cmp.eq p11, p0 = r0, J + adds I = -1, I + mov ar.ec = 3 + } + { .mmi + shladd INCX16 = INCX, 4, r0 + shladd INCY16 = INCY, 4, r0 + tbit.nz p12, p0 = N, 3 + } + ;; + { .mmi + (p10) STFD [YY] = f9 + cmp.eq p7 ,p0 = -1, I + mov ar.lc = I + } + { .mib + adds PREX = (PREFETCHSIZE + 2) * SIZE, X1 + adds PREY = (PREFETCHSIZE + 2) * SIZE, Y1 + (p7) br.cond.dpnt .L115 + } + ;; + .align 32 + +.L112: + { .mmf + (p18) STFD [YY1] = f6 + (p16) lfetch.fault.nt1 [PREX], INCX16 + (p18) FMA f12 = ALPHA, f46, f94 + } + { .mmi + (p16) LDFD f32 = [X1], INCX + (p16) LDFD f80 = [Y1], INCY + (p18) add YY1 = YY1, INCY + } + ;; + { .mmf + (p18) STFD [YY1] = f7 + (p18) add YY1 = YY1, INCY + (p18) FMA f13 = ALPHA, f49, f97 + } + { .mmi + (p16) LDFD f35 = [X1], INCX + (p16) LDFD f83 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YY1] = f10 + (p18) add YY1 = YY1, INCY + (p18) FMA f14 = ALPHA, f52, f100 + } + { .mmi + (p16) LDFD f38 = [X1], INCX + (p16) LDFD f86 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YY1] = f11 + (p18) add YY1 = YY1, INCY + (p18) FMA f15 = ALPHA, f55, f103 + } + { .mmi + (p16) LDFD f41 = [X1], INCX + (p16) LDFD f89 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YY1] = f12 + (p18) add YY1 = YY1, INCY + (p18) FMA f6 = ALPHA, f58, f106 + } + { .mmi + (p16) LDFD f44 = [X1], INCX + (p16) LDFD f92 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YY1] = f13 + (p18) add YY1 = YY1, INCY + (p18) FMA f7 = ALPHA, f61, f109 + } + { .mmi + (p16) LDFD f47 = [X1], INCX + (p16) LDFD f95 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YY1] = f14 + (p18) add YY1 = YY1, INCY + (p18) FMA f10 = ALPHA, f64, f112 + } + { .mmi + (p16) LDFD f50 = [X1], INCX + (p16) LDFD f98 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YY1] = f15 + (p18) add YY1 = YY1, INCY + (p18) FMA f11 = ALPHA, f67, f115 + } + { .mmi + (p16) LDFD f53 = [X1], INCX + (p16) LDFD f101 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YY1] = f6 + (p16) lfetch.fault.excl.nt1 [PREY], INCY16 + (p18) FMA f12 = ALPHA, f70, f118 + } + { .mmi + (p16) LDFD f56 = [X1], INCX + (p16) LDFD f104 = [Y1], INCY + (p18) add YY1 = YY1, INCY + } + ;; + { .mmf + (p18) STFD [YY1] = f7 + (p18) add YY1 = YY1, INCY + (p18) FMA f13 = ALPHA, f73, f121 + } + { .mmi + (p16) LDFD f59 = [X1], INCX + (p16) LDFD f107 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YY1] = f10 + (p18) add YY1 = YY1, INCY + (p18) FMA f14 = ALPHA, f76, f124 + } + { .mmi + (p16) LDFD f62 = [X1], INCX + (p16) LDFD f110 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YY1] = f11 + (p18) add YY1 = YY1, INCY + (p18) FMA f15 = ALPHA, f79, f127 + } + { .mmi + (p16) LDFD f65 = [X1], INCX + (p16) LDFD f113 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YY1] = f12 + (p18) add YY1 = YY1, INCY + (p17) FMA f6 = ALPHA, f33, f81 + } + { .mmi + (p16) LDFD f68 = [X1], INCX + (p16) LDFD f116 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YY1] = f13 + (p18) add YY1 = YY1, INCY + (p17) FMA f7 = ALPHA, f36, f84 + } + { .mmi + (p16) LDFD f71 = [X1], INCX + (p16) LDFD f119 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YY1] = f14 + (p18) add YY1 = YY1, INCY + (p17) FMA f10 = ALPHA, f39, f87 + } + { .mmi + (p16) LDFD f74 = [X1], INCX + (p16) LDFD f122 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YY1] = f15 + (p18) add YY1 = YY1, INCY + (p17) FMA f11 = ALPHA, f42, f90 + } + { .mmb + (p16) LDFD f77 = [X1], INCX + (p16) LDFD f125 = [Y1], INCY + br.ctop.sptk.few .L112 + } + ;; + .align 32 + +.L115: + { .mmi + (p12) LDFD f32 = [X1], INCX + (p12) LDFD f34 = [Y1], INCY + mov pr = PR, -65474 + } + ;; + { .mmi + (p12) LDFD f33 = [X1], INCX + (p12) LDFD f35 = [Y1], INCY + mov ar.lc = ARLC + } + ;; + { .mmb + (p12) LDFD f36 = [X1], INCX + (p12) LDFD f38 = [Y1], INCY + (p11) br.ret.dpnt.many b0 + } + ;; + { .mmi + (p12) LDFD f37 = [X1], INCX + (p12) LDFD f39 = [Y1], INCY + tbit.nz p13, p0 = N, 2 + } + ;; + { .mmi + (p12) LDFD f40 = [X1], INCX + (p12) LDFD f42 = [Y1], INCY + tbit.nz p14, p0 = N, 1 + } + ;; + { .mmi + (p12) LDFD f41 = [X1], INCX + (p12) LDFD f43 = [Y1], INCY + tbit.nz p15, p0 = N, 0 + } + ;; + { .mmf + (p12) LDFD f44 = [X1], INCX + (p12) LDFD f46 = [Y1], INCY + (p12) FMA f6 = ALPHA, f32, f34 + } + ;; + { .mmf + (p12) LDFD f45 = [X1], INCX + (p12) LDFD f47 = [Y1], INCY + (p12) FMA f7 = ALPHA, f33, f35 + } + ;; + { .mmf + (p13) LDFD f48 = [X1], INCX + (p13) LDFD f50 = [Y1], INCY + (p12) FMA f10 = ALPHA, f36, f38 + } + ;; + { .mmf + (p13) LDFD f49 = [X1], INCX + (p13) LDFD f51 = [Y1], INCY + (p12) FMA f11 = ALPHA, f37, f39 + } + ;; + { .mmf + (p12) STFD [YY1] = f6 + (p12) add YY1 = YY1, INCY + (p12) FMA f12 = ALPHA, f40, f42 + } + { .mmi + (p13) LDFD f52 = [X1], INCX + (p13) LDFD f54 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p12) STFD [YY1] = f7 + (p12) add YY1 = YY1, INCY + (p12) FMA f13 = ALPHA, f41, f43 + } + { .mmi + (p13) LDFD f53 = [X1], INCX + (p13) LDFD f55 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p12) STFD [YY1] = f10 + (p12) add YY1 = YY1, INCY + (p12) FMA f14 = ALPHA, f44, f46 + } + { .mmi + (p14) LDFD f56 = [X1], INCX + (p14) LDFD f58 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p12) STFD [YY1] = f11 + (p12) add YY1 = YY1, INCY + (p12) FMA f15 = ALPHA, f45, f47 + } + { .mmi + (p14) LDFD f57 = [X1], INCX + (p14) LDFD f59 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p12) STFD [YY1] = f12 + (p12) add YY1 = YY1, INCY + (p13) FMA f6 = ALPHA, f48, f50 + } + { .mmi + (p15) LDFD f60 = [X1], INCX + (p15) LDFD f61 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p12) STFD [YY1] = f13 + (p12) add YY1 = YY1, INCY + (p13) FMA f7 = ALPHA, f49, f51 + } + ;; + { .mmf + (p12) STFD [YY1] = f14 + (p12) add YY1 = YY1, INCY + (p13) FMA f10 = ALPHA, f52, f54 + } + ;; + { .mmf + (p12) STFD [YY1] = f15 + (p12) add YY1 = YY1, INCY + (p13) FMA f11 = ALPHA, f53, f55 + } + ;; + ;; + { .mmf + (p13) STFD [YY1] = f6 + (p13) add YY1 = YY1, INCY + (p14) FMA f12 = ALPHA, f56, f58 + } + ;; + { .mmf + (p13) STFD [YY1] = f7 + (p13) add YY1 = YY1, INCY + (p14) FMA f13 = ALPHA, f57, f59 + } + ;; + { .mmf + (p13) STFD [YY1] = f10 + (p13) add YY1 = YY1, INCY + (p15) FMA f14 = ALPHA, f60, f61 + } + ;; + { .mmi + (p13) STFD [YY1] = f11 + (p13) add YY1 = YY1, INCY + nop __LINE__ + } + ;; + { .mmi + (p14) STFD [YY1] = f12 + (p14) add YY1 = YY1, INCY + nop __LINE__ + } + ;; + { .mmi + (p14) STFD [YY1] = f13 + (p14) add YY1 = YY1, INCY + nop __LINE__ + } + ;; + { .mmb + (p15) STFD [YY1] = f14 + nop __LINE__ + br.ret.sptk.many b0 + } + ;; + + + EPILOGUE + diff --git a/kernel/ia64/ddot.S b/kernel/ia64/ddot.S new file mode 100644 index 0000000..082c303 --- /dev/null +++ b/kernel/ia64/ddot.S @@ -0,0 +1,1184 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCH_SIZE (16 * 16 + 2) + +#define N r32 +#define X1 r33 +#define INCX r34 +#define Y1 r35 +#define INCY r36 + +#define PREX r2 +#define PREY r3 + +#define I r14 +#define J r15 +#define Y2 r16 +#define X2 r17 +#define INCX16 r18 +#define INCY16 r19 +#define INCX3 r20 +#define INCY3 r21 +#define YY r22 +#define XA r23 +#define YA r24 +#define XX r25 + +#define PR r30 +#define ARLC r31 + + PROLOGUE + .prologue + PROFCODE + { .mfi + nop.m 0 + mov f8 = f0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mfi + mov r26 = 1 + mov f9 = f0 + shr XA = X1, 4 + } + ;; + .body +#ifdef F_INTERFACE + LDINT N = [N] + LDINT INCX = [INCX] + LDINT INCY = [INCY] + ;; +#ifndef USE64BITINT + sxt4 N = N + sxt4 INCX = INCX + sxt4 INCY = INCY + ;; +#endif + + cmp.le p0, p6 = r0, INCX + cmp.le p0, p7 = r0, INCY + sub r26 = r26, N + ;; + setf.sig f32 = r26 + setf.sig f33 = INCX + setf.sig f34 = INCY + ;; + xmpy.l f33 = f32, f33 + xmpy.l f34 = f32, f34 + ;; + getf.sig r26 = f33 + getf.sig r27 = f34 + ;; + (p6) shladd X1 = r26, BASE_SHIFT, X1 + (p7) shladd Y1 = r27, BASE_SHIFT, Y1 + ;; +#endif + { .mfi + shladd INCX = INCX, BASE_SHIFT, r0 + mov f32 = f0 + mov PR = pr + } + { .mfb + cmp.lt p0, p6 = r0, N + mov f80 = f0 + (p6) br.ret.sptk.many b0 + } + ;; + { .mfi + shladd INCY = INCY, BASE_SHIFT, r0 + mov f10 = f0 + tbit.nz p15, p0 = X1, BASE_SHIFT + } + { .mfb + cmp.ne p6, p0 = SIZE, INCX + mov f11 = f0 + (p6) br.cond.dptk .L100 + } + ;; + { .mfi + (p15) LDFD f32 = [X1], INCX + mov f12 = f0 + mov pr.rot= 0 + } + { .mfi + (p15) adds N = -1, N + mov f13 = f0 + shr YA = Y1, 4 + } + ;; + { .mfi + (p15) LDFD f80 = [Y1], INCY + mov f14 = f0 + shr I = N, 4 + } + { .mmi + and J = 15, N + and XA = 0xf, XA + and YA = 0xf, YA + } + ;; + { .mmi + shladd INCX3 = INCX, 1, INCX + shladd INCY3 = INCY, 1, INCY + sub XA = YA, XA + } + { .mmi + shladd INCX16 = INCX, 4, r0 + shladd INCY16 = INCY, 4, r0 + tbit.z p0, p12 = N, 3 + } + ;; + { .mmi + shladd Y2 = INCY, 1, Y1 + cmp.eq p7, p0 = r0, J + mov ar.ec= 3 + } + { .mmi + adds I = -1, I + cmp.ge p8, p0 = 2, XA + cmp.eq p16, p0 = r0, r0 + } + ;; + { .mbb + cmp.le p9, p0 = 12, XA + (p8) br.cond.dpnt .L20 + (p9) br.cond.dpnt .L20 + } + ;; + { .mmi + adds PREX = PREFETCH_SIZE * SIZE, X1 + adds PREY = (PREFETCH_SIZE + 3) * SIZE, Y1 + mov ar.lc = I + } + { .mfb + cmp.eq p6 ,p0 = -1, I + FMA f15 = f32, f80, f0 + (p6) br.cond.dpnt .L15 + } + ;; + .align 32 + +/* INCX == 1 && X is aligned */ +.L12: + { .mmf + (p16) LDFPD f32, f35 = [X1], 2 * SIZE + (p16) lfetch.nt1 [PREX], INCX16 + (p18) FMA f8 = f34, f82, f8 + } + { .mmf + (p16) LDFD f80 = [Y1], INCY + (p16) LDFD f86 = [Y2], INCY + (p18) FMA f9 = f37, f85, f9 + } + ;; + { .mmf + (p16) LDFPD f38, f41 = [X1], 2 * SIZE + (p16) lfetch.nt1 [PREY], INCY16 + (p18) FMA f10 = f40, f88, f10 + } + { .mmf + (p16) LDFD f83 = [Y1], INCY3 + (p16) LDFD f89 = [Y2], INCY3 + (p18) FMA f11 = f43, f91, f11 + } + ;; + { .mmf + (p16) LDFPD f44, f47 = [X1], 2 * SIZE + (p18) FMA f12 = f46, f94, f12 + } + { .mmf + (p16) LDFD f92 = [Y1], INCY + (p16) LDFD f98 = [Y2], INCY + (p18) FMA f13 = f49, f97, f13 + } + ;; + { .mmf + (p16) LDFPD f50, f53 = [X1], 2 * SIZE + (p18) FMA f14 = f52, f100, f14 + } + { .mmf + (p16) LDFD f95 = [Y1], INCY3 + (p16) LDFD f101 = [Y2], INCY3 + (p18) FMA f15 = f55, f103, f15 + } + ;; + { .mmf + (p16) LDFPD f56, f59 = [X1], 2 * SIZE + (p18) FMA f8 = f58, f106, f8 + } + { .mmf + (p16) LDFD f104 = [Y1], INCY + (p16) LDFD f110 = [Y2], INCY + (p18) FMA f9 = f61, f109, f9 + } + ;; + { .mmf + (p16) LDFPD f62, f65 = [X1], 2 * SIZE + (p18) FMA f10 = f64, f112, f10 + } + { .mmf + (p16) LDFD f107 = [Y1], INCY3 + (p16) LDFD f113 = [Y2], INCY3 + (p18) FMA f11 = f67, f115, f11 + } + ;; + { .mmf + (p16) LDFPD f68, f71 = [X1], 2 * SIZE + (p18) FMA f12 = f70, f118, f12 + } + { .mmf + (p16) LDFD f116 = [Y1], INCY + (p16) LDFD f122 = [Y2], INCY + (p18) FMA f13 = f73, f121, f13 + } + ;; + { .mmf + (p16) LDFPD f74, f77 = [X1], 2 * SIZE + (p16) LDFD f119 = [Y1], INCY3 + (p18) FMA f14 = f76, f124, f14 + } + { .mfb + (p16) LDFD f125 = [Y2], INCY3 + (p18) FMA f15 = f79, f127, f15 + br.ctop.sptk.few .L12 + } + ;; + .align 32 + +.L15: + { .mmi + (p12) LDFPD f32, f33 = [X1], 2 * SIZE + mov YY = Y1 + tbit.z p0, p13 = N, 2 + } + { .mmb + (p12) LDFD f34 = [Y1], INCY + (p12) LDFD f38 = [Y2], INCY + (p7) br.cond.dptk .L999 + } + ;; + { .mmi + (p12) LDFPD f36, f37 = [X1], 2 * SIZE + (p12) shladd YY = INCY, 3, YY + tbit.z p0, p14 = N, 1 + } + { .mmi + (p12) LDFD f35 = [Y1], INCY3 + (p12) LDFD f39 = [Y2], INCY3 + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p12) LDFPD f40, f41 = [X1], 2 * SIZE + (p13) shladd YY = INCY, 2, YY + } + { .mmi + (p12) LDFD f42 = [Y1], INCY + (p12) LDFD f46 = [Y2], INCY + } + ;; + (p12) LDFPD f44, f45 = [X1], 2 * SIZE + (p12) LDFD f43 = [Y1], INCY3 + (p12) LDFD f47 = [Y2], INCY3 + (p14) shladd YY = INCY, 1, YY + ;; + (p13) LDFPD f48, f49 = [X1], 2 * SIZE + (p13) LDFD f50 = [Y1], INCY + (p13) LDFD f54 = [Y2], INCY + ;; + (p13) LDFPD f52, f53 = [X1], 2 * SIZE + (p13) LDFD f51 = [Y1], INCY3 + (p13) LDFD f55 = [Y2], INCY3 + ;; + (p14) LDFPD f56, f57 = [X1], 2 * SIZE + (p14) LDFD f58 = [Y1], INCY + (p15) LDFD f61 = [YY] + ;; + (p14) LDFD f59 = [Y1] + (p15) LDFD f60 = [X1] + ;; + (p12) FMA f8 = f32, f34, f8 + (p12) FMA f9 = f33, f35, f9 + (p12) FMA f10 = f36, f38, f10 + (p12) FMA f11 = f37, f39, f11 + (p12) FMA f12 = f40, f42, f12 + (p12) FMA f13 = f41, f43, f13 + (p12) FMA f14 = f44, f46, f14 + (p12) FMA f15 = f45, f47, f15 + ;; + (p13) FMA f8 = f48, f50, f8 + (p13) FMA f9 = f49, f51, f9 + (p13) FMA f10 = f52, f54, f10 + (p13) FMA f11 = f53, f55, f11 + (p14) FMA f12 = f56, f58, f12 + (p14) FMA f13 = f57, f59, f13 + (p15) FMA f14 = f60, f61, f14 + br .L999 + ;; + .align 32 + +.L20: + { .mmi + adds PREX = PREFETCH_SIZE * SIZE, X1 + adds PREY = (PREFETCH_SIZE + 18) * SIZE, Y1 + mov ar.lc = I + } + { .mfb + cmp.eq p6 ,p0 = -1, I + FMA f15 = f32, f80, f0 + (p6) br.cond.dpnt .L25 + } + ;; + .align 32 + +.L22: + { .mmf + (p16) LDFPD f32, f35 = [X1], 2 * SIZE + (p16) lfetch.nt1 [PREX], INCX16 + (p18) FMA f8 = f34, f82, f8 + } + { .mmf + (p17) LDFD f105 = [Y1], INCY + (p17) LDFD f111 = [Y2], INCY + (p18) FMA f9 = f37, f85, f9 + } + ;; + { .mmf + (p16) LDFPD f38, f41 = [X1], 2 * SIZE + (p16) lfetch.nt1 [PREY], INCY16 + (p18) FMA f10 = f40, f88, f10 + } + { .mmf + (p17) LDFD f108 = [Y1], INCY3 + (p17) LDFD f114 = [Y2], INCY3 + (p18) FMA f11 = f43, f91, f11 + } + ;; + { .mmf + (p16) LDFPD f44, f47 = [X1], 2 * SIZE + (p18) FMA f12 = f46, f94, f12 + } + { .mmf + (p17) LDFD f117 = [Y1], INCY + (p17) LDFD f123 = [Y2], INCY + (p18) FMA f13 = f49, f97, f13 + } + ;; + { .mmf + (p16) LDFPD f50, f53 = [X1], 2 * SIZE + (p18) FMA f14 = f52, f100, f14 + } + { .mmf + (p17) LDFD f120 = [Y1], INCY3 + (p17) LDFD f126 = [Y2], INCY3 + (p18) FMA f15 = f55, f103, f15 + } + ;; + { .mmf + (p16) LDFPD f56, f59 = [X1], 2 * SIZE + (p18) FMA f8 = f58, f106, f8 + } + { .mmf + (p16) LDFD f80 = [Y1], INCY + (p16) LDFD f86 = [Y2], INCY + (p18) FMA f9 = f61, f109, f9 + } + ;; + { .mmf + (p16) LDFPD f62, f65 = [X1], 2 * SIZE + (p18) FMA f10 = f64, f112, f10 + } + { .mmf + (p16) LDFD f83 = [Y1], INCY3 + (p16) LDFD f89 = [Y2], INCY3 + (p18) FMA f11 = f67, f115, f11 + } + ;; + { .mmf + (p16) LDFPD f68, f71 = [X1], 2 * SIZE + (p18) FMA f12 = f70, f118, f12 + } + { .mmf + (p16) LDFD f92 = [Y1], INCY + (p16) LDFD f98 = [Y2], INCY + (p18) FMA f13 = f73, f121, f13 + } + ;; + { .mmf + (p16) LDFPD f74, f77 = [X1], 2 * SIZE + (p16) LDFD f95 = [Y1], INCY3 + (p18) FMA f14 = f76, f124, f14 + } + { .mfb + (p16) LDFD f101 = [Y2], INCY3 + (p18) FMA f15 = f79, f127, f15 + br.ctop.sptk.few .L22 + } + ;; + .align 32 + +.L25: + { .mmi + (p12) LDFPD f32, f33 = [X1], 2 * SIZE + mov YY = Y1 + tbit.z p0, p13 = N, 2 + } + { .mmb + (p12) LDFD f34 = [Y1], INCY + (p12) LDFD f38 = [Y2], INCY + (p7) br.cond.dptk .L999 + } + ;; + { .mmi + (p12) LDFPD f36, f37 = [X1], 2 * SIZE + (p12) shladd YY = INCY, 3, YY + tbit.z p0, p14 = N, 1 + } + { .mmi + (p12) LDFD f35 = [Y1], INCY3 + (p12) LDFD f39 = [Y2], INCY3 + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p12) LDFPD f40, f41 = [X1], 2 * SIZE + (p13) shladd YY = INCY, 2, YY + } + { .mmi + (p12) LDFD f42 = [Y1], INCY + (p12) LDFD f46 = [Y2], INCY + } + ;; + (p12) LDFPD f44, f45 = [X1], 2 * SIZE + (p12) LDFD f43 = [Y1], INCY3 + (p12) LDFD f47 = [Y2], INCY3 + (p14) shladd YY = INCY, 1, YY + ;; + (p13) LDFPD f48, f49 = [X1], 2 * SIZE + (p13) LDFD f50 = [Y1], INCY + (p13) LDFD f54 = [Y2], INCY + ;; + (p13) LDFPD f52, f53 = [X1], 2 * SIZE + (p13) LDFD f51 = [Y1], INCY3 + (p13) LDFD f55 = [Y2], INCY3 + ;; + (p14) LDFPD f56, f57 = [X1], 2 * SIZE + (p14) LDFD f58 = [Y1], INCY + (p15) LDFD f61 = [YY] + ;; + (p14) LDFD f59 = [Y1] + (p15) LDFD f60 = [X1] + ;; + (p12) FMA f8 = f32, f34, f8 + (p12) FMA f9 = f33, f35, f9 + (p12) FMA f10 = f36, f38, f10 + (p12) FMA f11 = f37, f39, f11 + (p12) FMA f12 = f40, f42, f12 + (p12) FMA f13 = f41, f43, f13 + (p12) FMA f14 = f44, f46, f14 + (p12) FMA f15 = f45, f47, f15 + ;; + (p13) FMA f8 = f48, f50, f8 + (p13) FMA f9 = f49, f51, f9 + (p13) FMA f10 = f52, f54, f10 + (p13) FMA f11 = f53, f55, f11 + (p14) FMA f12 = f56, f58, f12 + (p14) FMA f13 = f57, f59, f13 + (p15) FMA f14 = f60, f61, f14 + br .L999 + ;; + .align 32 + +.L100: + { .mmi + shladd X2 = INCX, 1, X1 + } + { .mib + cmp.ne p6, p0 = SIZE, INCY + tbit.nz p15, p0 = Y1, BASE_SHIFT + (p6) br.cond.dptk .L200 + } + ;; + { .mfi + (p15) LDFD f32 = [X1], INCX + mov f12 = f0 + mov pr.rot= 0 + } + { .mfi + (p15) adds N = -1, N + mov f13 = f0 + shr YA = Y1, 4 + } + ;; + { .mfi + (p15) LDFD f80 = [Y1], INCY + mov f14 = f0 + shr I = N, 4 + } + { .mmi + and J = 15, N + and XA = 0xf, XA + and YA = 0xf, YA + } + ;; + { .mmi + shladd INCX3 = INCX, 1, INCX + shladd INCY3 = INCY, 1, INCY + sub XA = YA, XA + } + { .mmi + shladd INCX16 = INCX, 4, r0 + shladd INCY16 = INCY, 4, r0 + tbit.z p0, p12 = N, 3 + } + ;; + { .mmi + shladd X2 = INCX, 1, X1 + cmp.eq p7, p0 = r0, J + mov ar.ec= 3 + } + { .mmi + adds I = -1, I + cmp.ge p8, p0 = 4, XA + cmp.eq p16, p0 = r0, r0 + } + ;; + { .mbb + cmp.le p9, p0 = 14, XA + (p8) br.cond.dpnt .L120 + (p9) br.cond.dpnt .L120 + } + ;; + { .mmi + adds PREX = (PREFETCH_SIZE + 5) * SIZE, X1 + adds PREY = (PREFETCH_SIZE + 3) * SIZE, Y1 + mov ar.lc = I + } + { .mfb + cmp.eq p6 ,p0 = -1, I + FMA f15 = f32, f80, f0 + (p6) br.cond.dpnt .L115 + } + ;; + .align 32 + +/* INCY == 1 */ +.L112: + { .mmf + (p16) LDFPD f32, f35 = [Y1], 2 * SIZE + (p16) lfetch.nt1 [PREX], INCX16 + (p18) FMA f8 = f34, f82, f8 + } + { .mmf + (p16) LDFD f80 = [X1], INCX + (p16) LDFD f86 = [X2], INCX + (p18) FMA f9 = f37, f85, f9 + } + ;; + { .mmf + (p16) LDFPD f38, f41 = [Y1], 2 * SIZE + (p16) lfetch.nt1 [PREY], INCY16 + (p18) FMA f10 = f40, f88, f10 + } + { .mmf + (p16) LDFD f83 = [X1], INCX3 + (p16) LDFD f89 = [X2], INCX3 + (p18) FMA f11 = f43, f91, f11 + } + ;; + { .mmf + (p16) LDFPD f44, f47 = [Y1], 2 * SIZE + (p18) FMA f12 = f46, f94, f12 + } + { .mmf + (p16) LDFD f92 = [X1], INCX + (p16) LDFD f98 = [X2], INCX + (p18) FMA f13 = f49, f97, f13 + } + ;; + { .mmf + (p16) LDFPD f50, f53 = [Y1], 2 * SIZE + (p18) FMA f14 = f52, f100, f14 + } + { .mmf + (p16) LDFD f95 = [X1], INCX3 + (p16) LDFD f101 = [X2], INCX3 + (p18) FMA f15 = f55, f103, f15 + } + ;; + { .mmf + (p16) LDFPD f56, f59 = [Y1], 2 * SIZE + (p18) FMA f8 = f58, f106, f8 + } + { .mmf + (p16) LDFD f104 = [X1], INCX + (p16) LDFD f110 = [X2], INCX + (p18) FMA f9 = f61, f109, f9 + } + ;; + { .mmf + (p16) LDFPD f62, f65 = [Y1], 2 * SIZE + (p18) FMA f10 = f64, f112, f10 + } + { .mmf + (p16) LDFD f107 = [X1], INCX3 + (p16) LDFD f113 = [X2], INCX3 + (p18) FMA f11 = f67, f115, f11 + } + ;; + { .mmf + (p16) LDFPD f68, f71 = [Y1], 2 * SIZE + (p18) FMA f12 = f70, f118, f12 + } + { .mmf + (p16) LDFD f116 = [X1], INCX + (p16) LDFD f122 = [X2], INCX + (p18) FMA f13 = f73, f121, f13 + } + ;; + { .mmf + (p16) LDFPD f74, f77 = [Y1], 2 * SIZE + (p16) LDFD f119 = [X1], INCX3 + (p18) FMA f14 = f76, f124, f14 + } + { .mfb + (p16) LDFD f125 = [X2], INCX3 + (p18) FMA f15 = f79, f127, f15 + br.ctop.sptk.few .L112 + } + ;; + .align 32 + +.L115: + { .mmi + (p12) LDFPD f32, f33 = [Y1], 2 * SIZE + mov XX = X1 + tbit.z p0, p13 = N, 2 + } + { .mmb + (p12) LDFD f34 = [X1], INCX + (p12) LDFD f38 = [X2], INCX + (p7) br.cond.dptk .L999 + } + ;; + { .mmi + (p12) LDFPD f36, f37 = [Y1], 2 * SIZE + (p12) shladd XX = INCX, 3, XX + tbit.z p0, p14 = N, 1 + } + { .mmi + (p12) LDFD f35 = [X1], INCX3 + (p12) LDFD f39 = [X2], INCX3 + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p12) LDFPD f40, f41 = [Y1], 2 * SIZE + (p13) shladd XX = INCX, 2, XX + } + { .mmi + (p12) LDFD f42 = [X1], INCX + (p12) LDFD f46 = [X2], INCX + } + ;; + (p12) LDFPD f44, f45 = [Y1], 2 * SIZE + (p12) LDFD f43 = [X1], INCX3 + (p12) LDFD f47 = [X2], INCX3 + (p14) shladd XX = INCX, 1, XX + ;; + (p13) LDFPD f48, f49 = [Y1], 2 * SIZE + (p13) LDFD f50 = [X1], INCX + (p13) LDFD f54 = [X2], INCX + ;; + (p13) LDFPD f52, f53 = [Y1], 2 * SIZE + (p13) LDFD f51 = [X1], INCX3 + (p13) LDFD f55 = [X2], INCX3 + ;; + (p14) LDFPD f56, f57 = [Y1], 2 * SIZE + (p14) LDFD f58 = [X1], INCX + (p15) LDFD f61 = [XX] + ;; + (p14) LDFD f59 = [X1] + (p15) LDFD f60 = [Y1] + ;; + (p12) FMA f8 = f32, f34, f8 + (p12) FMA f9 = f33, f35, f9 + (p12) FMA f10 = f36, f38, f10 + (p12) FMA f11 = f37, f39, f11 + (p12) FMA f12 = f40, f42, f12 + (p12) FMA f13 = f41, f43, f13 + (p12) FMA f14 = f44, f46, f14 + (p12) FMA f15 = f45, f47, f15 + ;; + (p13) FMA f8 = f48, f50, f8 + (p13) FMA f9 = f49, f51, f9 + (p13) FMA f10 = f52, f54, f10 + (p13) FMA f11 = f53, f55, f11 + (p14) FMA f12 = f56, f58, f12 + (p14) FMA f13 = f57, f59, f13 + (p15) FMA f14 = f60, f61, f14 + br .L999 + ;; + .align 32 + +.L120: + { .mmi + adds PREX = (PREFETCH_SIZE + 17) * SIZE, X1 + adds PREY = (PREFETCH_SIZE + 19) * SIZE, X1 + mov ar.lc = I + } + { .mfb + cmp.eq p6 ,p0 = -1, I + FMA f15 = f32, f80, f0 + (p6) br.cond.dpnt .L125 + } + ;; + .align 32 + +.L122: + { .mmf + (p16) LDFPD f32, f35 = [Y1], 2 * SIZE + (p16) lfetch.nt1 [PREX], INCX16 + (p18) FMA f8 = f34, f82, f8 + } + { .mmf + (p17) LDFD f105 = [X1], INCX + (p17) LDFD f111 = [X2], INCX + (p18) FMA f9 = f37, f85, f9 + } + ;; + { .mmf + (p16) LDFPD f38, f41 = [Y1], 2 * SIZE + (p16) lfetch.nt1 [PREY], INCY16 + (p18) FMA f10 = f40, f88, f10 + } + { .mmf + (p17) LDFD f108 = [X1], INCX3 + (p17) LDFD f114 = [X2], INCX3 + (p18) FMA f11 = f43, f91, f11 + } + ;; + { .mmf + (p16) LDFPD f44, f47 = [Y1], 2 * SIZE + (p18) FMA f12 = f46, f94, f12 + } + { .mmf + (p17) LDFD f117 = [X1], INCX + (p17) LDFD f123 = [X2], INCX + (p18) FMA f13 = f49, f97, f13 + } + ;; + { .mmf + (p16) LDFPD f50, f53 = [Y1], 2 * SIZE + (p18) FMA f14 = f52, f100, f14 + } + { .mmf + (p17) LDFD f120 = [X1], INCX3 + (p17) LDFD f126 = [X2], INCX3 + (p18) FMA f15 = f55, f103, f15 + } + ;; + { .mmf + (p16) LDFPD f56, f59 = [Y1], 2 * SIZE + (p18) FMA f8 = f58, f106, f8 + } + { .mmf + (p16) LDFD f80 = [X1], INCX + (p16) LDFD f86 = [X2], INCX + (p18) FMA f9 = f61, f109, f9 + } + ;; + { .mmf + (p16) LDFPD f62, f65 = [Y1], 2 * SIZE + (p18) FMA f10 = f64, f112, f10 + } + { .mmf + (p16) LDFD f83 = [X1], INCX3 + (p16) LDFD f89 = [X2], INCX3 + (p18) FMA f11 = f67, f115, f11 + } + ;; + { .mmf + (p16) LDFPD f68, f71 = [Y1], 2 * SIZE + (p18) FMA f12 = f70, f118, f12 + } + { .mmf + (p16) LDFD f92 = [X1], INCX + (p16) LDFD f98 = [X2], INCX + (p18) FMA f13 = f73, f121, f13 + } + ;; + { .mmf + (p16) LDFPD f74, f77 = [Y1], 2 * SIZE + (p16) LDFD f95 = [X1], INCX3 + (p18) FMA f14 = f76, f124, f14 + } + { .mfb + (p16) LDFD f101 = [X2], INCX3 + (p18) FMA f15 = f79, f127, f15 + br.ctop.sptk.few .L122 + } + ;; + .align 32 + +.L125: + { .mmi + (p12) LDFPD f32, f33 = [Y1], 2 * SIZE + mov XX = X1 + tbit.z p0, p13 = N, 2 + } + { .mmb + (p12) LDFD f34 = [X1], INCX + (p12) LDFD f38 = [X2], INCX + (p7) br.cond.dptk .L999 + } + ;; + { .mmi + (p12) LDFPD f36, f37 = [Y1], 2 * SIZE + (p12) shladd XX = INCX, 3, XX + tbit.z p0, p14 = N, 1 + } + { .mmi + (p12) LDFD f35 = [X1], INCX3 + (p12) LDFD f39 = [X2], INCX3 + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p12) LDFPD f40, f41 = [Y1], 2 * SIZE + (p13) shladd XX = INCX, 2, XX + } + { .mmi + (p12) LDFD f42 = [X1], INCX + (p12) LDFD f46 = [X2], INCX + } + ;; + (p12) LDFPD f44, f45 = [Y1], 2 * SIZE + (p12) LDFD f43 = [X1], INCX3 + (p12) LDFD f47 = [X2], INCX3 + (p14) shladd XX = INCX, 1, XX + ;; + (p13) LDFPD f48, f49 = [Y1], 2 * SIZE + (p13) LDFD f50 = [X1], INCX + (p13) LDFD f54 = [X2], INCX + ;; + (p13) LDFPD f52, f53 = [Y1], 2 * SIZE + (p13) LDFD f51 = [X1], INCX3 + (p13) LDFD f55 = [X2], INCX3 + ;; + (p14) LDFPD f56, f57 = [Y1], 2 * SIZE + (p14) LDFD f58 = [X1], INCX + (p15) LDFD f61 = [XX] + ;; + (p14) LDFD f59 = [X1] + (p15) LDFD f60 = [Y1] + ;; + (p12) FMA f8 = f32, f34, f8 + (p12) FMA f9 = f33, f35, f9 + (p12) FMA f10 = f36, f38, f10 + (p12) FMA f11 = f37, f39, f11 + (p12) FMA f12 = f40, f42, f12 + (p12) FMA f13 = f41, f43, f13 + (p12) FMA f14 = f44, f46, f14 + (p12) FMA f15 = f45, f47, f15 + ;; + (p13) FMA f8 = f48, f50, f8 + (p13) FMA f9 = f49, f51, f9 + (p13) FMA f10 = f52, f54, f10 + (p13) FMA f11 = f53, f55, f11 + (p14) FMA f12 = f56, f58, f12 + (p14) FMA f13 = f57, f59, f13 + (p15) FMA f14 = f60, f61, f14 + br .L999 + ;; + .align 32 + +.L200: + { .mfi + shladd INCX3 = INCX, 1, INCX + mov f12 = f0 + mov pr.rot= 0 + } + { .mfi + and J = 15, N + mov f13 = f0 + shr I = N, 4 + } + ;; + { .mmf + cmp.eq p16, p0 = r0, r0 + shladd INCY3 = INCY, 1, INCY + mov f14 = f0 + } + { .mmi + shladd INCX16 = INCX, 4, r0 + shladd INCY16 = INCY, 4, r0 + tbit.z p0, p12 = N, 3 + } + ;; + { .mmi + cmp.eq p7, p0 = r0, J + adds I = -1, I + mov ar.ec= 3 + } + { .mmi + shladd Y2 = INCY, 1, Y1 + mov XX = X1 + mov YY = Y1 + } + ;; + { .mmi + adds PREX = (PREFETCH_SIZE + 5) * SIZE, X1 + adds PREY = (PREFETCH_SIZE + 3) * SIZE, Y1 + mov ar.lc = I + } + { .mfb + cmp.eq p6 ,p0 = -1, I + mov f15 = f0 + (p6) br.cond.dpnt .L215 + } + ;; + .align 32 + +/* INCY == 1 */ +.L212: + { .mmf + (p16) lfetch.nt1 [PREX], INCX16 + (p16) lfetch.nt1 [PREY], INCY16 + (p18) FMA f8 = f34, f82, f8 + } + { .mmf + (p16) LDFD f32 = [Y1], INCY + (p16) LDFD f38 = [Y2], INCY + (p18) FMA f9 = f37, f85, f9 + } + ;; + { .mmf + (p16) LDFD f80 = [X1], INCX + (p16) LDFD f86 = [X2], INCX + (p18) FMA f10 = f40, f88, f10 + } + { .mmf + (p16) LDFD f35 = [Y1], INCY3 + (p16) LDFD f41 = [Y2], INCY3 + (p18) FMA f11 = f43, f91, f11 + } + ;; + { .mmf + (p16) LDFD f83 = [X1], INCX3 + (p16) LDFD f89 = [X2], INCX3 + (p18) FMA f12 = f46, f94, f12 + } + { .mmf + (p16) LDFD f44 = [Y1], INCY + (p16) LDFD f50 = [Y2], INCY + (p18) FMA f13 = f49, f97, f13 + } + ;; + { .mmf + (p16) LDFD f92 = [X1], INCX + (p16) LDFD f98 = [X2], INCX + (p18) FMA f14 = f52, f100, f14 + } + { .mmf + (p16) LDFD f47 = [Y1], INCY3 + (p16) LDFD f53 = [Y2], INCY3 + (p18) FMA f15 = f55, f103, f15 + } + ;; + { .mmf + (p16) LDFD f95 = [X1], INCX3 + (p16) LDFD f101 = [X2], INCX3 + (p18) FMA f8 = f58, f106, f8 + } + { .mmf + (p16) LDFD f56 = [Y1], INCY + (p16) LDFD f62 = [Y2], INCY + (p18) FMA f9 = f61, f109, f9 + } + ;; + { .mmf + (p16) LDFD f104 = [X1], INCX + (p16) LDFD f110 = [X2], INCX + (p18) FMA f10 = f64, f112, f10 + } + { .mmf + (p16) LDFD f59 = [Y1], INCY3 + (p16) LDFD f65 = [Y2], INCY3 + (p18) FMA f11 = f67, f115, f11 + } + ;; + { .mmf + (p16) LDFD f107 = [X1], INCX3 + (p16) LDFD f113 = [X2], INCX3 + (p18) FMA f12 = f70, f118, f12 + } + { .mmf + (p16) LDFD f68 = [Y1], INCY + (p16) LDFD f74 = [Y2], INCY + (p18) FMA f13 = f73, f121, f13 + } + ;; + { .mmf + (p16) LDFD f116 = [X1], INCX + (p16) LDFD f122 = [X2], INCX + (p18) FMA f14 = f76, f124, f14 + } + { .mmf + (p16) LDFD f71 = [Y1], INCY3 + (p16) LDFD f77 = [Y2], INCY3 + (p18) FMA f15 = f79, f127, f15 + } + ;; + { .mmi + (p16) LDFD f119 = [X1], INCX3 + (p16) LDFD f125 = [X2], INCX3 + } + { .mmb + (p16) add XX = INCX16, XX + (p16) add YY = INCY16, YY + br.ctop.sptk.few .L212 + } + ;; + .align 32 + +.L215: + { .mmi + (p12) LDFD f34 = [X1], INCX + (p12) LDFD f38 = [X2], INCX + tbit.z p0, p13 = N, 2 + } + { .mmb + (p12) LDFD f32 = [Y1], INCY + (p12) LDFD f36 = [Y2], INCY + (p7) br.cond.dptk .L999 + } + ;; + { .mmi + (p12) LDFD f35 = [X1], INCX3 + (p12) LDFD f39 = [X2], INCX3 + tbit.z p0, p14 = N, 1 + } + { .mmi + (p12) LDFD f33 = [Y1], INCY3 + (p12) LDFD f37 = [Y2], INCY3 + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p12) LDFD f42 = [X1], INCX + (p12) LDFD f46 = [X2], INCX + (p12) shladd XX = INCX, 3, XX + } + { .mmi + (p12) LDFD f40 = [Y1], INCY + (p12) LDFD f44 = [Y2], INCY + (p12) shladd YY = INCY, 3, YY + } + ;; + { .mmi + (p12) LDFD f43 = [X1], INCX3 + (p12) LDFD f47 = [X2], INCX3 + (p13) shladd XX = INCX, 2, XX + } + { .mmi + (p12) LDFD f41 = [Y1], INCY3 + (p12) LDFD f45 = [Y2], INCY3 + (p13) shladd YY = INCY, 2, YY + } + ;; + (p13) LDFD f50 = [X1], INCX + (p13) LDFD f54 = [X2], INCX + (p14) shladd XX = INCX, 1, XX + (p13) LDFD f48 = [Y1], INCY + (p13) LDFD f52 = [Y2], INCY + (p14) shladd YY = INCY, 1, YY + ;; + (p13) LDFD f51 = [X1], INCX3 + (p13) LDFD f55 = [X2] + (p13) LDFD f49 = [Y1], INCY3 + (p13) LDFD f53 = [Y2] + ;; + (p14) LDFD f58 = [X1], INCX + (p15) LDFD f61 = [XX] + (p14) LDFD f56 = [Y1], INCY + (p15) LDFD f60 = [YY] + ;; + (p14) LDFD f59 = [X1] + (p14) LDFD f57 = [Y1] + ;; + ;; + ;; + (p12) FMA f8 = f32, f34, f8 + (p12) FMA f9 = f33, f35, f9 + (p12) FMA f10 = f36, f38, f10 + (p12) FMA f11 = f37, f39, f11 + (p12) FMA f12 = f40, f42, f12 + (p12) FMA f13 = f41, f43, f13 + (p12) FMA f14 = f44, f46, f14 + (p12) FMA f15 = f45, f47, f15 + ;; + (p13) FMA f8 = f48, f50, f8 + (p13) FMA f9 = f49, f51, f9 + (p13) FMA f10 = f52, f54, f10 + (p13) FMA f11 = f53, f55, f11 + (p14) FMA f12 = f56, f58, f12 + (p14) FMA f13 = f57, f59, f13 + (p15) FMA f14 = f60, f61, f14 + ;; + .align 32 + +.L999: + FADD f8 = f8, f9 + FADD f10 = f10, f11 + FADD f12 = f12, f13 + FADD f14 = f14, f15 + ;; + FADD f8 = f8, f10 + FADD f12 = f12, f14 + mov ar.lc = ARLC + ;; + FADD f8 = f8, f12 + mov pr = PR, -65474 + br.ret.sptk.many b0 + EPILOGUE + diff --git a/kernel/ia64/gemm_beta.S b/kernel/ia64/gemm_beta.S new file mode 100644 index 0000000..ceeca4a --- /dev/null +++ b/kernel/ia64/gemm_beta.S @@ -0,0 +1,512 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE 140 + +#define CO1 r14 +#define CO2 r15 +#define CO3 r16 +#define DO1 r17 +#define DO2 r18 +#define DO3 r19 + +#define I r22 +#define I_AND_15 r23 +#define PRE1 r24 + +#define PR r30 +#define ARLC r31 + +#define M r32 +#define N r33 +#define C r34 +#define LDC r35 +#define J r36 + +#define BETA f8 + + PROLOGUE + .prologue + PROFCODE + + { .mmi +#ifndef XDOUBLE + adds CO1 = 16, r12 + adds CO2 = 24, r12 +#else + adds CO1 = 32, r12 + adds CO2 = 40, r12 +#endif + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mfb + cmp.ge p6, p0 = 0, N + fcmp.eq p0, p15 = BETA, f0 + (p6) br.ret.sptk.many b0 + } + ;; + .body + { .mmi + ld8 C = [CO1], 8 + ld8 LDC = [CO2] + mov PR = pr + } + { .mmi + mov J = N + shr I = M, 4 + } + ;; + { .mmb + shladd LDC = LDC, BASE_SHIFT, r0 + adds I = -1, I + (p15) br.cond.dpnt .L100 // if (beta != 0) goto L100 + } + ;; + .align 32 + +.L60: + { .mmi + mov CO1 = C + mov CO3 = C + add CO2 = 4 * SIZE, C + } + { .mmi + adds PRE1 = PREFETCHSIZE * SIZE, C + add C = C, LDC + tbit.nz p12, p0 = M, 3 + } + ;; + { .mmi + and I_AND_15 = 15, M + mov ar.lc = I + } + { .mib + cmp.gt p8, p0 = 0, I + (p8) br.cond.dpnt .L80 + } + ;; + .align 32 + +.L70: + { .mmi + STFD [CO1] = f0, 1 * SIZE + STFD [CO2] = f0, 1 * SIZE + } + { .mmi + lfetch.excl.nt1 [PRE1] + nop.m 0 + adds PRE1 = 16 * SIZE, PRE1 + } + ;; + { .mmi + STFD [CO1] = f0, 1 * SIZE + STFD [CO2] = f0, 1 * SIZE + adds CO3 = 16 * SIZE, CO3 + } + ;; + { .mmi + STFD [CO1] = f0, 1 * SIZE + STFD [CO2] = f0, 1 * SIZE + } + ;; + { .mmi + STFD [CO1] = f0, 5 * SIZE + STFD [CO2] = f0, 5 * SIZE + } + ;; + { .mmi + STFD [CO1] = f0, 1 * SIZE + STFD [CO2] = f0, 1 * SIZE + } + ;; + { .mmi + STFD [CO1] = f0, 1 * SIZE + STFD [CO2] = f0, 1 * SIZE + } + ;; + { .mmi + STFD [CO1] = f0, 1 * SIZE + STFD [CO2] = f0, 1 * SIZE + } + ;; + { .mmb + STFD [CO1] = f0, 5 * SIZE + STFD [CO2] = f0, 5 * SIZE + br.cloop.sptk.few .L70 + } + ;; + .align 32 + +.L80: + { .mmi + (p12) STFD [CO1] = f0, 1 * SIZE + (p12) STFD [CO2] = f0, 1 * SIZE + tbit.nz p13, p0 = M, 2 + } + { .mmb + cmp.eq p9, p0 = 0, I_AND_15 + adds J = -1, J + (p9) br.cond.dptk .L99 + } + ;; + { .mmi + (p12) STFD [CO1] = f0, 1 * SIZE + (p12) STFD [CO2] = f0, 1 * SIZE + tbit.nz p14, p0 = M, 1 + } + ;; + { .mmi + (p12) STFD [CO1] = f0, 1 * SIZE + (p12) STFD [CO2] = f0, 1 * SIZE + (p12) adds CO3 = 8 * SIZE, CO3 + } + ;; + { .mmi + (p12) STFD [CO1] = f0, 5 * SIZE + (p12) STFD [CO2] = f0 + (p13) adds CO3 = 4 * SIZE, CO3 + } + ;; + { .mmi + (p13) STFD [CO1] = f0, 1 * SIZE + (p14) STFD [CO3] = f0, 1 * SIZE + } + ;; + { .mmi + (p13) STFD [CO1] = f0, 1 * SIZE + (p14) STFD [CO3] = f0, 1 * SIZE + tbit.nz p15, p0 = M, 0 + } + ;; + { .mmi + (p13) STFD [CO1] = f0, 1 * SIZE + (p15) STFD [CO3] = f0 + } + ;; + { .mmi + (p13) STFD [CO1] = f0 + } + ;; + .align 32 + +.L99: + { .mib + cmp.lt p6, p0 = 0, J + mov ar.lc = ARLC + } + { .mbb + (p6) br.cond.dptk .L60 + br.ret.sptk.many b0 + } + ;; + .align 32 + +.L100: + { .mmi + mov CO1 = C + mov CO3 = C + mov pr.rot = 0 + } + { .mmi + adds PRE1 = PREFETCHSIZE * SIZE, C + add CO2 = 4 * SIZE, C + mov DO1 = C + } + ;; + { .mmi + mov ar.ec = 6 + } + { .mmi + adds DO2 = 4 * SIZE, C + mov DO3 = C + add C = C, LDC + } + ;; + { .mmi + and I_AND_15 = 15, M + cmp.eq p16, p0 = r0, r0 + mov ar.lc = I + } + { .mib + cmp.gt p8, p0 = 0, I + tbit.nz p12, p0 = M, 3 + (p8) br.cond.dpnt .L180 + } + ;; + .align 32 + +.L170: + { .mmf + (p21) STFD [DO1] = f6, 1 * SIZE + (p21) STFD [DO2] = f7, 1 * SIZE + (p21) FMPY f6 = BETA, f85 + } + { .mmf + (p16) lfetch.excl.nt1 [PRE1] + (p16) adds CO3 = 16 * SIZE, CO3 + (p21) FMPY f7 = BETA, f91 + } + ;; + { .mmf + (p21) STFD [DO1] = f10, 1 * SIZE + (p21) STFD [DO2] = f11, 1 * SIZE + (p21) FMPY f10 = BETA, f97 + } + { .mmf + (p16) LDFD f32 = [CO1], 1 * SIZE + (p16) LDFD f38 = [CO2], 1 * SIZE + (p21) FMPY f11 = BETA, f103 + } + ;; + { .mmf + (p21) STFD [DO1] = f12, 1 * SIZE + (p21) STFD [DO2] = f13, 1 * SIZE + (p21) FMPY f12 = BETA, f109 + } + { .mmf + (p16) LDFD f44 = [CO1], 1 * SIZE + (p16) LDFD f50 = [CO2], 1 * SIZE + (p21) FMPY f13 = BETA, f115 + } + ;; + { .mmf + (p21) STFD [DO1] = f14, 5 * SIZE + (p21) STFD [DO2] = f15, 5 * SIZE + (p21) FMPY f14 = BETA, f121 + } + { .mmf + (p16) LDFD f56 = [CO1], 1 * SIZE + (p16) LDFD f62 = [CO2], 1 * SIZE + (p21) FMPY f15 = BETA, f127 + } + ;; + { .mmf + (p21) STFD [DO1] = f6, 1 * SIZE + (p21) STFD [DO2] = f7, 1 * SIZE + (p20) FMPY f6 = BETA, f36 + } + { .mmf + (p16) LDFD f68 = [CO1], 5 * SIZE + (p16) LDFD f74 = [CO2], 5 * SIZE + (p20) FMPY f7 = BETA, f42 + } + ;; + { .mmf + (p21) STFD [DO1] = f10, 1 * SIZE + (p21) STFD [DO2] = f11, 1 * SIZE + (p20) FMPY f10 = BETA, f48 + } + { .mmf + (p16) LDFD f80 = [CO1], 1 * SIZE + (p16) LDFD f86 = [CO2], 1 * SIZE + (p20) FMPY f11 = BETA, f54 + } + ;; + { .mmf + (p21) STFD [DO1] = f12, 1 * SIZE + (p21) STFD [DO2] = f13, 1 * SIZE + (p20) FMPY f12 = BETA, f60 + } + { .mmf + (p16) LDFD f92 = [CO1], 1 * SIZE + (p16) LDFD f98 = [CO2], 1 * SIZE + (p20) FMPY f13 = BETA, f66 + } + ;; + { .mmf + (p21) STFD [DO1] = f14, 5 * SIZE + (p21) STFD [DO2] = f15, 5 * SIZE + (p20) FMPY f14 = BETA, f72 + } + { .mmf + (p16) LDFD f104 = [CO1], 1 * SIZE + (p16) LDFD f110 = [CO2], 1 * SIZE + (p20) FMPY f15 = BETA, f78 + } + ;; + { .mmi + (p16) LDFD f116 = [CO1], 5 * SIZE + (p16) LDFD f122 = [CO2], 5 * SIZE + adds PRE1 = 16 * SIZE, PRE1 + } + { .mmb + (p16) adds DO3 = 16 * SIZE, DO3 + nop.m 0 + br.ctop.sptk.few .L170 + } + ;; + .align 32 + +.L180: + { .mmi + (p12) LDFD f32 = [CO1], 1 * SIZE + (p12) LDFD f36 = [CO2], 1 * SIZE + tbit.nz p13, p0 = M, 2 + } + { .mmb + cmp.eq p9, p0 = 0, I_AND_15 + adds J = -1, J + (p9) br.cond.dptk .L199 + } + ;; + { .mmi + (p12) LDFD f33 = [CO1], 1 * SIZE + (p12) LDFD f37 = [CO2], 1 * SIZE + tbit.nz p14, p0 = M, 1 + } + ;; + { .mmi + (p12) LDFD f34 = [CO1], 1 * SIZE + (p12) LDFD f38 = [CO2], 1 * SIZE + (p12) adds CO3 = 8 * SIZE, CO3 + } + ;; + { .mmi + (p12) LDFD f35 = [CO1], 5 * SIZE + (p12) LDFD f39 = [CO2] + (p13) adds CO3 = 4 * SIZE, CO3 + } + ;; + { .mmi + (p13) LDFD f40 = [CO1], 1 * SIZE + (p14) LDFD f44 = [CO3], 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f41 = [CO1], 1 * SIZE + (p14) LDFD f45 = [CO3], 1 * SIZE + tbit.nz p15, p0 = M, 0 + } + ;; + { .mmf + (p13) LDFD f42 = [CO1], 1 * SIZE + (p15) LDFD f46 = [CO3] + (p12) FMPY f32 = BETA, f32 + } + { .mmf + (p12) FMPY f36 = BETA, f36 + } + ;; + { .mmf + (p13) LDFD f43 = [CO1] + (p12) FMPY f33 = BETA, f33 + } + { .mmf + (p12) FMPY f37 = BETA, f37 + } + ;; + (p12) FMPY f34 = BETA, f34 + (p12) FMPY f38 = BETA, f38 + (p12) FMPY f35 = BETA, f35 + (p12) FMPY f39 = BETA, f39 + + ;; + { .mmf + (p12) STFD [DO1] = f32, 1 * SIZE + (p12) STFD [DO2] = f36, 1 * SIZE + (p13) FMPY f40 = BETA, f40 + } + { .mmf + (p12) adds DO3 = 8 * SIZE, DO3 + (p14) FMPY f44 = BETA, f44 + } + ;; + { .mmf + (p12) STFD [DO1] = f33, 1 * SIZE + (p12) STFD [DO2] = f37, 1 * SIZE + (p13) FMPY f41 = BETA, f41 + } + { .mmf + (p13) adds DO3 = 4 * SIZE, DO3 + (p14) FMPY f45 = BETA, f45 + } + ;; + { .mmf + (p12) STFD [DO1] = f34, 1 * SIZE + (p12) STFD [DO2] = f38, 1 * SIZE + (p13) FMPY f42 = BETA, f42 + } + { .mmf + (p15) FMPY f46 = BETA, f46 + } + ;; + { .mmf + (p12) STFD [DO1] = f35, 5 * SIZE + (p12) STFD [DO2] = f39 + (p13) FMPY f43 = BETA, f43 + } + ;; + { .mmi + (p13) STFD [DO1] = f40, 1 * SIZE + (p14) STFD [DO3] = f44, 1 * SIZE + } + ;; + { .mmi + (p13) STFD [DO1] = f41, 1 * SIZE + (p14) STFD [DO3] = f45, 1 * SIZE + } + ;; + { .mmi + (p13) STFD [DO1] = f42, 1 * SIZE + (p15) STFD [DO3] = f46 + } + ;; + { .mmi + (p13) STFD [DO1] = f43 + } + ;; + .align 32 + +.L199: + { .mib + cmp.lt p6, p0 = 0, J + mov ar.lc = ARLC + (p6) br.cond.dptk .L100 + } + ;; + { .mib + mov pr = PR, -1 + br.ret.sptk.many b0 + } + ;; + EPILOGUE + diff --git a/kernel/ia64/gemm_kernel.S b/kernel/ia64/gemm_kernel.S new file mode 100644 index 0000000..d1d4731 --- /dev/null +++ b/kernel/ia64/gemm_kernel.S @@ -0,0 +1,8958 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef DOUBLE +#define PREFETCHSIZE (16 * 8) +#else +#define PREFETCHSIZE (32 * 8) +#endif + +#define CPREFETCHSIZE 7 +#define CPREFETCH lfetch.excl.nt1 + +#define M r32 +#define N r33 +#define K r34 +#define A r36 +#define B r37 +#define C r38 +#define LDC r39 + +#define I r15 +#define J r16 +#define AOFFSET r17 +#define BOFFSET r18 +#define BB r19 +#define L r20 + +#define C1 r21 +#define C2 r22 +#define C3 r23 +#define C4 r24 +#define C5 r25 +#define C6 r26 +#define C7 r27 +#define C8 r28 + +#define C9 loc0 +#define C10 loc1 +#define C11 loc2 +#define C12 loc3 +#define C13 loc4 +#define C14 loc5 +#define C15 loc6 +#define C16 loc7 + +#define PREA r8 +#define PREB r9 +#define PREC r10 +#define SP r12 +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#define ALPHA f8 + +#define AORIG loc8 +#define KK loc9 +#define KK8 loc10 +#define OFFSET loc11 + + PROLOGUE + .prologue + PROFCODE + + { .mmi + .save ar.pfs, ARPFS +#ifdef TRMMKERNEL + alloc ARPFS = ar.pfs, 8, 16, 0, 0 +#else + alloc ARPFS = ar.pfs, 8, 8, 0, 0 +#endif + adds r14 = 16, SP + mov ARLC = ar.lc + } + { .mmi + adds r8 = -16 * 16, SP + adds r9 = -15 * 16, SP + adds SP = -16 * 16, SP + } + ;; + stf.spill [r8] = f16, 32 + stf.spill [r9] = f17, 32 + mov PR = pr + ;; + stf.spill [r8] = f18, 32 + stf.spill [r9] = f19, 32 + shladd LDC = LDC, BASE_SHIFT, r0 + ;; + stf.spill [r8] = f20, 32 + stf.spill [r9] = f21, 32 + shr J = N, 3 + ;; + stf.spill [r8] = f22, 32 + stf.spill [r9] = f23, 32 + mov AOFFSET = A + ;; + stf.spill [r8] = f24, 32 + stf.spill [r9] = f25, 32 + cmp.ge p6, p0 = 0, J + ;; + stf.spill [r8] = f26, 32 + stf.spill [r9] = f27, 32 + shr BB = K, 3 + ;; + stf.spill [r8] = f28, 32 + stf.spill [r9] = f29, 32 + ;; + stf.spill [r8] = f30 + stf.spill [r9] = f31 +#ifndef TRMMKERNEL + (p6) br.cond.dpnt .L050 + .body + ;; +#else + .body + ;; + ld8 OFFSET = [r14] +#if defined(TRMMKERNEL) && !defined(LEFT) + ;; + sub KK = r0, OFFSET +#endif + (p6) br.cond.dpnt .L050 + ;; +#endif + .align 32 + +.L010: + { .mfi + adds J = -1, J + mov f64 = f0 + shr I = M, 3 + } + { .mfi + mov C1 = C // coffset1 = c + 0 * ldc + mov f72 = f0 + shladd BB = BB, BASE_SHIFT, B + } + ;; + { .mmf + cmp.eq p6, p7 = 0, I +#if defined(TRMMKERNEL) && defined(LEFT) + mov KK = OFFSET +#else + nop __LINE__ +#endif + mov f80 = f0 + } + { .mmf + add C2 = LDC, C // coffset2 = c + 1 * ldc + shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc + mov f88 = f0 + } + ;; + { .mmf + shladd C5 = LDC, 2, C // coffset5 = c + 4 * ldc + shladd C = LDC, 3, C // coffset += 8 * ldc + mov f96 = f0 + } + { .mmf + shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc + shladd C6 = LDC, 2, C2 // coffset6 = c + 5 * ldc + mov f104 = f0 + } + ;; + { .mfi + shladd C7 = LDC, 2, C3 // coffset7 = c + 6 * ldc + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + }{ .mfb + sub C8 = C, LDC // coffset8 = c + 7 * ldc + mov f120 = f0 + (p6) br.cond.dpnt .L020 + } + ;; + .align 16 + +.L011: +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfb + LDFPD f48, f49 = [B] + mov f65 = f0 + nop __LINE__ + } + { .mfb + adds BOFFSET = 2 * SIZE, B + mov f73 = f0 + nop __LINE__ + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 3, B + mov f65 = f0 + shladd AOFFSET = KK8, 3, AOFFSET + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 + nop __LINE__ + } + ;; +#endif + { .mfb + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f81 = f0 + nop __LINE__ + } + { .mfb + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f89 = f0 + nop __LINE__ + } + ;; + { .mmf + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + setf.d f97 = r0 + mov f105 = f0 + } + { .mmf + lfetch.nt1 [BB] + setf.d f113 = r0 + mov f121 = f0 + } + ;; + { .mmf + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + setf.d f66 = r0 + mov f74 = f0 + } + { .mfi + setf.d f82 = r0 + mov f90 = f0 + adds BB = 16 * SIZE, BB + } + ;; + { .mmf + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + setf.d f98 = r0 + mov f106 = f0 + } + { .mfb + setf.d f114 = r0 + mov f122 = f0 + nop __LINE__ + } + ;; + { .mmf + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + setf.d f67 = r0 + mov f75 = f0 + } + { .mfi + setf.d f83 = r0 + mov f91 = f0 +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 8, KK +#else + adds L = 8, KK +#endif +#endif + } + ;; + { .mmf + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + setf.d f99 = r0 + mov f107 = f0 + } + { .mfi + setf.d f115 = r0 + mov f123 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f68 = r0 + mov f76 = f0 + } + { .mfi + setf.d f84 = r0 + mov f92 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f100 = r0 + mov f108 = f0 + } + { .mfi + setf.d f116 = r0 + mov f124 = f0 + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f69 = r0 + mov f77 = f0 + } + { .mfi + setf.d f85 = r0 + mov f93 = f0 + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f101 = r0 + mov f109 = f0 + } + { .mfi + setf.d f117 = r0 + mov f125 = f0 + tbit.z p12, p0 = L, 0 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f70 = r0 + mov f78 = f0 + } + { .mfi + setf.d f86 = r0 + mov f94 = f0 + shr L = L, 1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f102 = r0 + mov f110 = f0 + } + { .mfi + setf.d f118 = r0 + mov f126 = f0 + adds L = -1, L + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f71 = r0 + mov f79 = f0 + } + { .mfi + setf.d f87 = r0 + mov f95 = f0 + mov ar.lc = L + } + ;; + { .mmf + CPREFETCH [PREC] + setf.d f103 = r0 + mov f111 = f0 + } + { .mfi + setf.d f119 = r0 + mov f127 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + .align 16 + +.L012: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; +/* 2 */ + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfi + cmp.ne p4, p5 = 0, L + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; +/* 3 */ + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfi + adds C9 = 4 * SIZE, C1 + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; +/* 4 */ + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfi + adds C10 = 4 * SIZE, C2 + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; +/* 5 */ + { .mfi + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfi + adds C11 = 4 * SIZE, C3 + FMA f73 = f33, f49, f73 // A2 * B2 + nop __LINE__ + } + ;; +/* 6 */ + { .mfi + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfi + adds C12 = 4 * SIZE, C4 + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; +/* 7 */ + { .mfi + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfi + adds C13 = 4 * SIZE, C5 + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; +/* 8 */ + { .mfi + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfi + adds C14 = 4 * SIZE, C6 + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; +/* 9 */ + { .mfi + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfi + adds C15 = 4 * SIZE, C7 + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; +/* 10 */ + { .mfi + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfi + adds C16 = 4 * SIZE, C8 + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; +/* 11 */ + { .mfi + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f106 = f34, f53, f106 // A3 * B6 + nop __LINE__ + } + ;; +/* 12 */ + { .mfi + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f122 = f34, f55, f122 // A3 * B8 + nop __LINE__ + } + ;; +/* 13 */ + { .mfi + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfi + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; +/* 14 */ + { .mfi + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; +/* 15 */ + { .mfi + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f107 = f35, f53, f107 // A4 * B6 + nop __LINE__ + } + ;; +/* 16 */ + { .mfi + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f123 = f35, f55, f123 // A4 * B8 + nop __LINE__ + } + ;; +/* 17 */ + { .mfi + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; +/* 18 */ + { .mfi + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f92 = f36, f51, f92 // A5 * B4 + nop __LINE__ + } + ;; +/* 19 */ + { .mfi + nop __LINE__ + FMA f100 = f36, f52, f100 // A5 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f108 = f36, f53, f108 // A5 * B6 + nop __LINE__ + } + ;; +/* 20 */ + { .mfi + nop __LINE__ + FMA f116 = f36, f54, f116 // A5 * B7 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f124 = f36, f55, f124 // A5 * B8 + nop __LINE__ + } + ;; +/* 21 */ + { .mfi + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; +/* 22 */ + { .mfi + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f93 = f37, f51, f93 // A6 * B4 + nop __LINE__ + } + ;; +/* 23 */ + { .mfi + nop __LINE__ + FMA f101 = f37, f52, f101 // A6 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f109 = f37, f53, f109 // A6 * B6 + nop __LINE__ + } + ;; +/* 24 */ + { .mfi + nop __LINE__ + FMA f117 = f37, f54, f117 // A6 * B7 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f125 = f37, f55, f125 // A6 * B8 + nop __LINE__ + } + ;; +/* 25 */ + { .mfi + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; +/* 26 */ + { .mfi + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f94 = f38, f51, f94 // A7 * B4 + nop __LINE__ + } + ;; +/* 27 */ + { .mfi + nop __LINE__ + FMA f102 = f38, f52, f102 // A7 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f110 = f38, f53, f110 // A7 * B6 + nop __LINE__ + } + ;; +/* 28 */ + { .mfi + nop __LINE__ + FMA f118 = f38, f54, f118 // A7 * B7 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f126 = f38, f55, f126 // A7 * B8 + nop __LINE__ + } + ;; +/* 29 */ + { .mfi + nop __LINE__ + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; +/* 30 */ + { .mfi + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f95 = f39, f51, f95 // A8 * B4 + nop __LINE__ + } + ;; +/* 31 */ + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f103 = f39, f52, f103 // A8 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f111 = f39, f53, f111 // A8 * B6 + nop __LINE__ + } + ;; +/* 32 */ + { .mfi + nop __LINE__ + FMA f119 = f39, f54, f119 // A8 * B7 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f127 = f39, f55, f127 // A8 * B8 + nop __LINE__ + } + ;; +/* 33 */ + { .mfi + nop __LINE__ + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; +/* 34 */ + { .mfi + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; +/* 35 */ + { .mfi + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; +/* 36 */ + { .mfi + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; +/* 37 */ + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; +/* 38 */ + { .mfi + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; +/* 39 */ + { .mfi + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; +/* 40 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f6 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f7 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + nop __LINE__ + } + ;; + /* 41 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f10 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f11 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; +/* 42 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f12 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f13 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; +/* 43 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f14 = [C1 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f15 = [C9 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f106 = f42, f61, f106 // A3 * B6 + nop __LINE__ + } + ;; +/* 44 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f16 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f17 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f122 = f42, f63, f122 // A3 * B8 + nop __LINE__ + } + ;; +/* 45 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f18 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f19 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; +/* 46 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f20 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f21 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; +/* 47 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f22 = [C2 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f23 = [C10], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f107 = f43, f61, f107 // A4 * B6 + nop __LINE__ + } + ;; +/* 48 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f24 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f25 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f123 = f43, f63, f123 // A4 * B8 + nop __LINE__ + } + ;; +/* 49 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f26 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f27 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; +/* 50 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f28 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f29 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f92 = f44, f59, f92 // A5 * B4 + nop __LINE__ + } + ;; +/* 51 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f30 = [C3 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f100 = f44, f60, f100 // A5 * B5 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f31 = [C11], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f108 = f44, f61, f108 // A5 * B6 + nop __LINE__ + } + ;; +/* 52 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f32 = [C4 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f116 = f44, f62, f116 // A5 * B7 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f33 = [C12], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f124 = f44, f63, f124 // A5 * B8 + nop __LINE__ + } + ;; +/* 53 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f34 = [C4 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f35 = [C12], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; +/* 54 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f36 = [C4 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f37 = [C12], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f93 = f45, f59, f93 // A6 * B4 + nop __LINE__ + } + ;; +/* 55 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f38 = [C4 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f101 = f45, f60, f101 // A6 * B5 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f39 = [C12], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f109 = f45, f61, f109 // A6 * B6 + nop __LINE__ + } + ;; +/* 56 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f48 = [C5 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f117 = f45, f62, f117 // A6 * B7 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f49 = [C13], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f125 = f45, f63, f125 // A6 * B8 + nop __LINE__ + } + ;; +/* 57 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f50 = [C5 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f51 = [C13], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; +/* 58 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f52 = [C5 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f53 = [C13], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f94 = f46, f59, f94 // A7 * B4 + nop __LINE__ + } + ;; +/* 59 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f54 = [C5 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f102 = f46, f60, f102 // A7 * B5 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f55 = [C13], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f110 = f46, f61, f110 // A7 * B6 + nop __LINE__ + } + ;; +/* 60 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f40 = [C6 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f118 = f46, f62, f118 // A7 * B7 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f41 = [C14], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f126 = f46, f63, f126 // A7 * B8 + nop __LINE__ + } + ;; +/* 61 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f42 = [C6 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f43 = [C14], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + nop __LINE__ + } + ;; +/* 62 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f44 = [C6 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f45 = [C14], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f95 = f47, f59, f95 // A8 * B4 + nop __LINE__ + } + ;; +/* 63 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f59 = [C6 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f103 = f47, f60, f103 // A8 * B5 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f60 = [C14], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f111 = f47, f61, f111 // A8 * B6 + nop __LINE__ + } + ;; +/* 64 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f61 = [C7 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f119 = f47, f62, f119 // A8 * B7 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f62 = [C15], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f127 = f47, f63, f127 // A8 * B8 + br.cloop.sptk.few .L012 + } + ;; +.L013: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfi + (p5) LDFD f63 = [C7 ], SIZE + FMA f64 = ALPHA, f64, f6 + cmp.ne p6, p0 = 1, I + } + { .mfb + (p5) LDFD f6 = [C15], SIZE + FMA f68 = ALPHA, f68, f7 + nop __LINE__ + } + ;; + { .mfi + (p5) LDFD f7 = [C7 ], SIZE + FMA f65 = ALPHA, f65, f10 + adds I = -1, I + } + { .mfb + (p5) LDFD f10 = [C15], SIZE + FMA f69 = ALPHA, f69, f11 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f11 = [C7 ], -3 * SIZE + FMA f66 = ALPHA, f66, f12 + nop __LINE__ + } + { .mfb + (p5) LDFD f12 = [C15], -3 * SIZE + FMA f70 = ALPHA, f70, f13 + nop __LINE__ + } + ;; + { .mfb + LDFD f13 = [C8 ], SIZE + FMA f67 = ALPHA, f67, f14 + nop __LINE__ + } + { .mfb + LDFD f14 = [C16], SIZE + FMA f71 = ALPHA, f71, f15 + nop __LINE__ + } + ;; + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + FMA f72 = ALPHA, f72, f16 + } + { .mmf + LDFD f15 = [C8 ], SIZE + LDFD f16 = [C16], SIZE + FMA f76 = ALPHA, f76, f17 + } + ;; + { .mmf + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + FMA f73 = ALPHA, f73, f18 + } + { .mmf + LDFD f17 = [C8 ], SIZE + LDFD f18 = [C16], SIZE + FMA f77 = ALPHA, f77, f19 + } + ;; + { .mmf + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + FMA f74 = ALPHA, f74, f20 + } + { .mmf + LDFD f19 = [C8 ], -3 * SIZE + LDFD f20 = [C16], -3 * SIZE + FMA f78 = ALPHA, f78, f21 + } + ;; + { .mfb + STFD [C1 ] = f67, 5 * SIZE + FMA f75 = ALPHA, f75, f22 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f71, 5 * SIZE + FMA f79 = ALPHA, f79, f23 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + FMA f80 = ALPHA, f80, f24 + nop __LINE__ + } + { .mfb + STFD [C10] = f76, SIZE + FMA f84 = ALPHA, f84, f25 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, SIZE + FMA f81 = ALPHA, f81, f26 + nop __LINE__ + } + { .mfb + STFD [C10] = f77, SIZE + FMA f85 = ALPHA, f85, f27 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f74, SIZE + FMA f82 = ALPHA, f82, f28 + nop __LINE__ + } + { .mfb + STFD [C10] = f78, SIZE + FMA f86 = ALPHA, f86, f29 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f75, 5 * SIZE + FMA f83 = ALPHA, f83, f30 + nop __LINE__ + } + { .mfb + STFD [C10] = f79, 5 * SIZE + FMA f87 = ALPHA, f87, f31 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + FMA f88 = ALPHA, f88, f32 + nop __LINE__ + } + { .mfb + STFD [C11] = f84, SIZE + FMA f92 = ALPHA, f92, f33 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, SIZE + FMA f89 = ALPHA, f89, f34 + nop __LINE__ + } + { .mfb + STFD [C11] = f85, SIZE + FMA f93 = ALPHA, f93, f35 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f82, SIZE + FMA f90 = ALPHA, f90, f36 + nop __LINE__ + } + { .mfb + STFD [C11] = f86, SIZE + FMA f94 = ALPHA, f94, f37 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f83, 5 * SIZE + FMA f91 = ALPHA, f91, f38 + nop __LINE__ + } + { .mfb + STFD [C11] = f87, 5 * SIZE + FMA f95 = ALPHA, f95, f39 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f88, SIZE + FMA f96 = ALPHA, f96, f48 + nop __LINE__ + } + { .mfb + STFD [C12] = f92, SIZE + FMA f100 = ALPHA, f100, f49 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f89, SIZE + FMA f97 = ALPHA, f97, f50 + nop __LINE__ + } + { .mfb + STFD [C12] = f93, SIZE + FMA f101 = ALPHA, f101, f51 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f90, SIZE + FMA f98 = ALPHA, f98, f52 + nop __LINE__ + } + { .mfb + STFD [C12] = f94, SIZE + FMA f102 = ALPHA, f102, f53 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f91, 5 * SIZE + FMA f99 = ALPHA, f99, f54 + nop __LINE__ + } + { .mfb + STFD [C12] = f95, 5 * SIZE + FMA f103 = ALPHA, f103, f55 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f96, SIZE + FMA f104 = ALPHA, f104, f40 + nop __LINE__ + } + { .mfb + STFD [C13] = f100, SIZE + FMA f108 = ALPHA, f108, f41 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f97, SIZE + FMA f105 = ALPHA, f105, f42 + nop __LINE__ + } + { .mfb + STFD [C13] = f101, SIZE + FMA f109 = ALPHA, f109, f43 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f98, SIZE + FMA f106 = ALPHA, f106, f44 + nop __LINE__ + } + { .mfb + STFD [C13] = f102, SIZE + FMA f110 = ALPHA, f110, f45 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f99, 5 * SIZE + FMA f107 = ALPHA, f107, f59 + nop __LINE__ + } + { .mfb + STFD [C13] = f103, 5 * SIZE + FMA f111 = ALPHA, f111, f60 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f104, SIZE + FMA f112 = ALPHA, f112, f61 + nop __LINE__ + } + { .mfb + STFD [C14] = f108, SIZE + FMA f116 = ALPHA, f116, f62 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f105, SIZE + FMA f113 = ALPHA, f113, f63 + nop __LINE__ + } + { .mfb + STFD [C14] = f109, SIZE + FMA f117 = ALPHA, f117, f6 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f106, SIZE + FMA f114 = ALPHA, f114, f7 + nop __LINE__ + } + { .mfb + STFD [C14] = f110, SIZE + FMA f118 = ALPHA, f118, f10 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f107, 5 * SIZE + FMA f115 = ALPHA, f115, f11 + nop __LINE__ + } + { .mfb + STFD [C14] = f111, 5 * SIZE + FMA f119 = ALPHA, f119, f12 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f112, SIZE + FMA f120 = ALPHA, f120, f13 + nop __LINE__ + } + { .mfb + STFD [C15] = f116, SIZE + FMA f124 = ALPHA, f124, f14 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f113, SIZE + FMA f121 = ALPHA, f121, f15 + nop __LINE__ + } + { .mfb + STFD [C15] = f117, SIZE + FMA f125 = ALPHA, f125, f16 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f114, SIZE + FMA f122 = ALPHA, f122, f17 + nop __LINE__ + } + { .mfb + STFD [C15] = f118, SIZE + FMA f126 = ALPHA, f126, f18 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f115, 5 * SIZE + FMA f123 = ALPHA, f123, f19 + nop __LINE__ + } + { .mfb + STFD [C15] = f119, 5 * SIZE + FMA f127 = ALPHA, f127, f20 + nop __LINE__ + } + ;; + { .mfb + STFD [C8 ] = f120, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C16] = f124, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C8 ] = f121, SIZE + mov f80 = f0 + nop __LINE__ + } + { .mfb + STFD [C16] = f125, SIZE + mov f88 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C8 ] = f122, SIZE + mov f96 = f0 + nop __LINE__ + } + { .mfb + STFD [C16] = f126, SIZE + mov f104 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C8 ] = f123, 5 * SIZE + mov f112 = f0 + nop __LINE__ + } + { .mfb + STFD [C16] = f127, 5 * SIZE + mov f120 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 + cmp.ne p6, p0 = 1, I + } + { .mfb + nop __LINE__ + FMPY f68 = ALPHA, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f65 = ALPHA, f65 + adds I = -1, I + } + { .mfb + nop __LINE__ + FMPY f69 = ALPHA, f69 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f66 = ALPHA, f66 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f70 = ALPHA, f70 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f67 = ALPHA, f67 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f71 = ALPHA, f71 + nop __LINE__ + } + ;; + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + FMPY f72 = ALPHA, f72 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMPY f76 = ALPHA, f76 + } + ;; + { .mmf + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + FMPY f73 = ALPHA, f73 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMPY f77 = ALPHA, f77 + } + ;; + { .mmf + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + FMPY f74 = ALPHA, f74 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMPY f78 = ALPHA, f78 + } + ;; + { .mfb + STFD [C1 ] = f67, 5 * SIZE + FMPY f75 = ALPHA, f75 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f71, 5 * SIZE + FMPY f79 = ALPHA, f79 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + FMPY f80 = ALPHA, f80 + nop __LINE__ + } + { .mfb + STFD [C10] = f76, SIZE + FMPY f84 = ALPHA, f84 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, SIZE + FMPY f81 = ALPHA, f81 + nop __LINE__ + } + { .mfb + STFD [C10] = f77, SIZE + FMPY f85 = ALPHA, f85 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f74, SIZE + FMPY f82 = ALPHA, f82 + nop __LINE__ + } + { .mfb + STFD [C10] = f78, SIZE + FMPY f86 = ALPHA, f86 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f75, 5 * SIZE + FMPY f83 = ALPHA, f83 + nop __LINE__ + } + { .mfb + STFD [C10] = f79, 5 * SIZE + FMPY f87 = ALPHA, f87 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + FMPY f88 = ALPHA, f88 + nop __LINE__ + } + { .mfb + STFD [C11] = f84, SIZE + FMPY f92 = ALPHA, f92 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, SIZE + FMPY f89 = ALPHA, f89 + nop __LINE__ + } + { .mfb + STFD [C11] = f85, SIZE + FMPY f93 = ALPHA, f93 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f82, SIZE + FMPY f90 = ALPHA, f90 + nop __LINE__ + } + { .mfb + STFD [C11] = f86, SIZE + FMPY f94 = ALPHA, f94 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f83, 5 * SIZE + FMPY f91 = ALPHA, f91 + nop __LINE__ + } + { .mfb + STFD [C11] = f87, 5 * SIZE + FMPY f95 = ALPHA, f95 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f88, SIZE + FMPY f96 = ALPHA, f96 + nop __LINE__ + } + { .mfb + STFD [C12] = f92, SIZE + FMPY f100 = ALPHA, f100 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f89, SIZE + FMPY f97 = ALPHA, f97 + nop __LINE__ + } + { .mfb + STFD [C12] = f93, SIZE + FMPY f101 = ALPHA, f101 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f90, SIZE + FMPY f98 = ALPHA, f98 + nop __LINE__ + } + { .mfb + STFD [C12] = f94, SIZE + FMPY f102 = ALPHA, f102 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f91, 5 * SIZE + FMPY f99 = ALPHA, f99 + nop __LINE__ + } + { .mfb + STFD [C12] = f95, 5 * SIZE + FMPY f103 = ALPHA, f103 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f96, SIZE + FMPY f104 = ALPHA, f104 + nop __LINE__ + } + { .mfb + STFD [C13] = f100, SIZE + FMPY f108 = ALPHA, f108 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f97, SIZE + FMPY f105 = ALPHA, f105 + nop __LINE__ + } + { .mfb + STFD [C13] = f101, SIZE + FMPY f109 = ALPHA, f109 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f98, SIZE + FMPY f106 = ALPHA, f106 + nop __LINE__ + } + { .mfb + STFD [C13] = f102, SIZE + FMPY f110 = ALPHA, f110 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f99, 5 * SIZE + FMPY f107 = ALPHA, f107 + nop __LINE__ + } + { .mfb + STFD [C13] = f103, 5 * SIZE + FMPY f111 = ALPHA, f111 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f104, SIZE + FMPY f112 = ALPHA, f112 + nop __LINE__ + } + { .mfb + STFD [C14] = f108, SIZE + FMPY f116 = ALPHA, f116 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f105, SIZE + FMPY f113 = ALPHA, f113 + nop __LINE__ + } + { .mfb + STFD [C14] = f109, SIZE + FMPY f117 = ALPHA, f117 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f106, SIZE + FMPY f114 = ALPHA, f114 + nop __LINE__ + } + { .mfb + STFD [C14] = f110, SIZE + FMPY f118 = ALPHA, f118 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f107, 5 * SIZE + FMPY f115 = ALPHA, f115 + nop __LINE__ + } + { .mfb + STFD [C14] = f111, 5 * SIZE + FMPY f119 = ALPHA, f119 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f112, SIZE + FMPY f120 = ALPHA, f120 + nop __LINE__ + } + { .mfb + STFD [C15] = f116, SIZE + FMPY f124 = ALPHA, f124 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f113, SIZE + FMPY f121 = ALPHA, f121 + nop __LINE__ + } + { .mfb + STFD [C15] = f117, SIZE + FMPY f125 = ALPHA, f125 + nop __LINE__ + } + ;; + { .mfi + STFD [C7 ] = f114, SIZE + FMPY f122 = ALPHA, f122 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C15] = f118, SIZE + FMPY f126 = ALPHA, f126 + nop __LINE__ + } + ;; + { .mfi + STFD [C7 ] = f115, 5 * SIZE + FMPY f123 = ALPHA, f123 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C15] = f119, 5 * SIZE + FMPY f127 = ALPHA, f127 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C8 ] = f120, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C16] = f124, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C8 ] = f121, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 3, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C16] = f125, SIZE + mov f88 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 3, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C8 ] = f122, SIZE + mov f96 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 8, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C16] = f126, SIZE + mov f104 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C8 ] = f123, 5 * SIZE + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C16] = f127, 5 * SIZE + mov f120 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#endif + +.L020: + { .mfi + cmp.eq p3, p0 = r0, r0 + mov f89 = f0 + tbit.z p6, p7 = M, 2 + } + { .mfb +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 4, KK +#else + adds L = 8, KK +#endif +#endif + mov f81 = f0 + (p6) br.cond.dptk .L030 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfi + LDFPD f48, f49 = [B] + mov f65 = f0 + nop __LINE__ + } + { .mfi + adds BOFFSET = 2 * SIZE, B + mov f73 = f0 + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 3, B + mov f65 = f0 + shladd AOFFSET = KK8, 2, AOFFSET + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + } + ;; +#endif + { .mmf + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + setf.d f97 = r0 + mov f105 = f0 + } + { .mfi + setf.d f113 = r0 + mov f121 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; + { .mmf + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + setf.d f66 = r0 + mov f74 = f0 + } + { .mfi + setf.d f82 = r0 + mov f90 = f0 + tbit.z p12, p0 = L, 0 + } + ;; + { .mmf + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + setf.d f98 = r0 + mov f106 = f0 + } + { .mfi + setf.d f114 = r0 + mov f122 = f0 + shr L = L, 1 + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f75 = f0 + adds L = -1, L + } + { .mmf + setf.d f67 = r0 + setf.d f83 = r0 + mov f91 = f0 + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f107 = f0 + mov ar.lc = L + } + { .mmf + setf.d f99 = r0 + setf.d f115 = r0 + mov f123 = f0 + } + ;; + .align 32 + +.L022: + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + (p5) adds C10 = 2 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + (p5) adds C11 = 2 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + (p5) adds C12 = 2 * SIZE, C4 + } + ;; + { .mfi + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + (p5) adds C13 = 2 * SIZE, C5 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + (p5) adds C14 = 2 * SIZE, C6 + } + ;; + { .mfi + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + (p5) adds C15 = 2 * SIZE, C7 + } + { .mfi + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + (p5) adds C16 = 2 * SIZE, C8 + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f106 = f34, f53, f106 // A3 * B6 + nop __LINE__ + } + + { .mfb + nop __LINE__ + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f122 = f34, f55, f122 // A3 * B8 + nop __LINE__ + } + + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f107 = f35, f53, f107 // A4 * B6 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f123 = f35, f55, f123 // A4 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f70 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f69 = [C1 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f71 = [C9 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f78 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f77 = [C2 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f79 = [C10], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f84 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f86 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f85 = [C3 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f87 = [C11], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f92 = [C4 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f94 = [C12], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f106 = f42, f61, f106 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f93 = [C4 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f95 = [C12], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f122 = f42, f63, f122 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f100 = [C5 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f102 = [C13], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f101 = [C5 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f103 = [C13], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f108 = [C6 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f110 = [C14], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f107 = f43, f61, f107 // A4 * B6 + nop __LINE__ + } + ;; + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f109 = [C6 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f111 = [C14], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f123 = f43, f63, f123 // A4 * B8 + br.cloop.sptk.few .L022 + } + ;; + +.L028: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + LDFD f116 = [C7 ], SIZE + FMA f64 = ALPHA, f64, f68 + nop __LINE__ + } + { .mfb + LDFD f118 = [C15], SIZE + FMA f66 = ALPHA, f66, f70 + nop __LINE__ + } + ;; + { .mfb + LDFD f117 = [C7 ], -1 * SIZE + FMA f65 = ALPHA, f65, f69 + nop __LINE__ + } + { .mfb + LDFD f119 = [C15], -1 * SIZE + FMA f67 = ALPHA, f67, f71 + nop __LINE__ + } + ;; + { .mfb + LDFD f124 = [C8], SIZE + FMA f72 = ALPHA, f72, f76 + nop __LINE__ + } + { .mfb + LDFD f126 = [C16], SIZE + FMA f74 = ALPHA, f74, f78 + nop __LINE__ + } + ;; + { .mfb + LDFD f125 = [C8], -1 * SIZE + FMA f73 = ALPHA, f73, f77 + nop __LINE__ + } + { .mfb + LDFD f127 = [C16], -1 * SIZE + FMA f75 = ALPHA, f75, f79 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMA f80 = ALPHA, f80, f84 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f66, SIZE + FMA f82 = ALPHA, f82, f86 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, 3 * SIZE + FMA f81 = ALPHA, f81, f85 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f67, 3 * SIZE + FMA f83 = ALPHA, f83, f87 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + FMA f88 = ALPHA, f88, f92 + nop __LINE__ + } + { .mfb + STFD [C10] = f74, SIZE + FMA f90 = ALPHA, f90, f94 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, 3 * SIZE + FMA f89 = ALPHA, f89, f93 + nop __LINE__ + } + { .mfb + STFD [C10] = f75, 3 * SIZE + FMA f91 = ALPHA, f91, f95 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + FMA f96 = ALPHA, f96, f100 + nop __LINE__ + } + { .mfb + STFD [C11] = f82, SIZE + FMA f98 = ALPHA, f98, f102 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, 3 * SIZE + FMA f97 = ALPHA, f97, f101 + nop __LINE__ + } + { .mfb + STFD [C11] = f83, 3 * SIZE + FMA f99 = ALPHA, f99, f103 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f88, SIZE + FMA f104 = ALPHA, f104, f108 + nop __LINE__ + } + { .mfb + STFD [C12] = f90, SIZE + FMA f106 = ALPHA, f106, f110 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f89, 3 * SIZE + FMA f105 = ALPHA, f105, f109 + nop __LINE__ + } + { .mfb + STFD [C12] = f91, 3 * SIZE + FMA f107 = ALPHA, f107, f111 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f96, SIZE + FMA f112 = ALPHA, f112, f116 + nop __LINE__ + } + { .mfb + STFD [C13] = f98, SIZE + FMA f114 = ALPHA, f114, f118 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f97, 3 * SIZE + FMA f113 = ALPHA, f113, f117 + nop __LINE__ + } + { .mfb + STFD [C13] = f99, 3 * SIZE + FMA f115 = ALPHA, f115, f119 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f104, SIZE + FMA f120 = ALPHA, f120, f124 + nop __LINE__ + } + { .mfb + STFD [C14] = f106, SIZE + FMA f122 = ALPHA, f122, f126 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f105, 3 * SIZE + FMA f121 = ALPHA, f121, f125 + nop __LINE__ + } + { .mfb + STFD [C14] = f107, 3 * SIZE + FMA f123 = ALPHA, f123, f127 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f112, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C15] = f114, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f113, 3 * SIZE + mov f80 = f0 + nop __LINE__ + } + { .mfb + STFD [C15] = f115, 3 * SIZE + mov f88 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C8 ] = f120, SIZE + mov f96 = f0 + nop __LINE__ + } + { .mfb + STFD [C16] = f122, SIZE + mov f104 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C8 ] = f121, 3 * SIZE + mov f112 = f0 + nop __LINE__ + } + { .mfb + STFD [C16] = f123, 3 * SIZE + mov f120 = f0 + nop __LINE__ + } + ;; +#else + { .mfb + FMPY f64 = ALPHA, f64 + nop __LINE__ + } + { .mfb + FMPY f66 = ALPHA, f66 + nop __LINE__ + } + ;; + { .mfb + FMPY f65 = ALPHA, f65 + nop __LINE__ + } + { .mfb + FMPY f67 = ALPHA, f67 + nop __LINE__ + } + ;; + { .mfb + FMPY f72 = ALPHA, f72 + nop __LINE__ + } + { .mfb + FMPY f74 = ALPHA, f74 + nop __LINE__ + } + ;; + { .mfb + FMPY f73 = ALPHA, f73 + nop __LINE__ + } + { .mfb + FMPY f75 = ALPHA, f75 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMPY f80 = ALPHA, f80 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f66, SIZE + FMPY f82 = ALPHA, f82 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, 3 * SIZE + FMPY f81 = ALPHA, f81 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f67, 3 * SIZE + FMPY f83 = ALPHA, f83 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + FMPY f88 = ALPHA, f88 + nop __LINE__ + } + { .mfb + STFD [C10] = f74, SIZE + FMPY f90 = ALPHA, f90 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, 3 * SIZE + FMPY f89 = ALPHA, f89 + nop __LINE__ + } + { .mfb + STFD [C10] = f75, 3 * SIZE + FMPY f91 = ALPHA, f91 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + FMPY f96 = ALPHA, f96 + nop __LINE__ + } + { .mfb + STFD [C11] = f82, SIZE + FMPY f98 = ALPHA, f98 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, 3 * SIZE + FMPY f97 = ALPHA, f97 + nop __LINE__ + } + { .mfb + STFD [C11] = f83, 3 * SIZE + FMPY f99 = ALPHA, f99 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f88, SIZE + FMPY f104 = ALPHA, f104 + nop __LINE__ + } + { .mfb + STFD [C12] = f90, SIZE + FMPY f106 = ALPHA, f106 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f89, 3 * SIZE + FMPY f105 = ALPHA, f105 + nop __LINE__ + } + { .mfb + STFD [C12] = f91, 3 * SIZE + FMPY f107 = ALPHA, f107 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f96, SIZE + FMPY f112 = ALPHA, f112 + nop __LINE__ + } + { .mfb + STFD [C13] = f98, SIZE + FMPY f114 = ALPHA, f114 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f97, 3 * SIZE + FMPY f113 = ALPHA, f113 + nop __LINE__ + } + { .mfb + STFD [C13] = f99, 3 * SIZE + FMPY f115 = ALPHA, f115 + nop __LINE__ + } + ;; + { .mfi + STFD [C6 ] = f104, SIZE + FMPY f120 = ALPHA, f120 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C14] = f106, SIZE + FMPY f122 = ALPHA, f122 + nop __LINE__ + } + ;; + { .mfi + STFD [C6 ] = f105, 3 * SIZE + FMPY f121 = ALPHA, f121 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C14] = f107, 3 * SIZE + FMPY f123 = ALPHA, f123 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C7 ] = f112, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C15] = f114, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C7 ] = f113, 3 * SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 2, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C15] = f115, 3 * SIZE + mov f88 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 3, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C8 ] = f120, SIZE + mov f96 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C16] = f122, SIZE + mov f104 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C8 ] = f121, 3 * SIZE + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C16] = f123, 3 * SIZE + mov f120 = f0 + nop __LINE__ + } + ;; +#endif + .align 32 + +.L030: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 2, KK +#else + adds L = 8, KK +#endif +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L040 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfi + LDFPD f48, f49 = [B] + mov f65 = f0 + nop __LINE__ + } + { .mfi + adds BOFFSET = 2 * SIZE, B + mov f73 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } +#else + { .mmf + shladd BOFFSET = KK8, 3, B + shladd AOFFSET = KK8, 1, AOFFSET + mov f65 = f0 + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } +#endif + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f81 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f89 = f0 + shr L = L, 1 + } + ;; + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f97 = f0 + adds L = -1, L + } + { .mfi + nop __LINE__ + mov f105 = f0 + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + } + ;; + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + mov f113 = f0 + mov ar.lc = L + } + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f121 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + .align 32 + +.L032: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C2], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f69 = [C1], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f77 = [C2], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f84 = [C3], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f92 = [C4], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f85 = [C3], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f93 = [C4], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + br.cloop.sptk.few .L032 + } + ;; + +.L038: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + LDFD f100 = [C5], SIZE + FMA f64 = ALPHA, f64, f68 + nop __LINE__ + } + { .mfb + LDFD f108 = [C6], SIZE + FMA f65 = ALPHA, f65, f69 + nop __LINE__ + } + ;; + { .mfb + LDFD f101 = [C5], -1 * SIZE + FMA f72 = ALPHA, f72, f76 + nop __LINE__ + } + { .mfb + LDFD f109 = [C6], -1 * SIZE + FMA f73 = ALPHA, f73, f77 + nop __LINE__ + } + ;; + { .mfb + LDFD f116 = [C7], SIZE + FMA f80 = ALPHA, f80, f84 + nop __LINE__ + } + { .mfb + LDFD f124 = [C8], SIZE + FMA f81 = ALPHA, f81, f85 + nop __LINE__ + } + ;; + { .mfb + LDFD f117 = [C7], -1 * SIZE + FMA f88 = ALPHA, f88, f92 + nop __LINE__ + } + { .mfb + LDFD f125 = [C8], -1 * SIZE + FMA f89 = ALPHA, f89, f93 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMA f96 = ALPHA, f96, f100 + nop __LINE__ + } + { .mfb + STFD [C2 ] = f72, SIZE + FMA f104 = ALPHA, f104, f108 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, SIZE + FMA f97 = ALPHA, f97, f101 + nop __LINE__ + } + { .mfb + STFD [C2 ] = f73, SIZE + FMA f105 = ALPHA, f105, f109 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + FMA f112 = ALPHA, f112, f116 + nop __LINE__ + } + { .mfb + STFD [C4 ] = f88, SIZE + FMA f120 = ALPHA, f120, f124 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, SIZE + FMA f113 = ALPHA, f113, f117 + nop __LINE__ + } + { .mfb + STFD [C4 ] = f89, SIZE + FMA f121 = ALPHA, f121, f125 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f96, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C6 ] = f104, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f97, SIZE + mov f80 = f0 + nop __LINE__ + } + { .mfb + STFD [C6 ] = f105, SIZE + mov f88 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f112, SIZE + mov f96 = f0 + nop __LINE__ + } + { .mfb + STFD [C8 ] = f120, SIZE + mov f104 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f113, SIZE + mov f112 = f0 + nop __LINE__ + } + { .mfb + STFD [C8 ] = f121, SIZE + mov f120 = f0 + nop __LINE__ + } + ;; +#else + { .mfb + nop __LINE__ + FMPY f64 = ALPHA, f64 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f65 = ALPHA, f65 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f72 = ALPHA, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f73 = ALPHA, f73 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f80 = ALPHA, f80 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f81 = ALPHA, f81 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f88 = ALPHA, f88 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f89 = ALPHA, f89 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMPY f96 = ALPHA, f96 + nop __LINE__ + } + { .mfb + STFD [C2 ] = f72, SIZE + FMPY f104 = ALPHA, f104 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, SIZE + FMPY f97 = ALPHA, f97 + nop __LINE__ + } + { .mfb + STFD [C2 ] = f73, SIZE + FMPY f105 = ALPHA, f105 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f80, SIZE + FMPY f112 = ALPHA, f112 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C4 ] = f88, SIZE + FMPY f120 = ALPHA, f120 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f81, SIZE + FMPY f113 = ALPHA, f113 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C4 ] = f89, SIZE + FMPY f121 = ALPHA, f121 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C5 ] = f96, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C6 ] = f104, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C5 ] = f97, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 1, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C6 ] = f105, SIZE + mov f88 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 3, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C7 ] = f112, SIZE + mov f96 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C8 ] = f120, SIZE + mov f104 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C7 ] = f113, SIZE + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C8 ] = f121, SIZE + mov f120 = f0 + nop __LINE__ + } + ;; +#endif + .align 32 + +.L040: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 1, KK +#else + adds L = 8, KK +#endif +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L049 + } + ;; + +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } +#else + { .mmi + shladd BOFFSET = KK8, 3, B + add AOFFSET = KK8, AOFFSET + nop __LINE__ + } + ;; + { .mmi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } +#endif + ;; + { .mii + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + LDFD f32 = [AOFFSET], 1 * SIZE + adds L = -1, L + } + ;; + { .mmi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + { .mmi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + nop __LINE__ + } + ;; + .align 32 + +.L042: + { .mfb + lfetch.nt1 [PREB], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1] +#else + nop __LINE__ +#endif + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C2] +#else + nop __LINE__ +#endif + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f84 = [C3] +#else + nop __LINE__ +#endif + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f92 = [C4] +#else + nop __LINE__ +#endif + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + adds L = -1, L + } + { .mmb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f100 = [C5] + (p5) LDFD f108 = [C6] +#else + nop __LINE__ + nop __LINE__ +#endif + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + { .mmb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f116 = [C7] + (p5) LDFD f124 = [C8] +#else + nop __LINE__ + nop __LINE__ +#endif + br.cloop.sptk.few .L042 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 + FMA f72 = ALPHA, f72, f76 + FMA f80 = ALPHA, f80, f84 + FMA f88 = ALPHA, f88, f92 + + FMA f96 = ALPHA, f96, f100 + FMA f104 = ALPHA, f104, f108 + FMA f112 = ALPHA, f112, f116 + FMA f120 = ALPHA, f120, f124 + ;; + STFD [C1 ] = f64, SIZE + mov f64 = f0 + STFD [C2 ] = f72, SIZE + mov f72 = f0 + ;; + STFD [C3 ] = f80, SIZE + mov f80 = f0 + STFD [C4 ] = f88, SIZE + mov f88 = f0 + ;; + STFD [C5 ] = f96, SIZE + mov f96 = f0 + STFD [C6 ] = f104, SIZE + mov f104 = f0 + ;; + STFD [C7 ] = f112, SIZE + mov f112 = f0 + STFD [C8 ] = f120, SIZE + mov f120 = f0 + ;; +#else + FMPY f64 = ALPHA, f64 + FMPY f72 = ALPHA, f72 + FMPY f80 = ALPHA, f80 + FMPY f88 = ALPHA, f88 + + { .mfi + FMPY f96 = ALPHA, f96 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f104 = ALPHA, f104 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f112 = ALPHA, f112 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f120 = ALPHA, f120 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C2 ] = f72, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f80, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add AOFFSET = KK8, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C4 ] = f88, SIZE + mov f88 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 3, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C5 ] = f96, SIZE + mov f96 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 1, KK +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C6 ] = f104, SIZE + mov f104 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C7 ] = f112, SIZE + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C8 ] = f120, SIZE + mov f120 = f0 + nop __LINE__ + } + ;; +#endif + .align 32 + +.L049: + { .mmi + mov B = BOFFSET + mov AOFFSET = A +#if defined(TRMMKERNEL) && !defined(LEFT) + adds KK = 8, KK +#else + nop __LINE__ +#endif + } + ;; + { .mib + cmp.lt p6, p0 = 0, J + shr BB = K, 3 + (p6) br.cond.dptk .L010 + } + ;; + .align 32 + +.L050: + { .mfi + mov C1 = C + mov f64 = f0 + tbit.z p6, p0 = N, 2 + } + { .mfi + add C2 = LDC, C + mov f72 = f0 + shr I = M, 3 + } + ;; + { .mfi + shladd C3 = LDC, 1, C + mov f80 = f0 + nop __LINE__ + } + { .mfb + mov AOFFSET = A + mov f88 = f0 + (p6) br.cond.dpnt .L090 + } + ;; + { .mfi + cmp.eq p6, p7 = 0, I + mov f65 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + { .mfi + shladd C4 = LDC, 1, C2 + mov f73 = f0 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + mov f81 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + shladd C = LDC, 2, C + mov f89 = f0 + (p6) br.cond.dpnt .L060 + } + ;; + .align 32 + +.L052: +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfb + LDFPD f48, f49 = [B] + mov f66 = f0 + nop __LINE__ + } + { .mfb + adds BOFFSET = 2 * SIZE, B + mov f74 = f0 + nop __LINE__ + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 2, B + mov f66 = f0 + shladd AOFFSET = KK8, 3, AOFFSET + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f74 = f0 + nop __LINE__ + } + ;; +#endif + ;; + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 8, KK +#else + adds L = 4, KK +#endif +#endif + } + { .mfi + setf.d f84 = r0 + mov f90 = f0 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f67 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f75 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; + { .mfi + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + mov f83 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + setf.d f91 = r0 + mov f68 = f0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f76 = f0 + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + { .mfi + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + mov f92 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f69 = f0 + shr L = L, 1 + } + { .mmf + setf.d f77 = r0 + setf.d f85 = r0 + mov f93 = f0 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f70 = f0 + adds L = -1, L + } + { .mmf + setf.d f78 = r0 + setf.d f86 = r0 + mov f94 = f0 + } + ;; + { .mfi + CPREFETCH [PREC] + mov f71 = f0 + mov ar.lc = L + } + { .mmf + setf.d f79 = r0 + setf.d f87 = r0 + mov f95 = f0 + } + ;; + .align 32 + +.L053: + { .mfb + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + adds C9 = 4 * SIZE, C1 + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C10 = 4 * SIZE, C2 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + adds C12 = 4 * SIZE, C4 + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f92 = f36, f51, f92 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f93 = f37, f51, f93 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f94 = f38, f51, f94 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f95 = f39, f51, f95 // A8 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f96 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f97 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f98 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f99 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f100 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f101 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f102 = [C1 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f103 = [C9 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f104 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f105 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f106 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f107 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f92 = f44, f59, f92 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f108 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f109 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f110 = [C2 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f111 = [C10], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f93 = f45, f59, f93 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f112 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f113 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f114 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f115 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f94 = f46, f59, f94 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f116 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f117 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f118 = [C3 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f119 = [C11], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f95 = f47, f59, f95 // A8 * B4 + br.cloop.sptk.few .L053 + } + ;; + .align 32 + +.L058: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfi + LDFD f120 = [C4 ], SIZE + FMA f64 = ALPHA, f64, f96 + cmp.ne p6, p0 = 1, I + } + { .mfb + LDFD f121 = [C12], SIZE + FMA f68 = ALPHA, f68, f97 + nop __LINE__ + } + ;; + { .mfi + LDFD f122 = [C4 ], SIZE + FMA f65 = ALPHA, f65, f98 + adds I = -1, I + } + { .mfb + LDFD f123 = [C12], SIZE + FMA f69 = ALPHA, f69, f99 + nop __LINE__ + } + ;; + { .mfb + LDFD f124 = [C4 ], SIZE + FMA f66 = ALPHA, f66, f100 + nop __LINE__ + } + { .mfb + LDFD f125 = [C12], SIZE + FMA f70 = ALPHA, f70, f101 + nop __LINE__ + } + ;; + { .mfb + LDFD f126 = [C4 ], -3 * SIZE + FMA f67 = ALPHA, f67, f102 + nop __LINE__ + } + { .mfb + LDFD f127 = [C12], -3 * SIZE + FMA f71 = ALPHA, f71, f103 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMA f72 = ALPHA, f72, f104 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f68, SIZE + FMA f76 = ALPHA, f76, f105 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, SIZE + FMA f73 = ALPHA, f73, f106 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f69, SIZE + FMA f77 = ALPHA, f77, f107 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f66, SIZE + FMA f74 = ALPHA, f74, f108 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f70, SIZE + FMA f78 = ALPHA, f78, f109 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f67, 5 * SIZE + FMA f75 = ALPHA, f75, f110 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f71, 5 * SIZE + FMA f79 = ALPHA, f79, f111 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + FMA f80 = ALPHA, f80, f112 + nop __LINE__ + } + { .mfb + STFD [C10] = f76, SIZE + FMA f84 = ALPHA, f84, f113 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, SIZE + FMA f81 = ALPHA, f81, f114 + nop __LINE__ + } + { .mfb + STFD [C10] = f77, SIZE + FMA f85 = ALPHA, f85, f115 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f74, SIZE + FMA f82 = ALPHA, f82, f116 + nop __LINE__ + } + { .mfb + STFD [C10] = f78, SIZE + FMA f86 = ALPHA, f86, f117 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f75, 5 * SIZE + FMA f83 = ALPHA, f83, f118 + nop __LINE__ + } + { .mfb + STFD [C10] = f79, 5 * SIZE + FMA f87 = ALPHA, f87, f119 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + FMA f88 = ALPHA, f88, f120 + nop __LINE__ + } + { .mfb + STFD [C11] = f84, SIZE + FMA f92 = ALPHA, f92, f121 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, SIZE + FMA f89 = ALPHA, f89, f122 + nop __LINE__ + } + { .mfb + STFD [C11] = f85, SIZE + FMA f93 = ALPHA, f93, f123 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f82, SIZE + FMA f90 = ALPHA, f90, f124 + nop __LINE__ + } + { .mfb + STFD [C11] = f86, SIZE + FMA f94 = ALPHA, f94, f125 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f83, 5 * SIZE + FMA f91 = ALPHA, f91, f126 + nop __LINE__ + } + { .mfb + STFD [C11] = f87, 5 * SIZE + FMA f95 = ALPHA, f95, f127 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f88, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C12] = f92, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f89, SIZE + mov f80 = f0 + nop __LINE__ + } + { .mfb + STFD [C12] = f93, SIZE + mov f88 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f90, SIZE + mov f65 = f0 + nop __LINE__ + } + { .mfb + STFD [C12] = f94, SIZE + mov f73 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f91, 5 * SIZE + mov f81 = f0 + nop __LINE__ + } + { .mfb + STFD [C12] = f95, 5 * SIZE + mov f89 = f0 + (p6) br.cond.dptk .L052 + } + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 + cmp.ne p6, p0 = 1, I + } + { .mfb + nop __LINE__ + FMPY f68 = ALPHA, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f65 = ALPHA, f65 + adds I = -1, I + } + { .mfb + nop __LINE__ + FMPY f69 = ALPHA, f69 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f66 = ALPHA, f66 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f70 = ALPHA, f70 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f67 = ALPHA, f67 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f71 = ALPHA, f71 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMPY f72 = ALPHA, f72 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f68, SIZE + FMPY f76 = ALPHA, f76 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, SIZE + FMPY f73 = ALPHA, f73 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f69, SIZE + FMPY f77 = ALPHA, f77 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f66, SIZE + FMPY f74 = ALPHA, f74 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f70, SIZE + FMPY f78 = ALPHA, f78 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f67, 5 * SIZE + FMPY f75 = ALPHA, f75 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f71, 5 * SIZE + FMPY f79 = ALPHA, f79 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + FMPY f80 = ALPHA, f80 + nop __LINE__ + } + { .mfb + STFD [C10] = f76, SIZE + FMPY f84 = ALPHA, f84 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, SIZE + FMPY f81 = ALPHA, f81 + nop __LINE__ + } + { .mfb + STFD [C10] = f77, SIZE + FMPY f85 = ALPHA, f85 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f74, SIZE + FMPY f82 = ALPHA, f82 + nop __LINE__ + } + { .mfb + STFD [C10] = f78, SIZE + FMPY f86 = ALPHA, f86 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f75, 5 * SIZE + FMPY f83 = ALPHA, f83 + nop __LINE__ + } + { .mfb + STFD [C10] = f79, 5 * SIZE + FMPY f87 = ALPHA, f87 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + FMPY f88 = ALPHA, f88 + nop __LINE__ + } + { .mfb + STFD [C11] = f84, SIZE + FMPY f92 = ALPHA, f92 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, SIZE + FMPY f89 = ALPHA, f89 + nop __LINE__ + } + { .mfb + STFD [C11] = f85, SIZE + FMPY f93 = ALPHA, f93 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f82, SIZE + FMPY f90 = ALPHA, f90 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C11] = f86, SIZE + FMPY f94 = ALPHA, f94 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f83, 5 * SIZE + FMPY f91 = ALPHA, f91 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C11] = f87, 5 * SIZE + FMPY f95 = ALPHA, f95 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C4 ] = f88, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C12] = f92, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f89, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 3, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C12] = f93, SIZE + mov f88 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C4 ] = f90, SIZE + mov f65 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 8, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C12] = f94, SIZE + mov f73 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f91, 5 * SIZE + mov f81 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C12] = f95, 5 * SIZE + mov f89 = f0 + (p6) br.cond.dptk .L052 + } + ;; +#endif + .align 32 + +.L060: + { .mfi + nop __LINE__ + mov f66 = f0 + tbit.z p6, p7 = M, 2 + } + { .mfb +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 4, KK +#else + adds L = 4, KK +#endif +#endif + mov f74 = f0 + (p6) br.cond.dptk .L070 + } + ;; + +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfb + LDFPD f48, f49 = [B] + mov f82 = f0 + nop __LINE__ + } + { .mfi + adds BOFFSET = 2 * SIZE, B + mov f90 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 2, B + mov f82 = f0 + shladd AOFFSET = KK8, 2, AOFFSET + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f90 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#endif + ;; + { .mii + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f67 = f0 + adds L = -1, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + mov f75 = f0 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 + mov ar.lc = L + } + { .mfi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov f91 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + .align 32 + +.L062: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + (p5) adds C10 = 2 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + (p5) adds C11 = 2 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + (p5) adds C12 = 2 * SIZE, C4 + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f70 = [C9 ], SIZE +#else + nop __LINE__ +#endif + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f69 = [C1 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f71 = [C9 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f78 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f77 = [C2 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f79 = [C10], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f84 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f86 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f85 = [C3 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f87 = [C11], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f92 = [C4 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f94 = [C12], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f93 = [C4 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f95 = [C12], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + br.cloop.sptk.few .L062 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 + FMA f66 = ALPHA, f66, f70 + FMA f65 = ALPHA, f65, f69 + FMA f67 = ALPHA, f67, f71 + FMA f72 = ALPHA, f72, f76 + FMA f74 = ALPHA, f74, f78 + FMA f73 = ALPHA, f73, f77 + FMA f75 = ALPHA, f75, f79 + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMA f80 = ALPHA, f80, f84 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f66, SIZE + FMA f82 = ALPHA, f82, f86 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, 3 * SIZE + FMA f81 = ALPHA, f81, f85 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f67, 3 * SIZE + FMA f83 = ALPHA, f83, f87 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + FMA f88 = ALPHA, f88, f92 + nop __LINE__ + } + { .mfb + STFD [C10] = f74, SIZE + FMA f90 = ALPHA, f90, f94 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, 3 * SIZE + FMA f89 = ALPHA, f89, f93 + nop __LINE__ + } + { .mfb + STFD [C10] = f75, 3 * SIZE + FMA f91 = ALPHA, f91, f95 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + mov f80 = f0 + nop __LINE__ + } + { .mfb + STFD [C11] = f82, SIZE + mov f64 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, 3 * SIZE + mov f81 = f0 + nop __LINE__ + } + { .mfb + STFD [C11] = f83, 3 * SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f88, SIZE + mov f88 = f0 + adds L = 1, K + } + { .mfb + STFD [C12] = f90, SIZE + mov f65 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f89, 3 * SIZE + mov f89 = f0 + shr L = L, 1 + } + { .mfb + STFD [C12] = f91, 3 * SIZE + mov f73 = f0 + nop __LINE__ + } + ;; +#else + FMPY f64 = ALPHA, f64 + FMPY f66 = ALPHA, f66 + FMPY f65 = ALPHA, f65 + FMPY f67 = ALPHA, f67 + FMPY f72 = ALPHA, f72 + FMPY f74 = ALPHA, f74 + FMPY f73 = ALPHA, f73 + FMPY f75 = ALPHA, f75 + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMPY f80 = ALPHA, f80 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f66, SIZE + FMPY f82 = ALPHA, f82 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, 3 * SIZE + FMPY f81 = ALPHA, f81 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f67, 3 * SIZE + FMPY f83 = ALPHA, f83 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f72, SIZE + FMPY f88 = ALPHA, f88 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C10] = f74, SIZE + FMPY f90 = ALPHA, f90 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f73, 3 * SIZE + FMPY f89 = ALPHA, f89 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C10] = f75, 3 * SIZE + FMPY f91 = ALPHA, f91 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C3 ] = f80, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C11] = f82, SIZE + mov f64 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f81, 3 * SIZE + mov f81 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 2, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C11] = f83, 3 * SIZE + mov f72 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C4 ] = f88, SIZE + mov f88 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C12] = f90, SIZE + mov f65 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f89, 3 * SIZE + mov f89 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C12] = f91, 3 * SIZE + mov f73 = f0 + nop __LINE__ + } + ;; +#endif + .align 32 + +.L070: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 2, KK +#else + adds L = 4, KK +#endif +#endif + tbit.z p6,p7 = M, 1 + (p6) br.cond.dptk .L080 + } + ;; + +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + shladd BOFFSET = KK8, 2, B + shladd AOFFSET = KK8, 1, AOFFSET + nop __LINE__ + } + ;; + { .mmi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + nop __LINE__ + } + ;; +#endif + { .mii + cmp.eq p3, p0 = r0, r0 + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + adds L = -1, L + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + ;; + { .mmi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov ar.lc = L + } + ;; + .align 32 + +.L072: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + } + { .mmf +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1 ], SIZE + (p5) LDFD f76 = [C2 ], SIZE +#else + nop __LINE__ + nop __LINE__ +#endif + FMA f89 = f33, f51, f89 // A2 * B4 + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mmf +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f69 = [C1 ], -1 * SIZE + (p5) LDFD f77 = [C2 ], -1 * SIZE +#else + nop __LINE__ + nop __LINE__ +#endif + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mmf +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f84 = [C3 ], SIZE + (p5) LDFD f92 = [C4 ], SIZE +#else + nop __LINE__ + nop __LINE__ +#endif + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f85 = [C3 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f93 = [C4 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + br.cloop.sptk.few .L072 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 + FMA f65 = ALPHA, f65, f69 + FMA f72 = ALPHA, f72, f76 + FMA f73 = ALPHA, f73, f77 + + FMA f80 = ALPHA, f80, f84 + FMA f81 = ALPHA, f81, f85 + FMA f88 = ALPHA, f88, f92 + FMA f89 = ALPHA, f89, f93 + ;; + { .mfb + STFD [C1 ] = f64, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C2 ] = f72, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C2 ] = f73, SIZE + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f80, SIZE + mov f80 = f0 + adds L = 1, K + } + { .mfb + STFD [C4 ] = f88, SIZE + mov f88 = f0 + nop __LINE__ + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + STFD [C4 ] = f89, SIZE + shr L = L, 1 + } + ;; +#else + FMPY f64 = ALPHA, f64 + FMPY f65 = ALPHA, f65 + ;; + { .mfi + nop __LINE__ + FMPY f72 = ALPHA, f72 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f73 = ALPHA, f73 + nop __LINE__ + } + ;; + { .mfi + FMPY f80 = ALPHA, f80 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + { .mfi + FMPY f81 = ALPHA, f81 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + nop __LINE__ + FMPY f88 = ALPHA, f88 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f89 = ALPHA, f89 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 1, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C2 ] = f72, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C2 ] = f73, SIZE +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C3 ] = f80, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C4 ] = f88, SIZE + mov f88 = f0 + nop __LINE__ + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + STFD [C4 ] = f89, SIZE +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; +#endif + .align 32 + +.L080: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 1, KK +#else + adds L = 4, KK +#endif +#endif + tbit.z p6,p7 = M, 0 + (p6) br.cond.dptk .L089 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + shladd BOFFSET = KK8, 2, B + add AOFFSET = KK8, AOFFSET + nop __LINE__ + } + ;; + { .mmi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + nop __LINE__ + } + ;; +#endif + + { .mii + LDFD f32 = [AOFFSET], 1 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + nop __LINE__ + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + .align 32 + +.L082: + { .mfb + cmp.ne p4, p5 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1] +#else + nop __LINE__ +#endif + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mmf + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + } + { .mmf +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C2] + (p5) LDFD f84 = [C3] +#else + nop __LINE__ + nop __LINE__ +#endif + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + } + ;; + { .mib + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + { .mmb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f92 = [C4] +#else + nop __LINE__ +#endif + adds L = -1, L + br.cloop.sptk.few .L082 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 + FMA f72 = ALPHA, f72, f76 + FMA f80 = ALPHA, f80, f84 + FMA f88 = ALPHA, f88, f92 + ;; + STFD [C1 ] = f64, SIZE + STFD [C2 ] = f72, SIZE + STFD [C3 ] = f80, SIZE + STFD [C4 ] = f88, SIZE + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f72 = ALPHA, f72 + nop __LINE__ + } + ;; + { .mfi + FMPY f80 = ALPHA, f80 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif + } + { .mfi + FMPY f88 = ALPHA, f88 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add AOFFSET = KK8, AOFFSET +#else + nop __LINE__ +#endif +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi + STFD [C1 ] = f64, SIZE + STFD [C2 ] = f72, SIZE +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 1, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi + STFD [C3 ] = f80, SIZE + STFD [C4 ] = f88, SIZE +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; +#endif + .align 32 + +.L089: + { .mmi + mov B = BOFFSET + mov AOFFSET = A +#if defined(TRMMKERNEL) && !defined(LEFT) + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + ;; + .align 16 + +.L090: + { .mfi + mov C1 = C + mov f64 = f0 + tbit.z p6, p0 = N, 1 + } + { .mfi + add C2 = LDC, C + mov f72 = f0 + shr I = M, 3 + } + ;; + { .mfi + setf.d f66 = r0 + mov f65 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + { .mfb + mov AOFFSET = A + mov f73 = f0 + (p6) br.cond.dpnt .L130 + } + ;; + { .mfi +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + mov f67 = f0 + shladd C = LDC, 1, C + } + { .mfb + cmp.eq p6, p7 = 0, I + mov f74 = f0 + (p6) br.cond.dpnt .L100 + } + ;; + .align 32 + +.L092: +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfb + LDFPD f48, f49 = [B] + mov f68 = f0 + nop __LINE__ + } + { .mfb + adds BOFFSET = 2 * SIZE, B + mov f79 = f0 + nop __LINE__ + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 1, B + mov f68 = f0 + shladd AOFFSET = KK8, 3, AOFFSET + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f79 = f0 + nop __LINE__ + } + ;; +#endif + + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f75 = f0 +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 8, KK +#else + adds L = 2, KK +#endif +#endif + } + ;; + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + mov f76 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f69 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + cmp.eq p3, p0 = r0, r0 + mov f77 = f0 + shr L = L, 1 + } + ;; + { .mfi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds L = -1, L + } + { .mmf + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + CPREFETCH [PREC], LDC + mov f70 = f0 + } + ;; + { .mfi + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + mov f78 = f0 + mov ar.lc = L + } + { .mfi + CPREFETCH [PREC] + mov f71 = f0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + ;; + .align 32 + +.L093: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C9 = 4 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C10 = 4 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + adds C11 = 4 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + adds C12 = 4 * SIZE, C4 + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f96 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f97 = [C9 ], SIZE +#else + nop __LINE__ +#endif + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f98 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f99 = [C9 ], SIZE +#else + nop __LINE__ +#endif + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f100 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f101 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f102 = [C1 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f103 = [C9 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f104 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f105 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f106 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f107 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f108 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f109 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f110 = [C2 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f111 = [C10], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + br.cloop.sptk.few .L093 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfi + nop __LINE__ + FMA f64 = ALPHA, f64, f96 + cmp.ne p6, p0 = 1, I + } + { .mfb + nop __LINE__ + FMA f68 = ALPHA, f68, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA f65 = ALPHA, f65, f98 + adds I = -1, I + } + { .mfb + nop __LINE__ + FMA f69 = ALPHA, f69, f99 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA f66 = ALPHA, f66, f100 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f70 = ALPHA, f70, f101 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f67 = ALPHA, f67, f102 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f71 = ALPHA, f71, f103 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMA f72 = ALPHA, f72, f104 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f68, SIZE + FMA f76 = ALPHA, f76, f105 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, SIZE + FMA f73 = ALPHA, f73, f106 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f69, SIZE + FMA f77 = ALPHA, f77, f107 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f66, SIZE + FMA f74 = ALPHA, f74, f108 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f70, SIZE + FMA f78 = ALPHA, f78, f109 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f67, 5 * SIZE + FMA f75 = ALPHA, f75, f110 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f71, 5 * SIZE + FMA f79 = ALPHA, f79, f111 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C10] = f76, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, SIZE + mov f65 = f0 + nop __LINE__ + } + { .mfb + STFD [C10] = f77, SIZE + mov f73 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f74, SIZE + mov f66 = f0 + nop __LINE__ + } + { .mfb + STFD [C10] = f78, SIZE + mov f74 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f75, 5 * SIZE + mov f67 = f0 + nop __LINE__ + } + { .mfb + STFD [C10] = f79, 5 * SIZE + (p6) br.cond.dptk .L092 + } + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 + cmp.ne p6, p0 = 1, I + } + { .mfb + nop __LINE__ + FMPY f68 = ALPHA, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f65 = ALPHA, f65 + adds I = -1, I + } + { .mfb + nop __LINE__ + FMPY f69 = ALPHA, f69 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f66 = ALPHA, f66 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f70 = ALPHA, f70 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f67 = ALPHA, f67 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f71 = ALPHA, f71 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMPY f72 = ALPHA, f72 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f68, SIZE + FMPY f76 = ALPHA, f76 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, SIZE + FMPY f73 = ALPHA, f73 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f69, SIZE + FMPY f77 = ALPHA, f77 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + FMPY f74 = ALPHA, f74 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C9 ] = f70, SIZE + FMPY f78 = ALPHA, f78 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, 5 * SIZE + FMPY f75 = ALPHA, f75 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C9 ] = f71, 5 * SIZE + FMPY f79 = ALPHA, f79 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C2 ] = f72, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C10] = f76, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f73, SIZE + mov f65 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 3, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C10] = f77, SIZE + mov f73 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C2 ] = f74, SIZE + mov f66 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 8, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C10] = f78, SIZE + mov f74 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f75, 5 * SIZE + mov f67 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mib + STFD [C10] = f79, 5 * SIZE + nop __LINE__ + (p6) br.cond.dptk .L092 + } + ;; +#endif + .align 32 + +.L100: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 4, KK +#else + adds L = 2, KK +#endif +#endif + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L110 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmf + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B + mov f75 = f0 + } + { .mii + nop __LINE__ +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 1, B + mov f75 = f0 + shladd AOFFSET = KK8, 2, AOFFSET + } + ;; + { .mmi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ + adds L = 1, L + } + ;; +#endif + ;; + { .mii + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + .align 32 + +.L102: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C10 = 2 * SIZE, C2 + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f70 = [C9 ], SIZE +#else + nop __LINE__ +#endif + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f69 = [C1 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f71 = [C9 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f78 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + br.cloop.sptk.few .L102 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + LDFD f77 = [C2 ], -1 * SIZE + FMA f64 = ALPHA, f64, f68 + nop __LINE__ + } + { .mfb + LDFD f79 = [C10], -1 * SIZE + FMA f66 = ALPHA, f66, f70 + nop __LINE__ + } + ;; + FMA f65 = ALPHA, f65, f69 + adds L = 1, K + FMA f67 = ALPHA, f67, f71 + ;; + FMA f72 = ALPHA, f72, f76 + shr L = L, 1 + FMA f74 = ALPHA, f74, f78 + FMA f73 = ALPHA, f73, f77 + FMA f75 = ALPHA, f75, f79 + ;; + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f66, SIZE + mov f64 = f0 + } + ;; + { .mmf + STFD [C1 ] = f65, 3 * SIZE + STFD [C9 ] = f67, 3 * SIZE + mov f65 = f0 + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + STFD [C10] = f74, SIZE + mov f72 = f0 + } + ;; + { .mmf + STFD [C2 ] = f73, 3 * SIZE + STFD [C10] = f75, 3 * SIZE + mov f73 = f0 + } + ;; +#else + { .mfb + nop __LINE__ + FMPY f64 = ALPHA, f64 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f66 = ALPHA, f66 + nop __LINE__ + } + ;; + FMPY f65 = ALPHA, f65 + FMPY f67 = ALPHA, f67 + ;; + { .mfi + nop __LINE__ + FMPY f72 = ALPHA, f72 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f74 = ALPHA, f74 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f73 = ALPHA, f73 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f75 = ALPHA, f75 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mmi + STFD [C9 ] = f66, SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, 3 * SIZE + mov f65 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 2, AOFFSET +#else + nop __LINE__ +#endif + } + { .mmi + STFD [C9 ] = f67, 3 * SIZE + nop __LINE__ +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C2 ] = f72, SIZE + mov f72 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + { .mmi + STFD [C10] = f74, SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f73, 3 * SIZE + mov f73 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mib + STFD [C10] = f75, 3 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; +#endif + + .align 32 + +.L110: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 2, KK +#else + adds L = 2, KK +#endif +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L120 + } + ;; + +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + shladd BOFFSET = KK8, 1, B + shladd AOFFSET = KK8, 1, AOFFSET + } + ;; + { .mmi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ + adds L = 1, L + } + ;; +#endif + ;; + { .mii + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + cmp.eq p3, p0 = r0, r0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + mov ar.lc = L + } + ;; + .align 32 + +.L112: + { .mfi + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mmf +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1 ], SIZE + (p5) LDFD f76 = [C2 ], SIZE +#else + nop __LINE__ + nop __LINE__ +#endif + FMA f73 = f33, f49, f73 // A2 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f69 = [C1 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f77 = [C2 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + br.cloop.sptk.few .L112 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 + FMA f65 = ALPHA, f65, f69 + FMA f72 = ALPHA, f72, f76 + FMA f73 = ALPHA, f73, f77 + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C2 ] = f72, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + mov f65 = f0 + nop __LINE__ + } + { .mfb + STFD [C2 ] = f73, SIZE + mov f73 = f0 + nop __LINE__ + } + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f65 = ALPHA, f65 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f72 = ALPHA, f72 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f73 = ALPHA, f73 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 1, AOFFSET +#else + nop __LINE__ +#endif +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C2 ] = f72, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + mov f65 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C2 ] = f73, SIZE + mov f73 = f0 + nop __LINE__ + } + ;; +#endif + .align 32 + +.L120: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 1, KK +#else + adds L = 2, KK +#endif +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L129 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + shladd BOFFSET = KK8, 1, B + add AOFFSET = KK8, AOFFSET + } + ;; + { .mmi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ + adds L = 1, L + } + ;; +#endif + { .mii + nop __LINE__ + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFD f32 = [AOFFSET], 1 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + cmp.eq p3, p0 = r0, r0 + nop __LINE__ + mov ar.lc = L + } + ;; + .align 32 + +.L122: + { .mfi + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + nop __LINE__ + } + { .mmi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1] + (p5) LDFD f76 = [C2] +#else + nop __LINE__ + nop __LINE__ +#endif + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + adds L = -1, L + } + { .mfb + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + br.cloop.sptk.few .L122 + } + ;; + +.L128: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 + FMA f72 = ALPHA, f72, f76 + ;; + { .mfi + STFD [C1 ] = f64 + mov f64 = f0 + } + { .mfb + STFD [C2 ] = f72 + mov f72 = f0 + } + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f72 = ALPHA, f72 + nop __LINE__ + } + ;; + { .mmi + nop __LINE__ +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add AOFFSET = KK8, AOFFSET +#else + nop __LINE__ +#endif +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 1, KK +#else + nop __LINE__ +#endif + ;; + { .mfi + STFD [C1 ] = f64 + mov f64 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C2 ] = f72 + mov f72 = f0 + } + ;; +#endif + .align 32 + +.L129: + { .mmi + mov B = BOFFSET + mov AOFFSET = A +#if defined(TRMMKERNEL) && !defined(LEFT) + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + ;; + .align 16 + +.L130: + { .mfi +#if defined(TRMMKERNEL) && defined(LEFT) + mov KK = OFFSET +#else + nop __LINE__ +#endif + mov f64 = f0 + tbit.z p6, p0 = N, 0 + } + { .mib + mov AOFFSET = A + shr I = M, 3 + (p6) br.cond.dpnt .L999 + } + ;; + { .mfi + mov C1 = C + mov f65 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + nop __LINE__ + mov f66 = f0 + nop __LINE__ + } + { .mfb + cmp.eq p7, p0 = 0, I + mov f67 = f0 + (p7) br.cond.dpnt .L140 + } + ;; + .align 32 + +.L132: +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfb + LDFD f48 = [B] + mov f68 = f0 + nop __LINE__ + } + { .mfi + adds BOFFSET = 1 * SIZE, B + mov f69 = f0 +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 8, KK +#else + adds L = 1, KK +#endif +#endif + } + ;; +#else + { .mfi + add BOFFSET = KK8, B + mov f68 = f0 + shladd AOFFSET = KK8, 3, AOFFSET + } + ;; + { .mfi + LDFD f48 = [BOFFSET], 1 * SIZE + mov f69 = f0 +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 8, KK +#else + adds L = 1, KK +#endif +#endif + } + ;; +#endif + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f70 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; + { .mii + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mfi + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + mov f71 = f0 + adds L = -1, L + } + ;; + { .mmi + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + adds PREC = CPREFETCHSIZE * SIZE, C1 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmi + CPREFETCH [PREC] + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov ar.lc = L + } + ;; + .align 32 + +.L133: + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f65 = f33, f48, f65 // A2 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + adds C9 = 4 * SIZE, C1 + } + { .mmf + (p3) LDFD f56 = [BOFFSET], 1 * SIZE +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f6 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f67 = f35, f48, f67 // A4 * B1 + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f7 = [C9 ], SIZE +#else + nop __LINE__ +#endif + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f10 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f11 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mmf + (p4) LDFD f48 = [BOFFSET], 1 * SIZE +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f12 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f13 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f14 = [C1 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f15 = [C9 ], -3 * SIZE +#else + nop __LINE__ +#endif + nop __LINE__ + br.cloop.sptk.few .L133 + } + ;; + +.L138: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfi + FMA f64 = ALPHA, f64, f6 + cmp.ne p6, p0 = 1, I + } + { .mfb + FMA f68 = ALPHA, f68, f7 + } + ;; + { .mfi + FMA f65 = ALPHA, f65, f10 + adds I = -1, I + } + { .mfb + FMA f69 = ALPHA, f69, f11 + } + ;; + { .mfi + FMA f66 = ALPHA, f66, f12 + } + { .mfb + FMA f70 = ALPHA, f70, f13 + } + ;; + { .mfb + FMA f67 = ALPHA, f67, f14 + } + { .mfb + FMA f71 = ALPHA, f71, f15 + } + ;; + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + mov f64 = f0 + } + ;; + { .mmf + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + mov f65 = f0 + } + ;; + { .mmf + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + mov f66 = f0 + } + ;; + { .mmf + STFD [C1 ] = f67, 5 * SIZE + nop __LINE__ + mov f67 = f0 + } + { .mmb + STFD [C9 ] = f71, 5 * SIZE + nop __LINE__ + (p6) br.cond.dptk .L132 + } + ;; +#else + { .mfi + FMPY f64 = ALPHA, f64 + cmp.ne p6, p0 = 1, I + } + { .mfb + FMPY f68 = ALPHA, f68 + } + ;; + { .mfi + FMPY f65 = ALPHA, f65 + adds I = -1, I + } + { .mfb + FMPY f69 = ALPHA, f69 + } + ;; + { .mfi + FMPY f66 = ALPHA, f66 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + FMPY f70 = ALPHA, f70 + } + ;; + { .mfi + FMPY f67 = ALPHA, f67 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + { .mfi + FMPY f71 = ALPHA, f71 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mmi + STFD [C9 ] = f68, SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + mov f65 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 3, AOFFSET +#else + nop __LINE__ +#endif + } + { .mmi + STFD [C9 ] = f69, SIZE + nop __LINE__ +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add BOFFSET = KK8, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + mov f66 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 8, KK +#else + nop __LINE__ +#endif + } + { .mmi + STFD [C9 ] = f70, SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, 5 * SIZE + mov f67 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mmb + STFD [C9 ] = f71, 5 * SIZE + nop __LINE__ + (p6) br.cond.dptk .L132 + } + ;; +#endif + .align 32 + +.L140: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 4, KK +#else + adds L = 1, KK +#endif +#endif + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L150 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFD f48 = [B] + adds BOFFSET = 1 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + add BOFFSET = KK8, B + shladd AOFFSET = KK8, 2, AOFFSET + nop __LINE__ + } + ;; + { .mmi + LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#endif + { .mii + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + adds L = -1, L + nop __LINE__ + } + ;; + { .mmi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + .align 32 + +.L142: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f65 = f33, f48, f65 // A2 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mmf +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + } + ;; + { .mfi + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + (p5) adds C10 = 2 * SIZE, C2 + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f70 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mmf + (p4) LDFD f48 = [BOFFSET], 1 * SIZE +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f69 = [C1 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f71 = [C9 ], -1 * SIZE +#else + nop __LINE__ +#endif + nop.f 0 + br.cloop.sptk.few .L142 + } + ;; + +.L148: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 + FMA f66 = ALPHA, f66, f70 + FMA f65 = ALPHA, f65, f69 + FMA f67 = ALPHA, f67, f71 + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 + adds L = 1, K + } + { .mfb + STFD [C9 ] = f66, SIZE + mov f66 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, 3 * SIZE + mov f65 = f0 + shr L = L, 1 + } + { .mfb + STFD [C9 ] = f67, 3 * SIZE + mov f67 = f0 + nop __LINE__ + } + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f66 = ALPHA, f66 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f65 = ALPHA, f65 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f67 = ALPHA, f67 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 2, AOFFSET +#else + nop __LINE__ +#endif +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add BOFFSET = KK8, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C9 ] = f66, SIZE + mov f66 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, 3 * SIZE + mov f65 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C9 ] = f67, 3 * SIZE + mov f67 = f0 + nop __LINE__ + } + ;; +#endif + .align 32 + +.L150: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 2, KK +#else + adds L = 1, KK +#endif +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L160 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFD f48 = [B] + adds BOFFSET = 1 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + add BOFFSET = KK8, B + shladd AOFFSET = KK8, 1, AOFFSET + nop __LINE__ + } + ;; + { .mmi + LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#endif + { .mii + cmp.eq p3, p0 = r0, r0 + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mii + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + adds L = -1, L + ;; + mov ar.lc = L + } + ;; + .align 32 + +.L152: + { .mfi + cmp.ne p4, p5 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + ;; + { .mfi + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + adds L = -1, L + } + ;; + { .mfb + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + br.cloop.sptk.few .L152 + } + ;; + +.L158: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + LDFD f68 = [C1 ], SIZE + ;; + LDFD f69 = [C1 ], -1 * SIZE + ;; + FMA f64 = ALPHA, f64, f68 + FMA f65 = ALPHA, f65, f69 + ;; + STFD [C1 ] = f64, SIZE + mov f64 = f0 + ;; + { .mfi + STFD [C1 ] = f65, SIZE + mov f65 = f0 + } + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f65 = ALPHA, f65 + nop __LINE__ + } + ;; + { .mii + nop __LINE__ +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 1, AOFFSET +#else + nop __LINE__ +#endif +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add BOFFSET = KK8, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + mov f65 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; +#endif + .align 32 + +.L160: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 1, KK +#else + adds L = 1, KK +#endif +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L169 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFD f48 = [B] + adds BOFFSET = 1 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + add BOFFSET = KK8, B + add AOFFSET = KK8, AOFFSET + nop __LINE__ + } + ;; + { .mmi + LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#endif + ;; + { .mii + LDFD f32 = [AOFFSET], 1 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mii + adds L = -1, L + cmp.eq p3, p0 = r0, r0 + ;; + mov ar.lc = L + } + ;; + .align 32 + +.L162: + { .mmf + cmp.ne p4, p5 = 0, L + (p12) cmp.ne p3, p0 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + } + ;; + { .mmi + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p4) LDFD f32 = [AOFFSET], 1 * SIZE +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1] +#else + nop __LINE__ +#endif + adds L = -1, L + } + { .mfb + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + br.cloop.sptk.few .L162 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 +#else + FMPY f64 = ALPHA, f64 +#endif + ;; + STFD [C1 ] = f64 + ;; + .align 32 + +.L169: + { .mmi + mov B = BOFFSET + mov AOFFSET = A +#if defined(TRMMKERNEL) && !defined(LEFT) + adds KK = 1, KK +#else + nop __LINE__ +#endif + } + ;; + .align 16 + + +.L999: + mov r8 = r0 + adds r9 = 1 * 16, SP + ;; + ldf.fill f16 = [SP], 32 + ldf.fill f17 = [r9], 32 + ;; + ldf.fill f18 = [SP], 32 + ldf.fill f19 = [r9], 32 + ;; + ldf.fill f20 = [SP], 32 + ldf.fill f21 = [r9], 32 + ;; + ldf.fill f22 = [SP], 32 + ldf.fill f23 = [r9], 32 + mov ar.lc = ARLC + ;; + ldf.fill f24 = [SP], 32 + ldf.fill f25 = [r9], 32 + mov pr = PR, -1 + ;; + ldf.fill f26 = [SP], 32 + ldf.fill f27 = [r9], 32 + mov ar.pfs = ARPFS + ;; + ldf.fill f28 = [SP], 32 + ldf.fill f29 = [r9], 32 + ;; + ldf.fill f30 = [SP], 32 + ldf.fill f31 = [r9] + + br.ret.sptk.many b0 + EPILOGUE + diff --git a/kernel/ia64/gemm_ncopy.S b/kernel/ia64/gemm_ncopy.S new file mode 100644 index 0000000..ebb80bf --- /dev/null +++ b/kernel/ia64/gemm_ncopy.S @@ -0,0 +1,493 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE 64 +#define WPREFETCHSIZE 32 + +#ifndef XDOUBLE +#define LD LDF8 +#define ST STF8_NTA +#else +#define LD LDFD +#define ST STFD_NTA +#endif + +#define J r15 +#define PREB r17 +#define PREA r18 + +#define A1 r19 +#define A2 r20 +#define A3 r21 +#define A4 r22 +#define A5 r23 +#define A6 r24 +#define A7 r25 +#define A8 r26 +#define B1 r27 +#define B2 r28 + +#define COUNT r9 +#define I r10 +#define II r11 + +#define ARLC r29 +#define PR r30 + +#define M r32 +#define N r33 +#define A r34 +#define LDA r35 +#define B r36 + + PROLOGUE + .prologue + PROFCODE + + .body + { .mii + shladd LDA = LDA, BASE_SHIFT, r0 + mov PR = pr + shr J = N, 3 + } + ;; + { .mib + cmp.eq p8, p0 = 0, J + mov ARLC = ar.lc + (p8) br.cond.dpnt .L20 + } + ;; + .align 32 + +.L11: + { .mmi + mov A1 = A + add A2 = A, LDA + mov pr.rot = 0 + } + { .mmi + shladd A3 = LDA, 1, A + shladd A5 = LDA, 2, A + adds I = 1, M + } + ;; + { .mmi + shladd A4 = LDA, 1, A2 + shladd A6 = LDA, 2, A2 + mov ar.ec = 6 + } + { .mmi + cmp.eq p16, p0 = r0, r0 + shladd A7 = LDA, 2, A3 + shr I = I, 1 + } + ;; + { .mmi + adds B1 = 8 * SIZE, B + shladd A8 = LDA, 2, A4 + shladd A = LDA, 3, A + } + { .mmi + adds I = -1, I + mov COUNT = 0 + adds J = -1, J + } + ;; + { .mmi + adds PREA = PREFETCHSIZE * SIZE, A + adds PREB = WPREFETCHSIZE * SIZE, B + mov ar.lc = I + } + { .mmi + mov I = M + mov II = M + cmp.ne p14, p0 = r0, r0 + } + ;; + .align 32 + +.L12: + { .mmi + (p21) ST [B ] = f37, 1 * SIZE + (p14) ST [B1] = f49, 1 * SIZE + (p16) cmp.ne.unc p13, p0 = 1, I + } + { .mmi + lfetch.nt1 [PREA], LDA + lfetch.excl.nt1 [PREB] + adds PREB = 16 * SIZE, PREB + } + ;; + { .mmi + (p21) ST [B ] = f43, 1 * SIZE + (p14) ST [B1] = f55, 1 * SIZE + cmp.eq p9, p0 = 8, COUNT + } + { .mmi + (p16) LD f32 = [A1], SIZE + (p16) LD f38 = [A2], SIZE + (p16) adds I = -2, I + } + ;; + { .mmi + (p21) ST [B ] = f61, 1 * SIZE + (p14) ST [B1] = f73, 1 * SIZE + (p9) mov COUNT = 0 + } + { .mmi + (p13) LD f44 = [A1], SIZE + (p13) LD f50 = [A2], SIZE + (p21) adds II = -2, II + } + ;; + { .mmb + (p21) ST [B ] = f67, 1 * SIZE + (p14) ST [B1] = f79, 1 * SIZE + nop __LINE__ + } + { .mmb + (p16) LD f56 = [A3], SIZE + (p16) LD f62 = [A4], SIZE + nop __LINE__ + } + ;; + { .mmi + (p21) ST [B ] = f85, 1 * SIZE + (p14) ST [B1] = f97, 1 * SIZE + (p9) adds PREA = (PREFETCHSIZE - 2)* SIZE, A1 + } + { .mmb + (p13) LD f68 = [A3], SIZE + (p13) LD f74 = [A4], SIZE + nop __LINE__ + } + ;; + { .mmb + (p21) ST [B ] = f91, 1 * SIZE + (p14) ST [B1] = f103, 1 * SIZE + nop __LINE__ + } + { .mmb + (p16) LD f80 = [A5], SIZE + (p16) LD f86 = [A6], SIZE + nop __LINE__ + } + ;; + { .mmb + (p21) ST [B ] = f109, 1 * SIZE + (p14) ST [B1] = f121, 1 * SIZE + nop __LINE__ + } + { .mmb + (p13) LD f92 = [A5], SIZE + (p13) LD f98 = [A6], SIZE + nop __LINE__ + } + ;; + { .mmi + (p21) ST [B ] = f115, 1 * SIZE + (p14) ST [B1] = f127, 9 * SIZE + (p16) adds COUNT = 1, COUNT + } + { .mmb + (p16) LD f104 = [A7], SIZE + (p16) LD f110 = [A8], SIZE + nop __LINE__ + } + ;; + { .mmi + (p13) LD f116 = [A7], SIZE + (p13) LD f122 = [A8], SIZE + (p14) adds B = 8 * SIZE, B + } + { .mmb + (p20) cmp.ne.unc p14, p0 = 1, II + nop __LINE__ + br.ctop.sptk.few .L12 + } + ;; + { .mmb + cmp.ne p6, p0 = 0, J + nop __LINE__ + (p6) br.cond.dptk .L11 + } + ;; + .align 32 + +.L20: + { .mmi + adds I = 1, M + mov A1 = A + mov pr.rot = 0 + } + { .mmi + add A2 = A, LDA + shladd A3 = LDA, 1, A + tbit.z p6, p0 = N, 2 + } + ;; + { .mmi + shladd A4 = LDA, 1, A2 + adds B1 = 4 * SIZE, B + mov ar.ec = 6 + } + { .mib + cmp.eq p16, p0 = r0, r0 + shr I = I, 1 + (p6) br.cond.dpnt .L30 + } + ;; + { .mmi + shladd A = LDA, 2, A + nop __LINE__ + nop __LINE__ + } + { .mmi + adds I = -1, I + mov COUNT = 0 + adds J = -1, J + } + ;; + { .mmi + adds PREA = PREFETCHSIZE * SIZE, A + adds PREB = WPREFETCHSIZE * SIZE, B + mov ar.lc = I + } + { .mmi + mov I = M + mov II = M + cmp.ne p14, p0 = r0, r0 + } + ;; + .align 32 + +.L22: + { .mmi + (p21) ST [B ] = f37, 1 * SIZE + (p14) ST [B1] = f49, 1 * SIZE + (p16) cmp.ne.unc p13, p0 = 1, I + } + { .mmi + lfetch.nt1 [PREA], LDA + lfetch.excl.nt1 [PREB], 8 * SIZE + cmp.eq p9, p0 = 4, COUNT + } + ;; + { .mmi + (p21) ST [B ] = f43, 1 * SIZE + (p14) ST [B1] = f55, 1 * SIZE + (p16) adds I = -2, I + } + { .mmi + (p16) LD f32 = [A1], SIZE + (p16) LD f38 = [A2], SIZE + (p21) adds II = -2, II + } + ;; + { .mmi + (p21) ST [B ] = f61, 1 * SIZE + (p14) ST [B1] = f73, 1 * SIZE + (p9) mov COUNT = 0 + } + { .mmi + (p13) LD f44 = [A1], SIZE + (p13) LD f50 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p21) ST [B ] = f67, 1 * SIZE + (p14) ST [B1] = f79, 5 * SIZE + (p9) adds PREA = PREFETCHSIZE * SIZE, A1 + } + { .mmb + (p16) LD f56 = [A3], SIZE + (p16) LD f62 = [A4], SIZE + nop __LINE__ + } + ;; + { .mmi + (p13) LD f68 = [A3], SIZE + (p13) LD f74 = [A4], SIZE + (p16) adds COUNT = 1, COUNT + } + { .mmb + (p14) adds B = 4 * SIZE, B + (p20) cmp.ne.unc p14, p0 = 1, II + br.ctop.sptk.few .L22 + } + ;; + .align 32 + +.L30: + { .mmi + adds I = 1, M + mov A1 = A + mov pr.rot = 0 + } + { .mmi + add A2 = A, LDA + adds B1 = 2 * SIZE, B + tbit.z p6, p0 = N, 1 + } + ;; + { .mmi + nop __LINE__ + nop __LINE__ + mov ar.ec = 6 + } + { .mib + cmp.eq p16, p0 = r0, r0 + shr I = I, 1 + (p6) br.cond.dpnt .L40 + } + ;; + { .mmi + adds I = -1, I + ;; + shladd A = LDA, 1, A + mov ar.lc = I + } + { .mmi + mov I = M + mov II = M + cmp.ne p14, p0 = r0, r0 + } + ;; + .align 32 + +.L32: + { .mmi + (p21) ST [B ] = f37, 1 * SIZE + (p14) ST [B1] = f49, 1 * SIZE + (p16) cmp.ne.unc p13, p0 = 1, I + } + { .mmi + nop __LINE__ + nop __LINE__ + (p21) adds II = -2, II + } + ;; + { .mmi + (p21) ST [B ] = f43, 1 * SIZE + (p14) ST [B1] = f55, 3 * SIZE + nop __LINE__ + } + { .mmi + (p16) LD f32 = [A1], SIZE + (p16) LD f38 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p13) LD f44 = [A1], SIZE + (p13) LD f50 = [A2], SIZE + (p16) adds I = -2, I + } + { .mmb + (p14) adds B = 2 * SIZE, B + (p20) cmp.ne.unc p14, p0 = 1, II + br.ctop.sptk.few .L32 + } + ;; + .align 32 + +.L40: + { .mmi + adds I = 1, M + mov A1 = A + mov pr.rot = 0 + } + { .mmi + tbit.z p6, p0 = N, 0 + } + ;; + { .mmi + nop __LINE__ + nop __LINE__ + mov ar.ec = 6 + } + { .mib + cmp.eq p16, p0 = r0, r0 + shr I = I, 1 + (p6) br.cond.dpnt .L999 + } + ;; + { .mmi + adds I = -1, I + ;; + mov ar.lc = I + } + { .mmi + mov I = M + mov II = M + cmp.ne p14, p0 = r0, r0 + } + ;; + .align 32 + +.L42: + { .mmi + (p21) ST [B ] = f37, 1 * SIZE + (p16) cmp.ne.unc p13, p0 = 1, I + (p21) adds II = -2, II + } + ;; + { .mmi + (p14) ST [B ] = f49, 1 * SIZE + (p16) LD f32 = [A1], SIZE + (p16) adds I = -2, I + } + ;; + { .mmb + (p13) LD f44 = [A1], SIZE + (p20) cmp.ne.unc p14, p0 = 1, II + br.ctop.sptk.few .L42 + } + ;; + .align 32 + +.L999: + mov pr = PR, -1 + mov ar.lc = ARLC + br.ret.sptk.many b0 + EPILOGUE + diff --git a/kernel/ia64/gemm_tcopy.S b/kernel/ia64/gemm_tcopy.S new file mode 100644 index 0000000..44555fa --- /dev/null +++ b/kernel/ia64/gemm_tcopy.S @@ -0,0 +1,1695 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE 24 +#define WPREFETCHSIZE 32 + +#ifndef XDOUBLE +#define LD LDFD +#define ST STFD_NTA +#else +#define LD LDFD +#define ST STFD_NTA +#endif + +#define PREA r2 +#define PREB r3 + +#define A1 r14 +#define A2 r15 +#define B1 r16 +#define B2 r17 +#define I r18 +#define J r19 + +#define BO2 r20 +#define BO3 r21 +#define BO4 r22 + +#define LDB r23 +#define II r24 +#define TEMP1 r25 +#define TEMP2 r26 +#define TEMP3 r27 +#define LCOUNT r28 +#define SCOUNT r29 + +#define ARLC r30 +#define PR r31 + +#define MLDA8 r8 + +#define M r32 +#define N r33 +#define A r34 +#define LDA r35 +#define B r36 + + PROLOGUE + .prologue + PROFCODE + + .body + { .mmi + setf.sig f32 = M + and r8 = -8, N + mov ARLC = ar.lc + } + ;; + { .mmi + setf.sig f33 = r8 + and r9 = -4, N + mov PR = pr + } + ;; + { .mmi + setf.sig f34 = r9 + and r10 = -2, N + shladd LDA = LDA, BASE_SHIFT, r0 + } + ;; + { .mmi + setf.sig f35 = r10 + shladd MLDA8 = LDA, 3, r0 + shl LDB = M, BASE_SHIFT + 3 + } + ;; + { .mfi + sub MLDA8 = r0, MLDA8 + xmpy.l f33 = f32, f33 + shr J = M, 3 + } + { .mfi + xmpy.l f34 = f32, f34 + } + ;; + { .mmf + getf.sig BO2 = f33 + adds MLDA8 = 16 * SIZE, MLDA8 + xmpy.l f35 = f32, f35 + } + ;; + { .mmi + getf.sig BO3 = f34 + getf.sig BO4 = f35 + nop __LINE__ + } + ;; + { .mmi + shladd BO2 = BO2, BASE_SHIFT, B + shladd BO3 = BO3, BASE_SHIFT, B + shladd BO4 = BO4, BASE_SHIFT, B + } + { .mib + cmp.eq p6, p0 = 0, J + nop __LINE__ + (p6) br.cond.dpnt .L100 + } + ;; + .align 32 + +.L11: + { .mmi + add I = 8, N + mov A1 = A + mov pr.rot = 0 + } + { .mmi + adds A2 = 4 * SIZE, A + shladd A = LDA, 3, A + shr II = N, 3 + } + ;; + { .mmi + mov B1 = B + cmp.eq p16, p0 = r0, r0 + mov ar.ec = 3 + } + { .mmi + adds B2 = 4 * SIZE, B + adds B = 64 * SIZE, B + shr I = I, 4 + } + ;; + { .mmi + cmp.eq p8, p0 = 0, I + shladd I = I, 2, r0 + nop __LINE__ + } + ;; + { .mmi + mov LCOUNT = 0 + mov SCOUNT = 0 + adds I = -1, I + } + ;; + { .mmi + adds PREA = PREFETCHSIZE * SIZE, A1 + adds PREB = WPREFETCHSIZE * SIZE, B1 + mov ar.lc = I + } + { .mib + adds J = -1, J + mov I = II + (p8) br.cond.dpnt .L20 + } + ;; + .align 32 + +.L12: + { .mmi + (p18) ST [B1] = f34, 1 * SIZE + (p18) ST [B2] = f46, 1 * SIZE + (p18) cmp.ne.unc p13, p0 = 1, II + } + { .mmi + (p16) lfetch.nt1 [PREA], LDA + (p16) lfetch.excl.nt1 [PREB], LDB + (p16) cmp.ne.unc p12, p0 = 1, I + } + ;; + { .mmi + (p18) ST [B1] = f37, 1 * SIZE + (p18) ST [B2] = f49, 1 * SIZE + (p18) adds SCOUNT = 1, SCOUNT + } + { .mmi + (p16) LD f32 = [A1], SIZE + (p16) LD f44 = [A2], SIZE + (p16) adds LCOUNT = 1, LCOUNT + } + ;; + { .mmi + (p18) ST [B1] = f40, 1 * SIZE + (p18) ST [B2] = f52, 1 * SIZE + (p16) cmp.eq.unc p14, p0 = 4, LCOUNT + } + { .mmi + (p16) LD f35 = [A1], SIZE + (p16) LD f47 = [A2], SIZE + adds TEMP1 = -3 * SIZE, LDA + } + ;; + { .mmi + (p18) ST [B1] = f43, 5 * SIZE + (p18) ST [B2] = f55, 5 * SIZE + (p18) cmp.eq.unc p15, p0 = 4, SCOUNT + } + { .mmi + (p16) LD f38 = [A1], SIZE + (p16) LD f50 = [A2], SIZE + (p12) mov TEMP1 = 5 * SIZE + } + ;; + { .mmi + (p18) ST [B1] = f82, 1 * SIZE + (p18) ST [B2] = f94, 1 * SIZE + } + { .mmi + (p16) LD f41 = [A1], TEMP1 + (p16) LD f53 = [A2], TEMP1 + } + ;; + { .mmi + (p18) ST [B1] = f85, 1 * SIZE + (p18) ST [B2] = f97, 1 * SIZE + mov TEMP2 = 5 * SIZE + } + { .mmi + (p12) LD f56 = [A1], SIZE + (p12) LD f68 = [A2], SIZE + shladd TEMP3 = LDA, 3, r0 + } + ;; + { .mmi + (p18) ST [B1] = f88, 1 * SIZE + (p18) ST [B2] = f100, 1 * SIZE + (p13) adds TEMP2 = - 11 * SIZE, LDB + } + { .mmi + (p12) LD f59 = [A1], SIZE + (p12) LD f71 = [A2], SIZE + (p12) adds TEMP1 = - 11 * SIZE, LDA + } + ;; + { .mmi + (p18) ST [B1] = f91 + (p18) ST [B2] = f103 + (p18) add B1 = B1, TEMP2 + } + { .mmi + (p12) LD f62 = [A1], SIZE + (p12) LD f74 = [A2], SIZE + (p18) add B2 = B2, TEMP2 + } + ;; + { .mmi + (p13) ST [B1] = f58, 1 * SIZE + (p13) ST [B2] = f70, 1 * SIZE + } + { .mmi + (p12) LD f65 = [A1], TEMP1 + (p12) LD f77 = [A2], TEMP1 + sub TEMP3 = LDA, TEMP3 + } + ;; + { .mmi + (p13) ST [B1] = f61, 1 * SIZE + (p13) ST [B2] = f73, 1 * SIZE + } + { .mmi + (p16) lfetch.nt1 [PREA], LDA + (p16) lfetch.excl.nt1 [PREB] + adds TEMP3 = 5 * SIZE, TEMP3 + } + ;; + { .mmi + (p13) ST [B1] = f64, 1 * SIZE + (p13) ST [B2] = f76, 1 * SIZE + } + { .mmi + (p16) LD f80 = [A1], SIZE + (p16) LD f92 = [A2], SIZE + adds TEMP1 = -3 * SIZE, LDA + } + ;; + { .mmi + (p13) ST [B1] = f67, 5 * SIZE + (p13) ST [B2] = f79, 5 * SIZE + } + { .mmi + (p16) LD f83 = [A1], SIZE + (p16) LD f95 = [A2], SIZE + (p14) mov TEMP1 = TEMP3 + } + ;; + { .mmi + (p13) ST [B1] = f106, 1 * SIZE + (p13) ST [B2] = f118, 1 * SIZE + mov TEMP2 = 5 * SIZE + } + { .mmi + (p16) LD f86 = [A1], SIZE + (p16) LD f98 = [A2], SIZE + (p12) mov TEMP1 = 5 * SIZE + } + ;; + { .mmi + (p13) ST [B1] = f109, 1 * SIZE + (p13) ST [B2] = f121, 1 * SIZE + sub TEMP2 = TEMP2, LDB + } + { .mmi + (p16) LD f89 = [A1], TEMP1 + (p16) LD f101 = [A2], TEMP1 + } + ;; + { .mmi + (p13) ST [B1] = f112, 1 * SIZE + (p13) ST [B2] = f124, 1 * SIZE + (p15) adds TEMP2 = -59 * SIZE, LDB + } + { .mmi + (p12) LD f104 = [A1], SIZE + (p12) LD f116 = [A2], SIZE + (p14) add PREA = PREA, MLDA8 + } + ;; + { .mmi + (p13) ST [B1] = f115 + (p13) ST [B2] = f127 + (p13) add B1 = B1, TEMP2 + } + { .mmi + (p12) LD f107 = [A1], SIZE + (p12) LD f119 = [A2], SIZE + adds TEMP1 = -11 * SIZE, LDA + } + ;; + { .mmi + (p12) LD f110 = [A1], SIZE + (p12) LD f122 = [A2], SIZE + (p14) mov TEMP1 = TEMP3 + } + { .mmi + (p14) mov LCOUNT = 0 + (p15) mov SCOUNT = 0 + adds PREB = WPREFETCHSIZE * SIZE, B1 + } + ;; + { .mmi + (p12) LD f113 = [A1], TEMP1 + (p12) LD f125 = [A2], TEMP1 + (p13) add B2 = B2, TEMP2 + } + { .mib + (p14) adds I = -2, I + (p15) adds II = -2, II + br.ctop.sptk .L12 + } + ;; + .align 32 + +.L20: + { .mmi + add A2 = A1, LDA + and TEMP3 = 7, N + tbit.nz p7, p0 = N, 2 + } + ;; + { .mmi + (p7) LD f32 = [A1], SIZE + (p7) LD f36 = [A2], SIZE + cmp.eq p6, p0 = 0, TEMP3 + } + ;; + { .mmi + (p7) LD f33 = [A1], SIZE + (p7) LD f37 = [A2], SIZE + adds TEMP1 = -3 * SIZE, LDA + } + ;; + { .mmi + (p7) LD f34 = [A1], SIZE + (p7) LD f38 = [A2], SIZE + add TEMP1 = TEMP1, LDA + } + ;; + { .mmi + (p7) LD f35 = [A1], TEMP1 + (p7) LD f39 = [A2], TEMP1 + (p6) cmp.ne.unc p10, p0 = 0, J + } + ;; + { .mmb + (p7) LD f40 = [A1], SIZE + (p7) LD f44 = [A2], SIZE + (p10) br.cond.dptk .L11 + } + ;; + { .mmi + (p7) LD f41 = [A1], SIZE + (p7) LD f45 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) LD f42 = [A1], SIZE + (p7) LD f46 = [A2], SIZE + tbit.nz p8, p0 = N, 1 + } + ;; + { .mmi + (p7) LD f43 = [A1], TEMP1 + (p7) LD f47 = [A2], TEMP1 + adds B2 = 4 * SIZE, BO2 + } + ;; + { .mmi + (p7) ST [BO2] = f32, 1 * SIZE + (p7) ST [B2 ] = f36, 1 * SIZE + tbit.nz p9, p0 = N, 0 + } + { .mmi + (p7) LD f48 = [A1], SIZE + (p7) LD f52 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f33, 1 * SIZE + (p7) ST [B2 ] = f37, 1 * SIZE + nop __LINE__ + } + { .mmi + (p7) LD f49 = [A1], SIZE + (p7) LD f53 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f34, 1 * SIZE + (p7) ST [B2 ] = f38, 1 * SIZE + nop __LINE__ + } + { .mmi + (p7) LD f50 = [A1], SIZE + (p7) LD f54 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f35, 5 * SIZE + (p7) ST [B2 ] = f39, 5 * SIZE + nop __LINE__ + } + { .mmi + (p7) LD f51 = [A1], TEMP1 + (p7) LD f55 = [A2], TEMP1 + mov TEMP1 = -1 * SIZE + } + ;; + { .mmi + (p7) ST [BO2] = f40, 1 * SIZE + (p7) ST [B2 ] = f44, 1 * SIZE + nop __LINE__ + } + { .mmi + (p7) LD f56 = [A1], SIZE + (p7) LD f60 = [A2], SIZE + shladd TEMP1 = LDA, 3, TEMP1 + } + ;; + { .mmi + (p7) ST [BO2] = f41, 1 * SIZE + (p7) ST [B2 ] = f45, 1 * SIZE + nop __LINE__ + } + { .mmi + (p7) LD f57 = [A1], SIZE + (p7) LD f61 = [A2], SIZE + sub TEMP1 = 0, TEMP1 + } + ;; + { .mmi + (p7) ST [BO2] = f42, 1 * SIZE + (p7) ST [B2 ] = f46, 1 * SIZE + nop __LINE__ + } + { .mmi + (p7) LD f58 = [A1], SIZE + (p7) LD f62 = [A2], SIZE + shladd TEMP1 = LDA, 1, TEMP1 + } + ;; + { .mmi + (p7) ST [BO2] = f43, 5 * SIZE + (p7) ST [B2 ] = f47, 5 * SIZE + nop __LINE__ + } + { .mmi + (p7) LD f59 = [A1], TEMP1 + (p7) LD f63 = [A2], TEMP1 + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f48, 1 * SIZE + (p7) ST [B2 ] = f52, 1 * SIZE + nop __LINE__ + } + { .mmi + add A2 = A1, LDA + adds TEMP1 = -1 * SIZE, LDA + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f49, 1 * SIZE + (p7) ST [B2 ] = f53, 1 * SIZE + nop __LINE__ + } + { .mmi + (p8) LD f64 = [A1], SIZE + (p8) LD f66 = [A2], SIZE + add TEMP1 = TEMP1, LDA + } + ;; + { .mmi + (p7) ST [BO2] = f50, 1 * SIZE + (p7) ST [B2 ] = f54, 1 * SIZE + nop __LINE__ + } + { .mmi + (p8) LD f65 = [A1], TEMP1 + (p8) LD f67 = [A2], TEMP1 + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f51, 5 * SIZE + (p7) ST [B2 ] = f55, 5 * SIZE + nop __LINE__ + } + { .mmi + (p8) LD f68 = [A1], SIZE + (p8) LD f70 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f56, 1 * SIZE + (p7) ST [B2 ] = f60, 1 * SIZE + nop __LINE__ + } + { .mmi + (p8) LD f69 = [A1], TEMP1 + (p8) LD f71 = [A2], TEMP1 + mov TEMP3 = -1 * SIZE + } + ;; + { .mmi + (p7) ST [BO2] = f57, 1 * SIZE + (p7) ST [B2 ] = f61, 1 * SIZE + nop __LINE__ + } + { .mmi + (p8) LD f72 = [A1], SIZE + (p8) LD f74 = [A2], SIZE + shladd TEMP3 = LDA, 3, TEMP3 + } + ;; + { .mmi + (p7) ST [BO2] = f58, 1 * SIZE + (p7) ST [B2 ] = f62, 1 * SIZE + nop __LINE__ + } + { .mmi + (p8) LD f73 = [A1], TEMP1 + (p8) LD f75 = [A2], TEMP1 + sub TEMP3 = 0, TEMP3 + } + ;; + { .mmi + (p7) ST [BO2] = f59, 5 * SIZE + (p7) ST [B2 ] = f63 + adds B2 = 4 * SIZE, BO3 + } + { .mmi + (p8) LD f76 = [A1], SIZE + (p8) LD f78 = [A2], SIZE + shladd TEMP3 = LDA, 1, TEMP3 + } + ;; + { .mmi + (p8) ST [BO3] = f64, 1 * SIZE + (p8) ST [B2 ] = f68, 1 * SIZE + nop __LINE__ + } + { .mmi + (p8) LD f77 = [A1], TEMP3 + (p8) LD f79 = [A2], TEMP3 + nop __LINE__ + } + ;; + { .mmi + (p8) ST [BO3] = f65, 1 * SIZE + (p8) ST [B2 ] = f69, 1 * SIZE + nop __LINE__ + } + { .mmi + add A2 = A1, LDA + shladd TEMP3 = LDA, 1, r0 + nop __LINE__ + } + ;; + { .mmi + (p8) ST [BO3] = f66, 1 * SIZE + (p8) ST [B2 ] = f70, 1 * SIZE + nop __LINE__ + } + { .mmi + (p9) LD f80 = [A1], TEMP3 + (p9) LD f81 = [A2], TEMP3 + nop __LINE__ + } + ;; + { .mmi + (p8) ST [BO3] = f67, 5 * SIZE + (p8) ST [B2 ] = f71, 5 * SIZE + nop __LINE__ + } + { .mmi + (p9) LD f82 = [A1], TEMP3 + (p9) LD f83 = [A2], TEMP3 + nop __LINE__ + } + ;; + { .mmi + (p8) ST [BO3] = f72, 1 * SIZE + (p8) ST [B2 ] = f76, 1 * SIZE + nop __LINE__ + } + { .mmi + (p9) LD f84 = [A1], TEMP3 + (p9) LD f85 = [A2], TEMP3 + nop __LINE__ + } + ;; + { .mmi + (p8) ST [BO3] = f73, 1 * SIZE + (p8) ST [B2 ] = f77, 1 * SIZE + nop __LINE__ + } + { .mmi + (p9) LD f86 = [A1] + (p9) LD f87 = [A2] + nop __LINE__ + } + ;; + { .mmi + (p8) ST [BO3] = f74, 1 * SIZE + (p8) ST [B2 ] = f78, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p8) ST [BO3] = f75, 5 * SIZE + (p8) ST [B2 ] = f79 + adds B2 = 4 * SIZE, BO4 + } + ;; + { .mmi + (p9) ST [BO4] = f80, 1 * SIZE + (p9) ST [B2 ] = f84, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p9) ST [BO4] = f81, 1 * SIZE + (p9) ST [B2 ] = f85, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p9) ST [BO4] = f82, 1 * SIZE + (p9) ST [B2 ] = f86, 1 * SIZE + cmp.ne p8, p0 = 0, J + } + ;; + { .mmb + (p9) ST [BO4] = f83, 5 * SIZE + (p9) ST [B2 ] = f87, 5 * SIZE + (p8) br.cond.dptk .L11 + } + ;; + .align 32 + +.L100: + { .mmi + mov A1 = A + add I = 8, N + mov pr.rot = 0 + } + { .mmi + adds A2 = 4 * SIZE, A + tbit.z p6, p0 = M, 2 + } + ;; + { .mmi + mov B1 = B + adds B2 = 4 * SIZE, B + mov ar.ec = 3 + } + { .mib + cmp.eq p16, p0 = r0, r0 + shr I = I, 4 + (p6) br.cond.dpnt .L200 + } + ;; + { .mmi + cmp.eq p8, p0 = 0, I + shladd I = I, 1, r0 + shladd A = LDA, 2, A + } + ;; + { .mmi + adds B = 32 * SIZE, B + adds I = -1, I + shr II = N, 3 + } + ;; + { .mmi + mov LCOUNT = 0 + mov SCOUNT = 0 + mov ar.lc = I + } + { .mib + nop __LINE__ + mov I = II + (p8) br.cond.dpnt .L120 + } + ;; + .align 32 + +.L112: + { .mmi + (p18) ST [B1] = f34, 1 * SIZE + (p18) ST [B2] = f46, 1 * SIZE + (p16) cmp.ne.unc p12, p0 = 1, I + } + { .mmi + (p16) LD f32 = [A1], SIZE + (p16) LD f44 = [A2], SIZE + (p18) cmp.ne.unc p13, p0 = 1, II + } + ;; + { .mmi + (p18) ST [B1] = f37, 1 * SIZE + (p18) ST [B2] = f49, 1 * SIZE + nop __LINE__ + } + { .mmi + (p16) LD f35 = [A1], SIZE + (p16) LD f47 = [A2], SIZE + adds TEMP1 = -3 * SIZE, LDA + } + ;; + { .mmi + (p18) ST [B1] = f40, 1 * SIZE + (p18) ST [B2] = f52, 1 * SIZE + shladd TEMP3 = LDA, 2, r0 + } + { .mmi + (p16) LD f38 = [A1], SIZE + (p16) LD f50 = [A2], SIZE + (p12) mov TEMP1 = 5 * SIZE + } + ;; + { .mmi + (p18) ST [B1] = f43, 5 * SIZE + (p18) ST [B2] = f55, 5 * SIZE + (p16) adds LCOUNT = 1, LCOUNT + } + { .mmi + (p16) LD f41 = [A1], TEMP1 + (p16) LD f53 = [A2], TEMP1 + (p18) adds SCOUNT = 1, SCOUNT + } + ;; + { .mmi + (p18) ST [B1] = f82, 1 * SIZE + (p18) ST [B2] = f94, 1 * SIZE + (p16) cmp.eq.unc p14, p0 = 2, LCOUNT + } + { .mmi + (p12) LD f56 = [A1], SIZE + (p12) LD f68 = [A2], SIZE + (p18) cmp.eq.unc p15, p0 = 2, SCOUNT + } + ;; + { .mmi + (p18) ST [B1] = f85, 1 * SIZE + (p18) ST [B2] = f97, 1 * SIZE + mov TEMP2 = 5 * SIZE + } + { .mmi + (p12) LD f59 = [A1], SIZE + (p12) LD f71 = [A2], SIZE + sub TEMP3 = LDA, TEMP3 + } + ;; + { .mmi + (p18) ST [B1] = f88, 1 * SIZE + (p18) ST [B2] = f100, 1 * SIZE + (p13) adds TEMP2 = - 11 * SIZE, LDB + } + { .mmi + (p12) LD f62 = [A1], SIZE + (p12) LD f74 = [A2], SIZE + (p12) adds TEMP1 = - 11 * SIZE, LDA + } + ;; + { .mmi + (p18) ST [B1] = f91 + (p18) ST [B2] = f103 + (p18) add B1 = B1, TEMP2 + } + { .mmi + (p12) LD f65 = [A1], TEMP1 + (p12) LD f77 = [A2], TEMP1 + (p18) add B2 = B2, TEMP2 + } + ;; + { .mmi + (p13) ST [B1] = f58, 1 * SIZE + (p13) ST [B2] = f70, 1 * SIZE + adds TEMP3 = 5 * SIZE, TEMP3 + } + { .mmi + (p16) LD f80 = [A1], SIZE + (p16) LD f92 = [A2], SIZE + adds TEMP1 = -3 * SIZE, LDA + } + ;; + { .mmi + (p13) ST [B1] = f61, 1 * SIZE + (p13) ST [B2] = f73, 1 * SIZE + nop __LINE__ + } + { .mmi + (p16) LD f83 = [A1], SIZE + (p16) LD f95 = [A2], SIZE + (p14) mov TEMP1 = TEMP3 + } + ;; + { .mmi + (p13) ST [B1] = f64, 1 * SIZE + (p13) ST [B2] = f76, 1 * SIZE + nop __LINE__ + } + { .mmi + (p16) LD f86 = [A1], SIZE + (p16) LD f98 = [A2], SIZE + (p12) mov TEMP1 = 5 * SIZE + } + ;; + { .mmi + (p13) ST [B1] = f67, 5 * SIZE + (p13) ST [B2] = f79, 5 * SIZE + (p14) mov LCOUNT = 0 + } + { .mmi + (p16) LD f89 = [A1], TEMP1 + (p16) LD f101 = [A2], TEMP1 + (p15) mov SCOUNT = 0 + } + ;; + { .mmi + (p13) ST [B1] = f106, 1 * SIZE + (p13) ST [B2] = f118, 1 * SIZE + mov TEMP2 = 5 * SIZE + } + { .mmi + (p12) LD f104 = [A1], SIZE + (p12) LD f116 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p13) ST [B1] = f109, 1 * SIZE + (p13) ST [B2] = f121, 1 * SIZE + sub TEMP2 = TEMP2, LDB + } + { .mmi + (p12) LD f107 = [A1], SIZE + (p12) LD f119 = [A2], SIZE + adds TEMP1 = -11 * SIZE, LDA + } + ;; + { .mmi + (p13) ST [B1] = f112, 1 * SIZE + (p13) ST [B2] = f124, 1 * SIZE + (p15) adds TEMP2 = -27 * SIZE, LDB + } + { .mmi + (p12) LD f110 = [A1], SIZE + (p12) LD f122 = [A2], SIZE + (p14) mov TEMP1 = TEMP3 + } + ;; + { .mmi + (p13) ST [B1] = f115 + (p13) ST [B2] = f127 + (p13) add B1 = B1, TEMP2 + } + { .mmi + (p12) LD f113 = [A1], TEMP1 + (p12) LD f125 = [A2], TEMP1 + (p13) add B2 = B2, TEMP2 + } + ;; + { .mmb + (p14) adds I = -2, I + (p15) adds II = -2, II + br.ctop.sptk .L112 + } + ;; + .align 32 + +.L120: + { .mmi + add A2 = A1, LDA + nop __LINE__ + tbit.nz p7, p0 = N, 2 + } + ;; + { .mmi + (p7) LD f32 = [A1], SIZE + (p7) LD f36 = [A2], SIZE + tbit.nz p8, p0 = N, 1 + } + ;; + { .mmi + (p7) LD f33 = [A1], SIZE + (p7) LD f37 = [A2], SIZE + adds TEMP1 = -3 * SIZE, LDA + } + ;; + { .mmi + (p7) LD f34 = [A1], SIZE + (p7) LD f38 = [A2], SIZE + add TEMP1 = TEMP1, LDA + } + ;; + { .mmi + (p7) LD f35 = [A1], TEMP1 + (p7) LD f39 = [A2], TEMP1 + tbit.nz p9, p0 = N, 0 + } + ;; + { .mmi + (p7) LD f40 = [A1], SIZE + (p7) LD f44 = [A2], SIZE + mov TEMP2 = -1 * SIZE + } + ;; + { .mmi + (p7) LD f41 = [A1], SIZE + (p7) LD f45 = [A2], SIZE + shladd TEMP2 = LDA, 1, TEMP2 + } + ;; + { .mmi + (p7) LD f42 = [A1], SIZE + (p7) LD f46 = [A2], SIZE + sub TEMP2 = 0, TEMP2 + } + ;; + { .mmi + (p7) LD f43 = [A1], TEMP2 + (p7) LD f47 = [A2] + nop __LINE__ + } + ;; + { .mmi + add A2 = A1, LDA + adds TEMP1 = -1 * SIZE, LDA + mov TEMP2 = -1 * SIZE + } + ;; + { .mmi + (p8) LD f48 = [A1], SIZE + (p8) LD f50 = [A2], SIZE + add TEMP1 = TEMP1, LDA + } + ;; + { .mmi + (p8) LD f49 = [A1], TEMP1 + (p8) LD f51 = [A2], TEMP1 + shladd TEMP2 = LDA, 1, TEMP2 + } + ;; + { .mmi + (p8) LD f52 = [A1], SIZE + (p8) LD f54 = [A2], SIZE + sub TEMP2 = r0, TEMP2 + } + ;; + { .mmi + (p8) LD f53 = [A1], TEMP2 + (p8) LD f55 = [A2], TEMP2 + nop __LINE__ + } + ;; + { .mmi + add A2 = A1, LDA + adds B2 = 4 * SIZE, BO2 + nop __LINE__ + } + ;; + { .mmi + (p9) LD f56 = [A1] + nop __LINE__ + (p9) shladd A1 = LDA, 1, A1 + } + { .mmi + (p9) LD f57 = [A2] + nop __LINE__ + (p9) shladd A2 = LDA, 1, A2 + } + ;; + { .mmi + (p7) ST [BO2] = f32, 1 * SIZE + (p7) ST [B2 ] = f36, 1 * SIZE + nop __LINE__ + } + { .mmi + (p9) LD f58 = [A1] + (p9) LD f59 = [A2] + nop __LINE__ + } + ;; + ;; + { .mmi + (p7) ST [BO2] = f33, 1 * SIZE + (p7) ST [B2 ] = f37, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f34, 1 * SIZE + (p7) ST [B2 ] = f38, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f35, 5 * SIZE + (p7) ST [B2 ] = f39, 5 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f40, 1 * SIZE + (p7) ST [B2 ] = f44, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f41, 1 * SIZE + (p7) ST [B2 ] = f45, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f42, 1 * SIZE + (p7) ST [B2 ] = f46, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f43, 5 * SIZE + (p7) ST [B2 ] = f47 + adds B2 = 4 * SIZE, BO3 + } + ;; + { .mmi + (p8) ST [BO3] = f48, 1 * SIZE + (p8) ST [B2 ] = f52, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p8) ST [BO3] = f49, 1 * SIZE + (p8) ST [B2 ] = f53, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p8) ST [BO3] = f50, 1 * SIZE + (p8) ST [B2 ] = f54, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p8) ST [BO3] = f51, 5 * SIZE + (p8) ST [B2 ] = f55 + adds B2 = 2 * SIZE, BO4 + } + ;; + { .mmi + (p9) ST [BO4] = f56, 1 * SIZE + (p9) ST [B2 ] = f58, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p9) ST [BO4] = f57, 3 * SIZE + (p9) ST [B2 ] = f59 + nop __LINE__ + } + ;; + .align 32 + +.L200: + { .mmi + add I = 8, N + mov A1 = A + mov pr.rot = 0 + } + { .mmi + adds A2 = 4 * SIZE, A + nop __LINE__ + tbit.z p6, p0 = M, 1 + } + ;; + { .mmi + mov B1 = B + cmp.eq p16, p0 = r0, r0 + mov ar.ec = 3 + } + { .mib + adds B2 = 4 * SIZE, B + shr I = I, 4 + (p6) br.cond.dpnt .L300 + } + ;; + { .mmi + shladd A = LDA, 1, A + adds B = 16 * SIZE, B + shr II = N, 3 + } + { .mmi + cmp.eq p8, p0 = 0, I + adds I = -1, I + nop __LINE__ + } + ;; + { .mmi + nop __LINE__ + nop __LINE__ + mov ar.lc = I + } + { .mib + mov I = II + nop __LINE__ + (p8) br.cond.dpnt .L220 + } + ;; + .align 32 + +.L212: + { .mmi + (p18) ST [B1] = f34, 1 * SIZE + (p18) ST [B2] = f46, 1 * SIZE + (p16) cmp.ne.unc p12, p0 = 1, I + } + { .mmi + (p16) LD f32 = [A1], SIZE + (p16) LD f44 = [A2], SIZE + (p18) cmp.ne.unc p13, p0 = 1, II + } + ;; + { .mmi + (p18) ST [B1] = f37, 1 * SIZE + (p18) ST [B2] = f49, 1 * SIZE + adds TEMP1 = -3 * SIZE, LDA + } + { .mmi + (p16) LD f35 = [A1], SIZE + (p16) LD f47 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f40, 1 * SIZE + (p18) ST [B2] = f52, 1 * SIZE + (p12) mov TEMP1 = 5 * SIZE + } + { .mmi + (p16) LD f38 = [A1], SIZE + (p16) LD f50 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f43, 5 * SIZE + (p18) ST [B2] = f55, 5 * SIZE + nop __LINE__ + } + { .mmi + (p16) LD f41 = [A1], TEMP1 + (p16) LD f53 = [A2], TEMP1 + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f82, 1 * SIZE + (p18) ST [B2] = f94, 1 * SIZE + nop __LINE__ + } + { .mmi + (p12) LD f56 = [A1], SIZE + (p12) LD f68 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f85, 1 * SIZE + (p18) ST [B2] = f97, 1 * SIZE + mov TEMP2 = 5 * SIZE + } + { .mmi + (p12) LD f59 = [A1], SIZE + (p12) LD f71 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f88, 1 * SIZE + (p18) ST [B2] = f100, 1 * SIZE + (p13) adds TEMP2 = - 11 * SIZE, LDB + } + { .mmi + (p12) LD f62 = [A1], SIZE + (p12) LD f74 = [A2], SIZE + (p12) adds TEMP1 = - 11 * SIZE, LDA + } + ;; + { .mmi + (p18) ST [B1] = f91 + (p18) ST [B2] = f103 + (p18) add B1 = B1, TEMP2 + } + { .mmi + (p12) LD f65 = [A1], TEMP1 + (p12) LD f77 = [A2], TEMP1 + (p18) add B2 = B2, TEMP2 + } + ;; + { .mmi + (p13) ST [B1] = f58, 1 * SIZE + (p13) ST [B2] = f70, 1 * SIZE + nop __LINE__ + } + { .mmi + (p16) LD f80 = [A1], SIZE + (p16) LD f92 = [A2], SIZE + sub TEMP1 = r0, LDA + } + ;; + { .mmi + (p13) ST [B1] = f61, 1 * SIZE + (p13) ST [B2] = f73, 1 * SIZE + nop __LINE__ + } + { .mmi + (p16) LD f83 = [A1], SIZE + (p16) LD f95 = [A2], SIZE + (p16) adds TEMP1 = 5 * SIZE, TEMP1 + } + ;; + { .mmi + (p13) ST [B1] = f64, 1 * SIZE + (p13) ST [B2] = f76, 1 * SIZE + nop __LINE__ + } + { .mmi + (p16) LD f86 = [A1], SIZE + (p16) LD f98 = [A2], SIZE + (p12) mov TEMP1 = 5 * SIZE + } + ;; + { .mmi + (p13) ST [B1] = f67, 5 * SIZE + (p13) ST [B2] = f79, 5 * SIZE + nop __LINE__ + } + { .mmi + (p16) LD f89 = [A1], TEMP1 + (p16) LD f101 = [A2], TEMP1 + adds TEMP1 = -11 * SIZE, LDA + } + ;; + { .mmi + (p13) ST [B1] = f106, 1 * SIZE + (p13) ST [B2] = f118, 1 * SIZE + mov TEMP2 = 5 * SIZE + } + { .mmi + (p12) LD f104 = [A1], SIZE + (p12) LD f116 = [A2], SIZE + (p16) shladd TEMP1 = LDA, 1, r0 + } + ;; + { .mmi + (p13) ST [B1] = f109, 1 * SIZE + (p13) ST [B2] = f121, 1 * SIZE + sub TEMP2 = TEMP2, LDB + } + { .mmi + (p12) LD f107 = [A1], SIZE + (p12) LD f119 = [A2], SIZE + (p16) sub TEMP1 = LDA, TEMP1 + } + ;; + { .mmi + (p13) ST [B1] = f112, 1 * SIZE + (p13) ST [B2] = f124, 1 * SIZE + (p18) adds TEMP2 = -11 * SIZE, LDB + } + { .mmi + (p12) LD f110 = [A1], SIZE + (p12) LD f122 = [A2], SIZE + (p16) adds TEMP1 = 5 * SIZE, TEMP1 + } + ;; + { .mmi + (p13) ST [B1] = f115 + (p13) ST [B2] = f127 + (p13) add B1 = B1, TEMP2 + } + { .mmi + (p12) LD f113 = [A1], TEMP1 + (p12) LD f125 = [A2], TEMP1 + (p13) add B2 = B2, TEMP2 + } + ;; + { .mmb + (p16) adds I = -2, I + (p18) adds II = -2, II + br.ctop.sptk .L212 + } + ;; + .align 32 + +.L220: + { .mmi + add A2 = A1, LDA + nop __LINE__ + tbit.nz p7, p0 = N, 2 + } + ;; + { .mmi + (p7) LD f32 = [A1], SIZE + (p7) LD f36 = [A2], SIZE + tbit.nz p8, p0 = N, 1 + } + ;; + { .mmi + (p7) LD f33 = [A1], SIZE + (p7) LD f37 = [A2], SIZE + tbit.nz p9, p0 = N, 0 + } + ;; + { .mmi + (p7) LD f34 = [A1], SIZE + (p7) LD f38 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) LD f35 = [A1], SIZE + (p7) LD f39 = [A2] + nop __LINE__ + } + ;; + { .mmi + add A2 = A1, LDA + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p8) LD f40 = [A1], SIZE + (p8) LD f42 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p8) LD f41 = [A1], SIZE + (p8) LD f43 = [A2] + nop __LINE__ + } + ;; + { .mmi + add A2 = A1, LDA + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p9) LD f44 = [A1] + (p9) LD f45 = [A2] + adds B2 = 4 * SIZE, BO2 + } + ;; + { .mmi + (p7) ST [BO2] = f32, 1 * SIZE + (p7) ST [B2 ] = f36, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f33, 1 * SIZE + (p7) ST [B2 ] = f37, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f34, 1 * SIZE + (p7) ST [B2 ] = f38, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f35, 5 * SIZE + (p7) ST [B2 ] = f39 + adds B2 = 2 * SIZE, BO3 + } + ;; + { .mmi + (p8) ST [BO3] = f40, 1 * SIZE + (p8) ST [B2 ] = f42, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p8) ST [BO3] = f41, 3 * SIZE + (p8) ST [B2 ] = f43 + adds B2 = 1 * SIZE, BO4 + } + ;; + { .mmi + (p9) ST [BO4] = f44, 2 * SIZE + (p9) ST [B2 ] = f45 + nop __LINE__ + } + ;; + .align 32 + +.L300: + { .mmi + add I = 8, N + mov A1 = A + mov pr.rot = 0 + } + { .mmi + mov B1 = B + adds A2 = 4 * SIZE, A + tbit.z p6, p0 = M, 0 + } + ;; + { .mmi + adds B2 = 4 * SIZE, B + cmp.eq p16, p0 = r0, r0 + mov ar.ec = 3 + } + { .mib + nop __LINE__ + shr I = I, 4 + (p6) br.cond.dpnt .L999 + } + ;; + { .mmi + cmp.eq p8, p0 = 0, I + adds I = -1, I + shr II = N, 3 + } + ;; + { .mmi + nop __LINE__ + nop __LINE__ + mov ar.lc = I + } + { .mib + nop __LINE__ + mov I = II + (p8) br.cond.dpnt .L320 + } + ;; + .align 32 + +.L312: + { .mmi + (p18) ST [B1] = f34, 1 * SIZE + (p18) ST [B2] = f46, 1 * SIZE + (p16) cmp.ne.unc p12, p0 = 1, I + } + { .mmi + (p16) LD f32 = [A1], SIZE + (p16) LD f44 = [A2], SIZE + (p18) cmp.ne.unc p13, p0 = 1, II + } + ;; + { .mmi + (p18) ST [B1] = f37, 1 * SIZE + (p18) ST [B2] = f49, 1 * SIZE + adds TEMP2 = - 3 * SIZE, LDB + } + { .mmi + (p16) LD f35 = [A1], SIZE + (p16) LD f47 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f40, 1 * SIZE + (p18) ST [B2] = f52, 1 * SIZE + nop __LINE__ + } + { .mmi + (p16) LD f38 = [A1], SIZE + (p16) LD f50 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f43 + (p18) ST [B2] = f55 + (p18) add B1 = B1, TEMP2 + } + { .mmi + (p16) LD f41 = [A1], 5 * SIZE + (p16) LD f53 = [A2], 5 * SIZE + (p18) add B2 = B2, TEMP2 + } + ;; + { .mmi + (p13) ST [B1] = f58, 1 * SIZE + (p13) ST [B2] = f70, 1 * SIZE + (p16) adds I = -2, I + } + { .mmi + (p12) LD f56 = [A1], SIZE + (p12) LD f68 = [A2], SIZE + (p18) adds II = -2, II + } + ;; + { .mmi + (p13) ST [B1] = f61, 1 * SIZE + (p13) ST [B2] = f73, 1 * SIZE + nop __LINE__ + } + { .mmi + (p12) LD f59 = [A1], SIZE + (p12) LD f71 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p13) ST [B1] = f64, 1 * SIZE + (p13) ST [B2] = f76, 1 * SIZE + nop __LINE__ + } + { .mmi + (p12) LD f62 = [A1], SIZE + (p12) LD f74 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p13) ST [B1] = f67 + (p13) ST [B2] = f79 + (p13) add B1 = B1, TEMP2 + } + { .mmi + (p12) LD f65 = [A1], 5 * SIZE + (p12) LD f77 = [A2], 5 * SIZE + (p13) add B2 = B2, TEMP2 + } + ;; + { .mmb + nop __LINE__ + nop __LINE__ + br.ctop.sptk .L312 + } + ;; + .align 32 + +.L320: + { .mmi + adds A2 = 2 * SIZE, A1 + adds B2 = 2 * SIZE, BO2 + tbit.nz p7, p0 = N, 2 + } + ;; + { .mmi + (p7) LD f32 = [A1], SIZE + (p7) LD f34 = [A2], SIZE + tbit.nz p8, p0 = N, 1 + } + ;; + { .mmi + (p7) LD f33 = [A1], 3 * SIZE + (p7) LD f35 = [A2] + nop __LINE__ + } + ;; + { .mmi + adds A2 = SIZE, A1 + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p8) LD f36 = [A1], 2 * SIZE + (p8) LD f37 = [A2] + tbit.nz p9, p0 = N, 0 + } + ;; + { .mmi + (p9) LD f38 = [A1] + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f32, 1 * SIZE + (p7) ST [B2 ] = f34, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f33, 3 * SIZE + (p7) ST [B2 ] = f35 + adds B2 = SIZE, BO3 + } + ;; + { .mmi + (p8) ST [BO3] = f36, 2 * SIZE + (p8) ST [B2 ] = f37 + nop __LINE__ + } + ;; + { .mmi + (p9) ST [BO4] = f38, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + .align 32 + +.L999: + mov pr = PR, -1 + mov ar.lc = ARLC + br.ret.sptk.many b0 + EPILOGUE diff --git a/kernel/ia64/gemv_n.S b/kernel/ia64/gemv_n.S new file mode 100644 index 0000000..4826bf5 --- /dev/null +++ b/kernel/ia64/gemv_n.S @@ -0,0 +1,3317 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define SP r12 + +#define M r32 +#define N r33 +#define A r36 +#define LDA r37 +#define X r38 +#define INCX r39 +#define Y r34 +#define INCY r35 +#define BUFFER r11 + +#define I r14 +#define J r15 +#define AO1 r16 +#define AO2 r17 +#define AO3 r18 +#define AO4 r19 +#define AO5 r20 +#define AO6 r21 +#define AO7 r22 +#define AO8 r23 +#define YLD1 r24 +#define YST1 r25 +#define YST2 r27 +#define MM r28 +#define YY r9 + +#define RPRE1 loc0 +#define RPRE2 loc1 +#define RPRE3 loc2 +#define RPRE4 loc3 +#define RPRE5 loc4 +#define RPRE6 loc5 +#define RPRE7 loc6 +#define RPRE8 loc7 + +#define AO11 loc8 +#define AO21 loc9 +#define AO31 loc10 +#define AO41 loc11 +#define AO51 loc12 +#define AO61 loc13 +#define AO71 loc14 +#define AO81 loc15 + +#define PREB r8 + +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#ifdef DOUBLE +#define RPREFETCH (16 * 3 + 8) +#else +#define RPREFETCH (16 * 3 + 16) +#endif +#define PREFETCH lfetch.nt1 + +#define ALPHA f6 + + PROLOGUE + .prologue + PROFCODE + { .mmi + .save ar.pfs, ARPFS + alloc ARPFS = ar.pfs, 8, 16, 8, 0 + mov ARLC = ar.lc + } + ;; + mov PR = pr + adds r14 = 16, SP + adds r15 = 24, SP + adds r16 = 32, SP + ;; + adds r8 = -8 * 16, SP + adds r9 = -7 * 16, SP + adds SP = -8 * 16, SP + ;; + stf.spill [r8] = f16, 32 + stf.spill [r9] = f17, 32 + ;; + stf.spill [r8] = f18, 32 + stf.spill [r9] = f19, 32 + ;; + stf.spill [r8] = f20, 32 + stf.spill [r9] = f21, 32 + ;; + stf.spill [r8] = f22 + stf.spill [r9] = f23 + .body + ;; + + ld8 Y = [r14] + ld8 INCY = [r15] + ld8 BUFFER = [r16] + + mov ALPHA = f8 + cmp.ge p7, p0 = 0, M + cmp.ge p6, p0 = 0, N + ;; + shladd INCX = INCX, BASE_SHIFT, r0 + shladd LDA = LDA, BASE_SHIFT, r0 + shladd INCY = INCY, BASE_SHIFT, r0 + ;; + tbit.nz p8, p0 = A, BASE_SHIFT + tbit.nz p9, p0 = LDA, BASE_SHIFT + mov MM = M + ;; + (p8) adds MM = -1, M + ;; + (p7) br.cond.dpnt .L999 + (p6) br.cond.dpnt .L999 + ;; + sub I = A, Y + cmp.eq p10, p0 = SIZE, INCY + mov YY = Y + ;; + (p10) tbit.z.unc p10, p0 = I, BASE_SHIFT + ;; + (p10) br.cond.dptk .L10 + ;; + shr J = M, 3 + mov YY = BUFFER + ;; + (p8) adds YY = SIZE, BUFFER + ;; + mov ar.lc = J + mov YST1 = YY + adds YST2 = 4 * SIZE, YY + ;; +.L02: + STFD [YST1] = f0, 1 * SIZE + STFD [YST2] = f0, 1 * SIZE + ;; + STFD [YST1] = f0, 1 * SIZE + STFD [YST2] = f0, 1 * SIZE + ;; + STFD [YST1] = f0, 1 * SIZE + STFD [YST2] = f0, 1 * SIZE + ;; + STFD [YST1] = f0, 5 * SIZE + STFD [YST2] = f0, 5 * SIZE + br.cloop.sptk.few .L02 + ;; + +.L10: + { .mib + nop __LINE__ + shr J = N, 3 + (p9) br.cond.dptk .L100 + } + ;; + { .mib + nop __LINE__ + cmp.eq p6, p0 = r0, J + (p6) br.cond.dpnt .L20 + } + ;; + .align 16 + +.L11: + mov YLD1 = YY + mov YST1 = YY + adds YST2 = 4 * SIZE, YY + ;; + LDFD f8 = [X], INCX + ;; + LDFD f9 = [X], INCX + ;; + LDFD f10 = [X], INCX + ;; + LDFD f11 = [X], INCX + ;; + LDFD f12 = [X], INCX + ;; + LDFD f13 = [X], INCX + ;; + LDFD f14 = [X], INCX + ;; + LDFD f15 = [X], INCX + ;; + FMPY f8 = ALPHA, f8 + FMPY f9 = ALPHA, f9 + FMPY f10 = ALPHA, f10 + FMPY f11 = ALPHA, f11 + FMPY f12 = ALPHA, f12 + FMPY f13 = ALPHA, f13 + FMPY f14 = ALPHA, f14 + FMPY f15 = ALPHA, f15 + ;; + mov AO1 = A + add AO2 = LDA, A + ;; + shladd AO3 = LDA, 1, A + shladd AO4 = LDA, 1, AO2 + ;; + shladd AO5 = LDA, 1, AO3 + shladd AO6 = LDA, 1, AO4 + ;; + shladd AO7 = LDA, 1, AO5 + shladd AO8 = LDA, 1, AO6 + shladd A = LDA, 3, A + ;; + ;; + adds PREB = RPREFETCH * SIZE, YLD1 + adds RPRE1 = RPREFETCH * SIZE, AO1 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + adds RPRE3 = RPREFETCH * SIZE, AO3 + adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 + adds RPRE5 = RPREFETCH * SIZE, AO5 + adds RPRE6 = (RPREFETCH + 8) * SIZE, AO6 + adds RPRE7 = RPREFETCH * SIZE, AO7 + adds RPRE8 = (RPREFETCH + 8) * SIZE, AO8 + + (p8) LDFD f80 = [AO1], 1 * SIZE + (p8) LDFD f81 = [AO2], 1 * SIZE + (p8) LDFD f82 = [AO3], 1 * SIZE + (p8) LDFD f83 = [AO4], 1 * SIZE + (p8) LDFD f84 = [AO5], 1 * SIZE + (p8) LDFD f85 = [AO6], 1 * SIZE + (p8) LDFD f86 = [AO7], 1 * SIZE + (p8) LDFD f87 = [AO8], 1 * SIZE + (p8) LDFD f106 = [YLD1], 1 * SIZE + ;; + (p8) FMPY f32 = f8, f80 + (p8) FMPY f33 = f9, f81 + (p8) FMPY f34 = f10, f82 + (p8) FMA f35 = f11, f83, f106 + ;; + (p8) FMA f32 = f12, f84, f32 + (p8) FMA f33 = f13, f85, f33 + (p8) FMA f34 = f14, f86, f34 + (p8) FMA f35 = f15, f87, f35 + ;; + (p8) FADD f32 = f32, f33 + (p8) FADD f34 = f34, f35 + ;; + (p8) FADD f32 = f32, f34 + ;; + (p8) STFD [YST1] = f32, 1 * SIZE + (p8) adds YST2 = 1 * SIZE, YST2 + ;; + + shr I = MM, 3 + mov pr.rot= 0 + ;; + cmp.eq p6, p0 = 0, I + cmp.eq p16, p0 = r0, r0 + ;; + adds I = -1, I + tbit.nz p13, p0 = MM, 2 + ;; + mov ar.lc = I + mov ar.ec = 2 + (p6) br.cond.dpnt .L15 + ;; + .align 16 + +.L12: + { .mmf + (p18) STFD [YST1] = f16, 1 * SIZE + (p18) STFD [YST2] = f17, 1 * SIZE + (p17) FMA f16 = f8, f33, f101 + } + { .mfi + (p17) LDFPD f93, f94 = [AO8], 2 * SIZE + (p17) FMA f17 = f8, f37, f113 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + ;; + { .mmf + (p18) STFD [YST1] = f18, 1 * SIZE + (p18) STFD [YST2] = f19, 1 * SIZE + (p17) FMA f18 = f8, f34, f104 + } + { .mmf + (p14) lfetch.excl.nt1 [PREB], 16 * SIZE + (p17) LDFPD f95, f96 = [AO8], 2 * SIZE + (p17) FMA f19 = f8, f38, f116 + } + ;; + { .mmf + (p18) STFD [YST1] = f20, 1 * SIZE + (p18) STFD [YST2] = f21, 1 * SIZE + (p17) FMA f20 = f8, f35, f107 + } + { .mfi + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f21 = f8, f39, f119 + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YST1] = f22, 5 * SIZE + (p18) STFD [YST2] = f23, 5 * SIZE + (p17) FMA f22 = f8, f36, f110 + } + { .mmf + (p16) LDFPD f34, f35 = [AO1], 2 * SIZE + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p17) FMA f23 = f8, f40, f122 + } + ;; + { .mmf + (p14) PREFETCH [RPRE1], 16 * SIZE + (p16) LDFPD f36, f37 = [AO1], 2 * SIZE + (p17) FMA f16 = f9, f41, f16 + } + { .mfi + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p17) FMA f17 = f9, f45, f17 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f38, f39 = [AO1], 2 * SIZE + (p17) FMA f18 = f9, f42, f18 + nop __LINE__ + } + { .mfi + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p17) FMA f19 = f9, f46, f19 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f40, f41 = [AO2], 2 * SIZE + (p17) FMA f20 = f9, f43, f20 + nop __LINE__ + } + { .mfi + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p17) FMA f21 = f9, f47, f21 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f42, f43 = [AO2], 2 * SIZE + (p17) FMA f22 = f9, f44, f22 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f23 = f9, f48, f23 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f44, f45 = [AO2], 2 * SIZE + (p17) FMA f16 = f10, f49, f16 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f17 = f10, f53, f17 + nop __LINE__ + } + ;; + { .mmf + (p15) PREFETCH [RPRE2], 16 * SIZE + (p16) LDFPD f46, f47 = [AO2], 2 * SIZE + (p17) FMA f18 = f10, f50, f18 + } + { .mfi + nop __LINE__ + (p17) FMA f19 = f10, f54, f19 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f48, f49 = [AO3], 2 * SIZE + (p17) FMA f20 = f10, f51, f20 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f21 = f10, f55, f21 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f50, f51 = [AO3], 2 * SIZE + (p17) FMA f22 = f10, f52, f22 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f23 = f10, f56, f23 + nop __LINE__ + } + ;; + { .mmf + (p14) PREFETCH [RPRE3], 16 * SIZE + (p16) LDFPD f52, f53 = [AO3], 2 * SIZE + (p17) FMA f16 = f11, f57, f16 + } + { .mfi + nop __LINE__ + (p17) FMA f17 = f11, f61, f17 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f54, f55 = [AO3], 2 * SIZE + (p17) FMA f18 = f11, f58, f18 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f19 = f11, f62, f19 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f56, f57 = [AO4], 2 * SIZE + (p17) FMA f20 = f11, f59, f20 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f21 = f11, f63, f21 + nop __LINE__ + } + ;; + { .mmf + (p15) PREFETCH [RPRE4], 16 * SIZE + (p16) LDFPD f58, f59 = [AO4], 2 * SIZE + (p17) FMA f22 = f11, f60, f22 + } + { .mfi + nop __LINE__ + (p17) FMA f23 = f11, f64, f23 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f60, f61 = [AO4], 2 * SIZE + (p17) FMA f16 = f12, f65, f16 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f17 = f12, f69, f17 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f62, f63 = [AO4], 2 * SIZE + (p17) FMA f18 = f12, f66, f18 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f19 = f12, f70, f19 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f64, f65 = [AO5], 2 * SIZE + (p17) FMA f20 = f12, f67, f20 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f21 = f12, f71, f21 + nop __LINE__ + } + ;; + { .mmf + (p14) PREFETCH [RPRE5], 16 * SIZE + (p16) LDFPD f66, f67 = [AO5], 2 * SIZE + (p17) FMA f22 = f12, f68, f22 + } + { .mfi + nop __LINE__ + (p17) FMA f23 = f12, f72, f23 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f68, f69 = [AO5], 2 * SIZE + (p17) FMA f16 = f13, f73, f16 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f17 = f13, f77, f17 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f70, f71 = [AO5], 2 * SIZE + (p17) FMA f18 = f13, f74, f18 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f19 = f13, f78, f19 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f72, f73 = [AO6], 2 * SIZE + (p17) FMA f20 = f13, f75, f20 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f21 = f13, f79, f21 + nop __LINE__ + } + ;; + { .mmf + (p15) PREFETCH [RPRE6], 16 * SIZE + (p16) LDFPD f74, f75 = [AO6], 2 * SIZE + (p17) FMA f22 = f13, f76, f22 + } + { .mfi + nop __LINE__ + (p17) FMA f23 = f13, f80, f23 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f76, f77 = [AO6], 2 * SIZE + (p17) FMA f16 = f14, f81, f16 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f17 = f14, f85, f17 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f78, f79 = [AO6], 2 * SIZE + (p17) FMA f18 = f14, f82, f18 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f19 = f14, f86, f19 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f80, f81 = [AO7], 2 * SIZE + (p17) FMA f20 = f14, f83, f20 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f21 = f14, f87, f21 + nop __LINE__ + } + ;; + { .mmf + (p14) PREFETCH [RPRE7], 16 * SIZE + (p16) LDFPD f82, f83 = [AO7], 2 * SIZE + (p17) FMA f22 = f14, f84, f22 + } + { .mfi + nop __LINE__ + (p17) FMA f23 = f14, f88, f23 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f84, f85 = [AO7], 2 * SIZE + (p17) FMA f16 = f15, f89, f16 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f17 = f15, f93, f17 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f86, f87 = [AO7], 2 * SIZE + (p17) FMA f18 = f15, f90, f18 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f19 = f15, f94, f19 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f88, f89 = [AO8], 2 * SIZE + (p17) FMA f20 = f15, f91, f20 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f21 = f15, f95, f21 + (p16) adds I = -1, I + } + ;; + { .mmf + (p15) PREFETCH [RPRE8], 16 * SIZE + (p16) LDFPD f90, f91 = [AO8], 2 * SIZE + (p17) FMA f22 = f15, f92, f22 + } + { .mfb + nop __LINE__ + (p17) FMA f23 = f15, f96, f23 + br.ctop.sptk.few .L12 + } + ;; + .align 16 + +.L15: + { .mmi + (p18) STFD [YST1] = f16, 1 * SIZE + (p18) STFD [YST2] = f17, 1 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + cmp.lt p6, p0 = 1, J + } + ;; + { .mmi + (p18) STFD [YST1] = f18, 1 * SIZE + (p18) STFD [YST2] = f19, 1 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + adds J = -1, J + } + ;; + { .mmi + (p18) STFD [YST1] = f20, 1 * SIZE + (p18) STFD [YST2] = f21, 1 * SIZE + nop __LINE__ + } + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) STFD [YST1] = f22, 5 * SIZE + (p18) STFD [YST2] = f23, 5 * SIZE + nop __LINE__ + } + { .mmi + (p13) LDFPD f34, f35 = [AO2], 2 * SIZE + (p13) LDFPD f36, f37 = [AO3], 2 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p15) LDFD f80 = [AO1] + (p15) LDFD f106 = [YLD1], 1 * SIZE + nop __LINE__ + } + { .mmi + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p13) LDFPD f50, f51 = [AO2], 2 * SIZE + (p13) LDFPD f52, f53 = [AO3], 2 * SIZE + nop __LINE__ + } + { .mmi + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p14) LDFPD f66, f67 = [AO2], 2 * SIZE + (p14) LDFPD f68, f69 = [AO3], 2 * SIZE + nop __LINE__ + } + { .mmi + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p15) LDFD f81 = [AO2] + (p15) LDFD f82 = [AO3] + nop __LINE__ + } + { .mmi + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + (p13) LDFPD f38, f39 = [AO4], 2 * SIZE + (p13) FMA f100 = f8, f32, f100 + nop __LINE__ + } + { .mfi + (p13) LDFPD f40, f41 = [AO5], 2 * SIZE + (p13) FMA f101 = f8, f33, f101 + nop __LINE__ + } + ;; + { .mfi + (p13) LDFPD f54, f55 = [AO4], 2 * SIZE + (p13) FMA f102 = f8, f48, f102 + nop __LINE__ + } + { .mfi + (p13) LDFPD f56, f57 = [AO5], 2 * SIZE + (p13) FMA f103 = f8, f49, f103 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFPD f70, f71 = [AO4], 2 * SIZE + (p14) FMA f104 = f8, f64, f104 + nop __LINE__ + } + { .mfi + (p14) LDFPD f72, f73 = [AO5], 2 * SIZE + (p14) FMA f105 = f8, f65, f105 + nop __LINE__ + } + ;; + { .mfi + (p15) LDFD f83 = [AO4] + (p15) FMA f106 = f8, f80, f106 + nop __LINE__ + } + { .mfi + (p15) LDFD f84 = [AO5] + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + (p13) LDFPD f42, f43 = [AO6], 2 * SIZE + (p13) FMA f100 = f9, f34, f100 + nop __LINE__ + } + { .mfi + (p13) LDFPD f44, f45 = [AO7], 2 * SIZE + (p13) FMA f101 = f9, f35, f101 + nop __LINE__ + } + ;; + { .mfi + (p13) LDFPD f58, f59 = [AO6], 2 * SIZE + (p13) FMA f102 = f9, f50, f102 + nop __LINE__ + } + { .mfi + (p13) LDFPD f60, f61 = [AO7], 2 * SIZE + (p13) FMA f103 = f9, f51, f103 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFPD f74, f75 = [AO6], 2 * SIZE + (p14) FMA f104 = f9, f66, f104 + nop __LINE__ + } + { .mfi + (p14) LDFPD f76, f77 = [AO7], 2 * SIZE + (p14) FMA f105 = f9, f67, f105 + nop __LINE__ + } + ;; + { .mfi + (p15) LDFD f85 = [AO6] + (p15) FMA f106 = f9, f81, f106 + nop __LINE__ + } + { .mfi + (p15) LDFD f86 = [AO7] + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + (p13) LDFPD f46, f47 = [AO8], 2 * SIZE + (p13) FMA f100 = f10, f36, f100 + nop __LINE__ + } + { .mfi + (p13) FMA f101 = f10, f37, f101 + nop __LINE__ + } + ;; + { .mfi + (p13) LDFPD f62, f63 = [AO8], 2 * SIZE + (p13) FMA f102 = f10, f52, f102 + nop __LINE__ + } + { .mfi + (p13) FMA f103 = f10, f53, f103 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFPD f78, f79 = [AO8], 2 * SIZE + (p14) FMA f104 = f10, f68, f104 + nop __LINE__ + } + { .mfi + (p14) FMA f105 = f10, f69, f105 + nop __LINE__ + } + ;; + { .mfi + (p15) LDFD f87 = [AO8] + (p15) FMA f106 = f10, f82, f106 + nop __LINE__ + } + ;; + (p13) FMA f100 = f11, f38, f100 + (p13) FMA f101 = f11, f39, f101 + (p13) FMA f102 = f11, f54, f102 + (p13) FMA f103 = f11, f55, f103 + (p14) FMA f104 = f11, f70, f104 + (p14) FMA f105 = f11, f71, f105 + (p15) FMA f106 = f11, f83, f106 + ;; + (p13) FMA f100 = f12, f40, f100 + (p13) FMA f101 = f12, f41, f101 + (p13) FMA f102 = f12, f56, f102 + (p13) FMA f103 = f12, f57, f103 + (p14) FMA f104 = f12, f72, f104 + (p14) FMA f105 = f12, f73, f105 + (p15) FMA f106 = f12, f84, f106 + ;; + (p13) FMA f100 = f13, f42, f100 + (p13) FMA f101 = f13, f43, f101 + (p13) FMA f102 = f13, f58, f102 + (p13) FMA f103 = f13, f59, f103 + (p14) FMA f104 = f13, f74, f104 + (p14) FMA f105 = f13, f75, f105 + (p15) FMA f106 = f13, f85, f106 + ;; + (p13) FMA f100 = f14, f44, f100 + (p13) FMA f101 = f14, f45, f101 + (p13) FMA f102 = f14, f60, f102 + (p13) FMA f103 = f14, f61, f103 + (p14) FMA f104 = f14, f76, f104 + (p14) FMA f105 = f14, f77, f105 + (p15) FMA f106 = f14, f86, f106 + ;; + (p13) FMA f100 = f15, f46, f100 + (p13) FMA f101 = f15, f47, f101 + (p13) FMA f102 = f15, f62, f102 + (p13) FMA f103 = f15, f63, f103 + (p14) FMA f104 = f15, f78, f104 + (p14) FMA f105 = f15, f79, f105 + (p15) FMA f106 = f15, f87, f106 + ;; + (p13) STFD [YST1] = f100, 1 * SIZE + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + (p6) br.cond.dptk .L11 + ;; + .align 16 + + +.L20: + { .mmi + mov YLD1 = YY + mov YST1 = YY + tbit.z p6, p0 = N, 2 + } + ;; + { .mib + mov AO1 = A + mov pr.rot= 0 + (p6) br.cond.dpnt .L30 + } + ;; + { .mmi + LDFD f8 = [X], INCX + (p8) LDFD f106 = [YLD1], 1 * SIZE + add AO2 = LDA, A + } + ;; + { .mmi + LDFD f9 = [X], INCX + (p8) LDFD f80 = [AO1], 1 * SIZE + shladd AO4 = LDA, 1, AO2 + } + ;; + { .mmi + LDFD f10 = [X], INCX + (p8) LDFD f81 = [AO2], 1 * SIZE + shladd AO3 = LDA, 1, A + } + ;; + { .mmi + LDFD f11 = [X], INCX + (p8) LDFD f82 = [AO3], 1 * SIZE + } + ;; + { .mfi + (p8) LDFD f83 = [AO4], 1 * SIZE + FMPY f8 = ALPHA, f8 + adds PREB = RPREFETCH * SIZE, YLD1 + } + { .mfi + adds RPRE1 = RPREFETCH * SIZE, AO1 + FMPY f9 = ALPHA, f9 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + } + ;; + FMPY f10 = ALPHA, f10 + shladd A = LDA, 2, A + FMPY f11 = ALPHA, f11 + ;; + { .mfi + adds RPRE3 = RPREFETCH * SIZE, AO3 + (p8) FMA f106 = f8, f80, f106 + mov ar.ec= 2 + } + ;; + adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 + (p8) FMA f106 = f9, f81, f106 + shr I = MM, 3 + ;; + { .mmf + cmp.eq p6, p0 = 0, I + cmp.eq p16, p0 = r0, r0 + (p8) FMA f106 = f10, f82, f106 + } + ;; + { .mfi + adds I = -1, I + (p8) FMA f106 = f11, f83, f106 + tbit.nz p13, p0 = MM, 2 + } + ;; + { .mib + (p8) STFD [YST1] = f106, 1 * SIZE + mov ar.lc = I + (p6) br.cond.dpnt .L25 + } + ;; + .align 16 + +.L22: + { .mfi + (p17) LDFPD f63, f64 = [AO4], 2 * SIZE + (p17) FMA f101 = f8, f33, f101 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mfi + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p17) FMA f104 = f8, f34, f104 + } + ;; + { .mfi + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f107 = f8, f35, f107 + (p16) adds I = -1, I + } + { .mfi + (p14) PREFETCH [RPRE1], 16 * SIZE + (p17) FMA f110 = f8, f36, f110 + } + ;; + { .mfi + (p16) LDFPD f34, f35 = [AO1], 2 * SIZE + (p17) FMA f113 = f8, f37, f113 + } + { .mfi + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p17) FMA f116 = f8, f38, f116 + } + ;; + { .mfi + (p16) LDFPD f36, f37 = [AO1], 2 * SIZE + (p17) FMA f119 = f8, f39, f119 + } + { .mfi + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p17) FMA f122 = f8, f40, f122 + } + ;; + { .mfi + (p16) LDFPD f38, f39 = [AO1], 2 * SIZE + (p17) FMA f101 = f9, f41, f101 + } + { .mfi + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p17) FMA f104 = f9, f42, f104 + } + ;; + { .mmf + (p16) LDFPD f40, f41 = [AO2], 2 * SIZE + (p15) PREFETCH [RPRE2], 16 * SIZE + (p17) FMA f107 = f9, f43, f107 + } + { .mfi + (p18) STFD [YST1] = f16, 1 * SIZE + (p17) FMA f110 = f9, f44, f110 + } + ;; + { .mfi + (p16) LDFPD f42, f43 = [AO2], 2 * SIZE + (p17) FMA f113 = f9, f45, f113 + } + { .mfi + (p18) STFD [YST1] = f17, 1 * SIZE + (p17) FMA f116 = f9, f46, f116 + } + ;; + { .mfi + (p16) LDFPD f44, f45 = [AO2], 2 * SIZE + (p17) FMA f119 = f9, f47, f119 + } + { .mfi + (p18) STFD [YST1] = f18, 1 * SIZE + (p17) FMA f122 = f9, f48, f122 + } + ;; + { .mfi + (p16) LDFPD f46, f47 = [AO2], 2 * SIZE + (p17) FMA f101 = f10, f49, f101 + } + { .mfi + (p14) lfetch.excl.nt2 [PREB], 16 * SIZE + (p17) FMA f104 = f10, f50, f104 + } + ;; + { .mfi + (p16) LDFPD f48, f49 = [AO3], 2 * SIZE + (p17) FMA f107 = f10, f51, f107 + } + { .mfi + (p14) PREFETCH [RPRE3], 16 * SIZE + (p17) FMA f110 = f10, f52, f110 + } + ;; + { .mfi + (p16) LDFPD f50, f51 = [AO3], 2 * SIZE + (p17) FMA f113 = f10, f53, f113 + } + { .mfi + (p18) STFD [YST1] = f19, 1 * SIZE + (p17) FMA f116 = f10, f54, f116 + } + ;; + { .mfi + (p16) LDFPD f52, f53 = [AO3], 2 * SIZE + (p17) FMA f119 = f10, f55, f119 + } + { .mfi + (p18) STFD [YST1] = f20, 1 * SIZE + (p17) FMA f122 = f10, f56, f122 + } + ;; + { .mfi + (p16) LDFPD f54, f55 = [AO3], 2 * SIZE + (p17) FMA f16 = f11, f57, f101 + } + { .mfi + (p15) PREFETCH [RPRE4], 16 * SIZE + (p17) FMA f17 = f11, f58, f104 + } + ;; + { .mfi + (p16) LDFPD f56, f57 = [AO4], 2 * SIZE + (p17) FMA f18 = f11, f59, f107 + } + { .mfi + (p18) STFD [YST1] = f21, 1 * SIZE + (p17) FMA f19 = f11, f60, f110 + } + ;; + { .mfi + (p16) LDFPD f58, f59 = [AO4], 2 * SIZE + (p17) FMA f20 = f11, f61, f113 + } + { .mfi + (p18) STFD [YST1] = f22, 1 * SIZE + (p17) FMA f21 = f11, f62, f116 + } + ;; + { .mfi + (p16) LDFPD f60, f61 = [AO4], 2 * SIZE + (p17) FMA f22 = f11, f63, f119 + } + { .mfb + (p18) STFD [YST1] = f23, 1 * SIZE + (p17) FMA f23 = f11, f64, f122 + br.ctop.sptk.few .L22 + } + ;; + .align 16 + +.L25: + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p18) STFD [YST1] = f16, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p18) STFD [YST1] = f17, 1 * SIZE + } + ;; + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f18, 1 * SIZE + } + ;; + { .mmi + (p15) LDFD f80 = [AO1] + (p15) LDFD f106 = [YLD1], 1 * SIZE + } + { .mmi + (p18) STFD [YST1] = f19, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f34, f35 = [AO2], 2 * SIZE + (p13) LDFPD f36, f37 = [AO3], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f20, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f50, f51 = [AO2], 2 * SIZE + (p13) LDFPD f52, f53 = [AO3], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f21, 1 * SIZE + } + ;; + { .mmi + (p14) LDFPD f66, f67 = [AO2], 2 * SIZE + (p14) LDFPD f68, f69 = [AO3], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f22, 1 * SIZE + } + ;; + { .mmf + (p15) LDFD f81 = [AO2] + (p15) LDFD f82 = [AO3] + (p13) FMA f100 = f8, f32, f100 + } + { .mfi + (p18) STFD [YST1] = f23, 1 * SIZE + (p13) FMA f101 = f8, f33, f101 + } + ;; + ;; + { .mfi + (p13) LDFPD f38, f39 = [AO4], 2 * SIZE + (p13) FMA f102 = f8, f48, f102 + } + { .mfi + (p13) FMA f103 = f8, f49, f103 + } + ;; + { .mfi + (p13) LDFPD f54, f55 = [AO4], 2 * SIZE + (p14) FMA f104 = f8, f64, f104 + } + { .mfi + (p14) FMA f105 = f8, f65, f105 + } + ;; + { .mfi + (p14) LDFPD f70, f71 = [AO4], 2 * SIZE + (p15) FMA f106 = f8, f80, f106 + } + { .mfi + (p13) FMA f100 = f9, f34, f100 + } + ;; + { .mfi + (p15) LDFD f83 = [AO4] + (p13) FMA f101 = f9, f35, f101 + } + { .mfi + (p13) FMA f102 = f9, f50, f102 + } + ;; + (p13) FMA f103 = f9, f51, f103 + (p14) FMA f104 = f9, f66, f104 + (p14) FMA f105 = f9, f67, f105 + (p15) FMA f106 = f9, f81, f106 + ;; + (p13) FMA f100 = f10, f36, f100 + (p13) FMA f101 = f10, f37, f101 + (p13) FMA f102 = f10, f52, f102 + (p13) FMA f103 = f10, f53, f103 + (p14) FMA f104 = f10, f68, f104 + (p14) FMA f105 = f10, f69, f105 + (p15) FMA f106 = f10, f82, f106 + ;; + (p13) FMA f100 = f11, f38, f100 + (p13) FMA f101 = f11, f39, f101 + ;; + (p13) FMA f102 = f11, f54, f102 + (p13) STFD [YST1] = f100, 1 * SIZE + (p13) FMA f103 = f11, f55, f103 + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + (p14) FMA f104 = f11, f70, f104 + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + (p14) FMA f105 = f11, f71, f105 + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + (p15) FMA f106 = f11, f83, f106 + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + ;; + .align 16 + +.L30: + { .mmi + mov YLD1 = YY + mov YST1 = YY + tbit.z p6, p0 = N, 1 + } + ;; + { .mib + mov AO1 = A + mov pr.rot= 0 + (p6) br.cond.dpnt .L40 + } + ;; + { .mmi + LDFD f8 = [X], INCX + (p8) LDFD f106 = [YLD1], 1 * SIZE + add AO2 = LDA, A + } + ;; + { .mmi + LDFD f9 = [X], INCX + (p8) LDFD f80 = [AO1], 1 * SIZE + shladd A = LDA, 1, A + } + ;; + adds PREB = RPREFETCH * SIZE, YLD1 + FMPY f8 = ALPHA, f8 + mov ar.ec= 2 + adds RPRE1 = RPREFETCH * SIZE, AO1 + FMPY f9 = ALPHA, f9 + shr I = MM, 3 + ;; + (p8) LDFD f81 = [AO2], 1 * SIZE + cmp.eq p6, p0 = 0, I + ;; + (p8) FMA f106 = f8, f80, f106 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + tbit.nz p13, p0 = MM, 2 + ;; + (p8) FMA f106 = f9, f81, f106 + cmp.eq p16, p0 = r0, r0 + adds I = -1, I + ;; + { .mib + (p8) STFD [YST1] = f106, 1 * SIZE + mov ar.lc = I + (p6) br.cond.dpnt .L35 + } + ;; + .align 16 + +.L32: + { .mfi + (p17) LDFPD f47, f48 = [AO2], 2 * SIZE + (p17) FMA f101 = f8, f33, f101 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mmf + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p18) STFD [YST1] = f16, 1 * SIZE + (p17) FMA f104 = f8, f34, f104 + } + ;; + { .mfi + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f107 = f8, f35, f107 + adds I = -1, I + } + { .mmf + (p14) PREFETCH [RPRE1], 16 * SIZE + (p18) STFD [YST1] = f17, 1 * SIZE + (p17) FMA f110 = f8, f36, f110 + } + ;; + { .mfi + (p16) LDFPD f34, f35 = [AO1], 2 * SIZE + (p17) FMA f113 = f8, f37, f113 + } + { .mmf + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p18) STFD [YST1] = f18, 1 * SIZE + (p17) FMA f116 = f8, f38, f116 + } + ;; + { .mfi + (p16) LDFPD f36, f37 = [AO1], 2 * SIZE + (p17) FMA f119 = f8, f39, f119 + } + { .mmf + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p18) STFD [YST1] = f19, 1 * SIZE + (p17) FMA f122 = f8, f40, f122 + } + ;; + { .mfi + (p16) LDFPD f38, f39 = [AO1], 2 * SIZE + (p17) FMA f16 = f9, f41, f101 + } + { .mmf + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p18) STFD [YST1] = f20, 1 * SIZE + (p17) FMA f17 = f9, f42, f104 + } + ;; + { .mfi + (p16) LDFPD f40, f41 = [AO2], 2 * SIZE + (p17) FMA f18 = f9, f43, f107 + } + { .mmf + (p15) PREFETCH [RPRE2], 16 * SIZE + (p18) STFD [YST1] = f21, 1 * SIZE + (p17) FMA f19 = f9, f44, f110 + } + ;; + { .mfi + (p16) LDFPD f42, f43 = [AO2], 2 * SIZE + (p17) FMA f20 = f9, f45, f113 + } + { .mmf + (p14) PREFETCH [PREB], 16 * SIZE + (p18) STFD [YST1] = f22, 1 * SIZE + (p17) FMA f21 = f9, f46, f116 + } + ;; + { .mfi + (p16) LDFPD f44, f45 = [AO2], 2 * SIZE + (p17) FMA f22 = f9, f47, f119 + } + { .mfb + (p18) STFD [YST1] = f23, 1 * SIZE + (p17) FMA f23 = f9, f48, f122 + br.ctop.sptk.few .L32 + } + ;; + .align 16 + +.L35: + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p18) STFD [YST1] = f16, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p18) STFD [YST1] = f17, 1 * SIZE + } + ;; + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f18, 1 * SIZE + } + ;; + { .mmi + (p15) LDFD f80 = [AO1] + (p15) LDFD f106 = [YLD1], 1 * SIZE + } + { .mmi + (p18) STFD [YST1] = f19, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f34, f35 = [AO2], 2 * SIZE + (p18) STFD [YST1] = f20, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f50, f51 = [AO2], 2 * SIZE + (p18) STFD [YST1] = f21, 1 * SIZE + } + ;; + { .mmi + (p14) LDFPD f66, f67 = [AO2], 2 * SIZE + (p18) STFD [YST1] = f22, 1 * SIZE + } + ;; + { .mmi + (p15) LDFD f81 = [AO2] + (p18) STFD [YST1] = f23, 1 * SIZE + } + ;; + (p13) FMA f100 = f8, f32, f100 + (p13) FMA f101 = f8, f33, f101 + (p13) FMA f102 = f8, f48, f102 + (p13) FMA f103 = f8, f49, f103 + (p14) FMA f104 = f8, f64, f104 + (p14) FMA f105 = f8, f65, f105 + (p15) FMA f106 = f8, f80, f106 + ;; + (p13) FMA f100 = f9, f34, f100 + (p13) FMA f101 = f9, f35, f101 + (p13) FMA f102 = f9, f50, f102 + (p13) FMA f103 = f9, f51, f103 + (p14) FMA f104 = f9, f66, f104 + (p14) FMA f105 = f9, f67, f105 + (p15) FMA f106 = f9, f81, f106 + ;; + (p13) STFD [YST1] = f100, 1 * SIZE + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + ;; + .align 16 + +.L40: + { .mmi + mov YLD1 = YY + mov YST1 = YY + tbit.z p6, p0 = N, 0 + } + ;; + { .mib + mov AO1 = A + mov pr.rot= 0 + (p6) br.cond.dpnt .L990 + } + ;; + { .mmi + LDFD f8 = [X], INCX + (p8) LDFD f106 = [YLD1], 1 * SIZE + adds RPRE1 = RPREFETCH * SIZE, AO1 + } + ;; + { .mii + (p8) LDFD f80 = [AO1], 1 * SIZE + adds PREB = RPREFETCH * SIZE, YLD1 + } + ;; + FMPY f8 = ALPHA, f8 + shr I = MM, 3 + ;; + (p8) FMA f106 = f8, f80, f106 + mov ar.ec= 3 + ;; + { .mmi + cmp.eq p6, p0 = 0, I + cmp.eq p16, p0 = r0, r0 + tbit.nz p14, p15 = r0, 0 + } + ;; + { .mmi + adds YST2 = 4 * SIZE, YST1 + adds I = -1, I + tbit.nz p13, p0 = MM, 2 + } + ;; + { .mmi + (p8) STFD [YST1] = f106, 1 * SIZE + (p8) adds YST2 = 1 * SIZE, YST2 + } + { .mib + mov ar.lc = I + (p6) br.cond.dpnt .L145 + } + ;; + .align 16 + +.L42: + { .mmf + (p19) STFD [YST1] = f16, 1 * SIZE + (p19) STFD [YST2] = f20, 1 * SIZE + (p18) FMA f16 = f8, f34, f102 + } + { .mmf + (p16) LDFPD f32, f35 = [AO1], 2 * SIZE + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p18) FMA f20 = f8, f46, f114 + } + ;; + { .mmf + (p19) STFD [YST1] = f17, 1 * SIZE + (p19) STFD [YST2] = f21, 1 * SIZE + (p18) FMA f17 = f8, f37, f105 + } + { .mmf + (p16) LDFPD f38, f41 = [AO1], 2 * SIZE + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p18) FMA f21 = f8, f49, f117 + } + ;; + { .mmf + (p19) STFD [YST1] = f18, 1 * SIZE + (p19) STFD [YST2] = f22, 1 * SIZE + (p18) FMA f18 = f8, f40, f108 + } + { .mmf + (p16) LDFPD f44, f47 = [AO1], 2 * SIZE + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p18) FMA f22 = f8, f52, f120 + } + ;; + { .mmf + (p19) STFD [YST1] = f19, 5 * SIZE + (p19) STFD [YST2] = f23, 5 * SIZE + (p18) FMA f19 = f8, f43, f111 + } + { .mmf + (p16) LDFPD f50, f53 = [AO1], 2 * SIZE + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p18) FMA f23 = f8, f55, f123 + } + ;; + { .mmi + (p14) PREFETCH [RPRE1], 16 * SIZE + (p14) PREFETCH [PREB], 16 * SIZE + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mib + nop __LINE__ + (p16) adds I = -1, I + br.ctop.sptk.few .L42 + } + ;; + .align 16 + +.L45: + { .mmi + (p19) STFD [YST1] = f16, 1 * SIZE + (p19) STFD [YST2] = f20, 1 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + } + ;; + { .mmi + (p19) STFD [YST1] = f17, 1 * SIZE + (p19) STFD [YST2] = f21, 1 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + } + ;; + { .mmi + (p19) STFD [YST1] = f18, 1 * SIZE + (p19) STFD [YST2] = f22, 1 * SIZE + } + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + } + ;; + { .mmi + (p19) STFD [YST1] = f19, 5 * SIZE + (p19) STFD [YST2] = f23, 5 * SIZE + } + { .mmi + (p15) LDFD f80 = [AO1] + (p15) LDFD f106 = [YLD1], 1 * SIZE + } + ;; + (p13) FMA f100 = f8, f32, f100 + (p13) FMA f101 = f8, f33, f101 + (p13) FMA f102 = f8, f48, f102 + (p13) FMA f103 = f8, f49, f103 + ;; + (p13) STFD [YST1] = f100, 1 * SIZE + (p14) FMA f104 = f8, f64, f104 + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + (p14) FMA f105 = f8, f65, f105 + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + (p15) FMA f106 = f8, f80, f106 + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + br .L990 + ;; + .align 16 + +.L100: + shr J = N, 3 + ;; + cmp.eq p6, p0 = r0, J + (p6) br.cond.dpnt .L120 + ;; + .align 16 + +.L111: + mov YLD1 = YY + mov YST1 = YY + ;; + LDFD f8 = [X], INCX + ;; + LDFD f9 = [X], INCX + ;; + LDFD f10 = [X], INCX + ;; + LDFD f11 = [X], INCX + ;; + LDFD f12 = [X], INCX + ;; + LDFD f13 = [X], INCX + ;; + LDFD f14 = [X], INCX + ;; + LDFD f15 = [X], INCX + ;; + FMPY f8 = ALPHA, f8 + FMPY f9 = ALPHA, f9 + FMPY f10 = ALPHA, f10 + FMPY f11 = ALPHA, f11 + FMPY f12 = ALPHA, f12 + FMPY f13 = ALPHA, f13 + FMPY f14 = ALPHA, f14 + FMPY f15 = ALPHA, f15 + ;; + mov AO1 = A + add AO2 = LDA, A + ;; + shladd AO3 = LDA, 1, A + shladd AO4 = LDA, 1, AO2 + ;; + shladd AO5 = LDA, 1, AO3 + shladd AO6 = LDA, 1, AO4 + ;; + shladd AO7 = LDA, 1, AO5 + shladd AO8 = LDA, 1, AO6 + shladd A = LDA, 3, A + ;; + ;; + adds PREB = RPREFETCH * SIZE, YLD1 + adds RPRE1 = RPREFETCH * SIZE, AO1 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + adds RPRE3 = RPREFETCH * SIZE, AO3 + adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 + adds RPRE5 = RPREFETCH * SIZE, AO5 + adds RPRE6 = (RPREFETCH + 8) * SIZE, AO6 + adds RPRE7 = RPREFETCH * SIZE, AO7 + adds RPRE8 = (RPREFETCH + 8) * SIZE, AO8 + + (p8) LDFD f80 = [AO1], 1 * SIZE + (p8) LDFD f81 = [AO2], 1 * SIZE + (p8) LDFD f82 = [AO3], 1 * SIZE + (p8) LDFD f83 = [AO4], 1 * SIZE + (p8) LDFD f84 = [AO5], 1 * SIZE + (p8) LDFD f85 = [AO6], 1 * SIZE + (p8) LDFD f86 = [AO7], 1 * SIZE + (p8) LDFD f87 = [AO8], 1 * SIZE + (p8) LDFD f106 = [YLD1], 1 * SIZE + ;; + (p8) FMPY f32 = f8, f80 + (p8) FMPY f33 = f9, f81 + (p8) FMPY f34 = f10, f82 + (p8) FMA f35 = f11, f83, f106 + ;; + (p8) FMA f32 = f12, f84, f32 + (p8) FMA f33 = f13, f85, f33 + (p8) FMA f34 = f14, f86, f34 + (p8) FMA f35 = f15, f87, f35 + ;; + (p8) FADD f32 = f32, f33 + (p8) FADD f34 = f34, f35 + ;; + (p8) FADD f32 = f32, f34 + ;; + (p8) STFD [YST1] = f32, 1 * SIZE + + shr I = MM, 3 + mov pr.rot= 0 + ;; + cmp.eq p6, p0 = 0, I + cmp.eq p16, p0 = r0, r0 + ;; + adds I = -1, I + tbit.nz p13, p0 = MM, 2 + ;; + mov ar.lc = I + mov ar.ec= 2 + (p6) br.cond.dpnt .L115 + ;; + .align 16 + +.L112: + { .mfi + (p17) LDFD f96 = [AO8], 1 * SIZE + (p17) FMA f101 = f8, f33, f101 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mfi + (p17) FMA f104 = f8, f34, f104 + } + ;; + { .mfi + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f107 = f8, f35, f107 + } + { .mfi + (p14) PREFETCH [RPRE1], 16 * SIZE + (p17) FMA f110 = f8, f36, f110 + } + ;; + { .mfi + (p16) LDFPD f34, f35 = [AO1], 2 * SIZE + (p17) FMA f113 = f8, f37, f113 + } + { .mfi + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p17) FMA f116 = f8, f38, f116 + } + ;; + { .mfi + (p16) LDFPD f36, f37 = [AO1], 2 * SIZE + (p17) FMA f119 = f8, f39, f119 + } + { .mfi + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p17) FMA f122 = f8, f40, f122 + } + ;; + { .mfi + (p16) LDFPD f38, f39 = [AO1], 2 * SIZE + (p17) FMA f101 = f9, f41, f101 + } + { .mmf + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p16) LDFD f40 = [AO2], 1 * SIZE + (p17) FMA f104 = f9, f42, f104 + } + ;; + { .mfi + (p16) LDFPD f41, f42 = [AO2], 2 * SIZE + (p17) FMA f107 = f9, f43, f107 + } + { .mfi + (p15) PREFETCH [RPRE2], 16 * SIZE + (p17) FMA f110 = f9, f44, f110 + } + ;; + { .mfi + (p16) LDFPD f43, f44 = [AO2], 2 * SIZE + (p17) FMA f113 = f9, f45, f113 + } + { .mfi + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p17) FMA f116 = f9, f46, f116 + } + ;; + { .mfi + (p16) LDFPD f45, f46 = [AO2], 2 * SIZE + (p17) FMA f119 = f9, f47, f119 + } + { .mfi + (p18) STFD [YST1] = f16, 1 * SIZE + (p17) FMA f122 = f9, f48, f122 + } + ;; + { .mfi + (p16) LDFD f47 = [AO2], 1 * SIZE + (p17) FMA f101 = f10, f49, f101 + } + { .mfi + (p18) STFD [YST1] = f17, 1 * SIZE + (p17) FMA f104 = f10, f50, f104 + } + ;; + { .mfi + (p16) LDFPD f48, f49 = [AO3], 2 * SIZE + (p17) FMA f107 = f10, f51, f107 + } + { .mfi + (p14) PREFETCH [RPRE3], 16 * SIZE + (p17) FMA f110 = f10, f52, f110 + } + ;; + { .mfi + (p16) LDFPD f50, f51 = [AO3], 2 * SIZE + (p17) FMA f113 = f10, f53, f113 + } + { .mfi + (p17) FMA f116 = f10, f54, f116 + } + ;; + { .mfi + (p16) LDFPD f52, f53 = [AO3], 2 * SIZE + (p17) FMA f119 = f10, f55, f119 + } + { .mfi + (p18) STFD [YST1] = f18, 1 * SIZE + (p17) FMA f122 = f10, f56, f122 + } + ;; + { .mfi + (p16) LDFPD f54, f55 = [AO3], 2 * SIZE + (p17) FMA f101 = f11, f57, f101 + } + { .mmf + (p18) STFD [YST1] = f19, 1 * SIZE + (p16) LDFD f56 = [AO4], 1 * SIZE + (p17) FMA f104 = f11, f58, f104 + } + ;; + { .mfi + (p16) LDFPD f57, f58 = [AO4], 2 * SIZE + (p17) FMA f107 = f11, f59, f107 + } + { .mfi + (p15) PREFETCH [RPRE4], 16 * SIZE + (p17) FMA f110 = f11, f60, f110 + } + ;; + { .mfi + (p16) LDFPD f59, f60 = [AO4], 2 * SIZE + (p17) FMA f113 = f11, f61, f113 + } + { .mfi + (p17) FMA f116 = f11, f62, f116 + } + ;; + { .mfi + (p16) LDFPD f61, f62 = [AO4], 2 * SIZE + (p17) FMA f119 = f11, f63, f119 + } + { .mfi + (p17) FMA f122 = f11, f64, f122 + } + ;; + { .mfi + (p16) LDFD f63 = [AO4], 1 * SIZE + (p17) FMA f101 = f12, f65, f101 + } + { .mfi + (p18) STFD [YST1] = f20, 1 * SIZE + (p17) FMA f104 = f12, f66, f104 + } + ;; + { .mfi + (p16) LDFPD f64, f65 = [AO5], 2 * SIZE + (p17) FMA f107 = f12, f67, f107 + } + { .mfi + (p18) STFD [YST1] = f21, 1 * SIZE + (p17) FMA f110 = f12, f68, f110 + } + ;; + { .mfi + (p16) LDFPD f66, f67 = [AO5], 2 * SIZE + (p17) FMA f113 = f12, f69, f113 + } + { .mfi + (p14) PREFETCH [RPRE5], 16 * SIZE + (p17) FMA f116 = f12, f70, f116 + } + ;; + { .mfi + (p16) LDFPD f68, f69 = [AO5], 2 * SIZE + (p17) FMA f119 = f12, f71, f119 + } + { .mfi + (p18) STFD [YST1] = f22, 1 * SIZE + (p17) FMA f122 = f12, f72, f122 + } + ;; + { .mfi + (p16) LDFPD f70, f71 = [AO5], 2 * SIZE + (p17) FMA f101 = f13, f73, f101 + } + { .mmf + (p18) STFD [YST1] = f23, 1 * SIZE + (p16) LDFD f72 = [AO6], 1 * SIZE + (p17) FMA f104 = f13, f74, f104 + } + ;; + { .mfi + (p16) LDFPD f73, f74 = [AO6], 2 * SIZE + (p17) FMA f107 = f13, f75, f107 + } + { .mfi + (p15) PREFETCH [RPRE6], 16 * SIZE + (p17) FMA f110 = f13, f76, f110 + } + ;; + { .mfi + (p16) LDFPD f75, f76 = [AO6], 2 * SIZE + (p17) FMA f113 = f13, f77, f113 + } + { .mfi + (p17) FMA f116 = f13, f78, f116 + } + ;; + { .mfi + (p16) LDFPD f77, f78 = [AO6], 2 * SIZE + (p17) FMA f119 = f13, f79, f119 + } + { .mfi + (p17) FMA f122 = f13, f80, f122 + } + ;; + { .mfi + (p16) LDFD f79 = [AO6], 1 * SIZE + (p17) FMA f101 = f14, f81, f101 + } + { .mfi + (p17) FMA f104 = f14, f82, f104 + } + ;; + { .mfi + (p16) LDFPD f80, f81 = [AO7], 2 * SIZE + (p17) FMA f107 = f14, f83, f107 + } + { .mfi + (p14) PREFETCH [RPRE7], 16 * SIZE + (p17) FMA f110 = f14, f84, f110 + } + ;; + { .mfi + (p16) LDFPD f82, f83 = [AO7], 2 * SIZE + (p17) FMA f113 = f14, f85, f113 + } + { .mfi + (p17) FMA f116 = f14, f86, f116 + } + ;; + { .mfi + (p16) LDFPD f84, f85 = [AO7], 2 * SIZE + (p17) FMA f119 = f14, f87, f119 + } + { .mfi + (p17) FMA f122 = f14, f88, f122 + } + ;; + { .mfi + (p16) LDFPD f86, f87 = [AO7], 2 * SIZE + (p17) FMA f16 = f15, f89, f101 + } + { .mfi + (p16) LDFD f88 = [AO8], 1 * SIZE + (p17) FMA f17 = f15, f90, f104 + } + ;; + { .mfi + (p16) LDFPD f89, f90 = [AO8], 2 * SIZE + (p17) FMA f18 = f15, f91, f107 + } + { .mfi + (p15) PREFETCH [RPRE8], 16 * SIZE + (p17) FMA f19 = f15, f92, f110 + } + ;; + { .mfi + (p16) LDFPD f91, f92 = [AO8], 2 * SIZE + (p17) FMA f20 = f15, f93, f113 + } + { .mfi + (p14) lfetch.excl.nt2 [PREB], 16 * SIZE + (p17) FMA f21 = f15, f94, f116 + } + ;; + { .mfi + (p16) LDFPD f93, f94 = [AO8], 2 * SIZE + (p17) FMA f22 = f15, f95, f119 + } + { .mfb + (p16) adds I = -1, I + (p17) FMA f23 = f15, f96, f122 + br.ctop.sptk.few .L112 + } + ;; + .align 16 + +.L115: + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p18) STFD [YST1] = f16, 1 * SIZE + cmp.lt p6, p0 = 1, J + adds J = -1, J + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p18) STFD [YST1] = f17, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f18, 1 * SIZE + (p13) LDFD f34 = [AO2], 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p13) LDFPD f35, f50 = [AO2], 2 * SIZE + (p13) LDFPD f36, f37 = [AO3], 2 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f19, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p15) LDFD f80 = [AO1] + (p15) LDFD f106 = [YLD1], 1 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f20, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p13) LDFD f51 = [AO2], 1 * SIZE + (p13) LDFPD f52, f53 = [AO3], 2 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f21, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p14) LDFD f66 = [AO2], 1 * SIZE + (p14) LDFPD f68, f69 = [AO3], 2 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f22, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p14) LDFD f67 = [AO2], 1 * SIZE + (p15) LDFD f82 = [AO3] + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f23, 1 * SIZE + nop __LINE__ + } + ;; + { .mmf + (p15) LDFD f81 = [AO2] + (p13) LDFD f38 = [AO4], 1 * SIZE + (p13) FMA f100 = f8, f32, f100 + } + { .mfi + (p13) LDFPD f40, f41 = [AO5], 2 * SIZE + (p13) FMA f101 = f8, f33, f101 + nop __LINE__ + } + ;; + { .mfi + (p13) LDFPD f39, f54 = [AO4], 2 * SIZE + (p13) FMA f102 = f8, f48, f102 + nop __LINE__ + } + { .mfi + (p13) LDFPD f56, f57 = [AO5], 2 * SIZE + (p13) FMA f103 = f8, f49, f103 + nop __LINE__ + } + ;; + { .mfi + (p13) LDFD f55 = [AO4], 1 * SIZE + (p14) FMA f104 = f8, f64, f104 + nop __LINE__ + } + { .mfi + (p14) LDFPD f72, f73 = [AO5], 2 * SIZE + (p14) FMA f105 = f8, f65, f105 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFD f70 = [AO4], 1 * SIZE + (p15) FMA f106 = f8, f80, f106 + nop __LINE__ + } + { .mmi + (p15) LDFD f84 = [AO5] + (p13) LDFD f42 = [AO6], 1 * SIZE + nop __LINE__ + } + ;; + { .mmf + (p13) LDFPD f43, f58 = [AO6], 2 * SIZE + (p14) LDFD f71 = [AO4], 1 * SIZE + (p13) FMA f100 = f9, f34, f100 + } + { .mfi + (p13) LDFPD f44, f45 = [AO7], 2 * SIZE + (p13) FMA f101 = f9, f35, f101 + nop __LINE__ + } + ;; + { .mmf + (p13) LDFD f59 = [AO6], 1 * SIZE + (p15) LDFD f83 = [AO4] + (p13) FMA f102 = f9, f50, f102 + } + { .mfi + (p13) LDFPD f60, f61 = [AO7], 2 * SIZE + (p13) FMA f103 = f9, f51, f103 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFD f74 = [AO6], 1 * SIZE + (p14) FMA f104 = f9, f66, f104 + nop __LINE__ + } + { .mfi + (p14) LDFPD f76, f77 = [AO7], 2 * SIZE + (p14) FMA f105 = f9, f67, f105 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFD f75 = [AO6], 1 * SIZE + (p15) FMA f106 = f9, f81, f106 + nop __LINE__ + } + { .mmi + (p15) LDFD f86 = [AO7] + (p13) LDFD f46 = [AO8], 1 * SIZE + nop __LINE__ + } + ;; + { .mmf + (p13) LDFPD f47, f62 = [AO8], 2 * SIZE + (p15) LDFD f85 = [AO6] + (p13) FMA f100 = f10, f36, f100 + } + { .mfi + (p13) FMA f101 = f10, f37, f101 + nop __LINE__ + } + ;; + { .mfi + (p13) LDFD f63 = [AO8], 1 * SIZE + (p13) FMA f102 = f10, f52, f102 + nop __LINE__ + } + { .mfi + (p13) FMA f103 = f10, f53, f103 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFD f78 = [AO8], 1 * SIZE + (p14) FMA f104 = f10, f68, f104 + nop __LINE__ + } + { .mfi + (p14) FMA f105 = f10, f69, f105 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFD f79 = [AO8], 1 * SIZE + (p15) FMA f106 = f10, f82, f106 + nop __LINE__ + } + ;; + (p15) LDFD f87 = [AO8] + (p13) FMA f100 = f11, f38, f100 + (p13) FMA f101 = f11, f39, f101 + (p13) FMA f102 = f11, f54, f102 + (p13) FMA f103 = f11, f55, f103 + (p14) FMA f104 = f11, f70, f104 + (p14) FMA f105 = f11, f71, f105 + (p15) FMA f106 = f11, f83, f106 + ;; + (p13) FMA f100 = f12, f40, f100 + (p13) FMA f101 = f12, f41, f101 + (p13) FMA f102 = f12, f56, f102 + (p13) FMA f103 = f12, f57, f103 + (p14) FMA f104 = f12, f72, f104 + (p14) FMA f105 = f12, f73, f105 + (p15) FMA f106 = f12, f84, f106 + ;; + (p13) FMA f100 = f13, f42, f100 + (p13) FMA f101 = f13, f43, f101 + (p13) FMA f102 = f13, f58, f102 + (p13) FMA f103 = f13, f59, f103 + (p14) FMA f104 = f13, f74, f104 + (p14) FMA f105 = f13, f75, f105 + (p15) FMA f106 = f13, f85, f106 + ;; + (p13) FMA f100 = f14, f44, f100 + (p13) FMA f101 = f14, f45, f101 + (p13) FMA f102 = f14, f60, f102 + (p13) FMA f103 = f14, f61, f103 + (p14) FMA f104 = f14, f76, f104 + (p14) FMA f105 = f14, f77, f105 + (p15) FMA f106 = f14, f86, f106 + ;; + (p13) FMA f100 = f15, f46, f100 + (p13) FMA f101 = f15, f47, f101 + (p13) FMA f102 = f15, f62, f102 + (p13) FMA f103 = f15, f63, f103 + (p14) FMA f104 = f15, f78, f104 + (p14) FMA f105 = f15, f79, f105 + (p15) FMA f106 = f15, f87, f106 + ;; + (p13) STFD [YST1] = f100, 1 * SIZE + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + (p6) br.cond.dptk .L111 + ;; + .align 16 + +.L120: + { .mmi + mov YLD1 = YY + mov YST1 = YY + tbit.z p6, p0 = N, 2 + } + ;; + { .mib + mov AO1 = A + mov pr.rot= 0 + (p6) br.cond.dpnt .L130 + } + ;; + { .mmi + LDFD f8 = [X], INCX + (p8) LDFD f106 = [YLD1], 1 * SIZE + add AO2 = LDA, A + } + ;; + { .mmi + LDFD f9 = [X], INCX + (p8) LDFD f80 = [AO1], 1 * SIZE + shladd AO4 = LDA, 1, AO2 + } + ;; + { .mmi + LDFD f10 = [X], INCX + (p8) LDFD f81 = [AO2], 1 * SIZE + shladd AO3 = LDA, 1, A + } + ;; + { .mmi + LDFD f11 = [X], INCX + (p8) LDFD f82 = [AO3], 1 * SIZE + } + ;; + { .mfi + (p8) LDFD f83 = [AO4], 1 * SIZE + FMPY f8 = ALPHA, f8 + adds PREB = RPREFETCH * SIZE, YLD1 + } + { .mfi + adds RPRE1 = RPREFETCH * SIZE, AO1 + FMPY f9 = ALPHA, f9 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + } + ;; + FMPY f10 = ALPHA, f10 + shladd A = LDA, 2, A + FMPY f11 = ALPHA, f11 + ;; + { .mfi + adds RPRE3 = RPREFETCH * SIZE, AO3 + (p8) FMA f106 = f8, f80, f106 + mov ar.ec= 2 + } + ;; + adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 + (p8) FMA f106 = f9, f81, f106 + shr I = MM, 3 + ;; + { .mmf + cmp.eq p6, p0 = 0, I + cmp.eq p16, p0 = r0, r0 + (p8) FMA f106 = f10, f82, f106 + } + ;; + { .mfi + adds I = -1, I + (p8) FMA f106 = f11, f83, f106 + tbit.nz p13, p0 = MM, 2 + } + ;; + { .mib + (p8) STFD [YST1] = f106, 1 * SIZE + mov ar.lc = I + (p6) br.cond.dpnt .L125 + } + ;; + .align 16 + +.L122: + { .mfi + (p17) LDFD f64 = [AO4], 1 * SIZE + (p17) FMA f101 = f8, f33, f101 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mfi + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p17) FMA f104 = f8, f34, f104 + } + ;; + { .mfi + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f107 = f8, f35, f107 + (p16) adds I = -1, I + } + { .mfi + (p14) PREFETCH [RPRE1], 16 * SIZE + (p17) FMA f110 = f8, f36, f110 + } + ;; + { .mfi + (p16) LDFPD f34, f35 = [AO1], 2 * SIZE + (p17) FMA f113 = f8, f37, f113 + } + { .mfi + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p17) FMA f116 = f8, f38, f116 + } + ;; + { .mfi + (p16) LDFPD f36, f37 = [AO1], 2 * SIZE + (p17) FMA f119 = f8, f39, f119 + } + { .mfi + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p17) FMA f122 = f8, f40, f122 + } + ;; + { .mfi + (p16) LDFPD f38, f39 = [AO1], 2 * SIZE + (p17) FMA f101 = f9, f41, f101 + } + { .mmf + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p16) LDFD f40 = [AO2], 1 * SIZE + (p17) FMA f104 = f9, f42, f104 + } + ;; + { .mmf + (p16) LDFPD f41, f42 = [AO2], 2 * SIZE + (p15) PREFETCH [RPRE2], 16 * SIZE + (p17) FMA f107 = f9, f43, f107 + } + { .mfi + (p18) STFD [YST1] = f16, 1 * SIZE + (p17) FMA f110 = f9, f44, f110 + } + ;; + { .mfi + (p16) LDFPD f43, f44 = [AO2], 2 * SIZE + (p17) FMA f113 = f9, f45, f113 + } + { .mfi + (p18) STFD [YST1] = f17, 1 * SIZE + (p17) FMA f116 = f9, f46, f116 + } + ;; + { .mfi + (p16) LDFPD f45, f46 = [AO2], 2 * SIZE + (p17) FMA f119 = f9, f47, f119 + } + { .mfi + (p18) STFD [YST1] = f18, 1 * SIZE + (p17) FMA f122 = f9, f48, f122 + } + ;; + { .mfi + (p16) LDFD f47 = [AO2], 1 * SIZE + (p17) FMA f101 = f10, f49, f101 + } + { .mfi + (p14) lfetch.excl.nt2 [PREB], 16 * SIZE + (p17) FMA f104 = f10, f50, f104 + } + ;; + { .mfi + (p16) LDFPD f48, f49 = [AO3], 2 * SIZE + (p17) FMA f107 = f10, f51, f107 + } + { .mfi + (p14) PREFETCH [RPRE3], 16 * SIZE + (p17) FMA f110 = f10, f52, f110 + } + ;; + { .mfi + (p16) LDFPD f50, f51 = [AO3], 2 * SIZE + (p17) FMA f113 = f10, f53, f113 + } + { .mfi + (p18) STFD [YST1] = f19, 1 * SIZE + (p17) FMA f116 = f10, f54, f116 + } + ;; + { .mfi + (p16) LDFPD f52, f53 = [AO3], 2 * SIZE + (p17) FMA f119 = f10, f55, f119 + } + { .mfi + (p18) STFD [YST1] = f20, 1 * SIZE + (p17) FMA f122 = f10, f56, f122 + } + ;; + { .mfi + (p16) LDFPD f54, f55 = [AO3], 2 * SIZE + (p17) FMA f16 = f11, f57, f101 + } + { .mmf + (p15) PREFETCH [RPRE4], 16 * SIZE + (p16) LDFD f56 = [AO4], 1 * SIZE + (p17) FMA f17 = f11, f58, f104 + } + ;; + { .mfi + (p16) LDFPD f57, f58 = [AO4], 2 * SIZE + (p17) FMA f18 = f11, f59, f107 + } + { .mfi + (p18) STFD [YST1] = f21, 1 * SIZE + (p17) FMA f19 = f11, f60, f110 + } + ;; + { .mfi + (p16) LDFPD f59, f60 = [AO4], 2 * SIZE + (p17) FMA f20 = f11, f61, f113 + } + { .mfi + (p18) STFD [YST1] = f22, 1 * SIZE + (p17) FMA f21 = f11, f62, f116 + } + ;; + { .mfi + (p16) LDFPD f61, f62 = [AO4], 2 * SIZE + (p17) FMA f22 = f11, f63, f119 + } + { .mfb + (p18) STFD [YST1] = f23, 1 * SIZE + (p17) FMA f23 = f11, f64, f122 + br.ctop.sptk.few .L122 + } + ;; + .align 16 + +.L125: + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p18) STFD [YST1] = f16, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p18) STFD [YST1] = f17, 1 * SIZE + } + ;; + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f18, 1 * SIZE + } + ;; + { .mmi + (p18) STFD [YST1] = f19, 1 * SIZE + (p15) LDFD f80 = [AO1] + } + { .mmi + (p15) LDFD f106 = [YLD1], 1 * SIZE + (p13) LDFD f34 = [AO2], 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f35, f50 = [AO2], 2 * SIZE + (p13) LDFPD f36, f37 = [AO3], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f20, 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f51 = [AO2], 1 * SIZE + (p13) LDFPD f52, f53 = [AO3], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f21, 1 * SIZE + } + ;; + { .mmi + (p14) LDFD f66 = [AO2], 1 * SIZE + (p14) LDFPD f68, f69 = [AO3], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f22, 1 * SIZE + } + ;; + { .mmf + (p18) STFD [YST1] = f23, 1 * SIZE + (p14) LDFD f67 = [AO2], 1 * SIZE + (p13) FMA f100 = f8, f32, f100 + } + { .mmf + (p15) LDFD f82 = [AO3] + (p13) LDFD f38 = [AO4], 1 * SIZE + (p13) FMA f101 = f8, f33, f101 + } + ;; + ;; + { .mmf + (p13) LDFPD f39, f54 = [AO4], 2 * SIZE + (p15) LDFD f81 = [AO2] + (p13) FMA f102 = f8, f48, f102 + } + { .mfi + (p13) FMA f103 = f8, f49, f103 + } + ;; + { .mfi + (p13) LDFD f55 = [AO4], 1 * SIZE + (p14) FMA f104 = f8, f64, f104 + } + { .mfi + (p14) FMA f105 = f8, f65, f105 + } + ;; + { .mfi + (p14) LDFD f70 = [AO4], 1 * SIZE + (p15) FMA f106 = f8, f80, f106 + } + { .mfi + (p13) FMA f100 = f9, f34, f100 + } + ;; + { .mfi + (p14) LDFD f71 = [AO4], 1 * SIZE + (p13) FMA f101 = f9, f35, f101 + } + { .mfi + (p13) FMA f102 = f9, f50, f102 + } + ;; + (p15) LDFD f83 = [AO4] + (p13) FMA f103 = f9, f51, f103 + (p14) FMA f104 = f9, f66, f104 + (p14) FMA f105 = f9, f67, f105 + (p15) FMA f106 = f9, f81, f106 + ;; + (p13) FMA f100 = f10, f36, f100 + (p13) FMA f101 = f10, f37, f101 + (p13) FMA f102 = f10, f52, f102 + (p13) FMA f103 = f10, f53, f103 + (p14) FMA f104 = f10, f68, f104 + (p14) FMA f105 = f10, f69, f105 + (p15) FMA f106 = f10, f82, f106 + ;; + (p13) FMA f100 = f11, f38, f100 + (p13) FMA f101 = f11, f39, f101 + ;; + (p13) FMA f102 = f11, f54, f102 + (p13) STFD [YST1] = f100, 1 * SIZE + (p13) FMA f103 = f11, f55, f103 + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + (p14) FMA f104 = f11, f70, f104 + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + (p14) FMA f105 = f11, f71, f105 + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + (p15) FMA f106 = f11, f83, f106 + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + ;; + .align 16 + +.L130: + { .mmi + mov YLD1 = YY + mov YST1 = YY + tbit.z p6, p0 = N, 1 + } + ;; + { .mib + mov AO1 = A + mov pr.rot= 0 + (p6) br.cond.dpnt .L140 + } + ;; + { .mmi + LDFD f8 = [X], INCX + (p8) LDFD f106 = [YLD1], 1 * SIZE + add AO2 = LDA, A + } + ;; + { .mmi + LDFD f9 = [X], INCX + (p8) LDFD f80 = [AO1], 1 * SIZE + shladd A = LDA, 1, A + } + ;; + adds PREB = RPREFETCH * SIZE, YLD1 + FMPY f8 = ALPHA, f8 + mov ar.ec= 2 + adds RPRE1 = RPREFETCH * SIZE, AO1 + FMPY f9 = ALPHA, f9 + shr I = MM, 3 + ;; + (p8) LDFD f81 = [AO2], 1 * SIZE + cmp.eq p6, p0 = 0, I + ;; + (p8) FMA f106 = f8, f80, f106 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + tbit.nz p13, p0 = MM, 2 + ;; + (p8) FMA f106 = f9, f81, f106 + cmp.eq p16, p0 = r0, r0 + adds I = -1, I + ;; + { .mib + (p8) STFD [YST1] = f106, 1 * SIZE + mov ar.lc = I + (p6) br.cond.dpnt .L135 + } + ;; + .align 16 + +.L132: + { .mfi + (p17) LDFD f48 = [AO2], 1 * SIZE + (p17) FMA f101 = f8, f33, f101 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mmf + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p18) STFD [YST1] = f16, 1 * SIZE + (p17) FMA f104 = f8, f34, f104 + } + ;; + { .mfi + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f107 = f8, f35, f107 + adds I = -1, I + } + { .mmf + (p14) PREFETCH [RPRE1], 16 * SIZE + (p18) STFD [YST1] = f17, 1 * SIZE + (p17) FMA f110 = f8, f36, f110 + } + ;; + { .mfi + (p16) LDFPD f34, f35 = [AO1], 2 * SIZE + (p17) FMA f113 = f8, f37, f113 + } + { .mmf + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p18) STFD [YST1] = f18, 1 * SIZE + (p17) FMA f116 = f8, f38, f116 + } + ;; + { .mfi + (p16) LDFPD f36, f37 = [AO1], 2 * SIZE + (p17) FMA f119 = f8, f39, f119 + } + { .mmf + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p18) STFD [YST1] = f19, 1 * SIZE + (p17) FMA f122 = f8, f40, f122 + } + ;; + { .mmf + (p16) LDFPD f38, f39 = [AO1], 2 * SIZE + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p17) FMA f16 = f9, f41, f101 + } + { .mmf + (p18) STFD [YST1] = f20, 1 * SIZE + (p16) LDFD f40 = [AO2], 1 * SIZE + (p17) FMA f17 = f9, f42, f104 + } + ;; + { .mfi + (p16) LDFPD f41, f42 = [AO2], 2 * SIZE + (p17) FMA f18 = f9, f43, f107 + } + { .mmf + (p15) PREFETCH [RPRE2], 16 * SIZE + (p18) STFD [YST1] = f21, 1 * SIZE + (p17) FMA f19 = f9, f44, f110 + } + ;; + { .mfi + (p16) LDFPD f43, f44 = [AO2], 2 * SIZE + (p17) FMA f20 = f9, f45, f113 + } + { .mmf + (p14) PREFETCH [PREB], 16 * SIZE + (p18) STFD [YST1] = f22, 1 * SIZE + (p17) FMA f21 = f9, f46, f116 + } + ;; + { .mfi + (p16) LDFPD f45, f46 = [AO2], 2 * SIZE + (p17) FMA f22 = f9, f47, f119 + } + { .mfb + (p18) STFD [YST1] = f23, 1 * SIZE + (p17) FMA f23 = f9, f48, f122 + br.ctop.sptk.few .L132 + } + ;; + .align 16 + +.L135: + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p18) STFD [YST1] = f16, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p18) STFD [YST1] = f17, 1 * SIZE + } + ;; + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f18, 1 * SIZE + } + ;; + { .mmi + (p15) LDFD f80 = [AO1] + (p15) LDFD f106 = [YLD1], 1 * SIZE + } + { .mmi + (p18) STFD [YST1] = f19, 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f34 = [AO2], 1 * SIZE + (p18) STFD [YST1] = f20, 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f35 = [AO2], 1 * SIZE + (p18) STFD [YST1] = f21, 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f50 = [AO2], 1 * SIZE + (p18) STFD [YST1] = f22, 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f51 = [AO2], 1 * SIZE + (p18) STFD [YST1] = f23, 1 * SIZE + } + ;; + (p14) LDFD f66 = [AO2], 1 * SIZE + (p13) FMA f100 = f8, f32, f100 + ;; + (p14) LDFD f67 = [AO2], 1 * SIZE + (p13) FMA f101 = f8, f33, f101 + ;; + (p15) LDFD f81 = [AO2] + (p13) FMA f102 = f8, f48, f102 + (p13) FMA f103 = f8, f49, f103 + (p14) FMA f104 = f8, f64, f104 + (p14) FMA f105 = f8, f65, f105 + (p15) FMA f106 = f8, f80, f106 + ;; + (p13) FMA f100 = f9, f34, f100 + (p13) FMA f101 = f9, f35, f101 + (p13) FMA f102 = f9, f50, f102 + (p13) FMA f103 = f9, f51, f103 + (p14) FMA f104 = f9, f66, f104 + (p14) FMA f105 = f9, f67, f105 + (p15) FMA f106 = f9, f81, f106 + ;; + (p13) STFD [YST1] = f100, 1 * SIZE + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + ;; + .align 16 + +.L140: + { .mmi + mov YLD1 = YY + mov YST1 = YY + tbit.z p6, p0 = N, 0 + } + ;; + { .mib + mov AO1 = A + mov pr.rot= 0 + (p6) br.cond.dpnt .L990 + } + ;; + { .mmi + LDFD f8 = [X], INCX + (p8) LDFD f106 = [YLD1], 1 * SIZE + adds RPRE1 = RPREFETCH * SIZE, AO1 + } + ;; + { .mmi + (p8) LDFD f80 = [AO1], 1 * SIZE + adds PREB = RPREFETCH * SIZE, YLD1 + } + ;; + FMPY f8 = ALPHA, f8 + shr I = MM, 3 + ;; + (p8) FMA f106 = f8, f80, f106 + mov ar.ec= 3 + ;; + { .mmi + cmp.eq p6, p0 = 0, I + cmp.eq p16, p0 = r0, r0 + tbit.nz p14, p15 = r0, 0 + } + ;; + { .mmi + adds YST2 = 4 * SIZE, YST1 + adds I = -1, I + tbit.nz p13, p0 = MM, 2 + } + ;; + { .mmi + (p8) STFD [YST1] = f106, 1 * SIZE + (p8) adds YST2 = 1 * SIZE, YST2 + } + { .mib + mov ar.lc = I + (p6) br.cond.dpnt .L145 + } + ;; + .align 16 + +.L142: + { .mmf + (p19) STFD [YST1] = f16, 1 * SIZE + (p19) STFD [YST2] = f20, 1 * SIZE + (p18) FMA f16 = f8, f34, f102 + } + { .mmf + (p16) LDFPD f32, f35 = [AO1], 2 * SIZE + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p18) FMA f20 = f8, f46, f114 + } + ;; + { .mmf + (p19) STFD [YST1] = f17, 1 * SIZE + (p19) STFD [YST2] = f21, 1 * SIZE + (p18) FMA f17 = f8, f37, f105 + } + { .mmf + (p16) LDFPD f38, f41 = [AO1], 2 * SIZE + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p18) FMA f21 = f8, f49, f117 + } + ;; + { .mmf + (p19) STFD [YST1] = f18, 1 * SIZE + (p19) STFD [YST2] = f22, 1 * SIZE + (p18) FMA f18 = f8, f40, f108 + } + { .mmf + (p16) LDFPD f44, f47 = [AO1], 2 * SIZE + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p18) FMA f22 = f8, f52, f120 + } + ;; + { .mmf + (p19) STFD [YST1] = f19, 5 * SIZE + (p19) STFD [YST2] = f23, 5 * SIZE + (p18) FMA f19 = f8, f43, f111 + } + { .mmf + (p16) LDFPD f50, f53 = [AO1], 2 * SIZE + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p18) FMA f23 = f8, f55, f123 + } + ;; + { .mmi + (p14) PREFETCH [RPRE1], 16 * SIZE + (p14) PREFETCH [PREB], 16 * SIZE + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mib + nop __LINE__ + (p16) adds I = -1, I + br.ctop.sptk.few .L142 + } + ;; + .align 16 + +.L145: + { .mmi + (p19) STFD [YST1] = f16, 1 * SIZE + (p19) STFD [YST2] = f20, 1 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + } + ;; + { .mmi + (p19) STFD [YST1] = f17, 1 * SIZE + (p19) STFD [YST2] = f21, 1 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + } + ;; + { .mmi + (p19) STFD [YST1] = f18, 1 * SIZE + (p19) STFD [YST2] = f22, 1 * SIZE + } + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + } + ;; + { .mmi + (p19) STFD [YST1] = f19, 5 * SIZE + (p19) STFD [YST2] = f23, 5 * SIZE + } + { .mmi + (p15) LDFD f80 = [AO1] + (p15) LDFD f106 = [YLD1], 1 * SIZE + } + ;; + (p13) FMA f100 = f8, f32, f100 + (p13) FMA f101 = f8, f33, f101 + (p13) FMA f102 = f8, f48, f102 + (p13) FMA f103 = f8, f49, f103 + (p14) FMA f104 = f8, f64, f104 + (p14) FMA f105 = f8, f65, f105 + (p15) FMA f106 = f8, f80, f106 + ;; + (p13) STFD [YST1] = f100, 1 * SIZE + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + ;; + .align 16 + +.L990: + { .mmi + mov YLD1 = YY + mov YST1 = Y + mov pr.rot= 0 + } + { .mib + mov YST2 = Y + shr J = M, 3 + (p10) br.cond.dptk .L999 + } + ;; + { .mmi + cmp.eq p6, p0 = r0, J + adds J = -1, J + mov ar.ec = 4 + } + { .mmi + cmp.eq p16, p0 = r0, r0 + nop __LINE__ + tbit.nz p13, p0 = M, 2 + } + ;; + { .mib + nop __LINE__ + mov ar.lc = J + (p6) br.cond.dpnt .L995 + } + ;; +.L992: + { .mfi + (p19) STFD [YST2] = f35 + (p18) FADD f34 = f34, f66 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f64 = [YLD1], 1 * SIZE + (p16) LDFD f32 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f39 + (p18) FADD f38 = f38, f70 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f36 = [YST1], INCY + (p16) LDFD f68 = [YLD1], 1 * SIZE + } + ;; + { .mfi + (p19) STFD [YST2] = f43 + (p18) FADD f42 = f42, f74 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f72 = [YLD1], 1 * SIZE + (p16) LDFD f40 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f47 + (p18) FADD f46 = f46, f78 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f76 = [YLD1], 1 * SIZE + (p16) LDFD f44 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f51 + (p18) FADD f50 = f50, f82 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f80 = [YLD1], 1 * SIZE + (p16) LDFD f48 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f55 + (p18) FADD f54 = f54, f86 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f84 = [YLD1], 1 * SIZE + (p16) LDFD f52 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f59 + (p18) FADD f58 = f58, f90 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f88 = [YLD1], 1 * SIZE + (p16) LDFD f56 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f63 + (p18) FADD f62 = f62, f94 + (p19) add YST2 = YST2, INCY + } + { .mmb + (p16) LDFD f92 = [YLD1], 1 * SIZE + (p16) LDFD f60 = [YST1], INCY + br.ctop.sptk.few .L992 + } + ;; + +.L995: + (p13) LDFD f32 = [YST1], INCY + (p13) LDFD f40 = [YLD1], 1 * SIZE + tbit.nz p14, p0 = M, 1 + ;; + (p13) LDFD f33 = [YST1], INCY + (p13) LDFD f41 = [YLD1], 1 * SIZE + tbit.nz p15, p0 = M, 0 + ;; + (p13) LDFD f34 = [YST1], INCY + (p13) LDFD f42 = [YLD1], 1 * SIZE + ;; + (p13) LDFD f35 = [YST1], INCY + (p13) LDFD f43 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f36 = [YST1], INCY + (p14) LDFD f44 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f37 = [YST1], INCY + (p14) LDFD f45 = [YLD1], 1 * SIZE + ;; + (p15) LDFD f38 = [YST1], INCY + (p15) LDFD f46 = [YLD1], 1 * SIZE + ;; + (p13) FADD f32 = f32, f40 + (p13) FADD f33 = f33, f41 + (p13) FADD f34 = f34, f42 + (p13) FADD f35 = f35, f43 + (p14) FADD f36 = f36, f44 + (p14) FADD f37 = f37, f45 + (p15) FADD f38 = f38, f46 + ;; + (p13) STFD [YST2] = f32 + (p13) add YST2 = YST2, INCY + ;; + (p13) STFD [YST2] = f33 + (p13) add YST2 = YST2, INCY + ;; + (p13) STFD [YST2] = f34 + (p13) add YST2 = YST2, INCY + ;; + (p13) STFD [YST2] = f35 + (p13) add YST2 = YST2, INCY + ;; + (p14) STFD [YST2] = f36 + (p14) add YST2 = YST2, INCY + ;; + (p14) STFD [YST2] = f37 + (p14) add YST2 = YST2, INCY + ;; + (p15) STFD [YST2] = f38 + ;; + +.L999: + mov r8 = r0 + adds r9 = 1 * 16, SP + ;; + ldf.fill f16 = [SP], 32 + ldf.fill f17 = [r9], 32 + mov ar.lc = ARLC + ;; + ldf.fill f18 = [SP], 32 + ldf.fill f19 = [r9], 32 + mov pr = PR, -1 + ;; + ldf.fill f20 = [SP], 32 + ldf.fill f21 = [r9], 32 + mov ar.pfs = ARPFS + ;; + ldf.fill f22 = [SP], 32 + ldf.fill f23 = [r9] + br.ret.sptk.many b0 + ;; + EPILOGUE diff --git a/kernel/ia64/gemv_t.S b/kernel/ia64/gemv_t.S new file mode 100644 index 0000000..6bc579e --- /dev/null +++ b/kernel/ia64/gemv_t.S @@ -0,0 +1,3557 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define P 4096 +#define SP r12 + +#define M r32 +#define N r33 +#define A r36 +#define LDA r37 +#define X r38 +#define INCX r39 +#define Y r34 +#define INCY r35 +#define BUFFER r11 + +#define MIN_M r14 +#define I r15 +#define J r16 +#define IS r17 +#define AO1 r18 +#define AO2 r19 +#define AO3 r20 +#define AO4 r21 +#define AO5 r22 +#define AO6 r23 +#define AO7 r24 +#define AO8 r25 +#define BO r26 +#define LDAP r27 + +#define RPRE1 loc0 +#define RPRE2 loc1 +#define RPRE3 loc2 +#define RPRE4 loc3 +#define RPRE5 loc4 +#define RPRE6 loc5 +#define RPRE7 loc6 +#define RPRE8 loc7 + +#define AO21 loc8 +#define AO41 loc9 +#define AO61 loc10 +#define AO81 loc11 + +#define PREB r8 +#define WPRE r9 +#define OFFSET PREB +#define CO r10 + +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#ifdef DOUBLE +#define RPREFETCH (16 * 3 + 8) +#else +#define RPREFETCH (16 * 3 + 16) +#endif +#define PREFETCH lfetch.nt1 + +#define ALPHA f6 + + PROLOGUE + .prologue + PROFCODE + { .mmi + .save ar.pfs, ARPFS + alloc ARPFS = ar.pfs, 8, 16, 8, 0 + setf.sig f11 = LDA + mov ARLC = ar.lc + } + { .mmi + adds r15 = 24, SP + adds r16 = 32, SP + adds r14 = 16, SP + } + ;; + { .mmi + setf.sig f10 = N + ld8 Y = [r14] + mov PR = pr + } + { .mmi + ld8 INCY = [r15] + adds r8 = -8 * 16, SP + adds r9 = -7 * 16, SP + } + ;; + { .mmi + stf.spill [r8] = f16, 32 + stf.spill [r9] = f17, 32 + adds SP = -8 * 16, SP + } + ;; + { .mmf + stf.spill [r8] = f18, 32 + stf.spill [r9] = f19, 32 + mov ALPHA = f8 + } + ;; + { .mmi + stf.spill [r8] = f20, 32 + stf.spill [r9] = f21, 32 + mov IS = 0 + } + ;; + { .mmf + stf.spill [r8] = f22 + stf.spill [r9] = f23 + xmpy.l f10 = f10, f11 + } + .body + ;; + ;; + { .mmi + ld8 BUFFER = [r16] + cmp.ge p7, p0 = r0, M + cmp.ge p6, p0 = r0, N + } + ;; + { .mmi + shladd INCX = INCX, BASE_SHIFT, r0 + shladd LDA = LDA, BASE_SHIFT, r0 + shladd INCY = INCY, BASE_SHIFT, r0 + } + ;; + { .mmi + getf.sig LDAP = f10 + mov r2 = P + tbit.nz p8, p0 = A, BASE_SHIFT + } + { .mmi + nop __LINE__ + nop __LINE__ + tbit.nz p9, p0 = LDA, BASE_SHIFT + } + ;; + { .mbb + sub LDAP = r2, LDAP + (p7) br.cond.dpnt .L999 + (p6) br.cond.dpnt .L999 + } + .align 16 + ;; + +.LIs_loop: + { .mmi + sub MIN_M = M, IS + (p8) LDFD f32 = [X], INCX + mov pr.rot= 0 + } + { .mmi + mov AO1 = BUFFER + adds AO2 = 4 * SIZE, BUFFER + } + ;; + cmp.le p6, p0 = r2, MIN_M + ;; + (p6) mov MIN_M = P + ;; + (p8) adds MIN_M = -1, MIN_M + ;; + { .mmi + shladd OFFSET = INCX, 2, INCX + shladd BO = INCX, 2, X + shr I = MIN_M, 3 + } + ;; + { .mmi + adds I = -1, I + cmp.eq p16, p0 = r0, r0 + mov ar.ec= 5 + } + ;; + { .mmi + (p8) STFD [AO1] = f32, 2 * SIZE + (p8) adds AO2 = 6 * SIZE, BUFFER + mov ar.lc = I + } + { .mib + cmp.gt p6, p0 = 0, I + tbit.nz p13, p0 = MIN_M, 2 + (p6) br.cond.dpnt .L05 + } + ;; + .align 16 + +.L01: + (p20) STFD [AO1] = f36, SIZE + (p20) STFD [AO2] = f56, SIZE + (p16) LDFD f32 = [X], INCX + (p16) LDFD f52 = [BO], INCX + ;; + (p20) STFD [AO1] = f41, SIZE + (p20) STFD [AO2] = f61, SIZE + (p16) LDFD f37 = [X], INCX + (p16) LDFD f57 = [BO], INCX + ;; + (p20) STFD [AO1] = f46, SIZE + (p20) STFD [AO2] = f66, SIZE + (p16) LDFD f42 = [X], INCX + (p16) LDFD f62 = [BO], INCX + ;; + (p20) STFD [AO1] = f51, 5 * SIZE + (p20) STFD [AO2] = f71, 5 * SIZE + (p16) LDFD f47 = [X], OFFSET + (p16) LDFD f67 = [BO], OFFSET + br.ctop.sptk.few .L01 + ;; + .align 16 + +.L05: + (p13) LDFD f32 = [X], INCX + tbit.nz p14, p0 = MIN_M, 1 + ;; + (p13) LDFD f33 = [X], INCX + tbit.nz p15, p0 = MIN_M, 0 + ;; + (p13) LDFD f34 = [X], INCX + ;; + (p13) LDFD f35 = [X], INCX + ;; + (p14) LDFD f36 = [X], INCX + ;; + (p13) STFD [AO1] = f32, SIZE + (p14) LDFD f37 = [X], INCX + ;; + (p13) STFD [AO1] = f33, SIZE + (p15) LDFD f38 = [X], INCX + ;; + (p13) STFD [AO1] = f34, SIZE + ;; + (p13) STFD [AO1] = f35, SIZE + ;; + (p14) STFD [AO1] = f36, SIZE + ;; + (p14) STFD [AO1] = f37, SIZE + ;; + (p15) STFD [AO1] = f38, SIZE + (p9) br.cond.dpnt .L100 + ;; + .align 16 + +.L10: + { .mmi + mov CO = Y + nop __LINE__ + shr J = N, 3 + } + ;; + { .mib + nop __LINE__ + cmp.eq p6, p0 = r0, J + (p6) br.cond.dpnt .L20 + } + ;; + .align 16 + +.L11: + { .mfi + mov AO1 = A + mov f8 = f0 + mov pr.rot= 0 + } + { .mfi + add AO2 = LDA, A + mov f10 = f0 + shr I = MIN_M, 4 + } + ;; + { .mmf + shladd AO3 = LDA, 1, A + shladd AO4 = LDA, 1, AO2 + mov f12 = f0 + } + { .mmf + (p8) LDFD f32 = [AO1], SIZE + (p8) LDFD f33 = [AO2], SIZE + mov f14 = f0 + } + ;; + { .mmf + shladd AO5 = LDA, 1, AO3 + shladd AO6 = LDA, 1, AO4 + mov f16 = f0 + } + { .mmf + (p8) LDFD f34 = [AO3], SIZE + (p8) LDFD f35 = [AO4], SIZE + mov f18 = f0 + } + ;; + { .mmf + shladd AO7 = LDA, 1, AO5 + shladd AO8 = LDA, 1, AO6 + mov f20 = f0 + } + { .mmf + (p8) LDFD f36 = [AO5], SIZE + (p8) LDFD f37 = [AO6], SIZE + mov f22 = f0 + } + ;; + { .mfi + (p8) LDFD f38 = [AO7], SIZE + mov f9 = f0 + mov ar.ec= 2 + } + { .mmf + (p8) LDFD f39 = [AO8], SIZE + mov BO = BUFFER + mov f11 = f0 + } + ;; + { .mmf + (p8) LDFD f40 = [BO], 2 * SIZE + cmp.eq p6, p0 = 0, I + mov f13 = f0 + } + { .mmf + shladd A = LDA, 3, A + cmp.eq p16, p0 = r0, r0 + mov f15 = f0 + } + ;; + { .mmf + add I = I, I + nop __LINE__ + mov f17 = f0 + } + { .mmf + adds RPRE1 = RPREFETCH * SIZE, AO1 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + mov f19 = f0 + } + ;; + { .mmf + adds I = -1, I + nop __LINE__ + mov f21 = f0 + } + { .mmf + adds RPRE3 = RPREFETCH * SIZE, AO3 + adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 + mov f23 = f0 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p8) FMPY f8 = f40, f32 + } + { .mmf + adds RPRE5 = RPREFETCH * SIZE, AO5 + adds RPRE6 = (RPREFETCH + 8) * SIZE, AO6 + (p8) FMPY f10 = f40, f33 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p8) FMPY f12 = f40, f34 + } + { .mmf + adds RPRE7 = RPREFETCH * SIZE, AO7 + adds RPRE8 = (RPREFETCH + 8) * SIZE, AO8 + (p8) FMPY f14 = f40, f35 + } + ;; + { .mfi + nop __LINE__ + (p8) FMPY f16 = f40, f36 + mov ar.lc = I + } + { .mmf + adds WPRE = 8 * SIZE, CO + adds PREB = RPREFETCH * SIZE, BO + (p8) FMPY f18 = f40, f37 + } + ;; + { .mmf + lfetch.excl.nt1 [WPRE] + nop __LINE__ + (p8) FMPY f20 = f40, f38 + } + { .mfb + nop __LINE__ + (p8) FMPY f22 = f40, f39 + (p6) br.cond.dpnt .L15 + } + ;; + .align 16 + +.L12: + { .mfi + (p17) LDFPD f95, f96 = [AO8], 2 * SIZE + (p17) FMA f8 = f104, f33, f8 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mfi + (p17) LDFPD f110, f111 = [BO], 2 * SIZE + (p17) FMA f9 = f105, f34, f9 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f10 = f104, f35, f10 + nop __LINE__ + } + { .mfi + (p14) PREFETCH [RPRE1], 16 * SIZE + (p17) FMA f11 = f105, f36, f11 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f34, f35 = [AO2], 2 * SIZE + (p17) FMA f12 = f104, f37, f12 + nop __LINE__ + } + { .mfi + (p15) PREFETCH [RPRE2], 16 * SIZE + (p17) FMA f13 = f105, f38, f13 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f36, f37 = [AO3], 2 * SIZE + (p17) FMA f14 = f104, f39, f14 + nop __LINE__ + } + { .mfi + (p14) PREFETCH [RPRE3], 16 * SIZE + (p17) FMA f15 = f105, f40, f15 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f38, f39 = [AO4], 2 * SIZE + (p17) FMA f16 = f104, f41, f16 + nop __LINE__ + } + { .mfi + (p15) PREFETCH [RPRE4], 16 * SIZE + (p17) FMA f17 = f105, f42, f17 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f40, f41 = [AO5], 2 * SIZE + (p17) FMA f18 = f104, f43, f18 + nop __LINE__ + } + { .mfi + (p14) PREFETCH [RPRE5], 16 * SIZE + (p17) FMA f19 = f105, f44, f19 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f42, f43 = [AO6], 2 * SIZE + (p17) FMA f20 = f104, f45, f20 + nop __LINE__ + } + { .mfi + (p15) PREFETCH [RPRE6], 16 * SIZE + (p17) FMA f21 = f105, f46, f21 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f44, f45 = [AO7], 2 * SIZE + (p17) FMA f22 = f104, f47, f22 + nop __LINE__ + } + { .mfi + (p14) PREFETCH [RPRE7], 16 * SIZE + (p17) FMA f23 = f105, f48, f23 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f46, f47 = [AO8], 2 * SIZE + (p17) FMA f8 = f106, f49, f8 + nop __LINE__ + } + { .mfi + (p15) PREFETCH [RPRE8], 16 * SIZE + (p17) FMA f9 = f107, f50, f9 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f48, f49 = [AO1], 2 * SIZE + (p17) FMA f10 = f106, f51, f10 + nop __LINE__ + } + { .mfi + (p14) PREFETCH [PREB], 16 * SIZE + (p17) FMA f11 = f107, f52, f11 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f50, f51 = [AO2], 2 * SIZE + (p17) FMA f12 = f106, f53, f12 + nop __LINE__ + } + { .mfi + (p16) LDFPD f103, f104 = [BO], 2 * SIZE + (p17) FMA f13 = f107, f54, f13 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f52, f53 = [AO3], 2 * SIZE + (p17) FMA f14 = f106, f55, f14 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f15 = f107, f56, f15 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f54, f55 = [AO4], 2 * SIZE + (p17) FMA f16 = f106, f57, f16 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f17 = f107, f58, f17 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f56, f57 = [AO5], 2 * SIZE + (p17) FMA f18 = f106, f59, f18 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f19 = f107, f60, f19 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f58, f59 = [AO6], 2 * SIZE + (p17) FMA f20 = f106, f61, f20 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f21 = f107, f62, f21 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f60, f61 = [AO7], 2 * SIZE + (p17) FMA f22 = f106, f63, f22 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f23 = f107, f64, f23 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f62, f63 = [AO8], 2 * SIZE + (p17) FMA f8 = f108, f65, f8 + nop __LINE__ + } + { .mfi + (p16) LDFPD f105, f106 = [BO], 2 * SIZE + (p17) FMA f9 = f109, f66, f9 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f64, f65 = [AO1], 2 * SIZE + (p17) FMA f10 = f108, f67, f10 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f11 = f109, f68, f11 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f66, f67 = [AO2], 2 * SIZE + (p17) FMA f12 = f108, f69, f12 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f13 = f109, f70, f13 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f68, f69 = [AO3], 2 * SIZE + (p17) FMA f14 = f108, f71, f14 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f15 = f109, f72, f15 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f70, f71 = [AO4], 2 * SIZE + (p17) FMA f16 = f108, f73, f16 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f17 = f109, f74, f17 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f72, f73 = [AO5], 2 * SIZE + (p17) FMA f18 = f108, f75, f18 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f19 = f109, f76, f19 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f74, f75 = [AO6], 2 * SIZE + (p17) FMA f20 = f108, f77, f20 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f21 = f109, f78, f21 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f76, f77 = [AO7], 2 * SIZE + (p17) FMA f22 = f108, f79, f22 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f23 = f109, f80, f23 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f107, f108 = [BO], 2 * SIZE + (p17) FMA f8 = f110, f81, f8 + nop __LINE__ + } + { .mfi + (p16) LDFPD f78, f79 = [AO8], 2 * SIZE + (p17) FMA f9 = f111, f82, f9 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f80, f81 = [AO1], 2 * SIZE + (p17) FMA f10 = f110, f83, f10 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f11 = f111, f84, f11 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f82, f83 = [AO2], 2 * SIZE + (p17) FMA f12 = f110, f85, f12 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f13 = f111, f86, f13 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f84, f85 = [AO3], 2 * SIZE + (p17) FMA f14 = f110, f87, f14 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f15 = f111, f88, f15 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f86, f87 = [AO4], 2 * SIZE + (p17) FMA f16 = f110, f89, f16 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f17 = f111, f90, f17 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f88, f89 = [AO5], 2 * SIZE + (p17) FMA f18 = f110, f91, f18 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f19 = f111, f92, f19 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f90, f91 = [AO6], 2 * SIZE + (p17) FMA f20 = f110, f93, f20 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f21 = f111, f94, f21 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f92, f93 = [AO7], 2 * SIZE + (p17) FMA f22 = f110, f95, f22 + nop __LINE__ + } + { .mfb + adds I = -1, I + (p17) FMA f23 = f111, f96, f23 + br.ctop.sptk.few .L12 + } + ;; + .align 16 + +.L15: + and I = 15, MIN_M + mov pr.rot= 0 + ;; + cmp.eq p6, p0 = 0, I + cmp.eq p16, p15 = r0, r0 + ;; + adds I = 1, I + ;; + shr I = I, 1 + ;; + adds I = -1, I + ;; + mov ar.lc = I + mov ar.ec= 3 + and I = 15, MIN_M + (p6) br.cond.dpnt .L18 + ;; + .align 16 + +.L16: + { .mfi + (p16) LDFPD f104, f107 = [BO], 2 * SIZE + (p18) FMA f8 = f106, f34, f8 + nop __LINE__ + } + { .mfi + (p16) LDFPD f32, f35 = [AO1], 2 * SIZE + (p15) FMA f9 = f109, f37, f9 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f38, f41 = [AO2], 2 * SIZE + (p18) FMA f10 = f106, f40, f10 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p15) FMA f11 = f109, f43, f11 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f44, f47 = [AO3], 2 * SIZE + (p18) FMA f12 = f106, f46, f12 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p15) FMA f13 = f109, f49, f13 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f50, f53 = [AO4], 2 * SIZE + (p18) FMA f14 = f106, f52, f14 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p15) FMA f15 = f109, f55, f15 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f56, f59 = [AO5], 2 * SIZE + (p18) FMA f16 = f106, f58, f16 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p15) FMA f17 = f109, f61, f17 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f62, f65 = [AO6], 2 * SIZE + (p18) FMA f18 = f106, f64, f18 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p15) FMA f19 = f109, f67, f19 + (p17) adds I = -2, I + } + ;; + { .mfi + (p16) LDFPD f68, f71 = [AO7], 2 * SIZE + (p18) FMA f20 = f106, f70, f20 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p15) FMA f21 = f109, f73, f21 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f74, f77 = [AO8], 2 * SIZE + (p15) FMA f23 = f109, f79, f23 + (p17) cmp.ne.unc p15, p0 = -1, I + } + { .mfb + nop __LINE__ + (p18) FMA f22 = f106, f76, f22 + br.ctop.sptk.few .L16 + } + ;; + +.L18: + { .mmf + mov AO1 = CO + LDFD f32 = [CO], INCY + FADD f8 = f8, f9 + } + ;; + { .mmf + LDFD f33 = [CO], INCY + nop __LINE__ + FADD f10 = f10, f11 + } + ;; + { .mmf + LDFD f34 = [CO], INCY + nop __LINE__ + FADD f12 = f12, f13 + } + ;; + { .mmf + LDFD f35 = [CO], INCY + nop __LINE__ + FADD f14 = f14, f15 + } + ;; + { .mmf + LDFD f36 = [CO], INCY + nop __LINE__ + FADD f16 = f16, f17 + } + ;; + { .mmf + LDFD f37 = [CO], INCY + nop __LINE__ + FADD f18 = f18, f19 + } + ;; + { .mmf + LDFD f38 = [CO], INCY + nop __LINE__ + FADD f20 = f20, f21 + } + ;; + { .mmf + LDFD f39 = [CO], INCY + nop __LINE__ + FADD f22 = f22, f23 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f32 = ALPHA, f8, f32 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f33 = ALPHA, f10, f33 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f34 = ALPHA, f12, f34 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f35 = ALPHA, f14, f35 + } + ;; + { .mmf + STFD [AO1] = f32 + add AO1 = AO1, INCY + FMA f36 = ALPHA, f16, f36 + } + ;; + { .mmf + STFD [AO1] = f33 + add AO1 = AO1, INCY + FMA f37 = ALPHA, f18, f37 + } + ;; + { .mmf + STFD [AO1] = f34 + add AO1 = AO1, INCY + FMA f38 = ALPHA, f20, f38 + } + ;; + { .mmf + STFD [AO1] = f35 + add AO1 = AO1, INCY + FMA f39 = ALPHA, f22, f39 + } + ;; + { .mmi + STFD [AO1] = f36 + add AO1 = AO1, INCY + adds J = -1, J + } + ;; + { .mmi + STFD [AO1] = f37 + add AO1 = AO1, INCY + nop __LINE__ + } + ;; + { .mmi + STFD [AO1] = f38 + add AO1 = AO1, INCY + cmp4.lt p6, p0 = 0, J + } + ;; + { .mib + STFD [AO1] = f39 + add AO1 = AO1, INCY + (p6) br.cond.dptk .L11 + } + ;; + .align 16 + +.L20: + { .mfi + mov AO1 = A + mov f8 = f0 + mov pr.rot= 0 + } + { .mfi + add AO2 = LDA, A + mov f10 = f0 + tbit.z p6, p0 = N, 2 + } + ;; + { .mfi + shladd AO3 = LDA, 1, A + mov f12 = f0 + shr I = MIN_M, 4 + } + { .mfb + shladd AO4 = LDA, 1, AO2 + mov f14 = f0 + (p6) br.cond.dpnt .L30 + } + ;; + { .mmf + (p8) LDFD f32 = [AO1], SIZE + (p8) LDFD f33 = [AO2], SIZE + mov f9 = f0 + } + { .mmf + mov BO = BUFFER + shladd A = LDA, 2, A + mov f11 = f0 + } + ;; + { .mmf + (p8) LDFD f40 = [BO], 2 * SIZE + cmp.eq p6, p0 = 0, I + mov f13 = f0 + } + { .mmf + (p8) LDFD f34 = [AO3], SIZE + (p8) LDFD f35 = [AO4], SIZE + mov f15 = f0 + } + ;; + { .mmi + adds RPRE1 = RPREFETCH * SIZE, AO1 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + mov ar.ec= 2 + } + { .mmi + cmp.eq p16, p0 = r0, r0 + add I = I, I + } + ;; + { .mmf + adds WPRE = 4 * SIZE, CO + adds PREB = RPREFETCH * SIZE, BO + (p8) FMPY f8 = f40, f32 + } + { .mmf + adds RPRE3 = RPREFETCH * SIZE, AO3 + adds I = -1, I + (p8) FMPY f10 = f40, f33 + } + ;; + { .mfi + lfetch.excl.nt1 [WPRE] + (p8) FMPY f12 = f40, f34 + mov ar.lc = I + } + { .mfb + adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 + (p8) FMPY f14 = f40, f35 + (p6) br.cond.dpnt .L25 + } + ;; + .align 16 + +.L22: + { .mmf + (p17) LDFPD f87, f88 = [AO4], 2 * SIZE + (p17) LDFPD f110, f111 = [BO], 2 * SIZE + (p17) FMA f8 = f104, f33, f8 + } + { .mfi + nop __LINE__ + (p17) FMA f9 = f105, f34, f9 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + ;; + { .mmf + (p14) PREFETCH [RPRE1], 16 * SIZE + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f10 = f104, f35, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f105, f36, f11 + } + ;; + { .mmf + (p15) PREFETCH [RPRE2], 16 * SIZE + (p16) LDFPD f34, f35 = [AO2], 2 * SIZE + (p17) FMA f12 = f104, f37, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f13 = f105, f38, f13 + } + ;; + { .mmf + (p14) PREFETCH [RPRE3], 16 * SIZE + (p16) LDFPD f36, f37 = [AO3], 2 * SIZE + (p17) FMA f14 = f104, f39, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f15 = f105, f40, f15 + } + ;; + { .mmf + (p15) PREFETCH [RPRE4], 16 * SIZE + (p16) LDFPD f38, f39 = [AO4], 2 * SIZE + (p17) FMA f8 = f106, f49, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f107, f50, f9 + } + ;; + { .mmf + (p14) PREFETCH [PREB], 16 * SIZE + (p16) LDFPD f48, f49 = [AO1], 2 * SIZE + (p17) FMA f10 = f106, f51, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f107, f52, f11 + } + ;; + { .mmf + (p16) LDFPD f50, f51 = [AO2], 2 * SIZE + (p16) LDFPD f103, f104 = [BO], 2 * SIZE + (p17) FMA f12 = f106, f53, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f13 = f107, f54, f13 + } + ;; + { .mmf + (p16) LDFPD f52, f53 = [AO3], 2 * SIZE + nop __LINE__ + (p17) FMA f14 = f106, f55, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f15 = f107, f56, f15 + } + ;; + { .mmf + (p16) LDFPD f54, f55 = [AO4], 2 * SIZE + (p16) LDFPD f105, f106 = [BO], 2 * SIZE + (p17) FMA f8 = f108, f65, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f109, f66, f9 + } + ;; + { .mmf + (p16) LDFPD f64, f65 = [AO1], 2 * SIZE + nop __LINE__ + (p17) FMA f10 = f108, f67, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f109, f68, f11 + } + ;; + { .mmf + (p16) LDFPD f66, f67 = [AO2], 2 * SIZE + nop __LINE__ + (p17) FMA f12 = f108, f69, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f13 = f109, f70, f13 + } + ;; + { .mmf + (p16) LDFPD f68, f69 = [AO3], 2 * SIZE + nop __LINE__ + (p17) FMA f14 = f108, f71, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f15 = f109, f72, f15 + } + ;; + { .mmf + (p16) LDFPD f70, f71 = [AO4], 2 * SIZE + (p16) LDFPD f107, f108 = [BO], 2 * SIZE + (p17) FMA f8 = f110, f81, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f111, f82, f9 + } + ;; + { .mmf + (p16) LDFPD f80, f81 = [AO1], 2 * SIZE + nop __LINE__ + (p17) FMA f10 = f110, f83, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f111, f84, f11 + } + ;; + { .mmf + (p16) LDFPD f82, f83 = [AO2], 2 * SIZE + nop __LINE__ + (p17) FMA f12 = f110, f85, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f13 = f111, f86, f13 + } + ;; + { .mmf + (p16) LDFPD f84, f85 = [AO3], 2 * SIZE + nop __LINE__ + (p17) FMA f14 = f110, f87, f14 + } + { .mfb + adds I = -1, I + (p17) FMA f15 = f111, f88, f15 + br.ctop.sptk.few .L22 + } + ;; + .align 16 + +.L25: + and I = 15, MIN_M + mov pr.rot= 0 + ;; + cmp.eq p6, p0 = 0, I + cmp.eq p16, p15 = r0, r0 + ;; + adds I = 1, I + ;; + shr I = I, 1 + ;; + adds I = -1, I + ;; + mov ar.lc = I + mov ar.ec= 3 + and I = 15, MIN_M + (p6) br.cond.dpnt .L28 + ;; + .align 16 + +.L26: + { .mmf + (p16) LDFPD f104, f107 = [BO], 2 * SIZE + (p16) LDFPD f32, f35 = [AO1], 2 * SIZE + (p18) FMA f8 = f106, f34, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p15) FMA f9 = f109, f37, f9 + } + ;; + { .mmf + (p16) LDFPD f38, f41 = [AO2], 2 * SIZE + nop __LINE__ + (p18) FMA f10 = f106, f40, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p15) FMA f11 = f109, f43, f11 + } + ;; + { .mmf + (p16) LDFPD f44, f47 = [AO3], 2 * SIZE + nop __LINE__ + (p18) FMA f12 = f106, f46, f12 + } + { .mmf + nop __LINE__ + (p17) adds I = -2, I + (p15) FMA f13 = f109, f49, f13 + } + ;; + { .mmf + (p16) LDFPD f50, f53 = [AO4], 2 * SIZE + nop __LINE__ + (p15) FMA f15 = f109, f55, f15 + } + { .mfb + (p17) cmp.ne.unc p15, p0 = -1, I + (p18) FMA f14 = f106, f52, f14 + br.ctop.sptk.few .L26 + } + ;; + +.L28: + { .mmf + mov AO1 = CO + LDFD f32 = [CO], INCY + FADD f8 = f8, f9 + } + ;; + { .mmf + LDFD f33 = [CO], INCY + nop __LINE__ + FADD f10 = f10, f11 + } + ;; + { .mmf + LDFD f34 = [CO], INCY + nop __LINE__ + FADD f12 = f12, f13 + } + ;; + { .mmf + LDFD f35 = [CO], INCY + nop __LINE__ + FADD f14 = f14, f15 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f32 = ALPHA, f8, f32 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f33 = ALPHA, f10, f33 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f34 = ALPHA, f12, f34 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f35 = ALPHA, f14, f35 + } + ;; + { .mmf + STFD [AO1] = f32 + add AO1 = AO1, INCY + } + ;; + { .mmf + STFD [AO1] = f33 + add AO1 = AO1, INCY + } + ;; + { .mmf + STFD [AO1] = f34 + add AO1 = AO1, INCY + } + ;; + { .mmf + STFD [AO1] = f35 + add AO1 = AO1, INCY + } + ;; + .align 16 + +.L30: + { .mfi + mov AO1 = A + mov f8 = f0 + mov pr.rot= 0 + } + { .mfi + add AO2 = LDA, A + mov f10 = f0 + tbit.z p6, p0 = N, 1 + } + ;; + { .mfi + mov BO = BUFFER + mov f12 = f0 + shr I = MIN_M, 4 + } + { .mfb + adds WPRE = 4 * SIZE, CO + mov f14 = f0 + (p6) br.cond.dpnt .L40 + } + ;; + { .mmf + (p8) LDFD f32 = [AO1], SIZE + (p8) LDFD f33 = [AO2], SIZE + mov f9 = f0 + } + { .mfi + shladd A = LDA, 1, A + mov f11 = f0 + mov ar.ec= 2 + } + ;; + { .mmf + (p8) LDFD f40 = [BO], 2 * SIZE + cmp.eq p6, p0 = 0, I + mov f13 = f0 + } + { .mmf + adds RPRE1 = RPREFETCH * SIZE, AO1 + add I = I, I + mov f15 = f0 + } + ;; + { .mmi + cmp.eq p16, p0 = r0, r0 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + adds I = -1, I + } + ;; + { .mfi + lfetch.excl.nt1 [WPRE] + (p8) FMPY f8 = f40, f32 + mov ar.lc = I + } + { .mfb + adds PREB = RPREFETCH * SIZE, BO + (p8) FMPY f10 = f40, f33 + (p6) br.cond.dpnt .L35 + } + ;; + .align 16 + +.L32: + { .mmf + (p17) LDFPD f83, f84 = [AO2], 2 * SIZE + (p17) LDFPD f110, f111 = [BO], 2 * SIZE + (p17) FMA f8 = f104, f33, f8 + } + { .mfi + nop __LINE__ + (p17) FMA f9 = f105, f34, f9 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + ;; + { .mmf + (p14) PREFETCH [RPRE1], 16 * SIZE + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f10 = f104, f35, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f105, f36, f11 + } + ;; + { .mmf + (p15) PREFETCH [RPRE2], 16 * SIZE + (p16) LDFPD f34, f35 = [AO2], 2 * SIZE + (p17) FMA f8 = f106, f49, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f107, f50, f9 + } + ;; + { .mmf + (p14) PREFETCH [PREB], 16 * SIZE + (p16) LDFPD f48, f49 = [AO1], 2 * SIZE + (p17) FMA f10 = f106, f51, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f107, f52, f11 + } + ;; + { .mmf + (p16) LDFPD f50, f51 = [AO2], 2 * SIZE + (p16) LDFPD f103, f104 = [BO], 2 * SIZE + (p17) FMA f8 = f108, f65, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f109, f66, f9 + } + ;; + { .mmf + (p16) LDFPD f105, f106 = [BO], 2 * SIZE + (p16) LDFPD f64, f65 = [AO1], 2 * SIZE + (p17) FMA f10 = f108, f67, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f109, f68, f11 + } + ;; + { .mmf + (p16) LDFPD f66, f67 = [AO2], 2 * SIZE + (p16) LDFPD f107, f108 = [BO], 2 * SIZE + (p17) FMA f8 = f110, f81, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f111, f82, f9 + } + ;; + { .mmf + (p16) LDFPD f80, f81 = [AO1], 2 * SIZE + nop __LINE__ + (p17) FMA f10 = f110, f83, f10 + } + { .mfb + adds I = -1, I + (p17) FMA f11 = f111, f84, f11 + br.ctop.sptk.few .L32 + } + ;; + .align 16 + +.L35: + and I = 15, MIN_M + ;; + cmp.eq p6, p0 = 0, I + (p6) br.cond.dpnt .L38 + ;; + tbit.nz p12, p0 = MIN_M, 3 + tbit.nz p13, p0 = MIN_M, 2 + tbit.nz p14, p0 = MIN_M, 1 + tbit.nz p15, p0 = MIN_M, 0 + ;; + (p12) LDFPD f32, f33 = [AO1], 2 * SIZE + (p12) LDFPD f34, f35 = [AO2], 2 * SIZE + (p12) LDFPD f100, f101 = [BO], 2 * SIZE + ;; + (p12) LDFPD f36, f37 = [AO1], 2 * SIZE + (p12) LDFPD f38, f39 = [AO2], 2 * SIZE + (p12) LDFPD f102, f103 = [BO], 2 * SIZE + ;; + (p12) LDFPD f40, f41 = [AO1], 2 * SIZE + (p12) LDFPD f42, f43 = [AO2], 2 * SIZE + (p12) LDFPD f104, f105 = [BO], 2 * SIZE + ;; + (p12) LDFPD f44, f45 = [AO1], 2 * SIZE + (p12) LDFPD f46, f47 = [AO2], 2 * SIZE + (p12) LDFPD f106, f107 = [BO], 2 * SIZE + ;; + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f50, f51 = [AO2], 2 * SIZE + (p13) LDFPD f108, f109 = [BO], 2 * SIZE + ;; + (p13) LDFPD f52, f53 = [AO1], 2 * SIZE + (p13) LDFPD f54, f55 = [AO2], 2 * SIZE + (p13) LDFPD f110, f111 = [BO], 2 * SIZE + ;; + (p14) LDFPD f56, f57 = [AO1], 2 * SIZE + (p14) LDFPD f58, f59 = [AO2], 2 * SIZE + (p14) LDFPD f112, f113 = [BO], 2 * SIZE + ;; + (p15) LDFD f60 = [AO1] + (p15) LDFD f61 = [AO2] + (p15) LDFD f114 = [BO] + ;; + (p12) FMA f8 = f100, f32, f8 + (p12) FMA f9 = f101, f33, f9 + (p12) FMA f10 = f100, f34, f10 + (p12) FMA f11 = f101, f35, f11 + ;; + (p12) FMA f12 = f102, f36, f12 + (p12) FMA f13 = f103, f37, f13 + (p12) FMA f14 = f102, f38, f14 + (p12) FMA f15 = f103, f39, f15 + ;; + (p12) FMA f8 = f104, f40, f8 + (p12) FMA f9 = f105, f41, f9 + (p12) FMA f10 = f104, f42, f10 + (p12) FMA f11 = f105, f43, f11 + ;; + (p12) FMA f12 = f106, f44, f12 + (p12) FMA f13 = f107, f45, f13 + (p12) FMA f14 = f106, f46, f14 + (p12) FMA f15 = f107, f47, f15 + ;; + (p13) FMA f8 = f108, f48, f8 + (p13) FMA f9 = f109, f49, f9 + (p13) FMA f10 = f108, f50, f10 + (p13) FMA f11 = f109, f51, f11 + ;; + (p13) FMA f12 = f110, f52, f12 + (p13) FMA f13 = f111, f53, f13 + (p13) FMA f14 = f110, f54, f14 + (p13) FMA f15 = f111, f55, f15 + ;; + (p14) FMA f8 = f112, f56, f8 + (p14) FMA f9 = f113, f57, f9 + (p14) FMA f10 = f112, f58, f10 + (p14) FMA f11 = f113, f59, f11 + ;; + (p15) FMA f12 = f114, f60, f12 + (p15) FMA f14 = f114, f61, f14 + ;; +.L38: + FADD f8 = f8, f9 + FADD f10 = f10, f11 + FADD f12 = f12, f13 + FADD f14 = f14, f15 + ;; + FADD f8 = f8, f12 + FADD f10 = f10, f14 + ;; + { .mmf + mov AO1 = CO + LDFD f32 = [CO], INCY + } + ;; + { .mmf + LDFD f33 = [CO], INCY + nop __LINE__ + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f32 = ALPHA, f8, f32 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f33 = ALPHA, f10, f33 + } + ;; + { .mmf + STFD [AO1] = f32 + add AO1 = AO1, INCY + } + ;; + { .mmf + STFD [AO1] = f33 + } + ;; + .align 16 + +.L40: + { .mfi + mov AO1 = A + mov f8 = f0 + shr I = MIN_M, 4 + } + { .mfi + mov BO = BUFFER + mov f10 = f0 + tbit.z p7, p0 = N, 0 + } + ;; + { .mfi + cmp.eq p6, p0 = 0, I + mov f12 = f0 + mov pr.rot= 0 + } + { .mfb + add I = I, I + mov f14 = f0 + (p7) br.cond.dpnt .L99 + } + ;; + { .mfi + (p8) LDFD f32 = [AO1], SIZE + mov f9 = f0 + mov ar.ec= 2 + } + { .mmf + (p8) LDFD f40 = [BO], 2 * SIZE + add A = A, LDA + mov f11 = f0 + } + ;; + { .mmf + adds WPRE = 1 * SIZE, CO + adds PREB = RPREFETCH * SIZE, BO + mov f13 = f0 + } + { .mmf + cmp.eq p16, p0 = r0, r0 + adds I = -1, I + mov f15 = f0 + } + ;; + { .mfi + lfetch.excl.nt1 [WPRE] + (p8) FMPY f8 = f40, f32 + mov ar.lc = I + } + { .mmb + nop __LINE__ + nop __LINE__ + (p6) br.cond.dpnt .L45 + } + ;; + .align 16 + +.L42: + { .mmf + (p17) LDFPD f81, f82 = [AO1], 2 * SIZE + (p17) LDFPD f110, f111 = [BO], 2 * SIZE + (p17) FMA f8 = f104, f33, f8 + } + { .mfi + nop __LINE__ + (p17) FMA f9 = f105, f34, f9 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + ;; + { .mmf + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p16) LDFPD f103, f104 = [BO], 2 * SIZE + (p17) FMA f8 = f106, f49, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f107, f50, f9 + } + ;; + { .mmf + (p16) LDFPD f105, f106 = [BO], 2 * SIZE + (p16) LDFPD f48, f49 = [AO1], 2 * SIZE + (p17) FMA f8 = f108, f65, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f109, f66, f9 + } + ;; + { .mmf + (p16) LDFPD f64, f65 = [AO1], 2 * SIZE + (p16) LDFPD f107, f108 = [BO], 2 * SIZE + (p17) FMA f8 = f110, f81, f8 + } + { .mfb + adds I = -1, I + (p17) FMA f9 = f111, f82, f9 + br.ctop.sptk.few .L42 + } + ;; + .align 16 + +.L45: + and I = 15, MIN_M + ;; + cmp.eq p6, p0 = 0, I + (p6) br.cond.dpnt .L48 + ;; + tbit.nz p12, p0 = MIN_M, 3 + tbit.nz p13, p0 = MIN_M, 2 + tbit.nz p14, p0 = MIN_M, 1 + tbit.nz p15, p0 = MIN_M, 0 + ;; + (p12) LDFPD f32, f33 = [AO1], 2 * SIZE + (p12) LDFPD f100, f101 = [BO], 2 * SIZE + ;; + (p12) LDFPD f36, f37 = [AO1], 2 * SIZE + (p12) LDFPD f102, f103 = [BO], 2 * SIZE + ;; + (p12) LDFPD f40, f41 = [AO1], 2 * SIZE + (p12) LDFPD f104, f105 = [BO], 2 * SIZE + ;; + (p12) LDFPD f44, f45 = [AO1], 2 * SIZE + (p12) LDFPD f106, f107 = [BO], 2 * SIZE + ;; + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f108, f109 = [BO], 2 * SIZE + ;; + (p13) LDFPD f52, f53 = [AO1], 2 * SIZE + (p13) LDFPD f110, f111 = [BO], 2 * SIZE + ;; + (p14) LDFPD f56, f57 = [AO1], 2 * SIZE + (p14) LDFPD f112, f113 = [BO], 2 * SIZE + ;; + (p15) LDFD f60 = [AO1] + (p15) LDFD f114 = [BO] + ;; + (p12) FMA f8 = f100, f32, f8 + (p12) FMA f9 = f101, f33, f9 + (p12) FMA f10 = f102, f36, f10 + (p12) FMA f11 = f103, f37, f11 + (p12) FMA f12 = f104, f40, f12 + (p12) FMA f13 = f105, f41, f13 + (p12) FMA f14 = f106, f44, f14 + (p12) FMA f15 = f107, f45, f15 + ;; + (p13) FMA f8 = f108, f48, f8 + (p13) FMA f9 = f109, f49, f9 + (p13) FMA f10 = f110, f52, f10 + (p13) FMA f11 = f111, f53, f11 + (p14) FMA f12 = f112, f56, f12 + (p14) FMA f13 = f113, f57, f13 + (p15) FMA f14 = f114, f60, f14 + ;; +.L48: + { .mmf + LDFD f32 = [CO] + nop __LINE__ + FADD f8 = f8, f9 + } + { .mmf + nop __LINE__ + nop __LINE__ + FADD f10 = f10, f11 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FADD f12 = f12, f13 + } + { .mmf + nop __LINE__ + nop __LINE__ + FADD f14 = f14, f15 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FADD f8 = f8, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FADD f10 = f10, f14 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FADD f8 = f8, f10 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f32 = ALPHA, f8, f32 + } + ;; + { .mmf + STFD [CO] = f32 + } + ;; + .align 16 + +.L99: + adds IS = P, IS + shladd A = LDAP, BASE_SHIFT, A + ;; + cmp.gt p6, p0 = M, IS + (p6) br.cond.dptk .LIs_loop + br .L999 + .align 4 + ;; + +.L100: + shr J = N, 3 + mov CO = Y + ;; + cmp.eq p6, p0 = r0, J + (p6) br.cond.dpnt .L120 + ;; + .align 16 + +.L111: + { .mfi + mov AO1 = A + mov f8 = f0 + mov pr.rot= 0 + } + { .mfi + add AO2 = LDA, A + mov f10 = f0 + shr I = MIN_M, 4 + } + ;; + { .mmf + shladd AO3 = LDA, 1, A + shladd AO4 = LDA, 1, AO2 + mov f12 = f0 + } + { .mmf + (p8) LDFD f32 = [AO1], SIZE + (p8) LDFD f33 = [AO2], SIZE + mov f14 = f0 + } + ;; + { .mmf + shladd AO5 = LDA, 1, AO3 + shladd AO6 = LDA, 1, AO4 + mov f16 = f0 + } + { .mmf + (p8) LDFD f34 = [AO3], SIZE + (p8) LDFD f35 = [AO4], SIZE + mov f18 = f0 + } + ;; + { .mmf + shladd AO7 = LDA, 1, AO5 + shladd AO8 = LDA, 1, AO6 + mov f20 = f0 + } + { .mmf + (p8) LDFD f36 = [AO5], SIZE + (p8) LDFD f37 = [AO6], SIZE + mov f22 = f0 + } + ;; + { .mfi + (p8) LDFD f38 = [AO7], SIZE + mov f9 = f0 + mov ar.ec= 2 + } + { .mmf + (p8) LDFD f39 = [AO8], SIZE + mov BO = BUFFER + mov f11 = f0 + } + ;; + { .mmf + (p8) LDFD f40 = [BO], 2 * SIZE + cmp.eq p6, p0 = 0, I + mov f13 = f0 + } + { .mmf + shladd A = LDA, 3, A + cmp.eq p16, p0 = r0, r0 + mov f15 = f0 + } + ;; + { .mmf + add I = I, I + nop __LINE__ + mov f17 = f0 + } + { .mmf + adds RPRE1 = RPREFETCH * SIZE, AO1 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + mov f19 = f0 + } + ;; + { .mmf + adds I = -1, I + nop __LINE__ + mov f21 = f0 + } + { .mmf + adds RPRE3 = RPREFETCH * SIZE, AO3 + adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 + mov f23 = f0 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p8) FMPY f8 = f40, f32 + } + { .mmf + adds RPRE5 = RPREFETCH * SIZE, AO5 + adds RPRE6 = (RPREFETCH + 8) * SIZE, AO6 + (p8) FMPY f10 = f40, f33 + } + ;; + { .mmf + adds AO21 = 7 * SIZE, AO2 + adds AO41 = 7 * SIZE, AO4 + (p8) FMPY f12 = f40, f34 + } + { .mmf + adds RPRE7 = RPREFETCH * SIZE, AO7 + adds RPRE8 = (RPREFETCH + 8) * SIZE, AO8 + (p8) FMPY f14 = f40, f35 + } + ;; + { .mfi + nop __LINE__ + (p8) FMPY f16 = f40, f36 + mov ar.lc = I + } + { .mmf + adds WPRE = 8 * SIZE, CO + adds PREB = RPREFETCH * SIZE, BO + (p8) FMPY f18 = f40, f37 + } + ;; + { .mmf + lfetch.excl.nt1 [WPRE] + adds AO61 = 7 * SIZE, AO6 + (p8) FMPY f20 = f40, f38 + } + { .mfb + adds AO81 = 7 * SIZE, AO8 + (p8) FMPY f22 = f40, f39 + (p6) br.cond.dpnt .L115 + } + ;; + .align 16 + +.L112: + { .mmf + (p17) LDFPD f80, f95 = [AO8] + (p17) LDFPD f110, f111 = [BO], 2 * SIZE + (p17) FMA f8 = f104, f33, f8 + } + { .mfi + (p17) adds AO8 = 3 * SIZE, AO8 + (p17) FMA f9 = f105, f34, f9 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + ;; + { .mmf + (p14) PREFETCH [RPRE1], 16 * SIZE + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f10 = f104, f35, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f105, f36, f11 + } + ;; + { .mmf + (p15) PREFETCH [RPRE2], 16 * SIZE + (p16) LDFD f34 = [AO2], 1 * SIZE + (p17) FMA f12 = f104, f37, f12 + } + { .mmf + (p17) LDFD f84 = [AO21], 8 * SIZE + nop __LINE__ + (p17) FMA f13 = f105, f38, f13 + } + ;; + { .mmf + (p14) PREFETCH [RPRE3], 16 * SIZE + (p16) LDFPD f36, f37 = [AO3], 2 * SIZE + (p17) FMA f14 = f104, f39, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f15 = f105, f40, f15 + } + ;; + { .mmf + (p15) PREFETCH [RPRE4], 16 * SIZE + (p16) LDFD f38 = [AO4], 1 * SIZE + (p17) FMA f16 = f104, f41, f16 + } + { .mmf + (p17) LDFD f88 = [AO41], 8 * SIZE + nop __LINE__ + (p17) FMA f17 = f105, f42, f17 + } + ;; + { .mmf + (p14) PREFETCH [RPRE5], 16 * SIZE + (p16) LDFPD f40, f41 = [AO5], 2 * SIZE + (p17) FMA f18 = f104, f43, f18 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f19 = f105, f44, f19 + } + ;; + { .mmf + (p15) PREFETCH [RPRE6], 16 * SIZE + (p16) LDFD f42 = [AO6], 1 * SIZE + (p17) FMA f20 = f104, f45, f20 + } + { .mmf + (p17) LDFD f92 = [AO61], 8 * SIZE + nop __LINE__ + (p17) FMA f21 = f105, f46, f21 + } + ;; + { .mmf + (p14) PREFETCH [RPRE7], 16 * SIZE + (p16) LDFPD f44, f45 = [AO7], 2 * SIZE + (p17) FMA f22 = f104, f47, f22 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f23 = f105, f48, f23 + } + ;; + { .mmf + (p15) PREFETCH [RPRE8], 16 * SIZE + (p16) LDFD f46 = [AO8], 1 * SIZE + (p17) FMA f8 = f106, f49, f8 + } + { .mmf + (p17) LDFD f96 = [AO81], 8 * SIZE + nop __LINE__ + (p17) FMA f9 = f107, f50, f9 + } + ;; + { .mmf + (p14) PREFETCH [PREB], 16 * SIZE + (p16) LDFPD f48, f49 = [AO1], 2 * SIZE + (p17) FMA f10 = f106, f51, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f107, f52, f11 + } + ;; + { .mmf + (p16) LDFPD f35, f50 = [AO2], 2 * SIZE + (p16) LDFPD f103, f104 = [BO], 2 * SIZE + (p17) FMA f12 = f106, f53, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f13 = f107, f54, f13 + } + ;; + { .mmf + (p16) LDFPD f52, f53 = [AO3], 2 * SIZE + nop __LINE__ + (p17) FMA f14 = f106, f55, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f15 = f107, f56, f15 + } + ;; + { .mmf + (p16) LDFPD f39, f54 = [AO4], 2 * SIZE + nop __LINE__ + (p17) FMA f16 = f106, f57, f16 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f17 = f107, f58, f17 + } + ;; + { .mmf + (p16) LDFPD f56, f57 = [AO5], 2 * SIZE + nop __LINE__ + (p17) FMA f18 = f106, f59, f18 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f19 = f107, f60, f19 + } + ;; + { .mmf + (p16) LDFPD f43, f58 = [AO6], 2 * SIZE + nop __LINE__ + (p17) FMA f20 = f106, f61, f20 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f21 = f107, f62, f21 + } + ;; + { .mmf + (p16) LDFPD f60, f61 = [AO7], 2 * SIZE + nop __LINE__ + (p17) FMA f22 = f106, f63, f22 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f23 = f107, f64, f23 + } + ;; + { .mmf + (p16) LDFPD f47, f62 = [AO8], 2 * SIZE + (p16) LDFPD f105, f106 = [BO], 2 * SIZE + (p17) FMA f8 = f108, f65, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f109, f66, f9 + } + ;; + { .mmf + (p16) LDFPD f64, f65 = [AO1], 2 * SIZE + nop __LINE__ + (p17) FMA f10 = f108, f67, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f109, f68, f11 + } + ;; + { .mmf + (p16) LDFPD f51, f66 = [AO2], 2 * SIZE + nop __LINE__ + (p17) FMA f12 = f108, f69, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f13 = f109, f70, f13 + } + ;; + { .mmf + (p16) LDFPD f68, f69 = [AO3], 2 * SIZE + nop __LINE__ + (p17) FMA f14 = f108, f71, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f15 = f109, f72, f15 + } + ;; + { .mmf + (p16) LDFPD f55, f70 = [AO4], 2 * SIZE + nop __LINE__ + (p17) FMA f16 = f108, f73, f16 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f17 = f109, f74, f17 + } + ;; + { .mmf + (p16) LDFPD f72, f73 = [AO5], 2 * SIZE + nop __LINE__ + (p17) FMA f18 = f108, f75, f18 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f19 = f109, f76, f19 + } + ;; + { .mmf + (p16) LDFPD f59, f74 = [AO6], 2 * SIZE + nop __LINE__ + (p17) FMA f20 = f108, f77, f20 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f21 = f109, f78, f21 + } + ;; + { .mmf + (p16) LDFPD f76, f77 = [AO7], 2 * SIZE + nop __LINE__ + (p17) FMA f22 = f108, f79, f22 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f23 = f109, f80, f23 + } + ;; + { .mmf + (p16) LDFPD f63, f78 = [AO8], 2 * SIZE + (p16) LDFPD f107, f108 = [BO], 2 * SIZE + (p17) FMA f8 = f110, f81, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f111, f82, f9 + } + ;; + { .mmf + (p16) LDFPD f80, f81 = [AO1], 2 * SIZE + nop __LINE__ + (p17) FMA f10 = f110, f83, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f111, f84, f11 + } + ;; + { .mmf + (p16) LDFPD f67, f82 = [AO2] + nop __LINE__ + (p17) FMA f12 = f110, f85, f12 + } + { .mmf + nop __LINE__ + (p16) adds AO2 = 3 * SIZE, AO2 + (p17) FMA f13 = f111, f86, f13 + } + ;; + { .mmf + (p16) LDFPD f84, f85 = [AO3], 2 * SIZE + nop __LINE__ + (p17) FMA f14 = f110, f87, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f15 = f111, f88, f15 + } + ;; + { .mmf + (p16) LDFPD f71, f86 = [AO4] + nop __LINE__ + (p17) FMA f16 = f110, f89, f16 + } + { .mmf + nop __LINE__ + (p16) adds AO4 = 3 * SIZE, AO4 + (p17) FMA f17 = f111, f90, f17 + } + ;; + { .mmf + (p16) LDFPD f88, f89 = [AO5], 2 * SIZE + nop __LINE__ + (p17) FMA f18 = f110, f91, f18 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f19 = f111, f92, f19 + } + ;; + { .mmf + (p16) LDFPD f75, f90 = [AO6] + nop __LINE__ + (p17) FMA f20 = f110, f93, f20 + } + { .mmf + nop __LINE__ + (p16) adds AO6 = 3 * SIZE, AO6 + (p17) FMA f21 = f111, f94, f21 + } + ;; + { .mmf + (p16) LDFPD f92, f93 = [AO7], 2 * SIZE + nop __LINE__ + (p17) FMA f22 = f110, f95, f22 + } + { .mfb + adds I = -1, I + (p17) FMA f23 = f111, f96, f23 + br.ctop.sptk.few .L112 + } + ;; + .align 16 + +.L115: + and I = 15, MIN_M + mov pr.rot= 0 + ;; + cmp.eq p6, p0 = 0, I + cmp.eq p16, p15 = r0, r0 + ;; + adds I = 1, I + ;; + shr I = I, 1 + ;; + adds I = -1, I + adds AO21 = 1 * SIZE, AO2 + adds AO41 = 1 * SIZE, AO4 + adds AO61 = 1 * SIZE, AO6 + adds AO81 = 1 * SIZE, AO8 + ;; + mov ar.lc = I + mov ar.ec= 3 + and I = 15, MIN_M + (p6) br.cond.dpnt .L118 + ;; + .align 16 + +.L116: + { .mmf + (p16) LDFPD f104, f107 = [BO], 2 * SIZE + (p16) LDFPD f32, f35 = [AO1], 2 * SIZE + (p18) FMA f8 = f106, f34, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p15) FMA f9 = f109, f37, f9 + } + ;; + { .mmf + (p16) LDFD f38 = [AO2], 2 * SIZE + (p17) LDFD f42 = [AO21], 2 * SIZE + (p18) FMA f10 = f106, f40, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p15) FMA f11 = f109, f43, f11 + } + ;; + { .mmf + (p16) LDFPD f44, f47 = [AO3], 2 * SIZE + nop __LINE__ + (p18) FMA f12 = f106, f46, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p15) FMA f13 = f109, f49, f13 + } + ;; + { .mmf + (p16) LDFD f50 = [AO4], 2 * SIZE + (p17) LDFD f54 = [AO41], 2 * SIZE + (p18) FMA f14 = f106, f52, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p15) FMA f15 = f109, f55, f15 + } + ;; + { .mmf + (p16) LDFPD f56, f59 = [AO5], 2 * SIZE + nop __LINE__ + (p18) FMA f16 = f106, f58, f16 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p15) FMA f17 = f109, f61, f17 + } + ;; + { .mmf + (p16) LDFD f62 = [AO6], 2 * SIZE + (p17) LDFD f66 = [AO61], 2 * SIZE + (p18) FMA f18 = f106, f64, f18 + } + { .mmf + nop __LINE__ + (p17) adds I = -2, I + (p15) FMA f19 = f109, f67, f19 + } + ;; + { .mmf + (p16) LDFPD f68, f71 = [AO7], 2 * SIZE + nop __LINE__ + (p18) FMA f20 = f106, f70, f20 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p15) FMA f21 = f109, f73, f21 + } + ;; + { .mmf + (p16) LDFD f74 = [AO8], 2 * SIZE + (p17) LDFD f78 = [AO81], 2 * SIZE + (p15) FMA f23 = f109, f79, f23 + } + { .mfb + (p17) cmp.ne.unc p15, p0 = -1, I + (p18) FMA f22 = f106, f76, f22 + br.ctop.sptk.few .L116 + } + ;; + +.L118: + { .mmf + mov AO1 = CO + LDFD f32 = [CO], INCY + FADD f8 = f8, f9 + } + ;; + { .mmf + LDFD f33 = [CO], INCY + nop __LINE__ + FADD f10 = f10, f11 + } + ;; + { .mmf + LDFD f34 = [CO], INCY + nop __LINE__ + FADD f12 = f12, f13 + } + ;; + { .mmf + LDFD f35 = [CO], INCY + nop __LINE__ + FADD f14 = f14, f15 + } + ;; + { .mmf + LDFD f36 = [CO], INCY + nop __LINE__ + FADD f16 = f16, f17 + } + ;; + { .mmf + LDFD f37 = [CO], INCY + nop __LINE__ + FADD f18 = f18, f19 + } + ;; + { .mmf + LDFD f38 = [CO], INCY + nop __LINE__ + FADD f20 = f20, f21 + } + ;; + { .mmf + LDFD f39 = [CO], INCY + nop __LINE__ + FADD f22 = f22, f23 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f32 = ALPHA, f8, f32 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f33 = ALPHA, f10, f33 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f34 = ALPHA, f12, f34 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f35 = ALPHA, f14, f35 + } + ;; + { .mmf + STFD [AO1] = f32 + add AO1 = AO1, INCY + FMA f36 = ALPHA, f16, f36 + } + ;; + { .mmf + STFD [AO1] = f33 + add AO1 = AO1, INCY + FMA f37 = ALPHA, f18, f37 + } + ;; + { .mmf + STFD [AO1] = f34 + add AO1 = AO1, INCY + FMA f38 = ALPHA, f20, f38 + } + ;; + { .mmf + STFD [AO1] = f35 + add AO1 = AO1, INCY + FMA f39 = ALPHA, f22, f39 + } + ;; + { .mmi + STFD [AO1] = f36 + add AO1 = AO1, INCY + adds J = -1, J + } + ;; + { .mmi + STFD [AO1] = f37 + add AO1 = AO1, INCY + nop __LINE__ + } + ;; + { .mmi + STFD [AO1] = f38 + add AO1 = AO1, INCY + cmp4.lt p6, p0 = 0, J + } + ;; + { .mib + STFD [AO1] = f39 + add AO1 = AO1, INCY + (p6) br.cond.dptk .L111 + } + ;; + .align 16 + +.L120: + { .mfi + mov AO1 = A + mov f8 = f0 + mov pr.rot= 0 + } + { .mfi + add AO2 = LDA, A + mov f10 = f0 + tbit.z p6, p0 = N, 2 + } + ;; + { .mfi + shladd AO3 = LDA, 1, A + mov f12 = f0 + shr I = MIN_M, 4 + } + { .mfb + shladd AO4 = LDA, 1, AO2 + mov f14 = f0 + (p6) br.cond.dpnt .L130 + } + ;; + { .mmf + (p8) LDFD f32 = [AO1], SIZE + (p8) LDFD f33 = [AO2], SIZE + mov f9 = f0 + } + { .mmf + mov BO = BUFFER + shladd A = LDA, 2, A + mov f11 = f0 + } + ;; + { .mmf + (p8) LDFD f40 = [BO], 2 * SIZE + cmp.eq p6, p0 = 0, I + mov f13 = f0 + } + { .mmf + (p8) LDFD f34 = [AO3], SIZE + (p8) LDFD f35 = [AO4], SIZE + mov f15 = f0 + } + ;; + { .mmi + adds RPRE1 = RPREFETCH * SIZE, AO1 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + mov ar.ec= 2 + } + { .mmi + cmp.eq p16, p0 = r0, r0 + add I = I, I + adds AO21 = 7 * SIZE, AO2 + } + ;; + { .mmf + adds WPRE = 4 * SIZE, CO + adds PREB = RPREFETCH * SIZE, BO + (p8) FMPY f8 = f40, f32 + } + { .mmf + adds RPRE3 = RPREFETCH * SIZE, AO3 + adds I = -1, I + (p8) FMPY f10 = f40, f33 + } + ;; + { .mfi + adds AO41 = 7 * SIZE, AO4 + (p8) FMPY f12 = f40, f34 + mov ar.lc = I + } + { .mfb + adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 + (p8) FMPY f14 = f40, f35 + (p6) br.cond.dpnt .L125 + } + ;; + .align 16 + +.L122: + { .mmf + (p17) LDFPD f72, f87 = [AO4] + (p17) LDFPD f110, f111 = [BO], 2 * SIZE + (p17) FMA f8 = f104, f33, f8 + } + { .mfi + (p17) adds AO4 = 3 * SIZE, AO4 + (p17) FMA f9 = f105, f34, f9 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + ;; + { .mmf + (p14) PREFETCH [RPRE1], 16 * SIZE + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f10 = f104, f35, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f105, f36, f11 + } + ;; + { .mmf + (p15) PREFETCH [RPRE2], 16 * SIZE + (p16) LDFD f34 = [AO2], 1 * SIZE + (p17) FMA f12 = f104, f37, f12 + } + { .mmf + (p17) LDFD f84 = [AO21], 8 * SIZE + nop __LINE__ + (p17) FMA f13 = f105, f38, f13 + } + ;; + { .mmf + (p14) PREFETCH [RPRE3], 16 * SIZE + (p16) LDFPD f36, f37 = [AO3], 2 * SIZE + (p17) FMA f14 = f104, f39, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f15 = f105, f40, f15 + } + ;; + { .mmf + (p15) PREFETCH [RPRE4], 16 * SIZE + (p16) LDFD f38 = [AO4], 1 * SIZE + (p17) FMA f8 = f106, f49, f8 + } + { .mmf + (p17) LDFD f88 = [AO41], 8 * SIZE + nop __LINE__ + (p17) FMA f9 = f107, f50, f9 + } + ;; + { .mmf + (p14) PREFETCH [PREB], 16 * SIZE + (p16) LDFPD f48, f49 = [AO1], 2 * SIZE + (p17) FMA f10 = f106, f51, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f107, f52, f11 + } + ;; + { .mmf + (p16) LDFPD f35, f50 = [AO2], 2 * SIZE + (p16) LDFPD f103, f104 = [BO], 2 * SIZE + (p17) FMA f12 = f106, f53, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f13 = f107, f54, f13 + } + ;; + { .mmf + (p16) LDFPD f52, f53 = [AO3], 2 * SIZE + nop __LINE__ + (p17) FMA f14 = f106, f55, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f15 = f107, f56, f15 + } + ;; + { .mmf + (p16) LDFPD f39, f54 = [AO4], 2 * SIZE + (p16) LDFPD f105, f106 = [BO], 2 * SIZE + (p17) FMA f8 = f108, f65, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f109, f66, f9 + } + ;; + { .mmf + (p16) LDFPD f64, f65 = [AO1], 2 * SIZE + nop __LINE__ + (p17) FMA f10 = f108, f67, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f109, f68, f11 + } + ;; + { .mmf + (p16) LDFPD f51, f66 = [AO2], 2 * SIZE + nop __LINE__ + (p17) FMA f12 = f108, f69, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f13 = f109, f70, f13 + } + ;; + { .mmf + (p16) LDFPD f68, f69 = [AO3], 2 * SIZE + nop __LINE__ + (p17) FMA f14 = f108, f71, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f15 = f109, f72, f15 + } + ;; + { .mmf + (p16) LDFPD f55, f70 = [AO4], 2 * SIZE + (p16) LDFPD f107, f108 = [BO], 2 * SIZE + (p17) FMA f8 = f110, f81, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f111, f82, f9 + } + ;; + { .mmf + (p16) LDFPD f80, f81 = [AO1], 2 * SIZE + nop __LINE__ + (p17) FMA f10 = f110, f83, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f111, f84, f11 + } + ;; + { .mmf + (p16) LDFPD f67, f82 = [AO2] + nop __LINE__ + (p17) FMA f12 = f110, f85, f12 + } + { .mmf + nop __LINE__ + (p16) adds AO2 = 3 * SIZE, AO2 + (p17) FMA f13 = f111, f86, f13 + } + ;; + { .mmf + (p16) LDFPD f84, f85 = [AO3], 2 * SIZE + nop __LINE__ + (p17) FMA f14 = f110, f87, f14 + } + { .mfb + adds I = -1, I + (p17) FMA f15 = f111, f88, f15 + br.ctop.sptk.few .L122 + } + ;; + .align 16 + +.L125: + and I = 15, MIN_M + mov pr.rot= 0 + ;; + cmp.eq p6, p0 = 0, I + cmp.eq p16, p15 = r0, r0 + ;; + adds I = 1, I + adds AO21 = 1 * SIZE, AO2 + adds AO41 = 1 * SIZE, AO4 + ;; + shr I = I, 1 + ;; + adds I = -1, I + ;; + mov ar.lc = I + mov ar.ec= 3 + and I = 15, MIN_M + (p6) br.cond.dpnt .L128 + ;; + .align 16 + +.L126: + { .mmf + (p16) LDFPD f104, f107 = [BO], 2 * SIZE + (p16) LDFPD f32, f35 = [AO1], 2 * SIZE + (p18) FMA f8 = f106, f34, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p15) FMA f9 = f109, f37, f9 + } + ;; + { .mmf + (p17) LDFD f42 = [AO21], 2 * SIZE + (p16) LDFD f38 = [AO2], 2 * SIZE + (p18) FMA f10 = f106, f40, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p15) FMA f11 = f109, f43, f11 + } + ;; + { .mmf + (p16) LDFPD f44, f47 = [AO3], 2 * SIZE + nop __LINE__ + (p18) FMA f12 = f106, f46, f12 + } + { .mmf + nop __LINE__ + (p17) adds I = -2, I + (p15) FMA f13 = f109, f49, f13 + } + ;; + { .mmf + (p17) LDFD f54 = [AO41], 2 * SIZE + (p16) LDFD f50 = [AO4], 2 * SIZE + (p15) FMA f15 = f109, f55, f15 + } + { .mfb + (p17) cmp.ne.unc p15, p0 = -1, I + (p18) FMA f14 = f106, f52, f14 + br.ctop.sptk.few .L126 + } + ;; + +.L128: + { .mmf + mov AO1 = CO + LDFD f32 = [CO], INCY + FADD f8 = f8, f9 + } + ;; + { .mmf + LDFD f33 = [CO], INCY + nop __LINE__ + FADD f10 = f10, f11 + } + ;; + { .mmf + LDFD f34 = [CO], INCY + nop __LINE__ + FADD f12 = f12, f13 + } + ;; + { .mmf + LDFD f35 = [CO], INCY + nop __LINE__ + FADD f14 = f14, f15 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f32 = ALPHA, f8, f32 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f33 = ALPHA, f10, f33 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f34 = ALPHA, f12, f34 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f35 = ALPHA, f14, f35 + } + ;; + { .mmf + STFD [AO1] = f32 + add AO1 = AO1, INCY + } + ;; + { .mmf + STFD [AO1] = f33 + add AO1 = AO1, INCY + } + ;; + { .mmf + STFD [AO1] = f34 + add AO1 = AO1, INCY + } + ;; + { .mmf + STFD [AO1] = f35 + add AO1 = AO1, INCY + } + ;; + .align 16 + +.L130: + { .mfi + mov AO1 = A + mov f8 = f0 + mov pr.rot= 0 + } + { .mfi + add AO2 = LDA, A + mov f10 = f0 + tbit.z p6, p0 = N, 1 + } + ;; + { .mfi + mov BO = BUFFER + mov f12 = f0 + shr I = MIN_M, 4 + } + { .mfb + adds WPRE = 4 * SIZE, CO + mov f14 = f0 + (p6) br.cond.dpnt .L140 + } + ;; + { .mmf + (p8) LDFD f32 = [AO1], SIZE + (p8) LDFD f33 = [AO2], SIZE + mov f9 = f0 + } + { .mfi + shladd A = LDA, 1, A + mov f11 = f0 + mov ar.ec= 2 + } + ;; + { .mmf + (p8) LDFD f40 = [BO], 2 * SIZE + cmp.eq p6, p0 = 0, I + mov f13 = f0 + } + { .mmf + adds RPRE1 = RPREFETCH * SIZE, AO1 + add I = I, I + mov f15 = f0 + } + ;; + { .mmi + cmp.eq p16, p0 = r0, r0 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + adds I = -1, I + } + ;; + { .mfi + adds AO21 = 7 * SIZE, AO2 + (p8) FMPY f8 = f40, f32 + mov ar.lc = I + } + { .mfb + adds PREB = RPREFETCH * SIZE, BO + (p8) FMPY f10 = f40, f33 + (p6) br.cond.dpnt .L135 + } + ;; + .align 16 + +.L132: + { .mmf + (p17) LDFPD f68, f83 = [AO2] + (p17) LDFPD f110, f111 = [BO], 2 * SIZE + (p17) FMA f8 = f104, f33, f8 + } + { .mfi + (p17) adds AO2 = 3 * SIZE, AO2 + (p17) FMA f9 = f105, f34, f9 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + ;; + { .mmf + (p14) PREFETCH [RPRE1], 16 * SIZE + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f10 = f104, f35, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f105, f36, f11 + } + ;; + { .mmf + (p15) PREFETCH [RPRE2], 16 * SIZE + (p16) LDFD f34 = [AO2], 1 * SIZE + (p17) FMA f8 = f106, f49, f8 + } + { .mmf + (p17) LDFD f84 = [AO21], 8 * SIZE + nop __LINE__ + (p17) FMA f9 = f107, f50, f9 + } + ;; + { .mmf + (p14) PREFETCH [PREB], 16 * SIZE + (p16) LDFPD f48, f49 = [AO1], 2 * SIZE + (p17) FMA f10 = f106, f51, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f107, f52, f11 + } + ;; + { .mmf + (p16) LDFPD f35, f50 = [AO2], 2 * SIZE + (p16) LDFPD f103, f104 = [BO], 2 * SIZE + (p17) FMA f8 = f108, f65, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f109, f66, f9 + } + ;; + { .mmf + (p16) LDFPD f105, f106 = [BO], 2 * SIZE + (p16) LDFPD f64, f65 = [AO1], 2 * SIZE + (p17) FMA f10 = f108, f67, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f109, f68, f11 + } + ;; + { .mmf + (p16) LDFPD f51, f66 = [AO2], 2 * SIZE + (p16) LDFPD f107, f108 = [BO], 2 * SIZE + (p17) FMA f8 = f110, f81, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f111, f82, f9 + } + ;; + { .mmf + (p16) LDFPD f80, f81 = [AO1], 2 * SIZE + nop __LINE__ + (p17) FMA f10 = f110, f83, f10 + } + { .mfb + adds I = -1, I + (p17) FMA f11 = f111, f84, f11 + br.ctop.sptk.few .L132 + } + ;; + .align 16 + +.L135: + and I = 15, MIN_M + ;; + cmp.eq p6, p0 = 0, I + (p6) br.cond.dpnt .L138 + ;; + tbit.nz p12, p0 = MIN_M, 3 + tbit.nz p13, p0 = MIN_M, 2 + tbit.nz p14, p0 = MIN_M, 1 + tbit.nz p15, p0 = MIN_M, 0 + ;; + (p12) LDFPD f100, f101 = [BO], 2 * SIZE + (p12) LDFPD f32, f33 = [AO1], 2 * SIZE + (p12) LDFD f34 = [AO2], 1 * SIZE + ;; + (p12) LDFPD f36, f37 = [AO1], 2 * SIZE + (p12) LDFPD f35, f38 = [AO2], 2 * SIZE + ;; + (p12) LDFPD f102, f103 = [BO], 2 * SIZE + (p12) LDFPD f39, f42 = [AO2], 2 * SIZE + ;; + (p12) LDFPD f40, f41 = [AO1], 2 * SIZE + (p12) LDFPD f43, f46 = [AO2], 2 * SIZE + ;; + (p12) LDFPD f104, f105 = [BO], 2 * SIZE + (p12) LDFPD f44, f45 = [AO1], 2 * SIZE + (p12) LDFD f47 = [AO2], 1 * SIZE + ;; + (p12) LDFPD f106, f107 = [BO], 2 * SIZE + (p13) LDFD f50 = [AO2], 1 * SIZE + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + ;; + (p13) LDFPD f108, f109 = [BO], 2 * SIZE + (p13) LDFPD f51, f54 = [AO2], 2 * SIZE + ;; + (p13) LDFPD f110, f111 = [BO], 2 * SIZE + (p13) LDFPD f52, f53 = [AO1], 2 * SIZE + (p13) LDFD f55 = [AO2], 1 * SIZE + ;; + (p14) LDFPD f56, f57 = [AO1], 2 * SIZE + (p14) LDFD f58 = [AO2], 1 * SIZE + ;; + (p14) LDFPD f112, f113 = [BO], 2 * SIZE + (p15) LDFD f60 = [AO1] + (p14) LDFD f59 = [AO2], 1 * SIZE + ;; + (p15) LDFD f61 = [AO2] + (p15) LDFD f114 = [BO] + ;; + (p12) FMA f8 = f100, f32, f8 + (p12) FMA f9 = f101, f33, f9 + (p12) FMA f10 = f100, f34, f10 + (p12) FMA f11 = f101, f35, f11 + ;; + (p12) FMA f12 = f102, f36, f12 + (p12) FMA f13 = f103, f37, f13 + (p12) FMA f14 = f102, f38, f14 + (p12) FMA f15 = f103, f39, f15 + ;; + (p12) FMA f8 = f104, f40, f8 + (p12) FMA f9 = f105, f41, f9 + (p12) FMA f10 = f104, f42, f10 + (p12) FMA f11 = f105, f43, f11 + ;; + (p12) FMA f12 = f106, f44, f12 + (p12) FMA f13 = f107, f45, f13 + (p12) FMA f14 = f106, f46, f14 + (p12) FMA f15 = f107, f47, f15 + ;; + (p13) FMA f8 = f108, f48, f8 + (p13) FMA f9 = f109, f49, f9 + (p13) FMA f10 = f108, f50, f10 + (p13) FMA f11 = f109, f51, f11 + ;; + (p13) FMA f12 = f110, f52, f12 + (p13) FMA f13 = f111, f53, f13 + (p13) FMA f14 = f110, f54, f14 + (p13) FMA f15 = f111, f55, f15 + ;; + (p14) FMA f8 = f112, f56, f8 + (p14) FMA f9 = f113, f57, f9 + (p14) FMA f10 = f112, f58, f10 + (p14) FMA f11 = f113, f59, f11 + ;; + (p15) FMA f12 = f114, f60, f12 + (p15) FMA f14 = f114, f61, f14 + ;; +.L138: + FADD f8 = f8, f9 + FADD f10 = f10, f11 + FADD f12 = f12, f13 + FADD f14 = f14, f15 + ;; + FADD f8 = f8, f12 + FADD f10 = f10, f14 + ;; + { .mmf + mov AO1 = CO + LDFD f32 = [CO], INCY + } + ;; + { .mmf + LDFD f33 = [CO], INCY + nop __LINE__ + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f32 = ALPHA, f8, f32 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f33 = ALPHA, f10, f33 + } + ;; + { .mmf + STFD [AO1] = f32 + add AO1 = AO1, INCY + } + ;; + { .mmf + STFD [AO1] = f33 + } + ;; + .align 16 + +.L140: + { .mfi + mov AO1 = A + mov f8 = f0 + shr I = MIN_M, 4 + } + { .mfi + mov BO = BUFFER + mov f10 = f0 + tbit.z p7, p0 = N, 0 + } + ;; + { .mfi + cmp.eq p6, p0 = 0, I + mov f12 = f0 + mov pr.rot= 0 + } + { .mfb + add I = I, I + mov f14 = f0 + (p7) br.cond.dpnt .L199 + } + ;; + { .mfi + (p8) LDFD f32 = [AO1], SIZE + mov f9 = f0 + mov ar.ec= 2 + } + { .mmf + (p8) LDFD f40 = [BO], 2 * SIZE + add A = A, LDA + mov f11 = f0 + } + ;; + { .mmf + adds WPRE = 1 * SIZE, CO + adds PREB = RPREFETCH * SIZE, BO + mov f13 = f0 + } + { .mmf + cmp.eq p16, p0 = r0, r0 + adds I = -1, I + mov f15 = f0 + } + ;; + { .mfi + lfetch.excl.nt1 [WPRE] + (p8) FMPY f8 = f40, f32 + mov ar.lc = I + } + { .mmb + nop __LINE__ + nop __LINE__ + (p6) br.cond.dpnt .L145 + } + ;; + .align 16 + +.L142: + { .mmf + (p17) LDFPD f81, f82 = [AO1], 2 * SIZE + (p17) LDFPD f110, f111 = [BO], 2 * SIZE + (p17) FMA f8 = f104, f33, f8 + } + { .mfi + nop __LINE__ + (p17) FMA f9 = f105, f34, f9 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + ;; + { .mmf + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p16) LDFPD f103, f104 = [BO], 2 * SIZE + (p17) FMA f8 = f106, f49, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f107, f50, f9 + } + ;; + { .mmf + (p16) LDFPD f105, f106 = [BO], 2 * SIZE + (p16) LDFPD f48, f49 = [AO1], 2 * SIZE + (p17) FMA f8 = f108, f65, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f109, f66, f9 + } + ;; + { .mmf + (p16) LDFPD f64, f65 = [AO1], 2 * SIZE + (p16) LDFPD f107, f108 = [BO], 2 * SIZE + (p17) FMA f8 = f110, f81, f8 + } + { .mfb + adds I = -1, I + (p17) FMA f9 = f111, f82, f9 + br.ctop.sptk.few .L142 + } + ;; + .align 16 + +.L145: + and I = 15, MIN_M + ;; + cmp.eq p6, p0 = 0, I + (p6) br.cond.dpnt .L148 + ;; + tbit.nz p12, p0 = MIN_M, 3 + tbit.nz p13, p0 = MIN_M, 2 + tbit.nz p14, p0 = MIN_M, 1 + tbit.nz p15, p0 = MIN_M, 0 + ;; + (p12) LDFPD f32, f33 = [AO1], 2 * SIZE + (p12) LDFPD f100, f101 = [BO], 2 * SIZE + ;; + (p12) LDFPD f36, f37 = [AO1], 2 * SIZE + (p12) LDFPD f102, f103 = [BO], 2 * SIZE + ;; + (p12) LDFPD f40, f41 = [AO1], 2 * SIZE + (p12) LDFPD f104, f105 = [BO], 2 * SIZE + ;; + (p12) LDFPD f44, f45 = [AO1], 2 * SIZE + (p12) LDFPD f106, f107 = [BO], 2 * SIZE + ;; + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f108, f109 = [BO], 2 * SIZE + ;; + (p13) LDFPD f52, f53 = [AO1], 2 * SIZE + (p13) LDFPD f110, f111 = [BO], 2 * SIZE + ;; + (p14) LDFPD f56, f57 = [AO1], 2 * SIZE + (p14) LDFPD f112, f113 = [BO], 2 * SIZE + ;; + (p15) LDFD f60 = [AO1] + (p15) LDFD f114 = [BO] + ;; + (p12) FMA f8 = f100, f32, f8 + (p12) FMA f9 = f101, f33, f9 + (p12) FMA f10 = f102, f36, f10 + (p12) FMA f11 = f103, f37, f11 + (p12) FMA f12 = f104, f40, f12 + (p12) FMA f13 = f105, f41, f13 + (p12) FMA f14 = f106, f44, f14 + (p12) FMA f15 = f107, f45, f15 + ;; + (p13) FMA f8 = f108, f48, f8 + (p13) FMA f9 = f109, f49, f9 + (p13) FMA f10 = f110, f52, f10 + (p13) FMA f11 = f111, f53, f11 + (p14) FMA f12 = f112, f56, f12 + (p14) FMA f13 = f113, f57, f13 + (p15) FMA f14 = f114, f60, f14 + ;; +.L148: + { .mmf + LDFD f32 = [CO] + nop __LINE__ + FADD f8 = f8, f9 + } + { .mmf + nop __LINE__ + nop __LINE__ + FADD f10 = f10, f11 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FADD f12 = f12, f13 + } + { .mmf + nop __LINE__ + nop __LINE__ + FADD f14 = f14, f15 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FADD f8 = f8, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FADD f10 = f10, f14 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FADD f8 = f8, f10 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f32 = ALPHA, f8, f32 + } + ;; + { .mmf + STFD [CO] = f32 + nop __LINE__ + nop __LINE__ + } + ;; + .align 16 + +.L199: + adds IS = P, IS + shladd A = LDAP, BASE_SHIFT, A + ;; + cmp.gt p6, p0 = M, IS + (p6) br.cond.dptk .LIs_loop + .align 4 + ;; + +.L999: + mov r8 = r0 + adds r9 = 1 * 16, SP + ;; + ldf.fill f16 = [SP], 32 + ldf.fill f17 = [r9], 32 + mov ar.lc = ARLC + ;; + ldf.fill f18 = [SP], 32 + ldf.fill f19 = [r9], 32 + mov pr = PR, -1 + ;; + ldf.fill f20 = [SP], 32 + ldf.fill f21 = [r9], 32 + mov ar.pfs = ARPFS + ;; + ldf.fill f22 = [SP], 32 + ldf.fill f23 = [r9] + br.ret.sptk.many b0 + ;; + EPILOGUE diff --git a/kernel/ia64/iamax.S b/kernel/ia64/iamax.S new file mode 100644 index 0000000..a091675 --- /dev/null +++ b/kernel/ia64/iamax.S @@ -0,0 +1,639 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef XDOUBLE +#define PREFETCH_SIZE ( 8 * 16 + 4) +#elif defined(DOUBLE) +#define PREFETCH_SIZE (16 * 16 + 8) +#else +#define PREFETCH_SIZE (32 * 16 + 16) +#endif + +#if !defined(USE_MIN) && defined(USE_ABS) +#define FMAX famax +#elif !defined(USE_MIN) && !defined(USE_ABS) +#define FMAX fmax +#elif defined(USE_MIN) && defined(USE_ABS) +#define FMAX famin +#else +#define FMAX fmin +#endif + +#define IMAX1 r8 +#define IMAX2 r26 +#define IMAX3 r27 +#define IMAX4 r28 + +#define PRE1 r2 + +#define N r14 +#define X1 r15 +#define INCX r16 + +#define I r17 +#define X2 r18 +#define INCX5 r19 +#define INCX16 r20 +#define CURRENT r21 + +#define DMAX1 f8 +#define DMAX2 f9 +#define DMAX3 f10 +#define DMAX4 f11 +#define DMAX5 f12 +#define DMAX6 f13 +#define DMAX7 f14 +#define DMAX8 f15 + +#define PR r30 +#define ARLC r31 + + PROLOGUE + .prologue + PROFCODE + + { .mmi + mov IMAX1 = 0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + ;; + .body + +#ifdef F_INTERFACE + { .mmi + LDINT N = [r32] + LDINT INCX = [r34] + mov X1 = r33 + } + ;; +#ifndef USE64BITINT + { .mii + nop.m 0 + sxt4 N = N + sxt4 INCX = INCX + } + ;; +#endif +#else + { .mmi + mov N = r32 + mov X1 = r33 + mov INCX = r34 + } + ;; +#endif + + { .mii + mov PR = pr + cmp.ge p6, p0 = 0, INCX + } + { .mbb + cmp.ge p8, p0 = 0, N + (p8) br.ret.sptk.many b0 + (p6) br.ret.sptk.many b0 + } + ;; + { .mmi + LDFD DMAX1 = [X1] + shladd INCX = INCX, BASE_SHIFT, r0 + mov pr.rot= 0 + } + ;; + mov IMAX1 = 1 + mov IMAX2 = 1 + mov IMAX3 = 1 + mov IMAX4 = 1 + mov CURRENT = 1 + adds N = -1, N + ;; + + { .mmf + add X1 = X1, INCX + mov DMAX2 = DMAX1 + } + ;; + { .mmf + shladd X2 = INCX, 2, X1 + } + { .mfi + cmp.eq p16, p0 = r0, r0 + shr I = N, 4 + } + ;; + { .mfi + shladd INCX5 = INCX, 2, INCX + mov DMAX3 = DMAX1 + mov ar.ec= 4 + } + { .mmf +#ifdef XDOUBLE + shladd INCX16= INCX, 3, r0 +#else + shladd INCX16= INCX, 4, r0 +#endif + adds I = -1, I + } + ;; + tbit.z p0, p7 = N, 3 + ;; + { .mfi + adds PRE1 = PREFETCH_SIZE * SIZE, X1 + mov DMAX4 = DMAX1 + mov ar.lc = I + } + { .mfb + cmp.eq p6 ,p0 = -1, I + (p6) br.cond.dpnt .L15 + } + .align 32 + ;; +.L10: + { .mmf + (p16) lfetch.nt1 [PRE1], INCX16 + (p16) LDFD f32 = [X1], INCX + (p19) fcmp.neq.unc p12, p0 = DMAX1, DMAX5 + } + { .mmf + (p8) adds IMAX1 = 1, CURRENT + nop __LINE__ + (p19) FMAX DMAX5 = f67, DMAX1 + } + ;; + { .mmf + (p16) LDFD f36 = [X1], INCX + nop __LINE__ + (p19) fcmp.neq.unc p13, p0 = DMAX2, DMAX6 + } + { .mmf + (p9) adds IMAX2 = 2, CURRENT + nop __LINE__ + (p19) FMAX DMAX6 = f71, DMAX2 + } + ;; + { .mmf + (p16) LDFD f40 = [X1], INCX + nop __LINE__ + (p19) fcmp.neq.unc p14, p0 = DMAX3, DMAX7 + } + { .mmf + (p10) adds IMAX3 = 3, CURRENT + nop __LINE__ + (p19) FMAX DMAX7 = f75, DMAX3 + } + ;; + { .mmf + (p16) LDFD f44 = [X1], INCX + nop __LINE__ + (p19) fcmp.neq.unc p15, p0 = DMAX4, DMAX8 + } + { .mmf + (p11) adds IMAX4 = 4, CURRENT + nop __LINE__ + (p19) FMAX DMAX8 = f79, DMAX4 + } + ;; + { .mmf + (p16) LDFD f48 = [X1], INCX + nop __LINE__ + (p19) fcmp.neq.unc p8, p0 = DMAX1, DMAX5 + } + { .mmf + (p12) adds IMAX1 = 5, CURRENT + nop __LINE__ + (p19) FMAX DMAX1 = f83, DMAX5 + } + ;; + { .mmf + (p16) LDFD f52 = [X1], INCX + nop __LINE__ + (p19) fcmp.neq.unc p9, p0 = DMAX2, DMAX6 + } + { .mmf + (p13) adds IMAX2 = 6, CURRENT + nop __LINE__ + (p19) FMAX DMAX2 = f87, DMAX6 + } + ;; + { .mmf + (p16) LDFD f56 = [X1], INCX + nop __LINE__ + (p19) fcmp.neq.unc p10, p0 = DMAX3, DMAX7 + } + { .mmf + (p14) adds IMAX3 = 7, CURRENT + nop __LINE__ + (p19) FMAX DMAX3 = f91, DMAX7 + } + ;; + { .mmf + (p16) LDFD f60 = [X1], INCX + nop __LINE__ + (p19) fcmp.neq.unc p11, p0 = DMAX4, DMAX8 + } + { .mmf + (p15) adds IMAX4 = 8, CURRENT + nop __LINE__ + (p19) FMAX DMAX4 = f95, DMAX8 + } + ;; + { .mmf +#ifdef XDOUBLE + (p16) lfetch.nt1 [PRE1], INCX16 +#endif + (p16) LDFD f64 = [X1], INCX +#ifndef XDOUBLE + nop __LINE__ +#endif + (p19) fcmp.neq.unc p12, p0 = DMAX1, DMAX5 + } + { .mmf + (p8) adds IMAX1 = 9, CURRENT + nop __LINE__ + (p18) FMAX DMAX5 = f34, DMAX1 + } + ;; + { .mmf + (p16) LDFD f68 = [X1], INCX + nop __LINE__ + (p19) fcmp.neq.unc p13, p0 = DMAX2, DMAX6 + } + { .mmf + (p9) adds IMAX2 = 10, CURRENT + nop __LINE__ + (p18) FMAX DMAX6 = f38, DMAX2 + } + ;; + { .mmf + (p16) LDFD f72 = [X1], INCX + nop __LINE__ + (p19) fcmp.neq.unc p14, p0 = DMAX3, DMAX7 + } + { .mmf + (p10) adds IMAX3 = 11, CURRENT + nop __LINE__ + (p18) FMAX DMAX7 = f42, DMAX3 + } + ;; + { .mmf + (p16) LDFD f76 = [X1], INCX + nop __LINE__ + (p19) fcmp.neq.unc p15, p0 = DMAX4, DMAX8 + } + { .mmf + (p11) adds IMAX4 = 12, CURRENT + nop __LINE__ + (p18) FMAX DMAX8 = f46, DMAX4 + } + ;; + { .mmf + (p16) LDFD f80 = [X1], INCX + nop __LINE__ + (p18) fcmp.neq.unc p8, p0 = DMAX1, DMAX5 + } + { .mmf + (p12) adds IMAX1 = 13, CURRENT + nop __LINE__ + (p18) FMAX DMAX1 = f50, DMAX5 + } + ;; + { .mmf + (p16) LDFD f84 = [X1], INCX + nop __LINE__ + (p18) fcmp.neq.unc p9, p0 = DMAX2, DMAX6 + } + { .mmf + (p13) adds IMAX2 = 14, CURRENT + nop __LINE__ + (p18) FMAX DMAX2 = f54, DMAX6 + } + ;; + { .mmf + (p16) LDFD f88 = [X1], INCX + nop __LINE__ + (p18) fcmp.neq.unc p10, p0 = DMAX3, DMAX7 + } + { .mmf + (p14) adds IMAX3 = 15, CURRENT + nop __LINE__ + (p18) FMAX DMAX3 = f58, DMAX7 + } + ;; + { .mmf + (p16) LDFD f92 = [X1], INCX + (p15) adds IMAX4 = 16, CURRENT + (p18) fcmp.neq.unc p11, p0 = DMAX4, DMAX8 + } + { .mfb + (p19) adds CURRENT = 16, CURRENT + (p18) FMAX DMAX4 = f62, DMAX8 + br.ctop.sptk.few .L10 + } + ;; + .align 32 + +.L15: + { .mmi + (p7) LDFD f32 = [X1], INCX + and I = 15, N + cmp.ne p14, p0 = r0, r0 + } + ;; + { .mmb + (p7) LDFD f33 = [X1], INCX + cmp.eq p6, p0 = 0, I + (p6) br.cond.dptk .L999 + } + ;; + { .mmi + (p7) LDFD f34 = [X1], INCX + ;; + (p7) LDFD f35 = [X1], INCX + tbit.z p0, p13 = N, 2 + } + ;; + { .mmi + (p7) LDFD f36 = [X1], INCX + ;; + (p7) LDFD f37 = [X1], INCX + tbit.z p0, p14 = N, 1 + } + ;; + { .mfi + (p7) LDFD f38 = [X1], INCX + (p7) FMAX DMAX5 = f32, DMAX1 + tbit.z p0, p15 = N, 0 + } + ;; + { .mmf + (p7) LDFD f39 = [X1], INCX + nop __LINE__ + (p7) FMAX DMAX6 = f33, DMAX2 + } + ;; + { .mmf + (p13) LDFD f40 = [X1], INCX + nop __LINE__ + (p7) FMAX DMAX7 = f34, DMAX3 + } + ;; + { .mmf + (p13) LDFD f41 = [X1], INCX + nop __LINE__ + (p7) FMAX DMAX8 = f35, DMAX4 + } + ;; + { .mmf + (p13) LDFD f42 = [X1], INCX + nop __LINE__ + (p7) fcmp.neq.unc p8, p0 = DMAX1, DMAX5 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p7) FMAX DMAX1 = f36, DMAX5 + } + ;; + { .mmf + (p13) LDFD f43 = [X1], INCX + nop __LINE__ + (p7) fcmp.neq.unc p9, p0 = DMAX2, DMAX6 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p7) FMAX DMAX2 = f37, DMAX6 + } + ;; + { .mmf + (p14) LDFD f44 = [X1], INCX + nop __LINE__ + (p7) fcmp.neq.unc p10, p0 = DMAX3, DMAX7 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p7) FMAX DMAX3 = f38, DMAX7 + } + ;; + { .mmf + (p14) LDFD f45 = [X1], INCX + nop __LINE__ + (p7) fcmp.neq.unc p11, p0 = DMAX4, DMAX8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p7) FMAX DMAX4 = f39, DMAX8 + } + ;; + { .mmf + (p15) LDFD f46 = [X1], INCX + (p8) adds IMAX1 = 1, CURRENT + (p7) fcmp.neq.unc p8, p0 = DMAX1, DMAX5 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMAX DMAX5 = f40, DMAX1 + } + { .mmf + (p9) adds IMAX2 = 2, CURRENT + nop __LINE__ + (p7) fcmp.neq.unc p9, p0 = DMAX2, DMAX6 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMAX DMAX6 = f41, DMAX2 + } + { .mmf + (p10) adds IMAX3 = 3, CURRENT + nop __LINE__ + (p7) fcmp.neq.unc p10, p0 = DMAX3, DMAX7 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMAX DMAX7 = f42, DMAX3 + } + { .mmf + (p11) adds IMAX4 = 4, CURRENT + nop __LINE__ + (p7) fcmp.neq.unc p11, p0 = DMAX4, DMAX8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMAX DMAX8 = f43, DMAX4 + } + ;; + { .mmf + (p8) adds IMAX1 = 5, CURRENT + nop __LINE__ + (p13) fcmp.neq.unc p8, p0 = DMAX1, DMAX5 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) mov DMAX1 = DMAX5 + } + { .mmf + (p9) adds IMAX2 = 6, CURRENT + nop __LINE__ + (p13) fcmp.neq.unc p9, p0 = DMAX2, DMAX6 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) mov DMAX2 = DMAX6 + } + { .mmf + (p10) adds IMAX3 = 7, CURRENT + nop __LINE__ + (p13) fcmp.neq.unc p10, p0 = DMAX3, DMAX7 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) mov DMAX3 = DMAX7 + } + { .mmf + (p11) adds IMAX4 = 8, CURRENT + nop __LINE__ + (p13) fcmp.neq.unc p11, p0 = DMAX4, DMAX8 + } + { .mmf + (p7) adds CURRENT = 8, CURRENT + nop __LINE__ + (p13) mov DMAX4 = DMAX8 + } + ;; + { .mmf + (p8) adds IMAX1 = 1, CURRENT + nop __LINE__ + (p14) FMAX DMAX5 = f44, DMAX1 + } + { .mmf + (p9) adds IMAX2 = 2, CURRENT + (p10) adds IMAX3 = 3, CURRENT + (p14) FMAX DMAX6 = f45, DMAX2 + } + { .mmf + (p11) adds IMAX4 = 4, CURRENT + (p13) adds CURRENT = 4, CURRENT + (p15) FMAX DMAX7 = f46, DMAX3 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p14) fcmp.neq.unc p8, p0 = DMAX5, DMAX1 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p14) mov DMAX1 = DMAX5 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p14) fcmp.neq.unc p9, p0 = DMAX6, DMAX2 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p14) mov DMAX2 = DMAX6 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p15) fcmp.neq.unc p10, p0 = DMAX7, DMAX3 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p15) mov DMAX3 = DMAX7 + } + ;; +.L999: + { .mmf + (p8) adds IMAX1 = 1, CURRENT + nop __LINE__ + FMAX DMAX5 = DMAX2, DMAX1 + } + { .mmf + (p9) adds IMAX2 = 2, CURRENT + (p14) adds CURRENT = 2, CURRENT + FMAX DMAX6 = DMAX4, DMAX3 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + fcmp.neq p12, p0 = DMAX5, DMAX1 + } + { .mmf + (p10) adds IMAX3 = 1, CURRENT + nop __LINE__ + fcmp.neq p13, p0 = DMAX6, DMAX3 + } + ;; + { .mmf + (p12) mov IMAX1 = IMAX2 + (p13) mov IMAX3 = IMAX4 + FMAX DMAX1 = DMAX6, DMAX5 + } + ;; + { .mfi + nop __LINE__ + fcmp.neq p12, p0 = DMAX1, DMAX5 + mov ar.lc = ARLC + } + ;; + { .mib + (p12) mov IMAX1 = IMAX3 + mov pr = PR, -65474 + br.ret.sptk.many b0 + } + ;; + EPILOGUE + diff --git a/kernel/ia64/izamax.S b/kernel/ia64/izamax.S new file mode 100644 index 0000000..c43bcca --- /dev/null +++ b/kernel/ia64/izamax.S @@ -0,0 +1,579 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef XDOUBLE +#define PREFETCH_SIZE ( 8 * 16 + 4) +#elif defined(DOUBLE) +#define PREFETCH_SIZE (16 * 16 + 8) +#else +#define PREFETCH_SIZE (32 * 16 + 16) +#endif + +#ifdef USE_MIN +#define CMPUNC cmp.lt.unc +#define CMP cmp.lt +#else +#define CMPUNC cmp.gt.unc +#define CMP cmp.gt +#endif + +#define RET r8 + +#define N r32 +#define DX r33 +#define INCX r34 + +#define PRE1 r2 + +#define I r14 +#define J r15 +#define K r16 +#define TMP r17 +#define INCXM1 r18 +#define INCX8 r19 +#define MAX1 r20 +#define DMAX1 r21 +#define DATA1 r22 +#define DATA2 r23 +#define DATA3 r24 +#define DATA4 r25 +#define DATA5 r26 +#define DATA6 r27 +#define DATA7 r28 +#define DATA8 r29 + +#define PR r30 +#define ARLC r31 + + PROLOGUE + .prologue + PROFCODE + { .mmi + mov MAX1 = -1 + mov DMAX1 = 0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + .body + +#ifdef F_INTERFACE + { .mmi + LDINT N = [N] + LDINT INCX = [INCX] + nop.i 0 + } + ;; +#ifndef USE64BITINT + { .mii + nop.m 0 + sxt4 N = N + sxt4 INCX = INCX + } + ;; +#endif +#endif + + { .mii + adds K = -1, N + shl INCX = INCX, ZBASE_SHIFT + mov PR = pr + } + { .mmb + cmp.ge p8, p0 = 0, N + (p8) br.cond.dptk .L999 + } + ;; + { .mib + cmp.ge p6, p0 = 0, INCX + mov pr.rot= 0 + (p6) br.cond.dptk .L999 + } + ;; + { .mmi + LDFD f6 = [DX], SIZE + adds INCXM1 = - SIZE, INCX + mov ar.ec= 5 + } + ;; + { .mmi + LDFD f7 = [DX], INCXM1 + mov MAX1 = 0 + mov I = 1 + } + ;; + { .mfi + cmp.eq p16, p0 = r0, r0 + fabs f6 = f6 + shr J = K, 3 + } + { .mmf + nop.m 0 + nop.m 0 + fabs f7 = f7 + } + ;; + { .mmi + cmp.ne p8, p0 = r0, r0 + adds J = -1, J + shladd INCX8 = INCX, 3, r0 + } + { .mmf + nop.m 0 + nop.m 0 + FADD f6 = f6, f7 + } + ;; + { .mmi + getf.d DMAX1 = f6 + adds PRE1 = PREFETCH_SIZE * SIZE, DX + mov ar.lc = J + } + { .mib + cmp.eq p7 ,p0 = -1, J + tbit.z p0, p13 = K, 2 + (p7) br.cond.dpnt .L15 + } + .align 32 + ;; +.L10: + { .mmf + (p16) lfetch.nt1 [PRE1], INCX8 + (p16) LDFD f32 = [DX], SIZE + (p19) fabs f35 = f35 + } + { .mmf + (p8 ) mov DMAX1 = DATA1 + nop.m 0 + (p19) fabs f40 = f40 + } + ;; + { .mmf + (p20) getf.d DATA5 = f12 + (p16) LDFD f37 = [DX], INCXM1 + (p20) FADD f14 = f96, f101 + } + { .mmi + (p8 ) adds MAX1 = 0, I + (p20) CMPUNC p8, p0 = DATA2, DMAX1 + nop.i 0 + } + ;; + { .mmf + (p16) LDFD f42 = [DX], SIZE + (p8 ) mov DMAX1 = DATA2 + (p19) fabs f45 = f45 + } + { .mmf + nop.m 0 + nop.m 0 + (p19) fabs f50 = f50 + } + ;; + { .mmf + (p20) getf.d DATA6 = f13 + (p16) LDFD f47 = [DX], INCXM1 + (p20) FADD f15 = f106, f111 + } + { .mmi + (p8 ) adds MAX1 = 1, I + (p20) CMPUNC p8, p0 = DATA3, DMAX1 + nop.i 0 + } + ;; + { .mmf + (p16) LDFD f52 = [DX], SIZE + (p8 ) mov DMAX1 = DATA3 + (p19) fabs f55 = f55 + } + { .mmf + nop.m 0 + nop.m 0 + (p19) fabs f60 = f60 + } + ;; + { .mmf + (p20) getf.d DATA7 = f14 + (p16) LDFD f57 = [DX], INCXM1 + (p19) FADD f8 = f35, f40 + } + { .mmi + (p8 ) adds MAX1 = 2, I + (p20) CMPUNC p8, p0 = DATA4, DMAX1 + nop.i 0 + } + ;; + { .mmf + (p16) LDFD f62 = [DX], SIZE + (p8 ) mov DMAX1 = DATA4 + (p19) fabs f65 = f65 + } + { .mmf + nop.m 0 + nop.m 0 + (p19) fabs f70 = f70 + } + ;; + { .mmf + (p20) getf.d DATA8 = f15 + (p16) LDFD f67 = [DX], INCXM1 + (p19) FADD f9 = f45, f50 + } + { .mmi + (p8 ) adds MAX1 = 3, I + (p20) CMPUNC p8, p0 = DATA5, DMAX1 + nop.i 0 + } + ;; + { .mmf + (p16) LDFD f72 = [DX], SIZE + (p8 ) mov DMAX1 = DATA5 + (p19) fabs f75 = f75 + } + { .mmf + nop.m 0 + nop.m 0 + (p19) fabs f80 = f80 + } + ;; + { .mmf + (p19) getf.d DATA1 = f8 + (p16) LDFD f77 = [DX], INCXM1 + (p19) FADD f10 = f55, f60 + } + { .mmi + (p8 ) adds MAX1 = 4, I + (p20) CMPUNC p8, p0 = DATA6, DMAX1 + nop.i 0 + } + ;; + { .mmf + (p16) LDFD f82 = [DX], SIZE + (p8 ) mov DMAX1 = DATA6 + (p19) fabs f85 = f85 + } + { .mmf + nop.m 0 + nop.m 0 + (p19) fabs f90 = f90 + } + ;; + { .mmf + (p19) getf.d DATA2 = f9 + (p16) LDFD f87 = [DX], INCXM1 + (p19) FADD f11 = f65, f70 + } + { .mmi + (p8 ) adds MAX1 = 5, I + (p20) CMPUNC p8, p0 = DATA7, DMAX1 + nop.i 0 + } + ;; + { .mmf + (p16) LDFD f92 = [DX], SIZE + (p8 ) mov DMAX1 = DATA7 + (p19) fabs f95 = f95 + } + { .mmf + mov TMP = I + nop.m 0 + (p19) fabs f100 = f100 + } + ;; + { .mmf + (p19) getf.d DATA3 = f10 + (p16) LDFD f97 = [DX], INCXM1 + (p19) FADD f12 = f75, f80 + } + { .mmi + (p8 ) adds MAX1 = 6, I + (p20) CMPUNC p8, p0 = DATA8, DMAX1 + nop.i 0 + } + ;; + { .mmf + (p16) LDFD f102 = [DX], SIZE + (p8 ) mov DMAX1 = DATA8 + (p19) fabs f105 = f105 + } + { .mmf + (p20) adds I = 8, I + nop.m 0 + (p19) fabs f110 = f110 + } + ;; + { .mmi + (p19) getf.d DATA4 = f11 + (p16) LDFD f107 = [DX], INCXM1 + (p8 ) adds MAX1 = 7, TMP + } + { .mfb + (p19) CMPUNC p8, p0 = DATA1, DMAX1 + (p19) FADD f13 = f85, f90 + br.ctop.sptk.few .L10 + } + ;; + .align 32 + +.L15: + { .mmi + (p13) LDFD f32 = [DX], SIZE + and J = 7, K + mov pr = PR, -65474 + } + ;; + { .mmb + (p13) LDFD f33 = [DX], INCXM1 + cmp.eq p8 ,p0 = r0, J + (p8) br.cond.dpnt .L999 + } + ;; + { .mmi + (p13) LDFD f34 = [DX], SIZE + ;; + (p13) LDFD f35 = [DX], INCXM1 + nop.i 0 + } + ;; + { .mmi + (p13) LDFD f36 = [DX], SIZE + ;; + (p13) LDFD f37 = [DX], INCXM1 + nop.i 0 + } + ;; + { .mfi + (p13) LDFD f38 = [DX], SIZE + (p13) fabs f32 = f32 + tbit.z p0, p14 = K, 1 + } + ;; + { .mmf + (p13) LDFD f39 = [DX], INCXM1 + nop.m 0 + (p13) fabs f33 = f33 + } + ;; + { .mmf + (p14) LDFD f40 = [DX], SIZE + nop.m 0 + (p13) fabs f34 = f34 + } + ;; + { .mfi + (p14) LDFD f41 = [DX], INCXM1 + (p13) fabs f35 = f35 + tbit.z p0, p15 = K, 0 + } + ;; + { .mmf + (p14) LDFD f42 = [DX], SIZE + nop.m 0 + (p13) fabs f36 = f36 + } + ;; + { .mmf + (p14) LDFD f43 = [DX], INCXM1 + nop.m 0 + (p13) fabs f37 = f37 + } + { .mmf + nop.m 0 + nop.m 0 + (p13) FADD f32 = f32, f33 + } + ;; + { .mmf + (p15) LDFD f44 = [DX], SIZE + nop.m 0 + (p13) fabs f38 = f38 + } + ;; + { .mmf + (p15) LDFD f45 = [DX], INCXM1 + nop.m 0 + (p13) fabs f39 = f39 + } + { .mmf + nop.m 0 + nop.m 0 + (p13) FADD f34 = f34, f35 + } + ;; + { .mmf + nop.m 0 + nop.m 0 + (p14) fabs f40 = f40 + } + ;; + { .mmf + (p13) getf.d DATA1 = f32 + nop.m 0 + (p14) fabs f41 = f41 + } + { .mmf + nop.m 0 + nop.m 0 + (p13) FADD f36 = f36, f37 + } + ;; + { .mmf + nop.m 0 + nop.m 0 + (p14) fabs f42 = f42 + } + ;; + { .mmf + (p13) getf.d DATA2 = f34 + nop.m 0 + (p14) fabs f43 = f43 + } + { .mmf + nop.m 0 + nop.m 0 + (p13) FADD f38 = f38, f39 + } + ;; + { .mmf + nop.m 0 + nop.m 0 + (p15) fabs f44 = f44 + } + ;; + { .mmf + (p13) getf.d DATA3 = f36 + nop.m 0 + (p15) fabs f45 = f45 + } + { .mmf + nop.m 0 + nop.m 0 + (p14) FADD f40 = f40, f41 + } + ;; + { .mmf + (p13) getf.d DATA4 = f38 + nop.m 0 + (p14) FADD f42 = f42, f43 + } + ;; + { .mmf + (p14) getf.d DATA5 = f40 + nop.m 0 + (p15) FADD f44 = f44, f45 + } + ;; + { .mmi + (p14) getf.d DATA6 = f42 + nop.m 0 + (p13) CMPUNC p8, p0 = DATA1, DMAX1 + } + ;; + { .mmi + (p15) getf.d DATA7 = f44 + (p8 ) adds MAX1 = 0, I + (p8 ) mov DMAX1 = DATA1 + } + ;; + { .mmi + (p13) CMPUNC p8, p0 = DATA2, DMAX1 + ;; + (p8 ) adds MAX1 = 1, I + (p8 ) mov DMAX1 = DATA2 + } + ;; + { .mmi + (p13) CMPUNC p8, p0 = DATA3, DMAX1 + ;; + (p8 ) adds MAX1 = 2, I + (p8 ) mov DMAX1 = DATA3 + } + ;; + { .mmi + (p13) CMPUNC p8, p0 = DATA4, DMAX1 + ;; + (p8 ) adds MAX1 = 3, I + (p8 ) mov DMAX1 = DATA4 + }{ .mmi + (p13) adds I = 4, I + nop.m 0 + nop.i 0 + } + ;; + { .mmi + (p14) CMPUNC p8, p0 = DATA5, DMAX1 + ;; + (p8 ) adds MAX1 = 0, I + (p8 ) mov DMAX1 = DATA5 + } + ;; + { .mmi + (p14) CMPUNC p8, p0 = DATA6, DMAX1 + ;; + (p8 ) adds MAX1 = 1, I + (p8 ) mov DMAX1 = DATA6 + }{ .mmi + (p14) adds I = 2, I + nop.m 0 + nop.i 0 + } + ;; + { .mmi + (p15) CMPUNC p8, p0 = DATA7, DMAX1 + ;; + (p8) adds MAX1 = 0, I + (p8) mov DMAX1 = DATA7 + } + ;; + .align 32 + +.L999: + { .mmi + setf.d f8 = DMAX1 + adds RET = 1, MAX1 + mov ar.lc = ARLC + } + { .mmb + nop.m 0 + nop.m 0 + br.ret.sptk.many b0 + } + EPILOGUE + diff --git a/kernel/ia64/lsame.S b/kernel/ia64/lsame.S new file mode 100644 index 0000000..3f2a7db --- /dev/null +++ b/kernel/ia64/lsame.S @@ -0,0 +1,66 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + PROFCODE + + .prologue + .body + ld1 r14 = [r32] + ld1 r15 = [r33] + ;; + adds r16 = -32, r14 // a1 = a - 32 + adds r17 = -32, r15 // b1 = b - 32 + ;; + cmp4.ge p6, p7 = 96, r14 // if (a > 96) + cmp4.ge p8, p9 = 96, r15 // if (b > 96) + ;; + (p7) mov r14 = r16 + (p9) mov r15 = r17 + ;; + cmp4.eq p6, p7 = r15, r14 + mov r8 = 1 + ;; + (p7) mov r8 = 0 + br.ret.sptk.many b0 + + EPILOGUE + diff --git a/kernel/ia64/nrm2.S b/kernel/ia64/nrm2.S new file mode 100644 index 0000000..bb88cfb --- /dev/null +++ b/kernel/ia64/nrm2.S @@ -0,0 +1,310 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef XDOUBLE +#define PREFETCH_SIZE ( 8 * 16) +#elif defined(DOUBLE) +#define PREFETCH_SIZE (16 * 16) +#else +#define PREFETCH_SIZE (32 * 16) +#endif + +#ifndef COMPLEX +#define COMPADD 0 +#define STRIDE INCX +#else +#define COMPADD 1 +#define STRIDE SIZE +#endif + +#define PRE1 r2 + +#define I r17 +#define J r18 +#define X2 r19 +#define INCX5 r20 +#define INCX16 r21 + +#define N r32 +#define X r33 +#define INCX r34 +#define PR r30 +#define ARLC r31 + + + PROLOGUE + .prologue + PROFCODE + { .mfi + adds PRE1 = PREFETCH_SIZE * SIZE, X + mov f8 = f0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + ;; + .body +#ifdef F_INTERFACE + LDINT N = [N] + LDINT INCX = [INCX] + ;; +#ifndef USE64BITINT + sxt4 N = N + sxt4 INCX = INCX + ;; +#endif +#endif + { .mmi + cmp.ge p6, p0 = r0, N + cmp.ge p7, p0 = r0, INCX + shr I = N, (4 - COMPADD) + } + { .mbb + and J = ((1 << (4 - COMPADD)) - 1), N + (p6) br.ret.sptk.many b0 + (p7) br.ret.sptk.many b0 + } + ;; + { .mfi + mov f9 = f0 + mov PR = pr + } + { .mfi + adds I = -1, I + mov f10 = f0 + shl INCX = INCX, (BASE_SHIFT + COMPADD) + } + ;; + { .mfi + shladd X2 = INCX, (2 - COMPADD), X + mov f11 = f0 + mov pr.rot = 0 + } + { .mfi + shladd INCX5 = INCX, (2 - COMPADD), INCX + mov f12 = f0 + tbit.z p0, p12 = N, (3 - COMPADD) + } + ;; + { .mfi + shladd INCX16 = INCX, (4 - COMPADD), r0 + mov f13 = f0 + mov ar.ec= 3 + } + { .mmf + cmp.gt p8 ,p0 = r0, I + cmp.eq p16, p0 = r0, r0 + mov f14 = f0 + } + ;; + { .mmf +#ifdef COMPLEX + adds INCX = - SIZE, INCX + adds INCX5 = - SIZE, INCX5 +#else + nop.m 0 + nop.m 0 +#endif + mov f15 = f0 + } + { .mib + cmp.eq p9, p0 = r0, J + mov ar.lc = I + (p8) br.cond.dpnt .L52 + } + ;; + .align 32 + +.L51: + (p16) LDFD f32 = [X], STRIDE + (p16) lfetch.nt1 [PRE1], INCX16 + (p18) fma.d.s1 f8 = f34, f34, f8 + + (p16) LDFD f35 = [X2], STRIDE + (p18) fma.d.s1 f9 = f37, f37, f9 + nop.b 0 + ;; + (p16) LDFD f38 = [X], INCX + (p18) fma.d.s1 f10 = f40, f40, f10 + nop.b 0 + (p16) LDFD f41 = [X2], INCX + (p18) fma.d.s1 f11 = f43, f43, f11 + nop.b 0 + ;; + (p16) LDFD f44 = [X], STRIDE + (p18) fma.d.s1 f12 = f46, f46, f12 + nop.b 0 + (p16) LDFD f47 = [X2], STRIDE + (p18) fma.d.s1 f13 = f49, f49, f13 + nop.b 0 + ;; + (p16) LDFD f50 = [X], INCX5 + (p18) fma.d.s1 f14 = f52, f52, f14 + nop.b 0 + (p16) LDFD f53 = [X2], INCX5 + (p18) fma.d.s1 f15 = f55, f55, f15 + nop.b 0 + ;; + (p16) LDFD f56 = [X], STRIDE + (p18) fma.d.s1 f8 = f58, f58, f8 + nop.b 0 + (p16) LDFD f59 = [X2], STRIDE + (p18) fma.d.s1 f9 = f61, f61, f9 + nop.b 0 + ;; + (p16) LDFD f62 = [X], INCX + (p18) fma.d.s1 f10 = f64, f64, f10 + nop.b 0 + (p16) LDFD f65 = [X2], INCX + (p18) fma.d.s1 f11 = f67, f67, f11 + nop.b 0 + ;; + (p16) LDFD f68 = [X], STRIDE + (p18) fma.d.s1 f12 = f70, f70, f12 + nop.b 0 + (p16) LDFD f71 = [X2], STRIDE + (p18) fma.d.s1 f13 = f73, f73, f13 + nop.b 0 + ;; + (p16) LDFD f74 = [X], INCX5 + (p18) fma.d.s1 f14 = f76, f76, f14 + nop.b 0 + (p16) LDFD f77 = [X2], INCX5 + (p18) fma.d.s1 f15 = f79, f79, f15 + br.ctop.sptk.few .L51 + ;; + .align 32 + +.L52: + { .mmb + (p12) LDFD f32 = [X], STRIDE + (p12) LDFD f33 = [X2], STRIDE + (p9) br.cond.dptk .L998 + } + ;; + { .mmi + (p12) LDFD f34 = [X], INCX + (p12) LDFD f35 = [X2], INCX + tbit.z p0, p13 = N, (2 - COMPADD) + } + ;; + { .mmi + (p12) LDFD f36 = [X], STRIDE + (p12) LDFD f37 = [X2], STRIDE + tbit.z p0, p14 = N, (1 - COMPADD) + } + ;; + { .mmi + (p12) LDFD f38 = [X], INCX5 + (p12) LDFD f39 = [X2], INCX5 +#ifndef COMPLEX + tbit.z p0, p15 = N, 0 +#endif + } + ;; + (p13) LDFD f40 = [X], STRIDE + (p12) fma.d.s1 f8 = f32, f32, f8 + (p12) fma.d.s1 f9 = f33, f33, f9 + ;; + (p13) LDFD f41 = [X], INCX + (p12) fma.d.s1 f10 = f34, f34, f10 + (p12) fma.d.s1 f11 = f35, f35, f11 + ;; + (p13) LDFD f42 = [X], STRIDE + (p12) fma.d.s1 f12 = f36, f36, f12 + (p12) fma.d.s1 f13 = f37, f37, f13 + ;; + (p13) LDFD f43 = [X], INCX + (p12) fma.d.s1 f14 = f38, f38, f14 + (p12) fma.d.s1 f15 = f39, f39, f15 + ;; + (p14) LDFD f44 = [X], STRIDE + (p13) fma.d.s1 f8 = f40, f40, f8 + (p13) fma.d.s1 f9 = f41, f41, f9 + ;; + (p14) LDFD f45 = [X], INCX + (p13) fma.d.s1 f10 = f42, f42, f10 + (p13) fma.d.s1 f11 = f43, f43, f11 + ;; +#ifndef COMPLEX + (p15) LDFD f46 = [X] +#endif + (p14) fma.d.s1 f12 = f44, f44, f12 + (p14) fma.d.s1 f13 = f45, f45, f13 + ;; +#ifndef COMPLEX + (p15) fma.d.s1 f14 = f46, f46, f14 + ;; +#endif + .align 32 + +.L998: + { .mmf + fadd.d.s1 f8 = f8, f9 + } + { .mmf + fadd.d.s1 f10 = f10, f11 + } + { .mmf + fadd.d.s1 f12 = f12, f13 + } + { .mfi + fadd.d.s1 f14 = f14, f15 + mov ar.lc = ARLC + } + ;; + { .mmf + fadd.d.s1 f8 = f8, f10 + } + { .mfi + fadd.d.s1 f12 = f12, f14 + mov pr = PR, -65474 + } + ;; + { .mfb + fadd.d.s1 f8 = f8, f12 + br sqrt + } + ;; + EPILOGUE + + .section .data + .type sqrt, @function + .global sqrt diff --git a/kernel/ia64/qaxpy.S b/kernel/ia64/qaxpy.S new file mode 100644 index 0000000..2acb86b --- /dev/null +++ b/kernel/ia64/qaxpy.S @@ -0,0 +1,509 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE (8 * 16) + +#define N r32 +#define X1 r38 +#define INCX r39 +#define Y1 r33 +#define INCY r34 + +#define PRE1 r2 +#define PRE2 r3 + +#define I r14 +#define J r15 +#define X2 r16 +#define Y2 r17 +#define X3 r18 +#define Y3 r19 +#define X4 r20 +#define Y4 r21 + +#define YY1 r22 +#define YY2 r23 +#define YY3 r24 +#define YY4 r25 + +#define INCX4 r8 +#define INCY4 r9 +#define INCX2 r10 +#define INCY2 r11 + +#define INCX8 r26 +#define INCY8 r27 + +#define PR r30 +#define ARLC r31 + +#define ALPHA f8 +#define SP r12 + + PROLOGUE + .prologue + PROFCODE + + { .mmi + adds r8 = 16, SP + adds r9 = 24, SP + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mmb + adds PRE1 = (PREFETCHSIZE + 2) * SIZE, X1 + cmp.lt p0, p6 = r0, N + (p6) br.ret.sptk.many b0 + } + ;; + { .mmi + ld8 Y1 = [r8] + ld8 INCY = [r9] + mov PR = pr + } + ;; + .body + { .mmi + shladd INCX = INCX, BASE_SHIFT, r0 + shladd INCY = INCY, BASE_SHIFT, r0 + mov pr.rot = 0 + } + ;; + { .mmi + shladd INCX4 = INCX, 2, r0 + shladd INCY4 = INCY, 2, r0 + mov ar.ec = 3 + } + { .mmi + shladd INCX8 = INCX, 3, r0 + shladd INCY8 = INCY, 3, r0 + shr I = N, 4 + } + ;; + { .mmi + add X2 = INCX, X1 + add Y2 = INCY, Y1 + add YY2 = INCY, Y1 + } + ;; + { .mmi + shladd X3 = INCX, 1, X1 + shladd Y3 = INCY, 1, Y1 + shladd YY3 = INCY, 1, Y1 + } + { .mmi + shladd X4 = INCX, 1, X2 + shladd Y4 = INCY, 1, Y2 + shladd YY4 = INCY, 1, Y2 + } + ;; + { .mmi + cmp.eq p7 ,p0 = 0, I + adds I = -1, I + mov YY1 = Y1 + } + { .mmi + and r28 = 127, Y1 + and PRE1 = -128, PRE1 + cmp.eq p16, p0 = r0, r0 + } + ;; + { .mmi + adds PRE2 = (PREFETCHSIZE + 2) * SIZE, Y1 + or PRE1 = PRE1, r28 + mov ar.lc = I + } + { .mib + and J = 15, N + tbit.z p0, p12 = N, 3 + (p7) br.cond.dpnt .L115 + } + ;; + .align 32 + +.L112: + { .mmf + (p18) STFD [YY1] = f6 + (p18) STFD [YY2] = f7 + (p18) FMA f6 = ALPHA, f58, f106 + } + { .mmf + (p16) lfetch.excl.nt1 [PRE2], INCY8 + nop __LINE__ + (p18) FMA f7 = ALPHA, f61, f109 + } + ;; + { .mmf + (p18) STFD [YY3] = f10 + (p18) STFD [YY4] = f11 + (p18) FMA f10 = ALPHA, f64, f112 + } + { .mmf + (p16) lfetch.nt1 [PRE1], INCX8 + nop __LINE__ + (p18) FMA f11 = ALPHA, f67, f115 + } + ;; + { .mmi + (p16) LDFD f32 = [X1], INCX4 + (p16) LDFD f35 = [X2], INCX4 + (p18) add YY1 = INCY4, YY1 + } + { .mmi + (p16) LDFD f38 = [X3], INCX4 + (p16) LDFD f41 = [X4], INCX4 + (p18) add YY2 = INCY4, YY2 + } + ;; + { .mmi + (p17) LDFD f117 = [Y1], INCY4 + (p17) LDFD f120 = [Y2], INCY4 + (p18) add YY3 = INCY4, YY3 + } + { .mmi + (p17) LDFD f123 = [Y3], INCY4 + (p17) LDFD f126 = [Y4], INCY4 + (p18) add YY4 = INCY4, YY4 + } + ;; + { .mmf + (p18) STFD [YY1] = f12 + (p18) STFD [YY2] = f13 + (p18) FMA f12 = ALPHA, f70, f118 + } + { .mmf + (p18) add YY1 = INCY4, YY1 + (p18) add YY2 = INCY4, YY2 + (p18) FMA f13 = ALPHA, f73, f121 + } + ;; + { .mmf + (p18) STFD [YY3] = f14 + (p18) STFD [YY4] = f15 + (p18) FMA f14 = ALPHA, f76, f124 + } + { .mmf + (p18) add YY3 = INCY4, YY3 + (p18) add YY4 = INCY4, YY4 + (p18) FMA f15 = ALPHA, f79, f127 + } + ;; + { .mmi + (p16) LDFD f44 = [X1], INCX4 + (p16) LDFD f47 = [X2], INCX4 + nop __LINE__ + } + { .mmi + (p16) LDFD f50 = [X3], INCX4 + (p16) LDFD f53 = [X4], INCX4 + nop __LINE__ + } + ;; + { .mmi + (p16) LDFD f80 = [Y1], INCY4 + (p16) LDFD f83 = [Y2], INCY4 + nop __LINE__ + } + { .mmi + (p16) LDFD f86 = [Y3], INCY4 + (p16) LDFD f89 = [Y4], INCY4 + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YY1] = f6 + (p18) STFD [YY2] = f7 + (p17) FMA f6 = ALPHA, f33, f81 + } + { .mmf + (p16) lfetch.excl.nt1 [PRE2], INCY8 + nop __LINE__ + (p17) FMA f7 = ALPHA, f36, f84 + } + ;; + { .mmf + (p18) STFD [YY3] = f10 + (p18) STFD [YY4] = f11 + (p17) FMA f10 = ALPHA, f39, f87 + } + { .mmf + (p16) lfetch.nt1 [PRE1], INCX8 + nop __LINE__ + (p17) FMA f11 = ALPHA, f42, f90 + } + ;; + { .mmi + (p16) LDFD f56 = [X1], INCX4 + (p16) LDFD f59 = [X2], INCX4 + (p18) add YY1 = INCY4, YY1 + } + { .mmi + (p16) LDFD f62 = [X3], INCX4 + (p16) LDFD f65 = [X4], INCX4 + (p18) add YY2 = INCY4, YY2 + } + ;; + { .mmi + (p16) LDFD f92 = [Y1], INCY4 + (p16) LDFD f95 = [Y2], INCY4 + (p18) add YY3 = INCY4, YY3 + } + { .mmi + (p16) LDFD f98 = [Y3], INCY4 + (p16) LDFD f101 = [Y4], INCY4 + (p18) add YY4 = INCY4, YY4 + } + ;; + { .mmf + (p18) STFD [YY1] = f12 + (p18) STFD [YY2] = f13 + (p17) FMA f12 = ALPHA, f45, f93 + } + { .mmf + (p18) add YY1 = INCY4, YY1 + (p18) add YY2 = INCY4, YY2 + (p17) FMA f13 = ALPHA, f48, f96 + } + ;; + { .mmf + (p18) STFD [YY3] = f14 + (p18) STFD [YY4] = f15 + (p17) FMA f14 = ALPHA, f51, f99 + } + { .mmf + (p18) add YY3 = INCY4, YY3 + (p18) add YY4 = INCY4, YY4 + (p17) FMA f15 = ALPHA, f54, f102 + } + ;; + { .mmi + (p16) LDFD f68 = [X1], INCX4 + (p16) LDFD f71 = [X2], INCX4 + nop __LINE__ + } + { .mmi + (p16) LDFD f74 = [X3], INCX4 + (p16) LDFD f77 = [X4], INCX4 + nop __LINE__ + } + ;; + { .mmi + (p16) LDFD f104 = [Y1], INCY4 + (p16) LDFD f107 = [Y2], INCY4 + nop __LINE__ + } + { .mmb + (p16) LDFD f110 = [Y3], INCY4 + (p16) LDFD f113 = [Y4], INCY4 + br.ctop.sptk.few .L112 + } + ;; + .align 32 + +.L115: + { .mmi + (p12) LDFD f32 = [X1], INCX4 + (p12) LDFD f33 = [X2], INCX4 + mov pr = PR, -65474 + } + { .mmi + (p12) LDFD f34 = [X3], INCX4 + (p12) LDFD f35 = [X4], INCX4 + cmp.eq p9, p0 = r0, J + } + ;; + { .mmi + (p12) LDFD f64 = [Y1], INCY4 + (p12) LDFD f65 = [Y2], INCY4 + mov ar.lc = ARLC + } + { .mmb + (p12) LDFD f66 = [Y3], INCY4 + (p12) LDFD f67 = [Y4], INCY4 + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFD f36 = [X1], INCX4 + (p12) LDFD f37 = [X2], INCX4 + tbit.z p0, p13 = N, 2 + } + { .mmi + (p12) LDFD f38 = [X3], INCX4 + (p12) LDFD f39 = [X4], INCX4 + tbit.z p0, p14 = N, 1 + } + ;; + { .mmi + (p12) LDFD f68 = [Y1], INCY4 + (p12) LDFD f69 = [Y2], INCY4 + tbit.z p0, p15 = N, 0 + } + { .mmi + (p12) LDFD f70 = [Y3], INCY4 + (p12) LDFD f71 = [Y4], INCY4 + nop __LINE__ + } + ;; + { .mmi + (p13) LDFD f40 = [X1], INCX4 + (p13) LDFD f41 = [X2], INCX4 + shladd INCX2 = INCX, 1, r0 + } + { .mmi + (p13) LDFD f42 = [X3], INCX4 + (p13) LDFD f43 = [X4], INCX4 + shladd INCY2 = INCY, 1, r0 + } + ;; + { .mmi + (p13) LDFD f72 = [Y1], INCY4 + (p13) LDFD f73 = [Y2], INCY4 + nop __LINE__ + } + { .mmi + (p13) LDFD f74 = [Y3], INCY4 + (p13) LDFD f75 = [Y4], INCY4 + nop __LINE__ + } + ;; + { .mmi + (p14) LDFD f44 = [X1], INCX2 + (p14) LDFD f45 = [X2], INCX2 + nop __LINE__ + } + ;; + { .mmi + (p14) LDFD f76 = [Y1], INCY2 + (p14) LDFD f77 = [Y2], INCY2 + nop __LINE__ + } + ;; + { .mmi + (p15) LDFD f46 = [X1] + (p15) LDFD f78 = [Y1] + nop __LINE__ + } + ;; + (p12) FMA f32 = ALPHA, f32, f64 + (p12) FMA f33 = ALPHA, f33, f65 + (p12) FMA f34 = ALPHA, f34, f66 + (p12) FMA f35 = ALPHA, f35, f67 + (p12) FMA f36 = ALPHA, f36, f68 + (p12) FMA f37 = ALPHA, f37, f69 + (p12) FMA f38 = ALPHA, f38, f70 + (p12) FMA f39 = ALPHA, f39, f71 + ;; + { .mmf + (p12) STFD [YY1] = f32 + (p12) STFD [YY2] = f33 + (p13) FMA f40 = ALPHA, f40, f72 + } + { .mmf + (p12) add YY1 = INCY4, YY1 + (p12) add YY2 = INCY4, YY2 + (p13) FMA f41 = ALPHA, f41, f73 + } + ;; + { .mmf + (p12) STFD [YY3] = f34 + (p12) STFD [YY4] = f35 + (p13) FMA f42 = ALPHA, f42, f74 + } + { .mmf + (p12) add YY3 = INCY4, YY3 + (p12) add YY4 = INCY4, YY4 + (p13) FMA f43 = ALPHA, f43, f75 + } + ;; + { .mmf + (p12) STFD [YY1] = f36 + (p12) STFD [YY2] = f37 + (p14) FMA f44 = ALPHA, f44, f76 + } + { .mmf + (p12) add YY1 = INCY4, YY1 + (p12) add YY2 = INCY4, YY2 + (p14) FMA f45 = ALPHA, f45, f77 + } + ;; + { .mmf + (p12) STFD [YY3] = f38 + (p12) STFD [YY4] = f39 + (p15) FMA f46 = ALPHA, f46, f78 + } + { .mmi + (p12) add YY3 = INCY4, YY3 + (p12) add YY4 = INCY4, YY4 + nop __LINE__ + } + ;; + { .mmi + (p13) STFD [YY1] = f40 + (p13) STFD [YY2] = f41 + nop __LINE__ + } + { .mmi + (p13) add YY1 = INCY4, YY1 + (p13) add YY2 = INCY4, YY2 + nop __LINE__ + } + ;; + { .mmi + (p13) STFD [YY3] = f42 + (p13) STFD [YY4] = f43 + nop __LINE__ + } + ;; + { .mmi + (p14) STFD [YY1] = f44 + (p14) STFD [YY2] = f45 + (p14) add YY1 = INCY2, YY1 + } + ;; + { .mmb + (p15) STFD [YY1] = f46 + nop __LINE__ + br.ret.sptk.many b0 + } + ;; + EPILOGUE + diff --git a/kernel/ia64/qcopy.S b/kernel/ia64/qcopy.S new file mode 100644 index 0000000..9200470 --- /dev/null +++ b/kernel/ia64/qcopy.S @@ -0,0 +1,581 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r32 +#define X1 r33 +#define INCX r34 +#define Y1 r35 +#define INCY r36 + +#define PREX r2 +#define PREY r3 + +#define I r14 +#define J r15 + +#define X2 r16 +#define Y2 r17 +#define INCX2 r18 +#define INCY2 r19 +#define INCX8 r20 +#define INCY8 r21 +#define PR r30 +#define ARLC r31 + +#define PREFETCH_SIZE (8 * 16) + + PROLOGUE + .prologue + PROFCODE + { .mmi + shladd INCX = INCX, BASE_SHIFT, r0 + shladd INCY = INCY, BASE_SHIFT, r0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mib + cmp.lt p0, p6 = r0, N + shr I = N, 4 + (p6) br.ret.sptk.many b0 + } + ;; + .body + { .mmi + sub r8 = X1, Y1 + mov r9 = 0xf0 + mov PR = pr + } + { .mmi + shladd INCX2 = INCX, 1, r0 + shladd INCY2 = INCY, 1, r0 + and J = 15, N + } + ;; + { .mmi + shladd INCX8 = INCX, 3, r0 + shladd INCY8 = INCY, 3, r0 + mov pr.rot = 0 + } + { .mmi + and r8 = r9, r8 + cmp.eq p9, p0 = r0, J + adds I = -1, I + } + ;; + { .mmi + add X2 = X1, INCX + add Y2 = Y1, INCY + mov ar.ec = 4 + } + { .mmb + cmp.gt p6, p0 = 127, r8 + cmp.eq p16, p0 = r0, r0 + (p6) br.cond.dpnt .L20 + } + ;; + { .mmi + adds PREX = (PREFETCH_SIZE + 0) * SIZE, X1 + adds PREY = (PREFETCH_SIZE + 2) * SIZE, Y1 + mov ar.lc = I + } + { .mib + cmp.eq p8 ,p0 = -1, I + tbit.z p0, p12 = N, 3 + (p8) br.cond.dpnt .L15 + } + ;; + .align 16 + +.L12: + { .mmi + (p19) STFD [Y1] = f35 + (p19) STFD [Y2] = f39 + (p19) add Y1 = INCY2, Y1 + } + { .mmi + (p17) LDFD f81 = [X1], INCX2 + (p17) LDFD f85 = [X2], INCX2 + (p19) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f43 + (p19) STFD [Y2] = f47 + (p19) add Y1 = INCY2, Y1 + } + { .mmi + (p17) LDFD f89 = [X1], INCX2 + (p17) LDFD f93 = [X2], INCX2 + (p19) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f51 + (p19) STFD [Y2] = f55 + (p19) add Y1 = INCY2, Y1 + } + { .mmi + (p16) LDFD f32 = [X1], INCX2 + (p16) LDFD f36 = [X2], INCX2 + (p19) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f59 + (p19) STFD [Y2] = f63 + (p19) add Y1 = INCY2, Y1 + } + { .mmi + lfetch.fault.nt1 [PREX], INCX8 + lfetch.fault.excl.nt1 [PREY], INCY8 + (p19) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p16) LDFD f40 = [X1], INCX2 + (p16) LDFD f44 = [X2], INCX2 + nop __LINE__ + } + ;; + { .mmi + (p19) STFD [Y1] = f67 + (p19) STFD [Y2] = f71 + (p19) add Y1 = INCY2, Y1 + } + { .mmi + (p16) LDFD f48 = [X1], INCX2 + (p16) LDFD f52 = [X2], INCX2 + (p19) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f75 + (p19) STFD [Y2] = f79 + (p19) add Y1 = INCY2, Y1 + } + { .mmi + (p16) LDFD f56 = [X1], INCX2 + (p16) LDFD f60 = [X2], INCX2 + (p19) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f83 + (p19) STFD [Y2] = f87 + (p19) add Y1 = INCY2, Y1 + } + { .mmi + lfetch.fault.nt1 [PREX], INCX8 + lfetch.fault.excl.nt1 [PREY], INCY8 + (p19) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f91 + (p19) STFD [Y2] = f95 + (p19) add Y1 = INCY2, Y1 + } + { .mmi + (p16) LDFD f64 = [X1], INCX2 + (p16) LDFD f68 = [X2], INCX2 + (p19) add Y2 = INCY2, Y2 + } + ;; + { .mmb + (p16) LDFD f72 = [X1], INCX2 + (p16) LDFD f76 = [X2], INCX2 + br.ctop.sptk.few .L12 + } + ;; + .align 32 + +.L15: + { .mmi + (p12) LDFD f48 = [X1], INCX2 + (p12) LDFD f49 = [X2], INCX2 + mov ar.lc = ARLC + } + ;; + { .mmi + (p12) LDFD f50 = [X1], INCX2 + (p12) LDFD f51 = [X2], INCX2 + mov pr = PR, -65474 + } + ;; + { .mmb + (p12) LDFD f52 = [X1], INCX2 + (p12) LDFD f53 = [X2], INCX2 + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFD f54 = [X1], INCX2 + (p12) LDFD f55 = [X2], INCX2 + tbit.z p0, p13 = N, 2 + } + ;; + { .mmi + (p13) LDFD f56 = [X1], INCX2 + (p13) LDFD f57 = [X2], INCX2 + tbit.z p0, p14 = N, 1 + } + ;; + { .mmi + (p13) LDFD f58 = [X1], INCX2 + (p13) LDFD f59 = [X2], INCX2 + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p12) STFD [Y1] = f48 + (p12) STFD [Y2] = f49 + (p12) add Y1 = INCY2, Y1 + } + { .mmi + (p14) LDFD f60 = [X1], INCX2 + (p14) LDFD f61 = [X2], INCX2 + (p12) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f50 + (p12) STFD [Y2] = f51 + (p12) add Y1 = INCY2, Y1 + } + { .mmi + (p15) LDFD f62 = [X1] + nop __LINE__ + (p12) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f52 + (p12) STFD [Y2] = f53 + (p12) add Y1 = INCY2, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p12) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f54 + (p12) STFD [Y2] = f55 + (p12) add Y1 = INCY2, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p12) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p13) STFD [Y1] = f56 + (p13) STFD [Y2] = f57 + (p13) add Y1 = INCY2, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p13) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p13) STFD [Y1] = f58 + (p13) STFD [Y2] = f59 + (p13) add Y1 = INCY2, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p13) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p14) STFD [Y1] = f60 + (p14) STFD [Y2] = f61 + (p14) add Y1 = INCY2, Y1 + } + ;; + { .mmb + (p15) STFD [Y1] = f62 + nop __LINE__ + br.ret.sptk.many b0 + } + ;; + .align 16 + +.L20: + { .mmi + adds PREX = (PREFETCH_SIZE + 0) * SIZE, X1 + adds PREY = (PREFETCH_SIZE + 10) * SIZE, Y1 + mov ar.lc = I + } + { .mib + cmp.eq p8 ,p0 = -1, I + tbit.z p0, p12 = N, 3 + (p8) br.cond.dpnt .L25 + } + ;; + .align 16 + +.L22: + { .mmi + (p19) STFD [Y1] = f67 + (p19) STFD [Y2] = f71 + (p19) add Y1 = INCY2, Y1 + } + { .mmi + (p17) LDFD f81 = [X1], INCX2 + (p17) LDFD f85 = [X2], INCX2 + (p19) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f75 + (p19) STFD [Y2] = f79 + (p19) add Y1 = INCY2, Y1 + } + { .mmi + (p17) LDFD f89 = [X1], INCX2 + (p17) LDFD f93 = [X2], INCX2 + (p19) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f83 + (p19) STFD [Y2] = f87 + (p19) add Y1 = INCY2, Y1 + } + { .mmi + (p16) LDFD f32 = [X1], INCX2 + (p16) LDFD f36 = [X2], INCX2 + (p19) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f91 + (p19) STFD [Y2] = f95 + (p19) add Y1 = INCY2, Y1 + } + { .mmi + lfetch.fault.nt1 [PREX], INCX8 + lfetch.fault.excl.nt1 [PREY], INCY8 + (p19) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p16) LDFD f40 = [X1], INCX2 + (p16) LDFD f44 = [X2], INCX2 + nop __LINE__ + } + ;; + { .mmi + (p18) STFD [Y1] = f34 + (p18) STFD [Y2] = f38 + (p18) add Y1 = INCY2, Y1 + } + { .mmi + (p16) LDFD f48 = [X1], INCX2 + (p16) LDFD f52 = [X2], INCX2 + (p18) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p18) STFD [Y1] = f42 + (p18) STFD [Y2] = f46 + (p18) add Y1 = INCY2, Y1 + } + { .mmi + (p16) LDFD f56 = [X1], INCX2 + (p16) LDFD f60 = [X2], INCX2 + (p18) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p18) STFD [Y1] = f50 + (p18) STFD [Y2] = f54 + (p18) add Y1 = INCY2, Y1 + } + { .mmi + lfetch.fault.nt1 [PREX], INCX8 + lfetch.fault.excl.nt1 [PREY], INCY8 + (p18) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p18) STFD [Y1] = f58 + (p18) STFD [Y2] = f62 + (p18) add Y1 = INCY2, Y1 + } + { .mmi + (p16) LDFD f64 = [X1], INCX2 + (p16) LDFD f68 = [X2], INCX2 + (p18) add Y2 = INCY2, Y2 + } + ;; + { .mmb + (p16) LDFD f72 = [X1], INCX2 + (p16) LDFD f76 = [X2], INCX2 + br.ctop.sptk.few .L22 + } + ;; + .align 32 + +.L25: + { .mmi + (p12) LDFD f48 = [X1], INCX2 + (p12) LDFD f49 = [X2], INCX2 + mov ar.lc = ARLC + } + ;; + { .mmi + (p12) LDFD f50 = [X1], INCX2 + (p12) LDFD f51 = [X2], INCX2 + mov pr = PR, -65474 + } + ;; + { .mmb + (p12) LDFD f52 = [X1], INCX2 + (p12) LDFD f53 = [X2], INCX2 + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFD f54 = [X1], INCX2 + (p12) LDFD f55 = [X2], INCX2 + tbit.z p0, p13 = N, 2 + } + ;; + { .mmi + (p13) LDFD f56 = [X1], INCX2 + (p13) LDFD f57 = [X2], INCX2 + tbit.z p0, p14 = N, 1 + } + ;; + { .mmi + (p13) LDFD f58 = [X1], INCX2 + (p13) LDFD f59 = [X2], INCX2 + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p12) STFD [Y1] = f48 + (p12) STFD [Y2] = f49 + (p12) add Y1 = INCY2, Y1 + } + { .mmi + (p14) LDFD f60 = [X1], INCX2 + (p14) LDFD f61 = [X2], INCX2 + (p12) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f50 + (p12) STFD [Y2] = f51 + (p12) add Y1 = INCY2, Y1 + } + { .mmi + (p15) LDFD f62 = [X1] + nop __LINE__ + (p12) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f52 + (p12) STFD [Y2] = f53 + (p12) add Y1 = INCY2, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p12) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f54 + (p12) STFD [Y2] = f55 + (p12) add Y1 = INCY2, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p12) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p13) STFD [Y1] = f56 + (p13) STFD [Y2] = f57 + (p13) add Y1 = INCY2, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p13) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p13) STFD [Y1] = f58 + (p13) STFD [Y2] = f59 + (p13) add Y1 = INCY2, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p13) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p14) STFD [Y1] = f60 + (p14) STFD [Y2] = f61 + (p14) add Y1 = INCY2, Y1 + } + ;; + { .mmb + (p15) STFD [Y1] = f62 + nop __LINE__ + br.ret.sptk.many b0 + } + ;; + + EPILOGUE + diff --git a/kernel/ia64/qdot.S b/kernel/ia64/qdot.S new file mode 100644 index 0000000..ff3f93b --- /dev/null +++ b/kernel/ia64/qdot.S @@ -0,0 +1,421 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCH_SIZE (8 * 24) + +#define N r32 +#define X1 r33 +#define INCX r34 +#define Y1 r35 +#define INCY r36 + +#define PREX1 r2 +#define PREY1 r3 + +#define I r14 +#define J r15 +#define Y2 r16 +#define X2 r17 +#define Y3 r18 +#define X3 r19 +#define Y4 r20 +#define X4 r21 + +#define INCX2 r22 +#define INCY2 r23 + +#define INCX4 r24 +#define INCY4 r25 +#define INCX16 r26 +#define INCY16 r27 + +#define PREX2 r28 +#define PREY2 r29 + +#define PR r30 +#define ARLC r31 + + PROLOGUE + .prologue + PROFCODE + { .mfi + nop __LINE__ + mov f8 = f0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mfi + mov r26 = 1 + mov f9 = f0 + nop __LINE__ + } + ;; + .body +#ifdef F_INTERFACE + LDINT N = [N] + LDINT INCX = [INCX] + LDINT INCY = [INCY] + ;; +#ifndef USE64BITINT + sxt4 N = N + sxt4 INCX = INCX + sxt4 INCY = INCY + ;; +#endif + cmp.le p0, p6 = r0, INCX + cmp.le p0, p7 = r0, INCY + sub r26 = r26, N + ;; + setf.sig f32 = r26 + setf.sig f33 = INCX + setf.sig f34 = INCY + ;; + xmpy.l f33 = f32, f33 + xmpy.l f34 = f32, f34 + ;; + getf.sig r26 = f33 + getf.sig r27 = f34 + ;; + (p6) shladd X1 = r26, BASE_SHIFT, X1 + (p7) shladd Y1 = r27, BASE_SHIFT, Y1 + ;; +#endif + { .mmi + adds PREX1 = (PREFETCH_SIZE + 2) * SIZE, X1 + adds PREY1 = (PREFETCH_SIZE + 2) * SIZE, Y1 + mov PR = pr + } + { .mib + cmp.lt p0, p6 = r0, N + shl INCX = INCX, BASE_SHIFT + (p6) br.ret.sptk.many b0 + } + ;; + { .mfi + add X2 = INCX, X1 + mov f10 = f0 + shl INCY = INCY, BASE_SHIFT + } + { .mmf + and r8 = 127, X1 + shladd X3 = INCX, 1, X1 + mov f11 = f0 + } + ;; + { .mmi + and PREY1 = -128, PREY1 + shladd X4 = INCX, 1, X2 + add INCX2 = INCX, INCX + } + { .mmi + shladd INCX4 = INCX, 2, r0 + add Y2 = INCY, Y1 + shladd Y3 = INCY, 1, Y1 + } + ;; + { .mmi + shladd Y4 = INCY, 1, Y2 + add INCY2 = INCY, INCY + nop __LINE__ + } + { .mmi + shladd INCY4 = INCY, 2, r0 + shladd INCX16 = INCX, 4, r0 + shladd INCY16 = INCY, 4, r0 + } + ;; + { .mfi + nop __LINE__ + mov f12 = f0 + mov pr.rot= 0 + } + { .mfi + or PREY1 = PREY1, r8 + mov f13 = f0 + shr I = N, 4 + } + ;; + { .mfi + adds I = -1, I + mov f14 = f0 + mov ar.ec= 3 + } + { .mmf + shladd PREX2 = INCX, 3, PREX1 + shladd PREY2 = INCY, 3, PREY1 + mov f15 = f0 + } + ;; + { .mmi + and J = 15, N + cmp.eq p16, p0 = r0, r0 + mov ar.lc = I + } + { .mib + cmp.eq p6 ,p0 = -1, I + tbit.nz p12, p0 = N, 3 + (p6) br.cond.dpnt .L215 + } + ;; + .align 32 + +.L212: + { .mmf + (p16) lfetch.nt1 [PREX1], INCX16 + (p16) lfetch.nt1 [PREX2], INCX16 + (p18) FMA f8 = f34, f82, f8 + } + { .mmf + (p16) LDFD f80 = [X1], INCX4 + (p16) LDFD f83 = [X2], INCX4 + (p18) FMA f9 = f37, f85, f9 + } + ;; + { .mmf + (p16) LDFD f86 = [X3], INCX4 + (p16) LDFD f89 = [X4], INCX4 + (p18) FMA f10 = f40, f88, f10 + } + { .mmf + (p16) LDFD f92 = [X1], INCX4 + (p16) LDFD f95 = [X2], INCX4 + (p18) FMA f11 = f43, f91, f11 + } + ;; + { .mmf + (p16) LDFD f32 = [Y1], INCY4 + (p16) LDFD f35 = [Y2], INCY4 + (p18) FMA f12 = f46, f94, f12 + } + { .mmf + (p16) LDFD f38 = [Y3], INCY4 + (p16) LDFD f41 = [Y4], INCY4 + (p18) FMA f13 = f49, f97, f13 + } + ;; + { .mmf + (p16) LDFD f98 = [X3], INCX4 + (p16) LDFD f101 = [X4], INCX4 + (p18) FMA f14 = f52, f100, f14 + } + { .mmf + (p16) LDFD f104 = [X1], INCX4 + (p16) LDFD f107 = [X2], INCX4 + (p18) FMA f15 = f55, f103, f15 + } + ;; + { .mmf + (p16) LDFD f44 = [Y1], INCY4 + (p16) LDFD f47 = [Y2], INCY4 + (p18) FMA f8 = f58, f106, f8 + } + { .mmf + (p16) LDFD f50 = [Y3], INCY4 + (p16) LDFD f53 = [Y4], INCY4 + (p18) FMA f9 = f61, f109, f9 + } + ;; + { .mmf + (p16) lfetch.nt1 [PREY1], INCY16 + (p16) lfetch.nt1 [PREY2], INCY16 + (p18) FMA f10 = f64, f112, f10 + } + { .mmf + (p16) LDFD f110 = [X3], INCX4 + (p16) LDFD f113 = [X4], INCX4 + (p18) FMA f11 = f67, f115, f11 + } + ;; + { .mmf + (p16) LDFD f56 = [Y1], INCY4 + (p16) LDFD f59 = [Y2], INCY4 + (p18) FMA f12 = f70, f118, f12 + } + { .mmf + (p16) LDFD f62 = [Y3], INCY4 + (p16) LDFD f65 = [Y4], INCY4 + (p18) FMA f13 = f73, f121, f13 + } + ;; + { .mmf + (p16) LDFD f116 = [X1], INCX4 + (p16) LDFD f119 = [X2], INCX4 + (p18) FMA f14 = f76, f124, f14 + } + { .mmf + (p16) LDFD f122 = [X3], INCX4 + (p16) LDFD f125 = [X4], INCX4 + (p18) FMA f15 = f79, f127, f15 + } + ;; + { .mmi + (p16) LDFD f68 = [Y1], INCY4 + (p16) LDFD f71 = [Y2], INCY4 + nop __LINE__ + } + { .mmb + (p16) LDFD f74 = [Y3], INCY4 + (p16) LDFD f77 = [Y4], INCY4 + br.ctop.sptk.few .L212 + } + ;; + .align 32 + +.L215: + { .mmi + (p12) LDFD f48 = [X1], INCX4 + (p12) LDFD f49 = [X2], INCX4 + cmp.eq p7, p0 = r0, J + } + { .mmb + (p12) LDFD f50 = [X3], INCX4 + (p12) LDFD f51 = [X4], INCX4 + (p7) br.cond.dptk .L999 + } + ;; + { .mmi + (p12) LDFD f32 = [Y1], INCY4 + (p12) LDFD f33 = [Y2], INCY4 + tbit.nz p13, p0 = N, 2 + } + { .mmi + (p12) LDFD f34 = [Y3], INCY4 + (p12) LDFD f35 = [Y4], INCY4 + nop __LINE__ + } + ;; + { .mmi + (p12) LDFD f52 = [X1], INCX4 + (p12) LDFD f53 = [X2], INCX4 + tbit.nz p14, p0 = N, 1 + } + { .mmi + (p12) LDFD f54 = [X3], INCX4 + (p12) LDFD f55 = [X4], INCX4 + nop __LINE__ + } + ;; + { .mmi + (p12) LDFD f36 = [Y1], INCY4 + (p12) LDFD f37 = [Y2], INCY4 + tbit.nz p15, p0 = N, 0 + } + { .mmi + (p12) LDFD f38 = [Y3], INCY4 + (p12) LDFD f39 = [Y4], INCY4 + nop __LINE__ + } + ;; + { .mmi + (p13) LDFD f56 = [X1], INCX4 + (p13) LDFD f57 = [X2], INCX4 + nop __LINE__ + } + { .mmi + (p13) LDFD f58 = [X3], INCX4 + (p13) LDFD f59 = [X4], INCX4 + nop __LINE__ + } + ;; + { .mmi + (p13) LDFD f40 = [Y1], INCY4 + (p13) LDFD f41 = [Y2], INCY4 + nop __LINE__ + } + { .mmi + (p13) LDFD f42 = [Y3], INCY4 + (p13) LDFD f43 = [Y4], INCY4 + nop __LINE__ + } + ;; + { .mmi + (p14) LDFD f60 = [X1], INCX2 + (p14) LDFD f61 = [X2], INCX2 + nop __LINE__ + } + { .mmi + (p14) LDFD f44 = [Y1], INCY2 + (p14) LDFD f45 = [Y2], INCY2 + nop __LINE__ + } + ;; + { .mmi + (p15) LDFD f62 = [X1] + (p15) LDFD f46 = [Y1] + nop __LINE__ + } + ;; + (p12) FMA f8 = f32, f48, f8 + (p12) FMA f9 = f33, f49, f9 + (p12) FMA f10 = f34, f50, f10 + (p12) FMA f11 = f35, f51, f11 + ;; + (p12) FMA f12 = f36, f52, f12 + (p12) FMA f13 = f37, f53, f13 + (p12) FMA f14 = f38, f54, f14 + (p12) FMA f15 = f39, f55, f15 + ;; + (p13) FMA f8 = f40, f56, f8 + (p13) FMA f9 = f41, f57, f9 + (p13) FMA f10 = f42, f58, f10 + (p13) FMA f11 = f43, f59, f11 + ;; + (p14) FMA f8 = f44, f60, f8 + (p14) FMA f9 = f45, f61, f9 + (p15) FMA f10 = f46, f62, f10 + ;; + .align 32 + +.L999: + FADD f8 = f8, f9 + FADD f10 = f10, f11 + FADD f12 = f12, f13 + FADD f14 = f14, f15 + ;; + FADD f8 = f8, f10 + FADD f12 = f12, f14 + mov ar.lc = ARLC + ;; + FADD f8 = f8, f12 + mov pr = PR, -65474 + br.ret.sptk.many b0 + EPILOGUE + diff --git a/kernel/ia64/qgemm_kernel.S b/kernel/ia64/qgemm_kernel.S new file mode 100644 index 0000000..3c9fb69 --- /dev/null +++ b/kernel/ia64/qgemm_kernel.S @@ -0,0 +1,8993 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE (8 * 16) + +#define CPREFETCHSIZE 7 +#define CPREFETCH lfetch.excl.nt2 + +#define M r32 +#define N r33 +#define K r34 +#define A r38 +#define B r39 +#define C r36 +#define LDC r37 + +#define I r15 +#define J r16 +#define AOFFSET r17 +#define BOFFSET r18 +#define TEMP r19 +#define L r20 + +#define C1 r21 +#define C2 r22 +#define C3 r23 +#define C4 r24 +#define C5 r25 +#define C6 r26 +#define C7 r27 +#define C8 r28 + +#define C9 loc0 +#define C10 loc1 +#define C11 loc2 +#define C12 loc3 +#define C13 loc4 +#define C14 loc5 +#define C15 loc6 +#define C16 loc7 + +#define PREA r8 +#define PREB r9 +#define PREC r10 +#define SP r12 +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#define ALPHA f8 + +#define AORIG loc8 +#define KK loc9 +#define KK8 loc10 +#define OFFSET loc11 + + PROLOGUE + .prologue + PROFCODE + + { .mmi + .save ar.pfs, ARPFS +#ifdef TRMMKERNEL + alloc ARPFS = ar.pfs, 8, 16, 0, 0 +#else + alloc ARPFS = ar.pfs, 8, 8, 0, 0 +#endif + adds r14 = 16, SP + mov ARLC = ar.lc + } + { .mmi + adds r8 = -16 * 16, SP + adds r9 = -15 * 16, SP + adds SP = -16 * 16, SP + } + ;; + stf.spill [r8] = f16, 32 + stf.spill [r9] = f17, 32 + mov PR = pr + ;; + stf.spill [r8] = f18, 32 + stf.spill [r9] = f19, 32 + ;; + stf.spill [r8] = f20, 32 + stf.spill [r9] = f21, 32 + shr J = N, 3 + ;; + stf.spill [r8] = f22, 32 + stf.spill [r9] = f23, 32 + mov AOFFSET = A + ;; + stf.spill [r8] = f24, 32 + stf.spill [r9] = f25, 32 + cmp.ge p6, p0 = 0, J + ;; + stf.spill [r8] = f26, 32 + stf.spill [r9] = f27, 32 + ;; + stf.spill [r8] = f28, 32 + stf.spill [r9] = f29, 32 + ;; + stf.spill [r8] = f30 + stf.spill [r9] = f31 + ld8 C = [r14], 8 + ;; + ld8 LDC = [r14], 8 + ;; + shladd LDC = LDC, BASE_SHIFT, r0 + ;; +#ifndef TRMMKERNEL + (p6) br.cond.dpnt .L050 + .body + ;; +#else + .body + ;; + ld8 OFFSET = [r14], 8 + ;; + +#if defined(TRMMKERNEL) && !defined(LEFT) + ;; + sub KK = r0, OFFSET +#endif + (p6) br.cond.dpnt .L050 + ;; +#endif + .align 32 + +.L010: + { .mfi + adds J = -1, J + mov f64 = f0 + shr I = M, 3 + } + { .mfi + mov C1 = C // coffset1 = c + 0 * ldc + mov f72 = f0 + } + ;; + { .mmf + cmp.eq p6, p7 = 0, I +#if defined(TRMMKERNEL) && defined(LEFT) + mov KK = OFFSET +#else + nop __LINE__ +#endif + mov f80 = f0 + } + { .mmf + add C2 = LDC, C // coffset2 = c + 1 * ldc + shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc + mov f88 = f0 + } + ;; + { .mmf + shladd C5 = LDC, 2, C // coffset5 = c + 4 * ldc + shladd C = LDC, 3, C // coffset += 8 * ldc + mov f96 = f0 + } + { .mmf + shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc + shladd C6 = LDC, 2, C2 // coffset6 = c + 5 * ldc + mov f104 = f0 + } + ;; + { .mfi + shladd C7 = LDC, 2, C3 // coffset7 = c + 6 * ldc + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + }{ .mfb + sub C8 = C, LDC // coffset8 = c + 7 * ldc + mov f120 = f0 + (p6) br.cond.dpnt .L020 + } + ;; + .align 16 + +.L011: +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + mov BOFFSET = B + ;; + + { .mfb + LDFD f48 = [BOFFSET], SIZE + mov f65 = f0 + nop __LINE__ + } + ;; + { .mfb + LDFD f49 = [BOFFSET], SIZE + mov f73 = f0 + nop __LINE__ + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 3, B + mov f65 = f0 + shladd AOFFSET = KK8, 3, AOFFSET + } + ;; + LDFD f48 = [BOFFSET], SIZE + ;; + { .mfi + LDFD f49 = [BOFFSET], SIZE + mov f73 = f0 + nop __LINE__ + } + ;; +#endif + LDFD f32 = [AOFFSET], SIZE + LDFD f50 = [BOFFSET], SIZE + ;; + + { .mfb + LDFD f33 = [AOFFSET], SIZE + mov f81 = f0 + nop __LINE__ + } + { .mfb + LDFD f51 = [BOFFSET], SIZE + mov f89 = f0 + nop __LINE__ + } + ;; + LDFD f52 = [BOFFSET], SIZE + ;; + { .mmf + LDFD f53 = [BOFFSET], SIZE + setf.d f97 = r0 + mov f105 = f0 + } + { .mfb + setf.d f113 = r0 + mov f121 = f0 + nop __LINE__ + } + ;; + LDFD f54 = [BOFFSET], SIZE + ;; + { .mmf + LDFD f55 = [BOFFSET], SIZE + setf.d f66 = r0 + mov f74 = f0 + } + { .mfb + setf.d f82 = r0 + mov f90 = f0 + nop __LINE__ + } + ;; + LDFD f34 = [AOFFSET], SIZE + ;; + { .mmf + LDFD f35 = [AOFFSET], SIZE + setf.d f98 = r0 + mov f106 = f0 + } + { .mfb + setf.d f114 = r0 + mov f122 = f0 + nop __LINE__ + } + ;; + LDFD f36 = [AOFFSET], SIZE + ;; + { .mmf + LDFD f37 = [AOFFSET], SIZE + setf.d f67 = r0 + mov f75 = f0 + } + { .mfi + setf.d f83 = r0 + mov f91 = f0 +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 8, KK +#else + adds L = 8, KK +#endif +#endif + } + ;; + LDFD f38 = [AOFFSET], SIZE + ;; + { .mmf + LDFD f39 = [AOFFSET], SIZE + setf.d f99 = r0 + mov f107 = f0 + } + { .mfi + setf.d f115 = r0 + mov f123 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f68 = r0 + mov f76 = f0 + } + { .mfi + setf.d f84 = r0 + mov f92 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f100 = r0 + mov f108 = f0 + } + { .mfi + setf.d f116 = r0 + mov f124 = f0 + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f69 = r0 + mov f77 = f0 + } + { .mfi + setf.d f85 = r0 + mov f93 = f0 + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f101 = r0 + mov f109 = f0 + } + { .mfi + setf.d f117 = r0 + mov f125 = f0 + tbit.z p12, p0 = L, 0 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f70 = r0 + mov f78 = f0 + } + { .mfi + setf.d f86 = r0 + mov f94 = f0 + shr L = L, 1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f102 = r0 + mov f110 = f0 + } + { .mfi + setf.d f118 = r0 + mov f126 = f0 + adds L = -1, L + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f71 = r0 + mov f79 = f0 + } + { .mfi + setf.d f87 = r0 + mov f95 = f0 + mov ar.lc = L + } + ;; + { .mmf + CPREFETCH [PREC] + setf.d f103 = r0 + mov f111 = f0 + } + { .mfi + setf.d f119 = r0 + mov f127 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + .align 16 + +.L012: +/* 1 */ + { .mfi + lfetch.fault.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; +/* 2 */ + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + cmp.ne p4, p5 = 0, L + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; +/* 3 */ + { .mfb + (p3) LDFD f40 = [AOFFSET], SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + adds C9 = 4 * SIZE, C1 + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; +/* 4 */ + { .mfi + (p3) LDFD f56 = [BOFFSET], SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + adds C10 = 4 * SIZE, C2 + } + { .mfb + (p3) LDFD f41 = [AOFFSET], SIZE + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; +/* 5 */ + { .mfi + (p3) LDFD f57 = [BOFFSET], SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C11 = 4 * SIZE, C3 + } + { .mfb + (p3) LDFD f42 = [AOFFSET], SIZE + FMA f73 = f33, f49, f73 // A2 * B2 + nop __LINE__ + } + ;; +/* 6 */ + { .mfi + (p3) LDFD f58 = [BOFFSET], SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + adds C12 = 4 * SIZE, C4 + } + { .mfb + (p3) LDFD f43 = [AOFFSET], SIZE + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; +/* 7 */ + { .mfi + (p3) LDFD f59 = [BOFFSET], SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + adds C13 = 4 * SIZE, C5 + } + { .mfb + (p3) LDFD f44 = [AOFFSET], SIZE + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; +/* 8 */ + { .mfi + (p3) LDFD f60 = [BOFFSET], SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + adds C14 = 4 * SIZE, C6 + } + { .mfb + (p3) LDFD f45 = [AOFFSET], SIZE + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; +/* 9 */ + { .mfi + (p3) LDFD f61 = [BOFFSET], SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + adds C15 = 4 * SIZE, C7 + } + { .mfb + (p3) LDFD f46 = [AOFFSET], SIZE + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; +/* 10 */ + { .mfi + (p3) LDFD f62 = [BOFFSET], SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + adds C16 = 4 * SIZE, C8 + } + { .mfb + (p3) LDFD f47 = [AOFFSET], SIZE + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; +/* 11 */ + { .mfb + (p3) LDFD f63 = [BOFFSET], SIZE + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f106 = f34, f53, f106 // A3 * B6 + nop __LINE__ + } + ;; +/* 12 */ + { .mfb + nop __LINE__ + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f122 = f34, f55, f122 // A3 * B8 + nop __LINE__ + } + ;; +/* 13 */ + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; +/* 14 */ + { .mfb + nop __LINE__ + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; +/* 15 */ + { .mfb + nop __LINE__ + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f107 = f35, f53, f107 // A4 * B6 + nop __LINE__ + } + ;; +/* 16 */ + { .mfb + nop __LINE__ + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f123 = f35, f55, f123 // A4 * B8 + nop __LINE__ + } + ;; +/* 17 */ + { .mfb + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; +/* 18 */ + { .mfb + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f92 = f36, f51, f92 // A5 * B4 + nop __LINE__ + } + ;; +/* 19 */ + { .mfb + nop __LINE__ + FMA f100 = f36, f52, f100 // A5 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f108 = f36, f53, f108 // A5 * B6 + nop __LINE__ + } + ;; +/* 20 */ + { .mfb + nop __LINE__ + FMA f116 = f36, f54, f116 // A5 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f124 = f36, f55, f124 // A5 * B8 + nop __LINE__ + } + ;; +/* 21 */ + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; +/* 22 */ + { .mfb + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f93 = f37, f51, f93 // A6 * B4 + nop __LINE__ + } + ;; +/* 23 */ + { .mfb + nop __LINE__ + FMA f101 = f37, f52, f101 // A6 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f109 = f37, f53, f109 // A6 * B6 + nop __LINE__ + } + ;; +/* 24 */ + { .mfb + nop __LINE__ + FMA f117 = f37, f54, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f125 = f37, f55, f125 // A6 * B8 + nop __LINE__ + } + ;; +/* 25 */ + { .mfb + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; +/* 26 */ + { .mfb + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f94 = f38, f51, f94 // A7 * B4 + nop __LINE__ + } + ;; +/* 27 */ + { .mfb + nop __LINE__ + FMA f102 = f38, f52, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f110 = f38, f53, f110 // A7 * B6 + nop __LINE__ + } + ;; +/* 28 */ + { .mfb + nop __LINE__ + FMA f118 = f38, f54, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f126 = f38, f55, f126 // A7 * B8 + nop __LINE__ + } + ;; +/* 29 */ + { .mfb + (p4) LDFD f32 = [AOFFSET], SIZE + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; +/* 30 */ + { .mfb + (p4) LDFD f33 = [AOFFSET], SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + (p4) LDFD f48 = [BOFFSET], SIZE + FMA f95 = f39, f51, f95 // A8 * B4 + nop __LINE__ + } + ;; +/* 31 */ + { .mfb + (p4) LDFD f34 = [AOFFSET], SIZE + FMA f103 = f39, f52, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + (p4) LDFD f49 = [BOFFSET], SIZE + FMA f111 = f39, f53, f111 // A8 * B6 + nop __LINE__ + } + ;; +/* 32 */ + { .mfb + lfetch.fault.nt1 [PREA], 8 * SIZE + FMA f119 = f39, f54, f119 // A8 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f127 = f39, f55, f127 // A8 * B8 + nop __LINE__ + } + ;; +/* 33 */ + { .mfb + lfetch.nt1 [PREB], 8 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; +/* 34 */ + { .mfb + (p4) LDFD f35 = [AOFFSET], SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + (p4) LDFD f50 = [BOFFSET], SIZE + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; +/* 35 */ + { .mfb + (p4) LDFD f36 = [AOFFSET], SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + (p4) LDFD f51 = [BOFFSET], SIZE + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; +/* 36 */ + { .mfb + (p4) LDFD f37 = [AOFFSET], SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + (p4) LDFD f52 = [BOFFSET], SIZE + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; +/* 37 */ + { .mfb + (p4) LDFD f38 = [AOFFSET], SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p4) LDFD f53 = [BOFFSET], SIZE + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; +/* 38 */ + { .mfb + (p4) LDFD f39 = [AOFFSET], SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + (p4) LDFD f54 = [BOFFSET], SIZE + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; +/* 39 */ + { .mfb + (p4) LDFD f55 = [BOFFSET], SIZE + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; +/* 40 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f6 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f7 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + nop __LINE__ + } + ;; + /* 41 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f10 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f11 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; +/* 42 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f12 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f13 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; +/* 43 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f14 = [C1 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f15 = [C9 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f106 = f42, f61, f106 // A3 * B6 + nop __LINE__ + } + ;; +/* 44 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f16 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f17 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f122 = f42, f63, f122 // A3 * B8 + nop __LINE__ + } + ;; +/* 45 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f18 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f19 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; +/* 46 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f20 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f21 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; +/* 47 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f22 = [C2 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f23 = [C10], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f107 = f43, f61, f107 // A4 * B6 + nop __LINE__ + } + ;; +/* 48 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f24 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f25 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f123 = f43, f63, f123 // A4 * B8 + nop __LINE__ + } + ;; +/* 49 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f26 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f27 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; +/* 50 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f28 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f29 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f92 = f44, f59, f92 // A5 * B4 + nop __LINE__ + } + ;; +/* 51 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f30 = [C3 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f100 = f44, f60, f100 // A5 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f31 = [C11], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f108 = f44, f61, f108 // A5 * B6 + nop __LINE__ + } + ;; +/* 52 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f32 = [C4 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f116 = f44, f62, f116 // A5 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f33 = [C12], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f124 = f44, f63, f124 // A5 * B8 + nop __LINE__ + } + ;; +/* 53 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f34 = [C4 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f35 = [C12], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; +/* 54 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f36 = [C4 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f37 = [C12], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f93 = f45, f59, f93 // A6 * B4 + nop __LINE__ + } + ;; +/* 55 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f38 = [C4 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f101 = f45, f60, f101 // A6 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f39 = [C12], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f109 = f45, f61, f109 // A6 * B6 + nop __LINE__ + } + ;; +/* 56 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f48 = [C5 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f117 = f45, f62, f117 // A6 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f49 = [C13], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f125 = f45, f63, f125 // A6 * B8 + nop __LINE__ + } + ;; +/* 57 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f50 = [C5 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f51 = [C13], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; +/* 58 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f52 = [C5 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f53 = [C13], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f94 = f46, f59, f94 // A7 * B4 + nop __LINE__ + } + ;; +/* 59 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f54 = [C5 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f102 = f46, f60, f102 // A7 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f55 = [C13], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f110 = f46, f61, f110 // A7 * B6 + nop __LINE__ + } + ;; +/* 60 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f40 = [C6 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f118 = f46, f62, f118 // A7 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f41 = [C14], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f126 = f46, f63, f126 // A7 * B8 + nop __LINE__ + } + ;; +/* 61 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f42 = [C6 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f43 = [C14], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + nop __LINE__ + } + ;; +/* 62 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f44 = [C6 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f45 = [C14], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f95 = f47, f59, f95 // A8 * B4 + nop __LINE__ + } + ;; +/* 63 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f59 = [C6 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f103 = f47, f60, f103 // A8 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f60 = [C14], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f111 = f47, f61, f111 // A8 * B6 + nop __LINE__ + } + ;; +/* 64 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f61 = [C7 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f119 = f47, f62, f119 // A8 * B7 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f62 = [C15], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f127 = f47, f63, f127 // A8 * B8 + br.cloop.sptk.few .L012 + } + ;; +.L013: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfi + (p5) LDFD f63 = [C7 ], SIZE + FMA f64 = ALPHA, f64, f6 + cmp.ne p6, p0 = 1, I + } + { .mfb + (p5) LDFD f6 = [C15], SIZE + FMA f68 = ALPHA, f68, f7 + nop __LINE__ + } + ;; + { .mfi + (p5) LDFD f7 = [C7 ], SIZE + FMA f65 = ALPHA, f65, f10 + adds I = -1, I + } + { .mfb + (p5) LDFD f10 = [C15], SIZE + FMA f69 = ALPHA, f69, f11 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f11 = [C7 ], -3 * SIZE + FMA f66 = ALPHA, f66, f12 + nop __LINE__ + } + { .mfb + (p5) LDFD f12 = [C15], -3 * SIZE + FMA f70 = ALPHA, f70, f13 + nop __LINE__ + } + ;; + { .mfb + LDFD f13 = [C8 ], SIZE + FMA f67 = ALPHA, f67, f14 + nop __LINE__ + } + { .mfb + LDFD f14 = [C16], SIZE + FMA f71 = ALPHA, f71, f15 + nop __LINE__ + } + ;; + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + FMA f72 = ALPHA, f72, f16 + } + { .mmf + LDFD f15 = [C8 ], SIZE + LDFD f16 = [C16], SIZE + FMA f76 = ALPHA, f76, f17 + } + ;; + { .mmf + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + FMA f73 = ALPHA, f73, f18 + } + { .mmf + LDFD f17 = [C8 ], SIZE + LDFD f18 = [C16], SIZE + FMA f77 = ALPHA, f77, f19 + } + ;; + { .mmf + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + FMA f74 = ALPHA, f74, f20 + } + { .mmf + LDFD f19 = [C8 ], -3 * SIZE + LDFD f20 = [C16], -3 * SIZE + FMA f78 = ALPHA, f78, f21 + } + ;; + { .mfb + STFD [C1 ] = f67, 5 * SIZE + FMA f75 = ALPHA, f75, f22 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f71, 5 * SIZE + FMA f79 = ALPHA, f79, f23 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + FMA f80 = ALPHA, f80, f24 + nop __LINE__ + } + { .mfb + STFD [C10] = f76, SIZE + FMA f84 = ALPHA, f84, f25 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, SIZE + FMA f81 = ALPHA, f81, f26 + nop __LINE__ + } + { .mfb + STFD [C10] = f77, SIZE + FMA f85 = ALPHA, f85, f27 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f74, SIZE + FMA f82 = ALPHA, f82, f28 + nop __LINE__ + } + { .mfb + STFD [C10] = f78, SIZE + FMA f86 = ALPHA, f86, f29 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f75, 5 * SIZE + FMA f83 = ALPHA, f83, f30 + nop __LINE__ + } + { .mfb + STFD [C10] = f79, 5 * SIZE + FMA f87 = ALPHA, f87, f31 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + FMA f88 = ALPHA, f88, f32 + nop __LINE__ + } + { .mfb + STFD [C11] = f84, SIZE + FMA f92 = ALPHA, f92, f33 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, SIZE + FMA f89 = ALPHA, f89, f34 + nop __LINE__ + } + { .mfb + STFD [C11] = f85, SIZE + FMA f93 = ALPHA, f93, f35 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f82, SIZE + FMA f90 = ALPHA, f90, f36 + nop __LINE__ + } + { .mfb + STFD [C11] = f86, SIZE + FMA f94 = ALPHA, f94, f37 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f83, 5 * SIZE + FMA f91 = ALPHA, f91, f38 + nop __LINE__ + } + { .mfb + STFD [C11] = f87, 5 * SIZE + FMA f95 = ALPHA, f95, f39 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f88, SIZE + FMA f96 = ALPHA, f96, f48 + nop __LINE__ + } + { .mfb + STFD [C12] = f92, SIZE + FMA f100 = ALPHA, f100, f49 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f89, SIZE + FMA f97 = ALPHA, f97, f50 + nop __LINE__ + } + { .mfb + STFD [C12] = f93, SIZE + FMA f101 = ALPHA, f101, f51 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f90, SIZE + FMA f98 = ALPHA, f98, f52 + nop __LINE__ + } + { .mfb + STFD [C12] = f94, SIZE + FMA f102 = ALPHA, f102, f53 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f91, 5 * SIZE + FMA f99 = ALPHA, f99, f54 + nop __LINE__ + } + { .mfb + STFD [C12] = f95, 5 * SIZE + FMA f103 = ALPHA, f103, f55 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f96, SIZE + FMA f104 = ALPHA, f104, f40 + nop __LINE__ + } + { .mfb + STFD [C13] = f100, SIZE + FMA f108 = ALPHA, f108, f41 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f97, SIZE + FMA f105 = ALPHA, f105, f42 + nop __LINE__ + } + { .mfb + STFD [C13] = f101, SIZE + FMA f109 = ALPHA, f109, f43 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f98, SIZE + FMA f106 = ALPHA, f106, f44 + nop __LINE__ + } + { .mfb + STFD [C13] = f102, SIZE + FMA f110 = ALPHA, f110, f45 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f99, 5 * SIZE + FMA f107 = ALPHA, f107, f59 + nop __LINE__ + } + { .mfb + STFD [C13] = f103, 5 * SIZE + FMA f111 = ALPHA, f111, f60 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f104, SIZE + FMA f112 = ALPHA, f112, f61 + nop __LINE__ + } + { .mfb + STFD [C14] = f108, SIZE + FMA f116 = ALPHA, f116, f62 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f105, SIZE + FMA f113 = ALPHA, f113, f63 + nop __LINE__ + } + { .mfb + STFD [C14] = f109, SIZE + FMA f117 = ALPHA, f117, f6 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f106, SIZE + FMA f114 = ALPHA, f114, f7 + nop __LINE__ + } + { .mfb + STFD [C14] = f110, SIZE + FMA f118 = ALPHA, f118, f10 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f107, 5 * SIZE + FMA f115 = ALPHA, f115, f11 + nop __LINE__ + } + { .mfb + STFD [C14] = f111, 5 * SIZE + FMA f119 = ALPHA, f119, f12 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f112, SIZE + FMA f120 = ALPHA, f120, f13 + nop __LINE__ + } + { .mfb + STFD [C15] = f116, SIZE + FMA f124 = ALPHA, f124, f14 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f113, SIZE + FMA f121 = ALPHA, f121, f15 + nop __LINE__ + } + { .mfb + STFD [C15] = f117, SIZE + FMA f125 = ALPHA, f125, f16 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f114, SIZE + FMA f122 = ALPHA, f122, f17 + nop __LINE__ + } + { .mfb + STFD [C15] = f118, SIZE + FMA f126 = ALPHA, f126, f18 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f115, 5 * SIZE + FMA f123 = ALPHA, f123, f19 + nop __LINE__ + } + { .mfb + STFD [C15] = f119, 5 * SIZE + FMA f127 = ALPHA, f127, f20 + nop __LINE__ + } + ;; + { .mfb + STFD [C8 ] = f120, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C16] = f124, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C8 ] = f121, SIZE + mov f80 = f0 + nop __LINE__ + } + { .mfb + STFD [C16] = f125, SIZE + mov f88 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C8 ] = f122, SIZE + mov f96 = f0 + nop __LINE__ + } + { .mfb + STFD [C16] = f126, SIZE + mov f104 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C8 ] = f123, 5 * SIZE + mov f112 = f0 + nop __LINE__ + } + { .mfb + STFD [C16] = f127, 5 * SIZE + mov f120 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 + cmp.ne p6, p0 = 1, I + } + { .mfb + nop __LINE__ + FMPY f68 = ALPHA, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f65 = ALPHA, f65 + adds I = -1, I + } + { .mfb + nop __LINE__ + FMPY f69 = ALPHA, f69 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f66 = ALPHA, f66 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f70 = ALPHA, f70 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f67 = ALPHA, f67 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f71 = ALPHA, f71 + nop __LINE__ + } + ;; + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + FMPY f72 = ALPHA, f72 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMPY f76 = ALPHA, f76 + } + ;; + { .mmf + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + FMPY f73 = ALPHA, f73 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMPY f77 = ALPHA, f77 + } + ;; + { .mmf + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + FMPY f74 = ALPHA, f74 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMPY f78 = ALPHA, f78 + } + ;; + { .mfb + STFD [C1 ] = f67, 5 * SIZE + FMPY f75 = ALPHA, f75 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f71, 5 * SIZE + FMPY f79 = ALPHA, f79 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + FMPY f80 = ALPHA, f80 + nop __LINE__ + } + { .mfb + STFD [C10] = f76, SIZE + FMPY f84 = ALPHA, f84 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, SIZE + FMPY f81 = ALPHA, f81 + nop __LINE__ + } + { .mfb + STFD [C10] = f77, SIZE + FMPY f85 = ALPHA, f85 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f74, SIZE + FMPY f82 = ALPHA, f82 + nop __LINE__ + } + { .mfb + STFD [C10] = f78, SIZE + FMPY f86 = ALPHA, f86 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f75, 5 * SIZE + FMPY f83 = ALPHA, f83 + nop __LINE__ + } + { .mfb + STFD [C10] = f79, 5 * SIZE + FMPY f87 = ALPHA, f87 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + FMPY f88 = ALPHA, f88 + nop __LINE__ + } + { .mfb + STFD [C11] = f84, SIZE + FMPY f92 = ALPHA, f92 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, SIZE + FMPY f89 = ALPHA, f89 + nop __LINE__ + } + { .mfb + STFD [C11] = f85, SIZE + FMPY f93 = ALPHA, f93 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f82, SIZE + FMPY f90 = ALPHA, f90 + nop __LINE__ + } + { .mfb + STFD [C11] = f86, SIZE + FMPY f94 = ALPHA, f94 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f83, 5 * SIZE + FMPY f91 = ALPHA, f91 + nop __LINE__ + } + { .mfb + STFD [C11] = f87, 5 * SIZE + FMPY f95 = ALPHA, f95 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f88, SIZE + FMPY f96 = ALPHA, f96 + nop __LINE__ + } + { .mfb + STFD [C12] = f92, SIZE + FMPY f100 = ALPHA, f100 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f89, SIZE + FMPY f97 = ALPHA, f97 + nop __LINE__ + } + { .mfb + STFD [C12] = f93, SIZE + FMPY f101 = ALPHA, f101 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f90, SIZE + FMPY f98 = ALPHA, f98 + nop __LINE__ + } + { .mfb + STFD [C12] = f94, SIZE + FMPY f102 = ALPHA, f102 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f91, 5 * SIZE + FMPY f99 = ALPHA, f99 + nop __LINE__ + } + { .mfb + STFD [C12] = f95, 5 * SIZE + FMPY f103 = ALPHA, f103 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f96, SIZE + FMPY f104 = ALPHA, f104 + nop __LINE__ + } + { .mfb + STFD [C13] = f100, SIZE + FMPY f108 = ALPHA, f108 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f97, SIZE + FMPY f105 = ALPHA, f105 + nop __LINE__ + } + { .mfb + STFD [C13] = f101, SIZE + FMPY f109 = ALPHA, f109 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f98, SIZE + FMPY f106 = ALPHA, f106 + nop __LINE__ + } + { .mfb + STFD [C13] = f102, SIZE + FMPY f110 = ALPHA, f110 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f99, 5 * SIZE + FMPY f107 = ALPHA, f107 + nop __LINE__ + } + { .mfb + STFD [C13] = f103, 5 * SIZE + FMPY f111 = ALPHA, f111 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f104, SIZE + FMPY f112 = ALPHA, f112 + nop __LINE__ + } + { .mfb + STFD [C14] = f108, SIZE + FMPY f116 = ALPHA, f116 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f105, SIZE + FMPY f113 = ALPHA, f113 + nop __LINE__ + } + { .mfb + STFD [C14] = f109, SIZE + FMPY f117 = ALPHA, f117 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f106, SIZE + FMPY f114 = ALPHA, f114 + nop __LINE__ + } + { .mfb + STFD [C14] = f110, SIZE + FMPY f118 = ALPHA, f118 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f107, 5 * SIZE + FMPY f115 = ALPHA, f115 + nop __LINE__ + } + { .mfb + STFD [C14] = f111, 5 * SIZE + FMPY f119 = ALPHA, f119 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f112, SIZE + FMPY f120 = ALPHA, f120 + nop __LINE__ + } + { .mfb + STFD [C15] = f116, SIZE + FMPY f124 = ALPHA, f124 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f113, SIZE + FMPY f121 = ALPHA, f121 + nop __LINE__ + } + { .mfb + STFD [C15] = f117, SIZE + FMPY f125 = ALPHA, f125 + nop __LINE__ + } + ;; + { .mfi + STFD [C7 ] = f114, SIZE + FMPY f122 = ALPHA, f122 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C15] = f118, SIZE + FMPY f126 = ALPHA, f126 + nop __LINE__ + } + ;; + { .mfi + STFD [C7 ] = f115, 5 * SIZE + FMPY f123 = ALPHA, f123 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C15] = f119, 5 * SIZE + FMPY f127 = ALPHA, f127 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C8 ] = f120, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C16] = f124, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C8 ] = f121, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 3, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C16] = f125, SIZE + mov f88 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 3, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C8 ] = f122, SIZE + mov f96 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 8, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C16] = f126, SIZE + mov f104 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C8 ] = f123, 5 * SIZE + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C16] = f127, 5 * SIZE + mov f120 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#endif + +.L020: +#if 0 + { .mfi + cmp.eq p3, p0 = r0, r0 + mov f89 = f0 + tbit.z p6, p7 = M, 2 + } + { .mfb +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 4, KK +#else + adds L = 8, KK +#endif +#endif + mov f81 = f0 + (p6) br.cond.dptk .L030 + } + ;; + +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfi + LDFPD f48, f49 = [B] + mov f65 = f0 + nop __LINE__ + } + { .mfi + adds BOFFSET = 2 * SIZE, B + mov f73 = f0 + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 3, B + mov f65 = f0 + shladd AOFFSET = KK8, 2, AOFFSET + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + } + ;; +#endif + { .mmf + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + setf.d f97 = r0 + mov f105 = f0 + } + { .mfi + setf.d f113 = r0 + mov f121 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; + { .mmf + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + setf.d f66 = r0 + mov f74 = f0 + } + { .mfi + setf.d f82 = r0 + mov f90 = f0 + tbit.z p12, p0 = L, 0 + } + ;; + { .mmf + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + setf.d f98 = r0 + mov f106 = f0 + } + { .mfi + setf.d f114 = r0 + mov f122 = f0 + shr L = L, 1 + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f75 = f0 + adds L = -1, L + } + { .mmf + setf.d f67 = r0 + setf.d f83 = r0 + mov f91 = f0 + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f107 = f0 + mov ar.lc = L + } + { .mmf + setf.d f99 = r0 + setf.d f115 = r0 + mov f123 = f0 + } + ;; + .align 32 + +.L022: + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + (p5) adds C10 = 2 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + (p5) adds C11 = 2 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + (p5) adds C12 = 2 * SIZE, C4 + } + ;; + { .mfi + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + (p5) adds C13 = 2 * SIZE, C5 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + (p5) adds C14 = 2 * SIZE, C6 + } + ;; + { .mfi + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + (p5) adds C15 = 2 * SIZE, C7 + } + { .mfi + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + (p5) adds C16 = 2 * SIZE, C8 + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f106 = f34, f53, f106 // A3 * B6 + nop __LINE__ + } + + { .mfb + nop __LINE__ + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f122 = f34, f55, f122 // A3 * B8 + nop __LINE__ + } + + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f107 = f35, f53, f107 // A4 * B6 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f123 = f35, f55, f123 // A4 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f70 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f69 = [C1 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f71 = [C9 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f78 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f77 = [C2 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f79 = [C10], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f84 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f86 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f85 = [C3 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f87 = [C11], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f92 = [C4 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f94 = [C12], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f106 = f42, f61, f106 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f93 = [C4 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f95 = [C12], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f122 = f42, f63, f122 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f100 = [C5 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f102 = [C13], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f101 = [C5 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f103 = [C13], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f108 = [C6 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f110 = [C14], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f107 = f43, f61, f107 // A4 * B6 + nop __LINE__ + } + ;; + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f109 = [C6 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f111 = [C14], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f123 = f43, f63, f123 // A4 * B8 + br.cloop.sptk.few .L022 + } + ;; + +.L028: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + LDFD f116 = [C7 ], SIZE + FMA f64 = ALPHA, f64, f68 + nop __LINE__ + } + { .mfb + LDFD f118 = [C15], SIZE + FMA f66 = ALPHA, f66, f70 + nop __LINE__ + } + ;; + { .mfb + LDFD f117 = [C7 ], -1 * SIZE + FMA f65 = ALPHA, f65, f69 + nop __LINE__ + } + { .mfb + LDFD f119 = [C15], -1 * SIZE + FMA f67 = ALPHA, f67, f71 + nop __LINE__ + } + ;; + { .mfb + LDFD f124 = [C8], SIZE + FMA f72 = ALPHA, f72, f76 + nop __LINE__ + } + { .mfb + LDFD f126 = [C16], SIZE + FMA f74 = ALPHA, f74, f78 + nop __LINE__ + } + ;; + { .mfb + LDFD f125 = [C8], -1 * SIZE + FMA f73 = ALPHA, f73, f77 + nop __LINE__ + } + { .mfb + LDFD f127 = [C16], -1 * SIZE + FMA f75 = ALPHA, f75, f79 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMA f80 = ALPHA, f80, f84 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f66, SIZE + FMA f82 = ALPHA, f82, f86 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, 3 * SIZE + FMA f81 = ALPHA, f81, f85 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f67, 3 * SIZE + FMA f83 = ALPHA, f83, f87 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + FMA f88 = ALPHA, f88, f92 + nop __LINE__ + } + { .mfb + STFD [C10] = f74, SIZE + FMA f90 = ALPHA, f90, f94 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, 3 * SIZE + FMA f89 = ALPHA, f89, f93 + nop __LINE__ + } + { .mfb + STFD [C10] = f75, 3 * SIZE + FMA f91 = ALPHA, f91, f95 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + FMA f96 = ALPHA, f96, f100 + nop __LINE__ + } + { .mfb + STFD [C11] = f82, SIZE + FMA f98 = ALPHA, f98, f102 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, 3 * SIZE + FMA f97 = ALPHA, f97, f101 + nop __LINE__ + } + { .mfb + STFD [C11] = f83, 3 * SIZE + FMA f99 = ALPHA, f99, f103 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f88, SIZE + FMA f104 = ALPHA, f104, f108 + nop __LINE__ + } + { .mfb + STFD [C12] = f90, SIZE + FMA f106 = ALPHA, f106, f110 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f89, 3 * SIZE + FMA f105 = ALPHA, f105, f109 + nop __LINE__ + } + { .mfb + STFD [C12] = f91, 3 * SIZE + FMA f107 = ALPHA, f107, f111 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f96, SIZE + FMA f112 = ALPHA, f112, f116 + nop __LINE__ + } + { .mfb + STFD [C13] = f98, SIZE + FMA f114 = ALPHA, f114, f118 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f97, 3 * SIZE + FMA f113 = ALPHA, f113, f117 + nop __LINE__ + } + { .mfb + STFD [C13] = f99, 3 * SIZE + FMA f115 = ALPHA, f115, f119 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f104, SIZE + FMA f120 = ALPHA, f120, f124 + nop __LINE__ + } + { .mfb + STFD [C14] = f106, SIZE + FMA f122 = ALPHA, f122, f126 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f105, 3 * SIZE + FMA f121 = ALPHA, f121, f125 + nop __LINE__ + } + { .mfb + STFD [C14] = f107, 3 * SIZE + FMA f123 = ALPHA, f123, f127 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f112, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C15] = f114, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f113, 3 * SIZE + mov f80 = f0 + nop __LINE__ + } + { .mfb + STFD [C15] = f115, 3 * SIZE + mov f88 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C8 ] = f120, SIZE + mov f96 = f0 + nop __LINE__ + } + { .mfb + STFD [C16] = f122, SIZE + mov f104 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C8 ] = f121, 3 * SIZE + mov f112 = f0 + nop __LINE__ + } + { .mfb + STFD [C16] = f123, 3 * SIZE + mov f120 = f0 + nop __LINE__ + } + ;; +#else + { .mfb + FMPY f64 = ALPHA, f64 + nop __LINE__ + } + { .mfb + FMPY f66 = ALPHA, f66 + nop __LINE__ + } + ;; + { .mfb + FMPY f65 = ALPHA, f65 + nop __LINE__ + } + { .mfb + FMPY f67 = ALPHA, f67 + nop __LINE__ + } + ;; + { .mfb + FMPY f72 = ALPHA, f72 + nop __LINE__ + } + { .mfb + FMPY f74 = ALPHA, f74 + nop __LINE__ + } + ;; + { .mfb + FMPY f73 = ALPHA, f73 + nop __LINE__ + } + { .mfb + FMPY f75 = ALPHA, f75 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMPY f80 = ALPHA, f80 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f66, SIZE + FMPY f82 = ALPHA, f82 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, 3 * SIZE + FMPY f81 = ALPHA, f81 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f67, 3 * SIZE + FMPY f83 = ALPHA, f83 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + FMPY f88 = ALPHA, f88 + nop __LINE__ + } + { .mfb + STFD [C10] = f74, SIZE + FMPY f90 = ALPHA, f90 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, 3 * SIZE + FMPY f89 = ALPHA, f89 + nop __LINE__ + } + { .mfb + STFD [C10] = f75, 3 * SIZE + FMPY f91 = ALPHA, f91 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + FMPY f96 = ALPHA, f96 + nop __LINE__ + } + { .mfb + STFD [C11] = f82, SIZE + FMPY f98 = ALPHA, f98 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, 3 * SIZE + FMPY f97 = ALPHA, f97 + nop __LINE__ + } + { .mfb + STFD [C11] = f83, 3 * SIZE + FMPY f99 = ALPHA, f99 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f88, SIZE + FMPY f104 = ALPHA, f104 + nop __LINE__ + } + { .mfb + STFD [C12] = f90, SIZE + FMPY f106 = ALPHA, f106 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f89, 3 * SIZE + FMPY f105 = ALPHA, f105 + nop __LINE__ + } + { .mfb + STFD [C12] = f91, 3 * SIZE + FMPY f107 = ALPHA, f107 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f96, SIZE + FMPY f112 = ALPHA, f112 + nop __LINE__ + } + { .mfb + STFD [C13] = f98, SIZE + FMPY f114 = ALPHA, f114 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f97, 3 * SIZE + FMPY f113 = ALPHA, f113 + nop __LINE__ + } + { .mfb + STFD [C13] = f99, 3 * SIZE + FMPY f115 = ALPHA, f115 + nop __LINE__ + } + ;; + { .mfi + STFD [C6 ] = f104, SIZE + FMPY f120 = ALPHA, f120 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C14] = f106, SIZE + FMPY f122 = ALPHA, f122 + nop __LINE__ + } + ;; + { .mfi + STFD [C6 ] = f105, 3 * SIZE + FMPY f121 = ALPHA, f121 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C14] = f107, 3 * SIZE + FMPY f123 = ALPHA, f123 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C7 ] = f112, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C15] = f114, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C7 ] = f113, 3 * SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 2, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C15] = f115, 3 * SIZE + mov f88 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 3, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C8 ] = f120, SIZE + mov f96 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C16] = f122, SIZE + mov f104 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C8 ] = f121, 3 * SIZE + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C16] = f123, 3 * SIZE + mov f120 = f0 + nop __LINE__ + } + ;; +#endif + .align 32 + +.L030: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 2, KK +#else + adds L = 8, KK +#endif +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L040 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfi + LDFPD f48, f49 = [B] + mov f65 = f0 + nop __LINE__ + } + { .mfi + adds BOFFSET = 2 * SIZE, B + mov f73 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } +#else + { .mmf + shladd BOFFSET = KK8, 3, B + shladd AOFFSET = KK8, 1, AOFFSET + mov f65 = f0 + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } +#endif + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f81 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f89 = f0 + shr L = L, 1 + } + ;; + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f97 = f0 + adds L = -1, L + } + { .mfi + nop __LINE__ + mov f105 = f0 + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + } + ;; + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + mov f113 = f0 + mov ar.lc = L + } + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f121 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + .align 32 + +.L032: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C2], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f69 = [C1], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f77 = [C2], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f84 = [C3], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f92 = [C4], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f85 = [C3], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f93 = [C4], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + br.cloop.sptk.few .L032 + } + ;; + +.L038: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + LDFD f100 = [C5], SIZE + FMA f64 = ALPHA, f64, f68 + nop __LINE__ + } + { .mfb + LDFD f108 = [C6], SIZE + FMA f65 = ALPHA, f65, f69 + nop __LINE__ + } + ;; + { .mfb + LDFD f101 = [C5], -1 * SIZE + FMA f72 = ALPHA, f72, f76 + nop __LINE__ + } + { .mfb + LDFD f109 = [C6], -1 * SIZE + FMA f73 = ALPHA, f73, f77 + nop __LINE__ + } + ;; + { .mfb + LDFD f116 = [C7], SIZE + FMA f80 = ALPHA, f80, f84 + nop __LINE__ + } + { .mfb + LDFD f124 = [C8], SIZE + FMA f81 = ALPHA, f81, f85 + nop __LINE__ + } + ;; + { .mfb + LDFD f117 = [C7], -1 * SIZE + FMA f88 = ALPHA, f88, f92 + nop __LINE__ + } + { .mfb + LDFD f125 = [C8], -1 * SIZE + FMA f89 = ALPHA, f89, f93 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMA f96 = ALPHA, f96, f100 + nop __LINE__ + } + { .mfb + STFD [C2 ] = f72, SIZE + FMA f104 = ALPHA, f104, f108 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, SIZE + FMA f97 = ALPHA, f97, f101 + nop __LINE__ + } + { .mfb + STFD [C2 ] = f73, SIZE + FMA f105 = ALPHA, f105, f109 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + FMA f112 = ALPHA, f112, f116 + nop __LINE__ + } + { .mfb + STFD [C4 ] = f88, SIZE + FMA f120 = ALPHA, f120, f124 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, SIZE + FMA f113 = ALPHA, f113, f117 + nop __LINE__ + } + { .mfb + STFD [C4 ] = f89, SIZE + FMA f121 = ALPHA, f121, f125 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f96, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C6 ] = f104, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f97, SIZE + mov f80 = f0 + nop __LINE__ + } + { .mfb + STFD [C6 ] = f105, SIZE + mov f88 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f112, SIZE + mov f96 = f0 + nop __LINE__ + } + { .mfb + STFD [C8 ] = f120, SIZE + mov f104 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f113, SIZE + mov f112 = f0 + nop __LINE__ + } + { .mfb + STFD [C8 ] = f121, SIZE + mov f120 = f0 + nop __LINE__ + } + ;; +#else + { .mfb + nop __LINE__ + FMPY f64 = ALPHA, f64 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f65 = ALPHA, f65 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f72 = ALPHA, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f73 = ALPHA, f73 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f80 = ALPHA, f80 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f81 = ALPHA, f81 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f88 = ALPHA, f88 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f89 = ALPHA, f89 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMPY f96 = ALPHA, f96 + nop __LINE__ + } + { .mfb + STFD [C2 ] = f72, SIZE + FMPY f104 = ALPHA, f104 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, SIZE + FMPY f97 = ALPHA, f97 + nop __LINE__ + } + { .mfb + STFD [C2 ] = f73, SIZE + FMPY f105 = ALPHA, f105 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f80, SIZE + FMPY f112 = ALPHA, f112 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C4 ] = f88, SIZE + FMPY f120 = ALPHA, f120 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f81, SIZE + FMPY f113 = ALPHA, f113 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C4 ] = f89, SIZE + FMPY f121 = ALPHA, f121 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C5 ] = f96, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C6 ] = f104, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C5 ] = f97, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 1, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C6 ] = f105, SIZE + mov f88 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 3, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C7 ] = f112, SIZE + mov f96 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C8 ] = f120, SIZE + mov f104 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C7 ] = f113, SIZE + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C8 ] = f121, SIZE + mov f120 = f0 + nop __LINE__ + } + ;; +#endif + .align 32 + +.L040: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 1, KK +#else + adds L = 8, KK +#endif +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L049 + } + ;; + +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } +#else + { .mmi + shladd BOFFSET = KK8, 3, B + add AOFFSET = KK8, AOFFSET + nop __LINE__ + } + ;; + { .mmi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } +#endif + ;; + { .mii + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + LDFD f32 = [AOFFSET], 1 * SIZE + adds L = -1, L + } + ;; + { .mmi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + { .mmi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + nop __LINE__ + } + ;; + .align 32 + +.L042: + { .mfb + lfetch.nt1 [PREB], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1] +#else + nop __LINE__ +#endif + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C2] +#else + nop __LINE__ +#endif + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f84 = [C3] +#else + nop __LINE__ +#endif + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f92 = [C4] +#else + nop __LINE__ +#endif + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + adds L = -1, L + } + { .mmb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f100 = [C5] + (p5) LDFD f108 = [C6] +#else + nop __LINE__ + nop __LINE__ +#endif + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + { .mmb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f116 = [C7] + (p5) LDFD f124 = [C8] +#else + nop __LINE__ + nop __LINE__ +#endif + br.cloop.sptk.few .L042 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 + FMA f72 = ALPHA, f72, f76 + FMA f80 = ALPHA, f80, f84 + FMA f88 = ALPHA, f88, f92 + + FMA f96 = ALPHA, f96, f100 + FMA f104 = ALPHA, f104, f108 + FMA f112 = ALPHA, f112, f116 + FMA f120 = ALPHA, f120, f124 + ;; + STFD [C1 ] = f64, SIZE + mov f64 = f0 + STFD [C2 ] = f72, SIZE + mov f72 = f0 + ;; + STFD [C3 ] = f80, SIZE + mov f80 = f0 + STFD [C4 ] = f88, SIZE + mov f88 = f0 + ;; + STFD [C5 ] = f96, SIZE + mov f96 = f0 + STFD [C6 ] = f104, SIZE + mov f104 = f0 + ;; + STFD [C7 ] = f112, SIZE + mov f112 = f0 + STFD [C8 ] = f120, SIZE + mov f120 = f0 + ;; +#else + FMPY f64 = ALPHA, f64 + FMPY f72 = ALPHA, f72 + FMPY f80 = ALPHA, f80 + FMPY f88 = ALPHA, f88 + + { .mfi + FMPY f96 = ALPHA, f96 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f104 = ALPHA, f104 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f112 = ALPHA, f112 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f120 = ALPHA, f120 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C2 ] = f72, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f80, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add AOFFSET = KK8, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C4 ] = f88, SIZE + mov f88 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 3, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C5 ] = f96, SIZE + mov f96 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 1, KK +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C6 ] = f104, SIZE + mov f104 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C7 ] = f112, SIZE + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C8 ] = f120, SIZE + mov f120 = f0 + nop __LINE__ + } + ;; +#endif + .align 32 +#endif + +.L049: + { .mmi + mov B = BOFFSET + mov AOFFSET = A +#if defined(TRMMKERNEL) && !defined(LEFT) + adds KK = 8, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmb + nop __LINE__ + cmp.lt p6, p0 = 0, J + (p6) br.cond.dptk .L010 + } + ;; + .align 32 + +.L050: + { .mfi + mov C1 = C + mov f64 = f0 + tbit.z p6, p0 = N, 2 + } + { .mfi + add C2 = LDC, C + mov f72 = f0 + shr I = M, 3 + } + ;; + { .mfi + shladd C3 = LDC, 1, C + mov f80 = f0 + nop __LINE__ + } + { .mfb + mov AOFFSET = A + mov f88 = f0 + (p6) br.cond.dpnt .L090 + } + ;; +#if 0 + { .mfi + cmp.eq p6, p7 = 0, I + mov f65 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + { .mfi + shladd C4 = LDC, 1, C2 + mov f73 = f0 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + mov f81 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + shladd C = LDC, 2, C + mov f89 = f0 + (p6) br.cond.dpnt .L060 + } + ;; + .align 32 + +.L052: +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfb + LDFPD f48, f49 = [B] + mov f66 = f0 + nop __LINE__ + } + { .mfb + adds BOFFSET = 2 * SIZE, B + mov f74 = f0 + nop __LINE__ + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 2, B + mov f66 = f0 + shladd AOFFSET = KK8, 3, AOFFSET + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f74 = f0 + nop __LINE__ + } + ;; +#endif + ;; + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 8, KK +#else + adds L = 4, KK +#endif +#endif + } + { .mfi + setf.d f84 = r0 + mov f90 = f0 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f67 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f75 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; + { .mfi + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + mov f83 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + setf.d f91 = r0 + mov f68 = f0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f76 = f0 + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + { .mfi + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + mov f92 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f69 = f0 + shr L = L, 1 + } + { .mmf + setf.d f77 = r0 + setf.d f85 = r0 + mov f93 = f0 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f70 = f0 + adds L = -1, L + } + { .mmf + setf.d f78 = r0 + setf.d f86 = r0 + mov f94 = f0 + } + ;; + { .mfi + CPREFETCH [PREC] + mov f71 = f0 + mov ar.lc = L + } + { .mmf + setf.d f79 = r0 + setf.d f87 = r0 + mov f95 = f0 + } + ;; + .align 32 + +.L053: + { .mfb + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + adds C9 = 4 * SIZE, C1 + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C10 = 4 * SIZE, C2 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + adds C12 = 4 * SIZE, C4 + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f92 = f36, f51, f92 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f93 = f37, f51, f93 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f94 = f38, f51, f94 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f95 = f39, f51, f95 // A8 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f96 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f97 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f98 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f99 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f100 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f101 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f102 = [C1 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f103 = [C9 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f104 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f105 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f106 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f107 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f92 = f44, f59, f92 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f108 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f109 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f110 = [C2 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f111 = [C10], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f93 = f45, f59, f93 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f112 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f113 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f114 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f115 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f94 = f46, f59, f94 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f116 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f117 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f118 = [C3 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f119 = [C11], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f95 = f47, f59, f95 // A8 * B4 + br.cloop.sptk.few .L053 + } + ;; + .align 32 + +.L058: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfi + LDFD f120 = [C4 ], SIZE + FMA f64 = ALPHA, f64, f96 + cmp.ne p6, p0 = 1, I + } + { .mfb + LDFD f121 = [C12], SIZE + FMA f68 = ALPHA, f68, f97 + nop __LINE__ + } + ;; + { .mfi + LDFD f122 = [C4 ], SIZE + FMA f65 = ALPHA, f65, f98 + adds I = -1, I + } + { .mfb + LDFD f123 = [C12], SIZE + FMA f69 = ALPHA, f69, f99 + nop __LINE__ + } + ;; + { .mfb + LDFD f124 = [C4 ], SIZE + FMA f66 = ALPHA, f66, f100 + nop __LINE__ + } + { .mfb + LDFD f125 = [C12], SIZE + FMA f70 = ALPHA, f70, f101 + nop __LINE__ + } + ;; + { .mfb + LDFD f126 = [C4 ], -3 * SIZE + FMA f67 = ALPHA, f67, f102 + nop __LINE__ + } + { .mfb + LDFD f127 = [C12], -3 * SIZE + FMA f71 = ALPHA, f71, f103 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMA f72 = ALPHA, f72, f104 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f68, SIZE + FMA f76 = ALPHA, f76, f105 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, SIZE + FMA f73 = ALPHA, f73, f106 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f69, SIZE + FMA f77 = ALPHA, f77, f107 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f66, SIZE + FMA f74 = ALPHA, f74, f108 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f70, SIZE + FMA f78 = ALPHA, f78, f109 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f67, 5 * SIZE + FMA f75 = ALPHA, f75, f110 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f71, 5 * SIZE + FMA f79 = ALPHA, f79, f111 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + FMA f80 = ALPHA, f80, f112 + nop __LINE__ + } + { .mfb + STFD [C10] = f76, SIZE + FMA f84 = ALPHA, f84, f113 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, SIZE + FMA f81 = ALPHA, f81, f114 + nop __LINE__ + } + { .mfb + STFD [C10] = f77, SIZE + FMA f85 = ALPHA, f85, f115 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f74, SIZE + FMA f82 = ALPHA, f82, f116 + nop __LINE__ + } + { .mfb + STFD [C10] = f78, SIZE + FMA f86 = ALPHA, f86, f117 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f75, 5 * SIZE + FMA f83 = ALPHA, f83, f118 + nop __LINE__ + } + { .mfb + STFD [C10] = f79, 5 * SIZE + FMA f87 = ALPHA, f87, f119 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + FMA f88 = ALPHA, f88, f120 + nop __LINE__ + } + { .mfb + STFD [C11] = f84, SIZE + FMA f92 = ALPHA, f92, f121 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, SIZE + FMA f89 = ALPHA, f89, f122 + nop __LINE__ + } + { .mfb + STFD [C11] = f85, SIZE + FMA f93 = ALPHA, f93, f123 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f82, SIZE + FMA f90 = ALPHA, f90, f124 + nop __LINE__ + } + { .mfb + STFD [C11] = f86, SIZE + FMA f94 = ALPHA, f94, f125 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f83, 5 * SIZE + FMA f91 = ALPHA, f91, f126 + nop __LINE__ + } + { .mfb + STFD [C11] = f87, 5 * SIZE + FMA f95 = ALPHA, f95, f127 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f88, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C12] = f92, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f89, SIZE + mov f80 = f0 + nop __LINE__ + } + { .mfb + STFD [C12] = f93, SIZE + mov f88 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f90, SIZE + mov f65 = f0 + nop __LINE__ + } + { .mfb + STFD [C12] = f94, SIZE + mov f73 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f91, 5 * SIZE + mov f81 = f0 + nop __LINE__ + } + { .mfb + STFD [C12] = f95, 5 * SIZE + mov f89 = f0 + (p6) br.cond.dptk .L052 + } + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 + cmp.ne p6, p0 = 1, I + } + { .mfb + nop __LINE__ + FMPY f68 = ALPHA, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f65 = ALPHA, f65 + adds I = -1, I + } + { .mfb + nop __LINE__ + FMPY f69 = ALPHA, f69 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f66 = ALPHA, f66 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f70 = ALPHA, f70 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f67 = ALPHA, f67 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f71 = ALPHA, f71 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMPY f72 = ALPHA, f72 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f68, SIZE + FMPY f76 = ALPHA, f76 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, SIZE + FMPY f73 = ALPHA, f73 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f69, SIZE + FMPY f77 = ALPHA, f77 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f66, SIZE + FMPY f74 = ALPHA, f74 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f70, SIZE + FMPY f78 = ALPHA, f78 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f67, 5 * SIZE + FMPY f75 = ALPHA, f75 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f71, 5 * SIZE + FMPY f79 = ALPHA, f79 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + FMPY f80 = ALPHA, f80 + nop __LINE__ + } + { .mfb + STFD [C10] = f76, SIZE + FMPY f84 = ALPHA, f84 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, SIZE + FMPY f81 = ALPHA, f81 + nop __LINE__ + } + { .mfb + STFD [C10] = f77, SIZE + FMPY f85 = ALPHA, f85 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f74, SIZE + FMPY f82 = ALPHA, f82 + nop __LINE__ + } + { .mfb + STFD [C10] = f78, SIZE + FMPY f86 = ALPHA, f86 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f75, 5 * SIZE + FMPY f83 = ALPHA, f83 + nop __LINE__ + } + { .mfb + STFD [C10] = f79, 5 * SIZE + FMPY f87 = ALPHA, f87 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + FMPY f88 = ALPHA, f88 + nop __LINE__ + } + { .mfb + STFD [C11] = f84, SIZE + FMPY f92 = ALPHA, f92 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, SIZE + FMPY f89 = ALPHA, f89 + nop __LINE__ + } + { .mfb + STFD [C11] = f85, SIZE + FMPY f93 = ALPHA, f93 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f82, SIZE + FMPY f90 = ALPHA, f90 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C11] = f86, SIZE + FMPY f94 = ALPHA, f94 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f83, 5 * SIZE + FMPY f91 = ALPHA, f91 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C11] = f87, 5 * SIZE + FMPY f95 = ALPHA, f95 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C4 ] = f88, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C12] = f92, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f89, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 3, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C12] = f93, SIZE + mov f88 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C4 ] = f90, SIZE + mov f65 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 8, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C12] = f94, SIZE + mov f73 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f91, 5 * SIZE + mov f81 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C12] = f95, 5 * SIZE + mov f89 = f0 + (p6) br.cond.dptk .L052 + } + ;; +#endif + .align 32 + +.L060: + { .mfi + nop __LINE__ + mov f66 = f0 + tbit.z p6, p7 = M, 2 + } + { .mfb +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 4, KK +#else + adds L = 4, KK +#endif +#endif + mov f74 = f0 + (p6) br.cond.dptk .L070 + } + ;; + +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfb + LDFPD f48, f49 = [B] + mov f82 = f0 + nop __LINE__ + } + { .mfi + adds BOFFSET = 2 * SIZE, B + mov f90 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 2, B + mov f82 = f0 + shladd AOFFSET = KK8, 2, AOFFSET + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f90 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#endif + ;; + { .mii + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f67 = f0 + adds L = -1, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + mov f75 = f0 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 + mov ar.lc = L + } + { .mfi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov f91 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + .align 32 + +.L062: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + (p5) adds C10 = 2 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + (p5) adds C11 = 2 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + (p5) adds C12 = 2 * SIZE, C4 + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f70 = [C9 ], SIZE +#else + nop __LINE__ +#endif + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f69 = [C1 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f71 = [C9 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f78 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f77 = [C2 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f79 = [C10], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f84 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f86 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f85 = [C3 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f87 = [C11], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f92 = [C4 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f94 = [C12], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f93 = [C4 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f95 = [C12], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + br.cloop.sptk.few .L062 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 + FMA f66 = ALPHA, f66, f70 + FMA f65 = ALPHA, f65, f69 + FMA f67 = ALPHA, f67, f71 + FMA f72 = ALPHA, f72, f76 + FMA f74 = ALPHA, f74, f78 + FMA f73 = ALPHA, f73, f77 + FMA f75 = ALPHA, f75, f79 + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMA f80 = ALPHA, f80, f84 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f66, SIZE + FMA f82 = ALPHA, f82, f86 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, 3 * SIZE + FMA f81 = ALPHA, f81, f85 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f67, 3 * SIZE + FMA f83 = ALPHA, f83, f87 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + FMA f88 = ALPHA, f88, f92 + nop __LINE__ + } + { .mfb + STFD [C10] = f74, SIZE + FMA f90 = ALPHA, f90, f94 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, 3 * SIZE + FMA f89 = ALPHA, f89, f93 + nop __LINE__ + } + { .mfb + STFD [C10] = f75, 3 * SIZE + FMA f91 = ALPHA, f91, f95 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + mov f80 = f0 + nop __LINE__ + } + { .mfb + STFD [C11] = f82, SIZE + mov f64 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, 3 * SIZE + mov f81 = f0 + nop __LINE__ + } + { .mfb + STFD [C11] = f83, 3 * SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f88, SIZE + mov f88 = f0 + adds L = 1, K + } + { .mfb + STFD [C12] = f90, SIZE + mov f65 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f89, 3 * SIZE + mov f89 = f0 + shr L = L, 1 + } + { .mfb + STFD [C12] = f91, 3 * SIZE + mov f73 = f0 + nop __LINE__ + } + ;; +#else + FMPY f64 = ALPHA, f64 + FMPY f66 = ALPHA, f66 + FMPY f65 = ALPHA, f65 + FMPY f67 = ALPHA, f67 + FMPY f72 = ALPHA, f72 + FMPY f74 = ALPHA, f74 + FMPY f73 = ALPHA, f73 + FMPY f75 = ALPHA, f75 + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMPY f80 = ALPHA, f80 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f66, SIZE + FMPY f82 = ALPHA, f82 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, 3 * SIZE + FMPY f81 = ALPHA, f81 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f67, 3 * SIZE + FMPY f83 = ALPHA, f83 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f72, SIZE + FMPY f88 = ALPHA, f88 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C10] = f74, SIZE + FMPY f90 = ALPHA, f90 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f73, 3 * SIZE + FMPY f89 = ALPHA, f89 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C10] = f75, 3 * SIZE + FMPY f91 = ALPHA, f91 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C3 ] = f80, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C11] = f82, SIZE + mov f64 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f81, 3 * SIZE + mov f81 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 2, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C11] = f83, 3 * SIZE + mov f72 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C4 ] = f88, SIZE + mov f88 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C12] = f90, SIZE + mov f65 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f89, 3 * SIZE + mov f89 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C12] = f91, 3 * SIZE + mov f73 = f0 + nop __LINE__ + } + ;; +#endif + .align 32 + +.L070: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 2, KK +#else + adds L = 4, KK +#endif +#endif + tbit.z p6,p7 = M, 1 + (p6) br.cond.dptk .L080 + } + ;; + +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + shladd BOFFSET = KK8, 2, B + shladd AOFFSET = KK8, 1, AOFFSET + nop __LINE__ + } + ;; + { .mmi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + nop __LINE__ + } + ;; +#endif + { .mii + cmp.eq p3, p0 = r0, r0 + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + adds L = -1, L + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + ;; + { .mmi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov ar.lc = L + } + ;; + .align 32 + +.L072: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + } + { .mmf +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1 ], SIZE + (p5) LDFD f76 = [C2 ], SIZE +#else + nop __LINE__ + nop __LINE__ +#endif + FMA f89 = f33, f51, f89 // A2 * B4 + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mmf +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f69 = [C1 ], -1 * SIZE + (p5) LDFD f77 = [C2 ], -1 * SIZE +#else + nop __LINE__ + nop __LINE__ +#endif + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mmf +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f84 = [C3 ], SIZE + (p5) LDFD f92 = [C4 ], SIZE +#else + nop __LINE__ + nop __LINE__ +#endif + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f85 = [C3 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f93 = [C4 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + br.cloop.sptk.few .L072 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 + FMA f65 = ALPHA, f65, f69 + FMA f72 = ALPHA, f72, f76 + FMA f73 = ALPHA, f73, f77 + + FMA f80 = ALPHA, f80, f84 + FMA f81 = ALPHA, f81, f85 + FMA f88 = ALPHA, f88, f92 + FMA f89 = ALPHA, f89, f93 + ;; + { .mfb + STFD [C1 ] = f64, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C2 ] = f72, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C2 ] = f73, SIZE + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f80, SIZE + mov f80 = f0 + adds L = 1, K + } + { .mfb + STFD [C4 ] = f88, SIZE + mov f88 = f0 + nop __LINE__ + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + STFD [C4 ] = f89, SIZE + shr L = L, 1 + } + ;; +#else + FMPY f64 = ALPHA, f64 + FMPY f65 = ALPHA, f65 + ;; + { .mfi + nop __LINE__ + FMPY f72 = ALPHA, f72 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f73 = ALPHA, f73 + nop __LINE__ + } + ;; + { .mfi + FMPY f80 = ALPHA, f80 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + { .mfi + FMPY f81 = ALPHA, f81 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + nop __LINE__ + FMPY f88 = ALPHA, f88 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f89 = ALPHA, f89 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 1, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C2 ] = f72, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C2 ] = f73, SIZE +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C3 ] = f80, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C4 ] = f88, SIZE + mov f88 = f0 + nop __LINE__ + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + STFD [C4 ] = f89, SIZE +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; +#endif + .align 32 + +.L080: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 1, KK +#else + adds L = 4, KK +#endif +#endif + tbit.z p6,p7 = M, 0 + (p6) br.cond.dptk .L089 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + shladd BOFFSET = KK8, 2, B + add AOFFSET = KK8, AOFFSET + nop __LINE__ + } + ;; + { .mmi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + nop __LINE__ + } + ;; +#endif + + { .mii + LDFD f32 = [AOFFSET], 1 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + nop __LINE__ + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + .align 32 + +.L082: + { .mfb + cmp.ne p4, p5 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1] +#else + nop __LINE__ +#endif + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mmf + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + } + { .mmf +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C2] + (p5) LDFD f84 = [C3] +#else + nop __LINE__ + nop __LINE__ +#endif + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + } + ;; + { .mib + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + { .mmb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f92 = [C4] +#else + nop __LINE__ +#endif + adds L = -1, L + br.cloop.sptk.few .L082 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 + FMA f72 = ALPHA, f72, f76 + FMA f80 = ALPHA, f80, f84 + FMA f88 = ALPHA, f88, f92 + ;; + STFD [C1 ] = f64, SIZE + STFD [C2 ] = f72, SIZE + STFD [C3 ] = f80, SIZE + STFD [C4 ] = f88, SIZE + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f72 = ALPHA, f72 + nop __LINE__ + } + ;; + { .mfi + FMPY f80 = ALPHA, f80 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif + } + { .mfi + FMPY f88 = ALPHA, f88 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add AOFFSET = KK8, AOFFSET +#else + nop __LINE__ +#endif +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi + STFD [C1 ] = f64, SIZE + STFD [C2 ] = f72, SIZE +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 1, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi + STFD [C3 ] = f80, SIZE + STFD [C4 ] = f88, SIZE +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; +#endif + .align 32 + +.L089: + { .mmi + mov B = BOFFSET + mov AOFFSET = A +#if defined(TRMMKERNEL) && !defined(LEFT) + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + ;; + .align 16 +#endif + +.L090: + { .mfi + mov C1 = C + mov f64 = f0 + tbit.z p6, p0 = N, 1 + } + { .mfi + add C2 = LDC, C + mov f72 = f0 + shr I = M, 3 + } + ;; + { .mfi + setf.d f66 = r0 + mov f65 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + { .mfb + mov AOFFSET = A + mov f73 = f0 + (p6) br.cond.dpnt .L130 + } + ;; +#if 0 + { .mfi +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + mov f67 = f0 + shladd C = LDC, 1, C + } + { .mfb + cmp.eq p6, p7 = 0, I + mov f74 = f0 + (p6) br.cond.dpnt .L100 + } + ;; + .align 32 + +.L092: +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfb + LDFPD f48, f49 = [B] + mov f68 = f0 + nop __LINE__ + } + { .mfb + adds BOFFSET = 2 * SIZE, B + mov f79 = f0 + nop __LINE__ + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 1, B + mov f68 = f0 + shladd AOFFSET = KK8, 3, AOFFSET + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f79 = f0 + nop __LINE__ + } + ;; +#endif + + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f75 = f0 +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 8, KK +#else + adds L = 2, KK +#endif +#endif + } + ;; + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + mov f76 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f69 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + cmp.eq p3, p0 = r0, r0 + mov f77 = f0 + shr L = L, 1 + } + ;; + { .mfi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds L = -1, L + } + { .mmf + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + CPREFETCH [PREC], LDC + mov f70 = f0 + } + ;; + { .mfi + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + mov f78 = f0 + mov ar.lc = L + } + { .mfi + CPREFETCH [PREC] + mov f71 = f0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + ;; + .align 32 + +.L093: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C9 = 4 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C10 = 4 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + adds C11 = 4 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + adds C12 = 4 * SIZE, C4 + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f96 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f97 = [C9 ], SIZE +#else + nop __LINE__ +#endif + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f98 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f99 = [C9 ], SIZE +#else + nop __LINE__ +#endif + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f100 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f101 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f102 = [C1 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f103 = [C9 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f104 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f105 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f106 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f107 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f108 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f109 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f110 = [C2 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f111 = [C10], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + br.cloop.sptk.few .L093 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfi + nop __LINE__ + FMA f64 = ALPHA, f64, f96 + cmp.ne p6, p0 = 1, I + } + { .mfb + nop __LINE__ + FMA f68 = ALPHA, f68, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA f65 = ALPHA, f65, f98 + adds I = -1, I + } + { .mfb + nop __LINE__ + FMA f69 = ALPHA, f69, f99 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA f66 = ALPHA, f66, f100 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f70 = ALPHA, f70, f101 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f67 = ALPHA, f67, f102 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f71 = ALPHA, f71, f103 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMA f72 = ALPHA, f72, f104 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f68, SIZE + FMA f76 = ALPHA, f76, f105 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, SIZE + FMA f73 = ALPHA, f73, f106 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f69, SIZE + FMA f77 = ALPHA, f77, f107 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f66, SIZE + FMA f74 = ALPHA, f74, f108 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f70, SIZE + FMA f78 = ALPHA, f78, f109 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f67, 5 * SIZE + FMA f75 = ALPHA, f75, f110 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f71, 5 * SIZE + FMA f79 = ALPHA, f79, f111 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C10] = f76, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, SIZE + mov f65 = f0 + nop __LINE__ + } + { .mfb + STFD [C10] = f77, SIZE + mov f73 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f74, SIZE + mov f66 = f0 + nop __LINE__ + } + { .mfb + STFD [C10] = f78, SIZE + mov f74 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f75, 5 * SIZE + mov f67 = f0 + nop __LINE__ + } + { .mfb + STFD [C10] = f79, 5 * SIZE + (p6) br.cond.dptk .L092 + } + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 + cmp.ne p6, p0 = 1, I + } + { .mfb + nop __LINE__ + FMPY f68 = ALPHA, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f65 = ALPHA, f65 + adds I = -1, I + } + { .mfb + nop __LINE__ + FMPY f69 = ALPHA, f69 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f66 = ALPHA, f66 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f70 = ALPHA, f70 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f67 = ALPHA, f67 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f71 = ALPHA, f71 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMPY f72 = ALPHA, f72 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f68, SIZE + FMPY f76 = ALPHA, f76 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, SIZE + FMPY f73 = ALPHA, f73 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f69, SIZE + FMPY f77 = ALPHA, f77 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + FMPY f74 = ALPHA, f74 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C9 ] = f70, SIZE + FMPY f78 = ALPHA, f78 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, 5 * SIZE + FMPY f75 = ALPHA, f75 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C9 ] = f71, 5 * SIZE + FMPY f79 = ALPHA, f79 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C2 ] = f72, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C10] = f76, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f73, SIZE + mov f65 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 3, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C10] = f77, SIZE + mov f73 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C2 ] = f74, SIZE + mov f66 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 8, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C10] = f78, SIZE + mov f74 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f75, 5 * SIZE + mov f67 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mib + STFD [C10] = f79, 5 * SIZE + nop __LINE__ + (p6) br.cond.dptk .L092 + } + ;; +#endif + .align 32 + +.L100: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 4, KK +#else + adds L = 2, KK +#endif +#endif + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L110 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmf + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B + mov f75 = f0 + } + { .mii + nop __LINE__ +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 1, B + mov f75 = f0 + shladd AOFFSET = KK8, 2, AOFFSET + } + ;; + { .mmi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ + adds L = 1, L + } + ;; +#endif + ;; + { .mii + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + .align 32 + +.L102: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C10 = 2 * SIZE, C2 + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f70 = [C9 ], SIZE +#else + nop __LINE__ +#endif + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f69 = [C1 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f71 = [C9 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f78 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + br.cloop.sptk.few .L102 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + LDFD f77 = [C2 ], -1 * SIZE + FMA f64 = ALPHA, f64, f68 + nop __LINE__ + } + { .mfb + LDFD f79 = [C10], -1 * SIZE + FMA f66 = ALPHA, f66, f70 + nop __LINE__ + } + ;; + FMA f65 = ALPHA, f65, f69 + adds L = 1, K + FMA f67 = ALPHA, f67, f71 + ;; + FMA f72 = ALPHA, f72, f76 + shr L = L, 1 + FMA f74 = ALPHA, f74, f78 + FMA f73 = ALPHA, f73, f77 + FMA f75 = ALPHA, f75, f79 + ;; + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f66, SIZE + mov f64 = f0 + } + ;; + { .mmf + STFD [C1 ] = f65, 3 * SIZE + STFD [C9 ] = f67, 3 * SIZE + mov f65 = f0 + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + STFD [C10] = f74, SIZE + mov f72 = f0 + } + ;; + { .mmf + STFD [C2 ] = f73, 3 * SIZE + STFD [C10] = f75, 3 * SIZE + mov f73 = f0 + } + ;; +#else + { .mfb + nop __LINE__ + FMPY f64 = ALPHA, f64 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f66 = ALPHA, f66 + nop __LINE__ + } + ;; + FMPY f65 = ALPHA, f65 + FMPY f67 = ALPHA, f67 + ;; + { .mfi + nop __LINE__ + FMPY f72 = ALPHA, f72 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f74 = ALPHA, f74 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f73 = ALPHA, f73 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f75 = ALPHA, f75 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mmi + STFD [C9 ] = f66, SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, 3 * SIZE + mov f65 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 2, AOFFSET +#else + nop __LINE__ +#endif + } + { .mmi + STFD [C9 ] = f67, 3 * SIZE + nop __LINE__ +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C2 ] = f72, SIZE + mov f72 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + { .mmi + STFD [C10] = f74, SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f73, 3 * SIZE + mov f73 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mib + STFD [C10] = f75, 3 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; +#endif + + .align 32 + +.L110: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 2, KK +#else + adds L = 2, KK +#endif +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L120 + } + ;; + +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + shladd BOFFSET = KK8, 1, B + shladd AOFFSET = KK8, 1, AOFFSET + } + ;; + { .mmi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ + adds L = 1, L + } + ;; +#endif + ;; + { .mii + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + cmp.eq p3, p0 = r0, r0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + mov ar.lc = L + } + ;; + .align 32 + +.L112: + { .mfi + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mmf +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1 ], SIZE + (p5) LDFD f76 = [C2 ], SIZE +#else + nop __LINE__ + nop __LINE__ +#endif + FMA f73 = f33, f49, f73 // A2 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f69 = [C1 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f77 = [C2 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + br.cloop.sptk.few .L112 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 + FMA f65 = ALPHA, f65, f69 + FMA f72 = ALPHA, f72, f76 + FMA f73 = ALPHA, f73, f77 + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C2 ] = f72, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + mov f65 = f0 + nop __LINE__ + } + { .mfb + STFD [C2 ] = f73, SIZE + mov f73 = f0 + nop __LINE__ + } + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f65 = ALPHA, f65 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f72 = ALPHA, f72 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f73 = ALPHA, f73 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 1, AOFFSET +#else + nop __LINE__ +#endif +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C2 ] = f72, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + mov f65 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C2 ] = f73, SIZE + mov f73 = f0 + nop __LINE__ + } + ;; +#endif + .align 32 + +.L120: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 1, KK +#else + adds L = 2, KK +#endif +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L129 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + shladd BOFFSET = KK8, 1, B + add AOFFSET = KK8, AOFFSET + } + ;; + { .mmi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ + adds L = 1, L + } + ;; +#endif + { .mii + nop __LINE__ + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFD f32 = [AOFFSET], 1 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + cmp.eq p3, p0 = r0, r0 + nop __LINE__ + mov ar.lc = L + } + ;; + .align 32 + +.L122: + { .mfi + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + nop __LINE__ + } + { .mmi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1] + (p5) LDFD f76 = [C2] +#else + nop __LINE__ + nop __LINE__ +#endif + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + adds L = -1, L + } + { .mfb + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + br.cloop.sptk.few .L122 + } + ;; + +.L128: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 + FMA f72 = ALPHA, f72, f76 + ;; + { .mfi + STFD [C1 ] = f64 + mov f64 = f0 + } + { .mfb + STFD [C2 ] = f72 + mov f72 = f0 + } + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f72 = ALPHA, f72 + nop __LINE__ + } + ;; + { .mmi + nop __LINE__ +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add AOFFSET = KK8, AOFFSET +#else + nop __LINE__ +#endif +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 1, KK +#else + nop __LINE__ +#endif + ;; + { .mfi + STFD [C1 ] = f64 + mov f64 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C2 ] = f72 + mov f72 = f0 + } + ;; +#endif + .align 32 + +.L129: + { .mmi + mov B = BOFFSET + mov AOFFSET = A +#if defined(TRMMKERNEL) && !defined(LEFT) + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + ;; + .align 16 +#endif + +.L130: + { .mfi +#if defined(TRMMKERNEL) && defined(LEFT) + mov KK = OFFSET +#else + nop __LINE__ +#endif + mov f64 = f0 + tbit.z p6, p0 = N, 0 + } + { .mib + mov AOFFSET = A + shr I = M, 3 + (p6) br.cond.dpnt .L999 + } + ;; +#if 0 + { .mfi + mov C1 = C + mov f65 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + nop __LINE__ + mov f66 = f0 + nop __LINE__ + } + { .mfb + cmp.eq p7, p0 = 0, I + mov f67 = f0 + (p7) br.cond.dpnt .L140 + } + ;; + .align 32 + +.L132: +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfb + LDFD f48 = [B] + mov f68 = f0 + nop __LINE__ + } + { .mfi + adds BOFFSET = 1 * SIZE, B + mov f69 = f0 +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 8, KK +#else + adds L = 1, KK +#endif +#endif + } + ;; +#else + { .mfi + add BOFFSET = KK8, B + mov f68 = f0 + shladd AOFFSET = KK8, 3, AOFFSET + } + ;; + { .mfi + LDFD f48 = [BOFFSET], 1 * SIZE + mov f69 = f0 +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 8, KK +#else + adds L = 1, KK +#endif +#endif + } + ;; +#endif + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f70 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; + { .mii + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mfi + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + mov f71 = f0 + adds L = -1, L + } + ;; + { .mmi + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + adds PREC = CPREFETCHSIZE * SIZE, C1 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmi + CPREFETCH [PREC] + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov ar.lc = L + } + ;; + .align 32 + +.L133: + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f65 = f33, f48, f65 // A2 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + adds C9 = 4 * SIZE, C1 + } + { .mmf + (p3) LDFD f56 = [BOFFSET], 1 * SIZE +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f6 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f67 = f35, f48, f67 // A4 * B1 + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f7 = [C9 ], SIZE +#else + nop __LINE__ +#endif + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f10 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f11 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mmf + (p4) LDFD f48 = [BOFFSET], 1 * SIZE +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f12 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f13 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f14 = [C1 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f15 = [C9 ], -3 * SIZE +#else + nop __LINE__ +#endif + nop __LINE__ + br.cloop.sptk.few .L133 + } + ;; + +.L138: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfi + FMA f64 = ALPHA, f64, f6 + cmp.ne p6, p0 = 1, I + } + { .mfb + FMA f68 = ALPHA, f68, f7 + } + ;; + { .mfi + FMA f65 = ALPHA, f65, f10 + adds I = -1, I + } + { .mfb + FMA f69 = ALPHA, f69, f11 + } + ;; + { .mfi + FMA f66 = ALPHA, f66, f12 + } + { .mfb + FMA f70 = ALPHA, f70, f13 + } + ;; + { .mfb + FMA f67 = ALPHA, f67, f14 + } + { .mfb + FMA f71 = ALPHA, f71, f15 + } + ;; + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + mov f64 = f0 + } + ;; + { .mmf + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + mov f65 = f0 + } + ;; + { .mmf + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + mov f66 = f0 + } + ;; + { .mmf + STFD [C1 ] = f67, 5 * SIZE + nop __LINE__ + mov f67 = f0 + } + { .mmb + STFD [C9 ] = f71, 5 * SIZE + nop __LINE__ + (p6) br.cond.dptk .L132 + } + ;; +#else + { .mfi + FMPY f64 = ALPHA, f64 + cmp.ne p6, p0 = 1, I + } + { .mfb + FMPY f68 = ALPHA, f68 + } + ;; + { .mfi + FMPY f65 = ALPHA, f65 + adds I = -1, I + } + { .mfb + FMPY f69 = ALPHA, f69 + } + ;; + { .mfi + FMPY f66 = ALPHA, f66 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + FMPY f70 = ALPHA, f70 + } + ;; + { .mfi + FMPY f67 = ALPHA, f67 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + { .mfi + FMPY f71 = ALPHA, f71 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mmi + STFD [C9 ] = f68, SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + mov f65 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 3, AOFFSET +#else + nop __LINE__ +#endif + } + { .mmi + STFD [C9 ] = f69, SIZE + nop __LINE__ +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add BOFFSET = KK8, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + mov f66 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 8, KK +#else + nop __LINE__ +#endif + } + { .mmi + STFD [C9 ] = f70, SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, 5 * SIZE + mov f67 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mmb + STFD [C9 ] = f71, 5 * SIZE + nop __LINE__ + (p6) br.cond.dptk .L132 + } + ;; +#endif + .align 32 + +.L140: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 4, KK +#else + adds L = 1, KK +#endif +#endif + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L150 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFD f48 = [B] + adds BOFFSET = 1 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + add BOFFSET = KK8, B + shladd AOFFSET = KK8, 2, AOFFSET + nop __LINE__ + } + ;; + { .mmi + LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#endif + { .mii + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + adds L = -1, L + nop __LINE__ + } + ;; + { .mmi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + .align 32 + +.L142: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f65 = f33, f48, f65 // A2 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mmf +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + } + ;; + { .mfi + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + (p5) adds C10 = 2 * SIZE, C2 + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f70 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mmf + (p4) LDFD f48 = [BOFFSET], 1 * SIZE +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f69 = [C1 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f71 = [C9 ], -1 * SIZE +#else + nop __LINE__ +#endif + nop.f 0 + br.cloop.sptk.few .L142 + } + ;; + +.L148: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 + FMA f66 = ALPHA, f66, f70 + FMA f65 = ALPHA, f65, f69 + FMA f67 = ALPHA, f67, f71 + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 + adds L = 1, K + } + { .mfb + STFD [C9 ] = f66, SIZE + mov f66 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, 3 * SIZE + mov f65 = f0 + shr L = L, 1 + } + { .mfb + STFD [C9 ] = f67, 3 * SIZE + mov f67 = f0 + nop __LINE__ + } + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f66 = ALPHA, f66 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f65 = ALPHA, f65 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f67 = ALPHA, f67 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 2, AOFFSET +#else + nop __LINE__ +#endif +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add BOFFSET = KK8, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C9 ] = f66, SIZE + mov f66 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, 3 * SIZE + mov f65 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C9 ] = f67, 3 * SIZE + mov f67 = f0 + nop __LINE__ + } + ;; +#endif + .align 32 + +.L150: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 2, KK +#else + adds L = 1, KK +#endif +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L160 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFD f48 = [B] + adds BOFFSET = 1 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + add BOFFSET = KK8, B + shladd AOFFSET = KK8, 1, AOFFSET + nop __LINE__ + } + ;; + { .mmi + LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#endif + { .mii + cmp.eq p3, p0 = r0, r0 + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mii + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + adds L = -1, L + ;; + mov ar.lc = L + } + ;; + .align 32 + +.L152: + { .mfi + cmp.ne p4, p5 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + ;; + { .mfi + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + adds L = -1, L + } + ;; + { .mfb + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + br.cloop.sptk.few .L152 + } + ;; + +.L158: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + LDFD f68 = [C1 ], SIZE + ;; + LDFD f69 = [C1 ], -1 * SIZE + ;; + FMA f64 = ALPHA, f64, f68 + FMA f65 = ALPHA, f65, f69 + ;; + STFD [C1 ] = f64, SIZE + mov f64 = f0 + ;; + { .mfi + STFD [C1 ] = f65, SIZE + mov f65 = f0 + } + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f65 = ALPHA, f65 + nop __LINE__ + } + ;; + { .mii + nop __LINE__ +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 1, AOFFSET +#else + nop __LINE__ +#endif +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add BOFFSET = KK8, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + mov f65 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; +#endif + .align 32 + +.L160: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 1, KK +#else + adds L = 1, KK +#endif +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L169 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFD f48 = [B] + adds BOFFSET = 1 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + add BOFFSET = KK8, B + add AOFFSET = KK8, AOFFSET + nop __LINE__ + } + ;; + { .mmi + LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#endif + ;; + { .mii + LDFD f32 = [AOFFSET], 1 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mii + adds L = -1, L + cmp.eq p3, p0 = r0, r0 + ;; + mov ar.lc = L + } + ;; + .align 32 + +.L162: + { .mmf + cmp.ne p4, p5 = 0, L + (p12) cmp.ne p3, p0 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + } + ;; + { .mmi + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p4) LDFD f32 = [AOFFSET], 1 * SIZE +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1] +#else + nop __LINE__ +#endif + adds L = -1, L + } + { .mfb + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + br.cloop.sptk.few .L162 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 +#else + FMPY f64 = ALPHA, f64 +#endif + ;; + STFD [C1 ] = f64 + ;; + .align 32 + +.L169: + { .mmi + mov B = BOFFSET + mov AOFFSET = A +#if defined(TRMMKERNEL) && !defined(LEFT) + adds KK = 1, KK +#else + nop __LINE__ +#endif + } + ;; + .align 16 +#endif + +.L999: + mov r8 = r0 + adds r9 = 1 * 16, SP + ;; + ldf.fill f16 = [SP], 32 + ldf.fill f17 = [r9], 32 + ;; + ldf.fill f18 = [SP], 32 + ldf.fill f19 = [r9], 32 + ;; + ldf.fill f20 = [SP], 32 + ldf.fill f21 = [r9], 32 + ;; + ldf.fill f22 = [SP], 32 + ldf.fill f23 = [r9], 32 + mov ar.lc = ARLC + ;; + ldf.fill f24 = [SP], 32 + ldf.fill f25 = [r9], 32 + mov pr = PR, -1 + ;; + ldf.fill f26 = [SP], 32 + ldf.fill f27 = [r9], 32 + mov ar.pfs = ARPFS + ;; + ldf.fill f28 = [SP], 32 + ldf.fill f29 = [r9], 32 + ;; + ldf.fill f30 = [SP], 32 + ldf.fill f31 = [r9] + + br.ret.sptk.many b0 + EPILOGUE + diff --git a/kernel/ia64/qgemv_n.S b/kernel/ia64/qgemv_n.S new file mode 100644 index 0000000..4eeac12 --- /dev/null +++ b/kernel/ia64/qgemv_n.S @@ -0,0 +1,1676 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define SP r12 + +#define M r32 +#define N r33 +#ifndef XDOUBLE +#define A r36 +#define LDA r37 +#define X r38 +#define INCX r39 +#define Y r34 +#define INCY r35 +#else +#define A r38 +#define LDA r39 +#define X r34 +#define INCX r35 +#define Y r36 +#define INCY r37 +#endif + +#define BUFFER r11 + +#define I r14 +#define J r15 +#define AO1 r16 +#define AO2 r17 +#define AO3 r18 +#define AO4 r19 +#define AO5 r20 +#define AO6 r21 +#define AO7 r22 +#define AO8 r23 +#define YLD1 r24 +#define YLD2 r25 +#define YST1 r26 +#define YST2 r27 +#define II r28 +#define YY r29 + +#define ARLC r30 +#define PR r31 + +#define LDA7M8 r8 +#define PREA r9 +#define PREB r10 + +#define ALPHA1 f8 +#define ALPHA2 f9 +#define ALPHA3 f10 +#define ALPHA4 f11 +#define ALPHA5 f12 +#define ALPHA6 f13 +#define ALPHA7 f14 +#define ALPHA8 f15 + +#define RPREFETCHSIZE ( 8 * 1 + 6) +#define WPREFETCHSIZE ( 8 * 1 + 6) + +#define RPREFETCH lfetch.nt1 +#define WPREFETCH lfetch.excl.nt1 + +#define ALPHA f6 + + PROLOGUE + .prologue + PROFCODE + { .mmi + mov ARLC = ar.lc + } + ;; + mov PR = pr + adds r14 = 16, SP + adds r15 = 24, SP + adds r16 = 32, SP + .body + ;; + +#ifdef XDOUBLE + ld8 X = [r14], 16 + ld8 INCX = [r15], 16 + ;; +#endif + ld8 Y = [r14], 16 + ld8 INCY = [r15], 16 + ;; + ld8 BUFFER = [r14] + ;; + + mov ALPHA = f8 + cmp.ge p7, p0 = 0, M + cmp.ge p6, p0 = 0, N + ;; + shladd INCX = INCX, BASE_SHIFT, r0 + shladd LDA = LDA, BASE_SHIFT, r0 + shladd INCY = INCY, BASE_SHIFT, r0 + ;; + (p7) br.cond.dpnt .L999 + (p6) br.cond.dpnt .L999 + ;; + sub I = A, Y + mov YY = Y + ;; + cmp.eq p10, p0 = SIZE, INCY + (p10) br.cond.dptk .L10 + ;; + shr J = M, 3 + mov YY = BUFFER + ;; + (p8) adds YY = SIZE, BUFFER + ;; + mov ar.lc = J + mov YST1 = YY + adds YST2 = 4 * SIZE, YY + ;; +.L02: + STFD [YST1] = f0, 1 * SIZE + STFD [YST2] = f0, 1 * SIZE + ;; + STFD [YST1] = f0, 1 * SIZE + STFD [YST2] = f0, 1 * SIZE + ;; + STFD [YST1] = f0, 1 * SIZE + STFD [YST2] = f0, 1 * SIZE + ;; + STFD [YST1] = f0, 5 * SIZE + STFD [YST2] = f0, 5 * SIZE + br.cloop.sptk.few .L02 + ;; + +.L10: + shr J = N, 3 + ;; + cmp.eq p6, p0 = r0, J + (p6) br.cond.dpnt .L20 + ;; + .align 16 + +.L11: + shladd LDA7M8 = LDA, 3, r0 + ;; + sub LDA7M8 = LDA, LDA7M8 + ;; + adds LDA7M8 = 8 * SIZE, LDA7M8 + ;; + mov YLD1 = YY + mov YST1 = YY + adds YLD2 = 1 * SIZE, YY + adds YST2 = 1 * SIZE, YY + ;; + LDFD ALPHA1 = [X], INCX + ;; + LDFD ALPHA2 = [X], INCX + ;; + LDFD ALPHA3 = [X], INCX + ;; + LDFD ALPHA4 = [X], INCX + ;; + LDFD ALPHA5 = [X], INCX + ;; + LDFD ALPHA6 = [X], INCX + ;; + LDFD ALPHA7 = [X], INCX + ;; + LDFD ALPHA8 = [X], INCX + ;; + FMPY ALPHA1 = ALPHA, ALPHA1 + FMPY ALPHA2 = ALPHA, ALPHA2 + FMPY ALPHA3 = ALPHA, ALPHA3 + FMPY ALPHA4 = ALPHA, ALPHA4 + FMPY ALPHA5 = ALPHA, ALPHA5 + FMPY ALPHA6 = ALPHA, ALPHA6 + ;; + mov AO1 = A + adds AO2 = 1 * SIZE, A + adds AO3 = 2 * SIZE, A + adds AO4 = 3 * SIZE, A + adds AO5 = 4 * SIZE, A + adds AO6 = 5 * SIZE, A + adds AO7 = 6 * SIZE, A + adds AO8 = 7 * SIZE, A + shladd A = LDA, 3, A + ;; + shr I = M, 3 + mov pr.rot= 0 + ;; + cmp.eq p16, p0 = r0, r0 + ;; + adds I = -1, I + adds J = -1, J + ;; + adds PREB = (WPREFETCHSIZE) * SIZE, YY + ;; + cmp.lt p7, p8 = r0, J + tbit.nz p13, p11 = M, 2 + mov ar.ec= 2 + ;; + FMPY ALPHA7 = ALPHA, ALPHA7 + ;; + { .mfi + and II = 7, M + FMPY ALPHA8 = ALPHA, ALPHA8 + mov ar.lc = I + } + { .mib + cmp.eq p6, p0 = -1, I + tbit.nz p14, p12 = M, 1 + (p6) br.cond.dpnt .L15 + } + ;; + .align 16 + +.L12: + { .mmf + (p17) LDFD f93 = [AO5], LDA7M8 + (p17) LDFD f94 = [AO6], LDA7M8 + (p17) FMA f101 = ALPHA1, f33, f101 + } + { .mmf + (p17) LDFD f95 = [AO7], LDA7M8 + (p17) LDFD f96 = [AO8], LDA7M8 + (p17) FMA f104 = ALPHA1, f34, f104 + } + ;; + { .mmf + (p16) LDFD f32 = [AO1] + (p16) LDFD f33 = [AO2], LDA + (p17) FMA f107 = ALPHA1, f35, f107 + } + { .mmf + (p16) LDFD f34 = [AO3], LDA + (p16) LDFD f35 = [AO4], LDA + (p17) FMA f110 = ALPHA1, f36, f110 + } + ;; + { .mmf + (p16) LDFD f100 = [YLD1], 2 * SIZE + (p16) LDFD f103 = [YLD2], 2 * SIZE + (p17) FMA f113 = ALPHA1, f37, f113 + } + { .mmf + (p16) adds PREA = (RPREFETCHSIZE) * SIZE, AO1 + (p16) add AO1 = AO1, LDA + (p17) FMA f116 = ALPHA1, f38, f116 + } + ;; + { .mmf + (p18) STFD [YST1] = f102, 2 * SIZE + (p18) STFD [YST2] = f105, 2 * SIZE + (p17) FMA f119 = ALPHA1, f39, f119 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f122 = ALPHA1, f40, f122 + } + ;; + { .mmf + (p16) LDFD f36 = [AO5], LDA + (p16) LDFD f37 = [AO6], LDA + (p17) FMA f101 = ALPHA2, f41, f101 + } + { .mmf + (p16) LDFD f38 = [AO7], LDA + (p16) LDFD f39 = [AO8], LDA + (p17) FMA f104 = ALPHA2, f42, f104 + } + ;; + { .mmf + (p16) LDFD f40 = [AO1], LDA + (p16) LDFD f41 = [AO2], LDA + (p17) FMA f107 = ALPHA2, f43, f107 + } + { .mmf + (p16) LDFD f42 = [AO3], LDA + (p16) LDFD f43 = [AO4], LDA + (p17) FMA f110 = ALPHA2, f44, f110 + } + ;; + { .mmf + (p16) LDFD f106 = [YLD1], 2 * SIZE + (p16) LDFD f109 = [YLD2], 2 * SIZE + (p17) FMA f113 = ALPHA2, f45, f113 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f116 = ALPHA2, f46, f116 + } + ;; + { .mmf + (p18) STFD [YST1] = f108, 2 * SIZE + (p18) STFD [YST2] = f111, 2 * SIZE + (p17) FMA f119 = ALPHA2, f47, f119 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f122 = ALPHA2, f48, f122 + } + ;; + { .mmf + (p16) LDFD f44 = [AO5], LDA + (p16) LDFD f45 = [AO6], LDA + (p17) FMA f101 = ALPHA3, f49, f101 + } + { .mmf + (p16) LDFD f46 = [AO7], LDA + (p16) LDFD f47 = [AO8], LDA + (p17) FMA f104 = ALPHA3, f50, f104 + } + ;; + { .mmf + (p16) LDFD f48 = [AO1], LDA + (p16) LDFD f49 = [AO2], LDA + (p17) FMA f107 = ALPHA3, f51, f107 + } + { .mmf + (p16) LDFD f50 = [AO3], LDA + (p16) LDFD f51 = [AO4], LDA + (p17) FMA f110 = ALPHA3, f52, f110 + } + ;; + { .mmf + (p16) LDFD f112 = [YLD1], 2 * SIZE + (p16) LDFD f115 = [YLD2], 2 * SIZE + (p17) FMA f113 = ALPHA3, f53, f113 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f116 = ALPHA3, f54, f116 + } + ;; + { .mmf + (p18) STFD [YST1] = f114, 2 * SIZE + (p18) STFD [YST2] = f117, 2 * SIZE + (p17) FMA f119 = ALPHA3, f55, f119 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f122 = ALPHA3, f56, f122 + } + ;; + { .mmf + (p16) LDFD f52 = [AO5], LDA + (p16) LDFD f53 = [AO6], LDA + (p17) FMA f101 = ALPHA4, f57, f101 + } + { .mmf + (p16) LDFD f54 = [AO7], LDA + (p16) LDFD f55 = [AO8], LDA + (p17) FMA f104 = ALPHA4, f58, f104 + } + ;; + { .mmf + (p16) LDFD f56 = [AO1], LDA + (p16) LDFD f57 = [AO2], LDA + (p17) FMA f107 = ALPHA4, f59, f107 + } + { .mmf + (p16) LDFD f58 = [AO3], LDA + (p16) LDFD f59 = [AO4], LDA + (p17) FMA f110 = ALPHA4, f60, f110 + } + ;; + { .mmf + (p16) LDFD f118 = [YLD1], 2 * SIZE + (p16) LDFD f121 = [YLD2], 2 * SIZE + (p17) FMA f113 = ALPHA4, f61, f113 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f116 = ALPHA4, f62, f116 + } + ;; + { .mmf + (p18) STFD [YST1] = f120, 2 * SIZE + (p18) STFD [YST2] = f123, 2 * SIZE + (p17) FMA f119 = ALPHA4, f63, f119 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f122 = ALPHA4, f64, f122 + } + ;; + { .mmf + (p16) LDFD f60 = [AO5], LDA + (p16) LDFD f61 = [AO6], LDA + (p17) FMA f101 = ALPHA5, f65, f101 + } + { .mmf + (p16) LDFD f62 = [AO7], LDA + (p16) LDFD f63 = [AO8], LDA + (p17) FMA f104 = ALPHA5, f66, f104 + } + ;; + { .mmf + (p16) LDFD f64 = [AO1], LDA + (p16) LDFD f65 = [AO2], LDA + (p17) FMA f107 = ALPHA5, f67, f107 + } + { .mmf + (p16) LDFD f66 = [AO3], LDA + (p16) LDFD f67 = [AO4], LDA + (p17) FMA f110 = ALPHA5, f68, f110 + } + ;; + { .mmf + (p16) WPREFETCH [PREB], 8 * SIZE + nop __LINE__ + (p17) FMA f113 = ALPHA5, f69, f113 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f116 = ALPHA5, f70, f116 + } + ;; + { .mmf + (p16) RPREFETCH [PREA] + nop __LINE__ + (p17) FMA f119 = ALPHA5, f71, f119 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f122 = ALPHA5, f72, f122 + } + ;; + { .mmf + (p16) LDFD f68 = [AO5], LDA + (p16) LDFD f69 = [AO6], LDA + (p17) FMA f101 = ALPHA6, f73, f101 + } + { .mmf + (p16) LDFD f70 = [AO7], LDA + (p16) LDFD f71 = [AO8], LDA + (p17) FMA f104 = ALPHA6, f74, f104 + } + ;; + { .mmf + (p16) LDFD f72 = [AO1], LDA + (p16) LDFD f73 = [AO2], LDA + (p17) FMA f107 = ALPHA6, f75, f107 + } + { .mmf + (p16) LDFD f74 = [AO3], LDA + (p16) LDFD f75 = [AO4], LDA + (p17) FMA f110 = ALPHA6, f76, f110 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f113 = ALPHA6, f77, f113 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f116 = ALPHA6, f78, f116 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f119 = ALPHA6, f79, f119 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f122 = ALPHA6, f80, f122 + } + ;; + { .mmf + (p16) LDFD f76 = [AO5], LDA + (p16) LDFD f77 = [AO6], LDA + (p17) FMA f101 = ALPHA7, f81, f101 + } + { .mmf + (p16) LDFD f78 = [AO7], LDA + (p16) LDFD f79 = [AO8], LDA + (p17) FMA f104 = ALPHA7, f82, f104 + } + ;; + { .mmf + (p16) LDFD f80 = [AO1], LDA + (p16) LDFD f81 = [AO2], LDA + (p17) FMA f107 = ALPHA7, f83, f107 + } + { .mmf + (p16) LDFD f82 = [AO3], LDA + (p16) LDFD f83 = [AO4], LDA + (p17) FMA f110 = ALPHA7, f84, f110 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f113 = ALPHA7, f85, f113 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f116 = ALPHA7, f86, f116 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f119 = ALPHA7, f87, f119 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f122 = ALPHA7, f88, f122 + } + ;; + { .mmf + (p16) LDFD f84 = [AO5], LDA + (p16) LDFD f85 = [AO6], LDA + (p17) FMA f101 = ALPHA8, f89, f101 + } + { .mmf + (p16) LDFD f86 = [AO7], LDA + (p16) LDFD f87 = [AO8], LDA + (p17) FMA f104 = ALPHA8, f90, f104 + } + ;; + { .mmf + (p16) LDFD f88 = [AO1], LDA7M8 + (p16) LDFD f89 = [AO2], LDA7M8 + (p17) FMA f107 = ALPHA8, f91, f107 + } + { .mmf + (p16) LDFD f90 = [AO3], LDA7M8 + (p16) LDFD f91 = [AO4], LDA7M8 + (p17) FMA f110 = ALPHA8, f92, f110 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f113 = ALPHA8, f93, f113 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f116 = ALPHA8, f94, f116 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f119 = ALPHA8, f95, f119 + } + { .mfb + nop __LINE__ + (p17) FMA f122 = ALPHA8, f96, f122 + br.ctop.sptk.few .L12 + } + ;; + { .mmi + (p18) STFD [YST1] = f102, 2 * SIZE + (p18) STFD [YST2] = f105, 2 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) STFD [YST1] = f108, 2 * SIZE + (p18) STFD [YST2] = f111, 2 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) STFD [YST1] = f114, 2 * SIZE + (p18) STFD [YST2] = f117, 2 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) STFD [YST1] = f120, 2 * SIZE + (p18) STFD [YST2] = f123, 2 * SIZE + nop __LINE__ + } + ;; + .align 16 + +.L15: + { .mmi + (p7) cmp.eq.unc p9, p0 = r0, II + (p8) cmp.eq.unc p10, p0 = r0, II + (p11) adds AO5 = - 4 * SIZE, AO5 + } + { .mbb + (p11) adds AO7 = - 4 * SIZE, AO7 + (p9) br.cond.dptk .L11 + (p10) br.cond.dptk .L20 + } + ;; + { .mmi + (p13) LDFD f32 = [AO1], LDA + (p13) LDFD f33 = [AO2], LDA + tbit.nz p15, p0 = M, 0 + } + { .mmi + (p13) LDFD f34 = [AO3], LDA + (p11) adds AO6 = - 4 * SIZE, AO6 + (p12) adds AO7 = - 2 * SIZE, AO7 + } + ;; + (p13) LDFD f35 = [AO4], LDA + (p14) LDFD f36 = [AO5], LDA + (p14) LDFD f37 = [AO6], LDA + (p15) LDFD f38 = [AO7], LDA + ;; + (p13) LDFD f40 = [AO1], LDA + (p13) LDFD f41 = [AO2], LDA + (p13) LDFD f42 = [AO3], LDA + (p13) LDFD f43 = [AO4], LDA + ;; + (p14) LDFD f44 = [AO5], LDA + (p14) LDFD f45 = [AO6], LDA + (p15) LDFD f46 = [AO7], LDA + ;; + (p13) LDFD f48 = [AO1], LDA + (p13) LDFD f49 = [AO2], LDA + (p13) LDFD f50 = [AO3], LDA + (p13) LDFD f51 = [AO4], LDA + ;; + (p14) LDFD f52 = [AO5], LDA + (p14) LDFD f53 = [AO6], LDA + (p15) LDFD f54 = [AO7], LDA + ;; + (p13) LDFD f56 = [AO1], LDA + (p13) LDFD f57 = [AO2], LDA + (p13) LDFD f58 = [AO3], LDA + (p13) LDFD f59 = [AO4], LDA + ;; + (p14) LDFD f60 = [AO5], LDA + (p14) LDFD f61 = [AO6], LDA + (p15) LDFD f62 = [AO7], LDA + ;; + (p13) LDFD f64 = [AO1], LDA + (p13) LDFD f65 = [AO2], LDA + (p13) LDFD f66 = [AO3], LDA + (p13) LDFD f67 = [AO4], LDA + ;; + (p14) LDFD f68 = [AO5], LDA + (p14) LDFD f69 = [AO6], LDA + (p15) LDFD f70 = [AO7], LDA + ;; + (p13) LDFD f72 = [AO1], LDA + (p13) LDFD f73 = [AO2], LDA + (p13) LDFD f74 = [AO3], LDA + (p13) LDFD f75 = [AO4], LDA + ;; + (p14) LDFD f76 = [AO5], LDA + (p14) LDFD f77 = [AO6], LDA + (p15) LDFD f78 = [AO7], LDA + ;; + (p13) LDFD f80 = [AO1], LDA + (p13) LDFD f81 = [AO2], LDA + (p13) LDFD f82 = [AO3], LDA + (p13) LDFD f83 = [AO4], LDA + ;; + (p14) LDFD f84 = [AO5], LDA + (p14) LDFD f85 = [AO6], LDA + (p15) LDFD f86 = [AO7], LDA + ;; + (p13) LDFD f88 = [AO1] + (p13) LDFD f89 = [AO2] + (p13) LDFD f90 = [AO3] + (p13) LDFD f91 = [AO4] + ;; + (p14) LDFD f92 = [AO5] + (p14) LDFD f93 = [AO6] + (p15) LDFD f94 = [AO7] + ;; + (p13) LDFD f96 = [YLD1], 2 * SIZE + (p13) LDFD f97 = [YLD2], 2 * SIZE + ;; + (p13) LDFD f98 = [YLD1], 2 * SIZE + (p13) LDFD f99 = [YLD2], 2 * SIZE + ;; + (p14) LDFD f100 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f101 = [YLD1], 1 * SIZE + ;; + (p15) LDFD f102 = [YLD1], 1 * SIZE + ;; + + (p13) FMA f96 = ALPHA1, f32, f96 + (p13) FMA f97 = ALPHA1, f33, f97 + (p13) FMA f98 = ALPHA1, f34, f98 + (p13) FMA f99 = ALPHA1, f35, f99 + (p14) FMA f100 = ALPHA1, f36, f100 + (p14) FMA f101 = ALPHA1, f37, f101 + (p15) FMA f102 = ALPHA1, f38, f102 + ;; + (p13) FMA f96 = ALPHA2, f40, f96 + (p13) FMA f97 = ALPHA2, f41, f97 + (p13) FMA f98 = ALPHA2, f42, f98 + (p13) FMA f99 = ALPHA2, f43, f99 + (p14) FMA f100 = ALPHA2, f44, f100 + (p14) FMA f101 = ALPHA2, f45, f101 + (p15) FMA f102 = ALPHA2, f46, f102 + ;; + (p13) FMA f96 = ALPHA3, f48, f96 + (p13) FMA f97 = ALPHA3, f49, f97 + (p13) FMA f98 = ALPHA3, f50, f98 + (p13) FMA f99 = ALPHA3, f51, f99 + (p14) FMA f100 = ALPHA3, f52, f100 + (p14) FMA f101 = ALPHA3, f53, f101 + (p15) FMA f102 = ALPHA3, f54, f102 + ;; + (p13) FMA f96 = ALPHA4, f56, f96 + (p13) FMA f97 = ALPHA4, f57, f97 + (p13) FMA f98 = ALPHA4, f58, f98 + (p13) FMA f99 = ALPHA4, f59, f99 + (p14) FMA f100 = ALPHA4, f60, f100 + (p14) FMA f101 = ALPHA4, f61, f101 + (p15) FMA f102 = ALPHA4, f62, f102 + ;; + (p13) FMA f96 = ALPHA5, f64, f96 + (p13) FMA f97 = ALPHA5, f65, f97 + (p13) FMA f98 = ALPHA5, f66, f98 + (p13) FMA f99 = ALPHA5, f67, f99 + (p14) FMA f100 = ALPHA5, f68, f100 + (p14) FMA f101 = ALPHA5, f69, f101 + (p15) FMA f102 = ALPHA5, f70, f102 + ;; + (p13) FMA f96 = ALPHA6, f72, f96 + (p13) FMA f97 = ALPHA6, f73, f97 + (p13) FMA f98 = ALPHA6, f74, f98 + (p13) FMA f99 = ALPHA6, f75, f99 + (p14) FMA f100 = ALPHA6, f76, f100 + (p14) FMA f101 = ALPHA6, f77, f101 + (p15) FMA f102 = ALPHA6, f78, f102 + ;; + (p13) FMA f96 = ALPHA7, f80, f96 + (p13) FMA f97 = ALPHA7, f81, f97 + (p13) FMA f98 = ALPHA7, f82, f98 + (p13) FMA f99 = ALPHA7, f83, f99 + (p14) FMA f100 = ALPHA7, f84, f100 + (p14) FMA f101 = ALPHA7, f85, f101 + (p15) FMA f102 = ALPHA7, f86, f102 + ;; + (p13) FMA f16 = ALPHA8, f88, f96 + (p13) FMA f17 = ALPHA8, f89, f97 + (p13) FMA f18 = ALPHA8, f90, f98 + (p13) FMA f19 = ALPHA8, f91, f99 + (p14) FMA f20 = ALPHA8, f92, f100 + (p14) FMA f21 = ALPHA8, f93, f101 + (p15) FMA f22 = ALPHA8, f94, f102 + ;; + { .mmi + (p13) STFD [YST1] = f16, 2 * SIZE + (p13) STFD [YST2] = f17, 2 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p13) STFD [YST1] = f18, 2 * SIZE + (p13) STFD [YST2] = f19 + nop __LINE__ + } + ;; + { .mmi + (p14) STFD [YST1] = f20, 1 * SIZE + ;; + (p14) STFD [YST1] = f21, 1 * SIZE + nop __LINE__ + } + ;; + { .mib + (p15) STFD [YST1] = f22 + cmp.lt p11, p12 = r0, J + (p11) br.cond.dptk .L11 + } + ;; + .align 16 + +.L20: + tbit.z p6, p0 = N, 2 + ;; + (p6) br.cond.dpnt .L30 + ;; + + shladd LDA7M8 = LDA, 2, r0 + ;; + sub LDA7M8 = LDA, LDA7M8 + ;; + adds LDA7M8 = 8 * SIZE, LDA7M8 + ;; + mov YLD1 = YY + mov YST1 = YY + adds YLD2 = 2 * SIZE, YY + adds YST2 = 2 * SIZE, YY + ;; + LDFD ALPHA1 = [X], INCX + ;; + LDFD ALPHA2 = [X], INCX + ;; + LDFD ALPHA3 = [X], INCX + ;; + LDFD ALPHA4 = [X], INCX + ;; + FMPY ALPHA1 = ALPHA, ALPHA1 + FMPY ALPHA2 = ALPHA, ALPHA2 + FMPY ALPHA3 = ALPHA, ALPHA3 + FMPY ALPHA4 = ALPHA, ALPHA4 + ;; + mov AO1 = A + adds AO2 = 1 * SIZE, A + adds AO3 = 2 * SIZE, A + adds AO4 = 3 * SIZE, A + adds AO5 = 4 * SIZE, A + adds AO6 = 5 * SIZE, A + adds AO7 = 6 * SIZE, A + adds AO8 = 7 * SIZE, A + shladd A = LDA, 2, A + ;; + shr I = M, 3 + mov pr.rot= 0 + ;; + cmp.eq p16, p0 = r0, r0 + ;; + adds I = -1, I + adds J = -1, J + ;; + cmp.lt p7, p8 = r0, J + tbit.nz p13, p11 = M, 2 + tbit.nz p14, p12 = M, 1 + mov ar.ec= 1 + ;; + { .mfi + and II = 7, M + mov ar.lc = I + } + { .mfb + cmp.eq p6, p0 = -1, I + (p6) br.cond.dpnt .L25 + } + ;; + .align 16 + +.L22: + (p16) LDFD f32 = [AO1], LDA + (p16) LDFD f34 = [AO3], LDA + (p16) LDFD f36 = [AO5], LDA + (p16) LDFD f38 = [AO7], LDA + ;; + (p16) LDFD f33 = [AO2], LDA + (p16) LDFD f35 = [AO4], LDA + (p16) LDFD f37 = [AO6], LDA + (p16) LDFD f39 = [AO8], LDA + ;; + (p16) LDFD f40 = [AO1], LDA + (p16) LDFD f42 = [AO3], LDA + (p16) LDFD f44 = [AO5], LDA + (p16) LDFD f46 = [AO7], LDA + ;; + (p16) LDFD f41 = [AO2], LDA + (p16) LDFD f43 = [AO4], LDA + (p16) LDFD f45 = [AO6], LDA + (p16) LDFD f47 = [AO8], LDA + ;; + (p16) LDFD f48 = [AO1], LDA + (p16) LDFD f50 = [AO3], LDA + (p16) LDFD f52 = [AO5], LDA + (p16) LDFD f54 = [AO7], LDA + ;; + (p16) LDFD f49 = [AO2], LDA + (p16) LDFD f51 = [AO4], LDA + (p16) LDFD f53 = [AO6], LDA + (p16) LDFD f55 = [AO8], LDA + ;; + (p16) LDFD f56 = [AO1], LDA7M8 + (p16) LDFD f58 = [AO3], LDA7M8 + (p16) LDFD f60 = [AO5], LDA7M8 + (p16) LDFD f62 = [AO7], LDA7M8 + ;; + (p16) LDFD f57 = [AO2], LDA7M8 + (p16) LDFD f59 = [AO4], LDA7M8 + (p16) LDFD f61 = [AO6], LDA7M8 + (p16) LDFD f63 = [AO8], LDA7M8 + ;; + (p16) LDFD f96 = [YLD1], 1 * SIZE + (p16) LDFD f98 = [YLD2], 1 * SIZE + ;; + (p16) LDFD f97 = [YLD1], 3 * SIZE + (p16) LDFD f99 = [YLD2], 3 * SIZE + ;; + (p16) LDFD f100 = [YLD1], 1 * SIZE + (p16) LDFD f102 = [YLD2], 1 * SIZE + ;; + (p16) LDFD f101 = [YLD1], 3 * SIZE + (p16) LDFD f103 = [YLD2], 3 * SIZE + ;; + (p16) FMA f96 = ALPHA1, f32, f96 + (p16) FMA f98 = ALPHA1, f34, f98 + (p16) FMA f97 = ALPHA1, f33, f97 + (p16) FMA f99 = ALPHA1, f35, f99 + (p16) FMA f100 = ALPHA1, f36, f100 + (p16) FMA f102 = ALPHA1, f38, f102 + (p16) FMA f101 = ALPHA1, f37, f101 + (p16) FMA f103 = ALPHA1, f39, f103 + ;; + (p16) FMA f96 = ALPHA2, f40, f96 + (p16) FMA f98 = ALPHA2, f42, f98 + (p16) FMA f97 = ALPHA2, f41, f97 + (p16) FMA f99 = ALPHA2, f43, f99 + (p16) FMA f100 = ALPHA2, f44, f100 + (p16) FMA f102 = ALPHA2, f46, f102 + (p16) FMA f101 = ALPHA2, f45, f101 + (p16) FMA f103 = ALPHA2, f47, f103 + ;; + (p16) FMA f96 = ALPHA3, f48, f96 + (p16) FMA f98 = ALPHA3, f50, f98 + (p16) FMA f97 = ALPHA3, f49, f97 + (p16) FMA f99 = ALPHA3, f51, f99 + (p16) FMA f100 = ALPHA3, f52, f100 + (p16) FMA f102 = ALPHA3, f54, f102 + (p16) FMA f101 = ALPHA3, f53, f101 + (p16) FMA f103 = ALPHA3, f55, f103 + ;; + (p16) FMA f16 = ALPHA4, f56, f96 + (p16) FMA f18 = ALPHA4, f58, f98 + (p16) FMA f17 = ALPHA4, f57, f97 + (p16) FMA f19 = ALPHA4, f59, f99 + (p16) FMA f20 = ALPHA4, f60, f100 + (p16) FMA f22 = ALPHA4, f62, f102 + (p16) FMA f21 = ALPHA4, f61, f101 + (p16) FMA f23 = ALPHA4, f63, f103 + ;; + (p16) STFD [YST1] = f16, 1 * SIZE + (p16) STFD [YST2] = f18, 1 * SIZE + ;; + (p16) STFD [YST1] = f17, 3 * SIZE + (p16) STFD [YST2] = f19, 3 * SIZE + ;; + (p16) STFD [YST1] = f20, 1 * SIZE + (p16) STFD [YST2] = f22, 1 * SIZE + ;; + (p16) STFD [YST1] = f21, 3 * SIZE + (p16) STFD [YST2] = f23, 3 * SIZE + br.ctop.sptk.few .L22 + ;; + .align 16 + +.L25: + { .mmi + (p8) cmp.eq.unc p10, p0 = r0, II + (p11) adds AO5 = - 4 * SIZE, AO5 + } + { .mbb + (p11) adds AO7 = - 4 * SIZE, AO7 + (p10) br.cond.dptk .L30 + } + ;; + { .mmi + (p13) LDFD f32 = [AO1], LDA + (p13) LDFD f34 = [AO3], LDA + tbit.nz p15, p0 = M, 0 + } + { .mmi + (p14) LDFD f36 = [AO5], LDA + (p11) adds AO6 = - 4 * SIZE, AO6 + (p12) adds AO7 = - 2 * SIZE, AO7 + } + ;; + (p13) LDFD f33 = [AO2], LDA + (p13) LDFD f35 = [AO4], LDA + (p14) LDFD f37 = [AO6], LDA + (p15) LDFD f38 = [AO7], LDA + ;; + (p13) LDFD f40 = [AO1], LDA + (p13) LDFD f42 = [AO3], LDA + (p14) LDFD f44 = [AO5], LDA + (p15) LDFD f46 = [AO7], LDA + ;; + (p13) LDFD f41 = [AO2], LDA + (p13) LDFD f43 = [AO4], LDA + (p14) LDFD f45 = [AO6], LDA + ;; + (p13) LDFD f48 = [AO1], LDA + (p13) LDFD f50 = [AO3], LDA + (p14) LDFD f52 = [AO5], LDA + (p15) LDFD f54 = [AO7], LDA + ;; + (p13) LDFD f49 = [AO2], LDA + (p13) LDFD f51 = [AO4], LDA + (p14) LDFD f53 = [AO6], LDA + ;; + (p13) LDFD f56 = [AO1] + (p13) LDFD f58 = [AO3] + (p14) LDFD f60 = [AO5] + (p15) LDFD f62 = [AO7] + ;; + (p13) LDFD f57 = [AO2] + (p13) LDFD f59 = [AO4] + (p14) LDFD f61 = [AO6] + ;; + (p13) LDFD f96 = [YLD1], 1 * SIZE + (p13) LDFD f98 = [YLD2], 1 * SIZE + ;; + (p13) LDFD f97 = [YLD1], 3 * SIZE + (p13) LDFD f99 = [YLD2], 3 * SIZE + ;; + (p14) LDFD f100 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f101 = [YLD1], 1 * SIZE + ;; + (p15) LDFD f102 = [YLD1], 1 * SIZE + ;; + + (p13) FMA f96 = ALPHA1, f32, f96 + (p13) FMA f98 = ALPHA1, f34, f98 + (p13) FMA f97 = ALPHA1, f33, f97 + (p13) FMA f99 = ALPHA1, f35, f99 + (p14) FMA f100 = ALPHA1, f36, f100 + (p15) FMA f102 = ALPHA1, f38, f102 + (p14) FMA f101 = ALPHA1, f37, f101 + ;; + (p13) FMA f96 = ALPHA2, f40, f96 + (p13) FMA f98 = ALPHA2, f42, f98 + (p13) FMA f97 = ALPHA2, f41, f97 + (p13) FMA f99 = ALPHA2, f43, f99 + (p14) FMA f100 = ALPHA2, f44, f100 + (p15) FMA f102 = ALPHA2, f46, f102 + (p14) FMA f101 = ALPHA2, f45, f101 + ;; + (p13) FMA f96 = ALPHA3, f48, f96 + (p13) FMA f98 = ALPHA3, f50, f98 + (p13) FMA f97 = ALPHA3, f49, f97 + (p13) FMA f99 = ALPHA3, f51, f99 + (p14) FMA f100 = ALPHA3, f52, f100 + (p15) FMA f102 = ALPHA3, f54, f102 + (p14) FMA f101 = ALPHA3, f53, f101 + ;; + (p13) FMA f16 = ALPHA4, f56, f96 + (p13) FMA f18 = ALPHA4, f58, f98 + (p13) FMA f17 = ALPHA4, f57, f97 + (p13) FMA f19 = ALPHA4, f59, f99 + (p14) FMA f20 = ALPHA4, f60, f100 + (p15) FMA f22 = ALPHA4, f62, f102 + (p14) FMA f21 = ALPHA4, f61, f101 + ;; + { .mmi + (p13) STFD [YST1] = f16, 1 * SIZE + (p13) STFD [YST2] = f18, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p13) STFD [YST1] = f17, 3 * SIZE + (p13) STFD [YST2] = f19 + nop __LINE__ + } + ;; + { .mmi + (p14) STFD [YST1] = f20, 1 * SIZE + ;; + (p14) STFD [YST1] = f21, 1 * SIZE + nop __LINE__ + } + ;; + { .mib + (p15) STFD [YST1] = f22 + } + ;; + .align 16 + +.L30: + tbit.z p6, p0 = N, 1 + ;; + (p6) br.cond.dpnt .L40 + ;; + + shladd LDA7M8 = LDA, 1, r0 + ;; + sub LDA7M8 = LDA, LDA7M8 + ;; + adds LDA7M8 = 8 * SIZE, LDA7M8 + ;; + mov YLD1 = YY + mov YST1 = YY + adds YLD2 = 2 * SIZE, YY + adds YST2 = 2 * SIZE, YY + ;; + LDFD ALPHA1 = [X], INCX + ;; + LDFD ALPHA2 = [X], INCX + ;; + FMPY ALPHA1 = ALPHA, ALPHA1 + FMPY ALPHA2 = ALPHA, ALPHA2 + ;; + mov AO1 = A + adds AO2 = 1 * SIZE, A + adds AO3 = 2 * SIZE, A + adds AO4 = 3 * SIZE, A + adds AO5 = 4 * SIZE, A + adds AO6 = 5 * SIZE, A + adds AO7 = 6 * SIZE, A + adds AO8 = 7 * SIZE, A + shladd A = LDA, 1, A + ;; + shr I = M, 3 + mov pr.rot= 0 + ;; + cmp.eq p16, p0 = r0, r0 + ;; + adds I = -1, I + adds J = -1, J + ;; + cmp.lt p7, p8 = r0, J + tbit.nz p13, p11 = M, 2 + tbit.nz p14, p12 = M, 1 + mov ar.ec= 1 + ;; + { .mfi + and II = 7, M + mov ar.lc = I + } + { .mfb + cmp.eq p6, p0 = -1, I + (p6) br.cond.dpnt .L35 + } + ;; + .align 16 + +.L32: + (p16) LDFD f32 = [AO1], LDA + (p16) LDFD f34 = [AO3], LDA + (p16) LDFD f36 = [AO5], LDA + (p16) LDFD f38 = [AO7], LDA + ;; + (p16) LDFD f33 = [AO2], LDA + (p16) LDFD f35 = [AO4], LDA + (p16) LDFD f37 = [AO6], LDA + (p16) LDFD f39 = [AO8], LDA + ;; + (p16) LDFD f40 = [AO1], LDA7M8 + (p16) LDFD f42 = [AO3], LDA7M8 + (p16) LDFD f44 = [AO5], LDA7M8 + (p16) LDFD f46 = [AO7], LDA7M8 + ;; + (p16) LDFD f41 = [AO2], LDA7M8 + (p16) LDFD f43 = [AO4], LDA7M8 + (p16) LDFD f45 = [AO6], LDA7M8 + (p16) LDFD f47 = [AO8], LDA7M8 + ;; + (p16) LDFD f96 = [YLD1], 1 * SIZE + (p16) LDFD f98 = [YLD2], 1 * SIZE + ;; + (p16) LDFD f97 = [YLD1], 3 * SIZE + (p16) LDFD f99 = [YLD2], 3 * SIZE + ;; + (p16) LDFD f100 = [YLD1], 1 * SIZE + (p16) LDFD f102 = [YLD2], 1 * SIZE + ;; + (p16) LDFD f101 = [YLD1], 3 * SIZE + (p16) LDFD f103 = [YLD2], 3 * SIZE + ;; + (p16) FMA f96 = ALPHA1, f32, f96 + (p16) FMA f98 = ALPHA1, f34, f98 + (p16) FMA f97 = ALPHA1, f33, f97 + (p16) FMA f99 = ALPHA1, f35, f99 + (p16) FMA f100 = ALPHA1, f36, f100 + (p16) FMA f102 = ALPHA1, f38, f102 + (p16) FMA f101 = ALPHA1, f37, f101 + (p16) FMA f103 = ALPHA1, f39, f103 + ;; + (p16) FMA f16 = ALPHA2, f40, f96 + (p16) FMA f18 = ALPHA2, f42, f98 + (p16) FMA f17 = ALPHA2, f41, f97 + (p16) FMA f19 = ALPHA2, f43, f99 + (p16) FMA f20 = ALPHA2, f44, f100 + (p16) FMA f22 = ALPHA2, f46, f102 + (p16) FMA f21 = ALPHA2, f45, f101 + (p16) FMA f23 = ALPHA2, f47, f103 + ;; + (p16) STFD [YST1] = f16, 1 * SIZE + (p16) STFD [YST2] = f18, 1 * SIZE + ;; + (p16) STFD [YST1] = f17, 3 * SIZE + (p16) STFD [YST2] = f19, 3 * SIZE + ;; + (p16) STFD [YST1] = f20, 1 * SIZE + (p16) STFD [YST2] = f22, 1 * SIZE + ;; + (p16) STFD [YST1] = f21, 3 * SIZE + (p16) STFD [YST2] = f23, 3 * SIZE + br.ctop.sptk.few .L32 + ;; + .align 16 + +.L35: + { .mmi + (p8) cmp.eq.unc p10, p0 = r0, II + (p11) adds AO5 = - 4 * SIZE, AO5 + } + { .mbb + (p11) adds AO7 = - 4 * SIZE, AO7 + (p10) br.cond.dptk .L40 + } + ;; + { .mmi + (p13) LDFD f32 = [AO1], LDA + (p13) LDFD f34 = [AO3], LDA + tbit.nz p15, p0 = M, 0 + } + { .mmi + (p14) LDFD f36 = [AO5], LDA + (p11) adds AO6 = - 4 * SIZE, AO6 + (p12) adds AO7 = - 2 * SIZE, AO7 + } + ;; + (p13) LDFD f33 = [AO2], LDA + (p13) LDFD f35 = [AO4], LDA + (p14) LDFD f37 = [AO6], LDA + (p15) LDFD f38 = [AO7], LDA + ;; + (p13) LDFD f40 = [AO1], LDA + (p13) LDFD f42 = [AO3], LDA + (p14) LDFD f44 = [AO5], LDA + (p15) LDFD f46 = [AO7], LDA + ;; + (p13) LDFD f41 = [AO2] + (p13) LDFD f43 = [AO4] + (p14) LDFD f45 = [AO6] + ;; + (p13) LDFD f96 = [YLD1], 1 * SIZE + (p13) LDFD f98 = [YLD2], 1 * SIZE + ;; + (p13) LDFD f97 = [YLD1], 3 * SIZE + (p13) LDFD f99 = [YLD2], 3 * SIZE + ;; + (p14) LDFD f100 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f101 = [YLD1], 1 * SIZE + ;; + (p15) LDFD f102 = [YLD1], 1 * SIZE + ;; + + (p13) FMA f96 = ALPHA1, f32, f96 + (p13) FMA f98 = ALPHA1, f34, f98 + (p13) FMA f97 = ALPHA1, f33, f97 + (p13) FMA f99 = ALPHA1, f35, f99 + (p14) FMA f100 = ALPHA1, f36, f100 + (p15) FMA f102 = ALPHA1, f38, f102 + (p14) FMA f101 = ALPHA1, f37, f101 + ;; + (p13) FMA f16 = ALPHA2, f40, f96 + (p13) FMA f18 = ALPHA2, f42, f98 + (p13) FMA f17 = ALPHA2, f41, f97 + (p13) FMA f19 = ALPHA2, f43, f99 + (p14) FMA f20 = ALPHA2, f44, f100 + (p15) FMA f22 = ALPHA2, f46, f102 + (p14) FMA f21 = ALPHA2, f45, f101 + ;; + { .mmi + (p13) STFD [YST1] = f16, 1 * SIZE + (p13) STFD [YST2] = f18, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p13) STFD [YST1] = f17, 3 * SIZE + (p13) STFD [YST2] = f19 + nop __LINE__ + } + ;; + { .mmi + (p14) STFD [YST1] = f20, 1 * SIZE + ;; + (p14) STFD [YST1] = f21, 1 * SIZE + nop __LINE__ + } + ;; + { .mib + (p15) STFD [YST1] = f22 + } + ;; + .align 16 + +.L40: + tbit.z p6, p0 = N, 0 + ;; + (p6) br.cond.dpnt .L990 + ;; + mov LDA7M8 = 8 * SIZE + ;; + mov YLD1 = YY + mov YST1 = YY + adds YLD2 = 2 * SIZE, YY + adds YST2 = 2 * SIZE, YY + ;; + LDFD ALPHA1 = [X], INCX + ;; + LDFD ALPHA2 = [X], INCX + ;; + FMPY ALPHA1 = ALPHA, ALPHA1 + FMPY ALPHA2 = ALPHA, ALPHA2 + ;; + mov AO1 = A + adds AO2 = 1 * SIZE, A + adds AO3 = 2 * SIZE, A + adds AO4 = 3 * SIZE, A + adds AO5 = 4 * SIZE, A + adds AO6 = 5 * SIZE, A + adds AO7 = 6 * SIZE, A + adds AO8 = 7 * SIZE, A + add A = LDA, A + ;; + shr I = M, 3 + mov pr.rot= 0 + ;; + cmp.eq p16, p0 = r0, r0 + ;; + adds I = -1, I + adds J = -1, J + ;; + cmp.lt p7, p8 = r0, J + tbit.nz p13, p11 = M, 2 + tbit.nz p14, p12 = M, 1 + mov ar.ec= 1 + ;; + { .mfi + and II = 7, M + mov ar.lc = I + } + { .mfb + cmp.eq p6, p0 = -1, I + (p6) br.cond.dpnt .L45 + } + ;; + .align 16 + +.L42: + (p16) LDFD f32 = [AO1], 8 * SIZE + (p16) LDFD f34 = [AO3], 8 * SIZE + (p16) LDFD f36 = [AO5], 8 * SIZE + (p16) LDFD f38 = [AO7], 8 * SIZE + ;; + (p16) LDFD f33 = [AO2], 8 * SIZE + (p16) LDFD f35 = [AO4], 8 * SIZE + (p16) LDFD f37 = [AO6], 8 * SIZE + (p16) LDFD f39 = [AO8], 8 * SIZE + ;; + (p16) LDFD f96 = [YLD1], 1 * SIZE + (p16) LDFD f98 = [YLD2], 1 * SIZE + ;; + (p16) LDFD f97 = [YLD1], 3 * SIZE + (p16) LDFD f99 = [YLD2], 3 * SIZE + ;; + (p16) LDFD f100 = [YLD1], 1 * SIZE + (p16) LDFD f102 = [YLD2], 1 * SIZE + ;; + (p16) LDFD f101 = [YLD1], 3 * SIZE + (p16) LDFD f103 = [YLD2], 3 * SIZE + ;; + (p16) FMA f16 = ALPHA1, f32, f96 + (p16) FMA f18 = ALPHA1, f34, f98 + (p16) FMA f17 = ALPHA1, f33, f97 + (p16) FMA f19 = ALPHA1, f35, f99 + (p16) FMA f20 = ALPHA1, f36, f100 + (p16) FMA f22 = ALPHA1, f38, f102 + (p16) FMA f21 = ALPHA1, f37, f101 + (p16) FMA f23 = ALPHA1, f39, f103 + ;; + (p16) STFD [YST1] = f16, 1 * SIZE + (p16) STFD [YST2] = f18, 1 * SIZE + ;; + (p16) STFD [YST1] = f17, 3 * SIZE + (p16) STFD [YST2] = f19, 3 * SIZE + ;; + (p16) STFD [YST1] = f20, 1 * SIZE + (p16) STFD [YST2] = f22, 1 * SIZE + ;; + (p16) STFD [YST1] = f21, 3 * SIZE + (p16) STFD [YST2] = f23, 3 * SIZE + br.ctop.sptk.few .L42 + ;; + .align 16 + +.L45: + { .mmi + (p8) cmp.eq.unc p10, p0 = r0, II + (p11) adds AO5 = - 4 * SIZE, AO5 + } + { .mbb + (p11) adds AO7 = - 4 * SIZE, AO7 + (p10) br.cond.dptk .L990 + } + ;; + { .mmi + (p13) LDFD f32 = [AO1], LDA + (p13) LDFD f34 = [AO3], LDA + tbit.nz p15, p0 = M, 0 + } + { .mmi + (p14) LDFD f36 = [AO5], LDA + (p11) adds AO6 = - 4 * SIZE, AO6 + (p12) adds AO7 = - 2 * SIZE, AO7 + } + ;; + (p13) LDFD f33 = [AO2], LDA + (p13) LDFD f35 = [AO4], LDA + (p14) LDFD f37 = [AO6], LDA + (p15) LDFD f38 = [AO7], LDA + ;; + (p13) LDFD f96 = [YLD1], 1 * SIZE + (p13) LDFD f98 = [YLD2], 1 * SIZE + ;; + (p13) LDFD f97 = [YLD1], 3 * SIZE + (p13) LDFD f99 = [YLD2], 3 * SIZE + ;; + (p14) LDFD f100 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f101 = [YLD1], 1 * SIZE + ;; + (p15) LDFD f102 = [YLD1], 1 * SIZE + ;; + + (p13) FMA f16 = ALPHA1, f32, f96 + (p13) FMA f18 = ALPHA1, f34, f98 + (p13) FMA f17 = ALPHA1, f33, f97 + (p13) FMA f19 = ALPHA1, f35, f99 + (p14) FMA f20 = ALPHA1, f36, f100 + (p15) FMA f22 = ALPHA1, f38, f102 + (p14) FMA f21 = ALPHA1, f37, f101 + ;; + { .mmi + (p13) STFD [YST1] = f16, 1 * SIZE + (p13) STFD [YST2] = f18, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p13) STFD [YST1] = f17, 3 * SIZE + (p13) STFD [YST2] = f19 + nop __LINE__ + } + ;; + { .mmi + (p14) STFD [YST1] = f20, 1 * SIZE + ;; + (p14) STFD [YST1] = f21, 1 * SIZE + nop __LINE__ + } + ;; + { .mib + (p15) STFD [YST1] = f22 + } + ;; + .align 16 + + +.L990: + cmp.eq p10, p0 = SIZE, INCY + ;; + { .mmi + mov YLD1 = YY + mov YST1 = Y + mov pr.rot= 0 + } + { .mib + mov YST2 = Y + shr J = M, 3 + (p10) br.cond.dptk .L999 + } + ;; + { .mmi + cmp.eq p6, p0 = r0, J + adds J = -1, J + mov ar.ec = 4 + } + { .mmi + cmp.eq p16, p0 = r0, r0 + nop __LINE__ + tbit.nz p13, p0 = M, 2 + } + ;; + { .mib + nop __LINE__ + mov ar.lc = J + (p6) br.cond.dpnt .L995 + } + ;; +.L992: + { .mfi + (p19) STFD [YST2] = f35 + (p18) FADD f34 = f34, f66 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f64 = [YLD1], 1 * SIZE + (p16) LDFD f32 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f39 + (p18) FADD f38 = f38, f70 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f36 = [YST1], INCY + (p16) LDFD f68 = [YLD1], 1 * SIZE + } + ;; + { .mfi + (p19) STFD [YST2] = f43 + (p18) FADD f42 = f42, f74 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f72 = [YLD1], 1 * SIZE + (p16) LDFD f40 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f47 + (p18) FADD f46 = f46, f78 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f76 = [YLD1], 1 * SIZE + (p16) LDFD f44 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f51 + (p18) FADD f50 = f50, f82 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f80 = [YLD1], 1 * SIZE + (p16) LDFD f48 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f55 + (p18) FADD f54 = f54, f86 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f84 = [YLD1], 1 * SIZE + (p16) LDFD f52 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f59 + (p18) FADD f58 = f58, f90 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f88 = [YLD1], 1 * SIZE + (p16) LDFD f56 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f63 + (p18) FADD f62 = f62, f94 + (p19) add YST2 = YST2, INCY + } + { .mmb + (p16) LDFD f92 = [YLD1], 1 * SIZE + (p16) LDFD f60 = [YST1], INCY + br.ctop.sptk.few .L992 + } + ;; + +.L995: + (p13) LDFD f32 = [YST1], INCY + (p13) LDFD f40 = [YLD1], 1 * SIZE + tbit.nz p14, p0 = M, 1 + ;; + (p13) LDFD f33 = [YST1], INCY + (p13) LDFD f41 = [YLD1], 1 * SIZE + tbit.nz p15, p0 = M, 0 + ;; + (p13) LDFD f34 = [YST1], INCY + (p13) LDFD f42 = [YLD1], 1 * SIZE + ;; + (p13) LDFD f35 = [YST1], INCY + (p13) LDFD f43 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f36 = [YST1], INCY + (p14) LDFD f44 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f37 = [YST1], INCY + (p14) LDFD f45 = [YLD1], 1 * SIZE + ;; + (p15) LDFD f38 = [YST1], INCY + (p15) LDFD f46 = [YLD1], 1 * SIZE + ;; + (p13) FADD f32 = f32, f40 + (p13) FADD f33 = f33, f41 + (p13) FADD f34 = f34, f42 + (p13) FADD f35 = f35, f43 + (p14) FADD f36 = f36, f44 + (p14) FADD f37 = f37, f45 + (p15) FADD f38 = f38, f46 + ;; + (p13) STFD [YST2] = f32 + (p13) add YST2 = YST2, INCY + ;; + (p13) STFD [YST2] = f33 + (p13) add YST2 = YST2, INCY + ;; + (p13) STFD [YST2] = f34 + (p13) add YST2 = YST2, INCY + ;; + (p13) STFD [YST2] = f35 + (p13) add YST2 = YST2, INCY + ;; + (p14) STFD [YST2] = f36 + (p14) add YST2 = YST2, INCY + ;; + (p14) STFD [YST2] = f37 + (p14) add YST2 = YST2, INCY + ;; + (p15) STFD [YST2] = f38 + ;; + +.L999: + mov ar.lc = ARLC + mov pr = PR, -1 + br.ret.sptk.many b0 + ;; + EPILOGUE diff --git a/kernel/ia64/qgemv_t.S b/kernel/ia64/qgemv_t.S new file mode 100644 index 0000000..f3fc693 --- /dev/null +++ b/kernel/ia64/qgemv_t.S @@ -0,0 +1,1287 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define SP r12 + +#define M r32 +#define N r33 +#ifndef XDOUBLE +#define A r36 +#define LDA r37 +#define X1 r38 +#define INCX r39 +#define Y1 r34 +#define INCY r35 +#else +#define A r38 +#define LDA r39 +#define X1 r34 +#define INCX r35 +#define Y1 r36 +#define INCY r37 +#endif + +#define BUFFER r11 + +#define I r15 +#define J r16 +#define AO1 r17 +#define AO2 r18 +#define AO3 r19 +#define AO4 r20 +#define AO5 r21 +#define AO6 r22 +#define AO7 r23 +#define AO8 r24 +#define X2 r25 +#define Y2 r26 +#define LDA7M8 r27 +#define INCX5 r28 +#define INCY5 r29 + +#define YY1 r8 +#define YY2 r9 + +#define ARLC r30 +#define PR r31 + +#ifdef DOUBLE +#define RPREFETCH (16 * 3 + 8) +#else +#define RPREFETCH (16 * 3 + 16) +#endif +#define PREFETCH lfetch.nt1 + +#define ALPHA f6 + + PROLOGUE + .prologue + PROFCODE + { .mmi + mov ARLC = ar.lc + } + { .mmi + adds r15 = 24, SP + adds r14 = 16, SP + } + ;; +#ifdef XDOUBLE + ld8 X1 = [r14], 16 + ld8 INCX = [r15], 16 + ;; +#endif + ld8 Y1 = [r14], 16 + ld8 INCY = [r15], 16 + ;; + ld8 BUFFER = [r14] + ;; + mov PR = pr + ;; + mov ALPHA = f8 + .body + ;; + { .mmi + cmp.ge p7, p0 = r0, M + cmp.ge p6, p0 = r0, N + } + ;; + { .mmi + shladd INCX = INCX, BASE_SHIFT, r0 + shladd INCY = INCY, BASE_SHIFT, r0 + shladd LDA = LDA, BASE_SHIFT, r0 + } + ;; + { .mbb + (p7) br.cond.dpnt .L999 + (p6) br.cond.dpnt .L999 + } + .align 16 + ;; + shladd INCY5 = INCY, 2, INCY + shladd INCX5 = INCX, 2, INCX + cmp.eq p10, p0 = SIZE, INCX + ;; + (p10) mov BUFFER = X1 + (p10) br.cond.dptk .L10 + ;; + + + mov pr.rot= 0 + shladd X2 = INCX, 2, X1 + mov YY1 = BUFFER + adds YY2 = 4 * SIZE, BUFFER + ;; + shr I = M, 3 + ;; + { .mmi + adds I = -1, I + cmp.eq p16, p0 = r0, r0 + mov ar.ec= 5 + } + ;; + { .mmi + mov ar.lc = I + } + { .mib + cmp.gt p6, p0 = 0, I + tbit.nz p13, p0 = M, 2 + (p6) br.cond.dpnt .L05 + } + ;; + .align 16 + +.L01: + (p20) STFD [YY1] = f36, SIZE + (p20) STFD [YY2] = f56, SIZE + (p16) LDFD f32 = [X1], INCX + (p16) LDFD f52 = [X2], INCX + ;; + (p20) STFD [YY1] = f41, SIZE + (p20) STFD [YY2] = f61, SIZE + (p16) LDFD f37 = [X1], INCX + (p16) LDFD f57 = [X2], INCX + ;; + (p20) STFD [YY1] = f46, SIZE + (p20) STFD [YY2] = f66, SIZE + (p16) LDFD f42 = [X1], INCX + (p16) LDFD f62 = [X2], INCX + ;; + (p20) STFD [YY1] = f51, 5 * SIZE + (p20) STFD [YY2] = f71, 5 * SIZE + (p16) LDFD f47 = [X1], INCX5 + (p16) LDFD f67 = [X2], INCX5 + br.ctop.sptk.few .L01 + ;; + .align 16 + +.L05: + (p13) LDFD f32 = [X1], INCX + tbit.nz p14, p0 = M, 1 + ;; + (p13) LDFD f33 = [X1], INCX + tbit.nz p15, p0 = M, 0 + ;; + (p13) LDFD f34 = [X1], INCX + ;; + (p13) LDFD f35 = [X1], INCX + ;; + (p14) LDFD f36 = [X1], INCX + ;; + (p13) STFD [YY1] = f32, SIZE + (p14) LDFD f37 = [X1], INCX + ;; + (p13) STFD [YY1] = f33, SIZE + (p15) LDFD f38 = [X1], INCX + ;; + (p13) STFD [YY1] = f34, SIZE + ;; + (p13) STFD [YY1] = f35, SIZE + ;; + (p14) STFD [YY1] = f36, SIZE + ;; + (p14) STFD [YY1] = f37, SIZE + ;; + (p15) STFD [YY1] = f38, SIZE + ;; + .align 16 + +.L10: + mov YY1 = Y1 + shladd Y2 = INCY, 2, Y1 + shladd YY2 = INCY, 2, Y1 + ;; + { .mmi + nop __LINE__ + shr J = N, 3 + } + ;; + { .mib + nop __LINE__ + cmp.eq p6, p0 = r0, J + (p6) br.cond.dpnt .L20 + } + ;; + .align 16 + +.L11: + mov AO1 = A + adds AO2 = 1 * SIZE, A + adds AO3 = 2 * SIZE, A + adds AO4 = 3 * SIZE, A + adds AO5 = 4 * SIZE, A + adds AO6 = 5 * SIZE, A + adds AO7 = 6 * SIZE, A + adds AO8 = 7 * SIZE, A + shladd A = LDA, 3, A + ;; + shladd LDA7M8 = LDA, 3, r0 + ;; + sub LDA7M8 = LDA, LDA7M8 + ;; + adds LDA7M8 = 8 * SIZE, LDA7M8 + ;; + mov f8 = f0 + mov f9 = f0 + mov f10 = f0 + mov f11 = f0 + mov f12 = f0 + mov f13 = f0 + mov f14 = f0 + mov f15 = f0 + + mov pr.rot= 0 + shr I = M, 3 + mov ar.ec = 2 + ;; + mov X1 = BUFFER + adds X2 = 2 * SIZE, BUFFER + ;; + cmp.eq p16, p0 = r0, r0 + ;; + adds I = -1, I + ;; + mov ar.lc = I + cmp.eq p6, p0 = -1, I + (p6) br.cond.dpnt .L15 + ;; + .align 16 + +.L12: + (p16) LDFD f32 = [AO1], LDA + (p16) LDFD f34 = [AO3], LDA + (p16) LDFD f36 = [AO5], LDA + (p16) LDFD f38 = [AO7], LDA + ;; + (p16) LDFD f33 = [AO2], LDA + (p16) LDFD f35 = [AO4], LDA + (p16) LDFD f37 = [AO6], LDA + (p16) LDFD f39 = [AO8], LDA + ;; + (p16) LDFD f40 = [AO1], LDA + (p16) LDFD f42 = [AO3], LDA + (p16) LDFD f44 = [AO5], LDA + (p16) LDFD f46 = [AO7], LDA + ;; + (p16) LDFD f41 = [AO2], LDA + (p16) LDFD f43 = [AO4], LDA + (p16) LDFD f45 = [AO6], LDA + (p16) LDFD f47 = [AO8], LDA + ;; + (p16) LDFD f48 = [AO1], LDA + (p16) LDFD f50 = [AO3], LDA + (p16) LDFD f52 = [AO5], LDA + (p16) LDFD f54 = [AO7], LDA + ;; + (p16) LDFD f49 = [AO2], LDA + (p16) LDFD f51 = [AO4], LDA + (p16) LDFD f53 = [AO6], LDA + (p16) LDFD f55 = [AO8], LDA + ;; + (p16) LDFD f56 = [AO1], LDA + (p16) LDFD f58 = [AO3], LDA + (p16) LDFD f60 = [AO5], LDA + (p16) LDFD f62 = [AO7], LDA + ;; + (p16) LDFD f57 = [AO2], LDA + (p16) LDFD f59 = [AO4], LDA + (p16) LDFD f61 = [AO6], LDA + (p16) LDFD f63 = [AO8], LDA + ;; + (p16) LDFD f64 = [AO1], LDA + (p16) LDFD f66 = [AO3], LDA + (p16) LDFD f68 = [AO5], LDA + (p16) LDFD f70 = [AO7], LDA + ;; + (p16) LDFD f65 = [AO2], LDA + (p16) LDFD f67 = [AO4], LDA + (p16) LDFD f69 = [AO6], LDA + (p16) LDFD f71 = [AO8], LDA + ;; + (p16) LDFD f72 = [AO1], LDA + (p16) LDFD f74 = [AO3], LDA + (p16) LDFD f76 = [AO5], LDA + (p16) LDFD f78 = [AO7], LDA + ;; + (p16) LDFD f73 = [AO2], LDA + (p16) LDFD f75 = [AO4], LDA + (p16) LDFD f77 = [AO6], LDA + (p16) LDFD f79 = [AO8], LDA + ;; + (p16) LDFD f80 = [AO1], LDA + (p16) LDFD f82 = [AO3], LDA + (p16) LDFD f84 = [AO5], LDA + (p16) LDFD f86 = [AO7], LDA + ;; + (p16) LDFD f81 = [AO2], LDA + (p16) LDFD f83 = [AO4], LDA + (p16) LDFD f85 = [AO6], LDA + (p16) LDFD f87 = [AO8], LDA + ;; + (p16) LDFD f88 = [AO1], LDA7M8 + (p16) LDFD f90 = [AO3], LDA7M8 + (p16) LDFD f92 = [AO5], LDA7M8 + (p16) LDFD f94 = [AO7], LDA7M8 + ;; + (p16) LDFD f89 = [AO2], LDA7M8 + (p16) LDFD f91 = [AO4], LDA7M8 + (p16) LDFD f93 = [AO6], LDA7M8 + (p16) LDFD f95 = [AO8], LDA7M8 + ;; + (p16) LDFD f96 = [X1], 1 * SIZE + (p16) LDFD f98 = [X2], 1 * SIZE + ;; + (p16) LDFD f97 = [X1], 3 * SIZE + (p16) LDFD f99 = [X2], 3 * SIZE + ;; + (p16) LDFD f100 = [X1], 1 * SIZE + (p16) LDFD f102 = [X2], 1 * SIZE + ;; + (p16) LDFD f101 = [X1], 3 * SIZE + (p16) LDFD f103 = [X2], 3 * SIZE + ;; + (p16) FMA f8 = f96, f32, f8 + (p16) FMA f9 = f96, f40, f9 + (p16) FMA f10 = f96, f48, f10 + (p16) FMA f11 = f96, f56, f11 + (p16) FMA f12 = f96, f64, f12 + (p16) FMA f13 = f96, f72, f13 + (p16) FMA f14 = f96, f80, f14 + (p16) FMA f15 = f96, f88, f15 + ;; + (p16) FMA f8 = f97, f33, f8 + (p16) FMA f9 = f97, f41, f9 + (p16) FMA f10 = f97, f49, f10 + (p16) FMA f11 = f97, f57, f11 + (p16) FMA f12 = f97, f65, f12 + (p16) FMA f13 = f97, f73, f13 + (p16) FMA f14 = f97, f81, f14 + (p16) FMA f15 = f97, f89, f15 + ;; + (p16) FMA f8 = f98, f34, f8 + (p16) FMA f9 = f98, f42, f9 + (p16) FMA f10 = f98, f50, f10 + (p16) FMA f11 = f98, f58, f11 + (p16) FMA f12 = f98, f66, f12 + (p16) FMA f13 = f98, f74, f13 + (p16) FMA f14 = f98, f82, f14 + (p16) FMA f15 = f98, f90, f15 + ;; + (p16) FMA f8 = f99, f35, f8 + (p16) FMA f9 = f99, f43, f9 + (p16) FMA f10 = f99, f51, f10 + (p16) FMA f11 = f99, f59, f11 + (p16) FMA f12 = f99, f67, f12 + (p16) FMA f13 = f99, f75, f13 + (p16) FMA f14 = f99, f83, f14 + (p16) FMA f15 = f99, f91, f15 + ;; + (p16) FMA f8 = f100, f36, f8 + (p16) FMA f9 = f100, f44, f9 + (p16) FMA f10 = f100, f52, f10 + (p16) FMA f11 = f100, f60, f11 + (p16) FMA f12 = f100, f68, f12 + (p16) FMA f13 = f100, f76, f13 + (p16) FMA f14 = f100, f84, f14 + (p16) FMA f15 = f100, f92, f15 + ;; + (p16) FMA f8 = f101, f37, f8 + (p16) FMA f9 = f101, f45, f9 + (p16) FMA f10 = f101, f53, f10 + (p16) FMA f11 = f101, f61, f11 + (p16) FMA f12 = f101, f69, f12 + (p16) FMA f13 = f101, f77, f13 + (p16) FMA f14 = f101, f85, f14 + (p16) FMA f15 = f101, f93, f15 + ;; + (p16) FMA f8 = f102, f38, f8 + (p16) FMA f9 = f102, f46, f9 + (p16) FMA f10 = f102, f54, f10 + (p16) FMA f11 = f102, f62, f11 + (p16) FMA f12 = f102, f70, f12 + (p16) FMA f13 = f102, f78, f13 + (p16) FMA f14 = f102, f86, f14 + (p16) FMA f15 = f102, f94, f15 + ;; + (p16) FMA f8 = f103, f39, f8 + (p16) FMA f9 = f103, f47, f9 + (p16) FMA f10 = f103, f55, f10 + (p16) FMA f11 = f103, f63, f11 + (p16) FMA f12 = f103, f71, f12 + (p16) FMA f13 = f103, f79, f13 + (p16) FMA f14 = f103, f87, f14 + (p16) FMA f15 = f103, f95, f15 + br.ctop.sptk.few .L12 + ;; + .align 16 + +.L15: + tbit.nz p13, p11 = M, 2 + tbit.nz p14, p12 = M, 1 + ;; + { .mmi + (p11) adds AO5 = - 4 * SIZE, AO5 + } + { .mbb + (p11) adds AO7 = - 4 * SIZE, AO7 + } + ;; + { .mmi + (p13) LDFD f32 = [AO1], LDA + (p13) LDFD f34 = [AO3], LDA + tbit.nz p15, p0 = M, 0 + } + { .mmi + (p14) LDFD f36 = [AO5], LDA + (p11) adds AO6 = - 4 * SIZE, AO6 + (p12) adds AO7 = - 2 * SIZE, AO7 + } + ;; + (p13) LDFD f33 = [AO2], LDA + (p13) LDFD f35 = [AO4], LDA + (p14) LDFD f37 = [AO6], LDA + (p15) LDFD f38 = [AO7], LDA + ;; + (p13) LDFD f40 = [AO1], LDA + (p13) LDFD f42 = [AO3], LDA + (p14) LDFD f44 = [AO5], LDA + (p15) LDFD f46 = [AO7], LDA + ;; + (p13) LDFD f41 = [AO2], LDA + (p13) LDFD f43 = [AO4], LDA + (p14) LDFD f45 = [AO6], LDA + ;; + (p13) LDFD f48 = [AO1], LDA + (p13) LDFD f50 = [AO3], LDA + (p14) LDFD f52 = [AO5], LDA + (p15) LDFD f54 = [AO7], LDA + ;; + (p13) LDFD f49 = [AO2], LDA + (p13) LDFD f51 = [AO4], LDA + (p14) LDFD f53 = [AO6], LDA + ;; + (p13) LDFD f56 = [AO1], LDA + (p13) LDFD f58 = [AO3], LDA + (p14) LDFD f60 = [AO5], LDA + (p15) LDFD f62 = [AO7], LDA + ;; + (p13) LDFD f57 = [AO2], LDA + (p13) LDFD f59 = [AO4], LDA + (p14) LDFD f61 = [AO6], LDA + ;; + (p13) LDFD f64 = [AO1], LDA + (p13) LDFD f66 = [AO3], LDA + (p14) LDFD f68 = [AO5], LDA + (p15) LDFD f70 = [AO7], LDA + ;; + (p13) LDFD f65 = [AO2], LDA + (p13) LDFD f67 = [AO4], LDA + (p14) LDFD f69 = [AO6], LDA + ;; + (p13) LDFD f72 = [AO1], LDA + (p13) LDFD f74 = [AO3], LDA + (p14) LDFD f76 = [AO5], LDA + (p15) LDFD f78 = [AO7], LDA + ;; + (p13) LDFD f73 = [AO2], LDA + (p13) LDFD f75 = [AO4], LDA + (p14) LDFD f77 = [AO6], LDA + ;; + (p13) LDFD f80 = [AO1], LDA + (p13) LDFD f82 = [AO3], LDA + (p14) LDFD f84 = [AO5], LDA + (p15) LDFD f86 = [AO7], LDA + ;; + (p13) LDFD f81 = [AO2], LDA + (p13) LDFD f83 = [AO4], LDA + (p14) LDFD f85 = [AO6], LDA + ;; + (p13) LDFD f88 = [AO1] + (p13) LDFD f90 = [AO3] + (p14) LDFD f92 = [AO5] + (p15) LDFD f94 = [AO7] + ;; + (p13) LDFD f89 = [AO2] + (p13) LDFD f91 = [AO4] + (p14) LDFD f93 = [AO6] + ;; + (p13) LDFD f96 = [X1], 1 * SIZE + (p13) LDFD f98 = [X2], 1 * SIZE + ;; + (p13) LDFD f97 = [X1], 3 * SIZE + (p13) LDFD f99 = [X2], 3 * SIZE + ;; + (p14) LDFD f100 = [X1], 1 * SIZE + ;; + (p14) LDFD f101 = [X1], 1 * SIZE + ;; + (p15) LDFD f102 = [X1], 1 * SIZE + ;; + (p13) FMA f8 = f96, f32, f8 + (p13) FMA f9 = f96, f40, f9 + (p13) FMA f10 = f96, f48, f10 + (p13) FMA f11 = f96, f56, f11 + (p13) FMA f12 = f96, f64, f12 + (p13) FMA f13 = f96, f72, f13 + (p13) FMA f14 = f96, f80, f14 + (p13) FMA f15 = f96, f88, f15 + ;; + (p13) FMA f8 = f97, f33, f8 + (p13) FMA f9 = f97, f41, f9 + (p13) FMA f10 = f97, f49, f10 + (p13) FMA f11 = f97, f57, f11 + (p13) FMA f12 = f97, f65, f12 + (p13) FMA f13 = f97, f73, f13 + (p13) FMA f14 = f97, f81, f14 + (p13) FMA f15 = f97, f89, f15 + ;; + (p13) FMA f8 = f98, f34, f8 + (p13) FMA f9 = f98, f42, f9 + (p13) FMA f10 = f98, f50, f10 + (p13) FMA f11 = f98, f58, f11 + (p13) FMA f12 = f98, f66, f12 + (p13) FMA f13 = f98, f74, f13 + (p13) FMA f14 = f98, f82, f14 + (p13) FMA f15 = f98, f90, f15 + ;; + (p13) FMA f8 = f99, f35, f8 + (p13) FMA f9 = f99, f43, f9 + (p13) FMA f10 = f99, f51, f10 + (p13) FMA f11 = f99, f59, f11 + (p13) FMA f12 = f99, f67, f12 + (p13) FMA f13 = f99, f75, f13 + (p13) FMA f14 = f99, f83, f14 + (p13) FMA f15 = f99, f91, f15 + ;; + (p14) FMA f8 = f100, f36, f8 + (p14) FMA f9 = f100, f44, f9 + (p14) FMA f10 = f100, f52, f10 + (p14) FMA f11 = f100, f60, f11 + (p14) FMA f12 = f100, f68, f12 + (p14) FMA f13 = f100, f76, f13 + (p14) FMA f14 = f100, f84, f14 + (p14) FMA f15 = f100, f92, f15 + ;; + (p14) FMA f8 = f101, f37, f8 + (p14) FMA f9 = f101, f45, f9 + (p14) FMA f10 = f101, f53, f10 + (p14) FMA f11 = f101, f61, f11 + (p14) FMA f12 = f101, f69, f12 + (p14) FMA f13 = f101, f77, f13 + (p14) FMA f14 = f101, f85, f14 + (p14) FMA f15 = f101, f93, f15 + ;; + (p15) FMA f8 = f102, f38, f8 + (p15) FMA f9 = f102, f46, f9 + (p15) FMA f10 = f102, f54, f10 + (p15) FMA f11 = f102, f62, f11 + (p15) FMA f12 = f102, f70, f12 + (p15) FMA f13 = f102, f78, f13 + (p15) FMA f14 = f102, f86, f14 + (p15) FMA f15 = f102, f94, f15 + ;; + LDFD f32 = [Y1], INCY + ;; + LDFD f33 = [Y1], INCY + ;; + LDFD f34 = [Y1], INCY + ;; + LDFD f35 = [Y1], INCY5 + ;; + LDFD f36 = [Y2], INCY + ;; + LDFD f37 = [Y2], INCY + ;; + LDFD f38 = [Y2], INCY + ;; + LDFD f39 = [Y2], INCY5 + ;; + FMA f32 = ALPHA, f8, f32 + FMA f33 = ALPHA, f9, f33 + FMA f34 = ALPHA, f10, f34 + FMA f35 = ALPHA, f11, f35 + FMA f36 = ALPHA, f12, f36 + FMA f37 = ALPHA, f13, f37 + FMA f38 = ALPHA, f14, f38 + FMA f39 = ALPHA, f15, f39 + ;; + STFD [YY1] = f32 + add YY1 = YY1, INCY + ;; + STFD [YY1] = f33 + add YY1 = YY1, INCY + ;; + STFD [YY1] = f34 + add YY1 = YY1, INCY + ;; + STFD [YY1] = f35 + add YY1 = YY1, INCY5 + ;; + STFD [YY2] = f36 + add YY2 = YY2, INCY + ;; + STFD [YY2] = f37 + add YY2 = YY2, INCY + ;; + STFD [YY2] = f38 + add YY2 = YY2, INCY + ;; + STFD [YY2] = f39 + add YY2 = YY2, INCY5 + ;; + adds J = -1, J + ;; + cmp.lt p6, p0 = 0, J + (p6) br.cond.dptk .L11 + ;; + .align 16 + +.L20: + tbit.z p6, p0 = N, 2 + ;; + (p6) br.cond.dpnt .L30 + ;; + mov AO1 = A + adds AO2 = 1 * SIZE, A + adds AO3 = 2 * SIZE, A + adds AO4 = 3 * SIZE, A + adds AO5 = 4 * SIZE, A + adds AO6 = 5 * SIZE, A + adds AO7 = 6 * SIZE, A + adds AO8 = 7 * SIZE, A + shladd A = LDA, 2, A + ;; + shladd LDA7M8 = LDA, 2, r0 + ;; + sub LDA7M8 = LDA, LDA7M8 + ;; + adds LDA7M8 = 8 * SIZE, LDA7M8 + ;; + mov f8 = f0 + mov f9 = f0 + mov f10 = f0 + mov f11 = f0 + mov f12 = f0 + mov f13 = f0 + mov f14 = f0 + mov f15 = f0 + + mov pr.rot= 0 + shr I = M, 3 + mov ar.ec = 2 + ;; + mov X1 = BUFFER + adds X2 = 2 * SIZE, BUFFER + ;; + cmp.eq p16, p0 = r0, r0 + ;; + adds I = -1, I + ;; + mov ar.lc = I + cmp.eq p6, p0 = -1, I + (p6) br.cond.dpnt .L25 + ;; + .align 16 + +.L22: + (p16) LDFD f32 = [AO1], LDA + (p16) LDFD f34 = [AO3], LDA + (p16) LDFD f36 = [AO5], LDA + (p16) LDFD f38 = [AO7], LDA + ;; + (p16) LDFD f33 = [AO2], LDA + (p16) LDFD f35 = [AO4], LDA + (p16) LDFD f37 = [AO6], LDA + (p16) LDFD f39 = [AO8], LDA + ;; + (p16) LDFD f40 = [AO1], LDA + (p16) LDFD f42 = [AO3], LDA + (p16) LDFD f44 = [AO5], LDA + (p16) LDFD f46 = [AO7], LDA + ;; + (p16) LDFD f41 = [AO2], LDA + (p16) LDFD f43 = [AO4], LDA + (p16) LDFD f45 = [AO6], LDA + (p16) LDFD f47 = [AO8], LDA + ;; + (p16) LDFD f48 = [AO1], LDA + (p16) LDFD f50 = [AO3], LDA + (p16) LDFD f52 = [AO5], LDA + (p16) LDFD f54 = [AO7], LDA + ;; + (p16) LDFD f49 = [AO2], LDA + (p16) LDFD f51 = [AO4], LDA + (p16) LDFD f53 = [AO6], LDA + (p16) LDFD f55 = [AO8], LDA + ;; + (p16) LDFD f56 = [AO1], LDA7M8 + (p16) LDFD f58 = [AO3], LDA7M8 + (p16) LDFD f60 = [AO5], LDA7M8 + (p16) LDFD f62 = [AO7], LDA7M8 + ;; + (p16) LDFD f57 = [AO2], LDA7M8 + (p16) LDFD f59 = [AO4], LDA7M8 + (p16) LDFD f61 = [AO6], LDA7M8 + (p16) LDFD f63 = [AO8], LDA7M8 + ;; + (p16) LDFD f96 = [X1], 1 * SIZE + (p16) LDFD f98 = [X2], 1 * SIZE + ;; + (p16) LDFD f97 = [X1], 3 * SIZE + (p16) LDFD f99 = [X2], 3 * SIZE + ;; + (p16) LDFD f100 = [X1], 1 * SIZE + (p16) LDFD f102 = [X2], 1 * SIZE + ;; + (p16) LDFD f101 = [X1], 3 * SIZE + (p16) LDFD f103 = [X2], 3 * SIZE + ;; + (p16) FMA f8 = f96, f32, f8 + (p16) FMA f9 = f96, f40, f9 + (p16) FMA f10 = f96, f48, f10 + (p16) FMA f11 = f96, f56, f11 + ;; + (p16) FMA f8 = f97, f33, f8 + (p16) FMA f9 = f97, f41, f9 + (p16) FMA f10 = f97, f49, f10 + (p16) FMA f11 = f97, f57, f11 + ;; + (p16) FMA f8 = f98, f34, f8 + (p16) FMA f9 = f98, f42, f9 + (p16) FMA f10 = f98, f50, f10 + (p16) FMA f11 = f98, f58, f11 + ;; + (p16) FMA f8 = f99, f35, f8 + (p16) FMA f9 = f99, f43, f9 + (p16) FMA f10 = f99, f51, f10 + (p16) FMA f11 = f99, f59, f11 + ;; + (p16) FMA f8 = f100, f36, f8 + (p16) FMA f9 = f100, f44, f9 + (p16) FMA f10 = f100, f52, f10 + (p16) FMA f11 = f100, f60, f11 + + ;; + (p16) FMA f8 = f101, f37, f8 + (p16) FMA f9 = f101, f45, f9 + (p16) FMA f10 = f101, f53, f10 + (p16) FMA f11 = f101, f61, f11 + ;; + (p16) FMA f8 = f102, f38, f8 + (p16) FMA f9 = f102, f46, f9 + (p16) FMA f10 = f102, f54, f10 + (p16) FMA f11 = f102, f62, f11 + ;; + (p16) FMA f8 = f103, f39, f8 + (p16) FMA f9 = f103, f47, f9 + (p16) FMA f10 = f103, f55, f10 + (p16) FMA f11 = f103, f63, f11 + br.ctop.sptk.few .L22 + ;; + .align 16 + +.L25: + tbit.nz p13, p11 = M, 2 + tbit.nz p14, p12 = M, 1 + ;; + { .mmi + (p11) adds AO5 = - 4 * SIZE, AO5 + } + { .mbb + (p11) adds AO7 = - 4 * SIZE, AO7 + } + ;; + { .mmi + (p13) LDFD f32 = [AO1], LDA + (p13) LDFD f34 = [AO3], LDA + tbit.nz p15, p0 = M, 0 + } + { .mmi + (p14) LDFD f36 = [AO5], LDA + (p11) adds AO6 = - 4 * SIZE, AO6 + (p12) adds AO7 = - 2 * SIZE, AO7 + } + ;; + (p13) LDFD f33 = [AO2], LDA + (p13) LDFD f35 = [AO4], LDA + (p14) LDFD f37 = [AO6], LDA + (p15) LDFD f38 = [AO7], LDA + ;; + (p13) LDFD f40 = [AO1], LDA + (p13) LDFD f42 = [AO3], LDA + (p14) LDFD f44 = [AO5], LDA + (p15) LDFD f46 = [AO7], LDA + ;; + (p13) LDFD f41 = [AO2], LDA + (p13) LDFD f43 = [AO4], LDA + (p14) LDFD f45 = [AO6], LDA + ;; + (p13) LDFD f48 = [AO1], LDA + (p13) LDFD f50 = [AO3], LDA + (p14) LDFD f52 = [AO5], LDA + (p15) LDFD f54 = [AO7], LDA + ;; + (p13) LDFD f49 = [AO2], LDA + (p13) LDFD f51 = [AO4], LDA + (p14) LDFD f53 = [AO6], LDA + ;; + (p13) LDFD f56 = [AO1] + (p13) LDFD f58 = [AO3] + (p14) LDFD f60 = [AO5] + (p15) LDFD f62 = [AO7] + ;; + (p13) LDFD f57 = [AO2] + (p13) LDFD f59 = [AO4] + (p14) LDFD f61 = [AO6] + ;; + (p13) LDFD f96 = [X1], 1 * SIZE + (p13) LDFD f98 = [X2], 1 * SIZE + ;; + (p13) LDFD f97 = [X1], 3 * SIZE + (p13) LDFD f99 = [X2], 3 * SIZE + ;; + (p14) LDFD f100 = [X1], 1 * SIZE + ;; + (p14) LDFD f101 = [X1], 1 * SIZE + ;; + (p15) LDFD f102 = [X1], 1 * SIZE + ;; + (p13) FMA f8 = f96, f32, f8 + (p13) FMA f9 = f96, f40, f9 + (p13) FMA f10 = f96, f48, f10 + (p13) FMA f11 = f96, f56, f11 + ;; + (p13) FMA f8 = f97, f33, f8 + (p13) FMA f9 = f97, f41, f9 + (p13) FMA f10 = f97, f49, f10 + (p13) FMA f11 = f97, f57, f11 + ;; + (p13) FMA f8 = f98, f34, f8 + (p13) FMA f9 = f98, f42, f9 + (p13) FMA f10 = f98, f50, f10 + (p13) FMA f11 = f98, f58, f11 + ;; + (p13) FMA f8 = f99, f35, f8 + (p13) FMA f9 = f99, f43, f9 + (p13) FMA f10 = f99, f51, f10 + (p13) FMA f11 = f99, f59, f11 + ;; + (p14) FMA f8 = f100, f36, f8 + (p14) FMA f9 = f100, f44, f9 + (p14) FMA f10 = f100, f52, f10 + (p14) FMA f11 = f100, f60, f11 + ;; + (p14) FMA f8 = f101, f37, f8 + (p14) FMA f9 = f101, f45, f9 + (p14) FMA f10 = f101, f53, f10 + (p14) FMA f11 = f101, f61, f11 + ;; + (p15) FMA f8 = f102, f38, f8 + (p15) FMA f9 = f102, f46, f9 + (p15) FMA f10 = f102, f54, f10 + (p15) FMA f11 = f102, f62, f11 + ;; + LDFD f32 = [Y1], INCY + ;; + LDFD f33 = [Y1], INCY + ;; + LDFD f34 = [Y1], INCY + ;; + LDFD f35 = [Y1], INCY + ;; + FMA f32 = ALPHA, f8, f32 + FMA f33 = ALPHA, f9, f33 + FMA f34 = ALPHA, f10, f34 + FMA f35 = ALPHA, f11, f35 + ;; + STFD [YY1] = f32 + add YY1 = YY1, INCY + ;; + STFD [YY1] = f33 + add YY1 = YY1, INCY + ;; + STFD [YY1] = f34 + add YY1 = YY1, INCY + ;; + STFD [YY1] = f35 + add YY1 = YY1, INCY + ;; + .align 16 + +.L30: + tbit.z p6, p0 = N, 1 + ;; + (p6) br.cond.dpnt .L40 + ;; + mov AO1 = A + adds AO2 = 1 * SIZE, A + adds AO3 = 2 * SIZE, A + adds AO4 = 3 * SIZE, A + adds AO5 = 4 * SIZE, A + adds AO6 = 5 * SIZE, A + adds AO7 = 6 * SIZE, A + adds AO8 = 7 * SIZE, A + shladd A = LDA, 1, A + ;; + shladd LDA7M8 = LDA, 1, r0 + ;; + sub LDA7M8 = LDA, LDA7M8 + ;; + adds LDA7M8 = 8 * SIZE, LDA7M8 + ;; + mov f8 = f0 + mov f9 = f0 + mov f10 = f0 + mov f11 = f0 + mov f12 = f0 + mov f13 = f0 + mov f14 = f0 + mov f15 = f0 + + mov pr.rot= 0 + shr I = M, 3 + mov ar.ec = 2 + ;; + mov X1 = BUFFER + adds X2 = 2 * SIZE, BUFFER + ;; + cmp.eq p16, p0 = r0, r0 + ;; + adds I = -1, I + ;; + mov ar.lc = I + cmp.eq p6, p0 = -1, I + (p6) br.cond.dpnt .L35 + ;; + .align 16 + +.L32: + (p16) LDFD f32 = [AO1], LDA + (p16) LDFD f34 = [AO3], LDA + (p16) LDFD f36 = [AO5], LDA + (p16) LDFD f38 = [AO7], LDA + ;; + (p16) LDFD f33 = [AO2], LDA + (p16) LDFD f35 = [AO4], LDA + (p16) LDFD f37 = [AO6], LDA + (p16) LDFD f39 = [AO8], LDA + ;; + (p16) LDFD f40 = [AO1], LDA7M8 + (p16) LDFD f42 = [AO3], LDA7M8 + (p16) LDFD f44 = [AO5], LDA7M8 + (p16) LDFD f46 = [AO7], LDA7M8 + ;; + (p16) LDFD f41 = [AO2], LDA7M8 + (p16) LDFD f43 = [AO4], LDA7M8 + (p16) LDFD f45 = [AO6], LDA7M8 + (p16) LDFD f47 = [AO8], LDA7M8 + ;; + (p16) LDFD f96 = [X1], 1 * SIZE + (p16) LDFD f98 = [X2], 1 * SIZE + ;; + (p16) LDFD f97 = [X1], 3 * SIZE + (p16) LDFD f99 = [X2], 3 * SIZE + ;; + (p16) LDFD f100 = [X1], 1 * SIZE + (p16) LDFD f102 = [X2], 1 * SIZE + ;; + (p16) LDFD f101 = [X1], 3 * SIZE + (p16) LDFD f103 = [X2], 3 * SIZE + ;; + (p16) FMA f8 = f96, f32, f8 + (p16) FMA f9 = f96, f40, f9 + ;; + (p16) FMA f8 = f97, f33, f8 + (p16) FMA f9 = f97, f41, f9 + ;; + (p16) FMA f8 = f98, f34, f8 + (p16) FMA f9 = f98, f42, f9 + ;; + (p16) FMA f8 = f99, f35, f8 + (p16) FMA f9 = f99, f43, f9 + ;; + (p16) FMA f8 = f100, f36, f8 + (p16) FMA f9 = f100, f44, f9 + ;; + (p16) FMA f8 = f101, f37, f8 + (p16) FMA f9 = f101, f45, f9 + ;; + (p16) FMA f8 = f102, f38, f8 + (p16) FMA f9 = f102, f46, f9 + ;; + (p16) FMA f8 = f103, f39, f8 + (p16) FMA f9 = f103, f47, f9 + br.ctop.sptk.few .L32 + ;; + .align 16 + +.L35: + tbit.nz p13, p11 = M, 2 + tbit.nz p14, p12 = M, 1 + ;; + { .mmi + (p11) adds AO5 = - 4 * SIZE, AO5 + } + { .mbb + (p11) adds AO7 = - 4 * SIZE, AO7 + } + ;; + { .mmi + (p13) LDFD f32 = [AO1], LDA + (p13) LDFD f34 = [AO3], LDA + tbit.nz p15, p0 = M, 0 + } + { .mmi + (p14) LDFD f36 = [AO5], LDA + (p11) adds AO6 = - 4 * SIZE, AO6 + (p12) adds AO7 = - 2 * SIZE, AO7 + } + ;; + (p13) LDFD f33 = [AO2], LDA + (p13) LDFD f35 = [AO4], LDA + (p14) LDFD f37 = [AO6], LDA + (p15) LDFD f38 = [AO7], LDA + ;; + (p13) LDFD f40 = [AO1] + (p13) LDFD f42 = [AO3] + (p14) LDFD f44 = [AO5] + (p15) LDFD f46 = [AO7] + ;; + (p13) LDFD f41 = [AO2] + (p13) LDFD f43 = [AO4] + (p14) LDFD f45 = [AO6] + ;; + (p13) LDFD f96 = [X1], 1 * SIZE + (p13) LDFD f98 = [X2], 1 * SIZE + ;; + (p13) LDFD f97 = [X1], 3 * SIZE + (p13) LDFD f99 = [X2], 3 * SIZE + ;; + (p14) LDFD f100 = [X1], 1 * SIZE + ;; + (p14) LDFD f101 = [X1], 1 * SIZE + ;; + (p15) LDFD f102 = [X1], 1 * SIZE + ;; + (p13) FMA f8 = f96, f32, f8 + (p13) FMA f9 = f96, f40, f9 + ;; + (p13) FMA f8 = f97, f33, f8 + (p13) FMA f9 = f97, f41, f9 + ;; + (p13) FMA f8 = f98, f34, f8 + (p13) FMA f9 = f98, f42, f9 + ;; + (p13) FMA f8 = f99, f35, f8 + (p13) FMA f9 = f99, f43, f9 + ;; + (p14) FMA f8 = f100, f36, f8 + (p14) FMA f9 = f100, f44, f9 + ;; + (p14) FMA f8 = f101, f37, f8 + (p14) FMA f9 = f101, f45, f9 + ;; + (p15) FMA f8 = f102, f38, f8 + (p15) FMA f9 = f102, f46, f9 + ;; + LDFD f32 = [Y1], INCY + ;; + LDFD f33 = [Y1], INCY + ;; + FMA f32 = ALPHA, f8, f32 + FMA f33 = ALPHA, f9, f33 + ;; + STFD [YY1] = f32 + add YY1 = YY1, INCY + ;; + STFD [YY1] = f33 + add YY1 = YY1, INCY + ;; + .align 16 + +.L40: + tbit.z p6, p0 = N, 0 + ;; + (p6) br.cond.dpnt .L999 + ;; + mov AO1 = A + adds AO2 = 1 * SIZE, A + adds AO3 = 2 * SIZE, A + adds AO4 = 3 * SIZE, A + adds AO5 = 4 * SIZE, A + adds AO6 = 5 * SIZE, A + adds AO7 = 6 * SIZE, A + adds AO8 = 7 * SIZE, A + add A = LDA, A + ;; + mov f8 = f0 + mov f9 = f0 + mov f10 = f0 + mov f11 = f0 + mov f12 = f0 + mov f13 = f0 + mov f14 = f0 + mov f15 = f0 + + mov pr.rot= 0 + shr I = M, 3 + mov ar.ec = 2 + ;; + mov X1 = BUFFER + adds X2 = 2 * SIZE, BUFFER + ;; + cmp.eq p16, p0 = r0, r0 + ;; + adds I = -1, I + ;; + mov ar.lc = I + cmp.eq p6, p0 = -1, I + (p6) br.cond.dpnt .L45 + ;; + .align 16 + +.L42: + (p16) LDFD f32 = [AO1], 8 * SIZE + (p16) LDFD f34 = [AO3], 8 * SIZE + (p16) LDFD f36 = [AO5], 8 * SIZE + (p16) LDFD f38 = [AO7], 8 * SIZE + ;; + (p16) LDFD f33 = [AO2], 8 * SIZE + (p16) LDFD f35 = [AO4], 8 * SIZE + (p16) LDFD f37 = [AO6], 8 * SIZE + (p16) LDFD f39 = [AO8], 8 * SIZE + ;; + (p16) LDFD f96 = [X1], 1 * SIZE + (p16) LDFD f98 = [X2], 1 * SIZE + ;; + (p16) LDFD f97 = [X1], 3 * SIZE + (p16) LDFD f99 = [X2], 3 * SIZE + ;; + (p16) LDFD f100 = [X1], 1 * SIZE + (p16) LDFD f102 = [X2], 1 * SIZE + ;; + (p16) LDFD f101 = [X1], 3 * SIZE + (p16) LDFD f103 = [X2], 3 * SIZE + ;; + (p16) FMA f8 = f96, f32, f8 + ;; + (p16) FMA f8 = f97, f33, f8 + ;; + (p16) FMA f8 = f98, f34, f8 + ;; + (p16) FMA f8 = f99, f35, f8 + ;; + (p16) FMA f8 = f100, f36, f8 + ;; + (p16) FMA f8 = f101, f37, f8 + ;; + (p16) FMA f8 = f102, f38, f8 + ;; + (p16) FMA f8 = f103, f39, f8 + br.ctop.sptk.few .L42 + ;; + .align 16 + +.L45: + tbit.nz p13, p11 = M, 2 + tbit.nz p14, p12 = M, 1 + ;; + { .mmi + (p11) adds AO5 = - 4 * SIZE, AO5 + } + { .mbb + (p11) adds AO7 = - 4 * SIZE, AO7 + } + ;; + { .mmi + (p13) LDFD f32 = [AO1] + (p13) LDFD f34 = [AO3] + tbit.nz p15, p0 = M, 0 + } + { .mmi + (p14) LDFD f36 = [AO5] + (p11) adds AO6 = - 4 * SIZE, AO6 + (p12) adds AO7 = - 2 * SIZE, AO7 + } + ;; + (p13) LDFD f33 = [AO2] + (p13) LDFD f35 = [AO4] + (p14) LDFD f37 = [AO6] + (p15) LDFD f38 = [AO7] + ;; + (p13) LDFD f96 = [X1], 1 * SIZE + (p13) LDFD f98 = [X2], 1 * SIZE + ;; + (p13) LDFD f97 = [X1], 3 * SIZE + (p13) LDFD f99 = [X2], 3 * SIZE + ;; + (p14) LDFD f100 = [X1], 1 * SIZE + ;; + (p14) LDFD f101 = [X1], 1 * SIZE + ;; + (p15) LDFD f102 = [X1], 1 * SIZE + ;; + (p13) FMA f8 = f96, f32, f8 + ;; + (p13) FMA f8 = f97, f33, f8 + ;; + (p13) FMA f8 = f98, f34, f8 + ;; + (p13) FMA f8 = f99, f35, f8 + ;; + (p14) FMA f8 = f100, f36, f8 + ;; + (p14) FMA f8 = f101, f37, f8 + ;; + (p15) FMA f8 = f102, f38, f8 + ;; + LDFD f32 = [Y1], INCY + ;; + FMA f32 = ALPHA, f8, f32 + ;; + STFD [YY1] = f32 + .align 16 + +.L999: + mov ar.lc = ARLC + mov pr = PR, -1 + br.ret.sptk.many b0 + ;; + EPILOGUE diff --git a/kernel/ia64/qscal.S b/kernel/ia64/qscal.S new file mode 100644 index 0000000..3f978af --- /dev/null +++ b/kernel/ia64/qscal.S @@ -0,0 +1,693 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCH_SIZE (16 * 16) + +#define ALPHA f8 + +#define N r32 +#define X1 r38 +#define INCX r39 + +#define X2 r14 +#define Y1 r15 +#define Y2 r16 +#define PRE1 r17 +#define I r18 +#define NAND15 r19 +#define INCX5 r20 +#define INCX8 r21 +#define XX r22 +#define PR r30 +#define ARLC r31 + + PROLOGUE + .prologue + PROFCODE + { .mfi + shladd INCX = INCX, BASE_SHIFT, r0 + fcmp.eq p0, p6 = ALPHA, f0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + .body + { .mib + cmp.ge p7, p0 = 0, N + (p7) br.ret.sptk.many b0 + } + ;; + { .mmi + mov XX = X1 + mov PR = pr + } + { .mmi + shladd INCX5 = INCX, 2, INCX + shladd INCX8 = INCX, 3, r0 + } + ;; + { .mmi + shladd X2 = INCX, 2, X1 + nop.m 0 + mov ar.ec = 5 + } + { .mmi + and NAND15 = 15, N + nop.m 0 + shr I = N, 4 + } + ;; + { .mmi + adds I = -1, I + nop.m 0 + tbit.z p0, p12 = N, 3 + } + { .mmb + cmp.ge p9, p0 = 0, NAND15 + nop.m 0 + (p6) br.cond.dptk .L100 // if (alpha != 0) goto L3 + } + ;; + { .mmi + adds PRE1 = (PREFETCH_SIZE + 4) * SIZE, X1 + mov ar.lc = I + } + { .mmb + cmp.gt p8, p0 = 0, I + (p8) br.cond.dpnt .L30 + } + ;; + .align 32 + +.L20: + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop.i 0 + } + {.mmi + lfetch.excl.nt1 [PRE1], INCX8 + add X1 = INCX, X1 + add X2 = INCX, X2 + } + ;; + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop.i 0 + } + {.mmi + add X1 = INCX, X1 + add X2 = INCX, X2 + nop.i 0 + } + ;; + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop.i 0 + } + {.mmi + add X1 = INCX, X1 + add X2 = INCX, X2 + nop.i 0 + } + ;; + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop.i 0 + } + {.mmi + add X1 = INCX5, X1 + add X2 = INCX5, X2 + nop.i 0 + } + ;; + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop.i 0 + } + {.mmi + lfetch.excl.nt1 [PRE1], INCX8 + add X1 = INCX, X1 + add X2 = INCX, X2 + } + ;; + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop.i 0 + } + {.mmi + add X1 = INCX, X1 + add X2 = INCX, X2 + nop.i 0 + } + ;; + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop.i 0 + } + {.mmi + add X1 = INCX, X1 + add X2 = INCX, X2 + nop.i 0 + } + ;; + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop.i 0 + } + {.mmb + add X1 = INCX5, X1 + add X2 = INCX5, X2 + br.cloop.sptk.few .L20 + } + ;; + .align 16 + +.L30: + { .mmi + (p12) STFD [X1] = f0 + (p12) STFD [X2] = f0 + mov ar.lc = ARLC + } + { .mmb + (p12) add X1 = INCX, X1 + (p12) add X2 = INCX, X2 + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) STFD [X1] = f0 + (p12) add X1 = INCX, X1 + tbit.z p0, p13 = N, 2 + } + { .mmi + (p12) STFD [X2] = f0 + (p12) add X2 = INCX, X2 + tbit.z p0, p14 = N, 1 + } + ;; + { .mmi + (p12) STFD [X1] = f0 + (p12) add X1 = INCX, X1 + tbit.z p0, p15 = N, 0 + } + { .mmb + (p12) STFD [X2] = f0 + (p12) add X2 = INCX, X2 + nop __LINE__ + } + ;; + { .mmb + (p12) STFD [X1] = f0 + (p12) add X1 = INCX5, X1 + nop __LINE__ + } + { .mmb + (p12) STFD [X2] = f0 + (p12) add X2 = INCX5, X2 + nop __LINE__ + } + ;; + { .mmb + (p13) STFD [X1] = f0 + (p13) add X1 = INCX, X1 + nop __LINE__ + } + ;; + { .mmb + (p13) STFD [X1] = f0 + (p13) add X1 = INCX, X1 + nop __LINE__ + } + ;; + { .mmb + (p13) STFD [X1] = f0 + (p13) add X1 = INCX, X1 + nop __LINE__ + } + ;; + { .mmb + (p13) STFD [X1] = f0 + (p13) add X1 = INCX, X1 + nop __LINE__ + } + ;; + { .mmb + (p14) STFD [X1] = f0 + (p14) add X1 = INCX, X1 + nop __LINE__ + } + ;; + { .mmb + (p14) STFD [X1] = f0 + (p14) add X1 = INCX, X1 + nop __LINE__ + } + ;; + { .mmb + (p15) STFD [X1] = f0 + nop.m 0 + br.ret.sptk.many b0 + } + ;; + .align 32 + +.L100: + { .mmi + mov Y1 = X1 + shladd Y2 = INCX, 2, X1 + mov pr.rot= 0 + } + ;; + { .mmi + mov ar.lc = I + } + cmp.eq p16, p0 = r0, r0 + ;; + + { .mmi + adds PRE1 = (PREFETCH_SIZE + 4) * SIZE, X1 + nop.m 0 + mov.i ar.ec = 6 + } + { .mmb + cmp.gt p8, p0 = 0, I + nop.m 0 + (p8) br.cond.dpnt .L320 + } + ;; + .align 32 + +.L310: + { .mmf + (p16) lfetch.excl.nt1 [PRE1], INCX8 + (p22) STFD [Y1] = f12 + (p21) FMPY f6 = ALPHA, f37 + } + { .mmi + (p16) LDFD f32 = [X1], INCX + nop __LINE__ + (p22) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p22) STFD [Y1] = f13 + (p16) LDFD f38 = [X1], INCX + (p21) FMPY f7 = ALPHA, f43 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p22) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p22) STFD [Y1] = f14 + (p16) LDFD f44 = [X1], INCX + (p21) FMPY f10 = ALPHA, f49 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p22) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p22) STFD [Y1] = f15 + (p16) LDFD f50 = [X1], INCX + (p21) FMPY f11 = ALPHA, f55 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p22) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p21) STFD [Y1] = f6 + (p16) LDFD f56 = [X1], INCX + (p21) FMPY f12 = ALPHA, f61 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p21) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p16) lfetch.excl.nt1 [PRE1], INCX8 + (p21) STFD [Y1] = f7 + (p21) FMPY f13 = ALPHA, f67 + } + { .mmi + (p16) LDFD f62 = [X1], INCX + nop __LINE__ + (p21) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p21) STFD [Y1] = f10 + (p16) LDFD f68 = [X1], INCX + (p21) FMPY f14 = ALPHA, f73 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p21) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p21) STFD [Y1] = f11 + (p16) LDFD f74 = [X1], INCX + (p21) FMPY f15 = ALPHA, f79 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p21) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p21) STFD [Y1] = f12 + (p16) LDFD f80 = [X1], INCX + (p21) FMPY f6 = ALPHA, f85 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p21) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p21) STFD [Y1] = f13 + (p16) LDFD f86 = [X1], INCX + (p21) FMPY f7 = ALPHA, f91 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p21) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p21) STFD [Y1] = f14 + (p16) LDFD f92 = [X1], INCX + (p21) FMPY f10 = ALPHA, f97 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p21) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p21) STFD [Y1] = f15 + (p16) LDFD f98 = [X1], INCX + (p21) FMPY f11 = ALPHA, f103 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p21) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p21) STFD [Y1] = f6 + (p16) LDFD f104 = [X1], INCX + (p21) FMPY f12 = ALPHA, f109 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p21) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p21) STFD [Y1] = f7 + (p16) LDFD f110 = [X1], INCX + (p21) FMPY f13 = ALPHA, f115 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p21) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p21) STFD [Y1] = f10 + (p16) LDFD f116 = [X1], INCX + (p21) FMPY f14 = ALPHA, f121 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p21) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p21) STFD [Y1] = f11 + (p16) LDFD f122 = [X1], INCX + (p21) FMPY f15 = ALPHA, f127 + } + { .mmb + nop __LINE__ + (p21) add Y1 = INCX, Y1 + br.ctop.sptk.few .L310 + } + ;; + { .mmi + STFD [Y1] = f12 + add Y1 = INCX, Y1 + shladd Y2 = INCX, 2, X1 + } + ;; + { .mmi + STFD [Y1] = f13 + add Y1 = INCX, Y1 + shladd X2 = INCX, 2, X1 + } + ;; + { .mmi + STFD [Y1] = f14 + nop __LINE__ + add Y1 = INCX, Y1 + } + ;; + { .mmi + STFD [Y1] = f15 + nop __LINE__ + add Y1 = INCX, Y1 + } + ;; + .align 16 + +.L320: + { .mmi + (p12) LDFD f48 = [X1], INCX + (p12) LDFD f52 = [X2], INCX + mov ar.lc = ARLC + } + ;; + { .mmi + (p12) LDFD f49 = [X1], INCX + (p12) LDFD f53 = [X2], INCX + mov pr = PR, -65474 + } + { .mmb + nop __LINE__ + nop __LINE__ + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFD f50 = [X1], INCX + (p12) LDFD f54 = [X2], INCX + tbit.z p0, p13 = N, 2 + } + ;; + { .mmi + (p12) LDFD f51 = [X1], INCX5 + (p12) LDFD f55 = [X2], INCX5 + tbit.z p0, p14 = N, 1 + } + ;; + (p13) LDFD f56 = [X1], INCX + tbit.z p0, p15 = N, 0 + ;; + (p13) LDFD f57 = [X1], INCX + ;; + { .mmf + (p13) LDFD f58 = [X1], INCX + nop __LINE__ + (p12) FMPY f48 = ALPHA, f48 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMPY f52 = ALPHA, f52 + } + ;; + { .mmf + (p13) LDFD f59 = [X1], INCX + nop __LINE__ + (p12) FMPY f49 = ALPHA, f49 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMPY f53 = ALPHA, f53 + } + ;; + { .mmf + (p14) LDFD f60 = [X1], INCX + nop __LINE__ + (p12) FMPY f50 = ALPHA, f50 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMPY f54 = ALPHA, f54 + } + ;; + { .mmf + (p14) LDFD f61 = [X1], INCX + nop __LINE__ + (p12) FMPY f51 = ALPHA, f51 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMPY f55 = ALPHA, f55 + } + ;; + { .mmf + (p12) STFD [Y1] = f48 + (p12) STFD [Y2] = f52 + (p13) FMPY f56 = ALPHA, f56 + } + { .mmi + (p15) LDFD f62 = [X1] + (p12) add Y1 = INCX, Y1 + (p12) add Y2 = INCX, Y2 + } + ;; + { .mmf + (p12) STFD [Y1] = f49 + (p12) STFD [Y2] = f53 + (p13) FMPY f57 = ALPHA, f57 + } + { .mmi + (p12) add Y1 = INCX, Y1 + (p12) add Y2 = INCX, Y2 + nop __LINE__ + } + ;; + { .mmf + (p12) STFD [Y1] = f50 + (p12) STFD [Y2] = f54 + (p13) FMPY f58 = ALPHA, f58 + } + { .mmi + (p12) add Y1 = INCX, Y1 + (p12) add Y2 = INCX, Y2 + nop __LINE__ + } + ;; + { .mmf + (p12) STFD [Y1] = f51 + (p12) STFD [Y2] = f55 + (p13) FMPY f59 = ALPHA, f59 + } + { .mmi + (p12) add Y1 = INCX5, Y1 + (p12) add Y2 = INCX5, Y2 + nop __LINE__ + } + ;; + { .mfi + (p13) STFD [Y1] = f56 + (p14) FMPY f60 = ALPHA, f60 + (p13) add Y1 = INCX, Y1 + } + ;; + { .mfi + (p13) STFD [Y1] = f57 + (p14) FMPY f61 = ALPHA, f61 + (p13) add Y1 = INCX, Y1 + } + ;; + { .mfi + (p13) STFD [Y1] = f58 + (p15) FMPY f62 = ALPHA, f62 + (p13) add Y1 = INCX, Y1 + } + ;; + { .mmi + (p13) STFD [Y1] = f59 + nop __LINE__ + (p13) add Y1 = INCX, Y1 + } + ;; + { .mmi + (p14) STFD [Y1] = f60 + nop __LINE__ + (p14) add Y1 = INCX, Y1 + } + ;; + { .mmi + (p14) STFD [Y1] = f61 + nop __LINE__ + (p14) add Y1 = INCX, Y1 + } + ;; + { .mib + (p15) STFD [Y1] = f62 + mov pr = PR, -65474 + br.ret.sptk.many b0 + } + EPILOGUE + diff --git a/kernel/ia64/rot.S b/kernel/ia64/rot.S new file mode 100644 index 0000000..8e349f6 --- /dev/null +++ b/kernel/ia64/rot.S @@ -0,0 +1,891 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef XDOUBLE +#define PREFETCH_SIZE ( 8 * 8 + 4) +#elif defined(DOUBLE) +#define PREFETCH_SIZE (16 * 8 + 8) +#else +#define PREFETCH_SIZE (32 * 8 + 16) +#endif + +#define N r32 +#define X1 r33 +#define INCX r34 +#define Y1 r35 +#define INCY r36 + +#define PREX r2 +#define PREY r3 + +#define I r14 +#define J r15 +#define Y2 r16 +#define X2 r17 + +#define INCX16 r18 +#define INCY16 r19 + +#define PR r30 +#define ARLC r31 + +#define C f8 +#define S f9 + + PROLOGUE + .prologue + PROFCODE + { .mmi + adds r29 = 16, r12 + shladd INCX = INCX, BASE_SHIFT, r0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mib + cmp.lt p0, p6 = r0, N + shr I = N, 4 + (p6) br.ret.spnt.many b0 + } + .body + ;; + { .mmi +#ifdef XDOUBLE + LDFD S = [r29] +#else + nop __LINE__ +#endif + shladd INCY = INCY, BASE_SHIFT, r0 + mov PR = pr + } + { .mmi + mov X2 = X1 + mov Y2 = Y1 + mov pr.rot= 0 + } + ;; + { .mmi +#ifndef XDOUBLE + shladd INCX16 = INCX, 4, r0 + shladd INCY16 = INCY, 4, r0 +#else + shladd INCX16 = INCX, 3, r0 + shladd INCY16 = INCY, 3, r0 +#endif + mov ar.ec= 3 + } + { .mmi + adds I = -1, I + cmp.eq p16, p0 = r0, r0 + and J = 15, N + } + ;; + { .mmi + adds PREX = PREFETCH_SIZE * SIZE, X1 + adds PREY = PREFETCH_SIZE * SIZE, Y1 + mov ar.lc = I + } + { .mib + cmp.eq p6 ,p0 = -1, I + tbit.z p0, p12 = N, 3 + (p6) br.cond.dpnt .L15 + } + ;; + .align 32 + +.L12: + { .mmf + (p18) STFD [X2] = f6 + (p16) lfetch.excl.nt1 [PREY], INCY16 + (p18) FMA f12 = C, f40, f12 + } + { .mmf + (p17) LDFD f120 = [Y1], INCY + (p18) add X2 = X2, INCX + (p18) FMPY f6 = S, f94 + } + ;; + { .mmf + (p18) STFD [Y2] = f7 + (p16) lfetch.excl.nt1 [PREX], INCX16 + (p18) FNMA f13 = S, f40, f13 + } + { .mmf + (p16) LDFD f32 = [X1], INCX + (p18) add Y2 = Y2, INCY + (p18) FMPY f7 = C, f94 + } + ;; + { .mmf + (p18) STFD [X2] = f10 + (p17) LDFD f123 = [Y1], INCY + (p18) FMA f14 = C, f43, f14 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMPY f10 = S, f97 + } + ;; + { .mmf + (p18) STFD [Y2] = f11 + (p16) LDFD f35 = [X1], INCX + (p18) FNMA f15 = S, f43, f15 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FMPY f11 = C, f97 + } + ;; + { .mmf + (p18) STFD [X2] = f12 + (p17) LDFD f126 = [Y1], INCY + (p18) FMPY f12 = S, f100 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMA f6 = C, f46, f6 + } + ;; + { .mmf + (p18) STFD [Y2] = f13 + (p16) LDFD f38 = [X1], INCX + (p18) FMPY f13 = C, f100 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FNMA f7 = S, f46, f7 + } + ;; + { .mmf + (p18) STFD [X2] = f14 + (p16) LDFD f80 = [Y1], INCY + (p18) FMPY f14 = S, f103 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMA f10 = C, f49, f10 + } + ;; + { .mmf + (p18) STFD [Y2] = f15 + (p16) LDFD f41 = [X1], INCX + (p18) FMPY f15 = C, f103 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FNMA f11 = S, f49, f11 + } + ;; + { .mmf + (p18) STFD [X2] = f6 + (p16) LDFD f83 = [Y1], INCY + (p18) FMA f12 = C, f52, f12 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMPY f6 = S, f106 + } + ;; + { .mmf + (p18) STFD [Y2] = f7 + (p16) LDFD f44 = [X1], INCX + (p18) FNMA f13 = S, f52, f13 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FMPY f7 = C, f106 + } + ;; + { .mmf + (p18) STFD [X2] = f10 + (p16) LDFD f86 = [Y1], INCY + (p18) FMA f14 = C, f55, f14 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMPY f10 = S, f109 + } + ;; + { .mmf + (p18) STFD [Y2] = f11 + (p16) LDFD f47 = [X1], INCX + (p18) FNMA f15 = S, f55, f15 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FMPY f11 = C, f109 + } + ;; + { .mmf + (p18) STFD [X2] = f12 + (p16) LDFD f89 = [Y1], INCY + (p18) FMPY f12 = S, f112 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMA f6 = C, f58, f6 + } + ;; + { .mmf + (p18) STFD [Y2] = f13 + (p16) LDFD f50 = [X1], INCX + (p18) FMPY f13 = C, f112 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FNMA f7 = S, f58, f7 + } + ;; + { .mmf + (p18) STFD [X2] = f14 + (p16) LDFD f92 = [Y1], INCY + (p18) FMPY f14 = S, f115 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMA f10 = C, f61, f10 + } + ;; + { .mmf + (p18) STFD [Y2] = f15 + (p16) LDFD f53 = [X1], INCX + (p18) FMPY f15 = C, f115 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FNMA f11 = S, f61, f11 + } + ;; +#ifndef XDOUBLE + { .mmf + (p18) STFD [X2] = f6 + (p16) LDFD f95 = [Y1], INCY + (p18) FMA f12 = C, f64, f12 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMPY f6 = S, f118 + } + ;; + { .mmf + (p18) STFD [Y2] = f7 + (p16) LDFD f56 = [X1], INCX + (p18) FNMA f13 = S, f64, f13 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FMPY f7 = C, f118 + } + ;; +#else + { .mmf + (p18) STFD [X2] = f6 + (p16) lfetch.excl.nt1 [PREY], INCY16 + (p18) FMA f12 = C, f64, f12 + } + { .mmf + (p16) LDFD f95 = [Y1], INCY + (p18) add X2 = X2, INCX + (p18) FMPY f6 = S, f118 + } + ;; + { .mmf + (p18) STFD [Y2] = f7 + (p16) lfetch.excl.nt1 [PREX], INCX16 + (p18) FNMA f13 = S, f64, f13 + } + { .mmf + (p16) LDFD f56 = [X1], INCX + (p18) add Y2 = Y2, INCY + (p18) FMPY f7 = C, f118 + } + ;; +#endif + { .mmf + (p18) STFD [X2] = f10 + (p16) LDFD f98 = [Y1], INCY + (p18) FMA f14 = C, f67, f14 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMPY f10 = S, f121 + } + ;; + { .mmf + (p18) STFD [Y2] = f11 + (p16) LDFD f59 = [X1], INCX + (p18) FNMA f15 = S, f67, f15 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FMPY f11 = C, f121 + } + ;; + { .mmf + (p18) STFD [X2] = f12 + (p16) LDFD f101 = [Y1], INCY + (p18) FMPY f12 = S, f124 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMA f6 = C, f70, f6 + } + ;; + { .mmf + (p18) STFD [Y2] = f13 + (p16) LDFD f62 = [X1], INCX + (p18) FMPY f13 = C, f124 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FNMA f7 = S, f70, f7 + } + ;; + { .mmf + (p18) STFD [X2] = f14 + (p16) LDFD f104 = [Y1], INCY + (p18) FMPY f14 = S, f127 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMA f10 = C, f73, f10 + } + ;; + { .mmf + (p18) STFD [Y2] = f15 + (p16) LDFD f65 = [X1], INCX + (p18) FMPY f15 = C, f127 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FNMA f11 = S, f73, f11 + } + ;; + { .mmf + (p18) STFD [X2] = f6 + (p16) LDFD f107 = [Y1], INCY + (p18) FMA f12 = C, f76, f12 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p17) FMPY f6 = S, f81 + } + ;; + { .mmf + (p18) STFD [Y2] = f7 + (p16) LDFD f68 = [X1], INCX + (p18) FNMA f13 = S, f76, f13 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p17) FMPY f7 = C, f81 + } + ;; + { .mmf + (p18) STFD [X2] = f10 + (p16) LDFD f110 = [Y1], INCY + (p18) FMA f14 = C, f79, f14 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p17) FMPY f10 = S, f84 + } + ;; + { .mmf + (p18) STFD [Y2] = f11 + (p16) LDFD f71 = [X1], INCX + (p18) FNMA f15 = S, f79, f15 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p17) FMPY f11 = C, f84 + } + ;; + { .mmf + (p18) STFD [X2] = f12 + (p16) LDFD f113 = [Y1], INCY + (p17) FMPY f12 = S, f87 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p17) FMA f6 = C, f33, f6 + } + ;; + { .mmf + (p18) STFD [Y2] = f13 + (p16) LDFD f74 = [X1], INCX + (p17) FMPY f13 = C, f87 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p17) FNMA f7 = S, f33, f7 + } + ;; + { .mmf + (p18) STFD [X2] = f14 + (p16) LDFD f116 = [Y1], INCY + (p17) FMPY f14 = S, f90 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p17) FMA f10 = C, f36, f10 + } + ;; + { .mmf + (p18) STFD [Y2] = f15 + (p16) LDFD f77 = [X1], INCX + (p17) FMPY f15 = C, f90 + } + { .mfb + (p18) add Y2 = Y2, INCY + (p17) FNMA f11 = S, f36, f11 + br.ctop.sptk.few .L12 + } + ;; + .align 32 + +.L15: + { .mmi + (p12) LDFD f40 = [Y1], INCY + (p12) LDFD f32 = [X1], INCX + mov ar.lc = ARLC + } + ;; + { .mmi + (p12) LDFD f41 = [Y1], INCY + (p12) LDFD f33 = [X1], INCX + mov pr = PR, -65474 + } + ;; + { .mmb + (p12) LDFD f42 = [Y1], INCY + cmp.eq p7, p0 = r0, J + (p7) br.ret.sptk.many b0 + } + ;; + { .mmf + (p12) LDFD f43 = [Y1], INCY + nop __LINE__ + (p12) FMPY f6 = S, f40 + } + ;; + { .mmf + (p12) LDFD f34 = [X1], INCX + nop __LINE__ + (p12) FMPY f7 = C, f40 + } + ;; + { .mmf + (p12) LDFD f44 = [Y1], INCY + nop __LINE__ + (p12) FMPY f10 = S, f41 + } + ;; + { .mmf + (p12) LDFD f35 = [X1], INCX + nop __LINE__ + (p12) FMPY f11 = C, f41 + } + ;; + { .mmf + (p12) LDFD f45 = [Y1], INCY + nop __LINE__ + (p12) FMPY f12 = S, f42 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMA f6 = C, f32, f6 + } + ;; + { .mmf + (p12) LDFD f36 = [X1], INCX + nop __LINE__ + (p12) FMPY f13 = C, f42 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FNMA f7 = S, f32, f7 + } + ;; + { .mmf + (p12) LDFD f46 = [Y1], INCY + nop __LINE__ + (p12) FMPY f14 = S, f43 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMA f10 = C, f33, f10 + } + ;; + { .mmf + (p12) LDFD f37 = [X1], INCX + nop __LINE__ + (p12) FMPY f15 = C, f43 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FNMA f11 = S, f33, f11 + } + ;; + { .mmf + (p12) STFD [X2] = f6 + (p12) LDFD f47 = [Y1], INCY + (p12) FMA f12 = C, f34, f12 + } + { .mfi + (p12) add X2 = X2, INCX + (p12) FMPY f6 = S, f44 + tbit.z p0, p13 = N, 2 + } + ;; + { .mmf + (p12) STFD [Y2] = f7 + (p12) LDFD f38 = [X1], INCX + (p12) FNMA f13 = S, f34, f13 + } + { .mmf + (p12) add Y2 = Y2, INCY + nop __LINE__ + (p12) FMPY f7 = C, f44 + } + ;; + { .mmf + (p12) STFD [X2] = f10 + (p13) LDFD f52 = [Y1], INCY + (p12) FMA f14 = C, f35, f14 + } + { .mmf + (p12) add X2 = X2, INCX + nop __LINE__ + (p12) FMPY f10 = S, f45 + } + ;; + { .mmf + (p12) STFD [Y2] = f11 + (p12) LDFD f39 = [X1], INCX + (p12) FNMA f15 = S, f35, f15 + } + { .mmf + (p12) add Y2 = Y2, INCY + nop __LINE__ + (p12) FMPY f11 = C, f45 + } + ;; + { .mmf + (p12) STFD [X2] = f12 + (p13) LDFD f53 = [Y1], INCY + (p12) FMPY f12 = S, f46 + } + { .mmf + (p12) add X2 = X2, INCX + nop __LINE__ + (p12) FMA f6 = C, f36, f6 + } + ;; + { .mmf + (p12) STFD [Y2] = f13 + (p13) LDFD f48 = [X1], INCX + (p12) FMPY f13 = C, f46 + } + { .mmf + (p12) add Y2 = Y2, INCY + nop __LINE__ + (p12) FNMA f7 = S, f36, f7 + } + ;; + { .mmf + (p12) STFD [X2] = f14 + (p13) LDFD f54 = [Y1], INCY + (p12) FMPY f14 = S, f47 + } + { .mmf + (p12) add X2 = X2, INCX + nop __LINE__ + (p12) FMA f10 = C, f37, f10 + } + ;; + { .mmf + (p12) STFD [Y2] = f15 + (p13) LDFD f49 = [X1], INCX + (p12) FMPY f15 = C, f47 + } + { .mfi + (p12) add Y2 = Y2, INCY + (p12) FNMA f11 = S, f37, f11 + tbit.z p0, p14 = N, 1 + } + ;; + { .mmf + (p12) STFD [X2] = f6 + (p13) LDFD f55 = [Y1], INCY + (p12) FMA f12 = C, f38, f12 + } + { .mmf + (p12) add X2 = X2, INCX + nop __LINE__ + (p13) FMPY f6 = S, f52 + } + ;; + { .mmf + (p12) STFD [Y2] = f7 + (p13) LDFD f50 = [X1], INCX + (p12) FNMA f13 = S, f38, f13 + } + { .mmf + (p12) add Y2 = Y2, INCY + nop __LINE__ + (p13) FMPY f7 = C, f52 + } + ;; + { .mmf + (p12) STFD [X2] = f10 + (p14) LDFD f58 = [Y1], INCY + (p12) FMA f14 = C, f39, f14 + } + { .mmf + (p12) add X2 = X2, INCX + nop __LINE__ + (p13) FMPY f10 = S, f53 + } + ;; + { .mmf + (p12) STFD [Y2] = f11 + (p13) LDFD f51 = [X1], INCX + (p12) FNMA f15 = S, f39, f15 + } + { .mfi + (p12) add Y2 = Y2, INCY + (p13) FMPY f11 = C, f53 + tbit.z p0, p15 = N, 0 + } + ;; + { .mmf + (p12) STFD [X2] = f12 + (p14) LDFD f59 = [Y1], INCY + (p13) FMPY f12 = S, f54 + } + { .mmf + (p12) add X2 = X2, INCX + nop __LINE__ + (p13) FMA f6 = C, f48, f6 + } + ;; + { .mmf + (p12) STFD [Y2] = f13 + (p14) LDFD f56 = [X1], INCX + (p13) FMPY f13 = C, f54 + } + { .mmf + (p12) add Y2 = Y2, INCY + nop __LINE__ + (p13) FNMA f7 = S, f48, f7 + } + ;; + { .mmf + (p12) STFD [X2] = f14 + (p15) LDFD f61 = [Y1], INCY + (p13) FMPY f14 = S, f55 + } + { .mmf + (p12) add X2 = X2, INCX + nop __LINE__ + (p13) FMA f10 = C, f49, f10 + } + ;; + { .mmf + (p12) STFD [Y2] = f15 + (p14) LDFD f57 = [X1], INCX + (p13) FMPY f15 = C, f55 + } + { .mmf + (p12) add Y2 = Y2, INCY + nop __LINE__ + (p13) FNMA f11 = S, f49, f11 + } + ;; + { .mmf + (p13) STFD [X2] = f6 + nop __LINE__ + (p13) FMA f12 = C, f50, f12 + } + { .mmf + (p13) add X2 = X2, INCX + nop __LINE__ + (p14) FMPY f6 = S, f58 + } + ;; + { .mmf + (p13) STFD [Y2] = f7 + (p15) LDFD f60 = [X1], INCX + (p13) FNMA f13 = S, f50, f13 + } + { .mmf + (p13) add Y2 = Y2, INCY + nop __LINE__ + (p14) FMPY f7 = C, f58 + } + ;; + { .mmf + (p13) STFD [X2] = f10 + nop __LINE__ + (p13) FMA f14 = C, f51, f14 + } + { .mmf + (p13) add X2 = X2, INCX + nop __LINE__ + (p14) FMPY f10 = S, f59 + } + ;; + { .mmf + (p13) STFD [Y2] = f11 + nop __LINE__ + (p13) FNMA f15 = S, f51, f15 + } + { .mmf + (p13) add Y2 = Y2, INCY + nop __LINE__ + (p14) FMPY f11 = C, f59 + } + ;; + { .mmf + (p13) STFD [X2] = f12 + nop __LINE__ + (p14) FMA f6 = C, f56, f6 + } + { .mmf + (p13) add X2 = X2, INCX + nop __LINE__ + (p15) FMPY f12 = S, f61 + } + ;; + { .mmf + (p13) STFD [Y2] = f13 + nop __LINE__ + (p14) FNMA f7 = S, f56, f7 + } + { .mmf + (p13) add Y2 = Y2, INCY + nop __LINE__ + (p15) FMPY f13 = C, f61 + } + ;; + { .mmf + (p13) STFD [X2] = f14 + (p13) add X2 = X2, INCX + (p14) FMA f10 = C, f57, f10 + } + ;; + { .mmf + (p13) STFD [Y2] = f15 + (p13) add Y2 = Y2, INCY + (p14) FNMA f11 = S, f57, f11 + } + ;; + { .mmf + (p14) STFD [X2] = f6 + (p14) add X2 = X2, INCX + (p15) FMA f12 = C, f60, f12 + } + ;; + { .mmf + (p14) STFD [Y2] = f7 + (p14) add Y2 = Y2, INCY + (p15) FNMA f13 = S, f60, f13 + } + ;; + { .mmi + (p14) STFD [X2] = f10 + (p14) add X2 = X2, INCX + nop __LINE__ + } + ;; + { .mmi + (p14) STFD [Y2] = f11 + (p14) add Y2 = Y2, INCY + nop __LINE__ + } + ;; + { .mmi + (p15) STFD [X2] = f12 + (p15) add X2 = X2, INCX + nop __LINE__ + } + ;; + { .mmb + (p15) STFD [Y2] = f13 + (p15) add Y2 = Y2, INCY + br.ret.sptk.many b0 + } + ;; + EPILOGUE + diff --git a/kernel/ia64/saxpy.S b/kernel/ia64/saxpy.S new file mode 100644 index 0000000..c3b2c1b --- /dev/null +++ b/kernel/ia64/saxpy.S @@ -0,0 +1,1667 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE 64 * 8 + +#define N r32 +#define X r36 +#define INCX r37 +#define Y r38 +#define INCY r39 + +#define PRE1 r2 +#define PRE2 r3 + +#define I r14 +#define J r15 +#define Y1 r16 +#define Y2 r17 +#define X1 r18 +#define X2 r19 +#define INCX16 r20 +#define INCY16 r21 +#define YYY r25 +#define YY r27 +#define XA r28 +#define XB r29 +#define PR r30 +#define ARLC r31 + +#define ALPHA f8 +#define ALPHA_P f9 + + PROLOGUE + .prologue + PROFCODE + + { .mii + shladd INCX = INCX, BASE_SHIFT, r0 + .save ar.lc, ARLC + mov ARLC = ar.lc + tbit.nz p10, p0 = X, BASE_SHIFT + } + { .mfb + cmp.lt p0, p6 = r0, N + fcmp.eq p7, p0 = ALPHA, f0 + (p6) br.ret.sptk.many b0 + } + ;; + .body + { .mmi + (p10) LDFD f32 = [X], INCX + shladd INCY = INCY, BASE_SHIFT, r0 + mov PR = pr + } + { .mib + (p10) adds N = -1, N + mov YYY = Y + (p7) br.ret.sptk.many b0 + } + ;; + { .mmi + (p10) LDFD f33 = [Y], INCY + cmp.ne p13, p0 = SIZE, INCX + shr XA = X, 2 + } + { .mmi + shladd INCX16 = INCX, 4, r0 + shladd INCY16 = INCY, 4, r0 + nop.i 0 + } + ;; + { .mii + mov Y1 = Y + tbit.nz p11, p0 = Y, BASE_SHIFT + shr XB = Y, 2 + } + ;; + { .mmf + and XA = 0x3f, XA + and XB = 0x3f, XB + (p10) FMA f32 = ALPHA, f32, f33 + } + ;; + { .mmi + sub XA = XB, XA + shladd Y2 = INCY, 2, Y + mov pr.rot = 0x10000 + } + { .mbb + cmp.ne p14, p0 = SIZE, INCY + (p13) br.cond.dpnt .L100 + (p14) br.cond.dpnt .L100 + } + ;; + { .mmi + cmp.gt p14, p0 = r0, XA + ;; + and J = 15, N + shr I = N, 4 + } + { .mfb + (p14) adds XA = 64, XA + fpack ALPHA_P = f8, f8 + (p11) br.cond.dpnt .L30 + } + ;; + { .mmi + cmp.gt p14, p0 = 32, XA + cmp.lt p15, p0 = 58, XA + mov ar.ec = 3 + } + { .mmi + and J = 31, N + cmp.eq p16, p0 = r0, r0 + shr I = N, 5 + } + ;; + { .mmi + cmp.eq p9, p0 = r0, J + cmp.eq p7 ,p0 = 0, I + adds I = -1, I + } + { .mbb + nop.m 0 + (p14) br.cond.dpnt .L20 + (p15) br.cond.dpnt .L20 + } + ;; + { .mmi + (p10) STFD [YYY] = f32 + adds PRE1 = PREFETCHSIZE * SIZE, X + mov ar.lc = I + } + { .mib + adds PRE2 = (PREFETCHSIZE - 24) * SIZE, Y + tbit.z p0, p11 = N, 4 + (p7) br.cond.dpnt .L15 + } + ;; + .align 32 + +.L12: +/* 0 */ + { .mmf + (p18) stf8 [Y1] = f6, 2 * SIZE + (p16) lfetch.nt1 [PRE1], 32 * SIZE + (p18) fpma f12 = ALPHA_P, f46, f94 + } + { .mmi + (p16) ldf8 f32 = [X], 2 * SIZE + (p16) ldf8 f80 = [Y], 2 * SIZE + } + ;; +/* 1 */ + { .mmf + (p18) stf8 [Y1] = f7, 2 * SIZE + (p16) lfetch.excl.nt1 [PRE2], 32 * SIZE + (p18) fpma f13 = ALPHA_P, f49, f97 + } + { .mmi + (p16) ldf8 f35 = [X], 2 * SIZE + (p16) ldf8 f83 = [Y], 2 * SIZE + } + ;; +/* 2 */ + { .mmf + (p18) stf8 [Y1] = f10, 2 * SIZE + (p18) fpma f14 = ALPHA_P, f52, f100 + } + { .mmi + (p16) ldf8 f38 = [X], 2 * SIZE + (p16) ldf8 f86 = [Y], 2 * SIZE + } + ;; +/* 3 */ + { .mmf + (p18) stf8 [Y1] = f11, 2 * SIZE + (p18) fpma f15 = ALPHA_P, f55, f103 + } + { .mmi + (p16) ldf8 f41 = [X], 2 * SIZE + (p16) ldf8 f89 = [Y], 2 * SIZE + } + ;; +/* 4 */ + { .mmf + (p18) stf8 [Y1] = f12, 2 * SIZE + (p18) fpma f6 = ALPHA_P, f58, f106 + } + { .mmi + (p16) ldf8 f44 = [X], 2 * SIZE + (p16) ldf8 f92 = [Y], 2 * SIZE + } + ;; +/* 5 */ + { .mmf + (p18) stf8 [Y1] = f13, 2 * SIZE + (p18) fpma f7 = ALPHA_P, f61, f109 + } + { .mmi + (p16) ldf8 f47 = [X], 2 * SIZE + (p16) ldf8 f95 = [Y], 2 * SIZE + } + ;; +/* 6 */ + { .mmf + (p18) stf8 [Y1] = f14, 2 * SIZE + (p18) fpma f10 = ALPHA_P, f64, f112 + } + { .mmi + (p16) ldf8 f50 = [X], 2 * SIZE + (p16) ldf8 f98 = [Y], 2 * SIZE + } + ;; +/* 7 */ + { .mmf + (p18) stf8 [Y1] = f15, 2 * SIZE + (p18) fpma f11 = ALPHA_P, f67, f115 + } + { .mmi + (p16) ldf8 f53 = [X], 2 * SIZE + (p16) ldf8 f101 = [Y], 2 * SIZE + } + ;; +/* 8 */ + { .mmf + (p18) stf8 [Y1] = f6, 2 * SIZE + (p18) fpma f12 = ALPHA_P, f70, f118 + } + { .mmi + (p16) ldf8 f56 = [X], 2 * SIZE + (p16) ldf8 f104 = [Y], 2 * SIZE + } + ;; +/* 9 */ + { .mmf + (p18) stf8 [Y1] = f7, 2 * SIZE + (p18) fpma f13 = ALPHA_P, f73, f121 + } + { .mmi + (p16) ldf8 f59 = [X], 2 * SIZE + (p16) ldf8 f107 = [Y], 2 * SIZE + } + ;; +/* 10 */ + { .mmf + (p18) stf8 [Y1] = f10, 2 * SIZE + (p18) fpma f14 = ALPHA_P, f76, f124 + } + { .mmi + (p16) ldf8 f62 = [X], 2 * SIZE + (p16) ldf8 f110 = [Y], 2 * SIZE + } + ;; +/* 11 */ + { .mmf + (p18) stf8 [Y1] = f11, 2 * SIZE + (p18) fpma f15 = ALPHA_P, f79, f127 + } + { .mmi + (p16) ldf8 f65 = [X], 2 * SIZE + (p16) ldf8 f113 = [Y], 2 * SIZE + } + ;; +/* 12 */ + { .mmf + (p18) stf8 [Y1] = f12, 2 * SIZE + (p17) fpma f6 = ALPHA_P, f33, f81 + } + { .mmi + (p16) ldf8 f68 = [X], 2 * SIZE + (p16) ldf8 f116 = [Y], 2 * SIZE + } + ;; +/* 13 */ + { .mmf + (p18) stf8 [Y1] = f13, 2 * SIZE + (p17) fpma f7 = ALPHA_P, f36, f84 + } + { .mmi + (p16) ldf8 f71 = [X], 2 * SIZE + (p16) ldf8 f119 = [Y], 2 * SIZE + } + ;; +/* 14 */ + { .mmf + (p18) stf8 [Y1] = f14, 2 * SIZE + (p17) fpma f10 = ALPHA_P, f39, f87 + } + { .mmi + (p16) ldf8 f74 = [X], 2 * SIZE + (p16) ldf8 f122 = [Y], 2 * SIZE + } + ;; +/*15 */ + { .mmf + (p18) stf8 [Y1] = f15, 2 * SIZE + (p17) fpma f11 = ALPHA_P, f42, f90 + } + { .mmb + (p16) ldf8 f77 = [X], 2 * SIZE + (p16) ldf8 f125 = [Y], 2 * SIZE + br.ctop.sptk.few .L12 + } + ;; + .align 32 + +.L15: + { .mmi + (p11) ldf8 f32 = [X], 2 * SIZE + (p11) ldf8 f33 = [Y], 2 * SIZE + mov pr = PR, -65474 + } + ;; + { .mmi + (p11) ldf8 f34 = [X], 2 * SIZE + (p11) ldf8 f35 = [Y], 2 * SIZE + mov ar.lc = ARLC + } + ;; + { .mmb + (p11) ldf8 f36 = [X], 2 * SIZE + (p11) ldf8 f37 = [Y], 2 * SIZE + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p11) ldf8 f38 = [X], 2 * SIZE + (p11) ldf8 f39 = [Y], 2 * SIZE + tbit.z p0, p12 = N, 3 + } + ;; + { .mmi + (p11) ldf8 f40 = [X], 2 * SIZE + (p11) ldf8 f41 = [Y], 2 * SIZE + tbit.z p0, p13 = N, 2 + } + ;; + { .mmi + (p11) ldf8 f42 = [X], 2 * SIZE + (p11) ldf8 f43 = [Y], 2 * SIZE + tbit.z p0, p14 = N, 1 + } + ;; + { .mmf + (p11) ldf8 f44 = [X], 2 * SIZE + (p11) ldf8 f45 = [Y], 2 * SIZE + (p11) fpma f6 = ALPHA_P, f32, f33 + } + ;; + { .mmf + (p11) ldf8 f46 = [X], 2 * SIZE + (p11) ldf8 f47 = [Y], 2 * SIZE + (p11) fpma f7 = ALPHA_P, f34, f35 + } + ;; + { .mmf + (p12) ldf8 f48 = [X], 2 * SIZE + (p12) ldf8 f49 = [Y], 2 * SIZE + (p11) fpma f10 = ALPHA_P, f36, f37 + } + ;; + { .mmi + (p11) stf8 [Y1] = f6, 2 * SIZE + nop.m 0 + tbit.z p0, p15 = N, 0 + } + { .mmf + (p12) ldf8 f50 = [X], 2 * SIZE + (p12) ldf8 f51 = [Y], 2 * SIZE + (p11) fpma f11 = ALPHA_P, f38, f39 + } + ;; + { .mmi + (p11) stf8 [Y1] = f7, 2 * SIZE + nop.m 0 + nop.i 0 + } + { .mmf + (p12) ldf8 f52 = [X], 2 * SIZE + (p12) ldf8 f53 = [Y], 2 * SIZE + } + ;; + { .mmi + (p11) stf8 [Y1] = f10, 2 * SIZE + nop.m 0 + nop.i 0 + } + { .mmf + (p12) ldf8 f54 = [X], 2 * SIZE + (p12) ldf8 f55 = [Y], 2 * SIZE + (p11) fpma f12 = ALPHA_P, f40, f41 + } + ;; + { .mmi + (p11) stf8 [Y1] = f11, 2 * SIZE + nop.m 0 + nop.i 0 + } + { .mmf + (p13) ldf8 f56 = [X], 2 * SIZE + (p13) ldf8 f57 = [Y], 2 * SIZE + (p11) fpma f13 = ALPHA_P, f42, f43 + } + ;; + { .mmi + (p11) stf8 [Y1] = f12, 2 * SIZE + nop.m 0 + nop.i 0 + } + { .mmf + (p13) ldf8 f58 = [X], 2 * SIZE + (p13) ldf8 f59 = [Y], 2 * SIZE + (p11) fpma f14 = ALPHA_P, f44, f45 + } + ;; + { .mmi + (p11) stf8 [Y1] = f13, 2 * SIZE + nop.m 0 + nop.i 0 + } + { .mmf + (p14) ldf8 f60 = [X], 2 * SIZE + (p14) ldf8 f61 = [Y], 2 * SIZE + (p11) fpma f15 = ALPHA_P, f46, f47 + } + ;; + { .mmi + (p11) stf8 [Y1] = f14, 2 * SIZE + nop.m 0 + nop.i 0 + } + { .mmf + (p15) ldfs f62 = [X] + (p15) ldfs f63 = [Y] + (p12) fpma f6 = ALPHA_P, f48, f49 + } + ;; + (p12) fpma f7 = ALPHA_P, f50, f51 + (p12) fpma f10 = ALPHA_P, f52, f53 + ;; + (p11) stf8 [Y1] = f15, 2 * SIZE + (p12) fpma f11 = ALPHA_P, f54, f55 + ;; + (p12) stf8 [Y1] = f6, 2 * SIZE + (p13) fpma f12 = ALPHA_P, f56, f57 + ;; + (p12) stf8 [Y1] = f7, 2 * SIZE + (p13) fpma f13 = ALPHA_P, f58, f59 + ;; + (p12) stf8 [Y1] = f10, 2 * SIZE + (p14) fpma f14 = ALPHA_P, f60, f61 + ;; + (p12) stf8 [Y1] = f11, 2 * SIZE + (p15) FMA f15 = ALPHA, f62, f63 + ;; + (p13) stf8 [Y1] = f12, 2 * SIZE + ;; + (p13) stf8 [Y1] = f13, 2 * SIZE + ;; + (p14) stf8 [Y1] = f14, 2 * SIZE + ;; + (p15) stfs [Y1] = f15 + br.ret.sptk.many b0 + ;; + .align 32 + +/* X is aligned; case 2 */ + +.L20: + { .mmi + (p10) STFD [YYY] = f32 + adds PRE1 = (PREFETCHSIZE - 28) * SIZE, X + mov ar.lc = I + } + { .mib + adds PRE2 = (PREFETCHSIZE + 4) * SIZE, Y + tbit.z p0, p11 = N, 4 + (p7) br.cond.dpnt .L25 + } + ;; + .align 32 + +.L22: +/* 0 */ + { .mmf + (p18) stf8 [Y1] = f6, 2 * SIZE + (p16) lfetch.nt1 [PRE1], 32 * SIZE + (p18) fpma f12 = ALPHA_P, f46, f94 + } + { .mmi + (p17) ldf8 f60 = [X], 2 * SIZE + (p16) ldf8 f80 = [Y], 2 * SIZE + } + ;; +/* 1 */ + { .mmf + (p18) stf8 [Y1] = f7, 2 * SIZE + (p16) lfetch.excl.nt1 [PRE2], 32 * SIZE + (p18) fpma f13 = ALPHA_P, f49, f97 + } + { .mmi + (p17) ldf8 f63 = [X], 2 * SIZE + (p16) ldf8 f83 = [Y], 2 * SIZE + } + ;; +/* 2 */ + { .mmf + (p18) stf8 [Y1] = f10, 2 * SIZE + (p18) fpma f14 = ALPHA_P, f52, f100 + } + { .mmi + (p17) ldf8 f66 = [X], 2 * SIZE + (p16) ldf8 f86 = [Y], 2 * SIZE + } + ;; +/* 3 */ + { .mmf + (p18) stf8 [Y1] = f11, 2 * SIZE + (p18) fpma f15 = ALPHA_P, f55, f103 + } + { .mmi + (p17) ldf8 f69 = [X], 2 * SIZE + (p16) ldf8 f89 = [Y], 2 * SIZE + } + ;; +/* 4 */ + { .mmf + (p18) stf8 [Y1] = f12, 2 * SIZE + (p18) fpma f6 = ALPHA_P, f58, f106 + } + { .mmi + (p17) ldf8 f72 = [X], 2 * SIZE + (p16) ldf8 f92 = [Y], 2 * SIZE + } + ;; +/* 5 */ + { .mmf + (p18) stf8 [Y1] = f13, 2 * SIZE + (p18) fpma f7 = ALPHA_P, f61, f109 + } + { .mmi + (p17) ldf8 f75 = [X], 2 * SIZE + (p16) ldf8 f95 = [Y], 2 * SIZE + } + ;; +/* 6 */ + { .mmf + (p18) stf8 [Y1] = f14, 2 * SIZE + (p18) fpma f10 = ALPHA_P, f64, f112 + } + { .mmi + (p17) ldf8 f78 = [X], 2 * SIZE + (p16) ldf8 f98 = [Y], 2 * SIZE + } + ;; +/* 7 */ + { .mmf + (p18) stf8 [Y1] = f15, 2 * SIZE + (p18) fpma f11 = ALPHA_P, f67, f115 + } + { .mmi + (p16) ldf8 f32 = [X], 2 * SIZE + (p16) ldf8 f101 = [Y], 2 * SIZE + } + ;; +/* 8 */ + { .mmf + (p18) stf8 [Y1] = f6, 2 * SIZE + (p18) fpma f12 = ALPHA_P, f70, f118 + } + { .mmi + (p16) ldf8 f35 = [X], 2 * SIZE + (p16) ldf8 f104 = [Y], 2 * SIZE + } + ;; +/* 9 */ + { .mmf + (p18) stf8 [Y1] = f7, 2 * SIZE + (p18) fpma f13 = ALPHA_P, f73, f121 + } + { .mmi + (p16) ldf8 f38 = [X], 2 * SIZE + (p16) ldf8 f107 = [Y], 2 * SIZE + } + ;; +/* 10 */ + { .mmf + (p18) stf8 [Y1] = f10, 2 * SIZE + (p18) fpma f14 = ALPHA_P, f76, f124 + } + { .mmi + (p16) ldf8 f41 = [X], 2 * SIZE + (p16) ldf8 f110 = [Y], 2 * SIZE + } + ;; +/* 11 */ + { .mmf + (p18) stf8 [Y1] = f11, 2 * SIZE + (p18) fpma f15 = ALPHA_P, f79, f127 + } + { .mmi + (p16) ldf8 f44 = [X], 2 * SIZE + (p16) ldf8 f113 = [Y], 2 * SIZE + } + ;; +/* 12 */ + { .mmf + (p18) stf8 [Y1] = f12, 2 * SIZE + (p17) fpma f6 = ALPHA_P, f33, f81 + } + { .mmi + (p16) ldf8 f47 = [X], 2 * SIZE + (p16) ldf8 f116 = [Y], 2 * SIZE + } + ;; +/* 13 */ + { .mmf + (p18) stf8 [Y1] = f13, 2 * SIZE + (p17) fpma f7 = ALPHA_P, f36, f84 + } + { .mmi + (p16) ldf8 f50 = [X], 2 * SIZE + (p16) ldf8 f119 = [Y], 2 * SIZE + } + ;; +/* 14 */ + { .mmf + (p18) stf8 [Y1] = f14, 2 * SIZE + (p17) fpma f10 = ALPHA_P, f39, f87 + } + { .mmi + (p16) ldf8 f53 = [X], 2 * SIZE + (p16) ldf8 f122 = [Y], 2 * SIZE + } + ;; +/*15 */ + { .mmf + (p18) stf8 [Y1] = f15, 2 * SIZE + (p17) fpma f11 = ALPHA_P, f42, f90 + } + { .mmb + (p16) ldf8 f56 = [X], 2 * SIZE + (p16) ldf8 f125 = [Y], 2 * SIZE + br.ctop.sptk.few .L22 + } + ;; + .align 32 + +.L25: + { .mmi + (p11) ldf8 f32 = [X], 2 * SIZE + (p11) ldf8 f33 = [Y], 2 * SIZE + mov pr = PR, -65474 + } + ;; + { .mmi + (p11) ldf8 f34 = [X], 2 * SIZE + (p11) ldf8 f35 = [Y], 2 * SIZE + mov ar.lc = ARLC + } + ;; + { .mmb + (p11) ldf8 f36 = [X], 2 * SIZE + (p11) ldf8 f37 = [Y], 2 * SIZE + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p11) ldf8 f38 = [X], 2 * SIZE + (p11) ldf8 f39 = [Y], 2 * SIZE + tbit.z p0, p12 = N, 3 + } + ;; + { .mmi + (p11) ldf8 f40 = [X], 2 * SIZE + (p11) ldf8 f41 = [Y], 2 * SIZE + tbit.z p0, p13 = N, 2 + } + ;; + { .mmi + (p11) ldf8 f42 = [X], 2 * SIZE + (p11) ldf8 f43 = [Y], 2 * SIZE + tbit.z p0, p14 = N, 1 + } + ;; + { .mmf + (p11) ldf8 f44 = [X], 2 * SIZE + (p11) ldf8 f45 = [Y], 2 * SIZE + (p11) fpma f6 = ALPHA_P, f32, f33 + } + ;; + { .mmf + (p11) ldf8 f46 = [X], 2 * SIZE + (p11) ldf8 f47 = [Y], 2 * SIZE + (p11) fpma f7 = ALPHA_P, f34, f35 + } + ;; + { .mmf + (p12) ldf8 f48 = [X], 2 * SIZE + (p12) ldf8 f49 = [Y], 2 * SIZE + (p11) fpma f10 = ALPHA_P, f36, f37 + } + ;; + { .mmi + (p11) stf8 [Y1] = f6, 2 * SIZE + nop.m 0 + tbit.z p0, p15 = N, 0 + } + { .mmf + (p12) ldf8 f50 = [X], 2 * SIZE + (p12) ldf8 f51 = [Y], 2 * SIZE + (p11) fpma f11 = ALPHA_P, f38, f39 + } + ;; + { .mmi + (p11) stf8 [Y1] = f7, 2 * SIZE + nop.m 0 + nop.i 0 + } + { .mmf + (p12) ldf8 f52 = [X], 2 * SIZE + (p12) ldf8 f53 = [Y], 2 * SIZE + } + ;; + { .mmi + (p11) stf8 [Y1] = f10, 2 * SIZE + nop.m 0 + nop.i 0 + } + { .mmf + (p12) ldf8 f54 = [X], 2 * SIZE + (p12) ldf8 f55 = [Y], 2 * SIZE + (p11) fpma f12 = ALPHA_P, f40, f41 + } + ;; + { .mmi + (p11) stf8 [Y1] = f11, 2 * SIZE + nop.m 0 + nop.i 0 + } + { .mmf + (p13) ldf8 f56 = [X], 2 * SIZE + (p13) ldf8 f57 = [Y], 2 * SIZE + (p11) fpma f13 = ALPHA_P, f42, f43 + } + ;; + { .mmi + (p11) stf8 [Y1] = f12, 2 * SIZE + nop.m 0 + nop.i 0 + } + { .mmf + (p13) ldf8 f58 = [X], 2 * SIZE + (p13) ldf8 f59 = [Y], 2 * SIZE + (p11) fpma f14 = ALPHA_P, f44, f45 + } + ;; + { .mmi + (p11) stf8 [Y1] = f13, 2 * SIZE + nop.m 0 + nop.i 0 + } + { .mmf + (p14) ldf8 f60 = [X], 2 * SIZE + (p14) ldf8 f61 = [Y], 2 * SIZE + (p11) fpma f15 = ALPHA_P, f46, f47 + } + ;; + { .mmi + (p11) stf8 [Y1] = f14, 2 * SIZE + nop.m 0 + nop.i 0 + } + { .mmf + (p15) ldfs f62 = [X] + (p15) ldfs f63 = [Y] + (p12) fpma f6 = ALPHA_P, f48, f49 + } + ;; + (p12) fpma f7 = ALPHA_P, f50, f51 + (p12) fpma f10 = ALPHA_P, f52, f53 + ;; + (p11) stf8 [Y1] = f15, 2 * SIZE + (p12) fpma f11 = ALPHA_P, f54, f55 + ;; + (p12) stf8 [Y1] = f6, 2 * SIZE + (p13) fpma f12 = ALPHA_P, f56, f57 + ;; + (p12) stf8 [Y1] = f7, 2 * SIZE + (p13) fpma f13 = ALPHA_P, f58, f59 + ;; + (p12) stf8 [Y1] = f10, 2 * SIZE + (p14) fpma f14 = ALPHA_P, f60, f61 + ;; + (p12) stf8 [Y1] = f11, 2 * SIZE + (p15) FMA f15 = ALPHA, f62, f63 + ;; + (p13) stf8 [Y1] = f12, 2 * SIZE + ;; + (p13) stf8 [Y1] = f13, 2 * SIZE + ;; + (p14) stf8 [Y1] = f14, 2 * SIZE + ;; + (p15) stfs [Y1] = f15 + br.ret.sptk.many b0 + ;; + .align 32 + +.L30: + { .mmi + cmp.eq p9, p0 = r0, J + cmp.eq p7 ,p0 = 0, I + mov ar.ec = 4 + } + { .mmi + cmp.lt p12, p0 = 33, XA + adds I = -1, I + } + ;; + { .mmi + cmp.gt p14, p0 = 15, XA + cmp.lt p15, p0 = 60, XA + (p12) cmp.gt.unc p13, p0 = 53, XA + } + { .bbb + (p13) br.cond.dpnt .L40 + (p14) br.cond.dpnt .L40 + (p15) br.cond.dpnt .L40 + } + ;; + { .mmi + (p10) STFD [YYY] = f32 + adds PRE1 = (PREFETCHSIZE + 6) * SIZE, X + mov ar.lc = I + } + { .mib + adds PRE2 = (PREFETCHSIZE + 0) * SIZE, Y + tbit.z p0, p12 = N, 3 + (p7) br.cond.dpnt .L35 + } + ;; + .align 32 + +.L32: + { .mmf + (p19) STFD [Y1] = f6, 1 * SIZE + (p19) STFD [Y2] = f7, 1 * SIZE + (p18) FMA f6 = ALPHA, f34, f82 + } + { .mmf + (p16) LDFPD f32, f35 = [X], 2 * SIZE + (p16) LDFD f80 = [Y], 1 * SIZE + (p18) FMA f7 = ALPHA, f46, f94 + } + ;; + { .mmf + (p19) STFD [Y1] = f10, 1 * SIZE + (p19) STFD [Y2] = f11, 1 * SIZE + (p18) FMA f10 = ALPHA, f37, f85 + } + { .mmf + (p16) LDFPD f38, f41 = [X], 2 * SIZE + (p16) LDFPD f83, f86 = [Y], 2 * SIZE + (p18) FMA f11 = ALPHA, f49, f97 + } + ;; + { .mmf + (p19) STFD [Y1] = f12, 1 * SIZE + (p19) STFD [Y2] = f13, 1 * SIZE + (p18) FMA f12 = ALPHA, f40, f88 + } + { .mmf + (p16) LDFPD f44, f47 = [X], 2 * SIZE + (p16) LDFPD f89, f92 = [Y], 2 * SIZE + (p18) FMA f13 = ALPHA, f52, f100 + } + ;; + { .mmf + (p19) STFD [Y1] = f14, 5 * SIZE + (p19) STFD [Y2] = f15, 5 * SIZE + (p18) FMA f14 = ALPHA, f43, f91 + } + { .mmf + (p16) LDFPD f50, f53 = [X], 2 * SIZE + (p16) LDFPD f95, f98 = [Y], 2 * SIZE + (p18) FMA f15 = ALPHA, f55, f103 + } + ;; + { .mmf + (p18) STFD [Y1] = f6, 1 * SIZE + (p18) STFD [Y2] = f7, 1 * SIZE + (p18) FMA f6 = ALPHA, f58, f106 + } + { .mmf + (p16) LDFPD f56, f59 = [X], 2 * SIZE + (p16) LDFPD f101, f104 = [Y], 2 * SIZE + (p18) FMA f7 = ALPHA, f70, f118 + } + ;; + { .mmf + (p18) STFD [Y1] = f10, 1 * SIZE + (p18) STFD [Y2] = f11, 1 * SIZE + (p18) FMA f10 = ALPHA, f61, f109 + } + { .mmf + (p16) LDFPD f62, f65 = [X], 2 * SIZE + (p16) LDFPD f107, f110 = [Y], 2 * SIZE + (p18) FMA f11 = ALPHA, f73, f121 + } + ;; + { .mmf + (p18) STFD [Y1] = f12, 1 * SIZE + (p18) STFD [Y2] = f13, 1 * SIZE + (p18) FMA f12 = ALPHA, f64, f112 + } + { .mmf + (p16) LDFPD f68, f71 = [X], 2 * SIZE + (p16) LDFPD f113, f116 = [Y], 2 * SIZE + (p18) FMA f13 = ALPHA, f76, f124 + } + ;; + { .mmf + (p18) STFD [Y1] = f14, 5 * SIZE + (p18) STFD [Y2] = f15, 5 * SIZE + (p18) FMA f14 = ALPHA, f67, f115 + } + { .mmf + (p16) LDFPD f74, f77 = [X], 2 * SIZE + (p16) LDFPD f119, f122 = [Y], 2 * SIZE + (p18) FMA f15 = ALPHA, f79, f127 + } + ;; + { .mmi + (p16) lfetch.nt1 [PRE1], 16 * SIZE + (p16) lfetch.excl.nt1 [PRE2], 16 * SIZE + nop.i 0 + } + { .mmb + (p16) LDFD f125 = [Y], 1 * SIZE + nop.m 0 + br.ctop.sptk.few .L32 + } + ;; + .align 32 + +.L35: + { .mmi + (p12) LDFPD f32, f33 = [X], 2 * SIZE + (p12) LDFD f34 = [Y], 1 * SIZE; + mov pr = PR, -65474 + } + ;; + { .mmi + (p12) LDFPD f36, f37 = [X], 2 * SIZE + (p12) LDFPD f35, f38 = [Y], 2 * SIZE + mov ar.lc = ARLC + } + ;; + { .mmb + (p12) LDFPD f40, f41 = [X], 2 * SIZE + (p12) LDFPD f39, f42 = [Y], 2 * SIZE + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFPD f44, f45 = [X], 2 * SIZE + (p12) LDFPD f43, f46 = [Y], 2 * SIZE + tbit.z p0, p13 = N, 2 + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [X], 2 * SIZE + (p12) LDFD f47 = [Y], 1 * SIZE + tbit.z p0, p14 = N, 1 + } + ;; + { .mmi + (p13) LDFPD f52, f53 = [X], 2 * SIZE + (p13) LDFD f50 = [Y], 1 * SIZE + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p14) LDFPD f56, f57 = [X], 2 * SIZE + (p13) LDFPD f51, f54 = [Y], 2 * SIZE + mov YY = Y1; + } + ;; + (p15) LDFD f60 = [X] + (p13) LDFD f55 = [Y], 1 * SIZE + ;; + (p14) LDFD f58 = [Y], 1 * SIZE + (p12) FMA f6 = ALPHA, f32, f34 + (p12) FMA f7 = ALPHA, f40, f42 + ;; + (p14) LDFD f59 = [Y], 1 * SIZE + (p12) shladd YY = INCY, 3, YY + (p12) FMA f10 = ALPHA, f33, f35 + (p12) FMA f11 = ALPHA, f41, f43 + ;; + (p15) LDFD f61 = [Y] + (p13) shladd YY = INCY, 2, YY + (p12) FMA f12 = ALPHA, f36, f38 + (p12) FMA f13 = ALPHA, f44, f46 + ;; + (p12) STFD [Y1] = f6, 1 * SIZE + (p12) FMA f14 = ALPHA, f37, f39 + (p12) STFD [Y2] = f7, 1 * SIZE + (p12) FMA f15 = ALPHA, f45, f47 + ;; + (p12) STFD [Y1] = f10, 1 * SIZE + (p13) FMA f6 = ALPHA, f48, f50 + (p12) STFD [Y2] = f11, 1 * SIZE + (p14) FMA f7 = ALPHA, f56, f58 + ;; + (p12) STFD [Y1] = f12, 1 * SIZE + (p13) FMA f10 = ALPHA, f49, f51 + (p12) STFD [Y2] = f13, 1 * SIZE + (p14) FMA f11 = ALPHA, f57, f59 + ;; + (p12) STFD [Y1] = f14, 5 * SIZE + (p13) FMA f12 = ALPHA, f52, f54 + (p12) STFD [Y2] = f15, 5 * SIZE + (p15) FMA f13 = ALPHA, f60, f61 + ;; + (p13) STFD [Y1] = f6, 1 * SIZE + (p14) STFD [YY] = f7, 1 * SIZE + (p13) FMA f14 = ALPHA, f53, f55 + ;; + (p13) STFD [Y1] = f10, 1 * SIZE + (p14) STFD [YY] = f11, 1 * SIZE + ;; + (p13) STFD [Y1] = f12, 1 * SIZE + (p15) STFD [YY] = f13 + ;; + (p13) STFD [Y1] = f14 + br.ret.sptk.many b0 + ;; + .align 32 + +.L40: + { .mmi + (p10) STFD [YYY] = f32 + adds PRE1 = (PREFETCHSIZE + 38) * SIZE, X + mov ar.lc = I + } + { .mib + adds PRE2 = (PREFETCHSIZE + 14) * SIZE, Y + tbit.z p0, p12 = N, 3 + (p7) br.cond.dpnt .L45 + } + ;; + .align 32 + +.L42: + { .mmf + (p19) STFD [Y1] = f6, 1 * SIZE + (p19) STFD [Y2] = f7, 1 * SIZE + (p18) FMA f6 = ALPHA, f34, f82 + } + { .mmf + (p16) lfetch.nt1 [PRE1], 16 * SIZE + (p17) LDFPD f102, f105 = [Y], 2 * SIZE + (p18) FMA f7 = ALPHA, f46, f94 + } + ;; + { .mmf + (p19) STFD [Y1] = f10, 1 * SIZE + (p19) STFD [Y2] = f11, 1 * SIZE + (p18) FMA f10 = ALPHA, f37, f85 + } + { .mmf + (p17) LDFPD f33, f36 = [X], 2 * SIZE + (p17) LDFPD f108, f111 = [Y], 2 * SIZE + (p18) FMA f11 = ALPHA, f49, f97 + } + ;; + { .mmf + (p19) STFD [Y1] = f12, 1 * SIZE + (p19) STFD [Y2] = f13, 1 * SIZE + (p18) FMA f12 = ALPHA, f40, f88 + } + { .mmf + (p17) LDFPD f39, f42 = [X], 2 * SIZE + (p17) LDFPD f114, f117 = [Y], 2 * SIZE + (p18) FMA f13 = ALPHA, f52, f100 + } + ;; + { .mmf + (p19) STFD [Y1] = f14, 5 * SIZE + (p19) STFD [Y2] = f15, 5 * SIZE + (p18) FMA f14 = ALPHA, f43, f91 + } + { .mmf + (p17) LDFPD f45, f48 = [X], 2 * SIZE + (p17) LDFPD f120, f123 = [Y], 2 * SIZE + (p18) FMA f15 = ALPHA, f55, f103 + } + ;; + { .mmf + (p18) STFD [Y1] = f6, 1 * SIZE + (p18) STFD [Y2] = f7, 1 * SIZE + (p18) FMA f6 = ALPHA, f58, f106 + } + { .mmf + (p17) LDFPD f51, f54 = [X], 2 * SIZE + (p17) LDFD f126 = [Y], 1 * SIZE + (p18) FMA f7 = ALPHA, f70, f118 + } + ;; + { .mmf + (p18) STFD [Y1] = f10, 1 * SIZE + (p18) STFD [Y2] = f11, 1 * SIZE + (p18) FMA f10 = ALPHA, f61, f109 + } + { .mmf + (p17) LDFPD f57, f60 = [X], 2 * SIZE + (p16) LDFD f80 = [Y], 1 * SIZE + (p18) FMA f11 = ALPHA, f73, f121 + } + ;; + { .mmf + (p18) STFD [Y1] = f12, 1 * SIZE + (p18) STFD [Y2] = f13, 1 * SIZE + (p18) FMA f12 = ALPHA, f64, f112 + } + { .mmf + (p17) LDFPD f63, f66 = [X], 2 * SIZE + (p16) LDFPD f83, f86 = [Y], 2 * SIZE + (p18) FMA f13 = ALPHA, f76, f124 + } + ;; + { .mmf + (p18) STFD [Y1] = f14, 5 * SIZE + (p18) STFD [Y2] = f15, 5 * SIZE + (p18) FMA f14 = ALPHA, f67, f115 + } + { .mmf + (p17) LDFPD f69, f72 = [X], 2 * SIZE + (p16) LDFPD f89, f92 = [Y], 2 * SIZE + (p18) FMA f15 = ALPHA, f79, f127 + } + ;; +#if 0 + (p16) lfetch.excl.nt1 [PRE2], 16 * SIZE +#endif + { .mmb + (p17) LDFPD f75, f78 = [X], 2 * SIZE + (p16) LDFPD f95, f98 = [Y], 2 * SIZE + br.ctop.sptk.few .L42 + } + ;; + { .mmf + (p19) STFD [Y1] = f6, 1 * SIZE + (p19) STFD [Y2] = f7, 1 * SIZE + } + ;; + { .mmf + (p19) STFD [Y1] = f10, 1 * SIZE + (p19) STFD [Y2] = f11, 1 * SIZE + } + ;; + { .mmf + (p19) STFD [Y1] = f12, 1 * SIZE + (p19) STFD [Y2] = f13, 1 * SIZE + } + ;; + { .mmf + (p19) STFD [Y1] = f14, 5 * SIZE + (p19) STFD [Y2] = f15, 5 * SIZE + } + ;; + .align 32 + +.L45: + { .mmi + (p12) LDFPD f32, f33 = [X], 2 * SIZE + (p12) LDFD f34 = [Y], 1 * SIZE; + mov pr = PR, -65474 + } + ;; + { .mmi + (p12) LDFPD f36, f37 = [X], 2 * SIZE + (p12) LDFPD f35, f38 = [Y], 2 * SIZE + mov ar.lc = ARLC + } + ;; + { .mmb + (p12) LDFPD f40, f41 = [X], 2 * SIZE + (p12) LDFPD f39, f42 = [Y], 2 * SIZE + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFPD f44, f45 = [X], 2 * SIZE + (p12) LDFPD f43, f46 = [Y], 2 * SIZE + tbit.z p0, p13 = N, 2 + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [X], 2 * SIZE + (p12) LDFD f47 = [Y], 1 * SIZE + tbit.z p0, p14 = N, 1 + } + ;; + { .mmi + (p13) LDFPD f52, f53 = [X], 2 * SIZE + (p13) LDFD f50 = [Y], 1 * SIZE + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p14) LDFPD f56, f57 = [X], 2 * SIZE + (p13) LDFPD f51, f54 = [Y], 2 * SIZE + mov YY = Y1; + } + ;; + (p15) LDFD f60 = [X] + (p13) LDFD f55 = [Y], 1 * SIZE + ;; + (p14) LDFD f58 = [Y], 1 * SIZE + (p12) FMA f6 = ALPHA, f32, f34 + (p12) FMA f7 = ALPHA, f40, f42 + ;; + (p14) LDFD f59 = [Y], 1 * SIZE + (p12) shladd YY = INCY, 3, YY + (p12) FMA f10 = ALPHA, f33, f35 + (p12) FMA f11 = ALPHA, f41, f43 + ;; + (p15) LDFD f61 = [Y] + (p13) shladd YY = INCY, 2, YY + (p12) FMA f12 = ALPHA, f36, f38 + (p12) FMA f13 = ALPHA, f44, f46 + ;; + (p12) STFD [Y1] = f6, 1 * SIZE + (p12) FMA f14 = ALPHA, f37, f39 + (p12) STFD [Y2] = f7, 1 * SIZE + (p12) FMA f15 = ALPHA, f45, f47 + ;; + (p12) STFD [Y1] = f10, 1 * SIZE + (p13) FMA f6 = ALPHA, f48, f50 + (p12) STFD [Y2] = f11, 1 * SIZE + (p14) FMA f7 = ALPHA, f56, f58 + ;; + (p12) STFD [Y1] = f12, 1 * SIZE + (p13) FMA f10 = ALPHA, f49, f51 + (p12) STFD [Y2] = f13, 1 * SIZE + (p14) FMA f11 = ALPHA, f57, f59 + ;; + (p12) STFD [Y1] = f14, 5 * SIZE + (p13) FMA f12 = ALPHA, f52, f54 + (p12) STFD [Y2] = f15, 5 * SIZE + (p15) FMA f13 = ALPHA, f60, f61 + ;; + (p13) STFD [Y1] = f6, 1 * SIZE + (p14) STFD [YY] = f7, 1 * SIZE + (p13) FMA f14 = ALPHA, f53, f55 + ;; + (p13) STFD [Y1] = f10, 1 * SIZE + (p14) STFD [YY] = f11, 1 * SIZE + ;; + (p13) STFD [Y1] = f12, 1 * SIZE + (p15) STFD [YY] = f13 + ;; + (p13) STFD [Y1] = f14 + br.ret.sptk.many b0 + ;; + .align 32 + +.L100: + { .mii + and J = 15, N + shr I = N, 4 + mov ar.ec = 3 + } + ;; + { .mmi + cmp.eq p9, p0 = r0, J + cmp.eq p7 ,p0 = 0, I + adds I = -1, I + } + ;; + { .mmi + (p10) STFD [YYY] = f32 + adds PRE1 = PREFETCHSIZE * SIZE, X + mov ar.lc = I + } + { .mib + adds PRE2 = PREFETCHSIZE * SIZE, Y + tbit.z p0, p12 = N, 3 + (p7) br.cond.dpnt .L115 + } + ;; + .align 32 + +.L112: + { .mmi + (p18) STFD [Y1] = f6 + (p16) lfetch.nt1 [PRE1], INCX16 + (p18) add Y1 = INCY, Y1 + } + {.mmf + (p16) LDFD f32 = [X], INCX + (p16) LDFD f80 = [Y], INCY + (p18) FMA f6 = ALPHA, f58, f106 + } + ;; + { .mmi + (p18) STFD [Y1] = f7 + (p16) lfetch.excl.nt1 [PRE2], INCY16 + (p18) add Y1 = INCY, Y1 + } + { .mmf + (p16) LDFD f35 = [X], INCX + (p16) LDFD f83 = [Y], INCY + (p18) FMA f7 = ALPHA, f61, f109 + } + ;; + { .mmi + (p18) STFD [Y1] = f10 + (p18) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p16) LDFD f38 = [X], INCX + (p16) LDFD f86 = [Y], INCY + (p18) FMA f10 = ALPHA, f64, f112 + } + ;; + { .mmi + (p18) STFD [Y1] = f11 + (p18) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p16) LDFD f41 = [X], INCX + (p16) LDFD f89 = [Y], INCY + (p18) FMA f11 = ALPHA, f67, f115 + } + ;; + { .mmi + (p18) STFD [Y1] = f12 + (p18) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p16) LDFD f44 = [X], INCX + (p16) LDFD f92 = [Y], INCY + (p18) FMA f12 = ALPHA, f70, f118 + } + ;; + { .mmi + (p18) STFD [Y1] = f13 + (p18) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p16) LDFD f47 = [X], INCX + (p16) LDFD f95 = [Y], INCY + (p18) FMA f13 = ALPHA, f73, f121 + } + ;; + { .mmi + (p18) STFD [Y1] = f14 + (p18) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p16) LDFD f50 = [X], INCX + (p16) LDFD f98 = [Y], INCY + (p18) FMA f14 = ALPHA, f76, f124 + } + ;; + { .mmi + (p18) STFD [Y1] = f15 + (p18) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p16) LDFD f53 = [X], INCX + (p16) LDFD f101 = [Y], INCY + (p18) FMA f15 = ALPHA, f79, f127 + } + ;; + { .mmi + (p18) STFD [Y1] = f6 + (p18) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p16) LDFD f56 = [X], INCX + (p16) LDFD f104 = [Y], INCY + (p17) FMA f6 = ALPHA, f33, f81 + } + ;; + { .mmi + (p18) STFD [Y1] = f7 + (p18) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p16) LDFD f59 = [X], INCX + (p16) LDFD f107 = [Y], INCY + (p17) FMA f7 = ALPHA, f36, f84 + } + ;; + { .mmi + (p18) STFD [Y1] = f10 + (p18) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p16) LDFD f62 = [X], INCX + (p16) LDFD f110 = [Y], INCY + (p17) FMA f10 = ALPHA, f39, f87 + } + ;; + { .mmi + (p18) STFD [Y1] = f11 + (p18) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p16) LDFD f65 = [X], INCX + (p16) LDFD f113 = [Y], INCY + (p17) FMA f11 = ALPHA, f42, f90 + } + ;; + { .mmi + (p18) STFD [Y1] = f12 + (p18) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p16) LDFD f68 = [X], INCX + (p16) LDFD f116 = [Y], INCY + (p17) FMA f12 = ALPHA, f45, f93 + } + ;; + { .mmi + (p18) STFD [Y1] = f13 + (p18) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p16) LDFD f71 = [X], INCX + (p16) LDFD f119 = [Y], INCY + (p17) FMA f13 = ALPHA, f48, f96 + } + ;; + { .mmi + (p18) STFD [Y1] = f14 + (p18) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p16) LDFD f74 = [X], INCX + (p16) LDFD f122 = [Y], INCY + (p17) FMA f14 = ALPHA, f51, f99 + } + ;; + { .mmf + (p18) STFD [Y1] = f15 + (p18) add Y1 = INCY, Y1 + (p17) FMA f15 = ALPHA, f54, f102 + } + { .mmb + (p16) LDFD f77 = [X], INCX + (p16) LDFD f125 = [Y], INCY + br.ctop.sptk.few .L112 + } + ;; + .align 32 + +.L115: + (p12) LDFD f32 = [X], INCX + (p12) LDFD f34 = [Y], INCY + mov pr = PR, -65474 + ;; + (p12) LDFD f33 = [X], INCX + (p12) LDFD f35 = [Y], INCY + mov ar.lc = ARLC + ;; + (p12) LDFD f36 = [X], INCX + (p12) LDFD f38 = [Y], INCY + (p9) br.ret.sptk.many b0 + ;; + (p12) LDFD f37 = [X], INCX + (p12) LDFD f39 = [Y], INCY + tbit.z p0, p13 = N, 2 + ;; + (p12) LDFD f40 = [X], INCX + (p12) LDFD f42 = [Y], INCY + tbit.z p0, p14 = N, 1 + ;; + (p12) LDFD f41 = [X], INCX + (p12) LDFD f43 = [Y], INCY + tbit.z p0, p15 = N, 0 + ;; + { .mmf + (p12) LDFD f44 = [X], INCX + (p12) LDFD f46 = [Y], INCY + (p12) FMA f6 = ALPHA, f32, f34 + } + ;; + { .mmf + (p12) LDFD f45 = [X], INCX + (p12) LDFD f47 = [Y], INCY + (p12) FMA f7 = ALPHA, f33, f35 + } + ;; + { .mmf + (p13) LDFD f48 = [X], INCX + (p13) LDFD f50 = [Y], INCY + (p12) FMA f10 = ALPHA, f36, f38 + } + ;; + { .mmf + (p13) LDFD f49 = [X], INCX + (p13) LDFD f51 = [Y], INCY + (p12) FMA f11 = ALPHA, f37, f39 + } + ;; + { .mmi + (p12) STFD [Y1] = f6 + (p12) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p13) LDFD f52 = [X], INCX + (p13) LDFD f54 = [Y], INCY + (p12) FMA f12 = ALPHA, f40, f42 + } + ;; + { .mmi + (p12) STFD [Y1] = f7 + (p12) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p13) LDFD f53 = [X], INCX + (p13) LDFD f55 = [Y], INCY + (p12) FMA f13 = ALPHA, f41, f43 + } + ;; + { .mmi + (p12) STFD [Y1] = f10 + (p12) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p14) LDFD f56 = [X], INCX + (p14) LDFD f58 = [Y], INCY + (p12) FMA f14 = ALPHA, f44, f46 + } + ;; + { .mmi + (p12) STFD [Y1] = f11 + (p12) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p14) LDFD f57 = [X], INCX + (p14) LDFD f59 = [Y], INCY + (p12) FMA f15 = ALPHA, f45, f47 + } + ;; + { .mmi + (p12) STFD [Y1] = f12 + (p12) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p15) LDFD f60 = [X] + (p15) LDFD f61 = [Y] + (p13) FMA f6 = ALPHA, f48, f50 + } + ;; + { .mmf + (p12) STFD [Y1] = f13 + (p12) add Y1 = INCY, Y1 + (p13) FMA f7 = ALPHA, f49, f51 + } + ;; + { .mmf + (p12) STFD [Y1] = f14 + (p12) add Y1 = INCY, Y1 + (p13) FMA f10 = ALPHA, f52, f54 + } + ;; + { .mmf + (p12) STFD [Y1] = f15 + (p12) add Y1 = INCY, Y1 + (p13) FMA f11 = ALPHA, f53, f55 + } + ;; + { .mmf + (p13) STFD [Y1] = f6 + (p13) add Y1 = INCY, Y1 + (p14) FMA f12 = ALPHA, f56, f58 + } + ;; + { .mmf + (p13) STFD [Y1] = f7 + (p13) add Y1 = INCY, Y1 + (p14) FMA f13 = ALPHA, f57, f59 + } + ;; + { .mmf + (p13) STFD [Y1] = f10 + (p13) add Y1 = INCY, Y1 + (p15) FMA f14 = ALPHA, f60, f61 + } + ;; + (p13) STFD [Y1] = f11 + (p13) add Y1 = INCY, Y1 + ;; + (p14) STFD [Y1] = f12 + (p14) add Y1 = INCY, Y1 + ;; + (p14) STFD [Y1] = f13 + (p14) add Y1 = INCY, Y1 + ;; + (p15) STFD [Y1] = f14 + br.ret.sptk.many b0 + ;; + EPILOGUE + diff --git a/kernel/ia64/scal.S b/kernel/ia64/scal.S new file mode 100644 index 0000000..e3d93dd --- /dev/null +++ b/kernel/ia64/scal.S @@ -0,0 +1,950 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef DOUBLE +#define PREFETCH_SIZE (8 * 16) +#else +#define PREFETCH_SIZE (1 * 64) +#endif + +#define ALPHA f8 + +#define N r32 +#define X1 r36 +#define INCX r37 + +#define X2 r14 +#define Y1 r15 +#define Y2 r16 +#define PRE1 r17 +#define I r18 +#define NAND15 r19 +#define INCX5 r20 +#define INCX16 r21 +#define XX r22 +#define PR r30 +#define ARLC r31 + + PROLOGUE + .prologue + PROFCODE + + { .mfi + shladd INCX = INCX, BASE_SHIFT, r0 + fcmp.eq p0, p6 = ALPHA, f0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mib + cmp.ge p7, p0 = 0, N + tbit.z p0, p10 = X1, BASE_SHIFT + (p7) br.ret.sptk.many b0 + } + .body + ;; + { .mmi + mov XX = X1 + (p10) LDFD f32 = [X1], INCX + mov PR = pr + } + { .mmi + shladd INCX5 = INCX, 2, INCX + shladd INCX16 = INCX, 4, r0 + (p10) adds N = -1, N + } + ;; + { .mmi + shladd X2 = INCX, 2, X1 + nop __LINE__ + mov ar.ec = 5 + } + { .mmi + and NAND15 = 15, N + nop __LINE__ + shr I = N, 4 + } + ;; + { .mmi + adds I = -1, I + nop __LINE__ + tbit.z p0, p12 = N, 3 + } + { .mmb + cmp.ge p9, p0 = 0, NAND15 + adds PRE1 = PREFETCH_SIZE * SIZE + 192, XX + (p6) br.cond.dptk .L100 // if (alpha != 0) goto L3 + } + ;; + { .mmi + (p10) STFD [XX] = f0 + nop __LINE__ + mov ar.lc = I + } + { .mmb + cmp.gt p8, p0 = 0, I + (p8) br.cond.dpnt .L30 + } + ;; + .align 32 + +.L20: + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop __LINE__ + } + {.mmi + lfetch.excl.nt1 [PRE1], INCX16 + add X1 = INCX, X1 + add X2 = INCX, X2 + } + ;; + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop __LINE__ + } + {.mmi + add X1 = INCX, X1 + add X2 = INCX, X2 + nop __LINE__ + } + ;; + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop __LINE__ + } + {.mmi + add X1 = INCX, X1 + add X2 = INCX, X2 + nop __LINE__ + } + ;; + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop __LINE__ + } + {.mmi + add X1 = INCX5, X1 + add X2 = INCX5, X2 + nop __LINE__ + } + ;; + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop __LINE__ + } + {.mmi + add X1 = INCX, X1 + add X2 = INCX, X2 + nop __LINE__ + } + ;; + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop __LINE__ + } + {.mmi + add X1 = INCX, X1 + add X2 = INCX, X2 + nop __LINE__ + } + ;; + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop __LINE__ + } + {.mmi + add X1 = INCX, X1 + add X2 = INCX, X2 + nop __LINE__ + } + ;; + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop __LINE__ + } + {.mmb + add X1 = INCX5, X1 + add X2 = INCX5, X2 + br.cloop.sptk.few .L20 + } + ;; + .align 16 + +.L30: + { .mmi + (p12) STFD [X1] = f0 + (p12) STFD [X2] = f0 + mov ar.lc = ARLC + } + { .mmb + (p12) add X1 = INCX, X1 + (p12) add X2 = INCX, X2 + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) STFD [X1] = f0 + (p12) add X1 = INCX, X1 + tbit.z p0, p13 = N, 2 + } + { .mmi + (p12) STFD [X2] = f0 + (p12) add X2 = INCX, X2 + tbit.z p0, p14 = N, 1 + } + ;; + { .mmi + (p12) STFD [X1] = f0 + (p12) add X1 = INCX, X1 + tbit.z p0, p15 = N, 0 + } + { .mmb + (p12) STFD [X2] = f0 + (p12) add X2 = INCX, X2 + nop __LINE__ + } + ;; + { .mmb + (p12) STFD [X1] = f0 + (p12) add X1 = INCX5, X1 + nop __LINE__ + } + { .mmb + (p12) STFD [X2] = f0 + (p12) add X2 = INCX5, X2 + nop __LINE__ + } + ;; + { .mmb + (p13) STFD [X1] = f0 + (p13) add X1 = INCX, X1 + nop __LINE__ + } + ;; + { .mmb + (p13) STFD [X1] = f0 + (p13) add X1 = INCX, X1 + nop __LINE__ + } + ;; + { .mmb + (p13) STFD [X1] = f0 + (p13) add X1 = INCX, X1 + nop __LINE__ + } + ;; + { .mmb + (p13) STFD [X1] = f0 + (p13) add X1 = INCX, X1 + nop __LINE__ + } + ;; + { .mmb + (p14) STFD [X1] = f0 + (p14) add X1 = INCX, X1 + nop __LINE__ + } + ;; + { .mmb + (p14) STFD [X1] = f0 + (p14) add X1 = INCX, X1 + nop __LINE__ + } + ;; + { .mmb + (p15) STFD [X1] = f0 + nop __LINE__ + br.ret.sptk.many b0 + } + ;; + .align 32 + +.L100: + { .mmi + mov Y1 = X1 + shladd Y2 = INCX, 2, X1 + mov pr.rot = 0 + } + { .mmf + cmp.gt p8, p0 = 0, I + shladd X2 = INCX, 2, X1 + (p10) FMPY f32 = ALPHA, f32 + } + ;; + { .mmi + (p10) STFD [XX] = f32 + cmp.eq p0, p7 = SIZE, INCX + mov ar.lc = I + } + { .mbb + cmp.eq p16, p0 = r0, r0 + (p7) br.cond.dpnt .L300 + (p8) br.cond.dpnt .L120 + } + ;; + .align 32 + +.L110: + { .mmf + (p21) STFD [Y1] = f6, 1 * SIZE + (p21) STFD [Y2] = f7, 1 * SIZE + (p20) FMPY f112 = ALPHA, f36 + } + { .mmf + (p16) lfetch.excl.nt1 [PRE1], 16 * SIZE + (p16) LDFPD f32, f37 = [X1], 2 * SIZE + (p20) FMPY f113 = ALPHA, f56 + } + ;; + { .mmf + (p21) STFD [Y1] = f10, 1 * SIZE + (p21) STFD [Y2] = f11, 1 * SIZE + (p20) FMPY f114 = ALPHA, f41 + } + { .mfi + (p16) LDFPD f42, f47 = [X1], 2 * SIZE + (p20) FMPY f115 = ALPHA, f61 + nop __LINE__ + } + ;; + { .mmf + (p21) STFD [Y1] = f12, 1 * SIZE + (p21) STFD [Y2] = f13, 1 * SIZE + (p20) FMPY f116 = ALPHA, f46 + } + { .mfi + (p16) LDFPD f52, f57 = [X1], 2 * SIZE + (p20) FMPY f117 = ALPHA, f66 + nop __LINE__ + } + ;; + { .mmf + (p21) STFD [Y1] = f14, 5 * SIZE + (p21) STFD [Y2] = f15, 5 * SIZE + (p20) FMPY f118 = ALPHA, f51 + } + { .mfi + (p16) LDFPD f62, f67 = [X1], 2 * SIZE + (p20) FMPY f119 = ALPHA, f71 + nop __LINE__ + } + ;; + { .mmf + (p20) STFD [Y1] = f112, 1 * SIZE + (p20) STFD [Y2] = f113, 1 * SIZE + (p20) FMPY f6 = ALPHA, f76 + } + { .mfi + (p16) LDFPD f72, f77 = [X1], 2 * SIZE + (p20) FMPY f7 = ALPHA, f96 + nop __LINE__ + } + ;; + { .mmf + (p20) STFD [Y1] = f114, 1 * SIZE + (p20) STFD [Y2] = f115, 1 * SIZE + (p20) FMPY f10 = ALPHA, f81 + } + { .mfi + (p16) LDFPD f82, f87 = [X1], 2 * SIZE + (p20) FMPY f11 = ALPHA, f101 + nop __LINE__ + } + ;; + { .mmf + (p20) STFD [Y1] = f116, 1 * SIZE + (p20) STFD [Y2] = f117, 1 * SIZE + (p20) FMPY f12 = ALPHA, f86 + } + { .mfi + (p16) LDFPD f92, f97 = [X1], 2 * SIZE + (p20) FMPY f13 = ALPHA, f106 + (p20) shladd X2 = INCX, 2, X1 + } + ;; + { .mmf + (p20) STFD [Y1] = f118, 5 * SIZE + (p20) STFD [Y2] = f119, 5 * SIZE + (p20) FMPY f14 = ALPHA, f91 + } + { .mfb + (p16) LDFPD f102, f107 = [X1], 2 * SIZE + (p20) FMPY f15 = ALPHA, f111 + br.ctop.sptk.few .L110 + } + ;; + .align 32 + +.L120: + { .mmi + (p21) STFD [Y1] = f6, 1 * SIZE + (p21) STFD [Y2] = f7, 1 * SIZE + tbit.z p0, p13 = N, 2 + } + { .mmi + (p12) LDFPD f32, f33 = [X1], 2 * SIZE + (p12) LDFPD f36, f37 = [X2], 2 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p21) STFD [Y1] = f10, 1 * SIZE + (p21) STFD [Y2] = f11, 1 * SIZE + mov ar.lc = ARLC + } + { .mmi + (p12) LDFPD f34, f35 = [X1] + (p12) LDFPD f38, f39 = [X2] + (p12) adds X1 = 6 * SIZE,X1 + } + ;; + { .mmi + (p21) STFD [Y1] = f12, 1 * SIZE + (p21) STFD [Y2] = f13, 1 * SIZE + tbit.z p0, p14 = N, 1 + } + { .mmi + (p13) LDFPD f40, f41 = [X1], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p21) STFD [Y1] = f14, 5 * SIZE + (p21) STFD [Y2] = f15, 5 * SIZE + mov pr = PR, -65474 + } + { .mib + (p13) LDFPD f42, f43 = [X1], 2 * SIZE + nop __LINE__ + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p14) LDFPD f44, f45 = [X1], 2 * SIZE + nop __LINE__ + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p15) LDFD f46 = [X1] + nop __LINE__ + nop __LINE__ + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMPY f32 = ALPHA, f32 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMPY f36 = ALPHA, f36 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMPY f33 = ALPHA, f33 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMPY f37 = ALPHA, f37 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMPY f34 = ALPHA, f34 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMPY f38 = ALPHA, f38 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMPY f35 = ALPHA, f35 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMPY f39 = ALPHA, f39 + } + ;; + { .mmf + (p12) STFD [Y1] = f32, 1 * SIZE + nop __LINE__ + (p13) FMPY f40 = ALPHA, f40 + } + { .mmf + (p12) STFD [Y2] = f36, 1 * SIZE + nop __LINE__ + (p13) FMPY f41 = ALPHA, f41 + } + ;; + { .mmf + (p12) STFD [Y1] = f33, 1 * SIZE + nop __LINE__ + (p13) FMPY f42 = ALPHA, f42 + } + { .mmf + (p12) STFD [Y2] = f37, 1 * SIZE + nop __LINE__ + (p13) FMPY f43 = ALPHA, f43 + } + ;; + { .mmf + (p12) STFD [Y1] = f34, 1 * SIZE + nop __LINE__ + (p14) FMPY f44 = ALPHA, f44 + } + { .mmf + (p12) STFD [Y2] = f38, 1 * SIZE + nop __LINE__ + (p14) FMPY f45 = ALPHA, f45 + } + ;; + { .mmf + (p12) STFD [Y1] = f35, 5 * SIZE + (p12) STFD [Y2] = f39, 5 * SIZE + (p15) FMPY f46 = ALPHA, f46 + } + ;; + { .mmi + (p13) STFD [Y1] = f40, 1 * SIZE + ;; + (p13) STFD [Y1] = f41, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p13) STFD [Y1] = f42, 1 * SIZE + ;; + (p13) STFD [Y1] = f43, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p14) STFD [Y1] = f44, 1 * SIZE + ;; + (p14) STFD [Y1] = f45, 1 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p15) STFD [Y1] = f46 + nop __LINE__ + br.ret.sptk.many b0 + } + ;; + .align 32 + +.L300: + { .mmi + adds PRE1 = PREFETCH_SIZE * SIZE + 64, X1 + nop __LINE__ + mov.i ar.ec = 6 + } + { .mmb + cmp.gt p8, p0 = 0, I + nop __LINE__ + (p8) br.cond.dpnt .L320 + } + ;; + .align 32 + +.L310: + { .mmf + (p16) lfetch.excl.nt1 [PRE1], INCX16 + (p16) LDFD f32 = [X1], INCX + (p21) FMPY f6 = ALPHA, f37 + } + { .mmb + (p22) STFD [Y1] = f12 + (p22) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f38 = [X1], INCX + (p21) FMPY f7 = ALPHA, f43 + nop __LINE__ + } + { .mmb + (p22) STFD [Y1] = f13 + (p22) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f44 = [X1], INCX + (p21) FMPY f10 = ALPHA, f49 + nop __LINE__ + } + { .mmb + (p22) STFD [Y1] = f14 + (p22) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f50 = [X1], INCX + (p21) FMPY f11 = ALPHA, f55 + nop __LINE__ + } + { .mmb + (p22) STFD [Y1] = f15 + (p22) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f56 = [X1], INCX + (p21) FMPY f12 = ALPHA, f61 + nop __LINE__ + } + { .mmb + (p21) STFD [Y1] = f6 + (p21) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f62 = [X1], INCX + (p21) FMPY f13 = ALPHA, f67 + nop __LINE__ + } + { .mmb + (p21) STFD [Y1] = f7 + (p21) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f68 = [X1], INCX + (p21) FMPY f14 = ALPHA, f73 + nop __LINE__ + } + { .mmb + (p21) STFD [Y1] = f10 + (p21) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f74 = [X1], INCX + (p21) FMPY f15 = ALPHA, f79 + nop __LINE__ + } + { .mmb + (p21) STFD [Y1] = f11 + (p21) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f80 = [X1], INCX + (p21) FMPY f6 = ALPHA, f85 + nop __LINE__ + } + { .mmb + (p21) STFD [Y1] = f12 + (p21) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f86 = [X1], INCX + (p21) FMPY f7 = ALPHA, f91 + nop __LINE__ + } + { .mmb + (p21) STFD [Y1] = f13 + (p21) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f92 = [X1], INCX + (p21) FMPY f10 = ALPHA, f97 + nop __LINE__ + } + { .mmb + (p21) STFD [Y1] = f14 + (p21) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f98 = [X1], INCX + (p21) FMPY f11 = ALPHA, f103 + nop __LINE__ + } + { .mmb + (p21) STFD [Y1] = f15 + (p21) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f104 = [X1], INCX + (p21) FMPY f12 = ALPHA, f109 + nop __LINE__ + } + { .mmb + (p21) STFD [Y1] = f6 + (p21) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f110 = [X1], INCX + (p21) FMPY f13 = ALPHA, f115 + nop __LINE__ + } + { .mmb + (p21) STFD [Y1] = f7 + (p21) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f116 = [X1], INCX + (p21) FMPY f14 = ALPHA, f121 + nop __LINE__ + } + { .mmb + (p21) STFD [Y1] = f10 + (p21) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f122 = [X1], INCX + (p21) FMPY f15 = ALPHA, f127 + nop __LINE__ + } + { .mmb + (p21) STFD [Y1] = f11 + (p21) add Y1 = INCX, Y1 + br.ctop.sptk.few .L310 + } + ;; + STFD [Y1] = f12 + add Y1 = INCX, Y1 + shladd Y2 = INCX, 2, X1 + ;; + STFD [Y1] = f13 + add Y1 = INCX, Y1 + shladd X2 = INCX, 2, X1 + ;; + STFD [Y1] = f14 + add Y1 = INCX, Y1 + ;; + STFD [Y1] = f15 + add Y1 = INCX, Y1 + ;; + .align 16 + +.L320: + { .mmi + (p12) LDFD f48 = [X1], INCX + (p12) LDFD f52 = [X2], INCX + mov ar.lc = ARLC + } + ;; + { .mmi + (p12) LDFD f49 = [X1], INCX + (p12) LDFD f53 = [X2], INCX + mov pr = PR, -65474 + } + { .mmb + nop.m 0 + nop.m 0 + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFD f50 = [X1], INCX + (p12) LDFD f54 = [X2], INCX + tbit.z p0, p13 = N, 2 + } + ;; + { .mmi + (p12) LDFD f51 = [X1], INCX5 + (p12) LDFD f55 = [X2], INCX5 + tbit.z p0, p14 = N, 1 + } + ;; + (p13) LDFD f56 = [X1], INCX + tbit.z p0, p15 = N, 0 + ;; + (p13) LDFD f57 = [X1], INCX + ;; + { .mfi + (p13) LDFD f58 = [X1], INCX + (p12) FMPY f48 = ALPHA, f48 + } + { .mfi + (p12) FMPY f52 = ALPHA, f52 + } + ;; + { .mfi + (p13) LDFD f59 = [X1], INCX + (p12) FMPY f49 = ALPHA, f49 + } + { .mfi + (p12) FMPY f53 = ALPHA, f53 + } + ;; + { .mfi + (p14) LDFD f60 = [X1], INCX + (p12) FMPY f50 = ALPHA, f50 + } + { .mfi + (p12) FMPY f54 = ALPHA, f54 + } + ;; + { .mfi + (p14) LDFD f61 = [X1], INCX + (p12) FMPY f51 = ALPHA, f51 + } + { .mfi + (p12) FMPY f55 = ALPHA, f55 + } + ;; + { .mmf + (p12) STFD [Y1] = f48 + (p12) STFD [Y2] = f52 + (p13) FMPY f56 = ALPHA, f56 + } + { .mmi + (p15) LDFD f62 = [X1] + (p12) add Y1 = INCX, Y1 + (p12) add Y2 = INCX, Y2 + } + ;; + { .mmf + (p12) STFD [Y1] = f49 + (p12) STFD [Y2] = f53 + (p13) FMPY f57 = ALPHA, f57 + } + { .mmi + (p12) add Y1 = INCX, Y1 + (p12) add Y2 = INCX, Y2 + nop __LINE__ + } + ;; + { .mmf + (p12) STFD [Y1] = f50 + (p12) STFD [Y2] = f54 + (p13) FMPY f58 = ALPHA, f58 + } + { .mmi + (p12) add Y1 = INCX, Y1 + (p12) add Y2 = INCX, Y2 + nop __LINE__ + } + ;; + { .mmf + (p12) STFD [Y1] = f51 + (p12) STFD [Y2] = f55 + (p13) FMPY f59 = ALPHA, f59 + } + { .mmi + (p12) add Y1 = INCX5, Y1 + (p12) add Y2 = INCX5, Y2 + nop __LINE__ + } + ;; + { .mfi + (p13) STFD [Y1] = f56 + (p14) FMPY f60 = ALPHA, f60 + (p13) add Y1 = INCX, Y1 + } + ;; + { .mfi + (p13) STFD [Y1] = f57 + (p14) FMPY f61 = ALPHA, f61 + (p13) add Y1 = INCX, Y1 + } + ;; + { .mfi + (p13) STFD [Y1] = f58 + (p15) FMPY f62 = ALPHA, f62 + (p13) add Y1 = INCX, Y1 + } + ;; + { .mmi + (p13) STFD [Y1] = f59 + (p13) add Y1 = INCX, Y1 + } + ;; + { .mmi + (p14) STFD [Y1] = f60 + (p14) add Y1 = INCX, Y1 + } + ;; + { .mmi + (p14) STFD [Y1] = f61 + (p14) add Y1 = INCX, Y1 + } + ;; + { .mib + (p15) STFD [Y1] = f62 + mov pr = PR, -65474 + br.ret.sptk.many b0 + } + EPILOGUE + diff --git a/kernel/ia64/sdot.S b/kernel/ia64/sdot.S new file mode 100644 index 0000000..5a058e7 --- /dev/null +++ b/kernel/ia64/sdot.S @@ -0,0 +1,1177 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCH_SIZE (8 * 16 + 4) + +#define N r32 +#define X1 r33 +#define INCX r34 +#define Y1 r35 +#define INCY r36 + +#define PREX r2 +#define PREY r3 + +#define I r14 +#define J r15 +#define Y2 r16 +#define X2 r17 +#define INCX16 r18 +#define INCY16 r19 +#define INCX5 r20 +#define INCY5 r21 +#define YY r22 +#define XA r23 +#define YA r24 +#define XX r25 + +#define PR r30 +#define ARLC r31 + + PROLOGUE + .prologue + PROFCODE + { .mfi + nop.m 0 + mov f8 = f0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mfi + mov r26 = 1 + mov f9 = f0 + shr XA = X1, 3 + } + ;; + .body +#ifdef F_INTERFACE + LDINT N = [N] + LDINT INCX = [INCX] + LDINT INCY = [INCY] + ;; +#ifndef USE64BITINT + sxt4 N = N + sxt4 INCX = INCX + sxt4 INCY = INCY + ;; +#endif + + cmp.le p0, p6 = r0, INCX + cmp.le p0, p7 = r0, INCY + sub r26 = r26, N + ;; + setf.sig f32 = r26 + setf.sig f33 = INCX + setf.sig f34 = INCY + ;; + xmpy.l f33 = f32, f33 + xmpy.l f34 = f32, f34 + ;; + getf.sig r26 = f33 + getf.sig r27 = f34 + ;; + (p6) shladd X1 = r26, BASE_SHIFT, X1 + (p7) shladd Y1 = r27, BASE_SHIFT, Y1 + ;; +#endif + { .mfi + shladd INCX = INCX, BASE_SHIFT, r0 + mov f32 = f0 + mov PR = pr + } + { .mfb + cmp.lt p0, p6 = r0, N + mov f80 = f0 + (p6) br.ret.sptk.many b0 + } + ;; + { .mfi + shladd INCY = INCY, BASE_SHIFT, r0 + mov f10 = f0 + tbit.nz p15, p0 = X1, BASE_SHIFT + } + { .mfb + cmp.ne p6, p0 = SIZE, INCX + mov f11 = f0 + (p6) br.cond.dptk .L100 + } + ;; + { .mfi + (p15) LDFD f32 = [X1], INCX + mov f12 = f0 + mov pr.rot= 0 + } + { .mfi + (p15) adds N = -1, N + mov f13 = f0 + shr YA = Y1, 3 + } + ;; + { .mfi + (p15) LDFD f80 = [Y1], INCY + mov f14 = f0 + shr I = N, 4 + } + { .mmi + and J = 15, N + and XA = 0x1f, XA + and YA = 0x1f, YA + } + ;; + { .mmi + shladd INCX5 = INCX, 2, INCX + shladd INCY5 = INCY, 2, INCY + sub XA = YA, XA + } + { .mmi + shladd INCX16 = INCX, 4, r0 + shladd INCY16 = INCY, 4, r0 + tbit.z p0, p12 = N, 3 + } + ;; + { .mmi + shladd Y2 = INCY, 2, Y1 + cmp.eq p7, p0 = r0, J + mov ar.ec= 3 + } + { .mmi + adds I = -1, I + cmp.ge p8, p0 = 4, XA + cmp.eq p16, p0 = r0, r0 + } + ;; + { .mbb + cmp.le p9, p0 = 24, XA + (p8) br.cond.dpnt .L20 + (p9) br.cond.dpnt .L20 + } + ;; + { .mmi + adds PREX = PREFETCH_SIZE * SIZE, X1 + adds PREY = (PREFETCH_SIZE + 6) * SIZE, Y1 + mov ar.lc = I + } + { .mfb + cmp.eq p6 ,p0 = -1, I + FMA f15 = f32, f80, f0 + (p6) br.cond.dpnt .L15 + } + ;; + .align 32 + +/* INCX == 1 && X is aligned */ +.L12: + { .mmf + (p16) LDFPD f32, f35 = [X1], 2 * SIZE + (p16) lfetch.nt1 [PREX], INCX16 + (p18) FMA f8 = f34, f82, f8 + } + { .mmf + (p16) LDFD f80 = [Y1], INCY + (p16) LDFD f92 = [Y2], INCY + (p18) FMA f9 = f37, f85, f9 + } + ;; + { .mmf + (p16) LDFPD f38, f41 = [X1], 2 * SIZE + (p16) lfetch.nt1 [PREY], INCY16 + (p18) FMA f10 = f40, f88, f10 + } + { .mmf + (p16) LDFD f83 = [Y1], INCY + (p16) LDFD f95 = [Y2], INCY + (p18) FMA f11 = f43, f91, f11 + } + ;; + { .mmf + (p16) LDFPD f44, f47 = [X1], 2 * SIZE + (p18) FMA f12 = f46, f94, f12 + } + { .mmf + (p16) LDFD f86 = [Y1], INCY + (p16) LDFD f98 = [Y2], INCY + (p18) FMA f13 = f49, f97, f13 + } + ;; + { .mmf + (p16) LDFPD f50, f53 = [X1], 2 * SIZE + (p18) FMA f14 = f52, f100, f14 + } + { .mmf + (p16) LDFD f89 = [Y1], INCY5 + (p16) LDFD f101 = [Y2], INCY5 + (p18) FMA f15 = f55, f103, f15 + } + ;; + { .mmf + (p16) LDFPD f56, f59 = [X1], 2 * SIZE + (p18) FMA f8 = f58, f106, f8 + } + { .mmf + (p16) LDFD f104 = [Y1], INCY + (p16) LDFD f116 = [Y2], INCY + (p18) FMA f9 = f61, f109, f9 + } + ;; + { .mmf + (p16) LDFPD f62, f65 = [X1], 2 * SIZE + (p18) FMA f10 = f64, f112, f10 + } + { .mmf + (p16) LDFD f107 = [Y1], INCY + (p16) LDFD f119 = [Y2], INCY + (p18) FMA f11 = f67, f115, f11 + } + ;; + { .mmf + (p16) LDFPD f68, f71 = [X1], 2 * SIZE + (p18) FMA f12 = f70, f118, f12 + } + { .mmf + (p16) LDFD f110 = [Y1], INCY + (p16) LDFD f122 = [Y2], INCY + (p18) FMA f13 = f73, f121, f13 + } + ;; + { .mmf + (p16) LDFPD f74, f77 = [X1], 2 * SIZE + (p16) LDFD f113 = [Y1], INCY5 + (p18) FMA f14 = f76, f124, f14 + } + { .mfb + (p16) LDFD f125 = [Y2], INCY5 + (p18) FMA f15 = f79, f127, f15 + br.ctop.sptk.few .L12 + } + ;; + .align 32 + +.L15: + { .mmi + (p12) LDFPD f32, f33 = [X1], 2 * SIZE + mov YY = Y1 + tbit.z p0, p13 = N, 2 + } + { .mmb + (p12) LDFD f34 = [Y1], INCY + (p12) LDFD f42 = [Y2], INCY + (p7) br.cond.dptk .L999 + } + ;; + { .mmi + (p12) LDFPD f36, f37 = [X1], 2 * SIZE + (p12) shladd YY = INCY, 3, YY + tbit.z p0, p14 = N, 1 + } + { .mmi + (p12) LDFD f35 = [Y1], INCY + (p12) LDFD f43 = [Y2], INCY + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p12) LDFPD f40, f41 = [X1], 2 * SIZE + (p13) shladd YY = INCY, 2, YY + } + { .mmi + (p12) LDFD f38 = [Y1], INCY + (p12) LDFD f46 = [Y2], INCY + } + ;; + (p12) LDFPD f44, f45 = [X1], 2 * SIZE + (p12) LDFD f39 = [Y1], INCY5 + (p12) LDFD f47 = [Y2], INCY5 + ;; + (p13) LDFPD f48, f49 = [X1], 2 * SIZE + (p13) LDFD f50 = [Y1], INCY + (p14) LDFD f58 = [YY], INCY + ;; + (p13) LDFPD f52, f53 = [X1], 2 * SIZE + (p13) LDFD f51 = [Y1], INCY + (p14) LDFD f59 = [YY], INCY + ;; + (p14) LDFPD f56, f57 = [X1], 2 * SIZE + (p13) LDFD f54 = [Y1], INCY + (p15) LDFD f61 = [YY] + ;; + (p13) LDFD f55 = [Y1], INCY + (p15) LDFD f60 = [X1] + ;; + (p12) FMA f8 = f32, f34, f8 + (p12) FMA f9 = f33, f35, f9 + (p12) FMA f10 = f36, f38, f10 + (p12) FMA f11 = f37, f39, f11 + (p12) FMA f12 = f40, f42, f12 + (p12) FMA f13 = f41, f43, f13 + (p12) FMA f14 = f44, f46, f14 + (p12) FMA f15 = f45, f47, f15 + ;; + (p13) FMA f8 = f48, f50, f8 + (p13) FMA f9 = f49, f51, f9 + (p13) FMA f10 = f52, f54, f10 + (p13) FMA f11 = f53, f55, f11 + (p14) FMA f12 = f56, f58, f12 + (p14) FMA f13 = f57, f59, f13 + (p15) FMA f14 = f60, f61, f14 + br .L999 + ;; + .align 32 + +.L20: + { .mmi + adds PREX = PREFETCH_SIZE * SIZE, X1 + adds PREY = (PREFETCH_SIZE + 38) * SIZE, Y1 + mov ar.lc = I + } + { .mfb + cmp.eq p6 ,p0 = -1, I + FMA f15 = f32, f80, f0 + (p6) br.cond.dpnt .L25 + } + ;; + .align 32 + +.L22: + { .mmf + (p16) LDFPD f32, f35 = [X1], 2 * SIZE + (p16) lfetch.nt1 [PREX], INCX16 + (p18) FMA f8 = f34, f82, f8 + } + { .mmf + (p17) LDFD f81 = [Y1], INCY + (p17) LDFD f93 = [Y2], INCY + (p18) FMA f9 = f37, f85, f9 + } + ;; + { .mmf + (p16) LDFPD f38, f41 = [X1], 2 * SIZE + (p16) lfetch.nt1 [PREY], INCY16 + (p18) FMA f10 = f40, f88, f10 + } + { .mmf + (p17) LDFD f84 = [Y1], INCY + (p17) LDFD f96 = [Y2], INCY + (p18) FMA f11 = f43, f91, f11 + } + ;; + { .mmf + (p16) LDFPD f44, f47 = [X1], 2 * SIZE + (p18) FMA f12 = f46, f94, f12 + } + { .mmf + (p17) LDFD f87 = [Y1], INCY + (p17) LDFD f99 = [Y2], INCY + (p18) FMA f13 = f49, f97, f13 + } + ;; + { .mmf + (p16) LDFPD f50, f53 = [X1], 2 * SIZE + (p18) FMA f14 = f52, f100, f14 + } + { .mmf + (p17) LDFD f90 = [Y1], INCY5 + (p17) LDFD f102 = [Y2], INCY5 + (p18) FMA f15 = f55, f103, f15 + } + ;; + { .mmf + (p16) LDFPD f56, f59 = [X1], 2 * SIZE + (p18) FMA f8 = f58, f106, f8 + } + { .mmf + (p17) LDFD f105 = [Y1], INCY + (p17) LDFD f117 = [Y2], INCY + (p18) FMA f9 = f61, f109, f9 + } + ;; + { .mmf + (p16) LDFPD f62, f65 = [X1], 2 * SIZE + (p18) FMA f10 = f64, f112, f10 + } + { .mmf + (p17) LDFD f108 = [Y1], INCY + (p17) LDFD f120 = [Y2], INCY + (p18) FMA f11 = f67, f115, f11 + } + ;; + { .mmf + (p16) LDFPD f68, f71 = [X1], 2 * SIZE + (p18) FMA f12 = f70, f118, f12 + } + { .mmf + (p17) LDFD f111 = [Y1], INCY + (p17) LDFD f123 = [Y2], INCY + (p18) FMA f13 = f73, f121, f13 + } + ;; + { .mmf + (p16) LDFPD f74, f77 = [X1], 2 * SIZE + (p17) LDFD f114 = [Y1], INCY5 + (p18) FMA f14 = f76, f124, f14 + } + { .mfb + (p17) LDFD f126 = [Y2], INCY5 + (p18) FMA f15 = f79, f127, f15 + br.ctop.sptk.few .L22 + } + ;; + .align 32 + +.L25: + { .mmi + (p12) LDFPD f32, f33 = [X1], 2 * SIZE + mov YY = Y1 + tbit.z p0, p13 = N, 2 + } + { .mmb + (p12) LDFD f34 = [Y1], INCY + (p12) LDFD f42 = [Y2], INCY + (p7) br.cond.dptk .L999 + } + ;; + { .mmi + (p12) LDFPD f36, f37 = [X1], 2 * SIZE + (p12) shladd YY = INCY, 3, YY + tbit.z p0, p14 = N, 1 + } + { .mmi + (p12) LDFD f35 = [Y1], INCY + (p12) LDFD f43 = [Y2], INCY + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p12) LDFPD f40, f41 = [X1], 2 * SIZE + (p13) shladd YY = INCY, 2, YY + } + { .mmi + (p12) LDFD f38 = [Y1], INCY + (p12) LDFD f46 = [Y2], INCY + } + ;; + (p12) LDFPD f44, f45 = [X1], 2 * SIZE + (p12) LDFD f39 = [Y1], INCY5 + (p12) LDFD f47 = [Y2], INCY5 + ;; + (p13) LDFPD f48, f49 = [X1], 2 * SIZE + (p13) LDFD f50 = [Y1], INCY + (p14) LDFD f58 = [YY], INCY + ;; + (p13) LDFPD f52, f53 = [X1], 2 * SIZE + (p13) LDFD f51 = [Y1], INCY + (p14) LDFD f59 = [YY], INCY + ;; + (p14) LDFPD f56, f57 = [X1], 2 * SIZE + (p13) LDFD f54 = [Y1], INCY + (p15) LDFD f61 = [YY] + ;; + (p13) LDFD f55 = [Y1], INCY + (p15) LDFD f60 = [X1] + ;; + (p12) FMA f8 = f32, f34, f8 + (p12) FMA f9 = f33, f35, f9 + (p12) FMA f10 = f36, f38, f10 + (p12) FMA f11 = f37, f39, f11 + (p12) FMA f12 = f40, f42, f12 + (p12) FMA f13 = f41, f43, f13 + (p12) FMA f14 = f44, f46, f14 + (p12) FMA f15 = f45, f47, f15 + ;; + (p13) FMA f8 = f48, f50, f8 + (p13) FMA f9 = f49, f51, f9 + (p13) FMA f10 = f52, f54, f10 + (p13) FMA f11 = f53, f55, f11 + (p14) FMA f12 = f56, f58, f12 + (p14) FMA f13 = f57, f59, f13 + (p15) FMA f14 = f60, f61, f14 + br .L999 + ;; + .align 32 + +.L100: + { .mmi + shladd X2 = INCX, 2, X1 + } + { .mib + cmp.ne p6, p0 = SIZE, INCY + tbit.nz p15, p0 = Y1, BASE_SHIFT + (p6) br.cond.dptk .L200 + } + ;; + { .mfi + (p15) LDFD f32 = [X1], INCX + mov f12 = f0 + mov pr.rot= 0 + } + { .mfi + (p15) adds N = -1, N + mov f13 = f0 + shr YA = Y1, 3 + } + ;; + { .mfi + (p15) LDFD f80 = [Y1], INCY + mov f14 = f0 + shr I = N, 4 + } + { .mmi + and J = 15, N + and XA = 0x1f, XA + and YA = 0x1f, YA + } + ;; + { .mmi + shladd INCX5 = INCX, 2, INCX + shladd INCY5 = INCY, 2, INCY + sub XA = YA, XA + } + { .mmi + shladd INCX16 = INCX, 4, r0 + shladd INCY16 = INCY, 4, r0 + tbit.z p0, p12 = N, 3 + } + ;; + { .mmi + shladd X2 = INCX, 2, X1 + cmp.eq p7, p0 = r0, J + mov ar.ec= 3 + } + { .mmi + adds I = -1, I + cmp.ge p8, p0 = 8, XA + cmp.eq p16, p0 = r0, r0 + } + ;; + { .mbb + cmp.le p9, p0 = 28, XA + (p8) br.cond.dpnt .L120 + (p9) br.cond.dpnt .L120 + } + ;; + { .mmi + adds PREX = (PREFETCH_SIZE + 5) * SIZE, X1 + adds PREY = (PREFETCH_SIZE + 3) * SIZE, Y1 + mov ar.lc = I + } + { .mfb + cmp.eq p6 ,p0 = -1, I + FMA f15 = f32, f80, f0 + (p6) br.cond.dpnt .L115 + } + ;; + .align 32 + +/* INCY == 1 */ +.L112: + { .mmf + (p16) LDFPD f32, f35 = [Y1], 2 * SIZE + (p16) lfetch.nt1 [PREX], INCX16 + (p18) FMA f8 = f34, f82, f8 + } + { .mmf + (p16) LDFD f80 = [X1], INCX + (p16) LDFD f92 = [X2], INCX + (p18) FMA f9 = f37, f85, f9 + } + ;; + { .mmf + (p16) LDFPD f38, f41 = [Y1], 2 * SIZE + (p16) lfetch.nt1 [PREY], INCY16 + (p18) FMA f10 = f40, f88, f10 + } + { .mmf + (p16) LDFD f83 = [X1], INCX + (p16) LDFD f95 = [X2], INCX + (p18) FMA f11 = f43, f91, f11 + } + ;; + { .mmf + (p16) LDFPD f44, f47 = [Y1], 2 * SIZE + (p18) FMA f12 = f46, f94, f12 + } + { .mmf + (p16) LDFD f86 = [X1], INCX + (p16) LDFD f98 = [X2], INCX + (p18) FMA f13 = f49, f97, f13 + } + ;; + { .mmf + (p16) LDFPD f50, f53 = [Y1], 2 * SIZE + (p18) FMA f14 = f52, f100, f14 + } + { .mmf + (p16) LDFD f89 = [X1], INCX5 + (p16) LDFD f101 = [X2], INCX5 + (p18) FMA f15 = f55, f103, f15 + } + ;; + { .mmf + (p16) LDFPD f56, f59 = [Y1], 2 * SIZE + (p18) FMA f8 = f58, f106, f8 + } + { .mmf + (p16) LDFD f104 = [X1], INCX + (p16) LDFD f116 = [X2], INCX + (p18) FMA f9 = f61, f109, f9 + } + ;; + { .mmf + (p16) LDFPD f62, f65 = [Y1], 2 * SIZE + (p18) FMA f10 = f64, f112, f10 + } + { .mmf + (p16) LDFD f107 = [X1], INCX + (p16) LDFD f119 = [X2], INCX + (p18) FMA f11 = f67, f115, f11 + } + ;; + { .mmf + (p16) LDFPD f68, f71 = [Y1], 2 * SIZE + (p18) FMA f12 = f70, f118, f12 + } + { .mmf + (p16) LDFD f110 = [X1], INCX + (p16) LDFD f122 = [X2], INCX + (p18) FMA f13 = f73, f121, f13 + } + ;; + { .mmf + (p16) LDFPD f74, f77 = [Y1], 2 * SIZE + (p16) LDFD f113 = [X1], INCX5 + (p18) FMA f14 = f76, f124, f14 + } + { .mfb + (p16) LDFD f125 = [X2], INCX5 + (p18) FMA f15 = f79, f127, f15 + br.ctop.sptk.few .L112 + } + ;; + .align 32 + +.L115: + { .mmi + (p12) LDFPD f32, f33 = [Y1], 2 * SIZE + mov XX = X1 + tbit.z p0, p13 = N, 2 + } + { .mmb + (p12) LDFD f34 = [X1], INCX + (p12) LDFD f42 = [X2], INCX + (p7) br.cond.dptk .L999 + } + ;; + { .mmi + (p12) LDFPD f36, f37 = [Y1], 2 * SIZE + (p12) shladd XX = INCX, 3, XX + tbit.z p0, p14 = N, 1 + } + { .mmi + (p12) LDFD f35 = [X1], INCX + (p12) LDFD f43 = [X2], INCX + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p12) LDFPD f40, f41 = [Y1], 2 * SIZE + (p13) shladd XX = INCX, 2, XX + } + { .mmi + (p12) LDFD f38 = [X1], INCX + (p12) LDFD f46 = [X2], INCX + } + ;; + (p12) LDFPD f44, f45 = [Y1], 2 * SIZE + (p12) LDFD f39 = [X1], INCX5 + (p12) LDFD f47 = [X2], INCX5 + ;; + (p13) LDFPD f48, f49 = [Y1], 2 * SIZE + (p13) LDFD f50 = [X1], INCX + (p14) LDFD f58 = [XX], INCX + ;; + (p13) LDFPD f52, f53 = [Y1], 2 * SIZE + (p13) LDFD f51 = [X1], INCX + (p14) LDFD f59 = [XX], INCX + ;; + (p14) LDFPD f56, f57 = [Y1], 2 * SIZE + (p13) LDFD f54 = [X1], INCX + (p15) LDFD f61 = [XX] + ;; + (p13) LDFD f55 = [X1], INCX + (p15) LDFD f60 = [Y1] + ;; + (p12) FMA f8 = f32, f34, f8 + (p12) FMA f9 = f33, f35, f9 + (p12) FMA f10 = f36, f38, f10 + (p12) FMA f11 = f37, f39, f11 + (p12) FMA f12 = f40, f42, f12 + (p12) FMA f13 = f41, f43, f13 + (p12) FMA f14 = f44, f46, f14 + (p12) FMA f15 = f45, f47, f15 + ;; + (p13) FMA f8 = f48, f50, f8 + (p13) FMA f9 = f49, f51, f9 + (p13) FMA f10 = f52, f54, f10 + (p13) FMA f11 = f53, f55, f11 + (p14) FMA f12 = f56, f58, f12 + (p14) FMA f13 = f57, f59, f13 + (p15) FMA f14 = f60, f61, f14 + br .L999 + ;; + .align 32 + +.L120: + { .mmi + adds PREX = (PREFETCH_SIZE + 17) * SIZE, X1 + adds PREY = (PREFETCH_SIZE + 19) * SIZE, X1 + mov ar.lc = I + } + { .mfb + cmp.eq p6 ,p0 = -1, I + FMA f15 = f32, f80, f0 + (p6) br.cond.dpnt .L125 + } + ;; + .align 32 + +.L122: + { .mmf + (p16) LDFPD f32, f35 = [Y1], 2 * SIZE + (p16) lfetch.nt1 [PREX], INCX16 + (p18) FMA f8 = f34, f82, f8 + } + { .mmf + (p17) LDFD f81 = [X1], INCX + (p17) LDFD f93 = [X2], INCX + (p18) FMA f9 = f37, f85, f9 + } + ;; + { .mmf + (p16) LDFPD f38, f41 = [Y1], 2 * SIZE + (p16) lfetch.nt1 [PREY], INCX16 + (p18) FMA f10 = f40, f88, f10 + } + { .mmf + (p17) LDFD f84 = [X1], INCX + (p17) LDFD f96 = [X2], INCX + (p18) FMA f11 = f43, f91, f11 + } + ;; + { .mmf + (p16) LDFPD f44, f47 = [Y1], 2 * SIZE + (p18) FMA f12 = f46, f94, f12 + } + { .mmf + (p17) LDFD f87 = [X1], INCX + (p17) LDFD f99 = [X2], INCX + (p18) FMA f13 = f49, f97, f13 + } + ;; + { .mmf + (p16) LDFPD f50, f53 = [Y1], 2 * SIZE + (p18) FMA f14 = f52, f100, f14 + } + { .mmf + (p17) LDFD f90 = [X1], INCX5 + (p17) LDFD f102 = [X2], INCX5 + (p18) FMA f15 = f55, f103, f15 + } + ;; + { .mmf + (p16) LDFPD f56, f59 = [Y1], 2 * SIZE + (p18) FMA f8 = f58, f106, f8 + } + { .mmf + (p17) LDFD f105 = [X1], INCX + (p17) LDFD f117 = [X2], INCX + (p18) FMA f9 = f61, f109, f9 + } + ;; + { .mmf + (p16) LDFPD f62, f65 = [Y1], 2 * SIZE + (p18) FMA f10 = f64, f112, f10 + } + { .mmf + (p17) LDFD f108 = [X1], INCX + (p17) LDFD f120 = [X2], INCX + (p18) FMA f11 = f67, f115, f11 + } + ;; + { .mmf + (p16) LDFPD f68, f71 = [Y1], 2 * SIZE + (p18) FMA f12 = f70, f118, f12 + } + { .mmf + (p17) LDFD f111 = [X1], INCX + (p17) LDFD f123 = [X2], INCX + (p18) FMA f13 = f73, f121, f13 + } + ;; + { .mmf + (p16) LDFPD f74, f77 = [Y1], 2 * SIZE + (p17) LDFD f114 = [X1], INCX5 + (p18) FMA f14 = f76, f124, f14 + } + { .mfb + (p17) LDFD f126 = [X2], INCX5 + (p18) FMA f15 = f79, f127, f15 + br.ctop.sptk.few .L122 + } + ;; + .align 32 + +.L125: + { .mmi + (p12) LDFPD f32, f33 = [Y1], 2 * SIZE + mov XX = X1 + tbit.z p0, p13 = N, 2 + } + { .mmb + (p12) LDFD f34 = [X1], INCX + (p12) LDFD f42 = [X2], INCX + (p7) br.cond.dptk .L999 + } + ;; + { .mmi + (p12) LDFPD f36, f37 = [Y1], 2 * SIZE + (p12) shladd XX = INCX, 3, XX + tbit.z p0, p14 = N, 1 + } + { .mmi + (p12) LDFD f35 = [X1], INCX + (p12) LDFD f43 = [X2], INCX + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p12) LDFPD f40, f41 = [Y1], 2 * SIZE + (p13) shladd XX = INCX, 2, XX + } + { .mmi + (p12) LDFD f38 = [X1], INCX + (p12) LDFD f46 = [X2], INCX + } + ;; + (p12) LDFPD f44, f45 = [Y1], 2 * SIZE + (p12) LDFD f39 = [X1], INCX5 + (p12) LDFD f47 = [X2], INCX5 + ;; + (p13) LDFPD f48, f49 = [Y1], 2 * SIZE + (p13) LDFD f50 = [X1], INCX + (p14) LDFD f58 = [XX], INCX + ;; + (p13) LDFPD f52, f53 = [Y1], 2 * SIZE + (p13) LDFD f51 = [X1], INCX + (p14) LDFD f59 = [XX], INCX + ;; + (p14) LDFPD f56, f57 = [Y1], 2 * SIZE + (p13) LDFD f54 = [X1], INCX + (p15) LDFD f61 = [XX] + ;; + (p13) LDFD f55 = [X1], INCX + (p15) LDFD f60 = [Y1] + ;; + (p12) FMA f8 = f32, f34, f8 + (p12) FMA f9 = f33, f35, f9 + (p12) FMA f10 = f36, f38, f10 + (p12) FMA f11 = f37, f39, f11 + (p12) FMA f12 = f40, f42, f12 + (p12) FMA f13 = f41, f43, f13 + (p12) FMA f14 = f44, f46, f14 + (p12) FMA f15 = f45, f47, f15 + ;; + (p13) FMA f8 = f48, f50, f8 + (p13) FMA f9 = f49, f51, f9 + (p13) FMA f10 = f52, f54, f10 + (p13) FMA f11 = f53, f55, f11 + (p14) FMA f12 = f56, f58, f12 + (p14) FMA f13 = f57, f59, f13 + (p15) FMA f14 = f60, f61, f14 + br .L999 + ;; + .align 32 + +.L200: + { .mfi + shladd INCX5 = INCX, 2, INCX + mov f12 = f0 + mov pr.rot= 0 + } + { .mfi + and J = 15, N + mov f13 = f0 + shr I = N, 4 + } + ;; + { .mmf + cmp.eq p16, p0 = r0, r0 + shladd INCY5 = INCY, 2, INCY + mov f14 = f0 + } + { .mmi + shladd INCX16 = INCX, 4, r0 + shladd INCY16 = INCY, 4, r0 + tbit.z p0, p12 = N, 3 + } + ;; + { .mmi + cmp.eq p7, p0 = r0, J + adds I = -1, I + mov ar.ec= 3 + } + { .mmi + shladd Y2 = INCY, 2, Y1 + mov XX = X1 + mov YY = Y1 + } + ;; + { .mmi + adds PREX = (PREFETCH_SIZE + 5) * SIZE, X1 + adds PREY = (PREFETCH_SIZE + 3) * SIZE, Y1 + mov ar.lc = I + } + { .mfb + cmp.eq p6 ,p0 = -1, I + mov f15 = f0 + (p6) br.cond.dpnt .L215 + } + ;; + .align 32 + +/* INCY == 1 */ +.L212: + { .mmf + (p16) lfetch.nt1 [PREX], INCX16 + (p16) lfetch.nt1 [PREY], INCY16 + (p18) FMA f8 = f34, f82, f8 + } + { .mmf + (p16) LDFD f32 = [Y1], INCY + (p16) LDFD f44 = [Y2], INCY + (p18) FMA f9 = f37, f85, f9 + } + ;; + { .mmf + (p16) LDFD f80 = [X1], INCX + (p16) LDFD f92 = [X2], INCX + (p18) FMA f10 = f40, f88, f10 + } + { .mmf + (p16) LDFD f35 = [Y1], INCY + (p16) LDFD f47 = [Y2], INCY + (p18) FMA f11 = f43, f91, f11 + } + ;; + { .mmf + (p16) LDFD f83 = [X1], INCX + (p16) LDFD f95 = [X2], INCX + (p18) FMA f12 = f46, f94, f12 + } + { .mmf + (p16) LDFD f38 = [Y1], INCY + (p16) LDFD f50 = [Y2], INCY + (p18) FMA f13 = f49, f97, f13 + } + ;; + { .mmf + (p16) LDFD f86 = [X1], INCX + (p16) LDFD f98 = [X2], INCX + (p18) FMA f14 = f52, f100, f14 + } + { .mmf + (p16) LDFD f41 = [Y1], INCY5 + (p16) LDFD f53 = [Y2], INCY5 + (p18) FMA f15 = f55, f103, f15 + } + ;; + { .mmf + (p16) LDFD f89 = [X1], INCX5 + (p16) LDFD f101 = [X2], INCX5 + (p18) FMA f8 = f58, f106, f8 + } + { .mmf + (p16) LDFD f56 = [Y1], INCY + (p16) LDFD f68 = [Y2], INCY + (p18) FMA f9 = f61, f109, f9 + } + ;; + { .mmf + (p16) LDFD f104 = [X1], INCX + (p16) LDFD f116 = [X2], INCX + (p18) FMA f10 = f64, f112, f10 + } + { .mmf + (p16) LDFD f59 = [Y1], INCY + (p16) LDFD f71 = [Y2], INCY + (p18) FMA f11 = f67, f115, f11 + } + ;; + { .mmf + (p16) LDFD f107 = [X1], INCX + (p16) LDFD f119 = [X2], INCX + (p18) FMA f12 = f70, f118, f12 + } + { .mmf + (p16) LDFD f62 = [Y1], INCY + (p16) LDFD f74 = [Y2], INCY + (p18) FMA f13 = f73, f121, f13 + } + ;; + { .mmf + (p16) LDFD f110 = [X1], INCX + (p16) LDFD f122 = [X2], INCX + (p18) FMA f14 = f76, f124, f14 + } + { .mmf + (p16) LDFD f65 = [Y1], INCY5 + (p16) LDFD f77 = [Y2], INCY5 + (p18) FMA f15 = f79, f127, f15 + } + ;; + { .mmi + (p16) LDFD f113 = [X1], INCX5 + (p16) LDFD f125 = [X2], INCX5 + } + { .mmb + (p16) add XX = INCX16, XX + (p16) add YY = INCY16, YY + br.ctop.sptk.few .L212 + } + ;; + .align 32 + +.L215: + { .mmi + (p12) LDFD f34 = [X1], INCX + (p12) LDFD f42 = [X2], INCX + tbit.z p0, p13 = N, 2 + } + { .mmb + (p12) LDFD f32 = [Y1], INCY + (p12) LDFD f40 = [Y2], INCY + (p7) br.cond.dptk .L999 + } + ;; + { .mmi + (p12) LDFD f35 = [X1], INCX + (p12) LDFD f43 = [X2], INCX + tbit.z p0, p14 = N, 1 + } + { .mmi + (p12) LDFD f33 = [Y1], INCY + (p12) LDFD f41 = [Y2], INCY + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p12) LDFD f38 = [X1], INCX + (p12) LDFD f46 = [X2], INCX + (p12) shladd XX = INCX, 3, XX + } + { .mmi + (p12) LDFD f36 = [Y1], INCY + (p12) LDFD f44 = [Y2], INCY + (p12) shladd YY = INCY, 3, YY + } + ;; + { .mmi + (p12) LDFD f39 = [X1], INCX5 + (p12) LDFD f47 = [X2], INCX5 + (p13) shladd XX = INCX, 2, XX + } + { .mmi + (p12) LDFD f37 = [Y1], INCY5 + (p12) LDFD f45 = [Y2], INCY5 + (p13) shladd YY = INCY, 2, YY + } + ;; + (p13) LDFD f50 = [X1], INCX + (p13) LDFD f48 = [Y1], INCY + (p14) LDFD f58 = [XX], INCX + (p14) LDFD f56 = [YY], INCY + ;; + (p13) LDFD f51 = [X1], INCX + (p13) LDFD f49 = [Y1], INCY + (p14) LDFD f59 = [XX], INCX + (p14) LDFD f57 = [YY], INCY + ;; + (p13) LDFD f54 = [X1], INCX + (p13) LDFD f52 = [Y1], INCY + (p15) LDFD f61 = [XX] + (p15) LDFD f60 = [YY] + ;; + (p13) LDFD f55 = [X1] + (p13) LDFD f53 = [Y1] + ;; + (p12) FMA f8 = f32, f34, f8 + (p12) FMA f9 = f33, f35, f9 + (p12) FMA f10 = f36, f38, f10 + (p12) FMA f11 = f37, f39, f11 + (p12) FMA f12 = f40, f42, f12 + (p12) FMA f13 = f41, f43, f13 + (p12) FMA f14 = f44, f46, f14 + (p12) FMA f15 = f45, f47, f15 + ;; + (p13) FMA f8 = f48, f50, f8 + (p13) FMA f9 = f49, f51, f9 + (p13) FMA f10 = f52, f54, f10 + (p13) FMA f11 = f53, f55, f11 + (p14) FMA f12 = f56, f58, f12 + (p14) FMA f13 = f57, f59, f13 + (p15) FMA f14 = f60, f61, f14 + br .L999 + ;; + .align 32 + +.L999: + FADD f8 = f8, f9 + FADD f10 = f10, f11 + FADD f12 = f12, f13 + FADD f14 = f14, f15 + ;; + FADD f8 = f8, f10 + FADD f12 = f12, f14 + mov ar.lc = ARLC + ;; + FADD f8 = f8, f12 + mov pr = PR, -65474 + br.ret.sptk.many b0 + EPILOGUE + diff --git a/kernel/ia64/sgemv_n.S b/kernel/ia64/sgemv_n.S new file mode 100644 index 0000000..f5949e6 --- /dev/null +++ b/kernel/ia64/sgemv_n.S @@ -0,0 +1,3241 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define SP r12 + +#define M r32 +#define N r33 +#define A r36 +#define LDA r37 +#define X r38 +#define INCX r39 +#define Y r34 +#define INCY r35 +#define BUFFER r11 + +#define I r14 +#define J r15 +#define AO1 r16 +#define AO2 r17 +#define AO3 r18 +#define AO4 r19 +#define AO5 r20 +#define AO6 r21 +#define AO7 r22 +#define AO8 r23 +#define YLD1 r24 +#define YST1 r25 +#define YST2 r27 +#define MM r28 +#define YY r9 + +#define RPRE1 loc0 +#define RPRE2 loc1 +#define RPRE3 loc2 +#define RPRE4 loc3 +#define RPRE5 loc4 +#define RPRE6 loc5 +#define RPRE7 loc6 +#define RPRE8 loc7 + +#define AO11 loc8 +#define AO21 loc9 +#define AO31 loc10 +#define AO41 loc11 +#define AO51 loc12 +#define AO61 loc13 +#define AO71 loc14 +#define AO81 loc15 + +#define PREB r8 + +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#ifdef DOUBLE +#define RPREFETCH (16 * 3 + 8) +#else +#define RPREFETCH (16 * 3 + 16) +#endif +#define PREFETCH lfetch.nt1 + +#define ALPHA f6 + + PROLOGUE + .prologue + PROFCODE + { .mmi + .save ar.pfs, ARPFS + alloc ARPFS = ar.pfs, 8, 16, 8, 0 + mov ARLC = ar.lc + } + ;; + mov PR = pr + adds r14 = 16, SP + adds r15 = 24, SP + adds r16 = 32, SP + ;; + adds r8 = -8 * 16, SP + adds r9 = -7 * 16, SP + adds SP = -8 * 16, SP + ;; + stf.spill [r8] = f16, 32 + stf.spill [r9] = f17, 32 + ;; + stf.spill [r8] = f18, 32 + stf.spill [r9] = f19, 32 + ;; + stf.spill [r8] = f20, 32 + stf.spill [r9] = f21, 32 + ;; + stf.spill [r8] = f22 + stf.spill [r9] = f23 + .body + ;; + + ld8 Y = [r14] + ld8 INCY = [r15] + ld8 BUFFER = [r16] + + mov ALPHA = f8 + cmp.ge p7, p0 = 0, M + cmp.ge p6, p0 = 0, N + ;; + shladd INCX = INCX, BASE_SHIFT, r0 + shladd LDA = LDA, BASE_SHIFT, r0 + shladd INCY = INCY, BASE_SHIFT, r0 + ;; + tbit.nz p8, p0 = A, BASE_SHIFT + tbit.nz p9, p0 = LDA, BASE_SHIFT + mov MM = M + ;; + (p8) adds MM = -1, M + ;; + (p7) br.cond.dpnt .L999 + (p6) br.cond.dpnt .L999 + ;; + sub I = A, Y + cmp.eq p10, p0 = SIZE, INCY + mov YY = Y + ;; + (p10) tbit.z.unc p10, p0 = I, BASE_SHIFT + ;; + (p10) br.cond.dptk .L10 + ;; + shr J = M, 3 + mov YY = BUFFER + ;; + (p8) adds YY = SIZE, BUFFER + ;; + mov ar.lc = J + mov YST1 = YY + adds YST2 = 4 * SIZE, YY + ;; +.L02: + STFD [YST1] = f0, 1 * SIZE + STFD [YST2] = f0, 1 * SIZE + ;; + STFD [YST1] = f0, 1 * SIZE + STFD [YST2] = f0, 1 * SIZE + ;; + STFD [YST1] = f0, 1 * SIZE + STFD [YST2] = f0, 1 * SIZE + ;; + STFD [YST1] = f0, 5 * SIZE + STFD [YST2] = f0, 5 * SIZE + br.cloop.sptk.few .L02 + ;; + +.L10: + (p9) br.cond.dptk .L100 + + shr J = N, 3 + ;; + cmp.eq p6, p0 = r0, J + (p6) br.cond.dpnt .L20 + ;; + .align 16 + +.L11: + mov YLD1 = YY + mov YST1 = YY + ;; + LDFD f8 = [X], INCX + ;; + LDFD f9 = [X], INCX + ;; + LDFD f10 = [X], INCX + ;; + LDFD f11 = [X], INCX + ;; + LDFD f12 = [X], INCX + ;; + LDFD f13 = [X], INCX + ;; + LDFD f14 = [X], INCX + ;; + LDFD f15 = [X], INCX + ;; + FMPY f8 = ALPHA, f8 + FMPY f9 = ALPHA, f9 + FMPY f10 = ALPHA, f10 + FMPY f11 = ALPHA, f11 + FMPY f12 = ALPHA, f12 + FMPY f13 = ALPHA, f13 + FMPY f14 = ALPHA, f14 + FMPY f15 = ALPHA, f15 + ;; + mov AO1 = A + add AO2 = LDA, A + ;; + shladd AO3 = LDA, 1, A + shladd AO4 = LDA, 1, AO2 + ;; + shladd AO5 = LDA, 1, AO3 + shladd AO6 = LDA, 1, AO4 + ;; + shladd AO7 = LDA, 1, AO5 + shladd AO8 = LDA, 1, AO6 + shladd A = LDA, 3, A + ;; + ;; + adds PREB = RPREFETCH * SIZE, YLD1 + adds RPRE1 = RPREFETCH * SIZE, AO1 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + adds RPRE3 = RPREFETCH * SIZE, AO3 + adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 + adds RPRE5 = RPREFETCH * SIZE, AO5 + adds RPRE6 = (RPREFETCH + 8) * SIZE, AO6 + adds RPRE7 = RPREFETCH * SIZE, AO7 + adds RPRE8 = (RPREFETCH + 8) * SIZE, AO8 + + (p8) LDFD f80 = [AO1], 1 * SIZE + (p8) LDFD f81 = [AO2], 1 * SIZE + (p8) LDFD f82 = [AO3], 1 * SIZE + (p8) LDFD f83 = [AO4], 1 * SIZE + (p8) LDFD f84 = [AO5], 1 * SIZE + (p8) LDFD f85 = [AO6], 1 * SIZE + (p8) LDFD f86 = [AO7], 1 * SIZE + (p8) LDFD f87 = [AO8], 1 * SIZE + (p8) LDFD f106 = [YLD1], 1 * SIZE + ;; + (p8) FMPY f32 = f8, f80 + (p8) FMPY f33 = f9, f81 + (p8) FMPY f34 = f10, f82 + (p8) FMA f35 = f11, f83, f106 + ;; + (p8) FMA f32 = f12, f84, f32 + (p8) FMA f33 = f13, f85, f33 + (p8) FMA f34 = f14, f86, f34 + (p8) FMA f35 = f15, f87, f35 + ;; + (p8) FADD f32 = f32, f33 + (p8) FADD f34 = f34, f35 + ;; + (p8) FADD f32 = f32, f34 + ;; + (p8) STFD [YST1] = f32, 1 * SIZE + + shr I = MM, 3 + mov pr.rot= 0 + ;; + cmp.eq p6, p0 = 0, I + cmp.eq p16, p0 = r0, r0 + ;; + adds I = -1, I + tbit.nz p13, p0 = MM, 2 + ;; + mov ar.lc = I + mov ar.ec= 2 + (p6) br.cond.dpnt .L15 + ;; + .align 16 + +.L12: + { .mfi + (p17) LDFPD f95, f96 = [AO8], 2 * SIZE + (p17) FMA f101 = f8, f33, f101 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mfi + (p17) FMA f104 = f8, f34, f104 + } + ;; + { .mfi + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f107 = f8, f35, f107 + } + { .mfi + (p14) PREFETCH [RPRE1], 16 * SIZE + (p17) FMA f110 = f8, f36, f110 + } + ;; + { .mfi + (p16) LDFPD f34, f35 = [AO1], 2 * SIZE + (p17) FMA f113 = f8, f37, f113 + } + { .mfi + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p17) FMA f116 = f8, f38, f116 + } + ;; + { .mfi + (p16) LDFPD f36, f37 = [AO1], 2 * SIZE + (p17) FMA f119 = f8, f39, f119 + } + { .mfi + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p17) FMA f122 = f8, f40, f122 + } + ;; + { .mfi + (p16) LDFPD f38, f39 = [AO1], 2 * SIZE + (p17) FMA f101 = f9, f41, f101 + } + { .mfi + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p17) FMA f104 = f9, f42, f104 + } + ;; + { .mfi + (p16) LDFPD f40, f41 = [AO2], 2 * SIZE + (p17) FMA f107 = f9, f43, f107 + } + { .mfi + (p15) PREFETCH [RPRE2], 16 * SIZE + (p17) FMA f110 = f9, f44, f110 + } + ;; + { .mfi + (p16) LDFPD f42, f43 = [AO2], 2 * SIZE + (p17) FMA f113 = f9, f45, f113 + } + { .mfi + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p17) FMA f116 = f9, f46, f116 + } + ;; + { .mfi + (p16) LDFPD f44, f45 = [AO2], 2 * SIZE + (p17) FMA f119 = f9, f47, f119 + } + { .mfi + (p18) STFD [YST1] = f16, 1 * SIZE + (p17) FMA f122 = f9, f48, f122 + } + ;; + { .mfi + (p16) LDFPD f46, f47 = [AO2], 2 * SIZE + (p17) FMA f101 = f10, f49, f101 + } + { .mfi + (p18) STFD [YST1] = f17, 1 * SIZE + (p17) FMA f104 = f10, f50, f104 + } + ;; + { .mfi + (p16) LDFPD f48, f49 = [AO3], 2 * SIZE + (p17) FMA f107 = f10, f51, f107 + } + { .mfi + (p14) PREFETCH [RPRE3], 16 * SIZE + (p17) FMA f110 = f10, f52, f110 + } + ;; + { .mfi + (p16) LDFPD f50, f51 = [AO3], 2 * SIZE + (p17) FMA f113 = f10, f53, f113 + } + { .mfi + (p17) FMA f116 = f10, f54, f116 + } + ;; + { .mfi + (p16) LDFPD f52, f53 = [AO3], 2 * SIZE + (p17) FMA f119 = f10, f55, f119 + } + { .mfi + (p18) STFD [YST1] = f18, 1 * SIZE + (p17) FMA f122 = f10, f56, f122 + } + ;; + { .mfi + (p16) LDFPD f54, f55 = [AO3], 2 * SIZE + (p17) FMA f101 = f11, f57, f101 + } + { .mfi + (p18) STFD [YST1] = f19, 1 * SIZE + (p17) FMA f104 = f11, f58, f104 + } + ;; + { .mfi + (p16) LDFPD f56, f57 = [AO4], 2 * SIZE + (p17) FMA f107 = f11, f59, f107 + } + { .mfi + (p15) PREFETCH [RPRE4], 16 * SIZE + (p17) FMA f110 = f11, f60, f110 + } + ;; + { .mfi + (p16) LDFPD f58, f59 = [AO4], 2 * SIZE + (p17) FMA f113 = f11, f61, f113 + } + { .mfi + (p17) FMA f116 = f11, f62, f116 + } + ;; + { .mfi + (p16) LDFPD f60, f61 = [AO4], 2 * SIZE + (p17) FMA f119 = f11, f63, f119 + } + { .mfi + (p17) FMA f122 = f11, f64, f122 + } + ;; + { .mfi + (p16) LDFPD f62, f63 = [AO4], 2 * SIZE + (p17) FMA f101 = f12, f65, f101 + } + { .mfi + (p18) STFD [YST1] = f20, 1 * SIZE + (p17) FMA f104 = f12, f66, f104 + } + ;; + { .mfi + (p16) LDFPD f64, f65 = [AO5], 2 * SIZE + (p17) FMA f107 = f12, f67, f107 + } + { .mfi + (p18) STFD [YST1] = f21, 1 * SIZE + (p17) FMA f110 = f12, f68, f110 + } + ;; + { .mfi + (p16) LDFPD f66, f67 = [AO5], 2 * SIZE + (p17) FMA f113 = f12, f69, f113 + } + { .mfi + (p14) PREFETCH [RPRE5], 16 * SIZE + (p17) FMA f116 = f12, f70, f116 + } + ;; + { .mfi + (p16) LDFPD f68, f69 = [AO5], 2 * SIZE + (p17) FMA f119 = f12, f71, f119 + } + { .mfi + (p18) STFD [YST1] = f22, 1 * SIZE + (p17) FMA f122 = f12, f72, f122 + } + ;; + { .mfi + (p16) LDFPD f70, f71 = [AO5], 2 * SIZE + (p17) FMA f101 = f13, f73, f101 + } + { .mfi + (p18) STFD [YST1] = f23, 1 * SIZE + (p17) FMA f104 = f13, f74, f104 + } + ;; + { .mfi + (p16) LDFPD f72, f73 = [AO6], 2 * SIZE + (p17) FMA f107 = f13, f75, f107 + } + { .mfi + (p15) PREFETCH [RPRE6], 16 * SIZE + (p17) FMA f110 = f13, f76, f110 + } + ;; + { .mfi + (p16) LDFPD f74, f75 = [AO6], 2 * SIZE + (p17) FMA f113 = f13, f77, f113 + } + { .mfi + (p17) FMA f116 = f13, f78, f116 + } + ;; + { .mfi + (p16) LDFPD f76, f77 = [AO6], 2 * SIZE + (p17) FMA f119 = f13, f79, f119 + } + { .mfi + (p17) FMA f122 = f13, f80, f122 + } + ;; + { .mfi + (p16) LDFPD f78, f79 = [AO6], 2 * SIZE + (p17) FMA f101 = f14, f81, f101 + } + { .mfi + (p17) FMA f104 = f14, f82, f104 + } + ;; + { .mfi + (p16) LDFPD f80, f81 = [AO7], 2 * SIZE + (p17) FMA f107 = f14, f83, f107 + } + { .mfi + (p14) PREFETCH [RPRE7], 16 * SIZE + (p17) FMA f110 = f14, f84, f110 + } + ;; + { .mfi + (p16) LDFPD f82, f83 = [AO7], 2 * SIZE + (p17) FMA f113 = f14, f85, f113 + } + { .mfi + (p17) FMA f116 = f14, f86, f116 + } + ;; + { .mfi + (p16) LDFPD f84, f85 = [AO7], 2 * SIZE + (p17) FMA f119 = f14, f87, f119 + } + { .mfi + (p17) FMA f122 = f14, f88, f122 + } + ;; + { .mfi + (p16) LDFPD f86, f87 = [AO7], 2 * SIZE + (p17) FMA f16 = f15, f89, f101 + } + { .mfi + (p17) FMA f17 = f15, f90, f104 + } + ;; + { .mfi + (p16) LDFPD f88, f89 = [AO8], 2 * SIZE + (p17) FMA f18 = f15, f91, f107 + } + { .mfi + (p15) PREFETCH [RPRE8], 16 * SIZE + (p17) FMA f19 = f15, f92, f110 + } + ;; + { .mfi + (p16) LDFPD f90, f91 = [AO8], 2 * SIZE + (p17) FMA f20 = f15, f93, f113 + } + { .mfi + (p14) lfetch.excl.nt2 [PREB], 16 * SIZE + (p17) FMA f21 = f15, f94, f116 + } + ;; + { .mfi + (p16) LDFPD f92, f93 = [AO8], 2 * SIZE + (p17) FMA f22 = f15, f95, f119 + } + { .mfb + (p16) adds I = -1, I + (p17) FMA f23 = f15, f96, f122 + br.ctop.sptk.few .L12 + } + ;; + .align 16 + +.L15: + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p18) STFD [YST1] = f16, 1 * SIZE + cmp.lt p6, p0 = 1, J + adds J = -1, J + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p18) STFD [YST1] = f17, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f18, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p13) LDFPD f34, f35 = [AO2], 2 * SIZE + (p13) LDFPD f36, f37 = [AO3], 2 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f19, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p15) LDFD f80 = [AO1] + (p15) LDFD f106 = [YLD1], 1 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f20, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p13) LDFPD f50, f51 = [AO2], 2 * SIZE + (p13) LDFPD f52, f53 = [AO3], 2 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f21, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p14) LDFPD f66, f67 = [AO2], 2 * SIZE + (p14) LDFPD f68, f69 = [AO3], 2 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f22, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p15) LDFD f81 = [AO2] + (p15) LDFD f82 = [AO3] + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f23, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + (p13) LDFPD f38, f39 = [AO4], 2 * SIZE + (p13) FMA f100 = f8, f32, f100 + nop __LINE__ + } + { .mfi + (p13) LDFPD f40, f41 = [AO5], 2 * SIZE + (p13) FMA f101 = f8, f33, f101 + nop __LINE__ + } + ;; + { .mfi + (p13) LDFPD f54, f55 = [AO4], 2 * SIZE + (p13) FMA f102 = f8, f48, f102 + nop __LINE__ + } + { .mfi + (p13) LDFPD f56, f57 = [AO5], 2 * SIZE + (p13) FMA f103 = f8, f49, f103 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFPD f70, f71 = [AO4], 2 * SIZE + (p14) FMA f104 = f8, f64, f104 + nop __LINE__ + } + { .mfi + (p14) LDFPD f72, f73 = [AO5], 2 * SIZE + (p14) FMA f105 = f8, f65, f105 + nop __LINE__ + } + ;; + { .mfi + (p15) LDFD f83 = [AO4] + (p15) FMA f106 = f8, f80, f106 + nop __LINE__ + } + { .mfi + (p15) LDFD f84 = [AO5] + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + (p13) LDFPD f42, f43 = [AO6], 2 * SIZE + (p13) FMA f100 = f9, f34, f100 + nop __LINE__ + } + { .mfi + (p13) LDFPD f44, f45 = [AO7], 2 * SIZE + (p13) FMA f101 = f9, f35, f101 + nop __LINE__ + } + ;; + { .mfi + (p13) LDFPD f58, f59 = [AO6], 2 * SIZE + (p13) FMA f102 = f9, f50, f102 + nop __LINE__ + } + { .mfi + (p13) LDFPD f60, f61 = [AO7], 2 * SIZE + (p13) FMA f103 = f9, f51, f103 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFPD f74, f75 = [AO6], 2 * SIZE + (p14) FMA f104 = f9, f66, f104 + nop __LINE__ + } + { .mfi + (p14) LDFPD f76, f77 = [AO7], 2 * SIZE + (p14) FMA f105 = f9, f67, f105 + nop __LINE__ + } + ;; + { .mfi + (p15) LDFD f85 = [AO6] + (p15) FMA f106 = f9, f81, f106 + nop __LINE__ + } + { .mfi + (p15) LDFD f86 = [AO7] + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + (p13) LDFPD f46, f47 = [AO8], 2 * SIZE + (p13) FMA f100 = f10, f36, f100 + nop __LINE__ + } + { .mfi + (p13) FMA f101 = f10, f37, f101 + nop __LINE__ + } + ;; + { .mfi + (p13) LDFPD f62, f63 = [AO8], 2 * SIZE + (p13) FMA f102 = f10, f52, f102 + nop __LINE__ + } + { .mfi + (p13) FMA f103 = f10, f53, f103 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFPD f78, f79 = [AO8], 2 * SIZE + (p14) FMA f104 = f10, f68, f104 + nop __LINE__ + } + { .mfi + (p14) FMA f105 = f10, f69, f105 + nop __LINE__ + } + ;; + { .mfi + (p15) LDFD f87 = [AO8] + (p15) FMA f106 = f10, f82, f106 + nop __LINE__ + } + ;; + (p13) FMA f100 = f11, f38, f100 + (p13) FMA f101 = f11, f39, f101 + (p13) FMA f102 = f11, f54, f102 + (p13) FMA f103 = f11, f55, f103 + (p14) FMA f104 = f11, f70, f104 + (p14) FMA f105 = f11, f71, f105 + (p15) FMA f106 = f11, f83, f106 + ;; + (p13) FMA f100 = f12, f40, f100 + (p13) FMA f101 = f12, f41, f101 + (p13) FMA f102 = f12, f56, f102 + (p13) FMA f103 = f12, f57, f103 + (p14) FMA f104 = f12, f72, f104 + (p14) FMA f105 = f12, f73, f105 + (p15) FMA f106 = f12, f84, f106 + ;; + (p13) FMA f100 = f13, f42, f100 + (p13) FMA f101 = f13, f43, f101 + (p13) FMA f102 = f13, f58, f102 + (p13) FMA f103 = f13, f59, f103 + (p14) FMA f104 = f13, f74, f104 + (p14) FMA f105 = f13, f75, f105 + (p15) FMA f106 = f13, f85, f106 + ;; + (p13) FMA f100 = f14, f44, f100 + (p13) FMA f101 = f14, f45, f101 + (p13) FMA f102 = f14, f60, f102 + (p13) FMA f103 = f14, f61, f103 + (p14) FMA f104 = f14, f76, f104 + (p14) FMA f105 = f14, f77, f105 + (p15) FMA f106 = f14, f86, f106 + ;; + (p13) FMA f100 = f15, f46, f100 + (p13) FMA f101 = f15, f47, f101 + (p13) FMA f102 = f15, f62, f102 + (p13) FMA f103 = f15, f63, f103 + (p14) FMA f104 = f15, f78, f104 + (p14) FMA f105 = f15, f79, f105 + (p15) FMA f106 = f15, f87, f106 + ;; + (p13) STFD [YST1] = f100, 1 * SIZE + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + (p6) br.cond.dptk .L11 + ;; + .align 16 + + +.L20: + { .mmi + mov YLD1 = YY + mov YST1 = YY + tbit.z p6, p0 = N, 2 + } + ;; + { .mib + mov AO1 = A + mov pr.rot= 0 + (p6) br.cond.dpnt .L30 + } + ;; + { .mmi + LDFD f8 = [X], INCX + (p8) LDFD f106 = [YLD1], 1 * SIZE + add AO2 = LDA, A + } + ;; + { .mmi + LDFD f9 = [X], INCX + (p8) LDFD f80 = [AO1], 1 * SIZE + shladd AO4 = LDA, 1, AO2 + } + ;; + { .mmi + LDFD f10 = [X], INCX + (p8) LDFD f81 = [AO2], 1 * SIZE + shladd AO3 = LDA, 1, A + } + ;; + { .mmi + LDFD f11 = [X], INCX + (p8) LDFD f82 = [AO3], 1 * SIZE + } + ;; + { .mfi + (p8) LDFD f83 = [AO4], 1 * SIZE + FMPY f8 = ALPHA, f8 + adds PREB = RPREFETCH * SIZE, YLD1 + } + { .mfi + adds RPRE1 = RPREFETCH * SIZE, AO1 + FMPY f9 = ALPHA, f9 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + } + ;; + FMPY f10 = ALPHA, f10 + shladd A = LDA, 2, A + FMPY f11 = ALPHA, f11 + ;; + { .mfi + adds RPRE3 = RPREFETCH * SIZE, AO3 + (p8) FMA f106 = f8, f80, f106 + mov ar.ec= 2 + } + ;; + adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 + (p8) FMA f106 = f9, f81, f106 + shr I = MM, 3 + ;; + { .mmf + cmp.eq p6, p0 = 0, I + cmp.eq p16, p0 = r0, r0 + (p8) FMA f106 = f10, f82, f106 + } + ;; + { .mfi + adds I = -1, I + (p8) FMA f106 = f11, f83, f106 + tbit.nz p13, p0 = MM, 2 + } + ;; + { .mib + (p8) STFD [YST1] = f106, 1 * SIZE + mov ar.lc = I + (p6) br.cond.dpnt .L25 + } + ;; + .align 16 + +.L22: + { .mfi + (p17) LDFPD f63, f64 = [AO4], 2 * SIZE + (p17) FMA f101 = f8, f33, f101 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mfi + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p17) FMA f104 = f8, f34, f104 + } + ;; + { .mfi + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f107 = f8, f35, f107 + (p16) adds I = -1, I + } + { .mfi + (p14) PREFETCH [RPRE1], 16 * SIZE + (p17) FMA f110 = f8, f36, f110 + } + ;; + { .mfi + (p16) LDFPD f34, f35 = [AO1], 2 * SIZE + (p17) FMA f113 = f8, f37, f113 + } + { .mfi + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p17) FMA f116 = f8, f38, f116 + } + ;; + { .mfi + (p16) LDFPD f36, f37 = [AO1], 2 * SIZE + (p17) FMA f119 = f8, f39, f119 + } + { .mfi + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p17) FMA f122 = f8, f40, f122 + } + ;; + { .mfi + (p16) LDFPD f38, f39 = [AO1], 2 * SIZE + (p17) FMA f101 = f9, f41, f101 + } + { .mfi + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p17) FMA f104 = f9, f42, f104 + } + ;; + { .mmf + (p16) LDFPD f40, f41 = [AO2], 2 * SIZE + (p15) PREFETCH [RPRE2], 16 * SIZE + (p17) FMA f107 = f9, f43, f107 + } + { .mfi + (p18) STFD [YST1] = f16, 1 * SIZE + (p17) FMA f110 = f9, f44, f110 + } + ;; + { .mfi + (p16) LDFPD f42, f43 = [AO2], 2 * SIZE + (p17) FMA f113 = f9, f45, f113 + } + { .mfi + (p18) STFD [YST1] = f17, 1 * SIZE + (p17) FMA f116 = f9, f46, f116 + } + ;; + { .mfi + (p16) LDFPD f44, f45 = [AO2], 2 * SIZE + (p17) FMA f119 = f9, f47, f119 + } + { .mfi + (p18) STFD [YST1] = f18, 1 * SIZE + (p17) FMA f122 = f9, f48, f122 + } + ;; + { .mfi + (p16) LDFPD f46, f47 = [AO2], 2 * SIZE + (p17) FMA f101 = f10, f49, f101 + } + { .mfi + (p14) lfetch.excl.nt2 [PREB], 16 * SIZE + (p17) FMA f104 = f10, f50, f104 + } + ;; + { .mfi + (p16) LDFPD f48, f49 = [AO3], 2 * SIZE + (p17) FMA f107 = f10, f51, f107 + } + { .mfi + (p14) PREFETCH [RPRE3], 16 * SIZE + (p17) FMA f110 = f10, f52, f110 + } + ;; + { .mfi + (p16) LDFPD f50, f51 = [AO3], 2 * SIZE + (p17) FMA f113 = f10, f53, f113 + } + { .mfi + (p18) STFD [YST1] = f19, 1 * SIZE + (p17) FMA f116 = f10, f54, f116 + } + ;; + { .mfi + (p16) LDFPD f52, f53 = [AO3], 2 * SIZE + (p17) FMA f119 = f10, f55, f119 + } + { .mfi + (p18) STFD [YST1] = f20, 1 * SIZE + (p17) FMA f122 = f10, f56, f122 + } + ;; + { .mfi + (p16) LDFPD f54, f55 = [AO3], 2 * SIZE + (p17) FMA f16 = f11, f57, f101 + } + { .mfi + (p15) PREFETCH [RPRE4], 16 * SIZE + (p17) FMA f17 = f11, f58, f104 + } + ;; + { .mfi + (p16) LDFPD f56, f57 = [AO4], 2 * SIZE + (p17) FMA f18 = f11, f59, f107 + } + { .mfi + (p18) STFD [YST1] = f21, 1 * SIZE + (p17) FMA f19 = f11, f60, f110 + } + ;; + { .mfi + (p16) LDFPD f58, f59 = [AO4], 2 * SIZE + (p17) FMA f20 = f11, f61, f113 + } + { .mfi + (p18) STFD [YST1] = f22, 1 * SIZE + (p17) FMA f21 = f11, f62, f116 + } + ;; + { .mfi + (p16) LDFPD f60, f61 = [AO4], 2 * SIZE + (p17) FMA f22 = f11, f63, f119 + } + { .mfb + (p18) STFD [YST1] = f23, 1 * SIZE + (p17) FMA f23 = f11, f64, f122 + br.ctop.sptk.few .L22 + } + ;; + .align 16 + +.L25: + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p18) STFD [YST1] = f16, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p18) STFD [YST1] = f17, 1 * SIZE + } + ;; + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f18, 1 * SIZE + } + ;; + { .mmi + (p15) LDFD f80 = [AO1] + (p15) LDFD f106 = [YLD1], 1 * SIZE + } + { .mmi + (p18) STFD [YST1] = f19, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f34, f35 = [AO2], 2 * SIZE + (p13) LDFPD f36, f37 = [AO3], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f20, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f50, f51 = [AO2], 2 * SIZE + (p13) LDFPD f52, f53 = [AO3], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f21, 1 * SIZE + } + ;; + { .mmi + (p14) LDFPD f66, f67 = [AO2], 2 * SIZE + (p14) LDFPD f68, f69 = [AO3], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f22, 1 * SIZE + } + ;; + { .mmf + (p15) LDFD f81 = [AO2] + (p15) LDFD f82 = [AO3] + (p13) FMA f100 = f8, f32, f100 + } + { .mfi + (p18) STFD [YST1] = f23, 1 * SIZE + (p13) FMA f101 = f8, f33, f101 + } + ;; + ;; + { .mfi + (p13) LDFPD f38, f39 = [AO4], 2 * SIZE + (p13) FMA f102 = f8, f48, f102 + } + { .mfi + (p13) FMA f103 = f8, f49, f103 + } + ;; + { .mfi + (p13) LDFPD f54, f55 = [AO4], 2 * SIZE + (p14) FMA f104 = f8, f64, f104 + } + { .mfi + (p14) FMA f105 = f8, f65, f105 + } + ;; + { .mfi + (p14) LDFPD f70, f71 = [AO4], 2 * SIZE + (p15) FMA f106 = f8, f80, f106 + } + { .mfi + (p13) FMA f100 = f9, f34, f100 + } + ;; + { .mfi + (p15) LDFD f83 = [AO4] + (p13) FMA f101 = f9, f35, f101 + } + { .mfi + (p13) FMA f102 = f9, f50, f102 + } + ;; + (p13) FMA f103 = f9, f51, f103 + (p14) FMA f104 = f9, f66, f104 + (p14) FMA f105 = f9, f67, f105 + (p15) FMA f106 = f9, f81, f106 + ;; + (p13) FMA f100 = f10, f36, f100 + (p13) FMA f101 = f10, f37, f101 + (p13) FMA f102 = f10, f52, f102 + (p13) FMA f103 = f10, f53, f103 + (p14) FMA f104 = f10, f68, f104 + (p14) FMA f105 = f10, f69, f105 + (p15) FMA f106 = f10, f82, f106 + ;; + (p13) FMA f100 = f11, f38, f100 + (p13) FMA f101 = f11, f39, f101 + ;; + (p13) FMA f102 = f11, f54, f102 + (p13) STFD [YST1] = f100, 1 * SIZE + (p13) FMA f103 = f11, f55, f103 + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + (p14) FMA f104 = f11, f70, f104 + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + (p14) FMA f105 = f11, f71, f105 + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + (p15) FMA f106 = f11, f83, f106 + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + ;; + .align 16 + +.L30: + { .mmi + mov YLD1 = YY + mov YST1 = YY + tbit.z p6, p0 = N, 1 + } + ;; + { .mib + mov AO1 = A + mov pr.rot= 0 + (p6) br.cond.dpnt .L40 + } + ;; + { .mmi + LDFD f8 = [X], INCX + (p8) LDFD f106 = [YLD1], 1 * SIZE + add AO2 = LDA, A + } + ;; + { .mmi + LDFD f9 = [X], INCX + (p8) LDFD f80 = [AO1], 1 * SIZE + shladd A = LDA, 1, A + } + ;; + adds PREB = RPREFETCH * SIZE, YLD1 + FMPY f8 = ALPHA, f8 + mov ar.ec= 2 + adds RPRE1 = RPREFETCH * SIZE, AO1 + FMPY f9 = ALPHA, f9 + shr I = MM, 3 + ;; + (p8) LDFD f81 = [AO2], 1 * SIZE + cmp.eq p6, p0 = 0, I + ;; + (p8) FMA f106 = f8, f80, f106 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + tbit.nz p13, p0 = MM, 2 + ;; + (p8) FMA f106 = f9, f81, f106 + cmp.eq p16, p0 = r0, r0 + adds I = -1, I + ;; + { .mib + (p8) STFD [YST1] = f106, 1 * SIZE + mov ar.lc = I + (p6) br.cond.dpnt .L35 + } + ;; + .align 16 + +.L32: + { .mfi + (p17) LDFPD f47, f48 = [AO2], 2 * SIZE + (p17) FMA f101 = f8, f33, f101 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mmf + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p18) STFD [YST1] = f16, 1 * SIZE + (p17) FMA f104 = f8, f34, f104 + } + ;; + { .mfi + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f107 = f8, f35, f107 + adds I = -1, I + } + { .mmf + (p14) PREFETCH [RPRE1], 16 * SIZE + (p18) STFD [YST1] = f17, 1 * SIZE + (p17) FMA f110 = f8, f36, f110 + } + ;; + { .mfi + (p16) LDFPD f34, f35 = [AO1], 2 * SIZE + (p17) FMA f113 = f8, f37, f113 + } + { .mmf + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p18) STFD [YST1] = f18, 1 * SIZE + (p17) FMA f116 = f8, f38, f116 + } + ;; + { .mfi + (p16) LDFPD f36, f37 = [AO1], 2 * SIZE + (p17) FMA f119 = f8, f39, f119 + } + { .mmf + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p18) STFD [YST1] = f19, 1 * SIZE + (p17) FMA f122 = f8, f40, f122 + } + ;; + { .mfi + (p16) LDFPD f38, f39 = [AO1], 2 * SIZE + (p17) FMA f16 = f9, f41, f101 + } + { .mmf + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p18) STFD [YST1] = f20, 1 * SIZE + (p17) FMA f17 = f9, f42, f104 + } + ;; + { .mfi + (p16) LDFPD f40, f41 = [AO2], 2 * SIZE + (p17) FMA f18 = f9, f43, f107 + } + { .mmf + (p15) PREFETCH [RPRE2], 16 * SIZE + (p18) STFD [YST1] = f21, 1 * SIZE + (p17) FMA f19 = f9, f44, f110 + } + ;; + { .mfi + (p16) LDFPD f42, f43 = [AO2], 2 * SIZE + (p17) FMA f20 = f9, f45, f113 + } + { .mmf + (p14) PREFETCH [PREB], 16 * SIZE + (p18) STFD [YST1] = f22, 1 * SIZE + (p17) FMA f21 = f9, f46, f116 + } + ;; + { .mfi + (p16) LDFPD f44, f45 = [AO2], 2 * SIZE + (p17) FMA f22 = f9, f47, f119 + } + { .mfb + (p18) STFD [YST1] = f23, 1 * SIZE + (p17) FMA f23 = f9, f48, f122 + br.ctop.sptk.few .L32 + } + ;; + .align 16 + +.L35: + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p18) STFD [YST1] = f16, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p18) STFD [YST1] = f17, 1 * SIZE + } + ;; + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f18, 1 * SIZE + } + ;; + { .mmi + (p15) LDFD f80 = [AO1] + (p15) LDFD f106 = [YLD1], 1 * SIZE + } + { .mmi + (p18) STFD [YST1] = f19, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f34, f35 = [AO2], 2 * SIZE + (p18) STFD [YST1] = f20, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f50, f51 = [AO2], 2 * SIZE + (p18) STFD [YST1] = f21, 1 * SIZE + } + ;; + { .mmi + (p14) LDFPD f66, f67 = [AO2], 2 * SIZE + (p18) STFD [YST1] = f22, 1 * SIZE + } + ;; + { .mmi + (p15) LDFD f81 = [AO2] + (p18) STFD [YST1] = f23, 1 * SIZE + } + ;; + (p13) FMA f100 = f8, f32, f100 + (p13) FMA f101 = f8, f33, f101 + (p13) FMA f102 = f8, f48, f102 + (p13) FMA f103 = f8, f49, f103 + (p14) FMA f104 = f8, f64, f104 + (p14) FMA f105 = f8, f65, f105 + (p15) FMA f106 = f8, f80, f106 + ;; + (p13) FMA f100 = f9, f34, f100 + (p13) FMA f101 = f9, f35, f101 + (p13) FMA f102 = f9, f50, f102 + (p13) FMA f103 = f9, f51, f103 + (p14) FMA f104 = f9, f66, f104 + (p14) FMA f105 = f9, f67, f105 + (p15) FMA f106 = f9, f81, f106 + ;; + (p13) STFD [YST1] = f100, 1 * SIZE + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + ;; + .align 16 + +.L40: + { .mmi + mov YLD1 = YY + mov YST1 = YY + tbit.z p6, p0 = N, 0 + } + ;; + { .mib + mov AO1 = A + mov pr.rot= 0 + (p6) br.cond.dpnt .L990 + } + ;; + { .mmi + LDFD f8 = [X], INCX + (p8) LDFD f106 = [YLD1], 1 * SIZE + adds RPRE1 = RPREFETCH * SIZE, AO1 + } + ;; + { .mii + (p8) LDFD f80 = [AO1], 1 * SIZE + adds PREB = RPREFETCH * SIZE, YLD1 + } + ;; + FMPY f8 = ALPHA, f8 + shr I = MM, 3 + ;; + (p8) FMA f106 = f8, f80, f106 + mov ar.ec= 3 + ;; + { .mmi + cmp.eq p6, p0 = 0, I + cmp.eq p16, p0 = r0, r0 + tbit.nz p14, p15 = r0, 0 + } + ;; + { .mmi + adds YST2 = 4 * SIZE, YST1 + adds I = -1, I + tbit.nz p13, p0 = MM, 2 + } + ;; + { .mmi + (p8) STFD [YST1] = f106, 1 * SIZE + (p8) adds YST2 = 1 * SIZE, YST2 + } + { .mib + mov ar.lc = I + (p6) br.cond.dpnt .L145 + } + ;; + .align 16 + +.L42: + { .mmf + (p19) STFD [YST1] = f16, 1 * SIZE + (p19) STFD [YST2] = f20, 1 * SIZE + (p18) FMA f16 = f8, f34, f102 + } + { .mmf + (p16) LDFPD f32, f35 = [AO1], 2 * SIZE + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p18) FMA f20 = f8, f46, f114 + } + ;; + { .mmf + (p19) STFD [YST1] = f17, 1 * SIZE + (p19) STFD [YST2] = f21, 1 * SIZE + (p18) FMA f17 = f8, f37, f105 + } + { .mmf + (p16) LDFPD f38, f41 = [AO1], 2 * SIZE + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p18) FMA f21 = f8, f49, f117 + } + ;; + { .mmf + (p19) STFD [YST1] = f18, 1 * SIZE + (p19) STFD [YST2] = f22, 1 * SIZE + (p18) FMA f18 = f8, f40, f108 + } + { .mmf + (p16) LDFPD f44, f47 = [AO1], 2 * SIZE + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p18) FMA f22 = f8, f52, f120 + } + ;; + { .mmf + (p19) STFD [YST1] = f19, 5 * SIZE + (p19) STFD [YST2] = f23, 5 * SIZE + (p18) FMA f19 = f8, f43, f111 + } + { .mmf + (p16) LDFPD f50, f53 = [AO1], 2 * SIZE + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p18) FMA f23 = f8, f55, f123 + } + ;; + { .mmi + (p14) PREFETCH [RPRE1], 16 * SIZE + (p14) PREFETCH [PREB], 16 * SIZE + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mib + nop __LINE__ + (p16) adds I = -1, I + br.ctop.sptk.few .L42 + } + ;; + .align 16 + +.L45: + { .mmi + (p19) STFD [YST1] = f16, 1 * SIZE + (p19) STFD [YST2] = f20, 1 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + } + ;; + { .mmi + (p19) STFD [YST1] = f17, 1 * SIZE + (p19) STFD [YST2] = f21, 1 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + } + ;; + { .mmi + (p19) STFD [YST1] = f18, 1 * SIZE + (p19) STFD [YST2] = f22, 1 * SIZE + } + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + } + ;; + { .mmi + (p19) STFD [YST1] = f19, 5 * SIZE + (p19) STFD [YST2] = f23, 5 * SIZE + } + { .mmi + (p15) LDFD f80 = [AO1] + (p15) LDFD f106 = [YLD1], 1 * SIZE + } + ;; + (p13) FMA f100 = f8, f32, f100 + (p13) FMA f101 = f8, f33, f101 + (p13) FMA f102 = f8, f48, f102 + (p13) FMA f103 = f8, f49, f103 + ;; + (p13) STFD [YST1] = f100, 1 * SIZE + (p14) FMA f104 = f8, f64, f104 + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + (p14) FMA f105 = f8, f65, f105 + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + (p15) FMA f106 = f8, f80, f106 + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + br .L990 + ;; + .align 16 + +.L100: + shr J = N, 3 + ;; + cmp.eq p6, p0 = r0, J + (p6) br.cond.dpnt .L120 + ;; + .align 16 + +.L111: + mov YLD1 = YY + mov YST1 = YY + ;; + LDFD f8 = [X], INCX + ;; + LDFD f9 = [X], INCX + ;; + LDFD f10 = [X], INCX + ;; + LDFD f11 = [X], INCX + ;; + LDFD f12 = [X], INCX + ;; + LDFD f13 = [X], INCX + ;; + LDFD f14 = [X], INCX + ;; + LDFD f15 = [X], INCX + ;; + FMPY f8 = ALPHA, f8 + FMPY f9 = ALPHA, f9 + FMPY f10 = ALPHA, f10 + FMPY f11 = ALPHA, f11 + FMPY f12 = ALPHA, f12 + FMPY f13 = ALPHA, f13 + FMPY f14 = ALPHA, f14 + FMPY f15 = ALPHA, f15 + ;; + mov AO1 = A + add AO2 = LDA, A + ;; + shladd AO3 = LDA, 1, A + shladd AO4 = LDA, 1, AO2 + ;; + shladd AO5 = LDA, 1, AO3 + shladd AO6 = LDA, 1, AO4 + ;; + shladd AO7 = LDA, 1, AO5 + shladd AO8 = LDA, 1, AO6 + shladd A = LDA, 3, A + ;; + ;; + adds PREB = RPREFETCH * SIZE, YLD1 + adds RPRE1 = RPREFETCH * SIZE, AO1 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + adds RPRE3 = RPREFETCH * SIZE, AO3 + adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 + adds RPRE5 = RPREFETCH * SIZE, AO5 + adds RPRE6 = (RPREFETCH + 8) * SIZE, AO6 + adds RPRE7 = RPREFETCH * SIZE, AO7 + adds RPRE8 = (RPREFETCH + 8) * SIZE, AO8 + + (p8) LDFD f80 = [AO1], 1 * SIZE + (p8) LDFD f81 = [AO2], 1 * SIZE + (p8) LDFD f82 = [AO3], 1 * SIZE + (p8) LDFD f83 = [AO4], 1 * SIZE + (p8) LDFD f84 = [AO5], 1 * SIZE + (p8) LDFD f85 = [AO6], 1 * SIZE + (p8) LDFD f86 = [AO7], 1 * SIZE + (p8) LDFD f87 = [AO8], 1 * SIZE + (p8) LDFD f106 = [YLD1], 1 * SIZE + ;; + (p8) FMPY f32 = f8, f80 + (p8) FMPY f33 = f9, f81 + (p8) FMPY f34 = f10, f82 + (p8) FMA f35 = f11, f83, f106 + ;; + (p8) FMA f32 = f12, f84, f32 + (p8) FMA f33 = f13, f85, f33 + (p8) FMA f34 = f14, f86, f34 + (p8) FMA f35 = f15, f87, f35 + ;; + (p8) FADD f32 = f32, f33 + (p8) FADD f34 = f34, f35 + ;; + (p8) FADD f32 = f32, f34 + ;; + (p8) STFD [YST1] = f32, 1 * SIZE + + shr I = MM, 3 + mov pr.rot= 0 + ;; + cmp.eq p6, p0 = 0, I + cmp.eq p16, p0 = r0, r0 + ;; + adds I = -1, I + tbit.nz p13, p0 = MM, 2 + ;; + mov ar.lc = I + mov ar.ec= 2 + (p6) br.cond.dpnt .L115 + ;; + .align 16 + +.L112: + { .mfi + (p17) LDFD f96 = [AO8], 1 * SIZE + (p17) FMA f101 = f8, f33, f101 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mfi + (p17) FMA f104 = f8, f34, f104 + } + ;; + { .mfi + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f107 = f8, f35, f107 + } + { .mfi + (p14) PREFETCH [RPRE1], 16 * SIZE + (p17) FMA f110 = f8, f36, f110 + } + ;; + { .mfi + (p16) LDFPD f34, f35 = [AO1], 2 * SIZE + (p17) FMA f113 = f8, f37, f113 + } + { .mfi + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p17) FMA f116 = f8, f38, f116 + } + ;; + { .mfi + (p16) LDFPD f36, f37 = [AO1], 2 * SIZE + (p17) FMA f119 = f8, f39, f119 + } + { .mfi + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p17) FMA f122 = f8, f40, f122 + } + ;; + { .mfi + (p16) LDFPD f38, f39 = [AO1], 2 * SIZE + (p17) FMA f101 = f9, f41, f101 + } + { .mmf + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p16) LDFD f40 = [AO2], 1 * SIZE + (p17) FMA f104 = f9, f42, f104 + } + ;; + { .mfi + (p16) LDFPD f41, f42 = [AO2], 2 * SIZE + (p17) FMA f107 = f9, f43, f107 + } + { .mfi + (p15) PREFETCH [RPRE2], 16 * SIZE + (p17) FMA f110 = f9, f44, f110 + } + ;; + { .mfi + (p16) LDFPD f43, f44 = [AO2], 2 * SIZE + (p17) FMA f113 = f9, f45, f113 + } + { .mfi + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p17) FMA f116 = f9, f46, f116 + } + ;; + { .mfi + (p16) LDFPD f45, f46 = [AO2], 2 * SIZE + (p17) FMA f119 = f9, f47, f119 + } + { .mfi + (p18) STFD [YST1] = f16, 1 * SIZE + (p17) FMA f122 = f9, f48, f122 + } + ;; + { .mfi + (p16) LDFD f47 = [AO2], 1 * SIZE + (p17) FMA f101 = f10, f49, f101 + } + { .mfi + (p18) STFD [YST1] = f17, 1 * SIZE + (p17) FMA f104 = f10, f50, f104 + } + ;; + { .mfi + (p16) LDFPD f48, f49 = [AO3], 2 * SIZE + (p17) FMA f107 = f10, f51, f107 + } + { .mfi + (p14) PREFETCH [RPRE3], 16 * SIZE + (p17) FMA f110 = f10, f52, f110 + } + ;; + { .mfi + (p16) LDFPD f50, f51 = [AO3], 2 * SIZE + (p17) FMA f113 = f10, f53, f113 + } + { .mfi + (p17) FMA f116 = f10, f54, f116 + } + ;; + { .mfi + (p16) LDFPD f52, f53 = [AO3], 2 * SIZE + (p17) FMA f119 = f10, f55, f119 + } + { .mfi + (p18) STFD [YST1] = f18, 1 * SIZE + (p17) FMA f122 = f10, f56, f122 + } + ;; + { .mfi + (p16) LDFPD f54, f55 = [AO3], 2 * SIZE + (p17) FMA f101 = f11, f57, f101 + } + { .mmf + (p18) STFD [YST1] = f19, 1 * SIZE + (p16) LDFD f56 = [AO4], 1 * SIZE + (p17) FMA f104 = f11, f58, f104 + } + ;; + { .mfi + (p16) LDFPD f57, f58 = [AO4], 2 * SIZE + (p17) FMA f107 = f11, f59, f107 + } + { .mfi + (p15) PREFETCH [RPRE4], 16 * SIZE + (p17) FMA f110 = f11, f60, f110 + } + ;; + { .mfi + (p16) LDFPD f59, f60 = [AO4], 2 * SIZE + (p17) FMA f113 = f11, f61, f113 + } + { .mfi + (p17) FMA f116 = f11, f62, f116 + } + ;; + { .mfi + (p16) LDFPD f61, f62 = [AO4], 2 * SIZE + (p17) FMA f119 = f11, f63, f119 + } + { .mfi + (p17) FMA f122 = f11, f64, f122 + } + ;; + { .mfi + (p16) LDFD f63 = [AO4], 1 * SIZE + (p17) FMA f101 = f12, f65, f101 + } + { .mfi + (p18) STFD [YST1] = f20, 1 * SIZE + (p17) FMA f104 = f12, f66, f104 + } + ;; + { .mfi + (p16) LDFPD f64, f65 = [AO5], 2 * SIZE + (p17) FMA f107 = f12, f67, f107 + } + { .mfi + (p18) STFD [YST1] = f21, 1 * SIZE + (p17) FMA f110 = f12, f68, f110 + } + ;; + { .mfi + (p16) LDFPD f66, f67 = [AO5], 2 * SIZE + (p17) FMA f113 = f12, f69, f113 + } + { .mfi + (p14) PREFETCH [RPRE5], 16 * SIZE + (p17) FMA f116 = f12, f70, f116 + } + ;; + { .mfi + (p16) LDFPD f68, f69 = [AO5], 2 * SIZE + (p17) FMA f119 = f12, f71, f119 + } + { .mfi + (p18) STFD [YST1] = f22, 1 * SIZE + (p17) FMA f122 = f12, f72, f122 + } + ;; + { .mfi + (p16) LDFPD f70, f71 = [AO5], 2 * SIZE + (p17) FMA f101 = f13, f73, f101 + } + { .mmf + (p18) STFD [YST1] = f23, 1 * SIZE + (p16) LDFD f72 = [AO6], 1 * SIZE + (p17) FMA f104 = f13, f74, f104 + } + ;; + { .mfi + (p16) LDFPD f73, f74 = [AO6], 2 * SIZE + (p17) FMA f107 = f13, f75, f107 + } + { .mfi + (p15) PREFETCH [RPRE6], 16 * SIZE + (p17) FMA f110 = f13, f76, f110 + } + ;; + { .mfi + (p16) LDFPD f75, f76 = [AO6], 2 * SIZE + (p17) FMA f113 = f13, f77, f113 + } + { .mfi + (p17) FMA f116 = f13, f78, f116 + } + ;; + { .mfi + (p16) LDFPD f77, f78 = [AO6], 2 * SIZE + (p17) FMA f119 = f13, f79, f119 + } + { .mfi + (p17) FMA f122 = f13, f80, f122 + } + ;; + { .mfi + (p16) LDFD f79 = [AO6], 1 * SIZE + (p17) FMA f101 = f14, f81, f101 + } + { .mfi + (p17) FMA f104 = f14, f82, f104 + } + ;; + { .mfi + (p16) LDFPD f80, f81 = [AO7], 2 * SIZE + (p17) FMA f107 = f14, f83, f107 + } + { .mfi + (p14) PREFETCH [RPRE7], 16 * SIZE + (p17) FMA f110 = f14, f84, f110 + } + ;; + { .mfi + (p16) LDFPD f82, f83 = [AO7], 2 * SIZE + (p17) FMA f113 = f14, f85, f113 + } + { .mfi + (p17) FMA f116 = f14, f86, f116 + } + ;; + { .mfi + (p16) LDFPD f84, f85 = [AO7], 2 * SIZE + (p17) FMA f119 = f14, f87, f119 + } + { .mfi + (p17) FMA f122 = f14, f88, f122 + } + ;; + { .mfi + (p16) LDFPD f86, f87 = [AO7], 2 * SIZE + (p17) FMA f16 = f15, f89, f101 + } + { .mfi + (p16) LDFD f88 = [AO8], 1 * SIZE + (p17) FMA f17 = f15, f90, f104 + } + ;; + { .mfi + (p16) LDFPD f89, f90 = [AO8], 2 * SIZE + (p17) FMA f18 = f15, f91, f107 + } + { .mfi + (p15) PREFETCH [RPRE8], 16 * SIZE + (p17) FMA f19 = f15, f92, f110 + } + ;; + { .mfi + (p16) LDFPD f91, f92 = [AO8], 2 * SIZE + (p17) FMA f20 = f15, f93, f113 + } + { .mfi + (p14) lfetch.excl.nt2 [PREB], 16 * SIZE + (p17) FMA f21 = f15, f94, f116 + } + ;; + { .mfi + (p16) LDFPD f93, f94 = [AO8], 2 * SIZE + (p17) FMA f22 = f15, f95, f119 + } + { .mfb + (p16) adds I = -1, I + (p17) FMA f23 = f15, f96, f122 + br.ctop.sptk.few .L112 + } + ;; + .align 16 + +.L115: + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p18) STFD [YST1] = f16, 1 * SIZE + cmp.lt p6, p0 = 1, J + adds J = -1, J + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p18) STFD [YST1] = f17, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f18, 1 * SIZE + (p13) LDFD f34 = [AO2], 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p13) LDFPD f35, f50 = [AO2], 2 * SIZE + (p13) LDFPD f36, f37 = [AO3], 2 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f19, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p15) LDFD f80 = [AO1] + (p15) LDFD f106 = [YLD1], 1 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f20, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p13) LDFD f51 = [AO2], 1 * SIZE + (p13) LDFPD f52, f53 = [AO3], 2 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f21, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p14) LDFD f66 = [AO2], 1 * SIZE + (p14) LDFPD f68, f69 = [AO3], 2 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f22, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p14) LDFD f67 = [AO2], 1 * SIZE + (p15) LDFD f82 = [AO3] + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f23, 1 * SIZE + nop __LINE__ + } + ;; + { .mmf + (p15) LDFD f81 = [AO2] + (p13) LDFD f38 = [AO4], 1 * SIZE + (p13) FMA f100 = f8, f32, f100 + } + { .mfi + (p13) LDFPD f40, f41 = [AO5], 2 * SIZE + (p13) FMA f101 = f8, f33, f101 + nop __LINE__ + } + ;; + { .mfi + (p13) LDFPD f39, f54 = [AO4], 2 * SIZE + (p13) FMA f102 = f8, f48, f102 + nop __LINE__ + } + { .mfi + (p13) LDFPD f56, f57 = [AO5], 2 * SIZE + (p13) FMA f103 = f8, f49, f103 + nop __LINE__ + } + ;; + { .mfi + (p13) LDFD f55 = [AO4], 1 * SIZE + (p14) FMA f104 = f8, f64, f104 + nop __LINE__ + } + { .mfi + (p14) LDFPD f72, f73 = [AO5], 2 * SIZE + (p14) FMA f105 = f8, f65, f105 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFD f70 = [AO4], 1 * SIZE + (p15) FMA f106 = f8, f80, f106 + nop __LINE__ + } + { .mmi + (p15) LDFD f84 = [AO5] + (p13) LDFD f42 = [AO6], 1 * SIZE + nop __LINE__ + } + ;; + { .mmf + (p13) LDFPD f43, f58 = [AO6], 2 * SIZE + (p14) LDFD f71 = [AO4], 1 * SIZE + (p13) FMA f100 = f9, f34, f100 + } + { .mfi + (p13) LDFPD f44, f45 = [AO7], 2 * SIZE + (p13) FMA f101 = f9, f35, f101 + nop __LINE__ + } + ;; + { .mmf + (p13) LDFD f59 = [AO6], 1 * SIZE + (p15) LDFD f83 = [AO4] + (p13) FMA f102 = f9, f50, f102 + } + { .mfi + (p13) LDFPD f60, f61 = [AO7], 2 * SIZE + (p13) FMA f103 = f9, f51, f103 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFD f74 = [AO6], 1 * SIZE + (p14) FMA f104 = f9, f66, f104 + nop __LINE__ + } + { .mfi + (p14) LDFPD f76, f77 = [AO7], 2 * SIZE + (p14) FMA f105 = f9, f67, f105 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFD f75 = [AO6], 1 * SIZE + (p15) FMA f106 = f9, f81, f106 + nop __LINE__ + } + { .mmi + (p15) LDFD f86 = [AO7] + (p13) LDFD f46 = [AO8], 1 * SIZE + nop __LINE__ + } + ;; + { .mmf + (p13) LDFPD f47, f62 = [AO8], 2 * SIZE + (p15) LDFD f85 = [AO6] + (p13) FMA f100 = f10, f36, f100 + } + { .mfi + (p13) FMA f101 = f10, f37, f101 + nop __LINE__ + } + ;; + { .mfi + (p13) LDFD f63 = [AO8], 1 * SIZE + (p13) FMA f102 = f10, f52, f102 + nop __LINE__ + } + { .mfi + (p13) FMA f103 = f10, f53, f103 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFD f78 = [AO8], 1 * SIZE + (p14) FMA f104 = f10, f68, f104 + nop __LINE__ + } + { .mfi + (p14) FMA f105 = f10, f69, f105 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFD f79 = [AO8], 1 * SIZE + (p15) FMA f106 = f10, f82, f106 + nop __LINE__ + } + ;; + (p15) LDFD f87 = [AO8] + (p13) FMA f100 = f11, f38, f100 + (p13) FMA f101 = f11, f39, f101 + (p13) FMA f102 = f11, f54, f102 + (p13) FMA f103 = f11, f55, f103 + (p14) FMA f104 = f11, f70, f104 + (p14) FMA f105 = f11, f71, f105 + (p15) FMA f106 = f11, f83, f106 + ;; + (p13) FMA f100 = f12, f40, f100 + (p13) FMA f101 = f12, f41, f101 + (p13) FMA f102 = f12, f56, f102 + (p13) FMA f103 = f12, f57, f103 + (p14) FMA f104 = f12, f72, f104 + (p14) FMA f105 = f12, f73, f105 + (p15) FMA f106 = f12, f84, f106 + ;; + (p13) FMA f100 = f13, f42, f100 + (p13) FMA f101 = f13, f43, f101 + (p13) FMA f102 = f13, f58, f102 + (p13) FMA f103 = f13, f59, f103 + (p14) FMA f104 = f13, f74, f104 + (p14) FMA f105 = f13, f75, f105 + (p15) FMA f106 = f13, f85, f106 + ;; + (p13) FMA f100 = f14, f44, f100 + (p13) FMA f101 = f14, f45, f101 + (p13) FMA f102 = f14, f60, f102 + (p13) FMA f103 = f14, f61, f103 + (p14) FMA f104 = f14, f76, f104 + (p14) FMA f105 = f14, f77, f105 + (p15) FMA f106 = f14, f86, f106 + ;; + (p13) FMA f100 = f15, f46, f100 + (p13) FMA f101 = f15, f47, f101 + (p13) FMA f102 = f15, f62, f102 + (p13) FMA f103 = f15, f63, f103 + (p14) FMA f104 = f15, f78, f104 + (p14) FMA f105 = f15, f79, f105 + (p15) FMA f106 = f15, f87, f106 + ;; + (p13) STFD [YST1] = f100, 1 * SIZE + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + (p6) br.cond.dptk .L111 + ;; + .align 16 + +.L120: + { .mmi + mov YLD1 = YY + mov YST1 = YY + tbit.z p6, p0 = N, 2 + } + ;; + { .mib + mov AO1 = A + mov pr.rot= 0 + (p6) br.cond.dpnt .L130 + } + ;; + { .mmi + LDFD f8 = [X], INCX + (p8) LDFD f106 = [YLD1], 1 * SIZE + add AO2 = LDA, A + } + ;; + { .mmi + LDFD f9 = [X], INCX + (p8) LDFD f80 = [AO1], 1 * SIZE + shladd AO4 = LDA, 1, AO2 + } + ;; + { .mmi + LDFD f10 = [X], INCX + (p8) LDFD f81 = [AO2], 1 * SIZE + shladd AO3 = LDA, 1, A + } + ;; + { .mmi + LDFD f11 = [X], INCX + (p8) LDFD f82 = [AO3], 1 * SIZE + } + ;; + { .mfi + (p8) LDFD f83 = [AO4], 1 * SIZE + FMPY f8 = ALPHA, f8 + adds PREB = RPREFETCH * SIZE, YLD1 + } + { .mfi + adds RPRE1 = RPREFETCH * SIZE, AO1 + FMPY f9 = ALPHA, f9 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + } + ;; + FMPY f10 = ALPHA, f10 + shladd A = LDA, 2, A + FMPY f11 = ALPHA, f11 + ;; + { .mfi + adds RPRE3 = RPREFETCH * SIZE, AO3 + (p8) FMA f106 = f8, f80, f106 + mov ar.ec= 2 + } + ;; + adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 + (p8) FMA f106 = f9, f81, f106 + shr I = MM, 3 + ;; + { .mmf + cmp.eq p6, p0 = 0, I + cmp.eq p16, p0 = r0, r0 + (p8) FMA f106 = f10, f82, f106 + } + ;; + { .mfi + adds I = -1, I + (p8) FMA f106 = f11, f83, f106 + tbit.nz p13, p0 = MM, 2 + } + ;; + { .mib + (p8) STFD [YST1] = f106, 1 * SIZE + mov ar.lc = I + (p6) br.cond.dpnt .L125 + } + ;; + .align 16 + +.L122: + { .mfi + (p17) LDFD f64 = [AO4], 1 * SIZE + (p17) FMA f101 = f8, f33, f101 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mfi + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p17) FMA f104 = f8, f34, f104 + } + ;; + { .mfi + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f107 = f8, f35, f107 + (p16) adds I = -1, I + } + { .mfi + (p14) PREFETCH [RPRE1], 16 * SIZE + (p17) FMA f110 = f8, f36, f110 + } + ;; + { .mfi + (p16) LDFPD f34, f35 = [AO1], 2 * SIZE + (p17) FMA f113 = f8, f37, f113 + } + { .mfi + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p17) FMA f116 = f8, f38, f116 + } + ;; + { .mfi + (p16) LDFPD f36, f37 = [AO1], 2 * SIZE + (p17) FMA f119 = f8, f39, f119 + } + { .mfi + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p17) FMA f122 = f8, f40, f122 + } + ;; + { .mfi + (p16) LDFPD f38, f39 = [AO1], 2 * SIZE + (p17) FMA f101 = f9, f41, f101 + } + { .mmf + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p16) LDFD f40 = [AO2], 1 * SIZE + (p17) FMA f104 = f9, f42, f104 + } + ;; + { .mmf + (p16) LDFPD f41, f42 = [AO2], 2 * SIZE + (p15) PREFETCH [RPRE2], 16 * SIZE + (p17) FMA f107 = f9, f43, f107 + } + { .mfi + (p18) STFD [YST1] = f16, 1 * SIZE + (p17) FMA f110 = f9, f44, f110 + } + ;; + { .mfi + (p16) LDFPD f43, f44 = [AO2], 2 * SIZE + (p17) FMA f113 = f9, f45, f113 + } + { .mfi + (p18) STFD [YST1] = f17, 1 * SIZE + (p17) FMA f116 = f9, f46, f116 + } + ;; + { .mfi + (p16) LDFPD f45, f46 = [AO2], 2 * SIZE + (p17) FMA f119 = f9, f47, f119 + } + { .mfi + (p18) STFD [YST1] = f18, 1 * SIZE + (p17) FMA f122 = f9, f48, f122 + } + ;; + { .mfi + (p16) LDFD f47 = [AO2], 1 * SIZE + (p17) FMA f101 = f10, f49, f101 + } + { .mfi + (p14) lfetch.excl.nt2 [PREB], 16 * SIZE + (p17) FMA f104 = f10, f50, f104 + } + ;; + { .mfi + (p16) LDFPD f48, f49 = [AO3], 2 * SIZE + (p17) FMA f107 = f10, f51, f107 + } + { .mfi + (p14) PREFETCH [RPRE3], 16 * SIZE + (p17) FMA f110 = f10, f52, f110 + } + ;; + { .mfi + (p16) LDFPD f50, f51 = [AO3], 2 * SIZE + (p17) FMA f113 = f10, f53, f113 + } + { .mfi + (p18) STFD [YST1] = f19, 1 * SIZE + (p17) FMA f116 = f10, f54, f116 + } + ;; + { .mfi + (p16) LDFPD f52, f53 = [AO3], 2 * SIZE + (p17) FMA f119 = f10, f55, f119 + } + { .mfi + (p18) STFD [YST1] = f20, 1 * SIZE + (p17) FMA f122 = f10, f56, f122 + } + ;; + { .mfi + (p16) LDFPD f54, f55 = [AO3], 2 * SIZE + (p17) FMA f16 = f11, f57, f101 + } + { .mmf + (p15) PREFETCH [RPRE4], 16 * SIZE + (p16) LDFD f56 = [AO4], 1 * SIZE + (p17) FMA f17 = f11, f58, f104 + } + ;; + { .mfi + (p16) LDFPD f57, f58 = [AO4], 2 * SIZE + (p17) FMA f18 = f11, f59, f107 + } + { .mfi + (p18) STFD [YST1] = f21, 1 * SIZE + (p17) FMA f19 = f11, f60, f110 + } + ;; + { .mfi + (p16) LDFPD f59, f60 = [AO4], 2 * SIZE + (p17) FMA f20 = f11, f61, f113 + } + { .mfi + (p18) STFD [YST1] = f22, 1 * SIZE + (p17) FMA f21 = f11, f62, f116 + } + ;; + { .mfi + (p16) LDFPD f61, f62 = [AO4], 2 * SIZE + (p17) FMA f22 = f11, f63, f119 + } + { .mfb + (p18) STFD [YST1] = f23, 1 * SIZE + (p17) FMA f23 = f11, f64, f122 + br.ctop.sptk.few .L122 + } + ;; + .align 16 + +.L125: + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p18) STFD [YST1] = f16, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p18) STFD [YST1] = f17, 1 * SIZE + } + ;; + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f18, 1 * SIZE + } + ;; + { .mmi + (p18) STFD [YST1] = f19, 1 * SIZE + (p15) LDFD f80 = [AO1] + } + { .mmi + (p15) LDFD f106 = [YLD1], 1 * SIZE + (p13) LDFD f34 = [AO2], 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f35, f50 = [AO2], 2 * SIZE + (p13) LDFPD f36, f37 = [AO3], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f20, 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f51 = [AO2], 1 * SIZE + (p13) LDFPD f52, f53 = [AO3], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f21, 1 * SIZE + } + ;; + { .mmi + (p14) LDFD f66 = [AO2], 1 * SIZE + (p14) LDFPD f68, f69 = [AO3], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f22, 1 * SIZE + } + ;; + { .mmf + (p18) STFD [YST1] = f23, 1 * SIZE + (p14) LDFD f67 = [AO2], 1 * SIZE + (p13) FMA f100 = f8, f32, f100 + } + { .mmf + (p15) LDFD f82 = [AO3] + (p13) LDFD f38 = [AO4], 1 * SIZE + (p13) FMA f101 = f8, f33, f101 + } + ;; + ;; + { .mmf + (p13) LDFPD f39, f54 = [AO4], 2 * SIZE + (p15) LDFD f81 = [AO2] + (p13) FMA f102 = f8, f48, f102 + } + { .mfi + (p13) FMA f103 = f8, f49, f103 + } + ;; + { .mfi + (p13) LDFD f55 = [AO4], 1 * SIZE + (p14) FMA f104 = f8, f64, f104 + } + { .mfi + (p14) FMA f105 = f8, f65, f105 + } + ;; + { .mfi + (p14) LDFD f70 = [AO4], 1 * SIZE + (p15) FMA f106 = f8, f80, f106 + } + { .mfi + (p13) FMA f100 = f9, f34, f100 + } + ;; + { .mfi + (p14) LDFD f71 = [AO4], 1 * SIZE + (p13) FMA f101 = f9, f35, f101 + } + { .mfi + (p13) FMA f102 = f9, f50, f102 + } + ;; + (p15) LDFD f83 = [AO4] + (p13) FMA f103 = f9, f51, f103 + (p14) FMA f104 = f9, f66, f104 + (p14) FMA f105 = f9, f67, f105 + (p15) FMA f106 = f9, f81, f106 + ;; + (p13) FMA f100 = f10, f36, f100 + (p13) FMA f101 = f10, f37, f101 + (p13) FMA f102 = f10, f52, f102 + (p13) FMA f103 = f10, f53, f103 + (p14) FMA f104 = f10, f68, f104 + (p14) FMA f105 = f10, f69, f105 + (p15) FMA f106 = f10, f82, f106 + ;; + (p13) FMA f100 = f11, f38, f100 + (p13) FMA f101 = f11, f39, f101 + ;; + (p13) FMA f102 = f11, f54, f102 + (p13) STFD [YST1] = f100, 1 * SIZE + (p13) FMA f103 = f11, f55, f103 + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + (p14) FMA f104 = f11, f70, f104 + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + (p14) FMA f105 = f11, f71, f105 + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + (p15) FMA f106 = f11, f83, f106 + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + ;; + .align 16 + +.L130: + { .mmi + mov YLD1 = YY + mov YST1 = YY + tbit.z p6, p0 = N, 1 + } + ;; + { .mib + mov AO1 = A + mov pr.rot= 0 + (p6) br.cond.dpnt .L140 + } + ;; + { .mmi + LDFD f8 = [X], INCX + (p8) LDFD f106 = [YLD1], 1 * SIZE + add AO2 = LDA, A + } + ;; + { .mmi + LDFD f9 = [X], INCX + (p8) LDFD f80 = [AO1], 1 * SIZE + shladd A = LDA, 1, A + } + ;; + adds PREB = RPREFETCH * SIZE, YLD1 + FMPY f8 = ALPHA, f8 + mov ar.ec= 2 + adds RPRE1 = RPREFETCH * SIZE, AO1 + FMPY f9 = ALPHA, f9 + shr I = MM, 3 + ;; + (p8) LDFD f81 = [AO2], 1 * SIZE + cmp.eq p6, p0 = 0, I + ;; + (p8) FMA f106 = f8, f80, f106 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + tbit.nz p13, p0 = MM, 2 + ;; + (p8) FMA f106 = f9, f81, f106 + cmp.eq p16, p0 = r0, r0 + adds I = -1, I + ;; + { .mib + (p8) STFD [YST1] = f106, 1 * SIZE + mov ar.lc = I + (p6) br.cond.dpnt .L135 + } + ;; + .align 16 + +.L132: + { .mfi + (p17) LDFD f48 = [AO2], 1 * SIZE + (p17) FMA f101 = f8, f33, f101 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mmf + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p18) STFD [YST1] = f16, 1 * SIZE + (p17) FMA f104 = f8, f34, f104 + } + ;; + { .mfi + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f107 = f8, f35, f107 + adds I = -1, I + } + { .mmf + (p14) PREFETCH [RPRE1], 16 * SIZE + (p18) STFD [YST1] = f17, 1 * SIZE + (p17) FMA f110 = f8, f36, f110 + } + ;; + { .mfi + (p16) LDFPD f34, f35 = [AO1], 2 * SIZE + (p17) FMA f113 = f8, f37, f113 + } + { .mmf + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p18) STFD [YST1] = f18, 1 * SIZE + (p17) FMA f116 = f8, f38, f116 + } + ;; + { .mfi + (p16) LDFPD f36, f37 = [AO1], 2 * SIZE + (p17) FMA f119 = f8, f39, f119 + } + { .mmf + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p18) STFD [YST1] = f19, 1 * SIZE + (p17) FMA f122 = f8, f40, f122 + } + ;; + { .mmf + (p16) LDFPD f38, f39 = [AO1], 2 * SIZE + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p17) FMA f16 = f9, f41, f101 + } + { .mmf + (p18) STFD [YST1] = f20, 1 * SIZE + (p16) LDFD f40 = [AO2], 1 * SIZE + (p17) FMA f17 = f9, f42, f104 + } + ;; + { .mfi + (p16) LDFPD f41, f42 = [AO2], 2 * SIZE + (p17) FMA f18 = f9, f43, f107 + } + { .mmf + (p15) PREFETCH [RPRE2], 16 * SIZE + (p18) STFD [YST1] = f21, 1 * SIZE + (p17) FMA f19 = f9, f44, f110 + } + ;; + { .mfi + (p16) LDFPD f43, f44 = [AO2], 2 * SIZE + (p17) FMA f20 = f9, f45, f113 + } + { .mmf + (p14) PREFETCH [PREB], 16 * SIZE + (p18) STFD [YST1] = f22, 1 * SIZE + (p17) FMA f21 = f9, f46, f116 + } + ;; + { .mfi + (p16) LDFPD f45, f46 = [AO2], 2 * SIZE + (p17) FMA f22 = f9, f47, f119 + } + { .mfb + (p18) STFD [YST1] = f23, 1 * SIZE + (p17) FMA f23 = f9, f48, f122 + br.ctop.sptk.few .L132 + } + ;; + .align 16 + +.L135: + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p18) STFD [YST1] = f16, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p18) STFD [YST1] = f17, 1 * SIZE + } + ;; + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f18, 1 * SIZE + } + ;; + { .mmi + (p15) LDFD f80 = [AO1] + (p15) LDFD f106 = [YLD1], 1 * SIZE + } + { .mmi + (p18) STFD [YST1] = f19, 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f34 = [AO2], 1 * SIZE + (p18) STFD [YST1] = f20, 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f35 = [AO2], 1 * SIZE + (p18) STFD [YST1] = f21, 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f50 = [AO2], 1 * SIZE + (p18) STFD [YST1] = f22, 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f51 = [AO2], 1 * SIZE + (p18) STFD [YST1] = f23, 1 * SIZE + } + ;; + (p14) LDFD f66 = [AO2], 1 * SIZE + (p13) FMA f100 = f8, f32, f100 + ;; + (p14) LDFD f67 = [AO2], 1 * SIZE + (p13) FMA f101 = f8, f33, f101 + ;; + (p15) LDFD f81 = [AO2] + (p13) FMA f102 = f8, f48, f102 + (p13) FMA f103 = f8, f49, f103 + (p14) FMA f104 = f8, f64, f104 + (p14) FMA f105 = f8, f65, f105 + (p15) FMA f106 = f8, f80, f106 + ;; + (p13) FMA f100 = f9, f34, f100 + (p13) FMA f101 = f9, f35, f101 + (p13) FMA f102 = f9, f50, f102 + (p13) FMA f103 = f9, f51, f103 + (p14) FMA f104 = f9, f66, f104 + (p14) FMA f105 = f9, f67, f105 + (p15) FMA f106 = f9, f81, f106 + ;; + (p13) STFD [YST1] = f100, 1 * SIZE + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + ;; + .align 16 + +.L140: + { .mmi + mov YLD1 = YY + mov YST1 = YY + tbit.z p6, p0 = N, 0 + } + ;; + { .mib + mov AO1 = A + mov pr.rot= 0 + (p6) br.cond.dpnt .L990 + } + ;; + { .mmi + LDFD f8 = [X], INCX + (p8) LDFD f106 = [YLD1], 1 * SIZE + adds RPRE1 = RPREFETCH * SIZE, AO1 + } + ;; + { .mmi + (p8) LDFD f80 = [AO1], 1 * SIZE + adds PREB = RPREFETCH * SIZE, YLD1 + } + ;; + FMPY f8 = ALPHA, f8 + shr I = MM, 3 + ;; + (p8) FMA f106 = f8, f80, f106 + mov ar.ec= 3 + ;; + { .mmi + cmp.eq p6, p0 = 0, I + cmp.eq p16, p0 = r0, r0 + tbit.nz p14, p15 = r0, 0 + } + ;; + { .mmi + adds YST2 = 4 * SIZE, YST1 + adds I = -1, I + tbit.nz p13, p0 = MM, 2 + } + ;; + { .mmi + (p8) STFD [YST1] = f106, 1 * SIZE + (p8) adds YST2 = 1 * SIZE, YST2 + } + { .mib + mov ar.lc = I + (p6) br.cond.dpnt .L145 + } + ;; + .align 16 + +.L142: + { .mmf + (p19) STFD [YST1] = f16, 1 * SIZE + (p19) STFD [YST2] = f20, 1 * SIZE + (p18) FMA f16 = f8, f34, f102 + } + { .mmf + (p16) LDFPD f32, f35 = [AO1], 2 * SIZE + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p18) FMA f20 = f8, f46, f114 + } + ;; + { .mmf + (p19) STFD [YST1] = f17, 1 * SIZE + (p19) STFD [YST2] = f21, 1 * SIZE + (p18) FMA f17 = f8, f37, f105 + } + { .mmf + (p16) LDFPD f38, f41 = [AO1], 2 * SIZE + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p18) FMA f21 = f8, f49, f117 + } + ;; + { .mmf + (p19) STFD [YST1] = f18, 1 * SIZE + (p19) STFD [YST2] = f22, 1 * SIZE + (p18) FMA f18 = f8, f40, f108 + } + { .mmf + (p16) LDFPD f44, f47 = [AO1], 2 * SIZE + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p18) FMA f22 = f8, f52, f120 + } + ;; + { .mmf + (p19) STFD [YST1] = f19, 5 * SIZE + (p19) STFD [YST2] = f23, 5 * SIZE + (p18) FMA f19 = f8, f43, f111 + } + { .mmf + (p16) LDFPD f50, f53 = [AO1], 2 * SIZE + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p18) FMA f23 = f8, f55, f123 + } + ;; + { .mmi + (p14) PREFETCH [RPRE1], 16 * SIZE + (p14) PREFETCH [PREB], 16 * SIZE + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mib + nop __LINE__ + (p16) adds I = -1, I + br.ctop.sptk.few .L142 + } + ;; + .align 16 + +.L145: + { .mmi + (p19) STFD [YST1] = f16, 1 * SIZE + (p19) STFD [YST2] = f20, 1 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + } + ;; + { .mmi + (p19) STFD [YST1] = f17, 1 * SIZE + (p19) STFD [YST2] = f21, 1 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + } + ;; + { .mmi + (p19) STFD [YST1] = f18, 1 * SIZE + (p19) STFD [YST2] = f22, 1 * SIZE + } + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + } + ;; + { .mmi + (p19) STFD [YST1] = f19, 5 * SIZE + (p19) STFD [YST2] = f23, 5 * SIZE + } + { .mmi + (p15) LDFD f80 = [AO1] + (p15) LDFD f106 = [YLD1], 1 * SIZE + } + ;; + (p13) FMA f100 = f8, f32, f100 + (p13) FMA f101 = f8, f33, f101 + (p13) FMA f102 = f8, f48, f102 + (p13) FMA f103 = f8, f49, f103 + (p14) FMA f104 = f8, f64, f104 + (p14) FMA f105 = f8, f65, f105 + (p15) FMA f106 = f8, f80, f106 + ;; + (p13) STFD [YST1] = f100, 1 * SIZE + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + ;; + .align 16 + +.L990: + { .mmi + mov YLD1 = YY + mov YST1 = Y + mov pr.rot= 0 + } + { .mib + mov YST2 = Y + shr J = M, 3 + (p10) br.cond.dptk .L999 + } + ;; + { .mmi + cmp.eq p6, p0 = r0, J + adds J = -1, J + mov ar.ec = 4 + } + { .mmi + cmp.eq p16, p0 = r0, r0 + nop __LINE__ + tbit.nz p13, p0 = M, 2 + } + ;; + { .mib + nop __LINE__ + mov ar.lc = J + (p6) br.cond.dpnt .L995 + } + ;; +.L992: + { .mfi + (p19) STFD [YST2] = f35 + (p18) FADD f34 = f34, f66 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f64 = [YLD1], 1 * SIZE + (p16) LDFD f32 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f39 + (p18) FADD f38 = f38, f70 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f36 = [YST1], INCY + (p16) LDFD f68 = [YLD1], 1 * SIZE + } + ;; + { .mfi + (p19) STFD [YST2] = f43 + (p18) FADD f42 = f42, f74 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f72 = [YLD1], 1 * SIZE + (p16) LDFD f40 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f47 + (p18) FADD f46 = f46, f78 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f76 = [YLD1], 1 * SIZE + (p16) LDFD f44 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f51 + (p18) FADD f50 = f50, f82 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f80 = [YLD1], 1 * SIZE + (p16) LDFD f48 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f55 + (p18) FADD f54 = f54, f86 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f84 = [YLD1], 1 * SIZE + (p16) LDFD f52 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f59 + (p18) FADD f58 = f58, f90 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f88 = [YLD1], 1 * SIZE + (p16) LDFD f56 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f63 + (p18) FADD f62 = f62, f94 + (p19) add YST2 = YST2, INCY + } + { .mmb + (p16) LDFD f92 = [YLD1], 1 * SIZE + (p16) LDFD f60 = [YST1], INCY + br.ctop.sptk.few .L992 + } + ;; + +.L995: + (p13) LDFD f32 = [YST1], INCY + (p13) LDFD f40 = [YLD1], 1 * SIZE + tbit.nz p14, p0 = M, 1 + ;; + (p13) LDFD f33 = [YST1], INCY + (p13) LDFD f41 = [YLD1], 1 * SIZE + tbit.nz p15, p0 = M, 0 + ;; + (p13) LDFD f34 = [YST1], INCY + (p13) LDFD f42 = [YLD1], 1 * SIZE + ;; + (p13) LDFD f35 = [YST1], INCY + (p13) LDFD f43 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f36 = [YST1], INCY + (p14) LDFD f44 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f37 = [YST1], INCY + (p14) LDFD f45 = [YLD1], 1 * SIZE + ;; + (p15) LDFD f38 = [YST1], INCY + (p15) LDFD f46 = [YLD1], 1 * SIZE + ;; + (p13) FADD f32 = f32, f40 + (p13) FADD f33 = f33, f41 + (p13) FADD f34 = f34, f42 + (p13) FADD f35 = f35, f43 + (p14) FADD f36 = f36, f44 + (p14) FADD f37 = f37, f45 + (p15) FADD f38 = f38, f46 + ;; + (p13) STFD [YST2] = f32 + (p13) add YST2 = YST2, INCY + ;; + (p13) STFD [YST2] = f33 + (p13) add YST2 = YST2, INCY + ;; + (p13) STFD [YST2] = f34 + (p13) add YST2 = YST2, INCY + ;; + (p13) STFD [YST2] = f35 + (p13) add YST2 = YST2, INCY + ;; + (p14) STFD [YST2] = f36 + (p14) add YST2 = YST2, INCY + ;; + (p14) STFD [YST2] = f37 + (p14) add YST2 = YST2, INCY + ;; + (p15) STFD [YST2] = f38 + ;; + +.L999: + mov r8 = r0 + adds r9 = 1 * 16, SP + ;; + ldf.fill f16 = [SP], 32 + ldf.fill f17 = [r9], 32 + mov ar.lc = ARLC + ;; + ldf.fill f18 = [SP], 32 + ldf.fill f19 = [r9], 32 + mov pr = PR, -1 + ;; + ldf.fill f20 = [SP], 32 + ldf.fill f21 = [r9], 32 + mov ar.pfs = ARPFS + ;; + ldf.fill f22 = [SP], 32 + ldf.fill f23 = [r9] + br.ret.sptk.many b0 + ;; + EPILOGUE diff --git a/kernel/ia64/staticbuffer.S b/kernel/ia64/staticbuffer.S new file mode 100644 index 0000000..a30bb74 --- /dev/null +++ b/kernel/ia64/staticbuffer.S @@ -0,0 +1,45 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ALLOC_STATIC + .align 1024 + .comm alloc_area, (NUM_BUFFERS * BUFFER_SIZE), 4096 +#endif diff --git a/kernel/ia64/swap.S b/kernel/ia64/swap.S new file mode 100644 index 0000000..585f418 --- /dev/null +++ b/kernel/ia64/swap.S @@ -0,0 +1,577 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef XDOUBLE +#define PREFETCH_SIZE ( 8 * 16) +#elif defined(DOUBLE) +#define PREFETCH_SIZE (16 * 16) +#else +#define PREFETCH_SIZE (32 * 16) +#endif + +#define SP r12 + +#ifndef XDOUBLE +#define N r32 +#define X1 r36 +#define INCX r37 +#define Y1 r38 +#define INCY r39 +#else +#define N r32 +#define X1 r38 +#define INCX r39 +#define Y1 r33 +#define INCY r34 +#endif + +#define PRE1 r2 +#define PRE2 r3 + +#define I r14 +#define J r15 + +#define X2 r16 +#define Y2 r17 +#define X3 r18 +#define Y3 r19 +#define X4 r20 +#define Y4 r21 + +#define YY r22 +#define XX r23 +#define INCX5 r24 +#define INCY5 r25 +#define INCX16 r26 +#define INCY16 r27 +#define XYSUB r28 + +#define PR r30 +#define ARLC r31 + + PROLOGUE + .prologue + PROFCODE + +#ifdef XDOUBLE + adds r8 = 16, SP + adds r9 = 24, SP + ;; + ld8 Y1 = [r8] + ld8 INCY = [r9] + ;; +#endif + { .mmi + shladd INCX = INCX, BASE_SHIFT, r0 + shladd INCY = INCY, BASE_SHIFT, r0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mib + cmp.lt p0, p6 = r0, N + tbit.z p0, p8 = Y1, BASE_SHIFT + (p6) br.ret.sptk.many b0 + } + ;; + .body + { .mmi + shladd INCX16 = INCX, 4, r0 + shladd INCY16 = INCY, 4, r0 + mov PR = pr + } + { .mmi + sub XYSUB = X1, Y1 + mov X3 = X1 + shr I = N, 4 + } + ;; + { .mmi + shladd INCX5 = INCX, 2, INCX + shladd INCY5 = INCY, 2, INCY + mov pr.rot= 0 + } + { .mmi + adds I = -1, I + and J = 15, N + extr XYSUB = XYSUB, BASE_SHIFT, 6 + } + ;; + { .mmi + shladd X2 = INCX, 2, X1 + shladd Y2 = INCY, 2, Y1 + mov ar.lc = I + } + { .mmi + shladd X4 = INCX, 2, X1 + shladd Y4 = INCY, 2, Y1 + cmp.eq p16, p0 = r0, r0 + } + ;; + { .mmi + shladd PRE2 = XYSUB, BASE_SHIFT, Y1 + cmp.lt p8 ,p0 = 28, XYSUB + mov Y3 = Y1 + } + ;; + { .mmi + adds PRE1 = (PREFETCH_SIZE + 4) * SIZE, X1 + adds PRE2 = (PREFETCH_SIZE - 12) * SIZE, PRE2 + mov ar.ec= 2 + } + { .mib + cmp.eq p9 ,p0 = -1, I + tbit.z p0, p12 = N, 3 + (p9) br.cond.dpnt .L15 + } + ;; + .align 16 + +.L12: + { .mmi + (p18) STFD [Y3] = f56 + (p18) STFD [Y4] = f64 + (p18) add Y3 = Y3, INCY5 + } + { .mmi + (p16) LDFD f32 = [X1], INCX + (p16) LDFD f40 = [X2], INCX + (p18) add Y4 = Y4, INCY5 + } + ;; + { .mmi + (p17) STFD [X3] = f65 + (p17) STFD [X4] = f73 + (p17) add X3 = X3, INCX + } + { .mmi + (p16) LDFD f64 = [Y1], INCY + (p16) LDFD f72 = [Y2], INCY + (p17) add X4 = X4, INCX + } + ;; + { .mmi + (p17) STFD [Y3] = f33 + (p17) STFD [Y4] = f41 + (p17) add Y3 = Y3, INCY + } + { .mmi + (p16) LDFD f34 = [X1], INCX + (p16) LDFD f42 = [X2], INCX + (p17) add Y4 = Y4, INCY + } + ;; + { .mmi + (p17) STFD [X3] = f67 + (p17) STFD [X4] = f75 + (p17) add X3 = X3, INCX + } + { .mmi + (p16) LDFD f66 = [Y1], INCY + (p16) LDFD f74 = [Y2], INCY + (p17) add X4 = X4, INCX + } + ;; + { .mmi + (p17) STFD [Y3] = f35 + (p17) STFD [Y4] = f43 + (p17) add Y3 = Y3, INCY + } + { .mmi + (p16) LDFD f36 = [X1], INCX + (p16) LDFD f44 = [X2], INCX + (p17) add Y4 = Y4, INCY + } + ;; + { .mmi + (p17) STFD [X3] = f69 + (p17) STFD [X4] = f77 + (p17) add X3 = X3, INCX + } + { .mmi + (p16) LDFD f68 = [Y1], INCY + (p16) LDFD f76 = [Y2], INCY + (p17) add X4 = X4, INCX + } + ;; + { .mmi + (p17) STFD [Y3] = f37 + (p17) STFD [Y4] = f45 + (p17) add Y3 = Y3, INCY + } + { .mmi + (p16) LDFD f38 = [X1], INCX5 + (p16) LDFD f46 = [X2], INCX5 + (p17) add Y4 = Y4, INCY + } + ;; + { .mmi + (p17) STFD [X3] = f71 + (p17) STFD [X4] = f79 + (p17) add X3 = X3, INCX5 + } + { .mmi + (p16) LDFD f70 = [Y1], INCY5 + (p16) LDFD f78 = [Y2], INCY5 + (p17) add X4 = X4, INCX5 + } + ;; + { .mmi + (p17) STFD [Y3] = f39 + (p17) STFD [Y4] = f47 + (p17) add Y3 = Y3, INCY5 + } + { .mmi + (p16) LDFD f48 = [X1], INCX + (p16) LDFD f56 = [X2], INCX + (p17) add Y4 = Y4, INCY5 + } + ;; + { .mmi + (p17) STFD [X3] = f81 + (p17) STFD [X4] = f89 + (p17) add X3 = X3, INCX + } + { .mmi + (p16) LDFD f80 = [Y1], INCY + (p16) LDFD f88 = [Y2], INCY + (p17) add X4 = X4, INCX + } + ;; + { .mmi + (p17) STFD [Y3] = f49 + (p17) STFD [Y4] = f57 + (p17) add Y3 = Y3, INCY + } + { .mmi + (p16) LDFD f50 = [X1], INCX + (p16) LDFD f58 = [X2], INCX + (p17) add Y4 = Y4, INCY + } + ;; + { .mmi + (p17) STFD [X3] = f83 + (p17) STFD [X4] = f91 + (p17) add X3 = X3, INCX + } + { .mmi + (p16) LDFD f82 = [Y1], INCY + (p16) LDFD f90 = [Y2], INCY + (p17) add X4 = X4, INCX + } + ;; + { .mmi + (p17) STFD [Y3] = f51 + (p17) STFD [Y4] = f59 + (p17) add Y3 = Y3, INCY + } + { .mmi + (p16) LDFD f52 = [X1], INCX + (p16) LDFD f60 = [X2], INCX + (p17) add Y4 = Y4, INCY + } + ;; + { .mmi + (p17) STFD [X3] = f85 + (p17) STFD [X4] = f93 + (p17) add X3 = X3, INCX + } + { .mmi + (p16) LDFD f84 = [Y1], INCY + (p16) LDFD f92 = [Y2], INCY + (p17) add X4 = X4, INCX + } + ;; + { .mmi + (p16) lfetch.nt1 [PRE1] + (p16) lfetch.nt1 [PRE2] + (p16) shladd PRE1 = INCX, 4, PRE1 + } + { .mmi + (p16) LDFD f54 = [X1], INCX5 + (p16) LDFD f62 = [X2], INCX5 + (p16) shladd PRE2 = INCX, 4, PRE2 + } + ;; + { .mmi + (p17) STFD [Y3] = f53 + (p17) STFD [Y4] = f61 + (p17) add Y3 = Y3, INCY + } + { .mmi + (p16) LDFD f86 = [Y1], INCY5 + (p16) LDFD f94 = [Y2], INCY5 + (p17) add Y4 = Y4, INCY + } + ;; + { .mmi + (p17) STFD [X3] = f87 + (p17) STFD [X4] = f95 + (p17) add X3 = X3, INCX5 + } + { .mib + nop __LINE__ + (p17) add X4 = X4, INCX5 + br.ctop.sptk.few .L12 + } + ;; +.L15: + { .mmi + (p18) STFD [Y3] = f56 + (p18) STFD [Y4] = f64 + mov ar.lc = ARLC + } + { .mmi + (p12) LDFD f32 = [X1], INCX + (p12) LDFD f36 = [X2], INCX + cmp.eq p10, p0 = r0, J + } + ;; + { .mmi + (p12) LDFD f80 = [Y1], INCY + (p12) LDFD f84 = [Y2], INCY + (p18) add Y3 = Y3, INCY5 + } + { .mmi + (p12) LDFD f33 = [X1], INCX + (p12) LDFD f37 = [X2], INCX + (p18) add Y4 = Y4, INCY5 + } + ;; + { .mmi + (p12) LDFD f81 = [Y1], INCY + (p12) LDFD f85 = [Y2], INCY + mov pr = PR, -65474 + } + { .mmb + (p12) LDFD f34 = [X1], INCX + (p12) LDFD f38 = [X2], INCX + (p10) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFD f82 = [Y1], INCY + (p12) LDFD f86 = [Y2], INCY + tbit.z p0, p13 = N, 2 + } + { .mmi + (p12) LDFD f35 = [X1], INCX5 + (p12) LDFD f39 = [X2], INCX5 + tbit.z p0, p14 = N, 1 + } + ;; + { .mmi + (p12) LDFD f83 = [Y1], INCY5 + (p12) LDFD f87 = [Y2], INCY5 + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p13) LDFD f40 = [X1], INCX + (p13) LDFD f88 = [Y1], INCY + } + ;; + { .mmi + (p13) LDFD f41 = [X1], INCX + (p13) LDFD f89 = [Y1], INCY + } + ;; + { .mmi + (p12) STFD [Y3] = f32 + (p12) STFD [Y4] = f36 + (p12) add Y3 = Y3, INCY + } + { .mmi + (p13) LDFD f42 = [X1], INCX + (p13) LDFD f90 = [Y1], INCY + (p12) add Y4 = Y4, INCY + } + ;; + { .mmi + (p12) STFD [X3] = f80 + (p12) STFD [X4] = f84 + (p12) add X3 = X3, INCX + } + { .mmi + (p13) LDFD f43 = [X1], INCX + (p13) LDFD f91 = [Y1], INCY + (p12) add X4 = X4, INCX + } + ;; + { .mmi + (p12) STFD [Y3] = f33 + (p12) STFD [Y4] = f37 + (p12) add Y3 = Y3, INCY + } + { .mmi + (p14) LDFD f44 = [X1], INCX + (p14) LDFD f92 = [Y1], INCY + (p12) add Y4 = Y4, INCY + } + ;; + { .mmi + (p12) STFD [X3] = f81 + (p12) STFD [X4] = f85 + (p12) add X3 = X3, INCX + } + { .mmi + (p14) LDFD f45 = [X1], INCX + (p14) LDFD f93 = [Y1], INCY + (p12) add X4 = X4, INCX + } + ;; + { .mmi + (p12) STFD [X3] = f82 + (p12) STFD [X4] = f86 + (p12) add X3 = X3, INCX + } + { .mmi + (p15) LDFD f46 = [X1], INCX + (p15) LDFD f94 = [Y1], INCY + (p12) add X4 = X4, INCX + } + ;; + { .mmi + (p12) STFD [Y3] = f34 + (p12) STFD [Y4] = f38 + (p12) add Y3 = Y3, INCY + } + { .mmi + nop __LINE__ + nop __LINE__ + (p12) add Y4 = Y4, INCY + } + ;; + { .mmi + (p12) STFD [X3] = f83 + (p12) STFD [X4] = f87 + (p12) add X3 = X3, INCX5 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p12) add X4 = X4, INCX5 + } + ;; + { .mmi + (p12) STFD [Y3] = f35 + (p12) STFD [Y4] = f39 + (p12) add Y3 = Y3, INCY5 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p12) add Y4 = Y4, INCY5 + } + ;; + { .mmi + (p13) STFD [X3] = f88 + (p13) STFD [Y3] = f40 + (p13) add X3 = X3, INCX + } + { .mmi + nop __LINE__ + nop __LINE__ + (p13) add Y3 = Y3, INCY + } + ;; + { .mmi + (p13) STFD [X3] = f89 + (p13) STFD [Y3] = f41 + (p13) add X3 = X3, INCX + } + { .mmi + nop __LINE__ + nop __LINE__ + (p13) add Y3 = Y3, INCY + } + ;; + { .mmi + (p13) STFD [X3] = f90 + (p13) STFD [Y3] = f42 + (p13) add X3 = X3, INCX + } + { .mmi + nop __LINE__ + nop __LINE__ + (p13) add Y3 = Y3, INCY + } + ;; + { .mmi + (p13) STFD [X3] = f91 + (p13) STFD [Y3] = f43 + (p13) add X3 = X3, INCX + } + { .mmi + nop __LINE__ + nop __LINE__ + (p13) add Y3 = Y3, INCY + } + ;; + { .mmi + (p14) STFD [X3] = f92 + (p14) STFD [Y3] = f44 + (p14) add X3 = X3, INCX + } + { .mmi + nop __LINE__ + nop __LINE__ + (p14) add Y3 = Y3, INCY + } + ;; + { .mmi + (p14) STFD [X3] = f93 + (p14) STFD [Y3] = f45 + (p14) add X3 = X3, INCX + } + { .mmi + nop __LINE__ + nop __LINE__ + (p14) add Y3 = Y3, INCY + } + ;; + { .mmb + (p15) STFD [X3] = f94 + (p15) STFD [Y3] = f46 + br.ret.sptk.many b0 + } + ;; + EPILOGUE + diff --git a/kernel/ia64/symv_U.S b/kernel/ia64/symv_U.S new file mode 100644 index 0000000..4f6c451 --- /dev/null +++ b/kernel/ia64/symv_U.S @@ -0,0 +1,463 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define SP r12 + +#define M r32 +#define A r34 +#define LDA r35 +#define X r36 +#define INCX r37 +#define Y r38 +#define INCY r39 +#define BUFFER r33 + +#define I r14 +#define IS r15 +#define A1 r16 +#define A2 r17 +#define A3 r18 +#define A4 r19 + +#define NEW_X r20 +#define NEW_Y r21 +#define XX r22 +#define YY r23 +#define TEMP r24 +#define YYS r25 + +#define PREA1 loc0 +#define PREA2 loc1 +#define PREA3 loc2 +#define PREA4 loc3 + +#define A11 loc4 +#define A21 loc5 +#define A31 loc6 +#define A41 loc7 + +#define PREX r8 +#define PREY r9 + +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#ifdef DOUBLE +#define RPREFETCH (16 * 3 + 4) +#else +#define RPREFETCH (16 * 3 + 16) +#endif +#define PREFETCH lfetch.nt1 +#define PREFETCHW lfetch.excl.nt1 + +#define alpha f8 +#define atemp1 f6 +#define atemp2 f7 +#define atemp3 f10 +#define atemp4 f11 + +#define xsum1 f12 +#define xsum2 f13 +#define xsum3 f14 +#define xsum4 f15 + + + PROLOGUE + .prologue + PROFCODE + { .mmi + .save ar.pfs, ARPFS + alloc ARPFS = ar.pfs, 8, 16, 8, 0 + mov ARLC = ar.lc + } + ;; + mov PR = pr + adds r14 = 16, SP + ;; + adds r8 = -8 * 16, SP + adds r9 = -7 * 16, SP + adds SP = -8 * 16, SP + ;; + stf.spill [r8] = f16, 32 + stf.spill [r9] = f17, 32 + ;; + stf.spill [r8] = f18, 32 + stf.spill [r9] = f19, 32 + ;; + stf.spill [r8] = f20, 32 + stf.spill [r9] = f21, 32 + ;; + stf.spill [r8] = f22 + stf.spill [r9] = f23 + .body + ;; + ld8 BUFFER = [r14] + ;; + shladd LDA = LDA, BASE_SHIFT, r0 + shladd INCX = INCX, BASE_SHIFT, r0 + shladd INCY = INCY, BASE_SHIFT, r0 + ;; + cmp.ge p7, p0 = 0, M + ;; + (p7) br.cond.dpnt .L999 + ;; + mov NEW_X = X + cmp.eq p10, p0 = SIZE, INCX + (p10) br.cond.dptk .L10 + ;; +.L10: + mov NEW_Y = Y + cmp.eq p10, p0 = SIZE, INCY + (p10) br.cond.dptk .L20 + ;; + +.L20: + mov IS = 0 + cmp.gt p10, p0 = 4, M + (p10) br.cond.dpnt .L30 + ;; +.L21: + mov A1 = A + add A2 = LDA, A + ;; + shladd A3 = LDA, 1, A + shladd A4 = LDA, 1, A2 + shladd A = LDA, 2, A + ;; + ;; + adds PREX = RPREFETCH * SIZE, NEW_X + adds PREY = RPREFETCH * SIZE, NEW_Y + adds PREA1 = RPREFETCH * SIZE, A1 + adds PREA2 = RPREFETCH * SIZE, A2 + adds PREA3 = RPREFETCH * SIZE, A3 + adds PREA4 = RPREFETCH * SIZE, A4 + ;; + shladd TEMP = IS, BASE_SHIFT, NEW_X + ;; + LDFD atemp1 = [TEMP], 1 * SIZE + ;; + LDFD atemp2 = [TEMP], 1 * SIZE + ;; + LDFD atemp3 = [TEMP], 1 * SIZE + ;; + LDFD atemp4 = [TEMP], 1 * SIZE + ;; + FMPY atemp1 = alpha, atemp1 + FMPY atemp2 = alpha, atemp2 + FMPY atemp3 = alpha, atemp3 + FMPY atemp4 = alpha, atemp4 + ;; + mov xsum1 = f0 + mov xsum2 = f0 + mov xsum3 = f0 + mov xsum4 = f0 + ;; + mov XX = NEW_X + mov YY = NEW_Y + mov YYS = NEW_Y + ;; + shr I = IS, 2 + mov pr.rot = 0 + ;; + mov ar.ec = 3 + cmp.eq p16, p0 = r0, r0 + ;; + cmp.eq p6, p0 = 0, I + adds I = -1, I + ;; + mov ar.lc = I + (p6) br.cond.dpnt .L28 + ;; + .align 16 + +.L22: + { .mmf + (p16) LDFPD f32, f35 = [A1], 2 * SIZE + (p19) STFD [YYS] = f95, 1 * SIZE + (p18) FMA xsum1 = f82, f34, xsum1 + } + { .mmf + (p18) FMA f94 = atemp1, f34, f94 + } + ;; + { .mmf + (p17) LDFD f90 = [XX], 1 * SIZE + (p18) FMA xsum2 = f82, f46, xsum2 + } + { .mmf + (p18) FMA f98 = atemp1, f37, f98 + } + ;; + { .mmf + (p16) LDFPD f44, f47 = [A2], 2 * SIZE + (p19) STFD [YYS] = f99, 1 * SIZE + (p18) FMA xsum3 = f82, f58, xsum3 + } + { .mmf + (p18) FMA f102 = atemp1, f40, f102 + } + ;; + { .mmf + (p16) PREFETCHW [PREY], 4 * SIZE + (p16) LDFD f92 = [YY], 1 * SIZE + (p18) FMA xsum4 = f82, f70, xsum4 + } + { .mmf + (p18) FMA f106 = atemp1, f43, f106 + } + ;; + { .mmf + (p16) LDFPD f56, f59 = [A3], 2 * SIZE + (p19) STFD [YYS] = f103, 1 * SIZE + (p18) FMA xsum1 = f85, f37, xsum1 + } + { .mmf + (p18) FMA f94 = atemp2, f46, f94 + } + ;; + { .mmf + (p16) LDFD f96 = [YY], 1 * SIZE + (p18) FMA xsum2 = f85, f49, xsum2 + } + { .mmf + (p18) FMA f98 = atemp2, f49, f98 + } + ;; + { .mmf + (p16) LDFPD f68, f71 = [A4], 2 * SIZE + (p19) STFD [YYS] = f107, 1 * SIZE + (p18) FMA xsum3 = f85, f61, xsum3 + } + { .mmf + (p18) FMA f102 = atemp2, f52, f102 + } + ;; + { .mmf + (p16) LDFD f100 = [YY], 1 * SIZE + (p18) FMA xsum4 = f85, f73, xsum4 + } + { .mmf + (p18) FMA f106 = atemp2, f55, f106 + } + ;; + { .mmf + (p16) PREFETCH [PREA1], 4 * SIZE + (p16) LDFPD f38, f41 = [A1], 2 * SIZE + (p18) FMA xsum1 = f88, f40, xsum1 + } + { .mmf + (p18) FMA f94 = atemp3, f58, f94 + } + ;; + { .mmf + (p16) LDFD f104 = [YY], 1 * SIZE + (p18) FMA xsum2 = f88, f52, xsum2 + } + { .mmf + (p18) FMA f98 = atemp3, f61, f98 + } + ;; + { .mmf + (p16) PREFETCH [PREA2], 4 * SIZE + (p16) LDFPD f50, f53 = [A2], 2 * SIZE + (p18) FMA xsum3 = f88, f64, xsum3 + } + { .mmf + (p18) FMA f102 = atemp3, f64, f102 + } + ;; + { .mmf + (p16) PREFETCH [PREX], 4 * SIZE + (p16) LDFD f80 = [XX], 1 * SIZE + (p18) FMA xsum4 = f88, f76, xsum4 + } + { .mmf + (p18) FMA f106 = atemp3, f67, f106 + } + ;; + { .mmf + (p16) PREFETCH [PREA3], 4 * SIZE + (p16) LDFPD f62, f65 = [A3], 2 * SIZE + (p18) FMA xsum1 = f91, f43, xsum1 + } + { .mmf + (p18) FMA f94 = atemp4, f70, f94 + } + ;; + { .mmf + (p16) LDFD f83 = [XX], 1 * SIZE + (p18) FMA xsum2 = f91, f55, xsum2 + } + { .mmf + (p18) FMA f98 = atemp4, f73, f98 + } + ;; + { .mmf + (p16) PREFETCH [PREA4], 4 * SIZE + (p16) LDFPD f74, f77 = [A4], 2 * SIZE + (p18) FMA xsum3 = f91, f67, xsum3 + } + { .mmf + (p18) FMA f102 = atemp4, f76, f102 + } + ;; + { .mmf + (p16) LDFD f86 = [XX], 1 * SIZE + (p18) FMA xsum4 = f91, f79, xsum4 + } + { .mfb + (p18) FMA f106 = atemp4, f79, f106 + br.ctop.sptk.few .L22 + } + ;; + (p19) STFD [YYS] = f95, 1 * SIZE + ;; + (p19) STFD [YYS] = f99, 1 * SIZE + ;; + (p19) STFD [YYS] = f103, 1 * SIZE + ;; + (p19) STFD [YYS] = f107, 1 * SIZE + ;; + ;; + .align 16 + +.L28: + FMPY xsum1 = alpha, xsum1 + FMPY xsum2 = alpha, xsum2 + FMPY xsum3 = alpha, xsum3 + FMPY xsum4 = alpha, xsum4 + ;; + LDFD f64 = [A1], 1 * SIZE + LDFD f65 = [A2], 1 * SIZE + LDFD f66 = [A3], 1 * SIZE + LDFD f67 = [A4], 1 * SIZE + ;; + LDFD f68 = [A1], 1 * SIZE + LDFD f69 = [A2], 1 * SIZE + LDFD f70 = [A3], 1 * SIZE + LDFD f71 = [A4], 1 * SIZE + ;; + LDFD f72 = [A1], 1 * SIZE + LDFD f73 = [A2], 1 * SIZE + LDFD f74 = [A3], 1 * SIZE + LDFD f75 = [A4], 1 * SIZE + ;; + LDFD f76 = [A1], 1 * SIZE + LDFD f77 = [A2], 1 * SIZE + LDFD f78 = [A3], 1 * SIZE + LDFD f79 = [A4], 1 * SIZE + ;; + FMA xsum1 = atemp1, f64, xsum1 + FMA xsum2 = atemp1, f65, xsum2 + FMA xsum3 = atemp1, f66, xsum3 + FMA xsum4 = atemp1, f67, xsum4 + ;; + FMA xsum1 = atemp2, f65, xsum1 + FMA xsum2 = atemp2, f69, xsum2 + FMA xsum3 = atemp2, f70, xsum3 + FMA xsum4 = atemp2, f71, xsum4 + ;; + FMA xsum1 = atemp3, f66, xsum1 + FMA xsum2 = atemp3, f70, xsum2 + FMA xsum3 = atemp3, f74, xsum3 + FMA xsum4 = atemp3, f75, xsum4 + ;; + FMA xsum1 = atemp4, f67, xsum1 + FMA xsum2 = atemp4, f71, xsum2 + FMA xsum3 = atemp4, f75, xsum3 + FMA xsum4 = atemp4, f79, xsum4 + ;; + LDFD f36 = [YY], 1 * SIZE + ;; + LDFD f37 = [YY], 1 * SIZE + ;; + LDFD f38 = [YY], 1 * SIZE + ;; + LDFD f39 = [YY], 1 * SIZE + ;; + FADD f36 = f36, xsum1 + FADD f37 = f37, xsum2 + FADD f38 = f38, xsum3 + FADD f39 = f39, xsum4 + ;; + STFD [YYS] = f36, 1 * SIZE + ;; + STFD [YYS] = f37, 1 * SIZE + ;; + STFD [YYS] = f38, 1 * SIZE + ;; + STFD [YYS] = f39, 1 * SIZE + ;; + adds IS = 4, IS + ;; + adds TEMP = 4, IS + ;; + cmp.le p6, p0 = TEMP, M + ;; + (p6) br.cond.dpnt .L21 + ;; +.L30: + + +.L990: + +.L999: + mov r8 = r0 + adds r9 = 1 * 16, SP + ;; + ldf.fill f16 = [SP], 32 + ldf.fill f17 = [r9], 32 + mov ar.lc = ARLC + ;; + ldf.fill f18 = [SP], 32 + ldf.fill f19 = [r9], 32 + mov pr = PR, -1 + ;; + ldf.fill f20 = [SP], 32 + ldf.fill f21 = [r9], 32 + mov ar.pfs = ARPFS + ;; + ldf.fill f22 = [SP], 32 + ldf.fill f23 = [r9] + br.ret.sptk.many b0 + ;; + EPILOGUE diff --git a/kernel/ia64/trsm_kernel_LN.S b/kernel/ia64/trsm_kernel_LN.S new file mode 100644 index 0000000..9b1f2b2 --- /dev/null +++ b/kernel/ia64/trsm_kernel_LN.S @@ -0,0 +1,14028 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef DOUBLE +#define PREFETCHSIZE (16 * 8) +#else +#define PREFETCHSIZE (32 * 8) +#endif + +#define CPREFETCHSIZE -7 +#define CPREFETCH lfetch.excl.nt1 + +#define M r32 +#define N r33 +#define K r34 +#define A r36 +#define B r37 +#define C r38 +#define LDC r39 + +#define I r15 +#define J r16 +#define AOFFSET r17 +#define BOFFSET r18 +#define TEMP r19 +#define L r20 + +#define C1 r21 +#define C2 r22 +#define C3 r23 +#define C4 r24 +#define C5 r25 +#define C6 r26 +#define C7 r27 +#define C8 r28 + +#define C9 loc0 +#define C10 loc1 +#define C11 loc2 +#define C12 loc3 +#define C13 loc4 +#define C14 loc5 +#define C15 loc6 +#define C16 loc7 + +#define PREA r8 +#define PREB r9 +#define PREC r10 +#define SP r12 +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#define ALPHA f8 + +#define AORIG loc8 +#define KK loc9 +#define KK8 loc10 +#define OFFSET loc11 +#define AOFFSET2 loc12 +#define BOFFSET2 loc13 + + + PROLOGUE + .prologue + PROFCODE + + { .mmi + .save ar.pfs, ARPFS + alloc ARPFS = ar.pfs, 8, 16, 0, 0 + adds r14 = 16, SP + mov ARLC = ar.lc + } + { .mmi + adds r8 = -6 * 16, SP + adds r9 = -5 * 16, SP + adds SP = -6 * 16, SP + } + ;; + { .mmi + setf.sig f32 = M + setf.sig f33 = K + mov PR = pr + } + ;; + { .mmi + stf.spill [r8] = f16, 32 + stf.spill [r9] = f17, 32 + shr J = N, 3 + } + ;; + { .mmi + stf.spill [r8] = f18, 32 + stf.spill [r9] = f19, 32 + shladd LDC = LDC, BASE_SHIFT, r0 + } + ;; + { .mmi + stf.spill [r8] = f20 + stf.spill [r9] = f21 + mov AOFFSET = A + } + ;; + .body + { .mmf + ld8 OFFSET = [r14] + cmp.ge p6, p0 = 0, J + xmpy.l f32 = f32, f33 + } + ;; + { .mmi + getf.sig r2 = f32 + shladd C = M, BASE_SHIFT, C + nop __LINE__ + } + ;; + { .mmb + shladd A = r2, BASE_SHIFT, A + nop __LINE__ + (p6) br.cond.dpnt .L050 + } + ;; + .align 8 + +.L000: + { .mmf + mov C1 = C + add KK = M, OFFSET + } + { .mmi + mov AORIG = A + add C2 = LDC, C + shladd C3 = LDC, 1, C + } + ;; + { .mmf + shladd C5 = LDC, 2, C + shladd C = LDC, 3, C + } + { .mmf + shladd C4 = LDC, 1, C2 + shladd C6 = LDC, 2, C2 + } + ;; + { .mfi + shladd C7 = LDC, 2, C3 + shladd C8 = LDC, 2, C4 + } + ;; + ;; + mov f64 = f0 + mov f72 = f0 + mov f80 = f0 + mov f88 = f0 + mov f96 = f0 + mov f104 = f0 + mov f112 = f0 + mov f120 = f0 + +.L040: + { .mib + sub L = K, KK + tbit.z p6, p0 = M, 0 + (p6) br.cond.dptk .L030 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 0 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + shladd BOFFSET = r3, 3, B + sub AORIG = AORIG, r2 + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + add AOFFSET = r3, AORIG + } + ;; + { .mmi + adds L = 1, L + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mii + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + adds L = -1, L + } + ;; + { .mmi + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFD f32 = [AOFFSET], 1 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L048 + } + ;; + +.L042: + { .mfb + lfetch.nt1 [PREB], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + adds L = -1, L + } + { .mmb + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + { .mmb + nop __LINE__ + nop __LINE__ + br.cloop.sptk.few .L042 + } + ;; + +.L048: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -8, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + shladd BOFFSET = r2, 3, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + { .mfi + FSUB f64 = f32, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f72 = f33, f72 + nop __LINE__ + } + ;; + { .mfi + FSUB f80 = f34, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f88 = f35, f88 + nop __LINE__ + } + ;; + { .mfi + FSUB f96 = f36, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f104 = f37, f104 + nop __LINE__ + } + ;; + { .mfi + FSUB f112 = f38, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f120 = f39, f120 + nop __LINE__ + } + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + FSUB f96 = f36, f96 + FSUB f104 = f37, f104 + FSUB f112 = f38, f112 + FSUB f120 = f39, f120 + ;; +#endif + +#ifdef LN + LDFD f32 = [AOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f96 = f96, f32 + FMPY f72 = f72, f32 + FMPY f104 = f104, f32 + FMPY f80 = f80, f32 + FMPY f112 = f112, f32 + FMPY f88 = f88, f32 + FMPY f120 = f120, f32 + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + adds C1 = -1 * SIZE, C1 + } + ;; + { .mmi + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f104, SIZE + adds C2 = -1 * SIZE, C2 + } + ;; + { .mmi + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f88, - 3 * SIZE + STFD [BOFFSET2] = f120, - 3 * SIZE + } + ;; + adds C3 = -1 * SIZE, C3 + adds C4 = -1 * SIZE, C4 + adds C5 = -1 * SIZE, C5 + adds C6 = -1 * SIZE, C6 + adds C7 = -1 * SIZE, C7 + adds C8 = -1 * SIZE, C8 + ;; +#endif + +#ifdef LT + LDFD f32 = [AOFFSET] + ;; + { .mfi + FMPY f64 = f64, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f96 = f96, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f72 = f72, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f104 = f104, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f80 = f80, f32 + } + { .mfi + nop __LINE__ + FMPY f112 = f112, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f88 = f88, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f120 = f120, f32 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f64, SIZE + } + { .mfi + STFD [BOFFSET2] = f96, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f72, SIZE + } + { .mfi + STFD [BOFFSET2] = f104, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f80, SIZE + } + { .mfi + STFD [BOFFSET2] = f112, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f88, -3 * SIZE + } + { .mfi + STFD [BOFFSET2] = f120, -3 * SIZE + } + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [BOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [BOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f47, f48 = [BOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [BOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [BOFFSET] + adds BOFFSET = 7 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f19, f20 = [BOFFSET] + adds BOFFSET = 9 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + adds BOFFSET = -63 * SIZE, BOFFSET + ;; + + FMPY f64 = f64, f32 + ;; + FNMA f72 = f64, f33, f72 + ;; + FNMA f80 = f64, f34, f80 + ;; + FNMA f88 = f64, f35, f88 + ;; + FNMA f96 = f64, f36, f96 + ;; + FNMA f104 = f64, f37, f104 + ;; + FNMA f112 = f64, f38, f112 + ;; + FNMA f120 = f64, f39, f120 + ;; + FMPY f72 = f72, f40 + ;; + FNMA f80 = f72, f41, f80 + ;; + FNMA f88 = f72, f42, f88 + ;; + FNMA f96 = f72, f43, f96 + ;; + FNMA f104 = f72, f44, f104 + ;; + FNMA f112 = f72, f45, f112 + ;; + FNMA f120 = f72, f46, f120 + ;; + FMPY f80 = f80, f47 + ;; + FNMA f88 = f80, f48, f88 + ;; + FNMA f96 = f80, f49, f96 + ;; + FNMA f104 = f80, f50, f104 + ;; + FNMA f112 = f80, f51, f112 + ;; + FNMA f120 = f80, f52, f120 + ;; + FMPY f88 = f88, f53 + ;; + FNMA f96 = f88, f54, f96 + ;; + FNMA f104 = f88, f55, f104 + ;; + FNMA f112 = f88, f56, f112 + ;; + FNMA f120 = f88, f57, f120 + ;; + FMPY f96 = f96, f58 + ;; + FNMA f104 = f96, f59, f104 + ;; + FNMA f112 = f96, f60, f112 + ;; + FNMA f120 = f96, f61, f120 + ;; + FMPY f104 = f104, f16 + ;; + FNMA f112 = f104, f17, f112 + ;; + FNMA f120 = f104, f18, f120 + ;; + FMPY f112 = f112, f19 + ;; + FNMA f120 = f112, f20, f120 + ;; + FMPY f120 = f120, f21 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f96, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f104, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f88, -3 * SIZE + STFD [AOFFSET2] = f120, - 3 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 62 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f35, f34 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f37, f36 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f39, f38 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], -2 * SIZE + ;; + LDFPD f42, f41 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f44, f43 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f46, f45 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f48, f47 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f50, f49 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f52, f51 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], -2 * SIZE + ;; + LDFPD f55, f54 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f57, f56 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFPD f59, f58 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f61, f60 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], -2 * SIZE + ;; + LDFPD f18, f17 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFPD f20, f19 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + ;; + + FMPY f120 = f120, f32 + ;; + FNMA f112 = f120, f33, f112 + ;; + FNMA f104 = f120, f34, f104 + ;; + FNMA f96 = f120, f35, f96 + ;; + FNMA f88 = f120, f36, f88 + ;; + FNMA f80 = f120, f37, f80 + ;; + FNMA f72 = f120, f38, f72 + ;; + FNMA f64 = f120, f39, f64 + ;; + FMPY f112 = f112, f40 + ;; + FNMA f104 = f112, f41, f104 + ;; + FNMA f96 = f112, f42, f96 + ;; + FNMA f88 = f112, f43, f88 + ;; + FNMA f80 = f112, f44, f80 + ;; + FNMA f72 = f112, f45, f72 + ;; + FNMA f64 = f112, f46, f64 + ;; + FMPY f104 = f104, f47 + ;; + FNMA f96 = f104, f48, f96 + ;; + FNMA f88 = f104, f49, f88 + ;; + FNMA f80 = f104, f50, f80 + ;; + FNMA f72 = f104, f51, f72 + ;; + FNMA f64 = f104, f52, f64 + ;; + FMPY f96 = f96, f53 + ;; + FNMA f88 = f96, f54, f88 + ;; + FNMA f80 = f96, f55, f80 + ;; + FNMA f72 = f96, f56, f72 + ;; + FNMA f64 = f96, f57, f64 + ;; + FMPY f88 = f88, f58 + ;; + FNMA f80 = f88, f59, f80 + ;; + FNMA f72 = f88, f60, f72 + ;; + FNMA f64 = f88, f61, f64 + ;; + FMPY f80 = f80, f16 + ;; + FNMA f72 = f80, f17, f72 + ;; + FNMA f64 = f80, f18, f64 + ;; + FMPY f72 = f72, f19 + ;; + FNMA f64 = f72, f20, f64 + ;; + FMPY f64 = f64, f21 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f96, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f104, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f88, - 3 * SIZE + STFD [AOFFSET2] = f120, - 3 * SIZE + ;; + +#endif + +#ifndef LN + STFD [C1 ] = f64, SIZE +#else + STFD [C1 ] = f64 +#endif +#ifndef LN + STFD [C2 ] = f72, SIZE +#else + STFD [C2 ] = f72 +#endif +#ifndef LN + STFD [C3 ] = f80, SIZE +#else + STFD [C3 ] = f80 +#endif +#ifndef LN + STFD [C4 ] = f88, SIZE +#else + STFD [C4 ] = f88 +#endif +#ifndef LN + STFD [C5 ] = f96, SIZE +#else + STFD [C5 ] = f96 +#endif +#ifndef LN + STFD [C6 ] = f104, SIZE +#else + STFD [C6 ] = f104 +#endif +#ifndef LN + STFD [C7 ] = f112, SIZE +#else + STFD [C7 ] = f112 +#endif +#ifndef LN + STFD [C8 ] = f120, SIZE +#else + STFD [C8 ] = f120 +#endif + ;; + + mov f64 = f0 + mov f72 = f0 + mov f80 = f0 + mov f88 = f0 + mov f96 = f0 + mov f104 = f0 + mov f112 = f0 + mov f120 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + add AOFFSET = L, AOFFSET +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 3, BOFFSET +#else + nop __LINE__ +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 8 + +.L030: + { .mib + sub L = K, KK + tbit.z p6, p0 = M, 1 + (p6) br.cond.dptk .L020 + } + ;; + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + setf.d f73 = r0 + mov f65 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 3, B + mov f65 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + { .mfi + setf.d f105 = r0 + mov f81 = f0 + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + mov f89 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f113 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + setf.d f97 = r0 + mov f121 = f0 + shr L = L, 1 + } + ;; + { .mmf + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + adds L = -1, L + } + ;; + { .mmf + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L038 + } + ;; + +.L032: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + br.cloop.sptk.few .L032 + } + ;; + +.L038: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -8, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + shladd BOFFSET = r2, 3, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET] + adds BOFFSET = -14 * SIZE, BOFFSET + ;; + { .mfi + FSUB f64 = f32, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f72 = f33, f72 + nop __LINE__ + } + ;; + { .mfi + FSUB f80 = f34, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f88 = f35, f88 + nop __LINE__ + } + ;; + { .mfi + FSUB f96 = f36, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f104 = f37, f104 + nop __LINE__ + } + ;; + { .mfi + FSUB f112 = f38, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f120 = f39, f120 + nop __LINE__ + } + ;; + { .mfi + FSUB f65 = f40, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f73 = f41, f73 + nop __LINE__ + } + ;; + { .mfi + FSUB f81 = f42, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f89 = f43, f89 + nop __LINE__ + } + ;; + { .mfi + FSUB f97 = f44, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f105 = f45, f105 + nop __LINE__ + } + ;; + { .mfi + FSUB f113 = f46, f113 + } + { .mfi + nop __LINE__ + FSUB f121 = f47, f121 + nop __LINE__ + } + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET] + adds AOFFSET = -14 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + + FSUB f72 = f34, f72 + FSUB f73 = f35, f73 + + FSUB f80 = f36, f80 + FSUB f81 = f37, f81 + + FSUB f88 = f38, f88 + FSUB f89 = f39, f89 + ;; + FSUB f96 = f40, f96 + FSUB f97 = f41, f97 + ;; + FSUB f104 = f42, f104 + FSUB f105 = f43, f105 + ;; + FSUB f112 = f44, f112 + FSUB f113 = f45, f113 + ;; + FSUB f120 = f46, f120 + FSUB f121 = f47, f121 + ;; +#endif + +#ifdef LN + adds AOFFSET = 2 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET] + ;; + FMPY f65 = f65, f32 + FMPY f97 = f97, f32 + FMPY f73 = f73, f32 + FMPY f105 = f105, f32 + FMPY f81 = f81, f32 + FMPY f113 = f113, f32 + FMPY f89 = f89, f32 + FMPY f121 = f121, f32 + ;; + FNMA f64 = f65, f33, f64 + FNMA f96 = f97, f33, f96 + FNMA f72 = f73, f33, f72 + FNMA f104 = f105, f33, f104 + FNMA f80 = f81, f33, f80 + FNMA f112 = f113, f33, f112 + FNMA f88 = f89, f33, f88 + FNMA f120 = f121, f33, f120 + ;; + FMPY f64 = f64, f34 + FMPY f96 = f96, f34 + FMPY f72 = f72, f34 + FMPY f104 = f104, f34 + FMPY f80 = f80, f34 + FMPY f112 = f112, f34 + FMPY f88 = f88, f34 + FMPY f120 = f120, f34 + ;; + adds BOFFSET = 8 * SIZE, BOFFSET + adds BOFFSET2 = 8 * SIZE, BOFFSET2 + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f73, SIZE + } + { .mfi + STFD [BOFFSET2] = f105, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f81, SIZE + } + { .mfi + STFD [BOFFSET2] = f113, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f89, - 11 * SIZE + } + { .mfi + STFD [BOFFSET2] = f121, - 11 * SIZE + } + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + adds C1 = -2 * SIZE, C1 + } + ;; + { .mmi + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f104, SIZE + adds C2 = -2 * SIZE, C2 + } + ;; + { .mmi + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f88, - 3 * SIZE + STFD [BOFFSET2] = f120, - 3 * SIZE + } + ;; + adds C3 = -2 * SIZE, C3 + adds C4 = -2 * SIZE, C4 + adds C5 = -2 * SIZE, C5 + adds C6 = -2 * SIZE, C6 + adds C7 = -2 * SIZE, C7 + adds C8 = -2 * SIZE, C8 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET], - 3 * SIZE + ;; + { .mfi + FMPY f64 = f64, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f96 = f96, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f72 = f72, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f104 = f104, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f80 = f80, f32 + } + { .mfi + nop __LINE__ + FMPY f112 = f112, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f88 = f88, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f120 = f120, f32 + nop __LINE__ + } + ;; + { .mfi + FNMA f65 = f64, f33, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f97 = f96, f33, f97 + nop __LINE__ + } + ;; + { .mfi + FNMA f73 = f72, f33, f73 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f105 = f104, f33, f105 + nop __LINE__ + } + ;; + { .mfi + FNMA f81 = f80, f33, f81 + } + { .mfi + nop __LINE__ + FNMA f113 = f112, f33, f113 + nop __LINE__ + } + ;; + { .mfi + FNMA f89 = f88, f33, f89 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f121 = f120, f33, f121 + nop __LINE__ + } + ;; + FMPY f65 = f65, f34 + FMPY f97 = f97, f34 + FMPY f73 = f73, f34 + FMPY f105 = f105, f34 + FMPY f81 = f81, f34 + FMPY f113 = f113, f34 + FMPY f89 = f89, f34 + FMPY f121 = f121, f34 + ;; + { .mfi + STFD [BOFFSET] = f64, SIZE + } + { .mfi + STFD [BOFFSET2] = f96, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f72, SIZE + } + { .mfi + STFD [BOFFSET2] = f104, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f80, SIZE + } + { .mfi + STFD [BOFFSET2] = f112, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f88, 5 * SIZE + } + { .mfi + STFD [BOFFSET2] = f120, 5 * SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f73, SIZE + } + { .mfi + STFD [BOFFSET2] = f105, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f81, SIZE + } + { .mfi + STFD [BOFFSET2] = f113, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f89, -11 * SIZE + } + { .mfi + STFD [BOFFSET2] = f121, -11 * SIZE + } +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [BOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [BOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f47, f48 = [BOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [BOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [BOFFSET] + adds BOFFSET = 7 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f19, f20 = [BOFFSET] + adds BOFFSET = 9 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + adds BOFFSET = -63 * SIZE, BOFFSET + ;; + + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + ;; + FNMA f80 = f64, f34, f80 + FNMA f81 = f65, f34, f81 + ;; + FNMA f88 = f64, f35, f88 + FNMA f89 = f65, f35, f89 + ;; + FNMA f96 = f64, f36, f96 + FNMA f97 = f65, f36, f97 + ;; + FNMA f104 = f64, f37, f104 + FNMA f105 = f65, f37, f105 + ;; + FNMA f112 = f64, f38, f112 + FNMA f113 = f65, f38, f113 + ;; + FNMA f120 = f64, f39, f120 + FNMA f121 = f65, f39, f121 + ;; + FMPY f72 = f72, f40 + FMPY f73 = f73, f40 + ;; + FNMA f80 = f72, f41, f80 + FNMA f81 = f73, f41, f81 + ;; + FNMA f88 = f72, f42, f88 + FNMA f89 = f73, f42, f89 + ;; + FNMA f96 = f72, f43, f96 + FNMA f97 = f73, f43, f97 + ;; + FNMA f104 = f72, f44, f104 + FNMA f105 = f73, f44, f105 + ;; + FNMA f112 = f72, f45, f112 + FNMA f113 = f73, f45, f113 + ;; + FNMA f120 = f72, f46, f120 + FNMA f121 = f73, f46, f121 + ;; + FMPY f80 = f80, f47 + FMPY f81 = f81, f47 + ;; + FNMA f88 = f80, f48, f88 + FNMA f89 = f81, f48, f89 + ;; + FNMA f96 = f80, f49, f96 + FNMA f97 = f81, f49, f97 + ;; + FNMA f104 = f80, f50, f104 + FNMA f105 = f81, f50, f105 + ;; + FNMA f112 = f80, f51, f112 + FNMA f113 = f81, f51, f113 + ;; + FNMA f120 = f80, f52, f120 + FNMA f121 = f81, f52, f121 + ;; + FMPY f88 = f88, f53 + FMPY f89 = f89, f53 + ;; + FNMA f96 = f88, f54, f96 + FNMA f97 = f89, f54, f97 + ;; + FNMA f104 = f88, f55, f104 + FNMA f105 = f89, f55, f105 + ;; + FNMA f112 = f88, f56, f112 + FNMA f113 = f89, f56, f113 + ;; + FNMA f120 = f88, f57, f120 + FNMA f121 = f89, f57, f121 + ;; + FMPY f96 = f96, f58 + FMPY f97 = f97, f58 + ;; + FNMA f104 = f96, f59, f104 + FNMA f105 = f97, f59, f105 + ;; + FNMA f112 = f96, f60, f112 + FNMA f113 = f97, f60, f113 + ;; + FNMA f120 = f96, f61, f120 + FNMA f121 = f97, f61, f121 + ;; + FMPY f104 = f104, f16 + FMPY f105 = f105, f16 + ;; + FNMA f112 = f104, f17, f112 + FNMA f113 = f105, f17, f113 + ;; + FNMA f120 = f104, f18, f120 + FNMA f121 = f105, f18, f121 + ;; + FMPY f112 = f112, f19 + FMPY f113 = f113, f19 + ;; + FNMA f120 = f112, f20, f120 + FNMA f121 = f113, f20, f121 + ;; + FMPY f120 = f120, f21 + FMPY f121 = f121, f21 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f80, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f73, 5 * SIZE + STFD [AOFFSET2] = f89, 5 * SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f97, SIZE + STFD [AOFFSET2] = f113, SIZE + ;; + STFD [AOFFSET] = f104, SIZE + STFD [AOFFSET2] = f120, SIZE + ;; + STFD [AOFFSET] = f105, -11 * SIZE + STFD [AOFFSET2] = f121, - 11 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 62 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f35, f34 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f37, f36 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f39, f38 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], -2 * SIZE + ;; + LDFPD f42, f41 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f44, f43 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f46, f45 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f48, f47 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f50, f49 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f52, f51 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], -2 * SIZE + ;; + LDFPD f55, f54 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f57, f56 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFPD f59, f58 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f61, f60 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], -2 * SIZE + ;; + LDFPD f18, f17 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFPD f20, f19 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + ;; + + FMPY f120 = f120, f32 + FMPY f121 = f121, f32 + ;; + FNMA f112 = f120, f33, f112 + FNMA f113 = f121, f33, f113 + ;; + FNMA f104 = f120, f34, f104 + FNMA f105 = f121, f34, f105 + ;; + FNMA f96 = f120, f35, f96 + FNMA f97 = f121, f35, f97 + ;; + FNMA f88 = f120, f36, f88 + FNMA f89 = f121, f36, f89 + ;; + FNMA f80 = f120, f37, f80 + FNMA f81 = f121, f37, f81 + ;; + FNMA f72 = f120, f38, f72 + FNMA f73 = f121, f38, f73 + ;; + FNMA f64 = f120, f39, f64 + FNMA f65 = f121, f39, f65 + ;; + FMPY f112 = f112, f40 + FMPY f113 = f113, f40 + ;; + FNMA f104 = f112, f41, f104 + FNMA f105 = f113, f41, f105 + ;; + FNMA f96 = f112, f42, f96 + FNMA f97 = f113, f42, f97 + ;; + FNMA f88 = f112, f43, f88 + FNMA f89 = f113, f43, f89 + ;; + FNMA f80 = f112, f44, f80 + FNMA f81 = f113, f44, f81 + ;; + FNMA f72 = f112, f45, f72 + FNMA f73 = f113, f45, f73 + ;; + FNMA f64 = f112, f46, f64 + FNMA f65 = f113, f46, f65 + ;; + FMPY f104 = f104, f47 + FMPY f105 = f105, f47 + ;; + FNMA f96 = f104, f48, f96 + FNMA f97 = f105, f48, f97 + ;; + FNMA f88 = f104, f49, f88 + FNMA f89 = f105, f49, f89 + ;; + FNMA f80 = f104, f50, f80 + FNMA f81 = f105, f50, f81 + ;; + FNMA f72 = f104, f51, f72 + FNMA f73 = f105, f51, f73 + ;; + FNMA f64 = f104, f52, f64 + FNMA f65 = f105, f52, f65 + ;; + FMPY f96 = f96, f53 + FMPY f97 = f97, f53 + ;; + FNMA f88 = f96, f54, f88 + FNMA f89 = f97, f54, f89 + ;; + FNMA f80 = f96, f55, f80 + FNMA f81 = f97, f55, f81 + ;; + FNMA f72 = f96, f56, f72 + FNMA f73 = f97, f56, f73 + ;; + FNMA f64 = f96, f57, f64 + FNMA f65 = f97, f57, f65 + ;; + FMPY f88 = f88, f58 + FMPY f89 = f89, f58 + ;; + FNMA f80 = f88, f59, f80 + FNMA f81 = f89, f59, f81 + ;; + FNMA f72 = f88, f60, f72 + FNMA f73 = f89, f60, f73 + ;; + FNMA f64 = f88, f61, f64 + FNMA f65 = f89, f61, f65 + ;; + FMPY f80 = f80, f16 + FMPY f81 = f81, f16 + ;; + FNMA f72 = f80, f17, f72 + FNMA f73 = f81, f17, f73 + ;; + FNMA f64 = f80, f18, f64 + FNMA f65 = f81, f18, f65 + ;; + FMPY f72 = f72, f19 + FMPY f73 = f73, f19 + ;; + FNMA f64 = f72, f20, f64 + FNMA f65 = f73, f20, f65 + ;; + FMPY f64 = f64, f21 + FMPY f65 = f65, f21 + ;; + adds AOFFSET = 8 * SIZE, AOFFSET + adds AOFFSET2 = 8 * SIZE, AOFFSET2 + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f97, SIZE + STFD [AOFFSET2] = f113, SIZE + ;; + STFD [AOFFSET] = f104, SIZE + STFD [AOFFSET2] = f120, SIZE + ;; + STFD [AOFFSET] = f105, - 11 * SIZE + STFD [AOFFSET2] = f121, - 11 * SIZE + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f80, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f73, - 3 * SIZE + STFD [AOFFSET2] = f89, - 3 * SIZE + ;; + +#endif + STFD [C1 ] = f64, SIZE + mov f64 = f0 + ;; +#ifndef LN + STFD [C1 ] = f65, SIZE +#else + STFD [C1 ] = f65, -SIZE +#endif + ;; + STFD [C2 ] = f72, SIZE + mov f72 = f0 + ;; +#ifndef LN + STFD [C2 ] = f73, SIZE +#else + STFD [C2 ] = f73, -SIZE +#endif + ;; + STFD [C3 ] = f80, SIZE + mov f80 = f0 + ;; +#ifndef LN + STFD [C3 ] = f81, SIZE +#else + STFD [C3 ] = f81, - SIZE +#endif + ;; + STFD [C4 ] = f88, SIZE + mov f88 = f0 + ;; +#ifndef LN + STFD [C4 ] = f89, SIZE +#else + STFD [C4 ] = f89, -SIZE +#endif + ;; + STFD [C5 ] = f96, SIZE + mov f96 = f0 + ;; +#ifndef LN + STFD [C5 ] = f97, SIZE +#else + STFD [C5 ] = f97, -SIZE +#endif + ;; + STFD [C6 ] = f104, SIZE + mov f104 = f0 + ;; +#ifndef LN + STFD [C6 ] = f105, SIZE +#else + STFD [C6 ] = f105, -SIZE +#endif + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#else + nop __LINE__ +#endif + ;; + STFD [C7 ] = f112, SIZE + mov f112 = f0 + ;; + { .mmi +#ifndef LN + STFD [C7 ] = f113, SIZE +#else + STFD [C7 ] = f113, -SIZE +#endif + +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 1, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 3, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + STFD [C8 ] = f120, SIZE + mov f120 = f0 + } + ;; + { .mmi +#ifndef LN + STFD [C8 ] = f121, SIZE +#else + STFD [C8 ] = f121, -SIZE +#endif + +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + .align 8 + +.L020: + { .mib + sub L = K, KK + tbit.z p6, p0 = M, 2 + (p6) br.cond.dptk .L010 + } + ;; + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; + #if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + setf.d f73 = r0 + mov f65 = f0 + } + ;; + #else + { .mfi + shladd BOFFSET = r3, 3, B + mov f65 = f0 + #ifdef LN + sub AORIG = AORIG, r2 + #else + nop __LINE__ + #endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 + shladd AOFFSET = r3, 2, AORIG + } + ;; + #endif + { .mfi + setf.d f105 = r0 + mov f81 = f0 + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + mov f89 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f113 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + setf.d f97 = r0 + mov f121 = f0 + shr L = L, 1 + } + ;; + { .mmf + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + setf.d f66 = r0 + mov f67 = f0 + } + { .mfi + setf.d f74 = r0 + mov f75 = f0 + adds L = -1, L + } + ;; + { .mmf + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + setf.d f82 = r0 + mov f83 = f0 + } + { .mfi + setf.d f90 = r0 + mov f91 = f0 + cmp.eq p6, p0 = -1, L + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + setf.d f98 = r0 + mov f99 = f0 + } + { .mfi + setf.d f106 = r0 + mov f107 = f0 + mov ar.lc = L + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + setf.d f114 = r0 + mov f115 = f0 + } + { .mfb + setf.d f122 = r0 + mov f123 = f0 + (p6) br.cond.dpnt .L028 + } + ;; + + .L022: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + (p5) adds C10 = 2 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + (p5) adds C11 = 2 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + (p5) adds C12 = 2 * SIZE, C4 + } + ;; + { .mfi + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + (p5) adds C13 = 2 * SIZE, C5 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + (p5) adds C14 = 2 * SIZE, C6 + } + ;; + { .mfi + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + (p5) adds C15 = 2 * SIZE, C7 + } + { .mfi + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + (p5) adds C16 = 2 * SIZE, C8 + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f106 = f34, f53, f106 // A3 * B6 + nop __LINE__ + } + + { .mfb + nop __LINE__ + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f122 = f34, f55, f122 // A3 * B8 + nop __LINE__ + } + + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f107 = f35, f53, f107 // A4 * B6 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f123 = f35, f55, f123 // A4 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f106 = f42, f61, f106 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f122 = f42, f63, f122 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f107 = f43, f61, f107 // A4 * B6 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f123 = f43, f63, f123 // A4 * B8 + br.cloop.sptk.few .L022 + } + ;; + + .L028: + #if defined(LN) || defined(RT) + #ifdef LN + adds r2 = -4, KK + #else + adds r2 = -8, KK + #endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + shladd BOFFSET = r2, 3, B + ;; + #endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + + #if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FSUB f64 = f32, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f72 = f33, f72 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + FSUB f80 = f34, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f88 = f35, f88 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + FSUB f96 = f36, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f104 = f37, f104 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + FSUB f112 = f38, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f120 = f39, f120 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FSUB f65 = f40, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f73 = f41, f73 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FSUB f81 = f42, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f89 = f43, f89 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FSUB f97 = f44, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f105 = f45, f105 + nop __LINE__ + } + ;; + { .mfi + LDFPD f62, f63 = [BOFFSET] + FSUB f113 = f46, f113 + adds BOFFSET = -30 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FSUB f121 = f47, f121 + nop __LINE__ + } + ;; + FSUB f66 = f48, f66 + FSUB f74 = f49, f74 + FSUB f82 = f50, f82 + FSUB f90 = f51, f90 + FSUB f98 = f52, f98 + FSUB f106 = f53, f106 + FSUB f114 = f54, f114 + FSUB f122 = f55, f122 + ;; + FSUB f67 = f56, f67 + FSUB f75 = f57, f75 + FSUB f83 = f58, f83 + FSUB f91 = f59, f91 + FSUB f99 = f60, f99 + FSUB f107 = f61, f107 + FSUB f115 = f62, f115 + FSUB f123 = f63, f123 + ;; + #else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET], 2 * SIZE + ;; + LDFPD f48, f49 = [AOFFSET], 2 * SIZE + ;; + LDFPD f50, f51 = [AOFFSET], 2 * SIZE + ;; + LDFPD f52, f53 = [AOFFSET], 2 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET], 2 * SIZE + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET], 2 * SIZE + ;; + LDFPD f62, f63 = [AOFFSET] + adds AOFFSET = -30 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + + FSUB f72 = f36, f72 + FSUB f73 = f37, f73 + FSUB f74 = f38, f74 + FSUB f75 = f39, f75 + + FSUB f80 = f40, f80 + FSUB f81 = f41, f81 + FSUB f82 = f42, f82 + FSUB f83 = f43, f83 + + FSUB f88 = f44, f88 + FSUB f89 = f45, f89 + FSUB f90 = f46, f90 + FSUB f91 = f47, f91 + ;; + FSUB f96 = f48, f96 + FSUB f97 = f49, f97 + FSUB f98 = f50, f98 + FSUB f99 = f51, f99 + ;; + FSUB f104 = f52, f104 + FSUB f105 = f53, f105 + FSUB f106 = f54, f106 + FSUB f107 = f55, f107 + ;; + FSUB f112 = f56, f112 + FSUB f113 = f57, f113 + FSUB f114 = f58, f114 + FSUB f115 = f59, f115 + ;; + FSUB f120 = f60, f120 + FSUB f121 = f61, f121 + FSUB f122 = f62, f122 + FSUB f123 = f63, f123 + ;; + #endif + + #ifdef LN + adds AOFFSET = 14 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f35, f34 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], - 2 * SIZE + ;; + LDFPD f38, f37 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f40, f39 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET] + ;; + FMPY f67 = f67, f32 + FMPY f99 = f99, f32 + FMPY f75 = f75, f32 + FMPY f107 = f107, f32 + FMPY f83 = f83, f32 + FMPY f115 = f115, f32 + FMPY f91 = f91, f32 + FMPY f123 = f123, f32 + ;; + FNMA f66 = f67, f33, f66 + FNMA f98 = f99, f33, f98 + FNMA f74 = f75, f33, f74 + FNMA f106 = f107, f33, f106 + FNMA f82 = f83, f33, f82 + FNMA f114 = f115, f33, f114 + FNMA f90 = f91, f33, f90 + FNMA f122 = f123, f33, f122 + ;; + FNMA f65 = f67, f34, f65 + FNMA f97 = f99, f34, f97 + FNMA f73 = f75, f34, f73 + FNMA f105 = f107, f34, f105 + FNMA f81 = f83, f34, f81 + FNMA f113 = f115, f34, f113 + FNMA f89 = f91, f34, f89 + FNMA f121 = f123, f34, f121 + ;; + FNMA f64 = f67, f35, f64 + FNMA f96 = f99, f35, f96 + FNMA f72 = f75, f35, f72 + FNMA f104 = f107, f35, f104 + FNMA f80 = f83, f35, f80 + FNMA f112 = f115, f35, f112 + FNMA f88 = f91, f35, f88 + FNMA f120 = f123, f35, f120 + ;; + FMPY f66 = f66, f36 + FMPY f98 = f98, f36 + FMPY f74 = f74, f36 + FMPY f106 = f106, f36 + FMPY f82 = f82, f36 + FMPY f114 = f114, f36 + FMPY f90 = f90, f36 + FMPY f122 = f122, f36 + ;; + FNMA f65 = f66, f37, f65 + FNMA f97 = f98, f37, f97 + FNMA f73 = f74, f37, f73 + FNMA f105 = f106, f37, f105 + FNMA f81 = f82, f37, f81 + FNMA f113 = f114, f37, f113 + FNMA f89 = f90, f37, f89 + FNMA f121 = f122, f37, f121 + ;; + FNMA f64 = f66, f38, f64 + FNMA f96 = f98, f38, f96 + FNMA f72 = f74, f38, f72 + FNMA f104 = f106, f38, f104 + FNMA f80 = f82, f38, f80 + FNMA f112 = f114, f38, f112 + FNMA f88 = f90, f38, f88 + FNMA f120 = f122, f38, f120 + ;; + adds BOFFSET = 24 * SIZE, BOFFSET + adds BOFFSET2 = 24 * SIZE, BOFFSET2 + ;; + { .mfi + STFD [BOFFSET] = f67, SIZE + FMPY f65 = f65, f39 + } + { .mfi + STFD [BOFFSET2] = f99, SIZE + FMPY f97 = f97, f39 + } + ;; + { .mfi + STFD [BOFFSET] = f75, SIZE + FMPY f73 = f73, f39 + } + { .mfi + STFD [BOFFSET2] = f107, SIZE + FMPY f105 = f105, f39 + } + ;; + { .mfi + STFD [BOFFSET] = f83, SIZE + FMPY f81 = f81, f39 + } + { .mfi + STFD [BOFFSET2] = f115, SIZE + FMPY f113 = f113, f39 + } + ;; + { .mfi + STFD [BOFFSET] = f91, - 11 * SIZE + FMPY f89 = f89, f39 + } + { .mfi + STFD [BOFFSET2] = f123, - 11 * SIZE + FMPY f121 = f121, f39 + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + FNMA f64 = f65, f40, f64 + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + FNMA f96 = f97, f40, f96 + } + ;; + { .mfi + STFD [BOFFSET] = f74, SIZE + FNMA f72 = f73, f40, f72 + } + { .mfi + STFD [BOFFSET2] = f106, SIZE + FNMA f104 = f105, f40, f104 + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + FNMA f80 = f81, f40, f80 + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + FNMA f112 = f113, f40, f112 + } + ;; + { .mfi + STFD [BOFFSET] = f90, -11 * SIZE + FNMA f88 = f89, f40, f88 + } + { .mfi + STFD [BOFFSET2] = f122, -11 * SIZE + FNMA f120 = f121, f40, f120 + } + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + FMPY f64 = f64, f41 + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + FMPY f96 = f96, f41 + } + ;; + { .mfi + STFD [BOFFSET] = f73, SIZE + FMPY f72 = f72, f41 + } + { .mfi + STFD [BOFFSET2] = f105, SIZE + FMPY f104 = f104, f41 + } + ;; + { .mfi + STFD [BOFFSET] = f81, SIZE + FMPY f80 = f80, f41 + } + { .mfi + STFD [BOFFSET2] = f113, SIZE + FMPY f112 = f112, f41 + } + ;; + { .mfi + STFD [BOFFSET] = f89, - 11 * SIZE + FMPY f88 = f88, f41 + } + { .mfi + STFD [BOFFSET2] = f121, - 11 * SIZE + FMPY f120 = f120, f41 + } + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + adds C1 = -4 * SIZE, C1 + } + ;; + { .mmi + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f104, SIZE + adds C2 = -4 * SIZE, C2 + } + ;; + { .mmi + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f88, - 3 * SIZE + STFD [BOFFSET2] = f120, - 3 * SIZE + } + ;; + #endif + + #ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f39, f40 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET], -15 * SIZE + ;; + { .mfi + FMPY f64 = f64, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f96 = f96, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f72 = f72, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f104 = f104, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f80 = f80, f32 + } + { .mfi + nop __LINE__ + FMPY f112 = f112, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f88 = f88, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f120 = f120, f32 + nop __LINE__ + } + ;; + { .mfi + FNMA f65 = f64, f33, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f97 = f96, f33, f97 + nop __LINE__ + } + ;; + { .mfi + FNMA f73 = f72, f33, f73 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f105 = f104, f33, f105 + nop __LINE__ + } + ;; + { .mfi + FNMA f81 = f80, f33, f81 + } + { .mfi + nop __LINE__ + FNMA f113 = f112, f33, f113 + nop __LINE__ + } + ;; + { .mfi + FNMA f89 = f88, f33, f89 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f121 = f120, f33, f121 + nop __LINE__ + } + ;; + { .mfi + FNMA f66 = f64, f34, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f98 = f96, f34, f98 + nop __LINE__ + } + ;; + { .mfi + FNMA f74 = f72, f34, f74 + } + { .mfi + nop __LINE__ + FNMA f106 = f104, f34, f106 + nop __LINE__ + } + ;; + { .mfi + FNMA f82 = f80, f34, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f114 = f112, f34, f114 + nop __LINE__ + } + ;; + { .mfi + FNMA f90 = f88, f34, f90 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f122 = f120, f34, f122 + nop __LINE__ + } + ;; + { .mfi + FNMA f67 = f64, f35, f67 + } + { .mfi + nop __LINE__ + FNMA f99 = f96, f35, f99 + nop __LINE__ + } + ;; + { .mfi + FNMA f75 = f72, f35, f75 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f107 = f104, f35, f107 + nop __LINE__ + } + ;; + { .mfi + FNMA f83 = f80, f35, f83 + } + { .mfi + nop __LINE__ + FNMA f115 = f112, f35, f115 + nop __LINE__ + } + ;; + { .mfi + FNMA f91 = f88, f35, f91 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f123 = f120, f35, f123 + nop __LINE__ + } + ;; + FMPY f65 = f65, f36 + FMPY f97 = f97, f36 + FMPY f73 = f73, f36 + FMPY f105 = f105, f36 + FMPY f81 = f81, f36 + FMPY f113 = f113, f36 + FMPY f89 = f89, f36 + FMPY f121 = f121, f36 + ;; + FNMA f66 = f65, f37, f66 + FNMA f98 = f97, f37, f98 + FNMA f74 = f73, f37, f74 + FNMA f106 = f105, f37, f106 + FNMA f82 = f81, f37, f82 + FNMA f114 = f113, f37, f114 + FNMA f90 = f89, f37, f90 + FNMA f122 = f121, f37, f122 + ;; + FNMA f67 = f65, f38, f67 + FNMA f99 = f97, f38, f99 + FNMA f75 = f73, f38, f75 + FNMA f107 = f105, f38, f107 + FNMA f83 = f81, f38, f83 + FNMA f115 = f113, f38, f115 + FNMA f91 = f89, f38, f91 + FNMA f123 = f121, f38, f123 + ;; + FMPY f66 = f66, f39 + FMPY f98 = f98, f39 + FMPY f74 = f74, f39 + FMPY f106 = f106, f39 + FMPY f82 = f82, f39 + FMPY f114 = f114, f39 + FMPY f90 = f90, f39 + FMPY f122 = f122, f39 + ;; + FNMA f67 = f66, f40, f67 + FNMA f99 = f98, f40, f99 + FNMA f75 = f74, f40, f75 + FNMA f107 = f106, f40, f107 + FNMA f83 = f82, f40, f83 + FNMA f115 = f114, f40, f115 + FNMA f91 = f90, f40, f91 + FNMA f123 = f122, f40, f123 + ;; + FMPY f67 = f67, f41 + FMPY f99 = f99, f41 + FMPY f75 = f75, f41 + FMPY f107 = f107, f41 + FMPY f83 = f83, f41 + FMPY f115 = f115, f41 + FMPY f91 = f91, f41 + FMPY f123 = f123, f41 + ;; + { .mfi + STFD [BOFFSET] = f64, SIZE + } + { .mfi + STFD [BOFFSET2] = f96, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f72, SIZE + } + { .mfi + STFD [BOFFSET2] = f104, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f80, SIZE + } + { .mfi + STFD [BOFFSET2] = f112, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f88, 5 * SIZE + } + { .mfi + STFD [BOFFSET2] = f120, 5 * SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f73, SIZE + } + { .mfi + STFD [BOFFSET2] = f105, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f81, SIZE + } + { .mfi + STFD [BOFFSET2] = f113, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f89, 5 * SIZE + } + { .mfi + STFD [BOFFSET2] = f121, 5 * SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f74, SIZE + } + { .mfi + STFD [BOFFSET2] = f106, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f90, 5 * SIZE + } + { .mfi + STFD [BOFFSET2] = f122, 5 * SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f67, SIZE + } + { .mfi + STFD [BOFFSET2] = f99, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f75, SIZE + } + { .mfi + STFD [BOFFSET2] = f107, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f83, SIZE + } + { .mfi + STFD [BOFFSET2] = f115, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f91, -27 * SIZE + } + { .mfi + STFD [BOFFSET2] = f123, -27 * SIZE + } + ;; + #endif + + #ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [BOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [BOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f47, f48 = [BOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [BOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [BOFFSET] + adds BOFFSET = 7 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f19, f20 = [BOFFSET] + adds BOFFSET = 9 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + adds BOFFSET = -63 * SIZE, BOFFSET + ;; + + + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + FMPY f66 = f66, f32 + FMPY f67 = f67, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + FNMA f74 = f66, f33, f74 + FNMA f75 = f67, f33, f75 + ;; + FNMA f80 = f64, f34, f80 + FNMA f81 = f65, f34, f81 + FNMA f82 = f66, f34, f82 + FNMA f83 = f67, f34, f83 + ;; + FNMA f88 = f64, f35, f88 + FNMA f89 = f65, f35, f89 + FNMA f90 = f66, f35, f90 + FNMA f91 = f67, f35, f91 + ;; + FNMA f96 = f64, f36, f96 + FNMA f97 = f65, f36, f97 + FNMA f98 = f66, f36, f98 + FNMA f99 = f67, f36, f99 + ;; + FNMA f104 = f64, f37, f104 + FNMA f105 = f65, f37, f105 + FNMA f106 = f66, f37, f106 + FNMA f107 = f67, f37, f107 + ;; + FNMA f112 = f64, f38, f112 + FNMA f113 = f65, f38, f113 + FNMA f114 = f66, f38, f114 + FNMA f115 = f67, f38, f115 + ;; + FNMA f120 = f64, f39, f120 + FNMA f121 = f65, f39, f121 + FNMA f122 = f66, f39, f122 + FNMA f123 = f67, f39, f123 + ;; + FMPY f72 = f72, f40 + FMPY f73 = f73, f40 + FMPY f74 = f74, f40 + FMPY f75 = f75, f40 + ;; + FNMA f80 = f72, f41, f80 + FNMA f81 = f73, f41, f81 + FNMA f82 = f74, f41, f82 + FNMA f83 = f75, f41, f83 + ;; + FNMA f88 = f72, f42, f88 + FNMA f89 = f73, f42, f89 + FNMA f90 = f74, f42, f90 + FNMA f91 = f75, f42, f91 + ;; + FNMA f96 = f72, f43, f96 + FNMA f97 = f73, f43, f97 + FNMA f98 = f74, f43, f98 + FNMA f99 = f75, f43, f99 + ;; + FNMA f104 = f72, f44, f104 + FNMA f105 = f73, f44, f105 + FNMA f106 = f74, f44, f106 + FNMA f107 = f75, f44, f107 + ;; + FNMA f112 = f72, f45, f112 + FNMA f113 = f73, f45, f113 + FNMA f114 = f74, f45, f114 + FNMA f115 = f75, f45, f115 + ;; + FNMA f120 = f72, f46, f120 + FNMA f121 = f73, f46, f121 + FNMA f122 = f74, f46, f122 + FNMA f123 = f75, f46, f123 + ;; + FMPY f80 = f80, f47 + FMPY f81 = f81, f47 + FMPY f82 = f82, f47 + FMPY f83 = f83, f47 + ;; + FNMA f88 = f80, f48, f88 + FNMA f89 = f81, f48, f89 + FNMA f90 = f82, f48, f90 + FNMA f91 = f83, f48, f91 + ;; + FNMA f96 = f80, f49, f96 + FNMA f97 = f81, f49, f97 + FNMA f98 = f82, f49, f98 + FNMA f99 = f83, f49, f99 + ;; + FNMA f104 = f80, f50, f104 + FNMA f105 = f81, f50, f105 + FNMA f106 = f82, f50, f106 + FNMA f107 = f83, f50, f107 + ;; + FNMA f112 = f80, f51, f112 + FNMA f113 = f81, f51, f113 + FNMA f114 = f82, f51, f114 + FNMA f115 = f83, f51, f115 + ;; + FNMA f120 = f80, f52, f120 + FNMA f121 = f81, f52, f121 + FNMA f122 = f82, f52, f122 + FNMA f123 = f83, f52, f123 + ;; + FMPY f88 = f88, f53 + FMPY f89 = f89, f53 + FMPY f90 = f90, f53 + FMPY f91 = f91, f53 + ;; + FNMA f96 = f88, f54, f96 + FNMA f97 = f89, f54, f97 + FNMA f98 = f90, f54, f98 + FNMA f99 = f91, f54, f99 + ;; + FNMA f104 = f88, f55, f104 + FNMA f105 = f89, f55, f105 + FNMA f106 = f90, f55, f106 + FNMA f107 = f91, f55, f107 + ;; + FNMA f112 = f88, f56, f112 + FNMA f113 = f89, f56, f113 + FNMA f114 = f90, f56, f114 + FNMA f115 = f91, f56, f115 + ;; + FNMA f120 = f88, f57, f120 + FNMA f121 = f89, f57, f121 + FNMA f122 = f90, f57, f122 + FNMA f123 = f91, f57, f123 + ;; + FMPY f96 = f96, f58 + FMPY f97 = f97, f58 + FMPY f98 = f98, f58 + FMPY f99 = f99, f58 + ;; + FNMA f104 = f96, f59, f104 + FNMA f105 = f97, f59, f105 + FNMA f106 = f98, f59, f106 + FNMA f107 = f99, f59, f107 + ;; + FNMA f112 = f96, f60, f112 + FNMA f113 = f97, f60, f113 + FNMA f114 = f98, f60, f114 + FNMA f115 = f99, f60, f115 + ;; + FNMA f120 = f96, f61, f120 + FNMA f121 = f97, f61, f121 + FNMA f122 = f98, f61, f122 + FNMA f123 = f99, f61, f123 + ;; + FMPY f104 = f104, f16 + FMPY f105 = f105, f16 + FMPY f106 = f106, f16 + FMPY f107 = f107, f16 + ;; + FNMA f112 = f104, f17, f112 + FNMA f113 = f105, f17, f113 + FNMA f114 = f106, f17, f114 + FNMA f115 = f107, f17, f115 + ;; + FNMA f120 = f104, f18, f120 + FNMA f121 = f105, f18, f121 + FNMA f122 = f106, f18, f122 + FNMA f123 = f107, f18, f123 + ;; + FMPY f112 = f112, f19 + FMPY f113 = f113, f19 + FMPY f114 = f114, f19 + FMPY f115 = f115, f19 + ;; + FNMA f120 = f112, f20, f120 + FNMA f121 = f113, f20, f121 + FNMA f122 = f114, f20, f122 + FNMA f123 = f115, f20, f123 + ;; + FMPY f120 = f120, f21 + FMPY f121 = f121, f21 + FMPY f122 = f122, f21 + FMPY f123 = f123, f21 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f72, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f74, SIZE + ;; + STFD [AOFFSET] = f67, 5 * SIZE + STFD [AOFFSET2] = f75, 5 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f89, SIZE + ;; + STFD [AOFFSET] = f82, SIZE + STFD [AOFFSET2] = f90, SIZE + ;; + STFD [AOFFSET] = f83, 5 * SIZE + STFD [AOFFSET2] = f91, 5 * SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f104, SIZE + ;; + STFD [AOFFSET] = f97, SIZE + STFD [AOFFSET2] = f105, SIZE + ;; + STFD [AOFFSET] = f98, SIZE + STFD [AOFFSET2] = f106, SIZE + ;; + STFD [AOFFSET] = f99, 5 * SIZE + STFD [AOFFSET2] = f107, 5 * SIZE + ;; + STFD [AOFFSET] = f112, SIZE + STFD [AOFFSET2] = f120, SIZE + ;; + STFD [AOFFSET] = f113, SIZE + STFD [AOFFSET2] = f121, SIZE + ;; + STFD [AOFFSET] = f114, SIZE + STFD [AOFFSET2] = f122, SIZE + ;; + STFD [AOFFSET] = f115, -27 * SIZE + STFD [AOFFSET2] = f123, - 27 * SIZE + ;; + #endif + + #ifdef RT + adds BOFFSET = 62 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f35, f34 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f37, f36 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f39, f38 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], -2 * SIZE + ;; + LDFPD f42, f41 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f44, f43 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f46, f45 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f48, f47 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f50, f49 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f52, f51 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], -2 * SIZE + ;; + LDFPD f55, f54 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f57, f56 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFPD f59, f58 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f61, f60 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], -2 * SIZE + ;; + LDFPD f18, f17 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFPD f20, f19 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + ;; + + FMPY f120 = f120, f32 + FMPY f121 = f121, f32 + FMPY f122 = f122, f32 + FMPY f123 = f123, f32 + ;; + FNMA f112 = f120, f33, f112 + FNMA f113 = f121, f33, f113 + FNMA f114 = f122, f33, f114 + FNMA f115 = f123, f33, f115 + ;; + FNMA f104 = f120, f34, f104 + FNMA f105 = f121, f34, f105 + FNMA f106 = f122, f34, f106 + FNMA f107 = f123, f34, f107 + ;; + FNMA f96 = f120, f35, f96 + FNMA f97 = f121, f35, f97 + FNMA f98 = f122, f35, f98 + FNMA f99 = f123, f35, f99 + ;; + FNMA f88 = f120, f36, f88 + FNMA f89 = f121, f36, f89 + FNMA f90 = f122, f36, f90 + FNMA f91 = f123, f36, f91 + ;; + FNMA f80 = f120, f37, f80 + FNMA f81 = f121, f37, f81 + FNMA f82 = f122, f37, f82 + FNMA f83 = f123, f37, f83 + ;; + FNMA f72 = f120, f38, f72 + FNMA f73 = f121, f38, f73 + FNMA f74 = f122, f38, f74 + FNMA f75 = f123, f38, f75 + ;; + FNMA f64 = f120, f39, f64 + FNMA f65 = f121, f39, f65 + FNMA f66 = f122, f39, f66 + FNMA f67 = f123, f39, f67 + ;; + FMPY f112 = f112, f40 + FMPY f113 = f113, f40 + FMPY f114 = f114, f40 + FMPY f115 = f115, f40 + ;; + FNMA f104 = f112, f41, f104 + FNMA f105 = f113, f41, f105 + FNMA f106 = f114, f41, f106 + FNMA f107 = f115, f41, f107 + ;; + FNMA f96 = f112, f42, f96 + FNMA f97 = f113, f42, f97 + FNMA f98 = f114, f42, f98 + FNMA f99 = f115, f42, f99 + ;; + FNMA f88 = f112, f43, f88 + FNMA f89 = f113, f43, f89 + FNMA f90 = f114, f43, f90 + FNMA f91 = f115, f43, f91 + ;; + FNMA f80 = f112, f44, f80 + FNMA f81 = f113, f44, f81 + FNMA f82 = f114, f44, f82 + FNMA f83 = f115, f44, f83 + ;; + FNMA f72 = f112, f45, f72 + FNMA f73 = f113, f45, f73 + FNMA f74 = f114, f45, f74 + FNMA f75 = f115, f45, f75 + ;; + FNMA f64 = f112, f46, f64 + FNMA f65 = f113, f46, f65 + FNMA f66 = f114, f46, f66 + FNMA f67 = f115, f46, f67 + ;; + FMPY f104 = f104, f47 + FMPY f105 = f105, f47 + FMPY f106 = f106, f47 + FMPY f107 = f107, f47 + ;; + FNMA f96 = f104, f48, f96 + FNMA f97 = f105, f48, f97 + FNMA f98 = f106, f48, f98 + FNMA f99 = f107, f48, f99 + ;; + FNMA f88 = f104, f49, f88 + FNMA f89 = f105, f49, f89 + FNMA f90 = f106, f49, f90 + FNMA f91 = f107, f49, f91 + ;; + FNMA f80 = f104, f50, f80 + FNMA f81 = f105, f50, f81 + FNMA f82 = f106, f50, f82 + FNMA f83 = f107, f50, f83 + ;; + FNMA f72 = f104, f51, f72 + FNMA f73 = f105, f51, f73 + FNMA f74 = f106, f51, f74 + FNMA f75 = f107, f51, f75 + ;; + FNMA f64 = f104, f52, f64 + FNMA f65 = f105, f52, f65 + FNMA f66 = f106, f52, f66 + FNMA f67 = f107, f52, f67 + ;; + FMPY f96 = f96, f53 + FMPY f97 = f97, f53 + FMPY f98 = f98, f53 + FMPY f99 = f99, f53 + ;; + FNMA f88 = f96, f54, f88 + FNMA f89 = f97, f54, f89 + FNMA f90 = f98, f54, f90 + FNMA f91 = f99, f54, f91 + ;; + FNMA f80 = f96, f55, f80 + FNMA f81 = f97, f55, f81 + FNMA f82 = f98, f55, f82 + FNMA f83 = f99, f55, f83 + ;; + FNMA f72 = f96, f56, f72 + FNMA f73 = f97, f56, f73 + FNMA f74 = f98, f56, f74 + FNMA f75 = f99, f56, f75 + ;; + FNMA f64 = f96, f57, f64 + FNMA f65 = f97, f57, f65 + FNMA f66 = f98, f57, f66 + FNMA f67 = f99, f57, f67 + ;; + FMPY f88 = f88, f58 + FMPY f89 = f89, f58 + FMPY f90 = f90, f58 + FMPY f91 = f91, f58 + ;; + FNMA f80 = f88, f59, f80 + FNMA f81 = f89, f59, f81 + FNMA f82 = f90, f59, f82 + FNMA f83 = f91, f59, f83 + ;; + FNMA f72 = f88, f60, f72 + FNMA f73 = f89, f60, f73 + FNMA f74 = f90, f60, f74 + FNMA f75 = f91, f60, f75 + ;; + FNMA f64 = f88, f61, f64 + FNMA f65 = f89, f61, f65 + FNMA f66 = f90, f61, f66 + FNMA f67 = f91, f61, f67 + ;; + FMPY f80 = f80, f16 + FMPY f81 = f81, f16 + FMPY f82 = f82, f16 + FMPY f83 = f83, f16 + ;; + FNMA f72 = f80, f17, f72 + FNMA f73 = f81, f17, f73 + FNMA f74 = f82, f17, f74 + FNMA f75 = f83, f17, f75 + ;; + FNMA f64 = f80, f18, f64 + FNMA f65 = f81, f18, f65 + FNMA f66 = f82, f18, f66 + FNMA f67 = f83, f18, f67 + ;; + FMPY f72 = f72, f19 + FMPY f73 = f73, f19 + FMPY f74 = f74, f19 + FMPY f75 = f75, f19 + ;; + FNMA f64 = f72, f20, f64 + FNMA f65 = f73, f20, f65 + FNMA f66 = f74, f20, f66 + FNMA f67 = f75, f20, f67 + ;; + FMPY f64 = f64, f21 + FMPY f65 = f65, f21 + FMPY f66 = f66, f21 + FMPY f67 = f67, f21 + ;; + adds AOFFSET = 24 * SIZE, AOFFSET + adds AOFFSET2 = 24 * SIZE, AOFFSET2 + ;; + STFD [AOFFSET] = f112, SIZE + STFD [AOFFSET2] = f120, SIZE + ;; + STFD [AOFFSET] = f113, SIZE + STFD [AOFFSET2] = f121, SIZE + ;; + STFD [AOFFSET] = f114, SIZE + STFD [AOFFSET2] = f122, SIZE + ;; + STFD [AOFFSET] = f115, - 11 * SIZE + STFD [AOFFSET2] = f123, - 11 * SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f104, SIZE + ;; + STFD [AOFFSET] = f97, SIZE + STFD [AOFFSET2] = f105, SIZE + ;; + STFD [AOFFSET] = f98, SIZE + STFD [AOFFSET2] = f106, SIZE + ;; + STFD [AOFFSET] = f99, - 11 * SIZE + STFD [AOFFSET2] = f107, - 11 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f89, SIZE + ;; + STFD [AOFFSET] = f82, SIZE + STFD [AOFFSET2] = f90, SIZE + ;; + STFD [AOFFSET] = f83, - 11 * SIZE + STFD [AOFFSET2] = f91, - 11 * SIZE + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f72, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f74, SIZE + ;; + STFD [AOFFSET] = f67, - 3 * SIZE + STFD [AOFFSET2] = f75, - 3 * SIZE + ;; + + #endif + { .mmf + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + #ifdef LN + adds C3 = -4 * SIZE, C3 + #else + nop __LINE__ + #endif + } + ;; + { .mmi + #ifndef LN + STFD [C1 ] = f67, SIZE + #else + STFD [C1 ] = f67, - 3 * SIZE + #endif + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + #ifdef LN + adds C4 = -4 * SIZE, C4 + #else + nop __LINE__ + #endif + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + } + ;; + { .mmi + #ifndef LN + STFD [C2 ] = f75, SIZE + #else + STFD [C2 ] = f75, - 3 * SIZE + #endif + #ifdef LN + adds C5 = -4 * SIZE, C5 + #else + nop __LINE__ + #endif + } + ;; + { .mmf + STFD [C3 ] = f80, SIZE + mov f80 = f0 + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + } + ;; + { .mmi + STFD [C3 ] = f82, SIZE + #ifdef LN + adds C6 = -4 * SIZE, C6 + #else + nop __LINE__ + #endif + } + ;; + { .mmi + #ifndef LN + STFD [C3 ] = f83, SIZE + #else + STFD [C3 ] = f83, - 3 * SIZE + #endif + } + ;; + { .mmf + STFD [C4 ] = f88, SIZE + mov f88 = f0 + } + ;; + { .mmi + STFD [C4 ] = f89, SIZE + #ifdef LN + adds C8 = -4 * SIZE, C8 + #else + nop __LINE__ + #endif + } + ;; + { .mmi + STFD [C4 ] = f90, SIZE + } + ;; + { .mmi + #ifndef LN + STFD [C4 ] = f91, SIZE + #else + STFD [C4 ] = f91, - 3 * SIZE + #endif + nop __LINE__ + } + ;; + { .mmf + STFD [C5 ] = f96, SIZE + mov f96 = f0 + } + ;; + { .mmi + STFD [C5 ] = f97, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C5 ] = f98, SIZE + #ifdef LN + adds C7 = -4 * SIZE, C7 + #else + nop __LINE__ + #endif + } + ;; + { .mmi + #ifndef LN + STFD [C5 ] = f99, SIZE + #else + STFD [C5 ] = f99, - 3 * SIZE + #endif + } + ;; + { .mmf + STFD [C6 ] = f104, SIZE + mov f104 = f0 + } + ;; + { .mmi + STFD [C6 ] = f105, SIZE + shladd r2 = K, BASE_SHIFT, r0 + } + ;; + { .mmi + STFD [C6 ] = f106, SIZE + sub L = K, KK + } + ;; + { .mmi + #ifndef LN + STFD [C6 ] = f107, SIZE + #else + STFD [C6 ] = f107, - 3 * SIZE + #endif + #ifdef RT + shladd AORIG = r2, 2, AORIG + #else + nop __LINE__ + #endif + } + ;; + { .mmf + STFD [C7 ] = f112, SIZE + mov f112 = f0 + } + ;; + { .mmi + STFD [C7 ] = f113, SIZE + #if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 + #else + nop __LINE__ + #endif + } + ;; + { .mmi + STFD [C7 ] = f114, SIZE + #if defined(LT) || defined(RN) + shladd AOFFSET = L, 2, AOFFSET + #else + nop __LINE__ + #endif + } + ;; + { .mmi + #ifndef LN + STFD [C7 ] = f115, SIZE + #else + STFD [C7 ] = f115, - 3 * SIZE + #endif + #if defined(LT) || defined(RN) + shladd BOFFSET = L, 3, BOFFSET + #else + nop __LINE__ + #endif + } + ;; + { .mmf + STFD [C8 ] = f120, SIZE + mov f120 = f0 + } + ;; + { .mmi + STFD [C8 ] = f121, SIZE + #ifdef LT + adds KK = 4, KK + #elif defined LN + adds KK = -4, KK + #else + nop __LINE__ + #endif + } + ;; + { .mmi + STFD [C8 ] = f122, SIZE + #if defined(LT) || defined(RN) + mov L = KK + #else + sub L = K, KK + #endif + } + ;; + { .mmb + #ifndef LN + STFD [C8 ] = f123, SIZE + #else + STFD [C8 ] = f123, - 3 * SIZE + #endif + } + ;; + .align 8 + +.L010: + { .mib + cmp.gt p6, p0 = 8, M + shr I = M, 3 + (p6) br.cond.dpnt .L049 + } + ;; + .align 8 + +.L011: + { .mmi + cmp.ne p7, p0 = r0, L + shladd r3 = KK, BASE_SHIFT, r0 + shl r2 = K, 3 + BASE_SHIFT + } + ;; + { .mmi + shladd BOFFSET = r3, 3, B + sub AORIG = AORIG, r2 + nop __LINE__ + } + ;; + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + setf.d f64 = r0 + mov f72 = f0 + } + { .mfi + setf.d f80 = r0 + mov f88 = f0 + shladd AOFFSET = r3, 3, AORIG + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + setf.d f96 = r0 + mov f104 = f0 + } + { .mfb + setf.d f112 = r0 + mov f120 = f0 + nop __LINE__ + } + ;; + { .mmf + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + setf.d f65 = r0 + mov f73 = f0 + } + { .mfb + setf.d f89 = r0 + mov f81 = f0 + nop __LINE__ + } + ;; + { .mmf + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + setf.d f97 = r0 + mov f105 = f0 + } + { .mfb + setf.d f113 = r0 + mov f121 = f0 + nop __LINE__ + } + ;; + { .mmf + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + setf.d f66 = r0 + mov f74 = f0 + } + { .mfb + setf.d f82 = r0 + mov f90 = f0 + nop __LINE__ + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + setf.d f98 = r0 + mov f106 = f0 + } + { .mfi + setf.d f114 = r0 + mov f122 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + setf.d f67 = r0 + mov f75 = f0 + } + { .mfi + setf.d f83 = r0 + mov f91 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmf + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + setf.d f99 = r0 + mov f107 = f0 + } + { .mfi + setf.d f115 = r0 + mov f123 = f0 + adds L = 1, L + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f68 = r0 + mov f76 = f0 + } + { .mfi + setf.d f84 = r0 + mov f92 = f0 + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f100 = r0 + mov f108 = f0 + } + { .mfi + setf.d f116 = r0 + mov f124 = f0 + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f69 = r0 + mov f77 = f0 + } + { .mfi + setf.d f85 = r0 + mov f93 = f0 + tbit.z p12, p0 = L, 0 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f101 = r0 + mov f109 = f0 + } + { .mfi + setf.d f117 = r0 + mov f125 = f0 + shr L = L, 1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f70 = r0 + mov f78 = f0 + } + { .mfi + setf.d f86 = r0 + mov f94 = f0 + adds L = -1, L + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f102 = r0 + mov f110 = f0 + } + { .mfi + setf.d f118 = r0 + mov f126 = f0 + mov ar.lc = L + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f71 = r0 + mov f79 = f0 + } + { .mfi + setf.d f87 = r0 + mov f95 = f0 + cmp.eq p6, p0 = -1, L + } + ;; + { .mmf + CPREFETCH [PREC] + setf.d f103 = r0 + mov f111 = f0 + } + { .mfb + setf.d f119 = r0 + mov f127 = f0 + (p6) br.cond.dpnt .L018 + } + ;; + .align 8 + +.L012: +/* 1 */ + { .mfi + lfetch.fault.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; +/* 2 */ + { .mfb + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + cmp.ne p4, p5 = 0, L + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; +/* 3 */ + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + adds C9 = 4 * SIZE, C1 + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; +/* 4 */ + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + adds C10 = 4 * SIZE, C2 + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; +/* 5 */ + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + adds C11 = 4 * SIZE, C3 + FMA f73 = f33, f49, f73 // A2 * B2 + nop __LINE__ + } + ;; +/* 6 */ + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + adds C12 = 4 * SIZE, C4 + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; +/* 7 */ + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + adds C13 = 4 * SIZE, C5 + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; +/* 8 */ + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + adds C14 = 4 * SIZE, C6 + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; +/* 9 */ + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + adds C15 = 4 * SIZE, C7 + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; +/* 10 */ + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + adds C16 = 4 * SIZE, C8 + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; +/* 11 */ + { .mfb + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f106 = f34, f53, f106 // A3 * B6 + nop __LINE__ + } + ;; +/* 12 */ + { .mfb + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f122 = f34, f55, f122 // A3 * B8 + nop __LINE__ + } + ;; +/* 13 */ + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; +/* 14 */ + { .mfb + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; +/* 15 */ + { .mfb + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f107 = f35, f53, f107 // A4 * B6 + nop __LINE__ + } + ;; +/* 16 */ + { .mfb + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f123 = f35, f55, f123 // A4 * B8 + nop __LINE__ + } + ;; +/* 17 */ + { .mfb + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; +/* 18 */ + { .mfb + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f92 = f36, f51, f92 // A5 * B4 + nop __LINE__ + } + ;; +/* 19 */ + { .mfb + nop __LINE__ + FMA f100 = f36, f52, f100 // A5 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f108 = f36, f53, f108 // A5 * B6 + nop __LINE__ + } + ;; +/* 20 */ + { .mfb + nop __LINE__ + FMA f116 = f36, f54, f116 // A5 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f124 = f36, f55, f124 // A5 * B8 + nop __LINE__ + } + ;; +/* 21 */ + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; +/* 22 */ + { .mfb + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f93 = f37, f51, f93 // A6 * B4 + nop __LINE__ + } + ;; +/* 23 */ + { .mfb + nop __LINE__ + FMA f101 = f37, f52, f101 // A6 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f109 = f37, f53, f109 // A6 * B6 + nop __LINE__ + } + ;; +/* 24 */ + { .mfb + nop __LINE__ + FMA f117 = f37, f54, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f125 = f37, f55, f125 // A6 * B8 + nop __LINE__ + } + ;; +/* 25 */ + { .mfb + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; +/* 26 */ + { .mfb + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f94 = f38, f51, f94 // A7 * B4 + nop __LINE__ + } + ;; +/* 27 */ + { .mfb + nop __LINE__ + FMA f102 = f38, f52, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f110 = f38, f53, f110 // A7 * B6 + nop __LINE__ + } + ;; +/* 28 */ + { .mfb + nop __LINE__ + FMA f118 = f38, f54, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f126 = f38, f55, f126 // A7 * B8 + nop __LINE__ + } + ;; +/* 29 */ + { .mfb + nop __LINE__ + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; +/* 30 */ + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f95 = f39, f51, f95 // A8 * B4 + nop __LINE__ + } + ;; +/* 31 */ + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f103 = f39, f52, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f111 = f39, f53, f111 // A8 * B6 + nop __LINE__ + } + ;; +/* 32 */ + { .mfb + nop __LINE__ + FMA f119 = f39, f54, f119 // A8 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f127 = f39, f55, f127 // A8 * B8 + nop __LINE__ + } + ;; +/* 33 */ + { .mfb + nop __LINE__ + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; +/* 34 */ + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; +/* 35 */ + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; +/* 36 */ + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; +/* 37 */ + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; +/* 38 */ + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; +/* 39 */ + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; +/* 40 */ + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + nop __LINE__ + } + ;; + /* 41 */ + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; +/* 42 */ + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; +/* 43 */ + { .mfb + nop __LINE__ + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f106 = f42, f61, f106 // A3 * B6 + nop __LINE__ + } + ;; +/* 44 */ + { .mfb + nop __LINE__ + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f122 = f42, f63, f122 // A3 * B8 + nop __LINE__ + } + ;; +/* 45 */ + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; +/* 46 */ + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; +/* 47 */ + { .mfb + nop __LINE__ + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f107 = f43, f61, f107 // A4 * B6 + nop __LINE__ + } + ;; +/* 48 */ + { .mfb + nop __LINE__ + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f123 = f43, f63, f123 // A4 * B8 + nop __LINE__ + } + ;; +/* 49 */ + { .mfb + nop __LINE__ + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; +/* 50 */ + { .mfb + nop __LINE__ + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f92 = f44, f59, f92 // A5 * B4 + nop __LINE__ + } + ;; +/* 51 */ + { .mfb + nop __LINE__ + (p3) FMA f100 = f44, f60, f100 // A5 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f108 = f44, f61, f108 // A5 * B6 + nop __LINE__ + } + ;; +/* 52 */ + { .mfb + nop __LINE__ + (p3) FMA f116 = f44, f62, f116 // A5 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f124 = f44, f63, f124 // A5 * B8 + nop __LINE__ + } + ;; +/* 53 */ + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; +/* 54 */ + { .mfb + nop __LINE__ + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f93 = f45, f59, f93 // A6 * B4 + nop __LINE__ + } + ;; +/* 55 */ + { .mfb + nop __LINE__ + (p3) FMA f101 = f45, f60, f101 // A6 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f109 = f45, f61, f109 // A6 * B6 + nop __LINE__ + } + ;; +/* 56 */ + { .mfb + nop __LINE__ + (p3) FMA f117 = f45, f62, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f125 = f45, f63, f125 // A6 * B8 + nop __LINE__ + } + ;; +/* 57 */ + { .mfb + nop __LINE__ + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; +/* 58 */ + { .mfb + nop __LINE__ + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f94 = f46, f59, f94 // A7 * B4 + nop __LINE__ + } + ;; +/* 59 */ + { .mfb + nop __LINE__ + (p3) FMA f102 = f46, f60, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f110 = f46, f61, f110 // A7 * B6 + nop __LINE__ + } + ;; +/* 60 */ + { .mfb + nop __LINE__ + (p3) FMA f118 = f46, f62, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f126 = f46, f63, f126 // A7 * B8 + nop __LINE__ + } + ;; +/* 61 */ + { .mfb + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + nop __LINE__ + } + ;; +/* 62 */ + { .mfb + nop __LINE__ + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f95 = f47, f59, f95 // A8 * B4 + nop __LINE__ + } + ;; +/* 63 */ + { .mfb + nop __LINE__ + (p3) FMA f103 = f47, f60, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f111 = f47, f61, f111 // A8 * B6 + nop __LINE__ + } + ;; +/* 64 */ + { .mfi + nop __LINE__ + (p3) FMA f119 = f47, f62, f119 // A8 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f127 = f47, f63, f127 // A8 * B8 + br.cloop.sptk.few .L012 + } + ;; + +.L018: + adds r2 = -8, KK + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 3, AORIG + shladd BOFFSET = r2, 3, B + ;; + + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FSUB f64 = f32, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f72 = f33, f72 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + FSUB f80 = f34, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f88 = f35, f88 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + FSUB f96 = f36, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f104 = f37, f104 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + FSUB f112 = f38, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f120 = f39, f120 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FSUB f65 = f40, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f73 = f41, f73 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FSUB f81 = f42, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f89 = f43, f89 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FSUB f97 = f44, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f105 = f45, f105 + nop __LINE__ + } + ;; + { .mfi + LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FSUB f113 = f46, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f121 = f47, f121 + nop __LINE__ + } + ;; + { .mfi + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + FSUB f66 = f48, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f74 = f49, f74 + nop __LINE__ + } + ;; + { .mfi + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + FSUB f82 = f50, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f90 = f51, f90 + nop __LINE__ + } + ;; + { .mfi + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + FSUB f98 = f52, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f106 = f53, f106 + nop __LINE__ + } + ;; + { .mfi + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + FSUB f114 = f54, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f122 = f55, f122 + nop __LINE__ + } + ;; + { .mfi + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + FSUB f67 = f56, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f75 = f57, f75 + nop __LINE__ + } + ;; + { .mfi + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + FSUB f83 = f58, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f91 = f59, f91 + nop __LINE__ + } + ;; + { .mfi + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + FSUB f99 = f60, f99 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f107 = f61, f107 + nop __LINE__ + } + ;; + { .mfi + LDFPD f46, f47 = [BOFFSET], 2 * SIZE + FSUB f115 = f62, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f123 = f63, f123 + nop __LINE__ + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FSUB f68 = f32, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f76 = f33, f76 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + FSUB f84 = f34, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f92 = f35, f92 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + FSUB f100 = f36, f100 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f108 = f37, f108 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + FSUB f116 = f38, f116 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f124 = f39, f124 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FSUB f69 = f40, f69 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f77 = f41, f77 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FSUB f85 = f42, f85 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f93 = f43, f93 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FSUB f101 = f44, f101 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f109 = f45, f109 + nop __LINE__ + } + ;; + { .mfi + LDFPD f62, f63 = [BOFFSET] + FSUB f117 = f46, f117 + adds BOFFSET = -62 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FSUB f125 = f47, f125 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f70 = f48, f70 +#ifdef LN + adds AOFFSET = 62 * SIZE, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FSUB f78 = f49, f78 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f86 = f50, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f94 = f51, f94 + nop __LINE__ + } + ;; + { .mfi +#ifdef LN + LDFPD f33, f32 = [AOFFSET] +#else + LDFPD f32, f33 = [AOFFSET] +#endif + FSUB f102 = f52, f102 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f110 = f53, f110 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f118 = f54, f118 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f126 = f55, f126 +#ifdef LN + adds AOFFSET = - 2 * SIZE, AOFFSET +#else + adds AOFFSET = 2 * SIZE, AOFFSET +#endif + } + ;; + { .mfi + nop __LINE__ + FSUB f71 = f56, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f79 = f57, f79 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f87 = f58, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f95 = f59, f95 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f103 = f60, f103 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f111 = f61, f111 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f119 = f62, f119 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f127 = f63, f127 + nop __LINE__ + } + ;; + + { .mfi + LDFPD f35, f34 = [AOFFSET] + FMPY f71 = f71, f32 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f103 = f103, f32 + adds BOFFSET2 = 4 * SIZE, BOFFSET + } + ;; + { .mfi + LDFPD f37, f36 = [AOFFSET] + FMPY f79 = f79, f32 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f111 = f111, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f39, f38 = [AOFFSET] + FMPY f87 = f87, f32 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f119 = f119, f32 + nop __LINE__ + } + ;; + { .mfi + LDFD f40 = [AOFFSET], -2 * SIZE + FMPY f95 = f95, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f127 = f127, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f42, f41 = [AOFFSET] + FNMA f70 = f71, f33, f70 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f102 = f103, f33, f102 + nop __LINE__ + } + ;; + { .mfi + LDFPD f44, f43 = [AOFFSET] + FNMA f78 = f79, f33, f78 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f110 = f111, f33, f110 + nop __LINE__ + } + ;; + { .mfi + LDFPD f46, f45 = [AOFFSET] + FNMA f86 = f87, f33, f86 + adds AOFFSET = - 4 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f118 = f119, f33, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f48, f47 = [AOFFSET] + FNMA f94 = f95, f33, f94 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f126 = f127, f33, f126 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f49 = [AOFFSET] + FNMA f69 = f71, f34, f69 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f101 = f103, f34, f101 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f51 = [AOFFSET] + FNMA f77 = f79, f34, f77 + adds AOFFSET = - 4 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f109 = f111, f34, f109 + nop __LINE__ + } + ;; + { .mfi + LDFD f53 = [AOFFSET], -2 * SIZE + FNMA f85 = f87, f34, f85 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f117 = f119, f34, f117 + nop __LINE__ + } + ;; + { .mfi + LDFPD f55, f54 = [AOFFSET] + FNMA f93 = f95, f34, f93 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f125 = f127, f34, f125 + nop __LINE__ + } + ;; + { .mfi + LDFPD f57, f56 = [AOFFSET] + FNMA f68 = f71, f35, f68 + adds AOFFSET = - 6 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f100 = f103, f35, f100 + nop __LINE__ + } + ;; + { .mfi + LDFPD f59, f58 = [AOFFSET] + FNMA f76 = f79, f35, f76 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f108 = f111, f35, f108 + nop __LINE__ + } + ;; + { .mfi + LDFPD f61, f60 = [AOFFSET] + FNMA f84 = f87, f35, f84 + adds AOFFSET = - 6 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f116 = f119, f35, f116 + nop __LINE__ + } + ;; + { .mfi + LDFD f16 = [AOFFSET], -2 * SIZE + FNMA f92 = f95, f35, f92 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f124 = f127, f35, f124 + nop __LINE__ + } + ;; + { .mfi + LDFPD f18, f17 = [AOFFSET] + FNMA f67 = f71, f36, f67 + adds AOFFSET = - 8 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f99 = f103, f36, f99 + nop __LINE__ + } + ;; + { .mfi + LDFPD f20, f19 = [AOFFSET] + FNMA f75 = f79, f36, f75 + adds AOFFSET = - 8 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f107 = f111, f36, f107 + nop __LINE__ + } + ;; + { .mfi + LDFD f21 = [AOFFSET] + FNMA f83 = f87, f36, f83 + adds BOFFSET = 56 * SIZE, BOFFSET + } + { .mfi + FNMA f115 = f119, f36, f115 + adds BOFFSET2 = 56 * SIZE, BOFFSET2 + } + ;; + FNMA f91 = f95, f36, f91 + FNMA f123 = f127, f36, f123 + ;; + FNMA f66 = f71, f37, f66 + FNMA f98 = f103, f37, f98 + FNMA f74 = f79, f37, f74 + FNMA f106 = f111, f37, f106 + FNMA f82 = f87, f37, f82 + FNMA f114 = f119, f37, f114 + FNMA f90 = f95, f37, f90 + FNMA f122 = f127, f37, f122 + ;; + FNMA f65 = f71, f38, f65 + FNMA f97 = f103, f38, f97 + FNMA f73 = f79, f38, f73 + FNMA f105 = f111, f38, f105 + FNMA f81 = f87, f38, f81 + FNMA f113 = f119, f38, f113 + FNMA f89 = f95, f38, f89 + FNMA f121 = f127, f38, f121 + ;; + FNMA f64 = f71, f39, f64 + FNMA f96 = f103, f39, f96 + FNMA f72 = f79, f39, f72 + FNMA f104 = f111, f39, f104 + FNMA f80 = f87, f39, f80 + FNMA f112 = f119, f39, f112 + FNMA f88 = f95, f39, f88 + FNMA f120 = f127, f39, f120 + ;; + FMPY f70 = f70, f40 + FMPY f102 = f102, f40 + FMPY f78 = f78, f40 + FMPY f110 = f110, f40 + FMPY f86 = f86, f40 + FMPY f118 = f118, f40 + FMPY f94 = f94, f40 + FMPY f126 = f126, f40 + ;; + FNMA f69 = f70, f41, f69 + FNMA f101 = f102, f41, f101 + FNMA f77 = f78, f41, f77 + FNMA f109 = f110, f41, f109 + FNMA f85 = f86, f41, f85 + FNMA f117 = f118, f41, f117 + FNMA f93 = f94, f41, f93 + FNMA f125 = f126, f41, f125 + ;; + FNMA f68 = f70, f42, f68 + FNMA f100 = f102, f42, f100 + FNMA f76 = f78, f42, f76 + FNMA f108 = f110, f42, f108 + FNMA f84 = f86, f42, f84 + FNMA f116 = f118, f42, f116 + FNMA f92 = f94, f42, f92 + FNMA f124 = f126, f42, f124 + ;; + FNMA f67 = f70, f43, f67 + FNMA f99 = f102, f43, f99 + FNMA f75 = f78, f43, f75 + FNMA f107 = f110, f43, f107 + FNMA f83 = f86, f43, f83 + FNMA f115 = f118, f43, f115 + FNMA f91 = f94, f43, f91 + FNMA f123 = f126, f43, f123 + ;; + FNMA f66 = f70, f44, f66 + FNMA f98 = f102, f44, f98 + FNMA f74 = f78, f44, f74 + FNMA f106 = f110, f44, f106 + FNMA f82 = f86, f44, f82 + FNMA f114 = f118, f44, f114 + FNMA f90 = f94, f44, f90 + FNMA f122 = f126, f44, f122 + ;; + FNMA f65 = f70, f45, f65 + FNMA f97 = f102, f45, f97 + FNMA f73 = f78, f45, f73 + FNMA f105 = f110, f45, f105 + FNMA f81 = f86, f45, f81 + FNMA f113 = f118, f45, f113 + FNMA f89 = f94, f45, f89 + FNMA f121 = f126, f45, f121 + ;; + FNMA f64 = f70, f46, f64 + FNMA f96 = f102, f46, f96 + FNMA f72 = f78, f46, f72 + FNMA f104 = f110, f46, f104 + FNMA f80 = f86, f46, f80 + FNMA f112 = f118, f46, f112 + FNMA f88 = f94, f46, f88 + FNMA f120 = f126, f46, f120 + ;; + FMPY f69 = f69, f47 + FMPY f101 = f101, f47 + FMPY f77 = f77, f47 + FMPY f109 = f109, f47 + FMPY f85 = f85, f47 + FMPY f117 = f117, f47 + FMPY f93 = f93, f47 + FMPY f125 = f125, f47 + ;; + FNMA f68 = f69, f48, f68 + FNMA f100 = f101, f48, f100 + FNMA f76 = f77, f48, f76 + FNMA f108 = f109, f48, f108 + FNMA f84 = f85, f48, f84 + FNMA f116 = f117, f48, f116 + FNMA f92 = f93, f48, f92 + FNMA f124 = f125, f48, f124 + ;; + FNMA f67 = f69, f49, f67 + FNMA f99 = f101, f49, f99 + FNMA f75 = f77, f49, f75 + FNMA f107 = f109, f49, f107 + FNMA f83 = f85, f49, f83 + FNMA f115 = f117, f49, f115 + FNMA f91 = f93, f49, f91 + FNMA f123 = f125, f49, f123 + ;; + FNMA f66 = f69, f50, f66 + FNMA f98 = f101, f50, f98 + FNMA f74 = f77, f50, f74 + FNMA f106 = f109, f50, f106 + FNMA f82 = f85, f50, f82 + FNMA f114 = f117, f50, f114 + FNMA f90 = f93, f50, f90 + FNMA f122 = f125, f50, f122 + ;; + FNMA f65 = f69, f51, f65 + FNMA f97 = f101, f51, f97 + FNMA f73 = f77, f51, f73 + FNMA f105 = f109, f51, f105 + FNMA f81 = f85, f51, f81 + FNMA f113 = f117, f51, f113 + FNMA f89 = f93, f51, f89 + FNMA f121 = f125, f51, f121 + ;; + FNMA f64 = f69, f52, f64 + FNMA f96 = f101, f52, f96 + FNMA f72 = f77, f52, f72 + FNMA f104 = f109, f52, f104 + FNMA f80 = f85, f52, f80 + FNMA f112 = f117, f52, f112 + FNMA f88 = f93, f52, f88 + FNMA f120 = f125, f52, f120 + ;; + FMPY f68 = f68, f53 + FMPY f100 = f100, f53 + FMPY f76 = f76, f53 + FMPY f108 = f108, f53 + FMPY f84 = f84, f53 + FMPY f116 = f116, f53 + FMPY f92 = f92, f53 + FMPY f124 = f124, f53 + ;; + FNMA f67 = f68, f54, f67 + FNMA f99 = f100, f54, f99 + FNMA f75 = f76, f54, f75 + FNMA f107 = f108, f54, f107 + FNMA f83 = f84, f54, f83 + FNMA f115 = f116, f54, f115 + FNMA f91 = f92, f54, f91 + FNMA f123 = f124, f54, f123 + ;; + FNMA f66 = f68, f55, f66 + FNMA f98 = f100, f55, f98 + FNMA f74 = f76, f55, f74 + FNMA f106 = f108, f55, f106 + FNMA f82 = f84, f55, f82 + FNMA f114 = f116, f55, f114 + FNMA f90 = f92, f55, f90 + FNMA f122 = f124, f55, f122 + ;; + FNMA f65 = f68, f56, f65 + FNMA f97 = f100, f56, f97 + FNMA f73 = f76, f56, f73 + FNMA f105 = f108, f56, f105 + FNMA f81 = f84, f56, f81 + FNMA f113 = f116, f56, f113 + FNMA f89 = f92, f56, f89 + FNMA f121 = f124, f56, f121 + ;; + FNMA f64 = f68, f57, f64 + FNMA f96 = f100, f57, f96 + FNMA f72 = f76, f57, f72 + FNMA f104 = f108, f57, f104 + FNMA f80 = f84, f57, f80 + FNMA f112 = f116, f57, f112 + FNMA f88 = f92, f57, f88 + FNMA f120 = f124, f57, f120 + ;; + FMPY f67 = f67, f58 + FMPY f99 = f99, f58 + FMPY f75 = f75, f58 + FMPY f107 = f107, f58 + FMPY f83 = f83, f58 + FMPY f115 = f115, f58 + FMPY f91 = f91, f58 + FMPY f123 = f123, f58 + ;; + FNMA f66 = f67, f59, f66 + FNMA f98 = f99, f59, f98 + FNMA f74 = f75, f59, f74 + FNMA f106 = f107, f59, f106 + FNMA f82 = f83, f59, f82 + FNMA f114 = f115, f59, f114 + FNMA f90 = f91, f59, f90 + FNMA f122 = f123, f59, f122 + ;; + FNMA f65 = f67, f60, f65 + FNMA f97 = f99, f60, f97 + FNMA f73 = f75, f60, f73 + FNMA f105 = f107, f60, f105 + FNMA f81 = f83, f60, f81 + FNMA f113 = f115, f60, f113 + FNMA f89 = f91, f60, f89 + FNMA f121 = f123, f60, f121 + ;; + { .mfi + STFD [BOFFSET] = f71, SIZE + FNMA f64 = f67, f61, f64 + } + { .mfi + STFD [BOFFSET2] = f103, SIZE + FNMA f96 = f99, f61, f96 + } + ;; + { .mfi + STFD [BOFFSET] = f79, SIZE + FNMA f72 = f75, f61, f72 + } + { .mfi + STFD [BOFFSET2] = f111, SIZE + FNMA f104 = f107, f61, f104 + } + ;; + { .mfi + STFD [BOFFSET] = f87, SIZE + FNMA f80 = f83, f61, f80 + } + { .mfi + STFD [BOFFSET2] = f119, SIZE + FNMA f112 = f115, f61, f112 + } + ;; + { .mfi + STFD [BOFFSET] = f95, - 11 * SIZE + FNMA f88 = f91, f61, f88 + } + { .mfi + STFD [BOFFSET2] = f127, - 11 * SIZE + FNMA f120 = f123, f61, f120 + } + ;; + { .mfi + STFD [BOFFSET] = f70, SIZE + FMPY f66 = f66, f16 + } + { .mfi + STFD [BOFFSET2] = f102, SIZE + FMPY f98 = f98, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f78, SIZE + FMPY f74 = f74, f16 + } + { .mfi + STFD [BOFFSET2] = f110, SIZE + FMPY f106 = f106, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f86, SIZE + FMPY f82 = f82, f16 + } + { .mfi + STFD [BOFFSET2] = f118, SIZE + FMPY f114 = f114, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f94, - 11 * SIZE + FMPY f90 = f90, f16 + } + { .mfi + STFD [BOFFSET2] = f126, - 11 * SIZE + FMPY f122 = f122, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f69, SIZE + FNMA f65 = f66, f17, f65 + } + { .mfi + STFD [BOFFSET2] = f101, SIZE + FNMA f97 = f98, f17, f97 + } + ;; + { .mfi + STFD [BOFFSET] = f77, SIZE + FNMA f73 = f74, f17, f73 + } + { .mfi + STFD [BOFFSET2] = f109, SIZE + FNMA f105 = f106, f17, f105 + } + ;; + { .mfi + STFD [BOFFSET] = f85, SIZE + FNMA f81 = f82, f17, f81 + } + { .mfi + STFD [BOFFSET2] = f117, SIZE + FNMA f113 = f114, f17, f113 + } + ;; + { .mfi + STFD [BOFFSET] = f93, - 11 * SIZE + FNMA f89 = f90, f17, f89 + } + { .mfi + STFD [BOFFSET2] = f125, - 11 * SIZE + FNMA f121 = f122, f17, f121 + } + ;; + { .mfi + STFD [BOFFSET] = f68, SIZE + FNMA f64 = f66, f18, f64 + } + { .mfi + STFD [BOFFSET2] = f100, SIZE + FNMA f96 = f98, f18, f96 + } + ;; + { .mfi + STFD [BOFFSET] = f76, SIZE + FNMA f72 = f74, f18, f72 + } + { .mfi + STFD [BOFFSET2] = f108, SIZE + FNMA f104 = f106, f18, f104 + } + ;; + { .mfi + STFD [BOFFSET] = f84, SIZE + FNMA f80 = f82, f18, f80 + } + { .mfi + STFD [BOFFSET2] = f116, SIZE + FNMA f112 = f114, f18, f112 + } + ;; + { .mfi + STFD [BOFFSET] = f92, - 11 * SIZE + FNMA f88 = f90, f18, f88 + } + { .mfi + STFD [BOFFSET2] = f124, - 11 * SIZE + FNMA f120 = f122, f18, f120 + } + ;; + { .mfi + STFD [BOFFSET] = f67, SIZE + FMPY f65 = f65, f19 + } + { .mfi + STFD [BOFFSET2] = f99, SIZE + FMPY f97 = f97, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f75, SIZE + FMPY f73 = f73, f19 + } + { .mfi + STFD [BOFFSET2] = f107, SIZE + FMPY f105 = f105, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f83, SIZE + FMPY f81 = f81, f19 + } + { .mfi + STFD [BOFFSET2] = f115, SIZE + FMPY f113 = f113, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f91, - 11 * SIZE + FMPY f89 = f89, f19 + } + { .mfi + STFD [BOFFSET2] = f123, - 11 * SIZE + FMPY f121 = f121, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + FNMA f64 = f65, f20, f64 + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + FNMA f96 = f97, f20, f96 + } + ;; + { .mfi + STFD [BOFFSET] = f74, SIZE + FNMA f72 = f73, f20, f72 + } + { .mfi + STFD [BOFFSET2] = f106, SIZE + FNMA f104 = f105, f20, f104 + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + FNMA f80 = f81, f20, f80 + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + FNMA f112 = f113, f20, f112 + } + ;; + { .mfi + STFD [BOFFSET] = f90, -11 * SIZE + FNMA f88 = f89, f20, f88 + } + { .mfi + STFD [BOFFSET2] = f122, -11 * SIZE + FNMA f120 = f121, f20, f120 + } + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + FMPY f64 = f64, f21 + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + FMPY f96 = f96, f21 + } + ;; + { .mfi + STFD [BOFFSET] = f73, SIZE + FMPY f72 = f72, f21 + } + { .mfi + STFD [BOFFSET2] = f105, SIZE + FMPY f104 = f104, f21 + } + ;; + { .mfi + STFD [BOFFSET] = f81, SIZE + FMPY f80 = f80, f21 + } + { .mfi + STFD [BOFFSET2] = f113, SIZE + FMPY f112 = f112, f21 + } + ;; + { .mfi + STFD [BOFFSET] = f89, - 11 * SIZE + FMPY f88 = f88, f21 + } + { .mfi + STFD [BOFFSET2] = f121, - 11 * SIZE + FMPY f120 = f120, f21 + } + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + adds C1 = -8 * SIZE, C1 + } + ;; + { .mmi + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f104, SIZE + adds C2 = -8 * SIZE, C2 + } + ;; + { .mmi + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f88, - 3 * SIZE + STFD [BOFFSET2] = f120, - 3 * SIZE + adds C9 = 4 * SIZE, C1 + } + ;; + + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + adds C10 = 4 * SIZE, C2 + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + adds C3 = -8 * SIZE, C3 + } + ;; + { .mmi + STFD [C1 ] = f67, - 3 * SIZE + STFD [C9 ] = f71 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + STFD [C10] = f76, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + STFD [C10] = f77, SIZE + adds C4 = -8 * SIZE, C4 + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + STFD [C10] = f78, SIZE + adds C12 = 4 * SIZE, C4 + } + ;; + { .mmi + STFD [C2 ] = f75, - 3 * SIZE + STFD [C10] = f79 + adds C5 = -8 * SIZE, C5 + } + ;; + { .mmf + STFD [C3 ] = f80, SIZE + STFD [C11] = f84, SIZE + mov f80 = f0 + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + STFD [C11] = f85, SIZE + adds C13 = 4 * SIZE, C5 + } + ;; + { .mmi + STFD [C3 ] = f82, SIZE + STFD [C11] = f86, SIZE + adds C6 = -8 * SIZE, C6 + } + ;; + { .mmi + STFD [C3 ] = f83, - 3 * SIZE + STFD [C11] = f87 + adds C14 = 4 * SIZE, C6 + } + ;; + { .mmf + STFD [C4 ] = f88, SIZE + STFD [C12] = f92, SIZE + mov f88 = f0 + } + ;; + { .mmi + STFD [C4 ] = f89, SIZE + STFD [C12] = f93, SIZE + adds C8 = -8 * SIZE, C8 + } + ;; + { .mmi + STFD [C4 ] = f90, SIZE + STFD [C12] = f94, SIZE + adds C16 = 4 * SIZE, C8 + } + ;; + { .mmi + STFD [C4 ] = f91, - 3 * SIZE + STFD [C12] = f95 + cmp.ne p6, p0 = 1, I + } + ;; + { .mmf + STFD [C5 ] = f96, SIZE + STFD [C13] = f100, SIZE + mov f96 = f0 + } + ;; + { .mmi + STFD [C5 ] = f97, SIZE + STFD [C13] = f101, SIZE + adds I = -1, I + } + ;; + { .mmi + STFD [C5 ] = f98, SIZE + STFD [C13] = f102, SIZE + adds C7 = -8 * SIZE, C7 + } + ;; + { .mmi + STFD [C5 ] = f99, - 3 * SIZE + STFD [C13] = f103 + adds C15 = 4 * SIZE, C7 + } + ;; + { .mmf + STFD [C6 ] = f104, SIZE + STFD [C14] = f108, SIZE + mov f104 = f0 + } + ;; + { .mmi + STFD [C6 ] = f105, SIZE + STFD [C14] = f109, SIZE + shladd r2 = K, BASE_SHIFT, r0 + } + ;; + { .mmi + STFD [C6 ] = f106, SIZE + STFD [C14] = f110, SIZE + sub L = K, KK + } + ;; + { .mmi + STFD [C6 ] = f107, - 3 * SIZE + STFD [C14] = f111 + nop __LINE__ + } + ;; + { .mmf + STFD [C7 ] = f112, SIZE + STFD [C15] = f116, SIZE + mov f112 = f0 + } + ;; + { .mmi + STFD [C7 ] = f113, SIZE + STFD [C15] = f117, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C7 ] = f114, SIZE + STFD [C15] = f118, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C7 ] = f115, - 3 * SIZE + STFD [C15] = f119 + nop __LINE__ + } + ;; + { .mmf + STFD [C8 ] = f120, SIZE + STFD [C16] = f124, SIZE + mov f120 = f0 + } + ;; + { .mmi + STFD [C8 ] = f121, SIZE + STFD [C16] = f125, SIZE + adds KK = -8, KK + } + ;; + { .mmi + STFD [C8 ] = f122, SIZE + STFD [C16] = f126, SIZE + sub L = K, KK + } + ;; + { .mmb + STFD [C8 ] = f123, - 3 * SIZE + STFD [C16] = f127 + (p6) br.cond.dptk .L011 + } + ;; + +.L049: + { .mmi + adds J = -1, J + mov AOFFSET = A + shladd KK8 = K, BASE_SHIFT, r0 + } + ;; + { .mmb + shladd B = KK8, 3, B + cmp.lt p6, p0 = 0, J + (p6) br.cond.dptk .L000 + } + ;; + .align 8 + +.L050: + { .mib + setf.d f64 = r0 + tbit.z p6, p0 = N, 2 + (p6) br.cond.dpnt .L090 + } + ;; + +#ifdef RT + { .mmi + shladd r3 = LDC, 2, r0 + nop __LINE__ + shl r2 = K, 2 + BASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, r3 + nop __LINE__ + } +#endif + ;; + { .mfi + mov C1 = C // coffset1 = c + 0 * ldc +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + } + { .mmf + add C2 = LDC, C // coffset2 = c + 1 * ldc + shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc + } + ;; + { .mfi +#ifndef RT + shladd C = LDC, 2, C // coffset += 8 * ldc +#else + nop __LINE__ +#endif +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + }{ .mfb + shladd C4 = LDC, 1, C2 + } + ;; + + mov f72 = f0 + mov f80 = f0 + mov f88 = f0 + mov f65 = f0 + mov f73 = f0 + mov f81 = f0 + mov f89 = f0 + + + + tbit.z p6,p7 = M, 0 + (p6) br.cond.dptk .L070 + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 0 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + add AOFFSET = r3, AORIG + } + ;; +#endif + { .mmi + adds L = 1, L + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mii + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + adds L = -1, L + } + ;; + { .mmi + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFD f32 = [AOFFSET], 1 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L088 + } + ;; + +.L082: + { .mfb + cmp.ne p4, p5 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mmf + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + } + ;; + { .mib + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + { .mmb + nop __LINE__ + adds L = -1, L + br.cloop.sptk.few .L082 + } + ;; + +.L088: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + ;; +#endif + +#ifdef LN + LDFD f32 = [AOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + FMPY f80 = f80, f32 + FMPY f88 = f88, f32 + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + adds C1 = -1 * SIZE, C1 + } + ;; + { .mmi + STFD [BOFFSET] = f72, SIZE + adds C2 = -1 * SIZE, C2 + } + ;; + { .mmi + STFD [BOFFSET] = f80, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f88, - 3 * SIZE + } + ;; + adds C3 = -1 * SIZE, C3 + adds C4 = -1 * SIZE, C4 + ;; +#endif + +#ifdef LT + LDFD f32 = [AOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + FMPY f80 = f80, f32 + FMPY f88 = f88, f32 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + ;; + STFD [BOFFSET] = f88, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f39, f40 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET], -15 * SIZE + + FMPY f64 = f64, f32 + ;; + FNMA f72 = f64, f33, f72 + ;; + FNMA f80 = f64, f34, f80 + ;; + FNMA f88 = f64, f35, f88 + ;; + FMPY f72 = f72, f36 + ;; + FNMA f80 = f72, f37, f80 + ;; + FNMA f88 = f72, f38, f88 + ;; + FMPY f80 = f80, f39 + ;; + FNMA f88 = f80, f40, f88 + ;; + FMPY f88 = f88, f41 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + ;; + STFD [AOFFSET] = f88, -3 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 14 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f35, f34 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], - 2 * SIZE + ;; + LDFPD f38, f37 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f40, f39 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET] + ;; + FMPY f88 = f88, f32 + ;; + FNMA f80 = f88, f33, f80 + ;; + FNMA f72 = f88, f34, f72 + ;; + FNMA f64 = f88, f35, f64 + ;; + FMPY f80 = f80, f36 + ;; + FNMA f72 = f80, f37, f72 + ;; + FNMA f64 = f80, f38, f64 + ;; + FMPY f72 = f72, f39 + ;; + FNMA f64 = f72, f40, f64 + ;; + FMPY f64 = f64, f41 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + ;; + STFD [AOFFSET] = f88, - 3 * SIZE + ;; +#endif + +#ifndef LN + STFD [C1 ] = f64, SIZE +#else + STFD [C1 ] = f64 +#endif +#ifndef LN + STFD [C2 ] = f72, SIZE +#else + STFD [C2 ] = f72 +#endif +#ifndef LN + STFD [C3 ] = f80, SIZE +#else + STFD [C3 ] = f80 +#endif +#ifndef LN + STFD [C4 ] = f88, SIZE +#else + STFD [C4 ] = f88 +#endif + ;; + + mov f64 = f0 + mov f72 = f0 + mov f80 = f0 + mov f88 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + add AOFFSET = L, AOFFSET +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 2, BOFFSET +#else + nop __LINE__ +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 8 + +.L070: + tbit.z p6,p7 = M, 1 + (p6) br.cond.dptk .L060 + ;; + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + setf.d f73 = r0 + mov f65 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B + mov f65 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + { .mfi + mov f81 = f0 + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + mov f89 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mmf + adds L = -1, L + } + ;; + { .mmf + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L078 + } + ;; + .align 8 + +.L072: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + br.cloop.sptk.few .L072 + } + ;; +.L078: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + FSUB f65 = f36, f65 + FSUB f73 = f37, f73 + FSUB f81 = f38, f81 + FSUB f89 = f39, f89 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + + FSUB f72 = f34, f72 + FSUB f73 = f35, f73 + + FSUB f80 = f36, f80 + FSUB f81 = f37, f81 + + FSUB f88 = f38, f88 + FSUB f89 = f39, f89 + ;; +#endif + +#ifdef LN + adds AOFFSET = 2 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET] + ;; + FMPY f65 = f65, f32 + FMPY f73 = f73, f32 + FMPY f81 = f81, f32 + FMPY f89 = f89, f32 + ;; + FNMA f64 = f65, f33, f64 + FNMA f72 = f73, f33, f72 + FNMA f80 = f81, f33, f80 + FNMA f88 = f89, f33, f88 + ;; + FMPY f64 = f64, f34 + FMPY f72 = f72, f34 + FMPY f80 = f80, f34 + FMPY f88 = f88, f34 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, - 3 * SIZE + STFD [BOFFSET2] = f89, - 3 * SIZE + ;; + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 + adds C3 = -2 * SIZE, C3 + adds C4 = -2 * SIZE, C4 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET], - 3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + FMPY f80 = f80, f32 + FMPY f88 = f88, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + FNMA f81 = f80, f33, f81 + FNMA f89 = f88, f33, f89 + ;; + FMPY f65 = f65, f34 + FMPY f73 = f73, f34 + FMPY f81 = f81, f34 + FMPY f89 = f89, f34 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, -3 * SIZE + STFD [BOFFSET2] = f89, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f39, f40 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + ;; + FNMA f80 = f64, f34, f80 + FNMA f81 = f65, f34, f81 + ;; + FNMA f88 = f64, f35, f88 + FNMA f89 = f65, f35, f89 + ;; + FMPY f72 = f72, f36 + FMPY f73 = f73, f36 + ;; + FNMA f80 = f72, f37, f80 + FNMA f81 = f73, f37, f81 + ;; + FNMA f88 = f72, f38, f88 + FNMA f89 = f73, f38, f89 + ;; + FMPY f80 = f80, f39 + FMPY f81 = f81, f39 + ;; + FNMA f88 = f80, f40, f88 + FNMA f89 = f81, f40, f89 + ;; + FMPY f88 = f88, f41 + FMPY f89 = f89, f41 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f80, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f73, -3 * SIZE + STFD [AOFFSET2] = f89, -3 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 14 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f35, f34 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], - 2 * SIZE + ;; + LDFPD f38, f37 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f40, f39 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET] + ;; + FMPY f88 = f88, f32 + FMPY f89 = f89, f32 + ;; + FNMA f80 = f88, f33, f80 + FNMA f81 = f89, f33, f81 + ;; + FNMA f72 = f88, f34, f72 + FNMA f73 = f89, f34, f73 + ;; + FNMA f64 = f88, f35, f64 + FNMA f65 = f89, f35, f65 + ;; + FMPY f80 = f80, f36 + FMPY f81 = f81, f36 + ;; + FNMA f72 = f80, f37, f72 + FNMA f73 = f81, f37, f73 + ;; + FNMA f64 = f80, f38, f64 + FNMA f65 = f81, f38, f65 + ;; + FMPY f72 = f72, f39 + FMPY f73 = f73, f39 + ;; + FNMA f64 = f72, f40, f64 + FNMA f65 = f73, f40, f65 + ;; + FMPY f64 = f64, f41 + FMPY f65 = f65, f41 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f65, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f88, -3 * SIZE + STFD [AOFFSET2] = f89, -3 * SIZE + ;; +#endif + STFD [C1 ] = f64, SIZE + mov f64 = f0 + ;; +#ifndef LN + STFD [C1 ] = f65, SIZE +#else + STFD [C1 ] = f65, -SIZE +#endif + ;; + STFD [C2 ] = f72, SIZE + mov f72 = f0 + ;; +#ifndef LN + STFD [C2 ] = f73, SIZE +#else + STFD [C2 ] = f73, -SIZE +#endif + ;; + STFD [C3 ] = f80, SIZE + mov f80 = f0 + ;; +#ifndef LN + STFD [C3 ] = f81, SIZE +#else + STFD [C3 ] = f81, - SIZE +#endif + ;; + STFD [C4 ] = f88, SIZE + mov f88 = f0 + ;; +#ifndef LN + STFD [C4 ] = f89, SIZE +#else + STFD [C4 ] = f89, -SIZE +#endif + ;; + mov f96 = f0 + ;; + mov f104 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#else + nop __LINE__ +#endif + ;; + mov f112 = f0 + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 1, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + mov f120 = f0 + } + ;; + { .mmi +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + .align 8 + +.L060: + + + + + + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L051 + ;; + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + { .mfi + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + cmp.eq p6, p0 = -1, L + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + } + { .mfi + mov ar.lc = L + } + ;; + mov f64 = f0 + mov f65 = f0 + mov f66 = f0 + mov f67 = f0 + mov f72 = f0 + mov f73 = f0 + mov f74 = f0 + mov f75 = f0 + mov f80 = f0 + mov f81 = f0 + mov f82 = f0 + mov f83 = f0 + mov f88 = f0 + mov f89 = f0 + mov f90 = f0 + mov f91 = f0 + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + { .mfb + (p6) br.cond.dpnt .L068 + } + ;; + .align 8 + +.L062: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + (p5) adds C10 = 2 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + (p5) adds C11 = 2 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + (p5) adds C12 = 2 * SIZE, C4 + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + br.cloop.sptk.few .L062 + } + ;; + .align 8 + +.L068: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET] + adds BOFFSET = -14 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + ;; + FSUB f65 = f36, f65 + FSUB f73 = f37, f73 + FSUB f81 = f38, f81 + FSUB f89 = f39, f89 + ;; + FSUB f66 = f40, f66 + FSUB f74 = f41, f74 + FSUB f82 = f42, f82 + FSUB f90 = f43, f90 + ;; + FSUB f67 = f44, f67 + FSUB f75 = f45, f75 + FSUB f83 = f46, f83 + FSUB f91 = f47, f91 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET] + adds AOFFSET = -14 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + + FSUB f72 = f36, f72 + FSUB f73 = f37, f73 + FSUB f74 = f38, f74 + FSUB f75 = f39, f75 + + FSUB f80 = f40, f80 + FSUB f81 = f41, f81 + FSUB f82 = f42, f82 + FSUB f83 = f43, f83 + + FSUB f88 = f44, f88 + FSUB f89 = f45, f89 + FSUB f90 = f46, f90 + FSUB f91 = f47, f91 + ;; +#endif + +#ifdef LN + adds AOFFSET = 14 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f35, f34 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], - 2 * SIZE + ;; + LDFPD f38, f37 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f40, f39 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET] + ;; + FMPY f67 = f67, f32 + FMPY f75 = f75, f32 + FMPY f83 = f83, f32 + FMPY f91 = f91, f32 + ;; + FNMA f66 = f67, f33, f66 + FNMA f74 = f75, f33, f74 + FNMA f82 = f83, f33, f82 + FNMA f90 = f91, f33, f90 + ;; + FNMA f65 = f67, f34, f65 + FNMA f73 = f75, f34, f73 + FNMA f81 = f83, f34, f81 + FNMA f89 = f91, f34, f89 + ;; + FNMA f64 = f67, f35, f64 + FNMA f72 = f75, f35, f72 + FNMA f80 = f83, f35, f80 + FNMA f88 = f91, f35, f88 + ;; + FMPY f66 = f66, f36 + FMPY f74 = f74, f36 + FMPY f82 = f82, f36 + FMPY f90 = f90, f36 + ;; + FNMA f65 = f66, f37, f65 + FNMA f73 = f74, f37, f73 + FNMA f81 = f82, f37, f81 + FNMA f89 = f90, f37, f89 + ;; + FNMA f64 = f66, f38, f64 + FNMA f72 = f74, f38, f72 + FNMA f80 = f82, f38, f80 + FNMA f88 = f90, f38, f88 + ;; + FMPY f65 = f65, f39 + FMPY f73 = f73, f39 + FMPY f81 = f81, f39 + FMPY f89 = f89, f39 + ;; + FNMA f64 = f65, f40, f64 + FNMA f72 = f73, f40, f72 + FNMA f80 = f81, f40, f80 + FNMA f88 = f89, f40, f88 + ;; + FMPY f64 = f64, f41 + FMPY f72 = f72, f41 + FMPY f80 = f80, f41 + FMPY f88 = f88, f41 + ;; + adds BOFFSET = 8 * SIZE, BOFFSET + adds BOFFSET2 = 8 * SIZE, BOFFSET2 + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f74, SIZE + STFD [BOFFSET2] = f75, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f83, SIZE + ;; + STFD [BOFFSET] = f90, - 11 * SIZE + STFD [BOFFSET2] = f91, - 11 * SIZE + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, -3 * SIZE + STFD [BOFFSET2] = f89, -3 * SIZE + ;; + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 + adds C3 = -4 * SIZE, C3 + adds C4 = -4 * SIZE, C4 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f39, f40 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + FMPY f80 = f80, f32 + FMPY f88 = f88, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + FNMA f81 = f80, f33, f81 + FNMA f89 = f88, f33, f89 + ;; + FNMA f66 = f64, f34, f66 + FNMA f74 = f72, f34, f74 + FNMA f82 = f80, f34, f82 + FNMA f90 = f88, f34, f90 + ;; + FNMA f67 = f64, f35, f67 + FNMA f75 = f72, f35, f75 + FNMA f83 = f80, f35, f83 + FNMA f91 = f88, f35, f91 + ;; + FMPY f65 = f65, f36 + FMPY f73 = f73, f36 + FMPY f81 = f81, f36 + FMPY f89 = f89, f36 + ;; + FNMA f66 = f65, f37, f66 + FNMA f74 = f73, f37, f74 + FNMA f82 = f81, f37, f82 + FNMA f90 = f89, f37, f90 + ;; + FNMA f67 = f65, f38, f67 + FNMA f75 = f73, f38, f75 + FNMA f83 = f81, f38, f83 + FNMA f91 = f89, f38, f91 + ;; + FMPY f66 = f66, f39 + FMPY f74 = f74, f39 + FMPY f82 = f82, f39 + FMPY f90 = f90, f39 + ;; + FNMA f67 = f66, f40, f67 + FNMA f75 = f74, f40, f75 + FNMA f83 = f82, f40, f83 + FNMA f91 = f90, f40, f91 + ;; + FMPY f67 = f67, f41 + FMPY f75 = f75, f41 + FMPY f83 = f83, f41 + FMPY f91 = f91, f41 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, 5 * SIZE + STFD [BOFFSET2] = f89, 5 * SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f74, SIZE + STFD [BOFFSET2] = f75, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f83, SIZE + ;; + STFD [BOFFSET] = f90, -11 * SIZE + STFD [BOFFSET2] = f91, -11 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f39, f40 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + FMPY f66 = f66, f32 + FMPY f67 = f67, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + FNMA f74 = f66, f33, f74 + FNMA f75 = f67, f33, f75 + ;; + FNMA f80 = f64, f34, f80 + FNMA f81 = f65, f34, f81 + FNMA f82 = f66, f34, f82 + FNMA f83 = f67, f34, f83 + ;; + FNMA f88 = f64, f35, f88 + FNMA f89 = f65, f35, f89 + FNMA f90 = f66, f35, f90 + FNMA f91 = f67, f35, f91 + ;; + FMPY f72 = f72, f36 + FMPY f73 = f73, f36 + FMPY f74 = f74, f36 + FMPY f75 = f75, f36 + ;; + FNMA f80 = f72, f37, f80 + FNMA f81 = f73, f37, f81 + FNMA f82 = f74, f37, f82 + FNMA f83 = f75, f37, f83 + ;; + FNMA f88 = f72, f38, f88 + FNMA f89 = f73, f38, f89 + FNMA f90 = f74, f38, f90 + FNMA f91 = f75, f38, f91 + ;; + FMPY f80 = f80, f39 + FMPY f81 = f81, f39 + FMPY f82 = f82, f39 + FMPY f83 = f83, f39 + ;; + FNMA f88 = f80, f40, f88 + FNMA f89 = f81, f40, f89 + FNMA f90 = f82, f40, f90 + FNMA f91 = f83, f40, f91 + ;; + FMPY f88 = f88, f41 + FMPY f89 = f89, f41 + FMPY f90 = f90, f41 + FMPY f91 = f91, f41 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f72, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f74, SIZE + ;; + STFD [AOFFSET] = f67, 5 * SIZE + STFD [AOFFSET2] = f75, 5 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f89, SIZE + ;; + STFD [AOFFSET] = f82, SIZE + STFD [AOFFSET2] = f90, SIZE + ;; + STFD [AOFFSET] = f83, -11 * SIZE + STFD [AOFFSET2] = f91, -11 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 14 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f35, f34 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], - 2 * SIZE + ;; + LDFPD f38, f37 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f40, f39 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET] + ;; + FMPY f88 = f88, f32 + FMPY f89 = f89, f32 + FMPY f90 = f90, f32 + FMPY f91 = f91, f32 + ;; + FNMA f80 = f88, f33, f80 + FNMA f81 = f89, f33, f81 + FNMA f82 = f90, f33, f82 + FNMA f83 = f91, f33, f83 + ;; + FNMA f72 = f88, f34, f72 + FNMA f73 = f89, f34, f73 + FNMA f74 = f90, f34, f74 + FNMA f75 = f91, f34, f75 + ;; + FNMA f64 = f88, f35, f64 + FNMA f65 = f89, f35, f65 + FNMA f66 = f90, f35, f66 + FNMA f67 = f91, f35, f67 + ;; + FMPY f80 = f80, f36 + FMPY f81 = f81, f36 + FMPY f82 = f82, f36 + FMPY f83 = f83, f36 + ;; + FNMA f72 = f80, f37, f72 + FNMA f73 = f81, f37, f73 + FNMA f74 = f82, f37, f74 + FNMA f75 = f83, f37, f75 + ;; + FNMA f64 = f80, f38, f64 + FNMA f65 = f81, f38, f65 + FNMA f66 = f82, f38, f66 + FNMA f67 = f83, f38, f67 + ;; + FMPY f72 = f72, f39 + FMPY f73 = f73, f39 + FMPY f74 = f74, f39 + FMPY f75 = f75, f39 + ;; + FNMA f64 = f72, f40, f64 + FNMA f65 = f73, f40, f65 + FNMA f66 = f74, f40, f66 + FNMA f67 = f75, f40, f67 + ;; + FMPY f64 = f64, f41 + FMPY f65 = f65, f41 + FMPY f66 = f66, f41 + FMPY f67 = f67, f41 + ;; + adds AOFFSET = 8 * SIZE, AOFFSET + adds AOFFSET2 = 8 * SIZE, AOFFSET2 + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f89, SIZE + ;; + STFD [AOFFSET] = f82, SIZE + STFD [AOFFSET2] = f90, SIZE + ;; + STFD [AOFFSET] = f83, - 11 * SIZE + STFD [AOFFSET2] = f91, - 11 * SIZE + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f72, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f74, SIZE + ;; + STFD [AOFFSET] = f67, - 3 * SIZE + STFD [AOFFSET2] = f75, - 3 * SIZE + ;; +#endif + { .mmf + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C1 ] = f67, SIZE +#else + STFD [C1 ] = f67, - 3 * SIZE +#endif + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C2 ] = f75, SIZE +#else + STFD [C2 ] = f75, - 3 * SIZE +#endif + } + ;; + { .mmf + STFD [C3 ] = f80, SIZE + mov f80 = f0 + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + } + ;; + { .mmi + STFD [C3 ] = f82, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C3 ] = f83, SIZE +#else + STFD [C3 ] = f83, - 3 * SIZE +#endif + } + ;; + { .mmf + STFD [C4 ] = f88, SIZE + mov f88 = f0 + } + ;; + { .mmi + STFD [C4 ] = f89, SIZE + } + ;; + { .mmi + STFD [C4 ] = f90, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C4 ] = f91, SIZE +#else + STFD [C4 ] = f91, - 3 * SIZE +#endif + nop __LINE__ + } + ;; + mov f65 = f0 + ;; + mov f73 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi +#ifdef RT + shladd AORIG = r2, 2, AORIG +#else + nop __LINE__ +#endif + } + ;; + { .mmf + mov f81 = f0 + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 2, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + mov f89 = f0 + } + ;; + { .mmi +#ifdef LT + adds KK = 4, KK +#elif defined LN + adds KK = -4, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + .align 8 + +.L051: + mov f72 = f0 + mov f80 = f0 + mov f88 = f0 + mov f65 = f0 + mov f73 = f0 + mov f81 = f0 + mov f89 = f0 + + shr I = M, 3 + ;; + cmp.eq p6, p7 = 0, I + (p6) br.cond.dpnt .L089 + ;; + .align 16 + +.L052: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 3 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + shladd AOFFSET = r3, 3, AORIG + } + ;; +#endif + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f74 = f0 + nop __LINE__ + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + setf.d f82 = r0 + mov f90 = f0 + } + ;; + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + setf.d f67 = r0 + mov f75 = f0 + } + { .mfi + setf.d f83 = r0 + mov f91 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmf + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + } + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f68 = r0 + mov f76 = f0 + } + { .mfi + setf.d f84 = r0 + mov f92 = f0 + adds L = 1, L + } + ;; + { .mmf + CPREFETCH [PREC], LDC + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f69 = r0 + mov f77 = f0 + } + { .mfi + setf.d f85 = r0 + mov f93 = f0 + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mmf + CPREFETCH [PREC] + } + ;; + { .mfi + setf.d f70 = r0 + mov f78 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + setf.d f86 = r0 + mov f94 = f0 + shr L = L, 1 + } + ;; + { .mfi + setf.d f71 = r0 + adds L = -1, L + } + ;; + { .mfi + setf.d f87 = r0 + mov f79 = f0 + mov ar.lc = L + } + { .mfb + cmp.eq p6, p0 = -1, L + mov f95 = f0 + (p6) br.cond.dpnt .L058 + } + ;; + .align 8 + +.L053: + { .mfb + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + adds C9 = 4 * SIZE, C1 + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C10 = 4 * SIZE, C2 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + adds C12 = 4 * SIZE, C4 + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f92 = f36, f51, f92 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f93 = f37, f51, f93 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f94 = f38, f51, f94 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f95 = f39, f51, f95 // A8 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f92 = f44, f59, f92 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f93 = f45, f59, f93 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f94 = f46, f59, f94 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f95 = f47, f59, f95 // A8 * B4 + br.cloop.sptk.few .L053 + } + ;; + .align 8 + +.L058: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -8, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 3, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET], 2 * SIZE + ;; + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + ;; + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + ;; + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + ;; + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [BOFFSET], 2 * SIZE + ;; + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [BOFFSET], 2 * SIZE + ;; + LDFPD f62, f63 = [BOFFSET] + adds BOFFSET = -30 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + + FSUB f65 = f36, f65 + FSUB f73 = f37, f73 + FSUB f81 = f38, f81 + FSUB f89 = f39, f89 + + FSUB f66 = f40, f66 + FSUB f74 = f41, f74 + FSUB f82 = f42, f82 + FSUB f90 = f43, f90 + + FSUB f67 = f44, f67 + FSUB f75 = f45, f75 + FSUB f83 = f46, f83 + FSUB f91 = f47, f91 + + FSUB f68 = f48, f68 + FSUB f76 = f49, f76 + FSUB f84 = f50, f84 + FSUB f92 = f51, f92 + + FSUB f69 = f52, f69 + FSUB f77 = f53, f77 + FSUB f85 = f54, f85 + FSUB f93 = f55, f93 + + FSUB f70 = f56, f70 + FSUB f78 = f57, f78 + FSUB f86 = f58, f86 + FSUB f94 = f59, f94 + + FSUB f71 = f60, f71 + FSUB f79 = f61, f79 + FSUB f87 = f62, f87 + FSUB f95 = f63, f95 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET], 2 * SIZE + ;; + LDFPD f48, f49 = [AOFFSET], 2 * SIZE + ;; + LDFPD f50, f51 = [AOFFSET], 2 * SIZE + ;; + LDFPD f52, f53 = [AOFFSET], 2 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET], 2 * SIZE + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET], 2 * SIZE + ;; + LDFPD f62, f63 = [AOFFSET] + adds AOFFSET = -30 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + FSUB f68 = f36, f68 + FSUB f69 = f37, f69 + FSUB f70 = f38, f70 + FSUB f71 = f39, f71 + ;; + FSUB f72 = f40, f72 + FSUB f73 = f41, f73 + FSUB f74 = f42, f74 + FSUB f75 = f43, f75 + FSUB f76 = f44, f76 + FSUB f77 = f45, f77 + FSUB f78 = f46, f78 + FSUB f79 = f47, f79 + ;; + FSUB f80 = f48, f80 + FSUB f81 = f49, f81 + FSUB f82 = f50, f82 + FSUB f83 = f51, f83 + FSUB f84 = f52, f84 + FSUB f85 = f53, f85 + FSUB f86 = f54, f86 + FSUB f87 = f55, f87 + + FSUB f88 = f56, f88 + FSUB f89 = f57, f89 + FSUB f90 = f58, f90 + FSUB f91 = f59, f91 + FSUB f92 = f60, f92 + FSUB f93 = f61, f93 + FSUB f94 = f62, f94 + FSUB f95 = f63, f95 + ;; +#endif + +#ifdef LN + adds AOFFSET = 62 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f35, f34 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f37, f36 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f39, f38 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], -2 * SIZE + ;; + LDFPD f42, f41 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f44, f43 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f46, f45 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f48, f47 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f50, f49 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f52, f51 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], -2 * SIZE + ;; + LDFPD f55, f54 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f57, f56 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFPD f59, f58 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f61, f60 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], -2 * SIZE + ;; + LDFPD f18, f17 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFPD f20, f19 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + ;; + FMPY f71 = f71, f32 + FMPY f79 = f79, f32 + FMPY f87 = f87, f32 + FMPY f95 = f95, f32 + ;; + FNMA f70 = f71, f33, f70 + FNMA f78 = f79, f33, f78 + FNMA f86 = f87, f33, f86 + FNMA f94 = f95, f33, f94 + ;; + FNMA f69 = f71, f34, f69 + FNMA f77 = f79, f34, f77 + FNMA f85 = f87, f34, f85 + FNMA f93 = f95, f34, f93 + ;; + FNMA f68 = f71, f35, f68 + FNMA f76 = f79, f35, f76 + FNMA f84 = f87, f35, f84 + FNMA f92 = f95, f35, f92 + ;; + FNMA f67 = f71, f36, f67 + FNMA f75 = f79, f36, f75 + FNMA f83 = f87, f36, f83 + FNMA f91 = f95, f36, f91 + ;; + FNMA f66 = f71, f37, f66 + FNMA f74 = f79, f37, f74 + FNMA f82 = f87, f37, f82 + FNMA f90 = f95, f37, f90 + ;; + FNMA f65 = f71, f38, f65 + FNMA f73 = f79, f38, f73 + FNMA f81 = f87, f38, f81 + FNMA f89 = f95, f38, f89 + ;; + FNMA f64 = f71, f39, f64 + FNMA f72 = f79, f39, f72 + FNMA f80 = f87, f39, f80 + FNMA f88 = f95, f39, f88 + ;; + FMPY f70 = f70, f40 + FMPY f78 = f78, f40 + FMPY f86 = f86, f40 + FMPY f94 = f94, f40 + ;; + FNMA f69 = f70, f41, f69 + FNMA f77 = f78, f41, f77 + FNMA f85 = f86, f41, f85 + FNMA f93 = f94, f41, f93 + ;; + FNMA f68 = f70, f42, f68 + FNMA f76 = f78, f42, f76 + FNMA f84 = f86, f42, f84 + FNMA f92 = f94, f42, f92 + ;; + FNMA f67 = f70, f43, f67 + FNMA f75 = f78, f43, f75 + FNMA f83 = f86, f43, f83 + FNMA f91 = f94, f43, f91 + ;; + FNMA f66 = f70, f44, f66 + FNMA f74 = f78, f44, f74 + FNMA f82 = f86, f44, f82 + FNMA f90 = f94, f44, f90 + ;; + FNMA f65 = f70, f45, f65 + FNMA f73 = f78, f45, f73 + FNMA f81 = f86, f45, f81 + FNMA f89 = f94, f45, f89 + ;; + FNMA f64 = f70, f46, f64 + FNMA f72 = f78, f46, f72 + FNMA f80 = f86, f46, f80 + FNMA f88 = f94, f46, f88 + ;; + FMPY f69 = f69, f47 + FMPY f77 = f77, f47 + FMPY f85 = f85, f47 + FMPY f93 = f93, f47 + ;; + FNMA f68 = f69, f48, f68 + FNMA f76 = f77, f48, f76 + FNMA f84 = f85, f48, f84 + FNMA f92 = f93, f48, f92 + ;; + FNMA f67 = f69, f49, f67 + FNMA f75 = f77, f49, f75 + FNMA f83 = f85, f49, f83 + FNMA f91 = f93, f49, f91 + ;; + FNMA f66 = f69, f50, f66 + FNMA f74 = f77, f50, f74 + FNMA f82 = f85, f50, f82 + FNMA f90 = f93, f50, f90 + ;; + FNMA f65 = f69, f51, f65 + FNMA f73 = f77, f51, f73 + FNMA f81 = f85, f51, f81 + FNMA f89 = f93, f51, f89 + ;; + FNMA f64 = f69, f52, f64 + FNMA f72 = f77, f52, f72 + FNMA f80 = f85, f52, f80 + FNMA f88 = f93, f52, f88 + ;; + FMPY f68 = f68, f53 + FMPY f76 = f76, f53 + FMPY f84 = f84, f53 + FMPY f92 = f92, f53 + ;; + FNMA f67 = f68, f54, f67 + FNMA f75 = f76, f54, f75 + FNMA f83 = f84, f54, f83 + FNMA f91 = f92, f54, f91 + ;; + FNMA f66 = f68, f55, f66 + FNMA f74 = f76, f55, f74 + FNMA f82 = f84, f55, f82 + FNMA f90 = f92, f55, f90 + ;; + FNMA f65 = f68, f56, f65 + FNMA f73 = f76, f56, f73 + FNMA f81 = f84, f56, f81 + FNMA f89 = f92, f56, f89 + ;; + FNMA f64 = f68, f57, f64 + FNMA f72 = f76, f57, f72 + FNMA f80 = f84, f57, f80 + FNMA f88 = f92, f57, f88 + ;; + FMPY f67 = f67, f58 + FMPY f75 = f75, f58 + FMPY f83 = f83, f58 + FMPY f91 = f91, f58 + ;; + FNMA f66 = f67, f59, f66 + FNMA f74 = f75, f59, f74 + FNMA f82 = f83, f59, f82 + FNMA f90 = f91, f59, f90 + ;; + FNMA f65 = f67, f60, f65 + FNMA f73 = f75, f60, f73 + FNMA f81 = f83, f60, f81 + FNMA f89 = f91, f60, f89 + ;; + FNMA f64 = f67, f61, f64 + FNMA f72 = f75, f61, f72 + FNMA f80 = f83, f61, f80 + FNMA f88 = f91, f61, f88 + ;; + FMPY f66 = f66, f16 + FMPY f74 = f74, f16 + FMPY f82 = f82, f16 + FMPY f90 = f90, f16 + ;; + FNMA f65 = f66, f17, f65 + FNMA f73 = f74, f17, f73 + FNMA f81 = f82, f17, f81 + FNMA f89 = f90, f17, f89 + ;; + FNMA f64 = f66, f18, f64 + FNMA f72 = f74, f18, f72 + FNMA f80 = f82, f18, f80 + FNMA f88 = f90, f18, f88 + ;; + FMPY f65 = f65, f19 + FMPY f73 = f73, f19 + FMPY f81 = f81, f19 + FMPY f89 = f89, f19 + ;; + FNMA f64 = f65, f20, f64 + FNMA f72 = f73, f20, f72 + FNMA f80 = f81, f20, f80 + FNMA f88 = f89, f20, f88 + ;; + FMPY f64 = f64, f21 + FMPY f72 = f72, f21 + FMPY f80 = f80, f21 + FMPY f88 = f88, f21 + ;; + + adds BOFFSET = 24 * SIZE, BOFFSET + adds BOFFSET2 = 24 * SIZE, BOFFSET2 + ;; + STFD [BOFFSET] = f70, SIZE + STFD [BOFFSET2] = f71, SIZE + ;; + STFD [BOFFSET] = f78, SIZE + STFD [BOFFSET2] = f79, SIZE + ;; + STFD [BOFFSET] = f86, SIZE + STFD [BOFFSET2] = f87, SIZE + ;; + STFD [BOFFSET] = f94, - 11 * SIZE + STFD [BOFFSET2] = f95, - 11 * SIZE + ;; + STFD [BOFFSET] = f68, SIZE + STFD [BOFFSET2] = f69, SIZE + ;; + STFD [BOFFSET] = f76, SIZE + STFD [BOFFSET2] = f77, SIZE + ;; + STFD [BOFFSET] = f84, SIZE + STFD [BOFFSET2] = f85, SIZE + ;; + STFD [BOFFSET] = f92, - 11 * SIZE + STFD [BOFFSET2] = f93, - 11 * SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f74, SIZE + STFD [BOFFSET2] = f75, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f83, SIZE + ;; + STFD [BOFFSET] = f90, - 11 * SIZE + STFD [BOFFSET2] = f91, - 11 * SIZE + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, - 3 * SIZE + STFD [BOFFSET2] = f89, - 3 * SIZE + ;; + adds C1 = -8 * SIZE, C1 + adds C2 = -8 * SIZE, C2 + adds C3 = -8 * SIZE, C3 + adds C4 = -8 * SIZE, C4 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [AOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [AOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f47, f48 = [AOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [AOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET] + adds AOFFSET = 7 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f19, f20 = [AOFFSET] + adds AOFFSET = 9 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + adds AOFFSET = -63 * SIZE, AOFFSET + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + FMPY f80 = f80, f32 + FMPY f88 = f88, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + FNMA f81 = f80, f33, f81 + FNMA f89 = f88, f33, f89 + ;; + FNMA f66 = f64, f34, f66 + FNMA f74 = f72, f34, f74 + FNMA f82 = f80, f34, f82 + FNMA f90 = f88, f34, f90 + ;; + FNMA f67 = f64, f35, f67 + FNMA f75 = f72, f35, f75 + FNMA f83 = f80, f35, f83 + FNMA f91 = f88, f35, f91 + ;; + FNMA f68 = f64, f36, f68 + FNMA f76 = f72, f36, f76 + FNMA f84 = f80, f36, f84 + FNMA f92 = f88, f36, f92 + ;; + FNMA f69 = f64, f37, f69 + FNMA f77 = f72, f37, f77 + FNMA f85 = f80, f37, f85 + FNMA f93 = f88, f37, f93 + ;; + FNMA f70 = f64, f38, f70 + FNMA f78 = f72, f38, f78 + FNMA f86 = f80, f38, f86 + FNMA f94 = f88, f38, f94 + ;; + FNMA f71 = f64, f39, f71 + FNMA f79 = f72, f39, f79 + FNMA f87 = f80, f39, f87 + FNMA f95 = f88, f39, f95 + ;; + FMPY f65 = f65, f40 + FMPY f73 = f73, f40 + FMPY f81 = f81, f40 + FMPY f89 = f89, f40 + ;; + FNMA f66 = f65, f41, f66 + FNMA f74 = f73, f41, f74 + FNMA f82 = f81, f41, f82 + FNMA f90 = f89, f41, f90 + ;; + FNMA f67 = f65, f42, f67 + FNMA f75 = f73, f42, f75 + FNMA f83 = f81, f42, f83 + FNMA f91 = f89, f42, f91 + ;; + FNMA f68 = f65, f43, f68 + FNMA f76 = f73, f43, f76 + FNMA f84 = f81, f43, f84 + FNMA f92 = f89, f43, f92 + ;; + FNMA f69 = f65, f44, f69 + FNMA f77 = f73, f44, f77 + FNMA f85 = f81, f44, f85 + FNMA f93 = f89, f44, f93 + ;; + FNMA f70 = f65, f45, f70 + FNMA f78 = f73, f45, f78 + FNMA f86 = f81, f45, f86 + FNMA f94 = f89, f45, f94 + ;; + FNMA f71 = f65, f46, f71 + FNMA f79 = f73, f46, f79 + FNMA f87 = f81, f46, f87 + FNMA f95 = f89, f46, f95 + ;; + FMPY f66 = f66, f47 + FMPY f74 = f74, f47 + FMPY f82 = f82, f47 + FMPY f90 = f90, f47 + ;; + FNMA f67 = f66, f48, f67 + FNMA f75 = f74, f48, f75 + FNMA f83 = f82, f48, f83 + FNMA f91 = f90, f48, f91 + ;; + FNMA f68 = f66, f49, f68 + FNMA f76 = f74, f49, f76 + FNMA f84 = f82, f49, f84 + FNMA f92 = f90, f49, f92 + ;; + FNMA f69 = f66, f50, f69 + FNMA f77 = f74, f50, f77 + FNMA f85 = f82, f50, f85 + FNMA f93 = f90, f50, f93 + ;; + FNMA f70 = f66, f51, f70 + FNMA f78 = f74, f51, f78 + FNMA f86 = f82, f51, f86 + FNMA f94 = f90, f51, f94 + ;; + FNMA f71 = f66, f52, f71 + FNMA f79 = f74, f52, f79 + FNMA f87 = f82, f52, f87 + FNMA f95 = f90, f52, f95 + ;; + FMPY f67 = f67, f53 + FMPY f75 = f75, f53 + FMPY f83 = f83, f53 + FMPY f91 = f91, f53 + ;; + FNMA f68 = f67, f54, f68 + FNMA f76 = f75, f54, f76 + FNMA f84 = f83, f54, f84 + FNMA f92 = f91, f54, f92 + ;; + FNMA f69 = f67, f55, f69 + FNMA f77 = f75, f55, f77 + FNMA f85 = f83, f55, f85 + FNMA f93 = f91, f55, f93 + ;; + FNMA f70 = f67, f56, f70 + FNMA f78 = f75, f56, f78 + FNMA f86 = f83, f56, f86 + FNMA f94 = f91, f56, f94 + ;; + FNMA f71 = f67, f57, f71 + FNMA f79 = f75, f57, f79 + FNMA f87 = f83, f57, f87 + FNMA f95 = f91, f57, f95 + ;; + FMPY f68 = f68, f58 + FMPY f76 = f76, f58 + FMPY f84 = f84, f58 + FMPY f92 = f92, f58 + ;; + FNMA f69 = f68, f59, f69 + FNMA f77 = f76, f59, f77 + FNMA f85 = f84, f59, f85 + FNMA f93 = f92, f59, f93 + ;; + FNMA f70 = f68, f60, f70 + FNMA f78 = f76, f60, f78 + FNMA f86 = f84, f60, f86 + FNMA f94 = f92, f60, f94 + ;; + FNMA f71 = f68, f61, f71 + FNMA f79 = f76, f61, f79 + FNMA f87 = f84, f61, f87 + FNMA f95 = f92, f61, f95 + ;; + FMPY f69 = f69, f16 + FMPY f77 = f77, f16 + FMPY f85 = f85, f16 + FMPY f93 = f93, f16 + ;; + FNMA f70 = f69, f17, f70 + FNMA f78 = f77, f17, f78 + FNMA f86 = f85, f17, f86 + FNMA f94 = f93, f17, f94 + ;; + FNMA f71 = f69, f18, f71 + FNMA f79 = f77, f18, f79 + FNMA f87 = f85, f18, f87 + FNMA f95 = f93, f18, f95 + ;; + FMPY f70 = f70, f19 + FMPY f78 = f78, f19 + FMPY f86 = f86, f19 + FMPY f94 = f94, f19 + ;; + FNMA f71 = f70, f20, f71 + FNMA f79 = f78, f20, f79 + FNMA f87 = f86, f20, f87 + FNMA f95 = f94, f20, f95 + ;; + FMPY f71 = f71, f21 + FMPY f79 = f79, f21 + FMPY f87 = f87, f21 + FMPY f95 = f95, f21 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, 5 * SIZE + STFD [BOFFSET2] = f89, 5 * SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f74, SIZE + STFD [BOFFSET2] = f75, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f83, SIZE + ;; + STFD [BOFFSET] = f90, 5 * SIZE + STFD [BOFFSET2] = f91, 5 * SIZE + ;; + STFD [BOFFSET] = f68, SIZE + STFD [BOFFSET2] = f69, SIZE + ;; + STFD [BOFFSET] = f76, SIZE + STFD [BOFFSET2] = f77, SIZE + ;; + STFD [BOFFSET] = f84, SIZE + STFD [BOFFSET2] = f85, SIZE + ;; + STFD [BOFFSET] = f92, 5 * SIZE + STFD [BOFFSET2] = f93, 5 * SIZE + ;; + STFD [BOFFSET] = f70, SIZE + STFD [BOFFSET2] = f71, SIZE + ;; + STFD [BOFFSET] = f78, SIZE + STFD [BOFFSET2] = f79, SIZE + ;; + STFD [BOFFSET] = f86, SIZE + STFD [BOFFSET2] = f87, SIZE + ;; + STFD [BOFFSET] = f94 + STFD [BOFFSET2] = f95 + adds C9 = 4 * SIZE, C1 + adds BOFFSET = - 27 * SIZE, BOFFSET + adds BOFFSET2 = - 27 * SIZE, BOFFSET2 + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f39, f40 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f68 = f68, f32 + FMPY f65 = f65, f32 + FMPY f69 = f69, f32 + FMPY f66 = f66, f32 + FMPY f70 = f70, f32 + FMPY f67 = f67, f32 + FMPY f71 = f71, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f76 = f68, f33, f76 + FNMA f73 = f65, f33, f73 + FNMA f77 = f69, f33, f77 + FNMA f74 = f66, f33, f74 + FNMA f78 = f70, f33, f78 + FNMA f75 = f67, f33, f75 + FNMA f79 = f71, f33, f79 + ;; + FNMA f80 = f64, f34, f80 + FNMA f84 = f68, f34, f84 + FNMA f81 = f65, f34, f81 + FNMA f85 = f69, f34, f85 + FNMA f82 = f66, f34, f82 + FNMA f86 = f70, f34, f86 + FNMA f83 = f67, f34, f83 + FNMA f87 = f71, f34, f87 + ;; + FNMA f88 = f64, f35, f88 + FNMA f92 = f68, f35, f92 + FNMA f89 = f65, f35, f89 + FNMA f93 = f69, f35, f93 + FNMA f90 = f66, f35, f90 + FNMA f94 = f70, f35, f94 + FNMA f91 = f67, f35, f91 + FNMA f95 = f71, f35, f95 + ;; + FMPY f72 = f72, f36 + FMPY f76 = f76, f36 + FMPY f73 = f73, f36 + FMPY f77 = f77, f36 + FMPY f74 = f74, f36 + FMPY f78 = f78, f36 + FMPY f75 = f75, f36 + FMPY f79 = f79, f36 + ;; + FNMA f80 = f72, f37, f80 + FNMA f84 = f76, f37, f84 + FNMA f81 = f73, f37, f81 + FNMA f85 = f77, f37, f85 + FNMA f82 = f74, f37, f82 + FNMA f86 = f78, f37, f86 + FNMA f83 = f75, f37, f83 + FNMA f87 = f79, f37, f87 + ;; + FNMA f88 = f72, f38, f88 + FNMA f92 = f76, f38, f92 + FNMA f89 = f73, f38, f89 + FNMA f93 = f77, f38, f93 + FNMA f90 = f74, f38, f90 + FNMA f94 = f78, f38, f94 + FNMA f91 = f75, f38, f91 + FNMA f95 = f79, f38, f95 + ;; + FMPY f80 = f80, f39 + FMPY f84 = f84, f39 + FMPY f81 = f81, f39 + FMPY f85 = f85, f39 + FMPY f82 = f82, f39 + FMPY f86 = f86, f39 + FMPY f83 = f83, f39 + FMPY f87 = f87, f39 + ;; + FNMA f88 = f80, f40, f88 + FNMA f92 = f84, f40, f92 + FNMA f89 = f81, f40, f89 + FNMA f93 = f85, f40, f93 + FNMA f90 = f82, f40, f90 + FNMA f94 = f86, f40, f94 + FNMA f91 = f83, f40, f91 + FNMA f95 = f87, f40, f95 + ;; + FMPY f88 = f88, f41 + FMPY f92 = f92, f41 + FMPY f89 = f89, f41 + FMPY f93 = f93, f41 + FMPY f90 = f90, f41 + FMPY f94 = f94, f41 + FMPY f91 = f91, f41 + FMPY f95 = f95, f41 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, 5 * SIZE + STFD [AOFFSET2] = f71, 5 * SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f76, SIZE + ;; + STFD [AOFFSET] = f73, SIZE + STFD [AOFFSET2] = f77, SIZE + ;; + STFD [AOFFSET] = f74, SIZE + STFD [AOFFSET2] = f78, SIZE + ;; + STFD [AOFFSET] = f75, 5 * SIZE + STFD [AOFFSET2] = f79, 5 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f84, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f85, SIZE + ;; + STFD [AOFFSET] = f82, SIZE + STFD [AOFFSET2] = f86, SIZE + ;; + STFD [AOFFSET] = f83, 5 * SIZE + STFD [AOFFSET2] = f87, 5 * SIZE + ;; + STFD [AOFFSET] = f88, SIZE + STFD [AOFFSET2] = f92, SIZE + ;; + STFD [AOFFSET] = f89, SIZE + STFD [AOFFSET2] = f93, SIZE + ;; + STFD [AOFFSET] = f90, SIZE + STFD [AOFFSET2] = f94, SIZE + ;; + STFD [AOFFSET] = f91, -27 * SIZE + STFD [AOFFSET2] = f95, -27 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 14 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f35, f34 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], -2 * SIZE + ;; + LDFPD f38, f37 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f40, f39 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET] + ;; + + FMPY f88 = f88, f32 + FMPY f92 = f92, f32 + FMPY f89 = f89, f32 + FMPY f93 = f93, f32 + FMPY f90 = f90, f32 + FMPY f94 = f94, f32 + FMPY f91 = f91, f32 + FMPY f95 = f95, f32 + ;; + FNMA f80 = f88, f33, f80 + FNMA f84 = f92, f33, f84 + FNMA f81 = f89, f33, f81 + FNMA f85 = f93, f33, f85 + FNMA f82 = f90, f33, f82 + FNMA f86 = f94, f33, f86 + FNMA f83 = f91, f33, f83 + FNMA f87 = f95, f33, f87 + ;; + FNMA f72 = f88, f34, f72 + FNMA f76 = f92, f34, f76 + FNMA f73 = f89, f34, f73 + FNMA f77 = f93, f34, f77 + FNMA f74 = f90, f34, f74 + FNMA f78 = f94, f34, f78 + FNMA f75 = f91, f34, f75 + FNMA f79 = f95, f34, f79 + ;; + FNMA f64 = f88, f35, f64 + FNMA f68 = f92, f35, f68 + FNMA f65 = f89, f35, f65 + FNMA f69 = f93, f35, f69 + FNMA f66 = f90, f35, f66 + FNMA f70 = f94, f35, f70 + FNMA f67 = f91, f35, f67 + FNMA f71 = f95, f35, f71 + ;; + FMPY f80 = f80, f36 + FMPY f84 = f84, f36 + FMPY f81 = f81, f36 + FMPY f85 = f85, f36 + FMPY f82 = f82, f36 + FMPY f86 = f86, f36 + FMPY f83 = f83, f36 + FMPY f87 = f87, f36 + ;; + FNMA f72 = f80, f37, f72 + FNMA f76 = f84, f37, f76 + FNMA f73 = f81, f37, f73 + FNMA f77 = f85, f37, f77 + FNMA f74 = f82, f37, f74 + FNMA f78 = f86, f37, f78 + FNMA f75 = f83, f37, f75 + FNMA f79 = f87, f37, f79 + ;; + FNMA f64 = f80, f38, f64 + FNMA f68 = f84, f38, f68 + FNMA f65 = f81, f38, f65 + FNMA f69 = f85, f38, f69 + FNMA f66 = f82, f38, f66 + FNMA f70 = f86, f38, f70 + FNMA f67 = f83, f38, f67 + FNMA f71 = f87, f38, f71 + ;; + FMPY f72 = f72, f39 + FMPY f76 = f76, f39 + FMPY f73 = f73, f39 + FMPY f77 = f77, f39 + FMPY f74 = f74, f39 + FMPY f78 = f78, f39 + FMPY f75 = f75, f39 + FMPY f79 = f79, f39 + ;; + FNMA f64 = f72, f40, f64 + FNMA f68 = f76, f40, f68 + FNMA f65 = f73, f40, f65 + FNMA f69 = f77, f40, f69 + FNMA f66 = f74, f40, f66 + FNMA f70 = f78, f40, f70 + FNMA f67 = f75, f40, f67 + FNMA f71 = f79, f40, f71 + ;; + FMPY f64 = f64, f41 + FMPY f68 = f68, f41 + FMPY f65 = f65, f41 + FMPY f69 = f69, f41 + FMPY f66 = f66, f41 + FMPY f70 = f70, f41 + FMPY f67 = f67, f41 + FMPY f71 = f71, f41 + ;; + adds AOFFSET = 24 * SIZE, AOFFSET + adds AOFFSET2 = 24 * SIZE, AOFFSET2 + ;; + STFD [AOFFSET] = f88, SIZE + STFD [AOFFSET2] = f92, SIZE + ;; + STFD [AOFFSET] = f89, SIZE + STFD [AOFFSET2] = f93, SIZE + ;; + STFD [AOFFSET] = f90, SIZE + STFD [AOFFSET2] = f94, SIZE + ;; + STFD [AOFFSET] = f91, - 11 * SIZE + STFD [AOFFSET2] = f95, - 11 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f84, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f85, SIZE + ;; + STFD [AOFFSET] = f82, SIZE + STFD [AOFFSET2] = f86, SIZE + ;; + STFD [AOFFSET] = f83, - 11 * SIZE + STFD [AOFFSET2] = f87, - 11 * SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f76, SIZE + ;; + STFD [AOFFSET] = f73, SIZE + STFD [AOFFSET2] = f77, SIZE + ;; + STFD [AOFFSET] = f74, SIZE + STFD [AOFFSET2] = f78, SIZE + ;; + STFD [AOFFSET] = f75, - 11 * SIZE + STFD [AOFFSET2] = f79, - 11 * SIZE + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, - 3 * SIZE + STFD [AOFFSET2] = f71, - 3 * SIZE + ;; + +#endif + adds C9 = 4 * SIZE, C1 + ;; + + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + adds C10 = 4 * SIZE, C2 + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C1 ] = f67, 5 * SIZE +#else + STFD [C1 ] = f67, - 3 * SIZE +#endif + STFD [C9 ] = f71 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + STFD [C10] = f76, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + STFD [C10] = f77, SIZE + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + STFD [C10] = f78, SIZE + adds C12 = 4 * SIZE, C4 + } + ;; + { .mmi +#ifndef LN + STFD [C2 ] = f75, 5 * SIZE +#else + STFD [C2 ] = f75, - 3 * SIZE +#endif + STFD [C10] = f79 + } + ;; + { .mmf + STFD [C3 ] = f80, SIZE + STFD [C11] = f84, SIZE + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + STFD [C11] = f85, SIZE + } + ;; + { .mmi + STFD [C3 ] = f82, SIZE + STFD [C11] = f86, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C3 ] = f83, 5 * SIZE +#else + STFD [C3 ] = f83, - 3 * SIZE +#endif + STFD [C11] = f87 + } + ;; + { .mmf + STFD [C4 ] = f88, SIZE + STFD [C12] = f92, SIZE + } + ;; + { .mmi + STFD [C4 ] = f89, SIZE + STFD [C12] = f93, SIZE + } + ;; + { .mmi + STFD [C4 ] = f90, SIZE + STFD [C12] = f94, SIZE + + } + ;; + { .mmi +#ifndef LN + STFD [C4 ] = f91, 5 * SIZE +#else + STFD [C4 ] = f91, - 3 * SIZE +#endif + STFD [C12] = f95 + cmp.ne p6, p0 = 1, I + } + ;; + adds I = -1, I + ;; + { .mmi + shladd r2 = K, BASE_SHIFT, r0 + } + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi +#ifdef RT + shladd AORIG = r2, 3, AORIG +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 3, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifdef LT + adds KK = 8, KK +#elif defined LN + adds KK = -8, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + mov f64 = f0 + mov f72 = f0 + mov f80 = f0 + mov f88 = f0 + mov f65 = f0 + mov f73 = f0 + mov f81 = f0 + mov f89 = f0 + + { .mmb + (p6) br.cond.dptk .L052 + } + ;; + .align 8 + +.L089: +#ifdef LN + shladd KK8 = K, BASE_SHIFT, r0 + ;; + shladd B = KK8, 2, B +#endif + +#if defined(LT) || defined(RN) + mov B = BOFFSET +#endif + +#ifdef RN + adds KK = 4, KK +#endif + +#ifdef RT + adds KK = -4, KK +#endif + ;; + mov AOFFSET = A + ;; + .align 16 + +.L090: + tbit.z p6, p0 = N, 1 + (p6) br.cond.dpnt .L130 + ;; + +#ifdef RT + { .mmi + shladd r3 = LDC, 1, r0 + nop __LINE__ + shl r2 = K, 1 + BASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, r3 + nop __LINE__ + } +#endif + ;; + mov f64 = f0 + mov f65 = f0 + mov f66 = f0 + mov f67 = f0 + + mov f72 = f0 + mov f73 = f0 + mov f74 = f0 + mov f75 = f0 + ;; + { .mfi + mov C1 = C // coffset1 = c + 0 * ldc +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + } + { .mmf + add C2 = LDC, C // coffset2 = c + 1 * ldc + } + ;; + { .mfi +#ifndef RT + shladd C = LDC, 1, C // coffset += 8 * ldc +#else + nop __LINE__ +#endif + mov f81 = f0 +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L110 + ;; + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 0 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + add AOFFSET = r3, AORIG + } + ;; +#endif + { .mmi + adds L = 1, L + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mii + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + adds L = -1, L + } + ;; + { .mmi + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFD f32 = [AOFFSET], 1 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L128 + } + ;; + .align 8 + +.L122: + { .mfi + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + nop __LINE__ + } + { .mmi + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + adds L = -1, L + } + { .mfb + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + br.cloop.sptk.few .L122 + } + ;; + +.L128: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET] + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + ;; +#else + LDFPD f32, f33 = [AOFFSET] + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + ;; +#endif + +#ifdef LN + LDFD f32 = [AOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + adds C1 = -1 * SIZE, C1 + } + ;; + { .mmi + STFD [BOFFSET] = f72, -SIZE + adds C2 = -1 * SIZE, C2 + } + ;; +#endif + +#ifdef LT + LDFD f32 = [AOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f72, -SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET], -3 * SIZE + ;; + FMPY f64 = f64, f32 + ;; + FNMA f72 = f64, f33, f72 + ;; + FMPY f72 = f72, f34 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f72, -SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 2 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET] + ;; + FMPY f72 = f72, f32 + ;; + FNMA f64 = f72, f33, f64 + ;; + FMPY f64 = f64, f34 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f72, -SIZE + ;; +#endif + +#ifndef LN + STFD [C1 ] = f64, SIZE +#else + STFD [C1 ] = f64 +#endif +#ifndef LN + STFD [C2 ] = f72, SIZE +#else + STFD [C2 ] = f72 +#endif + + mov f64 = f0 + mov f72 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + add AOFFSET = L, AOFFSET +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 1, BOFFSET +#else + nop __LINE__ +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 8 + +.L110: + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L100 + ;; + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + { .mfi + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mmf + adds L = -1, L + } + ;; + { .mmf + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L118 + } + ;; + +.L112: + { .mfi + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + br.cloop.sptk.few .L112 + } + ;; + .align 8 + +.L118: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f65 = f34, f65 + FSUB f73 = f35, f73 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f72 = f34, f72 + FSUB f73 = f35, f73 + ;; +#endif + +#ifdef LN + adds AOFFSET = 2 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET] + ;; + FMPY f65 = f65, f32 + FMPY f73 = f73, f32 + ;; + FNMA f64 = f65, f33, f64 + FNMA f72 = f73, f33, f72 + ;; + FMPY f64 = f64, f34 + FMPY f72 = f72, f34 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f73, - 3 * SIZE + ;; + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET], - 3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + ;; + FMPY f65 = f65, f34 + FMPY f73 = f73, f34 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f73, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET], -3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + ;; + FMPY f72 = f72, f34 + FMPY f73 = f73, f34 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + ;; + STFD [AOFFSET] = f73, -3 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 2 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET] + ;; + FMPY f72 = f72, f32 + FMPY f73 = f73, f32 + ;; + FNMA f64 = f72, f33, f64 + FNMA f65 = f73, f33, f65 + ;; + FMPY f64 = f64, f34 + FMPY f65 = f65, f34 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + ;; + STFD [AOFFSET] = f73, -3 * SIZE + ;; +#endif + STFD [C1 ] = f64, SIZE + mov f64 = f0 + ;; +#ifndef LN + STFD [C1 ] = f65, SIZE +#else + STFD [C1 ] = f65, -SIZE +#endif + ;; + STFD [C2 ] = f72, SIZE + mov f72 = f0 + ;; +#ifndef LN + STFD [C2 ] = f73, SIZE +#else + STFD [C2 ] = f73, -SIZE +#endif + ;; + mov f65 = f0 + mov f73 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#else + nop __LINE__ +#endif + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 1, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + .align 8 + +.L100: + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L091 + ;; + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f65 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + { .mfi + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + cmp.eq p6, p0 = -1, L + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + } + { .mfi + mov ar.lc = L + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + { .mfb + (p6) br.cond.dpnt .L108 + } + ;; + +.L102: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C10 = 2 * SIZE, C2 + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + br.cloop.sptk.few .L102 + } + ;; + .align 8 + +.L108: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + ;; + FSUB f65 = f34, f65 + FSUB f73 = f35, f73 + ;; + FSUB f66 = f36, f66 + FSUB f74 = f37, f74 + ;; + FSUB f67 = f38, f67 + FSUB f75 = f39, f75 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + + FSUB f72 = f36, f72 + FSUB f73 = f37, f73 + FSUB f74 = f38, f74 + FSUB f75 = f39, f75 + ;; +#endif + +#ifdef LN + adds AOFFSET = 14 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f35, f34 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], - 2 * SIZE + ;; + LDFPD f38, f37 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f40, f39 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET] + ;; + FMPY f67 = f67, f32 + FMPY f75 = f75, f32 + ;; + FNMA f66 = f67, f33, f66 + FNMA f74 = f75, f33, f74 + ;; + FNMA f65 = f67, f34, f65 + FNMA f73 = f75, f34, f73 + ;; + FNMA f64 = f67, f35, f64 + FNMA f72 = f75, f35, f72 + ;; + FMPY f66 = f66, f36 + FMPY f74 = f74, f36 + ;; + FNMA f65 = f66, f37, f65 + FNMA f73 = f74, f37, f73 + ;; + FNMA f64 = f66, f38, f64 + FNMA f72 = f74, f38, f72 + ;; + FMPY f65 = f65, f39 + FMPY f73 = f73, f39 + ;; + FNMA f64 = f65, f40, f64 + FNMA f72 = f73, f40, f72 + ;; + FMPY f64 = f64, f41 + FMPY f72 = f72, f41 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f66, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f74, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f73, -3 * SIZE + STFD [BOFFSET2] = f75, -3 * SIZE + ;; + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f39, f40 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + ;; + FNMA f66 = f64, f34, f66 + FNMA f74 = f72, f34, f74 + ;; + FNMA f67 = f64, f35, f67 + FNMA f75 = f72, f35, f75 + ;; + FMPY f65 = f65, f36 + FMPY f73 = f73, f36 + ;; + FNMA f66 = f65, f37, f66 + FNMA f74 = f73, f37, f74 + ;; + FNMA f67 = f65, f38, f67 + FNMA f75 = f73, f38, f75 + ;; + FMPY f66 = f66, f39 + FMPY f74 = f74, f39 + ;; + FNMA f67 = f66, f40, f67 + FNMA f75 = f74, f40, f75 + ;; + FMPY f67 = f67, f41 + FMPY f75 = f75, f41 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f66, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f74, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f73, -3 * SIZE + STFD [BOFFSET2] = f75, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET], -3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + FMPY f66 = f66, f32 + FMPY f67 = f67, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + FNMA f74 = f66, f33, f74 + FNMA f75 = f67, f33, f75 + ;; + FMPY f72 = f72, f34 + FMPY f73 = f73, f34 + FMPY f74 = f74, f34 + FMPY f75 = f75, f34 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f72, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f74, SIZE + ;; + STFD [AOFFSET] = f67, -3 * SIZE + STFD [AOFFSET2] = f75, -3 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 2 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET] + ;; + FMPY f72 = f72, f32 + FMPY f73 = f73, f32 + FMPY f74 = f74, f32 + FMPY f75 = f75, f32 + ;; + FNMA f64 = f72, f33, f64 + FNMA f65 = f73, f33, f65 + FNMA f66 = f74, f33, f66 + FNMA f67 = f75, f33, f67 + ;; + FMPY f64 = f64, f34 + FMPY f65 = f65, f34 + FMPY f66 = f66, f34 + FMPY f67 = f67, f34 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f72, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f74, SIZE + ;; + STFD [AOFFSET] = f67, - 3 * SIZE + STFD [AOFFSET2] = f75, - 3 * SIZE + ;; +#endif + { .mmf + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C1 ] = f67, SIZE +#else + STFD [C1 ] = f67, - 3 * SIZE +#endif + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C2 ] = f75, SIZE +#else + STFD [C2 ] = f75, - 3 * SIZE +#endif + } + ;; + mov f65 = f0 + mov f73 = f0 + mov f66 = f0 + mov f74 = f0 + mov f67 = f0 + mov f75 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi +#ifdef RT + shladd AORIG = r2, 2, AORIG +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 2, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifdef LT + adds KK = 4, KK +#elif defined LN + adds KK = -4, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + .align 8 + +.L091: + shr I = M, 3 + ;; + cmp.eq p6, p7 = 0, I + (p6) br.cond.dpnt .L129 + ;; + .align 16 + +.L092: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 3 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + shladd AOFFSET = r3, 3, AORIG + } + ;; +#endif + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + ;; + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + } + { .mfi + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmf + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + } + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + } + { .mfi + adds L = 1, L + } + ;; + { .mmf + CPREFETCH [PREC] + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mfi + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + mov ar.lc = L + } + ;; + mov f68 = f0 + mov f69 = f0 + mov f70 = f0 + mov f71 = f0 + mov f76 = f0 + mov f77 = f0 + mov f78 = f0 + mov f79 = f0 + ;; + { .mfb + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L098 + } + ;; + .align 8 + +.L093: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C9 = 4 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C10 = 4 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + adds C11 = 4 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + adds C12 = 4 * SIZE, C4 + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + br.cloop.sptk.few .L093 + } + ;; + .align 8 + +.L098: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -8, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 3, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET] + adds BOFFSET = -14 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f65 = f34, f65 + FSUB f73 = f35, f73 + + FSUB f66 = f36, f66 + FSUB f74 = f37, f74 + FSUB f67 = f38, f67 + FSUB f75 = f39, f75 + + FSUB f68 = f40, f68 + FSUB f76 = f41, f76 + FSUB f69 = f42, f69 + FSUB f77 = f43, f77 + + FSUB f70 = f44, f70 + FSUB f78 = f45, f78 + FSUB f71 = f46, f71 + FSUB f79 = f47, f79 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET] + adds AOFFSET = -14 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + FSUB f68 = f36, f68 + FSUB f69 = f37, f69 + FSUB f70 = f38, f70 + FSUB f71 = f39, f71 + ;; + FSUB f72 = f40, f72 + FSUB f73 = f41, f73 + FSUB f74 = f42, f74 + FSUB f75 = f43, f75 + FSUB f76 = f44, f76 + FSUB f77 = f45, f77 + FSUB f78 = f46, f78 + FSUB f79 = f47, f79 + ;; +#endif + +#ifdef LN + adds AOFFSET = 62 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f35, f34 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f37, f36 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f39, f38 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], -2 * SIZE + ;; + LDFPD f42, f41 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f44, f43 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f46, f45 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f48, f47 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f50, f49 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f52, f51 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], -2 * SIZE + ;; + LDFPD f55, f54 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f57, f56 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFPD f59, f58 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f61, f60 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], -2 * SIZE + ;; + LDFPD f18, f17 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFPD f20, f19 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + ;; + FMPY f71 = f71, f32 + FMPY f79 = f79, f32 + ;; + FNMA f70 = f71, f33, f70 + FNMA f78 = f79, f33, f78 + ;; + FNMA f69 = f71, f34, f69 + FNMA f77 = f79, f34, f77 + ;; + FNMA f68 = f71, f35, f68 + FNMA f76 = f79, f35, f76 + ;; + FNMA f67 = f71, f36, f67 + FNMA f75 = f79, f36, f75 + ;; + FNMA f66 = f71, f37, f66 + FNMA f74 = f79, f37, f74 + ;; + FNMA f65 = f71, f38, f65 + FNMA f73 = f79, f38, f73 + ;; + FNMA f64 = f71, f39, f64 + FNMA f72 = f79, f39, f72 + ;; + FMPY f70 = f70, f40 + FMPY f78 = f78, f40 + ;; + FNMA f69 = f70, f41, f69 + FNMA f77 = f78, f41, f77 + ;; + FNMA f68 = f70, f42, f68 + FNMA f76 = f78, f42, f76 + ;; + FNMA f67 = f70, f43, f67 + FNMA f75 = f78, f43, f75 + ;; + FNMA f66 = f70, f44, f66 + FNMA f74 = f78, f44, f74 + ;; + FNMA f65 = f70, f45, f65 + FNMA f73 = f78, f45, f73 + ;; + FNMA f64 = f70, f46, f64 + FNMA f72 = f78, f46, f72 + ;; + FMPY f69 = f69, f47 + FMPY f77 = f77, f47 + ;; + FNMA f68 = f69, f48, f68 + FNMA f76 = f77, f48, f76 + ;; + FNMA f67 = f69, f49, f67 + FNMA f75 = f77, f49, f75 + ;; + FNMA f66 = f69, f50, f66 + FNMA f74 = f77, f50, f74 + ;; + FNMA f65 = f69, f51, f65 + FNMA f73 = f77, f51, f73 + ;; + FNMA f64 = f69, f52, f64 + FNMA f72 = f77, f52, f72 + ;; + FMPY f68 = f68, f53 + FMPY f76 = f76, f53 + ;; + FNMA f67 = f68, f54, f67 + FNMA f75 = f76, f54, f75 + ;; + FNMA f66 = f68, f55, f66 + FNMA f74 = f76, f55, f74 + ;; + FNMA f65 = f68, f56, f65 + FNMA f73 = f76, f56, f73 + ;; + FNMA f64 = f68, f57, f64 + FNMA f72 = f76, f57, f72 + ;; + FMPY f67 = f67, f58 + FMPY f75 = f75, f58 + ;; + FNMA f66 = f67, f59, f66 + FNMA f74 = f75, f59, f74 + ;; + FNMA f65 = f67, f60, f65 + FNMA f73 = f75, f60, f73 + ;; + FNMA f64 = f67, f61, f64 + FNMA f72 = f75, f61, f72 + ;; + FMPY f66 = f66, f16 + FMPY f74 = f74, f16 + ;; + FNMA f65 = f66, f17, f65 + FNMA f73 = f74, f17, f73 + ;; + FNMA f64 = f66, f18, f64 + FNMA f72 = f74, f18, f72 + ;; + FMPY f65 = f65, f19 + FMPY f73 = f73, f19 + ;; + FNMA f64 = f65, f20, f64 + FNMA f72 = f73, f20, f72 + ;; + FMPY f64 = f64, f21 + FMPY f72 = f72, f21 + ;; + + adds BOFFSET = 8 * SIZE, BOFFSET + adds BOFFSET2 = 8 * SIZE, BOFFSET2 + ;; + STFD [BOFFSET] = f68, SIZE + STFD [BOFFSET2] = f70, SIZE + ;; + STFD [BOFFSET] = f76, SIZE + STFD [BOFFSET2] = f78, SIZE + ;; + STFD [BOFFSET] = f69, SIZE + STFD [BOFFSET2] = f71, SIZE + ;; + STFD [BOFFSET] = f77, - 11 * SIZE + STFD [BOFFSET2] = f79, - 11 * SIZE + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f66, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f74, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f73, - 3 * SIZE + STFD [BOFFSET2] = f75, - 3 * SIZE + ;; + adds C1 = -8 * SIZE, C1 + adds C2 = -8 * SIZE, C2 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [AOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [AOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f47, f48 = [AOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [AOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET] + adds AOFFSET = 7 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f19, f20 = [AOFFSET] + adds AOFFSET = 9 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + adds AOFFSET = -63 * SIZE, AOFFSET + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + ;; + FNMA f66 = f64, f34, f66 + FNMA f74 = f72, f34, f74 + ;; + FNMA f67 = f64, f35, f67 + FNMA f75 = f72, f35, f75 + ;; + FNMA f68 = f64, f36, f68 + FNMA f76 = f72, f36, f76 + ;; + FNMA f69 = f64, f37, f69 + FNMA f77 = f72, f37, f77 + ;; + FNMA f70 = f64, f38, f70 + FNMA f78 = f72, f38, f78 + ;; + FNMA f71 = f64, f39, f71 + FNMA f79 = f72, f39, f79 + ;; + FMPY f65 = f65, f40 + FMPY f73 = f73, f40 + ;; + FNMA f66 = f65, f41, f66 + FNMA f74 = f73, f41, f74 + ;; + FNMA f67 = f65, f42, f67 + FNMA f75 = f73, f42, f75 + ;; + FNMA f68 = f65, f43, f68 + FNMA f76 = f73, f43, f76 + ;; + FNMA f69 = f65, f44, f69 + FNMA f77 = f73, f44, f77 + ;; + FNMA f70 = f65, f45, f70 + FNMA f78 = f73, f45, f78 + ;; + FNMA f71 = f65, f46, f71 + FNMA f79 = f73, f46, f79 + ;; + FMPY f66 = f66, f47 + FMPY f74 = f74, f47 + ;; + FNMA f67 = f66, f48, f67 + FNMA f75 = f74, f48, f75 + ;; + FNMA f68 = f66, f49, f68 + FNMA f76 = f74, f49, f76 + ;; + FNMA f69 = f66, f50, f69 + FNMA f77 = f74, f50, f77 + ;; + FNMA f70 = f66, f51, f70 + FNMA f78 = f74, f51, f78 + ;; + FNMA f71 = f66, f52, f71 + FNMA f79 = f74, f52, f79 + ;; + FMPY f67 = f67, f53 + FMPY f75 = f75, f53 + ;; + FNMA f68 = f67, f54, f68 + FNMA f76 = f75, f54, f76 + ;; + FNMA f69 = f67, f55, f69 + FNMA f77 = f75, f55, f77 + ;; + FNMA f70 = f67, f56, f70 + FNMA f78 = f75, f56, f78 + ;; + FNMA f71 = f67, f57, f71 + FNMA f79 = f75, f57, f79 + ;; + FMPY f68 = f68, f58 + FMPY f76 = f76, f58 + ;; + FNMA f69 = f68, f59, f69 + FNMA f77 = f76, f59, f77 + ;; + FNMA f70 = f68, f60, f70 + FNMA f78 = f76, f60, f78 + ;; + FNMA f71 = f68, f61, f71 + FNMA f79 = f76, f61, f79 + ;; + FMPY f69 = f69, f16 + FMPY f77 = f77, f16 + ;; + FNMA f70 = f69, f17, f70 + FNMA f78 = f77, f17, f78 + ;; + FNMA f71 = f69, f18, f71 + FNMA f79 = f77, f18, f79 + ;; + FMPY f70 = f70, f19 + FMPY f78 = f78, f19 + ;; + FNMA f71 = f70, f20, f71 + FNMA f79 = f78, f20, f79 + ;; + FMPY f71 = f71, f21 + FMPY f79 = f79, f21 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f66, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f74, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f73, 5 * SIZE + STFD [BOFFSET2] = f75, 5 * SIZE + ;; + STFD [BOFFSET] = f68, SIZE + STFD [BOFFSET2] = f70, SIZE + ;; + STFD [BOFFSET] = f76, SIZE + STFD [BOFFSET2] = f78, SIZE + ;; + STFD [BOFFSET] = f69, SIZE + STFD [BOFFSET2] = f71, SIZE + ;; + STFD [BOFFSET] = f77, -11 * SIZE + STFD [BOFFSET2] = f79, -11 * SIZE + ;; + adds C9 = 4 * SIZE, C1 + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET], -3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f68 = f68, f32 + FMPY f65 = f65, f32 + FMPY f69 = f69, f32 + FMPY f66 = f66, f32 + FMPY f70 = f70, f32 + FMPY f67 = f67, f32 + FMPY f71 = f71, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f76 = f68, f33, f76 + FNMA f73 = f65, f33, f73 + FNMA f77 = f69, f33, f77 + FNMA f74 = f66, f33, f74 + FNMA f78 = f70, f33, f78 + FNMA f75 = f67, f33, f75 + FNMA f79 = f71, f33, f79 + ;; + FMPY f72 = f72, f34 + FMPY f76 = f76, f34 + FMPY f73 = f73, f34 + FMPY f77 = f77, f34 + FMPY f74 = f74, f34 + FMPY f78 = f78, f34 + FMPY f75 = f75, f34 + FMPY f79 = f79, f34 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, 5 * SIZE + STFD [AOFFSET2] = f71, 5 * SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f76, SIZE + ;; + STFD [AOFFSET] = f73, SIZE + STFD [AOFFSET2] = f77, SIZE + ;; + STFD [AOFFSET] = f74, SIZE + STFD [AOFFSET2] = f78, SIZE + ;; + STFD [AOFFSET] = f75, -11 * SIZE + STFD [AOFFSET2] = f79, -11 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 2 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET] + ;; + + FMPY f72 = f72, f32 + FMPY f76 = f76, f32 + FMPY f73 = f73, f32 + FMPY f77 = f77, f32 + FMPY f74 = f74, f32 + FMPY f78 = f78, f32 + FMPY f75 = f75, f32 + FMPY f79 = f79, f32 + ;; + FNMA f64 = f72, f33, f64 + FNMA f68 = f76, f33, f68 + FNMA f65 = f73, f33, f65 + FNMA f69 = f77, f33, f69 + FNMA f66 = f74, f33, f66 + FNMA f70 = f78, f33, f70 + FNMA f67 = f75, f33, f67 + FNMA f71 = f79, f33, f71 + ;; + FMPY f64 = f64, f34 + FMPY f68 = f68, f34 + FMPY f65 = f65, f34 + FMPY f69 = f69, f34 + FMPY f66 = f66, f34 + FMPY f70 = f70, f34 + FMPY f67 = f67, f34 + FMPY f71 = f71, f34 + ;; + adds AOFFSET = 8 * SIZE, AOFFSET + adds AOFFSET2 = 8 * SIZE, AOFFSET2 + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f76, SIZE + ;; + STFD [AOFFSET] = f73, SIZE + STFD [AOFFSET2] = f77, SIZE + ;; + STFD [AOFFSET] = f74, SIZE + STFD [AOFFSET2] = f78, SIZE + ;; + STFD [AOFFSET] = f75, - 11 * SIZE + STFD [AOFFSET2] = f79, - 11 * SIZE + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, - 3 * SIZE + STFD [AOFFSET2] = f71, - 3 * SIZE + ;; + +#endif + adds C9 = 4 * SIZE, C1 + ;; + + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + adds C10 = 4 * SIZE, C2 + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C1 ] = f67, 5 * SIZE +#else + STFD [C1 ] = f67, - 3 * SIZE +#endif + STFD [C9 ] = f71 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + STFD [C10] = f76, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + STFD [C10] = f77, SIZE + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + STFD [C10] = f78, SIZE + adds C12 = 4 * SIZE, C4 + } + ;; + { .mmi +#ifndef LN + STFD [C2 ] = f75, 5 * SIZE +#else + STFD [C2 ] = f75, - 3 * SIZE +#endif + STFD [C10] = f79 + } + ;; + { .mmf + cmp.ne p6, p0 = 1, I + } + ;; + adds I = -1, I + ;; + { .mmi + shladd r2 = K, BASE_SHIFT, r0 + } + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi +#ifdef RT + shladd AORIG = r2, 3, AORIG +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 3, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifdef LT + adds KK = 8, KK +#elif defined LN + adds KK = -8, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + + mov f64 = f0 + mov f65 = f0 + mov f66 = f0 + mov f67 = f0 + mov f72 = f0 + mov f73 = f0 + mov f74 = f0 + mov f75 = f0 + + (p6) br.cond.dptk .L092 + ;; + .align 8 + +.L129: +#ifdef LN + shladd KK8 = K, BASE_SHIFT, r0 + ;; + shladd B = KK8, 1, B +#endif + +#if defined(LT) || defined(RN) + mov B = BOFFSET +#endif + +#ifdef RN + adds KK = 2, KK +#endif + +#ifdef RT + adds KK = -2, KK +#endif + ;; + mov AOFFSET = A + ;; + .align 16 + +.L130: + tbit.z p6, p0 = N, 0 + (p6) br.cond.dpnt .L999 + ;; + +#ifdef RT + { .mmi + nop __LINE__ + shl r2 = K, BASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, LDC + nop __LINE__ + } +#endif + ;; + mov f64 = f0 + mov f65 = f0 + mov f66 = f0 + mov f67 = f0 + + mov f68 = f0 + mov f69 = f0 + mov f70 = f0 + mov f71 = f0 + ;; + + { .mfi + mov C1 = C // coffset1 = c + 0 * ldc +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + } + ;; + { .mfi +#ifndef RT + add C = C, LDC // coffset += 8 * ldc +#else + nop __LINE__ +#endif +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + +.L160: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L150 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 0 + BASE_SHIFT + } + ;; + shladd r3 = KK, BASE_SHIFT, r0 + ;; +#if defined(LT) || defined(RN) + { .mmi + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ + adds L = 1, L + } + ;; +#else + { .mmi + shladd BOFFSET = KK, BASE_SHIFT, B + nop __LINE__ +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mmi + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + adds L = 1, L + add AOFFSET = r3, AORIG + } + ;; +#endif + ;; + { .mii + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + cmp.eq p6, p0 = 0, L + adds L = -1, L + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mib + (p7) LDFD f32 = [AOFFSET], 1 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L168 + } + ;; + .align 8 + +.L162: + { .mmf + cmp.ne p4, p5 = 0, L + (p12) cmp.ne p3, p0 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + } + ;; + { .mmi + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + nop __LINE__ + adds L = -1, L + } + { .mfb + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + br.cloop.sptk.few .L162 + } + ;; + .align 8 + +.L168: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + add BOFFSET = r2, B + ;; +#endif + +#if defined(LN) || defined(LT) + { .mmi + LDFD f32 = [BOFFSET] + LDFD f33 = [AOFFSET] +#ifdef LN + adds C1 = -1 * SIZE, C1 +#else + nop __LINE__ +#endif + } + ;; +#else + { .mmi + LDFD f32 = [AOFFSET] + LDFD f33 = [BOFFSET] + nop __LINE__ + } + ;; +#endif + + { .mmf + sub L = K, KK +#ifdef RT + shladd AORIG = K, BASE_SHIFT, AORIG +#else + nop __LINE__ +#endif + FSUB f64 = f32, f64 + } + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + FMPY f64 = f64, f33 + ;; +#if defined(LN) || defined(LT) + { .mmf + STFD [BOFFSET] = f64 +#ifndef LN + STFD [C1 ] = f64, SIZE +#else + STFD [C1 ] = f64 +#endif + mov f64 = f0 + } + ;; +#else + { .mmf + STFD [AOFFSET] = f64 + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; +#endif + +#if defined(LT) || defined(RN) + shladd AOFFSET = L, BASE_SHIFT, AOFFSET +#else + nop __LINE__ +#endif +#if defined(LT) || defined(RN) + shladd BOFFSET = L, BASE_SHIFT, BOFFSET +#else + nop __LINE__ +#endif + ;; + .align 8 + +.L150: + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L140 + ;; + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + BASE_SHIFT + } + ;; + shladd r3 = KK, BASE_SHIFT, r0 + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = KK, BASE_SHIFT, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + { .mfi + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mmf + adds L = -1, L + } + ;; + { .mmf + cmp.eq p6, p0 = -1, L + } + ;; + (p7) LDFD f32 = [AOFFSET], SIZE + ;; + (p7) LDFD f33 = [AOFFSET], SIZE + ;; + ;; + { .mib + mov ar.lc = L + (p6) br.cond.dpnt .L158 + } + ;; + +.L152: + { .mfi + cmp.ne p4, p5 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + ;; + { .mfi + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + adds L = -1, L + } + ;; + { .mfb + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + br.cloop.sptk.few .L152 + } + ;; + +.L158: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + add BOFFSET = r2, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET] + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + ;; +#else + LDFPD f32, f33 = [AOFFSET] + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + ;; +#endif + +#ifdef LN + adds AOFFSET = 2 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET] + ;; + FMPY f65 = f65, f32 + ;; + FNMA f64 = f65, f33, f64 + ;; + FMPY f64 = f64, f34 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, - SIZE + ;; + adds C1 = -2 * SIZE, C1 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET], - 3 * SIZE + ;; + FMPY f64 = f64, f32 + ;; + FNMA f65 = f64, f33, f65 + ;; + FMPY f65 = f65, f34 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, -SIZE + ;; +#endif + +#ifdef RN + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, - SIZE + ;; +#endif + +#ifdef RT + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, - SIZE + ;; +#endif + STFD [C1 ] = f64, SIZE + ;; +#ifndef LN + STFD [C1 ] = f65, SIZE +#else + STFD [C1 ] = f65, -SIZE +#endif + ;; + mov f64 = f0 + mov f65 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#else + nop __LINE__ +#endif + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 1, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + add BOFFSET = L, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + .align 8 + +.L140: + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L131 + ;; + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + BASE_SHIFT + } + ;; + shladd r3 = KK, BASE_SHIFT, r0 + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + mov f65 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = KK, BASE_SHIFT, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + { .mfi + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + cmp.eq p6, p0 = -1, L + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + } + { .mfi + mov ar.lc = L + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + { .mfb + (p6) br.cond.dpnt .L148 + } + ;; + +.L142: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f65 = f33, f48, f65 // A2 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mmf + nop __LINE__ + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + } + ;; + { .mfi + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + (p5) adds C10 = 2 * SIZE, C2 + } + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mmf + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + { .mfb + nop __LINE__ + nop.f 0 + br.cloop.sptk.few .L142 + } + ;; + +.L148: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + add BOFFSET = r2, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + ;; +#endif + +#ifdef LN + adds AOFFSET = 14 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f35, f34 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], - 2 * SIZE + ;; + LDFPD f38, f37 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f40, f39 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET] + ;; + FMPY f67 = f67, f32 + ;; + FNMA f66 = f67, f33, f66 + ;; + FNMA f65 = f67, f34, f65 + ;; + FNMA f64 = f67, f35, f64 + ;; + FMPY f66 = f66, f36 + ;; + FNMA f65 = f66, f37, f65 + ;; + FNMA f64 = f66, f38, f64 + ;; + FMPY f65 = f65, f39 + ;; + FNMA f64 = f65, f40, f64 + ;; + FMPY f64 = f64, f41 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f66, SIZE + ;; + STFD [BOFFSET] = f67, -3 * SIZE + ;; + adds C1 = -4 * SIZE, C1 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f39, f40 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + ;; + FNMA f65 = f64, f33, f65 + ;; + FNMA f66 = f64, f34, f66 + ;; + FNMA f67 = f64, f35, f67 + ;; + FMPY f65 = f65, f36 + ;; + FNMA f66 = f65, f37, f66 + ;; + FNMA f67 = f65, f38, f67 + ;; + FMPY f66 = f66, f39 + ;; + FNMA f67 = f66, f40, f67 + ;; + FMPY f67 = f67, f41 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f66, SIZE + ;; + STFD [BOFFSET] = f67, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + FMPY f66 = f66, f32 + FMPY f67 = f67, f32 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + ;; + STFD [AOFFSET] = f67, -3 * SIZE + ;; +#endif + +#ifdef RT + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + FMPY f66 = f66, f32 + FMPY f67 = f67, f32 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + ;; + STFD [AOFFSET] = f67, - 3 * SIZE + ;; +#endif + { .mmf + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C1 ] = f67, SIZE +#else + STFD [C1 ] = f67, - 3 * SIZE +#endif + } + ;; + { .mmf + mov f72 = f0 + } + ;; + mov f65 = f0 + mov f73 = f0 + mov f66 = f0 + mov f74 = f0 + mov f67 = f0 + mov f75 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi +#ifdef RT + shladd AORIG = r2, 2, AORIG +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 2, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + add BOFFSET = L, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifdef LT + adds KK = 4, KK +#elif defined LN + adds KK = -4, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + .align 8 + +.L131: +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + shr I = M, 3 + ;; + cmp.eq p6, p7 = 0, I + (p6) br.cond.dpnt .L169 + ;; + .align 16 + +.L132: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 3 + BASE_SHIFT + } + ;; + shladd r3 = KK, BASE_SHIFT, r0 + ;; +#if defined(LT) || defined(RN) + { .mmi + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; +#else + { .mfi + shladd BOFFSET = KK, BASE_SHIFT, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + shladd AOFFSET = r3, 3, AORIG + } + ;; +#endif + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + ;; + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + } + { .mfi + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmf + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + } + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mmf + CPREFETCH [PREC] + } + { .mfi + adds L = 1, L + } + ;; + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mfi + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + mov ar.lc = L + } + ;; + mov f64 = f0 + mov f65 = f0 + mov f66 = f0 + mov f67 = f0 + + mov f68 = f0 + mov f69 = f0 + mov f70 = f0 + mov f71 = f0 + ;; + + { .mfb + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L138 + } + ;; + .align 16 + +.L133: + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f65 = f33, f48, f65 // A2 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + adds C9 = 4 * SIZE, C1 + } + { .mmf + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mmf + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + nop __LINE__ + br.cloop.sptk.few .L133 + } + ;; + +.L138: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -8, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 3, AORIG + add BOFFSET = r2, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + + FSUB f68 = f36, f68 + FSUB f69 = f37, f69 + FSUB f70 = f38, f70 + FSUB f71 = f39, f71 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + FSUB f68 = f36, f68 + FSUB f69 = f37, f69 + FSUB f70 = f38, f70 + FSUB f71 = f39, f71 + ;; +#endif + +#ifdef LN + adds AOFFSET = 62 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f35, f34 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f37, f36 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f39, f38 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], -2 * SIZE + ;; + LDFPD f42, f41 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f44, f43 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f46, f45 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f48, f47 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f50, f49 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f52, f51 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], -2 * SIZE + ;; + LDFPD f55, f54 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f57, f56 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFPD f59, f58 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f61, f60 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], -2 * SIZE + ;; + LDFPD f18, f17 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFPD f20, f19 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + ;; + FMPY f71 = f71, f32 + ;; + FNMA f70 = f71, f33, f70 + ;; + FNMA f69 = f71, f34, f69 + ;; + FNMA f68 = f71, f35, f68 + ;; + FNMA f67 = f71, f36, f67 + ;; + FNMA f66 = f71, f37, f66 + ;; + FNMA f65 = f71, f38, f65 + ;; + FNMA f64 = f71, f39, f64 + ;; + FMPY f70 = f70, f40 + ;; + FNMA f69 = f70, f41, f69 + ;; + FNMA f68 = f70, f42, f68 + ;; + FNMA f67 = f70, f43, f67 + ;; + FNMA f66 = f70, f44, f66 + ;; + FNMA f65 = f70, f45, f65 + ;; + FNMA f64 = f70, f46, f64 + ;; + FMPY f69 = f69, f47 + ;; + FNMA f68 = f69, f48, f68 + ;; + FNMA f67 = f69, f49, f67 + ;; + FNMA f66 = f69, f50, f66 + ;; + FNMA f65 = f69, f51, f65 + ;; + FNMA f64 = f69, f52, f64 + ;; + FMPY f68 = f68, f53 + ;; + FNMA f67 = f68, f54, f67 + ;; + FNMA f66 = f68, f55, f66 + ;; + FNMA f65 = f68, f56, f65 + ;; + FNMA f64 = f68, f57, f64 + ;; + FMPY f67 = f67, f58 + ;; + FNMA f66 = f67, f59, f66 + ;; + FNMA f65 = f67, f60, f65 + ;; + FNMA f64 = f67, f61, f64 + ;; + FMPY f66 = f66, f16 + ;; + FNMA f65 = f66, f17, f65 + ;; + FNMA f64 = f66, f18, f64 + ;; + FMPY f65 = f65, f19 + ;; + FNMA f64 = f65, f20, f64 + ;; + FMPY f64 = f64, f21 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f68, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f69, SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f70, SIZE + ;; + STFD [BOFFSET] = f67, - 3 * SIZE + STFD [BOFFSET2] = f71, - 3 * SIZE + ;; + adds C1 = -8 * SIZE, C1 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [AOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [AOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f47, f48 = [AOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [AOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET] + adds AOFFSET = 7 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f19, f20 = [AOFFSET] + adds AOFFSET = 9 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + adds AOFFSET = -63 * SIZE, AOFFSET + ;; + FMPY f64 = f64, f32 + ;; + FNMA f65 = f64, f33, f65 + ;; + FNMA f66 = f64, f34, f66 + ;; + FNMA f67 = f64, f35, f67 + ;; + FNMA f68 = f64, f36, f68 + ;; + FNMA f69 = f64, f37, f69 + ;; + FNMA f70 = f64, f38, f70 + ;; + FNMA f71 = f64, f39, f71 + ;; + FMPY f65 = f65, f40 + ;; + FNMA f66 = f65, f41, f66 + ;; + FNMA f67 = f65, f42, f67 + ;; + FNMA f68 = f65, f43, f68 + ;; + FNMA f69 = f65, f44, f69 + ;; + FNMA f70 = f65, f45, f70 + ;; + FNMA f71 = f65, f46, f71 + ;; + FMPY f66 = f66, f47 + ;; + FNMA f67 = f66, f48, f67 + ;; + FNMA f68 = f66, f49, f68 + ;; + FNMA f69 = f66, f50, f69 + ;; + FNMA f70 = f66, f51, f70 + ;; + FNMA f71 = f66, f52, f71 + ;; + FMPY f67 = f67, f53 + ;; + FNMA f68 = f67, f54, f68 + ;; + FNMA f69 = f67, f55, f69 + ;; + FNMA f70 = f67, f56, f70 + ;; + FNMA f71 = f67, f57, f71 + ;; + FMPY f68 = f68, f58 + ;; + FNMA f69 = f68, f59, f69 + ;; + FNMA f70 = f68, f60, f70 + ;; + FNMA f71 = f68, f61, f71 + ;; + FMPY f69 = f69, f16 + ;; + FNMA f70 = f69, f17, f70 + ;; + FNMA f71 = f69, f18, f71 + ;; + FMPY f70 = f70, f19 + ;; + FNMA f71 = f70, f20, f71 + ;; + FMPY f71 = f71, f21 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f68, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f69, SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f70, SIZE + ;; + STFD [BOFFSET] = f67, -3 * SIZE + STFD [BOFFSET2] = f71, -3 * SIZE + ;; + adds C9 = 4 * SIZE, C1 + ;; +#endif + +#ifdef RN + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f68 = f68, f32 + FMPY f65 = f65, f32 + FMPY f69 = f69, f32 + FMPY f66 = f66, f32 + FMPY f70 = f70, f32 + FMPY f67 = f67, f32 + FMPY f71 = f71, f32 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, -3 * SIZE + STFD [AOFFSET2] = f71, -3 * SIZE + ;; +#endif + +#ifdef RT + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f68 = f68, f32 + FMPY f65 = f65, f32 + FMPY f69 = f69, f32 + FMPY f66 = f66, f32 + FMPY f70 = f70, f32 + FMPY f67 = f67, f32 + FMPY f71 = f71, f32 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, -3 * SIZE + STFD [AOFFSET2] = f71, -3 * SIZE + ;; +#endif + adds C9 = 4 * SIZE, C1 + ;; + + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C1 ] = f67, 5 * SIZE +#else + STFD [C1 ] = f67, - 3 * SIZE +#endif + STFD [C9 ] = f71 + } + ;; + { .mmf + cmp.ne p6, p0 = 1, I + } + ;; + adds I = -1, I + ;; + { .mmi + shladd r2 = K, BASE_SHIFT, r0 + } + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi +#ifdef RT + shladd AORIG = r2, 3, AORIG +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 3, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + add BOFFSET = L, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifdef LT + adds KK = 8, KK +#elif defined LN + adds KK = -8, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + + mov f64 = f0 + mov f65 = f0 + mov f66 = f0 + mov f67 = f0 + mov f68 = f0 + mov f69 = f0 + mov f70 = f0 + mov f71 = f0 + + (p6) br.cond.dptk .L132 + .align 8 + + +.L169: + { .mii +#ifdef LN + shladd B = K, BASE_SHIFT, B +#elif defined(LT) || defined(RN) + mov B = BOFFSET +#else + nop __LINE__ +#endif + +#ifdef RN + adds KK = 1, KK +#elif defined RT + adds KK = -1, KK +#else + nop __LINE__ +#endif + mov AOFFSET = A + } + ;; + .align 16 + + +.L999: + mov r8 = r0 + adds r9 = 1 * 16, SP + ;; + ldf.fill f16 = [SP], 32 + ldf.fill f17 = [r9], 32 + ;; + ldf.fill f18 = [SP], 32 + ldf.fill f19 = [r9], 32 + ;; + ldf.fill f20 = [SP], 32 + ldf.fill f21 = [r9], 32 + ;; + mov ar.lc = ARLC + ;; + mov pr = PR, -1 + ;; + mov ar.pfs = ARPFS + ;; + br.ret.sptk.many b0 + EPILOGUE diff --git a/kernel/ia64/trsm_kernel_LT.S b/kernel/ia64/trsm_kernel_LT.S new file mode 100644 index 0000000..eef4e00 --- /dev/null +++ b/kernel/ia64/trsm_kernel_LT.S @@ -0,0 +1,11027 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef DOUBLE +#define PREFETCHSIZE (16 * 8) +#else +#define PREFETCHSIZE (32 * 8) +#endif + +#define CPREFETCHSIZE 7 +#define CPREFETCH lfetch.excl.nt1 + +#define M r32 +#define N r33 +#define K r34 +#define A r36 +#define B r37 +#define C r38 +#define LDC r39 + +#define I r15 +#define J r16 +#define AOFFSET r17 +#define BOFFSET r18 +#define TEMP r19 +#define L r20 + +#define C1 r21 +#define C2 r22 +#define C3 r23 +#define C4 r24 +#define C5 r25 +#define C6 r26 +#define C7 r27 +#define C8 r28 + +#define C9 loc0 +#define C10 loc1 +#define C11 loc2 +#define C12 loc3 +#define C13 loc4 +#define C14 loc5 +#define C15 loc6 +#define C16 loc7 + +#define PREA r8 +#define PREB r9 +#define PREC r10 +#define SP r12 +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#define ALPHA f8 + +#define AORIG loc8 +#define KK loc9 +#define KK8 loc10 +#define OFFSET loc11 +#define AOFFSET2 loc12 +#define BOFFSET2 loc13 + + + PROLOGUE + .prologue + PROFCODE + + { .mmi + .save ar.pfs, ARPFS + alloc ARPFS = ar.pfs, 8, 16, 0, 0 + adds r14 = 16, SP + mov ARLC = ar.lc + } + { .mmi + adds r8 = -6 * 16, SP + adds r9 = -5 * 16, SP + adds SP = -6 * 16, SP + } + ;; + { .mmi + ld8 OFFSET = [r14] + mov AOFFSET = A + mov PR = pr + } + ;; + { .mmi + stf.spill [r8] = f16, 32 + stf.spill [r9] = f17, 32 + shr J = N, 3 + } + ;; + { .mmi + stf.spill [r8] = f18, 32 + stf.spill [r9] = f19, 32 + shladd LDC = LDC, BASE_SHIFT, r0 + } + ;; + .body + { .mmi + stf.spill [r8] = f20 + stf.spill [r9] = f21 + cmp.ge p6, p0 = 0, J + } + { .mib + nop __LINE__ +#ifdef RN + sub KK = r0, OFFSET +#else + nop __LINE__ +#endif + (p6) br.cond.dpnt .L050 + } + ;; + .align 8 + +.L010: + { .mfi + adds J = -1, J + mov f64 = f0 + shr I = M, 3 + } + { .mfi + mov C1 = C // coffset1 = c + 0 * ldc + mov f72 = f0 +#ifdef LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + cmp.eq p6, p7 = 0, I + mov AOFFSET = A + mov f80 = f0 + } + { .mmf + add C2 = LDC, C // coffset2 = c + 1 * ldc + shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc + mov f88 = f0 + } + ;; + { .mmf + shladd C5 = LDC, 2, C // coffset5 = c + 4 * ldc + shladd C = LDC, 3, C // coffset += 8 * ldc + mov f96 = f0 + } + { .mmf + shladd C4 = LDC, 1, C2 + shladd C6 = LDC, 2, C2 + mov f104 = f0 + } + ;; + { .mfi + shladd C7 = LDC, 2, C3 + mov f112 = f0 + mov L = KK + }{ .mfb + shladd C8 = LDC, 2, C4 + mov f120 = f0 + (p6) br.cond.dpnt .L020 + } + ;; + .align 16 + +.L011: + { .mmf + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + mov f65 = f0 + } + ;; + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + setf.d f73 = r0 + mov f81 = f0 + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + setf.d f119 = r0 + mov f89 = f0 + } + { .mmf + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + setf.d f97 = r0 + mov f105 = f0 + } + ;; + { .mmf + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + setf.d f113 = r0 + mov f121 = f0 + } + ;; + { .mmf + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + setf.d f66 = r0 + mov f74 = f0 + } + { .mfi + setf.d f82 = r0 + mov f90 = f0 + nop __LINE__ + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + setf.d f98 = r0 + mov f106 = f0 + } + { .mfi + setf.d f114 = r0 + mov f122 = f0 + adds L = 1, L + } + ;; + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + setf.d f67 = r0 + mov f75 = f0 + } + { .mfi + setf.d f83 = r0 + mov f91 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmf + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + setf.d f99 = r0 + mov f107 = f0 + } + { .mfi + setf.d f115 = r0 + mov f123 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f68 = r0 + mov f76 = f0 + } + { .mfi + setf.d f84 = r0 + mov f92 = f0 + adds AOFFSET2 = 4 * SIZE, AOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f100 = r0 + mov f108 = f0 + } + { .mfi + setf.d f116 = r0 + mov f124 = f0 + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f69 = r0 + mov f77 = f0 + } + { .mfi + setf.d f85 = r0 + mov f93 = f0 + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f101 = r0 + mov f109 = f0 + } + { .mfi + setf.d f117 = r0 + mov f125 = f0 + tbit.z p12, p0 = L, 0 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f70 = r0 + mov f78 = f0 + } + { .mfi + setf.d f86 = r0 + mov f94 = f0 + shr L = L, 1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f102 = r0 + mov f110 = f0 + } + { .mfi + setf.d f118 = r0 + mov f126 = f0 + adds L = -1, L + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f71 = r0 + mov f79 = f0 + } + { .mfi + setf.d f87 = r0 + mov f95 = f0 + mov ar.lc = L + } + ;; + { .mmf + CPREFETCH [PREC] + setf.d f103 = r0 + mov f111 = f0 + } + { .mfb + cmp.eq p6, p0 = -1, L + mov f127 = f0 + (p6) br.cond.dpnt .L018 + } + ;; + .align 16 + +.L012: +/* 1 */ + { .mfi + lfetch.fault.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; +/* 2 */ + { .mfb + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + cmp.ne p4, p5 = 0, L + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; +/* 3 */ + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + adds C9 = 4 * SIZE, C1 + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; +/* 4 */ + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + adds C10 = 4 * SIZE, C2 + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; +/* 5 */ + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + adds C11 = 4 * SIZE, C3 + FMA f73 = f33, f49, f73 // A2 * B2 + nop __LINE__ + } + ;; +/* 6 */ + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + adds C12 = 4 * SIZE, C4 + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; +/* 7 */ + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + adds C13 = 4 * SIZE, C5 + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; +/* 8 */ + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + adds C14 = 4 * SIZE, C6 + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; +/* 9 */ + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + adds C15 = 4 * SIZE, C7 + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; +/* 10 */ + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + adds C16 = 4 * SIZE, C8 + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; +/* 11 */ + { .mfb + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f106 = f34, f53, f106 // A3 * B6 + nop __LINE__ + } + ;; +/* 12 */ + { .mfb + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f122 = f34, f55, f122 // A3 * B8 + nop __LINE__ + } + ;; +/* 13 */ + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; +/* 14 */ + { .mfb + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; +/* 15 */ + { .mfb + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f107 = f35, f53, f107 // A4 * B6 + nop __LINE__ + } + ;; +/* 16 */ + { .mfb + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f123 = f35, f55, f123 // A4 * B8 + nop __LINE__ + } + ;; +/* 17 */ + { .mfb + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; +/* 18 */ + { .mfb + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f92 = f36, f51, f92 // A5 * B4 + nop __LINE__ + } + ;; +/* 19 */ + { .mfb + nop __LINE__ + FMA f100 = f36, f52, f100 // A5 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f108 = f36, f53, f108 // A5 * B6 + nop __LINE__ + } + ;; +/* 20 */ + { .mfb + nop __LINE__ + FMA f116 = f36, f54, f116 // A5 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f124 = f36, f55, f124 // A5 * B8 + nop __LINE__ + } + ;; +/* 21 */ + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; +/* 22 */ + { .mfb + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f93 = f37, f51, f93 // A6 * B4 + nop __LINE__ + } + ;; +/* 23 */ + { .mfb + nop __LINE__ + FMA f101 = f37, f52, f101 // A6 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f109 = f37, f53, f109 // A6 * B6 + nop __LINE__ + } + ;; +/* 24 */ + { .mfb + nop __LINE__ + FMA f117 = f37, f54, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f125 = f37, f55, f125 // A6 * B8 + nop __LINE__ + } + ;; +/* 25 */ + { .mfb + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; +/* 26 */ + { .mfb + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f94 = f38, f51, f94 // A7 * B4 + nop __LINE__ + } + ;; +/* 27 */ + { .mfb + nop __LINE__ + FMA f102 = f38, f52, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f110 = f38, f53, f110 // A7 * B6 + nop __LINE__ + } + ;; +/* 28 */ + { .mfb + nop __LINE__ + FMA f118 = f38, f54, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f126 = f38, f55, f126 // A7 * B8 + nop __LINE__ + } + ;; +/* 29 */ + { .mfb + nop __LINE__ + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; +/* 30 */ + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f95 = f39, f51, f95 // A8 * B4 + nop __LINE__ + } + ;; +/* 31 */ + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f103 = f39, f52, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f111 = f39, f53, f111 // A8 * B6 + nop __LINE__ + } + ;; +/* 32 */ + { .mfb + nop __LINE__ + FMA f119 = f39, f54, f119 // A8 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f127 = f39, f55, f127 // A8 * B8 + nop __LINE__ + } + ;; +/* 33 */ + { .mfb + nop __LINE__ + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; +/* 34 */ + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; +/* 35 */ + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; +/* 36 */ + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; +/* 37 */ + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; +/* 38 */ + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; +/* 39 */ + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; +/* 40 */ + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + nop __LINE__ + } + ;; + /* 41 */ + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; +/* 42 */ + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; +/* 43 */ + { .mfb + nop __LINE__ + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f106 = f42, f61, f106 // A3 * B6 + nop __LINE__ + } + ;; +/* 44 */ + { .mfb + nop __LINE__ + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f122 = f42, f63, f122 // A3 * B8 + nop __LINE__ + } + ;; +/* 45 */ + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; +/* 46 */ + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; +/* 47 */ + { .mfb + nop __LINE__ + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f107 = f43, f61, f107 // A4 * B6 + nop __LINE__ + } + ;; +/* 48 */ + { .mfb + nop __LINE__ + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f123 = f43, f63, f123 // A4 * B8 + nop __LINE__ + } + ;; +/* 49 */ + { .mfb + nop __LINE__ + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; +/* 50 */ + { .mfb + nop __LINE__ + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f92 = f44, f59, f92 // A5 * B4 + nop __LINE__ + } + ;; +/* 51 */ + { .mfb + nop __LINE__ + (p3) FMA f100 = f44, f60, f100 // A5 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f108 = f44, f61, f108 // A5 * B6 + nop __LINE__ + } + ;; +/* 52 */ + { .mfb + nop __LINE__ + (p3) FMA f116 = f44, f62, f116 // A5 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f124 = f44, f63, f124 // A5 * B8 + nop __LINE__ + } + ;; +/* 53 */ + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; +/* 54 */ + { .mfb + nop __LINE__ + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f93 = f45, f59, f93 // A6 * B4 + nop __LINE__ + } + ;; +/* 55 */ + { .mfb + nop __LINE__ + (p3) FMA f101 = f45, f60, f101 // A6 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f109 = f45, f61, f109 // A6 * B6 + nop __LINE__ + } + ;; +/* 56 */ + { .mfb + nop __LINE__ + (p3) FMA f117 = f45, f62, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f125 = f45, f63, f125 // A6 * B8 + nop __LINE__ + } + ;; +/* 57 */ + { .mfb + nop __LINE__ + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; +/* 58 */ + { .mfb + nop __LINE__ + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f94 = f46, f59, f94 // A7 * B4 + nop __LINE__ + } + ;; +/* 59 */ + { .mfb + nop __LINE__ + (p3) FMA f102 = f46, f60, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f110 = f46, f61, f110 // A7 * B6 + nop __LINE__ + } + ;; +/* 60 */ + { .mfb + nop __LINE__ + (p3) FMA f118 = f46, f62, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f126 = f46, f63, f126 // A7 * B8 + nop __LINE__ + } + ;; +/* 61 */ + { .mfb + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + nop __LINE__ + } + ;; +/* 62 */ + { .mfb + nop __LINE__ + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f95 = f47, f59, f95 // A8 * B4 + nop __LINE__ + } + ;; +/* 63 */ + { .mfb + nop __LINE__ + (p3) FMA f103 = f47, f60, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f111 = f47, f61, f111 // A8 * B6 + nop __LINE__ + } + ;; +/* 64 */ + { .mfi + nop __LINE__ + (p3) FMA f119 = f47, f62, f119 // A8 * B7 + adds L = -1, L + } + { .mfb + adds AOFFSET2 = 4 * SIZE, AOFFSET + (p3) FMA f127 = f47, f63, f127 // A8 * B8 + br.cloop.sptk.few .L012 + } + ;; + +.L018: +#ifdef LT + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FSUB f64 = f32, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f72 = f33, f72 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + FSUB f80 = f34, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f88 = f35, f88 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + FSUB f96 = f36, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f104 = f37, f104 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + FSUB f112 = f38, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f120 = f39, f120 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FSUB f65 = f40, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f73 = f41, f73 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FSUB f81 = f42, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f89 = f43, f89 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FSUB f97 = f44, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f105 = f45, f105 + nop __LINE__ + } + ;; + { .mfi + LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FSUB f113 = f46, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f121 = f47, f121 + nop __LINE__ + } + ;; + { .mfi + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + FSUB f66 = f48, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f74 = f49, f74 + nop __LINE__ + } + ;; + { .mfi + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + FSUB f82 = f50, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f90 = f51, f90 + nop __LINE__ + } + ;; + { .mfi + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + FSUB f98 = f52, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f106 = f53, f106 + nop __LINE__ + } + ;; + { .mfi + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + FSUB f114 = f54, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f122 = f55, f122 + nop __LINE__ + } + ;; + { .mfi + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + FSUB f67 = f56, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f75 = f57, f75 + nop __LINE__ + } + ;; + { .mfi + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + FSUB f83 = f58, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f91 = f59, f91 + nop __LINE__ + } + ;; + { .mfi + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + FSUB f99 = f60, f99 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f107 = f61, f107 + nop __LINE__ + } + ;; + { .mfi + LDFPD f46, f47 = [BOFFSET], 2 * SIZE + FSUB f115 = f62, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f123 = f63, f123 + nop __LINE__ + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FSUB f68 = f32, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f76 = f33, f76 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + FSUB f84 = f34, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f92 = f35, f92 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + FSUB f100 = f36, f100 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f108 = f37, f108 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + FSUB f116 = f38, f116 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f124 = f39, f124 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FSUB f69 = f40, f69 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f77 = f41, f77 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FSUB f85 = f42, f85 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f93 = f43, f93 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FSUB f101 = f44, f101 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f109 = f45, f109 + nop __LINE__ + } + ;; + { .mfi + LDFPD f62, f63 = [BOFFSET] + FSUB f117 = f46, f117 + adds BOFFSET = -62 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FSUB f125 = f47, f125 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f70 = f48, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f78 = f49, f78 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f86 = f50, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f94 = f51, f94 + nop __LINE__ + } + ;; + { .mfi + LDFPD f32, f33 = [AOFFSET] + FSUB f102 = f52, f102 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f110 = f53, f110 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f118 = f54, f118 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f126 = f55, f126 + adds AOFFSET = 2 * SIZE, AOFFSET + } + ;; + { .mfi + nop __LINE__ + FSUB f71 = f56, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f79 = f57, f79 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f87 = f58, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f95 = f59, f95 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f103 = f60, f103 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f111 = f61, f111 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f119 = f62, f119 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f127 = f63, f127 + nop __LINE__ + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + FMPY f64 = f64, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f96 = f96, f32 + adds BOFFSET2 = 4 * SIZE, BOFFSET + } + ;; + { .mfi + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + FMPY f72 = f72, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f104 = f104, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f38, f39 = [AOFFSET] + FMPY f80 = f80, f32 + adds AOFFSET = 3 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f112 = f112, f32 + nop __LINE__ + } + ;; + { .mfi + LDFD f40 = [AOFFSET], 1 * SIZE + FMPY f88 = f88, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f120 = f120, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f41, f42 = [AOFFSET], 2 * SIZE + FNMA f65 = f64, f33, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f97 = f96, f33, f97 + nop __LINE__ + } + ;; + { .mfi + LDFPD f43, f44 = [AOFFSET], 2 * SIZE + FNMA f73 = f72, f33, f73 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f105 = f104, f33, f105 + nop __LINE__ + } + ;; + { .mfi + LDFPD f45, f46 = [AOFFSET] + FNMA f81 = f80, f33, f81 + adds AOFFSET = 4 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f113 = f112, f33, f113 + nop __LINE__ + } + ;; + { .mfi + LDFPD f47, f48 = [AOFFSET], 2 * SIZE + FNMA f89 = f88, f33, f89 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f121 = f120, f33, f121 + nop __LINE__ + } + ;; + { .mfi + LDFPD f49, f50 = [AOFFSET], 2 * SIZE + FNMA f66 = f64, f34, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f98 = f96, f34, f98 + nop __LINE__ + } + ;; + { .mfi + LDFPD f51, f52 = [AOFFSET] + FNMA f74 = f72, f34, f74 + adds AOFFSET = 5 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f106 = f104, f34, f106 + nop __LINE__ + } + ;; + { .mfi + LDFD f53 = [AOFFSET], 1 * SIZE + FNMA f82 = f80, f34, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f114 = f112, f34, f114 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + FNMA f90 = f88, f34, f90 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f122 = f120, f34, f122 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [AOFFSET] + FNMA f67 = f64, f35, f67 + adds AOFFSET = 6 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f99 = f96, f35, f99 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + FNMA f75 = f72, f35, f75 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f107 = f104, f35, f107 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [AOFFSET] + FNMA f83 = f80, f35, f83 + adds AOFFSET = 7 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f115 = f112, f35, f115 + nop __LINE__ + } + ;; + { .mfi + LDFD f16 = [AOFFSET], 1 * SIZE + FNMA f91 = f88, f35, f91 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f123 = f120, f35, f123 + nop __LINE__ + } + ;; + { .mfi + LDFPD f17, f18 = [AOFFSET] + FNMA f68 = f64, f36, f68 + adds AOFFSET = 8 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f100 = f96, f36, f100 + nop __LINE__ + } + ;; + { .mfi + LDFPD f19, f20 = [AOFFSET] + FNMA f76 = f72, f36, f76 + adds AOFFSET = 9 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f108 = f104, f36, f108 + nop __LINE__ + } + ;; + { .mfi + LDFD f21 = [AOFFSET] + FNMA f84 = f80, f36, f84 + adds AOFFSET = -63 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f116 = f112, f36, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f92 = f88, f36, f92 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f124 = f120, f36, f124 + nop __LINE__ + } + ;; + FNMA f69 = f64, f37, f69 + FNMA f101 = f96, f37, f101 + FNMA f77 = f72, f37, f77 + FNMA f109 = f104, f37, f109 + FNMA f85 = f80, f37, f85 + FNMA f117 = f112, f37, f117 + FNMA f93 = f88, f37, f93 + FNMA f125 = f120, f37, f125 + ;; + FNMA f70 = f64, f38, f70 + FNMA f102 = f96, f38, f102 + FNMA f78 = f72, f38, f78 + FNMA f110 = f104, f38, f110 + FNMA f86 = f80, f38, f86 + FNMA f118 = f112, f38, f118 + FNMA f94 = f88, f38, f94 + FNMA f126 = f120, f38, f126 + ;; + FNMA f71 = f64, f39, f71 + FNMA f103 = f96, f39, f103 + FNMA f79 = f72, f39, f79 + FNMA f111 = f104, f39, f111 + FNMA f87 = f80, f39, f87 + FNMA f119 = f112, f39, f119 + FNMA f95 = f88, f39, f95 + FNMA f127 = f120, f39, f127 + ;; + FMPY f65 = f65, f40 + FMPY f97 = f97, f40 + FMPY f73 = f73, f40 + FMPY f105 = f105, f40 + FMPY f81 = f81, f40 + FMPY f113 = f113, f40 + FMPY f89 = f89, f40 + FMPY f121 = f121, f40 + ;; + FNMA f66 = f65, f41, f66 + FNMA f98 = f97, f41, f98 + FNMA f74 = f73, f41, f74 + FNMA f106 = f105, f41, f106 + FNMA f82 = f81, f41, f82 + FNMA f114 = f113, f41, f114 + FNMA f90 = f89, f41, f90 + FNMA f122 = f121, f41, f122 + FNMA f67 = f65, f42, f67 + FNMA f99 = f97, f42, f99 + FNMA f75 = f73, f42, f75 + FNMA f107 = f105, f42, f107 + FNMA f83 = f81, f42, f83 + FNMA f115 = f113, f42, f115 + FNMA f91 = f89, f42, f91 + FNMA f123 = f121, f42, f123 + ;; + FNMA f68 = f65, f43, f68 + FNMA f100 = f97, f43, f100 + FNMA f76 = f73, f43, f76 + FNMA f108 = f105, f43, f108 + FNMA f84 = f81, f43, f84 + FNMA f116 = f113, f43, f116 + FNMA f92 = f89, f43, f92 + FNMA f124 = f121, f43, f124 + ;; + FNMA f69 = f65, f44, f69 + FNMA f101 = f97, f44, f101 + FNMA f77 = f73, f44, f77 + FNMA f109 = f105, f44, f109 + FNMA f85 = f81, f44, f85 + FNMA f117 = f113, f44, f117 + FNMA f93 = f89, f44, f93 + FNMA f125 = f121, f44, f125 + ;; + FNMA f70 = f65, f45, f70 + FNMA f102 = f97, f45, f102 + FNMA f78 = f73, f45, f78 + FNMA f110 = f105, f45, f110 + FNMA f86 = f81, f45, f86 + FNMA f118 = f113, f45, f118 + FNMA f94 = f89, f45, f94 + FNMA f126 = f121, f45, f126 + ;; + FNMA f71 = f65, f46, f71 + FNMA f103 = f97, f46, f103 + FNMA f79 = f73, f46, f79 + FNMA f111 = f105, f46, f111 + FNMA f87 = f81, f46, f87 + FNMA f119 = f113, f46, f119 + FNMA f95 = f89, f46, f95 + FNMA f127 = f121, f46, f127 + ;; + FMPY f66 = f66, f47 + FMPY f98 = f98, f47 + FMPY f74 = f74, f47 + FMPY f106 = f106, f47 + FMPY f82 = f82, f47 + FMPY f114 = f114, f47 + FMPY f90 = f90, f47 + FMPY f122 = f122, f47 + ;; + FNMA f67 = f66, f48, f67 + FNMA f99 = f98, f48, f99 + FNMA f75 = f74, f48, f75 + FNMA f107 = f106, f48, f107 + FNMA f83 = f82, f48, f83 + FNMA f115 = f114, f48, f115 + FNMA f91 = f90, f48, f91 + FNMA f123 = f122, f48, f123 + FNMA f68 = f66, f49, f68 + FNMA f100 = f98, f49, f100 + FNMA f76 = f74, f49, f76 + FNMA f108 = f106, f49, f108 + FNMA f84 = f82, f49, f84 + FNMA f116 = f114, f49, f116 + FNMA f92 = f90, f49, f92 + FNMA f124 = f122, f49, f124 + ;; + FNMA f69 = f66, f50, f69 + FNMA f101 = f98, f50, f101 + FNMA f77 = f74, f50, f77 + FNMA f109 = f106, f50, f109 + FNMA f85 = f82, f50, f85 + FNMA f117 = f114, f50, f117 + FNMA f93 = f90, f50, f93 + FNMA f125 = f122, f50, f125 + ;; + FNMA f70 = f66, f51, f70 + FNMA f102 = f98, f51, f102 + FNMA f78 = f74, f51, f78 + FNMA f110 = f106, f51, f110 + FNMA f86 = f82, f51, f86 + FNMA f118 = f114, f51, f118 + FNMA f94 = f90, f51, f94 + FNMA f126 = f122, f51, f126 + ;; + FNMA f71 = f66, f52, f71 + FNMA f103 = f98, f52, f103 + FNMA f79 = f74, f52, f79 + FNMA f111 = f106, f52, f111 + FNMA f87 = f82, f52, f87 + FNMA f119 = f114, f52, f119 + FNMA f95 = f90, f52, f95 + FNMA f127 = f122, f52, f127 + ;; + FMPY f67 = f67, f53 + FMPY f99 = f99, f53 + FMPY f75 = f75, f53 + FMPY f107 = f107, f53 + FMPY f83 = f83, f53 + FMPY f115 = f115, f53 + FMPY f91 = f91, f53 + FMPY f123 = f123, f53 + ;; + FNMA f68 = f67, f54, f68 + FNMA f100 = f99, f54, f100 + FNMA f76 = f75, f54, f76 + FNMA f108 = f107, f54, f108 + FNMA f84 = f83, f54, f84 + FNMA f116 = f115, f54, f116 + FNMA f92 = f91, f54, f92 + FNMA f124 = f123, f54, f124 + ;; + FNMA f69 = f67, f55, f69 + FNMA f101 = f99, f55, f101 + FNMA f77 = f75, f55, f77 + FNMA f109 = f107, f55, f109 + FNMA f85 = f83, f55, f85 + FNMA f117 = f115, f55, f117 + FNMA f93 = f91, f55, f93 + FNMA f125 = f123, f55, f125 + ;; + FNMA f70 = f67, f56, f70 + FNMA f102 = f99, f56, f102 + FNMA f78 = f75, f56, f78 + FNMA f110 = f107, f56, f110 + FNMA f86 = f83, f56, f86 + FNMA f118 = f115, f56, f118 + FNMA f94 = f91, f56, f94 + FNMA f126 = f123, f56, f126 + ;; + FNMA f71 = f67, f57, f71 + FNMA f103 = f99, f57, f103 + FNMA f79 = f75, f57, f79 + FNMA f111 = f107, f57, f111 + FNMA f87 = f83, f57, f87 + FNMA f119 = f115, f57, f119 + FNMA f95 = f91, f57, f95 + FNMA f127 = f123, f57, f127 + ;; + FMPY f68 = f68, f58 + FMPY f100 = f100, f58 + FMPY f76 = f76, f58 + FMPY f108 = f108, f58 + FMPY f84 = f84, f58 + FMPY f116 = f116, f58 + FMPY f92 = f92, f58 + FMPY f124 = f124, f58 + ;; + FNMA f69 = f68, f59, f69 + FNMA f101 = f100, f59, f101 + FNMA f77 = f76, f59, f77 + FNMA f109 = f108, f59, f109 + FNMA f85 = f84, f59, f85 + FNMA f117 = f116, f59, f117 + FNMA f93 = f92, f59, f93 + FNMA f125 = f124, f59, f125 + ;; + FNMA f70 = f68, f60, f70 + FNMA f102 = f100, f60, f102 + FNMA f78 = f76, f60, f78 + FNMA f110 = f108, f60, f110 + FNMA f86 = f84, f60, f86 + FNMA f118 = f116, f60, f118 + FNMA f94 = f92, f60, f94 + FNMA f126 = f124, f60, f126 + ;; + { .mfi + STFD [BOFFSET] = f64, SIZE + FNMA f71 = f68, f61, f71 + } + { .mfi + STFD [BOFFSET2] = f96, SIZE + FNMA f103 = f100, f61, f103 + } + ;; + { .mfi + STFD [BOFFSET] = f72, SIZE + FNMA f79 = f76, f61, f79 + } + { .mfi + STFD [BOFFSET2] = f104, SIZE + FNMA f111 = f108, f61, f111 + } + ;; + { .mfi + STFD [BOFFSET] = f80, SIZE + FNMA f87 = f84, f61, f87 + } + { .mfi + STFD [BOFFSET2] = f112, SIZE + FNMA f119 = f116, f61, f119 + } + ;; + { .mfi + STFD [BOFFSET] = f88, 5 * SIZE + FNMA f95 = f92, f61, f95 + } + { .mfi + STFD [BOFFSET2] = f120, 5 * SIZE + FNMA f127 = f124, f61, f127 + } + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + FMPY f69 = f69, f16 + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + FMPY f101 = f101, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f73, SIZE + FMPY f77 = f77, f16 + } + { .mfi + STFD [BOFFSET2] = f105, SIZE + FMPY f109 = f109, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f81, SIZE + FMPY f85 = f85, f16 + } + { .mfi + STFD [BOFFSET2] = f113, SIZE + FMPY f117 = f117, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f89, 5 * SIZE + FMPY f93 = f93, f16 + } + { .mfi + STFD [BOFFSET2] = f121, 5 * SIZE + FMPY f125 = f125, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + FNMA f70 = f69, f17, f70 + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + FNMA f102 = f101, f17, f102 + } + ;; + { .mfi + STFD [BOFFSET] = f74, SIZE + FNMA f78 = f77, f17, f78 + } + { .mfi + STFD [BOFFSET2] = f106, SIZE + FNMA f110 = f109, f17, f110 + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + FNMA f86 = f85, f17, f86 + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + FNMA f118 = f117, f17, f118 + } + ;; + { .mfi + STFD [BOFFSET] = f90, 5 * SIZE + FNMA f94 = f93, f17, f94 + } + { .mfi + STFD [BOFFSET2] = f122, 5 * SIZE + FNMA f126 = f125, f17, f126 + } + ;; + { .mfi + STFD [BOFFSET] = f67, SIZE + FNMA f71 = f69, f18, f71 + } + { .mfi + STFD [BOFFSET2] = f99, SIZE + FNMA f103 = f101, f18, f103 + } + ;; + { .mfi + STFD [BOFFSET] = f75, SIZE + FNMA f79 = f77, f18, f79 + } + { .mfi + STFD [BOFFSET2] = f107, SIZE + FNMA f111 = f109, f18, f111 + } + ;; + { .mfi + STFD [BOFFSET] = f83, SIZE + FNMA f87 = f85, f18, f87 + } + { .mfi + STFD [BOFFSET2] = f115, SIZE + FNMA f119 = f117, f18, f119 + } + ;; + { .mfi + STFD [BOFFSET] = f91, 5 * SIZE + FNMA f95 = f93, f18, f95 + } + { .mfi + STFD [BOFFSET2] = f123, 5 * SIZE + FNMA f127 = f125, f18, f127 + } + ;; + { .mfi + STFD [BOFFSET] = f68, SIZE + FMPY f70 = f70, f19 + } + { .mfi + STFD [BOFFSET2] = f100, SIZE + FMPY f102 = f102, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f76, SIZE + FMPY f78 = f78, f19 + } + { .mfi + STFD [BOFFSET2] = f108, SIZE + FMPY f110 = f110, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f84, SIZE + FMPY f86 = f86, f19 + } + { .mfi + STFD [BOFFSET2] = f116, SIZE + FMPY f118 = f118, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f92, 5 * SIZE + FMPY f94 = f94, f19 + } + { .mfi + STFD [BOFFSET2] = f124, 5 * SIZE + FMPY f126 = f126, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f69, SIZE + FNMA f71 = f70, f20, f71 + } + { .mfi + STFD [BOFFSET2] = f101, SIZE + FNMA f103 = f102, f20, f103 + } + ;; + { .mfi + STFD [BOFFSET] = f77, SIZE + FNMA f79 = f78, f20, f79 + } + { .mfi + STFD [BOFFSET2] = f109, SIZE + FNMA f111 = f110, f20, f111 + } + ;; + { .mfi + STFD [BOFFSET] = f85, SIZE + FNMA f87 = f86, f20, f87 + } + { .mfi + STFD [BOFFSET2] = f117, SIZE + FNMA f119 = f118, f20, f119 + } + ;; + { .mfi + STFD [BOFFSET] = f93, 5 * SIZE + FNMA f95 = f94, f20, f95 + } + { .mfi + STFD [BOFFSET2] = f125, 5 * SIZE + FNMA f127 = f126, f20, f127 + } + ;; + { .mfi + STFD [BOFFSET] = f70, SIZE + FMPY f71 = f71, f21 + } + { .mfi + STFD [BOFFSET2] = f102, SIZE + FMPY f103 = f103, f21 + } + ;; + { .mfi + STFD [BOFFSET] = f78, SIZE + FMPY f79 = f79, f21 + } + { .mfi + STFD [BOFFSET2] = f110, SIZE + FMPY f111 = f111, f21 + } + ;; + { .mfi + STFD [BOFFSET] = f86, SIZE + FMPY f87 = f87, f21 + } + { .mfi + STFD [BOFFSET2] = f118, SIZE + FMPY f119 = f119, f21 + } + ;; + { .mfi + STFD [BOFFSET] = f94, 5 * SIZE + FMPY f95 = f95, f21 + } + { .mfi + STFD [BOFFSET2] = f126, 5 * SIZE + FMPY f127 = f127, f21 + } + ;; + { .mmi + STFD [BOFFSET] = f71, SIZE + STFD [BOFFSET2] = f103, SIZE + } + ;; + { .mmi + STFD [BOFFSET] = f79, SIZE + STFD [BOFFSET2] = f111, SIZE + } + ;; + { .mmi + STFD [BOFFSET] = f87, SIZE + STFD [BOFFSET2] = f119, SIZE + adds C9 = 4 * SIZE, C1 + } + ;; + { .mfi + STFD [BOFFSET] = f95 + adds BOFFSET = - 59 * SIZE, BOFFSET + } + { .mfi + STFD [BOFFSET2] = f127 + adds BOFFSET2 = - 59 * SIZE, BOFFSET2 + } + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f48, f49 = [AOFFSET], 2 * SIZE + FSUB f64 = f32, f64 + } + { .mfi + FSUB f65 = f33, f65 + } + ;; + { .mfi + LDFPD f50, f51 = [AOFFSET], 2 * SIZE + FSUB f66 = f34, f66 + } + { .mfi + FSUB f67 = f35, f67 + } + ;; + { .mfi + LDFPD f52, f53 = [AOFFSET], 2 * SIZE + FSUB f68 = f36, f68 + } + { .mfi + FSUB f69 = f37, f69 + } + ;; + { .mfi + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + FSUB f70 = f38, f70 + } + { .mfi + FSUB f71 = f39, f71 + } + ;; + { .mfi + LDFPD f56, f57 = [AOFFSET], 2 * SIZE + FSUB f72 = f40, f72 + } + { .mfi + FSUB f73 = f41, f73 + } + ;; + { .mfi + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + FSUB f74 = f42, f74 + } + { .mfi + FSUB f75 = f43, f75 + } + ;; + { .mfi + LDFPD f60, f61 = [AOFFSET], 2 * SIZE + FSUB f76 = f44, f76 + } + { .mfi + FSUB f77 = f45, f77 + } + ;; + { .mfi + LDFPD f62, f63 = [AOFFSET], 2 * SIZE + FSUB f78 = f46, f78 + } + { .mfi + FSUB f79 = f47, f79 + } + ;; + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FSUB f80 = f48, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f81 = f49, f81 + nop __LINE__ + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + FSUB f82 = f50, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f83 = f51, f83 + nop __LINE__ + } + ;; + { .mfi + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + FSUB f84 = f52, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f85 = f53, f85 + nop __LINE__ + } + ;; + { .mfi + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + FSUB f86 = f54, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f87 = f55, f87 + nop __LINE__ + } + ;; + { .mfi + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FSUB f88 = f56, f88 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f89 = f57, f89 + nop __LINE__ + } + ;; + { .mfi + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FSUB f90 = f58, f90 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f91 = f59, f91 + nop __LINE__ + } + ;; + { .mfi + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FSUB f92 = f60, f92 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f93 = f61, f93 + nop __LINE__ + } + ;; + { .mfi + LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FSUB f94 = f62, f94 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f95 = f63, f95 + nop __LINE__ + } + ;; + { .mfi + LDFPD f48, f49 = [AOFFSET], 2 * SIZE + FSUB f96 = f32, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f97 = f33, f97 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [AOFFSET], 2 * SIZE + FSUB f98 = f34, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f99 = f35, f99 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f53 = [AOFFSET], 2 * SIZE + FSUB f100 = f36, f100 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f101 = f37, f101 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + FSUB f102 = f38, f102 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f103 = f39, f103 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [AOFFSET], 2 * SIZE + FSUB f104 = f40, f104 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f105 = f41, f105 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + FSUB f106 = f42, f106 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f107 = f43, f107 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [AOFFSET], 2 * SIZE + FSUB f108 = f44, f108 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f109 = f45, f109 + nop __LINE__ + } + ;; + { .mfi + LDFPD f62, f63 = [AOFFSET] + FSUB f110 = f46, f110 + adds AOFFSET = -62 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FSUB f111 = f47, f111 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f112 = f48, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f113 = f49, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f114 = f50, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f115 = f51, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f116 = f52, f116 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f117 = f53, f117 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f118 = f54, f118 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f119 = f55, f119 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f120 = f56, f120 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f121 = f57, f121 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f122 = f58, f122 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f123 = f59, f123 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f124 = f60, f124 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f125 = f61, f125 + nop __LINE__ + } + ;; + { .mfi + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + FSUB f126 = f62, f126 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f127 = f63, f127 + nop __LINE__ + } + ;; + { .mfi + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + FMPY f64 = f64, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f68 = f68, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + FMPY f65 = f65, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f69 = f69, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f38, f39 = [BOFFSET] + FMPY f66 = f66, f32 + adds BOFFSET = 3 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f70 = f70, f32 + nop __LINE__ + } + ;; + { .mfi + LDFD f40 = [BOFFSET], 1 * SIZE + FMPY f67 = f67, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f71 = f71, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f41, f42 = [BOFFSET], 2 * SIZE + FNMA f72 = f64, f33, f72 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f76 = f68, f33, f76 + nop __LINE__ + } + ;; + { .mfi + LDFPD f43, f44 = [BOFFSET], 2 * SIZE + FNMA f73 = f65, f33, f73 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f77 = f69, f33, f77 + nop __LINE__ + } + ;; + { .mfi + LDFPD f45, f46 = [BOFFSET] + FNMA f74 = f66, f33, f74 + adds BOFFSET = 4 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f78 = f70, f33, f78 + nop __LINE__ + } + ;; + { .mfi + LDFPD f47, f48 = [BOFFSET], 2 * SIZE + FNMA f75 = f67, f33, f75 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f79 = f71, f33, f79 + nop __LINE__ + } + ;; + { .mfi + LDFPD f49, f50 = [BOFFSET], 2 * SIZE + FNMA f80 = f64, f34, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f84 = f68, f34, f84 + nop __LINE__ + } + ;; + { .mfi + LDFPD f51, f52 = [BOFFSET] + FNMA f81 = f65, f34, f81 + adds BOFFSET = 5 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f85 = f69, f34, f85 + nop __LINE__ + } + ;; + { .mfi + LDFD f53 = [BOFFSET], 1 * SIZE + FNMA f82 = f66, f34, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f86 = f70, f34, f86 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + FNMA f83 = f67, f34, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f87 = f71, f34, f87 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [BOFFSET] + FNMA f88 = f64, f35, f88 + adds BOFFSET = 6 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f92 = f68, f35, f92 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FNMA f89 = f65, f35, f89 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f93 = f69, f35, f93 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [BOFFSET] + FNMA f90 = f66, f35, f90 + adds BOFFSET = 7 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f94 = f70, f35, f94 + nop __LINE__ + } + ;; + { .mfi + LDFD f16 = [BOFFSET], 1 * SIZE + FNMA f91 = f67, f35, f91 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f95 = f71, f35, f95 + nop __LINE__ + } + ;; + { .mfi + LDFPD f17, f18 = [BOFFSET] + FNMA f96 = f64, f36, f96 + adds BOFFSET = 8 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f100 = f68, f36, f100 + nop __LINE__ + } + ;; + { .mfi + LDFPD f19, f20 = [BOFFSET] + FNMA f97 = f65, f36, f97 + adds BOFFSET = 9 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f101 = f69, f36, f101 + nop __LINE__ + } + ;; + { .mfi + LDFD f21 = [BOFFSET] + FNMA f98 = f66, f36, f98 + adds BOFFSET = -63 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f102 = f70, f36, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f99 = f67, f36, f99 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f103 = f71, f36, f103 + nop __LINE__ + } + ;; + FNMA f104 = f64, f37, f104 + FNMA f108 = f68, f37, f108 + FNMA f105 = f65, f37, f105 + FNMA f109 = f69, f37, f109 + FNMA f106 = f66, f37, f106 + FNMA f110 = f70, f37, f110 + FNMA f107 = f67, f37, f107 + FNMA f111 = f71, f37, f111 + ;; + FNMA f112 = f64, f38, f112 + FNMA f116 = f68, f38, f116 + FNMA f113 = f65, f38, f113 + FNMA f117 = f69, f38, f117 + FNMA f114 = f66, f38, f114 + FNMA f118 = f70, f38, f118 + FNMA f115 = f67, f38, f115 + FNMA f119 = f71, f38, f119 + ;; + FNMA f120 = f64, f39, f120 + FNMA f124 = f68, f39, f124 + FNMA f121 = f65, f39, f121 + FNMA f125 = f69, f39, f125 + FNMA f122 = f66, f39, f122 + FNMA f126 = f70, f39, f126 + FNMA f123 = f67, f39, f123 + FNMA f127 = f71, f39, f127 + ;; + FMPY f72 = f72, f40 + FMPY f76 = f76, f40 + FMPY f73 = f73, f40 + FMPY f77 = f77, f40 + FMPY f74 = f74, f40 + FMPY f78 = f78, f40 + FMPY f75 = f75, f40 + FMPY f79 = f79, f40 + ;; + FNMA f80 = f72, f41, f80 + FNMA f84 = f76, f41, f84 + FNMA f81 = f73, f41, f81 + FNMA f85 = f77, f41, f85 + FNMA f82 = f74, f41, f82 + FNMA f86 = f78, f41, f86 + FNMA f83 = f75, f41, f83 + FNMA f87 = f79, f41, f87 + ;; + FNMA f88 = f72, f42, f88 + FNMA f92 = f76, f42, f92 + FNMA f89 = f73, f42, f89 + FNMA f93 = f77, f42, f93 + FNMA f90 = f74, f42, f90 + FNMA f94 = f78, f42, f94 + FNMA f91 = f75, f42, f91 + FNMA f95 = f79, f42, f95 + ;; + FNMA f96 = f72, f43, f96 + FNMA f100 = f76, f43, f100 + FNMA f97 = f73, f43, f97 + FNMA f101 = f77, f43, f101 + FNMA f98 = f74, f43, f98 + FNMA f102 = f78, f43, f102 + FNMA f99 = f75, f43, f99 + FNMA f103 = f79, f43, f103 + ;; + FNMA f104 = f72, f44, f104 + FNMA f108 = f76, f44, f108 + FNMA f105 = f73, f44, f105 + FNMA f109 = f77, f44, f109 + FNMA f106 = f74, f44, f106 + FNMA f110 = f78, f44, f110 + FNMA f107 = f75, f44, f107 + FNMA f111 = f79, f44, f111 + ;; + FNMA f112 = f72, f45, f112 + FNMA f116 = f76, f45, f116 + FNMA f113 = f73, f45, f113 + FNMA f117 = f77, f45, f117 + FNMA f114 = f74, f45, f114 + FNMA f118 = f78, f45, f118 + FNMA f115 = f75, f45, f115 + FNMA f119 = f79, f45, f119 + ;; + FNMA f120 = f72, f46, f120 + FNMA f124 = f76, f46, f124 + FNMA f121 = f73, f46, f121 + FNMA f125 = f77, f46, f125 + FNMA f122 = f74, f46, f122 + FNMA f126 = f78, f46, f126 + FNMA f123 = f75, f46, f123 + FNMA f127 = f79, f46, f127 + ;; + FMPY f80 = f80, f47 + FMPY f84 = f84, f47 + FMPY f81 = f81, f47 + FMPY f85 = f85, f47 + FMPY f82 = f82, f47 + FMPY f86 = f86, f47 + FMPY f83 = f83, f47 + FMPY f87 = f87, f47 + ;; + FNMA f88 = f80, f48, f88 + FNMA f92 = f84, f48, f92 + FNMA f89 = f81, f48, f89 + FNMA f93 = f85, f48, f93 + FNMA f90 = f82, f48, f90 + FNMA f94 = f86, f48, f94 + FNMA f91 = f83, f48, f91 + FNMA f95 = f87, f48, f95 + ;; + FNMA f96 = f80, f49, f96 + FNMA f100 = f84, f49, f100 + FNMA f97 = f81, f49, f97 + FNMA f101 = f85, f49, f101 + FNMA f98 = f82, f49, f98 + FNMA f102 = f86, f49, f102 + FNMA f99 = f83, f49, f99 + FNMA f103 = f87, f49, f103 + ;; + FNMA f104 = f80, f50, f104 + FNMA f108 = f84, f50, f108 + FNMA f105 = f81, f50, f105 + FNMA f109 = f85, f50, f109 + FNMA f106 = f82, f50, f106 + FNMA f110 = f86, f50, f110 + FNMA f107 = f83, f50, f107 + FNMA f111 = f87, f50, f111 + ;; + FNMA f112 = f80, f51, f112 + FNMA f116 = f84, f51, f116 + FNMA f113 = f81, f51, f113 + FNMA f117 = f85, f51, f117 + FNMA f114 = f82, f51, f114 + FNMA f118 = f86, f51, f118 + FNMA f115 = f83, f51, f115 + FNMA f119 = f87, f51, f119 + ;; + FNMA f120 = f80, f52, f120 + FNMA f124 = f84, f52, f124 + FNMA f121 = f81, f52, f121 + FNMA f125 = f85, f52, f125 + FNMA f122 = f82, f52, f122 + FNMA f126 = f86, f52, f126 + FNMA f123 = f83, f52, f123 + FNMA f127 = f87, f52, f127 + ;; + FMPY f88 = f88, f53 + FMPY f92 = f92, f53 + FMPY f89 = f89, f53 + FMPY f93 = f93, f53 + FMPY f90 = f90, f53 + FMPY f94 = f94, f53 + FMPY f91 = f91, f53 + FMPY f95 = f95, f53 + ;; + FNMA f96 = f88, f54, f96 + FNMA f100 = f92, f54, f100 + FNMA f97 = f89, f54, f97 + FNMA f101 = f93, f54, f101 + FNMA f98 = f90, f54, f98 + FNMA f102 = f94, f54, f102 + FNMA f99 = f91, f54, f99 + FNMA f103 = f95, f54, f103 + ;; + FNMA f104 = f88, f55, f104 + FNMA f108 = f92, f55, f108 + FNMA f105 = f89, f55, f105 + FNMA f109 = f93, f55, f109 + FNMA f106 = f90, f55, f106 + FNMA f110 = f94, f55, f110 + FNMA f107 = f91, f55, f107 + FNMA f111 = f95, f55, f111 + ;; + FNMA f112 = f88, f56, f112 + FNMA f116 = f92, f56, f116 + FNMA f113 = f89, f56, f113 + FNMA f117 = f93, f56, f117 + FNMA f114 = f90, f56, f114 + FNMA f118 = f94, f56, f118 + FNMA f115 = f91, f56, f115 + FNMA f119 = f95, f56, f119 + ;; + FNMA f120 = f88, f57, f120 + FNMA f124 = f92, f57, f124 + FNMA f121 = f89, f57, f121 + FNMA f125 = f93, f57, f125 + FNMA f122 = f90, f57, f122 + FNMA f126 = f94, f57, f126 + FNMA f123 = f91, f57, f123 + FNMA f127 = f95, f57, f127 + ;; + FMPY f96 = f96, f58 + FMPY f100 = f100, f58 + FMPY f97 = f97, f58 + FMPY f101 = f101, f58 + FMPY f98 = f98, f58 + FMPY f102 = f102, f58 + FMPY f99 = f99, f58 + FMPY f103 = f103, f58 + ;; + FNMA f104 = f96, f59, f104 + FNMA f108 = f100, f59, f108 + FNMA f105 = f97, f59, f105 + FNMA f109 = f101, f59, f109 + FNMA f106 = f98, f59, f106 + FNMA f110 = f102, f59, f110 + FNMA f107 = f99, f59, f107 + FNMA f111 = f103, f59, f111 + ;; + FNMA f112 = f96, f60, f112 + FNMA f116 = f100, f60, f116 + FNMA f113 = f97, f60, f113 + FNMA f117 = f101, f60, f117 + FNMA f114 = f98, f60, f114 + FNMA f118 = f102, f60, f118 + FNMA f115 = f99, f60, f115 + FNMA f119 = f103, f60, f119 + ;; + { .mfi + STFD [AOFFSET] = f64, SIZE + FNMA f120 = f96, f61, f120 + } + { .mfi + STFD [AOFFSET2] = f68, SIZE + FNMA f124 = f100, f61, f124 + } + ;; + { .mfi + STFD [AOFFSET] = f65, SIZE + FNMA f121 = f97, f61, f121 + } + { .mfi + STFD [AOFFSET2] = f69, SIZE + FNMA f125 = f101, f61, f125 + } + ;; + { .mfi + STFD [AOFFSET] = f66, SIZE + FNMA f122 = f98, f61, f122 + } + { .mfi + STFD [AOFFSET2] = f70, SIZE + FNMA f126 = f102, f61, f126 + } + ;; + { .mfi + STFD [AOFFSET] = f67, 5 * SIZE + FNMA f123 = f99, f61, f123 + } + { .mfi + STFD [AOFFSET2] = f71, 5 * SIZE + FNMA f127 = f103, f61, f127 + } + ;; + { .mfi + STFD [AOFFSET] = f72, SIZE + FMPY f104 = f104, f16 + } + { .mfi + STFD [AOFFSET2] = f76, SIZE + FMPY f108 = f108, f16 + } + ;; + { .mfi + STFD [AOFFSET] = f73, SIZE + FMPY f105 = f105, f16 + } + { .mfi + STFD [AOFFSET2] = f77, SIZE + FMPY f109 = f109, f16 + } + ;; + { .mfi + STFD [AOFFSET] = f74, SIZE + FMPY f106 = f106, f16 + } + { .mfi + STFD [AOFFSET2] = f78, SIZE + FMPY f110 = f110, f16 + } + ;; + { .mfi + STFD [AOFFSET] = f75, 5 * SIZE + FMPY f107 = f107, f16 + } + { .mfi + STFD [AOFFSET2] = f79, 5 * SIZE + FMPY f111 = f111, f16 + } + ;; + { .mfi + STFD [AOFFSET] = f80, SIZE + FNMA f112 = f104, f17, f112 + } + { .mfi + STFD [AOFFSET2] = f84, SIZE + FNMA f116 = f108, f17, f116 + } + ;; + { .mfi + STFD [AOFFSET] = f81, SIZE + FNMA f113 = f105, f17, f113 + } + { .mfi + STFD [AOFFSET2] = f85, SIZE + FNMA f117 = f109, f17, f117 + } + ;; + { .mfi + STFD [AOFFSET] = f82, SIZE + FNMA f114 = f106, f17, f114 + } + { .mfi + STFD [AOFFSET2] = f86, SIZE + FNMA f118 = f110, f17, f118 + } + ;; + { .mfi + STFD [AOFFSET] = f83, 5 * SIZE + FNMA f115 = f107, f17, f115 + } + { .mfi + STFD [AOFFSET2] = f87, 5 * SIZE + FNMA f119 = f111, f17, f119 + } + ;; + { .mfi + STFD [AOFFSET] = f88, SIZE + FNMA f120 = f104, f18, f120 + } + { .mfi + STFD [AOFFSET2] = f92, SIZE + FNMA f124 = f108, f18, f124 + } + ;; + { .mfi + STFD [AOFFSET] = f89, SIZE + FNMA f121 = f105, f18, f121 + } + { .mfi + STFD [AOFFSET2] = f93, SIZE + FNMA f125 = f109, f18, f125 + } + ;; + { .mfi + STFD [AOFFSET] = f90, SIZE + FNMA f122 = f106, f18, f122 + } + { .mfi + STFD [AOFFSET2] = f94, SIZE + FNMA f126 = f110, f18, f126 + } + ;; + { .mfi + STFD [AOFFSET] = f91, 5 * SIZE + FNMA f123 = f107, f18, f123 + } + { .mfi + STFD [AOFFSET2] = f95, 5 * SIZE + FNMA f127 = f111, f18, f127 + } + ;; + { .mfi + STFD [AOFFSET] = f96, SIZE + FMPY f112 = f112, f19 + } + { .mfi + STFD [AOFFSET2] = f100, SIZE + FMPY f116 = f116, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f97, SIZE + FMPY f113 = f113, f19 + } + { .mfi + STFD [AOFFSET2] = f101, SIZE + FMPY f117 = f117, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f98, SIZE + FMPY f114 = f114, f19 + } + { .mfi + STFD [AOFFSET2] = f102, SIZE + FMPY f118 = f118, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f99, 5 * SIZE + FMPY f115 = f115, f19 + } + { .mfi + STFD [AOFFSET2] = f103, 5 * SIZE + FMPY f119 = f119, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f104, SIZE + FNMA f120 = f112, f20, f120 + } + { .mfi + STFD [AOFFSET2] = f108, SIZE + FNMA f124 = f116, f20, f124 + } + ;; + { .mfi + STFD [AOFFSET] = f105, SIZE + FNMA f121 = f113, f20, f121 + } + { .mfi + STFD [AOFFSET2] = f109, SIZE + FNMA f125 = f117, f20, f125 + } + ;; + { .mfi + STFD [AOFFSET] = f106, SIZE + FNMA f122 = f114, f20, f122 + } + { .mfi + STFD [AOFFSET2] = f110, SIZE + FNMA f126 = f118, f20, f126 + } + ;; + { .mfi + STFD [AOFFSET] = f107, 5 * SIZE + FNMA f123 = f115, f20, f123 + } + { .mfi + STFD [AOFFSET2] = f111, 5 * SIZE + FNMA f127 = f119, f20, f127 + } + ;; + { .mfi + STFD [AOFFSET] = f112, SIZE + FMPY f120 = f120, f21 + } + { .mfi + STFD [AOFFSET2] = f116, SIZE + FMPY f124 = f124, f21 + } + ;; + { .mfi + STFD [AOFFSET] = f113, SIZE + FMPY f121 = f121, f21 + } + { .mfi + STFD [AOFFSET2] = f117, SIZE + FMPY f125 = f125, f21 + } + ;; + { .mfi + STFD [AOFFSET] = f114, SIZE + FMPY f122 = f122, f21 + } + { .mfi + STFD [AOFFSET2] = f118, SIZE + FMPY f126 = f126, f21 + } + ;; + { .mfi + STFD [AOFFSET] = f115, 5 * SIZE + FMPY f123 = f123, f21 + } + { .mfi + STFD [AOFFSET2] = f119, 5 * SIZE + FMPY f127 = f127, f21 + } + ;; + { .mmi + STFD [AOFFSET] = f120, SIZE + STFD [AOFFSET2] = f124, SIZE + } + ;; + { .mmi + STFD [AOFFSET] = f121, SIZE + STFD [AOFFSET2] = f125, SIZE + } + ;; + { .mmi + STFD [AOFFSET] = f122, SIZE + STFD [AOFFSET2] = f126, SIZE + adds C9 = 4 * SIZE, C1 + } + ;; + { .mfi + STFD [AOFFSET] = f123 + adds AOFFSET = - 59 * SIZE, AOFFSET + } + { .mfi + STFD [AOFFSET2] = f127 + adds AOFFSET2 = - 59 * SIZE, AOFFSET2 + } + ;; +#endif + + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + adds C10 = 4 * SIZE, C2 + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C1 ] = f67, 5 * SIZE + STFD [C9 ] = f71 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + STFD [C10] = f76, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + STFD [C10] = f77, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + STFD [C10] = f78, SIZE + adds C12 = 4 * SIZE, C4 + } + ;; + { .mmi + STFD [C2 ] = f75, 5 * SIZE + STFD [C10] = f79 + nop __LINE__ + } + ;; + { .mmf + STFD [C3 ] = f80, SIZE + STFD [C11] = f84, SIZE + mov f80 = f0 + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + STFD [C11] = f85, SIZE + adds C13 = 4 * SIZE, C5 + } + ;; + { .mmi + STFD [C3 ] = f82, SIZE + STFD [C11] = f86, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C3 ] = f83, 5 * SIZE + STFD [C11] = f87 + adds C14 = 4 * SIZE, C6 + } + ;; + { .mmf + STFD [C4 ] = f88, SIZE + STFD [C12] = f92, SIZE + mov f88 = f0 + } + ;; + { .mmi + STFD [C4 ] = f89, SIZE + STFD [C12] = f93, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C4 ] = f90, SIZE + STFD [C12] = f94, SIZE + adds C16 = 4 * SIZE, C8 + } + ;; + { .mmi + STFD [C4 ] = f91, 5 * SIZE + STFD [C12] = f95 + cmp.ne p6, p0 = 1, I + } + ;; + { .mmf + STFD [C5 ] = f96, SIZE + STFD [C13] = f100, SIZE + mov f96 = f0 + } + ;; + { .mmi + STFD [C5 ] = f97, SIZE + STFD [C13] = f101, SIZE + adds I = -1, I + } + ;; + { .mmi + STFD [C5 ] = f98, SIZE + STFD [C13] = f102, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C5 ] = f99, 5 * SIZE + STFD [C13] = f103 + adds C15 = 4 * SIZE, C7 + } + ;; + { .mmf + STFD [C6 ] = f104, SIZE + STFD [C14] = f108, SIZE + mov f104 = f0 + } + ;; + { .mmi + STFD [C6 ] = f105, SIZE + STFD [C14] = f109, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C6 ] = f106, SIZE + STFD [C14] = f110, SIZE + sub L = K, KK + } + ;; + { .mmi + STFD [C6 ] = f107, 5 * SIZE + STFD [C14] = f111 + nop __LINE__ + } + ;; + { .mmf + STFD [C7 ] = f112, SIZE + STFD [C15] = f116, SIZE + mov f112 = f0 + } + ;; + { .mmi + STFD [C7 ] = f113, SIZE + STFD [C15] = f117, SIZE + shladd L = L, BASE_SHIFT, r0 + } + ;; + { .mmi + STFD [C7 ] = f114, SIZE + STFD [C15] = f118, SIZE + shladd AOFFSET = L, 3, AOFFSET + } + ;; + { .mmi + STFD [C7 ] = f115, 5 * SIZE + STFD [C15] = f119 + shladd BOFFSET = L, 3, BOFFSET + } + ;; + { .mmf + STFD [C8 ] = f120, SIZE + STFD [C16] = f124, SIZE + mov f120 = f0 + } + ;; + { .mmi + STFD [C8 ] = f121, SIZE + STFD [C16] = f125, SIZE +#ifdef LT + adds KK = 8, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi + STFD [C8 ] = f122, SIZE + STFD [C16] = f126, SIZE + mov L = KK + } + ;; + { .mmb + STFD [C8 ] = f123, 5 * SIZE + STFD [C16] = f127 + (p6) br.cond.dptk .L011 + } + ;; + +.L020: + { .mib + mov L = KK + tbit.z p6, p0 = M, 2 + (p6) br.cond.dptk .L030 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + nop __LINE__ + } + ;; + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + setf.d f73 = r0 + mov f65 = f0 + } + ;; + { .mfi + setf.d f105 = r0 + mov f81 = f0 + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + mov f89 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f113 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + setf.d f97 = r0 + mov f121 = f0 + shr L = L, 1 + } + ;; + { .mmf + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + setf.d f66 = r0 + mov f67 = f0 + } + { .mfi + setf.d f74 = r0 + mov f75 = f0 + adds L = -1, L + } + ;; + { .mmf + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + setf.d f82 = r0 + mov f83 = f0 + } + { .mfi + setf.d f90 = r0 + mov f91 = f0 + cmp.eq p6, p0 = -1, L + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + setf.d f98 = r0 + mov f99 = f0 + } + { .mfi + setf.d f106 = r0 + mov f107 = f0 + mov ar.lc = L + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + setf.d f114 = r0 + mov f115 = f0 + } + { .mfb + setf.d f122 = r0 + mov f123 = f0 + (p6) br.cond.dpnt .L028 + } + ;; + +.L022: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + (p5) adds C10 = 2 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + (p5) adds C11 = 2 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + (p5) adds C12 = 2 * SIZE, C4 + } + ;; + { .mfi + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + (p5) adds C13 = 2 * SIZE, C5 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + (p5) adds C14 = 2 * SIZE, C6 + } + ;; + { .mfi + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + (p5) adds C15 = 2 * SIZE, C7 + } + { .mfi + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + (p5) adds C16 = 2 * SIZE, C8 + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f106 = f34, f53, f106 // A3 * B6 + nop __LINE__ + } + + { .mfb + nop __LINE__ + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f122 = f34, f55, f122 // A3 * B8 + nop __LINE__ + } + + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f107 = f35, f53, f107 // A4 * B6 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f123 = f35, f55, f123 // A4 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f106 = f42, f61, f106 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f122 = f42, f63, f122 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f107 = f43, f61, f107 // A4 * B6 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f123 = f43, f63, f123 // A4 * B8 + br.cloop.sptk.few .L022 + } + ;; + +.L028: +#ifdef LT + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FSUB f64 = f32, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f72 = f33, f72 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + FSUB f80 = f34, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f88 = f35, f88 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + FSUB f96 = f36, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f104 = f37, f104 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + FSUB f112 = f38, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f120 = f39, f120 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FSUB f65 = f40, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f73 = f41, f73 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FSUB f81 = f42, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f89 = f43, f89 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FSUB f97 = f44, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f105 = f45, f105 + nop __LINE__ + } + ;; + { .mfi + LDFPD f62, f63 = [BOFFSET] + FSUB f113 = f46, f113 + adds BOFFSET = -30 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FSUB f121 = f47, f121 + nop __LINE__ + } + ;; + FSUB f66 = f48, f66 + FSUB f74 = f49, f74 + FSUB f82 = f50, f82 + FSUB f90 = f51, f90 + FSUB f98 = f52, f98 + FSUB f106 = f53, f106 + FSUB f114 = f54, f114 + FSUB f122 = f55, f122 + ;; + FSUB f67 = f56, f67 + FSUB f75 = f57, f75 + FSUB f83 = f58, f83 + FSUB f91 = f59, f91 + FSUB f99 = f60, f99 + FSUB f107 = f61, f107 + FSUB f115 = f62, f115 + FSUB f123 = f63, f123 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET], 2 * SIZE + ;; + LDFPD f48, f49 = [AOFFSET], 2 * SIZE + ;; + LDFPD f50, f51 = [AOFFSET], 2 * SIZE + ;; + LDFPD f52, f53 = [AOFFSET], 2 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET], 2 * SIZE + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET], 2 * SIZE + ;; + LDFPD f62, f63 = [AOFFSET] + adds AOFFSET = -30 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + + FSUB f72 = f36, f72 + FSUB f73 = f37, f73 + FSUB f74 = f38, f74 + FSUB f75 = f39, f75 + + FSUB f80 = f40, f80 + FSUB f81 = f41, f81 + FSUB f82 = f42, f82 + FSUB f83 = f43, f83 + + FSUB f88 = f44, f88 + FSUB f89 = f45, f89 + FSUB f90 = f46, f90 + FSUB f91 = f47, f91 + ;; + FSUB f96 = f48, f96 + FSUB f97 = f49, f97 + FSUB f98 = f50, f98 + FSUB f99 = f51, f99 + ;; + FSUB f104 = f52, f104 + FSUB f105 = f53, f105 + FSUB f106 = f54, f106 + FSUB f107 = f55, f107 + ;; + FSUB f112 = f56, f112 + FSUB f113 = f57, f113 + FSUB f114 = f58, f114 + FSUB f115 = f59, f115 + ;; + FSUB f120 = f60, f120 + FSUB f121 = f61, f121 + FSUB f122 = f62, f122 + FSUB f123 = f63, f123 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f39, f40 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET], -15 * SIZE + ;; + { .mfi + FMPY f64 = f64, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f96 = f96, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f72 = f72, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f104 = f104, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f80 = f80, f32 + } + { .mfi + nop __LINE__ + FMPY f112 = f112, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f88 = f88, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f120 = f120, f32 + nop __LINE__ + } + ;; + { .mfi + FNMA f65 = f64, f33, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f97 = f96, f33, f97 + nop __LINE__ + } + ;; + { .mfi + FNMA f73 = f72, f33, f73 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f105 = f104, f33, f105 + nop __LINE__ + } + ;; + { .mfi + FNMA f81 = f80, f33, f81 + } + { .mfi + nop __LINE__ + FNMA f113 = f112, f33, f113 + nop __LINE__ + } + ;; + { .mfi + FNMA f89 = f88, f33, f89 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f121 = f120, f33, f121 + nop __LINE__ + } + ;; + { .mfi + FNMA f66 = f64, f34, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f98 = f96, f34, f98 + nop __LINE__ + } + ;; + { .mfi + FNMA f74 = f72, f34, f74 + } + { .mfi + nop __LINE__ + FNMA f106 = f104, f34, f106 + nop __LINE__ + } + ;; + { .mfi + FNMA f82 = f80, f34, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f114 = f112, f34, f114 + nop __LINE__ + } + ;; + { .mfi + FNMA f90 = f88, f34, f90 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f122 = f120, f34, f122 + nop __LINE__ + } + ;; + { .mfi + FNMA f67 = f64, f35, f67 + } + { .mfi + nop __LINE__ + FNMA f99 = f96, f35, f99 + nop __LINE__ + } + ;; + { .mfi + FNMA f75 = f72, f35, f75 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f107 = f104, f35, f107 + nop __LINE__ + } + ;; + { .mfi + FNMA f83 = f80, f35, f83 + } + { .mfi + nop __LINE__ + FNMA f115 = f112, f35, f115 + nop __LINE__ + } + ;; + { .mfi + FNMA f91 = f88, f35, f91 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f123 = f120, f35, f123 + adds BOFFSET2 = 4 * SIZE, BOFFSET + } + ;; + FMPY f65 = f65, f36 + FMPY f97 = f97, f36 + FMPY f73 = f73, f36 + FMPY f105 = f105, f36 + FMPY f81 = f81, f36 + FMPY f113 = f113, f36 + FMPY f89 = f89, f36 + FMPY f121 = f121, f36 + ;; + FNMA f66 = f65, f37, f66 + FNMA f98 = f97, f37, f98 + FNMA f74 = f73, f37, f74 + FNMA f106 = f105, f37, f106 + FNMA f82 = f81, f37, f82 + FNMA f114 = f113, f37, f114 + FNMA f90 = f89, f37, f90 + FNMA f122 = f121, f37, f122 + ;; + FNMA f67 = f65, f38, f67 + FNMA f99 = f97, f38, f99 + FNMA f75 = f73, f38, f75 + FNMA f107 = f105, f38, f107 + FNMA f83 = f81, f38, f83 + FNMA f115 = f113, f38, f115 + FNMA f91 = f89, f38, f91 + FNMA f123 = f121, f38, f123 + ;; + { .mfi + STFD [BOFFSET] = f64, SIZE + FMPY f66 = f66, f39 + } + { .mfi + STFD [BOFFSET2] = f96, SIZE + FMPY f98 = f98, f39 + } + ;; + { .mfi + STFD [BOFFSET] = f72, SIZE + FMPY f74 = f74, f39 + } + { .mfi + STFD [BOFFSET2] = f104, SIZE + FMPY f106 = f106, f39 + } + ;; + { .mfi + STFD [BOFFSET] = f80, SIZE + FMPY f82 = f82, f39 + } + { .mfi + STFD [BOFFSET2] = f112, SIZE + FMPY f114 = f114, f39 + } + ;; + { .mfi + STFD [BOFFSET] = f88, 5 * SIZE + FMPY f90 = f90, f39 + } + { .mfi + STFD [BOFFSET2] = f120, 5 * SIZE + FMPY f122 = f122, f39 + } + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + FNMA f67 = f66, f40, f67 + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + FNMA f99 = f98, f40, f99 + } + ;; + { .mfi + STFD [BOFFSET] = f73, SIZE + FNMA f75 = f74, f40, f75 + } + { .mfi + STFD [BOFFSET2] = f105, SIZE + FNMA f107 = f106, f40, f107 + } + ;; + { .mfi + STFD [BOFFSET] = f81, SIZE + FNMA f83 = f82, f40, f83 + } + { .mfi + STFD [BOFFSET2] = f113, SIZE + FNMA f115 = f114, f40, f115 + } + ;; + { .mfi + STFD [BOFFSET] = f89, 5 * SIZE + FNMA f91 = f90, f40, f91 + } + { .mfi + STFD [BOFFSET2] = f121, 5 * SIZE + FNMA f123 = f122, f40, f123 + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + FMPY f67 = f67, f41 + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + FMPY f99 = f99, f41 + } + ;; + { .mfi + STFD [BOFFSET] = f74, SIZE + FMPY f75 = f75, f41 + } + { .mfi + STFD [BOFFSET2] = f106, SIZE + FMPY f107 = f107, f41 + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + FMPY f83 = f83, f41 + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + FMPY f115 = f115, f41 + } + ;; + { .mfi + STFD [BOFFSET] = f90, 5 * SIZE + FMPY f91 = f91, f41 + } + { .mfi + STFD [BOFFSET2] = f122, 5 * SIZE + FMPY f123 = f123, f41 + } + ;; + { .mmf + STFD [BOFFSET] = f67, SIZE + STFD [BOFFSET2] = f99, SIZE + } + ;; + { .mmf + STFD [BOFFSET] = f75, SIZE + STFD [BOFFSET2] = f107, SIZE + } + ;; + { .mmf + STFD [BOFFSET] = f83, SIZE + STFD [BOFFSET2] = f115, SIZE + } + ;; + { .mmf + STFD [BOFFSET] = f91, -27 * SIZE + STFD [BOFFSET2] = f123, -27 * SIZE + } + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [BOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [BOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f47, f48 = [BOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [BOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + LDFPD f60, f61 = [BOFFSET] + adds BOFFSET = 7 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f19, f20 = [BOFFSET] + adds BOFFSET = 9 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + adds BOFFSET = -63 * SIZE, BOFFSET + ;; + + + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + FMPY f66 = f66, f32 + FMPY f67 = f67, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + FNMA f74 = f66, f33, f74 + FNMA f75 = f67, f33, f75 + ;; + FNMA f80 = f64, f34, f80 + FNMA f81 = f65, f34, f81 + FNMA f82 = f66, f34, f82 + FNMA f83 = f67, f34, f83 + ;; + FNMA f88 = f64, f35, f88 + FNMA f89 = f65, f35, f89 + FNMA f90 = f66, f35, f90 + FNMA f91 = f67, f35, f91 + ;; + FNMA f96 = f64, f36, f96 + FNMA f97 = f65, f36, f97 + FNMA f98 = f66, f36, f98 + FNMA f99 = f67, f36, f99 + ;; + FNMA f104 = f64, f37, f104 + FNMA f105 = f65, f37, f105 + FNMA f106 = f66, f37, f106 + FNMA f107 = f67, f37, f107 + ;; + FNMA f112 = f64, f38, f112 + FNMA f113 = f65, f38, f113 + FNMA f114 = f66, f38, f114 + FNMA f115 = f67, f38, f115 + ;; + FNMA f120 = f64, f39, f120 + FNMA f121 = f65, f39, f121 + FNMA f122 = f66, f39, f122 + FNMA f123 = f67, f39, f123 + ;; + FMPY f72 = f72, f40 + FMPY f73 = f73, f40 + FMPY f74 = f74, f40 + FMPY f75 = f75, f40 + ;; + FNMA f80 = f72, f41, f80 + FNMA f81 = f73, f41, f81 + FNMA f82 = f74, f41, f82 + FNMA f83 = f75, f41, f83 + ;; + FNMA f88 = f72, f42, f88 + FNMA f89 = f73, f42, f89 + FNMA f90 = f74, f42, f90 + FNMA f91 = f75, f42, f91 + ;; + FNMA f96 = f72, f43, f96 + FNMA f97 = f73, f43, f97 + FNMA f98 = f74, f43, f98 + FNMA f99 = f75, f43, f99 + ;; + FNMA f104 = f72, f44, f104 + FNMA f105 = f73, f44, f105 + FNMA f106 = f74, f44, f106 + FNMA f107 = f75, f44, f107 + ;; + FNMA f112 = f72, f45, f112 + FNMA f113 = f73, f45, f113 + FNMA f114 = f74, f45, f114 + FNMA f115 = f75, f45, f115 + ;; + FNMA f120 = f72, f46, f120 + FNMA f121 = f73, f46, f121 + FNMA f122 = f74, f46, f122 + FNMA f123 = f75, f46, f123 + ;; + FMPY f80 = f80, f47 + FMPY f81 = f81, f47 + FMPY f82 = f82, f47 + FMPY f83 = f83, f47 + ;; + FNMA f88 = f80, f48, f88 + FNMA f89 = f81, f48, f89 + FNMA f90 = f82, f48, f90 + FNMA f91 = f83, f48, f91 + ;; + FNMA f96 = f80, f49, f96 + FNMA f97 = f81, f49, f97 + FNMA f98 = f82, f49, f98 + FNMA f99 = f83, f49, f99 + ;; + FNMA f104 = f80, f50, f104 + FNMA f105 = f81, f50, f105 + FNMA f106 = f82, f50, f106 + FNMA f107 = f83, f50, f107 + ;; + FNMA f112 = f80, f51, f112 + FNMA f113 = f81, f51, f113 + FNMA f114 = f82, f51, f114 + FNMA f115 = f83, f51, f115 + ;; + FNMA f120 = f80, f52, f120 + FNMA f121 = f81, f52, f121 + FNMA f122 = f82, f52, f122 + FNMA f123 = f83, f52, f123 + ;; + FMPY f88 = f88, f53 + FMPY f89 = f89, f53 + FMPY f90 = f90, f53 + FMPY f91 = f91, f53 + ;; + FNMA f96 = f88, f54, f96 + FNMA f97 = f89, f54, f97 + FNMA f98 = f90, f54, f98 + FNMA f99 = f91, f54, f99 + ;; + FNMA f104 = f88, f55, f104 + FNMA f105 = f89, f55, f105 + FNMA f106 = f90, f55, f106 + FNMA f107 = f91, f55, f107 + ;; + FNMA f112 = f88, f56, f112 + FNMA f113 = f89, f56, f113 + FNMA f114 = f90, f56, f114 + FNMA f115 = f91, f56, f115 + ;; + FNMA f120 = f88, f57, f120 + FNMA f121 = f89, f57, f121 + FNMA f122 = f90, f57, f122 + FNMA f123 = f91, f57, f123 + ;; + FMPY f96 = f96, f58 + FMPY f97 = f97, f58 + FMPY f98 = f98, f58 + FMPY f99 = f99, f58 + ;; + FNMA f104 = f96, f59, f104 + FNMA f105 = f97, f59, f105 + FNMA f106 = f98, f59, f106 + FNMA f107 = f99, f59, f107 + ;; + FNMA f112 = f96, f60, f112 + FNMA f113 = f97, f60, f113 + FNMA f114 = f98, f60, f114 + FNMA f115 = f99, f60, f115 + ;; + FNMA f120 = f96, f61, f120 + FNMA f121 = f97, f61, f121 + FNMA f122 = f98, f61, f122 + FNMA f123 = f99, f61, f123 + ;; + { .mfi + STFD [AOFFSET] = f64, SIZE + FMPY f104 = f104, f16 + } + { .mfi + STFD [AOFFSET2] = f72, SIZE + FMPY f105 = f105, f16 + } + ;; + { .mfi + STFD [AOFFSET] = f65, SIZE + FMPY f106 = f106, f16 + } + { .mfi + STFD [AOFFSET2] = f73, SIZE + FMPY f107 = f107, f16 + } + ;; + { .mfi + STFD [AOFFSET] = f66, SIZE + FNMA f112 = f104, f17, f112 + } + { .mfi + STFD [AOFFSET2] = f74, SIZE + FNMA f113 = f105, f17, f113 + } + ;; + { .mfi + STFD [AOFFSET] = f67, 5 * SIZE + FNMA f114 = f106, f17, f114 + } + { .mfi + STFD [AOFFSET2] = f75, 5 * SIZE + FNMA f115 = f107, f17, f115 + } + ;; + { .mfi + STFD [AOFFSET] = f80, SIZE + FNMA f120 = f104, f18, f120 + } + { .mfi + STFD [AOFFSET2] = f88, SIZE + FNMA f121 = f105, f18, f121 + } + ;; + { .mfi + STFD [AOFFSET] = f81, SIZE + FNMA f122 = f106, f18, f122 + } + { .mfi + STFD [AOFFSET2] = f89, SIZE + FNMA f123 = f107, f18, f123 + } + ;; + { .mfi + STFD [AOFFSET] = f82, SIZE + FMPY f112 = f112, f19 + } + { .mfi + STFD [AOFFSET2] = f90, SIZE + FMPY f113 = f113, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f83, 5 * SIZE + FMPY f114 = f114, f19 + } + { .mfi + STFD [AOFFSET2] = f91, 5 * SIZE + FMPY f115 = f115, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f96, SIZE + FNMA f120 = f112, f20, f120 + } + { .mfi + STFD [AOFFSET2] = f104, SIZE + FNMA f121 = f113, f20, f121 + } + ;; + { .mfi + STFD [AOFFSET] = f97, SIZE + FNMA f122 = f114, f20, f122 + } + { .mfi + STFD [AOFFSET2] = f105, SIZE + FNMA f123 = f115, f20, f123 + } + ;; + { .mfi + STFD [AOFFSET] = f98, SIZE + FMPY f120 = f120, f21 + } + { .mfi + STFD [AOFFSET2] = f106, SIZE + FMPY f121 = f121, f21 + } + ;; + { .mfi + STFD [AOFFSET] = f99, 5 * SIZE + FMPY f122 = f122, f21 + } + { .mfi + STFD [AOFFSET2] = f107, 5 * SIZE + FMPY f123 = f123, f21 + } + ;; + { .mmf + STFD [AOFFSET] = f112, SIZE + STFD [AOFFSET2] = f120, SIZE + } + ;; + { .mmf + STFD [AOFFSET] = f113, SIZE + STFD [AOFFSET2] = f121, SIZE + } + ;; + { .mmf + STFD [AOFFSET] = f114, SIZE + STFD [AOFFSET2] = f122, SIZE + } + ;; + { .mmf + STFD [AOFFSET] = f115, -27 * SIZE + STFD [AOFFSET2] = f123, - 27 * SIZE + } + ;; +#endif + + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C2 ] = f72, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C2 ] = f73, SIZE + nop __LINE__ + } + ;; + { .mmf + STFD [C1 ] = f66, SIZE + STFD [C2 ] = f74, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C1 ] = f67, SIZE + STFD [C2 ] = f75, SIZE + sub L = K, KK + } + ;; + { .mmf + STFD [C3 ] = f80, SIZE + STFD [C4 ] = f88, SIZE + mov f80 = f0 + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + STFD [C4 ] = f89, SIZE + shladd L = L, BASE_SHIFT, r0 + } + ;; + { .mmf + STFD [C3 ] = f82, SIZE + STFD [C4 ] = f90, SIZE + mov f88 = f0 + } + ;; + { .mmi + STFD [C3 ] = f83, SIZE + STFD [C4 ] = f91, SIZE + shladd AOFFSET = L, 2, AOFFSET + } + ;; + { .mmf + STFD [C5 ] = f96, SIZE + STFD [C6 ] = f104, SIZE + mov f96 = f0 + } + ;; + { .mmi + STFD [C5 ] = f97, SIZE + STFD [C6 ] = f105, SIZE + shladd BOFFSET = L, 3, BOFFSET + } + ;; + { .mmf + STFD [C5 ] = f98, SIZE + STFD [C6 ] = f106, SIZE + mov f104 = f0 + } + ;; + { .mmi + STFD [C5 ] = f99, SIZE + STFD [C6 ] = f107, SIZE +#ifdef LT + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmf + STFD [C7 ] = f112, SIZE + STFD [C8 ] = f120, SIZE + mov f112 = f0 + } + ;; + { .mmi + STFD [C7 ] = f113, SIZE + STFD [C8 ] = f121, SIZE + mov L = KK + } + ;; + { .mmf + STFD [C7 ] = f114, SIZE + STFD [C8 ] = f122, SIZE + mov f120 = f0 + } + ;; + { .mmi + STFD [C7 ] = f115, SIZE + STFD [C8 ] = f123, SIZE + nop __LINE__ + } + ;; + .align 8 + +.L030: + { .mib + mov L = KK + tbit.z p6, p0 = M, 1 + (p6) br.cond.dptk .L040 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + nop __LINE__ + } + ;; + { .mmi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + setf.d f73 = r0 + adds L = 1, L + } + ;; + { .mfi + setf.d f105 = r0 + mov f81 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + mov f89 = f0 + nop __LINE__ + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + nop __LINE__ + mov f65 = f0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f113 = f0 + cmp.eq p3, p0 = r0, r0 + } + { .mfi + setf.d f97 = r0 + mov f121 = f0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + cmp.eq p6, p0 = 0, L + adds L = -1, L + } + ;; + { .mib + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L038 + } + ;; + +.L032: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + br.cloop.sptk.few .L032 + } + ;; + +.L038: + +#ifdef LT + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET] + adds BOFFSET = -14 * SIZE, BOFFSET + ;; + { .mfi + FSUB f64 = f32, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f72 = f33, f72 + nop __LINE__ + } + ;; + { .mfi + FSUB f80 = f34, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f88 = f35, f88 + nop __LINE__ + } + ;; + { .mfi + FSUB f96 = f36, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f104 = f37, f104 + nop __LINE__ + } + ;; + { .mfi + FSUB f112 = f38, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f120 = f39, f120 + nop __LINE__ + } + ;; + { .mfi + FSUB f65 = f40, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f73 = f41, f73 + nop __LINE__ + } + ;; + { .mfi + FSUB f81 = f42, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f89 = f43, f89 + nop __LINE__ + } + ;; + { .mfi + FSUB f97 = f44, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f105 = f45, f105 + nop __LINE__ + } + ;; + { .mfi + FSUB f113 = f46, f113 + } + { .mfi + nop __LINE__ + FSUB f121 = f47, f121 + nop __LINE__ + } + ;; + + { .mmi + LDFPD f32, f33 = [AOFFSET] + nop __LINE__ + adds AOFFSET = 3 * SIZE, AOFFSET + } + ;; + { .mfi + LDFD f34 = [AOFFSET], - 3 * SIZE + FMPY f64 = f64, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f96 = f96, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f72 = f72, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f104 = f104, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f80 = f80, f32 + } + { .mfi + nop __LINE__ + FMPY f112 = f112, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f88 = f88, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f120 = f120, f32 + nop __LINE__ + } + ;; + { .mfi + FNMA f65 = f64, f33, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f97 = f96, f33, f97 + nop __LINE__ + } + ;; + { .mfi + FNMA f73 = f72, f33, f73 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f105 = f104, f33, f105 + nop __LINE__ + } + ;; + { .mfi + FNMA f81 = f80, f33, f81 + } + { .mfi + nop __LINE__ + FNMA f113 = f112, f33, f113 + nop __LINE__ + } + ;; + { .mfi + FNMA f89 = f88, f33, f89 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f121 = f120, f33, f121 + adds BOFFSET2 = 4 * SIZE, BOFFSET + } + ;; + { .mfi + STFD [BOFFSET] = f64, SIZE + FMPY f65 = f65, f34 + } + { .mfi + STFD [BOFFSET2] = f96, SIZE + FMPY f97 = f97, f34 + } + ;; + { .mfi + STFD [BOFFSET] = f72, SIZE + FMPY f73 = f73, f34 + } + { .mfi + STFD [BOFFSET2] = f104, SIZE + FMPY f105 = f105, f34 + } + ;; + { .mfi + STFD [BOFFSET] = f80, SIZE + FMPY f81 = f81, f34 + sub L = K, KK + } + { .mfi + STFD [BOFFSET2] = f112, SIZE + FMPY f113 = f113, f34 + } + ;; + { .mfi + STFD [BOFFSET] = f88, 5 * SIZE + FMPY f89 = f89, f34 + shladd L = L, BASE_SHIFT, r0 + } + { .mfi + STFD [BOFFSET2] = f120, 5 * SIZE + FMPY f121 = f121, f34 + } + ;; + { .mmi + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + } + ;; + { .mmi + STFD [BOFFSET] = f73, SIZE + STFD [BOFFSET2] = f105, SIZE + } + ;; + { .mmi + STFD [BOFFSET] = f81, SIZE + STFD [BOFFSET2] = f113, SIZE + } + ;; + { .mmi + STFD [BOFFSET] = f89, -11 * SIZE + STFD [BOFFSET2] = f121, -11 * SIZE + } +#endif + +#ifdef RN + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET] + adds AOFFSET = -14 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f72 = f34, f72 + FSUB f73 = f35, f73 + FSUB f80 = f36, f80 + FSUB f81 = f37, f81 + FSUB f88 = f38, f88 + FSUB f89 = f39, f89 + FSUB f96 = f40, f96 + FSUB f97 = f41, f97 + FSUB f104 = f42, f104 + FSUB f105 = f43, f105 + FSUB f112 = f44, f112 + FSUB f113 = f45, f113 + FSUB f120 = f46, f120 + FSUB f121 = f47, f121 + ;; + + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [BOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [BOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f47, f48 = [BOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [BOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [BOFFSET] + adds BOFFSET = 7 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f19, f20 = [BOFFSET] + adds BOFFSET = 9 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + adds BOFFSET = -63 * SIZE, BOFFSET + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + FNMA f80 = f64, f34, f80 + FNMA f81 = f65, f34, f81 + ;; + FNMA f88 = f64, f35, f88 + FNMA f89 = f65, f35, f89 + FNMA f96 = f64, f36, f96 + FNMA f97 = f65, f36, f97 + FMPY f72 = f72, f40 + FMPY f73 = f73, f40 + FNMA f104 = f64, f37, f104 + FNMA f105 = f65, f37, f105 + FNMA f112 = f64, f38, f112 + FNMA f113 = f65, f38, f113 + FNMA f120 = f64, f39, f120 + FNMA f121 = f65, f39, f121 + ;; + FNMA f80 = f72, f41, f80 + FNMA f81 = f73, f41, f81 + FNMA f88 = f72, f42, f88 + FNMA f89 = f73, f42, f89 + ;; + FNMA f96 = f72, f43, f96 + FNMA f97 = f73, f43, f97 + FNMA f104 = f72, f44, f104 + FNMA f105 = f73, f44, f105 + FMPY f80 = f80, f47 + FMPY f81 = f81, f47 + FNMA f112 = f72, f45, f112 + FNMA f113 = f73, f45, f113 + FNMA f120 = f72, f46, f120 + FNMA f121 = f73, f46, f121 + ;; + FNMA f88 = f80, f48, f88 + FNMA f89 = f81, f48, f89 + FNMA f96 = f80, f49, f96 + FNMA f97 = f81, f49, f97 + FNMA f104 = f80, f50, f104 + FNMA f105 = f81, f50, f105 + FNMA f112 = f80, f51, f112 + FNMA f113 = f81, f51, f113 + ;; + FMPY f88 = f88, f53 + FMPY f89 = f89, f53 + FNMA f120 = f80, f52, f120 + FNMA f121 = f81, f52, f121 + ;; + FNMA f96 = f88, f54, f96 + FNMA f97 = f89, f54, f97 + FNMA f104 = f88, f55, f104 + FNMA f105 = f89, f55, f105 + FNMA f112 = f88, f56, f112 + FNMA f113 = f89, f56, f113 + FNMA f120 = f88, f57, f120 + FNMA f121 = f89, f57, f121 + ;; + FMPY f96 = f96, f58 + FMPY f97 = f97, f58 + ;; + FNMA f104 = f96, f59, f104 + FNMA f105 = f97, f59, f105 + FNMA f112 = f96, f60, f112 + FNMA f113 = f97, f60, f113 + FNMA f120 = f96, f61, f120 + FNMA f121 = f97, f61, f121 + ;; + FMPY f104 = f104, f16 + FMPY f105 = f105, f16 + ;; + FNMA f112 = f104, f17, f112 + FNMA f113 = f105, f17, f113 + ;; + { .mfi + STFD [AOFFSET] = f64, SIZE + FNMA f120 = f104, f18, f120 + } + { .mfi + STFD [AOFFSET2] = f80, SIZE + FNMA f121 = f105, f18, f121 + } + ;; + { .mfi + STFD [AOFFSET] = f65, SIZE + FMPY f112 = f112, f19 + } + { .mfi + STFD [AOFFSET2] = f81, SIZE + FMPY f113 = f113, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f72, SIZE + FNMA f120 = f112, f20, f120 + sub L = K, KK + } + { .mfi + STFD [AOFFSET2] = f88, SIZE + FNMA f121 = f113, f20, f121 + } + ;; + { .mfi + STFD [AOFFSET] = f73, 5 * SIZE + FMPY f120 = f120, f21 + shladd L = L, BASE_SHIFT, r0 + } + { .mfi + STFD [AOFFSET2] = f89, 5 * SIZE + FMPY f121 = f121, f21 + } + ;; + { .mmi + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f112, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [AOFFSET] = f97, SIZE + STFD [AOFFSET2] = f113, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [AOFFSET] = f104, SIZE + STFD [AOFFSET2] = f120, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [AOFFSET] = f105, -11 * SIZE + STFD [AOFFSET2] = f121, - 11 * SIZE + nop __LINE__ + } + ;; +#endif + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C2 ] = f72, SIZE + mov f72 = f0 + } + ;; + { .mmf + STFD [C1 ] = f65, SIZE + STFD [C2 ] = f73, SIZE + mov f64 = f0 + } + ;; + { .mmf + STFD [C3 ] = f80, SIZE + STFD [C4 ] = f88, SIZE + mov f88 = f0 + } + ;; + { .mmf + STFD [C3 ] = f81, SIZE + STFD [C4 ] = f89, SIZE + mov f80 = f0 + } + ;; + { .mmf + STFD [C5 ] = f96, SIZE + STFD [C6 ] = f104, SIZE + mov f96 = f0 + } + ;; + { .mmf + STFD [C5 ] = f97, SIZE + STFD [C6 ] = f105, SIZE + mov f104 = f0 + } + ;; + { .mmf + STFD [C7 ] = f112, SIZE + STFD [C8 ] = f120, SIZE + mov f112 = f0 + } + ;; + { .mmf + STFD [C7 ] = f113, SIZE + STFD [C8 ] = f121, SIZE + mov f120 = f0 + } + { .mmi + shladd AOFFSET = L, 1, AOFFSET + shladd BOFFSET = L, 3, BOFFSET +#ifdef LT + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + ;; + .align 8 + +.L040: + { .mib + mov L = KK + tbit.z p6, p0 = M, 0 + (p6) br.cond.dptk .L049 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + } + ;; + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; + { .mmi + adds L = 1, L + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mii + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + adds L = -1, L + } + ;; + { .mmi + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFD f32 = [AOFFSET], 1 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L048 + } + ;; + +.L042: + { .mfb + lfetch.nt1 [PREB], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + adds L = -1, L + } + { .mmb + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + { .mmb + nop __LINE__ + nop __LINE__ + br.cloop.sptk.few .L042 + } + ;; + +.L048: + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#ifdef LT + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + { .mfi + FSUB f64 = f32, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f72 = f33, f72 + nop __LINE__ + } + ;; + { .mfi + FSUB f80 = f34, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f88 = f35, f88 + nop __LINE__ + } + ;; + { .mfi + FSUB f96 = f36, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f104 = f37, f104 + nop __LINE__ + } + ;; + { .mfi + FSUB f112 = f38, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f120 = f39, f120 + nop __LINE__ + } + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + FSUB f96 = f36, f96 + FSUB f104 = f37, f104 + FSUB f112 = f38, f112 + FSUB f120 = f39, f120 + ;; +#endif + +#ifdef LT + LDFD f32 = [AOFFSET] + ;; + { .mfi + FMPY f64 = f64, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f96 = f96, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f72 = f72, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f104 = f104, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f80 = f80, f32 + } + { .mfi + nop __LINE__ + FMPY f112 = f112, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f88 = f88, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f120 = f120, f32 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f64, SIZE + } + { .mfi + STFD [BOFFSET2] = f96, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f72, SIZE + } + { .mfi + STFD [BOFFSET2] = f104, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f80, SIZE + } + { .mfi + STFD [BOFFSET2] = f112, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f88, -3 * SIZE + } + { .mfi + STFD [BOFFSET2] = f120, -3 * SIZE + } + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [BOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [BOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f47, f48 = [BOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [BOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [BOFFSET] + adds BOFFSET = 7 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f19, f20 = [BOFFSET] + adds BOFFSET = 9 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + adds BOFFSET = -63 * SIZE, BOFFSET + ;; + + FMPY f64 = f64, f32 + ;; + FNMA f72 = f64, f33, f72 + ;; + FNMA f80 = f64, f34, f80 + ;; + FNMA f88 = f64, f35, f88 + ;; + FNMA f96 = f64, f36, f96 + ;; + FNMA f104 = f64, f37, f104 + ;; + FNMA f112 = f64, f38, f112 + ;; + FNMA f120 = f64, f39, f120 + ;; + FMPY f72 = f72, f40 + ;; + FNMA f80 = f72, f41, f80 + ;; + FNMA f88 = f72, f42, f88 + ;; + FNMA f96 = f72, f43, f96 + ;; + FNMA f104 = f72, f44, f104 + ;; + FNMA f112 = f72, f45, f112 + ;; + FNMA f120 = f72, f46, f120 + ;; + FMPY f80 = f80, f47 + ;; + FNMA f88 = f80, f48, f88 + ;; + FNMA f96 = f80, f49, f96 + ;; + FNMA f104 = f80, f50, f104 + ;; + FNMA f112 = f80, f51, f112 + ;; + FNMA f120 = f80, f52, f120 + ;; + FMPY f88 = f88, f53 + ;; + FNMA f96 = f88, f54, f96 + ;; + FNMA f104 = f88, f55, f104 + ;; + FNMA f112 = f88, f56, f112 + ;; + FNMA f120 = f88, f57, f120 + ;; + FMPY f96 = f96, f58 + ;; + FNMA f104 = f96, f59, f104 + ;; + FNMA f112 = f96, f60, f112 + ;; + FNMA f120 = f96, f61, f120 + ;; + FMPY f104 = f104, f16 + ;; + FNMA f112 = f104, f17, f112 + ;; + FNMA f120 = f104, f18, f120 + ;; + FMPY f112 = f112, f19 + ;; + FNMA f120 = f112, f20, f120 + ;; + FMPY f120 = f120, f21 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f96, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f104, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f88, -3 * SIZE + STFD [AOFFSET2] = f120, - 3 * SIZE + ;; +#endif + + STFD [C1 ] = f64, SIZE + STFD [C2 ] = f72, SIZE + STFD [C3 ] = f80, SIZE + STFD [C4 ] = f88, SIZE + STFD [C5 ] = f96, SIZE + STFD [C6 ] = f104, SIZE + STFD [C7 ] = f112, SIZE + STFD [C8 ] = f120, SIZE + ;; + + mov f64 = f0 + mov f72 = f0 + mov f80 = f0 + mov f88 = f0 + mov f96 = f0 + mov f104 = f0 + mov f112 = f0 + mov f120 = f0 + ;; + sub L = K, KK + ;; + shladd L = L, BASE_SHIFT, r0 + ;; + add AOFFSET = L, AOFFSET + ;; + shladd BOFFSET = L, 3, BOFFSET + ;; +#ifdef LT + adds KK = 1, KK +#else + nop __LINE__ +#endif + ;; + mov L = KK + ;; + .align 8 + +.L049: + mov B = BOFFSET + +#ifdef RN + adds KK = 8, KK +#endif + ;; + + { .mmi + mov AOFFSET = A + } + ;; + { .mmb + nop __LINE__ + cmp.lt p6, p0 = 0, J + (p6) br.cond.dptk .L010 + } + ;; + .align 8 + +.L050: + { .mib + setf.d f64 = r0 + tbit.z p6, p0 = N, 2 + (p6) br.cond.dpnt .L090 + } + ;; + { .mfi + setf.d f72 = r0 + mov f80 = f0 + shr I = M, 3 + } + { .mfi + mov C1 = C // coffset1 = c + 0 * ldc + mov f88 = f0 +#ifdef LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + cmp.eq p6, p7 = 0, I + mov AORIG = A + mov f65 = f0 + } + { .mmf + add C2 = LDC, C // coffset2 = c + 1 * ldc + shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc + mov f73 = f0 + } + ;; + { .mfi + shladd C = LDC, 2, C // coffset += 8 * ldc + mov f81 = f0 + mov L = KK + }{ .mfb + shladd C4 = LDC, 1, C2 + mov f89 = f0 + (p6) br.cond.dpnt .L060 + } + ;; + .align 16 + +.L052: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + } + ;; + { .mmi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f74 = f0 + nop __LINE__ + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + setf.d f82 = r0 + mov f90 = f0 + } + ;; + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + setf.d f67 = r0 + mov f75 = f0 + } + { .mfi + setf.d f83 = r0 + mov f91 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmf + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + } + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f68 = r0 + mov f76 = f0 + } + { .mfi + setf.d f84 = r0 + mov f92 = f0 + adds L = 1, L + } + ;; + { .mmf + CPREFETCH [PREC], LDC + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f69 = r0 + mov f77 = f0 + } + { .mfi + setf.d f85 = r0 + mov f93 = f0 + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mmf + CPREFETCH [PREC] + } + ;; + { .mfi + setf.d f70 = r0 + mov f78 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + setf.d f86 = r0 + mov f94 = f0 + shr L = L, 1 + } + ;; + { .mfi + setf.d f71 = r0 + adds L = -1, L + } + ;; + { .mfi + setf.d f87 = r0 + mov f79 = f0 + mov ar.lc = L + } + { .mfb + cmp.eq p6, p0 = -1, L + mov f95 = f0 + (p6) br.cond.dpnt .L058 + } + ;; + .align 8 + +.L053: + { .mfb + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + adds C9 = 4 * SIZE, C1 + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C10 = 4 * SIZE, C2 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + adds C12 = 4 * SIZE, C4 + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f92 = f36, f51, f92 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f93 = f37, f51, f93 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f94 = f38, f51, f94 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f95 = f39, f51, f95 // A8 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f92 = f44, f59, f92 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f93 = f45, f59, f93 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f94 = f46, f59, f94 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f95 = f47, f59, f95 // A8 * B4 + br.cloop.sptk.few .L053 + } + ;; + .align 8 + +.L058: + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#ifdef LT + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET], 2 * SIZE + ;; + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + ;; + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + ;; + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + ;; + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [BOFFSET], 2 * SIZE + ;; + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [BOFFSET], 2 * SIZE + ;; + LDFPD f62, f63 = [BOFFSET] + adds BOFFSET = -30 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + + FSUB f65 = f36, f65 + FSUB f73 = f37, f73 + FSUB f81 = f38, f81 + FSUB f89 = f39, f89 + + FSUB f66 = f40, f66 + FSUB f74 = f41, f74 + FSUB f82 = f42, f82 + FSUB f90 = f43, f90 + + FSUB f67 = f44, f67 + FSUB f75 = f45, f75 + FSUB f83 = f46, f83 + FSUB f91 = f47, f91 + + FSUB f68 = f48, f68 + FSUB f76 = f49, f76 + FSUB f84 = f50, f84 + FSUB f92 = f51, f92 + + FSUB f69 = f52, f69 + FSUB f77 = f53, f77 + FSUB f85 = f54, f85 + FSUB f93 = f55, f93 + + FSUB f70 = f56, f70 + FSUB f78 = f57, f78 + FSUB f86 = f58, f86 + FSUB f94 = f59, f94 + + FSUB f71 = f60, f71 + FSUB f79 = f61, f79 + FSUB f87 = f62, f87 + FSUB f95 = f63, f95 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET], 2 * SIZE + ;; + LDFPD f48, f49 = [AOFFSET], 2 * SIZE + ;; + LDFPD f50, f51 = [AOFFSET], 2 * SIZE + ;; + LDFPD f52, f53 = [AOFFSET], 2 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET], 2 * SIZE + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET], 2 * SIZE + ;; + LDFPD f62, f63 = [AOFFSET] + adds AOFFSET = -30 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + FSUB f68 = f36, f68 + FSUB f69 = f37, f69 + FSUB f70 = f38, f70 + FSUB f71 = f39, f71 + ;; + FSUB f72 = f40, f72 + FSUB f73 = f41, f73 + FSUB f74 = f42, f74 + FSUB f75 = f43, f75 + FSUB f76 = f44, f76 + FSUB f77 = f45, f77 + FSUB f78 = f46, f78 + FSUB f79 = f47, f79 + ;; + FSUB f80 = f48, f80 + FSUB f81 = f49, f81 + FSUB f82 = f50, f82 + FSUB f83 = f51, f83 + FSUB f84 = f52, f84 + FSUB f85 = f53, f85 + FSUB f86 = f54, f86 + FSUB f87 = f55, f87 + + FSUB f88 = f56, f88 + FSUB f89 = f57, f89 + FSUB f90 = f58, f90 + FSUB f91 = f59, f91 + FSUB f92 = f60, f92 + FSUB f93 = f61, f93 + FSUB f94 = f62, f94 + FSUB f95 = f63, f95 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [AOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [AOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f47, f48 = [AOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [AOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET] + adds AOFFSET = 7 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f19, f20 = [AOFFSET] + adds AOFFSET = 9 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + adds AOFFSET = -63 * SIZE, AOFFSET + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + FMPY f80 = f80, f32 + FMPY f88 = f88, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + FNMA f81 = f80, f33, f81 + FNMA f89 = f88, f33, f89 + ;; + FNMA f66 = f64, f34, f66 + FNMA f74 = f72, f34, f74 + FNMA f82 = f80, f34, f82 + FNMA f90 = f88, f34, f90 + ;; + FNMA f67 = f64, f35, f67 + FNMA f75 = f72, f35, f75 + FNMA f83 = f80, f35, f83 + FNMA f91 = f88, f35, f91 + ;; + FNMA f68 = f64, f36, f68 + FNMA f76 = f72, f36, f76 + FNMA f84 = f80, f36, f84 + FNMA f92 = f88, f36, f92 + ;; + FNMA f69 = f64, f37, f69 + FNMA f77 = f72, f37, f77 + FNMA f85 = f80, f37, f85 + FNMA f93 = f88, f37, f93 + ;; + FNMA f70 = f64, f38, f70 + FNMA f78 = f72, f38, f78 + FNMA f86 = f80, f38, f86 + FNMA f94 = f88, f38, f94 + ;; + FNMA f71 = f64, f39, f71 + FNMA f79 = f72, f39, f79 + FNMA f87 = f80, f39, f87 + FNMA f95 = f88, f39, f95 + ;; + FMPY f65 = f65, f40 + FMPY f73 = f73, f40 + FMPY f81 = f81, f40 + FMPY f89 = f89, f40 + ;; + FNMA f66 = f65, f41, f66 + FNMA f74 = f73, f41, f74 + FNMA f82 = f81, f41, f82 + FNMA f90 = f89, f41, f90 + ;; + FNMA f67 = f65, f42, f67 + FNMA f75 = f73, f42, f75 + FNMA f83 = f81, f42, f83 + FNMA f91 = f89, f42, f91 + ;; + FNMA f68 = f65, f43, f68 + FNMA f76 = f73, f43, f76 + FNMA f84 = f81, f43, f84 + FNMA f92 = f89, f43, f92 + ;; + FNMA f69 = f65, f44, f69 + FNMA f77 = f73, f44, f77 + FNMA f85 = f81, f44, f85 + FNMA f93 = f89, f44, f93 + ;; + FNMA f70 = f65, f45, f70 + FNMA f78 = f73, f45, f78 + FNMA f86 = f81, f45, f86 + FNMA f94 = f89, f45, f94 + ;; + FNMA f71 = f65, f46, f71 + FNMA f79 = f73, f46, f79 + FNMA f87 = f81, f46, f87 + FNMA f95 = f89, f46, f95 + ;; + FMPY f66 = f66, f47 + FMPY f74 = f74, f47 + FMPY f82 = f82, f47 + FMPY f90 = f90, f47 + ;; + FNMA f67 = f66, f48, f67 + FNMA f75 = f74, f48, f75 + FNMA f83 = f82, f48, f83 + FNMA f91 = f90, f48, f91 + ;; + FNMA f68 = f66, f49, f68 + FNMA f76 = f74, f49, f76 + FNMA f84 = f82, f49, f84 + FNMA f92 = f90, f49, f92 + ;; + FNMA f69 = f66, f50, f69 + FNMA f77 = f74, f50, f77 + FNMA f85 = f82, f50, f85 + FNMA f93 = f90, f50, f93 + ;; + FNMA f70 = f66, f51, f70 + FNMA f78 = f74, f51, f78 + FNMA f86 = f82, f51, f86 + FNMA f94 = f90, f51, f94 + ;; + FNMA f71 = f66, f52, f71 + FNMA f79 = f74, f52, f79 + FNMA f87 = f82, f52, f87 + FNMA f95 = f90, f52, f95 + ;; + FMPY f67 = f67, f53 + FMPY f75 = f75, f53 + FMPY f83 = f83, f53 + FMPY f91 = f91, f53 + ;; + FNMA f68 = f67, f54, f68 + FNMA f76 = f75, f54, f76 + FNMA f84 = f83, f54, f84 + FNMA f92 = f91, f54, f92 + ;; + FNMA f69 = f67, f55, f69 + FNMA f77 = f75, f55, f77 + FNMA f85 = f83, f55, f85 + FNMA f93 = f91, f55, f93 + ;; + FNMA f70 = f67, f56, f70 + FNMA f78 = f75, f56, f78 + FNMA f86 = f83, f56, f86 + FNMA f94 = f91, f56, f94 + ;; + FNMA f71 = f67, f57, f71 + FNMA f79 = f75, f57, f79 + FNMA f87 = f83, f57, f87 + FNMA f95 = f91, f57, f95 + ;; + FMPY f68 = f68, f58 + FMPY f76 = f76, f58 + FMPY f84 = f84, f58 + FMPY f92 = f92, f58 + ;; + FNMA f69 = f68, f59, f69 + FNMA f77 = f76, f59, f77 + FNMA f85 = f84, f59, f85 + FNMA f93 = f92, f59, f93 + ;; + FNMA f70 = f68, f60, f70 + FNMA f78 = f76, f60, f78 + FNMA f86 = f84, f60, f86 + FNMA f94 = f92, f60, f94 + ;; + FNMA f71 = f68, f61, f71 + FNMA f79 = f76, f61, f79 + FNMA f87 = f84, f61, f87 + FNMA f95 = f92, f61, f95 + ;; + FMPY f69 = f69, f16 + FMPY f77 = f77, f16 + FMPY f85 = f85, f16 + FMPY f93 = f93, f16 + ;; + FNMA f70 = f69, f17, f70 + FNMA f78 = f77, f17, f78 + FNMA f86 = f85, f17, f86 + FNMA f94 = f93, f17, f94 + ;; + FNMA f71 = f69, f18, f71 + FNMA f79 = f77, f18, f79 + FNMA f87 = f85, f18, f87 + FNMA f95 = f93, f18, f95 + ;; + FMPY f70 = f70, f19 + FMPY f78 = f78, f19 + FMPY f86 = f86, f19 + FMPY f94 = f94, f19 + ;; + FNMA f71 = f70, f20, f71 + FNMA f79 = f78, f20, f79 + FNMA f87 = f86, f20, f87 + FNMA f95 = f94, f20, f95 + ;; + FMPY f71 = f71, f21 + FMPY f79 = f79, f21 + FMPY f87 = f87, f21 + FMPY f95 = f95, f21 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, 5 * SIZE + STFD [BOFFSET2] = f89, 5 * SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f74, SIZE + STFD [BOFFSET2] = f75, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f83, SIZE + ;; + STFD [BOFFSET] = f90, 5 * SIZE + STFD [BOFFSET2] = f91, 5 * SIZE + ;; + STFD [BOFFSET] = f68, SIZE + STFD [BOFFSET2] = f69, SIZE + ;; + STFD [BOFFSET] = f76, SIZE + STFD [BOFFSET2] = f77, SIZE + ;; + STFD [BOFFSET] = f84, SIZE + STFD [BOFFSET2] = f85, SIZE + ;; + STFD [BOFFSET] = f92, 5 * SIZE + STFD [BOFFSET2] = f93, 5 * SIZE + ;; + STFD [BOFFSET] = f70, SIZE + STFD [BOFFSET2] = f71, SIZE + ;; + STFD [BOFFSET] = f78, SIZE + STFD [BOFFSET2] = f79, SIZE + ;; + STFD [BOFFSET] = f86, SIZE + STFD [BOFFSET2] = f87, SIZE + ;; + STFD [BOFFSET] = f94 + STFD [BOFFSET2] = f95 + adds C9 = 4 * SIZE, C1 + adds BOFFSET = - 27 * SIZE, BOFFSET + adds BOFFSET2 = - 27 * SIZE, BOFFSET2 + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f39, f40 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f68 = f68, f32 + FMPY f65 = f65, f32 + FMPY f69 = f69, f32 + FMPY f66 = f66, f32 + FMPY f70 = f70, f32 + FMPY f67 = f67, f32 + FMPY f71 = f71, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f76 = f68, f33, f76 + FNMA f73 = f65, f33, f73 + FNMA f77 = f69, f33, f77 + FNMA f74 = f66, f33, f74 + FNMA f78 = f70, f33, f78 + FNMA f75 = f67, f33, f75 + FNMA f79 = f71, f33, f79 + ;; + FNMA f80 = f64, f34, f80 + FNMA f84 = f68, f34, f84 + FNMA f81 = f65, f34, f81 + FNMA f85 = f69, f34, f85 + FNMA f82 = f66, f34, f82 + FNMA f86 = f70, f34, f86 + FNMA f83 = f67, f34, f83 + FNMA f87 = f71, f34, f87 + ;; + FNMA f88 = f64, f35, f88 + FNMA f92 = f68, f35, f92 + FNMA f89 = f65, f35, f89 + FNMA f93 = f69, f35, f93 + FNMA f90 = f66, f35, f90 + FNMA f94 = f70, f35, f94 + FNMA f91 = f67, f35, f91 + FNMA f95 = f71, f35, f95 + ;; + FMPY f72 = f72, f36 + FMPY f76 = f76, f36 + FMPY f73 = f73, f36 + FMPY f77 = f77, f36 + FMPY f74 = f74, f36 + FMPY f78 = f78, f36 + FMPY f75 = f75, f36 + FMPY f79 = f79, f36 + ;; + FNMA f80 = f72, f37, f80 + FNMA f84 = f76, f37, f84 + FNMA f81 = f73, f37, f81 + FNMA f85 = f77, f37, f85 + FNMA f82 = f74, f37, f82 + FNMA f86 = f78, f37, f86 + FNMA f83 = f75, f37, f83 + FNMA f87 = f79, f37, f87 + ;; + FNMA f88 = f72, f38, f88 + FNMA f92 = f76, f38, f92 + FNMA f89 = f73, f38, f89 + FNMA f93 = f77, f38, f93 + FNMA f90 = f74, f38, f90 + FNMA f94 = f78, f38, f94 + FNMA f91 = f75, f38, f91 + FNMA f95 = f79, f38, f95 + ;; + FMPY f80 = f80, f39 + FMPY f84 = f84, f39 + FMPY f81 = f81, f39 + FMPY f85 = f85, f39 + FMPY f82 = f82, f39 + FMPY f86 = f86, f39 + FMPY f83 = f83, f39 + FMPY f87 = f87, f39 + ;; + FNMA f88 = f80, f40, f88 + FNMA f92 = f84, f40, f92 + FNMA f89 = f81, f40, f89 + FNMA f93 = f85, f40, f93 + FNMA f90 = f82, f40, f90 + FNMA f94 = f86, f40, f94 + FNMA f91 = f83, f40, f91 + FNMA f95 = f87, f40, f95 + ;; + FMPY f88 = f88, f41 + FMPY f92 = f92, f41 + FMPY f89 = f89, f41 + FMPY f93 = f93, f41 + FMPY f90 = f90, f41 + FMPY f94 = f94, f41 + FMPY f91 = f91, f41 + FMPY f95 = f95, f41 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, 5 * SIZE + STFD [AOFFSET2] = f71, 5 * SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f76, SIZE + ;; + STFD [AOFFSET] = f73, SIZE + STFD [AOFFSET2] = f77, SIZE + ;; + STFD [AOFFSET] = f74, SIZE + STFD [AOFFSET2] = f78, SIZE + ;; + STFD [AOFFSET] = f75, 5 * SIZE + STFD [AOFFSET2] = f79, 5 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f84, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f85, SIZE + ;; + STFD [AOFFSET] = f82, SIZE + STFD [AOFFSET2] = f86, SIZE + ;; + STFD [AOFFSET] = f83, 5 * SIZE + STFD [AOFFSET2] = f87, 5 * SIZE + ;; + STFD [AOFFSET] = f88, SIZE + STFD [AOFFSET2] = f92, SIZE + ;; + STFD [AOFFSET] = f89, SIZE + STFD [AOFFSET2] = f93, SIZE + ;; + STFD [AOFFSET] = f90, SIZE + STFD [AOFFSET2] = f94, SIZE + ;; + STFD [AOFFSET] = f91, -27 * SIZE + STFD [AOFFSET2] = f95, -27 * SIZE + ;; +#endif + + adds C9 = 4 * SIZE, C1 + ;; + + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + adds C10 = 4 * SIZE, C2 + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + } + ;; + { .mmi + STFD [C1 ] = f67, 5 * SIZE + STFD [C9 ] = f71 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + STFD [C10] = f76, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + STFD [C10] = f77, SIZE + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + STFD [C10] = f78, SIZE + adds C12 = 4 * SIZE, C4 + } + ;; + { .mmi + STFD [C2 ] = f75, 5 * SIZE + STFD [C10] = f79 + } + ;; + { .mmf + STFD [C3 ] = f80, SIZE + STFD [C11] = f84, SIZE + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + STFD [C11] = f85, SIZE + } + ;; + { .mmi + STFD [C3 ] = f82, SIZE + STFD [C11] = f86, SIZE + } + ;; + { .mmi + STFD [C3 ] = f83, 5 * SIZE + STFD [C11] = f87 + } + ;; + { .mmf + STFD [C4 ] = f88, SIZE + STFD [C12] = f92, SIZE + } + ;; + { .mmi + STFD [C4 ] = f89, SIZE + STFD [C12] = f93, SIZE + } + ;; + { .mmi + STFD [C4 ] = f90, SIZE + STFD [C12] = f94, SIZE + + } + ;; + { .mmi + STFD [C4 ] = f91, 5 * SIZE + STFD [C12] = f95 + cmp.ne p6, p0 = 1, I + } + ;; + adds I = -1, I + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi + shladd L = L, BASE_SHIFT, r0 + } + ;; + ;; + { .mmi + shladd AOFFSET = L, 3, AOFFSET + } + ;; + { .mmi + shladd BOFFSET = L, 2, BOFFSET + } + ;; + { .mmi +#ifdef LT + adds KK = 8, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi + mov L = KK + } + ;; + mov f64 = f0 + mov f72 = f0 + mov f80 = f0 + mov f88 = f0 + mov f65 = f0 + mov f73 = f0 + mov f81 = f0 + mov f89 = f0 + + { .mmb + (p6) br.cond.dptk .L052 + } + ;; + + .align 8 + +.L060: + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L070 + ;; + + { .mib + mov L = KK + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + } + ;; + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f65 = f0 + } + ;; + { .mfi + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + cmp.eq p6, p0 = -1, L + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + } + { .mfi + mov ar.lc = L + } + ;; + + mov f66 = f0 + mov f67 = f0 + mov f74 = f0 + mov f75 = f0 + mov f82 = f0 + mov f83 = f0 + mov f90 = f0 + mov f91 = f0 + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + { .mfb + (p6) br.cond.dpnt .L068 + } + ;; + .align 8 + +.L062: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + (p5) adds C10 = 2 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + (p5) adds C11 = 2 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + (p5) adds C12 = 2 * SIZE, C4 + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + br.cloop.sptk.few .L062 + } + ;; + .align 8 + +.L068: + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#ifdef LT + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET] + adds BOFFSET = -14 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + ;; + FSUB f65 = f36, f65 + FSUB f73 = f37, f73 + FSUB f81 = f38, f81 + FSUB f89 = f39, f89 + ;; + FSUB f66 = f40, f66 + FSUB f74 = f41, f74 + FSUB f82 = f42, f82 + FSUB f90 = f43, f90 + ;; + FSUB f67 = f44, f67 + FSUB f75 = f45, f75 + FSUB f83 = f46, f83 + FSUB f91 = f47, f91 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET] + adds AOFFSET = -14 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + + FSUB f72 = f36, f72 + FSUB f73 = f37, f73 + FSUB f74 = f38, f74 + FSUB f75 = f39, f75 + + FSUB f80 = f40, f80 + FSUB f81 = f41, f81 + FSUB f82 = f42, f82 + FSUB f83 = f43, f83 + + FSUB f88 = f44, f88 + FSUB f89 = f45, f89 + FSUB f90 = f46, f90 + FSUB f91 = f47, f91 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f39, f40 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + FMPY f80 = f80, f32 + FMPY f88 = f88, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + FNMA f81 = f80, f33, f81 + FNMA f89 = f88, f33, f89 + ;; + FNMA f66 = f64, f34, f66 + FNMA f74 = f72, f34, f74 + FNMA f82 = f80, f34, f82 + FNMA f90 = f88, f34, f90 + ;; + FNMA f67 = f64, f35, f67 + FNMA f75 = f72, f35, f75 + FNMA f83 = f80, f35, f83 + FNMA f91 = f88, f35, f91 + ;; + FMPY f65 = f65, f36 + FMPY f73 = f73, f36 + FMPY f81 = f81, f36 + FMPY f89 = f89, f36 + ;; + FNMA f66 = f65, f37, f66 + FNMA f74 = f73, f37, f74 + FNMA f82 = f81, f37, f82 + FNMA f90 = f89, f37, f90 + ;; + FNMA f67 = f65, f38, f67 + FNMA f75 = f73, f38, f75 + FNMA f83 = f81, f38, f83 + FNMA f91 = f89, f38, f91 + ;; + FMPY f66 = f66, f39 + FMPY f74 = f74, f39 + FMPY f82 = f82, f39 + FMPY f90 = f90, f39 + ;; + FNMA f67 = f66, f40, f67 + FNMA f75 = f74, f40, f75 + FNMA f83 = f82, f40, f83 + FNMA f91 = f90, f40, f91 + ;; + FMPY f67 = f67, f41 + FMPY f75 = f75, f41 + FMPY f83 = f83, f41 + FMPY f91 = f91, f41 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, 5 * SIZE + STFD [BOFFSET2] = f89, 5 * SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f74, SIZE + STFD [BOFFSET2] = f75, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f83, SIZE + ;; + STFD [BOFFSET] = f90, -11 * SIZE + STFD [BOFFSET2] = f91, -11 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f39, f40 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + FMPY f66 = f66, f32 + FMPY f67 = f67, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + FNMA f74 = f66, f33, f74 + FNMA f75 = f67, f33, f75 + ;; + FNMA f80 = f64, f34, f80 + FNMA f81 = f65, f34, f81 + FNMA f82 = f66, f34, f82 + FNMA f83 = f67, f34, f83 + ;; + FNMA f88 = f64, f35, f88 + FNMA f89 = f65, f35, f89 + FNMA f90 = f66, f35, f90 + FNMA f91 = f67, f35, f91 + ;; + FMPY f72 = f72, f36 + FMPY f73 = f73, f36 + FMPY f74 = f74, f36 + FMPY f75 = f75, f36 + ;; + FNMA f80 = f72, f37, f80 + FNMA f81 = f73, f37, f81 + FNMA f82 = f74, f37, f82 + FNMA f83 = f75, f37, f83 + ;; + FNMA f88 = f72, f38, f88 + FNMA f89 = f73, f38, f89 + FNMA f90 = f74, f38, f90 + FNMA f91 = f75, f38, f91 + ;; + FMPY f80 = f80, f39 + FMPY f81 = f81, f39 + FMPY f82 = f82, f39 + FMPY f83 = f83, f39 + ;; + FNMA f88 = f80, f40, f88 + FNMA f89 = f81, f40, f89 + FNMA f90 = f82, f40, f90 + FNMA f91 = f83, f40, f91 + ;; + FMPY f88 = f88, f41 + FMPY f89 = f89, f41 + FMPY f90 = f90, f41 + FMPY f91 = f91, f41 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f72, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f74, SIZE + ;; + STFD [AOFFSET] = f67, 5 * SIZE + STFD [AOFFSET2] = f75, 5 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f89, SIZE + ;; + STFD [AOFFSET] = f82, SIZE + STFD [AOFFSET2] = f90, SIZE + ;; + STFD [AOFFSET] = f83, -11 * SIZE + STFD [AOFFSET2] = f91, -11 * SIZE + ;; +#endif + + { .mmf + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + } + ;; + { .mmi + STFD [C1 ] = f67, SIZE + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + } + ;; + { .mmi + STFD [C2 ] = f75, SIZE + } + ;; + { .mmf + STFD [C3 ] = f80, SIZE + mov f80 = f0 + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + } + ;; + { .mmi + STFD [C3 ] = f82, SIZE + } + ;; + { .mmi + STFD [C3 ] = f83, SIZE + } + ;; + { .mmf + STFD [C4 ] = f88, SIZE + mov f88 = f0 + } + ;; + { .mmi + STFD [C4 ] = f89, SIZE + } + ;; + { .mmi + STFD [C4 ] = f90, SIZE + } + ;; + { .mmi + STFD [C4 ] = f91, SIZE + nop __LINE__ + } + ;; + mov f65 = f0 + ;; + mov f73 = f0 + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmf + mov f81 = f0 + } + ;; + { .mmi + shladd L = L, BASE_SHIFT, r0 + } + ;; + { .mmi + shladd AOFFSET = L, 2, AOFFSET + } + ;; + { .mmi + shladd BOFFSET = L, 2, BOFFSET + } + ;; + { .mmf + mov f89 = f0 + } + ;; + { .mmi +#ifdef LT + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + ;; + .align 8 + +.L070: + tbit.z p6,p7 = M, 1 + (p6) br.cond.dptk .L080 + ;; + + { .mib + mov L = KK + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + } + ;; + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + setf.d f73 = r0 + mov f65 = f0 + } + ;; + { .mfi + mov f81 = f0 + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + mov f89 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mmf + adds L = -1, L + } + ;; + { .mmf + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L078 + } + ;; + .align 8 + +.L072: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + br.cloop.sptk.few .L072 + } + ;; +.L078: + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#ifdef LT + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + FSUB f65 = f36, f65 + FSUB f73 = f37, f73 + FSUB f81 = f38, f81 + FSUB f89 = f39, f89 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + + FSUB f72 = f34, f72 + FSUB f73 = f35, f73 + + FSUB f80 = f36, f80 + FSUB f81 = f37, f81 + + FSUB f88 = f38, f88 + FSUB f89 = f39, f89 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET], - 3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + FMPY f80 = f80, f32 + FMPY f88 = f88, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + FNMA f81 = f80, f33, f81 + FNMA f89 = f88, f33, f89 + ;; + FMPY f65 = f65, f34 + FMPY f73 = f73, f34 + FMPY f81 = f81, f34 + FMPY f89 = f89, f34 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, -3 * SIZE + STFD [BOFFSET2] = f89, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f39, f40 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + ;; + FNMA f80 = f64, f34, f80 + FNMA f81 = f65, f34, f81 + ;; + FNMA f88 = f64, f35, f88 + FNMA f89 = f65, f35, f89 + ;; + FMPY f72 = f72, f36 + FMPY f73 = f73, f36 + ;; + FNMA f80 = f72, f37, f80 + FNMA f81 = f73, f37, f81 + ;; + FNMA f88 = f72, f38, f88 + FNMA f89 = f73, f38, f89 + ;; + FMPY f80 = f80, f39 + FMPY f81 = f81, f39 + ;; + FNMA f88 = f80, f40, f88 + FNMA f89 = f81, f40, f89 + ;; + FMPY f88 = f88, f41 + FMPY f89 = f89, f41 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f80, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f73, -3 * SIZE + STFD [AOFFSET2] = f89, -3 * SIZE + ;; +#endif + + STFD [C1 ] = f64, SIZE + mov f64 = f0 + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C2 ] = f72, SIZE + mov f72 = f0 + ;; + STFD [C2 ] = f73, SIZE + ;; + STFD [C3 ] = f80, SIZE + mov f80 = f0 + ;; + STFD [C3 ] = f81, SIZE + ;; + STFD [C4 ] = f88, SIZE + mov f88 = f0 + ;; + STFD [C4 ] = f89, SIZE + ;; + mov f96 = f0 + ;; + mov f104 = f0 + ;; + sub L = K, KK + ;; + mov f112 = f0 + ;; + { .mmi + shladd L = L, BASE_SHIFT, r0 + } + ;; + { .mmi + shladd AOFFSET = L, 1, AOFFSET + } + ;; + { .mmi + shladd BOFFSET = L, 2, BOFFSET + } + ;; + { .mmf + mov f120 = f0 + } + ;; + { .mmi +#ifdef LT + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + ;; + .align 8 + +.L080: + tbit.z p6,p7 = M, 0 + (p6) br.cond.dptk .L089 + + { .mib + mov L = KK + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + } + ;; + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; + { .mmi + adds L = 1, L + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mii + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + adds L = -1, L + } + ;; + { .mmi + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFD f32 = [AOFFSET], 1 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L088 + } + ;; + +.L082: + { .mfb + cmp.ne p4, p5 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mmf + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + } + ;; + { .mib + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + { .mmb + nop __LINE__ + adds L = -1, L + br.cloop.sptk.few .L082 + } + ;; + +.L088: + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#ifdef LT + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + ;; +#endif + +#ifdef LT + LDFD f32 = [AOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + FMPY f80 = f80, f32 + FMPY f88 = f88, f32 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + ;; + STFD [BOFFSET] = f88, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f39, f40 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET], -15 * SIZE + + FMPY f64 = f64, f32 + ;; + FNMA f72 = f64, f33, f72 + ;; + FNMA f80 = f64, f34, f80 + ;; + FNMA f88 = f64, f35, f88 + ;; + FMPY f72 = f72, f36 + ;; + FNMA f80 = f72, f37, f80 + ;; + FNMA f88 = f72, f38, f88 + ;; + FMPY f80 = f80, f39 + ;; + FNMA f88 = f80, f40, f88 + ;; + FMPY f88 = f88, f41 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + ;; + STFD [AOFFSET] = f88, -3 * SIZE + ;; +#endif + + STFD [C1 ] = f64, SIZE + STFD [C2 ] = f72, SIZE + STFD [C3 ] = f80, SIZE + STFD [C4 ] = f88, SIZE + ;; + mov f64 = f0 + mov f72 = f0 + mov f80 = f0 + mov f88 = f0 + ;; + sub L = K, KK + ;; + shladd L = L, BASE_SHIFT, r0 + ;; + add AOFFSET = L, AOFFSET + ;; + shladd BOFFSET = L, 2, BOFFSET + ;; +#ifdef LT + adds KK = 1, KK +#else + nop __LINE__ +#endif + ;; + mov L = KK + ;; + .align 8 + +.L089: + mov B = BOFFSET + +#ifdef RN + adds KK = 4, KK +#endif + ;; + mov AOFFSET = A + ;; + .align 16 + +.L090: + tbit.z p6, p0 = N, 1 + (p6) br.cond.dpnt .L130 + ;; + mov f64 = f0 + mov f65 = f0 + mov f66 = f0 + mov f67 = f0 + + mov f72 = f0 + mov f73 = f0 + mov f74 = f0 + mov f75 = f0 + ;; + { .mfi + shr I = M, 3 + } + { .mfi + mov C1 = C // coffset1 = c + 0 * ldc +#ifdef LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + cmp.eq p6, p7 = 0, I + mov AORIG = A + } + { .mmf + add C2 = LDC, C // coffset2 = c + 1 * ldc + } + ;; + { .mfi + shladd C = LDC, 1, C // coffset += 8 * ldc + mov f81 = f0 + mov L = KK + }{ .mfb + (p6) br.cond.dpnt .L100 + } + ;; + .align 16 + +.L092: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + } + ;; + { .mmi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + ;; + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + } + { .mfi + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmf + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + } + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + } + { .mfi + adds L = 1, L + } + ;; + { .mmf + CPREFETCH [PREC] + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mfi + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + mov ar.lc = L + } + ;; + mov f68 = f0 + mov f69 = f0 + mov f70 = f0 + mov f71 = f0 + mov f76 = f0 + mov f77 = f0 + mov f78 = f0 + mov f79 = f0 + ;; + { .mfb + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L098 + } + ;; + .align 8 + +.L093: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C9 = 4 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C10 = 4 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + adds C11 = 4 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + adds C12 = 4 * SIZE, C4 + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + br.cloop.sptk.few .L093 + } + ;; + .align 8 + +.L098: + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#ifdef LT + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET] + adds BOFFSET = -14 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f65 = f34, f65 + FSUB f73 = f35, f73 + + FSUB f66 = f36, f66 + FSUB f74 = f37, f74 + FSUB f67 = f38, f67 + FSUB f75 = f39, f75 + + FSUB f68 = f40, f68 + FSUB f76 = f41, f76 + FSUB f69 = f42, f69 + FSUB f77 = f43, f77 + + FSUB f70 = f44, f70 + FSUB f78 = f45, f78 + FSUB f71 = f46, f71 + FSUB f79 = f47, f79 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET] + adds AOFFSET = -14 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + FSUB f68 = f36, f68 + FSUB f69 = f37, f69 + FSUB f70 = f38, f70 + FSUB f71 = f39, f71 + ;; + FSUB f72 = f40, f72 + FSUB f73 = f41, f73 + FSUB f74 = f42, f74 + FSUB f75 = f43, f75 + FSUB f76 = f44, f76 + FSUB f77 = f45, f77 + FSUB f78 = f46, f78 + FSUB f79 = f47, f79 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [AOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [AOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f47, f48 = [AOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [AOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET] + adds AOFFSET = 7 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f19, f20 = [AOFFSET] + adds AOFFSET = 9 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + adds AOFFSET = -63 * SIZE, AOFFSET + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + ;; + FNMA f66 = f64, f34, f66 + FNMA f74 = f72, f34, f74 + ;; + FNMA f67 = f64, f35, f67 + FNMA f75 = f72, f35, f75 + ;; + FNMA f68 = f64, f36, f68 + FNMA f76 = f72, f36, f76 + ;; + FNMA f69 = f64, f37, f69 + FNMA f77 = f72, f37, f77 + ;; + FNMA f70 = f64, f38, f70 + FNMA f78 = f72, f38, f78 + ;; + FNMA f71 = f64, f39, f71 + FNMA f79 = f72, f39, f79 + ;; + FMPY f65 = f65, f40 + FMPY f73 = f73, f40 + ;; + FNMA f66 = f65, f41, f66 + FNMA f74 = f73, f41, f74 + ;; + FNMA f67 = f65, f42, f67 + FNMA f75 = f73, f42, f75 + ;; + FNMA f68 = f65, f43, f68 + FNMA f76 = f73, f43, f76 + ;; + FNMA f69 = f65, f44, f69 + FNMA f77 = f73, f44, f77 + ;; + FNMA f70 = f65, f45, f70 + FNMA f78 = f73, f45, f78 + ;; + FNMA f71 = f65, f46, f71 + FNMA f79 = f73, f46, f79 + ;; + FMPY f66 = f66, f47 + FMPY f74 = f74, f47 + ;; + FNMA f67 = f66, f48, f67 + FNMA f75 = f74, f48, f75 + ;; + FNMA f68 = f66, f49, f68 + FNMA f76 = f74, f49, f76 + ;; + FNMA f69 = f66, f50, f69 + FNMA f77 = f74, f50, f77 + ;; + FNMA f70 = f66, f51, f70 + FNMA f78 = f74, f51, f78 + ;; + FNMA f71 = f66, f52, f71 + FNMA f79 = f74, f52, f79 + ;; + FMPY f67 = f67, f53 + FMPY f75 = f75, f53 + ;; + FNMA f68 = f67, f54, f68 + FNMA f76 = f75, f54, f76 + ;; + FNMA f69 = f67, f55, f69 + FNMA f77 = f75, f55, f77 + ;; + FNMA f70 = f67, f56, f70 + FNMA f78 = f75, f56, f78 + ;; + FNMA f71 = f67, f57, f71 + FNMA f79 = f75, f57, f79 + ;; + FMPY f68 = f68, f58 + FMPY f76 = f76, f58 + ;; + FNMA f69 = f68, f59, f69 + FNMA f77 = f76, f59, f77 + ;; + FNMA f70 = f68, f60, f70 + FNMA f78 = f76, f60, f78 + ;; + FNMA f71 = f68, f61, f71 + FNMA f79 = f76, f61, f79 + ;; + FMPY f69 = f69, f16 + FMPY f77 = f77, f16 + ;; + FNMA f70 = f69, f17, f70 + FNMA f78 = f77, f17, f78 + ;; + FNMA f71 = f69, f18, f71 + FNMA f79 = f77, f18, f79 + ;; + FMPY f70 = f70, f19 + FMPY f78 = f78, f19 + ;; + FNMA f71 = f70, f20, f71 + FNMA f79 = f78, f20, f79 + ;; + FMPY f71 = f71, f21 + FMPY f79 = f79, f21 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f66, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f74, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f73, 5 * SIZE + STFD [BOFFSET2] = f75, 5 * SIZE + ;; + STFD [BOFFSET] = f68, SIZE + STFD [BOFFSET2] = f70, SIZE + ;; + STFD [BOFFSET] = f76, SIZE + STFD [BOFFSET2] = f78, SIZE + ;; + STFD [BOFFSET] = f69, SIZE + STFD [BOFFSET2] = f71, SIZE + ;; + STFD [BOFFSET] = f77, -11 * SIZE + STFD [BOFFSET2] = f79, -11 * SIZE + ;; + adds C9 = 4 * SIZE, C1 + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET], -3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f68 = f68, f32 + FMPY f65 = f65, f32 + FMPY f69 = f69, f32 + FMPY f66 = f66, f32 + FMPY f70 = f70, f32 + FMPY f67 = f67, f32 + FMPY f71 = f71, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f76 = f68, f33, f76 + FNMA f73 = f65, f33, f73 + FNMA f77 = f69, f33, f77 + FNMA f74 = f66, f33, f74 + FNMA f78 = f70, f33, f78 + FNMA f75 = f67, f33, f75 + FNMA f79 = f71, f33, f79 + ;; + FMPY f72 = f72, f34 + FMPY f76 = f76, f34 + FMPY f73 = f73, f34 + FMPY f77 = f77, f34 + FMPY f74 = f74, f34 + FMPY f78 = f78, f34 + FMPY f75 = f75, f34 + FMPY f79 = f79, f34 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, 5 * SIZE + STFD [AOFFSET2] = f71, 5 * SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f76, SIZE + ;; + STFD [AOFFSET] = f73, SIZE + STFD [AOFFSET2] = f77, SIZE + ;; + STFD [AOFFSET] = f74, SIZE + STFD [AOFFSET2] = f78, SIZE + ;; + STFD [AOFFSET] = f75, -11 * SIZE + STFD [AOFFSET2] = f79, -11 * SIZE + ;; +#endif + + adds C9 = 4 * SIZE, C1 + ;; + + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + adds C10 = 4 * SIZE, C2 + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + } + ;; + { .mmi + STFD [C1 ] = f67, 5 * SIZE + STFD [C9 ] = f71 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + STFD [C10] = f76, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + STFD [C10] = f77, SIZE + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + STFD [C10] = f78, SIZE + adds C12 = 4 * SIZE, C4 + } + ;; + { .mmi + STFD [C2 ] = f75, 5 * SIZE + STFD [C10] = f79 + } + ;; + { .mmf + cmp.ne p6, p0 = 1, I + } + ;; + adds I = -1, I + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi + shladd L = L, BASE_SHIFT, r0 + } + ;; + ;; + shladd AOFFSET = L, 3, AOFFSET + shladd BOFFSET = L, 1, BOFFSET + ;; + { .mmi +#ifdef LT + adds KK = 8, KK +#else + nop __LINE__ +#endif + } + ;; + mov L = KK + mov f64 = f0 + mov f65 = f0 + mov f66 = f0 + mov f67 = f0 + mov f72 = f0 + mov f73 = f0 + mov f74 = f0 + mov f75 = f0 + (p6) br.cond.dptk .L092 + ;; + .align 8 + +.L100: + { .mib + mov L = KK + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L110 + } + ;; + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + ;; + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f65 = f0 + } + ;; + adds L = 1, L + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + cmp.eq p6, p0 = -1, L + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + } + { .mfi + mov ar.lc = L + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + { .mfb + (p6) br.cond.dpnt .L108 + } + ;; + +.L102: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C10 = 2 * SIZE, C2 + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + br.cloop.sptk.few .L102 + } + ;; + .align 8 + +.L108: + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#ifdef LT + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + ;; + FSUB f65 = f34, f65 + FSUB f73 = f35, f73 + ;; + FSUB f66 = f36, f66 + FSUB f74 = f37, f74 + ;; + FSUB f67 = f38, f67 + FSUB f75 = f39, f75 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + + FSUB f72 = f36, f72 + FSUB f73 = f37, f73 + FSUB f74 = f38, f74 + FSUB f75 = f39, f75 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f39, f40 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + ;; + FNMA f66 = f64, f34, f66 + FNMA f74 = f72, f34, f74 + ;; + FNMA f67 = f64, f35, f67 + FNMA f75 = f72, f35, f75 + ;; + FMPY f65 = f65, f36 + FMPY f73 = f73, f36 + ;; + FNMA f66 = f65, f37, f66 + FNMA f74 = f73, f37, f74 + ;; + FNMA f67 = f65, f38, f67 + FNMA f75 = f73, f38, f75 + ;; + FMPY f66 = f66, f39 + FMPY f74 = f74, f39 + ;; + FNMA f67 = f66, f40, f67 + FNMA f75 = f74, f40, f75 + ;; + FMPY f67 = f67, f41 + FMPY f75 = f75, f41 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f66, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f74, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f73, -3 * SIZE + STFD [BOFFSET2] = f75, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET], -3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + FMPY f66 = f66, f32 + FMPY f67 = f67, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + FNMA f74 = f66, f33, f74 + FNMA f75 = f67, f33, f75 + ;; + FMPY f72 = f72, f34 + FMPY f73 = f73, f34 + FMPY f74 = f74, f34 + FMPY f75 = f75, f34 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f72, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f74, SIZE + ;; + STFD [AOFFSET] = f67, -3 * SIZE + STFD [AOFFSET2] = f75, -3 * SIZE + ;; +#endif + + { .mmf + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + } + ;; + { .mmi + STFD [C1 ] = f67, SIZE + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + } + ;; + { .mmi + STFD [C2 ] = f75, SIZE + } + ;; + mov f65 = f0 + mov f73 = f0 + mov f66 = f0 + mov f74 = f0 + mov f67 = f0 + mov f75 = f0 + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi + shladd L = L, BASE_SHIFT, r0 + } + ;; + shladd AOFFSET = L, 2, AOFFSET + ;; + shladd BOFFSET = L, 1, BOFFSET + ;; +#ifdef LT + adds KK = 4, KK + nop __LINE__ +#endif + ;; + .align 8 + +.L110: + { .mib + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L120 + } + ;; + + { .mib + mov L = KK + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + } + ;; + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; + adds L = 1, L + ;; + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mmf + adds L = -1, L + } + ;; + { .mmf + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L118 + } + ;; + +.L112: + { .mfi + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + br.cloop.sptk.few .L112 + } + ;; + .align 8 + +.L118: + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#ifdef LT + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f65 = f34, f65 + FSUB f73 = f35, f73 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f72 = f34, f72 + FSUB f73 = f35, f73 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET], - 3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + ;; + FMPY f65 = f65, f34 + FMPY f73 = f73, f34 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f73, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET], -3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + ;; + FMPY f72 = f72, f34 + FMPY f73 = f73, f34 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + ;; + STFD [AOFFSET] = f73, -3 * SIZE + ;; +#endif + + STFD [C1 ] = f64, SIZE + mov f64 = f0 + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C2 ] = f72, SIZE + mov f72 = f0 + ;; + STFD [C2 ] = f73, SIZE + ;; + mov f65 = f0 + mov f73 = f0 + ;; + sub L = K, KK + ;; + { .mmi + shladd L = L, BASE_SHIFT, r0 + } + ;; + { .mmi + shladd AOFFSET = L, 1, AOFFSET + } + ;; + { .mmi + shladd BOFFSET = L, 1, BOFFSET + } + ;; + { .mmi +#ifdef LT + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + ;; + .align 8 + +.L120: + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L129 + ;; + + { .mib + mov L = KK + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + } + ;; + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; + { .mmi + adds L = 1, L + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mii + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + adds L = -1, L + } + ;; + { .mmi + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFD f32 = [AOFFSET], 1 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L128 + } + ;; + .align 8 + +.L122: + { .mfi + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + nop __LINE__ + } + { .mmi + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + adds L = -1, L + } + { .mfb + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + br.cloop.sptk.few .L122 + } + ;; + +.L128: + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#ifdef LT + LDFPD f32, f33 = [BOFFSET] + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + ;; +#else + LDFPD f32, f33 = [AOFFSET] + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + ;; +#endif + +#ifdef LT + LDFD f32 = [AOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f72, -SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET], -3 * SIZE + ;; + FMPY f64 = f64, f32 + ;; + FNMA f72 = f64, f33, f72 + ;; + FMPY f72 = f72, f34 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f72, -SIZE + ;; +#endif + + + STFD [C1 ] = f64, SIZE + STFD [C2 ] = f72, SIZE + + mov f64 = f0 + mov f72 = f0 + ;; + sub L = K, KK + ;; + shladd L = L, BASE_SHIFT, r0 + ;; + add AOFFSET = L, AOFFSET + ;; + shladd BOFFSET = L, 1, BOFFSET + ;; +#ifdef LT + adds KK = 1, KK +#else + nop __LINE__ +#endif + ;; + mov L = KK + ;; + .align 8 + +.L129: + mov B = BOFFSET + +#ifdef RN + adds KK = 2, KK +#endif + + ;; + mov AOFFSET = A + ;; + .align 16 + +.L130: + tbit.z p6, p0 = N, 0 + (p6) br.cond.dpnt .L999 + ;; + mov f64 = f0 + mov f65 = f0 + mov f66 = f0 + mov f67 = f0 + + mov f68 = f0 + mov f69 = f0 + mov f70 = f0 + mov f71 = f0 + ;; + + { .mfi + shr I = M, 3 + } + { .mfi + mov C1 = C // coffset1 = c + 0 * ldc +#ifdef LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + cmp.eq p6, p7 = 0, I + mov AORIG = A + } + ;; + { .mfi + add C = C, LDC // coffset += 8 * ldc + mov L = KK + }{ .mfb + (p6) br.cond.dpnt .L140 + } + ;; + .align 16 + +.L132: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + } + ;; + { .mmi + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + ;; + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + } + { .mfi + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmf + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + } + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mmf + CPREFETCH [PREC] + } + { .mfi + adds L = 1, L + } + ;; + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mfi + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + mov ar.lc = L + } + ;; + { .mfb + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L138 + } + ;; + .align 16 + +.L133: + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f65 = f33, f48, f65 // A2 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + adds C9 = 4 * SIZE, C1 + } + { .mmf + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mmf + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + nop __LINE__ + br.cloop.sptk.few .L133 + } + ;; + +.L138: + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#ifdef LT + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + + FSUB f68 = f36, f68 + FSUB f69 = f37, f69 + FSUB f70 = f38, f70 + FSUB f71 = f39, f71 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + FSUB f68 = f36, f68 + FSUB f69 = f37, f69 + FSUB f70 = f38, f70 + FSUB f71 = f39, f71 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [AOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [AOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f47, f48 = [AOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [AOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET] + adds AOFFSET = 7 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f19, f20 = [AOFFSET] + adds AOFFSET = 9 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + adds AOFFSET = -63 * SIZE, AOFFSET + ;; + FMPY f64 = f64, f32 + ;; + FNMA f65 = f64, f33, f65 + ;; + FNMA f66 = f64, f34, f66 + ;; + FNMA f67 = f64, f35, f67 + ;; + FNMA f68 = f64, f36, f68 + ;; + FNMA f69 = f64, f37, f69 + ;; + FNMA f70 = f64, f38, f70 + ;; + FNMA f71 = f64, f39, f71 + ;; + FMPY f65 = f65, f40 + ;; + FNMA f66 = f65, f41, f66 + ;; + FNMA f67 = f65, f42, f67 + ;; + FNMA f68 = f65, f43, f68 + ;; + FNMA f69 = f65, f44, f69 + ;; + FNMA f70 = f65, f45, f70 + ;; + FNMA f71 = f65, f46, f71 + ;; + FMPY f66 = f66, f47 + ;; + FNMA f67 = f66, f48, f67 + ;; + FNMA f68 = f66, f49, f68 + ;; + FNMA f69 = f66, f50, f69 + ;; + FNMA f70 = f66, f51, f70 + ;; + FNMA f71 = f66, f52, f71 + ;; + FMPY f67 = f67, f53 + ;; + FNMA f68 = f67, f54, f68 + ;; + FNMA f69 = f67, f55, f69 + ;; + FNMA f70 = f67, f56, f70 + ;; + FNMA f71 = f67, f57, f71 + ;; + FMPY f68 = f68, f58 + ;; + FNMA f69 = f68, f59, f69 + ;; + FNMA f70 = f68, f60, f70 + ;; + FNMA f71 = f68, f61, f71 + ;; + FMPY f69 = f69, f16 + ;; + FNMA f70 = f69, f17, f70 + ;; + FNMA f71 = f69, f18, f71 + ;; + FMPY f70 = f70, f19 + ;; + FNMA f71 = f70, f20, f71 + ;; + FMPY f71 = f71, f21 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f68, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f69, SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f70, SIZE + ;; + STFD [BOFFSET] = f67, -3 * SIZE + STFD [BOFFSET2] = f71, -3 * SIZE + ;; + adds C9 = 4 * SIZE, C1 + ;; +#endif + +#ifdef RN + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f68 = f68, f32 + FMPY f65 = f65, f32 + FMPY f69 = f69, f32 + FMPY f66 = f66, f32 + FMPY f70 = f70, f32 + FMPY f67 = f67, f32 + FMPY f71 = f71, f32 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, -3 * SIZE + STFD [AOFFSET2] = f71, -3 * SIZE + ;; +#endif + + adds C9 = 4 * SIZE, C1 + ;; + + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + } + ;; + { .mmi + STFD [C1 ] = f67, 5 * SIZE + STFD [C9 ] = f71 + } + ;; + { .mmf + cmp.ne p6, p0 = 1, I + } + ;; + adds I = -1, I + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi + shladd L = L, BASE_SHIFT, r0 + } + ;; + ;; + { .mmi + shladd AOFFSET = L, 3, AOFFSET + } + ;; + { .mmi + add BOFFSET = L, BOFFSET + } + ;; + { .mmi +#ifdef LT + adds KK = 8, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi + mov L = KK + } + ;; + + mov f64 = f0 + mov f65 = f0 + mov f66 = f0 + mov f67 = f0 + mov f68 = f0 + mov f69 = f0 + mov f70 = f0 + mov f71 = f0 + + (p6) br.cond.dptk .L132 + .align 8 + +.L140: + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L150 + ;; + + { .mib + mov L = KK + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + } + ;; + { .mmf + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + mov f65 = f0 + } + ;; + { .mfi + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + cmp.eq p6, p0 = -1, L + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + } + { .mfi + mov ar.lc = L + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + { .mfb + (p6) br.cond.dpnt .L148 + } + ;; + +.L142: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f65 = f33, f48, f65 // A2 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mmf + nop __LINE__ + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + } + ;; + { .mfi + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + (p5) adds C10 = 2 * SIZE, C2 + } + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mmf + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + { .mfb + nop __LINE__ + nop.f 0 + br.cloop.sptk.few .L142 + } + ;; + +.L148: + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#ifdef LT + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f39, f40 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + ;; + FNMA f65 = f64, f33, f65 + ;; + FNMA f66 = f64, f34, f66 + ;; + FNMA f67 = f64, f35, f67 + ;; + FMPY f65 = f65, f36 + ;; + FNMA f66 = f65, f37, f66 + ;; + FNMA f67 = f65, f38, f67 + ;; + FMPY f66 = f66, f39 + ;; + FNMA f67 = f66, f40, f67 + ;; + FMPY f67 = f67, f41 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f66, SIZE + ;; + STFD [BOFFSET] = f67, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + FMPY f66 = f66, f32 + FMPY f67 = f67, f32 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + ;; + STFD [AOFFSET] = f67, -3 * SIZE + ;; +#endif + + { .mmf + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + } + ;; + { .mmi + STFD [C1 ] = f67, SIZE + } + ;; + { .mmf + mov f72 = f0 + } + ;; + mov f65 = f0 + mov f73 = f0 + mov f66 = f0 + mov f74 = f0 + mov f67 = f0 + mov f75 = f0 + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi + shladd L = L, BASE_SHIFT, r0 + } + ;; + { .mmi + shladd AOFFSET = L, 2, AOFFSET + } + ;; + { .mmi + add BOFFSET = L, BOFFSET + } + ;; + { .mmi +#ifdef LT + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + ;; + .align 8 + +.L150: + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L160 + ;; + + { .mib + mov L = KK + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + } + ;; + { .mmf + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + } + ;; + { .mfi + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mmf + adds L = -1, L + } + ;; + { .mmf + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L158 + } + ;; + +.L152: + { .mfi + cmp.ne p4, p5 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + ;; + { .mfi + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + adds L = -1, L + } + ;; + { .mfb + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + br.cloop.sptk.few .L152 + } + ;; + +.L158: + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#ifdef LT + LDFPD f32, f33 = [BOFFSET] + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + ;; +#else + LDFPD f32, f33 = [AOFFSET] + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET], - 3 * SIZE + ;; + FMPY f64 = f64, f32 + ;; + FNMA f65 = f64, f33, f65 + ;; + FMPY f65 = f65, f34 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, -SIZE + ;; +#endif + +#ifdef RN + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, - SIZE + ;; +#endif + + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + ;; + sub L = K, KK + ;; + { .mmi + shladd L = L, BASE_SHIFT, r0 + } + ;; + { .mmi + shladd AOFFSET = L, 1, AOFFSET + } + ;; + { .mmi + add BOFFSET = L, BOFFSET + } + ;; + { .mmi +#ifdef LT + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + ;; + .align 8 + +.L160: + { .mib + mov L = KK + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L169 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + } + ;; + { .mmi + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ + adds L = 1, L + } + ;; + { .mii + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + cmp.eq p6, p0 = 0, L + adds L = -1, L + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mib + (p7) LDFD f32 = [AOFFSET], 1 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L168 + } + ;; + .align 8 + +.L162: + { .mmf + cmp.ne p4, p5 = 0, L + (p12) cmp.ne p3, p0 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + } + ;; + { .mmi + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + nop __LINE__ + adds L = -1, L + } + { .mfb + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + br.cloop.sptk.few .L162 + } + ;; + .align 8 + +.L168: +#ifdef LT + { .mmi + LDFD f32 = [BOFFSET] + LDFD f33 = [AOFFSET] + nop __LINE__ + } + ;; +#else + { .mmi + LDFD f32 = [AOFFSET] + LDFD f33 = [BOFFSET] + nop __LINE__ + } + ;; +#endif + + { .mmf + sub L = K, KK + nop __LINE__ + FSUB f64 = f32, f64 + } + ;; +#ifdef LT + adds KK = 1, KK +#else + nop __LINE__ +#endif + ;; + mov L = KK + ;; + FMPY f64 = f64, f33 + ;; +#ifdef LT + { .mmf + STFD [BOFFSET] = f64 + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; +#else + { .mmf + STFD [AOFFSET] = f64 + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; +#endif + + shladd AOFFSET = L, BASE_SHIFT, AOFFSET + shladd BOFFSET = L, BASE_SHIFT, BOFFSET + ;; + .align 8 + +.L169: + { .mii + mov B = BOFFSET + +#ifdef RN + adds KK = 1, KK +#else + nop __LINE__ +#endif + mov AOFFSET = A + } + ;; + .align 16 + + +.L999: + mov r8 = r0 + adds r9 = 1 * 16, SP + ;; + ldf.fill f16 = [SP], 32 + ldf.fill f17 = [r9], 32 + ;; + ldf.fill f18 = [SP], 32 + ldf.fill f19 = [r9], 32 + ;; + ldf.fill f20 = [SP], 32 + ldf.fill f21 = [r9], 32 + ;; + mov ar.lc = ARLC + ;; + mov pr = PR, -1 + ;; + mov ar.pfs = ARPFS + ;; + br.ret.sptk.many b0 + EPILOGUE diff --git a/kernel/ia64/trsm_kernel_RT.S b/kernel/ia64/trsm_kernel_RT.S new file mode 100644 index 0000000..f3482ae --- /dev/null +++ b/kernel/ia64/trsm_kernel_RT.S @@ -0,0 +1,16688 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef DOUBLE +#define PREFETCHSIZE (16 * 8) +#else +#define PREFETCHSIZE (32 * 4) +#endif + +#ifndef LN +#define CPREFETCHSIZE 8 +#else +#define CPREFETCHSIZE -8 +#endif +#define CPREFETCH lfetch.excl.nt1 + +#define M r32 +#define N r33 +#define K r34 +#define A r36 +#define B r37 +#define C r38 +#define LDC r39 + +#define I r15 +#define J r16 +#define AOFFSET r17 +#define BOFFSET r18 +#define TEMP r19 +#define L r20 + +#define C1 r21 +#define C2 r22 +#define C3 r23 +#define C4 r24 +#define C5 r25 +#define C6 r26 +#define C7 r27 +#define C8 r28 + +#define C9 loc0 +#define C10 loc1 +#define C11 loc2 +#define C12 loc3 +#define C13 loc4 +#define C14 loc5 +#define C15 loc6 +#define C16 loc7 + +#define PREA r8 +#define PREB r9 +#define PREC r10 +#define SP r12 +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#define ALPHA f8 + +#define AORIG loc8 +#define KK loc9 +#define KK8 loc10 +#define OFFSET loc11 +#define AOFFSET2 loc12 +#define BOFFSET2 loc13 + + + PROLOGUE + .prologue + PROFCODE + + { .mmi + .save ar.pfs, ARPFS + alloc ARPFS = ar.pfs, 8, 16, 0, 0 + adds r14 = 16, SP + mov ARLC = ar.lc + } + { .mmi + adds r8 = -6 * 16, SP + adds r9 = -5 * 16, SP + adds SP = -6 * 16, SP + } + ;; + { .mmi + stf.spill [r8] = f16, 32 + stf.spill [r9] = f17, 32 + mov PR = pr + } + ;; + { .mmi + stf.spill [r8] = f18, 32 + stf.spill [r9] = f19, 32 + nop __LINE__ + } + ;; + { .mmi + stf.spill [r8] = f20 + stf.spill [r9] = f21 + shladd LDC = LDC, BASE_SHIFT, r0 + } + ;; + .body + { .mmi + ld8 OFFSET = [r14] + mov AOFFSET = A + } + ;; +#ifdef LN + { .mmi + setf.sig f32 = M + setf.sig f33 = K + shladd C = M, BASE_SHIFT, C + } + ;; + {.mmf + nop __LINE__ + nop __LINE__ + xmpy.l f32 = f32, f33 + } + ;; + { .mmi + getf.sig r2 = f32 + ;; + nop __LINE__ + shladd A = r2, BASE_SHIFT, A + } + ;; +#endif + +#ifdef RN + sub KK = r0, OFFSET +#endif + +#ifdef RT + { .mmi + setf.sig f32 = N + setf.sig f33 = K + nop __LINE__ + } + ;; + { .mmi + setf.sig f34 = LDC + nop __LINE__ + nop __LINE__ + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + xmpy.l f33 = f32, f33 + } + { .mmf + nop __LINE__ + sub KK = N, OFFSET + xmpy.l f34 = f32, f34 + } + ;; + { .mmi + getf.sig r2 = f33 + getf.sig r3 = f34 + } + ;; + shladd B = r2, BASE_SHIFT, B + add C = r3, C +#endif + ;; + +.L130: + tbit.z p6, p0 = N, 0 + (p6) br.cond.dpnt .L090 + ;; + +#ifdef RT + { .mmi + nop __LINE__ + shl r2 = K, BASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, LDC + nop __LINE__ + } +#endif + ;; + mov f64 = f0 + mov f65 = f0 + mov f66 = f0 + mov f67 = f0 + + mov f68 = f0 + mov f69 = f0 + mov f70 = f0 + mov f71 = f0 + ;; + + { .mfi + shr I = M, 3 + } + { .mfi + mov C1 = C // coffset1 = c + 0 * ldc +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + cmp.eq p6, p7 = 0, I +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + } + ;; + { .mfi +#ifndef RT + add C = C, LDC // coffset += 8 * ldc +#else + nop __LINE__ +#endif +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + }{ .mfb + (p6) br.cond.dpnt .L140 + } + ;; + .align 16 + +.L132: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 3 + BASE_SHIFT + } + ;; +#if defined(LT) || defined(RN) + { .mmi + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; +#else + { .mfi + shladd BOFFSET = KK, BASE_SHIFT, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + shladd AOFFSET = r3, 3, AORIG + } + ;; +#endif + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + ;; + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + } + { .mfi + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmf + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + } + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mmf + CPREFETCH [PREC] + } + { .mfi + adds L = 1, L + } + ;; + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mfi + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + mov ar.lc = L + } + ;; + { .mfb + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L138 + } + ;; + .align 16 + +.L133: + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f65 = f33, f48, f65 // A2 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + adds C9 = 4 * SIZE, C1 + } + { .mmf + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mmf + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + nop __LINE__ + br.cloop.sptk.few .L133 + } + ;; + +.L138: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -8, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 3, AORIG + add BOFFSET = r2, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + + FSUB f68 = f36, f68 + FSUB f69 = f37, f69 + FSUB f70 = f38, f70 + FSUB f71 = f39, f71 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + FSUB f68 = f36, f68 + FSUB f69 = f37, f69 + FSUB f70 = f38, f70 + FSUB f71 = f39, f71 + ;; +#endif + +#ifdef LN + adds AOFFSET = 62 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f35, f34 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f37, f36 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f39, f38 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], -2 * SIZE + ;; + LDFPD f42, f41 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f44, f43 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f46, f45 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f48, f47 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f50, f49 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f52, f51 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], -2 * SIZE + ;; + LDFPD f55, f54 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f57, f56 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFPD f59, f58 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f61, f60 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], -2 * SIZE + ;; + LDFPD f18, f17 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFPD f20, f19 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + ;; + FMPY f71 = f71, f32 + ;; + FNMA f70 = f71, f33, f70 + ;; + FNMA f69 = f71, f34, f69 + ;; + FNMA f68 = f71, f35, f68 + ;; + FNMA f67 = f71, f36, f67 + ;; + FNMA f66 = f71, f37, f66 + ;; + FNMA f65 = f71, f38, f65 + ;; + FNMA f64 = f71, f39, f64 + ;; + FMPY f70 = f70, f40 + ;; + FNMA f69 = f70, f41, f69 + ;; + FNMA f68 = f70, f42, f68 + ;; + FNMA f67 = f70, f43, f67 + ;; + FNMA f66 = f70, f44, f66 + ;; + FNMA f65 = f70, f45, f65 + ;; + FNMA f64 = f70, f46, f64 + ;; + FMPY f69 = f69, f47 + ;; + FNMA f68 = f69, f48, f68 + ;; + FNMA f67 = f69, f49, f67 + ;; + FNMA f66 = f69, f50, f66 + ;; + FNMA f65 = f69, f51, f65 + ;; + FNMA f64 = f69, f52, f64 + ;; + FMPY f68 = f68, f53 + ;; + FNMA f67 = f68, f54, f67 + ;; + FNMA f66 = f68, f55, f66 + ;; + FNMA f65 = f68, f56, f65 + ;; + FNMA f64 = f68, f57, f64 + ;; + FMPY f67 = f67, f58 + ;; + FNMA f66 = f67, f59, f66 + ;; + FNMA f65 = f67, f60, f65 + ;; + FNMA f64 = f67, f61, f64 + ;; + FMPY f66 = f66, f16 + ;; + FNMA f65 = f66, f17, f65 + ;; + FNMA f64 = f66, f18, f64 + ;; + FMPY f65 = f65, f19 + ;; + FNMA f64 = f65, f20, f64 + ;; + FMPY f64 = f64, f21 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f68, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f69, SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f70, SIZE + ;; + STFD [BOFFSET] = f67, - 3 * SIZE + STFD [BOFFSET2] = f71, - 3 * SIZE + ;; + adds C1 = -8 * SIZE, C1 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [AOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [AOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f47, f48 = [AOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [AOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET] + adds AOFFSET = 7 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f19, f20 = [AOFFSET] + adds AOFFSET = 9 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + adds AOFFSET = -63 * SIZE, AOFFSET + ;; + FMPY f64 = f64, f32 + ;; + FNMA f65 = f64, f33, f65 + ;; + FNMA f66 = f64, f34, f66 + ;; + FNMA f67 = f64, f35, f67 + ;; + FNMA f68 = f64, f36, f68 + ;; + FNMA f69 = f64, f37, f69 + ;; + FNMA f70 = f64, f38, f70 + ;; + FNMA f71 = f64, f39, f71 + ;; + FMPY f65 = f65, f40 + ;; + FNMA f66 = f65, f41, f66 + ;; + FNMA f67 = f65, f42, f67 + ;; + FNMA f68 = f65, f43, f68 + ;; + FNMA f69 = f65, f44, f69 + ;; + FNMA f70 = f65, f45, f70 + ;; + FNMA f71 = f65, f46, f71 + ;; + FMPY f66 = f66, f47 + ;; + FNMA f67 = f66, f48, f67 + ;; + FNMA f68 = f66, f49, f68 + ;; + FNMA f69 = f66, f50, f69 + ;; + FNMA f70 = f66, f51, f70 + ;; + FNMA f71 = f66, f52, f71 + ;; + FMPY f67 = f67, f53 + ;; + FNMA f68 = f67, f54, f68 + ;; + FNMA f69 = f67, f55, f69 + ;; + FNMA f70 = f67, f56, f70 + ;; + FNMA f71 = f67, f57, f71 + ;; + FMPY f68 = f68, f58 + ;; + FNMA f69 = f68, f59, f69 + ;; + FNMA f70 = f68, f60, f70 + ;; + FNMA f71 = f68, f61, f71 + ;; + FMPY f69 = f69, f16 + ;; + FNMA f70 = f69, f17, f70 + ;; + FNMA f71 = f69, f18, f71 + ;; + FMPY f70 = f70, f19 + ;; + FNMA f71 = f70, f20, f71 + ;; + FMPY f71 = f71, f21 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f68, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f69, SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f70, SIZE + ;; + STFD [BOFFSET] = f67, -3 * SIZE + STFD [BOFFSET2] = f71, -3 * SIZE + ;; + adds C9 = 4 * SIZE, C1 + ;; +#endif + +#ifdef RN + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f68 = f68, f32 + FMPY f65 = f65, f32 + FMPY f69 = f69, f32 + FMPY f66 = f66, f32 + FMPY f70 = f70, f32 + FMPY f67 = f67, f32 + FMPY f71 = f71, f32 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, -3 * SIZE + STFD [AOFFSET2] = f71, -3 * SIZE + ;; +#endif + +#ifdef RT + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f68 = f68, f32 + FMPY f65 = f65, f32 + FMPY f69 = f69, f32 + FMPY f66 = f66, f32 + FMPY f70 = f70, f32 + FMPY f67 = f67, f32 + FMPY f71 = f71, f32 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, -3 * SIZE + STFD [AOFFSET2] = f71, -3 * SIZE + ;; +#endif + adds C9 = 4 * SIZE, C1 + ;; + + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C1 ] = f67, 5 * SIZE +#else + STFD [C1 ] = f67, - 3 * SIZE +#endif + STFD [C9 ] = f71 + } + ;; + { .mmf + cmp.ne p6, p0 = 1, I + } + ;; + adds I = -1, I + ;; + { .mmi + shladd r2 = K, BASE_SHIFT, r0 + } + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi +#ifdef RT + shladd AORIG = r2, 3, AORIG +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 3, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + add BOFFSET = L, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifdef LT + adds KK = 8, KK +#elif defined LN + adds KK = -8, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + + mov f64 = f0 + mov f65 = f0 + mov f66 = f0 + mov f67 = f0 + mov f68 = f0 + mov f69 = f0 + mov f70 = f0 + mov f71 = f0 + + (p6) br.cond.dptk .L132 + .align 8 + +.L140: + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L150 + ;; + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + BASE_SHIFT + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + mov f65 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = KK, BASE_SHIFT, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + { .mfi + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + cmp.eq p6, p0 = -1, L + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + } + { .mfi + mov ar.lc = L + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + { .mfb + (p6) br.cond.dpnt .L148 + } + ;; + +.L142: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f65 = f33, f48, f65 // A2 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mmf + nop __LINE__ + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + } + ;; + { .mfi + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + (p5) adds C10 = 2 * SIZE, C2 + } + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mmf + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + { .mfb + nop __LINE__ + nop.f 0 + br.cloop.sptk.few .L142 + } + ;; + +.L148: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + add BOFFSET = r2, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + ;; +#endif + +#ifdef LN + adds AOFFSET = 14 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f35, f34 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], - 2 * SIZE + ;; + LDFPD f38, f37 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f40, f39 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET] + ;; + FMPY f67 = f67, f32 + ;; + FNMA f66 = f67, f33, f66 + ;; + FNMA f65 = f67, f34, f65 + ;; + FNMA f64 = f67, f35, f64 + ;; + FMPY f66 = f66, f36 + ;; + FNMA f65 = f66, f37, f65 + ;; + FNMA f64 = f66, f38, f64 + ;; + FMPY f65 = f65, f39 + ;; + FNMA f64 = f65, f40, f64 + ;; + FMPY f64 = f64, f41 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f66, SIZE + ;; + STFD [BOFFSET] = f67, -3 * SIZE + ;; + adds C1 = -4 * SIZE, C1 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f39, f40 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + ;; + FNMA f65 = f64, f33, f65 + ;; + FNMA f66 = f64, f34, f66 + ;; + FNMA f67 = f64, f35, f67 + ;; + FMPY f65 = f65, f36 + ;; + FNMA f66 = f65, f37, f66 + ;; + FNMA f67 = f65, f38, f67 + ;; + FMPY f66 = f66, f39 + ;; + FNMA f67 = f66, f40, f67 + ;; + FMPY f67 = f67, f41 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f66, SIZE + ;; + STFD [BOFFSET] = f67, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + FMPY f66 = f66, f32 + FMPY f67 = f67, f32 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + ;; + STFD [AOFFSET] = f67, -3 * SIZE + ;; +#endif + +#ifdef RT + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + FMPY f66 = f66, f32 + FMPY f67 = f67, f32 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + ;; + STFD [AOFFSET] = f67, - 3 * SIZE + ;; +#endif + { .mmf + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C1 ] = f67, SIZE +#else + STFD [C1 ] = f67, - 3 * SIZE +#endif + } + ;; + { .mmf + mov f72 = f0 + } + ;; + mov f65 = f0 + mov f73 = f0 + mov f66 = f0 + mov f74 = f0 + mov f67 = f0 + mov f75 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi +#ifdef RT + shladd AORIG = r2, 2, AORIG +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 2, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + add BOFFSET = L, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifdef LT + adds KK = 4, KK +#elif defined LN + adds KK = -4, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + .align 8 + +.L150: + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L160 + ;; + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + BASE_SHIFT + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = KK, BASE_SHIFT, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + { .mfi + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mmf + adds L = -1, L + } + ;; + { .mmf + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L158 + } + ;; + +.L152: + { .mfi + cmp.ne p4, p5 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + ;; + { .mfi + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + adds L = -1, L + } + ;; + { .mfb + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + br.cloop.sptk.few .L152 + } + ;; + +.L158: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + add BOFFSET = r2, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET] + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + ;; +#else + LDFPD f32, f33 = [AOFFSET] + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + ;; +#endif + +#ifdef LN + adds AOFFSET = 2 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET] + ;; + FMPY f65 = f65, f32 + ;; + FNMA f64 = f65, f33, f64 + ;; + FMPY f64 = f64, f34 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, - SIZE + ;; + adds C1 = -2 * SIZE, C1 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET], - 3 * SIZE + ;; + FMPY f64 = f64, f32 + ;; + FNMA f65 = f64, f33, f65 + ;; + FMPY f65 = f65, f34 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, -SIZE + ;; +#endif + +#ifdef RN + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, - SIZE + ;; +#endif + +#ifdef RT + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, - SIZE + ;; +#endif + STFD [C1 ] = f64, SIZE + ;; +#ifndef LN + STFD [C1 ] = f65, SIZE +#else + STFD [C1 ] = f65, -SIZE +#endif + ;; + mov f64 = f0 + mov f65 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#else + nop __LINE__ +#endif + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 1, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + add BOFFSET = L, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + .align 8 + +.L160: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L169 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 0 + BASE_SHIFT + } + ;; +#if defined(LT) || defined(RN) + { .mmi + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ + adds L = 1, L + } + ;; +#else + { .mmi + shladd BOFFSET = KK, BASE_SHIFT, B + nop __LINE__ +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mmi + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + adds L = 1, L + add AOFFSET = r3, AORIG + } + ;; +#endif + ;; + { .mii + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + cmp.eq p6, p0 = 0, L + adds L = -1, L + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mib + (p7) LDFD f32 = [AOFFSET], 1 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L168 + } + ;; + .align 8 + +.L162: + { .mmf + cmp.ne p4, p5 = 0, L + (p12) cmp.ne p3, p0 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + } + ;; + { .mmi + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + nop __LINE__ + adds L = -1, L + } + { .mfb + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + br.cloop.sptk.few .L162 + } + ;; + .align 8 + +.L168: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + add BOFFSET = r2, B + ;; +#endif + +#if defined(LN) || defined(LT) + { .mmi + LDFD f32 = [BOFFSET] + LDFD f33 = [AOFFSET] +#ifdef LN + adds C1 = -1 * SIZE, C1 +#else + nop __LINE__ +#endif + } + ;; +#else + { .mmi + LDFD f32 = [AOFFSET] + LDFD f33 = [BOFFSET] + nop __LINE__ + } + ;; +#endif + + { .mmf + sub L = K, KK +#ifdef RT + shladd AORIG = K, BASE_SHIFT, AORIG +#else + nop __LINE__ +#endif + FSUB f64 = f32, f64 + } + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + FMPY f64 = f64, f33 + ;; +#if defined(LN) || defined(LT) + { .mmf + STFD [BOFFSET] = f64 +#ifndef LN + STFD [C1 ] = f64, SIZE +#else + STFD [C1 ] = f64 +#endif + mov f64 = f0 + } + ;; +#else + { .mmf + STFD [AOFFSET] = f64 + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; +#endif + +#if defined(LT) || defined(RN) + shladd AOFFSET = L, BASE_SHIFT, AOFFSET +#else + nop __LINE__ +#endif +#if defined(LT) || defined(RN) + shladd BOFFSET = L, BASE_SHIFT, BOFFSET +#else + nop __LINE__ +#endif + ;; + .align 8 + +.L169: + { .mii +#ifdef LN + shladd B = K, BASE_SHIFT, B +#elif defined(LT) || defined(RN) + mov B = BOFFSET +#else + nop __LINE__ +#endif + +#ifdef RN + adds KK = 1, KK +#elif defined RT + adds KK = -1, KK +#else + nop __LINE__ +#endif + mov AOFFSET = A + } + ;; + .align 16 + +.L090: + tbit.z p6, p0 = N, 1 + (p6) br.cond.dpnt .L050 + ;; + +#ifdef RT + { .mmi + shladd r3 = LDC, 1, r0 + nop __LINE__ + shl r2 = K, 1 + BASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, r3 + nop __LINE__ + } +#endif + ;; + mov f64 = f0 + mov f65 = f0 + mov f66 = f0 + mov f67 = f0 + + mov f72 = f0 + mov f73 = f0 + mov f74 = f0 + mov f75 = f0 + ;; + { .mfi + shr I = M, 3 + } + { .mfi + mov C1 = C // coffset1 = c + 0 * ldc +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + cmp.eq p6, p7 = 0, I +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + } + { .mmf + add C2 = LDC, C // coffset2 = c + 1 * ldc + } + ;; + { .mfi +#ifndef RT + shladd C = LDC, 1, C // coffset += 8 * ldc +#else + nop __LINE__ +#endif + mov f81 = f0 +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + }{ .mfb + (p6) br.cond.dpnt .L100 + } + ;; + .align 16 + +.L092: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 3 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + shladd AOFFSET = r3, 3, AORIG + } + ;; +#endif + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + ;; + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + } + { .mfi + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmf + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + } + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + } + { .mfi + adds L = 1, L + } + ;; + { .mmf + CPREFETCH [PREC] + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mfi + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + mov ar.lc = L + } + ;; + mov f68 = f0 + mov f69 = f0 + mov f70 = f0 + mov f71 = f0 + mov f76 = f0 + mov f77 = f0 + mov f78 = f0 + mov f79 = f0 + ;; + { .mfb + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L098 + } + ;; + .align 8 + +.L093: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C9 = 4 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C10 = 4 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + adds C11 = 4 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + adds C12 = 4 * SIZE, C4 + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + br.cloop.sptk.few .L093 + } + ;; + .align 8 + +.L098: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -8, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 3, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET] + adds BOFFSET = -14 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f65 = f34, f65 + FSUB f73 = f35, f73 + + FSUB f66 = f36, f66 + FSUB f74 = f37, f74 + FSUB f67 = f38, f67 + FSUB f75 = f39, f75 + + FSUB f68 = f40, f68 + FSUB f76 = f41, f76 + FSUB f69 = f42, f69 + FSUB f77 = f43, f77 + + FSUB f70 = f44, f70 + FSUB f78 = f45, f78 + FSUB f71 = f46, f71 + FSUB f79 = f47, f79 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET] + adds AOFFSET = -14 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + FSUB f68 = f36, f68 + FSUB f69 = f37, f69 + FSUB f70 = f38, f70 + FSUB f71 = f39, f71 + ;; + FSUB f72 = f40, f72 + FSUB f73 = f41, f73 + FSUB f74 = f42, f74 + FSUB f75 = f43, f75 + FSUB f76 = f44, f76 + FSUB f77 = f45, f77 + FSUB f78 = f46, f78 + FSUB f79 = f47, f79 + ;; +#endif + +#ifdef LN + adds AOFFSET = 62 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f35, f34 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f37, f36 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f39, f38 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], -2 * SIZE + ;; + LDFPD f42, f41 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f44, f43 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f46, f45 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f48, f47 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f50, f49 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f52, f51 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], -2 * SIZE + ;; + LDFPD f55, f54 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f57, f56 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFPD f59, f58 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f61, f60 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], -2 * SIZE + ;; + LDFPD f18, f17 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFPD f20, f19 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + ;; + FMPY f71 = f71, f32 + FMPY f79 = f79, f32 + ;; + FNMA f70 = f71, f33, f70 + FNMA f78 = f79, f33, f78 + ;; + FNMA f69 = f71, f34, f69 + FNMA f77 = f79, f34, f77 + ;; + FNMA f68 = f71, f35, f68 + FNMA f76 = f79, f35, f76 + ;; + FNMA f67 = f71, f36, f67 + FNMA f75 = f79, f36, f75 + ;; + FNMA f66 = f71, f37, f66 + FNMA f74 = f79, f37, f74 + ;; + FNMA f65 = f71, f38, f65 + FNMA f73 = f79, f38, f73 + ;; + FNMA f64 = f71, f39, f64 + FNMA f72 = f79, f39, f72 + ;; + FMPY f70 = f70, f40 + FMPY f78 = f78, f40 + ;; + FNMA f69 = f70, f41, f69 + FNMA f77 = f78, f41, f77 + ;; + FNMA f68 = f70, f42, f68 + FNMA f76 = f78, f42, f76 + ;; + FNMA f67 = f70, f43, f67 + FNMA f75 = f78, f43, f75 + ;; + FNMA f66 = f70, f44, f66 + FNMA f74 = f78, f44, f74 + ;; + FNMA f65 = f70, f45, f65 + FNMA f73 = f78, f45, f73 + ;; + FNMA f64 = f70, f46, f64 + FNMA f72 = f78, f46, f72 + ;; + FMPY f69 = f69, f47 + FMPY f77 = f77, f47 + ;; + FNMA f68 = f69, f48, f68 + FNMA f76 = f77, f48, f76 + ;; + FNMA f67 = f69, f49, f67 + FNMA f75 = f77, f49, f75 + ;; + FNMA f66 = f69, f50, f66 + FNMA f74 = f77, f50, f74 + ;; + FNMA f65 = f69, f51, f65 + FNMA f73 = f77, f51, f73 + ;; + FNMA f64 = f69, f52, f64 + FNMA f72 = f77, f52, f72 + ;; + FMPY f68 = f68, f53 + FMPY f76 = f76, f53 + ;; + FNMA f67 = f68, f54, f67 + FNMA f75 = f76, f54, f75 + ;; + FNMA f66 = f68, f55, f66 + FNMA f74 = f76, f55, f74 + ;; + FNMA f65 = f68, f56, f65 + FNMA f73 = f76, f56, f73 + ;; + FNMA f64 = f68, f57, f64 + FNMA f72 = f76, f57, f72 + ;; + FMPY f67 = f67, f58 + FMPY f75 = f75, f58 + ;; + FNMA f66 = f67, f59, f66 + FNMA f74 = f75, f59, f74 + ;; + FNMA f65 = f67, f60, f65 + FNMA f73 = f75, f60, f73 + ;; + FNMA f64 = f67, f61, f64 + FNMA f72 = f75, f61, f72 + ;; + FMPY f66 = f66, f16 + FMPY f74 = f74, f16 + ;; + FNMA f65 = f66, f17, f65 + FNMA f73 = f74, f17, f73 + ;; + FNMA f64 = f66, f18, f64 + FNMA f72 = f74, f18, f72 + ;; + FMPY f65 = f65, f19 + FMPY f73 = f73, f19 + ;; + FNMA f64 = f65, f20, f64 + FNMA f72 = f73, f20, f72 + ;; + FMPY f64 = f64, f21 + FMPY f72 = f72, f21 + ;; + + adds BOFFSET = 8 * SIZE, BOFFSET + adds BOFFSET2 = 8 * SIZE, BOFFSET2 + ;; + STFD [BOFFSET] = f68, SIZE + STFD [BOFFSET2] = f70, SIZE + ;; + STFD [BOFFSET] = f76, SIZE + STFD [BOFFSET2] = f78, SIZE + ;; + STFD [BOFFSET] = f69, SIZE + STFD [BOFFSET2] = f71, SIZE + ;; + STFD [BOFFSET] = f77, - 11 * SIZE + STFD [BOFFSET2] = f79, - 11 * SIZE + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f66, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f74, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f73, - 3 * SIZE + STFD [BOFFSET2] = f75, - 3 * SIZE + ;; + adds C1 = -8 * SIZE, C1 + adds C2 = -8 * SIZE, C2 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [AOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [AOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f47, f48 = [AOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [AOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET] + adds AOFFSET = 7 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f19, f20 = [AOFFSET] + adds AOFFSET = 9 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + adds AOFFSET = -63 * SIZE, AOFFSET + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + ;; + FNMA f66 = f64, f34, f66 + FNMA f74 = f72, f34, f74 + ;; + FNMA f67 = f64, f35, f67 + FNMA f75 = f72, f35, f75 + ;; + FNMA f68 = f64, f36, f68 + FNMA f76 = f72, f36, f76 + ;; + FNMA f69 = f64, f37, f69 + FNMA f77 = f72, f37, f77 + ;; + FNMA f70 = f64, f38, f70 + FNMA f78 = f72, f38, f78 + ;; + FNMA f71 = f64, f39, f71 + FNMA f79 = f72, f39, f79 + ;; + FMPY f65 = f65, f40 + FMPY f73 = f73, f40 + ;; + FNMA f66 = f65, f41, f66 + FNMA f74 = f73, f41, f74 + ;; + FNMA f67 = f65, f42, f67 + FNMA f75 = f73, f42, f75 + ;; + FNMA f68 = f65, f43, f68 + FNMA f76 = f73, f43, f76 + ;; + FNMA f69 = f65, f44, f69 + FNMA f77 = f73, f44, f77 + ;; + FNMA f70 = f65, f45, f70 + FNMA f78 = f73, f45, f78 + ;; + FNMA f71 = f65, f46, f71 + FNMA f79 = f73, f46, f79 + ;; + FMPY f66 = f66, f47 + FMPY f74 = f74, f47 + ;; + FNMA f67 = f66, f48, f67 + FNMA f75 = f74, f48, f75 + ;; + FNMA f68 = f66, f49, f68 + FNMA f76 = f74, f49, f76 + ;; + FNMA f69 = f66, f50, f69 + FNMA f77 = f74, f50, f77 + ;; + FNMA f70 = f66, f51, f70 + FNMA f78 = f74, f51, f78 + ;; + FNMA f71 = f66, f52, f71 + FNMA f79 = f74, f52, f79 + ;; + FMPY f67 = f67, f53 + FMPY f75 = f75, f53 + ;; + FNMA f68 = f67, f54, f68 + FNMA f76 = f75, f54, f76 + ;; + FNMA f69 = f67, f55, f69 + FNMA f77 = f75, f55, f77 + ;; + FNMA f70 = f67, f56, f70 + FNMA f78 = f75, f56, f78 + ;; + FNMA f71 = f67, f57, f71 + FNMA f79 = f75, f57, f79 + ;; + FMPY f68 = f68, f58 + FMPY f76 = f76, f58 + ;; + FNMA f69 = f68, f59, f69 + FNMA f77 = f76, f59, f77 + ;; + FNMA f70 = f68, f60, f70 + FNMA f78 = f76, f60, f78 + ;; + FNMA f71 = f68, f61, f71 + FNMA f79 = f76, f61, f79 + ;; + FMPY f69 = f69, f16 + FMPY f77 = f77, f16 + ;; + FNMA f70 = f69, f17, f70 + FNMA f78 = f77, f17, f78 + ;; + FNMA f71 = f69, f18, f71 + FNMA f79 = f77, f18, f79 + ;; + FMPY f70 = f70, f19 + FMPY f78 = f78, f19 + ;; + FNMA f71 = f70, f20, f71 + FNMA f79 = f78, f20, f79 + ;; + FMPY f71 = f71, f21 + FMPY f79 = f79, f21 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f66, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f74, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f73, 5 * SIZE + STFD [BOFFSET2] = f75, 5 * SIZE + ;; + STFD [BOFFSET] = f68, SIZE + STFD [BOFFSET2] = f70, SIZE + ;; + STFD [BOFFSET] = f76, SIZE + STFD [BOFFSET2] = f78, SIZE + ;; + STFD [BOFFSET] = f69, SIZE + STFD [BOFFSET2] = f71, SIZE + ;; + STFD [BOFFSET] = f77, -11 * SIZE + STFD [BOFFSET2] = f79, -11 * SIZE + ;; + adds C9 = 4 * SIZE, C1 + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET], -3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f68 = f68, f32 + FMPY f65 = f65, f32 + FMPY f69 = f69, f32 + FMPY f66 = f66, f32 + FMPY f70 = f70, f32 + FMPY f67 = f67, f32 + FMPY f71 = f71, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f76 = f68, f33, f76 + FNMA f73 = f65, f33, f73 + FNMA f77 = f69, f33, f77 + FNMA f74 = f66, f33, f74 + FNMA f78 = f70, f33, f78 + FNMA f75 = f67, f33, f75 + FNMA f79 = f71, f33, f79 + ;; + FMPY f72 = f72, f34 + FMPY f76 = f76, f34 + FMPY f73 = f73, f34 + FMPY f77 = f77, f34 + FMPY f74 = f74, f34 + FMPY f78 = f78, f34 + FMPY f75 = f75, f34 + FMPY f79 = f79, f34 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, 5 * SIZE + STFD [AOFFSET2] = f71, 5 * SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f76, SIZE + ;; + STFD [AOFFSET] = f73, SIZE + STFD [AOFFSET2] = f77, SIZE + ;; + STFD [AOFFSET] = f74, SIZE + STFD [AOFFSET2] = f78, SIZE + ;; + STFD [AOFFSET] = f75, -11 * SIZE + STFD [AOFFSET2] = f79, -11 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 2 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET] + ;; + + FMPY f72 = f72, f32 + FMPY f76 = f76, f32 + FMPY f73 = f73, f32 + FMPY f77 = f77, f32 + FMPY f74 = f74, f32 + FMPY f78 = f78, f32 + FMPY f75 = f75, f32 + FMPY f79 = f79, f32 + ;; + FNMA f64 = f72, f33, f64 + FNMA f68 = f76, f33, f68 + FNMA f65 = f73, f33, f65 + FNMA f69 = f77, f33, f69 + FNMA f66 = f74, f33, f66 + FNMA f70 = f78, f33, f70 + FNMA f67 = f75, f33, f67 + FNMA f71 = f79, f33, f71 + ;; + FMPY f64 = f64, f34 + FMPY f68 = f68, f34 + FMPY f65 = f65, f34 + FMPY f69 = f69, f34 + FMPY f66 = f66, f34 + FMPY f70 = f70, f34 + FMPY f67 = f67, f34 + FMPY f71 = f71, f34 + ;; + adds AOFFSET = 8 * SIZE, AOFFSET + adds AOFFSET2 = 8 * SIZE, AOFFSET2 + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f76, SIZE + ;; + STFD [AOFFSET] = f73, SIZE + STFD [AOFFSET2] = f77, SIZE + ;; + STFD [AOFFSET] = f74, SIZE + STFD [AOFFSET2] = f78, SIZE + ;; + STFD [AOFFSET] = f75, - 11 * SIZE + STFD [AOFFSET2] = f79, - 11 * SIZE + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, - 3 * SIZE + STFD [AOFFSET2] = f71, - 3 * SIZE + ;; + +#endif + adds C9 = 4 * SIZE, C1 + ;; + + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + adds C10 = 4 * SIZE, C2 + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C1 ] = f67, 5 * SIZE +#else + STFD [C1 ] = f67, - 3 * SIZE +#endif + STFD [C9 ] = f71 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + STFD [C10] = f76, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + STFD [C10] = f77, SIZE + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + STFD [C10] = f78, SIZE + adds C12 = 4 * SIZE, C4 + } + ;; + { .mmi +#ifndef LN + STFD [C2 ] = f75, 5 * SIZE +#else + STFD [C2 ] = f75, - 3 * SIZE +#endif + STFD [C10] = f79 + } + ;; + { .mmf + cmp.ne p6, p0 = 1, I + } + ;; + adds I = -1, I + ;; + { .mmi + shladd r2 = K, BASE_SHIFT, r0 + } + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi +#ifdef RT + shladd AORIG = r2, 3, AORIG +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 3, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifdef LT + adds KK = 8, KK +#elif defined LN + adds KK = -8, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + + mov f64 = f0 + mov f65 = f0 + mov f66 = f0 + mov f67 = f0 + mov f72 = f0 + mov f73 = f0 + mov f74 = f0 + mov f75 = f0 + + (p6) br.cond.dptk .L092 + ;; + .align 8 + +.L100: + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L110 + ;; + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f65 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + { .mfi + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + cmp.eq p6, p0 = -1, L + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + } + { .mfi + mov ar.lc = L + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + { .mfb + (p6) br.cond.dpnt .L108 + } + ;; + +.L102: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C10 = 2 * SIZE, C2 + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + br.cloop.sptk.few .L102 + } + ;; + .align 8 + +.L108: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + ;; + FSUB f65 = f34, f65 + FSUB f73 = f35, f73 + ;; + FSUB f66 = f36, f66 + FSUB f74 = f37, f74 + ;; + FSUB f67 = f38, f67 + FSUB f75 = f39, f75 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + + FSUB f72 = f36, f72 + FSUB f73 = f37, f73 + FSUB f74 = f38, f74 + FSUB f75 = f39, f75 + ;; +#endif + +#ifdef LN + adds AOFFSET = 14 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f35, f34 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], - 2 * SIZE + ;; + LDFPD f38, f37 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f40, f39 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET] + ;; + FMPY f67 = f67, f32 + FMPY f75 = f75, f32 + ;; + FNMA f66 = f67, f33, f66 + FNMA f74 = f75, f33, f74 + ;; + FNMA f65 = f67, f34, f65 + FNMA f73 = f75, f34, f73 + ;; + FNMA f64 = f67, f35, f64 + FNMA f72 = f75, f35, f72 + ;; + FMPY f66 = f66, f36 + FMPY f74 = f74, f36 + ;; + FNMA f65 = f66, f37, f65 + FNMA f73 = f74, f37, f73 + ;; + FNMA f64 = f66, f38, f64 + FNMA f72 = f74, f38, f72 + ;; + FMPY f65 = f65, f39 + FMPY f73 = f73, f39 + ;; + FNMA f64 = f65, f40, f64 + FNMA f72 = f73, f40, f72 + ;; + FMPY f64 = f64, f41 + FMPY f72 = f72, f41 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f66, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f74, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f73, -3 * SIZE + STFD [BOFFSET2] = f75, -3 * SIZE + ;; + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f39, f40 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + ;; + FNMA f66 = f64, f34, f66 + FNMA f74 = f72, f34, f74 + ;; + FNMA f67 = f64, f35, f67 + FNMA f75 = f72, f35, f75 + ;; + FMPY f65 = f65, f36 + FMPY f73 = f73, f36 + ;; + FNMA f66 = f65, f37, f66 + FNMA f74 = f73, f37, f74 + ;; + FNMA f67 = f65, f38, f67 + FNMA f75 = f73, f38, f75 + ;; + FMPY f66 = f66, f39 + FMPY f74 = f74, f39 + ;; + FNMA f67 = f66, f40, f67 + FNMA f75 = f74, f40, f75 + ;; + FMPY f67 = f67, f41 + FMPY f75 = f75, f41 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f66, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f74, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f73, -3 * SIZE + STFD [BOFFSET2] = f75, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET], -3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + FMPY f66 = f66, f32 + FMPY f67 = f67, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + FNMA f74 = f66, f33, f74 + FNMA f75 = f67, f33, f75 + ;; + FMPY f72 = f72, f34 + FMPY f73 = f73, f34 + FMPY f74 = f74, f34 + FMPY f75 = f75, f34 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f72, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f74, SIZE + ;; + STFD [AOFFSET] = f67, -3 * SIZE + STFD [AOFFSET2] = f75, -3 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 2 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET] + ;; + FMPY f72 = f72, f32 + FMPY f73 = f73, f32 + FMPY f74 = f74, f32 + FMPY f75 = f75, f32 + ;; + FNMA f64 = f72, f33, f64 + FNMA f65 = f73, f33, f65 + FNMA f66 = f74, f33, f66 + FNMA f67 = f75, f33, f67 + ;; + FMPY f64 = f64, f34 + FMPY f65 = f65, f34 + FMPY f66 = f66, f34 + FMPY f67 = f67, f34 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f72, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f74, SIZE + ;; + STFD [AOFFSET] = f67, - 3 * SIZE + STFD [AOFFSET2] = f75, - 3 * SIZE + ;; +#endif + { .mmf + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C1 ] = f67, SIZE +#else + STFD [C1 ] = f67, - 3 * SIZE +#endif + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C2 ] = f75, SIZE +#else + STFD [C2 ] = f75, - 3 * SIZE +#endif + } + ;; + mov f65 = f0 + mov f73 = f0 + mov f66 = f0 + mov f74 = f0 + mov f67 = f0 + mov f75 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi +#ifdef RT + shladd AORIG = r2, 2, AORIG +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 2, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifdef LT + adds KK = 4, KK +#elif defined LN + adds KK = -4, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + .align 8 + +.L110: + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L120 + ;; + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + { .mfi + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mmf + adds L = -1, L + } + ;; + { .mmf + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L118 + } + ;; + +.L112: + { .mfi + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + br.cloop.sptk.few .L112 + } + ;; + .align 8 + +.L118: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f65 = f34, f65 + FSUB f73 = f35, f73 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f72 = f34, f72 + FSUB f73 = f35, f73 + ;; +#endif + +#ifdef LN + adds AOFFSET = 2 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET] + ;; + FMPY f65 = f65, f32 + FMPY f73 = f73, f32 + ;; + FNMA f64 = f65, f33, f64 + FNMA f72 = f73, f33, f72 + ;; + FMPY f64 = f64, f34 + FMPY f72 = f72, f34 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f73, - 3 * SIZE + ;; + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET], - 3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + ;; + FMPY f65 = f65, f34 + FMPY f73 = f73, f34 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f73, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET], -3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + ;; + FMPY f72 = f72, f34 + FMPY f73 = f73, f34 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + ;; + STFD [AOFFSET] = f73, -3 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 2 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET] + ;; + FMPY f72 = f72, f32 + FMPY f73 = f73, f32 + ;; + FNMA f64 = f72, f33, f64 + FNMA f65 = f73, f33, f65 + ;; + FMPY f64 = f64, f34 + FMPY f65 = f65, f34 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + ;; + STFD [AOFFSET] = f73, -3 * SIZE + ;; +#endif + STFD [C1 ] = f64, SIZE + mov f64 = f0 + ;; +#ifndef LN + STFD [C1 ] = f65, SIZE +#else + STFD [C1 ] = f65, -SIZE +#endif + ;; + STFD [C2 ] = f72, SIZE + mov f72 = f0 + ;; +#ifndef LN + STFD [C2 ] = f73, SIZE +#else + STFD [C2 ] = f73, -SIZE +#endif + ;; + mov f65 = f0 + mov f73 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#else + nop __LINE__ +#endif + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 1, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + .align 8 + +.L120: + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L129 + ;; + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 0 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + add AOFFSET = r3, AORIG + } + ;; +#endif + { .mmi + adds L = 1, L + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mii + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + adds L = -1, L + } + ;; + { .mmi + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFD f32 = [AOFFSET], 1 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L128 + } + ;; + .align 8 + +.L122: + { .mfi + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + nop __LINE__ + } + { .mmi + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + adds L = -1, L + } + { .mfb + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + br.cloop.sptk.few .L122 + } + ;; + +.L128: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET] + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + ;; +#else + LDFPD f32, f33 = [AOFFSET] + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + ;; +#endif + +#ifdef LN + LDFD f32 = [AOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + adds C1 = -1 * SIZE, C1 + } + ;; + { .mmi + STFD [BOFFSET] = f72, -SIZE + adds C2 = -1 * SIZE, C2 + } + ;; +#endif + +#ifdef LT + LDFD f32 = [AOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f72, -SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET], -3 * SIZE + ;; + FMPY f64 = f64, f32 + ;; + FNMA f72 = f64, f33, f72 + ;; + FMPY f72 = f72, f34 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f72, -SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 2 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET] + ;; + FMPY f72 = f72, f32 + ;; + FNMA f64 = f72, f33, f64 + ;; + FMPY f64 = f64, f34 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f72, -SIZE + ;; +#endif + +#ifndef LN + STFD [C1 ] = f64, SIZE +#else + STFD [C1 ] = f64 +#endif +#ifndef LN + STFD [C2 ] = f72, SIZE +#else + STFD [C2 ] = f72 +#endif + + mov f64 = f0 + mov f72 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + add AOFFSET = L, AOFFSET +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 1, BOFFSET +#else + nop __LINE__ +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 8 + +.L129: +#ifdef LN + shladd KK8 = K, BASE_SHIFT, r0 + ;; + shladd B = KK8, 1, B +#endif + +#if defined(LT) || defined(RN) + mov B = BOFFSET +#endif + +#ifdef RN + adds KK = 2, KK +#endif + +#ifdef RT + adds KK = -2, KK +#endif + ;; + mov AOFFSET = A + ;; + .align 16 + +.L050: + { .mib + setf.d f64 = r0 + tbit.z p6, p0 = N, 2 + (p6) br.cond.dpnt .L000 + } + ;; + +#ifdef RT + { .mmi + shladd r3 = LDC, 2, r0 + nop __LINE__ + shl r2 = K, 2 + BASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, r3 + nop __LINE__ + } +#endif + ;; + { .mfi + setf.d f72 = r0 + mov f80 = f0 + shr I = M, 3 + } + { .mfi + mov C1 = C // coffset1 = c + 0 * ldc + mov f88 = f0 +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + cmp.eq p6, p7 = 0, I +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + mov f65 = f0 + } + { .mmf + add C2 = LDC, C // coffset2 = c + 1 * ldc + shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc + mov f73 = f0 + } + ;; + { .mfi +#ifndef RT + shladd C = LDC, 2, C // coffset += 8 * ldc +#else + nop __LINE__ +#endif + mov f81 = f0 +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + }{ .mfb + shladd C4 = LDC, 1, C2 + mov f89 = f0 + (p6) br.cond.dpnt .L060 + } + ;; + .align 16 + +.L052: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 3 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + shladd AOFFSET = r3, 3, AORIG + } + ;; +#endif + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f74 = f0 + nop __LINE__ + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + setf.d f82 = r0 + mov f90 = f0 + } + ;; + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + setf.d f67 = r0 + mov f75 = f0 + } + { .mfi + setf.d f83 = r0 + mov f91 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmf + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + } + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f68 = r0 + mov f76 = f0 + } + { .mfi + setf.d f84 = r0 + mov f92 = f0 + adds L = 1, L + } + ;; + { .mmf + CPREFETCH [PREC], LDC + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f69 = r0 + mov f77 = f0 + } + { .mfi + setf.d f85 = r0 + mov f93 = f0 + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mmf + CPREFETCH [PREC] + } + ;; + { .mfi + setf.d f70 = r0 + mov f78 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + setf.d f86 = r0 + mov f94 = f0 + shr L = L, 1 + } + ;; + { .mfi + setf.d f71 = r0 + adds L = -1, L + } + ;; + { .mfi + setf.d f87 = r0 + mov f79 = f0 + mov ar.lc = L + } + { .mfb + cmp.eq p6, p0 = -1, L + mov f95 = f0 + (p6) br.cond.dpnt .L058 + } + ;; + .align 8 + +.L053: + { .mfb + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + adds C9 = 4 * SIZE, C1 + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C10 = 4 * SIZE, C2 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + adds C12 = 4 * SIZE, C4 + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f92 = f36, f51, f92 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f93 = f37, f51, f93 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f94 = f38, f51, f94 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f95 = f39, f51, f95 // A8 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f92 = f44, f59, f92 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f93 = f45, f59, f93 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f94 = f46, f59, f94 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f95 = f47, f59, f95 // A8 * B4 + br.cloop.sptk.few .L053 + } + ;; + .align 8 + +.L058: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -8, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 3, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET], 2 * SIZE + ;; + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + ;; + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + ;; + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + ;; + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [BOFFSET], 2 * SIZE + ;; + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [BOFFSET], 2 * SIZE + ;; + LDFPD f62, f63 = [BOFFSET] + adds BOFFSET = -30 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + + FSUB f65 = f36, f65 + FSUB f73 = f37, f73 + FSUB f81 = f38, f81 + FSUB f89 = f39, f89 + + FSUB f66 = f40, f66 + FSUB f74 = f41, f74 + FSUB f82 = f42, f82 + FSUB f90 = f43, f90 + + FSUB f67 = f44, f67 + FSUB f75 = f45, f75 + FSUB f83 = f46, f83 + FSUB f91 = f47, f91 + + FSUB f68 = f48, f68 + FSUB f76 = f49, f76 + FSUB f84 = f50, f84 + FSUB f92 = f51, f92 + + FSUB f69 = f52, f69 + FSUB f77 = f53, f77 + FSUB f85 = f54, f85 + FSUB f93 = f55, f93 + + FSUB f70 = f56, f70 + FSUB f78 = f57, f78 + FSUB f86 = f58, f86 + FSUB f94 = f59, f94 + + FSUB f71 = f60, f71 + FSUB f79 = f61, f79 + FSUB f87 = f62, f87 + FSUB f95 = f63, f95 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET], 2 * SIZE + ;; + LDFPD f48, f49 = [AOFFSET], 2 * SIZE + ;; + LDFPD f50, f51 = [AOFFSET], 2 * SIZE + ;; + LDFPD f52, f53 = [AOFFSET], 2 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET], 2 * SIZE + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET], 2 * SIZE + ;; + LDFPD f62, f63 = [AOFFSET] + adds AOFFSET = -30 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + FSUB f68 = f36, f68 + FSUB f69 = f37, f69 + FSUB f70 = f38, f70 + FSUB f71 = f39, f71 + ;; + FSUB f72 = f40, f72 + FSUB f73 = f41, f73 + FSUB f74 = f42, f74 + FSUB f75 = f43, f75 + FSUB f76 = f44, f76 + FSUB f77 = f45, f77 + FSUB f78 = f46, f78 + FSUB f79 = f47, f79 + ;; + FSUB f80 = f48, f80 + FSUB f81 = f49, f81 + FSUB f82 = f50, f82 + FSUB f83 = f51, f83 + FSUB f84 = f52, f84 + FSUB f85 = f53, f85 + FSUB f86 = f54, f86 + FSUB f87 = f55, f87 + + FSUB f88 = f56, f88 + FSUB f89 = f57, f89 + FSUB f90 = f58, f90 + FSUB f91 = f59, f91 + FSUB f92 = f60, f92 + FSUB f93 = f61, f93 + FSUB f94 = f62, f94 + FSUB f95 = f63, f95 + ;; +#endif + +#ifdef LN + adds AOFFSET = 62 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f35, f34 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f37, f36 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f39, f38 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], -2 * SIZE + ;; + LDFPD f42, f41 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f44, f43 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f46, f45 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f48, f47 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f50, f49 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f52, f51 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], -2 * SIZE + ;; + LDFPD f55, f54 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f57, f56 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFPD f59, f58 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f61, f60 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], -2 * SIZE + ;; + LDFPD f18, f17 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFPD f20, f19 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + ;; + FMPY f71 = f71, f32 + FMPY f79 = f79, f32 + FMPY f87 = f87, f32 + FMPY f95 = f95, f32 + ;; + FNMA f70 = f71, f33, f70 + FNMA f78 = f79, f33, f78 + FNMA f86 = f87, f33, f86 + FNMA f94 = f95, f33, f94 + ;; + FNMA f69 = f71, f34, f69 + FNMA f77 = f79, f34, f77 + FNMA f85 = f87, f34, f85 + FNMA f93 = f95, f34, f93 + ;; + FNMA f68 = f71, f35, f68 + FNMA f76 = f79, f35, f76 + FNMA f84 = f87, f35, f84 + FNMA f92 = f95, f35, f92 + ;; + FNMA f67 = f71, f36, f67 + FNMA f75 = f79, f36, f75 + FNMA f83 = f87, f36, f83 + FNMA f91 = f95, f36, f91 + ;; + FNMA f66 = f71, f37, f66 + FNMA f74 = f79, f37, f74 + FNMA f82 = f87, f37, f82 + FNMA f90 = f95, f37, f90 + ;; + FNMA f65 = f71, f38, f65 + FNMA f73 = f79, f38, f73 + FNMA f81 = f87, f38, f81 + FNMA f89 = f95, f38, f89 + ;; + FNMA f64 = f71, f39, f64 + FNMA f72 = f79, f39, f72 + FNMA f80 = f87, f39, f80 + FNMA f88 = f95, f39, f88 + ;; + FMPY f70 = f70, f40 + FMPY f78 = f78, f40 + FMPY f86 = f86, f40 + FMPY f94 = f94, f40 + ;; + FNMA f69 = f70, f41, f69 + FNMA f77 = f78, f41, f77 + FNMA f85 = f86, f41, f85 + FNMA f93 = f94, f41, f93 + ;; + FNMA f68 = f70, f42, f68 + FNMA f76 = f78, f42, f76 + FNMA f84 = f86, f42, f84 + FNMA f92 = f94, f42, f92 + ;; + FNMA f67 = f70, f43, f67 + FNMA f75 = f78, f43, f75 + FNMA f83 = f86, f43, f83 + FNMA f91 = f94, f43, f91 + ;; + FNMA f66 = f70, f44, f66 + FNMA f74 = f78, f44, f74 + FNMA f82 = f86, f44, f82 + FNMA f90 = f94, f44, f90 + ;; + FNMA f65 = f70, f45, f65 + FNMA f73 = f78, f45, f73 + FNMA f81 = f86, f45, f81 + FNMA f89 = f94, f45, f89 + ;; + FNMA f64 = f70, f46, f64 + FNMA f72 = f78, f46, f72 + FNMA f80 = f86, f46, f80 + FNMA f88 = f94, f46, f88 + ;; + FMPY f69 = f69, f47 + FMPY f77 = f77, f47 + FMPY f85 = f85, f47 + FMPY f93 = f93, f47 + ;; + FNMA f68 = f69, f48, f68 + FNMA f76 = f77, f48, f76 + FNMA f84 = f85, f48, f84 + FNMA f92 = f93, f48, f92 + ;; + FNMA f67 = f69, f49, f67 + FNMA f75 = f77, f49, f75 + FNMA f83 = f85, f49, f83 + FNMA f91 = f93, f49, f91 + ;; + FNMA f66 = f69, f50, f66 + FNMA f74 = f77, f50, f74 + FNMA f82 = f85, f50, f82 + FNMA f90 = f93, f50, f90 + ;; + FNMA f65 = f69, f51, f65 + FNMA f73 = f77, f51, f73 + FNMA f81 = f85, f51, f81 + FNMA f89 = f93, f51, f89 + ;; + FNMA f64 = f69, f52, f64 + FNMA f72 = f77, f52, f72 + FNMA f80 = f85, f52, f80 + FNMA f88 = f93, f52, f88 + ;; + FMPY f68 = f68, f53 + FMPY f76 = f76, f53 + FMPY f84 = f84, f53 + FMPY f92 = f92, f53 + ;; + FNMA f67 = f68, f54, f67 + FNMA f75 = f76, f54, f75 + FNMA f83 = f84, f54, f83 + FNMA f91 = f92, f54, f91 + ;; + FNMA f66 = f68, f55, f66 + FNMA f74 = f76, f55, f74 + FNMA f82 = f84, f55, f82 + FNMA f90 = f92, f55, f90 + ;; + FNMA f65 = f68, f56, f65 + FNMA f73 = f76, f56, f73 + FNMA f81 = f84, f56, f81 + FNMA f89 = f92, f56, f89 + ;; + FNMA f64 = f68, f57, f64 + FNMA f72 = f76, f57, f72 + FNMA f80 = f84, f57, f80 + FNMA f88 = f92, f57, f88 + ;; + FMPY f67 = f67, f58 + FMPY f75 = f75, f58 + FMPY f83 = f83, f58 + FMPY f91 = f91, f58 + ;; + FNMA f66 = f67, f59, f66 + FNMA f74 = f75, f59, f74 + FNMA f82 = f83, f59, f82 + FNMA f90 = f91, f59, f90 + ;; + FNMA f65 = f67, f60, f65 + FNMA f73 = f75, f60, f73 + FNMA f81 = f83, f60, f81 + FNMA f89 = f91, f60, f89 + ;; + FNMA f64 = f67, f61, f64 + FNMA f72 = f75, f61, f72 + FNMA f80 = f83, f61, f80 + FNMA f88 = f91, f61, f88 + ;; + FMPY f66 = f66, f16 + FMPY f74 = f74, f16 + FMPY f82 = f82, f16 + FMPY f90 = f90, f16 + ;; + FNMA f65 = f66, f17, f65 + FNMA f73 = f74, f17, f73 + FNMA f81 = f82, f17, f81 + FNMA f89 = f90, f17, f89 + ;; + FNMA f64 = f66, f18, f64 + FNMA f72 = f74, f18, f72 + FNMA f80 = f82, f18, f80 + FNMA f88 = f90, f18, f88 + ;; + FMPY f65 = f65, f19 + FMPY f73 = f73, f19 + FMPY f81 = f81, f19 + FMPY f89 = f89, f19 + ;; + FNMA f64 = f65, f20, f64 + FNMA f72 = f73, f20, f72 + FNMA f80 = f81, f20, f80 + FNMA f88 = f89, f20, f88 + ;; + FMPY f64 = f64, f21 + FMPY f72 = f72, f21 + FMPY f80 = f80, f21 + FMPY f88 = f88, f21 + ;; + + adds BOFFSET = 24 * SIZE, BOFFSET + adds BOFFSET2 = 24 * SIZE, BOFFSET2 + ;; + STFD [BOFFSET] = f70, SIZE + STFD [BOFFSET2] = f71, SIZE + ;; + STFD [BOFFSET] = f78, SIZE + STFD [BOFFSET2] = f79, SIZE + ;; + STFD [BOFFSET] = f86, SIZE + STFD [BOFFSET2] = f87, SIZE + ;; + STFD [BOFFSET] = f94, - 11 * SIZE + STFD [BOFFSET2] = f95, - 11 * SIZE + ;; + STFD [BOFFSET] = f68, SIZE + STFD [BOFFSET2] = f69, SIZE + ;; + STFD [BOFFSET] = f76, SIZE + STFD [BOFFSET2] = f77, SIZE + ;; + STFD [BOFFSET] = f84, SIZE + STFD [BOFFSET2] = f85, SIZE + ;; + STFD [BOFFSET] = f92, - 11 * SIZE + STFD [BOFFSET2] = f93, - 11 * SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f74, SIZE + STFD [BOFFSET2] = f75, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f83, SIZE + ;; + STFD [BOFFSET] = f90, - 11 * SIZE + STFD [BOFFSET2] = f91, - 11 * SIZE + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, - 3 * SIZE + STFD [BOFFSET2] = f89, - 3 * SIZE + ;; + adds C1 = -8 * SIZE, C1 + adds C2 = -8 * SIZE, C2 + adds C3 = -8 * SIZE, C3 + adds C4 = -8 * SIZE, C4 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [AOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [AOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f47, f48 = [AOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [AOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET] + adds AOFFSET = 7 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f19, f20 = [AOFFSET] + adds AOFFSET = 9 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + adds AOFFSET = -63 * SIZE, AOFFSET + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + FMPY f80 = f80, f32 + FMPY f88 = f88, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + FNMA f81 = f80, f33, f81 + FNMA f89 = f88, f33, f89 + ;; + FNMA f66 = f64, f34, f66 + FNMA f74 = f72, f34, f74 + FNMA f82 = f80, f34, f82 + FNMA f90 = f88, f34, f90 + ;; + FNMA f67 = f64, f35, f67 + FNMA f75 = f72, f35, f75 + FNMA f83 = f80, f35, f83 + FNMA f91 = f88, f35, f91 + ;; + FNMA f68 = f64, f36, f68 + FNMA f76 = f72, f36, f76 + FNMA f84 = f80, f36, f84 + FNMA f92 = f88, f36, f92 + ;; + FNMA f69 = f64, f37, f69 + FNMA f77 = f72, f37, f77 + FNMA f85 = f80, f37, f85 + FNMA f93 = f88, f37, f93 + ;; + FNMA f70 = f64, f38, f70 + FNMA f78 = f72, f38, f78 + FNMA f86 = f80, f38, f86 + FNMA f94 = f88, f38, f94 + ;; + FNMA f71 = f64, f39, f71 + FNMA f79 = f72, f39, f79 + FNMA f87 = f80, f39, f87 + FNMA f95 = f88, f39, f95 + ;; + FMPY f65 = f65, f40 + FMPY f73 = f73, f40 + FMPY f81 = f81, f40 + FMPY f89 = f89, f40 + ;; + FNMA f66 = f65, f41, f66 + FNMA f74 = f73, f41, f74 + FNMA f82 = f81, f41, f82 + FNMA f90 = f89, f41, f90 + ;; + FNMA f67 = f65, f42, f67 + FNMA f75 = f73, f42, f75 + FNMA f83 = f81, f42, f83 + FNMA f91 = f89, f42, f91 + ;; + FNMA f68 = f65, f43, f68 + FNMA f76 = f73, f43, f76 + FNMA f84 = f81, f43, f84 + FNMA f92 = f89, f43, f92 + ;; + FNMA f69 = f65, f44, f69 + FNMA f77 = f73, f44, f77 + FNMA f85 = f81, f44, f85 + FNMA f93 = f89, f44, f93 + ;; + FNMA f70 = f65, f45, f70 + FNMA f78 = f73, f45, f78 + FNMA f86 = f81, f45, f86 + FNMA f94 = f89, f45, f94 + ;; + FNMA f71 = f65, f46, f71 + FNMA f79 = f73, f46, f79 + FNMA f87 = f81, f46, f87 + FNMA f95 = f89, f46, f95 + ;; + FMPY f66 = f66, f47 + FMPY f74 = f74, f47 + FMPY f82 = f82, f47 + FMPY f90 = f90, f47 + ;; + FNMA f67 = f66, f48, f67 + FNMA f75 = f74, f48, f75 + FNMA f83 = f82, f48, f83 + FNMA f91 = f90, f48, f91 + ;; + FNMA f68 = f66, f49, f68 + FNMA f76 = f74, f49, f76 + FNMA f84 = f82, f49, f84 + FNMA f92 = f90, f49, f92 + ;; + FNMA f69 = f66, f50, f69 + FNMA f77 = f74, f50, f77 + FNMA f85 = f82, f50, f85 + FNMA f93 = f90, f50, f93 + ;; + FNMA f70 = f66, f51, f70 + FNMA f78 = f74, f51, f78 + FNMA f86 = f82, f51, f86 + FNMA f94 = f90, f51, f94 + ;; + FNMA f71 = f66, f52, f71 + FNMA f79 = f74, f52, f79 + FNMA f87 = f82, f52, f87 + FNMA f95 = f90, f52, f95 + ;; + FMPY f67 = f67, f53 + FMPY f75 = f75, f53 + FMPY f83 = f83, f53 + FMPY f91 = f91, f53 + ;; + FNMA f68 = f67, f54, f68 + FNMA f76 = f75, f54, f76 + FNMA f84 = f83, f54, f84 + FNMA f92 = f91, f54, f92 + ;; + FNMA f69 = f67, f55, f69 + FNMA f77 = f75, f55, f77 + FNMA f85 = f83, f55, f85 + FNMA f93 = f91, f55, f93 + ;; + FNMA f70 = f67, f56, f70 + FNMA f78 = f75, f56, f78 + FNMA f86 = f83, f56, f86 + FNMA f94 = f91, f56, f94 + ;; + FNMA f71 = f67, f57, f71 + FNMA f79 = f75, f57, f79 + FNMA f87 = f83, f57, f87 + FNMA f95 = f91, f57, f95 + ;; + FMPY f68 = f68, f58 + FMPY f76 = f76, f58 + FMPY f84 = f84, f58 + FMPY f92 = f92, f58 + ;; + FNMA f69 = f68, f59, f69 + FNMA f77 = f76, f59, f77 + FNMA f85 = f84, f59, f85 + FNMA f93 = f92, f59, f93 + ;; + FNMA f70 = f68, f60, f70 + FNMA f78 = f76, f60, f78 + FNMA f86 = f84, f60, f86 + FNMA f94 = f92, f60, f94 + ;; + FNMA f71 = f68, f61, f71 + FNMA f79 = f76, f61, f79 + FNMA f87 = f84, f61, f87 + FNMA f95 = f92, f61, f95 + ;; + FMPY f69 = f69, f16 + FMPY f77 = f77, f16 + FMPY f85 = f85, f16 + FMPY f93 = f93, f16 + ;; + FNMA f70 = f69, f17, f70 + FNMA f78 = f77, f17, f78 + FNMA f86 = f85, f17, f86 + FNMA f94 = f93, f17, f94 + ;; + FNMA f71 = f69, f18, f71 + FNMA f79 = f77, f18, f79 + FNMA f87 = f85, f18, f87 + FNMA f95 = f93, f18, f95 + ;; + FMPY f70 = f70, f19 + FMPY f78 = f78, f19 + FMPY f86 = f86, f19 + FMPY f94 = f94, f19 + ;; + FNMA f71 = f70, f20, f71 + FNMA f79 = f78, f20, f79 + FNMA f87 = f86, f20, f87 + FNMA f95 = f94, f20, f95 + ;; + FMPY f71 = f71, f21 + FMPY f79 = f79, f21 + FMPY f87 = f87, f21 + FMPY f95 = f95, f21 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, 5 * SIZE + STFD [BOFFSET2] = f89, 5 * SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f74, SIZE + STFD [BOFFSET2] = f75, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f83, SIZE + ;; + STFD [BOFFSET] = f90, 5 * SIZE + STFD [BOFFSET2] = f91, 5 * SIZE + ;; + STFD [BOFFSET] = f68, SIZE + STFD [BOFFSET2] = f69, SIZE + ;; + STFD [BOFFSET] = f76, SIZE + STFD [BOFFSET2] = f77, SIZE + ;; + STFD [BOFFSET] = f84, SIZE + STFD [BOFFSET2] = f85, SIZE + ;; + STFD [BOFFSET] = f92, 5 * SIZE + STFD [BOFFSET2] = f93, 5 * SIZE + ;; + STFD [BOFFSET] = f70, SIZE + STFD [BOFFSET2] = f71, SIZE + ;; + STFD [BOFFSET] = f78, SIZE + STFD [BOFFSET2] = f79, SIZE + ;; + STFD [BOFFSET] = f86, SIZE + STFD [BOFFSET2] = f87, SIZE + ;; + STFD [BOFFSET] = f94 + STFD [BOFFSET2] = f95 + adds C9 = 4 * SIZE, C1 + adds BOFFSET = - 27 * SIZE, BOFFSET + adds BOFFSET2 = - 27 * SIZE, BOFFSET2 + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f39, f40 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f68 = f68, f32 + FMPY f65 = f65, f32 + FMPY f69 = f69, f32 + FMPY f66 = f66, f32 + FMPY f70 = f70, f32 + FMPY f67 = f67, f32 + FMPY f71 = f71, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f76 = f68, f33, f76 + FNMA f73 = f65, f33, f73 + FNMA f77 = f69, f33, f77 + FNMA f74 = f66, f33, f74 + FNMA f78 = f70, f33, f78 + FNMA f75 = f67, f33, f75 + FNMA f79 = f71, f33, f79 + ;; + FNMA f80 = f64, f34, f80 + FNMA f84 = f68, f34, f84 + FNMA f81 = f65, f34, f81 + FNMA f85 = f69, f34, f85 + FNMA f82 = f66, f34, f82 + FNMA f86 = f70, f34, f86 + FNMA f83 = f67, f34, f83 + FNMA f87 = f71, f34, f87 + ;; + FNMA f88 = f64, f35, f88 + FNMA f92 = f68, f35, f92 + FNMA f89 = f65, f35, f89 + FNMA f93 = f69, f35, f93 + FNMA f90 = f66, f35, f90 + FNMA f94 = f70, f35, f94 + FNMA f91 = f67, f35, f91 + FNMA f95 = f71, f35, f95 + ;; + FMPY f72 = f72, f36 + FMPY f76 = f76, f36 + FMPY f73 = f73, f36 + FMPY f77 = f77, f36 + FMPY f74 = f74, f36 + FMPY f78 = f78, f36 + FMPY f75 = f75, f36 + FMPY f79 = f79, f36 + ;; + FNMA f80 = f72, f37, f80 + FNMA f84 = f76, f37, f84 + FNMA f81 = f73, f37, f81 + FNMA f85 = f77, f37, f85 + FNMA f82 = f74, f37, f82 + FNMA f86 = f78, f37, f86 + FNMA f83 = f75, f37, f83 + FNMA f87 = f79, f37, f87 + ;; + FNMA f88 = f72, f38, f88 + FNMA f92 = f76, f38, f92 + FNMA f89 = f73, f38, f89 + FNMA f93 = f77, f38, f93 + FNMA f90 = f74, f38, f90 + FNMA f94 = f78, f38, f94 + FNMA f91 = f75, f38, f91 + FNMA f95 = f79, f38, f95 + ;; + FMPY f80 = f80, f39 + FMPY f84 = f84, f39 + FMPY f81 = f81, f39 + FMPY f85 = f85, f39 + FMPY f82 = f82, f39 + FMPY f86 = f86, f39 + FMPY f83 = f83, f39 + FMPY f87 = f87, f39 + ;; + FNMA f88 = f80, f40, f88 + FNMA f92 = f84, f40, f92 + FNMA f89 = f81, f40, f89 + FNMA f93 = f85, f40, f93 + FNMA f90 = f82, f40, f90 + FNMA f94 = f86, f40, f94 + FNMA f91 = f83, f40, f91 + FNMA f95 = f87, f40, f95 + ;; + FMPY f88 = f88, f41 + FMPY f92 = f92, f41 + FMPY f89 = f89, f41 + FMPY f93 = f93, f41 + FMPY f90 = f90, f41 + FMPY f94 = f94, f41 + FMPY f91 = f91, f41 + FMPY f95 = f95, f41 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, 5 * SIZE + STFD [AOFFSET2] = f71, 5 * SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f76, SIZE + ;; + STFD [AOFFSET] = f73, SIZE + STFD [AOFFSET2] = f77, SIZE + ;; + STFD [AOFFSET] = f74, SIZE + STFD [AOFFSET2] = f78, SIZE + ;; + STFD [AOFFSET] = f75, 5 * SIZE + STFD [AOFFSET2] = f79, 5 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f84, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f85, SIZE + ;; + STFD [AOFFSET] = f82, SIZE + STFD [AOFFSET2] = f86, SIZE + ;; + STFD [AOFFSET] = f83, 5 * SIZE + STFD [AOFFSET2] = f87, 5 * SIZE + ;; + STFD [AOFFSET] = f88, SIZE + STFD [AOFFSET2] = f92, SIZE + ;; + STFD [AOFFSET] = f89, SIZE + STFD [AOFFSET2] = f93, SIZE + ;; + STFD [AOFFSET] = f90, SIZE + STFD [AOFFSET2] = f94, SIZE + ;; + STFD [AOFFSET] = f91, -27 * SIZE + STFD [AOFFSET2] = f95, -27 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 14 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f35, f34 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], -2 * SIZE + ;; + LDFPD f38, f37 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f40, f39 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET] + ;; + + FMPY f88 = f88, f32 + FMPY f92 = f92, f32 + FMPY f89 = f89, f32 + FMPY f93 = f93, f32 + FMPY f90 = f90, f32 + FMPY f94 = f94, f32 + FMPY f91 = f91, f32 + FMPY f95 = f95, f32 + ;; + FNMA f80 = f88, f33, f80 + FNMA f84 = f92, f33, f84 + FNMA f81 = f89, f33, f81 + FNMA f85 = f93, f33, f85 + FNMA f82 = f90, f33, f82 + FNMA f86 = f94, f33, f86 + FNMA f83 = f91, f33, f83 + FNMA f87 = f95, f33, f87 + ;; + FNMA f72 = f88, f34, f72 + FNMA f76 = f92, f34, f76 + FNMA f73 = f89, f34, f73 + FNMA f77 = f93, f34, f77 + FNMA f74 = f90, f34, f74 + FNMA f78 = f94, f34, f78 + FNMA f75 = f91, f34, f75 + FNMA f79 = f95, f34, f79 + ;; + FNMA f64 = f88, f35, f64 + FNMA f68 = f92, f35, f68 + FNMA f65 = f89, f35, f65 + FNMA f69 = f93, f35, f69 + FNMA f66 = f90, f35, f66 + FNMA f70 = f94, f35, f70 + FNMA f67 = f91, f35, f67 + FNMA f71 = f95, f35, f71 + ;; + FMPY f80 = f80, f36 + FMPY f84 = f84, f36 + FMPY f81 = f81, f36 + FMPY f85 = f85, f36 + FMPY f82 = f82, f36 + FMPY f86 = f86, f36 + FMPY f83 = f83, f36 + FMPY f87 = f87, f36 + ;; + FNMA f72 = f80, f37, f72 + FNMA f76 = f84, f37, f76 + FNMA f73 = f81, f37, f73 + FNMA f77 = f85, f37, f77 + FNMA f74 = f82, f37, f74 + FNMA f78 = f86, f37, f78 + FNMA f75 = f83, f37, f75 + FNMA f79 = f87, f37, f79 + ;; + FNMA f64 = f80, f38, f64 + FNMA f68 = f84, f38, f68 + FNMA f65 = f81, f38, f65 + FNMA f69 = f85, f38, f69 + FNMA f66 = f82, f38, f66 + FNMA f70 = f86, f38, f70 + FNMA f67 = f83, f38, f67 + FNMA f71 = f87, f38, f71 + ;; + FMPY f72 = f72, f39 + FMPY f76 = f76, f39 + FMPY f73 = f73, f39 + FMPY f77 = f77, f39 + FMPY f74 = f74, f39 + FMPY f78 = f78, f39 + FMPY f75 = f75, f39 + FMPY f79 = f79, f39 + ;; + FNMA f64 = f72, f40, f64 + FNMA f68 = f76, f40, f68 + FNMA f65 = f73, f40, f65 + FNMA f69 = f77, f40, f69 + FNMA f66 = f74, f40, f66 + FNMA f70 = f78, f40, f70 + FNMA f67 = f75, f40, f67 + FNMA f71 = f79, f40, f71 + ;; + FMPY f64 = f64, f41 + FMPY f68 = f68, f41 + FMPY f65 = f65, f41 + FMPY f69 = f69, f41 + FMPY f66 = f66, f41 + FMPY f70 = f70, f41 + FMPY f67 = f67, f41 + FMPY f71 = f71, f41 + ;; + adds AOFFSET = 24 * SIZE, AOFFSET + adds AOFFSET2 = 24 * SIZE, AOFFSET2 + ;; + STFD [AOFFSET] = f88, SIZE + STFD [AOFFSET2] = f92, SIZE + ;; + STFD [AOFFSET] = f89, SIZE + STFD [AOFFSET2] = f93, SIZE + ;; + STFD [AOFFSET] = f90, SIZE + STFD [AOFFSET2] = f94, SIZE + ;; + STFD [AOFFSET] = f91, - 11 * SIZE + STFD [AOFFSET2] = f95, - 11 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f84, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f85, SIZE + ;; + STFD [AOFFSET] = f82, SIZE + STFD [AOFFSET2] = f86, SIZE + ;; + STFD [AOFFSET] = f83, - 11 * SIZE + STFD [AOFFSET2] = f87, - 11 * SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f76, SIZE + ;; + STFD [AOFFSET] = f73, SIZE + STFD [AOFFSET2] = f77, SIZE + ;; + STFD [AOFFSET] = f74, SIZE + STFD [AOFFSET2] = f78, SIZE + ;; + STFD [AOFFSET] = f75, - 11 * SIZE + STFD [AOFFSET2] = f79, - 11 * SIZE + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, - 3 * SIZE + STFD [AOFFSET2] = f71, - 3 * SIZE + ;; + +#endif + adds C9 = 4 * SIZE, C1 + ;; + + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + adds C10 = 4 * SIZE, C2 + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C1 ] = f67, 5 * SIZE +#else + STFD [C1 ] = f67, - 3 * SIZE +#endif + STFD [C9 ] = f71 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + STFD [C10] = f76, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + STFD [C10] = f77, SIZE + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + STFD [C10] = f78, SIZE + adds C12 = 4 * SIZE, C4 + } + ;; + { .mmi +#ifndef LN + STFD [C2 ] = f75, 5 * SIZE +#else + STFD [C2 ] = f75, - 3 * SIZE +#endif + STFD [C10] = f79 + } + ;; + { .mmf + STFD [C3 ] = f80, SIZE + STFD [C11] = f84, SIZE + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + STFD [C11] = f85, SIZE + } + ;; + { .mmi + STFD [C3 ] = f82, SIZE + STFD [C11] = f86, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C3 ] = f83, 5 * SIZE +#else + STFD [C3 ] = f83, - 3 * SIZE +#endif + STFD [C11] = f87 + } + ;; + { .mmf + STFD [C4 ] = f88, SIZE + STFD [C12] = f92, SIZE + } + ;; + { .mmi + STFD [C4 ] = f89, SIZE + STFD [C12] = f93, SIZE + } + ;; + { .mmi + STFD [C4 ] = f90, SIZE + STFD [C12] = f94, SIZE + + } + ;; + { .mmi +#ifndef LN + STFD [C4 ] = f91, 5 * SIZE +#else + STFD [C4 ] = f91, - 3 * SIZE +#endif + STFD [C12] = f95 + cmp.ne p6, p0 = 1, I + } + ;; + adds I = -1, I + ;; + { .mmi + shladd r2 = K, BASE_SHIFT, r0 + } + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi +#ifdef RT + shladd AORIG = r2, 3, AORIG +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 3, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifdef LT + adds KK = 8, KK +#elif defined LN + adds KK = -8, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + mov f64 = f0 + mov f72 = f0 + mov f80 = f0 + mov f88 = f0 + mov f65 = f0 + mov f73 = f0 + mov f81 = f0 + mov f89 = f0 + + { .mmb + (p6) br.cond.dptk .L052 + } + ;; + + .align 8 + +.L060: + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L070 + ;; + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f65 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + { .mfi + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + cmp.eq p6, p0 = -1, L + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + } + { .mfi + mov ar.lc = L + } + ;; + + mov f66 = f0 + mov f67 = f0 + mov f74 = f0 + mov f75 = f0 + mov f82 = f0 + mov f83 = f0 + mov f90 = f0 + mov f91 = f0 + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + { .mfb + (p6) br.cond.dpnt .L068 + } + ;; + .align 8 + +.L062: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + (p5) adds C10 = 2 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + (p5) adds C11 = 2 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + (p5) adds C12 = 2 * SIZE, C4 + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + br.cloop.sptk.few .L062 + } + ;; + .align 8 + +.L068: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET] + adds BOFFSET = -14 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + ;; + FSUB f65 = f36, f65 + FSUB f73 = f37, f73 + FSUB f81 = f38, f81 + FSUB f89 = f39, f89 + ;; + FSUB f66 = f40, f66 + FSUB f74 = f41, f74 + FSUB f82 = f42, f82 + FSUB f90 = f43, f90 + ;; + FSUB f67 = f44, f67 + FSUB f75 = f45, f75 + FSUB f83 = f46, f83 + FSUB f91 = f47, f91 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET] + adds AOFFSET = -14 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + + FSUB f72 = f36, f72 + FSUB f73 = f37, f73 + FSUB f74 = f38, f74 + FSUB f75 = f39, f75 + + FSUB f80 = f40, f80 + FSUB f81 = f41, f81 + FSUB f82 = f42, f82 + FSUB f83 = f43, f83 + + FSUB f88 = f44, f88 + FSUB f89 = f45, f89 + FSUB f90 = f46, f90 + FSUB f91 = f47, f91 + ;; +#endif + +#ifdef LN + adds AOFFSET = 14 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f35, f34 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], - 2 * SIZE + ;; + LDFPD f38, f37 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f40, f39 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET] + ;; + FMPY f67 = f67, f32 + FMPY f75 = f75, f32 + FMPY f83 = f83, f32 + FMPY f91 = f91, f32 + ;; + FNMA f66 = f67, f33, f66 + FNMA f74 = f75, f33, f74 + FNMA f82 = f83, f33, f82 + FNMA f90 = f91, f33, f90 + ;; + FNMA f65 = f67, f34, f65 + FNMA f73 = f75, f34, f73 + FNMA f81 = f83, f34, f81 + FNMA f89 = f91, f34, f89 + ;; + FNMA f64 = f67, f35, f64 + FNMA f72 = f75, f35, f72 + FNMA f80 = f83, f35, f80 + FNMA f88 = f91, f35, f88 + ;; + FMPY f66 = f66, f36 + FMPY f74 = f74, f36 + FMPY f82 = f82, f36 + FMPY f90 = f90, f36 + ;; + FNMA f65 = f66, f37, f65 + FNMA f73 = f74, f37, f73 + FNMA f81 = f82, f37, f81 + FNMA f89 = f90, f37, f89 + ;; + FNMA f64 = f66, f38, f64 + FNMA f72 = f74, f38, f72 + FNMA f80 = f82, f38, f80 + FNMA f88 = f90, f38, f88 + ;; + FMPY f65 = f65, f39 + FMPY f73 = f73, f39 + FMPY f81 = f81, f39 + FMPY f89 = f89, f39 + ;; + FNMA f64 = f65, f40, f64 + FNMA f72 = f73, f40, f72 + FNMA f80 = f81, f40, f80 + FNMA f88 = f89, f40, f88 + ;; + FMPY f64 = f64, f41 + FMPY f72 = f72, f41 + FMPY f80 = f80, f41 + FMPY f88 = f88, f41 + ;; + adds BOFFSET = 8 * SIZE, BOFFSET + adds BOFFSET2 = 8 * SIZE, BOFFSET2 + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f74, SIZE + STFD [BOFFSET2] = f75, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f83, SIZE + ;; + STFD [BOFFSET] = f90, - 11 * SIZE + STFD [BOFFSET2] = f91, - 11 * SIZE + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, -3 * SIZE + STFD [BOFFSET2] = f89, -3 * SIZE + ;; + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 + adds C3 = -4 * SIZE, C3 + adds C4 = -4 * SIZE, C4 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f39, f40 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + FMPY f80 = f80, f32 + FMPY f88 = f88, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + FNMA f81 = f80, f33, f81 + FNMA f89 = f88, f33, f89 + ;; + FNMA f66 = f64, f34, f66 + FNMA f74 = f72, f34, f74 + FNMA f82 = f80, f34, f82 + FNMA f90 = f88, f34, f90 + ;; + FNMA f67 = f64, f35, f67 + FNMA f75 = f72, f35, f75 + FNMA f83 = f80, f35, f83 + FNMA f91 = f88, f35, f91 + ;; + FMPY f65 = f65, f36 + FMPY f73 = f73, f36 + FMPY f81 = f81, f36 + FMPY f89 = f89, f36 + ;; + FNMA f66 = f65, f37, f66 + FNMA f74 = f73, f37, f74 + FNMA f82 = f81, f37, f82 + FNMA f90 = f89, f37, f90 + ;; + FNMA f67 = f65, f38, f67 + FNMA f75 = f73, f38, f75 + FNMA f83 = f81, f38, f83 + FNMA f91 = f89, f38, f91 + ;; + FMPY f66 = f66, f39 + FMPY f74 = f74, f39 + FMPY f82 = f82, f39 + FMPY f90 = f90, f39 + ;; + FNMA f67 = f66, f40, f67 + FNMA f75 = f74, f40, f75 + FNMA f83 = f82, f40, f83 + FNMA f91 = f90, f40, f91 + ;; + FMPY f67 = f67, f41 + FMPY f75 = f75, f41 + FMPY f83 = f83, f41 + FMPY f91 = f91, f41 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, 5 * SIZE + STFD [BOFFSET2] = f89, 5 * SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f74, SIZE + STFD [BOFFSET2] = f75, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f83, SIZE + ;; + STFD [BOFFSET] = f90, -11 * SIZE + STFD [BOFFSET2] = f91, -11 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f39, f40 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + FMPY f66 = f66, f32 + FMPY f67 = f67, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + FNMA f74 = f66, f33, f74 + FNMA f75 = f67, f33, f75 + ;; + FNMA f80 = f64, f34, f80 + FNMA f81 = f65, f34, f81 + FNMA f82 = f66, f34, f82 + FNMA f83 = f67, f34, f83 + ;; + FNMA f88 = f64, f35, f88 + FNMA f89 = f65, f35, f89 + FNMA f90 = f66, f35, f90 + FNMA f91 = f67, f35, f91 + ;; + FMPY f72 = f72, f36 + FMPY f73 = f73, f36 + FMPY f74 = f74, f36 + FMPY f75 = f75, f36 + ;; + FNMA f80 = f72, f37, f80 + FNMA f81 = f73, f37, f81 + FNMA f82 = f74, f37, f82 + FNMA f83 = f75, f37, f83 + ;; + FNMA f88 = f72, f38, f88 + FNMA f89 = f73, f38, f89 + FNMA f90 = f74, f38, f90 + FNMA f91 = f75, f38, f91 + ;; + FMPY f80 = f80, f39 + FMPY f81 = f81, f39 + FMPY f82 = f82, f39 + FMPY f83 = f83, f39 + ;; + FNMA f88 = f80, f40, f88 + FNMA f89 = f81, f40, f89 + FNMA f90 = f82, f40, f90 + FNMA f91 = f83, f40, f91 + ;; + FMPY f88 = f88, f41 + FMPY f89 = f89, f41 + FMPY f90 = f90, f41 + FMPY f91 = f91, f41 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f72, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f74, SIZE + ;; + STFD [AOFFSET] = f67, 5 * SIZE + STFD [AOFFSET2] = f75, 5 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f89, SIZE + ;; + STFD [AOFFSET] = f82, SIZE + STFD [AOFFSET2] = f90, SIZE + ;; + STFD [AOFFSET] = f83, -11 * SIZE + STFD [AOFFSET2] = f91, -11 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 14 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f35, f34 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], - 2 * SIZE + ;; + LDFPD f38, f37 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f40, f39 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET] + ;; + FMPY f88 = f88, f32 + FMPY f89 = f89, f32 + FMPY f90 = f90, f32 + FMPY f91 = f91, f32 + ;; + FNMA f80 = f88, f33, f80 + FNMA f81 = f89, f33, f81 + FNMA f82 = f90, f33, f82 + FNMA f83 = f91, f33, f83 + ;; + FNMA f72 = f88, f34, f72 + FNMA f73 = f89, f34, f73 + FNMA f74 = f90, f34, f74 + FNMA f75 = f91, f34, f75 + ;; + FNMA f64 = f88, f35, f64 + FNMA f65 = f89, f35, f65 + FNMA f66 = f90, f35, f66 + FNMA f67 = f91, f35, f67 + ;; + FMPY f80 = f80, f36 + FMPY f81 = f81, f36 + FMPY f82 = f82, f36 + FMPY f83 = f83, f36 + ;; + FNMA f72 = f80, f37, f72 + FNMA f73 = f81, f37, f73 + FNMA f74 = f82, f37, f74 + FNMA f75 = f83, f37, f75 + ;; + FNMA f64 = f80, f38, f64 + FNMA f65 = f81, f38, f65 + FNMA f66 = f82, f38, f66 + FNMA f67 = f83, f38, f67 + ;; + FMPY f72 = f72, f39 + FMPY f73 = f73, f39 + FMPY f74 = f74, f39 + FMPY f75 = f75, f39 + ;; + FNMA f64 = f72, f40, f64 + FNMA f65 = f73, f40, f65 + FNMA f66 = f74, f40, f66 + FNMA f67 = f75, f40, f67 + ;; + FMPY f64 = f64, f41 + FMPY f65 = f65, f41 + FMPY f66 = f66, f41 + FMPY f67 = f67, f41 + ;; + adds AOFFSET = 8 * SIZE, AOFFSET + adds AOFFSET2 = 8 * SIZE, AOFFSET2 + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f89, SIZE + ;; + STFD [AOFFSET] = f82, SIZE + STFD [AOFFSET2] = f90, SIZE + ;; + STFD [AOFFSET] = f83, - 11 * SIZE + STFD [AOFFSET2] = f91, - 11 * SIZE + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f72, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f74, SIZE + ;; + STFD [AOFFSET] = f67, - 3 * SIZE + STFD [AOFFSET2] = f75, - 3 * SIZE + ;; +#endif + { .mmf + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C1 ] = f67, SIZE +#else + STFD [C1 ] = f67, - 3 * SIZE +#endif + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C2 ] = f75, SIZE +#else + STFD [C2 ] = f75, - 3 * SIZE +#endif + } + ;; + { .mmf + STFD [C3 ] = f80, SIZE + mov f80 = f0 + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + } + ;; + { .mmi + STFD [C3 ] = f82, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C3 ] = f83, SIZE +#else + STFD [C3 ] = f83, - 3 * SIZE +#endif + } + ;; + { .mmf + STFD [C4 ] = f88, SIZE + mov f88 = f0 + } + ;; + { .mmi + STFD [C4 ] = f89, SIZE + } + ;; + { .mmi + STFD [C4 ] = f90, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C4 ] = f91, SIZE +#else + STFD [C4 ] = f91, - 3 * SIZE +#endif + nop __LINE__ + } + ;; + mov f65 = f0 + ;; + mov f73 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi +#ifdef RT + shladd AORIG = r2, 2, AORIG +#else + nop __LINE__ +#endif + } + ;; + { .mmf + mov f81 = f0 + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 2, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + mov f89 = f0 + } + ;; + { .mmi +#ifdef LT + adds KK = 4, KK +#elif defined LN + adds KK = -4, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + .align 8 + +.L070: + tbit.z p6,p0 = M, 1 + (p6) br.cond.dptk .L080 + ;; + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ + mov f65 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B + mov f65 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + ;; + mov f73 = f0 + ;; + { .mfi + mov f81 = f0 + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + mov f89 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mmf + adds L = -1, L + } + ;; + { .mmf + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L078 + } + ;; + .align 8 + +.L072: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + br.cloop.sptk.few .L072 + } + ;; +.L078: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + FSUB f65 = f36, f65 + FSUB f73 = f37, f73 + FSUB f81 = f38, f81 + FSUB f89 = f39, f89 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + + FSUB f72 = f34, f72 + FSUB f73 = f35, f73 + + FSUB f80 = f36, f80 + FSUB f81 = f37, f81 + + FSUB f88 = f38, f88 + FSUB f89 = f39, f89 + ;; +#endif + +#ifdef LN + adds AOFFSET = 2 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET] + ;; + FMPY f65 = f65, f32 + FMPY f73 = f73, f32 + FMPY f81 = f81, f32 + FMPY f89 = f89, f32 + ;; + FNMA f64 = f65, f33, f64 + FNMA f72 = f73, f33, f72 + FNMA f80 = f81, f33, f80 + FNMA f88 = f89, f33, f88 + ;; + FMPY f64 = f64, f34 + FMPY f72 = f72, f34 + FMPY f80 = f80, f34 + FMPY f88 = f88, f34 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, - 3 * SIZE + STFD [BOFFSET2] = f89, - 3 * SIZE + ;; + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 + adds C3 = -2 * SIZE, C3 + adds C4 = -2 * SIZE, C4 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET], - 3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + FMPY f80 = f80, f32 + FMPY f88 = f88, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + FNMA f81 = f80, f33, f81 + FNMA f89 = f88, f33, f89 + ;; + FMPY f65 = f65, f34 + FMPY f73 = f73, f34 + FMPY f81 = f81, f34 + FMPY f89 = f89, f34 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, -3 * SIZE + STFD [BOFFSET2] = f89, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f39, f40 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + ;; + FNMA f80 = f64, f34, f80 + FNMA f81 = f65, f34, f81 + ;; + FNMA f88 = f64, f35, f88 + FNMA f89 = f65, f35, f89 + ;; + FMPY f72 = f72, f36 + FMPY f73 = f73, f36 + ;; + FNMA f80 = f72, f37, f80 + FNMA f81 = f73, f37, f81 + ;; + FNMA f88 = f72, f38, f88 + FNMA f89 = f73, f38, f89 + ;; + FMPY f80 = f80, f39 + FMPY f81 = f81, f39 + ;; + FNMA f88 = f80, f40, f88 + FNMA f89 = f81, f40, f89 + ;; + FMPY f88 = f88, f41 + FMPY f89 = f89, f41 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f80, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f73, -3 * SIZE + STFD [AOFFSET2] = f89, -3 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 14 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f35, f34 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], - 2 * SIZE + ;; + LDFPD f38, f37 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f40, f39 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET] + ;; + FMPY f88 = f88, f32 + FMPY f89 = f89, f32 + ;; + FNMA f80 = f88, f33, f80 + FNMA f81 = f89, f33, f81 + ;; + FNMA f72 = f88, f34, f72 + FNMA f73 = f89, f34, f73 + ;; + FNMA f64 = f88, f35, f64 + FNMA f65 = f89, f35, f65 + ;; + FMPY f80 = f80, f36 + FMPY f81 = f81, f36 + ;; + FNMA f72 = f80, f37, f72 + FNMA f73 = f81, f37, f73 + ;; + FNMA f64 = f80, f38, f64 + FNMA f65 = f81, f38, f65 + ;; + FMPY f72 = f72, f39 + FMPY f73 = f73, f39 + ;; + FNMA f64 = f72, f40, f64 + FNMA f65 = f73, f40, f65 + ;; + FMPY f64 = f64, f41 + FMPY f65 = f65, f41 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f80, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f73, -3 * SIZE + STFD [AOFFSET2] = f89, -3 * SIZE + ;; +#endif + STFD [C1 ] = f64, SIZE + mov f64 = f0 + ;; +#ifndef LN + STFD [C1 ] = f65, SIZE +#else + STFD [C1 ] = f65, -SIZE +#endif + ;; + STFD [C2 ] = f72, SIZE + mov f72 = f0 + ;; +#ifndef LN + STFD [C2 ] = f73, SIZE +#else + STFD [C2 ] = f73, -SIZE +#endif + ;; + STFD [C3 ] = f80, SIZE + mov f80 = f0 + ;; +#ifndef LN + STFD [C3 ] = f81, SIZE +#else + STFD [C3 ] = f81, - SIZE +#endif + ;; + STFD [C4 ] = f88, SIZE + mov f88 = f0 + ;; +#ifndef LN + STFD [C4 ] = f89, SIZE +#else + STFD [C4 ] = f89, -SIZE +#endif + ;; + mov f96 = f0 + ;; + mov f104 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#else + nop __LINE__ +#endif + ;; + mov f112 = f0 + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 1, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + mov f120 = f0 + } + ;; + { .mmi +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + .align 8 + +.L080: + tbit.z p6,p7 = M, 0 + (p6) br.cond.dptk .L089 + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 0 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + add AOFFSET = r3, AORIG + } + ;; +#endif + { .mmi + adds L = 1, L + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mii + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + adds L = -1, L + } + ;; + { .mmi + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFD f32 = [AOFFSET], 1 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L088 + } + ;; + +.L082: + { .mfb + cmp.ne p4, p5 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mmf + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + } + ;; + { .mib + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + { .mmb + nop __LINE__ + adds L = -1, L + br.cloop.sptk.few .L082 + } + ;; + +.L088: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + ;; +#endif + +#ifdef LN + LDFD f32 = [AOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + FMPY f80 = f80, f32 + FMPY f88 = f88, f32 + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + adds C1 = -1 * SIZE, C1 + } + ;; + { .mmi + STFD [BOFFSET] = f72, SIZE + adds C2 = -1 * SIZE, C2 + } + ;; + { .mmi + STFD [BOFFSET] = f80, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f88, - 3 * SIZE + } + ;; + adds C3 = -1 * SIZE, C3 + adds C4 = -1 * SIZE, C4 + ;; +#endif + +#ifdef LT + LDFD f32 = [AOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + FMPY f80 = f80, f32 + FMPY f88 = f88, f32 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + ;; + STFD [BOFFSET] = f88, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f39, f40 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET], -15 * SIZE + + FMPY f64 = f64, f32 + ;; + FNMA f72 = f64, f33, f72 + ;; + FNMA f80 = f64, f34, f80 + ;; + FNMA f88 = f64, f35, f88 + ;; + FMPY f72 = f72, f36 + ;; + FNMA f80 = f72, f37, f80 + ;; + FNMA f88 = f72, f38, f88 + ;; + FMPY f80 = f80, f39 + ;; + FNMA f88 = f80, f40, f88 + ;; + FMPY f88 = f88, f41 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + ;; + STFD [AOFFSET] = f88, -3 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 14 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f35, f34 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], - 2 * SIZE + ;; + LDFPD f38, f37 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f40, f39 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET] + ;; + FMPY f88 = f88, f32 + ;; + FNMA f80 = f88, f33, f80 + ;; + FNMA f72 = f88, f34, f72 + ;; + FNMA f64 = f88, f35, f64 + ;; + FMPY f80 = f80, f36 + ;; + FNMA f72 = f80, f37, f72 + ;; + FNMA f64 = f80, f38, f64 + ;; + FMPY f72 = f72, f39 + ;; + FNMA f64 = f72, f40, f64 + ;; + FMPY f64 = f64, f41 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + ;; + STFD [AOFFSET] = f88, - 3 * SIZE + ;; +#endif + +#ifndef LN + STFD [C1 ] = f64, SIZE +#else + STFD [C1 ] = f64 +#endif +#ifndef LN + STFD [C2 ] = f72, SIZE +#else + STFD [C2 ] = f72 +#endif +#ifndef LN + STFD [C3 ] = f80, SIZE +#else + STFD [C3 ] = f80 +#endif +#ifndef LN + STFD [C4 ] = f88, SIZE +#else + STFD [C4 ] = f88 +#endif + ;; + + mov f64 = f0 + mov f72 = f0 + mov f80 = f0 + mov f88 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + add AOFFSET = L, AOFFSET +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 2, BOFFSET +#else + nop __LINE__ +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 8 + +.L089: +#ifdef LN + shladd KK8 = K, BASE_SHIFT, r0 + ;; + shladd B = KK8, 2, B +#endif + +#if defined(LT) || defined(RN) + mov B = BOFFSET +#endif + +#ifdef RN + adds KK = 4, KK +#endif + +#ifdef RT + adds KK = -4, KK +#endif + ;; + mov AOFFSET = A + ;; + .align 16 + +.L000: + shr J = N, 3 + ;; + cmp.ge p6, p0 = 0, J + (p6) br.cond.dpnt .L999 + ;; + .align 8 + +.L010: +#ifdef RT + { .mmi + shladd r3 = LDC, 3, r0 + nop __LINE__ + shl r2 = K, 3 + BASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, r3 + nop __LINE__ + } +#endif + ;; + { .mfi + adds J = -1, J + mov f64 = f0 + shr I = M, 3 + } + { .mfi + mov C1 = C // coffset1 = c + 0 * ldc + mov f72 = f0 +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + cmp.eq p6, p7 = 0, I +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + mov f80 = f0 + } + { .mmf + add C2 = LDC, C // coffset2 = c + 1 * ldc + shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc + mov f88 = f0 + } + ;; + { .mmf + shladd C5 = LDC, 2, C // coffset5 = c + 4 * ldc +#ifndef RT + shladd C = LDC, 3, C // coffset += 8 * ldc +#else + nop __LINE__ +#endif + mov f96 = f0 + } + { .mmf + shladd C4 = LDC, 1, C2 + shladd C6 = LDC, 2, C2 + mov f104 = f0 + } + ;; + { .mfi + shladd C7 = LDC, 2, C3 + mov f112 = f0 +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + }{ .mfb + shladd C8 = LDC, 2, C4 + mov f120 = f0 + (p6) br.cond.dpnt .L020 + } + ;; + .align 16 + +.L011: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 3 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f65 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f73 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 3, B + mov f65 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 + shladd AOFFSET = r3, 3, AORIG + } + ;; +#endif + { .mfb + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f81 = f0 + nop __LINE__ + } + { .mmf + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + setf.d f119 = r0 + mov f89 = f0 + } + ;; + { .mmf + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + setf.d f97 = r0 + mov f105 = f0 + } + { .mfb + setf.d f113 = r0 + mov f121 = f0 + nop __LINE__ + } + ;; + { .mmf + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + setf.d f66 = r0 + mov f74 = f0 + } + { .mfb + setf.d f82 = r0 + mov f90 = f0 + nop __LINE__ + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + setf.d f98 = r0 + mov f106 = f0 + } + { .mfb + setf.d f114 = r0 + mov f122 = f0 + nop __LINE__ + } + ;; + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + setf.d f67 = r0 + mov f75 = f0 + } + { .mfi + setf.d f83 = r0 + mov f91 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmf + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + setf.d f99 = r0 + mov f107 = f0 + } + { .mfi + setf.d f115 = r0 + mov f123 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f68 = r0 + mov f76 = f0 + } + { .mfi + setf.d f84 = r0 + mov f92 = f0 + adds L = 1, L + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f100 = r0 + mov f108 = f0 + } + { .mfi + setf.d f116 = r0 + mov f124 = f0 + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f69 = r0 + mov f77 = f0 + } + { .mfi + setf.d f85 = r0 + mov f93 = f0 + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f101 = r0 + mov f109 = f0 + } + { .mfi + setf.d f117 = r0 + mov f125 = f0 + tbit.z p12, p0 = L, 0 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f70 = r0 + mov f78 = f0 + } + { .mfi + setf.d f86 = r0 + mov f94 = f0 + shr L = L, 1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f102 = r0 + mov f110 = f0 + } + { .mfi + setf.d f118 = r0 + mov f126 = f0 + adds L = -1, L + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f71 = r0 + mov f79 = f0 + } + { .mfi + setf.d f87 = r0 + mov f95 = f0 + mov ar.lc = L + } + ;; + { .mmf + CPREFETCH [PREC] + setf.d f103 = r0 + mov f111 = f0 + } + { .mfb + cmp.eq p6, p0 = -1, L + mov f127 = f0 + (p6) br.cond.dpnt .L018 + } + ;; + .align 16 + +.L012: +/* 1 */ + { .mfi + lfetch.fault.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; +/* 2 */ + { .mfb + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + cmp.ne p4, p5 = 0, L + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; +/* 3 */ + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + adds C9 = 4 * SIZE, C1 + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; +/* 4 */ + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + adds C10 = 4 * SIZE, C2 + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; +/* 5 */ + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + adds C11 = 4 * SIZE, C3 + FMA f73 = f33, f49, f73 // A2 * B2 + nop __LINE__ + } + ;; +/* 6 */ + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + adds C12 = 4 * SIZE, C4 + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; +/* 7 */ + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + adds C13 = 4 * SIZE, C5 + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; +/* 8 */ + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + adds C14 = 4 * SIZE, C6 + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; +/* 9 */ + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + adds C15 = 4 * SIZE, C7 + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; +/* 10 */ + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + adds C16 = 4 * SIZE, C8 + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; +/* 11 */ + { .mfb + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f106 = f34, f53, f106 // A3 * B6 + nop __LINE__ + } + ;; +/* 12 */ + { .mfb + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f122 = f34, f55, f122 // A3 * B8 + nop __LINE__ + } + ;; +/* 13 */ + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; +/* 14 */ + { .mfb + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; +/* 15 */ + { .mfb + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f107 = f35, f53, f107 // A4 * B6 + nop __LINE__ + } + ;; +/* 16 */ + { .mfb + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f123 = f35, f55, f123 // A4 * B8 + nop __LINE__ + } + ;; +/* 17 */ + { .mfb + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; +/* 18 */ + { .mfb + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f92 = f36, f51, f92 // A5 * B4 + nop __LINE__ + } + ;; +/* 19 */ + { .mfb + nop __LINE__ + FMA f100 = f36, f52, f100 // A5 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f108 = f36, f53, f108 // A5 * B6 + nop __LINE__ + } + ;; +/* 20 */ + { .mfb + nop __LINE__ + FMA f116 = f36, f54, f116 // A5 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f124 = f36, f55, f124 // A5 * B8 + nop __LINE__ + } + ;; +/* 21 */ + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; +/* 22 */ + { .mfb + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f93 = f37, f51, f93 // A6 * B4 + nop __LINE__ + } + ;; +/* 23 */ + { .mfb + nop __LINE__ + FMA f101 = f37, f52, f101 // A6 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f109 = f37, f53, f109 // A6 * B6 + nop __LINE__ + } + ;; +/* 24 */ + { .mfb + nop __LINE__ + FMA f117 = f37, f54, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f125 = f37, f55, f125 // A6 * B8 + nop __LINE__ + } + ;; +/* 25 */ + { .mfb + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; +/* 26 */ + { .mfb + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f94 = f38, f51, f94 // A7 * B4 + nop __LINE__ + } + ;; +/* 27 */ + { .mfb + nop __LINE__ + FMA f102 = f38, f52, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f110 = f38, f53, f110 // A7 * B6 + nop __LINE__ + } + ;; +/* 28 */ + { .mfb + nop __LINE__ + FMA f118 = f38, f54, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f126 = f38, f55, f126 // A7 * B8 + nop __LINE__ + } + ;; +/* 29 */ + { .mfb + nop __LINE__ + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; +/* 30 */ + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f95 = f39, f51, f95 // A8 * B4 + nop __LINE__ + } + ;; +/* 31 */ + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f103 = f39, f52, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f111 = f39, f53, f111 // A8 * B6 + nop __LINE__ + } + ;; +/* 32 */ + { .mfb + nop __LINE__ + FMA f119 = f39, f54, f119 // A8 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f127 = f39, f55, f127 // A8 * B8 + nop __LINE__ + } + ;; +/* 33 */ + { .mfb + nop __LINE__ + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; +/* 34 */ + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; +/* 35 */ + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; +/* 36 */ + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; +/* 37 */ + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; +/* 38 */ + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; +/* 39 */ + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; +/* 40 */ + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + nop __LINE__ + } + ;; + /* 41 */ + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; +/* 42 */ + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; +/* 43 */ + { .mfb + nop __LINE__ + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f106 = f42, f61, f106 // A3 * B6 + nop __LINE__ + } + ;; +/* 44 */ + { .mfb + nop __LINE__ + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f122 = f42, f63, f122 // A3 * B8 + nop __LINE__ + } + ;; +/* 45 */ + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; +/* 46 */ + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; +/* 47 */ + { .mfb + nop __LINE__ + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f107 = f43, f61, f107 // A4 * B6 + nop __LINE__ + } + ;; +/* 48 */ + { .mfb + nop __LINE__ + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f123 = f43, f63, f123 // A4 * B8 + nop __LINE__ + } + ;; +/* 49 */ + { .mfb + nop __LINE__ + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; +/* 50 */ + { .mfb + nop __LINE__ + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f92 = f44, f59, f92 // A5 * B4 + nop __LINE__ + } + ;; +/* 51 */ + { .mfb + nop __LINE__ + (p3) FMA f100 = f44, f60, f100 // A5 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f108 = f44, f61, f108 // A5 * B6 + nop __LINE__ + } + ;; +/* 52 */ + { .mfb + nop __LINE__ + (p3) FMA f116 = f44, f62, f116 // A5 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f124 = f44, f63, f124 // A5 * B8 + nop __LINE__ + } + ;; +/* 53 */ + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; +/* 54 */ + { .mfb + nop __LINE__ + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f93 = f45, f59, f93 // A6 * B4 + nop __LINE__ + } + ;; +/* 55 */ + { .mfb + nop __LINE__ + (p3) FMA f101 = f45, f60, f101 // A6 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f109 = f45, f61, f109 // A6 * B6 + nop __LINE__ + } + ;; +/* 56 */ + { .mfb + nop __LINE__ + (p3) FMA f117 = f45, f62, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f125 = f45, f63, f125 // A6 * B8 + nop __LINE__ + } + ;; +/* 57 */ + { .mfb + nop __LINE__ + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; +/* 58 */ + { .mfb + nop __LINE__ + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f94 = f46, f59, f94 // A7 * B4 + nop __LINE__ + } + ;; +/* 59 */ + { .mfb + nop __LINE__ + (p3) FMA f102 = f46, f60, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f110 = f46, f61, f110 // A7 * B6 + nop __LINE__ + } + ;; +/* 60 */ + { .mfb + nop __LINE__ + (p3) FMA f118 = f46, f62, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f126 = f46, f63, f126 // A7 * B8 + nop __LINE__ + } + ;; +/* 61 */ + { .mfb + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + nop __LINE__ + } + ;; +/* 62 */ + { .mfb + nop __LINE__ + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f95 = f47, f59, f95 // A8 * B4 + nop __LINE__ + } + ;; +/* 63 */ + { .mfb + nop __LINE__ + (p3) FMA f103 = f47, f60, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f111 = f47, f61, f111 // A8 * B6 + nop __LINE__ + } + ;; +/* 64 */ + { .mfi + nop __LINE__ + (p3) FMA f119 = f47, f62, f119 // A8 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f127 = f47, f63, f127 // A8 * B8 + br.cloop.sptk.few .L012 + } + ;; + +.L018: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -8, KK +#else + adds r2 = -8, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 3, AORIG + shladd BOFFSET = r2, 3, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FSUB f64 = f32, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f72 = f33, f72 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + FSUB f80 = f34, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f88 = f35, f88 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + FSUB f96 = f36, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f104 = f37, f104 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + FSUB f112 = f38, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f120 = f39, f120 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FSUB f65 = f40, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f73 = f41, f73 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FSUB f81 = f42, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f89 = f43, f89 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FSUB f97 = f44, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f105 = f45, f105 + nop __LINE__ + } + ;; + { .mfi + LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FSUB f113 = f46, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f121 = f47, f121 + nop __LINE__ + } + ;; + { .mfi + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + FSUB f66 = f48, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f74 = f49, f74 + nop __LINE__ + } + ;; + { .mfi + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + FSUB f82 = f50, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f90 = f51, f90 + nop __LINE__ + } + ;; + { .mfi + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + FSUB f98 = f52, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f106 = f53, f106 + nop __LINE__ + } + ;; + { .mfi + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + FSUB f114 = f54, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f122 = f55, f122 + nop __LINE__ + } + ;; + { .mfi + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + FSUB f67 = f56, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f75 = f57, f75 + nop __LINE__ + } + ;; + { .mfi + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + FSUB f83 = f58, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f91 = f59, f91 + nop __LINE__ + } + ;; + { .mfi + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + FSUB f99 = f60, f99 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f107 = f61, f107 + nop __LINE__ + } + ;; + { .mfi + LDFPD f46, f47 = [BOFFSET], 2 * SIZE + FSUB f115 = f62, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f123 = f63, f123 + nop __LINE__ + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FSUB f68 = f32, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f76 = f33, f76 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + FSUB f84 = f34, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f92 = f35, f92 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + FSUB f100 = f36, f100 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f108 = f37, f108 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + FSUB f116 = f38, f116 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f124 = f39, f124 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FSUB f69 = f40, f69 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f77 = f41, f77 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FSUB f85 = f42, f85 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f93 = f43, f93 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FSUB f101 = f44, f101 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f109 = f45, f109 + nop __LINE__ + } + ;; + { .mfi + LDFPD f62, f63 = [BOFFSET] + FSUB f117 = f46, f117 + adds BOFFSET = -62 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FSUB f125 = f47, f125 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f70 = f48, f70 +#ifdef LN + adds AOFFSET = 62 * SIZE, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FSUB f78 = f49, f78 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f86 = f50, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f94 = f51, f94 + nop __LINE__ + } + ;; + { .mfi +#ifdef LN + LDFPD f33, f32 = [AOFFSET] +#else + LDFPD f32, f33 = [AOFFSET] +#endif + FSUB f102 = f52, f102 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f110 = f53, f110 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f118 = f54, f118 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f126 = f55, f126 +#ifdef LN + adds AOFFSET = - 2 * SIZE, AOFFSET +#else + adds AOFFSET = 2 * SIZE, AOFFSET +#endif + } + ;; + { .mfi + nop __LINE__ + FSUB f71 = f56, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f79 = f57, f79 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f87 = f58, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f95 = f59, f95 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f103 = f60, f103 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f111 = f61, f111 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f119 = f62, f119 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f127 = f63, f127 + nop __LINE__ + } + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f48, f49 = [AOFFSET], 2 * SIZE + FSUB f64 = f32, f64 + } + { .mfi + FSUB f65 = f33, f65 + } + ;; + { .mfi + LDFPD f50, f51 = [AOFFSET], 2 * SIZE + FSUB f66 = f34, f66 + } + { .mfi + FSUB f67 = f35, f67 + } + ;; + { .mfi + LDFPD f52, f53 = [AOFFSET], 2 * SIZE + FSUB f68 = f36, f68 + } + { .mfi + FSUB f69 = f37, f69 + } + ;; + { .mfi + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + FSUB f70 = f38, f70 + } + { .mfi + FSUB f71 = f39, f71 + } + ;; + { .mfi + LDFPD f56, f57 = [AOFFSET], 2 * SIZE + FSUB f72 = f40, f72 + } + { .mfi + FSUB f73 = f41, f73 + } + ;; + { .mfi + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + FSUB f74 = f42, f74 + } + { .mfi + FSUB f75 = f43, f75 + } + ;; + { .mfi + LDFPD f60, f61 = [AOFFSET], 2 * SIZE + FSUB f76 = f44, f76 + } + { .mfi + FSUB f77 = f45, f77 + } + ;; + { .mfi + LDFPD f62, f63 = [AOFFSET], 2 * SIZE + FSUB f78 = f46, f78 + } + { .mfi + FSUB f79 = f47, f79 + } + ;; + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FSUB f80 = f48, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f81 = f49, f81 + nop __LINE__ + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + FSUB f82 = f50, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f83 = f51, f83 + nop __LINE__ + } + ;; + { .mfi + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + FSUB f84 = f52, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f85 = f53, f85 + nop __LINE__ + } + ;; + { .mfi + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + FSUB f86 = f54, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f87 = f55, f87 + nop __LINE__ + } + ;; + { .mfi + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FSUB f88 = f56, f88 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f89 = f57, f89 + nop __LINE__ + } + ;; + { .mfi + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FSUB f90 = f58, f90 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f91 = f59, f91 + nop __LINE__ + } + ;; + { .mfi + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FSUB f92 = f60, f92 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f93 = f61, f93 + nop __LINE__ + } + ;; + { .mfi + LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FSUB f94 = f62, f94 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f95 = f63, f95 + nop __LINE__ + } + ;; + { .mfi + LDFPD f48, f49 = [AOFFSET], 2 * SIZE + FSUB f96 = f32, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f97 = f33, f97 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [AOFFSET], 2 * SIZE + FSUB f98 = f34, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f99 = f35, f99 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f53 = [AOFFSET], 2 * SIZE + FSUB f100 = f36, f100 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f101 = f37, f101 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + FSUB f102 = f38, f102 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f103 = f39, f103 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [AOFFSET], 2 * SIZE + FSUB f104 = f40, f104 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f105 = f41, f105 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + FSUB f106 = f42, f106 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f107 = f43, f107 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [AOFFSET], 2 * SIZE + FSUB f108 = f44, f108 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f109 = f45, f109 + nop __LINE__ + } + ;; + { .mfi + LDFPD f62, f63 = [AOFFSET] + FSUB f110 = f46, f110 + adds AOFFSET = -62 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FSUB f111 = f47, f111 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f112 = f48, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f113 = f49, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f114 = f50, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f115 = f51, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f116 = f52, f116 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f117 = f53, f117 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f118 = f54, f118 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f119 = f55, f119 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f120 = f56, f120 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f121 = f57, f121 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f122 = f58, f122 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f123 = f59, f123 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f124 = f60, f124 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f125 = f61, f125 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f126 = f62, f126 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f127 = f63, f127 + nop __LINE__ + } + ;; +#endif + +#ifdef LN + { .mfi + LDFPD f35, f34 = [AOFFSET] + FMPY f71 = f71, f32 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f103 = f103, f32 + adds BOFFSET2 = 4 * SIZE, BOFFSET + } + ;; + { .mfi + LDFPD f37, f36 = [AOFFSET] + FMPY f79 = f79, f32 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f111 = f111, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f39, f38 = [AOFFSET] + FMPY f87 = f87, f32 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f119 = f119, f32 + nop __LINE__ + } + ;; + { .mfi + LDFD f40 = [AOFFSET], -2 * SIZE + FMPY f95 = f95, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f127 = f127, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f42, f41 = [AOFFSET] + FNMA f70 = f71, f33, f70 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f102 = f103, f33, f102 + nop __LINE__ + } + ;; + { .mfi + LDFPD f44, f43 = [AOFFSET] + FNMA f78 = f79, f33, f78 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f110 = f111, f33, f110 + nop __LINE__ + } + ;; + { .mfi + LDFPD f46, f45 = [AOFFSET] + FNMA f86 = f87, f33, f86 + adds AOFFSET = - 4 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f118 = f119, f33, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f48, f47 = [AOFFSET] + FNMA f94 = f95, f33, f94 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f126 = f127, f33, f126 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f49 = [AOFFSET] + FNMA f69 = f71, f34, f69 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f101 = f103, f34, f101 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f51 = [AOFFSET] + FNMA f77 = f79, f34, f77 + adds AOFFSET = - 4 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f109 = f111, f34, f109 + nop __LINE__ + } + ;; + { .mfi + LDFD f53 = [AOFFSET], -2 * SIZE + FNMA f85 = f87, f34, f85 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f117 = f119, f34, f117 + nop __LINE__ + } + ;; + { .mfi + LDFPD f55, f54 = [AOFFSET] + FNMA f93 = f95, f34, f93 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f125 = f127, f34, f125 + nop __LINE__ + } + ;; + { .mfi + LDFPD f57, f56 = [AOFFSET] + FNMA f68 = f71, f35, f68 + adds AOFFSET = - 6 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f100 = f103, f35, f100 + nop __LINE__ + } + ;; + { .mfi + LDFPD f59, f58 = [AOFFSET] + FNMA f76 = f79, f35, f76 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f108 = f111, f35, f108 + nop __LINE__ + } + ;; + { .mfi + LDFPD f61, f60 = [AOFFSET] + FNMA f84 = f87, f35, f84 + adds AOFFSET = - 6 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f116 = f119, f35, f116 + nop __LINE__ + } + ;; + { .mfi + LDFD f16 = [AOFFSET], -2 * SIZE + FNMA f92 = f95, f35, f92 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f124 = f127, f35, f124 + nop __LINE__ + } + ;; + { .mfi + LDFPD f18, f17 = [AOFFSET] + FNMA f67 = f71, f36, f67 + adds AOFFSET = - 8 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f99 = f103, f36, f99 + nop __LINE__ + } + ;; + { .mfi + LDFPD f20, f19 = [AOFFSET] + FNMA f75 = f79, f36, f75 + adds AOFFSET = - 8 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f107 = f111, f36, f107 + nop __LINE__ + } + ;; + { .mfi + LDFD f21 = [AOFFSET] + FNMA f83 = f87, f36, f83 + adds BOFFSET = 56 * SIZE, BOFFSET + } + { .mfi + FNMA f115 = f119, f36, f115 + adds BOFFSET2 = 56 * SIZE, BOFFSET2 + } + ;; + FNMA f91 = f95, f36, f91 + FNMA f123 = f127, f36, f123 + ;; + FNMA f66 = f71, f37, f66 + FNMA f98 = f103, f37, f98 + FNMA f74 = f79, f37, f74 + FNMA f106 = f111, f37, f106 + FNMA f82 = f87, f37, f82 + FNMA f114 = f119, f37, f114 + FNMA f90 = f95, f37, f90 + FNMA f122 = f127, f37, f122 + ;; + FNMA f65 = f71, f38, f65 + FNMA f97 = f103, f38, f97 + FNMA f73 = f79, f38, f73 + FNMA f105 = f111, f38, f105 + FNMA f81 = f87, f38, f81 + FNMA f113 = f119, f38, f113 + FNMA f89 = f95, f38, f89 + FNMA f121 = f127, f38, f121 + ;; + FNMA f64 = f71, f39, f64 + FNMA f96 = f103, f39, f96 + FNMA f72 = f79, f39, f72 + FNMA f104 = f111, f39, f104 + FNMA f80 = f87, f39, f80 + FNMA f112 = f119, f39, f112 + FNMA f88 = f95, f39, f88 + FNMA f120 = f127, f39, f120 + ;; + FMPY f70 = f70, f40 + FMPY f102 = f102, f40 + FMPY f78 = f78, f40 + FMPY f110 = f110, f40 + FMPY f86 = f86, f40 + FMPY f118 = f118, f40 + FMPY f94 = f94, f40 + FMPY f126 = f126, f40 + ;; + FNMA f69 = f70, f41, f69 + FNMA f101 = f102, f41, f101 + FNMA f77 = f78, f41, f77 + FNMA f109 = f110, f41, f109 + FNMA f85 = f86, f41, f85 + FNMA f117 = f118, f41, f117 + FNMA f93 = f94, f41, f93 + FNMA f125 = f126, f41, f125 + ;; + FNMA f68 = f70, f42, f68 + FNMA f100 = f102, f42, f100 + FNMA f76 = f78, f42, f76 + FNMA f108 = f110, f42, f108 + FNMA f84 = f86, f42, f84 + FNMA f116 = f118, f42, f116 + FNMA f92 = f94, f42, f92 + FNMA f124 = f126, f42, f124 + ;; + FNMA f67 = f70, f43, f67 + FNMA f99 = f102, f43, f99 + FNMA f75 = f78, f43, f75 + FNMA f107 = f110, f43, f107 + FNMA f83 = f86, f43, f83 + FNMA f115 = f118, f43, f115 + FNMA f91 = f94, f43, f91 + FNMA f123 = f126, f43, f123 + ;; + FNMA f66 = f70, f44, f66 + FNMA f98 = f102, f44, f98 + FNMA f74 = f78, f44, f74 + FNMA f106 = f110, f44, f106 + FNMA f82 = f86, f44, f82 + FNMA f114 = f118, f44, f114 + FNMA f90 = f94, f44, f90 + FNMA f122 = f126, f44, f122 + ;; + FNMA f65 = f70, f45, f65 + FNMA f97 = f102, f45, f97 + FNMA f73 = f78, f45, f73 + FNMA f105 = f110, f45, f105 + FNMA f81 = f86, f45, f81 + FNMA f113 = f118, f45, f113 + FNMA f89 = f94, f45, f89 + FNMA f121 = f126, f45, f121 + ;; + FNMA f64 = f70, f46, f64 + FNMA f96 = f102, f46, f96 + FNMA f72 = f78, f46, f72 + FNMA f104 = f110, f46, f104 + FNMA f80 = f86, f46, f80 + FNMA f112 = f118, f46, f112 + FNMA f88 = f94, f46, f88 + FNMA f120 = f126, f46, f120 + ;; + FMPY f69 = f69, f47 + FMPY f101 = f101, f47 + FMPY f77 = f77, f47 + FMPY f109 = f109, f47 + FMPY f85 = f85, f47 + FMPY f117 = f117, f47 + FMPY f93 = f93, f47 + FMPY f125 = f125, f47 + ;; + FNMA f68 = f69, f48, f68 + FNMA f100 = f101, f48, f100 + FNMA f76 = f77, f48, f76 + FNMA f108 = f109, f48, f108 + FNMA f84 = f85, f48, f84 + FNMA f116 = f117, f48, f116 + FNMA f92 = f93, f48, f92 + FNMA f124 = f125, f48, f124 + ;; + FNMA f67 = f69, f49, f67 + FNMA f99 = f101, f49, f99 + FNMA f75 = f77, f49, f75 + FNMA f107 = f109, f49, f107 + FNMA f83 = f85, f49, f83 + FNMA f115 = f117, f49, f115 + FNMA f91 = f93, f49, f91 + FNMA f123 = f125, f49, f123 + ;; + FNMA f66 = f69, f50, f66 + FNMA f98 = f101, f50, f98 + FNMA f74 = f77, f50, f74 + FNMA f106 = f109, f50, f106 + FNMA f82 = f85, f50, f82 + FNMA f114 = f117, f50, f114 + FNMA f90 = f93, f50, f90 + FNMA f122 = f125, f50, f122 + ;; + FNMA f65 = f69, f51, f65 + FNMA f97 = f101, f51, f97 + FNMA f73 = f77, f51, f73 + FNMA f105 = f109, f51, f105 + FNMA f81 = f85, f51, f81 + FNMA f113 = f117, f51, f113 + FNMA f89 = f93, f51, f89 + FNMA f121 = f125, f51, f121 + ;; + FNMA f64 = f69, f52, f64 + FNMA f96 = f101, f52, f96 + FNMA f72 = f77, f52, f72 + FNMA f104 = f109, f52, f104 + FNMA f80 = f85, f52, f80 + FNMA f112 = f117, f52, f112 + FNMA f88 = f93, f52, f88 + FNMA f120 = f125, f52, f120 + ;; + FMPY f68 = f68, f53 + FMPY f100 = f100, f53 + FMPY f76 = f76, f53 + FMPY f108 = f108, f53 + FMPY f84 = f84, f53 + FMPY f116 = f116, f53 + FMPY f92 = f92, f53 + FMPY f124 = f124, f53 + ;; + FNMA f67 = f68, f54, f67 + FNMA f99 = f100, f54, f99 + FNMA f75 = f76, f54, f75 + FNMA f107 = f108, f54, f107 + FNMA f83 = f84, f54, f83 + FNMA f115 = f116, f54, f115 + FNMA f91 = f92, f54, f91 + FNMA f123 = f124, f54, f123 + ;; + FNMA f66 = f68, f55, f66 + FNMA f98 = f100, f55, f98 + FNMA f74 = f76, f55, f74 + FNMA f106 = f108, f55, f106 + FNMA f82 = f84, f55, f82 + FNMA f114 = f116, f55, f114 + FNMA f90 = f92, f55, f90 + FNMA f122 = f124, f55, f122 + ;; + FNMA f65 = f68, f56, f65 + FNMA f97 = f100, f56, f97 + FNMA f73 = f76, f56, f73 + FNMA f105 = f108, f56, f105 + FNMA f81 = f84, f56, f81 + FNMA f113 = f116, f56, f113 + FNMA f89 = f92, f56, f89 + FNMA f121 = f124, f56, f121 + ;; + FNMA f64 = f68, f57, f64 + FNMA f96 = f100, f57, f96 + FNMA f72 = f76, f57, f72 + FNMA f104 = f108, f57, f104 + FNMA f80 = f84, f57, f80 + FNMA f112 = f116, f57, f112 + FNMA f88 = f92, f57, f88 + FNMA f120 = f124, f57, f120 + ;; + FMPY f67 = f67, f58 + FMPY f99 = f99, f58 + FMPY f75 = f75, f58 + FMPY f107 = f107, f58 + FMPY f83 = f83, f58 + FMPY f115 = f115, f58 + FMPY f91 = f91, f58 + FMPY f123 = f123, f58 + ;; + FNMA f66 = f67, f59, f66 + FNMA f98 = f99, f59, f98 + FNMA f74 = f75, f59, f74 + FNMA f106 = f107, f59, f106 + FNMA f82 = f83, f59, f82 + FNMA f114 = f115, f59, f114 + FNMA f90 = f91, f59, f90 + FNMA f122 = f123, f59, f122 + ;; + FNMA f65 = f67, f60, f65 + FNMA f97 = f99, f60, f97 + FNMA f73 = f75, f60, f73 + FNMA f105 = f107, f60, f105 + FNMA f81 = f83, f60, f81 + FNMA f113 = f115, f60, f113 + FNMA f89 = f91, f60, f89 + FNMA f121 = f123, f60, f121 + ;; + { .mfi + STFD [BOFFSET] = f71, SIZE + FNMA f64 = f67, f61, f64 + } + { .mfi + STFD [BOFFSET2] = f103, SIZE + FNMA f96 = f99, f61, f96 + } + ;; + { .mfi + STFD [BOFFSET] = f79, SIZE + FNMA f72 = f75, f61, f72 + } + { .mfi + STFD [BOFFSET2] = f111, SIZE + FNMA f104 = f107, f61, f104 + } + ;; + { .mfi + STFD [BOFFSET] = f87, SIZE + FNMA f80 = f83, f61, f80 + } + { .mfi + STFD [BOFFSET2] = f119, SIZE + FNMA f112 = f115, f61, f112 + } + ;; + { .mfi + STFD [BOFFSET] = f95, - 11 * SIZE + FNMA f88 = f91, f61, f88 + } + { .mfi + STFD [BOFFSET2] = f127, - 11 * SIZE + FNMA f120 = f123, f61, f120 + } + ;; + { .mfi + STFD [BOFFSET] = f70, SIZE + FMPY f66 = f66, f16 + } + { .mfi + STFD [BOFFSET2] = f102, SIZE + FMPY f98 = f98, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f78, SIZE + FMPY f74 = f74, f16 + } + { .mfi + STFD [BOFFSET2] = f110, SIZE + FMPY f106 = f106, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f86, SIZE + FMPY f82 = f82, f16 + } + { .mfi + STFD [BOFFSET2] = f118, SIZE + FMPY f114 = f114, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f94, - 11 * SIZE + FMPY f90 = f90, f16 + } + { .mfi + STFD [BOFFSET2] = f126, - 11 * SIZE + FMPY f122 = f122, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f69, SIZE + FNMA f65 = f66, f17, f65 + } + { .mfi + STFD [BOFFSET2] = f101, SIZE + FNMA f97 = f98, f17, f97 + } + ;; + { .mfi + STFD [BOFFSET] = f77, SIZE + FNMA f73 = f74, f17, f73 + } + { .mfi + STFD [BOFFSET2] = f109, SIZE + FNMA f105 = f106, f17, f105 + } + ;; + { .mfi + STFD [BOFFSET] = f85, SIZE + FNMA f81 = f82, f17, f81 + } + { .mfi + STFD [BOFFSET2] = f117, SIZE + FNMA f113 = f114, f17, f113 + } + ;; + { .mfi + STFD [BOFFSET] = f93, - 11 * SIZE + FNMA f89 = f90, f17, f89 + } + { .mfi + STFD [BOFFSET2] = f125, - 11 * SIZE + FNMA f121 = f122, f17, f121 + } + ;; + { .mfi + STFD [BOFFSET] = f68, SIZE + FNMA f64 = f66, f18, f64 + } + { .mfi + STFD [BOFFSET2] = f100, SIZE + FNMA f96 = f98, f18, f96 + } + ;; + { .mfi + STFD [BOFFSET] = f76, SIZE + FNMA f72 = f74, f18, f72 + } + { .mfi + STFD [BOFFSET2] = f108, SIZE + FNMA f104 = f106, f18, f104 + } + ;; + { .mfi + STFD [BOFFSET] = f84, SIZE + FNMA f80 = f82, f18, f80 + } + { .mfi + STFD [BOFFSET2] = f116, SIZE + FNMA f112 = f114, f18, f112 + } + ;; + { .mfi + STFD [BOFFSET] = f92, - 11 * SIZE + FNMA f88 = f90, f18, f88 + } + { .mfi + STFD [BOFFSET2] = f124, - 11 * SIZE + FNMA f120 = f122, f18, f120 + } + ;; + { .mfi + STFD [BOFFSET] = f67, SIZE + FMPY f65 = f65, f19 + } + { .mfi + STFD [BOFFSET2] = f99, SIZE + FMPY f97 = f97, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f75, SIZE + FMPY f73 = f73, f19 + } + { .mfi + STFD [BOFFSET2] = f107, SIZE + FMPY f105 = f105, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f83, SIZE + FMPY f81 = f81, f19 + } + { .mfi + STFD [BOFFSET2] = f115, SIZE + FMPY f113 = f113, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f91, - 11 * SIZE + FMPY f89 = f89, f19 + } + { .mfi + STFD [BOFFSET2] = f123, - 11 * SIZE + FMPY f121 = f121, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + FNMA f64 = f65, f20, f64 + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + FNMA f96 = f97, f20, f96 + } + ;; + { .mfi + STFD [BOFFSET] = f74, SIZE + FNMA f72 = f73, f20, f72 + } + { .mfi + STFD [BOFFSET2] = f106, SIZE + FNMA f104 = f105, f20, f104 + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + FNMA f80 = f81, f20, f80 + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + FNMA f112 = f113, f20, f112 + } + ;; + { .mfi + STFD [BOFFSET] = f90, -11 * SIZE + FNMA f88 = f89, f20, f88 + } + { .mfi + STFD [BOFFSET2] = f122, -11 * SIZE + FNMA f120 = f121, f20, f120 + } + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + FMPY f64 = f64, f21 + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + FMPY f96 = f96, f21 + } + ;; + { .mfi + STFD [BOFFSET] = f73, SIZE + FMPY f72 = f72, f21 + } + { .mfi + STFD [BOFFSET2] = f105, SIZE + FMPY f104 = f104, f21 + } + ;; + { .mfi + STFD [BOFFSET] = f81, SIZE + FMPY f80 = f80, f21 + } + { .mfi + STFD [BOFFSET2] = f113, SIZE + FMPY f112 = f112, f21 + } + ;; + { .mfi + STFD [BOFFSET] = f89, - 11 * SIZE + FMPY f88 = f88, f21 + } + { .mfi + STFD [BOFFSET2] = f121, - 11 * SIZE + FMPY f120 = f120, f21 + } + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + adds C1 = -8 * SIZE, C1 + } + ;; + { .mmi + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f104, SIZE + adds C2 = -8 * SIZE, C2 + } + ;; + { .mmi + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f88, - 3 * SIZE + STFD [BOFFSET2] = f120, - 3 * SIZE + adds C9 = 4 * SIZE, C1 + } + ;; +#endif + +#ifdef LT + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + FMPY f64 = f64, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f96 = f96, f32 + adds BOFFSET2 = 4 * SIZE, BOFFSET + } + ;; + { .mfi + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + FMPY f72 = f72, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f104 = f104, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f38, f39 = [AOFFSET] + FMPY f80 = f80, f32 + adds AOFFSET = 3 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f112 = f112, f32 + nop __LINE__ + } + ;; + { .mfi + LDFD f40 = [AOFFSET], 1 * SIZE + FMPY f88 = f88, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f120 = f120, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f41, f42 = [AOFFSET], 2 * SIZE + FNMA f65 = f64, f33, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f97 = f96, f33, f97 + nop __LINE__ + } + ;; + { .mfi + LDFPD f43, f44 = [AOFFSET], 2 * SIZE + FNMA f73 = f72, f33, f73 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f105 = f104, f33, f105 + nop __LINE__ + } + ;; + { .mfi + LDFPD f45, f46 = [AOFFSET] + FNMA f81 = f80, f33, f81 + adds AOFFSET = 4 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f113 = f112, f33, f113 + nop __LINE__ + } + ;; + { .mfi + LDFPD f47, f48 = [AOFFSET], 2 * SIZE + FNMA f89 = f88, f33, f89 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f121 = f120, f33, f121 + nop __LINE__ + } + ;; + { .mfi + LDFPD f49, f50 = [AOFFSET], 2 * SIZE + FNMA f66 = f64, f34, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f98 = f96, f34, f98 + nop __LINE__ + } + ;; + { .mfi + LDFPD f51, f52 = [AOFFSET] + FNMA f74 = f72, f34, f74 + adds AOFFSET = 5 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f106 = f104, f34, f106 + nop __LINE__ + } + ;; + { .mfi + LDFD f53 = [AOFFSET], 1 * SIZE + FNMA f82 = f80, f34, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f114 = f112, f34, f114 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + FNMA f90 = f88, f34, f90 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f122 = f120, f34, f122 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [AOFFSET] + FNMA f67 = f64, f35, f67 + adds AOFFSET = 6 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f99 = f96, f35, f99 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + FNMA f75 = f72, f35, f75 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f107 = f104, f35, f107 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [AOFFSET] + FNMA f83 = f80, f35, f83 + adds AOFFSET = 7 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f115 = f112, f35, f115 + nop __LINE__ + } + ;; + { .mfi + LDFD f16 = [AOFFSET], 1 * SIZE + FNMA f91 = f88, f35, f91 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f123 = f120, f35, f123 + nop __LINE__ + } + ;; + { .mfi + LDFPD f17, f18 = [AOFFSET] + FNMA f68 = f64, f36, f68 + adds AOFFSET = 8 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f100 = f96, f36, f100 + nop __LINE__ + } + ;; + { .mfi + LDFPD f19, f20 = [AOFFSET] + FNMA f76 = f72, f36, f76 + adds AOFFSET = 9 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f108 = f104, f36, f108 + nop __LINE__ + } + ;; + { .mfi + LDFD f21 = [AOFFSET] + FNMA f84 = f80, f36, f84 + adds AOFFSET = -63 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f116 = f112, f36, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f92 = f88, f36, f92 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f124 = f120, f36, f124 + nop __LINE__ + } + ;; + FNMA f69 = f64, f37, f69 + FNMA f101 = f96, f37, f101 + FNMA f77 = f72, f37, f77 + FNMA f109 = f104, f37, f109 + FNMA f85 = f80, f37, f85 + FNMA f117 = f112, f37, f117 + FNMA f93 = f88, f37, f93 + FNMA f125 = f120, f37, f125 + ;; + FNMA f70 = f64, f38, f70 + FNMA f102 = f96, f38, f102 + FNMA f78 = f72, f38, f78 + FNMA f110 = f104, f38, f110 + FNMA f86 = f80, f38, f86 + FNMA f118 = f112, f38, f118 + FNMA f94 = f88, f38, f94 + FNMA f126 = f120, f38, f126 + ;; + FNMA f71 = f64, f39, f71 + FNMA f103 = f96, f39, f103 + FNMA f79 = f72, f39, f79 + FNMA f111 = f104, f39, f111 + FNMA f87 = f80, f39, f87 + FNMA f119 = f112, f39, f119 + FNMA f95 = f88, f39, f95 + FNMA f127 = f120, f39, f127 + ;; + FMPY f65 = f65, f40 + FMPY f97 = f97, f40 + FMPY f73 = f73, f40 + FMPY f105 = f105, f40 + FMPY f81 = f81, f40 + FMPY f113 = f113, f40 + FMPY f89 = f89, f40 + FMPY f121 = f121, f40 + ;; + FNMA f66 = f65, f41, f66 + FNMA f98 = f97, f41, f98 + FNMA f74 = f73, f41, f74 + FNMA f106 = f105, f41, f106 + FNMA f82 = f81, f41, f82 + FNMA f114 = f113, f41, f114 + FNMA f90 = f89, f41, f90 + FNMA f122 = f121, f41, f122 + FNMA f67 = f65, f42, f67 + FNMA f99 = f97, f42, f99 + FNMA f75 = f73, f42, f75 + FNMA f107 = f105, f42, f107 + FNMA f83 = f81, f42, f83 + FNMA f115 = f113, f42, f115 + FNMA f91 = f89, f42, f91 + FNMA f123 = f121, f42, f123 + ;; + FNMA f68 = f65, f43, f68 + FNMA f100 = f97, f43, f100 + FNMA f76 = f73, f43, f76 + FNMA f108 = f105, f43, f108 + FNMA f84 = f81, f43, f84 + FNMA f116 = f113, f43, f116 + FNMA f92 = f89, f43, f92 + FNMA f124 = f121, f43, f124 + ;; + FNMA f69 = f65, f44, f69 + FNMA f101 = f97, f44, f101 + FNMA f77 = f73, f44, f77 + FNMA f109 = f105, f44, f109 + FNMA f85 = f81, f44, f85 + FNMA f117 = f113, f44, f117 + FNMA f93 = f89, f44, f93 + FNMA f125 = f121, f44, f125 + ;; + FNMA f70 = f65, f45, f70 + FNMA f102 = f97, f45, f102 + FNMA f78 = f73, f45, f78 + FNMA f110 = f105, f45, f110 + FNMA f86 = f81, f45, f86 + FNMA f118 = f113, f45, f118 + FNMA f94 = f89, f45, f94 + FNMA f126 = f121, f45, f126 + ;; + FNMA f71 = f65, f46, f71 + FNMA f103 = f97, f46, f103 + FNMA f79 = f73, f46, f79 + FNMA f111 = f105, f46, f111 + FNMA f87 = f81, f46, f87 + FNMA f119 = f113, f46, f119 + FNMA f95 = f89, f46, f95 + FNMA f127 = f121, f46, f127 + ;; + FMPY f66 = f66, f47 + FMPY f98 = f98, f47 + FMPY f74 = f74, f47 + FMPY f106 = f106, f47 + FMPY f82 = f82, f47 + FMPY f114 = f114, f47 + FMPY f90 = f90, f47 + FMPY f122 = f122, f47 + ;; + FNMA f67 = f66, f48, f67 + FNMA f99 = f98, f48, f99 + FNMA f75 = f74, f48, f75 + FNMA f107 = f106, f48, f107 + FNMA f83 = f82, f48, f83 + FNMA f115 = f114, f48, f115 + FNMA f91 = f90, f48, f91 + FNMA f123 = f122, f48, f123 + FNMA f68 = f66, f49, f68 + FNMA f100 = f98, f49, f100 + FNMA f76 = f74, f49, f76 + FNMA f108 = f106, f49, f108 + FNMA f84 = f82, f49, f84 + FNMA f116 = f114, f49, f116 + FNMA f92 = f90, f49, f92 + FNMA f124 = f122, f49, f124 + ;; + FNMA f69 = f66, f50, f69 + FNMA f101 = f98, f50, f101 + FNMA f77 = f74, f50, f77 + FNMA f109 = f106, f50, f109 + FNMA f85 = f82, f50, f85 + FNMA f117 = f114, f50, f117 + FNMA f93 = f90, f50, f93 + FNMA f125 = f122, f50, f125 + ;; + FNMA f70 = f66, f51, f70 + FNMA f102 = f98, f51, f102 + FNMA f78 = f74, f51, f78 + FNMA f110 = f106, f51, f110 + FNMA f86 = f82, f51, f86 + FNMA f118 = f114, f51, f118 + FNMA f94 = f90, f51, f94 + FNMA f126 = f122, f51, f126 + ;; + FNMA f71 = f66, f52, f71 + FNMA f103 = f98, f52, f103 + FNMA f79 = f74, f52, f79 + FNMA f111 = f106, f52, f111 + FNMA f87 = f82, f52, f87 + FNMA f119 = f114, f52, f119 + FNMA f95 = f90, f52, f95 + FNMA f127 = f122, f52, f127 + ;; + FMPY f67 = f67, f53 + FMPY f99 = f99, f53 + FMPY f75 = f75, f53 + FMPY f107 = f107, f53 + FMPY f83 = f83, f53 + FMPY f115 = f115, f53 + FMPY f91 = f91, f53 + FMPY f123 = f123, f53 + ;; + FNMA f68 = f67, f54, f68 + FNMA f100 = f99, f54, f100 + FNMA f76 = f75, f54, f76 + FNMA f108 = f107, f54, f108 + FNMA f84 = f83, f54, f84 + FNMA f116 = f115, f54, f116 + FNMA f92 = f91, f54, f92 + FNMA f124 = f123, f54, f124 + ;; + FNMA f69 = f67, f55, f69 + FNMA f101 = f99, f55, f101 + FNMA f77 = f75, f55, f77 + FNMA f109 = f107, f55, f109 + FNMA f85 = f83, f55, f85 + FNMA f117 = f115, f55, f117 + FNMA f93 = f91, f55, f93 + FNMA f125 = f123, f55, f125 + ;; + FNMA f70 = f67, f56, f70 + FNMA f102 = f99, f56, f102 + FNMA f78 = f75, f56, f78 + FNMA f110 = f107, f56, f110 + FNMA f86 = f83, f56, f86 + FNMA f118 = f115, f56, f118 + FNMA f94 = f91, f56, f94 + FNMA f126 = f123, f56, f126 + ;; + FNMA f71 = f67, f57, f71 + FNMA f103 = f99, f57, f103 + FNMA f79 = f75, f57, f79 + FNMA f111 = f107, f57, f111 + FNMA f87 = f83, f57, f87 + FNMA f119 = f115, f57, f119 + FNMA f95 = f91, f57, f95 + FNMA f127 = f123, f57, f127 + ;; + FMPY f68 = f68, f58 + FMPY f100 = f100, f58 + FMPY f76 = f76, f58 + FMPY f108 = f108, f58 + FMPY f84 = f84, f58 + FMPY f116 = f116, f58 + FMPY f92 = f92, f58 + FMPY f124 = f124, f58 + ;; + FNMA f69 = f68, f59, f69 + FNMA f101 = f100, f59, f101 + FNMA f77 = f76, f59, f77 + FNMA f109 = f108, f59, f109 + FNMA f85 = f84, f59, f85 + FNMA f117 = f116, f59, f117 + FNMA f93 = f92, f59, f93 + FNMA f125 = f124, f59, f125 + ;; + FNMA f70 = f68, f60, f70 + FNMA f102 = f100, f60, f102 + FNMA f78 = f76, f60, f78 + FNMA f110 = f108, f60, f110 + FNMA f86 = f84, f60, f86 + FNMA f118 = f116, f60, f118 + FNMA f94 = f92, f60, f94 + FNMA f126 = f124, f60, f126 + ;; + { .mfi + STFD [BOFFSET] = f64, SIZE + FNMA f71 = f68, f61, f71 + } + { .mfi + STFD [BOFFSET2] = f96, SIZE + FNMA f103 = f100, f61, f103 + } + ;; + { .mfi + STFD [BOFFSET] = f72, SIZE + FNMA f79 = f76, f61, f79 + } + { .mfi + STFD [BOFFSET2] = f104, SIZE + FNMA f111 = f108, f61, f111 + } + ;; + { .mfi + STFD [BOFFSET] = f80, SIZE + FNMA f87 = f84, f61, f87 + } + { .mfi + STFD [BOFFSET2] = f112, SIZE + FNMA f119 = f116, f61, f119 + } + ;; + { .mfi + STFD [BOFFSET] = f88, 5 * SIZE + FNMA f95 = f92, f61, f95 + } + { .mfi + STFD [BOFFSET2] = f120, 5 * SIZE + FNMA f127 = f124, f61, f127 + } + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + FMPY f69 = f69, f16 + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + FMPY f101 = f101, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f73, SIZE + FMPY f77 = f77, f16 + } + { .mfi + STFD [BOFFSET2] = f105, SIZE + FMPY f109 = f109, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f81, SIZE + FMPY f85 = f85, f16 + } + { .mfi + STFD [BOFFSET2] = f113, SIZE + FMPY f117 = f117, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f89, 5 * SIZE + FMPY f93 = f93, f16 + } + { .mfi + STFD [BOFFSET2] = f121, 5 * SIZE + FMPY f125 = f125, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + FNMA f70 = f69, f17, f70 + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + FNMA f102 = f101, f17, f102 + } + ;; + { .mfi + STFD [BOFFSET] = f74, SIZE + FNMA f78 = f77, f17, f78 + } + { .mfi + STFD [BOFFSET2] = f106, SIZE + FNMA f110 = f109, f17, f110 + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + FNMA f86 = f85, f17, f86 + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + FNMA f118 = f117, f17, f118 + } + ;; + { .mfi + STFD [BOFFSET] = f90, 5 * SIZE + FNMA f94 = f93, f17, f94 + } + { .mfi + STFD [BOFFSET2] = f122, 5 * SIZE + FNMA f126 = f125, f17, f126 + } + ;; + { .mfi + STFD [BOFFSET] = f67, SIZE + FNMA f71 = f69, f18, f71 + } + { .mfi + STFD [BOFFSET2] = f99, SIZE + FNMA f103 = f101, f18, f103 + } + ;; + { .mfi + STFD [BOFFSET] = f75, SIZE + FNMA f79 = f77, f18, f79 + } + { .mfi + STFD [BOFFSET2] = f107, SIZE + FNMA f111 = f109, f18, f111 + } + ;; + { .mfi + STFD [BOFFSET] = f83, SIZE + FNMA f87 = f85, f18, f87 + } + { .mfi + STFD [BOFFSET2] = f115, SIZE + FNMA f119 = f117, f18, f119 + } + ;; + { .mfi + STFD [BOFFSET] = f91, 5 * SIZE + FNMA f95 = f93, f18, f95 + } + { .mfi + STFD [BOFFSET2] = f123, 5 * SIZE + FNMA f127 = f125, f18, f127 + } + ;; + { .mfi + STFD [BOFFSET] = f68, SIZE + FMPY f70 = f70, f19 + } + { .mfi + STFD [BOFFSET2] = f100, SIZE + FMPY f102 = f102, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f76, SIZE + FMPY f78 = f78, f19 + } + { .mfi + STFD [BOFFSET2] = f108, SIZE + FMPY f110 = f110, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f84, SIZE + FMPY f86 = f86, f19 + } + { .mfi + STFD [BOFFSET2] = f116, SIZE + FMPY f118 = f118, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f92, 5 * SIZE + FMPY f94 = f94, f19 + } + { .mfi + STFD [BOFFSET2] = f124, 5 * SIZE + FMPY f126 = f126, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f69, SIZE + FNMA f71 = f70, f20, f71 + } + { .mfi + STFD [BOFFSET2] = f101, SIZE + FNMA f103 = f102, f20, f103 + } + ;; + { .mfi + STFD [BOFFSET] = f77, SIZE + FNMA f79 = f78, f20, f79 + } + { .mfi + STFD [BOFFSET2] = f109, SIZE + FNMA f111 = f110, f20, f111 + } + ;; + { .mfi + STFD [BOFFSET] = f85, SIZE + FNMA f87 = f86, f20, f87 + } + { .mfi + STFD [BOFFSET2] = f117, SIZE + FNMA f119 = f118, f20, f119 + } + ;; + { .mfi + STFD [BOFFSET] = f93, 5 * SIZE + FNMA f95 = f94, f20, f95 + } + { .mfi + STFD [BOFFSET2] = f125, 5 * SIZE + FNMA f127 = f126, f20, f127 + } + ;; + { .mfi + STFD [BOFFSET] = f70, SIZE + FMPY f71 = f71, f21 + } + { .mfi + STFD [BOFFSET2] = f102, SIZE + FMPY f103 = f103, f21 + } + ;; + { .mfi + STFD [BOFFSET] = f78, SIZE + FMPY f79 = f79, f21 + } + { .mfi + STFD [BOFFSET2] = f110, SIZE + FMPY f111 = f111, f21 + } + ;; + { .mfi + STFD [BOFFSET] = f86, SIZE + FMPY f87 = f87, f21 + } + { .mfi + STFD [BOFFSET2] = f118, SIZE + FMPY f119 = f119, f21 + } + ;; + { .mfi + STFD [BOFFSET] = f94, 5 * SIZE + FMPY f95 = f95, f21 + } + { .mfi + STFD [BOFFSET2] = f126, 5 * SIZE + FMPY f127 = f127, f21 + } + ;; + { .mmi + STFD [BOFFSET] = f71, SIZE + STFD [BOFFSET2] = f103, SIZE + } + ;; + { .mmi + STFD [BOFFSET] = f79, SIZE + STFD [BOFFSET2] = f111, SIZE + } + ;; + { .mmi + STFD [BOFFSET] = f87, SIZE + STFD [BOFFSET2] = f119, SIZE + adds C9 = 4 * SIZE, C1 + } + ;; + { .mfi + STFD [BOFFSET] = f95 + adds BOFFSET = - 59 * SIZE, BOFFSET + } + { .mfi + STFD [BOFFSET2] = f127 + adds BOFFSET2 = - 59 * SIZE, BOFFSET2 + } + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + FMPY f64 = f64, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f68 = f68, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + FMPY f65 = f65, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f69 = f69, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f38, f39 = [BOFFSET] + FMPY f66 = f66, f32 + adds BOFFSET = 3 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f70 = f70, f32 + nop __LINE__ + } + ;; + { .mfi + LDFD f40 = [BOFFSET], 1 * SIZE + FMPY f67 = f67, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f71 = f71, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f41, f42 = [BOFFSET], 2 * SIZE + FNMA f72 = f64, f33, f72 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f76 = f68, f33, f76 + nop __LINE__ + } + ;; + { .mfi + LDFPD f43, f44 = [BOFFSET], 2 * SIZE + FNMA f73 = f65, f33, f73 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f77 = f69, f33, f77 + nop __LINE__ + } + ;; + { .mfi + LDFPD f45, f46 = [BOFFSET] + FNMA f74 = f66, f33, f74 + adds BOFFSET = 4 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f78 = f70, f33, f78 + nop __LINE__ + } + ;; + { .mfi + LDFPD f47, f48 = [BOFFSET], 2 * SIZE + FNMA f75 = f67, f33, f75 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f79 = f71, f33, f79 + nop __LINE__ + } + ;; + { .mfi + LDFPD f49, f50 = [BOFFSET], 2 * SIZE + FNMA f80 = f64, f34, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f84 = f68, f34, f84 + nop __LINE__ + } + ;; + { .mfi + LDFPD f51, f52 = [BOFFSET] + FNMA f81 = f65, f34, f81 + adds BOFFSET = 5 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f85 = f69, f34, f85 + nop __LINE__ + } + ;; + { .mfi + LDFD f53 = [BOFFSET], 1 * SIZE + FNMA f82 = f66, f34, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f86 = f70, f34, f86 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + FNMA f83 = f67, f34, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f87 = f71, f34, f87 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [BOFFSET] + FNMA f88 = f64, f35, f88 + adds BOFFSET = 6 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f92 = f68, f35, f92 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FNMA f89 = f65, f35, f89 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f93 = f69, f35, f93 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [BOFFSET] + FNMA f90 = f66, f35, f90 + adds BOFFSET = 7 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f94 = f70, f35, f94 + nop __LINE__ + } + ;; + { .mfi + LDFD f16 = [BOFFSET], 1 * SIZE + FNMA f91 = f67, f35, f91 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f95 = f71, f35, f95 + nop __LINE__ + } + ;; + { .mfi + LDFPD f17, f18 = [BOFFSET] + FNMA f96 = f64, f36, f96 + adds BOFFSET = 8 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f100 = f68, f36, f100 + nop __LINE__ + } + ;; + { .mfi + LDFPD f19, f20 = [BOFFSET] + FNMA f97 = f65, f36, f97 + adds BOFFSET = 9 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f101 = f69, f36, f101 + nop __LINE__ + } + ;; + { .mfi + LDFD f21 = [BOFFSET] + FNMA f98 = f66, f36, f98 + adds BOFFSET = -63 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f102 = f70, f36, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f99 = f67, f36, f99 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f103 = f71, f36, f103 + nop __LINE__ + } + ;; + FNMA f104 = f64, f37, f104 + FNMA f108 = f68, f37, f108 + FNMA f105 = f65, f37, f105 + FNMA f109 = f69, f37, f109 + FNMA f106 = f66, f37, f106 + FNMA f110 = f70, f37, f110 + FNMA f107 = f67, f37, f107 + FNMA f111 = f71, f37, f111 + ;; + FNMA f112 = f64, f38, f112 + FNMA f116 = f68, f38, f116 + FNMA f113 = f65, f38, f113 + FNMA f117 = f69, f38, f117 + FNMA f114 = f66, f38, f114 + FNMA f118 = f70, f38, f118 + FNMA f115 = f67, f38, f115 + FNMA f119 = f71, f38, f119 + ;; + FNMA f120 = f64, f39, f120 + FNMA f124 = f68, f39, f124 + FNMA f121 = f65, f39, f121 + FNMA f125 = f69, f39, f125 + FNMA f122 = f66, f39, f122 + FNMA f126 = f70, f39, f126 + FNMA f123 = f67, f39, f123 + FNMA f127 = f71, f39, f127 + ;; + FMPY f72 = f72, f40 + FMPY f76 = f76, f40 + FMPY f73 = f73, f40 + FMPY f77 = f77, f40 + FMPY f74 = f74, f40 + FMPY f78 = f78, f40 + FMPY f75 = f75, f40 + FMPY f79 = f79, f40 + ;; + FNMA f80 = f72, f41, f80 + FNMA f84 = f76, f41, f84 + FNMA f81 = f73, f41, f81 + FNMA f85 = f77, f41, f85 + FNMA f82 = f74, f41, f82 + FNMA f86 = f78, f41, f86 + FNMA f83 = f75, f41, f83 + FNMA f87 = f79, f41, f87 + ;; + FNMA f88 = f72, f42, f88 + FNMA f92 = f76, f42, f92 + FNMA f89 = f73, f42, f89 + FNMA f93 = f77, f42, f93 + FNMA f90 = f74, f42, f90 + FNMA f94 = f78, f42, f94 + FNMA f91 = f75, f42, f91 + FNMA f95 = f79, f42, f95 + ;; + FNMA f96 = f72, f43, f96 + FNMA f100 = f76, f43, f100 + FNMA f97 = f73, f43, f97 + FNMA f101 = f77, f43, f101 + FNMA f98 = f74, f43, f98 + FNMA f102 = f78, f43, f102 + FNMA f99 = f75, f43, f99 + FNMA f103 = f79, f43, f103 + ;; + FNMA f104 = f72, f44, f104 + FNMA f108 = f76, f44, f108 + FNMA f105 = f73, f44, f105 + FNMA f109 = f77, f44, f109 + FNMA f106 = f74, f44, f106 + FNMA f110 = f78, f44, f110 + FNMA f107 = f75, f44, f107 + FNMA f111 = f79, f44, f111 + ;; + FNMA f112 = f72, f45, f112 + FNMA f116 = f76, f45, f116 + FNMA f113 = f73, f45, f113 + FNMA f117 = f77, f45, f117 + FNMA f114 = f74, f45, f114 + FNMA f118 = f78, f45, f118 + FNMA f115 = f75, f45, f115 + FNMA f119 = f79, f45, f119 + ;; + FNMA f120 = f72, f46, f120 + FNMA f124 = f76, f46, f124 + FNMA f121 = f73, f46, f121 + FNMA f125 = f77, f46, f125 + FNMA f122 = f74, f46, f122 + FNMA f126 = f78, f46, f126 + FNMA f123 = f75, f46, f123 + FNMA f127 = f79, f46, f127 + ;; + FMPY f80 = f80, f47 + FMPY f84 = f84, f47 + FMPY f81 = f81, f47 + FMPY f85 = f85, f47 + FMPY f82 = f82, f47 + FMPY f86 = f86, f47 + FMPY f83 = f83, f47 + FMPY f87 = f87, f47 + ;; + FNMA f88 = f80, f48, f88 + FNMA f92 = f84, f48, f92 + FNMA f89 = f81, f48, f89 + FNMA f93 = f85, f48, f93 + FNMA f90 = f82, f48, f90 + FNMA f94 = f86, f48, f94 + FNMA f91 = f83, f48, f91 + FNMA f95 = f87, f48, f95 + ;; + FNMA f96 = f80, f49, f96 + FNMA f100 = f84, f49, f100 + FNMA f97 = f81, f49, f97 + FNMA f101 = f85, f49, f101 + FNMA f98 = f82, f49, f98 + FNMA f102 = f86, f49, f102 + FNMA f99 = f83, f49, f99 + FNMA f103 = f87, f49, f103 + ;; + FNMA f104 = f80, f50, f104 + FNMA f108 = f84, f50, f108 + FNMA f105 = f81, f50, f105 + FNMA f109 = f85, f50, f109 + FNMA f106 = f82, f50, f106 + FNMA f110 = f86, f50, f110 + FNMA f107 = f83, f50, f107 + FNMA f111 = f87, f50, f111 + ;; + FNMA f112 = f80, f51, f112 + FNMA f116 = f84, f51, f116 + FNMA f113 = f81, f51, f113 + FNMA f117 = f85, f51, f117 + FNMA f114 = f82, f51, f114 + FNMA f118 = f86, f51, f118 + FNMA f115 = f83, f51, f115 + FNMA f119 = f87, f51, f119 + ;; + FNMA f120 = f80, f52, f120 + FNMA f124 = f84, f52, f124 + FNMA f121 = f81, f52, f121 + FNMA f125 = f85, f52, f125 + FNMA f122 = f82, f52, f122 + FNMA f126 = f86, f52, f126 + FNMA f123 = f83, f52, f123 + FNMA f127 = f87, f52, f127 + ;; + FMPY f88 = f88, f53 + FMPY f92 = f92, f53 + FMPY f89 = f89, f53 + FMPY f93 = f93, f53 + FMPY f90 = f90, f53 + FMPY f94 = f94, f53 + FMPY f91 = f91, f53 + FMPY f95 = f95, f53 + ;; + FNMA f96 = f88, f54, f96 + FNMA f100 = f92, f54, f100 + FNMA f97 = f89, f54, f97 + FNMA f101 = f93, f54, f101 + FNMA f98 = f90, f54, f98 + FNMA f102 = f94, f54, f102 + FNMA f99 = f91, f54, f99 + FNMA f103 = f95, f54, f103 + ;; + FNMA f104 = f88, f55, f104 + FNMA f108 = f92, f55, f108 + FNMA f105 = f89, f55, f105 + FNMA f109 = f93, f55, f109 + FNMA f106 = f90, f55, f106 + FNMA f110 = f94, f55, f110 + FNMA f107 = f91, f55, f107 + FNMA f111 = f95, f55, f111 + ;; + FNMA f112 = f88, f56, f112 + FNMA f116 = f92, f56, f116 + FNMA f113 = f89, f56, f113 + FNMA f117 = f93, f56, f117 + FNMA f114 = f90, f56, f114 + FNMA f118 = f94, f56, f118 + FNMA f115 = f91, f56, f115 + FNMA f119 = f95, f56, f119 + ;; + FNMA f120 = f88, f57, f120 + FNMA f124 = f92, f57, f124 + FNMA f121 = f89, f57, f121 + FNMA f125 = f93, f57, f125 + FNMA f122 = f90, f57, f122 + FNMA f126 = f94, f57, f126 + FNMA f123 = f91, f57, f123 + FNMA f127 = f95, f57, f127 + ;; + FMPY f96 = f96, f58 + FMPY f100 = f100, f58 + FMPY f97 = f97, f58 + FMPY f101 = f101, f58 + FMPY f98 = f98, f58 + FMPY f102 = f102, f58 + FMPY f99 = f99, f58 + FMPY f103 = f103, f58 + ;; + FNMA f104 = f96, f59, f104 + FNMA f108 = f100, f59, f108 + FNMA f105 = f97, f59, f105 + FNMA f109 = f101, f59, f109 + FNMA f106 = f98, f59, f106 + FNMA f110 = f102, f59, f110 + FNMA f107 = f99, f59, f107 + FNMA f111 = f103, f59, f111 + ;; + FNMA f112 = f96, f60, f112 + FNMA f116 = f100, f60, f116 + FNMA f113 = f97, f60, f113 + FNMA f117 = f101, f60, f117 + FNMA f114 = f98, f60, f114 + FNMA f118 = f102, f60, f118 + FNMA f115 = f99, f60, f115 + FNMA f119 = f103, f60, f119 + ;; + { .mfi + STFD [AOFFSET] = f64, SIZE + FNMA f120 = f96, f61, f120 + } + { .mfi + STFD [AOFFSET2] = f68, SIZE + FNMA f124 = f100, f61, f124 + } + ;; + { .mfi + STFD [AOFFSET] = f65, SIZE + FNMA f121 = f97, f61, f121 + } + { .mfi + STFD [AOFFSET2] = f69, SIZE + FNMA f125 = f101, f61, f125 + } + ;; + { .mfi + STFD [AOFFSET] = f66, SIZE + FNMA f122 = f98, f61, f122 + } + { .mfi + STFD [AOFFSET2] = f70, SIZE + FNMA f126 = f102, f61, f126 + } + ;; + { .mfi + STFD [AOFFSET] = f67, 5 * SIZE + FNMA f123 = f99, f61, f123 + } + { .mfi + STFD [AOFFSET2] = f71, 5 * SIZE + FNMA f127 = f103, f61, f127 + } + ;; + { .mfi + STFD [AOFFSET] = f72, SIZE + FMPY f104 = f104, f16 + } + { .mfi + STFD [AOFFSET2] = f76, SIZE + FMPY f108 = f108, f16 + } + ;; + { .mfi + STFD [AOFFSET] = f73, SIZE + FMPY f105 = f105, f16 + } + { .mfi + STFD [AOFFSET2] = f77, SIZE + FMPY f109 = f109, f16 + } + ;; + { .mfi + STFD [AOFFSET] = f74, SIZE + FMPY f106 = f106, f16 + } + { .mfi + STFD [AOFFSET2] = f78, SIZE + FMPY f110 = f110, f16 + } + ;; + { .mfi + STFD [AOFFSET] = f75, 5 * SIZE + FMPY f107 = f107, f16 + } + { .mfi + STFD [AOFFSET2] = f79, 5 * SIZE + FMPY f111 = f111, f16 + } + ;; + { .mfi + STFD [AOFFSET] = f80, SIZE + FNMA f112 = f104, f17, f112 + } + { .mfi + STFD [AOFFSET2] = f84, SIZE + FNMA f116 = f108, f17, f116 + } + ;; + { .mfi + STFD [AOFFSET] = f81, SIZE + FNMA f113 = f105, f17, f113 + } + { .mfi + STFD [AOFFSET2] = f85, SIZE + FNMA f117 = f109, f17, f117 + } + ;; + { .mfi + STFD [AOFFSET] = f82, SIZE + FNMA f114 = f106, f17, f114 + } + { .mfi + STFD [AOFFSET2] = f86, SIZE + FNMA f118 = f110, f17, f118 + } + ;; + { .mfi + STFD [AOFFSET] = f83, 5 * SIZE + FNMA f115 = f107, f17, f115 + } + { .mfi + STFD [AOFFSET2] = f87, 5 * SIZE + FNMA f119 = f111, f17, f119 + } + ;; + { .mfi + STFD [AOFFSET] = f88, SIZE + FNMA f120 = f104, f18, f120 + } + { .mfi + STFD [AOFFSET2] = f92, SIZE + FNMA f124 = f108, f18, f124 + } + ;; + { .mfi + STFD [AOFFSET] = f89, SIZE + FNMA f121 = f105, f18, f121 + } + { .mfi + STFD [AOFFSET2] = f93, SIZE + FNMA f125 = f109, f18, f125 + } + ;; + { .mfi + STFD [AOFFSET] = f90, SIZE + FNMA f122 = f106, f18, f122 + } + { .mfi + STFD [AOFFSET2] = f94, SIZE + FNMA f126 = f110, f18, f126 + } + ;; + { .mfi + STFD [AOFFSET] = f91, 5 * SIZE + FNMA f123 = f107, f18, f123 + } + { .mfi + STFD [AOFFSET2] = f95, 5 * SIZE + FNMA f127 = f111, f18, f127 + } + ;; + { .mfi + STFD [AOFFSET] = f96, SIZE + FMPY f112 = f112, f19 + } + { .mfi + STFD [AOFFSET2] = f100, SIZE + FMPY f116 = f116, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f97, SIZE + FMPY f113 = f113, f19 + } + { .mfi + STFD [AOFFSET2] = f101, SIZE + FMPY f117 = f117, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f98, SIZE + FMPY f114 = f114, f19 + } + { .mfi + STFD [AOFFSET2] = f102, SIZE + FMPY f118 = f118, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f99, 5 * SIZE + FMPY f115 = f115, f19 + } + { .mfi + STFD [AOFFSET2] = f103, 5 * SIZE + FMPY f119 = f119, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f104, SIZE + FNMA f120 = f112, f20, f120 + } + { .mfi + STFD [AOFFSET2] = f108, SIZE + FNMA f124 = f116, f20, f124 + } + ;; + { .mfi + STFD [AOFFSET] = f105, SIZE + FNMA f121 = f113, f20, f121 + } + { .mfi + STFD [AOFFSET2] = f109, SIZE + FNMA f125 = f117, f20, f125 + } + ;; + { .mfi + STFD [AOFFSET] = f106, SIZE + FNMA f122 = f114, f20, f122 + } + { .mfi + STFD [AOFFSET2] = f110, SIZE + FNMA f126 = f118, f20, f126 + } + ;; + { .mfi + STFD [AOFFSET] = f107, 5 * SIZE + FNMA f123 = f115, f20, f123 + } + { .mfi + STFD [AOFFSET2] = f111, 5 * SIZE + FNMA f127 = f119, f20, f127 + } + ;; + { .mfi + STFD [AOFFSET] = f112, SIZE + FMPY f120 = f120, f21 + } + { .mfi + STFD [AOFFSET2] = f116, SIZE + FMPY f124 = f124, f21 + } + ;; + { .mfi + STFD [AOFFSET] = f113, SIZE + FMPY f121 = f121, f21 + } + { .mfi + STFD [AOFFSET2] = f117, SIZE + FMPY f125 = f125, f21 + } + ;; + { .mfi + STFD [AOFFSET] = f114, SIZE + FMPY f122 = f122, f21 + } + { .mfi + STFD [AOFFSET2] = f118, SIZE + FMPY f126 = f126, f21 + } + ;; + { .mfi + STFD [AOFFSET] = f115, 5 * SIZE + FMPY f123 = f123, f21 + } + { .mfi + STFD [AOFFSET2] = f119, 5 * SIZE + FMPY f127 = f127, f21 + } + ;; + { .mmi + STFD [AOFFSET] = f120, SIZE + STFD [AOFFSET2] = f124, SIZE + } + ;; + { .mmi + STFD [AOFFSET] = f121, SIZE + STFD [AOFFSET2] = f125, SIZE + } + ;; + { .mmi + STFD [AOFFSET] = f122, SIZE + STFD [AOFFSET2] = f126, SIZE + adds C9 = 4 * SIZE, C1 + } + ;; + { .mfi + STFD [AOFFSET] = f123 + adds AOFFSET = - 59 * SIZE, AOFFSET + } + { .mfi + STFD [AOFFSET2] = f127 + adds AOFFSET2 = - 59 * SIZE, AOFFSET2 + } + ;; +#endif + +#ifdef RT + adds BOFFSET = 62 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + { .mfi + LDFPD f35, f34 = [BOFFSET] + FMPY f120 = f120, f32 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f124 = f124, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f37, f36 = [BOFFSET] + FMPY f121 = f121, f32 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f125 = f125, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f39, f38 = [BOFFSET] + FMPY f122 = f122, f32 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f126 = f126, f32 + nop __LINE__ + } + ;; + { .mfi + LDFD f40 = [BOFFSET], -2 * SIZE + FMPY f123 = f123, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f127 = f127, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f42, f41 = [BOFFSET] + FNMA f112 = f120, f33, f112 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f116 = f124, f33, f116 + nop __LINE__ + } + ;; + { .mfi + LDFPD f44, f43 = [BOFFSET] + FNMA f113 = f121, f33, f113 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f117 = f125, f33, f117 + nop __LINE__ + } + ;; + { .mfi + LDFPD f46, f45 = [BOFFSET] + FNMA f114 = f122, f33, f114 + adds BOFFSET = - 4 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f118 = f126, f33, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f48, f47 = [BOFFSET] + FNMA f115 = f123, f33, f115 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f119 = f127, f33, f119 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f49 = [BOFFSET] + FNMA f104 = f120, f34, f104 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f108 = f124, f34, f108 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f51 = [BOFFSET] + FNMA f105 = f121, f34, f105 + adds BOFFSET = - 4 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f109 = f125, f34, f109 + nop __LINE__ + } + ;; + { .mfi + LDFD f53 = [BOFFSET], -2 * SIZE + FNMA f106 = f122, f34, f106 + } + { .mfi + nop __LINE__ + FNMA f110 = f126, f34, f110 + nop __LINE__ + } + ;; + { .mfi + LDFPD f55, f54 = [BOFFSET] + FNMA f107 = f123, f34, f107 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f111 = f127, f34, f111 + nop __LINE__ + } + ;; + { .mfi + LDFPD f57, f56 = [BOFFSET] + FNMA f96 = f120, f35, f96 + adds BOFFSET = - 6 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f100 = f124, f35, f100 + nop __LINE__ + } + ;; + { .mfi + LDFPD f59, f58 = [BOFFSET] + FNMA f97 = f121, f35, f97 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f101 = f125, f35, f101 + nop __LINE__ + } + ;; + { .mfi + LDFPD f61, f60 = [BOFFSET] + FNMA f98 = f122, f35, f98 + adds BOFFSET = - 6 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f102 = f126, f35, f102 + nop __LINE__ + } + ;; + { .mfi + LDFD f16 = [BOFFSET], -2 * SIZE + FNMA f99 = f123, f35, f99 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f103 = f127, f35, f103 + nop __LINE__ + } + ;; + { .mfi + LDFPD f18, f17 = [BOFFSET] + FNMA f88 = f120, f36, f88 + adds BOFFSET = - 8 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f92 = f124, f36, f92 + nop __LINE__ + } + ;; + { .mfi + LDFPD f20, f19 = [BOFFSET] + FNMA f89 = f121, f36, f89 + adds BOFFSET = - 8 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f93 = f125, f36, f93 + nop __LINE__ + } + ;; + { .mfi + LDFD f21 = [BOFFSET] + FNMA f90 = f122, f36, f90 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f94 = f126, f36, f94 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f91 = f123, f36, f91 + adds AOFFSET = 56 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f95 = f127, f36, f95 + adds AOFFSET2 = 56 * SIZE, AOFFSET2 + } + ;; + FNMA f80 = f120, f37, f80 + FNMA f84 = f124, f37, f84 + FNMA f81 = f121, f37, f81 + FNMA f85 = f125, f37, f85 + FNMA f82 = f122, f37, f82 + FNMA f86 = f126, f37, f86 + FNMA f83 = f123, f37, f83 + FNMA f87 = f127, f37, f87 + ;; + FNMA f72 = f120, f38, f72 + FNMA f76 = f124, f38, f76 + FNMA f73 = f121, f38, f73 + FNMA f77 = f125, f38, f77 + FNMA f74 = f122, f38, f74 + FNMA f78 = f126, f38, f78 + FNMA f75 = f123, f38, f75 + FNMA f79 = f127, f38, f79 + ;; + FNMA f64 = f120, f39, f64 + FNMA f68 = f124, f39, f68 + FNMA f65 = f121, f39, f65 + FNMA f69 = f125, f39, f69 + FNMA f66 = f122, f39, f66 + FNMA f70 = f126, f39, f70 + FNMA f67 = f123, f39, f67 + FNMA f71 = f127, f39, f71 + ;; + FMPY f112 = f112, f40 + FMPY f116 = f116, f40 + FMPY f113 = f113, f40 + FMPY f117 = f117, f40 + FMPY f114 = f114, f40 + FMPY f118 = f118, f40 + FMPY f115 = f115, f40 + FMPY f119 = f119, f40 + ;; + FNMA f104 = f112, f41, f104 + FNMA f108 = f116, f41, f108 + FNMA f105 = f113, f41, f105 + FNMA f109 = f117, f41, f109 + FNMA f106 = f114, f41, f106 + FNMA f110 = f118, f41, f110 + FNMA f107 = f115, f41, f107 + FNMA f111 = f119, f41, f111 + ;; + FNMA f96 = f112, f42, f96 + FNMA f100 = f116, f42, f100 + FNMA f97 = f113, f42, f97 + FNMA f101 = f117, f42, f101 + FNMA f98 = f114, f42, f98 + FNMA f102 = f118, f42, f102 + FNMA f99 = f115, f42, f99 + FNMA f103 = f119, f42, f103 + ;; + FNMA f88 = f112, f43, f88 + FNMA f92 = f116, f43, f92 + FNMA f89 = f113, f43, f89 + FNMA f93 = f117, f43, f93 + FNMA f90 = f114, f43, f90 + FNMA f94 = f118, f43, f94 + FNMA f91 = f115, f43, f91 + FNMA f95 = f119, f43, f95 + ;; + FNMA f80 = f112, f44, f80 + FNMA f84 = f116, f44, f84 + FNMA f81 = f113, f44, f81 + FNMA f85 = f117, f44, f85 + FNMA f82 = f114, f44, f82 + FNMA f86 = f118, f44, f86 + FNMA f83 = f115, f44, f83 + FNMA f87 = f119, f44, f87 + ;; + FNMA f72 = f112, f45, f72 + FNMA f76 = f116, f45, f76 + FNMA f73 = f113, f45, f73 + FNMA f77 = f117, f45, f77 + FNMA f74 = f114, f45, f74 + FNMA f78 = f118, f45, f78 + FNMA f75 = f115, f45, f75 + FNMA f79 = f119, f45, f79 + ;; + FNMA f64 = f112, f46, f64 + FNMA f68 = f116, f46, f68 + FNMA f65 = f113, f46, f65 + FNMA f69 = f117, f46, f69 + FNMA f66 = f114, f46, f66 + FNMA f70 = f118, f46, f70 + FNMA f67 = f115, f46, f67 + FNMA f71 = f119, f46, f71 + ;; + FMPY f104 = f104, f47 + FMPY f108 = f108, f47 + FMPY f105 = f105, f47 + FMPY f109 = f109, f47 + FMPY f106 = f106, f47 + FMPY f110 = f110, f47 + FMPY f107 = f107, f47 + FMPY f111 = f111, f47 + ;; + FNMA f96 = f104, f48, f96 + FNMA f100 = f108, f48, f100 + FNMA f97 = f105, f48, f97 + FNMA f101 = f109, f48, f101 + FNMA f98 = f106, f48, f98 + FNMA f102 = f110, f48, f102 + FNMA f99 = f107, f48, f99 + FNMA f103 = f111, f48, f103 + ;; + FNMA f88 = f104, f49, f88 + FNMA f92 = f108, f49, f92 + FNMA f89 = f105, f49, f89 + FNMA f93 = f109, f49, f93 + FNMA f90 = f106, f49, f90 + FNMA f94 = f110, f49, f94 + FNMA f91 = f107, f49, f91 + FNMA f95 = f111, f49, f95 + ;; + FNMA f80 = f104, f50, f80 + FNMA f84 = f108, f50, f84 + FNMA f81 = f105, f50, f81 + FNMA f85 = f109, f50, f85 + FNMA f82 = f106, f50, f82 + FNMA f86 = f110, f50, f86 + FNMA f83 = f107, f50, f83 + FNMA f87 = f111, f50, f87 + ;; + FNMA f72 = f104, f51, f72 + FNMA f76 = f108, f51, f76 + FNMA f73 = f105, f51, f73 + FNMA f77 = f109, f51, f77 + FNMA f74 = f106, f51, f74 + FNMA f78 = f110, f51, f78 + FNMA f75 = f107, f51, f75 + FNMA f79 = f111, f51, f79 + ;; + FNMA f64 = f104, f52, f64 + FNMA f68 = f108, f52, f68 + FNMA f65 = f105, f52, f65 + FNMA f69 = f109, f52, f69 + FNMA f66 = f106, f52, f66 + FNMA f70 = f110, f52, f70 + FNMA f67 = f107, f52, f67 + FNMA f71 = f111, f52, f71 + ;; + FMPY f96 = f96, f53 + FMPY f100 = f100, f53 + FMPY f97 = f97, f53 + FMPY f101 = f101, f53 + FMPY f98 = f98, f53 + FMPY f102 = f102, f53 + FMPY f99 = f99, f53 + FMPY f103 = f103, f53 + ;; + FNMA f88 = f96, f54, f88 + FNMA f92 = f100, f54, f92 + FNMA f89 = f97, f54, f89 + FNMA f93 = f101, f54, f93 + FNMA f90 = f98, f54, f90 + FNMA f94 = f102, f54, f94 + FNMA f91 = f99, f54, f91 + FNMA f95 = f103, f54, f95 + ;; + FNMA f80 = f96, f55, f80 + FNMA f84 = f100, f55, f84 + FNMA f81 = f97, f55, f81 + FNMA f85 = f101, f55, f85 + FNMA f82 = f98, f55, f82 + FNMA f86 = f102, f55, f86 + FNMA f83 = f99, f55, f83 + FNMA f87 = f103, f55, f87 + ;; + FNMA f72 = f96, f56, f72 + FNMA f76 = f100, f56, f76 + FNMA f73 = f97, f56, f73 + FNMA f77 = f101, f56, f77 + FNMA f74 = f98, f56, f74 + FNMA f78 = f102, f56, f78 + FNMA f75 = f99, f56, f75 + FNMA f79 = f103, f56, f79 + ;; + FNMA f64 = f96, f57, f64 + FNMA f68 = f100, f57, f68 + FNMA f65 = f97, f57, f65 + FNMA f69 = f101, f57, f69 + FNMA f66 = f98, f57, f66 + FNMA f70 = f102, f57, f70 + FNMA f67 = f99, f57, f67 + FNMA f71 = f103, f57, f71 + ;; + FMPY f88 = f88, f58 + FMPY f92 = f92, f58 + FMPY f89 = f89, f58 + FMPY f93 = f93, f58 + FMPY f90 = f90, f58 + FMPY f94 = f94, f58 + FMPY f91 = f91, f58 + FMPY f95 = f95, f58 + ;; + FNMA f80 = f88, f59, f80 + FNMA f84 = f92, f59, f84 + FNMA f81 = f89, f59, f81 + FNMA f85 = f93, f59, f85 + FNMA f82 = f90, f59, f82 + FNMA f86 = f94, f59, f86 + FNMA f83 = f91, f59, f83 + FNMA f87 = f95, f59, f87 + ;; + FNMA f72 = f88, f60, f72 + FNMA f76 = f92, f60, f76 + FNMA f73 = f89, f60, f73 + FNMA f77 = f93, f60, f77 + FNMA f74 = f90, f60, f74 + FNMA f78 = f94, f60, f78 + FNMA f75 = f91, f60, f75 + FNMA f79 = f95, f60, f79 + ;; + + { .mfi + STFD [AOFFSET] = f120, SIZE + FNMA f64 = f88, f61, f64 + } + { .mfi + STFD [AOFFSET2] = f124, SIZE + FNMA f68 = f92, f61, f68 + } + ;; + { .mfi + STFD [AOFFSET] = f121, SIZE + FNMA f65 = f89, f61, f65 + } + { .mfi + STFD [AOFFSET2] = f125, SIZE + FNMA f69 = f93, f61, f69 + } + ;; + { .mfi + STFD [AOFFSET] = f122, SIZE + FNMA f66 = f90, f61, f66 + } + { .mfi + STFD [AOFFSET2] = f126, SIZE + FNMA f70 = f94, f61, f70 + } + ;; + { .mfi + STFD [AOFFSET] = f123, - 11 * SIZE + FNMA f67 = f91, f61, f67 + } + { .mfi + STFD [AOFFSET2] = f127, - 11 * SIZE + FNMA f71 = f95, f61, f71 + } + ;; + { .mfi + STFD [AOFFSET] = f112, SIZE + FMPY f80 = f80, f16 + } + { .mfi + STFD [AOFFSET2] = f116, SIZE + FMPY f84 = f84, f16 + } + ;; + { .mfi + STFD [AOFFSET] = f113, SIZE + FMPY f81 = f81, f16 + } + { .mfi + STFD [AOFFSET2] = f117, SIZE + FMPY f85 = f85, f16 + } + ;; + { .mfi + STFD [AOFFSET] = f114, SIZE + FMPY f82 = f82, f16 + } + { .mfi + STFD [AOFFSET2] = f118, SIZE + FMPY f86 = f86, f16 + } + ;; + { .mfi + STFD [AOFFSET] = f115, - 11 * SIZE + FMPY f83 = f83, f16 + } + { .mfi + STFD [AOFFSET2] = f119, - 11 * SIZE + FMPY f87 = f87, f16 + } + ;; + { .mfi + STFD [AOFFSET] = f104, SIZE + FNMA f72 = f80, f17, f72 + } + { .mfi + STFD [AOFFSET2] = f108, SIZE + FNMA f76 = f84, f17, f76 + } + ;; + { .mfi + STFD [AOFFSET] = f105, SIZE + FNMA f73 = f81, f17, f73 + } + { .mfi + STFD [AOFFSET2] = f109, SIZE + FNMA f77 = f85, f17, f77 + } + ;; + { .mfi + STFD [AOFFSET] = f106, SIZE + FNMA f74 = f82, f17, f74 + } + { .mfi + STFD [AOFFSET2] = f110, SIZE + FNMA f78 = f86, f17, f78 + } + ;; + { .mfi + STFD [AOFFSET] = f107, - 11 * SIZE + FNMA f75 = f83, f17, f75 + } + { .mfi + STFD [AOFFSET2] = f111, - 11 * SIZE + FNMA f79 = f87, f17, f79 + } + ;; + { .mfi + STFD [AOFFSET] = f96, SIZE + FNMA f64 = f80, f18, f64 + } + { .mfi + STFD [AOFFSET2] = f100, SIZE + FNMA f68 = f84, f18, f68 + } + ;; + { .mfi + STFD [AOFFSET] = f97, SIZE + FNMA f65 = f81, f18, f65 + } + { .mfi + STFD [AOFFSET2] = f101, SIZE + FNMA f69 = f85, f18, f69 + } + ;; + { .mfi + STFD [AOFFSET] = f98, SIZE + FNMA f66 = f82, f18, f66 + } + { .mfi + STFD [AOFFSET2] = f102, SIZE + FNMA f70 = f86, f18, f70 + } + ;; + { .mfi + STFD [AOFFSET] = f99, - 11 * SIZE + FNMA f67 = f83, f18, f67 + } + { .mfi + STFD [AOFFSET2] = f103, - 11 * SIZE + FNMA f71 = f87, f18, f71 + } + ;; + { .mfi + STFD [AOFFSET] = f88, SIZE + FMPY f72 = f72, f19 + } + { .mfi + STFD [AOFFSET2] = f92, SIZE + FMPY f76 = f76, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f89, SIZE + FMPY f73 = f73, f19 + } + { .mfi + STFD [AOFFSET2] = f93, SIZE + FMPY f77 = f77, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f90, SIZE + FMPY f74 = f74, f19 + } + { .mfi + STFD [AOFFSET2] = f94, SIZE + FMPY f78 = f78, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f91, - 11 * SIZE + FMPY f75 = f75, f19 + } + { .mfi + STFD [AOFFSET2] = f95, - 11 * SIZE + FMPY f79 = f79, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f80, SIZE + FNMA f64 = f72, f20, f64 + } + { .mfi + STFD [AOFFSET2] = f84, SIZE + FNMA f68 = f76, f20, f68 + } + ;; + { .mfi + STFD [AOFFSET] = f81, SIZE + FNMA f65 = f73, f20, f65 + } + { .mfi + STFD [AOFFSET2] = f85, SIZE + FNMA f69 = f77, f20, f69 + } + ;; + { .mfi + STFD [AOFFSET] = f82, SIZE + FNMA f66 = f74, f20, f66 + } + { .mfi + STFD [AOFFSET2] = f86, SIZE + FNMA f70 = f78, f20, f70 + } + ;; + { .mfi + STFD [AOFFSET] = f83, - 11 * SIZE + FNMA f67 = f75, f20, f67 + } + { .mfi + STFD [AOFFSET2] = f87, - 11 * SIZE + FNMA f71 = f79, f20, f71 + } + ;; + { .mfi + STFD [AOFFSET] = f72, SIZE + FMPY f64 = f64, f21 + } + { .mfi + STFD [AOFFSET2] = f76, SIZE + FMPY f68 = f68, f21 + } + ;; + { .mfi + STFD [AOFFSET] = f73, SIZE + FMPY f65 = f65, f21 + } + { .mfi + STFD [AOFFSET2] = f77, SIZE + FMPY f69 = f69, f21 + } + ;; + { .mfi + STFD [AOFFSET] = f74, SIZE + FMPY f66 = f66, f21 + } + { .mfi + STFD [AOFFSET2] = f78, SIZE + FMPY f70 = f70, f21 + } + ;; + { .mfi + STFD [AOFFSET] = f75, - 11 * SIZE + FMPY f67 = f67, f21 + } + { .mfi + STFD [AOFFSET2] = f79, - 11 * SIZE + FMPY f71 = f71, f21 + } + ;; + { .mmi + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + } + ;; + { .mmi + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + } + ;; + { .mmi + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + } + ;; + { .mmi + STFD [AOFFSET] = f67, - 3 * SIZE + STFD [AOFFSET2] = f71, - 3 * SIZE + adds C9 = 4 * SIZE, C1 + } + ;; + +#endif + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + adds C10 = 4 * SIZE, C2 + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE +#ifdef LN + adds C3 = -8 * SIZE, C3 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifndef LN + STFD [C1 ] = f67, 5 * SIZE +#else + STFD [C1 ] = f67, - 3 * SIZE +#endif + STFD [C9 ] = f71 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + STFD [C10] = f76, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + STFD [C10] = f77, SIZE +#ifdef LN + adds C4 = -8 * SIZE, C4 +#else + nop __LINE__ +#endif + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + STFD [C10] = f78, SIZE + adds C12 = 4 * SIZE, C4 + } + ;; + { .mmi +#ifndef LN + STFD [C2 ] = f75, 5 * SIZE +#else + STFD [C2 ] = f75, - 3 * SIZE +#endif + STFD [C10] = f79 +#ifdef LN + adds C5 = -8 * SIZE, C5 +#else + nop __LINE__ +#endif + } + ;; + { .mmf + STFD [C3 ] = f80, SIZE + STFD [C11] = f84, SIZE + mov f80 = f0 + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + STFD [C11] = f85, SIZE + adds C13 = 4 * SIZE, C5 + } + ;; + { .mmi + STFD [C3 ] = f82, SIZE + STFD [C11] = f86, SIZE +#ifdef LN + adds C6 = -8 * SIZE, C6 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifndef LN + STFD [C3 ] = f83, 5 * SIZE +#else + STFD [C3 ] = f83, - 3 * SIZE +#endif + STFD [C11] = f87 + adds C14 = 4 * SIZE, C6 + } + ;; + { .mmf + STFD [C4 ] = f88, SIZE + STFD [C12] = f92, SIZE + mov f88 = f0 + } + ;; + { .mmi + STFD [C4 ] = f89, SIZE + STFD [C12] = f93, SIZE +#ifdef LN + adds C8 = -8 * SIZE, C8 +#else + nop __LINE__ +#endif + } + ;; + { .mmi + STFD [C4 ] = f90, SIZE + STFD [C12] = f94, SIZE + adds C16 = 4 * SIZE, C8 + } + ;; + { .mmi +#ifndef LN + STFD [C4 ] = f91, 5 * SIZE +#else + STFD [C4 ] = f91, - 3 * SIZE +#endif + STFD [C12] = f95 + cmp.ne p6, p0 = 1, I + } + ;; + { .mmf + STFD [C5 ] = f96, SIZE + STFD [C13] = f100, SIZE + mov f96 = f0 + } + ;; + { .mmi + STFD [C5 ] = f97, SIZE + STFD [C13] = f101, SIZE + adds I = -1, I + } + ;; + { .mmi + STFD [C5 ] = f98, SIZE + STFD [C13] = f102, SIZE +#ifdef LN + adds C7 = -8 * SIZE, C7 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifndef LN + STFD [C5 ] = f99, 5 * SIZE +#else + STFD [C5 ] = f99, - 3 * SIZE +#endif + STFD [C13] = f103 + adds C15 = 4 * SIZE, C7 + } + ;; + { .mmf + STFD [C6 ] = f104, SIZE + STFD [C14] = f108, SIZE + mov f104 = f0 + } + ;; + { .mmi + STFD [C6 ] = f105, SIZE + STFD [C14] = f109, SIZE + shladd r2 = K, BASE_SHIFT, r0 + } + ;; + { .mmi + STFD [C6 ] = f106, SIZE + STFD [C14] = f110, SIZE + sub L = K, KK + } + ;; + { .mmi +#ifndef LN + STFD [C6 ] = f107, 5 * SIZE +#else + STFD [C6 ] = f107, - 3 * SIZE +#endif + STFD [C14] = f111 +#ifdef RT + shladd AORIG = r2, 3, AORIG +#else + nop __LINE__ +#endif + } + ;; + { .mmf + STFD [C7 ] = f112, SIZE + STFD [C15] = f116, SIZE + mov f112 = f0 + } + ;; + { .mmi + STFD [C7 ] = f113, SIZE + STFD [C15] = f117, SIZE +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi + STFD [C7 ] = f114, SIZE + STFD [C15] = f118, SIZE +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 3, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifndef LN + STFD [C7 ] = f115, 5 * SIZE +#else + STFD [C7 ] = f115, - 3 * SIZE +#endif + STFD [C15] = f119 +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 3, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + STFD [C8 ] = f120, SIZE + STFD [C16] = f124, SIZE + mov f120 = f0 + } + ;; + { .mmi + STFD [C8 ] = f121, SIZE + STFD [C16] = f125, SIZE +#ifdef LT + adds KK = 8, KK +#elif defined LN + adds KK = -8, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi + STFD [C8 ] = f122, SIZE + STFD [C16] = f126, SIZE +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmb +#ifndef LN + STFD [C8 ] = f123, 5 * SIZE +#else + STFD [C8 ] = f123, - 3 * SIZE +#endif + STFD [C16] = f127 + (p6) br.cond.dptk .L011 + } + ;; + +.L020: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p0 = M, 2 + (p6) br.cond.dptk .L030 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + setf.d f73 = r0 + mov f65 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 3, B + mov f65 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + { .mfi + setf.d f105 = r0 + mov f81 = f0 + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + mov f89 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f113 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + setf.d f97 = r0 + mov f121 = f0 + shr L = L, 1 + } + ;; + { .mmf + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + setf.d f66 = r0 + mov f67 = f0 + } + { .mfi + setf.d f74 = r0 + mov f75 = f0 + adds L = -1, L + } + ;; + { .mmf + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + setf.d f82 = r0 + mov f83 = f0 + } + { .mfi + setf.d f90 = r0 + mov f91 = f0 + cmp.eq p6, p0 = -1, L + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + setf.d f98 = r0 + mov f99 = f0 + } + { .mfi + setf.d f106 = r0 + mov f107 = f0 + mov ar.lc = L + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + setf.d f114 = r0 + mov f115 = f0 + } + { .mfb + setf.d f122 = r0 + mov f123 = f0 + (p6) br.cond.dpnt .L028 + } + ;; + +.L022: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + (p5) adds C10 = 2 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + (p5) adds C11 = 2 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + (p5) adds C12 = 2 * SIZE, C4 + } + ;; + { .mfi + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + (p5) adds C13 = 2 * SIZE, C5 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + (p5) adds C14 = 2 * SIZE, C6 + } + ;; + { .mfi + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + (p5) adds C15 = 2 * SIZE, C7 + } + { .mfi + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + (p5) adds C16 = 2 * SIZE, C8 + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f106 = f34, f53, f106 // A3 * B6 + nop __LINE__ + } + + { .mfb + nop __LINE__ + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f122 = f34, f55, f122 // A3 * B8 + nop __LINE__ + } + + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f107 = f35, f53, f107 // A4 * B6 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f123 = f35, f55, f123 // A4 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f106 = f42, f61, f106 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f122 = f42, f63, f122 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f107 = f43, f61, f107 // A4 * B6 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f123 = f43, f63, f123 // A4 * B8 + br.cloop.sptk.few .L022 + } + ;; + +.L028: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -8, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + shladd BOFFSET = r2, 3, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FSUB f64 = f32, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f72 = f33, f72 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + FSUB f80 = f34, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f88 = f35, f88 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + FSUB f96 = f36, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f104 = f37, f104 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + FSUB f112 = f38, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f120 = f39, f120 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FSUB f65 = f40, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f73 = f41, f73 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FSUB f81 = f42, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f89 = f43, f89 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FSUB f97 = f44, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f105 = f45, f105 + nop __LINE__ + } + ;; + { .mfi + LDFPD f62, f63 = [BOFFSET] + FSUB f113 = f46, f113 + adds BOFFSET = -30 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FSUB f121 = f47, f121 + nop __LINE__ + } + ;; + FSUB f66 = f48, f66 + FSUB f74 = f49, f74 + FSUB f82 = f50, f82 + FSUB f90 = f51, f90 + FSUB f98 = f52, f98 + FSUB f106 = f53, f106 + FSUB f114 = f54, f114 + FSUB f122 = f55, f122 + ;; + FSUB f67 = f56, f67 + FSUB f75 = f57, f75 + FSUB f83 = f58, f83 + FSUB f91 = f59, f91 + FSUB f99 = f60, f99 + FSUB f107 = f61, f107 + FSUB f115 = f62, f115 + FSUB f123 = f63, f123 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET], 2 * SIZE + ;; + LDFPD f48, f49 = [AOFFSET], 2 * SIZE + ;; + LDFPD f50, f51 = [AOFFSET], 2 * SIZE + ;; + LDFPD f52, f53 = [AOFFSET], 2 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET], 2 * SIZE + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET], 2 * SIZE + ;; + LDFPD f62, f63 = [AOFFSET] + adds AOFFSET = -30 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + + FSUB f72 = f36, f72 + FSUB f73 = f37, f73 + FSUB f74 = f38, f74 + FSUB f75 = f39, f75 + + FSUB f80 = f40, f80 + FSUB f81 = f41, f81 + FSUB f82 = f42, f82 + FSUB f83 = f43, f83 + + FSUB f88 = f44, f88 + FSUB f89 = f45, f89 + FSUB f90 = f46, f90 + FSUB f91 = f47, f91 + ;; + FSUB f96 = f48, f96 + FSUB f97 = f49, f97 + FSUB f98 = f50, f98 + FSUB f99 = f51, f99 + ;; + FSUB f104 = f52, f104 + FSUB f105 = f53, f105 + FSUB f106 = f54, f106 + FSUB f107 = f55, f107 + ;; + FSUB f112 = f56, f112 + FSUB f113 = f57, f113 + FSUB f114 = f58, f114 + FSUB f115 = f59, f115 + ;; + FSUB f120 = f60, f120 + FSUB f121 = f61, f121 + FSUB f122 = f62, f122 + FSUB f123 = f63, f123 + ;; +#endif + +#ifdef LN + adds AOFFSET = 14 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f35, f34 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], - 2 * SIZE + ;; + LDFPD f38, f37 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f40, f39 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET] + ;; + FMPY f67 = f67, f32 + FMPY f99 = f99, f32 + FMPY f75 = f75, f32 + FMPY f107 = f107, f32 + FMPY f83 = f83, f32 + FMPY f115 = f115, f32 + FMPY f91 = f91, f32 + FMPY f123 = f123, f32 + ;; + FNMA f66 = f67, f33, f66 + FNMA f98 = f99, f33, f98 + FNMA f74 = f75, f33, f74 + FNMA f106 = f107, f33, f106 + FNMA f82 = f83, f33, f82 + FNMA f114 = f115, f33, f114 + FNMA f90 = f91, f33, f90 + FNMA f122 = f123, f33, f122 + ;; + FNMA f65 = f67, f34, f65 + FNMA f97 = f99, f34, f97 + FNMA f73 = f75, f34, f73 + FNMA f105 = f107, f34, f105 + FNMA f81 = f83, f34, f81 + FNMA f113 = f115, f34, f113 + FNMA f89 = f91, f34, f89 + FNMA f121 = f123, f34, f121 + ;; + FNMA f64 = f67, f35, f64 + FNMA f96 = f99, f35, f96 + FNMA f72 = f75, f35, f72 + FNMA f104 = f107, f35, f104 + FNMA f80 = f83, f35, f80 + FNMA f112 = f115, f35, f112 + FNMA f88 = f91, f35, f88 + FNMA f120 = f123, f35, f120 + ;; + FMPY f66 = f66, f36 + FMPY f98 = f98, f36 + FMPY f74 = f74, f36 + FMPY f106 = f106, f36 + FMPY f82 = f82, f36 + FMPY f114 = f114, f36 + FMPY f90 = f90, f36 + FMPY f122 = f122, f36 + ;; + FNMA f65 = f66, f37, f65 + FNMA f97 = f98, f37, f97 + FNMA f73 = f74, f37, f73 + FNMA f105 = f106, f37, f105 + FNMA f81 = f82, f37, f81 + FNMA f113 = f114, f37, f113 + FNMA f89 = f90, f37, f89 + FNMA f121 = f122, f37, f121 + ;; + FNMA f64 = f66, f38, f64 + FNMA f96 = f98, f38, f96 + FNMA f72 = f74, f38, f72 + FNMA f104 = f106, f38, f104 + FNMA f80 = f82, f38, f80 + FNMA f112 = f114, f38, f112 + FNMA f88 = f90, f38, f88 + FNMA f120 = f122, f38, f120 + ;; + adds BOFFSET = 24 * SIZE, BOFFSET + adds BOFFSET2 = 24 * SIZE, BOFFSET2 + ;; + { .mfi + STFD [BOFFSET] = f67, SIZE + FMPY f65 = f65, f39 + } + { .mfi + STFD [BOFFSET2] = f99, SIZE + FMPY f97 = f97, f39 + } + ;; + { .mfi + STFD [BOFFSET] = f75, SIZE + FMPY f73 = f73, f39 + } + { .mfi + STFD [BOFFSET2] = f107, SIZE + FMPY f105 = f105, f39 + } + ;; + { .mfi + STFD [BOFFSET] = f83, SIZE + FMPY f81 = f81, f39 + } + { .mfi + STFD [BOFFSET2] = f115, SIZE + FMPY f113 = f113, f39 + } + ;; + { .mfi + STFD [BOFFSET] = f91, - 11 * SIZE + FMPY f89 = f89, f39 + } + { .mfi + STFD [BOFFSET2] = f123, - 11 * SIZE + FMPY f121 = f121, f39 + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + FNMA f64 = f65, f40, f64 + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + FNMA f96 = f97, f40, f96 + } + ;; + { .mfi + STFD [BOFFSET] = f74, SIZE + FNMA f72 = f73, f40, f72 + } + { .mfi + STFD [BOFFSET2] = f106, SIZE + FNMA f104 = f105, f40, f104 + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + FNMA f80 = f81, f40, f80 + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + FNMA f112 = f113, f40, f112 + } + ;; + { .mfi + STFD [BOFFSET] = f90, -11 * SIZE + FNMA f88 = f89, f40, f88 + } + { .mfi + STFD [BOFFSET2] = f122, -11 * SIZE + FNMA f120 = f121, f40, f120 + } + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + FMPY f64 = f64, f41 + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + FMPY f96 = f96, f41 + } + ;; + { .mfi + STFD [BOFFSET] = f73, SIZE + FMPY f72 = f72, f41 + } + { .mfi + STFD [BOFFSET2] = f105, SIZE + FMPY f104 = f104, f41 + } + ;; + { .mfi + STFD [BOFFSET] = f81, SIZE + FMPY f80 = f80, f41 + } + { .mfi + STFD [BOFFSET2] = f113, SIZE + FMPY f112 = f112, f41 + } + ;; + { .mfi + STFD [BOFFSET] = f89, - 11 * SIZE + FMPY f88 = f88, f41 + } + { .mfi + STFD [BOFFSET2] = f121, - 11 * SIZE + FMPY f120 = f120, f41 + } + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + adds C1 = -4 * SIZE, C1 + } + ;; + { .mmi + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f104, SIZE + adds C2 = -4 * SIZE, C2 + } + ;; + { .mmi + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f88, - 3 * SIZE + STFD [BOFFSET2] = f120, - 3 * SIZE + } + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f39, f40 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET], -15 * SIZE + ;; + { .mfi + FMPY f64 = f64, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f96 = f96, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f72 = f72, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f104 = f104, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f80 = f80, f32 + } + { .mfi + nop __LINE__ + FMPY f112 = f112, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f88 = f88, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f120 = f120, f32 + nop __LINE__ + } + ;; + { .mfi + FNMA f65 = f64, f33, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f97 = f96, f33, f97 + nop __LINE__ + } + ;; + { .mfi + FNMA f73 = f72, f33, f73 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f105 = f104, f33, f105 + nop __LINE__ + } + ;; + { .mfi + FNMA f81 = f80, f33, f81 + } + { .mfi + nop __LINE__ + FNMA f113 = f112, f33, f113 + nop __LINE__ + } + ;; + { .mfi + FNMA f89 = f88, f33, f89 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f121 = f120, f33, f121 + nop __LINE__ + } + ;; + { .mfi + FNMA f66 = f64, f34, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f98 = f96, f34, f98 + nop __LINE__ + } + ;; + { .mfi + FNMA f74 = f72, f34, f74 + } + { .mfi + nop __LINE__ + FNMA f106 = f104, f34, f106 + nop __LINE__ + } + ;; + { .mfi + FNMA f82 = f80, f34, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f114 = f112, f34, f114 + nop __LINE__ + } + ;; + { .mfi + FNMA f90 = f88, f34, f90 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f122 = f120, f34, f122 + nop __LINE__ + } + ;; + { .mfi + FNMA f67 = f64, f35, f67 + } + { .mfi + nop __LINE__ + FNMA f99 = f96, f35, f99 + nop __LINE__ + } + ;; + { .mfi + FNMA f75 = f72, f35, f75 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f107 = f104, f35, f107 + nop __LINE__ + } + ;; + { .mfi + FNMA f83 = f80, f35, f83 + } + { .mfi + nop __LINE__ + FNMA f115 = f112, f35, f115 + nop __LINE__ + } + ;; + { .mfi + FNMA f91 = f88, f35, f91 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f123 = f120, f35, f123 + nop __LINE__ + } + ;; + FMPY f65 = f65, f36 + FMPY f97 = f97, f36 + FMPY f73 = f73, f36 + FMPY f105 = f105, f36 + FMPY f81 = f81, f36 + FMPY f113 = f113, f36 + FMPY f89 = f89, f36 + FMPY f121 = f121, f36 + ;; + FNMA f66 = f65, f37, f66 + FNMA f98 = f97, f37, f98 + FNMA f74 = f73, f37, f74 + FNMA f106 = f105, f37, f106 + FNMA f82 = f81, f37, f82 + FNMA f114 = f113, f37, f114 + FNMA f90 = f89, f37, f90 + FNMA f122 = f121, f37, f122 + ;; + FNMA f67 = f65, f38, f67 + FNMA f99 = f97, f38, f99 + FNMA f75 = f73, f38, f75 + FNMA f107 = f105, f38, f107 + FNMA f83 = f81, f38, f83 + FNMA f115 = f113, f38, f115 + FNMA f91 = f89, f38, f91 + FNMA f123 = f121, f38, f123 + ;; + FMPY f66 = f66, f39 + FMPY f98 = f98, f39 + FMPY f74 = f74, f39 + FMPY f106 = f106, f39 + FMPY f82 = f82, f39 + FMPY f114 = f114, f39 + FMPY f90 = f90, f39 + FMPY f122 = f122, f39 + ;; + FNMA f67 = f66, f40, f67 + FNMA f99 = f98, f40, f99 + FNMA f75 = f74, f40, f75 + FNMA f107 = f106, f40, f107 + FNMA f83 = f82, f40, f83 + FNMA f115 = f114, f40, f115 + FNMA f91 = f90, f40, f91 + FNMA f123 = f122, f40, f123 + ;; + FMPY f67 = f67, f41 + FMPY f99 = f99, f41 + FMPY f75 = f75, f41 + FMPY f107 = f107, f41 + FMPY f83 = f83, f41 + FMPY f115 = f115, f41 + FMPY f91 = f91, f41 + FMPY f123 = f123, f41 + ;; + { .mfi + STFD [BOFFSET] = f64, SIZE + } + { .mfi + STFD [BOFFSET2] = f96, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f72, SIZE + } + { .mfi + STFD [BOFFSET2] = f104, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f80, SIZE + } + { .mfi + STFD [BOFFSET2] = f112, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f88, 5 * SIZE + } + { .mfi + STFD [BOFFSET2] = f120, 5 * SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f73, SIZE + } + { .mfi + STFD [BOFFSET2] = f105, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f81, SIZE + } + { .mfi + STFD [BOFFSET2] = f113, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f89, 5 * SIZE + } + { .mfi + STFD [BOFFSET2] = f121, 5 * SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f74, SIZE + } + { .mfi + STFD [BOFFSET2] = f106, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f90, 5 * SIZE + } + { .mfi + STFD [BOFFSET2] = f122, 5 * SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f67, SIZE + } + { .mfi + STFD [BOFFSET2] = f99, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f75, SIZE + } + { .mfi + STFD [BOFFSET2] = f107, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f83, SIZE + } + { .mfi + STFD [BOFFSET2] = f115, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f91, -27 * SIZE + } + { .mfi + STFD [BOFFSET2] = f123, -27 * SIZE + } + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [BOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [BOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f47, f48 = [BOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [BOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [BOFFSET] + adds BOFFSET = 7 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f19, f20 = [BOFFSET] + adds BOFFSET = 9 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + adds BOFFSET = -63 * SIZE, BOFFSET + ;; + + + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + FMPY f66 = f66, f32 + FMPY f67 = f67, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + FNMA f74 = f66, f33, f74 + FNMA f75 = f67, f33, f75 + ;; + FNMA f80 = f64, f34, f80 + FNMA f81 = f65, f34, f81 + FNMA f82 = f66, f34, f82 + FNMA f83 = f67, f34, f83 + ;; + FNMA f88 = f64, f35, f88 + FNMA f89 = f65, f35, f89 + FNMA f90 = f66, f35, f90 + FNMA f91 = f67, f35, f91 + ;; + FNMA f96 = f64, f36, f96 + FNMA f97 = f65, f36, f97 + FNMA f98 = f66, f36, f98 + FNMA f99 = f67, f36, f99 + ;; + FNMA f104 = f64, f37, f104 + FNMA f105 = f65, f37, f105 + FNMA f106 = f66, f37, f106 + FNMA f107 = f67, f37, f107 + ;; + FNMA f112 = f64, f38, f112 + FNMA f113 = f65, f38, f113 + FNMA f114 = f66, f38, f114 + FNMA f115 = f67, f38, f115 + ;; + FNMA f120 = f64, f39, f120 + FNMA f121 = f65, f39, f121 + FNMA f122 = f66, f39, f122 + FNMA f123 = f67, f39, f123 + ;; + FMPY f72 = f72, f40 + FMPY f73 = f73, f40 + FMPY f74 = f74, f40 + FMPY f75 = f75, f40 + ;; + FNMA f80 = f72, f41, f80 + FNMA f81 = f73, f41, f81 + FNMA f82 = f74, f41, f82 + FNMA f83 = f75, f41, f83 + ;; + FNMA f88 = f72, f42, f88 + FNMA f89 = f73, f42, f89 + FNMA f90 = f74, f42, f90 + FNMA f91 = f75, f42, f91 + ;; + FNMA f96 = f72, f43, f96 + FNMA f97 = f73, f43, f97 + FNMA f98 = f74, f43, f98 + FNMA f99 = f75, f43, f99 + ;; + FNMA f104 = f72, f44, f104 + FNMA f105 = f73, f44, f105 + FNMA f106 = f74, f44, f106 + FNMA f107 = f75, f44, f107 + ;; + FNMA f112 = f72, f45, f112 + FNMA f113 = f73, f45, f113 + FNMA f114 = f74, f45, f114 + FNMA f115 = f75, f45, f115 + ;; + FNMA f120 = f72, f46, f120 + FNMA f121 = f73, f46, f121 + FNMA f122 = f74, f46, f122 + FNMA f123 = f75, f46, f123 + ;; + FMPY f80 = f80, f47 + FMPY f81 = f81, f47 + FMPY f82 = f82, f47 + FMPY f83 = f83, f47 + ;; + FNMA f88 = f80, f48, f88 + FNMA f89 = f81, f48, f89 + FNMA f90 = f82, f48, f90 + FNMA f91 = f83, f48, f91 + ;; + FNMA f96 = f80, f49, f96 + FNMA f97 = f81, f49, f97 + FNMA f98 = f82, f49, f98 + FNMA f99 = f83, f49, f99 + ;; + FNMA f104 = f80, f50, f104 + FNMA f105 = f81, f50, f105 + FNMA f106 = f82, f50, f106 + FNMA f107 = f83, f50, f107 + ;; + FNMA f112 = f80, f51, f112 + FNMA f113 = f81, f51, f113 + FNMA f114 = f82, f51, f114 + FNMA f115 = f83, f51, f115 + ;; + FNMA f120 = f80, f52, f120 + FNMA f121 = f81, f52, f121 + FNMA f122 = f82, f52, f122 + FNMA f123 = f83, f52, f123 + ;; + FMPY f88 = f88, f53 + FMPY f89 = f89, f53 + FMPY f90 = f90, f53 + FMPY f91 = f91, f53 + ;; + FNMA f96 = f88, f54, f96 + FNMA f97 = f89, f54, f97 + FNMA f98 = f90, f54, f98 + FNMA f99 = f91, f54, f99 + ;; + FNMA f104 = f88, f55, f104 + FNMA f105 = f89, f55, f105 + FNMA f106 = f90, f55, f106 + FNMA f107 = f91, f55, f107 + ;; + FNMA f112 = f88, f56, f112 + FNMA f113 = f89, f56, f113 + FNMA f114 = f90, f56, f114 + FNMA f115 = f91, f56, f115 + ;; + FNMA f120 = f88, f57, f120 + FNMA f121 = f89, f57, f121 + FNMA f122 = f90, f57, f122 + FNMA f123 = f91, f57, f123 + ;; + FMPY f96 = f96, f58 + FMPY f97 = f97, f58 + FMPY f98 = f98, f58 + FMPY f99 = f99, f58 + ;; + FNMA f104 = f96, f59, f104 + FNMA f105 = f97, f59, f105 + FNMA f106 = f98, f59, f106 + FNMA f107 = f99, f59, f107 + ;; + FNMA f112 = f96, f60, f112 + FNMA f113 = f97, f60, f113 + FNMA f114 = f98, f60, f114 + FNMA f115 = f99, f60, f115 + ;; + FNMA f120 = f96, f61, f120 + FNMA f121 = f97, f61, f121 + FNMA f122 = f98, f61, f122 + FNMA f123 = f99, f61, f123 + ;; + FMPY f104 = f104, f16 + FMPY f105 = f105, f16 + FMPY f106 = f106, f16 + FMPY f107 = f107, f16 + ;; + FNMA f112 = f104, f17, f112 + FNMA f113 = f105, f17, f113 + FNMA f114 = f106, f17, f114 + FNMA f115 = f107, f17, f115 + ;; + FNMA f120 = f104, f18, f120 + FNMA f121 = f105, f18, f121 + FNMA f122 = f106, f18, f122 + FNMA f123 = f107, f18, f123 + ;; + FMPY f112 = f112, f19 + FMPY f113 = f113, f19 + FMPY f114 = f114, f19 + FMPY f115 = f115, f19 + ;; + FNMA f120 = f112, f20, f120 + FNMA f121 = f113, f20, f121 + FNMA f122 = f114, f20, f122 + FNMA f123 = f115, f20, f123 + ;; + FMPY f120 = f120, f21 + FMPY f121 = f121, f21 + FMPY f122 = f122, f21 + FMPY f123 = f123, f21 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f72, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f74, SIZE + ;; + STFD [AOFFSET] = f67, 5 * SIZE + STFD [AOFFSET2] = f75, 5 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f89, SIZE + ;; + STFD [AOFFSET] = f82, SIZE + STFD [AOFFSET2] = f90, SIZE + ;; + STFD [AOFFSET] = f83, 5 * SIZE + STFD [AOFFSET2] = f91, 5 * SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f104, SIZE + ;; + STFD [AOFFSET] = f97, SIZE + STFD [AOFFSET2] = f105, SIZE + ;; + STFD [AOFFSET] = f98, SIZE + STFD [AOFFSET2] = f106, SIZE + ;; + STFD [AOFFSET] = f99, 5 * SIZE + STFD [AOFFSET2] = f107, 5 * SIZE + ;; + STFD [AOFFSET] = f112, SIZE + STFD [AOFFSET2] = f120, SIZE + ;; + STFD [AOFFSET] = f113, SIZE + STFD [AOFFSET2] = f121, SIZE + ;; + STFD [AOFFSET] = f114, SIZE + STFD [AOFFSET2] = f122, SIZE + ;; + STFD [AOFFSET] = f115, -27 * SIZE + STFD [AOFFSET2] = f123, - 27 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 62 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f35, f34 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f37, f36 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f39, f38 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], -2 * SIZE + ;; + LDFPD f42, f41 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f44, f43 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f46, f45 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f48, f47 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f50, f49 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f52, f51 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], -2 * SIZE + ;; + LDFPD f55, f54 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f57, f56 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFPD f59, f58 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f61, f60 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], -2 * SIZE + ;; + LDFPD f18, f17 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFPD f20, f19 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + ;; + + FMPY f120 = f120, f32 + FMPY f121 = f121, f32 + FMPY f122 = f122, f32 + FMPY f123 = f123, f32 + ;; + FNMA f112 = f120, f33, f112 + FNMA f113 = f121, f33, f113 + FNMA f114 = f122, f33, f114 + FNMA f115 = f123, f33, f115 + ;; + FNMA f104 = f120, f34, f104 + FNMA f105 = f121, f34, f105 + FNMA f106 = f122, f34, f106 + FNMA f107 = f123, f34, f107 + ;; + FNMA f96 = f120, f35, f96 + FNMA f97 = f121, f35, f97 + FNMA f98 = f122, f35, f98 + FNMA f99 = f123, f35, f99 + ;; + FNMA f88 = f120, f36, f88 + FNMA f89 = f121, f36, f89 + FNMA f90 = f122, f36, f90 + FNMA f91 = f123, f36, f91 + ;; + FNMA f80 = f120, f37, f80 + FNMA f81 = f121, f37, f81 + FNMA f82 = f122, f37, f82 + FNMA f83 = f123, f37, f83 + ;; + FNMA f72 = f120, f38, f72 + FNMA f73 = f121, f38, f73 + FNMA f74 = f122, f38, f74 + FNMA f75 = f123, f38, f75 + ;; + FNMA f64 = f120, f39, f64 + FNMA f65 = f121, f39, f65 + FNMA f66 = f122, f39, f66 + FNMA f67 = f123, f39, f67 + ;; + FMPY f112 = f112, f40 + FMPY f113 = f113, f40 + FMPY f114 = f114, f40 + FMPY f115 = f115, f40 + ;; + FNMA f104 = f112, f41, f104 + FNMA f105 = f113, f41, f105 + FNMA f106 = f114, f41, f106 + FNMA f107 = f115, f41, f107 + ;; + FNMA f96 = f112, f42, f96 + FNMA f97 = f113, f42, f97 + FNMA f98 = f114, f42, f98 + FNMA f99 = f115, f42, f99 + ;; + FNMA f88 = f112, f43, f88 + FNMA f89 = f113, f43, f89 + FNMA f90 = f114, f43, f90 + FNMA f91 = f115, f43, f91 + ;; + FNMA f80 = f112, f44, f80 + FNMA f81 = f113, f44, f81 + FNMA f82 = f114, f44, f82 + FNMA f83 = f115, f44, f83 + ;; + FNMA f72 = f112, f45, f72 + FNMA f73 = f113, f45, f73 + FNMA f74 = f114, f45, f74 + FNMA f75 = f115, f45, f75 + ;; + FNMA f64 = f112, f46, f64 + FNMA f65 = f113, f46, f65 + FNMA f66 = f114, f46, f66 + FNMA f67 = f115, f46, f67 + ;; + FMPY f104 = f104, f47 + FMPY f105 = f105, f47 + FMPY f106 = f106, f47 + FMPY f107 = f107, f47 + ;; + FNMA f96 = f104, f48, f96 + FNMA f97 = f105, f48, f97 + FNMA f98 = f106, f48, f98 + FNMA f99 = f107, f48, f99 + ;; + FNMA f88 = f104, f49, f88 + FNMA f89 = f105, f49, f89 + FNMA f90 = f106, f49, f90 + FNMA f91 = f107, f49, f91 + ;; + FNMA f80 = f104, f50, f80 + FNMA f81 = f105, f50, f81 + FNMA f82 = f106, f50, f82 + FNMA f83 = f107, f50, f83 + ;; + FNMA f72 = f104, f51, f72 + FNMA f73 = f105, f51, f73 + FNMA f74 = f106, f51, f74 + FNMA f75 = f107, f51, f75 + ;; + FNMA f64 = f104, f52, f64 + FNMA f65 = f105, f52, f65 + FNMA f66 = f106, f52, f66 + FNMA f67 = f107, f52, f67 + ;; + FMPY f96 = f96, f53 + FMPY f97 = f97, f53 + FMPY f98 = f98, f53 + FMPY f99 = f99, f53 + ;; + FNMA f88 = f96, f54, f88 + FNMA f89 = f97, f54, f89 + FNMA f90 = f98, f54, f90 + FNMA f91 = f99, f54, f91 + ;; + FNMA f80 = f96, f55, f80 + FNMA f81 = f97, f55, f81 + FNMA f82 = f98, f55, f82 + FNMA f83 = f99, f55, f83 + ;; + FNMA f72 = f96, f56, f72 + FNMA f73 = f97, f56, f73 + FNMA f74 = f98, f56, f74 + FNMA f75 = f99, f56, f75 + ;; + FNMA f64 = f96, f57, f64 + FNMA f65 = f97, f57, f65 + FNMA f66 = f98, f57, f66 + FNMA f67 = f99, f57, f67 + ;; + FMPY f88 = f88, f58 + FMPY f89 = f89, f58 + FMPY f90 = f90, f58 + FMPY f91 = f91, f58 + ;; + FNMA f80 = f88, f59, f80 + FNMA f81 = f89, f59, f81 + FNMA f82 = f90, f59, f82 + FNMA f83 = f91, f59, f83 + ;; + FNMA f72 = f88, f60, f72 + FNMA f73 = f89, f60, f73 + FNMA f74 = f90, f60, f74 + FNMA f75 = f91, f60, f75 + ;; + FNMA f64 = f88, f61, f64 + FNMA f65 = f89, f61, f65 + FNMA f66 = f90, f61, f66 + FNMA f67 = f91, f61, f67 + ;; + FMPY f80 = f80, f16 + FMPY f81 = f81, f16 + FMPY f82 = f82, f16 + FMPY f83 = f83, f16 + ;; + FNMA f72 = f80, f17, f72 + FNMA f73 = f81, f17, f73 + FNMA f74 = f82, f17, f74 + FNMA f75 = f83, f17, f75 + ;; + FNMA f64 = f80, f18, f64 + FNMA f65 = f81, f18, f65 + FNMA f66 = f82, f18, f66 + FNMA f67 = f83, f18, f67 + ;; + FMPY f72 = f72, f19 + FMPY f73 = f73, f19 + FMPY f74 = f74, f19 + FMPY f75 = f75, f19 + ;; + FNMA f64 = f72, f20, f64 + FNMA f65 = f73, f20, f65 + FNMA f66 = f74, f20, f66 + FNMA f67 = f75, f20, f67 + ;; + FMPY f64 = f64, f21 + FMPY f65 = f65, f21 + FMPY f66 = f66, f21 + FMPY f67 = f67, f21 + ;; + adds AOFFSET = 24 * SIZE, AOFFSET + adds AOFFSET2 = 24 * SIZE, AOFFSET2 + ;; + STFD [AOFFSET] = f112, SIZE + STFD [AOFFSET2] = f120, SIZE + ;; + STFD [AOFFSET] = f113, SIZE + STFD [AOFFSET2] = f121, SIZE + ;; + STFD [AOFFSET] = f114, SIZE + STFD [AOFFSET2] = f122, SIZE + ;; + STFD [AOFFSET] = f115, - 11 * SIZE + STFD [AOFFSET2] = f123, - 11 * SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f104, SIZE + ;; + STFD [AOFFSET] = f97, SIZE + STFD [AOFFSET2] = f105, SIZE + ;; + STFD [AOFFSET] = f98, SIZE + STFD [AOFFSET2] = f106, SIZE + ;; + STFD [AOFFSET] = f99, - 11 * SIZE + STFD [AOFFSET2] = f107, - 11 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f89, SIZE + ;; + STFD [AOFFSET] = f82, SIZE + STFD [AOFFSET2] = f90, SIZE + ;; + STFD [AOFFSET] = f83, - 11 * SIZE + STFD [AOFFSET2] = f91, - 11 * SIZE + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f72, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f74, SIZE + ;; + STFD [AOFFSET] = f67, - 3 * SIZE + STFD [AOFFSET2] = f75, - 3 * SIZE + ;; + +#endif + { .mmf + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE +#ifdef LN + adds C3 = -4 * SIZE, C3 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifndef LN + STFD [C1 ] = f67, SIZE +#else + STFD [C1 ] = f67, - 3 * SIZE +#endif + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE +#ifdef LN + adds C4 = -4 * SIZE, C4 +#else + nop __LINE__ +#endif + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C2 ] = f75, SIZE +#else + STFD [C2 ] = f75, - 3 * SIZE +#endif +#ifdef LN + adds C5 = -4 * SIZE, C5 +#else + nop __LINE__ +#endif + } + ;; + { .mmf + STFD [C3 ] = f80, SIZE + mov f80 = f0 + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + } + ;; + { .mmi + STFD [C3 ] = f82, SIZE +#ifdef LN + adds C6 = -4 * SIZE, C6 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifndef LN + STFD [C3 ] = f83, SIZE +#else + STFD [C3 ] = f83, - 3 * SIZE +#endif + } + ;; + { .mmf + STFD [C4 ] = f88, SIZE + mov f88 = f0 + } + ;; + { .mmi + STFD [C4 ] = f89, SIZE +#ifdef LN + adds C8 = -4 * SIZE, C8 +#else + nop __LINE__ +#endif + } + ;; + { .mmi + STFD [C4 ] = f90, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C4 ] = f91, SIZE +#else + STFD [C4 ] = f91, - 3 * SIZE +#endif + nop __LINE__ + } + ;; + { .mmf + STFD [C5 ] = f96, SIZE + mov f96 = f0 + } + ;; + { .mmi + STFD [C5 ] = f97, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C5 ] = f98, SIZE +#ifdef LN + adds C7 = -4 * SIZE, C7 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifndef LN + STFD [C5 ] = f99, SIZE +#else + STFD [C5 ] = f99, - 3 * SIZE +#endif + } + ;; + { .mmf + STFD [C6 ] = f104, SIZE + mov f104 = f0 + } + ;; + { .mmi + STFD [C6 ] = f105, SIZE + shladd r2 = K, BASE_SHIFT, r0 + } + ;; + { .mmi + STFD [C6 ] = f106, SIZE + sub L = K, KK + } + ;; + { .mmi +#ifndef LN + STFD [C6 ] = f107, SIZE +#else + STFD [C6 ] = f107, - 3 * SIZE +#endif +#ifdef RT + shladd AORIG = r2, 2, AORIG +#else + nop __LINE__ +#endif + } + ;; + { .mmf + STFD [C7 ] = f112, SIZE + mov f112 = f0 + } + ;; + { .mmi + STFD [C7 ] = f113, SIZE +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi + STFD [C7 ] = f114, SIZE +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 2, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifndef LN + STFD [C7 ] = f115, SIZE +#else + STFD [C7 ] = f115, - 3 * SIZE +#endif +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 3, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + STFD [C8 ] = f120, SIZE + mov f120 = f0 + } + ;; + { .mmi + STFD [C8 ] = f121, SIZE +#ifdef LT + adds KK = 4, KK +#elif defined LN + adds KK = -4, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi + STFD [C8 ] = f122, SIZE +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmb +#ifndef LN + STFD [C8 ] = f123, SIZE +#else + STFD [C8 ] = f123, - 3 * SIZE +#endif + } + ;; + .align 8 + +.L030: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p0 = M, 1 + (p6) br.cond.dptk .L040 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + setf.d f73 = r0 + mov f65 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 3, B + mov f65 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + { .mfi + setf.d f105 = r0 + mov f81 = f0 + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + mov f89 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f113 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + setf.d f97 = r0 + mov f121 = f0 + shr L = L, 1 + } + ;; + { .mmf + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + adds L = -1, L + } + ;; + { .mmf + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L038 + } + ;; + +.L032: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + br.cloop.sptk.few .L032 + } + ;; + +.L038: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -8, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + shladd BOFFSET = r2, 3, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET] + adds BOFFSET = -14 * SIZE, BOFFSET + ;; + { .mfi + FSUB f64 = f32, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f72 = f33, f72 + nop __LINE__ + } + ;; + { .mfi + FSUB f80 = f34, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f88 = f35, f88 + nop __LINE__ + } + ;; + { .mfi + FSUB f96 = f36, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f104 = f37, f104 + nop __LINE__ + } + ;; + { .mfi + FSUB f112 = f38, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f120 = f39, f120 + nop __LINE__ + } + ;; + { .mfi + FSUB f65 = f40, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f73 = f41, f73 + nop __LINE__ + } + ;; + { .mfi + FSUB f81 = f42, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f89 = f43, f89 + nop __LINE__ + } + ;; + { .mfi + FSUB f97 = f44, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f105 = f45, f105 + nop __LINE__ + } + ;; + { .mfi + FSUB f113 = f46, f113 + } + { .mfi + nop __LINE__ + FSUB f121 = f47, f121 + nop __LINE__ + } + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET] + adds AOFFSET = -14 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + + FSUB f72 = f34, f72 + FSUB f73 = f35, f73 + + FSUB f80 = f36, f80 + FSUB f81 = f37, f81 + + FSUB f88 = f38, f88 + FSUB f89 = f39, f89 + ;; + FSUB f96 = f40, f96 + FSUB f97 = f41, f97 + ;; + FSUB f104 = f42, f104 + FSUB f105 = f43, f105 + ;; + FSUB f112 = f44, f112 + FSUB f113 = f45, f113 + ;; + FSUB f120 = f46, f120 + FSUB f121 = f47, f121 + ;; +#endif + +#ifdef LN + adds AOFFSET = 2 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET] + ;; + FMPY f65 = f65, f32 + FMPY f97 = f97, f32 + FMPY f73 = f73, f32 + FMPY f105 = f105, f32 + FMPY f81 = f81, f32 + FMPY f113 = f113, f32 + FMPY f89 = f89, f32 + FMPY f121 = f121, f32 + ;; + FNMA f64 = f65, f33, f64 + FNMA f96 = f97, f33, f96 + FNMA f72 = f73, f33, f72 + FNMA f104 = f105, f33, f104 + FNMA f80 = f81, f33, f80 + FNMA f112 = f113, f33, f112 + FNMA f88 = f89, f33, f88 + FNMA f120 = f121, f33, f120 + ;; + FMPY f64 = f64, f34 + FMPY f96 = f96, f34 + FMPY f72 = f72, f34 + FMPY f104 = f104, f34 + FMPY f80 = f80, f34 + FMPY f112 = f112, f34 + FMPY f88 = f88, f34 + FMPY f120 = f120, f34 + ;; + adds BOFFSET = 8 * SIZE, BOFFSET + adds BOFFSET2 = 8 * SIZE, BOFFSET2 + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f73, SIZE + } + { .mfi + STFD [BOFFSET2] = f105, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f81, SIZE + } + { .mfi + STFD [BOFFSET2] = f113, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f89, - 11 * SIZE + } + { .mfi + STFD [BOFFSET2] = f121, - 11 * SIZE + } + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + adds C1 = -2 * SIZE, C1 + } + ;; + { .mmi + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f104, SIZE + adds C2 = -2 * SIZE, C2 + } + ;; + { .mmi + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f88, - 3 * SIZE + STFD [BOFFSET2] = f120, - 3 * SIZE + } + ;; + adds C3 = -2 * SIZE, C3 + adds C4 = -2 * SIZE, C4 + adds C5 = -2 * SIZE, C5 + adds C6 = -2 * SIZE, C6 + adds C7 = -2 * SIZE, C7 + adds C8 = -2 * SIZE, C8 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET], - 3 * SIZE + ;; + { .mfi + FMPY f64 = f64, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f96 = f96, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f72 = f72, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f104 = f104, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f80 = f80, f32 + } + { .mfi + nop __LINE__ + FMPY f112 = f112, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f88 = f88, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f120 = f120, f32 + nop __LINE__ + } + ;; + { .mfi + FNMA f65 = f64, f33, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f97 = f96, f33, f97 + nop __LINE__ + } + ;; + { .mfi + FNMA f73 = f72, f33, f73 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f105 = f104, f33, f105 + nop __LINE__ + } + ;; + { .mfi + FNMA f81 = f80, f33, f81 + } + { .mfi + nop __LINE__ + FNMA f113 = f112, f33, f113 + nop __LINE__ + } + ;; + { .mfi + FNMA f89 = f88, f33, f89 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f121 = f120, f33, f121 + nop __LINE__ + } + ;; + FMPY f65 = f65, f34 + FMPY f97 = f97, f34 + FMPY f73 = f73, f34 + FMPY f105 = f105, f34 + FMPY f81 = f81, f34 + FMPY f113 = f113, f34 + FMPY f89 = f89, f34 + FMPY f121 = f121, f34 + ;; + { .mfi + STFD [BOFFSET] = f64, SIZE + } + { .mfi + STFD [BOFFSET2] = f96, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f72, SIZE + } + { .mfi + STFD [BOFFSET2] = f104, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f80, SIZE + } + { .mfi + STFD [BOFFSET2] = f112, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f88, 5 * SIZE + } + { .mfi + STFD [BOFFSET2] = f120, 5 * SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f73, SIZE + } + { .mfi + STFD [BOFFSET2] = f105, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f81, SIZE + } + { .mfi + STFD [BOFFSET2] = f113, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f89, -11 * SIZE + } + { .mfi + STFD [BOFFSET2] = f121, -11 * SIZE + } +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [BOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [BOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f47, f48 = [BOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [BOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [BOFFSET] + adds BOFFSET = 7 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f19, f20 = [BOFFSET] + adds BOFFSET = 9 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + adds BOFFSET = -63 * SIZE, BOFFSET + ;; + + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + ;; + FNMA f80 = f64, f34, f80 + FNMA f81 = f65, f34, f81 + ;; + FNMA f88 = f64, f35, f88 + FNMA f89 = f65, f35, f89 + ;; + FNMA f96 = f64, f36, f96 + FNMA f97 = f65, f36, f97 + ;; + FNMA f104 = f64, f37, f104 + FNMA f105 = f65, f37, f105 + ;; + FNMA f112 = f64, f38, f112 + FNMA f113 = f65, f38, f113 + ;; + FNMA f120 = f64, f39, f120 + FNMA f121 = f65, f39, f121 + ;; + FMPY f72 = f72, f40 + FMPY f73 = f73, f40 + ;; + FNMA f80 = f72, f41, f80 + FNMA f81 = f73, f41, f81 + ;; + FNMA f88 = f72, f42, f88 + FNMA f89 = f73, f42, f89 + ;; + FNMA f96 = f72, f43, f96 + FNMA f97 = f73, f43, f97 + ;; + FNMA f104 = f72, f44, f104 + FNMA f105 = f73, f44, f105 + ;; + FNMA f112 = f72, f45, f112 + FNMA f113 = f73, f45, f113 + ;; + FNMA f120 = f72, f46, f120 + FNMA f121 = f73, f46, f121 + ;; + FMPY f80 = f80, f47 + FMPY f81 = f81, f47 + ;; + FNMA f88 = f80, f48, f88 + FNMA f89 = f81, f48, f89 + ;; + FNMA f96 = f80, f49, f96 + FNMA f97 = f81, f49, f97 + ;; + FNMA f104 = f80, f50, f104 + FNMA f105 = f81, f50, f105 + ;; + FNMA f112 = f80, f51, f112 + FNMA f113 = f81, f51, f113 + ;; + FNMA f120 = f80, f52, f120 + FNMA f121 = f81, f52, f121 + ;; + FMPY f88 = f88, f53 + FMPY f89 = f89, f53 + ;; + FNMA f96 = f88, f54, f96 + FNMA f97 = f89, f54, f97 + ;; + FNMA f104 = f88, f55, f104 + FNMA f105 = f89, f55, f105 + ;; + FNMA f112 = f88, f56, f112 + FNMA f113 = f89, f56, f113 + ;; + FNMA f120 = f88, f57, f120 + FNMA f121 = f89, f57, f121 + ;; + FMPY f96 = f96, f58 + FMPY f97 = f97, f58 + ;; + FNMA f104 = f96, f59, f104 + FNMA f105 = f97, f59, f105 + ;; + FNMA f112 = f96, f60, f112 + FNMA f113 = f97, f60, f113 + ;; + FNMA f120 = f96, f61, f120 + FNMA f121 = f97, f61, f121 + ;; + FMPY f104 = f104, f16 + FMPY f105 = f105, f16 + ;; + FNMA f112 = f104, f17, f112 + FNMA f113 = f105, f17, f113 + ;; + FNMA f120 = f104, f18, f120 + FNMA f121 = f105, f18, f121 + ;; + FMPY f112 = f112, f19 + FMPY f113 = f113, f19 + ;; + FNMA f120 = f112, f20, f120 + FNMA f121 = f113, f20, f121 + ;; + FMPY f120 = f120, f21 + FMPY f121 = f121, f21 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f80, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f73, 5 * SIZE + STFD [AOFFSET2] = f89, 5 * SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f97, SIZE + STFD [AOFFSET2] = f113, SIZE + ;; + STFD [AOFFSET] = f104, SIZE + STFD [AOFFSET2] = f120, SIZE + ;; + STFD [AOFFSET] = f105, -11 * SIZE + STFD [AOFFSET2] = f121, - 11 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 62 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f35, f34 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f37, f36 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f39, f38 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], -2 * SIZE + ;; + LDFPD f42, f41 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f44, f43 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f46, f45 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f48, f47 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f50, f49 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f52, f51 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], -2 * SIZE + ;; + LDFPD f55, f54 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f57, f56 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFPD f59, f58 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f61, f60 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], -2 * SIZE + ;; + LDFPD f18, f17 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFPD f20, f19 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + ;; + + FMPY f120 = f120, f32 + FMPY f121 = f121, f32 + ;; + FNMA f112 = f120, f33, f112 + FNMA f113 = f121, f33, f113 + ;; + FNMA f104 = f120, f34, f104 + FNMA f105 = f121, f34, f105 + ;; + FNMA f96 = f120, f35, f96 + FNMA f97 = f121, f35, f97 + ;; + FNMA f88 = f120, f36, f88 + FNMA f89 = f121, f36, f89 + ;; + FNMA f80 = f120, f37, f80 + FNMA f81 = f121, f37, f81 + ;; + FNMA f72 = f120, f38, f72 + FNMA f73 = f121, f38, f73 + ;; + FNMA f64 = f120, f39, f64 + FNMA f65 = f121, f39, f65 + ;; + FMPY f112 = f112, f40 + FMPY f113 = f113, f40 + ;; + FNMA f104 = f112, f41, f104 + FNMA f105 = f113, f41, f105 + ;; + FNMA f96 = f112, f42, f96 + FNMA f97 = f113, f42, f97 + ;; + FNMA f88 = f112, f43, f88 + FNMA f89 = f113, f43, f89 + ;; + FNMA f80 = f112, f44, f80 + FNMA f81 = f113, f44, f81 + ;; + FNMA f72 = f112, f45, f72 + FNMA f73 = f113, f45, f73 + ;; + FNMA f64 = f112, f46, f64 + FNMA f65 = f113, f46, f65 + ;; + FMPY f104 = f104, f47 + FMPY f105 = f105, f47 + ;; + FNMA f96 = f104, f48, f96 + FNMA f97 = f105, f48, f97 + ;; + FNMA f88 = f104, f49, f88 + FNMA f89 = f105, f49, f89 + ;; + FNMA f80 = f104, f50, f80 + FNMA f81 = f105, f50, f81 + ;; + FNMA f72 = f104, f51, f72 + FNMA f73 = f105, f51, f73 + ;; + FNMA f64 = f104, f52, f64 + FNMA f65 = f105, f52, f65 + ;; + FMPY f96 = f96, f53 + FMPY f97 = f97, f53 + ;; + FNMA f88 = f96, f54, f88 + FNMA f89 = f97, f54, f89 + ;; + FNMA f80 = f96, f55, f80 + FNMA f81 = f97, f55, f81 + ;; + FNMA f72 = f96, f56, f72 + FNMA f73 = f97, f56, f73 + ;; + FNMA f64 = f96, f57, f64 + FNMA f65 = f97, f57, f65 + ;; + FMPY f88 = f88, f58 + FMPY f89 = f89, f58 + ;; + FNMA f80 = f88, f59, f80 + FNMA f81 = f89, f59, f81 + ;; + FNMA f72 = f88, f60, f72 + FNMA f73 = f89, f60, f73 + ;; + FNMA f64 = f88, f61, f64 + FNMA f65 = f89, f61, f65 + ;; + FMPY f80 = f80, f16 + FMPY f81 = f81, f16 + ;; + FNMA f72 = f80, f17, f72 + FNMA f73 = f81, f17, f73 + ;; + FNMA f64 = f80, f18, f64 + FNMA f65 = f81, f18, f65 + ;; + FMPY f72 = f72, f19 + FMPY f73 = f73, f19 + ;; + FNMA f64 = f72, f20, f64 + FNMA f65 = f73, f20, f65 + ;; + FMPY f64 = f64, f21 + FMPY f65 = f65, f21 + ;; + adds AOFFSET = 8 * SIZE, AOFFSET + adds AOFFSET2 = 8 * SIZE, AOFFSET2 + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f97, SIZE + STFD [AOFFSET2] = f113, SIZE + ;; + STFD [AOFFSET] = f104, SIZE + STFD [AOFFSET2] = f120, SIZE + ;; + STFD [AOFFSET] = f105, - 11 * SIZE + STFD [AOFFSET2] = f121, - 11 * SIZE + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f80, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f73, - 3 * SIZE + STFD [AOFFSET2] = f89, - 3 * SIZE + ;; + +#endif + STFD [C1 ] = f64, SIZE + mov f64 = f0 + ;; +#ifndef LN + STFD [C1 ] = f65, SIZE +#else + STFD [C1 ] = f65, -SIZE +#endif + ;; + STFD [C2 ] = f72, SIZE + mov f72 = f0 + ;; +#ifndef LN + STFD [C2 ] = f73, SIZE +#else + STFD [C2 ] = f73, -SIZE +#endif + ;; + STFD [C3 ] = f80, SIZE + mov f80 = f0 + ;; +#ifndef LN + STFD [C3 ] = f81, SIZE +#else + STFD [C3 ] = f81, - SIZE +#endif + ;; + STFD [C4 ] = f88, SIZE + mov f88 = f0 + ;; +#ifndef LN + STFD [C4 ] = f89, SIZE +#else + STFD [C4 ] = f89, -SIZE +#endif + ;; + STFD [C5 ] = f96, SIZE + mov f96 = f0 + ;; +#ifndef LN + STFD [C5 ] = f97, SIZE +#else + STFD [C5 ] = f97, -SIZE +#endif + ;; + STFD [C6 ] = f104, SIZE + mov f104 = f0 + ;; +#ifndef LN + STFD [C6 ] = f105, SIZE +#else + STFD [C6 ] = f105, -SIZE +#endif + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#else + nop __LINE__ +#endif + ;; + STFD [C7 ] = f112, SIZE + mov f112 = f0 + ;; + { .mmi +#ifndef LN + STFD [C7 ] = f113, SIZE +#else + STFD [C7 ] = f113, -SIZE +#endif + +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 1, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 3, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + STFD [C8 ] = f120, SIZE + mov f120 = f0 + } + ;; + { .mmi +#ifndef LN + STFD [C8 ] = f121, SIZE +#else + STFD [C8 ] = f121, -SIZE +#endif + +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + .align 8 + +.L040: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p0 = M, 0 + (p6) br.cond.dptk .L049 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 0 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 3, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + add AOFFSET = r3, AORIG + } + ;; +#endif + { .mmi + adds L = 1, L + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mii + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + adds L = -1, L + } + ;; + { .mmi + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFD f32 = [AOFFSET], 1 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L048 + } + ;; + +.L042: + { .mfb + lfetch.nt1 [PREB], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + adds L = -1, L + } + { .mmb + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + { .mmb + nop __LINE__ + nop __LINE__ + br.cloop.sptk.few .L042 + } + ;; + +.L048: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -8, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + shladd BOFFSET = r2, 3, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + { .mfi + FSUB f64 = f32, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f72 = f33, f72 + nop __LINE__ + } + ;; + { .mfi + FSUB f80 = f34, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f88 = f35, f88 + nop __LINE__ + } + ;; + { .mfi + FSUB f96 = f36, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f104 = f37, f104 + nop __LINE__ + } + ;; + { .mfi + FSUB f112 = f38, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f120 = f39, f120 + nop __LINE__ + } + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + FSUB f96 = f36, f96 + FSUB f104 = f37, f104 + FSUB f112 = f38, f112 + FSUB f120 = f39, f120 + ;; +#endif + +#ifdef LN + LDFD f32 = [AOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f96 = f96, f32 + FMPY f72 = f72, f32 + FMPY f104 = f104, f32 + FMPY f80 = f80, f32 + FMPY f112 = f112, f32 + FMPY f88 = f88, f32 + FMPY f120 = f120, f32 + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + adds C1 = -1 * SIZE, C1 + } + ;; + { .mmi + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f104, SIZE + adds C2 = -1 * SIZE, C2 + } + ;; + { .mmi + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f88, - 3 * SIZE + STFD [BOFFSET2] = f120, - 3 * SIZE + } + ;; + adds C3 = -1 * SIZE, C3 + adds C4 = -1 * SIZE, C4 + adds C5 = -1 * SIZE, C5 + adds C6 = -1 * SIZE, C6 + adds C7 = -1 * SIZE, C7 + adds C8 = -1 * SIZE, C8 + ;; +#endif + +#ifdef LT + LDFD f32 = [AOFFSET] + ;; + { .mfi + FMPY f64 = f64, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f96 = f96, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f72 = f72, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f104 = f104, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f80 = f80, f32 + } + { .mfi + nop __LINE__ + FMPY f112 = f112, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f88 = f88, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f120 = f120, f32 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f64, SIZE + } + { .mfi + STFD [BOFFSET2] = f96, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f72, SIZE + } + { .mfi + STFD [BOFFSET2] = f104, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f80, SIZE + } + { .mfi + STFD [BOFFSET2] = f112, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f88, -3 * SIZE + } + { .mfi + STFD [BOFFSET2] = f120, -3 * SIZE + } + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [BOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [BOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f47, f48 = [BOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [BOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [BOFFSET] + adds BOFFSET = 7 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f19, f20 = [BOFFSET] + adds BOFFSET = 9 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + adds BOFFSET = -63 * SIZE, BOFFSET + ;; + + FMPY f64 = f64, f32 + ;; + FNMA f72 = f64, f33, f72 + ;; + FNMA f80 = f64, f34, f80 + ;; + FNMA f88 = f64, f35, f88 + ;; + FNMA f96 = f64, f36, f96 + ;; + FNMA f104 = f64, f37, f104 + ;; + FNMA f112 = f64, f38, f112 + ;; + FNMA f120 = f64, f39, f120 + ;; + FMPY f72 = f72, f40 + ;; + FNMA f80 = f72, f41, f80 + ;; + FNMA f88 = f72, f42, f88 + ;; + FNMA f96 = f72, f43, f96 + ;; + FNMA f104 = f72, f44, f104 + ;; + FNMA f112 = f72, f45, f112 + ;; + FNMA f120 = f72, f46, f120 + ;; + FMPY f80 = f80, f47 + ;; + FNMA f88 = f80, f48, f88 + ;; + FNMA f96 = f80, f49, f96 + ;; + FNMA f104 = f80, f50, f104 + ;; + FNMA f112 = f80, f51, f112 + ;; + FNMA f120 = f80, f52, f120 + ;; + FMPY f88 = f88, f53 + ;; + FNMA f96 = f88, f54, f96 + ;; + FNMA f104 = f88, f55, f104 + ;; + FNMA f112 = f88, f56, f112 + ;; + FNMA f120 = f88, f57, f120 + ;; + FMPY f96 = f96, f58 + ;; + FNMA f104 = f96, f59, f104 + ;; + FNMA f112 = f96, f60, f112 + ;; + FNMA f120 = f96, f61, f120 + ;; + FMPY f104 = f104, f16 + ;; + FNMA f112 = f104, f17, f112 + ;; + FNMA f120 = f104, f18, f120 + ;; + FMPY f112 = f112, f19 + ;; + FNMA f120 = f112, f20, f120 + ;; + FMPY f120 = f120, f21 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f96, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f104, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f88, -3 * SIZE + STFD [AOFFSET2] = f120, - 3 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 62 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f35, f34 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f37, f36 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f39, f38 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], -2 * SIZE + ;; + LDFPD f42, f41 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f44, f43 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f46, f45 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f48, f47 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f50, f49 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f52, f51 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], -2 * SIZE + ;; + LDFPD f55, f54 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f57, f56 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFPD f59, f58 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f61, f60 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], -2 * SIZE + ;; + LDFPD f18, f17 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFPD f20, f19 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + ;; + + FMPY f120 = f120, f32 + ;; + FNMA f112 = f120, f33, f112 + ;; + FNMA f104 = f120, f34, f104 + ;; + FNMA f96 = f120, f35, f96 + ;; + FNMA f88 = f120, f36, f88 + ;; + FNMA f80 = f120, f37, f80 + ;; + FNMA f72 = f120, f38, f72 + ;; + FNMA f64 = f120, f39, f64 + ;; + FMPY f112 = f112, f40 + ;; + FNMA f104 = f112, f41, f104 + ;; + FNMA f96 = f112, f42, f96 + ;; + FNMA f88 = f112, f43, f88 + ;; + FNMA f80 = f112, f44, f80 + ;; + FNMA f72 = f112, f45, f72 + ;; + FNMA f64 = f112, f46, f64 + ;; + FMPY f104 = f104, f47 + ;; + FNMA f96 = f104, f48, f96 + ;; + FNMA f88 = f104, f49, f88 + ;; + FNMA f80 = f104, f50, f80 + ;; + FNMA f72 = f104, f51, f72 + ;; + FNMA f64 = f104, f52, f64 + ;; + FMPY f96 = f96, f53 + ;; + FNMA f88 = f96, f54, f88 + ;; + FNMA f80 = f96, f55, f80 + ;; + FNMA f72 = f96, f56, f72 + ;; + FNMA f64 = f96, f57, f64 + ;; + FMPY f88 = f88, f58 + ;; + FNMA f80 = f88, f59, f80 + ;; + FNMA f72 = f88, f60, f72 + ;; + FNMA f64 = f88, f61, f64 + ;; + FMPY f80 = f80, f16 + ;; + FNMA f72 = f80, f17, f72 + ;; + FNMA f64 = f80, f18, f64 + ;; + FMPY f72 = f72, f19 + ;; + FNMA f64 = f72, f20, f64 + ;; + FMPY f64 = f64, f21 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f96, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f104, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f88, - 3 * SIZE + STFD [AOFFSET2] = f120, - 3 * SIZE + ;; + +#endif + +#ifndef LN + STFD [C1 ] = f64, SIZE +#else + STFD [C1 ] = f64 +#endif +#ifndef LN + STFD [C2 ] = f72, SIZE +#else + STFD [C2 ] = f72 +#endif +#ifndef LN + STFD [C3 ] = f80, SIZE +#else + STFD [C3 ] = f80 +#endif +#ifndef LN + STFD [C4 ] = f88, SIZE +#else + STFD [C4 ] = f88 +#endif +#ifndef LN + STFD [C5 ] = f96, SIZE +#else + STFD [C5 ] = f96 +#endif +#ifndef LN + STFD [C6 ] = f104, SIZE +#else + STFD [C6 ] = f104 +#endif +#ifndef LN + STFD [C7 ] = f112, SIZE +#else + STFD [C7 ] = f112 +#endif +#ifndef LN + STFD [C8 ] = f120, SIZE +#else + STFD [C8 ] = f120 +#endif + ;; + + mov f64 = f0 + mov f72 = f0 + mov f80 = f0 + mov f88 = f0 + mov f96 = f0 + mov f104 = f0 + mov f112 = f0 + mov f120 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + add AOFFSET = L, AOFFSET +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 3, BOFFSET +#else + nop __LINE__ +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 8 + +.L049: +#ifdef LN + shladd KK8 = K, BASE_SHIFT, r0 + ;; + shladd B = KK8, 3, B +#endif + +#if defined(LT) || defined(RN) + mov B = BOFFSET +#endif + +#ifdef RN + adds KK = 8, KK +#endif + +#ifdef RT + adds KK = -8, KK +#endif + ;; + + { .mmi + mov AOFFSET = A + } + ;; + { .mmb + nop __LINE__ + cmp.lt p6, p0 = 0, J + (p6) br.cond.dptk .L010 + } + ;; + .align 8 + + +.L999: + mov r8 = r0 + adds r9 = 1 * 16, SP + ;; + ldf.fill f16 = [SP], 32 + ldf.fill f17 = [r9], 32 + ;; + ldf.fill f18 = [SP], 32 + ldf.fill f19 = [r9], 32 + ;; + ldf.fill f20 = [SP], 32 + ldf.fill f21 = [r9], 32 + ;; + mov ar.lc = ARLC + ;; + mov pr = PR, -1 + ;; + mov ar.pfs = ARPFS + ;; + br.ret.sptk.many b0 + EPILOGUE diff --git a/kernel/ia64/xcopy.S b/kernel/ia64/xcopy.S new file mode 100644 index 0000000..e58f5ef --- /dev/null +++ b/kernel/ia64/xcopy.S @@ -0,0 +1,565 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r32 +#define X1 r33 +#define INCX r34 +#define Y1 r35 +#define INCY r36 + +#define PREX r2 +#define PREY r3 + +#define I r14 +#define J r15 + +#define X2 r16 +#define Y2 r17 +#define INCX2 r18 +#define INCY2 r19 +#define INCX8 r20 +#define INCY8 r21 +#define PR r30 +#define ARLC r31 + +#define PREFETCH_SIZE (8 * 16) + + PROLOGUE + .prologue + PROFCODE + { .mmi + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mib + cmp.lt p0, p6 = r0, N + shr I = N, 3 + (p6) br.ret.sptk.many b0 + } + ;; + shl INCX = INCX, ZBASE_SHIFT + shl INCY = INCY, ZBASE_SHIFT + ;; + .body + { .mmi + sub r8 = X1, Y1 + mov r9 = 0xf0 + mov PR = pr + } + { .mmi + shladd INCX2 = INCX, 1, r0 + shladd INCY2 = INCY, 1, r0 + and J = 15, N + } + ;; + { .mmi + shladd INCX8 = INCX, 2, r0 + shladd INCY8 = INCY, 2, r0 + mov pr.rot = 0 + } + { .mmi + and r8 = r9, r8 + cmp.eq p9, p0 = r0, J + adds I = -1, I + } + ;; + { .mmi + adds X2 = 1 * SIZE, X1 + adds Y2 = 1 * SIZE, Y1 + mov ar.ec = 4 + } + { .mmb + cmp.gt p6, p0 = 127, r8 + cmp.eq p16, p0 = r0, r0 + (p6) br.cond.dpnt .L20 + } + ;; + { .mmi + adds PREX = (PREFETCH_SIZE + 0) * SIZE, X1 + adds PREY = (PREFETCH_SIZE + 2) * SIZE, Y1 + mov ar.lc = I + } + { .mib + cmp.eq p8 ,p0 = -1, I + tbit.z p0, p12 = N, 2 + (p8) br.cond.dpnt .L15 + } + ;; + .align 16 + +.L12: + { .mmi + (p19) STFD [Y1] = f35 + (p19) STFD [Y2] = f39 + (p19) add Y1 = INCY, Y1 + } + { .mmi + (p17) LDFD f81 = [X1], INCX + (p17) LDFD f85 = [X2], INCX + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f43 + (p19) STFD [Y2] = f47 + (p19) add Y1 = INCY, Y1 + } + { .mmi + (p17) LDFD f89 = [X1], INCX + (p17) LDFD f93 = [X2], INCX + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f51 + (p19) STFD [Y2] = f55 + (p19) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFD f32 = [X1], INCX + (p16) LDFD f36 = [X2], INCX + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f59 + (p19) STFD [Y2] = f63 + (p19) add Y1 = INCY, Y1 + } + { .mmi + lfetch.fault.nt1 [PREX], INCX8 + lfetch.fault.excl.nt1 [PREY], INCY8 + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p16) LDFD f40 = [X1], INCX + (p16) LDFD f44 = [X2], INCX + nop __LINE__ + } + ;; + { .mmi + (p19) STFD [Y1] = f67 + (p19) STFD [Y2] = f71 + (p19) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFD f48 = [X1], INCX + (p16) LDFD f52 = [X2], INCX + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f75 + (p19) STFD [Y2] = f79 + (p19) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFD f56 = [X1], INCX + (p16) LDFD f60 = [X2], INCX + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f83 + (p19) STFD [Y2] = f87 + (p19) add Y1 = INCY, Y1 + } + { .mmi + lfetch.fault.nt1 [PREX], INCX8 + lfetch.fault.excl.nt1 [PREY], INCY8 + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f91 + (p19) STFD [Y2] = f95 + (p19) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFD f64 = [X1], INCX + (p16) LDFD f68 = [X2], INCX + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmb + (p16) LDFD f72 = [X1], INCX + (p16) LDFD f76 = [X2], INCX + br.ctop.sptk.few .L12 + } + ;; + .align 32 + +.L15: + { .mmi + (p12) LDFD f48 = [X1], INCX + (p12) LDFD f49 = [X2], INCX + mov ar.lc = ARLC + } + ;; + { .mmi + (p12) LDFD f50 = [X1], INCX + (p12) LDFD f51 = [X2], INCX + mov pr = PR, -65474 + } + ;; + { .mmb + (p12) LDFD f52 = [X1], INCX + (p12) LDFD f53 = [X2], INCX + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFD f54 = [X1], INCX + (p12) LDFD f55 = [X2], INCX + tbit.z p0, p13 = N, 1 + } + ;; + { .mmi + (p13) LDFD f56 = [X1], INCX + (p13) LDFD f57 = [X2], INCX + tbit.z p0, p14 = N, 0 + } + ;; + { .mmi + (p13) LDFD f58 = [X1], INCX + (p13) LDFD f59 = [X2], INCX + } + ;; + { .mmi + (p12) STFD [Y1] = f48 + (p12) STFD [Y2] = f49 + (p12) add Y1 = INCY, Y1 + } + { .mmi + (p14) LDFD f60 = [X1], INCX + (p14) LDFD f61 = [X2], INCX + (p12) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f50 + (p12) STFD [Y2] = f51 + (p12) add Y1 = INCY, Y1 + } + { .mmi + nop __LINE__ + (p12) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f52 + (p12) STFD [Y2] = f53 + (p12) add Y1 = INCY, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p12) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f54 + (p12) STFD [Y2] = f55 + (p12) add Y1 = INCY, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p12) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p13) STFD [Y1] = f56 + (p13) STFD [Y2] = f57 + (p13) add Y1 = INCY, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p13) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p13) STFD [Y1] = f58 + (p13) STFD [Y2] = f59 + (p13) add Y1 = INCY, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p13) add Y2 = INCY, Y2 + } + ;; + { .mmb + (p14) STFD [Y1] = f60 + (p14) STFD [Y2] = f61 + br.ret.sptk.many b0 + } + ;; + .align 16 + +.L20: + { .mmi + adds PREX = (PREFETCH_SIZE + 0) * SIZE, X1 + adds PREY = (PREFETCH_SIZE + 10) * SIZE, Y1 + mov ar.lc = I + } + { .mib + cmp.eq p8 ,p0 = -1, I + tbit.z p0, p12 = N, 2 + (p8) br.cond.dpnt .L25 + } + ;; + .align 16 + +.L22: + { .mmi + (p19) STFD [Y1] = f67 + (p19) STFD [Y2] = f71 + (p19) add Y1 = INCY, Y1 + } + { .mmi + (p17) LDFD f81 = [X1], INCX + (p17) LDFD f85 = [X2], INCX + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f75 + (p19) STFD [Y2] = f79 + (p19) add Y1 = INCY, Y1 + } + { .mmi + (p17) LDFD f89 = [X1], INCX + (p17) LDFD f93 = [X2], INCX + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f83 + (p19) STFD [Y2] = f87 + (p19) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFD f32 = [X1], INCX + (p16) LDFD f36 = [X2], INCX + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f91 + (p19) STFD [Y2] = f95 + (p19) add Y1 = INCY, Y1 + } + { .mmi + lfetch.fault.nt1 [PREX], INCX8 + lfetch.fault.excl.nt1 [PREY], INCY8 + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p16) LDFD f40 = [X1], INCX + (p16) LDFD f44 = [X2], INCX + nop __LINE__ + } + ;; + { .mmi + (p18) STFD [Y1] = f34 + (p18) STFD [Y2] = f38 + (p18) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFD f48 = [X1], INCX + (p16) LDFD f52 = [X2], INCX + (p18) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p18) STFD [Y1] = f42 + (p18) STFD [Y2] = f46 + (p18) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFD f56 = [X1], INCX + (p16) LDFD f60 = [X2], INCX + (p18) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p18) STFD [Y1] = f50 + (p18) STFD [Y2] = f54 + (p18) add Y1 = INCY, Y1 + } + { .mmi + lfetch.fault.nt1 [PREX], INCX8 + lfetch.fault.excl.nt1 [PREY], INCY8 + (p18) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p18) STFD [Y1] = f58 + (p18) STFD [Y2] = f62 + (p18) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFD f64 = [X1], INCX + (p16) LDFD f68 = [X2], INCX + (p18) add Y2 = INCY, Y2 + } + ;; + { .mmb + (p16) LDFD f72 = [X1], INCX + (p16) LDFD f76 = [X2], INCX + br.ctop.sptk.few .L22 + } + ;; + .align 32 + +.L25: + { .mmi + (p12) LDFD f48 = [X1], INCX + (p12) LDFD f49 = [X2], INCX + mov ar.lc = ARLC + } + ;; + { .mmi + (p12) LDFD f50 = [X1], INCX + (p12) LDFD f51 = [X2], INCX + mov pr = PR, -65474 + } + ;; + { .mmb + (p12) LDFD f52 = [X1], INCX + (p12) LDFD f53 = [X2], INCX + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFD f54 = [X1], INCX + (p12) LDFD f55 = [X2], INCX + tbit.z p0, p13 = N, 1 + } + ;; + { .mmi + (p13) LDFD f56 = [X1], INCX + (p13) LDFD f57 = [X2], INCX + tbit.z p0, p14 = N, 0 + } + ;; + { .mmi + (p13) LDFD f58 = [X1], INCX + (p13) LDFD f59 = [X2], INCX + } + ;; + { .mmi + (p12) STFD [Y1] = f48 + (p12) STFD [Y2] = f49 + (p12) add Y1 = INCY, Y1 + } + { .mmi + (p14) LDFD f60 = [X1], INCX + (p14) LDFD f61 = [X2], INCX + (p12) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f50 + (p12) STFD [Y2] = f51 + (p12) add Y1 = INCY, Y1 + } + { .mmi + nop __LINE__ + (p12) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f52 + (p12) STFD [Y2] = f53 + (p12) add Y1 = INCY, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p12) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f54 + (p12) STFD [Y2] = f55 + (p12) add Y1 = INCY, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p12) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p13) STFD [Y1] = f56 + (p13) STFD [Y2] = f57 + (p13) add Y1 = INCY, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p13) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p13) STFD [Y1] = f58 + (p13) STFD [Y2] = f59 + (p13) add Y1 = INCY, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p13) add Y2 = INCY, Y2 + } + ;; + { .mmb + (p14) STFD [Y1] = f60 + (p14) STFD [Y2] = f61 + br.ret.sptk.many b0 + } + ;; + EPILOGUE + diff --git a/kernel/ia64/xdot.S b/kernel/ia64/xdot.S new file mode 100644 index 0000000..9322b4b --- /dev/null +++ b/kernel/ia64/xdot.S @@ -0,0 +1,518 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCH_SIZE (4 * 24) + +#ifdef F_INTERFACE +#define N r33 +#define X1 r34 +#define INCX r35 +#define Y1 r36 +#define INCY r37 +#else +#define N r32 +#define X1 r33 +#define INCX r34 +#define Y1 r35 +#define INCY r36 +#endif + +#define PREX1 r2 +#define PREY1 r3 + +#define I r14 +#define J r15 +#define Y2 r16 +#define X2 r17 + +#define INCX4 r24 +#define INCY4 r25 + +#define PR r30 +#define ARLC r31 + + PROLOGUE + .prologue + PROFCODE + { .mfi + nop __LINE__ + mov f8 = f0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mfi + mov r26 = 1 + mov f9 = f0 + nop __LINE__ + } + ;; + .body +#ifdef F_INTERFACE + LDINT N = [N] + LDINT INCX = [INCX] + LDINT INCY = [INCY] + ;; +#ifndef USE64BITINT + sxt4 N = N + sxt4 INCX = INCX + sxt4 INCY = INCY + ;; +#endif + cmp.le p0, p6 = r0, INCX + cmp.le p0, p7 = r0, INCY + sub r26 = r26, N + ;; + setf.sig f32 = r26 + setf.sig f33 = INCX + setf.sig f34 = INCY + ;; + xmpy.l f33 = f32, f33 + xmpy.l f34 = f32, f34 + ;; + getf.sig r26 = f33 + getf.sig r27 = f34 + ;; + shl r26 = r26, ZBASE_SHIFT + shl r27 = r27, ZBASE_SHIFT + ;; + (p6) add X1 = r26, X1 + (p7) add Y1 = r27, Y1 + ;; +#endif + { .mfi + adds PREX1 = (PREFETCH_SIZE + 2) * SIZE, X1 + mov f10 = f0 + mov PR = pr + } + { .mfb + cmp.lt p0, p6 = r0, N + mov f11 = f0 + (p6) br.cond.spnt .L1000 + } + ;; + { .mii + adds PREY1 = (PREFETCH_SIZE + 2) * SIZE, Y1 + shl INCX = INCX, ZBASE_SHIFT + shl INCY = INCY, ZBASE_SHIFT + } + ;; + { .mfi + add X2 = SIZE, X1 + mov f12 = f0 + mov pr.rot= 0 + } + { .mfi + add Y2 = SIZE, Y1 + mov f13 = f0 + shr I = N, 3 + } + ;; + { .mfi + adds I = -1, I + mov f14 = f0 + mov ar.ec= 3 + } + { .mmf + shladd INCX4 = INCX, 2, r0 + shladd INCY4 = INCY, 2, r0 + mov f15 = f0 + } + ;; + { .mmi + and J = 7, N + cmp.eq p16, p0 = r0, r0 + mov ar.lc = I + } + { .mib + cmp.eq p6 ,p0 = -1, I + tbit.nz p12, p0 = N, 2 + (p6) br.cond.dpnt .L215 + } + ;; + .align 32 + +.L212: + { .mmf + (p16) lfetch.nt1 [PREX1], INCX4 + (p16) LDFD f80 = [X1], INCX + (p18) FMA f8 = f34, f82, f8 + } + { .mmf + (p16) LDFD f83 = [X2], INCX + nop __LINE__ + (p18) FMA f9 = f37, f82, f9 + } + ;; + { .mmf + (p16) LDFD f32 = [Y1], INCY + (p16) LDFD f35 = [Y2], INCY + (p18) FMA f10 = f34, f85, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p18) FMA f11 = f37, f85, f11 + } + ;; + { .mmf + (p16) LDFD f86 = [X1], INCX + (p16) LDFD f89 = [X2], INCX + (p18) FMA f12 = f40, f88, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p18) FMA f13 = f43, f88, f13 + } + ;; + { .mmf + (p16) LDFD f38 = [Y1], INCY + (p16) LDFD f41 = [Y2], INCY + (p18) FMA f14 = f40, f91, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p18) FMA f15 = f43, f91, f15 + } + ;; + { .mmf + (p16) LDFD f92 = [X1], INCX + (p16) LDFD f95 = [X2], INCX + (p18) FMA f8 = f46, f94, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p18) FMA f9 = f49, f94, f9 + } + ;; + { .mmf + (p16) lfetch.nt1 [PREY1], INCY4 + (p16) LDFD f44 = [Y1], INCY + (p18) FMA f10 = f46, f97, f10 + } + { .mmf + (p16) LDFD f47 = [Y2], INCY + nop __LINE__ + (p18) FMA f11 = f49, f97, f11 + } + ;; + { .mmf + (p16) LDFD f98 = [X1], INCX + (p16) LDFD f101 = [X2], INCX + (p18) FMA f12 = f52, f100, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p18) FMA f13 = f55, f100, f13 + } + ;; + { .mmf + (p16) LDFD f50 = [Y1], INCY + (p16) LDFD f53 = [Y2], INCY + (p18) FMA f14 = f52, f103, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p18) FMA f15 = f55, f103, f15 + } + ;; + { .mmf + (p16) lfetch.nt1 [PREX1], INCX4 + (p16) LDFD f104 = [X1], INCX + (p18) FMA f8 = f58, f106, f8 + } + { .mmf + (p16) LDFD f107 = [X2], INCX + nop __LINE__ + (p18) FMA f9 = f61, f106, f9 + } + ;; + { .mmf + (p16) LDFD f56 = [Y1], INCY + (p16) LDFD f59 = [Y2], INCY + (p18) FMA f10 = f58, f109, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p18) FMA f11 = f61, f109, f11 + } + ;; + { .mmf + (p16) LDFD f110 = [X1], INCX + (p16) LDFD f113 = [X2], INCX + (p18) FMA f12 = f64, f112, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p18) FMA f13 = f67, f112, f13 + } + ;; + { .mmf + (p16) LDFD f62 = [Y1], INCY + (p16) LDFD f65 = [Y2], INCY + (p18) FMA f14 = f64, f115, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p18) FMA f15 = f67, f115, f15 + } + ;; + { .mmf + (p16) lfetch.nt1 [PREY1], INCY4 + (p16) LDFD f116 = [X1], INCX + (p18) FMA f8 = f70, f118, f8 + } + { .mmf + (p16) LDFD f119 = [X2], INCX + nop __LINE__ + (p18) FMA f9 = f73, f118, f9 + } + ;; + { .mmf + (p16) LDFD f68 = [Y1], INCY + (p16) LDFD f71 = [Y2], INCY + (p18) FMA f10 = f70, f121, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p18) FMA f11 = f73, f121, f11 + } + ;; + { .mmf + (p16) LDFD f122 = [X1], INCX + (p16) LDFD f125 = [X2], INCX + (p18) FMA f12 = f76, f124, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p18) FMA f13 = f79, f124, f13 + } + ;; + { .mmf + (p16) LDFD f74 = [Y1], INCY + (p16) LDFD f77 = [Y2], INCY + (p18) FMA f14 = f76, f127, f14 + } + { .mfb + nop __LINE__ + (p18) FMA f15 = f79, f127, f15 + br.ctop.sptk.few .L212 + } + ;; + .align 32 + +.L215: + { .mmi + (p12) LDFD f48 = [X1], INCX + (p12) LDFD f49 = [X2], INCX + cmp.eq p7, p0 = r0, J + } + ;; + { .mmb + (p12) LDFD f32 = [Y1], INCY + (p12) LDFD f33 = [Y2], INCY + (p7) br.cond.dptk .L999 + } + ;; + { .mmi + (p12) LDFD f50 = [X1], INCX + (p12) LDFD f51 = [X2], INCX + tbit.nz p13, p0 = N, 1 + } + ;; + { .mmi + (p12) LDFD f34 = [Y1], INCY + (p12) LDFD f35 = [Y2], INCY + nop __LINE__ + } + ;; + { .mmi + (p12) LDFD f52 = [X1], INCX + (p12) LDFD f53 = [X2], INCX + tbit.nz p14, p0 = N, 0 + } + ;; + { .mmi + (p12) LDFD f36 = [Y1], INCY + (p12) LDFD f37 = [Y2], INCY + nop __LINE__ + } + ;; + { .mmf + (p12) LDFD f54 = [X1], INCX + (p12) LDFD f55 = [X2], INCX + (p12) FMA f8 = f32, f48, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMA f9 = f33, f48, f9 + } + ;; + { .mmf + (p12) LDFD f38 = [Y1], INCY + (p12) LDFD f39 = [Y2], INCY + (p12) FMA f10 = f32, f49, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMA f11 = f33, f49, f11 + } + ;; + { .mmf + (p13) LDFD f56 = [X1], INCX + (p13) LDFD f57 = [X2], INCX + (p12) FMA f12 = f34, f50, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMA f13 = f35, f50, f13 + } + ;; + { .mmf + (p13) LDFD f40 = [Y1], INCY + (p13) LDFD f41 = [Y2], INCY + (p12) FMA f14 = f34, f51, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMA f15 = f35, f51, f15 + } + ;; + { .mmf + (p13) LDFD f58 = [X1], INCX + (p13) LDFD f59 = [X2], INCX + (p12) FMA f8 = f36, f52, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMA f9 = f37, f52, f9 + } + ;; + { .mmf + (p13) LDFD f42 = [Y1], INCY + (p13) LDFD f43 = [Y2], INCY + (p12) FMA f10 = f36, f53, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMA f11 = f37, f53, f11 + } + ;; + { .mmf + (p14) LDFD f60 = [X1] + (p14) LDFD f61 = [X2] + (p12) FMA f12 = f38, f54, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMA f13 = f39, f54, f13 + } + ;; + { .mmf + (p14) LDFD f44 = [Y1] + (p14) LDFD f45 = [Y2] + (p12) FMA f14 = f38, f55, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMA f15 = f39, f55, f15 + } + ;; + (p13) FMA f8 = f40, f56, f8 + (p13) FMA f9 = f41, f56, f9 + (p13) FMA f10 = f40, f57, f10 + (p13) FMA f11 = f41, f57, f11 + (p13) FMA f12 = f42, f58, f12 + (p13) FMA f13 = f43, f58, f13 + (p13) FMA f14 = f42, f59, f14 + (p13) FMA f15 = f43, f59, f15 + ;; + (p14) FMA f8 = f44, f60, f8 + (p14) FMA f9 = f45, f60, f9 + (p14) FMA f10 = f44, f61, f10 + (p14) FMA f11 = f45, f61, f11 + ;; + .align 32 + +.L999: + FADD f8 = f8, f12 + FADD f9 = f9, f13 + FADD f10 = f10, f14 + FADD f11 = f11, f15 + mov ar.lc = ARLC + ;; +#ifndef CONJ + FSUB f8 = f8, f11 + FADD f9 = f9, f10 +#else + FADD f8 = f8, f11 + FSUB f9 = f9, f10 +#endif + ;; + .align 32 + +.L1000: +#ifdef F_INTERFACE + STFD [r32] = f8, SIZE + ;; + STFD [r32] = f9, SIZE +#endif + mov pr = PR, -65474 + br.ret.sptk.many b0 + EPILOGUE + diff --git a/kernel/ia64/zaxpy.S b/kernel/ia64/zaxpy.S new file mode 100644 index 0000000..c0f14fe --- /dev/null +++ b/kernel/ia64/zaxpy.S @@ -0,0 +1,822 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef XDOUBLE +#define PREFETCH_SIZE ( 8 * 16) +#elif defined(DOUBLE) +#define PREFETCH_SIZE (16 * 16) +#else +#define PREFETCH_SIZE (32 * 16) +#endif + +#ifndef CONJ +#define FMA1 FNMA +#define FMA2 FMA +#else +#define FMA1 FMA +#define FMA2 FNMA +#endif + +#define SP r12 + +#ifdef XDOUBLE +#define N r32 +#define X1 r14 +#define INCX r15 +#define Y1 r16 +#define INCY r17 +#else +#define N r32 +#define X1 r37 +#define INCX r38 +#define Y1 r39 +#define INCY r36 +#endif + +#define PREX1 r2 +#define PREY1 r3 + +#define I r18 +#define J r19 +#define Y2 r20 +#define X2 r21 +#define INCX8 r22 +#define INCY8 r23 +#define YY1 r24 +#define YY2 r25 +#define YY3 r26 +#define YY4 r27 + +#define INCX2M1 loc0 +#define INCY2M1 loc1 +#define INCX4M1 loc2 +#define INCY4M1 loc3 +#define X3 loc4 +#define Y3 loc5 +#define X4 loc6 +#define Y4 loc7 +#define PREX2 loc8 +#define PREY2 loc9 + +#define ARLC r29 +#define PR r30 + +#define ALPHA_R f8 +#define ALPHA_I f9 + + PROLOGUE + .prologue + PROFCODE + + { .mmi + adds r14 = 16, SP + adds r15 = 24, SP + adds r16 = 32, SP + } + { .mmb + adds r17 = 40, SP + cmp.gt p15, p0 = r0, N + (p15) br.ret.sptk.many b0 + } + ;; +#ifdef XDOUBLE + { .mmi + ld8 X1 = [r14] + ld8 INCX = [r15] + nop __LINE__ + } + { .mmi + ld8 Y1 = [r16] + ld8 INCY = [r17] + nop __LINE__ + } + ;; +#else + { .mmi + ld8 INCY = [r14] + nop __LINE__ + nop __LINE__ + } + ;; +#endif + { .mmi + .save ar.pfs, r10 + alloc r10 = ar.pfs, 8, 16, 0, 0 + and J = 7, N + shl INCX = INCX, ZBASE_SHIFT + } + { .mmi + adds PREX1 = (PREFETCH_SIZE + 2) * SIZE, X1 + adds PREY1 = (PREFETCH_SIZE + 2) * SIZE, Y1 + shl INCY = INCY, ZBASE_SHIFT + } + ;; + { .mmi + shladd INCX8 = INCX, 3, r0 + shladd INCY8 = INCY, 3, r0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mmi + adds INCX2M1 = -SIZE, INCX + adds INCY2M1 = -SIZE, INCY + shr I = N, 3 + } + ;; + { .mmi + add INCX2M1 = INCX2M1, INCX + add INCY2M1 = INCY2M1, INCY + mov PR = pr + } + { .mmi + add X2 = X1, INCX + add Y2 = Y1, INCY + nop __LINE__ + } + ;; + { .mmi + shladd INCX4M1 = INCX, 1, INCX2M1 + shladd INCY4M1 = INCY, 1, INCY2M1 + mov pr.rot= 0 + } + { .mmi + shladd X3 = INCX, 1, X1 + shladd Y3 = INCY, 1, Y1 + } + ;; + { .mmi + shladd X4 = INCX, 1, X2 + shladd Y4 = INCY, 1, Y2 + adds I = -1, I + } + { .mmi + cmp.eq p16, p0 = r0, r0 + and r8 = 127, Y1 + and PREX1 = -128, PREX1 + } + ;; + { .mmi + mov YY1 = Y1 + mov YY2 = Y2 + mov ar.ec = 3 + } + { .mmi + mov YY3 = Y3 + mov YY4 = Y4 + or PREX1 = PREX1, r8 + } + ;; + { .mmi + shladd PREX2 = INCX, 2, PREX1 + shladd PREY2 = INCY, 2, PREY1 + mov ar.lc = I + } + { .mib + cmp.eq p11 ,p0 = -1, I + tbit.z p0, p13 = N, 2 + (p11) br.cond.dpnt .L25 + } + ;; + .align 32 + +.L22: +#ifdef XDOUBLE + { .mmf + (p16) LDFD f80 = [Y1], 1 * SIZE + (p16) LDFD f83 = [Y2], 1 * SIZE + (p18) FMA1 f82 = ALPHA_I, f40, f82 + } + { .mmf + (p16) LDFD f92 = [Y3], 1 * SIZE + (p16) LDFD f95 = [Y4], 1 * SIZE + (p18) FMA1 f85 = ALPHA_I, f43, f85 + } + ;; + { .mmf + (p16) LDFD f86 = [Y1], INCY4M1 + (p16) LDFD f89 = [Y2], INCY4M1 + (p18) FMA1 f94 = ALPHA_I, f52, f94 + } + { .mmf + (p16) LDFD f98 = [Y3], INCY4M1 + (p16) LDFD f101 = [Y4], INCY4M1 + (p18) FMA1 f97 = ALPHA_I, f55, f97 + } + ;; + { .mmf + (p16) LDFD f32 = [X1], 1 * SIZE + (p16) LDFD f35 = [X2], 1 * SIZE + (p18) FMA f88 = ALPHA_I, f34, f88 + } + { .mmf + (p16) LDFD f44 = [X3], 1 * SIZE + (p16) LDFD f47 = [X4], 1 * SIZE + (p18) FMA f91 = ALPHA_I, f37, f91 + } + ;; + { .mmf + (p16) LDFD f38 = [X1], INCX4M1 + (p16) LDFD f41 = [X2], INCX4M1 + (p18) FMA f100 = ALPHA_I, f46, f100 + } + { .mmf + (p16) LDFD f50 = [X3], INCX4M1 + (p16) LDFD f53 = [X4], INCX4M1 + (p18) FMA f103 = ALPHA_I, f49, f103 + } + ;; + { .mmf + (p18) STFD [YY1] = f82, 1 * SIZE + (p18) STFD [YY2] = f85, 1 * SIZE + (p18) FMA f106 = ALPHA_R, f58, f106 + } + { .mmf + (p19) add YY3 = YY3, INCY4M1 + (p19) add YY4 = YY4, INCY4M1 + (p18) FMA f109 = ALPHA_R, f61, f109 + } + ;; + { .mmf + (p18) STFD [YY3] = f94, 1 * SIZE + (p18) STFD [YY4] = f97, 1 * SIZE + (p18) FMA f118 = ALPHA_R, f70, f118 + } + { .mmf + (p16) lfetch.excl.nt1 [PREY1], INCY8 + (p16) lfetch.excl.nt1 [PREY2], INCY8 + (p18) FMA f121 = ALPHA_R, f73, f121 + } + ;; + { .mmf + (p18) STFD [YY1] = f88 + (p18) STFD [YY2] = f91 + (p18) FMA2 f112 = ALPHA_R, f64, f112 + } + { .mmf + (p18) add YY1 = YY1, INCY4M1 + (p18) add YY2 = YY2, INCY4M1 + (p18) FMA2 f115 = ALPHA_R, f67, f115 + } + ;; + { .mmf + (p18) STFD [YY3] = f100 + (p18) STFD [YY4] = f103 + (p18) FMA2 f124 = ALPHA_R, f76, f124 + } + { .mmf + (p18) add YY3 = YY3, INCY4M1 + (p18) add YY4 = YY4, INCY4M1 + (p18) FMA2 f127 = ALPHA_R, f79, f127 + } + ;; + { .mmf + (p16) LDFD f104 = [Y1], 1 * SIZE + (p16) LDFD f107 = [Y2], 1 * SIZE + (p18) FMA1 f106 = ALPHA_I, f64, f106 + } + { .mmf + (p16) LDFD f116 = [Y3], 1 * SIZE + (p16) LDFD f119 = [Y4], 1 * SIZE + (p18) FMA1 f109 = ALPHA_I, f67, f109 + } + ;; + { .mmf + (p16) LDFD f110 = [Y1], INCY4M1 + (p16) LDFD f113 = [Y2], INCY4M1 + (p18) FMA1 f118 = ALPHA_I, f76, f118 + } + { .mmf + (p16) LDFD f122 = [Y3], INCY4M1 + (p16) LDFD f125 = [Y4], INCY4M1 + (p18) FMA1 f121 = ALPHA_I, f79, f121 + } + ;; + { .mmf + (p16) LDFD f56 = [X1], 1 * SIZE + (p16) LDFD f59 = [X2], 1 * SIZE + (p18) FMA f112 = ALPHA_I, f58, f112 + } + { .mmf + (p16) LDFD f68 = [X3], 1 * SIZE + (p16) LDFD f71 = [X4], 1 * SIZE + (p18) FMA f115 = ALPHA_I, f61, f115 + } + ;; + { .mmf + (p16) LDFD f62 = [X1], INCX4M1 + (p16) LDFD f65 = [X2], INCX4M1 + (p18) FMA f124 = ALPHA_I, f70, f124 + } + { .mmf + (p16) LDFD f74 = [X3], INCX4M1 + (p16) LDFD f77 = [X4], INCX4M1 + (p18) FMA f127 = ALPHA_I, f73, f127 + } + ;; + { .mmf + (p18) STFD [YY1] = f106, 1 * SIZE + (p18) STFD [YY2] = f109, 1 * SIZE + (p17) FMA f81 = ALPHA_R, f33, f81 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f84 = ALPHA_R, f36, f84 + } + ;; + { .mmf + (p18) STFD [YY3] = f118, 1 * SIZE + (p18) STFD [YY4] = f121, 1 * SIZE + (p17) FMA f93 = ALPHA_R, f45, f93 + } + { .mmf + (p16) lfetch.nt1 [PREX1], INCX8 + (p16) lfetch.nt1 [PREX2], INCX8 + (p17) FMA f96 = ALPHA_R, f48, f96 + } + ;; + { .mmf + (p18) STFD [YY1] = f112 + (p18) STFD [YY2] = f115 + (p17) FMA2 f87 = ALPHA_R, f39, f87 + } + { .mmf + (p18) add YY1 = YY1, INCY4M1 + (p18) add YY2 = YY2, INCY4M1 + (p17) FMA2 f90 = ALPHA_R, f42, f90 + } + ;; + { .mmf + (p18) STFD [YY3] = f124 + (p18) STFD [YY4] = f127 + (p17) FMA2 f99 = ALPHA_R, f51, f99 + } + { .mfb + nop __LINE__ + (p17) FMA2 f102 = ALPHA_R, f54, f102 + br.ctop.sptk.few .L22 + } + ;; + ;; + (p19) add YY3 = YY3, INCY4M1 + (p19) add YY4 = YY4, INCY4M1 + ;; +#else + { .mmf + (p19) STFD [YY3] = f125 + (p19) STFD [YY4] = f32 + (p18) FMA2 f100 = ALPHA_R, f52, f100 + } + { .mmf + (p16) lfetch.excl.nt1 [PREY1], INCY8 + nop __LINE__ + (p18) FMA2 f103 = ALPHA_R, f55, f103 + } + ;; + { .mmf + (p16) LDFD f80 = [Y1], 1 * SIZE + (p16) LDFD f83 = [Y2], 1 * SIZE + (p18) FMA1 f82 = ALPHA_I, f40, f82 + } + { .mmf + (p16) LDFD f92 = [Y3], 1 * SIZE + (p16) LDFD f95 = [Y4], 1 * SIZE + (p18) FMA1 f85 = ALPHA_I, f43, f85 + } + ;; + { .mmf + (p16) LDFD f86 = [Y1], INCY4M1 + (p16) LDFD f89 = [Y2], INCY4M1 + (p18) FMA1 f94 = ALPHA_I, f52, f94 + } + { .mmf + (p19) add YY3 = YY3, INCY4M1 + (p19) add YY4 = YY4, INCY4M1 + (p18) FMA1 f97 = ALPHA_I, f55, f97 + } + ;; + { .mmf + (p16) LDFD f98 = [Y3], INCY4M1 + (p16) LDFD f101 = [Y4], INCY4M1 + (p18) FMA f88 = ALPHA_I, f34, f88 + } + { .mmf + (p19) add YY1 = YY1, INCY4M1 + (p19) add YY2 = YY2, INCY4M1 + (p18) FMA f91 = ALPHA_I, f37, f91 + } + ;; + { .mmf + (p16) LDFD f32 = [X1], 1 * SIZE + (p16) LDFD f35 = [X2], 1 * SIZE + (p18) FMA f100 = ALPHA_I, f46, f100 + } + { .mmf + (p16) LDFD f44 = [X3], 1 * SIZE + (p16) LDFD f47 = [X4], 1 * SIZE + (p18) FMA f103 = ALPHA_I, f49, f103 + } + ;; + { .mmf + (p18) STFD [YY1] = f82, 1 * SIZE + (p18) STFD [YY2] = f85, 1 * SIZE + (p18) FMA f106 = ALPHA_R, f58, f106 + } + { .mmf + (p16) LDFD f38 = [X1], INCX4M1 + (p16) LDFD f41 = [X2], INCX4M1 + (p18) FMA f109 = ALPHA_R, f61, f109 + } + ;; + { .mmf + (p18) STFD [YY3] = f94, 1 * SIZE + (p18) STFD [YY4] = f97, 1 * SIZE + (p18) FMA f118 = ALPHA_R, f70, f118 + } + { .mmf + (p16) LDFD f50 = [X3], INCX4M1 + (p16) LDFD f53 = [X4], INCX4M1 + (p18) FMA f121 = ALPHA_R, f73, f121 + } + ;; + { .mmf + (p18) STFD [YY1] = f88 + (p18) STFD [YY2] = f91 + (p18) FMA2 f112 = ALPHA_R, f64, f112 + } + { .mmf + (p16) lfetch.nt1 [PREX1], INCX8 + nop __LINE__ + (p18) FMA2 f115 = ALPHA_R, f67, f115 + } + ;; + { .mmf + (p18) STFD [YY3] = f100 + (p18) STFD [YY4] = f103 + (p18) FMA2 f124 = ALPHA_R, f76, f124 + } + { .mmf + (p16) LDFD f104 = [Y1], 1 * SIZE + (p16) LDFD f107 = [Y2], 1 * SIZE + (p18) FMA2 f127 = ALPHA_R, f79, f127 + } + ;; + { .mmf + (p16) LDFD f116 = [Y3], 1 * SIZE + (p16) LDFD f119 = [Y4], 1 * SIZE + (p18) FMA1 f106 = ALPHA_I, f64, f106 + } + { .mmf + (p18) add YY1 = YY1, INCY4M1 + (p18) add YY2 = YY2, INCY4M1 + (p18) FMA1 f109 = ALPHA_I, f67, f109 + } + ;; + { .mmf + (p16) LDFD f110 = [Y1], INCY4M1 + (p16) LDFD f113 = [Y2], INCY4M1 + (p18) FMA1 f118 = ALPHA_I, f76, f118 + } + { .mmf + (p18) add YY3 = YY3, INCY4M1 + (p18) add YY4 = YY4, INCY4M1 + (p18) FMA1 f121 = ALPHA_I, f79, f121 + } + ;; + { .mmf + (p16) LDFD f122 = [Y3], INCY4M1 + (p16) LDFD f125 = [Y4], INCY4M1 + (p18) FMA f112 = ALPHA_I, f58, f112 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p18) FMA f115 = ALPHA_I, f61, f115 + } + ;; + { .mmf + (p16) LDFD f56 = [X1], 1 * SIZE + (p16) LDFD f59 = [X2], 1 * SIZE + (p18) FMA f124 = ALPHA_I, f70, f124 + } + { .mmf + (p16) LDFD f68 = [X3], 1 * SIZE + (p16) LDFD f71 = [X4], 1 * SIZE + (p18) FMA f127 = ALPHA_I, f73, f127 + } + ;; + { .mmf + (p18) STFD [YY1] = f106, 1 * SIZE + (p18) STFD [YY2] = f109, 1 * SIZE + (p17) FMA f81 = ALPHA_R, f33, f81 + } + { .mmf + (p16) LDFD f62 = [X1], INCX4M1 + (p16) LDFD f65 = [X2], INCX4M1 + (p17) FMA f84 = ALPHA_R, f36, f84 + } + ;; + { .mmf + (p18) STFD [YY3] = f118, 1 * SIZE + (p18) STFD [YY4] = f121, 1 * SIZE + (p17) FMA f93 = ALPHA_R, f45, f93 + } + { .mmf + (p16) LDFD f74 = [X3], INCX4M1 + (p16) LDFD f77 = [X4], INCX4M1 + (p17) FMA f96 = ALPHA_R, f48, f96 + } + ;; + { .mmf + (p18) STFD [YY1] = f112 + (p18) STFD [YY2] = f115 + (p17) FMA2 f87 = ALPHA_R, f39, f87 + } + { .mfb + nop __LINE__ + (p17) FMA2 f90 = ALPHA_R, f42, f90 + br.ctop.sptk.few .L22 + } + ;; + { .mmi + (p19) STFD [YY3] = f125 + (p19) STFD [YY4] = f32 + (p19) add YY1 = YY1, INCY4M1 + } + { .mmi + (p19) add YY2 = YY2, INCY4M1 + (p19) add YY3 = YY3, INCY4M1 + (p19) add YY4 = YY4, INCY4M1 + } + ;; +#endif + .align 32 + +.L25: + { .mmi + (p13) LDFD f32 = [X1], 1 * SIZE + (p13) LDFD f34 = [X2], 1 * SIZE + mov ar.lc = ARLC + } + { .mmi + (p13) LDFD f36 = [X3], 1 * SIZE + (p13) LDFD f38 = [X4], 1 * SIZE + cmp.eq p12, p0 = r0, J + } + ;; + { .mmi + (p13) LDFD f80 = [Y1], 1 * SIZE + (p13) LDFD f82 = [Y2], 1 * SIZE + mov pr = PR, -65474 + } + { .mmb + (p13) LDFD f84 = [Y3], 1 * SIZE + (p13) LDFD f86 = [Y4], 1 * SIZE + (p12) br.ret.sptk.many b0 + } + ;; + { .mmi + (p13) LDFD f33 = [X1], INCX4M1 + (p13) LDFD f35 = [X2], INCX4M1 + tbit.z p0, p14 = N, 1 + } + { .mmi + (p13) LDFD f81 = [Y1], INCY4M1 + (p13) LDFD f83 = [Y2], INCY4M1 + nop __LINE__ + } + ;; + { .mmi + (p13) LDFD f37 = [X3], INCX4M1 + (p13) LDFD f39 = [X4], INCX4M1 + tbit.z p0, p15 = N, 0 + } + { .mmi + (p13) LDFD f85 = [Y3], INCY4M1 + (p13) LDFD f87 = [Y4], INCY4M1 + nop __LINE__ + } + ;; + { .mmf + (p14) LDFD f40 = [X1], 1 * SIZE + (p14) LDFD f42 = [X2], 1 * SIZE + } + ;; + { .mmf + (p14) LDFD f88 = [Y1], 1 * SIZE + (p14) LDFD f90 = [Y2], 1 * SIZE + } + ;; + { .mmf + (p14) LDFD f41 = [X1], INCX2M1 + (p14) LDFD f43 = [X2], INCX2M1 + (p13) FMA f80 = ALPHA_R, f32, f80 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f82 = ALPHA_R, f34, f82 + } + ;; + { .mmf + (p14) LDFD f89 = [Y1], INCY2M1 + (p14) LDFD f91 = [Y2], INCY2M1 + (p13) FMA f84 = ALPHA_R, f36, f84 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f86 = ALPHA_R, f38, f86 + } + ;; + { .mmf + (p15) LDFD f44 = [X1], 1 * SIZE + (p15) LDFD f92 = [Y1], 1 * SIZE + (p13) FMA2 f81 = ALPHA_R, f33, f81 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA2 f83 = ALPHA_R, f35, f83 + } + ;; + { .mmf + (p15) LDFD f45 = [X1] + (p15) LDFD f93 = [Y1] + (p13) FMA2 f85 = ALPHA_R, f37, f85 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA2 f87 = ALPHA_R, f39, f87 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA1 f80 = ALPHA_I, f33, f80 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA1 f82 = ALPHA_I, f35, f82 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA1 f84 = ALPHA_I, f37, f84 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA1 f86 = ALPHA_I, f39, f86 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f81 = ALPHA_I, f32, f81 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f83 = ALPHA_I, f34, f83 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f85 = ALPHA_I, f36, f85 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f87 = ALPHA_I, f38, f87 + } + ;; + { .mmf + (p13) STFD [YY1] = f80, 1 * SIZE + (p13) STFD [YY2] = f82, 1 * SIZE + (p14) FMA f88 = ALPHA_R, f40, f88 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p14) FMA f90 = ALPHA_R, f42, f90 + } + ;; + { .mmf + (p13) STFD [YY3] = f84, 1 * SIZE + (p13) STFD [YY4] = f86, 1 * SIZE + (p14) FMA2 f89 = ALPHA_R, f41, f89 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p14) FMA2 f91 = ALPHA_R, f43, f91 + } + ;; + { .mmf + (p13) STFD [YY1] = f81 + (p13) STFD [YY2] = f83 + (p15) FMA f92 = ALPHA_R, f44, f92 + } + { .mmf + (p13) add YY1 = YY1, INCY4M1 + (p13) add YY2 = YY2, INCY4M1 + (p15) FMA2 f93 = ALPHA_R, f45, f93 + } + ;; + { .mmf + (p13) STFD [YY3] = f85 + (p13) STFD [YY4] = f87 + (p14) FMA1 f88 = ALPHA_I, f41, f88 + } + { .mmf + (p13) add YY3 = YY3, INCY4M1 + (p13) add YY4 = YY4, INCY4M1 + (p14) FMA1 f90 = ALPHA_I, f43, f90 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p14) FMA f89 = ALPHA_I, f40, f89 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p14) FMA f91 = ALPHA_I, f42, f91 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p15) FMA1 f92 = ALPHA_I, f45, f92 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p15) FMA f93 = ALPHA_I, f44, f93 + } + ;; + { .mmi + (p14) STFD [YY1] = f88, 1 * SIZE + (p14) STFD [YY2] = f90, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p14) STFD [YY1] = f89 + (p14) STFD [YY2] = f91 + (p14) add YY1 = YY1, INCY2M1 + } + ;; + { .mmi + (p15) STFD [YY1] = f92, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmb + (p15) STFD [YY1] = f93 + nop __LINE__ + br.ret.sptk.many b0 + } + ;; + EPILOGUE diff --git a/kernel/ia64/zcopy.S b/kernel/ia64/zcopy.S new file mode 100644 index 0000000..91d90e0 --- /dev/null +++ b/kernel/ia64/zcopy.S @@ -0,0 +1,1378 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r32 +#define X1 r33 +#define INCX r34 +#define Y1 r35 +#define INCY r36 + +#define PREA r2 +#define PREB r3 + +#define I r14 +#define J r15 + +#define X2 r16 +#define Y2 r17 +#define INCXM1 r20 +#define INCYM1 r21 +#define INCX3M1 r22 +#define INCY3M1 r23 +#define INCX8 r24 +#define INCY8 r25 +#define XX r26 +#define YY r27 +#define XA r28 +#define YA r29 +#define PR r30 +#define ARLC r31 + +#ifdef DOUBLE +#define PREFETCH_SIZE (6 * 32) +#else +#define PREFETCH_SIZE (8 * 64) +#endif + + PROLOGUE + .prologue + PROFCODE + + { .mmi + shladd INCX = INCX, ZBASE_SHIFT, r0 + shladd INCY = INCY, ZBASE_SHIFT, r0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mib + cmp.lt p0, p6 = r0, N + sub XA = Y1, X1 + (p6) br.ret.sptk.many b0 + } + ;; + .body + { .mmi + shladd INCX3M1 = INCX, 1, INCX + shladd INCY3M1 = INCY, 1, INCY + mov PR = pr + } + { .mmi + adds INCXM1 = - SIZE, INCX + adds INCYM1 = - SIZE, INCY + shr.u XA = XA, BASE_SHIFT + } + ;; + { .mmi +#ifdef DOUBLE + adds XA = 4, XA +#else + adds XA = -2, XA +#endif + and J = 7, N + mov pr.rot = 0 + } + { .mmi + adds INCX3M1 = - SIZE, INCX3M1 + adds INCY3M1 = - SIZE, INCY3M1 + shr I = N, 3 + } + ;; + { .mmi +#ifdef DOUBLE + and XA = 31, XA +#else + and XA = 63, XA +#endif + cmp.eq p9, p0 = r0, J + tbit.z p0, p7 = X1, BASE_SHIFT + } + { .mmi + shladd X2 = INCX, 1, X1 + shladd Y2 = INCY, 1, Y1 + tbit.z p0, p12 = N, 2 + } + ;; + { .mmi + cmp.eq p8 ,p0 = r0, I + adds I = -1, I +#ifdef DOUBLE + cmp.le p11, p0 = 15, XA +#else + cmp.ge p11, p0 = 31, XA +#endif + } + { .mmb + shladd INCX8 = INCX, 3, r0 + shladd INCY8 = INCY, 3, r0 + (p8) br.cond.dpnt .L25 + } + ;; + { .mmi + nop.m 0 + nop.m 0 + mov ar.lc = I + } + { .mbb + (p7) br.cond.dpnt .L100 + (p11) br.cond.dpnt .L30 + } + ;; + { .mmi + cmp.eq p16, p0 = r0, r0 + nop.m 0 + mov ar.ec = 5 + } + { .mmi + adds PREA = PREFETCH_SIZE * SIZE + 32, X1 +#ifndef DOUBLE + adds PREB = PREFETCH_SIZE * SIZE + 0, Y1 +#else + adds PREB = PREFETCH_SIZE * SIZE - 40, Y1 +#endif + nop.i 0 + } + ;; + .align 32 + +.L21: + { .mmi + (p21) STFD [Y1] = f42 + (p21) STFD [Y2] = f62 + (p21) add Y1 = INCYM1, Y1 + } + { .mmi + (p16) LDFPD f32, f37 = [X1] + (p16) add X1 = X1, INCX + (p21) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f47, 1 * SIZE + (p21) STFD [Y2] = f67, 1 * SIZE + } + { .mmi + (p16) lfetch.nt1 [PREA], INCX8 + (p16) LDFPD f42, f47 = [X1] + (p16) add X1 = X1, INCX + } + ;; + { .mmi + (p21) STFD [Y1] = f52 + (p21) STFD [Y2] = f72 + (p21) add Y1 = INCY3M1, Y1 + } + { .mmi + (p16) LDFPD f52, f57 = [X1] + (p16) add X1 = X1, INCX + (p21) add Y2 = INCY3M1, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f77, 1 * SIZE + (p21) STFD [Y2] = f97, 1 * SIZE + } + { .mmi + (p16) lfetch.excl.nt1 [PREB], INCY8 + (p16) LDFPD f62, f67 = [X1] + (p16) add X1 = X1, INCX + } + ;; + { .mmi + (p21) STFD [Y1] = f82 + (p21) STFD [Y2] = f102 + (p21) add Y1 = INCYM1, Y1 + } + { .mmi + (p16) LDFPD f72, f77 = [X1] + (p16) add X1 = X1, INCX + (p21) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f87, 1 * SIZE + (p21) STFD [Y2] = f107, 1 * SIZE + } + { .mmi + (p16) LDFPD f82, f87 = [X1] + (p16) add X1 = X1, INCX + } + ;; + { .mmi + (p21) STFD [Y1] = f92 + (p21) STFD [Y2] = f112 + (p21) add Y1 = INCY3M1, Y1 + } + { .mmi + (p16) LDFPD f92, f97 = [X1] + (p16) add X1 = X1, INCX + (p21) add Y2 = INCY3M1, Y2 + } + ;; + { .mmi + (p20) STFD [Y1] = f36, 1 * SIZE + (p20) STFD [Y2] = f56, 1 * SIZE + (p16) shladd X2 = INCX, 3, X2 + } + { .mmb + (p16) LDFPD f102, f107 = [X1] + (p16) add X1 = X1, INCX + br.ctop.sptk.few .L21 + } + ;; + + { .mmi + (p21) STFD [Y1] = f42 + (p21) STFD [Y2] = f62 + (p21) add Y1 = INCYM1, Y1 + } + { .mmi + (p21) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f47, 1 * SIZE + (p21) STFD [Y2] = f67, 1 * SIZE + } + ;; + { .mmi + (p21) STFD [Y1] = f52 + (p21) STFD [Y2] = f72 + (p21) add Y1 = INCY3M1, Y1 + } + { .mmi + (p21) add Y2 = INCY3M1, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f77, 1 * SIZE + (p21) STFD [Y2] = f97, 1 * SIZE + } + ;; + { .mmi + (p21) STFD [Y1] = f82 + (p21) STFD [Y2] = f102 + (p21) add Y1 = INCYM1, Y1 + } + { .mmi + (p21) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f87, 1 * SIZE + (p21) STFD [Y2] = f107, 1 * SIZE + } + ;; + { .mmi + (p21) STFD [Y1] = f92 + (p21) STFD [Y2] = f112 + (p21) add Y1 = INCY3M1, Y1 + } + { .mmi + (p21) add Y2 = INCY3M1, Y2 + } + ;; + .align 32 + +.L25: + { .mmi + mov XX = X1 + nop.m 0 + mov ar.lc = ARLC + } + { .mmi + (p12) LDFD f48 = [X1], 1 * SIZE + (p12) LDFD f52 = [X2], 1 * SIZE + tbit.z p0, p13 = N, 1 + } + ;; + { .mmi + (p12) LDFD f49 = [X1], INCXM1 + (p12) LDFD f53 = [X2], INCXM1 + mov pr = PR, -65474 + } + { .mib + nop.m 0 + tbit.z p0, p14 = N, 0 + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFD f50 = [X1], 1 * SIZE + (p12) LDFD f54 = [X2], 1 * SIZE + (p12) shladd XX = INCX, 2, XX;; + } + ;; + { .mmi + (p12) LDFD f51 = [X1], INCX3M1 + (p12) LDFD f55 = [X2], INCX3M1 + (p13) shladd XX = INCX, 1, XX;; + } + ;; + { .mmi + (p13) LDFD f56 = [X1], 1 * SIZE + (p14) LDFD f60 = [XX], 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f57 = [X1], INCXM1 + (p14) LDFD f61 = [XX] + mov YY = Y1 + } + ;; + { .mmi + (p12) STFD [Y1] = f48, 1 * SIZE + (p12) STFD [Y2] = f52, 1 * SIZE + } + { .mmi + (p13) LDFD f58 = [X1], 1 * SIZE + } + ;; + { .mmi + (p12) STFD [Y1] = f49 + (p12) STFD [Y2] = f53 + (p12) add Y1 = INCYM1, Y1 + } + { .mmi + (p13) LDFD f59 = [X1] + (p12) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f50, 1 * SIZE + (p12) STFD [Y2] = f54, 1 * SIZE + (p12) shladd YY = INCY, 2, YY;; + } + ;; + { .mmi + (p12) STFD [Y1] = f51 + (p12) STFD [Y2] = f55 + (p13) shladd YY = INCY, 1, YY + } + { .mmi + (p12) add Y1 = INCY3M1, Y1 + (p12) add Y2 = INCY3M1, Y2 + nop.i 0 + } + ;; + { .mmi + (p13) STFD [Y1] = f56, 1 * SIZE + (p14) STFD [YY] = f60, 1 * SIZE + } + ;; + { .mmi + (p13) STFD [Y1] = f57 + (p14) STFD [YY] = f61 + (p13) add Y1 = INCYM1, Y1 + } + ;; + { .mmi + (p13) STFD [Y1] = f58, 1 * SIZE + nop.m 0 + nop.i 0 + } + ;; + { .mib + (p13) STFD [Y1] = f59 + nop.i 0 + br.ret.sptk.many b0 + } + ;; + .align 32 + +.L30: + { .mmi + cmp.eq p16, p0 = r0, r0 + nop.m 0 + mov ar.ec = 5 + } + { .mmi +#ifndef DOUBLE + adds PREA = PREFETCH_SIZE * SIZE + 24, X1 + adds PREB = PREFETCH_SIZE * SIZE + 40, Y1 +#else + adds PREA = PREFETCH_SIZE * SIZE - 56, X1 + adds PREB = PREFETCH_SIZE * SIZE - 24, Y1 +#endif + nop.i 0 + } + ;; + .align 32 + +#ifndef DOUBLE +.L31: + { .mmi + (p20) STFD [Y1] = f91 + (p20) STFD [Y2] = f111 + (p20) add Y1 = INCY3M1, Y1 + } + { .mmi + (p16) LDFPD f32, f37 = [X1] + (p16) add X1 = X1, INCX + (p20) add Y2 = INCY3M1, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f35, 1 * SIZE + (p19) STFD [Y2] = f55, 1 * SIZE + } + { .mmi + (p16) lfetch.nt1 [PREA], INCX8 + (p16) LDFPD f42, f47 = [X1] + (p16) add X1 = X1, INCX + } + ;; + { .mmi + (p19) STFD [Y1] = f40 + (p19) STFD [Y2] = f60 + (p19) add Y1 = INCYM1, Y1 + } + { .mmi + (p16) LDFPD f52, f57 = [X1] + (p16) add X1 = X1, INCX + (p19) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f45, 1 * SIZE + (p19) STFD [Y2] = f65, 1 * SIZE + } + { .mmi + (p16) lfetch.excl.nt1 [PREB], INCY8 + (p16) LDFPD f62, f67 = [X1] + (p16) add X1 = X1, INCX + } + ;; + { .mmi + (p19) STFD [Y1] = f50 + (p19) STFD [Y2] = f70 + (p19) add Y1 = INCY3M1, Y1 + } + { .mmi + (p16) LDFPD f72, f77 = [X1] + (p16) add X1 = X1, INCX + (p19) add Y2 = INCY3M1, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f75, 1 * SIZE + (p19) STFD [Y2] = f95, 1 * SIZE + } + { .mmi + (p16) LDFPD f82, f87 = [X1] + (p16) add X1 = X1, INCX + } + ;; + { .mmi + (p19) STFD [Y1] = f80 + (p19) STFD [Y2] = f100 + (p19) add Y1 = INCYM1, Y1 + } + { .mmi + (p16) LDFPD f92, f97 = [X1] + (p16) add X1 = X1, INCX + (p19) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f85, 1 * SIZE + (p19) STFD [Y2] = f105, 1 * SIZE + (p16) shladd X2 = INCX, 3, X2 + } + { .mmb + (p16) LDFPD f102, f107 = [X1] + (p16) add X1 = X1, INCX + br.ctop.sptk.few .L31 + } + ;; + br .L25 + .align 32 + +#else +.L31: + { .mmi + (p20) STFD [Y1] = f41 + (p20) STFD [Y2] = f61 + (p20) add Y1 = INCYM1, Y1 + } + { .mmi + (p16) LDFPD f32, f37 = [X1] + (p16) add X1 = X1, INCX + (p20) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p20) STFD [Y1] = f46, 1 * SIZE + (p20) STFD [Y2] = f66, 1 * SIZE + } + { .mmi + (p16) lfetch.nt1 [PREA], INCX8 + (p16) LDFPD f42, f47 = [X1] + (p16) add X1 = X1, INCX + } + ;; + { .mmi + (p20) STFD [Y1] = f51 + (p20) STFD [Y2] = f71 + (p20) add Y1 = INCY3M1, Y1 + } + { .mmi + (p16) LDFPD f52, f57 = [X1] + (p16) add X1 = X1, INCX + (p20) add Y2 = INCY3M1, Y2 + } + ;; + { .mmi + (p20) STFD [Y1] = f76, 1 * SIZE + (p20) STFD [Y2] = f96, 1 * SIZE + } + { .mmi + (p16) lfetch.excl.nt1 [PREB], INCY8 + (p16) LDFPD f62, f67 = [X1] + (p16) add X1 = X1, INCX + } + ;; + { .mmi + (p20) STFD [Y1] = f81 + (p20) STFD [Y2] = f101 + (p20) add Y1 = INCYM1, Y1 + } + { .mmi + (p16) LDFPD f72, f77 = [X1] + (p16) add X1 = X1, INCX + (p20) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p20) STFD [Y1] = f86, 1 * SIZE + (p20) STFD [Y2] = f106, 1 * SIZE + } + { .mmi + (p16) LDFPD f82, f87 = [X1] + (p16) add X1 = X1, INCX + } + ;; + { .mmi + (p20) STFD [Y1] = f91 + (p20) STFD [Y2] = f111 + (p20) add Y1 = INCY3M1, Y1 + } + { .mmi + (p16) LDFPD f92, f97 = [X1] + (p16) add X1 = X1, INCX + (p20) add Y2 = INCY3M1, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f35, 1 * SIZE + (p19) STFD [Y2] = f55, 1 * SIZE + (p16) shladd X2 = INCX, 3, X2 + } + { .mmb + (p16) LDFPD f102, f107 = [X1] + (p16) add X1 = X1, INCX + br.ctop.sptk.few .L31 + } + ;; + br .L25 + .align 32 +#endif + +.L100: + { .mmi + mov ar.lc = I + } + { .mbb + cmp.ne p6, p0 = 2 * SIZE, INCX + (p6) br.cond.dpnt .L200 + (p11) br.cond.dpnt .L130 + } + ;; + { .mmi + adds PREA = PREFETCH_SIZE * SIZE + 32, X1 +#ifndef DOUBLE + adds PREB = PREFETCH_SIZE * SIZE - 32, Y1 +#else + adds PREB = PREFETCH_SIZE * SIZE + 72, Y1 +#endif + mov ar.ec = 5 + } + { .mmi + LDFD f32 = [X1], 1 * SIZE + cmp.eq p16, p0 = r0, r0 + nop.i 0 + } + ;; + .align 32 + +.L121: + { .mmi + (p21) STFD [Y1] = f47, 1 * SIZE + (p21) STFD [Y2] = f67, 1 * SIZE + } + { .mmi + (p16) lfetch.nt1 [PREA], INCX8 + (p16) LDFPD f37, f42 = [X1], 2 * SIZE + } + ;; + { .mmi + (p21) STFD [Y1] = f52 + (p21) STFD [Y2] = f72 + (p21) add Y1 = INCY3M1, Y1 + } + { .mmi + (p16) lfetch.excl.nt1 [PREB], INCY8 + (p16) LDFPD f47, f52 = [X1], 2 * SIZE + (p21) add Y2 = INCY3M1, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f77, 1 * SIZE + (p21) STFD [Y2] = f97, 1 * SIZE + } + { .mmi + (p16) LDFPD f57, f62 = [X1], 2 * SIZE + } + ;; + { .mmi + (p21) STFD [Y1] = f82 + (p21) STFD [Y2] = f102 + (p21) add Y1 = INCYM1, Y1 + } + { .mmi + (p16) LDFPD f67, f72 = [X1], 2 * SIZE + (p21) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f87, 1 * SIZE + (p21) STFD [Y2] = f107, 1 * SIZE + } + { .mmi + (p16) LDFPD f77, f82 = [X1], 2 * SIZE + } + ;; + { .mmi + (p21) STFD [Y1] = f92 + (p21) STFD [Y2] = f113 + (p21) add Y1 = INCY3M1, Y1 + } + { .mmi + (p16) LDFPD f87, f92 = [X1], 2 * SIZE + (p21) add Y2 = INCY3M1, Y2 + } + ;; + { .mmi + (p20) STFD [Y1] = f36, 1 * SIZE + (p20) STFD [Y2] = f56, 1 * SIZE + } + { .mmi + (p16) LDFPD f97, f102 = [X1], 2 * SIZE + (p16) shladd X2 = INCX, 3, X2 + } + ;; + { .mmi + (p20) STFD [Y1] = f41 + (p20) STFD [Y2] = f61 + (p20) add Y1 = INCYM1, Y1 + } + { .mmb + (p16) LDFPD f108, f127 = [X1], 2 * SIZE + (p20) add Y2 = INCYM1, Y2 + br.ctop.sptk.few .L121 + } + ;; + { .mmi + (p21) STFD [Y1] = f47, 1 * SIZE + (p21) STFD [Y2] = f67, 1 * SIZE + } + ;; + { .mmi + (p21) STFD [Y1] = f52 + (p21) STFD [Y2] = f72 + (p21) add Y1 = INCY3M1, Y1 + } + (p21) add Y2 = INCY3M1, Y2 + ;; + { .mmi + (p21) STFD [Y1] = f77, 1 * SIZE + (p21) STFD [Y2] = f97, 1 * SIZE + } + ;; + { .mmi + (p21) STFD [Y1] = f82 + (p21) STFD [Y2] = f102 + (p21) add Y1 = INCYM1, Y1 + } + (p21) add Y2 = INCYM1, Y2 + ;; + { .mmi + (p21) STFD [Y1] = f87, 1 * SIZE + (p21) STFD [Y2] = f107, 1 * SIZE + } + ;; + { .mmi + (p21) STFD [Y1] = f92 + (p21) STFD [Y2] = f113 + (p21) add Y1 = INCY3M1, Y1 + } + (p21) add Y2 = INCY3M1, Y2 + + adds X1 = -SIZE, X1 + ;; + .align 32 + +.L125: + { .mmi + mov XX = X1 + nop.m 0 + mov ar.lc = ARLC + } + { .mmi + (p12) LDFD f48 = [X1], 1 * SIZE + (p12) LDFD f52 = [X2], 1 * SIZE + tbit.z p0, p13 = N, 1 + } + ;; + { .mmi + (p12) LDFD f49 = [X1], INCXM1 + (p12) LDFD f53 = [X2], INCXM1 + mov pr = PR, -65474 + } + { .mib + nop.m 0 + tbit.z p0, p14 = N, 0 + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFD f50 = [X1], 1 * SIZE + (p12) LDFD f54 = [X2], 1 * SIZE + (p12) shladd XX = INCX, 2, XX;; + } + ;; + { .mmi + (p12) LDFD f51 = [X1], INCX3M1 + (p12) LDFD f55 = [X2], INCX3M1 + (p13) shladd XX = INCX, 1, XX;; + } + ;; + { .mmi + (p13) LDFD f56 = [X1], 1 * SIZE + (p14) LDFD f60 = [XX], 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f57 = [X1], INCXM1 + (p14) LDFD f61 = [XX] + mov YY = Y1 + } + ;; + { .mmi + (p12) STFD [Y1] = f48, 1 * SIZE + (p12) STFD [Y2] = f52, 1 * SIZE + } + { .mmi + (p13) LDFD f58 = [X1], 1 * SIZE + } + ;; + { .mmi + (p12) STFD [Y1] = f49 + (p12) STFD [Y2] = f53 + (p12) add Y1 = INCYM1, Y1 + } + { .mmi + (p13) LDFD f59 = [X1] + (p12) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f50, 1 * SIZE + (p12) STFD [Y2] = f54, 1 * SIZE + (p12) shladd YY = INCY, 2, YY;; + } + ;; + { .mmi + (p12) STFD [Y1] = f51 + (p12) STFD [Y2] = f55 + (p13) shladd YY = INCY, 1, YY + } + { .mmi + (p12) add Y1 = INCY3M1, Y1 + (p12) add Y2 = INCY3M1, Y2 + nop.i 0 + } + ;; + { .mmi + (p13) STFD [Y1] = f56, 1 * SIZE + (p14) STFD [YY] = f60, 1 * SIZE + } + ;; + { .mmi + (p13) STFD [Y1] = f57 + (p14) STFD [YY] = f61 + (p13) add Y1 = INCYM1, Y1 + } + ;; + { .mmi + (p13) STFD [Y1] = f58, 1 * SIZE + nop.m 0 + nop.i 0 + } + ;; + { .mib + (p13) STFD [Y1] = f59 + nop.i 0 + br.ret.sptk.many b0 + } + ;; + .align 32 + +.L130: + { .mmi + adds PREA = PREFETCH_SIZE * SIZE + 32, X1 +#ifndef DOUBLE + adds PREB = PREFETCH_SIZE * SIZE + 72, Y1 +#else + adds PREB = PREFETCH_SIZE * SIZE + 56, Y1 +#endif + mov ar.ec = 5 + } + { .mmi + LDFD f32 = [X1], 1 * SIZE + cmp.eq p16, p0 = r0, r0 + nop.i 0 + } + ;; +#ifndef DOUBLE +.L131: + { .mmi + (p19) STFD [Y1] = f35, 1 * SIZE + (p19) STFD [Y2] = f55, 1 * SIZE + nop.i 0 + } + { .mmi + (p16) lfetch.nt1 [PREA], INCX8 + (p16) LDFPD f37, f42 = [X1], 2 * SIZE + nop.i 0 + } + ;; + { .mmi + (p19) STFD [Y1] = f40 + (p19) STFD [Y2] = f60 + (p19) add Y1 = INCYM1, Y1 + } + { .mmi + (p16) lfetch.excl.nt1 [PREB], INCY8 + (p16) LDFPD f47, f52 = [X1], 2 * SIZE + (p19) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f45, 1 * SIZE + (p19) STFD [Y2] = f65, 1 * SIZE + nop.i 0 + } + { .mmi + (p16) LDFPD f57, f62 = [X1], 2 * SIZE + nop.m 0 + nop.i 0 + } + ;; + { .mmi + (p19) STFD [Y1] = f50 + (p19) STFD [Y2] = f70 + (p19) add Y1 = INCY3M1, Y1 + } + { .mmi + (p16) LDFPD f67, f72 = [X1], 2 * SIZE + (p19) add Y2 = INCY3M1, Y2 + nop.i 0 + } + ;; + { .mmi + (p19) STFD [Y1] = f75, 1 * SIZE + (p19) STFD [Y2] = f95, 1 * SIZE + nop.i 0 + } + { .mmi + (p16) LDFPD f77, f82 = [X1], 2 * SIZE + nop.m 0 + nop.i 0 + } + ;; + { .mmi + (p19) STFD [Y1] = f80 + (p19) STFD [Y2] = f100 + (p19) add Y1 = INCYM1, Y1 + } + { .mmi + (p16) LDFPD f87, f92 = [X1], 2 * SIZE + (p19) add Y2 = INCYM1, Y2 + nop.i 0 + } + ;; + { .mmi + (p19) STFD [Y1] = f85, 1 * SIZE + (p19) STFD [Y2] = f105, 1 * SIZE + nop.i 0 + } + { .mmi + (p16) LDFPD f97, f102 = [X1], 2 * SIZE + (p16) shladd X2 = INCX, 3, X2 + nop.i 0 + } + ;; + { .mmi + (p19) STFD [Y1] = f90 + (p19) STFD [Y2] = f111 + (p19) add Y1 = INCY3M1, Y1 + } + { .mmb + (p16) LDFPD f108, f127 = [X1], 2 * SIZE + (p19) add Y2 = INCY3M1, Y2 + br.ctop.sptk.few .L131 + } + ;; + { .mmi + adds X1 = -SIZE, X1 + nop.m 0 + nop.i 0 + } + ;; + .align 32 +#else +.L131: + { .mmi + (p20) STFD [Y1] = f46, 1 * SIZE + (p20) STFD [Y2] = f66, 1 * SIZE + } + { .mmi + (p16) lfetch.nt1 [PREA], INCX8 + (p16) LDFPD f37, f42 = [X1], 2 * SIZE + } + ;; + { .mmi + (p20) STFD [Y1] = f51 + (p20) STFD [Y2] = f71 + (p20) add Y1 = INCY3M1, Y1 + } + { .mmi + (p16) lfetch.excl.nt1 [PREB], INCY8 + (p16) LDFPD f47, f52 = [X1], 2 * SIZE + (p20) add Y2 = INCY3M1, Y2 + } + ;; + { .mmi + (p20) STFD [Y1] = f76, 1 * SIZE + (p20) STFD [Y2] = f96, 1 * SIZE + } + { .mmi + (p16) LDFPD f57, f62 = [X1], 2 * SIZE + } + ;; + { .mmi + (p20) STFD [Y1] = f81 + (p20) STFD [Y2] = f101 + (p20) add Y1 = INCYM1, Y1 + } + { .mmi + (p16) LDFPD f67, f72 = [X1], 2 * SIZE + (p20) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p20) STFD [Y1] = f86, 1 * SIZE + (p20) STFD [Y2] = f106, 1 * SIZE + } + { .mmi + (p16) LDFPD f77, f82 = [X1], 2 * SIZE + } + ;; + { .mmi + (p20) STFD [Y1] = f91 + (p20) STFD [Y2] = f112 + (p20) add Y1 = INCY3M1, Y1 + } + { .mmi + (p16) LDFPD f87, f92 = [X1], 2 * SIZE + (p20) add Y2 = INCY3M1, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f35, 1 * SIZE + (p19) STFD [Y2] = f55, 1 * SIZE + } + { .mmi + (p16) LDFPD f97, f102 = [X1], 2 * SIZE + (p16) shladd X2 = INCX, 3, X2 + } + ;; + { .mmi + (p19) STFD [Y1] = f40 + (p19) STFD [Y2] = f60 + (p19) add Y1 = INCYM1, Y1 + } + { .mmb + (p16) LDFPD f108, f127 = [X1], 2 * SIZE + (p19) add Y2 = INCYM1, Y2 + br.ctop.sptk.few .L131 + } + ;; + { .mmi + adds X1 = -SIZE, X1 + nop.m 0 + nop.i 0 + } + ;; + .align 32 +#endif + +.L135: + { .mmi + mov XX = X1 + nop.m 0 + mov ar.lc = ARLC + } + { .mmi + (p12) LDFD f48 = [X1], 1 * SIZE + (p12) LDFD f52 = [X2], 1 * SIZE + tbit.z p0, p13 = N, 1 + } + ;; + { .mmi + (p12) LDFD f49 = [X1], INCXM1 + (p12) LDFD f53 = [X2], INCXM1 + mov pr = PR, -65474 + } + { .mib + nop.m 0 + tbit.z p0, p14 = N, 0 + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFD f50 = [X1], 1 * SIZE + (p12) LDFD f54 = [X2], 1 * SIZE + (p12) shladd XX = INCX, 2, XX;; + } + ;; + { .mmi + (p12) LDFD f51 = [X1], INCX3M1 + (p12) LDFD f55 = [X2], INCX3M1 + (p13) shladd XX = INCX, 1, XX;; + } + ;; + { .mmi + (p13) LDFD f56 = [X1], 1 * SIZE + (p14) LDFD f60 = [XX], 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f57 = [X1], INCXM1 + (p14) LDFD f61 = [XX] + mov YY = Y1 + } + ;; + { .mmi + (p12) STFD [Y1] = f48, 1 * SIZE + (p12) STFD [Y2] = f52, 1 * SIZE + } + { .mmi + (p13) LDFD f58 = [X1], 1 * SIZE + } + ;; + { .mmi + (p12) STFD [Y1] = f49 + (p12) STFD [Y2] = f53 + (p12) add Y1 = INCYM1, Y1 + } + { .mmi + (p13) LDFD f59 = [X1] + (p12) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f50, 1 * SIZE + (p12) STFD [Y2] = f54, 1 * SIZE + (p12) shladd YY = INCY, 2, YY;; + } + ;; + { .mmi + (p12) STFD [Y1] = f51 + (p12) STFD [Y2] = f55 + (p13) shladd YY = INCY, 1, YY + } + { .mmi + (p12) add Y1 = INCY3M1, Y1 + (p12) add Y2 = INCY3M1, Y2 + nop.i 0 + } + ;; + { .mmi + (p13) STFD [Y1] = f56, 1 * SIZE + (p14) STFD [YY] = f60, 1 * SIZE + } + ;; + { .mmi + (p13) STFD [Y1] = f57 + (p14) STFD [YY] = f61 + (p13) add Y1 = INCYM1, Y1 + } + ;; + { .mmi + (p13) STFD [Y1] = f58, 1 * SIZE + nop.m 0 + nop.i 0 + } + ;; + { .mib + (p13) STFD [Y1] = f59 + nop.i 0 + br.ret.sptk.many b0 + } + ;; + +/* Unaligned Copy INCX =! 1 */ +.L200: + ;; + { .mmi + adds PREA = PREFETCH_SIZE * SIZE + 32, X1 + adds PREB = PREFETCH_SIZE * SIZE + 32, Y1 + mov ar.ec = 5 + } + { .mmi + cmp.eq p16, p0 = r0, r0 + nop.m 0 + nop.i 0 + } + ;; + .align 32 + +.L221: + { .mmi + (p20) STFD [Y1] = f91 + (p20) STFD [Y2] = f111 + (p20) add Y1 = INCY3M1, Y1 + } + { .mmi + (p16) LDFD f32 = [X1], 1 * SIZE + (p16) LDFD f52 = [X2], 1 * SIZE + (p20) add Y2 = INCY3M1, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f35, 1 * SIZE + (p19) STFD [Y2] = f55, 1 * SIZE + } + { .mmi + (p16) LDFD f37 = [X1], INCXM1 + (p16) LDFD f57 = [X2], INCXM1 + } + ;; + { .mmi + (p19) STFD [Y1] = f40 + (p19) STFD [Y2] = f60 + (p19) add Y1 = INCYM1, Y1 + } + { .mmi + (p16) LDFD f42 = [X1], 1 * SIZE + (p16) LDFD f62 = [X2], 1 * SIZE + (p19) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f45, 1 * SIZE + (p19) STFD [Y2] = f65, 1 * SIZE + } + { .mmi + (p16) LDFD f47 = [X1], INCX3M1 + (p16) LDFD f67 = [X2], INCX3M1 + } + ;; + { .mmi + (p19) STFD [Y1] = f50 + (p19) STFD [Y2] = f70 + (p19) add Y1 = INCY3M1, Y1 + } + { .mmi + (p16) LDFD f72 = [X1], 1 * SIZE + (p16) LDFD f92 = [X2], 1 * SIZE + (p19) add Y2 = INCY3M1, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f75, 1 * SIZE + (p19) STFD [Y2] = f95, 1 * SIZE + } + { .mmi + (p16) LDFD f77 = [X1], INCXM1 + (p16) LDFD f97 = [X2], INCXM1 + } + ;; + { .mmi + (p19) STFD [Y1] = f80 + (p19) STFD [Y2] = f100 + (p19) add Y1 = INCYM1, Y1 + } + { .mmi + (p16) LDFD f82 = [X1], 1 * SIZE + (p16) LDFD f102 = [X2], 1 * SIZE + (p19) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f85, 1 * SIZE + (p19) STFD [Y2] = f105, 1 * SIZE + } + { .mmb + (p16) LDFD f87 = [X1], INCX3M1 + (p16) LDFD f107 = [X2], INCX3M1 + br.ctop.sptk.few .L221 + } + ;; + .align 32 + +.L225: + { .mmi + mov XX = X1 + nop.m 0 + mov ar.lc = ARLC + } + { .mmi + (p12) LDFD f48 = [X1], 1 * SIZE + (p12) LDFD f52 = [X2], 1 * SIZE + tbit.z p0, p13 = N, 1 + } + ;; + { .mmi + (p12) LDFD f49 = [X1], INCXM1 + (p12) LDFD f53 = [X2], INCXM1 + mov pr = PR, -65474 + } + { .mib + nop.m 0 + tbit.z p0, p14 = N, 0 + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFD f50 = [X1], 1 * SIZE + (p12) LDFD f54 = [X2], 1 * SIZE + (p12) shladd XX = INCX, 2, XX;; + } + ;; + { .mmi + (p12) LDFD f51 = [X1], INCX3M1 + (p12) LDFD f55 = [X2], INCX3M1 + (p13) shladd XX = INCX, 1, XX;; + } + ;; + { .mmi + (p13) LDFD f56 = [X1], 1 * SIZE + (p14) LDFD f60 = [XX], 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f57 = [X1], INCXM1 + (p14) LDFD f61 = [XX] + mov YY = Y1 + } + ;; + { .mmi + (p12) STFD [Y1] = f48, 1 * SIZE + (p12) STFD [Y2] = f52, 1 * SIZE + } + { .mmi + (p13) LDFD f58 = [X1], 1 * SIZE + } + ;; + { .mmi + (p12) STFD [Y1] = f49 + (p12) STFD [Y2] = f53 + (p12) add Y1 = INCYM1, Y1 + } + { .mmi + (p13) LDFD f59 = [X1] + (p12) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f50, 1 * SIZE + (p12) STFD [Y2] = f54, 1 * SIZE + (p12) shladd YY = INCY, 2, YY;; + } + ;; + { .mmi + (p12) STFD [Y1] = f51 + (p12) STFD [Y2] = f55 + (p13) shladd YY = INCY, 1, YY + } + { .mmi + (p12) add Y1 = INCY3M1, Y1 + (p12) add Y2 = INCY3M1, Y2 + nop.i 0 + } + ;; + { .mmi + (p13) STFD [Y1] = f56, 1 * SIZE + (p14) STFD [YY] = f60, 1 * SIZE + } + ;; + { .mmi + (p13) STFD [Y1] = f57 + (p14) STFD [YY] = f61 + (p13) add Y1 = INCYM1, Y1 + } + ;; + { .mmi + (p13) STFD [Y1] = f58, 1 * SIZE + nop.m 0 + nop.i 0 + } + ;; + { .mib + (p13) STFD [Y1] = f59 + nop.i 0 + br.ret.sptk.many b0 + } + + EPILOGUE + diff --git a/kernel/ia64/zdot.S b/kernel/ia64/zdot.S new file mode 100644 index 0000000..5c77ce6 --- /dev/null +++ b/kernel/ia64/zdot.S @@ -0,0 +1,487 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef DOUBLE +#define PREFETCH_SIZE (13 * 16) +#else +#define PREFETCH_SIZE ( 9 * 32) +#endif + +#if defined(F_INTERFACE) && defined(RETURN_BY_STACK) +#define N r33 +#define X1 r34 +#define INCX r35 +#define Y1 r36 +#define INCY r37 +#else +#define N r32 +#define X1 r33 +#define INCX r34 +#define Y1 r35 +#define INCY r36 +#endif + +#define PRE1 r2 +#define PRE2 r3 + +#define I r14 +#define J r15 +#define Y2 r16 +#define X2 r17 +#define INCXM1 r18 +#define INCYM1 r19 +#define INCX16 r20 +#define INCY16 r21 +#define INCX3M1 r22 +#define INCY3M1 r23 +#define XX r24 +#define YY r25 + +#define PR r30 +#define ARLC r31 + +#define ALPHA f8 + + PROLOGUE + .prologue + PROFCODE + + { .mfi + mov f8 = f0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mfi + mov f9 = f0 + } + ;; + .body +#ifdef F_INTERFACE + LDINT N = [N] + LDINT INCX = [INCX] + LDINT INCY = [INCY] + ;; +#ifndef USE64BITINT + sxt4 N = N + sxt4 INCX = INCX + sxt4 INCY = INCY + ;; +#endif +#endif + { .mmi + shladd INCX = INCX, ZBASE_SHIFT, r0 + shladd INCY = INCY, ZBASE_SHIFT, r0 + mov PR = pr + } + { .mib + cmp.lt p0, p7 = r0, N + mov r26 = 1 + (p7) br.cond.spnt .L1000 + } + ;; +#ifdef F_INTERFACE + cmp.le p0, p6 = r0, INCX + cmp.le p0, p7 = r0, INCY + sub r26 = r26, N + ;; + setf.sig f32 = r26 + setf.sig f33 = INCX + setf.sig f34 = INCY + ;; + xmpy.l f33 = f32, f33 + xmpy.l f34 = f32, f34 + ;; + getf.sig r26 = f33 + getf.sig r27 = f34 + ;; + (p6) add X1 = X1, r26 + (p7) add Y1 = Y1, r27 + ;; +#endif + { .mfi +#ifdef DOUBLE + adds PRE1 = (PREFETCH_SIZE + 4) * SIZE, X1 +#else + adds PRE1 = (PREFETCH_SIZE + 8) * SIZE, X1 +#endif + mov f10 = f0 + mov pr.rot= 0 + } + { .mfi + and J = 7, N + mov f11 = f0 + shr I = N, 3 + } + ;; + { .mfi +#ifdef DOUBLE + adds PRE2 = (PREFETCH_SIZE + 6) * SIZE, Y1 +#else + adds PRE2 = (PREFETCH_SIZE + 12) * SIZE, Y1 +#endif + mov f12 = f0 + mov ar.ec = 3 + } + { .mmf + shladd INCX16 = INCX, 3, r0 + shladd INCY16 = INCY, 3, r0 + mov f13 = f0 + } + ;; + { .mmf + shladd INCX3M1 = INCX, 1, INCX + shladd INCY3M1 = INCY, 1, INCY + mov f14 = f0 + } + { .mmf + adds INCXM1 = -SIZE, INCX + adds INCYM1 = -SIZE, INCY + mov f15 = f0 + } + ;; + { .mmi + adds INCX3M1 = -SIZE, INCX3M1 + adds INCY3M1 = -SIZE, INCY3M1 + tbit.z p0, p12 = N, 2 + } + { .mmi + cmp.eq p8 ,p0 = r0, I + cmp.eq p16, p0 = r0, r0 + adds I = -1, I + } + ;; + { .mmi + shladd X2 = INCX, 1, X1 + shladd Y2 = INCY, 1, Y1 + mov ar.lc = I + } + { .mmb + mov XX = X1 + mov YY = Y1 + (p8) br.cond.dpnt .L55 + } + ;; + .align 32 + +.L52: + { .mmf + (p16) lfetch.nt1 [PRE1], INCX16 + (p16) LDFD f32 = [X1], SIZE + (p18) FMA f8 = f34, f82, f8 + } + { .mmf + (p16) LDFD f44 = [X2], SIZE + nop.m 0 + (p18) FMA f9 = f34, f85, f9 + } + ;; + { .mmf + (p16) LDFD f80 = [Y1], SIZE + (p16) LDFD f92 = [Y2], SIZE + (p18) FMA f10 = f37, f82, f10 + } + { .mmf + nop.m 0 + nop.m 0 + (p18) FMA f11 = f37, f85, f11 + } + ;; + { .mmf + (p16) lfetch.nt1 [PRE2], INCY16 + (p16) LDFD f35 = [X1], INCXM1 + (p18) FMA f12 = f40, f88, f12 + } + { .mmf + (p16) LDFD f47 = [X2], INCXM1 + nop.m 0 + (p18) FMA f13 = f40, f91, f13 + } + ;; + { .mmf + (p16) LDFD f83 = [Y1], INCYM1 + (p16) LDFD f95 = [Y2], INCYM1 + (p18) FMA f14 = f43, f88, f14 + } + { .mmf + nop.m 0 + nop.m 0 + (p18) FMA f15 = f43, f91, f15 + } + ;; + { .mmf + (p16) LDFD f38 = [X1], SIZE + (p16) LDFD f50 = [X2], SIZE + (p18) FMA f8 = f46, f94, f8 + } + { .mmf + nop.m 0 + nop.m 0 + (p18) FMA f9 = f46, f97, f9 + } + ;; + { .mmf + (p16) LDFD f86 = [Y1], SIZE + (p16) LDFD f98 = [Y2], SIZE + (p18) FMA f10 = f49, f94, f10 + } + { .mmf + nop.m 0 + nop.m 0 + (p18) FMA f11 = f49, f97, f11 + } + ;; + { .mmf + (p16) LDFD f41 = [X1], INCX3M1 + (p16) LDFD f53 = [X2], INCX3M1 + (p18) FMA f12 = f52, f100, f12 + } + { .mmf + nop.m 0 + nop.m 0 + (p18) FMA f13 = f52, f103, f13 + } + ;; + { .mmf + (p16) LDFD f89 = [Y1], INCY3M1 + (p16) LDFD f101 = [Y2], INCY3M1 + (p18) FMA f14 = f55, f100, f14 + } + { .mmf + nop.m 0 + nop.m 0 + (p18) FMA f15 = f55, f103, f15 + } + ;; + { .mmf + (p16) LDFD f56 = [X1], SIZE + (p16) LDFD f68 = [X2], SIZE + (p18) FMA f8 = f58, f106, f8 + } + { .mmf + nop.m 0 + nop.m 0 + (p18) FMA f9 = f58, f109, f9 + } + ;; + { .mmf + (p16) LDFD f104 = [Y1], SIZE + (p16) LDFD f116 = [Y2], SIZE + (p18) FMA f10 = f61, f106, f10 + } + { .mmf + nop.m 0 + nop.m 0 + (p18) FMA f11 = f61, f109, f11 + } + ;; + { .mmf + (p16) LDFD f59 = [X1], INCXM1 + (p16) LDFD f71 = [X2], INCXM1 + (p18) FMA f12 = f64, f112, f12 + } + { .mmf + nop.m 0 + nop.m 0 + (p18) FMA f13 = f64, f115, f13 + } + ;; + { .mmf + (p16) LDFD f107 = [Y1], INCYM1 + (p16) LDFD f119 = [Y2], INCYM1 + (p18) FMA f14 = f67, f112, f14 + } + { .mmf + nop.m 0 + nop.m 0 + (p18) FMA f15 = f67, f115, f15 + } + ;; + { .mmf + (p16) LDFD f62 = [X1], SIZE + (p16) LDFD f74 = [X2], SIZE + (p18) FMA f8 = f70, f118, f8 + } + { .mmf + nop.m 0 + nop.m 0 + (p18) FMA f9 = f70, f121, f9 + } + ;; + { .mmf + (p16) LDFD f110 = [Y1], SIZE + (p16) LDFD f122 = [Y2], SIZE + (p18) FMA f10 = f73, f118, f10 + } + { .mmf + nop.m 0 + nop.m 0 + (p18) FMA f11 = f73, f121, f11 + } + ;; + { .mmf + (p16) LDFD f65 = [X1], INCX3M1 + (p16) LDFD f77 = [X2], INCX3M1 + (p18) FMA f12 = f76, f124, f12 + } + { .mmf + (p16) add XX = INCX16, XX + (p16) add YY = INCY16, YY + (p18) FMA f13 = f76, f127, f13 + } + ;; + { .mmf + (p16) LDFD f113 = [Y1], INCY3M1 + (p16) LDFD f125 = [Y2], INCY3M1 + (p18) FMA f14 = f79, f124, f14 + } + { .mfb + nop.m 0 + (p18) FMA f15 = f79, f127, f15 + br.ctop.sptk.few .L52 + } + ;; + .align 32 + +.L55: + (p12) LDFD f32 = [X1], SIZE + (p12) LDFD f40 = [X2], SIZE + tbit.z p0, p13 = N, 1 + (p12) LDFD f34 = [Y1], SIZE + (p12) LDFD f42 = [Y2], SIZE + tbit.z p0, p14 = N, 0 + ;; + (p12) LDFD f33 = [X1], INCXM1 + (p12) LDFD f41 = [X2], INCXM1 + cmp.eq p9, p0 = r0, J + (p12) LDFD f35 = [Y1], INCYM1 + (p12) LDFD f43 = [Y2], INCYM1 + (p9) br.cond.dptk .L999 + ;; + (p12) LDFD f36 = [X1], SIZE + (p12) LDFD f44 = [X2], SIZE + (p12) shladd XX = INCX, 2, XX + (p12) LDFD f38 = [Y1], SIZE + (p12) LDFD f46 = [Y2], SIZE + (p12) shladd YY = INCY, 2, YY + ;; + (p12) LDFD f37 = [X1], INCX3M1 + (p12) LDFD f45 = [X2], INCX3M1 + (p13) shladd XX = INCX, 1, XX + (p12) LDFD f39 = [Y1], INCY3M1 + (p12) LDFD f47 = [Y2], INCY3M1 + (p13) shladd YY = INCY, 1, YY + ;; + (p13) LDFD f48 = [X1], SIZE + (p13) LDFD f50 = [Y1], SIZE + (p14) LDFD f56 = [XX], SIZE + (p14) LDFD f58 = [YY], SIZE + ;; + (p13) LDFD f49 = [X1], INCXM1 + (p13) LDFD f51 = [Y1], INCYM1 + (p14) LDFD f57 = [XX] + (p14) LDFD f59 = [YY] + ;; + (p13) LDFD f52 = [X1], SIZE + (p13) LDFD f54 = [Y1], SIZE + ;; + (p13) LDFD f53 = [X1] + (p13) LDFD f55 = [Y1] + ;; + (p12) FMA f8 = f32, f34, f8 + (p12) FMA f9 = f32, f35, f9 + (p12) FMA f10 = f33, f34, f10 + (p12) FMA f11 = f33, f35, f11 + (p12) FMA f12 = f36, f38, f12 + (p12) FMA f13 = f36, f39, f13 + (p12) FMA f14 = f37, f38, f14 + (p12) FMA f15 = f37, f39, f15 + ;; + (p12) FMA f8 = f40, f42, f8 + (p12) FMA f9 = f40, f43, f9 + (p12) FMA f10 = f41, f42, f10 + (p12) FMA f11 = f41, f43, f11 + (p12) FMA f12 = f44, f46, f12 + (p12) FMA f13 = f44, f47, f13 + (p12) FMA f14 = f45, f46, f14 + (p12) FMA f15 = f45, f47, f15 + ;; + (p13) FMA f8 = f48, f50, f8 + (p13) FMA f9 = f48, f51, f9 + (p13) FMA f10 = f49, f50, f10 + (p13) FMA f11 = f49, f51, f11 + (p13) FMA f12 = f52, f54, f12 + (p13) FMA f13 = f52, f55, f13 + (p13) FMA f14 = f53, f54, f14 + (p13) FMA f15 = f53, f55, f15 + ;; + (p14) FMA f8 = f56, f58, f8 + (p14) FMA f9 = f56, f59, f9 + (p14) FMA f10 = f57, f58, f10 + (p14) FMA f11 = f57, f59, f11 + .align 32 + ;; +.L999: + FADD f8 = f8, f12 + FADD f9 = f9, f13 + FADD f10 = f10, f14 + FADD f11 = f11, f15 + mov ar.lc = ARLC + ;; +#ifndef CONJ + FSUB f8 = f8, f11 + FADD f9 = f9, f10 +#else + FADD f8 = f8, f11 + FSUB f9 = f9, f10 +#endif + ;; + .align 32 + +.L1000: +#if defined(F_INTERFACE) && defined(RETURN_BY_STACK) + STFD [r32] = f8, SIZE + ;; + STFD [r32] = f9, SIZE +#endif + mov pr = PR, -65474 + br.ret.sptk.many b0 + EPILOGUE + diff --git a/kernel/ia64/zgemm3m_kernel.S b/kernel/ia64/zgemm3m_kernel.S new file mode 100644 index 0000000..5adb66a --- /dev/null +++ b/kernel/ia64/zgemm3m_kernel.S @@ -0,0 +1,6803 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef DOUBLE +#define PREFETCHSIZE (16 * 8) +#else +#define PREFETCHSIZE (32 * 8) +#endif + +#define CPREFETCHSIZE 15 +#define CPREFETCH lfetch.excl.nt1 + +#define M r32 +#define N r33 +#define K r34 +#define A r37 +#define B r38 +#define C r39 +#define LDC r35 + +#define I r15 +#define J r16 +#define AOFFSET r17 +#define BOFFSET r18 +#define L r20 + +#define C1 r21 +#define C2 r22 +#define C3 r23 +#define C4 r24 +#define C5 r25 +#define C6 r26 +#define C7 r27 +#define C8 r28 + +#define C9 loc0 +#define C10 loc1 +#define C11 loc2 +#define C12 loc3 +#define C13 loc4 +#define C14 loc5 +#define C15 loc6 +#define C16 loc7 + +#define PREA r8 +#define PREB r9 +#define PREC r10 +#define SP r12 +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#define ALPHA_R f8 +#define ALPHA_I f9 + + PROLOGUE + .prologue + PROFCODE + + { .mmi + .save ar.pfs, ARPFS + alloc ARPFS = ar.pfs, 8, 16, 0, 0 + adds r14 = 16, SP + mov ARLC = ar.lc + } + { .mmi + adds r8 = -16 * 16, SP + adds r9 = -15 * 16, SP + adds SP = -16 * 16, SP + } + ;; + { .mmi + stf.spill [r8] = f16, 32 + stf.spill [r9] = f17, 32 + mov PR = pr + } + { .mmi + ld8 LDC = [r14], 8 + nop __LINE__ + nop __LINE__ + } + ;; + stf.spill [r8] = f18, 32 + stf.spill [r9] = f19, 32 + shr J = N, 3 + ;; + stf.spill [r8] = f20, 32 + stf.spill [r9] = f21, 32 + shladd LDC = LDC, ZBASE_SHIFT, r0 + ;; + stf.spill [r8] = f22, 32 + stf.spill [r9] = f23, 32 + mov AOFFSET = A + ;; + stf.spill [r8] = f24, 32 + stf.spill [r9] = f25, 32 + cmp.ge p6, p0 = 0, J + ;; + stf.spill [r8] = f26, 32 + stf.spill [r9] = f27, 32 + ;; + stf.spill [r8] = f28, 32 + stf.spill [r9] = f29, 32 + ;; + stf.spill [r8] = f30 + stf.spill [r9] = f31 + (p6) br.cond.dpnt .L050 + .body + ;; + .align 32 + +.L010: + { .mfi + adds J = -1, J + mov f64 = f0 + shr I = M, 3 + } + { .mfi + mov C1 = C // coffset1 = c + 0 * ldc + mov f72 = f0 + } + ;; + { .mmf + cmp.eq p6, p7 = 0, I + nop __LINE__ + mov f80 = f0 + } + { .mmf + add C2 = LDC, C // coffset2 = c + 1 * ldc + shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc + mov f88 = f0 + } + ;; + { .mmf + shladd C5 = LDC, 2, C // coffset5 = c + 4 * ldc + shladd C = LDC, 3, C // coffset += 8 * ldc + mov f96 = f0 + } + { .mmf + shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc + shladd C6 = LDC, 2, C2 // coffset6 = c + 5 * ldc + mov f104 = f0 + } + ;; + { .mfi + shladd C7 = LDC, 2, C3 // coffset7 = c + 6 * ldc + mov f112 = f0 + nop __LINE__ + } + { .mfb + sub C8 = C, LDC // coffset8 = c + 7 * ldc + mov f120 = f0 + (p6) br.cond.dpnt .L020 + } + ;; + .align 16 + +.L011: + { .mfb + LDFPD f48, f49 = [B] + mov f65 = f0 + nop __LINE__ + } + { .mfb + adds BOFFSET = 2 * SIZE, B + mov f73 = f0 + nop __LINE__ + } + ;; + { .mfb + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f81 = f0 + nop __LINE__ + } + { .mfb + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f89 = f0 + nop __LINE__ + } + ;; + { .mmf + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + setf.d f97 = r0 + mov f105 = f0 + } + { .mfb + setf.d f113 = r0 + mov f121 = f0 + nop __LINE__ + } + ;; + { .mmf + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + setf.d f66 = r0 + mov f74 = f0 + } + { .mfb + setf.d f82 = r0 + mov f90 = f0 + nop __LINE__ + } + ;; + { .mmf + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + setf.d f98 = r0 + mov f106 = f0 + } + { .mfb + setf.d f114 = r0 + mov f122 = f0 + nop __LINE__ + } + ;; + { .mmf + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + setf.d f67 = r0 + mov f75 = f0 + } + { .mfi + setf.d f83 = r0 + mov f91 = f0 + nop __LINE__ + } + ;; + { .mmf + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + setf.d f99 = r0 + mov f107 = f0 + } + { .mfi + setf.d f115 = r0 + mov f123 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f68 = r0 + mov f76 = f0 + } + { .mfi + setf.d f84 = r0 + mov f92 = f0 + adds L = 1, K + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f100 = r0 + mov f108 = f0 + } + { .mfi + setf.d f116 = r0 + mov f124 = f0 + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f69 = r0 + mov f77 = f0 + } + { .mfi + setf.d f85 = r0 + mov f93 = f0 + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f101 = r0 + mov f109 = f0 + } + { .mfi + setf.d f117 = r0 + mov f125 = f0 + tbit.z p12, p0 = L, 0 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f70 = r0 + mov f78 = f0 + } + { .mfi + setf.d f86 = r0 + mov f94 = f0 + shr L = L, 1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f102 = r0 + mov f110 = f0 + } + { .mfi + setf.d f118 = r0 + mov f126 = f0 + adds L = -1, L + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f71 = r0 + mov f79 = f0 + } + { .mfi + setf.d f87 = r0 + mov f95 = f0 + mov ar.lc = L + } + ;; + { .mmf + CPREFETCH [PREC] + setf.d f103 = r0 + mov f111 = f0 + } + { .mfi + setf.d f119 = r0 + mov f127 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + .align 16 + +.L012: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; +/* 2 */ + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfi + cmp.ne p4, p5 = 0, L + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; +/* 3 */ + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfi + adds C9 = 4 * SIZE, C1 + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; +/* 4 */ + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfi + adds C10 = 4 * SIZE, C2 + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; +/* 5 */ + { .mfi + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfi + adds C11 = 4 * SIZE, C3 + FMA f73 = f33, f49, f73 // A2 * B2 + nop __LINE__ + } + ;; +/* 6 */ + { .mfi + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfi + adds C12 = 4 * SIZE, C4 + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; +/* 7 */ + { .mfi + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfi + adds C13 = 4 * SIZE, C5 + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; +/* 8 */ + { .mfi + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfi + adds C14 = 4 * SIZE, C6 + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; +/* 9 */ + { .mfi + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfi + adds C15 = 4 * SIZE, C7 + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; +/* 10 */ + { .mfi + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfi + adds C16 = 4 * SIZE, C8 + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; +/* 11 */ + { .mfi + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f106 = f34, f53, f106 // A3 * B6 + nop __LINE__ + } + ;; +/* 12 */ + { .mfi + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f122 = f34, f55, f122 // A3 * B8 + nop __LINE__ + } + ;; +/* 13 */ + { .mfi + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfi + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; +/* 14 */ + { .mfi + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; +/* 15 */ + { .mfi + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f107 = f35, f53, f107 // A4 * B6 + nop __LINE__ + } + ;; +/* 16 */ + { .mfi + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f123 = f35, f55, f123 // A4 * B8 + nop __LINE__ + } + ;; +/* 17 */ + { .mfi + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; +/* 18 */ + { .mfi + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f92 = f36, f51, f92 // A5 * B4 + nop __LINE__ + } + ;; +/* 19 */ + { .mfi + nop __LINE__ + FMA f100 = f36, f52, f100 // A5 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f108 = f36, f53, f108 // A5 * B6 + nop __LINE__ + } + ;; +/* 20 */ + { .mfi + nop __LINE__ + FMA f116 = f36, f54, f116 // A5 * B7 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f124 = f36, f55, f124 // A5 * B8 + nop __LINE__ + } + ;; +/* 21 */ + { .mfi + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; +/* 22 */ + { .mfi + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f93 = f37, f51, f93 // A6 * B4 + nop __LINE__ + } + ;; +/* 23 */ + { .mfi + nop __LINE__ + FMA f101 = f37, f52, f101 // A6 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f109 = f37, f53, f109 // A6 * B6 + nop __LINE__ + } + ;; +/* 24 */ + { .mfi + nop __LINE__ + FMA f117 = f37, f54, f117 // A6 * B7 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f125 = f37, f55, f125 // A6 * B8 + nop __LINE__ + } + ;; +/* 25 */ + { .mfi + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; +/* 26 */ + { .mfi + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f94 = f38, f51, f94 // A7 * B4 + nop __LINE__ + } + ;; +/* 27 */ + { .mfi + nop __LINE__ + FMA f102 = f38, f52, f102 // A7 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f110 = f38, f53, f110 // A7 * B6 + nop __LINE__ + } + ;; +/* 28 */ + { .mfi + nop __LINE__ + FMA f118 = f38, f54, f118 // A7 * B7 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f126 = f38, f55, f126 // A7 * B8 + nop __LINE__ + } + ;; +/* 29 */ + { .mfi + nop __LINE__ + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; +/* 30 */ + { .mfi + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f95 = f39, f51, f95 // A8 * B4 + nop __LINE__ + } + ;; +/* 31 */ + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f103 = f39, f52, f103 // A8 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f111 = f39, f53, f111 // A8 * B6 + nop __LINE__ + } + ;; +/* 32 */ + { .mfi + nop __LINE__ + FMA f119 = f39, f54, f119 // A8 * B7 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f127 = f39, f55, f127 // A8 * B8 + nop __LINE__ + } + ;; +/* 33 */ + { .mfi + nop __LINE__ + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; +/* 34 */ + { .mfi + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; +/* 35 */ + { .mfi + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; +/* 36 */ + { .mfi + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; +/* 37 */ + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; +/* 38 */ + { .mfi + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; +/* 39 */ + { .mfi + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; +/* 40 */ + { .mfi + (p5) LDFD f6 = [C1 ], SIZE + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfi + (p5) LDFD f7 = [C9 ], SIZE + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + nop __LINE__ + } + ;; + /* 41 */ + { .mfi + (p5) LDFD f10 = [C1 ], SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfi + (p5) LDFD f11 = [C9 ], SIZE + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; +/* 42 */ + { .mfi + (p5) LDFD f12 = [C1 ], SIZE + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfi + (p5) LDFD f13 = [C9 ], SIZE + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; +/* 43 */ + { .mfi + (p5) LDFD f14 = [C1 ], 5 * SIZE + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfi + (p5) LDFD f15 = [C9 ], 5 * SIZE + (p3) FMA f106 = f42, f61, f106 // A3 * B6 + nop __LINE__ + } + ;; +/* 44 */ + { .mfi + (p5) LDFD f16 = [C1 ], SIZE + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfi + (p5) LDFD f17 = [C9 ], SIZE + (p3) FMA f122 = f42, f63, f122 // A3 * B8 + nop __LINE__ + } + ;; +/* 45 */ + { .mfi + (p5) LDFD f18 = [C1 ], SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfi + (p5) LDFD f19 = [C9 ], SIZE + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; +/* 46 */ + { .mfi + (p5) LDFD f20 = [C1 ], SIZE + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfi + (p5) LDFD f21 = [C9 ], SIZE + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; +/* 47 */ + { .mfi + (p5) LDFD f22 = [C1 ], - 11 * SIZE + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfi + (p5) LDFD f23 = [C9 ], - 11 * SIZE + (p3) FMA f107 = f43, f61, f107 // A4 * B6 + nop __LINE__ + } + ;; +/* 48 */ + { .mfi + (p5) LDFD f24 = [C2 ], SIZE + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + nop __LINE__ + } + { .mfi + (p5) LDFD f25 = [C10], SIZE + (p3) FMA f123 = f43, f63, f123 // A4 * B8 + nop __LINE__ + } + ;; +/* 49 */ + { .mfi + (p5) LDFD f26 = [C2 ], SIZE + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfi + (p5) LDFD f27 = [C10], SIZE + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; +/* 50 */ + { .mfi + (p5) LDFD f28 = [C2 ], SIZE + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfi + (p5) LDFD f29 = [C10], SIZE + (p3) FMA f92 = f44, f59, f92 // A5 * B4 + nop __LINE__ + } + ;; +/* 51 */ + { .mfi + (p5) LDFD f30 = [C2 ], 5 * SIZE + (p3) FMA f100 = f44, f60, f100 // A5 * B5 + nop __LINE__ + } + { .mfi + (p5) LDFD f31 = [C10], 5 * SIZE + (p3) FMA f108 = f44, f61, f108 // A5 * B6 + nop __LINE__ + } + ;; +/* 52 */ + { .mfi + (p5) LDFD f32 = [C2 ], SIZE + (p3) FMA f116 = f44, f62, f116 // A5 * B7 + nop __LINE__ + } + { .mfi + (p5) LDFD f33 = [C10], SIZE + (p3) FMA f124 = f44, f63, f124 // A5 * B8 + nop __LINE__ + } + ;; +/* 53 */ + { .mfi + (p5) LDFD f34 = [C2 ], SIZE + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfi + (p5) LDFD f35 = [C10], SIZE + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; +/* 54 */ + { .mfi + (p5) LDFD f36 = [C2 ], SIZE + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfi + (p5) LDFD f37 = [C10], SIZE + (p3) FMA f93 = f45, f59, f93 // A6 * B4 + nop __LINE__ + } + ;; +/* 55 */ + { .mfi + (p5) LDFD f38 = [C2 ], - 11 * SIZE + (p3) FMA f101 = f45, f60, f101 // A6 * B5 + nop __LINE__ + } + { .mfi + (p5) LDFD f39 = [C10], - 11 * SIZE + (p3) FMA f109 = f45, f61, f109 // A6 * B6 + nop __LINE__ + } + ;; +/* 56 */ + { .mfi + (p5) LDFD f48 = [C3 ], SIZE + (p3) FMA f117 = f45, f62, f117 // A6 * B7 + nop __LINE__ + } + { .mfi + (p5) LDFD f49 = [C11], SIZE + (p3) FMA f125 = f45, f63, f125 // A6 * B8 + nop __LINE__ + } + ;; +/* 57 */ + { .mfi + (p5) LDFD f50 = [C3 ], SIZE + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfi + (p5) LDFD f51 = [C11], SIZE + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; +/* 58 */ + { .mfi + (p5) LDFD f52 = [C3 ], SIZE + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfi + (p5) LDFD f53 = [C11], SIZE + (p3) FMA f94 = f46, f59, f94 // A7 * B4 + nop __LINE__ + } + ;; +/* 59 */ + { .mfi + (p5) LDFD f54 = [C3 ], 5 * SIZE + (p3) FMA f102 = f46, f60, f102 // A7 * B5 + nop __LINE__ + } + { .mfi + (p5) LDFD f55 = [C11], 5 * SIZE + (p3) FMA f110 = f46, f61, f110 // A7 * B6 + nop __LINE__ + } + ;; +/* 60 */ + { .mfi + (p5) LDFD f40 = [C3 ], SIZE + (p3) FMA f118 = f46, f62, f118 // A7 * B7 + nop __LINE__ + } + { .mfi + (p5) LDFD f41 = [C11], SIZE + (p3) FMA f126 = f46, f63, f126 // A7 * B8 + nop __LINE__ + } + ;; +/* 61 */ + { .mfi + (p5) LDFD f42 = [C3 ], SIZE + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfi + (p5) LDFD f43 = [C11], SIZE + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + nop __LINE__ + } + ;; +/* 62 */ + { .mfi + (p5) LDFD f44 = [C3 ], SIZE + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + nop __LINE__ + } + { .mfi + (p5) LDFD f45 = [C11], SIZE + (p3) FMA f95 = f47, f59, f95 // A8 * B4 + nop __LINE__ + } + ;; +/* 63 */ + { .mfi + (p5) LDFD f46 = [C3 ], - 11 * SIZE + (p3) FMA f103 = f47, f60, f103 // A8 * B5 + nop __LINE__ + } + { .mfi + (p5) LDFD f56 = [C11], - 11 * SIZE + (p3) FMA f111 = f47, f61, f111 // A8 * B6 + nop __LINE__ + } + ;; +/* 64 */ + { .mfi + (p5) LDFD f57 = [C4 ], SIZE + (p3) FMA f119 = f47, f62, f119 // A8 * B7 + adds L = -1, L + } + { .mfb + (p5) LDFD f58 = [C12], SIZE + (p3) FMA f127 = f47, f63, f127 // A8 * B8 + br.cloop.sptk.few .L012 + } + ;; +.L013: + { .mmf + (p5) LDFD f59 = [C4 ], SIZE + (p5) LDFD f60 = [C12], SIZE + FMA f6 = ALPHA_R, f64, f6 + } + { .mmf + cmp.ne p6, p0 = 1, I + nop __LINE__ + FMA f7 = ALPHA_R, f66, f7 + } + ;; + { .mmf + (p5) LDFD f61 = [C4 ], SIZE + (p5) LDFD f62 = [C12], SIZE + FMA f10 = ALPHA_I, f64, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f11 = ALPHA_I, f66, f11 + } + ;; + { .mmf + (p5) LDFD f63 = [C4 ], 5 * SIZE + (p5) LDFD f47 = [C12], 5 * SIZE + FMA f12 = ALPHA_R, f65, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_R, f67, f13 + } + ;; + { .mfi + (p5) LDFD f64 = [C4 ], SIZE + FMA f14 = ALPHA_I, f65, f14 + nop __LINE__ + } + { .mfi + (p5) LDFD f65 = [C12], SIZE + FMA f15 = ALPHA_I, f67, f15 + nop __LINE__ + } + ;; + { .mmf + STFD [C1 ] = f6, SIZE + STFD [C9 ] = f7, SIZE + FMA f16 = ALPHA_R, f68, f16 + } + { .mmf + (p5) LDFD f6 = [C4 ], SIZE + (p5) LDFD f7 = [C12], SIZE + FMA f17 = ALPHA_R, f70, f17 + } + ;; + { .mmf + STFD [C1 ] = f10, SIZE + STFD [C9 ] = f11, SIZE + FMA f18 = ALPHA_I, f68, f18 + } + { .mmf + (p5) LDFD f10 = [C4 ], SIZE + (p5) LDFD f11 = [C12], SIZE + FMA f19 = ALPHA_I, f70, f19 + } + ;; + { .mmf + STFD [C1 ] = f12, SIZE + STFD [C9 ] = f13, SIZE + FMA f20 = ALPHA_R, f69, f20 + } + { .mmf + (p5) LDFD f12 = [C4 ], - 11 * SIZE + (p5) LDFD f13 = [C12], - 11 * SIZE + FMA f21 = ALPHA_R, f71, f21 + } + ;; + { .mmf + STFD [C1 ] = f14, 5 * SIZE + STFD [C9 ] = f15, 5 * SIZE + FMA f22 = ALPHA_I, f69, f22 + } + { .mmf + (p5) LDFD f14 = [C5 ], SIZE + (p5) LDFD f15 = [C13], SIZE + FMA f23 = ALPHA_I, f71, f23 + } + ;; + { .mmf + STFD [C1 ] = f16, SIZE + STFD [C9 ] = f17, SIZE + FMA f24 = ALPHA_R, f72, f24 + } + { .mmf + (p5) LDFD f16 = [C5 ], SIZE + (p5) LDFD f17 = [C13], SIZE + FMA f25 = ALPHA_R, f74, f25 + } + ;; + { .mmf + STFD [C1 ] = f18, SIZE + STFD [C9 ] = f19, SIZE + FMA f26 = ALPHA_I, f72, f26 + } + { .mmf + (p5) LDFD f18 = [C5 ], SIZE + (p5) LDFD f19 = [C13], SIZE + FMA f27 = ALPHA_I, f74, f27 + } + ;; + { .mmf + STFD [C1 ] = f20, SIZE + STFD [C9 ] = f21, SIZE + FMA f28 = ALPHA_R, f73, f28 + } + { .mmf + (p5) LDFD f20 = [C5 ], 5 * SIZE + (p5) LDFD f21 = [C13], 5 * SIZE + FMA f29 = ALPHA_R, f75, f29 + } + ;; + { .mmf + STFD [C1 ] = f22, 5 * SIZE + STFD [C9 ] = f23, 5 * SIZE + FMA f30 = ALPHA_I, f73, f30 + } + { .mmf + (p5) LDFD f22 = [C5 ], SIZE + (p5) LDFD f23 = [C13], SIZE + FMA f31 = ALPHA_I, f75, f31 + } + ;; + { .mmf + STFD [C2 ] = f24, SIZE + STFD [C10] = f25, SIZE + FMA f32 = ALPHA_R, f76, f32 + } + { .mmf + (p5) LDFD f24 = [C5 ], SIZE + (p5) LDFD f25 = [C13], SIZE + FMA f33 = ALPHA_R, f78, f33 + } + ;; + { .mmf + STFD [C2 ] = f26, SIZE + STFD [C10] = f27, SIZE + FMA f34 = ALPHA_I, f76, f34 + } + { .mmf + (p5) LDFD f26 = [C5 ], SIZE + (p5) LDFD f27 = [C13], SIZE + FMA f35 = ALPHA_I, f78, f35 + } + ;; + { .mmf + STFD [C2 ] = f28, SIZE + STFD [C10] = f29, SIZE + FMA f36 = ALPHA_R, f77, f36 + } + { .mmf + (p5) LDFD f28 = [C5 ], - 11 * SIZE + (p5) LDFD f29 = [C13], - 11 * SIZE + FMA f37 = ALPHA_R, f79, f37 + } + ;; + { .mmf + STFD [C2 ] = f30, 5 * SIZE + STFD [C10] = f31, 5 * SIZE + FMA f38 = ALPHA_I, f77, f38 + } + { .mmf + (p5) LDFD f30 = [C6 ], SIZE + (p5) LDFD f31 = [C14], SIZE + FMA f39 = ALPHA_I, f79, f39 + } + ;; + { .mmf + STFD [C2 ] = f32, SIZE + STFD [C10] = f33, SIZE + FMA f48 = ALPHA_R, f80, f48 + } + { .mmf + (p5) LDFD f32 = [C6 ], SIZE + (p5) LDFD f33 = [C14], SIZE + FMA f49 = ALPHA_R, f82, f49 + } + ;; + { .mmf + STFD [C2 ] = f34, SIZE + STFD [C10] = f35, SIZE + FMA f50 = ALPHA_I, f80, f50 + } + { .mmf + (p5) LDFD f34 = [C6 ], SIZE + (p5) LDFD f35 = [C14], SIZE + FMA f51 = ALPHA_I, f82, f51 + } + ;; + { .mmf + STFD [C2 ] = f36, SIZE + STFD [C10] = f37, SIZE + FMA f52 = ALPHA_R, f81, f52 + } + { .mmf + (p5) LDFD f36 = [C6 ], 5 * SIZE + (p5) LDFD f37 = [C14], 5 * SIZE + FMA f53 = ALPHA_R, f83, f53 + } + ;; + { .mmf + STFD [C2 ] = f38, 5 * SIZE + STFD [C10] = f39, 5 * SIZE + FMA f54 = ALPHA_I, f81, f54 + } + { .mmf + (p5) LDFD f38 = [C6 ], SIZE + (p5) LDFD f39 = [C14], SIZE + FMA f55 = ALPHA_I, f83, f55 + } + ;; + { .mmf + STFD [C3 ] = f48, SIZE + STFD [C11] = f49, SIZE + FMA f40 = ALPHA_R, f84, f40 + } + { .mmf + (p5) LDFD f48 = [C6 ], SIZE + (p5) LDFD f49 = [C14], SIZE + FMA f41 = ALPHA_R, f86, f41 + } + ;; + { .mmf + STFD [C3 ] = f50, SIZE + STFD [C11] = f51, SIZE + FMA f42 = ALPHA_I, f84, f42 + } + { .mmf + (p5) LDFD f50 = [C6 ], SIZE + (p5) LDFD f51 = [C14], SIZE + FMA f43 = ALPHA_I, f86, f43 + } + ;; + { .mmf + STFD [C3 ] = f52, SIZE + STFD [C11] = f53, SIZE + FMA f44 = ALPHA_R, f85, f44 + } + { .mmf + (p5) LDFD f52 = [C6 ], - 11 * SIZE + (p5) LDFD f53 = [C14], - 11 * SIZE + FMA f45 = ALPHA_R, f87, f45 + } + ;; + { .mmf + STFD [C3 ] = f54, 5 * SIZE + STFD [C11] = f55, 5 * SIZE + FMA f46 = ALPHA_I, f85, f46 + } + { .mmf + (p5) LDFD f54 = [C7 ], SIZE + (p5) LDFD f55 = [C15], SIZE + FMA f56 = ALPHA_I, f87, f56 + } + ;; + { .mmf + STFD [C3 ] = f40, SIZE + STFD [C11] = f41, SIZE + FMA f57 = ALPHA_R, f88, f57 + } + { .mmf + (p5) LDFD f40 = [C7 ], SIZE + (p5) LDFD f41 = [C15], SIZE + FMA f58 = ALPHA_R, f90, f58 + } + ;; + { .mmf + STFD [C3 ] = f42, SIZE + STFD [C11] = f43, SIZE + FMA f59 = ALPHA_I, f88, f59 + } + { .mmf + (p5) LDFD f42 = [C7 ], SIZE + (p5) LDFD f43 = [C15], SIZE + FMA f60 = ALPHA_I, f90, f60 + } + ;; + { .mmf + STFD [C3 ] = f44, SIZE + STFD [C11] = f45, SIZE + FMA f61 = ALPHA_R, f89, f61 + } + { .mmf + (p5) LDFD f44 = [C7 ], 5 * SIZE + (p5) LDFD f45 = [C15], 5 * SIZE + FMA f62 = ALPHA_R, f91, f62 + } + ;; + { .mmf + STFD [C3 ] = f46, 5 * SIZE + STFD [C11] = f56, 5 * SIZE + FMA f63 = ALPHA_I, f89, f63 + } + { .mmf + (p5) LDFD f46 = [C7 ], SIZE + (p5) LDFD f56 = [C15], SIZE + FMA f47 = ALPHA_I, f91, f47 + } + ;; + { .mmf + STFD [C4 ] = f57, SIZE + STFD [C12] = f58, SIZE + FMA f64 = ALPHA_R, f92, f64 + } + { .mmf + (p5) LDFD f57 = [C7 ], SIZE + (p5) LDFD f58 = [C15], SIZE + FMA f65 = ALPHA_R, f94, f65 + } + ;; + { .mmf + STFD [C4 ] = f59, SIZE + STFD [C12] = f60, SIZE + FMA f6 = ALPHA_I, f92, f6 + } + { .mmf + (p5) LDFD f59 = [C7 ], SIZE + (p5) LDFD f60 = [C15], SIZE + FMA f7 = ALPHA_I, f94, f7 + } + ;; + { .mmf + STFD [C4 ] = f61, SIZE + STFD [C12] = f62, SIZE + FMA f10 = ALPHA_R, f93, f10 + } + { .mmf + (p5) LDFD f61 = [C7 ], - 11 * SIZE + (p5) LDFD f62 = [C15], - 11 * SIZE + FMA f11 = ALPHA_R, f95, f11 + } + ;; + { .mmf + STFD [C4 ] = f63, 5 * SIZE + STFD [C12] = f47, 5 * SIZE + FMA f12 = ALPHA_I, f93, f12 + } + { .mmf + (p5) LDFD f63 = [C8 ], SIZE + (p5) LDFD f47 = [C16], SIZE + FMA f13 = ALPHA_I, f95, f13 + } + ;; + { .mmf + STFD [C4 ] = f64, SIZE + STFD [C12] = f65, SIZE + FMA f14 = ALPHA_R, f96, f14 + } + { .mmf + (p5) LDFD f64 = [C8 ], SIZE + (p5) LDFD f65 = [C16], SIZE + FMA f15 = ALPHA_R, f98, f15 + } + ;; + { .mmf + STFD [C4 ] = f6, SIZE + STFD [C12] = f7, SIZE + FMA f16 = ALPHA_I, f96, f16 + } + { .mmf + (p5) LDFD f6 = [C8 ], SIZE + (p5) LDFD f7 = [C16], SIZE + FMA f17 = ALPHA_I, f98, f17 + } + ;; + { .mmf + STFD [C4 ] = f10, SIZE + STFD [C12] = f11, SIZE + FMA f18 = ALPHA_R, f97, f18 + } + { .mmf + (p5) LDFD f10 = [C8 ], 5 * SIZE + (p5) LDFD f11 = [C16], 5 * SIZE + FMA f19 = ALPHA_R, f99, f19 + } + ;; + { .mmf + STFD [C4 ] = f12, 5 * SIZE + STFD [C12] = f13, 5 * SIZE + FMA f20 = ALPHA_I, f97, f20 + } + { .mmf + (p5) LDFD f12 = [C8 ], SIZE + (p5) LDFD f13 = [C16], SIZE + FMA f21 = ALPHA_I, f99, f21 + } + ;; + { .mmf + STFD [C5 ] = f14, SIZE + STFD [C13] = f15, SIZE + FMA f22 = ALPHA_R, f100, f22 + } + { .mmf + (p5) LDFD f14 = [C8 ], SIZE + (p5) LDFD f15 = [C16], SIZE + FMA f23 = ALPHA_R, f102, f23 + } + ;; + { .mmf + STFD [C5 ] = f16, SIZE + STFD [C13] = f17, SIZE + FMA f24 = ALPHA_I, f100, f24 + } + { .mmf + (p5) LDFD f16 = [C8 ], SIZE + (p5) LDFD f17 = [C16], SIZE + FMA f25 = ALPHA_I, f102, f25 + } + ;; + { .mmf + STFD [C5 ] = f18, SIZE + STFD [C13] = f19, SIZE + FMA f26 = ALPHA_R, f101, f26 + } + { .mmf + (p5) LDFD f18 = [C8 ], - 11 * SIZE + (p5) LDFD f19 = [C16], - 11 * SIZE + FMA f27 = ALPHA_R, f103, f27 + } + ;; + { .mmf + STFD [C5 ] = f20, 5 * SIZE + STFD [C13] = f21, 5 * SIZE + FMA f28 = ALPHA_I, f101, f28 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f29 = ALPHA_I, f103, f29 + } + ;; + { .mmf + STFD [C5 ] = f22, SIZE + STFD [C13] = f23, SIZE + FMA f30 = ALPHA_R, f104, f30 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f31 = ALPHA_R, f106, f31 + } + ;; + { .mmf + STFD [C5 ] = f24, SIZE + STFD [C13] = f25, SIZE + FMA f32 = ALPHA_I, f104, f32 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f33 = ALPHA_I, f106, f33 + } + ;; + { .mmf + STFD [C5 ] = f26, SIZE + STFD [C13] = f27, SIZE + FMA f34 = ALPHA_R, f105, f34 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f35 = ALPHA_R, f107, f35 + } + ;; + { .mmf + STFD [C5 ] = f28, 5 * SIZE + STFD [C13] = f29, 5 * SIZE + FMA f36 = ALPHA_I, f105, f36 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f37 = ALPHA_I, f107, f37 + } + ;; + { .mmf + STFD [C6 ] = f30, SIZE + STFD [C14] = f31, SIZE + FMA f38 = ALPHA_R, f108, f38 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f39 = ALPHA_R, f110, f39 + } + ;; + { .mmf + STFD [C6 ] = f32, SIZE + STFD [C14] = f33, SIZE + FMA f48 = ALPHA_I, f108, f48 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f49 = ALPHA_I, f110, f49 + } + ;; + { .mmf + STFD [C6 ] = f34, SIZE + STFD [C14] = f35, SIZE + FMA f50 = ALPHA_R, f109, f50 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f51 = ALPHA_R, f111, f51 + } + ;; + { .mmf + STFD [C6 ] = f36, 5 * SIZE + STFD [C14] = f37, 5 * SIZE + FMA f52 = ALPHA_I, f109, f52 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f53 = ALPHA_I, f111, f53 + } + ;; + { .mmf + STFD [C6 ] = f38, SIZE + STFD [C14] = f39, SIZE + FMA f54 = ALPHA_R, f112, f54 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f55 = ALPHA_R, f114, f55 + } + ;; + { .mmf + STFD [C6 ] = f48, SIZE + STFD [C14] = f49, SIZE + FMA f40 = ALPHA_I, f112, f40 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f41 = ALPHA_I, f114, f41 + } + ;; + { .mmf + STFD [C6 ] = f50, SIZE + STFD [C14] = f51, SIZE + FMA f42 = ALPHA_R, f113, f42 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f43 = ALPHA_R, f115, f43 + } + ;; + { .mmf + STFD [C6 ] = f52, 5 * SIZE + STFD [C14] = f53, 5 * SIZE + FMA f44 = ALPHA_I, f113, f44 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f45 = ALPHA_I, f115, f45 + } + ;; + { .mmf + STFD [C7 ] = f54, SIZE + STFD [C15] = f55, SIZE + FMA f46 = ALPHA_R, f116, f46 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f56 = ALPHA_R, f118, f56 + } + ;; + { .mmf + STFD [C7 ] = f40, SIZE + STFD [C15] = f41, SIZE + FMA f57 = ALPHA_I, f116, f57 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f58 = ALPHA_I, f118, f58 + } + ;; + { .mmf + STFD [C7 ] = f42, SIZE + STFD [C15] = f43, SIZE + FMA f59 = ALPHA_R, f117, f59 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f60 = ALPHA_R, f119, f60 + } + ;; + { .mmf + STFD [C7 ] = f44, 5 * SIZE + STFD [C15] = f45, 5 * SIZE + FMA f61 = ALPHA_I, f117, f61 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f62 = ALPHA_I, f119, f62 + } + ;; + { .mmf + STFD [C7 ] = f46, SIZE + STFD [C15] = f56, SIZE + FMA f63 = ALPHA_R, f120, f63 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f47 = ALPHA_R, f122, f47 + } + ;; + { .mmf + STFD [C7 ] = f57, SIZE + STFD [C15] = f58, SIZE + FMA f64 = ALPHA_I, f120, f64 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f65 = ALPHA_I, f122, f65 + } + ;; + { .mmf + STFD [C7 ] = f59, SIZE + STFD [C15] = f60, SIZE + FMA f6 = ALPHA_R, f121, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f7 = ALPHA_R, f123, f7 + } + ;; + { .mmf + STFD [C7 ] = f61, 5 * SIZE + STFD [C15] = f62, 5 * SIZE + FMA f10 = ALPHA_I, f121, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f11 = ALPHA_I, f123, f11 + } + ;; + { .mmf + STFD [C8 ] = f63, SIZE + STFD [C16] = f47, SIZE + FMA f12 = ALPHA_R, f124, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_R, f126, f13 + } + ;; + { .mmf + STFD [C8 ] = f64, SIZE + STFD [C16] = f65, SIZE + FMA f14 = ALPHA_I, f124, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f15 = ALPHA_I, f126, f15 + } + ;; + { .mmf + STFD [C8 ] = f6, SIZE + STFD [C16] = f7, SIZE + FMA f16 = ALPHA_R, f125, f16 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f17 = ALPHA_R, f127, f17 + } + ;; + { .mmf + STFD [C8 ] = f10, 5 * SIZE + STFD [C16] = f11, 5 * SIZE + FMA f18 = ALPHA_I, f125, f18 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f19 = ALPHA_I, f127, f19 + } + ;; + { .mmf + STFD [C8 ] = f12, SIZE + STFD [C16] = f13, SIZE + mov f64 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f72 = f0 + } + ;; + { .mmf + STFD [C8 ] = f14, SIZE + STFD [C16] = f15, SIZE + mov f80 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f88 = f0 + } + ;; + { .mmf + STFD [C8 ] = f16, SIZE + STFD [C16] = f17, SIZE + mov f96 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f104 = f0 + } + ;; + { .mmf + STFD [C8 ] = f18, 5 * SIZE + STFD [C16] = f19, 5 * SIZE + mov f112 = f0 + } + { .mfb + adds I = -1, I + mov f120 = f0 + (p6) br.cond.dptk .L011 + } + ;; + +.L020: + { .mfi + cmp.eq p3, p0 = r0, r0 + mov f89 = f0 + tbit.z p6, p7 = M, 2 + } + { .mfb + nop __LINE__ + mov f81 = f0 + (p6) br.cond.dptk .L030 + } + ;; + { .mfi + LDFPD f48, f49 = [B] + mov f65 = f0 + nop __LINE__ + } + { .mfi + adds BOFFSET = 2 * SIZE, B + mov f73 = f0 + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + } + ;; + { .mmf + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + setf.d f97 = r0 + mov f105 = f0 + } + { .mfi + setf.d f113 = r0 + mov f121 = f0 + adds L = 1, K + } + ;; + { .mmf + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + setf.d f66 = r0 + mov f74 = f0 + } + { .mfi + setf.d f82 = r0 + mov f90 = f0 + tbit.z p12, p0 = L, 0 + } + ;; + { .mmf + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + setf.d f98 = r0 + mov f106 = f0 + } + { .mfi + setf.d f114 = r0 + mov f122 = f0 + shr L = L, 1 + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f75 = f0 + adds L = -1, L + } + { .mmf + setf.d f67 = r0 + setf.d f83 = r0 + mov f91 = f0 + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f107 = f0 + mov ar.lc = L + } + { .mmf + setf.d f99 = r0 + setf.d f115 = r0 + mov f123 = f0 + } + ;; + .align 32 + +.L022: + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + (p5) adds C9 = 4 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + (p5) adds C10 = 4 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + (p5) adds C11 = 4 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + (p5) adds C12 = 4 * SIZE, C4 + } + ;; + { .mfi + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + (p5) adds C13 = 4 * SIZE, C5 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + (p5) adds C14 = 4 * SIZE, C6 + } + ;; + { .mfi + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + (p5) adds C15 = 4 * SIZE, C7 + } + { .mfi + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + (p5) adds C16 = 4 * SIZE, C8 + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f106 = f34, f53, f106 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f122 = f34, f55, f122 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f107 = f35, f53, f107 // A4 * B6 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f123 = f35, f55, f123 // A4 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f6 = [C1 ], SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + (p5) LDFD f7 = [C9 ], SIZE + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f10 = [C1 ], SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + (p5) LDFD f11 = [C9 ], SIZE + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f12 = [C1 ], SIZE + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + (p5) LDFD f13 = [C9 ], SIZE + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f14 = [C1 ], - 3 * SIZE + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + (p5) LDFD f15 = [C9 ], - 3 * SIZE + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f16 = [C2 ], SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f17 = [C10], SIZE + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f18 = [C2 ], SIZE + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + (p5) LDFD f19 = [C10], SIZE + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f20 = [C2 ], SIZE + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + (p5) LDFD f21 = [C10], SIZE + (p3) FMA f106 = f42, f61, f106 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f22 = [C2 ], - 3 * SIZE + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + (p5) LDFD f23 = [C10], - 3 * SIZE + (p3) FMA f122 = f42, f63, f122 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f24 = [C3 ], SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f25 = [C11], SIZE + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f26 = [C3 ], SIZE + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + (p5) LDFD f27 = [C11], SIZE + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f28 = [C3 ], SIZE + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + (p5) LDFD f29 = [C11], SIZE + (p3) FMA f107 = f43, f61, f107 // A4 * B6 + nop __LINE__ + } + ;; + { .mfi + (p5) LDFD f30 = [C3 ], - 3 * SIZE + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + adds L = -1, L + } + { .mfb + (p5) LDFD f31 = [C11], - 3 * SIZE + (p3) FMA f123 = f43, f63, f123 // A4 * B8 + br.cloop.sptk.few .L022 + } + ;; + +.L028: + { .mmf + LDFD f68 = [C4 ], SIZE + LDFD f69 = [C12], SIZE + FMA f6 = ALPHA_R, f64, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f7 = ALPHA_R, f66, f7 + } + ;; + { .mmf + LDFD f70 = [C4 ], SIZE + LDFD f71 = [C12], SIZE + FMA f10 = ALPHA_I, f64, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f11 = ALPHA_I, f66, f11 + } + ;; + { .mmf + LDFD f76 = [C4 ], SIZE + LDFD f77 = [C12], SIZE + FMA f12 = ALPHA_R, f65, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_R, f67, f13 + } + ;; + { .mmf + LDFD f78 = [C4 ], -3 * SIZE + LDFD f79 = [C12], -3 * SIZE + FMA f14 = ALPHA_I, f65, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f15 = ALPHA_I, f67, f15 + } + ;; + { .mmf + STFD [C1 ] = f6, SIZE + STFD [C9 ] = f7, SIZE + FMA f16 = ALPHA_R, f72, f16 + } + { .mmf + LDFD f84 = [C5 ], SIZE + LDFD f85 = [C13], SIZE + FMA f17 = ALPHA_R, f74, f17 + } + ;; + { .mmf + STFD [C1 ] = f10, SIZE + STFD [C9 ] = f11, SIZE + FMA f18 = ALPHA_I, f72, f18 + } + { .mmf + LDFD f86 = [C5 ], SIZE + LDFD f87 = [C13], SIZE + FMA f19 = ALPHA_I, f74, f19 + } + ;; + { .mmf + STFD [C1 ] = f12, SIZE + STFD [C9 ] = f13, SIZE + FMA f20 = ALPHA_R, f73, f20 + } + { .mmf + LDFD f92 = [C5 ], SIZE + LDFD f93 = [C13], SIZE + FMA f21 = ALPHA_R, f75, f21 + } + ;; + { .mmf + STFD [C1 ] = f14, 5 * SIZE + STFD [C9 ] = f15, 5 * SIZE + FMA f22 = ALPHA_I, f73, f22 + } + { .mmf + LDFD f94 = [C5 ], -3 * SIZE + LDFD f95 = [C13], -3 * SIZE + FMA f23 = ALPHA_I, f75, f23 + } + ;; + { .mmf + STFD [C2 ] = f16, SIZE + STFD [C10] = f17, SIZE + FMA f24 = ALPHA_R, f80, f24 + } + { .mmf + LDFD f100 = [C6 ], SIZE + LDFD f101 = [C14], SIZE + FMA f25 = ALPHA_R, f82, f25 + } + ;; + { .mmf + STFD [C2 ] = f18, SIZE + STFD [C10] = f19, SIZE + FMA f26 = ALPHA_I, f80, f26 + } + { .mmf + LDFD f102 = [C6 ], SIZE + LDFD f103 = [C14], SIZE + FMA f27 = ALPHA_I, f82, f27 + } + ;; + { .mmf + STFD [C2 ] = f20, SIZE + STFD [C10] = f21, SIZE + FMA f28 = ALPHA_R, f81, f28 + } + { .mmf + LDFD f108 = [C6 ], SIZE + LDFD f109 = [C14], SIZE + FMA f29 = ALPHA_R, f83, f29 + } + ;; + { .mmf + STFD [C2 ] = f22, 5 * SIZE + STFD [C10] = f23, 5 * SIZE + FMA f30 = ALPHA_I, f81, f30 + } + { .mmf + LDFD f110 = [C6 ], -3 * SIZE + LDFD f111 = [C14], -3 * SIZE + FMA f31 = ALPHA_I, f83, f31 + } + ;; + { .mmf + STFD [C3 ] = f24, SIZE + STFD [C11] = f25, SIZE + FMA f68 = ALPHA_R, f88, f68 + } + { .mmf + LDFD f116 = [C7 ], SIZE + LDFD f117 = [C15], SIZE + FMA f69 = ALPHA_R, f90, f69 + } + ;; + { .mmf + STFD [C3 ] = f26, SIZE + STFD [C11] = f27, SIZE + FMA f70 = ALPHA_I, f88, f70 + } + { .mmf + LDFD f118 = [C7 ], SIZE + LDFD f119 = [C15], SIZE + FMA f71 = ALPHA_I, f90, f71 + } + ;; + { .mmf + STFD [C3 ] = f28, SIZE + STFD [C11] = f29, SIZE + FMA f76 = ALPHA_R, f89, f76 + } + { .mmf + LDFD f124 = [C7 ], SIZE + LDFD f125 = [C15], SIZE + FMA f77 = ALPHA_R, f91, f77 + } + ;; + { .mmf + STFD [C3 ] = f30, 5 * SIZE + STFD [C11] = f31, 5 * SIZE + FMA f78 = ALPHA_I, f89, f78 + } + { .mmf + LDFD f126 = [C7 ], -3 * SIZE + LDFD f127 = [C15], -3 * SIZE + FMA f79 = ALPHA_I, f91, f79 + } + ;; + { .mmf + STFD [C4 ] = f68, SIZE + STFD [C12] = f69, SIZE + FMA f84 = ALPHA_R, f96, f84 + } + { .mmf + LDFD f32 = [C8 ], SIZE + LDFD f33 = [C16], SIZE + FMA f85 = ALPHA_R, f98, f85 + } + ;; + { .mmf + STFD [C4 ] = f70, SIZE + STFD [C12] = f71, SIZE + FMA f86 = ALPHA_I, f96, f86 + } + { .mmf + LDFD f34 = [C8 ], SIZE + LDFD f35 = [C16], SIZE + FMA f87 = ALPHA_I, f98, f87 + } + ;; + { .mmf + STFD [C4 ] = f76, SIZE + STFD [C12] = f77, SIZE + FMA f92 = ALPHA_R, f97, f92 + } + { .mmf + LDFD f36 = [C8 ], SIZE + LDFD f37 = [C16], SIZE + FMA f93 = ALPHA_R, f99, f93 + } + ;; + { .mmf + STFD [C4 ] = f78, 5 * SIZE + STFD [C12] = f79, 5 * SIZE + FMA f94 = ALPHA_I, f97, f94 + } + { .mmf + LDFD f38 = [C8 ], -3 * SIZE + LDFD f39 = [C16], -3 * SIZE + FMA f95 = ALPHA_I, f99, f95 + } + ;; + { .mmf + STFD [C5 ] = f84, SIZE + STFD [C13] = f85, SIZE + FMA f100 = ALPHA_R, f104, f100 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f101 = ALPHA_R, f106, f101 + } + ;; + { .mmf + STFD [C5 ] = f86, SIZE + STFD [C13] = f87, SIZE + FMA f102 = ALPHA_I, f104, f102 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f103 = ALPHA_I, f106, f103 + } + ;; + { .mmf + STFD [C5 ] = f92, SIZE + STFD [C13] = f93, SIZE + FMA f108 = ALPHA_R, f105, f108 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f109 = ALPHA_R, f107, f109 + } + ;; + { .mmf + STFD [C5 ] = f94, 5 * SIZE + STFD [C13] = f95, 5 * SIZE + FMA f110 = ALPHA_I, f105, f110 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f111 = ALPHA_I, f107, f111 + } + ;; + { .mmf + STFD [C6 ] = f100, SIZE + STFD [C14] = f101, SIZE + FMA f116 = ALPHA_R, f112, f116 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f117 = ALPHA_R, f114, f117 + } + ;; + { .mmf + STFD [C6 ] = f102, SIZE + STFD [C14] = f103, SIZE + FMA f118 = ALPHA_I, f112, f118 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f119 = ALPHA_I, f114, f119 + } + ;; + { .mmf + STFD [C6 ] = f108, SIZE + STFD [C14] = f109, SIZE + FMA f124 = ALPHA_R, f113, f124 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f125 = ALPHA_R, f115, f125 + } + ;; + { .mmf + STFD [C6 ] = f110, 5 * SIZE + STFD [C14] = f111, 5 * SIZE + FMA f126 = ALPHA_I, f113, f126 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f127 = ALPHA_I, f115, f127 + } + ;; + { .mmf + STFD [C7 ] = f116, SIZE + STFD [C15] = f117, SIZE + FMA f32 = ALPHA_R, f120, f32 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f33 = ALPHA_R, f122, f33 + } + ;; + { .mmf + STFD [C7 ] = f118, SIZE + STFD [C15] = f119, SIZE + FMA f34 = ALPHA_I, f120, f34 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f35 = ALPHA_I, f122, f35 + } + ;; + { .mmf + STFD [C7 ] = f124, SIZE + STFD [C15] = f125, SIZE + FMA f36 = ALPHA_R, f121, f36 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f37 = ALPHA_R, f123, f37 + } + ;; + { .mmf + STFD [C7 ] = f126, 5 * SIZE + STFD [C15] = f127, 5 * SIZE + FMA f38 = ALPHA_I, f121, f38 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f39 = ALPHA_I, f123, f39 + } + ;; + { .mmf + STFD [C8 ] = f32, SIZE + STFD [C16] = f33, SIZE + mov f64 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f72 = f0 + } + ;; + { .mmf + STFD [C8 ] = f34, SIZE + STFD [C16] = f35, SIZE + mov f80 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f88 = f0 + } + ;; + { .mmf + STFD [C8 ] = f36, SIZE + STFD [C16] = f37, SIZE + mov f96 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f104 = f0 + } + ;; + { .mmf + STFD [C8 ] = f38, 5 * SIZE + STFD [C16] = f39, 5 * SIZE + mov f112 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f120 = f0 + } + ;; + .align 32 + +.L030: + { .mib + nop __LINE__ + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L040 + } + ;; + { .mfi + LDFPD f48, f49 = [B] + mov f65 = f0 + nop __LINE__ + } + { .mfi + adds BOFFSET = 2 * SIZE, B + mov f73 = f0 + adds L = 1, K + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f81 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f89 = f0 + shr L = L, 1 + } + ;; + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f97 = f0 + adds L = -1, L + } + { .mfi + nop __LINE__ + mov f105 = f0 + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + } + ;; + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + mov f113 = f0 + mov ar.lc = L + } + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f121 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + .align 32 + +.L032: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f6 = [C1], SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + (p5) LDFD f12 = [C2], SIZE + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + { .mfb + (p5) LDFD f7 = [C1], SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + (p5) LDFD f13 = [C2], SIZE + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f10 = [C1], SIZE + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + (p5) LDFD f14 = [C2], SIZE + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfi + (p5) LDFD f11 = [C1], -3 * SIZE + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + adds L = -1, L + } + { .mfb + (p5) LDFD f15 = [C2], -3 * SIZE + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + br.cloop.sptk.few .L032 + } + ;; + +.L038: + { .mmf + LDFD f16 = [C3], SIZE + LDFD f20 = [C4], SIZE + FMA f6 = ALPHA_R, f64, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f12 = ALPHA_R, f72, f12 + } + ;; + { .mmf + LDFD f17 = [C3], SIZE + LDFD f21 = [C4], SIZE + FMA f7 = ALPHA_I, f64, f7 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_I, f72, f13 + } + ;; + { .mmf + LDFD f18 = [C3], SIZE + LDFD f22 = [C4], SIZE + FMA f10 = ALPHA_R, f65, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f14 = ALPHA_R, f73, f14 + } + ;; + { .mmf + LDFD f19 = [C3], - 3 * SIZE + LDFD f23 = [C4], - 3 * SIZE + FMA f11 = ALPHA_I, f65, f11 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f15 = ALPHA_I, f73, f15 + } + ;; + { .mmf + STFD [C1] = f6, SIZE + STFD [C2] = f12, SIZE + FMA f16 = ALPHA_R, f80, f16 + } + { .mmf + LDFD f24 = [C5], SIZE + LDFD f28 = [C6], SIZE + FMA f20 = ALPHA_R, f88, f20 + } + ;; + { .mmf + STFD [C1] = f7, SIZE + STFD [C2] = f13, SIZE + FMA f17 = ALPHA_I, f80, f17 + } + { .mmf + LDFD f25 = [C5], SIZE + LDFD f29 = [C6], SIZE + FMA f21 = ALPHA_I, f88, f21 + } + ;; + { .mmf + STFD [C1] = f10, SIZE + STFD [C2] = f14, SIZE + FMA f18 = ALPHA_R, f81, f18 + } + { .mmf + LDFD f26 = [C5], SIZE + LDFD f30 = [C6], SIZE + FMA f22 = ALPHA_R, f89, f22 + } + ;; + { .mmf + STFD [C1] = f11, SIZE + STFD [C2] = f15, SIZE + FMA f19 = ALPHA_I, f81, f19 + } + { .mmf + LDFD f27 = [C5], - 3 * SIZE + LDFD f31 = [C6], - 3 * SIZE + FMA f23 = ALPHA_I, f89, f23 + } + ;; + { .mmf + STFD [C3] = f16, SIZE + STFD [C4] = f20, SIZE + FMA f24 = ALPHA_R, f96, f24 + } + { .mmf + LDFD f32 = [C7], SIZE + LDFD f36 = [C8], SIZE + FMA f28 = ALPHA_R, f104, f28 + } + ;; + { .mmf + STFD [C3] = f17, SIZE + STFD [C4] = f21, SIZE + FMA f25 = ALPHA_I, f96, f25 + } + { .mmf + LDFD f33 = [C7], SIZE + LDFD f37 = [C8], SIZE + FMA f29 = ALPHA_I, f104, f29 + } + ;; + { .mmf + STFD [C3] = f18, SIZE + STFD [C4] = f22, SIZE + FMA f26 = ALPHA_R, f97, f26 + } + { .mmf + LDFD f34 = [C7], SIZE + LDFD f38 = [C8], SIZE + FMA f30 = ALPHA_R, f105, f30 + } + ;; + { .mmf + STFD [C3] = f19, SIZE + STFD [C4] = f23, SIZE + FMA f27 = ALPHA_I, f97, f27 + } + { .mmf + LDFD f35 = [C7], - 3 * SIZE + LDFD f39 = [C8], - 3 * SIZE + FMA f31 = ALPHA_I, f105, f31 + } + ;; + { .mmf + STFD [C5] = f24, SIZE + STFD [C6] = f28, SIZE + FMA f32 = ALPHA_R, f112, f32 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f36 = ALPHA_R, f120, f36 + } + ;; + { .mmf + STFD [C5] = f25, SIZE + STFD [C6] = f29, SIZE + FMA f33 = ALPHA_I, f112, f33 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f37 = ALPHA_I, f120, f37 + } + ;; + { .mmf + STFD [C5] = f26, SIZE + STFD [C6] = f30, SIZE + FMA f34 = ALPHA_R, f113, f34 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f38 = ALPHA_R, f121, f38 + } + ;; + { .mmf + STFD [C5] = f27, SIZE + STFD [C6] = f31, SIZE + FMA f35 = ALPHA_I, f113, f35 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f39 = ALPHA_I, f121, f39 + } + ;; + { .mmf + STFD [C7] = f32, SIZE + STFD [C8] = f36, SIZE + mov f64 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f72 = f0 + } + ;; + { .mmf + STFD [C7] = f33, SIZE + STFD [C8] = f37, SIZE + mov f80 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f88 = f0 + } + ;; + { .mmf + STFD [C7] = f34, SIZE + STFD [C8] = f38, SIZE + mov f96 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f104 = f0 + } + ;; + { .mmf + STFD [C7] = f35, SIZE + STFD [C8] = f39, SIZE + mov f112 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f120 = f0 + } + ;; + .align 32 + +.L040: + { .mib + nop __LINE__ + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L049 + } + ;; + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B + adds L = 1, K + } + ;; + { .mii + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + LDFD f32 = [AOFFSET], 1 * SIZE + adds L = -1, L + } + ;; + { .mmi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + { .mmi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + nop __LINE__ + } + ;; + .align 32 + +.L042: + { .mfb + lfetch.nt1 [PREB], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mmf + (p5) LDFD f6 = [C1], SIZE + (p5) LDFD f10 = [C2], SIZE + FMA f104 = f32, f53, f104 // A1 * B6 + } + ;; + { .mfi + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mmf + (p5) LDFD f7 = [C1], -SIZE + (p5) LDFD f11 = [C2], -SIZE + FMA f120 = f32, f55, f120 // A1 * B8 + } + ;; + { .mmf + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + } + { .mmf + (p5) LDFD f12 = [C3], SIZE + (p5) LDFD f14 = [C4], SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mmf + (p5) LDFD f13 = [C3], -SIZE + (p5) LDFD f15 = [C4], -SIZE + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + } + ;; + { .mfi + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mmf + (p5) LDFD f16 = [C5], SIZE + (p5) LDFD f18 = [C6], SIZE + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + } + ;; + { .mfi + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + adds L = -1, L + } + { .mmb + (p5) LDFD f17 = [C5], -SIZE + (p5) LDFD f19 = [C6], -SIZE + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + { .mmb + (p5) LDFD f20 = [C7], SIZE + (p5) LDFD f22 = [C8], SIZE + br.cloop.sptk.few .L042 + } + ;; + { .mmf + LDFD f21 = [C7], -SIZE + LDFD f23 = [C8], -SIZE + FMA f6 = ALPHA_R, f64, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f10 = ALPHA_R, f72, f10 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f7 = ALPHA_I, f64, f7 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f11 = ALPHA_I, f72, f11 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f12 = ALPHA_R, f80, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f14 = ALPHA_R, f88, f14 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_I, f80, f13 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f15 = ALPHA_I, f88, f15 + } + ;; + { .mmf + STFD [C1 ] = f6, SIZE + STFD [C2 ] = f10, SIZE + FMA f16 = ALPHA_R, f96, f16 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f18 = ALPHA_R, f104, f18 + } + ;; + { .mmf + STFD [C1 ] = f7, SIZE + STFD [C2 ] = f11, SIZE + FMA f17 = ALPHA_I, f96, f17 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f19 = ALPHA_I, f104, f19 + } + ;; + { .mmf + STFD [C3 ] = f12, SIZE + STFD [C4 ] = f14, SIZE + FMA f20 = ALPHA_R, f112, f20 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f22 = ALPHA_R, f120, f22 + } + ;; + { .mmf + STFD [C3 ] = f13, SIZE + STFD [C4 ] = f15, SIZE + FMA f21 = ALPHA_I, f112, f21 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f23 = ALPHA_I, f120, f23 + } + ;; + { .mmi + STFD [C5 ] = f16, SIZE + STFD [C6 ] = f18, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C5 ] = f17, SIZE + STFD [C6 ] = f19, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C7 ] = f20, SIZE + STFD [C8 ] = f22, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C7 ] = f21, SIZE + STFD [C8 ] = f23, SIZE + nop __LINE__ + } + ;; + .align 32 + +.L049: + { .mmi + mov B = BOFFSET + mov AOFFSET = A + nop __LINE__ + } + ;; + { .mmb + nop __LINE__ + cmp.lt p6, p0 = 0, J + (p6) br.cond.dptk .L010 + } + ;; + .align 32 + +.L050: + { .mfi + mov C1 = C + mov f64 = f0 + tbit.z p6, p0 = N, 2 + } + { .mfi + add C2 = LDC, C + mov f72 = f0 + shr I = M, 3 + } + ;; + { .mfi + shladd C3 = LDC, 1, C + mov f80 = f0 + nop __LINE__ + } + { .mfb + mov AOFFSET = A + mov f88 = f0 + (p6) br.cond.dpnt .L090 + } + ;; + { .mfi + cmp.eq p6, p7 = 0, I + mov f65 = f0 + nop __LINE__ + } + { .mfi + shladd C4 = LDC, 1, C2 + mov f73 = f0 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + mov f81 = f0 + nop __LINE__ + } + { .mfb + shladd C = LDC, 2, C + mov f89 = f0 + (p6) br.cond.dpnt .L060 + } + ;; + .align 32 + +.L052: + { .mfb + LDFPD f48, f49 = [B] + mov f66 = f0 + nop __LINE__ + } + { .mfb + adds BOFFSET = 2 * SIZE, B + mov f74 = f0 + nop __LINE__ + } + ;; + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 + nop __LINE__ + } + { .mfi + setf.d f84 = r0 + mov f90 = f0 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f67 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f75 = f0 + adds L = 1, K + } + ;; + { .mfi + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + mov f83 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + setf.d f91 = r0 + mov f68 = f0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f76 = f0 + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + { .mfi + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + mov f92 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f69 = f0 + shr L = L, 1 + } + { .mmf + setf.d f77 = r0 + setf.d f85 = r0 + mov f93 = f0 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f70 = f0 + adds L = -1, L + } + { .mmf + setf.d f78 = r0 + setf.d f86 = r0 + mov f94 = f0 + } + ;; + { .mfi + CPREFETCH [PREC] + mov f71 = f0 + mov ar.lc = L + } + { .mmf + setf.d f79 = r0 + setf.d f87 = r0 + mov f95 = f0 + } + ;; + .align 32 + +.L053: + { .mfb + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + adds C9 = 4 * SIZE, C1 + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C10 = 4 * SIZE, C2 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + adds C12 = 4 * SIZE, C4 + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f92 = f36, f51, f92 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f93 = f37, f51, f93 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f94 = f38, f51, f94 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f95 = f39, f51, f95 // A8 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f6 = [C1 ], SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f7 = [C9 ], SIZE + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f10 = [C1 ], SIZE + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + (p5) LDFD f11 = [C9 ], SIZE + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f12 = [C1 ], SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f13 = [C9 ], SIZE + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f14 = [C1 ], 5 * SIZE + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + (p5) LDFD f15 = [C9 ], 5 * SIZE + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f16 = [C1 ], SIZE + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f17 = [C9], SIZE + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f18 = [C1 ], SIZE + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + (p5) LDFD f19 = [C9], SIZE + (p3) FMA f92 = f44, f59, f92 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f20 = [C1 ], SIZE + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f21 = [C9], SIZE + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f22 = [C1 ], -11 * SIZE + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + (p5) LDFD f23 = [C9 ], -11 * SIZE + (p3) FMA f93 = f45, f59, f93 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f24 = [C2 ], SIZE + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f25 = [C10], SIZE + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f26 = [C2 ], SIZE + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + (p5) LDFD f27 = [C10], SIZE + (p3) FMA f94 = f46, f59, f94 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f28 = [C2 ], SIZE + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f29 = [C10], SIZE + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfi + (p5) LDFD f30 = [C2 ], 5 * SIZE + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + adds L = -1, L + } + { .mfb + (p5) LDFD f31 = [C10], 5 * SIZE + (p3) FMA f95 = f47, f59, f95 // A8 * B4 + br.cloop.sptk.few .L053 + } + ;; + .align 32 + +.L058: + { .mmf + LDFD f32 = [C2 ], SIZE + LDFD f33 = [C10], SIZE + FMA f6 = ALPHA_R, f64, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f7 = ALPHA_R, f66, f7 + } + ;; + { .mmf + LDFD f34 = [C2 ], SIZE + LDFD f35 = [C10], SIZE + FMA f10 = ALPHA_I, f64, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f11 = ALPHA_I, f66, f11 + } + ;; + { .mmf + LDFD f36 = [C2 ], SIZE + LDFD f37 = [C10], SIZE + FMA f12 = ALPHA_R, f65, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_R, f67, f13 + } + ;; + { .mmf + LDFD f38 = [C2 ], - 11 * SIZE + LDFD f39 = [C10], - 11 * SIZE + FMA f14 = ALPHA_I, f65, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f15 = ALPHA_I, f67, f15 + } + ;; + { .mmf + STFD [C1 ] = f6, SIZE + STFD [C9 ] = f7, SIZE + FMA f16 = ALPHA_R, f68, f16 + } + { .mmf + LDFD f48 = [C3 ], SIZE + LDFD f49 = [C11], SIZE + FMA f17 = ALPHA_R, f70, f17 + } + ;; + { .mmf + STFD [C1 ] = f10, SIZE + STFD [C9 ] = f11, SIZE + FMA f18 = ALPHA_I, f68, f18 + } + { .mmf + LDFD f50 = [C3 ], SIZE + LDFD f51 = [C11], SIZE + FMA f19 = ALPHA_I, f70, f19 + } + ;; + { .mmf + STFD [C1 ] = f12, SIZE + STFD [C9 ] = f13, SIZE + FMA f20 = ALPHA_R, f69, f20 + } + { .mmf + LDFD f52 = [C3 ], SIZE + LDFD f53 = [C11], SIZE + FMA f21 = ALPHA_R, f71, f21 + } + ;; + { .mmf + STFD [C1 ] = f14, 5 * SIZE + STFD [C9 ] = f15, 5 * SIZE + FMA f22 = ALPHA_I, f69, f22 + } + { .mmf + LDFD f54 = [C3 ], 5 * SIZE + LDFD f55 = [C11], 5 * SIZE + FMA f23 = ALPHA_I, f71, f23 + } + ;; + { .mmf + STFD [C1 ] = f16, SIZE + STFD [C9 ] = f17, SIZE + FMA f24 = ALPHA_R, f72, f24 + } + { .mmf + LDFD f40 = [C3 ], SIZE + LDFD f41 = [C11], SIZE + FMA f25 = ALPHA_R, f74, f25 + } + ;; + { .mmf + STFD [C1 ] = f18, SIZE + STFD [C9 ] = f19, SIZE + FMA f26 = ALPHA_I, f72, f26 + } + { .mmf + LDFD f42 = [C3 ], SIZE + LDFD f43 = [C11], SIZE + FMA f27 = ALPHA_I, f74, f27 + } + ;; + { .mmf + STFD [C1 ] = f20, SIZE + STFD [C9 ] = f21, SIZE + FMA f28 = ALPHA_R, f73, f28 + } + { .mmf + LDFD f44 = [C3 ], SIZE + LDFD f45 = [C11], SIZE + FMA f29 = ALPHA_R, f75, f29 + } + ;; + { .mmf + STFD [C1 ] = f22, 5 * SIZE + STFD [C9 ] = f23, 5 * SIZE + FMA f30 = ALPHA_I, f73, f30 + } + { .mmf + LDFD f46 = [C3 ], - 11 * SIZE + LDFD f56 = [C11], - 11 * SIZE + FMA f31 = ALPHA_I, f75, f31 + } + ;; + { .mmf + STFD [C2 ] = f24, SIZE + STFD [C10] = f25, SIZE + FMA f32 = ALPHA_R, f76, f32 + } + { .mmf + LDFD f57 = [C4 ], SIZE + LDFD f58 = [C12], SIZE + FMA f33 = ALPHA_R, f78, f33 + } + ;; + { .mmf + STFD [C2 ] = f26, SIZE + STFD [C10] = f27, SIZE + FMA f34 = ALPHA_I, f76, f34 + } + { .mmf + LDFD f59 = [C4 ], SIZE + LDFD f60 = [C12], SIZE + FMA f35 = ALPHA_I, f78, f35 + } + ;; + { .mmf + STFD [C2 ] = f28, SIZE + STFD [C10] = f29, SIZE + FMA f36 = ALPHA_R, f77, f36 + } + { .mmf + LDFD f61 = [C4 ], SIZE + LDFD f62 = [C12], SIZE + FMA f37 = ALPHA_R, f79, f37 + } + ;; + { .mmf + STFD [C2 ] = f30, 5 * SIZE + STFD [C10] = f31, 5 * SIZE + FMA f38 = ALPHA_I, f77, f38 + } + { .mmf + LDFD f63 = [C4 ], 5 * SIZE + LDFD f47 = [C12], 5 * SIZE + FMA f39 = ALPHA_I, f79, f39 + } + ;; + { .mmf + STFD [C2 ] = f32, SIZE + STFD [C10] = f33, SIZE + FMA f48 = ALPHA_R, f80, f48 + } + { .mmf + LDFD f64 = [C4 ], SIZE + LDFD f65 = [C12], SIZE + FMA f49 = ALPHA_R, f82, f49 + } + ;; + { .mmf + STFD [C2 ] = f34, SIZE + STFD [C10] = f35, SIZE + FMA f50 = ALPHA_I, f80, f50 + } + { .mmf + LDFD f6 = [C4 ], SIZE + LDFD f7 = [C12], SIZE + FMA f51 = ALPHA_I, f82, f51 + } + ;; + { .mmf + STFD [C2 ] = f36, SIZE + STFD [C10] = f37, SIZE + FMA f52 = ALPHA_R, f81, f52 + } + { .mmf + LDFD f10 = [C4 ], SIZE + LDFD f11 = [C12], SIZE + FMA f53 = ALPHA_R, f83, f53 + } + ;; + { .mmf + STFD [C2 ] = f38, 5 * SIZE + STFD [C10] = f39, 5 * SIZE + FMA f54 = ALPHA_I, f81, f54 + } + { .mmf + LDFD f12 = [C4 ], - 11 * SIZE + LDFD f13 = [C12], - 11 * SIZE + FMA f55 = ALPHA_I, f83, f55 + } + ;; + { .mmf + STFD [C3 ] = f48, SIZE + STFD [C11] = f49, SIZE + FMA f40 = ALPHA_R, f84, f40 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f41 = ALPHA_R, f86, f41 + } + ;; + { .mmf + STFD [C3 ] = f50, SIZE + STFD [C11] = f51, SIZE + FMA f42 = ALPHA_I, f84, f42 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f43 = ALPHA_I, f86, f43 + } + ;; + { .mmf + STFD [C3 ] = f52, SIZE + STFD [C11] = f53, SIZE + FMA f44 = ALPHA_R, f85, f44 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f45 = ALPHA_R, f87, f45 + } + ;; + { .mmf + STFD [C3 ] = f54, 5 * SIZE + STFD [C11] = f55, 5 * SIZE + FMA f46 = ALPHA_I, f85, f46 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f56 = ALPHA_I, f87, f56 + } + ;; + { .mmf + STFD [C3 ] = f40, SIZE + STFD [C11] = f41, SIZE + FMA f57 = ALPHA_R, f88, f57 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f58 = ALPHA_R, f90, f58 + } + ;; + { .mmf + STFD [C3 ] = f42, SIZE + STFD [C11] = f43, SIZE + FMA f59 = ALPHA_I, f88, f59 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f60 = ALPHA_I, f90, f60 + } + ;; + { .mmf + STFD [C3 ] = f44, SIZE + STFD [C11] = f45, SIZE + FMA f61 = ALPHA_R, f89, f61 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f62 = ALPHA_R, f91, f62 + } + ;; + { .mmf + STFD [C3 ] = f46, 5 * SIZE + STFD [C11] = f56, 5 * SIZE + FMA f63 = ALPHA_I, f89, f63 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f47 = ALPHA_I, f91, f47 + } + ;; + { .mmf + STFD [C4 ] = f57, SIZE + STFD [C12] = f58, SIZE + FMA f64 = ALPHA_R, f92, f64 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f65 = ALPHA_R, f94, f65 + } + ;; + { .mmf + STFD [C4 ] = f59, SIZE + STFD [C12] = f60, SIZE + FMA f6 = ALPHA_I, f92, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f7 = ALPHA_I, f94, f7 + } + ;; + { .mmf + STFD [C4 ] = f61, SIZE + STFD [C12] = f62, SIZE + FMA f10 = ALPHA_R, f93, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f11 = ALPHA_R, f95, f11 + } + ;; + { .mmf + STFD [C4 ] = f63, 5 * SIZE + STFD [C12] = f47, 5 * SIZE + FMA f12 = ALPHA_I, f93, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_I, f95, f13 + } + ;; + { .mmf + STFD [C4 ] = f64, SIZE + STFD [C12] = f65, SIZE + mov f64 = f0 + } + { .mmf + cmp.ne p6, p0 = 1, I + nop __LINE__ + mov f72 = f0 + } + ;; + { .mmf + STFD [C4 ] = f6, SIZE + STFD [C12] = f7, SIZE + mov f80 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f88 = f0 + } + ;; + { .mmf + STFD [C4 ] = f10, SIZE + STFD [C12] = f11, SIZE + mov f65 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f73 = f0 + } + ;; + { .mmf + STFD [C4 ] = f12, 5 * SIZE + STFD [C12] = f13, 5 * SIZE + mov f81 = f0 + } + { .mfb + adds I = -1, I + mov f89 = f0 + (p6) br.cond.dptk .L052 + } + ;; + .align 32 + +.L060: + { .mfi + nop __LINE__ + mov f66 = f0 + tbit.z p6, p7 = M, 2 + } + { .mfb + nop __LINE__ + mov f74 = f0 + (p6) br.cond.dptk .L070 + } + ;; + { .mfb + LDFPD f48, f49 = [B] + mov f82 = f0 + nop __LINE__ + } + { .mfi + adds BOFFSET = 2 * SIZE, B + mov f90 = f0 + adds L = 1, K + } + ;; + { .mii + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f67 = f0 + adds L = -1, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + mov f75 = f0 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 + mov ar.lc = L + } + { .mfi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov f91 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + .align 32 + +.L062: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + (p5) adds C9 = 4 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + (p5) adds C10 = 4 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + (p5) adds C11 = 4 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + (p5) adds C12 = 4 * SIZE, C4 + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfb + (p5) LDFD f6 = [C1 ], SIZE + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + (p5) LDFD f7 = [C9 ], SIZE + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f10 = [C1 ], SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + (p5) LDFD f11 = [C9 ], SIZE + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f12 = [C1 ], SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f13 = [C9], SIZE + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f14 = [C1 ], - 3 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + (p5) LDFD f15 = [C9], - 3 * SIZE + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f16 = [C2 ], SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f17 = [C10], SIZE + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f18 = [C2 ], SIZE + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + (p5) LDFD f19 = [C10], SIZE + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f20 = [C2 ], SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f21 = [C10], SIZE + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfi + (p5) LDFD f22 = [C2 ], -3 * SIZE + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + adds L = -1, L + } + { .mfb + (p5) LDFD f23 = [C10], -3 * SIZE + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + br.cloop.sptk.few .L062 + } + ;; + { .mmf + LDFD f24 = [C3 ], SIZE + LDFD f25 = [C11], SIZE + FMA f6 = ALPHA_R, f64, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f7 = ALPHA_R, f66, f7 + } + ;; + { .mmf + LDFD f26 = [C3 ], SIZE + LDFD f27 = [C11], SIZE + FMA f10 = ALPHA_I, f64, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f11 = ALPHA_I, f66, f11 + } + ;; + { .mmf + LDFD f28 = [C3 ], SIZE + LDFD f29 = [C11], SIZE + FMA f12 = ALPHA_R, f65, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_R, f67, f13 + } + ;; + { .mmf + LDFD f30 = [C3 ], - 3 * SIZE + LDFD f31 = [C11], - 3 * SIZE + FMA f14 = ALPHA_I, f65, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f15 = ALPHA_I, f67, f15 + } + ;; + { .mmf + STFD [C1 ] = f6, SIZE + STFD [C9 ] = f7, SIZE + FMA f16 = ALPHA_R, f72, f16 + } + { .mmf + LDFD f32 = [C4 ], SIZE + LDFD f33 = [C12], SIZE + FMA f17 = ALPHA_R, f74, f17 + } + ;; + { .mmf + STFD [C1 ] = f10, SIZE + STFD [C9 ] = f11, SIZE + FMA f18 = ALPHA_I, f72, f18 + } + { .mmf + LDFD f34 = [C4 ], SIZE + LDFD f35 = [C12], SIZE + FMA f19 = ALPHA_I, f74, f19 + } + ;; + { .mmf + STFD [C1 ] = f12, SIZE + STFD [C9 ] = f13, SIZE + FMA f20 = ALPHA_R, f73, f20 + } + { .mmf + LDFD f36 = [C4 ], SIZE + LDFD f37 = [C12], SIZE + FMA f21 = ALPHA_R, f75, f21 + } + ;; + { .mmf + STFD [C1 ] = f14, 5 * SIZE + STFD [C9 ] = f15, 5 * SIZE + FMA f22 = ALPHA_I, f73, f22 + } + { .mmf + LDFD f38 = [C4 ], - 3 * SIZE + LDFD f39 = [C12], - 3 * SIZE + FMA f23 = ALPHA_I, f75, f23 + } + ;; + { .mmf + STFD [C2 ] = f16, SIZE + STFD [C10] = f17, SIZE + FMA f24 = ALPHA_R, f80, f24 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f25 = ALPHA_R, f82, f25 + } + ;; + { .mmf + STFD [C2 ] = f18, SIZE + STFD [C10] = f19, SIZE + FMA f26 = ALPHA_I, f80, f26 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f27 = ALPHA_I, f82, f27 + } + ;; + { .mmf + STFD [C2 ] = f20, SIZE + STFD [C10] = f21, SIZE + FMA f28 = ALPHA_R, f81, f28 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f29 = ALPHA_R, f83, f29 + } + ;; + { .mmf + STFD [C2 ] = f22, 5 * SIZE + STFD [C10] = f23, 5 * SIZE + FMA f30 = ALPHA_I, f81, f30 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f31 = ALPHA_I, f83, f31 + } + ;; + { .mmf + STFD [C3 ] = f24, SIZE + STFD [C11] = f25, SIZE + FMA f32 = ALPHA_R, f88, f32 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f33 = ALPHA_R, f90, f33 + } + ;; + { .mmf + STFD [C3 ] = f26, SIZE + STFD [C11] = f27, SIZE + FMA f34 = ALPHA_I, f88, f34 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f35 = ALPHA_I, f90, f35 + } + ;; + { .mmf + STFD [C3 ] = f28, SIZE + STFD [C11] = f29, SIZE + FMA f36 = ALPHA_R, f89, f36 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f37 = ALPHA_R, f91, f37 + } + ;; + { .mmf + STFD [C3 ] = f30, 5 * SIZE + STFD [C11] = f31, 5 * SIZE + FMA f38 = ALPHA_I, f89, f38 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f39 = ALPHA_I, f91, f39 + } + ;; + { .mmf + STFD [C4 ] = f32, SIZE + STFD [C12] = f33, SIZE + mov f64 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f72 = f0 + } + ;; + { .mmf + STFD [C4 ] = f34, SIZE + STFD [C12] = f35, SIZE + mov f80 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f88 = f0 + } + ;; + { .mmf + STFD [C4 ] = f36, SIZE + STFD [C12] = f37, SIZE + mov f81 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f65 = f0 + } + ;; + { .mmf + STFD [C4 ] = f38, 5 * SIZE + STFD [C12] = f39, 5 * SIZE + mov f89 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f73 = f0 + } + ;; + .align 32 + +.L070: + { .mib + nop __LINE__ + tbit.z p6,p7 = M, 1 + (p6) br.cond.dptk .L080 + } + ;; + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B + adds L = 1, K + } + ;; + { .mii + cmp.eq p3, p0 = r0, r0 + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + adds L = -1, L + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + ;; + { .mmi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov ar.lc = L + } + ;; + .align 32 + +.L072: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + } + { .mmf + (p5) LDFD f6 = [C1 ], SIZE + (p5) LDFD f12 = [C2 ], SIZE + FMA f89 = f33, f51, f89 // A2 * B4 + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mmf + (p5) LDFD f7 = [C1 ], SIZE + (p5) LDFD f13 = [C2 ], SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mmf + (p5) LDFD f10 = [C1 ], SIZE + (p5) LDFD f14 = [C2 ], SIZE + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f11 = [C1 ], - 3 * SIZE + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + adds L = -1, L + } + { .mfb + (p5) LDFD f15 = [C2 ], - 3 * SIZE + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + br.cloop.sptk.few .L072 + } + ;; + { .mmf + LDFD f16 = [C3], SIZE + LDFD f20 = [C4], SIZE + FMA f6 = ALPHA_R, f64, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f12 = ALPHA_R, f72, f12 + } + ;; + { .mmf + LDFD f17 = [C3], SIZE + LDFD f21 = [C4], SIZE + FMA f7 = ALPHA_I, f64, f7 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_I, f72, f13 + } + ;; + { .mmf + LDFD f18 = [C3], SIZE + LDFD f22 = [C4], SIZE + FMA f10 = ALPHA_R, f65, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f14 = ALPHA_R, f73, f14 + } + ;; + { .mmf + LDFD f19 = [C3], - 3 * SIZE + LDFD f23 = [C4], - 3 * SIZE + FMA f11 = ALPHA_I, f65, f11 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f15 = ALPHA_I, f73, f15 + } + ;; + { .mmf + STFD [C1] = f6, SIZE + STFD [C2] = f12, SIZE + FMA f16 = ALPHA_R, f80, f16 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f20 = ALPHA_R, f88, f20 + } + ;; + { .mmf + STFD [C1] = f7, SIZE + STFD [C2] = f13, SIZE + FMA f17 = ALPHA_I, f80, f17 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f21 = ALPHA_I, f88, f21 + } + ;; + { .mmf + STFD [C1] = f10, SIZE + STFD [C2] = f14, SIZE + FMA f18 = ALPHA_R, f81, f18 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f22 = ALPHA_R, f89, f22 + } + ;; + { .mmf + STFD [C1] = f11, SIZE + STFD [C2] = f15, SIZE + FMA f19 = ALPHA_I, f81, f19 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f23 = ALPHA_I, f89, f23 + } + ;; + { .mmf + STFD [C3] = f16, SIZE + STFD [C4] = f20, SIZE + mov f64 = f0 + } + ;; + { .mmf + STFD [C3] = f17, SIZE + STFD [C4] = f21, SIZE + mov f72 = f0 + } + ;; + { .mmf + STFD [C3] = f18, SIZE + STFD [C4] = f22, SIZE + mov f80 = f0 + } + ;; + { .mmf + STFD [C3] = f19, SIZE + STFD [C4] = f23, SIZE + mov f88 = f0 + } + ;; + .align 32 + +.L080: + { .mib + nop __LINE__ + tbit.z p6,p7 = M, 0 + (p6) br.cond.dptk .L089 + } + ;; + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B + adds L = 1, K + } + ;; + { .mii + LDFD f32 = [AOFFSET], 1 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + nop __LINE__ + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + .align 32 + +.L082: + { .mfb + cmp.ne p4, p5 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mmf + (p5) LDFD f6 = [C1], SIZE + (p5) LDFD f10 = [C2], SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + } + ;; + { .mmf + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + } + { .mmf + (p5) LDFD f7 = [C1], -SIZE + (p5) LDFD f11 = [C2], -SIZE + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + } + ;; + { .mib + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + adds L = -1, L + br.cloop.sptk.few .L082 + } + ;; + { .mmf + LDFD f12 = [C3], SIZE + LDFD f14 = [C4], SIZE + FMA f6 = ALPHA_R, f64, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f10 = ALPHA_R, f72, f10 + } + ;; + { .mmf + LDFD f13 = [C3], -SIZE + LDFD f15 = [C4], -SIZE + FMA f7 = ALPHA_I, f64, f7 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f11 = ALPHA_I, f72, f11 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f12 = ALPHA_R, f80, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f14 = ALPHA_R, f88, f14 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_I, f80, f13 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f15 = ALPHA_I, f88, f15 + } + ;; + { .mmi + STFD [C1] = f6, SIZE + STFD [C2] = f10, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C1] = f7, SIZE + STFD [C2] = f11, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C3] = f12, SIZE + STFD [C4] = f14, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C3] = f13, SIZE + STFD [C4] = f15, SIZE + nop __LINE__ + } + ;; + .align 32 + +.L089: + { .mmi + mov B = BOFFSET + mov AOFFSET = A + nop __LINE__ + } + ;; + .align 16 + +.L090: + { .mfi + mov C1 = C + mov f64 = f0 + tbit.z p6, p0 = N, 1 + } + { .mfi + add C2 = LDC, C + mov f72 = f0 + shr I = M, 3 + } + ;; + { .mfi + setf.d f66 = r0 + mov f65 = f0 + nop __LINE__ + } + { .mfb + mov AOFFSET = A + mov f73 = f0 + (p6) br.cond.dpnt .L130 + } + ;; + { .mfi + nop __LINE__ + mov f67 = f0 + shladd C = LDC, 1, C + } + { .mfb + cmp.eq p6, p7 = 0, I + mov f74 = f0 + (p6) br.cond.dpnt .L100 + } + ;; + .align 32 + +.L092: + { .mfb + LDFPD f48, f49 = [B] + mov f68 = f0 + nop __LINE__ + } + { .mfb + adds BOFFSET = 2 * SIZE, B + mov f79 = f0 + nop __LINE__ + } + ;; + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f75 = f0 + nop __LINE__ + } + ;; + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + mov f76 = f0 + adds L = 1, K + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f69 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + cmp.eq p3, p0 = r0, r0 + mov f77 = f0 + shr L = L, 1 + } + ;; + { .mfi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds L = -1, L + } + { .mmf + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + CPREFETCH [PREC], LDC + mov f70 = f0 + } + ;; + { .mfi + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + mov f78 = f0 + mov ar.lc = L + } + { .mfi + CPREFETCH [PREC] + mov f71 = f0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + ;; + .align 32 + +.L093: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C9 = 4 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C10 = 4 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + adds C11 = 4 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + adds C12 = 4 * SIZE, C4 + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f6 = [C1 ], SIZE + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f7 = [C9 ], SIZE + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f10 = [C1 ], SIZE + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f11 = [C9 ], SIZE + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f12 = [C1 ], SIZE + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f13 = [C9 ], SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f14 = [C1 ], 5 * SIZE + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f15 = [C9 ], 5 * SIZE + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f16 = [C1 ], SIZE + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f17 = [C9 ], SIZE + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f18 = [C1 ], SIZE + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f19 = [C9 ], SIZE + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f20 = [C1 ], SIZE + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f21 = [C9 ], SIZE + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfi + (p5) LDFD f22 = [C1 ], -11 * SIZE + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + adds L = -1, L + } + { .mfb + (p5) LDFD f23 = [C9 ], -11 * SIZE + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + br.cloop.sptk.few .L093 + } + ;; + { .mmf + LDFD f24 = [C2 ], SIZE + LDFD f25 = [C10], SIZE + FMA f6 = ALPHA_R, f64, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f7 = ALPHA_R, f66, f7 + } + ;; + { .mmf + LDFD f26 = [C2 ], SIZE + LDFD f27 = [C10], SIZE + FMA f10 = ALPHA_I, f64, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f11 = ALPHA_I, f66, f11 + } + ;; + { .mmf + LDFD f28 = [C2 ], SIZE + LDFD f29 = [C10], SIZE + FMA f12 = ALPHA_R, f65, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_R, f67, f13 + } + ;; + { .mmf + LDFD f30 = [C2 ], 5 * SIZE + LDFD f31 = [C10], 5 * SIZE + FMA f14 = ALPHA_I, f65, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f15 = ALPHA_I, f67, f15 + } + ;; + { .mmf + STFD [C1 ] = f6, SIZE + STFD [C9 ] = f7, SIZE + FMA f16 = ALPHA_R, f68, f16 + } + { .mmf + LDFD f32 = [C2 ], SIZE + LDFD f33 = [C10], SIZE + FMA f17 = ALPHA_R, f70, f17 + } + ;; + { .mmf + STFD [C1 ] = f10, SIZE + STFD [C9 ] = f11, SIZE + FMA f18 = ALPHA_I, f68, f18 + } + { .mmf + LDFD f34 = [C2 ], SIZE + LDFD f35 = [C10], SIZE + FMA f19 = ALPHA_I, f70, f19 + } + ;; + { .mmf + STFD [C1 ] = f12, SIZE + STFD [C9 ] = f13, SIZE + FMA f20 = ALPHA_R, f69, f20 + } + { .mmf + LDFD f36 = [C2 ], SIZE + LDFD f37 = [C10], SIZE + FMA f21 = ALPHA_R, f71, f21 + } + ;; + { .mmf + STFD [C1 ] = f14, 5 * SIZE + STFD [C9 ] = f15, 5 * SIZE + FMA f22 = ALPHA_I, f69, f22 + } + { .mmf + LDFD f38 = [C2 ], - 11 * SIZE + LDFD f39 = [C10], - 11 * SIZE + FMA f23 = ALPHA_I, f71, f23 + } + ;; + { .mmf + STFD [C1 ] = f16, SIZE + STFD [C9 ] = f17, SIZE + FMA f24 = ALPHA_R, f72, f24 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f25 = ALPHA_R, f74, f25 + } + ;; + { .mmf + STFD [C1 ] = f18, SIZE + STFD [C9 ] = f19, SIZE + FMA f26 = ALPHA_I, f72, f26 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f27 = ALPHA_I, f74, f27 + } + ;; + { .mmf + STFD [C1 ] = f20, SIZE + STFD [C9 ] = f21, SIZE + FMA f28 = ALPHA_R, f73, f28 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f29 = ALPHA_R, f75, f29 + } + ;; + { .mmf + STFD [C1 ] = f22, 5 * SIZE + STFD [C9 ] = f23, 5 * SIZE + FMA f30 = ALPHA_I, f73, f30 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f31 = ALPHA_I, f75, f31 + } + ;; + { .mmf + STFD [C2 ] = f24, SIZE + STFD [C10] = f25, SIZE + FMA f32 = ALPHA_R, f76, f32 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f33 = ALPHA_R, f78, f33 + } + ;; + { .mmf + STFD [C2 ] = f26, SIZE + STFD [C10] = f27, SIZE + FMA f34 = ALPHA_I, f76, f34 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f35 = ALPHA_I, f78, f35 + } + ;; + { .mmf + STFD [C2 ] = f28, SIZE + STFD [C10] = f29, SIZE + FMA f36 = ALPHA_R, f77, f36 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f37 = ALPHA_R, f79, f37 + } + ;; + { .mmf + STFD [C2 ] = f30, 5 * SIZE + STFD [C10] = f31, 5 * SIZE + FMA f38 = ALPHA_I, f77, f38 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f39 = ALPHA_I, f79, f39 + } + ;; + { .mmf + STFD [C2 ] = f32, SIZE + STFD [C10] = f33, SIZE + mov f64 = f0 + } + { .mmf + cmp.ne p6, p0 = 1, I + nop __LINE__ + mov f72 = f0 + } + ;; + { .mmf + STFD [C2 ] = f34, SIZE + STFD [C10] = f35, SIZE + mov f65 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f73 = f0 + } + ;; + { .mmf + STFD [C2 ] = f36, SIZE + STFD [C10] = f37, SIZE + mov f66 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f74 = f0 + } + ;; + { .mmf + STFD [C2 ] = f38, 5 * SIZE + STFD [C10] = f39, 5 * SIZE + mov f67 = f0 + } + { .mfb + adds I = -1, I + mov f75 = f0 + (p6) br.cond.dptk .L092 + } + ;; + .align 32 + +.L100: + { .mib + nop __LINE__ + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L110 + } + ;; + { .mmf + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B + mov f75 = f0 + } + { .mii + nop __LINE__ + adds L = 1, K + } + ;; + { .mii + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + .align 32 + +.L102: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C9 = 4 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C10 = 4 * SIZE, C2 + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f6 = [C1 ], SIZE + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f7 = [C9 ], SIZE + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f10 = [C1 ], SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f11 = [C9 ], SIZE + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f12 = [C1], SIZE + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + adds L = -1, L + } + { .mfb + (p5) LDFD f13 = [C9], SIZE + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + br.cloop.sptk.few .L102 + } + ;; + { .mmf + LDFD f14 = [C1], - 3 * SIZE + LDFD f15 = [C9], - 3 * SIZE + FMA f6 = ALPHA_R, f64, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f7 = ALPHA_R, f66, f7 + } + ;; + { .mmf + LDFD f16 = [C2 ], SIZE + LDFD f17 = [C10], SIZE + FMA f10 = ALPHA_I, f64, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f11 = ALPHA_I, f66, f11 + } + ;; + { .mmf + LDFD f18 = [C2 ], SIZE + LDFD f19 = [C10], SIZE + FMA f12 = ALPHA_R, f65, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_R, f67, f13 + } + ;; + { .mmf + LDFD f20 = [C2 ], SIZE + LDFD f21 = [C10], SIZE + FMA f14 = ALPHA_I, f65, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f15 = ALPHA_I, f67, f15 + } + ;; + { .mmf + STFD [C1 ] = f6, SIZE + STFD [C9 ] = f7, SIZE + FMA f16 = ALPHA_R, f72, f16 + } + { .mmf + LDFD f22 = [C2 ], - 3 * SIZE + LDFD f23 = [C10], - 3 * SIZE + FMA f17 = ALPHA_R, f74, f17 + } + ;; + { .mmf + STFD [C1 ] = f10, SIZE + STFD [C9 ] = f11, SIZE + FMA f18 = ALPHA_I, f72, f18 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f19 = ALPHA_I, f74, f19 + } + ;; + { .mmf + STFD [C1 ] = f12, SIZE + STFD [C9 ] = f13, SIZE + FMA f20 = ALPHA_R, f73, f20 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f21 = ALPHA_R, f75, f21 + } + ;; + { .mmf + STFD [C1 ] = f14, 5 * SIZE + STFD [C9 ] = f15, 5 * SIZE + FMA f22 = ALPHA_I, f73, f22 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f23 = ALPHA_I, f75, f23 + } + ;; + { .mmf + STFD [C2 ] = f16, SIZE + STFD [C10] = f17, SIZE + mov f64 = f0 + } + ;; + { .mmf + STFD [C2 ] = f18, SIZE + STFD [C10] = f19, SIZE + mov f65 = f0 + } + ;; + { .mmf + STFD [C2 ] = f20, SIZE + STFD [C10] = f21, SIZE + mov f72 = f0 + } + ;; + { .mmf + STFD [C2 ] = f22, 5 * SIZE + STFD [C10] = f23, 5 * SIZE + mov f73 = f0 + } + ;; + .align 32 + +.L110: + { .mib + nop __LINE__ + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L120 + } + ;; + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B + adds L = 1, K + } + ;; + { .mii + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + cmp.eq p3, p0 = r0, r0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + mov ar.lc = L + } + ;; + .align 32 + +.L112: + { .mfi + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mmf + (p5) LDFD f6 = [C1 ], SIZE + (p5) LDFD f7 = [C2 ], SIZE + FMA f73 = f33, f49, f73 // A2 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f10 = [C1 ], SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + adds L = -1, L + } + { .mfb + (p5) LDFD f11 = [C2 ], SIZE + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + br.cloop.sptk.few .L112 + } + ;; + { .mmf + LDFD f12 = [C1], SIZE + LDFD f13 = [C2], SIZE + FMA f6 = ALPHA_R, f64, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f7 = ALPHA_R, f72, f7 + } + ;; + { .mmf + LDFD f14 = [C1], - 3 * SIZE + LDFD f15 = [C2], - 3 * SIZE + FMA f10 = ALPHA_I, f64, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f11 = ALPHA_I, f72, f11 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f12 = ALPHA_R, f65, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_R, f73, f13 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f14 = ALPHA_I, f65, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f15 = ALPHA_I, f73, f15 + } + ;; + { .mmf + STFD [C1] = f6, SIZE + STFD [C2] = f7, SIZE + mov f64 = f0 + } + ;; + { .mmf + STFD [C1] = f10, SIZE + STFD [C2] = f11, SIZE + mov f72 = f0 + } + ;; + { .mmf + STFD [C1] = f12, SIZE + STFD [C2] = f13, SIZE + mov f65 = f0 + } + ;; + { .mmf + STFD [C1] = f14, SIZE + STFD [C2] = f15, SIZE + mov f73 = f0 + } + ;; + .align 32 + +.L120: + { .mib + nop __LINE__ + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L129 + } + ;; + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B + adds L = 1, K + } + ;; + { .mii + nop __LINE__ + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFD f32 = [AOFFSET], 1 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + cmp.eq p3, p0 = r0, r0 + nop __LINE__ + mov ar.lc = L + } + ;; + .align 32 + +.L122: + { .mfi + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + nop __LINE__ + } + { .mmi + (p5) LDFD f6 = [C1], SIZE + (p5) LDFD f7 = [C2], SIZE + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + adds L = -1, L + } + { .mfb + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + br.cloop.sptk.few .L122 + } + ;; + +.L128: + { .mmf + (p5) LDFD f10 = [C1], -SIZE + (p5) LDFD f11 = [C2], -SIZE + FMA f6 = ALPHA_R, f64, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f7 = ALPHA_R, f72, f7 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f10 = ALPHA_I, f64, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f11 = ALPHA_I, f72, f11 + } + ;; + { .mmi + STFD [C1 ] = f6, SIZE + STFD [C2 ] = f7, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C1 ] = f10, SIZE + STFD [C2 ] = f11, SIZE + nop __LINE__ + } + ;; + .align 32 + +.L129: + { .mmi + mov B = BOFFSET + mov AOFFSET = A + nop __LINE__ + } + ;; + .align 16 + +.L130: + { .mfi + nop __LINE__ + mov f64 = f0 + tbit.z p6, p0 = N, 0 + } + { .mib + mov AOFFSET = A + shr I = M, 3 + (p6) br.cond.dpnt .L999 + } + ;; + { .mfi + mov C1 = C + mov f65 = f0 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + mov f66 = f0 + nop __LINE__ + } + { .mfb + cmp.eq p7, p0 = 0, I + mov f67 = f0 + (p7) br.cond.dpnt .L140 + } + ;; + .align 32 + +.L132: + { .mfb + LDFD f48 = [B] + mov f68 = f0 + nop __LINE__ + } + { .mfi + adds BOFFSET = 1 * SIZE, B + mov f69 = f0 + nop __LINE__ + } + ;; + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f70 = f0 + adds L = 1, K + } + ;; + { .mii + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mfi + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + mov f71 = f0 + adds L = -1, L + } + ;; + { .mmi + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + adds PREC = CPREFETCHSIZE * SIZE, C1 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmi + CPREFETCH [PREC] + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov ar.lc = L + } + ;; + .align 32 + +.L133: + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f65 = f33, f48, f65 // A2 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + adds C9 = 4 * SIZE, C1 + } + { .mmf + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + (p5) LDFD f6 = [C1 ], SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f7 = [C9 ], SIZE + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f10 = [C1 ], SIZE + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f11 = [C9 ], SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mmf + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + (p5) LDFD f12 = [C1 ], SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f13 = [C9 ], SIZE + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + adds L = -1, L + } + { .mfb + (p5) LDFD f14 = [C1 ], 5 * SIZE + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + { .mfb + (p5) LDFD f15 = [C9 ], 5 * SIZE + nop __LINE__ + br.cloop.sptk.few .L133 + } + ;; + +.L138: + { .mmf + LDFD f16 = [C1 ], SIZE + LDFD f17 = [C9 ], SIZE + FMA f6 = ALPHA_R, f64, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f7 = ALPHA_R, f66, f7 + } + ;; + { .mmf + LDFD f18 = [C1 ], SIZE + LDFD f19 = [C9 ], SIZE + FMA f10 = ALPHA_I, f64, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f11 = ALPHA_I, f66, f11 + } + ;; + { .mmf + LDFD f20 = [C1 ], SIZE + LDFD f21 = [C9 ], SIZE + FMA f12 = ALPHA_R, f65, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_R, f67, f13 + } + ;; + { .mmf + LDFD f22 = [C1 ], - 11 * SIZE + LDFD f23 = [C9 ], - 11 * SIZE + FMA f14 = ALPHA_I, f65, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f15 = ALPHA_I, f67, f15 + } + ;; + { .mmf + STFD [C1 ] = f6, SIZE + STFD [C9 ] = f7, SIZE + FMA f16 = ALPHA_R, f68, f16 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f17 = ALPHA_R, f70, f17 + } + ;; + { .mmf + STFD [C1 ] = f10, SIZE + STFD [C9 ] = f11, SIZE + FMA f18 = ALPHA_I, f68, f18 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f19 = ALPHA_I, f70, f19 + } + ;; + { .mmf + STFD [C1 ] = f12, SIZE + STFD [C9 ] = f13, SIZE + FMA f20 = ALPHA_R, f69, f20 + } + { .mmf + cmp.ne p6, p0 = 1, I + adds I = -1, I + FMA f21 = ALPHA_R, f71, f21 + } + ;; + { .mmf + STFD [C1 ] = f14, 5 * SIZE + STFD [C9 ] = f15, 5 * SIZE + FMA f22 = ALPHA_I, f69, f22 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f23 = ALPHA_I, f71, f23 + } + ;; + { .mmf + STFD [C1 ] = f16, SIZE + STFD [C9 ] = f17, SIZE + mov f64 = f0 + } + ;; + { .mmf + STFD [C1 ] = f18, SIZE + STFD [C9 ] = f19, SIZE + mov f65 = f0 + } + ;; + { .mmf + STFD [C1 ] = f20, SIZE + STFD [C9 ] = f21, SIZE + mov f66 = f0 + } + ;; + { .mmf + STFD [C1 ] = f22, 5 * SIZE + STFD [C9 ] = f23, 5 * SIZE + mov f67 = f0 + } + { .mmb + nop __LINE__ + nop __LINE__ + (p6) br.cond.dptk .L132 + } + ;; + .align 32 + +.L140: + { .mib + nop __LINE__ + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L150 + } + ;; + { .mmi + LDFD f48 = [B] + adds BOFFSET = 1 * SIZE, B + adds L = 1, K + } + ;; + { .mii + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + adds L = -1, L + nop __LINE__ + } + ;; + { .mmi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + .align 32 + +.L142: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f65 = f33, f48, f65 // A2 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + (p5) adds C9 = 4 * SIZE, C1 + } + { .mmf + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + } + ;; + { .mfi + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + (p5) adds C10 = 2 * SIZE, C2 + } + { .mmf + (p5) LDFD f6 = [C1 ], SIZE + (p5) LDFD f7 = [C9 ], SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + } + ;; + { .mmf + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + } + { .mmf + (p5) LDFD f10 = [C1 ], SIZE + (p5) LDFD f11 = [C9 ], SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + { .mmb + (p5) LDFD f12 = [C1 ], SIZE + (p5) LDFD f13 = [C9 ], SIZE + br.cloop.sptk.few .L142 + } + ;; + +.L148: + { .mmf + LDFD f14 = [C1 ], - 3 * SIZE + LDFD f15 = [C9 ], - 3 * SIZE + FMA f6 = ALPHA_R, f64, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f7 = ALPHA_R, f66, f7 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f10 = ALPHA_I, f64, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f11 = ALPHA_I, f66, f11 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f12 = ALPHA_R, f65, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_R, f67, f13 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f14 = ALPHA_I, f65, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f15 = ALPHA_I, f67, f15 + } + ;; + { .mmf + STFD [C1 ] = f6, SIZE + STFD [C9 ] = f7, SIZE + mov f64 = f0 + } + ;; + { .mmf + STFD [C1 ] = f10, SIZE + STFD [C9 ] = f11, SIZE + mov f65 = f0 + } + ;; + { .mmf + STFD [C1 ] = f12, SIZE + STFD [C9 ] = f13, SIZE + mov f66 = f0 + } + ;; + { .mmf + STFD [C1 ] = f14, 5 * SIZE + STFD [C9 ] = f15, 5 * SIZE + mov f67 = f0 + } + ;; + .align 32 + +.L150: + { .mib + nop __LINE__ + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L160 + } + ;; + { .mmi + LDFD f48 = [B] + adds BOFFSET = 1 * SIZE, B + adds L = 1, K + } + ;; + { .mii + cmp.eq p3, p0 = r0, r0 + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mii + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + adds L = -1, L + ;; + mov ar.lc = L + } + ;; + .align 32 + +.L152: + { .mfi + cmp.ne p4, p5 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + ;; + { .mfi + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + adds L = -1, L + } + ;; + { .mfb + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + br.cloop.sptk.few .L152 + } + ;; + +.L158: + LDFD f68 = [C1 ], 1 * SIZE + ;; + LDFD f69 = [C1 ], 1 * SIZE + ;; + LDFD f70 = [C1 ], 1 * SIZE + ;; + LDFD f71 = [C1 ], - 3 * SIZE + ;; + FMA f68 = ALPHA_R, f64, f68 + FMA f69 = ALPHA_I, f64, f69 + FMA f70 = ALPHA_R, f65, f70 + FMA f71 = ALPHA_I, f65, f71 + ;; + STFD [C1 ] = f68, SIZE + ;; + STFD [C1 ] = f69, SIZE + ;; + STFD [C1 ] = f70, SIZE + mov f64 = f0 + ;; + STFD [C1 ] = f71, SIZE + mov f65 = f0 + ;; + .align 32 + +.L160: + { .mib + nop __LINE__ + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L169 + } + ;; + { .mmi + LDFD f48 = [B] + adds BOFFSET = 1 * SIZE, B + adds L = 1, K + } + ;; + { .mii + LDFD f32 = [AOFFSET], 1 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mii + adds L = -1, L + cmp.eq p3, p0 = r0, r0 + ;; + mov ar.lc = L + } + ;; + .align 32 + +.L162: + { .mmf + cmp.ne p4, p5 = 0, L + (p12) cmp.ne p3, p0 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + } + ;; + { .mmi + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p5) LDFD f68 = [C1], 1 * SIZE + adds L = -1, L + } + ;; + { .mmf + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + (p5) LDFD f69 = [C1], - 1 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + } + { .mib + nop __LINE__ + nop __LINE__ + br.cloop.sptk.few .L162 + } + ;; + FMA f68 = ALPHA_R, f64, f68 + FMA f69 = ALPHA_I, f64, f69 + ;; + STFD [C1 ] = f68, SIZE + ;; + STFD [C1 ] = f69, SIZE + ;; + .align 32 + +.L169: + { .mmi + mov B = BOFFSET + mov AOFFSET = A + nop __LINE__ + } + ;; + .align 16 + +.L999: + mov r8 = r0 + adds r9 = 1 * 16, SP + ;; + ldf.fill f16 = [SP], 32 + ldf.fill f17 = [r9], 32 + ;; + ldf.fill f18 = [SP], 32 + ldf.fill f19 = [r9], 32 + ;; + ldf.fill f20 = [SP], 32 + ldf.fill f21 = [r9], 32 + ;; + ldf.fill f22 = [SP], 32 + ldf.fill f23 = [r9], 32 + mov ar.lc = ARLC + ;; + ldf.fill f24 = [SP], 32 + ldf.fill f25 = [r9], 32 + mov pr = PR, -1 + ;; + ldf.fill f26 = [SP], 32 + ldf.fill f27 = [r9], 32 + mov ar.pfs = ARPFS + ;; + ldf.fill f28 = [SP], 32 + ldf.fill f29 = [r9], 32 + ;; + ldf.fill f30 = [SP], 32 + ldf.fill f31 = [r9] + br.ret.sptk.many b0 + EPILOGUE + diff --git a/kernel/ia64/zgemm_beta.S b/kernel/ia64/zgemm_beta.S new file mode 100644 index 0000000..00cf3e9 --- /dev/null +++ b/kernel/ia64/zgemm_beta.S @@ -0,0 +1,517 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE 74 + +#define CO1 r14 +#define CO2 r15 +#define CO3 r16 +#define DO1 r17 +#define DO2 r18 +#define DO3 r19 + +#define I r22 +#define I_AND_15 r23 +#define PRE1 r24 + +#define PR r30 +#define ARLC r31 + +#define M r32 +#define N r33 +#define C r34 +#define LDC r35 +#define J r36 + +#define BETA_R f8 +#define BETA_I f9 + + PROLOGUE + .prologue + PROFCODE + + { .mmi + adds CO1 = 24, r12 + adds CO2 = 32, r12 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mfb + cmp.ge p6, p0 = 0, N + fcmp.eq p0, p14 = BETA_R, f0 + (p6) br.ret.sptk.many b0 + } + ;; + .body + { .mmi + ld8 C = [CO1], 8 + ld8 LDC = [CO2] + mov PR = pr + } + { .mfi + mov J = N + fcmp.eq p0, p15 = BETA_I, f0 + shr I = M, 3 + } + ;; + { .mmb + cmp.ge p6, p0 = 0, M + adds I = -1, I + (p6) br.ret.sptk.many b0 + } + ;; + { .mbb + shladd LDC = LDC, ZBASE_SHIFT, r0 + (p14) br.cond.dpnt .L100 + (p15) br.cond.dpnt .L100 + } + ;; + .align 32 + +.L60: + { .mmi + mov CO1 = C + mov CO3 = C + add CO2 = 4 * SIZE, C + } + { .mmi + adds PRE1 = PREFETCHSIZE * SIZE, C + add C = C, LDC + tbit.nz p12, p0 = M, 2 + } + ;; + { .mmi + and I_AND_15 = 15, M + mov ar.lc = I + } + { .mib + cmp.gt p8, p0 = 0, I + (p8) br.cond.dpnt .L80 + } + ;; + .align 32 + +.L70: + { .mmi + STFD [CO1] = f0, 1 * SIZE + STFD [CO2] = f0, 1 * SIZE + } + { .mmi + lfetch.excl.nt1 [PRE1], 16 * SIZE + nop.m 0 + } + ;; + { .mmi + STFD [CO1] = f0, 1 * SIZE + STFD [CO2] = f0, 1 * SIZE + adds CO3 = 16 * SIZE, CO3 + } + ;; + { .mmi + STFD [CO1] = f0, 1 * SIZE + STFD [CO2] = f0, 1 * SIZE + } + ;; + { .mmi + STFD [CO1] = f0, 5 * SIZE + STFD [CO2] = f0, 5 * SIZE + } + ;; + { .mmi + STFD [CO1] = f0, 1 * SIZE + STFD [CO2] = f0, 1 * SIZE + } + ;; + { .mmi + STFD [CO1] = f0, 1 * SIZE + STFD [CO2] = f0, 1 * SIZE + } + ;; + { .mmi + STFD [CO1] = f0, 1 * SIZE + STFD [CO2] = f0, 1 * SIZE + } + ;; + { .mmb + STFD [CO1] = f0, 5 * SIZE + STFD [CO2] = f0, 5 * SIZE + br.cloop.sptk.few .L70 + } + ;; + .align 32 + +.L80: + { .mmi + (p12) STFD [CO1] = f0, 1 * SIZE + (p12) STFD [CO2] = f0, 1 * SIZE + tbit.nz p13, p0 = M, 1 + } + { .mmb + cmp.eq p9, p0 = 0, I_AND_15 + adds J = -1, J + (p9) br.cond.dptk .L99 + } + ;; + { .mmi + (p12) STFD [CO1] = f0, 1 * SIZE + (p12) STFD [CO2] = f0, 1 * SIZE + tbit.nz p14, p0 = M, 0 + } + ;; + { .mmi + (p12) STFD [CO1] = f0, 1 * SIZE + (p12) STFD [CO2] = f0, 1 * SIZE + (p12) adds CO3 = 8 * SIZE, CO3 + } + ;; + { .mmi + (p12) STFD [CO1] = f0, 5 * SIZE + (p12) STFD [CO2] = f0 + (p13) adds CO3 = 4 * SIZE, CO3 + } + ;; + { .mmi + (p13) STFD [CO1] = f0, 1 * SIZE + (p14) STFD [CO3] = f0, 1 * SIZE + } + ;; + { .mmi + (p13) STFD [CO1] = f0, 1 * SIZE + (p14) STFD [CO3] = f0, 1 * SIZE + } + ;; + { .mmi + (p13) STFD [CO1] = f0, 1 * SIZE + } + ;; + { .mmi + (p13) STFD [CO1] = f0 + } + ;; + .align 32 + +.L99: + { .mib + cmp.lt p6, p0 = 0, J + mov ar.lc = ARLC + } + { .mbb + (p6) br.cond.dptk .L60 + br.ret.sptk.many b0 + } + ;; + .align 32 + +.L100: + { .mmi + mov CO1 = C + mov CO3 = C + mov pr.rot = 0 + } + { .mmi + adds PRE1 = PREFETCHSIZE * SIZE, C + add CO2 = 4 * SIZE, C + mov DO1 = C + } + ;; + { .mmi + mov ar.ec = 6 + } + { .mmi + adds DO2 = 4 * SIZE, C + mov DO3 = C + add C = C, LDC + } + ;; + { .mmi + and I_AND_15 = 15, M + cmp.eq p16, p0 = r0, r0 + mov ar.lc = I + } + { .mib + cmp.gt p8, p0 = 0, I + tbit.nz p12, p0 = M, 2 + (p8) br.cond.dpnt .L180 + } + ;; + .align 32 + +.L170: + { .mmf + (p21) STFD [DO1] = f37, 1 * SIZE + (p16) lfetch.excl.nt1 [PRE1], 16 * SIZE + (p21) FNMA f61 = BETA_I, f67, f61 + } + { .mmf + (p16) LDFD f32 = [CO1], 1 * SIZE + (p16) adds CO2 = 16 * SIZE, CO2 + (p21) FMPY f12 = BETA_I, f85 + } + ;; + { .mfi + (p21) STFD [DO1] = f43, 1 * SIZE + (p21) FMA f67 = BETA_R, f67, f10 + (p16) adds CO3 = 16 * SIZE, CO3 + } + { .mfi + (p16) LDFD f38 = [CO1], 1 * SIZE + (p21) FMPY f85 = BETA_R, f85 + (p16) adds DO2 = 16 * SIZE, DO2 + } + ;; + { .mfi + (p21) STFD [DO1] = f49, 1 * SIZE + (p21) FNMA f73 = BETA_I, f79, f73 + (p16) adds DO3 = 16 * SIZE, DO3 + } + { .mfi + (p16) LDFD f44 = [CO1], 1 * SIZE + (p21) FMPY f13 = BETA_I, f97 + nop.i 0 + } + ;; + (p21) STFD [DO1] = f55, 1 * SIZE + (p21) FMA f79 = BETA_R, f79, f11 + (p16) LDFD f50 = [CO1], 1 * SIZE + (p21) FMPY f97 = BETA_R, f97 + ;; + (p21) STFD [DO1] = f61, 1 * SIZE + (p21) FNMA f85 = BETA_I, f91, f85 + (p16) LDFD f56 = [CO1], 1 * SIZE + (p21) FMPY f14 = BETA_I, f109 + ;; + (p21) STFD [DO1] = f67, 1 * SIZE + (p21) FMA f91 = BETA_R, f91, f12 + (p16) LDFD f62 = [CO1], 1 * SIZE + (p21) FMPY f109 = BETA_R, f109 + ;; + (p21) STFD [DO1] = f73, 1 * SIZE + (p21) FNMA f97 = BETA_I, f103, f97 + (p16) LDFD f68 = [CO1], 1 * SIZE + (p21) FMPY f15 = BETA_I, f121 + ;; + (p21) STFD [DO1] = f79, 1 * SIZE + (p21) FMA f103 = BETA_R, f103, f13 + (p16) LDFD f74 = [CO1], 1 * SIZE + (p21) FMPY f121 = BETA_R, f121 + ;; + (p21) STFD [DO1] = f85, 1 * SIZE + (p21) FNMA f109 = BETA_I, f115, f109 + (p16) LDFD f80 = [CO1], 1 * SIZE + (p20) FMPY f6 = BETA_I, f36 + ;; + (p21) STFD [DO1] = f91, 1 * SIZE + (p21) FMA f115 = BETA_R, f115, f14 + (p16) LDFD f86 = [CO1], 1 * SIZE + (p20) FMPY f36 = BETA_R, f36 + ;; + (p21) STFD [DO1] = f97, 1 * SIZE + (p21) FNMA f121 = BETA_I, f127, f121 + (p16) LDFD f92 = [CO1], 1 * SIZE + (p20) FMPY f7 = BETA_I, f48 + ;; + (p21) STFD [DO1] = f103, 1 * SIZE + (p21) FMA f127 = BETA_R, f127, f15 + (p16) LDFD f98 = [CO1], 1 * SIZE + (p20) FMPY f48 = BETA_R, f48 + ;; + (p21) STFD [DO1] = f109, 1 * SIZE + (p20) FNMA f36 = BETA_I, f42, f36 + (p16) LDFD f104 = [CO1], 1 * SIZE + (p20) FMPY f10 = BETA_I, f60 + ;; + (p21) STFD [DO1] = f115, 1 * SIZE + (p20) FMA f42 = BETA_R, f42, f6 + (p16) LDFD f110 = [CO1], 1 * SIZE + (p20) FMPY f60 = BETA_R, f60 + ;; + (p21) STFD [DO1] = f121, 1 * SIZE + (p20) FNMA f48 = BETA_I, f54, f48 + (p16) LDFD f116 = [CO1], 1 * SIZE + (p20) FMPY f11 = BETA_I, f72 + ;; + (p21) STFD [DO1] = f127, 1 * SIZE + (p20) FMA f54 = BETA_R, f54, f7 + (p16) LDFD f122 = [CO1], 1 * SIZE + (p20) FMPY f72 = BETA_R, f72 + br.ctop.sptk.few .L170 + ;; + .align 32 + +.L180: + { .mmi + (p12) LDFD f32 = [CO1], 1 * SIZE + (p12) LDFD f36 = [CO2], 1 * SIZE + tbit.nz p13, p0 = M, 1 + } + { .mmb + cmp.eq p9, p0 = 0, I_AND_15 + adds J = -1, J + (p9) br.cond.dptk .L199 + } + ;; + { .mmi + (p12) LDFD f33 = [CO1], 1 * SIZE + (p12) LDFD f37 = [CO2], 1 * SIZE + tbit.nz p14, p0 = M, 0 + } + ;; + { .mmi + (p12) LDFD f34 = [CO1], 1 * SIZE + (p12) LDFD f38 = [CO2], 1 * SIZE + (p12) adds CO3 = 8 * SIZE, CO3 + } + ;; + { .mmi + (p12) LDFD f35 = [CO1], 5 * SIZE + (p12) LDFD f39 = [CO2] + (p13) adds CO3 = 4 * SIZE, CO3 + } + ;; + { .mmi + (p13) LDFD f40 = [CO1], 1 * SIZE + (p14) LDFD f44 = [CO3], 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f41 = [CO1], 1 * SIZE + (p14) LDFD f45 = [CO3], 1 * SIZE + } + ;; + { .mmf + (p13) LDFD f42 = [CO1], 1 * SIZE + } + ;; + { .mmf + (p13) LDFD f43 = [CO1] + } + ;; + (p12) FMPY f80 = BETA_I, f32 + (p12) FMPY f32 = BETA_R, f32 + (p12) FMPY f81 = BETA_I, f34 + (p12) FMPY f34 = BETA_R, f34 + (p12) FMPY f82 = BETA_I, f36 + (p12) FMPY f36 = BETA_R, f36 + (p12) FMPY f83 = BETA_I, f38 + (p12) FMPY f38 = BETA_R, f38 + ;; + (p12) FNMA f32 = BETA_I, f33, f32 + (p12) FMA f33 = BETA_R, f33, f80 + (p12) FNMA f34 = BETA_I, f35, f34 + (p12) FMA f35 = BETA_R, f35, f81 + (p12) FNMA f36 = BETA_I, f37, f36 + (p12) FMA f37 = BETA_R, f37, f82 + (p12) FNMA f38 = BETA_I, f39, f38 + (p12) FMA f39 = BETA_R, f39, f83 + ;; + (p13) FMPY f84 = BETA_I, f40 + (p13) FMPY f40 = BETA_R, f40 + (p13) FMPY f85 = BETA_I, f42 + (p13) FMPY f42 = BETA_R, f42 + (p14) FMPY f86 = BETA_I, f44 + (p14) FMPY f44 = BETA_R, f44 + ;; + (p13) FNMA f40 = BETA_I, f41, f40 + (p13) FMA f41 = BETA_R, f41, f84 + (p13) FNMA f42 = BETA_I, f43, f42 + (p13) FMA f43 = BETA_R, f43, f85 + (p14) FNMA f44 = BETA_I, f45, f44 + (p14) FMA f45 = BETA_R, f45, f86 + ;; + + { .mmf + (p12) STFD [DO1] = f32, 1 * SIZE + (p12) STFD [DO2] = f36, 1 * SIZE + } + { .mmf + (p12) adds DO3 = 8 * SIZE, DO3 + } + ;; + { .mmf + (p12) STFD [DO1] = f33, 1 * SIZE + (p12) STFD [DO2] = f37, 1 * SIZE + } + { .mmf + (p13) adds DO3 = 4 * SIZE, DO3 + } + ;; + { .mmf + (p12) STFD [DO1] = f34, 1 * SIZE + (p12) STFD [DO2] = f38, 1 * SIZE + } + ;; + { .mmf + (p12) STFD [DO1] = f35, 5 * SIZE + (p12) STFD [DO2] = f39 + } + ;; + { .mmi + (p13) STFD [DO1] = f40, 1 * SIZE + (p14) STFD [DO3] = f44, 1 * SIZE + } + ;; + { .mmi + (p13) STFD [DO1] = f41, 1 * SIZE + (p14) STFD [DO3] = f45, 1 * SIZE + } + ;; + { .mmi + (p13) STFD [DO1] = f42, 1 * SIZE + ;; + (p13) STFD [DO1] = f43 + } + ;; + .align 32 + +.L199: + { .mib + cmp.lt p6, p0 = 0, J + mov ar.lc = ARLC + (p6) br.cond.dptk .L100 + } + ;; + { .mib + mov pr = PR, -1 + br.ret.sptk.many b0 + } + ;; + EPILOGUE + diff --git a/kernel/ia64/zgemm_kernel.S b/kernel/ia64/zgemm_kernel.S new file mode 100644 index 0000000..bfdb92c --- /dev/null +++ b/kernel/ia64/zgemm_kernel.S @@ -0,0 +1,6849 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef DOUBLE +#define PREFETCHSIZE (16 * 8) +#else +#define PREFETCHSIZE (32 * 8) +#endif + +#define CPREFETCHSIZE 7 +#define CPREFETCH lfetch.excl.nt1 + +#define M r32 +#define N r33 +#define K r34 +#define A r37 +#define B r38 +#define C r39 +#define LDC r35 + +#define I r15 +#define J r16 +#define AOFFSET r17 +#define BOFFSET r18 +#define TEMP r19 +#define L r20 + +#define C1 r21 +#define C2 r22 +#define C3 r23 +#define C4 r24 +#define C5 r25 +#define C6 r26 +#define C7 r27 +#define C8 r28 + +#define PREA r8 +#define PREB r9 +#define PREC r10 +#define SP r12 +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#define ALPHA_R f8 +#define ALPHA_I f9 + +#define AORIG loc0 +#define KK loc1 +#define KK8 loc2 +#define OFFSET loc3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) +#define FCALC_A FSUB +#define FCALC_B FADD +#define FMA_A FNMA +#define FMA_B FMA + +#else +#define FCALC_A FADD +#define FCALC_B FSUB +#define FMA_A FMA +#define FMA_B FNMA +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NC) || defined(TC) || defined(NR) || defined(TR) +#define FCALC_C FMA +#define FCALC_D FNMA +#else +#define FCALC_C FNMA +#define FCALC_D FMA +#endif + + PROLOGUE + .prologue + PROFCODE + + { .mfi +#ifdef TRMMKERNEL + .save ar.pfs, ARPFS + alloc ARPFS = ar.pfs, 8, 8, 0, 0 +#else + nop __LINE__ +#endif + mov f64 = f0 + adds r14 = 16, SP + } + { .mfi + nop __LINE__ + mov f65 = f0 + adds r15 = 24, SP + } + ;; + { .mfi + ld8 LDC = [r14] + mov f81 = f0 + mov PR = pr + } + { .mfi +#ifdef TRMMKERNEL + ld8 OFFSET = [r15] +#else + nop __LINE__ +#endif + mov f96 = f0 + shr J = N, 2 + } + ;; + { .mfi + shladd LDC = LDC, ZBASE_SHIFT, r0 + mov f97 = f0 + mov AOFFSET = A + } + { .mfi + nop __LINE__ + mov f113 = f0 +#if defined(TRMMKERNEL) && !defined(LEFT) + sub KK = r0, OFFSET +#endif + } + ;; + .body + { .mfi + nop __LINE__ + mov f80 = f0 + mov ARLC = ar.lc + } + { .mfb + cmp.ge p6, p0 = 0, J + mov f112 = f0 + (p6) br.cond.dpnt .L050 + } + ;; + .align 16 + +.L010: + { .mmi + mov C1 = C // coffset1 = c + 0 * ldc + add C2 = LDC, C // coffset2 = c + 1 * ldc + shr I = M, 2 + } + { .mmi + adds J = -1, J +#if defined(TRMMKERNEL) && defined(LEFT) + mov KK = OFFSET +#else + nop __LINE__ +#endif + nop __LINE__ + } + ;; + { .mmi + shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc + shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc +#ifdef TRMMKERNEL + shladd KK8 = KK, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mib + cmp.eq p6, p7 = 0, I + shladd C = LDC, 2, C // coffset += 8 * ldc + (p6) br.cond.dpnt .L020 + } + ;; + .align 16 + +.L011: +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfb + LDFPD f48, f49 = [B] + mov f66 = f0 + nop __LINE__ + } + { .mfb + adds BOFFSET = 2 * SIZE, B + mov f67 = f0 + nop __LINE__ + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 2, B + mov f66 = f0 + shladd AOFFSET = KK8, 2, AOFFSET + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + nop __LINE__ + } + ;; +#endif + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 4, KK +#else + adds L = 4, KK +#endif +#endif + } + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f98 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f99 = f0 + adds C5 = 4 * SIZE, C1 + } + ;; + { .mfi + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + mov f114 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f115 = f0 + adds C6 = 4 * SIZE, C2 + } + ;; + { .mfi + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + mov f68 = f0 + shr L = L, 1 + } + { .mfi + setf.d f86 = r0 + mov f69 = f0 + adds C7 = 4 * SIZE, C3 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f84 = f0 + adds L = -1, L + } + { .mfi + setf.d f87 = r0 + mov f85 = f0 + adds C8 = 4 * SIZE, C4 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f100 = f0 + mov ar.lc = L + } + { .mfi + setf.d f102 = r0 + mov f101 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f116 = f0 + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + } + { .mfi + setf.d f103 = r0 + mov f117 = f0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + ;; + { .mfi + CPREFETCH [PREC] + mov f70 = f0 + nop __LINE__ + } + { .mmf + setf.d f118 = r0 + setf.d f119 = r0 + mov f71 = f0 + } + ;; + .align 16 + +.L012: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p12) cmp.ne p3, p0 = 0, L + FMA_B f65 = f32, f49, f65 // A1 * B2 + nop __LINE__ + } + ;; +/* 2 */ + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + cmp.ne p4, p5 = 0, L + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; +/* 3 */ + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + FMA_B f97 = f32, f53, f97 // A1 * B6 + nop __LINE__ + } + ;; +/* 4 */ + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + FMA_B f113 = f32, f55, f113 // A1 * B8 + nop __LINE__ + } + ;; +/* 5 */ + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; +/* 6 */ + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; +/* 7 */ + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + FMA_A f96 = f33, f53, f96 // A2 * B6 + nop __LINE__ + } + ;; +/* 8 */ + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + FMA_A f112 = f33, f55, f112 // A2 * B8 + nop __LINE__ + } + ;; +/* 9 */ + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + FMA_B f67 = f34, f49, f67 // A3 * B2 + nop __LINE__ + } + ;; +/* 10 */ + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + FMA_B f83 = f34, f51, f83 // A3 * B4 + nop __LINE__ + } + ;; +/* 11 */ + { .mfb + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f99 = f34, f53, f99 // A3 * B6 + nop __LINE__ + } + ;; +/* 12 */ + { .mfb + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f115 = f34, f55, f115 // A3 * B8 + nop __LINE__ + } + ;; +/* 13 */ + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfb + nop __LINE__ + FMA_A f66 = f35, f49, f66 // A4 * B2 + nop __LINE__ + } + ;; +/* 14 */ + { .mfb + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f82 = f35, f51, f82 // A4 * B4 + nop __LINE__ + } + ;; +/* 15 */ + { .mfb + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f98 = f35, f53, f98 // A4 * B6 + nop __LINE__ + } + ;; +/* 16 */ + { .mfb + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f114 = f35, f55, f114 // A4 * B8 + nop __LINE__ + } + ;; +/* 17 */ + { .mfb + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f69 = f36, f49, f69 // A5 * B2 + nop __LINE__ + } + ;; +/* 18 */ + { .mfb + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f85 = f36, f51, f85 // A5 * B4 + nop __LINE__ + } + ;; +/* 19 */ + { .mfb + nop __LINE__ + FMA f100 = f36, f52, f100 // A5 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f101 = f36, f53, f101 // A5 * B6 + nop __LINE__ + } + ;; +/* 20 */ + { .mfb + nop __LINE__ + FMA f116 = f36, f54, f116 // A5 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f117 = f36, f55, f117 // A5 * B8 + nop __LINE__ + } + ;; +/* 21 */ + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f68 = f37, f49, f68 // A6 * B2 + nop __LINE__ + } + ;; +/* 22 */ + { .mfb + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f84 = f37, f51, f84 // A6 * B4 + nop __LINE__ + } + ;; +/* 23 */ + { .mfb + nop __LINE__ + FMA f101 = f37, f52, f101 // A6 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f100 = f37, f53, f100 // A6 * B6 + nop __LINE__ + } + ;; +/* 24 */ + { .mfb + nop __LINE__ + FMA f117 = f37, f54, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f116 = f37, f55, f116 // A6 * B8 + nop __LINE__ + } + ;; +/* 25 */ + { .mfb + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f71 = f38, f49, f71 // A7 * B2 + nop __LINE__ + } + ;; +/* 26 */ + { .mfb + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f87 = f38, f51, f87 // A7 * B4 + nop __LINE__ + } + ;; +/* 27 */ + { .mfb + nop __LINE__ + FMA f102 = f38, f52, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f103 = f38, f53, f103 // A7 * B6 + nop __LINE__ + } + ;; +/* 28 */ + { .mfb + nop __LINE__ + FMA f118 = f38, f54, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f119 = f38, f55, f119 // A7 * B8 + nop __LINE__ + } + ;; +/* 29 */ + { .mfb + nop __LINE__ + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f70 = f39, f49, f70 // A8 * B2 + nop __LINE__ + } + ;; +/* 30 */ + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f86 = f39, f51, f86 // A8 * B4 + nop __LINE__ + } + ;; +/* 31 */ + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f103 = f39, f52, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f102 = f39, f53, f102 // A8 * B6 + nop __LINE__ + } + ;; +/* 32 */ + { .mfb + nop __LINE__ + FMA f119 = f39, f54, f119 // A8 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f118 = f39, f55, f118 // A8 * B8 + nop __LINE__ + } + ;; +/* 33 */ + { .mfb + nop __LINE__ + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; +/* 34 */ + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; +/* 35 */ + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 + nop __LINE__ + } + ;; +/* 36 */ + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 + nop __LINE__ + } + ;; +/* 37 */ + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; +/* 38 */ + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; +/* 39 */ + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 + nop __LINE__ + } + ;; +/* 40 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f72 = [C1], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C5], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 + nop __LINE__ + } + ;; + /* 41 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f73 = [C1], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f77 = [C5], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f67 = f42, f57, f67 // A3 * B2 + nop __LINE__ + } + ;; +/* 42 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f74 = [C1], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f78 = [C5], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f83 = f42, f59, f83 // A3 * B4 + nop __LINE__ + } + ;; +/* 43 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f75 = [C1], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f79 = [C5], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f99 = f42, f61, f99 // A3 * B6 + nop __LINE__ + } + ;; +/* 44 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f88 = [C2], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f92 = [C6], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f115 = f42, f63, f115 // A3 * B8 + nop __LINE__ + } + ;; +/* 45 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f89 = [C2], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f93 = [C6], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f66 = f43, f57, f66 // A4 * B2 + nop __LINE__ + } + ;; +/* 46 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f90 = [C2], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f94 = [C6], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f82 = f43, f59, f82 // A4 * B4 + nop __LINE__ + } + ;; +/* 47 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f91 = [C2], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f95 = [C6], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f98 = f43, f61, f98 // A4 * B6 + nop __LINE__ + } + ;; +/* 48 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f104 = [C3], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f108 = [C7], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f114 = f43, f63, f114 // A4 * B8 + nop __LINE__ + } + ;; +/* 49 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f105 = [C3], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f109 = [C7], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f69 = f44, f57, f69 // A5 * B2 + nop __LINE__ + } + ;; +/* 50 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f106 = [C3], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f110 = [C7], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f85 = f44, f59, f85 // A5 * B4 + nop __LINE__ + } + ;; +/* 51 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f107 = [C3], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f100 = f44, f60, f100 // A5 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f111 = [C7], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f101 = f44, f61, f101 // A5 * B6 + nop __LINE__ + } + ;; +/* 52 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f120 = [C4], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f116 = f44, f62, f116 // A5 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f124 = [C8], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f117 = f44, f63, f117 // A5 * B8 + nop __LINE__ + } + ;; +/* 53 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f121 = [C4], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f125 = [C8], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f68 = f45, f57, f68 // A6 * B2 + nop __LINE__ + } + ;; +/* 54 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f122 = [C4], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f126 = [C8], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f84 = f45, f59, f84 // A6 * B4 + nop __LINE__ + } + ;; +/* 55 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f123 = [C4], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f101 = f45, f60, f101 // A6 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f127 = [C8], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f100 = f45, f61, f100 // A6 * B6 + nop __LINE__ + } + ;; +/* 56 */ + { .mfb + nop __LINE__ + (p3) FMA f117 = f45, f62, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f116 = f45, f63, f116 // A6 * B8 + nop __LINE__ + } + ;; +/* 57 */ + { .mfb + nop __LINE__ + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f71 = f46, f57, f71 // A7 * B2 + nop __LINE__ + } + ;; +/* 58 */ + { .mfb + nop __LINE__ + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f87 = f46, f59, f87 // A7 * B4 + nop __LINE__ + } + ;; +/* 59 */ + { .mfb + nop __LINE__ + (p3) FMA f102 = f46, f60, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f103 = f46, f61, f103 // A7 * B6 + nop __LINE__ + } + ;; +/* 60 */ + { .mfb + nop __LINE__ + (p3) FMA f118 = f46, f62, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f119 = f46, f63, f119 // A7 * B8 + nop __LINE__ + } + ;; +/* 61 */ + { .mfb + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f70 = f47, f57, f70 // A8 * B2 + nop __LINE__ + } + ;; +/* 62 */ + { .mfb + nop __LINE__ + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f86 = f47, f59, f86 // A8 * B4 + nop __LINE__ + } + ;; +/* 63 */ + { .mfb + nop __LINE__ + (p3) FMA f103 = f47, f60, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f102 = f47, f61, f102 // A8 * B6 + nop __LINE__ + } + ;; +/* 64 */ + { .mfi + nop __LINE__ + (p3) FMA f119 = f47, f62, f119 // A8 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f118 = f47, f63, f118 // A8 * B8 + br.cloop.sptk.few .L012 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + nop __LINE__ + FMA f72 = ALPHA_R, f64, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = ALPHA_R, f68, f76 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f77 = ALPHA_R, f69, f77 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f74 = ALPHA_R, f66, f74 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = ALPHA_R, f70, f78 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f75 = ALPHA_R, f67, f75 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f79 = ALPHA_R, f71, f79 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f76 = ALPHA_I, f69, f76 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = ALPHA_I, f68, f77 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f74 = ALPHA_I, f67, f74 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f78 = ALPHA_I, f71, f78 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f75 = ALPHA_I, f66, f75 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = ALPHA_I, f70, f79 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f72, SIZE + FMA f88 = ALPHA_R, f80, f88 + nop __LINE__ + } + { .mfb + STFD [C5] = f76, SIZE + FMA f92 = ALPHA_R, f84, f92 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f73, SIZE + FCALC_C f89 = ALPHA_R, f81, f89 + nop __LINE__ + } + { .mfb + STFD [C5] = f77, SIZE + FCALC_C f93 = ALPHA_R, f85, f93 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f74, SIZE + FMA f90 = ALPHA_R, f82, f90 + nop __LINE__ + } + { .mfb + STFD [C5] = f78, SIZE + FMA f94 = ALPHA_R, f86, f94 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f75, 5 * SIZE + FCALC_C f91 = ALPHA_R, f83, f91 + nop __LINE__ + } + { .mfb + STFD [C5] = f79, 5 * SIZE + FCALC_C f95 = ALPHA_R, f87, f95 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f88 = ALPHA_I, f81, f88 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f92 = ALPHA_I, f85, f92 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f89 = ALPHA_I, f80, f89 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f93 = ALPHA_I, f84, f93 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f90 = ALPHA_I, f83, f90 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f94 = ALPHA_I, f87, f94 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f91 = ALPHA_I, f82, f91 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f95 = ALPHA_I, f86, f95 + nop __LINE__ + } + ;; + { .mfb + STFD [C2] = f88, SIZE + FMA f104 = ALPHA_R, f96, f104 + nop __LINE__ + } + { .mfb + STFD [C6] = f92, SIZE + FMA f108 = ALPHA_R, f100, f108 + nop __LINE__ + } + ;; + { .mfb + STFD [C2] = f89, SIZE + FCALC_C f105 = ALPHA_R, f97, f105 + nop __LINE__ + } + { .mfb + STFD [C6] = f93, SIZE + FCALC_C f109 = ALPHA_R, f101, f109 + nop __LINE__ + } + ;; + { .mfb + STFD [C2] = f90, SIZE + FMA f106 = ALPHA_R, f98, f106 + nop __LINE__ + } + { .mfb + STFD [C6] = f94, SIZE + FMA f110 = ALPHA_R, f102, f110 + nop __LINE__ + } + ;; + { .mfb + STFD [C2] = f91, 5 * SIZE + FCALC_C f107 = ALPHA_R, f99, f107 + nop __LINE__ + } + { .mfb + STFD [C6] = f95, 5 * SIZE + FCALC_C f111 = ALPHA_R, f103, f111 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f104 = ALPHA_I, f97, f104 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f108 = ALPHA_I, f101, f108 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f105 = ALPHA_I, f96, f105 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f109 = ALPHA_I, f100, f109 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f106 = ALPHA_I, f99, f106 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f110 = ALPHA_I, f103, f110 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f107 = ALPHA_I, f98, f107 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f111 = ALPHA_I, f102, f111 + nop __LINE__ + } + ;; + { .mfb + STFD [C3] = f104, SIZE + FMA f120 = ALPHA_R, f112, f120 + nop __LINE__ + } + { .mfb + STFD [C7] = f108, SIZE + FMA f124 = ALPHA_R, f116, f124 + nop __LINE__ + } + ;; + { .mfb + STFD [C3] = f105, SIZE + FCALC_C f121 = ALPHA_R, f113, f121 + nop __LINE__ + } + { .mfb + STFD [C7] = f109, SIZE + FCALC_C f125 = ALPHA_R, f117, f125 + nop __LINE__ + } + ;; + { .mfb + STFD [C3] = f106, SIZE + FMA f122 = ALPHA_R, f114, f122 + nop __LINE__ + } + { .mfb + STFD [C7] = f110, SIZE + FMA f126 = ALPHA_R, f118, f126 + nop __LINE__ + } + ;; + { .mfb + STFD [C3] = f107, 5 * SIZE + FCALC_C f123 = ALPHA_R, f115, f123 + nop __LINE__ + } + { .mfb + STFD [C7] = f111, 5 * SIZE + FCALC_C f127 = ALPHA_R, f119, f127 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f120 = ALPHA_I, f113, f120 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f124 = ALPHA_I, f117, f124 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f121 = ALPHA_I, f112, f121 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f125 = ALPHA_I, f116, f125 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f122 = ALPHA_I, f115, f122 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f126 = ALPHA_I, f119, f126 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA f123 = ALPHA_I, f114, f123 + cmp.ne p6, p0 = 1, I + } + { .mfb + nop __LINE__ + FMA f127 = ALPHA_I, f118, f127 + nop __LINE__ + } + ;; + { .mfi + STFD [C4] = f120, SIZE + mov f64 = f0 + adds I = -1, I + } + { .mfb + STFD [C8] = f124, SIZE + mov f65 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4] = f121, SIZE + mov f80 = f0 + and TEMP = 3, M + } + { .mfb + STFD [C8] = f125, SIZE + mov f81 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4] = f122, SIZE + mov f96 = f0 + cmp.ne p8, p9 = r0, TEMP + } + { .mfb + STFD [C8] = f126, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C4] = f123, 5 * SIZE + mov f112 = f0 + nop __LINE__ + } + { .mfb + STFD [C8] = f127, 5 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#else + { .mfb + nop __LINE__ + FMPY f72 = ALPHA_R, f64 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f76 = ALPHA_R, f68 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f77 = ALPHA_R, f69, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f74 = ALPHA_R, f66 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f78 = ALPHA_R, f70 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f75 = ALPHA_R, f67, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f79 = ALPHA_R, f71, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f76 = ALPHA_I, f69, f76 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = ALPHA_I, f68, f77 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f74 = ALPHA_I, f67, f74 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f78 = ALPHA_I, f71, f78 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f75 = ALPHA_I, f66, f75 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = ALPHA_I, f70, f79 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f72, SIZE + FMPY f88 = ALPHA_R, f80 + nop __LINE__ + } + { .mfb + STFD [C5] = f76, SIZE + FMPY f92 = ALPHA_R, f84 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f73, SIZE + FCALC_C f89 = ALPHA_R, f81, f0 + nop __LINE__ + } + { .mfb + STFD [C5] = f77, SIZE + FCALC_C f93 = ALPHA_R, f85, f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f74, SIZE + FMPY f90 = ALPHA_R, f82 + nop __LINE__ + } + { .mfb + STFD [C5] = f78, SIZE + FMPY f94 = ALPHA_R, f86 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f75, 5 * SIZE + FCALC_C f91 = ALPHA_R, f83, f0 + nop __LINE__ + } + { .mfb + STFD [C5] = f79, 5 * SIZE + FCALC_C f95 = ALPHA_R, f87, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f88 = ALPHA_I, f81, f88 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f92 = ALPHA_I, f85, f92 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f89 = ALPHA_I, f80, f89 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f93 = ALPHA_I, f84, f93 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f90 = ALPHA_I, f83, f90 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f94 = ALPHA_I, f87, f94 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f91 = ALPHA_I, f82, f91 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f95 = ALPHA_I, f86, f95 + nop __LINE__ + } + ;; + { .mfb + STFD [C2] = f88, SIZE + FMPY f104 = ALPHA_R, f96 + nop __LINE__ + } + { .mfb + STFD [C6] = f92, SIZE + FMPY f108 = ALPHA_R, f100 + nop __LINE__ + } + ;; + { .mfb + STFD [C2] = f89, SIZE + FCALC_C f105 = ALPHA_R, f97, f0 + nop __LINE__ + } + { .mfb + STFD [C6] = f93, SIZE + FCALC_C f109 = ALPHA_R, f101, f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C2] = f90, SIZE + FMPY f106 = ALPHA_R, f98 + nop __LINE__ + } + { .mfb + STFD [C6] = f94, SIZE + FMPY f110 = ALPHA_R, f102 + nop __LINE__ + } + ;; + { .mfb + STFD [C2] = f91, 5 * SIZE + FCALC_C f107 = ALPHA_R, f99, f0 + nop __LINE__ + } + { .mfb + STFD [C6] = f95, 5 * SIZE + FCALC_C f111 = ALPHA_R, f103, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f104 = ALPHA_I, f97, f104 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f108 = ALPHA_I, f101, f108 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f105 = ALPHA_I, f96, f105 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f109 = ALPHA_I, f100, f109 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f106 = ALPHA_I, f99, f106 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f110 = ALPHA_I, f103, f110 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f107 = ALPHA_I, f98, f107 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f111 = ALPHA_I, f102, f111 + nop __LINE__ + } + ;; + { .mfb + STFD [C3] = f104, SIZE + FMPY f120 = ALPHA_R, f112 + nop __LINE__ + } + { .mfb + STFD [C7] = f108, SIZE + FMPY f124 = ALPHA_R, f116 + nop __LINE__ + } + ;; + { .mfb + STFD [C3] = f105, SIZE + FCALC_C f121 = ALPHA_R, f113, f0 + nop __LINE__ + } + { .mfb + STFD [C7] = f109, SIZE + FCALC_C f125 = ALPHA_R, f117, f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C3] = f106, SIZE + FMPY f122 = ALPHA_R, f114 + nop __LINE__ + } + { .mfb + STFD [C7] = f110, SIZE + FMPY f126 = ALPHA_R, f118 + nop __LINE__ + } + ;; + { .mfb + STFD [C3] = f107, 5 * SIZE + FCALC_C f123 = ALPHA_R, f115, f0 + nop __LINE__ + } + { .mfb + STFD [C7] = f111, 5 * SIZE + FCALC_C f127 = ALPHA_R, f119, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f120 = ALPHA_I, f113, f120 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f124 = ALPHA_I, f117, f124 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA f121 = ALPHA_I, f112, f121 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + nop __LINE__ + FMA f125 = ALPHA_I, f116, f125 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FCALC_D f122 = ALPHA_I, f115, f122 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FCALC_D f126 = ALPHA_I, f119, f126 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + nop __LINE__ + FMA f123 = ALPHA_I, f114, f123 + cmp.ne p6, p0 = 1, I + } + { .mfi + nop __LINE__ + FMA f127 = ALPHA_I, f118, f127 + adds I = -1, I + } + ;; + { .mfi + STFD [C4] = f120, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C8] = f124, SIZE + mov f65 = f0 + and TEMP = 3, M + } + ;; + { .mfi + STFD [C4] = f121, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 2, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C8] = f125, SIZE + mov f81 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C4] = f122, SIZE + mov f96 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C8] = f126, SIZE + mov f97 = f0 + cmp.ne p8, p9 = r0, TEMP + } + ;; + { .mfi + STFD [C4] = f123, 5 * SIZE + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C8] = f127, 5 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#endif + +.L020: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 2, KK +#else + adds L = 4, KK +#endif +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L030 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfb + LDFPD f48, f49 = [B] + mov f66 = f0 + nop __LINE__ + } + { .mfi + adds BOFFSET = 2 * SIZE, B + mov f67 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 2, B + mov f66 = f0 + shladd AOFFSET = KK8, 1, AOFFSET + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#endif + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 + shr L = L, 1 + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f98 = f0 + adds L = -1, L + } + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f99 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f114 = f0 + mov ar.lc = L + } + { .mfi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov f115 = f0 + nop __LINE__ + } + ;; + .align 16 + +.L022: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f97 = f32, f53, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f32, f55, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f33, f53, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f33, f55, f112 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f67 = f34, f49, f67 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f83 = f34, f51, f83 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f99 = f34, f53, f99 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f115 = f34, f55, f115 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f66 = f35, f49, f66 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f82 = f35, f51, f82 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f98 = f35, f53, f98 // A4 * B6 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f114 = f35, f55, f114 // A4 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f72 = [C1], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f88 = [C2], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f73 = [C1], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f89 = [C2], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f74 = [C1], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f90 = [C2], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f75 = [C1], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f91 = [C2], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f104 = [C3], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f120 = [C4], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f67 = f42, f57, f67 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f105 = [C3], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f121 = [C4], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f83 = f42, f59, f83 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f106 = [C3], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f122 = [C4], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f99 = f42, f61, f99 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f107 = [C3], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f123 = [C4], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f115 = f42, f63, f115 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f66 = f43, f57, f66 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f82 = f43, f59, f82 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f98 = f43, f61, f98 // A4 * B6 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f114 = f43, f63, f114 // A4 * B8 + br.cloop.sptk.few .L022 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + nop __LINE__ + FMA f72 = ALPHA_R, f64, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f88 = ALPHA_R, f80, f88 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f89 = ALPHA_R, f81, f89 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f74 = ALPHA_R, f66, f74 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = ALPHA_R, f82, f90 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f75 = ALPHA_R, f67, f75 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f91 = ALPHA_R, f83, f91 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f88 = ALPHA_I, f81, f88 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + { .mfb + FMA f89 = ALPHA_I, f80, f89 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f74 = ALPHA_I, f67, f74 + nop __LINE__ + } + { .mfb + FCALC_D f90 = ALPHA_I, f83, f90 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = ALPHA_I, f66, f75 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = ALPHA_I, f82, f91 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f72, SIZE + FMA f104 = ALPHA_R, f96, f104 + nop __LINE__ + } + { .mfb + STFD [C2] = f88, SIZE + FMA f120 = ALPHA_R, f112, f120 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f73, SIZE + FCALC_C f105 = ALPHA_R, f97, f105 + nop __LINE__ + } + { .mfb + STFD [C2] = f89, SIZE + FCALC_C f121 = ALPHA_R, f113, f121 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f74, SIZE + FMA f106 = ALPHA_R, f98, f106 + nop __LINE__ + } + { .mfb + STFD [C2] = f90, SIZE + FMA f122 = ALPHA_R, f114, f122 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f75, SIZE + FCALC_C f107 = ALPHA_R, f99, f107 + nop __LINE__ + } + { .mfb + STFD [C2] = f91, SIZE + FCALC_C f123 = ALPHA_R, f115, f123 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f104 = ALPHA_I, f97, f104 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f120 = ALPHA_I, f113, f120 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f105 = ALPHA_I, f96, f105 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = ALPHA_I, f112, f121 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f106 = ALPHA_I, f99, f106 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f122 = ALPHA_I, f115, f122 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f107 = ALPHA_I, f98, f107 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f123 = ALPHA_I, f114, f123 + nop __LINE__ + } + ;; + { .mfb + STFD [C3] = f104, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfi + STFD [C4] = f120, SIZE + mov f65 = f0 + } + ;; + { .mfb + STFD [C3] = f105, SIZE + mov f80 = f0 + nop __LINE__ + } + { .mfi + STFD [C4] = f121, SIZE + mov f81 = f0 + } + ;; + { .mfb + STFD [C3] = f106, SIZE + mov f96 = f0 + nop __LINE__ + } + { .mfi + STFD [C4] = f122, SIZE + mov f97 = f0 + } + ;; + { .mfi + STFD [C3] = f107, SIZE + mov f112 = f0 + } + { .mfb + STFD [C4] = f123, SIZE + mov f113 = f0 + nop __LINE__ + } + ;; +#else + { .mfb + nop __LINE__ + FMPY f72 = ALPHA_R, f64 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f88 = ALPHA_R, f80 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f89 = ALPHA_R, f81, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f74 = ALPHA_R, f66 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f90 = ALPHA_R, f82 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f75 = ALPHA_R, f67, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f91 = ALPHA_R, f83, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f88 = ALPHA_I, f81, f88 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + { .mfb + FMA f89 = ALPHA_I, f80, f89 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f74 = ALPHA_I, f67, f74 + nop __LINE__ + } + { .mfb + FCALC_D f90 = ALPHA_I, f83, f90 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f75 = ALPHA_I, f66, f75 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = ALPHA_I, f82, f91 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f72, SIZE + FMPY f104 = ALPHA_R, f96 + nop __LINE__ + } + { .mfb + STFD [C2] = f88, SIZE + FMPY f120 = ALPHA_R, f112 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f73, SIZE + FCALC_C f105 = ALPHA_R, f97, f0 + nop __LINE__ + } + { .mfb + STFD [C2] = f89, SIZE + FCALC_C f121 = ALPHA_R, f113, f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f74, SIZE + FMPY f106 = ALPHA_R, f98 + nop __LINE__ + } + { .mfb + STFD [C2] = f90, SIZE + FMPY f122 = ALPHA_R, f114 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f75, SIZE + FCALC_C f107 = ALPHA_R, f99, f0 + nop __LINE__ + } + { .mfb + STFD [C2] = f91, SIZE + FCALC_C f123 = ALPHA_R, f115, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f104 = ALPHA_I, f97, f104 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f120 = ALPHA_I, f113, f120 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f105 = ALPHA_I, f96, f105 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = ALPHA_I, f112, f121 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FCALC_D f106 = ALPHA_I, f99, f106 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + nop __LINE__ + FCALC_D f122 = ALPHA_I, f115, f122 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA f107 = ALPHA_I, f98, f107 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMA f123 = ALPHA_I, f114, f123 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C3] = f104, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C4] = f120, SIZE + mov f65 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C3] = f105, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 1, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C4] = f121, SIZE + mov f81 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C3] = f106, SIZE + mov f96 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C4] = f122, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C3] = f107, SIZE + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C4] = f123, SIZE + mov f113 = f0 + nop __LINE__ + } + ;; +#endif + .align 16 + +.L030: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 1, KK +#else + adds L = 4, KK +#endif +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L049 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfb + LDFPD f48, f49 = [B] + mov f72 = f0 + nop __LINE__ + } + { .mfi + adds BOFFSET = 2 * SIZE, B + mov f73 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 2, B + mov f72 = f0 + add AOFFSET = KK8, AOFFSET + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#endif + { .mmi + nop __LINE__ + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + tbit.z p12, p0 = L, 0 + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f88 = f0 + shr L = L, 1 + } + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f89 = f0 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f104 = f0 + adds L = -1, L + } + { .mfb + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov f105 = f0 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f120 = f0 + mov ar.lc = L + } + { .mfi + cmp.eq p3, p0 = r0, r0 + mov f121 = f0 + nop __LINE__ + } + ;; + .align 16 + +.L032: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f97 = f32, f53, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f32, f55, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f33, f53, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f33, f55, f112 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f72 = [C1], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f88 = [C2], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f73 = [C1], - SIZE +#else + nop __LINE__ +#endif + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f89 = [C2], - SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f104 = [C3], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f120 = [C4], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f105 = [C3], - SIZE +#else + nop __LINE__ +#endif + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f121 = [C4], - SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 + br.cloop.sptk.few .L032 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + nop __LINE__ + FMA f72 = ALPHA_R, f64, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f88 = ALPHA_R, f80, f88 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f89 = ALPHA_R, f81, f89 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f104 = ALPHA_R, f96, f104 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f120 = ALPHA_R, f112, f120 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f105 = ALPHA_R, f97, f105 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f121 = ALPHA_R, f113, f121 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f88 = ALPHA_I, f81, f88 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + { .mfb + FMA f89 = ALPHA_I, f80, f89 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f104 = ALPHA_I, f97, f104 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f120 = ALPHA_I, f113, f120 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f105 = ALPHA_I, f96, f105 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = ALPHA_I, f112, f121 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f72, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C2] = f88, SIZE + mov f65 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f73, SIZE + mov f80 = f0 + nop __LINE__ + } + { .mfb + STFD [C2] = f89, SIZE + mov f81 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C3] = f104, SIZE + mov f96 = f0 + nop __LINE__ + } + { .mfi + STFD [C4] = f120, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C3] = f105, SIZE + mov f112 = f0 + nop __LINE__ + } + { .mfi + STFD [C4] = f121, SIZE + mov f113 = f0 + nop __LINE__ + } + ;; +#else + { .mfb + nop __LINE__ + FMA f72 = ALPHA_R, f64, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f88 = ALPHA_R, f80, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f89 = ALPHA_R, f81, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f104 = ALPHA_R, f96, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f120 = ALPHA_R, f112, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f105 = ALPHA_R, f97, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f121 = ALPHA_R, f113, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f88 = ALPHA_I, f81, f88 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + { .mfb + FMA f89 = ALPHA_I, f80, f89 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FCALC_D f104 = ALPHA_I, f97, f104 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + nop __LINE__ + FCALC_D f120 = ALPHA_I, f113, f120 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA f105 = ALPHA_I, f96, f105 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMA f121 = ALPHA_I, f112, f121 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1] = f72, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C2] = f88, SIZE + mov f65 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1] = f73, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add AOFFSET = KK8, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C2] = f89, SIZE + mov f81 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C3] = f104, SIZE + mov f96 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 1, KK +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C4] = f120, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C3] = f105, SIZE + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C4] = f121, SIZE + mov f113 = f0 + nop __LINE__ + } + ;; +#endif + .align 16 + +.L049: + { .mmi + mov B = BOFFSET + mov AOFFSET = A +#if defined(TRMMKERNEL) && !defined(LEFT) + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + { .mmb + nop __LINE__ + cmp.lt p6, p0 = 0, J + (p6) br.cond.dptk .L010 + } + ;; + .align 16 + +.L050: + { .mmi +#if defined(TRMMKERNEL) && defined(LEFT) + mov KK = OFFSET +#else + nop __LINE__ +#endif + shr I = M, 2 + } + { .mib + mov C1 = C + tbit.z p6, p0 = N, 1 + (p6) br.cond.dpnt .L090 + } + ;; + { .mmi + add C2 = LDC, C +#ifdef TRMMKERNEL + shladd KK8 = KK, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + nop __LINE__ + } + { .mib + cmp.eq p6, p7 = 0, I + shladd C = LDC, 1, C + (p6) br.cond.dpnt .L060 + } + ;; + .align 16 + +.L052: +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfi + LDFPD f48, f49 = [B] + mov f66 = f0 + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + { .mfb + adds BOFFSET = 2 * SIZE, B + mov f67 = f0 + nop __LINE__ + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 1, B + mov f66 = f0 + shladd AOFFSET = KK8, 2, AOFFSET + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; +#endif + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 4, KK +#else + adds L = 2, KK +#endif +#endif + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f98 = f0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + { .mfi + cmp.eq p3, p0 = r0, r0 + mov f99 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; + { .mfi + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + mov f114 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + CPREFETCH [PREC], LDC + mov f115 = f0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + adds C5 = 4 * SIZE, C1 + adds L = -1, L + } + ;; + { .mmi + CPREFETCH [PREC], LDC + adds C6 = 4 * SIZE, C2 + mov ar.lc = L + } + ;; + .align 16 + +.L053: + { .mfb + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f96 = f34, f48, f96 // A3 * B1 + nop __LINE__ + } + { .mfi + FMA_B f97 = f34, f49, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f112 = f34, f50, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f34, f51, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f97 = f35, f48, f97 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f35, f49, f96 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f113 = f35, f50, f113 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f35, f51, f112 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f66 = f36, f48, f66 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f67 = f36, f49, f67 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f82 = f36, f50, f82 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f83 = f36, f51, f83 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f98 = f38, f48, f98 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f99 = f38, f49, f99 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f114 = f38, f50, f114 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f115 = f38, f51, f115 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f67 = f37, f48, f67 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f66 = f37, f49, f66 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f83 = f37, f50, f83 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f82 = f37, f51, f82 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f99 = f39, f48, f99 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f98 = f39, f49, f98 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f115 = f39, f50, f115 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f114 = f39, f51, f114 // A8 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f96 = f42, f56, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f42, f57, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f112 = f42, f58, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f42, f59, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f72 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C5 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f73 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f77 = [C5 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f74 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f97 = f43, f56, f97 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f78 = [C5 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f96 = f43, f57, f96 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f75 = [C1 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f113 = f43, f58, f113 // A4 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f79 = [C5 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f112 = f43, f59, f112 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f88 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f66 = f44, f56, f66 // A5 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f92 = [C6 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f67 = f44, f57, f67 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f89 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f82 = f44, f58, f82 // A5 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f93 = [C6 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f83 = f44, f59, f83 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f90 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f98 = f46, f56, f98 // A7 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f94 = [C6 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f99 = f46, f57, f99 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f91 = [C2 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f114 = f46, f58, f114 // A7 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f95 = [C6 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f115 = f46, f59, f115 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f45, f56, f67 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f66 = f45, f57, f66 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f83 = f45, f58, f83 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f82 = f45, f59, f82 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f99 = f47, f56, f99 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f98 = f47, f57, f98 // A8 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f115 = f47, f58, f115 // A8 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f114 = f47, f59, f114 // A8 * B4 + br.cloop.sptk.few .L053 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + nop __LINE__ + FMA f72 = ALPHA_R, f64, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = ALPHA_R, f66, f76 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f77 = ALPHA_R, f67, f77 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f74 = ALPHA_R, f96, f74 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = ALPHA_R, f98, f78 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f75 = ALPHA_R, f97, f75 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f79 = ALPHA_R, f99, f79 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f76 = ALPHA_I, f67, f76 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = ALPHA_I, f66, f77 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f74 = ALPHA_I, f97, f74 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f78 = ALPHA_I, f99, f78 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = ALPHA_I, f96, f75 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = ALPHA_I, f98, f79 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f72, SIZE + FMA f88 = ALPHA_R, f80, f88 + nop __LINE__ + } + { .mfb + STFD [C5] = f76, SIZE + FMA f92 = ALPHA_R, f82, f92 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f73, SIZE + FCALC_C f89 = ALPHA_R, f81, f89 + nop __LINE__ + } + { .mfb + STFD [C5] = f77, SIZE + FCALC_C f93 = ALPHA_R, f83, f93 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f74, SIZE + FMA f90 = ALPHA_R, f112, f90 + nop __LINE__ + } + { .mfb + STFD [C5] = f78, SIZE + FMA f94 = ALPHA_R, f114, f94 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f75, 5 * SIZE + FCALC_C f91 = ALPHA_R, f113, f91 + nop __LINE__ + } + { .mfb + STFD [C5] = f79, 5 * SIZE + FCALC_C f95 = ALPHA_R, f115, f95 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f88 = ALPHA_I, f81, f88 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f92 = ALPHA_I, f83, f92 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f89 = ALPHA_I, f80, f89 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f93 = ALPHA_I, f82, f93 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f90 = ALPHA_I, f113, f90 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f94 = ALPHA_I, f115, f94 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA f91 = ALPHA_I, f112, f91 + cmp.ne p6, p0 = 1, I + } + { .mfb + nop __LINE__ + FMA f95 = ALPHA_I, f114, f95 + nop __LINE__ + } + ;; + { .mfb + STFD [C2] = f88, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C6] = f92, SIZE + mov f65 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2] = f89, SIZE + mov f80 = f0 + adds I = -1, I + } + { .mfb + STFD [C6] = f93, SIZE + mov f81 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C2] = f90, SIZE + mov f96 = f0 + nop __LINE__ + } + { .mfb + STFD [C6] = f94, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C2] = f91, 5 * SIZE + mov f112 = f0 + nop __LINE__ + } + { .mfb + STFD [C6] = f95, 5 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L052 + } + ;; +#else + { .mfb + nop __LINE__ + FMA f72 = ALPHA_R, f64, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = ALPHA_R, f66, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f77 = ALPHA_R, f67, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f74 = ALPHA_R, f96, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = ALPHA_R, f98, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f75 = ALPHA_R, f97, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f79 = ALPHA_R, f99, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f76 = ALPHA_I, f67, f76 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = ALPHA_I, f66, f77 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f74 = ALPHA_I, f97, f74 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f78 = ALPHA_I, f99, f78 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f75 = ALPHA_I, f96, f75 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = ALPHA_I, f98, f79 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f72, SIZE + FMA f88 = ALPHA_R, f80, f0 + nop __LINE__ + } + { .mfb + STFD [C5] = f76, SIZE + FMA f92 = ALPHA_R, f82, f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f73, SIZE + FCALC_C f89 = ALPHA_R, f81, f0 + nop __LINE__ + } + { .mfb + STFD [C5] = f77, SIZE + FCALC_C f93 = ALPHA_R, f83, f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f74, SIZE + FMA f90 = ALPHA_R, f112, f0 + nop __LINE__ + } + { .mfb + STFD [C5] = f78, SIZE + FMA f94 = ALPHA_R, f114, f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f75, 5 * SIZE + FCALC_C f91 = ALPHA_R, f113, f0 + nop __LINE__ + } + { .mfb + STFD [C5] = f79, 5 * SIZE + FCALC_C f95 = ALPHA_R, f115, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f88 = ALPHA_I, f81, f88 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f92 = ALPHA_I, f83, f92 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f89 = ALPHA_I, f80, f89 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f93 = ALPHA_I, f82, f93 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FCALC_D f90 = ALPHA_I, f113, f90 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FCALC_D f94 = ALPHA_I, f115, f94 + cmp.ne p6, p0 = 1, I + } + ;; + { .mfi + nop __LINE__ + FMA f91 = ALPHA_I, f112, f91 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMA f95 = ALPHA_I, f114, f95 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C2] = f88, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C6] = f92, SIZE + mov f65 = f0 + adds I = -1, I + } + ;; + { .mfi + STFD [C2] = f89, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 2, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C6] = f93, SIZE + mov f81 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C2] = f90, SIZE + mov f96 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C6] = f94, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2] = f91, 5 * SIZE + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C6] = f95, 5 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L052 + } + ;; +#endif + .align 16 + +.L060: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 2, KK +#else + adds L = 2, KK +#endif +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L070 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFPD f48, f49 = [B] + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + nop __LINE__ + } + { .mmi + adds BOFFSET = 2 * SIZE, B + cmp.eq p3, p0 = r0, r0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + shladd BOFFSET = KK8, 1, B + shladd AOFFSET = KK8, 1, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#endif + { .mmi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + tbit.z p12, p0 = L, 0 + } + { .mmi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + shr L = L, 1 + } + ;; + { .mmi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + nop __LINE__ + nop __LINE__ + mov ar.lc = L + } + ;; + .align 16 + +.L062: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfb + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f34, f48, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f97 = f34, f49, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f34, f50, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f34, f51, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f97 = f35, f48, f97 // A4 * B1 + } + { .mfb + FMA_A f96 = f35, f49, f96 // A4 * B2 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f113 = f35, f50, f113 // A4 * B3 + nop __LINE__ + } + { .mfb + FMA_A f112 = f35, f51, f112 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f72 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f96 = f42, f56, f96 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f88 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f97 = f42, f57, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f73 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f112 = f42, f58, f112 // A3 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f89 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f113 = f42, f59, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f74 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f90 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f75 = [C1 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f91 = [C2 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f43, f56, f97 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f43, f57, f96 // A4 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f113 = f43, f58, f113 // A4 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f43, f59, f112 // A4 * B4 + br.cloop.sptk.few .L062 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + nop __LINE__ + FMA f72 = ALPHA_R, f64, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f88 = ALPHA_R, f80, f88 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f89 = ALPHA_R, f81, f89 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f74 = ALPHA_R, f96, f74 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = ALPHA_R, f112, f90 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f75 = ALPHA_R, f97, f75 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f91 = ALPHA_R, f113, f91 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f88 = ALPHA_I, f81, f88 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f89 = ALPHA_I, f80, f89 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f74 = ALPHA_I, f97, f74 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f90 = ALPHA_I, f113, f90 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f75 = ALPHA_I, f96, f75 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = ALPHA_I, f112, f91 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f72, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C2] = f88, SIZE + mov f65 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f73, SIZE + mov f80 = f0 + nop __LINE__ + } + { .mfb + STFD [C2] = f89, SIZE + mov f81 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1] = f74, SIZE + mov f96 = f0 + adds L = 1, K + } + { .mfb + STFD [C2] = f90, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1] = f75, SIZE + mov f112 = f0 + shr L = L, 1 + } + { .mfb + STFD [C2] = f91, SIZE + mov f113 = f0 + nop __LINE__ + } + ;; +#else + { .mfb + nop __LINE__ + FMA f72 = ALPHA_R, f64, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f88 = ALPHA_R, f80, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f89 = ALPHA_R, f81, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f74 = ALPHA_R, f96, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = ALPHA_R, f112, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f75 = ALPHA_R, f97, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f91 = ALPHA_R, f113, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f88 = ALPHA_I, f81, f88 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f89 = ALPHA_I, f80, f89 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FCALC_D f74 = ALPHA_I, f97, f74 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + nop __LINE__ + FCALC_D f90 = ALPHA_I, f113, f90 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA f75 = ALPHA_I, f96, f75 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMA f91 = ALPHA_I, f112, f91 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1] = f72, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C2] = f88, SIZE + mov f65 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1] = f73, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 1, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C2] = f89, SIZE + mov f81 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1] = f74, SIZE + mov f96 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C2] = f90, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1] = f75, SIZE + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C2] = f91, SIZE + mov f113 = f0 + nop __LINE__ + } + ;; +#endif + .align 16 + +.L070: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 1, KK +#else + adds L = 2, KK +#endif +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L089 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + shladd BOFFSET = KK8, 1, B + add AOFFSET = KK8, AOFFSET + nop __LINE__ + } + ;; + { .mmi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#endif + ;; + { .mii + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds L = -1, L + } + ;; + { .mmi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + .align 16 + +.L072: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f96 = f32, f49, f96 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f112 = f32, f51, f112 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mfi + nop __LINE__ + FMA f97 = f33, f49, f97 // A2 * B2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f113 = f33, f51, f113 // A2 * B4 + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f96 = f40, f57, f96 // A1 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mmf +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f72 = [C1 ], SIZE + (p5) LDFD f88 = [C2 ], SIZE +#else + nop __LINE__ + nop __LINE__ +#endif + (p3) FMA f112 = f40, f59, f112 // A1 * B4 + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f73 = [C1 ], - SIZE +#else + nop __LINE__ +#endif + (p3) FMA f97 = f41, f57, f97 // A2 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f89 = [C2 ], - SIZE +#else + nop __LINE__ +#endif + (p3) FMA f113 = f41, f59, f113 // A2 * B4 + br.cloop.sptk.few .L072 + } + ;; + { .mfb + nop __LINE__ + FCALC_A f64 = f64, f97 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_A f80 = f80, f113 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f65 = f65, f96 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f81 = f81, f112 + nop __LINE__ + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + setf.d f96 = r0 + FMA f72 = ALPHA_R, f64, f72 + nop __LINE__ + } + { .mfb + setf.d f97 = r0 + FMA f88 = ALPHA_R, f80, f88 + nop __LINE__ + } + ;; + { .mfb + setf.d f112 = r0 + FCALC_C f73 = ALPHA_R, f65, f73 + nop __LINE__ + } + { .mfb + setf.d f113 = r0 + FCALC_C f89 = ALPHA_R, f81, f89 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + setf.d f65 = r0 + FCALC_D f88 = ALPHA_I, f81, f88 + nop __LINE__ + } + ;; + { .mfb + setf.d f81 = r0 + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + { .mfb + setf.d f64 = r0 + FMA f89 = ALPHA_I, f80, f89 + nop __LINE__ + } + ;; + { .mmf + STFD [C1] = f72, SIZE + STFD [C2] = f88, SIZE + mov f80 = f0 + } + ;; + { .mmi + STFD [C1] = f73, SIZE + STFD [C2] = f89, SIZE + mov B = BOFFSET + } + ;; +#else + { .mfi + setf.d f96 = r0 + FMA f72 = ALPHA_R, f64, f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + setf.d f97 = r0 + FMA f88 = ALPHA_R, f80, f0 + nop __LINE__ + } + ;; + { .mfi + setf.d f112 = r0 + FCALC_C f73 = ALPHA_R, f65, f0 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif + } + { .mfi + setf.d f113 = r0 + FCALC_C f89 = ALPHA_R, f81, f0 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + setf.d f65 = r0 + FCALC_D f88 = ALPHA_I, f81, f88 + nop __LINE__ + } + ;; + { .mfi + setf.d f81 = r0 + FMA f73 = ALPHA_I, f64, f73 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add AOFFSET = KK8, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + setf.d f64 = r0 + FMA f89 = ALPHA_I, f80, f89 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + mov f80 = f0 + } + ;; + { .mmi + STFD [C1] = f72, SIZE + STFD [C2] = f88, SIZE +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 1, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi + STFD [C1] = f73, SIZE + STFD [C2] = f89, SIZE +#ifdef TRMMKERNEL + shladd KK8 = KK, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } +#endif + ;; + .align 16 + +.L089: + { .mmi + mov B = BOFFSET + mov AOFFSET = A +#if defined(TRMMKERNEL) && !defined(LEFT) + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + ;; + .align 16 + +.L090: + { .mfi + mov C1 = C + mov f64 = f0 + tbit.z p6, p0 = N, 0 + } + { .mfi +#if defined(TRMMKERNEL) && defined(LEFT) + mov KK = OFFSET +#else + nop __LINE__ +#endif + mov f72 = f0 + shr I = M, 2 + } + ;; + { .mfi + setf.d f66 = r0 + mov f65 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + mov AOFFSET = A + mov f73 = f0 + (p6) br.cond.dpnt .L999 + } + ;; + { .mfi + setf.d f74 = r0 + mov f67 = f0 + nop __LINE__ + } + { .mfb + cmp.eq p6, p7 = 0, I + mov f75 = f0 + (p6) br.cond.dpnt .L100 + } + ;; + .align 16 + +.L092: +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfb + LDFPD f48, f49 = [B] + nop __LINE__ + } + { .mfi + adds BOFFSET = 2 * SIZE, B +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 4, KK +#else + adds L = 1, KK +#endif +#endif + } + ;; +#else + { .mfi + add BOFFSET = KK8, B + shladd AOFFSET = KK8, 2, AOFFSET + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 4, KK +#else + adds L = 1, KK +#endif +#endif + } + ;; +#endif + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + } + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + shr L = L, 1 + } + ;; + { .mfi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds L = -1, L + } + { .mmf + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + CPREFETCH [PREC] + } + ;; + { .mfi + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + mov ar.lc = L + } + { .mmi + adds C5 = 4 * SIZE, C1 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + .align 16 + +.L093: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f80 = f34, f48, f80 // A3 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f81 = f34, f49, f81 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f36, f48, f96 // A5 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f97 = f36, f49, f97 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f38, f48, f112 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f38, f49, f113 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f81 = f35, f48, f81 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f35, f49, f80 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f97 = f37, f48, f97 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f37, f49, f96 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f113 = f39, f48, f113 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f39, f49, f112 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f42, f56, f80 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f42, f57, f81 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f96 = f44, f56, f96 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f44, f57, f97 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f112 = f46, f56, f112 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f46, f57, f113 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f72 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C5 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f73 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f81 = f43, f56, f81 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f77 = [C5 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f80 = f43, f57, f80 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f74 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f97 = f45, f56, f97 // A6 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f78 = [C5 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f96 = f45, f57, f96 // A6 * B2 + nop __LINE__ + } + ;; + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f75 = [C1 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f113 = f47, f56, f113 // A8 * B1 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f79 = [C5 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f112 = f47, f57, f112 // A8 * B2 + br.cloop.sptk.few .L093 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + nop __LINE__ + FMA f72 = ALPHA_R, f64, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = ALPHA_R, f96, f76 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f77 = ALPHA_R, f97, f77 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = ALPHA_R, f80, f74 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = ALPHA_R, f112, f78 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f75 = ALPHA_R, f81, f75 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f79 = ALPHA_R, f113, f79 + nop __LINE__ + } + ;; + + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f76 = ALPHA_I, f97, f76 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = ALPHA_I, f96, f77 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f74 = ALPHA_I, f81, f74 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f78 = ALPHA_I, f113, f78 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = ALPHA_I, f80, f75 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = ALPHA_I, f112, f79 + nop __LINE__ + } + ;; + { .mfi + STFD [C1] = f72, SIZE + mov f64 = f0 + cmp.ne p6, p0 = 1, I + } + { .mfb + STFD [C5] = f76, SIZE + mov f65 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1] = f73, SIZE + mov f80 = f0 + adds I = -1, I + } + { .mfb + STFD [C5] = f77, SIZE + mov f81 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f74, SIZE + mov f96 = f0 + nop __LINE__ + } + { .mfb + STFD [C5] = f78, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1] = f75, 5 * SIZE + mov f112 = f0 + } + { .mfb + STFD [C5] = f79, 5 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L092 + } + ;; +#else + { .mfb + nop __LINE__ + FMA f6 = ALPHA_R, f64, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = ALPHA_R, f96, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f77 = ALPHA_R, f97, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = ALPHA_R, f80, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = ALPHA_R, f112, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f75 = ALPHA_R, f81, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f79 = ALPHA_R, f113, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f6 = ALPHA_I, f65, f6 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f76 = ALPHA_I, f97, f76 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = ALPHA_I, f96, f77 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FCALC_D f74 = ALPHA_I, f81, f74 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + nop __LINE__ + FCALC_D f78 = ALPHA_I, f113, f78 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA f75 = ALPHA_I, f80, f75 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMA f79 = ALPHA_I, f112, f79 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1] = f6, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C5] = f76, SIZE + mov f65 = f0 + cmp.ne p6, p0 = 1, I + } + ;; + { .mfi + STFD [C1] = f73, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 2, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C5] = f77, SIZE + mov f81 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add BOFFSET = KK8, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1] = f74, SIZE + mov f96 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C5] = f78, SIZE + mov f97 = f0 + adds I = -1, I + } + ;; + { .mfi + STFD [C1] = f75, 5 * SIZE + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C5] = f79, 5 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L092 + } + ;; +#endif + .align 16 + +.L100: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 2, KK +#else + adds L = 1, KK +#endif +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L110 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mii + add BOFFSET = KK8, B + shladd AOFFSET = KK8, 1, AOFFSET + nop __LINE__ + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#endif + { .mii + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + .align 16 + +.L102: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f80 = f32, f49, f80 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfb + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f72 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f81 = f33, f49, f81 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f34, f48, f96 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f73 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f112 = f34, f49, f112 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f97 = f35, f48, f97 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f74 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f113 = f35, f49, f113 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f75 = [C1 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f80 = f40, f57, f80 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f57, f81 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f42, f56, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f112 = f42, f57, f112 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f97 = f43, f56, f97 // A4 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f113 = f43, f57, f113 // A4 * B2 + br.cloop.sptk.few .L102 + } + ;; + { .mfb + nop __LINE__ + FCALC_A f64 = f64, f81 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f65 = f65, f80 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_A f96 = f96, f113 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f97 = f97, f112 + nop __LINE__ + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + nop __LINE__ + FMA f72 = ALPHA_R, f64, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = ALPHA_R, f96, f74 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f75 = ALPHA_R, f97, f75 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + { .mfb + setf.d f112 = r0 + FCALC_D f74 = ALPHA_I, f97, f74 + nop __LINE__ + } + { .mfb + setf.d f113 = r0 + FMA f75 = ALPHA_I, f96, f75 + nop __LINE__ + } + ;; + { .mmf + STFD [C1] = f72, SIZE + setf.d f97 = r0 + mov f64 = f0 + } + ;; + { .mmf + STFD [C1] = f73, SIZE + setf.d f96 = r0 + mov f80 = f0 + } + ;; + { .mfi + STFD [C1] = f74, SIZE + mov f65 = f0 + adds L = 1, K + } + ;; + { .mfi + STFD [C1] = f75, SIZE + mov f81 = f0 + shr L = L, 1 + } + ;; +#else + { .mfb + nop __LINE__ + FMA f72 = ALPHA_R, f64, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f0 + nop __LINE__ + } + ;; + { .mfi + setf.d f112 = r0 + FMA f74 = ALPHA_R, f96, f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + setf.d f113 = r0 + FCALC_C f75 = ALPHA_R, f97, f0 + nop __LINE__ + } + ;; + { .mfi + setf.d f97 = r0 + FCALC_D f72 = ALPHA_I, f65, f72 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + { .mfi + setf.d f96 = r0 + FMA f73 = ALPHA_I, f64, f73 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + nop __LINE__ + FCALC_D f74 = ALPHA_I, f97, f74 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + nop __LINE__ + FMA f75 = ALPHA_I, f96, f75 + nop __LINE__ + } + ;; + { .mfi + STFD [C1] = f72, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 1, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1] = f73, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add BOFFSET = KK8, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1] = f74, SIZE + mov f65 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1] = f75, SIZE + mov f81 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; +#endif + .align 16 + +.L110: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 1, KK +#else + adds L = 1, KK +#endif +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L119 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mii + add BOFFSET = KK8, B + add AOFFSET = KK8, AOFFSET + nop __LINE__ + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#endif + ;; + { .mii + nop __LINE__ + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + cmp.eq p3, p0 = r0, r0 + adds L = -1, L + } + ;; + { .mmi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + mov ar.lc = L + } + ;; + .align 16 + +.L112: + { .mfi + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f80 = f32, f49, f80 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mmf +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f72 = [C1 ], SIZE +#else + nop __LINE__ +#endif + nop __LINE__ + FMA f81 = f33, f49, f81 // A2 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f73 = [C1 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f80 = f40, f57, f80 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + adds L = -1, L + } + { .mfb + (p3) FMA f81 = f41, f57, f81 // A2 * B2 + br.cloop.sptk.few .L112 + } + ;; + { .mfb + nop __LINE__ + FCALC_A f64 = f64, f81 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f65 = f65, f80 + nop __LINE__ + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + nop __LINE__ + FMA f72 = ALPHA_R, f64, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f73 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + ;; + { .mmf + STFD [C1] = f72, SIZE + setf.d f64 = r0 + mov f80 = f0 + } + ;; + { .mmf + STFD [C1] = f73, SIZE + setf.d f65 = r0 + mov f81 = f0 + } + ;; +#else + { .mfb + nop __LINE__ + FMA f72 = ALPHA_R, f64, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + ;; + { .mmf + STFD [C1] = f72, SIZE + setf.d f64 = r0 + mov f80 = f0 + } + ;; + { .mmf + STFD [C1] = f73, SIZE + setf.d f65 = r0 + mov f81 = f0 + } + ;; +#endif + .align 16 + +.L119: + { .mmi + mov B = BOFFSET + mov AOFFSET = A +#if defined(TRMMKERNEL) && !defined(LEFT) + adds KK = 1, KK +#else + nop __LINE__ +#endif + } + ;; + .align 16 + +.L999: + { .mii + nop __LINE__ + mov ar.lc = ARLC + mov pr = PR, -1 + } + { .mib + nop __LINE__ +#ifdef TRMMKERNEL + mov ar.pfs = ARPFS +#else + nop __LINE__ +#endif + br.ret.sptk.many b0 + } + EPILOGUE + diff --git a/kernel/ia64/zgemm_ncopy.S b/kernel/ia64/zgemm_ncopy.S new file mode 100644 index 0000000..e7950e9 --- /dev/null +++ b/kernel/ia64/zgemm_ncopy.S @@ -0,0 +1,854 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE 64 +#define WPREFETCHSIZE 32 + +#define LD LDF8 +#define ST STF8_NTA + +#define TEMP r2 + +#define I r14 +#define J r15 +#define PREB r16 +#define PREA r17 + +#define A1 r18 +#define A2 r19 +#define A3 r20 +#define A4 r21 +#define A5 r22 +#define A6 r23 +#define A7 r24 +#define A8 r25 +#define B1 r26 + +#define COUNT r28 + +#define ARLC r30 +#define PR r31 + +#define M r32 +#define N r33 +#define A r34 +#define LDA r35 +#define B r36 + + PROLOGUE + .prologue + PROFCODE + + .body + { .mii + shladd LDA= LDA, ZBASE_SHIFT, r0 + mov PR = pr + shr J = N, 2 + } + ;; + { .mii + mov COUNT=r0 + tbit.nz p10, p0 =M, 1 + tbit.nz p11, p0 =M, 0 + } + ;; + { .mmb + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + { .mib + cmp.eq p8,p0 = 0, J + mov ARLC = ar.lc + (p8) br.cond.dpnt .L20 + } + ;; + .align 32 + +.L11: + { .mmi + mov A1 = A + add A2 = A, LDA + mov pr.rot = 0 + } + { .mmi + shladd A3 = LDA, 1, A + adds B1 = 4 * SIZE, B + shr I = M, 2 + } + ;; + { .mmi + shladd A4 = LDA, 1, A2 + cmp.eq p16,p0 = r0, r0 + mov ar.ec = 3 + } + { .mmi + cmp.eq p6,p0 = 0,I + adds I =-1, I + adds J =-1, J + } + ;; + { .mmi + shladd A = LDA, 2, A + adds A5 = 4 * SIZE, A1 + adds A6 = 4 * SIZE, A2 + } + { .mmi + adds A7 = 4 * SIZE, A3 + adds A8 = 4 * SIZE, A4 + adds PREA = PREFETCHSIZE * SIZE,A1 + } + ;; + { .mmb + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + { .mib + adds PREB = WPREFETCHSIZE * SIZE, B + mov ar.lc = I + (p6) br.cond.dpnt.few .L15 + } + ;; + .align 32 + +.L12: + { .mmb + (p16) lfetch.nt1 [PREA], LDA + (p16) lfetch.excl.nt1 [PREB], 16 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f34, SIZE + (p18) ST [B1] = f82, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f32 = [A1], SIZE + (p16) LD f35 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f40, SIZE + (p18) ST [B1] = f88, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f38 = [A1], SIZE + (p16) LD f41 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f58, SIZE + (p18) ST [B1] = f106, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f44 = [A1], SIZE + (p16) LD f47 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B ] = f64, 5 * SIZE + (p18) ST [B1] = f112, 5 * SIZE + tbit.z p0,p7 = COUNT,0 + } + { .mmb + (p16) LD f50 = [A1], 5 * SIZE + (p16) LD f53 = [A5], 5 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f46, SIZE + (p18) ST [B1] = f94, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f56 = [A2], SIZE + (p16) LD f59 = [A6], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f52, SIZE + (p18) ST [B1] = f100, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f62 = [A2], SIZE + (p16) LD f65 = [A6], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f70, SIZE + (p18) ST [B1] = f118, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f68 = [A2], SIZE + (p16) LD f71 = [A6], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B ] = f76, 5 * SIZE + (p18) ST [B1] = f124, 5 * SIZE + shladd TEMP = LDA, 2, r0 + } + { .mmb + (p16) LD f74 = [A2], 5 * SIZE + (p16) LD f77 = [A6], 5 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p16) lfetch.nt1 [PREA], LDA + (p16) lfetch.excl.nt1 [PREB], 16 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f37, SIZE + (p18) ST [B1] = f85, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f80 = [A3], SIZE + (p16) LD f83 = [A7], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B ] = f43, SIZE + (p18) ST [B1] = f91, SIZE + adds TEMP = -16 * SIZE, TEMP + } + { .mmb + (p16) LD f86 = [A3], SIZE + (p16) LD f89 = [A7], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B ] = f61, SIZE + (p18) ST [B1] = f109, SIZE + (p7) sub PREA = PREA, TEMP + } + { .mmb + (p16) LD f92 = [A3], SIZE + (p16) LD f95 = [A7], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f67, 5 * SIZE + (p18) ST [B1] = f115, 5 * SIZE + nop __LINE__ + } + { .mmb + (p16) LD f98 = [A3], 5 * SIZE + (p16) LD f101 = [A7], 5 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f49, SIZE + (p18) ST [B1] = f97, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f104 = [A4], SIZE + (p16) LD f107 = [A8], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f55, SIZE + (p18) ST [B1] = f103, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f110 = [A4], SIZE + (p16) LD f113 = [A8], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f73, SIZE + (p18) ST [B1] = f121, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f116 = [A4], SIZE + (p16) LD f119 = [A8], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B ] = f79, 5 * SIZE + (p18) ST [B1] = f127, 5 * SIZE + (p16) adds COUNT = 1, COUNT + } + { .mmb + (p16) LD f122 = [A4], 5 * SIZE + (p16) LD f125 = [A8], 5 * SIZE + br.ctop.sptk.few .L12 + } + ;; + .align 32 + +.L15: + { .mmb + (p10) LD f32 = [A1], SIZE + (p10) LD f40 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f33 = [A1], SIZE + (p10) LD f41 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f34 = [A1], SIZE + (p10) LD f42 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f35 = [A1], SIZE + (p10) LD f43 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f50 = [A3], SIZE + (p10) LD f60 = [A4], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f51 = [A3], SIZE + (p10) LD f61 = [A4], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f52 = [A3], SIZE + (p10) LD f62 = [A4], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f53 = [A3], SIZE + (p10) LD f63 = [A4], SIZE + nop __LINE__ + } + ;; + { .mmb + (p11) LD f36 = [A1], SIZE + (p11) LD f44 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p11) LD f37 = [A1] + (p11) LD f45 = [A2] + nop __LINE__ + } + ;; + { .mmb + (p11) LD f54 = [A3], SIZE + (p11) LD f64 = [A4], SIZE + nop __LINE__ + } + ;; + { .mmb + (p11) LD f55 = [A3] + (p11) LD f65 = [A4] + nop __LINE__ + } + ;; + { .mmb + (p10) ST [B ] = f32, SIZE + (p10) ST [B1] = f50, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [B ] = f33, SIZE + (p10) ST [B1] = f51, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [B ] = f40, SIZE + (p10) ST [B1] = f60, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [B ] = f41, 5 * SIZE + (p10) ST [B1] = f61, 5 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [B ] = f34, SIZE + (p10) ST [B1] = f52, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [B ] = f35, SIZE + (p10) ST [B1] = f53, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [B ] = f42, SIZE + (p10) ST [B1] = f62, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [B ] = f43, 5 * SIZE + (p10) ST [B1] = f63, 5 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p11) ST [B ] = f36, SIZE + (p11) ST [B1] = f54, SIZE + nop __LINE__ + } + ;; + { .mmi + (p11) ST [B ] = f37, SIZE + (p11) ST [B1] = f55, SIZE + mov COUNT = r0 + } + ;; + { .mmi + (p11) ST [B ] = f44, SIZE + (p11) ST [B1] = f64, SIZE + cmp.eq p0,p6 = 0,J + } + ;; + { .mmb + (p11) ST [B ] = f45, 5 * SIZE + (p11) ST [B1] = f65, 5 * SIZE + (p6) br.cond.dptk.few .L11 + } + ;; + .align 32 + +.L20: + { .mmi + mov A1 = A + add A2 = A,LDA + mov pr.rot = 0 + } + { .mmi + adds A5 = 4 * SIZE, A + adds B1 = 4 * SIZE, B + tbit.z p8, p0 = N, 1 + } + ;; + { .mmi + cmp.eq p16,p0 = r0,r0 + adds PREA = PREFETCHSIZE * SIZE, A + mov ar.ec = 3 + } + ;; + { .mib + adds PREB = WPREFETCHSIZE * SIZE,B + shr I = M, 2 + (p8) br.cond.dpnt.few .L30 + } + ;; + { .mmi + shladd A = LDA, 1, A + cmp.eq p6, p0 = 0, I + adds I = -1, I + } + ;; + { .mib + adds A6 = 4 * SIZE, A2 + mov ar.lc = I + (p6) br.cond.dpnt.few .L25 + } + ;; + .align 32 + +.L21: + { .mmb + (p16) lfetch.nt1 [PREA],LDA + (p16) lfetch.excl.nt1 [PREB ],16 * SIZE + nop __LINE__ + } + { .mmb + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f34, SIZE + (p18) ST [B1] = f46, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f32 = [A1], SIZE + (p16) LD f35 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f40, SIZE + (p18) ST [B1] = f52, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f38 = [A1], SIZE + (p16) LD f41 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f58, SIZE + (p18) ST [B1] = f70, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f44 = [A1], SIZE + (p16) LD f47 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B ] = f64, 5 * SIZE + (p18) ST [B1] = f76, 5 * SIZE + tbit.z p0,p7 = COUNT,0 + } + { .mmb + (p16) LD f50 = [A1], 5 * SIZE + (p16) LD f53 = [A5], 5 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B ] = f37, SIZE + (p18) ST [B1] = f49, SIZE + adds TEMP = -16 * SIZE,TEMP + } + { .mmb + (p16) LD f56 = [A2], SIZE + (p16) LD f59 = [A6], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B ] = f43, SIZE + (p18) ST [B1] = f55, SIZE + (p7) sub PREA = PREA,TEMP + } + { .mmb + (p16) LD f62 = [A2], SIZE + (p16) LD f65 = [A6], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B ] = f61, SIZE + (p18) ST [B1] = f73, SIZE + (p16) adds COUNT = 1,COUNT + } + { .mmb + (p16) LD f68 = [A2], SIZE + (p16) LD f71 = [A6], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B ] = f67, 5 * SIZE + (p18) ST [B1] = f79, 5 * SIZE + shladd TEMP = LDA,2,r0 + } + { .mmb + (p16) LD f74 = [A2], 5 * SIZE + (p16) LD f77 = [A6], 5 * SIZE + br.ctop.sptk.few .L21 + } + ;; + .align 32 + +.L25: + { .mmb + (p10) LD f32 = [A1], SIZE + (p10) LD f40 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f33 = [A1], SIZE + (p10) LD f41 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f34 = [A1], SIZE + (p10) LD f42 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f35 = [A1], SIZE + (p10) LD f43 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p11) LD f36 = [A1], SIZE + (p11) LD f44 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p11) LD f37 = [A1] + (p11) LD f45 = [A2] + nop __LINE__ + } + ;; + { .mmb + (p10) ST [B ] = f32, SIZE + (p10) ST [B1] = f34, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [B ] = f33, SIZE + (p10) ST [B1] = f35, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [B ] = f40, SIZE + (p10) ST [B1] = f42, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [B ] = f41, 5 * SIZE + (p10) ST [B1] = f43, 5 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p11) ST [B ] = f36, SIZE + ;; + (p11) ST [B ] = f37, SIZE + nop __LINE__ + } + ;; + { .mmi + (p11) ST [B ] = f44, SIZE + ;; + (p11) ST [B ] = f45, SIZE + nop __LINE__ + } + ;; + .align 32 + +.L30: + { .mmi + mov A1 = A + mov COUNT = r0 + mov pr.rot = 0 + } + { .mmi + adds A5 = 4 * SIZE,A + adds B1 = 4 * SIZE,B + tbit.z p8,p0 = N,0 + } + ;; + { .mmi + cmp.eq p16,p0 = r0,r0 + nop __LINE__ + mov ar.ec = 3 + } + { .mib + nop __LINE__ + shr I = M,2 + (p8) br.cond.dptk.few .L999 + } + ;; + { .mmi + cmp.eq p6 ,p0 = 0, I + adds PREA = PREFETCHSIZE * SIZE, A + adds I = -1, I + } + ;; + { .mib + adds PREB = WPREFETCHSIZE * SIZE, B + mov ar.lc = I + (p6) br.cond.dpnt.few .L35 + } + ;; + .align 32 + +.L31: + { .mmi + (p16) lfetch.nt1 [PREA], LDA + (p16) lfetch.excl.nt1 [PREB ], 16 * SIZE + tbit.z p0, p7 = COUNT, 0 + } + { .mmb + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B ] = f34, SIZE + (p18) ST [B1] = f37, SIZE + shladd TEMP = LDA,2,r0 + } + { .mmb + (p16) LD f32 = [A1], SIZE + (p16) LD f35 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B ] = f40, SIZE + (p18) ST [B1] = f43, SIZE + adds TEMP = -16 * SIZE,TEMP + } + { .mmb + (p16) LD f38 = [A1], SIZE + (p16) LD f41 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f46, SIZE + (p18) ST [B1] = f49, SIZE + nop __LINE__ + } + { .mmi + (p16) LD f44 = [A1], SIZE + (p16) LD f47 = [A5], SIZE + (p7) sub PREA = PREA,TEMP + } + ;; + { .mmi + (p18) ST [B ] = f52, 5 * SIZE + (p18) ST [B1] = f55, 5 * SIZE + (p16) adds COUNT = 1,COUNT + } + { .mmb + (p16) LD f50 = [A1], 5 * SIZE + (p16) LD f53 = [A5], 5 * SIZE + br.ctop.sptk.few .L31 + } + ;; + .align 32 + +.L35: + { .mmi + (p10) LD f32 = [A1], SIZE + ;; + (p10) LD f33 = [A1], SIZE + nop __LINE__ + } + ;; + { .mmi + (p10) LD f34 = [A1], SIZE + ;; + (p10) LD f35 = [A1], SIZE + nop __LINE__ + } + ;; + { .mmi + (p11) LD f36 = [A1], SIZE + ;; + (p11) LD f37 = [A1] + nop __LINE__ + } + ;; + { .mmi + (p10) ST [B ] = f32, SIZE + ;; + (p10) ST [B ] = f33, SIZE + nop __LINE__ + } + ;; + { .mmi + (p10) ST [B ] = f34, SIZE + ;; + (p10) ST [B ] = f35, SIZE + nop __LINE__ + } + ;; + { .mmi + (p11) ST [B ] = f36, SIZE + ;; + (p11) ST [B ] = f37, SIZE + nop __LINE__ + } + ;; + .align 32 + +.L999: + mov pr = PR,-1 + mov ar.lc = ARLC + br.ret.sptk.many b0 + ;; + EPILOGUE + diff --git a/kernel/ia64/zgemm_tcopy.S b/kernel/ia64/zgemm_tcopy.S new file mode 100644 index 0000000..9af5380 --- /dev/null +++ b/kernel/ia64/zgemm_tcopy.S @@ -0,0 +1,898 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE 24 +#define WPREFETCHSIZE 48 + +#define LD LDF8 +#define ST STF8_NTA + +#define PREA r2 +#define PREB r3 + +#define I r14 +#define J r15 + +#define A1 r16 +#define A2 r17 +#define A3 r18 +#define A4 r19 +#define A5 r20 +#define A6 r21 +#define A7 r22 +#define A8 r23 +#define B1 r24 +#define B2 r25 + +#define COUNT r26 +#define TEMP r27 + +#define BO2 r28 +#define BO3 r29 +#define LDB r8 + +#define ARLC r30 +#define PR r31 + +#define M r32 +#define N r33 +#define A r34 +#define LDA r35 +#define B r36 + + PROLOGUE + .prologue + PROFCODE + + .body + { .mmi + setf.sig f32 = M + and r8 = -4, N + mov ARLC = ar.lc + } + ;; + { .mmi + setf.sig f33 = r8 + and r9 = -2, N + mov PR = pr + } + ;; + { .mmi + setf.sig f34 = r9 + shladd LDA = LDA, ZBASE_SHIFT, r0 + shl LDB = M, BASE_SHIFT + 3 + } + ;; + { .mfi + nop __LINE__ + xmpy.l f33 = f32, f33 + shr J = M, 2 + } + { .mfi + nop __LINE__ + xmpy.l f34 = f32, f34 + nop __LINE__ + } + ;; + { .mmb + getf.sig BO2 = f33 + getf.sig BO3 = f34 + nop __LINE__ + } + ;; + { .mmi + shladd BO2 = BO2, ZBASE_SHIFT, B + shladd BO3 = BO3, ZBASE_SHIFT, B + tbit.nz p10, p0 =N, 1 + } + { .mib + cmp.eq p6, p0 = 0, J + tbit.nz p11, p0 =N, 0 + (p6) br.cond.dpnt .L20 + } + ;; + .align 32 + +.L11: + { .mmi + mov A1 = A + add A2 = A, LDA + mov pr.rot = 0 + } + { .mmi + shladd A3 = LDA, 1, A + mov B1 = B + shr I = N, 2 + } + ;; + { .mmi + shladd A4 = LDA, 1, A2 + cmp.eq p16,p0 = r0, r0 + mov ar.ec = 3 + } + { .mmi + cmp.eq p6,p0 = 0,I + adds I =-1, I + adds J =-1, J + } + ;; + { .mmi + shladd A = LDA, 2, A + adds A5 = 4 * SIZE, A1 + adds A6 = 4 * SIZE, A2 + } + { .mmi + adds A7 = 4 * SIZE, A3 + adds A8 = 4 * SIZE, A4 + adds PREA = PREFETCHSIZE * SIZE,A1 + } + ;; + { .mmb + adds B2 = 4 * SIZE, B + adds PREB = WPREFETCHSIZE * SIZE, B + nop __LINE__ + } + { .mib + adds B = 32 * SIZE, B + mov ar.lc = I + (p6) br.cond.dpnt.few .L15 + } + ;; + +.L12: + { .mmb + (p16) lfetch.nt1 [PREA], LDA + (p16) lfetch.excl.nt1 [PREB], LDB + nop __LINE__ + } + { .mmb + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f34, SIZE + (p18) ST [B2] = f37, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f32 = [A1], SIZE + (p16) LD f35 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f40, SIZE + (p18) ST [B2] = f43, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f38 = [A1], SIZE + (p16) LD f41 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f46, SIZE + (p18) ST [B2] = f49, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f44 = [A1], SIZE + (p16) LD f47 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f52, 5 * SIZE + (p18) ST [B2] = f55, 5 * SIZE + tbit.z p0,p7 = COUNT,0 + } + { .mmb + (p16) LD f50 = [A1], 5 * SIZE + (p16) LD f53 = [A5], 5 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f58, SIZE + (p18) ST [B2] = f61, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f56 = [A2], SIZE + (p16) LD f59 = [A6], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f64, SIZE + (p18) ST [B2] = f67, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f62 = [A2], SIZE + (p16) LD f65 = [A6], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f70, SIZE + (p18) ST [B2] = f73, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f68 = [A2], SIZE + (p16) LD f71 = [A6], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f76, 5 * SIZE + (p18) ST [B2] = f79, 5 * SIZE + shladd TEMP = LDA, 2, r0 + } + { .mmb + (p16) LD f74 = [A2], 5 * SIZE + (p16) LD f77 = [A6], 5 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f82, SIZE + (p18) ST [B2] = f85, SIZE + nop __LINE__ + } + { .mmb + (p16) lfetch.nt1 [PREA], LDA + (p16) lfetch.excl.nt1 [PREB], LDB + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f88, SIZE + (p18) ST [B2] = f91, SIZE + adds TEMP = -16 * SIZE, TEMP + } + { .mmb + (p16) LD f80 = [A3], SIZE + (p16) LD f83 = [A7], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f94, SIZE + (p18) ST [B2] = f97, SIZE + (p7) sub PREA = PREA, TEMP + } + { .mmb + (p16) LD f86 = [A3], SIZE + (p16) LD f89 = [A7], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f100, 5 * SIZE + (p18) ST [B2] = f103, 5 * SIZE + nop __LINE__ + } + { .mmb + (p16) LD f92 = [A3], SIZE + (p16) LD f95 = [A7], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f106, SIZE + (p18) ST [B2] = f109, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f98 = [A3], 5 * SIZE + (p16) LD f101 = [A7], 5 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f112, SIZE + (p18) ST [B2] = f115, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f104 = [A4], SIZE + (p16) LD f107 = [A8], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f118, SIZE + (p18) ST [B2] = f121, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f110 = [A4], SIZE + (p16) LD f113 = [A8], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f124, -27 * SIZE + (p18) ST [B2] = f127, -27 * SIZE + (p16) adds COUNT = 1, COUNT + } + { .mmb + (p16) LD f116 = [A4], SIZE + (p16) LD f119 = [A8], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) add B1 = B1, LDB + (p18) add B2 = B2, LDB + nop __LINE__ + } + { .mmb + (p16) LD f122 = [A4], 5 * SIZE + (p16) LD f125 = [A8], 5 * SIZE + br.ctop.sptk.few .L12 + } + ;; + .align 32 + +.L15: + { .mmb + (p10) LD f32 = [A1], SIZE + (p10) LD f40 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f33 = [A1], SIZE + (p10) LD f41 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f34 = [A1], SIZE + (p10) LD f42 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f35 = [A1], SIZE + (p10) LD f43 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f50 = [A3], SIZE + (p10) LD f60 = [A4], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f51 = [A3], SIZE + (p10) LD f61 = [A4], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f52 = [A3], SIZE + (p10) LD f62 = [A4], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f53 = [A3], SIZE + (p10) LD f63 = [A4], SIZE + nop __LINE__ + } + ;; + { .mmb + (p11) LD f36 = [A1], SIZE + (p11) LD f44 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p11) LD f37 = [A1] + (p11) LD f45 = [A2] + nop __LINE__ + } + ;; + { .mmb + (p11) LD f54 = [A3], SIZE + (p11) LD f64 = [A4], SIZE + nop __LINE__ + } + ;; + { .mmi + (p11) LD f55 = [A3] + (p11) LD f65 = [A4] + adds B2 = 4 * SIZE, BO2 + } + ;; + { .mmb + (p10) ST [BO2] = f32, SIZE + (p10) ST [B2] = f40, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [BO2] = f33, SIZE + (p10) ST [B2] = f41, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [BO2] = f34, SIZE + (p10) ST [B2] = f42, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [BO2] = f35, 5 * SIZE + (p10) ST [B2] = f43, 5 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [BO2] = f50, SIZE + (p10) ST [B2] = f60, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [BO2] = f51, SIZE + (p10) ST [B2] = f61, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [BO2] = f52, SIZE + (p10) ST [B2] = f62, SIZE + nop __LINE__ + } + ;; + { .mmi + (p10) ST [BO2] = f53, 5 * SIZE + (p10) ST [B2] = f63 + adds B2 = 4 * SIZE, BO3 + } + ;; + { .mmb + (p11) ST [BO3] = f36, SIZE + (p11) ST [B2] = f54, SIZE + nop __LINE__ + } + ;; + { .mmi + (p11) ST [BO3] = f37, SIZE + (p11) ST [B2] = f55, SIZE + mov COUNT = r0 + } + ;; + { .mmi + (p11) ST [BO3] = f44, SIZE + (p11) ST [B2] = f64, SIZE + cmp.eq p0,p6 = 0,J + } + ;; + { .mmb + (p11) ST [BO3] = f45, 5 * SIZE + (p11) ST [B2] = f65, 5 * SIZE + (p6) br.cond.dptk.few .L11 + } + ;; + .align 32 + +.L20: + { .mmi + mov A1 = A + add A2 = A, LDA + mov pr.rot = 0 + } + { .mmi + mov B1 = B + adds PREA = PREFETCHSIZE * SIZE,A + tbit.z p6, p0 = M, 1 + } + ;; + { .mmi + cmp.eq p16,p0 = r0, r0 + adds B2 = 4 * SIZE, B + mov ar.ec = 3 + } + { .mib + adds PREB = WPREFETCHSIZE * SIZE, B + shr I = N, 2 + (p6) br.cond.dpnt .L30 + } + ;; + { .mmi + cmp.eq p6, p0 = 0, I + adds I =-1, I + nop __LINE__ + } + { .mmi + shladd A = LDA, 1, A + adds A5 = 4 * SIZE, A1 + adds A6 = 4 * SIZE, A2 + } + ;; + { .mmb + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + { .mib + adds B = 16 * SIZE, B + mov ar.lc = I + (p6) br.cond.dpnt.few .L25 + } + ;; + +.L22: + { .mmi + (p16) lfetch.nt1 [PREA], LDA + (p16) lfetch.excl.nt1 [PREB], LDB + shladd TEMP = LDA, 1, r0 + } + ;; + { .mmb + (p18) ST [B1] = f34, SIZE + (p18) ST [B2] = f37, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f32 = [A1], SIZE + (p16) LD f35 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f40, SIZE + (p18) ST [B2] = f43, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f38 = [A1], SIZE + (p16) LD f41 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f46, SIZE + (p18) ST [B2] = f49, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f44 = [A1], SIZE + (p16) LD f47 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f52, 5 * SIZE + (p18) ST [B2] = f55, 5 * SIZE + tbit.z p0,p7 = COUNT,0 + } + { .mmb + (p16) LD f50 = [A1], 5 * SIZE + (p16) LD f53 = [A5], 5 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f58, SIZE + (p18) ST [B2] = f61, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f56 = [A2], SIZE + (p16) LD f59 = [A6], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f64, SIZE + (p18) ST [B2] = f67, SIZE + adds TEMP = -16 * SIZE, TEMP + } + { .mmb + (p16) LD f62 = [A2], SIZE + (p16) LD f65 = [A6], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f70, SIZE + (p18) ST [B2] = f73, SIZE + (p7) sub PREA = PREA, TEMP + } + { .mmb + (p16) LD f68 = [A2], SIZE + (p16) LD f71 = [A6], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f76, -11 * SIZE + (p18) ST [B2] = f79, -11 * SIZE + (p16) adds COUNT = 1, COUNT + } + { .mmb + (p16) LD f74 = [A2], 5 * SIZE + (p16) LD f77 = [A6], 5 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) add B1 = B1, LDB + (p18) add B2 = B2, LDB + br.ctop.sptk.few .L22 + } + ;; + .align 32 + +.L25: + { .mmb + (p10) LD f32 = [A1], SIZE + (p10) LD f40 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f33 = [A1], SIZE + (p10) LD f41 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f34 = [A1], SIZE + (p10) LD f42 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f35 = [A1], SIZE + (p10) LD f43 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p11) LD f36 = [A1], SIZE + (p11) LD f44 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p11) LD f37 = [A1] + (p11) LD f45 = [A2] + adds B2 = 4 * SIZE, BO2 + } + ;; + { .mmb + (p10) ST [BO2] = f32, SIZE + (p10) ST [B2] = f40, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [BO2] = f33, SIZE + (p10) ST [B2] = f41, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [BO2] = f34, SIZE + (p10) ST [B2] = f42, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [BO2] = f35, 5 * SIZE + (p10) ST [B2] = f43, 5 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p11) ST [BO3] = f36, SIZE + ;; + (p11) ST [BO3] = f37, SIZE + mov COUNT = r0 + } + ;; + { .mmi + (p11) ST [BO3] = f44, SIZE + ;; + (p11) ST [BO3] = f45, SIZE + nop __LINE__ + } + ;; + .align 32 + +.L30: + { .mmi + mov A1 = A + adds A5 = 4 * SIZE, A + mov pr.rot = 0 + } + { .mmi + mov B1 = B + adds B2 = 4 * SIZE, B + tbit.z p6, p0 = M, 0 + } + ;; + { .mmb + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + { .mib + cmp.eq p16,p0 = r0, r0 + shr I = N, 2 + (p6) br.cond.dpnt .L999 + } + ;; + { .mmi + cmp.eq p6, p0 = 0, I + adds I =-1, I + mov ar.ec = 3 + } + ;; + { .mib + nop __LINE__ + mov ar.lc = I + (p6) br.cond.dpnt.few .L35 + } + ;; + .align 32 + +.L32: + { .mmb + (p18) ST [B1] = f34, SIZE + (p18) ST [B2] = f37, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f32 = [A1], SIZE + (p16) LD f35 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f40, SIZE + (p18) ST [B2] = f43, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f38 = [A1], SIZE + (p16) LD f41 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f46, SIZE + (p18) ST [B2] = f49, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f44 = [A1], SIZE + (p16) LD f47 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f52, -3 * SIZE + (p18) ST [B2] = f55, -3 * SIZE + nop __LINE__ + } + { .mmb + (p16) LD f50 = [A1], 5 * SIZE + (p16) LD f53 = [A5], 5 * SIZE + nop __LINE__ + } + ;; + { .mmb + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + { .mmb + (p18) add B1 = B1, LDB + (p18) add B2 = B2, LDB + br.ctop.sptk.few .L32 + } + ;; + .align 32 + +.L35: + { .mmi + (p10) LD f32 = [A1], SIZE + ;; + (p10) LD f33 = [A1], SIZE + nop __LINE__ + } + ;; + { .mmi + (p10) LD f34 = [A1], SIZE + ;; + (p10) LD f35 = [A1], SIZE + nop __LINE__ + } + ;; + { .mmi + (p11) LD f36 = [A1], SIZE + ;; + (p11) LD f37 = [A1] + nop __LINE__ + } + ;; + { .mmi + (p10) ST [BO2] = f32, SIZE + ;; + (p10) ST [BO2] = f33, SIZE + nop __LINE__ + } + ;; + { .mmi + (p10) ST [BO2] = f34, SIZE + ;; + (p10) ST [BO2] = f35, SIZE + nop __LINE__ + } + ;; + { .mmi + (p11) ST [BO3] = f36, SIZE + ;; + (p11) ST [BO3] = f37, SIZE + nop __LINE__ + } + ;; + .align 32 + +.L999: + mov pr = PR, -1 + mov ar.lc = ARLC + br.ret.sptk.many b0 + EPILOGUE diff --git a/kernel/ia64/zgemv_n.S b/kernel/ia64/zgemv_n.S new file mode 100644 index 0000000..b3027a6 --- /dev/null +++ b/kernel/ia64/zgemv_n.S @@ -0,0 +1,2293 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define SP r12 + +#define M r32 +#define N r33 +#define A r37 +#define LDA r38 +#define X r39 +#define INCX r34 +#define Y r35 +#define INCY r36 +#define BUFFER r11 + +#define I r14 +#define J r15 +#define AO1 r16 +#define AO2 r17 +#define AO3 r18 +#define AO4 r19 +#define AO5 r20 +#define AO6 r21 +#define AO7 r22 +#define AO8 r23 +#define YLD1 r24 +#define YLD2 r25 +#define YST1 r26 +#define YST2 r27 +#define YY r28 +#define XX r9 + +#define RPRE1 loc0 +#define RPRE2 loc1 +#define RPRE3 loc2 +#define RPRE4 loc3 +#define RPRE5 loc4 +#define RPRE6 loc5 +#define RPRE7 loc6 +#define RPRE8 loc7 + +#define INCXM1 r2 +#define INCX3M1 r3 + +#define AO9 loc8 +#define AO10 loc9 +#define AO11 loc10 +#define AO12 loc11 +#define AO13 loc12 +#define AO14 loc13 +#define AO15 loc14 +#define AO16 loc15 + +#define PREB r8 + +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#ifdef DOUBLE +#define RPREFETCH (16 * 2 + 8) +#else +#define RPREFETCH (16 * 2 + 16) +#endif +#define PREFETCH lfetch.nt1 + +#define ALPHA_R f6 +#define ALPHA_I f7 + +#if !defined(CONJ) && !defined(XCONJ) +#define ADD1 FNMA +#define ADD2 FMA +#define ADD3 FNMA +#define ADD4 FMA +#elif defined(CONJ) && !defined(XCONJ) +#define ADD1 FNMA +#define ADD2 FMA +#define ADD3 FMA +#define ADD4 FNMA +#elif !defined(CONJ) && defined(XCONJ) +#define ADD1 FMA +#define ADD2 FNMA +#define ADD3 FNMA +#define ADD4 FMA +#else +#define ADD1 FMA +#define ADD2 FNMA +#define ADD3 FMA +#define ADD4 FNMA +#endif + + PROLOGUE + .prologue + PROFCODE + { .mmi + .save ar.pfs, ARPFS + alloc ARPFS = ar.pfs, 8, 16, 0, 0 + mov ARLC = ar.lc + } + ;; + mov PR = pr + adds r14 = 16, SP + adds r15 = 24, SP + adds r16 = 32, SP + adds r17 = 40, SP + ;; + adds r8 = -8 * 16, SP + adds r9 = -7 * 16, SP + adds SP = -8 * 16, SP + ;; + stf.spill [r8] = f16, 32 + stf.spill [r9] = f17, 32 + ;; + stf.spill [r8] = f18, 32 + stf.spill [r9] = f19, 32 + ;; + stf.spill [r8] = f20, 32 + stf.spill [r9] = f21, 32 + ;; + stf.spill [r8] = f22 + stf.spill [r9] = f23 + ;; + ld8 INCX = [r14] + ld8 Y = [r15] + ld8 INCY = [r16] + ld8 BUFFER = [r17] + .body + ;; + cmp.ge p7, p0 = 0, M + cmp.ge p6, p0 = 0, N + mov ALPHA_R = f8 + shladd INCX = INCX, ZBASE_SHIFT, r0 + shladd LDA = LDA, ZBASE_SHIFT, r0 + mov ALPHA_I = f9 + ;; + shladd INCY = INCY, ZBASE_SHIFT, r0 + tbit.nz p8, p0 = A, BASE_SHIFT + (p7) br.cond.dpnt .L999 + ;; + shladd XX = INCX, 1, X + adds INCXM1 = -SIZE, INCX + (p6) br.cond.dpnt .L999 + ;; + shladd INCX3M1 = INCX, 1, INCXM1 + cmp.eq p10, p11 = 2 * SIZE, INCY + mov YY = Y + ;; + (p11) mov YY = BUFFER + mov YST1 = BUFFER + shr J = M, 2 + ;; + { .mib + adds YST2 = 4 * SIZE, BUFFER + mov ar.lc = J + (p10) br.cond.dptk .L10 + } + ;; +.L02: + STFD [YST1] = f0, 1 * SIZE + STFD [YST2] = f0, 1 * SIZE + ;; + STFD [YST1] = f0, 1 * SIZE + STFD [YST2] = f0, 1 * SIZE + ;; + STFD [YST1] = f0, 1 * SIZE + STFD [YST2] = f0, 1 * SIZE + ;; + STFD [YST1] = f0, 5 * SIZE + STFD [YST2] = f0, 5 * SIZE + br.cloop.sptk.few .L02 + ;; + +.L10: + { .mmi + mov AO1 = A + nop __LINE__ + shr J = N, 3 + } + ;; + { .mmb + add AO2 = LDA, A + cmp.eq p6, p0 = r0, J + (p6) br.cond.dpnt .L20 + } + ;; + .align 16 + +.L11: + LDFD f32 = [X], SIZE + LDFD f36 = [XX], SIZE + mov pr.rot= 0 + ;; + LDFD f33 = [X], INCXM1 + LDFD f37 = [XX], INCXM1 + mov YLD1 = YY + ;; + LDFD f34 = [X], SIZE + LDFD f38 = [XX], SIZE + adds YLD2 = 4 * SIZE, YY + ;; + LDFD f35 = [X], INCX3M1 + LDFD f39 = [XX], INCX3M1 + mov YST1 = YY + ;; + LDFD f40 = [X], SIZE + LDFD f44 = [XX], SIZE + adds YST2 = 4 * SIZE, YY + ;; + LDFD f41 = [X], INCXM1 + LDFD f45 = [XX], INCXM1 + shr I = M, 2 + ;; + LDFD f42 = [X], SIZE + LDFD f46 = [XX], SIZE + mov AO1 = A + ;; + LDFD f43 = [X], INCX3M1 + LDFD f47 = [XX], INCX3M1 + add AO2 = LDA, A + ;; + shladd AO3 = LDA, 1, A + FMPY f8 = ALPHA_R, f32 + mov ar.ec= 2 + shladd AO4 = LDA, 1, AO2 + FMPY f9 = ALPHA_I, f32 + ;; + shladd AO5 = LDA, 1, AO3 + FMPY f10 = ALPHA_R, f34 + shladd AO6 = LDA, 1, AO4 + FMPY f11 = ALPHA_I, f34 + ;; + FMPY f12 = ALPHA_R, f36 + shladd AO7 = LDA, 1, AO5 + FMPY f13 = ALPHA_I, f36 + shladd AO8 = LDA, 1, AO6 + FMPY f14 = ALPHA_R, f38 + ;; + adds PREB = RPREFETCH * SIZE, YLD1 + FMPY f15 = ALPHA_I, f38 + adds RPRE1 = RPREFETCH * SIZE, AO1 + FMPY f16 = ALPHA_R, f40 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + FMPY f17 = ALPHA_I, f40 + adds RPRE3 = RPREFETCH * SIZE, AO3 + FMPY f18 = ALPHA_R, f42 + adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 + FMPY f19 = ALPHA_I, f42 + adds RPRE5 = RPREFETCH * SIZE, AO5 + FMPY f20 = ALPHA_R, f44 + adds RPRE6 = (RPREFETCH + 8) * SIZE, AO6 + FMPY f21 = ALPHA_I, f44 + adds RPRE7 = RPREFETCH * SIZE, AO7 + FMPY f22 = ALPHA_R, f46 + adds RPRE8 = (RPREFETCH + 8) * SIZE, AO8 + FMPY f23 = ALPHA_I, f46 + ;; + ADD1 f8 = ALPHA_I, f33, f8 + tbit.nz p14, p0 = M, 1 + ADD2 f9 = ALPHA_R, f33, f9 + shladd A = LDA, 3, A + ADD1 f10 = ALPHA_I, f35, f10 + adds AO9 = 4 * SIZE, AO1 + ADD2 f11 = ALPHA_R, f35, f11 + adds AO10 = 4 * SIZE, AO2 + ADD1 f12 = ALPHA_I, f37, f12 + adds AO11 = 4 * SIZE, AO3 + ADD2 f13 = ALPHA_R, f37, f13 + adds AO12 = 4 * SIZE, AO4 + ADD1 f14 = ALPHA_I, f39, f14 + adds AO13 = 4 * SIZE, AO5 + ADD2 f15 = ALPHA_R, f39, f15 + adds AO14 = 4 * SIZE, AO6 + ADD1 f16 = ALPHA_I, f41, f16 + adds AO15 = 4 * SIZE, AO7 + ADD2 f17 = ALPHA_R, f41, f17 + adds AO16 = 4 * SIZE, AO8 + ADD1 f18 = ALPHA_I, f43, f18 + cmp.eq p6, p0 = 0, I + ADD2 f19 = ALPHA_R, f43, f19 + cmp.eq p16, p0 = r0, r0 + ADD1 f20 = ALPHA_I, f45, f20 + adds I = -1, I + ADD2 f21 = ALPHA_R, f45, f21 + ;; + { .mfi + nop __LINE__ + ADD1 f22 = ALPHA_I, f47, f22 + mov ar.lc = I + } + { .mfb + nop __LINE__ + ADD2 f23 = ALPHA_R, f47, f23 + (p6) br.cond.dpnt .L15 + } + ;; + .align 16 + +.L12: + { .mfi + (p17) LDFD f89 = [AO8], 1 * SIZE + (p17) FMA f101 = f8, f33, f101 + (p16) tbit.nz.unc p12, p13 = I, 0 + } + { .mfi + (p17) LDFD f93 = [AO16], 1 * SIZE + (p17) FMA f113 = f8, f37, f113 + } + ;; + { .mfi + (p17) LDFD f90 = [AO8], 1 * SIZE + (p17) FMA f104 = f9, f33, f104 + (p16) adds I = -1, I + } + { .mfi + (p17) LDFD f94 = [AO16], 1 * SIZE + (p17) FMA f116 = f9, f37, f116 + } + ;; + { .mfi + (p17) LDFD f91 = [AO8], 1 * SIZE + (p17) FMA f107 = f8, f35, f107 + } + { .mfi + (p17) LDFD f95 = [AO16], 1 * SIZE + (p17) FMA f119 = f8, f39, f119 + } + ;; + { .mfi + (p17) LDFD f92 = [AO8], 5 * SIZE + (p17) FMA f110 = f9, f35, f110 + } + { .mfi + (p17) LDFD f96 = [AO16], 5 * SIZE + (p17) FMA f122 = f9, f39, f122 + } + ;; + { .mfi + (p12) lfetch.excl.nt2 [PREB], 16 * SIZE + (p17) ADD3 f101 = f9, f34, f101 + } + { .mfi + (p17) ADD3 f113 = f9, f38, f113 + } + ;; + { .mfi + (p16) LDFD f100 = [YLD1], 1 * SIZE + (p17) ADD4 f104 = f8, f34, f104 + } + { .mfi + (p16) LDFD f112 = [YLD2], 1 * SIZE + (p17) ADD4 f116 = f8, f38, f116 + } + ;; + { .mfi + (p16) LDFD f103 = [YLD1], 1 * SIZE + (p17) ADD3 f107 = f9, f36, f107 + } + { .mfi + (p16) LDFD f115 = [YLD2], 1 * SIZE + (p17) ADD3 f119 = f9, f40, f119 + } + ;; + { .mfi + (p12) PREFETCH [RPRE1], 16 * SIZE + (p17) ADD4 f110 = f8, f36, f110 + } + { .mfi + (p17) ADD4 f122 = f8, f40, f122 + } + ;; + { .mfi + (p16) LDFD f32 = [AO1], 1 * SIZE + (p17) FMA f101 = f10, f41, f101 + } + { .mfi + (p16) LDFD f36 = [AO9], 1 * SIZE + (p17) FMA f113 = f10, f45, f113 + } + ;; + { .mfi + (p16) LDFD f33 = [AO1], 1 * SIZE + (p17) FMA f104 = f11, f41, f104 + } + { .mfi + (p16) LDFD f37 = [AO9], 1 * SIZE + (p17) FMA f116 = f11, f45, f116 + } + ;; + { .mfi + (p16) LDFD f34 = [AO1], 1 * SIZE + (p17) FMA f107 = f10, f43, f107 + } + { .mfi + (p16) LDFD f38 = [AO9], 1 * SIZE + (p17) FMA f119 = f10, f47, f119 + } + ;; + { .mfi + (p16) LDFD f35 = [AO1], 5 * SIZE + (p17) FMA f110 = f11, f43, f110 + } + { .mfi + (p16) LDFD f39 = [AO9], 5 * SIZE + (p17) FMA f122 = f11, f47, f122 + } + ;; + { .mfi + (p17) ADD3 f101 = f11, f42, f101 + } + { .mfi + (p17) ADD3 f113 = f11, f46, f113 + } + ;; + { .mfi + (p16) LDFD f106 = [YLD1], 1 * SIZE + (p17) ADD4 f104 = f10, f42, f104 + } + { .mfi + (p16) LDFD f118 = [YLD2], 1 * SIZE + (p17) ADD4 f116 = f10, f46, f116 + } + ;; + { .mfi + (p16) LDFD f109 = [YLD1], 5 * SIZE + (p17) ADD3 f107 = f11, f44, f107 + } + { .mfi + (p16) LDFD f121 = [YLD2], 5 * SIZE + (p17) ADD3 f119 = f11, f48, f119 + } + ;; + { .mfi + (p13) PREFETCH [RPRE2], 16 * SIZE + (p17) ADD4 f110 = f10, f44, f110 + } + { .mfi + (p17) ADD4 f122 = f10, f48, f122 + } + ;; + { .mfi + (p16) LDFD f40 = [AO2], 1 * SIZE + (p17) FMA f101 = f12, f49, f101 + } + { .mfi + (p16) LDFD f44 = [AO10], 1 * SIZE + (p17) FMA f113 = f12, f53, f113 + } + ;; + { .mfi + (p16) LDFD f41 = [AO2], 1 * SIZE + (p17) FMA f104 = f13, f49, f104 + } + { .mfi + (p16) LDFD f45 = [AO10], 1 * SIZE + (p17) FMA f116 = f13, f53, f116 + } + ;; + { .mfi + (p16) LDFD f42 = [AO2], 1 * SIZE + (p17) FMA f107 = f12, f51, f107 + } + { .mfi + (p16) LDFD f46 = [AO10], 1 * SIZE + (p17) FMA f119 = f12, f55, f119 + } + ;; + { .mfi + (p16) LDFD f43 = [AO2], 5 * SIZE + (p17) FMA f110 = f13, f51, f110 + } + { .mfi + (p16) LDFD f47 = [AO10], 5 * SIZE + (p17) FMA f122 = f13, f55, f122 + } + ;; + { .mfi + (p17) ADD3 f101 = f13, f50, f101 + } + { .mfi + (p17) ADD3 f113 = f13, f54, f113 + } + ;; + { .mfi + (p17) ADD4 f104 = f12, f50, f104 + } + { .mfi + (p17) ADD4 f116 = f12, f54, f116 + } + ;; + { .mfi + (p17) ADD3 f107 = f13, f52, f107 + } + { .mfi + (p17) ADD3 f119 = f13, f56, f119 + } + ;; + { .mfi + (p12) PREFETCH [RPRE3], 16 * SIZE + (p17) ADD4 f110 = f12, f52, f110 + } + { .mfi + (p17) ADD4 f122 = f12, f56, f122 + } + ;; + { .mfi + (p16) LDFD f48 = [AO3], 1 * SIZE + (p17) FMA f101 = f14, f57, f101 + } + { .mfi + (p16) LDFD f52 = [AO11], 1 * SIZE + (p17) FMA f113 = f14, f61, f113 + } + ;; + { .mfi + (p16) LDFD f49 = [AO3], 1 * SIZE + (p17) FMA f104 = f15, f57, f104 + } + { .mfi + (p16) LDFD f53 = [AO11], 1 * SIZE + (p17) FMA f116 = f15, f61, f116 + } + ;; + { .mfi + (p16) LDFD f50 = [AO3], 1 * SIZE + (p17) FMA f107 = f14, f59, f107 + } + { .mfi + (p16) LDFD f54 = [AO11], 1 * SIZE + (p17) FMA f119 = f14, f63, f119 + } + ;; + { .mfi + (p16) LDFD f51 = [AO3], 5 * SIZE + (p17) FMA f110 = f15, f59, f110 + } + { .mfi + (p16) LDFD f55 = [AO11], 5 * SIZE + (p17) FMA f122 = f15, f63, f122 + } + ;; + { .mfi + (p17) ADD3 f101 = f15, f58, f101 + } + { .mfi + (p17) ADD3 f113 = f15, f62, f113 + } + ;; + { .mfi + (p17) ADD4 f104 = f14, f58, f104 + } + { .mfi + (p17) ADD4 f116 = f14, f62, f116 + } + ;; + { .mfi + (p17) ADD3 f107 = f15, f60, f107 + } + { .mfi + (p17) ADD3 f119 = f15, f64, f119 + } + ;; + { .mfi + (p13) PREFETCH [RPRE4], 16 * SIZE + (p17) ADD4 f110 = f14, f60, f110 + } + { .mfi + (p17) ADD4 f122 = f14, f64, f122 + } + ;; + { .mfi + (p16) LDFD f56 = [AO4], 1 * SIZE + (p17) FMA f101 = f16, f65, f101 + } + { .mfi + (p16) LDFD f60 = [AO12], 1 * SIZE + (p17) FMA f113 = f16, f69, f113 + } + ;; + { .mfi + (p16) LDFD f57 = [AO4], 1 * SIZE + (p17) FMA f104 = f17, f65, f104 + } + { .mfi + (p16) LDFD f61 = [AO12], 1 * SIZE + (p17) FMA f116 = f17, f69, f116 + } + ;; + { .mmf + (p18) STFD [YST1] = f102, 1 * SIZE + (p18) STFD [YST2] = f114, 1 * SIZE + (p17) FMA f107 = f16, f67, f107 + } + { .mmf + (p16) LDFD f58 = [AO4], 1 * SIZE + (p16) LDFD f62 = [AO12], 1 * SIZE + (p17) FMA f119 = f16, f71, f119 + } + ;; + { .mmf + (p18) STFD [YST1] = f105, 1 * SIZE + (p18) STFD [YST2] = f117, 1 * SIZE + (p17) FMA f110 = f17, f67, f110 + } + { .mmf + (p16) LDFD f59 = [AO4], 5 * SIZE + (p16) LDFD f63 = [AO12], 5 * SIZE + (p17) FMA f122 = f17, f71, f122 + } + ;; + { .mfi + (p17) ADD3 f101 = f17, f66, f101 + } + { .mfi + (p17) ADD3 f113 = f17, f70, f113 + } + ;; + { .mfi + (p17) ADD4 f104 = f16, f66, f104 + } + { .mfi + (p17) ADD4 f116 = f16, f70, f116 + } + ;; + { .mfi + (p17) ADD3 f107 = f17, f68, f107 + } + { .mfi + (p17) ADD3 f119 = f17, f72, f119 + } + ;; + { .mfi + (p12) PREFETCH [RPRE5], 16 * SIZE + (p17) ADD4 f110 = f16, f68, f110 + } + { .mfi + (p17) ADD4 f122 = f16, f72, f122 + } + ;; + { .mfi + (p16) LDFD f64 = [AO5], 1 * SIZE + (p17) FMA f101 = f18, f73, f101 + } + { .mfi + (p16) LDFD f68 = [AO13], 1 * SIZE + (p17) FMA f113 = f18, f77, f113 + } + ;; + { .mfi + (p16) LDFD f65 = [AO5], 1 * SIZE + (p17) FMA f104 = f19, f73, f104 + } + { .mfi + (p16) LDFD f69 = [AO13], 1 * SIZE + (p17) FMA f116 = f19, f77, f116 + } + ;; + { .mmf + (p18) STFD [YST1] = f108, 1 * SIZE + (p18) STFD [YST2] = f120, 1 * SIZE + (p17) FMA f107 = f18, f75, f107 + } + { .mmf + (p16) LDFD f66 = [AO5], 1 * SIZE + (p16) LDFD f70 = [AO13], 1 * SIZE + (p17) FMA f119 = f18, f79, f119 + } + ;; + { .mmf + (p18) STFD [YST1] = f111, 5 * SIZE + (p18) STFD [YST2] = f123, 5 * SIZE + (p17) FMA f110 = f19, f75, f110 + } + { .mmf + (p16) LDFD f67 = [AO5], 5 * SIZE + (p16) LDFD f71 = [AO13], 5 * SIZE + (p17) FMA f122 = f19, f79, f122 + } + ;; + { .mfi + (p17) ADD3 f101 = f19, f74, f101 + } + { .mfi + (p17) ADD3 f113 = f19, f78, f113 + } + ;; + { .mfi + (p17) ADD4 f104 = f18, f74, f104 + } + { .mfi + (p17) ADD4 f116 = f18, f78, f116 + } + ;; + { .mfi + (p17) ADD3 f107 = f19, f76, f107 + } + { .mfi + (p17) ADD3 f119 = f19, f80, f119 + } + ;; + { .mfi + (p13) PREFETCH [RPRE6], 16 * SIZE + (p17) ADD4 f110 = f18, f76, f110 + } + { .mfi + (p17) ADD4 f122 = f18, f80, f122 + } + ;; + { .mfi + (p16) LDFD f72 = [AO6], 1 * SIZE + (p17) FMA f101 = f20, f81, f101 + } + { .mfi + (p16) LDFD f76 = [AO14], 1 * SIZE + (p17) FMA f113 = f20, f85, f113 + } + ;; + { .mfi + (p16) LDFD f73 = [AO6], 1 * SIZE + (p17) FMA f104 = f21, f81, f104 + } + { .mfi + (p16) LDFD f77 = [AO14], 1 * SIZE + (p17) FMA f116 = f21, f85, f116 + } + ;; + { .mfi + (p16) LDFD f74 = [AO6], 1 * SIZE + (p17) FMA f107 = f20, f83, f107 + } + { .mfi + (p16) LDFD f78 = [AO14], 1 * SIZE + (p17) FMA f119 = f20, f87, f119 + } + ;; + { .mfi + (p16) LDFD f75 = [AO6], 5 * SIZE + (p17) FMA f110 = f21, f83, f110 + } + { .mfi + (p16) LDFD f79 = [AO14], 5 * SIZE + (p17) FMA f122 = f21, f87, f122 + } + ;; + { .mfi + (p17) ADD3 f101 = f21, f82, f101 + } + { .mfi + (p17) ADD3 f113 = f21, f86, f113 + } + ;; + { .mfi + (p17) ADD4 f104 = f20, f82, f104 + } + { .mfi + (p17) ADD4 f116 = f20, f86, f116 + } + ;; + { .mfi + (p17) ADD3 f107 = f21, f84, f107 + } + { .mfi + (p17) ADD3 f119 = f21, f88, f119 + } + ;; + { .mfi + (p12) PREFETCH [RPRE7], 16 * SIZE + (p17) ADD4 f110 = f20, f84, f110 + } + { .mfi + (p17) ADD4 f122 = f20, f88, f122 + } + ;; + { .mfi + (p16) LDFD f80 = [AO7], 1 * SIZE + (p17) FMA f101 = f22, f89, f101 + } + { .mfi + (p16) LDFD f84 = [AO15], 1 * SIZE + (p17) FMA f113 = f22, f93, f113 + } + ;; + { .mfi + (p16) LDFD f81 = [AO7], 1 * SIZE + (p17) FMA f104 = f23, f89, f104 + } + { .mfi + (p16) LDFD f85 = [AO15], 1 * SIZE + (p17) FMA f116 = f23, f93, f116 + } + ;; + { .mfi + (p16) LDFD f82 = [AO7], 1 * SIZE + (p17) FMA f107 = f22, f91, f107 + } + { .mfi + (p16) LDFD f86 = [AO15], 1 * SIZE + (p17) FMA f119 = f22, f95, f119 + } + ;; + { .mfi + (p16) LDFD f83 = [AO7], 5 * SIZE + (p17) FMA f110 = f23, f91, f110 + } + { .mfi + (p16) LDFD f87 = [AO15], 5 * SIZE + (p17) FMA f122 = f23, f95, f122 + } + ;; + { .mfi + (p17) ADD3 f101 = f23, f90, f101 + } + { .mfi + (p17) ADD3 f113 = f23, f94, f113 + } + ;; + { .mfi + (p17) ADD4 f104 = f22, f90, f104 + } + { .mfi + (p17) ADD4 f116 = f22, f94, f116 + } + ;; + { .mfi + (p17) ADD3 f107 = f23, f92, f107 + } + { .mfi + (p17) ADD3 f119 = f23, f96, f119 + } + ;; + { .mfi + (p13) PREFETCH [RPRE8], 16 * SIZE + (p17) ADD4 f110 = f22, f92, f110 + } + { .mfb + (p17) ADD4 f122 = f22, f96, f122 + br.ctop.sptk.few .L12 + } + ;; + .align 16 + +.L15: + { .mmi + (p18) STFD [YST1] = f102, 1 * SIZE + (p18) STFD [YST2] = f114, 1 * SIZE + tbit.nz p15, p0 = M, 0 + } + { .mmi + (p14) LDFD f32 = [AO1], 1 * SIZE + (p14) LDFD f80 = [YLD1], 1 * SIZE + cmp.lt p6, p0 = 1, J + } + ;; + { .mmi + (p18) STFD [YST1] = f105, 1 * SIZE + (p18) STFD [YST2] = f117, 1 * SIZE + adds J = -1, J + } + { + (p14) LDFD f33 = [AO1], 1 * SIZE + (p14) LDFD f81 = [YLD1], 1 * SIZE + and I = 3, M + } + ;; + { .mmi + (p18) STFD [YST1] = f108, 1 * SIZE + (p18) STFD [YST2] = f120, 1 * SIZE + (p6) cmp.eq.unc p7, p0 = I, r0 + } + { .mmi + (p14) LDFD f34 = [AO1], 1 * SIZE + (p14) LDFD f82 = [YLD1], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [YST1] = f111, 5 * SIZE + (p18) STFD [YST2] = f123, 5 * SIZE + } + { .mmb + (p14) LDFD f35 = [AO1], 1 * SIZE + (p14) LDFD f83 = [YLD1], 1 * SIZE + (p7) br.cond.dptk .L11 + } + ;; + (p15) LDFD f36 = [AO1], 1 * SIZE + (p15) LDFD f84 = [YLD1], 1 * SIZE + ;; + (p15) LDFD f37 = [AO1], 1 * SIZE + (p15) LDFD f85 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f38 = [AO2], 1 * SIZE + (p14) LDFD f44 = [AO3], 1 * SIZE + ;; + (p14) LDFD f39 = [AO2], 1 * SIZE + (p14) LDFD f45 = [AO3], 1 * SIZE + ;; + (p14) LDFD f40 = [AO2], 1 * SIZE + (p14) LDFD f46 = [AO3], 1 * SIZE + ;; + (p14) LDFD f41 = [AO2], 1 * SIZE + (p14) LDFD f47 = [AO3], 1 * SIZE + (p14) FMA f80 = f8, f32, f80 + ;; + (p15) LDFD f42 = [AO2], 1 * SIZE + (p15) LDFD f48 = [AO3], 1 * SIZE + (p14) FMA f81 = f9, f32, f81 + ;; + (p15) LDFD f43 = [AO2], 1 * SIZE + (p15) LDFD f49 = [AO3], 1 * SIZE + (p14) FMA f82 = f8, f34, f82 + ;; + (p14) LDFD f50 = [AO4], 1 * SIZE + (p14) LDFD f56 = [AO5], 1 * SIZE + (p14) FMA f83 = f9, f34, f83 + ;; + (p14) LDFD f51 = [AO4], 1 * SIZE + (p14) LDFD f57 = [AO5], 1 * SIZE + (p15) FMA f84 = f8, f36, f84 + ;; + (p14) LDFD f52 = [AO4], 1 * SIZE + (p14) LDFD f58 = [AO5], 1 * SIZE + (p15) FMA f85 = f9, f36, f85 + ;; + (p14) LDFD f53 = [AO4], 1 * SIZE + (p14) LDFD f59 = [AO5], 1 * SIZE + (p14) ADD3 f80 = f9, f33, f80 + ;; + (p15) LDFD f54 = [AO4], 1 * SIZE + (p15) LDFD f60 = [AO5], 1 * SIZE + (p14) ADD4 f81 = f8, f33, f81 + ;; + (p15) LDFD f55 = [AO4], 1 * SIZE + (p15) LDFD f61 = [AO5], 1 * SIZE + (p14) ADD3 f82 = f9, f35, f82 + ;; + (p14) LDFD f62 = [AO6], 1 * SIZE + (p14) LDFD f68 = [AO7], 1 * SIZE + (p14) ADD4 f83 = f8, f35, f83 + ;; + (p14) LDFD f63 = [AO6], 1 * SIZE + (p14) LDFD f69 = [AO7], 1 * SIZE + (p15) ADD3 f84 = f9, f37, f84 + ;; + (p14) LDFD f64 = [AO6], 1 * SIZE + (p14) LDFD f70 = [AO7], 1 * SIZE + (p15) ADD4 f85 = f8, f37, f85 + ;; + (p14) LDFD f65 = [AO6], 1 * SIZE + (p14) LDFD f71 = [AO7], 1 * SIZE + (p14) FMA f80 = f10, f38, f80 + ;; + (p15) LDFD f66 = [AO6], 1 * SIZE + (p15) LDFD f72 = [AO7], 1 * SIZE + (p14) FMA f81 = f11, f38, f81 + ;; + (p15) LDFD f67 = [AO6], 1 * SIZE + (p15) LDFD f73 = [AO7], 1 * SIZE + (p14) FMA f82 = f10, f40, f82 + ;; + (p14) LDFD f74 = [AO8], 1 * SIZE + (p14) FMA f83 = f11, f40, f83 + ;; + (p14) LDFD f75 = [AO8], 1 * SIZE + (p15) FMA f84 = f10, f42, f84 + ;; + (p14) LDFD f76 = [AO8], 1 * SIZE + (p15) FMA f85 = f11, f42, f85 + ;; + (p14) LDFD f77 = [AO8], 1 * SIZE + (p14) ADD3 f80 = f11, f39, f80 + ;; + (p15) LDFD f78 = [AO8], 1 * SIZE + (p14) ADD4 f81 = f10, f39, f81 + ;; + (p15) LDFD f79 = [AO8], 1 * SIZE + (p14) ADD3 f82 = f11, f41, f82 + (p14) ADD4 f83 = f10, f41, f83 + (p15) ADD3 f84 = f11, f43, f84 + (p15) ADD4 f85 = f10, f43, f85 + ;; + (p14) FMA f80 = f12, f44, f80 + (p14) FMA f81 = f13, f44, f81 + (p14) FMA f82 = f12, f46, f82 + (p14) FMA f83 = f13, f46, f83 + (p15) FMA f84 = f12, f48, f84 + (p15) FMA f85 = f13, f48, f85 + ;; + (p14) ADD3 f80 = f13, f45, f80 + (p14) ADD4 f81 = f12, f45, f81 + (p14) ADD3 f82 = f13, f47, f82 + (p14) ADD4 f83 = f12, f47, f83 + (p15) ADD3 f84 = f13, f49, f84 + (p15) ADD4 f85 = f12, f49, f85 + ;; + (p14) FMA f80 = f14, f50, f80 + (p14) FMA f81 = f15, f50, f81 + (p14) FMA f82 = f14, f52, f82 + (p14) FMA f83 = f15, f52, f83 + (p15) FMA f84 = f14, f54, f84 + (p15) FMA f85 = f15, f54, f85 + ;; + (p14) ADD3 f80 = f15, f51, f80 + (p14) ADD4 f81 = f14, f51, f81 + (p14) ADD3 f82 = f15, f53, f82 + (p14) ADD4 f83 = f14, f53, f83 + (p15) ADD3 f84 = f15, f55, f84 + (p15) ADD4 f85 = f14, f55, f85 + ;; + (p14) FMA f80 = f16, f56, f80 + (p14) FMA f81 = f17, f56, f81 + (p14) FMA f82 = f16, f58, f82 + (p14) FMA f83 = f17, f58, f83 + (p15) FMA f84 = f16, f60, f84 + (p15) FMA f85 = f17, f60, f85 + ;; + (p14) ADD3 f80 = f17, f57, f80 + (p14) ADD4 f81 = f16, f57, f81 + (p14) ADD3 f82 = f17, f59, f82 + (p14) ADD4 f83 = f16, f59, f83 + (p15) ADD3 f84 = f17, f61, f84 + (p15) ADD4 f85 = f16, f61, f85 + ;; + (p14) FMA f80 = f18, f62, f80 + (p14) FMA f81 = f19, f62, f81 + (p14) FMA f82 = f18, f64, f82 + (p14) FMA f83 = f19, f64, f83 + (p15) FMA f84 = f18, f66, f84 + (p15) FMA f85 = f19, f66, f85 + ;; + (p14) ADD3 f80 = f19, f63, f80 + (p14) ADD4 f81 = f18, f63, f81 + (p14) ADD3 f82 = f19, f65, f82 + (p14) ADD4 f83 = f18, f65, f83 + (p15) ADD3 f84 = f19, f67, f84 + (p15) ADD4 f85 = f18, f67, f85 + ;; + (p14) FMA f80 = f20, f68, f80 + (p14) FMA f81 = f21, f68, f81 + (p14) FMA f82 = f20, f70, f82 + (p14) FMA f83 = f21, f70, f83 + (p15) FMA f84 = f20, f72, f84 + (p15) FMA f85 = f21, f72, f85 + ;; + (p14) ADD3 f80 = f21, f69, f80 + (p14) ADD4 f81 = f20, f69, f81 + (p14) ADD3 f82 = f21, f71, f82 + (p14) ADD4 f83 = f20, f71, f83 + (p15) ADD3 f84 = f21, f73, f84 + (p15) ADD4 f85 = f20, f73, f85 + ;; + (p14) FMA f80 = f22, f74, f80 + (p14) FMA f81 = f23, f74, f81 + (p14) FMA f82 = f22, f76, f82 + (p14) FMA f83 = f23, f76, f83 + (p15) FMA f84 = f22, f78, f84 + (p15) FMA f85 = f23, f78, f85 + ;; + (p14) ADD3 f80 = f23, f75, f80 + (p14) ADD4 f81 = f22, f75, f81 + (p14) ADD3 f82 = f23, f77, f82 + (p14) ADD4 f83 = f22, f77, f83 + (p15) ADD3 f84 = f23, f79, f84 + (p15) ADD4 f85 = f22, f79, f85 + ;; + (p14) STFD [YST1] = f80, 1 * SIZE + ;; + (p14) STFD [YST1] = f81, 1 * SIZE + ;; + (p14) STFD [YST1] = f82, 1 * SIZE + ;; + (p14) STFD [YST1] = f83, 1 * SIZE + ;; + (p15) STFD [YST1] = f84, 1 * SIZE + ;; + (p15) STFD [YST1] = f85, 1 * SIZE + (p6) br.cond.dptk .L11 + ;; + +.L20: + { .mmi + mov YLD1 = YY + adds YLD2 = 4 * SIZE, YY + tbit.z p6, p0 = N, 2 + } + ;; + { .mmb + mov YST1 = YY + adds YST2 = 4 * SIZE, YY + (p6) br.cond.dpnt .L30 + } + ;; + LDFD f32 = [X], SIZE + LDFD f36 = [XX], SIZE + mov AO1 = A + ;; + LDFD f33 = [X], INCXM1 + LDFD f37 = [XX], INCXM1 + add AO2 = LDA, A + ;; + LDFD f34 = [X], SIZE + LDFD f38 = [XX], SIZE + shladd AO3 = LDA, 1, A + ;; + LDFD f35 = [X], INCX3M1 + LDFD f39 = [XX], INCX3M1 + shladd AO4 = LDA, 1, AO2 + ;; + shladd A = LDA, 2, A + FMPY f8 = ALPHA_R, f32 + adds AO9 = 4 * SIZE, AO1 + FMPY f9 = ALPHA_I, f32 + adds AO10 = 4 * SIZE, AO2 + FMPY f10 = ALPHA_R, f34 + adds AO11 = 4 * SIZE, AO3 + FMPY f11 = ALPHA_I, f34 + adds AO12 = 4 * SIZE, AO4 + FMPY f12 = ALPHA_R, f36 + mov pr.rot= 0 + FMPY f13 = ALPHA_I, f36 + shr I = M, 2 + FMPY f14 = ALPHA_R, f38 + tbit.nz p14, p0 = M, 1 + FMPY f15 = ALPHA_I, f38 + ;; + { .mfi + cmp.eq p6, p0 = 0, I + ADD1 f8 = ALPHA_I, f33, f8 + mov ar.ec= 2 + } + ADD2 f9 = ALPHA_R, f33, f9 + adds I = -1, I + ADD1 f10 = ALPHA_I, f35, f10 + adds PREB = RPREFETCH * SIZE, YLD1 + ADD2 f11 = ALPHA_R, f35, f11 + adds RPRE1 = RPREFETCH * SIZE, AO1 + ADD1 f12 = ALPHA_I, f37, f12 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + ADD2 f13 = ALPHA_R, f37, f13 + adds RPRE3 = RPREFETCH * SIZE, AO3 + ADD1 f14 = ALPHA_I, f39, f14 + ADD2 f15 = ALPHA_R, f39, f15 + ;; + { .mib + cmp.eq p16, p0 = r0, r0 + mov ar.lc = I + (p6) br.cond.dpnt .L25 + } + ;; + .align 16 + +.L22: + { .mfi + (p17) LDFD f57 = [AO4], 1 * SIZE + (p17) FMA f101 = f8, f33, f101 + (p16) tbit.nz.unc p12, p13 = I, 0 + } + { .mfi + (p17) LDFD f61 = [AO12], 1 * SIZE + (p17) FMA f113 = f8, f37, f113 + } + ;; + { .mfi + (p17) LDFD f58 = [AO4], 1 * SIZE + (p17) FMA f104 = f9, f33, f104 + (p16) adds I = -1, I + } + { .mfi + (p17) LDFD f62 = [AO12], 1 * SIZE + (p17) FMA f116 = f9, f37, f116 + } + ;; + { .mfi + (p17) LDFD f59 = [AO4], 1 * SIZE + (p17) FMA f107 = f8, f35, f107 + } + { .mfi + (p17) LDFD f63 = [AO12], 1 * SIZE + (p17) FMA f119 = f8, f39, f119 + } + ;; + { .mfi + (p17) LDFD f60 = [AO4], 5 * SIZE + (p17) FMA f110 = f9, f35, f110 + } + { .mfi + (p17) LDFD f64 = [AO12], 5 * SIZE + (p17) FMA f122 = f9, f39, f122 + } + ;; + { .mfi + (p12) lfetch.excl.nt2 [PREB], 16 * SIZE + (p17) ADD3 f101 = f9, f34, f101 + } + { .mfi + (p17) ADD3 f113 = f9, f38, f113 + } + ;; + { .mfi + (p16) LDFD f100 = [YLD1], 1 * SIZE + (p17) ADD4 f104 = f8, f34, f104 + } + { .mfi + (p16) LDFD f112 = [YLD2], 1 * SIZE + (p17) ADD4 f116 = f8, f38, f116 + } + ;; + { .mfi + (p16) LDFD f103 = [YLD1], 1 * SIZE + (p17) ADD3 f107 = f9, f36, f107 + } + { .mfi + (p16) LDFD f115 = [YLD2], 1 * SIZE + (p17) ADD3 f119 = f9, f40, f119 + } + ;; + { .mfi + (p12) PREFETCH [RPRE1], 16 * SIZE + (p17) ADD4 f110 = f8, f36, f110 + } + { .mfi + (p17) ADD4 f122 = f8, f40, f122 + } + ;; + { .mfi + (p16) LDFD f32 = [AO1], 1 * SIZE + (p17) FMA f101 = f10, f41, f101 + } + { .mfi + (p16) LDFD f36 = [AO9], 1 * SIZE + (p17) FMA f113 = f10, f45, f113 + } + ;; + { .mfi + (p16) LDFD f33 = [AO1], 1 * SIZE + (p17) FMA f104 = f11, f41, f104 + } + { .mfi + (p16) LDFD f37 = [AO9], 1 * SIZE + (p17) FMA f116 = f11, f45, f116 + } + ;; + { .mfi + (p16) LDFD f34 = [AO1], 1 * SIZE + (p17) FMA f107 = f10, f43, f107 + } + { .mfi + (p16) LDFD f38 = [AO9], 1 * SIZE + (p17) FMA f119 = f10, f47, f119 + } + ;; + { .mfi + (p16) LDFD f35 = [AO1], 5 * SIZE + (p17) FMA f110 = f11, f43, f110 + } + { .mfi + (p16) LDFD f39 = [AO9], 5 * SIZE + (p17) FMA f122 = f11, f47, f122 + } + ;; + { .mfi + (p16) LDFD f106 = [YLD1], 1 * SIZE + (p17) ADD3 f101 = f11, f42, f101 + } + { .mfi + (p16) LDFD f118 = [YLD2], 1 * SIZE + (p17) ADD3 f113 = f11, f46, f113 + } + ;; + { .mfi + (p16) LDFD f109 = [YLD1], 5 * SIZE + (p17) ADD4 f104 = f10, f42, f104 + } + { .mfi + (p16) LDFD f121 = [YLD2], 5 * SIZE + (p17) ADD4 f116 = f10, f46, f116 + } + ;; + { .mfi + (p17) ADD3 f107 = f11, f44, f107 + } + { .mfi + (p17) ADD3 f119 = f11, f48, f119 + } + ;; + { .mfi + (p13) PREFETCH [RPRE2], 16 * SIZE + (p17) ADD4 f110 = f10, f44, f110 + } + { .mfi + (p17) ADD4 f122 = f10, f48, f122 + } + ;; + { .mfi + (p16) LDFD f40 = [AO2], 1 * SIZE + (p17) FMA f101 = f12, f49, f101 + } + { .mfi + (p16) LDFD f44 = [AO10], 1 * SIZE + (p17) FMA f113 = f12, f53, f113 + } + ;; + { .mfi + (p16) LDFD f41 = [AO2], 1 * SIZE + (p17) FMA f104 = f13, f49, f104 + } + { .mfi + (p16) LDFD f45 = [AO10], 1 * SIZE + (p17) FMA f116 = f13, f53, f116 + } + ;; + { .mmf + (p18) STFD [YST1] = f102, 1 * SIZE + (p18) STFD [YST2] = f114, 1 * SIZE + (p17) FMA f107 = f12, f51, f107 + } + { .mmf + (p16) LDFD f42 = [AO2], 1 * SIZE + (p16) LDFD f46 = [AO10], 1 * SIZE + (p17) FMA f119 = f12, f55, f119 + } + ;; + { .mmf + (p18) STFD [YST1] = f105, 1 * SIZE + (p18) STFD [YST2] = f117, 1 * SIZE + (p17) FMA f110 = f13, f51, f110 + } + { .mmf + (p16) LDFD f43 = [AO2], 5 * SIZE + (p16) LDFD f47 = [AO10], 5 * SIZE + (p17) FMA f122 = f13, f55, f122 + } + ;; + { .mfi + (p17) ADD3 f101 = f13, f50, f101 + } + { .mfi + (p17) ADD3 f113 = f13, f54, f113 + } + ;; + { .mfi + (p17) ADD4 f104 = f12, f50, f104 + } + { .mfi + (p17) ADD4 f116 = f12, f54, f116 + } + ;; + { .mfi + (p17) ADD3 f107 = f13, f52, f107 + } + { .mfi + (p17) ADD3 f119 = f13, f56, f119 + } + ;; + { .mfi + (p12) PREFETCH [RPRE3], 16 * SIZE + (p17) ADD4 f110 = f12, f52, f110 + } + { .mfi + (p17) ADD4 f122 = f12, f56, f122 + } + ;; + { .mfi + (p16) LDFD f48 = [AO3], 1 * SIZE + (p17) FMA f101 = f14, f57, f101 + } + { .mfi + (p16) LDFD f52 = [AO11], 1 * SIZE + (p17) FMA f113 = f14, f61, f113 + } + ;; + { .mfi + (p16) LDFD f49 = [AO3], 1 * SIZE + (p17) FMA f104 = f15, f57, f104 + } + { .mfi + (p16) LDFD f53 = [AO11], 1 * SIZE + (p17) FMA f116 = f15, f61, f116 + } + ;; + { .mmf + (p18) STFD [YST1] = f108, 1 * SIZE + (p18) STFD [YST2] = f120, 1 * SIZE + (p17) FMA f107 = f14, f59, f107 + } + { .mmf + (p16) LDFD f50 = [AO3], 1 * SIZE + (p16) LDFD f54 = [AO11], 1 * SIZE + (p17) FMA f119 = f14, f63, f119 + } + ;; + { .mmf + (p18) STFD [YST1] = f111, 5 * SIZE + (p18) STFD [YST2] = f123, 5 * SIZE + (p17) FMA f110 = f15, f59, f110 + } + { .mmf + (p16) LDFD f51 = [AO3], 5 * SIZE + (p16) LDFD f55 = [AO11], 5 * SIZE + (p17) FMA f122 = f15, f63, f122 + } + ;; + { .mfi + (p17) ADD3 f101 = f15, f58, f101 + } + { .mfi + (p17) ADD3 f113 = f15, f62, f113 + } + ;; + { .mfi + (p17) ADD4 f104 = f14, f58, f104 + } + { .mfi + (p17) ADD4 f116 = f14, f62, f116 + } + ;; + { .mfi + (p17) ADD3 f107 = f15, f60, f107 + } + { .mfi + (p17) ADD3 f119 = f15, f64, f119 + } + ;; + { .mfi + (p13) PREFETCH [RPRE4], 16 * SIZE + (p17) ADD4 f110 = f14, f60, f110 + } + { .mfb + (p17) ADD4 f122 = f14, f64, f122 + br.ctop.sptk.few .L22 + } + ;; + .align 16 + +.L25: + { .mmi + (p18) STFD [YST1] = f102, 1 * SIZE + (p18) STFD [YST2] = f114, 1 * SIZE + tbit.nz p15, p0 = M, 0 + } + { .mmi + (p14) LDFD f32 = [AO1], 1 * SIZE + (p14) LDFD f80 = [YLD1], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [YST1] = f105, 1 * SIZE + (p18) STFD [YST2] = f117, 1 * SIZE + } + { .mmi + (p14) LDFD f33 = [AO1], 1 * SIZE + (p14) LDFD f81 = [YLD1], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [YST1] = f108, 1 * SIZE + (p18) STFD [YST2] = f120, 1 * SIZE + } + { .mmi + (p14) LDFD f34 = [AO1], 1 * SIZE + (p14) LDFD f82 = [YLD1], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [YST1] = f111, 5 * SIZE + (p18) STFD [YST2] = f123, 5 * SIZE + } + { .mmi + (p14) LDFD f35 = [AO1], 1 * SIZE + (p14) LDFD f83 = [YLD1], 1 * SIZE + } + ;; + (p15) LDFD f36 = [AO1], 1 * SIZE + (p15) LDFD f84 = [YLD1], 1 * SIZE + ;; + (p15) LDFD f37 = [AO1], 1 * SIZE + (p15) LDFD f85 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f38 = [AO2], 1 * SIZE + (p14) LDFD f44 = [AO3], 1 * SIZE + ;; + + (p14) LDFD f39 = [AO2], 1 * SIZE + (p14) LDFD f45 = [AO3], 1 * SIZE + (p14) FMA f80 = f8, f32, f80 + ;; + (p14) LDFD f40 = [AO2], 1 * SIZE + (p14) LDFD f46 = [AO3], 1 * SIZE + (p14) FMA f81 = f9, f32, f81 + ;; + (p14) LDFD f41 = [AO2], 1 * SIZE + (p14) LDFD f47 = [AO3], 1 * SIZE + (p14) FMA f82 = f8, f34, f82 + ;; + (p15) LDFD f42 = [AO2], 1 * SIZE + (p15) LDFD f48 = [AO3], 1 * SIZE + (p14) FMA f83 = f9, f34, f83 + ;; + (p15) LDFD f43 = [AO2], 1 * SIZE + (p15) LDFD f49 = [AO3], 1 * SIZE + (p15) FMA f84 = f8, f36, f84 + ;; + (p14) LDFD f50 = [AO4], 1 * SIZE + (p15) FMA f85 = f9, f36, f85 + ;; + (p14) LDFD f51 = [AO4], 1 * SIZE + (p14) ADD3 f80 = f9, f33, f80 + ;; + (p14) LDFD f52 = [AO4], 1 * SIZE + (p14) ADD4 f81 = f8, f33, f81 + ;; + (p14) LDFD f53 = [AO4], 1 * SIZE + (p14) ADD3 f82 = f9, f35, f82 + ;; + (p15) LDFD f54 = [AO4], 1 * SIZE + (p14) ADD4 f83 = f8, f35, f83 + ;; + (p15) LDFD f55 = [AO4], 1 * SIZE + (p15) ADD3 f84 = f9, f37, f84 + (p15) ADD4 f85 = f8, f37, f85 + ;; + (p14) FMA f80 = f10, f38, f80 + (p14) FMA f81 = f11, f38, f81 + (p14) FMA f82 = f10, f40, f82 + (p14) FMA f83 = f11, f40, f83 + (p15) FMA f84 = f10, f42, f84 + (p15) FMA f85 = f11, f42, f85 + ;; + (p14) ADD3 f80 = f11, f39, f80 + (p14) ADD4 f81 = f10, f39, f81 + (p14) ADD3 f82 = f11, f41, f82 + (p14) ADD4 f83 = f10, f41, f83 + (p15) ADD3 f84 = f11, f43, f84 + (p15) ADD4 f85 = f10, f43, f85 + ;; + (p14) FMA f80 = f12, f44, f80 + (p14) FMA f81 = f13, f44, f81 + (p14) FMA f82 = f12, f46, f82 + (p14) FMA f83 = f13, f46, f83 + (p15) FMA f84 = f12, f48, f84 + (p15) FMA f85 = f13, f48, f85 + ;; + (p14) ADD3 f80 = f13, f45, f80 + (p14) ADD4 f81 = f12, f45, f81 + (p14) ADD3 f82 = f13, f47, f82 + (p14) ADD4 f83 = f12, f47, f83 + (p15) ADD3 f84 = f13, f49, f84 + (p15) ADD4 f85 = f12, f49, f85 + ;; + (p14) FMA f80 = f14, f50, f80 + (p14) FMA f81 = f15, f50, f81 + (p14) FMA f82 = f14, f52, f82 + (p14) FMA f83 = f15, f52, f83 + (p15) FMA f84 = f14, f54, f84 + (p15) FMA f85 = f15, f54, f85 + ;; + (p14) ADD3 f80 = f15, f51, f80 + (p14) ADD4 f81 = f14, f51, f81 + (p14) ADD3 f82 = f15, f53, f82 + (p14) ADD4 f83 = f14, f53, f83 + (p15) ADD3 f84 = f15, f55, f84 + (p15) ADD4 f85 = f14, f55, f85 + ;; + (p14) STFD [YST1] = f80, 1 * SIZE + ;; + (p14) STFD [YST1] = f81, 1 * SIZE + ;; + (p14) STFD [YST1] = f82, 1 * SIZE + ;; + (p14) STFD [YST1] = f83, 1 * SIZE + ;; + (p15) STFD [YST1] = f84, 1 * SIZE + ;; + (p15) STFD [YST1] = f85, 1 * SIZE + ;; + +.L30: + { .mmi + mov YLD1 = YY + adds YLD2 = 4 * SIZE, YY + tbit.z p6, p0 = N, 1 + } + ;; + { .mmb + mov YST1 = YY + adds YST2 = 4 * SIZE, YY + (p6) br.cond.dpnt .L40 + } + ;; + LDFD f32 = [X], SIZE + mov AO1 = A + mov pr.rot= 0 + ;; + LDFD f33 = [X], INCXM1 + add AO2 = A, LDA + shr I = M, 2 + ;; + LDFD f34 = [X], SIZE + shladd A = LDA, 1, A + tbit.nz p14, p0 = M, 1 + ;; + LDFD f35 = [X], INCXM1 + cmp.eq p6, p0 = 0, I + ;; + FMPY f8 = ALPHA_R, f32 + adds AO9 = 4 * SIZE, AO1 + FMPY f9 = ALPHA_I, f32 + adds AO10 = 4 * SIZE, AO2 + FMPY f10 = ALPHA_R, f34 + mov ar.ec= 2 + FMPY f11 = ALPHA_I, f34 + ;; + adds PREB = RPREFETCH * SIZE, YLD1 + adds I = -1, I + ADD1 f8 = ALPHA_I, f33, f8 + adds RPRE1 = RPREFETCH * SIZE, AO1 + ADD2 f9 = ALPHA_R, f33, f9 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + ADD1 f10 = ALPHA_I, f35, f10 + ADD2 f11 = ALPHA_R, f35, f11 + ;; + { .mib + cmp.eq p16, p0 = r0, r0 + mov ar.lc = I + (p6) br.cond.dpnt .L35 + } + ;; + .align 16 + +.L32: + { .mfi + (p17) LDFD f41 = [AO2], 1 * SIZE + (p17) FMA f101 = f8, f33, f101 + (p16) tbit.nz.unc p12, p13 = I, 0 + } + { .mfi + (p17) LDFD f45 = [AO10], 1 * SIZE + (p17) FMA f113 = f8, f37, f113 + } + ;; + { .mfi + (p17) LDFD f42 = [AO2], 1 * SIZE + (p17) FMA f104 = f9, f33, f104 + (p16) adds I = -1, I + } + { .mfi + (p17) LDFD f46 = [AO10], 1 * SIZE + (p17) FMA f116 = f9, f37, f116 + } + ;; + { .mmf + (p18) STFD [YST1] = f102, 1 * SIZE + (p18) STFD [YST2] = f114, 1 * SIZE + (p17) FMA f107 = f8, f35, f107 + } + { .mmf + (p17) LDFD f43 = [AO2], 1 * SIZE + (p17) LDFD f47 = [AO10], 1 * SIZE + (p17) FMA f119 = f8, f39, f119 + } + ;; + { .mmf + (p18) STFD [YST1] = f105, 1 * SIZE + (p18) STFD [YST2] = f117, 1 * SIZE + (p17) FMA f110 = f9, f35, f110 + } + { .mmf + (p17) LDFD f44 = [AO2], 5 * SIZE + (p17) LDFD f48 = [AO10], 5 * SIZE + (p17) FMA f122 = f9, f39, f122 + } + ;; + { .mfi + (p12) lfetch.excl.nt2 [PREB], 16 * SIZE + (p17) ADD3 f101 = f9, f34, f101 + } + { .mfi + (p17) ADD3 f113 = f9, f38, f113 + } + ;; + { .mfi + (p16) LDFD f100 = [YLD1], 1 * SIZE + (p17) ADD4 f104 = f8, f34, f104 + } + { .mfi + (p16) LDFD f112 = [YLD2], 1 * SIZE + (p17) ADD4 f116 = f8, f38, f116 + } + ;; + { .mfi + (p16) LDFD f103 = [YLD1], 1 * SIZE + (p17) ADD3 f107 = f9, f36, f107 + } + { .mfi + (p16) LDFD f115 = [YLD2], 1 * SIZE + (p17) ADD3 f119 = f9, f40, f119 + } + ;; + { .mfi + (p12) PREFETCH [RPRE1], 16 * SIZE + (p17) ADD4 f110 = f8, f36, f110 + } + { .mfi + (p17) ADD4 f122 = f8, f40, f122 + } + ;; + { .mfi + (p16) LDFD f32 = [AO1], 1 * SIZE + (p17) FMA f101 = f10, f41, f101 + } + { .mfi + (p16) LDFD f36 = [AO9], 1 * SIZE + (p17) FMA f113 = f10, f45, f113 + } + ;; + { .mmf + (p18) STFD [YST1] = f108, 1 * SIZE + (p18) STFD [YST2] = f120, 1 * SIZE + (p17) FMA f104 = f11, f41, f104 + } + { .mmf + (p16) LDFD f33 = [AO1], 1 * SIZE + (p16) LDFD f37 = [AO9], 1 * SIZE + (p17) FMA f116 = f11, f45, f116 + } + ;; + { .mmf + (p18) STFD [YST1] = f111, 5 * SIZE + (p18) STFD [YST2] = f123, 5 * SIZE + (p17) FMA f107 = f10, f43, f107 + } + { .mmf + (p16) LDFD f34 = [AO1], 1 * SIZE + (p16) LDFD f38 = [AO9], 1 * SIZE + (p17) FMA f119 = f10, f47, f119 + } + ;; + { .mfi + (p16) LDFD f35 = [AO1], 5 * SIZE + (p17) FMA f110 = f11, f43, f110 + } + { .mfi + (p16) LDFD f39 = [AO9], 5 * SIZE + (p17) FMA f122 = f11, f47, f122 + } + ;; + { .mfi + (p16) LDFD f106 = [YLD1], 1 * SIZE + (p17) ADD3 f101 = f11, f42, f101 + } + { .mfi + (p16) LDFD f118 = [YLD2], 1 * SIZE + (p17) ADD3 f113 = f11, f46, f113 + } + ;; + { .mfi + (p16) LDFD f109 = [YLD1], 5 * SIZE + (p17) ADD4 f104 = f10, f42, f104 + } + { .mfi + (p16) LDFD f121 = [YLD2], 5 * SIZE + (p17) ADD4 f116 = f10, f46, f116 + } + ;; + { .mfi + (p17) ADD3 f107 = f11, f44, f107 + } + { .mfi + (p17) ADD3 f119 = f11, f48, f119 + } + ;; + { .mfi + (p13) PREFETCH [RPRE2], 16 * SIZE + (p17) ADD4 f110 = f10, f44, f110 + } + { .mfb + (p17) ADD4 f122 = f10, f48, f122 + br.ctop.sptk.few .L32 + } + ;; + .align 16 + +.L35: + { .mmi + (p18) STFD [YST1] = f102, 1 * SIZE + (p18) STFD [YST2] = f114, 1 * SIZE + tbit.nz p15, p0 = M, 0 + } + { .mmi + (p14) LDFD f32 = [AO1], 1 * SIZE + (p14) LDFD f80 = [YLD1], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [YST1] = f105, 1 * SIZE + (p18) STFD [YST2] = f117, 1 * SIZE + } + { .mmi + (p14) LDFD f33 = [AO1], 1 * SIZE + (p14) LDFD f81 = [YLD1], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [YST1] = f108, 1 * SIZE + (p18) STFD [YST2] = f120, 1 * SIZE + } + { .mmi + (p14) LDFD f34 = [AO1], 1 * SIZE + (p14) LDFD f82 = [YLD1], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [YST1] = f111, 5 * SIZE + (p18) STFD [YST2] = f123, 5 * SIZE + } + { .mmi + (p14) LDFD f35 = [AO1], 1 * SIZE + (p14) LDFD f83 = [YLD1], 1 * SIZE + } + ;; + (p15) LDFD f36 = [AO1], 1 * SIZE + (p15) LDFD f84 = [YLD1], 1 * SIZE + ;; + (p15) LDFD f37 = [AO1], 1 * SIZE + (p15) LDFD f85 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f38 = [AO2], 1 * SIZE + (p14) FMA f80 = f8, f32, f80 + ;; + (p14) LDFD f39 = [AO2], 1 * SIZE + (p14) FMA f81 = f9, f32, f81 + ;; + (p14) LDFD f40 = [AO2], 1 * SIZE + (p14) FMA f82 = f8, f34, f82 + ;; + (p14) LDFD f41 = [AO2], 1 * SIZE + (p14) FMA f83 = f9, f34, f83 + ;; + (p15) LDFD f42 = [AO2], 1 * SIZE + (p15) FMA f84 = f8, f36, f84 + ;; + (p15) LDFD f43 = [AO2], 1 * SIZE + (p15) FMA f85 = f9, f36, f85 + ;; + (p14) ADD3 f80 = f9, f33, f80 + (p14) ADD4 f81 = f8, f33, f81 + (p14) ADD3 f82 = f9, f35, f82 + (p14) ADD4 f83 = f8, f35, f83 + (p15) ADD3 f84 = f9, f37, f84 + (p15) ADD4 f85 = f8, f37, f85 + ;; + (p14) FMA f80 = f10, f38, f80 + (p14) FMA f81 = f11, f38, f81 + (p14) FMA f82 = f10, f40, f82 + (p14) FMA f83 = f11, f40, f83 + (p15) FMA f84 = f10, f42, f84 + (p15) FMA f85 = f11, f42, f85 + ;; + (p14) ADD3 f80 = f11, f39, f80 + (p14) ADD4 f81 = f10, f39, f81 + (p14) ADD3 f82 = f11, f41, f82 + (p14) ADD4 f83 = f10, f41, f83 + (p15) ADD3 f84 = f11, f43, f84 + (p15) ADD4 f85 = f10, f43, f85 + ;; + (p14) STFD [YST1] = f80, 1 * SIZE + ;; + (p14) STFD [YST1] = f81, 1 * SIZE + ;; + (p14) STFD [YST1] = f82, 1 * SIZE + ;; + (p14) STFD [YST1] = f83, 1 * SIZE + ;; + (p15) STFD [YST1] = f84, 1 * SIZE + ;; + (p15) STFD [YST1] = f85, 1 * SIZE + ;; + +.L40: + { .mmi + mov YLD1 = YY + adds YLD2 = 4 * SIZE, YY + tbit.z p6, p0 = N, 0 + } + { .mmb + mov YST1 = YY + adds YST2 = 4 * SIZE, YY + (p6) br.cond.dpnt .L990 + } + ;; + LDFD f32 = [X], SIZE + mov AO1 = A + adds AO9 = 4 * SIZE, A + ;; + LDFD f33 = [X], INCXM1 + add A = A, LDA + mov pr.rot= 0 + ;; + { .mfi + adds PREB = RPREFETCH * SIZE, YLD1 + FMPY f8 = ALPHA_R, f32 + mov ar.ec= 2 + } + { .mfi + adds RPRE1 = RPREFETCH * SIZE, AO1 + FMPY f9 = ALPHA_I, f32 + shr I = M, 2 + } + ;; + { .mmf + cmp.eq p6, p0 = 0, I + cmp.eq p16, p0 = r0, r0 + ADD1 f8 = ALPHA_I, f33, f8 + } + { .mfi + adds I = -1, I + ADD2 f9 = ALPHA_R, f33, f9 + tbit.nz p14, p0 = M, 1 + } + ;; + { .mib + nop __LINE__ + mov ar.lc = I + (p6) br.cond.dpnt .L45 + } + ;; + .align 16 + +.L42: + { .mmf + (p16) LDFD f100 = [YLD1], 1 * SIZE + (p16) LDFD f112 = [YLD2], 1 * SIZE + (p17) FMA f101 = f8, f33, f101 + } + { .mmf + (p16) LDFD f32 = [AO1], 1 * SIZE + (p16) LDFD f44 = [AO9], 1 * SIZE + (p17) FMA f113 = f8, f45, f113 + } + ;; + { .mmf + (p16) LDFD f103 = [YLD1], 1 * SIZE + (p16) LDFD f115 = [YLD2], 1 * SIZE + (p17) FMA f104 = f9, f33, f104 + } + { .mmf + (p16) LDFD f35 = [AO1], 1 * SIZE + (p16) LDFD f47 = [AO9], 1 * SIZE + (p17) FMA f116 = f9, f45, f116 + } + ;; + { .mmf + (p16) LDFD f106 = [YLD1], 1 * SIZE + (p16) LDFD f118 = [YLD2], 1 * SIZE + (p17) FMA f107 = f8, f39, f107 + } + { .mmf + (p16) LDFD f38 = [AO1], 1 * SIZE + (p16) LDFD f50 = [AO9], 1 * SIZE + (p17) FMA f119 = f8, f51, f119 + } + ;; + { .mmf + (p16) LDFD f109 = [YLD1], 5 * SIZE + (p16) LDFD f121 = [YLD2], 5 * SIZE + (p17) FMA f110 = f9, f39, f110 + } + { .mmf + (p16) LDFD f41 = [AO1], 5 * SIZE + (p16) LDFD f53 = [AO9], 5 * SIZE + (p17) FMA f122 = f9, f51, f122 + } + ;; + { .mmf + (p18) STFD [YST1] = f102, 1 * SIZE + (p18) STFD [YST2] = f114, 1 * SIZE + (p17) ADD3 f101 = f9, f36, f101 + } + { .mfi + (p17) ADD3 f113 = f9, f48, f113 + (p16) tbit.nz.unc p12, p13 = I, 0 + } + ;; + { .mmf + (p18) STFD [YST1] = f105, 1 * SIZE + (p18) STFD [YST2] = f117, 1 * SIZE + (p17) ADD4 f104 = f8, f36, f104 + } + { .mfi + (p12) PREFETCH [RPRE1], 16 * SIZE + (p17) ADD4 f116 = f8, f48, f116 + } + ;; + { .mmf + (p18) STFD [YST1] = f108, 1 * SIZE + (p18) STFD [YST2] = f120, 1 * SIZE + (p17) ADD3 f107 = f9, f42, f107 + } + { .mfi + (p13) lfetch.excl.nt2 [PREB], 16 * SIZE + (p17) ADD3 f119 = f9, f54, f119 + } + ;; + { .mmf + (p18) STFD [YST1] = f111, 5 * SIZE + (p18) STFD [YST2] = f123, 5 * SIZE + (p17) ADD4 f110 = f8, f42, f110 + } + { .mfb + (p17) ADD4 f122 = f8, f54, f122 + br.ctop.sptk.few .L42 + } + ;; + .align 16 + +.L45: + { .mmi + (p18) STFD [YST1] = f102, 1 * SIZE + (p18) STFD [YST2] = f114, 1 * SIZE + tbit.nz p15, p0 = M, 0 + } + { .mmi + (p14) LDFD f32 = [AO1], 1 * SIZE + (p14) LDFD f80 = [YLD1], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [YST1] = f105, 1 * SIZE + (p18) STFD [YST2] = f117, 1 * SIZE + } + { .mmi + (p14) LDFD f33 = [AO1], 1 * SIZE + (p14) LDFD f81 = [YLD1], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [YST1] = f108, 1 * SIZE + (p18) STFD [YST2] = f120, 1 * SIZE + } + { .mmi + (p14) LDFD f34 = [AO1], 1 * SIZE + (p14) LDFD f82 = [YLD1], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [YST1] = f111, 5 * SIZE + (p18) STFD [YST2] = f123, 5 * SIZE + } + { .mmi + (p14) LDFD f35 = [AO1], 1 * SIZE + (p14) LDFD f83 = [YLD1], 1 * SIZE + } + ;; + (p15) LDFD f36 = [AO1], 1 * SIZE + (p15) LDFD f84 = [YLD1], 1 * SIZE + ;; + (p15) LDFD f37 = [AO1], 1 * SIZE + (p15) LDFD f85 = [YLD1], 1 * SIZE + ;; + (p14) FMA f80 = f8, f32, f80 + (p14) FMA f81 = f9, f32, f81 + (p14) FMA f82 = f8, f34, f82 + (p14) FMA f83 = f9, f34, f83 + (p15) FMA f84 = f8, f36, f84 + (p15) FMA f85 = f9, f36, f85 + ;; + (p14) ADD3 f80 = f9, f33, f80 + (p14) ADD4 f81 = f8, f33, f81 + (p14) ADD3 f82 = f9, f35, f82 + (p14) ADD4 f83 = f8, f35, f83 + (p15) ADD3 f84 = f9, f37, f84 + (p15) ADD4 f85 = f8, f37, f85 + ;; + (p14) STFD [YST1] = f80, 1 * SIZE + ;; + (p14) STFD [YST1] = f81, 1 * SIZE + ;; + (p14) STFD [YST1] = f82, 1 * SIZE + ;; + (p14) STFD [YST1] = f83, 1 * SIZE + ;; + (p15) STFD [YST1] = f84, 1 * SIZE + ;; + (p15) STFD [YST1] = f85, 1 * SIZE + ;; + +.L990: + { .mmi + mov YST1 = Y + mov YST2 = Y + mov pr.rot= 0 + } + { .mib + mov YLD1 = YY + shr J = M, 2 + (p10) br.cond.dptk .L999 + } + ;; + { .mmi + cmp.eq p6, p0 = r0, J + adds INCY = - SIZE, INCY + mov ar.ec = 4 + } + { .mmi + cmp.eq p16, p0 = r0, r0 + adds J = -1, J + tbit.nz p13, p0 = M, 1 + } + ;; + { .mib + nop __LINE__ + mov ar.lc = J + (p6) br.cond.dpnt .L995 + } + ;; +.L992: + { .mfi + (p19) STFD [YST2] = f35, 1 * SIZE + (p18) FADD f34 = f34, f66 + } + { .mmi + (p16) LDFD f64 = [YLD1], 1 * SIZE + (p16) LDFD f32 = [YST1], 1 * SIZE + } + ;; + { .mfi + (p19) STFD [YST2] = f39 + (p18) FADD f38 = f38, f70 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f68 = [YLD1], 1 * SIZE + (p16) LDFD f36 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f43, 1 * SIZE + (p18) FADD f42 = f42, f74 + } + { .mmi + (p16) LDFD f72 = [YLD1], 1 * SIZE + (p16) LDFD f40 = [YST1], 1 * SIZE + } + ;; + { .mfi + (p19) STFD [YST2] = f47 + (p18) FADD f50 = f50, f82 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f76 = [YLD1], 1 * SIZE + (p16) LDFD f44 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f51, 1 * SIZE + (p18) FADD f54 = f54, f86 + } + { .mmi + (p16) LDFD f80 = [YLD1], 1 * SIZE + (p16) LDFD f48 = [YST1], 1 * SIZE + } + ;; + { .mfi + (p19) STFD [YST2] = f55 + (p18) FADD f58 = f58, f90 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f84 = [YLD1], 1 * SIZE + (p16) LDFD f52 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f59, 1 * SIZE + (p18) FADD f46 = f46, f78 + } + { .mmi + (p16) LDFD f88 = [YLD1], 1 * SIZE + (p16) LDFD f56 = [YST1], 1 * SIZE + } + ;; + { .mfi + (p19) STFD [YST2] = f63 + (p18) FADD f62 = f62, f94 + (p19) add YST2 = YST2, INCY + } + { .mmb + (p16) LDFD f92 = [YLD1], 1 * SIZE + (p16) LDFD f60 = [YST1], INCY + br.ctop.sptk.few .L992 + } + ;; + +.L995: + (p13) LDFD f32 = [YST1], 1 * SIZE + (p13) LDFD f40 = [YLD1], 1 * SIZE + tbit.nz p14, p0 = M, 0 + ;; + (p13) LDFD f33 = [YST1], INCY + (p13) LDFD f41 = [YLD1], 1 * SIZE + ;; + (p13) LDFD f34 = [YST1], 1 * SIZE + (p13) LDFD f42 = [YLD1], 1 * SIZE + ;; + (p13) LDFD f35 = [YST1], INCY + (p13) LDFD f43 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f36 = [YST1], 1 * SIZE + (p14) LDFD f44 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f37 = [YST1], INCY + (p14) LDFD f45 = [YLD1], 1 * SIZE + ;; + (p13) FADD f32 = f32, f40 + (p13) FADD f33 = f33, f41 + (p13) FADD f34 = f34, f42 + (p13) FADD f35 = f35, f43 + (p14) FADD f36 = f36, f44 + (p14) FADD f37 = f37, f45 + ;; + (p13) STFD [YST2] = f32, 1 * SIZE + ;; + (p13) STFD [YST2] = f33 + (p13) add YST2 = YST2, INCY + ;; + (p13) STFD [YST2] = f34, 1 * SIZE + ;; + (p13) STFD [YST2] = f35 + (p13) add YST2 = YST2, INCY + ;; + (p14) STFD [YST2] = f36, 1 * SIZE + ;; + (p14) STFD [YST2] = f37 + ;; + +.L999: + mov r8 = r0 + adds r9 = 1 * 16, SP + ;; + ldf.fill f16 = [SP], 32 + ldf.fill f17 = [r9], 32 + mov ar.lc = ARLC + ;; + ldf.fill f18 = [SP], 32 + ldf.fill f19 = [r9], 32 + mov pr = PR, -1 + ;; + ldf.fill f20 = [SP], 32 + ldf.fill f21 = [r9], 32 + mov ar.pfs = ARPFS + ;; + ldf.fill f22 = [SP], 32 + ldf.fill f23 = [r9] + br.ret.sptk.many b0 + ;; + EPILOGUE diff --git a/kernel/ia64/zgemv_t.S b/kernel/ia64/zgemv_t.S new file mode 100644 index 0000000..73e6df0 --- /dev/null +++ b/kernel/ia64/zgemv_t.S @@ -0,0 +1,2017 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define SP r12 + +#define M r32 +#define N r33 +#define A r37 +#define LDA r38 +#define X r39 +#define INCX r34 +#define Y r35 +#define INCY r36 +#define BUFFER r11 + +#define I r15 +#define J r16 +#define AO1 r18 +#define AO2 r19 +#define AO3 r20 +#define AO4 r21 +#define AO5 r22 +#define AO6 r23 +#define AO7 r24 +#define AO8 r25 +#define BO r26 +#define INCYM1 r28 + +#define RPRE1 loc0 +#define RPRE2 loc1 +#define RPRE3 loc2 +#define RPRE4 loc3 +#define RPRE5 loc4 +#define RPRE6 loc5 +#define RPRE7 loc6 +#define RPRE8 loc7 + +#define AO21 loc8 +#define AO41 loc9 +#define AO61 loc10 +#define AO81 loc11 +#define CLD1 loc12 +#define CLD2 loc13 +#define CST1 loc14 +#define CST2 loc15 + +#define PREB r8 +#define WPRE r9 +#define OFFSET PREB +#define INCX3M1 WPRE +#define INCY3M1 r10 + +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#ifdef DOUBLE +#define RPREFETCH (16 * 2 + 8) +#else +#define RPREFETCH (16 * 2 + 16) +#endif +#define PREFETCH lfetch.nt1 + +#define ALPHA_R f6 +#define ALPHA_I f7 + +#if !defined(CONJ) && !defined(XCONJ) +#define ADD1 FMA +#define ADD2 FMA +#define ADD3 FNMA +#define ADD4 FMA +#elif defined(CONJ) && !defined(XCONJ) +#define ADD1 FMA +#define ADD2 FMA +#define ADD3 FMA +#define ADD4 FNMA +#elif !defined(CONJ) && defined(XCONJ) +#define ADD1 FMA +#define ADD2 FNMA +#define ADD3 FMA +#define ADD4 FMA +#else +#define ADD1 FMA +#define ADD2 FNMA +#define ADD3 FNMA +#define ADD4 FNMA +#endif + + PROLOGUE + PROFCODE + .prologue + + { .mmi + .save ar.pfs, ARPFS + alloc ARPFS = ar.pfs, 8, 16, 0, 0 + adds r14 = 16, SP + mov ARLC = ar.lc + } + { .mmi + adds r8 = -8 * 16, SP + adds r9 = -7 * 16, SP + adds SP = -8 * 16, SP + } + ;; + { .mmi + stf.spill [r8] = f16, 32 + stf.spill [r9] = f17, 32 + mov PR = pr + } + ;; + { .mmi + stf.spill [r8] = f18, 32 + stf.spill [r9] = f19, 32 + adds r15 = 152, SP + } + ;; + { .mmi + stf.spill [r8] = f20, 32 + stf.spill [r9] = f21, 32 + adds r16 = 160, SP + } + ;; + { .mmi + stf.spill [r8] = f22 + stf.spill [r9] = f23 + adds r17 = 168, SP + } + .body + ;; + { .mmf + ld8 INCX = [r14] + ld8 Y = [r15] + mov ALPHA_R = f8 + } + { .mmf + ld8 INCY = [r16] + ld8 BUFFER = [r17] + mov ALPHA_I = f9 + } + ;; + { .mmi + shladd INCX = INCX, ZBASE_SHIFT, r0 + shladd LDA = LDA, ZBASE_SHIFT, r0 + mov pr.rot= 0 + } + { .mmi + cmp.ge p7, p0 = 0, M + cmp.ge p6, p0 = 0, N + shladd INCY = INCY, ZBASE_SHIFT, r0 + } + ;; + { .mmi + mov AO1 = BUFFER + adds OFFSET = -SIZE, INCX + shr I = M, 3 + } + { .mib + adds INCYM1 = - SIZE, INCY + shladd INCX3M1 = INCX, 1, INCX + (p7) br.cond.dpnt .L999 + } + ;; + { .mmi + shladd BO = INCX, 1, X + adds AO2 = 4 * SIZE, BUFFER + mov ar.ec= 5 + } + { .mmb + shladd INCY3M1 = INCY, 1, INCYM1 + adds I = -1, I + (p6) br.cond.dpnt .L999 + } + ;; + { .mmi + adds INCX3M1 = -SIZE, INCX3M1 + cmp.eq p16, p0 = r0, r0 + tbit.nz p13, p0 = M, 2 + } + { .mib + cmp.gt p6, p0 = 0, I + mov ar.lc = I + (p6) br.cond.dpnt .L05 + } + ;; + .align 16 + +.L01: + (p20) STFD [AO1] = f36, SIZE + (p20) STFD [AO2] = f56, SIZE + (p16) LDFD f32 = [X], SIZE + (p16) LDFD f52 = [BO], SIZE + ;; + (p20) STFD [AO1] = f41, SIZE + (p20) STFD [AO2] = f61, SIZE + (p16) LDFD f37 = [X], OFFSET + (p16) LDFD f57 = [BO], OFFSET + ;; + (p20) STFD [AO1] = f46, SIZE + (p20) STFD [AO2] = f66, SIZE + (p16) LDFD f42 = [X], SIZE + (p16) LDFD f62 = [BO], SIZE + ;; + (p20) STFD [AO1] = f51, 5 * SIZE + (p20) STFD [AO2] = f71, 5 * SIZE + (p16) LDFD f47 = [X], INCX3M1 + (p16) LDFD f67 = [BO], INCX3M1 + ;; + (p20) STFD [AO1] = f76, SIZE + (p20) STFD [AO2] = f96, SIZE + (p16) LDFD f72 = [X], SIZE + (p16) LDFD f92 = [BO], SIZE + ;; + (p20) STFD [AO1] = f81, SIZE + (p20) STFD [AO2] = f101, SIZE + (p16) LDFD f77 = [X], OFFSET + (p16) LDFD f97 = [BO], OFFSET + ;; + (p20) STFD [AO1] = f86, SIZE + (p20) STFD [AO2] = f106, SIZE + (p16) LDFD f82 = [X], SIZE + (p16) LDFD f102 = [BO], SIZE + ;; + (p20) STFD [AO1] = f91, 5 * SIZE + (p20) STFD [AO2] = f111, 5 * SIZE + (p16) LDFD f87 = [X], INCX3M1 + (p16) LDFD f107 = [BO], INCX3M1 + br.ctop.sptk.few .L01 + ;; + .align 16 + +.L05: + { .mmi + (p13) LDFD f32 = [X], SIZE + (p13) LDFD f36 = [BO], SIZE + tbit.nz p14, p0 = M, 1 + } + ;; + { .mmi + (p13) LDFD f33 = [X], OFFSET + (p13) LDFD f37 = [BO], OFFSET + tbit.nz p15, p0 = M, 0 + } + ;; + { .mmb + (p13) LDFD f34 = [X], SIZE + (p13) LDFD f38 = [BO], SIZE + } + ;; + { .mmi + (p13) LDFD f35 = [X], INCX3M1 + (p13) LDFD f39 = [BO], INCX3M1 + } + ;; + { .mmi + (p14) LDFD f40 = [X], SIZE + } + ;; + (p14) LDFD f41 = [X], OFFSET + (p13) STFD [AO1] = f32, SIZE + tbit.nz p8, p0 = A, BASE_SHIFT + ;; + (p14) LDFD f42 = [X], SIZE + (p13) STFD [AO2] = f36, SIZE + ;; + (p14) LDFD f43 = [X], OFFSET + (p13) STFD [AO1] = f33, SIZE + ;; + (p15) LDFD f44 = [X], SIZE + (p13) STFD [AO2] = f37, SIZE + ;; + (p15) LDFD f45 = [X], OFFSET + (p13) STFD [AO1] = f34, SIZE + (p13) STFD [AO2] = f38, SIZE + ;; + (p13) STFD [AO1] = f35, 5 * SIZE + (p13) STFD [AO2] = f39, 5 * SIZE + ;; + (p14) STFD [AO1] = f40, SIZE + ;; + (p14) STFD [AO1] = f41, SIZE + ;; + (p14) STFD [AO1] = f42, SIZE + ;; + (p14) STFD [AO1] = f43, SIZE + ;; + (p15) STFD [AO1] = f44, SIZE + ;; + (p15) STFD [AO1] = f45, SIZE + (p8) br.cond.dpnt .L100 + ;; + .align 16 + +.L10: + { .mmi + mov CLD1 = Y + shladd CLD2 = INCY, 1, Y + shr J = N, 3 + } + ;; + { .mmb + mov CST1 = Y + cmp.eq p6, p0 = r0, J + (p6) br.cond.dpnt .L20 + } + ;; + .align 16 + +.L11: + { .mfi + mov AO1 = A + mov f8 = f0 + mov pr.rot= 0 + } + { .mfi + add AO2 = LDA, A + mov f10 = f0 + mov BO = BUFFER + } + ;; + { .mmf + shladd AO3 = LDA, 1, A + shladd AO4 = LDA, 1, AO2 + mov f12 = f0 + } + { .mmf + adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1 + adds RPRE2 = (RPREFETCH + 2) * SIZE, AO2 + mov f14 = f0 + } + ;; + { .mmf + shladd AO5 = LDA, 1, AO3 + shladd AO6 = LDA, 1, AO4 + mov f16 = f0 + } + { .mmf + adds RPRE3 = (RPREFETCH + 4) * SIZE, AO3 + adds RPRE4 = (RPREFETCH + 6) * SIZE, AO4 + mov f18 = f0 + } + ;; + { .mmf + shladd AO7 = LDA, 1, AO5 + shladd AO8 = LDA, 1, AO6 + mov f20 = f0 + } + { .mmf + adds RPRE5 = (RPREFETCH + 8) * SIZE, AO5 + adds RPRE6 = (RPREFETCH + 10) * SIZE, AO6 + mov f22 = f0 + } + ;; + { .mfi + shladd A = LDA, 3, A + mov f9 = f0 + mov ar.ec= 5 + } + { .mmf + adds RPRE7 = (RPREFETCH + 12) * SIZE, AO7 + adds RPRE8 = (RPREFETCH + 14) * SIZE, AO8 + mov f11 = f0 + } + ;; + { .mmf + adds WPRE = 16 * SIZE, CLD1 + adds PREB = RPREFETCH * SIZE, BO + mov f13 = f0 + } + { .mmf + adds I = -1, M + cmp.eq p16, p0 = r0, r0 + mov f15 = f0 + } + ;; + { .mfi + cmp.eq p12, p0 = r0, r0 + mov f17 = f0 + mov ar.lc = I + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f19 = f0 + } + ;; + { .mmf + lfetch.excl.nt1 [WPRE] + nop __LINE__ + mov f21 = f0 + } + { .mmf + mov I = 0 + nop __LINE__ + mov f23 = f0 + } + ;; + .align 16 + +.L16: + { .mmf + (p12) PREFETCH [RPRE1], 16 * SIZE + (p16) LDFPD f32, f37 = [AO1], 2 * SIZE + (p20) ADD1 f8 = f116, f36, f8 + } + { .mmf + (p16) cmp.eq.unc p13, p0 = 1, I + nop __LINE__ + (p20) ADD2 f9 = f121, f36, f9 + } + ;; + { .mmf + (p13) PREFETCH [PREB], 16 * SIZE + (p16) LDFPD f112, f117 = [BO], 2 * SIZE + (p20) ADD1 f10 = f116, f46, f10 + } + { .mmf + (p16) cmp.eq.unc p14, p0 = 2, I + (p16) cmp.eq.unc p15, p0 = 3, I + (p20) ADD2 f11 = f121, f46, f11 + } + ;; + { .mmf + (p16) LDFPD f42, f47 = [AO2], 2 * SIZE + nop __LINE__ + (p20) ADD1 f12 = f116, f56, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f13 = f121, f56, f13 + } + ;; + { .mmf + (p13) PREFETCH [RPRE2], 16 * SIZE + nop __LINE__ + (p20) ADD1 f14 = f116, f66, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f15 = f121, f66, f15 + } + ;; + { .mmf + (p16) LDFPD f52, f57 = [AO3], 2 * SIZE + nop __LINE__ + (p20) ADD3 f8 = f121, f41, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD4 f9 = f116, f41, f9 + } + ;; + { .mmf + (p14) PREFETCH [RPRE3], 16 * SIZE + nop __LINE__ + (p20) ADD3 f10 = f121, f51, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD4 f11 = f116, f51, f11 + } + ;; + { .mmf + (p16) LDFPD f62, f67 = [AO4], 2 * SIZE + nop __LINE__ + (p20) ADD3 f12 = f121, f61, f12 + } + { .mmf + (p16) cmp.eq.unc p12, p0 = 4, I + (p16) cmp.eq.unc p13, p0 = 5, I + (p20) ADD4 f13 = f116, f61, f13 + } + ;; + { .mmf + (p15) PREFETCH [RPRE4], 16 * SIZE + nop __LINE__ + (p20) ADD3 f14 = f121, f71, f14 + } + { .mmf + (p16) cmp.eq.unc p14, p0 = 6, I + (p16) cmp.eq.unc p15, p0 = 7, I + (p20) ADD4 f15 = f116, f71, f15 + } + ;; + { .mmf + (p16) LDFPD f72, f77 = [AO5], 2 * SIZE + nop __LINE__ + (p20) ADD1 f16 = f116, f76, f16 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f17 = f121, f76, f17 + } + ;; + { .mmf + (p12) PREFETCH [RPRE5], 16 * SIZE + nop __LINE__ + (p20) ADD1 f18 = f116, f86, f18 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f19 = f121, f86, f19 + } + ;; + { .mmf + (p16) LDFPD f82, f87 = [AO6], 2 * SIZE + nop __LINE__ + (p20) ADD1 f20 = f116, f96, f20 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f21 = f121, f96, f21 + } + ;; + { .mmf + (p13) PREFETCH [RPRE6], 16 * SIZE + nop __LINE__ + (p20) ADD1 f22 = f116, f106, f22 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f23 = f121, f106, f23 + } + ;; + { .mmf + (p16) LDFPD f92, f97 = [AO7], 2 * SIZE + nop __LINE__ + (p20) ADD3 f16 = f121, f81, f16 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD4 f17 = f116, f81, f17 + } + ;; + { .mmf + (p14) PREFETCH [RPRE7], 16 * SIZE + nop __LINE__ + (p20) ADD3 f18 = f121, f91, f18 + } + { .mmf + nop __LINE__ + (p16) adds I = 1, I + (p20) ADD4 f19 = f116, f91, f19 + } + ;; + { .mmf + (p16) LDFPD f102, f107 = [AO8], 2 * SIZE + nop __LINE__ + (p20) ADD3 f20 = f121, f101, f20 + } + { .mmf + (p15) mov I = 0 + nop __LINE__ + (p20) ADD4 f21 = f116, f101, f21 + } + ;; + { .mmf + (p15) PREFETCH [RPRE8], 16 * SIZE + nop __LINE__ + (p20) ADD3 f22 = f121, f111, f22 + } + { .mfb + (p16) cmp.eq.unc p12, p0 = 0, I + (p20) ADD4 f23 = f116, f111, f23 + br.ctop.sptk.few .L16 + } + ;; + +.L18: + LDFD f32 = [CLD1], SIZE + LDFD f36 = [CLD2], SIZE + shladd CST2 = INCY, 1, CST1 + ;; + LDFD f33 = [CLD1], INCYM1 + LDFD f37 = [CLD2], INCYM1 + ;; + LDFD f34 = [CLD1], SIZE + LDFD f38 = [CLD2], SIZE + ;; + LDFD f35 = [CLD1], INCY3M1 + LDFD f39 = [CLD2], INCY3M1 + ;; + LDFD f40 = [CLD1], SIZE + LDFD f44 = [CLD2], SIZE + ;; + LDFD f41 = [CLD1], INCYM1 + LDFD f45 = [CLD2], INCYM1 + ;; + LDFD f42 = [CLD1], SIZE + LDFD f46 = [CLD2], SIZE + ;; + LDFD f43 = [CLD1], INCY3M1 + LDFD f47 = [CLD2], INCY3M1 + ;; + FMA f32 = ALPHA_R, f8, f32 + FMA f36 = ALPHA_R, f12, f36 + FMA f33 = ALPHA_I, f8, f33 + FMA f37 = ALPHA_I, f12, f37 + FMA f34 = ALPHA_R, f10, f34 + FMA f38 = ALPHA_R, f14, f38 + FMA f35 = ALPHA_I, f10, f35 + FMA f39 = ALPHA_I, f14, f39 + ;; + FNMA f32 = ALPHA_I, f9, f32 + FNMA f36 = ALPHA_I, f13, f36 + FMA f33 = ALPHA_R, f9, f33 + FMA f37 = ALPHA_R, f13, f37 + FNMA f34 = ALPHA_I, f11, f34 + FNMA f38 = ALPHA_I, f15, f38 + FMA f35 = ALPHA_R, f11, f35 + FMA f39 = ALPHA_R, f15, f39 + ;; + FMA f40 = ALPHA_R, f16, f40 + FMA f44 = ALPHA_R, f20, f44 + FMA f41 = ALPHA_I, f16, f41 + FMA f45 = ALPHA_I, f20, f45 + FMA f42 = ALPHA_R, f18, f42 + FMA f46 = ALPHA_R, f22, f46 + FMA f43 = ALPHA_I, f18, f43 + FMA f47 = ALPHA_I, f22, f47 + ;; + { .mmf + STFD [CST1] = f32, SIZE + STFD [CST2] = f36, SIZE + FNMA f40 = ALPHA_I, f17, f40 + } + { .mmf + nop __LINE__ + nop __LINE__ + FNMA f44 = ALPHA_I, f21, f44 + } + ;; + { .mmf + STFD [CST1] = f33 + STFD [CST2] = f37 + FMA f41 = ALPHA_R, f17, f41 + } + { .mmf + add CST1 = CST1, INCYM1 + add CST2 = CST2, INCYM1 + FMA f45 = ALPHA_R, f21, f45 + } + ;; + { .mmf + STFD [CST1] = f34, SIZE + STFD [CST2] = f38, SIZE + FNMA f42 = ALPHA_I, f19, f42 + } + { .mmf + nop __LINE__ + nop __LINE__ + FNMA f46 = ALPHA_I, f23, f46 + } + ;; + { .mmf + STFD [CST1] = f35 + STFD [CST2] = f39 + FMA f43 = ALPHA_R, f19, f43 + } + { .mmf + add CST1 = CST1, INCY3M1 + add CST2 = CST2, INCY3M1 + FMA f47 = ALPHA_R, f23, f47 + } + ;; + { .mmi + STFD [CST1] = f40, SIZE + STFD [CST2] = f44, SIZE + adds J = -1, J + } + ;; + { .mmi + STFD [CST1] = f41 + STFD [CST2] = f45 + add CST1 = CST1, INCYM1 + } + { .mmi + nop __LINE__ + nop __LINE__ + add CST2 = CST2, INCYM1 + } + ;; + { .mmi + STFD [CST1] = f42, SIZE + STFD [CST2] = f46, SIZE + cmp.lt p6, p0 = 0, J + } + ;; + { .mmi + STFD [CST1] = f43 + STFD [CST2] = f47 + add CST1 = CST1, INCY3M1 + } + { .mmb + add CST2 = CST2, INCY3M1 + (p6) br.cond.dptk .L11 + } + ;; + .align 16 + +.L20: + { .mfi + mov AO1 = A + mov f8 = f0 + mov pr.rot= 0 + } + { .mfi + add AO2 = LDA, A + mov f10 = f0 + tbit.z p6, p0 = N, 2 + } + ;; + { .mmf + shladd AO3 = LDA, 1, A + shladd AO4 = LDA, 1, AO2 + mov f12 = f0 + } + { .mfb + mov BO = BUFFER + mov f14 = f0 + (p6) br.cond.dpnt .L30 + } + ;; + { .mfi + adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1 + mov f9 = f0 + mov ar.ec= 5 + } + { .mmf + adds RPRE2 = (RPREFETCH + 2) * SIZE, AO2 + adds I = -1, M + mov f11 = f0 + } + ;; + { .mmf + adds RPRE3 = (RPREFETCH + 4) * SIZE, AO3 + adds RPRE4 = (RPREFETCH + 6) * SIZE, AO4 + mov f13 = f0 + } + { .mmf + cmp.eq p16, p0 = r0, r0 + shladd A = LDA, 2, A + mov f15 = f0 + } + ;; + { .mmi + lfetch.excl.nt1 [WPRE] + adds PREB = RPREFETCH * SIZE, BO + mov ar.lc = I + } + { .mmi + adds WPRE = 16 * SIZE, CLD1 + cmp.eq p12, p0 = r0, r0 + mov I = 0 + } + ;; + .align 16 + +.L26: + { .mmf + (p12) PREFETCH [RPRE1], 16 * SIZE + (p16) LDFPD f32, f37 = [AO1], 2 * SIZE + (p20) ADD1 f8 = f116, f36, f8 + } + { .mmf + (p16) cmp.eq.unc p13, p0 = 2, I + nop __LINE__ + (p20) ADD2 f9 = f121, f36, f9 + } + ;; + { .mmf + (p12) PREFETCH [PREB], 16 * SIZE + (p16) LDFPD f112, f117 = [BO], 2 * SIZE + (p20) ADD1 f10 = f116, f46, f10 + } + { .mmf + (p16) cmp.eq.unc p14, p0 = 4, I + (p16) cmp.eq.unc p15, p0 = 6, I + (p20) ADD2 f11 = f121, f46, f11 + } + ;; + { .mmf + (p16) LDFPD f42, f47 = [AO2], 2 * SIZE + nop __LINE__ + (p20) ADD1 f12 = f116, f56, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f13 = f121, f56, f13 + } + ;; + { .mmf + (p13) PREFETCH [RPRE2], 16 * SIZE + nop __LINE__ + (p20) ADD1 f14 = f116, f66, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f15 = f121, f66, f15 + } + ;; + { .mmf + (p16) LDFPD f52, f57 = [AO3], 2 * SIZE + nop __LINE__ + (p20) ADD3 f8 = f121, f41, f8 + } + { .mmf + (p16) adds I = 1, I + nop __LINE__ + (p20) ADD4 f9 = f116, f41, f9 + } + ;; + { .mmf + (p14) PREFETCH [RPRE3], 16 * SIZE + nop __LINE__ + (p20) ADD3 f10 = f121, f51, f10 + } + { .mmf + (p16) cmp.eq.unc p15, p0 = 8, I + nop __LINE__ + (p20) ADD4 f11 = f116, f51, f11 + } + ;; + { .mmf + (p16) LDFPD f62, f67 = [AO4], 2 * SIZE + nop __LINE__ + (p20) ADD3 f12 = f121, f61, f12 + } + { .mmf + (p15) mov I = 0 + nop __LINE__ + (p20) ADD4 f13 = f116, f61, f13 + } + ;; + { .mmf + (p15) PREFETCH [RPRE4], 16 * SIZE + nop __LINE__ + (p20) ADD3 f14 = f121, f71, f14 + } + { .mfb + (p16) cmp.eq.unc p12, p0 = 0, I + (p20) ADD4 f15 = f116, f71, f15 + br.ctop.sptk.few .L26 + } + ;; +.L28: + LDFD f32 = [CLD1], SIZE + LDFD f36 = [CLD2], SIZE + shladd CST2 = INCY, 1, CST1 + ;; + LDFD f33 = [CLD1], INCYM1 + LDFD f37 = [CLD2], INCYM1 + ;; + LDFD f34 = [CLD1], SIZE + LDFD f38 = [CLD2], SIZE + ;; + LDFD f35 = [CLD1], INCY3M1 + LDFD f39 = [CLD2], INCY3M1 + ;; + FMA f32 = ALPHA_R, f8, f32 + FMA f36 = ALPHA_R, f12, f36 + FMA f33 = ALPHA_I, f8, f33 + FMA f37 = ALPHA_I, f12, f37 + FMA f34 = ALPHA_R, f10, f34 + FMA f38 = ALPHA_R, f14, f38 + FMA f35 = ALPHA_I, f10, f35 + FMA f39 = ALPHA_I, f14, f39 + ;; + FNMA f32 = ALPHA_I, f9, f32 + FNMA f36 = ALPHA_I, f13, f36 + FMA f33 = ALPHA_R, f9, f33 + FMA f37 = ALPHA_R, f13, f37 + FNMA f34 = ALPHA_I, f11, f34 + FNMA f38 = ALPHA_I, f15, f38 + FMA f35 = ALPHA_R, f11, f35 + FMA f39 = ALPHA_R, f15, f39 + ;; + STFD [CST1] = f32, SIZE + STFD [CST2] = f36, SIZE + ;; + STFD [CST1] = f33 + STFD [CST2] = f37 + add CST1 = CST1, INCYM1 + add CST2 = CST2, INCYM1 + ;; + STFD [CST1] = f34, SIZE + STFD [CST2] = f38, SIZE + ;; + STFD [CST1] = f35 + STFD [CST2] = f39 + add CST1 = CST1, INCY3M1 + add CST2 = CST2, INCY3M1 + ;; + .align 16 + +.L30: + { .mfi + mov AO1 = A + mov f8 = f0 + mov pr.rot= 0 + } + { .mfi + add AO2 = LDA, A + mov f10 = f0 + tbit.z p6, p0 = N, 1 + } + ;; + { .mmf + adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1 + adds RPRE2 = (RPREFETCH + 2) * SIZE, AO2 + mov f12 = f0 + } + { .mfb + adds I = -1, M + mov f14 = f0 + (p6) br.cond.dpnt .L40 + } + ;; + { .mfi + mov BO = BUFFER + mov f9 = f0 + mov ar.ec= 5 + } + { .mmf + cmp.eq p16, p0 = r0, r0 + shladd A = LDA, 1, A + mov f11 = f0 + } + ;; + { .mfi + adds WPRE = 16 * SIZE, CLD1 + mov f13 = f0 + mov ar.lc = I + } + { .mmf + adds PREB = RPREFETCH * SIZE, BO + nop __LINE__ + mov f15 = f0 + } + ;; + { .mmi + lfetch.excl.nt1 [WPRE] + cmp.eq p12, p0 = r0, r0 + mov I = 0 + } + ;; + .align 16 + +.L36: + { .mmf + (p12) PREFETCH [RPRE1], 16 * SIZE + (p16) LDFPD f32, f37 = [AO1], 2 * SIZE + (p20) ADD1 f8 = f116, f36, f8 + } + { .mmf + (p16) cmp.eq.unc p13, p0 = 4, I + (p16) adds I = 1, I + (p20) ADD2 f9 = f121, f36, f9 + } + ;; + { .mmf + (p12) PREFETCH [PREB], 16 * SIZE + (p16) LDFPD f112, f117 = [BO], 2 * SIZE + (p20) ADD1 f10 = f116, f46, f10 + } + { .mmf + (p16) cmp.eq.unc p12, p0 = 8, I + (p20) ADD2 f11 = f121, f46, f11 + } + ;; + { .mmf + (p13) PREFETCH [RPRE2], 16 * SIZE + (p16) LDFPD f42, f47 = [AO2], 2 * SIZE + (p20) ADD3 f12 = f121, f41, f12 + } + { .mmf + (p12) mov I = 0 + (p20) ADD4 f13 = f116, f41, f13 + } + ;; + { .mmf + (p20) ADD3 f14 = f121, f51, f14 + } + { .mfb + nop __LINE__ + (p20) ADD4 f15 = f116, f51, f15 + br.ctop.sptk.few .L36 + } + ;; + +.L38: + LDFD f32 = [CLD1], SIZE + FADD f8 = f8, f12 + shladd CST2 = INCY, 1, CST1 + ;; + LDFD f33 = [CLD1], INCYM1 + FADD f10 = f10, f14 + ;; + LDFD f34 = [CLD1], SIZE + FADD f9 = f9, f13 + ;; + LDFD f35 = [CLD1], INCYM1 + FADD f11 = f11, f15 + ;; + FMA f32 = ALPHA_R, f8, f32 + FMA f33 = ALPHA_I, f8, f33 + FMA f34 = ALPHA_R, f10, f34 + FMA f35 = ALPHA_I, f10, f35 + ;; + FNMA f32 = ALPHA_I, f9, f32 + FMA f33 = ALPHA_R, f9, f33 + FNMA f34 = ALPHA_I, f11, f34 + FMA f35 = ALPHA_R, f11, f35 + ;; + STFD [CST1] = f32, SIZE + ;; + STFD [CST1] = f33 + add CST1 = CST1, INCYM1 + ;; + STFD [CST1] = f34, SIZE + ;; + STFD [CST1] = f35 + add CST1 = CST1, INCYM1 + ;; + .align 16 + + +.L40: + { .mfi + mov AO1 = A + mov f8 = f0 + mov pr.rot= 0 + } + { .mfi + mov f9 = f0 + tbit.z p6, p0 = N, 0 + } + ;; + { .mfi + adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1 + mov f10 = f0 + mov ar.ec= 5 + } + { .mfb + adds I = -1, M + mov f11 = f0 + (p6) br.cond.dpnt .L999 + } + ;; + { .mmi + cmp.eq p16, p0 = r0, r0 + add A = LDA, A + mov ar.lc = I + } + { .mmi + adds WPRE = 16 * SIZE, CLD1 + adds PREB = RPREFETCH * SIZE, BO + mov BO = BUFFER + } + ;; + { .mmi + lfetch.excl.nt1 [WPRE] + cmp.eq p12, p0 = r0, r0 + mov I = 0 + } + ;; + .align 16 + +.L46: + { .mmf + (p12) PREFETCH [RPRE1], 16 * SIZE + (p16) LDFPD f32, f37 = [AO1], 2 * SIZE + (p20) ADD1 f8 = f116, f36, f8 + } + { .mmf + (p16) cmp.eq.unc p12, p0 = 7, I + (p16) adds I = 1, I + (p20) ADD2 f9 = f121, f36, f9 + } + ;; + { .mmf + (p16) LDFPD f112, f117 = [BO], 2 * SIZE + (p20) ADD3 f10 = f121, f41, f10 + } + { .mfb + (p12) mov I = 0 + (p20) ADD4 f11 = f116, f41, f11 + br.ctop.sptk.few .L46 + } + ;; + +.L48: + LDFD f32 = [CLD1], SIZE + FADD f8 = f8, f10 + shladd CST2 = INCY, 1, CST1 + ;; + LDFD f33 = [CLD1], INCYM1 + FADD f9 = f9, f11 + ;; + FMA f32 = ALPHA_R, f8, f32 + FMA f33 = ALPHA_I, f8, f33 + ;; + FNMA f32 = ALPHA_I, f9, f32 + FMA f33 = ALPHA_R, f9, f33 + ;; + STFD [CST1] = f32, SIZE + ;; + STFD [CST1] = f33 + add CST1 = CST1, INCYM1 + br .L999 + .align 16 + ;; + +.L100: + { .mmi + mov CLD1 = Y + shladd CLD2 = INCY, 1, Y + shr J = N, 3 + } + ;; + { .mmb + mov CST1 = Y + cmp.eq p6, p0 = r0, J + (p6) br.cond.dpnt .L120 + } + ;; + .align 16 + +.L111: + { .mfi + mov AO1 = A + mov f8 = f0 + mov pr.rot= 0 + } + { .mfi + add AO2 = LDA, A + mov f10 = f0 + mov BO = BUFFER + } + ;; + { .mmf + shladd AO3 = LDA, 1, A + shladd AO4 = LDA, 1, AO2 + mov f12 = f0 + } + { .mmf + adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1 + adds RPRE2 = (RPREFETCH + 2) * SIZE, AO2 + mov f14 = f0 + } + ;; + { .mmf + shladd AO5 = LDA, 1, AO3 + shladd AO6 = LDA, 1, AO4 + mov f16 = f0 + } + { .mmf + adds RPRE3 = (RPREFETCH + 4) * SIZE, AO3 + adds RPRE4 = (RPREFETCH + 6) * SIZE, AO4 + mov f18 = f0 + } + ;; + { .mmf + shladd AO7 = LDA, 1, AO5 + shladd AO8 = LDA, 1, AO6 + mov f20 = f0 + } + { .mmf + adds RPRE5 = (RPREFETCH + 8) * SIZE, AO5 + adds RPRE6 = (RPREFETCH + 10) * SIZE, AO6 + mov f22 = f0 + } + ;; + { .mfi + shladd A = LDA, 3, A + mov f9 = f0 + mov ar.ec= 5 + } + { .mmf + adds RPRE7 = (RPREFETCH + 12) * SIZE, AO7 + adds RPRE8 = (RPREFETCH + 14) * SIZE, AO8 + mov f11 = f0 + } + ;; + { .mmf + adds WPRE = 16 * SIZE, CLD1 + adds PREB = RPREFETCH * SIZE, BO + mov f13 = f0 + } + { .mmf + adds I = -1, M + cmp.eq p16, p0 = r0, r0 + mov f15 = f0 + } + ;; + { .mfi + cmp.eq p12, p0 = r0, r0 + mov f17 = f0 + mov ar.lc = I + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f19 = f0 + } + ;; + { .mmf + lfetch.excl.nt1 [WPRE] + nop __LINE__ + mov f21 = f0 + } + { .mmf + mov I = 0 + nop __LINE__ + mov f23 = f0 + } + ;; + .align 16 + +.L116: + { .mmf + (p12) PREFETCH [RPRE1], 16 * SIZE + (p16) LDFD f32 = [AO1], 1 * SIZE + (p20) ADD1 f8 = f116, f36, f8 + } + { .mmf + (p16) cmp.eq.unc p13, p0 = 1, I + (p16) cmp.eq.unc p14, p0 = 2, I + (p20) ADD2 f9 = f121, f36, f9 + } + ;; + { .mmf + (p13) PREFETCH [PREB], 16 * SIZE + (p16) LDFPD f112, f117 = [BO], 2 * SIZE + (p20) ADD1 f10 = f116, f46, f10 + } + { .mmf + (p16) LDFD f37 = [AO1], 1 * SIZE + (p16) cmp.eq.unc p15, p0 = 3, I + (p20) ADD2 f11 = f121, f46, f11 + } + ;; + { .mmf + (p13) PREFETCH [RPRE2], 16 * SIZE + (p16) LDFD f42 = [AO2], 1 * SIZE + (p20) ADD1 f12 = f116, f56, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f13 = f121, f56, f13 + } + ;; + { .mmf + (p16) LDFD f47 = [AO2], 1 * SIZE + nop __LINE__ + (p20) ADD1 f14 = f116, f66, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f15 = f121, f66, f15 + } + ;; + { .mmf + (p14) PREFETCH [RPRE3], 16 * SIZE + (p16) LDFD f52 = [AO3], 1 * SIZE + (p20) ADD3 f8 = f121, f41, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD4 f9 = f116, f41, f9 + } + ;; + { .mmf + (p16) LDFD f57 = [AO3], 1 * SIZE + nop __LINE__ + (p20) ADD3 f10 = f121, f51, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD4 f11 = f116, f51, f11 + } + ;; + { .mmf + (p15) PREFETCH [RPRE4], 16 * SIZE + (p16) LDFD f62 = [AO4], 1 * SIZE + (p20) ADD3 f12 = f121, f61, f12 + } + { .mmf + (p16) cmp.eq.unc p12, p0 = 4, I + (p16) cmp.eq.unc p13, p0 = 5, I + (p20) ADD4 f13 = f116, f61, f13 + } + ;; + { .mmf + (p16) LDFD f67 = [AO4], 1 * SIZE + nop __LINE__ + (p20) ADD3 f14 = f121, f71, f14 + } + { .mmf + (p16) cmp.eq.unc p14, p0 = 6, I + (p16) cmp.eq.unc p15, p0 = 7, I + (p20) ADD4 f15 = f116, f71, f15 + } + ;; + { .mmf + (p12) PREFETCH [RPRE5], 16 * SIZE + (p16) LDFD f72 = [AO5], 1 * SIZE + (p20) ADD1 f16 = f116, f76, f16 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f17 = f121, f76, f17 + } + ;; + { .mmf + (p16) LDFD f77 = [AO5], 1 * SIZE + nop __LINE__ + (p20) ADD1 f18 = f116, f86, f18 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f19 = f121, f86, f19 + } + ;; + { .mmf + (p13) PREFETCH [RPRE6], 16 * SIZE + (p16) LDFD f82 = [AO6], 1 * SIZE + (p20) ADD1 f20 = f116, f96, f20 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f21 = f121, f96, f21 + } + ;; + { .mmf + (p16) LDFD f87 = [AO6], 1 * SIZE + nop __LINE__ + (p20) ADD1 f22 = f116, f106, f22 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f23 = f121, f106, f23 + } + ;; + { .mmf + (p14) PREFETCH [RPRE7], 16 * SIZE + (p16) LDFD f92 = [AO7], 1 * SIZE + (p20) ADD3 f16 = f121, f81, f16 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD4 f17 = f116, f81, f17 + } + ;; + { .mmf + (p16) LDFD f97 = [AO7], 1 * SIZE + nop __LINE__ + (p20) ADD3 f18 = f121, f91, f18 + } + { .mmf + nop __LINE__ + (p16) adds I = 1, I + (p20) ADD4 f19 = f116, f91, f19 + } + ;; + { .mmf + (p15) PREFETCH [RPRE8], 16 * SIZE + (p16) LDFD f102 = [AO8], 1 * SIZE + (p20) ADD3 f20 = f121, f101, f20 + } + { .mmf + (p15) mov I = 0 + nop __LINE__ + (p20) ADD4 f21 = f116, f101, f21 + } + ;; + { .mmf + (p16) LDFD f107 = [AO8], 1 * SIZE + nop __LINE__ + (p20) ADD3 f22 = f121, f111, f22 + } + { .mfb + (p16) cmp.eq.unc p12, p0 = 0, I + (p20) ADD4 f23 = f116, f111, f23 + br.ctop.sptk.few .L116 + } + ;; + +.L118: + LDFD f32 = [CLD1], SIZE + LDFD f36 = [CLD2], SIZE + shladd CST2 = INCY, 1, CST1 + ;; + LDFD f33 = [CLD1], INCYM1 + LDFD f37 = [CLD2], INCYM1 + ;; + LDFD f34 = [CLD1], SIZE + LDFD f38 = [CLD2], SIZE + ;; + LDFD f35 = [CLD1], INCY3M1 + LDFD f39 = [CLD2], INCY3M1 + ;; + LDFD f40 = [CLD1], SIZE + LDFD f44 = [CLD2], SIZE + ;; + LDFD f41 = [CLD1], INCYM1 + LDFD f45 = [CLD2], INCYM1 + ;; + LDFD f42 = [CLD1], SIZE + LDFD f46 = [CLD2], SIZE + ;; + LDFD f43 = [CLD1], INCY3M1 + LDFD f47 = [CLD2], INCY3M1 + ;; + FMA f32 = ALPHA_R, f8, f32 + FMA f36 = ALPHA_R, f12, f36 + FMA f33 = ALPHA_I, f8, f33 + FMA f37 = ALPHA_I, f12, f37 + FMA f34 = ALPHA_R, f10, f34 + FMA f38 = ALPHA_R, f14, f38 + FMA f35 = ALPHA_I, f10, f35 + FMA f39 = ALPHA_I, f14, f39 + ;; + FNMA f32 = ALPHA_I, f9, f32 + FNMA f36 = ALPHA_I, f13, f36 + FMA f33 = ALPHA_R, f9, f33 + FMA f37 = ALPHA_R, f13, f37 + FNMA f34 = ALPHA_I, f11, f34 + FNMA f38 = ALPHA_I, f15, f38 + FMA f35 = ALPHA_R, f11, f35 + FMA f39 = ALPHA_R, f15, f39 + ;; + FMA f40 = ALPHA_R, f16, f40 + FMA f44 = ALPHA_R, f20, f44 + FMA f41 = ALPHA_I, f16, f41 + FMA f45 = ALPHA_I, f20, f45 + FMA f42 = ALPHA_R, f18, f42 + FMA f46 = ALPHA_R, f22, f46 + FMA f43 = ALPHA_I, f18, f43 + FMA f47 = ALPHA_I, f22, f47 + ;; + { .mmf + STFD [CST1] = f32, SIZE + STFD [CST2] = f36, SIZE + FNMA f40 = ALPHA_I, f17, f40 + } + { .mmf + nop __LINE__ + nop __LINE__ + FNMA f44 = ALPHA_I, f21, f44 + } + ;; + { .mmf + STFD [CST1] = f33 + STFD [CST2] = f37 + FMA f41 = ALPHA_R, f17, f41 + } + { .mmf + add CST1 = CST1, INCYM1 + add CST2 = CST2, INCYM1 + FMA f45 = ALPHA_R, f21, f45 + } + ;; + { .mmf + STFD [CST1] = f34, SIZE + STFD [CST2] = f38, SIZE + FNMA f42 = ALPHA_I, f19, f42 + } + { .mmf + nop __LINE__ + nop __LINE__ + FNMA f46 = ALPHA_I, f23, f46 + } + ;; + { .mmf + STFD [CST1] = f35 + STFD [CST2] = f39 + FMA f43 = ALPHA_R, f19, f43 + } + { .mmf + add CST1 = CST1, INCY3M1 + add CST2 = CST2, INCY3M1 + FMA f47 = ALPHA_R, f23, f47 + } + ;; + { .mmi + STFD [CST1] = f40, SIZE + STFD [CST2] = f44, SIZE + adds J = -1, J + } + ;; + { .mmi + STFD [CST1] = f41 + STFD [CST2] = f45 + add CST1 = CST1, INCYM1 + } + { .mmi + nop __LINE__ + nop __LINE__ + add CST2 = CST2, INCYM1 + } + ;; + { .mmi + STFD [CST1] = f42, SIZE + STFD [CST2] = f46, SIZE + cmp.lt p6, p0 = 0, J + } + ;; + { .mmi + STFD [CST1] = f43 + STFD [CST2] = f47 + add CST1 = CST1, INCY3M1 + } + { .mmb + add CST2 = CST2, INCY3M1 + (p6) br.cond.dptk .L111 + } + ;; + .align 16 + +.L120: + { .mfi + mov AO1 = A + mov f8 = f0 + mov pr.rot= 0 + } + { .mfi + add AO2 = LDA, A + mov f10 = f0 + tbit.z p6, p0 = N, 2 + } + ;; + { .mmf + shladd AO3 = LDA, 1, A + shladd AO4 = LDA, 1, AO2 + mov f12 = f0 + } + { .mfb + mov BO = BUFFER + mov f14 = f0 + (p6) br.cond.dpnt .L130 + } + ;; + { .mfi + adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1 + mov f9 = f0 + mov ar.ec= 5 + } + { .mmf + adds RPRE2 = (RPREFETCH + 2) * SIZE, AO2 + adds I = -1, M + mov f11 = f0 + } + ;; + { .mmf + adds RPRE3 = (RPREFETCH + 4) * SIZE, AO3 + adds RPRE4 = (RPREFETCH + 6) * SIZE, AO4 + mov f13 = f0 + } + { .mmf + cmp.eq p16, p0 = r0, r0 + shladd A = LDA, 2, A + mov f15 = f0 + } + ;; + { .mmi + lfetch.excl.nt1 [WPRE] + adds PREB = RPREFETCH * SIZE, BO + mov ar.lc = I + } + { .mmi + adds WPRE = 16 * SIZE, CLD1 + cmp.eq p12, p0 = r0, r0 + mov I = 0 + } + ;; + .align 16 + +.L126: + { .mmf + (p12) PREFETCH [RPRE1], 16 * SIZE + (p16) LDFD f32 = [AO1], 1 * SIZE + (p20) ADD1 f8 = f116, f36, f8 + } + { .mmf + (p16) cmp.eq.unc p13, p0 = 2, I + (p16) cmp.eq.unc p14, p0 = 4, I + (p20) ADD2 f9 = f121, f36, f9 + } + ;; + { .mmf + (p12) PREFETCH [PREB], 16 * SIZE + (p16) LDFPD f112, f117 = [BO], 2 * SIZE + (p20) ADD1 f10 = f116, f46, f10 + } + { .mmf + (p16) LDFD f37 = [AO1], 1 * SIZE + (p16) cmp.eq.unc p15, p0 = 6, I + (p20) ADD2 f11 = f121, f46, f11 + } + ;; + { .mmf + (p16) LDFD f42 = [AO2], 1 * SIZE + nop __LINE__ + (p20) ADD1 f12 = f116, f56, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f13 = f121, f56, f13 + } + ;; + { .mmf + (p13) PREFETCH [RPRE2], 16 * SIZE + (p16) LDFD f47 = [AO2], 1 * SIZE + (p20) ADD1 f14 = f116, f66, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f15 = f121, f66, f15 + } + ;; + { .mmf + (p16) LDFD f52 = [AO3], 1 * SIZE + nop __LINE__ + (p20) ADD3 f8 = f121, f41, f8 + } + { .mmf + nop __LINE__ + (p16) adds I = 1, I + (p20) ADD4 f9 = f116, f41, f9 + } + ;; + { .mmf + (p14) PREFETCH [RPRE3], 16 * SIZE + (p16) LDFD f57 = [AO3], 1 * SIZE + (p20) ADD3 f10 = f121, f51, f10 + } + { .mmf + nop __LINE__ + (p16) cmp.eq.unc p15, p0 = 8, I + (p20) ADD4 f11 = f116, f51, f11 + } + ;; + { .mmf + (p16) LDFD f62 = [AO4], 1 * SIZE + nop __LINE__ + (p20) ADD3 f12 = f121, f61, f12 + } + { .mmf + (p15) mov I = 0 + nop __LINE__ + (p20) ADD4 f13 = f116, f61, f13 + } + ;; + { .mmf + (p15) PREFETCH [RPRE4], 16 * SIZE + (p16) LDFD f67 = [AO4], 1 * SIZE + (p20) ADD3 f14 = f121, f71, f14 + } + { .mfb + (p16) cmp.eq.unc p12, p0 = 0, I + (p20) ADD4 f15 = f116, f71, f15 + br.ctop.sptk.few .L126 + } + ;; +.L128: + LDFD f32 = [CLD1], SIZE + LDFD f36 = [CLD2], SIZE + shladd CST2 = INCY, 1, CST1 + ;; + LDFD f33 = [CLD1], INCYM1 + LDFD f37 = [CLD2], INCYM1 + ;; + LDFD f34 = [CLD1], SIZE + LDFD f38 = [CLD2], SIZE + ;; + LDFD f35 = [CLD1], INCY3M1 + LDFD f39 = [CLD2], INCY3M1 + ;; + FMA f32 = ALPHA_R, f8, f32 + FMA f36 = ALPHA_R, f12, f36 + FMA f33 = ALPHA_I, f8, f33 + FMA f37 = ALPHA_I, f12, f37 + FMA f34 = ALPHA_R, f10, f34 + FMA f38 = ALPHA_R, f14, f38 + FMA f35 = ALPHA_I, f10, f35 + FMA f39 = ALPHA_I, f14, f39 + ;; + FNMA f32 = ALPHA_I, f9, f32 + FNMA f36 = ALPHA_I, f13, f36 + FMA f33 = ALPHA_R, f9, f33 + FMA f37 = ALPHA_R, f13, f37 + FNMA f34 = ALPHA_I, f11, f34 + FNMA f38 = ALPHA_I, f15, f38 + FMA f35 = ALPHA_R, f11, f35 + FMA f39 = ALPHA_R, f15, f39 + ;; + STFD [CST1] = f32, SIZE + STFD [CST2] = f36, SIZE + ;; + STFD [CST1] = f33 + STFD [CST2] = f37 + add CST1 = CST1, INCYM1 + add CST2 = CST2, INCYM1 + ;; + STFD [CST1] = f34, SIZE + STFD [CST2] = f38, SIZE + ;; + STFD [CST1] = f35 + STFD [CST2] = f39 + add CST1 = CST1, INCY3M1 + add CST2 = CST2, INCY3M1 + ;; + .align 16 + +.L130: + { .mfi + mov AO1 = A + mov f8 = f0 + mov pr.rot= 0 + } + { .mfi + add AO2 = LDA, A + mov f10 = f0 + tbit.z p6, p0 = N, 1 + } + ;; + { .mmf + adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1 + adds RPRE2 = (RPREFETCH + 2) * SIZE, AO2 + mov f12 = f0 + } + { .mfb + adds I = -1, M + mov f14 = f0 + (p6) br.cond.dpnt .L140 + } + ;; + { .mfi + mov BO = BUFFER + mov f9 = f0 + mov ar.ec= 5 + } + { .mmf + cmp.eq p16, p0 = r0, r0 + shladd A = LDA, 1, A + mov f11 = f0 + } + ;; + { .mfi + adds WPRE = 16 * SIZE, CLD1 + mov f13 = f0 + mov ar.lc = I + } + { .mmf + adds PREB = RPREFETCH * SIZE, BO + nop __LINE__ + mov f15 = f0 + } + ;; + { .mmi + lfetch.excl.nt1 [WPRE] + cmp.eq p12, p0 = r0, r0 + mov I = 0 + } + ;; + .align 16 + +.L136: + { .mmf + (p12) PREFETCH [RPRE1], 16 * SIZE + (p16) LDFD f32 = [AO1], 1 * SIZE + (p20) ADD1 f8 = f116, f36, f8 + } + { .mmf + (p16) cmp.eq.unc p13, p0 = 4, I + (p16) adds I = 1, I + (p20) ADD2 f9 = f121, f36, f9 + } + ;; + { .mmf + (p12) PREFETCH [PREB], 16 * SIZE + (p16) LDFPD f112, f117 = [BO], 2 * SIZE + (p20) ADD1 f10 = f116, f46, f10 + } + { .mmf + (p16) LDFD f37 = [AO1], 1 * SIZE + (p16) cmp.eq.unc p12, p0 = 8, I + (p20) ADD2 f11 = f121, f46, f11 + } + ;; + { .mmf + (p13) PREFETCH [RPRE2], 16 * SIZE + (p16) LDFD f42 = [AO2], 1 * SIZE + (p20) ADD3 f12 = f121, f41, f12 + } + { .mmf + (p12) mov I = 0 + nop __LINE__ + (p20) ADD4 f13 = f116, f41, f13 + } + ;; + { .mmf + (p16) LDFD f47 = [AO2], 1 * SIZE + nop __LINE__ + (p20) ADD3 f14 = f121, f51, f14 + } + { .mfb + nop __LINE__ + (p20) ADD4 f15 = f116, f51, f15 + br.ctop.sptk.few .L136 + } + ;; + +.L138: + LDFD f32 = [CLD1], SIZE + FADD f8 = f8, f12 + shladd CST2 = INCY, 1, CST1 + ;; + LDFD f33 = [CLD1], INCYM1 + FADD f10 = f10, f14 + ;; + LDFD f34 = [CLD1], SIZE + FADD f9 = f9, f13 + ;; + LDFD f35 = [CLD1], INCYM1 + FADD f11 = f11, f15 + ;; + FMA f32 = ALPHA_R, f8, f32 + FMA f33 = ALPHA_I, f8, f33 + FMA f34 = ALPHA_R, f10, f34 + FMA f35 = ALPHA_I, f10, f35 + ;; + FNMA f32 = ALPHA_I, f9, f32 + FMA f33 = ALPHA_R, f9, f33 + FNMA f34 = ALPHA_I, f11, f34 + FMA f35 = ALPHA_R, f11, f35 + ;; + STFD [CST1] = f32, SIZE + ;; + STFD [CST1] = f33 + add CST1 = CST1, INCYM1 + ;; + STFD [CST1] = f34, SIZE + ;; + STFD [CST1] = f35 + add CST1 = CST1, INCYM1 + ;; + .align 16 + + +.L140: + { .mfi + mov AO1 = A + mov f8 = f0 + mov pr.rot= 0 + } + { .mfi + mov f9 = f0 + tbit.z p6, p0 = N, 0 + } + ;; + { .mfi + adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1 + mov f10 = f0 + mov ar.ec= 5 + } + { .mfb + adds I = -1, M + mov f11 = f0 + (p6) br.cond.dpnt .L999 + } + ;; + { .mmi + cmp.eq p16, p0 = r0, r0 + shladd A = LDA, 1, A + mov ar.lc = I + } + { .mmi + adds WPRE = 16 * SIZE, CLD1 + adds PREB = RPREFETCH * SIZE, BO + mov BO = BUFFER + } + ;; + { .mmi + lfetch.excl.nt1 [WPRE] + cmp.eq p12, p0 = r0, r0 + mov I = 0 + } + ;; + .align 16 + +.L146: + { .mmf + (p12) PREFETCH [RPRE1], 16 * SIZE + (p16) LDFD f32 = [AO1], 1 * SIZE + (p20) ADD1 f8 = f116, f36, f8 + } + { .mmf + (p16) cmp.eq.unc p12, p0 = 7, I + (p16) adds I = 1, I + (p20) ADD2 f9 = f121, f36, f9 + } + ;; + { .mmf + (p16) LDFPD f112, f117 = [BO], 2 * SIZE + (p16) LDFD f37 = [AO1], 1 * SIZE + (p20) ADD3 f10 = f121, f41, f10 + } + { .mfb + (p12) mov I = 0 + (p20) ADD4 f11 = f116, f41, f11 + br.ctop.sptk.few .L146 + } + ;; + +.L148: + LDFD f32 = [CLD1], SIZE + FADD f8 = f8, f10 + shladd CST2 = INCY, 1, CST1 + ;; + LDFD f33 = [CLD1], INCYM1 + FADD f9 = f9, f11 + ;; + FMA f32 = ALPHA_R, f8, f32 + FMA f33 = ALPHA_I, f8, f33 + ;; + FNMA f32 = ALPHA_I, f9, f32 + FMA f33 = ALPHA_R, f9, f33 + ;; + STFD [CST1] = f32, SIZE + ;; + STFD [CST1] = f33 + add CST1 = CST1, INCYM1 + ;; + .align 16 + +.L999: + mov r8 = r0 + adds r9 = 1 * 16, SP + ;; + ldf.fill f16 = [SP], 32 + ldf.fill f17 = [r9], 32 + mov ar.lc = ARLC + ;; + ldf.fill f18 = [SP], 32 + ldf.fill f19 = [r9], 32 + mov pr = PR, -1 + ;; + ldf.fill f20 = [SP], 32 + ldf.fill f21 = [r9], 32 + mov ar.pfs = ARPFS + ;; + ldf.fill f22 = [SP], 32 + ldf.fill f23 = [r9] + br.ret.sptk.many b0 + ;; + EPILOGUE diff --git a/kernel/ia64/zrot.S b/kernel/ia64/zrot.S new file mode 100644 index 0000000..f133a74 --- /dev/null +++ b/kernel/ia64/zrot.S @@ -0,0 +1,879 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef XDOUBLE +#define PREFETCH_SIZE ( 8 * 8 + 4) +#elif defined(DOUBLE) +#define PREFETCH_SIZE (16 * 8 + 8) +#else +#define PREFETCH_SIZE (32 * 8 + 16) +#endif + +#define N r32 +#define X1 r33 +#define INCX r34 +#define Y1 r35 +#define INCY r36 + +#define PREX r2 +#define PREY r3 + +#define I r14 +#define J r15 +#define Y2 r16 +#define X2 r17 + +#define INCX16 r18 +#define INCY16 r19 + +#define PR r30 +#define ARLC r31 + +#define C f8 +#define S f9 + + PROLOGUE + .prologue + PROFCODE + { .mmi + adds r29 = 16, r12 + add INCX = INCX, INCX + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mib + cmp.lt p0, p6 = r0, N + shr I = N, 3 + (p6) br.ret.spnt.many b0 + } + ;; + .body + { .mmi +#ifdef XDOUBLE + LDFD S = [r29] +#else + nop __LINE__ +#endif + add INCY = INCY, INCY + mov PR = pr + } + { .mmi + mov X2 = X1 + mov Y2 = Y1 + mov pr.rot= 0 + } + ;; + { .mmi + shladd INCX = INCX, BASE_SHIFT, r0 + shladd INCY = INCY, BASE_SHIFT, r0 + mov ar.ec= 3 + } + { .mmi + adds I = -1, I + cmp.eq p16, p0 = r0, r0 + and J = 7, N + } + ;; + { .mmi +#ifndef XDOUBLE + shladd INCX16 = INCX, 3, r0 + shladd INCY16 = INCY, 3, r0 +#else + shladd INCX16 = INCX, 2, r0 + shladd INCY16 = INCY, 2, r0 +#endif + nop __LINE__ + } + { .mmi + adds INCX = -SIZE, INCX + adds INCY = -SIZE, INCY + nop __LINE__ + } + ;; + { .mmi + adds PREX = PREFETCH_SIZE * SIZE, X1 + adds PREY = PREFETCH_SIZE * SIZE, Y1 + mov ar.lc = I + } + { .mib + cmp.eq p6 ,p0 = -1, I + tbit.z p0, p12 = N, 2 + (p6) br.cond.dpnt .L15 + } + ;; + .align 32 + +.L12: + { .mmf + (p19) STFD [Y2] = f15 + (p16) lfetch.excl.nt1 [PREX], INCX16 + (p18) FMPY f15 = C, f91 + } + { .mmf + (p16) LDFD f32 = [X1], SIZE + (p19) add Y2 = Y2, INCY + (p18) FNMA f11 = S, f37, f11 + } + ;; + { .mmf + (p18) STFD [X2] = f6 + (p16) lfetch.excl.nt1 [PREY], INCY16 + (p18) FMA f12 = C, f40, f12 + } + { .mmf + (p17) LDFD f114 = [Y1], INCY + (p18) adds X2 = SIZE, X2 + (p18) FMPY f6 = S, f94 + } + ;; + { .mmf + (p18) STFD [Y2] = f7 + (p16) LDFD f35 = [X1], INCX + (p18) FNMA f13 = S, f40, f13 + } + { .mmf + nop __LINE__ + (p18) adds Y2 = SIZE, Y2 + (p18) FMPY f7 = C, f94 + } + ;; + { .mmf + (p18) STFD [X2] = f10 + (p17) LDFD f117 = [Y1], SIZE + (p18) FMA f14 = C, f43, f14 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMPY f10 = S, f97 + } + ;; + { .mmf + (p18) STFD [Y2] = f11 + (p16) LDFD f38 = [X1], SIZE + (p18) FNMA f15 = S, f43, f15 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FMPY f11 = C, f97 + } + ;; + { .mmf + (p18) STFD [X2] = f12 + (p17) LDFD f120 = [Y1], INCY + (p18) FMPY f12 = S, f100 + } + { .mmf + (p18) adds X2 = SIZE, X2 + nop __LINE__ + (p18) FMA f6 = C, f46, f6 + } + ;; + { .mmf + (p18) STFD [Y2] = f13 + (p16) LDFD f41 = [X1], INCX + (p18) FMPY f13 = C, f100 + } + { .mmf + (p18) adds Y2 = SIZE, Y2 + nop __LINE__ + (p18) FNMA f7 = S, f46, f7 + } + ;; + { .mmf + (p18) STFD [X2] = f14 + (p17) LDFD f123 = [Y1], SIZE + (p18) FMPY f14 = S, f103 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMA f10 = C, f49, f10 + } + ;; + { .mmf + (p18) STFD [Y2] = f15 + (p16) LDFD f44 = [X1], SIZE + (p18) FMPY f15 = C, f103 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FNMA f11 = S, f49, f11 + } + ;; + { .mmf + (p18) STFD [X2] = f6 + (p17) LDFD f126 = [Y1], INCY + (p18) FMA f12 = C, f52, f12 + } + { .mmf + (p18) adds X2 = SIZE, X2 + nop __LINE__ + (p18) FMPY f6 = S, f106 + } + ;; + { .mmf + (p18) STFD [Y2] = f7 + (p16) LDFD f47 = [X1], INCX + (p18) FNMA f13 = S, f52, f13 + } + { .mmf + (p18) adds Y2 = SIZE, Y2 + nop __LINE__ + (p18) FMPY f7 = C, f106 + } + ;; + { .mmf + (p18) STFD [X2] = f10 + (p16) LDFD f80 = [Y1], SIZE + (p18) FMA f14 = C, f55, f14 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMPY f10 = S, f109 + } + ;; + { .mmf + (p18) STFD [Y2] = f11 + (p16) LDFD f50 = [X1], SIZE + (p18) FNMA f15 = S, f55, f15 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FMPY f11 = C, f109 + } + ;; + { .mmf + (p18) STFD [X2] = f12 + (p16) LDFD f83 = [Y1], INCY + (p18) FMPY f12 = S, f112 + } + { .mmf + (p18) adds X2 = SIZE, X2 + nop __LINE__ + (p18) FMA f6 = C, f58, f6 + } + ;; + { .mmf + (p18) STFD [Y2] = f13 + (p16) LDFD f53 = [X1], INCX + (p18) FMPY f13 = C, f112 + } + { .mmf + (p18) adds Y2 = SIZE, Y2 + nop __LINE__ + (p18) FNMA f7 = S, f58, f7 + } + ;; + { .mmf + (p18) STFD [X2] = f14 + (p16) LDFD f86 = [Y1], SIZE + (p18) FMPY f14 = S, f115 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMA f10 = C, f61, f10 + } + ;; + { .mmf + (p18) STFD [Y2] = f15 + (p16) LDFD f56 = [X1], SIZE + (p18) FMPY f15 = C, f115 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FNMA f11 = S, f61, f11 + } + ;; +#ifndef XDOUBLE + { .mmf + (p18) STFD [X2] = f6 + (p16) LDFD f89 = [Y1], INCY + (p18) FMA f12 = C, f64, f12 + } + { .mmf + (p18) adds X2 = SIZE, X2 + nop __LINE__ + (p18) FMPY f6 = S, f118 + } + ;; + { .mmf + (p18) STFD [Y2] = f7 + (p16) LDFD f59 = [X1], INCX + (p18) FNMA f13 = S, f64, f13 + } + { .mmf + (p18) adds Y2 = SIZE, Y2 + nop __LINE__ + (p18) FMPY f7 = C, f118 + } + ;; +#else + { .mmf + (p18) STFD [X2] = f6 + (p16) lfetch.excl.nt1 [PREY], INCY16 + (p18) FMA f12 = C, f64, f12 + } + { .mmf + (p16) LDFD f89 = [Y1], INCY + (p18) adds X2 = SIZE, X2 + (p18) FMPY f6 = S, f118 + } + ;; + { .mmf + (p18) STFD [Y2] = f7 + (p16) lfetch.excl.nt1 [PREX], INCX16 + (p18) FNMA f13 = S, f64, f13 + } + { .mmf + (p16) LDFD f59 = [X1], INCX + (p18) adds Y2 = SIZE, Y2 + (p18) FMPY f7 = C, f118 + } + ;; +#endif + { .mmf + (p18) STFD [X2] = f10 + (p16) LDFD f92 = [Y1], SIZE + (p18) FMA f14 = C, f67, f14 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMPY f10 = S, f121 + } + ;; + { .mmf + (p18) STFD [Y2] = f11 + (p16) LDFD f62 = [X1], SIZE + (p18) FNMA f15 = S, f67, f15 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FMPY f11 = C, f121 + } + ;; + { .mmf + (p18) STFD [X2] = f12 + (p16) LDFD f95 = [Y1], INCY + (p18) FMPY f12 = S, f124 + } + { .mmf + (p18) adds X2 = SIZE, X2 + nop __LINE__ + (p18) FMA f6 = C, f70, f6 + } + ;; + { .mmf + (p18) STFD [Y2] = f13 + (p16) LDFD f65 = [X1], INCX + (p18) FMPY f13 = C, f124 + } + { .mmf + (p18) adds Y2 = SIZE, Y2 + nop __LINE__ + (p18) FNMA f7 = S, f70, f7 + } + ;; + { .mmf + (p18) STFD [X2] = f14 + (p16) LDFD f98 = [Y1], SIZE + (p18) FMPY f14 = S, f127 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMA f10 = C, f73, f10 + } + ;; + { .mmf + (p18) STFD [Y2] = f15 + (p16) LDFD f68 = [X1], SIZE + (p18) FMPY f15 = C, f127 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FNMA f11 = S, f73, f11 + } + ;; + { .mmf + (p18) STFD [X2] = f6 + (p16) LDFD f101 = [Y1], INCY + (p18) FMA f12 = C, f76, f12 + } + { .mmf + (p18) adds X2 = SIZE, X2 + nop __LINE__ + (p17) FMPY f6 = S, f81 + } + ;; + { .mmf + (p18) STFD [Y2] = f7 + (p16) LDFD f71 = [X1], INCX + (p18) FNMA f13 = S, f76, f13 + } + { .mmf + (p18) adds Y2 = SIZE, Y2 + nop __LINE__ + (p17) FMPY f7 = C, f81 + } + ;; + { .mmf + (p18) STFD [X2] = f10 + (p16) LDFD f104 = [Y1], SIZE + (p18) FMA f14 = C, f79, f14 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p17) FMPY f10 = S, f84 + } + ;; + { .mmf + (p18) STFD [Y2] = f11 + (p16) LDFD f74 = [X1], SIZE + (p18) FNMA f15 = S, f79, f15 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p17) FMPY f11 = C, f84 + } + ;; + { .mmf + (p18) STFD [X2] = f12 + (p16) LDFD f107 = [Y1], INCY + (p17) FMPY f12 = S, f87 + } + { .mmf + (p18) adds X2 = SIZE, X2 + nop __LINE__ + (p17) FMA f6 = C, f33, f6 + } + ;; + { .mmf + (p18) STFD [Y2] = f13 + (p16) LDFD f77 = [X1], INCX + (p17) FMPY f13 = C, f87 + } + { .mmf + (p18) adds Y2 = SIZE, Y2 + nop __LINE__ + (p17) FNMA f7 = S, f33, f7 + } + ;; + { .mmf + (p18) STFD [X2] = f14 + (p16) LDFD f110 = [Y1], SIZE + (p17) FMPY f14 = S, f90 + } + { .mfb + (p18) add X2 = X2, INCX + (p17) FMA f10 = C, f36, f10 + br.ctop.sptk.few .L12 + } + ;; + { .mmi + (p19) STFD [Y2] = f15 + (p19) add Y2 = Y2, INCY + nop __LINE__ + } + { .mmi + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + ;; + .align 32 + +.L15: + { .mmi + (p12) LDFD f40 = [Y1], SIZE + (p12) LDFD f32 = [X1], SIZE + mov ar.lc = ARLC + } + ;; + { .mmi + (p12) LDFD f41 = [Y1], INCY + (p12) LDFD f33 = [X1], INCX + mov pr = PR, -65474 + } + ;; + { .mmb + (p12) LDFD f42 = [Y1], SIZE + cmp.eq p7, p0 = r0, J + (p7) br.ret.sptk.many b0 + } + ;; + { .mmf + (p12) LDFD f43 = [Y1], INCY + nop __LINE__ + (p12) FMPY f6 = S, f40 + } + ;; + { .mmf + (p12) LDFD f34 = [X1], SIZE + nop __LINE__ + (p12) FMPY f7 = C, f40 + } + ;; + { .mmf + (p12) LDFD f44 = [Y1], SIZE + nop __LINE__ + (p12) FMPY f10 = S, f41 + } + ;; + { .mmf + (p12) LDFD f35 = [X1], INCX + nop __LINE__ + (p12) FMPY f11 = C, f41 + } + ;; + { .mmf + (p12) LDFD f45 = [Y1], INCY + nop __LINE__ + (p12) FMPY f12 = S, f42 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMA f6 = C, f32, f6 + } + ;; + { .mmf + (p12) LDFD f36 = [X1], SIZE + nop __LINE__ + (p12) FMPY f13 = C, f42 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FNMA f7 = S, f32, f7 + } + ;; + { .mmf + (p12) LDFD f46 = [Y1], SIZE + nop __LINE__ + (p12) FMPY f14 = S, f43 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMA f10 = C, f33, f10 + } + ;; + { .mmf + (p12) LDFD f37 = [X1], INCX + nop __LINE__ + (p12) FMPY f15 = C, f43 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FNMA f11 = S, f33, f11 + } + ;; + { .mmf + (p12) STFD [X2] = f6, SIZE + (p12) LDFD f47 = [Y1], INCY + (p12) FMA f12 = C, f34, f12 + } + { .mfi + nop __LINE__ + (p12) FMPY f6 = S, f44 + tbit.z p0, p13 = N, 1 + } + ;; + { .mmf + (p12) STFD [Y2] = f7, SIZE + (p12) LDFD f38 = [X1], SIZE + (p12) FNMA f13 = S, f34, f13 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMPY f7 = C, f44 + } + ;; + { .mmf + (p12) STFD [X2] = f10 + (p13) LDFD f52 = [Y1], SIZE + (p12) FMA f14 = C, f35, f14 + } + { .mmf + (p12) add X2 = X2, INCX + nop __LINE__ + (p12) FMPY f10 = S, f45 + } + ;; + { .mmf + (p12) STFD [Y2] = f11 + (p12) LDFD f39 = [X1], INCX + (p12) FNMA f15 = S, f35, f15 + } + { .mmf + (p12) add Y2 = Y2, INCY + nop __LINE__ + (p12) FMPY f11 = C, f45 + } + ;; + { .mmf + (p12) STFD [X2] = f12, SIZE + (p13) LDFD f53 = [Y1], INCY + (p12) FMPY f12 = S, f46 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMA f6 = C, f36, f6 + } + ;; + { .mmf + (p12) STFD [Y2] = f13, SIZE + (p13) LDFD f48 = [X1], SIZE + (p12) FMPY f13 = C, f46 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FNMA f7 = S, f36, f7 + } + ;; + { .mmf + (p12) STFD [X2] = f14 + (p13) LDFD f54 = [Y1], SIZE + (p12) FMPY f14 = S, f47 + } + { .mmf + (p12) add X2 = X2, INCX + nop __LINE__ + (p12) FMA f10 = C, f37, f10 + } + ;; + { .mmf + (p12) STFD [Y2] = f15 + (p13) LDFD f49 = [X1], INCX + (p12) FMPY f15 = C, f47 + } + { .mfi + (p12) add Y2 = Y2, INCY + (p12) FNMA f11 = S, f37, f11 + tbit.z p0, p14 = N, 0 + } + ;; + { .mmf + (p12) STFD [X2] = f6, SIZE + (p13) LDFD f55 = [Y1], INCY + (p12) FMA f12 = C, f38, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMPY f6 = S, f52 + } + ;; + { .mmf + (p12) STFD [Y2] = f7, SIZE + (p13) LDFD f50 = [X1], SIZE + (p12) FNMA f13 = S, f38, f13 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMPY f7 = C, f52 + } + ;; + { .mmf + (p12) STFD [X2] = f10 + (p14) LDFD f58 = [Y1], SIZE + (p12) FMA f14 = C, f39, f14 + } + { .mmf + (p12) add X2 = X2, INCX + nop __LINE__ + (p13) FMPY f10 = S, f53 + } + ;; + { .mmf + (p12) STFD [Y2] = f11 + (p13) LDFD f51 = [X1], INCX + (p12) FNMA f15 = S, f39, f15 + } + { .mmf + (p12) add Y2 = Y2, INCY + nop __LINE__ + (p13) FMPY f11 = C, f53 + } + ;; + { .mmf + (p12) STFD [X2] = f12, SIZE + (p14) LDFD f59 = [Y1], INCY + (p13) FMPY f12 = S, f54 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f6 = C, f48, f6 + } + ;; + { .mmf + (p12) STFD [Y2] = f13, SIZE + (p14) LDFD f56 = [X1], SIZE + (p13) FMPY f13 = C, f54 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FNMA f7 = S, f48, f7 + } + ;; + { .mmf + (p12) STFD [X2] = f14 + (p12) add X2 = X2, INCX + (p13) FMPY f14 = S, f55 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f10 = C, f49, f10 + } + ;; + { .mmf + (p12) STFD [Y2] = f15 + (p14) LDFD f57 = [X1], INCX + (p13) FMPY f15 = C, f55 + } + { .mmf + (p12) add Y2 = Y2, INCY + nop __LINE__ + (p13) FNMA f11 = S, f49, f11 + } + ;; + { .mmf + (p13) STFD [X2] = f6, SIZE + nop __LINE__ + (p13) FMA f12 = C, f50, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p14) FMPY f6 = S, f58 + } + ;; + { .mmf + (p13) STFD [Y2] = f7, SIZE + nop __LINE__ + (p13) FNMA f13 = S, f50, f13 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p14) FMPY f7 = C, f58 + } + ;; + { .mmf + (p13) STFD [X2] = f10 + (p13) add X2 = X2, INCX + (p13) FMA f14 = C, f51, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p14) FMPY f10 = S, f59 + } + ;; + { .mmf + (p13) STFD [Y2] = f11 + (p13) add Y2 = Y2, INCY + (p13) FNMA f15 = S, f51, f15 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p14) FMPY f11 = C, f59 + } + ;; + { .mmf + (p13) STFD [X2] = f12, SIZE + nop __LINE__ + (p14) FMA f6 = C, f56, f6 + } + ;; + { .mmf + (p13) STFD [Y2] = f13, SIZE + nop __LINE__ + (p14) FNMA f7 = S, f56, f7 + } + ;; + { .mmf + (p13) STFD [X2] = f14 + (p13) add X2 = X2, INCX + (p14) FMA f10 = C, f57, f10 + } + ;; + { .mmf + (p13) STFD [Y2] = f15 + (p13) add Y2 = Y2, INCY + (p14) FNMA f11 = S, f57, f11 + } + ;; + { .mmi + (p14) STFD [X2] = f6, SIZE + (p14) STFD [Y2] = f7, SIZE + nop __LINE__ + } + ;; + { .mmb + (p14) STFD [X2] = f10 + (p14) STFD [Y2] = f11 + br.ret.sptk.many b0 + } + ;; + EPILOGUE + diff --git a/kernel/ia64/zscal.S b/kernel/ia64/zscal.S new file mode 100644 index 0000000..e97feda --- /dev/null +++ b/kernel/ia64/zscal.S @@ -0,0 +1,540 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef XDOUBLE +#define PREFETCH_SIZE ( 8 * 16) +#elif defined(DOUBLE) +#define PREFETCH_SIZE (16 * 16) +#else +#define PREFETCH_SIZE (32 * 16) +#endif + +#define SP r12 + +#ifdef XDOUBLE +#define N r32 +#define X1 r14 +#define INCX r15 +#else +#define N r32 +#define X1 r37 +#define INCX r38 +#endif + +#define X2 r16 +#define Y1 r17 +#define INCX3 r18 +#define PRE r19 +#define INCX8 r20 +#define I r29 +#define J r28 + +#define PR r30 +#define ARLC r31 + +#define ALPHA_R f8 +#define ALPHA_I f9 + + PROLOGUE + .prologue + PROFCODE + {.mmi + adds r22 = 16, SP + adds r23 = 24, SP + mov PR = pr + } + { .mib + cmp.ge p7, p0 = 0, N + shr I = N, 3 + (p7) br.ret.sptk.many b0 + } + ;; +#ifdef XDOUBLE + { .mmi + ld8 X1 = [r22] + ld8 INCX = [r23] + nop __LINE__ + } + ;; +#endif + { .mfi + and J = 7, N + fcmp.eq p0, p11 = ALPHA_I, f0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mfi + adds I = -1, I + fcmp.eq p0, p10 = ALPHA_R, f0 + shl INCX = INCX, ZBASE_SHIFT + } + ;; + .body + { .mmi + shladd INCX8 = INCX, 3, r0 + shladd X2 = INCX, 1, X1 + mov pr.rot= 0 + } + { .mmi + shladd INCX3 = INCX, 1, INCX + adds PRE = PREFETCH_SIZE * SIZE, X1 + mov Y1 = X1 + } + ;; + { .mmi + cmp.gt p8, p0 = 0, I + cmp.ge p9, p0 = 0, J + mov ar.lc = I + } + { .mmi + adds INCX = -1 * SIZE, INCX + adds INCX3 = -1 * SIZE, INCX3 + tbit.z p0, p13 = N, 2 + } + ;; + { .bbb + (p10) br.cond.dptk .L100 + (p11) br.cond.dptk .L100 + (p8) br.cond.dpnt .L20 + } + ;; + .align 32 + +.L10: + { .mmb + STFD [X1] = f0, 1 * SIZE + STFD [X2] = f0, 1 * SIZE + nop.b 0 + } + { .mmb + lfetch.excl.nt1 [PRE], INCX8 + nop.m 0 + } + ;; + { .mmb + STFD [X1] = f0 + add X1 = INCX, X1 + } + { .mmb + STFD [X2] = f0 + add X2 = INCX, X2 + } + ;; + { .mmb + STFD [X1] = f0, 1 * SIZE + STFD [X2] = f0, 1 * SIZE + nop.b 0 + } + ;; + { .mmb + STFD [X1] = f0 + add X1 = INCX3, X1 + } + { .mmb + STFD [X2] = f0 + add X2 = INCX3, X2 + } + ;; + { .mmb + STFD [X1] = f0, 1 * SIZE + STFD [X2] = f0, 1 * SIZE + nop.b 0 + } + ;; + { .mmb + STFD [X1] = f0 + add X1 = INCX, X1 + } + { .mmb + STFD [X2] = f0 + add X2 = INCX, X2 + } + ;; + { .mmb + STFD [X1] = f0, 1 * SIZE + STFD [X2] = f0, 1 * SIZE + nop.b 0 + } + ;; + { .mmb + STFD [X1] = f0 + add X1 = INCX3, X1 + } + { .mmb + STFD [X2] = f0 + add X2 = INCX3, X2 + br.cloop.sptk.few .L10 + } + ;; + .align 32 + +.L20: + { .mmi + (p13) STFD [X1] = f0, 1 * SIZE + (p13) STFD [X2] = f0, 1 * SIZE + mov ar.lc = ARLC + } + ;; + { .mmi + (p13) STFD [X1] = f0 + (p13) add X1 = INCX, X1 + tbit.z p0, p14 = N, 1 + } + { .mmi + (p13) STFD [X2] = f0 + (p13) add X2 = INCX, X2 + tbit.z p0, p15 = N, 0 + } + ;; + { .mmb + (p13) STFD [X1] = f0, 1 * SIZE + (p13) STFD [X2] = f0, 1 * SIZE + nop.b 0 + } + { .mib + nop.m 0 + mov pr = PR, -65474 + (p9) br.ret.sptk.many b0 + } + ;; + { .mmb + (p13) STFD [X1] = f0 + (p13) add X1 = INCX3, X1 + } + { .mmb + (p13) STFD [X2] = f0 + (p13) add X2 = INCX3, X2 + } + ;; + (p14) STFD [X1] = f0, 1 * SIZE + ;; + { .mmb + (p14) STFD [X1] = f0 + (p14) add X1 = INCX, X1 + } + ;; + (p14) STFD [X1] = f0, 1 * SIZE + ;; + { .mmb + (p14) STFD [X1] = f0 + (p14) add X1 = INCX, X1 + } + ;; + (p15) STFD [X1] = f0, 1 * SIZE + ;; + { .mib + (p15) STFD [X1] = f0 + mov pr = PR, -65474 + br.ret.sptk.many b0 + } + ;; + .align 32 + +.L100: + cmp.eq p16, p0 = r0, r0 + mov.i ar.ec = 6 + (p8) br.cond.dpnt .L170 + ;; + .align 32 + +.L160: + { .mmf + (p21) STFD [X1] = f6, 1 * SIZE + (p16) lfetch.excl.nt1 [PRE], INCX8 + (p21) FMS f12 = ALPHA_R, f85, f12 + } + { .mfb + (p16) LDFD f32 = [Y1], 1 * SIZE + (p20) FMPY f6 = ALPHA_I, f42 + } + ;; + { .mmf + (p21) STFD [X1] = f43 + (p21) add X1 = INCX, X1 + (p21) FMA f91 = ALPHA_I, f85, f91 + } + { .mfb + (p16) LDFD f38 = [Y1], INCX + (p20) FMPY f42 = ALPHA_R, f42 + } + ;; + { .mmf + (p21) STFD [X1] = f7, 1 * SIZE + (p21) FMS f13 = ALPHA_R, f97, f13 + } + { .mfb + (p16) LDFD f44 = [Y1], 1 * SIZE + (p20) FMPY f7 = ALPHA_I, f54 + } + ;; + { .mmf + (p21) STFD [X1] = f55 + (p21) add X1 = INCX, X1 + (p21) FMA f103 = ALPHA_I, f97, f103 + } + { .mfb + (p16) LDFD f50 = [Y1], INCX + (p20) FMPY f54 = ALPHA_R, f54 + } + ;; + { .mmf + (p21) STFD [X1] = f10, 1 * SIZE + (p21) FMS f14 = ALPHA_R, f109, f14 + } + { .mfb + (p16) LDFD f56 = [Y1], 1 * SIZE + (p20) FMPY f10 = ALPHA_I, f66 + } + ;; + { .mmf + (p21) STFD [X1] = f67 + (p21) add X1 = INCX, X1 + (p21) FMA f115 = ALPHA_I, f109, f115 + } + { .mfb + (p16) LDFD f62 = [Y1], INCX + (p20) FMPY f66 = ALPHA_R, f66 + } + ;; + { .mmf + (p21) STFD [X1] = f11, 1 * SIZE + (p21) FMS f15 = ALPHA_R, f121, f15 + } + { .mfb + (p16) LDFD f68 = [Y1], 1 * SIZE + (p20) FMPY f11 = ALPHA_I, f78 + } + ;; + { .mmf + (p21) STFD [X1] = f79 + (p21) add X1 = INCX, X1 + (p21) FMA f127 = ALPHA_I, f121, f127 + } + { .mfb + (p16) LDFD f74 = [Y1], INCX + (p20) FMPY f78 = ALPHA_R, f78 + } + ;; + { .mmf + (p21) STFD [X1] = f12, 1 * SIZE + (p20) FMS f6 = ALPHA_R, f36, f6 + } + { .mfb + (p16) LDFD f80 = [Y1], 1 * SIZE + (p20) FMPY f12 = ALPHA_I, f90 + } + ;; + { .mmf + (p21) STFD [X1] = f91 + (p21) add X1 = INCX, X1 + (p20) FMA f42 = ALPHA_I, f36, f42 + } + { .mfb + (p16) LDFD f86 = [Y1], INCX + (p20) FMPY f90 = ALPHA_R, f90 + } + ;; + { .mmf + (p21) STFD [X1] = f13, 1 * SIZE + (p20) FMS f7 = ALPHA_R, f48, f7 + } + { .mfb + (p16) LDFD f92 = [Y1], 1 * SIZE + (p20) FMPY f13 = ALPHA_I, f102 + } + ;; + { .mmf + (p21) STFD [X1] = f103 + (p21) add X1 = INCX, X1 + (p20) FMA f54 = ALPHA_I, f48, f54 + } + { .mfb + (p16) LDFD f98 = [Y1], INCX + (p20) FMPY f102 = ALPHA_R, f102 + } + ;; + { .mmf + (p21) STFD [X1] = f14, 1 * SIZE + (p20) FMS f10 = ALPHA_R, f60, f10 + } + { .mfb + (p16) LDFD f104 = [Y1], 1 * SIZE + (p20) FMPY f14 = ALPHA_I, f114 + } + ;; + { .mmf + (p21) STFD [X1] = f115 + (p21) add X1 = INCX, X1 + (p20) FMA f66 = ALPHA_I, f60, f66 + } + { .mfb + (p16) LDFD f110 = [Y1], INCX + (p20) FMPY f114 = ALPHA_R, f114 + } + ;; + { .mmf + (p21) STFD [X1] = f15, 1 * SIZE + (p20) FMS f11 = ALPHA_R, f72, f11 + } + { .mfb + (p16) LDFD f116 = [Y1], 1 * SIZE + (p20) FMPY f15 = ALPHA_I, f126 + } + ;; + { .mmf + (p21) STFD [X1] = f127 + (p21) add X1 = INCX, X1 + (p20) FMA f78 = ALPHA_I, f72, f78 + } + { .mfb + (p16) LDFD f122 = [Y1], INCX + (p20) FMPY f126 = ALPHA_R, f126 + br.ctop.sptk.few .L160 + } + ;; + .align 16 + +.L170: + { .mmi + (p13) LDFD f48 = [Y1], 1 * SIZE + mov ar.lc = ARLC + } + ;; + { .mib + (p13) LDFD f49 = [Y1], INCX + mov pr = PR, -65474 + (p9) br.ret.sptk.many b0 + } + ;; + (p13) LDFD f50 = [Y1], 1 * SIZE + tbit.z p0, p14 = N, 1 + ;; + (p13) LDFD f51 = [Y1], INCX + tbit.z p0, p15 = N, 0 + ;; + (p13) LDFD f52 = [Y1], 1 * SIZE + ;; + (p13) LDFD f53 = [Y1], INCX + ;; + (p13) LDFD f54 = [Y1], 1 * SIZE + (p13) FMPY f112 = ALPHA_I, f48 + ;; + (p13) LDFD f55 = [Y1], INCX + (p13) FMPY f111 = ALPHA_I, f49 + ;; + (p14) LDFD f56 = [Y1], 1 * SIZE + (p13) FMPY f114 = ALPHA_I, f50 + ;; + (p14) LDFD f57 = [Y1], INCX + (p13) FMPY f113 = ALPHA_I, f51 + ;; + (p14) LDFD f58 = [Y1], 1 * SIZE + (p13) FMPY f116 = ALPHA_I, f52 + ;; + (p14) LDFD f59 = [Y1], INCX + (p13) FMPY f115 = ALPHA_I, f53 + ;; + (p15) LDFD f60 = [Y1], 1 * SIZE + (p13) FMPY f118 = ALPHA_I, f54 + ;; + (p15) LDFD f61 = [Y1], INCX + (p13) FMPY f117 = ALPHA_I, f55 + ;; + (p14) FMPY f120 = ALPHA_I, f56 + (p14) FMPY f119 = ALPHA_I, f57 + (p14) FMPY f122 = ALPHA_I, f58 + (p14) FMPY f121 = ALPHA_I, f59 + (p15) FMPY f124 = ALPHA_I, f60 + (p15) FMPY f123 = ALPHA_I, f61 + ;; + (p13) FMS f48 = ALPHA_R, f48, f111 + (p13) FMA f49 = ALPHA_R, f49, f112 + (p13) FMS f50 = ALPHA_R, f50, f113 + (p13) FMA f51 = ALPHA_R, f51, f114 + + ;; + (p13) STFD [X1] = f48, 1 * SIZE + (p13) FMS f52 = ALPHA_R, f52, f115 + ;; + (p13) STFD [X1] = f49 + (p13) add X1 = INCX, X1 + (p13) FMA f53 = ALPHA_R, f53, f116 + ;; + (p13) STFD [X1] = f50, 1 * SIZE + (p13) FMS f54 = ALPHA_R, f54, f117 + ;; + (p13) STFD [X1] = f51 + (p13) add X1 = INCX, X1 + (p13) FMA f55 = ALPHA_R, f55, f118 + ;; + (p13) STFD [X1] = f52, 1 * SIZE + (p14) FMS f56 = ALPHA_R, f56, f119 + ;; + (p13) STFD [X1] = f53 + (p13) add X1 = INCX, X1 + (p14) FMA f57 = ALPHA_R, f57, f120 + ;; + (p13) STFD [X1] = f54, 1 * SIZE + (p14) FMS f58 = ALPHA_R, f58, f121 + ;; + (p13) STFD [X1] = f55 + (p13) add X1 = INCX, X1 + (p14) FMA f59 = ALPHA_R, f59, f122 + ;; + (p14) STFD [X1] = f56, 1 * SIZE + (p15) FMS f60 = ALPHA_R, f60, f123 + ;; + (p14) STFD [X1] = f57 + (p14) add X1 = INCX, X1 + (p15) FMA f61 = ALPHA_R, f61, f124 + ;; + (p14) STFD [X1] = f58, 1 * SIZE + ;; + (p14) STFD [X1] = f59 + (p14) add X1 = INCX, X1 + ;; + (p15) STFD [X1] = f60, 1 * SIZE + ;; + (p15) STFD [X1] = f61 + mov pr = PR, -65474 + br.ret.sptk.many b0 + + EPILOGUE diff --git a/kernel/ia64/zswap.S b/kernel/ia64/zswap.S new file mode 100644 index 0000000..8251b14 --- /dev/null +++ b/kernel/ia64/zswap.S @@ -0,0 +1,476 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef XDOUBLE +#define PREFETCH_SIZE ( 8 * 16) +#elif defined(DOUBLE) +#define PREFETCH_SIZE (16 * 16) +#else +#define PREFETCH_SIZE (32 * 16) +#endif + +#define SP r12 + +#ifdef XDOUBLE +#define N r32 +#define X r14 +#define INCX r15 +#define Y r16 +#define INCY r17 +#else +#define N r32 +#define X r37 +#define INCX r38 +#define Y r39 +#define INCY r36 +#endif + +#define PRE1 r2 +#define PRE2 r3 + +#define I r18 +#define J r19 +#define YY r20 +#define XX r21 +#define INCXM1 r22 +#define INCYM1 r23 +#define INCX8 r24 +#define INCY8 r25 + +#define PR r30 +#define ARLC r31 + + + PROLOGUE + .prologue + PROFCODE + + { .mmi + adds r14 = 16, SP + adds r15 = 24, SP + adds r16 = 32, SP + } + { .mmb + adds r17 = 40, SP + cmp.gt p15, p0 = r0, N + (p15) br.ret.sptk.many b0 + } + ;; +#ifdef XDOUBLE + { .mmi + ld8 X = [r14] + ld8 INCX = [r15] + nop __LINE__ + } + { .mmi + ld8 Y = [r16] + ld8 INCY = [r17] + nop __LINE__ + } + ;; +#else + { .mmi + ld8 INCY = [r14] + nop __LINE__ + nop __LINE__ + } + ;; +#endif + { .mii + .save ar.lc, ARLC + mov ARLC = ar.lc + shl INCX = INCX, ZBASE_SHIFT + } + ;; + .body + { .mii + and J = 7, N + mov PR = pr + shl INCY = INCY, ZBASE_SHIFT + } + ;; + { .mmi + mov XX = X + mov YY = Y + shr I = N, 3 + } + ;; + { .mmi + adds I = -1, I + cmp.eq p9, p0 = r0, J + mov pr.rot = 0 + } + ;; + { .mmi + shladd INCX8 = INCX, 3, r0 + shladd INCY8 = INCY, 3, r0 + mov ar.ec= 3 + } + { .mmi + adds INCXM1 = -SIZE, INCX + adds INCYM1 = -SIZE, INCY + cmp.eq p16, p0 = r0, r0 + } + ;; + { .mmi + adds PRE1 = PREFETCH_SIZE * SIZE, X + adds PRE2 = PREFETCH_SIZE * SIZE, Y + mov ar.lc = I + } + { .mib + cmp.eq p8 ,p0 = -1, I + tbit.z p0, p12 = J, 2 + (p8) br.cond.dpnt .L55 + } + ;; + .align 32 + +.L52: + { .mmi + (p18) STFD [XX] = f37, 1 * SIZE + (p18) STFD [YY] = f34, 1 * SIZE + } + { .mmi + (p16) LDFD f32 = [X], 1 * SIZE + (p16) LDFD f35 = [Y], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [XX] = f43 + (p18) STFD [YY] = f40 + (p18) add XX = XX, INCXM1 + } + { .mmi + (p16) LDFD f38 = [X], INCXM1 + (p16) LDFD f41 = [Y], INCYM1 + (p18) add YY = YY, INCYM1 + } + ;; + { .mmi + (p18) STFD [XX] = f49, 1 * SIZE + (p18) STFD [YY] = f46, 1 * SIZE + } + { .mmi + (p16) LDFD f44 = [X], 1 * SIZE + (p16) LDFD f47 = [Y], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [XX] = f55 + (p18) STFD [YY] = f52 + (p18) add XX = XX, INCXM1 + } + { .mmi + (p16) LDFD f50 = [X], INCXM1 + (p16) LDFD f53 = [Y], INCYM1 + (p18) add YY = YY, INCYM1 + } + ;; + { .mmi + (p18) STFD [XX] = f61, 1 * SIZE + (p18) STFD [YY] = f58, 1 * SIZE + } + { .mmi + (p16) LDFD f56 = [X], 1 * SIZE + (p16) LDFD f59 = [Y], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [XX] = f67 + (p18) STFD [YY] = f64 + (p18) add XX = XX, INCXM1 + } + { .mmi + (p16) LDFD f62 = [X], INCXM1 + (p16) LDFD f65 = [Y], INCYM1 + (p18) add YY = YY, INCYM1 + } + ;; + { .mmi + (p18) STFD [XX] = f73, 1 * SIZE + (p18) STFD [YY] = f70, 1 * SIZE + } + { .mmi + (p16) LDFD f68 = [X], 1 * SIZE + (p16) LDFD f71 = [Y], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [XX] = f79 + (p18) STFD [YY] = f76 + (p18) add XX = XX, INCXM1 + } + { .mmi + (p16) LDFD f74 = [X], INCXM1 + (p16) LDFD f77 = [Y], INCYM1 + (p18) add YY = YY, INCYM1 + } + ;; + { .mmi + (p18) STFD [XX] = f85, 1 * SIZE + (p18) STFD [YY] = f82, 1 * SIZE + } + { .mmi + (p16) LDFD f80 = [X], 1 * SIZE + (p16) LDFD f83 = [Y], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [XX] = f91 + (p18) STFD [YY] = f88 + (p18) add XX = XX, INCXM1 + } + { .mmi + (p16) LDFD f86 = [X], INCXM1 + (p16) LDFD f89 = [Y], INCYM1 + (p18) add YY = YY, INCYM1 + } + ;; + { .mmi + (p18) STFD [XX] = f97, 1 * SIZE + (p18) STFD [YY] = f94, 1 * SIZE + } + { .mmi + (p16) LDFD f92 = [X], 1 * SIZE + (p16) LDFD f95 = [Y], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [XX] = f103 + (p18) STFD [YY] = f100 + (p18) add XX = XX, INCXM1 + } + { .mmi + (p16) LDFD f98 = [X], INCXM1 + (p16) LDFD f101 = [Y], INCYM1 + (p18) add YY = YY, INCYM1 + } + ;; + { .mmi + (p18) STFD [XX] = f109, 1 * SIZE + (p18) STFD [YY] = f106, 1 * SIZE + } + { .mmi + (p16) LDFD f104 = [X], 1 * SIZE + (p16) LDFD f107 = [Y], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [XX] = f115 + (p18) STFD [YY] = f112 + (p18) add XX = XX, INCXM1 + } + { .mmi + (p16) LDFD f110 = [X], INCXM1 + (p16) LDFD f113 = [Y], INCYM1 + (p18) add YY = YY, INCYM1 + } + ;; + { .mmi + (p18) STFD [XX] = f121, 1 * SIZE + (p18) STFD [YY] = f118, 1 * SIZE + } + { .mmi + (p16) LDFD f116 = [X], 1 * SIZE + (p16) LDFD f119 = [Y], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [XX] = f127 + (p18) STFD [YY] = f124 + (p18) add XX = XX, INCXM1 + } + { .mmi + (p16) LDFD f122 = [X], INCXM1 + (p16) LDFD f125 = [Y], INCYM1 + (p18) add YY = YY, INCYM1 + } + { .mmb + (p16) lfetch.excl.nt1 [PRE1], INCX8 + (p16) lfetch.excl.nt1 [PRE2], INCY8 + br.ctop.sptk.few .L52 + } + ;; + .align 32 + +.L55: + { .mmi + (p12) LDFD f32 = [X], 1 * SIZE + (p12) LDFD f80 = [Y], 1 * SIZE + mov ar.lc = ARLC + } + ;; + { .mmi + (p12) LDFD f33 = [X], INCXM1 + (p12) LDFD f81 = [Y], INCYM1 + mov pr = PR, -65474 + } + ;; + { .mmb + (p12) LDFD f34 = [X], 1 * SIZE + (p12) LDFD f82 = [Y], 1 * SIZE + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFD f35 = [X], INCXM1 + (p12) LDFD f83 = [Y], INCYM1 + tbit.z p0, p13 = N, 1 + } + ;; + { .mmi + (p12) LDFD f36 = [X], 1 * SIZE + (p12) LDFD f84 = [Y], 1 * SIZE + tbit.z p0, p14 = N, 0 + } + ;; + { .mmi + (p12) LDFD f37 = [X], INCXM1 + (p12) LDFD f85 = [Y], INCYM1 + } + ;; + { .mmi + (p12) STFD [XX] = f80, 1 * SIZE + (p12) STFD [YY] = f32, 1 * SIZE + } + { .mmi + (p12) LDFD f38 = [X], 1 * SIZE + (p12) LDFD f86 = [Y], 1 * SIZE + } + ;; + { .mmi + (p12) STFD [XX] = f81 + (p12) STFD [YY] = f33 + (p12) add XX = XX, INCXM1 + } + { .mmi + (p12) LDFD f39 = [X], INCXM1 + (p12) LDFD f87 = [Y], INCYM1 + (p12) add YY = YY, INCYM1 + } + ;; + { .mmi + (p12) STFD [XX] = f82, 1 * SIZE + (p12) STFD [YY] = f34, 1 * SIZE + } + { .mmi + (p13) LDFD f40 = [X], 1 * SIZE + (p13) LDFD f88 = [Y], 1 * SIZE + } + ;; + { .mmi + (p12) STFD [XX] = f83 + (p12) STFD [YY] = f35 + (p12) add XX = XX, INCXM1 + } + { .mmi + (p13) LDFD f41 = [X], INCXM1 + (p13) LDFD f89 = [Y], INCYM1 + (p12) add YY = YY, INCYM1 + } + ;; + { .mmi + (p12) STFD [XX] = f84, 1 * SIZE + (p12) STFD [YY] = f36, 1 * SIZE + } + { .mmi + (p13) LDFD f42 = [X], 1 * SIZE + (p13) LDFD f90 = [Y], 1 * SIZE + } + ;; + { .mmi + (p12) STFD [XX] = f85 + (p12) STFD [YY] = f37 + (p12) add XX = XX, INCXM1 + } + { .mmi + (p13) LDFD f43 = [X], INCXM1 + (p13) LDFD f91 = [Y], INCYM1 + (p12) add YY = YY, INCYM1 + } + ;; + { .mmi + (p12) STFD [XX] = f86, 1 * SIZE + (p12) STFD [YY] = f38, 1 * SIZE + } + { .mmi + (p14) LDFD f44 = [X], 1 * SIZE + (p14) LDFD f92 = [Y], 1 * SIZE + } + ;; + { .mmi + (p12) STFD [XX] = f87 + (p12) STFD [YY] = f39 + (p12) add XX = XX, INCXM1 + } + { .mmi + (p14) LDFD f45 = [X] + (p14) LDFD f93 = [Y] + (p12) add YY = YY, INCYM1 + } + ;; + { .mmi + (p13) STFD [XX] = f88, 1 * SIZE + (p13) STFD [YY] = f40, 1 * SIZE + } + ;; + (p13) STFD [XX] = f89 + (p13) add XX = XX, INCXM1 + (p13) STFD [YY] = f41 + (p13) add YY = YY, INCYM1 + ;; + (p13) STFD [XX] = f90, 1 * SIZE + (p13) STFD [YY] = f42, 1 * SIZE + ;; + (p13) STFD [XX] = f91 + (p13) add XX = XX, INCXM1 + (p13) STFD [YY] = f43 + (p13) add YY = YY, INCYM1 + ;; + (p14) STFD [XX] = f92, 1 * SIZE + (p14) STFD [YY] = f44, 1 * SIZE + ;; + (p14) STFD [XX] = f93 + (p14) STFD [YY] = f45 + br.ret.sptk.many b0 + ;; + EPILOGUE + diff --git a/kernel/ia64/ztrsm_kernel_LN.S b/kernel/ia64/ztrsm_kernel_LN.S new file mode 100644 index 0000000..ef903e3 --- /dev/null +++ b/kernel/ia64/ztrsm_kernel_LN.S @@ -0,0 +1,10839 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef DOUBLE +#define PREFETCHSIZE (16 * 8) +#else +#define PREFETCHSIZE (32 * 8) +#endif + +#ifndef LN +#define CPREFETCHSIZE 7 +#else +#define CPREFETCHSIZE -8 +#endif +#define CPREFETCH lfetch.excl.nt1 + +#define M r32 +#define N r33 +#define K r34 +#define A r37 +#define B r38 +#define C r39 +#define LDC r35 + +#define I r15 +#define J r16 +#define AOFFSET r17 +#define BOFFSET r18 +#define TEMP r19 +#define L r20 + +#define C1 r21 +#define C2 r22 +#define C3 r23 +#define C4 r24 +#define C5 r25 +#define C6 r26 +#define C7 r27 +#define C8 r28 + +#define PREA r8 +#define PREB r9 +#define PREC r10 +#define SP r12 +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#define ALPHA_R f8 +#define ALPHA_I f9 + +#define AORIG loc0 +#define KK loc1 +#define KK8 loc2 +#define OFFSET loc3 +#define AOFFSET2 loc4 +#define BOFFSET2 loc5 + +#ifndef CONJ +#define FCALC_A FSUB +#define FCALC_B FADD +#define FMA_A FNMA +#define FMA_B FMA +#else +#define FCALC_A FADD +#define FCALC_B FSUB +#define FMA_A FMA +#define FMA_B FNMA +#endif + +#ifndef CONJ +#define FCALC_C FMA +#define FCALC_D FNMA +#else +#define FCALC_C FNMA +#define FCALC_D FMA +#endif + +#ifndef CONJ +#define FMA_C FNMA +#define FMA_D FMA +#define FSUB_A FSUB +#else +#define FMA_C FMA +#define FMA_D FMS +#define FSUB_A FADD +#endif + + + PROLOGUE + .prologue + PROFCODE + + { .mfi + .save ar.pfs, ARPFS + alloc ARPFS = ar.pfs, 8, 8, 0, 0 + mov f64 = f0 + adds r14 = 16, SP + } + { .mfi + nop __LINE__ + mov f65 = f0 + adds r15 = 24, SP + } + ;; + { .mfi + ld8 LDC = [r14] + mov f81 = f0 + mov PR = pr + } + { .mfi + ld8 OFFSET = [r15] + mov f96 = f0 + shr J = N, 2 + } + ;; + { .mfi + shladd LDC = LDC, ZBASE_SHIFT, r0 + mov f97 = f0 + } + { .mfi + nop __LINE__ + mov f113 = f0 + } + ;; +#ifdef LN + { .mmi + setf.sig f32 = M + setf.sig f33 = K + shladd C = M, ZBASE_SHIFT, C + } + ;; + {.mmf + nop __LINE__ + nop __LINE__ + xmpy.l f32 = f32, f33 + } + ;; + { .mmi + getf.sig r2 = f32 + ;; + nop __LINE__ + shladd A = r2, ZBASE_SHIFT, A + } + ;; +#endif + +#ifdef RN + sub KK = r0, OFFSET +#endif + +#ifdef RT + { .mmi + setf.sig f32 = N + setf.sig f33 = K + nop __LINE__ + } + ;; + { .mmi + setf.sig f34 = LDC + nop __LINE__ + nop __LINE__ + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + xmpy.l f33 = f32, f33 + } + { .mmf + nop __LINE__ + sub KK = N, OFFSET + xmpy.l f34 = f32, f34 + } + ;; + { .mmi + getf.sig r2 = f33 + getf.sig r3 = f34 + } + ;; + shladd B = r2, ZBASE_SHIFT, B + add C = r3, C +#endif + ;; + .body + { .mfi + nop __LINE__ + mov f80 = f0 + mov ARLC = ar.lc + } + { .mfb + cmp.ge p6, p0 = 0, J + mov f112 = f0 + (p6) br.cond.dpnt .L050 + } + ;; + .align 16 + +.L010: +#ifdef RT + { .mmi + shladd r3 = LDC, 2, r0 + nop __LINE__ + shl r2 = K, 2 + ZBASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, r3 + nop __LINE__ + } + ;; +#endif + { .mmi + mov C1 = C // coffset1 = c + 0 * ldc + add C2 = LDC, C // coffset2 = c + 1 * ldc + } + { .mmi + adds J = -1, J +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + } + ;; + { .mmi + shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc + shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + { .mib +#ifndef RT + shladd C = LDC, 2, C // coffset += 8 * ldc +#else + nop __LINE__ +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L020 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f72 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f73 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B + mov f72 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 + add AOFFSET = r3, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + + { .mmi + nop __LINE__ + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + tbit.z p12, p0 = L, 0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f88 = f0 + shr L = L, 1 + } + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f89 = f0 + nop __LINE__ + } + ;; + { .mfi + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f104 = f0 + adds L = -1, L + } + { .mfb + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov f105 = f0 + nop __LINE__ + } + ;; + { .mfi + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f120 = f0 + mov ar.lc = L + } + { .mfi + cmp.eq p3, p0 = r0, r0 + mov f121 = f0 + nop __LINE__ + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L038 + ;; + .align 16 + +.L032: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f97 = f32, f53, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f32, f55, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f33, f53, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f33, f55, f112 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 + br.cloop.sptk.few .L032 + } + ;; +.L038: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [BOFFSET], 2 * SIZE + ;; + LDFPD f120, f121 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f88, f80 + FSUB_A f81 = f89, f81 + FSUB f96 = f104, f96 + FSUB_A f97 = f105, f97 + FSUB f112 = f120, f112 + FSUB_A f113 = f121, f113 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [AOFFSET], 2 * SIZE + ;; + LDFPD f120, f121 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f80 = f88, f80 + FSUB f81 = f89, f81 + FSUB f96 = f104, f96 + FSUB f97 = f105, f97 + FSUB f112 = f120, f112 + FSUB f113 = f121, f113 + ;; +#endif + +#ifdef LN + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + FMPY f36 = f120, f96 + FMPY f37 = f121, f96 + FMPY f38 = f120, f112 + FMPY f39 = f121, f112 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + FMA_C f96 = f121, f97, f36 + FMA_D f97 = f120, f97, f37 + FMA_C f112 = f121, f113, f38 + FMA_D f113 = f120, f113, f39 + ;; +#endif + +#ifdef LT + LDFPD f90, f91 = [AOFFSET] + ;; + FMPY f32 = f90, f64 + FMPY f33 = f91, f64 + FMPY f34 = f90, f80 + FMPY f35 = f91, f80 + FMPY f36 = f90, f96 + FMPY f37 = f91, f96 + FMPY f38 = f90, f112 + FMPY f39 = f91, f112 + ;; + FMA_C f64 = f91, f65, f32 + FMA_D f65 = f90, f65, f33 + FMA_C f80 = f91, f81, f34 + FMA_D f81 = f90, f81, f35 + FMA_C f96 = f91, f97, f36 + FMA_D f97 = f90, f97, f37 + FMA_C f112 = f91, f113, f38 + FMA_D f113 = f90, f113, f39 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [BOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [BOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f108, f109 = [BOFFSET], 2 * SIZE + ;; + LDFPD f110, f111 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f126, f127 = [BOFFSET] + adds BOFFSET = - 30 * SIZE, BOFFSET + ;; + + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + + ;; + FNMA f96 = f76, f64, f96 + FMA_A f97 = f77, f64, f97 + ;; + FMA_B f96 = f77, f65, f96 + FNMA f97 = f76, f65, f97 + ;; + FNMA f112 = f78, f64, f112 + FMA_A f113 = f79, f64, f113 + ;; + FMA_B f112 = f79, f65, f112 + FNMA f113 = f78, f65, f113 + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + ;; + + FNMA f96 = f92, f80, f96 + FMA_A f97 = f93, f80, f97 + ;; + FMA_B f96 = f93, f81, f96 + FNMA f97 = f92, f81, f97 + ;; + FNMA f112 = f94, f80, f112 + FMA_A f113 = f95, f80, f113 + ;; + FMA_B f112 = f95, f81, f112 + FNMA f113 = f94, f81, f113 + ;; + FMPY f32 = f108, f96 + FMPY f33 = f109, f96 + ;; + FMA_C f96 = f109, f97, f32 + FMA_D f97 = f108, f97, f33 + ;; + FNMA f112 = f110, f96, f112 + FMA_A f113 = f111, f96, f113 + ;; + FMA_B f112 = f111, f97, f112 + FNMA f113 = f110, f97, f113 + ;; + FMPY f32 = f126, f112 + FMPY f33 = f127, f112 + ;; + FMA_C f112 = f127, f113, f32 + FMA_D f113 = f126, f113, f33 + ;; +#endif + +#ifdef RT + adds BOFFSET = 30 * SIZE, BOFFSET + ;; + LDFPD f72, f73 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f76, f77 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f78, f79 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f88, f89 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f92, f93 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f72, f112 + FMPY f33 = f73, f112 + ;; + FMA_C f112 = f73, f113, f32 + FMA_D f113 = f72, f113, f33 + ;; + FNMA f96 = f74, f112, f96 + FMA_A f97 = f75, f112, f97 + ;; + FMA_B f96 = f75, f113, f96 + FNMA f97 = f74, f113, f97 + ;; + FNMA f80 = f76, f112, f80 + FMA_A f81 = f77, f112, f81 + ;; + FMA_B f80 = f77, f113, f80 + FNMA f81 = f76, f113, f81 + ;; + FNMA f64 = f78, f112, f64 + FMA_A f65 = f79, f112, f65 + ;; + FMA_B f64 = f79, f113, f64 + FNMA f65 = f78, f113, f65 + ;; + FMPY f32 = f88, f96 + FMPY f33 = f89, f96 + ;; + FMA_C f96 = f89, f97, f32 + FMA_D f97 = f88, f97, f33 + ;; + FNMA f80 = f90, f96, f80 + FMA_A f81 = f91, f96, f81 + ;; + FMA_B f80 = f91, f97, f80 + FNMA f81 = f90, f97, f81 + ;; + FNMA f64 = f92, f96, f64 + FMA_A f65 = f93, f96, f65 + ;; + FMA_B f64 = f93, f97, f64 + FNMA f65 = f92, f97, f65 + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f96, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f97, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f81, 5 * SIZE + STFD [AOFFSET2] = f113, 5 * SIZE + ;; + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 + adds C3 = -2 * SIZE, C3 + adds C4 = -2 * SIZE, C4 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C2 ] = f80, SIZE + ;; + STFD [C2 ] = f81, SIZE + ;; + STFD [C3 ] = f96, SIZE + ;; + STFD [C3 ] = f97, SIZE + ;; + STFD [C4 ] = f112, SIZE + ;; + STFD [C4 ] = f113, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 + adds C3 = -2 * SIZE, C3 + adds C4 = -2 * SIZE, C4 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + add AOFFSET = L, AOFFSET + shladd BOFFSET = L, 2, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L020: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L010x + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 + shr L = L, 1 + } + ;; + { .mfi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f98 = f0 + adds L = -1, L + } + { .mfi + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f99 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f114 = f0 + mov ar.lc = L + } + { .mfi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov f115 = f0 + nop __LINE__ + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L028 + ;; + .align 16 + +.L022: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f97 = f32, f53, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f32, f55, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f33, f53, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f33, f55, f112 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f67 = f34, f49, f67 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f83 = f34, f51, f83 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f99 = f34, f53, f99 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f115 = f34, f55, f115 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f66 = f35, f49, f66 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f82 = f35, f51, f82 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f98 = f35, f53, f98 // A4 * B6 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f114 = f35, f55, f114 // A4 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f67 = f42, f57, f67 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f83 = f42, f59, f83 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f99 = f42, f61, f99 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f115 = f42, f63, f115 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f66 = f43, f57, f66 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f82 = f43, f59, f82 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f98 = f43, f61, f98 // A4 * B6 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f114 = f43, f63, f114 // A4 * B8 + br.cloop.sptk.few .L022 + } + ;; +.L028: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [BOFFSET], 2 * SIZE + ;; + LDFPD f106, f107 = [BOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f120, f121 = [BOFFSET], 2 * SIZE + FSUB f64 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f65 = f73, f65 + nop __LINE__ + } + ;; + { .mfi + LDFPD f122, f123 = [BOFFSET] + FSUB f80 = f74, f80 + adds BOFFSET = -14 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FSUB_A f81 = f75, f81 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f96 = f88, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f97 = f89, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f112 = f90, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f113 = f91, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f66 = f104, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f67 = f105, f67 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f82 = f106, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f83 = f107, f83 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f98 = f120, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f99 = f121, f99 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f114 = f122, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f115 = f123, f115 + nop __LINE__ + } + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [AOFFSET], 2 * SIZE + ;; + LDFPD f106, f107 = [AOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f120, f121 = [AOFFSET], 2 * SIZE + FSUB f64 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f65 = f73, f65 + nop __LINE__ + } + ;; + { .mfi + LDFPD f122, f123 = [AOFFSET] + FSUB f66 = f74, f66 + adds AOFFSET = -14 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FSUB f67 = f75, f67 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f80 = f88, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f81 = f89, f81 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f82 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f83 = f91, f83 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f96 = f104, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f97 = f105, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f98 = f106, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f99 = f107, f99 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f112 = f120, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f113 = f121, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f114 = f122, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f115 = f123, f115 + nop __LINE__ + } + ;; +#endif + +#ifdef LN + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f104, f66 + FMPY f33 = f105, f66 + FMPY f34 = f104, f82 + FMPY f35 = f105, f82 + FMPY f36 = f104, f98 + FMPY f37 = f105, f98 + FMPY f38 = f104, f114 + FMPY f39 = f105, f114 + ;; + FMA_C f66 = f105, f67, f32 + FMA_D f67 = f104, f67, f33 + FMA_C f82 = f105, f83, f34 + FMA_D f83 = f104, f83, f35 + FMA_C f98 = f105, f99, f36 + FMA_D f99 = f104, f99, f37 + FMA_C f114 = f105, f115, f38 + FMA_D f115 = f104, f115, f39 + ;; + FNMA f64 = f106, f66, f64 + FMA_A f65 = f107, f66, f65 + FNMA f80 = f106, f82, f80 + FMA_A f81 = f107, f82, f81 + FNMA f96 = f106, f98, f96 + FMA_A f97 = f107, f98, f97 + FNMA f112 = f106, f114, f112 + FMA_A f113 = f107, f114, f113 + ;; + FMA_B f64 = f107, f67, f64 + FNMA f65 = f106, f67, f65 + FMA_B f80 = f107, f83, f80 + FNMA f81 = f106, f83, f81 + FMA_B f96 = f107, f99, f96 + FNMA f97 = f106, f99, f97 + FMA_B f112 = f107, f115, f112 + FNMA f113 = f106, f115, f113 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + FMPY f36 = f120, f96 + FMPY f37 = f121, f96 + FMPY f38 = f120, f112 + FMPY f39 = f121, f112 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + FMA_C f96 = f121, f97, f36 + FMA_D f97 = f120, f97, f37 + FMA_C f112 = f121, f113, f38 + FMA_D f113 = f120, f113, f39 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + FMPY f38 = f72, f112 + FMPY f39 = f73, f112 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + FMA_C f112 = f73, f113, f38 + FMA_D f113 = f72, f113, f39 + ;; + FNMA f66 = f74, f64, f66 + FMA_A f67 = f75, f64, f67 + FNMA f82 = f74, f80, f82 + FMA_A f83 = f75, f80, f83 + FNMA f98 = f74, f96, f98 + FMA_A f99 = f75, f96, f99 + FNMA f114 = f74, f112, f114 + FMA_A f115 = f75, f112, f115 + ;; + FMA_B f66 = f75, f65, f66 + FNMA f67 = f74, f65, f67 + FMA_B f82 = f75, f81, f82 + FNMA f83 = f74, f81, f83 + FMA_B f98 = f75, f97, f98 + FNMA f99 = f74, f97, f99 + FMA_B f114 = f75, f113, f114 + FNMA f115 = f74, f113, f115 + ;; + FMPY f32 = f90, f66 + FMPY f33 = f91, f66 + FMPY f34 = f90, f82 + FMPY f35 = f91, f82 + FMPY f36 = f90, f98 + FMPY f37 = f91, f98 + FMPY f38 = f90, f114 + FMPY f39 = f91, f114 + ;; + FMA_C f66 = f91, f67, f32 + FMA_D f67 = f90, f67, f33 + FMA_C f82 = f91, f83, f34 + FMA_D f83 = f90, f83, f35 + FMA_C f98 = f91, f99, f36 + FMA_D f99 = f90, f99, f37 + FMA_C f114 = f91, f115, f38 + FMA_D f115 = f90, f115, f39 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [BOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [BOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f108, f109 = [BOFFSET], 2 * SIZE + ;; + LDFPD f110, f111 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f126, f127 = [BOFFSET] + adds BOFFSET = - 30 * SIZE, BOFFSET + ;; + + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f66 + FMPY f35 = f73, f66 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f66 = f73, f67, f34 + FMA_D f67 = f72, f67, f35 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + FNMA f82 = f74, f66, f82 + FMA_A f83 = f75, f66, f83 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + FMA_B f82 = f75, f67, f82 + FNMA f83 = f74, f67, f83 + ;; + FNMA f96 = f76, f64, f96 + FMA_A f97 = f77, f64, f97 + FNMA f98 = f76, f66, f98 + FMA_A f99 = f77, f66, f99 + ;; + FMA_B f96 = f77, f65, f96 + FNMA f97 = f76, f65, f97 + FMA_B f98 = f77, f67, f98 + FNMA f99 = f76, f67, f99 + ;; + FNMA f112 = f78, f64, f112 + FMA_A f113 = f79, f64, f113 + FNMA f114 = f78, f66, f114 + FMA_A f115 = f79, f66, f115 + ;; + FMA_B f112 = f79, f65, f112 + FNMA f113 = f78, f65, f113 + FMA_B f114 = f79, f67, f114 + FNMA f115 = f78, f67, f115 + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + FMPY f34 = f90, f82 + FMPY f35 = f91, f82 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + FMA_C f82 = f91, f83, f34 + FMA_D f83 = f90, f83, f35 + ;; + + FNMA f96 = f92, f80, f96 + FMA_A f97 = f93, f80, f97 + FNMA f98 = f92, f82, f98 + FMA_A f99 = f93, f82, f99 + ;; + FMA_B f96 = f93, f81, f96 + FNMA f97 = f92, f81, f97 + FMA_B f98 = f93, f83, f98 + FNMA f99 = f92, f83, f99 + ;; + FNMA f112 = f94, f80, f112 + FMA_A f113 = f95, f80, f113 + FNMA f114 = f94, f82, f114 + FMA_A f115 = f95, f82, f115 + ;; + FMA_B f112 = f95, f81, f112 + FNMA f113 = f94, f81, f113 + FMA_B f114 = f95, f83, f114 + FNMA f115 = f94, f83, f115 + ;; + FMPY f32 = f108, f96 + FMPY f33 = f109, f96 + FMPY f34 = f108, f98 + FMPY f35 = f109, f98 + ;; + FMA_C f96 = f109, f97, f32 + FMA_D f97 = f108, f97, f33 + FMA_C f98 = f109, f99, f34 + FMA_D f99 = f108, f99, f35 + ;; + FNMA f112 = f110, f96, f112 + FMA_A f113 = f111, f96, f113 + FNMA f114 = f110, f98, f114 + FMA_A f115 = f111, f98, f115 + ;; + FMA_B f112 = f111, f97, f112 + FNMA f113 = f110, f97, f113 + FMA_B f114 = f111, f99, f114 + FNMA f115 = f110, f99, f115 + ;; + FMPY f32 = f126, f112 + FMPY f33 = f127, f112 + FMPY f34 = f126, f114 + FMPY f35 = f127, f114 + ;; + FMA_C f112 = f127, f113, f32 + FMA_D f113 = f126, f113, f33 + FMA_C f114 = f127, f115, f34 + FMA_D f115 = f126, f115, f35 + ;; +#endif + +#ifdef RT + adds BOFFSET = 30 * SIZE, BOFFSET + ;; + LDFPD f72, f73 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f76, f77 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f78, f79 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f88, f89 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f92, f93 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f72, f112 + FMPY f33 = f73, f112 + FMPY f34 = f72, f114 + FMPY f35 = f73, f114 + ;; + FMA_C f112 = f73, f113, f32 + FMA_D f113 = f72, f113, f33 + FMA_C f114 = f73, f115, f34 + FMA_D f115 = f72, f115, f35 + ;; + FNMA f96 = f74, f112, f96 + FMA_A f97 = f75, f112, f97 + FNMA f98 = f74, f114, f98 + FMA_A f99 = f75, f114, f99 + ;; + FMA_B f96 = f75, f113, f96 + FNMA f97 = f74, f113, f97 + FMA_B f98 = f75, f115, f98 + FNMA f99 = f74, f115, f99 + ;; + FNMA f80 = f76, f112, f80 + FMA_A f81 = f77, f112, f81 + FNMA f82 = f76, f114, f82 + FMA_A f83 = f77, f114, f83 + ;; + FMA_B f80 = f77, f113, f80 + FNMA f81 = f76, f113, f81 + FMA_B f82 = f77, f115, f82 + FNMA f83 = f76, f115, f83 + ;; + FNMA f64 = f78, f112, f64 + FMA_A f65 = f79, f112, f65 + FNMA f66 = f78, f114, f66 + FMA_A f67 = f79, f114, f67 + ;; + FMA_B f64 = f79, f113, f64 + FNMA f65 = f78, f113, f65 + FMA_B f66 = f79, f115, f66 + FNMA f67 = f78, f115, f67 + ;; + FMPY f32 = f88, f96 + FMPY f33 = f89, f96 + FMPY f34 = f88, f98 + FMPY f35 = f89, f98 + ;; + FMA_C f96 = f89, f97, f32 + FMA_D f97 = f88, f97, f33 + FMA_C f98 = f89, f99, f34 + FMA_D f99 = f88, f99, f35 + ;; + FNMA f80 = f90, f96, f80 + FMA_A f81 = f91, f96, f81 + FNMA f82 = f90, f98, f82 + FMA_A f83 = f91, f98, f83 + ;; + FMA_B f80 = f91, f97, f80 + FNMA f81 = f90, f97, f81 + FMA_B f82 = f91, f99, f82 + FNMA f83 = f90, f99, f83 + ;; + FNMA f64 = f92, f96, f64 + FMA_A f65 = f93, f96, f65 + FNMA f66 = f92, f98, f66 + FMA_A f67 = f93, f98, f67 + ;; + FMA_B f64 = f93, f97, f64 + FNMA f65 = f92, f97, f65 + FMA_B f66 = f93, f99, f66 + FNMA f67 = f92, f99, f67 + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + FMPY f34 = f104, f82 + FMPY f35 = f105, f82 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + FMA_C f82 = f105, f83, f34 + FMA_D f83 = f104, f83, f35 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + FNMA f66 = f106, f82, f66 + FMA_A f67 = f107, f82, f67 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + FMA_B f66 = f107, f83, f66 + FNMA f67 = f106, f83, f67 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f66 + FMPY f35 = f121, f66 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f66 = f121, f67, f34 + FMA_D f67 = f120, f67, f35 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f98, SIZE + ;; + STFD [BOFFSET] = f67, SIZE + STFD [BOFFSET2] = f99, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f114, SIZE + ;; + STFD [BOFFSET] = f83, 5 * SIZE + STFD [BOFFSET2] = f115, 5 * SIZE + ;; + adds BOFFSET = - 16 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f80, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f82, SIZE + ;; + STFD [AOFFSET] = f67, 5 * SIZE + STFD [AOFFSET2] = f83, 5 * SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f97, SIZE + STFD [AOFFSET2] = f113, SIZE + ;; + STFD [AOFFSET] = f98, SIZE + STFD [AOFFSET2] = f114, SIZE + ;; + STFD [AOFFSET] = f99, 5 * SIZE + STFD [AOFFSET2] = f115, 5 * SIZE + ;; + adds AOFFSET = - 16 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 + adds C3 = -4 * SIZE, C3 + adds C4 = -4 * SIZE, C4 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C1 ] = f66, SIZE + ;; + STFD [C1 ] = f67, SIZE + ;; + STFD [C2 ] = f80, SIZE + ;; + STFD [C2 ] = f81, SIZE + ;; + STFD [C2 ] = f82, SIZE + ;; + STFD [C2 ] = f83, SIZE + ;; + + STFD [C3 ] = f96, SIZE + ;; + STFD [C3 ] = f97, SIZE + ;; + STFD [C3 ] = f98, SIZE + ;; + STFD [C3 ] = f99, SIZE + ;; + + STFD [C4 ] = f112, SIZE + ;; + STFD [C4 ] = f113, SIZE + ;; + STFD [C4 ] = f114, SIZE + ;; + STFD [C4 ] = f115, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 + adds C3 = -4 * SIZE, C3 + adds C4 = -4 * SIZE, C4 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 1, AOFFSET + shladd BOFFSET = L, 2, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L010x: +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + shr I = M, 2 + ;; + cmp.eq p6, p7 = 0, I + (p6) br.cond.dpnt .L049 + ;; + .align 16 + +.L011: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + ZBASE_SHIFT + } + { .mfi + shladd r3 = KK, ZBASE_SHIFT, r0 + mov f118 = f0 + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + ;; + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 + nop __LINE__ + } + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mfi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f98 = f0 + adds L = 1, L + } + { .mfi + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f99 = f0 + adds C5 = 4 * SIZE, C1 + } + ;; + { .mfi + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + mov f114 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f115 = f0 + adds C6 = 4 * SIZE, C2 + } + ;; + { .mfi + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + mov f68 = f0 + shr L = L, 1 + } + { .mfi + setf.d f86 = r0 + mov f69 = f0 + adds C7 = 4 * SIZE, C3 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f84 = f0 + adds L = -1, L + } + { .mfi + setf.d f87 = r0 + mov f85 = f0 + adds C8 = 4 * SIZE, C4 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f100 = f0 + mov ar.lc = L + } + { .mfi + setf.d f102 = r0 + mov f101 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f116 = f0 + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + } + { .mfi + setf.d f103 = r0 + mov f117 = f0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + ;; + { .mfi + CPREFETCH [PREC] + mov f70 = f0 + cmp.eq p6, p0 = -1, L + } + { .mfb + setf.d f119 = r0 + mov f71 = f0 + (p6) br.cond.dpnt .L018 + } + ;; + .align 16 + +.L012: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p12) cmp.ne p3, p0 = 0, L + FMA_B f65 = f32, f49, f65 // A1 * B2 + nop __LINE__ + } + ;; +/* 2 */ + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + cmp.ne p4, p5 = 0, L + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; +/* 3 */ + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + FMA_B f97 = f32, f53, f97 // A1 * B6 + nop __LINE__ + } + ;; +/* 4 */ + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + FMA_B f113 = f32, f55, f113 // A1 * B8 + nop __LINE__ + } + ;; +/* 5 */ + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; +/* 6 */ + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; +/* 7 */ + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + FMA_A f96 = f33, f53, f96 // A2 * B6 + nop __LINE__ + } + ;; +/* 8 */ + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + FMA_A f112 = f33, f55, f112 // A2 * B8 + nop __LINE__ + } + ;; +/* 9 */ + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + FMA_B f67 = f34, f49, f67 // A3 * B2 + nop __LINE__ + } + ;; +/* 10 */ + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + FMA_B f83 = f34, f51, f83 // A3 * B4 + nop __LINE__ + } + ;; +/* 11 */ + { .mfb + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f99 = f34, f53, f99 // A3 * B6 + nop __LINE__ + } + ;; +/* 12 */ + { .mfb + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f115 = f34, f55, f115 // A3 * B8 + nop __LINE__ + } + ;; +/* 13 */ + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfb + nop __LINE__ + FMA_A f66 = f35, f49, f66 // A4 * B2 + nop __LINE__ + } + ;; +/* 14 */ + { .mfb + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f82 = f35, f51, f82 // A4 * B4 + nop __LINE__ + } + ;; +/* 15 */ + { .mfb + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f98 = f35, f53, f98 // A4 * B6 + nop __LINE__ + } + ;; +/* 16 */ + { .mfb + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f114 = f35, f55, f114 // A4 * B8 + nop __LINE__ + } + ;; +/* 17 */ + { .mfb + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f69 = f36, f49, f69 // A5 * B2 + nop __LINE__ + } + ;; +/* 18 */ + { .mfb + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f85 = f36, f51, f85 // A5 * B4 + nop __LINE__ + } + ;; +/* 19 */ + { .mfb + nop __LINE__ + FMA f100 = f36, f52, f100 // A5 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f101 = f36, f53, f101 // A5 * B6 + nop __LINE__ + } + ;; +/* 20 */ + { .mfb + nop __LINE__ + FMA f116 = f36, f54, f116 // A5 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f117 = f36, f55, f117 // A5 * B8 + nop __LINE__ + } + ;; +/* 21 */ + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f68 = f37, f49, f68 // A6 * B2 + nop __LINE__ + } + ;; +/* 22 */ + { .mfb + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f84 = f37, f51, f84 // A6 * B4 + nop __LINE__ + } + ;; +/* 23 */ + { .mfb + nop __LINE__ + FMA f101 = f37, f52, f101 // A6 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f100 = f37, f53, f100 // A6 * B6 + nop __LINE__ + } + ;; +/* 24 */ + { .mfb + nop __LINE__ + FMA f117 = f37, f54, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f116 = f37, f55, f116 // A6 * B8 + nop __LINE__ + } + ;; +/* 25 */ + { .mfb + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f71 = f38, f49, f71 // A7 * B2 + nop __LINE__ + } + ;; +/* 26 */ + { .mfb + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f87 = f38, f51, f87 // A7 * B4 + nop __LINE__ + } + ;; +/* 27 */ + { .mfb + nop __LINE__ + FMA f102 = f38, f52, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f103 = f38, f53, f103 // A7 * B6 + nop __LINE__ + } + ;; +/* 28 */ + { .mfb + nop __LINE__ + FMA f118 = f38, f54, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f119 = f38, f55, f119 // A7 * B8 + nop __LINE__ + } + ;; +/* 29 */ + { .mfb + nop __LINE__ + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f70 = f39, f49, f70 // A8 * B2 + nop __LINE__ + } + ;; +/* 30 */ + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f86 = f39, f51, f86 // A8 * B4 + nop __LINE__ + } + ;; +/* 31 */ + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f103 = f39, f52, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f102 = f39, f53, f102 // A8 * B6 + nop __LINE__ + } + ;; +/* 32 */ + { .mfb + nop __LINE__ + FMA f119 = f39, f54, f119 // A8 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f118 = f39, f55, f118 // A8 * B8 + nop __LINE__ + } + ;; +/* 33 */ + { .mfb + nop __LINE__ + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; +/* 34 */ + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; +/* 35 */ + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 + nop __LINE__ + } + ;; +/* 36 */ + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 + nop __LINE__ + } + ;; +/* 37 */ + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; +/* 38 */ + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; +/* 39 */ + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 + nop __LINE__ + } + ;; +/* 40 */ + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 + nop __LINE__ + } + ;; + /* 41 */ + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f67 = f42, f57, f67 // A3 * B2 + nop __LINE__ + } + ;; +/* 42 */ + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f83 = f42, f59, f83 // A3 * B4 + nop __LINE__ + } + ;; +/* 43 */ + { .mfb + nop __LINE__ + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f99 = f42, f61, f99 // A3 * B6 + nop __LINE__ + } + ;; +/* 44 */ + { .mfb + nop __LINE__ + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f115 = f42, f63, f115 // A3 * B8 + nop __LINE__ + } + ;; +/* 45 */ + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f66 = f43, f57, f66 // A4 * B2 + nop __LINE__ + } + ;; +/* 46 */ + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f82 = f43, f59, f82 // A4 * B4 + nop __LINE__ + } + ;; +/* 47 */ + { .mfb + nop __LINE__ + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f98 = f43, f61, f98 // A4 * B6 + nop __LINE__ + } + ;; +/* 48 */ + { .mfb + nop __LINE__ + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f114 = f43, f63, f114 // A4 * B8 + nop __LINE__ + } + ;; +/* 49 */ + { .mfb + nop __LINE__ + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f69 = f44, f57, f69 // A5 * B2 + nop __LINE__ + } + ;; +/* 50 */ + { .mfb + nop __LINE__ + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f85 = f44, f59, f85 // A5 * B4 + nop __LINE__ + } + ;; +/* 51 */ + { .mfb + nop __LINE__ + (p3) FMA f100 = f44, f60, f100 // A5 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f101 = f44, f61, f101 // A5 * B6 + nop __LINE__ + } + ;; +/* 52 */ + { .mfb + nop __LINE__ + (p3) FMA f116 = f44, f62, f116 // A5 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f117 = f44, f63, f117 // A5 * B8 + nop __LINE__ + } + ;; +/* 53 */ + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f68 = f45, f57, f68 // A6 * B2 + nop __LINE__ + } + ;; +/* 54 */ + { .mfb + nop __LINE__ + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f84 = f45, f59, f84 // A6 * B4 + nop __LINE__ + } + ;; +/* 55 */ + { .mfb + nop __LINE__ + (p3) FMA f101 = f45, f60, f101 // A6 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f100 = f45, f61, f100 // A6 * B6 + nop __LINE__ + } + ;; +/* 56 */ + { .mfb + nop __LINE__ + (p3) FMA f117 = f45, f62, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f116 = f45, f63, f116 // A6 * B8 + nop __LINE__ + } + ;; +/* 57 */ + { .mfb + nop __LINE__ + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f71 = f46, f57, f71 // A7 * B2 + nop __LINE__ + } + ;; +/* 58 */ + { .mfb + nop __LINE__ + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f87 = f46, f59, f87 // A7 * B4 + nop __LINE__ + } + ;; +/* 59 */ + { .mfb + nop __LINE__ + (p3) FMA f102 = f46, f60, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f103 = f46, f61, f103 // A7 * B6 + nop __LINE__ + } + ;; +/* 60 */ + { .mfb + nop __LINE__ + (p3) FMA f118 = f46, f62, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f119 = f46, f63, f119 // A7 * B8 + nop __LINE__ + } + ;; +/* 61 */ + { .mfb + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f70 = f47, f57, f70 // A8 * B2 + nop __LINE__ + } + ;; +/* 62 */ + { .mfb + nop __LINE__ + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f86 = f47, f59, f86 // A8 * B4 + nop __LINE__ + } + ;; +/* 63 */ + { .mfb + nop __LINE__ + (p3) FMA f103 = f47, f60, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f102 = f47, f61, f102 // A8 * B6 + nop __LINE__ + } + ;; +/* 64 */ + { .mfi + nop __LINE__ + (p3) FMA f119 = f47, f62, f119 // A8 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f118 = f47, f63, f118 // A8 * B8 + br.cloop.sptk.few .L012 + } + ;; + +.L018: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [BOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [BOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f94, f95 = [BOFFSET], 2 * SIZE + FSUB f64 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f65 = f73, f65 + nop __LINE__ + } + ;; + { .mfi + LDFPD f104, f105 = [BOFFSET], 2 * SIZE + FSUB f80 = f74, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f81 = f75, f81 + nop __LINE__ + } + ;; + { .mfi + LDFPD f106, f107 = [BOFFSET], 2 * SIZE + FSUB f96 = f76, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f97 = f77, f97 + nop __LINE__ + } + ;; + { .mfi + LDFPD f108, f109 = [BOFFSET], 2 * SIZE + FSUB f112 = f78, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f113 = f79, f113 + nop __LINE__ + } + ;; + { .mfi + LDFPD f110, f111 = [BOFFSET], 2 * SIZE + FSUB f66 = f88, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f67 = f89, f67 + nop __LINE__ + } + ;; + { .mfi + LDFPD f120, f121 = [BOFFSET], 2 * SIZE + FSUB f82 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f83 = f91, f83 + nop __LINE__ + } + ;; + { .mfi + LDFPD f122, f123 = [BOFFSET], 2 * SIZE + FSUB f98 = f92, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f99 = f93, f99 + nop __LINE__ + } + ;; + { .mfi + LDFPD f124, f125 = [BOFFSET], 2 * SIZE + FSUB f114 = f94, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f115 = f95, f115 + nop __LINE__ + } + ;; + { .mfi + LDFPD f126, f127 = [BOFFSET] + FSUB f68 = f104, f68 + adds BOFFSET = -30 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FSUB_A f69 = f105, f69 +#ifdef LN + adds AOFFSET = 30 * SIZE, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + LDFPD f72, f73 = [AOFFSET] + FSUB f84 = f106, f84 +#ifdef LN + adds AOFFSET = - 2 * SIZE, AOFFSET +#else + adds AOFFSET = 2 * SIZE, AOFFSET +#endif + } + { .mfi + nop __LINE__ + FSUB_A f85 = f107, f85 + nop __LINE__ + } + ;; + { .mfi + LDFPD f74, f75 = [AOFFSET] + FSUB f100 = f108, f100 +#ifdef LN + adds AOFFSET = - 2 * SIZE, AOFFSET +#else + adds AOFFSET = 2 * SIZE, AOFFSET +#endif + } + { .mfi + nop __LINE__ + FSUB_A f101 = f109, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f116 = f110, f116 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f117 = f111, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f70 = f120, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f71 = f121, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f86 = f122, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f87 = f123, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f102 = f124, f102 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f103 = f125, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f118 = f126, f118 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f119 = f127, f119 + nop __LINE__ + } + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + FSUB f64 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f65 = f73, f65 + nop __LINE__ + } + ;; + { .mfi + LDFPD f94, f95 = [AOFFSET], 2 * SIZE + FSUB f66 = f74, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f67 = f75, f67 + nop __LINE__ + } + ;; + { .mfi + LDFPD f104, f105 = [AOFFSET], 2 * SIZE + FSUB f68 = f76, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f69 = f77, f69 + nop __LINE__ + } + ;; + { .mfi + LDFPD f106, f107 = [AOFFSET], 2 * SIZE + FSUB f70 = f78, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f71 = f79, f71 + nop __LINE__ + } + ;; + { .mfi + LDFPD f108, f109 = [AOFFSET], 2 * SIZE + FSUB f80 = f88, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f81 = f89, f81 + nop __LINE__ + } + ;; + { .mfi + LDFPD f110, f111 = [AOFFSET], 2 * SIZE + FSUB f82 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f83 = f91, f83 + nop __LINE__ + } + ;; + { .mfi + LDFPD f120, f121 = [AOFFSET], 2 * SIZE + FSUB f84 = f92, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f85 = f93, f85 + nop __LINE__ + } + ;; + { .mfi + LDFPD f122, f123 = [AOFFSET], 2 * SIZE + FSUB f86 = f94, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f87 = f95, f87 + nop __LINE__ + } + ;; + { .mfi + LDFPD f124, f125 = [AOFFSET], 2 * SIZE + FSUB f96 = f104, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f97 = f105, f97 + nop __LINE__ + } + ;; + { .mfi + LDFPD f126, f127 = [AOFFSET] + FSUB f98 = f106, f98 + adds AOFFSET = -30 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FSUB f99 = f107, f99 +#ifdef RT + adds BOFFSET = 30 * SIZE, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + LDFPD f72, f73 = [BOFFSET] + FSUB f100 = f108, f100 +#ifdef RN + adds BOFFSET = 2 * SIZE, BOFFSET +#else + adds BOFFSET = - 2 * SIZE, BOFFSET +#endif + } + { .mfi + nop __LINE__ + FSUB f101 = f109, f101 + nop __LINE__ + } + ;; + { .mfi + LDFPD f74, f75 = [BOFFSET] + FSUB f102 = f110, f102 +#ifdef RN + adds BOFFSET = 2 * SIZE, BOFFSET +#else + adds BOFFSET = - 2 * SIZE, BOFFSET +#endif + } + { .mfi + nop __LINE__ + FSUB f103 = f111, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f112 = f120, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f113 = f121, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f114 = f122, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f115 = f123, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f116 = f124, f116 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f117 = f125, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f118 = f126, f118 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f119 = f127, f119 + nop __LINE__ + } + ;; +#endif + +#ifdef LN + { .mfi + LDFPD f76, f77 = [AOFFSET] + FMPY f32 = f72, f70 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f36 = f72, f102 + nop __LINE__ + } + ;; + { .mfi + LDFPD f78, f79 = [AOFFSET] + FMPY f33 = f73, f70 + adds AOFFSET = - 4 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f37 = f73, f102 + nop __LINE__ + } + ;; + { .mfi + LDFPD f88, f89 = [AOFFSET] + FMPY f34 = f72, f86 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f38 = f72, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f90, f91 = [AOFFSET] + FMPY f35 = f73, f86 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f39 = f73, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f92, f93 = [AOFFSET] + FMA_C f70 = f73, f71, f32 + adds AOFFSET = - 6 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_C f102 = f73, f103, f36 + adds C1 = -2 * SIZE, C1 + } + ;; + { .mfi + LDFPD f104, f105 = [AOFFSET] + FMA_D f71 = f72, f71, f33 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_D f103 = f72, f103, f37 + adds C2 = -2 * SIZE, C2 + } + ;; + { .mfi + LDFPD f106, f107 = [AOFFSET] + FMA_C f86 = f73, f87, f34 + adds AOFFSET = - 8 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_C f118 = f73, f119, f38 + adds C3 = -2 * SIZE, C3 + } + ;; + { .mfi + LDFPD f120, f121 = [AOFFSET] + FMA_D f87 = f72, f87, f35 + adds BOFFSET2 = 28 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_D f119 = f72, f119, f39 + adds BOFFSET = 24 * SIZE, BOFFSET + } + ;; + { .mfi + STFD [BOFFSET] = f70, SIZE + FNMA f68 = f74, f70, f68 + adds C4 = -2 * SIZE, C4 + } + { .mfi + STFD [BOFFSET2] = f102, SIZE + FNMA f100 = f74, f102, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f71, SIZE + FMA_A f69 = f75, f70, f69 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f103, SIZE + FMA_A f101 = f75, f102, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f86, SIZE + FNMA f84 = f74, f86, f84 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f118, SIZE + FNMA f116 = f74, f118, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f87, -11 * SIZE + FMA_A f85 = f75, f86, f85 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f119, -11 * SIZE + FMA_A f117 = f75, f118, f117 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f70, SIZE + FMA_B f68 = f75, f71, f68 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f102, SIZE + FMA_B f100 = f75, f103, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f71, -3 * SIZE + FNMA f69 = f74, f71, f69 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f103, -3 * SIZE + FNMA f101 = f74, f103, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f86, SIZE + FMA_B f84 = f75, f87, f84 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f118, SIZE + FMA_B f116 = f75, f119, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f87, -3 * SIZE + FNMA f85 = f74, f87, f85 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f119, -3 * SIZE + FNMA f117 = f74, f119, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f66 = f76, f70, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f98 = f76, f102, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f67 = f77, f70, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f99 = f77, f102, f99 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f82 = f76, f86, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f114 = f76, f118, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f83 = f77, f86, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f115 = f77, f118, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f66 = f77, f71, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f98 = f77, f103, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f67 = f76, f71, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f99 = f76, f103, f99 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f82 = f77, f87, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f114 = f77, f119, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f83 = f76, f87, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f115 = f76, f119, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f64 = f78, f70, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f96 = f78, f102, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f65 = f79, f70, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f97 = f79, f102, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f80 = f78, f86, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f112 = f78, f118, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f81 = f79, f86, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f113 = f79, f118, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f64 = f79, f71, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f96 = f79, f103, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f65 = f78, f71, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f97 = f78, f103, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f80 = f79, f87, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f112 = f79, f119, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f81 = f78, f87, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f113 = f78, f119, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f88, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f88, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f89, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f89, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f88, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f88, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f89, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f89, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f68 = f89, f69, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f100 = f89, f101, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f69 = f88, f69, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f101 = f88, f101, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f84 = f89, f85, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f116 = f89, f117, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f85 = f88, f85, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f117 = f88, f117, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f68, SIZE + FNMA f66 = f90, f68, f66 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f100, SIZE + FNMA f98 = f90, f100, f98 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f69, SIZE + FMA_A f67 = f91, f68, f67 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f101, SIZE + FMA_A f99 = f91, f100, f99 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f84, SIZE + FNMA f82 = f90, f84, f82 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f116, SIZE + FNMA f114 = f90, f116, f114 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f85, -11 * SIZE + FMA_A f83 = f91, f84, f83 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f117, -11 * SIZE + FMA_A f115 = f91, f116, f115 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f68, SIZE + FMA_B f66 = f91, f69, f66 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f100, SIZE + FMA_B f98 = f91, f101, f98 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f69, -3 * SIZE + FNMA f67 = f90, f69, f67 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f101, -3 * SIZE + FNMA f99 = f90, f101, f99 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f84, SIZE + FMA_B f82 = f91, f85, f82 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f116, SIZE + FMA_B f114 = f91, f117, f114 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f85, -3 * SIZE + FNMA f83 = f90, f85, f83 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f117, -3 * SIZE + FNMA f115 = f90, f117, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f64 = f92, f68, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f96 = f92, f100, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f65 = f93, f68, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f97 = f93, f100, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f80 = f92, f84, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f112 = f92, f116, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f81 = f93, f84, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f113 = f93, f116, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f64 = f93, f69, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f96 = f93, f101, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f65 = f92, f69, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f97 = f92, f101, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f80 = f93, f85, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f112 = f93, f117, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f81 = f92, f85, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f113 = f92, f117, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f104, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f104, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f105, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f105, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f104, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f104, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f105, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f105, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f66 = f105, f67, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f98 = f105, f99, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f67 = f104, f67, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f99 = f104, f99, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f82 = f105, f83, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f114 = f105, f115, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f83 = f104, f83, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f115 = f104, f115, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + FNMA f64 = f106, f66, f64 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + FNMA f96 = f106, f98, f96 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f67, SIZE + FMA_A f65 = f107, f66, f65 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f99, SIZE + FMA_A f97 = f107, f98, f97 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + FNMA f80 = f106, f82, f80 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + FNMA f112 = f106, f114, f112 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f83, -11 * SIZE + FMA_A f81 = f107, f82, f81 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f115, -11 * SIZE + FMA_A f113 = f107, f114, f113 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + FMA_B f64 = f107, f67, f64 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f98, SIZE + FMA_B f96 = f107, f99, f96 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, -3 * SIZE + FNMA f65 = f106, f67, f65 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f99, -3 * SIZE + FNMA f97 = f106, f99, f97 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f82, SIZE + FMA_B f80 = f107, f83, f80 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f114, SIZE + FMA_B f112 = f107, f115, f112 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f83, -3 * SIZE + FNMA f81 = f106, f83, f81 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f115, -3 * SIZE + FNMA f113 = f106, f115, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f120, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f120, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f121, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f121, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f120, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f120, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f121, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f121, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f64 = f121, f65, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f96 = f121, f97, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f65 = f120, f65, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f97 = f120, f97, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f80 = f121, f81, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f112 = f121, f113, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f81 = f120, f81, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f113 = f120, f113, f39 + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f81, -3 * SIZE + STFD [BOFFSET2] = f113, -3 * SIZE + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f96, SIZE + mov f96 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, -1 * SIZE + mov f65 = f0 + adds KK = -4, KK + } + { .mfi + STFD [C3 ] = f97, -1 * SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f80, SIZE + mov f80 = f0 + cmp.ne p6, p0 = 1, I + } + { .mfi + STFD [C4 ] = f112, SIZE + mov f112 = f0 + sub L = K, KK + } + ;; + { .mfi + STFD [C2 ] = f81, -1 * SIZE + mov f81 = f0 + adds I = -1, I + } + { .mfb + STFD [C4 ] = f113, -1 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#endif + +#ifdef LT + { .mfi + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + FMPY f32 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f72, f96 + nop __LINE__ + } + ;; + { .mfi + LDFPD f78, f79 = [AOFFSET] + FMPY f33 = f73, f64 + adds AOFFSET = 4 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f37 = f73, f96 + nop __LINE__ + } + ;; + { .mfi + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + FMPY f34 = f72, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f72, f112 + nop __LINE__ + } + ;; + { .mfi + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + FMPY f35 = f73, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f73, f112 + nop __LINE__ + } + ;; + { .mfi + LDFPD f94, f95 = [AOFFSET] + FMA_C f64 = f73, f65, f32 + adds AOFFSET = 6 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_C f96 = f73, f97, f36 + nop __LINE__ + } + ;; + { .mfi + LDFPD f108, f109 = [AOFFSET], 2 * SIZE + FMA_D f65 = f72, f65, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f97 = f72, f97, f37 + nop __LINE__ + } + ;; + { .mfi + LDFPD f110, f111 = [AOFFSET] + FMA_C f80 = f73, f81, f34 + adds AOFFSET = 8 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_C f112 = f73, f113, f38 + nop __LINE__ + } + ;; + { .mfi + LDFPD f126, f127 = [AOFFSET] + FMA_D f81 = f72, f81, f35 + adds AOFFSET = - 30 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_D f113 = f72, f113, f39 + adds BOFFSET2 = 4 * SIZE, BOFFSET + } + ;; + { .mfi + STFD [BOFFSET] = f64, SIZE + FNMA f66 = f74, f64, f66 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f96, SIZE + FNMA f98 = f74, f96, f98 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + FMA_A f67 = f75, f64, f67 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + FMA_A f99 = f75, f96, f99 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f80, SIZE + FNMA f82 = f74, f80, f82 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f112, SIZE + FNMA f114 = f74, f112, f114 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f81, 5 * SIZE + FMA_A f83 = f75, f80, f83 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f113, 5 * SIZE + FMA_A f115 = f75, f112, f115 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + FMA_B f66 = f75, f65, f66 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f96, SIZE + FMA_B f98 = f75, f97, f98 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + FNMA f67 = f74, f65, f67 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f97, SIZE + FNMA f99 = f74, f97, f99 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f80, SIZE + FMA_B f82 = f75, f81, f82 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f112, SIZE + FMA_B f114 = f75, f113, f114 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f81, SIZE + FNMA f83 = f74, f81, f83 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f113, SIZE + FNMA f115 = f74, f113, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f68 = f76, f64, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f100 = f76, f96, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f69 = f77, f64, f69 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f101 = f77, f96, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f84 = f76, f80, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f116 = f76, f112, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f85 = f77, f80, f85 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f117 = f77, f112, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f68 = f77, f65, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f100 = f77, f97, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f69 = f76, f65, f69 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f101 = f76, f97, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f84 = f77, f81, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f116 = f77, f113, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f85 = f76, f81, f85 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f117 = f76, f113, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f70 = f78, f64, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f102 = f78, f96, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f71 = f79, f64, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f103 = f79, f96, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f86 = f78, f80, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f118 = f78, f112, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f87 = f79, f80, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f119 = f79, f112, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f70 = f79, f65, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f102 = f79, f97, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f71 = f78, f65, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f103 = f78, f97, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f86 = f79, f81, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f118 = f79, f113, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f87 = f78, f81, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f119 = f78, f113, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f90, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f90, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f91, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f91, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f90, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f91, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f91, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f66 = f91, f67, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f98 = f91, f99, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f67 = f90, f67, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f99 = f90, f99, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f82 = f91, f83, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f114 = f91, f115, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f83 = f90, f83, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f115 = f90, f115, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + FNMA f68 = f92, f66, f68 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + FNMA f100 = f92, f98, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f67, SIZE + FMA_A f69 = f93, f66, f69 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f99, SIZE + FMA_A f101 = f93, f98, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + FNMA f84 = f92, f82, f84 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + FNMA f116 = f92, f114, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f83, 5 * SIZE + FMA_A f85 = f93, f82, f85 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f115, 5 * SIZE + FMA_A f117 = f93, f114, f117 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + FMA_B f68 = f93, f67, f68 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f98, SIZE + FMA_B f100 = f93, f99, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, SIZE + FNMA f69 = f92, f67, f69 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f99, SIZE + FNMA f101 = f92, f99, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f82, SIZE + FMA_B f84 = f93, f83, f84 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f114, SIZE + FMA_B f116 = f93, f115, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f83, SIZE + FNMA f85 = f92, f83, f85 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f115, SIZE + FNMA f117 = f92, f115, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f70 = f94, f66, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f102 = f94, f98, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f71 = f95, f66, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f103 = f95, f98, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f86 = f94, f82, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f118 = f94, f114, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f87 = f95, f82, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f119 = f95, f114, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f70 = f95, f67, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f102 = f95, f99, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f71 = f94, f67, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f103 = f94, f99, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f86 = f95, f83, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f118 = f95, f115, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f87 = f94, f83, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f119 = f94, f115, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f108, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f108, f100 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f33 = f109, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f109, f100 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f34 = f108, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f108, f116 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f35 = f109, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f109, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f68 = f109, f69, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f100 = f109, f101, f36 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f69 = f108, f69, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f101 = f108, f101, f37 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f84 = f109, f85, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f116 = f109, f117, f38 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f85 = f108, f85, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f117 = f108, f117, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f68, SIZE + FNMA f70 = f110, f68, f70 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f100, SIZE + FNMA f102 = f110, f100, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f69, SIZE + FMA_A f71 = f111, f68, f71 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f101, SIZE + FMA_A f103 = f111, f100, f103 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f84, SIZE + FNMA f86 = f110, f84, f86 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f116, SIZE + FNMA f118 = f110, f116, f118 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f85, 5 * SIZE + FMA_A f87 = f111, f84, f87 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f117, 5 * SIZE + FMA_A f119 = f111, f116, f119 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f68, SIZE + FMA_B f70 = f111, f69, f70 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f100, SIZE + FMA_B f102 = f111, f101, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f69, SIZE + FNMA f71 = f110, f69, f71 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f101, SIZE + FNMA f103 = f110, f101, f103 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f84, SIZE + FMA_B f86 = f111, f85, f86 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f116, SIZE + FMA_B f118 = f111, f117, f118 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f85, SIZE + FNMA f87 = f110, f85, f87 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f117, SIZE + FNMA f119 = f110, f117, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f126, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f126, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f127, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f127, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f126, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f126, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f127, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f127, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f70 = f127, f71, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f102 = f127, f103, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f71 = f126, f71, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f103 = f126, f103, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f86 = f127, f87, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f118 = f127, f119, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f87 = f126, f87, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f119 = f126, f119, f39 + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f70, SIZE + STFD [BOFFSET2] = f102, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f71, SIZE + STFD [BOFFSET2] = f103, SIZE + sub r2 = K, KK + } + ;; + { .mmi + STFD [BOFFSET] = f86, SIZE + STFD [BOFFSET2] = f118, SIZE + adds KK = 4, KK + } + ;; + { .mmi + STFD [BOFFSET] = f87, -27 * SIZE + STFD [BOFFSET2] = f119 + shladd r2 = r2, ZBASE_SHIFT, r0 + } + ;; + { .mfi + STFD [C1 ] = f70, SIZE + mov f64 = f0 + shladd AOFFSET = r2, 2, AOFFSET + } + { .mfi + STFD [C3 ] = f102, SIZE + mov f65 = f0 + shladd BOFFSET = r2, 2, BOFFSET + } + ;; + { .mfi + STFD [C1 ] = f71, SIZE + mov f80 = f0 + mov L = KK + } + { .mfi + STFD [C3 ] = f103, SIZE + mov f81 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f86, SIZE + mov f96 = f0 + cmp.ne p6, p0 = 1, I + } + { .mfi + STFD [C4 ] = f118, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f87, SIZE + mov f112 = f0 + adds I = -1, I + } + { .mfb + STFD [C4 ] = f119, SIZE + mov f113 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#endif + +#ifdef RN + { .mfi + LDFPD f76, f77 = [BOFFSET], 2 * SIZE + FMPY f32 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f72, f68 + nop __LINE__ + } + ;; + { .mfi + LDFPD f78, f79 = [BOFFSET] + FMPY f33 = f73, f64 + adds BOFFSET = 4 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f37 = f73, f68 + nop __LINE__ + } + ;; + { .mfi + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + FMPY f34 = f72, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f72, f70 + nop __LINE__ + } + ;; + { .mfi + LDFPD f92, f93 = [BOFFSET], 2 * SIZE + FMPY f35 = f73, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f73, f70 + nop __LINE__ + } + ;; + { .mfi + LDFPD f94, f95 = [BOFFSET] + FMA_C f64 = f73, f65, f32 + adds BOFFSET = 6 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_C f68 = f73, f69, f36 + nop __LINE__ + } + ;; + { .mfi + LDFPD f108, f109 = [BOFFSET], 2 * SIZE + FMA_D f65 = f72, f65, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f69 = f72, f69, f37 + nop __LINE__ + } + ;; + { .mfi + LDFPD f110, f111 = [BOFFSET] + FMA_C f66 = f73, f67, f34 + adds BOFFSET = 8 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_C f70 = f73, f71, f38 + nop __LINE__ + } + ;; + { .mfi + LDFPD f126, f127 = [BOFFSET] + FMA_D f67 = f72, f67, f35 + adds BOFFSET = - 30 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_D f71 = f72, f71, f39 + adds AOFFSET2 = 4 * SIZE, AOFFSET + } + ;; + { .mfi + STFD [AOFFSET] = f64, SIZE + FNMA f80 = f74, f64, f80 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f68, SIZE + FNMA f84 = f74, f68, f84 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f65, SIZE + FMA_A f81 = f75, f64, f81 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f69, SIZE + FMA_A f85 = f75, f68, f85 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f66, SIZE + FNMA f82 = f74, f66, f82 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f70, SIZE + FNMA f86 = f74, f70, f86 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f67, 5 * SIZE + FMA_A f83 = f75, f66, f83 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f71, 5 * SIZE + FMA_A f87 = f75, f70, f87 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + FMA_B f80 = f75, f65, f80 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f68, SIZE + FMA_B f84 = f75, f69, f84 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + FNMA f81 = f74, f65, f81 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f69, SIZE + FNMA f85 = f74, f69, f85 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + FMA_B f82 = f75, f67, f82 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f70, SIZE + FMA_B f86 = f75, f71, f86 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, 5 * SIZE + FNMA f83 = f74, f67, f83 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f71, 5 * SIZE + FNMA f87 = f74, f71, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f96 = f76, f64, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f100 = f76, f68, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f97 = f77, f64, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f101 = f77, f68, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f98 = f76, f66, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f102 = f76, f70, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f99 = f77, f66, f99 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f103 = f77, f70, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f96 = f77, f65, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f100 = f77, f69, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f97 = f76, f65, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f101 = f76, f69, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f98 = f77, f67, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f102 = f77, f71, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f99 = f76, f67, f99 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f103 = f76, f71, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f112 = f78, f64, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f116 = f78, f68, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f113 = f79, f64, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f117 = f79, f68, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f114 = f78, f66, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f118 = f78, f70, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f115 = f79, f66, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f119 = f79, f70, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f112 = f79, f65, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f116 = f79, f69, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f113 = f78, f65, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f117 = f78, f69, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f114 = f79, f67, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f118 = f79, f71, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f115 = f78, f67, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f119 = f78, f71, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f90, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f90, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f33 = f91, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f91, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f34 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f90, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f35 = f91, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f91, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f80 = f91, f81, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f84 = f91, f85, f36 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f81 = f90, f81, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f85 = f90, f85, f37 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f82 = f91, f83, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f86 = f91, f87, f38 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f83 = f90, f83, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f87 = f90, f87, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f80, SIZE + FNMA f96 = f92, f80, f96 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f84, SIZE + FNMA f100 = f92, f84, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f81, SIZE + FMA_A f97 = f93, f80, f97 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f85, SIZE + FMA_A f101 = f93, f84, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f82, SIZE + FNMA f98 = f92, f82, f98 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f86, SIZE + FNMA f102 = f92, f86, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f83, 5 * SIZE + FMA_A f99 = f93, f82, f99 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f87, 5 * SIZE + FMA_A f103 = f93, f86, f103 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f80, SIZE + FMA_B f96 = f93, f81, f96 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f84, SIZE + FMA_B f100 = f93, f85, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f81, SIZE + FNMA f97 = f92, f81, f97 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f85, SIZE + FNMA f101 = f92, f85, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f82, SIZE + FMA_B f98 = f93, f83, f98 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f86, SIZE + FMA_B f102 = f93, f87, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f83, 5 * SIZE + FNMA f99 = f92, f83, f99 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f87, 5 * SIZE + FNMA f103 = f92, f87, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f112 = f94, f80, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f116 = f94, f84, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f113 = f95, f80, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f117 = f95, f84, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f114 = f94, f82, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f118 = f94, f86, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f115 = f95, f82, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f119 = f95, f86, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f112 = f95, f81, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f116 = f95, f85, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f113 = f94, f81, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f117 = f94, f85, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f114 = f95, f83, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f118 = f95, f87, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f115 = f94, f83, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f119 = f94, f87, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f108, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f108, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f109, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f109, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f108, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f108, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f109, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f109, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f96 = f109, f97, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f100 = f109, f101, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f97 = f108, f97, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f101 = f108, f101, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f98 = f109, f99, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f102 = f109, f103, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f99 = f108, f99, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f103 = f108, f103, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f96, SIZE + FNMA f112 = f110, f96, f112 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f100, SIZE + FNMA f116 = f110, f100, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f97, SIZE + FMA_A f113 = f111, f96, f113 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f101, SIZE + FMA_A f117 = f111, f100, f117 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f98, SIZE + FNMA f114 = f110, f98, f114 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f102, SIZE + FNMA f118 = f110, f102, f118 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f99, 5 * SIZE + FMA_A f115 = f111, f98, f115 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f103, 5 * SIZE + FMA_A f119 = f111, f102, f119 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f96, SIZE + FMA_B f112 = f111, f97, f112 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f100, SIZE + FMA_B f116 = f111, f101, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f97, SIZE + FNMA f113 = f110, f97, f113 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f101, SIZE + FNMA f117 = f110, f101, f117 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f98, SIZE + FMA_B f114 = f111, f99, f114 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f102, SIZE + FMA_B f118 = f111, f103, f118 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f99, 5 * SIZE + FNMA f115 = f110, f99, f115 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f103, 5 * SIZE + FNMA f119 = f110, f103, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f126, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f126, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f127, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f127, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f126, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f126, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f127, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f127, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f112 = f127, f113, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f116 = f127, f117, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f113 = f126, f113, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f117 = f126, f117, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f114 = f127, f115, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f118 = f127, f119, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f115 = f126, f115, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f119 = f126, f119, f39 + nop __LINE__ + } + ;; + { .mmi + STFD [AOFFSET] = f112, SIZE + STFD [AOFFSET2] = f116, SIZE + sub r2 = K, KK + } + ;; + { .mmi + STFD [AOFFSET] = f113, SIZE + STFD [AOFFSET2] = f117, SIZE + mov L = KK + } + ;; + { .mmi + STFD [AOFFSET] = f114, SIZE + STFD [AOFFSET2] = f118, SIZE + shladd r2 = r2, ZBASE_SHIFT, r0 + } + ;; + { .mmi + STFD [AOFFSET] = f115, -27 * SIZE + STFD [AOFFSET2] = f119 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f112, SIZE + mov f64 = f0 + shladd BOFFSET = r2, 2, BOFFSET + } + { .mfi + STFD [C8 ] = f116, SIZE + mov f65 = f0 + shladd AOFFSET = r2, 2, AOFFSET + } + ;; + { .mfi + STFD [C4 ] = f113, SIZE + mov f80 = f0 + cmp.ne p6, p0 = 1, I + } + { .mfi + STFD [C8 ] = f117, SIZE + mov f81 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f114, SIZE + mov f96 = f0 + adds I = -1, I + } + { .mfi + STFD [C8 ] = f118, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f115, 5 * SIZE + mov f112 = f0 + nop __LINE__ + } + { .mfb + STFD [C8 ] = f119, 5 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L011 + } +#endif + +#ifdef RT + { .mfi + LDFPD f76, f77 = [BOFFSET] + FMPY f32 = f72, f112 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f36 = f72, f116 + nop __LINE__ + } + ;; + { .mfi + LDFPD f78, f79 = [BOFFSET] + FMPY f33 = f73, f112 + adds BOFFSET = - 4 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f37 = f73, f116 + nop __LINE__ + } + ;; + { .mfi + LDFPD f88, f89 = [BOFFSET] + FMPY f34 = f72, f114 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f38 = f72, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f90, f91 = [BOFFSET] + FMPY f35 = f73, f114 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f39 = f73, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f92, f93 = [BOFFSET] + FMA_C f112 = f73, f113, f32 + adds BOFFSET = - 6 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_C f116 = f73, f117, f36 + nop __LINE__ + } + ;; + { .mfi + LDFPD f104, f105 = [BOFFSET] + FMA_D f113 = f72, f113, f33 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_D f117 = f72, f117, f37 + nop __LINE__ + } + ;; + { .mfi + LDFPD f106, f107 = [BOFFSET] + FMA_C f114 = f73, f115, f34 + adds BOFFSET = - 8 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_C f118 = f73, f119, f38 + nop __LINE__ + } + ;; + { .mfi + LDFPD f120, f121 = [BOFFSET] + FMA_D f115 = f72, f115, f35 + adds AOFFSET2 = 28 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_D f119 = f72, f119, f39 + adds AOFFSET = 24 * SIZE, AOFFSET + } + ;; + { .mfi + STFD [AOFFSET] = f112, SIZE + FNMA f96 = f74, f112, f96 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f116, SIZE + FNMA f100 = f74, f116, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f113, SIZE + FMA_A f97 = f75, f112, f97 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f117, SIZE + FMA_A f101 = f75, f116, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f114, SIZE + FNMA f98 = f74, f114, f98 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f118, SIZE + FNMA f102 = f74, f118, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f115, -11 * SIZE + FMA_A f99 = f75, f114, f99 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f119, -11 * SIZE + FMA_A f103 = f75, f118, f103 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f112, SIZE + FMA_B f96 = f75, f113, f96 + nop __LINE__ + } + { .mfi + STFD [C8 ] = f116, SIZE + FMA_B f100 = f75, f117, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f113, SIZE + FNMA f97 = f74, f113, f97 + nop __LINE__ + } + { .mfi + STFD [C8 ] = f117, SIZE + FNMA f101 = f74, f117, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f114, SIZE + FMA_B f98 = f75, f115, f98 + nop __LINE__ + } + { .mfi + STFD [C8 ] = f118, SIZE + FMA_B f102 = f75, f119, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f115, 5 * SIZE + FNMA f99 = f74, f115, f99 + nop __LINE__ + } + { .mfi + STFD [C8 ] = f119, 5 * SIZE + FNMA f103 = f74, f119, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f80 = f76, f112, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f84 = f76, f116, f84 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f81 = f77, f112, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f85 = f77, f116, f85 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f82 = f76, f114, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f86 = f76, f118, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f83 = f77, f114, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f87 = f77, f118, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f80 = f77, f113, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f84 = f77, f117, f84 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f81 = f76, f113, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f85 = f76, f117, f85 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f82 = f77, f115, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f86 = f77, f119, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f83 = f76, f115, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f87 = f76, f119, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f64 = f78, f112, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f68 = f78, f116, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f65 = f79, f112, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f69 = f79, f116, f69 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f66 = f78, f114, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f70 = f78, f118, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f67 = f79, f114, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f71 = f79, f118, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f64 = f79, f113, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f68 = f79, f117, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f65 = f78, f113, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f69 = f78, f117, f69 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f66 = f79, f115, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f70 = f79, f119, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f67 = f78, f115, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f71 = f78, f119, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f88, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f88, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f89, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f89, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f88, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f88, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f89, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f89, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f96 = f89, f97, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f100 = f89, f101, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f97 = f88, f97, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f101 = f88, f101, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f98 = f89, f99, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f102 = f89, f103, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f99 = f88, f99, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f103 = f88, f103, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f96, SIZE + FNMA f80 = f90, f96, f80 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f100, SIZE + FNMA f84 = f90, f100, f84 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f97, SIZE + FMA_A f81 = f91, f96, f81 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f101, SIZE + FMA_A f85 = f91, f100, f85 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f98, SIZE + FNMA f82 = f90, f98, f82 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f102, SIZE + FNMA f86 = f90, f102, f86 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f99, -11 * SIZE + FMA_A f83 = f91, f98, f83 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f103, -11 * SIZE + FMA_A f87 = f91, f102, f87 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f96, SIZE + FMA_B f80 = f91, f97, f80 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f100, SIZE + FMA_B f84 = f91, f101, f84 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f97, SIZE + FNMA f81 = f90, f97, f81 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f101, SIZE + FNMA f85 = f90, f101, f85 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f98, SIZE + FMA_B f82 = f91, f99, f82 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f102, SIZE + FMA_B f86 = f91, f103, f86 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f99, 5 * SIZE + FNMA f83 = f90, f99, f83 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f103, 5 * SIZE + FNMA f87 = f90, f103, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f64 = f92, f96, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f68 = f92, f100, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f65 = f93, f96, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f69 = f93, f100, f69 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f66 = f92, f98, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f70 = f92, f102, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f67 = f93, f98, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f71 = f93, f102, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f64 = f93, f97, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f68 = f93, f101, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f65 = f92, f97, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f69 = f92, f101, f69 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f66 = f93, f99, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f70 = f93, f103, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f67 = f92, f99, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f71 = f92, f103, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f104, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f104, f84 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f105, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f105, f84 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f104, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f104, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f105, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f105, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f80 = f105, f81, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f84 = f105, f85, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f81 = f104, f81, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f85 = f104, f85, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f82 = f105, f83, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f86 = f105, f87, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f83 = f104, f83, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f87 = f104, f87, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f80, SIZE + FNMA f64 = f106, f80, f64 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f84, SIZE + FNMA f68 = f106, f84, f68 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f81, SIZE + FMA_A f65 = f107, f80, f65 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f85, SIZE + FMA_A f69 = f107, f84, f69 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f82, SIZE + FNMA f66 = f106, f82, f66 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f86, SIZE + FNMA f70 = f106, f86, f70 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f83, -11 * SIZE + FMA_A f67 = f107, f82, f67 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f87, -11 * SIZE + FMA_A f71 = f107, f86, f71 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f80, SIZE + FMA_B f64 = f107, f81, f64 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f84, SIZE + FMA_B f68 = f107, f85, f68 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f81, SIZE + FNMA f65 = f106, f81, f65 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f85, SIZE + FNMA f69 = f106, f85, f69 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f82, SIZE + FMA_B f66 = f107, f83, f66 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f86, SIZE + FMA_B f70 = f107, f87, f70 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f83, 5 * SIZE + FNMA f67 = f106, f83, f67 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f87, 5 * SIZE + FNMA f71 = f106, f87, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f120, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f120, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f121, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f121, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f120, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f120, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f121, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f121, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f64 = f121, f65, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f68 = f121, f69, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f65 = f120, f65, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f69 = f120, f69, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f66 = f121, f67, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f70 = f121, f71, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f67 = f120, f67, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f71 = f120, f71, f39 + nop __LINE__ + } + ;; + { .mmi + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + shladd r2 = K, ZBASE_SHIFT, r0 + } + ;; + { .mmi + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + shladd AORIG = r2, 2, AORIG + } + ;; + { .mmi + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [AOFFSET] = f67, -3 * SIZE + STFD [AOFFSET2] = f71 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 + cmp.ne p6, p0 = 1, I + } + { .mfi + STFD [C5 ] = f68, SIZE + mov f81 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + mov f65 = f0 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f69, SIZE + mov f96 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + mov f80 = f0 + sub L = K, KK + } + { .mfi + STFD [C5 ] = f70, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, 5 * SIZE + mov f112 = f0 + adds I = -1, I + } + { .mfb + STFD [C5 ] = f71, 5 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#endif + +.L049: +#ifdef LN + shladd KK8 = K, ZBASE_SHIFT, r0 + ;; + shladd B = KK8, 2, B +#endif + +#if defined(LT) || defined(RN) + mov B = BOFFSET +#endif + +#ifdef RN + adds KK = 4, KK +#endif + +#ifdef RT + adds KK = -4, KK +#endif + ;; + + { .mmb + mov AOFFSET = A + cmp.lt p6, p0 = 0, J + (p6) br.cond.dptk .L010 + } + ;; + .align 16 + +.L050: + { .mib + tbit.z p6, p0 = N, 1 + (p6) br.cond.dpnt .L090 + } + ;; + +#ifdef RT + { .mmi + shladd r3 = LDC, 1, r0 + nop __LINE__ + shl r2 = K, 1 + ZBASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, r3 + nop __LINE__ + } + ;; +#endif + + mov C1 = C + add C2 = LDC, C + ;; +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + ;; +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + { .mib +#ifndef RT + shladd C = LDC, 1, C +#else + nop __LINE__ +#endif + } + ;; + +.L070: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L060 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + add AOFFSET = r3, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mii + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds L = -1, L + } + ;; + { .mmi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L078 + ;; + .align 16 + +.L072: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f96 = f32, f49, f96 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f112 = f32, f51, f112 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mfi + nop __LINE__ + FMA f97 = f33, f49, f97 // A2 * B2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f113 = f33, f51, f113 // A2 * B4 + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f96 = f40, f57, f96 // A1 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f112 = f40, f59, f112 // A1 * B4 + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f97 = f41, f57, f97 // A2 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f59, f113 // A2 * B4 + br.cloop.sptk.few .L072 + } + ;; + { .mfb + nop __LINE__ + FCALC_A f64 = f64, f97 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_A f80 = f80, f113 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f65 = f65, f96 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f81 = f81, f112 + nop __LINE__ + } + ;; +.L078: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB_A f81 = f75, f81 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f80 = f88, f80 + FSUB f81 = f89, f81 + ;; +#endif + +#ifdef LN + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + ;; +#endif + +#ifdef RT + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#if defined(LN) || defined(LT) + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + ;; + STFD [BOFFSET] = f81, SIZE + ;; + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; +#else + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + ;; + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C2 ] = f80, SIZE + ;; + STFD [C2 ] = f81, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + add AOFFSET = L, AOFFSET + shladd BOFFSET = L, 1, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L060: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L051 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mmi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + tbit.z p12, p0 = L, 0 + } + { .mmi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + nop __LINE__ + nop __LINE__ + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L068 + ;; + .align 16 + +.L062: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfb + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f34, f48, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f97 = f34, f49, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f34, f50, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f34, f51, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f97 = f35, f48, f97 // A4 * B1 + } + { .mfb + FMA_A f96 = f35, f49, f96 // A4 * B2 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f113 = f35, f50, f113 // A4 * B3 + nop __LINE__ + } + { .mfb + FMA_A f112 = f35, f51, f112 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f96 = f42, f56, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f42, f57, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f42, f58, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f42, f59, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f43, f56, f97 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f43, f57, f96 // A4 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f113 = f43, f58, f113 // A4 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f43, f59, f112 // A4 * B4 + br.cloop.sptk.few .L062 + } + ;; +.L068: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB_A f81 = f75, f81 + FSUB f96 = f88, f96 + FSUB_A f97 = f89, f97 + FSUB f112 = f90, f112 + FSUB_A f113 = f91, f113 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f96 = f74, f96 + FSUB f97 = f75, f97 + + FSUB f80 = f88, f80 + FSUB f81 = f89, f81 + FSUB f112 = f90, f112 + FSUB f113 = f91, f113 + ;; +#endif + +#ifdef LN + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f104, f96 + FMPY f33 = f105, f96 + FMPY f34 = f104, f112 + FMPY f35 = f105, f112 + ;; + FMA_C f96 = f105, f97, f32 + FMA_D f97 = f104, f97, f33 + FMA_C f112 = f105, f113, f34 + FMA_D f113 = f104, f113, f35 + ;; + FNMA f64 = f106, f96, f64 + FMA_A f65 = f107, f96, f65 + FNMA f80 = f106, f112, f80 + FMA_A f81 = f107, f112, f81 + ;; + FMA_B f64 = f107, f97, f64 + FNMA f65 = f106, f97, f65 + FMA_B f80 = f107, f113, f80 + FNMA f81 = f106, f113, f81 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + ;; + FNMA f96 = f74, f64, f96 + FMA_A f97 = f75, f64, f97 + FNMA f112 = f74, f80, f112 + FMA_A f113 = f75, f80, f113 + ;; + FMA_B f96 = f75, f65, f96 + FNMA f97 = f74, f65, f97 + FMA_B f112 = f75, f81, f112 + FNMA f113 = f74, f81, f113 + ;; + FMPY f32 = f90, f96 + FMPY f33 = f91, f96 + FMPY f34 = f90, f112 + FMPY f35 = f91, f112 + ;; + FMA_C f96 = f91, f97, f32 + FMA_D f97 = f90, f97, f33 + FMA_C f112 = f91, f113, f34 + FMA_D f113 = f90, f113, f35 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f96 + FMPY f35 = f73, f96 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f96 = f73, f97, f34 + FMA_D f97 = f72, f97, f35 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + FNMA f112 = f74, f96, f112 + FMA_A f113 = f75, f96, f113 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + FMA_B f112 = f75, f97, f112 + FNMA f113 = f74, f97, f113 + + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + FMPY f34 = f90, f112 + FMPY f35 = f91, f112 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + FMA_C f112 = f91, f113, f34 + FMA_D f113 = f90, f113, f35 + ;; +#endif + +#ifdef RT + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + FMPY f34 = f104, f112 + FMPY f35 = f105, f112 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + FMA_C f112 = f105, f113, f34 + FMA_D f113 = f104, f113, f35 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + FNMA f96 = f106, f112, f96 + FMA_A f97 = f107, f112, f97 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + FMA_B f96 = f107, f113, f96 + FNMA f97 = f106, f113, f97 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f96 + FMPY f35 = f121, f96 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f96 = f121, f97, f34 + FMA_D f97 = f120, f97, f35 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f80, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f97, 5 * SIZE + STFD [AOFFSET2] = f113, 5 * SIZE + ;; + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C1 ] = f96, SIZE + ;; + STFD [C1 ] = f97, SIZE + ;; + STFD [C2 ] = f80, SIZE + ;; + STFD [C2 ] = f81, SIZE + ;; + STFD [C2 ] = f112, SIZE + ;; + STFD [C2 ] = f113, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 1, AOFFSET + shladd BOFFSET = L, 1, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L051: + shr I = M, 2 + ;; + cmp.eq p6, p7 = 0, I + (p6) br.cond.dpnt .L089 + ;; + .align 16 + +.L052: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 + nop __LINE__ + } + ;; + { .mfi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f98 = f0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + { .mfi + cmp.eq p3, p0 = r0, r0 + mov f99 = f0 + adds L = 1, L + } + ;; + { .mfi + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + mov f114 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + CPREFETCH [PREC], LDC + mov f115 = f0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + adds C5 = 4 * SIZE, C1 + adds L = -1, L + } + ;; + { .mmi + CPREFETCH [PREC], LDC + adds C6 = 4 * SIZE, C2 + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L058 + ;; + .align 16 + +.L053: + { .mfb + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f96 = f34, f48, f96 // A3 * B1 + nop __LINE__ + } + { .mfi + FMA_B f97 = f34, f49, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f112 = f34, f50, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f34, f51, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f97 = f35, f48, f97 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f35, f49, f96 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f113 = f35, f50, f113 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f35, f51, f112 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f66 = f36, f48, f66 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f67 = f36, f49, f67 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f82 = f36, f50, f82 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f83 = f36, f51, f83 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f98 = f38, f48, f98 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f99 = f38, f49, f99 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f114 = f38, f50, f114 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f115 = f38, f51, f115 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f67 = f37, f48, f67 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f66 = f37, f49, f66 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f83 = f37, f50, f83 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f82 = f37, f51, f82 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f99 = f39, f48, f99 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f98 = f39, f49, f98 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f115 = f39, f50, f115 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f114 = f39, f51, f114 // A8 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f96 = f42, f56, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f42, f57, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f112 = f42, f58, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f42, f59, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f43, f56, f97 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f43, f57, f96 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f113 = f43, f58, f113 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f43, f59, f112 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f44, f56, f66 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f67 = f44, f57, f67 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f44, f58, f82 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f83 = f44, f59, f83 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f98 = f46, f56, f98 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f99 = f46, f57, f99 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f114 = f46, f58, f114 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f115 = f46, f59, f115 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f45, f56, f67 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f66 = f45, f57, f66 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f83 = f45, f58, f83 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f82 = f45, f59, f82 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f99 = f47, f56, f99 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f98 = f47, f57, f98 // A8 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f115 = f47, f58, f115 // A8 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f114 = f47, f59, f114 // A8 * B4 + br.cloop.sptk.few .L053 + } + ;; +.L058: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [BOFFSET], 2 * SIZE + ;; + LDFPD f106, f107 = [BOFFSET], 2 * SIZE + ;; + LDFPD f120, f121 = [BOFFSET], 2 * SIZE + ;; + LDFPD f122, f123 = [BOFFSET] + adds BOFFSET = -14 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB_A f81 = f75, f81 + FSUB f96 = f88, f96 + FSUB_A f97 = f89, f97 + FSUB f112 = f90, f112 + FSUB_A f113 = f91, f113 + + FSUB f66 = f104, f66 + FSUB_A f67 = f105, f67 + FSUB f82 = f106, f82 + FSUB_A f83 = f107, f83 + FSUB f98 = f120, f98 + FSUB_A f99 = f121, f99 + FSUB f114 = f122, f114 + FSUB_A f115 = f123, f115 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [AOFFSET] + adds AOFFSET = -14 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f96 = f74, f96 + FSUB f97 = f75, f97 + + FSUB f66 = f76, f66 + FSUB f67 = f77, f67 + FSUB f98 = f78, f98 + FSUB f99 = f79, f99 + + FSUB f80 = f88, f80 + FSUB f81 = f89, f81 + FSUB f112 = f90, f112 + FSUB f113 = f91, f113 + + FSUB f82 = f92, f82 + FSUB f83 = f93, f83 + FSUB f114 = f94, f114 + FSUB f115 = f95, f115 + ;; +#endif + +#ifdef LN + adds AOFFSET = 30 * SIZE, AOFFSET + ;; + LDFPD f72, f73 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f76, f77 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f78, f79 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f88, f89 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f92, f93 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f72, f98 + FMPY f33 = f73, f98 + FMPY f34 = f72, f114 + FMPY f35 = f73, f114 + ;; + FMA_C f98 = f73, f99, f32 + FMA_D f99 = f72, f99, f33 + FMA_C f114 = f73, f115, f34 + FMA_D f115 = f72, f115, f35 + ;; + FNMA f66 = f74, f98, f66 + FMA_A f67 = f75, f98, f67 + FNMA f82 = f74, f114, f82 + FMA_A f83 = f75, f114, f83 + ;; + FMA_B f66 = f75, f99, f66 + FNMA f67 = f74, f99, f67 + FMA_B f82 = f75, f115, f82 + FNMA f83 = f74, f115, f83 + ;; + FNMA f96 = f76, f98, f96 + FMA_A f97 = f77, f98, f97 + FNMA f112 = f76, f114, f112 + FMA_A f113 = f77, f114, f113 + ;; + FMA_B f96 = f77, f99, f96 + FNMA f97 = f76, f99, f97 + FMA_B f112 = f77, f115, f112 + FNMA f113 = f76, f115, f113 + ;; + FNMA f64 = f78, f98, f64 + FMA_A f65 = f79, f98, f65 + FNMA f80 = f78, f114, f80 + FMA_A f81 = f79, f114, f81 + ;; + FMA_B f64 = f79, f99, f64 + FNMA f65 = f78, f99, f65 + FMA_B f80 = f79, f115, f80 + FNMA f81 = f78, f115, f81 + ;; + FMPY f32 = f88, f66 + FMPY f33 = f89, f66 + FMPY f34 = f88, f82 + FMPY f35 = f89, f82 + ;; + FMA_C f66 = f89, f67, f32 + FMA_D f67 = f88, f67, f33 + FMA_C f82 = f89, f83, f34 + FMA_D f83 = f88, f83, f35 + ;; + FNMA f96 = f90, f66, f96 + FMA_A f97 = f91, f66, f97 + FNMA f112 = f90, f82, f112 + FMA_A f113 = f91, f82, f113 + ;; + FMA_B f96 = f91, f67, f96 + FNMA f97 = f90, f67, f97 + FMA_B f112 = f91, f83, f112 + FNMA f113 = f90, f83, f113 + ;; + FNMA f64 = f92, f66, f64 + FMA_A f65 = f93, f66, f65 + FNMA f80 = f92, f82, f80 + FMA_A f81 = f93, f82, f81 + ;; + FMA_B f64 = f93, f67, f64 + FNMA f65 = f92, f67, f65 + FMA_B f80 = f93, f83, f80 + FNMA f81 = f92, f83, f81 + ;; + FMPY f32 = f104, f96 + FMPY f33 = f105, f96 + FMPY f34 = f104, f112 + FMPY f35 = f105, f112 + ;; + FMA_C f96 = f105, f97, f32 + FMA_D f97 = f104, f97, f33 + FMA_C f112 = f105, f113, f34 + FMA_D f113 = f104, f113, f35 + ;; + FNMA f64 = f106, f96, f64 + FMA_A f65 = f107, f96, f65 + FNMA f80 = f106, f112, f80 + FMA_A f81 = f107, f112, f81 + ;; + FMA_B f64 = f107, f97, f64 + FNMA f65 = f106, f97, f65 + FMA_B f80 = f107, f113, f80 + FNMA f81 = f106, f113, f81 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f108, f109 = [AOFFSET], 2 * SIZE + ;; + LDFPD f110, f111 = [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f126, f127 = [AOFFSET] + adds AOFFSET = - 30 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + ;; + FNMA f96 = f74, f64, f96 + FMA_A f97 = f75, f64, f97 + FNMA f112 = f74, f80, f112 + FMA_A f113 = f75, f80, f113 + ;; + FMA_B f96 = f75, f65, f96 + FNMA f97 = f74, f65, f97 + FMA_B f112 = f75, f81, f112 + FNMA f113 = f74, f81, f113 + ;; + FNMA f66 = f76, f64, f66 + FMA_A f67 = f77, f64, f67 + FNMA f82 = f76, f80, f82 + FMA_A f83 = f77, f80, f83 + ;; + FMA_B f66 = f77, f65, f66 + FNMA f67 = f76, f65, f67 + FMA_B f82 = f77, f81, f82 + FNMA f83 = f76, f81, f83 + ;; + FNMA f98 = f78, f64, f98 + FMA_A f99 = f79, f64, f99 + FNMA f114 = f78, f80, f114 + FMA_A f115 = f79, f80, f115 + ;; + FMA_B f98 = f79, f65, f98 + FNMA f99 = f78, f65, f99 + FMA_B f114 = f79, f81, f114 + FNMA f115 = f78, f81, f115 + ;; + FMPY f32 = f90, f96 + FMPY f33 = f91, f96 + FMPY f34 = f90, f112 + FMPY f35 = f91, f112 + ;; + FMA_C f96 = f91, f97, f32 + FMA_D f97 = f90, f97, f33 + FMA_C f112 = f91, f113, f34 + FMA_D f113 = f90, f113, f35 + ;; + FNMA f66 = f92, f96, f66 + FMA_A f67 = f93, f96, f67 + FNMA f82 = f92, f112, f82 + FMA_A f83 = f93, f112, f83 + ;; + FMA_B f66 = f93, f97, f66 + FNMA f67 = f92, f97, f67 + FMA_B f82 = f93, f113, f82 + FNMA f83 = f92, f113, f83 + ;; + FNMA f98 = f94, f96, f98 + FMA_A f99 = f95, f96, f99 + FNMA f114 = f94, f112, f114 + FMA_A f115 = f95, f112, f115 + ;; + FMA_B f98 = f95, f97, f98 + FNMA f99 = f94, f97, f99 + FMA_B f114 = f95, f113, f114 + FNMA f115 = f94, f113, f115 + ;; + FMPY f32 = f108, f66 + FMPY f33 = f109, f66 + FMPY f34 = f108, f82 + FMPY f35 = f109, f82 + ;; + FMA_C f66 = f109, f67, f32 + FMA_D f67 = f108, f67, f33 + FMA_C f82 = f109, f83, f34 + FMA_D f83 = f108, f83, f35 + ;; + FNMA f98 = f110, f66, f98 + FMA_A f99 = f111, f66, f99 + FNMA f114 = f110, f82, f114 + FMA_A f115 = f111, f82, f115 + ;; + FMA_B f98 = f111, f67, f98 + FNMA f99 = f110, f67, f99 + FMA_B f114 = f111, f83, f114 + FNMA f115 = f110, f83, f115 + ;; + FMPY f32 = f126, f98 + FMPY f33 = f127, f98 + FMPY f34 = f126, f114 + FMPY f35 = f127, f114 + ;; + FMA_C f98 = f127, f99, f32 + FMA_D f99 = f126, f99, f33 + FMA_C f114 = f127, f115, f34 + FMA_D f115 = f126, f115, f35 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f96 + FMPY f35 = f73, f96 + FMPY f36 = f72, f66 + FMPY f37 = f73, f66 + FMPY f38 = f72, f98 + FMPY f39 = f73, f98 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f96 = f73, f97, f34 + FMA_D f97 = f72, f97, f35 + FMA_C f66 = f73, f67, f36 + FMA_D f67 = f72, f67, f37 + FMA_C f98 = f73, f99, f38 + FMA_D f99 = f72, f99, f39 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + FNMA f112 = f74, f96, f112 + FMA_A f113 = f75, f96, f113 + FNMA f82 = f74, f66, f82 + FMA_A f83 = f75, f66, f83 + FNMA f114 = f74, f98, f114 + FMA_A f115 = f75, f98, f115 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + FMA_B f112 = f75, f97, f112 + FNMA f113 = f74, f97, f113 + FMA_B f82 = f75, f67, f82 + FNMA f83 = f74, f67, f83 + FMA_B f114 = f75, f99, f114 + FNMA f115 = f74, f99, f115 + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + FMPY f34 = f90, f112 + FMPY f35 = f91, f112 + FMPY f36 = f90, f82 + FMPY f37 = f91, f82 + FMPY f38 = f90, f114 + FMPY f39 = f91, f114 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + FMA_C f112 = f91, f113, f34 + FMA_D f113 = f90, f113, f35 + FMA_C f82 = f91, f83, f36 + FMA_D f83 = f90, f83, f37 + FMA_C f114 = f91, f115, f38 + FMA_D f115 = f90, f115, f39 + ;; +#endif + +#ifdef RT + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + FMPY f34 = f104, f112 + FMPY f35 = f105, f112 + FMPY f36 = f104, f82 + FMPY f37 = f105, f82 + FMPY f38 = f104, f114 + FMPY f39 = f105, f114 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + FMA_C f112 = f105, f113, f34 + FMA_D f113 = f104, f113, f35 + FMA_C f82 = f105, f83, f36 + FMA_D f83 = f104, f83, f37 + FMA_C f114 = f105, f115, f38 + FMA_D f115 = f104, f115, f39 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + FNMA f96 = f106, f112, f96 + FMA_A f97 = f107, f112, f97 + FNMA f66 = f106, f82, f66 + FMA_A f67 = f107, f82, f67 + FNMA f98 = f106, f114, f98 + FMA_A f99 = f107, f114, f99 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + FMA_B f96 = f107, f113, f96 + FNMA f97 = f106, f113, f97 + FMA_B f66 = f107, f83, f66 + FNMA f67 = f106, f83, f67 + FMA_B f98 = f107, f115, f98 + FNMA f99 = f106, f115, f99 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f96 + FMPY f35 = f121, f96 + FMPY f36 = f120, f66 + FMPY f37 = f121, f66 + FMPY f38 = f120, f98 + FMPY f39 = f121, f98 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f96 = f121, f97, f34 + FMA_D f97 = f120, f97, f35 + FMA_C f66 = f121, f67, f36 + FMA_D f67 = f120, f67, f37 + FMA_C f98 = f121, f99, f38 + FMA_D f99 = f120, f99, f39 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f98, SIZE + ;; + STFD [BOFFSET] = f67, SIZE + STFD [BOFFSET2] = f99, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f114, SIZE + ;; + STFD [BOFFSET] = f83, 5 * SIZE + STFD [BOFFSET2] = f115, 5 * SIZE + ;; + adds BOFFSET = - 16 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f66, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f67, SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f98, SIZE + ;; + STFD [AOFFSET] = f97, 5 * SIZE + STFD [AOFFSET2] = f99, 5 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f82, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f83, SIZE + ;; + STFD [AOFFSET] = f112, SIZE + STFD [AOFFSET2] = f114, SIZE + ;; + STFD [AOFFSET] = f113, 5 * SIZE + STFD [AOFFSET2] = f115, 5 * SIZE + ;; + adds AOFFSET = - 16 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -8 * SIZE, C1 + adds C2 = -8 * SIZE, C2 + adds C5 = -8 * SIZE, C5 + adds C6 = -8 * SIZE, C6 +#endif + ;; + STFD [C1 ] = f64, SIZE + STFD [C5 ] = f66, SIZE + ;; + STFD [C1 ] = f65, SIZE + STFD [C5 ] = f67, SIZE + ;; + STFD [C1 ] = f96, SIZE + STFD [C5 ] = f98, SIZE + ;; + STFD [C1 ] = f97, 5 * SIZE + STFD [C5 ] = f99, 5 * SIZE + ;; + STFD [C2 ] = f80, SIZE + STFD [C6 ] = f82, SIZE + ;; + STFD [C2 ] = f81, SIZE + STFD [C6 ] = f83, SIZE + ;; + STFD [C2 ] = f112, SIZE + STFD [C6 ] = f114, SIZE + ;; + STFD [C2 ] = f113, 5 * SIZE + STFD [C6 ] = f115, 5 * SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -8 * SIZE, C1 + adds C2 = -8 * SIZE, C2 + adds C5 = -8 * SIZE, C5 + adds C6 = -8 * SIZE, C6 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 2, AOFFSET + shladd BOFFSET = L, 1, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 4, KK +#elif defined LN + adds KK = -4, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + (p6) br.cond.dptk .L052 + ;; + .align 16 + + +.L089: +#ifdef LN + shladd KK8 = K, ZBASE_SHIFT, r0 + ;; + shladd B = KK8, 1, B +#endif + +#if defined(LT) || defined(RN) + mov B = BOFFSET +#endif + +#ifdef RN + adds KK = 2, KK +#endif + +#ifdef RT + adds KK = -2, KK +#endif + ;; + { .mmi + mov AOFFSET = A + nop __LINE__ + } + ;; + .align 16 + +.L090: + tbit.z p6, p0 = N, 0 + (p6) br.cond.dpnt .L999 + ;; +#ifdef RT + { .mmi + shl r2 = K, ZBASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, LDC + nop __LINE__ + } + ;; +#endif + mov C1 = C + +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + ;; +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + { .mib +#ifndef RT + add C = LDC, C +#else + nop __LINE__ +#endif + } + ;; + +.L110: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L100 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + add BOFFSET = r3, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + add AOFFSET = r3, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mii + nop __LINE__ + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + cmp.eq p3, p0 = r0, r0 + adds L = -1, L + } + ;; + { .mmi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L118 + ;; + .align 16 + +.L112: + { .mfi + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f80 = f32, f49, f80 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f81 = f33, f49, f81 // A2 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f80 = f40, f57, f80 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + adds L = -1, L + } + { .mfb + (p3) FMA f81 = f41, f57, f81 // A2 * B2 + br.cloop.sptk.few .L112 + } + ;; + { .mfb + nop __LINE__ + FCALC_A f64 = f64, f81 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f65 = f65, f80 + nop __LINE__ + } + ;; +.L118: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + add BOFFSET = r2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET] + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + ;; +#else + LDFPD f72, f73 = [AOFFSET] + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + ;; +#endif + +#ifdef LN + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; +#endif + +#ifdef RT + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; +#endif + +#if defined(LN) || defined(LT) + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; +#else + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -2 * SIZE, C1 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + ;; +#ifdef LN + adds C1 = -2 * SIZE, C1 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + add AOFFSET = L, AOFFSET + add BOFFSET = L, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L100: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L091 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + add BOFFSET = r3, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mii + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L108 + ;; + .align 16 + +.L102: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f80 = f32, f49, f80 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfb + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f81 = f33, f49, f81 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f34, f48, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f112 = f34, f49, f112 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f97 = f35, f48, f97 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f113 = f35, f49, f113 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f80 = f40, f57, f80 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f57, f81 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f42, f56, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f112 = f42, f57, f112 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f97 = f43, f56, f97 // A4 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f113 = f43, f57, f113 // A4 * B2 + br.cloop.sptk.few .L102 + } + ;; + { .mfb + nop __LINE__ + FCALC_A f64 = f64, f81 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f65 = f65, f80 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_A f96 = f96, f113 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f97 = f97, f112 + nop __LINE__ + } + ;; +.L108: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + add BOFFSET = r2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f96 = f88, f96 + FSUB_A f97 = f89, f97 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f96 = f88, f96 + FSUB f97 = f89, f97 + ;; +#endif + +#ifdef LN + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f104, f96 + FMPY f33 = f105, f96 + ;; + FMA_C f96 = f105, f97, f32 + FMA_D f97 = f104, f97, f33 + ;; + FNMA f64 = f106, f96, f64 + FMA_A f65 = f107, f96, f65 + ;; + FMA_B f64 = f107, f97, f64 + FNMA f65 = f106, f97, f65 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; + FNMA f96 = f74, f64, f96 + FMA_A f97 = f75, f64, f97 + ;; + FMA_B f96 = f75, f65, f96 + FNMA f97 = f74, f65, f97 + ;; + FMPY f32 = f90, f96 + FMPY f33 = f91, f96 + ;; + FMA_C f96 = f91, f97, f32 + FMA_D f97 = f90, f97, f33 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + ;; +#endif + +#ifdef RT + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + ;; +#endif + +#if defined(LN) || defined(LT) + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f96, SIZE + ;; + STFD [BOFFSET] = f97, SIZE + ;; + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f96, SIZE + ;; + STFD [AOFFSET] = f97, SIZE + ;; + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C5 = -4 * SIZE, C5 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C1 ] = f96, SIZE + ;; + STFD [C1 ] = f97, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C5 = -4 * SIZE, C5 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 1, AOFFSET + add BOFFSET = L, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + .align 16 + +.L091: + shr I = M, 2 + ;; + cmp.eq p6, p7 = 0, I + (p6) br.cond.dpnt .L119 + ;; + .align 16 + +.L092: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + add BOFFSET = r3, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + ;; + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + adds L = 1, L + ;; + { .mfi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + } + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + shr L = L, 1 + } + ;; + { .mfi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds L = -1, L + } + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + CPREFETCH [PREC] + } + ;; + { .mfi + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + mov ar.lc = L + } + { .mmi + adds C5 = 4 * SIZE, C1 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L098 + ;; + .align 16 + +.L093: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f80 = f34, f48, f80 // A3 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f81 = f34, f49, f81 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f36, f48, f96 // A5 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f97 = f36, f49, f97 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f38, f48, f112 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f38, f49, f113 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f81 = f35, f48, f81 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f35, f49, f80 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f97 = f37, f48, f97 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f37, f49, f96 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f113 = f39, f48, f113 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f39, f49, f112 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f42, f56, f80 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f42, f57, f81 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f96 = f44, f56, f96 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f44, f57, f97 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f112 = f46, f56, f112 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f46, f57, f113 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f43, f56, f81 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f43, f57, f80 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f45, f56, f97 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f45, f57, f96 // A6 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f113 = f47, f56, f113 // A8 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f47, f57, f112 // A8 * B2 + br.cloop.sptk.few .L093 + } + ;; +.L098: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + add BOFFSET = r2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB_A f81 = f75, f81 + FSUB f96 = f88, f96 + FSUB_A f97 = f89, f97 + FSUB f112 = f90, f112 + FSUB_A f113 = f91, f113 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB f81 = f75, f81 + FSUB f96 = f88, f96 + FSUB f97 = f89, f97 + FSUB f112 = f90, f112 + FSUB f113 = f91, f113 + ;; +#endif + +#ifdef LN + adds AOFFSET = 30 * SIZE, AOFFSET + ;; + LDFPD f72, f73 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f76, f77 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f78, f79 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f88, f89 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f92, f93 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f72, f112 + FMPY f33 = f73, f112 + ;; + FMA_C f112 = f73, f113, f32 + FMA_D f113 = f72, f113, f33 + ;; + FNMA f96 = f74, f112, f96 + FMA_A f97 = f75, f112, f97 + FNMA f80 = f76, f112, f80 + FMA_A f81 = f77, f112, f81 + FNMA f64 = f78, f112, f64 + FMA_A f65 = f79, f112, f65 + ;; + FMA_B f96 = f75, f113, f96 + FNMA f97 = f74, f113, f97 + FMA_B f80 = f77, f113, f80 + FNMA f81 = f76, f113, f81 + FMA_B f64 = f79, f113, f64 + FNMA f65 = f78, f113, f65 + ;; + FMPY f32 = f88, f96 + FMPY f33 = f89, f96 + ;; + FMA_C f96 = f89, f97, f32 + FMA_D f97 = f88, f97, f33 + ;; + FNMA f80 = f90, f96, f80 + FMA_A f81 = f91, f96, f81 + FNMA f64 = f92, f96, f64 + FMA_A f65 = f93, f96, f65 + ;; + FMA_B f80 = f91, f97, f80 + FNMA f81 = f90, f97, f81 + FMA_B f64 = f93, f97, f64 + FNMA f65 = f92, f97, f65 + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f108, f109 = [AOFFSET], 2 * SIZE + ;; + LDFPD f110, f111 = [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f126, f127 = [AOFFSET] + adds AOFFSET = - 30 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + FNMA f96 = f76, f64, f96 + FMA_A f97 = f77, f64, f97 + FNMA f112 = f78, f64, f112 + FMA_A f113 = f79, f64, f113 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + FMA_B f96 = f77, f65, f96 + FNMA f97 = f76, f65, f97 + FMA_B f112 = f79, f65, f112 + FNMA f113 = f78, f65, f113 + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + ;; + FNMA f96 = f92, f80, f96 + FMA_A f97 = f93, f80, f97 + FNMA f112 = f94, f80, f112 + FMA_A f113 = f95, f80, f113 + ;; + FMA_B f96 = f93, f81, f96 + FNMA f97 = f92, f81, f97 + FMA_B f112 = f95, f81, f112 + FNMA f113 = f94, f81, f113 + ;; + FMPY f32 = f108, f96 + FMPY f33 = f109, f96 + ;; + FMA_C f96 = f109, f97, f32 + FMA_D f97 = f108, f97, f33 + ;; + FNMA f112 = f110, f96, f112 + FMA_A f113 = f111, f96, f113 + ;; + FMA_B f112 = f111, f97, f112 + FNMA f113 = f110, f97, f113 + ;; + FMPY f32 = f126, f112 + FMPY f33 = f127, f112 + ;; + FMA_C f112 = f127, f113, f32 + FMA_D f113 = f126, f113, f33 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + FMPY f38 = f72, f112 + FMPY f39 = f73, f112 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + FMA_C f112 = f73, f113, f38 + FMA_D f113 = f72, f113, f39 + ;; +#endif + +#ifdef RT + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + FMPY f38 = f72, f112 + FMPY f39 = f73, f112 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + FMA_C f112 = f73, f113, f38 + FMA_D f113 = f72, f113, f39 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f96, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f97, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f81, 5 * SIZE + STFD [AOFFSET2] = f113, 5 * SIZE + ;; + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -8 * SIZE, C1 + adds C5 = -8 * SIZE, C5 +#endif + ;; + STFD [C1 ] = f64, SIZE + STFD [C5 ] = f96, SIZE + ;; + STFD [C1 ] = f65, SIZE + STFD [C5 ] = f97, SIZE + ;; + STFD [C1 ] = f80, SIZE + STFD [C5 ] = f112, SIZE + ;; + STFD [C1 ] = f81, 5 * SIZE + STFD [C5 ] = f113, 5 * SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -8 * SIZE, C1 + adds C5 = -8 * SIZE, C5 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 2, AOFFSET + add BOFFSET = L, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 4, KK +#elif defined LN + adds KK = -4, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + (p6) br.cond.dptk .L092 + ;; + .align 16 + +.L119: +#ifdef LN + shladd KK8 = K, ZBASE_SHIFT, r0 + ;; + add B = KK8, B +#endif + +#if defined(LT) || defined(RN) + mov B = BOFFSET +#endif + +#ifdef RN + adds KK = 1, KK +#endif + +#ifdef RT + adds KK = -1, KK +#endif + ;; + { .mmi + mov AOFFSET = A + nop __LINE__ + } + ;; + .align 16 + +.L999: + { .mii + nop __LINE__ + mov ar.lc = ARLC + mov pr = PR, -1 + } + { .mib + nop __LINE__ +#ifdef TRMMKERNEL + mov ar.pfs = ARPFS +#else + nop __LINE__ +#endif + br.ret.sptk.many b0 + } + EPILOGUE + diff --git a/kernel/ia64/ztrsm_kernel_LT.S b/kernel/ia64/ztrsm_kernel_LT.S new file mode 100644 index 0000000..6c7a8ca --- /dev/null +++ b/kernel/ia64/ztrsm_kernel_LT.S @@ -0,0 +1,10835 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef DOUBLE +#define PREFETCHSIZE (16 * 8) +#else +#define PREFETCHSIZE (32 * 8) +#endif + +#ifndef LN +#define CPREFETCHSIZE 7 +#else +#define CPREFETCHSIZE -8 +#endif +#define CPREFETCH lfetch.excl.nt1 + +#define M r32 +#define N r33 +#define K r34 +#define A r37 +#define B r38 +#define C r39 +#define LDC r35 + +#define I r15 +#define J r16 +#define AOFFSET r17 +#define BOFFSET r18 +#define TEMP r19 +#define L r20 + +#define C1 r21 +#define C2 r22 +#define C3 r23 +#define C4 r24 +#define C5 r25 +#define C6 r26 +#define C7 r27 +#define C8 r28 + +#define PREA r8 +#define PREB r9 +#define PREC r10 +#define SP r12 +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#define ALPHA_R f8 +#define ALPHA_I f9 + +#define AORIG loc0 +#define KK loc1 +#define KK8 loc2 +#define OFFSET loc3 +#define AOFFSET2 loc4 +#define BOFFSET2 loc5 + +#ifndef CONJ +#define FCALC_A FSUB +#define FCALC_B FADD +#define FMA_A FNMA +#define FMA_B FMA +#else +#define FCALC_A FADD +#define FCALC_B FSUB +#define FMA_A FMA +#define FMA_B FNMA +#endif + +#ifndef CONJ +#define FCALC_C FMA +#define FCALC_D FNMA +#else +#define FCALC_C FNMA +#define FCALC_D FMA +#endif + +#ifndef CONJ +#define FMA_C FNMA +#define FMA_D FMA +#define FSUB_A FSUB +#else +#define FMA_C FMA +#define FMA_D FMS +#define FSUB_A FADD +#endif + + + PROLOGUE + .prologue + PROFCODE + + { .mfi + .save ar.pfs, ARPFS + alloc ARPFS = ar.pfs, 8, 8, 0, 0 + mov f64 = f0 + adds r14 = 16, SP + } + { .mfi + nop __LINE__ + mov f65 = f0 + adds r15 = 24, SP + } + ;; + { .mfi + ld8 LDC = [r14] + mov f81 = f0 + mov PR = pr + } + { .mfi + ld8 OFFSET = [r15] + mov f96 = f0 + shr J = N, 2 + } + ;; + { .mfi + shladd LDC = LDC, ZBASE_SHIFT, r0 + mov f97 = f0 + } + { .mfi + nop __LINE__ + mov f113 = f0 + } + ;; +#ifdef LN + { .mmi + setf.sig f32 = M + setf.sig f33 = K + shladd C = M, ZBASE_SHIFT, C + } + ;; + {.mmf + nop __LINE__ + nop __LINE__ + xmpy.l f32 = f32, f33 + } + ;; + { .mmi + getf.sig r2 = f32 + ;; + nop __LINE__ + shladd A = r2, ZBASE_SHIFT, A + } + ;; +#endif + +#ifdef RN + sub KK = r0, OFFSET +#endif + +#ifdef RT + { .mmi + setf.sig f32 = N + setf.sig f33 = K + nop __LINE__ + } + ;; + { .mmi + setf.sig f34 = LDC + nop __LINE__ + nop __LINE__ + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + xmpy.l f33 = f32, f33 + } + { .mmf + nop __LINE__ + sub KK = N, OFFSET + xmpy.l f34 = f32, f34 + } + ;; + { .mmi + getf.sig r2 = f33 + getf.sig r3 = f34 + } + ;; + shladd B = r2, ZBASE_SHIFT, B + add C = r3, C +#endif + ;; + .body + { .mfi + nop __LINE__ + mov f80 = f0 + mov ARLC = ar.lc + } + { .mfb + cmp.ge p6, p0 = 0, J + mov f112 = f0 + (p6) br.cond.dpnt .L050 + } + ;; + .align 16 + +.L010: +#ifdef RT + { .mmi + shladd r3 = LDC, 2, r0 + nop __LINE__ + shl r2 = K, 2 + ZBASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, r3 + nop __LINE__ + } + ;; +#endif + { .mmi + mov C1 = C // coffset1 = c + 0 * ldc + add C2 = LDC, C // coffset2 = c + 1 * ldc + shr I = M, 2 + } + { .mmi + adds J = -1, J +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + } + ;; + ;; + { .mmi + shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc + shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + { .mib + cmp.eq p6, p7 = 0, I +#ifndef RT + shladd C = LDC, 2, C // coffset += 8 * ldc +#else + nop __LINE__ +#endif + (p6) br.cond.dpnt .L020 + } + ;; + .align 16 + +.L011: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + ZBASE_SHIFT + } + { .mfi + shladd r3 = KK, ZBASE_SHIFT, r0 + mov f118 = f0 + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + ;; + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 + nop __LINE__ + } + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mfi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f98 = f0 + adds L = 1, L + } + { .mfi + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f99 = f0 + adds C5 = 4 * SIZE, C1 + } + ;; + { .mfi + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + mov f114 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f115 = f0 + adds C6 = 4 * SIZE, C2 + } + ;; + { .mfi + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + mov f68 = f0 + shr L = L, 1 + } + { .mfi + setf.d f86 = r0 + mov f69 = f0 + adds C7 = 4 * SIZE, C3 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f84 = f0 + adds L = -1, L + } + { .mfi + setf.d f87 = r0 + mov f85 = f0 + adds C8 = 4 * SIZE, C4 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f100 = f0 + mov ar.lc = L + } + { .mfi + setf.d f102 = r0 + mov f101 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f116 = f0 + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + } + { .mfi + setf.d f103 = r0 + mov f117 = f0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + ;; + { .mfi + CPREFETCH [PREC] + mov f70 = f0 + cmp.eq p6, p0 = -1, L + } + { .mfb + setf.d f119 = r0 + mov f71 = f0 + (p6) br.cond.dpnt .L018 + } + ;; + .align 16 + +.L012: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p12) cmp.ne p3, p0 = 0, L + FMA_B f65 = f32, f49, f65 // A1 * B2 + nop __LINE__ + } + ;; +/* 2 */ + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + cmp.ne p4, p5 = 0, L + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; +/* 3 */ + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + FMA_B f97 = f32, f53, f97 // A1 * B6 + nop __LINE__ + } + ;; +/* 4 */ + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + FMA_B f113 = f32, f55, f113 // A1 * B8 + nop __LINE__ + } + ;; +/* 5 */ + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; +/* 6 */ + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; +/* 7 */ + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + FMA_A f96 = f33, f53, f96 // A2 * B6 + nop __LINE__ + } + ;; +/* 8 */ + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + FMA_A f112 = f33, f55, f112 // A2 * B8 + nop __LINE__ + } + ;; +/* 9 */ + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + FMA_B f67 = f34, f49, f67 // A3 * B2 + nop __LINE__ + } + ;; +/* 10 */ + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + FMA_B f83 = f34, f51, f83 // A3 * B4 + nop __LINE__ + } + ;; +/* 11 */ + { .mfb + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f99 = f34, f53, f99 // A3 * B6 + nop __LINE__ + } + ;; +/* 12 */ + { .mfb + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f115 = f34, f55, f115 // A3 * B8 + nop __LINE__ + } + ;; +/* 13 */ + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfb + nop __LINE__ + FMA_A f66 = f35, f49, f66 // A4 * B2 + nop __LINE__ + } + ;; +/* 14 */ + { .mfb + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f82 = f35, f51, f82 // A4 * B4 + nop __LINE__ + } + ;; +/* 15 */ + { .mfb + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f98 = f35, f53, f98 // A4 * B6 + nop __LINE__ + } + ;; +/* 16 */ + { .mfb + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f114 = f35, f55, f114 // A4 * B8 + nop __LINE__ + } + ;; +/* 17 */ + { .mfb + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f69 = f36, f49, f69 // A5 * B2 + nop __LINE__ + } + ;; +/* 18 */ + { .mfb + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f85 = f36, f51, f85 // A5 * B4 + nop __LINE__ + } + ;; +/* 19 */ + { .mfb + nop __LINE__ + FMA f100 = f36, f52, f100 // A5 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f101 = f36, f53, f101 // A5 * B6 + nop __LINE__ + } + ;; +/* 20 */ + { .mfb + nop __LINE__ + FMA f116 = f36, f54, f116 // A5 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f117 = f36, f55, f117 // A5 * B8 + nop __LINE__ + } + ;; +/* 21 */ + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f68 = f37, f49, f68 // A6 * B2 + nop __LINE__ + } + ;; +/* 22 */ + { .mfb + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f84 = f37, f51, f84 // A6 * B4 + nop __LINE__ + } + ;; +/* 23 */ + { .mfb + nop __LINE__ + FMA f101 = f37, f52, f101 // A6 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f100 = f37, f53, f100 // A6 * B6 + nop __LINE__ + } + ;; +/* 24 */ + { .mfb + nop __LINE__ + FMA f117 = f37, f54, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f116 = f37, f55, f116 // A6 * B8 + nop __LINE__ + } + ;; +/* 25 */ + { .mfb + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f71 = f38, f49, f71 // A7 * B2 + nop __LINE__ + } + ;; +/* 26 */ + { .mfb + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f87 = f38, f51, f87 // A7 * B4 + nop __LINE__ + } + ;; +/* 27 */ + { .mfb + nop __LINE__ + FMA f102 = f38, f52, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f103 = f38, f53, f103 // A7 * B6 + nop __LINE__ + } + ;; +/* 28 */ + { .mfb + nop __LINE__ + FMA f118 = f38, f54, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f119 = f38, f55, f119 // A7 * B8 + nop __LINE__ + } + ;; +/* 29 */ + { .mfb + nop __LINE__ + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f70 = f39, f49, f70 // A8 * B2 + nop __LINE__ + } + ;; +/* 30 */ + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f86 = f39, f51, f86 // A8 * B4 + nop __LINE__ + } + ;; +/* 31 */ + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f103 = f39, f52, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f102 = f39, f53, f102 // A8 * B6 + nop __LINE__ + } + ;; +/* 32 */ + { .mfb + nop __LINE__ + FMA f119 = f39, f54, f119 // A8 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f118 = f39, f55, f118 // A8 * B8 + nop __LINE__ + } + ;; +/* 33 */ + { .mfb + nop __LINE__ + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; +/* 34 */ + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; +/* 35 */ + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 + nop __LINE__ + } + ;; +/* 36 */ + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 + nop __LINE__ + } + ;; +/* 37 */ + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; +/* 38 */ + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; +/* 39 */ + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 + nop __LINE__ + } + ;; +/* 40 */ + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 + nop __LINE__ + } + ;; + /* 41 */ + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f67 = f42, f57, f67 // A3 * B2 + nop __LINE__ + } + ;; +/* 42 */ + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f83 = f42, f59, f83 // A3 * B4 + nop __LINE__ + } + ;; +/* 43 */ + { .mfb + nop __LINE__ + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f99 = f42, f61, f99 // A3 * B6 + nop __LINE__ + } + ;; +/* 44 */ + { .mfb + nop __LINE__ + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f115 = f42, f63, f115 // A3 * B8 + nop __LINE__ + } + ;; +/* 45 */ + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f66 = f43, f57, f66 // A4 * B2 + nop __LINE__ + } + ;; +/* 46 */ + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f82 = f43, f59, f82 // A4 * B4 + nop __LINE__ + } + ;; +/* 47 */ + { .mfb + nop __LINE__ + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f98 = f43, f61, f98 // A4 * B6 + nop __LINE__ + } + ;; +/* 48 */ + { .mfb + nop __LINE__ + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f114 = f43, f63, f114 // A4 * B8 + nop __LINE__ + } + ;; +/* 49 */ + { .mfb + nop __LINE__ + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f69 = f44, f57, f69 // A5 * B2 + nop __LINE__ + } + ;; +/* 50 */ + { .mfb + nop __LINE__ + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f85 = f44, f59, f85 // A5 * B4 + nop __LINE__ + } + ;; +/* 51 */ + { .mfb + nop __LINE__ + (p3) FMA f100 = f44, f60, f100 // A5 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f101 = f44, f61, f101 // A5 * B6 + nop __LINE__ + } + ;; +/* 52 */ + { .mfb + nop __LINE__ + (p3) FMA f116 = f44, f62, f116 // A5 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f117 = f44, f63, f117 // A5 * B8 + nop __LINE__ + } + ;; +/* 53 */ + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f68 = f45, f57, f68 // A6 * B2 + nop __LINE__ + } + ;; +/* 54 */ + { .mfb + nop __LINE__ + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f84 = f45, f59, f84 // A6 * B4 + nop __LINE__ + } + ;; +/* 55 */ + { .mfb + nop __LINE__ + (p3) FMA f101 = f45, f60, f101 // A6 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f100 = f45, f61, f100 // A6 * B6 + nop __LINE__ + } + ;; +/* 56 */ + { .mfb + nop __LINE__ + (p3) FMA f117 = f45, f62, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f116 = f45, f63, f116 // A6 * B8 + nop __LINE__ + } + ;; +/* 57 */ + { .mfb + nop __LINE__ + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f71 = f46, f57, f71 // A7 * B2 + nop __LINE__ + } + ;; +/* 58 */ + { .mfb + nop __LINE__ + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f87 = f46, f59, f87 // A7 * B4 + nop __LINE__ + } + ;; +/* 59 */ + { .mfb + nop __LINE__ + (p3) FMA f102 = f46, f60, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f103 = f46, f61, f103 // A7 * B6 + nop __LINE__ + } + ;; +/* 60 */ + { .mfb + nop __LINE__ + (p3) FMA f118 = f46, f62, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f119 = f46, f63, f119 // A7 * B8 + nop __LINE__ + } + ;; +/* 61 */ + { .mfb + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f70 = f47, f57, f70 // A8 * B2 + nop __LINE__ + } + ;; +/* 62 */ + { .mfb + nop __LINE__ + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f86 = f47, f59, f86 // A8 * B4 + nop __LINE__ + } + ;; +/* 63 */ + { .mfb + nop __LINE__ + (p3) FMA f103 = f47, f60, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f102 = f47, f61, f102 // A8 * B6 + nop __LINE__ + } + ;; +/* 64 */ + { .mfi + nop __LINE__ + (p3) FMA f119 = f47, f62, f119 // A8 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f118 = f47, f63, f118 // A8 * B8 + br.cloop.sptk.few .L012 + } + ;; + +.L018: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [BOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [BOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f94, f95 = [BOFFSET], 2 * SIZE + FSUB f64 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f65 = f73, f65 + nop __LINE__ + } + ;; + { .mfi + LDFPD f104, f105 = [BOFFSET], 2 * SIZE + FSUB f80 = f74, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f81 = f75, f81 + nop __LINE__ + } + ;; + { .mfi + LDFPD f106, f107 = [BOFFSET], 2 * SIZE + FSUB f96 = f76, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f97 = f77, f97 + nop __LINE__ + } + ;; + { .mfi + LDFPD f108, f109 = [BOFFSET], 2 * SIZE + FSUB f112 = f78, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f113 = f79, f113 + nop __LINE__ + } + ;; + { .mfi + LDFPD f110, f111 = [BOFFSET], 2 * SIZE + FSUB f66 = f88, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f67 = f89, f67 + nop __LINE__ + } + ;; + { .mfi + LDFPD f120, f121 = [BOFFSET], 2 * SIZE + FSUB f82 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f83 = f91, f83 + nop __LINE__ + } + ;; + { .mfi + LDFPD f122, f123 = [BOFFSET], 2 * SIZE + FSUB f98 = f92, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f99 = f93, f99 + nop __LINE__ + } + ;; + { .mfi + LDFPD f124, f125 = [BOFFSET], 2 * SIZE + FSUB f114 = f94, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f115 = f95, f115 + nop __LINE__ + } + ;; + { .mfi + LDFPD f126, f127 = [BOFFSET] + FSUB f68 = f104, f68 + adds BOFFSET = -30 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FSUB_A f69 = f105, f69 +#ifdef LN + adds AOFFSET = 30 * SIZE, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + LDFPD f72, f73 = [AOFFSET] + FSUB f84 = f106, f84 +#ifdef LN + adds AOFFSET = - 2 * SIZE, AOFFSET +#else + adds AOFFSET = 2 * SIZE, AOFFSET +#endif + } + { .mfi + nop __LINE__ + FSUB_A f85 = f107, f85 + nop __LINE__ + } + ;; + { .mfi + LDFPD f74, f75 = [AOFFSET] + FSUB f100 = f108, f100 +#ifdef LN + adds AOFFSET = - 2 * SIZE, AOFFSET +#else + adds AOFFSET = 2 * SIZE, AOFFSET +#endif + } + { .mfi + nop __LINE__ + FSUB_A f101 = f109, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f116 = f110, f116 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f117 = f111, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f70 = f120, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f71 = f121, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f86 = f122, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f87 = f123, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f102 = f124, f102 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f103 = f125, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f118 = f126, f118 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f119 = f127, f119 + nop __LINE__ + } + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + FSUB f64 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f65 = f73, f65 + nop __LINE__ + } + ;; + { .mfi + LDFPD f94, f95 = [AOFFSET], 2 * SIZE + FSUB f66 = f74, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f67 = f75, f67 + nop __LINE__ + } + ;; + { .mfi + LDFPD f104, f105 = [AOFFSET], 2 * SIZE + FSUB f68 = f76, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f69 = f77, f69 + nop __LINE__ + } + ;; + { .mfi + LDFPD f106, f107 = [AOFFSET], 2 * SIZE + FSUB f70 = f78, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f71 = f79, f71 + nop __LINE__ + } + ;; + { .mfi + LDFPD f108, f109 = [AOFFSET], 2 * SIZE + FSUB f80 = f88, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f81 = f89, f81 + nop __LINE__ + } + ;; + { .mfi + LDFPD f110, f111 = [AOFFSET], 2 * SIZE + FSUB f82 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f83 = f91, f83 + nop __LINE__ + } + ;; + { .mfi + LDFPD f120, f121 = [AOFFSET], 2 * SIZE + FSUB f84 = f92, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f85 = f93, f85 + nop __LINE__ + } + ;; + { .mfi + LDFPD f122, f123 = [AOFFSET], 2 * SIZE + FSUB f86 = f94, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f87 = f95, f87 + nop __LINE__ + } + ;; + { .mfi + LDFPD f124, f125 = [AOFFSET], 2 * SIZE + FSUB f96 = f104, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f97 = f105, f97 + nop __LINE__ + } + ;; + { .mfi + LDFPD f126, f127 = [AOFFSET] + FSUB f98 = f106, f98 + adds AOFFSET = -30 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FSUB f99 = f107, f99 +#ifdef RT + adds BOFFSET = 30 * SIZE, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + LDFPD f72, f73 = [BOFFSET] + FSUB f100 = f108, f100 +#ifdef RN + adds BOFFSET = 2 * SIZE, BOFFSET +#else + adds BOFFSET = - 2 * SIZE, BOFFSET +#endif + } + { .mfi + nop __LINE__ + FSUB f101 = f109, f101 + nop __LINE__ + } + ;; + { .mfi + LDFPD f74, f75 = [BOFFSET] + FSUB f102 = f110, f102 +#ifdef RN + adds BOFFSET = 2 * SIZE, BOFFSET +#else + adds BOFFSET = - 2 * SIZE, BOFFSET +#endif + } + { .mfi + nop __LINE__ + FSUB f103 = f111, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f112 = f120, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f113 = f121, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f114 = f122, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f115 = f123, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f116 = f124, f116 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f117 = f125, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f118 = f126, f118 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f119 = f127, f119 + nop __LINE__ + } + ;; +#endif + +#ifdef LN + { .mfi + LDFPD f76, f77 = [AOFFSET] + FMPY f32 = f72, f70 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f36 = f72, f102 + nop __LINE__ + } + ;; + { .mfi + LDFPD f78, f79 = [AOFFSET] + FMPY f33 = f73, f70 + adds AOFFSET = - 4 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f37 = f73, f102 + nop __LINE__ + } + ;; + { .mfi + LDFPD f88, f89 = [AOFFSET] + FMPY f34 = f72, f86 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f38 = f72, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f90, f91 = [AOFFSET] + FMPY f35 = f73, f86 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f39 = f73, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f92, f93 = [AOFFSET] + FMA_C f70 = f73, f71, f32 + adds AOFFSET = - 6 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_C f102 = f73, f103, f36 + adds C1 = -2 * SIZE, C1 + } + ;; + { .mfi + LDFPD f104, f105 = [AOFFSET] + FMA_D f71 = f72, f71, f33 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_D f103 = f72, f103, f37 + adds C2 = -2 * SIZE, C2 + } + ;; + { .mfi + LDFPD f106, f107 = [AOFFSET] + FMA_C f86 = f73, f87, f34 + adds AOFFSET = - 8 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_C f118 = f73, f119, f38 + adds C3 = -2 * SIZE, C3 + } + ;; + { .mfi + LDFPD f120, f121 = [AOFFSET] + FMA_D f87 = f72, f87, f35 + adds BOFFSET2 = 28 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_D f119 = f72, f119, f39 + adds BOFFSET = 24 * SIZE, BOFFSET + } + ;; + { .mfi + STFD [BOFFSET] = f70, SIZE + FNMA f68 = f74, f70, f68 + adds C4 = -2 * SIZE, C4 + } + { .mfi + STFD [BOFFSET2] = f102, SIZE + FNMA f100 = f74, f102, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f71, SIZE + FMA_A f69 = f75, f70, f69 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f103, SIZE + FMA_A f101 = f75, f102, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f86, SIZE + FNMA f84 = f74, f86, f84 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f118, SIZE + FNMA f116 = f74, f118, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f87, -11 * SIZE + FMA_A f85 = f75, f86, f85 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f119, -11 * SIZE + FMA_A f117 = f75, f118, f117 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f70, SIZE + FMA_B f68 = f75, f71, f68 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f102, SIZE + FMA_B f100 = f75, f103, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f71, -3 * SIZE + FNMA f69 = f74, f71, f69 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f103, -3 * SIZE + FNMA f101 = f74, f103, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f86, SIZE + FMA_B f84 = f75, f87, f84 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f118, SIZE + FMA_B f116 = f75, f119, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f87, -3 * SIZE + FNMA f85 = f74, f87, f85 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f119, -3 * SIZE + FNMA f117 = f74, f119, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f66 = f76, f70, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f98 = f76, f102, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f67 = f77, f70, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f99 = f77, f102, f99 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f82 = f76, f86, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f114 = f76, f118, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f83 = f77, f86, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f115 = f77, f118, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f66 = f77, f71, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f98 = f77, f103, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f67 = f76, f71, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f99 = f76, f103, f99 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f82 = f77, f87, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f114 = f77, f119, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f83 = f76, f87, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f115 = f76, f119, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f64 = f78, f70, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f96 = f78, f102, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f65 = f79, f70, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f97 = f79, f102, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f80 = f78, f86, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f112 = f78, f118, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f81 = f79, f86, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f113 = f79, f118, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f64 = f79, f71, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f96 = f79, f103, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f65 = f78, f71, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f97 = f78, f103, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f80 = f79, f87, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f112 = f79, f119, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f81 = f78, f87, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f113 = f78, f119, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f88, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f88, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f89, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f89, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f88, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f88, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f89, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f89, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f68 = f89, f69, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f100 = f89, f101, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f69 = f88, f69, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f101 = f88, f101, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f84 = f89, f85, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f116 = f89, f117, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f85 = f88, f85, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f117 = f88, f117, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f68, SIZE + FNMA f66 = f90, f68, f66 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f100, SIZE + FNMA f98 = f90, f100, f98 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f69, SIZE + FMA_A f67 = f91, f68, f67 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f101, SIZE + FMA_A f99 = f91, f100, f99 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f84, SIZE + FNMA f82 = f90, f84, f82 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f116, SIZE + FNMA f114 = f90, f116, f114 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f85, -11 * SIZE + FMA_A f83 = f91, f84, f83 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f117, -11 * SIZE + FMA_A f115 = f91, f116, f115 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f68, SIZE + FMA_B f66 = f91, f69, f66 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f100, SIZE + FMA_B f98 = f91, f101, f98 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f69, -3 * SIZE + FNMA f67 = f90, f69, f67 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f101, -3 * SIZE + FNMA f99 = f90, f101, f99 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f84, SIZE + FMA_B f82 = f91, f85, f82 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f116, SIZE + FMA_B f114 = f91, f117, f114 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f85, -3 * SIZE + FNMA f83 = f90, f85, f83 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f117, -3 * SIZE + FNMA f115 = f90, f117, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f64 = f92, f68, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f96 = f92, f100, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f65 = f93, f68, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f97 = f93, f100, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f80 = f92, f84, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f112 = f92, f116, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f81 = f93, f84, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f113 = f93, f116, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f64 = f93, f69, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f96 = f93, f101, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f65 = f92, f69, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f97 = f92, f101, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f80 = f93, f85, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f112 = f93, f117, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f81 = f92, f85, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f113 = f92, f117, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f104, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f104, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f105, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f105, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f104, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f104, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f105, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f105, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f66 = f105, f67, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f98 = f105, f99, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f67 = f104, f67, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f99 = f104, f99, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f82 = f105, f83, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f114 = f105, f115, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f83 = f104, f83, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f115 = f104, f115, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + FNMA f64 = f106, f66, f64 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + FNMA f96 = f106, f98, f96 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f67, SIZE + FMA_A f65 = f107, f66, f65 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f99, SIZE + FMA_A f97 = f107, f98, f97 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + FNMA f80 = f106, f82, f80 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + FNMA f112 = f106, f114, f112 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f83, -11 * SIZE + FMA_A f81 = f107, f82, f81 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f115, -11 * SIZE + FMA_A f113 = f107, f114, f113 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + FMA_B f64 = f107, f67, f64 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f98, SIZE + FMA_B f96 = f107, f99, f96 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, -3 * SIZE + FNMA f65 = f106, f67, f65 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f99, -3 * SIZE + FNMA f97 = f106, f99, f97 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f82, SIZE + FMA_B f80 = f107, f83, f80 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f114, SIZE + FMA_B f112 = f107, f115, f112 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f83, -3 * SIZE + FNMA f81 = f106, f83, f81 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f115, -3 * SIZE + FNMA f113 = f106, f115, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f120, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f120, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f121, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f121, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f120, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f120, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f121, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f121, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f64 = f121, f65, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f96 = f121, f97, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f65 = f120, f65, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f97 = f120, f97, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f80 = f121, f81, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f112 = f121, f113, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f81 = f120, f81, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f113 = f120, f113, f39 + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f81, -3 * SIZE + STFD [BOFFSET2] = f113, -3 * SIZE + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f96, SIZE + mov f96 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, -1 * SIZE + mov f65 = f0 + adds KK = -4, KK + } + { .mfi + STFD [C3 ] = f97, -1 * SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f80, SIZE + mov f80 = f0 + cmp.ne p6, p0 = 1, I + } + { .mfi + STFD [C4 ] = f112, SIZE + mov f112 = f0 + sub L = K, KK + } + ;; + { .mfi + STFD [C2 ] = f81, -1 * SIZE + mov f81 = f0 + adds I = -1, I + } + { .mfb + STFD [C4 ] = f113, -1 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#endif + +#ifdef LT + { .mfi + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + FMPY f32 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f72, f96 + nop __LINE__ + } + ;; + { .mfi + LDFPD f78, f79 = [AOFFSET] + FMPY f33 = f73, f64 + adds AOFFSET = 4 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f37 = f73, f96 + nop __LINE__ + } + ;; + { .mfi + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + FMPY f34 = f72, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f72, f112 + nop __LINE__ + } + ;; + { .mfi + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + FMPY f35 = f73, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f73, f112 + nop __LINE__ + } + ;; + { .mfi + LDFPD f94, f95 = [AOFFSET] + FMA_C f64 = f73, f65, f32 + adds AOFFSET = 6 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_C f96 = f73, f97, f36 + nop __LINE__ + } + ;; + { .mfi + LDFPD f108, f109 = [AOFFSET], 2 * SIZE + FMA_D f65 = f72, f65, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f97 = f72, f97, f37 + nop __LINE__ + } + ;; + { .mfi + LDFPD f110, f111 = [AOFFSET] + FMA_C f80 = f73, f81, f34 + adds AOFFSET = 8 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_C f112 = f73, f113, f38 + nop __LINE__ + } + ;; + { .mfi + LDFPD f126, f127 = [AOFFSET] + FMA_D f81 = f72, f81, f35 + adds AOFFSET = - 30 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_D f113 = f72, f113, f39 + adds BOFFSET2 = 4 * SIZE, BOFFSET + } + ;; + { .mfi + STFD [BOFFSET] = f64, SIZE + FNMA f66 = f74, f64, f66 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f96, SIZE + FNMA f98 = f74, f96, f98 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + FMA_A f67 = f75, f64, f67 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + FMA_A f99 = f75, f96, f99 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f80, SIZE + FNMA f82 = f74, f80, f82 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f112, SIZE + FNMA f114 = f74, f112, f114 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f81, 5 * SIZE + FMA_A f83 = f75, f80, f83 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f113, 5 * SIZE + FMA_A f115 = f75, f112, f115 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + FMA_B f66 = f75, f65, f66 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f96, SIZE + FMA_B f98 = f75, f97, f98 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + FNMA f67 = f74, f65, f67 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f97, SIZE + FNMA f99 = f74, f97, f99 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f80, SIZE + FMA_B f82 = f75, f81, f82 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f112, SIZE + FMA_B f114 = f75, f113, f114 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f81, SIZE + FNMA f83 = f74, f81, f83 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f113, SIZE + FNMA f115 = f74, f113, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f68 = f76, f64, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f100 = f76, f96, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f69 = f77, f64, f69 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f101 = f77, f96, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f84 = f76, f80, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f116 = f76, f112, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f85 = f77, f80, f85 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f117 = f77, f112, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f68 = f77, f65, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f100 = f77, f97, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f69 = f76, f65, f69 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f101 = f76, f97, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f84 = f77, f81, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f116 = f77, f113, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f85 = f76, f81, f85 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f117 = f76, f113, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f70 = f78, f64, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f102 = f78, f96, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f71 = f79, f64, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f103 = f79, f96, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f86 = f78, f80, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f118 = f78, f112, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f87 = f79, f80, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f119 = f79, f112, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f70 = f79, f65, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f102 = f79, f97, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f71 = f78, f65, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f103 = f78, f97, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f86 = f79, f81, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f118 = f79, f113, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f87 = f78, f81, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f119 = f78, f113, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f90, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f90, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f91, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f91, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f90, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f91, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f91, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f66 = f91, f67, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f98 = f91, f99, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f67 = f90, f67, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f99 = f90, f99, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f82 = f91, f83, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f114 = f91, f115, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f83 = f90, f83, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f115 = f90, f115, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + FNMA f68 = f92, f66, f68 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + FNMA f100 = f92, f98, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f67, SIZE + FMA_A f69 = f93, f66, f69 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f99, SIZE + FMA_A f101 = f93, f98, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + FNMA f84 = f92, f82, f84 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + FNMA f116 = f92, f114, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f83, 5 * SIZE + FMA_A f85 = f93, f82, f85 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f115, 5 * SIZE + FMA_A f117 = f93, f114, f117 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + FMA_B f68 = f93, f67, f68 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f98, SIZE + FMA_B f100 = f93, f99, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, SIZE + FNMA f69 = f92, f67, f69 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f99, SIZE + FNMA f101 = f92, f99, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f82, SIZE + FMA_B f84 = f93, f83, f84 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f114, SIZE + FMA_B f116 = f93, f115, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f83, SIZE + FNMA f85 = f92, f83, f85 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f115, SIZE + FNMA f117 = f92, f115, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f70 = f94, f66, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f102 = f94, f98, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f71 = f95, f66, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f103 = f95, f98, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f86 = f94, f82, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f118 = f94, f114, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f87 = f95, f82, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f119 = f95, f114, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f70 = f95, f67, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f102 = f95, f99, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f71 = f94, f67, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f103 = f94, f99, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f86 = f95, f83, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f118 = f95, f115, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f87 = f94, f83, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f119 = f94, f115, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f108, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f108, f100 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f33 = f109, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f109, f100 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f34 = f108, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f108, f116 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f35 = f109, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f109, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f68 = f109, f69, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f100 = f109, f101, f36 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f69 = f108, f69, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f101 = f108, f101, f37 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f84 = f109, f85, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f116 = f109, f117, f38 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f85 = f108, f85, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f117 = f108, f117, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f68, SIZE + FNMA f70 = f110, f68, f70 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f100, SIZE + FNMA f102 = f110, f100, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f69, SIZE + FMA_A f71 = f111, f68, f71 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f101, SIZE + FMA_A f103 = f111, f100, f103 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f84, SIZE + FNMA f86 = f110, f84, f86 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f116, SIZE + FNMA f118 = f110, f116, f118 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f85, 5 * SIZE + FMA_A f87 = f111, f84, f87 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f117, 5 * SIZE + FMA_A f119 = f111, f116, f119 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f68, SIZE + FMA_B f70 = f111, f69, f70 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f100, SIZE + FMA_B f102 = f111, f101, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f69, SIZE + FNMA f71 = f110, f69, f71 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f101, SIZE + FNMA f103 = f110, f101, f103 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f84, SIZE + FMA_B f86 = f111, f85, f86 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f116, SIZE + FMA_B f118 = f111, f117, f118 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f85, SIZE + FNMA f87 = f110, f85, f87 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f117, SIZE + FNMA f119 = f110, f117, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f126, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f126, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f127, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f127, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f126, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f126, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f127, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f127, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f70 = f127, f71, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f102 = f127, f103, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f71 = f126, f71, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f103 = f126, f103, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f86 = f127, f87, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f118 = f127, f119, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f87 = f126, f87, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f119 = f126, f119, f39 + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f70, SIZE + STFD [BOFFSET2] = f102, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f71, SIZE + STFD [BOFFSET2] = f103, SIZE + sub r2 = K, KK + } + ;; + { .mmi + STFD [BOFFSET] = f86, SIZE + STFD [BOFFSET2] = f118, SIZE + adds KK = 4, KK + } + ;; + { .mmi + STFD [BOFFSET] = f87, -27 * SIZE + STFD [BOFFSET2] = f119 + shladd r2 = r2, ZBASE_SHIFT, r0 + } + ;; + { .mfi + STFD [C1 ] = f70, SIZE + mov f64 = f0 + shladd AOFFSET = r2, 2, AOFFSET + } + { .mfi + STFD [C3 ] = f102, SIZE + mov f65 = f0 + shladd BOFFSET = r2, 2, BOFFSET + } + ;; + { .mfi + STFD [C1 ] = f71, SIZE + mov f80 = f0 + mov L = KK + } + { .mfi + STFD [C3 ] = f103, SIZE + mov f81 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f86, SIZE + mov f96 = f0 + cmp.ne p6, p0 = 1, I + } + { .mfi + STFD [C4 ] = f118, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f87, SIZE + mov f112 = f0 + adds I = -1, I + } + { .mfb + STFD [C4 ] = f119, SIZE + mov f113 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#endif + +#ifdef RN + { .mfi + LDFPD f76, f77 = [BOFFSET], 2 * SIZE + FMPY f32 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f72, f68 + nop __LINE__ + } + ;; + { .mfi + LDFPD f78, f79 = [BOFFSET] + FMPY f33 = f73, f64 + adds BOFFSET = 4 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f37 = f73, f68 + nop __LINE__ + } + ;; + { .mfi + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + FMPY f34 = f72, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f72, f70 + nop __LINE__ + } + ;; + { .mfi + LDFPD f92, f93 = [BOFFSET], 2 * SIZE + FMPY f35 = f73, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f73, f70 + nop __LINE__ + } + ;; + { .mfi + LDFPD f94, f95 = [BOFFSET] + FMA_C f64 = f73, f65, f32 + adds BOFFSET = 6 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_C f68 = f73, f69, f36 + nop __LINE__ + } + ;; + { .mfi + LDFPD f108, f109 = [BOFFSET], 2 * SIZE + FMA_D f65 = f72, f65, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f69 = f72, f69, f37 + nop __LINE__ + } + ;; + { .mfi + LDFPD f110, f111 = [BOFFSET] + FMA_C f66 = f73, f67, f34 + adds BOFFSET = 8 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_C f70 = f73, f71, f38 + nop __LINE__ + } + ;; + { .mfi + LDFPD f126, f127 = [BOFFSET] + FMA_D f67 = f72, f67, f35 + adds BOFFSET = - 30 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_D f71 = f72, f71, f39 + adds AOFFSET2 = 4 * SIZE, AOFFSET + } + ;; + { .mfi + STFD [AOFFSET] = f64, SIZE + FNMA f80 = f74, f64, f80 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f68, SIZE + FNMA f84 = f74, f68, f84 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f65, SIZE + FMA_A f81 = f75, f64, f81 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f69, SIZE + FMA_A f85 = f75, f68, f85 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f66, SIZE + FNMA f82 = f74, f66, f82 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f70, SIZE + FNMA f86 = f74, f70, f86 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f67, 5 * SIZE + FMA_A f83 = f75, f66, f83 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f71, 5 * SIZE + FMA_A f87 = f75, f70, f87 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + FMA_B f80 = f75, f65, f80 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f68, SIZE + FMA_B f84 = f75, f69, f84 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + FNMA f81 = f74, f65, f81 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f69, SIZE + FNMA f85 = f74, f69, f85 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + FMA_B f82 = f75, f67, f82 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f70, SIZE + FMA_B f86 = f75, f71, f86 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, 5 * SIZE + FNMA f83 = f74, f67, f83 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f71, 5 * SIZE + FNMA f87 = f74, f71, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f96 = f76, f64, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f100 = f76, f68, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f97 = f77, f64, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f101 = f77, f68, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f98 = f76, f66, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f102 = f76, f70, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f99 = f77, f66, f99 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f103 = f77, f70, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f96 = f77, f65, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f100 = f77, f69, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f97 = f76, f65, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f101 = f76, f69, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f98 = f77, f67, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f102 = f77, f71, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f99 = f76, f67, f99 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f103 = f76, f71, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f112 = f78, f64, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f116 = f78, f68, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f113 = f79, f64, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f117 = f79, f68, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f114 = f78, f66, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f118 = f78, f70, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f115 = f79, f66, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f119 = f79, f70, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f112 = f79, f65, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f116 = f79, f69, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f113 = f78, f65, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f117 = f78, f69, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f114 = f79, f67, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f118 = f79, f71, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f115 = f78, f67, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f119 = f78, f71, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f90, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f90, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f33 = f91, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f91, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f34 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f90, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f35 = f91, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f91, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f80 = f91, f81, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f84 = f91, f85, f36 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f81 = f90, f81, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f85 = f90, f85, f37 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f82 = f91, f83, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f86 = f91, f87, f38 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f83 = f90, f83, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f87 = f90, f87, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f80, SIZE + FNMA f96 = f92, f80, f96 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f84, SIZE + FNMA f100 = f92, f84, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f81, SIZE + FMA_A f97 = f93, f80, f97 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f85, SIZE + FMA_A f101 = f93, f84, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f82, SIZE + FNMA f98 = f92, f82, f98 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f86, SIZE + FNMA f102 = f92, f86, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f83, 5 * SIZE + FMA_A f99 = f93, f82, f99 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f87, 5 * SIZE + FMA_A f103 = f93, f86, f103 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f80, SIZE + FMA_B f96 = f93, f81, f96 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f84, SIZE + FMA_B f100 = f93, f85, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f81, SIZE + FNMA f97 = f92, f81, f97 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f85, SIZE + FNMA f101 = f92, f85, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f82, SIZE + FMA_B f98 = f93, f83, f98 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f86, SIZE + FMA_B f102 = f93, f87, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f83, 5 * SIZE + FNMA f99 = f92, f83, f99 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f87, 5 * SIZE + FNMA f103 = f92, f87, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f112 = f94, f80, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f116 = f94, f84, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f113 = f95, f80, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f117 = f95, f84, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f114 = f94, f82, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f118 = f94, f86, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f115 = f95, f82, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f119 = f95, f86, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f112 = f95, f81, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f116 = f95, f85, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f113 = f94, f81, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f117 = f94, f85, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f114 = f95, f83, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f118 = f95, f87, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f115 = f94, f83, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f119 = f94, f87, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f108, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f108, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f109, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f109, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f108, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f108, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f109, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f109, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f96 = f109, f97, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f100 = f109, f101, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f97 = f108, f97, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f101 = f108, f101, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f98 = f109, f99, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f102 = f109, f103, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f99 = f108, f99, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f103 = f108, f103, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f96, SIZE + FNMA f112 = f110, f96, f112 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f100, SIZE + FNMA f116 = f110, f100, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f97, SIZE + FMA_A f113 = f111, f96, f113 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f101, SIZE + FMA_A f117 = f111, f100, f117 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f98, SIZE + FNMA f114 = f110, f98, f114 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f102, SIZE + FNMA f118 = f110, f102, f118 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f99, 5 * SIZE + FMA_A f115 = f111, f98, f115 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f103, 5 * SIZE + FMA_A f119 = f111, f102, f119 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f96, SIZE + FMA_B f112 = f111, f97, f112 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f100, SIZE + FMA_B f116 = f111, f101, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f97, SIZE + FNMA f113 = f110, f97, f113 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f101, SIZE + FNMA f117 = f110, f101, f117 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f98, SIZE + FMA_B f114 = f111, f99, f114 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f102, SIZE + FMA_B f118 = f111, f103, f118 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f99, 5 * SIZE + FNMA f115 = f110, f99, f115 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f103, 5 * SIZE + FNMA f119 = f110, f103, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f126, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f126, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f127, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f127, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f126, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f126, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f127, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f127, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f112 = f127, f113, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f116 = f127, f117, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f113 = f126, f113, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f117 = f126, f117, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f114 = f127, f115, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f118 = f127, f119, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f115 = f126, f115, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f119 = f126, f119, f39 + nop __LINE__ + } + ;; + { .mmi + STFD [AOFFSET] = f112, SIZE + STFD [AOFFSET2] = f116, SIZE + sub r2 = K, KK + } + ;; + { .mmi + STFD [AOFFSET] = f113, SIZE + STFD [AOFFSET2] = f117, SIZE + mov L = KK + } + ;; + { .mmi + STFD [AOFFSET] = f114, SIZE + STFD [AOFFSET2] = f118, SIZE + shladd r2 = r2, ZBASE_SHIFT, r0 + } + ;; + { .mmi + STFD [AOFFSET] = f115, -27 * SIZE + STFD [AOFFSET2] = f119 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f112, SIZE + mov f64 = f0 + shladd BOFFSET = r2, 2, BOFFSET + } + { .mfi + STFD [C8 ] = f116, SIZE + mov f65 = f0 + shladd AOFFSET = r2, 2, AOFFSET + } + ;; + { .mfi + STFD [C4 ] = f113, SIZE + mov f80 = f0 + cmp.ne p6, p0 = 1, I + } + { .mfi + STFD [C8 ] = f117, SIZE + mov f81 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f114, SIZE + mov f96 = f0 + adds I = -1, I + } + { .mfi + STFD [C8 ] = f118, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f115, 5 * SIZE + mov f112 = f0 + nop __LINE__ + } + { .mfb + STFD [C8 ] = f119, 5 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L011 + } +#endif + +#ifdef RT + { .mfi + LDFPD f76, f77 = [BOFFSET] + FMPY f32 = f72, f112 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f36 = f72, f116 + nop __LINE__ + } + ;; + { .mfi + LDFPD f78, f79 = [BOFFSET] + FMPY f33 = f73, f112 + adds BOFFSET = - 4 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f37 = f73, f116 + nop __LINE__ + } + ;; + { .mfi + LDFPD f88, f89 = [BOFFSET] + FMPY f34 = f72, f114 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f38 = f72, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f90, f91 = [BOFFSET] + FMPY f35 = f73, f114 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f39 = f73, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f92, f93 = [BOFFSET] + FMA_C f112 = f73, f113, f32 + adds BOFFSET = - 6 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_C f116 = f73, f117, f36 + nop __LINE__ + } + ;; + { .mfi + LDFPD f104, f105 = [BOFFSET] + FMA_D f113 = f72, f113, f33 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_D f117 = f72, f117, f37 + nop __LINE__ + } + ;; + { .mfi + LDFPD f106, f107 = [BOFFSET] + FMA_C f114 = f73, f115, f34 + adds BOFFSET = - 8 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_C f118 = f73, f119, f38 + nop __LINE__ + } + ;; + { .mfi + LDFPD f120, f121 = [BOFFSET] + FMA_D f115 = f72, f115, f35 + adds AOFFSET2 = 28 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_D f119 = f72, f119, f39 + adds AOFFSET = 24 * SIZE, AOFFSET + } + ;; + { .mfi + STFD [AOFFSET] = f112, SIZE + FNMA f96 = f74, f112, f96 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f116, SIZE + FNMA f100 = f74, f116, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f113, SIZE + FMA_A f97 = f75, f112, f97 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f117, SIZE + FMA_A f101 = f75, f116, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f114, SIZE + FNMA f98 = f74, f114, f98 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f118, SIZE + FNMA f102 = f74, f118, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f115, -11 * SIZE + FMA_A f99 = f75, f114, f99 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f119, -11 * SIZE + FMA_A f103 = f75, f118, f103 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f112, SIZE + FMA_B f96 = f75, f113, f96 + nop __LINE__ + } + { .mfi + STFD [C8 ] = f116, SIZE + FMA_B f100 = f75, f117, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f113, SIZE + FNMA f97 = f74, f113, f97 + nop __LINE__ + } + { .mfi + STFD [C8 ] = f117, SIZE + FNMA f101 = f74, f117, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f114, SIZE + FMA_B f98 = f75, f115, f98 + nop __LINE__ + } + { .mfi + STFD [C8 ] = f118, SIZE + FMA_B f102 = f75, f119, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f115, 5 * SIZE + FNMA f99 = f74, f115, f99 + nop __LINE__ + } + { .mfi + STFD [C8 ] = f119, 5 * SIZE + FNMA f103 = f74, f119, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f80 = f76, f112, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f84 = f76, f116, f84 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f81 = f77, f112, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f85 = f77, f116, f85 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f82 = f76, f114, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f86 = f76, f118, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f83 = f77, f114, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f87 = f77, f118, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f80 = f77, f113, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f84 = f77, f117, f84 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f81 = f76, f113, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f85 = f76, f117, f85 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f82 = f77, f115, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f86 = f77, f119, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f83 = f76, f115, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f87 = f76, f119, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f64 = f78, f112, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f68 = f78, f116, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f65 = f79, f112, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f69 = f79, f116, f69 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f66 = f78, f114, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f70 = f78, f118, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f67 = f79, f114, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f71 = f79, f118, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f64 = f79, f113, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f68 = f79, f117, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f65 = f78, f113, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f69 = f78, f117, f69 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f66 = f79, f115, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f70 = f79, f119, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f67 = f78, f115, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f71 = f78, f119, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f88, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f88, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f89, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f89, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f88, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f88, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f89, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f89, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f96 = f89, f97, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f100 = f89, f101, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f97 = f88, f97, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f101 = f88, f101, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f98 = f89, f99, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f102 = f89, f103, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f99 = f88, f99, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f103 = f88, f103, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f96, SIZE + FNMA f80 = f90, f96, f80 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f100, SIZE + FNMA f84 = f90, f100, f84 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f97, SIZE + FMA_A f81 = f91, f96, f81 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f101, SIZE + FMA_A f85 = f91, f100, f85 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f98, SIZE + FNMA f82 = f90, f98, f82 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f102, SIZE + FNMA f86 = f90, f102, f86 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f99, -11 * SIZE + FMA_A f83 = f91, f98, f83 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f103, -11 * SIZE + FMA_A f87 = f91, f102, f87 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f96, SIZE + FMA_B f80 = f91, f97, f80 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f100, SIZE + FMA_B f84 = f91, f101, f84 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f97, SIZE + FNMA f81 = f90, f97, f81 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f101, SIZE + FNMA f85 = f90, f101, f85 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f98, SIZE + FMA_B f82 = f91, f99, f82 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f102, SIZE + FMA_B f86 = f91, f103, f86 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f99, 5 * SIZE + FNMA f83 = f90, f99, f83 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f103, 5 * SIZE + FNMA f87 = f90, f103, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f64 = f92, f96, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f68 = f92, f100, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f65 = f93, f96, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f69 = f93, f100, f69 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f66 = f92, f98, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f70 = f92, f102, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f67 = f93, f98, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f71 = f93, f102, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f64 = f93, f97, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f68 = f93, f101, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f65 = f92, f97, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f69 = f92, f101, f69 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f66 = f93, f99, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f70 = f93, f103, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f67 = f92, f99, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f71 = f92, f103, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f104, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f104, f84 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f105, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f105, f84 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f104, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f104, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f105, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f105, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f80 = f105, f81, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f84 = f105, f85, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f81 = f104, f81, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f85 = f104, f85, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f82 = f105, f83, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f86 = f105, f87, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f83 = f104, f83, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f87 = f104, f87, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f80, SIZE + FNMA f64 = f106, f80, f64 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f84, SIZE + FNMA f68 = f106, f84, f68 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f81, SIZE + FMA_A f65 = f107, f80, f65 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f85, SIZE + FMA_A f69 = f107, f84, f69 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f82, SIZE + FNMA f66 = f106, f82, f66 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f86, SIZE + FNMA f70 = f106, f86, f70 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f83, -11 * SIZE + FMA_A f67 = f107, f82, f67 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f87, -11 * SIZE + FMA_A f71 = f107, f86, f71 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f80, SIZE + FMA_B f64 = f107, f81, f64 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f84, SIZE + FMA_B f68 = f107, f85, f68 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f81, SIZE + FNMA f65 = f106, f81, f65 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f85, SIZE + FNMA f69 = f106, f85, f69 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f82, SIZE + FMA_B f66 = f107, f83, f66 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f86, SIZE + FMA_B f70 = f107, f87, f70 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f83, 5 * SIZE + FNMA f67 = f106, f83, f67 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f87, 5 * SIZE + FNMA f71 = f106, f87, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f120, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f120, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f121, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f121, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f120, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f120, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f121, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f121, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f64 = f121, f65, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f68 = f121, f69, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f65 = f120, f65, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f69 = f120, f69, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f66 = f121, f67, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f70 = f121, f71, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f67 = f120, f67, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f71 = f120, f71, f39 + nop __LINE__ + } + ;; + { .mmi + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + shladd r2 = K, ZBASE_SHIFT, r0 + } + ;; + { .mmi + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + shladd AORIG = r2, 2, AORIG + } + ;; + { .mmi + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [AOFFSET] = f67, -3 * SIZE + STFD [AOFFSET2] = f71 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 + cmp.ne p6, p0 = 1, I + } + { .mfi + STFD [C5 ] = f68, SIZE + mov f81 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + mov f65 = f0 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f69, SIZE + mov f96 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + mov f80 = f0 + sub L = K, KK + } + { .mfi + STFD [C5 ] = f70, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, 5 * SIZE + mov f112 = f0 + adds I = -1, I + } + { .mfb + STFD [C5 ] = f71, 5 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#endif + +.L020: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L030 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 + shr L = L, 1 + } + ;; + { .mfi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f98 = f0 + adds L = -1, L + } + { .mfi + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f99 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f114 = f0 + mov ar.lc = L + } + { .mfi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov f115 = f0 + nop __LINE__ + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L028 + ;; + .align 16 + +.L022: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f97 = f32, f53, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f32, f55, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f33, f53, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f33, f55, f112 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f67 = f34, f49, f67 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f83 = f34, f51, f83 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f99 = f34, f53, f99 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f115 = f34, f55, f115 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f66 = f35, f49, f66 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f82 = f35, f51, f82 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f98 = f35, f53, f98 // A4 * B6 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f114 = f35, f55, f114 // A4 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f67 = f42, f57, f67 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f83 = f42, f59, f83 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f99 = f42, f61, f99 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f115 = f42, f63, f115 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f66 = f43, f57, f66 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f82 = f43, f59, f82 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f98 = f43, f61, f98 // A4 * B6 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f114 = f43, f63, f114 // A4 * B8 + br.cloop.sptk.few .L022 + } + ;; +.L028: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [BOFFSET], 2 * SIZE + ;; + LDFPD f106, f107 = [BOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f120, f121 = [BOFFSET], 2 * SIZE + FSUB f64 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f65 = f73, f65 + nop __LINE__ + } + ;; + { .mfi + LDFPD f122, f123 = [BOFFSET] + FSUB f80 = f74, f80 + adds BOFFSET = -14 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FSUB_A f81 = f75, f81 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f96 = f88, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f97 = f89, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f112 = f90, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f113 = f91, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f66 = f104, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f67 = f105, f67 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f82 = f106, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f83 = f107, f83 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f98 = f120, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f99 = f121, f99 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f114 = f122, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f115 = f123, f115 + nop __LINE__ + } + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [AOFFSET], 2 * SIZE + ;; + LDFPD f106, f107 = [AOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f120, f121 = [AOFFSET], 2 * SIZE + FSUB f64 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f65 = f73, f65 + nop __LINE__ + } + ;; + { .mfi + LDFPD f122, f123 = [AOFFSET] + FSUB f66 = f74, f66 + adds AOFFSET = -14 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FSUB f67 = f75, f67 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f80 = f88, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f81 = f89, f81 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f82 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f83 = f91, f83 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f96 = f104, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f97 = f105, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f98 = f106, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f99 = f107, f99 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f112 = f120, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f113 = f121, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f114 = f122, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f115 = f123, f115 + nop __LINE__ + } + ;; +#endif + +#ifdef LN + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f104, f66 + FMPY f33 = f105, f66 + FMPY f34 = f104, f82 + FMPY f35 = f105, f82 + FMPY f36 = f104, f98 + FMPY f37 = f105, f98 + FMPY f38 = f104, f114 + FMPY f39 = f105, f114 + ;; + FMA_C f66 = f105, f67, f32 + FMA_D f67 = f104, f67, f33 + FMA_C f82 = f105, f83, f34 + FMA_D f83 = f104, f83, f35 + FMA_C f98 = f105, f99, f36 + FMA_D f99 = f104, f99, f37 + FMA_C f114 = f105, f115, f38 + FMA_D f115 = f104, f115, f39 + ;; + FNMA f64 = f106, f66, f64 + FMA_A f65 = f107, f66, f65 + FNMA f80 = f106, f82, f80 + FMA_A f81 = f107, f82, f81 + FNMA f96 = f106, f98, f96 + FMA_A f97 = f107, f98, f97 + FNMA f112 = f106, f114, f112 + FMA_A f113 = f107, f114, f113 + ;; + FMA_B f64 = f107, f67, f64 + FNMA f65 = f106, f67, f65 + FMA_B f80 = f107, f83, f80 + FNMA f81 = f106, f83, f81 + FMA_B f96 = f107, f99, f96 + FNMA f97 = f106, f99, f97 + FMA_B f112 = f107, f115, f112 + FNMA f113 = f106, f115, f113 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + FMPY f36 = f120, f96 + FMPY f37 = f121, f96 + FMPY f38 = f120, f112 + FMPY f39 = f121, f112 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + FMA_C f96 = f121, f97, f36 + FMA_D f97 = f120, f97, f37 + FMA_C f112 = f121, f113, f38 + FMA_D f113 = f120, f113, f39 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + FMPY f38 = f72, f112 + FMPY f39 = f73, f112 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + FMA_C f112 = f73, f113, f38 + FMA_D f113 = f72, f113, f39 + ;; + FNMA f66 = f74, f64, f66 + FMA_A f67 = f75, f64, f67 + FNMA f82 = f74, f80, f82 + FMA_A f83 = f75, f80, f83 + FNMA f98 = f74, f96, f98 + FMA_A f99 = f75, f96, f99 + FNMA f114 = f74, f112, f114 + FMA_A f115 = f75, f112, f115 + ;; + FMA_B f66 = f75, f65, f66 + FNMA f67 = f74, f65, f67 + FMA_B f82 = f75, f81, f82 + FNMA f83 = f74, f81, f83 + FMA_B f98 = f75, f97, f98 + FNMA f99 = f74, f97, f99 + FMA_B f114 = f75, f113, f114 + FNMA f115 = f74, f113, f115 + ;; + FMPY f32 = f90, f66 + FMPY f33 = f91, f66 + FMPY f34 = f90, f82 + FMPY f35 = f91, f82 + FMPY f36 = f90, f98 + FMPY f37 = f91, f98 + FMPY f38 = f90, f114 + FMPY f39 = f91, f114 + ;; + FMA_C f66 = f91, f67, f32 + FMA_D f67 = f90, f67, f33 + FMA_C f82 = f91, f83, f34 + FMA_D f83 = f90, f83, f35 + FMA_C f98 = f91, f99, f36 + FMA_D f99 = f90, f99, f37 + FMA_C f114 = f91, f115, f38 + FMA_D f115 = f90, f115, f39 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [BOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [BOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f108, f109 = [BOFFSET], 2 * SIZE + ;; + LDFPD f110, f111 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f126, f127 = [BOFFSET] + adds BOFFSET = - 30 * SIZE, BOFFSET + ;; + + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f66 + FMPY f35 = f73, f66 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f66 = f73, f67, f34 + FMA_D f67 = f72, f67, f35 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + FNMA f82 = f74, f66, f82 + FMA_A f83 = f75, f66, f83 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + FMA_B f82 = f75, f67, f82 + FNMA f83 = f74, f67, f83 + ;; + FNMA f96 = f76, f64, f96 + FMA_A f97 = f77, f64, f97 + FNMA f98 = f76, f66, f98 + FMA_A f99 = f77, f66, f99 + ;; + FMA_B f96 = f77, f65, f96 + FNMA f97 = f76, f65, f97 + FMA_B f98 = f77, f67, f98 + FNMA f99 = f76, f67, f99 + ;; + FNMA f112 = f78, f64, f112 + FMA_A f113 = f79, f64, f113 + FNMA f114 = f78, f66, f114 + FMA_A f115 = f79, f66, f115 + ;; + FMA_B f112 = f79, f65, f112 + FNMA f113 = f78, f65, f113 + FMA_B f114 = f79, f67, f114 + FNMA f115 = f78, f67, f115 + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + FMPY f34 = f90, f82 + FMPY f35 = f91, f82 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + FMA_C f82 = f91, f83, f34 + FMA_D f83 = f90, f83, f35 + ;; + + FNMA f96 = f92, f80, f96 + FMA_A f97 = f93, f80, f97 + FNMA f98 = f92, f82, f98 + FMA_A f99 = f93, f82, f99 + ;; + FMA_B f96 = f93, f81, f96 + FNMA f97 = f92, f81, f97 + FMA_B f98 = f93, f83, f98 + FNMA f99 = f92, f83, f99 + ;; + FNMA f112 = f94, f80, f112 + FMA_A f113 = f95, f80, f113 + FNMA f114 = f94, f82, f114 + FMA_A f115 = f95, f82, f115 + ;; + FMA_B f112 = f95, f81, f112 + FNMA f113 = f94, f81, f113 + FMA_B f114 = f95, f83, f114 + FNMA f115 = f94, f83, f115 + ;; + FMPY f32 = f108, f96 + FMPY f33 = f109, f96 + FMPY f34 = f108, f98 + FMPY f35 = f109, f98 + ;; + FMA_C f96 = f109, f97, f32 + FMA_D f97 = f108, f97, f33 + FMA_C f98 = f109, f99, f34 + FMA_D f99 = f108, f99, f35 + ;; + FNMA f112 = f110, f96, f112 + FMA_A f113 = f111, f96, f113 + FNMA f114 = f110, f98, f114 + FMA_A f115 = f111, f98, f115 + ;; + FMA_B f112 = f111, f97, f112 + FNMA f113 = f110, f97, f113 + FMA_B f114 = f111, f99, f114 + FNMA f115 = f110, f99, f115 + ;; + FMPY f32 = f126, f112 + FMPY f33 = f127, f112 + FMPY f34 = f126, f114 + FMPY f35 = f127, f114 + ;; + FMA_C f112 = f127, f113, f32 + FMA_D f113 = f126, f113, f33 + FMA_C f114 = f127, f115, f34 + FMA_D f115 = f126, f115, f35 + ;; +#endif + +#ifdef RT + adds BOFFSET = 30 * SIZE, BOFFSET + ;; + LDFPD f72, f73 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f76, f77 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f78, f79 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f88, f89 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f92, f93 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f72, f112 + FMPY f33 = f73, f112 + FMPY f34 = f72, f114 + FMPY f35 = f73, f114 + ;; + FMA_C f112 = f73, f113, f32 + FMA_D f113 = f72, f113, f33 + FMA_C f114 = f73, f115, f34 + FMA_D f115 = f72, f115, f35 + ;; + FNMA f96 = f74, f112, f96 + FMA_A f97 = f75, f112, f97 + FNMA f98 = f74, f114, f98 + FMA_A f99 = f75, f114, f99 + ;; + FMA_B f96 = f75, f113, f96 + FNMA f97 = f74, f113, f97 + FMA_B f98 = f75, f115, f98 + FNMA f99 = f74, f115, f99 + ;; + FNMA f80 = f76, f112, f80 + FMA_A f81 = f77, f112, f81 + FNMA f82 = f76, f114, f82 + FMA_A f83 = f77, f114, f83 + ;; + FMA_B f80 = f77, f113, f80 + FNMA f81 = f76, f113, f81 + FMA_B f82 = f77, f115, f82 + FNMA f83 = f76, f115, f83 + ;; + FNMA f64 = f78, f112, f64 + FMA_A f65 = f79, f112, f65 + FNMA f66 = f78, f114, f66 + FMA_A f67 = f79, f114, f67 + ;; + FMA_B f64 = f79, f113, f64 + FNMA f65 = f78, f113, f65 + FMA_B f66 = f79, f115, f66 + FNMA f67 = f78, f115, f67 + ;; + FMPY f32 = f88, f96 + FMPY f33 = f89, f96 + FMPY f34 = f88, f98 + FMPY f35 = f89, f98 + ;; + FMA_C f96 = f89, f97, f32 + FMA_D f97 = f88, f97, f33 + FMA_C f98 = f89, f99, f34 + FMA_D f99 = f88, f99, f35 + ;; + FNMA f80 = f90, f96, f80 + FMA_A f81 = f91, f96, f81 + FNMA f82 = f90, f98, f82 + FMA_A f83 = f91, f98, f83 + ;; + FMA_B f80 = f91, f97, f80 + FNMA f81 = f90, f97, f81 + FMA_B f82 = f91, f99, f82 + FNMA f83 = f90, f99, f83 + ;; + FNMA f64 = f92, f96, f64 + FMA_A f65 = f93, f96, f65 + FNMA f66 = f92, f98, f66 + FMA_A f67 = f93, f98, f67 + ;; + FMA_B f64 = f93, f97, f64 + FNMA f65 = f92, f97, f65 + FMA_B f66 = f93, f99, f66 + FNMA f67 = f92, f99, f67 + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + FMPY f34 = f104, f82 + FMPY f35 = f105, f82 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + FMA_C f82 = f105, f83, f34 + FMA_D f83 = f104, f83, f35 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + FNMA f66 = f106, f82, f66 + FMA_A f67 = f107, f82, f67 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + FMA_B f66 = f107, f83, f66 + FNMA f67 = f106, f83, f67 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f66 + FMPY f35 = f121, f66 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f66 = f121, f67, f34 + FMA_D f67 = f120, f67, f35 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f98, SIZE + ;; + STFD [BOFFSET] = f67, SIZE + STFD [BOFFSET2] = f99, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f114, SIZE + ;; + STFD [BOFFSET] = f83, 5 * SIZE + STFD [BOFFSET2] = f115, 5 * SIZE + ;; + adds BOFFSET = - 16 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f80, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f82, SIZE + ;; + STFD [AOFFSET] = f67, 5 * SIZE + STFD [AOFFSET2] = f83, 5 * SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f97, SIZE + STFD [AOFFSET2] = f113, SIZE + ;; + STFD [AOFFSET] = f98, SIZE + STFD [AOFFSET2] = f114, SIZE + ;; + STFD [AOFFSET] = f99, 5 * SIZE + STFD [AOFFSET2] = f115, 5 * SIZE + ;; + adds AOFFSET = - 16 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 + adds C3 = -4 * SIZE, C3 + adds C4 = -4 * SIZE, C4 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C1 ] = f66, SIZE + ;; + STFD [C1 ] = f67, SIZE + ;; + STFD [C2 ] = f80, SIZE + ;; + STFD [C2 ] = f81, SIZE + ;; + STFD [C2 ] = f82, SIZE + ;; + STFD [C2 ] = f83, SIZE + ;; + + STFD [C3 ] = f96, SIZE + ;; + STFD [C3 ] = f97, SIZE + ;; + STFD [C3 ] = f98, SIZE + ;; + STFD [C3 ] = f99, SIZE + ;; + + STFD [C4 ] = f112, SIZE + ;; + STFD [C4 ] = f113, SIZE + ;; + STFD [C4 ] = f114, SIZE + ;; + STFD [C4 ] = f115, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 + adds C3 = -4 * SIZE, C3 + adds C4 = -4 * SIZE, C4 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 1, AOFFSET + shladd BOFFSET = L, 2, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L030: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L049 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f72 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f73 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B + mov f72 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 + add AOFFSET = r3, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + + { .mmi + nop __LINE__ + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + tbit.z p12, p0 = L, 0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f88 = f0 + shr L = L, 1 + } + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f89 = f0 + nop __LINE__ + } + ;; + { .mfi + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f104 = f0 + adds L = -1, L + } + { .mfb + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov f105 = f0 + nop __LINE__ + } + ;; + { .mfi + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f120 = f0 + mov ar.lc = L + } + { .mfi + cmp.eq p3, p0 = r0, r0 + mov f121 = f0 + nop __LINE__ + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L038 + ;; + .align 16 + +.L032: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f97 = f32, f53, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f32, f55, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f33, f53, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f33, f55, f112 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 + br.cloop.sptk.few .L032 + } + ;; +.L038: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [BOFFSET], 2 * SIZE + ;; + LDFPD f120, f121 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f88, f80 + FSUB_A f81 = f89, f81 + FSUB f96 = f104, f96 + FSUB_A f97 = f105, f97 + FSUB f112 = f120, f112 + FSUB_A f113 = f121, f113 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [AOFFSET], 2 * SIZE + ;; + LDFPD f120, f121 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f80 = f88, f80 + FSUB f81 = f89, f81 + FSUB f96 = f104, f96 + FSUB f97 = f105, f97 + FSUB f112 = f120, f112 + FSUB f113 = f121, f113 + ;; +#endif + +#ifdef LN + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + FMPY f36 = f120, f96 + FMPY f37 = f121, f96 + FMPY f38 = f120, f112 + FMPY f39 = f121, f112 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + FMA_C f96 = f121, f97, f36 + FMA_D f97 = f120, f97, f37 + FMA_C f112 = f121, f113, f38 + FMA_D f113 = f120, f113, f39 + ;; +#endif + +#ifdef LT + LDFPD f90, f91 = [AOFFSET] + ;; + FMPY f32 = f90, f64 + FMPY f33 = f91, f64 + FMPY f34 = f90, f80 + FMPY f35 = f91, f80 + FMPY f36 = f90, f96 + FMPY f37 = f91, f96 + FMPY f38 = f90, f112 + FMPY f39 = f91, f112 + ;; + FMA_C f64 = f91, f65, f32 + FMA_D f65 = f90, f65, f33 + FMA_C f80 = f91, f81, f34 + FMA_D f81 = f90, f81, f35 + FMA_C f96 = f91, f97, f36 + FMA_D f97 = f90, f97, f37 + FMA_C f112 = f91, f113, f38 + FMA_D f113 = f90, f113, f39 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [BOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [BOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f108, f109 = [BOFFSET], 2 * SIZE + ;; + LDFPD f110, f111 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f126, f127 = [BOFFSET] + adds BOFFSET = - 30 * SIZE, BOFFSET + ;; + + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + + ;; + FNMA f96 = f76, f64, f96 + FMA_A f97 = f77, f64, f97 + ;; + FMA_B f96 = f77, f65, f96 + FNMA f97 = f76, f65, f97 + ;; + FNMA f112 = f78, f64, f112 + FMA_A f113 = f79, f64, f113 + ;; + FMA_B f112 = f79, f65, f112 + FNMA f113 = f78, f65, f113 + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + ;; + + FNMA f96 = f92, f80, f96 + FMA_A f97 = f93, f80, f97 + ;; + FMA_B f96 = f93, f81, f96 + FNMA f97 = f92, f81, f97 + ;; + FNMA f112 = f94, f80, f112 + FMA_A f113 = f95, f80, f113 + ;; + FMA_B f112 = f95, f81, f112 + FNMA f113 = f94, f81, f113 + ;; + FMPY f32 = f108, f96 + FMPY f33 = f109, f96 + ;; + FMA_C f96 = f109, f97, f32 + FMA_D f97 = f108, f97, f33 + ;; + FNMA f112 = f110, f96, f112 + FMA_A f113 = f111, f96, f113 + ;; + FMA_B f112 = f111, f97, f112 + FNMA f113 = f110, f97, f113 + ;; + FMPY f32 = f126, f112 + FMPY f33 = f127, f112 + ;; + FMA_C f112 = f127, f113, f32 + FMA_D f113 = f126, f113, f33 + ;; +#endif + +#ifdef RT + adds BOFFSET = 30 * SIZE, BOFFSET + ;; + LDFPD f72, f73 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f76, f77 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f78, f79 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f88, f89 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f92, f93 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f72, f112 + FMPY f33 = f73, f112 + ;; + FMA_C f112 = f73, f113, f32 + FMA_D f113 = f72, f113, f33 + ;; + FNMA f96 = f74, f112, f96 + FMA_A f97 = f75, f112, f97 + ;; + FMA_B f96 = f75, f113, f96 + FNMA f97 = f74, f113, f97 + ;; + FNMA f80 = f76, f112, f80 + FMA_A f81 = f77, f112, f81 + ;; + FMA_B f80 = f77, f113, f80 + FNMA f81 = f76, f113, f81 + ;; + FNMA f64 = f78, f112, f64 + FMA_A f65 = f79, f112, f65 + ;; + FMA_B f64 = f79, f113, f64 + FNMA f65 = f78, f113, f65 + ;; + FMPY f32 = f88, f96 + FMPY f33 = f89, f96 + ;; + FMA_C f96 = f89, f97, f32 + FMA_D f97 = f88, f97, f33 + ;; + FNMA f80 = f90, f96, f80 + FMA_A f81 = f91, f96, f81 + ;; + FMA_B f80 = f91, f97, f80 + FNMA f81 = f90, f97, f81 + ;; + FNMA f64 = f92, f96, f64 + FMA_A f65 = f93, f96, f65 + ;; + FMA_B f64 = f93, f97, f64 + FNMA f65 = f92, f97, f65 + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f96, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f97, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f81, 5 * SIZE + STFD [AOFFSET2] = f113, 5 * SIZE + ;; + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 + adds C3 = -2 * SIZE, C3 + adds C4 = -2 * SIZE, C4 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C2 ] = f80, SIZE + ;; + STFD [C2 ] = f81, SIZE + ;; + STFD [C3 ] = f96, SIZE + ;; + STFD [C3 ] = f97, SIZE + ;; + STFD [C4 ] = f112, SIZE + ;; + STFD [C4 ] = f113, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 + adds C3 = -2 * SIZE, C3 + adds C4 = -2 * SIZE, C4 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + add AOFFSET = L, AOFFSET + shladd BOFFSET = L, 2, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L049: +#ifdef LN + shladd KK8 = K, ZBASE_SHIFT, r0 + ;; + shladd B = KK8, 2, B +#endif + +#if defined(LT) || defined(RN) + mov B = BOFFSET +#endif + +#ifdef RN + adds KK = 4, KK +#endif + +#ifdef RT + adds KK = -4, KK +#endif + ;; + + { .mmb + mov AOFFSET = A + cmp.lt p6, p0 = 0, J + (p6) br.cond.dptk .L010 + } + ;; + .align 16 + +.L050: + { .mmi + shr I = M, 2 + } + { .mib + tbit.z p6, p0 = N, 1 + (p6) br.cond.dpnt .L090 + } + ;; + +#ifdef RT + { .mmi + shladd r3 = LDC, 1, r0 + nop __LINE__ + shl r2 = K, 1 + ZBASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, r3 + nop __LINE__ + } + ;; +#endif + + mov C1 = C + add C2 = LDC, C + ;; +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + ;; +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + { .mib + cmp.eq p6, p7 = 0, I +#ifndef RT + shladd C = LDC, 1, C +#else + nop __LINE__ +#endif + (p6) br.cond.dpnt .L060 + } + ;; + .align 16 + +.L052: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 + nop __LINE__ + } + ;; + { .mfi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f98 = f0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + { .mfi + cmp.eq p3, p0 = r0, r0 + mov f99 = f0 + adds L = 1, L + } + ;; + { .mfi + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + mov f114 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + CPREFETCH [PREC], LDC + mov f115 = f0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + adds C5 = 4 * SIZE, C1 + adds L = -1, L + } + ;; + { .mmi + CPREFETCH [PREC], LDC + adds C6 = 4 * SIZE, C2 + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L058 + ;; + .align 16 + +.L053: + { .mfb + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f96 = f34, f48, f96 // A3 * B1 + nop __LINE__ + } + { .mfi + FMA_B f97 = f34, f49, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f112 = f34, f50, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f34, f51, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f97 = f35, f48, f97 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f35, f49, f96 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f113 = f35, f50, f113 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f35, f51, f112 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f66 = f36, f48, f66 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f67 = f36, f49, f67 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f82 = f36, f50, f82 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f83 = f36, f51, f83 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f98 = f38, f48, f98 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f99 = f38, f49, f99 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f114 = f38, f50, f114 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f115 = f38, f51, f115 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f67 = f37, f48, f67 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f66 = f37, f49, f66 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f83 = f37, f50, f83 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f82 = f37, f51, f82 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f99 = f39, f48, f99 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f98 = f39, f49, f98 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f115 = f39, f50, f115 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f114 = f39, f51, f114 // A8 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f96 = f42, f56, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f42, f57, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f112 = f42, f58, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f42, f59, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f43, f56, f97 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f43, f57, f96 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f113 = f43, f58, f113 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f43, f59, f112 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f44, f56, f66 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f67 = f44, f57, f67 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f44, f58, f82 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f83 = f44, f59, f83 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f98 = f46, f56, f98 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f99 = f46, f57, f99 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f114 = f46, f58, f114 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f115 = f46, f59, f115 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f45, f56, f67 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f66 = f45, f57, f66 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f83 = f45, f58, f83 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f82 = f45, f59, f82 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f99 = f47, f56, f99 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f98 = f47, f57, f98 // A8 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f115 = f47, f58, f115 // A8 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f114 = f47, f59, f114 // A8 * B4 + br.cloop.sptk.few .L053 + } + ;; +.L058: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [BOFFSET], 2 * SIZE + ;; + LDFPD f106, f107 = [BOFFSET], 2 * SIZE + ;; + LDFPD f120, f121 = [BOFFSET], 2 * SIZE + ;; + LDFPD f122, f123 = [BOFFSET] + adds BOFFSET = -14 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB_A f81 = f75, f81 + FSUB f96 = f88, f96 + FSUB_A f97 = f89, f97 + FSUB f112 = f90, f112 + FSUB_A f113 = f91, f113 + + FSUB f66 = f104, f66 + FSUB_A f67 = f105, f67 + FSUB f82 = f106, f82 + FSUB_A f83 = f107, f83 + FSUB f98 = f120, f98 + FSUB_A f99 = f121, f99 + FSUB f114 = f122, f114 + FSUB_A f115 = f123, f115 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [AOFFSET] + adds AOFFSET = -14 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f96 = f74, f96 + FSUB f97 = f75, f97 + + FSUB f66 = f76, f66 + FSUB f67 = f77, f67 + FSUB f98 = f78, f98 + FSUB f99 = f79, f99 + + FSUB f80 = f88, f80 + FSUB f81 = f89, f81 + FSUB f112 = f90, f112 + FSUB f113 = f91, f113 + + FSUB f82 = f92, f82 + FSUB f83 = f93, f83 + FSUB f114 = f94, f114 + FSUB f115 = f95, f115 + ;; +#endif + +#ifdef LN + adds AOFFSET = 30 * SIZE, AOFFSET + ;; + LDFPD f72, f73 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f76, f77 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f78, f79 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f88, f89 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f92, f93 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f72, f98 + FMPY f33 = f73, f98 + FMPY f34 = f72, f114 + FMPY f35 = f73, f114 + ;; + FMA_C f98 = f73, f99, f32 + FMA_D f99 = f72, f99, f33 + FMA_C f114 = f73, f115, f34 + FMA_D f115 = f72, f115, f35 + ;; + FNMA f66 = f74, f98, f66 + FMA_A f67 = f75, f98, f67 + FNMA f82 = f74, f114, f82 + FMA_A f83 = f75, f114, f83 + ;; + FMA_B f66 = f75, f99, f66 + FNMA f67 = f74, f99, f67 + FMA_B f82 = f75, f115, f82 + FNMA f83 = f74, f115, f83 + ;; + FNMA f96 = f76, f98, f96 + FMA_A f97 = f77, f98, f97 + FNMA f112 = f76, f114, f112 + FMA_A f113 = f77, f114, f113 + ;; + FMA_B f96 = f77, f99, f96 + FNMA f97 = f76, f99, f97 + FMA_B f112 = f77, f115, f112 + FNMA f113 = f76, f115, f113 + ;; + FNMA f64 = f78, f98, f64 + FMA_A f65 = f79, f98, f65 + FNMA f80 = f78, f114, f80 + FMA_A f81 = f79, f114, f81 + ;; + FMA_B f64 = f79, f99, f64 + FNMA f65 = f78, f99, f65 + FMA_B f80 = f79, f115, f80 + FNMA f81 = f78, f115, f81 + ;; + FMPY f32 = f88, f66 + FMPY f33 = f89, f66 + FMPY f34 = f88, f82 + FMPY f35 = f89, f82 + ;; + FMA_C f66 = f89, f67, f32 + FMA_D f67 = f88, f67, f33 + FMA_C f82 = f89, f83, f34 + FMA_D f83 = f88, f83, f35 + ;; + FNMA f96 = f90, f66, f96 + FMA_A f97 = f91, f66, f97 + FNMA f112 = f90, f82, f112 + FMA_A f113 = f91, f82, f113 + ;; + FMA_B f96 = f91, f67, f96 + FNMA f97 = f90, f67, f97 + FMA_B f112 = f91, f83, f112 + FNMA f113 = f90, f83, f113 + ;; + FNMA f64 = f92, f66, f64 + FMA_A f65 = f93, f66, f65 + FNMA f80 = f92, f82, f80 + FMA_A f81 = f93, f82, f81 + ;; + FMA_B f64 = f93, f67, f64 + FNMA f65 = f92, f67, f65 + FMA_B f80 = f93, f83, f80 + FNMA f81 = f92, f83, f81 + ;; + FMPY f32 = f104, f96 + FMPY f33 = f105, f96 + FMPY f34 = f104, f112 + FMPY f35 = f105, f112 + ;; + FMA_C f96 = f105, f97, f32 + FMA_D f97 = f104, f97, f33 + FMA_C f112 = f105, f113, f34 + FMA_D f113 = f104, f113, f35 + ;; + FNMA f64 = f106, f96, f64 + FMA_A f65 = f107, f96, f65 + FNMA f80 = f106, f112, f80 + FMA_A f81 = f107, f112, f81 + ;; + FMA_B f64 = f107, f97, f64 + FNMA f65 = f106, f97, f65 + FMA_B f80 = f107, f113, f80 + FNMA f81 = f106, f113, f81 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f108, f109 = [AOFFSET], 2 * SIZE + ;; + LDFPD f110, f111 = [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f126, f127 = [AOFFSET] + adds AOFFSET = - 30 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + ;; + FNMA f96 = f74, f64, f96 + FMA_A f97 = f75, f64, f97 + FNMA f112 = f74, f80, f112 + FMA_A f113 = f75, f80, f113 + ;; + FMA_B f96 = f75, f65, f96 + FNMA f97 = f74, f65, f97 + FMA_B f112 = f75, f81, f112 + FNMA f113 = f74, f81, f113 + ;; + FNMA f66 = f76, f64, f66 + FMA_A f67 = f77, f64, f67 + FNMA f82 = f76, f80, f82 + FMA_A f83 = f77, f80, f83 + ;; + FMA_B f66 = f77, f65, f66 + FNMA f67 = f76, f65, f67 + FMA_B f82 = f77, f81, f82 + FNMA f83 = f76, f81, f83 + ;; + FNMA f98 = f78, f64, f98 + FMA_A f99 = f79, f64, f99 + FNMA f114 = f78, f80, f114 + FMA_A f115 = f79, f80, f115 + ;; + FMA_B f98 = f79, f65, f98 + FNMA f99 = f78, f65, f99 + FMA_B f114 = f79, f81, f114 + FNMA f115 = f78, f81, f115 + ;; + FMPY f32 = f90, f96 + FMPY f33 = f91, f96 + FMPY f34 = f90, f112 + FMPY f35 = f91, f112 + ;; + FMA_C f96 = f91, f97, f32 + FMA_D f97 = f90, f97, f33 + FMA_C f112 = f91, f113, f34 + FMA_D f113 = f90, f113, f35 + ;; + FNMA f66 = f92, f96, f66 + FMA_A f67 = f93, f96, f67 + FNMA f82 = f92, f112, f82 + FMA_A f83 = f93, f112, f83 + ;; + FMA_B f66 = f93, f97, f66 + FNMA f67 = f92, f97, f67 + FMA_B f82 = f93, f113, f82 + FNMA f83 = f92, f113, f83 + ;; + FNMA f98 = f94, f96, f98 + FMA_A f99 = f95, f96, f99 + FNMA f114 = f94, f112, f114 + FMA_A f115 = f95, f112, f115 + ;; + FMA_B f98 = f95, f97, f98 + FNMA f99 = f94, f97, f99 + FMA_B f114 = f95, f113, f114 + FNMA f115 = f94, f113, f115 + ;; + FMPY f32 = f108, f66 + FMPY f33 = f109, f66 + FMPY f34 = f108, f82 + FMPY f35 = f109, f82 + ;; + FMA_C f66 = f109, f67, f32 + FMA_D f67 = f108, f67, f33 + FMA_C f82 = f109, f83, f34 + FMA_D f83 = f108, f83, f35 + ;; + FNMA f98 = f110, f66, f98 + FMA_A f99 = f111, f66, f99 + FNMA f114 = f110, f82, f114 + FMA_A f115 = f111, f82, f115 + ;; + FMA_B f98 = f111, f67, f98 + FNMA f99 = f110, f67, f99 + FMA_B f114 = f111, f83, f114 + FNMA f115 = f110, f83, f115 + ;; + FMPY f32 = f126, f98 + FMPY f33 = f127, f98 + FMPY f34 = f126, f114 + FMPY f35 = f127, f114 + ;; + FMA_C f98 = f127, f99, f32 + FMA_D f99 = f126, f99, f33 + FMA_C f114 = f127, f115, f34 + FMA_D f115 = f126, f115, f35 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f96 + FMPY f35 = f73, f96 + FMPY f36 = f72, f66 + FMPY f37 = f73, f66 + FMPY f38 = f72, f98 + FMPY f39 = f73, f98 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f96 = f73, f97, f34 + FMA_D f97 = f72, f97, f35 + FMA_C f66 = f73, f67, f36 + FMA_D f67 = f72, f67, f37 + FMA_C f98 = f73, f99, f38 + FMA_D f99 = f72, f99, f39 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + FNMA f112 = f74, f96, f112 + FMA_A f113 = f75, f96, f113 + FNMA f82 = f74, f66, f82 + FMA_A f83 = f75, f66, f83 + FNMA f114 = f74, f98, f114 + FMA_A f115 = f75, f98, f115 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + FMA_B f112 = f75, f97, f112 + FNMA f113 = f74, f97, f113 + FMA_B f82 = f75, f67, f82 + FNMA f83 = f74, f67, f83 + FMA_B f114 = f75, f99, f114 + FNMA f115 = f74, f99, f115 + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + FMPY f34 = f90, f112 + FMPY f35 = f91, f112 + FMPY f36 = f90, f82 + FMPY f37 = f91, f82 + FMPY f38 = f90, f114 + FMPY f39 = f91, f114 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + FMA_C f112 = f91, f113, f34 + FMA_D f113 = f90, f113, f35 + FMA_C f82 = f91, f83, f36 + FMA_D f83 = f90, f83, f37 + FMA_C f114 = f91, f115, f38 + FMA_D f115 = f90, f115, f39 + ;; +#endif + +#ifdef RT + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + FMPY f34 = f104, f112 + FMPY f35 = f105, f112 + FMPY f36 = f104, f82 + FMPY f37 = f105, f82 + FMPY f38 = f104, f114 + FMPY f39 = f105, f114 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + FMA_C f112 = f105, f113, f34 + FMA_D f113 = f104, f113, f35 + FMA_C f82 = f105, f83, f36 + FMA_D f83 = f104, f83, f37 + FMA_C f114 = f105, f115, f38 + FMA_D f115 = f104, f115, f39 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + FNMA f96 = f106, f112, f96 + FMA_A f97 = f107, f112, f97 + FNMA f66 = f106, f82, f66 + FMA_A f67 = f107, f82, f67 + FNMA f98 = f106, f114, f98 + FMA_A f99 = f107, f114, f99 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + FMA_B f96 = f107, f113, f96 + FNMA f97 = f106, f113, f97 + FMA_B f66 = f107, f83, f66 + FNMA f67 = f106, f83, f67 + FMA_B f98 = f107, f115, f98 + FNMA f99 = f106, f115, f99 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f96 + FMPY f35 = f121, f96 + FMPY f36 = f120, f66 + FMPY f37 = f121, f66 + FMPY f38 = f120, f98 + FMPY f39 = f121, f98 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f96 = f121, f97, f34 + FMA_D f97 = f120, f97, f35 + FMA_C f66 = f121, f67, f36 + FMA_D f67 = f120, f67, f37 + FMA_C f98 = f121, f99, f38 + FMA_D f99 = f120, f99, f39 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f98, SIZE + ;; + STFD [BOFFSET] = f67, SIZE + STFD [BOFFSET2] = f99, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f114, SIZE + ;; + STFD [BOFFSET] = f83, 5 * SIZE + STFD [BOFFSET2] = f115, 5 * SIZE + ;; + adds BOFFSET = - 16 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f66, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f67, SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f98, SIZE + ;; + STFD [AOFFSET] = f97, 5 * SIZE + STFD [AOFFSET2] = f99, 5 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f82, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f83, SIZE + ;; + STFD [AOFFSET] = f112, SIZE + STFD [AOFFSET2] = f114, SIZE + ;; + STFD [AOFFSET] = f113, 5 * SIZE + STFD [AOFFSET2] = f115, 5 * SIZE + ;; + adds AOFFSET = - 16 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -8 * SIZE, C1 + adds C2 = -8 * SIZE, C2 + adds C5 = -8 * SIZE, C5 + adds C6 = -8 * SIZE, C6 +#endif + ;; + STFD [C1 ] = f64, SIZE + STFD [C5 ] = f66, SIZE + ;; + STFD [C1 ] = f65, SIZE + STFD [C5 ] = f67, SIZE + ;; + STFD [C1 ] = f96, SIZE + STFD [C5 ] = f98, SIZE + ;; + STFD [C1 ] = f97, 5 * SIZE + STFD [C5 ] = f99, 5 * SIZE + ;; + STFD [C2 ] = f80, SIZE + STFD [C6 ] = f82, SIZE + ;; + STFD [C2 ] = f81, SIZE + STFD [C6 ] = f83, SIZE + ;; + STFD [C2 ] = f112, SIZE + STFD [C6 ] = f114, SIZE + ;; + STFD [C2 ] = f113, 5 * SIZE + STFD [C6 ] = f115, 5 * SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -8 * SIZE, C1 + adds C2 = -8 * SIZE, C2 + adds C5 = -8 * SIZE, C5 + adds C6 = -8 * SIZE, C6 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 2, AOFFSET + shladd BOFFSET = L, 1, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 4, KK +#elif defined LN + adds KK = -4, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + (p6) br.cond.dptk .L052 + ;; + .align 16 + +.L060: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L070 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mmi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + tbit.z p12, p0 = L, 0 + } + { .mmi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + nop __LINE__ + nop __LINE__ + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L068 + ;; + .align 16 + +.L062: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfb + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f34, f48, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f97 = f34, f49, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f34, f50, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f34, f51, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f97 = f35, f48, f97 // A4 * B1 + } + { .mfb + FMA_A f96 = f35, f49, f96 // A4 * B2 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f113 = f35, f50, f113 // A4 * B3 + nop __LINE__ + } + { .mfb + FMA_A f112 = f35, f51, f112 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f96 = f42, f56, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f42, f57, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f42, f58, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f42, f59, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f43, f56, f97 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f43, f57, f96 // A4 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f113 = f43, f58, f113 // A4 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f43, f59, f112 // A4 * B4 + br.cloop.sptk.few .L062 + } + ;; +.L068: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB_A f81 = f75, f81 + FSUB f96 = f88, f96 + FSUB_A f97 = f89, f97 + FSUB f112 = f90, f112 + FSUB_A f113 = f91, f113 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f96 = f74, f96 + FSUB f97 = f75, f97 + + FSUB f80 = f88, f80 + FSUB f81 = f89, f81 + FSUB f112 = f90, f112 + FSUB f113 = f91, f113 + ;; +#endif + +#ifdef LN + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f104, f96 + FMPY f33 = f105, f96 + FMPY f34 = f104, f112 + FMPY f35 = f105, f112 + ;; + FMA_C f96 = f105, f97, f32 + FMA_D f97 = f104, f97, f33 + FMA_C f112 = f105, f113, f34 + FMA_D f113 = f104, f113, f35 + ;; + FNMA f64 = f106, f96, f64 + FMA_A f65 = f107, f96, f65 + FNMA f80 = f106, f112, f80 + FMA_A f81 = f107, f112, f81 + ;; + FMA_B f64 = f107, f97, f64 + FNMA f65 = f106, f97, f65 + FMA_B f80 = f107, f113, f80 + FNMA f81 = f106, f113, f81 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + ;; + FNMA f96 = f74, f64, f96 + FMA_A f97 = f75, f64, f97 + FNMA f112 = f74, f80, f112 + FMA_A f113 = f75, f80, f113 + ;; + FMA_B f96 = f75, f65, f96 + FNMA f97 = f74, f65, f97 + FMA_B f112 = f75, f81, f112 + FNMA f113 = f74, f81, f113 + ;; + FMPY f32 = f90, f96 + FMPY f33 = f91, f96 + FMPY f34 = f90, f112 + FMPY f35 = f91, f112 + ;; + FMA_C f96 = f91, f97, f32 + FMA_D f97 = f90, f97, f33 + FMA_C f112 = f91, f113, f34 + FMA_D f113 = f90, f113, f35 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f96 + FMPY f35 = f73, f96 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f96 = f73, f97, f34 + FMA_D f97 = f72, f97, f35 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + FNMA f112 = f74, f96, f112 + FMA_A f113 = f75, f96, f113 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + FMA_B f112 = f75, f97, f112 + FNMA f113 = f74, f97, f113 + + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + FMPY f34 = f90, f112 + FMPY f35 = f91, f112 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + FMA_C f112 = f91, f113, f34 + FMA_D f113 = f90, f113, f35 + ;; +#endif + +#ifdef RT + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + FMPY f34 = f104, f112 + FMPY f35 = f105, f112 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + FMA_C f112 = f105, f113, f34 + FMA_D f113 = f104, f113, f35 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + FNMA f96 = f106, f112, f96 + FMA_A f97 = f107, f112, f97 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + FMA_B f96 = f107, f113, f96 + FNMA f97 = f106, f113, f97 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f96 + FMPY f35 = f121, f96 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f96 = f121, f97, f34 + FMA_D f97 = f120, f97, f35 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f80, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f97, 5 * SIZE + STFD [AOFFSET2] = f113, 5 * SIZE + ;; + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C1 ] = f96, SIZE + ;; + STFD [C1 ] = f97, SIZE + ;; + STFD [C2 ] = f80, SIZE + ;; + STFD [C2 ] = f81, SIZE + ;; + STFD [C2 ] = f112, SIZE + ;; + STFD [C2 ] = f113, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 1, AOFFSET + shladd BOFFSET = L, 1, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L070: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L089 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + add AOFFSET = r3, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mii + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds L = -1, L + } + ;; + { .mmi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L078 + ;; + .align 16 + +.L072: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f96 = f32, f49, f96 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f112 = f32, f51, f112 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mfi + nop __LINE__ + FMA f97 = f33, f49, f97 // A2 * B2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f113 = f33, f51, f113 // A2 * B4 + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f96 = f40, f57, f96 // A1 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f112 = f40, f59, f112 // A1 * B4 + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f97 = f41, f57, f97 // A2 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f59, f113 // A2 * B4 + br.cloop.sptk.few .L072 + } + ;; + { .mfb + nop __LINE__ + FCALC_A f64 = f64, f97 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_A f80 = f80, f113 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f65 = f65, f96 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f81 = f81, f112 + nop __LINE__ + } + ;; +.L078: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB_A f81 = f75, f81 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f80 = f88, f80 + FSUB f81 = f89, f81 + ;; +#endif + +#ifdef LN + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + ;; +#endif + +#ifdef RT + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#if defined(LN) || defined(LT) + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + ;; + STFD [BOFFSET] = f81, SIZE + ;; + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; +#else + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + ;; + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C2 ] = f80, SIZE + ;; + STFD [C2 ] = f81, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + add AOFFSET = L, AOFFSET + shladd BOFFSET = L, 1, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L089: +#ifdef LN + shladd KK8 = K, ZBASE_SHIFT, r0 + ;; + shladd B = KK8, 1, B +#endif + +#if defined(LT) || defined(RN) + mov B = BOFFSET +#endif + +#ifdef RN + adds KK = 2, KK +#endif + +#ifdef RT + adds KK = -2, KK +#endif + ;; + { .mmi + mov AOFFSET = A + nop __LINE__ + } + ;; + .align 16 + +.L090: + shr I = M, 2 + tbit.z p6, p0 = N, 0 + (p6) br.cond.dpnt .L999 + ;; + +#ifdef RT + { .mmi + shl r2 = K, ZBASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, LDC + nop __LINE__ + } + ;; +#endif + mov C1 = C + +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + ;; +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + { .mib + cmp.eq p6, p7 = 0, I +#ifndef RT + add C = LDC, C +#else + nop __LINE__ +#endif + (p6) br.cond.dpnt .L100 + } + ;; + .align 16 + +.L092: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + add BOFFSET = r3, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + ;; + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + adds L = 1, L + ;; + { .mfi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + } + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + shr L = L, 1 + } + ;; + { .mfi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds L = -1, L + } + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + CPREFETCH [PREC] + } + ;; + { .mfi + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + mov ar.lc = L + } + { .mmi + adds C5 = 4 * SIZE, C1 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L098 + ;; + .align 16 + +.L093: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f80 = f34, f48, f80 // A3 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f81 = f34, f49, f81 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f36, f48, f96 // A5 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f97 = f36, f49, f97 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f38, f48, f112 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f38, f49, f113 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f81 = f35, f48, f81 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f35, f49, f80 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f97 = f37, f48, f97 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f37, f49, f96 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f113 = f39, f48, f113 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f39, f49, f112 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f42, f56, f80 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f42, f57, f81 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f96 = f44, f56, f96 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f44, f57, f97 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f112 = f46, f56, f112 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f46, f57, f113 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f43, f56, f81 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f43, f57, f80 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f45, f56, f97 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f45, f57, f96 // A6 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f113 = f47, f56, f113 // A8 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f47, f57, f112 // A8 * B2 + br.cloop.sptk.few .L093 + } + ;; +.L098: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + add BOFFSET = r2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB_A f81 = f75, f81 + FSUB f96 = f88, f96 + FSUB_A f97 = f89, f97 + FSUB f112 = f90, f112 + FSUB_A f113 = f91, f113 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB f81 = f75, f81 + FSUB f96 = f88, f96 + FSUB f97 = f89, f97 + FSUB f112 = f90, f112 + FSUB f113 = f91, f113 + ;; +#endif + +#ifdef LN + adds AOFFSET = 30 * SIZE, AOFFSET + ;; + LDFPD f72, f73 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f76, f77 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f78, f79 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f88, f89 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f92, f93 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f72, f112 + FMPY f33 = f73, f112 + ;; + FMA_C f112 = f73, f113, f32 + FMA_D f113 = f72, f113, f33 + ;; + FNMA f96 = f74, f112, f96 + FMA_A f97 = f75, f112, f97 + FNMA f80 = f76, f112, f80 + FMA_A f81 = f77, f112, f81 + FNMA f64 = f78, f112, f64 + FMA_A f65 = f79, f112, f65 + ;; + FMA_B f96 = f75, f113, f96 + FNMA f97 = f74, f113, f97 + FMA_B f80 = f77, f113, f80 + FNMA f81 = f76, f113, f81 + FMA_B f64 = f79, f113, f64 + FNMA f65 = f78, f113, f65 + ;; + FMPY f32 = f88, f96 + FMPY f33 = f89, f96 + ;; + FMA_C f96 = f89, f97, f32 + FMA_D f97 = f88, f97, f33 + ;; + FNMA f80 = f90, f96, f80 + FMA_A f81 = f91, f96, f81 + FNMA f64 = f92, f96, f64 + FMA_A f65 = f93, f96, f65 + ;; + FMA_B f80 = f91, f97, f80 + FNMA f81 = f90, f97, f81 + FMA_B f64 = f93, f97, f64 + FNMA f65 = f92, f97, f65 + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f108, f109 = [AOFFSET], 2 * SIZE + ;; + LDFPD f110, f111 = [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f126, f127 = [AOFFSET] + adds AOFFSET = - 30 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + FNMA f96 = f76, f64, f96 + FMA_A f97 = f77, f64, f97 + FNMA f112 = f78, f64, f112 + FMA_A f113 = f79, f64, f113 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + FMA_B f96 = f77, f65, f96 + FNMA f97 = f76, f65, f97 + FMA_B f112 = f79, f65, f112 + FNMA f113 = f78, f65, f113 + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + ;; + FNMA f96 = f92, f80, f96 + FMA_A f97 = f93, f80, f97 + FNMA f112 = f94, f80, f112 + FMA_A f113 = f95, f80, f113 + ;; + FMA_B f96 = f93, f81, f96 + FNMA f97 = f92, f81, f97 + FMA_B f112 = f95, f81, f112 + FNMA f113 = f94, f81, f113 + ;; + FMPY f32 = f108, f96 + FMPY f33 = f109, f96 + ;; + FMA_C f96 = f109, f97, f32 + FMA_D f97 = f108, f97, f33 + ;; + FNMA f112 = f110, f96, f112 + FMA_A f113 = f111, f96, f113 + ;; + FMA_B f112 = f111, f97, f112 + FNMA f113 = f110, f97, f113 + ;; + FMPY f32 = f126, f112 + FMPY f33 = f127, f112 + ;; + FMA_C f112 = f127, f113, f32 + FMA_D f113 = f126, f113, f33 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + FMPY f38 = f72, f112 + FMPY f39 = f73, f112 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + FMA_C f112 = f73, f113, f38 + FMA_D f113 = f72, f113, f39 + ;; +#endif + +#ifdef RT + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + FMPY f38 = f72, f112 + FMPY f39 = f73, f112 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + FMA_C f112 = f73, f113, f38 + FMA_D f113 = f72, f113, f39 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f96, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f97, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f81, 5 * SIZE + STFD [AOFFSET2] = f113, 5 * SIZE + ;; + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -8 * SIZE, C1 + adds C5 = -8 * SIZE, C5 +#endif + ;; + STFD [C1 ] = f64, SIZE + STFD [C5 ] = f96, SIZE + ;; + STFD [C1 ] = f65, SIZE + STFD [C5 ] = f97, SIZE + ;; + STFD [C1 ] = f80, SIZE + STFD [C5 ] = f112, SIZE + ;; + STFD [C1 ] = f81, 5 * SIZE + STFD [C5 ] = f113, 5 * SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -8 * SIZE, C1 + adds C5 = -8 * SIZE, C5 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 2, AOFFSET + add BOFFSET = L, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 4, KK +#elif defined LN + adds KK = -4, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + (p6) br.cond.dptk .L092 + ;; + .align 16 + +.L100: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L110 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + add BOFFSET = r3, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mii + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L108 + ;; + .align 16 + +.L102: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f80 = f32, f49, f80 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfb + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f81 = f33, f49, f81 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f34, f48, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f112 = f34, f49, f112 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f97 = f35, f48, f97 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f113 = f35, f49, f113 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f80 = f40, f57, f80 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f57, f81 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f42, f56, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f112 = f42, f57, f112 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f97 = f43, f56, f97 // A4 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f113 = f43, f57, f113 // A4 * B2 + br.cloop.sptk.few .L102 + } + ;; + { .mfb + nop __LINE__ + FCALC_A f64 = f64, f81 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f65 = f65, f80 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_A f96 = f96, f113 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f97 = f97, f112 + nop __LINE__ + } + ;; +.L108: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + add BOFFSET = r2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f96 = f88, f96 + FSUB_A f97 = f89, f97 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f96 = f88, f96 + FSUB f97 = f89, f97 + ;; +#endif + +#ifdef LN + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f104, f96 + FMPY f33 = f105, f96 + ;; + FMA_C f96 = f105, f97, f32 + FMA_D f97 = f104, f97, f33 + ;; + FNMA f64 = f106, f96, f64 + FMA_A f65 = f107, f96, f65 + ;; + FMA_B f64 = f107, f97, f64 + FNMA f65 = f106, f97, f65 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; + FNMA f96 = f74, f64, f96 + FMA_A f97 = f75, f64, f97 + ;; + FMA_B f96 = f75, f65, f96 + FNMA f97 = f74, f65, f97 + ;; + FMPY f32 = f90, f96 + FMPY f33 = f91, f96 + ;; + FMA_C f96 = f91, f97, f32 + FMA_D f97 = f90, f97, f33 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + ;; +#endif + +#ifdef RT + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + ;; +#endif + +#if defined(LN) || defined(LT) + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f96, SIZE + ;; + STFD [BOFFSET] = f97, SIZE + ;; + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f96, SIZE + ;; + STFD [AOFFSET] = f97, SIZE + ;; + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C5 = -4 * SIZE, C5 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C1 ] = f96, SIZE + ;; + STFD [C1 ] = f97, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C5 = -4 * SIZE, C5 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 1, AOFFSET + add BOFFSET = L, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L110: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L119 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + add BOFFSET = r3, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + add AOFFSET = r3, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mii + nop __LINE__ + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + cmp.eq p3, p0 = r0, r0 + adds L = -1, L + } + ;; + { .mmi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L118 + ;; + .align 16 + +.L112: + { .mfi + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f80 = f32, f49, f80 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f81 = f33, f49, f81 // A2 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f80 = f40, f57, f80 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + adds L = -1, L + } + { .mfb + (p3) FMA f81 = f41, f57, f81 // A2 * B2 + br.cloop.sptk.few .L112 + } + ;; + { .mfb + nop __LINE__ + FCALC_A f64 = f64, f81 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f65 = f65, f80 + nop __LINE__ + } + ;; +.L118: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + add BOFFSET = r2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET] + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + ;; +#else + LDFPD f72, f73 = [AOFFSET] + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + ;; +#endif + +#ifdef LN + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; +#endif + +#ifdef RT + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; +#endif + +#if defined(LN) || defined(LT) + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; +#else + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -2 * SIZE, C1 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + ;; +#ifdef LN + adds C1 = -2 * SIZE, C1 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + add AOFFSET = L, AOFFSET + add BOFFSET = L, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + .align 16 + +.L119: +#ifdef LN + shladd KK8 = K, ZBASE_SHIFT, r0 + ;; + add B = KK8, B +#endif + +#if defined(LT) || defined(RN) + mov B = BOFFSET +#endif + +#ifdef RN + adds KK = 1, KK +#endif + +#ifdef RT + adds KK = -1, KK +#endif + ;; + { .mmi + mov AOFFSET = A + nop __LINE__ + } + ;; + .align 16 + +.L999: + { .mii + nop __LINE__ + mov ar.lc = ARLC + mov pr = PR, -1 + } + { .mib + nop __LINE__ +#ifdef TRMMKERNEL + mov ar.pfs = ARPFS +#else + nop __LINE__ +#endif + br.ret.sptk.many b0 + } + EPILOGUE + diff --git a/kernel/ia64/ztrsm_kernel_RT.S b/kernel/ia64/ztrsm_kernel_RT.S new file mode 100644 index 0000000..582e2e5 --- /dev/null +++ b/kernel/ia64/ztrsm_kernel_RT.S @@ -0,0 +1,10837 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef DOUBLE +#define PREFETCHSIZE (16 * 8) +#else +#define PREFETCHSIZE (32 * 8) +#endif + +#ifndef LN +#define CPREFETCHSIZE 7 +#else +#define CPREFETCHSIZE -8 +#endif +#define CPREFETCH lfetch.excl.nt1 + +#define M r32 +#define N r33 +#define K r34 +#define A r37 +#define B r38 +#define C r39 +#define LDC r35 + +#define I r15 +#define J r16 +#define AOFFSET r17 +#define BOFFSET r18 +#define TEMP r19 +#define L r20 + +#define C1 r21 +#define C2 r22 +#define C3 r23 +#define C4 r24 +#define C5 r25 +#define C6 r26 +#define C7 r27 +#define C8 r28 + +#define PREA r8 +#define PREB r9 +#define PREC r10 +#define SP r12 +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#define ALPHA_R f8 +#define ALPHA_I f9 + +#define AORIG loc0 +#define KK loc1 +#define KK8 loc2 +#define OFFSET loc3 +#define AOFFSET2 loc4 +#define BOFFSET2 loc5 + +#ifndef CONJ +#define FCALC_A FSUB +#define FCALC_B FADD +#define FMA_A FNMA +#define FMA_B FMA +#else +#define FCALC_A FADD +#define FCALC_B FSUB +#define FMA_A FMA +#define FMA_B FNMA +#endif + +#ifndef CONJ +#define FCALC_C FMA +#define FCALC_D FNMA +#else +#define FCALC_C FNMA +#define FCALC_D FMA +#endif + +#ifndef CONJ +#define FMA_C FNMA +#define FMA_D FMA +#define FSUB_A FSUB +#else +#define FMA_C FMA +#define FMA_D FMS +#define FSUB_A FADD +#endif + + + PROLOGUE + .prologue + PROFCODE + + { .mfi + .save ar.pfs, ARPFS + alloc ARPFS = ar.pfs, 8, 8, 0, 0 + mov f64 = f0 + adds r14 = 16, SP + } + { .mfi + nop __LINE__ + mov f65 = f0 + adds r15 = 24, SP + } + ;; + { .mfi + ld8 LDC = [r14] + mov f81 = f0 + mov PR = pr + } + { .mfi + ld8 OFFSET = [r15] + mov f96 = f0 + } + ;; + { .mfi + shladd LDC = LDC, ZBASE_SHIFT, r0 + mov f97 = f0 + } + { .mfi + nop __LINE__ + mov f113 = f0 + } + ;; +#ifdef LN + { .mmi + setf.sig f32 = M + setf.sig f33 = K + shladd C = M, ZBASE_SHIFT, C + } + ;; + {.mmf + nop __LINE__ + nop __LINE__ + xmpy.l f32 = f32, f33 + } + ;; + { .mmi + getf.sig r2 = f32 + ;; + nop __LINE__ + shladd A = r2, ZBASE_SHIFT, A + } + ;; +#endif + +#ifdef RN + sub KK = r0, OFFSET +#endif + +#ifdef RT + { .mmi + setf.sig f32 = N + setf.sig f33 = K + nop __LINE__ + } + ;; + { .mmi + setf.sig f34 = LDC + nop __LINE__ + nop __LINE__ + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + xmpy.l f33 = f32, f33 + } + { .mmf + nop __LINE__ + sub KK = N, OFFSET + xmpy.l f34 = f32, f34 + } + ;; + { .mmi + getf.sig r2 = f33 + getf.sig r3 = f34 + } + ;; + shladd B = r2, ZBASE_SHIFT, B + add C = r3, C +#endif + ;; + .body + { .mfi + nop __LINE__ + mov f80 = f0 + mov ARLC = ar.lc + } + { .mfb + mov f112 = f0 + } + ;; + ;; + shr I = M, 2 + tbit.z p6, p0 = N, 0 + (p6) br.cond.dpnt .L050 + ;; + +#ifdef RT + { .mmi + shl r2 = K, ZBASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, LDC + nop __LINE__ + } + ;; +#endif + mov C1 = C + +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + ;; +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + { .mib + cmp.eq p6, p7 = 0, I +#ifndef RT + add C = LDC, C +#else + nop __LINE__ +#endif + (p6) br.cond.dpnt .L100 + } + ;; + .align 16 + +.L092: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + add BOFFSET = r3, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + ;; + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + adds L = 1, L + ;; + { .mfi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + } + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + shr L = L, 1 + } + ;; + { .mfi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds L = -1, L + } + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + CPREFETCH [PREC] + } + ;; + { .mfi + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + mov ar.lc = L + } + { .mmi + adds C5 = 4 * SIZE, C1 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L098 + ;; + .align 16 + +.L093: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f80 = f34, f48, f80 // A3 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f81 = f34, f49, f81 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f36, f48, f96 // A5 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f97 = f36, f49, f97 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f38, f48, f112 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f38, f49, f113 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f81 = f35, f48, f81 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f35, f49, f80 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f97 = f37, f48, f97 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f37, f49, f96 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f113 = f39, f48, f113 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f39, f49, f112 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f42, f56, f80 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f42, f57, f81 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f96 = f44, f56, f96 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f44, f57, f97 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f112 = f46, f56, f112 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f46, f57, f113 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f43, f56, f81 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f43, f57, f80 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f45, f56, f97 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f45, f57, f96 // A6 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f113 = f47, f56, f113 // A8 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f47, f57, f112 // A8 * B2 + br.cloop.sptk.few .L093 + } + ;; +.L098: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + add BOFFSET = r2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB_A f81 = f75, f81 + FSUB f96 = f88, f96 + FSUB_A f97 = f89, f97 + FSUB f112 = f90, f112 + FSUB_A f113 = f91, f113 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB f81 = f75, f81 + FSUB f96 = f88, f96 + FSUB f97 = f89, f97 + FSUB f112 = f90, f112 + FSUB f113 = f91, f113 + ;; +#endif + +#ifdef LN + adds AOFFSET = 30 * SIZE, AOFFSET + ;; + LDFPD f72, f73 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f76, f77 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f78, f79 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f88, f89 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f92, f93 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f72, f112 + FMPY f33 = f73, f112 + ;; + FMA_C f112 = f73, f113, f32 + FMA_D f113 = f72, f113, f33 + ;; + FNMA f96 = f74, f112, f96 + FMA_A f97 = f75, f112, f97 + FNMA f80 = f76, f112, f80 + FMA_A f81 = f77, f112, f81 + FNMA f64 = f78, f112, f64 + FMA_A f65 = f79, f112, f65 + ;; + FMA_B f96 = f75, f113, f96 + FNMA f97 = f74, f113, f97 + FMA_B f80 = f77, f113, f80 + FNMA f81 = f76, f113, f81 + FMA_B f64 = f79, f113, f64 + FNMA f65 = f78, f113, f65 + ;; + FMPY f32 = f88, f96 + FMPY f33 = f89, f96 + ;; + FMA_C f96 = f89, f97, f32 + FMA_D f97 = f88, f97, f33 + ;; + FNMA f80 = f90, f96, f80 + FMA_A f81 = f91, f96, f81 + FNMA f64 = f92, f96, f64 + FMA_A f65 = f93, f96, f65 + ;; + FMA_B f80 = f91, f97, f80 + FNMA f81 = f90, f97, f81 + FMA_B f64 = f93, f97, f64 + FNMA f65 = f92, f97, f65 + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f108, f109 = [AOFFSET], 2 * SIZE + ;; + LDFPD f110, f111 = [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f126, f127 = [AOFFSET] + adds AOFFSET = - 30 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + FNMA f96 = f76, f64, f96 + FMA_A f97 = f77, f64, f97 + FNMA f112 = f78, f64, f112 + FMA_A f113 = f79, f64, f113 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + FMA_B f96 = f77, f65, f96 + FNMA f97 = f76, f65, f97 + FMA_B f112 = f79, f65, f112 + FNMA f113 = f78, f65, f113 + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + ;; + FNMA f96 = f92, f80, f96 + FMA_A f97 = f93, f80, f97 + FNMA f112 = f94, f80, f112 + FMA_A f113 = f95, f80, f113 + ;; + FMA_B f96 = f93, f81, f96 + FNMA f97 = f92, f81, f97 + FMA_B f112 = f95, f81, f112 + FNMA f113 = f94, f81, f113 + ;; + FMPY f32 = f108, f96 + FMPY f33 = f109, f96 + ;; + FMA_C f96 = f109, f97, f32 + FMA_D f97 = f108, f97, f33 + ;; + FNMA f112 = f110, f96, f112 + FMA_A f113 = f111, f96, f113 + ;; + FMA_B f112 = f111, f97, f112 + FNMA f113 = f110, f97, f113 + ;; + FMPY f32 = f126, f112 + FMPY f33 = f127, f112 + ;; + FMA_C f112 = f127, f113, f32 + FMA_D f113 = f126, f113, f33 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + FMPY f38 = f72, f112 + FMPY f39 = f73, f112 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + FMA_C f112 = f73, f113, f38 + FMA_D f113 = f72, f113, f39 + ;; +#endif + +#ifdef RT + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + FMPY f38 = f72, f112 + FMPY f39 = f73, f112 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + FMA_C f112 = f73, f113, f38 + FMA_D f113 = f72, f113, f39 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f96, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f97, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f81, 5 * SIZE + STFD [AOFFSET2] = f113, 5 * SIZE + ;; + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -8 * SIZE, C1 + adds C5 = -8 * SIZE, C5 +#endif + ;; + STFD [C1 ] = f64, SIZE + STFD [C5 ] = f96, SIZE + ;; + STFD [C1 ] = f65, SIZE + STFD [C5 ] = f97, SIZE + ;; + STFD [C1 ] = f80, SIZE + STFD [C5 ] = f112, SIZE + ;; + STFD [C1 ] = f81, 5 * SIZE + STFD [C5 ] = f113, 5 * SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -8 * SIZE, C1 + adds C5 = -8 * SIZE, C5 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 2, AOFFSET + add BOFFSET = L, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 4, KK +#elif defined LN + adds KK = -4, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + (p6) br.cond.dptk .L092 + ;; + .align 16 + +.L100: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L110 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + add BOFFSET = r3, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mii + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L108 + ;; + .align 16 + +.L102: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f80 = f32, f49, f80 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfb + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f81 = f33, f49, f81 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f34, f48, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f112 = f34, f49, f112 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f97 = f35, f48, f97 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f113 = f35, f49, f113 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f80 = f40, f57, f80 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f57, f81 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f42, f56, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f112 = f42, f57, f112 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f97 = f43, f56, f97 // A4 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f113 = f43, f57, f113 // A4 * B2 + br.cloop.sptk.few .L102 + } + ;; + { .mfb + nop __LINE__ + FCALC_A f64 = f64, f81 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f65 = f65, f80 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_A f96 = f96, f113 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f97 = f97, f112 + nop __LINE__ + } + ;; +.L108: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + add BOFFSET = r2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f96 = f88, f96 + FSUB_A f97 = f89, f97 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f96 = f88, f96 + FSUB f97 = f89, f97 + ;; +#endif + +#ifdef LN + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f104, f96 + FMPY f33 = f105, f96 + ;; + FMA_C f96 = f105, f97, f32 + FMA_D f97 = f104, f97, f33 + ;; + FNMA f64 = f106, f96, f64 + FMA_A f65 = f107, f96, f65 + ;; + FMA_B f64 = f107, f97, f64 + FNMA f65 = f106, f97, f65 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; + FNMA f96 = f74, f64, f96 + FMA_A f97 = f75, f64, f97 + ;; + FMA_B f96 = f75, f65, f96 + FNMA f97 = f74, f65, f97 + ;; + FMPY f32 = f90, f96 + FMPY f33 = f91, f96 + ;; + FMA_C f96 = f91, f97, f32 + FMA_D f97 = f90, f97, f33 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + ;; +#endif + +#ifdef RT + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + ;; +#endif + +#if defined(LN) || defined(LT) + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f96, SIZE + ;; + STFD [BOFFSET] = f97, SIZE + ;; + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f96, SIZE + ;; + STFD [AOFFSET] = f97, SIZE + ;; + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C5 = -4 * SIZE, C5 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C1 ] = f96, SIZE + ;; + STFD [C1 ] = f97, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C5 = -4 * SIZE, C5 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 1, AOFFSET + add BOFFSET = L, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L110: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L119 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + add BOFFSET = r3, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + add AOFFSET = r3, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mii + nop __LINE__ + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + cmp.eq p3, p0 = r0, r0 + adds L = -1, L + } + ;; + { .mmi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L118 + ;; + .align 16 + +.L112: + { .mfi + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f80 = f32, f49, f80 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f81 = f33, f49, f81 // A2 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f80 = f40, f57, f80 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + adds L = -1, L + } + { .mfb + (p3) FMA f81 = f41, f57, f81 // A2 * B2 + br.cloop.sptk.few .L112 + } + ;; + { .mfb + nop __LINE__ + FCALC_A f64 = f64, f81 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f65 = f65, f80 + nop __LINE__ + } + ;; +.L118: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + add BOFFSET = r2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET] + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + ;; +#else + LDFPD f72, f73 = [AOFFSET] + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + ;; +#endif + +#ifdef LN + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; +#endif + +#ifdef RT + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; +#endif + +#if defined(LN) || defined(LT) + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; +#else + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -2 * SIZE, C1 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + ;; +#ifdef LN + adds C1 = -2 * SIZE, C1 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + add AOFFSET = L, AOFFSET + add BOFFSET = L, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + .align 16 + +.L119: +#ifdef LN + shladd KK8 = K, ZBASE_SHIFT, r0 + ;; + add B = KK8, B +#endif + +#if defined(LT) || defined(RN) + mov B = BOFFSET +#endif + +#ifdef RN + adds KK = 1, KK +#endif + +#ifdef RT + adds KK = -1, KK +#endif + ;; + { .mmi + mov AOFFSET = A + nop __LINE__ + } + ;; + .align 16 + +.L050: + { .mmi + shr I = M, 2 + } + { .mib + tbit.z p6, p0 = N, 1 + (p6) br.cond.dpnt .L010 + } + ;; + +#ifdef RT + { .mmi + shladd r3 = LDC, 1, r0 + nop __LINE__ + shl r2 = K, 1 + ZBASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, r3 + nop __LINE__ + } + ;; +#endif + + mov C1 = C + add C2 = LDC, C + ;; +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + ;; +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + { .mib + cmp.eq p6, p7 = 0, I +#ifndef RT + shladd C = LDC, 1, C +#else + nop __LINE__ +#endif + (p6) br.cond.dpnt .L060 + } + ;; + .align 16 + +.L052: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 + nop __LINE__ + } + ;; + { .mfi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f98 = f0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + { .mfi + cmp.eq p3, p0 = r0, r0 + mov f99 = f0 + adds L = 1, L + } + ;; + { .mfi + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + mov f114 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + CPREFETCH [PREC], LDC + mov f115 = f0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + adds C5 = 4 * SIZE, C1 + adds L = -1, L + } + ;; + { .mmi + CPREFETCH [PREC], LDC + adds C6 = 4 * SIZE, C2 + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L058 + ;; + .align 16 + +.L053: + { .mfb + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f96 = f34, f48, f96 // A3 * B1 + nop __LINE__ + } + { .mfi + FMA_B f97 = f34, f49, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f112 = f34, f50, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f34, f51, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f97 = f35, f48, f97 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f35, f49, f96 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f113 = f35, f50, f113 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f35, f51, f112 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f66 = f36, f48, f66 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f67 = f36, f49, f67 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f82 = f36, f50, f82 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f83 = f36, f51, f83 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f98 = f38, f48, f98 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f99 = f38, f49, f99 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f114 = f38, f50, f114 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f115 = f38, f51, f115 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f67 = f37, f48, f67 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f66 = f37, f49, f66 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f83 = f37, f50, f83 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f82 = f37, f51, f82 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f99 = f39, f48, f99 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f98 = f39, f49, f98 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f115 = f39, f50, f115 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f114 = f39, f51, f114 // A8 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f96 = f42, f56, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f42, f57, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f112 = f42, f58, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f42, f59, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f43, f56, f97 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f43, f57, f96 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f113 = f43, f58, f113 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f43, f59, f112 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f44, f56, f66 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f67 = f44, f57, f67 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f44, f58, f82 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f83 = f44, f59, f83 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f98 = f46, f56, f98 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f99 = f46, f57, f99 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f114 = f46, f58, f114 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f115 = f46, f59, f115 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f45, f56, f67 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f66 = f45, f57, f66 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f83 = f45, f58, f83 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f82 = f45, f59, f82 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f99 = f47, f56, f99 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f98 = f47, f57, f98 // A8 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f115 = f47, f58, f115 // A8 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f114 = f47, f59, f114 // A8 * B4 + br.cloop.sptk.few .L053 + } + ;; +.L058: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [BOFFSET], 2 * SIZE + ;; + LDFPD f106, f107 = [BOFFSET], 2 * SIZE + ;; + LDFPD f120, f121 = [BOFFSET], 2 * SIZE + ;; + LDFPD f122, f123 = [BOFFSET] + adds BOFFSET = -14 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB_A f81 = f75, f81 + FSUB f96 = f88, f96 + FSUB_A f97 = f89, f97 + FSUB f112 = f90, f112 + FSUB_A f113 = f91, f113 + + FSUB f66 = f104, f66 + FSUB_A f67 = f105, f67 + FSUB f82 = f106, f82 + FSUB_A f83 = f107, f83 + FSUB f98 = f120, f98 + FSUB_A f99 = f121, f99 + FSUB f114 = f122, f114 + FSUB_A f115 = f123, f115 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [AOFFSET] + adds AOFFSET = -14 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f96 = f74, f96 + FSUB f97 = f75, f97 + + FSUB f66 = f76, f66 + FSUB f67 = f77, f67 + FSUB f98 = f78, f98 + FSUB f99 = f79, f99 + + FSUB f80 = f88, f80 + FSUB f81 = f89, f81 + FSUB f112 = f90, f112 + FSUB f113 = f91, f113 + + FSUB f82 = f92, f82 + FSUB f83 = f93, f83 + FSUB f114 = f94, f114 + FSUB f115 = f95, f115 + ;; +#endif + +#ifdef LN + adds AOFFSET = 30 * SIZE, AOFFSET + ;; + LDFPD f72, f73 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f76, f77 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f78, f79 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f88, f89 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f92, f93 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f72, f98 + FMPY f33 = f73, f98 + FMPY f34 = f72, f114 + FMPY f35 = f73, f114 + ;; + FMA_C f98 = f73, f99, f32 + FMA_D f99 = f72, f99, f33 + FMA_C f114 = f73, f115, f34 + FMA_D f115 = f72, f115, f35 + ;; + FNMA f66 = f74, f98, f66 + FMA_A f67 = f75, f98, f67 + FNMA f82 = f74, f114, f82 + FMA_A f83 = f75, f114, f83 + ;; + FMA_B f66 = f75, f99, f66 + FNMA f67 = f74, f99, f67 + FMA_B f82 = f75, f115, f82 + FNMA f83 = f74, f115, f83 + ;; + FNMA f96 = f76, f98, f96 + FMA_A f97 = f77, f98, f97 + FNMA f112 = f76, f114, f112 + FMA_A f113 = f77, f114, f113 + ;; + FMA_B f96 = f77, f99, f96 + FNMA f97 = f76, f99, f97 + FMA_B f112 = f77, f115, f112 + FNMA f113 = f76, f115, f113 + ;; + FNMA f64 = f78, f98, f64 + FMA_A f65 = f79, f98, f65 + FNMA f80 = f78, f114, f80 + FMA_A f81 = f79, f114, f81 + ;; + FMA_B f64 = f79, f99, f64 + FNMA f65 = f78, f99, f65 + FMA_B f80 = f79, f115, f80 + FNMA f81 = f78, f115, f81 + ;; + FMPY f32 = f88, f66 + FMPY f33 = f89, f66 + FMPY f34 = f88, f82 + FMPY f35 = f89, f82 + ;; + FMA_C f66 = f89, f67, f32 + FMA_D f67 = f88, f67, f33 + FMA_C f82 = f89, f83, f34 + FMA_D f83 = f88, f83, f35 + ;; + FNMA f96 = f90, f66, f96 + FMA_A f97 = f91, f66, f97 + FNMA f112 = f90, f82, f112 + FMA_A f113 = f91, f82, f113 + ;; + FMA_B f96 = f91, f67, f96 + FNMA f97 = f90, f67, f97 + FMA_B f112 = f91, f83, f112 + FNMA f113 = f90, f83, f113 + ;; + FNMA f64 = f92, f66, f64 + FMA_A f65 = f93, f66, f65 + FNMA f80 = f92, f82, f80 + FMA_A f81 = f93, f82, f81 + ;; + FMA_B f64 = f93, f67, f64 + FNMA f65 = f92, f67, f65 + FMA_B f80 = f93, f83, f80 + FNMA f81 = f92, f83, f81 + ;; + FMPY f32 = f104, f96 + FMPY f33 = f105, f96 + FMPY f34 = f104, f112 + FMPY f35 = f105, f112 + ;; + FMA_C f96 = f105, f97, f32 + FMA_D f97 = f104, f97, f33 + FMA_C f112 = f105, f113, f34 + FMA_D f113 = f104, f113, f35 + ;; + FNMA f64 = f106, f96, f64 + FMA_A f65 = f107, f96, f65 + FNMA f80 = f106, f112, f80 + FMA_A f81 = f107, f112, f81 + ;; + FMA_B f64 = f107, f97, f64 + FNMA f65 = f106, f97, f65 + FMA_B f80 = f107, f113, f80 + FNMA f81 = f106, f113, f81 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f108, f109 = [AOFFSET], 2 * SIZE + ;; + LDFPD f110, f111 = [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f126, f127 = [AOFFSET] + adds AOFFSET = - 30 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + ;; + FNMA f96 = f74, f64, f96 + FMA_A f97 = f75, f64, f97 + FNMA f112 = f74, f80, f112 + FMA_A f113 = f75, f80, f113 + ;; + FMA_B f96 = f75, f65, f96 + FNMA f97 = f74, f65, f97 + FMA_B f112 = f75, f81, f112 + FNMA f113 = f74, f81, f113 + ;; + FNMA f66 = f76, f64, f66 + FMA_A f67 = f77, f64, f67 + FNMA f82 = f76, f80, f82 + FMA_A f83 = f77, f80, f83 + ;; + FMA_B f66 = f77, f65, f66 + FNMA f67 = f76, f65, f67 + FMA_B f82 = f77, f81, f82 + FNMA f83 = f76, f81, f83 + ;; + FNMA f98 = f78, f64, f98 + FMA_A f99 = f79, f64, f99 + FNMA f114 = f78, f80, f114 + FMA_A f115 = f79, f80, f115 + ;; + FMA_B f98 = f79, f65, f98 + FNMA f99 = f78, f65, f99 + FMA_B f114 = f79, f81, f114 + FNMA f115 = f78, f81, f115 + ;; + FMPY f32 = f90, f96 + FMPY f33 = f91, f96 + FMPY f34 = f90, f112 + FMPY f35 = f91, f112 + ;; + FMA_C f96 = f91, f97, f32 + FMA_D f97 = f90, f97, f33 + FMA_C f112 = f91, f113, f34 + FMA_D f113 = f90, f113, f35 + ;; + FNMA f66 = f92, f96, f66 + FMA_A f67 = f93, f96, f67 + FNMA f82 = f92, f112, f82 + FMA_A f83 = f93, f112, f83 + ;; + FMA_B f66 = f93, f97, f66 + FNMA f67 = f92, f97, f67 + FMA_B f82 = f93, f113, f82 + FNMA f83 = f92, f113, f83 + ;; + FNMA f98 = f94, f96, f98 + FMA_A f99 = f95, f96, f99 + FNMA f114 = f94, f112, f114 + FMA_A f115 = f95, f112, f115 + ;; + FMA_B f98 = f95, f97, f98 + FNMA f99 = f94, f97, f99 + FMA_B f114 = f95, f113, f114 + FNMA f115 = f94, f113, f115 + ;; + FMPY f32 = f108, f66 + FMPY f33 = f109, f66 + FMPY f34 = f108, f82 + FMPY f35 = f109, f82 + ;; + FMA_C f66 = f109, f67, f32 + FMA_D f67 = f108, f67, f33 + FMA_C f82 = f109, f83, f34 + FMA_D f83 = f108, f83, f35 + ;; + FNMA f98 = f110, f66, f98 + FMA_A f99 = f111, f66, f99 + FNMA f114 = f110, f82, f114 + FMA_A f115 = f111, f82, f115 + ;; + FMA_B f98 = f111, f67, f98 + FNMA f99 = f110, f67, f99 + FMA_B f114 = f111, f83, f114 + FNMA f115 = f110, f83, f115 + ;; + FMPY f32 = f126, f98 + FMPY f33 = f127, f98 + FMPY f34 = f126, f114 + FMPY f35 = f127, f114 + ;; + FMA_C f98 = f127, f99, f32 + FMA_D f99 = f126, f99, f33 + FMA_C f114 = f127, f115, f34 + FMA_D f115 = f126, f115, f35 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f96 + FMPY f35 = f73, f96 + FMPY f36 = f72, f66 + FMPY f37 = f73, f66 + FMPY f38 = f72, f98 + FMPY f39 = f73, f98 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f96 = f73, f97, f34 + FMA_D f97 = f72, f97, f35 + FMA_C f66 = f73, f67, f36 + FMA_D f67 = f72, f67, f37 + FMA_C f98 = f73, f99, f38 + FMA_D f99 = f72, f99, f39 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + FNMA f112 = f74, f96, f112 + FMA_A f113 = f75, f96, f113 + FNMA f82 = f74, f66, f82 + FMA_A f83 = f75, f66, f83 + FNMA f114 = f74, f98, f114 + FMA_A f115 = f75, f98, f115 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + FMA_B f112 = f75, f97, f112 + FNMA f113 = f74, f97, f113 + FMA_B f82 = f75, f67, f82 + FNMA f83 = f74, f67, f83 + FMA_B f114 = f75, f99, f114 + FNMA f115 = f74, f99, f115 + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + FMPY f34 = f90, f112 + FMPY f35 = f91, f112 + FMPY f36 = f90, f82 + FMPY f37 = f91, f82 + FMPY f38 = f90, f114 + FMPY f39 = f91, f114 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + FMA_C f112 = f91, f113, f34 + FMA_D f113 = f90, f113, f35 + FMA_C f82 = f91, f83, f36 + FMA_D f83 = f90, f83, f37 + FMA_C f114 = f91, f115, f38 + FMA_D f115 = f90, f115, f39 + ;; +#endif + +#ifdef RT + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + FMPY f34 = f104, f112 + FMPY f35 = f105, f112 + FMPY f36 = f104, f82 + FMPY f37 = f105, f82 + FMPY f38 = f104, f114 + FMPY f39 = f105, f114 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + FMA_C f112 = f105, f113, f34 + FMA_D f113 = f104, f113, f35 + FMA_C f82 = f105, f83, f36 + FMA_D f83 = f104, f83, f37 + FMA_C f114 = f105, f115, f38 + FMA_D f115 = f104, f115, f39 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + FNMA f96 = f106, f112, f96 + FMA_A f97 = f107, f112, f97 + FNMA f66 = f106, f82, f66 + FMA_A f67 = f107, f82, f67 + FNMA f98 = f106, f114, f98 + FMA_A f99 = f107, f114, f99 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + FMA_B f96 = f107, f113, f96 + FNMA f97 = f106, f113, f97 + FMA_B f66 = f107, f83, f66 + FNMA f67 = f106, f83, f67 + FMA_B f98 = f107, f115, f98 + FNMA f99 = f106, f115, f99 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f96 + FMPY f35 = f121, f96 + FMPY f36 = f120, f66 + FMPY f37 = f121, f66 + FMPY f38 = f120, f98 + FMPY f39 = f121, f98 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f96 = f121, f97, f34 + FMA_D f97 = f120, f97, f35 + FMA_C f66 = f121, f67, f36 + FMA_D f67 = f120, f67, f37 + FMA_C f98 = f121, f99, f38 + FMA_D f99 = f120, f99, f39 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f98, SIZE + ;; + STFD [BOFFSET] = f67, SIZE + STFD [BOFFSET2] = f99, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f114, SIZE + ;; + STFD [BOFFSET] = f83, 5 * SIZE + STFD [BOFFSET2] = f115, 5 * SIZE + ;; + adds BOFFSET = - 16 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f66, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f67, SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f98, SIZE + ;; + STFD [AOFFSET] = f97, 5 * SIZE + STFD [AOFFSET2] = f99, 5 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f82, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f83, SIZE + ;; + STFD [AOFFSET] = f112, SIZE + STFD [AOFFSET2] = f114, SIZE + ;; + STFD [AOFFSET] = f113, 5 * SIZE + STFD [AOFFSET2] = f115, 5 * SIZE + ;; + adds AOFFSET = - 16 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -8 * SIZE, C1 + adds C2 = -8 * SIZE, C2 + adds C5 = -8 * SIZE, C5 + adds C6 = -8 * SIZE, C6 +#endif + ;; + STFD [C1 ] = f64, SIZE + STFD [C5 ] = f66, SIZE + ;; + STFD [C1 ] = f65, SIZE + STFD [C5 ] = f67, SIZE + ;; + STFD [C1 ] = f96, SIZE + STFD [C5 ] = f98, SIZE + ;; + STFD [C1 ] = f97, 5 * SIZE + STFD [C5 ] = f99, 5 * SIZE + ;; + STFD [C2 ] = f80, SIZE + STFD [C6 ] = f82, SIZE + ;; + STFD [C2 ] = f81, SIZE + STFD [C6 ] = f83, SIZE + ;; + STFD [C2 ] = f112, SIZE + STFD [C6 ] = f114, SIZE + ;; + STFD [C2 ] = f113, 5 * SIZE + STFD [C6 ] = f115, 5 * SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -8 * SIZE, C1 + adds C2 = -8 * SIZE, C2 + adds C5 = -8 * SIZE, C5 + adds C6 = -8 * SIZE, C6 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 2, AOFFSET + shladd BOFFSET = L, 1, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 4, KK +#elif defined LN + adds KK = -4, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + (p6) br.cond.dptk .L052 + ;; + .align 16 + +.L060: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L070 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mmi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + tbit.z p12, p0 = L, 0 + } + { .mmi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + nop __LINE__ + nop __LINE__ + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L068 + ;; + .align 16 + +.L062: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfb + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f34, f48, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f97 = f34, f49, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f34, f50, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f34, f51, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f97 = f35, f48, f97 // A4 * B1 + } + { .mfb + FMA_A f96 = f35, f49, f96 // A4 * B2 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f113 = f35, f50, f113 // A4 * B3 + nop __LINE__ + } + { .mfb + FMA_A f112 = f35, f51, f112 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f96 = f42, f56, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f42, f57, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f42, f58, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f42, f59, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f43, f56, f97 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f43, f57, f96 // A4 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f113 = f43, f58, f113 // A4 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f43, f59, f112 // A4 * B4 + br.cloop.sptk.few .L062 + } + ;; +.L068: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB_A f81 = f75, f81 + FSUB f96 = f88, f96 + FSUB_A f97 = f89, f97 + FSUB f112 = f90, f112 + FSUB_A f113 = f91, f113 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f96 = f74, f96 + FSUB f97 = f75, f97 + + FSUB f80 = f88, f80 + FSUB f81 = f89, f81 + FSUB f112 = f90, f112 + FSUB f113 = f91, f113 + ;; +#endif + +#ifdef LN + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f104, f96 + FMPY f33 = f105, f96 + FMPY f34 = f104, f112 + FMPY f35 = f105, f112 + ;; + FMA_C f96 = f105, f97, f32 + FMA_D f97 = f104, f97, f33 + FMA_C f112 = f105, f113, f34 + FMA_D f113 = f104, f113, f35 + ;; + FNMA f64 = f106, f96, f64 + FMA_A f65 = f107, f96, f65 + FNMA f80 = f106, f112, f80 + FMA_A f81 = f107, f112, f81 + ;; + FMA_B f64 = f107, f97, f64 + FNMA f65 = f106, f97, f65 + FMA_B f80 = f107, f113, f80 + FNMA f81 = f106, f113, f81 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + ;; + FNMA f96 = f74, f64, f96 + FMA_A f97 = f75, f64, f97 + FNMA f112 = f74, f80, f112 + FMA_A f113 = f75, f80, f113 + ;; + FMA_B f96 = f75, f65, f96 + FNMA f97 = f74, f65, f97 + FMA_B f112 = f75, f81, f112 + FNMA f113 = f74, f81, f113 + ;; + FMPY f32 = f90, f96 + FMPY f33 = f91, f96 + FMPY f34 = f90, f112 + FMPY f35 = f91, f112 + ;; + FMA_C f96 = f91, f97, f32 + FMA_D f97 = f90, f97, f33 + FMA_C f112 = f91, f113, f34 + FMA_D f113 = f90, f113, f35 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f96 + FMPY f35 = f73, f96 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f96 = f73, f97, f34 + FMA_D f97 = f72, f97, f35 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + FNMA f112 = f74, f96, f112 + FMA_A f113 = f75, f96, f113 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + FMA_B f112 = f75, f97, f112 + FNMA f113 = f74, f97, f113 + + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + FMPY f34 = f90, f112 + FMPY f35 = f91, f112 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + FMA_C f112 = f91, f113, f34 + FMA_D f113 = f90, f113, f35 + ;; +#endif + +#ifdef RT + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + FMPY f34 = f104, f112 + FMPY f35 = f105, f112 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + FMA_C f112 = f105, f113, f34 + FMA_D f113 = f104, f113, f35 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + FNMA f96 = f106, f112, f96 + FMA_A f97 = f107, f112, f97 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + FMA_B f96 = f107, f113, f96 + FNMA f97 = f106, f113, f97 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f96 + FMPY f35 = f121, f96 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f96 = f121, f97, f34 + FMA_D f97 = f120, f97, f35 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f80, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f97, 5 * SIZE + STFD [AOFFSET2] = f113, 5 * SIZE + ;; + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C1 ] = f96, SIZE + ;; + STFD [C1 ] = f97, SIZE + ;; + STFD [C2 ] = f80, SIZE + ;; + STFD [C2 ] = f81, SIZE + ;; + STFD [C2 ] = f112, SIZE + ;; + STFD [C2 ] = f113, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 1, AOFFSET + shladd BOFFSET = L, 1, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L070: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L089 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + add AOFFSET = r3, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mii + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds L = -1, L + } + ;; + { .mmi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L078 + ;; + .align 16 + +.L072: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f96 = f32, f49, f96 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f112 = f32, f51, f112 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mfi + nop __LINE__ + FMA f97 = f33, f49, f97 // A2 * B2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f113 = f33, f51, f113 // A2 * B4 + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f96 = f40, f57, f96 // A1 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f112 = f40, f59, f112 // A1 * B4 + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f97 = f41, f57, f97 // A2 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f59, f113 // A2 * B4 + br.cloop.sptk.few .L072 + } + ;; + { .mfb + nop __LINE__ + FCALC_A f64 = f64, f97 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_A f80 = f80, f113 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f65 = f65, f96 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f81 = f81, f112 + nop __LINE__ + } + ;; +.L078: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB_A f81 = f75, f81 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f80 = f88, f80 + FSUB f81 = f89, f81 + ;; +#endif + +#ifdef LN + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + ;; +#endif + +#ifdef RT + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#if defined(LN) || defined(LT) + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + ;; + STFD [BOFFSET] = f81, SIZE + ;; + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; +#else + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + ;; + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C2 ] = f80, SIZE + ;; + STFD [C2 ] = f81, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + add AOFFSET = L, AOFFSET + shladd BOFFSET = L, 1, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L089: +#ifdef LN + shladd KK8 = K, ZBASE_SHIFT, r0 + ;; + shladd B = KK8, 1, B +#endif + +#if defined(LT) || defined(RN) + mov B = BOFFSET +#endif + +#ifdef RN + adds KK = 2, KK +#endif + +#ifdef RT + adds KK = -2, KK +#endif + ;; + { .mmi + mov AOFFSET = A + nop __LINE__ + } + ;; + .align 16 + +.L010: + shr J = N, 2 + ;; + cmp.ge p6, p0 = 0, J + (p6) br.cond.dpnt .L999 + ;; + +.L010x: +#ifdef RT + { .mmi + shladd r3 = LDC, 2, r0 + nop __LINE__ + shl r2 = K, 2 + ZBASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, r3 + nop __LINE__ + } + ;; +#endif + { .mmi + mov C1 = C // coffset1 = c + 0 * ldc + add C2 = LDC, C // coffset2 = c + 1 * ldc + shr I = M, 2 + } + { .mmi + adds J = -1, J +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + } + ;; + ;; + { .mmi + shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc + shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + { .mib + cmp.eq p6, p7 = 0, I +#ifndef RT + shladd C = LDC, 2, C // coffset += 8 * ldc +#else + nop __LINE__ +#endif + (p6) br.cond.dpnt .L020 + } + ;; + .align 16 + +.L011: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + ZBASE_SHIFT + } + { .mfi + shladd r3 = KK, ZBASE_SHIFT, r0 + mov f118 = f0 + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + ;; + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 + nop __LINE__ + } + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mfi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f98 = f0 + adds L = 1, L + } + { .mfi + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f99 = f0 + adds C5 = 4 * SIZE, C1 + } + ;; + { .mfi + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + mov f114 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f115 = f0 + adds C6 = 4 * SIZE, C2 + } + ;; + { .mfi + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + mov f68 = f0 + shr L = L, 1 + } + { .mfi + setf.d f86 = r0 + mov f69 = f0 + adds C7 = 4 * SIZE, C3 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f84 = f0 + adds L = -1, L + } + { .mfi + setf.d f87 = r0 + mov f85 = f0 + adds C8 = 4 * SIZE, C4 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f100 = f0 + mov ar.lc = L + } + { .mfi + setf.d f102 = r0 + mov f101 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f116 = f0 + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + } + { .mfi + setf.d f103 = r0 + mov f117 = f0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + ;; + { .mfi + CPREFETCH [PREC] + mov f70 = f0 + cmp.eq p6, p0 = -1, L + } + { .mfb + setf.d f119 = r0 + mov f71 = f0 + (p6) br.cond.dpnt .L018 + } + ;; + .align 16 + +.L012: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p12) cmp.ne p3, p0 = 0, L + FMA_B f65 = f32, f49, f65 // A1 * B2 + nop __LINE__ + } + ;; +/* 2 */ + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + cmp.ne p4, p5 = 0, L + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; +/* 3 */ + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + FMA_B f97 = f32, f53, f97 // A1 * B6 + nop __LINE__ + } + ;; +/* 4 */ + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + FMA_B f113 = f32, f55, f113 // A1 * B8 + nop __LINE__ + } + ;; +/* 5 */ + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; +/* 6 */ + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; +/* 7 */ + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + FMA_A f96 = f33, f53, f96 // A2 * B6 + nop __LINE__ + } + ;; +/* 8 */ + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + FMA_A f112 = f33, f55, f112 // A2 * B8 + nop __LINE__ + } + ;; +/* 9 */ + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + FMA_B f67 = f34, f49, f67 // A3 * B2 + nop __LINE__ + } + ;; +/* 10 */ + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + FMA_B f83 = f34, f51, f83 // A3 * B4 + nop __LINE__ + } + ;; +/* 11 */ + { .mfb + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f99 = f34, f53, f99 // A3 * B6 + nop __LINE__ + } + ;; +/* 12 */ + { .mfb + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f115 = f34, f55, f115 // A3 * B8 + nop __LINE__ + } + ;; +/* 13 */ + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfb + nop __LINE__ + FMA_A f66 = f35, f49, f66 // A4 * B2 + nop __LINE__ + } + ;; +/* 14 */ + { .mfb + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f82 = f35, f51, f82 // A4 * B4 + nop __LINE__ + } + ;; +/* 15 */ + { .mfb + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f98 = f35, f53, f98 // A4 * B6 + nop __LINE__ + } + ;; +/* 16 */ + { .mfb + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f114 = f35, f55, f114 // A4 * B8 + nop __LINE__ + } + ;; +/* 17 */ + { .mfb + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f69 = f36, f49, f69 // A5 * B2 + nop __LINE__ + } + ;; +/* 18 */ + { .mfb + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f85 = f36, f51, f85 // A5 * B4 + nop __LINE__ + } + ;; +/* 19 */ + { .mfb + nop __LINE__ + FMA f100 = f36, f52, f100 // A5 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f101 = f36, f53, f101 // A5 * B6 + nop __LINE__ + } + ;; +/* 20 */ + { .mfb + nop __LINE__ + FMA f116 = f36, f54, f116 // A5 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f117 = f36, f55, f117 // A5 * B8 + nop __LINE__ + } + ;; +/* 21 */ + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f68 = f37, f49, f68 // A6 * B2 + nop __LINE__ + } + ;; +/* 22 */ + { .mfb + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f84 = f37, f51, f84 // A6 * B4 + nop __LINE__ + } + ;; +/* 23 */ + { .mfb + nop __LINE__ + FMA f101 = f37, f52, f101 // A6 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f100 = f37, f53, f100 // A6 * B6 + nop __LINE__ + } + ;; +/* 24 */ + { .mfb + nop __LINE__ + FMA f117 = f37, f54, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f116 = f37, f55, f116 // A6 * B8 + nop __LINE__ + } + ;; +/* 25 */ + { .mfb + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f71 = f38, f49, f71 // A7 * B2 + nop __LINE__ + } + ;; +/* 26 */ + { .mfb + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f87 = f38, f51, f87 // A7 * B4 + nop __LINE__ + } + ;; +/* 27 */ + { .mfb + nop __LINE__ + FMA f102 = f38, f52, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f103 = f38, f53, f103 // A7 * B6 + nop __LINE__ + } + ;; +/* 28 */ + { .mfb + nop __LINE__ + FMA f118 = f38, f54, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f119 = f38, f55, f119 // A7 * B8 + nop __LINE__ + } + ;; +/* 29 */ + { .mfb + nop __LINE__ + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f70 = f39, f49, f70 // A8 * B2 + nop __LINE__ + } + ;; +/* 30 */ + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f86 = f39, f51, f86 // A8 * B4 + nop __LINE__ + } + ;; +/* 31 */ + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f103 = f39, f52, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f102 = f39, f53, f102 // A8 * B6 + nop __LINE__ + } + ;; +/* 32 */ + { .mfb + nop __LINE__ + FMA f119 = f39, f54, f119 // A8 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f118 = f39, f55, f118 // A8 * B8 + nop __LINE__ + } + ;; +/* 33 */ + { .mfb + nop __LINE__ + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; +/* 34 */ + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; +/* 35 */ + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 + nop __LINE__ + } + ;; +/* 36 */ + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 + nop __LINE__ + } + ;; +/* 37 */ + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; +/* 38 */ + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; +/* 39 */ + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 + nop __LINE__ + } + ;; +/* 40 */ + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 + nop __LINE__ + } + ;; + /* 41 */ + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f67 = f42, f57, f67 // A3 * B2 + nop __LINE__ + } + ;; +/* 42 */ + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f83 = f42, f59, f83 // A3 * B4 + nop __LINE__ + } + ;; +/* 43 */ + { .mfb + nop __LINE__ + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f99 = f42, f61, f99 // A3 * B6 + nop __LINE__ + } + ;; +/* 44 */ + { .mfb + nop __LINE__ + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f115 = f42, f63, f115 // A3 * B8 + nop __LINE__ + } + ;; +/* 45 */ + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f66 = f43, f57, f66 // A4 * B2 + nop __LINE__ + } + ;; +/* 46 */ + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f82 = f43, f59, f82 // A4 * B4 + nop __LINE__ + } + ;; +/* 47 */ + { .mfb + nop __LINE__ + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f98 = f43, f61, f98 // A4 * B6 + nop __LINE__ + } + ;; +/* 48 */ + { .mfb + nop __LINE__ + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f114 = f43, f63, f114 // A4 * B8 + nop __LINE__ + } + ;; +/* 49 */ + { .mfb + nop __LINE__ + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f69 = f44, f57, f69 // A5 * B2 + nop __LINE__ + } + ;; +/* 50 */ + { .mfb + nop __LINE__ + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f85 = f44, f59, f85 // A5 * B4 + nop __LINE__ + } + ;; +/* 51 */ + { .mfb + nop __LINE__ + (p3) FMA f100 = f44, f60, f100 // A5 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f101 = f44, f61, f101 // A5 * B6 + nop __LINE__ + } + ;; +/* 52 */ + { .mfb + nop __LINE__ + (p3) FMA f116 = f44, f62, f116 // A5 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f117 = f44, f63, f117 // A5 * B8 + nop __LINE__ + } + ;; +/* 53 */ + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f68 = f45, f57, f68 // A6 * B2 + nop __LINE__ + } + ;; +/* 54 */ + { .mfb + nop __LINE__ + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f84 = f45, f59, f84 // A6 * B4 + nop __LINE__ + } + ;; +/* 55 */ + { .mfb + nop __LINE__ + (p3) FMA f101 = f45, f60, f101 // A6 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f100 = f45, f61, f100 // A6 * B6 + nop __LINE__ + } + ;; +/* 56 */ + { .mfb + nop __LINE__ + (p3) FMA f117 = f45, f62, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f116 = f45, f63, f116 // A6 * B8 + nop __LINE__ + } + ;; +/* 57 */ + { .mfb + nop __LINE__ + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f71 = f46, f57, f71 // A7 * B2 + nop __LINE__ + } + ;; +/* 58 */ + { .mfb + nop __LINE__ + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f87 = f46, f59, f87 // A7 * B4 + nop __LINE__ + } + ;; +/* 59 */ + { .mfb + nop __LINE__ + (p3) FMA f102 = f46, f60, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f103 = f46, f61, f103 // A7 * B6 + nop __LINE__ + } + ;; +/* 60 */ + { .mfb + nop __LINE__ + (p3) FMA f118 = f46, f62, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f119 = f46, f63, f119 // A7 * B8 + nop __LINE__ + } + ;; +/* 61 */ + { .mfb + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f70 = f47, f57, f70 // A8 * B2 + nop __LINE__ + } + ;; +/* 62 */ + { .mfb + nop __LINE__ + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f86 = f47, f59, f86 // A8 * B4 + nop __LINE__ + } + ;; +/* 63 */ + { .mfb + nop __LINE__ + (p3) FMA f103 = f47, f60, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f102 = f47, f61, f102 // A8 * B6 + nop __LINE__ + } + ;; +/* 64 */ + { .mfi + nop __LINE__ + (p3) FMA f119 = f47, f62, f119 // A8 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f118 = f47, f63, f118 // A8 * B8 + br.cloop.sptk.few .L012 + } + ;; + +.L018: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [BOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [BOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f94, f95 = [BOFFSET], 2 * SIZE + FSUB f64 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f65 = f73, f65 + nop __LINE__ + } + ;; + { .mfi + LDFPD f104, f105 = [BOFFSET], 2 * SIZE + FSUB f80 = f74, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f81 = f75, f81 + nop __LINE__ + } + ;; + { .mfi + LDFPD f106, f107 = [BOFFSET], 2 * SIZE + FSUB f96 = f76, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f97 = f77, f97 + nop __LINE__ + } + ;; + { .mfi + LDFPD f108, f109 = [BOFFSET], 2 * SIZE + FSUB f112 = f78, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f113 = f79, f113 + nop __LINE__ + } + ;; + { .mfi + LDFPD f110, f111 = [BOFFSET], 2 * SIZE + FSUB f66 = f88, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f67 = f89, f67 + nop __LINE__ + } + ;; + { .mfi + LDFPD f120, f121 = [BOFFSET], 2 * SIZE + FSUB f82 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f83 = f91, f83 + nop __LINE__ + } + ;; + { .mfi + LDFPD f122, f123 = [BOFFSET], 2 * SIZE + FSUB f98 = f92, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f99 = f93, f99 + nop __LINE__ + } + ;; + { .mfi + LDFPD f124, f125 = [BOFFSET], 2 * SIZE + FSUB f114 = f94, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f115 = f95, f115 + nop __LINE__ + } + ;; + { .mfi + LDFPD f126, f127 = [BOFFSET] + FSUB f68 = f104, f68 + adds BOFFSET = -30 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FSUB_A f69 = f105, f69 +#ifdef LN + adds AOFFSET = 30 * SIZE, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + LDFPD f72, f73 = [AOFFSET] + FSUB f84 = f106, f84 +#ifdef LN + adds AOFFSET = - 2 * SIZE, AOFFSET +#else + adds AOFFSET = 2 * SIZE, AOFFSET +#endif + } + { .mfi + nop __LINE__ + FSUB_A f85 = f107, f85 + nop __LINE__ + } + ;; + { .mfi + LDFPD f74, f75 = [AOFFSET] + FSUB f100 = f108, f100 +#ifdef LN + adds AOFFSET = - 2 * SIZE, AOFFSET +#else + adds AOFFSET = 2 * SIZE, AOFFSET +#endif + } + { .mfi + nop __LINE__ + FSUB_A f101 = f109, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f116 = f110, f116 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f117 = f111, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f70 = f120, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f71 = f121, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f86 = f122, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f87 = f123, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f102 = f124, f102 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f103 = f125, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f118 = f126, f118 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f119 = f127, f119 + nop __LINE__ + } + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + FSUB f64 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f65 = f73, f65 + nop __LINE__ + } + ;; + { .mfi + LDFPD f94, f95 = [AOFFSET], 2 * SIZE + FSUB f66 = f74, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f67 = f75, f67 + nop __LINE__ + } + ;; + { .mfi + LDFPD f104, f105 = [AOFFSET], 2 * SIZE + FSUB f68 = f76, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f69 = f77, f69 + nop __LINE__ + } + ;; + { .mfi + LDFPD f106, f107 = [AOFFSET], 2 * SIZE + FSUB f70 = f78, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f71 = f79, f71 + nop __LINE__ + } + ;; + { .mfi + LDFPD f108, f109 = [AOFFSET], 2 * SIZE + FSUB f80 = f88, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f81 = f89, f81 + nop __LINE__ + } + ;; + { .mfi + LDFPD f110, f111 = [AOFFSET], 2 * SIZE + FSUB f82 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f83 = f91, f83 + nop __LINE__ + } + ;; + { .mfi + LDFPD f120, f121 = [AOFFSET], 2 * SIZE + FSUB f84 = f92, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f85 = f93, f85 + nop __LINE__ + } + ;; + { .mfi + LDFPD f122, f123 = [AOFFSET], 2 * SIZE + FSUB f86 = f94, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f87 = f95, f87 + nop __LINE__ + } + ;; + { .mfi + LDFPD f124, f125 = [AOFFSET], 2 * SIZE + FSUB f96 = f104, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f97 = f105, f97 + nop __LINE__ + } + ;; + { .mfi + LDFPD f126, f127 = [AOFFSET] + FSUB f98 = f106, f98 + adds AOFFSET = -30 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FSUB f99 = f107, f99 +#ifdef RT + adds BOFFSET = 30 * SIZE, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + LDFPD f72, f73 = [BOFFSET] + FSUB f100 = f108, f100 +#ifdef RN + adds BOFFSET = 2 * SIZE, BOFFSET +#else + adds BOFFSET = - 2 * SIZE, BOFFSET +#endif + } + { .mfi + nop __LINE__ + FSUB f101 = f109, f101 + nop __LINE__ + } + ;; + { .mfi + LDFPD f74, f75 = [BOFFSET] + FSUB f102 = f110, f102 +#ifdef RN + adds BOFFSET = 2 * SIZE, BOFFSET +#else + adds BOFFSET = - 2 * SIZE, BOFFSET +#endif + } + { .mfi + nop __LINE__ + FSUB f103 = f111, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f112 = f120, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f113 = f121, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f114 = f122, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f115 = f123, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f116 = f124, f116 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f117 = f125, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f118 = f126, f118 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f119 = f127, f119 + nop __LINE__ + } + ;; +#endif + +#ifdef LN + { .mfi + LDFPD f76, f77 = [AOFFSET] + FMPY f32 = f72, f70 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f36 = f72, f102 + nop __LINE__ + } + ;; + { .mfi + LDFPD f78, f79 = [AOFFSET] + FMPY f33 = f73, f70 + adds AOFFSET = - 4 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f37 = f73, f102 + nop __LINE__ + } + ;; + { .mfi + LDFPD f88, f89 = [AOFFSET] + FMPY f34 = f72, f86 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f38 = f72, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f90, f91 = [AOFFSET] + FMPY f35 = f73, f86 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f39 = f73, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f92, f93 = [AOFFSET] + FMA_C f70 = f73, f71, f32 + adds AOFFSET = - 6 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_C f102 = f73, f103, f36 + adds C1 = -2 * SIZE, C1 + } + ;; + { .mfi + LDFPD f104, f105 = [AOFFSET] + FMA_D f71 = f72, f71, f33 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_D f103 = f72, f103, f37 + adds C2 = -2 * SIZE, C2 + } + ;; + { .mfi + LDFPD f106, f107 = [AOFFSET] + FMA_C f86 = f73, f87, f34 + adds AOFFSET = - 8 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_C f118 = f73, f119, f38 + adds C3 = -2 * SIZE, C3 + } + ;; + { .mfi + LDFPD f120, f121 = [AOFFSET] + FMA_D f87 = f72, f87, f35 + adds BOFFSET2 = 28 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_D f119 = f72, f119, f39 + adds BOFFSET = 24 * SIZE, BOFFSET + } + ;; + { .mfi + STFD [BOFFSET] = f70, SIZE + FNMA f68 = f74, f70, f68 + adds C4 = -2 * SIZE, C4 + } + { .mfi + STFD [BOFFSET2] = f102, SIZE + FNMA f100 = f74, f102, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f71, SIZE + FMA_A f69 = f75, f70, f69 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f103, SIZE + FMA_A f101 = f75, f102, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f86, SIZE + FNMA f84 = f74, f86, f84 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f118, SIZE + FNMA f116 = f74, f118, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f87, -11 * SIZE + FMA_A f85 = f75, f86, f85 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f119, -11 * SIZE + FMA_A f117 = f75, f118, f117 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f70, SIZE + FMA_B f68 = f75, f71, f68 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f102, SIZE + FMA_B f100 = f75, f103, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f71, -3 * SIZE + FNMA f69 = f74, f71, f69 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f103, -3 * SIZE + FNMA f101 = f74, f103, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f86, SIZE + FMA_B f84 = f75, f87, f84 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f118, SIZE + FMA_B f116 = f75, f119, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f87, -3 * SIZE + FNMA f85 = f74, f87, f85 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f119, -3 * SIZE + FNMA f117 = f74, f119, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f66 = f76, f70, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f98 = f76, f102, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f67 = f77, f70, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f99 = f77, f102, f99 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f82 = f76, f86, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f114 = f76, f118, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f83 = f77, f86, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f115 = f77, f118, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f66 = f77, f71, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f98 = f77, f103, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f67 = f76, f71, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f99 = f76, f103, f99 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f82 = f77, f87, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f114 = f77, f119, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f83 = f76, f87, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f115 = f76, f119, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f64 = f78, f70, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f96 = f78, f102, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f65 = f79, f70, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f97 = f79, f102, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f80 = f78, f86, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f112 = f78, f118, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f81 = f79, f86, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f113 = f79, f118, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f64 = f79, f71, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f96 = f79, f103, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f65 = f78, f71, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f97 = f78, f103, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f80 = f79, f87, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f112 = f79, f119, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f81 = f78, f87, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f113 = f78, f119, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f88, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f88, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f89, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f89, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f88, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f88, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f89, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f89, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f68 = f89, f69, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f100 = f89, f101, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f69 = f88, f69, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f101 = f88, f101, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f84 = f89, f85, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f116 = f89, f117, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f85 = f88, f85, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f117 = f88, f117, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f68, SIZE + FNMA f66 = f90, f68, f66 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f100, SIZE + FNMA f98 = f90, f100, f98 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f69, SIZE + FMA_A f67 = f91, f68, f67 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f101, SIZE + FMA_A f99 = f91, f100, f99 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f84, SIZE + FNMA f82 = f90, f84, f82 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f116, SIZE + FNMA f114 = f90, f116, f114 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f85, -11 * SIZE + FMA_A f83 = f91, f84, f83 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f117, -11 * SIZE + FMA_A f115 = f91, f116, f115 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f68, SIZE + FMA_B f66 = f91, f69, f66 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f100, SIZE + FMA_B f98 = f91, f101, f98 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f69, -3 * SIZE + FNMA f67 = f90, f69, f67 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f101, -3 * SIZE + FNMA f99 = f90, f101, f99 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f84, SIZE + FMA_B f82 = f91, f85, f82 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f116, SIZE + FMA_B f114 = f91, f117, f114 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f85, -3 * SIZE + FNMA f83 = f90, f85, f83 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f117, -3 * SIZE + FNMA f115 = f90, f117, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f64 = f92, f68, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f96 = f92, f100, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f65 = f93, f68, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f97 = f93, f100, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f80 = f92, f84, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f112 = f92, f116, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f81 = f93, f84, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f113 = f93, f116, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f64 = f93, f69, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f96 = f93, f101, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f65 = f92, f69, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f97 = f92, f101, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f80 = f93, f85, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f112 = f93, f117, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f81 = f92, f85, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f113 = f92, f117, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f104, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f104, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f105, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f105, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f104, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f104, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f105, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f105, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f66 = f105, f67, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f98 = f105, f99, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f67 = f104, f67, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f99 = f104, f99, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f82 = f105, f83, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f114 = f105, f115, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f83 = f104, f83, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f115 = f104, f115, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + FNMA f64 = f106, f66, f64 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + FNMA f96 = f106, f98, f96 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f67, SIZE + FMA_A f65 = f107, f66, f65 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f99, SIZE + FMA_A f97 = f107, f98, f97 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + FNMA f80 = f106, f82, f80 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + FNMA f112 = f106, f114, f112 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f83, -11 * SIZE + FMA_A f81 = f107, f82, f81 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f115, -11 * SIZE + FMA_A f113 = f107, f114, f113 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + FMA_B f64 = f107, f67, f64 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f98, SIZE + FMA_B f96 = f107, f99, f96 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, -3 * SIZE + FNMA f65 = f106, f67, f65 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f99, -3 * SIZE + FNMA f97 = f106, f99, f97 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f82, SIZE + FMA_B f80 = f107, f83, f80 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f114, SIZE + FMA_B f112 = f107, f115, f112 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f83, -3 * SIZE + FNMA f81 = f106, f83, f81 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f115, -3 * SIZE + FNMA f113 = f106, f115, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f120, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f120, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f121, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f121, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f120, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f120, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f121, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f121, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f64 = f121, f65, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f96 = f121, f97, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f65 = f120, f65, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f97 = f120, f97, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f80 = f121, f81, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f112 = f121, f113, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f81 = f120, f81, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f113 = f120, f113, f39 + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f81, -3 * SIZE + STFD [BOFFSET2] = f113, -3 * SIZE + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f96, SIZE + mov f96 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, -1 * SIZE + mov f65 = f0 + adds KK = -4, KK + } + { .mfi + STFD [C3 ] = f97, -1 * SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f80, SIZE + mov f80 = f0 + cmp.ne p6, p0 = 1, I + } + { .mfi + STFD [C4 ] = f112, SIZE + mov f112 = f0 + sub L = K, KK + } + ;; + { .mfi + STFD [C2 ] = f81, -1 * SIZE + mov f81 = f0 + adds I = -1, I + } + { .mfb + STFD [C4 ] = f113, -1 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#endif + +#ifdef LT + { .mfi + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + FMPY f32 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f72, f96 + nop __LINE__ + } + ;; + { .mfi + LDFPD f78, f79 = [AOFFSET] + FMPY f33 = f73, f64 + adds AOFFSET = 4 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f37 = f73, f96 + nop __LINE__ + } + ;; + { .mfi + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + FMPY f34 = f72, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f72, f112 + nop __LINE__ + } + ;; + { .mfi + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + FMPY f35 = f73, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f73, f112 + nop __LINE__ + } + ;; + { .mfi + LDFPD f94, f95 = [AOFFSET] + FMA_C f64 = f73, f65, f32 + adds AOFFSET = 6 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_C f96 = f73, f97, f36 + nop __LINE__ + } + ;; + { .mfi + LDFPD f108, f109 = [AOFFSET], 2 * SIZE + FMA_D f65 = f72, f65, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f97 = f72, f97, f37 + nop __LINE__ + } + ;; + { .mfi + LDFPD f110, f111 = [AOFFSET] + FMA_C f80 = f73, f81, f34 + adds AOFFSET = 8 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_C f112 = f73, f113, f38 + nop __LINE__ + } + ;; + { .mfi + LDFPD f126, f127 = [AOFFSET] + FMA_D f81 = f72, f81, f35 + adds AOFFSET = - 30 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_D f113 = f72, f113, f39 + adds BOFFSET2 = 4 * SIZE, BOFFSET + } + ;; + { .mfi + STFD [BOFFSET] = f64, SIZE + FNMA f66 = f74, f64, f66 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f96, SIZE + FNMA f98 = f74, f96, f98 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + FMA_A f67 = f75, f64, f67 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + FMA_A f99 = f75, f96, f99 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f80, SIZE + FNMA f82 = f74, f80, f82 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f112, SIZE + FNMA f114 = f74, f112, f114 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f81, 5 * SIZE + FMA_A f83 = f75, f80, f83 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f113, 5 * SIZE + FMA_A f115 = f75, f112, f115 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + FMA_B f66 = f75, f65, f66 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f96, SIZE + FMA_B f98 = f75, f97, f98 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + FNMA f67 = f74, f65, f67 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f97, SIZE + FNMA f99 = f74, f97, f99 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f80, SIZE + FMA_B f82 = f75, f81, f82 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f112, SIZE + FMA_B f114 = f75, f113, f114 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f81, SIZE + FNMA f83 = f74, f81, f83 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f113, SIZE + FNMA f115 = f74, f113, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f68 = f76, f64, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f100 = f76, f96, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f69 = f77, f64, f69 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f101 = f77, f96, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f84 = f76, f80, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f116 = f76, f112, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f85 = f77, f80, f85 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f117 = f77, f112, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f68 = f77, f65, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f100 = f77, f97, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f69 = f76, f65, f69 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f101 = f76, f97, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f84 = f77, f81, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f116 = f77, f113, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f85 = f76, f81, f85 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f117 = f76, f113, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f70 = f78, f64, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f102 = f78, f96, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f71 = f79, f64, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f103 = f79, f96, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f86 = f78, f80, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f118 = f78, f112, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f87 = f79, f80, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f119 = f79, f112, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f70 = f79, f65, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f102 = f79, f97, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f71 = f78, f65, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f103 = f78, f97, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f86 = f79, f81, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f118 = f79, f113, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f87 = f78, f81, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f119 = f78, f113, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f90, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f90, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f91, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f91, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f90, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f91, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f91, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f66 = f91, f67, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f98 = f91, f99, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f67 = f90, f67, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f99 = f90, f99, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f82 = f91, f83, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f114 = f91, f115, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f83 = f90, f83, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f115 = f90, f115, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + FNMA f68 = f92, f66, f68 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + FNMA f100 = f92, f98, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f67, SIZE + FMA_A f69 = f93, f66, f69 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f99, SIZE + FMA_A f101 = f93, f98, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + FNMA f84 = f92, f82, f84 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + FNMA f116 = f92, f114, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f83, 5 * SIZE + FMA_A f85 = f93, f82, f85 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f115, 5 * SIZE + FMA_A f117 = f93, f114, f117 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + FMA_B f68 = f93, f67, f68 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f98, SIZE + FMA_B f100 = f93, f99, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, SIZE + FNMA f69 = f92, f67, f69 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f99, SIZE + FNMA f101 = f92, f99, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f82, SIZE + FMA_B f84 = f93, f83, f84 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f114, SIZE + FMA_B f116 = f93, f115, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f83, SIZE + FNMA f85 = f92, f83, f85 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f115, SIZE + FNMA f117 = f92, f115, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f70 = f94, f66, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f102 = f94, f98, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f71 = f95, f66, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f103 = f95, f98, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f86 = f94, f82, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f118 = f94, f114, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f87 = f95, f82, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f119 = f95, f114, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f70 = f95, f67, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f102 = f95, f99, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f71 = f94, f67, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f103 = f94, f99, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f86 = f95, f83, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f118 = f95, f115, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f87 = f94, f83, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f119 = f94, f115, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f108, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f108, f100 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f33 = f109, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f109, f100 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f34 = f108, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f108, f116 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f35 = f109, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f109, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f68 = f109, f69, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f100 = f109, f101, f36 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f69 = f108, f69, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f101 = f108, f101, f37 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f84 = f109, f85, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f116 = f109, f117, f38 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f85 = f108, f85, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f117 = f108, f117, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f68, SIZE + FNMA f70 = f110, f68, f70 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f100, SIZE + FNMA f102 = f110, f100, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f69, SIZE + FMA_A f71 = f111, f68, f71 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f101, SIZE + FMA_A f103 = f111, f100, f103 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f84, SIZE + FNMA f86 = f110, f84, f86 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f116, SIZE + FNMA f118 = f110, f116, f118 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f85, 5 * SIZE + FMA_A f87 = f111, f84, f87 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f117, 5 * SIZE + FMA_A f119 = f111, f116, f119 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f68, SIZE + FMA_B f70 = f111, f69, f70 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f100, SIZE + FMA_B f102 = f111, f101, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f69, SIZE + FNMA f71 = f110, f69, f71 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f101, SIZE + FNMA f103 = f110, f101, f103 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f84, SIZE + FMA_B f86 = f111, f85, f86 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f116, SIZE + FMA_B f118 = f111, f117, f118 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f85, SIZE + FNMA f87 = f110, f85, f87 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f117, SIZE + FNMA f119 = f110, f117, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f126, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f126, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f127, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f127, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f126, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f126, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f127, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f127, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f70 = f127, f71, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f102 = f127, f103, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f71 = f126, f71, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f103 = f126, f103, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f86 = f127, f87, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f118 = f127, f119, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f87 = f126, f87, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f119 = f126, f119, f39 + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f70, SIZE + STFD [BOFFSET2] = f102, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f71, SIZE + STFD [BOFFSET2] = f103, SIZE + sub r2 = K, KK + } + ;; + { .mmi + STFD [BOFFSET] = f86, SIZE + STFD [BOFFSET2] = f118, SIZE + adds KK = 4, KK + } + ;; + { .mmi + STFD [BOFFSET] = f87, -27 * SIZE + STFD [BOFFSET2] = f119 + shladd r2 = r2, ZBASE_SHIFT, r0 + } + ;; + { .mfi + STFD [C1 ] = f70, SIZE + mov f64 = f0 + shladd AOFFSET = r2, 2, AOFFSET + } + { .mfi + STFD [C3 ] = f102, SIZE + mov f65 = f0 + shladd BOFFSET = r2, 2, BOFFSET + } + ;; + { .mfi + STFD [C1 ] = f71, SIZE + mov f80 = f0 + mov L = KK + } + { .mfi + STFD [C3 ] = f103, SIZE + mov f81 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f86, SIZE + mov f96 = f0 + cmp.ne p6, p0 = 1, I + } + { .mfi + STFD [C4 ] = f118, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f87, SIZE + mov f112 = f0 + adds I = -1, I + } + { .mfb + STFD [C4 ] = f119, SIZE + mov f113 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#endif + +#ifdef RN + { .mfi + LDFPD f76, f77 = [BOFFSET], 2 * SIZE + FMPY f32 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f72, f68 + nop __LINE__ + } + ;; + { .mfi + LDFPD f78, f79 = [BOFFSET] + FMPY f33 = f73, f64 + adds BOFFSET = 4 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f37 = f73, f68 + nop __LINE__ + } + ;; + { .mfi + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + FMPY f34 = f72, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f72, f70 + nop __LINE__ + } + ;; + { .mfi + LDFPD f92, f93 = [BOFFSET], 2 * SIZE + FMPY f35 = f73, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f73, f70 + nop __LINE__ + } + ;; + { .mfi + LDFPD f94, f95 = [BOFFSET] + FMA_C f64 = f73, f65, f32 + adds BOFFSET = 6 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_C f68 = f73, f69, f36 + nop __LINE__ + } + ;; + { .mfi + LDFPD f108, f109 = [BOFFSET], 2 * SIZE + FMA_D f65 = f72, f65, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f69 = f72, f69, f37 + nop __LINE__ + } + ;; + { .mfi + LDFPD f110, f111 = [BOFFSET] + FMA_C f66 = f73, f67, f34 + adds BOFFSET = 8 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_C f70 = f73, f71, f38 + nop __LINE__ + } + ;; + { .mfi + LDFPD f126, f127 = [BOFFSET] + FMA_D f67 = f72, f67, f35 + adds BOFFSET = - 30 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_D f71 = f72, f71, f39 + adds AOFFSET2 = 4 * SIZE, AOFFSET + } + ;; + { .mfi + STFD [AOFFSET] = f64, SIZE + FNMA f80 = f74, f64, f80 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f68, SIZE + FNMA f84 = f74, f68, f84 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f65, SIZE + FMA_A f81 = f75, f64, f81 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f69, SIZE + FMA_A f85 = f75, f68, f85 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f66, SIZE + FNMA f82 = f74, f66, f82 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f70, SIZE + FNMA f86 = f74, f70, f86 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f67, 5 * SIZE + FMA_A f83 = f75, f66, f83 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f71, 5 * SIZE + FMA_A f87 = f75, f70, f87 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + FMA_B f80 = f75, f65, f80 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f68, SIZE + FMA_B f84 = f75, f69, f84 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + FNMA f81 = f74, f65, f81 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f69, SIZE + FNMA f85 = f74, f69, f85 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + FMA_B f82 = f75, f67, f82 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f70, SIZE + FMA_B f86 = f75, f71, f86 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, 5 * SIZE + FNMA f83 = f74, f67, f83 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f71, 5 * SIZE + FNMA f87 = f74, f71, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f96 = f76, f64, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f100 = f76, f68, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f97 = f77, f64, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f101 = f77, f68, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f98 = f76, f66, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f102 = f76, f70, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f99 = f77, f66, f99 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f103 = f77, f70, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f96 = f77, f65, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f100 = f77, f69, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f97 = f76, f65, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f101 = f76, f69, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f98 = f77, f67, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f102 = f77, f71, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f99 = f76, f67, f99 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f103 = f76, f71, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f112 = f78, f64, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f116 = f78, f68, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f113 = f79, f64, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f117 = f79, f68, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f114 = f78, f66, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f118 = f78, f70, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f115 = f79, f66, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f119 = f79, f70, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f112 = f79, f65, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f116 = f79, f69, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f113 = f78, f65, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f117 = f78, f69, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f114 = f79, f67, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f118 = f79, f71, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f115 = f78, f67, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f119 = f78, f71, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f90, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f90, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f33 = f91, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f91, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f34 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f90, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f35 = f91, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f91, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f80 = f91, f81, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f84 = f91, f85, f36 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f81 = f90, f81, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f85 = f90, f85, f37 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f82 = f91, f83, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f86 = f91, f87, f38 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f83 = f90, f83, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f87 = f90, f87, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f80, SIZE + FNMA f96 = f92, f80, f96 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f84, SIZE + FNMA f100 = f92, f84, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f81, SIZE + FMA_A f97 = f93, f80, f97 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f85, SIZE + FMA_A f101 = f93, f84, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f82, SIZE + FNMA f98 = f92, f82, f98 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f86, SIZE + FNMA f102 = f92, f86, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f83, 5 * SIZE + FMA_A f99 = f93, f82, f99 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f87, 5 * SIZE + FMA_A f103 = f93, f86, f103 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f80, SIZE + FMA_B f96 = f93, f81, f96 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f84, SIZE + FMA_B f100 = f93, f85, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f81, SIZE + FNMA f97 = f92, f81, f97 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f85, SIZE + FNMA f101 = f92, f85, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f82, SIZE + FMA_B f98 = f93, f83, f98 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f86, SIZE + FMA_B f102 = f93, f87, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f83, 5 * SIZE + FNMA f99 = f92, f83, f99 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f87, 5 * SIZE + FNMA f103 = f92, f87, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f112 = f94, f80, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f116 = f94, f84, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f113 = f95, f80, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f117 = f95, f84, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f114 = f94, f82, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f118 = f94, f86, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f115 = f95, f82, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f119 = f95, f86, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f112 = f95, f81, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f116 = f95, f85, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f113 = f94, f81, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f117 = f94, f85, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f114 = f95, f83, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f118 = f95, f87, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f115 = f94, f83, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f119 = f94, f87, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f108, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f108, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f109, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f109, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f108, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f108, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f109, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f109, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f96 = f109, f97, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f100 = f109, f101, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f97 = f108, f97, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f101 = f108, f101, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f98 = f109, f99, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f102 = f109, f103, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f99 = f108, f99, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f103 = f108, f103, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f96, SIZE + FNMA f112 = f110, f96, f112 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f100, SIZE + FNMA f116 = f110, f100, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f97, SIZE + FMA_A f113 = f111, f96, f113 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f101, SIZE + FMA_A f117 = f111, f100, f117 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f98, SIZE + FNMA f114 = f110, f98, f114 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f102, SIZE + FNMA f118 = f110, f102, f118 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f99, 5 * SIZE + FMA_A f115 = f111, f98, f115 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f103, 5 * SIZE + FMA_A f119 = f111, f102, f119 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f96, SIZE + FMA_B f112 = f111, f97, f112 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f100, SIZE + FMA_B f116 = f111, f101, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f97, SIZE + FNMA f113 = f110, f97, f113 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f101, SIZE + FNMA f117 = f110, f101, f117 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f98, SIZE + FMA_B f114 = f111, f99, f114 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f102, SIZE + FMA_B f118 = f111, f103, f118 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f99, 5 * SIZE + FNMA f115 = f110, f99, f115 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f103, 5 * SIZE + FNMA f119 = f110, f103, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f126, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f126, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f127, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f127, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f126, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f126, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f127, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f127, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f112 = f127, f113, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f116 = f127, f117, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f113 = f126, f113, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f117 = f126, f117, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f114 = f127, f115, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f118 = f127, f119, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f115 = f126, f115, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f119 = f126, f119, f39 + nop __LINE__ + } + ;; + { .mmi + STFD [AOFFSET] = f112, SIZE + STFD [AOFFSET2] = f116, SIZE + sub r2 = K, KK + } + ;; + { .mmi + STFD [AOFFSET] = f113, SIZE + STFD [AOFFSET2] = f117, SIZE + mov L = KK + } + ;; + { .mmi + STFD [AOFFSET] = f114, SIZE + STFD [AOFFSET2] = f118, SIZE + shladd r2 = r2, ZBASE_SHIFT, r0 + } + ;; + { .mmi + STFD [AOFFSET] = f115, -27 * SIZE + STFD [AOFFSET2] = f119 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f112, SIZE + mov f64 = f0 + shladd BOFFSET = r2, 2, BOFFSET + } + { .mfi + STFD [C8 ] = f116, SIZE + mov f65 = f0 + shladd AOFFSET = r2, 2, AOFFSET + } + ;; + { .mfi + STFD [C4 ] = f113, SIZE + mov f80 = f0 + cmp.ne p6, p0 = 1, I + } + { .mfi + STFD [C8 ] = f117, SIZE + mov f81 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f114, SIZE + mov f96 = f0 + adds I = -1, I + } + { .mfi + STFD [C8 ] = f118, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f115, 5 * SIZE + mov f112 = f0 + nop __LINE__ + } + { .mfb + STFD [C8 ] = f119, 5 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L011 + } +#endif + +#ifdef RT + { .mfi + LDFPD f76, f77 = [BOFFSET] + FMPY f32 = f72, f112 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f36 = f72, f116 + nop __LINE__ + } + ;; + { .mfi + LDFPD f78, f79 = [BOFFSET] + FMPY f33 = f73, f112 + adds BOFFSET = - 4 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f37 = f73, f116 + nop __LINE__ + } + ;; + { .mfi + LDFPD f88, f89 = [BOFFSET] + FMPY f34 = f72, f114 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f38 = f72, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f90, f91 = [BOFFSET] + FMPY f35 = f73, f114 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f39 = f73, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f92, f93 = [BOFFSET] + FMA_C f112 = f73, f113, f32 + adds BOFFSET = - 6 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_C f116 = f73, f117, f36 + nop __LINE__ + } + ;; + { .mfi + LDFPD f104, f105 = [BOFFSET] + FMA_D f113 = f72, f113, f33 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_D f117 = f72, f117, f37 + nop __LINE__ + } + ;; + { .mfi + LDFPD f106, f107 = [BOFFSET] + FMA_C f114 = f73, f115, f34 + adds BOFFSET = - 8 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_C f118 = f73, f119, f38 + nop __LINE__ + } + ;; + { .mfi + LDFPD f120, f121 = [BOFFSET] + FMA_D f115 = f72, f115, f35 + adds AOFFSET2 = 28 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_D f119 = f72, f119, f39 + adds AOFFSET = 24 * SIZE, AOFFSET + } + ;; + { .mfi + STFD [AOFFSET] = f112, SIZE + FNMA f96 = f74, f112, f96 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f116, SIZE + FNMA f100 = f74, f116, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f113, SIZE + FMA_A f97 = f75, f112, f97 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f117, SIZE + FMA_A f101 = f75, f116, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f114, SIZE + FNMA f98 = f74, f114, f98 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f118, SIZE + FNMA f102 = f74, f118, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f115, -11 * SIZE + FMA_A f99 = f75, f114, f99 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f119, -11 * SIZE + FMA_A f103 = f75, f118, f103 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f112, SIZE + FMA_B f96 = f75, f113, f96 + nop __LINE__ + } + { .mfi + STFD [C8 ] = f116, SIZE + FMA_B f100 = f75, f117, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f113, SIZE + FNMA f97 = f74, f113, f97 + nop __LINE__ + } + { .mfi + STFD [C8 ] = f117, SIZE + FNMA f101 = f74, f117, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f114, SIZE + FMA_B f98 = f75, f115, f98 + nop __LINE__ + } + { .mfi + STFD [C8 ] = f118, SIZE + FMA_B f102 = f75, f119, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f115, 5 * SIZE + FNMA f99 = f74, f115, f99 + nop __LINE__ + } + { .mfi + STFD [C8 ] = f119, 5 * SIZE + FNMA f103 = f74, f119, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f80 = f76, f112, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f84 = f76, f116, f84 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f81 = f77, f112, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f85 = f77, f116, f85 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f82 = f76, f114, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f86 = f76, f118, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f83 = f77, f114, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f87 = f77, f118, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f80 = f77, f113, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f84 = f77, f117, f84 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f81 = f76, f113, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f85 = f76, f117, f85 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f82 = f77, f115, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f86 = f77, f119, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f83 = f76, f115, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f87 = f76, f119, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f64 = f78, f112, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f68 = f78, f116, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f65 = f79, f112, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f69 = f79, f116, f69 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f66 = f78, f114, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f70 = f78, f118, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f67 = f79, f114, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f71 = f79, f118, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f64 = f79, f113, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f68 = f79, f117, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f65 = f78, f113, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f69 = f78, f117, f69 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f66 = f79, f115, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f70 = f79, f119, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f67 = f78, f115, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f71 = f78, f119, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f88, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f88, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f89, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f89, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f88, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f88, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f89, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f89, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f96 = f89, f97, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f100 = f89, f101, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f97 = f88, f97, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f101 = f88, f101, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f98 = f89, f99, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f102 = f89, f103, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f99 = f88, f99, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f103 = f88, f103, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f96, SIZE + FNMA f80 = f90, f96, f80 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f100, SIZE + FNMA f84 = f90, f100, f84 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f97, SIZE + FMA_A f81 = f91, f96, f81 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f101, SIZE + FMA_A f85 = f91, f100, f85 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f98, SIZE + FNMA f82 = f90, f98, f82 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f102, SIZE + FNMA f86 = f90, f102, f86 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f99, -11 * SIZE + FMA_A f83 = f91, f98, f83 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f103, -11 * SIZE + FMA_A f87 = f91, f102, f87 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f96, SIZE + FMA_B f80 = f91, f97, f80 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f100, SIZE + FMA_B f84 = f91, f101, f84 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f97, SIZE + FNMA f81 = f90, f97, f81 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f101, SIZE + FNMA f85 = f90, f101, f85 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f98, SIZE + FMA_B f82 = f91, f99, f82 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f102, SIZE + FMA_B f86 = f91, f103, f86 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f99, 5 * SIZE + FNMA f83 = f90, f99, f83 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f103, 5 * SIZE + FNMA f87 = f90, f103, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f64 = f92, f96, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f68 = f92, f100, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f65 = f93, f96, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f69 = f93, f100, f69 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f66 = f92, f98, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f70 = f92, f102, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f67 = f93, f98, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f71 = f93, f102, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f64 = f93, f97, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f68 = f93, f101, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f65 = f92, f97, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f69 = f92, f101, f69 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f66 = f93, f99, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f70 = f93, f103, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f67 = f92, f99, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f71 = f92, f103, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f104, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f104, f84 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f105, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f105, f84 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f104, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f104, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f105, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f105, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f80 = f105, f81, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f84 = f105, f85, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f81 = f104, f81, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f85 = f104, f85, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f82 = f105, f83, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f86 = f105, f87, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f83 = f104, f83, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f87 = f104, f87, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f80, SIZE + FNMA f64 = f106, f80, f64 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f84, SIZE + FNMA f68 = f106, f84, f68 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f81, SIZE + FMA_A f65 = f107, f80, f65 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f85, SIZE + FMA_A f69 = f107, f84, f69 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f82, SIZE + FNMA f66 = f106, f82, f66 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f86, SIZE + FNMA f70 = f106, f86, f70 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f83, -11 * SIZE + FMA_A f67 = f107, f82, f67 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f87, -11 * SIZE + FMA_A f71 = f107, f86, f71 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f80, SIZE + FMA_B f64 = f107, f81, f64 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f84, SIZE + FMA_B f68 = f107, f85, f68 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f81, SIZE + FNMA f65 = f106, f81, f65 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f85, SIZE + FNMA f69 = f106, f85, f69 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f82, SIZE + FMA_B f66 = f107, f83, f66 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f86, SIZE + FMA_B f70 = f107, f87, f70 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f83, 5 * SIZE + FNMA f67 = f106, f83, f67 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f87, 5 * SIZE + FNMA f71 = f106, f87, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f120, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f120, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f121, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f121, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f120, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f120, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f121, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f121, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f64 = f121, f65, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f68 = f121, f69, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f65 = f120, f65, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f69 = f120, f69, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f66 = f121, f67, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f70 = f121, f71, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f67 = f120, f67, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f71 = f120, f71, f39 + nop __LINE__ + } + ;; + { .mmi + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + shladd r2 = K, ZBASE_SHIFT, r0 + } + ;; + { .mmi + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + shladd AORIG = r2, 2, AORIG + } + ;; + { .mmi + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [AOFFSET] = f67, -3 * SIZE + STFD [AOFFSET2] = f71 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 + cmp.ne p6, p0 = 1, I + } + { .mfi + STFD [C5 ] = f68, SIZE + mov f81 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + mov f65 = f0 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f69, SIZE + mov f96 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + mov f80 = f0 + sub L = K, KK + } + { .mfi + STFD [C5 ] = f70, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, 5 * SIZE + mov f112 = f0 + adds I = -1, I + } + { .mfb + STFD [C5 ] = f71, 5 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#endif + +.L020: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L030 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 + shr L = L, 1 + } + ;; + { .mfi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f98 = f0 + adds L = -1, L + } + { .mfi + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f99 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f114 = f0 + mov ar.lc = L + } + { .mfi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov f115 = f0 + nop __LINE__ + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L028 + ;; + .align 16 + +.L022: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f97 = f32, f53, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f32, f55, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f33, f53, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f33, f55, f112 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f67 = f34, f49, f67 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f83 = f34, f51, f83 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f99 = f34, f53, f99 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f115 = f34, f55, f115 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f66 = f35, f49, f66 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f82 = f35, f51, f82 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f98 = f35, f53, f98 // A4 * B6 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f114 = f35, f55, f114 // A4 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f67 = f42, f57, f67 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f83 = f42, f59, f83 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f99 = f42, f61, f99 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f115 = f42, f63, f115 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f66 = f43, f57, f66 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f82 = f43, f59, f82 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f98 = f43, f61, f98 // A4 * B6 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f114 = f43, f63, f114 // A4 * B8 + br.cloop.sptk.few .L022 + } + ;; +.L028: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [BOFFSET], 2 * SIZE + ;; + LDFPD f106, f107 = [BOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f120, f121 = [BOFFSET], 2 * SIZE + FSUB f64 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f65 = f73, f65 + nop __LINE__ + } + ;; + { .mfi + LDFPD f122, f123 = [BOFFSET] + FSUB f80 = f74, f80 + adds BOFFSET = -14 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FSUB_A f81 = f75, f81 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f96 = f88, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f97 = f89, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f112 = f90, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f113 = f91, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f66 = f104, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f67 = f105, f67 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f82 = f106, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f83 = f107, f83 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f98 = f120, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f99 = f121, f99 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f114 = f122, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f115 = f123, f115 + nop __LINE__ + } + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [AOFFSET], 2 * SIZE + ;; + LDFPD f106, f107 = [AOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f120, f121 = [AOFFSET], 2 * SIZE + FSUB f64 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f65 = f73, f65 + nop __LINE__ + } + ;; + { .mfi + LDFPD f122, f123 = [AOFFSET] + FSUB f66 = f74, f66 + adds AOFFSET = -14 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FSUB f67 = f75, f67 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f80 = f88, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f81 = f89, f81 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f82 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f83 = f91, f83 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f96 = f104, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f97 = f105, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f98 = f106, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f99 = f107, f99 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f112 = f120, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f113 = f121, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f114 = f122, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f115 = f123, f115 + nop __LINE__ + } + ;; +#endif + +#ifdef LN + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f104, f66 + FMPY f33 = f105, f66 + FMPY f34 = f104, f82 + FMPY f35 = f105, f82 + FMPY f36 = f104, f98 + FMPY f37 = f105, f98 + FMPY f38 = f104, f114 + FMPY f39 = f105, f114 + ;; + FMA_C f66 = f105, f67, f32 + FMA_D f67 = f104, f67, f33 + FMA_C f82 = f105, f83, f34 + FMA_D f83 = f104, f83, f35 + FMA_C f98 = f105, f99, f36 + FMA_D f99 = f104, f99, f37 + FMA_C f114 = f105, f115, f38 + FMA_D f115 = f104, f115, f39 + ;; + FNMA f64 = f106, f66, f64 + FMA_A f65 = f107, f66, f65 + FNMA f80 = f106, f82, f80 + FMA_A f81 = f107, f82, f81 + FNMA f96 = f106, f98, f96 + FMA_A f97 = f107, f98, f97 + FNMA f112 = f106, f114, f112 + FMA_A f113 = f107, f114, f113 + ;; + FMA_B f64 = f107, f67, f64 + FNMA f65 = f106, f67, f65 + FMA_B f80 = f107, f83, f80 + FNMA f81 = f106, f83, f81 + FMA_B f96 = f107, f99, f96 + FNMA f97 = f106, f99, f97 + FMA_B f112 = f107, f115, f112 + FNMA f113 = f106, f115, f113 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + FMPY f36 = f120, f96 + FMPY f37 = f121, f96 + FMPY f38 = f120, f112 + FMPY f39 = f121, f112 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + FMA_C f96 = f121, f97, f36 + FMA_D f97 = f120, f97, f37 + FMA_C f112 = f121, f113, f38 + FMA_D f113 = f120, f113, f39 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + FMPY f38 = f72, f112 + FMPY f39 = f73, f112 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + FMA_C f112 = f73, f113, f38 + FMA_D f113 = f72, f113, f39 + ;; + FNMA f66 = f74, f64, f66 + FMA_A f67 = f75, f64, f67 + FNMA f82 = f74, f80, f82 + FMA_A f83 = f75, f80, f83 + FNMA f98 = f74, f96, f98 + FMA_A f99 = f75, f96, f99 + FNMA f114 = f74, f112, f114 + FMA_A f115 = f75, f112, f115 + ;; + FMA_B f66 = f75, f65, f66 + FNMA f67 = f74, f65, f67 + FMA_B f82 = f75, f81, f82 + FNMA f83 = f74, f81, f83 + FMA_B f98 = f75, f97, f98 + FNMA f99 = f74, f97, f99 + FMA_B f114 = f75, f113, f114 + FNMA f115 = f74, f113, f115 + ;; + FMPY f32 = f90, f66 + FMPY f33 = f91, f66 + FMPY f34 = f90, f82 + FMPY f35 = f91, f82 + FMPY f36 = f90, f98 + FMPY f37 = f91, f98 + FMPY f38 = f90, f114 + FMPY f39 = f91, f114 + ;; + FMA_C f66 = f91, f67, f32 + FMA_D f67 = f90, f67, f33 + FMA_C f82 = f91, f83, f34 + FMA_D f83 = f90, f83, f35 + FMA_C f98 = f91, f99, f36 + FMA_D f99 = f90, f99, f37 + FMA_C f114 = f91, f115, f38 + FMA_D f115 = f90, f115, f39 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [BOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [BOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f108, f109 = [BOFFSET], 2 * SIZE + ;; + LDFPD f110, f111 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f126, f127 = [BOFFSET] + adds BOFFSET = - 30 * SIZE, BOFFSET + ;; + + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f66 + FMPY f35 = f73, f66 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f66 = f73, f67, f34 + FMA_D f67 = f72, f67, f35 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + FNMA f82 = f74, f66, f82 + FMA_A f83 = f75, f66, f83 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + FMA_B f82 = f75, f67, f82 + FNMA f83 = f74, f67, f83 + ;; + FNMA f96 = f76, f64, f96 + FMA_A f97 = f77, f64, f97 + FNMA f98 = f76, f66, f98 + FMA_A f99 = f77, f66, f99 + ;; + FMA_B f96 = f77, f65, f96 + FNMA f97 = f76, f65, f97 + FMA_B f98 = f77, f67, f98 + FNMA f99 = f76, f67, f99 + ;; + FNMA f112 = f78, f64, f112 + FMA_A f113 = f79, f64, f113 + FNMA f114 = f78, f66, f114 + FMA_A f115 = f79, f66, f115 + ;; + FMA_B f112 = f79, f65, f112 + FNMA f113 = f78, f65, f113 + FMA_B f114 = f79, f67, f114 + FNMA f115 = f78, f67, f115 + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + FMPY f34 = f90, f82 + FMPY f35 = f91, f82 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + FMA_C f82 = f91, f83, f34 + FMA_D f83 = f90, f83, f35 + ;; + + FNMA f96 = f92, f80, f96 + FMA_A f97 = f93, f80, f97 + FNMA f98 = f92, f82, f98 + FMA_A f99 = f93, f82, f99 + ;; + FMA_B f96 = f93, f81, f96 + FNMA f97 = f92, f81, f97 + FMA_B f98 = f93, f83, f98 + FNMA f99 = f92, f83, f99 + ;; + FNMA f112 = f94, f80, f112 + FMA_A f113 = f95, f80, f113 + FNMA f114 = f94, f82, f114 + FMA_A f115 = f95, f82, f115 + ;; + FMA_B f112 = f95, f81, f112 + FNMA f113 = f94, f81, f113 + FMA_B f114 = f95, f83, f114 + FNMA f115 = f94, f83, f115 + ;; + FMPY f32 = f108, f96 + FMPY f33 = f109, f96 + FMPY f34 = f108, f98 + FMPY f35 = f109, f98 + ;; + FMA_C f96 = f109, f97, f32 + FMA_D f97 = f108, f97, f33 + FMA_C f98 = f109, f99, f34 + FMA_D f99 = f108, f99, f35 + ;; + FNMA f112 = f110, f96, f112 + FMA_A f113 = f111, f96, f113 + FNMA f114 = f110, f98, f114 + FMA_A f115 = f111, f98, f115 + ;; + FMA_B f112 = f111, f97, f112 + FNMA f113 = f110, f97, f113 + FMA_B f114 = f111, f99, f114 + FNMA f115 = f110, f99, f115 + ;; + FMPY f32 = f126, f112 + FMPY f33 = f127, f112 + FMPY f34 = f126, f114 + FMPY f35 = f127, f114 + ;; + FMA_C f112 = f127, f113, f32 + FMA_D f113 = f126, f113, f33 + FMA_C f114 = f127, f115, f34 + FMA_D f115 = f126, f115, f35 + ;; +#endif + +#ifdef RT + adds BOFFSET = 30 * SIZE, BOFFSET + ;; + LDFPD f72, f73 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f76, f77 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f78, f79 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f88, f89 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f92, f93 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f72, f112 + FMPY f33 = f73, f112 + FMPY f34 = f72, f114 + FMPY f35 = f73, f114 + ;; + FMA_C f112 = f73, f113, f32 + FMA_D f113 = f72, f113, f33 + FMA_C f114 = f73, f115, f34 + FMA_D f115 = f72, f115, f35 + ;; + FNMA f96 = f74, f112, f96 + FMA_A f97 = f75, f112, f97 + FNMA f98 = f74, f114, f98 + FMA_A f99 = f75, f114, f99 + ;; + FMA_B f96 = f75, f113, f96 + FNMA f97 = f74, f113, f97 + FMA_B f98 = f75, f115, f98 + FNMA f99 = f74, f115, f99 + ;; + FNMA f80 = f76, f112, f80 + FMA_A f81 = f77, f112, f81 + FNMA f82 = f76, f114, f82 + FMA_A f83 = f77, f114, f83 + ;; + FMA_B f80 = f77, f113, f80 + FNMA f81 = f76, f113, f81 + FMA_B f82 = f77, f115, f82 + FNMA f83 = f76, f115, f83 + ;; + FNMA f64 = f78, f112, f64 + FMA_A f65 = f79, f112, f65 + FNMA f66 = f78, f114, f66 + FMA_A f67 = f79, f114, f67 + ;; + FMA_B f64 = f79, f113, f64 + FNMA f65 = f78, f113, f65 + FMA_B f66 = f79, f115, f66 + FNMA f67 = f78, f115, f67 + ;; + FMPY f32 = f88, f96 + FMPY f33 = f89, f96 + FMPY f34 = f88, f98 + FMPY f35 = f89, f98 + ;; + FMA_C f96 = f89, f97, f32 + FMA_D f97 = f88, f97, f33 + FMA_C f98 = f89, f99, f34 + FMA_D f99 = f88, f99, f35 + ;; + FNMA f80 = f90, f96, f80 + FMA_A f81 = f91, f96, f81 + FNMA f82 = f90, f98, f82 + FMA_A f83 = f91, f98, f83 + ;; + FMA_B f80 = f91, f97, f80 + FNMA f81 = f90, f97, f81 + FMA_B f82 = f91, f99, f82 + FNMA f83 = f90, f99, f83 + ;; + FNMA f64 = f92, f96, f64 + FMA_A f65 = f93, f96, f65 + FNMA f66 = f92, f98, f66 + FMA_A f67 = f93, f98, f67 + ;; + FMA_B f64 = f93, f97, f64 + FNMA f65 = f92, f97, f65 + FMA_B f66 = f93, f99, f66 + FNMA f67 = f92, f99, f67 + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + FMPY f34 = f104, f82 + FMPY f35 = f105, f82 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + FMA_C f82 = f105, f83, f34 + FMA_D f83 = f104, f83, f35 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + FNMA f66 = f106, f82, f66 + FMA_A f67 = f107, f82, f67 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + FMA_B f66 = f107, f83, f66 + FNMA f67 = f106, f83, f67 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f66 + FMPY f35 = f121, f66 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f66 = f121, f67, f34 + FMA_D f67 = f120, f67, f35 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f98, SIZE + ;; + STFD [BOFFSET] = f67, SIZE + STFD [BOFFSET2] = f99, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f114, SIZE + ;; + STFD [BOFFSET] = f83, 5 * SIZE + STFD [BOFFSET2] = f115, 5 * SIZE + ;; + adds BOFFSET = - 16 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f80, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f82, SIZE + ;; + STFD [AOFFSET] = f67, 5 * SIZE + STFD [AOFFSET2] = f83, 5 * SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f97, SIZE + STFD [AOFFSET2] = f113, SIZE + ;; + STFD [AOFFSET] = f98, SIZE + STFD [AOFFSET2] = f114, SIZE + ;; + STFD [AOFFSET] = f99, 5 * SIZE + STFD [AOFFSET2] = f115, 5 * SIZE + ;; + adds AOFFSET = - 16 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 + adds C3 = -4 * SIZE, C3 + adds C4 = -4 * SIZE, C4 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C1 ] = f66, SIZE + ;; + STFD [C1 ] = f67, SIZE + ;; + STFD [C2 ] = f80, SIZE + ;; + STFD [C2 ] = f81, SIZE + ;; + STFD [C2 ] = f82, SIZE + ;; + STFD [C2 ] = f83, SIZE + ;; + + STFD [C3 ] = f96, SIZE + ;; + STFD [C3 ] = f97, SIZE + ;; + STFD [C3 ] = f98, SIZE + ;; + STFD [C3 ] = f99, SIZE + ;; + + STFD [C4 ] = f112, SIZE + ;; + STFD [C4 ] = f113, SIZE + ;; + STFD [C4 ] = f114, SIZE + ;; + STFD [C4 ] = f115, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 + adds C3 = -4 * SIZE, C3 + adds C4 = -4 * SIZE, C4 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 1, AOFFSET + shladd BOFFSET = L, 2, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L030: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L049 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f72 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f73 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B + mov f72 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 + add AOFFSET = r3, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + + { .mmi + nop __LINE__ + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + tbit.z p12, p0 = L, 0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f88 = f0 + shr L = L, 1 + } + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f89 = f0 + nop __LINE__ + } + ;; + { .mfi + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f104 = f0 + adds L = -1, L + } + { .mfb + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov f105 = f0 + nop __LINE__ + } + ;; + { .mfi + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f120 = f0 + mov ar.lc = L + } + { .mfi + cmp.eq p3, p0 = r0, r0 + mov f121 = f0 + nop __LINE__ + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L038 + ;; + .align 16 + +.L032: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f97 = f32, f53, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f32, f55, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f33, f53, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f33, f55, f112 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 + br.cloop.sptk.few .L032 + } + ;; +.L038: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [BOFFSET], 2 * SIZE + ;; + LDFPD f120, f121 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f88, f80 + FSUB_A f81 = f89, f81 + FSUB f96 = f104, f96 + FSUB_A f97 = f105, f97 + FSUB f112 = f120, f112 + FSUB_A f113 = f121, f113 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [AOFFSET], 2 * SIZE + ;; + LDFPD f120, f121 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f80 = f88, f80 + FSUB f81 = f89, f81 + FSUB f96 = f104, f96 + FSUB f97 = f105, f97 + FSUB f112 = f120, f112 + FSUB f113 = f121, f113 + ;; +#endif + +#ifdef LN + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + FMPY f36 = f120, f96 + FMPY f37 = f121, f96 + FMPY f38 = f120, f112 + FMPY f39 = f121, f112 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + FMA_C f96 = f121, f97, f36 + FMA_D f97 = f120, f97, f37 + FMA_C f112 = f121, f113, f38 + FMA_D f113 = f120, f113, f39 + ;; +#endif + +#ifdef LT + LDFPD f90, f91 = [AOFFSET] + ;; + FMPY f32 = f90, f64 + FMPY f33 = f91, f64 + FMPY f34 = f90, f80 + FMPY f35 = f91, f80 + FMPY f36 = f90, f96 + FMPY f37 = f91, f96 + FMPY f38 = f90, f112 + FMPY f39 = f91, f112 + ;; + FMA_C f64 = f91, f65, f32 + FMA_D f65 = f90, f65, f33 + FMA_C f80 = f91, f81, f34 + FMA_D f81 = f90, f81, f35 + FMA_C f96 = f91, f97, f36 + FMA_D f97 = f90, f97, f37 + FMA_C f112 = f91, f113, f38 + FMA_D f113 = f90, f113, f39 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [BOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [BOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f108, f109 = [BOFFSET], 2 * SIZE + ;; + LDFPD f110, f111 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f126, f127 = [BOFFSET] + adds BOFFSET = - 30 * SIZE, BOFFSET + ;; + + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + + ;; + FNMA f96 = f76, f64, f96 + FMA_A f97 = f77, f64, f97 + ;; + FMA_B f96 = f77, f65, f96 + FNMA f97 = f76, f65, f97 + ;; + FNMA f112 = f78, f64, f112 + FMA_A f113 = f79, f64, f113 + ;; + FMA_B f112 = f79, f65, f112 + FNMA f113 = f78, f65, f113 + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + ;; + + FNMA f96 = f92, f80, f96 + FMA_A f97 = f93, f80, f97 + ;; + FMA_B f96 = f93, f81, f96 + FNMA f97 = f92, f81, f97 + ;; + FNMA f112 = f94, f80, f112 + FMA_A f113 = f95, f80, f113 + ;; + FMA_B f112 = f95, f81, f112 + FNMA f113 = f94, f81, f113 + ;; + FMPY f32 = f108, f96 + FMPY f33 = f109, f96 + ;; + FMA_C f96 = f109, f97, f32 + FMA_D f97 = f108, f97, f33 + ;; + FNMA f112 = f110, f96, f112 + FMA_A f113 = f111, f96, f113 + ;; + FMA_B f112 = f111, f97, f112 + FNMA f113 = f110, f97, f113 + ;; + FMPY f32 = f126, f112 + FMPY f33 = f127, f112 + ;; + FMA_C f112 = f127, f113, f32 + FMA_D f113 = f126, f113, f33 + ;; +#endif + +#ifdef RT + adds BOFFSET = 30 * SIZE, BOFFSET + ;; + LDFPD f72, f73 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f76, f77 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f78, f79 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f88, f89 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f92, f93 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f72, f112 + FMPY f33 = f73, f112 + ;; + FMA_C f112 = f73, f113, f32 + FMA_D f113 = f72, f113, f33 + ;; + FNMA f96 = f74, f112, f96 + FMA_A f97 = f75, f112, f97 + ;; + FMA_B f96 = f75, f113, f96 + FNMA f97 = f74, f113, f97 + ;; + FNMA f80 = f76, f112, f80 + FMA_A f81 = f77, f112, f81 + ;; + FMA_B f80 = f77, f113, f80 + FNMA f81 = f76, f113, f81 + ;; + FNMA f64 = f78, f112, f64 + FMA_A f65 = f79, f112, f65 + ;; + FMA_B f64 = f79, f113, f64 + FNMA f65 = f78, f113, f65 + ;; + FMPY f32 = f88, f96 + FMPY f33 = f89, f96 + ;; + FMA_C f96 = f89, f97, f32 + FMA_D f97 = f88, f97, f33 + ;; + FNMA f80 = f90, f96, f80 + FMA_A f81 = f91, f96, f81 + ;; + FMA_B f80 = f91, f97, f80 + FNMA f81 = f90, f97, f81 + ;; + FNMA f64 = f92, f96, f64 + FMA_A f65 = f93, f96, f65 + ;; + FMA_B f64 = f93, f97, f64 + FNMA f65 = f92, f97, f65 + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f96, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f97, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f81, 5 * SIZE + STFD [AOFFSET2] = f113, 5 * SIZE + ;; + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 + adds C3 = -2 * SIZE, C3 + adds C4 = -2 * SIZE, C4 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C2 ] = f80, SIZE + ;; + STFD [C2 ] = f81, SIZE + ;; + STFD [C3 ] = f96, SIZE + ;; + STFD [C3 ] = f97, SIZE + ;; + STFD [C4 ] = f112, SIZE + ;; + STFD [C4 ] = f113, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 + adds C3 = -2 * SIZE, C3 + adds C4 = -2 * SIZE, C4 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + add AOFFSET = L, AOFFSET + shladd BOFFSET = L, 2, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L049: +#ifdef LN + shladd KK8 = K, ZBASE_SHIFT, r0 + ;; + shladd B = KK8, 2, B +#endif + +#if defined(LT) || defined(RN) + mov B = BOFFSET +#endif + +#ifdef RN + adds KK = 4, KK +#endif + +#ifdef RT + adds KK = -4, KK +#endif + ;; + + { .mmb + mov AOFFSET = A + cmp.lt p6, p0 = 0, J + (p6) br.cond.dptk .L010x + } + ;; + .align 16 + +.L999: + { .mii + nop __LINE__ + mov ar.lc = ARLC + mov pr = PR, -1 + } + { .mib + nop __LINE__ +#ifdef TRMMKERNEL + mov ar.pfs = ARPFS +#else + nop __LINE__ +#endif + br.ret.sptk.many b0 + } + EPILOGUE + diff --git a/kernel/mips64/._KERNEL b/kernel/mips64/._KERNEL new file mode 100644 index 0000000..e029a7b Binary files /dev/null and b/kernel/mips64/._KERNEL differ diff --git a/kernel/mips64/._Makefile b/kernel/mips64/._Makefile new file mode 100644 index 0000000..fd9d478 Binary files /dev/null and b/kernel/mips64/._Makefile differ diff --git a/kernel/mips64/._amax.S b/kernel/mips64/._amax.S new file mode 100644 index 0000000..4c0f0ba Binary files /dev/null and b/kernel/mips64/._amax.S differ diff --git a/kernel/mips64/._amin.S b/kernel/mips64/._amin.S new file mode 100644 index 0000000..ac9c9f8 Binary files /dev/null and b/kernel/mips64/._amin.S differ diff --git a/kernel/mips64/._asum.S b/kernel/mips64/._asum.S new file mode 100644 index 0000000..ef7416e Binary files /dev/null and b/kernel/mips64/._asum.S differ diff --git a/kernel/mips64/._axpy.S b/kernel/mips64/._axpy.S new file mode 100644 index 0000000..12537d5 Binary files /dev/null and b/kernel/mips64/._axpy.S differ diff --git a/kernel/mips64/._cnrm2.S b/kernel/mips64/._cnrm2.S new file mode 100644 index 0000000..76eb98e Binary files /dev/null and b/kernel/mips64/._cnrm2.S differ diff --git a/kernel/mips64/._copy.S b/kernel/mips64/._copy.S new file mode 100644 index 0000000..4e79ed8 Binary files /dev/null and b/kernel/mips64/._copy.S differ diff --git a/kernel/mips64/._dnrm2.S b/kernel/mips64/._dnrm2.S new file mode 100644 index 0000000..a8e8bd9 Binary files /dev/null and b/kernel/mips64/._dnrm2.S differ diff --git a/kernel/mips64/._dot.S b/kernel/mips64/._dot.S new file mode 100644 index 0000000..e42c430 Binary files /dev/null and b/kernel/mips64/._dot.S differ diff --git a/kernel/mips64/._gemm_beta.S b/kernel/mips64/._gemm_beta.S new file mode 100644 index 0000000..8aaab49 Binary files /dev/null and b/kernel/mips64/._gemm_beta.S differ diff --git a/kernel/mips64/._gemm_kernel.S b/kernel/mips64/._gemm_kernel.S new file mode 100644 index 0000000..467a4eb Binary files /dev/null and b/kernel/mips64/._gemm_kernel.S differ diff --git a/kernel/mips64/._gemv_n.S b/kernel/mips64/._gemv_n.S new file mode 100644 index 0000000..246223d Binary files /dev/null and b/kernel/mips64/._gemv_n.S differ diff --git a/kernel/mips64/._gemv_t.S b/kernel/mips64/._gemv_t.S new file mode 100644 index 0000000..3afce28 Binary files /dev/null and b/kernel/mips64/._gemv_t.S differ diff --git a/kernel/mips64/._iamax.S b/kernel/mips64/._iamax.S new file mode 100644 index 0000000..483fcfb Binary files /dev/null and b/kernel/mips64/._iamax.S differ diff --git a/kernel/mips64/._iamin.S b/kernel/mips64/._iamin.S new file mode 100644 index 0000000..dda0ab4 Binary files /dev/null and b/kernel/mips64/._iamin.S differ diff --git a/kernel/mips64/._imax.S b/kernel/mips64/._imax.S new file mode 100644 index 0000000..d7c1f40 Binary files /dev/null and b/kernel/mips64/._imax.S differ diff --git a/kernel/mips64/._imin.S b/kernel/mips64/._imin.S new file mode 100644 index 0000000..a6f596b Binary files /dev/null and b/kernel/mips64/._imin.S differ diff --git a/kernel/mips64/._izamax.S b/kernel/mips64/._izamax.S new file mode 100644 index 0000000..f999bb1 Binary files /dev/null and b/kernel/mips64/._izamax.S differ diff --git a/kernel/mips64/._izamin.S b/kernel/mips64/._izamin.S new file mode 100644 index 0000000..990505b Binary files /dev/null and b/kernel/mips64/._izamin.S differ diff --git a/kernel/mips64/._max.S b/kernel/mips64/._max.S new file mode 100644 index 0000000..934e670 Binary files /dev/null and b/kernel/mips64/._max.S differ diff --git a/kernel/mips64/._min.S b/kernel/mips64/._min.S new file mode 100644 index 0000000..e67320b Binary files /dev/null and b/kernel/mips64/._min.S differ diff --git a/kernel/mips64/._rot.S b/kernel/mips64/._rot.S new file mode 100644 index 0000000..1c64f50 Binary files /dev/null and b/kernel/mips64/._rot.S differ diff --git a/kernel/mips64/._scal.S b/kernel/mips64/._scal.S new file mode 100644 index 0000000..38c3a6d Binary files /dev/null and b/kernel/mips64/._scal.S differ diff --git a/kernel/mips64/._snrm2.S b/kernel/mips64/._snrm2.S new file mode 100644 index 0000000..3aa367a Binary files /dev/null and b/kernel/mips64/._snrm2.S differ diff --git a/kernel/mips64/._swap.S b/kernel/mips64/._swap.S new file mode 100644 index 0000000..aebe7a7 Binary files /dev/null and b/kernel/mips64/._swap.S differ diff --git a/kernel/mips64/._symv_L.S b/kernel/mips64/._symv_L.S new file mode 100644 index 0000000..00f5b5e Binary files /dev/null and b/kernel/mips64/._symv_L.S differ diff --git a/kernel/mips64/._symv_U.S b/kernel/mips64/._symv_U.S new file mode 100644 index 0000000..6b4b377 Binary files /dev/null and b/kernel/mips64/._symv_U.S differ diff --git a/kernel/mips64/._trsm_kernel_LN.S b/kernel/mips64/._trsm_kernel_LN.S new file mode 100644 index 0000000..3959553 Binary files /dev/null and b/kernel/mips64/._trsm_kernel_LN.S differ diff --git a/kernel/mips64/._trsm_kernel_LT.S b/kernel/mips64/._trsm_kernel_LT.S new file mode 100644 index 0000000..030d24d Binary files /dev/null and b/kernel/mips64/._trsm_kernel_LT.S differ diff --git a/kernel/mips64/._trsm_kernel_RT.S b/kernel/mips64/._trsm_kernel_RT.S new file mode 100644 index 0000000..dab964c Binary files /dev/null and b/kernel/mips64/._trsm_kernel_RT.S differ diff --git a/kernel/mips64/._zamax.S b/kernel/mips64/._zamax.S new file mode 100644 index 0000000..bc1e4a9 Binary files /dev/null and b/kernel/mips64/._zamax.S differ diff --git a/kernel/mips64/._zamin.S b/kernel/mips64/._zamin.S new file mode 100644 index 0000000..ecb228e Binary files /dev/null and b/kernel/mips64/._zamin.S differ diff --git a/kernel/mips64/._zasum.S b/kernel/mips64/._zasum.S new file mode 100644 index 0000000..98d9425 Binary files /dev/null and b/kernel/mips64/._zasum.S differ diff --git a/kernel/mips64/._zaxpy.S b/kernel/mips64/._zaxpy.S new file mode 100644 index 0000000..f426aaf Binary files /dev/null and b/kernel/mips64/._zaxpy.S differ diff --git a/kernel/mips64/._zcopy.S b/kernel/mips64/._zcopy.S new file mode 100644 index 0000000..a65eb00 Binary files /dev/null and b/kernel/mips64/._zcopy.S differ diff --git a/kernel/mips64/._zdot.S b/kernel/mips64/._zdot.S new file mode 100644 index 0000000..c07fa68 Binary files /dev/null and b/kernel/mips64/._zdot.S differ diff --git a/kernel/mips64/._zgemm3m_kernel.S b/kernel/mips64/._zgemm3m_kernel.S new file mode 100644 index 0000000..738ccc7 Binary files /dev/null and b/kernel/mips64/._zgemm3m_kernel.S differ diff --git a/kernel/mips64/._zgemm_kernel.S b/kernel/mips64/._zgemm_kernel.S new file mode 100644 index 0000000..582e6aa Binary files /dev/null and b/kernel/mips64/._zgemm_kernel.S differ diff --git a/kernel/mips64/._zgemv_n.S b/kernel/mips64/._zgemv_n.S new file mode 100644 index 0000000..eeeec9c Binary files /dev/null and b/kernel/mips64/._zgemv_n.S differ diff --git a/kernel/mips64/._zgemv_t.S b/kernel/mips64/._zgemv_t.S new file mode 100644 index 0000000..2360fe6 Binary files /dev/null and b/kernel/mips64/._zgemv_t.S differ diff --git a/kernel/mips64/._znrm2.S b/kernel/mips64/._znrm2.S new file mode 100644 index 0000000..cde8c67 Binary files /dev/null and b/kernel/mips64/._znrm2.S differ diff --git a/kernel/mips64/._zrot.S b/kernel/mips64/._zrot.S new file mode 100644 index 0000000..5fd9fe9 Binary files /dev/null and b/kernel/mips64/._zrot.S differ diff --git a/kernel/mips64/._zscal.S b/kernel/mips64/._zscal.S new file mode 100644 index 0000000..af10dd5 Binary files /dev/null and b/kernel/mips64/._zscal.S differ diff --git a/kernel/mips64/._zswap.S b/kernel/mips64/._zswap.S new file mode 100644 index 0000000..923448c Binary files /dev/null and b/kernel/mips64/._zswap.S differ diff --git a/kernel/mips64/._zsymv_L.S b/kernel/mips64/._zsymv_L.S new file mode 100644 index 0000000..37917d1 Binary files /dev/null and b/kernel/mips64/._zsymv_L.S differ diff --git a/kernel/mips64/._zsymv_U.S b/kernel/mips64/._zsymv_U.S new file mode 100644 index 0000000..85846e4 Binary files /dev/null and b/kernel/mips64/._zsymv_U.S differ diff --git a/kernel/mips64/._ztrsm_kernel_LT.S b/kernel/mips64/._ztrsm_kernel_LT.S new file mode 100644 index 0000000..03db446 Binary files /dev/null and b/kernel/mips64/._ztrsm_kernel_LT.S differ diff --git a/kernel/mips64/._ztrsm_kernel_RT.S b/kernel/mips64/._ztrsm_kernel_RT.S new file mode 100644 index 0000000..f56de00 Binary files /dev/null and b/kernel/mips64/._ztrsm_kernel_RT.S differ diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL new file mode 100644 index 0000000..3dd7f8e --- /dev/null +++ b/kernel/mips64/KERNEL @@ -0,0 +1,96 @@ +ifndef SNRM2KERNEL +SNRM2KERNEL = snrm2.S +endif + +ifndef DNRM2KERNEL +DNRM2KERNEL = dnrm2.S +endif + +ifndef CNRM2KERNEL +CNRM2KERNEL = cnrm2.S +endif + +ifndef ZNRM2KERNEL +ZNRM2KERNEL = znrm2.S +endif + +ifndef SCABS_KERNEL +SCABS_KERNEL = ../generic/cabs.c +endif + +ifndef DCABS_KERNEL +DCABS_KERNEL = ../generic/cabs.c +endif + +ifndef QCABS_KERNEL +QCABS_KERNEL = ../generic/cabs.c +endif + +ifndef LSAME_KERNEL +LSAME_KERNEL = ../generic/lsame.c +endif + +SGEMMKERNEL = gemm_kernel.S +SGEMMINCOPY = ../generic/gemm_ncopy_2.c +SGEMMITCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o +DGEMMKERNEL = gemm_kernel.S +DGEMMINCOPY = ../generic/gemm_ncopy_2.c +DGEMMITCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPY = ../generic/gemm_ncopy_8.c +DGEMMOTCOPY = ../generic/gemm_tcopy_8.c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o +CGEMMKERNEL = zgemm_kernel.S +CGEMMINCOPY = ../generic/zgemm_ncopy_1.c +CGEMMITCOPY = ../generic/zgemm_tcopy_1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o +ZGEMMKERNEL = zgemm_kernel.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMINCOPYOBJ = zgemm_incopy.o +ZGEMMITCOPYOBJ = zgemm_itcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + +STRSMKERNEL_LN = trsm_kernel_LN.S +STRSMKERNEL_LT = trsm_kernel_LT.S +STRSMKERNEL_RN = trsm_kernel_LT.S +STRSMKERNEL_RT = trsm_kernel_RT.S + +DTRSMKERNEL_LN = trsm_kernel_LN.S +DTRSMKERNEL_LT = trsm_kernel_LT.S +DTRSMKERNEL_RN = trsm_kernel_LT.S +DTRSMKERNEL_RT = trsm_kernel_RT.S + +CTRSMKERNEL_LN = ztrsm_kernel_LT.S +CTRSMKERNEL_LT = ztrsm_kernel_LT.S +CTRSMKERNEL_RN = ztrsm_kernel_LT.S +CTRSMKERNEL_RT = ztrsm_kernel_RT.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT.S + +CGEMM3MKERNEL = zgemm3m_kernel.S +ZGEMM3MKERNEL = zgemm3m_kernel.S diff --git a/kernel/mips64/Makefile b/kernel/mips64/Makefile new file mode 100644 index 0000000..efae70d --- /dev/null +++ b/kernel/mips64/Makefile @@ -0,0 +1,2 @@ +clean :: + diff --git a/kernel/mips64/amax.S b/kernel/mips64/amax.S new file mode 100644 index 0000000..30c35ba --- /dev/null +++ b/kernel/mips64/amax.S @@ -0,0 +1,241 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $2 +#define TEMP $3 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define t1 $f12 +#define t2 $f13 +#define t3 $f14 +#define t4 $f15 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + blez N, .L999 + MTC $0, s1 + + blez INCX, .L999 + dsll INCX, INCX, BASE_SHIFT + + LD a1, 0 * SIZE(X) + daddiu N, N, -1 + + daddu X, X, INCX + FABS s1, a1 + + blez N, .L999 + FABS s2, a1 + + FABS s3, a1 + dsra I, N, 3 + + blez I, .L15 + FABS s4, a1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD a2, 0 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + daddu X, X, INCX + LD a4, 0 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD a6, 0 * SIZE(X) + daddu X, X, INCX + LD a7, 0 * SIZE(X) + daddu X, X, INCX + LD a8, 0 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + daddu X, X, INCX + .align 3 + +.L12: + FABS t1, a1 + LD a1, 0 * SIZE(X) + FABS t2, a2 + daddu X, X, INCX + + FABS t3, a3 + LD a2, 0 * SIZE(X) + FABS t4, a4 + daddu X, X, INCX + + CMPLT $fcc0, s1, t1 + LD a3, 0 * SIZE(X) + CMPLT $fcc1, s2, t2 + daddu X, X, INCX + + CMPLT $fcc2, s3, t3 + LD a4, 0 * SIZE(X) + CMPLT $fcc3, s4, t4 + daddu X, X, INCX + + CMOVT s1, t1, $fcc0 + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + CMOVT s4, t4, $fcc3 + + FABS t1, a5 + LD a5, 0 * SIZE(X) + FABS t2, a6 + daddu X, X, INCX + + FABS t3, a7 + LD a6, 0 * SIZE(X) + FABS t4, a8 + daddu X, X, INCX + + CMPLT $fcc0, s1, t1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, s2, t2 + daddu X, X, INCX + + CMPLT $fcc2, s3, t3 + LD a8, 0 * SIZE(X) + CMPLT $fcc3, s4, t4 + daddu X, X, INCX + + CMOVT s1, t1, $fcc0 + daddiu I, I, -1 + + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + + bgtz I, .L12 + CMOVT s4, t4, $fcc3 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + + CMOVT s1, t1, $fcc0 + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + CMOVT s4, t4, $fcc3 + + FABS t1, a5 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + + CMOVT s1, t1, $fcc0 + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + CMOVT s4, t4, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + + blez I, .L998 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + daddiu I, I, -1 + + FABS t1, a1 + + CMPLT $fcc0, s1, t1 + + CMOVT s1, t1, $fcc0 + + bgtz I, .L16 + daddu X, X, INCX + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + + CMOVT s1, s2, $fcc0 + CMOVT s3, s4, $fcc1 + + CMPLT $fcc0, s1, s3 + CMOVT s1, s3, $fcc0 + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/amin.S b/kernel/mips64/amin.S new file mode 100644 index 0000000..47108b1 --- /dev/null +++ b/kernel/mips64/amin.S @@ -0,0 +1,241 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $2 +#define TEMP $3 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define t1 $f12 +#define t2 $f13 +#define t3 $f14 +#define t4 $f15 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + blez N, .L999 + MTC $0, s1 + + blez INCX, .L999 + dsll INCX, INCX, BASE_SHIFT + + LD a1, 0 * SIZE(X) + daddiu N, N, -1 + + daddu X, X, INCX + FABS s1, a1 + + blez N, .L999 + FABS s2, a1 + + FABS s3, a1 + dsra I, N, 3 + + blez I, .L15 + FABS s4, a1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD a2, 0 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + daddu X, X, INCX + LD a4, 0 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD a6, 0 * SIZE(X) + daddu X, X, INCX + LD a7, 0 * SIZE(X) + daddu X, X, INCX + LD a8, 0 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + daddu X, X, INCX + .align 3 + +.L12: + FABS t1, a1 + LD a1, 0 * SIZE(X) + FABS t2, a2 + daddu X, X, INCX + + FABS t3, a3 + LD a2, 0 * SIZE(X) + FABS t4, a4 + daddu X, X, INCX + + CMPLT $fcc0, t1, s1 + LD a3, 0 * SIZE(X) + CMPLT $fcc1, t2, s2 + daddu X, X, INCX + + CMPLT $fcc2, t3, s3 + LD a4, 0 * SIZE(X) + CMPLT $fcc3, t4, s4 + daddu X, X, INCX + + CMOVT s1, t1, $fcc0 + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + CMOVT s4, t4, $fcc3 + + FABS t1, a5 + LD a5, 0 * SIZE(X) + FABS t2, a6 + daddu X, X, INCX + + FABS t3, a7 + LD a6, 0 * SIZE(X) + FABS t4, a8 + daddu X, X, INCX + + CMPLT $fcc0, t1, s1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, t2, s2 + daddu X, X, INCX + + CMPLT $fcc2, t3, s3 + LD a8, 0 * SIZE(X) + CMPLT $fcc3, t4, s4 + daddu X, X, INCX + + CMOVT s1, t1, $fcc0 + daddiu I, I, -1 + + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + + bgtz I, .L12 + CMOVT s4, t4, $fcc3 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t2, s2 + CMPLT $fcc2, t3, s3 + CMPLT $fcc3, t4, s4 + + CMOVT s1, t1, $fcc0 + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + CMOVT s4, t4, $fcc3 + + FABS t1, a5 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t2, s2 + CMPLT $fcc2, t3, s3 + CMPLT $fcc3, t4, s4 + + CMOVT s1, t1, $fcc0 + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + CMOVT s4, t4, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + + blez I, .L998 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + daddiu I, I, -1 + + FABS t1, a1 + + CMPLT $fcc0, t1, s1 + + CMOVT s1, t1, $fcc0 + + bgtz I, .L16 + daddu X, X, INCX + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + + CMOVT s1, s2, $fcc0 + CMOVT s3, s4, $fcc1 + + CMPLT $fcc0, s3, s1 + CMOVT s1, s3, $fcc0 + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/asum.S b/kernel/mips64/asum.S new file mode 100644 index 0000000..447c2f7 --- /dev/null +++ b/kernel/mips64/asum.S @@ -0,0 +1,332 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $2 +#define TEMP $3 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 + +#define t1 $f10 +#define t2 $f11 +#define t3 $f12 +#define t4 $f13 + +#define s1 $f0 +#define s2 $f1 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC $0, s1 + + MTC $0, s2 + dsll INCX, INCX, BASE_SHIFT + + blez N, .L999 + li TEMP, SIZE + + bne INCX, TEMP, .L20 + dsra I, N, 3 + + blez I, .L15 + NOP + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD a3, 2 * SIZE(X) + LD a4, 3 * SIZE(X) + + LD a5, 4 * SIZE(X) + FABS t1, a1 + LD a6, 5 * SIZE(X) + FABS t2, a2 + LD a7, 6 * SIZE(X) + FABS t3, a3 + + FABS t4, a4 + daddiu I, I, -1 + + blez I, .L13 + LD a8, 7 * SIZE(X) + .align 3 + +.L12: + ADD s1, s1, t1 + LD a1, 8 * SIZE(X) + + FABS t1, a5 + daddiu I, I, -1 + + ADD s2, s2, t2 + LD a2, 9 * SIZE(X) + + FABS t2, a6 + NOP + + ADD s1, s1, t3 + LD a3, 10 * SIZE(X) + + FABS t3, a7 + NOP + + ADD s2, s2, t4 + LD a4, 11 * SIZE(X) + + FABS t4, a8 + daddiu X, X, 8 * SIZE + + ADD s1, s1, t1 + LD a5, 4 * SIZE(X) + + FABS t1, a1 + NOP + + ADD s2, s2, t2 + LD a6, 5 * SIZE(X) + + FABS t2, a2 + NOP + + ADD s1, s1, t3 + LD a7, 6 * SIZE(X) + + FABS t3, a3 + NOP + + ADD s2, s2, t4 + LD a8, 7 * SIZE(X) + + bgtz I, .L12 + FABS t4, a4 + .align 3 + +.L13: + ADD s1, s1, t1 + daddiu X, X, 8 * SIZE + + FABS t1, a5 + NOP + + ADD s2, s2, t2 + FABS t2, a6 + + ADD s1, s1, t3 + FABS t3, a7 + + ADD s2, s2, t4 + FABS t4, a8 + + ADD s1, s1, t1 + ADD s2, s2, t2 + ADD s1, s1, t3 + ADD s2, s2, t4 + .align 3 + +.L15: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + daddiu I, I, -1 + + FABS t1, a1 + + ADD s1, s1, t1 + + bgtz I, .L16 + daddiu X, X, SIZE + + j .L999 + NOP + .align 3 + +.L20: + blez I, .L25 + NOP + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + + LD a2, 0 * SIZE(X) + daddu X, X, INCX + + LD a3, 0 * SIZE(X) + daddu X, X, INCX + + LD a4, 0 * SIZE(X) + daddu X, X, INCX + + LD a5, 0 * SIZE(X) + daddu X, X, INCX + + LD a6, 0 * SIZE(X) + daddu X, X, INCX + + FABS t1, a1 + LD a7, 0 * SIZE(X) + + FABS t2, a2 + daddu X, X, INCX + + FABS t3, a3 + LD a8, 0 * SIZE(X) + + FABS t4, a4 + daddiu I, I, -1 + + blez I, .L24 + daddu X, X, INCX + .align 3 + +.L23: + ADD s1, s1, t1 + LD a1, 0 * SIZE(X) + + FABS t1, a5 + daddu X, X, INCX + + ADD s2, s2, t2 + LD a2, 0 * SIZE(X) + + FABS t2, a6 + daddu X, X, INCX + + ADD s1, s1, t3 + LD a3, 0 * SIZE(X) + + FABS t3, a7 + daddu X, X, INCX + + ADD s2, s2, t4 + LD a4, 0 * SIZE(X) + + FABS t4, a8 + daddu X, X, INCX + + ADD s1, s1, t1 + LD a5, 0 * SIZE(X) + + FABS t1, a1 + daddu X, X, INCX + + ADD s2, s2, t2 + LD a6, 0 * SIZE(X) + + FABS t2, a2 + daddu X, X, INCX + + ADD s1, s1, t3 + LD a7, 0 * SIZE(X) + + FABS t3, a3 + daddu X, X, INCX + + ADD s2, s2, t4 + LD a8, 0 * SIZE(X) + + FABS t4, a4 + daddiu I, I, -1 + + bgtz I, .L23 + daddu X, X, INCX + .align 3 + +.L24: + ADD s1, s1, t1 + FABS t1, a5 + + ADD s2, s2, t2 + FABS t2, a6 + + ADD s1, s1, t3 + FABS t3, a7 + + ADD s2, s2, t4 + FABS t4, a8 + + ADD s1, s1, t1 + ADD s2, s2, t2 + ADD s1, s1, t3 + ADD s2, s2, t4 + .align 3 + +.L25: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + daddiu I, I, -1 + + FABS t1, a1 + daddu X, X, INCX + + bgtz I, .L26 + ADD s1, s1, t1 + .align 3 + +.L999: + j $31 + ADD s1, s1, s2 + + EPILOGUE diff --git a/kernel/mips64/axpy.S b/kernel/mips64/axpy.S new file mode 100644 index 0000000..f7d8887 --- /dev/null +++ b/kernel/mips64/axpy.S @@ -0,0 +1,409 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 + +#define X $8 +#define INCX $9 + +#define Y $10 +#define INCY $11 + +#define I $2 +#define TEMP $3 + +#define YY $5 + +#define ALPHA $f15 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + +#define b1 $f8 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f17 + +#define t1 $f18 +#define t2 $f19 +#define t3 $f20 +#define t4 $f21 + + PROLOGUE + +#ifndef __64BIT__ + daddiu $sp, $sp, -16 + sdc1 $f20, 0($sp) + sdc1 $f21, 8($sp) +#endif + + li TEMP, SIZE + + blez N, .L999 + dsll INCX, INCX, BASE_SHIFT + + bne INCX, TEMP, .L20 + dsll INCY, INCY, BASE_SHIFT + + bne INCY, TEMP, .L20 + dsra I, N, 3 + + blez I, .L15 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD a2, 1 * SIZE(X) + LD b2, 1 * SIZE(Y) + LD a3, 2 * SIZE(X) + LD b3, 2 * SIZE(Y) + LD a4, 3 * SIZE(X) + LD b4, 3 * SIZE(Y) + LD a5, 4 * SIZE(X) + LD b5, 4 * SIZE(Y) + LD a6, 5 * SIZE(X) + LD b6, 5 * SIZE(Y) + LD a7, 6 * SIZE(X) + LD b7, 6 * SIZE(Y) + LD a8, 7 * SIZE(X) + LD b8, 7 * SIZE(Y) + + blez I, .L13 + NOP + .align 3 + +.L12: + MADD t1, b1, ALPHA, a1 + LD a1, 8 * SIZE(X) + LD b1, 8 * SIZE(Y) + + MADD t2, b2, ALPHA, a2 + LD a2, 9 * SIZE(X) + LD b2, 9 * SIZE(Y) + + MADD t3, b3, ALPHA, a3 + LD a3, 10 * SIZE(X) + LD b3, 10 * SIZE(Y) + + MADD t4, b4, ALPHA, a4 + LD a4, 11 * SIZE(X) + LD b4, 11 * SIZE(Y) + + ST t1, 0 * SIZE(Y) + ST t2, 1 * SIZE(Y) + ST t3, 2 * SIZE(Y) + ST t4, 3 * SIZE(Y) + + MADD t1, b5, ALPHA, a5 + LD a5, 12 * SIZE(X) + LD b5, 12 * SIZE(Y) + + MADD t2, b6, ALPHA, a6 + LD a6, 13 * SIZE(X) + LD b6, 13 * SIZE(Y) + + MADD t3, b7, ALPHA, a7 + LD a7, 14 * SIZE(X) + LD b7, 14 * SIZE(Y) + + MADD t4, b8, ALPHA, a8 + LD a8, 15 * SIZE(X) + LD b8, 15 * SIZE(Y) + + ST t1, 4 * SIZE(Y) + ST t2, 5 * SIZE(Y) + ST t3, 6 * SIZE(Y) + ST t4, 7 * SIZE(Y) + + daddiu I, I, -1 + daddiu Y, Y, 8 * SIZE + + bgtz I, .L12 + daddiu X, X, 8 * SIZE + .align 3 + +.L13: + MADD t1, b1, ALPHA, a1 + MADD t2, b2, ALPHA, a2 + MADD t3, b3, ALPHA, a3 + MADD t4, b4, ALPHA, a4 + + ST t1, 0 * SIZE(Y) + MADD t1, b5, ALPHA, a5 + ST t2, 1 * SIZE(Y) + MADD t2, b6, ALPHA, a6 + ST t3, 2 * SIZE(Y) + MADD t3, b7, ALPHA, a7 + ST t4, 3 * SIZE(Y) + MADD t4, b8, ALPHA, a8 + + ST t1, 4 * SIZE(Y) + ST t2, 5 * SIZE(Y) + ST t3, 6 * SIZE(Y) + ST t4, 7 * SIZE(Y) + + daddiu X, X, 8 * SIZE + daddiu Y, Y, 8 * SIZE + .align 3 + +.L15: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + + daddiu X, X, SIZE + daddiu Y, Y, SIZE + + MADD t1, b1, ALPHA, a1 + daddiu I, I, -1 + + bgtz I, .L16 + ST t1, -1 * SIZE(Y) + +#ifndef __64BIT__ + ldc1 $f20, 0($sp) + ldc1 $f21, 8($sp) + daddiu $sp, $sp, 16 +#endif + + j $31 + NOP + .align 3 + +.L20: + dsra I, N, 3 + move YY, Y + + blez I, .L25 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD b1, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a2, 0 * SIZE(X) + daddu X, X, INCX + LD b2, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a3, 0 * SIZE(X) + daddu X, X, INCX + LD b3, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a4, 0 * SIZE(X) + daddu X, X, INCX + LD b4, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD b5, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a6, 0 * SIZE(X) + daddu X, X, INCX + LD b6, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a7, 0 * SIZE(X) + daddu X, X, INCX + LD b7, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a8, 0 * SIZE(X) + daddu X, X, INCX + LD b8, 0 * SIZE(Y) + daddu Y, Y, INCY + + blez I, .L23 + NOP + .align 3 + +.L22: + MADD t1, b1, ALPHA, a1 + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + daddu X, X, INCX + daddu Y, Y, INCY + + MADD t2, b2, ALPHA, a2 + LD a2, 0 * SIZE(X) + LD b2, 0 * SIZE(Y) + daddu X, X, INCX + daddu Y, Y, INCY + + MADD t3, b3, ALPHA, a3 + LD a3, 0 * SIZE(X) + LD b3, 0 * SIZE(Y) + daddu X, X, INCX + daddu Y, Y, INCY + + MADD t4, b4, ALPHA, a4 + LD a4, 0 * SIZE(X) + LD b4, 0 * SIZE(Y) + daddu X, X, INCX + daddu Y, Y, INCY + + ST t1, 0 * SIZE(YY) + daddu YY, YY, INCY + MADD t1, b5, ALPHA, a5 + + LD a5, 0 * SIZE(X) + LD b5, 0 * SIZE(Y) + daddu X, X, INCX + daddu Y, Y, INCY + + ST t2, 0 * SIZE(YY) + daddu YY, YY, INCY + MADD t2, b6, ALPHA, a6 + + LD a6, 0 * SIZE(X) + LD b6, 0 * SIZE(Y) + daddu X, X, INCX + daddu Y, Y, INCY + + ST t3, 0 * SIZE(YY) + daddu YY, YY, INCY + MADD t3, b7, ALPHA, a7 + + LD a7, 0 * SIZE(X) + LD b7, 0 * SIZE(Y) + daddu X, X, INCX + daddu Y, Y, INCY + + ST t4, 0 * SIZE(YY) + daddu YY, YY, INCY + MADD t4, b8, ALPHA, a8 + + LD a8, 0 * SIZE(X) + daddu X, X, INCX + + LD b8, 0 * SIZE(Y) + daddu Y, Y, INCY + + ST t1, 0 * SIZE(YY) + daddu YY, YY, INCY + ST t2, 0 * SIZE(YY) + daddu YY, YY, INCY + ST t3, 0 * SIZE(YY) + daddu YY, YY, INCY + ST t4, 0 * SIZE(YY) + daddiu I, I, -1 + + bgtz I, .L22 + daddu YY, YY, INCY + .align 3 + +.L23: + MADD t1, b1, ALPHA, a1 + MADD t2, b2, ALPHA, a2 + MADD t3, b3, ALPHA, a3 + MADD t4, b4, ALPHA, a4 + + ST t1, 0 * SIZE(YY) + daddu YY, YY, INCY + MADD t1, b5, ALPHA, a5 + + ST t2, 0 * SIZE(YY) + daddu YY, YY, INCY + MADD t2, b6, ALPHA, a6 + + ST t3, 0 * SIZE(YY) + daddu YY, YY, INCY + MADD t3, b7, ALPHA, a7 + + ST t4, 0 * SIZE(YY) + daddu YY, YY, INCY + MADD t4, b8, ALPHA, a8 + + ST t1, 0 * SIZE(YY) + daddu YY, YY, INCY + ST t2, 0 * SIZE(YY) + daddu YY, YY, INCY + ST t3, 0 * SIZE(YY) + daddu YY, YY, INCY + ST t4, 0 * SIZE(YY) + daddu YY, YY, INCY + .align 3 + +.L25: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + + MADD t1, b1, ALPHA, a1 + daddu X, X, INCX + + ST t1, 0 * SIZE(Y) + daddiu I, I, -1 + + bgtz I, .L26 + daddu Y, Y, INCY + .align 3 + +.L999: +#ifndef __64BIT__ + ldc1 $f20, 0($sp) + ldc1 $f21, 8($sp) + daddiu $sp, $sp, 16 +#endif + + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/cnrm2.S b/kernel/mips64/cnrm2.S new file mode 100644 index 0000000..dd8c210 --- /dev/null +++ b/kernel/mips64/cnrm2.S @@ -0,0 +1,214 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $2 +#define TEMP $3 + +#define a1 $f6 +#define a2 $f7 +#define a3 $f8 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 + +#define s1 $f0 +#define s2 $f1 + +#define t1 $f2 +#define t2 $f3 +#define t3 $f4 +#define t4 $f5 + + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + dmtc1 $0, s1 + li TEMP, 2 * SIZE + + blez N, .L999 + mov.d s2, s1 + + blez INCX, .L999 + dsll INCX, INCX, ZBASE_SHIFT + + dsra I, N, 2 + + blez I, .L25 + NOP + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + + LD a3, 0 * SIZE(X) + LD a4, 1 * SIZE(X) + daddu X, X, INCX + + LD a5, 0 * SIZE(X) + LD a6, 1 * SIZE(X) + + daddu X, X, INCX + cvt.d.s t1, a1 + + LD a7, 0 * SIZE(X) + cvt.d.s t2, a2 + + LD a8, 1 * SIZE(X) + cvt.d.s t3, a3 + + daddiu I, I, -1 + cvt.d.s t4, a4 + + blez I, .L24 + daddu X, X, INCX + .align 3 + +.L23: + madd.d s1, s1, t1, t1 + LD a1, 0 * SIZE(X) + + cvt.d.s t1, a5 + NOP + + madd.d s2, s2, t2, t2 + LD a2, 1 * SIZE(X) + + cvt.d.s t2, a6 + daddu X, X, INCX + + madd.d s1, s1, t3, t3 + LD a3, 0 * SIZE(X) + + cvt.d.s t3, a7 + NOP + + madd.d s2, s2, t4, t4 + LD a4, 1 * SIZE(X) + + cvt.d.s t4, a8 + daddu X, X, INCX + + madd.d s1, s1, t1, t1 + LD a5, 0 * SIZE(X) + + cvt.d.s t1, a1 + daddiu I, I, -1 + + madd.d s2, s2, t2, t2 + LD a6, 1 * SIZE(X) + + cvt.d.s t2, a2 + daddu X, X, INCX + + madd.d s1, s1, t3, t3 + LD a7, 0 * SIZE(X) + + cvt.d.s t3, a3 + LD a8, 1 * SIZE(X) + + madd.d s2, s2, t4, t4 + daddu X, X, INCX + + bgtz I, .L23 + cvt.d.s t4, a4 + .align 3 + +.L24: + madd.d s1, s1, t1, t1 + cvt.d.s t1, a5 + + madd.d s2, s2, t2, t2 + cvt.d.s t2, a6 + + madd.d s1, s1, t3, t3 + cvt.d.s t3, a7 + + madd.d s2, s2, t4, t4 + cvt.d.s t4, a8 + + madd.d s1, s1, t1, t1 + madd.d s2, s2, t2, t2 + madd.d s1, s1, t3, t3 + madd.d s2, s2, t4, t4 + .align 3 + +.L25: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddiu I, I, -1 + + cvt.d.s t1, a1 + cvt.d.s t2, a2 + + madd.d s1, s1, t1, t1 + daddu X, X, INCX + + bgtz I, .L26 + madd.d s2, s2, t2, t2 + .align 3 + +.L999: + add.d s1, s1, s2 + + sqrt.d s1, s1 + + j $31 + cvt.s.d s1, s1 + + EPILOGUE diff --git a/kernel/mips64/copy.S b/kernel/mips64/copy.S new file mode 100644 index 0000000..7942b18 --- /dev/null +++ b/kernel/mips64/copy.S @@ -0,0 +1,277 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 +#define Y $7 +#define INCY $8 + +#define I $2 +#define TEMP $3 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + li TEMP, SIZE + NOP + + blez N, .L999 + dsll INCX, INCX, BASE_SHIFT + + bne INCX, TEMP, .L20 + dsll INCY, INCY, BASE_SHIFT + + bne INCY, TEMP, .L20 + dsra I, N, 3 + + blez I, .L15 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD a3, 2 * SIZE(X) + LD a4, 3 * SIZE(X) + LD a5, 4 * SIZE(X) + LD a6, 5 * SIZE(X) + LD a7, 6 * SIZE(X) + LD a8, 7 * SIZE(X) + + blez I, .L13 + NOP + .align 3 + +.L12: + ST a1, 0 * SIZE(Y) + LD a1, 8 * SIZE(X) + + ST a2, 1 * SIZE(Y) + LD a2, 9 * SIZE(X) + + ST a3, 2 * SIZE(Y) + LD a3, 10 * SIZE(X) + + ST a4, 3 * SIZE(Y) + LD a4, 11 * SIZE(X) + + ST a5, 4 * SIZE(Y) + LD a5, 12 * SIZE(X) + + ST a6, 5 * SIZE(Y) + LD a6, 13 * SIZE(X) + + ST a7, 6 * SIZE(Y) + LD a7, 14 * SIZE(X) + + ST a8, 7 * SIZE(Y) + LD a8, 15 * SIZE(X) + + daddiu I, I, -1 + daddiu X, X, 8 * SIZE + + bgtz I, .L12 + daddiu Y, Y, 8 * SIZE + .align 3 + +.L13: + ST a1, 0 * SIZE(Y) + ST a2, 1 * SIZE(Y) + ST a3, 2 * SIZE(Y) + ST a4, 3 * SIZE(Y) + ST a5, 4 * SIZE(Y) + ST a6, 5 * SIZE(Y) + ST a7, 6 * SIZE(Y) + ST a8, 7 * SIZE(Y) + + daddiu X, X, 8 * SIZE + daddiu Y, Y, 8 * SIZE + .align 3 + +.L15: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + daddiu X, X, SIZE + + daddiu I, I, -1 + daddiu Y, Y, SIZE + + bgtz I, .L16 + ST a1, -1 * SIZE(Y) + + j .L999 + NOP + .align 3 + +.L20: + dsra I, N, 3 + + blez I, .L25 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD a2, 0 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + daddu X, X, INCX + LD a4, 0 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD a6, 0 * SIZE(X) + daddu X, X, INCX + LD a7, 0 * SIZE(X) + daddu X, X, INCX + LD a8, 0 * SIZE(X) + daddu X, X, INCX + + blez I, .L23 + NOP + .align 3 + +.L22: + ST a1, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a1, 0 * SIZE(X) + daddu X, X, INCX + + ST a2, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a2, 0 * SIZE(X) + daddu X, X, INCX + + ST a3, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a3, 0 * SIZE(X) + daddu X, X, INCX + + ST a4, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a4, 0 * SIZE(X) + daddu X, X, INCX + + ST a5, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a5, 0 * SIZE(X) + daddu X, X, INCX + + ST a6, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a6, 0 * SIZE(X) + daddu X, X, INCX + + ST a7, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a7, 0 * SIZE(X) + daddu X, X, INCX + + ST a8, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a8, 0 * SIZE(X) + + daddiu I, I, -1 + + bgtz I, .L22 + daddu X, X, INCX + .align 3 + +.L23: + ST a1, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a2, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a3, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a4, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a5, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a6, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a7, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a8, 0 * SIZE(Y) + daddu Y, Y, INCY + .align 3 + +.L25: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + daddu X, X, INCX + + daddiu I, I, -1 + ST a1, 0 * SIZE(Y) + + bgtz I, .L26 + daddu Y, Y, INCY + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/dnrm2.S b/kernel/mips64/dnrm2.S new file mode 100644 index 0000000..595eb96 --- /dev/null +++ b/kernel/mips64/dnrm2.S @@ -0,0 +1,397 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 +#define XX $7 + +#define I $2 +#define TEMP $3 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define t1 $f12 +#define t2 $f13 +#define t3 $f14 +#define t4 $f15 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + +#define ALPHA $f16 +#define max $f17 + + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + blez N, .L999 + MTC $0, s1 + + blez INCX, .L999 + dsll INCX, INCX, BASE_SHIFT + + move XX, X + NOP + + LD a1, 0 * SIZE(X) + daddiu N, N, -1 + + daddu X, X, INCX + FABS s1, a1 + + blez N, .L999 + FABS s2, a1 + + FABS s3, a1 + dsra I, N, 3 + + blez I, .L15 + FABS s4, a1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD a2, 0 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + daddu X, X, INCX + LD a4, 0 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD a6, 0 * SIZE(X) + daddu X, X, INCX + LD a7, 0 * SIZE(X) + daddu X, X, INCX + LD a8, 0 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + daddu X, X, INCX + .align 3 + +.L12: + FABS t1, a1 + LD a1, 0 * SIZE(X) + FABS t2, a2 + daddu X, X, INCX + + FABS t3, a3 + LD a2, 0 * SIZE(X) + FABS t4, a4 + daddu X, X, INCX + + CMPLT $fcc0, s1, t1 + LD a3, 0 * SIZE(X) + CMPLT $fcc1, s2, t2 + daddu X, X, INCX + + CMPLT $fcc2, s3, t3 + LD a4, 0 * SIZE(X) + CMPLT $fcc3, s4, t4 + daddu X, X, INCX + + CMOVT s1, t1, $fcc0 + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + CMOVT s4, t4, $fcc3 + + FABS t1, a5 + LD a5, 0 * SIZE(X) + FABS t2, a6 + daddu X, X, INCX + + FABS t3, a7 + LD a6, 0 * SIZE(X) + FABS t4, a8 + daddu X, X, INCX + + CMPLT $fcc0, s1, t1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, s2, t2 + daddu X, X, INCX + + CMPLT $fcc2, s3, t3 + LD a8, 0 * SIZE(X) + CMPLT $fcc3, s4, t4 + daddu X, X, INCX + + CMOVT s1, t1, $fcc0 + daddiu I, I, -1 + + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + + bgtz I, .L12 + CMOVT s4, t4, $fcc3 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + + CMOVT s1, t1, $fcc0 + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + CMOVT s4, t4, $fcc3 + + FABS t1, a5 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + + CMOVT s1, t1, $fcc0 + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + CMOVT s4, t4, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + + blez I, .L100 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + daddiu I, I, -1 + + FABS t1, a1 + + CMPLT $fcc0, s1, t1 + + CMOVT s1, t1, $fcc0 + + bgtz I, .L16 + daddu X, X, INCX + .align 3 + +.L100: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + + CMOVT s1, s2, $fcc0 + CMOVT s3, s4, $fcc1 + + CMPLT $fcc0, s1, s3 + CMOVT s1, s3, $fcc0 + + daddiu N, N, 1 + + lui TEMP, 0x3f80 + dmtc1 $0, a1 + + mtc1 TEMP, ALPHA + CMPEQ $fcc0, s1, a1 + + bc1t $fcc0, .L999 + cvt.d.s ALPHA, ALPHA + + div.d ALPHA, ALPHA, s1 + MOV max, s1 + + MOV s1, a1 + MOV s2, a1 + MOV s3, a1 + MOV s4, a1 + + dsra I, N, 3 + blez I, .L105 + NOP + + LD a1, 0 * SIZE(XX) + daddu XX, XX, INCX + + LD a2, 0 * SIZE(XX) + daddu XX, XX, INCX + + LD a3, 0 * SIZE(XX) + daddu XX, XX, INCX + + LD a4, 0 * SIZE(XX) + daddu XX, XX, INCX + + LD a5, 0 * SIZE(XX) + daddu XX, XX, INCX + + LD a6, 0 * SIZE(XX) + daddu XX, XX, INCX + + LD a7, 0 * SIZE(XX) + daddu XX, XX, INCX + + LD a8, 0 * SIZE(XX) + daddiu I, I, -1 + + blez I, .L104 + daddu XX, XX, INCX + .align 3 + +.L103: + MUL t1, ALPHA, a1 + LD a1, 0 * SIZE(XX) + MUL t2, ALPHA, a2 + daddu XX, XX, INCX + + MUL t3, ALPHA, a3 + LD a2, 0 * SIZE(XX) + MUL t4, ALPHA, a4 + daddu XX, XX, INCX + + MADD s1, s1, t1, t1 + LD a3, 0 * SIZE(XX) + MADD s2, s2, t2, t2 + daddu XX, XX, INCX + + MADD s3, s3, t3, t3 + LD a4, 0 * SIZE(XX) + MADD s4, s4, t4, t4 + daddu XX, XX, INCX + + MUL t1, ALPHA, a5 + LD a5, 0 * SIZE(XX) + MUL t2, ALPHA, a6 + daddu XX, XX, INCX + + MUL t3, ALPHA, a7 + LD a6, 0 * SIZE(XX) + MUL t4, ALPHA, a8 + daddu XX, XX, INCX + + MADD s1, s1, t1, t1 + LD a7, 0 * SIZE(XX) + MADD s2, s2, t2, t2 + daddu XX, XX, INCX + + MADD s3, s3, t3, t3 + LD a8, 0 * SIZE(XX) + MADD s4, s4, t4, t4 + daddiu I, I, -1 + + bgtz I, .L103 + daddu XX, XX, INCX + .align 3 + +.L104: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + + MADD s1, s1, t1, t1 + MADD s2, s2, t2, t2 + MADD s3, s3, t3, t3 + MADD s4, s4, t4, t4 + + MUL t1, ALPHA, a5 + MUL t2, ALPHA, a6 + MUL t3, ALPHA, a7 + MUL t4, ALPHA, a8 + + MADD s1, s1, t1, t1 + MADD s2, s2, t2, t2 + MADD s3, s3, t3, t3 + MADD s4, s4, t4, t4 + .align 3 + +.L105: + andi I, N, 7 + + blez I, .L998 + NOP + .align 3 + +.L106: + LD a1, 0 * SIZE(XX) + daddiu I, I, -1 + + MUL t1, ALPHA, a1 + + daddu XX, XX, INCX + + bgtz I, .L106 + MADD s1, s1, t1, t1 + .align 3 + +.L998: + ADD s1, s1, s2 + ADD s3, s3, s4 + + ADD s1, s1, s3 + + sqrt.d s1, s1 + + j $31 + MUL s1, max, s1 + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/dot.S b/kernel/mips64/dot.S new file mode 100644 index 0000000..b1f5991 --- /dev/null +++ b/kernel/mips64/dot.S @@ -0,0 +1,306 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 +#define Y $7 +#define INCY $8 + +#define I $2 +#define TEMP $3 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define b1 $f6 +#define b2 $f7 +#define b3 $f8 +#define b4 $f9 + +#define s1 $f0 +#define s2 $f1 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + MTC $0, s1 + MTC $0, s2 + + dsll INCX, INCX, BASE_SHIFT + li TEMP, SIZE + + blez N, .L999 + dsll INCY, INCY, BASE_SHIFT + + bne INCX, TEMP, .L20 + dsra I, N, 3 + + bne INCY, TEMP, .L20 + NOP + + blez I, .L15 + NOP + + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + + LD a2, 1 * SIZE(X) + LD b2, 1 * SIZE(Y) + + LD a3, 2 * SIZE(X) + LD b3, 2 * SIZE(Y) + + LD a4, 3 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + LD b4, 3 * SIZE(Y) + .align 3 + +.L12: + MADD s1, s1, a1, b1 + LD a1, 4 * SIZE(X) + LD b1, 4 * SIZE(Y) + + MADD s2, s2, a2, b2 + LD a2, 5 * SIZE(X) + LD b2, 5 * SIZE(Y) + + MADD s1, s1, a3, b3 + LD a3, 6 * SIZE(X) + LD b3, 6 * SIZE(Y) + + MADD s2, s2, a4, b4 + LD a4, 7 * SIZE(X) + LD b4, 7 * SIZE(Y) + + MADD s1, s1, a1, b1 + LD a1, 8 * SIZE(X) + LD b1, 8 * SIZE(Y) + + MADD s2, s2, a2, b2 + LD a2, 9 * SIZE(X) + LD b2, 9 * SIZE(Y) + + MADD s1, s1, a3, b3 + LD a3, 10 * SIZE(X) + LD b3, 10 * SIZE(Y) + + MADD s2, s2, a4, b4 + LD a4, 11 * SIZE(X) + LD b4, 11 * SIZE(Y) + + daddiu I, I, -1 + daddiu X, X, 8 * SIZE + + bgtz I, .L12 + daddiu Y, Y, 8 * SIZE + .align 3 + +.L13: + MADD s1, s1, a1, b1 + LD a1, 4 * SIZE(X) + LD b1, 4 * SIZE(Y) + + MADD s2, s2, a2, b2 + LD a2, 5 * SIZE(X) + LD b2, 5 * SIZE(Y) + + MADD s1, s1, a3, b3 + LD a3, 6 * SIZE(X) + LD b3, 6 * SIZE(Y) + + MADD s2, s2, a4, b4 + LD a4, 7 * SIZE(X) + LD b4, 7 * SIZE(Y) + + MADD s1, s1, a1, b1 + daddiu X, X, 8 * SIZE + MADD s2, s2, a2, b2 + daddiu Y, Y, 8 * SIZE + + MADD s1, s1, a3, b3 + MADD s2, s2, a4, b4 + .align 3 + +.L15: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + + MADD s1, s1, a1, b1 + + daddiu I, I, -1 + + daddiu X, X, SIZE + daddiu Y, Y, SIZE + + bgtz I, .L16 + NOP + j .L999 + NOP + .align 3 + +.L20: +#ifdef F_INTERFACE + bgez INCX, .L21 + daddiu TEMP, N, -1 + + mult TEMP, INCX + + mflo TEMP + dsub X, X, TEMP + .align 3 + +.L21: + bgez INCY, .L22 + daddiu TEMP, N, -1 + + mult TEMP, INCY + + mflo TEMP + dsub Y, Y, TEMP + .align 3 + +.L22: +#endif + blez I, .L25 + NOP + .align 3 + +.L23: + LD a1, 0 * SIZE(X) + dadd X, X, INCX + LD b1, 0 * SIZE(Y) + dadd Y, Y, INCY + + MADD s1, s1, a1, b1 + + LD a1, 0 * SIZE(X) + dadd X, X, INCX + LD b1, 0 * SIZE(Y) + dadd Y, Y, INCY + + MADD s2, s2, a1, b1 + + LD a1, 0 * SIZE(X) + dadd X, X, INCX + LD b1, 0 * SIZE(Y) + dadd Y, Y, INCY + + MADD s1, s1, a1, b1 + + LD a1, 0 * SIZE(X) + dadd X, X, INCX + LD b1, 0 * SIZE(Y) + dadd Y, Y, INCY + + MADD s2, s2, a1, b1 + + LD a1, 0 * SIZE(X) + dadd X, X, INCX + LD b1, 0 * SIZE(Y) + dadd Y, Y, INCY + + MADD s1, s1, a1, b1 + + LD a1, 0 * SIZE(X) + dadd X, X, INCX + LD b1, 0 * SIZE(Y) + dadd Y, Y, INCY + + MADD s2, s2, a1, b1 + + LD a1, 0 * SIZE(X) + dadd X, X, INCX + LD b1, 0 * SIZE(Y) + dadd Y, Y, INCY + + MADD s1, s1, a1, b1 + + LD a1, 0 * SIZE(X) + dadd X, X, INCX + LD b1, 0 * SIZE(Y) + dadd Y, Y, INCY + + daddiu I, I, -1 + + bgtz I, .L23 + MADD s2, s2, a1, b1 + .align 3 + +.L25: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + dadd X, X, INCX + LD b1, 0 * SIZE(Y) + dadd Y, Y, INCY + + daddiu I, I, -1 + + bgtz I, .L26 + MADD s1, s1, a1, b1 + .align 3 + +.L999: + j $31 + ADD s1, s1, s2 + + EPILOGUE diff --git a/kernel/mips64/gemm_beta.S b/kernel/mips64/gemm_beta.S new file mode 100644 index 0000000..2e0b241 --- /dev/null +++ b/kernel/mips64/gemm_beta.S @@ -0,0 +1,205 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define N $5 +#define C $6 +#define LDC $7 + +#define I $2 +#define J $3 + +#define CO1 $8 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define b1 $f4 +#define b2 $f5 +#define b3 $f6 +#define b4 $f7 + +#define FZERO $f8 +#define ALPHA $f15 + + PROLOGUE + + LDARG C, 0($sp) + MTC $0, FZERO + LDARG LDC, 8($sp) + + dsll LDC, LDC, BASE_SHIFT + + move J, N + blez J, .L999 + nop + .align 3 + +.L10: + move CO1, C + dsra I, M, 3 + + blez I, .L15 + daddu C, C, LDC + + LD a1, 0 * SIZE(CO1) + LD a2, 1 * SIZE(CO1) + LD a3, 2 * SIZE(CO1) + LD a4, 3 * SIZE(CO1) + + MUL b1, ALPHA, a1 + LD a1, 4 * SIZE(CO1) + + MUL b2, ALPHA, a2 + daddiu I, I, -1 + + blez I, .L13 + LD a2, 5 * SIZE(CO1) + .align 3 + +.L12: + MUL b3, ALPHA, a3 + LD a3, 6 * SIZE(CO1) + + ST b1, 0 * SIZE(CO1) + + MUL b4, ALPHA, a4 + LD a4, 7 * SIZE(CO1) + + ST b2, 1 * SIZE(CO1) + + MUL b1, ALPHA, a1 + LD a1, 8 * SIZE(CO1) + + ST b3, 2 * SIZE(CO1) + + MUL b2, ALPHA, a2 + LD a2, 9 * SIZE(CO1) + + ST b4, 3 * SIZE(CO1) + + MUL b3, ALPHA, a3 + LD a3, 10 * SIZE(CO1) + + ST b1, 4 * SIZE(CO1) + + MUL b4, ALPHA, a4 + LD a4, 11 * SIZE(CO1) + + ST b2, 5 * SIZE(CO1) + + MUL b1, ALPHA, a1 + LD a1, 12 * SIZE(CO1) + + ST b3, 6 * SIZE(CO1) + + MUL b2, ALPHA, a2 + LD a2, 13 * SIZE(CO1) + + ST b4, 7 * SIZE(CO1) + daddiu I, I, -1 + + bgtz I, .L12 + daddiu CO1, CO1, 8 * SIZE + .align 3 + +.L13: + MUL b3, ALPHA, a3 + LD a3, 6 * SIZE(CO1) + + ST b1, 0 * SIZE(CO1) + + MUL b4, ALPHA, a4 + LD a4, 7 * SIZE(CO1) + + ST b2, 1 * SIZE(CO1) + + MUL b1, ALPHA, a1 + + ST b3, 2 * SIZE(CO1) + + MUL b2, ALPHA, a2 + + ST b4, 3 * SIZE(CO1) + + MUL b3, ALPHA, a3 + + ST b1, 4 * SIZE(CO1) + + MUL b4, ALPHA, a4 + + ST b2, 5 * SIZE(CO1) + ST b3, 6 * SIZE(CO1) + ST b4, 7 * SIZE(CO1) + + daddiu CO1, CO1, 8 * SIZE + .align 3 + +.L15: + andi I, M, 7 + daddiu J, J, -1 + + blez I, .L18 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(CO1) + daddiu I, I, -1 + + MUL b1, ALPHA, a1 + daddiu CO1, CO1, 1 * SIZE + + bgtz I, .L16 + ST b1, -1 * SIZE(CO1) + .align 3 + +.L18: + bgtz J, .L10 + NOP + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/gemm_kernel.S b/kernel/mips64/gemm_kernel.S new file mode 100644 index 0000000..8ee32d5 --- /dev/null +++ b/kernel/mips64/gemm_kernel.S @@ -0,0 +1,2250 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 + +#define AO $12 +#define BO $13 + +#define I $2 +#define J $3 +#define L $7 + +#define PREFETCHSIZE (4 * 10) + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 +#define CO5 $18 +#define CO6 $19 +#define CO7 $20 +#define CO8 $21 + +#define BB $22 + +#if defined(TRMMKERNEL) +#define OFFSET $23 +#define KK $24 +#define TEMP $25 +#endif + +#define a1 $f0 +#define a2 $f1 +#define a3 $f27 +#define a4 $f28 + +#define b1 $f2 +#define b2 $f3 +#define b3 $f4 +#define b4 $f5 +#define b5 $f6 +#define b6 $f7 +#define b7 $f8 +#define b8 $f9 + +#define a5 b8 + +#define c11 $f10 +#define c12 $f11 +#define c21 $f12 +#define c22 $f13 +#define c31 $f14 +#define c32 $f16 +#define c41 $f17 +#define c42 $f18 +#define c51 $f19 +#define c52 $f20 +#define c61 $f21 +#define c62 $f22 +#define c71 $f23 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 + +#define ALPHA $f15 + + PROLOGUE + + daddiu $sp, $sp, -160 + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + SDARG $18, 16($sp) + SDARG $19, 24($sp) + SDARG $20, 32($sp) + SDARG $21, 40($sp) + SDARG $22, 48($sp) + + sdc1 $f24, 56($sp) + sdc1 $f25, 64($sp) + sdc1 $f26, 72($sp) + sdc1 $f27, 80($sp) + sdc1 $f28, 88($sp) + +#if defined(TRMMKERNEL) + SDARG $23, 96($sp) + SDARG $24, 104($sp) + SDARG $25, 112($sp) + + LDARG OFFSET, 160($sp) +#endif + +#ifndef __64BIT__ + sdc1 $f20,120($sp) + sdc1 $f21,128($sp) + sdc1 $f22,136($sp) + sdc1 $f23,144($sp) +#endif + + dsll LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + + dsra J, N, 3 + blez J, .L30 + nop + +.L10: + move CO1, C + MTC $0, c11 + daddu CO2, C, LDC + move AO, A + daddu CO3, CO2, LDC + daddiu J, J, -1 + daddu CO4, CO3, LDC + MOV c21, c11 + daddu CO5, CO4, LDC + MOV c31, c11 + daddu CO6, CO5, LDC + MOV c41, c11 + daddu CO7, CO6, LDC + MOV c51, c11 + daddu CO8, CO7, LDC + dsra I, M, 1 + daddu C, CO8, LDC + + dsll BB, K, 2 + BASE_SHIFT + daddu BB, B, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + blez I, .L20 + MOV c61, c11 + +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 3 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(BO) + MOV c81, c11 + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + + MOV c32, c11 + LD b3, 2 * SIZE(BO) + MOV c42, c11 + + LD b4, 3 * SIZE(BO) + MOV c52, c11 + LD b5, 4 * SIZE(BO) + MOV c62, c11 + + LD b6, 8 * SIZE(BO) + MOV c72, c11 + LD b7, 12 * SIZE(BO) + MOV c82, c11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 8 +#endif + dsra L, TEMP, 2 + + blez L, .L15 + NOP +#else + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(B) + MOV c81, c11 + + pref 1, 3 * SIZE(CO1) + pref 1, 3 * SIZE(CO2) + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + + dsra L, K, 2 + MOV c32, c11 + LD b3, 2 * SIZE(B) + MOV c42, c11 + + LD b4, 3 * SIZE(B) + MOV c52, c11 + LD b5, 4 * SIZE(B) + MOV c62, c11 + + LD b6, 8 * SIZE(B) + MOV c72, c11 + LD b7, 12 * SIZE(B) + MOV c82, c11 + + blez L, .L15 + move BO, B +#endif + + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + blez L, .L13 + MADD c41, c41, a1, b4 + pref 1, 2 * SIZE(CO3) + .align 3 + +.L12: + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + LD a4, 2 * SIZE(AO) + MADD c61, c61, a1, b2 + NOP + MADD c71, c71, a1, b3 + NOP + MADD c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a4, b7 + NOP + MADD c61, c61, a4, b2 + NOP + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + NOP + MADD c41, c41, a3, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + LD a4, 6 * SIZE(AO) + MADD c61, c61, a3, b2 + NOP + MADD c71, c71, a3, b3 + NOP + MADD c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + daddiu L, L, -1 + + MADD c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + bgtz L, .L12 + MADD c41, c41, a1, b4 + NOP + .align 3 + +.L13: + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + NOP + MADD c61, c61, a1, b2 + LD a4, 2 * SIZE(AO) + MADD c71, c71, a1, b3 + NOP + MADD c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + pref 1, 3 * SIZE(CO4) + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a4, b7 + pref 1, 3 * SIZE(CO5) + MADD c61, c61, a4, b2 + NOP + MADD c71, c71, a4, b3 + pref 1, 3 * SIZE(CO6) + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + pref 1, 3 * SIZE(CO7) + MADD c41, c41, a3, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + NOP + MADD c61, c61, a3, b2 + LD a4, 6 * SIZE(AO) + MADD c71, c71, a3, b3 + NOP + MADD c81, c81, a3, b4 + NOP + + MADD c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + .align 3 + +.L15: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L18 + pref 1, 3 * SIZE(CO8) + .align 3 + +.L16: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 8 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + daddiu L, L, -1 + MADD c61, c61, a1, b2 + daddiu AO, AO, 2 * SIZE + MADD c71, c71, a1, b3 + daddiu BO, BO, 8 * SIZE + MADD c81, c81, a1, b4 + LD a1, 0 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 4 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + bgtz L, .L16 + LD b4, 3 * SIZE(BO) + +.L18: +#ifndef TRMMKERNEL + LD $f0, 0 * SIZE(CO1) + daddiu CO3,CO3, 2 * SIZE + LD $f1, 1 * SIZE(CO1) + daddiu CO1,CO1, 2 * SIZE + LD $f2, 0 * SIZE(CO2) + daddiu CO4,CO4, 2 * SIZE + LD $f3, 1 * SIZE(CO2) + daddiu CO2,CO2, 2 * SIZE + + LD $f4, -2 * SIZE(CO3) + daddiu CO5,CO5, 2 * SIZE + LD $f5, -1 * SIZE(CO3) + daddiu CO6,CO6, 2 * SIZE + LD $f6, -2 * SIZE(CO4) + daddiu CO7,CO7, 2 * SIZE + LD $f7, -1 * SIZE(CO4) + daddiu I, I, -1 + + MADD c11, $f0, ALPHA, c11 + LD $f0,-2 * SIZE(CO5) + MADD c12, $f1, ALPHA, c12 + LD $f1,-1 * SIZE(CO5) + MADD c21, $f2, ALPHA, c21 + LD $f2,-2 * SIZE(CO6) + MADD c22, $f3, ALPHA, c22 + LD $f3,-1 * SIZE(CO6) + + MADD c31, $f4, ALPHA, c31 + LD $f4,-2 * SIZE(CO7) + MADD c32, $f5, ALPHA, c32 + LD $f5,-1 * SIZE(CO7) + MADD c41, $f6, ALPHA, c41 + LD $f6, 0 * SIZE(CO8) + MADD c42, $f7, ALPHA, c42 + LD $f7, 1 * SIZE(CO8) + + pref 0, 0 * SIZE(BB) + pref 0, 8 * SIZE(BB) + + ST c11, -2 * SIZE(CO1) + MTC $0, c11 + ST c12, -1 * SIZE(CO1) + daddiu CO8,CO8, 2 * SIZE + ST c21, -2 * SIZE(CO2) + MOV c21, c11 + ST c22, -1 * SIZE(CO2) + daddiu BB, BB, 16 * SIZE + + MADD c51, $f0, ALPHA, c51 + ST c31, -2 * SIZE(CO3) + MADD c52, $f1, ALPHA, c52 + ST c32, -1 * SIZE(CO3) + MADD c61, $f2, ALPHA, c61 + ST c41, -2 * SIZE(CO4) + MADD c62, $f3, ALPHA, c62 + ST c42, -1 * SIZE(CO4) + + MADD c71, $f4, ALPHA, c71 + ST c51, -2 * SIZE(CO5) + MADD c72, $f5, ALPHA, c72 + ST c52, -1 * SIZE(CO5) + MADD c81, $f6, ALPHA, c81 + ST c61, -2 * SIZE(CO6) + MADD c82, $f7, ALPHA, c82 + ST c62, -1 * SIZE(CO6) + + ST c71, -2 * SIZE(CO7) + MOV c31, c11 + ST c72, -1 * SIZE(CO7) + MOV c41, c11 + + ST c81, -2 * SIZE(CO8) + MOV c51, c11 + ST c82, -1 * SIZE(CO8) + bgtz I, .L11 + MOV c61, c11 +#else + daddiu CO4,CO4, 2 * SIZE + daddiu CO5,CO5, 2 * SIZE + daddiu CO6,CO6, 2 * SIZE + daddiu CO7,CO7, 2 * SIZE + + pref 0, 0 * SIZE(BB) + pref 0, 8 * SIZE(BB) + + MUL c11, ALPHA, c11 + daddiu CO1,CO1, 2 * SIZE + MUL c12, ALPHA, c12 + MTC $0, a1 + MUL c21, ALPHA, c21 + daddiu CO2,CO2, 2 * SIZE + MUL c22, ALPHA, c22 + daddiu CO3,CO3, 2 * SIZE + + ST c11, -2 * SIZE(CO1) + MUL c31, ALPHA, c31 + ST c12, -1 * SIZE(CO1) + MUL c32, ALPHA, c32 + ST c21, -2 * SIZE(CO2) + MUL c41, ALPHA, c41 + ST c22, -1 * SIZE(CO2) + MUL c42, ALPHA, c42 + + ST c31, -2 * SIZE(CO3) + MUL c51, ALPHA, c51 + ST c32, -1 * SIZE(CO3) + MUL c52, ALPHA, c52 + ST c41, -2 * SIZE(CO4) + MUL c61, ALPHA, c61 + ST c42, -1 * SIZE(CO4) + MUL c62, ALPHA, c62 + + ST c51, -2 * SIZE(CO5) + MUL c71, ALPHA, c71 + ST c52, -1 * SIZE(CO5) + MUL c72, ALPHA, c72 + ST c61, -2 * SIZE(CO6) + MUL c81, ALPHA, c81 + ST c62, -1 * SIZE(CO6) + MUL c82, ALPHA, c82 + + ST c71, -2 * SIZE(CO7) + MOV c11, a1 + ST c72, -1 * SIZE(CO7) + MOV c21, a1 + + daddiu CO8,CO8, 2 * SIZE + daddiu BB, BB, 16 * SIZE + + ST c81, -2 * SIZE(CO8) + MOV c31, a1 + ST c82, -1 * SIZE(CO8) + MOV c41, a1 + + daddiu I, I, -1 + MOV c51, a1 + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -8 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 3 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif + + bgtz I, .L11 + MOV c61, a1 +#endif + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 + blez I, .L29 + MOV c71, c11 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 3 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 8 +#endif + dsra L, TEMP, 2 + + blez L, .L25 + MOV c81, c11 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, K, 2 + MOV c81, c11 + + blez L, .L25 + move BO, B +#endif + .align 3 + +.L22: + MADD c11, c11, a1, b1 + LD b1, 16 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + LD b5, 20 * SIZE(BO) + MADD c61, c61, a1, b2 + LD b2, 9 * SIZE(BO) + MADD c71, c71, a1, b3 + LD b3, 10 * SIZE(BO) + MADD c81, c81, a1, b4 + LD b4, 11 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + daddiu L, L, -1 + + MADD c11, c11, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c61, c61, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c71, c71, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c81, c81, a2, b4 + LD b4, 19 * SIZE(BO) + + LD a2, 5 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD c11, c11, a3, b1 + LD b1, 32 * SIZE(BO) + MADD c21, c21, a3, b2 + LD b2, 21 * SIZE(BO) + MADD c31, c31, a3, b3 + LD b3, 22 * SIZE(BO) + MADD c41, c41, a3, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + LD b5, 36 * SIZE(BO) + MADD c61, c61, a3, b2 + LD b2, 25 * SIZE(BO) + MADD c71, c71, a3, b3 + LD b3, 26 * SIZE(BO) + MADD c81, c81, a3, b4 + LD b4, 27 * SIZE(BO) + + LD a3, 2 * SIZE(AO) + daddiu BO, BO, 32 * SIZE + + MADD c11, c11, a4, b6 + LD b6, 8 * SIZE(BO) + MADD c21, c21, a4, b2 + LD b2, -3 * SIZE(BO) + MADD c31, c31, a4, b3 + LD b3, -2 * SIZE(BO) + MADD c41, c41, a4, b4 + LD b4, -1 * SIZE(BO) + + MADD c51, c51, a4, b7 + LD b7, 12 * SIZE(BO) + MADD c61, c61, a4, b2 + LD b2, 1 * SIZE(BO) + MADD c71, c71, a4, b3 + LD b3, 2 * SIZE(BO) + MADD c81, c81, a4, b4 + LD b4, 3 * SIZE(BO) + bgtz L, .L22 + LD a4, 3 * SIZE(AO) + .align 3 + +.L25: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L28 + NOP + .align 3 + +.L26: + MADD c11, c11, a1, b1 + LD b1, 8 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + daddiu L, L, -1 + MOV a2, a2 + daddiu AO, AO, 1 * SIZE + daddiu BO, BO, 8 * SIZE + + MADD c51, c51, a1, b5 + LD b5, 4 * SIZE(BO) + MADD c61, c61, a1, b2 + LD b2, 1 * SIZE(BO) + MADD c71, c71, a1, b3 + LD b3, 2 * SIZE(BO) + MADD c81, c81, a1, b4 + LD a1, 0 * SIZE(AO) + + bgtz L, .L26 + LD b4, 3 * SIZE(BO) + +.L28: +#ifndef TRMMKERNEL + LD $f0, 0 * SIZE(CO1) + LD $f1, 0 * SIZE(CO2) + LD $f2, 0 * SIZE(CO3) + LD $f3, 0 * SIZE(CO4) + MADD c11, $f0, ALPHA, c11 + LD $f4, 0 * SIZE(CO5) + MADD c21, $f1, ALPHA, c21 + LD $f5, 0 * SIZE(CO6) + MADD c31, $f2, ALPHA, c31 + LD $f6, 0 * SIZE(CO7) + MADD c41, $f3, ALPHA, c41 + LD $f7, 0 * SIZE(CO8) + MADD c51, $f4, ALPHA, c51 + ST c11, 0 * SIZE(CO1) + MADD c61, $f5, ALPHA, c61 + ST c21, 0 * SIZE(CO2) + MADD c71, $f6, ALPHA, c71 + ST c31, 0 * SIZE(CO3) + MADD c81, $f7, ALPHA, c81 + ST c41, 0 * SIZE(CO4) + ST c51, 0 * SIZE(CO5) + ST c61, 0 * SIZE(CO6) + ST c71, 0 * SIZE(CO7) + ST c81, 0 * SIZE(CO8) +#else + MUL c11, ALPHA, c11 + MUL c21, ALPHA, c21 + MUL c31, ALPHA, c31 + MUL c41, ALPHA, c41 + + ST c11, 0 * SIZE(CO1) + MUL c51, ALPHA, c51 + ST c21, 0 * SIZE(CO2) + MUL c61, ALPHA, c61 + ST c31, 0 * SIZE(CO3) + MUL c71, ALPHA, c71 + ST c41, 0 * SIZE(CO4) + MUL c81, ALPHA, c81 + + ST c51, 0 * SIZE(CO5) + ST c61, 0 * SIZE(CO6) + ST c71, 0 * SIZE(CO7) + ST c81, 0 * SIZE(CO8) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -8 +#endif + + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 3 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + .align 3 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 8 +#endif + + bgtz J, .L10 + move B, BO + .align 3 + +.L30: + andi J, N, 4 + blez J, .L50 + move AO, A + + move CO1, C + MTC $0, c11 + daddu CO2, C, LDC + daddu CO3, CO2, LDC + daddu CO4, CO3, LDC + MOV c21, c11 + daddu C, CO4, LDC + MOV c31, c11 + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + dsra I, M, 1 + blez I, .L40 + MOV c41, c11 + +.L31: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + LD a3, 4 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + LD b3, 2 * SIZE(BO) + MOV c32, c11 + LD b4, 3 * SIZE(BO) + MOV c42, c11 + + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 4 +#endif + dsra L, TEMP, 2 + blez L, .L35 + NOP +#else + LD a1, 0 * SIZE(AO) + LD a3, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + MOV c32, c11 + LD b4, 3 * SIZE(B) + MOV c42, c11 + + LD b5, 4 * SIZE(B) + dsra L, K, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L35 + move BO, B +#endif + .align 3 + +.L32: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + LD a1, 2 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c11, c11, a1, b5 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c12, c12, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a3, b6 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + NOP + MADD c41, c41, a3, b4 + LD a3, 6 * SIZE(AO) + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c11, c11, a3, b7 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a3, b2 + daddiu AO, AO, 8 * SIZE + MADD c31, c31, a3, b3 + daddiu BO, BO, 16 * SIZE + MADD c41, c41, a3, b4 + LD a3, 4 * SIZE(AO) + + MADD c12, c12, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c42, c42, a2, b4 + NOP + + bgtz L, .L32 + LD b4, 3 * SIZE(BO) + .align 3 + +.L35: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L38 + NOP + .align 3 + +.L36: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + daddiu AO, AO, 2 * SIZE + MADD c41, c41, a1, b4 + LD a1, 0 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 4 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + bgtz L, .L36 + daddiu BO, BO, 4 * SIZE + +.L38: +#ifndef TRMMKERNEL + LD $f0, 0 * SIZE(CO1) + daddiu CO3,CO3, 2 * SIZE + LD $f1, 1 * SIZE(CO1) + daddiu CO1,CO1, 2 * SIZE + LD $f2, 0 * SIZE(CO2) + daddiu CO4,CO4, 2 * SIZE + LD $f3, 1 * SIZE(CO2) + daddiu CO2,CO2, 2 * SIZE + + LD $f4, -2 * SIZE(CO3) + MADD c11, $f0, ALPHA, c11 + LD $f5, -1 * SIZE(CO3) + MADD c12, $f1, ALPHA, c12 + LD $f6, -2 * SIZE(CO4) + MADD c21, $f2, ALPHA, c21 + LD $f7, -1 * SIZE(CO4) + MADD c22, $f3, ALPHA, c22 + + MADD c31, $f4, ALPHA, c31 + ST c11, -2 * SIZE(CO1) + MADD c32, $f5, ALPHA, c32 + ST c12, -1 * SIZE(CO1) + MADD c41, $f6, ALPHA, c41 + ST c21, -2 * SIZE(CO2) + MADD c42, $f7, ALPHA, c42 + ST c22, -1 * SIZE(CO2) + + ST c31, -2 * SIZE(CO3) + MTC $0, c11 + ST c32, -1 * SIZE(CO3) + daddiu I, I, -1 + ST c41, -2 * SIZE(CO4) + MOV c21, c11 + ST c42, -1 * SIZE(CO4) + MOV c31, c11 +#else + MUL c11, ALPHA, c11 + daddiu CO3,CO3, 2 * SIZE + MUL c12, ALPHA, c12 + daddiu CO1,CO1, 2 * SIZE + MUL c21, ALPHA, c21 + daddiu CO4,CO4, 2 * SIZE + MUL c22, ALPHA, c22 + daddiu CO2,CO2, 2 * SIZE + + ST c11, -2 * SIZE(CO1) + MUL c31, ALPHA, c31 + ST c12, -1 * SIZE(CO1) + MUL c32, ALPHA, c32 + ST c21, -2 * SIZE(CO2) + MUL c41, ALPHA, c41 + ST c22, -1 * SIZE(CO2) + MUL c42, ALPHA, c42 + + ST c31, -2 * SIZE(CO3) + MTC $0, c11 + ST c32, -1 * SIZE(CO3) + daddiu I, I, -1 + ST c41, -2 * SIZE(CO4) + MOV c21, c11 + ST c42, -1 * SIZE(CO4) + MOV c31, c11 + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -4 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + bgtz I, .L31 + MOV c41, c11 + .align 3 + +.L40: + andi I, M, 1 + blez I, .L49 + MOV c61, c11 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD a2, 1 * SIZE(AO) + MOV c81, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 4 +#endif + dsra L, TEMP, 2 + + blez L, .L45 + NOP +#else + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD a2, 1 * SIZE(AO) + MOV c81, c11 + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, K, 2 + + blez L, .L45 + move BO, B +#endif + .align 3 + +.L42: + MADD c11, c11, a1, b1 + LD b1, 16 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + daddiu L, L, -1 + + MADD c11, c11, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 11 * SIZE(BO) + + LD a2, 2 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD c11, c11, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 15 * SIZE(BO) + + LD a2, -1 * SIZE(AO) + daddiu BO, BO, 16 * SIZE + + MADD c11, c11, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 3 * SIZE(BO) + + bgtz L, .L42 + LD a2, 1 * SIZE(AO) + .align 3 + +.L45: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L48 + NOP + .align 3 + +.L46: + MADD c11, c11, a1, b1 + LD b1, 4 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD a1, 1 * SIZE(AO) + + LD b4, 7 * SIZE(BO) + daddiu L, L, -1 + + daddiu AO, AO, 1 * SIZE + MOV a2, a2 + bgtz L, .L46 + daddiu BO, BO, 4 * SIZE + + +.L48: +#ifndef TRMMKERNEL + LD $f0, 0 * SIZE(CO1) + LD $f1, 0 * SIZE(CO2) + LD $f2, 0 * SIZE(CO3) + LD $f3, 0 * SIZE(CO4) + + MADD c11, $f0, ALPHA, c11 + MADD c21, $f1, ALPHA, c21 + MADD c31, $f2, ALPHA, c31 + MADD c41, $f3, ALPHA, c41 + + ST c11, 0 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c31, 0 * SIZE(CO3) + ST c41, 0 * SIZE(CO4) +#else + MUL c11, ALPHA, c11 + MUL c21, ALPHA, c21 + MUL c31, ALPHA, c31 + MUL c41, ALPHA, c41 + + ST c11, 0 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c31, 0 * SIZE(CO3) + ST c41, 0 * SIZE(CO4) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -4 +#endif + + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + .align 3 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 4 +#endif + move B, BO + .align 3 + +.L50: + andi J, N, 2 + blez J, .L70 + + move AO, A + move CO1, C + daddu CO2, C, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + dsra I, M, 1 + blez I, .L60 + daddu C, CO2, LDC + +.L51: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + LD b3, 2 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 2 +#endif + dsra L, TEMP, 2 + blez L, .L55 + NOP +#else + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + LD b5, 4 * SIZE(B) + dsra L, K, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L55 + move BO, B +#endif + .align 3 + +.L52: + MADD c11, c11, a1, b1 + LD a3, 2 * SIZE(AO) + MADD c21, c21, a1, b2 + LD b4, 3 * SIZE(BO) + MADD c12, c12, a2, b1 + LD a4, 3 * SIZE(AO) + MADD c22, c22, a2, b2 + LD b1, 8 * SIZE(BO) + + MADD c11, c11, a3, b3 + LD a1, 8 * SIZE(AO) + MADD c21, c21, a3, b4 + LD b2, 5 * SIZE(BO) + MADD c12, c12, a4, b3 + LD a2, 5 * SIZE(AO) + MADD c22, c22, a4, b4 + LD b3, 6 * SIZE(BO) + + MADD c11, c11, a5, b5 + LD a3, 6 * SIZE(AO) + MADD c21, c21, a5, b2 + LD b4, 7 * SIZE(BO) + MADD c12, c12, a2, b5 + LD a4, 7 * SIZE(AO) + MADD c22, c22, a2, b2 + LD b5, 12 * SIZE(BO) + + MADD c11, c11, a3, b3 + LD a5, 12 * SIZE(AO) + MADD c21, c21, a3, b4 + LD b2, 9 * SIZE(BO) + MADD c12, c12, a4, b3 + LD a2, 9 * SIZE(AO) + MADD c22, c22, a4, b4 + LD b3, 10 * SIZE(BO) + + daddiu AO, AO, 8 * SIZE + daddiu L, L, -1 + bgtz L, .L52 + daddiu BO, BO, 8 * SIZE + .align 3 + +.L55: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L58 + NOP + .align 3 + +.L56: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + LD a1, 2 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 2 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 3 * SIZE(BO) + + daddiu L, L, -1 + daddiu AO, AO, 2 * SIZE + bgtz L, .L56 + daddiu BO, BO, 2 * SIZE + +.L58: +#ifndef TRMMKERNEL + LD $f0, 0 * SIZE(CO1) + daddiu I, I, -1 + LD $f1, 1 * SIZE(CO1) + daddiu CO1,CO1, 2 * SIZE + LD $f2, 0 * SIZE(CO2) + NOP + LD $f3, 1 * SIZE(CO2) + daddiu CO2,CO2, 2 * SIZE + + MADD c11, $f0, ALPHA, c11 + MADD c12, $f1, ALPHA, c12 + MADD c21, $f2, ALPHA, c21 + MADD c22, $f3, ALPHA, c22 + + ST c11, -2 * SIZE(CO1) + ST c12, -1 * SIZE(CO1) + ST c21, -2 * SIZE(CO2) + NOP + bgtz I, .L51 + ST c22, -1 * SIZE(CO2) +#else + daddiu I, I, -1 + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + + MUL c11, ALPHA, c11 + MUL c12, ALPHA, c12 + MUL c21, ALPHA, c21 + MUL c22, ALPHA, c22 + + ST c11, -2 * SIZE(CO1) + ST c12, -1 * SIZE(CO1) + ST c21, -2 * SIZE(CO2) + ST c22, -1 * SIZE(CO2) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif + + bgtz I, .L51 + NOP +#endif + .align 3 + +.L60: + andi I, M, 1 + blez I, .L69 + NOP + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c31, c11 + LD a4, 3 * SIZE(AO) + MOV c41, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 2 +#endif + dsra L, TEMP, 2 + blez L, .L65 + NOP +#else + dsra L, K, 2 + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c31, c11 + LD a4, 3 * SIZE(AO) + MOV c41, c11 + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L65 + move BO, B +#endif + .align 3 + +.L62: + MADD c11, c11, a1, b1 + LD b1, 4 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 7 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + LD a2, 5 * SIZE(AO) + + MADD c11, c11, a3, b1 + LD b1, 8 * SIZE(BO) + MADD c21, c21, a3, b2 + LD b2, 9 * SIZE(BO) + MADD c31, c31, a4, b3 + LD b3, 10 * SIZE(BO) + MADD c41, c41, a4, b4 + LD b4, 11 * SIZE(BO) + + LD a3, 6 * SIZE(AO) + LD a4, 7 * SIZE(AO) + + daddiu L, L, -1 + daddiu AO, AO, 4 * SIZE + + bgtz L, .L62 + daddiu BO, BO, 8 * SIZE + .align 3 + +.L65: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L68 + NOP + .align 3 + +.L66: + MADD c11, c11, a1, b1 + LD b1, 2 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 3 * SIZE(BO) + + LD a1, 1 * SIZE(AO) + daddiu L, L, -1 + + daddiu AO, AO, 1 * SIZE + bgtz L, .L66 + daddiu BO, BO, 2 * SIZE + + +.L68: +#ifndef TRMMKERNEL + LD $f0, 0 * SIZE(CO1) + LD $f1, 0 * SIZE(CO2) + + ADD c11, c11, c31 + ADD c21, c21, c41 + + MADD c11, $f0, ALPHA, c11 + MADD c21, $f1, ALPHA, c21 + + ST c11, 0 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) +#else + ADD c11, c11, c31 + ADD c21, c21, c41 + + MUL c11, ALPHA, c11 + MUL c21, ALPHA, c21 + + ST c11, 0 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + .align 3 + +.L69: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 2 +#endif + move B, BO + .align 3 + +.L70: + andi J, N, 1 + blez J, .L999 + + move AO, A + move CO1, C + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + dsra I, M, 1 + blez I, .L80 + daddu C, CO1, LDC + +.L71: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + LD b3, 2 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 2 + blez L, .L75 + NOP +#else + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + LD b5, 4 * SIZE(B) + dsra L, K, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L75 + move BO, B +#endif + .align 3 + +.L72: + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 2 * SIZE(AO) + LD a2, 3 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 4 * SIZE(AO) + LD a2, 5 * SIZE(AO) + LD b1, 2 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD b1, 3 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + daddiu L, L, -1 + daddiu AO, AO, 8 * SIZE + bgtz L, .L72 + daddiu BO, BO, 4 * SIZE + .align 3 + +.L75: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L78 + NOP + .align 3 + +.L76: + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + daddiu L, L, -1 + daddiu AO, AO, 2 * SIZE + bgtz L, .L76 + daddiu BO, BO, 1 * SIZE + +.L78: +#ifndef TRMMKERNEL + LD $f0, 0 * SIZE(CO1) + daddiu I, I, -1 + LD $f1, 1 * SIZE(CO1) + daddiu CO1,CO1, 2 * SIZE + + ADD c11, c11, c21 + ADD c12, c12, c22 + + MADD c11, $f0, ALPHA, c11 + MADD c12, $f1, ALPHA, c12 + + ST c11, -2 * SIZE(CO1) + bgtz I, .L71 + ST c12, -1 * SIZE(CO1) +#else + ADD c11, c11, c21 + daddiu I, I, -1 + ADD c12, c12, c22 + daddiu CO1,CO1, 2 * SIZE + + MUL c11, ALPHA, c11 + MUL c12, ALPHA, c12 + + ST c11, -2 * SIZE(CO1) + ST c12, -1 * SIZE(CO1) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif + + bgtz I, .L71 + NOP +#endif + .align 3 + +.L80: + andi I, M, 1 + blez I, .L89 + NOP + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 2 + blez L, .L85 + NOP +#else + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, K, 2 + blez L, .L85 + move BO, B +#endif + .align 3 + +.L82: + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + + LD a1, 1 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + MADD c21, c21, a1, b1 + + LD a1, 2 * SIZE(AO) + LD b1, 2 * SIZE(BO) + + MADD c11, c11, a1, b1 + + LD a1, 3 * SIZE(AO) + LD b1, 3 * SIZE(BO) + + MADD c21, c21, a1, b1 + + daddiu L, L, -1 + daddiu AO, AO, 4 * SIZE + bgtz L, .L82 + daddiu BO, BO, 4 * SIZE + .align 3 + +.L85: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L88 + NOP + .align 3 + +.L86: + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + + daddiu L, L, -1 + daddiu AO, AO, 1 * SIZE + bgtz L, .L86 + daddiu BO, BO, 1 * SIZE + + +.L88: +#ifndef TRMMKERNEL + LD $f0, 0 * SIZE(CO1) + + ADD c11, c11, c21 + MADD c11, $f0, ALPHA, c11 + + ST c11, 0 * SIZE(CO1) +#else + ADD c11, c11, c21 + MUL c11, ALPHA, c11 + + ST c11, 0 * SIZE(CO1) +#endif + .align 3 + +.L89: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 1 +#endif + move B, BO + .align 3 + + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + LDARG $18, 16($sp) + LDARG $19, 24($sp) + LDARG $20, 32($sp) + LDARG $21, 40($sp) + LDARG $22, 48($sp) + + ldc1 $f24, 56($sp) + ldc1 $f25, 64($sp) + ldc1 $f26, 72($sp) + ldc1 $f27, 80($sp) + ldc1 $f28, 88($sp) + +#if defined(TRMMKERNEL) + LDARG $23, 96($sp) + LDARG $24, 104($sp) + LDARG $25, 112($sp) +#endif + +#ifndef __64BIT__ + ldc1 $f20,120($sp) + ldc1 $f21,128($sp) + ldc1 $f22,136($sp) + ldc1 $f23,144($sp) +#endif + + j $31 + daddiu $sp, $sp, 160 + + EPILOGUE diff --git a/kernel/mips64/gemv_n.S b/kernel/mips64/gemv_n.S new file mode 100644 index 0000000..908f973 --- /dev/null +++ b/kernel/mips64/gemv_n.S @@ -0,0 +1,665 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define N $5 +#define A $8 +#define LDA $9 +#define X $10 +#define INCX $11 +#define Y $2 +#define INCY $6 +#define BUFFER $7 + +#define YORIG $3 +#define XX $12 +#define YY $13 + +#define I $14 +#define J $15 + +#define AO1 $16 +#define AO2 $17 + +#define ALPHA $f15 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + +#define x1 $f8 +#define x2 $f9 + +#define y1 $f10 +#define y2 $f11 +#define y3 $f12 +#define y4 $f13 +#define y5 $f14 +#define y6 $f16 +#define y7 $f17 +#define y8 $f18 + +#define t1 $f19 +#define t2 $f20 +#define t3 $f21 +#define t4 $f22 + + + PROLOGUE + + LDARG Y, 0($sp) + LDARG INCY, 8($sp) + LDARG BUFFER, 16($sp) +#ifdef __64BIT__ + daddiu $sp, $sp, -16 +#else + daddiu $sp, $sp, -48 +#endif + + SDARG $16, 0($sp) + + SDARG $17, 8($sp) + dsll LDA, LDA, BASE_SHIFT + +#ifndef __64BIT__ + sdc1 $f20, 16($sp) + sdc1 $f21, 24($sp) + sdc1 $f22, 32($sp) +#endif + + blez M, .L999 + dsll INCX, INCX, BASE_SHIFT + + blez N, .L999 + dsll INCY, INCY, BASE_SHIFT + + li YORIG, SIZE + + beq INCY, YORIG, .L10 + move YORIG, Y + + dsra I, M, 2 + move YORIG, BUFFER + + move XX, Y + + blez I, .L05 + move YY, BUFFER + .align 3 + +.L02: + LD a1, 0 * SIZE(XX) + daddu XX, XX, INCY + LD a2, 0 * SIZE(XX) + daddu XX, XX, INCY + LD a3, 0 * SIZE(XX) + daddu XX, XX, INCY + LD a4, 0 * SIZE(XX) + daddu XX, XX, INCY + + ST a1, 0 * SIZE(YY) + ST a2, 1 * SIZE(YY) + ST a3, 2 * SIZE(YY) + ST a4, 3 * SIZE(YY) + daddiu I, I, -1 + + bgtz I, .L02 + daddiu YY, YY, 4 * SIZE + .align 3 + +.L05: + andi I, M, 3 + blez I, .L10 + NOP + .align 3 + +.L06: + LD a1, 0 * SIZE(XX) + daddu XX, XX, INCY + + ST a1, 0 * SIZE(YY) + daddiu I, I, -1 + + bgtz I, .L06 + daddiu YY, YY, 1 * SIZE + .align 3 + +.L10: + dsra J, N, 1 + blez J, .L20 + NOP + .align 3 + +.L11: + LD x1, 0 * SIZE(X) + daddu X, X, INCX + LD x2, 0 * SIZE(X) + daddu X, X, INCX + + move AO1, A + daddu AO2, A, LDA + daddu A, AO2, LDA + + move YY, YORIG + MUL x1, ALPHA, x1 + + dsra I, M, 3 + blez I, .L15 + MUL x2, ALPHA, x2 + + LD a1, 0 * SIZE(AO1) + LD y1, 0 * SIZE(YY) + LD a2, 1 * SIZE(AO1) + LD y2, 1 * SIZE(YY) + + LD a3, 2 * SIZE(AO1) + LD y3, 2 * SIZE(YY) + LD a4, 3 * SIZE(AO1) + LD y4, 3 * SIZE(YY) + + LD a5, 0 * SIZE(AO2) + LD y5, 4 * SIZE(YY) + LD a6, 1 * SIZE(AO2) + LD y6, 5 * SIZE(YY) + + LD a7, 2 * SIZE(AO2) + LD y7, 6 * SIZE(YY) + LD a8, 3 * SIZE(AO2) + daddiu I, I, -1 + + blez I, .L13 + LD y8, 7 * SIZE(YY) + .align 3 + +.L12: + MADD t1, y1, x1, a1 + LD a1, 4 * SIZE(AO1) + MADD t2, y2, x1, a2 + LD a2, 5 * SIZE(AO1) + + LD y1, 8 * SIZE(YY) + LD y2, 9 * SIZE(YY) + + MADD t3, y3, x1, a3 + LD a3, 6 * SIZE(AO1) + MADD t4, y4, x1, a4 + LD a4, 7 * SIZE(AO1) + + LD y3, 10 * SIZE(YY) + LD y4, 11 * SIZE(YY) + + MADD t1, t1, x2, a5 + LD a5, 4 * SIZE(AO2) + MADD t2, t2, x2, a6 + LD a6, 5 * SIZE(AO2) + MADD t3, t3, x2, a7 + LD a7, 6 * SIZE(AO2) + MADD t4, t4, x2, a8 + LD a8, 7 * SIZE(AO2) + + ST t1, 0 * SIZE(YY) + ST t2, 1 * SIZE(YY) + ST t3, 2 * SIZE(YY) + ST t4, 3 * SIZE(YY) + + MADD t1, y5, x1, a1 + LD a1, 8 * SIZE(AO1) + MADD t2, y6, x1, a2 + LD a2, 9 * SIZE(AO1) + + LD y5, 12 * SIZE(YY) + LD y6, 13 * SIZE(YY) + + MADD t3, y7, x1, a3 + LD a3, 10 * SIZE(AO1) + MADD t4, y8, x1, a4 + LD a4, 11 * SIZE(AO1) + + LD y7, 14 * SIZE(YY) + LD y8, 15 * SIZE(YY) + + MADD t1, t1, x2, a5 + LD a5, 8 * SIZE(AO2) + MADD t2, t2, x2, a6 + LD a6, 9 * SIZE(AO2) + MADD t3, t3, x2, a7 + LD a7, 10 * SIZE(AO2) + MADD t4, t4, x2, a8 + LD a8, 11 * SIZE(AO2) + + ST t1, 4 * SIZE(YY) + ST t2, 5 * SIZE(YY) + ST t3, 6 * SIZE(YY) + ST t4, 7 * SIZE(YY) + + daddiu I, I, -1 + daddiu YY, YY, 8 * SIZE + + daddiu AO1, AO1, 8 * SIZE + bgtz I, .L12 + daddiu AO2, AO2, 8 * SIZE + .align 3 + +.L13: + MADD t1, y1, x1, a1 + LD a1, 4 * SIZE(AO1) + MADD t2, y2, x1, a2 + LD a2, 5 * SIZE(AO1) + MADD t3, y3, x1, a3 + LD a3, 6 * SIZE(AO1) + MADD t4, y4, x1, a4 + LD a4, 7 * SIZE(AO1) + + MADD t1, t1, x2, a5 + LD a5, 4 * SIZE(AO2) + MADD t2, t2, x2, a6 + LD a6, 5 * SIZE(AO2) + MADD t3, t3, x2, a7 + LD a7, 6 * SIZE(AO2) + MADD t4, t4, x2, a8 + LD a8, 7 * SIZE(AO2) + + ST t1, 0 * SIZE(YY) + MADD t1, y5, x1, a1 + ST t2, 1 * SIZE(YY) + MADD t2, y6, x1, a2 + ST t3, 2 * SIZE(YY) + MADD t3, y7, x1, a3 + ST t4, 3 * SIZE(YY) + MADD t4, y8, x1, a4 + + MADD t1, t1, x2, a5 + daddiu AO1, AO1, 8 * SIZE + MADD t2, t2, x2, a6 + daddiu AO2, AO2, 8 * SIZE + MADD t3, t3, x2, a7 + daddiu YY, YY, 8 * SIZE + MADD t4, t4, x2, a8 + NOP + + ST t1, -4 * SIZE(YY) + ST t2, -3 * SIZE(YY) + ST t3, -2 * SIZE(YY) + ST t4, -1 * SIZE(YY) + .align 3 + +.L15: + andi I, M, 4 + NOP + blez I, .L16 + NOP + + LD a1, 0 * SIZE(AO1) + LD y1, 0 * SIZE(YY) + LD a2, 1 * SIZE(AO1) + LD y2, 1 * SIZE(YY) + + LD a3, 2 * SIZE(AO1) + LD y3, 2 * SIZE(YY) + LD a4, 3 * SIZE(AO1) + LD y4, 3 * SIZE(YY) + + LD a5, 0 * SIZE(AO2) + MADD y1, y1, x1, a1 + LD a6, 1 * SIZE(AO2) + MADD y2, y2, x1, a2 + LD a7, 2 * SIZE(AO2) + MADD y3, y3, x1, a3 + LD a8, 3 * SIZE(AO2) + MADD y4, y4, x1, a4 + + MADD y1, y1, x2, a5 + daddiu YY, YY, 4 * SIZE + MADD y2, y2, x2, a6 + daddiu AO1, AO1, 4 * SIZE + MADD y3, y3, x2, a7 + daddiu AO2, AO2, 4 * SIZE + MADD y4, y4, x2, a8 + + ST y1, -4 * SIZE(YY) + ST y2, -3 * SIZE(YY) + ST y3, -2 * SIZE(YY) + ST y4, -1 * SIZE(YY) + .align 3 + +.L16: + andi I, M, 2 + NOP + blez I, .L17 + NOP + + LD a1, 0 * SIZE(AO1) + LD y1, 0 * SIZE(YY) + LD a2, 1 * SIZE(AO1) + LD y2, 1 * SIZE(YY) + + LD a5, 0 * SIZE(AO2) + LD a6, 1 * SIZE(AO2) + + MADD y1, y1, x1, a1 + NOP + MADD y2, y2, x1, a2 + daddiu YY, YY, 2 * SIZE + MADD y1, y1, x2, a5 + daddiu AO1, AO1, 2 * SIZE + MADD y2, y2, x2, a6 + daddiu AO2, AO2, 2 * SIZE + + ST y1, -2 * SIZE(YY) + ST y2, -1 * SIZE(YY) + .align 3 + +.L17: + andi I, M, 1 + NOP + blez I, .L19 + NOP + + LD y1, 0 * SIZE(YY) + LD a1, 0 * SIZE(AO1) + LD a5, 0 * SIZE(AO2) + + MADD y1, y1, x1, a1 + MADD y1, y1, x2, a5 + + ST y1, 0 * SIZE(YY) + .align 3 + + +.L19: + daddiu J, J, -1 + + bgtz J, .L11 + NOP + .align 3 + +.L20: + andi J, N, 1 + blez J, .L900 + NOP + .align 3 + +.L21: + LD x1, 0 * SIZE(X) + daddu X, X, INCX + + move YY, YORIG + move AO1, A + + dsra I, M, 3 + blez I, .L25 + MUL x1, ALPHA, x1 + + + LD a1, 0 * SIZE(AO1) + LD y1, 0 * SIZE(YY) + LD a2, 1 * SIZE(AO1) + LD y2, 1 * SIZE(YY) + + LD a3, 2 * SIZE(AO1) + LD y3, 2 * SIZE(YY) + LD a4, 3 * SIZE(AO1) + LD y4, 3 * SIZE(YY) + + LD y5, 4 * SIZE(YY) + LD y6, 5 * SIZE(YY) + + LD y7, 6 * SIZE(YY) + daddiu I, I, -1 + + blez I, .L23 + LD y8, 7 * SIZE(YY) + .align 3 + +.L22: + MADD t1, y1, x1, a1 + LD a1, 4 * SIZE(AO1) + MADD t2, y2, x1, a2 + LD a2, 5 * SIZE(AO1) + + LD y1, 8 * SIZE(YY) + LD y2, 9 * SIZE(YY) + + MADD t3, y3, x1, a3 + LD a3, 6 * SIZE(AO1) + MADD t4, y4, x1, a4 + LD a4, 7 * SIZE(AO1) + + LD y3, 10 * SIZE(YY) + LD y4, 11 * SIZE(YY) + + ST t1, 0 * SIZE(YY) + ST t2, 1 * SIZE(YY) + ST t3, 2 * SIZE(YY) + ST t4, 3 * SIZE(YY) + + MADD t1, y5, x1, a1 + LD a1, 8 * SIZE(AO1) + MADD t2, y6, x1, a2 + LD a2, 9 * SIZE(AO1) + + LD y5, 12 * SIZE(YY) + LD y6, 13 * SIZE(YY) + + MADD t3, y7, x1, a3 + LD a3, 10 * SIZE(AO1) + MADD t4, y8, x1, a4 + LD a4, 11 * SIZE(AO1) + + LD y7, 14 * SIZE(YY) + LD y8, 15 * SIZE(YY) + + ST t1, 4 * SIZE(YY) + ST t2, 5 * SIZE(YY) + ST t3, 6 * SIZE(YY) + ST t4, 7 * SIZE(YY) + + daddiu I, I, -1 + daddiu YY, YY, 8 * SIZE + + bgtz I, .L22 + daddiu AO1, AO1, 8 * SIZE + .align 3 + +.L23: + MADD t1, y1, x1, a1 + LD a1, 4 * SIZE(AO1) + MADD t2, y2, x1, a2 + LD a2, 5 * SIZE(AO1) + MADD t3, y3, x1, a3 + LD a3, 6 * SIZE(AO1) + MADD t4, y4, x1, a4 + LD a4, 7 * SIZE(AO1) + + ST t1, 0 * SIZE(YY) + MADD t1, y5, x1, a1 + ST t2, 1 * SIZE(YY) + MADD t2, y6, x1, a2 + ST t3, 2 * SIZE(YY) + MADD t3, y7, x1, a3 + ST t4, 3 * SIZE(YY) + MADD t4, y8, x1, a4 + + ST t1, 4 * SIZE(YY) + ST t2, 5 * SIZE(YY) + ST t3, 6 * SIZE(YY) + ST t4, 7 * SIZE(YY) + + daddiu AO1, AO1, 8 * SIZE + daddiu YY, YY, 8 * SIZE + .align 3 + +.L25: + andi I, M, 4 + NOP + blez I, .L26 + NOP + + LD a1, 0 * SIZE(AO1) + LD y1, 0 * SIZE(YY) + LD a2, 1 * SIZE(AO1) + LD y2, 1 * SIZE(YY) + + LD a3, 2 * SIZE(AO1) + LD y3, 2 * SIZE(YY) + LD a4, 3 * SIZE(AO1) + LD y4, 3 * SIZE(YY) + + MADD y1, y1, x1, a1 + MADD y2, y2, x1, a2 + + MADD y3, y3, x1, a3 + daddiu YY, YY, 4 * SIZE + MADD y4, y4, x1, a4 + daddiu AO1, AO1, 4 * SIZE + + ST y1, -4 * SIZE(YY) + ST y2, -3 * SIZE(YY) + ST y3, -2 * SIZE(YY) + ST y4, -1 * SIZE(YY) + .align 3 + +.L26: + andi I, M, 2 + NOP + blez I, .L27 + NOP + + LD a1, 0 * SIZE(AO1) + LD y1, 0 * SIZE(YY) + LD a2, 1 * SIZE(AO1) + LD y2, 1 * SIZE(YY) + + MADD y1, y1, x1, a1 + daddiu YY, YY, 2 * SIZE + MADD y2, y2, x1, a2 + daddiu AO1, AO1, 2 * SIZE + + ST y1, -2 * SIZE(YY) + ST y2, -1 * SIZE(YY) + .align 3 + +.L27: + andi I, M, 1 + NOP + blez I, .L900 + NOP + + LD y1, 0 * SIZE(YY) + LD a1, 0 * SIZE(AO1) + + MADD y1, y1, x1, a1 + + ST y1, 0 * SIZE(YY) + .align 3 + + +.L900: + li YORIG, SIZE + + beq INCY, YORIG, .L999 + dsra I, M, 2 + + blez I, .L905 + move XX, BUFFER + .align 3 + +.L902: + LD a1, 0 * SIZE(XX) + LD a2, 1 * SIZE(XX) + LD a3, 2 * SIZE(XX) + LD a4, 3 * SIZE(XX) + + ST a1, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a2, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a3, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a4, 0 * SIZE(Y) + daddu Y, Y, INCY + + daddiu I, I, -1 + + bgtz I, .L902 + daddiu XX, XX, 4 * SIZE + .align 3 + +.L905: + andi I, M, 3 + blez I, .L999 + NOP + .align 3 + +.L906: + LD a1, 0 * SIZE(XX) + daddiu XX, XX, 1 * SIZE + + ST a1, 0 * SIZE(Y) + daddiu I, I, -1 + + bgtz I, .L906 + daddu Y, Y, INCY + .align 3 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + +#ifndef __64BIT__ + ldc1 $f20, 16($sp) + ldc1 $f21, 24($sp) + ldc1 $f22, 32($sp) +#endif + + j $31 +#ifdef __64BIT__ + daddiu $sp, $sp, 16 +#else + daddiu $sp, $sp, 48 +#endif + + EPILOGUE diff --git a/kernel/mips64/gemv_t.S b/kernel/mips64/gemv_t.S new file mode 100644 index 0000000..2808756 --- /dev/null +++ b/kernel/mips64/gemv_t.S @@ -0,0 +1,531 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define N $5 +#define A $8 +#define LDA $9 +#define X $10 +#define INCX $11 +#define Y $2 +#define INCY $6 +#define BUFFER $7 + +#define XORIG $3 +#define XX $12 +#define YY $13 + +#define I $14 +#define J $15 + +#define AO1 $16 +#define AO2 $17 + +#define ALPHA $f15 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + +#define y1 $f8 +#define y2 $f9 +#define y3 $f10 +#define y4 $f11 + +#define x1 $f12 +#define x2 $f13 +#define x3 $f14 +#define x4 $f16 +#define x5 $f17 +#define x6 $f18 +#define x7 $f19 +#define x8 $f20 + + PROLOGUE + + LDARG Y, 0($sp) + LDARG INCY, 8($sp) + LDARG BUFFER, 16($sp) +#ifdef __64BIT__ + daddiu $sp, $sp, -16 +#else + daddiu $sp, $sp, -32 +#endif + + MTC $0, y1 + SDARG $16, 0($sp) + + SDARG $17, 8($sp) + dsll LDA, LDA, BASE_SHIFT + +#ifndef __64BIT__ + sdc1 $f20, 16($sp) +#endif + + blez M, .L999 + dsll INCX, INCX, BASE_SHIFT + + blez N, .L999 + dsll INCY, INCY, BASE_SHIFT + + li XORIG, SIZE + + beq INCX, XORIG, .L10 + move XORIG, X + + dsra I, M, 2 + move XORIG, BUFFER + + blez I, .L05 + move YY, BUFFER + .align 3 + +.L02: + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD a2, 0 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + daddu X, X, INCX + LD a4, 0 * SIZE(X) + daddu X, X, INCX + + ST a1, 0 * SIZE(YY) + ST a2, 1 * SIZE(YY) + ST a3, 2 * SIZE(YY) + ST a4, 3 * SIZE(YY) + daddiu I, I, -1 + + bgtz I, .L02 + daddiu YY, YY, 4 * SIZE + .align 3 + +.L05: + andi I, M, 3 + blez I, .L10 + NOP + .align 3 + +.L06: + LD a1, 0 * SIZE(X) + daddu X, X, INCX + + ST a1, 0 * SIZE(YY) + daddiu I, I, -1 + + bgtz I, .L06 + daddiu YY, YY, 1 * SIZE + .align 3 + +.L10: + dsra J, N, 1 + blez J, .L20 + move YY, Y + .align 3 + +.L11: + move AO1, A + MOV y2, y1 + daddu AO2, A, LDA + MOV y3, y1 + daddu A, AO2, LDA + MOV y4, y1 + + dsra I, M, 3 + blez I, .L15 + move XX, XORIG + + LD a1, 0 * SIZE(AO1) + LD x1, 0 * SIZE(XX) + LD a2, 0 * SIZE(AO2) + LD x2, 1 * SIZE(XX) + + LD a3, 1 * SIZE(AO1) + LD x3, 2 * SIZE(XX) + LD a4, 1 * SIZE(AO2) + LD x4, 3 * SIZE(XX) + + LD a5, 2 * SIZE(AO1) + LD x5, 4 * SIZE(XX) + LD a6, 2 * SIZE(AO2) + LD x6, 5 * SIZE(XX) + + LD a7, 3 * SIZE(AO1) + LD x7, 6 * SIZE(XX) + LD a8, 3 * SIZE(AO2) + daddiu I, I, -1 + + blez I, .L13 + LD x8, 7 * SIZE(XX) + .align 3 + +.L12: + MADD y1, y1, x1, a1 + LD a1, 4 * SIZE(AO1) + MADD y2, y2, x1, a2 + LD a2, 4 * SIZE(AO2) + MADD y3, y3, x2, a3 + LD a3, 5 * SIZE(AO1) + MADD y4, y4, x2, a4 + LD a4, 5 * SIZE(AO2) + + LD x1, 8 * SIZE(XX) + LD x2, 9 * SIZE(XX) + + MADD y1, y1, x3, a5 + LD a5, 6 * SIZE(AO1) + MADD y2, y2, x3, a6 + LD a6, 6 * SIZE(AO2) + MADD y3, y3, x4, a7 + LD a7, 7 * SIZE(AO1) + MADD y4, y4, x4, a8 + LD a8, 7 * SIZE(AO2) + + LD x3, 10 * SIZE(XX) + LD x4, 11 * SIZE(XX) + + MADD y1, y1, x5, a1 + LD a1, 8 * SIZE(AO1) + MADD y2, y2, x5, a2 + LD a2, 8 * SIZE(AO2) + MADD y3, y3, x6, a3 + LD a3, 9 * SIZE(AO1) + MADD y4, y4, x6, a4 + LD a4, 9 * SIZE(AO2) + + LD x5, 12 * SIZE(XX) + LD x6, 13 * SIZE(XX) + + MADD y1, y1, x7, a5 + LD a5,10 * SIZE(AO1) + MADD y2, y2, x7, a6 + LD a6,10 * SIZE(AO2) + MADD y3, y3, x8, a7 + LD a7,11 * SIZE(AO1) + MADD y4, y4, x8, a8 + LD a8,11 * SIZE(AO2) + + LD x7, 14 * SIZE(XX) + LD x8, 15 * SIZE(XX) + + daddiu I, I, -1 + daddiu XX, XX, 8 * SIZE + + daddiu AO1, AO1, 8 * SIZE + bgtz I, .L12 + daddiu AO2, AO2, 8 * SIZE + .align 3 + +.L13: + MADD y1, y1, x1, a1 + LD a1, 4 * SIZE(AO1) + MADD y2, y2, x1, a2 + LD a2, 4 * SIZE(AO2) + MADD y3, y3, x2, a3 + LD a3, 5 * SIZE(AO1) + MADD y4, y4, x2, a4 + LD a4, 5 * SIZE(AO2) + MADD y1, y1, x3, a5 + LD a5, 6 * SIZE(AO1) + MADD y2, y2, x3, a6 + LD a6, 6 * SIZE(AO2) + MADD y3, y3, x4, a7 + LD a7, 7 * SIZE(AO1) + MADD y4, y4, x4, a8 + LD a8, 7 * SIZE(AO2) + + MADD y1, y1, x5, a1 + MADD y2, y2, x5, a2 + MADD y3, y3, x6, a3 + MADD y4, y4, x6, a4 + + MADD y1, y1, x7, a5 + daddiu XX, XX, 8 * SIZE + MADD y2, y2, x7, a6 + daddiu AO1, AO1, 8 * SIZE + MADD y3, y3, x8, a7 + daddiu AO2, AO2, 8 * SIZE + MADD y4, y4, x8, a8 + NOP + .align 3 + +.L15: + andi I, M, 4 + NOP + blez I, .L17 + NOP + + LD a1, 0 * SIZE(AO1) + LD x1, 0 * SIZE(XX) + LD a2, 0 * SIZE(AO2) + + LD a3, 1 * SIZE(AO1) + LD x2, 1 * SIZE(XX) + + LD a4, 1 * SIZE(AO2) + + LD a5, 2 * SIZE(AO1) + LD x3, 2 * SIZE(XX) + MADD y1, y1, x1, a1 + LD a6, 2 * SIZE(AO2) + MADD y2, y2, x1, a2 + + LD a7, 3 * SIZE(AO1) + MADD y3, y3, x2, a3 + LD x4, 3 * SIZE(XX) + MADD y4, y4, x2, a4 + LD a8, 3 * SIZE(AO2) + MADD y1, y1, x3, a5 + + MADD y2, y2, x3, a6 + daddiu XX, XX, 4 * SIZE + MADD y3, y3, x4, a7 + daddiu AO1, AO1, 4 * SIZE + MADD y4, y4, x4, a8 + daddiu AO2, AO2, 4 * SIZE + .align 3 + +.L17: + andi I, M, 3 + ADD y1, y1, y3 + blez I, .L19 + ADD y2, y2, y4 + .align 3 + +.L18: + LD x1, 0 * SIZE(XX) + LD a1, 0 * SIZE(AO1) + LD a2, 0 * SIZE(AO2) + + daddiu I, I, -1 + daddiu XX, XX, 1 * SIZE + daddiu AO1, AO1, 1 * SIZE + daddiu AO2, AO2, 1 * SIZE + + MADD y1, y1, x1, a1 + + bgtz I, .L18 + MADD y2, y2, x1, a2 + .align 3 + +.L19: + LD a1, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a2, 0 * SIZE(Y) + daddu Y, Y, INCY + + MADD a1, a1, ALPHA, y1 + daddiu J, J, -1 + MADD a2, a2, ALPHA, y2 + MTC $0, y1 + + ST a1, 0 * SIZE(YY) + daddu YY, YY, INCY + ST a2, 0 * SIZE(YY) + + bgtz J, .L11 + daddu YY, YY, INCY + .align 3 + +.L20: + andi J, N, 1 + MOV y3, y1 + blez J, .L999 + move AO1, A + + dsra I, M, 3 + NOP + blez I, .L25 + move XX, XORIG + + LD a1, 0 * SIZE(AO1) + LD x1, 0 * SIZE(XX) + LD a3, 1 * SIZE(AO1) + LD x2, 1 * SIZE(XX) + LD a5, 2 * SIZE(AO1) + LD x3, 2 * SIZE(XX) + LD a7, 3 * SIZE(AO1) + + LD x4, 3 * SIZE(XX) + LD x5, 4 * SIZE(XX) + LD x6, 5 * SIZE(XX) + LD x7, 6 * SIZE(XX) + daddiu I, I, -1 + + blez I, .L23 + LD x8, 7 * SIZE(XX) + .align 3 + +.L22: + MADD y1, y1, x1, a1 + LD a1, 4 * SIZE(AO1) + MADD y3, y3, x2, a3 + LD a3, 5 * SIZE(AO1) + + LD x1, 8 * SIZE(XX) + LD x2, 9 * SIZE(XX) + + MADD y1, y1, x3, a5 + LD a5, 6 * SIZE(AO1) + MADD y3, y3, x4, a7 + LD a7, 7 * SIZE(AO1) + + LD x3, 10 * SIZE(XX) + LD x4, 11 * SIZE(XX) + + MADD y1, y1, x5, a1 + LD a1, 8 * SIZE(AO1) + MADD y3, y3, x6, a3 + LD a3, 9 * SIZE(AO1) + + LD x5, 12 * SIZE(XX) + LD x6, 13 * SIZE(XX) + + MADD y1, y1, x7, a5 + LD a5, 10 * SIZE(AO1) + MADD y3, y3, x8, a7 + LD a7, 11 * SIZE(AO1) + + LD x7, 14 * SIZE(XX) + LD x8, 15 * SIZE(XX) + + daddiu I, I, -1 + daddiu XX, XX, 8 * SIZE + bgtz I, .L22 + daddiu AO1, AO1, 8 * SIZE + .align 3 + +.L23: + MADD y1, y1, x1, a1 + LD a1, 4 * SIZE(AO1) + MADD y3, y3, x2, a3 + LD a3, 5 * SIZE(AO1) + MADD y1, y1, x3, a5 + LD a5, 6 * SIZE(AO1) + MADD y3, y3, x4, a7 + LD a7, 7 * SIZE(AO1) + + MADD y1, y1, x5, a1 + MADD y3, y3, x6, a3 + MADD y1, y1, x7, a5 + MADD y3, y3, x8, a7 + + daddiu XX, XX, 8 * SIZE + daddiu AO1, AO1, 8 * SIZE + .align 3 + +.L25: + andi I, M, 4 + NOP + blez I, .L27 + NOP + + LD a1, 0 * SIZE(AO1) + LD x1, 0 * SIZE(XX) + LD a3, 1 * SIZE(AO1) + LD x2, 1 * SIZE(XX) + + LD a5, 2 * SIZE(AO1) + LD x3, 2 * SIZE(XX) + + MADD y1, y1, x1, a1 + LD a7, 3 * SIZE(AO1) + + MADD y3, y3, x2, a3 + LD x4, 3 * SIZE(XX) + + MADD y1, y1, x3, a5 + daddiu XX, XX, 4 * SIZE + MADD y3, y3, x4, a7 + daddiu AO1, AO1, 4 * SIZE + .align 3 + +.L27: + andi I, M, 3 + ADD y1, y1, y3 + blez I, .L29 + NOP + .align 3 + +.L28: + LD x1, 0 * SIZE(XX) + LD a1, 0 * SIZE(AO1) + + daddiu I, I, -1 + daddiu XX, XX, 1 * SIZE + daddiu AO1, AO1, 1 * SIZE + + bgtz I, .L28 + MADD y1, y1, x1, a1 + .align 3 + +.L29: + LD a1, 0 * SIZE(Y) + daddu Y, Y, INCY + + MADD a1, a1, ALPHA, y1 + NOP + + ST a1, 0 * SIZE(YY) + daddu YY, YY, INCY + .align 3 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + +#ifndef __64BIT__ + ldc1 $f20, 16($sp) +#endif + + j $31 +#ifdef __64BIT__ + daddiu $sp, $sp, 16 +#else + daddiu $sp, $sp, 32 +#endif + + EPILOGUE diff --git a/kernel/mips64/iamax.S b/kernel/mips64/iamax.S new file mode 100644 index 0000000..ff6c215 --- /dev/null +++ b/kernel/mips64/iamax.S @@ -0,0 +1,288 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $3 +#define TEMP $7 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define t1 $f12 +#define t2 $f13 +#define t3 $f14 +#define t4 $f15 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + +#define x1 $2 +#define x2 $8 +#define x3 $9 +#define x4 $10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + blez N, .L999 + li x1, 0 + + blez INCX, .L999 + dsll INCX, INCX, BASE_SHIFT + + LD a1, 0 * SIZE(X) + daddiu N, N, -1 + + blez N, .L999 + li x1, 1 + + FABS s1, a1 + daddu X, X, INCX + FABS s2, a1 + li x2, 1 + + FABS s3, a1 + dsra I, N, 3 + FABS s4, a1 + li x3, 1 + + li TEMP, 2 + + blez I, .L15 + li x4, 1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD a2, 0 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + daddu X, X, INCX + LD a4, 0 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD a6, 0 * SIZE(X) + daddu X, X, INCX + LD a7, 0 * SIZE(X) + daddu X, X, INCX + LD a8, 0 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + daddu X, X, INCX + .align 3 + +.L12: + FABS t1, a1 + LD a1, 0 * SIZE(X) + FABS t2, a2 + daddu X, X, INCX + + FABS t3, a3 + LD a2, 0 * SIZE(X) + FABS t4, a4 + daddu X, X, INCX + + CMPLT $fcc0, s1, t1 + LD a3, 0 * SIZE(X) + CMPLT $fcc1, s2, t2 + daddu X, X, INCX + + CMPLT $fcc2, s3, t3 + LD a4, 0 * SIZE(X) + CMPLT $fcc3, s4, t4 + daddu X, X, INCX + + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + + CMOVT s2, t2, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, t3, $fcc2 + movt x3, TEMP, $fcc2 + + CMOVT s4, t4, $fcc3 + movt x4, TEMP, $fcc3 + + daddiu TEMP, TEMP, 4 + daddiu I, I, -1 + + FABS t1, a5 + LD a5, 0 * SIZE(X) + FABS t2, a6 + daddu X, X, INCX + + FABS t3, a7 + LD a6, 0 * SIZE(X) + FABS t4, a8 + daddu X, X, INCX + + CMPLT $fcc0, s1, t1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, s2, t2 + daddu X, X, INCX + + CMPLT $fcc2, s3, t3 + LD a8, 0 * SIZE(X) + CMPLT $fcc3, s4, t4 + daddu X, X, INCX + + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + + CMOVT s2, t2, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, t3, $fcc2 + movt x3, TEMP, $fcc2 + + CMOVT s4, t4, $fcc3 + movt x4, TEMP, $fcc3 + + bgtz I, .L12 + daddiu TEMP, TEMP, 4 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, t2, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, t3, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, t4, $fcc3 + movt x4, TEMP, $fcc3 + + FABS t1, a5 + daddiu TEMP, TEMP, 4 + FABS t2, a6 + NOP + FABS t3, a7 + FABS t4, a8 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, t2, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, t3, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, t4, $fcc3 + movt x4, TEMP, $fcc3 + + daddiu TEMP, TEMP, 4 + daddiu x2, x2, 1 + daddiu x3, x3, 2 + daddiu x4, x4, 3 + .align 3 + +.L15: + andi I, N, 7 + blez I, .L998 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + daddu X, X, INCX + FABS t1, a1 + daddiu I, I, -1 + + CMPLT $fcc0, s1, t1 + NOP + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + + bgtz I, .L16 + daddiu TEMP, TEMP, 1 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + + CMOVT s1, s2, $fcc0 + movt x1, x2, $fcc0 + CMOVT s3, s4, $fcc1 + movt x3, x4, $fcc1 + + CMPLT $fcc0, s1, s3 + CMOVT s1, s3, $fcc0 + movt x1, x3, $fcc0 + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/iamin.S b/kernel/mips64/iamin.S new file mode 100644 index 0000000..131aa88 --- /dev/null +++ b/kernel/mips64/iamin.S @@ -0,0 +1,288 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $3 +#define TEMP $7 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define t1 $f12 +#define t2 $f13 +#define t3 $f14 +#define t4 $f15 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + +#define x1 $2 +#define x2 $8 +#define x3 $9 +#define x4 $10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + blez N, .L999 + li x1, 0 + + blez INCX, .L999 + dsll INCX, INCX, BASE_SHIFT + + LD a1, 0 * SIZE(X) + daddiu N, N, -1 + + blez N, .L999 + li x1, 1 + + FABS s1, a1 + daddu X, X, INCX + FABS s2, a1 + li x2, 1 + + FABS s3, a1 + dsra I, N, 3 + FABS s4, a1 + li x3, 1 + + li TEMP, 2 + + blez I, .L15 + li x4, 1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD a2, 0 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + daddu X, X, INCX + LD a4, 0 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD a6, 0 * SIZE(X) + daddu X, X, INCX + LD a7, 0 * SIZE(X) + daddu X, X, INCX + LD a8, 0 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + daddu X, X, INCX + .align 3 + +.L12: + FABS t1, a1 + LD a1, 0 * SIZE(X) + FABS t2, a2 + daddu X, X, INCX + + FABS t3, a3 + LD a2, 0 * SIZE(X) + FABS t4, a4 + daddu X, X, INCX + + CMPLT $fcc0, t1, s1 + LD a3, 0 * SIZE(X) + CMPLT $fcc1, t2, s2 + daddu X, X, INCX + + CMPLT $fcc2, t3, s3 + LD a4, 0 * SIZE(X) + CMPLT $fcc3, t4, s4 + daddu X, X, INCX + + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + + CMOVT s2, t2, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, t3, $fcc2 + movt x3, TEMP, $fcc2 + + CMOVT s4, t4, $fcc3 + movt x4, TEMP, $fcc3 + + daddiu TEMP, TEMP, 4 + daddiu I, I, -1 + + FABS t1, a5 + LD a5, 0 * SIZE(X) + FABS t2, a6 + daddu X, X, INCX + + FABS t3, a7 + LD a6, 0 * SIZE(X) + FABS t4, a8 + daddu X, X, INCX + + CMPLT $fcc0, t1, s1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, t2, s2 + daddu X, X, INCX + + CMPLT $fcc2, t3, s3 + LD a8, 0 * SIZE(X) + CMPLT $fcc3, t4, s4 + daddu X, X, INCX + + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + + CMOVT s2, t2, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, t3, $fcc2 + movt x3, TEMP, $fcc2 + + CMOVT s4, t4, $fcc3 + movt x4, TEMP, $fcc3 + + bgtz I, .L12 + daddiu TEMP, TEMP, 4 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t2, s2 + CMPLT $fcc2, t3, s3 + CMPLT $fcc3, t4, s4 + + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, t2, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, t3, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, t4, $fcc3 + movt x4, TEMP, $fcc3 + + FABS t1, a5 + daddiu TEMP, TEMP, 4 + FABS t2, a6 + NOP + FABS t3, a7 + FABS t4, a8 + + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t2, s2 + CMPLT $fcc2, t3, s3 + CMPLT $fcc3, t4, s4 + + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, t2, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, t3, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, t4, $fcc3 + movt x4, TEMP, $fcc3 + + daddiu TEMP, TEMP, 4 + daddiu x2, x2, 1 + daddiu x3, x3, 2 + daddiu x4, x4, 3 + .align 3 + +.L15: + andi I, N, 7 + blez I, .L998 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + daddu X, X, INCX + FABS t1, a1 + daddiu I, I, -1 + + CMPLT $fcc0, t1, s1 + NOP + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + + bgtz I, .L16 + daddiu TEMP, TEMP, 1 + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + + CMOVT s1, s2, $fcc0 + movt x1, x2, $fcc0 + CMOVT s3, s4, $fcc1 + movt x3, x4, $fcc1 + + CMPLT $fcc0, s3, s1 + CMOVT s1, s3, $fcc0 + movt x1, x3, $fcc0 + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/imax.S b/kernel/mips64/imax.S new file mode 100644 index 0000000..ec9d3fc --- /dev/null +++ b/kernel/mips64/imax.S @@ -0,0 +1,262 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $3 +#define TEMP $7 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define t1 $f12 +#define t2 $f13 +#define t3 $f14 +#define t4 $f15 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + +#define x1 $2 +#define x2 $8 +#define x3 $9 +#define x4 $10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + blez N, .L999 + li x1, 0 + + blez INCX, .L999 + dsll INCX, INCX, BASE_SHIFT + + LD s1, 0 * SIZE(X) + daddiu N, N, -1 + + blez N, .L999 + li x1, 1 + + daddu X, X, INCX + MOV s2, s1 + li x2, 1 + + MOV s3, s1 + dsra I, N, 3 + MOV s4, s1 + li x3, 1 + + li TEMP, 2 + + blez I, .L15 + li x4, 1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD a2, 0 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + daddu X, X, INCX + LD a4, 0 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD a6, 0 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + daddu X, X, INCX + .align 3 + +.L12: + CMPLT $fcc0, s1, a1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, s2, a2 + daddu X, X, INCX + CMPLT $fcc2, s3, a3 + LD a8, 0 * SIZE(X) + CMPLT $fcc3, s4, a4 + daddu X, X, INCX + + CMOVT s1, a1, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, a2, $fcc1 + movt x2, TEMP, $fcc1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD a2, 0 * SIZE(X) + daddu X, X, INCX + + CMOVT s3, a3, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, a4, $fcc3 + movt x4, TEMP, $fcc3 + + daddiu TEMP, TEMP, 4 + daddiu I, I, -1 + + CMPLT $fcc0, s1, a5 + LD a3, 0 * SIZE(X) + CMPLT $fcc1, s2, a6 + daddu X, X, INCX + + CMPLT $fcc2, s3, a7 + LD a4, 0 * SIZE(X) + CMPLT $fcc3, s4, a8 + daddu X, X, INCX + + CMOVT s1, a5, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, a6, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, a7, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, a8, $fcc3 + movt x4, TEMP, $fcc3 + + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD a6, 0 * SIZE(X) + daddu X, X, INCX + + bgtz I, .L12 + daddiu TEMP, TEMP, 4 + .align 3 + +.L13: + CMPLT $fcc0, s1, a1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, s2, a2 + daddu X, X, INCX + CMPLT $fcc2, s3, a3 + LD a8, 0 * SIZE(X) + CMPLT $fcc3, s4, a4 + daddu X, X, INCX + + CMOVT s1, a1, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, a2, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, a3, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, a4, $fcc3 + movt x4, TEMP, $fcc3 + + CMPLT $fcc0, s1, a5 + daddiu TEMP, TEMP, 4 + CMPLT $fcc1, s2, a6 + NOP + + CMPLT $fcc2, s3, a7 + CMPLT $fcc3, s4, a8 + + CMOVT s1, a5, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, a6, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, a7, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, a8, $fcc3 + movt x4, TEMP, $fcc3 + + daddiu TEMP, TEMP, 4 + daddiu x2, x2, 1 + daddiu x3, x3, 2 + daddiu x4, x4, 3 + .align 3 + +.L15: + andi I, N, 7 + blez I, .L998 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + daddu X, X, INCX + daddiu I, I, -1 + + CMPLT $fcc0, s1, a1 + NOP + CMOVT s1, a1, $fcc0 + movt x1, TEMP, $fcc0 + + bgtz I, .L16 + daddiu TEMP, TEMP, 1 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + + CMOVT s1, s2, $fcc0 + movt x1, x2, $fcc0 + CMOVT s3, s4, $fcc1 + movt x3, x4, $fcc1 + + CMPLT $fcc0, s1, s3 + CMOVT s1, s3, $fcc0 + movt x1, x3, $fcc0 + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/imin.S b/kernel/mips64/imin.S new file mode 100644 index 0000000..a247c83 --- /dev/null +++ b/kernel/mips64/imin.S @@ -0,0 +1,262 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $3 +#define TEMP $7 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define t1 $f12 +#define t2 $f13 +#define t3 $f14 +#define t4 $f15 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + +#define x1 $2 +#define x2 $8 +#define x3 $9 +#define x4 $10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + blez N, .L999 + li x1, 0 + + blez INCX, .L999 + dsll INCX, INCX, BASE_SHIFT + + LD s1, 0 * SIZE(X) + daddiu N, N, -1 + + blez N, .L999 + li x1, 1 + + daddu X, X, INCX + MOV s2, s1 + li x2, 1 + + MOV s3, s1 + dsra I, N, 3 + MOV s4, s1 + li x3, 1 + + li TEMP, 2 + + blez I, .L15 + li x4, 1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD a2, 0 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + daddu X, X, INCX + LD a4, 0 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD a6, 0 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + daddu X, X, INCX + .align 3 + +.L12: + CMPLT $fcc0, a1, s1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, a2, s2 + daddu X, X, INCX + CMPLT $fcc2, a3, s3 + LD a8, 0 * SIZE(X) + CMPLT $fcc3, a4, s4 + daddu X, X, INCX + + CMOVT s1, a1, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, a2, $fcc1 + movt x2, TEMP, $fcc1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD a2, 0 * SIZE(X) + daddu X, X, INCX + + CMOVT s3, a3, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, a4, $fcc3 + movt x4, TEMP, $fcc3 + + daddiu TEMP, TEMP, 4 + daddiu I, I, -1 + + CMPLT $fcc0, a5, s1 + LD a3, 0 * SIZE(X) + CMPLT $fcc1, a6, s2 + daddu X, X, INCX + + CMPLT $fcc2, a7, s3 + LD a4, 0 * SIZE(X) + CMPLT $fcc3, a8, s4 + daddu X, X, INCX + + CMOVT s1, a5, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, a6, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, a7, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, a8, $fcc3 + movt x4, TEMP, $fcc3 + + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD a6, 0 * SIZE(X) + daddu X, X, INCX + + bgtz I, .L12 + daddiu TEMP, TEMP, 4 + .align 3 + +.L13: + CMPLT $fcc0, a1, s1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, a2, s2 + daddu X, X, INCX + CMPLT $fcc2, a3, s3 + LD a8, 0 * SIZE(X) + CMPLT $fcc3, a4, s4 + daddu X, X, INCX + + CMOVT s1, a1, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, a2, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, a3, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, a4, $fcc3 + movt x4, TEMP, $fcc3 + + CMPLT $fcc0, a5, s1 + daddiu TEMP, TEMP, 4 + CMPLT $fcc1, a6, s2 + NOP + + CMPLT $fcc2, a7, s3 + CMPLT $fcc3, a8, s4 + + CMOVT s1, a5, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, a6, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, a7, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, a8, $fcc3 + movt x4, TEMP, $fcc3 + + daddiu TEMP, TEMP, 4 + daddiu x2, x2, 1 + daddiu x3, x3, 2 + daddiu x4, x4, 3 + .align 3 + +.L15: + andi I, N, 7 + blez I, .L998 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + daddu X, X, INCX + daddiu I, I, -1 + + CMPLT $fcc0, a1, s1 + NOP + CMOVT s1, a1, $fcc0 + movt x1, TEMP, $fcc0 + + bgtz I, .L16 + daddiu TEMP, TEMP, 1 + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + + CMOVT s1, s2, $fcc0 + movt x1, x2, $fcc0 + CMOVT s3, s4, $fcc1 + movt x3, x4, $fcc1 + + CMPLT $fcc0, s3, s1 + CMOVT s1, s3, $fcc0 + movt x1, x3, $fcc0 + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/izamax.S b/kernel/mips64/izamax.S new file mode 100644 index 0000000..12e26c9 --- /dev/null +++ b/kernel/mips64/izamax.S @@ -0,0 +1,268 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $3 +#define TEMP $7 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define t1 $f12 +#define t2 $f13 +#define t3 $f14 +#define t4 $f15 +#define t5 $f16 +#define t6 $f17 +#define t7 $f18 +#define t8 $f19 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + +#define x1 $2 +#define x2 $8 +#define x3 $9 +#define x4 $10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + blez N, .L999 + li x1, 0 + + blez INCX, .L999 + dsll INCX, INCX, ZBASE_SHIFT + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + + FABS t1, a1 + FABS t2, a2 + + ADD s1, t1, t2 + ADD s2, t1, t2 + ADD s3, t1, t2 + ADD s4, t1, t2 + + daddiu N, N, -1 + + blez N, .L999 + li x1, 1 + + daddu X, X, INCX + li x2, 1 + + dsra I, N, 2 + li x3, 1 + + li TEMP, 2 + + blez I, .L15 + li x4, 1 + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + LD a4, 1 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + LD a6, 1 * SIZE(X) + daddu X, X, INCX + LD a7, 0 * SIZE(X) + LD a8, 1 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + daddu X, X, INCX + .align 3 + +.L12: + FABS t1, a1 + LD a1, 0 * SIZE(X) + FABS t2, a2 + LD a2, 1 * SIZE(X) + + FABS t3, a3 + daddu X, X, INCX + FABS t4, a4 + NOP + + FABS t5, a5 + LD a3, 0 * SIZE(X) + FABS t6, a6 + LD a4, 1 * SIZE(X) + FABS t7, a7 + daddu X, X, INCX + FABS t8, a8 + NOP + + ADD t1, t1, t2 + LD a5, 0 * SIZE(X) + ADD t3, t3, t4 + LD a6, 1 * SIZE(X) + ADD t5, t5, t6 + daddu X, X, INCX + ADD t7, t7, t8 + NOP + + CMPLT $fcc0, s1, t1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, s2, t3 + LD a8, 1 * SIZE(X) + CMPLT $fcc2, s3, t5 + daddu X, X, INCX + CMPLT $fcc3, s4, t7 + daddiu I, I, -1 + + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, t3, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, t5, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, t7, $fcc3 + movt x4, TEMP, $fcc3 + + bgtz I, .L12 + daddiu TEMP, TEMP, 4 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + + FABS t5, a5 + FABS t6, a6 + FABS t7, a7 + FABS t8, a8 + + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t3 + CMPLT $fcc2, s3, t5 + CMPLT $fcc3, s4, t7 + + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, t3, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, t5, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, t7, $fcc3 + movt x4, TEMP, $fcc3 + + daddiu TEMP, TEMP, 4 + daddiu x2, x2, 1 + daddiu x3, x3, 2 + daddiu x4, x4, 3 + .align 3 + +.L15: + andi I, N, 3 + blez I, .L998 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + + FABS t1, a1 + FABS t2, a2 + + ADD t1, t1, t2 + + daddiu I, I, -1 + + CMPLT $fcc0, s1, t1 + NOP + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + + bgtz I, .L16 + daddiu TEMP, TEMP, 1 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + + CMOVT s1, s2, $fcc0 + movt x1, x2, $fcc0 + CMOVT s3, s4, $fcc1 + movt x3, x4, $fcc1 + + CMPLT $fcc0, s1, s3 + CMOVT s1, s3, $fcc0 + movt x1, x3, $fcc0 + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/izamin.S b/kernel/mips64/izamin.S new file mode 100644 index 0000000..af3d750 --- /dev/null +++ b/kernel/mips64/izamin.S @@ -0,0 +1,268 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $3 +#define TEMP $7 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define t1 $f12 +#define t2 $f13 +#define t3 $f14 +#define t4 $f15 +#define t5 $f16 +#define t6 $f17 +#define t7 $f18 +#define t8 $f19 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + +#define x1 $2 +#define x2 $8 +#define x3 $9 +#define x4 $10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + blez N, .L999 + li x1, 0 + + blez INCX, .L999 + dsll INCX, INCX, ZBASE_SHIFT + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + + FABS t1, a1 + FABS t2, a2 + + ADD s1, t1, t2 + ADD s2, t1, t2 + ADD s3, t1, t2 + ADD s4, t1, t2 + + daddiu N, N, -1 + + blez N, .L999 + li x1, 1 + + daddu X, X, INCX + li x2, 1 + + dsra I, N, 2 + li x3, 1 + + li TEMP, 2 + + blez I, .L15 + li x4, 1 + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + LD a4, 1 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + LD a6, 1 * SIZE(X) + daddu X, X, INCX + LD a7, 0 * SIZE(X) + LD a8, 1 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + daddu X, X, INCX + .align 3 + +.L12: + FABS t1, a1 + LD a1, 0 * SIZE(X) + FABS t2, a2 + LD a2, 1 * SIZE(X) + + FABS t3, a3 + daddu X, X, INCX + FABS t4, a4 + NOP + + FABS t5, a5 + LD a3, 0 * SIZE(X) + FABS t6, a6 + LD a4, 1 * SIZE(X) + FABS t7, a7 + daddu X, X, INCX + FABS t8, a8 + NOP + + ADD t1, t1, t2 + LD a5, 0 * SIZE(X) + ADD t3, t3, t4 + LD a6, 1 * SIZE(X) + ADD t5, t5, t6 + daddu X, X, INCX + ADD t7, t7, t8 + NOP + + CMPLT $fcc0, t1, s1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, t3, s2 + LD a8, 1 * SIZE(X) + CMPLT $fcc2, t5, s3 + daddu X, X, INCX + CMPLT $fcc3, t7, s4 + daddiu I, I, -1 + + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, t3, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, t5, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, t7, $fcc3 + movt x4, TEMP, $fcc3 + + bgtz I, .L12 + daddiu TEMP, TEMP, 4 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + + FABS t5, a5 + FABS t6, a6 + FABS t7, a7 + FABS t8, a8 + + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t3, s2 + CMPLT $fcc2, t5, s3 + CMPLT $fcc3, t7, s4 + + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, t3, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, t5, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, t7, $fcc3 + movt x4, TEMP, $fcc3 + + daddiu TEMP, TEMP, 4 + daddiu x2, x2, 1 + daddiu x3, x3, 2 + daddiu x4, x4, 3 + .align 3 + +.L15: + andi I, N, 3 + blez I, .L998 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + + FABS t1, a1 + FABS t2, a2 + + ADD t1, t1, t2 + + daddiu I, I, -1 + + CMPLT $fcc0, t1, s1 + NOP + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + + bgtz I, .L16 + daddiu TEMP, TEMP, 1 + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + + CMOVT s1, s2, $fcc0 + movt x1, x2, $fcc0 + CMOVT s3, s4, $fcc1 + movt x3, x4, $fcc1 + + CMPLT $fcc0, s3, s1 + CMOVT s1, s3, $fcc0 + movt x1, x3, $fcc0 + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/max.S b/kernel/mips64/max.S new file mode 100644 index 0000000..a432f12 --- /dev/null +++ b/kernel/mips64/max.S @@ -0,0 +1,213 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $2 +#define TEMP $3 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + blez N, .L999 + MTC $0, s1 + + blez INCX, .L999 + dsll INCX, INCX, BASE_SHIFT + + LD s1, 0 * SIZE(X) + daddiu N, N, -1 + + daddu X, X, INCX + NOP + + blez N, .L999 + MOV s2, s1 + + MOV s3, s1 + dsra I, N, 3 + + blez I, .L15 + MOV s4, s1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD a2, 0 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + daddu X, X, INCX + LD a4, 0 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD a6, 0 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + daddu X, X, INCX + .align 3 + +.L12: + CMPLT $fcc0, s1, a1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, s2, a2 + daddu X, X, INCX + + CMPLT $fcc2, s3, a3 + LD a8, 0 * SIZE(X) + CMPLT $fcc3, s4, a4 + daddu X, X, INCX + + CMOVT s1, a1, $fcc0 + LD a1, 0 * SIZE(X) + CMOVT s2, a2, $fcc1 + daddu X, X, INCX + + CMOVT s3, a3, $fcc2 + LD a2, 0 * SIZE(X) + CMOVT s4, a4, $fcc3 + daddu X, X, INCX + + CMPLT $fcc0, s1, a5 + LD a3, 0 * SIZE(X) + CMPLT $fcc1, s2, a6 + daddu X, X, INCX + CMPLT $fcc2, s3, a7 + LD a4, 0 * SIZE(X) + CMPLT $fcc3, s4, a8 + daddu X, X, INCX + + CMOVT s1, a5, $fcc0 + LD a5, 0 * SIZE(X) + CMOVT s2, a6, $fcc1 + daddu X, X, INCX + + CMOVT s3, a7, $fcc2 + LD a6, 0 * SIZE(X) + CMOVT s4, a8, $fcc3 + daddiu I, I, -1 + + bgtz I, .L12 + daddu X, X, INCX + .align 3 + +.L13: + CMPLT $fcc0, s1, a1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, s2, a2 + daddu X, X, INCX + + CMPLT $fcc2, s3, a3 + LD a8, 0 * SIZE(X) + CMPLT $fcc3, s4, a4 + daddu X, X, INCX + + CMOVT s1, a1, $fcc0 + CMOVT s2, a2, $fcc1 + CMOVT s3, a3, $fcc2 + CMOVT s4, a4, $fcc3 + + CMPLT $fcc0, s1, a5 + CMPLT $fcc1, s2, a6 + CMPLT $fcc2, s3, a7 + CMPLT $fcc3, s4, a8 + + CMOVT s1, a5, $fcc0 + CMOVT s2, a6, $fcc1 + CMOVT s3, a7, $fcc2 + CMOVT s4, a8, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + + blez I, .L998 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + daddiu I, I, -1 + + CMPLT $fcc0, s1, a1 + + CMOVT s1, a1, $fcc0 + + bgtz I, .L16 + daddu X, X, INCX + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + + CMOVT s1, s2, $fcc0 + CMOVT s3, s4, $fcc1 + + CMPLT $fcc0, s1, s3 + CMOVT s1, s3, $fcc0 + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/min.S b/kernel/mips64/min.S new file mode 100644 index 0000000..33cfc81 --- /dev/null +++ b/kernel/mips64/min.S @@ -0,0 +1,213 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $2 +#define TEMP $3 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + blez N, .L999 + MTC $0, s1 + + blez INCX, .L999 + dsll INCX, INCX, BASE_SHIFT + + LD s1, 0 * SIZE(X) + daddiu N, N, -1 + + daddu X, X, INCX + NOP + + blez N, .L999 + MOV s2, s1 + + MOV s3, s1 + dsra I, N, 3 + + blez I, .L15 + MOV s4, s1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD a2, 0 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + daddu X, X, INCX + LD a4, 0 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD a6, 0 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + daddu X, X, INCX + .align 3 + +.L12: + CMPLT $fcc0, a1, s1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, a2, s2 + daddu X, X, INCX + + CMPLT $fcc2, a3, s3 + LD a8, 0 * SIZE(X) + CMPLT $fcc3, a4, s4 + daddu X, X, INCX + + CMOVT s1, a1, $fcc0 + LD a1, 0 * SIZE(X) + CMOVT s2, a2, $fcc1 + daddu X, X, INCX + + CMOVT s3, a3, $fcc2 + LD a2, 0 * SIZE(X) + CMOVT s4, a4, $fcc3 + daddu X, X, INCX + + CMPLT $fcc0, a5, s1 + LD a3, 0 * SIZE(X) + CMPLT $fcc1, a6, s2 + daddu X, X, INCX + CMPLT $fcc2, a7, s3 + LD a4, 0 * SIZE(X) + CMPLT $fcc3, a8, s4 + daddu X, X, INCX + + CMOVT s1, a5, $fcc0 + LD a5, 0 * SIZE(X) + CMOVT s2, a6, $fcc1 + daddu X, X, INCX + + CMOVT s3, a7, $fcc2 + LD a6, 0 * SIZE(X) + CMOVT s4, a8, $fcc3 + daddiu I, I, -1 + + bgtz I, .L12 + daddu X, X, INCX + .align 3 + +.L13: + CMPLT $fcc0, a1, s1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, a2, s2 + daddu X, X, INCX + + CMPLT $fcc2, a3, s3 + LD a8, 0 * SIZE(X) + CMPLT $fcc3, a4, s4 + daddu X, X, INCX + + CMOVT s1, a1, $fcc0 + CMOVT s2, a2, $fcc1 + CMOVT s3, a3, $fcc2 + CMOVT s4, a4, $fcc3 + + CMPLT $fcc0, a5, s1 + CMPLT $fcc1, a6, s2 + CMPLT $fcc2, a7, s3 + CMPLT $fcc3, a8, s4 + + CMOVT s1, a5, $fcc0 + CMOVT s2, a6, $fcc1 + CMOVT s3, a7, $fcc2 + CMOVT s4, a8, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + + blez I, .L998 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + daddiu I, I, -1 + + CMPLT $fcc0, a1, s1 + + CMOVT s1, a1, $fcc0 + + bgtz I, .L16 + daddu X, X, INCX + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + + CMOVT s1, s2, $fcc0 + CMOVT s3, s4, $fcc1 + + CMPLT $fcc0, s3, s1 + CMOVT s1, s3, $fcc0 + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/rot.S b/kernel/mips64/rot.S new file mode 100644 index 0000000..b94a59c --- /dev/null +++ b/kernel/mips64/rot.S @@ -0,0 +1,367 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 +#define Y $7 +#define INCY $8 + +#define XX $9 +#define YY $10 + +#define C $f17 +#define S $f18 + +#define I $2 +#define TEMP $3 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 + +#define b1 $f8 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 + +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 + + PROLOGUE + + dsll INCX, INCX, BASE_SHIFT + li TEMP, SIZE + + blez N, .L999 + dsll INCY, INCY, BASE_SHIFT + + bne INCX, TEMP, .L20 + dsra I, N, 2 + + bne INCY, TEMP, .L20 + NOP + + blez I, .L15 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD a2, 1 * SIZE(X) + LD b2, 1 * SIZE(Y) + + LD a3, 2 * SIZE(X) + LD b3, 2 * SIZE(Y) + MUL t1, S, b1 + + LD a4, 3 * SIZE(X) + MUL t2, C, b1 + LD b4, 3 * SIZE(Y) + MUL t3, S, b2 + + blez I, .L13 + MUL t4, C, b2 + .align 3 + +.L12: + MADD t1, t1, C, a1 + LD b1, 4 * SIZE(Y) + NMSUB t2, t2, S, a1 + LD a1, 4 * SIZE(X) + MADD t3, t3, C, a2 + LD b2, 5 * SIZE(Y) + NMSUB t4, t4, S, a2 + LD a2, 5 * SIZE(X) + + ST t1, 0 * SIZE(X) + MUL t1, S, b3 + ST t2, 0 * SIZE(Y) + MUL t2, C, b3 + ST t3, 1 * SIZE(X) + MUL t3, S, b4 + ST t4, 1 * SIZE(Y) + MUL t4, C, b4 + + + MADD t1, t1, C, a3 + LD b3, 6 * SIZE(Y) + NMSUB t2, t2, S, a3 + LD a3, 6 * SIZE(X) + MADD t3, t3, C, a4 + LD b4, 7 * SIZE(Y) + NMSUB t4, t4, S, a4 + LD a4, 7 * SIZE(X) + + ST t1, 2 * SIZE(X) + MUL t1, S, b1 + ST t2, 2 * SIZE(Y) + MUL t2, C, b1 + ST t3, 3 * SIZE(X) + MUL t3, S, b2 + ST t4, 3 * SIZE(Y) + MUL t4, C, b2 + + daddiu I, I, -1 + daddiu X, X, 4 * SIZE + + bgtz I, .L12 + daddiu Y, Y, 4 * SIZE + .align 3 + +.L13: + MADD t1, t1, C, a1 + NMSUB t2, t2, S, a1 + MADD t3, t3, C, a2 + NMSUB t4, t4, S, a2 + + ST t1, 0 * SIZE(X) + MUL t1, S, b3 + ST t2, 0 * SIZE(Y) + MUL t2, C, b3 + ST t3, 1 * SIZE(X) + MUL t3, S, b4 + ST t4, 1 * SIZE(Y) + MUL t4, C, b4 + + MADD t1, t1, C, a3 + NMSUB t2, t2, S, a3 + MADD t3, t3, C, a4 + daddiu X, X, 4 * SIZE + NMSUB t4, t4, S, a4 + daddiu Y, Y, 4 * SIZE + + ST t1, -2 * SIZE(X) + ST t2, -2 * SIZE(Y) + ST t3, -1 * SIZE(X) + ST t4, -1 * SIZE(Y) + .align 3 + +.L15: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + + MUL t1, S, b1 + MUL t2, C, b1 + + MADD t1, t1, C, a1 + NMSUB t2, t2, S, a1 + + ST t1, 0 * SIZE(X) + ST t2, 0 * SIZE(Y) + + daddiu I, I, -1 + + daddiu X, X, SIZE + daddiu Y, Y, SIZE + + bgtz I, .L16 + NOP + j .L999 + NOP + .align 3 + +.L20: + move XX, X + move YY, Y + + blez I, .L25 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + dadd X, X, INCX + LD b1, 0 * SIZE(Y) + dadd Y, Y, INCY + + LD a2, 0 * SIZE(X) + dadd X, X, INCX + LD b2, 0 * SIZE(Y) + dadd Y, Y, INCY + + LD a3, 0 * SIZE(X) + dadd X, X, INCX + LD b3, 0 * SIZE(Y) + dadd Y, Y, INCY + + MUL t1, S, b1 + + LD a4, 0 * SIZE(X) + dadd X, X, INCX + MUL t2, C, b1 + LD b4, 0 * SIZE(Y) + dadd Y, Y, INCY + + MUL t3, S, b2 + blez I, .L23 + MUL t4, C, b2 + .align 3 + +.L22: + MADD t1, t1, C, a1 + LD b1, 0 * SIZE(Y) + dadd Y, Y, INCY + NMSUB t2, t2, S, a1 + LD a1, 0 * SIZE(X) + dadd X, X, INCX + MADD t3, t3, C, a2 + LD b2, 0 * SIZE(Y) + dadd Y, Y, INCY + NMSUB t4, t4, S, a2 + LD a2, 0 * SIZE(X) + dadd X, X, INCX + + ST t1, 0 * SIZE(XX) + dadd XX, XX, INCX + MUL t1, S, b3 + ST t2, 0 * SIZE(YY) + dadd YY, YY, INCY + MUL t2, C, b3 + ST t3, 0 * SIZE(XX) + dadd XX, XX, INCX + MUL t3, S, b4 + ST t4, 0 * SIZE(YY) + dadd YY, YY, INCY + MUL t4, C, b4 + + MADD t1, t1, C, a3 + LD b3, 0 * SIZE(Y) + dadd Y, Y, INCY + NMSUB t2, t2, S, a3 + LD a3, 0 * SIZE(X) + dadd X, X, INCX + MADD t3, t3, C, a4 + LD b4, 0 * SIZE(Y) + dadd Y, Y, INCY + NMSUB t4, t4, S, a4 + LD a4, 0 * SIZE(X) + dadd X, X, INCX + + ST t1, 0 * SIZE(XX) + dadd XX, XX, INCX + MUL t1, S, b1 + ST t2, 0 * SIZE(YY) + dadd YY, YY, INCY + MUL t2, C, b1 + ST t3, 0 * SIZE(XX) + dadd XX, XX, INCX + MUL t3, S, b2 + ST t4, 0 * SIZE(YY) + MUL t4, C, b2 + daddiu I, I, -1 + + bgtz I, .L22 + dadd YY, YY, INCY + .align 3 + +.L23: + MADD t1, t1, C, a1 + NMSUB t2, t2, S, a1 + MADD t3, t3, C, a2 + NMSUB t4, t4, S, a2 + + ST t1, 0 * SIZE(XX) + dadd XX, XX, INCX + MUL t1, S, b3 + ST t2, 0 * SIZE(YY) + dadd YY, YY, INCY + MUL t2, C, b3 + ST t3, 0 * SIZE(XX) + dadd XX, XX, INCX + MUL t3, S, b4 + ST t4, 0 * SIZE(YY) + dadd YY, YY, INCY + MUL t4, C, b4 + + MADD t1, t1, C, a3 + NMSUB t2, t2, S, a3 + MADD t3, t3, C, a4 + NMSUB t4, t4, S, a4 + + ST t1, 0 * SIZE(XX) + dadd XX, XX, INCX + ST t2, 0 * SIZE(YY) + dadd YY, YY, INCY + ST t3, 0 * SIZE(XX) + dadd XX, XX, INCX + ST t4, 0 * SIZE(YY) + dadd YY, YY, INCY + .align 3 + +.L25: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + + MUL t1, S, b1 + MUL t2, C, b1 + + MADD t1, t1, C, a1 + daddiu I, I, -1 + NMSUB t2, t2, S, a1 + + ST t1, 0 * SIZE(X) + ST t2, 0 * SIZE(Y) + + dadd X, X, INCX + bgtz I, .L26 + dadd Y, Y, INCY + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/scal.S b/kernel/mips64/scal.S new file mode 100644 index 0000000..f544914 --- /dev/null +++ b/kernel/mips64/scal.S @@ -0,0 +1,412 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 + +#define X $8 +#define INCX $9 + +#define I $2 +#define TEMP $3 + +#define XX $5 + +#define ALPHA $f15 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + +#define t1 $f8 +#define t2 $f9 +#define t3 $f10 +#define t4 $f11 + + PROLOGUE + + li TEMP, SIZE + MTC $0, a1 + + blez N, .L999 + dsll INCX, INCX, BASE_SHIFT + + CMPEQ $fcc0, ALPHA, a1 + NOP + + bc1f $fcc0, .L50 + NOP + + bne INCX, TEMP, .L20 + dsra I, N, 3 + + blez I, .L15 + NOP + .align 3 + +.L12: + ST a1, 0 * SIZE(X) + ST a1, 1 * SIZE(X) + ST a1, 2 * SIZE(X) + ST a1, 3 * SIZE(X) + ST a1, 4 * SIZE(X) + ST a1, 5 * SIZE(X) + ST a1, 6 * SIZE(X) + ST a1, 7 * SIZE(X) + addiu I, I, -1 + + bgtz I, .L12 + daddiu X, X, 8 * SIZE + .align 3 + +.L15: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L16: + ST a1, 0 * SIZE(X) + daddiu I, I, -1 + + bgtz I, .L16 + daddiu X, X, SIZE + + j $31 + NOP + .align 3 + +.L20: + dsra I, N, 3 + blez I, .L25 + NOP + .align 3 + +.L22: + ST a1, 0 * SIZE(X) + daddu X, X, INCX + ST a1, 0 * SIZE(X) + daddu X, X, INCX + ST a1, 0 * SIZE(X) + daddu X, X, INCX + ST a1, 0 * SIZE(X) + daddu X, X, INCX + + ST a1, 0 * SIZE(X) + daddu X, X, INCX + ST a1, 0 * SIZE(X) + daddu X, X, INCX + ST a1, 0 * SIZE(X) + daddu X, X, INCX + ST a1, 0 * SIZE(X) + daddiu I, I, -1 + + bgtz I, .L22 + daddu X, X, INCX + .align 3 + +.L25: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L26: + daddiu I, I, -1 + ST a1, 0 * SIZE(X) + + bgtz I, .L26 + daddu X, X, INCX + + j $31 + NOP + .align 3 + +.L50: + bne INCX, TEMP, .L60 + dsra I, N, 3 + + blez I, .L55 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD a3, 2 * SIZE(X) + LD a4, 3 * SIZE(X) + LD a5, 4 * SIZE(X) + LD a6, 5 * SIZE(X) + LD a7, 6 * SIZE(X) + LD a8, 7 * SIZE(X) + + blez I, .L53 + NOP + .align 3 + +.L52: + MUL t1, ALPHA, a1 + LD a1, 8 * SIZE(X) + MUL t2, ALPHA, a2 + LD a2, 9 * SIZE(X) + + MUL t3, ALPHA, a3 + LD a3, 10 * SIZE(X) + MUL t4, ALPHA, a4 + LD a4, 11 * SIZE(X) + + ST t1, 0 * SIZE(X) + MUL t1, ALPHA, a5 + + LD a5, 12 * SIZE(X) + + ST t2, 1 * SIZE(X) + MUL t2, ALPHA, a6 + + LD a6, 13 * SIZE(X) + + ST t3, 2 * SIZE(X) + MUL t3, ALPHA, a7 + + LD a7, 14 * SIZE(X) + + ST t4, 3 * SIZE(X) + MUL t4, ALPHA, a8 + + LD a8, 15 * SIZE(X) + daddiu I, I, -1 + + ST t1, 4 * SIZE(X) + ST t2, 5 * SIZE(X) + ST t3, 6 * SIZE(X) + ST t4, 7 * SIZE(X) + + bgtz I, .L52 + daddiu X, X, 8 * SIZE + .align 3 + +.L53: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + + ST t1, 0 * SIZE(X) + MUL t1, ALPHA, a5 + ST t2, 1 * SIZE(X) + MUL t2, ALPHA, a6 + + ST t3, 2 * SIZE(X) + MUL t3, ALPHA, a7 + ST t4, 3 * SIZE(X) + MUL t4, ALPHA, a8 + + ST t1, 4 * SIZE(X) + ST t2, 5 * SIZE(X) + ST t3, 6 * SIZE(X) + ST t4, 7 * SIZE(X) + + daddiu X, X, 8 * SIZE + .align 3 + +.L55: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L56: + LD a1, 0 * SIZE(X) + + MUL t1, ALPHA, a1 + + daddiu X, X, SIZE + daddiu I, I, -1 + + bgtz I, .L56 + ST t1, -1 * SIZE(X) + + j $31 + NOP + .align 3 + +.L60: + dsra I, N, 3 + move XX, X + + blez I, .L65 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD a2, 0 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + daddu X, X, INCX + LD a4, 0 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD a6, 0 * SIZE(X) + daddu X, X, INCX + LD a7, 0 * SIZE(X) + daddu X, X, INCX + LD a8, 0 * SIZE(X) + daddu X, X, INCX + + blez I, .L63 + NOP + .align 3 + +.L62: + MUL t1, ALPHA, a1 + LD a1, 0 * SIZE(X) + daddu X, X, INCX + + MUL t2, ALPHA, a2 + LD a2, 0 * SIZE(X) + daddu X, X, INCX + + MUL t3, ALPHA, a3 + LD a3, 0 * SIZE(X) + daddu X, X, INCX + + MUL t4, ALPHA, a4 + LD a4, 0 * SIZE(X) + daddu X, X, INCX + + ST t1, 0 * SIZE(XX) + daddu XX, XX, INCX + ST t2, 0 * SIZE(XX) + daddu XX, XX, INCX + ST t3, 0 * SIZE(XX) + daddu XX, XX, INCX + ST t4, 0 * SIZE(XX) + daddu XX, XX, INCX + + MUL t1, ALPHA, a5 + LD a5, 0 * SIZE(X) + daddu X, X, INCX + + MUL t2, ALPHA, a6 + LD a6, 0 * SIZE(X) + daddu X, X, INCX + + MUL t3, ALPHA, a7 + LD a7, 0 * SIZE(X) + daddu X, X, INCX + + MUL t4, ALPHA, a8 + LD a8, 0 * SIZE(X) + daddu X, X, INCX + + ST t1, 0 * SIZE(XX) + daddu XX, XX, INCX + ST t2, 0 * SIZE(XX) + daddu XX, XX, INCX + ST t3, 0 * SIZE(XX) + daddu XX, XX, INCX + ST t4, 0 * SIZE(XX) + daddiu I, I, -1 + + bgtz I, .L62 + daddu XX, XX, INCX + .align 3 + +.L63: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + + ST t1, 0 * SIZE(XX) + daddu XX, XX, INCX + ST t2, 0 * SIZE(XX) + daddu XX, XX, INCX + ST t3, 0 * SIZE(XX) + daddu XX, XX, INCX + ST t4, 0 * SIZE(XX) + daddu XX, XX, INCX + + MUL t1, ALPHA, a5 + MUL t2, ALPHA, a6 + MUL t3, ALPHA, a7 + MUL t4, ALPHA, a8 + + ST t1, 0 * SIZE(XX) + daddu XX, XX, INCX + ST t2, 0 * SIZE(XX) + daddu XX, XX, INCX + ST t3, 0 * SIZE(XX) + daddu XX, XX, INCX + ST t4, 0 * SIZE(XX) + daddu XX, XX, INCX + .align 3 + +.L65: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L66: + LD a1, 0 * SIZE(X) + + MUL t1, ALPHA, a1 + + daddiu I, I, -1 + ST t1, 0 * SIZE(X) + + bgtz I, .L66 + daddu X, X, INCX + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/snrm2.S b/kernel/mips64/snrm2.S new file mode 100644 index 0000000..04a48bd --- /dev/null +++ b/kernel/mips64/snrm2.S @@ -0,0 +1,337 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $2 +#define TEMP $3 + +#define a1 $f6 +#define a2 $f7 +#define a3 $f8 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 + +#define s1 $f0 +#define s2 $f1 + +#define t1 $f2 +#define t2 $f3 +#define t3 $f4 +#define t4 $f5 + + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + dmtc1 $0, s1 + li TEMP, SIZE + + blez N, .L999 + mov.d s2, s1 + + blez INCX, .L999 + dsll INCX, INCX, BASE_SHIFT + + bne INCX, TEMP, .L20 + dsra I, N, 3 + + blez I, .L15 + NOP + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD a3, 2 * SIZE(X) + LD a4, 3 * SIZE(X) + + LD a5, 4 * SIZE(X) + daddiu I, I, -1 + + cvt.d.s t1, a1 + LD a6, 5 * SIZE(X) + cvt.d.s t2, a2 + LD a7, 6 * SIZE(X) + cvt.d.s t3, a3 + LD a8, 7 * SIZE(X) + + blez I, .L13 + cvt.d.s t4, a4 + .align 3 + +.L12: + madd.d s1, s1, t1, t1 + LD a1, 8 * SIZE(X) + + cvt.d.s t1, a5 + NOP + + madd.d s2, s2, t2, t2 + LD a2, 9 * SIZE(X) + + cvt.d.s t2, a6 + NOP + + madd.d s1, s1, t3, t3 + LD a3, 10 * SIZE(X) + + cvt.d.s t3, a7 + NOP + + madd.d s2, s2, t4, t4 + LD a4, 11 * SIZE(X) + + cvt.d.s t4, a8 + NOP + + madd.d s1, s1, t1, t1 + LD a5, 12 * SIZE(X) + + cvt.d.s t1, a1 + NOP + + madd.d s2, s2, t2, t2 + LD a6, 13 * SIZE(X) + + cvt.d.s t2, a2 + daddiu I, I, -1 + + madd.d s1, s1, t3, t3 + LD a7, 14 * SIZE(X) + + cvt.d.s t3, a3 + daddiu X, X, 8 * SIZE + + madd.d s2, s2, t4, t4 + LD a8, 7 * SIZE(X) + + bgtz I, .L12 + cvt.d.s t4, a4 + .align 3 + +.L13: + madd.d s1, s1, t1, t1 + cvt.d.s t1, a5 + + madd.d s2, s2, t2, t2 + cvt.d.s t2, a6 + + madd.d s1, s1, t3, t3 + cvt.d.s t3, a7 + + madd.d s2, s2, t4, t4 + cvt.d.s t4, a8 + + madd.d s1, s1, t1, t1 + madd.d s2, s2, t2, t2 + madd.d s1, s1, t3, t3 + madd.d s2, s2, t4, t4 + + daddiu X, X, 8 * SIZE + .align 3 + +.L15: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + daddiu I, I, -1 + + cvt.d.s t1, a1 + + madd.d s1, s1, t1, t1 + + bgtz I, .L16 + daddiu X, X, SIZE + + j .L999 + NOP + .align 3 + +.L20: + blez I, .L25 + NOP + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + + LD a2, 0 * SIZE(X) + daddu X, X, INCX + + LD a3, 0 * SIZE(X) + daddu X, X, INCX + + LD a4, 0 * SIZE(X) + daddu X, X, INCX + + LD a5, 0 * SIZE(X) + daddu X, X, INCX + + LD a6, 0 * SIZE(X) + daddu X, X, INCX + + LD a7, 0 * SIZE(X) + daddu X, X, INCX + + LD a8, 0 * SIZE(X) + daddiu I, I, -1 + + cvt.d.s t1, a1 + cvt.d.s t2, a2 + cvt.d.s t3, a3 + cvt.d.s t4, a4 + + blez I, .L24 + daddu X, X, INCX + .align 3 + +.L23: + madd.d s1, s1, t1, t1 + LD a1, 0 * SIZE(X) + + cvt.d.s t1, a5 + daddu X, X, INCX + + madd.d s2, s2, t2, t2 + LD a2, 0 * SIZE(X) + + cvt.d.s t2, a6 + daddu X, X, INCX + + madd.d s1, s1, t3, t3 + LD a3, 0 * SIZE(X) + + cvt.d.s t3, a7 + daddu X, X, INCX + + madd.d s2, s2, t4, t4 + LD a4, 0 * SIZE(X) + + cvt.d.s t4, a8 + daddu X, X, INCX + + madd.d s1, s1, t1, t1 + LD a5, 0 * SIZE(X) + + cvt.d.s t1, a1 + daddu X, X, INCX + + madd.d s2, s2, t2, t2 + LD a6, 0 * SIZE(X) + + cvt.d.s t2, a2 + daddu X, X, INCX + + madd.d s1, s1, t3, t3 + LD a7, 0 * SIZE(X) + + cvt.d.s t3, a3 + daddu X, X, INCX + + madd.d s2, s2, t4, t4 + LD a8, 0 * SIZE(X) + + cvt.d.s t4, a4 + daddiu I, I, -1 + + bgtz I, .L23 + daddu X, X, INCX + .align 3 + +.L24: + madd.d s1, s1, t1, t1 + cvt.d.s t1, a5 + + madd.d s2, s2, t2, t2 + cvt.d.s t2, a6 + + madd.d s1, s1, t3, t3 + cvt.d.s t3, a7 + + madd.d s2, s2, t4, t4 + cvt.d.s t4, a8 + + madd.d s1, s1, t1, t1 + madd.d s2, s2, t2, t2 + madd.d s1, s1, t3, t3 + madd.d s2, s2, t4, t4 + .align 3 + +.L25: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + daddiu I, I, -1 + + cvt.d.s t1, a1 + + daddu X, X, INCX + + bgtz I, .L26 + madd.d s1, s1, t1, t1 + .align 3 + +.L999: + add.d s1, s1, s2 + + sqrt.d s1, s1 + + j $31 + cvt.s.d s1, s1 + + EPILOGUE diff --git a/kernel/mips64/swap.S b/kernel/mips64/swap.S new file mode 100644 index 0000000..d54abd7 --- /dev/null +++ b/kernel/mips64/swap.S @@ -0,0 +1,392 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 + +#define X $8 +#define INCX $9 +#define Y $10 +#define INCY $11 + +#define I $2 +#define TEMP $3 + +#define XX $5 +#define YY $6 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 +#define b1 $f8 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 + + PROLOGUE + + li TEMP, SIZE + NOP + + blez N, .L999 + dsll INCX, INCX, BASE_SHIFT + + bne INCX, TEMP, .L20 + dsll INCY, INCY, BASE_SHIFT + + bne INCY, TEMP, .L20 + dsra I, N, 3 + + blez I, .L15 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD a2, 1 * SIZE(X) + LD b2, 1 * SIZE(Y) + LD a3, 2 * SIZE(X) + LD b3, 2 * SIZE(Y) + LD a4, 3 * SIZE(X) + LD b4, 3 * SIZE(Y) + LD a5, 4 * SIZE(X) + LD b5, 4 * SIZE(Y) + LD a6, 5 * SIZE(X) + LD b6, 5 * SIZE(Y) + LD a7, 6 * SIZE(X) + LD b7, 6 * SIZE(Y) + LD a8, 7 * SIZE(X) + LD b8, 7 * SIZE(Y) + + blez I, .L13 + NOP + .align 3 + +.L12: + ST a1, 0 * SIZE(Y) + LD a1, 8 * SIZE(X) + ST b1, 0 * SIZE(X) + LD b1, 8 * SIZE(Y) + + ST a2, 1 * SIZE(Y) + LD a2, 9 * SIZE(X) + ST b2, 1 * SIZE(X) + LD b2, 9 * SIZE(Y) + + ST a3, 2 * SIZE(Y) + LD a3, 10 * SIZE(X) + ST b3, 2 * SIZE(X) + LD b3, 10 * SIZE(Y) + + ST a4, 3 * SIZE(Y) + LD a4, 11 * SIZE(X) + ST b4, 3 * SIZE(X) + LD b4, 11 * SIZE(Y) + + ST a5, 4 * SIZE(Y) + LD a5, 12 * SIZE(X) + ST b5, 4 * SIZE(X) + LD b5, 12 * SIZE(Y) + + ST a6, 5 * SIZE(Y) + LD a6, 13 * SIZE(X) + ST b6, 5 * SIZE(X) + LD b6, 13 * SIZE(Y) + + ST a7, 6 * SIZE(Y) + LD a7, 14 * SIZE(X) + ST b7, 6 * SIZE(X) + LD b7, 14 * SIZE(Y) + + ST a8, 7 * SIZE(Y) + LD a8, 15 * SIZE(X) + ST b8, 7 * SIZE(X) + LD b8, 15 * SIZE(Y) + + daddiu I, I, -1 + daddiu X, X, 8 * SIZE + + bgtz I, .L12 + daddiu Y, Y, 8 * SIZE + .align 3 + +.L13: + ST a1, 0 * SIZE(Y) + ST b1, 0 * SIZE(X) + ST a2, 1 * SIZE(Y) + ST b2, 1 * SIZE(X) + ST a3, 2 * SIZE(Y) + ST b3, 2 * SIZE(X) + ST a4, 3 * SIZE(Y) + ST b4, 3 * SIZE(X) + ST a5, 4 * SIZE(Y) + ST b5, 4 * SIZE(X) + ST a6, 5 * SIZE(Y) + ST b6, 5 * SIZE(X) + ST a7, 6 * SIZE(Y) + ST b7, 6 * SIZE(X) + ST a8, 7 * SIZE(Y) + ST b8, 7 * SIZE(X) + + daddiu X, X, 8 * SIZE + daddiu Y, Y, 8 * SIZE + .align 3 + +.L15: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + + daddiu X, X, SIZE + daddiu I, I, -1 + daddiu Y, Y, SIZE + + ST b1, -1 * SIZE(X) + bgtz I, .L16 + ST a1, -1 * SIZE(Y) + + j .L999 + NOP + .align 3 + +.L20: + dsra I, N, 3 + move XX, X + move YY, Y + + blez I, .L25 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD b1, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a2, 0 * SIZE(X) + daddu X, X, INCX + LD b2, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a3, 0 * SIZE(X) + daddu X, X, INCX + LD b3, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a4, 0 * SIZE(X) + daddu X, X, INCX + LD b4, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD b5, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a6, 0 * SIZE(X) + daddu X, X, INCX + LD b6, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a7, 0 * SIZE(X) + daddu X, X, INCX + LD b7, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a8, 0 * SIZE(X) + daddu X, X, INCX + LD b8, 0 * SIZE(Y) + daddu Y, Y, INCY + + blez I, .L23 + NOP + .align 3 + +.L22: + ST a1, 0 * SIZE(YY) + daddu YY, YY, INCY + LD a1, 0 * SIZE(X) + daddu X, X, INCX + + ST b1, 0 * SIZE(XX) + daddu XX, XX, INCX + LD b1, 0 * SIZE(Y) + daddu Y, Y, INCY + + ST a2, 0 * SIZE(YY) + daddu YY, YY, INCY + LD a2, 0 * SIZE(X) + daddu X, X, INCX + + ST b2, 0 * SIZE(XX) + daddu XX, XX, INCX + LD b2, 0 * SIZE(Y) + daddu Y, Y, INCY + + ST a3, 0 * SIZE(YY) + daddu YY, YY, INCY + LD a3, 0 * SIZE(X) + daddu X, X, INCX + + ST b3, 0 * SIZE(XX) + daddu XX, XX, INCX + LD b3, 0 * SIZE(Y) + daddu Y, Y, INCY + + ST a4, 0 * SIZE(YY) + daddu YY, YY, INCY + LD a4, 0 * SIZE(X) + daddu X, X, INCX + + ST b4, 0 * SIZE(XX) + daddu XX, XX, INCX + LD b4, 0 * SIZE(Y) + daddu Y, Y, INCY + + ST a5, 0 * SIZE(YY) + daddu YY, YY, INCY + LD a5, 0 * SIZE(X) + daddu X, X, INCX + + ST b5, 0 * SIZE(XX) + daddu XX, XX, INCX + LD b5, 0 * SIZE(Y) + daddu Y, Y, INCY + + ST a6, 0 * SIZE(YY) + daddu YY, YY, INCY + LD a6, 0 * SIZE(X) + daddu X, X, INCX + + ST b6, 0 * SIZE(XX) + daddu XX, XX, INCX + LD b6, 0 * SIZE(Y) + daddu Y, Y, INCY + + ST a7, 0 * SIZE(YY) + daddu YY, YY, INCY + LD a7, 0 * SIZE(X) + daddu X, X, INCX + + ST b7, 0 * SIZE(XX) + daddu XX, XX, INCX + LD b7, 0 * SIZE(Y) + daddu Y, Y, INCY + + ST a8, 0 * SIZE(YY) + daddu YY, YY, INCY + LD a8, 0 * SIZE(X) + daddu X, X, INCX + + ST b8, 0 * SIZE(XX) + daddu XX, XX, INCX + LD b8, 0 * SIZE(Y) + daddiu I, I, -1 + + bgtz I, .L22 + daddu Y, Y, INCY + .align 3 + +.L23: + ST a1, 0 * SIZE(YY) + daddu YY, YY, INCY + ST b1, 0 * SIZE(XX) + daddu XX, XX, INCX + ST a2, 0 * SIZE(YY) + daddu YY, YY, INCY + ST b2, 0 * SIZE(XX) + daddu XX, XX, INCX + ST a3, 0 * SIZE(YY) + daddu YY, YY, INCY + ST b3, 0 * SIZE(XX) + daddu XX, XX, INCX + ST a4, 0 * SIZE(YY) + daddu YY, YY, INCY + ST b4, 0 * SIZE(XX) + daddu XX, XX, INCX + ST a5, 0 * SIZE(YY) + daddu YY, YY, INCY + ST b5, 0 * SIZE(XX) + daddu XX, XX, INCX + ST a6, 0 * SIZE(YY) + daddu YY, YY, INCY + ST b6, 0 * SIZE(XX) + daddu XX, XX, INCX + ST a7, 0 * SIZE(YY) + daddu YY, YY, INCY + ST b7, 0 * SIZE(XX) + daddu XX, XX, INCX + ST a8, 0 * SIZE(YY) + daddu YY, YY, INCY + ST b8, 0 * SIZE(XX) + daddu XX, XX, INCX + .align 3 + +.L25: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + + daddiu I, I, -1 + ST a1, 0 * SIZE(Y) + ST b1, 0 * SIZE(X) + + daddu X, X, INCX + bgtz I, .L26 + daddu Y, Y, INCY + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/symv_L.S b/kernel/mips64/symv_L.S new file mode 100644 index 0000000..9a54eb7 --- /dev/null +++ b/kernel/mips64/symv_L.S @@ -0,0 +1,658 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define A $6 +#define LDA $7 +#define X $8 +#define INCX $9 +#define Y $10 +#define INCY $11 +#define BUFFER $5 + +#define XX $12 +#define YY $13 + +#define I $14 +#define IS $15 + +#define AO1 $16 +#define AO2 $17 + +#define Y1 $18 +#define TEMP $19 + +#define II INCX + +#define ALPHA $f13 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + +#define alpha1 $f8 +#define alpha2 $f9 + +#define x1 $f10 +#define x2 $f11 +#define x3 $f12 +#define x4 $f14 + +#define xsum1 $f15 +#define xsum2 $f16 + +#define ysum1 $f17 +#define ysum2 $f18 +#define ysum3 $f19 +#define ysum4 $f20 + + + PROLOGUE + + LDARG BUFFER, 0($sp) + daddiu $sp, $sp, -32 + + SDARG $16, 0($sp) + dsll LDA, LDA, BASE_SHIFT + SDARG $17, 8($sp) + dsll INCX, INCX, BASE_SHIFT + SDARG $18, 16($sp) + dsll INCY, INCY, BASE_SHIFT + SDARG $19, 24($sp) + nop + + blez M, .L999 + li IS, SIZE + + beq IS, INCX, .L05 + move Y1, Y + + dsra I, M, 2 + move XX, X + + blez I, .L02 + move X, BUFFER + .align 3 + +.L01: + LD a1, 0 * SIZE(XX) + daddu XX, XX, INCX + LD a2, 0 * SIZE(XX) + daddu XX, XX, INCX + LD a3, 0 * SIZE(XX) + daddu XX, XX, INCX + LD a4, 0 * SIZE(XX) + daddu XX, XX, INCX + + ST a1, 0 * SIZE(BUFFER) + ST a2, 1 * SIZE(BUFFER) + ST a3, 2 * SIZE(BUFFER) + ST a4, 3 * SIZE(BUFFER) + daddiu I, I, -1 + + bgtz I, .L01 + daddiu BUFFER, BUFFER, 4 * SIZE + .align 3 + +.L02: + andi I, M, 3 + blez I, .L05 + NOP + .align 3 + +.L03: + LD a1, 0 * SIZE(XX) + daddu XX, XX, INCX + + ST a1, 0 * SIZE(BUFFER) + daddiu I, I, -1 + + bgtz I, .L03 + daddiu BUFFER, BUFFER, 1 * SIZE + .align 3 + +.L05: + beq IS, INCY, .L10 + daddiu BUFFER, BUFFER, 255 + + li TEMP, -256 + and BUFFER, BUFFER, TEMP + + dsra I, M, 2 + move Y1, BUFFER + + blez I, .L07 + move YY, Y + .align 3 + +.L06: + LD a1, 0 * SIZE(YY) + daddu YY, YY, INCY + LD a2, 0 * SIZE(YY) + daddu YY, YY, INCY + LD a3, 0 * SIZE(YY) + daddu YY, YY, INCY + LD a4, 0 * SIZE(YY) + daddu YY, YY, INCY + + ST a1, 0 * SIZE(BUFFER) + ST a2, 1 * SIZE(BUFFER) + ST a3, 2 * SIZE(BUFFER) + ST a4, 3 * SIZE(BUFFER) + daddiu I, I, -1 + + bgtz I, .L06 + daddiu BUFFER, BUFFER, 4 * SIZE + .align 3 + +.L07: + andi I, M, 3 + blez I, .L10 + NOP + .align 3 + +.L08: + LD a1, 0 * SIZE(YY) + daddu YY, YY, INCY + + ST a1, 0 * SIZE(BUFFER) + daddiu I, I, -1 + + bgtz I, .L08 + daddiu BUFFER, BUFFER, 1 * SIZE + .align 3 + +.L10: + slti TEMP, M, 2 + nop + + bgtz TEMP, .L20 + li IS, 0 + .align 3 + +.L11: + dsll TEMP, IS, BASE_SHIFT + nop + + daddu XX, X, TEMP + daddu YY, Y1, TEMP + + LD alpha1, 0 * SIZE(XX) + move AO1, A + LD alpha2, 1 * SIZE(XX) + daddiu XX, XX, 2 * SIZE + + LD a1, 0 * SIZE(AO1) + daddu AO2, A, LDA + LD a2, 1 * SIZE(AO1) + daddiu AO1, AO1, 2 * SIZE + + LD a3, 0 * SIZE(AO2) + daddu A, AO2, LDA + LD a4, 1 * SIZE(AO2) + daddiu AO2, AO2, 2 * SIZE + + MUL xsum1, alpha1, a1 + daddiu A, A, 2 * SIZE + MUL xsum2, alpha1, a2 + dsubu II, M, IS + + MADD xsum1, xsum1, alpha2, a2 + MADD xsum2, xsum2, alpha2, a4 + daddiu II, II, - 2 + + MUL alpha1, ALPHA, alpha1 + daddiu YY, YY, 2 * SIZE + MUL alpha2, ALPHA, alpha2 + dsra I, II, 3 + + blez I, .L15 + daddiu I, I, -1 + + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + LD x3, 2 * SIZE(XX) + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + LD a5, 2 * SIZE(AO1) + LD a6, 3 * SIZE(AO1) + + LD a3, 0 * SIZE(AO2) + LD a4, 1 * SIZE(AO2) + LD a7, 2 * SIZE(AO2) + LD a8, 3 * SIZE(AO2) + + LD ysum1, 0 * SIZE(YY) + LD ysum2, 1 * SIZE(YY) + + blez I, .L13 + LD ysum3, 2 * SIZE(YY) + .align 3 + +.L12: + MADD ysum1, ysum1, alpha1, a1 + LD ysum4, 3 * SIZE(YY) + MADD ysum2, ysum2, alpha1, a2 + LD x4, 3 * SIZE(XX) + MADD xsum1, xsum1, x1, a1 + LD a1, 4 * SIZE(AO1) + MADD xsum2, xsum2, x1, a3 + LD x1, 4 * SIZE(XX) + + MADD ysum1, ysum1, alpha2, a3 + LD a3, 4 * SIZE(AO2) + MADD ysum2, ysum2, alpha2, a4 + daddiu I, I, -1 + MADD xsum1, xsum1, x2, a2 + LD a2, 5 * SIZE(AO1) + MADD xsum2, xsum2, x2, a4 + LD a4, 5 * SIZE(AO2) + + ST ysum1, 0 * SIZE(YY) + LD ysum1, 4 * SIZE(YY) + ST ysum2, 1 * SIZE(YY) + LD ysum2, 5 * SIZE(YY) + + MADD ysum3, ysum3, alpha1, a5 + nop + MADD ysum4, ysum4, alpha1, a6 + LD x2, 5 * SIZE(XX) + MADD xsum1, xsum1, x3, a5 + LD a5, 6 * SIZE(AO1) + MADD xsum2, xsum2, x3, a7 + LD x3, 6 * SIZE(XX) + + MADD ysum3, ysum3, alpha2, a7 + LD a7, 6 * SIZE(AO2) + MADD ysum4, ysum4, alpha2, a8 + daddiu XX, XX, 8 * SIZE + MADD xsum1, xsum1, x4, a6 + LD a6, 7 * SIZE(AO1) + MADD xsum2, xsum2, x4, a8 + LD a8, 7 * SIZE(AO2) + + ST ysum3, 2 * SIZE(YY) + LD ysum3, 6 * SIZE(YY) + ST ysum4, 3 * SIZE(YY) + LD ysum4, 7 * SIZE(YY) + + MADD ysum1, ysum1, alpha1, a1 + daddiu AO2, AO2, 8 * SIZE + MADD ysum2, ysum2, alpha1, a2 + LD x4,-1 * SIZE(XX) + MADD xsum1, xsum1, x1, a1 + LD a1, 8 * SIZE(AO1) + MADD xsum2, xsum2, x1, a3 + LD x1, 0 * SIZE(XX) + + MADD ysum1, ysum1, alpha2, a3 + LD a3, 0 * SIZE(AO2) + MADD ysum2, ysum2, alpha2, a4 + nop + MADD xsum1, xsum1, x2, a2 + LD a2, 9 * SIZE(AO1) + MADD xsum2, xsum2, x2, a4 + LD a4, 1 * SIZE(AO2) + + ST ysum1, 4 * SIZE(YY) + LD ysum1, 8 * SIZE(YY) + ST ysum2, 5 * SIZE(YY) + LD ysum2, 9 * SIZE(YY) + + MADD ysum3, ysum3, alpha1, a5 + daddiu AO1, AO1, 8 * SIZE + MADD ysum4, ysum4, alpha1, a6 + LD x2, 1 * SIZE(XX) + MADD xsum1, xsum1, x3, a5 + LD a5, 2 * SIZE(AO1) + MADD xsum2, xsum2, x3, a7 + LD x3, 2 * SIZE(XX) + + MADD ysum3, ysum3, alpha2, a7 + LD a7, 2 * SIZE(AO2) + MADD ysum4, ysum4, alpha2, a8 + daddiu YY, YY, 8 * SIZE + MADD xsum1, xsum1, x4, a6 + LD a6, 3 * SIZE(AO1) + MADD xsum2, xsum2, x4, a8 + LD a8, 3 * SIZE(AO2) + + ST ysum3,-2 * SIZE(YY) + LD ysum3, 2 * SIZE(YY) + bgtz I, .L12 + ST ysum4,-1 * SIZE(YY) + .align 3 + +.L13: + MADD ysum1, ysum1, alpha1, a1 + LD ysum4, 3 * SIZE(YY) + MADD ysum2, ysum2, alpha1, a2 + LD x4, 3 * SIZE(XX) + MADD xsum1, xsum1, x1, a1 + LD a1, 4 * SIZE(AO1) + MADD xsum2, xsum2, x1, a3 + LD x1, 4 * SIZE(XX) + + MADD ysum1, ysum1, alpha2, a3 + LD a3, 4 * SIZE(AO2) + MADD ysum2, ysum2, alpha2, a4 + MADD xsum1, xsum1, x2, a2 + LD a2, 5 * SIZE(AO1) + MADD xsum2, xsum2, x2, a4 + LD a4, 5 * SIZE(AO2) + LD x2, 5 * SIZE(XX) + + ST ysum1, 0 * SIZE(YY) + ST ysum2, 1 * SIZE(YY) + LD ysum1, 4 * SIZE(YY) + LD ysum2, 5 * SIZE(YY) + + MADD ysum3, ysum3, alpha1, a5 + MADD ysum4, ysum4, alpha1, a6 + MADD xsum1, xsum1, x3, a5 + LD a5, 6 * SIZE(AO1) + MADD xsum2, xsum2, x3, a7 + LD x3, 6 * SIZE(XX) + + MADD ysum3, ysum3, alpha2, a7 + LD a7, 6 * SIZE(AO2) + MADD ysum4, ysum4, alpha2, a8 + MADD xsum1, xsum1, x4, a6 + LD a6, 7 * SIZE(AO1) + MADD xsum2, xsum2, x4, a8 + LD a8, 7 * SIZE(AO2) + LD x4, 7 * SIZE(XX) + + ST ysum3, 2 * SIZE(YY) + ST ysum4, 3 * SIZE(YY) + LD ysum3, 6 * SIZE(YY) + LD ysum4, 7 * SIZE(YY) + + MADD ysum1, ysum1, alpha1, a1 + MADD ysum2, ysum2, alpha1, a2 + MADD xsum1, xsum1, x1, a1 + MADD xsum2, xsum2, x1, a3 + + MADD ysum1, ysum1, alpha2, a3 + MADD ysum2, ysum2, alpha2, a4 + MADD xsum1, xsum1, x2, a2 + MADD xsum2, xsum2, x2, a4 + + MADD ysum3, ysum3, alpha1, a5 + MADD ysum4, ysum4, alpha1, a6 + MADD xsum1, xsum1, x3, a5 + MADD xsum2, xsum2, x3, a7 + + MADD ysum3, ysum3, alpha2, a7 + daddiu XX, XX, 8 * SIZE + MADD ysum4, ysum4, alpha2, a8 + daddiu AO1, AO1, 8 * SIZE + MADD xsum1, xsum1, x4, a6 + daddiu AO2, AO2, 8 * SIZE + MADD xsum2, xsum2, x4, a8 + + ST ysum1, 4 * SIZE(YY) + ST ysum2, 5 * SIZE(YY) + ST ysum3, 6 * SIZE(YY) + ST ysum4, 7 * SIZE(YY) + daddiu YY, YY, 8 * SIZE + .align 3 + +.L15: + andi I, II, 4 + NOP + blez I, .L16 + NOP + + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + LD x3, 2 * SIZE(XX) + LD x4, 3 * SIZE(XX) + daddiu XX, XX, 4 * SIZE + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + LD a5, 2 * SIZE(AO1) + LD a6, 3 * SIZE(AO1) + daddiu AO1, AO1, 4 * SIZE + + LD a3, 0 * SIZE(AO2) + LD a4, 1 * SIZE(AO2) + LD a7, 2 * SIZE(AO2) + LD a8, 3 * SIZE(AO2) + daddiu AO2, AO2, 4 * SIZE + + LD ysum1, 0 * SIZE(YY) + LD ysum2, 1 * SIZE(YY) + LD ysum3, 2 * SIZE(YY) + LD ysum4, 3 * SIZE(YY) + + MADD ysum1, ysum1, alpha1, a1 + MADD ysum2, ysum2, alpha1, a2 + MADD xsum1, xsum1, x1, a1 + MADD xsum2, xsum2, x1, a3 + + MADD ysum1, ysum1, alpha2, a3 + MADD ysum2, ysum2, alpha2, a4 + MADD xsum1, xsum1, x2, a2 + MADD xsum2, xsum2, x2, a4 + + MADD ysum3, ysum3, alpha1, a5 + MADD ysum4, ysum4, alpha1, a6 + MADD xsum1, xsum1, x3, a5 + MADD xsum2, xsum2, x3, a7 + + MADD ysum3, ysum3, alpha2, a7 + MADD ysum4, ysum4, alpha2, a8 + MADD xsum1, xsum1, x4, a6 + MADD xsum2, xsum2, x4, a8 + + ST ysum1, 0 * SIZE(YY) + ST ysum2, 1 * SIZE(YY) + ST ysum3, 2 * SIZE(YY) + ST ysum4, 3 * SIZE(YY) + + daddiu YY, YY, 4 * SIZE + .align 3 + +.L16: + andi I, II, 2 + NOP + blez I, .L17 + NOP + + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + daddiu XX, XX, 2 * SIZE + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + daddiu AO1, AO1, 2 * SIZE + + LD a3, 0 * SIZE(AO2) + LD a4, 1 * SIZE(AO2) + daddiu AO2, AO2, 2 * SIZE + + LD ysum1, 0 * SIZE(YY) + LD ysum2, 1 * SIZE(YY) + + MADD ysum1, ysum1, alpha1, a1 + MADD ysum2, ysum2, alpha1, a2 + MADD xsum1, xsum1, x1, a1 + MADD xsum2, xsum2, x1, a3 + + MADD ysum1, ysum1, alpha2, a3 + MADD ysum2, ysum2, alpha2, a4 + MADD xsum1, xsum1, x2, a2 + MADD xsum2, xsum2, x2, a4 + + ST ysum1, 0 * SIZE(YY) + ST ysum2, 1 * SIZE(YY) + daddiu YY, YY, 2 * SIZE + .align 3 + +.L17: + andi I, M, 1 + NOP + blez I, .L19 + NOP + + LD x1, 0 * SIZE(XX) + daddiu XX, XX, 1 * SIZE + LD a1, 0 * SIZE(AO1) + daddiu AO1, AO1, 1 * SIZE + + LD a3, 0 * SIZE(AO2) + daddiu AO2, AO2, 1 * SIZE + LD ysum1, 0 * SIZE(YY) + + MADD ysum1, ysum1, alpha1, a1 + MADD xsum1, xsum1, x1, a1 + MADD ysum1, ysum1, alpha2, a3 + MADD xsum2, xsum2, x1, a3 + + ST ysum1, 0 * SIZE(YY) + .align 3 + +.L19: + dsll TEMP, IS, BASE_SHIFT + daddu TEMP, Y1, TEMP + + LD ysum1, 0 * SIZE(TEMP) + LD ysum2, 1 * SIZE(TEMP) + + MADD ysum1, ysum1, ALPHA, xsum1 + MADD ysum2, ysum2, ALPHA, xsum2 + + ST ysum1, 0 * SIZE(TEMP) + ST ysum2, 1 * SIZE(TEMP) + + daddiu TEMP, IS, 4 + slt TEMP, M, TEMP + + beqz TEMP, .L11 + daddiu IS, IS, 2 + .align 3 + +.L20: + andi I, M, 1 + dsll TEMP, IS, BASE_SHIFT + blez I, .L900 + daddu XX, X, TEMP + + daddu YY, Y1, TEMP + + LD x1, 0 * SIZE(XX) + LD ysum1, 0 * SIZE(YY) + LD a1, 0 * SIZE(A) + + MUL xsum1, a1, x1 + + MADD ysum1, ysum1, ALPHA, xsum1 + + ST ysum1, 0 * SIZE(YY) + .align 3 + +.L900: + li IS, SIZE + + beq INCY, IS, .L999 + NOP + + dsra I, M, 2 + blez I, .L905 + NOP + .align 3 + +.L902: + LD a1, 0 * SIZE(Y1) + LD a2, 1 * SIZE(Y1) + LD a3, 2 * SIZE(Y1) + LD a4, 3 * SIZE(Y1) + + ST a1, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a2, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a3, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a4, 0 * SIZE(Y) + daddu Y, Y, INCY + + daddiu I, I, -1 + + bgtz I, .L902 + daddiu Y1, Y1, 4 * SIZE + .align 3 + +.L905: + andi I, M, 3 + blez I, .L999 + NOP + .align 3 + +.L906: + LD a1, 0 * SIZE(Y1) + daddiu Y1, Y1, 1 * SIZE + + ST a1, 0 * SIZE(Y) + daddiu I, I, -1 + + bgtz I, .L906 + daddu Y, Y, INCY + .align 3 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + LDARG $18, 16($sp) + LDARG $19, 24($sp) + + j $31 + daddiu $sp, $sp, 32 + + EPILOGUE diff --git a/kernel/mips64/symv_U.S b/kernel/mips64/symv_U.S new file mode 100644 index 0000000..285e591 --- /dev/null +++ b/kernel/mips64/symv_U.S @@ -0,0 +1,782 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define A $6 +#define LDA $7 +#define X $8 +#define INCX $9 +#define Y $10 +#define INCY $11 +#define BUFFER $5 + +#define XX $12 +#define YY $13 + +#define I $14 +#define IS $15 + +#define AO1 $16 +#define AO2 $17 + +#define Y1 $18 +#define TEMP $19 + +#define ALPHA $f13 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + +#define alpha1 $f8 +#define alpha2 $f9 + +#define x1 $f10 +#define x2 $f11 +#define x3 $f12 +#define x4 $f14 + +#define xsum1 $f15 +#define xsum2 $f16 + +#define ysum1 $f17 +#define ysum2 $f18 +#define ysum3 $f19 +#define ysum4 $f20 + + + PROLOGUE + + LDARG BUFFER, 0($sp) + daddiu $sp, $sp, -32 + + SDARG $16, 0($sp) + dsll LDA, LDA, BASE_SHIFT + SDARG $17, 8($sp) + dsll INCX, INCX, BASE_SHIFT + SDARG $18, 16($sp) + dsll INCY, INCY, BASE_SHIFT + SDARG $19, 24($sp) + nop + + blez M, .L999 + li IS, SIZE + + beq IS, INCX, .L05 + move Y1, Y + + dsra I, M, 2 + move XX, X + + blez I, .L02 + move X, BUFFER + .align 3 + +.L01: + LD a1, 0 * SIZE(XX) + daddu XX, XX, INCX + LD a2, 0 * SIZE(XX) + daddu XX, XX, INCX + LD a3, 0 * SIZE(XX) + daddu XX, XX, INCX + LD a4, 0 * SIZE(XX) + daddu XX, XX, INCX + + ST a1, 0 * SIZE(BUFFER) + ST a2, 1 * SIZE(BUFFER) + ST a3, 2 * SIZE(BUFFER) + ST a4, 3 * SIZE(BUFFER) + daddiu I, I, -1 + + bgtz I, .L01 + daddiu BUFFER, BUFFER, 4 * SIZE + .align 3 + +.L02: + andi I, M, 3 + blez I, .L05 + NOP + .align 3 + +.L03: + LD a1, 0 * SIZE(XX) + daddu XX, XX, INCX + + ST a1, 0 * SIZE(BUFFER) + daddiu I, I, -1 + + bgtz I, .L03 + daddiu BUFFER, BUFFER, 1 * SIZE + .align 3 + +.L05: + beq IS, INCY, .L10 + daddiu BUFFER, BUFFER, 255 + + li TEMP, -256 + and BUFFER, BUFFER, TEMP + + dsra I, M, 2 + move Y1, BUFFER + + blez I, .L07 + move YY, Y + .align 3 + +.L06: + LD a1, 0 * SIZE(YY) + daddu YY, YY, INCY + LD a2, 0 * SIZE(YY) + daddu YY, YY, INCY + LD a3, 0 * SIZE(YY) + daddu YY, YY, INCY + LD a4, 0 * SIZE(YY) + daddu YY, YY, INCY + + ST a1, 0 * SIZE(BUFFER) + ST a2, 1 * SIZE(BUFFER) + ST a3, 2 * SIZE(BUFFER) + ST a4, 3 * SIZE(BUFFER) + daddiu I, I, -1 + + bgtz I, .L06 + daddiu BUFFER, BUFFER, 4 * SIZE + .align 3 + +.L07: + andi I, M, 3 + blez I, .L10 + NOP + .align 3 + +.L08: + LD a1, 0 * SIZE(YY) + daddu YY, YY, INCY + + ST a1, 0 * SIZE(BUFFER) + daddiu I, I, -1 + + bgtz I, .L08 + daddiu BUFFER, BUFFER, 1 * SIZE + .align 3 + +.L10: + slti TEMP, M, 2 + nop + + bgtz TEMP, .L20 + li IS, 0 + .align 3 + +.L11: + dsll TEMP, IS, BASE_SHIFT + daddu TEMP, X, TEMP + + LD alpha1, 0 * SIZE(TEMP) + LD alpha2, 1 * SIZE(TEMP) + + move AO1, A + dsra I, IS, 3 + daddu AO2, A, LDA + daddu A, AO2, LDA + + MTC $0, xsum1 + MTC $0, xsum2 + + move XX, X + MUL alpha1, ALPHA, alpha1 + move YY, Y1 + MUL alpha2, ALPHA, alpha2 + + blez I, .L15 + daddiu I, I, -1 + + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + LD x3, 2 * SIZE(XX) + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + LD a5, 2 * SIZE(AO1) + LD a6, 3 * SIZE(AO1) + + LD a3, 0 * SIZE(AO2) + LD a4, 1 * SIZE(AO2) + LD a7, 2 * SIZE(AO2) + LD a8, 3 * SIZE(AO2) + + LD ysum1, 0 * SIZE(YY) + LD ysum2, 1 * SIZE(YY) + + blez I, .L13 + LD ysum3, 2 * SIZE(YY) + .align 3 + +.L12: + MADD ysum1, ysum1, alpha1, a1 + LD ysum4, 3 * SIZE(YY) + MADD ysum2, ysum2, alpha1, a2 + LD x4, 3 * SIZE(XX) + MADD xsum1, xsum1, x1, a1 + LD a1, 4 * SIZE(AO1) + MADD xsum2, xsum2, x1, a3 + LD x1, 4 * SIZE(XX) + + MADD ysum1, ysum1, alpha2, a3 + LD a3, 4 * SIZE(AO2) + MADD ysum2, ysum2, alpha2, a4 + daddiu I, I, -1 + MADD xsum1, xsum1, x2, a2 + LD a2, 5 * SIZE(AO1) + MADD xsum2, xsum2, x2, a4 + LD a4, 5 * SIZE(AO2) + + ST ysum1, 0 * SIZE(YY) + LD ysum1, 4 * SIZE(YY) + ST ysum2, 1 * SIZE(YY) + LD ysum2, 5 * SIZE(YY) + + MADD ysum3, ysum3, alpha1, a5 + nop + MADD ysum4, ysum4, alpha1, a6 + LD x2, 5 * SIZE(XX) + MADD xsum1, xsum1, x3, a5 + LD a5, 6 * SIZE(AO1) + MADD xsum2, xsum2, x3, a7 + LD x3, 6 * SIZE(XX) + + MADD ysum3, ysum3, alpha2, a7 + LD a7, 6 * SIZE(AO2) + MADD ysum4, ysum4, alpha2, a8 + daddiu XX, XX, 8 * SIZE + MADD xsum1, xsum1, x4, a6 + LD a6, 7 * SIZE(AO1) + MADD xsum2, xsum2, x4, a8 + LD a8, 7 * SIZE(AO2) + + ST ysum3, 2 * SIZE(YY) + LD ysum3, 6 * SIZE(YY) + ST ysum4, 3 * SIZE(YY) + LD ysum4, 7 * SIZE(YY) + + MADD ysum1, ysum1, alpha1, a1 + daddiu AO2, AO2, 8 * SIZE + MADD ysum2, ysum2, alpha1, a2 + LD x4,-1 * SIZE(XX) + MADD xsum1, xsum1, x1, a1 + LD a1, 8 * SIZE(AO1) + MADD xsum2, xsum2, x1, a3 + LD x1, 0 * SIZE(XX) + + MADD ysum1, ysum1, alpha2, a3 + LD a3, 0 * SIZE(AO2) + MADD ysum2, ysum2, alpha2, a4 + nop + MADD xsum1, xsum1, x2, a2 + LD a2, 9 * SIZE(AO1) + MADD xsum2, xsum2, x2, a4 + LD a4, 1 * SIZE(AO2) + + ST ysum1, 4 * SIZE(YY) + LD ysum1, 8 * SIZE(YY) + ST ysum2, 5 * SIZE(YY) + LD ysum2, 9 * SIZE(YY) + + MADD ysum3, ysum3, alpha1, a5 + daddiu AO1, AO1, 8 * SIZE + MADD ysum4, ysum4, alpha1, a6 + LD x2, 1 * SIZE(XX) + MADD xsum1, xsum1, x3, a5 + LD a5, 2 * SIZE(AO1) + MADD xsum2, xsum2, x3, a7 + LD x3, 2 * SIZE(XX) + + MADD ysum3, ysum3, alpha2, a7 + LD a7, 2 * SIZE(AO2) + MADD ysum4, ysum4, alpha2, a8 + daddiu YY, YY, 8 * SIZE + MADD xsum1, xsum1, x4, a6 + LD a6, 3 * SIZE(AO1) + MADD xsum2, xsum2, x4, a8 + LD a8, 3 * SIZE(AO2) + + ST ysum3,-2 * SIZE(YY) + LD ysum3, 2 * SIZE(YY) + bgtz I, .L12 + ST ysum4,-1 * SIZE(YY) + .align 3 + +.L13: + MADD ysum1, ysum1, alpha1, a1 + LD ysum4, 3 * SIZE(YY) + MADD ysum2, ysum2, alpha1, a2 + LD x4, 3 * SIZE(XX) + MADD xsum1, xsum1, x1, a1 + LD a1, 4 * SIZE(AO1) + MADD xsum2, xsum2, x1, a3 + LD x1, 4 * SIZE(XX) + + MADD ysum1, ysum1, alpha2, a3 + LD a3, 4 * SIZE(AO2) + MADD ysum2, ysum2, alpha2, a4 + MADD xsum1, xsum1, x2, a2 + LD a2, 5 * SIZE(AO1) + MADD xsum2, xsum2, x2, a4 + LD a4, 5 * SIZE(AO2) + LD x2, 5 * SIZE(XX) + + ST ysum1, 0 * SIZE(YY) + ST ysum2, 1 * SIZE(YY) + LD ysum1, 4 * SIZE(YY) + LD ysum2, 5 * SIZE(YY) + + MADD ysum3, ysum3, alpha1, a5 + MADD ysum4, ysum4, alpha1, a6 + MADD xsum1, xsum1, x3, a5 + LD a5, 6 * SIZE(AO1) + MADD xsum2, xsum2, x3, a7 + LD x3, 6 * SIZE(XX) + + MADD ysum3, ysum3, alpha2, a7 + LD a7, 6 * SIZE(AO2) + MADD ysum4, ysum4, alpha2, a8 + MADD xsum1, xsum1, x4, a6 + LD a6, 7 * SIZE(AO1) + MADD xsum2, xsum2, x4, a8 + LD a8, 7 * SIZE(AO2) + LD x4, 7 * SIZE(XX) + + ST ysum3, 2 * SIZE(YY) + ST ysum4, 3 * SIZE(YY) + LD ysum3, 6 * SIZE(YY) + LD ysum4, 7 * SIZE(YY) + + MADD ysum1, ysum1, alpha1, a1 + MADD ysum2, ysum2, alpha1, a2 + MADD xsum1, xsum1, x1, a1 + MADD xsum2, xsum2, x1, a3 + + MADD ysum1, ysum1, alpha2, a3 + MADD ysum2, ysum2, alpha2, a4 + MADD xsum1, xsum1, x2, a2 + MADD xsum2, xsum2, x2, a4 + + MADD ysum3, ysum3, alpha1, a5 + MADD ysum4, ysum4, alpha1, a6 + MADD xsum1, xsum1, x3, a5 + MADD xsum2, xsum2, x3, a7 + + MADD ysum3, ysum3, alpha2, a7 + daddiu XX, XX, 8 * SIZE + MADD ysum4, ysum4, alpha2, a8 + daddiu AO1, AO1, 8 * SIZE + MADD xsum1, xsum1, x4, a6 + daddiu AO2, AO2, 8 * SIZE + MADD xsum2, xsum2, x4, a8 + + ST ysum1, 4 * SIZE(YY) + ST ysum2, 5 * SIZE(YY) + ST ysum3, 6 * SIZE(YY) + ST ysum4, 7 * SIZE(YY) + daddiu YY, YY, 8 * SIZE + .align 3 + +.L15: + andi I, IS, 4 + NOP + blez I, .L16 + NOP + + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + LD x3, 2 * SIZE(XX) + LD x4, 3 * SIZE(XX) + daddiu XX, XX, 4 * SIZE + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + LD a5, 2 * SIZE(AO1) + LD a6, 3 * SIZE(AO1) + daddiu AO1, AO1, 4 * SIZE + + LD a3, 0 * SIZE(AO2) + LD a4, 1 * SIZE(AO2) + LD a7, 2 * SIZE(AO2) + LD a8, 3 * SIZE(AO2) + daddiu AO2, AO2, 4 * SIZE + + LD ysum1, 0 * SIZE(YY) + LD ysum2, 1 * SIZE(YY) + LD ysum3, 2 * SIZE(YY) + LD ysum4, 3 * SIZE(YY) + + MADD ysum1, ysum1, alpha1, a1 + MADD ysum2, ysum2, alpha1, a2 + MADD xsum1, xsum1, x1, a1 + MADD xsum2, xsum2, x1, a3 + + MADD ysum1, ysum1, alpha2, a3 + MADD ysum2, ysum2, alpha2, a4 + MADD xsum1, xsum1, x2, a2 + MADD xsum2, xsum2, x2, a4 + + MADD ysum3, ysum3, alpha1, a5 + MADD ysum4, ysum4, alpha1, a6 + MADD xsum1, xsum1, x3, a5 + MADD xsum2, xsum2, x3, a7 + + MADD ysum3, ysum3, alpha2, a7 + MADD ysum4, ysum4, alpha2, a8 + MADD xsum1, xsum1, x4, a6 + MADD xsum2, xsum2, x4, a8 + + ST ysum1, 0 * SIZE(YY) + ST ysum2, 1 * SIZE(YY) + ST ysum3, 2 * SIZE(YY) + ST ysum4, 3 * SIZE(YY) + + daddiu YY, YY, 4 * SIZE + .align 3 + +.L16: + andi I, IS, 2 + NOP + blez I, .L19 + NOP + + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + daddiu XX, XX, 2 * SIZE + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + daddiu AO1, AO1, 2 * SIZE + + LD a3, 0 * SIZE(AO2) + LD a4, 1 * SIZE(AO2) + daddiu AO2, AO2, 2 * SIZE + + LD ysum1, 0 * SIZE(YY) + LD ysum2, 1 * SIZE(YY) + + MADD ysum1, ysum1, alpha1, a1 + MADD ysum2, ysum2, alpha1, a2 + MADD xsum1, xsum1, x1, a1 + MADD xsum2, xsum2, x1, a3 + + MADD ysum1, ysum1, alpha2, a3 + MADD ysum2, ysum2, alpha2, a4 + MADD xsum1, xsum1, x2, a2 + MADD xsum2, xsum2, x2, a4 + + ST ysum1, 0 * SIZE(YY) + ST ysum2, 1 * SIZE(YY) + .align 3 + +.L19: + dsll TEMP, IS, BASE_SHIFT + daddu TEMP, Y1, TEMP + + LD ysum1, 0 * SIZE(TEMP) + LD ysum2, 1 * SIZE(TEMP) + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + + LD a3, 0 * SIZE(AO2) + LD a4, 1 * SIZE(AO2) + + MUL xsum1, ALPHA, xsum1 + MUL xsum2, ALPHA, xsum2 + + MADD xsum1, xsum1, alpha1, a1 + MADD xsum2, xsum2, alpha1, a3 + MADD xsum1, xsum1, alpha2, a3 + MADD xsum2, xsum2, alpha2, a4 + + ADD ysum1, ysum1, xsum1 + ADD ysum2, ysum2, xsum2 + + ST ysum1, 0 * SIZE(TEMP) + ST ysum2, 1 * SIZE(TEMP) + + daddiu TEMP, IS, 4 + slt TEMP, M, TEMP + + beqz TEMP, .L11 + daddiu IS, IS, 2 + .align 3 + +.L20: + andi TEMP, M, 1 + nop + blez TEMP, .L900 + nop + .align 3 + + dsll TEMP, IS, BASE_SHIFT + daddu TEMP, X, TEMP + + LD alpha1, 0 * SIZE(TEMP) + + move AO1, A + dsra I, IS, 2 + daddu A, AO1, LDA + + MTC $0, xsum1 + MTC $0, xsum2 + + move XX, X + MUL alpha1, ALPHA, alpha1 + move YY, Y1 + + blez I, .L25 + daddiu I, I, -1 + + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + LD x3, 2 * SIZE(XX) + LD x4, 3 * SIZE(XX) + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + LD a3, 2 * SIZE(AO1) + LD a4, 3 * SIZE(AO1) + + LD ysum1, 0 * SIZE(YY) + LD ysum2, 1 * SIZE(YY) + LD ysum3, 2 * SIZE(YY) + + blez I, .L23 + LD ysum4, 3 * SIZE(YY) + .align 3 + +.L22: + MADD ysum1, ysum1, alpha1, a1 + daddiu I, I, -1 + MADD xsum1, xsum1, x1, a1 + LD a1, 4 * SIZE(AO1) + MADD ysum2, ysum2, alpha1, a2 + LD x1, 4 * SIZE(XX) + MADD xsum2, xsum2, x2, a2 + LD a2, 5 * SIZE(AO1) + + ST ysum1, 0 * SIZE(YY) + LD ysum1, 4 * SIZE(YY) + + ST ysum2, 1 * SIZE(YY) + LD ysum2, 5 * SIZE(YY) + + daddiu AO1, AO1, 4 * SIZE + nop + + MADD ysum3, ysum3, alpha1, a3 + LD x2, 5 * SIZE(XX) + MADD xsum1, xsum1, x3, a3 + LD a3, 2 * SIZE(AO1) + MADD ysum4, ysum4, alpha1, a4 + LD x3, 6 * SIZE(XX) + MADD xsum2, xsum2, x4, a4 + LD a4, 3 * SIZE(AO1) + + ST ysum3, 2 * SIZE(YY) + LD ysum3, 6 * SIZE(YY) + ST ysum4, 3 * SIZE(YY) + LD ysum4, 7 * SIZE(YY) + + daddiu XX, XX, 4 * SIZE + daddiu YY, YY, 4 * SIZE + + bgtz I, .L22 + LD x4, 3 * SIZE(XX) + .align 3 + +.L23: + MADD ysum1, ysum1, alpha1, a1 + daddiu AO1, AO1, 4 * SIZE + MADD xsum1, xsum1, x1, a1 + daddiu XX, XX, 4 * SIZE + MADD ysum2, ysum2, alpha1, a2 + daddiu YY, YY, 4 * SIZE + MADD xsum2, xsum2, x2, a2 + nop + + MADD ysum3, ysum3, alpha1, a3 + ST ysum1,-4 * SIZE(YY) + MADD xsum1, xsum1, x3, a3 + ST ysum2,-3 * SIZE(YY) + MADD ysum4, ysum4, alpha1, a4 + ST ysum3,-2 * SIZE(YY) + MADD xsum2, xsum2, x4, a4 + ST ysum4,-1 * SIZE(YY) + .align 3 + +.L25: + andi I, IS, 2 + NOP + blez I, .L26 + NOP + + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + daddiu XX, XX, 2 * SIZE + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + daddiu AO1, AO1, 2 * SIZE + + LD ysum1, 0 * SIZE(YY) + LD ysum2, 1 * SIZE(YY) + + MADD ysum1, ysum1, alpha1, a1 + MADD xsum1, xsum1, x1, a1 + + MADD ysum2, ysum2, alpha1, a2 + MADD xsum2, xsum2, x2, a2 + + ST ysum1, 0 * SIZE(YY) + ST ysum2, 1 * SIZE(YY) + + daddiu YY, YY, 2 * SIZE + .align 3 + +.L26: + andi I, IS, 1 + NOP + blez I, .L29 + NOP + + LD x1, 0 * SIZE(XX) + daddiu XX, XX, 1 * SIZE + LD a1, 0 * SIZE(AO1) + daddiu AO1, AO1, 1* SIZE + + LD ysum1, 0 * SIZE(YY) + + MADD ysum1, ysum1, alpha1, a1 + MADD xsum1, xsum1, x1, a1 + + ST ysum1, 0 * SIZE(YY) + .align 3 + +.L29: + dsll TEMP, IS, BASE_SHIFT + daddu TEMP, Y1, TEMP + + LD ysum1, 0 * SIZE(TEMP) + + LD a1, 0 * SIZE(AO1) + + ADD xsum1, xsum1, xsum2 + + MUL xsum1, ALPHA, xsum1 + + MADD xsum1, xsum1, alpha1, a1 + + ADD ysum1, ysum1, xsum1 + + ST ysum1, 0 * SIZE(TEMP) + .align 3 + +.L900: + li IS, SIZE + + beq INCY, IS, .L999 + NOP + + dsra I, M, 2 + blez I, .L905 + NOP + .align 3 + +.L902: + LD a1, 0 * SIZE(Y1) + LD a2, 1 * SIZE(Y1) + LD a3, 2 * SIZE(Y1) + LD a4, 3 * SIZE(Y1) + + ST a1, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a2, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a3, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a4, 0 * SIZE(Y) + daddu Y, Y, INCY + + daddiu I, I, -1 + + bgtz I, .L902 + daddiu Y1, Y1, 4 * SIZE + .align 3 + +.L905: + andi I, M, 3 + blez I, .L999 + NOP + .align 3 + +.L906: + LD a1, 0 * SIZE(Y1) + daddiu Y1, Y1, 1 * SIZE + + ST a1, 0 * SIZE(Y) + daddiu I, I, -1 + + bgtz I, .L906 + daddu Y, Y, INCY + .align 3 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + LDARG $18, 16($sp) + LDARG $19, 24($sp) + + j $31 + daddiu $sp, $sp, 32 + + EPILOGUE diff --git a/kernel/mips64/trsm_kernel_LN.S b/kernel/mips64/trsm_kernel_LN.S new file mode 100644 index 0000000..28e1794 --- /dev/null +++ b/kernel/mips64/trsm_kernel_LN.S @@ -0,0 +1,3544 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 + +#define AO $12 +#define BO $13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 +#define CO5 $18 +#define CO6 $19 +#define CO7 $20 +#define CO8 $21 + +#define OFFSET $22 +#define KK $23 +#define TEMP $24 +#define AORIG $25 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f27 +#define a4 $f28 + +#define b1 $f2 +#define b2 $f3 +#define b3 $f4 +#define b4 $f5 +#define b5 $f6 +#define b6 $f7 +#define b7 $f8 +#define b8 $f9 + +#define a5 b8 + +#define c11 $f10 +#define c12 $f11 +#define c21 $f12 +#define c22 $f13 +#define c31 $f14 +#define c32 $f16 +#define c41 $f17 +#define c42 $f18 +#define c51 $f19 +#define c52 $f20 +#define c61 $f21 +#define c62 $f22 +#define c71 $f23 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 + +#define ALPHA $f15 + + PROLOGUE + + daddiu $sp, $sp, -144 + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + SDARG $18, 16($sp) + SDARG $19, 24($sp) + SDARG $20, 32($sp) + SDARG $21, 40($sp) + sdc1 $f24, 48($sp) + sdc1 $f25, 56($sp) + sdc1 $f26, 64($sp) + sdc1 $f27, 72($sp) + sdc1 $f28, 80($sp) + + SDARG $22, 88($sp) + SDARG $23, 96($sp) + SDARG $24, 104($sp) + SDARG $25, 112($sp) + +#ifndef __64BIT__ + sdc1 $f20,112($sp) + sdc1 $f21,120($sp) + sdc1 $f22,128($sp) + sdc1 $f23,136($sp) +#endif + + LDARG OFFSET, 144($sp) + + dsll LDC, LDC, BASE_SHIFT + +#ifdef LN + mult M, K + mflo TEMP + + dsll TEMP, TEMP, BASE_SHIFT + daddu A, A, TEMP + + dsll TEMP, M, BASE_SHIFT + daddu C, C, TEMP +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mult N, K + mflo TEMP + + dsll TEMP, TEMP, BASE_SHIFT + daddu B, B, TEMP + + mult N, LDC + mflo TEMP + daddu C, C, TEMP + + dsubu KK, N, OFFSET +#endif + + dsra J, N, 3 + blez J, .L30 + nop + +.L10: +#ifdef RT + dsll TEMP, K, 3 + BASE_SHIFT + dsubu B, B, TEMP + + dsll TEMP, LDC, 3 + dsubu C, C, TEMP +#endif + + move CO1, C + MTC $0, c11 + daddu CO2, C, LDC + daddu CO3, CO2, LDC + daddiu J, J, -1 + daddu CO4, CO3, LDC + MOV c21, c11 + daddu CO5, CO4, LDC + MOV c31, c11 + daddu CO6, CO5, LDC + MOV c41, c11 + daddu CO7, CO6, LDC + MOV c51, c11 + daddu CO8, CO7, LDC + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO8, LDC +#endif + + andi I, M, 1 + MOV c61, c11 + blez I, .L20 + MOV c71, c11 + +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, KK, 2 + MOV c81, c11 + + blez L, .L25 + move BO, B +#else + +#ifdef LN + dsll TEMP, K, 0 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 3 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + dsra L, TEMP, 2 + MOV c81, c11 + + blez L, .L25 + NOP +#endif + .align 3 + +.L22: + MADD c11, c11, a1, b1 + LD b1, 16 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + LD b5, 20 * SIZE(BO) + MADD c61, c61, a1, b2 + LD b2, 9 * SIZE(BO) + MADD c71, c71, a1, b3 + LD b3, 10 * SIZE(BO) + MADD c81, c81, a1, b4 + LD b4, 11 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + daddiu L, L, -1 + + MADD c11, c11, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c61, c61, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c71, c71, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c81, c81, a2, b4 + LD b4, 19 * SIZE(BO) + + LD a2, 5 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD c11, c11, a3, b1 + LD b1, 32 * SIZE(BO) + MADD c21, c21, a3, b2 + LD b2, 21 * SIZE(BO) + MADD c31, c31, a3, b3 + LD b3, 22 * SIZE(BO) + MADD c41, c41, a3, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + LD b5, 36 * SIZE(BO) + MADD c61, c61, a3, b2 + LD b2, 25 * SIZE(BO) + MADD c71, c71, a3, b3 + LD b3, 26 * SIZE(BO) + MADD c81, c81, a3, b4 + LD b4, 27 * SIZE(BO) + + LD a3, 2 * SIZE(AO) + daddiu BO, BO, 32 * SIZE + + MADD c11, c11, a4, b6 + LD b6, 8 * SIZE(BO) + MADD c21, c21, a4, b2 + LD b2, -3 * SIZE(BO) + MADD c31, c31, a4, b3 + LD b3, -2 * SIZE(BO) + MADD c41, c41, a4, b4 + LD b4, -1 * SIZE(BO) + + MADD c51, c51, a4, b7 + LD b7, 12 * SIZE(BO) + MADD c61, c61, a4, b2 + LD b2, 1 * SIZE(BO) + MADD c71, c71, a4, b3 + LD b3, 2 * SIZE(BO) + MADD c81, c81, a4, b4 + LD b4, 3 * SIZE(BO) + bgtz L, .L22 + LD a4, 3 * SIZE(AO) + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L28 + NOP + .align 3 + +.L26: + MADD c11, c11, a1, b1 + LD b1, 8 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + daddiu L, L, -1 + MOV a2, a2 + daddiu AO, AO, 1 * SIZE + daddiu BO, BO, 8 * SIZE + + MADD c51, c51, a1, b5 + LD b5, 4 * SIZE(BO) + MADD c61, c61, a1, b2 + LD b2, 1 * SIZE(BO) + MADD c71, c71, a1, b3 + LD b3, 2 * SIZE(BO) + MADD c81, c81, a1, b4 + LD a1, 0 * SIZE(AO) + + bgtz L, .L26 + LD b4, 3 * SIZE(BO) + +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -8 +#endif + + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 3 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + LD b5, 4 * SIZE(AO) + LD b6, 5 * SIZE(AO) + LD b7, 6 * SIZE(AO) + LD b8, 7 * SIZE(AO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MUL c11, b1, c11 + + NMSUB c21, c21, b2, c11 + NMSUB c31, c31, b3, c11 + NMSUB c41, c41, b4, c11 + NMSUB c51, c51, b5, c11 + NMSUB c61, c61, b6, c11 + NMSUB c71, c71, b7, c11 + NMSUB c81, c81, b8, c11 + + LD b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MUL c21, b2, c21 + + NMSUB c31, c31, b3, c21 + NMSUB c41, c41, b4, c21 + NMSUB c51, c51, b5, c21 + NMSUB c61, c61, b6, c21 + NMSUB c71, c71, b7, c21 + NMSUB c81, c81, b8, c21 + + LD b3, 18 * SIZE(BO) + LD b4, 19 * SIZE(BO) + LD b5, 20 * SIZE(BO) + LD b6, 21 * SIZE(BO) + LD b7, 22 * SIZE(BO) + LD b8, 23 * SIZE(BO) + + MUL c31, b3, c31 + + NMSUB c41, c41, b4, c31 + NMSUB c51, c51, b5, c31 + NMSUB c61, c61, b6, c31 + NMSUB c71, c71, b7, c31 + NMSUB c81, c81, b8, c31 + + LD b4, 27 * SIZE(BO) + LD b5, 28 * SIZE(BO) + LD b6, 29 * SIZE(BO) + LD b7, 30 * SIZE(BO) + LD b8, 31 * SIZE(BO) + + MUL c41, b4, c41 + + NMSUB c51, c51, b5, c41 + NMSUB c61, c61, b6, c41 + NMSUB c71, c71, b7, c41 + NMSUB c81, c81, b8, c41 + + LD b5, 36 * SIZE(BO) + LD b6, 37 * SIZE(BO) + LD b7, 38 * SIZE(BO) + LD b8, 39 * SIZE(BO) + + MUL c51, b5, c51 + + NMSUB c61, c61, b6, c51 + NMSUB c71, c71, b7, c51 + NMSUB c81, c81, b8, c51 + + LD b6, 45 * SIZE(BO) + LD b7, 46 * SIZE(BO) + LD b8, 47 * SIZE(BO) + + MUL c61, b6, c61 + + NMSUB c71, c71, b7, c61 + NMSUB c81, c81, b8, c61 + + LD b7, 54 * SIZE(BO) + LD b8, 55 * SIZE(BO) + + MUL c71, b7, c71 + + NMSUB c81, c81, b8, c71 + + LD b8, 63 * SIZE(BO) + + MUL c81, b8, c81 +#endif + +#ifdef RT + LD b1, 63 * SIZE(BO) + LD b2, 62 * SIZE(BO) + LD b3, 61 * SIZE(BO) + LD b4, 60 * SIZE(BO) + LD b5, 59 * SIZE(BO) + LD b6, 58 * SIZE(BO) + LD b7, 57 * SIZE(BO) + LD b8, 56 * SIZE(BO) + + MUL c81, b1, c81 + + NMSUB c71, c71, b2, c81 + NMSUB c61, c61, b3, c81 + NMSUB c51, c51, b4, c81 + NMSUB c41, c41, b5, c81 + NMSUB c31, c31, b6, c81 + NMSUB c21, c21, b7, c81 + NMSUB c11, c11, b8, c81 + + LD b2, 54 * SIZE(BO) + LD b3, 53 * SIZE(BO) + LD b4, 52 * SIZE(BO) + LD b5, 51 * SIZE(BO) + LD b6, 50 * SIZE(BO) + LD b7, 49 * SIZE(BO) + LD b8, 48 * SIZE(BO) + + MUL c71, b2, c71 + + NMSUB c61, c61, b3, c71 + NMSUB c51, c51, b4, c71 + NMSUB c41, c41, b5, c71 + NMSUB c31, c31, b6, c71 + NMSUB c21, c21, b7, c71 + NMSUB c11, c11, b8, c71 + + LD b3, 45 * SIZE(BO) + LD b4, 44 * SIZE(BO) + LD b5, 43 * SIZE(BO) + LD b6, 42 * SIZE(BO) + LD b7, 41 * SIZE(BO) + LD b8, 40 * SIZE(BO) + + MUL c61, b3, c61 + + NMSUB c51, c51, b4, c61 + NMSUB c41, c41, b5, c61 + NMSUB c31, c31, b6, c61 + NMSUB c21, c21, b7, c61 + NMSUB c11, c11, b8, c61 + + LD b4, 36 * SIZE(BO) + LD b5, 35 * SIZE(BO) + LD b6, 34 * SIZE(BO) + LD b7, 33 * SIZE(BO) + LD b8, 32 * SIZE(BO) + + MUL c51, b4, c51 + + NMSUB c41, c41, b5, c51 + NMSUB c31, c31, b6, c51 + NMSUB c21, c21, b7, c51 + NMSUB c11, c11, b8, c51 + + LD b5, 27 * SIZE(BO) + LD b6, 26 * SIZE(BO) + LD b7, 25 * SIZE(BO) + LD b8, 24 * SIZE(BO) + + MUL c41, b5, c41 + + NMSUB c31, c31, b6, c41 + NMSUB c21, c21, b7, c41 + NMSUB c11, c11, b8, c41 + + LD b6, 18 * SIZE(BO) + LD b7, 17 * SIZE(BO) + LD b8, 16 * SIZE(BO) + + MUL c31, b6, c31 + + NMSUB c21, c21, b7, c31 + NMSUB c11, c11, b8, c31 + + LD b7, 9 * SIZE(BO) + LD b8, 8 * SIZE(BO) + + MUL c21, b7, c21 + + NMSUB c11, c11, b8, c21 + + LD b8, 0 * SIZE(BO) + + MUL c11, b8, c11 +#endif + +#ifdef LN + daddiu CO1, CO1, -1 * SIZE + daddiu CO2, CO2, -1 * SIZE + daddiu CO3, CO3, -1 * SIZE + daddiu CO4, CO4, -1 * SIZE + daddiu CO5, CO5, -1 * SIZE + daddiu CO6, CO6, -1 * SIZE + daddiu CO7, CO7, -1 * SIZE + daddiu CO8, CO8, -1 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c41, 3 * SIZE(BO) + ST c51, 4 * SIZE(BO) + ST c61, 5 * SIZE(BO) + ST c71, 6 * SIZE(BO) + ST c81, 7 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c21, 1 * SIZE(AO) + ST c31, 2 * SIZE(AO) + ST c41, 3 * SIZE(AO) + ST c51, 4 * SIZE(AO) + ST c61, 5 * SIZE(AO) + ST c71, 6 * SIZE(AO) + ST c81, 7 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c31, 0 * SIZE(CO3) + ST c41, 0 * SIZE(CO4) + ST c51, 0 * SIZE(CO5) + ST c61, 0 * SIZE(CO6) + ST c71, 0 * SIZE(CO7) + ST c81, 0 * SIZE(CO8) + + MTC $0, c11 + +#ifndef LN + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE + daddiu CO3, CO3, 1 * SIZE + daddiu CO4, CO4, 1 * SIZE + daddiu CO5, CO5, 1 * SIZE + daddiu CO6, CO6, 1 * SIZE + daddiu CO7, CO7, 1 * SIZE + daddiu CO8, CO8, 1 * SIZE +#endif + + MOV c21, c11 + +#ifdef RT + dsll TEMP, K, BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + + MOV c31, c11 + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 3 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + + MOV c41, c11 + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + .align 3 + +.L20: + dsra I, M, 1 + MOV c51, c11 + blez I, .L29 + MOV c61, c11 + +.L11: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(B) + MOV c81, c11 + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + + dsra L, KK, 2 + MOV c32, c11 + LD b3, 2 * SIZE(B) + MOV c42, c11 + + LD b4, 3 * SIZE(B) + MOV c52, c11 + LD b5, 4 * SIZE(B) + MOV c62, c11 + + LD b6, 8 * SIZE(B) + MOV c72, c11 + LD b7, 12 * SIZE(B) + MOV c82, c11 + + blez L, .L15 + move BO, B +#else + +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 3 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(BO) + MOV c81, c11 + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + + MOV c32, c11 + LD b3, 2 * SIZE(BO) + MOV c42, c11 + + LD b4, 3 * SIZE(BO) + MOV c52, c11 + LD b5, 4 * SIZE(BO) + MOV c62, c11 + + LD b6, 8 * SIZE(BO) + MOV c72, c11 + LD b7, 12 * SIZE(BO) + MOV c82, c11 + + dsra L, TEMP, 2 + blez L, .L15 + NOP +#endif + + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + blez L, .L13 + MADD c41, c41, a1, b4 + NOP + .align 3 + +.L12: + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + NOP + MADD c61, c61, a1, b2 + LD a4, 2 * SIZE(AO) + MADD c71, c71, a1, b3 + NOP + MADD c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a4, b7 + NOP + MADD c61, c61, a4, b2 + NOP + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + NOP + MADD c41, c41, a3, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + NOP + MADD c61, c61, a3, b2 + LD a4, 6 * SIZE(AO) + MADD c71, c71, a3, b3 + NOP + MADD c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + daddiu L, L, -1 + + MADD c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + bgtz L, .L12 + MADD c41, c41, a1, b4 + NOP + .align 3 + +.L13: + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + NOP + MADD c61, c61, a1, b2 + LD a4, 2 * SIZE(AO) + MADD c71, c71, a1, b3 + NOP + MADD c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a4, b7 + NOP + MADD c61, c61, a4, b2 + NOP + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + NOP + MADD c41, c41, a3, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + NOP + MADD c61, c61, a3, b2 + LD a4, 6 * SIZE(AO) + MADD c71, c71, a3, b3 + NOP + MADD c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + blez L, .L18 + NOP + .align 3 + +.L16: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 8 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + daddiu L, L, -1 + MADD c61, c61, a1, b2 + daddiu AO, AO, 2 * SIZE + MADD c71, c71, a1, b3 + daddiu BO, BO, 8 * SIZE + MADD c81, c81, a1, b4 + LD a1, 0 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 4 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + bgtz L, .L16 + LD b4, 3 * SIZE(BO) + +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -2 +#else + daddiu TEMP, KK, -8 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 3 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB c11, b1, c11 + LD b5, 4 * SIZE(BO) + SUB c21, b2, c21 + LD b6, 5 * SIZE(BO) + SUB c31, b3, c31 + LD b7, 6 * SIZE(BO) + SUB c41, b4, c41 + LD b8, 7 * SIZE(BO) + + SUB c51, b5, c51 + LD b1, 8 * SIZE(BO) + SUB c61, b6, c61 + LD b2, 9 * SIZE(BO) + SUB c71, b7, c71 + LD b3, 10 * SIZE(BO) + SUB c81, b8, c81 + LD b4, 11 * SIZE(BO) + + SUB c12, b1, c12 + LD b5, 12 * SIZE(BO) + SUB c22, b2, c22 + LD b6, 13 * SIZE(BO) + SUB c32, b3, c32 + LD b7, 14 * SIZE(BO) + SUB c42, b4, c42 + LD b8, 15 * SIZE(BO) + + SUB c52, b5, c52 +#ifdef LN + LD b1, 3 * SIZE(AO) +#else + LD b1, 0 * SIZE(AO) +#endif + SUB c62, b6, c62 + SUB c72, b7, c72 + SUB c82, b8, c82 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + + SUB c11, b1, c11 + LD b5, 4 * SIZE(AO) + SUB c12, b2, c12 + LD b6, 5 * SIZE(AO) + SUB c21, b3, c21 + LD b7, 6 * SIZE(AO) + SUB c22, b4, c22 + LD b8, 7 * SIZE(AO) + + SUB c31, b5, c31 + LD b1, 8 * SIZE(AO) + SUB c32, b6, c32 + LD b2, 9 * SIZE(AO) + SUB c41, b7, c41 + LD b3, 10 * SIZE(AO) + SUB c42, b8, c42 + LD b4, 11 * SIZE(AO) + + LD b5, 12 * SIZE(AO) + SUB c51, b1, c51 + LD b6, 13 * SIZE(AO) + SUB c52, b2, c52 + LD b7, 14 * SIZE(AO) + SUB c61, b3, c61 + LD b8, 15 * SIZE(AO) + SUB c62, b4, c62 + + SUB c71, b5, c71 + SUB c72, b6, c72 + SUB c81, b7, c81 + SUB c82, b8, c82 +#endif + +#ifdef LN + MUL c12, b1, c12 + LD b2, 2 * SIZE(AO) + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + MUL c52, b1, c52 + MUL c62, b1, c62 + MUL c72, b1, c72 + MUL c82, b1, c82 + + NMSUB c11, c11, b2, c12 + LD b3, 0 * SIZE(AO) + NMSUB c21, c21, b2, c22 + NMSUB c31, c31, b2, c32 + NMSUB c41, c41, b2, c42 + NMSUB c51, c51, b2, c52 + NMSUB c61, c61, b2, c62 + NMSUB c71, c71, b2, c72 + NMSUB c81, c81, b2, c82 + + MUL c11, b3, c11 + daddiu CO1, CO1, -2 * SIZE + MUL c21, b3, c21 + daddiu CO2, CO2, -2 * SIZE + MUL c31, b3, c31 + daddiu CO3, CO3, -2 * SIZE + MUL c41, b3, c41 + daddiu CO4, CO4, -2 * SIZE + MUL c51, b3, c51 + daddiu CO5, CO5, -2 * SIZE + MUL c61, b3, c61 + daddiu CO6, CO6, -2 * SIZE + MUL c71, b3, c71 + daddiu CO7, CO7, -2 * SIZE + MUL c81, b3, c81 + daddiu CO8, CO8, -2 * SIZE +#endif + +#ifdef LT + MUL c11, b1, c11 + LD b2, 1 * SIZE(AO) + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 + + NMSUB c12, c12, b2, c11 + LD b3, 3 * SIZE(AO) + NMSUB c22, c22, b2, c21 + NMSUB c32, c32, b2, c31 + NMSUB c42, c42, b2, c41 + NMSUB c52, c52, b2, c51 + NMSUB c62, c62, b2, c61 + NMSUB c72, c72, b2, c71 + NMSUB c82, c82, b2, c81 + + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 + MUL c52, b3, c52 + MUL c62, b3, c62 + MUL c72, b3, c72 + MUL c82, b3, c82 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MUL c11, b1, c11 + MUL c12, b1, c12 + LD b5, 4 * SIZE(BO) + + NMSUB c21, c21, b2, c11 + NMSUB c22, c22, b2, c12 + LD b6, 5 * SIZE(BO) + NMSUB c31, c31, b3, c11 + NMSUB c32, c32, b3, c12 + LD b7, 6 * SIZE(BO) + NMSUB c41, c41, b4, c11 + NMSUB c42, c42, b4, c12 + LD b8, 7 * SIZE(BO) + + NMSUB c51, c51, b5, c11 + NMSUB c52, c52, b5, c12 + LD b2, 9 * SIZE(BO) + NMSUB c61, c61, b6, c11 + NMSUB c62, c62, b6, c12 + LD b3, 10 * SIZE(BO) + NMSUB c71, c71, b7, c11 + NMSUB c72, c72, b7, c12 + LD b4, 11 * SIZE(BO) + NMSUB c81, c81, b8, c11 + NMSUB c82, c82, b8, c12 + LD b5, 12 * SIZE(BO) + + MUL c21, b2, c21 + MUL c22, b2, c22 + LD b6, 13 * SIZE(BO) + + NMSUB c31, c31, b3, c21 + NMSUB c32, c32, b3, c22 + LD b7, 14 * SIZE(BO) + NMSUB c41, c41, b4, c21 + NMSUB c42, c42, b4, c22 + LD b8, 15 * SIZE(BO) + NMSUB c51, c51, b5, c21 + NMSUB c52, c52, b5, c22 + LD b3, 18 * SIZE(BO) + NMSUB c61, c61, b6, c21 + NMSUB c62, c62, b6, c22 + LD b4, 19 * SIZE(BO) + NMSUB c71, c71, b7, c21 + NMSUB c72, c72, b7, c22 + LD b5, 20 * SIZE(BO) + NMSUB c81, c81, b8, c21 + NMSUB c82, c82, b8, c22 + LD b6, 21 * SIZE(BO) + + MUL c31, b3, c31 + MUL c32, b3, c32 + LD b7, 22 * SIZE(BO) + + NMSUB c41, c41, b4, c31 + NMSUB c42, c42, b4, c32 + LD b8, 23 * SIZE(BO) + NMSUB c51, c51, b5, c31 + NMSUB c52, c52, b5, c32 + LD b4, 27 * SIZE(BO) + NMSUB c61, c61, b6, c31 + NMSUB c62, c62, b6, c32 + LD b5, 28 * SIZE(BO) + NMSUB c71, c71, b7, c31 + NMSUB c72, c72, b7, c32 + LD b6, 29 * SIZE(BO) + NMSUB c81, c81, b8, c31 + NMSUB c82, c82, b8, c32 + LD b7, 30 * SIZE(BO) + + MUL c41, b4, c41 + MUL c42, b4, c42 + LD b8, 31 * SIZE(BO) + + NMSUB c51, c51, b5, c41 + NMSUB c52, c52, b5, c42 + LD b5, 36 * SIZE(BO) + NMSUB c61, c61, b6, c41 + NMSUB c62, c62, b6, c42 + LD b6, 37 * SIZE(BO) + NMSUB c71, c71, b7, c41 + NMSUB c72, c72, b7, c42 + LD b7, 38 * SIZE(BO) + NMSUB c81, c81, b8, c41 + NMSUB c82, c82, b8, c42 + LD b8, 39 * SIZE(BO) + + MUL c51, b5, c51 + MUL c52, b5, c52 + + NMSUB c61, c61, b6, c51 + NMSUB c62, c62, b6, c52 + LD b6, 45 * SIZE(BO) + NMSUB c71, c71, b7, c51 + NMSUB c72, c72, b7, c52 + LD b7, 46 * SIZE(BO) + NMSUB c81, c81, b8, c51 + NMSUB c82, c82, b8, c52 + LD b8, 47 * SIZE(BO) + + MUL c61, b6, c61 + MUL c62, b6, c62 + + NMSUB c71, c71, b7, c61 + NMSUB c72, c72, b7, c62 + LD b7, 54 * SIZE(BO) + NMSUB c81, c81, b8, c61 + NMSUB c82, c82, b8, c62 + LD b8, 55 * SIZE(BO) + + MUL c71, b7, c71 + MUL c72, b7, c72 + + NMSUB c81, c81, b8, c71 + NMSUB c82, c82, b8, c72 + LD b8, 63 * SIZE(BO) + + MUL c81, b8, c81 + MUL c82, b8, c82 +#endif + +#ifdef RT + LD b1, 63 * SIZE(BO) + LD b2, 62 * SIZE(BO) + LD b3, 61 * SIZE(BO) + LD b4, 60 * SIZE(BO) + + MUL c81, b1, c81 + MUL c82, b1, c82 + LD b5, 59 * SIZE(BO) + + NMSUB c71, c71, b2, c81 + NMSUB c72, c72, b2, c82 + LD b6, 58 * SIZE(BO) + NMSUB c61, c61, b3, c81 + NMSUB c62, c62, b3, c82 + LD b7, 57 * SIZE(BO) + NMSUB c51, c51, b4, c81 + NMSUB c52, c52, b4, c82 + LD b8, 56 * SIZE(BO) + + NMSUB c41, c41, b5, c81 + NMSUB c42, c42, b5, c82 + LD b2, 54 * SIZE(BO) + NMSUB c31, c31, b6, c81 + NMSUB c32, c32, b6, c82 + LD b3, 53 * SIZE(BO) + NMSUB c21, c21, b7, c81 + NMSUB c22, c22, b7, c82 + LD b4, 52 * SIZE(BO) + NMSUB c11, c11, b8, c81 + NMSUB c12, c12, b8, c82 + LD b5, 51 * SIZE(BO) + + MUL c71, b2, c71 + MUL c72, b2, c72 + LD b6, 50 * SIZE(BO) + + NMSUB c61, c61, b3, c71 + NMSUB c62, c62, b3, c72 + LD b7, 49 * SIZE(BO) + NMSUB c51, c51, b4, c71 + NMSUB c52, c52, b4, c72 + LD b8, 48 * SIZE(BO) + NMSUB c41, c41, b5, c71 + NMSUB c42, c42, b5, c72 + LD b3, 45 * SIZE(BO) + NMSUB c31, c31, b6, c71 + NMSUB c32, c32, b6, c72 + LD b4, 44 * SIZE(BO) + NMSUB c21, c21, b7, c71 + NMSUB c22, c22, b7, c72 + LD b5, 43 * SIZE(BO) + NMSUB c11, c11, b8, c71 + NMSUB c12, c12, b8, c72 + LD b6, 42 * SIZE(BO) + + MUL c61, b3, c61 + MUL c62, b3, c62 + LD b7, 41 * SIZE(BO) + + NMSUB c51, c51, b4, c61 + NMSUB c52, c52, b4, c62 + LD b8, 40 * SIZE(BO) + NMSUB c41, c41, b5, c61 + NMSUB c42, c42, b5, c62 + LD b4, 36 * SIZE(BO) + NMSUB c31, c31, b6, c61 + NMSUB c32, c32, b6, c62 + LD b5, 35 * SIZE(BO) + NMSUB c21, c21, b7, c61 + NMSUB c22, c22, b7, c62 + LD b6, 34 * SIZE(BO) + NMSUB c11, c11, b8, c61 + NMSUB c12, c12, b8, c62 + LD b7, 33 * SIZE(BO) + + MUL c51, b4, c51 + MUL c52, b4, c52 + LD b8, 32 * SIZE(BO) + + NMSUB c41, c41, b5, c51 + NMSUB c42, c42, b5, c52 + LD b5, 27 * SIZE(BO) + NMSUB c31, c31, b6, c51 + NMSUB c32, c32, b6, c52 + LD b6, 26 * SIZE(BO) + NMSUB c21, c21, b7, c51 + NMSUB c22, c22, b7, c52 + LD b7, 25 * SIZE(BO) + NMSUB c11, c11, b8, c51 + NMSUB c12, c12, b8, c52 + LD b8, 24 * SIZE(BO) + + MUL c41, b5, c41 + MUL c42, b5, c42 + + NMSUB c31, c31, b6, c41 + NMSUB c32, c32, b6, c42 + LD b6, 18 * SIZE(BO) + NMSUB c21, c21, b7, c41 + NMSUB c22, c22, b7, c42 + LD b7, 17 * SIZE(BO) + NMSUB c11, c11, b8, c41 + NMSUB c12, c12, b8, c42 + LD b8, 16 * SIZE(BO) + + MUL c31, b6, c31 + MUL c32, b6, c32 + + NMSUB c21, c21, b7, c31 + NMSUB c22, c22, b7, c32 + LD b7, 9 * SIZE(BO) + NMSUB c11, c11, b8, c31 + NMSUB c12, c12, b8, c32 + LD b8, 8 * SIZE(BO) + + MUL c21, b7, c21 + MUL c22, b7, c22 + + NMSUB c11, c11, b8, c21 + NMSUB c12, c12, b8, c22 + LD b8, 0 * SIZE(BO) + + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c41, 3 * SIZE(BO) + ST c51, 4 * SIZE(BO) + ST c61, 5 * SIZE(BO) + ST c71, 6 * SIZE(BO) + ST c81, 7 * SIZE(BO) + + ST c12, 8 * SIZE(BO) + ST c22, 9 * SIZE(BO) + ST c32, 10 * SIZE(BO) + ST c42, 11 * SIZE(BO) + ST c52, 12 * SIZE(BO) + ST c62, 13 * SIZE(BO) + ST c72, 14 * SIZE(BO) + ST c82, 15 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) + ST c21, 2 * SIZE(AO) + ST c22, 3 * SIZE(AO) + ST c31, 4 * SIZE(AO) + ST c32, 5 * SIZE(AO) + ST c41, 6 * SIZE(AO) + ST c42, 7 * SIZE(AO) + + ST c51, 8 * SIZE(AO) + ST c52, 9 * SIZE(AO) + ST c61, 10 * SIZE(AO) + ST c62, 11 * SIZE(AO) + ST c71, 12 * SIZE(AO) + ST c72, 13 * SIZE(AO) + ST c81, 14 * SIZE(AO) + ST c82, 15 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c22, 1 * SIZE(CO2) + ST c31, 0 * SIZE(CO3) + ST c32, 1 * SIZE(CO3) + ST c41, 0 * SIZE(CO4) + ST c42, 1 * SIZE(CO4) + ST c51, 0 * SIZE(CO5) + ST c52, 1 * SIZE(CO5) + ST c61, 0 * SIZE(CO6) + ST c62, 1 * SIZE(CO6) + ST c71, 0 * SIZE(CO7) + ST c72, 1 * SIZE(CO7) + ST c81, 0 * SIZE(CO8) + ST c82, 1 * SIZE(CO8) + + MTC $0, a1 + +#ifndef LN + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + daddiu CO3, CO3, 2 * SIZE + daddiu CO4, CO4, 2 * SIZE + daddiu CO5, CO5, 2 * SIZE + daddiu CO6, CO6, 2 * SIZE + daddiu CO7, CO7, 2 * SIZE + daddiu CO8, CO8, 2 * SIZE +#endif + + MOV c11, a1 + MOV c21, a1 + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + + MOV c31, a1 + MOV c41, a1 + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 3 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 2 +#endif + +#ifdef LN + daddiu KK, KK, -2 +#endif + + daddiu I, I, -1 + MOV c51, a1 + + bgtz I, .L11 + MOV c61, a1 + .align 3 + +.L29: +#ifdef LN + dsll TEMP, K, 3 + BASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 8 +#endif + +#ifdef RT + daddiu KK, KK, -8 +#endif + + bgtz J, .L10 + NOP + .align 3 + +.L30: + andi J, N, 4 + blez J, .L50 + move AO, A + +#ifdef RT + dsll TEMP, K, 2 + BASE_SHIFT + dsubu B, B, TEMP + + dsll TEMP, LDC, 2 + dsubu C, C, TEMP +#endif + + move CO1, C + MTC $0, c11 + daddu CO2, C, LDC + daddu CO3, CO2, LDC + MOV c21, c11 + daddu CO4, CO3, LDC + MOV c31, c11 + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO4, LDC +#endif + + andi I, M, 1 + blez I, .L40 + MOV c41, c11 + +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD a2, 1 * SIZE(AO) + MOV c81, c11 + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, KK, 2 + + blez L, .L45 + move BO, B +#else +#ifdef LN + dsll TEMP, K, BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD a2, 1 * SIZE(AO) + MOV c81, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + dsra L, TEMP, 2 + + blez L, .L45 + NOP +#endif + .align 3 + +.L42: + MADD c11, c11, a1, b1 + LD b1, 16 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + daddiu L, L, -1 + + MADD c11, c11, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 11 * SIZE(BO) + + LD a2, 2 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD c11, c11, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 15 * SIZE(BO) + + LD a2, -1 * SIZE(AO) + daddiu BO, BO, 16 * SIZE + + MADD c11, c11, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 3 * SIZE(BO) + + bgtz L, .L42 + LD a2, 1 * SIZE(AO) + .align 3 + +.L45: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L48 + NOP + .align 3 + +.L46: + MADD c11, c11, a1, b1 + LD b1, 4 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD a1, 1 * SIZE(AO) + + LD b4, 7 * SIZE(BO) + daddiu L, L, -1 + + daddiu AO, AO, 1 * SIZE + MOV a2, a2 + bgtz L, .L46 + daddiu BO, BO, 4 * SIZE + + +.L48: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -4 +#endif + + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MUL c11, b1, c11 + + NMSUB c21, c21, b2, c11 + NMSUB c31, c31, b3, c11 + NMSUB c41, c41, b4, c11 + + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + MUL c21, b2, c21 + + NMSUB c31, c31, b3, c21 + NMSUB c41, c41, b4, c21 + + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MUL c31, b3, c31 + + NMSUB c41, c41, b4, c31 + + LD b4, 15 * SIZE(BO) + + MUL c41, b4, c41 +#endif + +#ifdef RT + LD b5, 15 * SIZE(BO) + LD b6, 14 * SIZE(BO) + LD b7, 13 * SIZE(BO) + LD b8, 12 * SIZE(BO) + + MUL c41, b5, c41 + + NMSUB c31, c31, b6, c41 + NMSUB c21, c21, b7, c41 + NMSUB c11, c11, b8, c41 + + LD b6, 10 * SIZE(BO) + LD b7, 9 * SIZE(BO) + LD b8, 8 * SIZE(BO) + + MUL c31, b6, c31 + + NMSUB c21, c21, b7, c31 + NMSUB c11, c11, b8, c31 + + LD b7, 5 * SIZE(BO) + LD b8, 4 * SIZE(BO) + + MUL c21, b7, c21 + + NMSUB c11, c11, b8, c21 + + LD b8, 0 * SIZE(BO) + + MUL c11, b8, c11 +#endif + +#ifdef LN + daddiu CO1, CO1, -1 * SIZE + daddiu CO2, CO2, -1 * SIZE + daddiu CO3, CO3, -1 * SIZE + daddiu CO4, CO4, -1 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c41, 3 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c21, 1 * SIZE(AO) + ST c31, 2 * SIZE(AO) + ST c41, 3 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c31, 0 * SIZE(CO3) + ST c41, 0 * SIZE(CO4) + + MTC $0, c11 + +#ifndef LN + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE + daddiu CO3, CO3, 1 * SIZE + daddiu CO4, CO4, 1 * SIZE +#endif + + MOV c21, c11 + +#ifdef RT + dsll TEMP, K, BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + + MOV c31, c11 + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + .align 3 + +.L40: + dsra I, M, 1 + MOV c61, c11 + blez I, .L49 + MOV c41, c11 + +.L31: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + LD a3, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + MOV c32, c11 + LD b4, 3 * SIZE(B) + MOV c42, c11 + + LD b5, 4 * SIZE(B) + dsra L, KK, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L35 + move BO, B +#else +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + LD a3, 4 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + LD b3, 2 * SIZE(BO) + MOV c32, c11 + LD b4, 3 * SIZE(BO) + MOV c42, c11 + + LD b5, 4 * SIZE(BO) + dsra L, TEMP, 2 + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + blez L, .L35 + NOP +#endif + .align 3 + +.L32: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + LD a1, 2 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c11, c11, a1, b5 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c12, c12, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a3, b6 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + NOP + MADD c41, c41, a3, b4 + LD a3, 6 * SIZE(AO) + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c11, c11, a3, b7 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a3, b2 + daddiu AO, AO, 8 * SIZE + MADD c31, c31, a3, b3 + daddiu BO, BO, 16 * SIZE + MADD c41, c41, a3, b4 + LD a3, 4 * SIZE(AO) + + MADD c12, c12, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c42, c42, a2, b4 + NOP + + bgtz L, .L32 + LD b4, 3 * SIZE(BO) + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L38 + NOP + .align 3 + +.L36: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + daddiu AO, AO, 2 * SIZE + MADD c41, c41, a1, b4 + LD a1, 0 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 4 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + bgtz L, .L36 + daddiu BO, BO, 4 * SIZE + +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -2 +#else + daddiu TEMP, KK, -4 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c12, b5, c12 + SUB c22, b6, c22 + SUB c32, b7, c32 + SUB c42, b8, c42 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + LD b5, 4 * SIZE(AO) + LD b6, 5 * SIZE(AO) + LD b7, 6 * SIZE(AO) + LD b8, 7 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 + SUB c31, b5, c31 + SUB c32, b6, c32 + SUB c41, b7, c41 + SUB c42, b8, c42 +#endif + +#ifdef LN + LD b1, 3 * SIZE(AO) + LD b2, 2 * SIZE(AO) + LD b3, 0 * SIZE(AO) + + MUL c12, b1, c12 + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + + NMSUB c11, c11, b2, c12 + NMSUB c21, c21, b2, c22 + NMSUB c31, c31, b2, c32 + NMSUB c41, c41, b2, c42 + + MUL c11, b3, c11 + MUL c21, b3, c21 + MUL c31, b3, c31 + MUL c41, b3, c41 +#endif + +#ifdef LT + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 3 * SIZE(AO) + + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + + NMSUB c12, c12, b2, c11 + NMSUB c22, c22, b2, c21 + NMSUB c32, c32, b2, c31 + NMSUB c42, c42, b2, c41 + + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MUL c11, b1, c11 + MUL c12, b1, c12 + + NMSUB c21, c21, b2, c11 + NMSUB c22, c22, b2, c12 + NMSUB c31, c31, b3, c11 + NMSUB c32, c32, b3, c12 + NMSUB c41, c41, b4, c11 + NMSUB c42, c42, b4, c12 + + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + MUL c21, b2, c21 + MUL c22, b2, c22 + + NMSUB c31, c31, b3, c21 + NMSUB c32, c32, b3, c22 + NMSUB c41, c41, b4, c21 + NMSUB c42, c42, b4, c22 + + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MUL c31, b3, c31 + MUL c32, b3, c32 + + NMSUB c41, c41, b4, c31 + NMSUB c42, c42, b4, c32 + + LD b4, 15 * SIZE(BO) + + MUL c41, b4, c41 + MUL c42, b4, c42 +#endif + +#ifdef RT + LD b5, 15 * SIZE(BO) + LD b6, 14 * SIZE(BO) + LD b7, 13 * SIZE(BO) + LD b8, 12 * SIZE(BO) + + MUL c41, b5, c41 + MUL c42, b5, c42 + + NMSUB c31, c31, b6, c41 + NMSUB c32, c32, b6, c42 + NMSUB c21, c21, b7, c41 + NMSUB c22, c22, b7, c42 + NMSUB c11, c11, b8, c41 + NMSUB c12, c12, b8, c42 + + LD b6, 10 * SIZE(BO) + LD b7, 9 * SIZE(BO) + LD b8, 8 * SIZE(BO) + + MUL c31, b6, c31 + MUL c32, b6, c32 + + NMSUB c21, c21, b7, c31 + NMSUB c22, c22, b7, c32 + NMSUB c11, c11, b8, c31 + NMSUB c12, c12, b8, c32 + + LD b7, 5 * SIZE(BO) + LD b8, 4 * SIZE(BO) + + MUL c21, b7, c21 + MUL c22, b7, c22 + + NMSUB c11, c11, b8, c21 + NMSUB c12, c12, b8, c22 + + LD b8, 0 * SIZE(BO) + + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif + +#ifdef LN + daddiu CO1, CO1, -2 * SIZE + daddiu CO2, CO2, -2 * SIZE + daddiu CO3, CO3, -2 * SIZE + daddiu CO4, CO4, -2 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c41, 3 * SIZE(BO) + ST c12, 4 * SIZE(BO) + ST c22, 5 * SIZE(BO) + ST c32, 6 * SIZE(BO) + ST c42, 7 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) + ST c21, 2 * SIZE(AO) + ST c22, 3 * SIZE(AO) + ST c31, 4 * SIZE(AO) + ST c32, 5 * SIZE(AO) + ST c41, 6 * SIZE(AO) + ST c42, 7 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c22, 1 * SIZE(CO2) + ST c31, 0 * SIZE(CO3) + ST c32, 1 * SIZE(CO3) + ST c41, 0 * SIZE(CO4) + ST c42, 1 * SIZE(CO4) + +#ifndef LN + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + daddiu CO3, CO3, 2 * SIZE + daddiu CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 2 +#endif + +#ifdef LN + daddiu KK, KK, -2 +#endif + + MTC $0, a1 + + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + + daddiu I, I, -1 + + bgtz I, .L31 + MOV c41, c11 + .align 3 + +.L49: +#ifdef LN + dsll TEMP, K, 2 + BASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 4 +#endif + +#ifdef RT + daddiu KK, KK, -4 +#endif + .align 3 + +.L50: + andi J, N, 2 + blez J, .L70 + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + dsubu B, B, TEMP + + dsll TEMP, LDC, 1 + dsubu C, C, TEMP +#endif + + move AO, A + move CO1, C + daddu CO2, C, LDC + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO2, LDC +#endif + + andi I, M, 1 + blez I, .L60 + NOP + +#if defined(LT) || defined(RN) + dsra L, KK, 2 + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c31, c11 + LD a4, 3 * SIZE(AO) + MOV c41, c11 + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L65 + move BO, B +#else +#ifdef LN + dsll TEMP, K, BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + dsra L, TEMP, 2 + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c31, c11 + LD a4, 3 * SIZE(AO) + MOV c41, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + blez L, .L65 + NOP +#endif + .align 3 + +.L62: + MADD c11, c11, a1, b1 + LD b1, 4 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 7 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + LD a2, 5 * SIZE(AO) + + MADD c11, c11, a3, b1 + LD b1, 8 * SIZE(BO) + MADD c21, c21, a3, b2 + LD b2, 9 * SIZE(BO) + MADD c31, c31, a4, b3 + LD b3, 10 * SIZE(BO) + MADD c41, c41, a4, b4 + LD b4, 11 * SIZE(BO) + + LD a3, 6 * SIZE(AO) + LD a4, 7 * SIZE(AO) + + daddiu L, L, -1 + daddiu AO, AO, 4 * SIZE + + bgtz L, .L62 + daddiu BO, BO, 8 * SIZE + .align 3 + +.L65: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L68 + NOP + .align 3 + +.L66: + MADD c11, c11, a1, b1 + LD b1, 2 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 3 * SIZE(BO) + + LD a1, 1 * SIZE(AO) + daddiu L, L, -1 + + daddiu AO, AO, 1 * SIZE + bgtz L, .L66 + daddiu BO, BO, 2 * SIZE + + +.L68: + ADD c11, c11, c31 + ADD c21, c21, c41 + +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -2 +#endif + + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + + SUB c11, b1, c11 + SUB c21, b2, c21 +#endif + +#if defined(LN) || defined(LT) + LD b3, 0 * SIZE(AO) + + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 3 * SIZE(BO) + + MUL c11, b1, c11 + + NMSUB c21, c21, b2, c11 + + MUL c21, b3, c21 +#endif + +#ifdef RT + LD b1, 3 * SIZE(BO) + LD b2, 2 * SIZE(BO) + LD b3, 0 * SIZE(BO) + + MUL c21, b1, c21 + + NMSUB c11, c11, b2, c21 + + MUL c11, b3, c11 +#endif + +#ifdef LN + daddiu CO1, CO1, -1 * SIZE + daddiu CO2, CO2, -1 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c21, 1 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + +#ifndef LN + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, 0 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + .align 3 + +.L60: + dsra I, M, 1 + blez I, .L69 + NOP + +.L51: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + LD b5, 4 * SIZE(B) + dsra L, KK, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L55 + move BO, B + +#else +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + LD b3, 2 * SIZE(BO) + LD b5, 4 * SIZE(BO) + dsra L, TEMP, 2 + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + blez L, .L55 + NOP +#endif + .align 3 + +.L52: + MADD c11, c11, a1, b1 + LD a3, 2 * SIZE(AO) + MADD c21, c21, a1, b2 + LD b4, 3 * SIZE(BO) + MADD c12, c12, a2, b1 + LD a4, 3 * SIZE(AO) + MADD c22, c22, a2, b2 + LD b1, 8 * SIZE(BO) + + MADD c11, c11, a3, b3 + LD a1, 8 * SIZE(AO) + MADD c21, c21, a3, b4 + LD b2, 5 * SIZE(BO) + MADD c12, c12, a4, b3 + LD a2, 5 * SIZE(AO) + MADD c22, c22, a4, b4 + LD b3, 6 * SIZE(BO) + + MADD c11, c11, a5, b5 + LD a3, 6 * SIZE(AO) + MADD c21, c21, a5, b2 + LD b4, 7 * SIZE(BO) + MADD c12, c12, a2, b5 + LD a4, 7 * SIZE(AO) + MADD c22, c22, a2, b2 + LD b5, 12 * SIZE(BO) + + MADD c11, c11, a3, b3 + LD a5, 12 * SIZE(AO) + MADD c21, c21, a3, b4 + LD b2, 9 * SIZE(BO) + MADD c12, c12, a4, b3 + LD a2, 9 * SIZE(AO) + MADD c22, c22, a4, b4 + LD b3, 10 * SIZE(BO) + + daddiu AO, AO, 8 * SIZE + daddiu L, L, -1 + bgtz L, .L52 + daddiu BO, BO, 8 * SIZE + .align 3 + +.L55: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L58 + NOP + .align 3 + +.L56: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + LD a1, 2 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 2 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 3 * SIZE(BO) + + daddiu L, L, -1 + daddiu AO, AO, 2 * SIZE + bgtz L, .L56 + daddiu BO, BO, 2 * SIZE + +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -2 +#else + daddiu TEMP, KK, -2 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c12, b3, c12 + SUB c22, b4, c22 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 +#endif + +#ifdef LN + LD b1, 3 * SIZE(AO) + LD b2, 2 * SIZE(AO) + LD b3, 0 * SIZE(AO) + + MUL c12, b1, c12 + MUL c22, b1, c22 + + NMSUB c11, c11, b2, c12 + NMSUB c21, c21, b2, c22 + + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif + +#ifdef LT + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 3 * SIZE(AO) + + MUL c11, b1, c11 + MUL c21, b1, c21 + + NMSUB c12, c12, b2, c11 + NMSUB c22, c22, b2, c21 + + MUL c12, b3, c12 + MUL c22, b3, c22 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 3 * SIZE(BO) + + MUL c11, b1, c11 + MUL c12, b1, c12 + + NMSUB c21, c21, b2, c11 + NMSUB c22, c22, b2, c12 + + MUL c21, b3, c21 + MUL c22, b3, c22 +#endif + +#ifdef RT + LD b1, 3 * SIZE(BO) + LD b2, 2 * SIZE(BO) + LD b3, 0 * SIZE(BO) + + MUL c21, b1, c21 + MUL c22, b1, c22 + + NMSUB c11, c11, b2, c21 + NMSUB c12, c12, b2, c22 + + MUL c11, b3, c11 + MUL c12, b3, c12 +#endif + +#ifdef LN + daddiu CO1, CO1, -2 * SIZE + daddiu CO2, CO2, -2 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c12, 2 * SIZE(BO) + ST c22, 3 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) + ST c21, 2 * SIZE(AO) + ST c22, 3 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c22, 1 * SIZE(CO2) + +#ifndef LN + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 2 +#endif + +#ifdef LN + daddiu KK, KK, -2 +#endif + + MTC $0, a1 + + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + + daddiu I, I, -1 + + bgtz I, .L51 + MOV c41, c11 + .align 3 + +.L69: +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 2 +#endif + +#ifdef RT + daddiu KK, KK, -2 +#endif + .align 3 + +.L70: + andi J, N, 1 + blez J, .L999 + NOP + +#ifdef RT + dsll TEMP, K, BASE_SHIFT + dsubu B, B, TEMP + + dsubu C, C, LDC +#endif + + move AO, A + move CO1, C + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO1, LDC +#endif + + andi I, M, 1 + blez I, .L80 + NOP + +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, KK, 2 + blez L, .L85 + move BO, B +#else +#ifdef LN + dsll TEMP, K, BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll TEMP, KK, BASE_SHIFT + + daddu AO, AORIG, TEMP + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + dsra L, TEMP, 2 + blez L, .L85 + NOP +#endif + .align 3 + +.L82: + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + + LD a1, 1 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + MADD c21, c21, a1, b1 + + LD a1, 2 * SIZE(AO) + LD b1, 2 * SIZE(BO) + + MADD c11, c11, a1, b1 + + LD a1, 3 * SIZE(AO) + LD b1, 3 * SIZE(BO) + + MADD c21, c21, a1, b1 + + daddiu L, L, -1 + daddiu AO, AO, 4 * SIZE + bgtz L, .L82 + daddiu BO, BO, 4 * SIZE + .align 3 + +.L85: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L88 + NOP + .align 3 + +.L86: + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + + daddiu L, L, -1 + daddiu AO, AO, 1 * SIZE + bgtz L, .L86 + daddiu BO, BO, 1 * SIZE + + +.L88: + ADD c11, c11, c21 + +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -1 +#endif + + dsll TEMP, TEMP, 0 + BASE_SHIFT + daddu AO, AORIG, TEMP + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + + SUB c11, b1, c11 +#else + LD b1, 0 * SIZE(AO) + + SUB c11, b1, c11 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + + MUL c11, b1, c11 +#endif + +#if defined(RN) || defined(RT) + LD b1, 0 * SIZE(BO) + + MUL c11, b1, c11 +#endif + +#ifdef LN + daddiu CO1, CO1, -1 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + +#ifndef LN + daddiu CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll TEMP, TEMP, 0 + BASE_SHIFT + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + .align 3 + +.L80: + dsra I, M, 1 + blez I, .L89 + NOP + +.L71: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + LD b5, 4 * SIZE(B) + dsra L, KK, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L75 + move BO, B +#else +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + LD b3, 2 * SIZE(BO) + LD b5, 4 * SIZE(BO) + dsra L, TEMP, 2 + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + blez L, .L75 + NOP +#endif + .align 3 + +.L72: + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 2 * SIZE(AO) + LD a2, 3 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 4 * SIZE(AO) + LD a2, 5 * SIZE(AO) + LD b1, 2 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD b1, 3 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + daddiu L, L, -1 + daddiu AO, AO, 8 * SIZE + bgtz L, .L72 + daddiu BO, BO, 4 * SIZE + .align 3 + +.L75: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L78 + NOP + .align 3 + +.L76: + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + daddiu L, L, -1 + daddiu AO, AO, 2 * SIZE + bgtz L, .L76 + daddiu BO, BO, 1 * SIZE + +.L78: + ADD c11, c11, c21 + ADD c12, c12, c22 + +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -2 +#else + daddiu TEMP, KK, -1 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif + +#ifdef LN + LD b1, 3 * SIZE(AO) + LD b2, 2 * SIZE(AO) + LD b3, 0 * SIZE(AO) + + MUL c12, b1, c12 + NMSUB c11, c11, b2, c12 + MUL c11, b3, c11 +#endif + +#ifdef LT + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 3 * SIZE(AO) + + MUL c11, b1, c11 + NMSUB c12, c12, b2, c11 + MUL c12, b3, c12 +#endif + +#if defined(RN) || defined(RT) + LD b1, 0 * SIZE(BO) + + MUL c11, b1, c11 + MUL c12, b1, c12 +#endif + +#ifdef LN + daddiu CO1, CO1, -2 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c12, 1 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + +#ifndef LN + daddiu CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 2 +#endif + +#ifdef LN + daddiu KK, KK, -2 +#endif + + daddiu I, I, -1 + + bgtz I, .L71 + NOP + .align 3 + + +.L89: +#ifdef LN + dsll TEMP, K, BASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 1 +#endif + +#ifdef RT + daddiu KK, KK, -1 +#endif + .align 3 + + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + LDARG $18, 16($sp) + LDARG $19, 24($sp) + LDARG $20, 32($sp) + LDARG $21, 40($sp) + ldc1 $f24, 48($sp) + ldc1 $f25, 56($sp) + ldc1 $f26, 64($sp) + ldc1 $f27, 72($sp) + ldc1 $f28, 80($sp) + + LDARG $22, 88($sp) + LDARG $23, 96($sp) + LDARG $24, 104($sp) + LDARG $25, 112($sp) + +#ifndef __64BIT__ + ldc1 $f20,112($sp) + ldc1 $f21,120($sp) + ldc1 $f22,128($sp) + ldc1 $f23,136($sp) +#endif + + j $31 + daddiu $sp, $sp, 144 + + EPILOGUE diff --git a/kernel/mips64/trsm_kernel_LT.S b/kernel/mips64/trsm_kernel_LT.S new file mode 100644 index 0000000..824e045 --- /dev/null +++ b/kernel/mips64/trsm_kernel_LT.S @@ -0,0 +1,3527 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 + +#define AO $12 +#define BO $13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 +#define CO5 $18 +#define CO6 $19 +#define CO7 $20 +#define CO8 $21 + +#define OFFSET $22 +#define KK $23 +#define TEMP $24 +#define AORIG $25 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f27 +#define a4 $f28 + +#define b1 $f2 +#define b2 $f3 +#define b3 $f4 +#define b4 $f5 +#define b5 $f6 +#define b6 $f7 +#define b7 $f8 +#define b8 $f9 + +#define a5 b8 + +#define c11 $f10 +#define c12 $f11 +#define c21 $f12 +#define c22 $f13 +#define c31 $f14 +#define c32 $f16 +#define c41 $f17 +#define c42 $f18 +#define c51 $f19 +#define c52 $f20 +#define c61 $f21 +#define c62 $f22 +#define c71 $f23 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 + +#define ALPHA $f15 + + PROLOGUE + + daddiu $sp, $sp, -144 + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + SDARG $18, 16($sp) + SDARG $19, 24($sp) + SDARG $20, 32($sp) + SDARG $21, 40($sp) + sdc1 $f24, 48($sp) + sdc1 $f25, 56($sp) + sdc1 $f26, 64($sp) + sdc1 $f27, 72($sp) + sdc1 $f28, 80($sp) + + SDARG $22, 88($sp) + SDARG $23, 96($sp) + SDARG $24, 104($sp) + SDARG $25, 112($sp) + +#ifndef __64BIT__ + sdc1 $f20,112($sp) + sdc1 $f21,120($sp) + sdc1 $f22,128($sp) + sdc1 $f23,136($sp) +#endif + + LDARG OFFSET, 144($sp) + + dsll LDC, LDC, BASE_SHIFT + +#ifdef LN + mult M, K + mflo TEMP + + dsll TEMP, TEMP, BASE_SHIFT + daddu A, A, TEMP + + dsll TEMP, M, BASE_SHIFT + daddu C, C, TEMP +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mult N, K + mflo TEMP + + dsll TEMP, TEMP, BASE_SHIFT + daddu B, B, TEMP + + mult N, LDC + mflo TEMP + daddu C, C, TEMP + + dsubu KK, N, OFFSET +#endif + + dsra J, N, 3 + blez J, .L30 + nop + +.L10: +#ifdef RT + dsll TEMP, K, 3 + BASE_SHIFT + dsubu B, B, TEMP + + dsll TEMP, LDC, 3 + dsubu C, C, TEMP +#endif + + move CO1, C + MTC $0, c11 + daddu CO2, C, LDC + daddu CO3, CO2, LDC + daddiu J, J, -1 + daddu CO4, CO3, LDC + MOV c21, c11 + daddu CO5, CO4, LDC + MOV c31, c11 + daddu CO6, CO5, LDC + MOV c41, c11 + daddu CO7, CO6, LDC + MOV c51, c11 + daddu CO8, CO7, LDC + dsra I, M, 1 + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO8, LDC +#endif + + blez I, .L20 + MOV c61, c11 + +.L11: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(B) + MOV c81, c11 + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + + dsra L, KK, 2 + MOV c32, c11 + LD b3, 2 * SIZE(B) + MOV c42, c11 + + LD b4, 3 * SIZE(B) + MOV c52, c11 + LD b5, 4 * SIZE(B) + MOV c62, c11 + + LD b6, 8 * SIZE(B) + MOV c72, c11 + LD b7, 12 * SIZE(B) + MOV c82, c11 + + blez L, .L15 + move BO, B +#else + +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 3 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(BO) + MOV c81, c11 + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + + dsra L, TEMP, 2 + MOV c32, c11 + LD b3, 2 * SIZE(BO) + MOV c42, c11 + + LD b4, 3 * SIZE(BO) + MOV c52, c11 + LD b5, 4 * SIZE(BO) + MOV c62, c11 + + LD b6, 8 * SIZE(BO) + MOV c72, c11 + LD b7, 12 * SIZE(BO) + MOV c82, c11 + + blez L, .L15 + NOP +#endif + + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + blez L, .L13 + MADD c41, c41, a1, b4 + NOP + .align 3 + +.L12: + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + NOP + MADD c61, c61, a1, b2 + LD a4, 2 * SIZE(AO) + MADD c71, c71, a1, b3 + NOP + MADD c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a4, b7 + NOP + MADD c61, c61, a4, b2 + NOP + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + NOP + MADD c41, c41, a3, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + NOP + MADD c61, c61, a3, b2 + LD a4, 6 * SIZE(AO) + MADD c71, c71, a3, b3 + NOP + MADD c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + daddiu L, L, -1 + + MADD c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + bgtz L, .L12 + MADD c41, c41, a1, b4 + NOP + .align 3 + +.L13: + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + NOP + MADD c61, c61, a1, b2 + LD a4, 2 * SIZE(AO) + MADD c71, c71, a1, b3 + NOP + MADD c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a4, b7 + NOP + MADD c61, c61, a4, b2 + NOP + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + NOP + MADD c41, c41, a3, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + NOP + MADD c61, c61, a3, b2 + LD a4, 6 * SIZE(AO) + MADD c71, c71, a3, b3 + NOP + MADD c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + blez L, .L18 + NOP + .align 3 + +.L16: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 8 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + daddiu L, L, -1 + MADD c61, c61, a1, b2 + daddiu AO, AO, 2 * SIZE + MADD c71, c71, a1, b3 + daddiu BO, BO, 8 * SIZE + MADD c81, c81, a1, b4 + LD a1, 0 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 4 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + bgtz L, .L16 + LD b4, 3 * SIZE(BO) + +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -2 +#else + daddiu TEMP, KK, -8 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 3 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB c11, b1, c11 + LD b5, 4 * SIZE(BO) + SUB c21, b2, c21 + LD b6, 5 * SIZE(BO) + SUB c31, b3, c31 + LD b7, 6 * SIZE(BO) + SUB c41, b4, c41 + LD b8, 7 * SIZE(BO) + + SUB c51, b5, c51 + LD b1, 8 * SIZE(BO) + SUB c61, b6, c61 + LD b2, 9 * SIZE(BO) + SUB c71, b7, c71 + LD b3, 10 * SIZE(BO) + SUB c81, b8, c81 + LD b4, 11 * SIZE(BO) + + SUB c12, b1, c12 + LD b5, 12 * SIZE(BO) + SUB c22, b2, c22 + LD b6, 13 * SIZE(BO) + SUB c32, b3, c32 + LD b7, 14 * SIZE(BO) + SUB c42, b4, c42 + LD b8, 15 * SIZE(BO) + + SUB c52, b5, c52 +#ifdef LN + LD b1, 3 * SIZE(AO) +#else + LD b1, 0 * SIZE(AO) +#endif + SUB c62, b6, c62 + SUB c72, b7, c72 + SUB c82, b8, c82 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + + SUB c11, b1, c11 + LD b5, 4 * SIZE(AO) + SUB c12, b2, c12 + LD b6, 5 * SIZE(AO) + SUB c21, b3, c21 + LD b7, 6 * SIZE(AO) + SUB c22, b4, c22 + LD b8, 7 * SIZE(AO) + + SUB c31, b5, c31 + LD b1, 8 * SIZE(AO) + SUB c32, b6, c32 + LD b2, 9 * SIZE(AO) + SUB c41, b7, c41 + LD b3, 10 * SIZE(AO) + SUB c42, b8, c42 + LD b4, 11 * SIZE(AO) + + LD b5, 12 * SIZE(AO) + SUB c51, b1, c51 + LD b6, 13 * SIZE(AO) + SUB c52, b2, c52 + LD b7, 14 * SIZE(AO) + SUB c61, b3, c61 + LD b8, 15 * SIZE(AO) + SUB c62, b4, c62 + + SUB c71, b5, c71 + SUB c72, b6, c72 + SUB c81, b7, c81 + SUB c82, b8, c82 +#endif + +#ifdef LN + MUL c12, b1, c12 + LD b2, 2 * SIZE(AO) + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + MUL c52, b1, c52 + MUL c62, b1, c62 + MUL c72, b1, c72 + MUL c82, b1, c82 + + NMSUB c11, c11, b2, c12 + LD b3, 0 * SIZE(AO) + NMSUB c21, c21, b2, c22 + NMSUB c31, c31, b2, c32 + NMSUB c41, c41, b2, c42 + NMSUB c51, c51, b2, c52 + NMSUB c61, c61, b2, c62 + NMSUB c71, c71, b2, c72 + NMSUB c81, c81, b2, c82 + + MUL c11, b3, c11 + daddiu CO1, CO1, -2 * SIZE + MUL c21, b3, c21 + daddiu CO2, CO2, -2 * SIZE + MUL c31, b3, c31 + daddiu CO3, CO3, -2 * SIZE + MUL c41, b3, c41 + daddiu CO4, CO4, -2 * SIZE + MUL c51, b3, c51 + daddiu CO5, CO5, -2 * SIZE + MUL c61, b3, c61 + daddiu CO6, CO6, -2 * SIZE + MUL c71, b3, c71 + daddiu CO7, CO7, -2 * SIZE + MUL c81, b3, c81 + daddiu CO8, CO8, -2 * SIZE +#endif + +#ifdef LT + MUL c11, b1, c11 + LD b2, 1 * SIZE(AO) + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 + + NMSUB c12, c12, b2, c11 + LD b3, 3 * SIZE(AO) + NMSUB c22, c22, b2, c21 + NMSUB c32, c32, b2, c31 + NMSUB c42, c42, b2, c41 + NMSUB c52, c52, b2, c51 + NMSUB c62, c62, b2, c61 + NMSUB c72, c72, b2, c71 + NMSUB c82, c82, b2, c81 + + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 + MUL c52, b3, c52 + MUL c62, b3, c62 + MUL c72, b3, c72 + MUL c82, b3, c82 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MUL c11, b1, c11 + MUL c12, b1, c12 + LD b5, 4 * SIZE(BO) + + NMSUB c21, c21, b2, c11 + NMSUB c22, c22, b2, c12 + LD b6, 5 * SIZE(BO) + NMSUB c31, c31, b3, c11 + NMSUB c32, c32, b3, c12 + LD b7, 6 * SIZE(BO) + NMSUB c41, c41, b4, c11 + NMSUB c42, c42, b4, c12 + LD b8, 7 * SIZE(BO) + + NMSUB c51, c51, b5, c11 + NMSUB c52, c52, b5, c12 + LD b2, 9 * SIZE(BO) + NMSUB c61, c61, b6, c11 + NMSUB c62, c62, b6, c12 + LD b3, 10 * SIZE(BO) + NMSUB c71, c71, b7, c11 + NMSUB c72, c72, b7, c12 + LD b4, 11 * SIZE(BO) + NMSUB c81, c81, b8, c11 + NMSUB c82, c82, b8, c12 + LD b5, 12 * SIZE(BO) + + MUL c21, b2, c21 + MUL c22, b2, c22 + LD b6, 13 * SIZE(BO) + + NMSUB c31, c31, b3, c21 + NMSUB c32, c32, b3, c22 + LD b7, 14 * SIZE(BO) + NMSUB c41, c41, b4, c21 + NMSUB c42, c42, b4, c22 + LD b8, 15 * SIZE(BO) + NMSUB c51, c51, b5, c21 + NMSUB c52, c52, b5, c22 + LD b3, 18 * SIZE(BO) + NMSUB c61, c61, b6, c21 + NMSUB c62, c62, b6, c22 + LD b4, 19 * SIZE(BO) + NMSUB c71, c71, b7, c21 + NMSUB c72, c72, b7, c22 + LD b5, 20 * SIZE(BO) + NMSUB c81, c81, b8, c21 + NMSUB c82, c82, b8, c22 + LD b6, 21 * SIZE(BO) + + MUL c31, b3, c31 + MUL c32, b3, c32 + LD b7, 22 * SIZE(BO) + + NMSUB c41, c41, b4, c31 + NMSUB c42, c42, b4, c32 + LD b8, 23 * SIZE(BO) + NMSUB c51, c51, b5, c31 + NMSUB c52, c52, b5, c32 + LD b4, 27 * SIZE(BO) + NMSUB c61, c61, b6, c31 + NMSUB c62, c62, b6, c32 + LD b5, 28 * SIZE(BO) + NMSUB c71, c71, b7, c31 + NMSUB c72, c72, b7, c32 + LD b6, 29 * SIZE(BO) + NMSUB c81, c81, b8, c31 + NMSUB c82, c82, b8, c32 + LD b7, 30 * SIZE(BO) + + MUL c41, b4, c41 + MUL c42, b4, c42 + LD b8, 31 * SIZE(BO) + + NMSUB c51, c51, b5, c41 + NMSUB c52, c52, b5, c42 + LD b5, 36 * SIZE(BO) + NMSUB c61, c61, b6, c41 + NMSUB c62, c62, b6, c42 + LD b6, 37 * SIZE(BO) + NMSUB c71, c71, b7, c41 + NMSUB c72, c72, b7, c42 + LD b7, 38 * SIZE(BO) + NMSUB c81, c81, b8, c41 + NMSUB c82, c82, b8, c42 + LD b8, 39 * SIZE(BO) + + MUL c51, b5, c51 + MUL c52, b5, c52 + + NMSUB c61, c61, b6, c51 + NMSUB c62, c62, b6, c52 + LD b6, 45 * SIZE(BO) + NMSUB c71, c71, b7, c51 + NMSUB c72, c72, b7, c52 + LD b7, 46 * SIZE(BO) + NMSUB c81, c81, b8, c51 + NMSUB c82, c82, b8, c52 + LD b8, 47 * SIZE(BO) + + MUL c61, b6, c61 + MUL c62, b6, c62 + + NMSUB c71, c71, b7, c61 + NMSUB c72, c72, b7, c62 + LD b7, 54 * SIZE(BO) + NMSUB c81, c81, b8, c61 + NMSUB c82, c82, b8, c62 + LD b8, 55 * SIZE(BO) + + MUL c71, b7, c71 + MUL c72, b7, c72 + + NMSUB c81, c81, b8, c71 + NMSUB c82, c82, b8, c72 + LD b8, 63 * SIZE(BO) + + MUL c81, b8, c81 + MUL c82, b8, c82 +#endif + +#ifdef RT + LD b1, 63 * SIZE(BO) + LD b2, 62 * SIZE(BO) + LD b3, 61 * SIZE(BO) + LD b4, 60 * SIZE(BO) + + MUL c81, b1, c81 + MUL c82, b1, c82 + LD b5, 59 * SIZE(BO) + + NMSUB c71, c71, b2, c81 + NMSUB c72, c72, b2, c82 + LD b6, 58 * SIZE(BO) + NMSUB c61, c61, b3, c81 + NMSUB c62, c62, b3, c82 + LD b7, 57 * SIZE(BO) + NMSUB c51, c51, b4, c81 + NMSUB c52, c52, b4, c82 + LD b8, 56 * SIZE(BO) + + NMSUB c41, c41, b5, c81 + NMSUB c42, c42, b5, c82 + LD b2, 54 * SIZE(BO) + NMSUB c31, c31, b6, c81 + NMSUB c32, c32, b6, c82 + LD b3, 53 * SIZE(BO) + NMSUB c21, c21, b7, c81 + NMSUB c22, c22, b7, c82 + LD b4, 52 * SIZE(BO) + NMSUB c11, c11, b8, c81 + NMSUB c12, c12, b8, c82 + LD b5, 51 * SIZE(BO) + + MUL c71, b2, c71 + MUL c72, b2, c72 + LD b6, 50 * SIZE(BO) + + NMSUB c61, c61, b3, c71 + NMSUB c62, c62, b3, c72 + LD b7, 49 * SIZE(BO) + NMSUB c51, c51, b4, c71 + NMSUB c52, c52, b4, c72 + LD b8, 48 * SIZE(BO) + NMSUB c41, c41, b5, c71 + NMSUB c42, c42, b5, c72 + LD b3, 45 * SIZE(BO) + NMSUB c31, c31, b6, c71 + NMSUB c32, c32, b6, c72 + LD b4, 44 * SIZE(BO) + NMSUB c21, c21, b7, c71 + NMSUB c22, c22, b7, c72 + LD b5, 43 * SIZE(BO) + NMSUB c11, c11, b8, c71 + NMSUB c12, c12, b8, c72 + LD b6, 42 * SIZE(BO) + + MUL c61, b3, c61 + MUL c62, b3, c62 + LD b7, 41 * SIZE(BO) + + NMSUB c51, c51, b4, c61 + NMSUB c52, c52, b4, c62 + LD b8, 40 * SIZE(BO) + NMSUB c41, c41, b5, c61 + NMSUB c42, c42, b5, c62 + LD b4, 36 * SIZE(BO) + NMSUB c31, c31, b6, c61 + NMSUB c32, c32, b6, c62 + LD b5, 35 * SIZE(BO) + NMSUB c21, c21, b7, c61 + NMSUB c22, c22, b7, c62 + LD b6, 34 * SIZE(BO) + NMSUB c11, c11, b8, c61 + NMSUB c12, c12, b8, c62 + LD b7, 33 * SIZE(BO) + + MUL c51, b4, c51 + MUL c52, b4, c52 + LD b8, 32 * SIZE(BO) + + NMSUB c41, c41, b5, c51 + NMSUB c42, c42, b5, c52 + LD b5, 27 * SIZE(BO) + NMSUB c31, c31, b6, c51 + NMSUB c32, c32, b6, c52 + LD b6, 26 * SIZE(BO) + NMSUB c21, c21, b7, c51 + NMSUB c22, c22, b7, c52 + LD b7, 25 * SIZE(BO) + NMSUB c11, c11, b8, c51 + NMSUB c12, c12, b8, c52 + LD b8, 24 * SIZE(BO) + + MUL c41, b5, c41 + MUL c42, b5, c42 + + NMSUB c31, c31, b6, c41 + NMSUB c32, c32, b6, c42 + LD b6, 18 * SIZE(BO) + NMSUB c21, c21, b7, c41 + NMSUB c22, c22, b7, c42 + LD b7, 17 * SIZE(BO) + NMSUB c11, c11, b8, c41 + NMSUB c12, c12, b8, c42 + LD b8, 16 * SIZE(BO) + + MUL c31, b6, c31 + MUL c32, b6, c32 + + NMSUB c21, c21, b7, c31 + NMSUB c22, c22, b7, c32 + LD b7, 9 * SIZE(BO) + NMSUB c11, c11, b8, c31 + NMSUB c12, c12, b8, c32 + LD b8, 8 * SIZE(BO) + + MUL c21, b7, c21 + MUL c22, b7, c22 + + NMSUB c11, c11, b8, c21 + NMSUB c12, c12, b8, c22 + LD b8, 0 * SIZE(BO) + + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c41, 3 * SIZE(BO) + ST c51, 4 * SIZE(BO) + ST c61, 5 * SIZE(BO) + ST c71, 6 * SIZE(BO) + ST c81, 7 * SIZE(BO) + + ST c12, 8 * SIZE(BO) + ST c22, 9 * SIZE(BO) + ST c32, 10 * SIZE(BO) + ST c42, 11 * SIZE(BO) + ST c52, 12 * SIZE(BO) + ST c62, 13 * SIZE(BO) + ST c72, 14 * SIZE(BO) + ST c82, 15 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) + ST c21, 2 * SIZE(AO) + ST c22, 3 * SIZE(AO) + ST c31, 4 * SIZE(AO) + ST c32, 5 * SIZE(AO) + ST c41, 6 * SIZE(AO) + ST c42, 7 * SIZE(AO) + + ST c51, 8 * SIZE(AO) + ST c52, 9 * SIZE(AO) + ST c61, 10 * SIZE(AO) + ST c62, 11 * SIZE(AO) + ST c71, 12 * SIZE(AO) + ST c72, 13 * SIZE(AO) + ST c81, 14 * SIZE(AO) + ST c82, 15 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c22, 1 * SIZE(CO2) + ST c31, 0 * SIZE(CO3) + ST c32, 1 * SIZE(CO3) + ST c41, 0 * SIZE(CO4) + ST c42, 1 * SIZE(CO4) + ST c51, 0 * SIZE(CO5) + ST c52, 1 * SIZE(CO5) + ST c61, 0 * SIZE(CO6) + ST c62, 1 * SIZE(CO6) + ST c71, 0 * SIZE(CO7) + ST c72, 1 * SIZE(CO7) + ST c81, 0 * SIZE(CO8) + ST c82, 1 * SIZE(CO8) + + MTC $0, a1 + +#ifndef LN + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + daddiu CO3, CO3, 2 * SIZE + daddiu CO4, CO4, 2 * SIZE + daddiu CO5, CO5, 2 * SIZE + daddiu CO6, CO6, 2 * SIZE + daddiu CO7, CO7, 2 * SIZE + daddiu CO8, CO8, 2 * SIZE +#endif + + MOV c11, a1 + MOV c21, a1 + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + + MOV c31, a1 + MOV c41, a1 + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 3 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 2 +#endif + +#ifdef LN + daddiu KK, KK, -2 +#endif + + daddiu I, I, -1 + MOV c51, a1 + + bgtz I, .L11 + MOV c61, a1 + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 + blez I, .L29 + MOV c71, c11 + +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, KK, 2 + MOV c81, c11 + + blez L, .L25 + move BO, B +#else + +#ifdef LN + dsll TEMP, K, 0 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 3 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + dsra L, TEMP, 2 + MOV c81, c11 + + blez L, .L25 + NOP +#endif + .align 3 + +.L22: + MADD c11, c11, a1, b1 + LD b1, 16 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + LD b5, 20 * SIZE(BO) + MADD c61, c61, a1, b2 + LD b2, 9 * SIZE(BO) + MADD c71, c71, a1, b3 + LD b3, 10 * SIZE(BO) + MADD c81, c81, a1, b4 + LD b4, 11 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + daddiu L, L, -1 + + MADD c11, c11, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c61, c61, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c71, c71, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c81, c81, a2, b4 + LD b4, 19 * SIZE(BO) + + LD a2, 5 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD c11, c11, a3, b1 + LD b1, 32 * SIZE(BO) + MADD c21, c21, a3, b2 + LD b2, 21 * SIZE(BO) + MADD c31, c31, a3, b3 + LD b3, 22 * SIZE(BO) + MADD c41, c41, a3, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + LD b5, 36 * SIZE(BO) + MADD c61, c61, a3, b2 + LD b2, 25 * SIZE(BO) + MADD c71, c71, a3, b3 + LD b3, 26 * SIZE(BO) + MADD c81, c81, a3, b4 + LD b4, 27 * SIZE(BO) + + LD a3, 2 * SIZE(AO) + daddiu BO, BO, 32 * SIZE + + MADD c11, c11, a4, b6 + LD b6, 8 * SIZE(BO) + MADD c21, c21, a4, b2 + LD b2, -3 * SIZE(BO) + MADD c31, c31, a4, b3 + LD b3, -2 * SIZE(BO) + MADD c41, c41, a4, b4 + LD b4, -1 * SIZE(BO) + + MADD c51, c51, a4, b7 + LD b7, 12 * SIZE(BO) + MADD c61, c61, a4, b2 + LD b2, 1 * SIZE(BO) + MADD c71, c71, a4, b3 + LD b3, 2 * SIZE(BO) + MADD c81, c81, a4, b4 + LD b4, 3 * SIZE(BO) + bgtz L, .L22 + LD a4, 3 * SIZE(AO) + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L28 + NOP + .align 3 + +.L26: + MADD c11, c11, a1, b1 + LD b1, 8 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + daddiu L, L, -1 + MOV a2, a2 + daddiu AO, AO, 1 * SIZE + daddiu BO, BO, 8 * SIZE + + MADD c51, c51, a1, b5 + LD b5, 4 * SIZE(BO) + MADD c61, c61, a1, b2 + LD b2, 1 * SIZE(BO) + MADD c71, c71, a1, b3 + LD b3, 2 * SIZE(BO) + MADD c81, c81, a1, b4 + LD a1, 0 * SIZE(AO) + + bgtz L, .L26 + LD b4, 3 * SIZE(BO) + +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -8 +#endif + + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 3 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + LD b5, 4 * SIZE(AO) + LD b6, 5 * SIZE(AO) + LD b7, 6 * SIZE(AO) + LD b8, 7 * SIZE(AO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MUL c11, b1, c11 + + NMSUB c21, c21, b2, c11 + NMSUB c31, c31, b3, c11 + NMSUB c41, c41, b4, c11 + NMSUB c51, c51, b5, c11 + NMSUB c61, c61, b6, c11 + NMSUB c71, c71, b7, c11 + NMSUB c81, c81, b8, c11 + + LD b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MUL c21, b2, c21 + + NMSUB c31, c31, b3, c21 + NMSUB c41, c41, b4, c21 + NMSUB c51, c51, b5, c21 + NMSUB c61, c61, b6, c21 + NMSUB c71, c71, b7, c21 + NMSUB c81, c81, b8, c21 + + LD b3, 18 * SIZE(BO) + LD b4, 19 * SIZE(BO) + LD b5, 20 * SIZE(BO) + LD b6, 21 * SIZE(BO) + LD b7, 22 * SIZE(BO) + LD b8, 23 * SIZE(BO) + + MUL c31, b3, c31 + + NMSUB c41, c41, b4, c31 + NMSUB c51, c51, b5, c31 + NMSUB c61, c61, b6, c31 + NMSUB c71, c71, b7, c31 + NMSUB c81, c81, b8, c31 + + LD b4, 27 * SIZE(BO) + LD b5, 28 * SIZE(BO) + LD b6, 29 * SIZE(BO) + LD b7, 30 * SIZE(BO) + LD b8, 31 * SIZE(BO) + + MUL c41, b4, c41 + + NMSUB c51, c51, b5, c41 + NMSUB c61, c61, b6, c41 + NMSUB c71, c71, b7, c41 + NMSUB c81, c81, b8, c41 + + LD b5, 36 * SIZE(BO) + LD b6, 37 * SIZE(BO) + LD b7, 38 * SIZE(BO) + LD b8, 39 * SIZE(BO) + + MUL c51, b5, c51 + + NMSUB c61, c61, b6, c51 + NMSUB c71, c71, b7, c51 + NMSUB c81, c81, b8, c51 + + LD b6, 45 * SIZE(BO) + LD b7, 46 * SIZE(BO) + LD b8, 47 * SIZE(BO) + + MUL c61, b6, c61 + + NMSUB c71, c71, b7, c61 + NMSUB c81, c81, b8, c61 + + LD b7, 54 * SIZE(BO) + LD b8, 55 * SIZE(BO) + + MUL c71, b7, c71 + + NMSUB c81, c81, b8, c71 + + LD b8, 63 * SIZE(BO) + + MUL c81, b8, c81 +#endif + +#ifdef RT + LD b1, 63 * SIZE(BO) + LD b2, 62 * SIZE(BO) + LD b3, 61 * SIZE(BO) + LD b4, 60 * SIZE(BO) + LD b5, 59 * SIZE(BO) + LD b6, 58 * SIZE(BO) + LD b7, 57 * SIZE(BO) + LD b8, 56 * SIZE(BO) + + MUL c81, b1, c81 + + NMSUB c71, c71, b2, c81 + NMSUB c61, c61, b3, c81 + NMSUB c51, c51, b4, c81 + NMSUB c41, c41, b5, c81 + NMSUB c31, c31, b6, c81 + NMSUB c21, c21, b7, c81 + NMSUB c11, c11, b8, c81 + + LD b2, 54 * SIZE(BO) + LD b3, 53 * SIZE(BO) + LD b4, 52 * SIZE(BO) + LD b5, 51 * SIZE(BO) + LD b6, 50 * SIZE(BO) + LD b7, 49 * SIZE(BO) + LD b8, 48 * SIZE(BO) + + MUL c71, b2, c71 + + NMSUB c61, c61, b3, c71 + NMSUB c51, c51, b4, c71 + NMSUB c41, c41, b5, c71 + NMSUB c31, c31, b6, c71 + NMSUB c21, c21, b7, c71 + NMSUB c11, c11, b8, c71 + + LD b3, 45 * SIZE(BO) + LD b4, 44 * SIZE(BO) + LD b5, 43 * SIZE(BO) + LD b6, 42 * SIZE(BO) + LD b7, 41 * SIZE(BO) + LD b8, 40 * SIZE(BO) + + MUL c61, b3, c61 + + NMSUB c51, c51, b4, c61 + NMSUB c41, c41, b5, c61 + NMSUB c31, c31, b6, c61 + NMSUB c21, c21, b7, c61 + NMSUB c11, c11, b8, c61 + + LD b4, 36 * SIZE(BO) + LD b5, 35 * SIZE(BO) + LD b6, 34 * SIZE(BO) + LD b7, 33 * SIZE(BO) + LD b8, 32 * SIZE(BO) + + MUL c51, b4, c51 + + NMSUB c41, c41, b5, c51 + NMSUB c31, c31, b6, c51 + NMSUB c21, c21, b7, c51 + NMSUB c11, c11, b8, c51 + + LD b5, 27 * SIZE(BO) + LD b6, 26 * SIZE(BO) + LD b7, 25 * SIZE(BO) + LD b8, 24 * SIZE(BO) + + MUL c41, b5, c41 + + NMSUB c31, c31, b6, c41 + NMSUB c21, c21, b7, c41 + NMSUB c11, c11, b8, c41 + + LD b6, 18 * SIZE(BO) + LD b7, 17 * SIZE(BO) + LD b8, 16 * SIZE(BO) + + MUL c31, b6, c31 + + NMSUB c21, c21, b7, c31 + NMSUB c11, c11, b8, c31 + + LD b7, 9 * SIZE(BO) + LD b8, 8 * SIZE(BO) + + MUL c21, b7, c21 + + NMSUB c11, c11, b8, c21 + + LD b8, 0 * SIZE(BO) + + MUL c11, b8, c11 +#endif + +#ifdef LN + daddiu CO1, CO1, -1 * SIZE + daddiu CO2, CO2, -1 * SIZE + daddiu CO3, CO3, -1 * SIZE + daddiu CO4, CO4, -1 * SIZE + daddiu CO5, CO5, -1 * SIZE + daddiu CO6, CO6, -1 * SIZE + daddiu CO7, CO7, -1 * SIZE + daddiu CO8, CO8, -1 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c41, 3 * SIZE(BO) + ST c51, 4 * SIZE(BO) + ST c61, 5 * SIZE(BO) + ST c71, 6 * SIZE(BO) + ST c81, 7 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c21, 1 * SIZE(AO) + ST c31, 2 * SIZE(AO) + ST c41, 3 * SIZE(AO) + ST c51, 4 * SIZE(AO) + ST c61, 5 * SIZE(AO) + ST c71, 6 * SIZE(AO) + ST c81, 7 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c31, 0 * SIZE(CO3) + ST c41, 0 * SIZE(CO4) + ST c51, 0 * SIZE(CO5) + ST c61, 0 * SIZE(CO6) + ST c71, 0 * SIZE(CO7) + ST c81, 0 * SIZE(CO8) + +#ifndef LN + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE + daddiu CO3, CO3, 1 * SIZE + daddiu CO4, CO4, 1 * SIZE + daddiu CO5, CO5, 1 * SIZE + daddiu CO6, CO6, 1 * SIZE + daddiu CO7, CO7, 1 * SIZE + daddiu CO8, CO8, 1 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 3 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + .align 3 + +.L29: +#ifdef LN + dsll TEMP, K, 3 + BASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 8 +#endif + +#ifdef RT + daddiu KK, KK, -8 +#endif + + bgtz J, .L10 + NOP + .align 3 + +.L30: + andi J, N, 4 + blez J, .L50 + move AO, A + +#ifdef RT + dsll TEMP, K, 2 + BASE_SHIFT + dsubu B, B, TEMP + + dsll TEMP, LDC, 2 + dsubu C, C, TEMP +#endif + + move CO1, C + MTC $0, c11 + daddu CO2, C, LDC + daddu CO3, CO2, LDC + daddu CO4, CO3, LDC + MOV c21, c11 + dsra I, M, 1 + MOV c31, c11 + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO4, LDC +#endif + + blez I, .L40 + MOV c41, c11 + +.L31: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + LD a3, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + MOV c32, c11 + LD b4, 3 * SIZE(B) + MOV c42, c11 + + LD b5, 4 * SIZE(B) + dsra L, KK, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L35 + move BO, B +#else +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + LD a3, 4 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + LD b3, 2 * SIZE(BO) + MOV c32, c11 + LD b4, 3 * SIZE(BO) + MOV c42, c11 + + LD b5, 4 * SIZE(BO) + dsra L, TEMP, 2 + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + blez L, .L35 + NOP +#endif + .align 3 + +.L32: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + LD a1, 2 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c11, c11, a1, b5 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c12, c12, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a3, b6 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + NOP + MADD c41, c41, a3, b4 + LD a3, 6 * SIZE(AO) + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c11, c11, a3, b7 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a3, b2 + daddiu AO, AO, 8 * SIZE + MADD c31, c31, a3, b3 + daddiu BO, BO, 16 * SIZE + MADD c41, c41, a3, b4 + LD a3, 4 * SIZE(AO) + + MADD c12, c12, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c42, c42, a2, b4 + NOP + + bgtz L, .L32 + LD b4, 3 * SIZE(BO) + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L38 + NOP + .align 3 + +.L36: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + daddiu AO, AO, 2 * SIZE + MADD c41, c41, a1, b4 + LD a1, 0 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 4 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + bgtz L, .L36 + daddiu BO, BO, 4 * SIZE + +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -2 +#else + daddiu TEMP, KK, -4 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c12, b5, c12 + SUB c22, b6, c22 + SUB c32, b7, c32 + SUB c42, b8, c42 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + LD b5, 4 * SIZE(AO) + LD b6, 5 * SIZE(AO) + LD b7, 6 * SIZE(AO) + LD b8, 7 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 + SUB c31, b5, c31 + SUB c32, b6, c32 + SUB c41, b7, c41 + SUB c42, b8, c42 +#endif + +#ifdef LN + LD b1, 3 * SIZE(AO) + LD b2, 2 * SIZE(AO) + LD b3, 0 * SIZE(AO) + + MUL c12, b1, c12 + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + + NMSUB c11, c11, b2, c12 + NMSUB c21, c21, b2, c22 + NMSUB c31, c31, b2, c32 + NMSUB c41, c41, b2, c42 + + MUL c11, b3, c11 + MUL c21, b3, c21 + MUL c31, b3, c31 + MUL c41, b3, c41 +#endif + +#ifdef LT + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 3 * SIZE(AO) + + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + + NMSUB c12, c12, b2, c11 + NMSUB c22, c22, b2, c21 + NMSUB c32, c32, b2, c31 + NMSUB c42, c42, b2, c41 + + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MUL c11, b1, c11 + MUL c12, b1, c12 + + NMSUB c21, c21, b2, c11 + NMSUB c22, c22, b2, c12 + NMSUB c31, c31, b3, c11 + NMSUB c32, c32, b3, c12 + NMSUB c41, c41, b4, c11 + NMSUB c42, c42, b4, c12 + + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + MUL c21, b2, c21 + MUL c22, b2, c22 + + NMSUB c31, c31, b3, c21 + NMSUB c32, c32, b3, c22 + NMSUB c41, c41, b4, c21 + NMSUB c42, c42, b4, c22 + + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MUL c31, b3, c31 + MUL c32, b3, c32 + + NMSUB c41, c41, b4, c31 + NMSUB c42, c42, b4, c32 + + LD b4, 15 * SIZE(BO) + + MUL c41, b4, c41 + MUL c42, b4, c42 +#endif + +#ifdef RT + LD b5, 15 * SIZE(BO) + LD b6, 14 * SIZE(BO) + LD b7, 13 * SIZE(BO) + LD b8, 12 * SIZE(BO) + + MUL c41, b5, c41 + MUL c42, b5, c42 + + NMSUB c31, c31, b6, c41 + NMSUB c32, c32, b6, c42 + NMSUB c21, c21, b7, c41 + NMSUB c22, c22, b7, c42 + NMSUB c11, c11, b8, c41 + NMSUB c12, c12, b8, c42 + + LD b6, 10 * SIZE(BO) + LD b7, 9 * SIZE(BO) + LD b8, 8 * SIZE(BO) + + MUL c31, b6, c31 + MUL c32, b6, c32 + + NMSUB c21, c21, b7, c31 + NMSUB c22, c22, b7, c32 + NMSUB c11, c11, b8, c31 + NMSUB c12, c12, b8, c32 + + LD b7, 5 * SIZE(BO) + LD b8, 4 * SIZE(BO) + + MUL c21, b7, c21 + MUL c22, b7, c22 + + NMSUB c11, c11, b8, c21 + NMSUB c12, c12, b8, c22 + + LD b8, 0 * SIZE(BO) + + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif + +#ifdef LN + daddiu CO1, CO1, -2 * SIZE + daddiu CO2, CO2, -2 * SIZE + daddiu CO3, CO3, -2 * SIZE + daddiu CO4, CO4, -2 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c41, 3 * SIZE(BO) + ST c12, 4 * SIZE(BO) + ST c22, 5 * SIZE(BO) + ST c32, 6 * SIZE(BO) + ST c42, 7 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) + ST c21, 2 * SIZE(AO) + ST c22, 3 * SIZE(AO) + ST c31, 4 * SIZE(AO) + ST c32, 5 * SIZE(AO) + ST c41, 6 * SIZE(AO) + ST c42, 7 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c22, 1 * SIZE(CO2) + ST c31, 0 * SIZE(CO3) + ST c32, 1 * SIZE(CO3) + ST c41, 0 * SIZE(CO4) + ST c42, 1 * SIZE(CO4) + +#ifndef LN + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + daddiu CO3, CO3, 2 * SIZE + daddiu CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 2 +#endif + +#ifdef LN + daddiu KK, KK, -2 +#endif + + MTC $0, a1 + + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + + daddiu I, I, -1 + + bgtz I, .L31 + MOV c41, c11 + .align 3 + +.L40: + andi I, M, 1 + blez I, .L49 + MOV c61, c11 + +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD a2, 1 * SIZE(AO) + MOV c81, c11 + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, KK, 2 + + blez L, .L45 + move BO, B +#else +#ifdef LN + dsll TEMP, K, BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD a2, 1 * SIZE(AO) + MOV c81, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + dsra L, TEMP, 2 + + blez L, .L45 + NOP +#endif + .align 3 + +.L42: + MADD c11, c11, a1, b1 + LD b1, 16 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + daddiu L, L, -1 + + MADD c11, c11, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 11 * SIZE(BO) + + LD a2, 2 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD c11, c11, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 15 * SIZE(BO) + + LD a2, -1 * SIZE(AO) + daddiu BO, BO, 16 * SIZE + + MADD c11, c11, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 3 * SIZE(BO) + + bgtz L, .L42 + LD a2, 1 * SIZE(AO) + .align 3 + +.L45: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L48 + NOP + .align 3 + +.L46: + MADD c11, c11, a1, b1 + LD b1, 4 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD a1, 1 * SIZE(AO) + + LD b4, 7 * SIZE(BO) + daddiu L, L, -1 + + daddiu AO, AO, 1 * SIZE + MOV a2, a2 + bgtz L, .L46 + daddiu BO, BO, 4 * SIZE + + +.L48: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -4 +#endif + + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MUL c11, b1, c11 + + NMSUB c21, c21, b2, c11 + NMSUB c31, c31, b3, c11 + NMSUB c41, c41, b4, c11 + + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + MUL c21, b2, c21 + + NMSUB c31, c31, b3, c21 + NMSUB c41, c41, b4, c21 + + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MUL c31, b3, c31 + + NMSUB c41, c41, b4, c31 + + LD b4, 15 * SIZE(BO) + + MUL c41, b4, c41 +#endif + +#ifdef RT + LD b5, 15 * SIZE(BO) + LD b6, 14 * SIZE(BO) + LD b7, 13 * SIZE(BO) + LD b8, 12 * SIZE(BO) + + MUL c41, b5, c41 + + NMSUB c31, c31, b6, c41 + NMSUB c21, c21, b7, c41 + NMSUB c11, c11, b8, c41 + + LD b6, 10 * SIZE(BO) + LD b7, 9 * SIZE(BO) + LD b8, 8 * SIZE(BO) + + MUL c31, b6, c31 + + NMSUB c21, c21, b7, c31 + NMSUB c11, c11, b8, c31 + + LD b7, 5 * SIZE(BO) + LD b8, 4 * SIZE(BO) + + MUL c21, b7, c21 + + NMSUB c11, c11, b8, c21 + + LD b8, 0 * SIZE(BO) + + MUL c11, b8, c11 +#endif + +#ifdef LN + daddiu CO1, CO1, -1 * SIZE + daddiu CO2, CO2, -1 * SIZE + daddiu CO3, CO3, -1 * SIZE + daddiu CO4, CO4, -1 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c41, 3 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c21, 1 * SIZE(AO) + ST c31, 2 * SIZE(AO) + ST c41, 3 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c31, 0 * SIZE(CO3) + ST c41, 0 * SIZE(CO4) + +#ifndef LN + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE + daddiu CO3, CO3, 1 * SIZE + daddiu CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + .align 3 + +.L49: +#ifdef LN + dsll TEMP, K, 2 + BASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 4 +#endif + +#ifdef RT + daddiu KK, KK, -4 +#endif + .align 3 + +.L50: + andi J, N, 2 + blez J, .L70 + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + dsubu B, B, TEMP + + dsll TEMP, LDC, 1 + dsubu C, C, TEMP +#endif + + move AO, A + move CO1, C + daddu CO2, C, LDC + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO2, LDC +#endif + + dsra I, M, 1 + blez I, .L60 + NOP + +.L51: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + LD b5, 4 * SIZE(B) + dsra L, KK, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L55 + move BO, B + +#else +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + LD b3, 2 * SIZE(BO) + LD b5, 4 * SIZE(BO) + dsra L, TEMP, 2 + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + blez L, .L55 + NOP +#endif + .align 3 + +.L52: + MADD c11, c11, a1, b1 + LD a3, 2 * SIZE(AO) + MADD c21, c21, a1, b2 + LD b4, 3 * SIZE(BO) + MADD c12, c12, a2, b1 + LD a4, 3 * SIZE(AO) + MADD c22, c22, a2, b2 + LD b1, 8 * SIZE(BO) + + MADD c11, c11, a3, b3 + LD a1, 8 * SIZE(AO) + MADD c21, c21, a3, b4 + LD b2, 5 * SIZE(BO) + MADD c12, c12, a4, b3 + LD a2, 5 * SIZE(AO) + MADD c22, c22, a4, b4 + LD b3, 6 * SIZE(BO) + + MADD c11, c11, a5, b5 + LD a3, 6 * SIZE(AO) + MADD c21, c21, a5, b2 + LD b4, 7 * SIZE(BO) + MADD c12, c12, a2, b5 + LD a4, 7 * SIZE(AO) + MADD c22, c22, a2, b2 + LD b5, 12 * SIZE(BO) + + MADD c11, c11, a3, b3 + LD a5, 12 * SIZE(AO) + MADD c21, c21, a3, b4 + LD b2, 9 * SIZE(BO) + MADD c12, c12, a4, b3 + LD a2, 9 * SIZE(AO) + MADD c22, c22, a4, b4 + LD b3, 10 * SIZE(BO) + + daddiu AO, AO, 8 * SIZE + daddiu L, L, -1 + bgtz L, .L52 + daddiu BO, BO, 8 * SIZE + .align 3 + +.L55: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L58 + NOP + .align 3 + +.L56: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + LD a1, 2 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 2 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 3 * SIZE(BO) + + daddiu L, L, -1 + daddiu AO, AO, 2 * SIZE + bgtz L, .L56 + daddiu BO, BO, 2 * SIZE + +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -2 +#else + daddiu TEMP, KK, -2 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c12, b3, c12 + SUB c22, b4, c22 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 +#endif + +#ifdef LN + LD b1, 3 * SIZE(AO) + LD b2, 2 * SIZE(AO) + LD b3, 0 * SIZE(AO) + + MUL c12, b1, c12 + MUL c22, b1, c22 + + NMSUB c11, c11, b2, c12 + NMSUB c21, c21, b2, c22 + + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif + +#ifdef LT + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 3 * SIZE(AO) + + MUL c11, b1, c11 + MUL c21, b1, c21 + + NMSUB c12, c12, b2, c11 + NMSUB c22, c22, b2, c21 + + MUL c12, b3, c12 + MUL c22, b3, c22 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 3 * SIZE(BO) + + MUL c11, b1, c11 + MUL c12, b1, c12 + + NMSUB c21, c21, b2, c11 + NMSUB c22, c22, b2, c12 + + MUL c21, b3, c21 + MUL c22, b3, c22 +#endif + +#ifdef RT + LD b1, 3 * SIZE(BO) + LD b2, 2 * SIZE(BO) + LD b3, 0 * SIZE(BO) + + MUL c21, b1, c21 + MUL c22, b1, c22 + + NMSUB c11, c11, b2, c21 + NMSUB c12, c12, b2, c22 + + MUL c11, b3, c11 + MUL c12, b3, c12 +#endif + +#ifdef LN + daddiu CO1, CO1, -2 * SIZE + daddiu CO2, CO2, -2 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c12, 2 * SIZE(BO) + ST c22, 3 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) + ST c21, 2 * SIZE(AO) + ST c22, 3 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c22, 1 * SIZE(CO2) + +#ifndef LN + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 2 +#endif + +#ifdef LN + daddiu KK, KK, -2 +#endif + + MTC $0, a1 + + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + + daddiu I, I, -1 + + bgtz I, .L51 + MOV c41, c11 + .align 3 + +.L60: + andi I, M, 1 + blez I, .L69 + NOP + +#if defined(LT) || defined(RN) + dsra L, KK, 2 + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c31, c11 + LD a4, 3 * SIZE(AO) + MOV c41, c11 + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L65 + move BO, B +#else +#ifdef LN + dsll TEMP, K, BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + dsra L, TEMP, 2 + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c31, c11 + LD a4, 3 * SIZE(AO) + MOV c41, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + blez L, .L65 + NOP +#endif + .align 3 + +.L62: + MADD c11, c11, a1, b1 + LD b1, 4 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 7 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + LD a2, 5 * SIZE(AO) + + MADD c11, c11, a3, b1 + LD b1, 8 * SIZE(BO) + MADD c21, c21, a3, b2 + LD b2, 9 * SIZE(BO) + MADD c31, c31, a4, b3 + LD b3, 10 * SIZE(BO) + MADD c41, c41, a4, b4 + LD b4, 11 * SIZE(BO) + + LD a3, 6 * SIZE(AO) + LD a4, 7 * SIZE(AO) + + daddiu L, L, -1 + daddiu AO, AO, 4 * SIZE + + bgtz L, .L62 + daddiu BO, BO, 8 * SIZE + .align 3 + +.L65: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L68 + NOP + .align 3 + +.L66: + MADD c11, c11, a1, b1 + LD b1, 2 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 3 * SIZE(BO) + + LD a1, 1 * SIZE(AO) + daddiu L, L, -1 + + daddiu AO, AO, 1 * SIZE + bgtz L, .L66 + daddiu BO, BO, 2 * SIZE + + +.L68: + ADD c11, c11, c31 + ADD c21, c21, c41 + +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -2 +#endif + + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + + SUB c11, b1, c11 + SUB c21, b2, c21 +#endif + +#if defined(LN) || defined(LT) + LD b3, 0 * SIZE(AO) + + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 3 * SIZE(BO) + + MUL c11, b1, c11 + + NMSUB c21, c21, b2, c11 + + MUL c21, b3, c21 +#endif + +#ifdef RT + LD b1, 3 * SIZE(BO) + LD b2, 2 * SIZE(BO) + LD b3, 0 * SIZE(BO) + + MUL c21, b1, c21 + + NMSUB c11, c11, b2, c21 + + MUL c11, b3, c11 +#endif + +#ifdef LN + daddiu CO1, CO1, -1 * SIZE + daddiu CO2, CO2, -1 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c21, 1 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + +#ifndef LN + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, 0 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + .align 3 + +.L69: +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 2 +#endif + +#ifdef RT + daddiu KK, KK, -2 +#endif + .align 3 + +.L70: + andi J, N, 1 + blez J, .L999 + NOP + +#ifdef RT + dsll TEMP, K, BASE_SHIFT + dsubu B, B, TEMP + + dsubu C, C, LDC +#endif + + move AO, A + move CO1, C + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO1, LDC +#endif + + dsra I, M, 1 + blez I, .L80 + NOP + +.L71: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + LD b5, 4 * SIZE(B) + dsra L, KK, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L75 + move BO, B +#else +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + LD b3, 2 * SIZE(BO) + LD b5, 4 * SIZE(BO) + dsra L, TEMP, 2 + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + blez L, .L75 + NOP +#endif + .align 3 + +.L72: + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 2 * SIZE(AO) + LD a2, 3 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 4 * SIZE(AO) + LD a2, 5 * SIZE(AO) + LD b1, 2 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD b1, 3 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + daddiu L, L, -1 + daddiu AO, AO, 8 * SIZE + bgtz L, .L72 + daddiu BO, BO, 4 * SIZE + .align 3 + +.L75: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L78 + NOP + .align 3 + +.L76: + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + daddiu L, L, -1 + daddiu AO, AO, 2 * SIZE + bgtz L, .L76 + daddiu BO, BO, 1 * SIZE + +.L78: + ADD c11, c11, c21 + ADD c12, c12, c22 + +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -2 +#else + daddiu TEMP, KK, -1 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif + +#ifdef LN + LD b1, 3 * SIZE(AO) + LD b2, 2 * SIZE(AO) + LD b3, 0 * SIZE(AO) + + MUL c12, b1, c12 + NMSUB c11, c11, b2, c12 + MUL c11, b3, c11 +#endif + +#ifdef LT + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 3 * SIZE(AO) + + MUL c11, b1, c11 + NMSUB c12, c12, b2, c11 + MUL c12, b3, c12 +#endif + +#if defined(RN) || defined(RT) + LD b1, 0 * SIZE(BO) + + MUL c11, b1, c11 + MUL c12, b1, c12 +#endif + +#ifdef LN + daddiu CO1, CO1, -2 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c12, 1 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + +#ifndef LN + daddiu CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 2 +#endif + +#ifdef LN + daddiu KK, KK, -2 +#endif + + daddiu I, I, -1 + + bgtz I, .L71 + NOP + .align 3 + +.L80: + andi I, M, 1 + blez I, .L89 + NOP + +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, KK, 2 + blez L, .L85 + move BO, B +#else +#ifdef LN + dsll TEMP, K, BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll TEMP, KK, BASE_SHIFT + + daddu AO, AORIG, TEMP + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + dsra L, TEMP, 2 + blez L, .L85 + NOP +#endif + .align 3 + +.L82: + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + + LD a1, 1 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + MADD c21, c21, a1, b1 + + LD a1, 2 * SIZE(AO) + LD b1, 2 * SIZE(BO) + + MADD c11, c11, a1, b1 + + LD a1, 3 * SIZE(AO) + LD b1, 3 * SIZE(BO) + + MADD c21, c21, a1, b1 + + daddiu L, L, -1 + daddiu AO, AO, 4 * SIZE + bgtz L, .L82 + daddiu BO, BO, 4 * SIZE + .align 3 + +.L85: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L88 + NOP + .align 3 + +.L86: + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + + daddiu L, L, -1 + daddiu AO, AO, 1 * SIZE + bgtz L, .L86 + daddiu BO, BO, 1 * SIZE + + +.L88: + ADD c11, c11, c21 + +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -1 +#endif + + dsll TEMP, TEMP, 0 + BASE_SHIFT + daddu AO, AORIG, TEMP + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + + SUB c11, b1, c11 +#else + LD b1, 0 * SIZE(AO) + + SUB c11, b1, c11 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + + MUL c11, b1, c11 +#endif + +#if defined(RN) || defined(RT) + LD b1, 0 * SIZE(BO) + + MUL c11, b1, c11 +#endif + +#ifdef LN + daddiu CO1, CO1, -1 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + +#ifndef LN + daddiu CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll TEMP, TEMP, 0 + BASE_SHIFT + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + .align 3 + +.L89: +#ifdef LN + dsll TEMP, K, BASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 1 +#endif + +#ifdef RT + daddiu KK, KK, -1 +#endif + .align 3 + + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + LDARG $18, 16($sp) + LDARG $19, 24($sp) + LDARG $20, 32($sp) + LDARG $21, 40($sp) + ldc1 $f24, 48($sp) + ldc1 $f25, 56($sp) + ldc1 $f26, 64($sp) + ldc1 $f27, 72($sp) + ldc1 $f28, 80($sp) + + LDARG $22, 88($sp) + LDARG $23, 96($sp) + LDARG $24, 104($sp) + LDARG $25, 112($sp) + +#ifndef __64BIT__ + ldc1 $f20,112($sp) + ldc1 $f21,120($sp) + ldc1 $f22,128($sp) + ldc1 $f23,136($sp) +#endif + + j $31 + daddiu $sp, $sp, 144 + + EPILOGUE diff --git a/kernel/mips64/trsm_kernel_RT.S b/kernel/mips64/trsm_kernel_RT.S new file mode 100644 index 0000000..81bbfec --- /dev/null +++ b/kernel/mips64/trsm_kernel_RT.S @@ -0,0 +1,3529 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 + +#define AO $12 +#define BO $13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 +#define CO5 $18 +#define CO6 $19 +#define CO7 $20 +#define CO8 $21 + +#define OFFSET $22 +#define KK $23 +#define TEMP $24 +#define AORIG $25 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f27 +#define a4 $f28 + +#define b1 $f2 +#define b2 $f3 +#define b3 $f4 +#define b4 $f5 +#define b5 $f6 +#define b6 $f7 +#define b7 $f8 +#define b8 $f9 + +#define a5 b8 + +#define c11 $f10 +#define c12 $f11 +#define c21 $f12 +#define c22 $f13 +#define c31 $f14 +#define c32 $f16 +#define c41 $f17 +#define c42 $f18 +#define c51 $f19 +#define c52 $f20 +#define c61 $f21 +#define c62 $f22 +#define c71 $f23 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 + +#define ALPHA $f15 + + PROLOGUE + + daddiu $sp, $sp, -144 + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + SDARG $18, 16($sp) + SDARG $19, 24($sp) + SDARG $20, 32($sp) + SDARG $21, 40($sp) + sdc1 $f24, 48($sp) + sdc1 $f25, 56($sp) + sdc1 $f26, 64($sp) + sdc1 $f27, 72($sp) + sdc1 $f28, 80($sp) + + SDARG $22, 88($sp) + SDARG $23, 96($sp) + SDARG $24, 104($sp) + SDARG $25, 112($sp) + +#ifndef __64BIT__ + sdc1 $f20,112($sp) + sdc1 $f21,120($sp) + sdc1 $f22,128($sp) + sdc1 $f23,136($sp) +#endif + + LDARG OFFSET, 144($sp) + + dsll LDC, LDC, BASE_SHIFT + +#ifdef LN + mult M, K + mflo TEMP + + dsll TEMP, TEMP, BASE_SHIFT + daddu A, A, TEMP + + dsll TEMP, M, BASE_SHIFT + daddu C, C, TEMP +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mult N, K + mflo TEMP + + dsll TEMP, TEMP, BASE_SHIFT + daddu B, B, TEMP + + mult N, LDC + mflo TEMP + daddu C, C, TEMP + + dsubu KK, N, OFFSET +#endif + + andi J, N, 1 + blez J, .L30 + NOP + +#ifdef RT + dsll TEMP, K, BASE_SHIFT + dsubu B, B, TEMP + + dsubu C, C, LDC +#endif + + move AO, A + move CO1, C + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO1, LDC +#endif + + dsra I, M, 1 + blez I, .L80 + NOP + +.L71: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + LD b5, 4 * SIZE(B) + dsra L, KK, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L75 + move BO, B +#else +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + LD b3, 2 * SIZE(BO) + LD b5, 4 * SIZE(BO) + dsra L, TEMP, 2 + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + blez L, .L75 + NOP +#endif + .align 3 + +.L72: + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 2 * SIZE(AO) + LD a2, 3 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 4 * SIZE(AO) + LD a2, 5 * SIZE(AO) + LD b1, 2 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD b1, 3 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + daddiu L, L, -1 + daddiu AO, AO, 8 * SIZE + bgtz L, .L72 + daddiu BO, BO, 4 * SIZE + .align 3 + +.L75: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L78 + NOP + .align 3 + +.L76: + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + daddiu L, L, -1 + daddiu AO, AO, 2 * SIZE + bgtz L, .L76 + daddiu BO, BO, 1 * SIZE + +.L78: + ADD c11, c11, c21 + ADD c12, c12, c22 + +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -2 +#else + daddiu TEMP, KK, -1 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif + +#ifdef LN + LD b1, 3 * SIZE(AO) + LD b2, 2 * SIZE(AO) + LD b3, 0 * SIZE(AO) + + MUL c12, b1, c12 + NMSUB c11, c11, b2, c12 + MUL c11, b3, c11 +#endif + +#ifdef LT + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 3 * SIZE(AO) + + MUL c11, b1, c11 + NMSUB c12, c12, b2, c11 + MUL c12, b3, c12 +#endif + +#if defined(RN) || defined(RT) + LD b1, 0 * SIZE(BO) + + MUL c11, b1, c11 + MUL c12, b1, c12 +#endif + +#ifdef LN + daddiu CO1, CO1, -2 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c12, 1 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + +#ifndef LN + daddiu CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 2 +#endif + +#ifdef LN + daddiu KK, KK, -2 +#endif + + daddiu I, I, -1 + + bgtz I, .L71 + NOP + .align 3 + +.L80: + andi I, M, 1 + blez I, .L89 + NOP + +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + MOV c21, c11 + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, KK, 2 + blez L, .L85 + move BO, B +#else +#ifdef LN + dsll TEMP, K, BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll TEMP, KK, BASE_SHIFT + + daddu AO, AORIG, TEMP + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MOV c21, c11 + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + dsra L, TEMP, 2 + blez L, .L85 + NOP +#endif + .align 3 + +.L82: + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + + LD a1, 1 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + MADD c21, c21, a1, b1 + + LD a1, 2 * SIZE(AO) + LD b1, 2 * SIZE(BO) + + MADD c11, c11, a1, b1 + + LD a1, 3 * SIZE(AO) + LD b1, 3 * SIZE(BO) + + MADD c21, c21, a1, b1 + + daddiu L, L, -1 + daddiu AO, AO, 4 * SIZE + bgtz L, .L82 + daddiu BO, BO, 4 * SIZE + .align 3 + +.L85: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L88 + NOP + .align 3 + +.L86: + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + + daddiu L, L, -1 + daddiu AO, AO, 1 * SIZE + bgtz L, .L86 + daddiu BO, BO, 1 * SIZE + + +.L88: + ADD c11, c11, c21 + +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -1 +#endif + + dsll TEMP, TEMP, 0 + BASE_SHIFT + daddu AO, AORIG, TEMP + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + + SUB c11, b1, c11 +#else + LD b1, 0 * SIZE(AO) + + SUB c11, b1, c11 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + + MUL c11, b1, c11 +#endif + +#if defined(RN) || defined(RT) + LD b1, 0 * SIZE(BO) + + MUL c11, b1, c11 +#endif + +#ifdef LN + daddiu CO1, CO1, -1 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + +#ifndef LN + daddiu CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll TEMP, TEMP, 0 + BASE_SHIFT + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + .align 3 + +.L89: +#ifdef LN + dsll TEMP, K, BASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 1 +#endif + +#ifdef RT + daddiu KK, KK, -1 +#endif + .align 3 + +.L30: + andi J, N, 2 + blez J, .L50 + NOP + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + dsubu B, B, TEMP + + dsll TEMP, LDC, 1 + dsubu C, C, TEMP +#endif + + move AO, A + move CO1, C + daddu CO2, C, LDC + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO2, LDC +#endif + + dsra I, M, 1 + blez I, .L60 + NOP + +.L51: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + LD b5, 4 * SIZE(B) + dsra L, KK, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L55 + move BO, B + +#else +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + LD b3, 2 * SIZE(BO) + LD b5, 4 * SIZE(BO) + dsra L, TEMP, 2 + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + blez L, .L55 + NOP +#endif + .align 3 + +.L52: + MADD c11, c11, a1, b1 + LD a3, 2 * SIZE(AO) + MADD c21, c21, a1, b2 + LD b4, 3 * SIZE(BO) + MADD c12, c12, a2, b1 + LD a4, 3 * SIZE(AO) + MADD c22, c22, a2, b2 + LD b1, 8 * SIZE(BO) + + MADD c11, c11, a3, b3 + LD a1, 8 * SIZE(AO) + MADD c21, c21, a3, b4 + LD b2, 5 * SIZE(BO) + MADD c12, c12, a4, b3 + LD a2, 5 * SIZE(AO) + MADD c22, c22, a4, b4 + LD b3, 6 * SIZE(BO) + + MADD c11, c11, a5, b5 + LD a3, 6 * SIZE(AO) + MADD c21, c21, a5, b2 + LD b4, 7 * SIZE(BO) + MADD c12, c12, a2, b5 + LD a4, 7 * SIZE(AO) + MADD c22, c22, a2, b2 + LD b5, 12 * SIZE(BO) + + MADD c11, c11, a3, b3 + LD a5, 12 * SIZE(AO) + MADD c21, c21, a3, b4 + LD b2, 9 * SIZE(BO) + MADD c12, c12, a4, b3 + LD a2, 9 * SIZE(AO) + MADD c22, c22, a4, b4 + LD b3, 10 * SIZE(BO) + + daddiu AO, AO, 8 * SIZE + daddiu L, L, -1 + bgtz L, .L52 + daddiu BO, BO, 8 * SIZE + .align 3 + +.L55: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L58 + NOP + .align 3 + +.L56: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + LD a1, 2 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 2 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 3 * SIZE(BO) + + daddiu L, L, -1 + daddiu AO, AO, 2 * SIZE + bgtz L, .L56 + daddiu BO, BO, 2 * SIZE + +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -2 +#else + daddiu TEMP, KK, -2 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c12, b3, c12 + SUB c22, b4, c22 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 +#endif + +#ifdef LN + LD b1, 3 * SIZE(AO) + LD b2, 2 * SIZE(AO) + LD b3, 0 * SIZE(AO) + + MUL c12, b1, c12 + MUL c22, b1, c22 + + NMSUB c11, c11, b2, c12 + NMSUB c21, c21, b2, c22 + + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif + +#ifdef LT + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 3 * SIZE(AO) + + MUL c11, b1, c11 + MUL c21, b1, c21 + + NMSUB c12, c12, b2, c11 + NMSUB c22, c22, b2, c21 + + MUL c12, b3, c12 + MUL c22, b3, c22 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 3 * SIZE(BO) + + MUL c11, b1, c11 + MUL c12, b1, c12 + + NMSUB c21, c21, b2, c11 + NMSUB c22, c22, b2, c12 + + MUL c21, b3, c21 + MUL c22, b3, c22 +#endif + +#ifdef RT + LD b1, 3 * SIZE(BO) + LD b2, 2 * SIZE(BO) + LD b3, 0 * SIZE(BO) + + MUL c21, b1, c21 + MUL c22, b1, c22 + + NMSUB c11, c11, b2, c21 + NMSUB c12, c12, b2, c22 + + MUL c11, b3, c11 + MUL c12, b3, c12 +#endif + +#ifdef LN + daddiu CO1, CO1, -2 * SIZE + daddiu CO2, CO2, -2 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c12, 2 * SIZE(BO) + ST c22, 3 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) + ST c21, 2 * SIZE(AO) + ST c22, 3 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c22, 1 * SIZE(CO2) + +#ifndef LN + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 2 +#endif + +#ifdef LN + daddiu KK, KK, -2 +#endif + + MTC $0, a1 + + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + + daddiu I, I, -1 + + bgtz I, .L51 + MOV c41, c11 + .align 3 + +.L60: + andi I, M, 1 + blez I, .L69 + NOP + +#if defined(LT) || defined(RN) + dsra L, KK, 2 + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c31, c11 + LD a4, 3 * SIZE(AO) + MOV c41, c11 + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L65 + move BO, B +#else +#ifdef LN + dsll TEMP, K, BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + dsra L, TEMP, 2 + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c31, c11 + LD a4, 3 * SIZE(AO) + MOV c41, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + blez L, .L65 + NOP +#endif + .align 3 + +.L62: + MADD c11, c11, a1, b1 + LD b1, 4 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 7 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + LD a2, 5 * SIZE(AO) + + MADD c11, c11, a3, b1 + LD b1, 8 * SIZE(BO) + MADD c21, c21, a3, b2 + LD b2, 9 * SIZE(BO) + MADD c31, c31, a4, b3 + LD b3, 10 * SIZE(BO) + MADD c41, c41, a4, b4 + LD b4, 11 * SIZE(BO) + + LD a3, 6 * SIZE(AO) + LD a4, 7 * SIZE(AO) + + daddiu L, L, -1 + daddiu AO, AO, 4 * SIZE + + bgtz L, .L62 + daddiu BO, BO, 8 * SIZE + .align 3 + +.L65: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L68 + NOP + .align 3 + +.L66: + MADD c11, c11, a1, b1 + LD b1, 2 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 3 * SIZE(BO) + + LD a1, 1 * SIZE(AO) + daddiu L, L, -1 + + daddiu AO, AO, 1 * SIZE + bgtz L, .L66 + daddiu BO, BO, 2 * SIZE + + +.L68: + ADD c11, c11, c31 + ADD c21, c21, c41 + +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -2 +#endif + + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + + SUB c11, b1, c11 + SUB c21, b2, c21 +#endif + +#if defined(LN) || defined(LT) + LD b3, 0 * SIZE(AO) + + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 3 * SIZE(BO) + + MUL c11, b1, c11 + + NMSUB c21, c21, b2, c11 + + MUL c21, b3, c21 +#endif + +#ifdef RT + LD b1, 3 * SIZE(BO) + LD b2, 2 * SIZE(BO) + LD b3, 0 * SIZE(BO) + + MUL c21, b1, c21 + + NMSUB c11, c11, b2, c21 + + MUL c11, b3, c11 +#endif + +#ifdef LN + daddiu CO1, CO1, -1 * SIZE + daddiu CO2, CO2, -1 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c21, 1 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + +#ifndef LN + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, 0 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + .align 3 + +.L69: +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 2 +#endif + +#ifdef RT + daddiu KK, KK, -2 +#endif + .align 3 + +.L50: + andi J, N, 4 + blez J, .L70 + move AO, A + +#ifdef RT + dsll TEMP, K, 2 + BASE_SHIFT + dsubu B, B, TEMP + + dsll TEMP, LDC, 2 + dsubu C, C, TEMP +#endif + + move CO1, C + MTC $0, c11 + daddu CO2, C, LDC + daddu CO3, CO2, LDC + daddu CO4, CO3, LDC + MOV c21, c11 + dsra I, M, 1 + MOV c31, c11 + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO4, LDC +#endif + + blez I, .L40 + MOV c41, c11 + +.L31: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + LD a3, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + MOV c32, c11 + LD b4, 3 * SIZE(B) + MOV c42, c11 + + LD b5, 4 * SIZE(B) + dsra L, KK, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L35 + move BO, B +#else +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + LD a3, 4 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + LD b3, 2 * SIZE(BO) + MOV c32, c11 + LD b4, 3 * SIZE(BO) + MOV c42, c11 + + LD b5, 4 * SIZE(BO) + dsra L, TEMP, 2 + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + blez L, .L35 + NOP +#endif + .align 3 + +.L32: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + LD a1, 2 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c11, c11, a1, b5 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c12, c12, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a3, b6 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + NOP + MADD c41, c41, a3, b4 + LD a3, 6 * SIZE(AO) + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c11, c11, a3, b7 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a3, b2 + daddiu AO, AO, 8 * SIZE + MADD c31, c31, a3, b3 + daddiu BO, BO, 16 * SIZE + MADD c41, c41, a3, b4 + LD a3, 4 * SIZE(AO) + + MADD c12, c12, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c42, c42, a2, b4 + NOP + + bgtz L, .L32 + LD b4, 3 * SIZE(BO) + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L38 + NOP + .align 3 + +.L36: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + daddiu AO, AO, 2 * SIZE + MADD c41, c41, a1, b4 + LD a1, 0 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 4 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + bgtz L, .L36 + daddiu BO, BO, 4 * SIZE + +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -2 +#else + daddiu TEMP, KK, -4 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c12, b5, c12 + SUB c22, b6, c22 + SUB c32, b7, c32 + SUB c42, b8, c42 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + LD b5, 4 * SIZE(AO) + LD b6, 5 * SIZE(AO) + LD b7, 6 * SIZE(AO) + LD b8, 7 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 + SUB c31, b5, c31 + SUB c32, b6, c32 + SUB c41, b7, c41 + SUB c42, b8, c42 +#endif + +#ifdef LN + LD b1, 3 * SIZE(AO) + LD b2, 2 * SIZE(AO) + LD b3, 0 * SIZE(AO) + + MUL c12, b1, c12 + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + + NMSUB c11, c11, b2, c12 + NMSUB c21, c21, b2, c22 + NMSUB c31, c31, b2, c32 + NMSUB c41, c41, b2, c42 + + MUL c11, b3, c11 + MUL c21, b3, c21 + MUL c31, b3, c31 + MUL c41, b3, c41 +#endif + +#ifdef LT + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 3 * SIZE(AO) + + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + + NMSUB c12, c12, b2, c11 + NMSUB c22, c22, b2, c21 + NMSUB c32, c32, b2, c31 + NMSUB c42, c42, b2, c41 + + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MUL c11, b1, c11 + MUL c12, b1, c12 + + NMSUB c21, c21, b2, c11 + NMSUB c22, c22, b2, c12 + NMSUB c31, c31, b3, c11 + NMSUB c32, c32, b3, c12 + NMSUB c41, c41, b4, c11 + NMSUB c42, c42, b4, c12 + + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + MUL c21, b2, c21 + MUL c22, b2, c22 + + NMSUB c31, c31, b3, c21 + NMSUB c32, c32, b3, c22 + NMSUB c41, c41, b4, c21 + NMSUB c42, c42, b4, c22 + + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MUL c31, b3, c31 + MUL c32, b3, c32 + + NMSUB c41, c41, b4, c31 + NMSUB c42, c42, b4, c32 + + LD b4, 15 * SIZE(BO) + + MUL c41, b4, c41 + MUL c42, b4, c42 +#endif + +#ifdef RT + LD b5, 15 * SIZE(BO) + LD b6, 14 * SIZE(BO) + LD b7, 13 * SIZE(BO) + LD b8, 12 * SIZE(BO) + + MUL c41, b5, c41 + MUL c42, b5, c42 + + NMSUB c31, c31, b6, c41 + NMSUB c32, c32, b6, c42 + NMSUB c21, c21, b7, c41 + NMSUB c22, c22, b7, c42 + NMSUB c11, c11, b8, c41 + NMSUB c12, c12, b8, c42 + + LD b6, 10 * SIZE(BO) + LD b7, 9 * SIZE(BO) + LD b8, 8 * SIZE(BO) + + MUL c31, b6, c31 + MUL c32, b6, c32 + + NMSUB c21, c21, b7, c31 + NMSUB c22, c22, b7, c32 + NMSUB c11, c11, b8, c31 + NMSUB c12, c12, b8, c32 + + LD b7, 5 * SIZE(BO) + LD b8, 4 * SIZE(BO) + + MUL c21, b7, c21 + MUL c22, b7, c22 + + NMSUB c11, c11, b8, c21 + NMSUB c12, c12, b8, c22 + + LD b8, 0 * SIZE(BO) + + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif + +#ifdef LN + daddiu CO1, CO1, -2 * SIZE + daddiu CO2, CO2, -2 * SIZE + daddiu CO3, CO3, -2 * SIZE + daddiu CO4, CO4, -2 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c41, 3 * SIZE(BO) + ST c12, 4 * SIZE(BO) + ST c22, 5 * SIZE(BO) + ST c32, 6 * SIZE(BO) + ST c42, 7 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) + ST c21, 2 * SIZE(AO) + ST c22, 3 * SIZE(AO) + ST c31, 4 * SIZE(AO) + ST c32, 5 * SIZE(AO) + ST c41, 6 * SIZE(AO) + ST c42, 7 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c22, 1 * SIZE(CO2) + ST c31, 0 * SIZE(CO3) + ST c32, 1 * SIZE(CO3) + ST c41, 0 * SIZE(CO4) + ST c42, 1 * SIZE(CO4) + +#ifndef LN + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + daddiu CO3, CO3, 2 * SIZE + daddiu CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 2 +#endif + +#ifdef LN + daddiu KK, KK, -2 +#endif + + MTC $0, a1 + + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + + daddiu I, I, -1 + + bgtz I, .L31 + MOV c41, c11 + .align 3 + +.L40: + andi I, M, 1 + blez I, .L49 + MOV c61, c11 + +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD a2, 1 * SIZE(AO) + MOV c81, c11 + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, KK, 2 + + blez L, .L45 + move BO, B +#else +#ifdef LN + dsll TEMP, K, BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD a2, 1 * SIZE(AO) + MOV c81, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + dsra L, TEMP, 2 + + blez L, .L45 + NOP +#endif + .align 3 + +.L42: + MADD c11, c11, a1, b1 + LD b1, 16 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + daddiu L, L, -1 + + MADD c11, c11, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 11 * SIZE(BO) + + LD a2, 2 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD c11, c11, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 15 * SIZE(BO) + + LD a2, -1 * SIZE(AO) + daddiu BO, BO, 16 * SIZE + + MADD c11, c11, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 3 * SIZE(BO) + + bgtz L, .L42 + LD a2, 1 * SIZE(AO) + .align 3 + +.L45: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L48 + NOP + .align 3 + +.L46: + MADD c11, c11, a1, b1 + LD b1, 4 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD a1, 1 * SIZE(AO) + + LD b4, 7 * SIZE(BO) + daddiu L, L, -1 + + daddiu AO, AO, 1 * SIZE + MOV a2, a2 + bgtz L, .L46 + daddiu BO, BO, 4 * SIZE + + +.L48: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -4 +#endif + + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MUL c11, b1, c11 + + NMSUB c21, c21, b2, c11 + NMSUB c31, c31, b3, c11 + NMSUB c41, c41, b4, c11 + + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + MUL c21, b2, c21 + + NMSUB c31, c31, b3, c21 + NMSUB c41, c41, b4, c21 + + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MUL c31, b3, c31 + + NMSUB c41, c41, b4, c31 + + LD b4, 15 * SIZE(BO) + + MUL c41, b4, c41 +#endif + +#ifdef RT + LD b5, 15 * SIZE(BO) + LD b6, 14 * SIZE(BO) + LD b7, 13 * SIZE(BO) + LD b8, 12 * SIZE(BO) + + MUL c41, b5, c41 + + NMSUB c31, c31, b6, c41 + NMSUB c21, c21, b7, c41 + NMSUB c11, c11, b8, c41 + + LD b6, 10 * SIZE(BO) + LD b7, 9 * SIZE(BO) + LD b8, 8 * SIZE(BO) + + MUL c31, b6, c31 + + NMSUB c21, c21, b7, c31 + NMSUB c11, c11, b8, c31 + + LD b7, 5 * SIZE(BO) + LD b8, 4 * SIZE(BO) + + MUL c21, b7, c21 + + NMSUB c11, c11, b8, c21 + + LD b8, 0 * SIZE(BO) + + MUL c11, b8, c11 +#endif + +#ifdef LN + daddiu CO1, CO1, -1 * SIZE + daddiu CO2, CO2, -1 * SIZE + daddiu CO3, CO3, -1 * SIZE + daddiu CO4, CO4, -1 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c41, 3 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c21, 1 * SIZE(AO) + ST c31, 2 * SIZE(AO) + ST c41, 3 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c31, 0 * SIZE(CO3) + ST c41, 0 * SIZE(CO4) + +#ifndef LN + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE + daddiu CO3, CO3, 1 * SIZE + daddiu CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + .align 3 + +.L49: +#ifdef LN + dsll TEMP, K, 2 + BASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 4 +#endif + +#ifdef RT + daddiu KK, KK, -4 +#endif + .align 3 + +.L70: + dsra J, N, 3 + blez J, .L999 + nop + +.L10: +#ifdef RT + dsll TEMP, K, 3 + BASE_SHIFT + dsubu B, B, TEMP + + dsll TEMP, LDC, 3 + dsubu C, C, TEMP +#endif + + move CO1, C + MTC $0, c11 + daddu CO2, C, LDC + daddu CO3, CO2, LDC + daddiu J, J, -1 + daddu CO4, CO3, LDC + MOV c21, c11 + daddu CO5, CO4, LDC + MOV c31, c11 + daddu CO6, CO5, LDC + MOV c41, c11 + daddu CO7, CO6, LDC + MOV c51, c11 + daddu CO8, CO7, LDC + dsra I, M, 1 + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO8, LDC +#endif + + blez I, .L20 + MOV c61, c11 + +.L11: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(B) + MOV c81, c11 + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + + dsra L, KK, 2 + MOV c32, c11 + LD b3, 2 * SIZE(B) + MOV c42, c11 + + LD b4, 3 * SIZE(B) + MOV c52, c11 + LD b5, 4 * SIZE(B) + MOV c62, c11 + + LD b6, 8 * SIZE(B) + MOV c72, c11 + LD b7, 12 * SIZE(B) + MOV c82, c11 + + blez L, .L15 + move BO, B +#else + +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 3 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(BO) + MOV c81, c11 + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + + MOV c32, c11 + LD b3, 2 * SIZE(BO) + MOV c42, c11 + + LD b4, 3 * SIZE(BO) + MOV c52, c11 + LD b5, 4 * SIZE(BO) + MOV c62, c11 + + LD b6, 8 * SIZE(BO) + MOV c72, c11 + LD b7, 12 * SIZE(BO) + MOV c82, c11 + + dsra L, TEMP, 2 + blez L, .L15 + NOP +#endif + + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + blez L, .L13 + MADD c41, c41, a1, b4 + NOP + .align 3 + +.L12: + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + NOP + MADD c61, c61, a1, b2 + LD a4, 2 * SIZE(AO) + MADD c71, c71, a1, b3 + NOP + MADD c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a4, b7 + NOP + MADD c61, c61, a4, b2 + NOP + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + NOP + MADD c41, c41, a3, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + NOP + MADD c61, c61, a3, b2 + LD a4, 6 * SIZE(AO) + MADD c71, c71, a3, b3 + NOP + MADD c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + daddiu L, L, -1 + + MADD c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + bgtz L, .L12 + MADD c41, c41, a1, b4 + NOP + .align 3 + +.L13: + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + NOP + MADD c61, c61, a1, b2 + LD a4, 2 * SIZE(AO) + MADD c71, c71, a1, b3 + NOP + MADD c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a4, b7 + NOP + MADD c61, c61, a4, b2 + NOP + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + NOP + MADD c41, c41, a3, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + NOP + MADD c61, c61, a3, b2 + LD a4, 6 * SIZE(AO) + MADD c71, c71, a3, b3 + NOP + MADD c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + blez L, .L18 + NOP + .align 3 + +.L16: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 8 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + daddiu L, L, -1 + MADD c61, c61, a1, b2 + daddiu AO, AO, 2 * SIZE + MADD c71, c71, a1, b3 + daddiu BO, BO, 8 * SIZE + MADD c81, c81, a1, b4 + LD a1, 0 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 4 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + bgtz L, .L16 + LD b4, 3 * SIZE(BO) + +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -2 +#else + daddiu TEMP, KK, -8 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 3 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB c11, b1, c11 + LD b5, 4 * SIZE(BO) + SUB c21, b2, c21 + LD b6, 5 * SIZE(BO) + SUB c31, b3, c31 + LD b7, 6 * SIZE(BO) + SUB c41, b4, c41 + LD b8, 7 * SIZE(BO) + + SUB c51, b5, c51 + LD b1, 8 * SIZE(BO) + SUB c61, b6, c61 + LD b2, 9 * SIZE(BO) + SUB c71, b7, c71 + LD b3, 10 * SIZE(BO) + SUB c81, b8, c81 + LD b4, 11 * SIZE(BO) + + SUB c12, b1, c12 + LD b5, 12 * SIZE(BO) + SUB c22, b2, c22 + LD b6, 13 * SIZE(BO) + SUB c32, b3, c32 + LD b7, 14 * SIZE(BO) + SUB c42, b4, c42 + LD b8, 15 * SIZE(BO) + + SUB c52, b5, c52 +#ifdef LN + LD b1, 3 * SIZE(AO) +#else + LD b1, 0 * SIZE(AO) +#endif + SUB c62, b6, c62 + SUB c72, b7, c72 + SUB c82, b8, c82 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + + SUB c11, b1, c11 + LD b5, 4 * SIZE(AO) + SUB c12, b2, c12 + LD b6, 5 * SIZE(AO) + SUB c21, b3, c21 + LD b7, 6 * SIZE(AO) + SUB c22, b4, c22 + LD b8, 7 * SIZE(AO) + + SUB c31, b5, c31 + LD b1, 8 * SIZE(AO) + SUB c32, b6, c32 + LD b2, 9 * SIZE(AO) + SUB c41, b7, c41 + LD b3, 10 * SIZE(AO) + SUB c42, b8, c42 + LD b4, 11 * SIZE(AO) + + LD b5, 12 * SIZE(AO) + SUB c51, b1, c51 + LD b6, 13 * SIZE(AO) + SUB c52, b2, c52 + LD b7, 14 * SIZE(AO) + SUB c61, b3, c61 + LD b8, 15 * SIZE(AO) + SUB c62, b4, c62 + + SUB c71, b5, c71 + SUB c72, b6, c72 + SUB c81, b7, c81 + SUB c82, b8, c82 +#endif + +#ifdef LN + MUL c12, b1, c12 + LD b2, 2 * SIZE(AO) + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + MUL c52, b1, c52 + MUL c62, b1, c62 + MUL c72, b1, c72 + MUL c82, b1, c82 + + NMSUB c11, c11, b2, c12 + LD b3, 0 * SIZE(AO) + NMSUB c21, c21, b2, c22 + NMSUB c31, c31, b2, c32 + NMSUB c41, c41, b2, c42 + NMSUB c51, c51, b2, c52 + NMSUB c61, c61, b2, c62 + NMSUB c71, c71, b2, c72 + NMSUB c81, c81, b2, c82 + + MUL c11, b3, c11 + daddiu CO1, CO1, -2 * SIZE + MUL c21, b3, c21 + daddiu CO2, CO2, -2 * SIZE + MUL c31, b3, c31 + daddiu CO3, CO3, -2 * SIZE + MUL c41, b3, c41 + daddiu CO4, CO4, -2 * SIZE + MUL c51, b3, c51 + daddiu CO5, CO5, -2 * SIZE + MUL c61, b3, c61 + daddiu CO6, CO6, -2 * SIZE + MUL c71, b3, c71 + daddiu CO7, CO7, -2 * SIZE + MUL c81, b3, c81 + daddiu CO8, CO8, -2 * SIZE +#endif + +#ifdef LT + MUL c11, b1, c11 + LD b2, 1 * SIZE(AO) + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 + + NMSUB c12, c12, b2, c11 + LD b3, 3 * SIZE(AO) + NMSUB c22, c22, b2, c21 + NMSUB c32, c32, b2, c31 + NMSUB c42, c42, b2, c41 + NMSUB c52, c52, b2, c51 + NMSUB c62, c62, b2, c61 + NMSUB c72, c72, b2, c71 + NMSUB c82, c82, b2, c81 + + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 + MUL c52, b3, c52 + MUL c62, b3, c62 + MUL c72, b3, c72 + MUL c82, b3, c82 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MUL c11, b1, c11 + MUL c12, b1, c12 + LD b5, 4 * SIZE(BO) + + NMSUB c21, c21, b2, c11 + NMSUB c22, c22, b2, c12 + LD b6, 5 * SIZE(BO) + NMSUB c31, c31, b3, c11 + NMSUB c32, c32, b3, c12 + LD b7, 6 * SIZE(BO) + NMSUB c41, c41, b4, c11 + NMSUB c42, c42, b4, c12 + LD b8, 7 * SIZE(BO) + + NMSUB c51, c51, b5, c11 + NMSUB c52, c52, b5, c12 + LD b2, 9 * SIZE(BO) + NMSUB c61, c61, b6, c11 + NMSUB c62, c62, b6, c12 + LD b3, 10 * SIZE(BO) + NMSUB c71, c71, b7, c11 + NMSUB c72, c72, b7, c12 + LD b4, 11 * SIZE(BO) + NMSUB c81, c81, b8, c11 + NMSUB c82, c82, b8, c12 + LD b5, 12 * SIZE(BO) + + MUL c21, b2, c21 + MUL c22, b2, c22 + LD b6, 13 * SIZE(BO) + + NMSUB c31, c31, b3, c21 + NMSUB c32, c32, b3, c22 + LD b7, 14 * SIZE(BO) + NMSUB c41, c41, b4, c21 + NMSUB c42, c42, b4, c22 + LD b8, 15 * SIZE(BO) + NMSUB c51, c51, b5, c21 + NMSUB c52, c52, b5, c22 + LD b3, 18 * SIZE(BO) + NMSUB c61, c61, b6, c21 + NMSUB c62, c62, b6, c22 + LD b4, 19 * SIZE(BO) + NMSUB c71, c71, b7, c21 + NMSUB c72, c72, b7, c22 + LD b5, 20 * SIZE(BO) + NMSUB c81, c81, b8, c21 + NMSUB c82, c82, b8, c22 + LD b6, 21 * SIZE(BO) + + MUL c31, b3, c31 + MUL c32, b3, c32 + LD b7, 22 * SIZE(BO) + + NMSUB c41, c41, b4, c31 + NMSUB c42, c42, b4, c32 + LD b8, 23 * SIZE(BO) + NMSUB c51, c51, b5, c31 + NMSUB c52, c52, b5, c32 + LD b4, 27 * SIZE(BO) + NMSUB c61, c61, b6, c31 + NMSUB c62, c62, b6, c32 + LD b5, 28 * SIZE(BO) + NMSUB c71, c71, b7, c31 + NMSUB c72, c72, b7, c32 + LD b6, 29 * SIZE(BO) + NMSUB c81, c81, b8, c31 + NMSUB c82, c82, b8, c32 + LD b7, 30 * SIZE(BO) + + MUL c41, b4, c41 + MUL c42, b4, c42 + LD b8, 31 * SIZE(BO) + + NMSUB c51, c51, b5, c41 + NMSUB c52, c52, b5, c42 + LD b5, 36 * SIZE(BO) + NMSUB c61, c61, b6, c41 + NMSUB c62, c62, b6, c42 + LD b6, 37 * SIZE(BO) + NMSUB c71, c71, b7, c41 + NMSUB c72, c72, b7, c42 + LD b7, 38 * SIZE(BO) + NMSUB c81, c81, b8, c41 + NMSUB c82, c82, b8, c42 + LD b8, 39 * SIZE(BO) + + MUL c51, b5, c51 + MUL c52, b5, c52 + + NMSUB c61, c61, b6, c51 + NMSUB c62, c62, b6, c52 + LD b6, 45 * SIZE(BO) + NMSUB c71, c71, b7, c51 + NMSUB c72, c72, b7, c52 + LD b7, 46 * SIZE(BO) + NMSUB c81, c81, b8, c51 + NMSUB c82, c82, b8, c52 + LD b8, 47 * SIZE(BO) + + MUL c61, b6, c61 + MUL c62, b6, c62 + + NMSUB c71, c71, b7, c61 + NMSUB c72, c72, b7, c62 + LD b7, 54 * SIZE(BO) + NMSUB c81, c81, b8, c61 + NMSUB c82, c82, b8, c62 + LD b8, 55 * SIZE(BO) + + MUL c71, b7, c71 + MUL c72, b7, c72 + + NMSUB c81, c81, b8, c71 + NMSUB c82, c82, b8, c72 + LD b8, 63 * SIZE(BO) + + MUL c81, b8, c81 + MUL c82, b8, c82 +#endif + +#ifdef RT + LD b1, 63 * SIZE(BO) + LD b2, 62 * SIZE(BO) + LD b3, 61 * SIZE(BO) + LD b4, 60 * SIZE(BO) + + MUL c81, b1, c81 + MUL c82, b1, c82 + LD b5, 59 * SIZE(BO) + + NMSUB c71, c71, b2, c81 + NMSUB c72, c72, b2, c82 + LD b6, 58 * SIZE(BO) + NMSUB c61, c61, b3, c81 + NMSUB c62, c62, b3, c82 + LD b7, 57 * SIZE(BO) + NMSUB c51, c51, b4, c81 + NMSUB c52, c52, b4, c82 + LD b8, 56 * SIZE(BO) + + NMSUB c41, c41, b5, c81 + NMSUB c42, c42, b5, c82 + LD b2, 54 * SIZE(BO) + NMSUB c31, c31, b6, c81 + NMSUB c32, c32, b6, c82 + LD b3, 53 * SIZE(BO) + NMSUB c21, c21, b7, c81 + NMSUB c22, c22, b7, c82 + LD b4, 52 * SIZE(BO) + NMSUB c11, c11, b8, c81 + NMSUB c12, c12, b8, c82 + LD b5, 51 * SIZE(BO) + + MUL c71, b2, c71 + MUL c72, b2, c72 + LD b6, 50 * SIZE(BO) + + NMSUB c61, c61, b3, c71 + NMSUB c62, c62, b3, c72 + LD b7, 49 * SIZE(BO) + NMSUB c51, c51, b4, c71 + NMSUB c52, c52, b4, c72 + LD b8, 48 * SIZE(BO) + NMSUB c41, c41, b5, c71 + NMSUB c42, c42, b5, c72 + LD b3, 45 * SIZE(BO) + NMSUB c31, c31, b6, c71 + NMSUB c32, c32, b6, c72 + LD b4, 44 * SIZE(BO) + NMSUB c21, c21, b7, c71 + NMSUB c22, c22, b7, c72 + LD b5, 43 * SIZE(BO) + NMSUB c11, c11, b8, c71 + NMSUB c12, c12, b8, c72 + LD b6, 42 * SIZE(BO) + + MUL c61, b3, c61 + MUL c62, b3, c62 + LD b7, 41 * SIZE(BO) + + NMSUB c51, c51, b4, c61 + NMSUB c52, c52, b4, c62 + LD b8, 40 * SIZE(BO) + NMSUB c41, c41, b5, c61 + NMSUB c42, c42, b5, c62 + LD b4, 36 * SIZE(BO) + NMSUB c31, c31, b6, c61 + NMSUB c32, c32, b6, c62 + LD b5, 35 * SIZE(BO) + NMSUB c21, c21, b7, c61 + NMSUB c22, c22, b7, c62 + LD b6, 34 * SIZE(BO) + NMSUB c11, c11, b8, c61 + NMSUB c12, c12, b8, c62 + LD b7, 33 * SIZE(BO) + + MUL c51, b4, c51 + MUL c52, b4, c52 + LD b8, 32 * SIZE(BO) + + NMSUB c41, c41, b5, c51 + NMSUB c42, c42, b5, c52 + LD b5, 27 * SIZE(BO) + NMSUB c31, c31, b6, c51 + NMSUB c32, c32, b6, c52 + LD b6, 26 * SIZE(BO) + NMSUB c21, c21, b7, c51 + NMSUB c22, c22, b7, c52 + LD b7, 25 * SIZE(BO) + NMSUB c11, c11, b8, c51 + NMSUB c12, c12, b8, c52 + LD b8, 24 * SIZE(BO) + + MUL c41, b5, c41 + MUL c42, b5, c42 + + NMSUB c31, c31, b6, c41 + NMSUB c32, c32, b6, c42 + LD b6, 18 * SIZE(BO) + NMSUB c21, c21, b7, c41 + NMSUB c22, c22, b7, c42 + LD b7, 17 * SIZE(BO) + NMSUB c11, c11, b8, c41 + NMSUB c12, c12, b8, c42 + LD b8, 16 * SIZE(BO) + + MUL c31, b6, c31 + MUL c32, b6, c32 + + NMSUB c21, c21, b7, c31 + NMSUB c22, c22, b7, c32 + LD b7, 9 * SIZE(BO) + NMSUB c11, c11, b8, c31 + NMSUB c12, c12, b8, c32 + LD b8, 8 * SIZE(BO) + + MUL c21, b7, c21 + MUL c22, b7, c22 + + NMSUB c11, c11, b8, c21 + NMSUB c12, c12, b8, c22 + LD b8, 0 * SIZE(BO) + + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c41, 3 * SIZE(BO) + ST c51, 4 * SIZE(BO) + ST c61, 5 * SIZE(BO) + ST c71, 6 * SIZE(BO) + ST c81, 7 * SIZE(BO) + + ST c12, 8 * SIZE(BO) + ST c22, 9 * SIZE(BO) + ST c32, 10 * SIZE(BO) + ST c42, 11 * SIZE(BO) + ST c52, 12 * SIZE(BO) + ST c62, 13 * SIZE(BO) + ST c72, 14 * SIZE(BO) + ST c82, 15 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) + ST c21, 2 * SIZE(AO) + ST c22, 3 * SIZE(AO) + ST c31, 4 * SIZE(AO) + ST c32, 5 * SIZE(AO) + ST c41, 6 * SIZE(AO) + ST c42, 7 * SIZE(AO) + + ST c51, 8 * SIZE(AO) + ST c52, 9 * SIZE(AO) + ST c61, 10 * SIZE(AO) + ST c62, 11 * SIZE(AO) + ST c71, 12 * SIZE(AO) + ST c72, 13 * SIZE(AO) + ST c81, 14 * SIZE(AO) + ST c82, 15 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c22, 1 * SIZE(CO2) + ST c31, 0 * SIZE(CO3) + ST c32, 1 * SIZE(CO3) + ST c41, 0 * SIZE(CO4) + ST c42, 1 * SIZE(CO4) + ST c51, 0 * SIZE(CO5) + ST c52, 1 * SIZE(CO5) + ST c61, 0 * SIZE(CO6) + ST c62, 1 * SIZE(CO6) + ST c71, 0 * SIZE(CO7) + ST c72, 1 * SIZE(CO7) + ST c81, 0 * SIZE(CO8) + ST c82, 1 * SIZE(CO8) + + MTC $0, a1 + +#ifndef LN + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + daddiu CO3, CO3, 2 * SIZE + daddiu CO4, CO4, 2 * SIZE + daddiu CO5, CO5, 2 * SIZE + daddiu CO6, CO6, 2 * SIZE + daddiu CO7, CO7, 2 * SIZE + daddiu CO8, CO8, 2 * SIZE +#endif + + MOV c11, a1 + MOV c21, a1 + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + + MOV c31, a1 + MOV c41, a1 + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 3 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 2 +#endif + +#ifdef LN + daddiu KK, KK, -2 +#endif + + daddiu I, I, -1 + MOV c51, a1 + + bgtz I, .L11 + MOV c61, a1 + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 + blez I, .L29 + MOV c71, c11 + +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, KK, 2 + MOV c81, c11 + + blez L, .L25 + move BO, B +#else + +#ifdef LN + dsll TEMP, K, 0 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 3 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + dsra L, TEMP, 2 + MOV c81, c11 + + blez L, .L25 + NOP +#endif + .align 3 + +.L22: + MADD c11, c11, a1, b1 + LD b1, 16 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + LD b5, 20 * SIZE(BO) + MADD c61, c61, a1, b2 + LD b2, 9 * SIZE(BO) + MADD c71, c71, a1, b3 + LD b3, 10 * SIZE(BO) + MADD c81, c81, a1, b4 + LD b4, 11 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + daddiu L, L, -1 + + MADD c11, c11, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c61, c61, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c71, c71, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c81, c81, a2, b4 + LD b4, 19 * SIZE(BO) + + LD a2, 5 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD c11, c11, a3, b1 + LD b1, 32 * SIZE(BO) + MADD c21, c21, a3, b2 + LD b2, 21 * SIZE(BO) + MADD c31, c31, a3, b3 + LD b3, 22 * SIZE(BO) + MADD c41, c41, a3, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + LD b5, 36 * SIZE(BO) + MADD c61, c61, a3, b2 + LD b2, 25 * SIZE(BO) + MADD c71, c71, a3, b3 + LD b3, 26 * SIZE(BO) + MADD c81, c81, a3, b4 + LD b4, 27 * SIZE(BO) + + LD a3, 2 * SIZE(AO) + daddiu BO, BO, 32 * SIZE + + MADD c11, c11, a4, b6 + LD b6, 8 * SIZE(BO) + MADD c21, c21, a4, b2 + LD b2, -3 * SIZE(BO) + MADD c31, c31, a4, b3 + LD b3, -2 * SIZE(BO) + MADD c41, c41, a4, b4 + LD b4, -1 * SIZE(BO) + + MADD c51, c51, a4, b7 + LD b7, 12 * SIZE(BO) + MADD c61, c61, a4, b2 + LD b2, 1 * SIZE(BO) + MADD c71, c71, a4, b3 + LD b3, 2 * SIZE(BO) + MADD c81, c81, a4, b4 + LD b4, 3 * SIZE(BO) + bgtz L, .L22 + LD a4, 3 * SIZE(AO) + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L28 + NOP + .align 3 + +.L26: + MADD c11, c11, a1, b1 + LD b1, 8 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + daddiu L, L, -1 + MOV a2, a2 + daddiu AO, AO, 1 * SIZE + daddiu BO, BO, 8 * SIZE + + MADD c51, c51, a1, b5 + LD b5, 4 * SIZE(BO) + MADD c61, c61, a1, b2 + LD b2, 1 * SIZE(BO) + MADD c71, c71, a1, b3 + LD b3, 2 * SIZE(BO) + MADD c81, c81, a1, b4 + LD a1, 0 * SIZE(AO) + + bgtz L, .L26 + LD b4, 3 * SIZE(BO) + +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -8 +#endif + + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 3 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + LD b5, 4 * SIZE(AO) + LD b6, 5 * SIZE(AO) + LD b7, 6 * SIZE(AO) + LD b8, 7 * SIZE(AO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MUL c11, b1, c11 + + NMSUB c21, c21, b2, c11 + NMSUB c31, c31, b3, c11 + NMSUB c41, c41, b4, c11 + NMSUB c51, c51, b5, c11 + NMSUB c61, c61, b6, c11 + NMSUB c71, c71, b7, c11 + NMSUB c81, c81, b8, c11 + + LD b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MUL c21, b2, c21 + + NMSUB c31, c31, b3, c21 + NMSUB c41, c41, b4, c21 + NMSUB c51, c51, b5, c21 + NMSUB c61, c61, b6, c21 + NMSUB c71, c71, b7, c21 + NMSUB c81, c81, b8, c21 + + LD b3, 18 * SIZE(BO) + LD b4, 19 * SIZE(BO) + LD b5, 20 * SIZE(BO) + LD b6, 21 * SIZE(BO) + LD b7, 22 * SIZE(BO) + LD b8, 23 * SIZE(BO) + + MUL c31, b3, c31 + + NMSUB c41, c41, b4, c31 + NMSUB c51, c51, b5, c31 + NMSUB c61, c61, b6, c31 + NMSUB c71, c71, b7, c31 + NMSUB c81, c81, b8, c31 + + LD b4, 27 * SIZE(BO) + LD b5, 28 * SIZE(BO) + LD b6, 29 * SIZE(BO) + LD b7, 30 * SIZE(BO) + LD b8, 31 * SIZE(BO) + + MUL c41, b4, c41 + + NMSUB c51, c51, b5, c41 + NMSUB c61, c61, b6, c41 + NMSUB c71, c71, b7, c41 + NMSUB c81, c81, b8, c41 + + LD b5, 36 * SIZE(BO) + LD b6, 37 * SIZE(BO) + LD b7, 38 * SIZE(BO) + LD b8, 39 * SIZE(BO) + + MUL c51, b5, c51 + + NMSUB c61, c61, b6, c51 + NMSUB c71, c71, b7, c51 + NMSUB c81, c81, b8, c51 + + LD b6, 45 * SIZE(BO) + LD b7, 46 * SIZE(BO) + LD b8, 47 * SIZE(BO) + + MUL c61, b6, c61 + + NMSUB c71, c71, b7, c61 + NMSUB c81, c81, b8, c61 + + LD b7, 54 * SIZE(BO) + LD b8, 55 * SIZE(BO) + + MUL c71, b7, c71 + + NMSUB c81, c81, b8, c71 + + LD b8, 63 * SIZE(BO) + + MUL c81, b8, c81 +#endif + +#ifdef RT + LD b1, 63 * SIZE(BO) + LD b2, 62 * SIZE(BO) + LD b3, 61 * SIZE(BO) + LD b4, 60 * SIZE(BO) + LD b5, 59 * SIZE(BO) + LD b6, 58 * SIZE(BO) + LD b7, 57 * SIZE(BO) + LD b8, 56 * SIZE(BO) + + MUL c81, b1, c81 + + NMSUB c71, c71, b2, c81 + NMSUB c61, c61, b3, c81 + NMSUB c51, c51, b4, c81 + NMSUB c41, c41, b5, c81 + NMSUB c31, c31, b6, c81 + NMSUB c21, c21, b7, c81 + NMSUB c11, c11, b8, c81 + + LD b2, 54 * SIZE(BO) + LD b3, 53 * SIZE(BO) + LD b4, 52 * SIZE(BO) + LD b5, 51 * SIZE(BO) + LD b6, 50 * SIZE(BO) + LD b7, 49 * SIZE(BO) + LD b8, 48 * SIZE(BO) + + MUL c71, b2, c71 + + NMSUB c61, c61, b3, c71 + NMSUB c51, c51, b4, c71 + NMSUB c41, c41, b5, c71 + NMSUB c31, c31, b6, c71 + NMSUB c21, c21, b7, c71 + NMSUB c11, c11, b8, c71 + + LD b3, 45 * SIZE(BO) + LD b4, 44 * SIZE(BO) + LD b5, 43 * SIZE(BO) + LD b6, 42 * SIZE(BO) + LD b7, 41 * SIZE(BO) + LD b8, 40 * SIZE(BO) + + MUL c61, b3, c61 + + NMSUB c51, c51, b4, c61 + NMSUB c41, c41, b5, c61 + NMSUB c31, c31, b6, c61 + NMSUB c21, c21, b7, c61 + NMSUB c11, c11, b8, c61 + + LD b4, 36 * SIZE(BO) + LD b5, 35 * SIZE(BO) + LD b6, 34 * SIZE(BO) + LD b7, 33 * SIZE(BO) + LD b8, 32 * SIZE(BO) + + MUL c51, b4, c51 + + NMSUB c41, c41, b5, c51 + NMSUB c31, c31, b6, c51 + NMSUB c21, c21, b7, c51 + NMSUB c11, c11, b8, c51 + + LD b5, 27 * SIZE(BO) + LD b6, 26 * SIZE(BO) + LD b7, 25 * SIZE(BO) + LD b8, 24 * SIZE(BO) + + MUL c41, b5, c41 + + NMSUB c31, c31, b6, c41 + NMSUB c21, c21, b7, c41 + NMSUB c11, c11, b8, c41 + + LD b6, 18 * SIZE(BO) + LD b7, 17 * SIZE(BO) + LD b8, 16 * SIZE(BO) + + MUL c31, b6, c31 + + NMSUB c21, c21, b7, c31 + NMSUB c11, c11, b8, c31 + + LD b7, 9 * SIZE(BO) + LD b8, 8 * SIZE(BO) + + MUL c21, b7, c21 + + NMSUB c11, c11, b8, c21 + + LD b8, 0 * SIZE(BO) + + MUL c11, b8, c11 +#endif + +#ifdef LN + daddiu CO1, CO1, -1 * SIZE + daddiu CO2, CO2, -1 * SIZE + daddiu CO3, CO3, -1 * SIZE + daddiu CO4, CO4, -1 * SIZE + daddiu CO5, CO5, -1 * SIZE + daddiu CO6, CO6, -1 * SIZE + daddiu CO7, CO7, -1 * SIZE + daddiu CO8, CO8, -1 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c41, 3 * SIZE(BO) + ST c51, 4 * SIZE(BO) + ST c61, 5 * SIZE(BO) + ST c71, 6 * SIZE(BO) + ST c81, 7 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c21, 1 * SIZE(AO) + ST c31, 2 * SIZE(AO) + ST c41, 3 * SIZE(AO) + ST c51, 4 * SIZE(AO) + ST c61, 5 * SIZE(AO) + ST c71, 6 * SIZE(AO) + ST c81, 7 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c31, 0 * SIZE(CO3) + ST c41, 0 * SIZE(CO4) + ST c51, 0 * SIZE(CO5) + ST c61, 0 * SIZE(CO6) + ST c71, 0 * SIZE(CO7) + ST c81, 0 * SIZE(CO8) + +#ifndef LN + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE + daddiu CO3, CO3, 1 * SIZE + daddiu CO4, CO4, 1 * SIZE + daddiu CO5, CO5, 1 * SIZE + daddiu CO6, CO6, 1 * SIZE + daddiu CO7, CO7, 1 * SIZE + daddiu CO8, CO8, 1 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 3 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + .align 3 + +.L29: +#ifdef LN + dsll TEMP, K, 3 + BASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 8 +#endif + +#ifdef RT + daddiu KK, KK, -8 +#endif + + bgtz J, .L10 + NOP + .align 3 + + + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + LDARG $18, 16($sp) + LDARG $19, 24($sp) + LDARG $20, 32($sp) + LDARG $21, 40($sp) + ldc1 $f24, 48($sp) + ldc1 $f25, 56($sp) + ldc1 $f26, 64($sp) + ldc1 $f27, 72($sp) + ldc1 $f28, 80($sp) + + LDARG $22, 88($sp) + LDARG $23, 96($sp) + LDARG $24, 104($sp) + LDARG $25, 112($sp) + +#ifndef __64BIT__ + ldc1 $f20,112($sp) + ldc1 $f21,120($sp) + ldc1 $f22,128($sp) + ldc1 $f23,136($sp) +#endif + + j $31 + daddiu $sp, $sp, 144 + + EPILOGUE diff --git a/kernel/mips64/zamax.S b/kernel/mips64/zamax.S new file mode 100644 index 0000000..e993867 --- /dev/null +++ b/kernel/mips64/zamax.S @@ -0,0 +1,245 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $2 +#define TEMP $3 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define t1 $f12 +#define t2 $f13 +#define t3 $f14 +#define t4 $f15 +#define t5 $f16 +#define t6 $f17 +#define t7 $f18 +#define t8 $f19 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + blez N, .L999 + MTC $0, s1 + + blez INCX, .L999 + dsll INCX, INCX, ZBASE_SHIFT + + LD a1, 0 * SIZE(X) + daddiu N, N, -1 + + LD a2, 1 * SIZE(X) + daddu X, X, INCX + + FABS t1, a1 + FABS t2, a2 + + blez N, .L999 + ADD s1, t1, t2 + + NOP + ADD s2, t1, t2 + + dsra I, N, 2 + ADD s3, t1, t2 + + blez I, .L15 + ADD s4, t1, t2 + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + LD a4, 1 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + LD a6, 1 * SIZE(X) + daddu X, X, INCX + LD a7, 0 * SIZE(X) + LD a8, 1 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + daddu X, X, INCX + .align 3 + +.L12: + FABS t1, a1 + LD a1, 0 * SIZE(X) + FABS t2, a2 + LD a2, 1 * SIZE(X) + + FABS t3, a3 + daddu X, X, INCX + FABS t4, a4 + NOP + + FABS t5, a5 + LD a3, 0 * SIZE(X) + FABS t6, a6 + LD a4, 1 * SIZE(X) + + FABS t7, a7 + daddu X, X, INCX + FABS t8, a8 + NOP + + ADD t1, t1, t2 + LD a5, 0 * SIZE(X) + ADD t3, t3, t4 + LD a6, 1 * SIZE(X) + + ADD t5, t5, t6 + daddu X, X, INCX + ADD t7, t7, t8 + NOP + + CMPLT $fcc0, s1, t1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, s2, t3 + LD a8, 1 * SIZE(X) + + CMPLT $fcc2, s3, t5 + daddu X, X, INCX + CMPLT $fcc3, s4, t7 + NOP + + CMOVT s1, t1, $fcc0 + daddiu I, I, -1 + CMOVT s2, t3, $fcc1 + NOP + + CMOVT s3, t5, $fcc2 + bgtz I, .L12 + + CMOVT s4, t7, $fcc3 + NOP + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + + FABS t5, a5 + FABS t6, a6 + FABS t7, a7 + FABS t8, a8 + + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t3 + CMPLT $fcc2, s3, t5 + CMPLT $fcc3, s4, t7 + + CMOVT s1, t1, $fcc0 + CMOVT s2, t3, $fcc1 + CMOVT s3, t5, $fcc2 + CMOVT s4, t7, $fcc3 + .align 3 + +.L15: + andi I, N, 3 + + blez I, .L998 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + + daddiu I, I, -1 + + FABS t1, a1 + FABS t2, a2 + + + ADD t1, t1, t2 + + CMPLT $fcc0, s1, t1 + + CMOVT s1, t1, $fcc0 + + bgtz I, .L16 + daddu X, X, INCX + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + + CMOVT s1, s2, $fcc0 + CMOVT s3, s4, $fcc1 + + CMPLT $fcc0, s1, s3 + CMOVT s1, s3, $fcc0 + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/zamin.S b/kernel/mips64/zamin.S new file mode 100644 index 0000000..bd1d509 --- /dev/null +++ b/kernel/mips64/zamin.S @@ -0,0 +1,245 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $2 +#define TEMP $3 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define t1 $f12 +#define t2 $f13 +#define t3 $f14 +#define t4 $f15 +#define t5 $f16 +#define t6 $f17 +#define t7 $f18 +#define t8 $f19 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + blez N, .L999 + MTC $0, s1 + + blez INCX, .L999 + dsll INCX, INCX, ZBASE_SHIFT + + LD a1, 0 * SIZE(X) + daddiu N, N, -1 + + LD a2, 1 * SIZE(X) + daddu X, X, INCX + + FABS t1, a1 + FABS t2, a2 + + blez N, .L999 + ADD s1, t1, t2 + + NOP + ADD s2, t1, t2 + + dsra I, N, 2 + ADD s3, t1, t2 + + blez I, .L15 + ADD s4, t1, t2 + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + LD a4, 1 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + LD a6, 1 * SIZE(X) + daddu X, X, INCX + LD a7, 0 * SIZE(X) + LD a8, 1 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + daddu X, X, INCX + .align 3 + +.L12: + FABS t1, a1 + LD a1, 0 * SIZE(X) + FABS t2, a2 + LD a2, 1 * SIZE(X) + + FABS t3, a3 + daddu X, X, INCX + FABS t4, a4 + NOP + + FABS t5, a5 + LD a3, 0 * SIZE(X) + FABS t6, a6 + LD a4, 1 * SIZE(X) + + FABS t7, a7 + daddu X, X, INCX + FABS t8, a8 + NOP + + ADD t1, t1, t2 + LD a5, 0 * SIZE(X) + ADD t3, t3, t4 + LD a6, 1 * SIZE(X) + + ADD t5, t5, t6 + daddu X, X, INCX + ADD t7, t7, t8 + NOP + + CMPLT $fcc0, t1, s1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, t3, s2 + LD a8, 1 * SIZE(X) + + CMPLT $fcc2, t5, s3 + daddu X, X, INCX + CMPLT $fcc3, t7, s4 + NOP + + CMOVT s1, t1, $fcc0 + daddiu I, I, -1 + CMOVT s2, t3, $fcc1 + NOP + + CMOVT s3, t5, $fcc2 + bgtz I, .L12 + + CMOVT s4, t7, $fcc3 + NOP + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + + FABS t5, a5 + FABS t6, a6 + FABS t7, a7 + FABS t8, a8 + + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t3, s2 + CMPLT $fcc2, t5, s3 + CMPLT $fcc3, t7, s4 + + CMOVT s1, t1, $fcc0 + CMOVT s2, t3, $fcc1 + CMOVT s3, t5, $fcc2 + CMOVT s4, t7, $fcc3 + .align 3 + +.L15: + andi I, N, 3 + + blez I, .L998 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + + daddiu I, I, -1 + + FABS t1, a1 + FABS t2, a2 + + + ADD t1, t1, t2 + + CMPLT $fcc0, t1, s1 + + CMOVT s1, t1, $fcc0 + + bgtz I, .L16 + daddu X, X, INCX + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + + CMOVT s1, s2, $fcc0 + CMOVT s3, s4, $fcc1 + + CMPLT $fcc0, s3, s1 + CMOVT s1, s3, $fcc0 + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/zasum.S b/kernel/mips64/zasum.S new file mode 100644 index 0000000..d6dc205 --- /dev/null +++ b/kernel/mips64/zasum.S @@ -0,0 +1,204 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $2 +#define TEMP $3 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 + +#define t1 $f10 +#define t2 $f11 +#define t3 $f12 +#define t4 $f13 + +#define s1 $f0 +#define s2 $f1 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC $0, s1 + + MTC $0, s2 + dsll INCX, INCX, ZBASE_SHIFT + + blez N, .L999 + dsra I, N, 2 + + blez I, .L25 + NOP + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + + LD a3, 0 * SIZE(X) + LD a4, 1 * SIZE(X) + daddu X, X, INCX + + LD a5, 0 * SIZE(X) + LD a6, 1 * SIZE(X) + daddu X, X, INCX + + FABS t1, a1 + FABS t2, a2 + + LD a7, 0 * SIZE(X) + LD a8, 1 * SIZE(X) + + FABS t3, a3 + FABS t4, a4 + daddiu I, I, -1 + + blez I, .L24 + daddu X, X, INCX + .align 3 + +.L23: + ADD s1, s1, t1 + LD a1, 0 * SIZE(X) + + FABS t1, a5 + daddiu I, I, -1 + + ADD s2, s2, t2 + LD a2, 1 * SIZE(X) + + FABS t2, a6 + daddu X, X, INCX + + ADD s1, s1, t3 + LD a3, 0 * SIZE(X) + + FABS t3, a7 + NOP + + ADD s2, s2, t4 + LD a4, 1 * SIZE(X) + + FABS t4, a8 + daddu X, X, INCX + + ADD s1, s1, t1 + LD a5, 0 * SIZE(X) + + FABS t1, a1 + NOP + + ADD s2, s2, t2 + LD a6, 1 * SIZE(X) + + FABS t2, a2 + daddu X, X, INCX + + ADD s1, s1, t3 + LD a7, 0 * SIZE(X) + + FABS t3, a3 + LD a8, 1 * SIZE(X) + + ADD s2, s2, t4 + daddu X, X, INCX + + bgtz I, .L23 + FABS t4, a4 + .align 3 + +.L24: + ADD s1, s1, t1 + FABS t1, a5 + + ADD s2, s2, t2 + FABS t2, a6 + + ADD s1, s1, t3 + FABS t3, a7 + + ADD s2, s2, t4 + FABS t4, a8 + + ADD s1, s1, t1 + ADD s2, s2, t2 + ADD s1, s1, t3 + ADD s2, s2, t4 + .align 3 + +.L25: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + + FABS t1, a1 + daddiu I, I, -1 + FABS t2, a2 + daddu X, X, INCX + + ADD s1, s1, t1 + bgtz I, .L26 + ADD s2, s2, t2 + .align 3 + +.L999: + j $31 + ADD s1, s1, s2 + + EPILOGUE diff --git a/kernel/mips64/zaxpy.S b/kernel/mips64/zaxpy.S new file mode 100644 index 0000000..8a7b29a --- /dev/null +++ b/kernel/mips64/zaxpy.S @@ -0,0 +1,438 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 + +#define X $9 +#define INCX $10 +#define Y $11 +#define INCY $8 + +#define I $2 +#define TEMP $3 + +#define YY $5 + +#define ALPHA_R $f15 +#define ALPHA_I $f16 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + +#define b1 $f8 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f17 + +#define t1 $f18 +#define t2 $f19 +#define t3 $f20 +#define t4 $f21 + +#ifndef CONJ +#define MADD1 NMSUB +#define MADD2 MADD +#else +#define MADD1 MADD +#define MADD2 NMSUB +#endif + + PROLOGUE + + LDARG INCY, 0($sp) + li TEMP, 2 * SIZE + +#ifndef __64BIT__ + daddiu $sp, $sp, -16 + sdc1 $f20, 0($sp) + sdc1 $f21, 8($sp) +#endif + + blez N, .L999 + dsll INCX, INCX, ZBASE_SHIFT + + bne INCX, TEMP, .L20 + dsll INCY, INCY, ZBASE_SHIFT + + bne INCY, TEMP, .L20 + dsra I, N, 2 + + blez I, .L15 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD a2, 1 * SIZE(X) + LD b2, 1 * SIZE(Y) + LD a3, 2 * SIZE(X) + LD b3, 2 * SIZE(Y) + LD a4, 3 * SIZE(X) + LD b4, 3 * SIZE(Y) + LD a5, 4 * SIZE(X) + LD b5, 4 * SIZE(Y) + LD a6, 5 * SIZE(X) + LD b6, 5 * SIZE(Y) + LD a7, 6 * SIZE(X) + LD b7, 6 * SIZE(Y) + LD a8, 7 * SIZE(X) + LD b8, 7 * SIZE(Y) + + blez I, .L13 + NOP + .align 3 + +.L12: + MADD t1, b1, ALPHA_R, a1 + LD b1, 8 * SIZE(Y) + MADD t2, b2, ALPHA_I, a1 + LD a1, 8 * SIZE(X) + MADD t3, b3, ALPHA_R, a3 + LD b3, 10 * SIZE(Y) + MADD t4, b4, ALPHA_I, a3 + LD a3, 10 * SIZE(X) + + MADD1 t1, t1, ALPHA_I, a2 + LD b2, 9 * SIZE(Y) + MADD2 t2, t2, ALPHA_R, a2 + LD a2, 9 * SIZE(X) + MADD1 t3, t3, ALPHA_I, a4 + LD b4, 11 * SIZE(Y) + MADD2 t4, t4, ALPHA_R, a4 + LD a4, 11 * SIZE(X) + + ST t1, 0 * SIZE(Y) + ST t2, 1 * SIZE(Y) + ST t3, 2 * SIZE(Y) + ST t4, 3 * SIZE(Y) + + MADD t1, b5, ALPHA_R, a5 + LD b5, 12 * SIZE(Y) + MADD t2, b6, ALPHA_I, a5 + LD a5, 12 * SIZE(X) + MADD t3, b7, ALPHA_R, a7 + LD b7, 14 * SIZE(Y) + MADD t4, b8, ALPHA_I, a7 + LD a7, 14 * SIZE(X) + + MADD1 t1, t1, ALPHA_I, a6 + LD b6, 13 * SIZE(Y) + MADD2 t2, t2, ALPHA_R, a6 + LD a6, 13 * SIZE(X) + MADD1 t3, t3, ALPHA_I, a8 + LD b8, 15 * SIZE(Y) + MADD2 t4, t4, ALPHA_R, a8 + LD a8, 15 * SIZE(X) + + ST t1, 4 * SIZE(Y) + ST t2, 5 * SIZE(Y) + ST t3, 6 * SIZE(Y) + ST t4, 7 * SIZE(Y) + + daddiu I, I, -1 + daddiu Y, Y, 8 * SIZE + + bgtz I, .L12 + daddiu X, X, 8 * SIZE + .align 3 + +.L13: + MADD t1, b1, ALPHA_R, a1 + MADD t2, b2, ALPHA_I, a1 + MADD t3, b3, ALPHA_R, a3 + MADD t4, b4, ALPHA_I, a3 + + MADD1 t1, t1, ALPHA_I, a2 + MADD2 t2, t2, ALPHA_R, a2 + MADD1 t3, t3, ALPHA_I, a4 + MADD2 t4, t4, ALPHA_R, a4 + + ST t1, 0 * SIZE(Y) + MADD t1, b5, ALPHA_R, a5 + ST t2, 1 * SIZE(Y) + MADD t2, b6, ALPHA_I, a5 + ST t3, 2 * SIZE(Y) + MADD t3, b7, ALPHA_R, a7 + ST t4, 3 * SIZE(Y) + MADD t4, b8, ALPHA_I, a7 + + MADD1 t1, t1, ALPHA_I, a6 + MADD2 t2, t2, ALPHA_R, a6 + MADD1 t3, t3, ALPHA_I, a8 + MADD2 t4, t4, ALPHA_R, a8 + + ST t1, 4 * SIZE(Y) + ST t2, 5 * SIZE(Y) + ST t3, 6 * SIZE(Y) + ST t4, 7 * SIZE(Y) + + daddiu X, X, 8 * SIZE + daddiu Y, Y, 8 * SIZE + .align 3 + +.L15: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD b2, 1 * SIZE(Y) + + MADD t1, b1, ALPHA_R, a1 + daddiu X, X, 2 * SIZE + MADD t2, b2, ALPHA_I, a1 + + MADD1 t1, t1, ALPHA_I, a2 + daddiu I, I, -1 + MADD2 t2, t2, ALPHA_R, a2 + daddiu Y, Y, 2 * SIZE + + ST t1, -2 * SIZE(Y) + + bgtz I, .L16 + ST t2, -1 * SIZE(Y) + +#ifndef __64BIT__ + ldc1 $f20, 0($sp) + ldc1 $f21, 8($sp) + daddiu $sp, $sp, 16 +#endif + + j $31 + NOP + .align 3 + +.L20: + dsra I, N, 2 + move YY, Y + + blez I, .L25 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD a2, 1 * SIZE(X) + LD b2, 1 * SIZE(Y) + daddu X, X, INCX + daddu Y, Y, INCY + + LD a3, 0 * SIZE(X) + LD b3, 0 * SIZE(Y) + LD a4, 1 * SIZE(X) + LD b4, 1 * SIZE(Y) + daddu X, X, INCX + daddu Y, Y, INCY + + LD a5, 0 * SIZE(X) + LD b5, 0 * SIZE(Y) + LD a6, 1 * SIZE(X) + LD b6, 1 * SIZE(Y) + daddu X, X, INCX + daddu Y, Y, INCY + + LD a7, 0 * SIZE(X) + blez I, .L23 + LD b7, 0 * SIZE(Y) + .align 3 + +.L22: + MADD t1, b1, ALPHA_R, a1 + LD b8, 1 * SIZE(Y) + daddu Y, Y, INCY + MADD t2, b2, ALPHA_I, a1 + LD a8, 1 * SIZE(X) + daddu X, X, INCX + + MADD t3, b3, ALPHA_R, a3 + LD b1, 0 * SIZE(Y) + MADD t4, b4, ALPHA_I, a3 + LD a1, 0 * SIZE(X) + + MADD1 t1, t1, ALPHA_I, a2 + LD b2, 1 * SIZE(Y) + daddu Y, Y, INCY + MADD2 t2, t2, ALPHA_R, a2 + LD a2, 1 * SIZE(X) + daddu X, X, INCX + + MADD1 t3, t3, ALPHA_I, a4 + LD a3, 0 * SIZE(X) + MADD2 t4, t4, ALPHA_R, a4 + LD b3, 0 * SIZE(Y) + + ST t1, 0 * SIZE(YY) + ST t2, 1 * SIZE(YY) + daddu YY, YY, INCY + ST t3, 0 * SIZE(YY) + ST t4, 1 * SIZE(YY) + daddu YY, YY, INCY + + MADD t1, b5, ALPHA_R, a5 + LD a4, 1 * SIZE(X) + daddu X, X, INCX + MADD t2, b6, ALPHA_I, a5 + LD b4, 1 * SIZE(Y) + daddu Y, Y, INCY + + MADD t3, b7, ALPHA_R, a7 + LD b5, 0 * SIZE(Y) + MADD t4, b8, ALPHA_I, a7 + LD a5, 0 * SIZE(X) + + MADD1 t1, t1, ALPHA_I, a6 + LD b6, 1 * SIZE(Y) + daddu Y, Y, INCY + MADD2 t2, t2, ALPHA_R, a6 + LD a6, 1 * SIZE(X) + daddu X, X, INCX + + MADD1 t3, t3, ALPHA_I, a8 + LD b7, 0 * SIZE(Y) + MADD2 t4, t4, ALPHA_R, a8 + LD a7, 0 * SIZE(X) + + ST t1, 0 * SIZE(YY) + ST t2, 1 * SIZE(YY) + daddu YY, YY, INCY + ST t3, 0 * SIZE(YY) + ST t4, 1 * SIZE(YY) + daddu YY, YY, INCY + + + daddiu I, I, -1 + + bgtz I, .L22 + NOP + .align 3 + +.L23: + MADD t1, b1, ALPHA_R, a1 + LD a8, 1 * SIZE(X) + MADD t2, b2, ALPHA_I, a1 + LD b8, 1 * SIZE(Y) + MADD t3, b3, ALPHA_R, a3 + daddu X, X, INCX + MADD t4, b4, ALPHA_I, a3 + daddu Y, Y, INCY + + MADD1 t1, t1, ALPHA_I, a2 + MADD2 t2, t2, ALPHA_R, a2 + MADD1 t3, t3, ALPHA_I, a4 + MADD2 t4, t4, ALPHA_R, a4 + + ST t1, 0 * SIZE(YY) + MADD t1, b5, ALPHA_R, a5 + ST t2, 1 * SIZE(YY) + MADD t2, b6, ALPHA_I, a5 + daddu YY, YY, INCY + + ST t3, 0 * SIZE(YY) + MADD t3, b7, ALPHA_R, a7 + ST t4, 1 * SIZE(YY) + MADD t4, b8, ALPHA_I, a7 + daddu YY, YY, INCY + + MADD1 t1, t1, ALPHA_I, a6 + MADD2 t2, t2, ALPHA_R, a6 + MADD1 t3, t3, ALPHA_I, a8 + MADD2 t4, t4, ALPHA_R, a8 + + ST t1, 0 * SIZE(YY) + ST t2, 1 * SIZE(YY) + daddu YY, YY, INCY + ST t3, 0 * SIZE(YY) + ST t4, 1 * SIZE(YY) + daddu YY, YY, INCY + .align 3 + +.L25: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD b2, 1 * SIZE(Y) + + MADD t1, b1, ALPHA_R, a1 + MADD t2, b2, ALPHA_I, a1 + daddu X, X, INCX + + MADD1 t1, t1, ALPHA_I, a2 + MADD2 t2, t2, ALPHA_R, a2 + daddiu I, I, -1 + + ST t1, 0 * SIZE(Y) + ST t2, 1 * SIZE(Y) + + bgtz I, .L26 + daddu Y, Y, INCY + .align 3 + +.L999: +#ifndef __64BIT__ + ldc1 $f20, 0($sp) + ldc1 $f21, 8($sp) + daddiu $sp, $sp, 16 +#endif + + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/zcopy.S b/kernel/mips64/zcopy.S new file mode 100644 index 0000000..5a4ce9c --- /dev/null +++ b/kernel/mips64/zcopy.S @@ -0,0 +1,265 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 +#define Y $7 +#define INCY $8 + +#define I $2 +#define TEMP $3 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + li TEMP, 2 * SIZE + NOP + + blez N, .L999 + dsll INCX, INCX, ZBASE_SHIFT + + bne INCX, TEMP, .L20 + dsll INCY, INCY, ZBASE_SHIFT + + bne INCY, TEMP, .L20 + dsra I, N, 2 + + blez I, .L15 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD a3, 2 * SIZE(X) + LD a4, 3 * SIZE(X) + LD a5, 4 * SIZE(X) + LD a6, 5 * SIZE(X) + LD a7, 6 * SIZE(X) + LD a8, 7 * SIZE(X) + + blez I, .L13 + NOP + .align 3 + +.L12: + ST a1, 0 * SIZE(Y) + LD a1, 8 * SIZE(X) + + ST a2, 1 * SIZE(Y) + LD a2, 9 * SIZE(X) + + ST a3, 2 * SIZE(Y) + LD a3, 10 * SIZE(X) + + ST a4, 3 * SIZE(Y) + LD a4, 11 * SIZE(X) + + ST a5, 4 * SIZE(Y) + LD a5, 12 * SIZE(X) + + ST a6, 5 * SIZE(Y) + LD a6, 13 * SIZE(X) + + ST a7, 6 * SIZE(Y) + LD a7, 14 * SIZE(X) + + ST a8, 7 * SIZE(Y) + LD a8, 15 * SIZE(X) + + daddiu I, I, -1 + daddiu X, X, 8 * SIZE + + bgtz I, .L12 + daddiu Y, Y, 8 * SIZE + .align 3 + +.L13: + ST a1, 0 * SIZE(Y) + ST a2, 1 * SIZE(Y) + ST a3, 2 * SIZE(Y) + ST a4, 3 * SIZE(Y) + ST a5, 4 * SIZE(Y) + ST a6, 5 * SIZE(Y) + ST a7, 6 * SIZE(Y) + ST a8, 7 * SIZE(Y) + + daddiu X, X, 8 * SIZE + daddiu Y, Y, 8 * SIZE + .align 3 + +.L15: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + + daddiu X, X, 2 * SIZE + daddiu Y, Y, 2 * SIZE + + ST a1, -2 * SIZE(Y) + daddiu I, I, -1 + + bgtz I, .L16 + ST a2, -1 * SIZE(Y) + + j $31 + NOP + .align 3 + +.L20: + dsra I, N, 2 + + blez I, .L25 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + LD a4, 1 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + LD a6, 1 * SIZE(X) + daddu X, X, INCX + LD a7, 0 * SIZE(X) + LD a8, 1 * SIZE(X) + + blez I, .L23 + daddu X, X, INCX + .align 3 + +.L22: + ST a1, 0 * SIZE(Y) + LD a1, 0 * SIZE(X) + + ST a2, 1 * SIZE(Y) + daddu Y, Y, INCY + LD a2, 1 * SIZE(X) + daddu X, X, INCX + + ST a3, 0 * SIZE(Y) + LD a3, 0 * SIZE(X) + + ST a4, 1 * SIZE(Y) + daddu Y, Y, INCY + LD a4, 1 * SIZE(X) + daddu X, X, INCX + + ST a5, 0 * SIZE(Y) + LD a5, 0 * SIZE(X) + + ST a6, 1 * SIZE(Y) + daddu Y, Y, INCY + LD a6, 1 * SIZE(X) + daddu X, X, INCX + + ST a7, 0 * SIZE(Y) + LD a7, 0 * SIZE(X) + + ST a8, 1 * SIZE(Y) + daddu Y, Y, INCY + LD a8, 1 * SIZE(X) + + daddiu I, I, -1 + + bgtz I, .L22 + daddu X, X, INCX + .align 3 + +.L23: + ST a1, 0 * SIZE(Y) + ST a2, 1 * SIZE(Y) + daddu Y, Y, INCY + ST a3, 0 * SIZE(Y) + ST a4, 1 * SIZE(Y) + daddu Y, Y, INCY + ST a5, 0 * SIZE(Y) + ST a6, 1 * SIZE(Y) + daddu Y, Y, INCY + ST a7, 0 * SIZE(Y) + ST a8, 1 * SIZE(Y) + daddu Y, Y, INCY + .align 3 + +.L25: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + + daddiu I, I, -1 + ST a1, 0 * SIZE(Y) + ST a2, 1 * SIZE(Y) + + bgtz I, .L26 + daddu Y, Y, INCY + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/zdot.S b/kernel/mips64/zdot.S new file mode 100644 index 0000000..c50fe31 --- /dev/null +++ b/kernel/mips64/zdot.S @@ -0,0 +1,402 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 +#define Y $7 +#define INCY $8 + +#define I $2 +#define TEMP $3 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define b1 $f8 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + MTC $0, s1 + + MOV s2, s1 + MOV s3, s2 + MOV s4, s3 + + dsll INCX, INCX, ZBASE_SHIFT + li TEMP, 2 * SIZE + + blez N, .L999 + dsll INCY, INCY, ZBASE_SHIFT + + bne INCX, TEMP, .L20 + dsra I, N, 2 + + bne INCY, TEMP, .L20 + NOP + + blez I, .L15 + NOP + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD b1, 0 * SIZE(Y) + daddiu I, I, -1 + + blez I, .L14 + LD b2, 1 * SIZE(Y) + .align 3 + +.L13: + MADD s1, s1, a1, b1 + LD a3, 2 * SIZE(X) + MADD s2, s2, a2, b1 + LD a4, 3 * SIZE(X) + MADD s3, s3, a1, b2 + LD b3, 2 * SIZE(Y) + MADD s4, s4, a2, b2 + LD b4, 3 * SIZE(Y) + + MADD s1, s1, a3, b3 + LD a1, 4 * SIZE(X) + MADD s2, s2, a4, b3 + LD a2, 5 * SIZE(X) + MADD s3, s3, a3, b4 + LD b1, 4 * SIZE(Y) + MADD s4, s4, a4, b4 + LD b2, 5 * SIZE(Y) + + MADD s1, s1, a1, b1 + LD a3, 6 * SIZE(X) + MADD s2, s2, a2, b1 + LD a4, 7 * SIZE(X) + MADD s3, s3, a1, b2 + LD b3, 6 * SIZE(Y) + MADD s4, s4, a2, b2 + LD b4, 7 * SIZE(Y) + + MADD s1, s1, a3, b3 + LD a1, 8 * SIZE(X) + MADD s2, s2, a4, b3 + LD a2, 9 * SIZE(X) + MADD s3, s3, a3, b4 + LD b1, 8 * SIZE(Y) + MADD s4, s4, a4, b4 + LD b2, 9 * SIZE(Y) + + daddiu I, I, -1 + daddiu X, X, 8 * SIZE + + bgtz I, .L13 + daddiu Y, Y, 8 * SIZE + .align 3 + +.L14: + MADD s1, s1, a1, b1 + LD a3, 2 * SIZE(X) + MADD s2, s2, a2, b1 + LD a4, 3 * SIZE(X) + MADD s3, s3, a1, b2 + LD b3, 2 * SIZE(Y) + MADD s4, s4, a2, b2 + LD b4, 3 * SIZE(Y) + + MADD s1, s1, a3, b3 + LD a1, 4 * SIZE(X) + MADD s2, s2, a4, b3 + LD a2, 5 * SIZE(X) + MADD s3, s3, a3, b4 + LD b1, 4 * SIZE(Y) + MADD s4, s4, a4, b4 + LD b2, 5 * SIZE(Y) + + MADD s1, s1, a1, b1 + LD a3, 6 * SIZE(X) + MADD s2, s2, a2, b1 + LD a4, 7 * SIZE(X) + MADD s3, s3, a1, b2 + LD b3, 6 * SIZE(Y) + MADD s4, s4, a2, b2 + LD b4, 7 * SIZE(Y) + + MADD s1, s1, a3, b3 + daddiu X, X, 8 * SIZE + MADD s2, s2, a4, b3 + daddiu Y, Y, 8 * SIZE + MADD s3, s3, a3, b4 + MADD s4, s4, a4, b4 + .align 3 + +.L15: + andi I, N, 3 + + blez I, .L999 + NOP + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + + LD b1, 0 * SIZE(Y) + daddiu I, I, -1 + + blez I, .L17 + LD b2, 1 * SIZE(Y) + .align 3 + +.L16: + MADD s1, s1, a1, b1 + daddiu I, I, -1 + MADD s2, s2, a2, b1 + LD b1, 2 * SIZE(Y) + MADD s3, s3, a1, b2 + LD a1, 2 * SIZE(X) + MADD s4, s4, a2, b2 + LD a2, 3 * SIZE(X) + + LD b2, 3 * SIZE(Y) + daddiu X, X, 2 * SIZE + + bgtz I, .L16 + daddiu Y, Y, 2 * SIZE + .align 3 + +.L17: + MADD s1, s1, a1, b1 + MADD s2, s2, a2, b1 + NOP + MADD s3, s3, a1, b2 + j .L999 + MADD s4, s4, a2, b2 + .align 3 + +.L20: +#ifdef F_INTERFACE + bgez INCX, .L21 + daddiu TEMP, N, -1 + + mult TEMP, INCX + + mflo TEMP + dsub X, X, TEMP + .align 3 + +.L21: + bgez INCY, .L22 + daddiu TEMP, N, -1 + + mult TEMP, INCY + + mflo TEMP + dsub Y, Y, TEMP + .align 3 + +.L22: +#endif + blez I, .L25 + NOP + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD b2, 1 * SIZE(Y) + + dadd X, X, INCX + daddiu I, I, -1 + + blez I, .L24 + dadd Y, Y, INCY + .align 3 + +.L23: + MADD s1, s1, a1, b1 + LD a3, 0 * SIZE(X) + MADD s2, s2, a2, b1 + LD a4, 1 * SIZE(X) + MADD s3, s3, a1, b2 + LD b3, 0 * SIZE(Y) + MADD s4, s4, a2, b2 + LD b4, 1 * SIZE(Y) + + dadd X, X, INCX + dadd Y, Y, INCY + + MADD s1, s1, a3, b3 + LD a1, 0 * SIZE(X) + MADD s2, s2, a4, b3 + LD a2, 1 * SIZE(X) + MADD s3, s3, a3, b4 + LD b1, 0 * SIZE(Y) + MADD s4, s4, a4, b4 + LD b2, 1 * SIZE(Y) + + dadd X, X, INCX + dadd Y, Y, INCY + + MADD s1, s1, a1, b1 + LD a3, 0 * SIZE(X) + MADD s2, s2, a2, b1 + LD a4, 1 * SIZE(X) + MADD s3, s3, a1, b2 + LD b3, 0 * SIZE(Y) + MADD s4, s4, a2, b2 + LD b4, 1 * SIZE(Y) + + dadd X, X, INCX + dadd Y, Y, INCY + + MADD s1, s1, a3, b3 + LD a1, 0 * SIZE(X) + MADD s2, s2, a4, b3 + LD a2, 1 * SIZE(X) + MADD s3, s3, a3, b4 + LD b1, 0 * SIZE(Y) + MADD s4, s4, a4, b4 + LD b2, 1 * SIZE(Y) + + dadd X, X, INCX + daddiu I, I, -1 + + bgtz I, .L23 + dadd Y, Y, INCY + .align 3 + +.L24: + MADD s1, s1, a1, b1 + LD a3, 0 * SIZE(X) + MADD s2, s2, a2, b1 + LD a4, 1 * SIZE(X) + MADD s3, s3, a1, b2 + LD b3, 0 * SIZE(Y) + MADD s4, s4, a2, b2 + LD b4, 1 * SIZE(Y) + + dadd X, X, INCX + dadd Y, Y, INCY + + MADD s1, s1, a3, b3 + LD a1, 0 * SIZE(X) + MADD s2, s2, a4, b3 + LD a2, 1 * SIZE(X) + MADD s3, s3, a3, b4 + LD b1, 0 * SIZE(Y) + MADD s4, s4, a4, b4 + LD b2, 1 * SIZE(Y) + + dadd X, X, INCX + dadd Y, Y, INCY + + MADD s1, s1, a1, b1 + LD a3, 0 * SIZE(X) + MADD s2, s2, a2, b1 + LD a4, 1 * SIZE(X) + MADD s3, s3, a1, b2 + LD b3, 0 * SIZE(Y) + MADD s4, s4, a2, b2 + LD b4, 1 * SIZE(Y) + + MADD s1, s1, a3, b3 + dadd X, X, INCX + MADD s2, s2, a4, b3 + dadd Y, Y, INCY + MADD s3, s3, a3, b4 + MADD s4, s4, a4, b4 + .align 3 + +.L25: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD b2, 1 * SIZE(Y) + + MADD s1, s1, a1, b1 + MADD s2, s2, a2, b1 + MADD s3, s3, a1, b2 + MADD s4, s4, a2, b2 + + + dadd X, X, INCX + dadd Y, Y, INCY + + daddiu I, I, -1 + + bgtz I, .L26 + NOP + .align 3 + +.L999: + NOP +#ifndef CONJ + SUB s1, s1, s4 +#else + ADD s1, s1, s4 +#endif + + j $31 +#ifndef CONJ + ADD s3, s3, s2 +#else + SUB s3, s3, s2 +#endif + + EPILOGUE diff --git a/kernel/mips64/zgemm3m_kernel.S b/kernel/mips64/zgemm3m_kernel.S new file mode 100644 index 0000000..14bb746 --- /dev/null +++ b/kernel/mips64/zgemm3m_kernel.S @@ -0,0 +1,1666 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define N $5 +#define K $6 +#define A $9 +#define B $10 +#define C $11 +#define LDC $8 + +#define AO $12 +#define BO $13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 +#define CO5 $18 +#define CO6 $19 +#define CO7 $20 +#define CO8 $21 + +#if defined(TRMMKERNEL) +#define OFFSET $22 +#define KK $23 +#define TEMP $24 +#endif + +#define a1 $f0 +#define a2 $f1 +#define a3 $f28 +#define a4 $f29 + +#define b1 $f2 +#define b2 $f3 +#define b3 $f4 +#define b4 $f5 +#define b5 $f6 +#define b6 $f7 +#define b7 $f8 +#define b8 $f9 + +#define a5 b8 + +#define c11 $f10 +#define c12 $f11 +#define c21 $f12 +#define c22 $f13 +#define c31 $f14 +#define c32 $f17 +#define c41 $f18 +#define c42 $f19 +#define c51 $f20 +#define c52 $f21 +#define c61 $f22 +#define c62 $f23 +#define c71 $f24 +#define c72 $f25 +#define c81 $f26 +#define c82 $f27 + +#define ALPHA_R $f15 +#define ALPHA_I $f16 + + PROLOGUE + + daddiu $sp, $sp, -128 + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + SDARG $18, 16($sp) + SDARG $19, 24($sp) + SDARG $20, 32($sp) + SDARG $21, 40($sp) + sdc1 $f24, 48($sp) + sdc1 $f25, 56($sp) + sdc1 $f26, 64($sp) + sdc1 $f27, 72($sp) + sdc1 $f28, 80($sp) + sdc1 $f29, 88($sp) + + LDARG LDC, 128($sp) + + dsll LDC, LDC, ZBASE_SHIFT + + dsra J, N, 3 + blez J, .L30 + nop + +.L10: + move CO1, C + MTC $0, c11 + daddu CO2, C, LDC + move AO, A + daddu CO3, CO2, LDC + daddiu J, J, -1 + daddu CO4, CO3, LDC + MOV c21, c11 + daddu CO5, CO4, LDC + MOV c31, c11 + daddu CO6, CO5, LDC + MOV c41, c11 + daddu CO7, CO6, LDC + MOV c51, c11 + daddu CO8, CO7, LDC + dsra I, M, 1 + daddu C, CO8, LDC + + blez I, .L20 + MOV c61, c11 + +.L11: + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(B) + MOV c81, c11 + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + + dsra L, K, 2 + MOV c32, c11 + LD b3, 2 * SIZE(B) + MOV c42, c11 + + LD b4, 3 * SIZE(B) + MOV c52, c11 + LD b5, 4 * SIZE(B) + MOV c62, c11 + + LD b6, 8 * SIZE(B) + MOV c72, c11 + LD b7, 12 * SIZE(B) + MOV c82, c11 + + blez L, .L15 + move BO, B + + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + blez L, .L13 + MADD c41, c41, a1, b4 + NOP + .align 3 + +.L12: + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + LD a4, 2 * SIZE(AO) + MADD c61, c61, a1, b2 + NOP + MADD c71, c71, a1, b3 + NOP + MADD c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a4, b7 + NOP + MADD c61, c61, a4, b2 + NOP + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + NOP + MADD c41, c41, a3, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + LD a4, 6 * SIZE(AO) + MADD c61, c61, a3, b2 + NOP + MADD c71, c71, a3, b3 + NOP + MADD c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + daddiu L, L, -1 + + MADD c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + bgtz L, .L12 + MADD c41, c41, a1, b4 + NOP + .align 3 + +.L13: + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + NOP + MADD c61, c61, a1, b2 + LD a4, 2 * SIZE(AO) + MADD c71, c71, a1, b3 + NOP + MADD c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a4, b7 + NOP + MADD c61, c61, a4, b2 + NOP + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + NOP + MADD c41, c41, a3, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + NOP + MADD c61, c61, a3, b2 + LD a4, 6 * SIZE(AO) + MADD c71, c71, a3, b3 + NOP + MADD c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + .align 3 + +.L15: + andi L, K, 3 + NOP + blez L, .L18 + NOP + .align 3 + +.L16: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 8 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + daddiu L, L, -1 + MADD c61, c61, a1, b2 + daddiu AO, AO, 2 * SIZE + MADD c71, c71, a1, b3 + daddiu BO, BO, 8 * SIZE + MADD c81, c81, a1, b4 + LD a1, 0 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 4 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + bgtz L, .L16 + LD b4, 3 * SIZE(BO) + +.L18: + LD $f0, 0 * SIZE(CO1) + LD $f1, 1 * SIZE(CO1) + LD $f2, 2 * SIZE(CO1) + LD $f3, 3 * SIZE(CO1) + + LD $f4, 0 * SIZE(CO2) + MADD $f0, $f0, ALPHA_R, c11 + LD $f5, 1 * SIZE(CO2) + MADD $f1, $f1, ALPHA_I, c11 + LD $f6, 2 * SIZE(CO2) + MADD $f2, $f2, ALPHA_R, c12 + LD $f7, 3 * SIZE(CO2) + MADD $f3, $f3, ALPHA_I, c12 + + MADD $f4, $f4, ALPHA_R, c21 + ST $f0, 0 * SIZE(CO1) + MADD $f5, $f5, ALPHA_I, c21 + ST $f1, 1 * SIZE(CO1) + MADD $f6, $f6, ALPHA_R, c22 + ST $f2, 2 * SIZE(CO1) + MADD $f7, $f7, ALPHA_I, c22 + ST $f3, 3 * SIZE(CO1) + + LD $f0, 0 * SIZE(CO3) + LD $f1, 1 * SIZE(CO3) + LD $f2, 2 * SIZE(CO3) + LD $f3, 3 * SIZE(CO3) + + ST $f4, 0 * SIZE(CO2) + ST $f5, 1 * SIZE(CO2) + ST $f6, 2 * SIZE(CO2) + ST $f7, 3 * SIZE(CO2) + + LD $f4, 0 * SIZE(CO4) + LD $f5, 1 * SIZE(CO4) + LD $f6, 2 * SIZE(CO4) + LD $f7, 3 * SIZE(CO4) + + MADD $f0, $f0, ALPHA_R, c31 + MADD $f1, $f1, ALPHA_I, c31 + MADD $f2, $f2, ALPHA_R, c32 + MADD $f3, $f3, ALPHA_I, c32 + + MADD $f4, $f4, ALPHA_R, c41 + ST $f0, 0 * SIZE(CO3) + MADD $f5, $f5, ALPHA_I, c41 + ST $f1, 1 * SIZE(CO3) + MADD $f6, $f6, ALPHA_R, c42 + ST $f2, 2 * SIZE(CO3) + MADD $f7, $f7, ALPHA_I, c42 + ST $f3, 3 * SIZE(CO3) + + LD $f0, 0 * SIZE(CO5) + LD $f1, 1 * SIZE(CO5) + LD $f2, 2 * SIZE(CO5) + LD $f3, 3 * SIZE(CO5) + + ST $f4, 0 * SIZE(CO4) + ST $f5, 1 * SIZE(CO4) + ST $f6, 2 * SIZE(CO4) + ST $f7, 3 * SIZE(CO4) + + LD $f4, 0 * SIZE(CO6) + LD $f5, 1 * SIZE(CO6) + LD $f6, 2 * SIZE(CO6) + LD $f7, 3 * SIZE(CO6) + + MADD $f0, $f0, ALPHA_R, c51 + daddiu CO1,CO1, 4 * SIZE + MADD $f1, $f1, ALPHA_I, c51 + daddiu CO2,CO2, 4 * SIZE + MADD $f2, $f2, ALPHA_R, c52 + daddiu CO3,CO3, 4 * SIZE + MADD $f3, $f3, ALPHA_I, c52 + daddiu CO4,CO4, 4 * SIZE + + MADD $f4, $f4, ALPHA_R, c61 + ST $f0, 0 * SIZE(CO5) + MADD $f5, $f5, ALPHA_I, c61 + ST $f1, 1 * SIZE(CO5) + MADD $f6, $f6, ALPHA_R, c62 + ST $f2, 2 * SIZE(CO5) + MADD $f7, $f7, ALPHA_I, c62 + ST $f3, 3 * SIZE(CO5) + + LD $f0, 0 * SIZE(CO7) + LD $f1, 1 * SIZE(CO7) + LD $f2, 2 * SIZE(CO7) + LD $f3, 3 * SIZE(CO7) + + ST $f4, 0 * SIZE(CO6) + ST $f5, 1 * SIZE(CO6) + ST $f6, 2 * SIZE(CO6) + ST $f7, 3 * SIZE(CO6) + + LD $f4, 0 * SIZE(CO8) + daddiu I, I, -1 + LD $f5, 1 * SIZE(CO8) + MTC $0, c11 + LD $f6, 2 * SIZE(CO8) + LD $f7, 3 * SIZE(CO8) + + MADD $f0, $f0, ALPHA_R, c71 + daddiu CO5,CO5, 4 * SIZE + MADD $f1, $f1, ALPHA_I, c71 + daddiu CO6,CO6, 4 * SIZE + MADD $f2, $f2, ALPHA_R, c72 + daddiu CO7,CO7, 4 * SIZE + MADD $f3, $f3, ALPHA_I, c72 + daddiu CO8,CO8, 4 * SIZE + + MADD $f4, $f4, ALPHA_R, c81 + ST $f0, -4 * SIZE(CO7) + MADD $f5, $f5, ALPHA_I, c81 + ST $f1, -3 * SIZE(CO7) + MADD $f6, $f6, ALPHA_R, c82 + ST $f2, -2 * SIZE(CO7) + MADD $f7, $f7, ALPHA_I, c82 + ST $f3, -1 * SIZE(CO7) + + ST $f4, -4 * SIZE(CO8) + MOV c21, c11 + ST $f5, -3 * SIZE(CO8) + MOV c31, c11 + ST $f6, -2 * SIZE(CO8) + MOV c41, c11 + ST $f7, -1 * SIZE(CO8) + MOV c51, c11 + bgtz I, .L11 + MOV c61, c11 + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 + blez I, .L29 + MOV c71, c11 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, K, 2 + MOV c81, c11 + + blez L, .L25 + move BO, B + .align 3 + +.L22: + MADD c11, c11, a1, b1 + LD b1, 16 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + LD b5, 20 * SIZE(BO) + MADD c61, c61, a1, b2 + LD b2, 9 * SIZE(BO) + MADD c71, c71, a1, b3 + LD b3, 10 * SIZE(BO) + MADD c81, c81, a1, b4 + LD b4, 11 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + daddiu L, L, -1 + + MADD c11, c11, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c61, c61, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c71, c71, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c81, c81, a2, b4 + LD b4, 19 * SIZE(BO) + + LD a2, 5 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD c11, c11, a3, b1 + LD b1, 32 * SIZE(BO) + MADD c21, c21, a3, b2 + LD b2, 21 * SIZE(BO) + MADD c31, c31, a3, b3 + LD b3, 22 * SIZE(BO) + MADD c41, c41, a3, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + LD b5, 36 * SIZE(BO) + MADD c61, c61, a3, b2 + LD b2, 25 * SIZE(BO) + MADD c71, c71, a3, b3 + LD b3, 26 * SIZE(BO) + MADD c81, c81, a3, b4 + LD b4, 27 * SIZE(BO) + + LD a3, 2 * SIZE(AO) + daddiu BO, BO, 32 * SIZE + + MADD c11, c11, a4, b6 + LD b6, 8 * SIZE(BO) + MADD c21, c21, a4, b2 + LD b2, -3 * SIZE(BO) + MADD c31, c31, a4, b3 + LD b3, -2 * SIZE(BO) + MADD c41, c41, a4, b4 + LD b4, -1 * SIZE(BO) + + MADD c51, c51, a4, b7 + LD b7, 12 * SIZE(BO) + MADD c61, c61, a4, b2 + LD b2, 1 * SIZE(BO) + MADD c71, c71, a4, b3 + LD b3, 2 * SIZE(BO) + MADD c81, c81, a4, b4 + LD b4, 3 * SIZE(BO) + bgtz L, .L22 + LD a4, 3 * SIZE(AO) + .align 3 + +.L25: + andi L, K, 3 + NOP + blez L, .L28 + NOP + .align 3 + +.L26: + MADD c11, c11, a1, b1 + LD b1, 8 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + daddiu L, L, -1 + MOV a2, a2 + daddiu AO, AO, 1 * SIZE + daddiu BO, BO, 8 * SIZE + + MADD c51, c51, a1, b5 + LD b5, 4 * SIZE(BO) + MADD c61, c61, a1, b2 + LD b2, 1 * SIZE(BO) + MADD c71, c71, a1, b3 + LD b3, 2 * SIZE(BO) + MADD c81, c81, a1, b4 + LD a1, 0 * SIZE(AO) + + bgtz L, .L26 + LD b4, 3 * SIZE(BO) + +.L28: + LD $f0, 0 * SIZE(CO1) + LD $f1, 1 * SIZE(CO1) + LD $f2, 0 * SIZE(CO2) + LD $f3, 1 * SIZE(CO2) + + LD $f4, 0 * SIZE(CO3) + MADD $f0, $f0, ALPHA_R, c11 + LD $f5, 1 * SIZE(CO3) + MADD $f1, $f1, ALPHA_I, c11 + LD $f6, 0 * SIZE(CO4) + MADD $f2, $f2, ALPHA_R, c21 + LD $f7, 1 * SIZE(CO4) + MADD $f3, $f3, ALPHA_I, c21 + + MADD $f4, $f4, ALPHA_R, c31 + ST $f0, 0 * SIZE(CO1) + MADD $f5, $f5, ALPHA_I, c31 + ST $f1, 1 * SIZE(CO1) + MADD $f6, $f6, ALPHA_R, c41 + ST $f2, 0 * SIZE(CO2) + MADD $f7, $f7, ALPHA_I, c41 + ST $f3, 1 * SIZE(CO2) + + LD $f0, 0 * SIZE(CO5) + LD $f1, 1 * SIZE(CO5) + LD $f2, 0 * SIZE(CO6) + LD $f3, 1 * SIZE(CO6) + + ST $f4, 0 * SIZE(CO3) + ST $f5, 1 * SIZE(CO3) + ST $f6, 0 * SIZE(CO4) + ST $f7, 1 * SIZE(CO4) + + LD $f4, 0 * SIZE(CO7) + MADD $f0, $f0, ALPHA_R, c51 + LD $f5, 1 * SIZE(CO7) + MADD $f1, $f1, ALPHA_I, c51 + LD $f6, 0 * SIZE(CO8) + MADD $f2, $f2, ALPHA_R, c61 + LD $f7, 1 * SIZE(CO8) + MADD $f3, $f3, ALPHA_I, c61 + + MADD $f4, $f4, ALPHA_R, c71 + ST $f0, 0 * SIZE(CO5) + MADD $f5, $f5, ALPHA_I, c71 + ST $f1, 1 * SIZE(CO5) + MADD $f6, $f6, ALPHA_R, c81 + ST $f2, 0 * SIZE(CO6) + MADD $f7, $f7, ALPHA_I, c81 + ST $f3, 1 * SIZE(CO6) + + ST $f4, 0 * SIZE(CO7) + ST $f5, 1 * SIZE(CO7) + ST $f6, 0 * SIZE(CO8) + ST $f7, 1 * SIZE(CO8) + .align 3 + +.L29: + bgtz J, .L10 + move B, BO + .align 3 + +.L30: + andi J, N, 4 + blez J, .L50 + move AO, A + + move CO1, C + MTC $0, c11 + daddu CO2, C, LDC + daddu CO3, CO2, LDC + daddu CO4, CO3, LDC + MOV c21, c11 + daddu C, CO4, LDC + MOV c31, c11 + + dsra I, M, 1 + blez I, .L40 + MOV c41, c11 + +.L31: + LD a1, 0 * SIZE(AO) + LD a3, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + MOV c32, c11 + LD b4, 3 * SIZE(B) + MOV c42, c11 + + LD b5, 4 * SIZE(B) + dsra L, K, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L35 + move BO, B + .align 3 + +.L32: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + LD a1, 2 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c11, c11, a1, b5 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c12, c12, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a3, b6 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + NOP + MADD c41, c41, a3, b4 + LD a3, 6 * SIZE(AO) + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c11, c11, a3, b7 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a3, b2 + daddiu AO, AO, 8 * SIZE + MADD c31, c31, a3, b3 + daddiu BO, BO, 16 * SIZE + MADD c41, c41, a3, b4 + LD a3, 4 * SIZE(AO) + + MADD c12, c12, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c42, c42, a2, b4 + NOP + + bgtz L, .L32 + LD b4, 3 * SIZE(BO) + .align 3 + +.L35: + andi L, K, 3 + NOP + blez L, .L38 + NOP + .align 3 + +.L36: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + daddiu AO, AO, 2 * SIZE + MADD c41, c41, a1, b4 + LD a1, 0 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 4 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + bgtz L, .L36 + daddiu BO, BO, 4 * SIZE + +.L38: + LD $f0, 0 * SIZE(CO1) + LD $f1, 1 * SIZE(CO1) + LD $f2, 2 * SIZE(CO1) + LD $f3, 3 * SIZE(CO1) + + LD $f4, 0 * SIZE(CO2) + LD $f5, 1 * SIZE(CO2) + LD $f6, 2 * SIZE(CO2) + LD $f7, 3 * SIZE(CO2) + + MADD $f0, $f0, ALPHA_R, c11 + MADD $f1, $f1, ALPHA_I, c11 + MADD $f2, $f2, ALPHA_R, c12 + MADD $f3, $f3, ALPHA_I, c12 + + MADD $f4, $f4, ALPHA_R, c21 + ST $f0, 0 * SIZE(CO1) + MADD $f5, $f5, ALPHA_I, c21 + ST $f1, 1 * SIZE(CO1) + MADD $f6, $f6, ALPHA_R, c22 + ST $f2, 2 * SIZE(CO1) + MADD $f7, $f7, ALPHA_I, c22 + ST $f3, 3 * SIZE(CO1) + + LD $f0, 0 * SIZE(CO3) + LD $f1, 1 * SIZE(CO3) + LD $f2, 2 * SIZE(CO3) + LD $f3, 3 * SIZE(CO3) + + ST $f4, 0 * SIZE(CO2) + MADD $f0, $f0, ALPHA_R, c31 + ST $f5, 1 * SIZE(CO2) + MADD $f1, $f1, ALPHA_I, c31 + ST $f6, 2 * SIZE(CO2) + MADD $f2, $f2, ALPHA_R, c32 + ST $f7, 3 * SIZE(CO2) + MADD $f3, $f3, ALPHA_I, c32 + + LD $f4, 0 * SIZE(CO4) + LD $f5, 1 * SIZE(CO4) + LD $f6, 2 * SIZE(CO4) + LD $f7, 3 * SIZE(CO4) + + MADD $f4, $f4, ALPHA_R, c41 + daddiu CO1,CO1, 4 * SIZE + MADD $f5, $f5, ALPHA_I, c41 + daddiu CO2,CO2, 4 * SIZE + MADD $f6, $f6, ALPHA_R, c42 + daddiu CO3,CO3, 4 * SIZE + MADD $f7, $f7, ALPHA_I, c42 + daddiu CO4,CO4, 4 * SIZE + + ST $f0, -4 * SIZE(CO3) + daddiu I, I, -1 + ST $f1, -3 * SIZE(CO3) + ST $f2, -2 * SIZE(CO3) + ST $f3, -1 * SIZE(CO3) + + ST $f4, -4 * SIZE(CO4) + MTC $0, c11 + ST $f5, -3 * SIZE(CO4) + MOV c21, c11 + ST $f6, -2 * SIZE(CO4) + MOV c31, c11 + ST $f7, -1 * SIZE(CO4) + bgtz I, .L31 + MOV c41, c11 + .align 3 + +.L40: + andi I, M, 1 + blez I, .L49 + MOV c61, c11 + + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD a2, 1 * SIZE(AO) + MOV c81, c11 + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, K, 2 + + blez L, .L45 + move BO, B + .align 3 + +.L42: + MADD c11, c11, a1, b1 + LD b1, 16 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + daddiu L, L, -1 + + MADD c11, c11, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 11 * SIZE(BO) + + LD a2, 2 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD c11, c11, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 15 * SIZE(BO) + + LD a2, -1 * SIZE(AO) + daddiu BO, BO, 16 * SIZE + + MADD c11, c11, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 3 * SIZE(BO) + + bgtz L, .L42 + LD a2, 1 * SIZE(AO) + .align 3 + +.L45: + andi L, K, 3 + NOP + blez L, .L48 + NOP + .align 3 + +.L46: + MADD c11, c11, a1, b1 + LD b1, 4 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD a1, 1 * SIZE(AO) + + LD b4, 7 * SIZE(BO) + daddiu L, L, -1 + + daddiu AO, AO, 1 * SIZE + MOV a2, a2 + bgtz L, .L46 + daddiu BO, BO, 4 * SIZE + + +.L48: + LD $f0, 0 * SIZE(CO1) + LD $f1, 1 * SIZE(CO1) + LD $f2, 0 * SIZE(CO2) + LD $f3, 1 * SIZE(CO2) + + LD $f4, 0 * SIZE(CO3) + MADD $f0, $f0, ALPHA_R, c11 + LD $f5, 1 * SIZE(CO3) + MADD $f1, $f1, ALPHA_I, c11 + LD $f6, 0 * SIZE(CO4) + MADD $f2, $f2, ALPHA_R, c21 + LD $f7, 1 * SIZE(CO4) + MADD $f3, $f3, ALPHA_I, c21 + + MADD $f4, $f4, ALPHA_R, c31 + ST $f0, 0 * SIZE(CO1) + MADD $f5, $f5, ALPHA_I, c31 + ST $f1, 1 * SIZE(CO1) + MADD $f6, $f6, ALPHA_R, c41 + ST $f2, 0 * SIZE(CO2) + MADD $f7, $f7, ALPHA_I, c41 + ST $f3, 1 * SIZE(CO2) + + ST $f4, 0 * SIZE(CO3) + ST $f5, 1 * SIZE(CO3) + ST $f6, 0 * SIZE(CO4) + ST $f7, 1 * SIZE(CO4) + .align 3 + +.L49: + move B, BO + .align 3 + +.L50: + andi J, N, 2 + blez J, .L70 + + move AO, A + move CO1, C + daddu CO2, C, LDC + + dsra I, M, 1 + blez I, .L60 + daddu C, CO2, LDC + +.L51: + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + LD b5, 4 * SIZE(B) + dsra L, K, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L55 + move BO, B + .align 3 + +.L52: + MADD c11, c11, a1, b1 + LD a3, 2 * SIZE(AO) + MADD c21, c21, a1, b2 + LD b4, 3 * SIZE(BO) + MADD c12, c12, a2, b1 + LD a4, 3 * SIZE(AO) + MADD c22, c22, a2, b2 + LD b1, 8 * SIZE(BO) + + MADD c11, c11, a3, b3 + LD a1, 8 * SIZE(AO) + MADD c21, c21, a3, b4 + LD b2, 5 * SIZE(BO) + MADD c12, c12, a4, b3 + LD a2, 5 * SIZE(AO) + MADD c22, c22, a4, b4 + LD b3, 6 * SIZE(BO) + + MADD c11, c11, a5, b5 + LD a3, 6 * SIZE(AO) + MADD c21, c21, a5, b2 + LD b4, 7 * SIZE(BO) + MADD c12, c12, a2, b5 + LD a4, 7 * SIZE(AO) + MADD c22, c22, a2, b2 + LD b5, 12 * SIZE(BO) + + MADD c11, c11, a3, b3 + LD a5, 12 * SIZE(AO) + MADD c21, c21, a3, b4 + LD b2, 9 * SIZE(BO) + MADD c12, c12, a4, b3 + LD a2, 9 * SIZE(AO) + MADD c22, c22, a4, b4 + LD b3, 10 * SIZE(BO) + + daddiu AO, AO, 8 * SIZE + daddiu L, L, -1 + bgtz L, .L52 + daddiu BO, BO, 8 * SIZE + .align 3 + +.L55: + andi L, K, 3 + NOP + blez L, .L58 + NOP + .align 3 + +.L56: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + LD a1, 2 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 2 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 3 * SIZE(BO) + + daddiu L, L, -1 + daddiu AO, AO, 2 * SIZE + bgtz L, .L56 + daddiu BO, BO, 2 * SIZE + +.L58: + LD $f0, 0 * SIZE(CO1) + LD $f1, 1 * SIZE(CO1) + LD $f2, 2 * SIZE(CO1) + LD $f3, 3 * SIZE(CO1) + + LD $f4, 0 * SIZE(CO2) + LD $f5, 1 * SIZE(CO2) + LD $f6, 2 * SIZE(CO2) + LD $f7, 3 * SIZE(CO2) + + MADD $f0, $f0, ALPHA_R, c11 + daddiu I, I, -1 + MADD $f1, $f1, ALPHA_I, c11 + daddiu CO1,CO1, 4 * SIZE + MADD $f2, $f2, ALPHA_R, c12 + daddiu CO2,CO2, 4 * SIZE + MADD $f3, $f3, ALPHA_I, c12 + MADD $f4, $f4, ALPHA_R, c21 + MADD $f5, $f5, ALPHA_I, c21 + MADD $f6, $f6, ALPHA_R, c22 + MADD $f7, $f7, ALPHA_I, c22 + + ST $f0, -4 * SIZE(CO1) + ST $f1, -3 * SIZE(CO1) + ST $f2, -2 * SIZE(CO1) + ST $f3, -1 * SIZE(CO1) + + ST $f4, -4 * SIZE(CO2) + ST $f5, -3 * SIZE(CO2) + ST $f6, -2 * SIZE(CO2) + bgtz I, .L51 + ST $f7, -1 * SIZE(CO2) + .align 3 + +.L60: + andi I, M, 1 + blez I, .L69 + NOP + + dsra L, K, 2 + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c31, c11 + LD a4, 3 * SIZE(AO) + MOV c41, c11 + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L65 + move BO, B + .align 3 + +.L62: + MADD c11, c11, a1, b1 + LD b1, 4 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 7 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + LD a2, 5 * SIZE(AO) + + MADD c11, c11, a3, b1 + LD b1, 8 * SIZE(BO) + MADD c21, c21, a3, b2 + LD b2, 9 * SIZE(BO) + MADD c31, c31, a4, b3 + LD b3, 10 * SIZE(BO) + MADD c41, c41, a4, b4 + LD b4, 11 * SIZE(BO) + + LD a3, 6 * SIZE(AO) + LD a4, 7 * SIZE(AO) + + daddiu L, L, -1 + daddiu AO, AO, 4 * SIZE + + bgtz L, .L62 + daddiu BO, BO, 8 * SIZE + .align 3 + +.L65: + andi L, K, 3 + NOP + blez L, .L68 + NOP + .align 3 + +.L66: + MADD c11, c11, a1, b1 + LD b1, 2 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 3 * SIZE(BO) + + LD a1, 1 * SIZE(AO) + daddiu L, L, -1 + + daddiu AO, AO, 1 * SIZE + bgtz L, .L66 + daddiu BO, BO, 2 * SIZE + + +.L68: + LD $f0, 0 * SIZE(CO1) + LD $f1, 1 * SIZE(CO1) + LD $f2, 0 * SIZE(CO2) + LD $f3, 1 * SIZE(CO2) + + ADD c11, c11, c31 + ADD c21, c21, c41 + + MADD $f0, $f0, ALPHA_R, c11 + MADD $f1, $f1, ALPHA_I, c11 + MADD $f2, $f2, ALPHA_R, c21 + MADD $f3, $f3, ALPHA_I, c21 + + ST $f0, 0 * SIZE(CO1) + ST $f1, 1 * SIZE(CO1) + ST $f2, 0 * SIZE(CO2) + ST $f3, 1 * SIZE(CO2) + .align 3 + +.L69: + move B, BO + .align 3 + +.L70: + andi J, N, 1 + blez J, .L999 + + move AO, A + move CO1, C + + dsra I, M, 1 + blez I, .L80 + daddu C, CO1, LDC + +.L71: + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + LD b5, 4 * SIZE(B) + dsra L, K, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L75 + move BO, B + .align 3 + +.L72: + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 2 * SIZE(AO) + LD a2, 3 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 4 * SIZE(AO) + LD a2, 5 * SIZE(AO) + LD b1, 2 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD b1, 3 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + daddiu L, L, -1 + daddiu AO, AO, 8 * SIZE + bgtz L, .L72 + daddiu BO, BO, 4 * SIZE + .align 3 + +.L75: + andi L, K, 3 + NOP + blez L, .L78 + NOP + .align 3 + +.L76: + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + daddiu L, L, -1 + daddiu AO, AO, 2 * SIZE + bgtz L, .L76 + daddiu BO, BO, 1 * SIZE + +.L78: + LD $f0, 0 * SIZE(CO1) + LD $f1, 1 * SIZE(CO1) + LD $f2, 2 * SIZE(CO1) + LD $f3, 3 * SIZE(CO1) + + ADD c11, c11, c21 + daddiu I, I, -1 + ADD c12, c12, c22 + daddiu CO1,CO1, 4 * SIZE + + MADD $f0, $f0, ALPHA_R, c11 + MADD $f1, $f1, ALPHA_I, c11 + MADD $f2, $f2, ALPHA_R, c12 + MADD $f3, $f3, ALPHA_I, c12 + + ST $f0, -4 * SIZE(CO1) + ST $f1, -3 * SIZE(CO1) + ST $f2, -2 * SIZE(CO1) + + bgtz I, .L71 + ST $f3, -1 * SIZE(CO1) + .align 3 + +.L80: + andi I, M, 1 + blez I, .L89 + NOP + + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, K, 2 + blez L, .L85 + move BO, B + .align 3 + +.L82: + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + + LD a1, 1 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + MADD c21, c21, a1, b1 + + LD a1, 2 * SIZE(AO) + LD b1, 2 * SIZE(BO) + + MADD c11, c11, a1, b1 + + LD a1, 3 * SIZE(AO) + LD b1, 3 * SIZE(BO) + + MADD c21, c21, a1, b1 + + daddiu L, L, -1 + daddiu AO, AO, 4 * SIZE + bgtz L, .L82 + daddiu BO, BO, 4 * SIZE + .align 3 + +.L85: + andi L, K, 3 + NOP + blez L, .L88 + NOP + .align 3 + +.L86: + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + + daddiu L, L, -1 + daddiu AO, AO, 1 * SIZE + bgtz L, .L86 + daddiu BO, BO, 1 * SIZE + + +.L88: + LD $f0, 0 * SIZE(CO1) + LD $f1, 1 * SIZE(CO1) + + ADD c11, c11, c21 + MADD $f0, $f0, ALPHA_R, c11 + MADD $f1, $f1, ALPHA_I, c11 + + ST $f0, 0 * SIZE(CO1) + ST $f1, 1 * SIZE(CO1) + .align 3 + +.L89: + move B, BO + .align 3 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + LDARG $18, 16($sp) + LDARG $19, 24($sp) + LDARG $20, 32($sp) + LDARG $21, 40($sp) + ldc1 $f24, 48($sp) + ldc1 $f25, 56($sp) + ldc1 $f26, 64($sp) + ldc1 $f27, 72($sp) + ldc1 $f28, 80($sp) + ldc1 $f29, 88($sp) + + j $31 + daddiu $sp, $sp, 128 + + EPILOGUE diff --git a/kernel/mips64/zgemm_kernel.S b/kernel/mips64/zgemm_kernel.S new file mode 100644 index 0000000..c48519c --- /dev/null +++ b/kernel/mips64/zgemm_kernel.S @@ -0,0 +1,1286 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define N $5 +#define K $6 +#define A $9 +#define B $10 +#define C $11 +#define LDC $8 + +#define AO $12 +#define BO $13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 + +#if defined(TRMMKERNEL) +#define OFFSET $18 +#define KK $19 +#define TEMP $20 +#endif + +#define a1 $f0 +#define a2 $f1 +#define a3 $f28 +#define a4 $f29 + +#define b1 $f2 +#define b2 $f3 +#define b3 $f4 +#define b4 $f5 +#define b5 $f6 +#define b6 $f7 +#define b7 $f8 +#define b8 $f9 + +#define a5 b8 + +#define c11 $f10 +#define c12 $f11 +#define c21 $f12 +#define c22 $f13 +#define c31 $f14 +#define c32 $f17 +#define c41 $f18 +#define c42 $f19 +#define c51 $f20 +#define c52 $f21 +#define c61 $f22 +#define c62 $f23 +#define c71 $f24 +#define c72 $f25 +#define c81 $f26 +#define c82 $f27 + +#define ALPHA_R $f15 +#define ALPHA_I $f16 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + LDARG LDC, 0($sp) + daddiu $sp, $sp, -128 + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + sdc1 $f24, 16($sp) + sdc1 $f25, 24($sp) + sdc1 $f26, 32($sp) + sdc1 $f27, 40($sp) + sdc1 $f28, 48($sp) + sdc1 $f29, 56($sp) + +#if defined(TRMMKERNEL) + SDARG $18, 64($sp) + SDARG $19, 72($sp) + SDARG $20, 80($sp) + + LDARG OFFSET, 128 + 8($sp) +#endif + +#ifndef __64BIT__ + sdc1 $f20, 88($sp) + sdc1 $f21, 96($sp) + sdc1 $f22,104($sp) + sdc1 $f23,112($sp) +#endif + + dsll LDC, LDC, ZBASE_SHIFT + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + + dsra J, N, 2 + blez J, .L20 + nop + +.L10: + move CO1, C + MTC $0, c11 + daddu CO2, C, LDC + move AO, A + daddu CO3, CO2, LDC + daddiu J, J, -1 + daddu CO4, CO3, LDC + MOV c21, c11 + MOV c31, c11 +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + MOV c41, c11 + MOV c51, c11 + move I, M + daddu C, CO4, LDC + + blez I, .L19 + MOV c61, c11 + +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, ZBASE_SHIFT + dsll TEMP, KK, 2 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(BO) + MOV c81, c11 + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + + MOV c32, c11 + LD b3, 2 * SIZE(BO) + MOV c42, c11 + + LD b4, 3 * SIZE(BO) + MOV c52, c11 + LD b5, 4 * SIZE(BO) + MOV c62, c11 + + LD b6, 8 * SIZE(BO) + MOV c72, c11 + LD b7, 12 * SIZE(BO) + MOV c82, c11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 4 +#endif + dsra L, TEMP, 2 + + blez L, .L15 + NOP +#else + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(B) + MOV c81, c11 + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + + dsra L, K, 2 + MOV c32, c11 + LD b3, 2 * SIZE(B) + MOV c42, c11 + + LD b4, 3 * SIZE(B) + MOV c52, c11 + LD b5, 4 * SIZE(B) + MOV c62, c11 + + LD b6, 8 * SIZE(B) + MOV c72, c11 + LD b7, 12 * SIZE(B) + MOV c82, c11 + + blez L, .L15 + move BO, B +#endif + + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + daddiu L, L, -1 + MADD1 c31, c31, a1, b3 + NOP + blez L, .L13 + MADD3 c41, c41, a1, b4 + .align 3 + +.L12: + MADD2 c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD1 c51, c51, a1, b5 + NOP + MADD3 c61, c61, a1, b2 + LD a4, 2 * SIZE(AO) + MADD1 c71, c71, a1, b3 + NOP + MADD3 c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD1 c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD3 c21, c21, a4, b2 + NOP + MADD1 c31, c31, a4, b3 + NOP + MADD3 c41, c41, a4, b4 + NOP + + MADD2 c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD1 c51, c51, a4, b7 + NOP + MADD3 c61, c61, a4, b2 + NOP + MADD1 c71, c71, a4, b3 + NOP + MADD3 c81, c81, a4, b4 + NOP + + MADD2 c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD1 c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD3 c21, c21, a3, b2 + NOP + MADD1 c31, c31, a3, b3 + NOP + MADD3 c41, c41, a3, b4 + NOP + + MADD2 c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD1 c51, c51, a3, b5 + NOP + MADD3 c61, c61, a3, b2 + LD a4, 6 * SIZE(AO) + MADD1 c71, c71, a3, b3 + NOP + MADD3 c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD1 c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD3 c21, c21, a4, b2 + NOP + MADD1 c31, c31, a4, b3 + NOP + MADD3 c41, c41, a4, b4 + daddiu L, L, -1 + + MADD2 c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD1 c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD3 c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD1 c71, c71, a4, b3 + NOP + MADD3 c81, c81, a4, b4 + NOP + + MADD2 c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + NOP + MADD1 c31, c31, a1, b3 + NOP + bgtz L, .L12 + MADD3 c41, c41, a1, b4 + .align 3 + +.L13: + MADD2 c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD1 c51, c51, a1, b5 + NOP + MADD3 c61, c61, a1, b2 + LD a4, 2 * SIZE(AO) + MADD1 c71, c71, a1, b3 + NOP + MADD3 c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD1 c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD3 c21, c21, a4, b2 + NOP + MADD1 c31, c31, a4, b3 + NOP + MADD3 c41, c41, a4, b4 + NOP + + MADD2 c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD1 c51, c51, a4, b7 + NOP + MADD3 c61, c61, a4, b2 + NOP + MADD1 c71, c71, a4, b3 + NOP + MADD3 c81, c81, a4, b4 + NOP + + MADD2 c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD1 c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD3 c21, c21, a3, b2 + NOP + MADD1 c31, c31, a3, b3 + NOP + MADD3 c41, c41, a3, b4 + NOP + + MADD2 c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD1 c51, c51, a3, b5 + NOP + MADD3 c61, c61, a3, b2 + LD a4, 6 * SIZE(AO) + MADD1 c71, c71, a3, b3 + NOP + MADD3 c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD1 c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD3 c21, c21, a4, b2 + NOP + MADD1 c31, c31, a4, b3 + NOP + MADD3 c41, c41, a4, b4 + NOP + + MADD2 c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD1 c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD3 c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD1 c71, c71, a4, b3 + NOP + MADD3 c81, c81, a4, b4 + NOP + + MADD2 c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + .align 3 + +.L15: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L18 + NOP + .align 3 + +.L16: + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + NOP + MADD1 c31, c31, a1, b3 + NOP + MADD3 c41, c41, a1, b4 + NOP + + MADD2 c12, c12, a2, b1 + LD b1, 8 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD1 c51, c51, a1, b5 + daddiu L, L, -1 + MADD3 c61, c61, a1, b2 + daddiu AO, AO, 2 * SIZE + MADD1 c71, c71, a1, b3 + daddiu BO, BO, 8 * SIZE + MADD3 c81, c81, a1, b4 + LD a1, 0 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 4 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD4 c82, c82, a2, b4 + bgtz L, .L16 + LD b4, 3 * SIZE(BO) + +.L18: +#ifndef TRMMKERNEL + LD b1, 0 * SIZE(CO1) + ADD c11, c11, c22 + LD b2, 1 * SIZE(CO1) + ADD c12, c12, c21 + LD b3, 0 * SIZE(CO2) + ADD c31, c31, c42 + LD b4, 1 * SIZE(CO2) + ADD c32, c32, c41 + + LD b5, 0 * SIZE(CO3) + ADD c51, c51, c62 + LD b6, 1 * SIZE(CO3) + ADD c52, c52, c61 + LD b7, 0 * SIZE(CO4) + ADD c71, c71, c82 + LD b8, 1 * SIZE(CO4) + ADD c72, c72, c81 + + MADD b1, b1, ALPHA_R, c11 + daddiu CO1,CO1, 2 * SIZE + MADD b2, b2, ALPHA_R, c12 + daddiu CO2,CO2, 2 * SIZE + MADD b3, b3, ALPHA_R, c31 + daddiu CO3,CO3, 2 * SIZE + MADD b4, b4, ALPHA_R, c32 + daddiu CO4,CO4, 2 * SIZE + + MADD b5, b5, ALPHA_R, c51 + daddiu I, I, -1 + MADD b6, b6, ALPHA_R, c52 + NOP + MADD b7, b7, ALPHA_R, c71 + NOP + MADD b8, b8, ALPHA_R, c72 + NOP + + NMSUB b1, b1, ALPHA_I, c12 + NOP + MADD b2, b2, ALPHA_I, c11 + MTC $0, c11 + NMSUB b3, b3, ALPHA_I, c32 + NOP + MADD b4, b4, ALPHA_I, c31 + NOP + + ST b1, -2 * SIZE(CO1) + NMSUB b5, b5, ALPHA_I, c52 + ST b2, -1 * SIZE(CO1) + MADD b6, b6, ALPHA_I, c51 + ST b3, -2 * SIZE(CO2) + NMSUB b7, b7, ALPHA_I, c72 + ST b4, -1 * SIZE(CO2) + MADD b8, b8, ALPHA_I, c71 + + ST b5, -2 * SIZE(CO3) + MOV c21, c11 + ST b6, -1 * SIZE(CO3) + MOV c31, c11 + ST b7, -2 * SIZE(CO4) + MOV c41, c11 + ST b8, -1 * SIZE(CO4) + MOV c51, c11 + +#else + + ADD c11, c11, c22 + daddiu CO1,CO1, 2 * SIZE + ADD c12, c12, c21 + daddiu CO2,CO2, 2 * SIZE + ADD c31, c31, c42 + daddiu CO3,CO3, 2 * SIZE + ADD c32, c32, c41 + daddiu CO4,CO4, 2 * SIZE + + ADD c51, c51, c62 + daddiu I, I, -1 + ADD c52, c52, c61 + ADD c71, c71, c82 + ADD c72, c72, c81 + + MUL b1, ALPHA_R, c11 + MUL b2, ALPHA_R, c12 + MUL b3, ALPHA_R, c31 + MUL b4, ALPHA_R, c32 + + MUL b5, ALPHA_R, c51 + MUL b6, ALPHA_R, c52 + MUL b7, ALPHA_R, c71 + MUL b8, ALPHA_R, c72 + + NMSUB b1, b1, ALPHA_I, c12 + NOP + MADD b2, b2, ALPHA_I, c11 + MTC $0, c11 + NMSUB b3, b3, ALPHA_I, c32 + NOP + MADD b4, b4, ALPHA_I, c31 + NOP + + ST b1, -2 * SIZE(CO1) + NMSUB b5, b5, ALPHA_I, c52 + ST b2, -1 * SIZE(CO1) + MADD b6, b6, ALPHA_I, c51 + ST b3, -2 * SIZE(CO2) + NMSUB b7, b7, ALPHA_I, c72 + ST b4, -1 * SIZE(CO2) + MADD b8, b8, ALPHA_I, c71 + + ST b5, -2 * SIZE(CO3) + MOV c21, c11 + ST b6, -1 * SIZE(CO3) + MOV c31, c11 + ST b7, -2 * SIZE(CO4) + MOV c41, c11 + ST b8, -1 * SIZE(CO4) + MOV c51, c11 + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -4 +#endif + + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 2 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + bgtz I, .L11 + MOV c61, c11 + .align 3 + +.L19: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 4 +#endif + + bgtz J, .L10 + move B, BO + .align 3 + +.L20: + andi J, N, 2 + MTC $0, c11 + blez J, .L30 + move CO1, C + + daddu CO2, C, LDC + daddu C, CO2, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + move I, M + blez I, .L29 + move AO, A + .align 3 + +.L21: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, ZBASE_SHIFT + dsll TEMP, KK, 1 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + MOV c21, c11 + LD b1, 0 * SIZE(BO) + MOV c31, c11 + LD a3, 4 * SIZE(AO) + MOV c41, c11 + LD b2, 1 * SIZE(BO) + + LD b3, 2 * SIZE(BO) + MOV c12, c11 + LD b4, 3 * SIZE(BO) + MOV c22, c11 + LD b5, 4 * SIZE(BO) + MOV c32, c11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 2 +#endif + dsra L, TEMP, 2 + blez L, .L25 + MOV c42, c11 + +#else + LD a1, 0 * SIZE(AO) + MOV c21, c11 + LD b1, 0 * SIZE(B) + MOV c31, c11 + LD a3, 4 * SIZE(AO) + MOV c41, c11 + LD b2, 1 * SIZE(B) + dsra L, K, 2 + + LD b3, 2 * SIZE(B) + MOV c12, c11 + LD b4, 3 * SIZE(B) + MOV c22, c11 + LD b5, 4 * SIZE(B) + MOV c32, c11 + + NOP + MOV c42, c11 + blez L, .L25 + move BO, B +#endif + .align 3 + +.L22: + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + daddiu L, L, -1 + MADD1 c31, c31, a1, b3 + NOP + MADD3 c41, c41, a1, b4 + LD a1, 2 * SIZE(AO) + + MADD2 c12, c12, a2, b1 + LD b1, 8 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD1 c11, c11, a1, b5 + LD a2, 3 * SIZE(AO) + MADD3 c21, c21, a1, b2 + NOP + MADD1 c31, c31, a1, b3 + NOP + MADD3 c41, c41, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD2 c12, c12, a2, b5 + LD b5, 12 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 9 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 10 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD1 c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD3 c21, c21, a3, b2 + NOP + MADD1 c31, c31, a3, b3 + NOP + MADD3 c41, c41, a3, b4 + LD a3, 6 * SIZE(AO) + + MADD2 c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD1 c11, c11, a3, b5 + LD a2, 7 * SIZE(AO) + MADD3 c21, c21, a3, b2 + daddiu AO, AO, 8 * SIZE + MADD1 c31, c31, a3, b3 + NOP + MADD3 c41, c41, a3, b4 + LD a3, 4 * SIZE(AO) + + MADD2 c12, c12, a2, b5 + LD b5, 20 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 17 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 18 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 19 * SIZE(BO) + + bgtz L, .L22 + daddiu BO, BO, 16 * SIZE + .align 3 + +.L25: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L28 + NOP + .align 3 + +.L26: + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + daddiu L, L, -1 + MADD1 c31, c31, a1, b3 + daddiu BO, BO, 4 * SIZE + MADD3 c41, c41, a1, b4 + LD a1, 2 * SIZE(AO) + + MADD2 c12, c12, a2, b1 + LD b1, 0 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 1 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 2 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 3 * SIZE(BO) + + bgtz L, .L26 + daddiu AO, AO, 2 * SIZE + +.L28: +#ifndef TRMMKERNEL + LD b1, 0 * SIZE(CO1) + ADD c11, c11, c22 + LD b2, 1 * SIZE(CO1) + ADD c12, c12, c21 + LD b3, 0 * SIZE(CO2) + ADD c31, c31, c42 + LD b4, 1 * SIZE(CO2) + ADD c32, c32, c41 + + MADD b1, b1, ALPHA_R, c11 + daddiu CO1,CO1, 2 * SIZE + MADD b2, b2, ALPHA_R, c12 + daddiu CO2,CO2, 2 * SIZE + MADD b3, b3, ALPHA_R, c31 + daddiu I, I, -1 + MADD b4, b4, ALPHA_R, c32 + + NMSUB b1, b1, ALPHA_I, c12 + NOP + MADD b2, b2, ALPHA_I, c11 + MTC $0, c11 + NMSUB b3, b3, ALPHA_I, c32 + NOP + MADD b4, b4, ALPHA_I, c31 + NOP + + ST b1, -2 * SIZE(CO1) + ST b2, -1 * SIZE(CO1) + ST b3, -2 * SIZE(CO2) +#else + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 + + MUL b1, ALPHA_R, c11 + daddiu CO1,CO1, 2 * SIZE + MUL b2, ALPHA_R, c12 + daddiu CO2,CO2, 2 * SIZE + MUL b3, ALPHA_R, c31 + daddiu I, I, -1 + MUL b4, ALPHA_R, c32 + + NMSUB b1, b1, ALPHA_I, c12 + NOP + MADD b2, b2, ALPHA_I, c11 + MTC $0, c11 + NMSUB b3, b3, ALPHA_I, c32 + NOP + MADD b4, b4, ALPHA_I, c31 + NOP + + ST b1, -2 * SIZE(CO1) + ST b2, -1 * SIZE(CO1) + ST b3, -2 * SIZE(CO2) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + bgtz I, .L21 + ST b4, -1 * SIZE(CO2) + .align 3 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 2 +#endif + + move B, BO + .align 3 + +.L30: + andi J, N, 1 + MTC $0, c11 + blez J, .L999 + move CO1, C + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + move I, M + daddu C, CO1, LDC + blez I, .L39 + move AO, A + .align 3 + +.L31: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + MOV c21, c11 + LD b1, 0 * SIZE(BO) + MOV c31, c11 + LD a2, 1 * SIZE(AO) + + MOV c41, c11 + LD b2, 1 * SIZE(BO) + MOV c12, c11 + NOP + + MOV c22, c11 + LD a3, 4 * SIZE(AO) + MOV c32, c11 + LD b3, 4 * SIZE(BO) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 2 + + blez L, .L35 + MOV c42, c11 +#else + LD a1, 0 * SIZE(AO) + MOV c21, c11 + LD b1, 0 * SIZE(B) + MOV c31, c11 + LD a2, 1 * SIZE(AO) + + MOV c41, c11 + LD b2, 1 * SIZE(B) + MOV c12, c11 + dsra L, K, 2 + + MOV c22, c11 + LD a3, 4 * SIZE(AO) + MOV c32, c11 + LD b3, 4 * SIZE(B) + + NOP + MOV c42, c11 + blez L, .L35 + move BO, B +#endif + .align 3 + +.L32: + MADD1 c11, c11, a1, b1 + LD b4, 3 * SIZE(BO) + MADD3 c21, c21, a1, b2 + LD a1, 2 * SIZE(AO) + MADD2 c12, c12, a2, b1 + LD b1, 2 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD a2, 3 * SIZE(AO) + + MADD1 c11, c11, a1, b1 + LD b2, 5 * SIZE(BO) + MADD3 c21, c21, a1, b4 + LD a1, 8 * SIZE(AO) + MADD2 c12, c12, a2, b1 + LD b1, 8 * SIZE(BO) + MADD4 c22, c22, a2, b4 + LD a2, 5 * SIZE(AO) + + MADD1 c11, c11, a3, b3 + LD b4, 7 * SIZE(BO) + MADD3 c21, c21, a3, b2 + LD a3, 6 * SIZE(AO) + MADD2 c12, c12, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD a2, 7 * SIZE(AO) + + MADD1 c11, c11, a3, b3 + LD b2, 9 * SIZE(BO) + MADD3 c21, c21, a3, b4 + LD a3, 12 * SIZE(AO) + MADD2 c12, c12, a2, b3 + LD b3, 12 * SIZE(BO) + MADD4 c22, c22, a2, b4 + LD a2, 9 * SIZE(AO) + + daddiu AO, AO, 8 * SIZE + daddiu L, L, -1 + + bgtz L, .L32 + daddiu BO, BO, 8 * SIZE + .align 3 + +.L35: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L38 + NOP + .align 3 + +.L36: + MADD1 c11, c11, a1, b1 + daddiu L, L, -1 + MADD3 c21, c21, a1, b2 + LD a1, 2 * SIZE(AO) + MADD2 c12, c12, a2, b1 + LD b1, 2 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD a2, 3 * SIZE(AO) + + LD b2, 3 * SIZE(BO) + daddiu BO, BO, 2 * SIZE + bgtz L, .L36 + daddiu AO, AO, 2 * SIZE + +.L38: +#ifndef TRMMKERNEL + LD b1, 0 * SIZE(CO1) + ADD c11, c11, c22 + LD b2, 1 * SIZE(CO1) + ADD c12, c12, c21 + + MADD b1, b1, ALPHA_R, c11 + daddiu CO1,CO1, 2 * SIZE + MADD b2, b2, ALPHA_R, c12 + daddiu I, I, -1 + + NMSUB b1, b1, ALPHA_I, c12 + NOP + MADD b2, b2, ALPHA_I, c11 + MTC $0, c11 + + ST b1, -2 * SIZE(CO1) + NOP + bgtz I, .L31 + ST b2, -1 * SIZE(CO1) +#else + ADD c11, c11, c22 + ADD c12, c12, c21 + + MUL b1, ALPHA_R, c11 + daddiu CO1,CO1, 2 * SIZE + MUL b2, ALPHA_R, c12 + daddiu I, I, -1 + + NMSUB b1, b1, ALPHA_I, c12 + NOP + MADD b2, b2, ALPHA_I, c11 + MTC $0, c11 + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif + + ST b1, -2 * SIZE(CO1) + NOP + bgtz I, .L31 + ST b2, -1 * SIZE(CO1) +#endif + .align 3 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 1 +#endif + move B, BO + .align 3 + + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + ldc1 $f24, 16($sp) + ldc1 $f25, 24($sp) + ldc1 $f26, 32($sp) + ldc1 $f27, 40($sp) + ldc1 $f28, 48($sp) + ldc1 $f29, 56($sp) + +#if defined(TRMMKERNEL) + LDARG $18, 64($sp) + LDARG $19, 72($sp) + LDARG $20, 80($sp) +#endif + +#ifndef __64BIT__ + ldc1 $f20, 88($sp) + ldc1 $f21, 96($sp) + ldc1 $f22,104($sp) + ldc1 $f23,112($sp) +#endif + + j $31 + daddiu $sp, $sp, 128 + + EPILOGUE diff --git a/kernel/mips64/zgemv_n.S b/kernel/mips64/zgemv_n.S new file mode 100644 index 0000000..c6cc896 --- /dev/null +++ b/kernel/mips64/zgemv_n.S @@ -0,0 +1,777 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define N $5 +#define A $9 +#define LDA $10 +#define X $11 +#define INCX $2 +#define Y $6 +#define INCY $7 +#define BUFFER $8 + +#define YORIG $3 +#define XX $12 +#define YY $13 + +#define I $14 +#define J $15 + +#define AO1 $16 +#define AO2 $17 + +#define ALPHA_R $f15 +#define ALPHA_I $f16 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + +#define x1 $f8 +#define x2 $f9 +#define x3 $f10 +#define x4 $f11 + +#define y1 $f12 +#define y2 $f13 +#define y3 $f14 +#define y4 $f17 + +#define t1 $f18 +#define t2 $f19 +#define t3 $f20 +#define t4 $f21 +#define t5 $f22 +#define t6 $f23 +#define t7 $f24 +#define t8 $f25 + +#if !defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if !defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + LDARG INCX, 0($sp) + LDARG Y, 8($sp) + LDARG INCY, 16($sp) + LDARG BUFFER, 24($sp) +#ifndef __64BIT__ + daddiu $sp, $sp, -64 +#else + daddiu $sp, $sp, -32 +#endif + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + + sdc1 $f24, 16($sp) + sdc1 $f25, 24($sp) + +#ifndef __64BIT__ + sdc1 $f20, 32($sp) + sdc1 $f21, 40($sp) + sdc1 $f22, 48($sp) + sdc1 $f23, 56($sp) +#endif + + dsll LDA, LDA, ZBASE_SHIFT + + blez M, .L999 + dsll INCX, INCX, ZBASE_SHIFT + + blez N, .L999 + dsll INCY, INCY, ZBASE_SHIFT + + li YORIG, 2 * SIZE + + beq INCY, YORIG, .L10 + move YORIG, Y + + dsra I, M, 2 + move YORIG, BUFFER + + move XX, Y + + blez I, .L05 + move YY, BUFFER + .align 3 + +.L02: + LD a1, 0 * SIZE(XX) + LD a2, 1 * SIZE(XX) + daddu XX, XX, INCY + LD a3, 0 * SIZE(XX) + LD a4, 1 * SIZE(XX) + daddu XX, XX, INCY + LD a5, 0 * SIZE(XX) + LD a6, 1 * SIZE(XX) + daddu XX, XX, INCY + LD a7, 0 * SIZE(XX) + LD a8, 1 * SIZE(XX) + daddu XX, XX, INCY + + daddiu I, I, -1 + daddiu YY, YY, 8 * SIZE + + ST a1, -8 * SIZE(YY) + ST a2, -7 * SIZE(YY) + ST a3, -6 * SIZE(YY) + ST a4, -5 * SIZE(YY) + ST a5, -4 * SIZE(YY) + ST a6, -3 * SIZE(YY) + ST a7, -2 * SIZE(YY) + + bgtz I, .L02 + ST a8, -1 * SIZE(YY) + .align 3 + +.L05: + andi I, M, 3 + blez I, .L10 + NOP + .align 3 + +.L06: + LD a1, 0 * SIZE(XX) + LD a2, 1 * SIZE(XX) + daddu XX, XX, INCY + + daddiu I, I, -1 + + ST a1, 0 * SIZE(YY) + ST a2, 1 * SIZE(YY) + + bgtz I, .L06 + daddiu YY, YY, 2 * SIZE + .align 3 + +.L10: + dsra J, N, 1 + blez J, .L20 + NOP + .align 3 + +.L11: + LD x1, 0 * SIZE(X) + LD x2, 1 * SIZE(X) + daddu X, X, INCX + LD x3, 0 * SIZE(X) + LD x4, 1 * SIZE(X) + daddu X, X, INCX + + MUL a1, ALPHA_R, x1 + move AO1, A + MUL a2, ALPHA_I, x1 + daddu AO2, A, LDA + MUL a3, ALPHA_R, x3 + daddu A, AO2, LDA + MUL a4, ALPHA_I, x3 + +#ifndef XCONJ + NMSUB x1, a1, ALPHA_I, x2 + MADD x2, a2, ALPHA_R, x2 + NMSUB x3, a3, ALPHA_I, x4 + MADD x4, a4, ALPHA_R, x4 +#else + MADD x1, a1, ALPHA_I, x2 + MSUB x2, a2, ALPHA_R, x2 + MADD x3, a3, ALPHA_I, x4 + MSUB x4, a4, ALPHA_R, x4 +#endif + + dsra I, M, 2 + + blez I, .L15 + move YY, YORIG + + LD y1, 0 * SIZE(YY) + LD a1, 0 * SIZE(AO1) + LD y2, 1 * SIZE(YY) + LD a3, 2 * SIZE(AO1) + LD y3, 2 * SIZE(YY) + LD a2, 1 * SIZE(AO1) + LD y4, 3 * SIZE(YY) + LD a4, 3 * SIZE(AO1) + + LD a5, 0 * SIZE(AO2) + LD a6, 1 * SIZE(AO2) + LD a7, 2 * SIZE(AO2) + LD a8, 3 * SIZE(AO2) + + MADD1 t1, y1, x1, a1 + LD y1, 4 * SIZE(YY) + MADD2 t2, y2, x2, a1 + LD a1, 4 * SIZE(AO1) + MADD1 t3, y3, x1, a3 + LD y2, 5 * SIZE(YY) + MADD2 t4, y4, x2, a3 + LD a3, 6 * SIZE(AO1) + + MADD3 t1, t1, x2, a2 + LD y3, 6 * SIZE(YY) + MADD4 t2, t2, x1, a2 + LD a2, 5 * SIZE(AO1) + MADD3 t3, t3, x2, a4 + LD y4, 7 * SIZE(YY) + MADD4 t4, t4, x1, a4 + LD a4, 7 * SIZE(AO1) + + MADD1 t1, t1, x3, a5 + NOP + MADD2 t2, t2, x4, a5 + LD a5, 4 * SIZE(AO2) + MADD1 t3, t3, x3, a7 + NOP + MADD2 t4, t4, x4, a7 + LD a7, 6 * SIZE(AO2) + + MADD3 t1, t1, x4, a6 + NOP + MADD4 t2, t2, x3, a6 + LD a6, 5 * SIZE(AO2) + MADD3 t3, t3, x4, a8 + daddiu I, I, -1 + MADD4 t4, t4, x3, a8 + + blez I, .L13 + LD a8, 7 * SIZE(AO2) + .align 3 + +.L12: + MADD1 t5, y1, x1, a1 + LD y1, 8 * SIZE(YY) + MADD2 t6, y2, x2, a1 + LD a1, 8 * SIZE(AO1) + MADD1 t7, y3, x1, a3 + LD y2, 9 * SIZE(YY) + MADD2 t8, y4, x2, a3 + LD a3, 10 * SIZE(AO1) + + MADD3 t5, t5, x2, a2 + LD y3, 10 * SIZE(YY) + MADD4 t6, t6, x1, a2 + LD a2, 9 * SIZE(AO1) + MADD3 t7, t7, x2, a4 + LD y4, 11 * SIZE(YY) + MADD4 t8, t8, x1, a4 + LD a4, 11 * SIZE(AO1) + + MADD1 t5, t5, x3, a5 + ST t1, 0 * SIZE(YY) + MADD2 t6, t6, x4, a5 + LD a5, 8 * SIZE(AO2) + MADD1 t7, t7, x3, a7 + ST t2, 1 * SIZE(YY) + MADD2 t8, t8, x4, a7 + LD a7, 10 * SIZE(AO2) + + MADD3 t5, t5, x4, a6 + ST t3, 2 * SIZE(YY) + MADD4 t6, t6, x3, a6 + LD a6, 9 * SIZE(AO2) + MADD3 t7, t7, x4, a8 + ST t4, 3 * SIZE(YY) + MADD4 t8, t8, x3, a8 + LD a8, 11 * SIZE(AO2) + + MADD1 t1, y1, x1, a1 + LD y1, 12 * SIZE(YY) + MADD2 t2, y2, x2, a1 + LD a1, 12 * SIZE(AO1) + MADD1 t3, y3, x1, a3 + LD y2, 13 * SIZE(YY) + MADD2 t4, y4, x2, a3 + LD a3, 14 * SIZE(AO1) + + MADD3 t1, t1, x2, a2 + LD y3, 14 * SIZE(YY) + MADD4 t2, t2, x1, a2 + LD a2, 13 * SIZE(AO1) + MADD3 t3, t3, x2, a4 + LD y4, 15 * SIZE(YY) + MADD4 t4, t4, x1, a4 + LD a4, 15 * SIZE(AO1) + + MADD1 t1, t1, x3, a5 + ST t5, 4 * SIZE(YY) + MADD2 t2, t2, x4, a5 + LD a5, 12 * SIZE(AO2) + MADD1 t3, t3, x3, a7 + ST t6, 5 * SIZE(YY) + MADD2 t4, t4, x4, a7 + LD a7, 14 * SIZE(AO2) + + MADD3 t1, t1, x4, a6 + ST t7, 6 * SIZE(YY) + MADD4 t2, t2, x3, a6 + LD a6, 13 * SIZE(AO2) + MADD3 t3, t3, x4, a8 + ST t8, 7 * SIZE(YY) + MADD4 t4, t4, x3, a8 + LD a8, 15 * SIZE(AO2) + + daddiu I, I, -1 + daddiu YY, YY, 8 * SIZE + + daddiu AO1, AO1, 8 * SIZE + bgtz I, .L12 + daddiu AO2, AO2, 8 * SIZE + .align 3 + +.L13: + ST t1, 0 * SIZE(YY) + MADD1 t1, y1, x1, a1 + ST t2, 1 * SIZE(YY) + MADD2 t2, y2, x2, a1 + ST t3, 2 * SIZE(YY) + MADD1 t3, y3, x1, a3 + ST t4, 3 * SIZE(YY) + MADD2 t4, y4, x2, a3 + + MADD3 t1, t1, x2, a2 + MADD4 t2, t2, x1, a2 + MADD3 t3, t3, x2, a4 + MADD4 t4, t4, x1, a4 + + MADD1 t1, t1, x3, a5 + MADD2 t2, t2, x4, a5 + MADD1 t3, t3, x3, a7 + MADD2 t4, t4, x4, a7 + + MADD3 t1, t1, x4, a6 + daddiu AO1, AO1, 8 * SIZE + MADD4 t2, t2, x3, a6 + daddiu AO2, AO2, 8 * SIZE + MADD3 t3, t3, x4, a8 + daddiu YY, YY, 8 * SIZE + MADD4 t4, t4, x3, a8 + NOP + + ST t1, -4 * SIZE(YY) + ST t2, -3 * SIZE(YY) + ST t3, -2 * SIZE(YY) + ST t4, -1 * SIZE(YY) + .align 3 + +.L15: + andi I, M, 2 + NOP + blez I, .L16 + NOP + + LD a1, 0 * SIZE(AO1) + LD y1, 0 * SIZE(YY) + LD a2, 1 * SIZE(AO1) + LD y2, 1 * SIZE(YY) + + LD a3, 2 * SIZE(AO1) + LD y3, 2 * SIZE(YY) + LD a4, 3 * SIZE(AO1) + LD y4, 3 * SIZE(YY) + + MADD1 t1, y1, x1, a1 + LD a5, 0 * SIZE(AO2) + MADD2 t2, y2, x2, a1 + LD a6, 1 * SIZE(AO2) + MADD1 t3, y3, x1, a3 + LD a7, 2 * SIZE(AO2) + MADD2 t4, y4, x2, a3 + LD a8, 3 * SIZE(AO2) + + MADD3 t1, t1, x2, a2 + MADD4 t2, t2, x1, a2 + MADD3 t3, t3, x2, a4 + MADD4 t4, t4, x1, a4 + + MADD1 t1, t1, x3, a5 + MADD2 t2, t2, x4, a5 + MADD1 t3, t3, x3, a7 + MADD2 t4, t4, x4, a7 + + MADD3 t1, t1, x4, a6 + daddiu YY, YY, 4 * SIZE + MADD4 t2, t2, x3, a6 + daddiu AO1, AO1, 4 * SIZE + MADD3 t3, t3, x4, a8 + daddiu AO2, AO2, 4 * SIZE + MADD4 t4, t4, x3, a8 + NOP + + ST t1, -4 * SIZE(YY) + ST t2, -3 * SIZE(YY) + ST t3, -2 * SIZE(YY) + ST t4, -1 * SIZE(YY) + .align 3 + +.L16: + andi I, M, 1 + NOP + blez I, .L19 + NOP + + LD y1, 0 * SIZE(YY) + LD y2, 1 * SIZE(YY) + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + + MADD1 t1, y1, x1, a1 + LD a5, 0 * SIZE(AO2) + MADD2 t2, y2, x2, a1 + LD a6, 1 * SIZE(AO2) + MADD3 t1, t1, x2, a2 + MADD4 t2, t2, x1, a2 + + MADD1 t1, t1, x3, a5 + MADD2 t2, t2, x4, a5 + MADD3 t1, t1, x4, a6 + MADD4 t2, t2, x3, a6 + + ST t1, 0 * SIZE(YY) + ST t2, 1 * SIZE(YY) + .align 3 + + +.L19: + daddiu J, J, -1 + + bgtz J, .L11 + NOP + .align 3 + +.L20: + andi J, N, 1 + blez J, .L900 + NOP + + LD x1, 0 * SIZE(X) + LD x2, 1 * SIZE(X) + daddu X, X, INCX + + MUL a1, ALPHA_R, x1 + move AO1, A + MUL a2, ALPHA_I, x1 + +#ifndef XCONJ + NMSUB x1, a1, ALPHA_I, x2 + MADD x2, a2, ALPHA_R, x2 +#else + MADD x1, a1, ALPHA_I, x2 + MSUB x2, a2, ALPHA_R, x2 +#endif + + dsra I, M, 2 + + blez I, .L25 + move YY, YORIG + + LD y1, 0 * SIZE(YY) + LD a1, 0 * SIZE(AO1) + LD y2, 1 * SIZE(YY) + LD a3, 2 * SIZE(AO1) + LD y3, 2 * SIZE(YY) + LD a2, 1 * SIZE(AO1) + LD y4, 3 * SIZE(YY) + LD a4, 3 * SIZE(AO1) + + MADD1 t1, y1, x1, a1 + LD y1, 4 * SIZE(YY) + MADD2 t2, y2, x2, a1 + LD a1, 4 * SIZE(AO1) + MADD1 t3, y3, x1, a3 + LD y2, 5 * SIZE(YY) + MADD2 t4, y4, x2, a3 + LD a3, 6 * SIZE(AO1) + + MADD3 t1, t1, x2, a2 + LD y3, 6 * SIZE(YY) + MADD4 t2, t2, x1, a2 + LD a2, 5 * SIZE(AO1) + MADD3 t3, t3, x2, a4 + LD y4, 7 * SIZE(YY) + MADD4 t4, t4, x1, a4 + daddiu I, I, -1 + + blez I, .L23 + LD a4, 7 * SIZE(AO1) + .align 3 + +.L22: + MADD1 t5, y1, x1, a1 + LD y1, 8 * SIZE(YY) + MADD2 t6, y2, x2, a1 + LD a1, 8 * SIZE(AO1) + MADD1 t7, y3, x1, a3 + LD y2, 9 * SIZE(YY) + MADD2 t8, y4, x2, a3 + LD a3, 10 * SIZE(AO1) + + MADD3 t5, t5, x2, a2 + LD y3, 10 * SIZE(YY) + MADD4 t6, t6, x1, a2 + LD a2, 9 * SIZE(AO1) + MADD3 t7, t7, x2, a4 + LD y4, 11 * SIZE(YY) + MADD4 t8, t8, x1, a4 + LD a4, 11 * SIZE(AO1) + + ST t1, 0 * SIZE(YY) + ST t2, 1 * SIZE(YY) + ST t3, 2 * SIZE(YY) + ST t4, 3 * SIZE(YY) + + MADD1 t1, y1, x1, a1 + LD y1, 12 * SIZE(YY) + MADD2 t2, y2, x2, a1 + LD a1, 12 * SIZE(AO1) + MADD1 t3, y3, x1, a3 + LD y2, 13 * SIZE(YY) + MADD2 t4, y4, x2, a3 + LD a3, 14 * SIZE(AO1) + + MADD3 t1, t1, x2, a2 + LD y3, 14 * SIZE(YY) + MADD4 t2, t2, x1, a2 + LD a2, 13 * SIZE(AO1) + MADD3 t3, t3, x2, a4 + LD y4, 15 * SIZE(YY) + MADD4 t4, t4, x1, a4 + LD a4, 15 * SIZE(AO1) + + ST t5, 4 * SIZE(YY) + ST t6, 5 * SIZE(YY) + ST t7, 6 * SIZE(YY) + ST t8, 7 * SIZE(YY) + + daddiu I, I, -1 + daddiu YY, YY, 8 * SIZE + + bgtz I, .L22 + daddiu AO1, AO1, 8 * SIZE + .align 3 + +.L23: + ST t1, 0 * SIZE(YY) + MADD1 t1, y1, x1, a1 + ST t2, 1 * SIZE(YY) + MADD2 t2, y2, x2, a1 + ST t3, 2 * SIZE(YY) + MADD1 t3, y3, x1, a3 + ST t4, 3 * SIZE(YY) + MADD2 t4, y4, x2, a3 + + MADD3 t1, t1, x2, a2 + daddiu AO1, AO1, 8 * SIZE + MADD4 t2, t2, x1, a2 + daddiu YY, YY, 8 * SIZE + MADD3 t3, t3, x2, a4 + MADD4 t4, t4, x1, a4 + + ST t1, -4 * SIZE(YY) + ST t2, -3 * SIZE(YY) + ST t3, -2 * SIZE(YY) + ST t4, -1 * SIZE(YY) + .align 3 + +.L25: + andi I, M, 2 + NOP + blez I, .L26 + NOP + + LD a1, 0 * SIZE(AO1) + LD y1, 0 * SIZE(YY) + LD a2, 1 * SIZE(AO1) + LD y2, 1 * SIZE(YY) + + LD a3, 2 * SIZE(AO1) + LD y3, 2 * SIZE(YY) + LD a4, 3 * SIZE(AO1) + LD y4, 3 * SIZE(YY) + + MADD1 t1, y1, x1, a1 + MADD2 t2, y2, x2, a1 + MADD1 t3, y3, x1, a3 + MADD2 t4, y4, x2, a3 + + MADD3 t1, t1, x2, a2 + daddiu YY, YY, 4 * SIZE + MADD4 t2, t2, x1, a2 + daddiu AO1, AO1, 4 * SIZE + MADD3 t3, t3, x2, a4 + MADD4 t4, t4, x1, a4 + + ST t1, -4 * SIZE(YY) + ST t2, -3 * SIZE(YY) + ST t3, -2 * SIZE(YY) + ST t4, -1 * SIZE(YY) + .align 3 + +.L26: + andi I, M, 1 + NOP + blez I, .L900 + NOP + + LD y1, 0 * SIZE(YY) + LD y2, 1 * SIZE(YY) + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + + MADD1 t1, y1, x1, a1 + MADD2 t2, y2, x2, a1 + MADD3 t1, t1, x2, a2 + MADD4 t2, t2, x1, a2 + + ST t1, 0 * SIZE(YY) + ST t2, 1 * SIZE(YY) + .align 3 + +.L900: + li YORIG, 2 * SIZE + + beq INCY, YORIG, .L999 + dsra I, M, 2 + + blez I, .L905 + move XX, BUFFER + .align 3 + +.L902: + LD a1, 0 * SIZE(XX) + LD a2, 1 * SIZE(XX) + LD a3, 2 * SIZE(XX) + LD a4, 3 * SIZE(XX) + LD a5, 4 * SIZE(XX) + LD a6, 5 * SIZE(XX) + LD a7, 6 * SIZE(XX) + LD a8, 7 * SIZE(XX) + + daddiu I, I, -1 + + ST a1, 0 * SIZE(Y) + ST a2, 1 * SIZE(Y) + daddu Y, Y, INCY + ST a3, 0 * SIZE(Y) + ST a4, 1 * SIZE(Y) + daddu Y, Y, INCY + ST a5, 0 * SIZE(Y) + ST a6, 1 * SIZE(Y) + daddu Y, Y, INCY + ST a7, 0 * SIZE(Y) + ST a8, 1 * SIZE(Y) + daddu Y, Y, INCY + + bgtz I, .L902 + daddiu XX, XX, 8 * SIZE + .align 3 + +.L905: + andi I, M, 3 + blez I, .L999 + NOP + .align 3 + +.L906: + LD a1, 0 * SIZE(XX) + LD a2, 1 * SIZE(XX) + daddiu XX, XX, 2 * SIZE + + daddiu I, I, -1 + + ST a1, 0 * SIZE(Y) + ST a2, 1 * SIZE(Y) + + bgtz I, .L906 + daddu Y, Y, INCY + .align 3 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + ldc1 $f24, 16($sp) + ldc1 $f25, 24($sp) + +#ifndef __64BIT__ + ldc1 $f20, 32($sp) + ldc1 $f21, 40($sp) + ldc1 $f22, 48($sp) + ldc1 $f23, 56($sp) +#endif + + j $31 +#ifdef __64BIT__ + daddiu $sp, $sp, 32 +#else + daddiu $sp, $sp, 64 +#endif + + EPILOGUE diff --git a/kernel/mips64/zgemv_t.S b/kernel/mips64/zgemv_t.S new file mode 100644 index 0000000..f7f7fdf --- /dev/null +++ b/kernel/mips64/zgemv_t.S @@ -0,0 +1,669 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define N $5 +#define A $9 +#define LDA $10 +#define X $11 +#define INCX $2 +#define Y $6 +#define INCY $7 +#define BUFFER $8 + +#define XORIG $3 +#define XX $12 +#define YY $13 + +#define I $14 +#define J $15 + +#define AO1 $16 +#define AO2 $17 + +#define ALPHA_R $f15 +#define ALPHA_I $f16 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + +#define y1 $f8 +#define y2 $f9 +#define y3 $f10 +#define y4 $f11 + +#define x1 $f12 +#define x2 $f13 +#define x3 $f14 +#define x4 $f17 +#define x5 $f18 +#define x6 $f19 +#define x7 $f20 +#define x8 $f21 + +#if !defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if !defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + LDARG INCX, 0($sp) + LDARG Y, 8($sp) + LDARG INCY, 16($sp) + LDARG BUFFER, 24($sp) +#ifdef __64BIT__ + daddiu $sp, $sp, -16 +#else + daddiu $sp, $sp, -32 +#endif + + MTC $0, y1 + SDARG $16, 0($sp) + + SDARG $17, 8($sp) + dsll LDA, LDA, ZBASE_SHIFT + +#ifndef __64BIT__ + sdc1 $f20, 16($sp) + sdc1 $f21, 24($sp) +#endif + + blez M, .L999 + dsll INCX, INCX, ZBASE_SHIFT + + blez N, .L999 + dsll INCY, INCY, ZBASE_SHIFT + + li XORIG, 2 * SIZE + + beq INCX, XORIG, .L10 + move XORIG, X + + dsra I, M, 2 + move XORIG, BUFFER + + blez I, .L05 + move YY, BUFFER + .align 3 + +.L02: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + LD a4, 1 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + LD a6, 1 * SIZE(X) + daddu X, X, INCX + LD a7, 0 * SIZE(X) + LD a8, 1 * SIZE(X) + daddu X, X, INCX + + daddiu I, I, -1 + daddiu YY, YY, 8 * SIZE + + ST a1, -8 * SIZE(YY) + ST a2, -7 * SIZE(YY) + ST a3, -6 * SIZE(YY) + ST a4, -5 * SIZE(YY) + ST a5, -4 * SIZE(YY) + ST a6, -3 * SIZE(YY) + ST a7, -2 * SIZE(YY) + + bgtz I, .L02 + ST a8, -1 * SIZE(YY) + .align 3 + +.L05: + andi I, M, 3 + blez I, .L10 + NOP + .align 3 + +.L06: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + + ST a1, 0 * SIZE(YY) + ST a2, 1 * SIZE(YY) + daddiu I, I, -1 + + bgtz I, .L06 + daddiu YY, YY, 2 * SIZE + .align 3 + +.L10: + dsra J, N, 1 + blez J, .L20 + move YY, Y + .align 3 + +.L11: + move AO1, A + MOV y2, y1 + daddu AO2, A, LDA + MOV y3, y1 + daddu A, AO2, LDA + MOV y4, y1 + + dsra I, M, 2 + blez I, .L15 + move XX, XORIG + + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + LD x4, 3 * SIZE(XX) + + LD a1, 0 * SIZE(AO1) + LD a3, 0 * SIZE(AO2) + LD a2, 1 * SIZE(AO1) + LD a4, 1 * SIZE(AO2) + + LD a5, 2 * SIZE(AO1) + LD a7, 2 * SIZE(AO2) + LD a6, 3 * SIZE(AO1) + LD a8, 3 * SIZE(AO2) + daddiu I, I, -1 + + blez I, .L13 + NOP + .align 3 + +.L12: + MADD1 y1, y1, x1, a1 + LD x3, 2 * SIZE(XX) + MADD2 y2, y2, x2, a1 + LD a1, 4 * SIZE(AO1) + MADD1 y3, y3, x1, a3 + NOP + MADD2 y4, y4, x2, a3 + LD a3, 4 * SIZE(AO2) + + MADD3 y1, y1, x2, a2 + NOP + MADD4 y2, y2, x1, a2 + LD a2, 5 * SIZE(AO1) + MADD3 y3, y3, x2, a4 + LD x2, 5 * SIZE(XX) + MADD4 y4, y4, x1, a4 + LD a4, 5 * SIZE(AO2) + + MADD1 y1, y1, x3, a5 + LD x1, 4 * SIZE(XX) + MADD2 y2, y2, x4, a5 + LD a5, 6 * SIZE(AO1) + MADD1 y3, y3, x3, a7 + MADD2 y4, y4, x4, a7 + LD a7, 6 * SIZE(AO2) + + MADD3 y1, y1, x4, a6 + daddiu I, I, -1 + MADD4 y2, y2, x3, a6 + LD a6, 7 * SIZE(AO1) + MADD3 y3, y3, x4, a8 + LD x4, 7 * SIZE(XX) + MADD4 y4, y4, x3, a8 + LD a8, 7 * SIZE(AO2) + + MADD1 y1, y1, x1, a1 + LD x3, 6 * SIZE(XX) + MADD2 y2, y2, x2, a1 + LD a1, 8 * SIZE(AO1) + MADD1 y3, y3, x1, a3 + MADD2 y4, y4, x2, a3 + LD a3, 8 * SIZE(AO2) + + MADD3 y1, y1, x2, a2 + MADD4 y2, y2, x1, a2 + LD a2, 9 * SIZE(AO1) + MADD3 y3, y3, x2, a4 + LD x2, 9 * SIZE(XX) + MADD4 y4, y4, x1, a4 + LD a4, 9 * SIZE(AO2) + + MADD1 y1, y1, x3, a5 + LD x1, 8 * SIZE(XX) + MADD2 y2, y2, x4, a5 + LD a5, 10 * SIZE(AO1) + MADD1 y3, y3, x3, a7 + daddiu XX, XX, 8 * SIZE + MADD2 y4, y4, x4, a7 + LD a7, 10 * SIZE(AO2) + + MADD3 y1, y1, x4, a6 + daddiu AO2, AO2, 8 * SIZE + MADD4 y2, y2, x3, a6 + LD a6, 11 * SIZE(AO1) + MADD3 y3, y3, x4, a8 + LD x4, 3 * SIZE(XX) + MADD4 y4, y4, x3, a8 + LD a8, 3 * SIZE(AO2) + + bgtz I, .L12 + daddiu AO1, AO1, 8 * SIZE + .align 3 + +.L13: + MADD1 y1, y1, x1, a1 + LD x3, 2 * SIZE(XX) + MADD2 y2, y2, x2, a1 + LD a1, 4 * SIZE(AO1) + MADD1 y3, y3, x1, a3 + NOP + MADD2 y4, y4, x2, a3 + LD a3, 4 * SIZE(AO2) + + MADD3 y1, y1, x2, a2 + NOP + MADD4 y2, y2, x1, a2 + LD a2, 5 * SIZE(AO1) + MADD3 y3, y3, x2, a4 + LD x2, 5 * SIZE(XX) + MADD4 y4, y4, x1, a4 + LD a4, 5 * SIZE(AO2) + + MADD1 y1, y1, x3, a5 + LD x1, 4 * SIZE(XX) + MADD2 y2, y2, x4, a5 + LD a5, 6 * SIZE(AO1) + MADD1 y3, y3, x3, a7 + MADD2 y4, y4, x4, a7 + LD a7, 6 * SIZE(AO2) + + MADD3 y1, y1, x4, a6 + NOP + MADD4 y2, y2, x3, a6 + LD a6, 7 * SIZE(AO1) + MADD3 y3, y3, x4, a8 + LD x4, 7 * SIZE(XX) + MADD4 y4, y4, x3, a8 + LD a8, 7 * SIZE(AO2) + + MADD1 y1, y1, x1, a1 + LD x3, 6 * SIZE(XX) + MADD2 y2, y2, x2, a1 + NOP + MADD1 y3, y3, x1, a3 + MADD2 y4, y4, x2, a3 + + MADD3 y1, y1, x2, a2 + MADD4 y2, y2, x1, a2 + MADD3 y3, y3, x2, a4 + MADD4 y4, y4, x1, a4 + + MADD1 y1, y1, x3, a5 + MADD2 y2, y2, x4, a5 + MADD1 y3, y3, x3, a7 + MADD2 y4, y4, x4, a7 + + MADD3 y1, y1, x4, a6 + daddiu XX, XX, 8 * SIZE + MADD4 y2, y2, x3, a6 + daddiu AO1, AO1, 8 * SIZE + MADD3 y3, y3, x4, a8 + daddiu AO2, AO2, 8 * SIZE + MADD4 y4, y4, x3, a8 + NOP + .align 3 + +.L15: + andi I, M, 2 + NOP + blez I, .L17 + NOP + + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + LD x3, 2 * SIZE(XX) + LD x4, 3 * SIZE(XX) + + LD a1, 0 * SIZE(AO1) + LD a3, 0 * SIZE(AO2) + LD a2, 1 * SIZE(AO1) + LD a4, 1 * SIZE(AO2) + + LD a5, 2 * SIZE(AO1) + LD a7, 2 * SIZE(AO2) + LD a6, 3 * SIZE(AO1) + LD a8, 3 * SIZE(AO2) + + MADD1 y1, y1, x1, a1 + MADD2 y2, y2, x2, a1 + MADD1 y3, y3, x1, a3 + MADD2 y4, y4, x2, a3 + + MADD3 y1, y1, x2, a2 + MADD4 y2, y2, x1, a2 + MADD3 y3, y3, x2, a4 + MADD4 y4, y4, x1, a4 + + MADD1 y1, y1, x3, a5 + MADD2 y2, y2, x4, a5 + MADD1 y3, y3, x3, a7 + MADD2 y4, y4, x4, a7 + + MADD3 y1, y1, x4, a6 + daddiu XX, XX, 4 * SIZE + MADD4 y2, y2, x3, a6 + daddiu AO1, AO1, 4 * SIZE + MADD3 y3, y3, x4, a8 + daddiu AO2, AO2, 4 * SIZE + MADD4 y4, y4, x3, a8 + NOP + .align 3 + +.L17: + andi I, M, 1 + blez I, .L19 + .align 3 + +.L18: + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + LD a1, 0 * SIZE(AO1) + LD a3, 0 * SIZE(AO2) + + MADD1 y1, y1, x1, a1 + LD a2, 1 * SIZE(AO1) + MADD2 y2, y2, x2, a1 + LD a4, 1 * SIZE(AO2) + MADD1 y3, y3, x1, a3 + MADD2 y4, y4, x2, a3 + + MADD3 y1, y1, x2, a2 + MADD4 y2, y2, x1, a2 + MADD3 y3, y3, x2, a4 + MADD4 y4, y4, x1, a4 + .align 3 + +.L19: + LD a1, 0 * SIZE(Y) + LD a2, 1 * SIZE(Y) + daddu Y, Y, INCY + LD a3, 0 * SIZE(Y) + LD a4, 1 * SIZE(Y) + daddu Y, Y, INCY + + MADD a1, a1, ALPHA_R, y1 + MADD a2, a2, ALPHA_I, y1 + MADD a3, a3, ALPHA_R, y3 + MADD a4, a4, ALPHA_I, y3 + + NMSUB a1, a1, ALPHA_I, y2 + MADD a2, a2, ALPHA_R, y2 + NMSUB a3, a3, ALPHA_I, y4 + MTC $0, y1 + MADD a4, a4, ALPHA_R, y4 + daddiu J, J, -1 + + ST a1, 0 * SIZE(YY) + ST a2, 1 * SIZE(YY) + daddu YY, YY, INCY + ST a3, 0 * SIZE(YY) + ST a4, 1 * SIZE(YY) + + bgtz J, .L11 + daddu YY, YY, INCY + .align 3 + +.L20: + andi J, N, 1 + MOV y2, y1 + blez J, .L999 + dsra I, M, 2 + + MOV y3, y1 + move AO1, A + MOV y4, y1 + + blez I, .L25 + move XX, XORIG + + LD a1, 0 * SIZE(AO1) + LD x1, 0 * SIZE(XX) + LD a2, 1 * SIZE(AO1) + LD x2, 1 * SIZE(XX) + LD a5, 2 * SIZE(AO1) + LD x4, 3 * SIZE(XX) + daddiu I, I, -1 + + blez I, .L23 + LD a6, 3 * SIZE(AO1) + .align 3 + +.L22: + MADD1 y1, y1, x1, a1 + LD x3, 2 * SIZE(XX) + MADD2 y2, y2, x2, a1 + LD a1, 4 * SIZE(AO1) + + MADD3 y3, y3, x2, a2 + LD x2, 5 * SIZE(XX) + MADD4 y4, y4, x1, a2 + LD a2, 5 * SIZE(AO1) + + MADD1 y1, y1, x3, a5 + LD x1, 4 * SIZE(XX) + MADD2 y2, y2, x4, a5 + LD a5, 6 * SIZE(AO1) + + MADD3 y3, y3, x4, a6 + LD x4, 7 * SIZE(XX) + MADD4 y4, y4, x3, a6 + LD a6, 7 * SIZE(AO1) + + MADD1 y1, y1, x1, a1 + LD x3, 6 * SIZE(XX) + MADD2 y2, y2, x2, a1 + LD a1, 8 * SIZE(AO1) + + MADD3 y3, y3, x2, a2 + LD x2, 9 * SIZE(XX) + MADD4 y4, y4, x1, a2 + LD a2, 9 * SIZE(AO1) + + MADD1 y1, y1, x3, a5 + LD x1, 8 * SIZE(XX) + MADD2 y2, y2, x4, a5 + LD a5, 10 * SIZE(AO1) + + MADD3 y3, y3, x4, a6 + LD x4, 11 * SIZE(XX) + MADD4 y4, y4, x3, a6 + LD a6, 11 * SIZE(AO1) + + daddiu I, I, -1 + daddiu XX, XX, 8 * SIZE + + bgtz I, .L22 + daddiu AO1, AO1, 8 * SIZE + .align 3 + +.L23: + MADD1 y1, y1, x1, a1 + LD x3, 2 * SIZE(XX) + MADD2 y2, y2, x2, a1 + LD a1, 4 * SIZE(AO1) + + MADD3 y3, y3, x2, a2 + LD x2, 5 * SIZE(XX) + MADD4 y4, y4, x1, a2 + LD a2, 5 * SIZE(AO1) + + MADD1 y1, y1, x3, a5 + LD x1, 4 * SIZE(XX) + MADD2 y2, y2, x4, a5 + LD a5, 6 * SIZE(AO1) + + MADD3 y3, y3, x4, a6 + LD x4, 7 * SIZE(XX) + MADD4 y4, y4, x3, a6 + LD a6, 7 * SIZE(AO1) + + MADD1 y1, y1, x1, a1 + LD x3, 6 * SIZE(XX) + MADD2 y2, y2, x2, a1 + NOP + + MADD3 y3, y3, x2, a2 + MADD4 y4, y4, x1, a2 + MADD1 y1, y1, x3, a5 + MADD2 y2, y2, x4, a5 + + MADD3 y3, y3, x4, a6 + daddiu XX, XX, 8 * SIZE + MADD4 y4, y4, x3, a6 + daddiu AO1, AO1, 8 * SIZE + NOP + .align 3 + +.L25: + andi I, M, 2 + NOP + blez I, .L27 + NOP + + LD a1, 0 * SIZE(AO1) + LD x1, 0 * SIZE(XX) + LD a2, 1 * SIZE(AO1) + LD x2, 1 * SIZE(XX) + + LD a5, 2 * SIZE(AO1) + MADD1 y1, y1, x1, a1 + LD x3, 2 * SIZE(XX) + MADD2 y2, y2, x2, a1 + LD a6, 3 * SIZE(AO1) + MADD3 y3, y3, x2, a2 + LD x4, 3 * SIZE(XX) + MADD4 y4, y4, x1, a2 + + MADD1 y1, y1, x3, a5 + MADD2 y2, y2, x4, a5 + + MADD3 y3, y3, x4, a6 + daddiu XX, XX, 4 * SIZE + MADD4 y4, y4, x3, a6 + daddiu AO1, AO1, 4 * SIZE + .align 3 + +.L27: + andi I, M, 1 + blez I, .L29 + .align 3 + +.L28: + LD a1, 0 * SIZE(AO1) + LD x1, 0 * SIZE(XX) + LD a2, 1 * SIZE(AO1) + LD x2, 1 * SIZE(XX) + + MADD1 y1, y1, x1, a1 + MADD2 y2, y2, x2, a1 + + MADD3 y3, y3, x2, a2 + MADD4 y4, y4, x1, a2 + .align 3 + +.L29: + LD a1, 0 * SIZE(Y) + LD a2, 1 * SIZE(Y) + + ADD y1, y1, y3 + ADD y2, y2, y4 + + MADD a1, a1, ALPHA_R, y1 + MADD a2, a2, ALPHA_I, y1 + NMSUB a1, a1, ALPHA_I, y2 + MADD a2, a2, ALPHA_R, y2 + + ST a1, 0 * SIZE(YY) + ST a2, 1 * SIZE(YY) + .align 3 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + +#ifndef __64BIT__ + ldc1 $f20, 16($sp) + ldc1 $f21, 24($sp) +#endif + + j $31 +#ifdef __64BIT__ + daddiu $sp, $sp, 16 +#else + daddiu $sp, $sp, 32 +#endif + + EPILOGUE diff --git a/kernel/mips64/znrm2.S b/kernel/mips64/znrm2.S new file mode 100644 index 0000000..1f4a90e --- /dev/null +++ b/kernel/mips64/znrm2.S @@ -0,0 +1,378 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 +#define XX $7 + +#define I $2 +#define TEMP $3 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define t1 $f12 +#define t2 $f13 +#define t3 $f14 +#define t4 $f15 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + +#define ALPHA $f16 +#define max $f17 + + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + blez N, .L999 + MTC $0, s1 + + blez INCX, .L999 + dsll INCX, INCX, ZBASE_SHIFT + + move XX, X + MOV s2, s1 + + dsra I, N, 2 + MOV s3, s1 + + blez I, .L15 + MOV s4, s1 + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + LD a4, 1 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + LD a6, 1 * SIZE(X) + daddu X, X, INCX + LD a7, 0 * SIZE(X) + LD a8, 1 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + daddu X, X, INCX + .align 3 + +.L12: + FABS t1, a1 + LD a1, 0 * SIZE(X) + FABS t2, a2 + NOP + + FABS t3, a3 + LD a2, 1 * SIZE(X) + FABS t4, a4 + daddu X, X, INCX + + CMPLT $fcc0, s1, t1 + LD a3, 0 * SIZE(X) + CMPLT $fcc1, s2, t2 + NOP + + CMPLT $fcc2, s3, t3 + LD a4, 1 * SIZE(X) + CMPLT $fcc3, s4, t4 + daddu X, X, INCX + + CMOVT s1, t1, $fcc0 + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + CMOVT s4, t4, $fcc3 + + FABS t1, a5 + LD a5, 0 * SIZE(X) + FABS t2, a6 + NOP + + FABS t3, a7 + LD a6, 1 * SIZE(X) + FABS t4, a8 + daddu X, X, INCX + + CMPLT $fcc0, s1, t1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, s2, t2 + NOP + + CMPLT $fcc2, s3, t3 + LD a8, 1 * SIZE(X) + CMPLT $fcc3, s4, t4 + daddu X, X, INCX + + CMOVT s1, t1, $fcc0 + daddiu I, I, -1 + + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + + bgtz I, .L12 + CMOVT s4, t4, $fcc3 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + + CMOVT s1, t1, $fcc0 + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + CMOVT s4, t4, $fcc3 + + FABS t1, a5 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + + CMOVT s1, t1, $fcc0 + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + CMOVT s4, t4, $fcc3 + .align 3 + +.L15: + andi I, N, 3 + + blez I, .L100 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddiu I, I, -1 + + FABS t1, a1 + FABS t2, a2 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + + CMOVT s1, t1, $fcc0 + CMOVT s2, t2, $fcc1 + + bgtz I, .L16 + daddu X, X, INCX + .align 3 + +.L100: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + + CMOVT s1, s2, $fcc0 + CMOVT s3, s4, $fcc1 + + CMPLT $fcc0, s1, s3 + CMOVT s1, s3, $fcc0 + + lui TEMP, 0x3f80 + dmtc1 $0, a1 + + mtc1 TEMP, ALPHA + CMPEQ $fcc0, s1, a1 + + bc1t $fcc0, .L999 + cvt.d.s ALPHA, ALPHA + + div.d ALPHA, ALPHA, s1 + MOV max, s1 + + MOV s1, a1 + MOV s2, a1 + MOV s3, a1 + MOV s4, a1 + + dsra I, N, 2 + blez I, .L105 + NOP + + LD a1, 0 * SIZE(XX) + LD a2, 1 * SIZE(XX) + daddu XX, XX, INCX + + LD a3, 0 * SIZE(XX) + LD a4, 1 * SIZE(XX) + daddu XX, XX, INCX + + LD a5, 0 * SIZE(XX) + LD a6, 1 * SIZE(XX) + daddu XX, XX, INCX + + LD a7, 0 * SIZE(XX) + LD a8, 1 * SIZE(XX) + daddiu I, I, -1 + + blez I, .L104 + daddu XX, XX, INCX + .align 3 + +.L103: + MUL t1, ALPHA, a1 + LD a1, 0 * SIZE(XX) + MUL t2, ALPHA, a2 + daddiu I, I, -1 + + MUL t3, ALPHA, a3 + LD a2, 1 * SIZE(XX) + MUL t4, ALPHA, a4 + daddu XX, XX, INCX + + MADD s1, s1, t1, t1 + LD a3, 0 * SIZE(XX) + MADD s2, s2, t2, t2 + NOP + + MADD s3, s3, t3, t3 + LD a4, 1 * SIZE(XX) + MADD s4, s4, t4, t4 + daddu XX, XX, INCX + + MUL t1, ALPHA, a5 + LD a5, 0 * SIZE(XX) + MUL t2, ALPHA, a6 + NOP + + MUL t3, ALPHA, a7 + LD a6, 1 * SIZE(XX) + MUL t4, ALPHA, a8 + daddu XX, XX, INCX + + MADD s1, s1, t1, t1 + LD a7, 0 * SIZE(XX) + MADD s2, s2, t2, t2 + LD a8, 1 * SIZE(XX) + + MADD s3, s3, t3, t3 + daddu XX, XX, INCX + bgtz I, .L103 + MADD s4, s4, t4, t4 + .align 3 + +.L104: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + + MADD s1, s1, t1, t1 + MADD s2, s2, t2, t2 + MADD s3, s3, t3, t3 + MADD s4, s4, t4, t4 + + MUL t1, ALPHA, a5 + MUL t2, ALPHA, a6 + MUL t3, ALPHA, a7 + MUL t4, ALPHA, a8 + + MADD s1, s1, t1, t1 + MADD s2, s2, t2, t2 + MADD s3, s3, t3, t3 + MADD s4, s4, t4, t4 + .align 3 + +.L105: + andi I, N, 3 + + blez I, .L998 + NOP + .align 3 + +.L106: + LD a1, 0 * SIZE(XX) + LD a2, 1 * SIZE(XX) + daddiu I, I, -1 + + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + + MADD s1, s1, t1, t1 + daddu XX, XX, INCX + + bgtz I, .L106 + MADD s2, s2, t2, t2 + .align 3 + +.L998: + ADD s1, s1, s2 + ADD s3, s3, s4 + + ADD s1, s1, s3 + + sqrt.d s1, s1 + + j $31 + MUL s1, max, s1 + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/zrot.S b/kernel/mips64/zrot.S new file mode 100644 index 0000000..0a20569 --- /dev/null +++ b/kernel/mips64/zrot.S @@ -0,0 +1,350 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 +#define Y $7 +#define INCY $8 + +#define XX $9 +#define YY $10 + +#define C $f17 +#define S $f18 + +#define I $2 +#define TEMP $3 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 + +#define b1 $f8 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 + +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 + + PROLOGUE + + dsll INCX, INCX, ZBASE_SHIFT + li TEMP, 2 * SIZE + + blez N, .L999 + dsll INCY, INCY, ZBASE_SHIFT + + bne INCX, TEMP, .L20 + dsra I, N, 1 + + bne INCY, TEMP, .L20 + NOP + + blez I, .L15 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD a2, 1 * SIZE(X) + LD b2, 1 * SIZE(Y) + + LD a3, 2 * SIZE(X) + LD b3, 2 * SIZE(Y) + MUL t1, S, b1 + + LD a4, 3 * SIZE(X) + MUL t2, C, b1 + LD b4, 3 * SIZE(Y) + MUL t3, S, b2 + + blez I, .L13 + MUL t4, C, b2 + .align 3 + +.L12: + MADD t1, t1, C, a1 + LD b1, 4 * SIZE(Y) + NMSUB t2, t2, S, a1 + LD a1, 4 * SIZE(X) + MADD t3, t3, C, a2 + LD b2, 5 * SIZE(Y) + NMSUB t4, t4, S, a2 + LD a2, 5 * SIZE(X) + + ST t1, 0 * SIZE(X) + MUL t1, S, b3 + ST t2, 0 * SIZE(Y) + MUL t2, C, b3 + ST t3, 1 * SIZE(X) + MUL t3, S, b4 + ST t4, 1 * SIZE(Y) + MUL t4, C, b4 + + MADD t1, t1, C, a3 + LD b3, 6 * SIZE(Y) + NMSUB t2, t2, S, a3 + LD a3, 6 * SIZE(X) + MADD t3, t3, C, a4 + LD b4, 7 * SIZE(Y) + NMSUB t4, t4, S, a4 + LD a4, 7 * SIZE(X) + + ST t1, 2 * SIZE(X) + MUL t1, S, b1 + ST t2, 2 * SIZE(Y) + MUL t2, C, b1 + ST t3, 3 * SIZE(X) + MUL t3, S, b2 + ST t4, 3 * SIZE(Y) + MUL t4, C, b2 + + daddiu I, I, -1 + daddiu X, X, 4 * SIZE + + bgtz I, .L12 + daddiu Y, Y, 4 * SIZE + .align 3 + +.L13: + MADD t1, t1, C, a1 + NMSUB t2, t2, S, a1 + MADD t3, t3, C, a2 + NMSUB t4, t4, S, a2 + + ST t1, 0 * SIZE(X) + MUL t1, S, b3 + ST t2, 0 * SIZE(Y) + MUL t2, C, b3 + ST t3, 1 * SIZE(X) + MUL t3, S, b4 + ST t4, 1 * SIZE(Y) + MUL t4, C, b4 + + MADD t1, t1, C, a3 + NMSUB t2, t2, S, a3 + MADD t3, t3, C, a4 + daddiu X, X, 4 * SIZE + NMSUB t4, t4, S, a4 + daddiu Y, Y, 4 * SIZE + + ST t1, -2 * SIZE(X) + ST t2, -2 * SIZE(Y) + ST t3, -1 * SIZE(X) + ST t4, -1 * SIZE(Y) + .align 3 + +.L15: + andi I, N, 1 + + blez I, .L999 + NOP + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD b2, 1 * SIZE(Y) + + MUL t1, S, b1 + MUL t2, C, b1 + MUL t3, S, b2 + MUL t4, C, b2 + + MADD t1, t1, C, a1 + NMSUB t2, t2, S, a1 + MADD t3, t3, C, a2 + NMSUB t4, t4, S, a2 + + ST t1, 0 * SIZE(X) + ST t2, 0 * SIZE(Y) + ST t3, 1 * SIZE(X) + + j .L999 + ST t4, 1 * SIZE(Y) + .align 3 + +.L20: + move XX, X + move YY, Y + + blez I, .L25 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + + LD a2, 1 * SIZE(X) + dadd X, X, INCX + LD b2, 1 * SIZE(Y) + dadd Y, Y, INCY + + LD a3, 0 * SIZE(X) + LD b3, 0 * SIZE(Y) + + LD a4, 1 * SIZE(X) + dadd X, X, INCX + MUL t1, S, b1 + LD b4, 1 * SIZE(Y) + MUL t2, C, b1 + dadd Y, Y, INCY + + MUL t3, S, b2 + blez I, .L23 + MUL t4, C, b2 + .align 3 + +.L22: + MADD t1, t1, C, a1 + LD b1, 0 * SIZE(Y) + NMSUB t2, t2, S, a1 + LD a1, 0 * SIZE(X) + MADD t3, t3, C, a2 + LD b2, 1 * SIZE(Y) + dadd Y, Y, INCY + NMSUB t4, t4, S, a2 + LD a2, 1 * SIZE(X) + dadd X, X, INCX + + ST t1, 0 * SIZE(XX) + MUL t1, S, b3 + ST t2, 0 * SIZE(YY) + MUL t2, C, b3 + ST t3, 1 * SIZE(XX) + dadd XX, XX, INCX + MUL t3, S, b4 + ST t4, 1 * SIZE(YY) + dadd YY, YY, INCY + MUL t4, C, b4 + + MADD t1, t1, C, a3 + LD b3, 0 * SIZE(Y) + NMSUB t2, t2, S, a3 + LD a3, 0 * SIZE(X) + MADD t3, t3, C, a4 + LD b4, 1 * SIZE(Y) + dadd Y, Y, INCY + NMSUB t4, t4, S, a4 + LD a4, 1 * SIZE(X) + dadd X, X, INCX + + ST t1, 0 * SIZE(XX) + MUL t1, S, b1 + ST t2, 0 * SIZE(YY) + MUL t2, C, b1 + ST t3, 1 * SIZE(XX) + dadd XX, XX, INCX + MUL t3, S, b2 + ST t4, 1 * SIZE(YY) + MUL t4, C, b2 + daddiu I, I, -1 + + bgtz I, .L22 + dadd YY, YY, INCY + .align 3 + +.L23: + MADD t1, t1, C, a1 + NMSUB t2, t2, S, a1 + MADD t3, t3, C, a2 + NMSUB t4, t4, S, a2 + + ST t1, 0 * SIZE(XX) + MUL t1, S, b3 + ST t2, 0 * SIZE(YY) + MUL t2, C, b3 + ST t3, 1 * SIZE(XX) + dadd XX, XX, INCX + MUL t3, S, b4 + ST t4, 1 * SIZE(YY) + dadd YY, YY, INCY + MUL t4, C, b4 + + MADD t1, t1, C, a3 + NMSUB t2, t2, S, a3 + MADD t3, t3, C, a4 + NMSUB t4, t4, S, a4 + + ST t1, 0 * SIZE(XX) + ST t2, 0 * SIZE(YY) + ST t3, 1 * SIZE(XX) + dadd XX, XX, INCX + ST t4, 1 * SIZE(YY) + dadd YY, YY, INCY + .align 3 + +.L25: + andi I, N, 1 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD b2, 1 * SIZE(Y) + + MUL t1, S, b1 + MUL t2, C, b1 + MUL t3, S, b2 + MUL t4, C, b2 + + MADD t1, t1, C, a1 + NMSUB t2, t2, S, a1 + MADD t3, t3, C, a2 + NMSUB t4, t4, S, a2 + + ST t1, 0 * SIZE(X) + ST t2, 0 * SIZE(Y) + ST t3, 1 * SIZE(X) + ST t4, 1 * SIZE(Y) + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/zscal.S b/kernel/mips64/zscal.S new file mode 100644 index 0000000..3feaf5a --- /dev/null +++ b/kernel/mips64/zscal.S @@ -0,0 +1,441 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 + +#define X $9 +#define INCX $10 + +#define I $2 +#define TEMP $3 + +#define XX $5 + +#define ALPHA_R $f15 +#define ALPHA_I $f16 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + +#define t1 $f8 +#define t2 $f9 +#define t3 $f10 +#define t4 $f11 + + PROLOGUE + + li TEMP, 2 * SIZE + MTC $0, a1 + + blez N, .L999 + dsll INCX, INCX, ZBASE_SHIFT + + CMPEQ $fcc0, ALPHA_R, a1 + CMPEQ $fcc1, ALPHA_I, a1 + + bc1f $fcc0, .L50 + NOP + + bc1f $fcc1, .L50 + NOP + + bne INCX, TEMP, .L20 + dsra I, N, 2 + + blez I, .L15 + NOP + .align 3 + +.L12: + ST a1, 0 * SIZE(X) + ST a1, 1 * SIZE(X) + ST a1, 2 * SIZE(X) + ST a1, 3 * SIZE(X) + ST a1, 4 * SIZE(X) + ST a1, 5 * SIZE(X) + ST a1, 6 * SIZE(X) + ST a1, 7 * SIZE(X) + addiu I, I, -1 + + bgtz I, .L12 + daddiu X, X, 8 * SIZE + .align 3 + +.L15: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L16: + ST a1, 0 * SIZE(X) + ST a1, 1 * SIZE(X) + daddiu I, I, -1 + + bgtz I, .L16 + daddiu X, X, 2 * SIZE + + j $31 + NOP + .align 3 + +.L20: + dsra I, N, 2 + blez I, .L25 + NOP + .align 3 + +.L22: + ST a1, 0 * SIZE(X) + ST a1, 1 * SIZE(X) + daddu X, X, INCX + ST a1, 0 * SIZE(X) + ST a1, 1 * SIZE(X) + daddu X, X, INCX + + ST a1, 0 * SIZE(X) + ST a1, 1 * SIZE(X) + daddu X, X, INCX + ST a1, 0 * SIZE(X) + ST a1, 1 * SIZE(X) + daddiu I, I, -1 + + bgtz I, .L22 + daddu X, X, INCX + .align 3 + +.L25: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L26: + ST a1, 0 * SIZE(X) + daddiu I, I, -1 + ST a1, 1 * SIZE(X) + + bgtz I, .L26 + daddu X, X, INCX + + j $31 + NOP + .align 3 + +.L50: + bne INCX, TEMP, .L60 + dsra I, N, 2 + + blez I, .L55 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD a3, 2 * SIZE(X) + LD a4, 3 * SIZE(X) + LD a5, 4 * SIZE(X) + LD a6, 5 * SIZE(X) + MUL t1, ALPHA_R, a1 + LD a7, 6 * SIZE(X) + MUL t2, ALPHA_I, a1 + LD a8, 7 * SIZE(X) + MUL t3, ALPHA_R, a3 + + blez I, .L53 + MUL t4, ALPHA_I, a3 + .align 3 + +.L52: + NMSUB t1, t1, ALPHA_I, a2 + LD a1, 8 * SIZE(X) + MADD t2, t2, ALPHA_R, a2 + LD a2, 9 * SIZE(X) + + NMSUB t3, t3, ALPHA_I, a4 + LD a3, 10 * SIZE(X) + MADD t4, t4, ALPHA_R, a4 + LD a4, 11 * SIZE(X) + + ST t1, 0 * SIZE(X) + MUL t1, ALPHA_R, a5 + ST t2, 1 * SIZE(X) + MUL t2, ALPHA_I, a5 + + ST t3, 2 * SIZE(X) + MUL t3, ALPHA_R, a7 + ST t4, 3 * SIZE(X) + MUL t4, ALPHA_I, a7 + + NMSUB t1, t1, ALPHA_I, a6 + LD a5, 12 * SIZE(X) + MADD t2, t2, ALPHA_R, a6 + LD a6, 13 * SIZE(X) + + NMSUB t3, t3, ALPHA_I, a8 + LD a7, 14 * SIZE(X) + MADD t4, t4, ALPHA_R, a8 + LD a8, 15 * SIZE(X) + + ST t1, 4 * SIZE(X) + MUL t1, ALPHA_R, a1 + ST t2, 5 * SIZE(X) + MUL t2, ALPHA_I, a1 + ST t3, 6 * SIZE(X) + MUL t3, ALPHA_R, a3 + ST t4, 7 * SIZE(X) + MUL t4, ALPHA_I, a3 + + daddiu I, I, -1 + + bgtz I, .L52 + daddiu X, X, 8 * SIZE + .align 3 + +.L53: + NMSUB t1, t1, ALPHA_I, a2 + MADD t2, t2, ALPHA_R, a2 + NMSUB t3, t3, ALPHA_I, a4 + MADD t4, t4, ALPHA_R, a4 + + ST t1, 0 * SIZE(X) + MUL t1, ALPHA_R, a5 + ST t2, 1 * SIZE(X) + MUL t2, ALPHA_I, a5 + ST t3, 2 * SIZE(X) + MUL t3, ALPHA_R, a7 + ST t4, 3 * SIZE(X) + MUL t4, ALPHA_I, a7 + + NMSUB t1, t1, ALPHA_I, a6 + MADD t2, t2, ALPHA_R, a6 + NMSUB t3, t3, ALPHA_I, a8 + MADD t4, t4, ALPHA_R, a8 + + ST t1, 4 * SIZE(X) + ST t2, 5 * SIZE(X) + ST t3, 6 * SIZE(X) + ST t4, 7 * SIZE(X) + + daddiu X, X, 8 * SIZE + .align 3 + +.L55: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L56: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + + MUL t1, ALPHA_R, a1 + MUL t2, ALPHA_I, a1 + + NMSUB t1, t1, ALPHA_I, a2 + MADD t2, t2, ALPHA_R, a2 + + daddiu X, X, 2 * SIZE + daddiu I, I, -1 + + ST t1, -2 * SIZE(X) + bgtz I, .L56 + ST t2, -1 * SIZE(X) + + j $31 + NOP + .align 3 + +.L60: + dsra I, N, 2 + move XX, X + + blez I, .L65 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + LD a4, 1 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + LD a6, 1 * SIZE(X) + daddu X, X, INCX + + MUL t1, ALPHA_R, a1 + LD a7, 0 * SIZE(X) + MUL t2, ALPHA_I, a1 + LD a8, 1 * SIZE(X) + MUL t3, ALPHA_R, a3 + daddu X, X, INCX + + blez I, .L63 + MUL t4, ALPHA_I, a3 + .align 3 + +.L62: + NMSUB t1, t1, ALPHA_I, a2 + LD a1, 0 * SIZE(X) + MADD t2, t2, ALPHA_R, a2 + LD a2, 1 * SIZE(X) + daddu X, X, INCX + + NMSUB t3, t3, ALPHA_I, a4 + LD a3, 0 * SIZE(X) + MADD t4, t4, ALPHA_R, a4 + LD a4, 1 * SIZE(X) + daddu X, X, INCX + + ST t1, 0 * SIZE(XX) + MUL t1, ALPHA_R, a5 + + ST t2, 1 * SIZE(XX) + MUL t2, ALPHA_I, a5 + daddu XX, XX, INCX + + ST t3, 0 * SIZE(XX) + MUL t3, ALPHA_R, a7 + + ST t4, 1 * SIZE(XX) + MUL t4, ALPHA_I, a7 + daddu XX, XX, INCX + + + NMSUB t1, t1, ALPHA_I, a6 + LD a5, 0 * SIZE(X) + MADD t2, t2, ALPHA_R, a6 + LD a6, 1 * SIZE(X) + daddu X, X, INCX + + NMSUB t3, t3, ALPHA_I, a8 + LD a7, 0 * SIZE(X) + MADD t4, t4, ALPHA_R, a8 + LD a8, 1 * SIZE(X) + daddu X, X, INCX + + ST t1, 0 * SIZE(XX) + MUL t1, ALPHA_R, a1 + ST t2, 1 * SIZE(XX) + MUL t2, ALPHA_I, a1 + daddu XX, XX, INCX + + ST t3, 0 * SIZE(XX) + MUL t3, ALPHA_R, a3 + ST t4, 1 * SIZE(XX) + MUL t4, ALPHA_I, a3 + + daddiu I, I, -1 + + bgtz I, .L62 + daddu XX, XX, INCX + .align 3 + +.L63: + NMSUB t1, t1, ALPHA_I, a2 + MADD t2, t2, ALPHA_R, a2 + NMSUB t3, t3, ALPHA_I, a4 + MADD t4, t4, ALPHA_R, a4 + + ST t1, 0 * SIZE(XX) + MUL t1, ALPHA_R, a5 + ST t2, 1 * SIZE(XX) + MUL t2, ALPHA_I, a5 + daddu XX, XX, INCX + + ST t3, 0 * SIZE(XX) + MUL t3, ALPHA_R, a7 + ST t4, 1 * SIZE(XX) + MUL t4, ALPHA_I, a7 + daddu XX, XX, INCX + + NMSUB t1, t1, ALPHA_I, a6 + MADD t2, t2, ALPHA_R, a6 + NMSUB t3, t3, ALPHA_I, a8 + MADD t4, t4, ALPHA_R, a8 + + ST t1, 0 * SIZE(XX) + ST t2, 1 * SIZE(XX) + daddu XX, XX, INCX + ST t3, 0 * SIZE(XX) + ST t4, 1 * SIZE(XX) + daddu XX, XX, INCX + .align 3 + +.L65: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L66: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + + MUL t1, ALPHA_R, a1 + MUL t2, ALPHA_I, a1 + + NMSUB t1, t1, ALPHA_I, a2 + MADD t2, t2, ALPHA_R, a2 + daddiu I, I, -1 + + ST t1, 0 * SIZE(X) + ST t2, 1 * SIZE(X) + + bgtz I, .L66 + daddu X, X, INCX + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/zswap.S b/kernel/mips64/zswap.S new file mode 100644 index 0000000..663da23 --- /dev/null +++ b/kernel/mips64/zswap.S @@ -0,0 +1,361 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 + +#define X $9 +#define INCX $10 +#define Y $11 +#define INCY $8 + +#define I $2 +#define TEMP $3 + +#define XX $5 +#define YY $6 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 +#define b1 $f8 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 + + PROLOGUE + + LDARG INCY, 0($sp) + li TEMP, 2 * SIZE + + blez N, .L999 + dsll INCX, INCX, ZBASE_SHIFT + + bne INCX, TEMP, .L20 + dsll INCY, INCY, ZBASE_SHIFT + + bne INCY, TEMP, .L20 + dsra I, N, 2 + + blez I, .L15 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD a2, 1 * SIZE(X) + LD b2, 1 * SIZE(Y) + LD a3, 2 * SIZE(X) + LD b3, 2 * SIZE(Y) + LD a4, 3 * SIZE(X) + LD b4, 3 * SIZE(Y) + LD a5, 4 * SIZE(X) + LD b5, 4 * SIZE(Y) + LD a6, 5 * SIZE(X) + LD b6, 5 * SIZE(Y) + LD a7, 6 * SIZE(X) + LD b7, 6 * SIZE(Y) + LD a8, 7 * SIZE(X) + LD b8, 7 * SIZE(Y) + + blez I, .L13 + NOP + .align 3 + +.L12: + ST a1, 0 * SIZE(Y) + LD a1, 8 * SIZE(X) + ST b1, 0 * SIZE(X) + LD b1, 8 * SIZE(Y) + + ST a2, 1 * SIZE(Y) + LD a2, 9 * SIZE(X) + ST b2, 1 * SIZE(X) + LD b2, 9 * SIZE(Y) + + ST a3, 2 * SIZE(Y) + LD a3, 10 * SIZE(X) + ST b3, 2 * SIZE(X) + LD b3, 10 * SIZE(Y) + + ST a4, 3 * SIZE(Y) + LD a4, 11 * SIZE(X) + ST b4, 3 * SIZE(X) + LD b4, 11 * SIZE(Y) + + ST a5, 4 * SIZE(Y) + LD a5, 12 * SIZE(X) + ST b5, 4 * SIZE(X) + LD b5, 12 * SIZE(Y) + + ST a6, 5 * SIZE(Y) + LD a6, 13 * SIZE(X) + ST b6, 5 * SIZE(X) + LD b6, 13 * SIZE(Y) + + ST a7, 6 * SIZE(Y) + LD a7, 14 * SIZE(X) + ST b7, 6 * SIZE(X) + LD b7, 14 * SIZE(Y) + + ST a8, 7 * SIZE(Y) + LD a8, 15 * SIZE(X) + ST b8, 7 * SIZE(X) + LD b8, 15 * SIZE(Y) + + daddiu I, I, -1 + daddiu X, X, 8 * SIZE + + bgtz I, .L12 + daddiu Y, Y, 8 * SIZE + .align 3 + +.L13: + ST a1, 0 * SIZE(Y) + ST b1, 0 * SIZE(X) + ST a2, 1 * SIZE(Y) + ST b2, 1 * SIZE(X) + ST a3, 2 * SIZE(Y) + ST b3, 2 * SIZE(X) + ST a4, 3 * SIZE(Y) + ST b4, 3 * SIZE(X) + ST a5, 4 * SIZE(Y) + ST b5, 4 * SIZE(X) + ST a6, 5 * SIZE(Y) + ST b6, 5 * SIZE(X) + ST a7, 6 * SIZE(Y) + ST b7, 6 * SIZE(X) + ST a8, 7 * SIZE(Y) + ST b8, 7 * SIZE(X) + + daddiu X, X, 8 * SIZE + daddiu Y, Y, 8 * SIZE + .align 3 + +.L15: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD a2, 1 * SIZE(X) + LD b2, 1 * SIZE(Y) + + daddiu X, X, 2 * SIZE + daddiu I, I, -1 + daddiu Y, Y, 2 * SIZE + + ST b1, -2 * SIZE(X) + ST b2, -1 * SIZE(X) + ST a1, -2 * SIZE(Y) + bgtz I, .L16 + ST a2, -1 * SIZE(Y) + + j .L999 + NOP + .align 3 + +.L20: + dsra I, N, 2 + move XX, X + move YY, Y + + blez I, .L25 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD a2, 1 * SIZE(X) + LD b2, 1 * SIZE(Y) + daddu X, X, INCX + daddu Y, Y, INCY + LD a3, 0 * SIZE(X) + LD b3, 0 * SIZE(Y) + LD a4, 1 * SIZE(X) + LD b4, 1 * SIZE(Y) + daddu X, X, INCX + daddu Y, Y, INCY + LD a5, 0 * SIZE(X) + LD b5, 0 * SIZE(Y) + LD a6, 1 * SIZE(X) + LD b6, 1 * SIZE(Y) + daddu X, X, INCX + daddu Y, Y, INCY + LD a7, 0 * SIZE(X) + LD b7, 0 * SIZE(Y) + LD a8, 1 * SIZE(X) + LD b8, 1 * SIZE(Y) + daddu X, X, INCX + daddu Y, Y, INCY + + blez I, .L23 + NOP + .align 3 + +.L22: + ST a1, 0 * SIZE(YY) + LD a1, 0 * SIZE(X) + ST b1, 0 * SIZE(XX) + LD b1, 0 * SIZE(Y) + + ST a2, 1 * SIZE(YY) + daddu YY, YY, INCY + LD a2, 1 * SIZE(X) + daddu X, X, INCX + ST b2, 1 * SIZE(XX) + daddu XX, XX, INCX + LD b2, 1 * SIZE(Y) + daddu Y, Y, INCY + + ST a3, 0 * SIZE(YY) + LD a3, 0 * SIZE(X) + ST b3, 0 * SIZE(XX) + LD b3, 0 * SIZE(Y) + + ST a4, 1 * SIZE(YY) + daddu YY, YY, INCY + LD a4, 1 * SIZE(X) + daddu X, X, INCX + ST b4, 1 * SIZE(XX) + daddu XX, XX, INCX + LD b4, 1 * SIZE(Y) + daddu Y, Y, INCY + + ST a5, 0 * SIZE(YY) + LD a5, 0 * SIZE(X) + ST b5, 0 * SIZE(XX) + LD b5, 0 * SIZE(Y) + + ST a6, 1 * SIZE(YY) + daddu YY, YY, INCY + LD a6, 1 * SIZE(X) + daddu X, X, INCX + ST b6, 1 * SIZE(XX) + daddu XX, XX, INCX + LD b6, 1 * SIZE(Y) + daddu Y, Y, INCY + + ST a7, 0 * SIZE(YY) + LD a7, 0 * SIZE(X) + ST b7, 0 * SIZE(XX) + LD b7, 0 * SIZE(Y) + + ST a8, 1 * SIZE(YY) + daddu YY, YY, INCY + LD a8, 1 * SIZE(X) + daddu X, X, INCX + ST b8, 1 * SIZE(XX) + daddu XX, XX, INCX + LD b8, 1 * SIZE(Y) + daddiu I, I, -1 + + bgtz I, .L22 + daddu Y, Y, INCY + .align 3 + +.L23: + ST a1, 0 * SIZE(YY) + ST b1, 0 * SIZE(XX) + ST a2, 1 * SIZE(YY) + ST b2, 1 * SIZE(XX) + daddu YY, YY, INCY + daddu XX, XX, INCX + ST a3, 0 * SIZE(YY) + ST b3, 0 * SIZE(XX) + ST a4, 1 * SIZE(YY) + ST b4, 1 * SIZE(XX) + daddu YY, YY, INCY + daddu XX, XX, INCX + ST a5, 0 * SIZE(YY) + ST b5, 0 * SIZE(XX) + ST a6, 1 * SIZE(YY) + ST b6, 1 * SIZE(XX) + daddu YY, YY, INCY + daddu XX, XX, INCX + ST a7, 0 * SIZE(YY) + ST b7, 0 * SIZE(XX) + ST a8, 1 * SIZE(YY) + ST b8, 1 * SIZE(XX) + daddu YY, YY, INCY + daddu XX, XX, INCX + .align 3 + +.L25: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD b2, 1 * SIZE(Y) + + daddiu I, I, -1 + ST a1, 0 * SIZE(Y) + ST a2, 1 * SIZE(Y) + daddu Y, Y, INCY + + ST b1, 0 * SIZE(X) + ST b2, 1 * SIZE(X) + + bgtz I, .L26 + daddu X, X, INCX + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/zsymv_L.S b/kernel/mips64/zsymv_L.S new file mode 100644 index 0000000..65d5ce3 --- /dev/null +++ b/kernel/mips64/zsymv_L.S @@ -0,0 +1,698 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define A $7 +#define LDA $8 +#define X $9 +#define INCX $10 +#define Y $11 +#define INCY $5 +#define BUFFER $6 + +#define XX $12 +#define YY $13 + +#define I $14 +#define IS $15 + +#define AO1 $16 +#define AO2 $17 + +#define Y1 $18 +#define TEMP $19 + +#define II INCX + +#define ALPHA_R $f13 +#define ALPHA_I $f14 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + +#define alpha1 $f8 +#define alpha2 $f9 +#define alpha3 $f10 +#define alpha4 $f11 + +#define x1 $f12 +#define x2 $f15 +#define x3 $f16 +#define x4 $f17 + +#define xsum1 $f18 +#define xsum2 $f19 +#define xsum3 $f20 +#define xsum4 $f21 + +#define ysum1 $f22 +#define ysum2 $f23 +#define ysum3 $f24 +#define ysum4 $f25 + +#ifndef HEMV +#define ADD1 NMSUB +#define ADD2 MADD +#else +#define ADD1 MADD +#define ADD2 NMSUB +#endif + + PROLOGUE + + LDARG INCY, 0($sp) + LDARG BUFFER, 8($sp) +#ifdef __64BIT__ + daddiu $sp, $sp, -64 +#else + daddiu $sp, $sp, -80 +#endif + + SDARG $16, 0($sp) + dsll LDA, LDA, ZBASE_SHIFT + SDARG $17, 8($sp) + dsll INCX, INCX, ZBASE_SHIFT + SDARG $18, 16($sp) + dsll INCY, INCY, ZBASE_SHIFT + SDARG $19, 24($sp) + nop + + sdc1 $f24, 32($sp) + sdc1 $f25, 40($sp) + +#ifndef __64BIT__ + sdc1 $f20, 48($sp) + sdc1 $f21, 56($sp) + sdc1 $f22, 64($sp) + sdc1 $f23, 72($sp) +#endif + + blez M, .L999 + li IS, 2 * SIZE + + beq IS, INCX, .L05 + move Y1, Y + + dsra I, M, 2 + move XX, X + + blez I, .L02 + move X, BUFFER + .align 3 + +.L01: + LD a1, 0 * SIZE(XX) + LD a2, 1 * SIZE(XX) + daddu XX, XX, INCX + LD a3, 0 * SIZE(XX) + LD a4, 1 * SIZE(XX) + daddu XX, XX, INCX + LD a5, 0 * SIZE(XX) + LD a6, 1 * SIZE(XX) + daddu XX, XX, INCX + LD a7, 0 * SIZE(XX) + LD a8, 1 * SIZE(XX) + daddu XX, XX, INCX + + ST a1, 0 * SIZE(BUFFER) + ST a2, 1 * SIZE(BUFFER) + ST a3, 2 * SIZE(BUFFER) + ST a4, 3 * SIZE(BUFFER) + ST a5, 4 * SIZE(BUFFER) + ST a6, 5 * SIZE(BUFFER) + ST a7, 6 * SIZE(BUFFER) + ST a8, 7 * SIZE(BUFFER) + + daddiu I, I, -1 + + bgtz I, .L01 + daddiu BUFFER, BUFFER, 8 * SIZE + .align 3 + +.L02: + andi I, M, 3 + blez I, .L05 + NOP + .align 3 + +.L03: + LD a1, 0 * SIZE(XX) + LD a2, 1 * SIZE(XX) + daddu XX, XX, INCX + + ST a1, 0 * SIZE(BUFFER) + ST a2, 1 * SIZE(BUFFER) + daddiu I, I, -1 + + bgtz I, .L03 + daddiu BUFFER, BUFFER, 2 * SIZE + .align 3 + +.L05: + beq IS, INCY, .L10 + daddiu BUFFER, BUFFER, 255 + + li TEMP, -256 + and BUFFER, BUFFER, TEMP + + dsra I, M, 2 + move Y1, BUFFER + + blez I, .L07 + move YY, Y + .align 3 + +.L06: + LD a1, 0 * SIZE(YY) + LD a2, 1 * SIZE(YY) + daddu YY, YY, INCY + LD a3, 0 * SIZE(YY) + LD a4, 1 * SIZE(YY) + daddu YY, YY, INCY + LD a5, 0 * SIZE(YY) + LD a6, 1 * SIZE(YY) + daddu YY, YY, INCY + LD a7, 0 * SIZE(YY) + LD a8, 1 * SIZE(YY) + daddu YY, YY, INCY + + ST a1, 0 * SIZE(BUFFER) + ST a2, 1 * SIZE(BUFFER) + ST a3, 2 * SIZE(BUFFER) + ST a4, 3 * SIZE(BUFFER) + ST a5, 4 * SIZE(BUFFER) + ST a6, 5 * SIZE(BUFFER) + ST a7, 6 * SIZE(BUFFER) + ST a8, 7 * SIZE(BUFFER) + daddiu I, I, -1 + + bgtz I, .L06 + daddiu BUFFER, BUFFER, 8 * SIZE + .align 3 + +.L07: + andi I, M, 3 + blez I, .L10 + NOP + .align 3 + +.L08: + LD a1, 0 * SIZE(YY) + LD a2, 1 * SIZE(YY) + daddu YY, YY, INCY + + ST a1, 0 * SIZE(BUFFER) + ST a2, 1 * SIZE(BUFFER) + daddiu I, I, -1 + + bgtz I, .L08 + daddiu BUFFER, BUFFER, 2 * SIZE + .align 3 + +.L10: + slti TEMP, M, 2 + nop + + bgtz TEMP, .L20 + li IS, 0 + .align 3 + +.L11: + dsll TEMP, IS, ZBASE_SHIFT + nop + + daddu XX, X, TEMP + daddu YY, Y1, TEMP + + LD alpha1, 0 * SIZE(XX) + LD alpha2, 1 * SIZE(XX) + LD alpha3, 2 * SIZE(XX) + LD alpha4, 3 * SIZE(XX) + + move AO1, A + daddu AO2, A, LDA + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + LD a3, 2 * SIZE(AO1) + LD a4, 3 * SIZE(AO1) + + LD a5, 0 * SIZE(AO2) + LD a6, 1 * SIZE(AO2) + LD a7, 2 * SIZE(AO2) + LD a8, 3 * SIZE(AO2) + + MUL xsum1, alpha1, a1 + daddiu XX, XX, 4 * SIZE + MUL xsum2, alpha2, a1 + daddiu YY, YY, 4 * SIZE + MUL xsum3, alpha1, a3 + daddu A, AO2, LDA + MUL xsum4, alpha2, a3 + daddiu A, A, 4 * SIZE + +#ifndef HEMV + NMSUB xsum1, xsum1, alpha2, a2 + MADD xsum2, xsum2, alpha1, a2 +#endif + NMSUB xsum3, xsum3, alpha2, a4 + daddiu AO1, AO1, 4 * SIZE + MADD xsum4, xsum4, alpha1, a4 + daddiu AO2, AO2, 4 * SIZE + + MADD xsum1, xsum1, alpha3, a3 + MADD xsum2, xsum2, alpha4, a3 + MADD xsum3, xsum3, alpha3, a7 + MADD xsum4, xsum4, alpha4, a7 + + ADD1 xsum1, xsum1, alpha4, a4 + ADD2 xsum2, xsum2, alpha3, a4 +#ifndef HEMV + ADD1 xsum3, xsum3, alpha4, a8 + ADD2 xsum4, xsum4, alpha3, a8 +#endif + + MOV x1, alpha1 + dsubu II, M, IS + MOV x2, alpha2 + daddiu II, II, - 2 + MOV x3, alpha3 + dsra I, II, 1 + MOV x4, alpha4 + nop + + MUL alpha1, ALPHA_R, alpha1 + MUL alpha2, ALPHA_R, alpha2 + MUL alpha3, ALPHA_R, alpha3 + MUL alpha4, ALPHA_R, alpha4 + + NMSUB alpha1, alpha1, ALPHA_I, x2 + MADD alpha2, alpha2, ALPHA_I, x1 + NMSUB alpha3, alpha3, ALPHA_I, x4 + MADD alpha4, alpha4, ALPHA_I, x3 + + blez I, .L15 + daddiu I, I, -1 + + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + LD x4, 3 * SIZE(XX) + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + LD a3, 2 * SIZE(AO1) + LD a4, 3 * SIZE(AO1) + + LD a5, 0 * SIZE(AO2) + LD a6, 1 * SIZE(AO2) + LD a7, 2 * SIZE(AO2) + LD a8, 3 * SIZE(AO2) + + LD ysum1, 0 * SIZE(YY) + + blez I, .L13 + LD ysum2, 1 * SIZE(YY) + .align 3 + +.L12: + MADD ysum1, ysum1, alpha1, a1 + LD ysum3, 2 * SIZE(YY) + MADD ysum2, ysum2, alpha2, a1 + LD ysum4, 3 * SIZE(YY) + MADD xsum1, xsum1, x1, a1 + LD a8, 3 * SIZE(AO2) + MADD xsum2, xsum2, x2, a1 + LD a1, 4 * SIZE(AO1) + + MADD ysum3, ysum3, alpha1, a3 + LD x3, 2 * SIZE(XX) + MADD ysum4, ysum4, alpha2, a3 + daddiu I, I, -1 + MADD xsum3, xsum3, x1, a5 + MADD xsum4, xsum4, x2, a5 + + NMSUB ysum1, ysum1, alpha2, a2 + MADD ysum2, ysum2, alpha1, a2 + ADD1 xsum1, xsum1, x2, a2 + daddiu AO2, AO2, 4 * SIZE + ADD2 xsum2, xsum2, x1, a2 + LD a2, 5 * SIZE(AO1) + + NMSUB ysum3, ysum3, alpha2, a4 + MADD ysum4, ysum4, alpha1, a4 + ADD1 xsum3, xsum3, x2, a6 + LD x2, 5 * SIZE(XX) + ADD2 xsum4, xsum4, x1, a6 + LD x1, 4 * SIZE(XX) + + MADD ysum1, ysum1, alpha3, a5 + MADD ysum2, ysum2, alpha4, a5 + MADD xsum1, xsum1, x3, a3 + LD a5, 0 * SIZE(AO2) + MADD xsum2, xsum2, x4, a3 + LD a3, 6 * SIZE(AO1) + + MADD ysum3, ysum3, alpha3, a7 + MADD ysum4, ysum4, alpha4, a7 + MADD xsum3, xsum3, x3, a7 + daddiu AO1, AO1, 4 * SIZE + MADD xsum4, xsum4, x4, a7 + LD a7, 2 * SIZE(AO2) + + NMSUB ysum1, ysum1, alpha4, a6 + daddiu XX, XX, 4 * SIZE + MADD ysum2, ysum2, alpha3, a6 + LD a6, 1 * SIZE(AO2) + ADD1 xsum1, xsum1, x4, a4 + daddiu YY, YY, 4 * SIZE + ADD2 xsum2, xsum2, x3, a4 + LD a4, 3 * SIZE(AO1) + + NMSUB ysum3, ysum3, alpha4, a8 + ST ysum1,-4 * SIZE(YY) + MADD ysum4, ysum4, alpha3, a8 + ST ysum2,-3 * SIZE(YY) + + LD ysum1, 0 * SIZE(YY) + LD ysum2, 1 * SIZE(YY) + + ADD1 xsum3, xsum3, x4, a8 + LD x4, 3 * SIZE(XX) + ADD2 xsum4, xsum4, x3, a8 + + ST ysum3,-2 * SIZE(YY) + bgtz I, .L12 + ST ysum4,-1 * SIZE(YY) + .align 3 + +.L13: + MADD ysum1, ysum1, alpha1, a1 + LD ysum3, 2 * SIZE(YY) + MADD ysum2, ysum2, alpha2, a1 + LD ysum4, 3 * SIZE(YY) + MADD xsum1, xsum1, x1, a1 + LD a8, 3 * SIZE(AO2) + MADD xsum2, xsum2, x2, a1 + LD x3, 2 * SIZE(XX) + + MADD ysum3, ysum3, alpha1, a3 + MADD ysum4, ysum4, alpha2, a3 + MADD xsum3, xsum3, x1, a5 + MADD xsum4, xsum4, x2, a5 + + NMSUB ysum1, ysum1, alpha2, a2 + MADD ysum2, ysum2, alpha1, a2 + ADD1 xsum1, xsum1, x2, a2 + ADD2 xsum2, xsum2, x1, a2 + + NMSUB ysum3, ysum3, alpha2, a4 + MADD ysum4, ysum4, alpha1, a4 + ADD1 xsum3, xsum3, x2, a6 + ADD2 xsum4, xsum4, x1, a6 + + MADD ysum1, ysum1, alpha3, a5 + MADD ysum2, ysum2, alpha4, a5 + MADD xsum1, xsum1, x3, a3 + MADD xsum2, xsum2, x4, a3 + + MADD ysum3, ysum3, alpha3, a7 + MADD ysum4, ysum4, alpha4, a7 + MADD xsum3, xsum3, x3, a7 + MADD xsum4, xsum4, x4, a7 + + NMSUB ysum1, ysum1, alpha4, a6 + MADD ysum2, ysum2, alpha3, a6 + ADD1 xsum1, xsum1, x4, a4 + ADD2 xsum2, xsum2, x3, a4 + + NMSUB ysum3, ysum3, alpha4, a8 + daddiu XX, XX, 4 * SIZE + MADD ysum4, ysum4, alpha3, a8 + daddiu YY, YY, 4 * SIZE + ADD1 xsum3, xsum3, x4, a8 + daddiu AO1, AO1, 4 * SIZE + ADD2 xsum4, xsum4, x3, a8 + daddiu AO2, AO2, 4 * SIZE + + ST ysum1, -4 * SIZE(YY) + ST ysum2, -3 * SIZE(YY) + ST ysum3, -2 * SIZE(YY) + ST ysum4, -1 * SIZE(YY) + .align 3 + +.L15: + andi I, M, 1 + NOP + blez I, .L16 + NOP + + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + LD a3, 2 * SIZE(AO1) + LD a4, 3 * SIZE(AO1) + + LD a5, 0 * SIZE(AO2) + LD a6, 1 * SIZE(AO2) + LD a7, 2 * SIZE(AO2) + LD a8, 3 * SIZE(AO2) + + LD ysum1, 0 * SIZE(YY) + LD ysum2, 1 * SIZE(YY) + + MADD ysum1, ysum1, alpha1, a1 + MADD ysum2, ysum2, alpha2, a1 + MADD xsum1, xsum1, x1, a1 + MADD xsum2, xsum2, x2, a1 + + MADD xsum3, xsum3, x1, a5 + MADD xsum4, xsum4, x2, a5 + + NMSUB ysum1, ysum1, alpha2, a2 + MADD ysum2, ysum2, alpha1, a2 + ADD1 xsum1, xsum1, x2, a2 + ADD2 xsum2, xsum2, x1, a2 + + ADD1 xsum3, xsum3, x2, a6 + ADD2 xsum4, xsum4, x1, a6 + + MADD ysum1, ysum1, alpha3, a5 + MADD ysum2, ysum2, alpha4, a5 + + NMSUB ysum1, ysum1, alpha4, a6 + MADD ysum2, ysum2, alpha3, a6 + + daddiu XX, XX, 2 * SIZE + daddiu YY, YY, 2 * SIZE + daddiu AO1, AO1, 2 * SIZE + daddiu AO2, AO2, 2 * SIZE + + ST ysum1, -2 * SIZE(YY) + ST ysum2, -1 * SIZE(YY) + .align 3 + +.L16: + dsll TEMP, IS, ZBASE_SHIFT + daddu TEMP, Y1, TEMP + + LD ysum1, 0 * SIZE(TEMP) + LD ysum2, 1 * SIZE(TEMP) + LD ysum3, 2 * SIZE(TEMP) + LD ysum4, 3 * SIZE(TEMP) + + MADD ysum1, ysum1, ALPHA_R, xsum1 + MADD ysum2, ysum2, ALPHA_I, xsum1 + MADD ysum3, ysum3, ALPHA_R, xsum3 + MADD ysum4, ysum4, ALPHA_I, xsum3 + + NMSUB ysum1, ysum1, ALPHA_I, xsum2 + MADD ysum2, ysum2, ALPHA_R, xsum2 + NMSUB ysum3, ysum3, ALPHA_I, xsum4 + MADD ysum4, ysum4, ALPHA_R, xsum4 + + ST ysum1, 0 * SIZE(TEMP) + ST ysum2, 1 * SIZE(TEMP) + ST ysum3, 2 * SIZE(TEMP) + ST ysum4, 3 * SIZE(TEMP) + + daddiu TEMP, IS, 4 + slt TEMP, M, TEMP + + beqz TEMP, .L11 + daddiu IS, IS, 2 + .align 3 + +.L20: + andi TEMP, M, 1 + nop + blez TEMP, .L900 + nop + + dsll TEMP, IS, ZBASE_SHIFT + nop + + daddu XX, X, TEMP + daddu YY, Y1, TEMP + + LD alpha1, 0 * SIZE(XX) + LD alpha2, 1 * SIZE(XX) + + LD a1, 0 * SIZE(A) + LD a2, 1 * SIZE(A) + + MUL xsum1, alpha1, a1 + LD ysum1, 0 * SIZE(YY) + MUL xsum2, alpha2, a1 + LD ysum2, 1 * SIZE(YY) + +#ifndef HEMV + NMSUB xsum1, xsum1, alpha2, a2 + MADD xsum2, xsum2, alpha1, a2 +#endif + + MOV x1, alpha1 + MOV x2, alpha2 + + MUL alpha1, ALPHA_R, alpha1 + MUL alpha2, ALPHA_R, alpha2 + + NMSUB alpha1, alpha1, ALPHA_I, x2 + MADD alpha2, alpha2, ALPHA_I, x1 + + MADD ysum1, ysum1, ALPHA_R, xsum1 + MADD ysum2, ysum2, ALPHA_I, xsum1 + NMSUB ysum1, ysum1, ALPHA_I, xsum2 + MADD ysum2, ysum2, ALPHA_R, xsum2 + + ST ysum1, 0 * SIZE(YY) + ST ysum2, 1 * SIZE(YY) + .align 3 + +.L900: + li IS, 2 * SIZE + NOP + + beq INCY, IS, .L999 + dsra I, M, 2 + + blez I, .L905 + NOP + .align 3 + +.L902: + LD a1, 0 * SIZE(Y1) + LD a2, 1 * SIZE(Y1) + LD a3, 2 * SIZE(Y1) + LD a4, 3 * SIZE(Y1) + LD a5, 4 * SIZE(Y1) + LD a6, 5 * SIZE(Y1) + LD a7, 6 * SIZE(Y1) + LD a8, 7 * SIZE(Y1) + + ST a1, 0 * SIZE(Y) + ST a2, 1 * SIZE(Y) + daddu Y, Y, INCY + ST a3, 0 * SIZE(Y) + ST a4, 1 * SIZE(Y) + daddu Y, Y, INCY + ST a5, 0 * SIZE(Y) + ST a6, 1 * SIZE(Y) + daddu Y, Y, INCY + ST a7, 0 * SIZE(Y) + ST a8, 1 * SIZE(Y) + daddu Y, Y, INCY + + daddiu I, I, -1 + + bgtz I, .L902 + daddiu Y1, Y1, 8 * SIZE + .align 3 + +.L905: + andi I, M, 3 + blez I, .L999 + NOP + .align 3 + +.L906: + LD a1, 0 * SIZE(Y1) + LD a2, 1 * SIZE(Y1) + daddiu Y1, Y1, 2 * SIZE + + ST a1, 0 * SIZE(Y) + ST a2, 1 * SIZE(Y) + daddiu I, I, -1 + + bgtz I, .L906 + daddu Y, Y, INCY + .align 3 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + LDARG $18, 16($sp) + LDARG $19, 24($sp) + ldc1 $f24, 32($sp) + ldc1 $f25, 40($sp) + +#ifndef __64BIT__ + ldc1 $f20, 48($sp) + ldc1 $f21, 56($sp) + ldc1 $f22, 64($sp) + ldc1 $f23, 72($sp) +#endif + + j $31 +#ifdef __64BIT__ + daddiu $sp, $sp, 64 +#else + daddiu $sp, $sp, 80 +#endif + + EPILOGUE diff --git a/kernel/mips64/zsymv_U.S b/kernel/mips64/zsymv_U.S new file mode 100644 index 0000000..938d911 --- /dev/null +++ b/kernel/mips64/zsymv_U.S @@ -0,0 +1,717 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define A $7 +#define LDA $8 +#define X $9 +#define INCX $10 +#define Y $11 +#define INCY $5 +#define BUFFER $6 + +#define XX $12 +#define YY $13 + +#define I $14 +#define IS $15 + +#define AO1 $16 +#define AO2 $17 + +#define Y1 $18 +#define TEMP $19 + +#define ALPHA_R $f13 +#define ALPHA_I $f14 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + +#define alpha1 $f8 +#define alpha2 $f9 +#define alpha3 $f10 +#define alpha4 $f11 + +#define x1 $f12 +#define x2 $f15 +#define x3 $f16 +#define x4 $f17 + +#define xsum1 $f18 +#define xsum2 $f19 +#define xsum3 $f20 +#define xsum4 $f21 + +#define ysum1 $f22 +#define ysum2 $f23 +#define ysum3 $f24 +#define ysum4 $f25 + +#ifndef HEMV +#define ADD1 NMSUB +#define ADD2 MADD +#else +#define ADD1 MADD +#define ADD2 NMSUB +#endif + + PROLOGUE + + LDARG INCY, 0($sp) + LDARG BUFFER, 8($sp) +#ifdef __64BIT__ + daddiu $sp, $sp, -64 +#else + daddiu $sp, $sp, -80 +#endif + + SDARG $16, 0($sp) + dsll LDA, LDA, ZBASE_SHIFT + SDARG $17, 8($sp) + dsll INCX, INCX, ZBASE_SHIFT + SDARG $18, 16($sp) + dsll INCY, INCY, ZBASE_SHIFT + SDARG $19, 24($sp) + nop + + sdc1 $f24, 32($sp) + sdc1 $f25, 40($sp) + +#ifndef __64BIT__ + sdc1 $f20, 48($sp) + sdc1 $f21, 56($sp) + sdc1 $f22, 64($sp) + sdc1 $f23, 72($sp) +#endif + + blez M, .L999 + li IS, 2 * SIZE + + beq IS, INCX, .L05 + move Y1, Y + + dsra I, M, 2 + move XX, X + + blez I, .L02 + move X, BUFFER + .align 3 + +.L01: + LD a1, 0 * SIZE(XX) + LD a2, 1 * SIZE(XX) + daddu XX, XX, INCX + LD a3, 0 * SIZE(XX) + LD a4, 1 * SIZE(XX) + daddu XX, XX, INCX + LD a5, 0 * SIZE(XX) + LD a6, 1 * SIZE(XX) + daddu XX, XX, INCX + LD a7, 0 * SIZE(XX) + LD a8, 1 * SIZE(XX) + daddu XX, XX, INCX + + ST a1, 0 * SIZE(BUFFER) + ST a2, 1 * SIZE(BUFFER) + ST a3, 2 * SIZE(BUFFER) + ST a4, 3 * SIZE(BUFFER) + ST a5, 4 * SIZE(BUFFER) + ST a6, 5 * SIZE(BUFFER) + ST a7, 6 * SIZE(BUFFER) + ST a8, 7 * SIZE(BUFFER) + + daddiu I, I, -1 + + bgtz I, .L01 + daddiu BUFFER, BUFFER, 8 * SIZE + .align 3 + +.L02: + andi I, M, 3 + blez I, .L05 + NOP + .align 3 + +.L03: + LD a1, 0 * SIZE(XX) + LD a2, 1 * SIZE(XX) + daddu XX, XX, INCX + + ST a1, 0 * SIZE(BUFFER) + ST a2, 1 * SIZE(BUFFER) + daddiu I, I, -1 + + bgtz I, .L03 + daddiu BUFFER, BUFFER, 2 * SIZE + .align 3 + +.L05: + beq IS, INCY, .L10 + daddiu BUFFER, BUFFER, 255 + + li TEMP, -256 + and BUFFER, BUFFER, TEMP + + dsra I, M, 2 + move Y1, BUFFER + + blez I, .L07 + move YY, Y + .align 3 + +.L06: + LD a1, 0 * SIZE(YY) + LD a2, 1 * SIZE(YY) + daddu YY, YY, INCY + LD a3, 0 * SIZE(YY) + LD a4, 1 * SIZE(YY) + daddu YY, YY, INCY + LD a5, 0 * SIZE(YY) + LD a6, 1 * SIZE(YY) + daddu YY, YY, INCY + LD a7, 0 * SIZE(YY) + LD a8, 1 * SIZE(YY) + daddu YY, YY, INCY + + ST a1, 0 * SIZE(BUFFER) + ST a2, 1 * SIZE(BUFFER) + ST a3, 2 * SIZE(BUFFER) + ST a4, 3 * SIZE(BUFFER) + ST a5, 4 * SIZE(BUFFER) + ST a6, 5 * SIZE(BUFFER) + ST a7, 6 * SIZE(BUFFER) + ST a8, 7 * SIZE(BUFFER) + daddiu I, I, -1 + + bgtz I, .L06 + daddiu BUFFER, BUFFER, 8 * SIZE + .align 3 + +.L07: + andi I, M, 3 + blez I, .L10 + NOP + .align 3 + +.L08: + LD a1, 0 * SIZE(YY) + LD a2, 1 * SIZE(YY) + daddu YY, YY, INCY + + ST a1, 0 * SIZE(BUFFER) + ST a2, 1 * SIZE(BUFFER) + daddiu I, I, -1 + + bgtz I, .L08 + daddiu BUFFER, BUFFER, 2 * SIZE + .align 3 + +.L10: + slti TEMP, M, 2 + nop + + bgtz TEMP, .L20 + li IS, 0 + .align 3 + +.L11: + dsll TEMP, IS, ZBASE_SHIFT + daddu TEMP, X, TEMP + + LD x1, 0 * SIZE(TEMP) + LD x2, 1 * SIZE(TEMP) + LD x3, 2 * SIZE(TEMP) + LD x4, 3 * SIZE(TEMP) + + MTC $0, xsum1 + MTC $0, xsum2 + MTC $0, xsum3 + MTC $0, xsum4 + + MUL alpha1, ALPHA_R, x1 + move AO1, A + MUL alpha2, ALPHA_I, x1 + dsra I, IS, 1 + MUL alpha3, ALPHA_R, x3 + daddu AO2, A, LDA + MUL alpha4, ALPHA_I, x3 + daddu A, AO2, LDA + + NMSUB alpha1, alpha1, ALPHA_I, x2 + move XX, X + MADD alpha2, alpha2, ALPHA_R, x2 + move YY, Y1 + NMSUB alpha3, alpha3, ALPHA_I, x4 + MADD alpha4, alpha4, ALPHA_R, x4 + + blez I, .L15 + daddiu I, I, -1 + + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + LD x4, 3 * SIZE(XX) + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + LD a3, 2 * SIZE(AO1) + LD a4, 3 * SIZE(AO1) + + LD a5, 0 * SIZE(AO2) + LD a6, 1 * SIZE(AO2) + LD a7, 2 * SIZE(AO2) + LD a8, 3 * SIZE(AO2) + + LD ysum1, 0 * SIZE(YY) + + blez I, .L13 + LD ysum2, 1 * SIZE(YY) + .align 3 + +.L12: + MADD ysum1, ysum1, alpha1, a1 + LD ysum3, 2 * SIZE(YY) + MADD ysum2, ysum2, alpha2, a1 + LD ysum4, 3 * SIZE(YY) + MADD xsum1, xsum1, x1, a1 + LD a8, 3 * SIZE(AO2) + MADD xsum2, xsum2, x2, a1 + LD a1, 4 * SIZE(AO1) + + MADD ysum3, ysum3, alpha1, a3 + LD x3, 2 * SIZE(XX) + MADD ysum4, ysum4, alpha2, a3 + daddiu I, I, -1 + MADD xsum3, xsum3, x1, a5 + MADD xsum4, xsum4, x2, a5 + + NMSUB ysum1, ysum1, alpha2, a2 + MADD ysum2, ysum2, alpha1, a2 + ADD1 xsum1, xsum1, x2, a2 + daddiu AO2, AO2, 4 * SIZE + ADD2 xsum2, xsum2, x1, a2 + LD a2, 5 * SIZE(AO1) + + NMSUB ysum3, ysum3, alpha2, a4 + MADD ysum4, ysum4, alpha1, a4 + ADD1 xsum3, xsum3, x2, a6 + LD x2, 5 * SIZE(XX) + ADD2 xsum4, xsum4, x1, a6 + LD x1, 4 * SIZE(XX) + + MADD ysum1, ysum1, alpha3, a5 + MADD ysum2, ysum2, alpha4, a5 + MADD xsum1, xsum1, x3, a3 + LD a5, 0 * SIZE(AO2) + MADD xsum2, xsum2, x4, a3 + LD a3, 6 * SIZE(AO1) + + MADD ysum3, ysum3, alpha3, a7 + MADD ysum4, ysum4, alpha4, a7 + MADD xsum3, xsum3, x3, a7 + daddiu AO1, AO1, 4 * SIZE + MADD xsum4, xsum4, x4, a7 + LD a7, 2 * SIZE(AO2) + + NMSUB ysum1, ysum1, alpha4, a6 + daddiu XX, XX, 4 * SIZE + MADD ysum2, ysum2, alpha3, a6 + LD a6, 1 * SIZE(AO2) + ADD1 xsum1, xsum1, x4, a4 + daddiu YY, YY, 4 * SIZE + ADD2 xsum2, xsum2, x3, a4 + LD a4, 3 * SIZE(AO1) + + NMSUB ysum3, ysum3, alpha4, a8 + ST ysum1,-4 * SIZE(YY) + MADD ysum4, ysum4, alpha3, a8 + ST ysum2,-3 * SIZE(YY) + + LD ysum1, 0 * SIZE(YY) + LD ysum2, 1 * SIZE(YY) + + ADD1 xsum3, xsum3, x4, a8 + LD x4, 3 * SIZE(XX) + ADD2 xsum4, xsum4, x3, a8 + + ST ysum3,-2 * SIZE(YY) + bgtz I, .L12 + ST ysum4,-1 * SIZE(YY) + .align 3 + +.L13: + MADD ysum1, ysum1, alpha1, a1 + LD ysum3, 2 * SIZE(YY) + MADD ysum2, ysum2, alpha2, a1 + LD ysum4, 3 * SIZE(YY) + MADD xsum1, xsum1, x1, a1 + LD a8, 3 * SIZE(AO2) + MADD xsum2, xsum2, x2, a1 + LD x3, 2 * SIZE(XX) + + MADD ysum3, ysum3, alpha1, a3 + MADD ysum4, ysum4, alpha2, a3 + MADD xsum3, xsum3, x1, a5 + MADD xsum4, xsum4, x2, a5 + + NMSUB ysum1, ysum1, alpha2, a2 + MADD ysum2, ysum2, alpha1, a2 + ADD1 xsum1, xsum1, x2, a2 + ADD2 xsum2, xsum2, x1, a2 + + NMSUB ysum3, ysum3, alpha2, a4 + MADD ysum4, ysum4, alpha1, a4 + ADD1 xsum3, xsum3, x2, a6 + ADD2 xsum4, xsum4, x1, a6 + + MADD ysum1, ysum1, alpha3, a5 + MADD ysum2, ysum2, alpha4, a5 + MADD xsum1, xsum1, x3, a3 + MADD xsum2, xsum2, x4, a3 + + MADD ysum3, ysum3, alpha3, a7 + MADD ysum4, ysum4, alpha4, a7 + MADD xsum3, xsum3, x3, a7 + MADD xsum4, xsum4, x4, a7 + + NMSUB ysum1, ysum1, alpha4, a6 + MADD ysum2, ysum2, alpha3, a6 + ADD1 xsum1, xsum1, x4, a4 + ADD2 xsum2, xsum2, x3, a4 + + NMSUB ysum3, ysum3, alpha4, a8 + daddiu XX, XX, 4 * SIZE + MADD ysum4, ysum4, alpha3, a8 + daddiu YY, YY, 4 * SIZE + ADD1 xsum3, xsum3, x4, a8 + daddiu AO1, AO1, 4 * SIZE + ADD2 xsum4, xsum4, x3, a8 + daddiu AO2, AO2, 4 * SIZE + + ST ysum1, -4 * SIZE(YY) + ST ysum2, -3 * SIZE(YY) + ST ysum3, -2 * SIZE(YY) + ST ysum4, -1 * SIZE(YY) + .align 3 + +.L15: + dsll TEMP, IS, ZBASE_SHIFT + daddu TEMP, Y1, TEMP + + LD ysum1, 0 * SIZE(TEMP) + LD ysum2, 1 * SIZE(TEMP) + LD ysum3, 2 * SIZE(TEMP) + LD ysum4, 3 * SIZE(TEMP) + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + LD a3, 2 * SIZE(AO1) + LD a4, 3 * SIZE(AO1) + + LD a5, 0 * SIZE(AO2) + LD a6, 1 * SIZE(AO2) + LD a7, 2 * SIZE(AO2) + LD a8, 3 * SIZE(AO2) + + MOV x1, xsum1 + MOV x2, xsum2 + MOV x3, xsum3 + MOV x4, xsum4 + + MUL xsum1, ALPHA_R, xsum1 + MUL xsum2, ALPHA_R, xsum2 + MUL xsum3, ALPHA_R, xsum3 + MUL xsum4, ALPHA_R, xsum4 + + NMSUB xsum1, xsum1, ALPHA_I, x2 + MADD xsum2, xsum2, ALPHA_I, x1 + NMSUB xsum3, xsum3, ALPHA_I, x4 + MADD xsum4, xsum4, ALPHA_I, x3 + + MADD xsum1, xsum1, alpha1, a1 + MADD xsum2, xsum2, alpha2, a1 + MADD xsum3, xsum3, alpha1, a5 + MADD xsum4, xsum4, alpha2, a5 + +#ifndef HEMV + ADD1 xsum1, xsum1, alpha2, a2 + ADD2 xsum2, xsum2, alpha1, a2 +#endif + ADD1 xsum3, xsum3, alpha2, a6 + ADD2 xsum4, xsum4, alpha1, a6 + + MADD xsum1, xsum1, alpha3, a5 + MADD xsum2, xsum2, alpha4, a5 + MADD xsum3, xsum3, alpha3, a7 + MADD xsum4, xsum4, alpha4, a7 + + NMSUB xsum1, xsum1, alpha4, a6 + MADD xsum2, xsum2, alpha3, a6 +#ifndef HEMV + ADD1 xsum3, xsum3, alpha4, a8 + ADD2 xsum4, xsum4, alpha3, a8 +#endif + + ADD ysum1, ysum1, xsum1 + ADD ysum2, ysum2, xsum2 + ADD ysum3, ysum3, xsum3 + ADD ysum4, ysum4, xsum4 + + ST ysum1, 0 * SIZE(TEMP) + ST ysum2, 1 * SIZE(TEMP) + ST ysum3, 2 * SIZE(TEMP) + ST ysum4, 3 * SIZE(TEMP) + + daddiu TEMP, IS, 4 + slt TEMP, M, TEMP + + beqz TEMP, .L11 + daddiu IS, IS, 2 + .align 3 + +.L20: + andi TEMP, M, 1 + nop + blez TEMP, .L900 + nop + + dsll TEMP, IS, ZBASE_SHIFT + daddu TEMP, X, TEMP + + LD x1, 0 * SIZE(TEMP) + LD x2, 1 * SIZE(TEMP) + + MTC $0, xsum1 + MTC $0, xsum2 + + MUL alpha1, ALPHA_R, x1 + move AO1, A + MUL alpha2, ALPHA_I, x1 + move I, IS + daddu A, AO1, LDA + + NMSUB alpha1, alpha1, ALPHA_I, x2 + move XX, X + MADD alpha2, alpha2, ALPHA_R, x2 + move YY, Y1 + + blez I, .L25 + daddiu I, I, -1 + + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + + LD ysum1, 0 * SIZE(YY) + blez I, .L23 + LD ysum2, 1 * SIZE(YY) + .align 3 + +.L22: + MADD ysum1, ysum1, alpha1, a1 + daddiu XX, XX, 2 * SIZE + MADD ysum2, ysum2, alpha2, a1 + daddiu YY, YY, 2 * SIZE + MADD xsum1, xsum1, x1, a1 + daddiu AO1, AO1, 2 * SIZE + MADD xsum2, xsum2, x2, a1 + daddiu I, I, -1 + + NMSUB ysum1, ysum1, alpha2, a2 + MADD ysum2, ysum2, alpha1, a2 + ADD1 xsum1, xsum1, x2, a2 + LD x2, 1 * SIZE(XX) + ADD2 xsum2, xsum2, x1, a2 + LD x1, 0 * SIZE(XX) + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + + ST ysum1, -2 * SIZE(YY) + LD ysum1, 0 * SIZE(YY) + ST ysum2, -1 * SIZE(YY) + bgtz I, .L22 + LD ysum2, 1 * SIZE(YY) + .align 3 + +.L23: + MADD ysum1, ysum1, alpha1, a1 + MADD ysum2, ysum2, alpha2, a1 + MADD xsum1, xsum1, x1, a1 + MADD xsum2, xsum2, x2, a1 + + NMSUB ysum1, ysum1, alpha2, a2 + daddiu XX, XX, 2 * SIZE + MADD ysum2, ysum2, alpha1, a2 + daddiu YY, YY, 2 * SIZE + ADD1 xsum1, xsum1, x2, a2 + daddiu AO1, AO1, 2 * SIZE + ADD2 xsum2, xsum2, x1, a2 + nop + + ST ysum1, -2 * SIZE(YY) + ST ysum2, -1 * SIZE(YY) + .align 3 + +.L25: + dsll TEMP, IS, ZBASE_SHIFT + daddu TEMP, Y1, TEMP + + LD ysum1, 0 * SIZE(TEMP) + LD ysum2, 1 * SIZE(TEMP) + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + + MOV x1, xsum1 + MOV x2, xsum2 + + MUL xsum1, ALPHA_R, xsum1 + MUL xsum2, ALPHA_R, xsum2 + + NMSUB xsum1, xsum1, ALPHA_I, x2 + MADD xsum2, xsum2, ALPHA_I, x1 + + MADD xsum1, xsum1, alpha1, a1 + MADD xsum2, xsum2, alpha2, a1 + +#ifndef HEMV + NMSUB xsum1, xsum1, alpha2, a2 + MADD xsum2, xsum2, alpha1, a2 +#endif + + ADD ysum1, ysum1, xsum1 + ADD ysum2, ysum2, xsum2 + + ST ysum1, 0 * SIZE(TEMP) + ST ysum2, 1 * SIZE(TEMP) + .align 3 + +.L900: + li IS, 2 * SIZE + + beq INCY, IS, .L999 + NOP + + dsra I, M, 2 + blez I, .L905 + NOP + .align 3 + +.L902: + LD a1, 0 * SIZE(Y1) + LD a2, 1 * SIZE(Y1) + LD a3, 2 * SIZE(Y1) + LD a4, 3 * SIZE(Y1) + LD a5, 4 * SIZE(Y1) + LD a6, 5 * SIZE(Y1) + LD a7, 6 * SIZE(Y1) + LD a8, 7 * SIZE(Y1) + + ST a1, 0 * SIZE(Y) + ST a2, 1 * SIZE(Y) + daddu Y, Y, INCY + ST a3, 0 * SIZE(Y) + ST a4, 1 * SIZE(Y) + daddu Y, Y, INCY + ST a5, 0 * SIZE(Y) + ST a6, 1 * SIZE(Y) + daddu Y, Y, INCY + ST a7, 0 * SIZE(Y) + ST a8, 1 * SIZE(Y) + daddu Y, Y, INCY + + daddiu I, I, -1 + + bgtz I, .L902 + daddiu Y1, Y1, 8 * SIZE + .align 3 + +.L905: + andi I, M, 3 + blez I, .L999 + NOP + .align 3 + +.L906: + LD a1, 0 * SIZE(Y1) + LD a2, 1 * SIZE(Y1) + daddiu Y1, Y1, 2 * SIZE + + ST a1, 0 * SIZE(Y) + ST a2, 1 * SIZE(Y) + daddiu I, I, -1 + + bgtz I, .L906 + daddu Y, Y, INCY + .align 3 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + LDARG $18, 16($sp) + LDARG $19, 24($sp) + ldc1 $f24, 32($sp) + ldc1 $f25, 40($sp) + +#ifndef __64BIT__ + ldc1 $f20, 48($sp) + ldc1 $f21, 56($sp) + ldc1 $f22, 64($sp) + ldc1 $f23, 72($sp) +#endif + + j $31 +#ifdef __64BIT__ + daddiu $sp, $sp, 64 +#else + daddiu $sp, $sp, 80 +#endif + + EPILOGUE diff --git a/kernel/mips64/ztrsm_kernel_LT.S b/kernel/mips64/ztrsm_kernel_LT.S new file mode 100644 index 0000000..0e70118 --- /dev/null +++ b/kernel/mips64/ztrsm_kernel_LT.S @@ -0,0 +1,1685 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define N $5 +#define K $6 +#define A $9 +#define B $10 +#define C $11 +#define LDC $8 + +#define AO $12 +#define BO $13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 + +#define OFFSET $18 +#define KK $19 +#define TEMP $20 +#define AORIG $21 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f26 +#define a4 $f27 + +#define b1 $f2 +#define b2 $f3 +#define b3 $f4 +#define b4 $f5 +#define b5 $f6 +#define b6 $f7 +#define b7 $f8 +#define b8 $f9 + +#define a5 b8 + +#define c11 $f10 +#define c12 $f11 +#define c21 $f12 +#define c22 $f13 +#define c31 $f14 +#define c32 $f15 +#define c41 $f16 +#define c42 $f17 +#define c51 $f18 +#define c52 $f19 +#define c61 $f20 +#define c62 $f21 +#define c71 $f22 +#define c72 $f23 +#define c81 $f24 +#define c82 $f25 + +#ifndef CONJ +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#define MADD5 MSUB +#define MADD6 MADD +#define MADD7 NMSUB +#define MADD8 MADD +#else +#if defined(LN) || defined(LT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#else +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif +#define MADD5 MADD +#define MADD6 MSUB +#define MADD7 MADD +#define MADD8 NMSUB +#endif + + PROLOGUE + + daddiu $sp, $sp, -128 + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + SDARG $18, 16($sp) + SDARG $19, 24($sp) + SDARG $20, 32($sp) + SDARG $21, 40($sp) + + sdc1 $f24, 48($sp) + sdc1 $f25, 56($sp) + sdc1 $f26, 64($sp) + sdc1 $f27, 72($sp) + +#ifndef __64BIT__ + sdc1 $f20, 88($sp) + sdc1 $f21, 96($sp) + sdc1 $f22,104($sp) + sdc1 $f23,112($sp) +#endif + + LDARG LDC, 128 + 0($sp) + LDARG OFFSET, 128 + 8($sp) + + dsll LDC, LDC, ZBASE_SHIFT + +#ifdef LN + mult M, K + mflo TEMP + + dsll TEMP, TEMP, ZBASE_SHIFT + daddu A, A, TEMP + + dsll TEMP, M, ZBASE_SHIFT + daddu C, C, TEMP +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mult N, K + mflo TEMP + + dsll TEMP, TEMP, ZBASE_SHIFT + daddu B, B, TEMP + + mult N, LDC + mflo TEMP + daddu C, C, TEMP + + dsubu KK, N, OFFSET +#endif + + dsra J, N, 2 + blez J, .L20 + nop + +.L10: +#ifdef RT + dsll TEMP, K, 2 + ZBASE_SHIFT + dsubu B, B, TEMP + + dsll TEMP, LDC, 2 + dsubu C, C, TEMP +#endif + + move CO1, C + MTC $0, c11 + daddu CO2, C, LDC + daddu CO3, CO2, LDC + daddiu J, J, -1 + daddu CO4, CO3, LDC + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 + move I, M + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO4, LDC +#endif + + blez I, .L19 + MOV c61, c11 + .align 3 + +.L11: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(B) + MOV c81, c11 + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + + dsra L, KK, 2 + MOV c32, c11 + LD b3, 2 * SIZE(B) + MOV c42, c11 + + LD b4, 3 * SIZE(B) + MOV c52, c11 + LD b5, 4 * SIZE(B) + MOV c62, c11 + + LD b6, 8 * SIZE(B) + MOV c72, c11 + LD b7, 12 * SIZE(B) + MOV c82, c11 + + blez L, .L15 + move BO, B +#else +#ifdef LN + dsll TEMP, K, ZBASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, ZBASE_SHIFT + dsll TEMP, KK, 2 + ZBASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(BO) + MOV c81, c11 + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + + dsra L, TEMP, 2 + MOV c32, c11 + LD b3, 2 * SIZE(BO) + MOV c42, c11 + + LD b4, 3 * SIZE(BO) + MOV c52, c11 + LD b5, 4 * SIZE(BO) + MOV c62, c11 + + LD b6, 8 * SIZE(BO) + MOV c72, c11 + LD b7, 12 * SIZE(BO) + MOV c82, c11 + + blez L, .L15 + NOP +#endif + + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + daddiu L, L, -1 + MADD1 c31, c31, a1, b3 + NOP + blez L, .L13 + MADD3 c41, c41, a1, b4 + .align 3 + +.L12: + MADD2 c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD1 c51, c51, a1, b5 + NOP + MADD3 c61, c61, a1, b2 + LD a4, 2 * SIZE(AO) + MADD1 c71, c71, a1, b3 + NOP + MADD3 c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD1 c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD3 c21, c21, a4, b2 + NOP + MADD1 c31, c31, a4, b3 + NOP + MADD3 c41, c41, a4, b4 + NOP + + MADD2 c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD1 c51, c51, a4, b7 + NOP + MADD3 c61, c61, a4, b2 + NOP + MADD1 c71, c71, a4, b3 + NOP + MADD3 c81, c81, a4, b4 + NOP + + MADD2 c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD1 c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD3 c21, c21, a3, b2 + NOP + MADD1 c31, c31, a3, b3 + NOP + MADD3 c41, c41, a3, b4 + NOP + + MADD2 c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD1 c51, c51, a3, b5 + NOP + MADD3 c61, c61, a3, b2 + LD a4, 6 * SIZE(AO) + MADD1 c71, c71, a3, b3 + NOP + MADD3 c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD1 c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD3 c21, c21, a4, b2 + NOP + MADD1 c31, c31, a4, b3 + NOP + MADD3 c41, c41, a4, b4 + daddiu L, L, -1 + + MADD2 c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD1 c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD3 c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD1 c71, c71, a4, b3 + NOP + MADD3 c81, c81, a4, b4 + NOP + + MADD2 c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + NOP + MADD1 c31, c31, a1, b3 + NOP + bgtz L, .L12 + MADD3 c41, c41, a1, b4 + .align 3 + +.L13: + MADD2 c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD1 c51, c51, a1, b5 + NOP + MADD3 c61, c61, a1, b2 + LD a4, 2 * SIZE(AO) + MADD1 c71, c71, a1, b3 + NOP + MADD3 c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD1 c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD3 c21, c21, a4, b2 + NOP + MADD1 c31, c31, a4, b3 + NOP + MADD3 c41, c41, a4, b4 + NOP + + MADD2 c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD1 c51, c51, a4, b7 + NOP + MADD3 c61, c61, a4, b2 + NOP + MADD1 c71, c71, a4, b3 + NOP + MADD3 c81, c81, a4, b4 + NOP + + MADD2 c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD1 c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD3 c21, c21, a3, b2 + NOP + MADD1 c31, c31, a3, b3 + NOP + MADD3 c41, c41, a3, b4 + NOP + + MADD2 c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD1 c51, c51, a3, b5 + NOP + MADD3 c61, c61, a3, b2 + LD a4, 6 * SIZE(AO) + MADD1 c71, c71, a3, b3 + NOP + MADD3 c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD1 c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD3 c21, c21, a4, b2 + NOP + MADD1 c31, c31, a4, b3 + NOP + MADD3 c41, c41, a4, b4 + NOP + + MADD2 c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD1 c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD3 c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD1 c71, c71, a4, b3 + NOP + MADD3 c81, c81, a4, b4 + NOP + + MADD2 c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + blez L, .L18 + NOP + .align 3 + +.L16: + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + NOP + MADD1 c31, c31, a1, b3 + NOP + MADD3 c41, c41, a1, b4 + NOP + + MADD2 c12, c12, a2, b1 + LD b1, 8 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD1 c51, c51, a1, b5 + daddiu L, L, -1 + MADD3 c61, c61, a1, b2 + daddiu AO, AO, 2 * SIZE + MADD1 c71, c71, a1, b3 + daddiu BO, BO, 8 * SIZE + MADD3 c81, c81, a1, b4 + LD a1, 0 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 4 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD4 c82, c82, a2, b4 + bgtz L, .L16 + LD b4, 3 * SIZE(BO) + +.L18: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 + + ADD c51, c51, c62 + ADD c52, c52, c61 + ADD c71, c71, c82 + ADD c72, c72, c81 + +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -4 +#endif + + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 2 + ZBASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 + +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + LD b5, 4 * SIZE(AO) + LD b6, 5 * SIZE(AO) + LD b7, 6 * SIZE(AO) + LD b8, 7 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + + MADD5 c11, a1, b1, c11 + MADD6 c12, a2, b1, c12 + MADD5 c31, a3, b1, c31 + MADD6 c32, a4, b1, c32 + + MUL a1, b2, c52 + MUL a2, b2, c51 + MUL a3, b2, c72 + MUL a4, b2, c71 + + MADD5 c51, a1, b1, c51 + MADD6 c52, a2, b1, c52 + MADD5 c71, a3, b1, c71 + MADD6 c72, a4, b1, c72 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MUL a1, b2, c12 + MUL a2, b2, c11 + + MADD5 c11, a1, b1, c11 + MADD6 c12, a2, b1, c12 + + NMSUB c31, c31, b3, c11 + MADD7 c32, c32, b4, c11 + NMSUB c51, c51, b5, c11 + MADD7 c52, c52, b6, c11 + NMSUB c71, c71, b7, c11 + MADD7 c72, c72, b8, c11 + + MADD8 c31, c31, b4, c12 + NMSUB c32, c32, b3, c12 + MADD8 c51, c51, b6, c12 + NMSUB c52, c52, b5, c12 + MADD8 c71, c71, b8, c12 + NMSUB c72, c72, b7, c12 + + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MUL a1, b4, c32 + MUL a2, b4, c31 + + MADD5 c31, a1, b3, c31 + MADD6 c32, a2, b3, c32 + + NMSUB c51, c51, b5, c31 + MADD7 c52, c52, b6, c31 + NMSUB c71, c71, b7, c31 + MADD7 c72, c72, b8, c31 + + MADD8 c51, c51, b6, c32 + NMSUB c52, c52, b5, c32 + MADD8 c71, c71, b8, c32 + NMSUB c72, c72, b7, c32 + + LD b5, 20 * SIZE(BO) + LD b6, 21 * SIZE(BO) + LD b7, 22 * SIZE(BO) + LD b8, 23 * SIZE(BO) + + MUL a1, b6, c52 + MUL a2, b6, c51 + + MADD5 c51, a1, b5, c51 + MADD6 c52, a2, b5, c52 + + NMSUB c71, c71, b7, c51 + MADD7 c72, c72, b8, c51 + + MADD8 c71, c71, b8, c52 + NMSUB c72, c72, b7, c52 + + LD b7, 30 * SIZE(BO) + LD b8, 31 * SIZE(BO) + + MUL a1, b8, c72 + MUL a2, b8, c71 + + MADD5 c71, a1, b7, c71 + MADD6 c72, a2, b7, c72 +#endif + +#ifdef RT + LD b1, 30 * SIZE(BO) + LD b2, 31 * SIZE(BO) + LD b3, 28 * SIZE(BO) + LD b4, 29 * SIZE(BO) + LD b5, 26 * SIZE(BO) + LD b6, 27 * SIZE(BO) + LD b7, 24 * SIZE(BO) + LD b8, 25 * SIZE(BO) + + MUL a1, b2, c72 + MUL a2, b2, c71 + + MADD5 c71, a1, b1, c71 + MADD6 c72, a2, b1, c72 + + NMSUB c51, c51, b3, c71 + MADD7 c52, c52, b4, c71 + NMSUB c31, c31, b5, c71 + MADD7 c32, c32, b6, c71 + NMSUB c11, c11, b7, c71 + MADD7 c12, c12, b8, c71 + + MADD8 c51, c51, b4, c72 + NMSUB c52, c52, b3, c72 + MADD8 c31, c31, b6, c72 + NMSUB c32, c32, b5, c72 + MADD8 c11, c11, b8, c72 + NMSUB c12, c12, b7, c72 + + LD b3, 20 * SIZE(BO) + LD b4, 21 * SIZE(BO) + LD b5, 18 * SIZE(BO) + LD b6, 19 * SIZE(BO) + LD b7, 16 * SIZE(BO) + LD b8, 17 * SIZE(BO) + + MUL a1, b4, c52 + MUL a2, b4, c51 + + MADD5 c51, a1, b3, c51 + MADD6 c52, a2, b3, c52 + + NMSUB c31, c31, b5, c51 + MADD7 c32, c32, b6, c51 + NMSUB c11, c11, b7, c51 + MADD7 c12, c12, b8, c51 + + MADD8 c31, c31, b6, c52 + NMSUB c32, c32, b5, c52 + MADD8 c11, c11, b8, c52 + NMSUB c12, c12, b7, c52 + + LD b5, 10 * SIZE(BO) + LD b6, 11 * SIZE(BO) + LD b7, 8 * SIZE(BO) + LD b8, 9 * SIZE(BO) + + MUL a1, b6, c32 + MUL a2, b6, c31 + + MADD5 c31, a1, b5, c31 + MADD6 c32, a2, b5, c32 + + NMSUB c11, c11, b7, c31 + MADD7 c12, c12, b8, c31 + + MADD8 c11, c11, b8, c32 + NMSUB c12, c12, b7, c32 + + LD b7, 0 * SIZE(BO) + LD b8, 1 * SIZE(BO) + + MUL a1, b8, c12 + MUL a2, b8, c11 + + MADD5 c11, a1, b7, c11 + MADD6 c12, a2, b7, c12 +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c12, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c32, 3 * SIZE(BO) + ST c51, 4 * SIZE(BO) + ST c52, 5 * SIZE(BO) + ST c71, 6 * SIZE(BO) + ST c72, 7 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) + ST c31, 2 * SIZE(AO) + ST c32, 3 * SIZE(AO) + ST c51, 4 * SIZE(AO) + ST c52, 5 * SIZE(AO) + ST c71, 6 * SIZE(AO) + ST c72, 7 * SIZE(AO) +#endif + +#ifdef LN + daddiu CO1,CO1, -2 * SIZE + daddiu CO2,CO2, -2 * SIZE + daddiu CO3,CO3, -2 * SIZE + daddiu CO4,CO4, -2 * SIZE +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + ST c31, 0 * SIZE(CO2) + ST c32, 1 * SIZE(CO2) + ST c51, 0 * SIZE(CO3) + ST c52, 1 * SIZE(CO3) + ST c71, 0 * SIZE(CO4) + ST c72, 1 * SIZE(CO4) + +#ifndef LN + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + daddiu CO3,CO3, 2 * SIZE + daddiu CO4,CO4, 2 * SIZE +#endif + + +#ifdef RT + dsll TEMP, K, ZBASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 2 + ZBASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + + MTC $0, c11 + + daddiu I, I, -1 + + + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 + + bgtz I, .L11 + MOV c61, c11 + .align 3 + +.L19: +#ifdef LN + dsll TEMP, K, 2 + ZBASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 4 +#endif + +#ifdef RT + daddiu KK, KK, -4 +#endif + + bgtz J, .L10 + NOP + .align 3 + +.L20: + andi J, N, 2 + blez J, .L30 + NOP + +#ifdef RT + dsll TEMP, K, 1 + ZBASE_SHIFT + dsubu B, B, TEMP + + dsll TEMP, LDC, 1 + dsubu C, C, TEMP +#endif + + MTC $0, c11 + + move CO1, C + daddu CO2, C, LDC + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO2, LDC +#endif + + move I, M + blez I, .L29 + NOP + .align 3 + +.L21: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MOV c21, c11 + LD b1, 0 * SIZE(B) + MOV c31, c11 + LD a3, 4 * SIZE(AO) + MOV c41, c11 + LD b2, 1 * SIZE(B) + dsra L, KK, 2 + + LD b3, 2 * SIZE(B) + MOV c12, c11 + LD b4, 3 * SIZE(B) + MOV c22, c11 + LD b5, 4 * SIZE(B) + MOV c32, c11 + + NOP + MOV c42, c11 + blez L, .L25 + move BO, B +#else +#ifdef LN + dsll TEMP, K, ZBASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, ZBASE_SHIFT + dsll TEMP, KK, 1 + ZBASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MOV c21, c11 + LD b1, 0 * SIZE(BO) + MOV c31, c11 + LD a3, 4 * SIZE(AO) + MOV c41, c11 + LD b2, 1 * SIZE(BO) + dsra L, TEMP, 2 + + LD b3, 2 * SIZE(BO) + MOV c12, c11 + LD b4, 3 * SIZE(BO) + MOV c22, c11 + LD b5, 4 * SIZE(BO) + MOV c32, c11 + + blez L, .L25 + MOV c42, c11 +#endif + .align 3 + +.L22: + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + daddiu L, L, -1 + MADD1 c31, c31, a1, b3 + NOP + MADD3 c41, c41, a1, b4 + LD a1, 2 * SIZE(AO) + + MADD2 c12, c12, a2, b1 + LD b1, 8 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD1 c11, c11, a1, b5 + LD a2, 3 * SIZE(AO) + MADD3 c21, c21, a1, b2 + NOP + MADD1 c31, c31, a1, b3 + NOP + MADD3 c41, c41, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD2 c12, c12, a2, b5 + LD b5, 12 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 9 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 10 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD1 c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD3 c21, c21, a3, b2 + NOP + MADD1 c31, c31, a3, b3 + NOP + MADD3 c41, c41, a3, b4 + LD a3, 6 * SIZE(AO) + + MADD2 c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD1 c11, c11, a3, b5 + LD a2, 7 * SIZE(AO) + MADD3 c21, c21, a3, b2 + daddiu AO, AO, 8 * SIZE + MADD1 c31, c31, a3, b3 + NOP + MADD3 c41, c41, a3, b4 + LD a3, 4 * SIZE(AO) + + MADD2 c12, c12, a2, b5 + LD b5, 20 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 17 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 18 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 19 * SIZE(BO) + + bgtz L, .L22 + daddiu BO, BO, 16 * SIZE + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + blez L, .L28 + NOP + .align 3 + +.L26: + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + daddiu L, L, -1 + MADD1 c31, c31, a1, b3 + daddiu BO, BO, 4 * SIZE + MADD3 c41, c41, a1, b4 + LD a1, 2 * SIZE(AO) + + MADD2 c12, c12, a2, b1 + LD b1, 0 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 1 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 2 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 3 * SIZE(BO) + + bgtz L, .L26 + daddiu AO, AO, 2 * SIZE + +.L28: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 + +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -2 +#endif + + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + + MADD5 c11, a1, b1, c11 + MADD6 c12, a2, b1, c12 + MADD5 c31, a3, b1, c31 + MADD6 c32, a4, b1, c32 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MUL a1, b2, c12 + MUL a2, b2, c11 + + MADD5 c11, a1, b1, c11 + MADD6 c12, a2, b1, c12 + + NMSUB c31, c31, b3, c11 + MADD7 c32, c32, b4, c11 + + MADD8 c31, c31, b4, c12 + NMSUB c32, c32, b3, c12 + + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + MUL a1, b4, c32 + MUL a2, b4, c31 + + MADD5 c31, a1, b3, c31 + MADD6 c32, a2, b3, c32 +#endif + +#ifdef RT + LD b5, 6 * SIZE(BO) + LD b6, 7 * SIZE(BO) + LD b7, 4 * SIZE(BO) + LD b8, 5 * SIZE(BO) + + MUL a1, b6, c32 + MUL a2, b6, c31 + + MADD5 c31, a1, b5, c31 + MADD6 c32, a2, b5, c32 + + NMSUB c11, c11, b7, c31 + MADD7 c12, c12, b8, c31 + + MADD8 c11, c11, b8, c32 + NMSUB c12, c12, b7, c32 + + LD b7, 0 * SIZE(BO) + LD b8, 1 * SIZE(BO) + + MUL a1, b8, c12 + MUL a2, b8, c11 + + MADD5 c11, a1, b7, c11 + MADD6 c12, a2, b7, c12 +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c12, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c32, 3 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) + ST c31, 2 * SIZE(AO) + ST c32, 3 * SIZE(AO) +#endif + +#ifdef LN + daddiu CO1,CO1, -2 * SIZE + daddiu CO2,CO2, -2 * SIZE +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + ST c31, 0 * SIZE(CO2) + ST c32, 1 * SIZE(CO2) + +#ifndef LN + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE +#endif + + MTC $0, c11 + +#ifdef RT + dsll TEMP, K, ZBASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + + daddiu I, I, -1 + + bgtz I, .L21 + NOP + .align 3 + +.L29: +#ifdef LN + dsll TEMP, K, 1 + ZBASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 2 +#endif + +#ifdef RT + daddiu KK, KK, -2 +#endif + .align 3 + +.L30: + andi J, N, 1 + blez J, .L999 + NOP + + +#ifdef RT + dsll TEMP, K, ZBASE_SHIFT + dsubu B, B, TEMP + + dsubu C, C, LDC +#endif + + MTC $0, c11 + + move CO1, C + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO1, LDC +#endif + + move I, M + blez I, .L39 + NOP + .align 3 + +.L31: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MOV c21, c11 + LD b1, 0 * SIZE(B) + MOV c31, c11 + LD a2, 1 * SIZE(AO) + + MOV c41, c11 + LD b2, 1 * SIZE(B) + MOV c12, c11 + dsra L, KK, 2 + + MOV c22, c11 + LD a3, 4 * SIZE(AO) + MOV c32, c11 + LD b3, 4 * SIZE(B) + + NOP + MOV c42, c11 + blez L, .L35 + move BO, B +#else +#ifdef LN + dsll TEMP, K, ZBASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AORIG, TEMP + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MOV c21, c11 + LD b1, 0 * SIZE(BO) + MOV c31, c11 + LD a2, 1 * SIZE(AO) + + MOV c41, c11 + LD b2, 1 * SIZE(BO) + MOV c12, c11 + dsra L, TEMP, 2 + + MOV c22, c11 + LD a3, 4 * SIZE(AO) + MOV c32, c11 + LD b3, 4 * SIZE(BO) + + blez L, .L35 + MOV c42, c11 +#endif + .align 3 + +.L32: + MADD1 c11, c11, a1, b1 + LD b4, 3 * SIZE(BO) + MADD3 c21, c21, a1, b2 + LD a1, 2 * SIZE(AO) + MADD2 c12, c12, a2, b1 + LD b1, 2 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD a2, 3 * SIZE(AO) + + MADD1 c11, c11, a1, b1 + LD b2, 5 * SIZE(BO) + MADD3 c21, c21, a1, b4 + LD a1, 8 * SIZE(AO) + MADD2 c12, c12, a2, b1 + LD b1, 8 * SIZE(BO) + MADD4 c22, c22, a2, b4 + LD a2, 5 * SIZE(AO) + + MADD1 c11, c11, a3, b3 + LD b4, 7 * SIZE(BO) + MADD3 c21, c21, a3, b2 + LD a3, 6 * SIZE(AO) + MADD2 c12, c12, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD a2, 7 * SIZE(AO) + + MADD1 c11, c11, a3, b3 + LD b2, 9 * SIZE(BO) + MADD3 c21, c21, a3, b4 + LD a3, 12 * SIZE(AO) + MADD2 c12, c12, a2, b3 + LD b3, 12 * SIZE(BO) + MADD4 c22, c22, a2, b4 + LD a2, 9 * SIZE(AO) + + daddiu AO, AO, 8 * SIZE + daddiu L, L, -1 + + bgtz L, .L32 + daddiu BO, BO, 8 * SIZE + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + blez L, .L38 + NOP + .align 3 + +.L36: + MADD1 c11, c11, a1, b1 + daddiu L, L, -1 + MADD3 c21, c21, a1, b2 + LD a1, 2 * SIZE(AO) + MADD2 c12, c12, a2, b1 + LD b1, 2 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD a2, 3 * SIZE(AO) + + LD b2, 3 * SIZE(BO) + daddiu BO, BO, 2 * SIZE + bgtz L, .L36 + daddiu AO, AO, 2 * SIZE + +.L38: + ADD c11, c11, c22 + ADD c12, c12, c21 + +#if defined(LN) || defined(RT) + daddiu TEMP, KK, -1 + + dsll TEMP, TEMP, ZBASE_SHIFT + daddu AO, AORIG, TEMP + daddu BO, B, TEMP +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + + MUL a1, b2, c12 + MUL a2, b2, c11 + + MADD5 c11, a1, b1, c11 + MADD6 c12, a2, b1, c12 +#endif + +#if defined(RN) || defined(RT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + MUL a1, b2, c12 + MUL a2, b2, c11 + + MADD5 c11, a1, b1, c11 + MADD6 c12, a2, b1, c12 +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c12, 1 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) +#endif + +#ifdef LN + daddiu CO1,CO1, -2 * SIZE +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + +#ifndef LN + daddiu CO1,CO1, 2 * SIZE +#endif + + MTC $0, c11 + +#ifdef RT + dsll TEMP, K, ZBASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll TEMP, TEMP, ZBASE_SHIFT + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + + daddiu I, I, -1 + + bgtz I, .L31 + NOP + .align 3 + +.L39: +#ifdef LN + dsll TEMP, K, ZBASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 1 +#endif + +#ifdef RT + daddiu KK, KK, -1 +#endif + .align 3 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + LDARG $18, 16($sp) + LDARG $19, 24($sp) + LDARG $20, 32($sp) + LDARG $21, 40($sp) + + ldc1 $f24, 48($sp) + ldc1 $f25, 56($sp) + ldc1 $f26, 64($sp) + ldc1 $f27, 72($sp) + +#ifndef __64BIT__ + ldc1 $f20, 88($sp) + ldc1 $f21, 96($sp) + ldc1 $f22,104($sp) + ldc1 $f23,112($sp) +#endif + + j $31 + daddiu $sp, $sp, 128 + + EPILOGUE diff --git a/kernel/mips64/ztrsm_kernel_RT.S b/kernel/mips64/ztrsm_kernel_RT.S new file mode 100644 index 0000000..1fc2684 --- /dev/null +++ b/kernel/mips64/ztrsm_kernel_RT.S @@ -0,0 +1,1684 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define N $5 +#define K $6 +#define A $9 +#define B $10 +#define C $11 +#define LDC $8 + +#define AO $12 +#define BO $13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 + +#define OFFSET $18 +#define KK $19 +#define TEMP $20 +#define AORIG $21 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f26 +#define a4 $f27 + +#define b1 $f2 +#define b2 $f3 +#define b3 $f4 +#define b4 $f5 +#define b5 $f6 +#define b6 $f7 +#define b7 $f8 +#define b8 $f9 + +#define a5 b8 + +#define c11 $f10 +#define c12 $f11 +#define c21 $f12 +#define c22 $f13 +#define c31 $f14 +#define c32 $f15 +#define c41 $f16 +#define c42 $f17 +#define c51 $f18 +#define c52 $f19 +#define c61 $f20 +#define c62 $f21 +#define c71 $f22 +#define c72 $f23 +#define c81 $f24 +#define c82 $f25 + +#ifndef CONJ +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#define MADD5 MSUB +#define MADD6 MADD +#define MADD7 NMSUB +#define MADD8 MADD +#else +#if defined(LN) || defined(LT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#else +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif +#define MADD5 MADD +#define MADD6 MSUB +#define MADD7 MADD +#define MADD8 NMSUB +#endif + + PROLOGUE + + daddiu $sp, $sp, -128 + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + SDARG $18, 16($sp) + SDARG $19, 24($sp) + SDARG $20, 32($sp) + SDARG $21, 40($sp) + + sdc1 $f24, 48($sp) + sdc1 $f25, 56($sp) + sdc1 $f26, 64($sp) + sdc1 $f27, 72($sp) + +#ifndef __64BIT__ + sdc1 $f20, 88($sp) + sdc1 $f21, 96($sp) + sdc1 $f22,104($sp) + sdc1 $f23,112($sp) +#endif + + LDARG LDC, 128 + 0($sp) + LDARG OFFSET, 128 + 8($sp) + + dsll LDC, LDC, ZBASE_SHIFT + +#ifdef LN + mult M, K + mflo TEMP + + dsll TEMP, TEMP, ZBASE_SHIFT + daddu A, A, TEMP + + dsll TEMP, M, ZBASE_SHIFT + daddu C, C, TEMP +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mult N, K + mflo TEMP + + dsll TEMP, TEMP, ZBASE_SHIFT + daddu B, B, TEMP + + mult N, LDC + mflo TEMP + daddu C, C, TEMP + + dsubu KK, N, OFFSET +#endif + + andi J, N, 1 + blez J, .L20 + NOP + +#ifdef RT + dsll TEMP, K, ZBASE_SHIFT + dsubu B, B, TEMP + + dsubu C, C, LDC +#endif + + MTC $0, c11 + + move CO1, C + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO1, LDC +#endif + + move I, M + blez I, .L39 + NOP + .align 3 + +.L31: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MOV c21, c11 + LD b1, 0 * SIZE(B) + MOV c31, c11 + LD a2, 1 * SIZE(AO) + + MOV c41, c11 + LD b2, 1 * SIZE(B) + MOV c12, c11 + dsra L, KK, 2 + + MOV c22, c11 + LD a3, 4 * SIZE(AO) + MOV c32, c11 + LD b3, 4 * SIZE(B) + + NOP + MOV c42, c11 + blez L, .L35 + move BO, B +#else +#ifdef LN + dsll TEMP, K, ZBASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AORIG, TEMP + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MOV c21, c11 + LD b1, 0 * SIZE(BO) + MOV c31, c11 + LD a2, 1 * SIZE(AO) + + MOV c41, c11 + LD b2, 1 * SIZE(BO) + MOV c12, c11 + dsra L, TEMP, 2 + + MOV c22, c11 + LD a3, 4 * SIZE(AO) + MOV c32, c11 + LD b3, 4 * SIZE(BO) + + blez L, .L35 + MOV c42, c11 +#endif + .align 3 + +.L32: + MADD1 c11, c11, a1, b1 + LD b4, 3 * SIZE(BO) + MADD3 c21, c21, a1, b2 + LD a1, 2 * SIZE(AO) + MADD2 c12, c12, a2, b1 + LD b1, 2 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD a2, 3 * SIZE(AO) + + MADD1 c11, c11, a1, b1 + LD b2, 5 * SIZE(BO) + MADD3 c21, c21, a1, b4 + LD a1, 8 * SIZE(AO) + MADD2 c12, c12, a2, b1 + LD b1, 8 * SIZE(BO) + MADD4 c22, c22, a2, b4 + LD a2, 5 * SIZE(AO) + + MADD1 c11, c11, a3, b3 + LD b4, 7 * SIZE(BO) + MADD3 c21, c21, a3, b2 + LD a3, 6 * SIZE(AO) + MADD2 c12, c12, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD a2, 7 * SIZE(AO) + + MADD1 c11, c11, a3, b3 + LD b2, 9 * SIZE(BO) + MADD3 c21, c21, a3, b4 + LD a3, 12 * SIZE(AO) + MADD2 c12, c12, a2, b3 + LD b3, 12 * SIZE(BO) + MADD4 c22, c22, a2, b4 + LD a2, 9 * SIZE(AO) + + daddiu AO, AO, 8 * SIZE + daddiu L, L, -1 + + bgtz L, .L32 + daddiu BO, BO, 8 * SIZE + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + blez L, .L38 + NOP + .align 3 + +.L36: + MADD1 c11, c11, a1, b1 + daddiu L, L, -1 + MADD3 c21, c21, a1, b2 + LD a1, 2 * SIZE(AO) + MADD2 c12, c12, a2, b1 + LD b1, 2 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD a2, 3 * SIZE(AO) + + LD b2, 3 * SIZE(BO) + daddiu BO, BO, 2 * SIZE + bgtz L, .L36 + daddiu AO, AO, 2 * SIZE + +.L38: + ADD c11, c11, c22 + ADD c12, c12, c21 + +#if defined(LN) || defined(RT) + daddiu TEMP, KK, -1 + + dsll TEMP, TEMP, ZBASE_SHIFT + daddu AO, AORIG, TEMP + daddu BO, B, TEMP +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + + MUL a1, b2, c12 + MUL a2, b2, c11 + + MADD5 c11, a1, b1, c11 + MADD6 c12, a2, b1, c12 +#endif + +#if defined(RN) || defined(RT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + MUL a1, b2, c12 + MUL a2, b2, c11 + + MADD5 c11, a1, b1, c11 + MADD6 c12, a2, b1, c12 +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c12, 1 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) +#endif + +#ifdef LN + daddiu CO1,CO1, -2 * SIZE +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + +#ifndef LN + daddiu CO1,CO1, 2 * SIZE +#endif + + MTC $0, c11 + +#ifdef RT + dsll TEMP, K, ZBASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll TEMP, TEMP, ZBASE_SHIFT + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + + daddiu I, I, -1 + + bgtz I, .L31 + NOP + .align 3 + +.L39: +#ifdef LN + dsll TEMP, K, ZBASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 1 +#endif + +#ifdef RT + daddiu KK, KK, -1 +#endif + .align 3 + +.L20: + andi J, N, 2 + blez J, .L30 + NOP + +#ifdef RT + dsll TEMP, K, 1 + ZBASE_SHIFT + dsubu B, B, TEMP + + dsll TEMP, LDC, 1 + dsubu C, C, TEMP +#endif + + MTC $0, c11 + + move CO1, C + daddu CO2, C, LDC + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO2, LDC +#endif + + move I, M + blez I, .L29 + NOP + .align 3 + +.L21: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MOV c21, c11 + LD b1, 0 * SIZE(B) + MOV c31, c11 + LD a3, 4 * SIZE(AO) + MOV c41, c11 + LD b2, 1 * SIZE(B) + dsra L, KK, 2 + + LD b3, 2 * SIZE(B) + MOV c12, c11 + LD b4, 3 * SIZE(B) + MOV c22, c11 + LD b5, 4 * SIZE(B) + MOV c32, c11 + + NOP + MOV c42, c11 + blez L, .L25 + move BO, B +#else +#ifdef LN + dsll TEMP, K, ZBASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, ZBASE_SHIFT + dsll TEMP, KK, 1 + ZBASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MOV c21, c11 + LD b1, 0 * SIZE(BO) + MOV c31, c11 + LD a3, 4 * SIZE(AO) + MOV c41, c11 + LD b2, 1 * SIZE(BO) + dsra L, TEMP, 2 + + LD b3, 2 * SIZE(BO) + MOV c12, c11 + LD b4, 3 * SIZE(BO) + MOV c22, c11 + LD b5, 4 * SIZE(BO) + MOV c32, c11 + + blez L, .L25 + MOV c42, c11 +#endif + .align 3 + +.L22: + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + daddiu L, L, -1 + MADD1 c31, c31, a1, b3 + NOP + MADD3 c41, c41, a1, b4 + LD a1, 2 * SIZE(AO) + + MADD2 c12, c12, a2, b1 + LD b1, 8 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD1 c11, c11, a1, b5 + LD a2, 3 * SIZE(AO) + MADD3 c21, c21, a1, b2 + NOP + MADD1 c31, c31, a1, b3 + NOP + MADD3 c41, c41, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD2 c12, c12, a2, b5 + LD b5, 12 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 9 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 10 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD1 c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD3 c21, c21, a3, b2 + NOP + MADD1 c31, c31, a3, b3 + NOP + MADD3 c41, c41, a3, b4 + LD a3, 6 * SIZE(AO) + + MADD2 c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD1 c11, c11, a3, b5 + LD a2, 7 * SIZE(AO) + MADD3 c21, c21, a3, b2 + daddiu AO, AO, 8 * SIZE + MADD1 c31, c31, a3, b3 + NOP + MADD3 c41, c41, a3, b4 + LD a3, 4 * SIZE(AO) + + MADD2 c12, c12, a2, b5 + LD b5, 20 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 17 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 18 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 19 * SIZE(BO) + + bgtz L, .L22 + daddiu BO, BO, 16 * SIZE + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + blez L, .L28 + NOP + .align 3 + +.L26: + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + daddiu L, L, -1 + MADD1 c31, c31, a1, b3 + daddiu BO, BO, 4 * SIZE + MADD3 c41, c41, a1, b4 + LD a1, 2 * SIZE(AO) + + MADD2 c12, c12, a2, b1 + LD b1, 0 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 1 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 2 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 3 * SIZE(BO) + + bgtz L, .L26 + daddiu AO, AO, 2 * SIZE + +.L28: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 + +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -2 +#endif + + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + + MADD5 c11, a1, b1, c11 + MADD6 c12, a2, b1, c12 + MADD5 c31, a3, b1, c31 + MADD6 c32, a4, b1, c32 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MUL a1, b2, c12 + MUL a2, b2, c11 + + MADD5 c11, a1, b1, c11 + MADD6 c12, a2, b1, c12 + + NMSUB c31, c31, b3, c11 + MADD7 c32, c32, b4, c11 + + MADD8 c31, c31, b4, c12 + NMSUB c32, c32, b3, c12 + + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + MUL a1, b4, c32 + MUL a2, b4, c31 + + MADD5 c31, a1, b3, c31 + MADD6 c32, a2, b3, c32 +#endif + +#ifdef RT + LD b5, 6 * SIZE(BO) + LD b6, 7 * SIZE(BO) + LD b7, 4 * SIZE(BO) + LD b8, 5 * SIZE(BO) + + MUL a1, b6, c32 + MUL a2, b6, c31 + + MADD5 c31, a1, b5, c31 + MADD6 c32, a2, b5, c32 + + NMSUB c11, c11, b7, c31 + MADD7 c12, c12, b8, c31 + + MADD8 c11, c11, b8, c32 + NMSUB c12, c12, b7, c32 + + LD b7, 0 * SIZE(BO) + LD b8, 1 * SIZE(BO) + + MUL a1, b8, c12 + MUL a2, b8, c11 + + MADD5 c11, a1, b7, c11 + MADD6 c12, a2, b7, c12 +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c12, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c32, 3 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) + ST c31, 2 * SIZE(AO) + ST c32, 3 * SIZE(AO) +#endif + +#ifdef LN + daddiu CO1,CO1, -2 * SIZE + daddiu CO2,CO2, -2 * SIZE +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + ST c31, 0 * SIZE(CO2) + ST c32, 1 * SIZE(CO2) + +#ifndef LN + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE +#endif + + MTC $0, c11 + +#ifdef RT + dsll TEMP, K, ZBASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + + daddiu I, I, -1 + + bgtz I, .L21 + NOP + .align 3 + +.L29: +#ifdef LN + dsll TEMP, K, 1 + ZBASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 2 +#endif + +#ifdef RT + daddiu KK, KK, -2 +#endif + .align 3 + +.L30: + dsra J, N, 2 + blez J, .L999 + nop + +.L10: +#ifdef RT + dsll TEMP, K, 2 + ZBASE_SHIFT + dsubu B, B, TEMP + + dsll TEMP, LDC, 2 + dsubu C, C, TEMP +#endif + + move CO1, C + MTC $0, c11 + daddu CO2, C, LDC + daddu CO3, CO2, LDC + daddiu J, J, -1 + daddu CO4, CO3, LDC + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 + move I, M + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO4, LDC +#endif + + blez I, .L19 + MOV c61, c11 + .align 3 + +.L11: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(B) + MOV c81, c11 + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + + dsra L, KK, 2 + MOV c32, c11 + LD b3, 2 * SIZE(B) + MOV c42, c11 + + LD b4, 3 * SIZE(B) + MOV c52, c11 + LD b5, 4 * SIZE(B) + MOV c62, c11 + + LD b6, 8 * SIZE(B) + MOV c72, c11 + LD b7, 12 * SIZE(B) + MOV c82, c11 + + blez L, .L15 + move BO, B +#else +#ifdef LN + dsll TEMP, K, ZBASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, ZBASE_SHIFT + dsll TEMP, KK, 2 + ZBASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(BO) + MOV c81, c11 + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + + dsra L, TEMP, 2 + MOV c32, c11 + LD b3, 2 * SIZE(BO) + MOV c42, c11 + + LD b4, 3 * SIZE(BO) + MOV c52, c11 + LD b5, 4 * SIZE(BO) + MOV c62, c11 + + LD b6, 8 * SIZE(BO) + MOV c72, c11 + LD b7, 12 * SIZE(BO) + MOV c82, c11 + + blez L, .L15 + NOP +#endif + + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + daddiu L, L, -1 + MADD1 c31, c31, a1, b3 + NOP + blez L, .L13 + MADD3 c41, c41, a1, b4 + .align 3 + +.L12: + MADD2 c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD1 c51, c51, a1, b5 + NOP + MADD3 c61, c61, a1, b2 + LD a4, 2 * SIZE(AO) + MADD1 c71, c71, a1, b3 + NOP + MADD3 c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD1 c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD3 c21, c21, a4, b2 + NOP + MADD1 c31, c31, a4, b3 + NOP + MADD3 c41, c41, a4, b4 + NOP + + MADD2 c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD1 c51, c51, a4, b7 + NOP + MADD3 c61, c61, a4, b2 + NOP + MADD1 c71, c71, a4, b3 + NOP + MADD3 c81, c81, a4, b4 + NOP + + MADD2 c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD1 c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD3 c21, c21, a3, b2 + NOP + MADD1 c31, c31, a3, b3 + NOP + MADD3 c41, c41, a3, b4 + NOP + + MADD2 c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD1 c51, c51, a3, b5 + NOP + MADD3 c61, c61, a3, b2 + LD a4, 6 * SIZE(AO) + MADD1 c71, c71, a3, b3 + NOP + MADD3 c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD1 c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD3 c21, c21, a4, b2 + NOP + MADD1 c31, c31, a4, b3 + NOP + MADD3 c41, c41, a4, b4 + daddiu L, L, -1 + + MADD2 c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD1 c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD3 c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD1 c71, c71, a4, b3 + NOP + MADD3 c81, c81, a4, b4 + NOP + + MADD2 c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + NOP + MADD1 c31, c31, a1, b3 + NOP + bgtz L, .L12 + MADD3 c41, c41, a1, b4 + .align 3 + +.L13: + MADD2 c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD1 c51, c51, a1, b5 + NOP + MADD3 c61, c61, a1, b2 + LD a4, 2 * SIZE(AO) + MADD1 c71, c71, a1, b3 + NOP + MADD3 c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD1 c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD3 c21, c21, a4, b2 + NOP + MADD1 c31, c31, a4, b3 + NOP + MADD3 c41, c41, a4, b4 + NOP + + MADD2 c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD1 c51, c51, a4, b7 + NOP + MADD3 c61, c61, a4, b2 + NOP + MADD1 c71, c71, a4, b3 + NOP + MADD3 c81, c81, a4, b4 + NOP + + MADD2 c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD1 c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD3 c21, c21, a3, b2 + NOP + MADD1 c31, c31, a3, b3 + NOP + MADD3 c41, c41, a3, b4 + NOP + + MADD2 c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD1 c51, c51, a3, b5 + NOP + MADD3 c61, c61, a3, b2 + LD a4, 6 * SIZE(AO) + MADD1 c71, c71, a3, b3 + NOP + MADD3 c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD1 c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD3 c21, c21, a4, b2 + NOP + MADD1 c31, c31, a4, b3 + NOP + MADD3 c41, c41, a4, b4 + NOP + + MADD2 c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD1 c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD3 c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD1 c71, c71, a4, b3 + NOP + MADD3 c81, c81, a4, b4 + NOP + + MADD2 c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + blez L, .L18 + NOP + .align 3 + +.L16: + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + NOP + MADD1 c31, c31, a1, b3 + NOP + MADD3 c41, c41, a1, b4 + NOP + + MADD2 c12, c12, a2, b1 + LD b1, 8 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD1 c51, c51, a1, b5 + daddiu L, L, -1 + MADD3 c61, c61, a1, b2 + daddiu AO, AO, 2 * SIZE + MADD1 c71, c71, a1, b3 + daddiu BO, BO, 8 * SIZE + MADD3 c81, c81, a1, b4 + LD a1, 0 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 4 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD4 c82, c82, a2, b4 + bgtz L, .L16 + LD b4, 3 * SIZE(BO) + +.L18: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 + + ADD c51, c51, c62 + ADD c52, c52, c61 + ADD c71, c71, c82 + ADD c72, c72, c81 + +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -4 +#endif + + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 2 + ZBASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 + +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + LD b5, 4 * SIZE(AO) + LD b6, 5 * SIZE(AO) + LD b7, 6 * SIZE(AO) + LD b8, 7 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + + MADD5 c11, a1, b1, c11 + MADD6 c12, a2, b1, c12 + MADD5 c31, a3, b1, c31 + MADD6 c32, a4, b1, c32 + + MUL a1, b2, c52 + MUL a2, b2, c51 + MUL a3, b2, c72 + MUL a4, b2, c71 + + MADD5 c51, a1, b1, c51 + MADD6 c52, a2, b1, c52 + MADD5 c71, a3, b1, c71 + MADD6 c72, a4, b1, c72 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MUL a1, b2, c12 + MUL a2, b2, c11 + + MADD5 c11, a1, b1, c11 + MADD6 c12, a2, b1, c12 + + NMSUB c31, c31, b3, c11 + MADD7 c32, c32, b4, c11 + NMSUB c51, c51, b5, c11 + MADD7 c52, c52, b6, c11 + NMSUB c71, c71, b7, c11 + MADD7 c72, c72, b8, c11 + + MADD8 c31, c31, b4, c12 + NMSUB c32, c32, b3, c12 + MADD8 c51, c51, b6, c12 + NMSUB c52, c52, b5, c12 + MADD8 c71, c71, b8, c12 + NMSUB c72, c72, b7, c12 + + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MUL a1, b4, c32 + MUL a2, b4, c31 + + MADD5 c31, a1, b3, c31 + MADD6 c32, a2, b3, c32 + + NMSUB c51, c51, b5, c31 + MADD7 c52, c52, b6, c31 + NMSUB c71, c71, b7, c31 + MADD7 c72, c72, b8, c31 + + MADD8 c51, c51, b6, c32 + NMSUB c52, c52, b5, c32 + MADD8 c71, c71, b8, c32 + NMSUB c72, c72, b7, c32 + + LD b5, 20 * SIZE(BO) + LD b6, 21 * SIZE(BO) + LD b7, 22 * SIZE(BO) + LD b8, 23 * SIZE(BO) + + MUL a1, b6, c52 + MUL a2, b6, c51 + + MADD5 c51, a1, b5, c51 + MADD6 c52, a2, b5, c52 + + NMSUB c71, c71, b7, c51 + MADD7 c72, c72, b8, c51 + + MADD8 c71, c71, b8, c52 + NMSUB c72, c72, b7, c52 + + LD b7, 30 * SIZE(BO) + LD b8, 31 * SIZE(BO) + + MUL a1, b8, c72 + MUL a2, b8, c71 + + MADD5 c71, a1, b7, c71 + MADD6 c72, a2, b7, c72 +#endif + +#ifdef RT + LD b1, 30 * SIZE(BO) + LD b2, 31 * SIZE(BO) + LD b3, 28 * SIZE(BO) + LD b4, 29 * SIZE(BO) + LD b5, 26 * SIZE(BO) + LD b6, 27 * SIZE(BO) + LD b7, 24 * SIZE(BO) + LD b8, 25 * SIZE(BO) + + MUL a1, b2, c72 + MUL a2, b2, c71 + + MADD5 c71, a1, b1, c71 + MADD6 c72, a2, b1, c72 + + NMSUB c51, c51, b3, c71 + MADD7 c52, c52, b4, c71 + NMSUB c31, c31, b5, c71 + MADD7 c32, c32, b6, c71 + NMSUB c11, c11, b7, c71 + MADD7 c12, c12, b8, c71 + + MADD8 c51, c51, b4, c72 + NMSUB c52, c52, b3, c72 + MADD8 c31, c31, b6, c72 + NMSUB c32, c32, b5, c72 + MADD8 c11, c11, b8, c72 + NMSUB c12, c12, b7, c72 + + LD b3, 20 * SIZE(BO) + LD b4, 21 * SIZE(BO) + LD b5, 18 * SIZE(BO) + LD b6, 19 * SIZE(BO) + LD b7, 16 * SIZE(BO) + LD b8, 17 * SIZE(BO) + + MUL a1, b4, c52 + MUL a2, b4, c51 + + MADD5 c51, a1, b3, c51 + MADD6 c52, a2, b3, c52 + + NMSUB c31, c31, b5, c51 + MADD7 c32, c32, b6, c51 + NMSUB c11, c11, b7, c51 + MADD7 c12, c12, b8, c51 + + MADD8 c31, c31, b6, c52 + NMSUB c32, c32, b5, c52 + MADD8 c11, c11, b8, c52 + NMSUB c12, c12, b7, c52 + + LD b5, 10 * SIZE(BO) + LD b6, 11 * SIZE(BO) + LD b7, 8 * SIZE(BO) + LD b8, 9 * SIZE(BO) + + MUL a1, b6, c32 + MUL a2, b6, c31 + + MADD5 c31, a1, b5, c31 + MADD6 c32, a2, b5, c32 + + NMSUB c11, c11, b7, c31 + MADD7 c12, c12, b8, c31 + + MADD8 c11, c11, b8, c32 + NMSUB c12, c12, b7, c32 + + LD b7, 0 * SIZE(BO) + LD b8, 1 * SIZE(BO) + + MUL a1, b8, c12 + MUL a2, b8, c11 + + MADD5 c11, a1, b7, c11 + MADD6 c12, a2, b7, c12 +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c12, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c32, 3 * SIZE(BO) + ST c51, 4 * SIZE(BO) + ST c52, 5 * SIZE(BO) + ST c71, 6 * SIZE(BO) + ST c72, 7 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) + ST c31, 2 * SIZE(AO) + ST c32, 3 * SIZE(AO) + ST c51, 4 * SIZE(AO) + ST c52, 5 * SIZE(AO) + ST c71, 6 * SIZE(AO) + ST c72, 7 * SIZE(AO) +#endif + +#ifdef LN + daddiu CO1,CO1, -2 * SIZE + daddiu CO2,CO2, -2 * SIZE + daddiu CO3,CO3, -2 * SIZE + daddiu CO4,CO4, -2 * SIZE +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + ST c31, 0 * SIZE(CO2) + ST c32, 1 * SIZE(CO2) + ST c51, 0 * SIZE(CO3) + ST c52, 1 * SIZE(CO3) + ST c71, 0 * SIZE(CO4) + ST c72, 1 * SIZE(CO4) + +#ifndef LN + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + daddiu CO3,CO3, 2 * SIZE + daddiu CO4,CO4, 2 * SIZE +#endif + + +#ifdef RT + dsll TEMP, K, ZBASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 2 + ZBASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + + MTC $0, c11 + + daddiu I, I, -1 + + + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 + + bgtz I, .L11 + MOV c61, c11 + .align 3 + +.L19: +#ifdef LN + dsll TEMP, K, 2 + ZBASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 4 +#endif + +#ifdef RT + daddiu KK, KK, -4 +#endif + + bgtz J, .L10 + NOP + .align 3 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + LDARG $18, 16($sp) + LDARG $19, 24($sp) + LDARG $20, 32($sp) + LDARG $21, 40($sp) + + ldc1 $f24, 48($sp) + ldc1 $f25, 56($sp) + ldc1 $f26, 64($sp) + ldc1 $f27, 72($sp) + +#ifndef __64BIT__ + ldc1 $f20, 88($sp) + ldc1 $f21, 96($sp) + ldc1 $f22,104($sp) + ldc1 $f23,112($sp) +#endif + + j $31 + daddiu $sp, $sp, 128 + + EPILOGUE diff --git a/kernel/power/._KERNEL b/kernel/power/._KERNEL new file mode 100644 index 0000000..8d1cb17 Binary files /dev/null and b/kernel/power/._KERNEL differ diff --git a/kernel/power/._KERNEL.CELL b/kernel/power/._KERNEL.CELL new file mode 100644 index 0000000..f65e960 Binary files /dev/null and b/kernel/power/._KERNEL.CELL differ diff --git a/kernel/power/._KERNEL.POWER3 b/kernel/power/._KERNEL.POWER3 new file mode 100644 index 0000000..4f755ef Binary files /dev/null and b/kernel/power/._KERNEL.POWER3 differ diff --git a/kernel/power/._KERNEL.POWER4 b/kernel/power/._KERNEL.POWER4 new file mode 100644 index 0000000..39f9b11 Binary files /dev/null and b/kernel/power/._KERNEL.POWER4 differ diff --git a/kernel/power/._KERNEL.POWER5 b/kernel/power/._KERNEL.POWER5 new file mode 100644 index 0000000..9e879df Binary files /dev/null and b/kernel/power/._KERNEL.POWER5 differ diff --git a/kernel/power/._KERNEL.POWER6 b/kernel/power/._KERNEL.POWER6 new file mode 100644 index 0000000..2f6ef07 Binary files /dev/null and b/kernel/power/._KERNEL.POWER6 differ diff --git a/kernel/power/._KERNEL.PPC440 b/kernel/power/._KERNEL.PPC440 new file mode 100644 index 0000000..899a935 Binary files /dev/null and b/kernel/power/._KERNEL.PPC440 differ diff --git a/kernel/power/._KERNEL.PPC440FP2 b/kernel/power/._KERNEL.PPC440FP2 new file mode 100644 index 0000000..181ff3e Binary files /dev/null and b/kernel/power/._KERNEL.PPC440FP2 differ diff --git a/kernel/power/._KERNEL.PPC970 b/kernel/power/._KERNEL.PPC970 new file mode 100644 index 0000000..ba38343 Binary files /dev/null and b/kernel/power/._KERNEL.PPC970 differ diff --git a/kernel/power/._KERNEL.PPCG4 b/kernel/power/._KERNEL.PPCG4 new file mode 100644 index 0000000..899e1ed Binary files /dev/null and b/kernel/power/._KERNEL.PPCG4 differ diff --git a/kernel/power/._Makefile b/kernel/power/._Makefile new file mode 100644 index 0000000..3d94376 Binary files /dev/null and b/kernel/power/._Makefile differ diff --git a/kernel/power/._amax.S b/kernel/power/._amax.S new file mode 100644 index 0000000..eaac134 Binary files /dev/null and b/kernel/power/._amax.S differ diff --git a/kernel/power/._amax_cell.S b/kernel/power/._amax_cell.S new file mode 100644 index 0000000..d65c6fc Binary files /dev/null and b/kernel/power/._amax_cell.S differ diff --git a/kernel/power/._amax_hummer.S b/kernel/power/._amax_hummer.S new file mode 100644 index 0000000..134eb81 Binary files /dev/null and b/kernel/power/._amax_hummer.S differ diff --git a/kernel/power/._amax_ppc440.S b/kernel/power/._amax_ppc440.S new file mode 100644 index 0000000..1af0837 Binary files /dev/null and b/kernel/power/._amax_ppc440.S differ diff --git a/kernel/power/._amin.S b/kernel/power/._amin.S new file mode 100644 index 0000000..b590df6 Binary files /dev/null and b/kernel/power/._amin.S differ diff --git a/kernel/power/._amin_cell.S b/kernel/power/._amin_cell.S new file mode 100644 index 0000000..659b097 Binary files /dev/null and b/kernel/power/._amin_cell.S differ diff --git a/kernel/power/._amin_hummer.S b/kernel/power/._amin_hummer.S new file mode 100644 index 0000000..faafb93 Binary files /dev/null and b/kernel/power/._amin_hummer.S differ diff --git a/kernel/power/._amin_ppc440.S b/kernel/power/._amin_ppc440.S new file mode 100644 index 0000000..e3c16be Binary files /dev/null and b/kernel/power/._amin_ppc440.S differ diff --git a/kernel/power/._asum.S b/kernel/power/._asum.S new file mode 100644 index 0000000..d0badf3 Binary files /dev/null and b/kernel/power/._asum.S differ diff --git a/kernel/power/._asum_cell.S b/kernel/power/._asum_cell.S new file mode 100644 index 0000000..4e2bdcc Binary files /dev/null and b/kernel/power/._asum_cell.S differ diff --git a/kernel/power/._asum_hummer.S b/kernel/power/._asum_hummer.S new file mode 100644 index 0000000..c3977ed Binary files /dev/null and b/kernel/power/._asum_hummer.S differ diff --git a/kernel/power/._asum_ppc440.S b/kernel/power/._asum_ppc440.S new file mode 100644 index 0000000..44fb53a Binary files /dev/null and b/kernel/power/._asum_ppc440.S differ diff --git a/kernel/power/._axpy.S b/kernel/power/._axpy.S new file mode 100644 index 0000000..7e8dedc Binary files /dev/null and b/kernel/power/._axpy.S differ diff --git a/kernel/power/._axpy_hummer.S b/kernel/power/._axpy_hummer.S new file mode 100644 index 0000000..3311d4e Binary files /dev/null and b/kernel/power/._axpy_hummer.S differ diff --git a/kernel/power/._axpy_ppc440.S b/kernel/power/._axpy_ppc440.S new file mode 100644 index 0000000..cde9343 Binary files /dev/null and b/kernel/power/._axpy_ppc440.S differ diff --git a/kernel/power/._cabs.S b/kernel/power/._cabs.S new file mode 100644 index 0000000..c22432b Binary files /dev/null and b/kernel/power/._cabs.S differ diff --git a/kernel/power/._cnrm2.S b/kernel/power/._cnrm2.S new file mode 100644 index 0000000..8a7ad22 Binary files /dev/null and b/kernel/power/._cnrm2.S differ diff --git a/kernel/power/._cnrm2_hummer.S b/kernel/power/._cnrm2_hummer.S new file mode 100644 index 0000000..bd9765d Binary files /dev/null and b/kernel/power/._cnrm2_hummer.S differ diff --git a/kernel/power/._cnrm2_ppc440.S b/kernel/power/._cnrm2_ppc440.S new file mode 100644 index 0000000..931a87a Binary files /dev/null and b/kernel/power/._cnrm2_ppc440.S differ diff --git a/kernel/power/._copy.S b/kernel/power/._copy.S new file mode 100644 index 0000000..20fe0d6 Binary files /dev/null and b/kernel/power/._copy.S differ diff --git a/kernel/power/._copy_hummer.S b/kernel/power/._copy_hummer.S new file mode 100644 index 0000000..f8a149d Binary files /dev/null and b/kernel/power/._copy_hummer.S differ diff --git a/kernel/power/._dnrm2_hummer.S b/kernel/power/._dnrm2_hummer.S new file mode 100644 index 0000000..ef4db01 Binary files /dev/null and b/kernel/power/._dnrm2_hummer.S differ diff --git a/kernel/power/._dnrm2_ppc440.S b/kernel/power/._dnrm2_ppc440.S new file mode 100644 index 0000000..523d3a6 Binary files /dev/null and b/kernel/power/._dnrm2_ppc440.S differ diff --git a/kernel/power/._dot.S b/kernel/power/._dot.S new file mode 100644 index 0000000..ad360d6 Binary files /dev/null and b/kernel/power/._dot.S differ diff --git a/kernel/power/._dot_cell.S b/kernel/power/._dot_cell.S new file mode 100644 index 0000000..b9264b6 Binary files /dev/null and b/kernel/power/._dot_cell.S differ diff --git a/kernel/power/._dot_hummer.S b/kernel/power/._dot_hummer.S new file mode 100644 index 0000000..7a0b532 Binary files /dev/null and b/kernel/power/._dot_hummer.S differ diff --git a/kernel/power/._dot_ppc440.S b/kernel/power/._dot_ppc440.S new file mode 100644 index 0000000..ddbf616 Binary files /dev/null and b/kernel/power/._dot_ppc440.S differ diff --git a/kernel/power/._exfunc.S b/kernel/power/._exfunc.S new file mode 100644 index 0000000..e25c999 Binary files /dev/null and b/kernel/power/._exfunc.S differ diff --git a/kernel/power/._gemm_beta.S b/kernel/power/._gemm_beta.S new file mode 100644 index 0000000..5561bac Binary files /dev/null and b/kernel/power/._gemm_beta.S differ diff --git a/kernel/power/._gemm_kernel.S b/kernel/power/._gemm_kernel.S new file mode 100644 index 0000000..ca7d671 Binary files /dev/null and b/kernel/power/._gemm_kernel.S differ diff --git a/kernel/power/._gemm_kernel_altivec.S b/kernel/power/._gemm_kernel_altivec.S new file mode 100644 index 0000000..83ffe33 Binary files /dev/null and b/kernel/power/._gemm_kernel_altivec.S differ diff --git a/kernel/power/._gemm_kernel_altivec_cell.S b/kernel/power/._gemm_kernel_altivec_cell.S new file mode 100644 index 0000000..ebc9c97 Binary files /dev/null and b/kernel/power/._gemm_kernel_altivec_cell.S differ diff --git a/kernel/power/._gemm_kernel_altivec_g4.S b/kernel/power/._gemm_kernel_altivec_g4.S new file mode 100644 index 0000000..da63e1c Binary files /dev/null and b/kernel/power/._gemm_kernel_altivec_g4.S differ diff --git a/kernel/power/._gemm_kernel_cell.S b/kernel/power/._gemm_kernel_cell.S new file mode 100644 index 0000000..ffdc528 Binary files /dev/null and b/kernel/power/._gemm_kernel_cell.S differ diff --git a/kernel/power/._gemm_kernel_g4.S b/kernel/power/._gemm_kernel_g4.S new file mode 100644 index 0000000..589ef08 Binary files /dev/null and b/kernel/power/._gemm_kernel_g4.S differ diff --git a/kernel/power/._gemm_kernel_hummer.S b/kernel/power/._gemm_kernel_hummer.S new file mode 100644 index 0000000..e458d34 Binary files /dev/null and b/kernel/power/._gemm_kernel_hummer.S differ diff --git a/kernel/power/._gemm_kernel_power3.S b/kernel/power/._gemm_kernel_power3.S new file mode 100644 index 0000000..b3906f6 Binary files /dev/null and b/kernel/power/._gemm_kernel_power3.S differ diff --git a/kernel/power/._gemm_kernel_power6.S b/kernel/power/._gemm_kernel_power6.S new file mode 100644 index 0000000..f656598 Binary files /dev/null and b/kernel/power/._gemm_kernel_power6.S differ diff --git a/kernel/power/._gemm_kernel_ppc440.S b/kernel/power/._gemm_kernel_ppc440.S new file mode 100644 index 0000000..53cc3bc Binary files /dev/null and b/kernel/power/._gemm_kernel_ppc440.S differ diff --git a/kernel/power/._gemm_ncopy_4.S b/kernel/power/._gemm_ncopy_4.S new file mode 100644 index 0000000..45cce74 Binary files /dev/null and b/kernel/power/._gemm_ncopy_4.S differ diff --git a/kernel/power/._gemm_ncopy_hummer_4.S b/kernel/power/._gemm_ncopy_hummer_4.S new file mode 100644 index 0000000..e117a7f Binary files /dev/null and b/kernel/power/._gemm_ncopy_hummer_4.S differ diff --git a/kernel/power/._gemm_ncopy_hummer_8.S b/kernel/power/._gemm_ncopy_hummer_8.S new file mode 100644 index 0000000..77f2943 Binary files /dev/null and b/kernel/power/._gemm_ncopy_hummer_8.S differ diff --git a/kernel/power/._gemm_tcopy_4.S b/kernel/power/._gemm_tcopy_4.S new file mode 100644 index 0000000..98581c0 Binary files /dev/null and b/kernel/power/._gemm_tcopy_4.S differ diff --git a/kernel/power/._gemm_tcopy_hummer_4.S b/kernel/power/._gemm_tcopy_hummer_4.S new file mode 100644 index 0000000..62855d8 Binary files /dev/null and b/kernel/power/._gemm_tcopy_hummer_4.S differ diff --git a/kernel/power/._gemm_tcopy_hummer_8.S b/kernel/power/._gemm_tcopy_hummer_8.S new file mode 100644 index 0000000..96098c6 Binary files /dev/null and b/kernel/power/._gemm_tcopy_hummer_8.S differ diff --git a/kernel/power/._gemv_hummer_n.S b/kernel/power/._gemv_hummer_n.S new file mode 100644 index 0000000..0d7f3e8 Binary files /dev/null and b/kernel/power/._gemv_hummer_n.S differ diff --git a/kernel/power/._gemv_n.S b/kernel/power/._gemv_n.S new file mode 100644 index 0000000..6958119 Binary files /dev/null and b/kernel/power/._gemv_n.S differ diff --git a/kernel/power/._gemv_n_ppc440.S b/kernel/power/._gemv_n_ppc440.S new file mode 100644 index 0000000..36989b1 Binary files /dev/null and b/kernel/power/._gemv_n_ppc440.S differ diff --git a/kernel/power/._gemv_t.S b/kernel/power/._gemv_t.S new file mode 100644 index 0000000..7b8113d Binary files /dev/null and b/kernel/power/._gemv_t.S differ diff --git a/kernel/power/._gemv_t_ppc440.S b/kernel/power/._gemv_t_ppc440.S new file mode 100644 index 0000000..f2c17bd Binary files /dev/null and b/kernel/power/._gemv_t_ppc440.S differ diff --git a/kernel/power/._ger.S b/kernel/power/._ger.S new file mode 100644 index 0000000..2423289 Binary files /dev/null and b/kernel/power/._ger.S differ diff --git a/kernel/power/._iamax.S b/kernel/power/._iamax.S new file mode 100644 index 0000000..8e579cd Binary files /dev/null and b/kernel/power/._iamax.S differ diff --git a/kernel/power/._iamax_hummer.S b/kernel/power/._iamax_hummer.S new file mode 100644 index 0000000..b7bac8f Binary files /dev/null and b/kernel/power/._iamax_hummer.S differ diff --git a/kernel/power/._iamax_ppc440.S b/kernel/power/._iamax_ppc440.S new file mode 100644 index 0000000..b452ecb Binary files /dev/null and b/kernel/power/._iamax_ppc440.S differ diff --git a/kernel/power/._iamin.S b/kernel/power/._iamin.S new file mode 100644 index 0000000..5da86f6 Binary files /dev/null and b/kernel/power/._iamin.S differ diff --git a/kernel/power/._iamin_hummer.S b/kernel/power/._iamin_hummer.S new file mode 100644 index 0000000..b881d5c Binary files /dev/null and b/kernel/power/._iamin_hummer.S differ diff --git a/kernel/power/._iamin_ppc440.S b/kernel/power/._iamin_ppc440.S new file mode 100644 index 0000000..c8b7c14 Binary files /dev/null and b/kernel/power/._iamin_ppc440.S differ diff --git a/kernel/power/._imax.S b/kernel/power/._imax.S new file mode 100644 index 0000000..e5cecd1 Binary files /dev/null and b/kernel/power/._imax.S differ diff --git a/kernel/power/._imax_hummer.S b/kernel/power/._imax_hummer.S new file mode 100644 index 0000000..160cf11 Binary files /dev/null and b/kernel/power/._imax_hummer.S differ diff --git a/kernel/power/._imax_ppc440.S b/kernel/power/._imax_ppc440.S new file mode 100644 index 0000000..6bdc289 Binary files /dev/null and b/kernel/power/._imax_ppc440.S differ diff --git a/kernel/power/._imin.S b/kernel/power/._imin.S new file mode 100644 index 0000000..dbef6b8 Binary files /dev/null and b/kernel/power/._imin.S differ diff --git a/kernel/power/._imin_hummer.S b/kernel/power/._imin_hummer.S new file mode 100644 index 0000000..dc94982 Binary files /dev/null and b/kernel/power/._imin_hummer.S differ diff --git a/kernel/power/._imin_ppc440.S b/kernel/power/._imin_ppc440.S new file mode 100644 index 0000000..28844f2 Binary files /dev/null and b/kernel/power/._imin_ppc440.S differ diff --git a/kernel/power/._izamax.S b/kernel/power/._izamax.S new file mode 100644 index 0000000..5074112 Binary files /dev/null and b/kernel/power/._izamax.S differ diff --git a/kernel/power/._izamax_hummer.S b/kernel/power/._izamax_hummer.S new file mode 100644 index 0000000..80731b3 Binary files /dev/null and b/kernel/power/._izamax_hummer.S differ diff --git a/kernel/power/._izamax_ppc440.S b/kernel/power/._izamax_ppc440.S new file mode 100644 index 0000000..3b05bbe Binary files /dev/null and b/kernel/power/._izamax_ppc440.S differ diff --git a/kernel/power/._izamin.S b/kernel/power/._izamin.S new file mode 100644 index 0000000..8960257 Binary files /dev/null and b/kernel/power/._izamin.S differ diff --git a/kernel/power/._izamin_hummer.S b/kernel/power/._izamin_hummer.S new file mode 100644 index 0000000..1e3199a Binary files /dev/null and b/kernel/power/._izamin_hummer.S differ diff --git a/kernel/power/._izamin_ppc440.S b/kernel/power/._izamin_ppc440.S new file mode 100644 index 0000000..b8d7319 Binary files /dev/null and b/kernel/power/._izamin_ppc440.S differ diff --git a/kernel/power/._lock.c b/kernel/power/._lock.c new file mode 100644 index 0000000..8b258f2 Binary files /dev/null and b/kernel/power/._lock.c differ diff --git a/kernel/power/._lsame.S b/kernel/power/._lsame.S new file mode 100644 index 0000000..bd5f8bc Binary files /dev/null and b/kernel/power/._lsame.S differ diff --git a/kernel/power/._max.S b/kernel/power/._max.S new file mode 100644 index 0000000..62f9df3 Binary files /dev/null and b/kernel/power/._max.S differ diff --git a/kernel/power/._max_hummer.S b/kernel/power/._max_hummer.S new file mode 100644 index 0000000..1c32ec9 Binary files /dev/null and b/kernel/power/._max_hummer.S differ diff --git a/kernel/power/._max_ppc440.S b/kernel/power/._max_ppc440.S new file mode 100644 index 0000000..6b93baa Binary files /dev/null and b/kernel/power/._max_ppc440.S differ diff --git a/kernel/power/._min.S b/kernel/power/._min.S new file mode 100644 index 0000000..404710d Binary files /dev/null and b/kernel/power/._min.S differ diff --git a/kernel/power/._min_hummer.S b/kernel/power/._min_hummer.S new file mode 100644 index 0000000..6821471 Binary files /dev/null and b/kernel/power/._min_hummer.S differ diff --git a/kernel/power/._min_ppc440.S b/kernel/power/._min_ppc440.S new file mode 100644 index 0000000..d0fc49b Binary files /dev/null and b/kernel/power/._min_ppc440.S differ diff --git a/kernel/power/._nrm2.S b/kernel/power/._nrm2.S new file mode 100644 index 0000000..bb76ff9 Binary files /dev/null and b/kernel/power/._nrm2.S differ diff --git a/kernel/power/._rot.S b/kernel/power/._rot.S new file mode 100644 index 0000000..23689e3 Binary files /dev/null and b/kernel/power/._rot.S differ diff --git a/kernel/power/._rot_ppc440.S b/kernel/power/._rot_ppc440.S new file mode 100644 index 0000000..fb192e3 Binary files /dev/null and b/kernel/power/._rot_ppc440.S differ diff --git a/kernel/power/._scal.S b/kernel/power/._scal.S new file mode 100644 index 0000000..bc576c7 Binary files /dev/null and b/kernel/power/._scal.S differ diff --git a/kernel/power/._scal_hummer.S b/kernel/power/._scal_hummer.S new file mode 100644 index 0000000..73b7437 Binary files /dev/null and b/kernel/power/._scal_hummer.S differ diff --git a/kernel/power/._scal_ppc440.S b/kernel/power/._scal_ppc440.S new file mode 100644 index 0000000..fd7a1f1 Binary files /dev/null and b/kernel/power/._scal_ppc440.S differ diff --git a/kernel/power/._snrm2.S b/kernel/power/._snrm2.S new file mode 100644 index 0000000..1542855 Binary files /dev/null and b/kernel/power/._snrm2.S differ diff --git a/kernel/power/._snrm2_hummer.S b/kernel/power/._snrm2_hummer.S new file mode 100644 index 0000000..81e26d4 Binary files /dev/null and b/kernel/power/._snrm2_hummer.S differ diff --git a/kernel/power/._snrm2_ppc440.S b/kernel/power/._snrm2_ppc440.S new file mode 100644 index 0000000..328bf04 Binary files /dev/null and b/kernel/power/._snrm2_ppc440.S differ diff --git a/kernel/power/._staticbuffer.S b/kernel/power/._staticbuffer.S new file mode 100644 index 0000000..dda4e2a Binary files /dev/null and b/kernel/power/._staticbuffer.S differ diff --git a/kernel/power/._swap.S b/kernel/power/._swap.S new file mode 100644 index 0000000..a429a86 Binary files /dev/null and b/kernel/power/._swap.S differ diff --git a/kernel/power/._swap_hummer.S b/kernel/power/._swap_hummer.S new file mode 100644 index 0000000..f2b2b1b Binary files /dev/null and b/kernel/power/._swap_hummer.S differ diff --git a/kernel/power/._symv_L.S b/kernel/power/._symv_L.S new file mode 100644 index 0000000..274acb3 Binary files /dev/null and b/kernel/power/._symv_L.S differ diff --git a/kernel/power/._symv_U.S b/kernel/power/._symv_U.S new file mode 100644 index 0000000..b7e36b7 Binary files /dev/null and b/kernel/power/._symv_U.S differ diff --git a/kernel/power/._trsm_kernel_LN.S b/kernel/power/._trsm_kernel_LN.S new file mode 100644 index 0000000..db040b8 Binary files /dev/null and b/kernel/power/._trsm_kernel_LN.S differ diff --git a/kernel/power/._trsm_kernel_LT.S b/kernel/power/._trsm_kernel_LT.S new file mode 100644 index 0000000..8bc4003 Binary files /dev/null and b/kernel/power/._trsm_kernel_LT.S differ diff --git a/kernel/power/._trsm_kernel_RT.S b/kernel/power/._trsm_kernel_RT.S new file mode 100644 index 0000000..21d898a Binary files /dev/null and b/kernel/power/._trsm_kernel_RT.S differ diff --git a/kernel/power/._trsm_kernel_cell_LN.S b/kernel/power/._trsm_kernel_cell_LN.S new file mode 100644 index 0000000..a5d9699 Binary files /dev/null and b/kernel/power/._trsm_kernel_cell_LN.S differ diff --git a/kernel/power/._trsm_kernel_cell_LT.S b/kernel/power/._trsm_kernel_cell_LT.S new file mode 100644 index 0000000..e6d574c Binary files /dev/null and b/kernel/power/._trsm_kernel_cell_LT.S differ diff --git a/kernel/power/._trsm_kernel_cell_RT.S b/kernel/power/._trsm_kernel_cell_RT.S new file mode 100644 index 0000000..44d2e9e Binary files /dev/null and b/kernel/power/._trsm_kernel_cell_RT.S differ diff --git a/kernel/power/._trsm_kernel_hummer_LN.S b/kernel/power/._trsm_kernel_hummer_LN.S new file mode 100644 index 0000000..e8300ee Binary files /dev/null and b/kernel/power/._trsm_kernel_hummer_LN.S differ diff --git a/kernel/power/._trsm_kernel_hummer_LT.S b/kernel/power/._trsm_kernel_hummer_LT.S new file mode 100644 index 0000000..7f7bdcb Binary files /dev/null and b/kernel/power/._trsm_kernel_hummer_LT.S differ diff --git a/kernel/power/._trsm_kernel_hummer_RT.S b/kernel/power/._trsm_kernel_hummer_RT.S new file mode 100644 index 0000000..166fe34 Binary files /dev/null and b/kernel/power/._trsm_kernel_hummer_RT.S differ diff --git a/kernel/power/._trsm_kernel_power6_LN.S b/kernel/power/._trsm_kernel_power6_LN.S new file mode 100644 index 0000000..69d7bde Binary files /dev/null and b/kernel/power/._trsm_kernel_power6_LN.S differ diff --git a/kernel/power/._trsm_kernel_power6_LT.S b/kernel/power/._trsm_kernel_power6_LT.S new file mode 100644 index 0000000..cefa097 Binary files /dev/null and b/kernel/power/._trsm_kernel_power6_LT.S differ diff --git a/kernel/power/._trsm_kernel_power6_RT.S b/kernel/power/._trsm_kernel_power6_RT.S new file mode 100644 index 0000000..aa7a687 Binary files /dev/null and b/kernel/power/._trsm_kernel_power6_RT.S differ diff --git a/kernel/power/._trsm_kernel_ppc440_LN.S b/kernel/power/._trsm_kernel_ppc440_LN.S new file mode 100644 index 0000000..712af71 Binary files /dev/null and b/kernel/power/._trsm_kernel_ppc440_LN.S differ diff --git a/kernel/power/._trsm_kernel_ppc440_LT.S b/kernel/power/._trsm_kernel_ppc440_LT.S new file mode 100644 index 0000000..13b9e24 Binary files /dev/null and b/kernel/power/._trsm_kernel_ppc440_LT.S differ diff --git a/kernel/power/._trsm_kernel_ppc440_RT.S b/kernel/power/._trsm_kernel_ppc440_RT.S new file mode 100644 index 0000000..a5ab33c Binary files /dev/null and b/kernel/power/._trsm_kernel_ppc440_RT.S differ diff --git a/kernel/power/._zamax.S b/kernel/power/._zamax.S new file mode 100644 index 0000000..23c1eb1 Binary files /dev/null and b/kernel/power/._zamax.S differ diff --git a/kernel/power/._zamax_cell.S b/kernel/power/._zamax_cell.S new file mode 100644 index 0000000..d0ed79b Binary files /dev/null and b/kernel/power/._zamax_cell.S differ diff --git a/kernel/power/._zamax_hummer.S b/kernel/power/._zamax_hummer.S new file mode 100644 index 0000000..1fd0b97 Binary files /dev/null and b/kernel/power/._zamax_hummer.S differ diff --git a/kernel/power/._zamax_ppc440.S b/kernel/power/._zamax_ppc440.S new file mode 100644 index 0000000..3afa13d Binary files /dev/null and b/kernel/power/._zamax_ppc440.S differ diff --git a/kernel/power/._zamin.S b/kernel/power/._zamin.S new file mode 100644 index 0000000..571c4ba Binary files /dev/null and b/kernel/power/._zamin.S differ diff --git a/kernel/power/._zamin_cell.S b/kernel/power/._zamin_cell.S new file mode 100644 index 0000000..66b5ac7 Binary files /dev/null and b/kernel/power/._zamin_cell.S differ diff --git a/kernel/power/._zamin_hummer.S b/kernel/power/._zamin_hummer.S new file mode 100644 index 0000000..ba2a124 Binary files /dev/null and b/kernel/power/._zamin_hummer.S differ diff --git a/kernel/power/._zamin_ppc440.S b/kernel/power/._zamin_ppc440.S new file mode 100644 index 0000000..6e01d04 Binary files /dev/null and b/kernel/power/._zamin_ppc440.S differ diff --git a/kernel/power/._zasum.S b/kernel/power/._zasum.S new file mode 100644 index 0000000..6e16876 Binary files /dev/null and b/kernel/power/._zasum.S differ diff --git a/kernel/power/._zasum_cell.S b/kernel/power/._zasum_cell.S new file mode 100644 index 0000000..32e9b34 Binary files /dev/null and b/kernel/power/._zasum_cell.S differ diff --git a/kernel/power/._zasum_hummer.S b/kernel/power/._zasum_hummer.S new file mode 100644 index 0000000..322fcbe Binary files /dev/null and b/kernel/power/._zasum_hummer.S differ diff --git a/kernel/power/._zasum_ppc440.S b/kernel/power/._zasum_ppc440.S new file mode 100644 index 0000000..d7b5b6d Binary files /dev/null and b/kernel/power/._zasum_ppc440.S differ diff --git a/kernel/power/._zaxpy.S b/kernel/power/._zaxpy.S new file mode 100644 index 0000000..64951c6 Binary files /dev/null and b/kernel/power/._zaxpy.S differ diff --git a/kernel/power/._zaxpy_hummer.S b/kernel/power/._zaxpy_hummer.S new file mode 100644 index 0000000..279dc32 Binary files /dev/null and b/kernel/power/._zaxpy_hummer.S differ diff --git a/kernel/power/._zaxpy_ppc440.S b/kernel/power/._zaxpy_ppc440.S new file mode 100644 index 0000000..5f300ba Binary files /dev/null and b/kernel/power/._zaxpy_ppc440.S differ diff --git a/kernel/power/._zcopy.S b/kernel/power/._zcopy.S new file mode 100644 index 0000000..ed25ebd Binary files /dev/null and b/kernel/power/._zcopy.S differ diff --git a/kernel/power/._zcopy_hummer.S b/kernel/power/._zcopy_hummer.S new file mode 100644 index 0000000..7041af7 Binary files /dev/null and b/kernel/power/._zcopy_hummer.S differ diff --git a/kernel/power/._zdot.S b/kernel/power/._zdot.S new file mode 100644 index 0000000..fcd0d13 Binary files /dev/null and b/kernel/power/._zdot.S differ diff --git a/kernel/power/._zdot_cell.S b/kernel/power/._zdot_cell.S new file mode 100644 index 0000000..e68a452 Binary files /dev/null and b/kernel/power/._zdot_cell.S differ diff --git a/kernel/power/._zdot_hummer.S b/kernel/power/._zdot_hummer.S new file mode 100644 index 0000000..7e06e75 Binary files /dev/null and b/kernel/power/._zdot_hummer.S differ diff --git a/kernel/power/._zdot_ppc440.S b/kernel/power/._zdot_ppc440.S new file mode 100644 index 0000000..878c4b7 Binary files /dev/null and b/kernel/power/._zdot_ppc440.S differ diff --git a/kernel/power/._zgemm_beta.S b/kernel/power/._zgemm_beta.S new file mode 100644 index 0000000..c8ce799 Binary files /dev/null and b/kernel/power/._zgemm_beta.S differ diff --git a/kernel/power/._zgemm_kernel.S b/kernel/power/._zgemm_kernel.S new file mode 100644 index 0000000..3f56520 Binary files /dev/null and b/kernel/power/._zgemm_kernel.S differ diff --git a/kernel/power/._zgemm_kernel_altivec.S b/kernel/power/._zgemm_kernel_altivec.S new file mode 100644 index 0000000..ab0da10 Binary files /dev/null and b/kernel/power/._zgemm_kernel_altivec.S differ diff --git a/kernel/power/._zgemm_kernel_altivec_cell.S b/kernel/power/._zgemm_kernel_altivec_cell.S new file mode 100644 index 0000000..517b4d7 Binary files /dev/null and b/kernel/power/._zgemm_kernel_altivec_cell.S differ diff --git a/kernel/power/._zgemm_kernel_altivec_g4.S b/kernel/power/._zgemm_kernel_altivec_g4.S new file mode 100644 index 0000000..e6b8bad Binary files /dev/null and b/kernel/power/._zgemm_kernel_altivec_g4.S differ diff --git a/kernel/power/._zgemm_kernel_cell.S b/kernel/power/._zgemm_kernel_cell.S new file mode 100644 index 0000000..491c6db Binary files /dev/null and b/kernel/power/._zgemm_kernel_cell.S differ diff --git a/kernel/power/._zgemm_kernel_g4.S b/kernel/power/._zgemm_kernel_g4.S new file mode 100644 index 0000000..eb2565b Binary files /dev/null and b/kernel/power/._zgemm_kernel_g4.S differ diff --git a/kernel/power/._zgemm_kernel_hummer.S b/kernel/power/._zgemm_kernel_hummer.S new file mode 100644 index 0000000..0ff7fec Binary files /dev/null and b/kernel/power/._zgemm_kernel_hummer.S differ diff --git a/kernel/power/._zgemm_kernel_power3.S b/kernel/power/._zgemm_kernel_power3.S new file mode 100644 index 0000000..7393930 Binary files /dev/null and b/kernel/power/._zgemm_kernel_power3.S differ diff --git a/kernel/power/._zgemm_kernel_power6.S b/kernel/power/._zgemm_kernel_power6.S new file mode 100644 index 0000000..1cd1608 Binary files /dev/null and b/kernel/power/._zgemm_kernel_power6.S differ diff --git a/kernel/power/._zgemm_kernel_ppc440.S b/kernel/power/._zgemm_kernel_ppc440.S new file mode 100644 index 0000000..9e9546d Binary files /dev/null and b/kernel/power/._zgemm_kernel_ppc440.S differ diff --git a/kernel/power/._zgemm_ncopy_hummer_2.S b/kernel/power/._zgemm_ncopy_hummer_2.S new file mode 100644 index 0000000..5c43515 Binary files /dev/null and b/kernel/power/._zgemm_ncopy_hummer_2.S differ diff --git a/kernel/power/._zgemm_ncopy_hummer_4.S b/kernel/power/._zgemm_ncopy_hummer_4.S new file mode 100644 index 0000000..ff6f7bf Binary files /dev/null and b/kernel/power/._zgemm_ncopy_hummer_4.S differ diff --git a/kernel/power/._zgemm_tcopy_hummer_2.S b/kernel/power/._zgemm_tcopy_hummer_2.S new file mode 100644 index 0000000..63ee829 Binary files /dev/null and b/kernel/power/._zgemm_tcopy_hummer_2.S differ diff --git a/kernel/power/._zgemm_tcopy_hummer_4.S b/kernel/power/._zgemm_tcopy_hummer_4.S new file mode 100644 index 0000000..426a472 Binary files /dev/null and b/kernel/power/._zgemm_tcopy_hummer_4.S differ diff --git a/kernel/power/._zgemv_n.S b/kernel/power/._zgemv_n.S new file mode 100644 index 0000000..fb55517 Binary files /dev/null and b/kernel/power/._zgemv_n.S differ diff --git a/kernel/power/._zgemv_n_ppc440.S b/kernel/power/._zgemv_n_ppc440.S new file mode 100644 index 0000000..61fa5da Binary files /dev/null and b/kernel/power/._zgemv_n_ppc440.S differ diff --git a/kernel/power/._zgemv_t.S b/kernel/power/._zgemv_t.S new file mode 100644 index 0000000..3c655d9 Binary files /dev/null and b/kernel/power/._zgemv_t.S differ diff --git a/kernel/power/._zgemv_t_ppc440.S b/kernel/power/._zgemv_t_ppc440.S new file mode 100644 index 0000000..11d751f Binary files /dev/null and b/kernel/power/._zgemv_t_ppc440.S differ diff --git a/kernel/power/._zger.S b/kernel/power/._zger.S new file mode 100644 index 0000000..46eab16 Binary files /dev/null and b/kernel/power/._zger.S differ diff --git a/kernel/power/._znrm2.S b/kernel/power/._znrm2.S new file mode 100644 index 0000000..ada4060 Binary files /dev/null and b/kernel/power/._znrm2.S differ diff --git a/kernel/power/._znrm2_hummer.S b/kernel/power/._znrm2_hummer.S new file mode 100644 index 0000000..64c3534 Binary files /dev/null and b/kernel/power/._znrm2_hummer.S differ diff --git a/kernel/power/._znrm2_ppc440.S b/kernel/power/._znrm2_ppc440.S new file mode 100644 index 0000000..4d96a65 Binary files /dev/null and b/kernel/power/._znrm2_ppc440.S differ diff --git a/kernel/power/._zrot.S b/kernel/power/._zrot.S new file mode 100644 index 0000000..ccb262b Binary files /dev/null and b/kernel/power/._zrot.S differ diff --git a/kernel/power/._zrot_ppc440.S b/kernel/power/._zrot_ppc440.S new file mode 100644 index 0000000..4fc1999 Binary files /dev/null and b/kernel/power/._zrot_ppc440.S differ diff --git a/kernel/power/._zscal.S b/kernel/power/._zscal.S new file mode 100644 index 0000000..ca4fe2b Binary files /dev/null and b/kernel/power/._zscal.S differ diff --git a/kernel/power/._zscal_hummer.S b/kernel/power/._zscal_hummer.S new file mode 100644 index 0000000..a650213 Binary files /dev/null and b/kernel/power/._zscal_hummer.S differ diff --git a/kernel/power/._zscal_ppc440.S b/kernel/power/._zscal_ppc440.S new file mode 100644 index 0000000..503caea Binary files /dev/null and b/kernel/power/._zscal_ppc440.S differ diff --git a/kernel/power/._zswap.S b/kernel/power/._zswap.S new file mode 100644 index 0000000..3e0ad58 Binary files /dev/null and b/kernel/power/._zswap.S differ diff --git a/kernel/power/._zswap_hummer.S b/kernel/power/._zswap_hummer.S new file mode 100644 index 0000000..00cb0c8 Binary files /dev/null and b/kernel/power/._zswap_hummer.S differ diff --git a/kernel/power/._zsymv_L.S b/kernel/power/._zsymv_L.S new file mode 100644 index 0000000..7fe818a Binary files /dev/null and b/kernel/power/._zsymv_L.S differ diff --git a/kernel/power/._zsymv_U.S b/kernel/power/._zsymv_U.S new file mode 100644 index 0000000..bb33034 Binary files /dev/null and b/kernel/power/._zsymv_U.S differ diff --git a/kernel/power/._ztrsm_kernel_LN.S b/kernel/power/._ztrsm_kernel_LN.S new file mode 100644 index 0000000..fdc0378 Binary files /dev/null and b/kernel/power/._ztrsm_kernel_LN.S differ diff --git a/kernel/power/._ztrsm_kernel_LT.S b/kernel/power/._ztrsm_kernel_LT.S new file mode 100644 index 0000000..bb15780 Binary files /dev/null and b/kernel/power/._ztrsm_kernel_LT.S differ diff --git a/kernel/power/._ztrsm_kernel_RT.S b/kernel/power/._ztrsm_kernel_RT.S new file mode 100644 index 0000000..a08b2a9 Binary files /dev/null and b/kernel/power/._ztrsm_kernel_RT.S differ diff --git a/kernel/power/._ztrsm_kernel_cell_LN.S b/kernel/power/._ztrsm_kernel_cell_LN.S new file mode 100644 index 0000000..209fe15 Binary files /dev/null and b/kernel/power/._ztrsm_kernel_cell_LN.S differ diff --git a/kernel/power/._ztrsm_kernel_cell_LT.S b/kernel/power/._ztrsm_kernel_cell_LT.S new file mode 100644 index 0000000..0d39e13 Binary files /dev/null and b/kernel/power/._ztrsm_kernel_cell_LT.S differ diff --git a/kernel/power/._ztrsm_kernel_cell_RT.S b/kernel/power/._ztrsm_kernel_cell_RT.S new file mode 100644 index 0000000..d9ccdf3 Binary files /dev/null and b/kernel/power/._ztrsm_kernel_cell_RT.S differ diff --git a/kernel/power/._ztrsm_kernel_hummer_LN.S b/kernel/power/._ztrsm_kernel_hummer_LN.S new file mode 100644 index 0000000..8cc02a2 Binary files /dev/null and b/kernel/power/._ztrsm_kernel_hummer_LN.S differ diff --git a/kernel/power/._ztrsm_kernel_hummer_LT.S b/kernel/power/._ztrsm_kernel_hummer_LT.S new file mode 100644 index 0000000..3f3b409 Binary files /dev/null and b/kernel/power/._ztrsm_kernel_hummer_LT.S differ diff --git a/kernel/power/._ztrsm_kernel_hummer_RT.S b/kernel/power/._ztrsm_kernel_hummer_RT.S new file mode 100644 index 0000000..e2383ee Binary files /dev/null and b/kernel/power/._ztrsm_kernel_hummer_RT.S differ diff --git a/kernel/power/._ztrsm_kernel_power6_LN.S b/kernel/power/._ztrsm_kernel_power6_LN.S new file mode 100644 index 0000000..dda9766 Binary files /dev/null and b/kernel/power/._ztrsm_kernel_power6_LN.S differ diff --git a/kernel/power/._ztrsm_kernel_power6_LT.S b/kernel/power/._ztrsm_kernel_power6_LT.S new file mode 100644 index 0000000..4940d5e Binary files /dev/null and b/kernel/power/._ztrsm_kernel_power6_LT.S differ diff --git a/kernel/power/._ztrsm_kernel_power6_RT.S b/kernel/power/._ztrsm_kernel_power6_RT.S new file mode 100644 index 0000000..e0722fa Binary files /dev/null and b/kernel/power/._ztrsm_kernel_power6_RT.S differ diff --git a/kernel/power/._ztrsm_kernel_ppc440_LN.S b/kernel/power/._ztrsm_kernel_ppc440_LN.S new file mode 100644 index 0000000..f4c8ec1 Binary files /dev/null and b/kernel/power/._ztrsm_kernel_ppc440_LN.S differ diff --git a/kernel/power/._ztrsm_kernel_ppc440_LT.S b/kernel/power/._ztrsm_kernel_ppc440_LT.S new file mode 100644 index 0000000..2df5452 Binary files /dev/null and b/kernel/power/._ztrsm_kernel_ppc440_LT.S differ diff --git a/kernel/power/._ztrsm_kernel_ppc440_RT.S b/kernel/power/._ztrsm_kernel_ppc440_RT.S new file mode 100644 index 0000000..f7e6ae2 Binary files /dev/null and b/kernel/power/._ztrsm_kernel_ppc440_RT.S differ diff --git a/kernel/power/KERNEL b/kernel/power/KERNEL new file mode 100644 index 0000000..cb9ed84 --- /dev/null +++ b/kernel/power/KERNEL @@ -0,0 +1,86 @@ +SGEMM_BETA = gemm_beta.S +DGEMM_BETA = gemm_beta.S +CGEMM_BETA = zgemm_beta.S +ZGEMM_BETA = zgemm_beta.S + + +ifndef SSYMV_U_KERNEL +SSYMV_U_KERNEL = symv_U.S +endif + +ifndef SSYMV_L_KERNEL +SSYMV_L_KERNEL = symv_L.S +endif + +ifndef DSYMV_U_KERNEL +DSYMV_U_KERNEL = symv_U.S +endif + +ifndef DSYMV_L_KERNEL +DSYMV_L_KERNEL = symv_L.S +endif + +ifndef CSYMV_U_KERNEL +CSYMV_U_KERNEL = zsymv_U.S +endif + +ifndef CSYMV_L_KERNEL +CSYMV_L_KERNEL = zsymv_L.S +endif + +ifndef ZSYMV_U_KERNEL +ZSYMV_U_KERNEL = zsymv_U.S +endif + +ifndef ZSYMV_L_KERNEL +ZSYMV_L_KERNEL = zsymv_L.S +endif + +ifndef CHEMV_U_KERNEL +CHEMV_U_KERNEL = zsymv_U.S +endif + +ifndef CHEMV_L_KERNEL +CHEMV_L_KERNEL = zsymv_L.S +endif + +ifndef ZHEMV_U_KERNEL +ZHEMV_U_KERNEL = zsymv_U.S +endif + +ifndef ZHEMV_L_KERNEL +ZHEMV_L_KERNEL = zsymv_L.S +endif + +ifndef STRSMKERNEL_LN +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +endif + +ifndef STRSMKERNEL_LT +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +endif + +ifndef STRSMKERNEL_RN +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +endif + +ifndef STRSMKERNEL_RT +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifndef CTRSMKERNEL_LN +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +endif + +ifndef CTRSMKERNEL_LT +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +endif + +ifndef CTRSMKERNEL_RN +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +endif + +ifndef CTRSMKERNEL_RT +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + diff --git a/kernel/power/KERNEL.CELL b/kernel/power/KERNEL.CELL new file mode 100644 index 0000000..745e16e --- /dev/null +++ b/kernel/power/KERNEL.CELL @@ -0,0 +1,76 @@ +SAMAXKERNEL = amax_cell.S +DAMAXKERNEL = amax_cell.S +CAMAXKERNEL = zamax_cell.S +ZAMAXKERNEL = zamax_cell.S + +SAMINKERNEL = amin_cell.S +DAMINKERNEL = amin_cell.S +CAMINKERNEL = zamin_cell.S +ZAMINKERNEL = zamin_cell.S + +SASUMKERNEL = asum_cell.S +DASUMKERNEL = asum_cell.S +CASUMKERNEL = zasum_cell.S +ZASUMKERNEL = zasum_cell.S + +SDOTKERNEL = dot_cell.S +DDOTKERNEL = dot_cell.S +CDOTKERNEL = zdot_cell.S +ZDOTKERNEL = zdot_cell.S + +SGEMMKERNEL = gemm_kernel_altivec_cell.S +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = ../generic/gemm_tcopy_16.c +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_cell.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4.S +DGEMMOTCOPY = gemm_tcopy_4.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_altivec_cell.S +CGEMMINCOPY = ../generic/zgemm_ncopy_8.c +CGEMMITCOPY = ../generic/zgemm_tcopy_8.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_cell.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +#STRSMKERNEL_LN = trsm_kernel_LN.S +#STRSMKERNEL_LT = trsm_kernel_LT.S +#STRSMKERNEL_RN = trsm_kernel_LT.S +#STRSMKERNEL_RT = trsm_kernel_RT.S + +DTRSMKERNEL_LN = trsm_kernel_cell_LN.S +DTRSMKERNEL_LT = trsm_kernel_cell_LT.S +DTRSMKERNEL_RN = trsm_kernel_cell_LT.S +DTRSMKERNEL_RT = trsm_kernel_cell_RT.S + +#CTRSMKERNEL_LN = ztrsm_kernel_LN.S +#CTRSMKERNEL_LT = ztrsm_kernel_LT.S +#CTRSMKERNEL_RN = ztrsm_kernel_LT.S +#CTRSMKERNEL_RT = ztrsm_kernel_RT.S + +ZTRSMKERNEL_LN = ztrsm_kernel_cell_LN.S +ZTRSMKERNEL_LT = ztrsm_kernel_cell_LT.S +ZTRSMKERNEL_RN = ztrsm_kernel_cell_LT.S +ZTRSMKERNEL_RT = ztrsm_kernel_cell_RT.S diff --git a/kernel/power/KERNEL.POWER3 b/kernel/power/KERNEL.POWER3 new file mode 100644 index 0000000..188eab8 --- /dev/null +++ b/kernel/power/KERNEL.POWER3 @@ -0,0 +1,2 @@ +include $(KERNELDIR)/KERNEL.POWER5 + diff --git a/kernel/power/KERNEL.POWER4 b/kernel/power/KERNEL.POWER4 new file mode 100644 index 0000000..932dbe5 --- /dev/null +++ b/kernel/power/KERNEL.POWER4 @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.POWER5 diff --git a/kernel/power/KERNEL.POWER5 b/kernel/power/KERNEL.POWER5 new file mode 100644 index 0000000..af0960d --- /dev/null +++ b/kernel/power/KERNEL.POWER5 @@ -0,0 +1,56 @@ +SGEMMKERNEL = gemm_kernel.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN.S +STRSMKERNEL_LT = trsm_kernel_LT.S +STRSMKERNEL_RN = trsm_kernel_LT.S +STRSMKERNEL_RT = trsm_kernel_RT.S + +DTRSMKERNEL_LN = trsm_kernel_LN.S +DTRSMKERNEL_LT = trsm_kernel_LT.S +DTRSMKERNEL_RN = trsm_kernel_LT.S +DTRSMKERNEL_RT = trsm_kernel_RT.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN.S +CTRSMKERNEL_LT = ztrsm_kernel_LT.S +CTRSMKERNEL_RN = ztrsm_kernel_LT.S +CTRSMKERNEL_RT = ztrsm_kernel_RT.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT.S diff --git a/kernel/power/KERNEL.POWER6 b/kernel/power/KERNEL.POWER6 new file mode 100644 index 0000000..ef5f744 --- /dev/null +++ b/kernel/power/KERNEL.POWER6 @@ -0,0 +1,56 @@ +SGEMMKERNEL = gemm_kernel_power6.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = gemm_ncopy_4.S +SGEMMOTCOPY = gemm_tcopy_4.S +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_power6.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4.S +DGEMMOTCOPY = gemm_tcopy_4.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_power6.S +CGEMMINCOPY = ../generic/zgemm_ncopy_2.c +CGEMMITCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_power6.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_power6_LN.S +STRSMKERNEL_LT = trsm_kernel_power6_LT.S +STRSMKERNEL_RN = trsm_kernel_power6_LT.S +STRSMKERNEL_RT = trsm_kernel_power6_RT.S + +DTRSMKERNEL_LN = trsm_kernel_power6_LN.S +DTRSMKERNEL_LT = trsm_kernel_power6_LT.S +DTRSMKERNEL_RN = trsm_kernel_power6_LT.S +DTRSMKERNEL_RT = trsm_kernel_power6_RT.S + +CTRSMKERNEL_LN = ztrsm_kernel_power6_LN.S +CTRSMKERNEL_LT = ztrsm_kernel_power6_LT.S +CTRSMKERNEL_RN = ztrsm_kernel_power6_LT.S +CTRSMKERNEL_RT = ztrsm_kernel_power6_RT.S + +ZTRSMKERNEL_LN = ztrsm_kernel_power6_LN.S +ZTRSMKERNEL_LT = ztrsm_kernel_power6_LT.S +ZTRSMKERNEL_RN = ztrsm_kernel_power6_LT.S +ZTRSMKERNEL_RT = ztrsm_kernel_power6_RT.S diff --git a/kernel/power/KERNEL.PPC440 b/kernel/power/KERNEL.PPC440 new file mode 100644 index 0000000..5e2a7f9 --- /dev/null +++ b/kernel/power/KERNEL.PPC440 @@ -0,0 +1,118 @@ +SAMAXKERNEL = amax_ppc440.S +DAMAXKERNEL = amax_ppc440.S +CAMAXKERNEL = zamax_ppc440.S +ZAMAXKERNEL = zamax_ppc440.S + +SAMINKERNEL = amin_ppc440.S +DAMINKERNEL = amin_ppc440.S +CAMINKERNEL = zamin_ppc440.S +ZAMINKERNEL = zamin_ppc440.S + +SASUMKERNEL = asum_ppc440.S +DASUMKERNEL = asum_ppc440.S +CASUMKERNEL = zasum_ppc440.S +ZASUMKERNEL = zasum_ppc440.S + +SAXPYKERNEL = axpy_ppc440.S +DAXPYKERNEL = axpy_ppc440.S +CAXPYKERNEL = zaxpy_ppc440.S +ZAXPYKERNEL = zaxpy_ppc440.S + +SDOTKERNEL = dot_ppc440.S +DDOTKERNEL = dot_ppc440.S +CDOTKERNEL = zdot_ppc440.S +ZDOTKERNEL = zdot_ppc440.S + +ISAMAXKERNEL = iamax_ppc440.S +IDAMAXKERNEL = iamax_ppc440.S +ICAMAXKERNEL = izamax_ppc440.S +IZAMAXKERNEL = izamax_ppc440.S + +ISAMINKERNEL = iamin_ppc440.S +IDAMINKERNEL = iamin_ppc440.S +ICAMINKERNEL = izamin_ppc440.S +IZAMINKERNEL = izamin_ppc440.S + +ISMAXKERNEL = imax_ppc440.S +IDMAXKERNEL = imax_ppc440.S + +ISMINKERNEL = imin_ppc440.S +IDMINKERNEL = imin_ppc440.S + +SMAXKERNEL = max_ppc440.S +DMAXKERNEL = max_ppc440.S + +SMINKERNEL = min_ppc440.S +DMINKERNEL = min_ppc440.S + +SNRM2KERNEL = snrm2_ppc440.S +DNRM2KERNEL = dnrm2_ppc440.S +CNRM2KERNEL = cnrm2_ppc440.S +ZNRM2KERNEL = znrm2_ppc440.S + +SROTKERNEL = rot_ppc440.S +DROTKERNEL = rot_ppc440.S +CROTKERNEL = zrot_ppc440.S +ZROTKERNEL = zrot_ppc440.S + +SSCALKERNEL = scal_ppc440.S +DSCALKERNEL = scal_ppc440.S +CSCALKERNEL = zscal_ppc440.S +ZSCALKERNEL = zscal_ppc440.S + +SGEMMKERNEL = gemm_kernel_ppc440.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_ppc440.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_ppc440.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_ppc440.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_ppc440_LN.S +STRSMKERNEL_LT = trsm_kernel_ppc440_LT.S +STRSMKERNEL_RN = trsm_kernel_ppc440_LT.S +STRSMKERNEL_RT = trsm_kernel_ppc440_RT.S + +DTRSMKERNEL_LN = trsm_kernel_ppc440_LN.S +DTRSMKERNEL_LT = trsm_kernel_ppc440_LT.S +DTRSMKERNEL_RN = trsm_kernel_ppc440_LT.S +DTRSMKERNEL_RT = trsm_kernel_ppc440_RT.S + +CTRSMKERNEL_LN = ztrsm_kernel_ppc440_LN.S +CTRSMKERNEL_LT = ztrsm_kernel_ppc440_LT.S +CTRSMKERNEL_RN = ztrsm_kernel_ppc440_LT.S +CTRSMKERNEL_RT = ztrsm_kernel_ppc440_RT.S + +ZTRSMKERNEL_LN = ztrsm_kernel_ppc440_LN.S +ZTRSMKERNEL_LT = ztrsm_kernel_ppc440_LT.S +ZTRSMKERNEL_RN = ztrsm_kernel_ppc440_LT.S +ZTRSMKERNEL_RT = ztrsm_kernel_ppc440_RT.S diff --git a/kernel/power/KERNEL.PPC440FP2 b/kernel/power/KERNEL.PPC440FP2 new file mode 100644 index 0000000..3359385 --- /dev/null +++ b/kernel/power/KERNEL.PPC440FP2 @@ -0,0 +1,128 @@ +SAMAXKERNEL = amax_hummer.S +DAMAXKERNEL = amax_hummer.S +CAMAXKERNEL = zamax_hummer.S +ZAMAXKERNEL = zamax_hummer.S + +SAMINKERNEL = amin_hummer.S +DAMINKERNEL = amin_hummer.S +CAMINKERNEL = zamin_hummer.S +ZAMINKERNEL = zamin_hummer.S + +SASUMKERNEL = asum_hummer.S +DASUMKERNEL = asum_hummer.S +CASUMKERNEL = zasum_hummer.S +ZASUMKERNEL = zasum_hummer.S + +SAXPYKERNEL = axpy_hummer.S +DAXPYKERNEL = axpy_hummer.S +CAXPYKERNEL = zaxpy_hummer.S +ZAXPYKERNEL = zaxpy_hummer.S + +SCOPYKERNEL = copy_hummer.S +DCOPYKERNEL = copy_hummer.S +CCOPYKERNEL = zcopy_hummer.S +ZCOPYKERNEL = zcopy_hummer.S + +SDOTKERNEL = dot_hummer.S +DDOTKERNEL = dot_hummer.S +CDOTKERNEL = zdot_hummer.S +ZDOTKERNEL = zdot_hummer.S + +ISAMAXKERNEL = iamax_hummer.S +IDAMAXKERNEL = iamax_hummer.S +ICAMAXKERNEL = izamax_hummer.S +IZAMAXKERNEL = izamax_hummer.S + +ISAMINKERNEL = iamin_hummer.S +IDAMINKERNEL = iamin_hummer.S +ICAMINKERNEL = izamin_hummer.S +IZAMINKERNEL = izamin_hummer.S + +ISMAXKERNEL = imax_hummer.S +IDMAXKERNEL = imax_hummer.S + +ISMINKERNEL = imin_hummer.S +IDMINKERNEL = imin_hummer.S + +SMAXKERNEL = max_hummer.S +DMAXKERNEL = max_hummer.S + +SMINKERNEL = min_hummer.S +DMINKERNEL = min_hummer.S + +SNRM2KERNEL = snrm2_hummer.S +DNRM2KERNEL = dnrm2_hummer.S +CNRM2KERNEL = cnrm2_hummer.S +ZNRM2KERNEL = znrm2_hummer.S + +SROTKERNEL = rot_ppc440.S +DROTKERNEL = rot_ppc440.S +CROTKERNEL = zrot_ppc440.S +ZROTKERNEL = zrot_ppc440.S + +SSCALKERNEL = scal_hummer.S +DSCALKERNEL = scal_hummer.S +CSCALKERNEL = zscal_hummer.S +ZSCALKERNEL = zscal_hummer.S + +SSWAPKERNEL = swap_hummer.S +DSWAPKERNEL = swap_hummer.S +CSWAPKERNEL = zswap_hummer.S +ZSWAPKERNEL = zswap_hummer.S + +SGEMMKERNEL = gemm_kernel_hummer.S +SGEMMINCOPY = gemm_ncopy_hummer_8.S +SGEMMITCOPY = gemm_tcopy_hummer_8.S +SGEMMONCOPY = gemm_ncopy_hummer_4.S +SGEMMOTCOPY = gemm_tcopy_hummer_4.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_hummer.S +DGEMMINCOPY = gemm_ncopy_hummer_8.S +DGEMMITCOPY = gemm_tcopy_hummer_8.S +DGEMMONCOPY = gemm_ncopy_hummer_4.S +DGEMMOTCOPY = gemm_tcopy_hummer_4.S +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_hummer.S +CGEMMINCOPY = zgemm_ncopy_hummer_4.S +CGEMMITCOPY = zgemm_tcopy_hummer_4.S +CGEMMONCOPY = zgemm_ncopy_hummer_2.S +CGEMMOTCOPY = zgemm_tcopy_hummer_2.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_hummer.S +ZGEMMINCOPY = zgemm_ncopy_hummer_4.S +ZGEMMITCOPY = zgemm_tcopy_hummer_4.S +ZGEMMONCOPY = zgemm_ncopy_hummer_2.S +ZGEMMOTCOPY = zgemm_tcopy_hummer_2.S +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_hummer_LN.S +STRSMKERNEL_LT = trsm_kernel_hummer_LT.S +STRSMKERNEL_RN = trsm_kernel_hummer_LT.S +STRSMKERNEL_RT = trsm_kernel_hummer_RT.S + +DTRSMKERNEL_LN = trsm_kernel_hummer_LN.S +DTRSMKERNEL_LT = trsm_kernel_hummer_LT.S +DTRSMKERNEL_RN = trsm_kernel_hummer_LT.S +DTRSMKERNEL_RT = trsm_kernel_hummer_RT.S + +CTRSMKERNEL_LN = ztrsm_kernel_hummer_LN.S +CTRSMKERNEL_LT = ztrsm_kernel_hummer_LT.S +CTRSMKERNEL_RN = ztrsm_kernel_hummer_LT.S +CTRSMKERNEL_RT = ztrsm_kernel_hummer_RT.S + +ZTRSMKERNEL_LN = ztrsm_kernel_hummer_LN.S +ZTRSMKERNEL_LT = ztrsm_kernel_hummer_LT.S +ZTRSMKERNEL_RN = ztrsm_kernel_hummer_LT.S +ZTRSMKERNEL_RT = ztrsm_kernel_hummer_RT.S diff --git a/kernel/power/KERNEL.PPC970 b/kernel/power/KERNEL.PPC970 new file mode 100644 index 0000000..bfa43b7 --- /dev/null +++ b/kernel/power/KERNEL.PPC970 @@ -0,0 +1,56 @@ +SGEMMKERNEL = gemm_kernel_altivec.S +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = ../generic/gemm_tcopy_16.c +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4.S +DGEMMOTCOPY = gemm_tcopy_4.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_altivec.S +CGEMMINCOPY = ../generic/zgemm_ncopy_8.c +CGEMMITCOPY = ../generic/zgemm_tcopy_8.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +#STRSMKERNEL_LN = trsm_kernel_LN.S +#STRSMKERNEL_LT = trsm_kernel_LT.S +#STRSMKERNEL_RN = trsm_kernel_LT.S +#STRSMKERNEL_RT = trsm_kernel_RT.S + +DTRSMKERNEL_LN = trsm_kernel_LN.S +DTRSMKERNEL_LT = trsm_kernel_LT.S +DTRSMKERNEL_RN = trsm_kernel_LT.S +DTRSMKERNEL_RT = trsm_kernel_RT.S + +#CTRSMKERNEL_LN = ztrsm_kernel_LN.S +#CTRSMKERNEL_LT = ztrsm_kernel_LT.S +#CTRSMKERNEL_RN = ztrsm_kernel_LT.S +#CTRSMKERNEL_RT = ztrsm_kernel_RT.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT.S diff --git a/kernel/power/KERNEL.PPCG4 b/kernel/power/KERNEL.PPCG4 new file mode 100644 index 0000000..c41df97 --- /dev/null +++ b/kernel/power/KERNEL.PPCG4 @@ -0,0 +1,118 @@ +SAMAXKERNEL = amax_ppc440.S +DAMAXKERNEL = amax_ppc440.S +CAMAXKERNEL = zamax_ppc440.S +ZAMAXKERNEL = zamax_ppc440.S + +SAMINKERNEL = amin_ppc440.S +DAMINKERNEL = amin_ppc440.S +CAMINKERNEL = zamin_ppc440.S +ZAMINKERNEL = zamin_ppc440.S + +SASUMKERNEL = asum_ppc440.S +DASUMKERNEL = asum_ppc440.S +CASUMKERNEL = zasum_ppc440.S +ZASUMKERNEL = zasum_ppc440.S + +SAXPYKERNEL = axpy_ppc440.S +DAXPYKERNEL = axpy_ppc440.S +CAXPYKERNEL = zaxpy_ppc440.S +ZAXPYKERNEL = zaxpy_ppc440.S + +SDOTKERNEL = dot_ppc440.S +DDOTKERNEL = dot_ppc440.S +CDOTKERNEL = zdot_ppc440.S +ZDOTKERNEL = zdot_ppc440.S + +ISAMAXKERNEL = iamax_ppc440.S +IDAMAXKERNEL = iamax_ppc440.S +ICAMAXKERNEL = izamax_ppc440.S +IZAMAXKERNEL = izamax_ppc440.S + +ISAMINKERNEL = iamin_ppc440.S +IDAMINKERNEL = iamin_ppc440.S +ICAMINKERNEL = izamin_ppc440.S +IZAMINKERNEL = izamin_ppc440.S + +ISMAXKERNEL = imax_ppc440.S +IDMAXKERNEL = imax_ppc440.S + +ISMINKERNEL = imin_ppc440.S +IDMINKERNEL = imin_ppc440.S + +SMAXKERNEL = max_ppc440.S +DMAXKERNEL = max_ppc440.S + +SMINKERNEL = min_ppc440.S +DMINKERNEL = min_ppc440.S + +SNRM2KERNEL = snrm2_ppc440.S +DNRM2KERNEL = dnrm2_ppc440.S +CNRM2KERNEL = cnrm2_ppc440.S +ZNRM2KERNEL = znrm2_ppc440.S + +SROTKERNEL = rot_ppc440.S +DROTKERNEL = rot_ppc440.S +CROTKERNEL = zrot_ppc440.S +ZROTKERNEL = zrot_ppc440.S + +SSCALKERNEL = scal_ppc440.S +DSCALKERNEL = scal_ppc440.S +CSCALKERNEL = zscal_ppc440.S +ZSCALKERNEL = zscal_ppc440.S + +SGEMMKERNEL = gemm_kernel_altivec_g4.S +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = ../generic/gemm_tcopy_16.c +SGEMMONCOPY = gemm_ncopy_4.S +SGEMMOTCOPY = gemm_tcopy_4.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_g4.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4.S +DGEMMOTCOPY = gemm_tcopy_4.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_altivec_g4.S +CGEMMINCOPY = ../generic/zgemm_ncopy_8.c +CGEMMITCOPY = ../generic/zgemm_tcopy_8.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_g4.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +#STRSMKERNEL_LN = trsm_kernel_ppc440_LN.S +#STRSMKERNEL_LT = trsm_kernel_ppc440_LT.S +#STRSMKERNEL_RN = trsm_kernel_ppc440_LT.S +#STRSMKERNEL_RT = trsm_kernel_ppc440_RT.S + +DTRSMKERNEL_LN = trsm_kernel_ppc440_LN.S +DTRSMKERNEL_LT = trsm_kernel_ppc440_LT.S +DTRSMKERNEL_RN = trsm_kernel_ppc440_LT.S +DTRSMKERNEL_RT = trsm_kernel_ppc440_RT.S + +#CTRSMKERNEL_LN = ztrsm_kernel_ppc440_LN.S +#CTRSMKERNEL_LT = ztrsm_kernel_ppc440_LT.S +#CTRSMKERNEL_RN = ztrsm_kernel_ppc440_LT.S +#CTRSMKERNEL_RT = ztrsm_kernel_ppc440_RT.S + +ZTRSMKERNEL_LN = ztrsm_kernel_ppc440_LN.S +ZTRSMKERNEL_LT = ztrsm_kernel_ppc440_LT.S +ZTRSMKERNEL_RN = ztrsm_kernel_ppc440_LT.S +ZTRSMKERNEL_RT = ztrsm_kernel_ppc440_RT.S diff --git a/kernel/power/Makefile b/kernel/power/Makefile new file mode 100644 index 0000000..520349b --- /dev/null +++ b/kernel/power/Makefile @@ -0,0 +1 @@ +clean :: diff --git a/kernel/power/amax.S b/kernel/power/amax.S new file mode 100644 index 0000000..7fbe39e --- /dev/null +++ b/kernel/power/amax.S @@ -0,0 +1,523 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + add X, X, INCX + + fabs f0, f1 + fabs f2, f1 + fabs f3, f1 + fabs f4, f1 + fabs f5, f1 + fabs f6, f1 + fabs f7, f1 + fabs f1, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + + LFD f24, 0 * SIZE(X) + LFD f25, 1 * SIZE(X) + LFD f26, 2 * SIZE(X) + LFD f27, 3 * SIZE(X) + LFD f28, 4 * SIZE(X) + LFD f29, 5 * SIZE(X) + LFD f30, 6 * SIZE(X) + LFD f31, 7 * SIZE(X) + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + bdz LL(20) + .align 4 + +LL(10): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFD f24, 16 * SIZE(X) + LFD f25, 17 * SIZE(X) + LFD f26, 18 * SIZE(X) + LFD f27, 19 * SIZE(X) + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFD f28, 20 * SIZE(X) + LFD f29, 21 * SIZE(X) + LFD f30, 22 * SIZE(X) + LFD f31, 23 * SIZE(X) + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + addi X, X, 1 * SIZE + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdz LL(120) + .align 4 + +LL(110): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f1, f8, f0, f4 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/amax_cell.S b/kernel/power/amax_cell.S new file mode 100644 index 0000000..3f25e75 --- /dev/null +++ b/kernel/power/amax_cell.S @@ -0,0 +1,691 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + li PREA, 10 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + add X, X, INCX + + fabs f0, f1 + fabs f2, f1 + fabs f3, f1 + fabs f4, f1 + fabs f5, f1 + fabs f6, f1 + fabs f7, f1 + fabs f1, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(20) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(15) + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + bdz LL(13) + .align 4 + +LL(12): + fabs f8, f8 + LFD f10, 2 * SIZE(X) + fabs f9, f9 + LFD f11, 3 * SIZE(X) + fabs f10, f10 + LFD f12, 4 * SIZE(X) + fabs f11, f11 + LFD f13, 5 * SIZE(X) + fabs f12, f12 + LFD f14, 6 * SIZE(X) + fabs f13, f13 + LFD f15, 7 * SIZE(X) + + fabs f14, f14 + dcbt X, PREA + fabs f15, f15 + nop + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + + fsel f6, f22, f6, f14 + LFD f8, 8 * SIZE(X) + fsel f7, f23, f7, f15 + LFD f9, 9 * SIZE(X) + + fabs f8, f8 + LFD f10, 10 * SIZE(X) + fabs f9, f9 + LFD f11, 11 * SIZE(X) + fabs f10, f10 + LFD f12, 12 * SIZE(X) + fabs f11, f11 + LFD f13, 13 * SIZE(X) + fabs f12, f12 + LFD f14, 14 * SIZE(X) + fabs f13, f13 + LFD f15, 15 * SIZE(X) + + fabs f14, f14 + addi X, X, 16 * SIZE + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + LFD f8, 0 * SIZE(X) + + fsel f6, f22, f6, f14 + LFD f9, 1 * SIZE(X) + fsel f7, f23, f7, f15 + bdnz LL(12) + .align 4 + +LL(13): + fabs f8, f8 + LFD f10, 2 * SIZE(X) + fabs f9, f9 + LFD f11, 3 * SIZE(X) + fabs f10, f10 + LFD f12, 4 * SIZE(X) + fabs f11, f11 + LFD f13, 5 * SIZE(X) + fabs f12, f12 + LFD f14, 6 * SIZE(X) + fabs f13, f13 + LFD f15, 7 * SIZE(X) + + fabs f14, f14 + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + + fsel f6, f22, f6, f14 + LFD f8, 8 * SIZE(X) + fsel f7, f23, f7, f15 + LFD f9, 9 * SIZE(X) + + fabs f8, f8 + LFD f10, 10 * SIZE(X) + fabs f9, f9 + LFD f11, 11 * SIZE(X) + fabs f10, f10 + LFD f12, 12 * SIZE(X) + fabs f11, f11 + LFD f13, 13 * SIZE(X) + fabs f12, f12 + LFD f14, 14 * SIZE(X) + fabs f13, f13 + LFD f15, 15 * SIZE(X) + + fabs f14, f14 + addi X, X, 16 * SIZE + fabs f15, f15 + nop + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(16) + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + fabs f8, f8 + LFD f10, 2 * SIZE(X) + fabs f9, f9 + LFD f11, 3 * SIZE(X) + fabs f10, f10 + LFD f12, 4 * SIZE(X) + fabs f11, f11 + LFD f13, 5 * SIZE(X) + + fabs f12, f12 + LFD f14, 6 * SIZE(X) + fabs f13, f13 + LFD f15, 7 * SIZE(X) + fabs f14, f14 + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + nop + fsel f7, f23, f7, f15 + addi X, X, 8 * SIZE + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + fabs f8, f8 + LFD f10, 2 * SIZE(X) + fabs f9, f9 + LFD f11, 3 * SIZE(X) + fabs f10, f10 + fabs f11, f11 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + nop + fsel f3, f19, f3, f11 + addi X, X, 4 * SIZE + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + fabs f8, f8 + fabs f9, f9 + fsub f16, f0, f8 + fsub f17, f1, f9 + + fsel f0, f16, f0, f8 + nop + fsel f1, f17, f1, f9 + addi X, X, 2 * SIZE + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(999) + + LFD f8, 0 * SIZE(X) + fabs f8, f8 + fsub f16, f0, f8 + fsel f0, f16, f0, f8 + b LL(999) + .align 4 + +LL(20): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(25) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + + bdz LL(23) + .align 4 + +LL(22): + fabs f8, f8 + LFDUX f10, X, INCX + fabs f9, f9 + LFDUX f11, X, INCX + fabs f10, f10 + LFDUX f12, X, INCX + fabs f11, f11 + LFDUX f13, X, INCX + fabs f12, f12 + LFDUX f14, X, INCX + fabs f13, f13 + LFDUX f15, X, INCX + + fabs f14, f14 + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + + fsel f6, f22, f6, f14 + LFDUX f8, X, INCX + fsel f7, f23, f7, f15 + LFDUX f9, X, INCX + + fabs f8, f8 + LFDUX f10, X, INCX + fabs f9, f9 + LFDUX f11, X, INCX + fabs f10, f10 + LFDUX f12, X, INCX + fabs f11, f11 + LFDUX f13, X, INCX + fabs f12, f12 + LFDUX f14, X, INCX + fabs f13, f13 + LFDUX f15, X, INCX + + fabs f14, f14 + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + LFDUX f8, X, INCX + + fsel f6, f22, f6, f14 + LFDUX f9, X, INCX + fsel f7, f23, f7, f15 + bdnz LL(22) + .align 4 + +LL(23): + fabs f8, f8 + LFDUX f10, X, INCX + fabs f9, f9 + LFDUX f11, X, INCX + fabs f10, f10 + LFDUX f12, X, INCX + fabs f11, f11 + LFDUX f13, X, INCX + fabs f12, f12 + LFDUX f14, X, INCX + fabs f13, f13 + LFDUX f15, X, INCX + + fabs f14, f14 + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + + fsel f6, f22, f6, f14 + LFDUX f8, X, INCX + fsel f7, f23, f7, f15 + LFDUX f9, X, INCX + + fabs f8, f8 + LFDUX f10, X, INCX + fabs f9, f9 + LFDUX f11, X, INCX + fabs f10, f10 + LFDUX f12, X, INCX + fabs f11, f11 + LFDUX f13, X, INCX + fabs f12, f12 + LFDUX f14, X, INCX + fabs f13, f13 + LFDUX f15, X, INCX + + fabs f14, f14 + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + .align 4 + +LL(25): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(26) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + + fabs f8, f8 + LFDUX f10, X, INCX + fabs f9, f9 + LFDUX f11, X, INCX + fabs f10, f10 + LFDUX f12, X, INCX + fabs f11, f11 + LFDUX f13, X, INCX + + fabs f12, f12 + LFDUX f14, X, INCX + fabs f13, f13 + LFDUX f15, X, INCX + fabs f14, f14 + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + .align 4 + +LL(26): + andi. r0, N, 4 + beq LL(27) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + + fabs f8, f8 + LFDUX f10, X, INCX + fabs f9, f9 + LFDUX f11, X, INCX + fabs f10, f10 + fabs f11, f11 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + .align 4 + +LL(27): + andi. r0, N, 2 + beq LL(28) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + + fabs f8, f8 + fabs f9, f9 + fsub f16, f0, f8 + fsub f17, f1, f9 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + .align 4 + +LL(28): + andi. r0, N, 1 + beq LL(999) + + LFDUX f8, X, INCX + fabs f8, f8 + fsub f16, f0, f8 + fsel f0, f16, f0, f8 + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f1, f8, f0, f4 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/amax_hummer.S b/kernel/power/amax_hummer.S new file mode 100644 index 0000000..0d8b97d --- /dev/null +++ b/kernel/power/amax_hummer.S @@ -0,0 +1,540 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 + +#define F1 f12 +#define F2 f13 +#define F3 f14 +#define F4 f15 +#define F5 f16 +#define F6 f17 +#define F7 f18 +#define F8 f19 + +#define T1 f20 +#define T2 f21 +#define T3 f22 +#define T4 f23 +#define T5 f24 +#define T6 f25 +#define T7 f26 +#define T8 f27 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + stfpdux f24, SP, r10 + stfpdux f25, SP, r10 + stfpdux f26, SP, r10 + stfpdux f27, SP, r10 + + li r10, 0 + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + lfpdx C1, SP, r10 # Zero clear + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, INCX, 0 + ble LL(999) + + LFD C1, 0 * SIZE(X) + add X, X, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + fabs C1, C1 + ble LL(999) + + fsmfp C1, C1 + fpmr C2, C1 + fpmr C3, C1 + fpmr C4, C1 + + cmpwi cr0, INCX, SIZE + bne LL(100) + + andi. r0, X, 2 * SIZE - 1 + beq LL(05) + + LFD C2, 0 * SIZE(X) + add X, X, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + fabs C2, C2 + ble LL(998) + .align 4 + +LL(05): + sub X, X, INCX2 + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + LFPDUX A5, X, INCX2 + fpabs T1, A1 + LFPDUX A6, X, INCX2 + fpabs T2, A2 + LFPDUX A7, X, INCX2 + fpabs T3, A3 + LFPDUX A8, X, INCX2 + fpabs T4, A4 + bdz LL(13) + .align 4 + +LL(12): + fpsub F1, C1, T1 + LFPDUX A1, X, INCX2 + fpsub F2, C2, T2 + LFPDUX A2, X, INCX2 + fpsub F3, C3, T3 + LFPDUX A3, X, INCX2 + fpsub F4, C4, T4 + LFPDUX A4, X, INCX2 + + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsel C1, F1, C1, T1 + LFPDUX A5, X, INCX2 + fpsel C2, F2, C2, T2 + LFPDUX A6, X, INCX2 + fpsel C3, F3, C3, T3 + LFPDUX A7, X, INCX2 + fpsel C4, F4, C4, T4 + LFPDUX A8, X, INCX2 + + fpsub F5, C1, T5 + fpsub F6, C2, T6 + fpsub F7, C3, T7 + fpsub F8, C4, T8 + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + bdnz LL(12) + .align 4 + +LL(13): + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsub F1, C1, T1 + fpsub F2, C2, T2 + fpsub F3, C3, T3 + fpsub F4, C4, T4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + + fpsub F5, C1, T5 + fpsub F6, C2, T6 + fpsub F7, C3, T7 + fpsub F8, C4, T8 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(998) + + andi. r0, N, 8 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + fpabs A1, A1 + fpabs A2, A2 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFPDUX A1, X, INCX2 + fpabs A1, A1 + fpsub F1, C1, A1 + fpsel C1, F1, C1, A1 + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(998) + + LFDUX A1, X, INCX2 + fabs A1, A1 + fsub F1, C1, A1 + fsel C1, F1, C1, A1 + b LL(998) + .align 4 + + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + LFDUX A5, X, INCX + LFDUX A6, X, INCX + LFDUX A7, X, INCX + LFDUX A8, X, INCX + LFSDUX A5, X, INCX + fpabs T1, A1 + LFSDUX A6, X, INCX + fpabs T2, A2 + LFSDUX A7, X, INCX + fpabs T3, A3 + LFSDUX A8, X, INCX + fpabs T4, A4 + bdz LL(103) + .align 4 + +LL(102): + fpsub F1, C1, T1 + LFDUX A1, X, INCX + fpsub F2, C2, T2 + LFDUX A2, X, INCX + fpsub F3, C3, T3 + LFDUX A3, X, INCX + fpsub F4, C4, T4 + LFDUX A4, X, INCX + + fpabs T5, A5 + LFSDUX A1, X, INCX + fpabs T6, A6 + LFSDUX A2, X, INCX + fpabs T7, A7 + LFSDUX A3, X, INCX + fpabs T8, A8 + LFSDUX A4, X, INCX + + fpsel C1, F1, C1, T1 + LFDUX A5, X, INCX + fpsel C2, F2, C2, T2 + LFDUX A6, X, INCX + fpsel C3, F3, C3, T3 + LFDUX A7, X, INCX + fpsel C4, F4, C4, T4 + LFDUX A8, X, INCX + + fpsub F5, C1, T5 + LFSDUX A5, X, INCX + fpsub F6, C2, T6 + LFSDUX A6, X, INCX + fpsub F7, C3, T7 + LFSDUX A7, X, INCX + fpsub F8, C4, T8 + LFSDUX A8, X, INCX + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + bdnz LL(102) + .align 4 + +LL(103): + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsub F1, C1, T1 + fpsub F2, C2, T2 + fpsub F3, C3, T3 + fpsub F4, C4, T4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + + fpsub F5, C1, T5 + fpsub F6, C2, T6 + fpsub F7, C3, T7 + fpsub F8, C4, T8 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + .align 4 + +LL(105): + andi. r0, N, 15 + beq LL(998) + + andi. r0, N, 8 + beq LL(106) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(106): + andi. r0, N, 4 + beq LL(107) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + + fabs A1, A1 + fabs A2, A2 + fabs A3, A3 + fabs A4, A4 + + fsub F1, C1, A1 + fsub F2, C2, A2 + fsub F3, C3, A3 + fsub F4, C4, A4 + + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + fsel C3, F3, C3, A3 + fsel C4, F4, C4, A4 + .align 4 + +LL(107): + andi. r0, N, 2 + beq LL(108) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + fabs A1, A1 + fabs A2, A2 + fsub F1, C1, A1 + fsub F2, C2, A2 + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + .align 4 + +LL(108): + andi. r0, N, 1 + beq LL(998) + + LFDUX A1, X, INCX + fabs A1, A1 + fsub F1, C1, A1 + fsel C1, F1, C1, A1 + .align 4 + + +LL(998): + fpsub F1, C1, C2 + fpsub F2, C3, C4 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C1, C3 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C1, C2 + fsel C1, F1, C1, C2 + .align 4 + +LL(999): + li r10, 16 + + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/amax_ppc440.S b/kernel/power/amax_ppc440.S new file mode 100644 index 0000000..0184493 --- /dev/null +++ b/kernel/power/amax_ppc440.S @@ -0,0 +1,332 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREX r8 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + sub X, X, INCX + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFDUX f1, X, INCX + + fabs f0, f1 + li PREX, 3 * 16 * SIZE + fabs f2, f1 + fabs f3, f1 + fabs f4, f1 + fabs f5, f1 + fabs f6, f1 + fabs f7, f1 + fabs f1, f1 + + subi N, N, 1 + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDUX f25, X, INCX + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDUX f27, X, INCX + + fabs f12, f28 + LFDUX f28, X, INCX + fabs f13, f29 + LFDUX f29, X, INCX + fabs f14, f30 + LFDUX f30, X, INCX + fabs f15, f31 + LFDUX f31, X, INCX + bdz LL(120) + .align 4 + +LL(110): + fsub f16, f0, f8 +#ifdef PPCG4 + dcbt X, PREX +#endif + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + LFDUX f24, X, INCX + fsel f1, f17, f1, f9 + fabs f9, f25 + LFDUX f25, X, INCX + fsel f2, f18, f2, f10 + fabs f10, f26 + LFDUX f26, X, INCX + fsel f3, f19, f3, f11 + fabs f11, f27 + LFDUX f27, X, INCX + + fsel f4, f20, f4, f12 +#ifdef PPCG4 + dcbt X, PREX +#endif + fabs f12, f28 + LFDUX f28, X, INCX + fsel f5, f21, f5, f13 + fabs f13, f29 + LFDUX f29, X, INCX + fsel f6, f22, f6, f14 + fabs f14, f30 + LFDUX f30, X, INCX + fsel f7, f23, f7, f15 + fabs f15, f31 + LFDUX f31, X, INCX + + fsub f16, f0, f8 +#ifdef PPCG4 + dcbt X, PREX +#endif + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + LFDUX f24, X, INCX + fsel f1, f17, f1, f9 + fabs f9, f25 + LFDUX f25, X, INCX + fsel f2, f18, f2, f10 + fabs f10, f26 + LFDUX f26, X, INCX + fsel f3, f19, f3, f11 + fabs f11, f27 + LFDUX f27, X, INCX + + fsel f4, f20, f4, f12 +#ifdef PPCG4 + dcbt X, PREX +#endif + fabs f12, f28 + LFDUX f28, X, INCX + fsel f5, f21, f5, f13 + fabs f13, f29 + LFDUX f29, X, INCX + fsel f6, f22, f6, f14 + fabs f14, f30 + LFDUX f30, X, INCX + fsel f7, f23, f7, f15 + fabs f15, f31 + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f1, f8, f0, f4 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/amin.S b/kernel/power/amin.S new file mode 100644 index 0000000..01056c3 --- /dev/null +++ b/kernel/power/amin.S @@ -0,0 +1,523 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + add X, X, INCX + + fabs f0, f1 + fabs f2, f1 + fabs f3, f1 + fabs f4, f1 + fabs f5, f1 + fabs f6, f1 + fabs f7, f1 + fabs f1, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + + LFD f24, 0 * SIZE(X) + LFD f25, 1 * SIZE(X) + LFD f26, 2 * SIZE(X) + LFD f27, 3 * SIZE(X) + LFD f28, 4 * SIZE(X) + LFD f29, 5 * SIZE(X) + LFD f30, 6 * SIZE(X) + LFD f31, 7 * SIZE(X) + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + bdz LL(20) + .align 4 + +LL(10): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + fsel f1, f17, f9, f1 + fabs f9, f25 + fsel f2, f18, f10, f2 + fabs f10, f26 + fsel f3, f19, f11, f3 + fabs f11, f27 + + LFD f24, 16 * SIZE(X) + LFD f25, 17 * SIZE(X) + LFD f26, 18 * SIZE(X) + LFD f27, 19 * SIZE(X) + + fsel f4, f20, f12, f4 + fabs f12, f28 + fsel f5, f21, f13, f5 + fabs f13, f29 + fsel f6, f22, f14, f6 + fabs f14, f30 + fsel f7, f23, f15, f7 + fabs f15, f31 + + LFD f28, 20 * SIZE(X) + LFD f29, 21 * SIZE(X) + LFD f30, 22 * SIZE(X) + LFD f31, 23 * SIZE(X) + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + fsel f1, f17, f9, f1 + fabs f9, f25 + fsel f2, f18, f10, f2 + fabs f10, f26 + fsel f3, f19, f11, f3 + fabs f11, f27 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fsel f4, f20, f12, f4 + fabs f12, f28 + fsel f5, f21, f13, f5 + fabs f13, f29 + fsel f6, f22, f14, f6 + fabs f14, f30 + fsel f7, f23, f15, f7 + fabs f15, f31 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + fsel f1, f17, f9, f1 + fabs f9, f25 + fsel f2, f18, f10, f2 + fabs f10, f26 + fsel f3, f19, f11, f3 + fabs f11, f27 + + fsel f4, f20, f12, f4 + fabs f12, f28 + fsel f5, f21, f13, f5 + fabs f13, f29 + fsel f6, f22, f14, f6 + fabs f14, f30 + fsel f7, f23, f15, f7 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + fsel f6, f22, f14, f6 + fsel f7, f23, f15, f7 + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + addi X, X, 1 * SIZE + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdz LL(120) + .align 4 + +LL(110): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + fsel f1, f17, f9, f1 + fabs f9, f25 + fsel f2, f18, f10, f2 + fabs f10, f26 + fsel f3, f19, f11, f3 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fsel f4, f20, f12, f4 + fabs f12, f28 + fsel f5, f21, f13, f5 + fabs f13, f29 + fsel f6, f22, f14, f6 + fabs f14, f30 + fsel f7, f23, f15, f7 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + fsel f1, f17, f9, f1 + fabs f9, f25 + fsel f2, f18, f10, f2 + fabs f10, f26 + fsel f3, f19, f11, f3 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fsel f4, f20, f12, f4 + fabs f12, f28 + fsel f5, f21, f13, f5 + fabs f13, f29 + fsel f6, f22, f14, f6 + fabs f14, f30 + fsel f7, f23, f15, f7 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + fsel f1, f17, f9, f1 + fabs f9, f25 + fsel f2, f18, f10, f2 + fabs f10, f26 + fsel f3, f19, f11, f3 + fabs f11, f27 + + fsel f4, f20, f12, f4 + fabs f12, f28 + fsel f5, f21, f13, f5 + fabs f13, f29 + fsel f6, f22, f14, f6 + fabs f14, f30 + fsel f7, f23, f15, f7 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + fsel f6, f22, f14, f6 + fsel f7, f23, f15, f7 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f1, f0 + fsel f2, f9, f3, f2 + fsel f4, f10, f5, f4 + fsel f6, f11, f7, f6 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f2, f0 + fsel f4, f9, f6, f4 + + fsub f8, f0, f4 + fsel f1, f8, f4, f0 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/amin_cell.S b/kernel/power/amin_cell.S new file mode 100644 index 0000000..e4179f5 --- /dev/null +++ b/kernel/power/amin_cell.S @@ -0,0 +1,691 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + li PREA, 10 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + add X, X, INCX + + fabs f0, f1 + fabs f2, f1 + fabs f3, f1 + fabs f4, f1 + fabs f5, f1 + fabs f6, f1 + fabs f7, f1 + fabs f1, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(20) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(15) + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + bdz LL(13) + .align 4 + +LL(12): + fabs f8, f8 + LFD f10, 2 * SIZE(X) + fabs f9, f9 + LFD f11, 3 * SIZE(X) + fabs f10, f10 + LFD f12, 4 * SIZE(X) + fabs f11, f11 + LFD f13, 5 * SIZE(X) + fabs f12, f12 + LFD f14, 6 * SIZE(X) + fabs f13, f13 + LFD f15, 7 * SIZE(X) + + fabs f14, f14 + dcbt X, PREA + fabs f15, f15 + nop + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + + fsel f6, f22, f14, f6 + LFD f8, 8 * SIZE(X) + fsel f7, f23, f15, f7 + LFD f9, 9 * SIZE(X) + + fabs f8, f8 + LFD f10, 10 * SIZE(X) + fabs f9, f9 + LFD f11, 11 * SIZE(X) + fabs f10, f10 + LFD f12, 12 * SIZE(X) + fabs f11, f11 + LFD f13, 13 * SIZE(X) + fabs f12, f12 + LFD f14, 14 * SIZE(X) + fabs f13, f13 + LFD f15, 15 * SIZE(X) + + fabs f14, f14 + addi X, X, 16 * SIZE + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + LFD f8, 0 * SIZE(X) + + fsel f6, f22, f14, f6 + LFD f9, 1 * SIZE(X) + fsel f7, f23, f15, f7 + bdnz LL(12) + .align 4 + +LL(13): + fabs f8, f8 + LFD f10, 2 * SIZE(X) + fabs f9, f9 + LFD f11, 3 * SIZE(X) + fabs f10, f10 + LFD f12, 4 * SIZE(X) + fabs f11, f11 + LFD f13, 5 * SIZE(X) + fabs f12, f12 + LFD f14, 6 * SIZE(X) + fabs f13, f13 + LFD f15, 7 * SIZE(X) + + fabs f14, f14 + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + + fsel f6, f22, f14, f6 + LFD f8, 8 * SIZE(X) + fsel f7, f23, f15, f7 + LFD f9, 9 * SIZE(X) + + fabs f8, f8 + LFD f10, 10 * SIZE(X) + fabs f9, f9 + LFD f11, 11 * SIZE(X) + fabs f10, f10 + LFD f12, 12 * SIZE(X) + fabs f11, f11 + LFD f13, 13 * SIZE(X) + fabs f12, f12 + LFD f14, 14 * SIZE(X) + fabs f13, f13 + LFD f15, 15 * SIZE(X) + + fabs f14, f14 + addi X, X, 16 * SIZE + fabs f15, f15 + nop + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + fsel f6, f22, f14, f6 + fsel f7, f23, f15, f7 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(16) + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + fabs f8, f8 + LFD f10, 2 * SIZE(X) + fabs f9, f9 + LFD f11, 3 * SIZE(X) + fabs f10, f10 + LFD f12, 4 * SIZE(X) + fabs f11, f11 + LFD f13, 5 * SIZE(X) + + fabs f12, f12 + LFD f14, 6 * SIZE(X) + fabs f13, f13 + LFD f15, 7 * SIZE(X) + fabs f14, f14 + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + fsel f6, f22, f14, f6 + nop + fsel f7, f23, f15, f7 + addi X, X, 8 * SIZE + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + fabs f8, f8 + LFD f10, 2 * SIZE(X) + fabs f9, f9 + LFD f11, 3 * SIZE(X) + fabs f10, f10 + fabs f11, f11 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + nop + fsel f3, f19, f11, f3 + addi X, X, 4 * SIZE + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + fabs f8, f8 + fabs f9, f9 + fsub f16, f0, f8 + fsub f17, f1, f9 + + fsel f0, f16, f8, f0 + nop + fsel f1, f17, f9, f1 + addi X, X, 2 * SIZE + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(999) + + LFD f8, 0 * SIZE(X) + fabs f8, f8 + fsub f16, f0, f8 + fsel f0, f16, f8, f0 + b LL(999) + .align 4 + +LL(20): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(25) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + + bdz LL(23) + .align 4 + +LL(22): + fabs f8, f8 + LFDUX f10, X, INCX + fabs f9, f9 + LFDUX f11, X, INCX + fabs f10, f10 + LFDUX f12, X, INCX + fabs f11, f11 + LFDUX f13, X, INCX + fabs f12, f12 + LFDUX f14, X, INCX + fabs f13, f13 + LFDUX f15, X, INCX + + fabs f14, f14 + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + + fsel f6, f22, f14, f6 + LFDUX f8, X, INCX + fsel f7, f23, f15, f7 + LFDUX f9, X, INCX + + fabs f8, f8 + LFDUX f10, X, INCX + fabs f9, f9 + LFDUX f11, X, INCX + fabs f10, f10 + LFDUX f12, X, INCX + fabs f11, f11 + LFDUX f13, X, INCX + fabs f12, f12 + LFDUX f14, X, INCX + fabs f13, f13 + LFDUX f15, X, INCX + + fabs f14, f14 + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + LFDUX f8, X, INCX + + fsel f6, f22, f14, f6 + LFDUX f9, X, INCX + fsel f7, f23, f15, f7 + bdnz LL(22) + .align 4 + +LL(23): + fabs f8, f8 + LFDUX f10, X, INCX + fabs f9, f9 + LFDUX f11, X, INCX + fabs f10, f10 + LFDUX f12, X, INCX + fabs f11, f11 + LFDUX f13, X, INCX + fabs f12, f12 + LFDUX f14, X, INCX + fabs f13, f13 + LFDUX f15, X, INCX + + fabs f14, f14 + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + + fsel f6, f22, f14, f6 + LFDUX f8, X, INCX + fsel f7, f23, f15, f7 + LFDUX f9, X, INCX + + fabs f8, f8 + LFDUX f10, X, INCX + fabs f9, f9 + LFDUX f11, X, INCX + fabs f10, f10 + LFDUX f12, X, INCX + fabs f11, f11 + LFDUX f13, X, INCX + fabs f12, f12 + LFDUX f14, X, INCX + fabs f13, f13 + LFDUX f15, X, INCX + + fabs f14, f14 + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + fsel f6, f22, f14, f6 + fsel f7, f23, f15, f7 + .align 4 + +LL(25): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(26) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + + fabs f8, f8 + LFDUX f10, X, INCX + fabs f9, f9 + LFDUX f11, X, INCX + fabs f10, f10 + LFDUX f12, X, INCX + fabs f11, f11 + LFDUX f13, X, INCX + + fabs f12, f12 + LFDUX f14, X, INCX + fabs f13, f13 + LFDUX f15, X, INCX + fabs f14, f14 + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + fsel f6, f22, f14, f6 + fsel f7, f23, f15, f7 + .align 4 + +LL(26): + andi. r0, N, 4 + beq LL(27) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + + fabs f8, f8 + LFDUX f10, X, INCX + fabs f9, f9 + LFDUX f11, X, INCX + fabs f10, f10 + fabs f11, f11 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + .align 4 + +LL(27): + andi. r0, N, 2 + beq LL(28) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + + fabs f8, f8 + fabs f9, f9 + fsub f16, f0, f8 + fsub f17, f1, f9 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + .align 4 + +LL(28): + andi. r0, N, 1 + beq LL(999) + + LFDUX f8, X, INCX + fabs f8, f8 + fsub f16, f0, f8 + fsel f0, f16, f8, f0 + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f1, f0 + fsel f2, f9, f3, f2 + fsel f4, f10, f5, f4 + fsel f6, f11, f7, f6 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f2, f0 + fsel f4, f9, f6, f4 + + fsub f8, f0, f4 + fsel f1, f8, f4, f0 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/amin_hummer.S b/kernel/power/amin_hummer.S new file mode 100644 index 0000000..f4bbf07 --- /dev/null +++ b/kernel/power/amin_hummer.S @@ -0,0 +1,539 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 + +#define F1 f12 +#define F2 f13 +#define F3 f14 +#define F4 f15 +#define F5 f16 +#define F6 f17 +#define F7 f18 +#define F8 f19 + +#define T1 f20 +#define T2 f21 +#define T3 f22 +#define T4 f23 +#define T5 f24 +#define T6 f25 +#define T7 f26 +#define T8 f27 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + stfpdux f24, SP, r10 + stfpdux f25, SP, r10 + stfpdux f26, SP, r10 + stfpdux f27, SP, r10 + + li r10, 0 + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + lfpdx C1, SP, r10 # Zero clear + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, INCX, 0 + ble LL(999) + + LFD C1, 0 * SIZE(X) + add X, X, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + fabs C1, C1 + ble LL(999) + + fsmfp C1, C1 + fpmr C2, C1 + fpmr C3, C1 + fpmr C4, C1 + + cmpwi cr0, INCX, SIZE + bne LL(100) + + andi. r0, X, 2 * SIZE - 1 + beq LL(05) + + LFD C2, 0 * SIZE(X) + add X, X, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + fabs C2, C2 + ble LL(998) + .align 4 + +LL(05): + sub X, X, INCX2 + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + LFPDUX A5, X, INCX2 + fpabs T1, A1 + LFPDUX A6, X, INCX2 + fpabs T2, A2 + LFPDUX A7, X, INCX2 + fpabs T3, A3 + LFPDUX A8, X, INCX2 + fpabs T4, A4 + bdz LL(13) + .align 4 + +LL(12): + fpsub F1, T1, C1 + LFPDUX A1, X, INCX2 + fpsub F2, T2, C2 + LFPDUX A2, X, INCX2 + fpsub F3, T3, C3 + LFPDUX A3, X, INCX2 + fpsub F4, T4, C4 + LFPDUX A4, X, INCX2 + + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsel C1, F1, C1, T1 + LFPDUX A5, X, INCX2 + fpsel C2, F2, C2, T2 + LFPDUX A6, X, INCX2 + fpsel C3, F3, C3, T3 + LFPDUX A7, X, INCX2 + fpsel C4, F4, C4, T4 + LFPDUX A8, X, INCX2 + + fpsub F5, T5, C1 + fpsub F6, T6, C2 + fpsub F7, T7, C3 + fpsub F8, T8, C4 + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + bdnz LL(12) + .align 4 + +LL(13): + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsub F1, T1, C1 + fpsub F2, T2, C2 + fpsub F3, T3, C3 + fpsub F4, T4, C4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + + fpsub F5, T5, C1 + fpsub F6, T6, C2 + fpsub F7, T7, C3 + fpsub F8, T8, C4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(998) + + andi. r0, N, 8 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpsub F1, A1, C1 + fpsub F2, A2, C2 + fpsub F3, A3, C3 + fpsub F4, A4, C4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + fpabs A1, A1 + fpabs A2, A2 + + fpsub F1, A1, C1 + fpsub F2, A2, C2 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFPDUX A1, X, INCX2 + fpabs A1, A1 + fpsub F1, A1, C1 + fpsel C1, F1, C1, A1 + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(998) + + LFDUX A1, X, INCX2 + fabs A1, A1 + fsub F1, A1, C1 + fsel C1, F1, C1, A1 + b LL(998) + .align 4 + + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + LFDUX A5, X, INCX + LFDUX A6, X, INCX + LFDUX A7, X, INCX + LFDUX A8, X, INCX + LFSDUX A5, X, INCX + fpabs T1, A1 + LFSDUX A6, X, INCX + fpabs T2, A2 + LFSDUX A7, X, INCX + fpabs T3, A3 + LFSDUX A8, X, INCX + fpabs T4, A4 + bdz LL(103) + .align 4 + +LL(102): + fpsub F1, T1, C1 + LFDUX A1, X, INCX + fpsub F2, T2, C2 + LFDUX A2, X, INCX + fpsub F3, T3, C3 + LFDUX A3, X, INCX + fpsub F4, T4, C4 + LFDUX A4, X, INCX + + fpabs T5, A5 + LFSDUX A1, X, INCX + fpabs T6, A6 + LFSDUX A2, X, INCX + fpabs T7, A7 + LFSDUX A3, X, INCX + fpabs T8, A8 + LFSDUX A4, X, INCX + + fpsel C1, F1, C1, T1 + LFDUX A5, X, INCX + fpsel C2, F2, C2, T2 + LFDUX A6, X, INCX + fpsel C3, F3, C3, T3 + LFDUX A7, X, INCX + fpsel C4, F4, C4, T4 + LFDUX A8, X, INCX + + fpsub F5, T5, C1 + LFSDUX A5, X, INCX + fpsub F6, T6, C2 + LFSDUX A6, X, INCX + fpsub F7, T7, C3 + LFSDUX A7, X, INCX + fpsub F8, T8, C4 + LFSDUX A8, X, INCX + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + bdnz LL(102) + .align 4 + +LL(103): + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsub F1, T1, C1 + fpsub F2, T2, C2 + fpsub F3, T3, C3 + fpsub F4, T4, C4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + + fpsub F5, T5, C1 + fpsub F6, T6, C2 + fpsub F7, T7, C3 + fpsub F8, T8, C4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + .align 4 + +LL(105): + andi. r0, N, 15 + beq LL(998) + + andi. r0, N, 8 + beq LL(106) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpsub F1, A1, C1 + fpsub F2, A2, C2 + fpsub F3, A3, C3 + fpsub F4, A4, C4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(106): + andi. r0, N, 4 + beq LL(107) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + + fabs A1, A1 + fabs A2, A2 + fabs A3, A3 + fabs A4, A4 + + fsub F1, A1, C1 + fsub F2, A2, C2 + fsub F3, A3, C3 + fsub F4, A4, C4 + + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + fsel C3, F3, C3, A3 + fsel C4, F4, C4, A4 + .align 4 + +LL(107): + andi. r0, N, 2 + beq LL(108) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + fabs A1, A1 + fabs A2, A2 + fsub F1, A1, C1 + fsub F2, A2, C2 + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + .align 4 + +LL(108): + andi. r0, N, 1 + beq LL(998) + + LFDUX A1, X, INCX + fabs A1, A1 + fsub F1, A1, C1 + fsel C1, F1, C1, A1 + .align 4 + +LL(998): + fpsub F1, C2, C1 + fpsub F2, C4, C3 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C3, C1 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C2, C1 + fsel C1, F1, C1, C2 + .align 4 + +LL(999): + li r10, 16 + + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/amin_ppc440.S b/kernel/power/amin_ppc440.S new file mode 100644 index 0000000..b47742b --- /dev/null +++ b/kernel/power/amin_ppc440.S @@ -0,0 +1,333 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 +#define INC1 r6 + +#define PREX r8 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + sub X, X, INCX + li INC1, SIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFDUX f1, X, INCX + + fabs f0, f1 + li PREX, 3 * 16 * SIZE + fabs f2, f1 + fabs f3, f1 + fabs f4, f1 + fabs f5, f1 + subi N, N, 1 + fabs f6, f1 + srawi. r0, N, 4 + fabs f7, f1 + mtspr CTR, r0 + fabs f1, f1 + beq- LL(150) + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDUX f25, X, INCX + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDUX f27, X, INCX + + fabs f12, f28 + LFDUX f28, X, INCX + fabs f13, f29 + LFDUX f29, X, INCX + fabs f14, f30 + LFDUX f30, X, INCX + fabs f15, f31 + LFDUX f31, X, INCX + bdz LL(120) + .align 4 + +LL(110): + fsub f16, f0, f8 +#ifdef PPCG4 + dcbt X, PREX +#endif + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + LFDUX f24, X, INCX + fsel f1, f17, f9, f1 + fabs f9, f25 + LFDUX f25, X, INCX + fsel f2, f18, f10, f2 + fabs f10, f26 + LFDUX f26, X, INCX + fsel f3, f19, f11, f3 + fabs f11, f27 + LFDUX f27, X, INCX + + fsel f4, f20, f12, f4 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PREX +#endif + fabs f12, f28 + LFDUX f28, X, INCX + fsel f5, f21, f13, f5 + fabs f13, f29 + LFDUX f29, X, INCX + fsel f6, f22, f14, f6 + fabs f14, f30 + LFDUX f30, X, INCX + fsel f7, f23, f15, f7 + fabs f15, f31 + LFDUX f31, X, INCX + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 +#ifdef PPCG4 + dcbt X, PREX +#endif + fabs f8, f24 + LFDUX f24, X, INCX + fsel f1, f17, f9, f1 + fabs f9, f25 + LFDUX f25, X, INCX + fsel f2, f18, f10, f2 + fabs f10, f26 + LFDUX f26, X, INCX + fsel f3, f19, f11, f3 + fabs f11, f27 + LFDUX f27, X, INCX + + fsel f4, f20, f12, f4 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PREX +#endif + fabs f12, f28 + LFDUX f28, X, INCX + fsel f5, f21, f13, f5 + fabs f13, f29 + LFDUX f29, X, INCX + fsel f6, f22, f14, f6 + fabs f14, f30 + LFDUX f30, X, INCX + fsel f7, f23, f15, f7 + fabs f15, f31 + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + fsel f1, f17, f9, f1 + fabs f9, f25 + fsel f2, f18, f10, f2 + fabs f10, f26 + fsel f3, f19, f11, f3 + fabs f11, f27 + + fsel f4, f20, f12, f4 + fabs f12, f28 + fsel f5, f21, f13, f5 + fabs f13, f29 + fsel f6, f22, f14, f6 + fabs f14, f30 + fsel f7, f23, f15, f7 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + fsel f6, f22, f14, f6 + fsel f7, f23, f15, f7 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f1, f0 + fsel f2, f9, f3, f2 + fsel f4, f10, f5, f4 + fsel f6, f11, f7, f6 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f2, f0 + fsel f4, f9, f6, f4 + + fsub f8, f0, f4 + fsel f1, f8, f4, f0 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/asum.S b/kernel/power/asum.S new file mode 100644 index 0000000..1188aa5 --- /dev/null +++ b/kernel/power/asum.S @@ -0,0 +1,448 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 + +#define FZERO f0 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + fmr f1, FZERO + fmr f2, FZERO + fmr f3, FZERO + fmr f4, FZERO + fmr f5, FZERO + fmr f6, FZERO + fmr f7, FZERO + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, 0 + ble- LL(999) + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + LFD f10, 2 * SIZE(X) + LFD f11, 3 * SIZE(X) + LFD f12, 4 * SIZE(X) + LFD f13, 5 * SIZE(X) + LFD f14, 6 * SIZE(X) + LFD f15, 7 * SIZE(X) + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + + fabs f16, f8 + fabs f17, f9 + fabs f18, f10 + fabs f19, f11 + + fabs f20, f12 + fabs f21, f13 + fabs f22, f14 + fabs f23, f15 + bdz LL(20) + .align 4 + +LL(10): + FADD f0, f0, f16 + fabs f16, f24 + FADD f1, f1, f17 + fabs f17, f25 + + FADD f2, f2, f18 + fabs f18, f26 + FADD f3, f3, f19 + fabs f19, f27 + + LFD f8, 16 * SIZE(X) + LFD f9, 17 * SIZE(X) + LFD f10, 18 * SIZE(X) + LFD f11, 19 * SIZE(X) + + FADD f4, f4, f20 + fabs f20, f28 + FADD f5, f5, f21 + fabs f21, f29 + + FADD f6, f6, f22 + fabs f22, f30 + FADD f7, f7, f23 + fabs f23, f31 + + LFD f12, 20 * SIZE(X) + LFD f13, 21 * SIZE(X) + LFD f14, 22 * SIZE(X) + LFD f15, 23 * SIZE(X) + + FADD f0, f0, f16 + fabs f16, f8 + FADD f1, f1, f17 + fabs f17, f9 + + FADD f2, f2, f18 + fabs f18, f10 + FADD f3, f3, f19 + fabs f19, f11 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + FADD f4, f4, f20 + fabs f20, f12 + FADD f5, f5, f21 + fabs f21, f13 + + FADD f6, f6, f22 + fabs f22, f14 + FADD f7, f7, f23 + fabs f23, f15 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + FADD f0, f0, f16 + fabs f16, f24 + FADD f1, f1, f17 + fabs f17, f25 + + FADD f2, f2, f18 + fabs f18, f26 + FADD f3, f3, f19 + fabs f19, f27 + + FADD f4, f4, f20 + fabs f20, f28 + FADD f5, f5, f21 + fabs f21, f29 + + FADD f6, f6, f22 + fabs f22, f30 + FADD f7, f7, f23 + fabs f23, f31 + + FADD f0, f0, f16 + FADD f1, f1, f17 + FADD f2, f2, f18 + FADD f3, f3, f19 + + FADD f4, f4, f20 + FADD f5, f5, f21 + FADD f6, f6, f22 + FADD f7, f7, f23 + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + addi X, X, 1 * SIZE + + fabs f8, f8 + FADD f0, f0, f8 + + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + LFDUX f10, X, INCX + LFDUX f11, X, INCX + LFDUX f12, X, INCX + LFDUX f13, X, INCX + LFDUX f14, X, INCX + LFDUX f15, X, INCX + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fabs f16, f8 + fabs f17, f9 + fabs f18, f10 + fabs f19, f11 + + fabs f20, f12 + fabs f21, f13 + fabs f22, f14 + fabs f23, f15 + bdz LL(120) + .align 4 + +LL(110): + FADD f0, f0, f16 + fabs f16, f24 + FADD f1, f1, f17 + fabs f17, f25 + + FADD f2, f2, f18 + fabs f18, f26 + FADD f3, f3, f19 + fabs f19, f27 + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + LFDUX f10, X, INCX + LFDUX f11, X, INCX + + FADD f4, f4, f20 + fabs f20, f28 + FADD f5, f5, f21 + fabs f21, f29 + + FADD f6, f6, f22 + fabs f22, f30 + FADD f7, f7, f23 + fabs f23, f31 + + LFDUX f12, X, INCX + LFDUX f13, X, INCX + LFDUX f14, X, INCX + LFDUX f15, X, INCX + + FADD f0, f0, f16 + fabs f16, f8 + FADD f1, f1, f17 + fabs f17, f9 + + FADD f2, f2, f18 + fabs f18, f10 + FADD f3, f3, f19 + fabs f19, f11 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + FADD f4, f4, f20 + fabs f20, f12 + FADD f5, f5, f21 + fabs f21, f13 + + FADD f6, f6, f22 + fabs f22, f14 + FADD f7, f7, f23 + fabs f23, f15 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + FADD f0, f0, f16 + fabs f16, f24 + FADD f1, f1, f17 + fabs f17, f25 + + FADD f2, f2, f18 + fabs f18, f26 + FADD f3, f3, f19 + fabs f19, f27 + + FADD f4, f4, f20 + fabs f20, f28 + FADD f5, f5, f21 + fabs f21, f29 + + FADD f6, f6, f22 + fabs f22, f30 + FADD f7, f7, f23 + fabs f23, f31 + + FADD f0, f0, f16 + FADD f1, f1, f17 + FADD f2, f2, f18 + FADD f3, f3, f19 + + FADD f4, f4, f20 + FADD f5, f5, f21 + FADD f6, f6, f22 + FADD f7, f7, f23 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fabs f8, f8 + FADD f0, f0, f8 + bdnz LL(160) + .align 4 + +LL(999): + FADD f0, f0, f1 + FADD f2, f2, f3 + FADD f4, f4, f5 + FADD f6, f6, f7 + + FADD f0, f0, f2 + FADD f4, f4, f6 + FADD f1, f0, f4 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/asum_cell.S b/kernel/power/asum_cell.S new file mode 100644 index 0000000..076651f --- /dev/null +++ b/kernel/power/asum_cell.S @@ -0,0 +1,599 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 + +#define FZERO f0 + +#define STACKSIZE 16 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stw r0, 0(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + lfs FZERO, 0(SP) + + slwi INCX, INCX, BASE_SHIFT + fmr f1, FZERO + li PREA, 8 * 16 * SIZE + fmr f2, FZERO + + cmpwi cr0, N, 0 + fmr f3, FZERO + ble- LL(999) + + cmpwi cr0, INCX, 0 + ble- LL(999) + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(20) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(15) + .align 4 + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + fabs f4, f8 + LFD f10, 2 * SIZE(X) + fabs f5, f9 + LFD f11, 3 * SIZE(X) + fabs f6, f10 + LFD f8, 4 * SIZE(X) + fabs f7, f11 + bdz LL(13) + .align 4 + +LL(12): + FADD f0, f0, f4 + dcbt X, PREA + fabs f4, f8 + LFD f9, 5 * SIZE(X) + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFD f10, 6 * SIZE(X) + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFD f11, 7 * SIZE(X) + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFD f8, 8 * SIZE(X) + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFD f9, 9 * SIZE(X) + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFD f10, 10 * SIZE(X) + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFD f11, 11 * SIZE(X) + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFD f8, 12 * SIZE(X) + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFD f9, 13 * SIZE(X) + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFD f10, 14 * SIZE(X) + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFD f11, 15 * SIZE(X) + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFD f8, 16 * SIZE(X) + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFD f9, 17 * SIZE(X) + + FADD f1, f1, f5 + addi X, X, 16 * SIZE + fabs f5, f9 + LFD f10, 2 * SIZE(X) + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFD f11, 3 * SIZE(X) + + FADD f3, f3, f7 + LFD f8, 4 * SIZE(X) + fabs f7, f11 + bdnz LL(12) + .align 4 + +LL(13): + FADD f0, f0, f4 + nop + fabs f4, f8 + LFD f9, 5 * SIZE(X) + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFD f10, 6 * SIZE(X) + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFD f11, 7 * SIZE(X) + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFD f8, 8 * SIZE(X) + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFD f9, 9 * SIZE(X) + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFD f10, 10 * SIZE(X) + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFD f11, 11 * SIZE(X) + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFD f8, 12 * SIZE(X) + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFD f9, 13 * SIZE(X) + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFD f10, 14 * SIZE(X) + + FADD f2, f2, f6 + addi X, X, 16 * SIZE + fabs f6, f10 + LFD f11, -1 * SIZE(X) + + FADD f3, f3, f7 + fabs f7, f11 + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(16) + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + fabs f4, f8 + LFD f10, 2 * SIZE(X) + fabs f5, f9 + LFD f11, 3 * SIZE(X) + fabs f6, f10 + LFD f8, 4 * SIZE(X) + fabs f7, f11 + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFD f9, 5 * SIZE(X) + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFD f10, 6 * SIZE(X) + + FADD f2, f2, f6 + addi X, X, 8 * SIZE + fabs f6, f10 + LFD f11, -1 * SIZE(X) + + FADD f3, f3, f7 + fabs f7, f11 + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + fabs f4, f8 + LFD f10, 2 * SIZE(X) + fabs f5, f9 + LFD f11, 3 * SIZE(X) + fabs f6, f10 + addi X, X, 4 * SIZE + fabs f7, f11 + nop + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + fabs f4, f8 + fabs f5, f9 + + FADD f0, f0, f4 + addi X, X, 2 * SIZE + FADD f1, f1, f5 + nop + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(999) + + LFD f8, 0 * SIZE(X) + fabs f4, f8 + FADD f0, f0, f4 + b LL(999) + .align 4 + +LL(20): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(25) + .align 4 + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + + fabs f4, f8 + LFDUX f10, X, INCX + fabs f5, f9 + LFDUX f11, X, INCX + fabs f6, f10 + LFDUX f8, X, INCX + fabs f7, f11 + bdz LL(23) + .align 4 + +LL(22): + FADD f0, f0, f4 + dcbt X, PREA + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDUX f10, X, INCX + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFDUX f8, X, INCX + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDUX f10, X, INCX + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFDUX f8, X, INCX + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDUX f10, X, INCX + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFDUX f8, X, INCX + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDUX f10, X, INCX + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + LFDUX f8, X, INCX + fabs f7, f11 + bdnz LL(22) + .align 4 + +LL(23): + FADD f0, f0, f4 + nop + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDUX f10, X, INCX + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFDUX f8, X, INCX + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDUX f10, X, INCX + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFDUX f8, X, INCX + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDUX f10, X, INCX + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + fabs f7, f11 + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + .align 4 + +LL(25): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(26) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + + fabs f4, f8 + LFDUX f10, X, INCX + fabs f5, f9 + LFDUX f11, X, INCX + fabs f6, f10 + LFDUX f8, X, INCX + fabs f7, f11 + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDUX f10, X, INCX + + FADD f2, f2, f6 + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + fabs f7, f11 + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + .align 4 + +LL(26): + andi. r0, N, 4 + beq LL(27) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + + fabs f4, f8 + LFDUX f10, X, INCX + fabs f5, f9 + LFDUX f11, X, INCX + + fabs f6, f10 + fabs f7, f11 + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + .align 4 + +LL(27): + andi. r0, N, 2 + beq LL(28) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + + fabs f4, f8 + fabs f5, f9 + + FADD f0, f0, f4 + FADD f1, f1, f5 + .align 4 + +LL(28): + andi. r0, N, 1 + beq LL(999) + + LFDUX f8, X, INCX + fabs f4, f8 + FADD f0, f0, f4 + .align 4 + +LL(999): + FADD f0, f0, f1 + FADD f2, f2, f3 + + FADD f1, f0, f2 + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/asum_hummer.S b/kernel/power/asum_hummer.S new file mode 100644 index 0000000..9906a44 --- /dev/null +++ b/kernel/power/asum_hummer.S @@ -0,0 +1,455 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 + +#define T1 f12 +#define T2 f13 +#define T3 f14 +#define T4 f15 + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + li r10, 0 + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + lfpdx C1, SP, r10 # Zero clear + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + fpmr C2, C1 + fpmr C3, C1 + fpmr C4, C1 + + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, INCX, 0 + ble LL(999) + + cmpwi cr0, INCX, SIZE + bne LL(100) + + andi. r0, X, 2 * SIZE - 1 + beq LL(05) + + LFD C1, 0(X) + addi X, X, 1 * SIZE + addi N, N, -1 + cmpwi cr0, N, 0 + fabs C1, C1 + ble LL(999) + .align 4 + +LL(05): + srawi. r0, N, 4 + sub X, X, INCX2 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + fpmr T1, C2 + LFPDUX A2, X, INCX2 + fpmr T2, C2 + LFPDUX A3, X, INCX2 + fpmr T3, C2 + LFPDUX A4, X, INCX2 + fpmr T4, C2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + bdz LL(13) + .align 4 + +LL(12): + fpadd C1, C1, T1 + nop + fpabs T1, A1 + LFPDUX A1, X, INCX2 + + fpadd C2, C2, T2 + nop + fpabs T2, A2 + LFPDUX A2, X, INCX2 + + fpadd C3, C3, T3 + nop + fpabs T3, A3 + LFPDUX A3, X, INCX2 + + fpadd C4, C4, T4 + nop + fpabs T4, A4 + LFPDUX A4, X, INCX2 + + fpadd C1, C1, T1 + nop + fpabs T1, A5 + LFPDUX A5, X, INCX2 + + fpadd C2, C2, T2 + nop + fpabs T2, A6 + LFPDUX A6, X, INCX2 + + fpadd C3, C3, T3 + nop + fpabs T3, A7 + LFPDUX A7, X, INCX2 + + fpadd C4, C4, T4 + fpabs T4, A8 + LFPDUX A8, X, INCX2 + bdnz LL(12) + .align 4 + +LL(13): + fpadd C1, C1, T1 + fpabs T1, A1 + fpadd C2, C2, T2 + fpabs T2, A2 + fpadd C3, C3, T3 + fpabs T3, A3 + fpadd C4, C4, T4 + fpabs T4, A4 + + fpadd C1, C1, T1 + fpabs T1, A5 + fpadd C2, C2, T2 + fpabs T2, A6 + fpadd C3, C3, T3 + fpabs T3, A7 + fpadd C4, C4, T4 + fpabs T4, A8 + + fpadd C1, C1, T1 + fpadd C2, C2, T2 + fpadd C3, C3, T3 + fpadd C4, C4, T4 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(999) + andi. r0, N, 8 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpadd C1, C1, T1 + fpadd C2, C2, T2 + fpadd C3, C3, T3 + fpadd C4, C4, T4 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + fpabs T1, A1 + fpabs T2, A2 + + fpadd C1, C1, T1 + fpadd C2, C2, T2 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFPDUX A1, X, INCX2 + fpabs T1, A1 + fpadd C1, C1, T1 + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(999) + + LFDX A1, X, INCX2 + fabs T1, A1 + fadd C1, C1, T1 + b LL(999) + .align 4 + +LL(100): + sub X2, X, INCX + sub X, X, INCX2 + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(115) + + + LFDUX A1, X, INCX2 + fpmr T1, C2 + LFDUX A2, X, INCX2 + fpmr T2, C2 + LFDUX A3, X, INCX2 + fpmr T3, C2 + LFDUX A4, X, INCX2 + fpmr T4, C2 + + LFDUX A5, X, INCX2 + LFSDUX A1, X2, INCX2 + + LFDUX A6, X, INCX2 + LFSDUX A2, X2, INCX2 + + LFDUX A7, X, INCX2 + LFSDUX A3, X2, INCX2 + + LFDUX A8, X, INCX2 + LFSDUX A4, X2, INCX2 + bdz LL(113) + .align 4 + +LL(112): + fpadd C1, C1, T1 + LFSDUX A5, X2, INCX2 + fpabs T1, A1 + LFDUX A1, X, INCX2 + + fpadd C2, C2, T2 + LFSDUX A6, X2, INCX2 + fpabs T2, A2 + LFDUX A2, X, INCX2 + + fpadd C3, C3, T3 + LFSDUX A7, X2, INCX2 + fpabs T3, A3 + LFDUX A3, X, INCX2 + + fpadd C4, C4, T4 + LFSDUX A8, X2, INCX2 + fpabs T4, A4 + LFDUX A4, X, INCX2 + + fpadd C1, C1, T1 + LFSDUX A1, X2, INCX2 + fpabs T1, A5 + LFDUX A5, X, INCX2 + fpadd C2, C2, T2 + LFSDUX A2, X2, INCX2 + fpabs T2, A6 + LFDUX A6, X, INCX2 + + fpadd C3, C3, T3 + LFSDUX A3, X2, INCX2 + fpabs T3, A7 + LFDUX A7, X, INCX2 + fpadd C4, C4, T4 + LFSDUX A4, X2, INCX2 + fpabs T4, A8 + LFDUX A8, X, INCX2 + + bdnz LL(112) + .align 4 + +LL(113): + fpadd C1, C1, T1 + nop + fpabs T1, A1 + LFSDUX A5, X2, INCX2 + fpadd C2, C2, T2 + nop + fpabs T2, A2 + LFSDUX A6, X2, INCX2 + fpadd C3, C3, T3 + + nop + fpabs T3, A3 + LFSDUX A7, X2, INCX2 + fpadd C4, C4, T4 + nop + fpabs T4, A4 + LFSDUX A8, X2, INCX2 + + fpadd C1, C1, T1 + fpabs T1, A5 + fpadd C2, C2, T2 + fpabs T2, A6 + fpadd C3, C3, T3 + fpabs T3, A7 + fpadd C4, C4, T4 + fpabs T4, A8 + + fpadd C1, C1, T1 + fpadd C2, C2, T2 + fpadd C3, C3, T3 + fpadd C4, C4, T4 + .align 4 + +LL(115): + andi. r0, N, 15 + beq LL(999) + andi. r0, N, 8 + beq LL(116) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + + fabs T1, A1 + LFDUX A5, X, INCX2 + fabs T2, A2 + LFDUX A6, X2, INCX2 + fabs T3, A3 + LFDUX A7, X, INCX2 + fabs T4, A4 + LFDUX A8, X2, INCX2 + + fadd C1, C1, T1 + fabs T1, A5 + fadd C2, C2, T2 + fabs T2, A6 + + fadd C3, C3, T3 + fabs T3, A7 + fadd C4, C4, T4 + fabs T4, A8 + + fadd C1, C1, T1 + fadd C2, C2, T2 + fadd C3, C3, T3 + fadd C4, C4, T4 + .align 4 + +LL(116): + andi. r0, N, 4 + beq LL(117) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + + fabs T1, A1 + fabs T2, A2 + fabs T3, A3 + fabs T4, A4 + + fadd C1, C1, T1 + fadd C2, C2, T2 + fadd C3, C3, T3 + fadd C4, C4, T4 + .align 4 + +LL(117): + andi. r0, N, 2 + beq LL(118) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + + fabs T1, A1 + fabs T2, A2 + fadd C1, C1, T1 + fadd C2, C2, T2 + .align 4 + +LL(118): + andi. r0, N, 1 + beq LL(999) + + LFDX A1, X, INCX2 + fabs T1, A1 + fadd C1, C1, T1 + .align 4 + +LL(999): + fpadd C1, C1, C2 + li r10, 16 + fpadd C3, C3, C4 + fpadd C1, C1, C3 + lfpdux f15, SP, r10 + fsmtp C2, C1 + lfpdux f14, SP, r10 + addi SP, SP, 16 + fadd C1, C2, C1 + blr + + EPILOGUE diff --git a/kernel/power/asum_ppc440.S b/kernel/power/asum_ppc440.S new file mode 100644 index 0000000..c6ad0f0 --- /dev/null +++ b/kernel/power/asum_ppc440.S @@ -0,0 +1,313 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 +#define PREX r6 + +#define ATTR r7 + +#define FZERO f0 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + fmr f1, FZERO + li PREX, 3 * 16 * SIZE + fmr f2, FZERO + sub X, X, INCX + fmr f3, FZERO + fmr f4, FZERO + fmr f5, FZERO + fmr f6, FZERO + cmpwi cr0, N, 0 + fmr f7, FZERO + ble- LL(999) + + cmpwi cr0, INCX, 0 + ble- LL(999) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + LFDUX f10, X, INCX + LFDUX f11, X, INCX + LFDUX f12, X, INCX + LFDUX f13, X, INCX + LFDUX f14, X, INCX + LFDUX f15, X, INCX + fabs f16, f8 + + LFDUX f24, X, INCX + fabs f17, f9 + LFDUX f25, X, INCX + fabs f18, f10 + LFDUX f26, X, INCX + fabs f19, f11 + LFDUX f27, X, INCX + fabs f20, f12 + LFDUX f28, X, INCX + fabs f21, f13 + LFDUX f29, X, INCX + fabs f22, f14 + LFDUX f30, X, INCX + fabs f23, f15 + LFDUX f31, X, INCX + bdz LL(120) + .align 4 + +LL(110): + LFDUX f8, X, INCX + FADD f0, f0, f16 +#ifdef PPCG4 + dcbt X, PREX +#else + nop +#endif + fabs f16, f24 + + LFDUX f9, X, INCX + FADD f1, f1, f17 + nop + fabs f17, f25 + + LFDUX f10, X, INCX + FADD f2, f2, f18 + nop + fabs f18, f26 + LFDUX f11, X, INCX + FADD f3, f3, f19 + nop + fabs f19, f27 + + LFDUX f12, X, INCX + FADD f4, f4, f20 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PREX +#else + nop +#endif + fabs f20, f28 + + LFDUX f13, X, INCX + FADD f5, f5, f21 + nop + fabs f21, f29 + + LFDUX f14, X, INCX + FADD f6, f6, f22 + nop + fabs f22, f30 + LFDUX f15, X, INCX + FADD f7, f7, f23 + nop + fabs f23, f31 + + LFDUX f24, X, INCX + FADD f0, f0, f16 +#ifdef PPCG4 + dcbt X, PREX +#else + nop +#endif + fabs f16, f8 + LFDUX f25, X, INCX + FADD f1, f1, f17 + nop + fabs f17, f9 + + LFDUX f26, X, INCX + FADD f2, f2, f18 + nop + fabs f18, f10 + LFDUX f27, X, INCX + FADD f3, f3, f19 + nop + fabs f19, f11 + + LFDUX f28, X, INCX + FADD f4, f4, f20 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PREX +#else + nop +#endif + fabs f20, f12 + + LFDUX f29, X, INCX + FADD f5, f5, f21 + nop + fabs f21, f13 + + LFDUX f30, X, INCX + FADD f6, f6, f22 + nop + fabs f22, f14 + + LFDUX f31, X, INCX + FADD f7, f7, f23 + fabs f23, f15 + bdnz LL(110) + .align 4 + +LL(120): + FADD f0, f0, f16 + fabs f16, f24 + FADD f1, f1, f17 + fabs f17, f25 + + FADD f2, f2, f18 + fabs f18, f26 + FADD f3, f3, f19 + fabs f19, f27 + + FADD f4, f4, f20 + fabs f20, f28 + FADD f5, f5, f21 + fabs f21, f29 + + FADD f6, f6, f22 + fabs f22, f30 + FADD f7, f7, f23 + fabs f23, f31 + + FADD f0, f0, f16 + FADD f1, f1, f17 + FADD f2, f2, f18 + FADD f3, f3, f19 + + FADD f4, f4, f20 + FADD f5, f5, f21 + FADD f6, f6, f22 + FADD f7, f7, f23 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fabs f8, f8 + FADD f0, f0, f8 + bdnz LL(160) + .align 4 + +LL(999): + FADD f0, f0, f1 + FADD f2, f2, f3 + FADD f4, f4, f5 + FADD f6, f6, f7 + + FADD f0, f0, f2 + FADD f4, f4, f6 + FADD f1, f0, f4 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/axpy.S b/kernel/power/axpy.S new file mode 100644 index 0000000..9f9605f --- /dev/null +++ b/kernel/power/axpy.S @@ -0,0 +1,550 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define N r3 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 +#define PREA r4 +#define YY r5 +#else +#define N r3 +#define X r7 +#define INCX r8 +#define Y r9 +#define INCY r10 +#define PREA r4 +#define YY r5 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define N r3 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r4 +#define PREA r5 +#define YY r6 +#else +#define N r3 +#define X r7 +#define INCX r8 +#define Y r9 +#define INCY r10 +#define PREA r4 +#define YY r5 +#endif +#endif + +#define ALPHA f24 + +#ifndef NEEDPARAM + +#define STACKSIZE 96 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + +#if (defined(_AIX) || defined(__APPLE__)) && !defined(__64BIT__) && defined(DOUBLE) + lwz INCY, 56 + STACKSIZE(SP) +#endif + + fmr ALPHA, f1 + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + +#ifdef L1_DUALFETCH + li PREA, (L1_PREFETCHSIZE) / 2 +#else + li PREA, (L1_PREFETCHSIZE) +#endif + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + cmpwi cr0, INCY, SIZE + bne- cr0, LL(100) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f0, 0 * SIZE(X) + LFD f1, 1 * SIZE(X) + LFD f2, 2 * SIZE(X) + LFD f3, 3 * SIZE(X) + + LFD f8, 0 * SIZE(Y) + LFD f9, 1 * SIZE(Y) + LFD f10, 2 * SIZE(Y) + LFD f11, 3 * SIZE(Y) + + LFD f4, 4 * SIZE(X) + LFD f5, 5 * SIZE(X) + LFD f6, 6 * SIZE(X) + LFD f7, 7 * SIZE(X) + + LFD f12, 4 * SIZE(Y) + LFD f13, 5 * SIZE(Y) + LFD f14, 6 * SIZE(Y) + LFD f15, 7 * SIZE(Y) + bdz LL(20) + .align 4 + +LL(10): + FMADD f16, ALPHA, f0, f8 + FMADD f17, ALPHA, f1, f9 + FMADD f18, ALPHA, f2, f10 + FMADD f19, ALPHA, f3, f11 + + LFD f0, 8 * SIZE(X) + LFD f1, 9 * SIZE(X) + LFD f2, 10 * SIZE(X) + LFD f3, 11 * SIZE(X) + + LFD f8, 8 * SIZE(Y) + LFD f9, 9 * SIZE(Y) + LFD f10, 10 * SIZE(Y) + LFD f11, 11 * SIZE(Y) + + STFD f16, 0 * SIZE(Y) + STFD f17, 1 * SIZE(Y) + STFD f18, 2 * SIZE(Y) + STFD f19, 3 * SIZE(Y) + + FMADD f20, ALPHA, f4, f12 + FMADD f21, ALPHA, f5, f13 + FMADD f22, ALPHA, f6, f14 + FMADD f23, ALPHA, f7, f15 + + LFD f4, 12 * SIZE(X) + LFD f5, 13 * SIZE(X) + LFD f6, 14 * SIZE(X) + LFD f7, 15 * SIZE(X) + + LFD f12, 12 * SIZE(Y) + LFD f13, 13 * SIZE(Y) + LFD f14, 14 * SIZE(Y) + LFD f15, 15 * SIZE(Y) + + STFD f20, 4 * SIZE(Y) + STFD f21, 5 * SIZE(Y) + STFD f22, 6 * SIZE(Y) + STFD f23, 7 * SIZE(Y) + + FMADD f16, ALPHA, f0, f8 + FMADD f17, ALPHA, f1, f9 + FMADD f18, ALPHA, f2, f10 + FMADD f19, ALPHA, f3, f11 + + LFD f0, 16 * SIZE(X) + LFD f1, 17 * SIZE(X) + LFD f2, 18 * SIZE(X) + LFD f3, 19 * SIZE(X) + + LFD f8, 16 * SIZE(Y) + LFD f9, 17 * SIZE(Y) + LFD f10, 18 * SIZE(Y) + LFD f11, 19 * SIZE(Y) + + STFD f16, 8 * SIZE(Y) + STFD f17, 9 * SIZE(Y) + STFD f18, 10 * SIZE(Y) + STFD f19, 11 * SIZE(Y) + + FMADD f20, ALPHA, f4, f12 + FMADD f21, ALPHA, f5, f13 + FMADD f22, ALPHA, f6, f14 + FMADD f23, ALPHA, f7, f15 + + LFD f4, 20 * SIZE(X) + LFD f5, 21 * SIZE(X) + LFD f6, 22 * SIZE(X) + LFD f7, 23 * SIZE(X) + + LFD f12, 20 * SIZE(Y) + LFD f13, 21 * SIZE(Y) + LFD f14, 22 * SIZE(Y) + LFD f15, 23 * SIZE(Y) + + STFD f20, 12 * SIZE(Y) + STFD f21, 13 * SIZE(Y) + STFD f22, 14 * SIZE(Y) + STFD f23, 15 * SIZE(Y) + +#ifndef POWER6 + dcbtst Y, PREA +#ifdef L1_DUALFETCH + dcbt X, PREA +#endif +#endif + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + +#ifdef POWER6 + dcbtst Y, PREA + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + FMADD f16, ALPHA, f0, f8 + FMADD f17, ALPHA, f1, f9 + FMADD f18, ALPHA, f2, f10 + FMADD f19, ALPHA, f3, f11 + + LFD f0, 8 * SIZE(X) + LFD f1, 9 * SIZE(X) + LFD f2, 10 * SIZE(X) + LFD f3, 11 * SIZE(X) + + LFD f8, 8 * SIZE(Y) + LFD f9, 9 * SIZE(Y) + LFD f10, 10 * SIZE(Y) + LFD f11, 11 * SIZE(Y) + + FMADD f20, ALPHA, f4, f12 + FMADD f21, ALPHA, f5, f13 + FMADD f22, ALPHA, f6, f14 + FMADD f23, ALPHA, f7, f15 + + LFD f4, 12 * SIZE(X) + LFD f5, 13 * SIZE(X) + LFD f6, 14 * SIZE(X) + LFD f7, 15 * SIZE(X) + + LFD f12, 12 * SIZE(Y) + LFD f13, 13 * SIZE(Y) + LFD f14, 14 * SIZE(Y) + LFD f15, 15 * SIZE(Y) + + STFD f16, 0 * SIZE(Y) + STFD f17, 1 * SIZE(Y) + STFD f18, 2 * SIZE(Y) + STFD f19, 3 * SIZE(Y) + + FMADD f16, ALPHA, f0, f8 + FMADD f17, ALPHA, f1, f9 + FMADD f18, ALPHA, f2, f10 + FMADD f19, ALPHA, f3, f11 + + STFD f20, 4 * SIZE(Y) + STFD f21, 5 * SIZE(Y) + STFD f22, 6 * SIZE(Y) + STFD f23, 7 * SIZE(Y) + + FMADD f20, ALPHA, f4, f12 + FMADD f21, ALPHA, f5, f13 + FMADD f22, ALPHA, f6, f14 + FMADD f23, ALPHA, f7, f15 + + STFD f16, 8 * SIZE(Y) + STFD f17, 9 * SIZE(Y) + STFD f18, 10 * SIZE(Y) + STFD f19, 11 * SIZE(Y) + + STFD f20, 12 * SIZE(Y) + STFD f21, 13 * SIZE(Y) + STFD f22, 14 * SIZE(Y) + STFD f23, 15 * SIZE(Y) + + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f0, 0 * SIZE(X) + LFD f8, 0 * SIZE(Y) + + FMADD f16, ALPHA, f0, f8 + + STFD f16, 0 * SIZE(Y) + addi X, X, 1 * SIZE + addi Y, Y, 1 * SIZE + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + sub Y, Y, INCY + mr YY, Y + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + .align 4 + + LFDUX f0, X, INCX + LFDUX f1, X, INCX + LFDUX f2, X, INCX + LFDUX f3, X, INCX + + LFDUX f8, Y, INCY + LFDUX f9, Y, INCY + LFDUX f10, Y, INCY + LFDUX f11, Y, INCY + + LFDUX f4, X, INCX + LFDUX f5, X, INCX + LFDUX f6, X, INCX + LFDUX f7, X, INCX + + LFDUX f12, Y, INCY + LFDUX f13, Y, INCY + LFDUX f14, Y, INCY + LFDUX f15, Y, INCY + bdz LL(120) + .align 4 + +LL(110): + FMADD f16, ALPHA, f0, f8 + FMADD f17, ALPHA, f1, f9 + FMADD f18, ALPHA, f2, f10 + FMADD f19, ALPHA, f3, f11 + + LFDUX f0, X, INCX + LFDUX f1, X, INCX + LFDUX f2, X, INCX + LFDUX f3, X, INCX + + LFDUX f8, Y, INCY + LFDUX f9, Y, INCY + LFDUX f10, Y, INCY + LFDUX f11, Y, INCY + + FMADD f20, ALPHA, f4, f12 + FMADD f21, ALPHA, f5, f13 + FMADD f22, ALPHA, f6, f14 + FMADD f23, ALPHA, f7, f15 + + LFDUX f4, X, INCX + LFDUX f5, X, INCX + LFDUX f6, X, INCX + LFDUX f7, X, INCX + + LFDUX f12, Y, INCY + LFDUX f13, Y, INCY + LFDUX f14, Y, INCY + LFDUX f15, Y, INCY + + STFDUX f16, YY, INCY + STFDUX f17, YY, INCY + STFDUX f18, YY, INCY + STFDUX f19, YY, INCY + + FMADD f16, ALPHA, f0, f8 + FMADD f17, ALPHA, f1, f9 + FMADD f18, ALPHA, f2, f10 + FMADD f19, ALPHA, f3, f11 + + LFDUX f0, X, INCX + LFDUX f1, X, INCX + LFDUX f2, X, INCX + LFDUX f3, X, INCX + + LFDUX f8, Y, INCY + LFDUX f9, Y, INCY + LFDUX f10, Y, INCY + LFDUX f11, Y, INCY + + STFDUX f20, YY, INCY + STFDUX f21, YY, INCY + STFDUX f22, YY, INCY + STFDUX f23, YY, INCY + + FMADD f20, ALPHA, f4, f12 + FMADD f21, ALPHA, f5, f13 + FMADD f22, ALPHA, f6, f14 + FMADD f23, ALPHA, f7, f15 + + LFDUX f4, X, INCX + LFDUX f5, X, INCX + LFDUX f6, X, INCX + LFDUX f7, X, INCX + + LFDUX f12, Y, INCY + LFDUX f13, Y, INCY + LFDUX f14, Y, INCY + LFDUX f15, Y, INCY + + STFDUX f16, YY, INCY + STFDUX f17, YY, INCY + STFDUX f18, YY, INCY + STFDUX f19, YY, INCY + + STFDUX f20, YY, INCY + STFDUX f21, YY, INCY + STFDUX f22, YY, INCY + STFDUX f23, YY, INCY + bdnz LL(110) + .align 4 + +LL(120): + FMADD f16, ALPHA, f0, f8 + FMADD f17, ALPHA, f1, f9 + FMADD f18, ALPHA, f2, f10 + FMADD f19, ALPHA, f3, f11 + + LFDUX f0, X, INCX + LFDUX f1, X, INCX + LFDUX f2, X, INCX + LFDUX f3, X, INCX + + LFDUX f8, Y, INCY + LFDUX f9, Y, INCY + LFDUX f10, Y, INCY + LFDUX f11, Y, INCY + + FMADD f20, ALPHA, f4, f12 + FMADD f21, ALPHA, f5, f13 + FMADD f22, ALPHA, f6, f14 + FMADD f23, ALPHA, f7, f15 + + LFDUX f4, X, INCX + LFDUX f5, X, INCX + LFDUX f6, X, INCX + LFDUX f7, X, INCX + + LFDUX f12, Y, INCY + LFDUX f13, Y, INCY + LFDUX f14, Y, INCY + LFDUX f15, Y, INCY + + STFDUX f16, YY, INCY + STFDUX f17, YY, INCY + STFDUX f18, YY, INCY + STFDUX f19, YY, INCY + + FMADD f16, ALPHA, f0, f8 + FMADD f17, ALPHA, f1, f9 + FMADD f18, ALPHA, f2, f10 + FMADD f19, ALPHA, f3, f11 + + STFDUX f20, YY, INCY + STFDUX f21, YY, INCY + STFDUX f22, YY, INCY + STFDUX f23, YY, INCY + + FMADD f20, ALPHA, f4, f12 + FMADD f21, ALPHA, f5, f13 + FMADD f22, ALPHA, f6, f14 + FMADD f23, ALPHA, f7, f15 + + STFDUX f16, YY, INCY + STFDUX f17, YY, INCY + STFDUX f18, YY, INCY + STFDUX f19, YY, INCY + + STFDUX f20, YY, INCY + STFDUX f21, YY, INCY + STFDUX f22, YY, INCY + STFDUX f23, YY, INCY + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f0, X, INCX + LFDUX f8, Y, INCY + + FMADD f16, ALPHA, f0, f8 + + STFDUX f16, YY, INCY + bdnz LL(160) + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/axpy_hummer.S b/kernel/power/axpy_hummer.S new file mode 100644 index 0000000..372a846 --- /dev/null +++ b/kernel/power/axpy_hummer.S @@ -0,0 +1,656 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 + +#define YY r4 +#define INCX2 r5 +#define INCY2 r10 + +#define ALPHA f1 + +#define A1 f0 +#define A2 f8 +#define A3 f2 +#define A4 f3 +#define A5 f4 +#define A6 f5 +#define A7 f6 +#define A8 f7 +#define A9 f25 + +#define B1 f9 +#define B2 f10 +#define B3 f11 +#define B4 f12 +#define B5 f13 +#define B6 f14 +#define B7 f15 +#define B8 f16 + +#define C1 f17 +#define C2 f18 +#define C3 f19 +#define C4 f20 +#define C5 f21 +#define C6 f22 +#define C7 f23 +#define C8 f24 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + stfpdux f24, SP, r10 + stfpdux f25, SP, r10 + + fsmfp ALPHA, ALPHA + + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + add INCX2, INCX, INCX + add INCY2, INCY, INCY + + cmpwi cr0, N, 0 + ble LL(999) + + cmpwi cr0, INCX, SIZE + bne LL(100) + cmpwi cr0, INCY, SIZE + bne LL(100) + + andi. r0, Y, 2 * SIZE - 1 + beq LL(05) + + LFD A1, 0 * SIZE(X) + LFD B1, 0 * SIZE(Y) + + addi X, X, SIZE + addi Y, Y, SIZE + + fmadd C1, ALPHA, A1, B1 + addi N, N, -1 + STFD C1, -1 * SIZE(Y) + +LL(05): + andi. r0, X, 2 * SIZE - 1 + bne LL(20) + + sub X, X, INCX2 + sub Y, Y, INCY2 + mr YY, Y + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + + LFPDUX A5, X, INCX2 + LFPDUX B5, Y, INCY2 + LFPDUX A6, X, INCX2 + LFPDUX B6, Y, INCY2 + LFPDUX A7, X, INCX2 + LFPDUX B7, Y, INCY2 + LFPDUX A8, X, INCX2 + LFPDUX B8, Y, INCY2 + bdz LL(13) + .align 4 + +LL(12): + fpmadd C1, ALPHA, A1, B1 + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + fpmadd C2, ALPHA, A2, B2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + + fpmadd C3, ALPHA, A3, B3 + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + fpmadd C4, ALPHA, A4, B4 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + + fpmadd C5, ALPHA, A5, B5 + LFPDUX A5, X, INCX2 + LFPDUX B5, Y, INCY2 + fpmadd C6, ALPHA, A6, B6 + LFPDUX A6, X, INCX2 + LFPDUX B6, Y, INCY2 + + fpmadd C7, ALPHA, A7, B7 + LFPDUX A7, X, INCX2 + LFPDUX B7, Y, INCY2 + fpmadd C8, ALPHA, A8, B8 + LFPDUX A8, X, INCX2 + LFPDUX B8, Y, INCY2 + + STFPDUX C1, YY, INCY2 + STFPDUX C2, YY, INCY2 + STFPDUX C3, YY, INCY2 + STFPDUX C4, YY, INCY2 + + STFPDUX C5, YY, INCY2 + STFPDUX C6, YY, INCY2 + STFPDUX C7, YY, INCY2 + STFPDUX C8, YY, INCY2 + bdnz LL(12) + .align 4 + +LL(13): + fpmadd C1, ALPHA, A1, B1 + fpmadd C2, ALPHA, A2, B2 + fpmadd C3, ALPHA, A3, B3 + fpmadd C4, ALPHA, A4, B4 + + fpmadd C5, ALPHA, A5, B5 + fpmadd C6, ALPHA, A6, B6 + STFPDUX C1, YY, INCY2 + fpmadd C7, ALPHA, A7, B7 + STFPDUX C2, YY, INCY2 + fpmadd C8, ALPHA, A8, B8 + STFPDUX C3, YY, INCY2 + STFPDUX C4, YY, INCY2 + + STFPDUX C5, YY, INCY2 + STFPDUX C6, YY, INCY2 + STFPDUX C7, YY, INCY2 + STFPDUX C8, YY, INCY2 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + + fpmadd C1, ALPHA, A1, B1 + fpmadd C2, ALPHA, A2, B2 + fpmadd C3, ALPHA, A3, B3 + fpmadd C4, ALPHA, A4, B4 + + STFPDUX C1, YY, INCY2 + STFPDUX C2, YY, INCY2 + STFPDUX C3, YY, INCY2 + STFPDUX C4, YY, INCY2 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + + fpmadd C1, ALPHA, A1, B1 + fpmadd C2, ALPHA, A2, B2 + + STFPDUX C1, YY, INCY2 + STFPDUX C2, YY, INCY2 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + + fpmadd C1, ALPHA, A1, B1 + + STFPDUX C1, YY, INCY2 + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + LFDUX B1, Y, INCY2 + + fmadd C1, ALPHA, A1, B1 + STFDUX C1, YY, INCY2 + b LL(999) + .align 4 + +/* X is unaliged */ + +LL(20): + LFD A1, 0 * SIZE(X) + addi X, X, SIZE + sub X, X, INCX2 + sub Y, Y, INCY2 + mr YY, Y + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(25) + + LFXDUX A2, X, INCX2 + LFPDUX B1, Y, INCY2 + LFXDUX A3, X, INCX2 + LFPDUX B2, Y, INCY2 + LFXDUX A4, X, INCX2 + LFPDUX B3, Y, INCY2 + LFXDUX A5, X, INCX2 + LFPDUX B4, Y, INCY2 + + LFXDUX A6, X, INCX2 + LFPDUX B5, Y, INCY2 + LFXDUX A7, X, INCX2 + LFPDUX B6, Y, INCY2 + fsmr A1, A2 + LFXDUX A8, X, INCX2 + fsmr A2, A3 + LFPDUX B7, Y, INCY2 + fsmr A3, A4 + LFXDUX A9, X, INCX2 + fsmr A4, A5 + LFPDUX B8, Y, INCY2 + bdz LL(23) + .align 4 + +LL(22): + fpmadd C1, ALPHA, A1, B1 + fsmr A5, A6 + LFPDUX B1, Y, INCY2 + fpmadd C2, ALPHA, A2, B2 + LFXDUX A2, X, INCX2 + fsmr A6, A7 + LFPDUX B2, Y, INCY2 + fpmadd C3, ALPHA, A3, B3 + LFXDUX A3, X, INCX2 + fsmr A7, A8 + LFPDUX B3, Y, INCY2 + fpmadd C4, ALPHA, A4, B4 + LFXDUX A4, X, INCX2 + fsmr A8, A9 + LFPDUX B4, Y, INCY2 + + fpmadd C5, ALPHA, A5, B5 + LFXDUX A5, X, INCX2 + LFPDUX B5, Y, INCY2 + fpmadd C6, ALPHA, A6, B6 + LFXDUX A6, X, INCX2 + LFPDUX B6, Y, INCY2 + + fpmadd C7, ALPHA, A7, B7 + LFXDUX A7, X, INCX2 + LFPDUX B7, Y, INCY2 + fpmadd C8, ALPHA, A8, B8 + LFXDUX A8, X, INCX2 + LFPDUX B8, Y, INCY2 + + fpmr A1, A9 + LFXDUX A9, X, INCX2 + + STFPDUX C1, YY, INCY2 + STFPDUX C2, YY, INCY2 + STFPDUX C3, YY, INCY2 + STFPDUX C4, YY, INCY2 + fsmr A1, A2 + + STFPDUX C5, YY, INCY2 + fsmr A2, A3 + STFPDUX C6, YY, INCY2 + fsmr A3, A4 + STFPDUX C7, YY, INCY2 + fsmr A4, A5 + STFPDUX C8, YY, INCY2 + bdnz LL(22) + .align 4 + +LL(23): + fpmadd C1, ALPHA, A1, B1 + fsmr A5, A6 + fpmadd C2, ALPHA, A2, B2 + fsmr A6, A7 + fpmadd C3, ALPHA, A3, B3 + fsmr A7, A8 + fpmadd C4, ALPHA, A4, B4 + fsmr A8, A9 + + fpmadd C5, ALPHA, A5, B5 + fpmadd C6, ALPHA, A6, B6 + fpmadd C7, ALPHA, A7, B7 + fpmadd C8, ALPHA, A8, B8 + fpmr A1, A9 + + STFPDUX C1, YY, INCY2 + STFPDUX C2, YY, INCY2 + STFPDUX C3, YY, INCY2 + STFPDUX C4, YY, INCY2 + + STFPDUX C5, YY, INCY2 + STFPDUX C6, YY, INCY2 + STFPDUX C7, YY, INCY2 + STFPDUX C8, YY, INCY2 + .align 4 + +LL(25): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(26) + + LFXDUX A2, X, INCX2 + LFPDUX B1, Y, INCY2 + LFXDUX A3, X, INCX2 + LFPDUX B2, Y, INCY2 + LFXDUX A4, X, INCX2 + LFPDUX B3, Y, INCY2 + LFXDUX A5, X, INCX2 + LFPDUX B4, Y, INCY2 + + fsmr A1, A2 + fsmr A2, A3 + fsmr A3, A4 + fsmr A4, A5 + + fpmadd C1, ALPHA, A1, B1 + fpmadd C2, ALPHA, A2, B2 + fpmadd C3, ALPHA, A3, B3 + fpmadd C4, ALPHA, A4, B4 + fpmr A1, A5 + + STFPDUX C1, YY, INCY2 + STFPDUX C2, YY, INCY2 + STFPDUX C3, YY, INCY2 + STFPDUX C4, YY, INCY2 + .align 4 + +LL(26): + andi. r0, N, 4 + beq LL(27) + + LFXDUX A2, X, INCX2 + LFPDUX B1, Y, INCY2 + LFXDUX A3, X, INCX2 + LFPDUX B2, Y, INCY2 + + fsmr A1, A2 + fsmr A2, A3 + fpmadd C1, ALPHA, A1, B1 + fpmadd C2, ALPHA, A2, B2 + fpmr A1, A3 + + STFPDUX C1, YY, INCY2 + STFPDUX C2, YY, INCY2 + .align 4 + +LL(27): + andi. r0, N, 2 + beq LL(28) + + LFXDUX A2, X, INCX2 + LFPDUX B1, Y, INCY2 + + fsmr A1, A2 + fpmadd C1, ALPHA, A1, B1 + fpmr A1, A2 + + STFPDUX C1, YY, INCY2 + .align 4 + +LL(28): + andi. r0, N, 1 + beq LL(999) + + LFDUX B1, Y, INCY2 + + fmadd C1, ALPHA, A1, B1 + STFDUX C1, YY, INCY2 + b LL(999) + .align 4 +#### + + +LL(100): + sub X, X, INCX + sub Y, Y, INCY + mr YY, Y + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(115) + + LFDUX A1, X, INCX + LFDUX B1, Y, INCY + LFDUX A2, X, INCX + LFDUX B2, Y, INCY + + LFDUX A3, X, INCX + LFDUX B3, Y, INCY + LFDUX A4, X, INCX + LFDUX B4, Y, INCY + + LFDUX A5, X, INCX + LFDUX B5, Y, INCY + LFDUX A6, X, INCX + LFDUX B6, Y, INCY + + LFDUX A7, X, INCX + LFDUX B7, Y, INCY + LFDUX A8, X, INCX + LFDUX B8, Y, INCY + bdz LL(113) + .align 4 + +LL(112): + fmadd C1, ALPHA, A1, B1 + LFDUX A1, X, INCX + LFDUX B1, Y, INCY + + fmadd C2, ALPHA, A2, B2 + LFDUX A2, X, INCX + LFDUX B2, Y, INCY + + fmadd C3, ALPHA, A3, B3 + LFDUX A3, X, INCX + LFDUX B3, Y, INCY + + fmadd C4, ALPHA, A4, B4 + LFDUX A4, X, INCX + LFDUX B4, Y, INCY + + fmadd C5, ALPHA, A5, B5 + LFDUX A5, X, INCX + LFDUX B5, Y, INCY + fmadd C6, ALPHA, A6, B6 + LFDUX A6, X, INCX + LFDUX B6, Y, INCY + fmadd C7, ALPHA, A7, B7 + LFDUX A7, X, INCX + LFDUX B7, Y, INCY + fmadd C8, ALPHA, A8, B8 + LFDUX A8, X, INCX + LFDUX B8, Y, INCY + + STFDUX C1, YY, INCY + STFDUX C2, YY, INCY + STFDUX C3, YY, INCY + STFDUX C4, YY, INCY + + STFDUX C5, YY, INCY + STFDUX C6, YY, INCY + STFDUX C7, YY, INCY + STFDUX C8, YY, INCY + bdnz LL(112) + .align 4 + +LL(113): + fmadd C1, ALPHA, A1, B1 + fmadd C2, ALPHA, A2, B2 + fmadd C3, ALPHA, A3, B3 + fmadd C4, ALPHA, A4, B4 + + fmadd C5, ALPHA, A5, B5 + fmadd C6, ALPHA, A6, B6 + STFDUX C1, YY, INCY + fmadd C7, ALPHA, A7, B7 + STFDUX C2, YY, INCY + fmadd C8, ALPHA, A8, B8 + STFDUX C3, YY, INCY + + STFDUX C4, YY, INCY + STFDUX C5, YY, INCY + STFDUX C6, YY, INCY + STFDUX C7, YY, INCY + STFDUX C8, YY, INCY + .align 4 + +LL(115): + andi. r0, N, 7 + beq LL(999) + andi. r0, N, 4 + beq LL(117) + + LFDUX A1, X, INCX + LFDUX B1, Y, INCY + LFDUX A2, X, INCX + LFDUX B2, Y, INCY + + LFDUX A3, X, INCX + LFDUX B3, Y, INCY + LFDUX A4, X, INCX + LFDUX B4, Y, INCY + + fmadd C1, ALPHA, A1, B1 + fmadd C2, ALPHA, A2, B2 + fmadd C3, ALPHA, A3, B3 + fmadd C4, ALPHA, A4, B4 + + STFDUX C1, YY, INCY + STFDUX C2, YY, INCY + STFDUX C3, YY, INCY + STFDUX C4, YY, INCY + .align 4 + +LL(117): + andi. r0, N, 2 + beq LL(118) + + LFDUX A1, X, INCX + LFDUX B1, Y, INCY + LFDUX A2, X, INCX + LFDUX B2, Y, INCY + + fmadd C1, ALPHA, A1, B1 + fmadd C2, ALPHA, A2, B2 + + STFDUX C1, YY, INCY + STFDUX C2, YY, INCY + .align 4 + +LL(118): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX + LFDUX B1, Y, INCY + + fmadd C1, ALPHA, A1, B1 + STFDUX C1, YY, INCY + .align 4 + +LL(999): + li r10, 16 + subi SP, SP, 16 + + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/axpy_ppc440.S b/kernel/power/axpy_ppc440.S new file mode 100644 index 0000000..cc2605c --- /dev/null +++ b/kernel/power/axpy_ppc440.S @@ -0,0 +1,337 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define N r3 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 +#define YY r5 +#define PRE r4 +#else +#define N r3 +#define X r7 +#define INCX r8 +#define Y r9 +#define INCY r10 +#define YY r5 +#define PRE r4 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define N r3 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r4 +#define YY r6 +#define PRE r5 +#else +#define N r3 +#define X r7 +#define INCX r8 +#define Y r9 +#define INCY r10 +#define YY r5 +#define PRE r4 +#endif +#endif + +#define ALPHA f24 + +#define STACKSIZE 96 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + +#if (defined(_AIX) || defined(__APPLE__)) && !defined(__64BIT__) && defined(DOUBLE) + lwz INCY, 56 + STACKSIZE(SP) +#endif + + fmr ALPHA, f1 + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + li PRE, 2 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(999) + + sub X, X, INCX + sub Y, Y, INCY + mr YY, Y + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + .align 4 + + LFDUX f0, X, INCX + LFDUX f1, X, INCX + LFDUX f2, X, INCX + LFDUX f3, X, INCX + + LFDUX f8, Y, INCY + LFDUX f9, Y, INCY + LFDUX f10, Y, INCY + LFDUX f11, Y, INCY + + LFDUX f4, X, INCX + LFDUX f5, X, INCX + LFDUX f6, X, INCX + LFDUX f7, X, INCX + + LFDUX f12, Y, INCY + LFDUX f13, Y, INCY + LFDUX f14, Y, INCY + LFDUX f15, Y, INCY + bdz LL(120) + .align 4 + +LL(110): + FMADD f16, ALPHA, f0, f8 + LFDUX f0, X, INCX + LFDUX f8, Y, INCY +#ifdef PPCG4 + dcbt X, PRE +#endif + FMADD f17, ALPHA, f1, f9 + LFDUX f1, X, INCX + LFDUX f9, Y, INCY + FMADD f18, ALPHA, f2, f10 + LFDUX f2, X, INCX + LFDUX f10, Y, INCY +#ifdef PPCG4 + dcbtst Y, PRE +#endif + FMADD f19, ALPHA, f3, f11 + LFDUX f3, X, INCX + LFDUX f11, Y, INCY + + FMADD f20, ALPHA, f4, f12 + LFDUX f4, X, INCX + LFDUX f12, Y, INCY +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + FMADD f21, ALPHA, f5, f13 + LFDUX f5, X, INCX + LFDUX f13, Y, INCY + FMADD f22, ALPHA, f6, f14 + LFDUX f6, X, INCX + LFDUX f14, Y, INCY +#if defined(PPCG4) && defined(DOUBLE) + dcbtst Y, PRE +#endif + FMADD f23, ALPHA, f7, f15 + LFDUX f7, X, INCX + LFDUX f15, Y, INCY + + STFDUX f16, YY, INCY + STFDUX f17, YY, INCY + STFDUX f18, YY, INCY + STFDUX f19, YY, INCY + + FMADD f16, ALPHA, f0, f8 + LFDUX f0, X, INCX + LFDUX f8, Y, INCY +#ifdef PPCG4 + dcbt X, PRE +#endif + FMADD f17, ALPHA, f1, f9 + LFDUX f1, X, INCX + LFDUX f9, Y, INCY + FMADD f18, ALPHA, f2, f10 + LFDUX f2, X, INCX + LFDUX f10, Y, INCY +#ifdef PPCG4 + dcbtst Y, PRE +#endif + FMADD f19, ALPHA, f3, f11 + LFDUX f3, X, INCX + LFDUX f11, Y, INCY + + STFDUX f20, YY, INCY + STFDUX f21, YY, INCY + STFDUX f22, YY, INCY + STFDUX f23, YY, INCY + + FMADD f20, ALPHA, f4, f12 + LFDUX f4, X, INCX + LFDUX f12, Y, INCY +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + FMADD f21, ALPHA, f5, f13 + LFDUX f5, X, INCX + LFDUX f13, Y, INCY + FMADD f22, ALPHA, f6, f14 + LFDUX f6, X, INCX + LFDUX f14, Y, INCY +#if defined(PPCG4) && defined(DOUBLE) + dcbtst Y, PRE +#endif + FMADD f23, ALPHA, f7, f15 + LFDUX f7, X, INCX + LFDUX f15, Y, INCY + + STFDUX f16, YY, INCY + STFDUX f17, YY, INCY + STFDUX f18, YY, INCY + STFDUX f19, YY, INCY + + STFDUX f20, YY, INCY + STFDUX f21, YY, INCY + STFDUX f22, YY, INCY + STFDUX f23, YY, INCY + bdnz LL(110) + .align 4 + +LL(120): + FMADD f16, ALPHA, f0, f8 + LFDUX f0, X, INCX + LFDUX f8, Y, INCY + FMADD f17, ALPHA, f1, f9 + LFDUX f1, X, INCX + LFDUX f9, Y, INCY + FMADD f18, ALPHA, f2, f10 + LFDUX f2, X, INCX + LFDUX f10, Y, INCY + FMADD f19, ALPHA, f3, f11 + LFDUX f3, X, INCX + LFDUX f11, Y, INCY + + FMADD f20, ALPHA, f4, f12 + LFDUX f4, X, INCX + LFDUX f12, Y, INCY + FMADD f21, ALPHA, f5, f13 + LFDUX f5, X, INCX + LFDUX f13, Y, INCY + FMADD f22, ALPHA, f6, f14 + LFDUX f6, X, INCX + LFDUX f14, Y, INCY + FMADD f23, ALPHA, f7, f15 + LFDUX f7, X, INCX + LFDUX f15, Y, INCY + + STFDUX f16, YY, INCY + STFDUX f17, YY, INCY + STFDUX f18, YY, INCY + STFDUX f19, YY, INCY + + FMADD f16, ALPHA, f0, f8 + STFDUX f20, YY, INCY + FMADD f17, ALPHA, f1, f9 + STFDUX f21, YY, INCY + FMADD f18, ALPHA, f2, f10 + STFDUX f22, YY, INCY + FMADD f19, ALPHA, f3, f11 + STFDUX f23, YY, INCY + + FMADD f20, ALPHA, f4, f12 + STFDUX f16, YY, INCY + FMADD f21, ALPHA, f5, f13 + STFDUX f17, YY, INCY + FMADD f22, ALPHA, f6, f14 + STFDUX f18, YY, INCY + FMADD f23, ALPHA, f7, f15 + STFDUX f19, YY, INCY + + STFDUX f20, YY, INCY + STFDUX f21, YY, INCY + STFDUX f22, YY, INCY + STFDUX f23, YY, INCY + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f0, X, INCX + LFDUX f8, Y, INCY + + FMADD f16, ALPHA, f0, f8 + + STFDUX f16, YY, INCY + bdnz LL(160) + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/cabs.S b/kernel/power/cabs.S new file mode 100644 index 0000000..28ae703 --- /dev/null +++ b/kernel/power/cabs.S @@ -0,0 +1,54 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + PROFCODE + + LFD f0, 0 * SIZE(r3) + LFD f1, 1 * SIZE(r3) + fabs f0, f0 + fabs f1, f1 + fadd f1, f0, f1 + blr + + EPILOGUE + + diff --git a/kernel/power/cnrm2.S b/kernel/power/cnrm2.S new file mode 100644 index 0000000..930ea29 --- /dev/null +++ b/kernel/power/cnrm2.S @@ -0,0 +1,418 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 +#define INCXM1 r9 + +#define FZERO 144(SP) +#define FONE 148(SP) + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r10, 0 + lis r11, 0x3f80 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r10, FZERO + stw r11, FONE + + lfs f1, FZERO + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + subi INCXM1, INCX, SIZE + + li PREA, 4 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + fmr f6, f1 + fmr f7, f1 + fmr f8, f1 + fmr f9, f1 + fmr f10, f1 + fmr f11, f1 + fmr f12, f1 + fmr f13, f1 + fmr f14, f1 + fmr f15, f1 + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(1000) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(150) + + LFD f16, 0 * SIZE(X) + LFD f17, 1 * SIZE(X) + LFD f18, 2 * SIZE(X) + LFD f19, 3 * SIZE(X) + LFD f20, 4 * SIZE(X) + LFD f21, 5 * SIZE(X) + LFD f22, 6 * SIZE(X) + LFD f23, 7 * SIZE(X) + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + + bdz LL(120) + .align 4 + +LL(110): + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + + LFD f16, 16 * SIZE(X) + LFD f17, 17 * SIZE(X) + LFD f18, 18 * SIZE(X) + LFD f19, 19 * SIZE(X) + + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + + LFD f20, 20 * SIZE(X) + LFD f21, 21 * SIZE(X) + LFD f22, 22 * SIZE(X) + LFD f23, 23 * SIZE(X) + + fmadd f8, f24, f24, f8 + fmadd f9, f25, f25, f9 + fmadd f10, f26, f26, f10 + fmadd f11, f27, f27, f11 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fmadd f12, f28, f28, f12 + fmadd f13, f29, f29, f13 + fmadd f14, f30, f30, f14 + fmadd f15, f31, f31, f15 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(110) + .align 4 + +LL(120): + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + fmadd f8, f24, f24, f8 + fmadd f9, f25, f25, f9 + fmadd f10, f26, f26, f10 + fmadd f11, f27, f27, f11 + fmadd f12, f28, f28, f12 + fmadd f13, f29, f29, f13 + fmadd f14, f30, f30, f14 + fmadd f15, f31, f31, f15 + addi X, X, 16 * SIZE + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq- cr0, LL(170) + .align 4 + +LL(160): + LFD f16, 0 * SIZE(X) + LFD f17, 1 * SIZE(X) + addi X, X, 2 * SIZE + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + bdnz LL(160) + .align 4 + +LL(170): + fadd f0, f0, f1 + fadd f2, f2, f3 + fadd f4, f4, f5 + fadd f6, f6, f7 + + fadd f8, f8, f9 + fadd f10, f10, f11 + fadd f12, f12, f13 + fadd f14, f14, f15 + + fadd f0, f0, f2 + fadd f4, f4, f6 + fadd f8, f8, f10 + fadd f12, f12, f14 + + fadd f0, f0, f4 + fadd f8, f8, f12 + + fadd f0, f0, f8 + + fsqrt f1, f0 + b LL(9999) + .align 4 + +LL(1000): + sub X, X, INCXM1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(1150) + + LFDX f16, X, INCXM1 + LFDUX f17, X, INCX + LFDX f18, X, INCXM1 + LFDUX f19, X, INCX + LFDX f20, X, INCXM1 + LFDUX f21, X, INCX + LFDX f22, X, INCXM1 + LFDUX f23, X, INCX + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + bdz LL(1120) + .align 4 + +LL(1110): + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + + LFDX f16, X, INCXM1 + LFDUX f17, X, INCX + LFDX f18, X, INCXM1 + LFDUX f19, X, INCX + + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + + LFDX f20, X, INCXM1 + LFDUX f21, X, INCX + LFDX f22, X, INCXM1 + LFDUX f23, X, INCX + + fmadd f8, f24, f24, f8 + fmadd f9, f25, f25, f9 + fmadd f10, f26, f26, f10 + fmadd f11, f27, f27, f11 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fmadd f12, f28, f28, f12 + fmadd f13, f29, f29, f13 + fmadd f14, f30, f30, f14 + fmadd f15, f31, f31, f15 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + bdnz LL(1110) + .align 4 + +LL(1120): + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + + fmadd f8, f24, f24, f8 + fmadd f9, f25, f25, f9 + fmadd f10, f26, f26, f10 + fmadd f11, f27, f27, f11 + + fmadd f12, f28, f28, f12 + fmadd f13, f29, f29, f13 + fmadd f14, f30, f30, f14 + fmadd f15, f31, f31, f15 + .align 4 + +LL(1150): + andi. r0, N, 7 + mtspr CTR, r0 + beq- cr0, LL(1170) + .align 4 + +LL(1160): + LFDX f16, X, INCXM1 + LFDUX f17, X, INCX + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + bdnz LL(1160) + .align 4 + +LL(1170): + fadd f0, f0, f1 + fadd f2, f2, f3 + fadd f4, f4, f5 + fadd f6, f6, f7 + + fadd f8, f8, f9 + fadd f10, f10, f11 + fadd f12, f12, f13 + fadd f14, f14, f15 + + fadd f0, f0, f2 + fadd f4, f4, f6 + fadd f8, f8, f10 + fadd f12, f12, f14 + + fadd f0, f0, f4 + fadd f8, f8, f12 + + fadd f0, f0, f8 + + fsqrt f1, f0 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/cnrm2_hummer.S b/kernel/power/cnrm2_hummer.S new file mode 100644 index 0000000..e6b022f --- /dev/null +++ b/kernel/power/cnrm2_hummer.S @@ -0,0 +1,812 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 +#define C5 f4 +#define C6 f5 +#define C7 f6 +#define C8 f7 + +#define A1 f8 +#define A2 f9 +#define A3 f10 +#define A4 f11 +#define A5 f12 +#define A6 f13 +#define A7 f14 +#define A8 f15 + +#define A9 f16 +#define A10 f17 +#define A11 f18 +#define A12 f19 +#define A13 f20 +#define A14 f21 +#define A15 f22 +#define A16 f23 + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + li r10, 0 + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + lfpdx C1, SP, r10 # Zero clear + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + fpmr C2, C1 + fpmr C3, C1 + fpmr C4, C1 + + fpmr C5, C1 + fpmr C6, C1 + fpmr C7, C1 + fpmr C8, C1 + + cmpwi cr0, N, 0 + ble LL(99) + cmpwi cr0, INCX, 0 + ble LL(99) + + andi. r0, X, 2 * SIZE - 1 + bne LL(100) + + srawi. r0, N, 4 + sub X, X, INCX2 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + + LFPDUX A9, X, INCX2 + LFPDUX A10, X, INCX2 + LFPDUX A11, X, INCX2 + LFPDUX A12, X, INCX2 + LFPDUX A13, X, INCX2 + LFPDUX A14, X, INCX2 + LFPDUX A15, X, INCX2 + LFPDUX A16, X, INCX2 + bdz LL(13) + .align 4 + +LL(12): + fpmadd C1, A1, A1, C1 + LFPDUX A1, X, INCX2 + fpmadd C2, A2, A2, C2 + LFPDUX A2, X, INCX2 + fpmadd C3, A3, A3, C3 + LFPDUX A3, X, INCX2 + fpmadd C4, A4, A4, C4 + LFPDUX A4, X, INCX2 + + fpmadd C5, A5, A5, C5 + LFPDUX A5, X, INCX2 + fpmadd C6, A6, A6, C6 + LFPDUX A6, X, INCX2 + fpmadd C7, A7, A7, C7 + LFPDUX A7, X, INCX2 + fpmadd C8, A8, A8, C8 + LFPDUX A8, X, INCX2 + + fpmadd C1, A9, A9, C1 + LFPDUX A9, X, INCX2 + fpmadd C2, A10, A10, C2 + LFPDUX A10, X, INCX2 + fpmadd C3, A11, A11, C3 + LFPDUX A11, X, INCX2 + fpmadd C4, A12, A12, C4 + LFPDUX A12, X, INCX2 + + fpmadd C5, A13, A13, C5 + LFPDUX A13, X, INCX2 + fpmadd C6, A14, A14, C6 + LFPDUX A14, X, INCX2 + fpmadd C7, A15, A15, C7 + LFPDUX A15, X, INCX2 + fpmadd C8, A16, A16, C8 + LFPDUX A16, X, INCX2 + + bdnz LL(12) + .align 4 + +LL(13): + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + fpmadd C3, A3, A3, C3 + fpmadd C4, A4, A4, C4 + + fpmadd C5, A5, A5, C5 + fpmadd C6, A6, A6, C6 + fpmadd C7, A7, A7, C7 + fpmadd C8, A8, A8, C8 + + fpmadd C1, A9, A9, C1 + fpmadd C2, A10, A10, C2 + fpmadd C3, A11, A11, C3 + fpmadd C4, A12, A12, C4 + + fpmadd C5, A13, A13, C5 + fpmadd C6, A14, A14, C6 + fpmadd C7, A15, A15, C7 + fpmadd C8, A16, A16, C8 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(98) + + andi. r0, N, 8 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + fpmadd C3, A3, A3, C3 + fpmadd C4, A4, A4, C4 + + fpmadd C5, A5, A5, C5 + fpmadd C6, A6, A6, C6 + fpmadd C7, A7, A7, C7 + fpmadd C8, A8, A8, C8 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + fpmadd C3, A3, A3, C3 + fpmadd C4, A4, A4, C4 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(98) + + LFPDUX A1, X, INCX2 + fpmadd C3, A1, A1, C3 + .align 4 + +LL(98): + fpadd C1, C1, C5 + lis r3, 0x3f00 + fpadd C2, C2, C6 + lis r4, 0x4040 + fpadd C3, C3, C7 + stw r3, 4(SP) + fpadd C4, C4, C8 + stw r4, 8(SP) + + fpadd C1, C1, C2 + lfs f10, 0(SP) + fpadd C3, C3, C4 + lfs f11, 4(SP) + + fpadd C1, C1, C3 + lfs f12, 8(SP) + + fsmtp C2, C1 + fadd C1, C2, C1 + + fcmpu cr0, f10, C1 + beq cr0, LL(99) + +#ifndef HUMMER_EMULATOR + frsqrte f9, f1 + li r10, 16 + + fmul f2, f1, f9 + lfpdux f23, SP, r10 + fmul f3, f9, f11 + lfpdux f22, SP, r10 + fnmsub f4, f2, f9, f12 + lfpdux f21, SP, r10 + fmul f9, f3, f4 + lfpdux f20, SP, r10 + fadd f13, f11, f11 + lfpdux f19, SP, r10 + fmul f12, f1, f9 + lfpdux f18, SP, r10 + fmul f11, f12, f11 + lfpdux f17, SP, r10 + fnmsub f1, f12, f9, f13 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + fmadd f1, f11, f1, f12 + blr +#else + fsqrt f1, f1 + + li r10, 16 + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr +#endif + .align 4 + +LL(99): + li r10, 16 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + .align 4 + +LL(100): + cmpwi cr0, INCX, SIZE + bne LL(200) + + LFD C1, 0(X) + addi X, X, 1 * SIZE + addi N, N, -1 + cmpwi cr0, N, 0 + fmul C1, C1, C1 + sub X, X, INCX2 + ble LL(198) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(115) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + + LFPDUX A9, X, INCX2 + LFPDUX A10, X, INCX2 + LFPDUX A11, X, INCX2 + LFPDUX A12, X, INCX2 + LFPDUX A13, X, INCX2 + LFPDUX A14, X, INCX2 + LFPDUX A15, X, INCX2 + LFPDUX A16, X, INCX2 + bdz LL(113) + .align 4 + +LL(112): + fpmadd C1, A1, A1, C1 + LFPDUX A1, X, INCX2 + fpmadd C2, A2, A2, C2 + LFPDUX A2, X, INCX2 + fpmadd C3, A3, A3, C3 + LFPDUX A3, X, INCX2 + fpmadd C4, A4, A4, C4 + LFPDUX A4, X, INCX2 + + fpmadd C5, A5, A5, C5 + LFPDUX A5, X, INCX2 + fpmadd C6, A6, A6, C6 + LFPDUX A6, X, INCX2 + fpmadd C7, A7, A7, C7 + LFPDUX A7, X, INCX2 + fpmadd C8, A8, A8, C8 + LFPDUX A8, X, INCX2 + + fpmadd C1, A9, A9, C1 + LFPDUX A9, X, INCX2 + fpmadd C2, A10, A10, C2 + LFPDUX A10, X, INCX2 + fpmadd C3, A11, A11, C3 + LFPDUX A11, X, INCX2 + fpmadd C4, A12, A12, C4 + LFPDUX A12, X, INCX2 + + fpmadd C5, A13, A13, C5 + LFPDUX A13, X, INCX2 + fpmadd C6, A14, A14, C6 + LFPDUX A14, X, INCX2 + fpmadd C7, A15, A15, C7 + LFPDUX A15, X, INCX2 + fpmadd C8, A16, A16, C8 + LFPDUX A16, X, INCX2 + + bdnz LL(112) + .align 4 + +LL(113): + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + fpmadd C3, A3, A3, C3 + fpmadd C4, A4, A4, C4 + + fpmadd C5, A5, A5, C5 + fpmadd C6, A6, A6, C6 + fpmadd C7, A7, A7, C7 + fpmadd C8, A8, A8, C8 + + fpmadd C1, A9, A9, C1 + fpmadd C2, A10, A10, C2 + fpmadd C3, A11, A11, C3 + fpmadd C4, A12, A12, C4 + + fpmadd C5, A13, A13, C5 + fpmadd C6, A14, A14, C6 + fpmadd C7, A15, A15, C7 + fpmadd C8, A16, A16, C8 + .align 4 + +LL(115): + andi. r0, N, 15 + beq LL(198) + + andi. r0, N, 8 + beq LL(116) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + fpmadd C3, A3, A3, C3 + fpmadd C4, A4, A4, C4 + + fpmadd C5, A5, A5, C5 + fpmadd C6, A6, A6, C6 + fpmadd C7, A7, A7, C7 + fpmadd C8, A8, A8, C8 + .align 4 + +LL(116): + andi. r0, N, 4 + beq LL(117) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + fpmadd C3, A3, A3, C3 + fpmadd C4, A4, A4, C4 + .align 4 + +LL(117): + andi. r0, N, 2 + beq LL(118) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + .align 4 + +LL(118): + andi. r0, N, 1 + beq LL(198) + + LFPDUX A1, X, INCX2 + fpmadd C3, A1, A1, C3 + .align 4 + +LL(198): + LFDX A1, X, INCX2 + fmadd C4, A1, A1, C4 + + fpadd C1, C1, C5 + lis r3, 0x3f00 + fpadd C2, C2, C6 + lis r4, 0x4040 + fpadd C3, C3, C7 + stw r3, 4(SP) + fpadd C4, C4, C8 + stw r4, 8(SP) + + fpadd C1, C1, C2 + lfs f10, 0(SP) + fpadd C3, C3, C4 + lfs f11, 4(SP) + + fpadd C1, C1, C3 + lfs f12, 8(SP) + + fsmtp C2, C1 + fadd C1, C2, C1 + + fcmpu cr0, f10, C1 + beq cr0, LL(199) + +#ifndef HUMMER_EMULATOR + frsqrte f9, f1 + li r10, 16 + + fmul f2, f1, f9 + lfpdux f23, SP, r10 + fmul f3, f9, f11 + lfpdux f22, SP, r10 + fnmsub f4, f2, f9, f12 + lfpdux f21, SP, r10 + fmul f9, f3, f4 + lfpdux f20, SP, r10 + fadd f13, f11, f11 + lfpdux f19, SP, r10 + fmul f12, f1, f9 + lfpdux f18, SP, r10 + fmul f11, f12, f11 + lfpdux f17, SP, r10 + fnmsub f1, f12, f9, f13 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + fmadd f1, f11, f1, f12 + blr +#else + fsqrt f1, f1 + + li r10, 16 + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr +#endif + .align 4 + +LL(199): + li r10, 16 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + .align 4 + +LL(200): + sub X, X, INCX2 + addi X2, X, SIZE + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(215) + + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + + LFDUX A5, X, INCX2 + LFDUX A6, X2, INCX2 + LFDUX A7, X, INCX2 + LFDUX A8, X2, INCX2 + + LFDUX A9, X, INCX2 + LFDUX A10, X2, INCX2 + LFDUX A11, X, INCX2 + LFDUX A12, X2, INCX2 + + LFDUX A13, X, INCX2 + LFDUX A14, X2, INCX2 + LFDUX A15, X, INCX2 + LFDUX A16, X2, INCX2 + bdz LL(213) + .align 4 + +LL(212): + fmadd C1, A1, A1, C1 + LFDUX A1, X, INCX2 + fmadd C2, A2, A2, C2 + LFDUX A2, X2, INCX2 + fmadd C3, A3, A3, C3 + LFDUX A3, X, INCX2 + fmadd C4, A4, A4, C4 + LFDUX A4, X2, INCX2 + + fmadd C5, A5, A5, C5 + LFDUX A5, X, INCX2 + fmadd C6, A6, A6, C6 + LFDUX A6, X2, INCX2 + fmadd C7, A7, A7, C7 + LFDUX A7, X, INCX2 + fmadd C8, A8, A8, C8 + LFDUX A8, X2, INCX2 + + fmadd C1, A9, A9, C1 + LFDUX A9, X, INCX2 + fmadd C2, A10, A10, C2 + LFDUX A10, X2, INCX2 + fmadd C3, A11, A11, C3 + LFDUX A11, X, INCX2 + fmadd C4, A12, A12, C4 + LFDUX A12, X2, INCX2 + + fmadd C5, A13, A13, C5 + LFDUX A13, X, INCX2 + fmadd C6, A14, A14, C6 + LFDUX A14, X2, INCX2 + fmadd C7, A15, A15, C7 + LFDUX A15, X, INCX2 + fmadd C8, A16, A16, C8 + LFDUX A16, X2, INCX2 + + bdnz LL(212) + .align 4 + +LL(213): + fmadd C1, A1, A1, C1 + fmadd C2, A2, A2, C2 + fmadd C3, A3, A3, C3 + fmadd C4, A4, A4, C4 + + fmadd C5, A5, A5, C5 + fmadd C6, A6, A6, C6 + fmadd C7, A7, A7, C7 + fmadd C8, A8, A8, C8 + + fmadd C1, A9, A9, C1 + fmadd C2, A10, A10, C2 + fmadd C3, A11, A11, C3 + fmadd C4, A12, A12, C4 + + fmadd C5, A13, A13, C5 + fmadd C6, A14, A14, C6 + fmadd C7, A15, A15, C7 + fmadd C8, A16, A16, C8 + .align 4 + +LL(215): + andi. r0, N, 7 + beq LL(998) + andi. r0, N, 4 + beq LL(216) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + + LFDUX A5, X, INCX2 + LFDUX A6, X2, INCX2 + LFDUX A7, X, INCX2 + LFDUX A8, X2, INCX2 + + fmadd C1, A1, A1, C1 + fmadd C2, A2, A2, C2 + fmadd C3, A3, A3, C3 + fmadd C4, A4, A4, C4 + + fmadd C5, A5, A5, C5 + fmadd C6, A6, A6, C6 + fmadd C7, A7, A7, C7 + fmadd C8, A8, A8, C8 + .align 4 + +LL(216): + andi. r0, N, 2 + beq LL(217) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + + fmadd C1, A1, A1, C1 + fmadd C2, A2, A2, C2 + fmadd C3, A3, A3, C3 + fmadd C4, A4, A4, C4 + .align 4 + +LL(217): + andi. r0, N, 1 + beq LL(998) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + + fmadd C1, A1, A1, C1 + fmadd C2, A2, A2, C2 + .align 4 + +LL(998): + fadd C1, C1, C5 + lis r3, 0x3f00 + fadd C2, C2, C6 + lis r4, 0x4040 + fadd C3, C3, C7 + stw r3, 4(SP) + fadd C4, C4, C8 + stw r4, 8(SP) + + fadd C1, C1, C2 + lfs f10, 0(SP) + fadd C3, C3, C4 + lfs f11, 4(SP) + fadd C1, C1, C3 + lfs f12, 8(SP) + + fcmpu cr0, f10, C1 + beq cr0, LL(99) + + frsqrte f9, f1 + li r10, 16 + + fmul f2, f1, f9 + lfpdux f23, SP, r10 + fmul f3, f9, f11 + lfpdux f22, SP, r10 + fnmsub f4, f2, f9, f12 + lfpdux f21, SP, r10 + fmul f9, f3, f4 + lfpdux f20, SP, r10 + fadd f13, f11, f11 + lfpdux f19, SP, r10 + fmul f12, f1, f9 + lfpdux f18, SP, r10 + fmul f11, f12, f11 + lfpdux f17, SP, r10 + fnmsub f1, f12, f9, f13 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + fmadd f1, f11, f1, f12 + blr + +LL(999): + li r10, 16 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/cnrm2_ppc440.S b/kernel/power/cnrm2_ppc440.S new file mode 100644 index 0000000..5ead681 --- /dev/null +++ b/kernel/power/cnrm2_ppc440.S @@ -0,0 +1,301 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PRE r8 +#define INC1 r9 + +#define FZERO 144(SP) +#define FONE 148(SP) +#define C1 152(SP) +#define C2 156(SP) + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r10, 0 + lis r11, 0x3f80 + lis r6, 0x3f00 + lis r7, 0x4040 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r10, FZERO + stw r11, FONE + stw r6, C1 + stw r7, C2 + + lfs f1, FZERO + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + li INC1, SIZE + li PRE, 3 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(999) + cmpwi cr0, INCX, 0 + ble- LL(999) + + fmr f0, f1 + sub X, X, INCX + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + fmr f6, f1 + fmr f7, f1 + fmr f8, f1 + fmr f9, f1 + fmr f10, f1 + fmr f11, f1 + fmr f12, f1 + fmr f13, f1 + fmr f14, f1 + fmr f15, f1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(1150) + + LFDUX f16, X, INCX + LFDX f17, X, INC1 + LFDUX f18, X, INCX + LFDX f19, X, INC1 + LFDUX f20, X, INCX + LFDX f21, X, INC1 + LFDUX f22, X, INCX + LFDX f23, X, INC1 + + LFDUX f24, X, INCX + LFDX f25, X, INC1 + LFDUX f26, X, INCX + LFDX f27, X, INC1 + LFDUX f28, X, INCX + LFDX f29, X, INC1 + LFDUX f30, X, INCX + LFDX f31, X, INC1 + bdz LL(1120) + .align 4 + +LL(1110): + fmadd f0, f16, f16, f0 + LFDUX f16, X, INCX + fmadd f1, f17, f17, f1 + LFDX f17, X, INC1 + fmadd f2, f18, f18, f2 + LFDUX f18, X, INCX + fmadd f3, f19, f19, f3 + LFDX f19, X, INC1 + +#ifdef PPCG4 + dcbt X, PRE +#endif + + fmadd f4, f20, f20, f4 + LFDUX f20, X, INCX + fmadd f5, f21, f21, f5 + LFDX f21, X, INC1 + fmadd f6, f22, f22, f6 + LFDUX f22, X, INCX + fmadd f7, f23, f23, f7 + LFDX f23, X, INC1 + + fmadd f8, f24, f24, f8 + LFDUX f24, X, INCX + fmadd f9, f25, f25, f9 + LFDX f25, X, INC1 + fmadd f10, f26, f26, f10 + LFDUX f26, X, INCX + fmadd f11, f27, f27, f11 + LFDX f27, X, INC1 + +#ifdef PPCG4 + dcbt X, PRE +#endif + + fmadd f12, f28, f28, f12 + LFDUX f28, X, INCX + fmadd f13, f29, f29, f13 + LFDX f29, X, INC1 + fmadd f14, f30, f30, f14 + LFDUX f30, X, INCX + fmadd f15, f31, f31, f15 + LFDX f31, X, INC1 + bdnz LL(1110) + .align 4 + +LL(1120): + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + + fmadd f8, f24, f24, f8 + fmadd f9, f25, f25, f9 + fmadd f10, f26, f26, f10 + fmadd f11, f27, f27, f11 + + fmadd f12, f28, f28, f12 + fmadd f13, f29, f29, f13 + fmadd f14, f30, f30, f14 + fmadd f15, f31, f31, f15 + .align 4 + +LL(1150): + andi. r0, N, 7 + mtspr CTR, r0 + beq- cr0, LL(1170) + .align 4 + +LL(1160): + LFDUX f16, X, INCX + LFDX f17, X, INC1 + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + bdnz LL(1160) + .align 4 + +LL(1170): + fadd f0, f0, f1 + fadd f2, f2, f3 + fadd f4, f4, f5 + fadd f6, f6, f7 + + fadd f8, f8, f9 + fadd f10, f10, f11 + fadd f12, f12, f13 + fadd f14, f14, f15 + + fadd f0, f0, f2 + fadd f4, f4, f6 + fadd f8, f8, f10 + fadd f12, f12, f14 + + fadd f0, f0, f4 + fadd f8, f8, f12 + + fadd f1, f0, f8 + lfs f4, FZERO + + fcmpu cr0, f1, f4 + beq cr0, LL(999) + + frsqrte f0, f1 + lfs f8, C1 + lfs f9, C2 + + fmul f2, f1, f0 + fadd f7, f8, f8 + fmul f3, f0, f8 + fnmsub f4, f2, f0, f9 + fmul f0, f3, f4 + + fmul f5, f1, f0 + fmul f2, f5, f8 + fnmsub f3, f5, f0, f7 + fmadd f1, f2, f3, f5 + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/copy.S b/kernel/power/copy.S new file mode 100644 index 0000000..5a6c610 --- /dev/null +++ b/kernel/power/copy.S @@ -0,0 +1,226 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 +#define PREA r8 + +#define STACKSIZE 16 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + +#ifdef L1_DUALFETCH + li PREA, (L1_PREFETCHSIZE) / 2 +#else + li PREA, (L1_PREFETCHSIZE) +#endif + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + cmpwi cr0, INCY, SIZE + bne- cr0, LL(100) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + +LL(10): + LFD f0, 0 * SIZE(X) + LFD f1, 1 * SIZE(X) + LFD f2, 2 * SIZE(X) + LFD f3, 3 * SIZE(X) + + STFD f0, 0 * SIZE(Y) + STFD f1, 1 * SIZE(Y) + STFD f2, 2 * SIZE(Y) + STFD f3, 3 * SIZE(Y) + + LFD f4, 4 * SIZE(X) + LFD f5, 5 * SIZE(X) + LFD f6, 6 * SIZE(X) + LFD f7, 7 * SIZE(X) + + STFD f4, 4 * SIZE(Y) + STFD f5, 5 * SIZE(Y) + STFD f6, 6 * SIZE(Y) + STFD f7, 7 * SIZE(Y) + + LFD f8, 8 * SIZE(X) + LFD f9, 9 * SIZE(X) + LFD f10, 10 * SIZE(X) + LFD f11, 11 * SIZE(X) + + STFD f8, 8 * SIZE(Y) + STFD f9, 9 * SIZE(Y) + STFD f10, 10 * SIZE(Y) + STFD f11, 11 * SIZE(Y) + + LFD f12, 12 * SIZE(X) + LFD f13, 13 * SIZE(X) + LFD f14, 14 * SIZE(X) + LFD f15, 15 * SIZE(X) + + STFD f12, 12 * SIZE(Y) + STFD f13, 13 * SIZE(Y) + STFD f14, 14 * SIZE(Y) + STFD f15, 15 * SIZE(Y) + +#ifndef POWER6 + dcbtst Y, PREA +#ifdef L1_DUALFETCH + dcbt X, PREA +#endif +#endif + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + +#ifdef POWER6 + dcbtst Y, PREA + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + addi X, X, 1 * SIZE + + STFD f8, 0 * SIZE(Y) + addi Y, Y, 1 * SIZE + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + sub Y, Y, INCY + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + .align 4 + +LL(110): + LFDUX f0, X, INCX + LFDUX f1, X, INCX + LFDUX f2, X, INCX + LFDUX f3, X, INCX + + LFDUX f4, X, INCX + LFDUX f5, X, INCX + LFDUX f6, X, INCX + LFDUX f7, X, INCX + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + LFDUX f10, X, INCX + LFDUX f11, X, INCX + + LFDUX f12, X, INCX + LFDUX f13, X, INCX + LFDUX f14, X, INCX + LFDUX f15, X, INCX + + STFDUX f0, Y, INCY + STFDUX f1, Y, INCY + STFDUX f2, Y, INCY + STFDUX f3, Y, INCY + + STFDUX f4, Y, INCY + STFDUX f5, Y, INCY + STFDUX f6, Y, INCY + STFDUX f7, Y, INCY + + STFDUX f8, Y, INCY + STFDUX f9, Y, INCY + STFDUX f10, Y, INCY + STFDUX f11, Y, INCY + + STFDUX f12, Y, INCY + STFDUX f13, Y, INCY + STFDUX f14, Y, INCY + STFDUX f15, Y, INCY + bdnz LL(110) + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + STFDUX f8, Y, INCY + bdnz LL(160) + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/copy_hummer.S b/kernel/power/copy_hummer.S new file mode 100644 index 0000000..1efa6fb --- /dev/null +++ b/kernel/power/copy_hummer.S @@ -0,0 +1,958 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 + +#define INCX2 r8 +#define INCY2 r9 +#define X2 r10 +#define Y2 r11 + +#define A1 f0 +#define A2 f1 +#define A3 f2 +#define A4 f3 +#define A5 f4 +#define A6 f5 +#define A7 f6 +#define A8 f7 +#define A9 f8 + +#define T1 f9 +#define T2 f10 +#define T3 f11 +#define T4 f12 +#define T5 f13 +#define T6 f14 +#define T7 f15 + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + add INCX2, INCX, INCX + add INCY2, INCY, INCY + + cmpwi cr0, N, 0 + ble LL(999) + + cmpwi cr0, INCY, SIZE + bne LL(60) + + cmpwi cr0, INCX, SIZE + bne LL(50) + + sub X, X, INCX2 + sub Y, Y, INCY2 + + andi. r0, X, 2 * SIZE - 1 + bne LL(30) + andi. r0, Y, 2 * SIZE - 1 + bne LL(20) + .align 4 + +LL(10): /* X : aligned Y : aligned */ + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + bdz LL(13) + .align 4 + +LL(12): + STFPDUX A1, Y, INCY2 + LFPDUX A1, X, INCX2 + STFPDUX A2, Y, INCY2 + LFPDUX A2, X, INCX2 + STFPDUX A3, Y, INCY2 + LFPDUX A3, X, INCX2 + STFPDUX A4, Y, INCY2 + LFPDUX A4, X, INCX2 + + STFPDUX A5, Y, INCY2 + LFPDUX A5, X, INCX2 + STFPDUX A6, Y, INCY2 + LFPDUX A6, X, INCX2 + STFPDUX A7, Y, INCY2 + LFPDUX A7, X, INCX2 + STFPDUX A8, Y, INCY2 + LFPDUX A8, X, INCX2 + bdnz LL(12) + .align 4 + +LL(13): + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + STFPDUX A3, Y, INCY2 + STFPDUX A4, Y, INCY2 + STFPDUX A5, Y, INCY2 + STFPDUX A6, Y, INCY2 + STFPDUX A7, Y, INCY2 + STFPDUX A8, Y, INCY2 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + STFPDUX A3, Y, INCY2 + STFPDUX A4, Y, INCY2 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFPDUX A1, X, INCX2 + STFPDUX A1, Y, INCY2 + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + STFDUX A1, Y, INCY2 + .align 4 + b LL(999) + .align 4 + +LL(20): /* X ): aligned Y ): unaligned */ + + LFXDUX A1, X, INCX2 + addi N, N, -1 + cmpwi cr0, N, 0 + STFSDX A1, Y, INCY2 + add Y, Y, INCY + ble LL(999) + .align 4 + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(25) + + LFXDUX T1, X, INCX2 + LFXDUX T2, X, INCX2 + LFXDUX T3, X, INCX2 + LFXDUX T4, X, INCX2 + + LFPDUX A6, X, INCX2 + fsmr A1, T1 + LFPDUX A7, X, INCX2 + fsmr T1, T2 + LFPDUX A8, X, INCX2 + fsmr T2, T3 + LFPDUX A9, X, INCX2 + fsmr T3, T4 + bdz LL(23) + .align 4 + +LL(22): + STFPDUX A1, Y, INCY2 + fxmr T5, A6 + STFPDUX T1, Y, INCY2 + fxmr T6, A7 + STFPDUX T2, Y, INCY2 + fxmr T7, A8 + STFPDUX T3, Y, INCY2 + fxmr A1, A9 + + fsmr T4, T5 + LFPDUX A2, X, INCX2 + fsmr T5, T6 + LFPDUX A3, X, INCX2 + fsmr T6, T7 + LFPDUX A4, X, INCX2 + fsmr T7, A1 + LFPDUX A5, X, INCX2 + + STFPDUX T4, Y, INCY2 + fxmr T1, A2 + STFPDUX T5, Y, INCY2 + fxmr T2, A3 + STFPDUX T6, Y, INCY2 + fxmr T3, A4 + STFPDUX T7, Y, INCY2 + fxmr T4, A5 + + LFPDUX A6, X, INCX2 + fsmr A1, T1 + LFPDUX A7, X, INCX2 + fsmr T1, T2 + LFPDUX A8, X, INCX2 + fsmr T2, T3 + LFPDUX A9, X, INCX2 + fsmr T3, T4 + bdnz LL(22) + .align 4 + +LL(23): + STFPDUX A1, Y, INCY2 + fxmr T5, A6 + STFPDUX T1, Y, INCY2 + fxmr T6, A7 + STFPDUX T2, Y, INCY2 + fxmr T7, A8 + STFPDUX T3, Y, INCY2 + fxmr A1, A9 + + fsmr T4, T5 + fsmr T5, T6 + fsmr T6, T7 + fsmr T7, A1 + + STFPDUX T4, Y, INCY2 + STFPDUX T5, Y, INCY2 + STFPDUX T6, Y, INCY2 + STFPDUX T7, Y, INCY2 + .align 4 + +LL(25): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(26) + + LFXDUX A2, X, INCX2 + LFXDUX A3, X, INCX2 + LFXDUX A4, X, INCX2 + LFXDUX A5, X, INCX2 + + fsmr A1, A2 + fsmr A2, A3 + fsmr A3, A4 + fsmr A4, A5 + + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + STFPDUX A3, Y, INCY2 + STFPDUX A4, Y, INCY2 + fpmr A1, A5 + .align 4 + +LL(26): + andi. r0, N, 4 + beq LL(27) + + LFXDUX A2, X, INCX2 + LFXDUX A3, X, INCX2 + fsmr A1, A2 + fsmr A2, A3 + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + fpmr A1, A3 + .align 4 + +LL(27): + andi. r0, N, 2 + beq LL(28) + + LFXDUX A2, X, INCX2 + fsmr A1, A2 + STFPDUX A1, Y, INCY2 + fpmr A1, A2 + .align 4 + +LL(28): + andi. r0, N, 1 + beq LL(999) + + STFDUX A1, Y, INCY2 + b LL(999) + .align 4 + +LL(30): /* X : unaligned Y : aligned */ + andi. r0, Y, 2 * SIZE - 1 + bne LL(40) + + LFDX A1, X, INCX2 + add X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(35) + + LFXDUX T1, X, INCX2 + LFXDUX T2, X, INCX2 + LFXDUX T3, X, INCX2 + LFXDUX T4, X, INCX2 + + LFPDUX A6, X, INCX2 + fsmr A1, T1 + LFPDUX A7, X, INCX2 + fsmr T1, T2 + LFPDUX A8, X, INCX2 + fsmr T2, T3 + LFPDUX A9, X, INCX2 + fsmr T3, T4 + bdz LL(33) + .align 4 + +LL(32): + fxmr T5, A6 + STFPDUX A1, Y, INCY2 + fxmr T6, A7 + STFPDUX T1, Y, INCY2 + fxmr T7, A8 + STFPDUX T2, Y, INCY2 + fxmr A1, A9 + STFPDUX T3, Y, INCY2 + + fsmr T4, T5 + LFPDUX A2, X, INCX2 + fsmr T5, T6 + LFPDUX A3, X, INCX2 + fsmr T6, T7 + LFPDUX A4, X, INCX2 + fsmr T7, A1 + LFPDUX A5, X, INCX2 + + STFPDUX T4, Y, INCY2 + fxmr T1, A2 + STFPDUX T5, Y, INCY2 + fxmr T2, A3 + STFPDUX T6, Y, INCY2 + fxmr T3, A4 + STFPDUX T7, Y, INCY2 + fxmr T4, A5 + + LFPDUX A6, X, INCX2 + fsmr A1, T1 + LFPDUX A7, X, INCX2 + fsmr T1, T2 + LFPDUX A8, X, INCX2 + fsmr T2, T3 + LFPDUX A9, X, INCX2 + fsmr T3, T4 + + bdnz LL(32) + .align 4 + +LL(33): + STFPDUX A1, Y, INCY2 + fxmr T5, A6 + STFPDUX T1, Y, INCY2 + fxmr T6, A7 + STFPDUX T2, Y, INCY2 + fxmr T7, A8 + STFPDUX T3, Y, INCY2 + fxmr A1, A9 + + fsmr T4, T5 + fsmr T5, T6 + fsmr T6, T7 + fsmr T7, A1 + + STFPDUX T4, Y, INCY2 + STFPDUX T5, Y, INCY2 + STFPDUX T6, Y, INCY2 + STFPDUX T7, Y, INCY2 + .align 4 + +LL(35): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(36) + + LFXDUX A2, X, INCX2 + LFXDUX A3, X, INCX2 + LFXDUX A4, X, INCX2 + LFXDUX A5, X, INCX2 + + fsmr A1, A2 + fsmr A2, A3 + fsmr A3, A4 + fsmr A4, A5 + + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + STFPDUX A3, Y, INCY2 + STFPDUX A4, Y, INCY2 + fpmr A1, A5 + .align 4 + +LL(36): + andi. r0, N, 4 + beq LL(37) + + LFXDUX A2, X, INCX2 + LFXDUX A3, X, INCX2 + fsmr A1, A2 + fsmr A2, A3 + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + fpmr A1, A3 + .align 4 + +LL(37): + andi. r0, N, 2 + beq LL(38) + + LFXDUX A2, X, INCX2 + fsmr A1, A2 + STFPDUX A1, Y, INCY2 + fpmr A1, A2 + .align 4 + +LL(38): + andi. r0, N, 1 + beq LL(999) + + STFDUX A1, Y, INCY2 + b LL(999) + .align 4 + +LL(40): /* X : unaligned Y : unaligned */ + + LFDX A1, X, INCX2 + add X, X, INCX + addi N, N, -1 + cmpwi cr0, N, 0 + STFDX A1, Y, INCY2 + add Y, Y, INCY + ble LL(999) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(45) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + bdz LL(43) + .align 4 + +LL(42): + STFPDUX A1, Y, INCY2 + LFPDUX A1, X, INCX2 + STFPDUX A2, Y, INCY2 + LFPDUX A2, X, INCX2 + STFPDUX A3, Y, INCY2 + LFPDUX A3, X, INCX2 + STFPDUX A4, Y, INCY2 + LFPDUX A4, X, INCX2 + + STFPDUX A5, Y, INCY2 + LFPDUX A5, X, INCX2 + STFPDUX A6, Y, INCY2 + LFPDUX A6, X, INCX2 + STFPDUX A7, Y, INCY2 + LFPDUX A7, X, INCX2 + STFPDUX A8, Y, INCY2 + LFPDUX A8, X, INCX2 + bdnz LL(42) + .align 4 + +LL(43): + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + STFPDUX A3, Y, INCY2 + STFPDUX A4, Y, INCY2 + STFPDUX A5, Y, INCY2 + STFPDUX A6, Y, INCY2 + STFPDUX A7, Y, INCY2 + STFPDUX A8, Y, INCY2 + .align 4 + +LL(45): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(46) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + STFPDUX A3, Y, INCY2 + STFPDUX A4, Y, INCY2 + .align 4 + +LL(46): + andi. r0, N, 4 + beq LL(47) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + .align 4 + +LL(47): + andi. r0, N, 2 + beq LL(48) + + LFPDUX A1, X, INCX2 + STFPDUX A1, Y, INCY2 + .align 4 + +LL(48): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + STFDUX A1, Y, INCY2 + .align 4 + b LL(999) + .align 4 + +# INCX != 1, INCY == 1 +LL(50): + andi. r0, Y, 2 * SIZE - 1 + beq LL(51) + + LFD A1, 0 * SIZE(X) + add X, X, INCX + STFD A1, 0 * SIZE(Y) + add Y, Y, INCY + + addi N, N, -1 + cmpwi cr0, N, 0 + ble LL(999) + .align 4 + +LL(51): + sub X, X, INCX + sub Y, Y, INCY2 + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(55) + .align 4 + +LL(52): + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFDUX A5, X, INCX + LFDUX A6, X, INCX + LFDUX A7, X, INCX + LFDUX A8, X, INCX + + LFDUX A9, X, INCX + LFDUX T1, X, INCX + LFDUX T2, X, INCX + LFDUX T3, X, INCX + fsmfp A1, A2 + LFDUX T4, X, INCX + fsmfp A3, A4 + LFDUX T5, X, INCX + fsmfp A5, A6 + LFDUX T6, X, INCX + fsmfp A7, A8 + LFDUX T7, X, INCX + fsmfp A9, T1 + + STFPDUX A1, Y, INCY2 + fsmfp T2, T3 + STFPDUX A3, Y, INCY2 + fsmfp T4, T5 + STFPDUX A5, Y, INCY2 + fsmfp T6, T7 + STFPDUX A7, Y, INCY2 + STFPDUX A9, Y, INCY2 + STFPDUX T2, Y, INCY2 + STFPDUX T4, Y, INCY2 + STFPDUX T6, Y, INCY2 + bdnz LL(52) + .align 4 + +LL(55): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(56) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFDUX A5, X, INCX + LFDUX A6, X, INCX + LFDUX A7, X, INCX + LFDUX A8, X, INCX + + fsmfp A1, A2 + fsmfp A3, A4 + fsmfp A5, A6 + fsmfp A7, A8 + + STFPDUX A1, Y, INCY2 + STFPDUX A3, Y, INCY2 + STFPDUX A5, Y, INCY2 + STFPDUX A7, Y, INCY2 + .align 4 + +LL(56): + andi. r0, N, 4 + beq LL(57) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + + fsmfp A1, A2 + fsmfp A3, A4 + + STFPDUX A1, Y, INCY2 + STFPDUX A3, Y, INCY2 + .align 4 + +LL(57): + andi. r0, N, 2 + beq LL(58) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + + fsmfp A1, A2 + + STFPDUX A1, Y, INCY2 + .align 4 + +LL(58): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX + STFDUX A1, Y, INCY2 + b LL(999) + .align 4 + + +# INCX == 1, INCY != 1 +LL(60): + cmpwi cr0, INCY, SIZE + bne LL(100) + + andi. r0, X, 2 * SIZE - 1 + beq LL(61) + + LFD A1, 0 * SIZE(X) + add X, X, INCX + STFD A1, 0 * SIZE(Y) + add Y, Y, INCY + + addi N, N, -1 + cmpwi cr0, N, 0 + ble LL(999) + .align 4 + +LL(61): + sub X, X, INCX2 + sub Y, Y, INCY + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(65) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + bdz LL(63) + .align 4 + +LL(62): + STFDUX A1, Y, INCY + STFSDUX A1, Y, INCY + LFPDUX A1, X, INCX2 + + STFDUX A2, Y, INCY + STFSDUX A2, Y, INCY + LFPDUX A2, X, INCX2 + + STFDUX A3, Y, INCY + STFSDUX A3, Y, INCY + LFPDUX A3, X, INCX2 + + STFDUX A4, Y, INCY + STFSDUX A4, Y, INCY + LFPDUX A4, X, INCX2 + + STFDUX A5, Y, INCY + STFSDUX A5, Y, INCY + LFPDUX A5, X, INCX2 + + STFDUX A6, Y, INCY + STFSDUX A6, Y, INCY + LFPDUX A6, X, INCX2 + + STFDUX A7, Y, INCY + STFSDUX A7, Y, INCY + LFPDUX A7, X, INCX2 + + STFDUX A8, Y, INCY + STFSDUX A8, Y, INCY + LFPDUX A8, X, INCX2 + bdnz LL(62) + .align 4 + +LL(63): + STFDUX A1, Y, INCY + STFSDUX A1, Y, INCY + STFDUX A2, Y, INCY + STFSDUX A2, Y, INCY + STFDUX A3, Y, INCY + STFSDUX A3, Y, INCY + STFDUX A4, Y, INCY + STFSDUX A4, Y, INCY + STFDUX A5, Y, INCY + STFSDUX A5, Y, INCY + STFDUX A6, Y, INCY + STFSDUX A6, Y, INCY + STFDUX A7, Y, INCY + STFSDUX A7, Y, INCY + STFDUX A8, Y, INCY + STFSDUX A8, Y, INCY + .align 4 + +LL(65): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(66) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + STFDUX A1, Y, INCY + STFSDUX A1, Y, INCY + STFDUX A2, Y, INCY + STFSDUX A2, Y, INCY + STFDUX A3, Y, INCY + STFSDUX A3, Y, INCY + STFDUX A4, Y, INCY + STFSDUX A4, Y, INCY + .align 4 + +LL(66): + andi. r0, N, 4 + beq LL(67) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + STFDUX A1, Y, INCY + STFSDUX A1, Y, INCY + STFDUX A2, Y, INCY + STFSDUX A2, Y, INCY + .align 4 + +LL(67): + andi. r0, N, 2 + beq LL(68) + + LFPDUX A1, X, INCX2 + + STFDUX A1, Y, INCY + STFSDUX A1, Y, INCY + .align 4 + +LL(68): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + STFDUX A1, Y, INCY + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + sub Y, Y, INCY + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(115) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFDUX A5, X, INCX + LFDUX A6, X, INCX + LFDUX A7, X, INCX + LFDUX A8, X, INCX + bdz LL(113) + .align 4 + +LL(112): + STFDUX A1, Y, INCY + LFDUX A1, X, INCX + STFDUX A2, Y, INCY + LFDUX A2, X, INCX + STFDUX A3, Y, INCY + LFDUX A3, X, INCX + STFDUX A4, Y, INCY + LFDUX A4, X, INCX + + STFDUX A5, Y, INCY + LFDUX A5, X, INCX + STFDUX A6, Y, INCY + LFDUX A6, X, INCX + STFDUX A7, Y, INCY + LFDUX A7, X, INCX + STFDUX A8, Y, INCY + LFDUX A8, X, INCX + bdnz LL(112) + .align 4 + +LL(113): + STFDUX A1, Y, INCY + STFDUX A2, Y, INCY + STFDUX A3, Y, INCY + STFDUX A4, Y, INCY + STFDUX A5, Y, INCY + STFDUX A6, Y, INCY + STFDUX A7, Y, INCY + STFDUX A8, Y, INCY + .align 4 + +LL(115): + andi. r0, N, 7 + beq LL(999) + andi. r0, N, 4 + beq LL(117) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + + STFDUX A1, Y, INCY + STFDUX A2, Y, INCY + STFDUX A3, Y, INCY + STFDUX A4, Y, INCY + .align 4 + +LL(117): + andi. r0, N, 2 + beq LL(118) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + + STFDUX A1, Y, INCY + STFDUX A2, Y, INCY + .align 4 + +LL(118): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX + STFDUX A1, Y, INCY + .align 4 + +LL(999): + li r10, 16 + addi SP, SP, -16 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/dnrm2_hummer.S b/kernel/power/dnrm2_hummer.S new file mode 100644 index 0000000..4faa6c9 --- /dev/null +++ b/kernel/power/dnrm2_hummer.S @@ -0,0 +1,1066 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define XX r8 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define ALPHA f4 +#define ALPHA_R f5 + +#define A1 f6 +#define A2 f7 +#define A3 f8 +#define A4 f9 +#define A5 f10 +#define A6 f11 +#define A7 f12 +#define A8 f13 + +#define F1 f14 +#define F2 f15 +#define F3 f16 +#define F4 f17 +#define F5 f18 +#define F6 f19 +#define F7 f20 +#define F8 f21 + +#define T1 f22 +#define T2 f23 +#define T3 f24 +#define T4 f25 +#define T5 f26 +#define T6 f27 +#define T7 f28 +#define T8 f29 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + stfpdux f24, SP, r10 + stfpdux f25, SP, r10 + stfpdux f26, SP, r10 + stfpdux f27, SP, r10 + + stfpdux f28, SP, r10 + stfpdux f29, SP, r10 + + li r10, 0 + lis r11, 0x3f80 + stwu r11, -4(SP) + stwu r11, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + lfpsx C1, SP, r10 # Zero clear + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + fpmr C2, C1 + fpmr C3, C1 + fpmr C4, C1 + + cmpwi cr0, N, 0 + ble LL(99) + cmpwi cr0, INCX, 0 + ble LL(99) + + mr XX, X + + cmpwi cr0, INCX, SIZE + bne LL(100) + + andi. r0, X, 2 * SIZE - 1 + beq LL(05) + + LFD C1, 0 * SIZE(X) + add X, X, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + fabs C1, C1 + ble LL(20) + .align 4 + +LL(05): + sub X, X, INCX2 + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + LFPDUX A5, X, INCX2 + fpabs T1, A1 + LFPDUX A6, X, INCX2 + fpabs T2, A2 + LFPDUX A7, X, INCX2 + fpabs T3, A3 + LFPDUX A8, X, INCX2 + fpabs T4, A4 + bdz LL(13) + .align 4 + +LL(12): + fpsub F1, C1, T1 + LFPDUX A1, X, INCX2 + fpsub F2, C2, T2 + LFPDUX A2, X, INCX2 + fpsub F3, C3, T3 + LFPDUX A3, X, INCX2 + fpsub F4, C4, T4 + LFPDUX A4, X, INCX2 + + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsel C1, F1, C1, T1 + LFPDUX A5, X, INCX2 + fpsel C2, F2, C2, T2 + LFPDUX A6, X, INCX2 + fpsel C3, F3, C3, T3 + LFPDUX A7, X, INCX2 + fpsel C4, F4, C4, T4 + LFPDUX A8, X, INCX2 + + fpsub F5, C1, T5 + fpsub F6, C2, T6 + fpsub F7, C3, T7 + fpsub F8, C4, T8 + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + bdnz LL(12) + .align 4 + +LL(13): + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsub F1, C1, T1 + fpsub F2, C2, T2 + fpsub F3, C3, T3 + fpsub F4, C4, T4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + + fpsub F5, C1, T5 + fpsub F6, C2, T6 + fpsub F7, C3, T7 + fpsub F8, C4, T8 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(20) + + andi. r0, N, 8 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + fpabs A1, A1 + fpabs A2, A2 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFPDUX A1, X, INCX2 + fpabs A1, A1 + fpsub F1, C1, A1 + fpsel C1, F1, C1, A1 + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(20) + + LFDUX A1, X, INCX2 + fabs A1, A1 + fsub F1, C1, A1 + fsel C1, F1, C1, A1 + .align 4 + +LL(20): + fpsub F1, C1, C2 + fpsub F2, C3, C4 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C1, C3 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C1, C2 + fsel ALPHA, F1, C1, C2 + + li r10, 0 + + lfs ALPHA_R, 8(SP) # load 1.0 + fdiv ALPHA_R, ALPHA_R, ALPHA + + lfpsx C1, SP, r10 # Zero clear + + fpmr C2, C1 + fpmr C3, C1 + fpmr C4, C1 + fsmfp ALPHA_R, ALPHA_R + + andi. r0, XX, 2 * SIZE - 1 + beq LL(21) + + LFD C1, 0 * SIZE(XX) + add XX, XX, INCX + + cmpwi cr0, N, 0 + fmul C1, ALPHA_R, C1 + fmul C1, C1, C1 + ble LL(998) + .align 4 + +LL(21): + sub XX, XX, INCX2 + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(25) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + LFPDUX A3, XX, INCX2 + LFPDUX A4, XX, INCX2 + + LFPDUX A5, XX, INCX2 + LFPDUX A6, XX, INCX2 + LFPDUX A7, XX, INCX2 + LFPDUX A8, XX, INCX2 + + fpmul T1, ALPHA_R, A1 + fpmul T2, ALPHA_R, A2 + fpmul T3, ALPHA_R, A3 + fpmul T4, ALPHA_R, A4 + + bdz LL(23) + .align 4 + +LL(22): + fpmadd C1, T1, T1, C1 + LFPDUX A1, XX, INCX2 + fpmul T1, ALPHA_R, A5 + LFPDUX A2, XX, INCX2 + + fpmadd C2, T2, T2, C2 + LFPDUX A3, XX, INCX2 + fpmul T2, ALPHA_R, A6 + LFPDUX A4, XX, INCX2 + + fpmadd C3, T3, T3, C3 + fpmul T3, ALPHA_R, A7 + fpmadd C4, T4, T4, C4 + fpmul T4, ALPHA_R, A8 + + fpmadd C1, T1, T1, C1 + LFPDUX A5, XX, INCX2 + fpmul T1, ALPHA_R, A1 + LFPDUX A6, XX, INCX2 + + fpmadd C2, T2, T2, C2 + LFPDUX A7, XX, INCX2 + fpmul T2, ALPHA_R, A2 + LFPDUX A8, XX, INCX2 + + fpmadd C3, T3, T3, C3 + fpmul T3, ALPHA_R, A3 + fpmadd C4, T4, T4, C4 + fpmul T4, ALPHA_R, A4 + bdnz LL(22) + .align 4 + +LL(23): + fpmadd C1, T1, T1, C1 + fpmul T1, ALPHA_R, A5 + fpmadd C2, T2, T2, C2 + fpmul T2, ALPHA_R, A6 + + fpmadd C3, T3, T3, C3 + fpmul T3, ALPHA_R, A7 + fpmadd C4, T4, T4, C4 + fpmul T4, ALPHA_R, A8 + + fpmadd C1, T1, T1, C1 + fpmadd C2, T2, T2, C2 + fpmadd C3, T3, T3, C3 + fpmadd C4, T4, T4, C4 + .align 4 + +LL(25): + andi. r0, N, 15 + beq LL(98) + + andi. r0, N, 8 + beq LL(26) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + LFPDUX A3, XX, INCX2 + LFPDUX A4, XX, INCX2 + + fpmul A1, ALPHA_R, A1 + fpmul A2, ALPHA_R, A2 + fpmul A3, ALPHA_R, A3 + fpmul A4, ALPHA_R, A4 + + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + fpmadd C3, A3, A3, C3 + fpmadd C4, A4, A4, C4 + .align 4 + +LL(26): + andi. r0, N, 4 + beq LL(27) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + fpmul A1, ALPHA_R, A1 + fpmul A2, ALPHA_R, A2 + + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + .align 4 + +LL(27): + andi. r0, N, 2 + beq LL(28) + + LFPDUX A1, XX, INCX2 + fpmul A1, ALPHA_R, A1 + fpmadd C1, A1, A1, C1 + .align 4 + +LL(28): + andi. r0, N, 1 + beq LL(98) + + LFDUX A1, XX, INCX2 + fmul A1, ALPHA_R, A1 + fmadd C1, A1, A1, C1 + .align 4 + +LL(98): + fpadd C1, C1, C2 + lis r3, 0x3f00 + fpadd C3, C3, C4 + lis r4, 0x4040 + + stw r3, 4(SP) + stw r4, 8(SP) + + fpadd C1, C1, C3 + lfs f10, 0(SP) + + fsmtp C2, C1 + lfs f11, 4(SP) + fadd C1, C2, C1 + lfs f12, 8(SP) + + fcmpu cr0, f10, C1 + beq cr0, LL(99) + +#ifndef HUMMER_EMULATOR + frsqrte f9, C1 + li r10, 16 + + fmul f2, f1, f9 + lfpdux f29, SP, r10 + fmul f3, f9, f11 + lfpdux f28, SP, r10 + fnmsub f7, f2, f9, f12 + lfpdux f27, SP, r10 + fmul f9, f3, f7 + lfpdux f26, SP, r10 + fadd f13, f11, f11 + lfpdux f25, SP, r10 + fmul f12, f1, f9 + lfpdux f24, SP, r10 + fmul f11, f12, f11 + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + fnmsub f1, f12, f9, f13 + lfpdux f21, SP, r10 + + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + fmadd f1, f11, f1, f12 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + fmul C1, ALPHA, C1 + addi SP, SP, 16 + blr +#else + fsqrt C1, C1 + + li r10, 16 + + lfpdux f29, SP, r10 + lfpdux f28, SP, r10 + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + fmul C1, ALPHA, C1 + addi SP, SP, 16 + blr +#endif + .align 4 + +LL(99): + li r10, 16 + + lfpdux f29, SP, r10 + lfpdux f28, SP, r10 + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + .align 4 + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + LFDUX A5, X, INCX + LFDUX A6, X, INCX + LFDUX A7, X, INCX + LFDUX A8, X, INCX + LFSDUX A5, X, INCX + fpabs T1, A1 + LFSDUX A6, X, INCX + fpabs T2, A2 + LFSDUX A7, X, INCX + fpabs T3, A3 + LFSDUX A8, X, INCX + fpabs T4, A4 + bdz LL(103) + .align 4 + +LL(102): + fpsub F1, C1, T1 + LFDUX A1, X, INCX + fpsub F2, C2, T2 + LFDUX A2, X, INCX + fpsub F3, C3, T3 + LFDUX A3, X, INCX + fpsub F4, C4, T4 + LFDUX A4, X, INCX + + fpabs T5, A5 + LFSDUX A1, X, INCX + fpabs T6, A6 + LFSDUX A2, X, INCX + fpabs T7, A7 + LFSDUX A3, X, INCX + fpabs T8, A8 + LFSDUX A4, X, INCX + + fpsel C1, F1, C1, T1 + LFDUX A5, X, INCX + fpsel C2, F2, C2, T2 + LFDUX A6, X, INCX + fpsel C3, F3, C3, T3 + LFDUX A7, X, INCX + fpsel C4, F4, C4, T4 + LFDUX A8, X, INCX + + fpsub F5, C1, T5 + LFSDUX A5, X, INCX + fpsub F6, C2, T6 + LFSDUX A6, X, INCX + fpsub F7, C3, T7 + LFSDUX A7, X, INCX + fpsub F8, C4, T8 + LFSDUX A8, X, INCX + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + bdnz LL(102) + .align 4 + +LL(103): + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsub F1, C1, T1 + fpsub F2, C2, T2 + fpsub F3, C3, T3 + fpsub F4, C4, T4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + + fpsub F5, C1, T5 + fpsub F6, C2, T6 + fpsub F7, C3, T7 + fpsub F8, C4, T8 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + .align 4 + +LL(105): + andi. r0, N, 15 + beq LL(120) + + andi. r0, N, 8 + beq LL(106) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(106): + andi. r0, N, 4 + beq LL(107) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + + fabs A1, A1 + fabs A2, A2 + fabs A3, A3 + fabs A4, A4 + + fsub F1, C1, A1 + fsub F2, C2, A2 + fsub F3, C3, A3 + fsub F4, C4, A4 + + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + fsel C3, F3, C3, A3 + fsel C4, F4, C4, A4 + .align 4 + +LL(107): + andi. r0, N, 2 + beq LL(108) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + fabs A1, A1 + fabs A2, A2 + fsub F1, C1, A1 + fsub F2, C2, A2 + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + .align 4 + +LL(108): + andi. r0, N, 1 + beq LL(120) + + LFDUX A1, X, INCX + fabs A1, A1 + fsub F1, C1, A1 + fsel C1, F1, C1, A1 + .align 4 + +LL(120): + fpsub F1, C1, C2 + fpsub F2, C3, C4 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C1, C3 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C1, C2 + fsel ALPHA, F1, C1, C2 + + li r10, 0 + + lfs ALPHA_R, 8(SP) # load 1.0 + fdiv ALPHA_R, ALPHA_R, ALPHA + + lfpsx C1, SP, r10 # Zero clear + + fpmr C2, C1 + fpmr C3, C1 + fpmr C4, C1 + fsmfp ALPHA_R, ALPHA_R + + sub XX, XX, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(125) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX + LFDUX A4, XX, INCX + LFSDUX A1, XX, INCX + LFSDUX A2, XX, INCX + LFSDUX A3, XX, INCX + LFSDUX A4, XX, INCX + + LFDUX A5, XX, INCX + LFDUX A6, XX, INCX + LFDUX A7, XX, INCX + LFDUX A8, XX, INCX + LFSDUX A5, XX, INCX + fpmul T1, ALPHA_R, A1 + LFSDUX A6, XX, INCX + fpmul T2, ALPHA_R, A2 + LFSDUX A7, XX, INCX + fpmul T3, ALPHA_R, A3 + LFSDUX A8, XX, INCX + fpmul T4, ALPHA_R, A4 + bdz LL(123) + .align 4 + +LL(122): + fpmadd C1, T1, T1, C1 + LFDUX A1, XX, INCX + fpmul T1, ALPHA_R, A5 + LFDUX A2, XX, INCX + + fpmadd C2, T2, T2, C2 + LFDUX A3, XX, INCX + fpmul T2, ALPHA_R, A6 + LFDUX A4, XX, INCX + + fpmadd C3, T3, T3, C3 + LFSDUX A1, XX, INCX + fpmul T3, ALPHA_R, A7 + LFSDUX A2, XX, INCX + + fpmadd C4, T4, T4, C4 + LFSDUX A3, XX, INCX + fpmul T4, ALPHA_R, A8 + LFSDUX A4, XX, INCX + + fpmadd C1, T1, T1, C1 + LFDUX A5, XX, INCX + fpmul T1, ALPHA_R, A1 + LFDUX A6, XX, INCX + + fpmadd C2, T2, T2, C2 + LFDUX A7, XX, INCX + fpmul T2, ALPHA_R, A2 + LFDUX A8, XX, INCX + + fpmadd C3, T3, T3, C3 + LFSDUX A5, XX, INCX + fpmul T3, ALPHA_R, A3 + LFSDUX A6, XX, INCX + fpmadd C4, T4, T4, C4 + LFSDUX A7, XX, INCX + fpmul T4, ALPHA_R, A4 + LFSDUX A8, XX, INCX + bdnz LL(122) + .align 4 + +LL(123): + fpmadd C1, T1, T1, C1 + fpmul T1, ALPHA_R, A5 + fpmadd C2, T2, T2, C2 + fpmul T2, ALPHA_R, A6 + fpmadd C3, T3, T3, C3 + fpmul T3, ALPHA_R, A7 + fpmadd C4, T4, T4, C4 + fpmul T4, ALPHA_R, A8 + + fpmadd C1, T1, T1, C1 + fpmadd C2, T2, T2, C2 + fpmadd C3, T3, T3, C3 + fpmadd C4, T4, T4, C4 + .align 4 + +LL(125): + andi. r0, N, 15 + beq LL(998) + + andi. r0, N, 8 + beq LL(126) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX + LFDUX A4, XX, INCX + LFSDUX A1, XX, INCX + LFSDUX A2, XX, INCX + LFSDUX A3, XX, INCX + LFSDUX A4, XX, INCX + + fpmul A1, ALPHA_R, A1 + fpmul A2, ALPHA_R, A2 + fpmul A3, ALPHA_R, A3 + fpmul A4, ALPHA_R, A4 + + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + fpmadd C3, A3, A3, C3 + fpmadd C4, A4, A4, C4 + .align 4 + +LL(126): + andi. r0, N, 4 + beq LL(127) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX + LFDUX A4, XX, INCX + + fmul A1, ALPHA_R, A1 + fmul A2, ALPHA_R, A2 + fmul A3, ALPHA_R, A3 + fmul A4, ALPHA_R, A4 + + fmadd C1, A1, A1, C1 + fmadd C2, A2, A2, C2 + fmadd C3, A3, A3, C3 + fmadd C4, A4, A4, C4 + .align 4 + +LL(127): + andi. r0, N, 2 + beq LL(128) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + + fmul A1, ALPHA_R, A1 + fmul A2, ALPHA_R, A2 + fmadd C1, A1, A1, C1 + fmadd C2, A2, A2, C2 + .align 4 + +LL(128): + andi. r0, N, 1 + beq LL(998) + + LFDUX A1, XX, INCX + fmul A1, ALPHA_R, A1 + fmadd C1, A1, A1, C1 + .align 4 + +LL(998): + fpadd C1, C1, C2 + lis r3, 0x3f00 + fpadd C3, C3, C4 + lis r4, 0x4040 + + stw r3, 4(SP) + stw r4, 8(SP) + + fpadd C1, C1, C3 + lfs f10, 0(SP) + + fsmtp C2, C1 + lfs f11, 4(SP) + fadd C1, C2, C1 + lfs f12, 8(SP) + + fcmpu cr0, f10, C1 + beq cr0, LL(999) + +#ifndef HUMMER_EMULATOR + frsqrte f9, C1 + li r10, 16 + + fmul f2, f1, f9 + lfpdux f29, SP, r10 + fmul f3, f9, f11 + lfpdux f28, SP, r10 + fnmsub f7, f2, f9, f12 + lfpdux f27, SP, r10 + fmul f9, f3, f7 + lfpdux f26, SP, r10 + fadd f13, f11, f11 + lfpdux f25, SP, r10 + fmul f12, f1, f9 + lfpdux f24, SP, r10 + fmul f11, f12, f11 + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + fnmsub f1, f12, f9, f13 + + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + fmadd f1, f11, f1, f12 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + fmul C1, ALPHA, C1 + addi SP, SP, 16 + blr +#else + fsqrt C1, C1 + li r10, 16 + + lfpdux f29, SP, r10 + lfpdux f28, SP, r10 + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + fmul C1, ALPHA, C1 + addi SP, SP, 16 + blr +#endif + .align 4 + +LL(999): + li r10, 16 + + lfpdux f29, SP, r10 + lfpdux f28, SP, r10 + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + EPILOGUE diff --git a/kernel/power/dnrm2_ppc440.S b/kernel/power/dnrm2_ppc440.S new file mode 100644 index 0000000..6be9ead --- /dev/null +++ b/kernel/power/dnrm2_ppc440.S @@ -0,0 +1,556 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define NN r6 +#define XX r7 + +#define PRE r8 + +#define FZERO 144(SP) +#define FONE 148(SP) +#define FMAX 152(SP) +#define C1 156(SP) +#define C2 160(SP) + +#define STACKSIZE 168 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r10, 0 + lis r11, 0x3f80 + lis r12, 0x5fe0 + lis r6, 0x3f00 + lis r7, 0x4040 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r10, FZERO + stw r11, FONE + stw r12, FMAX + stw r10, 4 + FMAX + stw r6, C1 + stw r7, C2 + + lfs f1, FZERO + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + sub X, X, INCX + + li PRE, 3 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(999) + cmpwi cr0, INCX, 0 + ble- LL(999) + + mr NN, N + mr XX, X + + LFDUX f1, X, INCX + + fabs f0, f1 + fabs f2, f1 + fabs f3, f1 + fabs f4, f1 + fabs f5, f1 + fabs f6, f1 + fabs f7, f1 + fabs f1, f1 + subi N, N, 1 + + cmpwi cr0, N, 0 + ble- LL(999) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(50) + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDUX f25, X, INCX + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDUX f27, X, INCX + + fabs f12, f28 + LFDUX f28, X, INCX + fabs f13, f29 + LFDUX f29, X, INCX + fabs f14, f30 + LFDUX f30, X, INCX + fabs f15, f31 + LFDUX f31, X, INCX + bdz LL(20) + .align 4 + +LL(10): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 +#ifdef PPCG4 + dcbt X, PRE +#endif + fabs f8, f24 + LFDUX f24, X, INCX + fsel f1, f17, f1, f9 + fabs f9, f25 + LFDUX f25, X, INCX + fsel f2, f18, f2, f10 + fabs f10, f26 + LFDUX f26, X, INCX + fsel f3, f19, f3, f11 + fabs f11, f27 + LFDUX f27, X, INCX + + fsel f4, f20, f4, f12 +#ifdef PPCG4 + dcbt X, PRE +#endif + fabs f12, f28 + LFDUX f28, X, INCX + fsel f5, f21, f5, f13 + fabs f13, f29 + LFDUX f29, X, INCX + fsel f6, f22, f6, f14 + fabs f14, f30 + LFDUX f30, X, INCX + fsel f7, f23, f7, f15 + fabs f15, f31 + LFDUX f31, X, INCX + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 +#ifdef PPCG4 + dcbt X, PRE +#endif + fabs f8, f24 + LFDUX f24, X, INCX + fsel f1, f17, f1, f9 + fabs f9, f25 + LFDUX f25, X, INCX + fsel f2, f18, f2, f10 + fabs f10, f26 + LFDUX f26, X, INCX + fsel f3, f19, f3, f11 + fabs f11, f27 + LFDUX f27, X, INCX + + fsel f4, f20, f4, f12 +#ifdef PPCG4 + dcbt X, PRE +#endif + fabs f12, f28 + LFDUX f28, X, INCX + fsel f5, f21, f5, f13 + fabs f13, f29 + LFDUX f29, X, INCX + fsel f6, f22, f6, f14 + fabs f14, f30 + LFDUX f30, X, INCX + fsel f7, f23, f7, f15 + fabs f15, f31 + LFDUX f31, X, INCX + bdnz LL(10) + .align 4 + +LL(20): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(99) + .align 4 + +LL(60): + LFDUX f8, X, INCX + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(60) + .align 4 + +LL(99): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f31, f8, f0, f4 + + lfs f1, FZERO + lfs f0, FONE + lfd f2, FMAX + + fcmpu cr0, f1, f31 + beq- cr0, LL(999) + + fdiv f30, f0, f31 + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + fmr f6, f1 + fmr f7, f1 + + srawi. r0, NN, 4 + mtspr CTR, r0 + beq- cr0, LL(150) + + LFDUX f8, XX, INCX + LFDUX f9, XX, INCX + LFDUX f10, XX, INCX + LFDUX f11, XX, INCX + LFDUX f12, XX, INCX + LFDUX f13, XX, INCX + LFDUX f14, XX, INCX + LFDUX f15, XX, INCX + + fmul f16, f30, f8 + LFDUX f8, XX, INCX + fmul f17, f30, f9 + LFDUX f9, XX, INCX + fmul f18, f30, f10 + LFDUX f10, XX, INCX + fmul f19, f30, f11 + LFDUX f11, XX, INCX + + fmul f20, f30, f12 + LFDUX f12, XX, INCX + fmul f21, f30, f13 + LFDUX f13, XX, INCX + fmul f22, f30, f14 + LFDUX f14, XX, INCX + fmul f23, f30, f15 + LFDUX f15, XX, INCX + bdz LL(120) + .align 4 + +LL(110): + fmadd f0, f16, f16, f0 +#ifdef PPCG4 + dcbt XX, PRE +#endif + fmul f16, f30, f8 + LFDUX f8, XX, INCX + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + LFDUX f9, XX, INCX + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + LFDUX f10, XX, INCX + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + LFDUX f11, XX, INCX + + fmadd f4, f20, f20, f4 +#ifdef PPCG4 + dcbt XX, PRE +#endif + fmul f20, f30, f12 + LFDUX f12, XX, INCX + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + LFDUX f13, XX, INCX + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + LFDUX f14, XX, INCX + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + LFDUX f15, XX, INCX + + fmadd f0, f16, f16, f0 +#ifdef PPCG4 + dcbt XX, PRE +#endif + fmul f16, f30, f8 + LFDUX f8, XX, INCX + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + LFDUX f9, XX, INCX + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + LFDUX f10, XX, INCX + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + LFDUX f11, XX, INCX + + fmadd f4, f20, f20, f4 +#ifdef PPCG4 + dcbt XX, PRE +#endif + fmul f20, f30, f12 + LFDUX f12, XX, INCX + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + LFDUX f13, XX, INCX + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + LFDUX f14, XX, INCX + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + LFDUX f15, XX, INCX + bdnz LL(110) + .align 4 + +LL(120): + fmadd f0, f16, f16, f0 + fmul f16, f30, f8 + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + + fmadd f4, f20, f20, f4 + fmul f20, f30, f12 + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + .align 4 + +LL(150): + andi. r0, NN, 15 + mtspr CTR, r0 + beq- cr0, LL(170) + .align 4 + +LL(160): + LFDUX f8, XX, INCX + + fmul f16, f30, f8 + fmadd f0, f16, f16, f0 + bdnz LL(160) + .align 4 + +LL(170): + fadd f0, f0, f1 + fadd f2, f2, f3 + fadd f4, f4, f5 + fadd f6, f6, f7 + + fadd f0, f0, f2 + fadd f4, f4, f6 + + fadd f1, f0, f4 + + frsqrte f0, f1 + lfs f8, C1 + lfs f9, C2 + + fmul f2, f1, f0 + fadd f7, f8, f8 + fmul f3, f0, f8 + fnmsub f4, f2, f0, f9 + fmul f0, f3, f4 + + fmul f2, f1, f0 + fmul f3, f0, f8 + fnmsub f4, f2, f0, f9 + fmul f0, f3, f4 + + fmul f2, f1, f0 + fmul f3, f0, f8 + fnmsub f4, f2, f0, f9 + fmul f0, f3, f4 + + fmul f5, f1, f0 + fmul f2, f5, f8 + fnmsub f3, f5, f0, f7 + fmadd f1, f2, f3, f5 + fmul f1, f31, f1 + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + EPILOGUE diff --git a/kernel/power/dot.S b/kernel/power/dot.S new file mode 100644 index 0000000..724b0c3 --- /dev/null +++ b/kernel/power/dot.S @@ -0,0 +1,468 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 +#define PREA r8 + +#define FZERO f0 + +#define STACKSIZE 96 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + + stw r0, 80(SP) + lfs FZERO, 80(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + fmr f1, FZERO + fmr f2, FZERO + fmr f3, FZERO + fmr f4, FZERO + fmr f5, FZERO + fmr f6, FZERO + fmr f7, FZERO + +#ifdef L1_DUALFETCH + li PREA, (L1_PREFETCHSIZE) / 2 +#else + li PREA, (L1_PREFETCHSIZE) +#endif + + cmpwi cr0, N, 0 + ble- cr0, LL(999) + + cmpwi cr0, INCX, SIZE + bne cr0, LL(100) + cmpwi cr0, INCY, SIZE + bne cr0, LL(100) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + LFD f10, 2 * SIZE(X) + LFD f11, 3 * SIZE(X) + + LFD f16, 0 * SIZE(Y) + LFD f17, 1 * SIZE(Y) + LFD f18, 2 * SIZE(Y) + LFD f19, 3 * SIZE(Y) + + LFD f12, 4 * SIZE(X) + LFD f13, 5 * SIZE(X) + LFD f14, 6 * SIZE(X) + LFD f15, 7 * SIZE(X) + + LFD f20, 4 * SIZE(Y) + LFD f21, 5 * SIZE(Y) + LFD f22, 6 * SIZE(Y) + LFD f23, 7 * SIZE(Y) + bdz LL(20) + .align 4 + +LL(10): + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f10, f18, f2 + FMADD f3, f11, f19, f3 + + LFD f8, 8 * SIZE(X) + LFD f9, 9 * SIZE(X) + LFD f10, 10 * SIZE(X) + LFD f11, 11 * SIZE(X) + + LFD f16, 8 * SIZE(Y) + LFD f17, 9 * SIZE(Y) + LFD f18, 10 * SIZE(Y) + LFD f19, 11 * SIZE(Y) + + FMADD f4, f12, f20, f4 + FMADD f5, f13, f21, f5 + FMADD f6, f14, f22, f6 + FMADD f7, f15, f23, f7 + + LFD f12, 12 * SIZE(X) + LFD f13, 13 * SIZE(X) + LFD f14, 14 * SIZE(X) + LFD f15, 15 * SIZE(X) + + LFD f20, 12 * SIZE(Y) + LFD f21, 13 * SIZE(Y) + LFD f22, 14 * SIZE(Y) + LFD f23, 15 * SIZE(Y) + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f10, f18, f2 + FMADD f3, f11, f19, f3 + + LFD f8, 16 * SIZE(X) + LFD f9, 17 * SIZE(X) + LFD f10, 18 * SIZE(X) + LFD f11, 19 * SIZE(X) + + LFD f16, 16 * SIZE(Y) + LFD f17, 17 * SIZE(Y) + LFD f18, 18 * SIZE(Y) + LFD f19, 19 * SIZE(Y) + + FMADD f4, f12, f20, f4 + FMADD f5, f13, f21, f5 + FMADD f6, f14, f22, f6 + FMADD f7, f15, f23, f7 + + LFD f12, 20 * SIZE(X) + LFD f13, 21 * SIZE(X) + LFD f14, 22 * SIZE(X) + LFD f15, 23 * SIZE(X) + + LFD f20, 20 * SIZE(Y) + LFD f21, 21 * SIZE(Y) + LFD f22, 22 * SIZE(Y) + LFD f23, 23 * SIZE(Y) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#ifdef L1_DUALFETCH + L1_PREFETCH Y, PREA +#endif +#endif + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + +#ifdef POWER6 + L1_PREFETCH X, PREA +#ifdef L1_DUALFETCH + L1_PREFETCH Y, PREA +#endif +#endif + bdnz LL(10) + .align 4 + +LL(20): + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f10, f18, f2 + FMADD f3, f11, f19, f3 + + LFD f8, 8 * SIZE(X) + LFD f9, 9 * SIZE(X) + LFD f10, 10 * SIZE(X) + LFD f11, 11 * SIZE(X) + + LFD f16, 8 * SIZE(Y) + LFD f17, 9 * SIZE(Y) + LFD f18, 10 * SIZE(Y) + LFD f19, 11 * SIZE(Y) + + FMADD f4, f12, f20, f4 + FMADD f5, f13, f21, f5 + FMADD f6, f14, f22, f6 + FMADD f7, f15, f23, f7 + + LFD f12, 12 * SIZE(X) + LFD f13, 13 * SIZE(X) + LFD f14, 14 * SIZE(X) + LFD f15, 15 * SIZE(X) + + LFD f20, 12 * SIZE(Y) + LFD f21, 13 * SIZE(Y) + LFD f22, 14 * SIZE(Y) + LFD f23, 15 * SIZE(Y) + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f10, f18, f2 + FMADD f3, f11, f19, f3 + + FMADD f4, f12, f20, f4 + FMADD f5, f13, f21, f5 + FMADD f6, f14, f22, f6 + FMADD f7, f15, f23, f7 + + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f16, 0 * SIZE(Y) + addi X, X, 1 * SIZE + addi Y, Y, 1 * SIZE + + FMADD f0, f8, f16, f0 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): +#ifdef F_INTERFACE + cmpwi cr0, INCX, 0 + bge+ LL(102) + + subi r0, N, 1 + mullw r0, r0, INCX + sub X, X, r0 + .align 4 + +LL(102): + cmpwi cr0, INCY, 0 + bge+ LL(104) + + subi r0, N, 1 + mullw r0, r0, INCY + sub Y, Y, r0 + .align 4 + +LL(104): +#endif + sub X, X, INCX + sub Y, Y, INCY + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f8, X, INCX + LFDUX f16, Y, INCY + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDUX f10, X, INCX + LFDUX f18, Y, INCY + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + LFDUX f12, X, INCX + LFDUX f20, Y, INCY + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDUX f14, X, INCX + LFDUX f22, Y, INCY + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + bdz LL(120) + .align 4 + +LL(110): + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f10, f18, f2 + FMADD f3, f11, f19, f3 + + LFDUX f8, X, INCX + LFDUX f16, Y, INCY + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDUX f10, X, INCX + LFDUX f18, Y, INCY + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + FMADD f4, f12, f20, f4 + FMADD f5, f13, f21, f5 + FMADD f6, f14, f22, f6 + FMADD f7, f15, f23, f7 + + LFDUX f12, X, INCX + LFDUX f20, Y, INCY + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDUX f14, X, INCX + LFDUX f22, Y, INCY + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f10, f18, f2 + FMADD f3, f11, f19, f3 + + LFDUX f8, X, INCX + LFDUX f16, Y, INCY + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDUX f10, X, INCX + LFDUX f18, Y, INCY + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + FMADD f4, f12, f20, f4 + FMADD f5, f13, f21, f5 + FMADD f6, f14, f22, f6 + FMADD f7, f15, f23, f7 + + LFDUX f12, X, INCX + LFDUX f20, Y, INCY + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDUX f14, X, INCX + LFDUX f22, Y, INCY + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + + bdnz LL(110) + .align 4 + +LL(120): + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f10, f18, f2 + FMADD f3, f11, f19, f3 + + LFDUX f8, X, INCX + LFDUX f16, Y, INCY + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDUX f10, X, INCX + LFDUX f18, Y, INCY + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + FMADD f4, f12, f20, f4 + FMADD f5, f13, f21, f5 + FMADD f6, f14, f22, f6 + FMADD f7, f15, f23, f7 + + LFDUX f12, X, INCX + LFDUX f20, Y, INCY + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDUX f14, X, INCX + LFDUX f22, Y, INCY + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f10, f18, f2 + FMADD f3, f11, f19, f3 + FMADD f4, f12, f20, f4 + FMADD f5, f13, f21, f5 + FMADD f6, f14, f22, f6 + FMADD f7, f15, f23, f7 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + LFDUX f16, Y, INCY + FMADD f0, f8, f16, f0 + bdnz LL(160) + .align 4 + +LL(999): + FADD f0, f0, f1 + FADD f2, f2, f3 + FADD f4, f4, f5 + FADD f6, f6, f7 + + FADD f0, f0, f2 + FADD f4, f4, f6 + FADD f1, f0, f4 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/dot_cell.S b/kernel/power/dot_cell.S new file mode 100644 index 0000000..617fb13 --- /dev/null +++ b/kernel/power/dot_cell.S @@ -0,0 +1,458 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 +#define PREA r8 + +#define FZERO f0 + +#define STACKSIZE 96 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + + stw r0, 80(SP) + lfs FZERO, 80(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + fmr f1, FZERO + fmr f2, FZERO + fmr f3, FZERO + fmr f4, FZERO + fmr f5, FZERO + fmr f6, FZERO + fmr f7, FZERO + + li PREA, 16 * 20 * SIZE + + cmpwi cr0, N, 0 + ble- cr0, LL(999) + + cmpwi cr0, INCX, SIZE + bne cr0, LL(100) + cmpwi cr0, INCY, SIZE + bne cr0, LL(100) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + LFD f10, 2 * SIZE(X) + LFD f11, 3 * SIZE(X) + + LFD f16, 0 * SIZE(Y) + LFD f17, 1 * SIZE(Y) + LFD f18, 2 * SIZE(Y) + LFD f19, 3 * SIZE(Y) + + LFD f12, 4 * SIZE(X) + LFD f13, 5 * SIZE(X) + LFD f14, 6 * SIZE(X) + LFD f15, 7 * SIZE(X) + + LFD f20, 4 * SIZE(Y) + LFD f21, 5 * SIZE(Y) + LFD f22, 6 * SIZE(Y) + LFD f23, 7 * SIZE(Y) + bdz LL(20) + .align 4 + +LL(10): + FMADD f0, f8, f16, f0 + LFD f8, 8 * SIZE(X) + LFD f16, 8 * SIZE(Y) + + FMADD f1, f9, f17, f1 + LFD f9, 9 * SIZE(X) + LFD f17, 9 * SIZE(Y) + + FMADD f2, f10, f18, f2 + LFD f10, 10 * SIZE(X) + LFD f18, 10 * SIZE(Y) + + FMADD f3, f11, f19, f3 + LFD f11, 11 * SIZE(X) + LFD f19, 11 * SIZE(Y) + + FMADD f4, f12, f20, f4 + LFD f12, 12 * SIZE(X) + LFD f20, 12 * SIZE(Y) + + FMADD f5, f13, f21, f5 + LFD f13, 13 * SIZE(X) + LFD f21, 13 * SIZE(Y) + + FMADD f6, f14, f22, f6 + LFD f14, 14 * SIZE(X) + LFD f22, 14 * SIZE(Y) + + FMADD f7, f15, f23, f7 + LFD f15, 15 * SIZE(X) + LFD f23, 15 * SIZE(Y) + + FMADD f0, f8, f16, f0 + LFD f8, 16 * SIZE(X) + LFD f16, 16 * SIZE(Y) + + FMADD f1, f9, f17, f1 + LFD f9, 17 * SIZE(X) + LFD f17, 17 * SIZE(Y) + + FMADD f2, f10, f18, f2 + LFD f10, 18 * SIZE(X) + LFD f18, 18 * SIZE(Y) + + FMADD f3, f11, f19, f3 + LFD f11, 19 * SIZE(X) + LFD f19, 19 * SIZE(Y) + + FMADD f4, f12, f20, f4 + LFD f12, 20 * SIZE(X) + LFD f20, 20 * SIZE(Y) + + FMADD f5, f13, f21, f5 + LFD f13, 21 * SIZE(X) + LFD f21, 21 * SIZE(Y) + + FMADD f6, f14, f22, f6 + LFD f14, 22 * SIZE(X) + LFD f22, 22 * SIZE(Y) + + FMADD f7, f15, f23, f7 + LFD f15, 23 * SIZE(X) + LFD f23, 23 * SIZE(Y) + + dcbt X, PREA + addi X, X, 16 * SIZE + dcbt Y, PREA + addi Y, Y, 16 * SIZE + bdnz LL(10) + .align 4 + +LL(20): + FMADD f0, f8, f16, f0 + LFD f8, 8 * SIZE(X) + LFD f16, 8 * SIZE(Y) + + FMADD f1, f9, f17, f1 + LFD f9, 9 * SIZE(X) + LFD f17, 9 * SIZE(Y) + + FMADD f2, f10, f18, f2 + LFD f10, 10 * SIZE(X) + LFD f18, 10 * SIZE(Y) + + FMADD f3, f11, f19, f3 + LFD f11, 11 * SIZE(X) + LFD f19, 11 * SIZE(Y) + + FMADD f4, f12, f20, f4 + LFD f12, 12 * SIZE(X) + LFD f20, 12 * SIZE(Y) + + FMADD f5, f13, f21, f5 + LFD f13, 13 * SIZE(X) + LFD f21, 13 * SIZE(Y) + + FMADD f6, f14, f22, f6 + LFD f14, 14 * SIZE(X) + LFD f22, 14 * SIZE(Y) + + FMADD f7, f15, f23, f7 + LFD f15, 15 * SIZE(X) + LFD f23, 15 * SIZE(Y) + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f10, f18, f2 + FMADD f3, f11, f19, f3 + + FMADD f4, f12, f20, f4 + FMADD f5, f13, f21, f5 + FMADD f6, f14, f22, f6 + addi X, X, 16 * SIZE + FMADD f7, f15, f23, f7 + addi Y, Y, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f16, 0 * SIZE(Y) + addi X, X, 1 * SIZE + addi Y, Y, 1 * SIZE + + FMADD f0, f8, f16, f0 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): +#ifdef F_INTERFACE + cmpwi cr0, INCX, 0 + bge+ LL(102) + + subi r0, N, 1 + mullw r0, r0, INCX + sub X, X, r0 + .align 4 + +LL(102): + cmpwi cr0, INCY, 0 + bge+ LL(104) + + subi r0, N, 1 + mullw r0, r0, INCY + sub Y, Y, r0 + .align 4 + +LL(104): +#endif + sub X, X, INCX + sub Y, Y, INCY + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f8, X, INCX + LFDUX f16, Y, INCY + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDUX f10, X, INCX + LFDUX f18, Y, INCY + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + LFDUX f12, X, INCX + LFDUX f20, Y, INCY + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDUX f14, X, INCX + LFDUX f22, Y, INCY + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + bdz LL(120) + .align 4 + +LL(110): + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f10, f18, f2 + FMADD f3, f11, f19, f3 + + LFDUX f8, X, INCX + LFDUX f16, Y, INCY + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDUX f10, X, INCX + LFDUX f18, Y, INCY + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + FMADD f4, f12, f20, f4 + FMADD f5, f13, f21, f5 + FMADD f6, f14, f22, f6 + FMADD f7, f15, f23, f7 + + LFDUX f12, X, INCX + LFDUX f20, Y, INCY + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDUX f14, X, INCX + LFDUX f22, Y, INCY + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f10, f18, f2 + FMADD f3, f11, f19, f3 + + LFDUX f8, X, INCX + LFDUX f16, Y, INCY + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDUX f10, X, INCX + LFDUX f18, Y, INCY + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + FMADD f4, f12, f20, f4 + FMADD f5, f13, f21, f5 + FMADD f6, f14, f22, f6 + FMADD f7, f15, f23, f7 + + LFDUX f12, X, INCX + LFDUX f20, Y, INCY + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDUX f14, X, INCX + LFDUX f22, Y, INCY + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + + bdnz LL(110) + .align 4 + +LL(120): + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f10, f18, f2 + FMADD f3, f11, f19, f3 + + LFDUX f8, X, INCX + LFDUX f16, Y, INCY + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDUX f10, X, INCX + LFDUX f18, Y, INCY + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + FMADD f4, f12, f20, f4 + FMADD f5, f13, f21, f5 + FMADD f6, f14, f22, f6 + FMADD f7, f15, f23, f7 + + LFDUX f12, X, INCX + LFDUX f20, Y, INCY + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDUX f14, X, INCX + LFDUX f22, Y, INCY + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f10, f18, f2 + FMADD f3, f11, f19, f3 + FMADD f4, f12, f20, f4 + FMADD f5, f13, f21, f5 + FMADD f6, f14, f22, f6 + FMADD f7, f15, f23, f7 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + LFDUX f16, Y, INCY + FMADD f0, f8, f16, f0 + bdnz LL(160) + .align 4 + +LL(999): + FADD f0, f0, f1 + FADD f2, f2, f3 + FADD f4, f4, f5 + FADD f6, f6, f7 + + FADD f0, f0, f2 + FADD f4, f4, f6 + FADD f1, f0, f4 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/dot_hummer.S b/kernel/power/dot_hummer.S new file mode 100644 index 0000000..14a3780 --- /dev/null +++ b/kernel/power/dot_hummer.S @@ -0,0 +1,879 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 + +#define INCX2 r8 +#define INCY2 r9 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 +#define A9 f20 + +#define B1 f12 +#define B2 f13 +#define B3 f14 +#define B4 f15 +#define B5 f16 +#define B6 f17 +#define B7 f18 +#define B8 f19 +#define B9 f20 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + + stfpdux f20, SP, r10 + + li r10, 0 + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + lfpdx C1, SP, r10 # Zero clear + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + fpmr C2, C1 + + slwi INCY, INCY, BASE_SHIFT + fpmr C3, C1 + add INCY2, INCY, INCY + fpmr C4, C1 + + cmpwi cr0, N, 0 + ble LL(999) + + cmpwi cr0, INCX, SIZE + bne LL(100) + cmpwi cr0, INCY, SIZE + bne LL(100) + + +/* X is aligned, Y is aligned */ +LL(10): + andi. r0, X, 2 * SIZE - 1 + bne LL(30) + + andi. r0, Y, 2 * SIZE - 1 + bne LL(20) + + sub X, X, INCX2 + sub Y, Y, INCY2 + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + + LFPDUX A5, X, INCX2 + LFPDUX B5, Y, INCY2 + LFPDUX A6, X, INCX2 + LFPDUX B6, Y, INCY2 + + LFPDUX A7, X, INCX2 + LFPDUX B7, Y, INCY2 + LFPDUX A8, X, INCX2 + LFPDUX B8, Y, INCY2 + bdz LL(14) + .align 4 + +LL(13): + fpmadd C1, A1, B1, C1 + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + fpmadd C2, A2, B2, C2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + fpmadd C3, A3, B3, C3 + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + fpmadd C4, A4, B4, C4 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + + fpmadd C1, A5, B5, C1 + LFPDUX A5, X, INCX2 + LFPDUX B5, Y, INCY2 + fpmadd C2, A6, B6, C2 + LFPDUX A6, X, INCX2 + LFPDUX B6, Y, INCY2 + fpmadd C3, A7, B7, C3 + LFPDUX A7, X, INCX2 + LFPDUX B7, Y, INCY2 + fpmadd C4, A8, B8, C4 + LFPDUX A8, X, INCX2 + LFPDUX B8, Y, INCY2 + + bdnz LL(13) + .align 4 + +LL(14): + fpmadd C1, A1, B1, C1 + fpmadd C2, A2, B2, C2 + fpmadd C3, A3, B3, C3 + fpmadd C4, A4, B4, C4 + fpmadd C1, A5, B5, C1 + fpmadd C2, A6, B6, C2 + fpmadd C3, A7, B7, C3 + fpmadd C4, A8, B8, C4 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + + fpmadd C1, A1, B1, C1 + fpmadd C2, A2, B2, C2 + fpmadd C3, A3, B3, C3 + fpmadd C4, A4, B4, C4 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + + fpmadd C1, A1, B1, C1 + fpmadd C2, A2, B2, C2 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + + fpmadd C1, A1, B1, C1 + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + LFDUX B1, Y, INCY2 + + fmadd C1, A1, B1, C1 + b LL(999) + .align 4 + +/* X is aligned, Y is NOT aligned */ + +LL(20): + LFD B1, 0 * SIZE(Y) + sub X, X, INCX2 + sub Y, Y, INCY + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(25) + + LFPDUX A1, X, INCX2 + LFXDUX B2, Y, INCY2 + LFPDUX A2, X, INCX2 + LFXDUX B3, Y, INCY2 + + LFPDUX A3, X, INCX2 + LFXDUX B4, Y, INCY2 + LFPDUX A4, X, INCX2 + LFXDUX B5, Y, INCY2 + + LFPDUX A5, X, INCX2 + LFXDUX B6, Y, INCY2 + LFPDUX A6, X, INCX2 + LFXDUX B7, Y, INCY2 + + LFPDUX A7, X, INCX2 + fsmr B1, B2 + LFXDUX B8, Y, INCY2 + fsmr B2, B3 + LFPDUX A8, X, INCX2 + fsmr B3, B4 + bdz LL(24) + .align 4 + +LL(23): + fpmadd C1, A1, B1, C1 + LFPDUX A1, X, INCX2 + fsmr B4, B5 + LFXDUX B9, Y, INCY2 + + fpmadd C2, A2, B2, C2 + LFPDUX A2, X, INCX2 + fsmr B5, B6 + LFXDUX B2, Y, INCY2 + + fpmadd C3, A3, B3, C3 + LFXDUX B3, Y, INCY2 + fsmr B6, B7 + LFPDUX A3, X, INCX2 + + fpmadd C4, A4, B4, C4 + LFXDUX B4, Y, INCY2 + fsmr B7, B8 + LFPDUX A4, X, INCX2 + + fpmadd C1, A5, B5, C1 + LFXDUX B5, Y, INCY2 + fsmr B8, B9 + LFPDUX A5, X, INCX2 + + fpmadd C2, A6, B6, C2 + LFXDUX B6, Y, INCY2 + fpmr B1, B9 + LFPDUX A6, X, INCX2 + + fpmadd C3, A7, B7, C3 + LFXDUX B7, Y, INCY2 + fsmr B1, B2 + LFPDUX A7, X, INCX2 + + fpmadd C4, A8, B8, C4 + LFXDUX B8, Y, INCY2 + fsmr B2, B3 + LFPDUX A8, X, INCX2 + + fsmr B3, B4 + bdnz LL(23) + .align 4 + +LL(24): + LFXDUX B9, Y, INCY2 + fpmadd C1, A1, B1, C1 + fsmr B4, B5 + fpmadd C2, A2, B2, C2 + fsmr B5, B6 + fpmadd C3, A3, B3, C3 + fsmr B6, B7 + fpmadd C4, A4, B4, C4 + fsmr B7, B8 + fpmadd C1, A5, B5, C1 + fsmr B8, B9 + fpmadd C2, A6, B6, C2 + fpmr B1, B9 + fpmadd C3, A7, B7, C3 + fpmadd C4, A8, B8, C4 + .align 4 + +LL(25): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(26) + + LFPDUX A1, X, INCX2 + LFXDUX B2, Y, INCY2 + LFPDUX A2, X, INCX2 + LFXDUX B3, Y, INCY2 + LFPDUX A3, X, INCX2 + LFXDUX B4, Y, INCY2 + LFPDUX A4, X, INCX2 + LFXDUX B5, Y, INCY2 + + fsmr B1, B2 + fsmr B2, B3 + fsmr B3, B4 + fsmr B4, B5 + + fpmadd C1, A1, B1, C1 + fpmadd C2, A2, B2, C2 + fpmadd C3, A3, B3, C3 + fpmadd C4, A4, B4, C4 + fpmr B1, B5 + .align 4 + +LL(26): + andi. r0, N, 4 + beq LL(27) + + LFPDUX A1, X, INCX2 + LFXDUX B2, Y, INCY2 + LFPDUX A2, X, INCX2 + LFXDUX B3, Y, INCY2 + + fsmr B1, B2 + fsmr B2, B3 + fpmadd C1, A1, B1, C1 + fpmr B1, B3 + fpmadd C2, A2, B2, C2 + .align 4 + +LL(27): + andi. r0, N, 2 + beq LL(28) + + LFPDUX A1, X, INCX2 + LFXDUX B2, Y, INCY2 + fsmr B1, B2 + fpmadd C1, A1, B1, C1 + fpmr B1, B2 + .align 4 + +LL(28): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + fmadd C1, A1, B1, C1 + b LL(999) + .align 4 + +/* X is not aligned, Y is aligned */ +LL(30): + andi. r0, Y, 2 * SIZE - 1 + bne LL(40) + + LFD A1, 0 * SIZE(X) + sub X, X, INCX + sub Y, Y, INCY2 + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(35) + + LFXDUX A2, X, INCX2 + LFPDUX B1, Y, INCY2 + LFXDUX A3, X, INCX2 + LFPDUX B2, Y, INCY2 + + LFXDUX A4, X, INCX2 + LFPDUX B3, Y, INCY2 + LFXDUX A5, X, INCX2 + LFPDUX B4, Y, INCY2 + + LFXDUX A6, X, INCX2 + LFPDUX B5, Y, INCY2 + LFXDUX A7, X, INCX2 + LFPDUX B6, Y, INCY2 + + LFXDUX A8, X, INCX2 + fsmr A1, A2 + LFPDUX B7, Y, INCY2 + fsmr A2, A3 + LFPDUX B8, Y, INCY2 + fsmr A3, A4 + bdz LL(34) + .align 4 + +LL(33): + fpmadd C1, A1, B1, C1 + LFXDUX A9, X, INCX2 + fsmr A4, A5 + LFPDUX B1, Y, INCY2 + + fpmadd C2, A2, B2, C2 + LFXDUX A2, X, INCX2 + fsmr A5, A6 + LFPDUX B2, Y, INCY2 + + fpmadd C3, A3, B3, C3 + LFXDUX A3, X, INCX2 + fsmr A6, A7 + LFPDUX B3, Y, INCY2 + + fpmadd C4, A4, B4, C4 + LFXDUX A4, X, INCX2 + fsmr A7, A8 + LFPDUX B4, Y, INCY2 + + fpmadd C1, A5, B5, C1 + LFXDUX A5, X, INCX2 + fsmr A8, A9 + LFPDUX B5, Y, INCY2 + + fpmadd C2, A6, B6, C2 + LFXDUX A6, X, INCX2 + fpmr A1, A9 + LFPDUX B6, Y, INCY2 + + fpmadd C3, A7, B7, C3 + LFXDUX A7, X, INCX2 + fsmr A1, A2 + LFPDUX B7, Y, INCY2 + + fpmadd C4, A8, B8, C4 + LFXDUX A8, X, INCX2 + fsmr A2, A3 + LFPDUX B8, Y, INCY2 + + fsmr A3, A4 + bdnz LL(33) + .align 4 + +LL(34): + LFXDUX A9, X, INCX2 + fpmadd C1, A1, B1, C1 + fsmr A4, A5 + fpmadd C2, A2, B2, C2 + fsmr A5, A6 + fpmadd C3, A3, B3, C3 + fsmr A6, A7 + fpmadd C4, A4, B4, C4 + fsmr A7, A8 + fpmadd C1, A5, B5, C1 + fsmr A8, A9 + fpmadd C2, A6, B6, C2 + fpmr A1, A9 + fpmadd C3, A7, B7, C3 + fpmadd C4, A8, B8, C4 + .align 4 + +LL(35): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(36) + + LFXDUX A2, X, INCX2 + LFPDUX B1, Y, INCY2 + LFXDUX A3, X, INCX2 + LFPDUX B2, Y, INCY2 + LFXDUX A4, X, INCX2 + LFPDUX B3, Y, INCY2 + LFXDUX A5, X, INCX2 + LFPDUX B4, Y, INCY2 + + fsmr A1, A2 + fsmr A2, A3 + fsmr A3, A4 + fsmr A4, A5 + + fpmadd C1, A1, B1, C1 + fpmr A1, A5 + fpmadd C2, A2, B2, C2 + fpmadd C3, A3, B3, C3 + fpmadd C4, A4, B4, C4 + .align 4 + +LL(36): + andi. r0, N, 4 + beq LL(37) + + LFXDUX A2, X, INCX2 + LFPDUX B1, Y, INCY2 + LFXDUX A3, X, INCX2 + LFPDUX B2, Y, INCY2 + + fsmr A1, A2 + fsmr A2, A3 + fpmadd C1, A1, B1, C1 + fpmr A1, A3 + fpmadd C2, A2, B2, C2 + .align 4 + +LL(37): + andi. r0, N, 2 + beq LL(38) + + LFXDUX A2, X, INCX2 + LFPDUX B1, Y, INCY2 + + fsmr A1, A2 + fpmadd C1, A1, B1, C1 + fpmr A1, A2 + .align 4 + +LL(38): + andi. r0, N, 1 + beq LL(999) + + LFDUX B1, Y, INCY2 + fmadd C1, A1, B1, C1 + b LL(999) + .align 4 + +/* X is NOT aligned, Y is NOT aligned */ +LL(40): + LFD A1, 0 * SIZE(X) + LFD B1, 0 * SIZE(Y) + + sub X, X, INCX + sub Y, Y, INCY + + addi N, N, -1 + cmpwi cr0, N, 0 + fmadd C1, A1, B1, C1 + ble LL(999) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(45) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + + LFPDUX A5, X, INCX2 + LFPDUX B5, Y, INCY2 + LFPDUX A6, X, INCX2 + LFPDUX B6, Y, INCY2 + + LFPDUX A7, X, INCX2 + LFPDUX B7, Y, INCY2 + LFPDUX A8, X, INCX2 + LFPDUX B8, Y, INCY2 + bdz LL(44) + .align 4 + +LL(43): + fpmadd C1, A1, B1, C1 + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + fpmadd C2, A2, B2, C2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + fpmadd C3, A3, B3, C3 + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + fpmadd C4, A4, B4, C4 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + fpmadd C1, A5, B5, C1 + LFPDUX A5, X, INCX2 + LFPDUX B5, Y, INCY2 + fpmadd C2, A6, B6, C2 + LFPDUX A6, X, INCX2 + LFPDUX B6, Y, INCY2 + fpmadd C3, A7, B7, C3 + LFPDUX A7, X, INCX2 + LFPDUX B7, Y, INCY2 + fpmadd C4, A8, B8, C4 + LFPDUX A8, X, INCX2 + LFPDUX B8, Y, INCY2 + bdnz LL(43) + .align 4 + +LL(44): + fpmadd C1, A1, B1, C1 + fpmadd C2, A2, B2, C2 + fpmadd C3, A3, B3, C3 + fpmadd C4, A4, B4, C4 + fpmadd C1, A5, B5, C1 + fpmadd C2, A6, B6, C2 + fpmadd C3, A7, B7, C3 + fpmadd C4, A8, B8, C4 + .align 4 + +LL(45): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(46) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + + fpmadd C1, A1, B1, C1 + fpmadd C2, A2, B2, C2 + fpmadd C3, A3, B3, C3 + fpmadd C4, A4, B4, C4 + .align 4 + +LL(46): + andi. r0, N, 4 + beq LL(47) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + + fpmadd C1, A1, B1, C1 + fpmadd C2, A2, B2, C2 + .align 4 + +LL(47): + andi. r0, N, 2 + beq LL(48) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + + fpmadd C1, A1, B1, C1 + .align 4 + +LL(48): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + LFDUX B1, Y, INCY2 + + fmadd C1, A1, B1, C1 + b LL(999) + .align 4 + +LL(100): +#ifdef F_INTERFACE + cmpwi cr0, INCX, 0 + bge+ LL(101) + + subi r0, N, 1 + mullw r0, r0, INCX + sub X, X, r0 + .align 4 + +LL(101): + cmpwi cr0, INCY, 0 + bge+ LL(102) + + subi r0, N, 1 + mullw r0, r0, INCY + sub Y, Y, r0 + .align 4 + +LL(102): +#endif + sub X, X, INCX + sub Y, Y, INCY + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX + LFDUX B1, Y, INCY + LFDUX A2, X, INCX + LFDUX B2, Y, INCY + + LFDUX A3, X, INCX + LFDUX B3, Y, INCY + LFDUX A4, X, INCX + LFDUX B4, Y, INCY + + LFDUX A5, X, INCX + LFDUX B5, Y, INCY + LFDUX A6, X, INCX + LFDUX B6, Y, INCY + + LFDUX A7, X, INCX + LFDUX B7, Y, INCY + LFDUX A8, X, INCX + LFDUX B8, Y, INCY + bdz LL(104) + .align 4 + +LL(103): + fmadd C1, A1, B1, C1 + LFDUX A1, X, INCX + LFDUX B1, Y, INCY + fmadd C2, A2, B2, C2 + LFDUX A2, X, INCX + LFDUX B2, Y, INCY + + fmadd C3, A3, B3, C3 + LFDUX A3, X, INCX + LFDUX B3, Y, INCY + fmadd C4, A4, B4, C4 + LFDUX A4, X, INCX + LFDUX B4, Y, INCY + + fmadd C1, A5, B5, C1 + LFDUX A5, X, INCX + LFDUX B5, Y, INCY + fmadd C2, A6, B6, C2 + LFDUX A6, X, INCX + LFDUX B6, Y, INCY + + fmadd C3, A7, B7, C3 + LFDUX A7, X, INCX + LFDUX B7, Y, INCY + fmadd C4, A8, B8, C4 + LFDUX A8, X, INCX + LFDUX B8, Y, INCY + + bdnz LL(103) + .align 4 + +LL(104): + fmadd C1, A1, B1, C1 + fmadd C2, A2, B2, C2 + fmadd C3, A3, B3, C3 + fmadd C4, A4, B4, C4 + fmadd C1, A5, B5, C1 + fmadd C2, A6, B6, C2 + fmadd C3, A7, B7, C3 + fmadd C4, A8, B8, C4 + .align 4 + +LL(105): + andi. r0, N, 7 + beq LL(999) + + andi. r0, N, 4 + beq LL(107) + + LFDUX A1, X, INCX + LFDUX B1, Y, INCY + LFDUX A2, X, INCX + LFDUX B2, Y, INCY + + LFDUX A3, X, INCX + LFDUX B3, Y, INCY + LFDUX A4, X, INCX + LFDUX B4, Y, INCY + + fmadd C1, A1, B1, C1 + fmadd C2, A2, B2, C2 + fmadd C3, A3, B3, C3 + fmadd C4, A4, B4, C4 + .align 4 + +LL(107): + andi. r0, N, 2 + beq LL(108) + + LFDUX A1, X, INCX + LFDUX B1, Y, INCY + + LFDUX A2, X, INCX + LFDUX B2, Y, INCY + + fmadd C1, A1, B1, C1 + fmadd C2, A2, B2, C2 + .align 4 + +LL(108): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX + LFDUX B1, Y, INCY + + fmadd C1, A1, B1, C1 + .align 4 + +LL(999): + li r10, 16 + + fpadd C1, C1, C2 + fpadd C3, C3, C4 + fpadd C1, C1, C3 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + fsmtp C2, C1 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + fadd C1, C1, C2 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/dot_ppc440.S b/kernel/power/dot_ppc440.S new file mode 100644 index 0000000..b3f3efc --- /dev/null +++ b/kernel/power/dot_ppc440.S @@ -0,0 +1,301 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 +#define PRE r8 + +#define FZERO f0 + +#define STACKSIZE 96 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + + stw r0, 80(SP) + lfs FZERO, 80(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + fmr f1, FZERO + fmr f2, FZERO + fmr f3, FZERO + fmr f4, FZERO + fmr f5, FZERO + fmr f6, FZERO + fmr f7, FZERO + + li PRE, 3 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(999) + +#ifdef F_INTERFACE + cmpwi cr0, INCX, 0 + bge+ LL(102) + + subi r0, N, 1 + mullw r0, r0, INCX + sub X, X, r0 + .align 4 + +LL(102): + cmpwi cr0, INCY, 0 + bge+ LL(104) + + subi r0, N, 1 + mullw r0, r0, INCY + sub Y, Y, r0 + .align 4 + +LL(104): +#endif + sub X, X, INCX + sub Y, Y, INCY + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f8, X, INCX + LFDUX f16, Y, INCY + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDUX f10, X, INCX + LFDUX f18, Y, INCY + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + LFDUX f12, X, INCX + LFDUX f20, Y, INCY + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDUX f14, X, INCX + LFDUX f22, Y, INCY + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + bdz LL(120) + .align 4 + +LL(110): + FMADD f0, f8, f16, f0 + LFDUX f8, X, INCX + LFDUX f16, Y, INCY +#ifdef PPCG4 + dcbt X, PRE +#endif + FMADD f1, f9, f17, f1 + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + FMADD f2, f10, f18, f2 + LFDUX f10, X, INCX + LFDUX f18, Y, INCY +#ifdef PPCG4 + dcbt Y, PRE +#endif + FMADD f3, f11, f19, f3 + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + FMADD f4, f12, f20, f4 + LFDUX f12, X, INCX + LFDUX f20, Y, INCY +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + FMADD f5, f13, f21, f5 + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + FMADD f6, f14, f22, f6 + LFDUX f14, X, INCX + LFDUX f22, Y, INCY +#if defined(PPCG4) && defined(DOUBLE) + dcbt Y, PRE +#endif + FMADD f7, f15, f23, f7 + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + + FMADD f0, f8, f16, f0 + LFDUX f8, X, INCX + LFDUX f16, Y, INCY +#ifdef PPCG4 + dcbt X, PRE +#endif + FMADD f1, f9, f17, f1 + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + FMADD f2, f10, f18, f2 + LFDUX f10, X, INCX + LFDUX f18, Y, INCY +#ifdef PPCG4 + dcbt Y, PRE +#endif + FMADD f3, f11, f19, f3 + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + FMADD f4, f12, f20, f4 + LFDUX f12, X, INCX + LFDUX f20, Y, INCY +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + FMADD f5, f13, f21, f5 + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + FMADD f6, f14, f22, f6 + LFDUX f14, X, INCX + LFDUX f22, Y, INCY +#if defined(PPCG4) && defined(DOUBLE) + dcbt Y, PRE +#endif + FMADD f7, f15, f23, f7 + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + bdnz LL(110) + .align 4 + +LL(120): + FMADD f0, f8, f16, f0 + LFDUX f8, X, INCX + LFDUX f16, Y, INCY + FMADD f1, f9, f17, f1 + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + FMADD f2, f10, f18, f2 + LFDUX f10, X, INCX + LFDUX f18, Y, INCY + FMADD f3, f11, f19, f3 + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + FMADD f4, f12, f20, f4 + LFDUX f12, X, INCX + LFDUX f20, Y, INCY + FMADD f5, f13, f21, f5 + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + FMADD f6, f14, f22, f6 + LFDUX f14, X, INCX + LFDUX f22, Y, INCY + FMADD f7, f15, f23, f7 + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f10, f18, f2 + FMADD f3, f11, f19, f3 + FMADD f4, f12, f20, f4 + FMADD f5, f13, f21, f5 + FMADD f6, f14, f22, f6 + FMADD f7, f15, f23, f7 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + LFDUX f16, Y, INCY + FMADD f0, f8, f16, f0 + bdnz LL(160) + .align 4 + +LL(999): + FADD f0, f0, f1 + FADD f2, f2, f3 + FADD f4, f4, f5 + FADD f6, f6, f7 + + FADD f0, f0, f2 + FADD f4, f4, f6 + FADD f1, f0, f4 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/exfunc.S b/kernel/power/exfunc.S new file mode 100644 index 0000000..257736c --- /dev/null +++ b/kernel/power/exfunc.S @@ -0,0 +1,66 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + .machine "any" + + .globl .rpcc +.rpcc: + mftb r3 + rlinm r3, r3, 3, 0, 31 # ldc(scaling) + bcr BO_ALWAYS,CR0_LT + + .globl .blas_lock +.blas_lock: + cal r7, 1(r0) +LL(0): + l r6, 0(r3) + cmpi CR0, r6, 0 + bne LL(2) + lwarx r6, r0, r3 + cmpwi CR6, r6, 0 + bne LL(2) + stwcx. r7, r0, r3 + bne- LL(0) +LL(1): + bcr BO_ALWAYS,CR0_LT + +LL(2): + b LL(0) diff --git a/kernel/power/gemm_beta.S b/kernel/power/gemm_beta.S new file mode 100644 index 0000000..e531bde --- /dev/null +++ b/kernel/power/gemm_beta.S @@ -0,0 +1,253 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M r3 +#define N r4 +#define C r10 +#define LDC r11 +#define J r5 +#define PRE r6 +#define CO1 r7 + +#define ALPHA f31 + +#define STACKSIZE 32 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f31, 16(SP) + stw r0, 24(SP) + +#ifdef linux +#ifndef __64BIT__ + lwz LDC, 8 + STACKSIZE(SP) +#else + ld C, 112 + STACKSIZE(SP) + ld LDC, 120 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld C, 112 + STACKSIZE(SP) + ld LDC, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz C, 56 + STACKSIZE(SP) + lwz LDC, 60 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + + fmr ALPHA, f1 + lfs f0, 24(SP) + + cmpwi cr0, M, 0 + ble- LL(999) + cmpwi cr0, N, 0 + ble- LL(999) + + mr J, N + fcmpu cr7, f1, f0 + bne cr7, LL(20) + .align 4 + +LL(10): + mr CO1, C + add C, C, LDC + addi PRE, 0, 32 * SIZE + + srawi. r0, M, 4 + mtspr CTR, r0 + ble LL(15) + .align 4 + +LL(12): + STFD f0, 0 * SIZE(CO1) + STFD f0, 1 * SIZE(CO1) + STFD f0, 2 * SIZE(CO1) + STFD f0, 3 * SIZE(CO1) + STFD f0, 4 * SIZE(CO1) + STFD f0, 5 * SIZE(CO1) + STFD f0, 6 * SIZE(CO1) + STFD f0, 7 * SIZE(CO1) + STFD f0, 8 * SIZE(CO1) + STFD f0, 9 * SIZE(CO1) + STFD f0, 10 * SIZE(CO1) + STFD f0, 11 * SIZE(CO1) + STFD f0, 12 * SIZE(CO1) + STFD f0, 13 * SIZE(CO1) + STFD f0, 14 * SIZE(CO1) + STFD f0, 15 * SIZE(CO1) + + dcbst PRE, CO1 + addi CO1, CO1, 16 * SIZE + bdnz LL(12) + .align 4 + +LL(15): + andi. r0, M, 15 + mtspr CTR, r0 + beq LL(19) + .align 4 + +LL(16): + STFD f0, 0 * SIZE(CO1) + addi CO1, CO1, 1 * SIZE + bdnz LL(16) + .align 4 + +LL(19): + addic. J, J, -1 + bgt LL(10) + b LL(999) + .align 4 + +LL(20): + mr CO1, C + add C, C, LDC + addi PRE, 0, 16 * SIZE + + srawi. r0, M, 4 + mtspr CTR, r0 + ble LL(25) + .align 4 + +LL(22): + LFD f0, 0 * SIZE(CO1) + LFD f1, 1 * SIZE(CO1) + LFD f2, 2 * SIZE(CO1) + LFD f3, 3 * SIZE(CO1) + LFD f4, 4 * SIZE(CO1) + LFD f5, 5 * SIZE(CO1) + LFD f6, 6 * SIZE(CO1) + LFD f7, 7 * SIZE(CO1) + + LFD f8, 8 * SIZE(CO1) + LFD f9, 9 * SIZE(CO1) + LFD f10, 10 * SIZE(CO1) + LFD f11, 11 * SIZE(CO1) + LFD f12, 12 * SIZE(CO1) + LFD f13, 13 * SIZE(CO1) + LFD f14, 14 * SIZE(CO1) + LFD f15, 15 * SIZE(CO1) + + FMUL f0, ALPHA, f0 + FMUL f1, ALPHA, f1 + FMUL f2, ALPHA, f2 + FMUL f3, ALPHA, f3 + FMUL f4, ALPHA, f4 + FMUL f5, ALPHA, f5 + FMUL f6, ALPHA, f6 + FMUL f7, ALPHA, f7 + + FMUL f8, ALPHA, f8 + FMUL f9, ALPHA, f9 + FMUL f10, ALPHA, f10 + FMUL f11, ALPHA, f11 + FMUL f12, ALPHA, f12 + FMUL f13, ALPHA, f13 + FMUL f14, ALPHA, f14 + FMUL f15, ALPHA, f15 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + STFD f4, 4 * SIZE(CO1) + STFD f5, 5 * SIZE(CO1) + STFD f6, 6 * SIZE(CO1) + STFD f7, 7 * SIZE(CO1) + + STFD f8, 8 * SIZE(CO1) + STFD f9, 9 * SIZE(CO1) + STFD f10, 10 * SIZE(CO1) + STFD f11, 11 * SIZE(CO1) + STFD f12, 12 * SIZE(CO1) + STFD f13, 13 * SIZE(CO1) + STFD f14, 14 * SIZE(CO1) + STFD f15, 15 * SIZE(CO1) + + addi CO1, CO1, 16 * SIZE + dcbtst PRE, CO1 + bdnz LL(22) + .align 4 + +LL(25): + andi. r0, M, 15 + mtspr CTR, r0 + ble LL(29) + .align 4 + +LL(26): + LFD f0, 0 * SIZE(CO1) + FMUL f0, f0, ALPHA + STFD f0, 0 * SIZE(CO1) + addi CO1, CO1, 1 * SIZE + bdnz LL(26) + .align 4 + +LL(29): + addic. J, J, -1 + bgt LL(20) + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f31, 16(SP) + addi SP, SP, STACKSIZE + + blr + EPILOGUE diff --git a/kernel/power/gemm_kernel.S b/kernel/power/gemm_kernel.S new file mode 100644 index 0000000..2b7d1d9 --- /dev/null +++ b/kernel/power/gemm_kernel.S @@ -0,0 +1,2705 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define TEMP r18 +#define KK r19 +#define BB r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREB r30 +#define PREC r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) +#if defined(TRMMKERNEL) + std r19, 240(SP) + std r18, 248(SP) +#endif +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) +#if defined(TRMMKERNEL) + stw r19, 192(SP) + stw r18, 196(SP) +#endif +#endif + + stfd f1, ALPHA + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST +/* Normal prefetch */ +#ifdef PPC970 + li PREC, 4 * SIZE +#endif +#ifdef POWER4 + li PREC, 4 * SIZE /* is 12 best? */ +#endif +#ifdef POWER5 + li PREC, 3 * SIZE +#endif + +#else + +#ifdef linux +#ifndef __64BIT__ + mr PREA, r10 + lwz PREB, 8 + STACKSIZE(SP) + lwz PREC, 12 + STACKSIZE(SP) +#else + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 60 + STACKSIZE(SP) + lwz PREB, 64 + STACKSIZE(SP) + lwz PREC, 68 + STACKSIZE(SP) +#else + lwz PREA, 56 + STACKSIZE(SP) + lwz PREB, 60 + STACKSIZE(SP) + lwz PREC, 64 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST +#ifdef PPC970 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 1 * SIZE) + li PREB, (16 * 5 * SIZE) +#else + li PREA, (16 * 19 * SIZE) + li PREB, (16 * 8 * SIZE) +#endif +#endif +#ifdef POWER4 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 1 * SIZE) + li PREB, (16 * 1 * SIZE) +#else + li PREA, (16 * 2 * SIZE) + li PREB, (16 * 2 * SIZE) +#endif +#endif +#ifdef POWER5 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 7 * SIZE) + li PREB, (16 * 7 * SIZE) +#else + li PREA, (16 * 12 * SIZE) + li PREB, (16 * 6 * SIZE) +#endif +#endif +#endif + + srawi. J, N, 2 + ble LL(40) + .align 4 + +LL(10): + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + slwi BB, K, BASE_SHIFT + 2 + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 2 + mr AO, A + add C, CO4, LDC + ble LL(20) + .align 4 + +LL(11): +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + +#ifdef POWER5 + LFD f28, 4 * SIZE(B) + LFD f29, 5 * SIZE(B) + LFD f30, 6 * SIZE(B) + LFD f31, 7 * SIZE(B) +#endif + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, r0 + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + +#ifdef POWER5 + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) +#endif +#endif + + DCBTST(CO1, PREC) + DCBTST(CO2, PREC) + DCBTST(CO3, PREC) + DCBTST(CO4, PREC) + + dcbt B, BB + addi BB, BB, 16 * SIZE + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 4 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble LL(15) + +#else + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + +#ifdef POWER5 + LFD f28, 4 * SIZE(B) + LFD f29, 5 * SIZE(B) + LFD f30, 6 * SIZE(B) + LFD f31, 7 * SIZE(B) +#endif + + DCBTST(CO1, PREC) + DCBTST(CO2, PREC) + DCBTST(CO3, PREC) + DCBTST(CO4, PREC) + + dcbt B, BB + addi BB, BB, 16 * SIZE + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + ble LL(15) +#endif + .align 4 + +LL(12): + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + +#if defined(ALLOC_HUGETLB) && !defined(POWER5) + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) +#endif + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + +#if !defined(ALLOC_HUGETLB) && !defined(POWER5) + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) +#endif + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f24, f28, f0 + FMADD f5, f25, f29, f5 + FMADD f10, f26, f30, f10 + FMADD f15, f27, f31, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f1, f25, f28, f1 + FMADD f2, f26, f28, f2 + FMADD f3, f27, f28, f3 + FMADD f4, f24, f29, f4 + + FMADD f6, f26, f29, f6 + FMADD f7, f27, f29, f7 + FMADD f8, f24, f30, f8 + FMADD f9, f25, f30, f9 + + FMADD f11, f27, f30, f11 + FMADD f12, f24, f31, f12 + FMADD f13, f25, f31, f13 + FMADD f14, f26, f31, f14 + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + +#ifndef POWER5 + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) +#else + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) +#endif + + FMADD f0, f24, f28, f0 + FMADD f5, f25, f29, f5 + FMADD f10, f26, f30, f10 + FMADD f15, f27, f31, f15 + +#ifndef POWER5 + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) +#else + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) +#endif + + FMADD f1, f25, f28, f1 + FMADD f2, f26, f28, f2 + FMADD f3, f27, f28, f3 + FMADD f4, f24, f29, f4 + + FMADD f6, f26, f29, f6 + FMADD f7, f27, f29, f7 + FMADD f8, f24, f30, f8 + FMADD f9, f25, f30, f9 + + FMADD f11, f27, f30, f11 + FMADD f12, f24, f31, f12 + FMADD f13, f25, f31, f13 + FMADD f14, f26, f31, f14 + +#if (L2_SIZE == 1024976) && defined (ALLOC_HUGETLB) + nop + nop + nop + nop +#endif + +#ifdef POWER5 + LFD f28, 20 * SIZE(BO) + LFD f29, 21 * SIZE(BO) + LFD f30, 22 * SIZE(BO) + LFD f31, 23 * SIZE(BO) +#endif + + addi AO, AO, 16 * SIZE + addi BO, BO, 16 * SIZE + +#ifdef PPC970 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER4 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER5 +#ifndef ALLOC_HUGETLB + DCBT(BO, PREB) + DCBT(AO, PREA) +#endif +#endif + bdnz LL(12) + .align 4 + +LL(15): + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 4 +#endif + + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(18) + .align 4 + +LL(16): + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(18): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 + + FMADD f4, f4, f30, f20 + FMADD f5, f5, f30, f21 + FMADD f6, f6, f30, f22 + FMADD f7, f7, f30, f23 + + LFD f16, 0 * SIZE(CO3) + LFD f17, 1 * SIZE(CO3) + LFD f18, 2 * SIZE(CO3) + LFD f19, 3 * SIZE(CO3) + + LFD f20, 0 * SIZE(CO4) + LFD f21, 1 * SIZE(CO4) + LFD f22, 2 * SIZE(CO4) + LFD f23, 3 * SIZE(CO4) + + FMADD f8, f8, f30, f16 + FMADD f9, f9, f30, f17 + FMADD f10, f10, f30, f18 + FMADD f11, f11, f30, f19 + + FMADD f12, f12, f30, f20 + FMADD f13, f13, f30, f21 + FMADD f14, f14, f30, f22 + FMADD f15, f15, f30, f23 + +#else + + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 + + FMUL f4, f4, f30 + FMUL f5, f5, f30 + FMUL f6, f6, f30 + FMUL f7, f7, f30 + + FMUL f8, f8, f30 + FMUL f9, f9, f30 + FMUL f10, f10, f30 + FMUL f11, f11, f30 + + FMUL f12, f12, f30 + FMUL f13, f13, f30 + FMUL f14, f14, f30 + FMUL f15, f15, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -4 +#endif + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. I, M, 2 + ble LL(30) + +#if defined(TRMMKERNEL) +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(25) + .align 5 + +LL(22): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(22) + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +LL(25): + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(28) + .align 4 + +LL(26): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(28): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f4, f4, f30, f18 + FMADD f5, f5, f30, f19 + + LFD f20, 0 * SIZE(CO3) + LFD f21, 1 * SIZE(CO3) + LFD f22, 0 * SIZE(CO4) + LFD f23, 1 * SIZE(CO4) + + FMADD f8, f8, f30, f20 + FMADD f9, f9, f30, f21 + FMADD f12, f12, f30, f22 + FMADD f13, f13, f30, f23 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f4, f4, f30 + FMUL f5, f5, f30 + + FMUL f8, f8, f30 + FMUL f9, f9, f30 + FMUL f12, f12, f30 + FMUL f13, f13, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +LL(30): + andi. I, M, 1 + ble LL(39) + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(35) + .align 5 + +LL(32): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f1, f17, f24, f1 + FMADD f5, f17, f25, f5 + FMADD f9, f17, f26, f9 + FMADD f13, f17, f27, f13 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f0, f18, f20, f0 + FMADD f4, f18, f21, f4 + FMADD f8, f18, f22, f8 + FMADD f12, f18, f23, f12 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f1, f19, f24, f1 + FMADD f5, f19, f25, f5 + FMADD f9, f19, f26, f9 + FMADD f13, f19, f27, f13 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(32) + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +LL(35): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(36) + .align 4 + +LL(38): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f20, 0 * SIZE(CO3) + LFD f22, 0 * SIZE(CO4) + + FMADD f0, f0, f30, f16 + FMADD f4, f4, f30, f18 + FMADD f8, f8, f30, f20 + FMADD f12, f12, f30, f22 +#else + FMUL f0, f0, f30 + FMUL f4, f4, f30 + FMUL f8, f8, f30 + FMUL f12, f12, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + + +LL(39): +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 4 +#endif + + mr B, BO + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(40): + mr CO1, C + add CO2, C, LDC + andi. J, N, 2 + ble LL(70) + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. I, M, 2 + add C, CO2, LDC + mr AO, A + ble LL(50) + .align 4 + +LL(41): +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + + DCBTST(CO1, PREC) + DCBTST(CO2, PREC) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + DCBTST(CO1, PREC) + DCBTST(CO2, PREC) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(45) + .align 5 + +LL(42): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(42) + .align 4 + +LL(45): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ LL(48) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(48): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 + + FMADD f4, f4, f30, f20 + FMADD f5, f5, f30, f21 + FMADD f6, f6, f30, f22 + FMADD f7, f7, f30, f23 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 + + FMUL f4, f4, f30 + FMUL f5, f5, f30 + FMUL f6, f6, f30 + FMUL f7, f7, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addic. I, I, -1 + bgt+ LL(41) + .align 4 + +LL(50): + andi. I, M, 2 + ble LL(60) + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(55) + .align 5 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f22, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f17, f24, f1 + FMADD f2, f16, f25, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f19, f26, f5 + FMADD f6, f18, f27, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(52) + .align 4 + +LL(55): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ LL(58) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(56) + .align 4 + +LL(58): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) + + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 +#else + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +LL(60): + andi. I, M, 1 + ble LL(69) + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(65) + .align 5 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f17, f22, f2 + FMADD f3, f17, f23, f3 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f19, f26, f2 + FMADD f3, f19, f27, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(68) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(66) + .align 4 + +LL(68): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f18 +#else + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +LL(69): +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 2 +#endif + + mr B, BO + .align 4 + +LL(70): + mr CO1, C + andi. J, N, 1 + ble LL(999) + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. I, M, 2 + mr AO, A + ble LL(80) + .align 4 + +LL(71): +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + + DCBTST(CO1, PREC) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + DCBTST(CO1, PREC) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(75) + .align 5 + +LL(72): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f21, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f23, f0 + FMADD f1, f17, f23, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(72) + .align 4 + +LL(75): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(78) + .align 4 + +LL(76): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(76) + .align 4 + +LL(78): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0 , TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addi CO1, CO1, 4 * SIZE + addic. I, I, -1 + bgt+ LL(71) + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(90) + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + +#endif + ble LL(85) + .align 5 + +LL(82): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(82) + .align 4 + +LL(85): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(88) + .align 4 + +LL(86): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(86) + .align 4 + +LL(88): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 +#else + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + addi CO1, CO1, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0 , TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(999) + + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 3 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(95) + .align 5 + +LL(92): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(92) + .align 4 + +LL(95): + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP + +#else + + andi. r0, K, 7 + mtspr CTR, r0 + +#endif + ble+ LL(98) + .align 4 + +LL(96): + FMADD f0, f16, f20, f0 + LFD f16, 1 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + addi BO, BO, 1 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(96) + .align 4 + +LL(98): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + + FMADD f0, f0, f30, f16 +#else + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + + FMUL f0, f0, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) +#if defined(TRMMKERNEL) || defined(TRSMKERNEL) + ld r19, 240(SP) + ld r18, 248(SP) +#endif +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) +#if defined(TRMMKERNEL) || defined(TRSMKERNEL) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/gemm_kernel_altivec.S b/kernel/power/gemm_kernel_altivec.S new file mode 100644 index 0000000..6f5c362 --- /dev/null +++ b/kernel/power/gemm_kernel_altivec.S @@ -0,0 +1,2708 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 360 +#else +#define STACKSIZE 272 +#endif + +#define ALPHA 0 +#define FZERO 16 + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#endif +#endif + +#define STACK r11 + +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREB r29 +#define PREC r30 +#define VREG r31 + +#define LOAD_A lvx +#define LOAD_B lvx + +#define OFFSET_0 0 +#define OFFSET_1 r14 +#define OFFSET_2 r15 +#define OFFSET_3 r16 +#define OFFSET_4 r17 +#define OFFSET_5 r18 +#define OFFSET_6 r19 +#define OFFSET_7 r20 + +#define c01 v0 +#define c02 v1 +#define c03 v2 +#define c04 v3 +#define c05 v4 +#define c06 v5 +#define c07 v6 +#define c08 v7 +#define c09 v8 +#define c10 v9 +#define c11 v10 +#define c12 v11 +#define c13 v12 +#define c14 v13 +#define c15 v14 +#define c16 v15 + +#define a1 v16 +#define a2 v17 +#define a3 v18 +#define a4 v19 +#define a5 v20 +#define a6 v21 +#define a7 v22 +#define a8 v23 + +#define b1 v24 +#define b2 v25 +#define bp1 v26 +#define bp2 v27 + +#define C1 v16 +#define C2 v17 +#define C3 v18 +#define C4 v19 +#define C5 v20 +#define C6 v21 +#define C7 v22 +#define C8 v23 +#define C9 v24 + +#define c00 v25 + +#define PERMRSHIFT1 v26 +#define PERMRSHIFT2 v27 +#define PERMRSHIFT3 v28 +#define PERMRSHIFT4 v29 + +#define VZERO v30 +#define alpha v31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + mr STACK, SP + + li r0, 0 * 16 + stvx v20, SP, r0 + li r0, 1 * 16 + stvx v21, SP, r0 + li r0, 2 * 16 + stvx v22, SP, r0 + li r0, 3 * 16 + stvx v23, SP, r0 + li r0, 4 * 16 + stvx v24, SP, r0 + li r0, 5 * 16 + stvx v25, SP, r0 + li r0, 6 * 16 + stvx v26, SP, r0 + li r0, 7 * 16 + stvx v27, SP, r0 + li r0, 8 * 16 + stvx v28, SP, r0 + li r0, 9 * 16 + stvx v29, SP, r0 + li r0, 10 * 16 + stvx v30, SP, r0 + li r0, 11 * 16 + stvx v31, SP, r0 + +#ifdef __64BIT__ + std r31, 192(SP) + std r30, 200(SP) + std r29, 208(SP) + std r28, 216(SP) + std r27, 224(SP) + std r26, 232(SP) + std r25, 240(SP) + std r24, 248(SP) + std r23, 256(SP) + std r22, 264(SP) + std r21, 272(SP) + std r20, 280(SP) + std r19, 288(SP) + std r18, 296(SP) + std r17, 304(SP) + std r16, 312(SP) + std r15, 320(SP) + std r14, 328(SP) +#else + stw r31, 192(SP) + stw r30, 196(SP) + stw r29, 200(SP) + stw r28, 204(SP) + stw r27, 208(SP) + stw r26, 212(SP) + stw r25, 216(SP) + stw r24, 220(SP) + stw r23, 224(SP) + stw r22, 228(SP) + stw r21, 232(SP) + stw r20, 236(SP) + stw r19, 240(SP) + stw r18, 244(SP) + stw r17, 248(SP) + stw r16, 252(SP) + stw r15, 256(SP) + stw r14, 260(SP) +#endif + + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + li r0, -1 + + mfspr VREG, VRsave + mtspr VRsave, r0 + + addi SP, SP, -128 + li r0, -128 + and SP, SP, r0 + + li OFFSET_1, 4 * SIZE + li OFFSET_2, 8 * SIZE + li OFFSET_3, 12 * SIZE + li OFFSET_4, 16 * SIZE + li OFFSET_5, 20 * SIZE + li OFFSET_6, 24 * SIZE + li OFFSET_7, 28 * SIZE + + stfs f1, ALPHA + 0(SP) + stfs f1, ALPHA + 4(SP) + stfs f1, ALPHA + 8(SP) + stfs f1, ALPHA + 12(SP) + + li r29, 0 + stw r29, FZERO(SP) + + slwi LDC, LDC, BASE_SHIFT + + li PREC, (15 * SIZE) +#ifdef CELL + li PREB, (3 * 32 * SIZE) +#else + li PREB, (5 * 32 * SIZE) +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + srawi. J, N, 2 + ble LL(60) + .align 4 + +LL(01): + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + add C, CO4, LDC + + mr AO, A + srawi. I, M, 4 + ble LL(20) + .align 4 + +LL(11): + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + LOAD_A a1, OFFSET_0, AO + vxor c03, c03, c03 + LOAD_A a2, OFFSET_1, AO + vxor c04, c04, c04 + LOAD_A a3, OFFSET_2, AO + vxor c05, c05, c05 + LOAD_A a4, OFFSET_3, AO + vxor c06, c06, c06 + LOAD_A a5, OFFSET_4, AO + vxor c07, c07, c07 + nop + vxor c08, c08, c08 + + vxor c09, c09, c09 + dcbtst CO1, PREC + vxor c10, c10, c10 + dcbtst CO2, PREC + vxor c11, c11, c11 + dcbtst CO3, PREC + vxor c12, c12, c12 + dcbtst CO4, PREC + vxor c13, c13, c13 + mr BO, B + vxor c14, c14, c14 + srawi. r0, K, 2 + vxor c15, c15, c15 + mtspr CTR, r0 + vxor c16, c16, c16 + vspltw bp1, b1, 0 + ble LL(13) + .align 4 + +#define NOP1 mr r3, r3 +#define NOP2 mr r4, r4 + +LL(12): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + DCBT(A, PREA) + vmaddfp c03, a3, bp1, c03 + NOP1 + vmaddfp c04, a4, bp1, c04 + vspltw bp1, b1, 2 + + vmaddfp c05, a1, bp2, c05 + DCBT(B, PREB) + vmaddfp c06, a2, bp2, c06 + NOP2 + vmaddfp c07, a3, bp2, c07 + NOP1 + vmaddfp c08, a4, bp2, c08 + vspltw bp2, b1, 3 + + vmaddfp c09, a1, bp1, c09 + NOP1 + vmaddfp c10, a2, bp1, c10 + LOAD_B b2, OFFSET_1, BO + vmaddfp c11, a3, bp1, c11 + addi BO, BO, 8 * SIZE + vmaddfp c12, a4, bp1, c12 + vspltw bp1, b2, 0 + + vmaddfp c13, a1, bp2, c13 + NOP1 + vmaddfp c14, a2, bp2, c14 + LOAD_A a5, OFFSET_4, AO + vmaddfp c15, a3, bp2, c15 + LOAD_A a6, OFFSET_5, AO + vmaddfp c16, a4, bp2, c16 + vspltw bp2, b2, 1 + + vmaddfp c01, a5, bp1, c01 + LOAD_A a7, OFFSET_6, AO + vmaddfp c02, a6, bp1, c02 + LOAD_A a8, OFFSET_7, AO + vmaddfp c03, a7, bp1, c03 + NOP1 + vmaddfp c04, a8, bp1, c04 + NOP2 + + vmaddfp c05, a5, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a6, bp2, c06 + addi AO, AO, 32 * SIZE + vmaddfp c07, a7, bp2, c07 + LOAD_B b1, OFFSET_0, BO + vmaddfp c08, a8, bp2, c08 + vspltw bp2, b2, 3 + + vmaddfp c09, a5, bp1, c09 + NOP1 + vmaddfp c10, a6, bp1, c10 + NOP2 + vmaddfp c11, a7, bp1, c11 + NOP1 + vmaddfp c12, a8, bp1, c12 + vspltw bp1, b1, 0 + + vmaddfp c13, a5, bp2, c13 + DCBT(A, PREA) + vmaddfp c14, a6, bp2, c14 + LOAD_A a1, OFFSET_0, AO + vmaddfp c15, a7, bp2, c15 + LOAD_A a2, OFFSET_1, AO + vmaddfp c16, a8, bp2, c16 + vspltw bp2, b1, 1 + + vmaddfp c01, a1, bp1, c01 + LOAD_A a3, OFFSET_2, AO + vmaddfp c02, a2, bp1, c02 + LOAD_A a4, OFFSET_3, AO + vmaddfp c03, a3, bp1, c03 + NOP1 + vmaddfp c04, a4, bp1, c04 + vspltw bp1, b1, 2 + + vmaddfp c05, a1, bp2, c05 + NOP1 + vmaddfp c06, a2, bp2, c06 + NOP2 + vmaddfp c07, a3, bp2, c07 + NOP1 + vmaddfp c08, a4, bp2, c08 + vspltw bp2, b1, 3 + + vmaddfp c09, a1, bp1, c09 + LOAD_B b2, OFFSET_1, BO + vmaddfp c10, a2, bp1, c10 + NOP2 + vmaddfp c11, a3, bp1, c11 + NOP1 + vmaddfp c12, a4, bp1, c12 + addi BO, BO, 8 * SIZE + + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a5, OFFSET_4, AO + vmaddfp c15, a3, bp2, c15 + LOAD_A a6, OFFSET_5, AO + vmaddfp c16, a4, bp2, c16 + vspltw bp2, b2, 1 + + vmaddfp c01, a5, bp1, c01 + LOAD_A a7, OFFSET_6, AO + vmaddfp c02, a6, bp1, c02 + LOAD_A a8, OFFSET_7, AO + vmaddfp c03, a7, bp1, c03 + addi AO, AO, 32 * SIZE + vmaddfp c04, a8, bp1, c04 + NOP2 + + vmaddfp c05, a5, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a6, bp2, c06 + NOP2 + vmaddfp c07, a7, bp2, c07 + NOP1 + vmaddfp c08, a8, bp2, c08 + LOAD_B b1, OFFSET_0, BO + + vmaddfp c09, a5, bp1, c09 + vspltw bp2, b2, 3 + vmaddfp c10, a6, bp1, c10 + LOAD_A a1, OFFSET_0, AO // + vmaddfp c11, a7, bp1, c11 + LOAD_A a2, OFFSET_1, AO + vmaddfp c12, a8, bp1, c12 + NOP2 + + vmaddfp c13, a5, bp2, c13 + vspltw bp1, b1, 0 + vmaddfp c14, a6, bp2, c14 + LOAD_A a3, OFFSET_2, AO + vmaddfp c15, a7, bp2, c15 + LOAD_A a4, OFFSET_3, AO + vmaddfp c16, a8, bp2, c16 + bdnz+ LL(12) + .align 4 + +LL(13): + andi. r0, K, 2 + nop + nop + ble+ LL(15) + .align 4 + + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + NOP2 + vmaddfp c03, a3, bp1, c03 + NOP1 + vmaddfp c04, a4, bp1, c04 + NOP2 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + NOP2 + vmaddfp c07, a3, bp2, c07 + NOP1 + vmaddfp c08, a4, bp2, c08 + LOAD_B b2, OFFSET_1, BO + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + LOAD_A a5, OFFSET_4, AO + vmaddfp c11, a3, bp1, c11 + LOAD_A a6, OFFSET_5, AO + vmaddfp c12, a4, bp1, c12 + addi BO, BO, 8 * SIZE + + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a7, OFFSET_6, AO + vmaddfp c15, a3, bp2, c15 + LOAD_A a8, OFFSET_7, AO + vmaddfp c16, a4, bp2, c16 + addi AO, AO, 32 * SIZE + + vmaddfp c01, a5, bp1, c01 + vspltw bp2, b2, 1 + vmaddfp c02, a6, bp1, c02 + NOP2 + vmaddfp c03, a7, bp1, c03 + NOP1 + vmaddfp c04, a8, bp1, c04 + NOP2 + + vmaddfp c05, a5, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a6, bp2, c06 + NOP2 + vmaddfp c07, a7, bp2, c07 + NOP1 + vmaddfp c08, a8, bp2, c08 + LOAD_B b1, OFFSET_0, BO + + vmaddfp c09, a5, bp1, c09 + vspltw bp2, b2, 3 + vmaddfp c10, a6, bp1, c10 + LOAD_A a1, OFFSET_0, AO + vmaddfp c11, a7, bp1, c11 + LOAD_A a2, OFFSET_1, AO + vmaddfp c12, a8, bp1, c12 + NOP2 + + vmaddfp c13, a5, bp2, c13 + vspltw bp1, b1, 0 + vmaddfp c14, a6, bp2, c14 + LOAD_A a3, OFFSET_2, AO + vmaddfp c15, a7, bp2, c15 + LOAD_A a4, OFFSET_3, AO + vmaddfp c16, a8, bp2, c16 + .align 4 + +LL(15): + andi. r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(18) + .align 4 + + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + nop + vmaddfp c03, a3, bp1, c03 + nop + vmaddfp c04, a4, bp1, c04 + nop + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + nop + vmaddfp c07, a3, bp2, c07 + nop + vmaddfp c08, a4, bp2, c08 + nop + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + addi AO, AO, 16 * SIZE + vmaddfp c11, a3, bp1, c11 + addi BO, BO, 4 * SIZE + vmaddfp c12, a4, bp1, c12 + nop + + vmaddfp c13, a1, bp2, c13 + vmaddfp c14, a2, bp2, c14 + vmaddfp c15, a3, bp2, c15 + vmaddfp c16, a4, bp2, c16 + .align 4 + +LL(18): + lvx C1, OFFSET_0, CO1 + cmpwi cr0, LDC, 32 * SIZE + lvx C2, OFFSET_1, CO1 + lvsr PERMRSHIFT1, 0, CO1 + lvx C3, OFFSET_2, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvx C4, OFFSET_3, CO1 + lvsr PERMRSHIFT3, 0, CO3 + lvx C5, OFFSET_4, CO1 + lvsr PERMRSHIFT4, 0, CO4 + ble LL(19) + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + lvx C1, OFFSET_0, CO2 + vmaddfp c01, alpha, c01, C2 + lvx C6, OFFSET_1, CO2 + vmaddfp c02, alpha, c02, C3 + lvx C7, OFFSET_2, CO2 + vmaddfp c03, alpha, c03, C4 + lvx C8, OFFSET_3, CO2 + vmaddfp c04, alpha, c04, C5 + lvx C9, OFFSET_4, CO2 + + stvx c00, OFFSET_0, CO1 + vperm c00, VZERO, c05, PERMRSHIFT2 + stvx c01, OFFSET_1, CO1 + vperm c05, c05, c06, PERMRSHIFT2 + stvx c02, OFFSET_2, CO1 + vperm c06, c06, c07, PERMRSHIFT2 + stvx c03, OFFSET_3, CO1 + vperm c07, c07, c08, PERMRSHIFT2 + stvx c04, OFFSET_4, CO1 + vperm c08, c08, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + lvx C1, OFFSET_0, CO3 + vmaddfp c05, alpha, c05, C6 + lvx C2, OFFSET_1, CO3 + vmaddfp c06, alpha, c06, C7 + lvx C3, OFFSET_2, CO3 + vmaddfp c07, alpha, c07, C8 + lvx C4, OFFSET_3, CO3 + vmaddfp c08, alpha, c08, C9 + lvx C5, OFFSET_4, CO3 + + stvx c00, OFFSET_0, CO2 + vperm c00, VZERO, c09, PERMRSHIFT3 + stvx c05, OFFSET_1, CO2 + vperm c09, c09, c10, PERMRSHIFT3 + stvx c06, OFFSET_2, CO2 + vperm c10, c10, c11, PERMRSHIFT3 + stvx c07, OFFSET_3, CO2 + vperm c11, c11, c12, PERMRSHIFT3 + stvx c08, OFFSET_4, CO2 + vperm c12, c12, VZERO, PERMRSHIFT3 + + vmaddfp c00, alpha, c00, C1 + lvx C9, OFFSET_4, CO4 + vmaddfp c09, alpha, c09, C2 + lvx C1, OFFSET_0, CO4 + vmaddfp c10, alpha, c10, C3 + lvx C6, OFFSET_1, CO4 + vmaddfp c11, alpha, c11, C4 + lvx C7, OFFSET_2, CO4 + vmaddfp c12, alpha, c12, C5 + lvx C8, OFFSET_3, CO4 + + stvx c00, OFFSET_0, CO3 + vperm c00, VZERO, c13, PERMRSHIFT4 + stvx c09, OFFSET_1, CO3 + vperm c13, c13, c14, PERMRSHIFT4 + stvx c10, OFFSET_2, CO3 + vperm c14, c14, c15, PERMRSHIFT4 + stvx c11, OFFSET_3, CO3 + vperm c15, c15, c16, PERMRSHIFT4 + stvx c12, OFFSET_4, CO3 + vperm c16, c16, VZERO, PERMRSHIFT4 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c13, alpha, c13, C6 + vmaddfp c14, alpha, c14, C7 + vmaddfp c15, alpha, c15, C8 + vmaddfp c16, alpha, c16, C9 + + stvx c00, OFFSET_0, CO4 + stvx c13, OFFSET_1, CO4 + stvx c14, OFFSET_2, CO4 + stvx c15, OFFSET_3, CO4 + stvx c16, OFFSET_4, CO4 + + addi CO1, CO1, 16 * SIZE + addi CO2, CO2, 16 * SIZE + addi CO3, CO3, 16 * SIZE + addi CO4, CO4, 16 * SIZE + + addic. I, I, -1 + bgt+ LL(11) + b LL(20) + .align 4 + +LL(19): + lvx C6, OFFSET_1, CO2 + lvx C7, OFFSET_2, CO2 + lvx C8, OFFSET_3, CO2 + lvx C9, OFFSET_4, CO2 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + lvx C2, OFFSET_1, CO3 + vmaddfp c02, alpha, c02, C3 + lvx C3, OFFSET_2, CO3 + vmaddfp c03, alpha, c03, C4 + lvx C4, OFFSET_3, CO3 + vmaddfp c04, alpha, c04, C5 + lvx C5, OFFSET_4, CO3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + lvx C1, OFFSET_0, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, c06, PERMRSHIFT2 + vperm c06, c06, c07, PERMRSHIFT2 + vperm c07, c07, c08, PERMRSHIFT2 + vperm c08, c08, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C6 + lvx C6, OFFSET_1, CO4 + vmaddfp c06, alpha, c06, C7 + lvx C7, OFFSET_2, CO4 + vmaddfp c07, alpha, c07, C8 + lvx C8, OFFSET_3, CO4 + vmaddfp c08, alpha, c08, C9 + lvx C9, OFFSET_4, CO4 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + stvx c06, OFFSET_2, CO2 + stvx c07, OFFSET_3, CO2 + stvx c08, OFFSET_4, CO2 + + lvx C1, OFFSET_0, CO3 + + vperm c00, VZERO, c09, PERMRSHIFT3 + vperm c09, c09, c10, PERMRSHIFT3 + vperm c10, c10, c11, PERMRSHIFT3 + vperm c11, c11, c12, PERMRSHIFT3 + vperm c12, c12, VZERO, PERMRSHIFT3 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c09, alpha, c09, C2 + vmaddfp c10, alpha, c10, C3 + vmaddfp c11, alpha, c11, C4 + vmaddfp c12, alpha, c12, C5 + + stvx c00, OFFSET_0, CO3 + stvx c09, OFFSET_1, CO3 + stvx c10, OFFSET_2, CO3 + stvx c11, OFFSET_3, CO3 + stvx c12, OFFSET_4, CO3 + + lvx C1, OFFSET_0, CO4 + + vperm c00, VZERO, c13, PERMRSHIFT4 + vperm c13, c13, c14, PERMRSHIFT4 + vperm c14, c14, c15, PERMRSHIFT4 + vperm c15, c15, c16, PERMRSHIFT4 + vperm c16, c16, VZERO, PERMRSHIFT4 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c13, alpha, c13, C6 + vmaddfp c14, alpha, c14, C7 + vmaddfp c15, alpha, c15, C8 + vmaddfp c16, alpha, c16, C9 + + stvx c00, OFFSET_0, CO4 + stvx c13, OFFSET_1, CO4 + stvx c14, OFFSET_2, CO4 + stvx c15, OFFSET_3, CO4 + stvx c16, OFFSET_4, CO4 + + addi CO1, CO1, 16 * SIZE + addi CO2, CO2, 16 * SIZE + addi CO3, CO3, 16 * SIZE + addi CO4, CO4, 16 * SIZE + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. I, M, 8 + ble LL(30) + + vxor c01, c01, c01 + LOAD_A a1, OFFSET_0, AO + vxor c02, c02, c02 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c09, c09, c09 + LOAD_B b1, OFFSET_0, B + vxor c10, c10, c10 + LOAD_B b2, OFFSET_1, B + vxor c13, c13, c13 + vxor c14, c14, c14 + mr BO, B + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(25) + .align 4 + +LL(22): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + addi AO, AO, 16 * SIZE + vmaddfp c02, a2, bp1, c02 + addi BO, BO, 8 * SIZE + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + LOAD_B b1, OFFSET_0, BO + vmaddfp c10, a2, bp1, c10 + + vmaddfp c13, a1, bp2, c13 + LOAD_A a1, OFFSET_0, AO + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a2, OFFSET_1, AO + + vmaddfp c01, a3, bp1, c01 + vspltw bp2, b2, 1 + vmaddfp c02, a4, bp1, c02 + + vmaddfp c05, a3, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a4, bp2, c06 + + vmaddfp c09, a3, bp1, c09 + vspltw bp2, b2, 3 + LOAD_B b2, OFFSET_1, BO + vmaddfp c10, a4, bp1, c10 + + vmaddfp c13, a3, bp2, c13 + LOAD_A a3, OFFSET_2, AO + vmaddfp c14, a4, bp2, c14 + LOAD_A a4, OFFSET_3, AO + vspltw bp1, b1, 0 + bdnz LL(22) + .align 4 + +LL(25): + andi. r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(28) + .align 4 + +LL(26): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + nop + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + nop + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + addi AO, AO, 8 * SIZE + + vmaddfp c13, a1, bp2, c13 + addi BO, BO, 4 * SIZE + vmaddfp c14, a2, bp2, c14 + nop + .align 4 + +LL(28): + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, c06, PERMRSHIFT2 + vperm c06, c06, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + vmaddfp c06, alpha, c06, C3 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + stvx c06, OFFSET_2, CO2 + + lvx C1, OFFSET_0, CO3 + lvx C2, OFFSET_1, CO3 + lvx C3, OFFSET_2, CO3 + + vperm c00, VZERO, c09, PERMRSHIFT3 + vperm c09, c09, c10, PERMRSHIFT3 + vperm c10, c10, VZERO, PERMRSHIFT3 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c09, alpha, c09, C2 + vmaddfp c10, alpha, c10, C3 + + stvx c00, OFFSET_0, CO3 + stvx c09, OFFSET_1, CO3 + stvx c10, OFFSET_2, CO3 + + lvx C1, OFFSET_0, CO4 + lvx C2, OFFSET_1, CO4 + lvx C3, OFFSET_2, CO4 + + vperm c00, VZERO, c13, PERMRSHIFT4 + vperm c13, c13, c14, PERMRSHIFT4 + vperm c14, c14, VZERO, PERMRSHIFT4 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c13, alpha, c13, C2 + vmaddfp c14, alpha, c14, C3 + + stvx c00, OFFSET_0, CO4 + stvx c13, OFFSET_1, CO4 + stvx c14, OFFSET_2, CO4 + + addi CO1, CO1, 8 * SIZE + addi CO2, CO2, 8 * SIZE + addi CO3, CO3, 8 * SIZE + addi CO4, CO4, 8 * SIZE + .align 4 + +LL(30): + andi. I, M, 4 + ble LL(40) + + vxor c01, c01, c01 + LOAD_A a1, OFFSET_0, AO + vxor c02, c02, c02 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_B b1, OFFSET_0, B + vxor c06, c06, c06 + LOAD_B b2, OFFSET_1, B + vxor c09, c09, c09 + vxor c10, c10, c10 + vxor c13, c13, c13 + vxor c14, c14, c14 + + vspltw bp1, b1, 0 + mr BO, B + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(35) + .align 4 + +LL(32): + vmaddfp c01, a1, bp1, c01 + addi AO, AO, 8 * SIZE + vspltw bp2, b1, 1 + vmaddfp c05, a1, bp2, c05 + addi BO, BO, 8 * SIZE + vspltw bp1, b1, 2 + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c13, a1, bp2, c13 + LOAD_A a1, OFFSET_0, AO + vspltw bp1, b2, 0 + LOAD_B b1, OFFSET_0, BO + + vmaddfp c02, a2, bp1, c02 + vspltw bp2, b2, 1 + vmaddfp c06, a2, bp2, c06 + vspltw bp1, b2, 2 + vmaddfp c10, a2, bp1, c10 + vspltw bp2, b2, 3 + LOAD_B b2, OFFSET_1, BO + vmaddfp c14, a2, bp2, c14 + LOAD_A a2, OFFSET_1, AO + + vspltw bp1, b1, 0 + bdnz LL(32) + .align 4 + +LL(35): + andi. r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(38) + .align 4 + +LL(36): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c13, a1, bp2, c13 + addi AO, AO, 4 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(38): + vaddfp c01, c01, c02 + vaddfp c05, c05, c06 + vaddfp c09, c09, c10 + vaddfp c13, c13, c14 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + + lvx C1, OFFSET_0, CO3 + lvx C2, OFFSET_1, CO3 + + vperm c00, VZERO, c09, PERMRSHIFT3 + vperm c09, c09, VZERO, PERMRSHIFT3 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c09, alpha, c09, C2 + + stvx c00, OFFSET_0, CO3 + stvx c09, OFFSET_1, CO3 + + lvx C1, OFFSET_0, CO4 + lvx C2, OFFSET_1, CO4 + + vperm c00, VZERO, c13, PERMRSHIFT4 + vperm c13, c13, VZERO, PERMRSHIFT4 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c13, alpha, c13, C2 + + stvx c00, OFFSET_0, CO4 + stvx c13, OFFSET_1, CO4 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE + .align 4 + +LL(40): + andi. I, M, 2 + ble LL(50) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + LFD f12, 2 * SIZE(B) + LFD f13, 3 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(45) + .align 4 + +LL(42): + FMADD f0, f8, f10, f0 + FMADD f2, f8, f11, f2 + FMADD f4, f8, f12, f4 + FMADD f6, f8, f13, f6 + + FMADD f1, f9, f10, f1 + FMADD f3, f9, f11, f3 + FMADD f5, f9, f12, f5 + FMADD f7, f9, f13, f7 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + FMADD f0, f8, f10, f0 + FMADD f2, f8, f11, f2 + FMADD f4, f8, f12, f4 + FMADD f6, f8, f13, f6 + + FMADD f1, f9, f10, f1 + FMADD f3, f9, f11, f3 + FMADD f5, f9, f12, f5 + FMADD f7, f9, f13, f7 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + + LFD f10, 8 * SIZE(BO) + LFD f11, 9 * SIZE(BO) + LFD f12, 10 * SIZE(BO) + LFD f13, 11 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + andi. r0, K, 1 + ble LL(48) + .align 4 + +LL(46): + FMADD f0, f8, f10, f0 + FMADD f2, f8, f11, f2 + FMADD f4, f8, f12, f4 + FMADD f6, f8, f13, f6 + + FMADD f1, f9, f10, f1 + FMADD f3, f9, f11, f3 + FMADD f5, f9, f12, f5 + FMADD f7, f9, f13, f7 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(48): + lfs f13, ALPHA(SP) + + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + LFD f10, 0 * SIZE(CO2) + LFD f11, 1 * SIZE(CO2) + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + FMADD f2, f2, f13, f10 + FMADD f3, f3, f13, f11 + + LFD f8, 0 * SIZE(CO3) + LFD f9, 1 * SIZE(CO3) + LFD f10, 0 * SIZE(CO4) + LFD f11, 1 * SIZE(CO4) + + FMADD f4, f4, f13, f8 + FMADD f5, f5, f13, f9 + FMADD f6, f6, f13, f10 + FMADD f7, f7, f13, f11 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + STFD f4, 0 * SIZE(CO3) + STFD f5, 1 * SIZE(CO3) + STFD f6, 0 * SIZE(CO4) + STFD f7, 1 * SIZE(CO4) + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE + .align 4 + +LL(50): + andi. I, M, 1 + ble LL(59) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + LFD f12, 2 * SIZE(B) + LFD f13, 3 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(55) + .align 4 + +LL(52): + FMADD f0, f8, f10, f0 + FMADD f1, f8, f11, f1 + FMADD f2, f8, f12, f2 + FMADD f3, f8, f13, f3 + + LFD f8, 2 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + FMADD f0, f9, f10, f0 + FMADD f1, f9, f11, f1 + FMADD f2, f9, f12, f2 + FMADD f3, f9, f13, f3 + + LFD f9, 3 * SIZE(AO) + + LFD f10, 8 * SIZE(BO) + LFD f11, 9 * SIZE(BO) + LFD f12, 10 * SIZE(BO) + LFD f13, 11 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(52) + .align 4 + +LL(55): + andi. r0, K, 1 + ble LL(58) + .align 4 + +LL(56): + FMADD f0, f8, f10, f0 + FMADD f1, f8, f11, f1 + FMADD f2, f8, f12, f2 + FMADD f3, f8, f13, f3 + + LFD f8, 2 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 1 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(58): + lfs f13, ALPHA(SP) + + LFD f8, 0 * SIZE(CO1) + LFD f9, 0 * SIZE(CO2) + LFD f10, 0 * SIZE(CO3) + LFD f11, 0 * SIZE(CO4) + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + FMADD f2, f2, f13, f10 + FMADD f3, f3, f13, f11 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + STFD f2, 0 * SIZE(CO3) + STFD f3, 0 * SIZE(CO4) + .align 4 + +LL(59): + mr B, BO + + addic. J, J, -1 + bgt LL(01) + .align 4 + +LL(60): + andi. r0, N, 2 + ble LL(120) + + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + + mr AO, A + srawi. I, M, 4 + ble LL(80) + .align 4 + +LL(71): + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + vxor c03, c03, c03 + LOAD_A a1, OFFSET_0, AO + vxor c04, c04, c04 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c07, c07, c07 + vxor c08, c08, c08 + + mr BO, B + dcbtst CO1, PREC + dcbtst CO2, PREC + + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(75) + .align 4 + +LL(72): + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + + vmaddfp c01, a5, bp1, c01 + vspltw bp2, b1, 3 + vmaddfp c02, a6, bp1, c02 + vmaddfp c03, a7, bp1, c03 + vmaddfp c04, a8, bp1, c04 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c05, a5, bp2, c05 + vmaddfp c06, a6, bp2, c06 + vmaddfp c07, a7, bp2, c07 + vmaddfp c08, a8, bp2, c08 + + addi AO, AO, 32 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + bdnz LL(72) + .align 4 + +LL(75): + andi. r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(78) + .align 4 + +LL(76): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + addi AO, AO, 16 * SIZE + vmaddfp c03, a3, bp1, c03 + addi BO, BO, 2 * SIZE + vmaddfp c04, a4, bp1, c04 + nop + + vmaddfp c05, a1, bp2, c05 + vmaddfp c06, a2, bp2, c06 + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + .align 4 + +LL(78): + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + lvx C4, OFFSET_3, CO1 + lvx C5, OFFSET_4, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + vmaddfp c03, alpha, c03, C4 + vmaddfp c04, alpha, c04, C5 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + lvx C4, OFFSET_3, CO2 + lvx C5, OFFSET_4, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, c06, PERMRSHIFT2 + vperm c06, c06, c07, PERMRSHIFT2 + vperm c07, c07, c08, PERMRSHIFT2 + vperm c08, c08, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + vmaddfp c06, alpha, c06, C3 + vmaddfp c07, alpha, c07, C4 + vmaddfp c08, alpha, c08, C5 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + stvx c06, OFFSET_2, CO2 + stvx c07, OFFSET_3, CO2 + stvx c08, OFFSET_4, CO2 + + addi CO1, CO1, 16 * SIZE + addi CO2, CO2, 16 * SIZE + addic. I, I, -1 + bgt+ LL(71) + .align 4 + +LL(80): + andi. I, M, 8 + ble LL(90) + + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + vxor c03, c03, c03 + LOAD_A a1, OFFSET_0, AO + vxor c04, c04, c04 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c07, c07, c07 + vxor c08, c08, c08 + + mr BO, B + + vspltw bp1, b1, 0 + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(85) + .align 4 + +LL(82): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + + vmaddfp c03, a3, bp1, c03 + vspltw bp2, b1, 3 + vmaddfp c04, a4, bp1, c04 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + bdnz LL(82) + .align 4 + +LL(85): + andi. r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(88) + .align 4 + +LL(86): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + addi AO, AO, 8 * SIZE + vmaddfp c05, a1, bp2, c05 + addi BO, BO, 2 * SIZE + vmaddfp c06, a2, bp2, c06 + .align 4 + +LL(88): + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + vaddfp c01, c01, c03 + vaddfp c02, c02, c04 + vaddfp c05, c05, c07 + vaddfp c06, c06, c08 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, c06, PERMRSHIFT2 + vperm c06, c06, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + vmaddfp c06, alpha, c06, C3 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + stvx c06, OFFSET_2, CO2 + + addi CO1, CO1, 8 * SIZE + addi CO2, CO2, 8 * SIZE + .align 4 + +LL(90): + andi. I, M, 4 + ble LL(100) + + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + vxor c06, c06, c06 + + mr BO, B + + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(95) + .align 4 + +LL(92): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + + vmaddfp c02, a2, bp1, c02 + vspltw bp2, b1, 3 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c06, a2, bp2, c06 + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + bdnz LL(92) + .align 4 + +LL(95): + andi. r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(98) + .align 4 + +LL(96): + vspltw bp2, b1, 1 + vmaddfp c01, a1, bp1, c01 + vmaddfp c05, a1, bp2, c05 + addi AO, AO, 4 * SIZE + addi BO, BO, 2 * SIZE + .align 4 + +LL(98): + vaddfp c01, c01, c02 + vaddfp c05, c05, c06 + vaddfp c09, c09, c10 + vaddfp c13, c13, c14 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + .align 4 + +LL(100): + andi. I, M, 2 + ble LL(110) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + LFD f12, 2 * SIZE(B) + LFD f13, 3 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(105) + .align 4 + +LL(102): + FMADD f0, f8, f10, f0 + FMADD f1, f9, f10, f1 + FMADD f2, f8, f11, f2 + FMADD f3, f9, f11, f3 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + FMADD f4, f8, f12, f4 + FMADD f5, f9, f12, f5 + FMADD f6, f8, f13, f6 + FMADD f7, f9, f13, f7 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(102) + .align 4 + +LL(105): + andi. r0, K, 1 + lfs f13, ALPHA(SP) + ble LL(108) + .align 4 + +LL(106): + FMADD f0, f8, f10, f0 + FMADD f1, f9, f10, f1 + FMADD f2, f8, f11, f2 + FMADD f3, f9, f11, f3 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 2 * SIZE(BO) + LFD f11, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + .align 4 + +LL(108): + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + LFD f10, 0 * SIZE(CO2) + LFD f11, 1 * SIZE(CO2) + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + FMADD f2, f2, f13, f10 + FMADD f3, f3, f13, f11 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + .align 4 + +LL(110): + andi. I, M, 1 + ble LL(119) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + LFD f12, 2 * SIZE(B) + LFD f13, 3 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(115) + .align 4 + +LL(112): + FMADD f0, f8, f10, f0 + FMADD f1, f8, f11, f1 + FMADD f2, f9, f12, f2 + FMADD f3, f9, f13, f3 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(112) + .align 4 + +LL(115): + andi. r0, K, 1 + lfs f13, ALPHA(SP) + ble LL(118) + .align 4 + +LL(116): + FMADD f0, f8, f10, f0 + FMADD f1, f8, f11, f1 + + LFD f8, 1 * SIZE(AO) + + LFD f10, 2 * SIZE(BO) + LFD f11, 3 * SIZE(BO) + + addi AO, AO, 1 * SIZE + addi BO, BO, 2 * SIZE + .align 4 + +LL(118): + LFD f8, 0 * SIZE(CO1) + LFD f9, 0 * SIZE(CO2) + + FADD f0, f0, f2 + FADD f1, f1, f3 + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + .align 4 + +LL(119): + mr B, BO + .align 4 + +LL(120): + andi. r0, N, 1 + ble LL(999) + + mr CO1, C + mr AO, A + srawi. I, M, 4 + ble LL(140) + .align 4 + +LL(130): + vxor c01, c01, c01 + vxor c02, c02, c02 + vxor c03, c03, c03 + vxor c04, c04, c04 + + mr BO, B + + dcbtst CO1, PREC + + mr J, K + + andi. r0, B, 15 + ble+ LL(131) + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + LOAD_B b1, OFFSET_0, BO + vspltw bp1, b1, 2 + vspltw bp2, b1, 3 + + addi AO, AO, 16 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(138) + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + addi AO, AO, 16 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp2, c01 + vmaddfp c02, a2, bp2, c02 + vmaddfp c03, a3, bp2, c03 + vmaddfp c04, a4, bp2, c04 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(138) + .align 4 + + +LL(131): + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + LOAD_B b1, OFFSET_0, BO + + srawi. r0, J, 2 + mtspr CTR, r0 + ble LL(135) + .align 4 + +LL(133): + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + vspltw bp2, b1, 1 + vmaddfp c01, a5, bp2, c01 + vmaddfp c02, a6, bp2, c02 + vmaddfp c03, a7, bp2, c03 + vmaddfp c04, a8, bp2, c04 + + addi AO, AO, 32 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + vspltw bp1, b1, 2 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + vspltw bp2, b1, 3 + vmaddfp c01, a5, bp2, c01 + vmaddfp c02, a6, bp2, c02 + vmaddfp c03, a7, bp2, c03 + vmaddfp c04, a8, bp2, c04 + + addi AO, AO, 32 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + LOAD_B b1, OFFSET_0, BO + + bdnz LL(133) + .align 4 + +LL(135): + andi. r0, J, 3 + ble+ LL(138) + + cmpwi cr0, r0, 3 + bne LL(136) + + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + addi AO, AO, 16 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + vspltw bp2, b1, 1 + vmaddfp c01, a1, bp2, c01 + vmaddfp c02, a2, bp2, c02 + vmaddfp c03, a3, bp2, c03 + vmaddfp c04, a4, bp2, c04 + + addi AO, AO, 16 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + vspltw bp1, b1, 2 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + addi AO, AO, 16 * SIZE + addi BO, BO, 3 * SIZE + b LL(138) + .align 4 + +LL(136): + cmpwi cr0, r0, 2 + bne LL(137) + + vspltw bp1, b1, 0 + vspltw bp2, b1, 1 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + LOAD_A a1, OFFSET_4, AO + LOAD_A a2, OFFSET_5, AO + LOAD_A a3, OFFSET_6, AO + LOAD_A a4, OFFSET_7, AO + + vmaddfp c01, a1, bp2, c01 + vmaddfp c02, a2, bp2, c02 + vmaddfp c03, a3, bp2, c03 + vmaddfp c04, a4, bp2, c04 + + addi AO, AO, 32 * SIZE + addi BO, BO, 2 * SIZE + b LL(138) + .align 4 + +LL(137): + cmpwi cr0, r0, 1 + bne LL(138) + + vspltw bp1, b1, 0 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + addi AO, AO, 16 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(138): + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + lvx C4, OFFSET_3, CO1 + lvx C5, OFFSET_4, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + vmaddfp c03, alpha, c03, C4 + vmaddfp c04, alpha, c04, C5 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + addi CO1, CO1, 16 * SIZE + addic. I, I, -1 + bgt+ LL(130) + .align 4 + +LL(140): + andi. I, M, 8 + ble LL(150) + + vxor c01, c01, c01 + vxor c02, c02, c02 + + mr BO, B + + mr J, K + + andi. r0, B, 15 + ble+ LL(141) + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_B b1, OFFSET_0, BO + vspltw bp1, b1, 2 + vspltw bp2, b1, 3 + + addi AO, AO, 8 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(148) + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + + addi AO, AO, 8 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp2, c01 + vmaddfp c02, a2, bp2, c02 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(148) + .align 4 + + +LL(141): + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + LOAD_B b1, OFFSET_0, BO + + srawi. r0, J, 2 + mtspr CTR, r0 + ble LL(145) + .align 4 + +LL(143): + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + vspltw bp2, b1, 1 + vmaddfp c01, a3, bp2, c01 + vmaddfp c02, a4, bp2, c02 + + vspltw bp1, b1, 2 + vmaddfp c01, a5, bp1, c01 + vmaddfp c02, a6, bp1, c02 + + vspltw bp2, b1, 3 + vmaddfp c01, a7, bp2, c01 + vmaddfp c02, a8, bp2, c02 + + addi AO, AO, 32 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + LOAD_B b1, OFFSET_0, BO + + bdnz LL(143) + .align 4 + +LL(145): + andi. r0, J, 3 + ble+ LL(148) + + cmpwi cr0, r0, 3 + bne LL(146) + + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + vspltw bp2, b1, 1 + vmaddfp c01, a3, bp2, c01 + vmaddfp c02, a4, bp2, c02 + + LOAD_A a1, OFFSET_4, AO + LOAD_A a2, OFFSET_5, AO + + vspltw bp1, b1, 2 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + + addi AO, AO, 24 * SIZE + addi BO, BO, 3 * SIZE + b LL(148) + .align 4 + +LL(146): + cmpwi cr0, r0, 2 + bne LL(147) + + vspltw bp1, b1, 0 + vspltw bp2, b1, 1 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + vmaddfp c01, a3, bp2, c01 + vmaddfp c02, a4, bp2, c02 + + addi AO, AO, 16 * SIZE + addi BO, BO, 2 * SIZE + b LL(148) + .align 4 + +LL(147): + cmpwi cr0, r0, 1 + bne LL(148) + + vspltw bp1, b1, 0 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + addi AO, AO, 8 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(148): + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + addi CO1, CO1, 8 * SIZE + .align 4 + +LL(150): + andi. I, M, 4 + ble LL(160) + + vxor c01, c01, c01 + + mr BO, B + + mr J, K + + andi. r0, B, 15 + ble+ LL(151) + + LOAD_A a1, OFFSET_0, AO + LOAD_B b1, OFFSET_0, BO + vspltw bp1, b1, 2 + vspltw bp2, b1, 3 + + addi AO, AO, 4 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp1, c01 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(158) + + LOAD_A a1, OFFSET_0, AO + addi AO, AO, 4 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp2, c01 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(158) + .align 4 + + +LL(151): + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + LOAD_B b1, OFFSET_0, BO + + srawi. r0, J, 2 + mtspr CTR, r0 + ble LL(155) + .align 4 + +LL(153): + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c01, a2, bp2, c01 + vspltw bp1, b1, 2 + vmaddfp c01, a3, bp1, c01 + vspltw bp2, b1, 3 + vmaddfp c01, a4, bp2, c01 + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + LOAD_B b1, OFFSET_0, BO + + bdnz LL(153) + .align 4 + +LL(155): + andi. r0, J, 3 + ble+ LL(158) + + cmpwi cr0, r0, 3 + bne LL(156) + + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c01, a2, bp2, c01 + vspltw bp1, b1, 2 + vmaddfp c01, a3, bp1, c01 + + addi AO, AO, 12 * SIZE + addi BO, BO, 3 * SIZE + b LL(158) + .align 4 + +LL(156): + cmpwi cr0, r0, 2 + bne LL(157) + + vspltw bp1, b1, 0 + vspltw bp2, b1, 1 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c01, a2, bp2, c01 + + addi AO, AO, 8 * SIZE + addi BO, BO, 2 * SIZE + b LL(158) + .align 4 + +LL(157): + cmpwi cr0, r0, 1 + bne LL(158) + + vspltw bp1, b1, 0 + + vmaddfp c01, a1, bp1, c01 + + addi AO, AO, 4 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(158): + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + addi CO1, CO1, 4 * SIZE + .align 4 + +LL(160): + andi. I, M, 2 + ble LL(170) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + LFD f10, 2 * SIZE(AO) + LFD f11, 3 * SIZE(AO) + + LFD f12, 0 * SIZE(B) + LFD f13, 1 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(165) + .align 4 + +LL(162): + FMADD f0, f8, f12, f0 + FMADD f1, f9, f12, f1 + FMADD f2, f10, f13, f2 + FMADD f3, f11, f13, f3 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + LFD f10, 6 * SIZE(AO) + LFD f11, 7 * SIZE(AO) + + LFD f12, 2 * SIZE(BO) + LFD f13, 3 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 2 * SIZE + bdnz LL(162) + .align 4 + +LL(165): + andi. r0, K, 1 + lfs f13, ALPHA(SP) + ble LL(168) + .align 4 + +LL(166): + FMADD f0, f8, f12, f0 + FMADD f1, f9, f12, f1 + + addi AO, AO, 2 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(168): + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + + FADD f0, f0, f2 + FADD f1, f1, f3 + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + addi CO1, CO1, 2 * SIZE + .align 4 + +LL(170): + andi. I, M, 1 + ble LL(999) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(175) + .align 4 + +LL(172): + FMADD f0, f8, f10, f0 + FMADD f1, f9, f11, f1 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + LFD f10, 2 * SIZE(BO) + LFD f11, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + bdnz LL(172) + .align 4 + +LL(175): + andi. r0, K, 1 + lfs f13, ALPHA(SP) + ble LL(178) + .align 4 + +LL(176): + FMADD f0, f8, f10, f0 + + addi AO, AO, 1 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(178): + LFD f8, 0 * SIZE(CO1) + + FADD f0, f0, f1 + + FMADD f0, f0, f13, f8 + + STFD f0, 0 * SIZE(CO1) + .align 4 + +LL(999): + mr SP, STACK + + li r0, 0 * 16 + lvx v20, SP, r0 + li r0, 1 * 16 + lvx v21, SP, r0 + li r0, 2 * 16 + lvx v22, SP, r0 + li r0, 3 * 16 + lvx v23, SP, r0 + li r0, 4 * 16 + lvx v24, SP, r0 + li r0, 5 * 16 + lvx v25, SP, r0 + li r0, 6 * 16 + lvx v26, SP, r0 + li r0, 7 * 16 + lvx v27, SP, r0 + li r0, 8 * 16 + lvx v28, SP, r0 + li r0, 9 * 16 + lvx v29, SP, r0 + li r0, 10 * 16 + lvx v30, SP, r0 + li r0, 11 * 16 + lvx v31, SP, r0 + + mtspr VRsave, VREG + +#ifdef __64BIT__ + ld r31, 192(SP) + ld r30, 200(SP) + ld r29, 208(SP) + ld r28, 216(SP) + ld r27, 224(SP) + ld r26, 232(SP) + ld r25, 240(SP) + ld r24, 248(SP) + ld r23, 256(SP) + ld r22, 264(SP) + ld r21, 272(SP) + ld r20, 280(SP) + ld r19, 288(SP) + ld r18, 296(SP) + ld r17, 304(SP) + ld r16, 312(SP) + ld r15, 320(SP) + ld r14, 328(SP) +#else + lwz r31, 192(SP) + lwz r30, 196(SP) + lwz r29, 200(SP) + lwz r28, 204(SP) + lwz r27, 208(SP) + lwz r26, 212(SP) + lwz r25, 216(SP) + lwz r24, 220(SP) + lwz r23, 224(SP) + lwz r22, 228(SP) + lwz r21, 232(SP) + lwz r20, 236(SP) + lwz r19, 240(SP) + lwz r18, 244(SP) + lwz r17, 248(SP) + lwz r16, 252(SP) + lwz r15, 256(SP) + lwz r14, 260(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/gemm_kernel_altivec_cell.S b/kernel/power/gemm_kernel_altivec_cell.S new file mode 100644 index 0000000..010ed39 --- /dev/null +++ b/kernel/power/gemm_kernel_altivec_cell.S @@ -0,0 +1,2711 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 360 +#else +#define STACKSIZE 272 +#endif + +#define ALPHA 0 +#define FZERO 16 + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#endif +#endif + +#define STACK r11 + +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREB r29 +#define PREC r30 +#define VREG r31 + +#define LOAD_A lvx +#define LOAD_B lvx + +#define OFFSET_0 0 +#define OFFSET_1 r14 +#define OFFSET_2 r15 +#define OFFSET_3 r16 +#define OFFSET_4 r17 +#define OFFSET_5 r18 +#define OFFSET_6 r19 +#define OFFSET_7 r20 + +#define c01 v0 +#define c02 v1 +#define c03 v2 +#define c04 v3 +#define c05 v4 +#define c06 v5 +#define c07 v6 +#define c08 v7 +#define c09 v8 +#define c10 v9 +#define c11 v10 +#define c12 v11 +#define c13 v12 +#define c14 v13 +#define c15 v14 +#define c16 v15 + +#define a1 v16 +#define a2 v17 +#define a3 v18 +#define a4 v19 +#define a5 v20 +#define a6 v21 +#define a7 v22 +#define a8 v23 + +#define b1 v24 +#define b2 v25 +#define bp1 v26 +#define bp2 v27 + +#define C1 v16 +#define C2 v17 +#define C3 v18 +#define C4 v19 +#define C5 v20 +#define C6 v21 +#define C7 v22 +#define C8 v23 +#define C9 v24 + +#define c00 v25 + +#define PERMRSHIFT1 v26 +#define PERMRSHIFT2 v27 +#define PERMRSHIFT3 v28 +#define PERMRSHIFT4 v29 + +#define VZERO v30 +#define alpha v31 + +#ifndef NEEDPARAM + +#ifndef DOUBLE +#include "../sparam.h" +#else +#include "../dparam.h" +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + mr STACK, SP + + li r0, 0 * 16 + stvx v20, SP, r0 + li r0, 1 * 16 + stvx v21, SP, r0 + li r0, 2 * 16 + stvx v22, SP, r0 + li r0, 3 * 16 + stvx v23, SP, r0 + li r0, 4 * 16 + stvx v24, SP, r0 + li r0, 5 * 16 + stvx v25, SP, r0 + li r0, 6 * 16 + stvx v26, SP, r0 + li r0, 7 * 16 + stvx v27, SP, r0 + li r0, 8 * 16 + stvx v28, SP, r0 + li r0, 9 * 16 + stvx v29, SP, r0 + li r0, 10 * 16 + stvx v30, SP, r0 + li r0, 11 * 16 + stvx v31, SP, r0 + +#ifdef __64BIT__ + std r31, 192(SP) + std r30, 200(SP) + std r29, 208(SP) + std r28, 216(SP) + std r27, 224(SP) + std r26, 232(SP) + std r25, 240(SP) + std r24, 248(SP) + std r23, 256(SP) + std r22, 264(SP) + std r21, 272(SP) + std r20, 280(SP) + std r19, 288(SP) + std r18, 296(SP) + std r17, 304(SP) + std r16, 312(SP) + std r15, 320(SP) + std r14, 328(SP) +#else + stw r31, 192(SP) + stw r30, 196(SP) + stw r29, 200(SP) + stw r28, 204(SP) + stw r27, 208(SP) + stw r26, 212(SP) + stw r25, 216(SP) + stw r24, 220(SP) + stw r23, 224(SP) + stw r22, 228(SP) + stw r21, 232(SP) + stw r20, 236(SP) + stw r19, 240(SP) + stw r18, 244(SP) + stw r17, 248(SP) + stw r16, 252(SP) + stw r15, 256(SP) + stw r14, 260(SP) +#endif + + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + li r0, -1 + + mfspr VREG, VRsave + mtspr VRsave, r0 + + addi SP, SP, -128 + li r0, -128 + and SP, SP, r0 + + li OFFSET_1, 4 * SIZE + li OFFSET_2, 8 * SIZE + li OFFSET_3, 12 * SIZE + li OFFSET_4, 16 * SIZE + li OFFSET_5, 20 * SIZE + li OFFSET_6, 24 * SIZE + li OFFSET_7, 28 * SIZE + + stfs f1, ALPHA + 0(SP) + stfs f1, ALPHA + 4(SP) + stfs f1, ALPHA + 8(SP) + stfs f1, ALPHA + 12(SP) + + li r29, 0 + stw r29, FZERO(SP) + + slwi LDC, LDC, BASE_SHIFT + + li PREC, (15 * SIZE) +#ifdef CELL + li PREB, (5 * 32 * SIZE) +#else + li PREB, (5 * 32 * SIZE) +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + srawi. J, N, 2 + ble LL(60) + .align 4 + +LL(01): + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + add C, CO4, LDC + + mr AO, A + srawi. I, M, 4 + ble LL(20) + .align 4 + +LL(11): + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + LOAD_A a1, OFFSET_0, AO + vxor c03, c03, c03 + LOAD_A a2, OFFSET_1, AO + vxor c04, c04, c04 + LOAD_A a3, OFFSET_2, AO + + vxor c05, c05, c05 + vxor c06, c06, c06 + vxor c07, c07, c07 + vxor c08, c08, c08 + + vxor c09, c09, c09 + dcbtst CO1, PREC + vxor c10, c10, c10 + dcbtst CO2, PREC + vxor c11, c11, c11 + dcbtst CO3, PREC + vxor c12, c12, c12 + dcbtst CO4, PREC + vxor c13, c13, c13 + mr BO, B + vxor c14, c14, c14 + srawi. r0, K, 2 + vxor c15, c15, c15 + mtspr CTR, r0 + vxor c16, c16, c16 + vspltw bp1, b1, 0 + ble LL(13) + .align 4 + +#define NOP1 mr r3, r3 +#define NOP2 mr r4, r4 + +LL(12): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + LOAD_A a4, OFFSET_3, AO + vmaddfp c03, a3, bp1, c03 + dcbt AO, PREA + vmaddfp c04, a4, bp1, c04 + NOP2 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + NOP2 + vmaddfp c07, a3, bp2, c07 + NOP1 + vmaddfp c08, a4, bp2, c08 + dcbt BO, PREB + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + LOAD_B b2, OFFSET_1, BO + vmaddfp c11, a3, bp1, c11 + addi BO, BO, 8 * SIZE + vmaddfp c12, a4, bp1, c12 + NOP1 + + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a5, OFFSET_4, AO + vmaddfp c15, a3, bp2, c15 + LOAD_A a6, OFFSET_5, AO + vmaddfp c16, a4, bp2, c16 + vspltw bp2, b2, 1 + + vmaddfp c01, a5, bp1, c01 + LOAD_A a7, OFFSET_6, AO + vmaddfp c02, a6, bp1, c02 + LOAD_A a8, OFFSET_7, AO + vmaddfp c03, a7, bp1, c03 + NOP1 + vmaddfp c04, a8, bp1, c04 + NOP2 + + vmaddfp c05, a5, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a6, bp2, c06 + addi AO, AO, 32 * SIZE + vmaddfp c07, a7, bp2, c07 + LOAD_B b1, OFFSET_0, BO + vmaddfp c08, a8, bp2, c08 + NOP1 + + vmaddfp c09, a5, bp1, c09 + vspltw bp2, b2, 3 + vmaddfp c10, a6, bp1, c10 + NOP2 + vmaddfp c11, a7, bp1, c11 + NOP1 + vmaddfp c12, a8, bp1, c12 + dcbt AO, PREA + + vmaddfp c13, a5, bp2, c13 + vspltw bp1, b1, 0 + vmaddfp c14, a6, bp2, c14 + LOAD_A a1, OFFSET_0, AO // + vmaddfp c15, a7, bp2, c15 + LOAD_A a2, OFFSET_1, AO + vmaddfp c16, a8, bp2, c16 + vspltw bp2, b1, 1 + + vmaddfp c01, a1, bp1, c01 + LOAD_A a3, OFFSET_2, AO + vmaddfp c02, a2, bp1, c02 + LOAD_A a4, OFFSET_3, AO + vmaddfp c03, a3, bp1, c03 + NOP1 + vmaddfp c04, a4, bp1, c04 + NOP2 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + NOP2 + vmaddfp c07, a3, bp2, c07 + NOP1 + vmaddfp c08, a4, bp2, c08 + LOAD_B b2, OFFSET_1, BO + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + NOP2 + vmaddfp c11, a3, bp1, c11 + NOP1 + vmaddfp c12, a4, bp1, c12 + addi BO, BO, 8 * SIZE + + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a5, OFFSET_4, AO + vmaddfp c15, a3, bp2, c15 + LOAD_A a6, OFFSET_5, AO + vmaddfp c16, a4, bp2, c16 + vspltw bp2, b2, 1 + + vmaddfp c01, a5, bp1, c01 + LOAD_A a7, OFFSET_6, AO + vmaddfp c02, a6, bp1, c02 + LOAD_A a8, OFFSET_7, AO + vmaddfp c03, a7, bp1, c03 + addi AO, AO, 32 * SIZE + vmaddfp c04, a8, bp1, c04 + NOP2 + + vmaddfp c05, a5, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a6, bp2, c06 + NOP2 + vmaddfp c07, a7, bp2, c07 + NOP1 + vmaddfp c08, a8, bp2, c08 + LOAD_B b1, OFFSET_0, BO + + vmaddfp c09, a5, bp1, c09 + vspltw bp2, b2, 3 + vmaddfp c10, a6, bp1, c10 + LOAD_A a1, OFFSET_0, AO // + vmaddfp c11, a7, bp1, c11 + NOP2 + vmaddfp c12, a8, bp1, c12 + vspltw bp1, b1, 0 + + vmaddfp c13, a5, bp2, c13 + LOAD_A a2, OFFSET_1, AO + vmaddfp c14, a6, bp2, c14 + LOAD_A a3, OFFSET_2, AO + vmaddfp c15, a7, bp2, c15 + NOP1 + vmaddfp c16, a8, bp2, c16 + bdnz+ LL(12) + .align 4 + +LL(13): + andi. r0, K, 2 + nop + nop + ble+ LL(15) + .align 4 + + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + LOAD_A a4, OFFSET_3, AO + vmaddfp c03, a3, bp1, c03 + NOP1 + vmaddfp c04, a4, bp1, c04 + NOP2 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + NOP2 + vmaddfp c07, a3, bp2, c07 + NOP1 + vmaddfp c08, a4, bp2, c08 + LOAD_B b2, OFFSET_1, BO + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + LOAD_A a5, OFFSET_4, AO + vmaddfp c11, a3, bp1, c11 + LOAD_A a6, OFFSET_5, AO + vmaddfp c12, a4, bp1, c12 + addi BO, BO, 8 * SIZE + + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a7, OFFSET_6, AO + vmaddfp c15, a3, bp2, c15 + LOAD_A a8, OFFSET_7, AO + vmaddfp c16, a4, bp2, c16 + addi AO, AO, 32 * SIZE + + vmaddfp c01, a5, bp1, c01 + vspltw bp2, b2, 1 + vmaddfp c02, a6, bp1, c02 + NOP2 + vmaddfp c03, a7, bp1, c03 + NOP1 + vmaddfp c04, a8, bp1, c04 + NOP2 + + vmaddfp c05, a5, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a6, bp2, c06 + NOP2 + vmaddfp c07, a7, bp2, c07 + NOP1 + vmaddfp c08, a8, bp2, c08 + LOAD_B b1, OFFSET_0, BO + + vmaddfp c09, a5, bp1, c09 + vspltw bp2, b2, 3 + vmaddfp c10, a6, bp1, c10 + LOAD_A a1, OFFSET_0, AO + vmaddfp c11, a7, bp1, c11 + LOAD_A a2, OFFSET_1, AO + vmaddfp c12, a8, bp1, c12 + NOP2 + + vmaddfp c13, a5, bp2, c13 + vspltw bp1, b1, 0 + vmaddfp c14, a6, bp2, c14 + LOAD_A a3, OFFSET_2, AO + vmaddfp c15, a7, bp2, c15 + vmaddfp c16, a8, bp2, c16 + .align 4 + +LL(15): + andi. r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(18) + .align 4 + + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + LOAD_A a4, OFFSET_3, AO + vmaddfp c03, a3, bp1, c03 + nop + vmaddfp c04, a4, bp1, c04 + nop + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + nop + vmaddfp c07, a3, bp2, c07 + nop + vmaddfp c08, a4, bp2, c08 + nop + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + addi AO, AO, 16 * SIZE + vmaddfp c11, a3, bp1, c11 + addi BO, BO, 4 * SIZE + vmaddfp c12, a4, bp1, c12 + nop + + vmaddfp c13, a1, bp2, c13 + vmaddfp c14, a2, bp2, c14 + vmaddfp c15, a3, bp2, c15 + vmaddfp c16, a4, bp2, c16 + .align 4 + +LL(18): + lvx C1, OFFSET_0, CO1 + cmpwi cr0, LDC, 32 * SIZE + lvx C2, OFFSET_1, CO1 + lvsr PERMRSHIFT1, 0, CO1 + lvx C3, OFFSET_2, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvx C4, OFFSET_3, CO1 + lvsr PERMRSHIFT3, 0, CO3 + lvx C5, OFFSET_4, CO1 + lvsr PERMRSHIFT4, 0, CO4 + ble LL(19) + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + lvx C1, OFFSET_0, CO2 + vmaddfp c01, alpha, c01, C2 + lvx C6, OFFSET_1, CO2 + vmaddfp c02, alpha, c02, C3 + lvx C7, OFFSET_2, CO2 + vmaddfp c03, alpha, c03, C4 + lvx C8, OFFSET_3, CO2 + vmaddfp c04, alpha, c04, C5 + lvx C9, OFFSET_4, CO2 + + stvx c00, OFFSET_0, CO1 + vperm c00, VZERO, c05, PERMRSHIFT2 + stvx c01, OFFSET_1, CO1 + vperm c05, c05, c06, PERMRSHIFT2 + stvx c02, OFFSET_2, CO1 + vperm c06, c06, c07, PERMRSHIFT2 + stvx c03, OFFSET_3, CO1 + vperm c07, c07, c08, PERMRSHIFT2 + stvx c04, OFFSET_4, CO1 + vperm c08, c08, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + lvx C1, OFFSET_0, CO3 + vmaddfp c05, alpha, c05, C6 + lvx C2, OFFSET_1, CO3 + vmaddfp c06, alpha, c06, C7 + lvx C3, OFFSET_2, CO3 + vmaddfp c07, alpha, c07, C8 + lvx C4, OFFSET_3, CO3 + vmaddfp c08, alpha, c08, C9 + lvx C5, OFFSET_4, CO3 + + stvx c00, OFFSET_0, CO2 + vperm c00, VZERO, c09, PERMRSHIFT3 + stvx c05, OFFSET_1, CO2 + vperm c09, c09, c10, PERMRSHIFT3 + stvx c06, OFFSET_2, CO2 + vperm c10, c10, c11, PERMRSHIFT3 + stvx c07, OFFSET_3, CO2 + vperm c11, c11, c12, PERMRSHIFT3 + stvx c08, OFFSET_4, CO2 + vperm c12, c12, VZERO, PERMRSHIFT3 + + vmaddfp c00, alpha, c00, C1 + lvx C9, OFFSET_4, CO4 + vmaddfp c09, alpha, c09, C2 + lvx C1, OFFSET_0, CO4 + vmaddfp c10, alpha, c10, C3 + lvx C6, OFFSET_1, CO4 + vmaddfp c11, alpha, c11, C4 + lvx C7, OFFSET_2, CO4 + vmaddfp c12, alpha, c12, C5 + lvx C8, OFFSET_3, CO4 + + stvx c00, OFFSET_0, CO3 + vperm c00, VZERO, c13, PERMRSHIFT4 + stvx c09, OFFSET_1, CO3 + vperm c13, c13, c14, PERMRSHIFT4 + stvx c10, OFFSET_2, CO3 + vperm c14, c14, c15, PERMRSHIFT4 + stvx c11, OFFSET_3, CO3 + vperm c15, c15, c16, PERMRSHIFT4 + stvx c12, OFFSET_4, CO3 + vperm c16, c16, VZERO, PERMRSHIFT4 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c13, alpha, c13, C6 + vmaddfp c14, alpha, c14, C7 + vmaddfp c15, alpha, c15, C8 + vmaddfp c16, alpha, c16, C9 + + stvx c00, OFFSET_0, CO4 + stvx c13, OFFSET_1, CO4 + stvx c14, OFFSET_2, CO4 + stvx c15, OFFSET_3, CO4 + stvx c16, OFFSET_4, CO4 + + addi CO1, CO1, 16 * SIZE + addi CO2, CO2, 16 * SIZE + addi CO3, CO3, 16 * SIZE + addi CO4, CO4, 16 * SIZE + + addic. I, I, -1 + bgt+ LL(11) + b LL(20) + .align 4 + +LL(19): + lvx C6, OFFSET_1, CO2 + lvx C7, OFFSET_2, CO2 + lvx C8, OFFSET_3, CO2 + lvx C9, OFFSET_4, CO2 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + lvx C2, OFFSET_1, CO3 + vmaddfp c02, alpha, c02, C3 + lvx C3, OFFSET_2, CO3 + vmaddfp c03, alpha, c03, C4 + lvx C4, OFFSET_3, CO3 + vmaddfp c04, alpha, c04, C5 + lvx C5, OFFSET_4, CO3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + lvx C1, OFFSET_0, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, c06, PERMRSHIFT2 + vperm c06, c06, c07, PERMRSHIFT2 + vperm c07, c07, c08, PERMRSHIFT2 + vperm c08, c08, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C6 + lvx C6, OFFSET_1, CO4 + vmaddfp c06, alpha, c06, C7 + lvx C7, OFFSET_2, CO4 + vmaddfp c07, alpha, c07, C8 + lvx C8, OFFSET_3, CO4 + vmaddfp c08, alpha, c08, C9 + lvx C9, OFFSET_4, CO4 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + stvx c06, OFFSET_2, CO2 + stvx c07, OFFSET_3, CO2 + stvx c08, OFFSET_4, CO2 + + lvx C1, OFFSET_0, CO3 + + vperm c00, VZERO, c09, PERMRSHIFT3 + vperm c09, c09, c10, PERMRSHIFT3 + vperm c10, c10, c11, PERMRSHIFT3 + vperm c11, c11, c12, PERMRSHIFT3 + vperm c12, c12, VZERO, PERMRSHIFT3 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c09, alpha, c09, C2 + vmaddfp c10, alpha, c10, C3 + vmaddfp c11, alpha, c11, C4 + vmaddfp c12, alpha, c12, C5 + + stvx c00, OFFSET_0, CO3 + stvx c09, OFFSET_1, CO3 + stvx c10, OFFSET_2, CO3 + stvx c11, OFFSET_3, CO3 + stvx c12, OFFSET_4, CO3 + + lvx C1, OFFSET_0, CO4 + + vperm c00, VZERO, c13, PERMRSHIFT4 + vperm c13, c13, c14, PERMRSHIFT4 + vperm c14, c14, c15, PERMRSHIFT4 + vperm c15, c15, c16, PERMRSHIFT4 + vperm c16, c16, VZERO, PERMRSHIFT4 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c13, alpha, c13, C6 + vmaddfp c14, alpha, c14, C7 + vmaddfp c15, alpha, c15, C8 + vmaddfp c16, alpha, c16, C9 + + stvx c00, OFFSET_0, CO4 + stvx c13, OFFSET_1, CO4 + stvx c14, OFFSET_2, CO4 + stvx c15, OFFSET_3, CO4 + stvx c16, OFFSET_4, CO4 + + addi CO1, CO1, 16 * SIZE + addi CO2, CO2, 16 * SIZE + addi CO3, CO3, 16 * SIZE + addi CO4, CO4, 16 * SIZE + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. I, M, 8 + ble LL(30) + + vxor c01, c01, c01 + LOAD_A a1, OFFSET_0, AO + vxor c02, c02, c02 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c09, c09, c09 + LOAD_B b1, OFFSET_0, B + vxor c10, c10, c10 + LOAD_B b2, OFFSET_1, B + vxor c13, c13, c13 + vxor c14, c14, c14 + mr BO, B + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(25) + .align 4 + +LL(22): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + addi AO, AO, 16 * SIZE + vmaddfp c02, a2, bp1, c02 + addi BO, BO, 8 * SIZE + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + LOAD_B b1, OFFSET_0, BO + vmaddfp c10, a2, bp1, c10 + + vmaddfp c13, a1, bp2, c13 + LOAD_A a1, OFFSET_0, AO + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a2, OFFSET_1, AO + + vmaddfp c01, a3, bp1, c01 + vspltw bp2, b2, 1 + vmaddfp c02, a4, bp1, c02 + + vmaddfp c05, a3, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a4, bp2, c06 + + vmaddfp c09, a3, bp1, c09 + vspltw bp2, b2, 3 + LOAD_B b2, OFFSET_1, BO + vmaddfp c10, a4, bp1, c10 + + vmaddfp c13, a3, bp2, c13 + LOAD_A a3, OFFSET_2, AO + vmaddfp c14, a4, bp2, c14 + LOAD_A a4, OFFSET_3, AO + vspltw bp1, b1, 0 + bdnz LL(22) + .align 4 + +LL(25): + andi. r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(28) + .align 4 + +LL(26): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + nop + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + nop + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + addi AO, AO, 8 * SIZE + + vmaddfp c13, a1, bp2, c13 + addi BO, BO, 4 * SIZE + vmaddfp c14, a2, bp2, c14 + nop + .align 4 + +LL(28): + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, c06, PERMRSHIFT2 + vperm c06, c06, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + vmaddfp c06, alpha, c06, C3 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + stvx c06, OFFSET_2, CO2 + + lvx C1, OFFSET_0, CO3 + lvx C2, OFFSET_1, CO3 + lvx C3, OFFSET_2, CO3 + + vperm c00, VZERO, c09, PERMRSHIFT3 + vperm c09, c09, c10, PERMRSHIFT3 + vperm c10, c10, VZERO, PERMRSHIFT3 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c09, alpha, c09, C2 + vmaddfp c10, alpha, c10, C3 + + stvx c00, OFFSET_0, CO3 + stvx c09, OFFSET_1, CO3 + stvx c10, OFFSET_2, CO3 + + lvx C1, OFFSET_0, CO4 + lvx C2, OFFSET_1, CO4 + lvx C3, OFFSET_2, CO4 + + vperm c00, VZERO, c13, PERMRSHIFT4 + vperm c13, c13, c14, PERMRSHIFT4 + vperm c14, c14, VZERO, PERMRSHIFT4 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c13, alpha, c13, C2 + vmaddfp c14, alpha, c14, C3 + + stvx c00, OFFSET_0, CO4 + stvx c13, OFFSET_1, CO4 + stvx c14, OFFSET_2, CO4 + + addi CO1, CO1, 8 * SIZE + addi CO2, CO2, 8 * SIZE + addi CO3, CO3, 8 * SIZE + addi CO4, CO4, 8 * SIZE + .align 4 + +LL(30): + andi. I, M, 4 + ble LL(40) + + vxor c01, c01, c01 + LOAD_A a1, OFFSET_0, AO + vxor c02, c02, c02 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_B b1, OFFSET_0, B + vxor c06, c06, c06 + LOAD_B b2, OFFSET_1, B + vxor c09, c09, c09 + vxor c10, c10, c10 + vxor c13, c13, c13 + vxor c14, c14, c14 + + vspltw bp1, b1, 0 + mr BO, B + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(35) + .align 4 + +LL(32): + vmaddfp c01, a1, bp1, c01 + addi AO, AO, 8 * SIZE + vspltw bp2, b1, 1 + vmaddfp c05, a1, bp2, c05 + addi BO, BO, 8 * SIZE + vspltw bp1, b1, 2 + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c13, a1, bp2, c13 + LOAD_A a1, OFFSET_0, AO + vspltw bp1, b2, 0 + LOAD_B b1, OFFSET_0, BO + + vmaddfp c02, a2, bp1, c02 + vspltw bp2, b2, 1 + vmaddfp c06, a2, bp2, c06 + vspltw bp1, b2, 2 + vmaddfp c10, a2, bp1, c10 + vspltw bp2, b2, 3 + LOAD_B b2, OFFSET_1, BO + vmaddfp c14, a2, bp2, c14 + LOAD_A a2, OFFSET_1, AO + + vspltw bp1, b1, 0 + bdnz LL(32) + .align 4 + +LL(35): + andi. r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(38) + .align 4 + +LL(36): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c13, a1, bp2, c13 + addi AO, AO, 4 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(38): + vaddfp c01, c01, c02 + vaddfp c05, c05, c06 + vaddfp c09, c09, c10 + vaddfp c13, c13, c14 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + + lvx C1, OFFSET_0, CO3 + lvx C2, OFFSET_1, CO3 + + vperm c00, VZERO, c09, PERMRSHIFT3 + vperm c09, c09, VZERO, PERMRSHIFT3 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c09, alpha, c09, C2 + + stvx c00, OFFSET_0, CO3 + stvx c09, OFFSET_1, CO3 + + lvx C1, OFFSET_0, CO4 + lvx C2, OFFSET_1, CO4 + + vperm c00, VZERO, c13, PERMRSHIFT4 + vperm c13, c13, VZERO, PERMRSHIFT4 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c13, alpha, c13, C2 + + stvx c00, OFFSET_0, CO4 + stvx c13, OFFSET_1, CO4 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE + .align 4 + +LL(40): + andi. I, M, 2 + ble LL(50) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + LFD f12, 2 * SIZE(B) + LFD f13, 3 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(45) + .align 4 + +LL(42): + FMADD f0, f8, f10, f0 + FMADD f2, f8, f11, f2 + FMADD f4, f8, f12, f4 + FMADD f6, f8, f13, f6 + + FMADD f1, f9, f10, f1 + FMADD f3, f9, f11, f3 + FMADD f5, f9, f12, f5 + FMADD f7, f9, f13, f7 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + FMADD f0, f8, f10, f0 + FMADD f2, f8, f11, f2 + FMADD f4, f8, f12, f4 + FMADD f6, f8, f13, f6 + + FMADD f1, f9, f10, f1 + FMADD f3, f9, f11, f3 + FMADD f5, f9, f12, f5 + FMADD f7, f9, f13, f7 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + + LFD f10, 8 * SIZE(BO) + LFD f11, 9 * SIZE(BO) + LFD f12, 10 * SIZE(BO) + LFD f13, 11 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + andi. r0, K, 1 + ble LL(48) + .align 4 + +LL(46): + FMADD f0, f8, f10, f0 + FMADD f2, f8, f11, f2 + FMADD f4, f8, f12, f4 + FMADD f6, f8, f13, f6 + + FMADD f1, f9, f10, f1 + FMADD f3, f9, f11, f3 + FMADD f5, f9, f12, f5 + FMADD f7, f9, f13, f7 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(48): + lfs f13, ALPHA(SP) + + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + LFD f10, 0 * SIZE(CO2) + LFD f11, 1 * SIZE(CO2) + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + FMADD f2, f2, f13, f10 + FMADD f3, f3, f13, f11 + + LFD f8, 0 * SIZE(CO3) + LFD f9, 1 * SIZE(CO3) + LFD f10, 0 * SIZE(CO4) + LFD f11, 1 * SIZE(CO4) + + FMADD f4, f4, f13, f8 + FMADD f5, f5, f13, f9 + FMADD f6, f6, f13, f10 + FMADD f7, f7, f13, f11 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + STFD f4, 0 * SIZE(CO3) + STFD f5, 1 * SIZE(CO3) + STFD f6, 0 * SIZE(CO4) + STFD f7, 1 * SIZE(CO4) + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE + .align 4 + +LL(50): + andi. I, M, 1 + ble LL(59) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + LFD f12, 2 * SIZE(B) + LFD f13, 3 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(55) + .align 4 + +LL(52): + FMADD f0, f8, f10, f0 + FMADD f1, f8, f11, f1 + FMADD f2, f8, f12, f2 + FMADD f3, f8, f13, f3 + + LFD f8, 2 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + FMADD f0, f9, f10, f0 + FMADD f1, f9, f11, f1 + FMADD f2, f9, f12, f2 + FMADD f3, f9, f13, f3 + + LFD f9, 3 * SIZE(AO) + + LFD f10, 8 * SIZE(BO) + LFD f11, 9 * SIZE(BO) + LFD f12, 10 * SIZE(BO) + LFD f13, 11 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(52) + .align 4 + +LL(55): + andi. r0, K, 1 + ble LL(58) + .align 4 + +LL(56): + FMADD f0, f8, f10, f0 + FMADD f1, f8, f11, f1 + FMADD f2, f8, f12, f2 + FMADD f3, f8, f13, f3 + + LFD f8, 2 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 1 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(58): + lfs f13, ALPHA(SP) + + LFD f8, 0 * SIZE(CO1) + LFD f9, 0 * SIZE(CO2) + LFD f10, 0 * SIZE(CO3) + LFD f11, 0 * SIZE(CO4) + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + FMADD f2, f2, f13, f10 + FMADD f3, f3, f13, f11 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + STFD f2, 0 * SIZE(CO3) + STFD f3, 0 * SIZE(CO4) + .align 4 + +LL(59): + mr B, BO + + addic. J, J, -1 + bgt LL(01) + .align 4 + +LL(60): + andi. r0, N, 2 + ble LL(120) + + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + + mr AO, A + srawi. I, M, 4 + ble LL(80) + .align 4 + +LL(71): + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + vxor c03, c03, c03 + LOAD_A a1, OFFSET_0, AO + vxor c04, c04, c04 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c07, c07, c07 + vxor c08, c08, c08 + + mr BO, B + dcbtst CO1, PREC + dcbtst CO2, PREC + + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(75) + .align 4 + +LL(72): + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + + vmaddfp c01, a5, bp1, c01 + vspltw bp2, b1, 3 + vmaddfp c02, a6, bp1, c02 + vmaddfp c03, a7, bp1, c03 + vmaddfp c04, a8, bp1, c04 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c05, a5, bp2, c05 + vmaddfp c06, a6, bp2, c06 + vmaddfp c07, a7, bp2, c07 + vmaddfp c08, a8, bp2, c08 + + addi AO, AO, 32 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + bdnz LL(72) + .align 4 + +LL(75): + andi. r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(78) + .align 4 + +LL(76): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + addi AO, AO, 16 * SIZE + vmaddfp c03, a3, bp1, c03 + addi BO, BO, 2 * SIZE + vmaddfp c04, a4, bp1, c04 + nop + + vmaddfp c05, a1, bp2, c05 + vmaddfp c06, a2, bp2, c06 + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + .align 4 + +LL(78): + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + lvx C4, OFFSET_3, CO1 + lvx C5, OFFSET_4, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + vmaddfp c03, alpha, c03, C4 + vmaddfp c04, alpha, c04, C5 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + lvx C4, OFFSET_3, CO2 + lvx C5, OFFSET_4, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, c06, PERMRSHIFT2 + vperm c06, c06, c07, PERMRSHIFT2 + vperm c07, c07, c08, PERMRSHIFT2 + vperm c08, c08, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + vmaddfp c06, alpha, c06, C3 + vmaddfp c07, alpha, c07, C4 + vmaddfp c08, alpha, c08, C5 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + stvx c06, OFFSET_2, CO2 + stvx c07, OFFSET_3, CO2 + stvx c08, OFFSET_4, CO2 + + addi CO1, CO1, 16 * SIZE + addi CO2, CO2, 16 * SIZE + addic. I, I, -1 + bgt+ LL(71) + .align 4 + +LL(80): + andi. I, M, 8 + ble LL(90) + + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + vxor c03, c03, c03 + LOAD_A a1, OFFSET_0, AO + vxor c04, c04, c04 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c07, c07, c07 + vxor c08, c08, c08 + + mr BO, B + + vspltw bp1, b1, 0 + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(85) + .align 4 + +LL(82): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + + vmaddfp c03, a3, bp1, c03 + vspltw bp2, b1, 3 + vmaddfp c04, a4, bp1, c04 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + bdnz LL(82) + .align 4 + +LL(85): + andi. r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(88) + .align 4 + +LL(86): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + addi AO, AO, 8 * SIZE + vmaddfp c05, a1, bp2, c05 + addi BO, BO, 2 * SIZE + vmaddfp c06, a2, bp2, c06 + .align 4 + +LL(88): + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + vaddfp c01, c01, c03 + vaddfp c02, c02, c04 + vaddfp c05, c05, c07 + vaddfp c06, c06, c08 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, c06, PERMRSHIFT2 + vperm c06, c06, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + vmaddfp c06, alpha, c06, C3 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + stvx c06, OFFSET_2, CO2 + + addi CO1, CO1, 8 * SIZE + addi CO2, CO2, 8 * SIZE + .align 4 + +LL(90): + andi. I, M, 4 + ble LL(100) + + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + vxor c06, c06, c06 + + mr BO, B + + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(95) + .align 4 + +LL(92): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + + vmaddfp c02, a2, bp1, c02 + vspltw bp2, b1, 3 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c06, a2, bp2, c06 + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + bdnz LL(92) + .align 4 + +LL(95): + andi. r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(98) + .align 4 + +LL(96): + vspltw bp2, b1, 1 + vmaddfp c01, a1, bp1, c01 + vmaddfp c05, a1, bp2, c05 + addi AO, AO, 4 * SIZE + addi BO, BO, 2 * SIZE + .align 4 + +LL(98): + vaddfp c01, c01, c02 + vaddfp c05, c05, c06 + vaddfp c09, c09, c10 + vaddfp c13, c13, c14 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + .align 4 + +LL(100): + andi. I, M, 2 + ble LL(110) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + LFD f12, 2 * SIZE(B) + LFD f13, 3 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(105) + .align 4 + +LL(102): + FMADD f0, f8, f10, f0 + FMADD f1, f9, f10, f1 + FMADD f2, f8, f11, f2 + FMADD f3, f9, f11, f3 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + FMADD f4, f8, f12, f4 + FMADD f5, f9, f12, f5 + FMADD f6, f8, f13, f6 + FMADD f7, f9, f13, f7 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(102) + .align 4 + +LL(105): + andi. r0, K, 1 + lfs f13, ALPHA(SP) + ble LL(108) + .align 4 + +LL(106): + FMADD f0, f8, f10, f0 + FMADD f1, f9, f10, f1 + FMADD f2, f8, f11, f2 + FMADD f3, f9, f11, f3 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 2 * SIZE(BO) + LFD f11, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + .align 4 + +LL(108): + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + LFD f10, 0 * SIZE(CO2) + LFD f11, 1 * SIZE(CO2) + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + FMADD f2, f2, f13, f10 + FMADD f3, f3, f13, f11 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + .align 4 + +LL(110): + andi. I, M, 1 + ble LL(119) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + LFD f12, 2 * SIZE(B) + LFD f13, 3 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(115) + .align 4 + +LL(112): + FMADD f0, f8, f10, f0 + FMADD f1, f8, f11, f1 + FMADD f2, f9, f12, f2 + FMADD f3, f9, f13, f3 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(112) + .align 4 + +LL(115): + andi. r0, K, 1 + lfs f13, ALPHA(SP) + ble LL(118) + .align 4 + +LL(116): + FMADD f0, f8, f10, f0 + FMADD f1, f8, f11, f1 + + LFD f8, 1 * SIZE(AO) + + LFD f10, 2 * SIZE(BO) + LFD f11, 3 * SIZE(BO) + + addi AO, AO, 1 * SIZE + addi BO, BO, 2 * SIZE + .align 4 + +LL(118): + LFD f8, 0 * SIZE(CO1) + LFD f9, 0 * SIZE(CO2) + + FADD f0, f0, f2 + FADD f1, f1, f3 + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + .align 4 + +LL(119): + mr B, BO + .align 4 + +LL(120): + andi. r0, N, 1 + ble LL(999) + + mr CO1, C + mr AO, A + srawi. I, M, 4 + ble LL(140) + .align 4 + +LL(130): + vxor c01, c01, c01 + vxor c02, c02, c02 + vxor c03, c03, c03 + vxor c04, c04, c04 + + mr BO, B + + dcbtst CO1, PREC + + mr J, K + + andi. r0, B, 15 + ble+ LL(131) + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + LOAD_B b1, OFFSET_0, BO + vspltw bp1, b1, 2 + vspltw bp2, b1, 3 + + addi AO, AO, 16 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(138) + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + addi AO, AO, 16 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp2, c01 + vmaddfp c02, a2, bp2, c02 + vmaddfp c03, a3, bp2, c03 + vmaddfp c04, a4, bp2, c04 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(138) + .align 4 + + +LL(131): + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + LOAD_B b1, OFFSET_0, BO + + srawi. r0, J, 2 + mtspr CTR, r0 + ble LL(135) + .align 4 + +LL(133): + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + vspltw bp2, b1, 1 + vmaddfp c01, a5, bp2, c01 + vmaddfp c02, a6, bp2, c02 + vmaddfp c03, a7, bp2, c03 + vmaddfp c04, a8, bp2, c04 + + addi AO, AO, 32 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + vspltw bp1, b1, 2 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + vspltw bp2, b1, 3 + vmaddfp c01, a5, bp2, c01 + vmaddfp c02, a6, bp2, c02 + vmaddfp c03, a7, bp2, c03 + vmaddfp c04, a8, bp2, c04 + + addi AO, AO, 32 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + LOAD_B b1, OFFSET_0, BO + + bdnz LL(133) + .align 4 + +LL(135): + andi. r0, J, 3 + ble+ LL(138) + + cmpwi cr0, r0, 3 + bne LL(136) + + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + addi AO, AO, 16 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + vspltw bp2, b1, 1 + vmaddfp c01, a1, bp2, c01 + vmaddfp c02, a2, bp2, c02 + vmaddfp c03, a3, bp2, c03 + vmaddfp c04, a4, bp2, c04 + + addi AO, AO, 16 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + vspltw bp1, b1, 2 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + addi AO, AO, 16 * SIZE + addi BO, BO, 3 * SIZE + b LL(138) + .align 4 + +LL(136): + cmpwi cr0, r0, 2 + bne LL(137) + + vspltw bp1, b1, 0 + vspltw bp2, b1, 1 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + LOAD_A a1, OFFSET_4, AO + LOAD_A a2, OFFSET_5, AO + LOAD_A a3, OFFSET_6, AO + LOAD_A a4, OFFSET_7, AO + + vmaddfp c01, a1, bp2, c01 + vmaddfp c02, a2, bp2, c02 + vmaddfp c03, a3, bp2, c03 + vmaddfp c04, a4, bp2, c04 + + addi AO, AO, 32 * SIZE + addi BO, BO, 2 * SIZE + b LL(138) + .align 4 + +LL(137): + cmpwi cr0, r0, 1 + bne LL(138) + + vspltw bp1, b1, 0 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + addi AO, AO, 16 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(138): + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + lvx C4, OFFSET_3, CO1 + lvx C5, OFFSET_4, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + vmaddfp c03, alpha, c03, C4 + vmaddfp c04, alpha, c04, C5 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + addi CO1, CO1, 16 * SIZE + addic. I, I, -1 + bgt+ LL(130) + .align 4 + +LL(140): + andi. I, M, 8 + ble LL(150) + + vxor c01, c01, c01 + vxor c02, c02, c02 + + mr BO, B + + mr J, K + + andi. r0, B, 15 + ble+ LL(141) + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_B b1, OFFSET_0, BO + vspltw bp1, b1, 2 + vspltw bp2, b1, 3 + + addi AO, AO, 8 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(148) + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + + addi AO, AO, 8 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp2, c01 + vmaddfp c02, a2, bp2, c02 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(148) + .align 4 + + +LL(141): + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + LOAD_B b1, OFFSET_0, BO + + srawi. r0, J, 2 + mtspr CTR, r0 + ble LL(145) + .align 4 + +LL(143): + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + vspltw bp2, b1, 1 + vmaddfp c01, a3, bp2, c01 + vmaddfp c02, a4, bp2, c02 + + vspltw bp1, b1, 2 + vmaddfp c01, a5, bp1, c01 + vmaddfp c02, a6, bp1, c02 + + vspltw bp2, b1, 3 + vmaddfp c01, a7, bp2, c01 + vmaddfp c02, a8, bp2, c02 + + addi AO, AO, 32 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + LOAD_B b1, OFFSET_0, BO + + bdnz LL(143) + .align 4 + +LL(145): + andi. r0, J, 3 + ble+ LL(148) + + cmpwi cr0, r0, 3 + bne LL(146) + + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + vspltw bp2, b1, 1 + vmaddfp c01, a3, bp2, c01 + vmaddfp c02, a4, bp2, c02 + + LOAD_A a1, OFFSET_4, AO + LOAD_A a2, OFFSET_5, AO + + vspltw bp1, b1, 2 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + + addi AO, AO, 24 * SIZE + addi BO, BO, 3 * SIZE + b LL(148) + .align 4 + +LL(146): + cmpwi cr0, r0, 2 + bne LL(147) + + vspltw bp1, b1, 0 + vspltw bp2, b1, 1 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + vmaddfp c01, a3, bp2, c01 + vmaddfp c02, a4, bp2, c02 + + addi AO, AO, 16 * SIZE + addi BO, BO, 2 * SIZE + b LL(148) + .align 4 + +LL(147): + cmpwi cr0, r0, 1 + bne LL(148) + + vspltw bp1, b1, 0 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + addi AO, AO, 8 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(148): + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + addi CO1, CO1, 8 * SIZE + .align 4 + +LL(150): + andi. I, M, 4 + ble LL(160) + + vxor c01, c01, c01 + + mr BO, B + + mr J, K + + andi. r0, B, 15 + ble+ LL(151) + + LOAD_A a1, OFFSET_0, AO + LOAD_B b1, OFFSET_0, BO + vspltw bp1, b1, 2 + vspltw bp2, b1, 3 + + addi AO, AO, 4 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp1, c01 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(158) + + LOAD_A a1, OFFSET_0, AO + addi AO, AO, 4 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp2, c01 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(158) + .align 4 + + +LL(151): + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + LOAD_B b1, OFFSET_0, BO + + srawi. r0, J, 2 + mtspr CTR, r0 + ble LL(155) + .align 4 + +LL(153): + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c01, a2, bp2, c01 + vspltw bp1, b1, 2 + vmaddfp c01, a3, bp1, c01 + vspltw bp2, b1, 3 + vmaddfp c01, a4, bp2, c01 + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + LOAD_B b1, OFFSET_0, BO + + bdnz LL(153) + .align 4 + +LL(155): + andi. r0, J, 3 + ble+ LL(158) + + cmpwi cr0, r0, 3 + bne LL(156) + + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c01, a2, bp2, c01 + vspltw bp1, b1, 2 + vmaddfp c01, a3, bp1, c01 + + addi AO, AO, 12 * SIZE + addi BO, BO, 3 * SIZE + b LL(158) + .align 4 + +LL(156): + cmpwi cr0, r0, 2 + bne LL(157) + + vspltw bp1, b1, 0 + vspltw bp2, b1, 1 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c01, a2, bp2, c01 + + addi AO, AO, 8 * SIZE + addi BO, BO, 2 * SIZE + b LL(158) + .align 4 + +LL(157): + cmpwi cr0, r0, 1 + bne LL(158) + + vspltw bp1, b1, 0 + + vmaddfp c01, a1, bp1, c01 + + addi AO, AO, 4 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(158): + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + addi CO1, CO1, 4 * SIZE + .align 4 + +LL(160): + andi. I, M, 2 + ble LL(170) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + LFD f10, 2 * SIZE(AO) + LFD f11, 3 * SIZE(AO) + + LFD f12, 0 * SIZE(B) + LFD f13, 1 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(165) + .align 4 + +LL(162): + FMADD f0, f8, f12, f0 + FMADD f1, f9, f12, f1 + FMADD f2, f10, f13, f2 + FMADD f3, f11, f13, f3 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + LFD f10, 6 * SIZE(AO) + LFD f11, 7 * SIZE(AO) + + LFD f12, 2 * SIZE(BO) + LFD f13, 3 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 2 * SIZE + bdnz LL(162) + .align 4 + +LL(165): + andi. r0, K, 1 + lfs f13, ALPHA(SP) + ble LL(168) + .align 4 + +LL(166): + FMADD f0, f8, f12, f0 + FMADD f1, f9, f12, f1 + + addi AO, AO, 2 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(168): + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + + FADD f0, f0, f2 + FADD f1, f1, f3 + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + addi CO1, CO1, 2 * SIZE + .align 4 + +LL(170): + andi. I, M, 1 + ble LL(999) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(175) + .align 4 + +LL(172): + FMADD f0, f8, f10, f0 + FMADD f1, f9, f11, f1 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + LFD f10, 2 * SIZE(BO) + LFD f11, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + bdnz LL(172) + .align 4 + +LL(175): + andi. r0, K, 1 + lfs f13, ALPHA(SP) + ble LL(178) + .align 4 + +LL(176): + FMADD f0, f8, f10, f0 + + addi AO, AO, 1 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(178): + LFD f8, 0 * SIZE(CO1) + + FADD f0, f0, f1 + + FMADD f0, f0, f13, f8 + + STFD f0, 0 * SIZE(CO1) + .align 4 + +LL(999): + mr SP, STACK + + li r0, 0 * 16 + lvx v20, SP, r0 + li r0, 1 * 16 + lvx v21, SP, r0 + li r0, 2 * 16 + lvx v22, SP, r0 + li r0, 3 * 16 + lvx v23, SP, r0 + li r0, 4 * 16 + lvx v24, SP, r0 + li r0, 5 * 16 + lvx v25, SP, r0 + li r0, 6 * 16 + lvx v26, SP, r0 + li r0, 7 * 16 + lvx v27, SP, r0 + li r0, 8 * 16 + lvx v28, SP, r0 + li r0, 9 * 16 + lvx v29, SP, r0 + li r0, 10 * 16 + lvx v30, SP, r0 + li r0, 11 * 16 + lvx v31, SP, r0 + + mtspr VRsave, VREG + +#ifdef __64BIT__ + ld r31, 192(SP) + ld r30, 200(SP) + ld r29, 208(SP) + ld r28, 216(SP) + ld r27, 224(SP) + ld r26, 232(SP) + ld r25, 240(SP) + ld r24, 248(SP) + ld r23, 256(SP) + ld r22, 264(SP) + ld r21, 272(SP) + ld r20, 280(SP) + ld r19, 288(SP) + ld r18, 296(SP) + ld r17, 304(SP) + ld r16, 312(SP) + ld r15, 320(SP) + ld r14, 328(SP) +#else + lwz r31, 192(SP) + lwz r30, 196(SP) + lwz r29, 200(SP) + lwz r28, 204(SP) + lwz r27, 208(SP) + lwz r26, 212(SP) + lwz r25, 216(SP) + lwz r24, 220(SP) + lwz r23, 224(SP) + lwz r22, 228(SP) + lwz r21, 232(SP) + lwz r20, 236(SP) + lwz r19, 240(SP) + lwz r18, 244(SP) + lwz r17, 248(SP) + lwz r16, 252(SP) + lwz r15, 256(SP) + lwz r14, 260(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/gemm_kernel_altivec_g4.S b/kernel/power/gemm_kernel_altivec_g4.S new file mode 100644 index 0000000..24d437d --- /dev/null +++ b/kernel/power/gemm_kernel_altivec_g4.S @@ -0,0 +1,2647 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 360 +#else +#define STACKSIZE 272 +#endif + +#define ALPHA 0 +#define FZERO 16 + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#endif +#endif + +#define STACK r11 + +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREB r29 +#define PREC r30 +#define VREG r31 + +#define LOAD_A lvx +#define LOAD_B lvx + +#define OFFSET_0 0 +#define OFFSET_1 r14 +#define OFFSET_2 r15 +#define OFFSET_3 r16 +#define OFFSET_4 r17 +#define OFFSET_5 r18 +#define OFFSET_6 r19 +#define OFFSET_7 r20 + +#define c01 v0 +#define c02 v1 +#define c03 v2 +#define c04 v3 +#define c05 v4 +#define c06 v5 +#define c07 v6 +#define c08 v7 +#define c09 v8 +#define c10 v9 +#define c11 v10 +#define c12 v11 +#define c13 v12 +#define c14 v13 +#define c15 v14 +#define c16 v15 + +#define a1 v16 +#define a2 v17 +#define a3 v18 +#define a4 v19 +#define a5 v20 +#define a6 v21 +#define a7 v22 +#define a8 v23 + +#define b1 v24 +#define b2 v25 +#define bp1 v26 +#define bp2 v27 + +#define C1 v16 +#define C2 v17 +#define C3 v18 +#define C4 v19 +#define C5 v20 +#define C6 v21 +#define C7 v22 +#define C8 v23 +#define C9 v24 + +#define c00 v25 + +#define PERMRSHIFT1 v26 +#define PERMRSHIFT2 v27 +#define PERMRSHIFT3 v28 +#define PERMRSHIFT4 v29 + +#define VZERO v30 +#define alpha v31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + mr STACK, SP + + li r0, 0 * 16 + stvx v20, SP, r0 + li r0, 1 * 16 + stvx v21, SP, r0 + li r0, 2 * 16 + stvx v22, SP, r0 + li r0, 3 * 16 + stvx v23, SP, r0 + li r0, 4 * 16 + stvx v24, SP, r0 + li r0, 5 * 16 + stvx v25, SP, r0 + li r0, 6 * 16 + stvx v26, SP, r0 + li r0, 7 * 16 + stvx v27, SP, r0 + li r0, 8 * 16 + stvx v28, SP, r0 + li r0, 9 * 16 + stvx v29, SP, r0 + li r0, 10 * 16 + stvx v30, SP, r0 + li r0, 11 * 16 + stvx v31, SP, r0 + +#ifdef __64BIT__ + std r31, 192(SP) + std r30, 200(SP) + std r29, 208(SP) + std r28, 216(SP) + std r27, 224(SP) + std r26, 232(SP) + std r25, 240(SP) + std r24, 248(SP) + std r23, 256(SP) + std r22, 264(SP) + std r21, 272(SP) + std r20, 280(SP) + std r19, 288(SP) + std r18, 296(SP) + std r17, 304(SP) + std r16, 312(SP) + std r15, 320(SP) + std r14, 328(SP) +#else + stw r31, 192(SP) + stw r30, 196(SP) + stw r29, 200(SP) + stw r28, 204(SP) + stw r27, 208(SP) + stw r26, 212(SP) + stw r25, 216(SP) + stw r24, 220(SP) + stw r23, 224(SP) + stw r22, 228(SP) + stw r21, 232(SP) + stw r20, 236(SP) + stw r19, 240(SP) + stw r18, 244(SP) + stw r17, 248(SP) + stw r16, 252(SP) + stw r15, 256(SP) + stw r14, 260(SP) +#endif + + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + li r0, -1 + + mfspr VREG, VRsave + mtspr VRsave, r0 + + addi SP, SP, -128 + li r0, -128 + and SP, SP, r0 + + li OFFSET_1, 4 * SIZE + li OFFSET_2, 8 * SIZE + li OFFSET_3, 12 * SIZE + li OFFSET_4, 16 * SIZE + li OFFSET_5, 20 * SIZE + li OFFSET_6, 24 * SIZE + li OFFSET_7, 28 * SIZE + + stfs f1, ALPHA + 0(SP) + stfs f1, ALPHA + 4(SP) + stfs f1, ALPHA + 8(SP) + stfs f1, ALPHA + 12(SP) + + li r29, 0 + stw r29, FZERO(SP) + + slwi LDC, LDC, BASE_SHIFT + + li PREC, (15 * SIZE) + li PREB, (25 * 8 * SIZE) + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + srawi. J, N, 2 + ble LL(60) + .align 4 + +LL(01): + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + add C, CO4, LDC + + mr AO, A + srawi. I, M, 4 + ble LL(20) + .align 4 + +LL(11): + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + LOAD_A a1, OFFSET_0, AO + vxor c03, c03, c03 + LOAD_A a2, OFFSET_1, AO + vxor c04, c04, c04 + LOAD_A a3, OFFSET_2, AO + vxor c05, c05, c05 + LOAD_A a4, OFFSET_3, AO + vxor c06, c06, c06 + LOAD_B b2, OFFSET_2, B + vxor c07, c07, c07 + LOAD_A a5, OFFSET_4, AO + vxor c08, c08, c08 + LOAD_A a6, OFFSET_5, AO + vxor c09, c09, c09 + dcbtst CO1, PREC + vxor c10, c10, c10 + dcbtst CO2, PREC + vxor c11, c11, c11 + dcbtst CO3, PREC + vxor c12, c12, c12 + dcbtst CO4, PREC + vxor c13, c13, c13 + mr BO, B + vxor c14, c14, c14 + srawi. r0, K, 2 + vxor c15, c15, c15 + mtspr CTR, r0 + vxor c16, c16, c16 + vspltw bp1, b1, 0 + ble LL(15) + .align 4 + +LL(12): +/* 1 */ + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + addi AO, AO, 8 * SIZE + vmaddfp c03, a3, bp1, c03 + LOAD_A a7, OFFSET_4, AO + vmaddfp c04, a4, bp1, c04 + LOAD_A a8, OFFSET_5, AO + +/* 2 */ + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + dcbt BO, PREB + vmaddfp c07, a3, bp2, c07 + dcbt AO, PREB + vmaddfp c08, a4, bp2, c08 + addi AO, AO, 8 * SIZE + +/* 3 */ + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + LOAD_B b1, OFFSET_1, BO + vmaddfp c11, a3, bp1, c11 + dcbt AO, PREB + vmaddfp c12, a4, bp1, c12 + addi AO, AO, 8 * SIZE + +/* 4 */ + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b1, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a1, OFFSET_2, AO + vmaddfp c15, a3, bp2, c15 + dcbt AO, PREB + vmaddfp c16, a4, bp2, c16 + addi AO, AO, 8 * SIZE + +/* 5 */ + vmaddfp c01, a5, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a6, bp1, c02 + LOAD_A a2, OFFSET_1, AO + vmaddfp c03, a7, bp1, c03 + LOAD_A a3, OFFSET_2, AO + vmaddfp c04, a8, bp1, c04 + LOAD_A a4, OFFSET_3, AO + +/* 6 */ + vmaddfp c05, a5, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a6, bp2, c06 + nop + vmaddfp c07, a7, bp2, c07 + dcbt AO, PREA + vmaddfp c08, a8, bp2, c08 + addi AO, AO, 8 * SIZE + +/* 7 */ + vmaddfp c09, a5, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a6, bp1, c10 + LOAD_B b1, OFFSET_4, BO + vmaddfp c11, a7, bp1, c11 + nop + vmaddfp c12, a8, bp1, c12 + nop + +/* 8 */ + vmaddfp c13, a5, bp2, c13 + vspltw bp1, b2, 0 + vmaddfp c14, a6, bp2, c14 + LOAD_A a5, OFFSET_2, AO + vmaddfp c15, a7, bp2, c15 + LOAD_A a6, OFFSET_3, AO + vmaddfp c16, a8, bp2, c16 + LOAD_A a7, OFFSET_4, AO + +/* 9 */ + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b2, 1 + vmaddfp c02, a2, bp1, c02 + LOAD_A a8, OFFSET_5, AO + vmaddfp c03, a3, bp1, c03 + addi BO, BO, 8 * SIZE + vmaddfp c04, a4, bp1, c04 + nop + +/* 10 */ + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a2, bp2, c06 + nop + vmaddfp c07, a3, bp2, c07 + nop + vmaddfp c08, a4, bp2, c08 + nop + +/* 11 */ + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b2, 3 + vmaddfp c10, a2, bp1, c10 + LOAD_B b2, OFFSET_1, BO + vmaddfp c11, a3, bp1, c11 + dcbt AO, PREA + vmaddfp c12, a4, bp1, c12 + addi AO, AO, 8 * SIZE + +/* 12 */ + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a1, OFFSET_4, AO + vmaddfp c15, a3, bp2, c15 + LOAD_A a2, OFFSET_5, AO + vmaddfp c16, a4, bp2, c16 + LOAD_A a3, OFFSET_6, AO + +/* 13 */ + vmaddfp c01, a5, bp1, c01 + vspltw bp2, b2, 1 + vmaddfp c02, a6, bp1, c02 + LOAD_A a4, OFFSET_7, AO + vmaddfp c03, a7, bp1, c03 + dcbt AO, PREA + vmaddfp c04, a8, bp1, c04 + addi AO, AO, 8 * SIZE + +/* 14 */ + vmaddfp c05, a5, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a6, bp2, c06 + nop + vmaddfp c07, a7, bp2, c07 + dcbt AO, PREA + vmaddfp c08, a8, bp2, c08 + addi AO, AO, 8 * SIZE + +/* 15 */ + vmaddfp c09, a5, bp1, c09 + vspltw bp2, b2, 3 + vmaddfp c10, a6, bp1, c10 + LOAD_B b2, OFFSET_4, BO + vmaddfp c11, a7, bp1, c11 + dcbt AO, PREA + vmaddfp c12, a8, bp1, c12 + addi BO, BO, 8 * SIZE + +/* 16 */ + vmaddfp c13, a5, bp2, c13 + vspltw bp1, b1, 0 + vmaddfp c14, a6, bp2, c14 + LOAD_A a5, OFFSET_4, AO + vmaddfp c15, a7, bp2, c15 + LOAD_A a6, OFFSET_5, AO + vmaddfp c16, a8, bp2, c16 + bdnz+ LL(12) + .align 4 + +LL(15): + andi. r0, K, 3 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + mtspr CTR, r0 + ble+ LL(18) + .align 4 + +LL(16): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + nop + vmaddfp c03, a3, bp1, c03 + nop + vmaddfp c04, a4, bp1, c04 + nop + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + nop + vmaddfp c07, a3, bp2, c07 + nop + vmaddfp c08, a4, bp2, c08 + nop + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + LOAD_B b1, OFFSET_1, BO + vmaddfp c11, a3, bp1, c11 + addi AO, AO, 16 * SIZE + vmaddfp c12, a4, bp1, c12 + addi BO, BO, 4 * SIZE + + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b1, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a1, OFFSET_0, AO + vmaddfp c15, a3, bp2, c15 + LOAD_A a2, OFFSET_1, AO + vmaddfp c16, a4, bp2, c16 + LOAD_A a3, OFFSET_2, AO + + LOAD_A a4, OFFSET_3, AO + bdnz+ LL(16) + .align 4 + +LL(18): + lvx C1, OFFSET_0, CO1 + cmpwi cr0, LDC, 32 * SIZE + lvx C2, OFFSET_1, CO1 + lvsr PERMRSHIFT1, 0, CO1 + lvx C3, OFFSET_2, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvx C4, OFFSET_3, CO1 + lvsr PERMRSHIFT3, 0, CO3 + lvx C5, OFFSET_4, CO1 + lvsr PERMRSHIFT4, 0, CO4 + ble LL(19) + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + lvx C1, OFFSET_0, CO2 + vmaddfp c01, alpha, c01, C2 + lvx C6, OFFSET_1, CO2 + vmaddfp c02, alpha, c02, C3 + lvx C7, OFFSET_2, CO2 + vmaddfp c03, alpha, c03, C4 + lvx C8, OFFSET_3, CO2 + vmaddfp c04, alpha, c04, C5 + lvx C9, OFFSET_4, CO2 + + stvx c00, OFFSET_0, CO1 + vperm c00, VZERO, c05, PERMRSHIFT2 + stvx c01, OFFSET_1, CO1 + vperm c05, c05, c06, PERMRSHIFT2 + stvx c02, OFFSET_2, CO1 + vperm c06, c06, c07, PERMRSHIFT2 + stvx c03, OFFSET_3, CO1 + vperm c07, c07, c08, PERMRSHIFT2 + stvx c04, OFFSET_4, CO1 + vperm c08, c08, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + lvx C1, OFFSET_0, CO3 + vmaddfp c05, alpha, c05, C6 + lvx C2, OFFSET_1, CO3 + vmaddfp c06, alpha, c06, C7 + lvx C3, OFFSET_2, CO3 + vmaddfp c07, alpha, c07, C8 + lvx C4, OFFSET_3, CO3 + vmaddfp c08, alpha, c08, C9 + lvx C5, OFFSET_4, CO3 + + stvx c00, OFFSET_0, CO2 + vperm c00, VZERO, c09, PERMRSHIFT3 + stvx c05, OFFSET_1, CO2 + vperm c09, c09, c10, PERMRSHIFT3 + stvx c06, OFFSET_2, CO2 + vperm c10, c10, c11, PERMRSHIFT3 + stvx c07, OFFSET_3, CO2 + vperm c11, c11, c12, PERMRSHIFT3 + stvx c08, OFFSET_4, CO2 + vperm c12, c12, VZERO, PERMRSHIFT3 + + vmaddfp c00, alpha, c00, C1 + lvx C9, OFFSET_4, CO4 + vmaddfp c09, alpha, c09, C2 + lvx C1, OFFSET_0, CO4 + vmaddfp c10, alpha, c10, C3 + lvx C6, OFFSET_1, CO4 + vmaddfp c11, alpha, c11, C4 + lvx C7, OFFSET_2, CO4 + vmaddfp c12, alpha, c12, C5 + lvx C8, OFFSET_3, CO4 + + stvx c00, OFFSET_0, CO3 + vperm c00, VZERO, c13, PERMRSHIFT4 + stvx c09, OFFSET_1, CO3 + vperm c13, c13, c14, PERMRSHIFT4 + stvx c10, OFFSET_2, CO3 + vperm c14, c14, c15, PERMRSHIFT4 + stvx c11, OFFSET_3, CO3 + vperm c15, c15, c16, PERMRSHIFT4 + stvx c12, OFFSET_4, CO3 + vperm c16, c16, VZERO, PERMRSHIFT4 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c13, alpha, c13, C6 + vmaddfp c14, alpha, c14, C7 + vmaddfp c15, alpha, c15, C8 + vmaddfp c16, alpha, c16, C9 + + stvx c00, OFFSET_0, CO4 + stvx c13, OFFSET_1, CO4 + stvx c14, OFFSET_2, CO4 + stvx c15, OFFSET_3, CO4 + stvx c16, OFFSET_4, CO4 + + addi CO1, CO1, 16 * SIZE + addi CO2, CO2, 16 * SIZE + addi CO3, CO3, 16 * SIZE + addi CO4, CO4, 16 * SIZE + + addic. I, I, -1 + bgt+ LL(11) + b LL(20) + .align 4 + +LL(19): + lvx C6, OFFSET_1, CO2 + lvx C7, OFFSET_2, CO2 + lvx C8, OFFSET_3, CO2 + lvx C9, OFFSET_4, CO2 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + lvx C2, OFFSET_1, CO3 + vmaddfp c02, alpha, c02, C3 + lvx C3, OFFSET_2, CO3 + vmaddfp c03, alpha, c03, C4 + lvx C4, OFFSET_3, CO3 + vmaddfp c04, alpha, c04, C5 + lvx C5, OFFSET_4, CO3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + lvx C1, OFFSET_0, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, c06, PERMRSHIFT2 + vperm c06, c06, c07, PERMRSHIFT2 + vperm c07, c07, c08, PERMRSHIFT2 + vperm c08, c08, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C6 + lvx C6, OFFSET_1, CO4 + vmaddfp c06, alpha, c06, C7 + lvx C7, OFFSET_2, CO4 + vmaddfp c07, alpha, c07, C8 + lvx C8, OFFSET_3, CO4 + vmaddfp c08, alpha, c08, C9 + lvx C9, OFFSET_4, CO4 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + stvx c06, OFFSET_2, CO2 + stvx c07, OFFSET_3, CO2 + stvx c08, OFFSET_4, CO2 + + lvx C1, OFFSET_0, CO3 + + vperm c00, VZERO, c09, PERMRSHIFT3 + vperm c09, c09, c10, PERMRSHIFT3 + vperm c10, c10, c11, PERMRSHIFT3 + vperm c11, c11, c12, PERMRSHIFT3 + vperm c12, c12, VZERO, PERMRSHIFT3 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c09, alpha, c09, C2 + vmaddfp c10, alpha, c10, C3 + vmaddfp c11, alpha, c11, C4 + vmaddfp c12, alpha, c12, C5 + + stvx c00, OFFSET_0, CO3 + stvx c09, OFFSET_1, CO3 + stvx c10, OFFSET_2, CO3 + stvx c11, OFFSET_3, CO3 + stvx c12, OFFSET_4, CO3 + + lvx C1, OFFSET_0, CO4 + + vperm c00, VZERO, c13, PERMRSHIFT4 + vperm c13, c13, c14, PERMRSHIFT4 + vperm c14, c14, c15, PERMRSHIFT4 + vperm c15, c15, c16, PERMRSHIFT4 + vperm c16, c16, VZERO, PERMRSHIFT4 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c13, alpha, c13, C6 + vmaddfp c14, alpha, c14, C7 + vmaddfp c15, alpha, c15, C8 + vmaddfp c16, alpha, c16, C9 + + stvx c00, OFFSET_0, CO4 + stvx c13, OFFSET_1, CO4 + stvx c14, OFFSET_2, CO4 + stvx c15, OFFSET_3, CO4 + stvx c16, OFFSET_4, CO4 + + addi CO1, CO1, 16 * SIZE + addi CO2, CO2, 16 * SIZE + addi CO3, CO3, 16 * SIZE + addi CO4, CO4, 16 * SIZE + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. I, M, 8 + ble LL(30) + + vxor c01, c01, c01 + LOAD_A a1, OFFSET_0, AO + vxor c02, c02, c02 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c09, c09, c09 + LOAD_B b1, OFFSET_0, B + vxor c10, c10, c10 + LOAD_B b2, OFFSET_1, B + vxor c13, c13, c13 + vxor c14, c14, c14 + mr BO, B + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(25) + .align 4 + +LL(22): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + addi AO, AO, 16 * SIZE + vmaddfp c02, a2, bp1, c02 + addi BO, BO, 8 * SIZE + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + LOAD_B b1, OFFSET_0, BO + vmaddfp c10, a2, bp1, c10 + + vmaddfp c13, a1, bp2, c13 + LOAD_A a1, OFFSET_0, AO + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a2, OFFSET_1, AO + + vmaddfp c01, a3, bp1, c01 + vspltw bp2, b2, 1 + vmaddfp c02, a4, bp1, c02 + + vmaddfp c05, a3, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a4, bp2, c06 + + vmaddfp c09, a3, bp1, c09 + vspltw bp2, b2, 3 + LOAD_B b2, OFFSET_1, BO + vmaddfp c10, a4, bp1, c10 + + vmaddfp c13, a3, bp2, c13 + LOAD_A a3, OFFSET_2, AO + vmaddfp c14, a4, bp2, c14 + LOAD_A a4, OFFSET_3, AO + vspltw bp1, b1, 0 + bdnz LL(22) + .align 4 + +LL(25): + andi. r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(28) + .align 4 + +LL(26): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + nop + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + nop + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + addi AO, AO, 8 * SIZE + + vmaddfp c13, a1, bp2, c13 + addi BO, BO, 4 * SIZE + vmaddfp c14, a2, bp2, c14 + nop + .align 4 + +LL(28): + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, c06, PERMRSHIFT2 + vperm c06, c06, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + vmaddfp c06, alpha, c06, C3 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + stvx c06, OFFSET_2, CO2 + + lvx C1, OFFSET_0, CO3 + lvx C2, OFFSET_1, CO3 + lvx C3, OFFSET_2, CO3 + + vperm c00, VZERO, c09, PERMRSHIFT3 + vperm c09, c09, c10, PERMRSHIFT3 + vperm c10, c10, VZERO, PERMRSHIFT3 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c09, alpha, c09, C2 + vmaddfp c10, alpha, c10, C3 + + stvx c00, OFFSET_0, CO3 + stvx c09, OFFSET_1, CO3 + stvx c10, OFFSET_2, CO3 + + lvx C1, OFFSET_0, CO4 + lvx C2, OFFSET_1, CO4 + lvx C3, OFFSET_2, CO4 + + vperm c00, VZERO, c13, PERMRSHIFT4 + vperm c13, c13, c14, PERMRSHIFT4 + vperm c14, c14, VZERO, PERMRSHIFT4 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c13, alpha, c13, C2 + vmaddfp c14, alpha, c14, C3 + + stvx c00, OFFSET_0, CO4 + stvx c13, OFFSET_1, CO4 + stvx c14, OFFSET_2, CO4 + + addi CO1, CO1, 8 * SIZE + addi CO2, CO2, 8 * SIZE + addi CO3, CO3, 8 * SIZE + addi CO4, CO4, 8 * SIZE + .align 4 + +LL(30): + andi. I, M, 4 + ble LL(40) + + vxor c01, c01, c01 + LOAD_A a1, OFFSET_0, AO + vxor c02, c02, c02 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_B b1, OFFSET_0, B + vxor c06, c06, c06 + LOAD_B b2, OFFSET_1, B + vxor c09, c09, c09 + vxor c10, c10, c10 + vxor c13, c13, c13 + vxor c14, c14, c14 + + vspltw bp1, b1, 0 + mr BO, B + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(35) + .align 4 + +LL(32): + vmaddfp c01, a1, bp1, c01 + addi AO, AO, 8 * SIZE + vspltw bp2, b1, 1 + vmaddfp c05, a1, bp2, c05 + addi BO, BO, 8 * SIZE + vspltw bp1, b1, 2 + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c13, a1, bp2, c13 + LOAD_A a1, OFFSET_0, AO + vspltw bp1, b2, 0 + LOAD_B b1, OFFSET_0, BO + + vmaddfp c02, a2, bp1, c02 + vspltw bp2, b2, 1 + vmaddfp c06, a2, bp2, c06 + vspltw bp1, b2, 2 + vmaddfp c10, a2, bp1, c10 + vspltw bp2, b2, 3 + LOAD_B b2, OFFSET_1, BO + vmaddfp c14, a2, bp2, c14 + LOAD_A a2, OFFSET_1, AO + + vspltw bp1, b1, 0 + bdnz LL(32) + .align 4 + +LL(35): + andi. r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(38) + .align 4 + +LL(36): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c13, a1, bp2, c13 + addi AO, AO, 4 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(38): + vaddfp c01, c01, c02 + vaddfp c05, c05, c06 + vaddfp c09, c09, c10 + vaddfp c13, c13, c14 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + + lvx C1, OFFSET_0, CO3 + lvx C2, OFFSET_1, CO3 + + vperm c00, VZERO, c09, PERMRSHIFT3 + vperm c09, c09, VZERO, PERMRSHIFT3 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c09, alpha, c09, C2 + + stvx c00, OFFSET_0, CO3 + stvx c09, OFFSET_1, CO3 + + lvx C1, OFFSET_0, CO4 + lvx C2, OFFSET_1, CO4 + + vperm c00, VZERO, c13, PERMRSHIFT4 + vperm c13, c13, VZERO, PERMRSHIFT4 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c13, alpha, c13, C2 + + stvx c00, OFFSET_0, CO4 + stvx c13, OFFSET_1, CO4 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE + .align 4 + +LL(40): + andi. I, M, 2 + ble LL(50) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + LFD f12, 2 * SIZE(B) + LFD f13, 3 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(45) + .align 4 + +LL(42): + FMADD f0, f8, f10, f0 + FMADD f2, f8, f11, f2 + FMADD f4, f8, f12, f4 + FMADD f6, f8, f13, f6 + + FMADD f1, f9, f10, f1 + FMADD f3, f9, f11, f3 + FMADD f5, f9, f12, f5 + FMADD f7, f9, f13, f7 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + FMADD f0, f8, f10, f0 + FMADD f2, f8, f11, f2 + FMADD f4, f8, f12, f4 + FMADD f6, f8, f13, f6 + + FMADD f1, f9, f10, f1 + FMADD f3, f9, f11, f3 + FMADD f5, f9, f12, f5 + FMADD f7, f9, f13, f7 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + + LFD f10, 8 * SIZE(BO) + LFD f11, 9 * SIZE(BO) + LFD f12, 10 * SIZE(BO) + LFD f13, 11 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + andi. r0, K, 1 + ble LL(48) + .align 4 + +LL(46): + FMADD f0, f8, f10, f0 + FMADD f2, f8, f11, f2 + FMADD f4, f8, f12, f4 + FMADD f6, f8, f13, f6 + + FMADD f1, f9, f10, f1 + FMADD f3, f9, f11, f3 + FMADD f5, f9, f12, f5 + FMADD f7, f9, f13, f7 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(48): + lfs f13, ALPHA(SP) + + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + LFD f10, 0 * SIZE(CO2) + LFD f11, 1 * SIZE(CO2) + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + FMADD f2, f2, f13, f10 + FMADD f3, f3, f13, f11 + + LFD f8, 0 * SIZE(CO3) + LFD f9, 1 * SIZE(CO3) + LFD f10, 0 * SIZE(CO4) + LFD f11, 1 * SIZE(CO4) + + FMADD f4, f4, f13, f8 + FMADD f5, f5, f13, f9 + FMADD f6, f6, f13, f10 + FMADD f7, f7, f13, f11 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + STFD f4, 0 * SIZE(CO3) + STFD f5, 1 * SIZE(CO3) + STFD f6, 0 * SIZE(CO4) + STFD f7, 1 * SIZE(CO4) + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE + .align 4 + +LL(50): + andi. I, M, 1 + ble LL(59) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + LFD f12, 2 * SIZE(B) + LFD f13, 3 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(55) + .align 4 + +LL(52): + FMADD f0, f8, f10, f0 + FMADD f1, f8, f11, f1 + FMADD f2, f8, f12, f2 + FMADD f3, f8, f13, f3 + + LFD f8, 2 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + FMADD f0, f9, f10, f0 + FMADD f1, f9, f11, f1 + FMADD f2, f9, f12, f2 + FMADD f3, f9, f13, f3 + + LFD f9, 3 * SIZE(AO) + + LFD f10, 8 * SIZE(BO) + LFD f11, 9 * SIZE(BO) + LFD f12, 10 * SIZE(BO) + LFD f13, 11 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(52) + .align 4 + +LL(55): + andi. r0, K, 1 + ble LL(58) + .align 4 + +LL(56): + FMADD f0, f8, f10, f0 + FMADD f1, f8, f11, f1 + FMADD f2, f8, f12, f2 + FMADD f3, f8, f13, f3 + + LFD f8, 2 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 1 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(58): + lfs f13, ALPHA(SP) + + LFD f8, 0 * SIZE(CO1) + LFD f9, 0 * SIZE(CO2) + LFD f10, 0 * SIZE(CO3) + LFD f11, 0 * SIZE(CO4) + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + FMADD f2, f2, f13, f10 + FMADD f3, f3, f13, f11 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + STFD f2, 0 * SIZE(CO3) + STFD f3, 0 * SIZE(CO4) + .align 4 + +LL(59): + mr B, BO + + addic. J, J, -1 + bgt LL(01) + .align 4 + +LL(60): + andi. r0, N, 2 + ble LL(120) + + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + + mr AO, A + srawi. I, M, 4 + ble LL(80) + .align 4 + +LL(71): + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + vxor c03, c03, c03 + LOAD_A a1, OFFSET_0, AO + vxor c04, c04, c04 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c07, c07, c07 + vxor c08, c08, c08 + + mr BO, B + dcbtst CO1, PREC + dcbtst CO2, PREC + + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(75) + .align 4 + +LL(72): + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + + vmaddfp c01, a5, bp1, c01 + vspltw bp2, b1, 3 + vmaddfp c02, a6, bp1, c02 + vmaddfp c03, a7, bp1, c03 + vmaddfp c04, a8, bp1, c04 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c05, a5, bp2, c05 + vmaddfp c06, a6, bp2, c06 + vmaddfp c07, a7, bp2, c07 + vmaddfp c08, a8, bp2, c08 + + addi AO, AO, 32 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + bdnz LL(72) + .align 4 + +LL(75): + andi. r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(78) + .align 4 + +LL(76): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + addi AO, AO, 16 * SIZE + vmaddfp c03, a3, bp1, c03 + addi BO, BO, 2 * SIZE + vmaddfp c04, a4, bp1, c04 + nop + + vmaddfp c05, a1, bp2, c05 + vmaddfp c06, a2, bp2, c06 + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + .align 4 + +LL(78): + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + lvx C4, OFFSET_3, CO1 + lvx C5, OFFSET_4, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + vmaddfp c03, alpha, c03, C4 + vmaddfp c04, alpha, c04, C5 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + lvx C4, OFFSET_3, CO2 + lvx C5, OFFSET_4, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, c06, PERMRSHIFT2 + vperm c06, c06, c07, PERMRSHIFT2 + vperm c07, c07, c08, PERMRSHIFT2 + vperm c08, c08, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + vmaddfp c06, alpha, c06, C3 + vmaddfp c07, alpha, c07, C4 + vmaddfp c08, alpha, c08, C5 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + stvx c06, OFFSET_2, CO2 + stvx c07, OFFSET_3, CO2 + stvx c08, OFFSET_4, CO2 + + addi CO1, CO1, 16 * SIZE + addi CO2, CO2, 16 * SIZE + addic. I, I, -1 + bgt+ LL(71) + .align 4 + +LL(80): + andi. I, M, 8 + ble LL(90) + + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + vxor c03, c03, c03 + LOAD_A a1, OFFSET_0, AO + vxor c04, c04, c04 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c07, c07, c07 + vxor c08, c08, c08 + + mr BO, B + + vspltw bp1, b1, 0 + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(85) + .align 4 + +LL(82): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + + vmaddfp c03, a3, bp1, c03 + vspltw bp2, b1, 3 + vmaddfp c04, a4, bp1, c04 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + bdnz LL(82) + .align 4 + +LL(85): + andi. r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(88) + .align 4 + +LL(86): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + addi AO, AO, 8 * SIZE + vmaddfp c05, a1, bp2, c05 + addi BO, BO, 2 * SIZE + vmaddfp c06, a2, bp2, c06 + .align 4 + +LL(88): + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + vaddfp c01, c01, c03 + vaddfp c02, c02, c04 + vaddfp c05, c05, c07 + vaddfp c06, c06, c08 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, c06, PERMRSHIFT2 + vperm c06, c06, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + vmaddfp c06, alpha, c06, C3 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + stvx c06, OFFSET_2, CO2 + + addi CO1, CO1, 8 * SIZE + addi CO2, CO2, 8 * SIZE + .align 4 + +LL(90): + andi. I, M, 4 + ble LL(100) + + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + vxor c06, c06, c06 + + mr BO, B + + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(95) + .align 4 + +LL(92): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + + vmaddfp c02, a2, bp1, c02 + vspltw bp2, b1, 3 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c06, a2, bp2, c06 + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + bdnz LL(92) + .align 4 + +LL(95): + andi. r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(98) + .align 4 + +LL(96): + vspltw bp2, b1, 1 + vmaddfp c01, a1, bp1, c01 + vmaddfp c05, a1, bp2, c05 + addi AO, AO, 4 * SIZE + addi BO, BO, 2 * SIZE + .align 4 + +LL(98): + vaddfp c01, c01, c02 + vaddfp c05, c05, c06 + vaddfp c09, c09, c10 + vaddfp c13, c13, c14 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + .align 4 + +LL(100): + andi. I, M, 2 + ble LL(110) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + LFD f12, 2 * SIZE(B) + LFD f13, 3 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(105) + .align 4 + +LL(102): + FMADD f0, f8, f10, f0 + FMADD f1, f9, f10, f1 + FMADD f2, f8, f11, f2 + FMADD f3, f9, f11, f3 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + FMADD f4, f8, f12, f4 + FMADD f5, f9, f12, f5 + FMADD f6, f8, f13, f6 + FMADD f7, f9, f13, f7 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(102) + .align 4 + +LL(105): + andi. r0, K, 1 + lfs f13, ALPHA(SP) + ble LL(108) + .align 4 + +LL(106): + FMADD f0, f8, f10, f0 + FMADD f1, f9, f10, f1 + FMADD f2, f8, f11, f2 + FMADD f3, f9, f11, f3 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 2 * SIZE(BO) + LFD f11, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + .align 4 + +LL(108): + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + LFD f10, 0 * SIZE(CO2) + LFD f11, 1 * SIZE(CO2) + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + FMADD f2, f2, f13, f10 + FMADD f3, f3, f13, f11 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + .align 4 + +LL(110): + andi. I, M, 1 + ble LL(119) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + LFD f12, 2 * SIZE(B) + LFD f13, 3 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(115) + .align 4 + +LL(112): + FMADD f0, f8, f10, f0 + FMADD f1, f8, f11, f1 + FMADD f2, f9, f12, f2 + FMADD f3, f9, f13, f3 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(112) + .align 4 + +LL(115): + andi. r0, K, 1 + lfs f13, ALPHA(SP) + ble LL(118) + .align 4 + +LL(116): + FMADD f0, f8, f10, f0 + FMADD f1, f8, f11, f1 + + LFD f8, 1 * SIZE(AO) + + LFD f10, 2 * SIZE(BO) + LFD f11, 3 * SIZE(BO) + + addi AO, AO, 1 * SIZE + addi BO, BO, 2 * SIZE + .align 4 + +LL(118): + LFD f8, 0 * SIZE(CO1) + LFD f9, 0 * SIZE(CO2) + + FADD f0, f0, f2 + FADD f1, f1, f3 + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + .align 4 + +LL(119): + mr B, BO + .align 4 + +LL(120): + andi. r0, N, 1 + ble LL(999) + + mr CO1, C + mr AO, A + srawi. I, M, 4 + ble LL(140) + .align 4 + +LL(130): + vxor c01, c01, c01 + vxor c02, c02, c02 + vxor c03, c03, c03 + vxor c04, c04, c04 + + mr BO, B + + dcbtst CO1, PREC + + mr J, K + + andi. r0, B, 15 + ble+ LL(131) + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + LOAD_B b1, OFFSET_0, BO + vspltw bp1, b1, 2 + vspltw bp2, b1, 3 + + addi AO, AO, 16 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(138) + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + addi AO, AO, 16 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp2, c01 + vmaddfp c02, a2, bp2, c02 + vmaddfp c03, a3, bp2, c03 + vmaddfp c04, a4, bp2, c04 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(138) + .align 4 + + +LL(131): + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + LOAD_B b1, OFFSET_0, BO + + srawi. r0, J, 2 + mtspr CTR, r0 + ble LL(135) + .align 4 + +LL(133): + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + vspltw bp2, b1, 1 + vmaddfp c01, a5, bp2, c01 + vmaddfp c02, a6, bp2, c02 + vmaddfp c03, a7, bp2, c03 + vmaddfp c04, a8, bp2, c04 + + addi AO, AO, 32 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + vspltw bp1, b1, 2 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + vspltw bp2, b1, 3 + vmaddfp c01, a5, bp2, c01 + vmaddfp c02, a6, bp2, c02 + vmaddfp c03, a7, bp2, c03 + vmaddfp c04, a8, bp2, c04 + + addi AO, AO, 32 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + LOAD_B b1, OFFSET_0, BO + + bdnz LL(133) + .align 4 + +LL(135): + andi. r0, J, 3 + ble+ LL(138) + + cmpwi cr0, r0, 3 + bne LL(136) + + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + addi AO, AO, 16 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + vspltw bp2, b1, 1 + vmaddfp c01, a1, bp2, c01 + vmaddfp c02, a2, bp2, c02 + vmaddfp c03, a3, bp2, c03 + vmaddfp c04, a4, bp2, c04 + + addi AO, AO, 16 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + vspltw bp1, b1, 2 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + addi AO, AO, 16 * SIZE + addi BO, BO, 3 * SIZE + b LL(138) + .align 4 + +LL(136): + cmpwi cr0, r0, 2 + bne LL(137) + + vspltw bp1, b1, 0 + vspltw bp2, b1, 1 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + LOAD_A a1, OFFSET_4, AO + LOAD_A a2, OFFSET_5, AO + LOAD_A a3, OFFSET_6, AO + LOAD_A a4, OFFSET_7, AO + + vmaddfp c01, a1, bp2, c01 + vmaddfp c02, a2, bp2, c02 + vmaddfp c03, a3, bp2, c03 + vmaddfp c04, a4, bp2, c04 + + addi AO, AO, 32 * SIZE + addi BO, BO, 2 * SIZE + b LL(138) + .align 4 + +LL(137): + cmpwi cr0, r0, 1 + bne LL(138) + + vspltw bp1, b1, 0 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + addi AO, AO, 16 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(138): + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + lvx C4, OFFSET_3, CO1 + lvx C5, OFFSET_4, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + vmaddfp c03, alpha, c03, C4 + vmaddfp c04, alpha, c04, C5 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + addi CO1, CO1, 16 * SIZE + addic. I, I, -1 + bgt+ LL(130) + .align 4 + +LL(140): + andi. I, M, 8 + ble LL(150) + + vxor c01, c01, c01 + vxor c02, c02, c02 + + mr BO, B + + mr J, K + + andi. r0, B, 15 + ble+ LL(141) + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_B b1, OFFSET_0, BO + vspltw bp1, b1, 2 + vspltw bp2, b1, 3 + + addi AO, AO, 8 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(148) + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + + addi AO, AO, 8 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp2, c01 + vmaddfp c02, a2, bp2, c02 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(148) + .align 4 + + +LL(141): + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + LOAD_B b1, OFFSET_0, BO + + srawi. r0, J, 2 + mtspr CTR, r0 + ble LL(145) + .align 4 + +LL(143): + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + vspltw bp2, b1, 1 + vmaddfp c01, a3, bp2, c01 + vmaddfp c02, a4, bp2, c02 + + vspltw bp1, b1, 2 + vmaddfp c01, a5, bp1, c01 + vmaddfp c02, a6, bp1, c02 + + vspltw bp2, b1, 3 + vmaddfp c01, a7, bp2, c01 + vmaddfp c02, a8, bp2, c02 + + addi AO, AO, 32 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + LOAD_B b1, OFFSET_0, BO + + bdnz LL(143) + .align 4 + +LL(145): + andi. r0, J, 3 + ble+ LL(148) + + cmpwi cr0, r0, 3 + bne LL(146) + + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + vspltw bp2, b1, 1 + vmaddfp c01, a3, bp2, c01 + vmaddfp c02, a4, bp2, c02 + + LOAD_A a1, OFFSET_4, AO + LOAD_A a2, OFFSET_5, AO + + vspltw bp1, b1, 2 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + + addi AO, AO, 24 * SIZE + addi BO, BO, 3 * SIZE + b LL(148) + .align 4 + +LL(146): + cmpwi cr0, r0, 2 + bne LL(147) + + vspltw bp1, b1, 0 + vspltw bp2, b1, 1 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + vmaddfp c01, a3, bp2, c01 + vmaddfp c02, a4, bp2, c02 + + addi AO, AO, 16 * SIZE + addi BO, BO, 2 * SIZE + b LL(148) + .align 4 + +LL(147): + cmpwi cr0, r0, 1 + bne LL(148) + + vspltw bp1, b1, 0 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + addi AO, AO, 8 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(148): + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + addi CO1, CO1, 8 * SIZE + .align 4 + +LL(150): + andi. I, M, 4 + ble LL(160) + + vxor c01, c01, c01 + + mr BO, B + + mr J, K + + andi. r0, B, 15 + ble+ LL(151) + + LOAD_A a1, OFFSET_0, AO + LOAD_B b1, OFFSET_0, BO + vspltw bp1, b1, 2 + vspltw bp2, b1, 3 + + addi AO, AO, 4 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp1, c01 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(158) + + LOAD_A a1, OFFSET_0, AO + addi AO, AO, 4 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp2, c01 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(158) + .align 4 + + +LL(151): + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + LOAD_B b1, OFFSET_0, BO + + srawi. r0, J, 2 + mtspr CTR, r0 + ble LL(155) + .align 4 + +LL(153): + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c01, a2, bp2, c01 + vspltw bp1, b1, 2 + vmaddfp c01, a3, bp1, c01 + vspltw bp2, b1, 3 + vmaddfp c01, a4, bp2, c01 + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + LOAD_B b1, OFFSET_0, BO + + bdnz LL(153) + .align 4 + +LL(155): + andi. r0, J, 3 + ble+ LL(158) + + cmpwi cr0, r0, 3 + bne LL(156) + + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c01, a2, bp2, c01 + vspltw bp1, b1, 2 + vmaddfp c01, a3, bp1, c01 + + addi AO, AO, 12 * SIZE + addi BO, BO, 3 * SIZE + b LL(158) + .align 4 + +LL(156): + cmpwi cr0, r0, 2 + bne LL(157) + + vspltw bp1, b1, 0 + vspltw bp2, b1, 1 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c01, a2, bp2, c01 + + addi AO, AO, 8 * SIZE + addi BO, BO, 2 * SIZE + b LL(158) + .align 4 + +LL(157): + cmpwi cr0, r0, 1 + bne LL(158) + + vspltw bp1, b1, 0 + + vmaddfp c01, a1, bp1, c01 + + addi AO, AO, 4 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(158): + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + addi CO1, CO1, 4 * SIZE + .align 4 + +LL(160): + andi. I, M, 2 + ble LL(170) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + LFD f10, 2 * SIZE(AO) + LFD f11, 3 * SIZE(AO) + + LFD f12, 0 * SIZE(B) + LFD f13, 1 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(165) + .align 4 + +LL(162): + FMADD f0, f8, f12, f0 + FMADD f1, f9, f12, f1 + FMADD f2, f10, f13, f2 + FMADD f3, f11, f13, f3 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + LFD f10, 6 * SIZE(AO) + LFD f11, 7 * SIZE(AO) + + LFD f12, 2 * SIZE(BO) + LFD f13, 3 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 2 * SIZE + bdnz LL(162) + .align 4 + +LL(165): + andi. r0, K, 1 + lfs f13, ALPHA(SP) + ble LL(168) + .align 4 + +LL(166): + FMADD f0, f8, f12, f0 + FMADD f1, f9, f12, f1 + + addi AO, AO, 2 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(168): + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + + FADD f0, f0, f2 + FADD f1, f1, f3 + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + addi CO1, CO1, 2 * SIZE + .align 4 + +LL(170): + andi. I, M, 1 + ble LL(999) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(175) + .align 4 + +LL(172): + FMADD f0, f8, f10, f0 + FMADD f1, f9, f11, f1 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + LFD f10, 2 * SIZE(BO) + LFD f11, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + bdnz LL(172) + .align 4 + +LL(175): + andi. r0, K, 1 + lfs f13, ALPHA(SP) + ble LL(178) + .align 4 + +LL(176): + FMADD f0, f8, f10, f0 + + addi AO, AO, 1 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(178): + LFD f8, 0 * SIZE(CO1) + + FADD f0, f0, f1 + + FMADD f0, f0, f13, f8 + + STFD f0, 0 * SIZE(CO1) + .align 4 + +LL(999): + mr SP, STACK + + li r0, 0 * 16 + lvx v20, SP, r0 + li r0, 1 * 16 + lvx v21, SP, r0 + li r0, 2 * 16 + lvx v22, SP, r0 + li r0, 3 * 16 + lvx v23, SP, r0 + li r0, 4 * 16 + lvx v24, SP, r0 + li r0, 5 * 16 + lvx v25, SP, r0 + li r0, 6 * 16 + lvx v26, SP, r0 + li r0, 7 * 16 + lvx v27, SP, r0 + li r0, 8 * 16 + lvx v28, SP, r0 + li r0, 9 * 16 + lvx v29, SP, r0 + li r0, 10 * 16 + lvx v30, SP, r0 + li r0, 11 * 16 + lvx v31, SP, r0 + + mtspr VRsave, VREG + +#ifdef __64BIT__ + ld r31, 192(SP) + ld r30, 200(SP) + ld r29, 208(SP) + ld r28, 216(SP) + ld r27, 224(SP) + ld r26, 232(SP) + ld r25, 240(SP) + ld r24, 248(SP) + ld r23, 256(SP) + ld r22, 264(SP) + ld r21, 272(SP) + ld r20, 280(SP) + ld r19, 288(SP) + ld r18, 296(SP) + ld r17, 304(SP) + ld r16, 312(SP) + ld r15, 320(SP) + ld r14, 328(SP) +#else + lwz r31, 192(SP) + lwz r30, 196(SP) + lwz r29, 200(SP) + lwz r28, 204(SP) + lwz r27, 208(SP) + lwz r26, 212(SP) + lwz r25, 216(SP) + lwz r24, 220(SP) + lwz r23, 224(SP) + lwz r22, 228(SP) + lwz r21, 232(SP) + lwz r20, 236(SP) + lwz r19, 240(SP) + lwz r18, 244(SP) + lwz r17, 248(SP) + lwz r16, 252(SP) + lwz r15, 256(SP) + lwz r14, 260(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/gemm_kernel_cell.S b/kernel/power/gemm_kernel_cell.S new file mode 100644 index 0000000..0b0d75f --- /dev/null +++ b/kernel/power/gemm_kernel_cell.S @@ -0,0 +1,2642 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREB r30 +#define PREC r31 + +#ifndef NEEDPARAM + +#ifndef DOUBLE +#include "../sparam.h" +#else +#include "../dparam.h" +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) +#if defined(TRMMKERNEL) + std r19, 240(SP) + std r18, 248(SP) +#endif +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) +#if defined(TRMMKERNEL) + stw r19, 192(SP) + stw r18, 196(SP) +#endif +#endif + + stfd f1, ALPHA + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST +/* Normal prefetch */ +#ifdef CELL + li PREC, 4 * SIZE +#endif + +#ifdef linux +#ifndef __64BIT__ + mr PREA, r10 + lwz PREB, 8 + STACKSIZE(SP) + lwz PREC, 12 + STACKSIZE(SP) +#else + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ +xc ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 60 + STACKSIZE(SP) + lwz PREB, 64 + STACKSIZE(SP) + lwz PREC, 68 + STACKSIZE(SP) +#else + lwz PREA, 56 + STACKSIZE(SP) + lwz PREB, 60 + STACKSIZE(SP) + lwz PREC, 64 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST + li PREC, 3 * SIZE + li PREA, 16 * 12 * SIZE + li PREB, 16 * 12 * SIZE +#endif + + srawi. J, N, 2 + ble LL(40) + .align 4 + +LL(10): + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 2 + mr AO, A + add C, CO4, LDC + ble LL(20) + .align 4 + +LL(11): +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + + LFD f28, 4 * SIZE(B) + LFD f29, 5 * SIZE(B) + LFD f30, 6 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, r0 + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) +#endif + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 4 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble LL(15) + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + + LFD f28, 4 * SIZE(B) + LFD f29, 5 * SIZE(B) + LFD f30, 6 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + ble LL(15) +#endif + .align 4 + +#define NOP1 mr r18, r18 +#define NOP2 mr r19, r19 + +LL(12): + FMADD f0, f16, f20, f0 + dcbt AO, PREA + FMADD f4, f16, f21, f4 + dcbt BO, PREB + FMADD f8, f16, f22, f8 + LFD f31, 7 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f27, 7 * SIZE(AO) + + FMADD f1, f17, f20, f1 + LFD f16, 8 * SIZE(AO) + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 9 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 10 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 8 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 9 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 10 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 11 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 11 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 12 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 13 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 14 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 12 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 13 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 14 * SIZE(BO) + FMADD f15, f27, f31, f15 + LFD f27, 15 * SIZE(AO) + + FMADD f0, f16, f20, f0 + LFD f31, 15 * SIZE(BO) + FMADD f4, f16, f21, f4 + NOP2 + FMADD f8, f16, f22, f8 + NOP1 + FMADD f12, f16, f23, f12 + LFD f16, 16 * SIZE(AO) + + FMADD f1, f17, f20, f1 + NOP1 + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 17 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 18 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 16 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 17 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 18 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 19 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 19 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 20 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 21 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 22 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 20 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 21 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 22 * SIZE(BO) + FMADD f15, f27, f31, f15 + addi AO, AO, 16 * SIZE + + addi BO, BO, 16 * SIZE + bdnz LL(12) + .align 4 + +LL(15): + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 4 +#endif + + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(18) + .align 4 + +LL(16): + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(18): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 + + FMADD f4, f4, f30, f20 + FMADD f5, f5, f30, f21 + FMADD f6, f6, f30, f22 + FMADD f7, f7, f30, f23 + + LFD f16, 0 * SIZE(CO3) + LFD f17, 1 * SIZE(CO3) + LFD f18, 2 * SIZE(CO3) + LFD f19, 3 * SIZE(CO3) + + LFD f20, 0 * SIZE(CO4) + LFD f21, 1 * SIZE(CO4) + LFD f22, 2 * SIZE(CO4) + LFD f23, 3 * SIZE(CO4) + + FMADD f8, f8, f30, f16 + FMADD f9, f9, f30, f17 + FMADD f10, f10, f30, f18 + FMADD f11, f11, f30, f19 + + FMADD f12, f12, f30, f20 + FMADD f13, f13, f30, f21 + FMADD f14, f14, f30, f22 + FMADD f15, f15, f30, f23 + +#else + + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 + + FMUL f4, f4, f30 + FMUL f5, f5, f30 + FMUL f6, f6, f30 + FMUL f7, f7, f30 + + FMUL f8, f8, f30 + FMUL f9, f9, f30 + FMUL f10, f10, f30 + FMUL f11, f11, f30 + + FMUL f12, f12, f30 + FMUL f13, f13, f30 + FMUL f14, f14, f30 + FMUL f15, f15, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -4 +#endif + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. I, M, 2 + ble LL(30) + +#if defined(TRMMKERNEL) +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(25) + .align 5 + +LL(22): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 16 * SIZE + dcbt 0, BO, PREB + bdnz LL(22) + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +LL(25): + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(28) + .align 4 + +LL(26): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(28): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f4, f4, f30, f18 + FMADD f5, f5, f30, f19 + + LFD f20, 0 * SIZE(CO3) + LFD f21, 1 * SIZE(CO3) + LFD f22, 0 * SIZE(CO4) + LFD f23, 1 * SIZE(CO4) + + FMADD f8, f8, f30, f20 + FMADD f9, f9, f30, f21 + FMADD f12, f12, f30, f22 + FMADD f13, f13, f30, f23 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f4, f4, f30 + FMUL f5, f5, f30 + + FMUL f8, f8, f30 + FMUL f9, f9, f30 + FMUL f12, f12, f30 + FMUL f13, f13, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +LL(30): + andi. I, M, 1 + ble LL(39) + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(35) + .align 5 + +LL(32): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f1, f17, f24, f1 + FMADD f5, f17, f25, f5 + FMADD f9, f17, f26, f9 + FMADD f13, f17, f27, f13 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f0, f18, f20, f0 + FMADD f4, f18, f21, f4 + FMADD f8, f18, f22, f8 + FMADD f12, f18, f23, f12 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f1, f19, f24, f1 + FMADD f5, f19, f25, f5 + FMADD f9, f19, f26, f9 + FMADD f13, f19, f27, f13 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 16 * SIZE + dcbt 0, BO, PREB + bdnz LL(32) + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +LL(35): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(36) + .align 4 + +LL(38): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f20, 0 * SIZE(CO3) + LFD f22, 0 * SIZE(CO4) + + FMADD f0, f0, f30, f16 + FMADD f4, f4, f30, f18 + FMADD f8, f8, f30, f20 + FMADD f12, f12, f30, f22 +#else + FMUL f0, f0, f30 + FMUL f4, f4, f30 + FMUL f8, f8, f30 + FMUL f12, f12, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + + +LL(39): +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 4 +#endif + + mr B, BO + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(40): + mr CO1, C + add CO2, C, LDC + andi. J, N, 2 + ble LL(70) + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. I, M, 2 + add C, CO2, LDC + mr AO, A + ble LL(50) + .align 4 + +LL(41): +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + + dcbt CO1, PREC + dcbt CO2, PREC + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(45) + .align 5 + +LL(42): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbt 0, BO, PREB + bdnz LL(42) + .align 4 + +LL(45): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ LL(48) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(48): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 + + FMADD f4, f4, f30, f20 + FMADD f5, f5, f30, f21 + FMADD f6, f6, f30, f22 + FMADD f7, f7, f30, f23 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 + + FMUL f4, f4, f30 + FMUL f5, f5, f30 + FMUL f6, f6, f30 + FMUL f7, f7, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addic. I, I, -1 + bgt+ LL(41) + .align 4 + +LL(50): + andi. I, M, 2 + ble LL(60) + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(55) + .align 5 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f22, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f17, f24, f1 + FMADD f2, f16, f25, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f19, f26, f5 + FMADD f6, f18, f27, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + dcbt 0, BO, PREB + bdnz LL(52) + .align 4 + +LL(55): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ LL(58) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(56) + .align 4 + +LL(58): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) + + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 +#else + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +LL(60): + andi. I, M, 1 + ble LL(69) + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(65) + .align 5 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f17, f22, f2 + FMADD f3, f17, f23, f3 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f19, f26, f2 + FMADD f3, f19, f27, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(68) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(66) + .align 4 + +LL(68): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f18 +#else + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +LL(69): +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 2 +#endif + + mr B, BO + .align 4 + +LL(70): + mr CO1, C + andi. J, N, 1 + ble LL(999) + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. I, M, 2 + mr AO, A + ble LL(80) + .align 4 + +LL(71): +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + + dcbt CO1, PREC + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(75) + .align 5 + +LL(72): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f21, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f23, f0 + FMADD f1, f17, f23, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + dcbt 0, BO, PREB + bdnz LL(72) + .align 4 + +LL(75): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(78) + .align 4 + +LL(76): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(76) + .align 4 + +LL(78): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0 , TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addi CO1, CO1, 4 * SIZE + addic. I, I, -1 + bgt+ LL(71) + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(90) + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + +#endif + ble LL(85) + .align 5 + +LL(82): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + dcbt 0, BO, PREB + bdnz LL(82) + .align 4 + +LL(85): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(88) + .align 4 + +LL(86): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(86) + .align 4 + +LL(88): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 +#else + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + addi CO1, CO1, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0 , TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(999) + + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 3 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(95) + .align 5 + +LL(92): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(92) + .align 4 + +LL(95): + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP + +#else + + andi. r0, K, 7 + mtspr CTR, r0 + +#endif + ble+ LL(98) + .align 4 + +LL(96): + FMADD f0, f16, f20, f0 + LFD f16, 1 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + addi BO, BO, 1 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(96) + .align 4 + +LL(98): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + + FMADD f0, f0, f30, f16 +#else + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + + FMUL f0, f0, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) +#if defined(TRMMKERNEL) || defined(TRSMKERNEL) + ld r19, 240(SP) + ld r18, 248(SP) +#endif +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) +#if defined(TRMMKERNEL) || defined(TRSMKERNEL) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/gemm_kernel_g4.S b/kernel/power/gemm_kernel_g4.S new file mode 100644 index 0000000..1ee4b28 --- /dev/null +++ b/kernel/power/gemm_kernel_g4.S @@ -0,0 +1,2412 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREC r30 + + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) +#if defined(TRMMKERNEL) + std r19, 240(SP) + std r18, 248(SP) +#endif +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) +#if defined(TRMMKERNEL) + stw r19, 192(SP) + stw r18, 196(SP) +#endif +#endif + + stfd f1, ALPHA + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + + li PREA, 8 * 8 * SIZE + li PREC, 3 * SIZE + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + srawi. J, N, 2 + ble .L40 + .align 4 + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define B1 f22 +#define B2 f23 +#define B3 f24 +#define B4 f25 +#define B5 f26 +#define B6 f27 +#define B7 f28 +#define B8 f29 +#define B9 f30 +#define B10 f31 + + +.L10: + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 2 + mr AO, A + add C, CO4, LDC + ble .L20 + .align 4 + +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A3, 2 * SIZE(AO) + LFDU A5, 4 * SIZE(AO) + + LFD B1, 0 * SIZE(B) + LFD B2, 1 * SIZE(B) + LFD B3, 2 * SIZE(B) + LFD B4, 3 * SIZE(B) + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, r0 + + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A3, 2 * SIZE(AO) + LFDU A5, 4 * SIZE(AO) + + LFD B1, 0 * SIZE(BO) + LFD B2, 1 * SIZE(BO) + LFD B3, 2 * SIZE(BO) + LFD B4, 3 * SIZE(BO) +#endif + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 4 +#endif + srawi. TEMP, TEMP, 1 + mtspr CTR, TEMP + ble .L15 + +#else + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A3, 2 * SIZE(AO) + LFDU A5, 4 * SIZE(AO) + + LFD B1, 0 * SIZE(B) + LFD B2, 1 * SIZE(B) + LFD B3, 2 * SIZE(B) + LFD B4, 3 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, K, 1 + mtspr CTR, r0 + mr BO, B + ble .L15 +#endif + .align 4 + +.L12: + FMADD f0, A1, B1, f0 + LFDU B5, 4 * SIZE(BO) + FMADD f4, A1, B2, f4 + dcbt AO, PREA + + FMADD f8, A1, B3, f8 + LFD A4, -1 * SIZE(AO) + FMADD f12, A1, B4, f12 + dcbt BO, PREA + + FMADD f1, A2, B1, f1 + LFD B6, 1 * SIZE(BO) + FMADD f5, A2, B2, f5 + nop + + FMADD f9, A2, B3, f9 + LFDU A1, 4 * SIZE(AO) + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B1, f2 + LFD B7, 2 * SIZE(BO) + FMADD f6, A3, B2, f6 + nop + + FMADD f10, A3, B3, f10 + LFD A2, -3 * SIZE(AO) + FMADD f14, A3, B4, f14 + nop + + FMADD f3, A4, B1, f3 + LFD B8, 3 * SIZE(BO) + FMADD f7, A4, B2, f7 + nop + + FMADD f11, A4, B3, f11 + LFD A3, -2 * SIZE(AO) + FMADD f15, A4, B4, f15 + nop + + FMADD f0, A5, B5, f0 + LFDU B1, 4 * SIZE(BO) + FMADD f4, A5, B6, f4 + nop + + FMADD f8, A5, B7, f8 + LFD A4, -1 * SIZE(AO) + FMADD f12, A5, B8, f12 +#ifdef DOUBLE + dcbt BO, PREA +#else + nop +#endif + + FMADD f1, A2, B5, f1 + LFD B2, 1 * SIZE(BO) + FMADD f5, A2, B6, f5 + nop + + FMADD f9, A2, B7, f9 + LFDU A5, 4 * SIZE(AO) + FMADD f13, A2, B8, f13 +#ifdef DOUBLE + dcbt AO, PREA +#else + nop +#endif + + FMADD f2, A3, B5, f2 + LFD B3, 2 * SIZE(BO) + FMADD f6, A3, B6, f6 + nop + + FMADD f10, A3, B7, f10 + LFD A2, -3 * SIZE(AO) + FMADD f14, A3, B8, f14 + nop + + FMADD f3, A4, B5, f3 + LFD B4, 3 * SIZE(BO) + FMADD f7, A4, B6, f7 + nop + + FMADD f11, A4, B7, f11 + LFD A3, -2 * SIZE(AO) + FMADD f15, A4, B8, f15 + bdnz .L12 + .align 4 + +.L15: + addi AO, AO, -4 * SIZE + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 4 +#endif + + andi. TEMP, TEMP, 1 +#else + + andi. r0, K, 1 +#endif + ble+ .L18 + +.L16: + LFD A4, 3 * SIZE(AO) + + FMADD f0, A1, B1, f0 + FMADD f4, A1, B2, f4 + FMADD f8, A1, B3, f8 + FMADD f12, A1, B4, f12 + + FMADD f1, A2, B1, f1 + FMADD f5, A2, B2, f5 + FMADD f9, A2, B3, f9 + FMADD f13, A2, B4, f13 + + FMADD f2, A3, B1, f2 + FMADD f6, A3, B2, f6 + FMADD f10, A3, B3, f10 + FMADD f14, A3, B4, f14 + + FMADD f3, A4, B1, f3 + FMADD f7, A4, B2, f7 + FMADD f11, A4, B3, f11 + FMADD f15, A4, B4, f15 + addi AO, AO, 4 * SIZE + addi BO, BO, 4 * SIZE + + .align 4 + +.L18: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + LFD f16, 0 * SIZE(CO3) + FMADD f1, f1, f30, f17 + LFD f17, 1 * SIZE(CO3) + FMADD f2, f2, f30, f18 + LFD f18, 2 * SIZE(CO3) + FMADD f3, f3, f30, f19 + LFD f19, 3 * SIZE(CO3) + + FMADD f4, f4, f30, f20 + LFD f20, 0 * SIZE(CO4) + FMADD f5, f5, f30, f21 + LFD f21, 1 * SIZE(CO4) + FMADD f6, f6, f30, f22 + LFD f22, 2 * SIZE(CO4) + FMADD f7, f7, f30, f23 + LFD f23, 3 * SIZE(CO4) + + FMADD f8, f8, f30, f16 + FMADD f9, f9, f30, f17 + FMADD f10, f10, f30, f18 + FMADD f11, f11, f30, f19 + + FMADD f12, f12, f30, f20 + FMADD f13, f13, f30, f21 + FMADD f14, f14, f30, f22 + FMADD f15, f15, f30, f23 + +#else + + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 + + FMUL f4, f4, f30 + FMUL f5, f5, f30 + FMUL f6, f6, f30 + FMUL f7, f7, f30 + + FMUL f8, f8, f30 + FMUL f9, f9, f30 + FMUL f10, f10, f30 + FMUL f11, f11, f30 + + FMUL f12, f12, f30 + FMUL f13, f13, f30 + FMUL f14, f14, f30 + FMUL f15, f15, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + fmr f4, f0 + STFD f5, 1 * SIZE(CO2) + fmr f5, f0 + STFD f6, 2 * SIZE(CO2) + fmr f6, f0 + STFD f7, 3 * SIZE(CO2) + fmr f7, f0 + + STFD f8, 0 * SIZE(CO3) + fmr f8, f0 + STFD f9, 1 * SIZE(CO3) + fmr f9, f0 + STFD f10, 2 * SIZE(CO3) + fmr f10, f0 + STFD f11, 3 * SIZE(CO3) + fmr f11, f0 + + STFD f12, 0 * SIZE(CO4) + fmr f12, f0 + STFD f13, 1 * SIZE(CO4) + fmr f13, f0 + STFD f14, 2 * SIZE(CO4) + fmr f14, f0 + STFD f15, 3 * SIZE(CO4) + fmr f15, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -4 +#endif + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addic. I, I, -1 + bgt+ .L11 + .align 4 + +.L20: + andi. I, M, 2 + ble .L30 + +#if defined(TRMMKERNEL) +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble .L25 + .align 5 + +.L22: + FMADD f0, f16, f20, f0 + nop + FMADD f1, f17, f20, f1 + LFD f20, 8 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 9 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 10 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f16, 4 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + LFD f17, 5 * SIZE(AO) + FMADD f3, f19, f24, f3 + LFD f24, 12 * SIZE(BO) + FMADD f6, f18, f25, f6 + nop + FMADD f7, f19, f25, f7 + LFD f25, 13 * SIZE(BO) + + FMADD f10, f18, f26, f10 + nop + FMADD f11, f19, f26, f11 + LFD f26, 14 * SIZE(BO) + FMADD f14, f18, f27, f14 + LFD f18, 6 * SIZE(AO) + FMADD f15, f19, f27, f15 + LFD f27, 15 * SIZE(BO) + + FMADD f0, f16, f20, f0 + LFD f19, 7 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 16 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 8 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f23, 3 * SIZE(BO) + + FMADD f2, f18, f24, f2 + LFD f17, 1 * SIZE(AO) + FMADD f3, f19, f24, f3 + LFD f24, 4 * SIZE(BO) + FMADD f6, f18, f25, f6 + nop + FMADD f7, f19, f25, f7 + LFD f25, 5 * SIZE(BO) + + FMADD f10, f18, f26, f10 + nop + FMADD f11, f19, f26, f11 + LFD f26, 6 * SIZE(BO) + FMADD f14, f18, f27, f14 + LFD f18, 2 * SIZE(AO) + FMADD f15, f19, f27, f15 + LFD f19, 3 * SIZE(AO) + LFD f27, 7 * SIZE(BO) + bdnz .L22 + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +.L25: + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ .L28 + .align 4 + +.L26: + FMADD f0, f16, f20, f0 + nop + FMADD f1, f17, f20, f1 + LFDU f20, 4 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 2 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f17, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L26 + .align 4 + +.L28: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f4, f4, f30, f18 + FMADD f5, f5, f30, f19 + + LFD f20, 0 * SIZE(CO3) + LFD f21, 1 * SIZE(CO3) + LFD f22, 0 * SIZE(CO4) + LFD f23, 1 * SIZE(CO4) + + FMADD f8, f8, f30, f20 + FMADD f9, f9, f30, f21 + FMADD f12, f12, f30, f22 + FMADD f13, f13, f30, f23 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f4, f4, f30 + FMUL f5, f5, f30 + + FMUL f8, f8, f30 + FMUL f9, f9, f30 + FMUL f12, f12, f30 + FMUL f13, f13, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +.L30: + andi. I, M, 1 + ble .L39 + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble .L35 + .align 5 + +.L32: + FMADD f0, f16, f20, f0 + LFD f20, 8 * SIZE(BO) + FMADD f4, f16, f21, f4 + LFD f21, 9 * SIZE(BO) + FMADD f8, f16, f22, f8 + LFD f22, 10 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f23, 11 * SIZE(BO) + LFDU f16, 4 * SIZE(AO) + + FMADD f1, f17, f24, f1 + LFD f24, 12 * SIZE(BO) + FMADD f5, f17, f25, f5 + LFD f25, 13 * SIZE(BO) + FMADD f9, f17, f26, f9 + LFD f26, 14 * SIZE(BO) + FMADD f13, f17, f27, f13 + LFD f27, 15 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + + FMADD f0, f18, f20, f0 + LFDU f20, 16 * SIZE(BO) + FMADD f4, f18, f21, f4 + LFD f21, 1 * SIZE(BO) + FMADD f8, f18, f22, f8 + LFD f22, 2 * SIZE(BO) + FMADD f12, f18, f23, f12 + LFD f23, 3 * SIZE(BO) + LFD f18, 2 * SIZE(AO) + + FMADD f1, f19, f24, f1 + LFD f24, 4 * SIZE(BO) + FMADD f5, f19, f25, f5 + LFD f25, 5 * SIZE(BO) + FMADD f9, f19, f26, f9 + LFD f26, 6 * SIZE(BO) + FMADD f13, f19, f27, f13 + LFD f27, 7 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L32 + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +.L35: + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ .L38 + .align 4 + +.L36: + FMADD f0, f16, f20, f0 + LFDU f20, 4 * SIZE(BO) + FMADD f4, f16, f21, f4 + LFD f21, 1 * SIZE(BO) + FMADD f8, f16, f22, f8 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L36 + .align 4 + +.L38: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f20, 0 * SIZE(CO3) + LFD f22, 0 * SIZE(CO4) + + FMADD f0, f0, f30, f16 + FMADD f4, f4, f30, f18 + FMADD f8, f8, f30, f20 + FMADD f12, f12, f30, f22 +#else + FMUL f0, f0, f30 + FMUL f4, f4, f30 + FMUL f8, f8, f30 + FMUL f12, f12, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 4 +#endif + + mr B, BO + addic. J, J, -1 + bgt .L10 + .align 4 + +.L40: + mr CO1, C + add CO2, C, LDC + andi. J, N, 2 + ble .L70 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. I, M, 2 + add C, CO2, LDC + mr AO, A + ble .L50 + .align 4 + +.L41: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble .L45 + .align 5 + +.L42: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFD f20, 4 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFD f16, 4 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 5 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 6 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFD f21, 5 * SIZE(BO) + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + LFD f22, 6 * SIZE(BO) + + FMADD f4, f16, f23, f4 + LFD f16, 8 * SIZE(AO) + FMADD f5, f17, f23, f5 + LFD f17, 9 * SIZE(AO) + FMADD f6, f18, f23, f6 + LFD f18, 10 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f20, f0 + LFD f23, 7 * SIZE(BO) + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFDU f20, 8 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFD f16, 12 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 13 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 14 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFD f21, 1 * SIZE(BO) + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + LFD f22, 2 * SIZE(BO) + + FMADD f4, f16, f23, f4 + LFDU f16, 16 * SIZE(AO) + FMADD f5, f17, f23, f5 + LFD f17, 1 * SIZE(AO) + FMADD f6, f18, f23, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 3 * SIZE(AO) + + LFD f23, 3 * SIZE(BO) + bdnz .L42 + .align 4 + +.L45: + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L48 + .align 4 + +.L46: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFDU f20, 2 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFDU f16, 4 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 1 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 3 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L46 + .align 4 + +.L48: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 + + FMADD f4, f4, f30, f20 + FMADD f5, f5, f30, f21 + FMADD f6, f6, f30, f22 + FMADD f7, f7, f30, f23 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 + + FMUL f4, f4, f30 + FMUL f5, f5, f30 + FMUL f6, f6, f30 + FMUL f7, f7, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addic. I, I, -1 + bgt+ .L41 + .align 4 + +.L50: + andi. I, M, 2 + ble .L60 + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble .L55 + .align 5 + +.L52: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + LFDU f20, 8 * SIZE(BO) + FMADD f2, f16, f21, f2 + LFD f16, 4 * SIZE(AO) + FMADD f3, f17, f21, f3 + LFD f17, 5 * SIZE(AO) + + FMADD f4, f18, f22, f4 + LFD f21, 1 * SIZE(BO) + FMADD f5, f19, f22, f5 + LFD f22, 2 * SIZE(BO) + FMADD f6, f18, f23, f6 + LFD f18, 6 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f24, f0 + LFD f23, 3 * SIZE(BO) + FMADD f1, f17, f24, f1 + LFD f24, 4 * SIZE(BO) + FMADD f2, f16, f25, f2 + LFDU f16, 8 * SIZE(AO) + FMADD f3, f17, f25, f3 + LFD f17, 1 * SIZE(AO) + + FMADD f4, f18, f26, f4 + LFD f25, 5 * SIZE(BO) + FMADD f5, f19, f26, f5 + LFD f26, 6 * SIZE(BO) + FMADD f6, f18, f27, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f27, f7 + LFD f19, 3 * SIZE(AO) + + LFD f27, 7 * SIZE(BO) + bdnz .L52 + .align 4 + +.L55: + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L58 + .align 4 + +.L56: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + LFDU f20, 2 * SIZE(BO) + FMADD f2, f16, f21, f2 + LFDU f16, 2 * SIZE(AO) + FMADD f3, f17, f21, f3 + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L56 + .align 4 + +.L58: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) + + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 +#else + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +.L60: + andi. I, M, 1 + ble .L69 + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble .L65 + .align 5 + +.L62: + FMADD f0, f16, f20, f0 + LFDU f20, 8 * SIZE(BO) + FMADD f1, f16, f21, f1 + LFDU f16, 4 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + FMADD f2, f17, f22, f2 + LFD f22, 2 * SIZE(BO) + FMADD f3, f17, f23, f3 + LFD f17, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + + FMADD f0, f18, f24, f0 + LFD f24, 4 * SIZE(BO) + FMADD f1, f18, f25, f1 + LFD f18, 2 * SIZE(AO) + LFD f25, 5 * SIZE(BO) + FMADD f2, f19, f26, f2 + LFD f26, 6 * SIZE(BO) + FMADD f3, f19, f27, f3 + LFD f19, 3 * SIZE(AO) + LFD f27, 7 * SIZE(BO) + bdnz .L62 + .align 4 + +.L65: + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ .L68 + .align 4 + +.L66: + FMADD f0, f16, f20, f0 + LFDU f20, 2 * SIZE(BO) + FMADD f1, f16, f21, f1 + LFDU f16, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L66 + .align 4 + +.L68: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f18 +#else + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +.L69: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 2 +#endif + + mr B, BO + .align 4 + +.L70: + mr CO1, C + andi. J, N, 1 + ble .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. I, M, 2 + mr AO, A + ble .L80 + .align 4 + +.L71: +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + ble .L75 + +#endif + ble .L75 + .align 5 + +.L72: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFD f17, 5 * SIZE(AO) + FMADD f2, f18, f20, f2 + LFD f18, 6 * SIZE(AO) + FMADD f3, f19, f20, f3 + LFD f19, 7 * SIZE(AO) + LFDU f20, 4 * SIZE(BO) + + FMADD f0, f16, f21, f0 + LFD f16, 8 * SIZE(AO) + FMADD f1, f17, f21, f1 + LFD f17, 9 * SIZE(AO) + FMADD f2, f18, f21, f2 + LFD f18, 10 * SIZE(AO) + FMADD f3, f19, f21, f3 + LFD f19, 11 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + + FMADD f0, f16, f22, f0 + LFD f16, 12 * SIZE(AO) + FMADD f1, f17, f22, f1 + LFD f17, 13 * SIZE(AO) + FMADD f2, f18, f22, f2 + LFD f18, 14 * SIZE(AO) + FMADD f3, f19, f22, f3 + LFD f19, 15 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + + FMADD f0, f16, f23, f0 + LFDU f16, 16 * SIZE(AO) + FMADD f1, f17, f23, f1 + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f23, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f23, f3 + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L72 + .align 4 + +.L75: + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ .L78 + .align 4 + +.L76: + FMADD f0, f16, f20, f0 + LFDU f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f20, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f20, f3 + LFDU f20, 1 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L76 + .align 4 + +.L78: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0 , TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addi CO1, CO1, 4 * SIZE + addic. I, I, -1 + bgt+ .L71 + .align 4 + +.L80: + andi. I, M, 2 + ble .L90 + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + +#endif + ble .L85 + .align 5 + +.L82: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 4 * SIZE(BO) + LFD f17, 5 * SIZE(AO) + FMADD f2, f18, f21, f2 + LFD f18, 6 * SIZE(AO) + FMADD f3, f19, f21, f3 + LFD f21, 1 * SIZE(BO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFDU f16, 8 * SIZE(AO) + FMADD f1, f17, f22, f1 + LFD f22, 2 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f23, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f23, f3 + LFD f23, 3 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L82 + .align 4 + +.L85: + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ .L88 + .align 4 + +.L86: + FMADD f0, f16, f20, f0 + LFDU f16, 2 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 1 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + bdnz .L86 + .align 4 + +.L88: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 +#else + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + addi CO1, CO1, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0 , TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +.L90: + andi. I, M, 1 + ble .L999 + + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 3 + mtspr CTR, r0 + mr BO, B +#endif + ble .L95 + .align 5 + +.L92: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + FMADD f1, f17, f21, f1 + LFD f17, 5 * SIZE(AO) + LFD f21, 5 * SIZE(BO) + FMADD f2, f18, f22, f2 + LFD f18, 6 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + FMADD f3, f19, f23, f3 + LFD f19, 7 * SIZE(AO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + LFDU f16, 8 * SIZE(AO) + LFDU f20, 8 * SIZE(BO) + FMADD f1, f17, f21, f1 + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + FMADD f2, f18, f22, f2 + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + FMADD f3, f19, f23, f3 + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L92 + .align 4 + +.L95: + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP + +#else + + andi. r0, K, 7 + mtspr CTR, r0 + +#endif + ble+ .L98 + .align 4 + +.L96: + FMADD f0, f16, f20, f0 + LFDU f16, 1 * SIZE(AO) + LFDU f20, 1 * SIZE(BO) + bdnz .L96 + .align 4 + +.L98: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + + FMADD f0, f0, f30, f16 +#else + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + + FMUL f0, f0, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + .align 4 + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) +#if defined(TRMMKERNEL) || defined(TRSMKERNEL) + ld r19, 240(SP) + ld r18, 248(SP) +#endif +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) +#if defined(TRMMKERNEL) || defined(TRSMKERNEL) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE diff --git a/kernel/power/gemm_kernel_hummer.S b/kernel/power/gemm_kernel_hummer.S new file mode 100644 index 0000000..6b4e6b9 --- /dev/null +++ b/kernel/power/gemm_kernel_hummer.S @@ -0,0 +1,7006 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define ALPHA 0 +#define FZERO 8 + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#endif + +#define TEMP r11 +#define KK r14 +#define INCM1 r15 +#define INCM3 r16 +#define INCM5 r17 +#define INCM7 r18 +#define INC2 r19 +#define INC r20 +#define INC4 r21 + +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define AO2 r26 +#define BO2 r27 + +#define CO1 r28 +#define CO2 r29 +#define CO3 r30 +#define CO4 r31 + +#ifndef NEEDPARAM + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define A7 f22 +#define A8 f23 +#define A9 f24 +#define A10 f25 + +#define B1 f26 +#define B2 f27 +#define B3 f28 +#define B4 f29 +#define B5 f30 +#define B6 f31 + +#define AP B6 + + + PROLOGUE + PROFCODE + + li r0, -16 + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + stfpdux f16, SP, r0 + stfpdux f17, SP, r0 + stfpdux f18, SP, r0 + stfpdux f19, SP, r0 + stfpdux f20, SP, r0 + stfpdux f21, SP, r0 + stfpdux f22, SP, r0 + stfpdux f23, SP, r0 + stfpdux f24, SP, r0 + stfpdux f25, SP, r0 + stfpdux f26, SP, r0 + stfpdux f27, SP, r0 + stfpdux f28, SP, r0 + stfpdux f29, SP, r0 + stfpdux f30, SP, r0 + stfpdux f31, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + stwu r29, -4(SP) + stwu r28, -4(SP) + + stwu r27, -4(SP) + stwu r26, -4(SP) + stwu r25, -4(SP) + stwu r24, -4(SP) + + stwu r23, -4(SP) + stwu r22, -4(SP) + stwu r21, -4(SP) + stwu r20, -4(SP) + + stwu r19, -4(SP) + stwu r18, -4(SP) + stwu r17, -4(SP) + stwu r16, -4(SP) + + stwu r15, -4(SP) + stwu r14, -4(SP) # dummy + + li r0, 0 + + stwu r0, -4(SP) + stwu r0, -4(SP) + stfdu f1, -8(SP) + + slwi LDC, LDC, BASE_SHIFT + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + li INC, 1 * SIZE + li INC2, 2 * SIZE + li INC4, 4 * SIZE + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + + andi. r0, C, 2 * SIZE - 1 + bne .L1000 + andi. r0, LDC, 2 * SIZE - 1 + bne .L1000 + +/* High performance version */ + + li INCM3, -2 * SIZE + li INCM5, -5 * SIZE + li INCM7, -6 * SIZE + + addi C, C, - 2 * SIZE + srawi. J, N, 2 + ble .L50 + .align 4 + +.L10: + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + add C, CO4, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + addi AO, A, -4 * SIZE + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 3 + ble .L20 + .align 4 + +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 +#else + slwi TEMP, KK, 3 + BASE_SHIFT + slwi r0, KK, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 +#endif + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 8 +#else + addi TEMP, KK, 4 +#endif + srawi. TEMP, TEMP, 2 + fpmr f1, f0 + mtspr CTR, TEMP + ble .L14 + +#else + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, K, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#endif + + LFPDUX A1, AO, INC4 + fpmr f5, f0 + LFPDUX A3, AO, INC4 + fpmr f9, f0 + LFPDUX B1, BO, INC4 + fpmr f13, f0 + + LFPDUX A5, AO, INC4 + fpmr f2, f0 + LFPDUX A6, AO, INC4 + fpmr f6, f0 + LFPDUX B3, BO, INC4 + fpmr f10, f0 + LFPDUX A7, AO, INC4 + fpmr f14, f0 + + LFPDUX A8, AO, INC4 + fpmr f3, f0 + LFPDUX B5, BO, INC4 + fpmr f7, f0 + LFPDUX A9, AO, INC4 + fpmr f11, f0 + LFPDUX A2, AO2, INC4 + fpmr f15, f0 + LFPDUX B2, BO2, INC4 + bdz- .L13 + .align 4 + +.L12: + +## 1 ## + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + nop + fxcpmadd f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + nop + fxcpmadd f10, B2, A3, f10 + nop + fxcsmadd f14, B2, A3, f14 + nop + + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + LFPDUX A1, AO, INC4 + fxcsmadd f15, B2, A4, f15 + nop + +## 2 ## + + fxcpmadd f0, B3, A5, f0 + nop + fxcsmadd f4, B3, A5, f4 + nop + fxcpmadd f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A5, f12 + LFPDUX B1, BO, INC4 + + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + LFPDUX A3, AO, INC4 + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B3, A6, f2 + nop + fxcsmadd f6, B3, A6, f6 + nop + fxcpmadd f10, B4, A6, f10 + nop + fxcsmadd f14, B4, A6, f14 + nop + + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B4, A4, f11 + LFPDUX A5, AO, INC4 + fxcsmadd f15, B4, A4, f15 + nop + +## 3 ## + + fxcpmadd f0, B5, A7, f0 + nop + fxcsmadd f4, B5, A7, f4 + nop + fxcpmadd f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A7, f12 + LFPDUX B3, BO, INC4 + + fxcpmadd f1, B5, A2, f1 + nop + fxcsmadd f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A6, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B5, A8, f2 + nop + fxcsmadd f6, B5, A8, f6 + nop + fxcpmadd f10, B2, A8, f10 + nop + fxcsmadd f14, B2, A8, f14 + nop + + fxcpmadd f3, B5, A4, f3 + nop + fxcsmadd f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + LFPDUX A7, AO, INC4 + fxcsmadd f15, B2, A4, f15 + nop + +## 4 ## + fxcpmadd f0, B6, A9, f0 + nop + fxcsmadd f4, B6, A9, f4 + nop + fxcpmadd f8, B4, A9, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A9, f12 + LFPDUX B5, BO, INC4 + + fxcpmadd f1, B6, A2, f1 + nop + fxcsmadd f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + LFPDUX A8, AO, INC4 + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B6, A10, f2 + nop + fxcsmadd f6, B6, A10, f6 + nop + fxcpmadd f10, B4, A10, f10 + nop + fxcsmadd f14, B4, A10, f14 + nop + + fxcpmadd f3, B6, A4, f3 + LFPDUX A2, AO2, INC4 + fxcsmadd f7, B6, A4, f7 + LFPDUX A9, AO, INC4 + fxcpmadd f11, B4, A4, f11 + nop + fxcsmadd f15, B4, A4, f15 + bdnz+ .L12 + .align 4 + +.L13: +## 1 ## + + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + nop + fxcpmadd f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + nop + fxcpmadd f10, B2, A3, f10 + nop + fxcsmadd f14, B2, A3, f14 + nop + + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 +#ifndef TRMMKERNEL + LFPDUX A1, CO1, INC2 +#else + nop +#endif + fxcsmadd f15, B2, A4, f15 + nop + +## 2 ## + + fxcpmadd f0, B3, A5, f0 + nop + fxcsmadd f4, B3, A5, f4 + nop + fxcpmadd f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A5, f12 +#ifndef TRMMKERNEL + LFPDUX B1, CO1, INC4 +#else + nop +#endif + + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 +#ifndef TRMMKERNEL + LFPDUX A3, CO2, INC2 +#else + nop +#endif + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B3, A6, f2 + nop + fxcsmadd f6, B3, A6, f6 + nop + fxcpmadd f10, B4, A6, f10 + nop + fxcsmadd f14, B4, A6, f14 + nop + + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B4, A4, f11 +#ifndef TRMMKERNEL + LFPDUX A5, CO2, INC4 +#else + nop +#endif + fxcsmadd f15, B4, A4, f15 + nop + +## 3 ## + + fxcpmadd f0, B5, A7, f0 + nop + fxcsmadd f4, B5, A7, f4 + nop + fxcpmadd f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A7, f12 +#ifndef TRMMKERNEL + LFPDUX B3, CO3, INC2 +#else + nop +#endif + + fxcpmadd f1, B5, A2, f1 + nop + fxcsmadd f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 +#ifndef TRMMKERNEL + LFPDUX A6, CO3, INC4 +#else + nop +#endif + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B5, A8, f2 + nop + fxcsmadd f6, B5, A8, f6 + nop + fxcpmadd f10, B2, A8, f10 + nop + fxcsmadd f14, B2, A8, f14 + nop + + fxcpmadd f3, B5, A4, f3 + nop + fxcsmadd f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 +#ifndef TRMMKERNEL + LFPDUX A7, CO4, INC2 +#else + nop +#endif + fxcsmadd f15, B2, A4, f15 + nop + +## 4 ## + + fxcpmadd f0, B6, A9, f0 + nop + fxcsmadd f4, B6, A9, f4 + nop + fxcpmadd f8, B4, A9, f8 + nop + fxcsmadd f12, B4, A9, f12 +#ifndef TRMMKERNEL + LFPDUX B2, CO4, INC4 +#else + nop +#endif + + fxcpmadd f1, B6, A2, f1 + nop + fxcsmadd f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 +#ifndef TRMMKERNEL + LFPDUX B5, CO1, INCM3 +#else + nop +#endif + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B6, A10, f2 + nop + fxcsmadd f6, B6, A10, f6 + nop + fxcpmadd f10, B4, A10, f10 + nop + fxcsmadd f14, B4, A10, f14 +#ifndef TRMMKERNEL + LFPDUX A8, CO1, INC4 +#else + nop +#endif + + fxcpmadd f3, B6, A4, f3 + nop + fxcsmadd f7, B6, A4, f7 + nop + fxcpmadd f11, B4, A4, f11 + nop + fxcsmadd f15, B4, A4, f15 +#ifndef TRMMKERNEL + LFPDUX A9, CO2, INCM3 +#else + nop +#endif + .align 4 + +.L14: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 8 +#else + addi TEMP, KK, 4 +#endif + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L18 + + cmpwi cr0, TEMP, 3 + bgt+ .L15 +#else + andi. r0, K, 3 + mtspr CTR, r0 + ble+ .L18 + + cmpwi cr0, K, 3 + bgt+ .L15 +#endif + +#ifndef TRMMKERNEL + LFPDUX A1, CO1, INC2 + fpmr f5, f0 + LFPDUX B1, CO1, INC4 + fpmr f9, f0 + LFPDUX A3, CO2, INC2 + fpmr f13, f0 + LFPDUX A5, CO2, INC4 + fpmr f2, f0 + + LFPDUX B3, CO3, INC2 + fpmr f6, f0 + LFPDUX A6, CO3, INC4 + fpmr f10, f0 + LFPDUX A7, CO4, INC2 + fpmr f14, f0 + LFPDUX B2, CO4, INC4 + fpmr f3, f0 + + LFPDUX B5, CO1, INCM3 + fpmr f7, f0 + LFPDUX A8, CO1, INC4 + fpmr f11, f0 + LFPDUX A9, CO2, INCM3 + fpmr f15, f0 +#else + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + nop +#endif + .align 4 + +.L15: + LFPDUX A2, AO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A10, BO, INC4 + LFPDUX B4, BO2, INC4 + bdz- .L17 + .align 4 + +.L16: + fxcpmadd f0, A10, A2, f0 + fxcsmadd f4, A10, A2, f4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + fxcpmadd f1, A10, A4, f1 + fxcsmadd f5, A10, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + fxcpmadd f2, A10, A2, f2 + fxcsmadd f6, A10, A2, f6 + fxcpmadd f10, B4, A2, f10 + fxcsmadd f14, B4, A2, f14 + LFPDUX A2, AO, INC4 + + fxcpmadd f3, A10, A4, f3 + fxcsmadd f7, A10, A4, f7 + LFPDUX A10, BO, INC4 + fxcpmadd f11, B4, A4, f11 + fxcsmadd f15, B4, A4, f15 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + bdnz+ .L16 + .align 4 + +.L17: + fxcpmadd f0, A10, A2, f0 + fxcsmadd f4, A10, A2, f4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + fxcpmadd f1, A10, A4, f1 + fxcsmadd f5, A10, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + fxcpmadd f2, A10, A2, f2 + fxcsmadd f6, A10, A2, f6 + fxcpmadd f10, B4, A2, f10 + fxcsmadd f14, B4, A2, f14 + + fxcpmadd f3, A10, A4, f3 + fxcsmadd f7, A10, A4, f7 + fxcpmadd f11, B4, A4, f11 + fxcsmadd f15, B4, A4, f15 + .align 4 + +.L18: +#ifndef TRMMKERNEL + fxcpmadd f0, AP, f0, A1 + LFPDUX B4, CO2, INC4 + fxcpmadd f1, AP, f1, B5 + LFPDUX A2, CO3, INCM3 + + fxcpmadd f2, AP, f2, B1 + LFPDUX A4, CO3, INC4 + fxcpmadd f3, AP, f3, A8 + LFPDUX A10, CO4, INCM3 + + fxcpmadd f4, AP, f4, A3 + LFPDUX A1, CO4, INC4 + fxcpmadd f5, AP, f5, A9 + STFPDUX f0, CO1, INCM7 + + fxcpmadd f6, AP, f6, A5 + STFPDUX f1, CO1, INC2 + fxcpmadd f7, AP, f7, B4 + STFPDUX f2, CO1, INC2 + + fxcpmadd f8, AP, f8, B3 + STFPDUX f3, CO1, INC2 + fxcpmadd f9, AP, f9, A2 + STFPDUX f4, CO2, INCM7 + + fxcpmadd f10, AP, f10, A6 + STFPDUX f5, CO2, INC2 + fxcpmadd f11, AP, f11, A4 + STFPDUX f6, CO2, INC2 + + fxcpmadd f12, AP, f12, A7 + STFPDUX f7, CO2, INC2 + fxcpmadd f13, AP, f13, A10 + STFPDUX f8, CO3, INCM7 + + fxcpmadd f14, AP, f14, B2 + STFPDUX f9, CO3, INC2 + fxcpmadd f15, AP, f15, A1 + STFPDUX f10, CO3, INC2 + + STFPDUX f11, CO3, INC2 + STFPDUX f12, CO4, INCM7 + STFPDUX f13, CO4, INC2 + STFPDUX f14, CO4, INC2 + STFPDUX f15, CO4, INC2 +#else + fpmul f0, AP, f0 + fpmul f1, AP, f1 + fpmul f2, AP, f2 + fpmul f3, AP, f3 + + fpmul f4, AP, f4 + fpmul f5, AP, f5 + STFPDUX f0, CO1, INC2 + + fpmul f6, AP, f6 + STFPDUX f1, CO1, INC2 + fpmul f7, AP, f7 + STFPDUX f2, CO1, INC2 + + fpmul f8, AP, f8 + STFPDUX f3, CO1, INC2 + fpmul f9, AP, f9 + STFPDUX f4, CO2, INC2 + + fpmul f10, AP, f10 + STFPDUX f5, CO2, INC2 + fpmul f11, AP, f11 + STFPDUX f6, CO2, INC2 + + fpmul f12, AP, f12 + STFPDUX f7, CO2, INC2 + fpmul f13, AP, f13 + STFPDUX f8, CO3, INC2 + + fpmul f14, AP, f14 + STFPDUX f9, CO3, INC2 + fpmul f15, AP, f15 + STFPDUX f10, CO3, INC2 + + STFPDUX f11, CO3, INC2 + STFPDUX f12, CO4, INC2 + STFPDUX f13, CO4, INC2 + STFPDUX f14, CO4, INC2 + STFPDUX f15, CO4, INC2 +#endif + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -8 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 8 +#endif +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L11 + .align 4 + +.L20: + andi. I, M, 4 + beq .L30 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 +#else + slwi TEMP, KK, 2 + BASE_SHIFT + slwi r0, KK, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 4 +#endif + + srawi. TEMP, TEMP, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, TEMP + fpmr f13, f0 + ble .L24 +#else + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, K, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L24 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX B3, BO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A5, AO, INC4 + LFPDUX B5, BO, INC4 + LFPDUX A6, AO2, INC4 + LFPDUX B6, BO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A9, BO, INC4 + LFPDUX A10, BO2, INC4 + bdz- .L23 + .align 4 + +.L22: + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + fxcpmadd f8, B2, A1, f8 + nop + fxcsmadd f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + fxcpmadd f9, B2, A2, f9 + nop + fxcsmadd f13, B2, A2, f13 + LFPDUX B2, BO2, INC4 + + fxcpmadd f0, B3, A3, f0 + nop + fxcsmadd f4, B3, A3, f4 + LFPDUX A2, AO2, INC4 + fxcpmadd f8, B4, A3, f8 + nop + fxcsmadd f12, B4, A3, f12 + LFPDUX A3, AO, INC4 + + fxcpmadd f1, B3, A4, f1 + nop + fxcsmadd f5, B3, A4, f5 + LFPDUX B3, BO, INC4 + fxcpmadd f9, B4, A4, f9 + nop + fxcsmadd f13, B4, A4, f13 + LFPDUX B4, BO2, INC4 + + fxcpmadd f0, B5, A5, f0 + nop + fxcsmadd f4, B5, A5, f4 + LFPDUX A4, AO2, INC4 + fxcpmadd f8, B6, A5, f8 + nop + fxcsmadd f12, B6, A5, f12 + LFPDUX A5, AO, INC4 + + fxcpmadd f1, B5, A6, f1 + nop + fxcsmadd f5, B5, A6, f5 + LFPDUX B5, BO, INC4 + fxcpmadd f9, B6, A6, f9 + nop + fxcsmadd f13, B6, A6, f13 + LFPDUX B6, BO2, INC4 + + fxcpmadd f0, A9, A7, f0 + nop + fxcsmadd f4, A9, A7, f4 + LFPDUX A6, AO2, INC4 + fxcpmadd f8, A10, A7, f8 + nop + fxcsmadd f12, A10, A7, f12 + LFPDUX A7, AO, INC4 + + fxcpmadd f1, A9, A8, f1 + nop + fxcsmadd f5, A9, A8, f5 + LFPDUX A9, BO, INC4 + fxcpmadd f9, A10, A8, f9 + nop + fxcsmadd f13, A10, A8, f13 + LFPDUX A10, BO2, INC4 + bdnz+ .L22 + .align 4 + +.L23: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + + fxcpmadd f0, B3, A3, f0 + fxcsmadd f4, B3, A3, f4 + fxcpmadd f8, B4, A3, f8 + fxcsmadd f12, B4, A3, f12 + + fxcpmadd f1, B3, A4, f1 + fxcsmadd f5, B3, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, A4, f13 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f4, B5, A5, f4 + fxcpmadd f8, B6, A5, f8 + fxcsmadd f12, B6, A5, f12 + + fxcpmadd f1, B5, A6, f1 + fxcsmadd f5, B5, A6, f5 + fxcpmadd f9, B6, A6, f9 + fxcsmadd f13, B6, A6, f13 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f4, A9, A7, f4 + fxcpmadd f8, A10, A7, f8 + fxcsmadd f12, A10, A7, f12 + + fxcpmadd f1, A9, A8, f1 + fxcsmadd f5, A9, A8, f5 + fxcpmadd f9, A10, A8, f9 + fxcsmadd f13, A10, A8, f13 + .align 4 + +.L24: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L28 + + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + bdz- .L27 + .align 4 + +.L26: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + bdnz+ .L26 + .align 4 + +.L27: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + .align 4 + +.L28: +#ifndef TRMMKERNEL + LFPDUX A1, CO1, INC2 + LFPDUX B1, CO1, INC2 + LFPDUX B3, CO2, INC2 + LFPDUX A6, CO2, INC2 + + LFPDUX B5, CO3, INC2 + LFPDUX A8, CO3, INC2 + LFPDUX A2, CO4, INC2 + LFPDUX A4, CO4, INC2 + + fxcpmadd f0, AP, f0, A1 + fxcpmadd f1, AP, f1, B1 + fxcpmadd f4, AP, f4, B3 + fxcpmadd f5, AP, f5, A6 + + fxcpmadd f8, AP, f8, B5 + fxcpmadd f9, AP, f9, A8 + STFPDUX f0, CO1, INCM3 + fxcpmadd f12, AP, f12, A2 + STFPDUX f1, CO1, INC2 + fxcpmadd f13, AP, f13, A4 + STFPDUX f4, CO2, INCM3 + + STFPDUX f5, CO2, INC2 + STFPDUX f8, CO3, INCM3 + STFPDUX f9, CO3, INC2 + STFPDUX f12, CO4, INCM3 + STFPDUX f13, CO4, INC2 +#else + fpmul f0, AP, f0 + fpmul f1, AP, f1 + fpmul f4, AP, f4 + fpmul f5, AP, f5 + + fpmul f8, AP, f8 + fpmul f9, AP, f9 + STFPDUX f0, CO1, INC2 + fpmul f12, AP, f12 + STFPDUX f1, CO1, INC2 + fpmul f13, AP, f13 + STFPDUX f4, CO2, INC2 + + STFPDUX f5, CO2, INC2 + STFPDUX f8, CO3, INC2 + STFPDUX f9, CO3, INC2 + STFPDUX f12, CO4, INC2 + STFPDUX f13, CO4, INC2 +#endif + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L30: + andi. I, M, 2 + beq .L40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 +#else + slwi TEMP, KK, 1 + BASE_SHIFT + slwi r0, KK, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, BO, - 4 * SIZE + fpmr f2, f0 + addi BO2, BO, 2 * SIZE + fpmr f3, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L34 + +#else + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 + + srawi. r0, K, 2 + mtspr CTR, r0 + ble .L34 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L33 + .align 4 + +.L32: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + LFPDUX B1, BO, INC4 + fxcpmadd f2, B2, A1, f2 + fxcsmadd f3, B2, A1, f3 + LFPDUX B2, BO2, INC4 + LFPDUX A1, AO, INC4 + + fxcpmadd f0, B3, A2, f0 + fxcsmadd f1, B3, A2, f1 + LFPDUX B3, BO, INC4 + fxcpmadd f2, B4, A2, f2 + fxcsmadd f3, B4, A2, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A2, AO2, INC4 + + fxcpmadd f0, A5, A3, f0 + fxcsmadd f1, A5, A3, f1 + LFPDUX A5, BO, INC4 + fxcpmadd f2, A6, A3, f2 + fxcsmadd f3, A6, A3, f3 + LFPDUX A6, BO2, INC4 + LFPDUX A3, AO, INC4 + + fxcpmadd f0, A7, A4, f0 + fxcsmadd f1, A7, A4, f1 + LFPDUX A7, BO, INC4 + fxcpmadd f2, A8, A4, f2 + fxcsmadd f3, A8, A4, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L32 + .align 4 + +.L33: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + fxcpmadd f2, B2, A1, f2 + fxcsmadd f3, B2, A1, f3 + + fxcpmadd f0, B3, A2, f0 + fxcsmadd f1, B3, A2, f1 + fxcpmadd f2, B4, A2, f2 + fxcsmadd f3, B4, A2, f3 + + fxcpmadd f0, A5, A3, f0 + fxcsmadd f1, A5, A3, f1 + fxcpmadd f2, A6, A3, f2 + fxcsmadd f3, A6, A3, f3 + + fxcpmadd f0, A7, A4, f0 + fxcsmadd f1, A7, A4, f1 + fxcpmadd f2, A8, A4, f2 + fxcsmadd f3, A8, A4, f3 + .align 4 + +.L34: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L38 + + LFPDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdz- .L37 + .align 4 + +.L36: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + LFPDUX B1, BO, INC4 + fxcpmadd f2, B2, A1, f2 + fxcsmadd f3, B2, A1, f3 + LFPDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdnz+ .L36 + .align 4 + +.L37: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + fxcpmadd f2, B2, A1, f2 + fxcsmadd f3, B2, A1, f3 + .align 4 + +.L38: +#ifndef TRMMKERNEL + LFPDX A1, CO1, INC2 + LFPDX A2, CO2, INC2 + LFPDX A3, CO3, INC2 + LFPDX A4, CO4, INC2 + + fxcpmadd f0, AP, f0, A1 + fxcpmadd f1, AP, f1, A2 + fxcpmadd f2, AP, f2, A3 + fxcpmadd f3, AP, f3, A4 +#else + fpmul f0, AP, f0 + fpmul f1, AP, f1 + fpmul f2, AP, f2 + fpmul f3, AP, f3 +#endif + + STFPDUX f0, CO1, INC2 + STFPDUX f1, CO2, INC2 + STFPDUX f2, CO3, INC2 + STFPDUX f3, CO4, INC2 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L40: + andi. I, M, 1 + beq .L49 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 +#else + slwi TEMP, KK, 0 + BASE_SHIFT + slwi r0, KK, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, BO, - 4 * SIZE + fpmr f2, f0 + addi BO2, BO, 2 * SIZE + fpmr f3, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L44 + +#else + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 + + srawi. r0, K, 3 + mtspr CTR, r0 + ble .L44 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L43 + .align 4 + +.L42: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A1, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A1, B4, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A1, AO, INC4 + + fxcpmadd f0, A2, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A2, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A2, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A2, A8, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A2, AO2, INC4 + + fxcpmadd f0, A3, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A3, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A3, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A3, B4, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A3, AO, INC4 + + fxcpmadd f0, A4, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A4, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A4, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A4, A8, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L42 + .align 4 + +.L43: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A1, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A1, B4, f3 + LFPDUX B4, BO2, INC4 + + fxcpmadd f0, A2, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A2, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A2, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A2, A8, f3 + LFPDUX A8, BO2, INC4 + + fxcpmadd f0, A3, B1, f0 + fxcpmadd f1, A3, B2, f1 + fxcsmadd f2, A3, B3, f2 + fxcsmadd f3, A3, B4, f3 + + fxcpmadd f0, A4, A5, f0 + fxcpmadd f1, A4, A6, f1 + fxcsmadd f2, A4, A7, f2 + fxcsmadd f3, A4, A8, f3 + .align 4 + +.L44: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP +#else + andi. r0, K, 7 + mtspr CTR, r0 +#endif + ble+ .L48 + + LFDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC + bdz- .L47 + .align 4 + +.L46: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC + bdnz+ .L46 + .align 4 + +.L47: + fxcpmadd f0, A1, B1, f0 + fxcpmadd f1, A1, B2, f1 + .align 4 + +.L48: +#ifndef TRMMKERNEL + LFDX A1, CO1, INC2 + LFDX A2, CO2, INC2 + LFDX A3, CO3, INC2 + LFDX A4, CO4, INC2 + + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fsmfp A1, A2 + fsmfp A3, A4 + + fxcpmadd f0, AP, f0, A1 + fxcpmadd f1, AP, f1, A3 +#else + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fpmul f0, AP, f0 + fpmul f1, AP, f1 +#endif + + STFDX f0, CO1, INC2 + STFSDX f0, CO2, INC2 + STFDX f1, CO3, INC2 + STFSDX f1, CO4, INC2 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 4 +#endif + + addi B, BO, 4 * SIZE + + addic. J, J, -1 + bgt+ .L10 + .align 4 + +.L50: + andi. J, N, 2 + beq .L90 + + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + addi AO, A, -2 * SIZE + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 3 + ble .L60 + .align 4 + +.L51: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + fpmr f4, f0 + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 +#else + slwi TEMP, KK, 3 + BASE_SHIFT + slwi r0, KK, 1 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + fpmr f4, f0 + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 +#endif + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 8 +#else + addi TEMP, KK, 2 +#endif + srawi. r0, TEMP, 2 + fpmr f3, f0 + mtspr CTR, r0 + fpmr f7, f0 + ble .L54 +#else + fpmr f4, f0 + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + + srawi. r0, K, 2 + fpmr f3, f0 + mtspr CTR, r0 + fpmr f7, f0 + ble .L54 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L53 + .align 4 + +.L52: + fxcpmadd f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + nop + fxcsmadd f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B2, A7, f2 + nop + fxcsmadd f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B2, A8, f3 + nop + fxcsmadd f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + LFPDUX B2, BO, INC2 + fxcsmadd f4, B3, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B3, A3, f2 + nop + fxcsmadd f6, B3, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B4, A5, f0 + LFPDUX B3, BO, INC2 + fxcsmadd f4, B4, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B4, A6, f1 + nop + fxcsmadd f5, B4, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B4, A7, f2 + nop + fxcsmadd f6, B4, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B4, A8, f3 + nop + fxcsmadd f7, B4, A8, f7 + LFPDUX A8, AO, INC2 + bdnz+ .L52 + .align 4 + +.L53: + fxcpmadd f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + nop + fxcsmadd f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + nop + fxcsmadd f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B2, A7, f2 + nop + fxcsmadd f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B2, A8, f3 + nop + fxcsmadd f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + fxcsmadd f4, B3, A1, f4 + fxcpmadd f1, B3, A2, f1 + fxcsmadd f5, B3, A2, f5 + + fxcpmadd f2, B3, A3, f2 + fxcsmadd f6, B3, A3, f6 + fxcpmadd f3, B3, A4, f3 + fxcsmadd f7, B3, A4, f7 + + fxcpmadd f0, B4, A5, f0 + fxcsmadd f4, B4, A5, f4 + fxcpmadd f1, B4, A6, f1 + fxcsmadd f5, B4, A6, f5 + + fxcpmadd f2, B4, A7, f2 + fxcsmadd f6, B4, A7, f6 + fxcpmadd f3, B4, A8, f3 + fxcsmadd f7, B4, A8, f7 + .align 4 + +.L54: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 8 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L58 + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + bdz- .L57 + .align 4 + +.L56: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L56 + .align 4 + +.L57: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + + fxcpmadd f2, B1, A3, f2 + fxcsmadd f6, B1, A3, f6 + fxcpmadd f3, B1, A4, f3 + fxcsmadd f7, B1, A4, f7 + .align 4 + +.L58: +#ifndef TRMMKERNEL + LFPDUX A1, CO1, INC2 + LFPDUX B1, CO1, INC2 + LFPDUX A3, CO1, INC2 + LFPDUX A5, CO1, INC2 + + LFPDUX B3, CO2, INC2 + LFPDUX A6, CO2, INC2 + LFPDUX A7, CO2, INC2 + LFPDUX B2, CO2, INC2 + + fxcpmadd f0, AP, f0, A1 + fxcpmadd f1, AP, f1, B1 + fxcpmadd f2, AP, f2, A3 + fxcpmadd f3, AP, f3, A5 + + fxcpmadd f4, AP, f4, B3 + fxcpmadd f5, AP, f5, A6 + STFPDUX f0, CO1, INCM7 + fxcpmadd f6, AP, f6, A7 + STFPDUX f1, CO1, INC2 + fxcpmadd f7, AP, f7, B2 + STFPDUX f2, CO1, INC2 + STFPDUX f3, CO1, INC2 + STFPDUX f4, CO2, INCM7 + + STFPDUX f5, CO2, INC2 + STFPDUX f6, CO2, INC2 + STFPDUX f7, CO2, INC2 +#else + fpmul f0, AP, f0 + fpmul f1, AP, f1 + fpmul f2, AP, f2 + fpmul f3, AP, f3 + + fpmul f4, AP, f4 + fpmul f5, AP, f5 + STFPDUX f0, CO1, INC2 + fpmul f6, AP, f6 + STFPDUX f1, CO1, INC2 + fpmul f7, AP, f7 + STFPDUX f2, CO1, INC2 + STFPDUX f3, CO1, INC2 + STFPDUX f4, CO2, INC2 + + STFPDUX f5, CO2, INC2 + STFPDUX f6, CO2, INC2 + STFPDUX f7, CO2, INC2 +#endif + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -8 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 8 +#endif +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L51 + .align 4 + +.L60: + andi. I, M, 4 + beq .L70 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 +#else + slwi TEMP, KK, 2 + BASE_SHIFT + slwi r0, KK, 1 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + fpmr f2, f0 + srawi. r0, TEMP, 2 + mtspr CTR, r0 + fpmr f3, f0 + ble .L64 +#else + srawi. r0, K, 2 + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + mtspr CTR, r0 + fpmr f3, f0 + ble .L64 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B4, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L63 + .align 4 + +.L62: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A3, f0 + fxcsmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f1, B2, A4, f1 + fxcsmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A5, f0 + fxcsmadd f2, B3, A5, f2 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B3, A6, f1 + fxcsmadd f3, B3, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + fxcpmadd f0, B4, A7, f0 + fxcsmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcpmadd f1, B4, A8, f1 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L62 + .align 4 + +.L63: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + + fxcpmadd f0, B2, A3, f0 + fxcsmadd f2, B2, A3, f2 + fxcpmadd f1, B2, A4, f1 + fxcsmadd f3, B2, A4, f3 + + fxcpmadd f0, B3, A5, f0 + fxcsmadd f2, B3, A5, f2 + fxcpmadd f1, B3, A6, f1 + fxcsmadd f3, B3, A6, f3 + + fxcpmadd f0, B4, A7, f0 + fxcsmadd f2, B4, A7, f2 + fxcpmadd f1, B4, A8, f1 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L64: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L68 + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdz- .L67 + .align 4 + +.L66: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdnz+ .L66 + .align 4 + +.L67: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + .align 4 + +.L68: +#ifndef TRMMKERNEL + LFPDUX A1, CO1, INC2 + LFPDUX A2, CO1, INC2 + LFPDUX A3, CO2, INC2 + LFPDUX A4, CO2, INC2 + + fxcpmadd f0, AP, f0, A1 + fxcpmadd f1, AP, f1, A2 + fxcpmadd f2, AP, f2, A3 + fxcpmadd f3, AP, f3, A4 + + STFPDUX f0, CO1, INCM3 + STFPDUX f1, CO1, INC2 + STFPDUX f2, CO2, INCM3 + STFPDUX f3, CO2, INC2 +#else + fpmul f0, AP, f0 + fpmul f1, AP, f1 + fpmul f2, AP, f2 + fpmul f3, AP, f3 + + STFPDUX f0, CO1, INC2 + STFPDUX f1, CO1, INC2 + STFPDUX f2, CO2, INC2 + STFPDUX f3, CO2, INC2 +#endif + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L70: + andi. I, M, 2 + beq .L80 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 +#else + slwi TEMP, KK, 1 + BASE_SHIFT + slwi r0, KK, 1 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + srawi. r0, TEMP, 3 + fpmr f2, f0 + mtspr CTR, r0 + fpmr f3, f0 + ble .L74 +#else + addi BO, B, - 2 * SIZE + fpmr f1, f0 + + srawi. r0, K, 3 + fpmr f2, f0 + mtspr CTR, r0 + fpmr f3, f0 + ble .L74 +#endif + + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdz- .L73 + .align 4 + +.L72: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + fxcpmadd f2, B2, A2, f2 + fxcsmadd f3, B2, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A3, f0 + fxcsmadd f1, B3, A3, f1 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + fxcpmadd f2, B4, A4, f2 + fxcsmadd f3, B4, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f1, B5, A5, f1 + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + fxcpmadd f2, B6, A6, f2 + fxcsmadd f3, B6, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f1, A9, A7, f1 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + fxcpmadd f2, A10, A8, f2 + fxcsmadd f3, A10, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdnz+ .L72 + .align 4 + +.L73: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + fxcpmadd f2, B2, A2, f2 + fxcsmadd f3, B2, A2, f3 + + fxcpmadd f0, B3, A3, f0 + fxcsmadd f1, B3, A3, f1 + fxcpmadd f2, B4, A4, f2 + fxcsmadd f3, B4, A4, f3 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f1, B5, A5, f1 + fxcpmadd f2, B6, A6, f2 + fxcsmadd f3, B6, A6, f3 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f1, A9, A7, f1 + fxcpmadd f2, A10, A8, f2 + fxcsmadd f3, A10, A8, f3 + .align 4 + +.L74: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP +#else + andi. r0, K, 7 + mtspr CTR, r0 +#endif + ble+ .L78 + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdz- .L77 + .align 4 + +.L76: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L76 + .align 4 + +.L77: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + .align 4 + +.L78: +#ifndef TRMMKERNEL + LFPDX A1, CO1, INC2 + LFPDX B3, CO2, INC2 + + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fxcpmadd f0, AP, f0, A1 + fxcpmadd f1, AP, f1, B3 +#else + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fpmul f0, AP, f0 + fpmul f1, AP, f1 +#endif + + STFPDUX f0, CO1, INC2 + STFPDUX f1, CO2, INC2 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L80: + andi. I, M, 1 + beq .L89 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 +#else + slwi TEMP, KK, 0 + BASE_SHIFT + slwi r0, KK, 1 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L84 +#else + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, K, 3 + mtspr CTR, r0 + ble .L84 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L83 + .align 4 + +.L82: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A1, B2, f1 + LFPDUX B2, BO, INC2 + LFPDUX A1, AO, INC2 + fxcpmadd f2, A2, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A2, B4, f3 + LFPDUX B4, BO, INC2 + LFPDUX A2, AO, INC2 + + fxcpmadd f0, A3, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A3, B2, f1 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + fxcpmadd f2, A4, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A4, B4, f3 + LFPDUX B4, BO, INC2 + LFPDUX A4, AO, INC2 + bdnz+ .L82 + .align 4 + +.L83: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A1, B2, f1 + LFPDUX B2, BO, INC2 + fxcpmadd f2, A2, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A2, B4, f3 + LFPDUX B4, BO, INC2 + + fxcpmadd f0, A3, B1, f0 + fxcsmadd f1, A3, B2, f1 + fxcpmadd f2, A4, B3, f2 + fxcsmadd f3, A4, B4, f3 + .align 4 + +.L84: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP +#else + andi. r0, K, 7 + mtspr CTR, r0 +#endif + ble+ .L88 + + LFDX A1, AO, INC2 + LFPDUX B1, BO, INC2 + add AO, AO, INC + bdz- .L87 + .align 4 + +.L86: + fxcpmadd f0, A1, B1, f0 + LFDX A1, AO, INC2 + LFPDUX B1, BO, INC2 + add AO, AO, INC + bdnz+ .L86 + .align 4 + +.L87: + fxcpmadd f0, A1, B1, f0 + .align 4 + +.L88: +#ifndef TRMMKERNEL + LFDX A1, CO1, INC2 + LFDX A2, CO2, INC2 + + fpadd f0, f0, f1 + fpadd f2, f2, f3 + fsmfp A1, A2 + fpadd f0, f0, f2 + fxcpmadd f0, AP, f0, A1 +#else + fpadd f0, f0, f1 + fpadd f2, f2, f3 + fsmfp A1, A2 + fpadd f0, f0, f2 + fpmul f0, AP, f0 +#endif + + STFDX f0, CO1, INC2 + STFSDX f0, CO2, INC2 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +.L89: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 2 +#endif + + addi B, BO, 2 * SIZE + .align 4 + +.L90: + andi. J, N, 1 + beq .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + mr CO1, C + addi AO, A, -2 * SIZE + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 3 + ble .L100 + .align 4 + +.L91: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 +#else + slwi TEMP, KK, 3 + BASE_SHIFT + slwi r0, KK, 0 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 8 +#else + addi TEMP, KK, 1 +#endif + fpmr f2, f0 + srawi. r0, TEMP, 2 + fpmr f3, f0 + mtspr CTR, r0 + ble .L94 + +#else + srawi. r0, K, 2 + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + mtspr CTR, r0 + ble .L94 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L93 + .align 4 + +.L92: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B1, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B1, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B1, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B1, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B2, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B2, BO, INC2 + bdnz+ .L92 + .align 4 + +.L93: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B1, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B1, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B1, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B1, A8, f3 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B2, A1, f0 + fxcpmadd f1, B2, A2, f1 + fxcpmadd f2, B2, A3, f2 + fxcpmadd f3, B2, A4, f3 + + fxcsmadd f0, B2, A5, f0 + fxcsmadd f1, B2, A6, f1 + fxcsmadd f2, B2, A7, f2 + fxcsmadd f3, B2, A8, f3 + .align 4 + +.L94: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 8 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L98 + + LFDX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + add BO, BO, INC + bdz- .L97 + .align 4 + +.L96: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFDX B1, BO, INC2 + LFPDUX A4, AO, INC2 + add BO, BO, INC + bdnz+ .L96 + .align 4 + +.L97: + fxcpmadd f0, B1, A1, f0 + fxcpmadd f1, B1, A2, f1 + fxcpmadd f2, B1, A3, f2 + fxcpmadd f3, B1, A4, f3 + .align 4 + +.L98: +#ifndef TRMMKERNEL + LFPDUX A1, CO1, INC2 + LFPDUX B1, CO1, INC2 + LFPDUX A3, CO1, INC2 + LFPDUX A5, CO1, INC2 + + fxcpmadd f0, AP, f0, A1 + fxcpmadd f1, AP, f1, B1 + fxcpmadd f2, AP, f2, A3 + fxcpmadd f3, AP, f3, A5 + + STFPDUX f0, CO1, INCM7 + STFPDUX f1, CO1, INC2 + STFPDUX f2, CO1, INC2 + STFPDUX f3, CO1, INC2 +#else + fpmul f0, AP, f0 + fpmul f1, AP, f1 + fpmul f2, AP, f2 + fpmul f3, AP, f3 + + STFPDUX f0, CO1, INC2 + STFPDUX f1, CO1, INC2 + STFPDUX f2, CO1, INC2 + STFPDUX f3, CO1, INC2 +#endif + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -8 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 8 +#endif +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L91 + .align 4 + +.L100: + andi. I, M, 4 + beq .L110 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 +#else + slwi TEMP, KK, 2 + BASE_SHIFT + slwi r0, KK, 0 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + fpmr f1, f0 + addi BO, BO, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L104 +#else + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, K, 3 + mtspr CTR, r0 + ble .L104 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX B4, BO, INC2 + + bdz- .L103 + .align 4 + +.L102: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B3, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B3, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B3, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B3, BO, INC2 + + fxcpmadd f0, B4, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B4, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L102 + .align 4 + +.L103: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + fxcpmadd f1, B3, A2, f1 + fxcsmadd f2, B3, A3, f2 + fxcsmadd f3, B3, A4, f3 + + fxcpmadd f0, B4, A5, f0 + fxcpmadd f1, B4, A6, f1 + fxcsmadd f2, B4, A7, f2 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L104: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP +#else + andi. r0, K, 7 + mtspr CTR, r0 +#endif + ble+ .L108 + + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + LFPDUX A2, AO, INC2 + add BO, BO, INC + bdz- .L107 + .align 4 + +.L106: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFDX B1, BO, INC2 + LFPDUX A2, AO, INC2 + add BO, BO, INC + bdnz+ .L106 + .align 4 + +.L107: + fxcpmadd f0, B1, A1, f0 + fxcpmadd f1, B1, A2, f1 + .align 4 + +.L108: +#ifndef TRMMKERNEL + LFPDUX A1, CO1, INC2 + LFPDUX B1, CO1, INC2 + + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fxcpmadd f0, AP, f0, A1 + fxcpmadd f1, AP, f1, B1 + + STFPDUX f0, CO1, INCM3 + STFPDUX f1, CO1, INC2 +#else + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fpmul f0, AP, f0 + fpmul f1, AP, f1 + + STFPDUX f0, CO1, INC2 + STFPDUX f1, CO1, INC2 +#endif + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L110: + andi. I, M, 2 + beq .L120 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 +#else + slwi TEMP, KK, 1 + BASE_SHIFT + slwi r0, KK, 0 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + fpmr f1, f0 + addi BO, BO, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L114 +#else + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, K, 3 + mtspr CTR, r0 + ble .L114 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L113 + .align 4 + +.L112: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcsmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + fxcpmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + fxcpmadd f0, B3, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B3, A6, f1 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + fxcpmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L112 + .align 4 + +.L113: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A2, f1 + fxcpmadd f2, B2, A3, f2 + fxcsmadd f3, B2, A4, f3 + fxcpmadd f0, B3, A5, f0 + fxcsmadd f1, B3, A6, f1 + fxcpmadd f2, B4, A7, f2 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L114: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP +#else + andi. r0, K, 7 + mtspr CTR, r0 +#endif + ble+ .L118 + + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + add BO, BO, INC + bdz- .L117 + .align 4 + +.L116: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + add BO, BO, INC + bdnz+ .L116 + .align 4 + +.L117: + fxcpmadd f0, B1, A1, f0 + .align 4 + +.L118: +#ifndef TRMMKERNEL + LFPDX A1, CO1, INC2 + + fpadd f0, f0, f1 + fpadd f2, f3, f2 + fpadd f0, f0, f2 + fxcpmadd f1, AP, f0, A1 + + li r0, FZERO + lfpsx f0, SP, r0 + + STFPDUX f1, CO1, INC2 +#else + fpadd f0, f0, f1 + fpadd f2, f3, f2 + fpadd f0, f0, f2 + fpmul f1, AP, f0 + + li r0, FZERO + lfpsx f0, SP, r0 + + STFPDUX f1, CO1, INC2 +#endif + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +.L120: + andi. I, M, 1 + beq .L999 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 +#else + slwi TEMP, KK, 0 + BASE_SHIFT + slwi r0, KK, 0 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + fpmr f1, f0 + addi BO, BO, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L124 +#else + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, K, 3 + mtspr CTR, r0 + ble .L124 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L123 + .align 4 + +.L122: + fpmadd f0, A1, B1, f0 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + fpmadd f1, A2, B2, f1 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + fpmadd f2, A3, B3, f2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + fpmadd f3, A4, B4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L122 + .align 4 + +.L123: + fpmadd f0, A1, B1, f0 + fpmadd f1, A2, B2, f1 + fpmadd f2, A3, B3, f2 + fpmadd f3, A4, B4, f3 + .align 4 + +.L124: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP +#else + andi. r0, K, 7 + mtspr CTR, r0 +#endif + ble+ .L128 + + LFDX A1, AO, INC2 + LFDX B1, BO, INC2 + add AO, AO, INC + add BO, BO, INC + bdz- .L127 + .align 4 + +.L126: + fmadd f0, A1, B1, f0 + LFDX A1, AO, INC2 + LFDX B1, BO, INC2 + add AO, AO, INC + add BO, BO, INC + bdnz+ .L126 + .align 4 + +.L127: + fmadd f0, A1, B1, f0 + .align 4 + +.L128: +#ifndef TRMMKERNEL + LFDX A1, CO1, INC2 + fpadd f0, f0, f1 + fpadd f2, f2, f3 + fpadd f0, f0, f2 + fsmtp f1, f0 + fadd f0, f0, f1 + fmadd f0, AP, f0, A1 +#else + fpadd f0, f0, f1 + fpadd f2, f2, f3 + fpadd f0, f0, f2 + fsmtp f1, f0 + fadd f0, f0, f1 + fpmul f0, AP, f0 +#endif + STFDUX f0, CO1, INC2 + .align 4 + +.L999: + addi SP, SP, 12 + + lwzu r14, 4(SP) + lwzu r15, 4(SP) + + lwzu r16, 4(SP) + lwzu r17, 4(SP) + lwzu r18, 4(SP) + lwzu r19, 4(SP) + + lwzu r20, 4(SP) + lwzu r21, 4(SP) + lwzu r22, 4(SP) + lwzu r23, 4(SP) + + lwzu r24, 4(SP) + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f31, SP, r0 + lfpdux f30, SP, r0 + lfpdux f29, SP, r0 + lfpdux f28, SP, r0 + lfpdux f27, SP, r0 + lfpdux f26, SP, r0 + lfpdux f25, SP, r0 + lfpdux f24, SP, r0 + lfpdux f23, SP, r0 + lfpdux f22, SP, r0 + lfpdux f21, SP, r0 + lfpdux f20, SP, r0 + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + .align 4 + +.L1000: + li INCM1, -1 * SIZE + li INCM3, -3 * SIZE + li INCM5, -5 * SIZE + li INCM7, -7 * SIZE + + addi C, C, - 1 * SIZE + srawi. J, N, 2 + ble .L1050 + .align 4 + +.L1010: + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + add C, CO4, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + addi AO, A, -4 * SIZE + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 3 + ble .L1020 + .align 4 + +.L1011: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 +#else + slwi TEMP, KK, 3 + BASE_SHIFT + slwi r0, KK, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 +#endif + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 8 +#else + addi TEMP, KK, 4 +#endif + srawi. TEMP, TEMP, 2 + fpmr f1, f0 + mtspr CTR, TEMP + ble .L1014 + +#else + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, K, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L1014 +#endif + + LFPDUX A1, AO, INC4 + fpmr f5, f0 + LFPDUX A3, AO, INC4 + fpmr f9, f0 + LFPDUX B1, BO, INC4 + fpmr f13, f0 + + LFPDUX A5, AO, INC4 + fpmr f2, f0 + LFPDUX A6, AO, INC4 + fpmr f6, f0 + LFPDUX B3, BO, INC4 + fpmr f10, f0 + LFPDUX A7, AO, INC4 + fpmr f14, f0 + + LFPDUX A8, AO, INC4 + fpmr f3, f0 + LFPDUX B5, BO, INC4 + fpmr f7, f0 + LFPDUX A9, AO, INC4 + fpmr f11, f0 + LFPDUX A2, AO2, INC4 + fpmr f15, f0 + LFPDUX B2, BO2, INC4 + bdz- .L1013 + .align 4 + +.L1012: + +## 1 ## + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + nop + fxcpmadd f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + nop + fxcpmadd f10, B2, A3, f10 + nop + fxcsmadd f14, B2, A3, f14 + nop + + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + LFPDUX A1, AO, INC4 + fxcsmadd f15, B2, A4, f15 + nop + +## 2 ## + + fxcpmadd f0, B3, A5, f0 + nop + fxcsmadd f4, B3, A5, f4 + nop + fxcpmadd f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A5, f12 + LFPDUX B1, BO, INC4 + + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + LFPDUX A3, AO, INC4 + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B3, A6, f2 + nop + fxcsmadd f6, B3, A6, f6 + nop + fxcpmadd f10, B4, A6, f10 + nop + fxcsmadd f14, B4, A6, f14 + nop + + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B4, A4, f11 + LFPDUX A5, AO, INC4 + fxcsmadd f15, B4, A4, f15 + nop + +## 3 ## + + fxcpmadd f0, B5, A7, f0 + nop + fxcsmadd f4, B5, A7, f4 + nop + fxcpmadd f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A7, f12 + LFPDUX B3, BO, INC4 + + fxcpmadd f1, B5, A2, f1 + nop + fxcsmadd f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A6, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B5, A8, f2 + nop + fxcsmadd f6, B5, A8, f6 + nop + fxcpmadd f10, B2, A8, f10 + nop + fxcsmadd f14, B2, A8, f14 + nop + + fxcpmadd f3, B5, A4, f3 + nop + fxcsmadd f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + LFPDUX A7, AO, INC4 + fxcsmadd f15, B2, A4, f15 + nop + +## 4 ## + fxcpmadd f0, B6, A9, f0 + nop + fxcsmadd f4, B6, A9, f4 + nop + fxcpmadd f8, B4, A9, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A9, f12 + LFPDUX B5, BO, INC4 + + fxcpmadd f1, B6, A2, f1 + nop + fxcsmadd f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + LFPDUX A8, AO, INC4 + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B6, A10, f2 + nop + fxcsmadd f6, B6, A10, f6 + nop + fxcpmadd f10, B4, A10, f10 + nop + fxcsmadd f14, B4, A10, f14 + nop + + fxcpmadd f3, B6, A4, f3 + LFPDUX A2, AO2, INC4 + fxcsmadd f7, B6, A4, f7 + LFPDUX A9, AO, INC4 + fxcpmadd f11, B4, A4, f11 + nop + fxcsmadd f15, B4, A4, f15 + bdnz+ .L1012 + .align 4 + +.L1013: +## 1 ## + + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + nop + fxcpmadd f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + nop + fxcpmadd f10, B2, A3, f10 + nop + fxcsmadd f14, B2, A3, f14 + nop + + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC +#else + nop +#endif + fxcsmadd f15, B2, A4, f15 + nop + +## 2 ## + + fxcpmadd f0, B3, A5, f0 + nop + fxcsmadd f4, B3, A5, f4 + nop + fxcpmadd f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A5, f12 +#ifndef TRMMKERNEL + LFDUX B1, CO1, INC2 +#else + nop +#endif + + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 +#ifndef TRMMKERNEL + LFDUX A3, CO1, INC2 +#else + nop +#endif + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B3, A6, f2 + nop + fxcsmadd f6, B3, A6, f6 + nop + fxcpmadd f10, B4, A6, f10 + nop + fxcsmadd f14, B4, A6, f14 + nop + + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B4, A4, f11 +#ifndef TRMMKERNEL + LFDUX A5, CO1, INC2 +#else + nop +#endif + fxcsmadd f15, B4, A4, f15 + nop + +## 3 ## + + fxcpmadd f0, B5, A7, f0 + nop + fxcsmadd f4, B5, A7, f4 + nop + fxcpmadd f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A7, f12 +#ifndef TRMMKERNEL + LFSDUX A1, CO1, INCM5 +#else + nop +#endif + + fxcpmadd f1, B5, A2, f1 + nop + fxcsmadd f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 +#ifndef TRMMKERNEL + LFSDUX B1, CO1, INC2 +#else + nop +#endif + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B5, A8, f2 + nop + fxcsmadd f6, B5, A8, f6 + nop + fxcpmadd f10, B2, A8, f10 + nop + fxcsmadd f14, B2, A8, f14 + nop + + fxcpmadd f3, B5, A4, f3 + nop + fxcsmadd f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 +#ifndef TRMMKERNEL + LFSDUX A3, CO1, INC2 +#else + nop +#endif + fxcsmadd f15, B2, A4, f15 + nop + +## 4 ## + + fxcpmadd f0, B6, A9, f0 + nop + fxcsmadd f4, B6, A9, f4 + nop + fxcpmadd f8, B4, A9, f8 +#ifndef TRMMKERNEL + LFSDUX A5, CO1, INC2 +#else + nop +#endif + fxcsmadd f12, B4, A9, f12 +#ifndef TRMMKERNEL + LFDUX B3, CO2, INC +#else + nop +#endif + + fxcpmadd f1, B6, A2, f1 + nop + fxcsmadd f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 +#ifndef TRMMKERNEL + LFDUX A6, CO2, INC2 +#else + nop +#endif + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B6, A10, f2 + nop + fxcsmadd f6, B6, A10, f6 + nop + fxcpmadd f10, B4, A10, f10 + nop + fxcsmadd f14, B4, A10, f14 +#ifndef TRMMKERNEL + LFDUX A7, CO2, INC2 +#else + nop +#endif + + fxcpmadd f3, B6, A4, f3 + nop + fxcsmadd f7, B6, A4, f7 + nop + fxcpmadd f11, B4, A4, f11 + nop + fxcsmadd f15, B4, A4, f15 +#ifndef TRMMKERNEL + LFDUX B2, CO2, INC2 +#else + nop +#endif + .align 4 + +.L1014: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 8 +#else + addi TEMP, KK, 4 +#endif + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L1018 + + cmpwi cr0, TEMP, 3 + bgt+ .L1015 +#else + andi. r0, K, 3 + mtspr CTR, r0 + ble+ .L1018 + + cmpwi cr0, K, 3 + bgt+ .L1015 +#endif + +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + fpmr f5, f0 + LFDUX B1, CO1, INC2 + fpmr f9, f0 + LFDUX A3, CO1, INC2 + fpmr f13, f0 + LFDUX A5, CO1, INC2 + fpmr f2, f0 + + LFSDUX A1, CO1, INCM5 + fpmr f6, f0 + LFSDUX B1, CO1, INC2 + fpmr f10, f0 + LFSDUX A3, CO1, INC2 + fpmr f14, f0 + LFSDUX A5, CO1, INC2 + fpmr f3, f0 + + LFDUX B3, CO2, INC + fpmr f7, f0 + LFDUX A6, CO2, INC2 + fpmr f11, f0 + LFDUX A7, CO2, INC2 + fpmr f15, f0 + LFDUX B2, CO2, INC2 +#else + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + nop +#endif + .align 4 + +.L1015: + LFPDUX A2, AO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A10, BO, INC4 + LFPDUX B4, BO2, INC4 + bdz- .L1017 + .align 4 + +.L1016: + fxcpmadd f0, A10, A2, f0 + fxcsmadd f4, A10, A2, f4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + fxcpmadd f1, A10, A4, f1 + fxcsmadd f5, A10, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + fxcpmadd f2, A10, A2, f2 + fxcsmadd f6, A10, A2, f6 + fxcpmadd f10, B4, A2, f10 + fxcsmadd f14, B4, A2, f14 + LFPDUX A2, AO, INC4 + + fxcpmadd f3, A10, A4, f3 + fxcsmadd f7, A10, A4, f7 + LFPDUX A10, BO, INC4 + fxcpmadd f11, B4, A4, f11 + fxcsmadd f15, B4, A4, f15 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + bdnz+ .L1016 + .align 4 + +.L1017: + fxcpmadd f0, A10, A2, f0 + fxcsmadd f4, A10, A2, f4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + fxcpmadd f1, A10, A4, f1 + fxcsmadd f5, A10, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + fxcpmadd f2, A10, A2, f2 + fxcsmadd f6, A10, A2, f6 + fxcpmadd f10, B4, A2, f10 + fxcsmadd f14, B4, A2, f14 + + fxcpmadd f3, A10, A4, f3 + fxcsmadd f7, A10, A4, f7 + fxcpmadd f11, B4, A4, f11 + fxcsmadd f15, B4, A4, f15 + .align 4 + +.L1018: +#ifndef TRMMKERNEL + LFSDUX B3, CO2, INCM5 + LFSDUX A6, CO2, INC2 + LFSDUX A7, CO2, INC2 + LFSDUX B2, CO2, INC2 + + LFDUX B5, CO3, INC + LFDUX A8, CO3, INC2 + LFDUX A9, CO3, INC2 + LFDUX B4, CO3, INC2 + + LFSDUX B5, CO3, INCM5 + LFSDUX A8, CO3, INC2 + LFSDUX A9, CO3, INC2 + LFSDUX B4, CO3, INC2 + + LFDUX A2, CO4, INC + LFDUX A4, CO4, INC2 + + fxcpmadd f0, AP, f0, A1 + LFDUX A10, CO4, INC2 + LFDUX A1, CO4, INC2 + + fxcpmadd f1, AP, f1, B1 + LFSDUX A2, CO4, INCM5 + LFSDUX A4, CO4, INC2 + + fxcpmadd f2, AP, f2, A3 + LFSDUX A10, CO4, INC2 + LFSDUX A1, CO4, INC2 + + fxcpmadd f3, AP, f3, A5 + STFDUX f0, CO1, INCM7 + STFSDUX f0, CO1, INC + + fxcpmadd f4, AP, f4, B3 + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + fxcpmadd f5, AP, f5, A6 + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + + fxcpmadd f6, AP, f6, A7 + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + + fxcpmadd f7, AP, f7, B2 + STFDUX f4, CO2, INCM7 + STFSDUX f4, CO2, INC + + fxcpmadd f8, AP, f8, B5 + STFDUX f5, CO2, INC + STFSDUX f5, CO2, INC + + fxcpmadd f9, AP, f9, A8 + STFDUX f6, CO2, INC + STFSDUX f6, CO2, INC + + fxcpmadd f10, AP, f10, A9 + STFDUX f7, CO2, INC + STFSDUX f7, CO2, INC + + fxcpmadd f11, AP, f11, B4 + STFDUX f8, CO3, INCM7 + STFSDUX f8, CO3, INC + + fxcpmadd f12, AP, f12, A2 + STFDUX f9, CO3, INC + STFSDUX f9, CO3, INC + + fxcpmadd f13, AP, f13, A4 + STFDUX f10, CO3, INC + STFSDUX f10, CO3, INC + + fxcpmadd f14, AP, f14, A10 + STFDUX f11, CO3, INC + STFSDUX f11, CO3, INC + + fxcpmadd f15, AP, f15, A1 + STFDUX f12, CO4, INCM7 +#else + fpmul f0, AP, f0 + fpmul f1, AP, f1 + fpmul f2, AP, f2 + fpmul f3, AP, f3 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + + fpmul f4, AP, f4 + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + fpmul f5, AP, f5 + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + + fpmul f6, AP, f6 + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + + fpmul f7, AP, f7 + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + + fpmul f8, AP, f8 + STFDUX f5, CO2, INC + STFSDUX f5, CO2, INC + + fpmul f9, AP, f9 + STFDUX f6, CO2, INC + STFSDUX f6, CO2, INC + + fpmul f10, AP, f10 + STFDUX f7, CO2, INC + STFSDUX f7, CO2, INC + + fpmul f11, AP, f11 + STFDUX f8, CO3, INC + STFSDUX f8, CO3, INC + + fpmul f12, AP, f12 + STFDUX f9, CO3, INC + STFSDUX f9, CO3, INC + + fpmul f13, AP, f13 + STFDUX f10, CO3, INC + STFSDUX f10, CO3, INC + + fpmul f14, AP, f14 + STFDUX f11, CO3, INC + STFSDUX f11, CO3, INC + + fpmul f15, AP, f15 + STFDUX f12, CO4, INC +#endif + + STFSDUX f12, CO4, INC + STFDUX f13, CO4, INC + STFSDUX f13, CO4, INC + STFDUX f14, CO4, INC + STFSDUX f14, CO4, INC + STFDUX f15, CO4, INC + STFSDUX f15, CO4, INC + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -8 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 8 +#endif +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L1011 + .align 4 + +.L1020: + andi. I, M, 4 + beq .L1030 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 +#else + slwi TEMP, KK, 2 + BASE_SHIFT + slwi r0, KK, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 4 +#endif + + srawi. TEMP, TEMP, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, TEMP + fpmr f13, f0 + ble .L1024 +#else + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, K, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L1024 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX B3, BO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A5, AO, INC4 + LFPDUX B5, BO, INC4 + LFPDUX A6, AO2, INC4 + LFPDUX B6, BO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A9, BO, INC4 + LFPDUX A10, BO2, INC4 + bdz- .L1023 + .align 4 + +.L1022: + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + fxcpmadd f8, B2, A1, f8 + nop + fxcsmadd f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + fxcpmadd f9, B2, A2, f9 + nop + fxcsmadd f13, B2, A2, f13 + LFPDUX B2, BO2, INC4 + + fxcpmadd f0, B3, A3, f0 + nop + fxcsmadd f4, B3, A3, f4 + LFPDUX A2, AO2, INC4 + fxcpmadd f8, B4, A3, f8 + nop + fxcsmadd f12, B4, A3, f12 + LFPDUX A3, AO, INC4 + + fxcpmadd f1, B3, A4, f1 + nop + fxcsmadd f5, B3, A4, f5 + LFPDUX B3, BO, INC4 + fxcpmadd f9, B4, A4, f9 + nop + fxcsmadd f13, B4, A4, f13 + LFPDUX B4, BO2, INC4 + + fxcpmadd f0, B5, A5, f0 + nop + fxcsmadd f4, B5, A5, f4 + LFPDUX A4, AO2, INC4 + fxcpmadd f8, B6, A5, f8 + nop + fxcsmadd f12, B6, A5, f12 + LFPDUX A5, AO, INC4 + + fxcpmadd f1, B5, A6, f1 + nop + fxcsmadd f5, B5, A6, f5 + LFPDUX B5, BO, INC4 + fxcpmadd f9, B6, A6, f9 + nop + fxcsmadd f13, B6, A6, f13 + LFPDUX B6, BO2, INC4 + + fxcpmadd f0, A9, A7, f0 + nop + fxcsmadd f4, A9, A7, f4 + LFPDUX A6, AO2, INC4 + fxcpmadd f8, A10, A7, f8 + nop + fxcsmadd f12, A10, A7, f12 + LFPDUX A7, AO, INC4 + + fxcpmadd f1, A9, A8, f1 + nop + fxcsmadd f5, A9, A8, f5 + LFPDUX A9, BO, INC4 + fxcpmadd f9, A10, A8, f9 + nop + fxcsmadd f13, A10, A8, f13 + LFPDUX A10, BO2, INC4 + bdnz+ .L1022 + .align 4 + +.L1023: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + + fxcpmadd f0, B3, A3, f0 + fxcsmadd f4, B3, A3, f4 + fxcpmadd f8, B4, A3, f8 + fxcsmadd f12, B4, A3, f12 + + fxcpmadd f1, B3, A4, f1 + fxcsmadd f5, B3, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, A4, f13 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f4, B5, A5, f4 + fxcpmadd f8, B6, A5, f8 + fxcsmadd f12, B6, A5, f12 + + fxcpmadd f1, B5, A6, f1 + fxcsmadd f5, B5, A6, f5 + fxcpmadd f9, B6, A6, f9 + fxcsmadd f13, B6, A6, f13 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f4, A9, A7, f4 + fxcpmadd f8, A10, A7, f8 + fxcsmadd f12, A10, A7, f12 + + fxcpmadd f1, A9, A8, f1 + fxcsmadd f5, A9, A8, f5 + fxcpmadd f9, A10, A8, f9 + fxcsmadd f13, A10, A8, f13 + .align 4 + +.L1024: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L1028 + + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + bdz- .L1027 + .align 4 + +.L1026: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + bdnz+ .L1026 + .align 4 + +.L1027: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + .align 4 + +.L1028: +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + LFDUX B1, CO1, INC2 + LFDUX B3, CO2, INC + LFDUX A6, CO2, INC2 + + LFSDUX A1, CO1, INCM1 + LFSDUX B1, CO1, INC2 + LFSDUX B3, CO2, INCM1 + LFSDUX A6, CO2, INC2 + + LFDUX B5, CO3, INC + LFDUX A8, CO3, INC2 + LFDUX A2, CO4, INC + LFDUX A4, CO4, INC2 + + fxcpmadd f0, AP, f0, A1 + LFSDUX B5, CO3, INCM1 + LFSDUX A8, CO3, INC2 + + fxcpmadd f1, AP, f1, B1 + LFSDUX A2, CO4, INCM1 + LFSDUX A4, CO4, INC2 + + fxcpmadd f4, AP, f4, B3 + STFDUX f0, CO1, INCM3 + STFSDUX f0, CO1, INC + + fxcpmadd f5, AP, f5, A6 + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + fxcpmadd f8, AP, f8, B5 + STFDUX f4, CO2, INCM3 + STFSDUX f4, CO2, INC + + fxcpmadd f9, AP, f9, A8 + STFDUX f5, CO2, INC + STFSDUX f5, CO2, INC + + fxcpmadd f12, AP, f12, A2 + STFDUX f8, CO3, INCM3 + STFSDUX f8, CO3, INC + + fxcpmadd f13, AP, f13, A4 + STFDUX f9, CO3, INC + STFSDUX f9, CO3, INC + + STFDUX f12, CO4, INCM3 + STFSDUX f12, CO4, INC + + STFDUX f13, CO4, INC + STFSDUX f13, CO4, INC +#else + fpmul f0, AP, f0 + fpmul f1, AP, f1 + + fpmul f4, AP, f4 + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + + fpmul f5, AP, f5 + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + fpmul f8, AP, f8 + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + + fpmul f9, AP, f9 + STFDUX f5, CO2, INC + STFSDUX f5, CO2, INC + + fpmul f12, AP, f12 + STFDUX f8, CO3, INC + STFSDUX f8, CO3, INC + + fpmul f13, AP, f13 + STFDUX f9, CO3, INC + STFSDUX f9, CO3, INC + + STFDUX f12, CO4, INC + STFSDUX f12, CO4, INC + + STFDUX f13, CO4, INC + STFSDUX f13, CO4, INC +#endif + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L1030: + andi. I, M, 2 + beq .L1040 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 +#else + slwi TEMP, KK, 1 + BASE_SHIFT + slwi r0, KK, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, BO, - 4 * SIZE + fpmr f2, f0 + addi BO2, BO, 2 * SIZE + fpmr f3, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L1034 + +#else + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 + + srawi. r0, K, 2 + mtspr CTR, r0 + ble .L1034 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L1033 + .align 4 + +.L1032: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + LFPDUX B1, BO, INC4 + fxcpmadd f2, B2, A1, f2 + fxcsmadd f3, B2, A1, f3 + LFPDUX B2, BO2, INC4 + LFPDUX A1, AO, INC4 + + fxcpmadd f0, B3, A2, f0 + fxcsmadd f1, B3, A2, f1 + LFPDUX B3, BO, INC4 + fxcpmadd f2, B4, A2, f2 + fxcsmadd f3, B4, A2, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A2, AO2, INC4 + + fxcpmadd f0, A5, A3, f0 + fxcsmadd f1, A5, A3, f1 + LFPDUX A5, BO, INC4 + fxcpmadd f2, A6, A3, f2 + fxcsmadd f3, A6, A3, f3 + LFPDUX A6, BO2, INC4 + LFPDUX A3, AO, INC4 + + fxcpmadd f0, A7, A4, f0 + fxcsmadd f1, A7, A4, f1 + LFPDUX A7, BO, INC4 + fxcpmadd f2, A8, A4, f2 + fxcsmadd f3, A8, A4, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L1032 + .align 4 + +.L1033: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + fxcpmadd f2, B2, A1, f2 + fxcsmadd f3, B2, A1, f3 + + fxcpmadd f0, B3, A2, f0 + fxcsmadd f1, B3, A2, f1 + fxcpmadd f2, B4, A2, f2 + fxcsmadd f3, B4, A2, f3 + + fxcpmadd f0, A5, A3, f0 + fxcsmadd f1, A5, A3, f1 + fxcpmadd f2, A6, A3, f2 + fxcsmadd f3, A6, A3, f3 + + fxcpmadd f0, A7, A4, f0 + fxcsmadd f1, A7, A4, f1 + fxcpmadd f2, A8, A4, f2 + fxcsmadd f3, A8, A4, f3 + .align 4 + +.L1034: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L1038 + + LFPDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdz- .L1037 + .align 4 + +.L1036: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + LFPDUX B1, BO, INC4 + fxcpmadd f2, B2, A1, f2 + fxcsmadd f3, B2, A1, f3 + LFPDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdnz+ .L1036 + .align 4 + +.L1037: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + fxcpmadd f2, B2, A1, f2 + fxcsmadd f3, B2, A1, f3 + .align 4 + +.L1038: +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + LFDUX A2, CO2, INC + LFDUX A3, CO3, INC + LFDUX A4, CO4, INC + + LFSDUX A1, CO1, INC + LFSDUX A2, CO2, INC + LFSDUX A3, CO3, INC + LFSDUX A4, CO4, INC + + fxcpmadd f0, AP, f0, A1 + fxcpmadd f1, AP, f1, A2 + fxcpmadd f2, AP, f2, A3 + fxcpmadd f3, AP, f3, A4 + + STFDUX f0, CO1, INCM1 + STFSDUX f0, CO1, INC + + STFDUX f1, CO2, INCM1 + STFSDUX f1, CO2, INC + + STFDUX f2, CO3, INCM1 + STFSDUX f2, CO3, INC + + STFDUX f3, CO4, INCM1 + STFSDUX f3, CO4, INC +#else + fpmul f0, AP, f0 + fpmul f1, AP, f1 + fpmul f2, AP, f2 + fpmul f3, AP, f3 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + + STFDUX f1, CO2, INC + STFSDUX f1, CO2, INC + + STFDUX f2, CO3, INC + STFSDUX f2, CO3, INC + + STFDUX f3, CO4, INC + STFSDUX f3, CO4, INC +#endif + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L1040: + andi. I, M, 1 + beq .L1049 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 +#else + slwi TEMP, KK, 0 + BASE_SHIFT + slwi r0, KK, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, BO, - 4 * SIZE + fpmr f2, f0 + addi BO2, BO, 2 * SIZE + fpmr f3, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L1044 + +#else + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 + + srawi. r0, K, 3 + mtspr CTR, r0 + ble .L1044 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L1043 + .align 4 + +.L1042: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A1, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A1, B4, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A1, AO, INC4 + + fxcpmadd f0, A2, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A2, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A2, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A2, A8, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A2, AO2, INC4 + + fxcpmadd f0, A3, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A3, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A3, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A3, B4, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A3, AO, INC4 + + fxcpmadd f0, A4, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A4, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A4, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A4, A8, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L1042 + .align 4 + +.L1043: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A1, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A1, B4, f3 + LFPDUX B4, BO2, INC4 + + fxcpmadd f0, A2, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A2, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A2, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A2, A8, f3 + LFPDUX A8, BO2, INC4 + + fxcpmadd f0, A3, B1, f0 + fxcpmadd f1, A3, B2, f1 + fxcsmadd f2, A3, B3, f2 + fxcsmadd f3, A3, B4, f3 + + fxcpmadd f0, A4, A5, f0 + fxcpmadd f1, A4, A6, f1 + fxcsmadd f2, A4, A7, f2 + fxcsmadd f3, A4, A8, f3 + .align 4 + +.L1044: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP +#else + andi. r0, K, 7 + mtspr CTR, r0 +#endif + ble+ .L1048 + + LFDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC + bdz- .L1047 + .align 4 + +.L1046: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC + bdnz+ .L1046 + .align 4 + +.L1047: + fxcpmadd f0, A1, B1, f0 + fxcpmadd f1, A1, B2, f1 + .align 4 + +.L1048: +#ifndef TRMMKERNEL + LFDX A1, CO1, INC + LFDX B3, CO3, INC + LFSDX A1, CO2, INC + LFSDX B3, CO4, INC + + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fxcpmadd f0, AP, f0, A1 + fxcpmadd f1, AP, f1, B3 +#else + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fpmul f0, AP, f0 + fpmul f1, AP, f1 +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO2, INC + STFDUX f1, CO3, INC + STFSDUX f1, CO4, INC + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +.L1049: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 4 +#endif + + addi B, BO, 4 * SIZE + + addic. J, J, -1 + bgt+ .L1010 + .align 4 + +.L1050: + andi. J, N, 2 + beq .L1090 + + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + addi AO, A, -2 * SIZE + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 3 + ble .L1060 + .align 4 + +.L1051: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + fpmr f4, f0 + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 +#else + slwi TEMP, KK, 3 + BASE_SHIFT + slwi r0, KK, 1 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + fpmr f4, f0 + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 +#endif + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 8 +#else + addi TEMP, KK, 2 +#endif + srawi. r0, TEMP, 2 + fpmr f3, f0 + mtspr CTR, r0 + fpmr f7, f0 + ble .L1054 +#else + fpmr f4, f0 + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + + srawi. r0, K, 2 + fpmr f3, f0 + mtspr CTR, r0 + fpmr f7, f0 + ble .L1054 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L1053 + .align 4 + +.L1052: + fxcpmadd f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + nop + fxcsmadd f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B2, A7, f2 + nop + fxcsmadd f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B2, A8, f3 + nop + fxcsmadd f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + LFPDUX B2, BO, INC2 + fxcsmadd f4, B3, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B3, A3, f2 + nop + fxcsmadd f6, B3, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B4, A5, f0 + LFPDUX B3, BO, INC2 + fxcsmadd f4, B4, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B4, A6, f1 + nop + fxcsmadd f5, B4, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B4, A7, f2 + nop + fxcsmadd f6, B4, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B4, A8, f3 + nop + fxcsmadd f7, B4, A8, f7 + LFPDUX A8, AO, INC2 + bdnz+ .L1052 + .align 4 + +.L1053: + fxcpmadd f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + nop + fxcsmadd f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + nop + fxcsmadd f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B2, A7, f2 + nop + fxcsmadd f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B2, A8, f3 + nop + fxcsmadd f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + fxcsmadd f4, B3, A1, f4 + fxcpmadd f1, B3, A2, f1 + fxcsmadd f5, B3, A2, f5 + + fxcpmadd f2, B3, A3, f2 + fxcsmadd f6, B3, A3, f6 + fxcpmadd f3, B3, A4, f3 + fxcsmadd f7, B3, A4, f7 + + fxcpmadd f0, B4, A5, f0 + fxcsmadd f4, B4, A5, f4 + fxcpmadd f1, B4, A6, f1 + fxcsmadd f5, B4, A6, f5 + + fxcpmadd f2, B4, A7, f2 + fxcsmadd f6, B4, A7, f6 + fxcpmadd f3, B4, A8, f3 + fxcsmadd f7, B4, A8, f7 + .align 4 + +.L1054: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 8 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L1058 + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + bdz- .L1057 + .align 4 + +.L1056: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L1056 + .align 4 + +.L1057: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + + fxcpmadd f2, B1, A3, f2 + fxcsmadd f6, B1, A3, f6 + fxcpmadd f3, B1, A4, f3 + fxcsmadd f7, B1, A4, f7 + .align 4 + +.L1058: +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + LFDUX B1, CO1, INC2 + LFDUX A3, CO1, INC2 + LFDUX A5, CO1, INC2 + + LFSDUX A1, CO1, INCM5 + LFSDUX B1, CO1, INC2 + LFSDUX A3, CO1, INC2 + LFSDUX A5, CO1, INC2 + + LFDUX B3, CO2, INC + LFDUX A6, CO2, INC2 + LFDUX A7, CO2, INC2 + LFDUX B2, CO2, INC2 + + fxcpmadd f0, AP, f0, A1 + LFSDUX B3, CO2, INCM5 + LFSDUX A6, CO2, INC2 + fxcpmadd f1, AP, f1, B1 + LFSDUX A7, CO2, INC2 + LFSDUX B2, CO2, INC2 + + fxcpmadd f2, AP, f2, A3 + STFDUX f0, CO1, INCM7 + STFSDUX f0, CO1, INC + + fxcpmadd f3, AP, f3, A5 + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + fxcpmadd f4, AP, f4, B3 + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + + fxcpmadd f5, AP, f5, A6 + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + + fxcpmadd f6, AP, f6, A7 + STFDUX f4, CO2, INCM7 + STFSDUX f4, CO2, INC + + fxcpmadd f7, AP, f7, B2 + STFDUX f5, CO2, INC + STFSDUX f5, CO2, INC + + STFDUX f6, CO2, INC + STFSDUX f6, CO2, INC + + STFDUX f7, CO2, INC + STFSDUX f7, CO2, INC +#else + fpmul f0, AP, f0 + fpmul f1, AP, f1 + + fpmul f2, AP, f2 + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + + fpmul f3, AP, f3 + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + fpmul f4, AP, f4 + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + + fpmul f5, AP, f5 + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + + fpmul f6, AP, f6 + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + + fpmul f7, AP, f7 + STFDUX f5, CO2, INC + STFSDUX f5, CO2, INC + + STFDUX f6, CO2, INC + STFSDUX f6, CO2, INC + + STFDUX f7, CO2, INC + STFSDUX f7, CO2, INC +#endif + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -8 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 8 +#endif +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L1051 + .align 4 + +.L1060: + andi. I, M, 4 + beq .L1070 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 +#else + slwi TEMP, KK, 2 + BASE_SHIFT + slwi r0, KK, 1 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + fpmr f2, f0 + srawi. r0, TEMP, 2 + mtspr CTR, r0 + fpmr f3, f0 + ble .L1064 +#else + srawi. r0, K, 2 + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + mtspr CTR, r0 + fpmr f3, f0 + ble .L1064 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B4, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L1063 + .align 4 + +.L1062: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A3, f0 + fxcsmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f1, B2, A4, f1 + fxcsmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A5, f0 + fxcsmadd f2, B3, A5, f2 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B3, A6, f1 + fxcsmadd f3, B3, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + fxcpmadd f0, B4, A7, f0 + fxcsmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcpmadd f1, B4, A8, f1 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L1062 + .align 4 + +.L1063: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + + fxcpmadd f0, B2, A3, f0 + fxcsmadd f2, B2, A3, f2 + fxcpmadd f1, B2, A4, f1 + fxcsmadd f3, B2, A4, f3 + + fxcpmadd f0, B3, A5, f0 + fxcsmadd f2, B3, A5, f2 + fxcpmadd f1, B3, A6, f1 + fxcsmadd f3, B3, A6, f3 + + fxcpmadd f0, B4, A7, f0 + fxcsmadd f2, B4, A7, f2 + fxcpmadd f1, B4, A8, f1 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L1064: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L1068 + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdz- .L1067 + .align 4 + +.L1066: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdnz+ .L1066 + .align 4 + +.L1067: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + .align 4 + +.L1068: +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + LFDUX A2, CO1, INC2 + LFDUX A3, CO2, INC + LFDUX A4, CO2, INC2 + + LFSDUX A1, CO1, INCM1 + LFSDUX A2, CO1, INC2 + LFSDUX A3, CO2, INCM1 + LFSDUX A4, CO2, INC2 + + fxcpmadd f0, AP, f0, A1 + fxcpmadd f1, AP, f1, A2 + fxcpmadd f2, AP, f2, A3 + STFDUX f0, CO1, INCM3 + STFSDUX f0, CO1, INC + + fxcpmadd f3, AP, f3, A4 + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + STFDUX f2, CO2, INCM3 + STFSDUX f2, CO2, INC + + STFDUX f3, CO2, INC + STFSDUX f3, CO2, INC +#else + fpmul f0, AP, f0 + fpmul f1, AP, f1 + fpmul f2, AP, f2 + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + + fpmul f3, AP, f3 + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + STFDUX f2, CO2, INC + STFSDUX f2, CO2, INC + + STFDUX f3, CO2, INC + STFSDUX f3, CO2, INC +#endif + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L1070: + andi. I, M, 2 + beq .L1080 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 +#else + slwi TEMP, KK, 1 + BASE_SHIFT + slwi r0, KK, 1 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + srawi. r0, TEMP, 3 + fpmr f2, f0 + mtspr CTR, r0 + fpmr f3, f0 + ble .L1074 +#else + addi BO, B, - 2 * SIZE + fpmr f1, f0 + + srawi. r0, K, 3 + fpmr f2, f0 + mtspr CTR, r0 + fpmr f3, f0 + ble .L1074 +#endif + + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdz- .L1073 + .align 4 + +.L1072: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + fxcpmadd f2, B2, A2, f2 + fxcsmadd f3, B2, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A3, f0 + fxcsmadd f1, B3, A3, f1 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + fxcpmadd f2, B4, A4, f2 + fxcsmadd f3, B4, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f1, B5, A5, f1 + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + fxcpmadd f2, B6, A6, f2 + fxcsmadd f3, B6, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f1, A9, A7, f1 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + fxcpmadd f2, A10, A8, f2 + fxcsmadd f3, A10, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdnz+ .L1072 + .align 4 + +.L1073: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + fxcpmadd f2, B2, A2, f2 + fxcsmadd f3, B2, A2, f3 + + fxcpmadd f0, B3, A3, f0 + fxcsmadd f1, B3, A3, f1 + fxcpmadd f2, B4, A4, f2 + fxcsmadd f3, B4, A4, f3 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f1, B5, A5, f1 + fxcpmadd f2, B6, A6, f2 + fxcsmadd f3, B6, A6, f3 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f1, A9, A7, f1 + fxcpmadd f2, A10, A8, f2 + fxcsmadd f3, A10, A8, f3 + .align 4 + +.L1074: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP +#else + andi. r0, K, 7 + mtspr CTR, r0 +#endif + ble+ .L1078 + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdz- .L1077 + .align 4 + +.L1076: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L1076 + .align 4 + +.L1077: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + .align 4 + +.L1078: +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + LFDUX B3, CO2, INC + LFSDUX A1, CO1, INC + LFSDUX B3, CO2, INC + + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fxcpmadd f0, AP, f0, A1 + fxcpmadd f1, AP, f1, B3 + + STFDUX f0, CO1, INCM1 + STFSDUX f0, CO1, INC + STFDUX f1, CO2, INCM1 + STFSDUX f1, CO2, INC +#else + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fpmul f0, AP, f0 + fpmul f1, AP, f1 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO2, INC + STFSDUX f1, CO2, INC +#endif + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L1080: + andi. I, M, 1 + beq .L1089 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 +#else + slwi TEMP, KK, 0 + BASE_SHIFT + slwi r0, KK, 1 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L1084 +#else + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, K, 3 + mtspr CTR, r0 + ble .L1084 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L1083 + .align 4 + +.L1082: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A1, B2, f1 + LFPDUX B2, BO, INC2 + LFPDUX A1, AO, INC2 + fxcpmadd f2, A2, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A2, B4, f3 + LFPDUX B4, BO, INC2 + LFPDUX A2, AO, INC2 + + fxcpmadd f0, A3, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A3, B2, f1 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + fxcpmadd f2, A4, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A4, B4, f3 + LFPDUX B4, BO, INC2 + LFPDUX A4, AO, INC2 + bdnz+ .L1082 + .align 4 + +.L1083: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A1, B2, f1 + LFPDUX B2, BO, INC2 + fxcpmadd f2, A2, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A2, B4, f3 + LFPDUX B4, BO, INC2 + + fxcpmadd f0, A3, B1, f0 + fxcsmadd f1, A3, B2, f1 + fxcpmadd f2, A4, B3, f2 + fxcsmadd f3, A4, B4, f3 + .align 4 + +.L1084: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP +#else + andi. r0, K, 7 + mtspr CTR, r0 +#endif + ble+ .L1088 + + LFDX A1, AO, INC2 + LFPDUX B1, BO, INC2 + add AO, AO, INC + bdz- .L1087 + .align 4 + +.L1086: + fxcpmadd f0, A1, B1, f0 + LFDX A1, AO, INC2 + LFPDUX B1, BO, INC2 + add AO, AO, INC + bdnz+ .L1086 + .align 4 + +.L1087: + fxcpmadd f0, A1, B1, f0 + .align 4 + +.L1088: +#ifndef TRMMKERNEL + LFDX A1, CO1, INC + LFDX A2, CO2, INC + + fpadd f0, f0, f1 + fpadd f2, f2, f3 + fsmfp A1, A2 + fpadd f0, f0, f2 + fxcpmadd f0, AP, f0, A1 +#else + fpadd f0, f0, f1 + fpadd f2, f2, f3 + fsmfp A1, A2 + fpadd f0, f0, f2 + fpmul f0, AP, f0 +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO2, INC + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +.L1089: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 2 +#endif + + addi B, BO, 2 * SIZE + .align 4 + +.L1090: + andi. J, N, 1 + beq .L10999 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + mr CO1, C + addi AO, A, -2 * SIZE + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 3 + ble .L10100 + .align 4 + +.L1091: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 +#else + slwi TEMP, KK, 3 + BASE_SHIFT + slwi r0, KK, 0 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 8 +#else + addi TEMP, KK, 1 +#endif + fpmr f2, f0 + srawi. r0, TEMP, 2 + fpmr f3, f0 + mtspr CTR, r0 + ble .L1094 + +#else + srawi. r0, K, 2 + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + mtspr CTR, r0 + ble .L1094 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L1093 + .align 4 + +.L1092: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B1, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B1, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B1, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B1, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B2, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B2, BO, INC2 + bdnz+ .L1092 + .align 4 + +.L1093: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B1, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B1, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B1, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B1, A8, f3 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B2, A1, f0 + fxcpmadd f1, B2, A2, f1 + fxcpmadd f2, B2, A3, f2 + fxcpmadd f3, B2, A4, f3 + + fxcsmadd f0, B2, A5, f0 + fxcsmadd f1, B2, A6, f1 + fxcsmadd f2, B2, A7, f2 + fxcsmadd f3, B2, A8, f3 + .align 4 + +.L1094: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 8 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L1098 + + LFDX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + add BO, BO, INC + bdz- .L1097 + .align 4 + +.L1096: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFDX B1, BO, INC2 + LFPDUX A4, AO, INC2 + add BO, BO, INC + bdnz+ .L1096 + .align 4 + +.L1097: + fxcpmadd f0, B1, A1, f0 + fxcpmadd f1, B1, A2, f1 + fxcpmadd f2, B1, A3, f2 + fxcpmadd f3, B1, A4, f3 + .align 4 + +.L1098: +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + LFDUX B1, CO1, INC2 + LFDUX A3, CO1, INC2 + LFDUX A5, CO1, INC2 + + LFSDUX A1, CO1, INCM5 + LFSDUX B1, CO1, INC2 + LFSDUX A3, CO1, INC2 + LFSDUX A5, CO1, INC2 + + fxcpmadd f0, AP, f0, A1 + fxcpmadd f1, AP, f1, B1 + fxcpmadd f2, AP, f2, A3 + STFDUX f0, CO1, INCM7 + STFSDUX f0, CO1, INC + + fxcpmadd f3, AP, f3, A5 +#else + fpmul f0, AP, f0 + fpmul f1, AP, f1 + fpmul f2, AP, f2 + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + + fpmul f3, AP, f3 +#endif + + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -8 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 8 +#endif +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L1091 + .align 4 + +.L10100: + andi. I, M, 4 + beq .L10110 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 +#else + slwi TEMP, KK, 2 + BASE_SHIFT + slwi r0, KK, 0 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + fpmr f1, f0 + addi BO, BO, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L10104 +#else + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, K, 3 + mtspr CTR, r0 + ble .L10104 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX B4, BO, INC2 + + bdz- .L10103 + .align 4 + +.L10102: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B3, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B3, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B3, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B3, BO, INC2 + + fxcpmadd f0, B4, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B4, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L10102 + .align 4 + +.L10103: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + fxcpmadd f1, B3, A2, f1 + fxcsmadd f2, B3, A3, f2 + fxcsmadd f3, B3, A4, f3 + + fxcpmadd f0, B4, A5, f0 + fxcpmadd f1, B4, A6, f1 + fxcsmadd f2, B4, A7, f2 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L10104: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP +#else + andi. r0, K, 7 + mtspr CTR, r0 +#endif + ble+ .L10108 + + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + LFPDUX A2, AO, INC2 + add BO, BO, INC + bdz- .L10107 + .align 4 + +.L10106: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFDX B1, BO, INC2 + LFPDUX A2, AO, INC2 + add BO, BO, INC + bdnz+ .L10106 + .align 4 + +.L10107: + fxcpmadd f0, B1, A1, f0 + fxcpmadd f1, B1, A2, f1 + .align 4 + +.L10108: +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + LFDUX B1, CO1, INC2 + LFSDUX A1, CO1, INCM1 + LFSDUX B1, CO1, INC2 + + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fxcpmadd f0, AP, f0, A1 + fxcpmadd f1, AP, f1, B1 + + STFDUX f0, CO1, INCM3 + STFSDUX f0, CO1, INC +#else + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fpmul f0, AP, f0 + fpmul f1, AP, f1 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC +#endif + + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L10110: + andi. I, M, 2 + beq .L10120 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 +#else + slwi TEMP, KK, 1 + BASE_SHIFT + slwi r0, KK, 0 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + fpmr f1, f0 + addi BO, BO, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L10114 +#else + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, K, 3 + mtspr CTR, r0 + ble .L10114 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L10113 + .align 4 + +.L10112: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcsmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + fxcpmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + fxcpmadd f0, B3, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B3, A6, f1 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + fxcpmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L10112 + .align 4 + +.L10113: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A2, f1 + fxcpmadd f2, B2, A3, f2 + fxcsmadd f3, B2, A4, f3 + fxcpmadd f0, B3, A5, f0 + fxcsmadd f1, B3, A6, f1 + fxcpmadd f2, B4, A7, f2 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L10114: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP +#else + andi. r0, K, 7 + mtspr CTR, r0 +#endif + ble+ .L10118 + + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + add BO, BO, INC + bdz- .L10117 + .align 4 + +.L10116: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + add BO, BO, INC + bdnz+ .L10116 + .align 4 + +.L10117: + fxcpmadd f0, B1, A1, f0 + .align 4 + +.L10118: +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + LFDUX A2, CO1, INC + + fpadd f0, f0, f1 + fpadd f2, f3, f2 + fsmfp A1, A2 + fpadd f0, f0, f2 + fxcpmadd f1, AP, f0, A1 + + li r0, FZERO + lfpsx f0, SP, r0 + + STFDUX f1, CO1, INCM1 + STFSDUX f1, CO1, INC +#else + fpadd f0, f0, f1 + fpadd f2, f3, f2 + fsmfp A1, A2 + fpadd f0, f0, f2 + fpmul f1, AP, f0 + + li r0, FZERO + lfpsx f0, SP, r0 + + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC +#endif + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +.L10120: + andi. I, M, 1 + beq .L10999 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 +#else + slwi TEMP, KK, 0 + BASE_SHIFT + slwi r0, KK, 0 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + fpmr f1, f0 + addi BO, BO, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L10124 +#else + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, K, 3 + mtspr CTR, r0 + ble .L10124 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L10123 + .align 4 + +.L10122: + fpmadd f0, A1, B1, f0 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + fpmadd f1, A2, B2, f1 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + fpmadd f2, A3, B3, f2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + fpmadd f3, A4, B4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L10122 + .align 4 + +.L10123: + fpmadd f0, A1, B1, f0 + fpmadd f1, A2, B2, f1 + fpmadd f2, A3, B3, f2 + fpmadd f3, A4, B4, f3 + .align 4 + +.L10124: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP +#else + andi. r0, K, 7 + mtspr CTR, r0 +#endif + ble+ .L10128 + + LFDX A1, AO, INC2 + LFDX B1, BO, INC2 + add AO, AO, INC + add BO, BO, INC + bdz- .L10127 + .align 4 + +.L10126: + fmadd f0, A1, B1, f0 + LFDX A1, AO, INC2 + LFDX B1, BO, INC2 + add AO, AO, INC + add BO, BO, INC + bdnz+ .L10126 + .align 4 + +.L10127: + fmadd f0, A1, B1, f0 + .align 4 + +.L10128: +#ifndef TRMMKERNEL + LFDX A1, CO1, INC + fpadd f0, f0, f1 + fpadd f2, f2, f3 + fpadd f0, f0, f2 + fsmtp f1, f0 + fadd f0, f0, f1 + fmadd f0, AP, f0, A1 + STFDUX f0, CO1, INC +#else + fpadd f0, f0, f1 + fpadd f2, f2, f3 + fpadd f0, f0, f2 + fsmtp f1, f0 + fadd f0, f0, f1 + fmul f0, AP, f0 + STFDUX f0, CO1, INC +#endif + .align 4 + +.L10999: + addi SP, SP, 12 + + lwzu r14, 4(SP) + lwzu r15, 4(SP) + + lwzu r16, 4(SP) + lwzu r17, 4(SP) + lwzu r18, 4(SP) + lwzu r19, 4(SP) + + lwzu r20, 4(SP) + lwzu r21, 4(SP) + lwzu r22, 4(SP) + lwzu r23, 4(SP) + + lwzu r24, 4(SP) + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f31, SP, r0 + lfpdux f30, SP, r0 + lfpdux f29, SP, r0 + lfpdux f28, SP, r0 + lfpdux f27, SP, r0 + lfpdux f26, SP, r0 + lfpdux f25, SP, r0 + lfpdux f24, SP, r0 + lfpdux f23, SP, r0 + lfpdux f22, SP, r0 + lfpdux f21, SP, r0 + lfpdux f20, SP, r0 + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + + + EPILOGUE +#endif diff --git a/kernel/power/gemm_kernel_power3.S b/kernel/power/gemm_kernel_power3.S new file mode 100644 index 0000000..92e8e9f --- /dev/null +++ b/kernel/power/gemm_kernel_power3.S @@ -0,0 +1,1664 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#endif +#endif + +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREB r30 +#define PREC r31 + +#ifndef NEEDPARAM + +#ifndef DOUBLE +#include "../sparam.h" +#else +#include "../dparam.h" +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) +#endif + + stfd f1, ALPHA + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST + li PREA, (16 * 5 * SIZE + 16) + li PREB, (16 * 5 * SIZE + 16) + li PREC, 4 * SIZE +#else + +#ifdef linux +#ifndef __64BIT__ + mr PREA, r10 + lwz PREB, 8 + STACKSIZE(SP) + lwz PREC, 12 + STACKSIZE(SP) +#else + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 60 + STACKSIZE(SP) + lwz PREB, 64 + STACKSIZE(SP) + lwz PREC, 68 + STACKSIZE(SP) +#else + lwz PREA, 56 + STACKSIZE(SP) + lwz PREB, 60 + STACKSIZE(SP) + lwz PREC, 64 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + + lfs f0, FZERO + + srawi. J, N, 2 + ble LL(40) + .align 4 + +LL(10): + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 2 + mr AO, A + add C, CO4, LDC + ble LL(20) + .align 4 + +LL(11): + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + +#if 0 + PREFETCH_C1 + PREFETCH_C2 + PREFETCH_C3 + PREFETCH_C4 +#endif + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + ble LL(15) + .align 4 + +LL(12): + fmadd f0, f16, f20, f0 + fmadd f4, f16, f21, f4 + LFD f28, 4 * SIZE(BO) + fmadd f8, f16, f22, f8 + fmadd f12, f16, f23, f12 + LFD f16, 8 * SIZE(AO) + + fmadd f1, f17, f20, f1 + fmadd f5, f17, f21, f5 + LFD f29, 5 * SIZE(BO) + fmadd f9, f17, f22, f9 + fmadd f13, f17, f23, f13 + LFD f17, 9 * SIZE(AO) + + fmadd f2, f18, f20, f2 + fmadd f6, f18, f21, f6 + LFD f30, 6 * SIZE(BO) + fmadd f10, f18, f22, f10 + fmadd f14, f18, f23, f14 + LFD f18, 10 * SIZE(AO) + + fmadd f3, f19, f20, f3 + fmadd f7, f19, f21, f7 + LFD f31, 7 * SIZE(BO) + fmadd f11, f19, f22, f11 + fmadd f15, f19, f23, f15 + LFD f19, 11 * SIZE(AO) + + fmadd f0, f24, f28, f0 + fmadd f4, f24, f29, f4 + LFD f20, 8 * SIZE(BO) + fmadd f8, f24, f30, f8 + fmadd f12, f24, f31, f12 + LFD f24, 12 * SIZE(AO) + + fmadd f1, f25, f28, f1 + fmadd f5, f25, f29, f5 + LFD f21, 9 * SIZE(BO) + fmadd f9, f25, f30, f9 + fmadd f13, f25, f31, f13 + LFD f25, 13 * SIZE(AO) + + fmadd f2, f26, f28, f2 + fmadd f6, f26, f29, f6 + LFD f22, 10 * SIZE(BO) + fmadd f10, f26, f30, f10 + fmadd f14, f26, f31, f14 + LFD f26, 14 * SIZE(AO) + + fmadd f3, f27, f28, f3 + fmadd f7, f27, f29, f7 + LFD f23, 11 * SIZE(BO) + fmadd f11, f27, f30, f11 + fmadd f15, f27, f31, f15 + LFD f27, 15 * SIZE(AO) + + fmadd f0, f16, f20, f0 + fmadd f4, f16, f21, f4 + LFD f28, 12 * SIZE(BO) + fmadd f8, f16, f22, f8 + fmadd f12, f16, f23, f12 + LFDU f16, 16 * SIZE(AO) + + fmadd f1, f17, f20, f1 + fmadd f5, f17, f21, f5 + LFD f29, 13 * SIZE(BO) + fmadd f9, f17, f22, f9 + fmadd f13, f17, f23, f13 + LFD f17, 1 * SIZE(AO) + + fmadd f2, f18, f20, f2 + fmadd f6, f18, f21, f6 + LFD f30, 14 * SIZE(BO) + fmadd f10, f18, f22, f10 + fmadd f14, f18, f23, f14 + LFD f18, 2 * SIZE(AO) + + fmadd f3, f19, f20, f3 + fmadd f7, f19, f21, f7 + LFD f31, 15 * SIZE(BO) + fmadd f11, f19, f22, f11 + fmadd f15, f19, f23, f15 + LFD f19, 3 * SIZE(AO) + + fmadd f0, f24, f28, f0 + fmadd f4, f24, f29, f4 + LFDU f20, 16 * SIZE(BO) + fmadd f8, f24, f30, f8 + fmadd f12, f24, f31, f12 + LFD f24, 4 * SIZE(AO) + + fmadd f1, f25, f28, f1 + fmadd f5, f25, f29, f5 + LFD f21, 1 * SIZE(BO) + fmadd f9, f25, f30, f9 + fmadd f13, f25, f31, f13 + LFD f25, 5 * SIZE(AO) + + fmadd f2, f26, f28, f2 + fmadd f6, f26, f29, f6 + LFD f22, 2 * SIZE(BO) + fmadd f10, f26, f30, f10 + fmadd f14, f26, f31, f14 + LFD f26, 6 * SIZE(AO) + + fmadd f3, f27, f28, f3 + fmadd f7, f27, f29, f7 + LFD f23, 3 * SIZE(BO) + + fmadd f11, f27, f30, f11 + fmadd f15, f27, f31, f15 + LFD f27, 7 * SIZE(AO) + bdnz LL(12) + .align 4 + +LL(15): + andi. r0, K, 3 + lfd f30, ALPHA + lfs f31, FZERO + mtspr CTR, r0 + ble+ LL(18) + .align 4 + +LL(16): + fmadd f0, f16, f20, f0 + fmadd f4, f16, f21, f4 + fmadd f8, f16, f22, f8 + fmadd f12, f16, f23, f12 + LFD f16, 4 * SIZE(AO) + + fmadd f1, f17, f20, f1 + fmadd f5, f17, f21, f5 + fmadd f9, f17, f22, f9 + fmadd f13, f17, f23, f13 + LFD f17, 5 * SIZE(AO) + + fmadd f2, f18, f20, f2 + fmadd f6, f18, f21, f6 + fmadd f10, f18, f22, f10 + fmadd f14, f18, f23, f14 + LFD f18, 6 * SIZE(AO) + + fmadd f3, f19, f20, f3 + LFD f20, 4 * SIZE(BO) + fmadd f7, f19, f21, f7 + LFD f21, 5 * SIZE(BO) + fmadd f11, f19, f22, f11 + LFD f22, 6 * SIZE(BO) + fmadd f15, f19, f23, f15 + LFD f19, 7 * SIZE(AO) + + LFD f23, 7 * SIZE(BO) + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(18): + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + fmadd f0, f0, f30, f16 + LFD f16, 0 * SIZE(CO3) + fmadd f1, f1, f30, f17 + LFD f17, 1 * SIZE(CO3) + fmadd f2, f2, f30, f18 + LFD f18, 2 * SIZE(CO3) + fmadd f3, f3, f30, f19 + LFD f19, 3 * SIZE(CO3) + + fmadd f4, f4, f30, f20 + LFD f20, 0 * SIZE(CO4) + fmadd f5, f5, f30, f21 + LFD f21, 1 * SIZE(CO4) + fmadd f6, f6, f30, f22 + LFD f22, 2 * SIZE(CO4) + fmadd f7, f7, f30, f23 + LFD f23, 3 * SIZE(CO4) + + fmadd f8, f8, f30, f16 + fmadd f9, f9, f30, f17 + STFD f0, 0 * SIZE(CO1) + + fmadd f10, f10, f30, f18 + fmadd f11, f11, f30, f19 + STFD f1, 1 * SIZE(CO1) + + fmadd f12, f12, f30, f20 + fmadd f13, f13, f30, f21 + STFD f2, 2 * SIZE(CO1) + + fmadd f14, f14, f30, f22 + fmadd f15, f15, f30, f23 + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + fmr f0, f31 + fmr f1, f31 + STFD f5, 1 * SIZE(CO2) + fmr f2, f31 + fmr f3, f31 + + STFD f6, 2 * SIZE(CO2) + fmr f4, f31 + fmr f5, f31 + STFD f7, 3 * SIZE(CO2) + fmr f6, f31 + fmr f7, f31 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + addi CO1, CO1, 4 * SIZE + fmr f8, f31 + fmr f9, f31 + + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + addi CO2, CO2, 4 * SIZE + fmr f10, f31 + fmr f11, f31 + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + addi CO3, CO3, 4 * SIZE + fmr f12, f31 + fmr f13, f31 + + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + addi CO4, CO4, 4 * SIZE + fmr f14, f31 + fmr f15, f31 + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. I, M, 2 + ble LL(30) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + ble LL(25) + .align 5 + +LL(22): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f20, f1 + fmadd f4, f16, f21, f4 + fmadd f5, f17, f21, f5 + + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f2, f18, f24, f2 + fmadd f3, f19, f24, f3 + fmadd f6, f18, f25, f6 + fmadd f7, f19, f25, f7 + + fmadd f10, f18, f26, f10 + fmadd f11, f19, f26, f11 + fmadd f14, f18, f27, f14 + fmadd f15, f19, f27, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f20, f0 + fmadd f1, f17, f20, f1 + fmadd f4, f16, f21, f4 + fmadd f5, f17, f21, f5 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + fmadd f2, f18, f24, f2 + fmadd f3, f19, f24, f3 + fmadd f6, f18, f25, f6 + fmadd f7, f19, f25, f7 + + fmadd f10, f18, f26, f10 + fmadd f11, f19, f26, f11 + fmadd f14, f18, f27, f14 + fmadd f15, f19, f27, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(22) + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +LL(25): + lfd f30, ALPHA + andi. r0, K, 3 + mtspr CTR, r0 + ble+ LL(28) + .align 4 + +LL(26): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f20, f1 + fmadd f4, f16, f21, f4 + fmadd f5, f17, f21, f5 + + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(28): + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) + + fmadd f0, f0, f30, f16 + fmadd f1, f1, f30, f17 + fmadd f4, f4, f30, f18 + fmadd f5, f5, f30, f19 + + LFD f20, 0 * SIZE(CO3) + LFD f21, 1 * SIZE(CO3) + LFD f22, 0 * SIZE(CO4) + LFD f23, 1 * SIZE(CO4) + + fmadd f8, f8, f30, f20 + fmadd f9, f9, f30, f21 + fmadd f12, f12, f30, f22 + fmadd f13, f13, f30, f23 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE + .align 4 + +LL(30): + andi. I, M, 1 + ble LL(39) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + ble LL(35) + .align 5 + +LL(32): + fmadd f0, f16, f20, f0 + fmadd f4, f16, f21, f4 + fmadd f8, f16, f22, f8 + fmadd f12, f16, f23, f12 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f1, f17, f24, f1 + fmadd f5, f17, f25, f5 + fmadd f9, f17, f26, f9 + fmadd f13, f17, f27, f13 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + fmadd f0, f18, f20, f0 + fmadd f4, f18, f21, f4 + fmadd f8, f18, f22, f8 + fmadd f12, f18, f23, f12 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + fmadd f1, f19, f24, f1 + fmadd f5, f19, f25, f5 + fmadd f9, f19, f26, f9 + fmadd f13, f19, f27, f13 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(32) + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +LL(35): + lfd f30, ALPHA + andi. r0, K, 3 + mtspr CTR, r0 + ble+ LL(38) + .align 4 + +LL(36): + fmadd f0, f16, f20, f0 + fmadd f4, f16, f21, f4 + fmadd f8, f16, f22, f8 + fmadd f12, f16, f23, f12 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(36) + .align 4 + +LL(38): + LFD f16, 0 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f20, 0 * SIZE(CO3) + LFD f22, 0 * SIZE(CO4) + + fmadd f0, f0, f30, f16 + fmadd f4, f4, f30, f18 + fmadd f8, f8, f30, f20 + fmadd f12, f12, f30, f22 + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + .align 4 + +LL(39): + mr B, BO + addic. J, J, -1 + lfs f0, FZERO + bgt LL(10) + .align 4 + +LL(40): + mr CO1, C + add CO2, C, LDC + andi. J, N, 2 + ble LL(70) + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. I, M, 2 + add C, CO2, LDC + mr AO, A + ble LL(50) + .align 4 + +LL(41): + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + PREFETCH_C1 + PREFETCH_C2 + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + ble LL(45) + .align 5 + +LL(42): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + + fmadd f4, f16, f21, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f22, f0 + fmadd f1, f17, f22, f1 + fmadd f2, f18, f22, f2 + fmadd f3, f19, f22, f3 + + fmadd f4, f16, f23, f4 + fmadd f5, f17, f23, f5 + fmadd f6, f18, f23, f6 + fmadd f7, f19, f23, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + + fmadd f4, f16, f21, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + fmadd f0, f16, f22, f0 + fmadd f1, f17, f22, f1 + fmadd f2, f18, f22, f2 + fmadd f3, f19, f22, f3 + + fmadd f4, f16, f23, f4 + fmadd f5, f17, f23, f5 + fmadd f6, f18, f23, f6 + fmadd f7, f19, f23, f7 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(42) + .align 4 + +LL(45): + lfd f30, ALPHA + andi. r0, K, 3 + mtspr CTR, r0 + ble+ LL(48) + .align 4 + +LL(46): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + + fmadd f4, f16, f21, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(48): + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + fmadd f0, f0, f30, f16 + fmadd f1, f1, f30, f17 + fmadd f2, f2, f30, f18 + fmadd f3, f3, f30, f19 + + fmadd f4, f4, f30, f20 + fmadd f5, f5, f30, f21 + fmadd f6, f6, f30, f22 + fmadd f7, f7, f30, f23 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addic. I, I, -1 + bgt+ LL(41) + .align 4 + +LL(50): + andi. I, M, 2 + ble LL(60) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + ble LL(55) + .align 5 + +LL(52): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f20, f1 + fmadd f2, f16, f21, f2 + fmadd f3, f17, f21, f3 + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f22, f5 + fmadd f6, f18, f23, f6 + fmadd f7, f19, f23, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f0, f16, f24, f0 + fmadd f1, f17, f24, f1 + fmadd f2, f16, f25, f2 + fmadd f3, f17, f25, f3 + + fmadd f4, f18, f26, f4 + fmadd f5, f19, f26, f5 + fmadd f6, f18, f27, f6 + fmadd f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(52) + .align 4 + +LL(55): + lfd f30, ALPHA + andi. r0, K, 3 + mtspr CTR, r0 + ble+ LL(58) + .align 4 + +LL(56): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f20, f1 + fmadd f2, f16, f21, f2 + fmadd f3, f17, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(56) + .align 4 + +LL(58): + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) + + fadd f0, f4, f0 + fadd f1, f5, f1 + fadd f2, f6, f2 + fadd f3, f7, f3 + + fmadd f0, f0, f30, f16 + fmadd f1, f1, f30, f17 + fmadd f2, f2, f30, f18 + fmadd f3, f3, f30, f19 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + .align 4 + +LL(60): + andi. I, M, 1 + ble LL(69) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + ble LL(65) + .align 5 + +LL(62): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f17, f22, f2 + fmadd f3, f17, f23, f3 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f19, f26, f2 + fmadd f3, f19, f27, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): + lfd f30, ALPHA + andi. r0, K, 3 + mtspr CTR, r0 + ble+ LL(68) + .align 4 + +LL(66): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(66) + .align 4 + +LL(68): + LFD f16, 0 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + + fadd f0, f2, f0 + fadd f1, f3, f1 + + fmadd f0, f0, f30, f16 + fmadd f1, f1, f30, f18 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + .align 4 + +LL(69): + mr B, BO + lfs f0, FZERO + .align 4 + +LL(70): + mr CO1, C + andi. J, N, 1 + ble LL(999) + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. I, M, 2 + mr AO, A + ble LL(80) + .align 4 + +LL(71): + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + PREFETCH_C1 + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + ble LL(75) + .align 5 + +LL(72): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f21, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f18, f21, f2 + fmadd f3, f19, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + fmadd f0, f16, f22, f0 + fmadd f1, f17, f22, f1 + fmadd f2, f18, f22, f2 + fmadd f3, f19, f22, f3 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + fmadd f0, f16, f23, f0 + fmadd f1, f17, f23, f1 + fmadd f2, f18, f23, f2 + fmadd f3, f19, f23, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(72) + .align 4 + +LL(75): + lfd f30, ALPHA + andi. r0, K, 3 + mtspr CTR, r0 + ble+ LL(78) + .align 4 + +LL(76): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(76) + .align 4 + +LL(78): + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + fmadd f0, f0, f30, f16 + fmadd f1, f1, f30, f17 + fmadd f2, f2, f30, f18 + fmadd f3, f3, f30, f19 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + addi CO1, CO1, 4 * SIZE + addic. I, I, -1 + bgt+ LL(71) + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(90) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + ble LL(85) + .align 5 + +LL(82): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f20, f1 + fmadd f2, f18, f21, f2 + fmadd f3, f19, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f22, f0 + fmadd f1, f17, f22, f1 + fmadd f2, f18, f23, f2 + fmadd f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(82) + .align 4 + +LL(85): + lfd f30, ALPHA + andi. r0, K, 3 + mtspr CTR, r0 + ble+ LL(88) + .align 4 + +LL(86): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f20, f1 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(86) + .align 4 + +LL(88): + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + + fadd f0, f2, f0 + fadd f1, f3, f1 + + fmadd f0, f0, f30, f16 + fmadd f1, f1, f30, f17 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + addi CO1, CO1, 2 * SIZE + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(999) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 3 + mtspr CTR, r0 + mr BO, B + ble LL(95) + .align 5 + +LL(92): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f18, f22, f2 + fmadd f3, f19, f23, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f18, f22, f2 + fmadd f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(92) + .align 4 + +LL(95): + lfd f30, ALPHA + andi. r0, K, 7 + mtspr CTR, r0 + ble+ LL(98) + .align 4 + +LL(96): + fmadd f0, f16, f20, f0 + LFD f16, 1 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + addi BO, BO, 1 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(96) + .align 4 + +LL(98): + LFD f16, 0 * SIZE(CO1) + + fadd f0, f1, f0 + fadd f2, f3, f2 + fadd f0, f2, f0 + + fmadd f0, f0, f30, f16 + STFD f0, 0 * SIZE(CO1) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/gemm_kernel_power6.S b/kernel/power/gemm_kernel_power6.S new file mode 100644 index 0000000..b10a042 --- /dev/null +++ b/kernel/power/gemm_kernel_power6.S @@ -0,0 +1,2667 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define TEMP r19 +#define KK r20 +#define BB r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO1 r26 +#define CO2 r27 +#define CO3 r28 +#define CO4 r29 + +#define PREA r30 +#define PREC r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) +#ifdef TRMMKERNEL + std r20, 232(SP) + std r19, 240(SP) +#endif +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) +#ifdef TRMMKERNEL + stw r20, 188(SP) + stw r19, 192(SP) +#endif +#endif + + stfd f1, ALPHA + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + lfs f0, FZERO + li PREA, (16 * 3) * SIZE + srawi. J, N, 2 + + li PREC, 3 * SIZE + ble LL(40) + .align 4 + +LL(10): + mr CO1, C + fmr f1, f0 + add CO2, C, LDC + fmr f2, f0 + add CO3, CO2, LDC + fmr f3, f0 + add CO4, CO3, LDC + fmr f4, f0 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + slwi BB, K, BASE_SHIFT + 2 + fmr f5, f0 + + srawi. I, M, 2 + fmr f6, f0 + + mr AO, A + fmr f7, f0 + add C, CO4, LDC + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + ble LL(20) + .align 4 + +LL(11): +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + LFD f16, 0 * SIZE(AO) + LFD f20, 0 * SIZE(B) + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(B) + + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(B) + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, r0 + + LFD f16, 0 * SIZE(AO) + LFD f20, 0 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) +#endif + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 4 +#endif + srawi. TEMP, TEMP, 3 + mtspr CTR, TEMP + ble LL(15) + +#else + + LFD f16, 0 * SIZE(AO) + LFD f20, 0 * SIZE(B) + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(B) + + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(B) + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, K, 3 + mtctr r0 + mr BO, B + ble LL(15) +#endif + .align 4 + +LL(12): + dcbt AO, PREA + FMADD f0, f16, f20, f0 + nop + FMADD f4, f16, f21, f4 + + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) + + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + LFD f24, 20 * SIZE(AO) + LFD f25, 21 * SIZE(AO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + LFD f28, 20 * SIZE(BO) + LFD f29, 21 * SIZE(BO) + + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + LFD f26, 22 * SIZE(AO) + LFD f27, 23 * SIZE(AO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + LFD f30, 22 * SIZE(BO) + LFD f31, 23 * SIZE(BO) + + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + LFD f16, 24 * SIZE(AO) + LFD f17, 25 * SIZE(AO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + LFD f20, 24 * SIZE(BO) + LFD f21, 25 * SIZE(BO) + + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + LFD f18, 26 * SIZE(AO) + LFD f19, 27 * SIZE(AO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + LFD f22, 26 * SIZE(BO) + LFD f23, 27 * SIZE(BO) + + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + LFD f24, 28 * SIZE(AO) + LFD f25, 29 * SIZE(AO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + LFD f28, 28 * SIZE(BO) + LFD f29, 29 * SIZE(BO) + + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + LFD f26, 30 * SIZE(AO) + LFD f27, 31 * SIZE(AO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + LFD f30, 30 * SIZE(BO) + LFD f31, 31 * SIZE(BO) + + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + LFD f16, 32 * SIZE(AO) + LFD f17, 33 * SIZE(AO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + LFD f20, 32 * SIZE(BO) + LFD f21, 33 * SIZE(BO) + + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + LFD f18, 34 * SIZE(AO) + LFD f19, 35 * SIZE(AO) + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + LFD f22, 34 * SIZE(BO) + LFD f23, 35 * SIZE(BO) + + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + addi AO, AO, 32 * SIZE + addi BO, BO, 32 * SIZE + bdnz LL(12) + .align 4 + +LL(15): + lfd f30, ALPHA + + dcbtst B, BB + addi BB, BB, 16 * SIZE + dcbtst B, BB + addi BB, BB, 16 * SIZE + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 4 +#endif + + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP +#else + + andi. r0, K, 7 + mtspr CTR, r0 + +#endif + ble+ LL(18) + .align 4 + +LL(16): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + nop + bdnz LL(16) + .align 4 + +LL(18): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 + + FMADD f4, f4, f30, f20 + FMADD f5, f5, f30, f21 + FMADD f6, f6, f30, f22 + FMADD f7, f7, f30, f23 + + LFD f16, 0 * SIZE(CO3) + LFD f17, 1 * SIZE(CO3) + LFD f18, 2 * SIZE(CO3) + LFD f19, 3 * SIZE(CO3) + + LFD f20, 0 * SIZE(CO4) + LFD f21, 1 * SIZE(CO4) + LFD f22, 2 * SIZE(CO4) + LFD f23, 3 * SIZE(CO4) + + FMADD f8, f8, f30, f16 + FMADD f9, f9, f30, f17 + FMADD f10, f10, f30, f18 + FMADD f11, f11, f30, f19 + + FMADD f12, f12, f30, f20 + FMADD f13, f13, f30, f21 + FMADD f14, f14, f30, f22 + FMADD f15, f15, f30, f23 + +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 + + FMUL f4, f4, f30 + FMUL f5, f5, f30 + FMUL f6, f6, f30 + FMUL f7, f7, f30 + + FMUL f8, f8, f30 + FMUL f9, f9, f30 + FMUL f10, f10, f30 + FMUL f11, f11, f30 + + FMUL f12, f12, f30 + FMUL f13, f13, f30 + FMUL f14, f14, f30 + FMUL f15, f15, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -4 +#endif + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. I, M, 2 + ble LL(30) + +#if defined(TRMMKERNEL) +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(25) + .align 5 + +LL(22): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 16 * SIZE + bdnz LL(22) + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +LL(25): + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(28) + .align 4 + +LL(26): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(28): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f4, f4, f30, f18 + FMADD f5, f5, f30, f19 + + LFD f20, 0 * SIZE(CO3) + LFD f21, 1 * SIZE(CO3) + LFD f22, 0 * SIZE(CO4) + LFD f23, 1 * SIZE(CO4) + + FMADD f8, f8, f30, f20 + FMADD f9, f9, f30, f21 + FMADD f12, f12, f30, f22 + FMADD f13, f13, f30, f23 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f4, f4, f30 + FMUL f5, f5, f30 + + FMUL f8, f8, f30 + FMUL f9, f9, f30 + FMUL f12, f12, f30 + FMUL f13, f13, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +LL(30): + andi. I, M, 1 + ble LL(39) + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(35) + .align 5 + +LL(32): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f1, f17, f24, f1 + FMADD f5, f17, f25, f5 + FMADD f9, f17, f26, f9 + FMADD f13, f17, f27, f13 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f0, f18, f20, f0 + FMADD f4, f18, f21, f4 + FMADD f8, f18, f22, f8 + FMADD f12, f18, f23, f12 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f1, f19, f24, f1 + FMADD f5, f19, f25, f5 + FMADD f9, f19, f26, f9 + FMADD f13, f19, f27, f13 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 16 * SIZE + bdnz LL(32) + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +LL(35): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(36) + .align 4 + +LL(38): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f20, 0 * SIZE(CO3) + LFD f22, 0 * SIZE(CO4) + + FMADD f0, f0, f30, f16 + FMADD f4, f4, f30, f18 + FMADD f8, f8, f30, f20 + FMADD f12, f12, f30, f22 +#else + FMUL f0, f0, f30 + FMUL f4, f4, f30 + FMUL f8, f8, f30 + FMUL f12, f12, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + + +LL(39): +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 4 +#endif + + lfs f0, FZERO + + mr B, BO + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(40): + mr CO1, C + add CO2, C, LDC + andi. J, N, 2 + ble LL(70) + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. I, M, 2 + add C, CO2, LDC + mr AO, A + ble LL(50) + .align 4 + +LL(41): +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + + dcbtst CO1, PREC + dcbtst CO2, PREC + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(45) + .align 5 + +LL(42): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ LL(48) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(48): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 + + FMADD f4, f4, f30, f20 + FMADD f5, f5, f30, f21 + FMADD f6, f6, f30, f22 + FMADD f7, f7, f30, f23 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 + + FMUL f4, f4, f30 + FMUL f5, f5, f30 + FMUL f6, f6, f30 + FMUL f7, f7, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addic. I, I, -1 + bgt+ LL(41) + .align 4 + +LL(50): + andi. I, M, 2 + ble LL(60) + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(55) + .align 5 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f22, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f17, f24, f1 + FMADD f2, f16, f25, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f19, f26, f5 + FMADD f6, f18, f27, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(52) + .align 4 + +LL(55): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ LL(58) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(56) + .align 4 + +LL(58): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) + + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 +#else + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +LL(60): + andi. I, M, 1 + ble LL(69) + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(65) + .align 5 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f17, f22, f2 + FMADD f3, f17, f23, f3 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f19, f26, f2 + FMADD f3, f19, f27, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(68) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(66) + .align 4 + +LL(68): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f18 +#else + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +LL(69): +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 2 +#endif + + mr B, BO + .align 4 + +LL(70): + mr CO1, C + andi. J, N, 1 + ble LL(999) + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. I, M, 2 + mr AO, A + ble LL(80) + .align 4 + +LL(71): +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + + dcbtst CO1, PREC + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbtst CO1, PREC + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(75) + .align 5 + +LL(72): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f21, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f23, f0 + FMADD f1, f17, f23, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(72) + .align 4 + +LL(75): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(78) + .align 4 + +LL(76): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(76) + .align 4 + +LL(78): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0 , TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addi CO1, CO1, 4 * SIZE + addic. I, I, -1 + bgt+ LL(71) + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(90) + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + +#endif + ble LL(85) + .align 5 + +LL(82): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(82) + .align 4 + +LL(85): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(88) + .align 4 + +LL(86): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(86) + .align 4 + +LL(88): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 +#else + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + addi CO1, CO1, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0 , TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(999) + + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 3 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(95) + .align 5 + +LL(92): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(92) + .align 4 + +LL(95): + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP + +#else + + andi. r0, K, 7 + mtspr CTR, r0 + +#endif + ble+ LL(98) + .align 4 + +LL(96): + FMADD f0, f16, f20, f0 + LFD f16, 1 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + addi BO, BO, 1 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(96) + .align 4 + +LL(98): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + + FMADD f0, f0, f30, f16 +#else + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + + FMUL f0, f0, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) +#ifdef TRMMKERNEL + ld r20, 232(SP) + ld r19, 240(SP) +#endif +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) +#ifdef TRMMKERNEL + lwz r20, 188(SP) + lwz r19, 192(SP) +#endif +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/gemm_kernel_ppc440.S b/kernel/power/gemm_kernel_ppc440.S new file mode 100644 index 0000000..5d3b306 --- /dev/null +++ b/kernel/power/gemm_kernel_ppc440.S @@ -0,0 +1,2470 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 + + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) +#if defined(TRMMKERNEL) + std r19, 240(SP) + std r18, 248(SP) +#endif +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) +#if defined(TRMMKERNEL) + stw r19, 192(SP) + stw r18, 196(SP) +#endif +#endif + + stfd f1, ALPHA + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + srawi. J, N, 2 + ble .L40 + .align 4 + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define B1 f22 +#define B2 f23 +#define B3 f24 +#define B4 f25 +#define B5 f26 +#define B6 f27 +#define B7 f28 +#define B8 f29 +#define B9 f30 +#define B10 f31 + + +.L10: + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 2 + mr AO, A + add C, CO4, LDC + ble .L20 + .align 4 + +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(B) + LFD B2, 1 * SIZE(B) + LFD B3, 2 * SIZE(B) + LFD B4, 3 * SIZE(B) + LFD B5, 4 * SIZE(B) + LFD B6, 8 * SIZE(B) + LFD B7, 12 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, r0 + + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(BO) + LFD B2, 1 * SIZE(BO) + LFD B3, 2 * SIZE(BO) + LFD B4, 3 * SIZE(BO) + LFD B5, 4 * SIZE(BO) + LFD B6, 8 * SIZE(BO) + LFD B7, 12 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 4 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble .L15 + +#else + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(B) + LFD B2, 1 * SIZE(B) + LFD B3, 2 * SIZE(B) + LFD B4, 3 * SIZE(B) + LFD B5, 4 * SIZE(B) + LFD B6, 8 * SIZE(B) + LFD B7, 12 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + ble .L15 +#endif + .align 4 + +.L12: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + LFD A6, 12 * SIZE(AO) + FMADD f8, A1, B3, f8 + nop + FMADD f12, A1, B4, f12 + nop + + FMADD f1, A2, B1, f1 + LFD A1, 3 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B1, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 5 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 6 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10, 7 * SIZE(BO) + + FMADD f3, A1, B1, f3 + LFD A2, 5 * SIZE(AO) + FMADD f7, A1, B2, f7 + LFD B1, 16 * SIZE(BO) + FMADD f11, A1, B3, f11 + nop + FMADD f15, A1, B4, f15 + nop + + FMADD f0, A4, B5, f0 + LFD A3, 6 * SIZE(AO) + FMADD f4, A4, B8, f4 + LFD A1, 16 * SIZE(AO) + FMADD f8, A4, B9, f8 + nop + FMADD f12, A4, B10, f12 + nop + + FMADD f1, A2, B5, f1 + LFD A4, 7 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B5, f2 + nop + FMADD f6, A3, B8, f6 + LFD B2, 9 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 10 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 11 * SIZE(BO) + + FMADD f3, A4, B5, f3 + LFD A2, 9 * SIZE(AO) + FMADD f7, A4, B8, f7 + LFD B5, 20 * SIZE(BO) + FMADD f11, A4, B9, f11 + nop + FMADD f15, A4, B10, f15 + nop + + FMADD f0, A5, B6, f0 + LFD A3, 10 * SIZE(AO) + FMADD f4, A5, B2, f4 + LFD A4, 20 * SIZE(AO) + FMADD f8, A5, B3, f8 + nop + FMADD f12, A5, B4, f12 + nop + + FMADD f1, A2, B6, f1 + LFD A5, 11 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B6, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 13 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 14 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10,15 * SIZE(BO) + + FMADD f3, A5, B6, f3 + LFD A2, 13 * SIZE(AO) + FMADD f7, A5, B2, f7 + LFD B6, 24 * SIZE(BO) + FMADD f11, A5, B3, f11 + nop + FMADD f15, A5, B4, f15 + nop + + FMADD f0, A6, B7, f0 + LFD A3, 14 * SIZE(AO) + FMADD f4, A6, B8, f4 + LFD A5, 24 * SIZE(AO) + FMADD f8, A6, B9, f8 + nop + FMADD f12, A6, B10, f12 + nop + + FMADD f1, A2, B7, f1 + LFD A6, 15 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B7, f2 + addi AO, AO, 16 * SIZE + FMADD f6, A3, B8, f6 + LFD B2, 17 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 18 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 19 * SIZE(BO) + + FMADD f3, A6, B7, f3 + LFD A2, 1 * SIZE(AO) + FMADD f7, A6, B8, f7 + LFD B7, 28 * SIZE(BO) + FMADD f11, A6, B9, f11 + addi BO, BO, 16 * SIZE + FMADD f15, A6, B10, f15 + bdnz .L12 + .align 4 + +.L15: + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 4 +#endif + + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ .L18 + .align 4 + +.L16: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + FMADD f8, A1, B3, f8 + FMADD f12, A1, B4, f12 + LFD A4, 3 * SIZE(AO) + + FMADD f1, A2, B1, f1 + FMADD f5, A2, B2, f5 + FMADD f9, A2, B3, f9 + FMADD f13, A2, B4, f13 + LFDU A1, 4 * SIZE(AO) + + FMADD f2, A3, B1, f2 + FMADD f6, A3, B2, f6 + FMADD f10, A3, B3, f10 + FMADD f14, A3, B4, f14 + LFD A2, 1 * SIZE(AO) + + FMADD f3, A4, B1, f3 + LFDU B1, 4 * SIZE(BO) + FMADD f7, A4, B2, f7 + LFD B2, 1 * SIZE(BO) + FMADD f11, A4, B3, f11 + LFD B3, 2 * SIZE(BO) + FMADD f15, A4, B4, f15 + LFD B4, 3 * SIZE(BO) + bdnz .L16 + .align 4 + +.L18: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + LFD f16, 0 * SIZE(CO3) + FMADD f1, f1, f30, f17 + LFD f17, 1 * SIZE(CO3) + FMADD f2, f2, f30, f18 + LFD f18, 2 * SIZE(CO3) + FMADD f3, f3, f30, f19 + LFD f19, 3 * SIZE(CO3) + + FMADD f4, f4, f30, f20 + LFD f20, 0 * SIZE(CO4) + FMADD f5, f5, f30, f21 + LFD f21, 1 * SIZE(CO4) + FMADD f6, f6, f30, f22 + LFD f22, 2 * SIZE(CO4) + FMADD f7, f7, f30, f23 + LFD f23, 3 * SIZE(CO4) + + FMADD f8, f8, f30, f16 + FMADD f9, f9, f30, f17 + FMADD f10, f10, f30, f18 + FMADD f11, f11, f30, f19 + + FMADD f12, f12, f30, f20 + FMADD f13, f13, f30, f21 + FMADD f14, f14, f30, f22 + FMADD f15, f15, f30, f23 + +#else + + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 + + FMUL f4, f4, f30 + FMUL f5, f5, f30 + FMUL f6, f6, f30 + FMUL f7, f7, f30 + + FMUL f8, f8, f30 + FMUL f9, f9, f30 + FMUL f10, f10, f30 + FMUL f11, f11, f30 + + FMUL f12, f12, f30 + FMUL f13, f13, f30 + FMUL f14, f14, f30 + FMUL f15, f15, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + fmr f4, f0 + STFD f5, 1 * SIZE(CO2) + fmr f5, f0 + STFD f6, 2 * SIZE(CO2) + fmr f6, f0 + STFD f7, 3 * SIZE(CO2) + fmr f7, f0 + + STFD f8, 0 * SIZE(CO3) + fmr f8, f0 + STFD f9, 1 * SIZE(CO3) + fmr f9, f0 + STFD f10, 2 * SIZE(CO3) + fmr f10, f0 + STFD f11, 3 * SIZE(CO3) + fmr f11, f0 + + STFD f12, 0 * SIZE(CO4) + fmr f12, f0 + STFD f13, 1 * SIZE(CO4) + fmr f13, f0 + STFD f14, 2 * SIZE(CO4) + fmr f14, f0 + STFD f15, 3 * SIZE(CO4) + fmr f15, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -4 +#endif + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addic. I, I, -1 + bgt+ .L11 + .align 4 + +.L20: + andi. I, M, 2 + ble .L30 + +#if defined(TRMMKERNEL) +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble .L25 + .align 5 + +.L22: + FMADD f0, f16, f20, f0 + nop + FMADD f1, f17, f20, f1 + LFD f20, 8 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 9 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 10 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f16, 4 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + LFD f17, 5 * SIZE(AO) + FMADD f3, f19, f24, f3 + LFD f24, 12 * SIZE(BO) + FMADD f6, f18, f25, f6 + nop + FMADD f7, f19, f25, f7 + LFD f25, 13 * SIZE(BO) + + FMADD f10, f18, f26, f10 + nop + FMADD f11, f19, f26, f11 + LFD f26, 14 * SIZE(BO) + FMADD f14, f18, f27, f14 + LFD f18, 6 * SIZE(AO) + FMADD f15, f19, f27, f15 + LFD f27, 15 * SIZE(BO) + + FMADD f0, f16, f20, f0 + LFD f19, 7 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 16 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 8 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f23, 3 * SIZE(BO) + + FMADD f2, f18, f24, f2 + LFD f17, 1 * SIZE(AO) + FMADD f3, f19, f24, f3 + LFD f24, 4 * SIZE(BO) + FMADD f6, f18, f25, f6 + nop + FMADD f7, f19, f25, f7 + LFD f25, 5 * SIZE(BO) + + FMADD f10, f18, f26, f10 + nop + FMADD f11, f19, f26, f11 + LFD f26, 6 * SIZE(BO) + FMADD f14, f18, f27, f14 + LFD f18, 2 * SIZE(AO) + FMADD f15, f19, f27, f15 + LFD f19, 3 * SIZE(AO) + LFD f27, 7 * SIZE(BO) + bdnz .L22 + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +.L25: + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ .L28 + .align 4 + +.L26: + FMADD f0, f16, f20, f0 + nop + FMADD f1, f17, f20, f1 + LFDU f20, 4 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 2 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f17, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L26 + .align 4 + +.L28: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f4, f4, f30, f18 + FMADD f5, f5, f30, f19 + + LFD f20, 0 * SIZE(CO3) + LFD f21, 1 * SIZE(CO3) + LFD f22, 0 * SIZE(CO4) + LFD f23, 1 * SIZE(CO4) + + FMADD f8, f8, f30, f20 + FMADD f9, f9, f30, f21 + FMADD f12, f12, f30, f22 + FMADD f13, f13, f30, f23 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f4, f4, f30 + FMUL f5, f5, f30 + + FMUL f8, f8, f30 + FMUL f9, f9, f30 + FMUL f12, f12, f30 + FMUL f13, f13, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +.L30: + andi. I, M, 1 + ble .L39 + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble .L35 + .align 5 + +.L32: + FMADD f0, f16, f20, f0 + LFD f20, 8 * SIZE(BO) + FMADD f4, f16, f21, f4 + LFD f21, 9 * SIZE(BO) + FMADD f8, f16, f22, f8 + LFD f22, 10 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f23, 11 * SIZE(BO) + LFDU f16, 4 * SIZE(AO) + + FMADD f1, f17, f24, f1 + LFD f24, 12 * SIZE(BO) + FMADD f5, f17, f25, f5 + LFD f25, 13 * SIZE(BO) + FMADD f9, f17, f26, f9 + LFD f26, 14 * SIZE(BO) + FMADD f13, f17, f27, f13 + LFD f27, 15 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + + FMADD f0, f18, f20, f0 + LFDU f20, 16 * SIZE(BO) + FMADD f4, f18, f21, f4 + LFD f21, 1 * SIZE(BO) + FMADD f8, f18, f22, f8 + LFD f22, 2 * SIZE(BO) + FMADD f12, f18, f23, f12 + LFD f23, 3 * SIZE(BO) + LFD f18, 2 * SIZE(AO) + + FMADD f1, f19, f24, f1 + LFD f24, 4 * SIZE(BO) + FMADD f5, f19, f25, f5 + LFD f25, 5 * SIZE(BO) + FMADD f9, f19, f26, f9 + LFD f26, 6 * SIZE(BO) + FMADD f13, f19, f27, f13 + LFD f27, 7 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L32 + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +.L35: + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ .L38 + .align 4 + +.L36: + FMADD f0, f16, f20, f0 + LFDU f20, 4 * SIZE(BO) + FMADD f4, f16, f21, f4 + LFD f21, 1 * SIZE(BO) + FMADD f8, f16, f22, f8 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L36 + .align 4 + +.L38: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f20, 0 * SIZE(CO3) + LFD f22, 0 * SIZE(CO4) + + FMADD f0, f0, f30, f16 + FMADD f4, f4, f30, f18 + FMADD f8, f8, f30, f20 + FMADD f12, f12, f30, f22 +#else + FMUL f0, f0, f30 + FMUL f4, f4, f30 + FMUL f8, f8, f30 + FMUL f12, f12, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 4 +#endif + + mr B, BO + addic. J, J, -1 + bgt .L10 + .align 4 + +.L40: + mr CO1, C + add CO2, C, LDC + andi. J, N, 2 + ble .L70 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. I, M, 2 + add C, CO2, LDC + mr AO, A + ble .L50 + .align 4 + +.L41: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble .L45 + .align 5 + +.L42: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFD f20, 4 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFD f16, 4 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 5 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 6 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFD f21, 5 * SIZE(BO) + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + LFD f22, 6 * SIZE(BO) + + FMADD f4, f16, f23, f4 + LFD f16, 8 * SIZE(AO) + FMADD f5, f17, f23, f5 + LFD f17, 9 * SIZE(AO) + FMADD f6, f18, f23, f6 + LFD f18, 10 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f20, f0 + LFD f23, 7 * SIZE(BO) + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFDU f20, 8 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFD f16, 12 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 13 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 14 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFD f21, 1 * SIZE(BO) + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + LFD f22, 2 * SIZE(BO) + + FMADD f4, f16, f23, f4 + LFDU f16, 16 * SIZE(AO) + FMADD f5, f17, f23, f5 + LFD f17, 1 * SIZE(AO) + FMADD f6, f18, f23, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 3 * SIZE(AO) + + LFD f23, 3 * SIZE(BO) + bdnz .L42 + .align 4 + +.L45: + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L48 + .align 4 + +.L46: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFDU f20, 2 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFDU f16, 4 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 1 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 3 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L46 + .align 4 + +.L48: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 + + FMADD f4, f4, f30, f20 + FMADD f5, f5, f30, f21 + FMADD f6, f6, f30, f22 + FMADD f7, f7, f30, f23 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 + + FMUL f4, f4, f30 + FMUL f5, f5, f30 + FMUL f6, f6, f30 + FMUL f7, f7, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addic. I, I, -1 + bgt+ .L41 + .align 4 + +.L50: + andi. I, M, 2 + ble .L60 + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble .L55 + .align 5 + +.L52: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + LFDU f20, 8 * SIZE(BO) + FMADD f2, f16, f21, f2 + LFD f16, 4 * SIZE(AO) + FMADD f3, f17, f21, f3 + LFD f17, 5 * SIZE(AO) + + FMADD f4, f18, f22, f4 + LFD f21, 1 * SIZE(BO) + FMADD f5, f19, f22, f5 + LFD f22, 2 * SIZE(BO) + FMADD f6, f18, f23, f6 + LFD f18, 6 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f24, f0 + LFD f23, 3 * SIZE(BO) + FMADD f1, f17, f24, f1 + LFD f24, 4 * SIZE(BO) + FMADD f2, f16, f25, f2 + LFDU f16, 8 * SIZE(AO) + FMADD f3, f17, f25, f3 + LFD f17, 1 * SIZE(AO) + + FMADD f4, f18, f26, f4 + LFD f25, 5 * SIZE(BO) + FMADD f5, f19, f26, f5 + LFD f26, 6 * SIZE(BO) + FMADD f6, f18, f27, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f27, f7 + LFD f19, 3 * SIZE(AO) + + LFD f27, 7 * SIZE(BO) + bdnz .L52 + .align 4 + +.L55: + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L58 + .align 4 + +.L56: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + LFDU f20, 2 * SIZE(BO) + FMADD f2, f16, f21, f2 + LFDU f16, 2 * SIZE(AO) + FMADD f3, f17, f21, f3 + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L56 + .align 4 + +.L58: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) + + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 +#else + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +.L60: + andi. I, M, 1 + ble .L69 + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble .L65 + .align 5 + +.L62: + FMADD f0, f16, f20, f0 + LFDU f20, 8 * SIZE(BO) + FMADD f1, f16, f21, f1 + LFDU f16, 4 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + FMADD f2, f17, f22, f2 + LFD f22, 2 * SIZE(BO) + FMADD f3, f17, f23, f3 + LFD f17, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + + FMADD f0, f18, f24, f0 + LFD f24, 4 * SIZE(BO) + FMADD f1, f18, f25, f1 + LFD f18, 2 * SIZE(AO) + LFD f25, 5 * SIZE(BO) + FMADD f2, f19, f26, f2 + LFD f26, 6 * SIZE(BO) + FMADD f3, f19, f27, f3 + LFD f19, 3 * SIZE(AO) + LFD f27, 7 * SIZE(BO) + bdnz .L62 + .align 4 + +.L65: + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ .L68 + .align 4 + +.L66: + FMADD f0, f16, f20, f0 + LFDU f20, 2 * SIZE(BO) + FMADD f1, f16, f21, f1 + LFDU f16, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L66 + .align 4 + +.L68: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f18 +#else + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +.L69: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 2 +#endif + + mr B, BO + .align 4 + +.L70: + mr CO1, C + andi. J, N, 1 + ble .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. I, M, 2 + mr AO, A + ble .L80 + .align 4 + +.L71: +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + ble .L75 + +#endif + ble .L75 + .align 5 + +.L72: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFD f17, 5 * SIZE(AO) + FMADD f2, f18, f20, f2 + LFD f18, 6 * SIZE(AO) + FMADD f3, f19, f20, f3 + LFD f19, 7 * SIZE(AO) + LFDU f20, 4 * SIZE(BO) + + FMADD f0, f16, f21, f0 + LFD f16, 8 * SIZE(AO) + FMADD f1, f17, f21, f1 + LFD f17, 9 * SIZE(AO) + FMADD f2, f18, f21, f2 + LFD f18, 10 * SIZE(AO) + FMADD f3, f19, f21, f3 + LFD f19, 11 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + + FMADD f0, f16, f22, f0 + LFD f16, 12 * SIZE(AO) + FMADD f1, f17, f22, f1 + LFD f17, 13 * SIZE(AO) + FMADD f2, f18, f22, f2 + LFD f18, 14 * SIZE(AO) + FMADD f3, f19, f22, f3 + LFD f19, 15 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + + FMADD f0, f16, f23, f0 + LFDU f16, 16 * SIZE(AO) + FMADD f1, f17, f23, f1 + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f23, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f23, f3 + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L72 + .align 4 + +.L75: + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ .L78 + .align 4 + +.L76: + FMADD f0, f16, f20, f0 + LFDU f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f20, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f20, f3 + LFDU f20, 1 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L76 + .align 4 + +.L78: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0 , TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addi CO1, CO1, 4 * SIZE + addic. I, I, -1 + bgt+ .L71 + .align 4 + +.L80: + andi. I, M, 2 + ble .L90 + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + +#endif + ble .L85 + .align 5 + +.L82: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 4 * SIZE(BO) + LFD f17, 5 * SIZE(AO) + FMADD f2, f18, f21, f2 + LFD f18, 6 * SIZE(AO) + FMADD f3, f19, f21, f3 + LFD f21, 1 * SIZE(BO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFDU f16, 8 * SIZE(AO) + FMADD f1, f17, f22, f1 + LFD f22, 2 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f23, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f23, f3 + LFD f23, 3 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L82 + .align 4 + +.L85: + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ .L88 + .align 4 + +.L86: + FMADD f0, f16, f20, f0 + LFDU f16, 2 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 1 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + bdnz .L86 + .align 4 + +.L88: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 +#else + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + addi CO1, CO1, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0 , TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +.L90: + andi. I, M, 1 + ble .L999 + + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 3 + mtspr CTR, r0 + mr BO, B +#endif + ble .L95 + .align 5 + +.L92: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + FMADD f1, f17, f21, f1 + LFD f17, 5 * SIZE(AO) + LFD f21, 5 * SIZE(BO) + FMADD f2, f18, f22, f2 + LFD f18, 6 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + FMADD f3, f19, f23, f3 + LFD f19, 7 * SIZE(AO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + LFDU f16, 8 * SIZE(AO) + LFDU f20, 8 * SIZE(BO) + FMADD f1, f17, f21, f1 + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + FMADD f2, f18, f22, f2 + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + FMADD f3, f19, f23, f3 + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L92 + .align 4 + +.L95: + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP + +#else + + andi. r0, K, 7 + mtspr CTR, r0 + +#endif + ble+ .L98 + .align 4 + +.L96: + FMADD f0, f16, f20, f0 + LFDU f16, 1 * SIZE(AO) + LFDU f20, 1 * SIZE(BO) + bdnz .L96 + .align 4 + +.L98: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + + FMADD f0, f0, f30, f16 +#else + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + + FMUL f0, f0, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + .align 4 + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) +#if defined(TRMMKERNEL) || defined(TRSMKERNEL) + ld r19, 240(SP) + ld r18, 248(SP) +#endif +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) +#if defined(TRMMKERNEL) || defined(TRSMKERNEL) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE diff --git a/kernel/power/gemm_ncopy_4.S b/kernel/power/gemm_ncopy_4.S new file mode 100644 index 0000000..93c687b --- /dev/null +++ b/kernel/power/gemm_ncopy_4.S @@ -0,0 +1,366 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define AO1 r8 +#define AO2 r9 +#define AO3 r10 +#define AO4 r11 + +#define J r12 + +#define PREA r14 +#define PREB1 r15 + +#define c01 f0 +#define c02 f1 +#define c03 f2 +#define c04 f3 +#define c05 f4 +#define c06 f5 +#define c07 f6 +#define c08 f7 +#define c09 f8 +#define c10 f9 +#define c11 f10 +#define c12 f11 +#define c13 f12 +#define c14 f13 +#define c15 f14 +#define c16 f15 + +#define STACKSIZE 32 + +#ifdef CELL +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 72 +#endif + +#ifdef PPC970 +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 72 +#endif + +#ifdef PPC440 +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 72 +#endif + +#ifdef POWER4 +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 72 +#endif + +#ifdef POWER5 +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 72 +#endif + +#ifdef POWER6 +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 72 +#endif + +#ifdef PPCG4 +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 72 +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + +#ifdef __64BIT__ + std r14, 16(SP) + std r15, 24(SP) +#else + stw r14, 16(SP) + stw r15, 20(SP) +#endif + + slwi LDA, LDA, BASE_SHIFT + + li PREA, PREFETCHSIZE * SIZE + li PREB1, (PREFETCHWSIZE + 0) * SIZE + + cmpwi cr0, M, 0 + ble- LL(999) + cmpwi cr0, N, 0 + ble- LL(999) + + srawi. J, N, 2 + ble LL(20) + .align 4 + +LL(10): + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(15) + .align 4 + +LL(12): + LFD c01, 0 * SIZE(AO1) + LFD c02, 1 * SIZE(AO1) + LFD c03, 2 * SIZE(AO1) + LFD c04, 3 * SIZE(AO1) + + LFD c05, 0 * SIZE(AO2) + LFD c06, 1 * SIZE(AO2) + LFD c07, 2 * SIZE(AO2) + LFD c08, 3 * SIZE(AO2) + + LFD c09, 0 * SIZE(AO3) + LFD c10, 1 * SIZE(AO3) + LFD c11, 2 * SIZE(AO3) + LFD c12, 3 * SIZE(AO3) + + LFD c13, 0 * SIZE(AO4) + LFD c14, 1 * SIZE(AO4) + LFD c15, 2 * SIZE(AO4) + LFD c16, 3 * SIZE(AO4) + + STFD c01, 0 * SIZE(B) + STFD c05, 1 * SIZE(B) + STFD c09, 2 * SIZE(B) + STFD c13, 3 * SIZE(B) + + STFD c02, 4 * SIZE(B) + STFD c06, 5 * SIZE(B) + STFD c10, 6 * SIZE(B) + STFD c14, 7 * SIZE(B) + + STFD c03, 8 * SIZE(B) + STFD c07, 9 * SIZE(B) + STFD c11, 10 * SIZE(B) + STFD c15, 11 * SIZE(B) + + STFD c04, 12 * SIZE(B) + STFD c08, 13 * SIZE(B) + STFD c12, 14 * SIZE(B) + STFD c16, 15 * SIZE(B) + +#ifdef POWER6 + dcbtst PREA, AO1 + dcbtst PREA, AO2 + dcbtst PREA, AO3 + dcbtst PREA, AO4 +#else + dcbt PREA, AO1 + dcbt PREA, AO2 + dcbt PREA, AO3 + dcbt PREA, AO4 +#endif + + dcbtst PREB1, B + + addi AO1, AO1, 4 * SIZE + addi AO2, AO2, 4 * SIZE + addi AO3, AO3, 4 * SIZE + addi AO4, AO4, 4 * SIZE + addi B, B, 16 * SIZE + bdnz LL(12) + .align 4 + +LL(15): + andi. r0, M, 3 + mtspr CTR, r0 + ble LL(17) + .align 4 + +LL(16): + LFD c01, 0 * SIZE(AO1) + LFD c05, 0 * SIZE(AO2) + LFD c09, 0 * SIZE(AO3) + LFD c13, 0 * SIZE(AO4) + + STFD c01, 0 * SIZE(B) + STFD c05, 1 * SIZE(B) + STFD c09, 2 * SIZE(B) + STFD c13, 3 * SIZE(B) + + addi AO1, AO1, 1 * SIZE + addi AO2, AO2, 1 * SIZE + addi AO3, AO3, 1 * SIZE + addi AO4, AO4, 1 * SIZE + addi B, B, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(17): + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(20): + andi. J, N, 2 + ble LL(30) + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(25) + .align 4 + +LL(22): + LFD c01, 0 * SIZE(AO1) + LFD c02, 1 * SIZE(AO1) + LFD c03, 2 * SIZE(AO1) + LFD c04, 3 * SIZE(AO1) + + LFD c05, 0 * SIZE(AO2) + LFD c06, 1 * SIZE(AO2) + LFD c07, 2 * SIZE(AO2) + LFD c08, 3 * SIZE(AO2) + + STFD c01, 0 * SIZE(B) + STFD c05, 1 * SIZE(B) + STFD c02, 2 * SIZE(B) + STFD c06, 3 * SIZE(B) + + STFD c03, 4 * SIZE(B) + STFD c07, 5 * SIZE(B) + STFD c04, 6 * SIZE(B) + STFD c08, 7 * SIZE(B) + + addi AO1, AO1, 4 * SIZE + addi AO2, AO2, 4 * SIZE + addi B, B, 8 * SIZE + bdnz LL(22) + .align 4 + +LL(25): + andi. r0, M, 3 + mtspr CTR, r0 + ble LL(30) + .align 4 + +LL(26): + LFD c01, 0 * SIZE(AO1) + LFD c05, 0 * SIZE(AO2) + + STFD c01, 0 * SIZE(B) + STFD c05, 1 * SIZE(B) + + addi AO1, AO1, 1 * SIZE + addi AO2, AO2, 1 * SIZE + addi B, B, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(30): + andi. J, N, 1 + ble LL(999) + + mr AO1, A + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(35) + .align 4 + +LL(32): + LFD c01, 0 * SIZE(AO1) + LFD c02, 1 * SIZE(AO1) + LFD c03, 2 * SIZE(AO1) + LFD c04, 3 * SIZE(AO1) + + STFD c01, 0 * SIZE(B) + STFD c02, 1 * SIZE(B) + STFD c03, 2 * SIZE(B) + STFD c04, 3 * SIZE(B) + + addi AO1, AO1, 4 * SIZE + addi B, B, 4 * SIZE + bdnz LL(32) + .align 4 + +LL(35): + andi. r0, M, 3 + mtspr CTR, r0 + ble LL(999) + .align 4 + +LL(36): + LFD c01, 0 * SIZE(AO1) + + STFD c01, 0 * SIZE(B) + + addi AO1, AO1, 1 * SIZE + addi B, B, 1 * SIZE + bdnz LL(36) + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + +#ifdef __64BIT__ + ld r14, 16(SP) + ld r15, 24(SP) +#else + lwz r14, 16(SP) + lwz r15, 20(SP) +#endif + addi SP, SP, STACKSIZE + + blr + EPILOGUE diff --git a/kernel/power/gemm_ncopy_hummer_4.S b/kernel/power/gemm_ncopy_hummer_4.S new file mode 100644 index 0000000..f05fdaa --- /dev/null +++ b/kernel/power/gemm_ncopy_hummer_4.S @@ -0,0 +1,798 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define AO1 r8 +#define AO2 r9 +#define AO3 r10 +#define AO4 r11 + +#define J r12 + +#define INC r30 +#define INC2 r31 + +#define c01 f0 +#define c02 f1 +#define c03 f2 +#define c04 f3 +#define c05 f4 +#define c06 f5 +#define c07 f6 +#define c08 f7 +#define c09 f8 +#define c10 f9 +#define c11 f10 +#define c12 f11 +#define c13 f12 +#define c14 f13 +#define c15 f14 +#define c16 f15 + +#define sel_p f16 +#define sel_s f17 + +#define c17 f18 +#define c18 f19 + + + PROLOGUE + PROFCODE + + li r0, -16 + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + + stfpdux f16, SP, r0 + stfpdux f17, SP, r0 + stfpdux f18, SP, r0 + stfpdux f19, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + + lis r9, 0x3f80 + lis r10, 0xbf80 + + stwu r9, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r9, -4(SP) + + slwi LDA, LDA, BASE_SHIFT + + cmpwi cr0, M, 0 + ble- .L99 + cmpwi cr0, N, 0 + ble- .L99 + + andi. r0, A, 2 * SIZE - 1 + bne .L100 + andi. r0, LDA, 2 * SIZE - 1 + bne .L100 + + li r0, 8 + addi SP, SP, -8 + + lfpsux sel_p, SP, r0 + lfpsux sel_s, SP, r0 + + li INC, 1 * SIZE + li INC2, 2 * SIZE + + subi A, A, 2 * SIZE + subi B, B, 2 * SIZE + + srawi. J, N, 2 + ble .L20 + .align 4 +.L11: + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L15 + .align 4 + +.L12: + LFPDUX c01, AO1, INC2 + LFXDUX c05, AO2, INC2 + LFPDUX c09, AO3, INC2 + LFXDUX c13, AO4, INC2 + + LFPDUX c02, AO1, INC2 + LFXDUX c06, AO2, INC2 + LFPDUX c10, AO3, INC2 + LFXDUX c14, AO4, INC2 + + LFPDUX c03, AO1, INC2 + LFXDUX c07, AO2, INC2 + LFPDUX c11, AO3, INC2 + LFXDUX c15, AO4, INC2 + + LFPDUX c04, AO1, INC2 + LFXDUX c08, AO2, INC2 + LFPDUX c12, AO3, INC2 + LFXDUX c16, AO4, INC2 + + fpsel c17, sel_p, c01, c05 + fpsel c18, sel_p, c09, c13 + fpsel c01, sel_s, c01, c05 + fpsel c05, sel_s, c09, c13 + + fpsel c09, sel_p, c02, c06 + fpsel c13, sel_p, c10, c14 + STFPDUX c17, B, INC2 + fpsel c02, sel_s, c02, c06 + STFPDUX c18, B, INC2 + fpsel c06, sel_s, c10, c14 + STFXDUX c01, B, INC2 + + fpsel c10, sel_p, c03, c07 + STFXDUX c05, B, INC2 + fpsel c14, sel_p, c11, c15 + STFPDUX c09, B, INC2 + fpsel c03, sel_s, c03, c07 + STFPDUX c13, B, INC2 + fpsel c07, sel_s, c11, c15 + STFXDUX c02, B, INC2 + + fpsel c11, sel_p, c04, c08 + STFXDUX c06, B, INC2 + fpsel c15, sel_p, c12, c16 + STFPDUX c10, B, INC2 + fpsel c04, sel_s, c04, c08 + STFPDUX c14, B, INC2 + fpsel c08, sel_s, c12, c16 + STFXDUX c03, B, INC2 + + STFXDUX c07, B, INC2 + STFPDUX c11, B, INC2 + STFPDUX c15, B, INC2 + STFXDUX c04, B, INC2 + STFXDUX c08, B, INC2 + bdnz .L12 + .align 4 + +.L15: + andi. r0, M, 7 + ble .L19 + + andi. r0, M, 4 + beq .L16 + + LFPDUX c01, AO1, INC2 + LFXDUX c05, AO2, INC2 + LFPDUX c09, AO3, INC2 + LFXDUX c13, AO4, INC2 + + LFPDUX c02, AO1, INC2 + LFXDUX c06, AO2, INC2 + LFPDUX c10, AO3, INC2 + LFXDUX c14, AO4, INC2 + + fpsel c17, sel_p, c01, c05 + fpsel c18, sel_p, c09, c13 + fpsel c01, sel_s, c01, c05 + fpsel c05, sel_s, c09, c13 + + fpsel c09, sel_p, c02, c06 + fpsel c13, sel_p, c10, c14 + STFPDUX c17, B, INC2 + fpsel c02, sel_s, c02, c06 + STFPDUX c18, B, INC2 + fpsel c06, sel_s, c10, c14 + STFXDUX c01, B, INC2 + STFXDUX c05, B, INC2 + STFPDUX c09, B, INC2 + STFPDUX c13, B, INC2 + STFXDUX c02, B, INC2 + STFXDUX c06, B, INC2 + .align 4 + +.L16: + andi. r0, M, 2 + beq .L17 + + LFPDUX c01, AO1, INC2 + LFXDUX c05, AO2, INC2 + LFPDUX c09, AO3, INC2 + LFXDUX c13, AO4, INC2 + + fpsel c17, sel_p, c01, c05 + fpsel c18, sel_p, c09, c13 + fpsel c01, sel_s, c01, c05 + fpsel c05, sel_s, c09, c13 + + STFPDUX c17, B, INC2 + STFPDUX c18, B, INC2 + STFXDUX c01, B, INC2 + STFXDUX c05, B, INC2 + .align 4 + +.L17: + andi. r0, M, 1 + beq .L19 + + LFDUX c01, AO1, INC2 + LFDUX c02, AO2, INC2 + LFDUX c03, AO3, INC2 + LFDUX c04, AO4, INC2 + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + .align 4 + +.L19: + addic. J, J, -1 + bgt .L11 + .align 4 + +.L20: + andi. J, N, 2 + ble .L30 + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L25 + .align 4 + +.L22: + LFPDUX c01, AO1, INC2 + LFXDUX c05, AO2, INC2 + LFPDUX c02, AO1, INC2 + LFXDUX c06, AO2, INC2 + + LFPDUX c03, AO1, INC2 + LFXDUX c07, AO2, INC2 + LFPDUX c04, AO1, INC2 + LFXDUX c08, AO2, INC2 + + fpsel c17, sel_p, c01, c05 + fpsel c01, sel_s, c01, c05 + fpsel c09, sel_p, c02, c06 + fpsel c02, sel_s, c02, c06 + + fpsel c10, sel_p, c03, c07 + fpsel c03, sel_s, c03, c07 + STFPDUX c17, B, INC2 + fpsel c11, sel_p, c04, c08 + STFXDUX c01, B, INC2 + fpsel c04, sel_s, c04, c08 + STFPDUX c09, B, INC2 + + STFXDUX c02, B, INC2 + STFPDUX c10, B, INC2 + STFXDUX c03, B, INC2 + STFPDUX c11, B, INC2 + STFXDUX c04, B, INC2 + bdnz .L22 + .align 4 + +.L25: + andi. r0, M, 7 + ble .L30 + + andi. r0, M, 4 + beq .L26 + + LFPDUX c01, AO1, INC2 + LFXDUX c05, AO2, INC2 + LFPDUX c02, AO1, INC2 + LFXDUX c06, AO2, INC2 + + fpsel c17, sel_p, c01, c05 + fpsel c01, sel_s, c01, c05 + fpsel c09, sel_p, c02, c06 + fpsel c02, sel_s, c02, c06 + + STFPDUX c17, B, INC2 + STFXDUX c01, B, INC2 + STFPDUX c09, B, INC2 + STFXDUX c02, B, INC2 + .align 4 + +.L26: + andi. r0, M, 2 + beq .L27 + + LFPDUX c01, AO1, INC2 + LFXDUX c05, AO2, INC2 + + fpsel c17, sel_p, c01, c05 + fpsel c01, sel_s, c01, c05 + + STFPDUX c17, B, INC2 + STFXDUX c01, B, INC2 + .align 4 + +.L27: + andi. r0, M, 1 + beq .L30 + + LFDUX c01, AO1, INC2 + LFDUX c02, AO2, INC2 + + fsmfp c01, c02 + STFPDUX c01, B, INC2 + .align 4 + +.L30: + andi. J, N, 1 + ble .L99 + + mr AO1, A + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L35 + .align 4 + +.L32: + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO1, INC2 + LFPDUX c04, AO1, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + bdnz .L32 + .align 4 + +.L35: + andi. r0, M, 7 + ble .L99 + + andi. r0, M, 4 + beq .L36 + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + .align 4 + +.L36: + andi. r0, M, 2 + beq .L37 + + LFPDUX c01, AO1, INC2 + + STFPDUX c01, B, INC2 + .align 4 + +.L37: + andi. r0, M, 1 + beq .L99 + + LFDX c01, AO1, INC2 + STFDX c01, B, INC2 + .align 4 + +.L99: + addi SP, SP, 4 + + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + .align 4 + +.L100: + li INC, 1 * SIZE + li INC2, 2 * SIZE + + subi A, A, 1 * SIZE + subi B, B, 2 * SIZE + + srawi. J, N, 2 + ble .L120 + .align 4 +.L111: + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L115 + .align 4 + +.L112: + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFDUX c09, AO1, INC + LFDUX c10, AO1, INC + LFDUX c11, AO1, INC + LFDUX c12, AO1, INC + + LFSDUX c01, AO2, INC + LFSDUX c02, AO2, INC + LFSDUX c03, AO2, INC + LFSDUX c04, AO2, INC + + LFSDUX c09, AO2, INC + LFSDUX c10, AO2, INC + LFSDUX c11, AO2, INC + LFSDUX c12, AO2, INC + + LFDUX c05, AO3, INC + LFDUX c06, AO3, INC + LFDUX c07, AO3, INC + LFDUX c08, AO3, INC + + LFDUX c13, AO3, INC + LFDUX c14, AO3, INC + LFDUX c15, AO3, INC + LFDUX c16, AO3, INC + + LFSDUX c05, AO4, INC + LFSDUX c06, AO4, INC + LFSDUX c07, AO4, INC + LFSDUX c08, AO4, INC + + LFSDUX c13, AO4, INC + LFSDUX c14, AO4, INC + LFSDUX c15, AO4, INC + LFSDUX c16, AO4, INC + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c06, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c07, B, INC2 + STFPDUX c04, B, INC2 + STFPDUX c08, B, INC2 + + STFPDUX c09, B, INC2 + STFPDUX c13, B, INC2 + STFPDUX c10, B, INC2 + STFPDUX c14, B, INC2 + STFPDUX c11, B, INC2 + STFPDUX c15, B, INC2 + STFPDUX c12, B, INC2 + STFPDUX c16, B, INC2 + bdnz .L112 + .align 4 + +.L115: + andi. r0, M, 7 + ble .L119 + + andi. r0, M, 4 + beq .L116 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFSDUX c01, AO2, INC + LFSDUX c02, AO2, INC + LFSDUX c03, AO2, INC + LFSDUX c04, AO2, INC + + LFDUX c05, AO3, INC + LFDUX c06, AO3, INC + LFDUX c07, AO3, INC + LFDUX c08, AO3, INC + + LFSDUX c05, AO4, INC + LFSDUX c06, AO4, INC + LFSDUX c07, AO4, INC + LFSDUX c08, AO4, INC + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c06, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c07, B, INC2 + STFPDUX c04, B, INC2 + STFPDUX c08, B, INC2 + .align 4 + +.L116: + andi. r0, M, 2 + beq .L117 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + + LFSDUX c01, AO2, INC + LFSDUX c02, AO2, INC + + LFDUX c05, AO3, INC + LFDUX c06, AO3, INC + + LFSDUX c05, AO4, INC + LFSDUX c06, AO4, INC + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c06, B, INC2 + .align 4 + +.L117: + andi. r0, M, 1 + beq .L119 + + LFDUX c01, AO1, INC + LFDUX c05, AO3, INC + + nop + nop + + LFSDUX c01, AO2, INC + LFSDUX c05, AO4, INC + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + .align 4 + +.L119: + addic. J, J, -1 + bgt .L111 + .align 4 + +.L120: + andi. J, N, 2 + ble .L130 + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L125 + .align 4 + +.L122: + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFDUX c09, AO1, INC + LFDUX c10, AO1, INC + LFDUX c11, AO1, INC + LFDUX c12, AO1, INC + + LFSDUX c01, AO2, INC + LFSDUX c02, AO2, INC + LFSDUX c03, AO2, INC + LFSDUX c04, AO2, INC + + LFSDUX c09, AO2, INC + LFSDUX c10, AO2, INC + LFSDUX c11, AO2, INC + LFSDUX c12, AO2, INC + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + + STFPDUX c09, B, INC2 + STFPDUX c10, B, INC2 + STFPDUX c11, B, INC2 + STFPDUX c12, B, INC2 + bdnz .L122 + .align 4 + +.L125: + andi. r0, M, 7 + ble .L130 + + andi. r0, M, 4 + beq .L126 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFSDUX c01, AO2, INC + LFSDUX c02, AO2, INC + LFSDUX c03, AO2, INC + LFSDUX c04, AO2, INC + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + .align 4 + +.L126: + andi. r0, M, 2 + beq .L127 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + + LFSDUX c01, AO2, INC + LFSDUX c02, AO2, INC + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + .align 4 + +.L127: + andi. r0, M, 1 + beq .L130 + + LFDUX c01, AO1, INC + LFDUX c02, AO2, INC + + fsmfp c01, c02 + STFPDUX c01, B, INC2 + .align 4 + +.L130: + andi. J, N, 1 + ble .L999 + + mr AO1, A + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L135 + .align 4 + +.L132: + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFDUX c05, AO1, INC + LFDUX c06, AO1, INC + LFDUX c07, AO1, INC + LFDUX c08, AO1, INC + + fsmfp c01, c02 + fsmfp c03, c04 + fsmfp c05, c06 + fsmfp c07, c08 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c07, B, INC2 + bdnz .L132 + .align 4 + +.L135: + andi. r0, M, 7 + ble .L999 + + andi. r0, M, 4 + beq .L136 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + .align 4 + +.L136: + andi. r0, M, 2 + beq .L137 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + + fsmfp c01, c02 + STFPDUX c01, B, INC2 + .align 4 + +.L137: + andi. r0, M, 1 + beq .L999 + + LFDX c01, AO1, INC + STFDX c01, B, INC2 + .align 4 + +.L999: + addi SP, SP, 12 + + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + EPILOGUE diff --git a/kernel/power/gemm_ncopy_hummer_8.S b/kernel/power/gemm_ncopy_hummer_8.S new file mode 100644 index 0000000..fec7c13 --- /dev/null +++ b/kernel/power/gemm_ncopy_hummer_8.S @@ -0,0 +1,1217 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define AO1 r8 +#define AO2 r9 +#define AO3 r10 +#define AO4 r11 + +#define J r12 + +#define AO5 r26 +#define AO6 r27 +#define AO7 r28 +#define AO8 r29 +#define INC r30 +#define INC2 r31 + +#define c01 f0 +#define c02 f1 +#define c03 f2 +#define c04 f3 +#define c05 f4 +#define c06 f5 +#define c07 f6 +#define c08 f7 +#define c09 f8 +#define c10 f9 +#define c11 f10 +#define c12 f11 +#define c13 f12 +#define c14 f13 +#define c15 f14 +#define c16 f15 + +#define c17 f16 +#define c18 f17 +#define c19 f18 +#define c20 f19 +#define c21 f20 +#define c22 f21 +#define c23 f22 +#define c24 f23 +#define c25 f24 +#define c26 f25 +#define c27 f26 +#define c28 f27 +#define c29 f28 +#define c30 f29 +#define c31 f30 +#define c32 f31 + +#define sel_p f30 +#define sel_s f31 + + + PROLOGUE + PROFCODE + + li r0, -16 + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + stfpdux f16, SP, r0 + stfpdux f17, SP, r0 + stfpdux f18, SP, r0 + stfpdux f19, SP, r0 + stfpdux f20, SP, r0 + stfpdux f21, SP, r0 + stfpdux f22, SP, r0 + stfpdux f23, SP, r0 + stfpdux f24, SP, r0 + stfpdux f25, SP, r0 + stfpdux f26, SP, r0 + stfpdux f27, SP, r0 + stfpdux f28, SP, r0 + stfpdux f29, SP, r0 + stfpdux f30, SP, r0 + stfpdux f31, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + stwu r29, -4(SP) + stwu r28, -4(SP) + + stwu r27, -4(SP) + stwu r26, -4(SP) + + lis r9, 0x3f80 + lis r10, 0xbf80 + + stwu r9, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r9, -4(SP) + + slwi LDA, LDA, BASE_SHIFT + + li r0, 0 + lfpsux sel_p, SP, r0 + li r0, 8 + lfpsux sel_s, SP, r0 + + cmpwi cr0, M, 0 + ble- .L999 + cmpwi cr0, N, 0 + ble- .L999 + + li INC, 1 * SIZE + li INC2, 2 * SIZE + + subi B, B, 2 * SIZE + + andi. r0, A, 2 * SIZE - 1 + bne .L100 + andi. r0, LDA, 2 * SIZE - 1 + bne .L100 + + subi A, A, 2 * SIZE + srawi. J, N, 3 + ble .L20 + .align 4 +.L11: + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add AO5, AO4, LDA + add AO6, AO5, LDA + add AO7, AO6, LDA + add AO8, AO7, LDA + add A, AO8, LDA + + srawi. r0, M, 2 + mtspr CTR, r0 + ble .L15 + .align 4 + +.L12: + LFPDUX c01, AO1, INC2 + LFXDUX c02, AO2, INC2 + LFPDUX c03, AO3, INC2 + LFXDUX c04, AO4, INC2 + + LFPDUX c05, AO5, INC2 + LFXDUX c06, AO6, INC2 + LFPDUX c07, AO7, INC2 + LFXDUX c08, AO8, INC2 + + LFPDUX c09, AO1, INC2 + LFXDUX c10, AO2, INC2 + LFPDUX c11, AO3, INC2 + LFXDUX c12, AO4, INC2 + fpsel c17, sel_p, c01, c02 + + LFPDUX c13, AO5, INC2 + fpsel c18, sel_p, c03, c04 + LFXDUX c14, AO6, INC2 + fpsel c19, sel_p, c05, c06 + LFPDUX c15, AO7, INC2 + fpsel c20, sel_p, c07, c08 + LFXDUX c16, AO8, INC2 + fpsel c21, sel_s, c01, c02 + + fpsel c22, sel_s, c03, c04 + STFPDUX c17, B, INC2 + fpsel c23, sel_s, c05, c06 + STFPDUX c18, B, INC2 + fpsel c24, sel_s, c07, c08 + STFPDUX c19, B, INC2 + + fpsel c01, sel_p, c09, c10 + STFPDUX c20, B, INC2 + fpsel c02, sel_p, c11, c12 + STFXDUX c21, B, INC2 + fpsel c03, sel_p, c13, c14 + STFXDUX c22, B, INC2 + fpsel c04, sel_p, c15, c16 + STFXDUX c23, B, INC2 + + fpsel c05, sel_s, c09, c10 + STFXDUX c24, B, INC2 + fpsel c06, sel_s, c11, c12 + STFPDUX c01, B, INC2 + fpsel c07, sel_s, c13, c14 + STFPDUX c02, B, INC2 + fpsel c08, sel_s, c15, c16 + STFPDUX c03, B, INC2 + + STFPDUX c04, B, INC2 + STFXDUX c05, B, INC2 + STFXDUX c06, B, INC2 + STFXDUX c07, B, INC2 + STFXDUX c08, B, INC2 + bdnz .L12 + .align 4 + +.L15: + andi. r0, M, 3 + ble .L19 + + andi. r0, M, 2 + beq .L17 + + LFPDUX c01, AO1, INC2 + LFXDUX c02, AO2, INC2 + LFPDUX c03, AO3, INC2 + LFXDUX c04, AO4, INC2 + + LFPDUX c05, AO5, INC2 + fpsel c09, sel_p, c01, c02 + LFXDUX c06, AO6, INC2 + fpsel c10, sel_p, c03, c04 + LFPDUX c07, AO7, INC2 + fpsel c11, sel_p, c05, c06 + LFXDUX c08, AO8, INC2 + fpsel c12, sel_p, c07, c08 + + fpsel c13, sel_s, c01, c02 + fpsel c14, sel_s, c03, c04 + STFPDUX c09, B, INC2 + fpsel c15, sel_s, c05, c06 + STFPDUX c10, B, INC2 + fpsel c16, sel_s, c07, c08 + STFPDUX c11, B, INC2 + + STFPDUX c12, B, INC2 + STFXDUX c13, B, INC2 + STFXDUX c14, B, INC2 + STFXDUX c15, B, INC2 + STFXDUX c16, B, INC2 + .align 4 + +.L17: + andi. r0, M, 1 + beq .L19 + + LFDUX c01, AO1, INC2 + LFDUX c02, AO3, INC2 + LFDUX c03, AO5, INC2 + LFDUX c04, AO7, INC2 + + LFSDUX c01, AO2, INC2 + LFSDUX c02, AO4, INC2 + LFSDUX c03, AO6, INC2 + LFSDUX c04, AO8, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + .align 4 + +.L19: + addic. J, J, -1 + bgt .L11 + .align 4 + +.L20: + andi. J, N, 4 + ble .L30 + .align 4 +.L21: + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L25 + .align 4 + +.L22: + LFPDUX c01, AO1, INC2 + LFXDUX c02, AO2, INC2 + LFPDUX c03, AO3, INC2 + LFXDUX c04, AO4, INC2 + + LFPDUX c05, AO1, INC2 + LFXDUX c06, AO2, INC2 + LFPDUX c07, AO3, INC2 + LFXDUX c08, AO4, INC2 + + LFPDUX c09, AO1, INC2 + LFXDUX c10, AO2, INC2 + LFPDUX c11, AO3, INC2 + LFXDUX c12, AO4, INC2 + fpsel c17, sel_p, c01, c02 + + LFPDUX c13, AO1, INC2 + fpsel c18, sel_p, c03, c04 + LFXDUX c14, AO2, INC2 + fpsel c19, sel_s, c01, c02 + LFPDUX c15, AO3, INC2 + fpsel c20, sel_s, c03, c04 + LFXDUX c16, AO4, INC2 + fpsel c21, sel_p, c05, c06 + + fpsel c22, sel_p, c07, c08 + STFPDUX c17, B, INC2 + fpsel c23, sel_s, c05, c06 + STFPDUX c18, B, INC2 + fpsel c24, sel_s, c07, c08 + STFXDUX c19, B, INC2 + + fpsel c01, sel_p, c09, c10 + STFXDUX c20, B, INC2 + fpsel c02, sel_p, c11, c12 + STFPDUX c21, B, INC2 + fpsel c03, sel_s, c09, c10 + STFPDUX c22, B, INC2 + fpsel c04, sel_s, c11, c12 + STFXDUX c23, B, INC2 + + fpsel c05, sel_p, c13, c14 + STFXDUX c24, B, INC2 + fpsel c06, sel_p, c15, c16 + STFPDUX c01, B, INC2 + fpsel c07, sel_s, c13, c14 + STFPDUX c02, B, INC2 + fpsel c08, sel_s, c15, c16 + STFXDUX c03, B, INC2 + + STFXDUX c04, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c06, B, INC2 + STFXDUX c07, B, INC2 + STFXDUX c08, B, INC2 + bdnz .L22 + .align 4 + +.L25: + andi. r0, M, 7 + ble .L30 + + andi. r0, M, 4 + beq .L26 + + LFPDUX c01, AO1, INC2 + LFXDUX c02, AO2, INC2 + LFPDUX c03, AO3, INC2 + LFXDUX c04, AO4, INC2 + + LFPDUX c05, AO1, INC2 + fpsel c09, sel_p, c01, c02 + LFXDUX c06, AO2, INC2 + fpsel c10, sel_p, c03, c04 + LFPDUX c07, AO3, INC2 + fpsel c11, sel_s, c01, c02 + LFXDUX c08, AO4, INC2 + fpsel c12, sel_s, c03, c04 + + fpsel c13, sel_p, c05, c06 + fpsel c14, sel_p, c07, c08 + STFPDUX c09, B, INC2 + fpsel c15, sel_s, c05, c06 + STFPDUX c10, B, INC2 + fpsel c16, sel_s, c07, c08 + STFXDUX c11, B, INC2 + + STFXDUX c12, B, INC2 + STFPDUX c13, B, INC2 + STFPDUX c14, B, INC2 + STFXDUX c15, B, INC2 + STFXDUX c16, B, INC2 + .align 4 + +.L26: + andi. r0, M, 2 + beq .L27 + + LFPDUX c01, AO1, INC2 + LFXDUX c02, AO2, INC2 + LFPDUX c03, AO3, INC2 + LFXDUX c04, AO4, INC2 + + fpsel c05, sel_p, c01, c02 + fpsel c06, sel_p, c03, c04 + fpsel c07, sel_s, c01, c02 + fpsel c08, sel_s, c03, c04 + + STFPDUX c05, B, INC2 + STFPDUX c06, B, INC2 + STFXDUX c07, B, INC2 + STFXDUX c08, B, INC2 + .align 4 + +.L27: + andi. r0, M, 1 + beq .L30 + + LFDUX c01, AO1, INC2 + LFDUX c02, AO2, INC2 + LFDUX c03, AO3, INC2 + LFDUX c04, AO4, INC2 + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + .align 4 + + +.L30: + andi. J, N, 2 + ble .L40 + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L35 + .align 4 + +.L32: + LFPDUX c01, AO1, INC2 + LFXDUX c05, AO2, INC2 + LFPDUX c02, AO1, INC2 + LFXDUX c06, AO2, INC2 + + LFPDUX c03, AO1, INC2 + fpsel c09, sel_p, c01, c05 + LFXDUX c07, AO2, INC2 + fpsel c10, sel_s, c01, c05 + LFPDUX c04, AO1, INC2 + fpsel c11, sel_p, c02, c06 + LFXDUX c08, AO2, INC2 + fpsel c12, sel_s, c02, c06 + + fpsel c13, sel_p, c03, c07 + fpsel c14, sel_s, c03, c07 + STFPDUX c09, B, INC2 + fpsel c15, sel_p, c04, c08 + STFXDUX c10, B, INC2 + fpsel c16, sel_s, c04, c08 + STFPDUX c11, B, INC2 + STFXDUX c12, B, INC2 + + STFPDUX c13, B, INC2 + STFXDUX c14, B, INC2 + STFPDUX c15, B, INC2 + STFXDUX c16, B, INC2 + bdnz .L32 + .align 4 + +.L35: + andi. r0, M, 7 + ble .L40 + + andi. r0, M, 4 + beq .L36 + + LFPDUX c01, AO1, INC2 + LFXDUX c03, AO2, INC2 + LFPDUX c02, AO1, INC2 + LFXDUX c04, AO2, INC2 + + fpsel c05, sel_p, c01, c03 + fpsel c06, sel_s, c01, c03 + fpsel c07, sel_p, c02, c04 + fpsel c08, sel_s, c02, c04 + + STFPDUX c05, B, INC2 + STFXDUX c06, B, INC2 + STFPDUX c07, B, INC2 + STFXDUX c08, B, INC2 + .align 4 + +.L36: + andi. r0, M, 2 + beq .L37 + + LFPDUX c01, AO1, INC2 + LFXDUX c02, AO2, INC2 + + fpsel c03, sel_p, c01, c02 + fpsel c04, sel_s, c01, c02 + + STFPDUX c03, B, INC2 + STFXDUX c04, B, INC2 + .align 4 + +.L37: + andi. r0, M, 1 + beq .L40 + + LFDUX c01, AO1, INC2 + LFDUX c02, AO2, INC2 + + fsmfp c01, c02 + STFPDUX c01, B, INC2 + .align 4 + +.L40: + andi. J, N, 1 + ble .L999 + + mr AO1, A + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L45 + .align 4 + +.L42: + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO1, INC2 + LFPDUX c04, AO1, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + bdnz .L42 + .align 4 + +.L45: + andi. r0, M, 7 + ble .L999 + + andi. r0, M, 4 + beq .L46 + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + .align 4 + +.L46: + andi. r0, M, 2 + beq .L47 + + LFPDUX c01, AO1, INC2 + STFPDUX c01, B, INC2 + .align 4 + +.L47: + andi. r0, M, 1 + beq .L999 + + LFDX c01, AO1, INC2 + STFDX c01, B, INC2 + b .L999 + .align 4 + + +.L100: + subi A, A, 1 * SIZE + srawi. J, N, 3 + ble .L120 + .align 4 +.L111: + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add AO5, AO4, LDA + add AO6, AO5, LDA + add AO7, AO6, LDA + add AO8, AO7, LDA + add A, AO8, LDA + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L115 + .align 4 + +.L112: + LFDUX c01, AO1, INC + LFDUX c05, AO1, INC + LFDUX c09, AO1, INC + LFDUX c13, AO1, INC + + LFDUX c17, AO1, INC + LFDUX c21, AO1, INC + LFDUX c25, AO1, INC + LFDUX c29, AO1, INC + + LFSDUX c01, AO2, INC + LFSDUX c05, AO2, INC + LFSDUX c09, AO2, INC + LFSDUX c13, AO2, INC + + LFSDUX c17, AO2, INC + LFSDUX c21, AO2, INC + LFSDUX c25, AO2, INC + LFSDUX c29, AO2, INC + + LFDUX c02, AO3, INC + LFDUX c06, AO3, INC + LFDUX c10, AO3, INC + LFDUX c14, AO3, INC + + LFDUX c18, AO3, INC + LFDUX c22, AO3, INC + LFDUX c26, AO3, INC + LFDUX c30, AO3, INC + + LFSDUX c02, AO4, INC + LFSDUX c06, AO4, INC + LFSDUX c10, AO4, INC + LFSDUX c14, AO4, INC + + LFSDUX c18, AO4, INC + LFSDUX c22, AO4, INC + LFSDUX c26, AO4, INC + LFSDUX c30, AO4, INC + + LFDUX c03, AO5, INC + LFDUX c07, AO5, INC + LFDUX c11, AO5, INC + LFDUX c15, AO5, INC + + LFDUX c19, AO5, INC + LFDUX c23, AO5, INC + LFDUX c27, AO5, INC + LFDUX c31, AO5, INC + + LFSDUX c03, AO6, INC + LFSDUX c07, AO6, INC + LFSDUX c11, AO6, INC + LFSDUX c15, AO6, INC + + LFSDUX c19, AO6, INC + LFSDUX c23, AO6, INC + LFSDUX c27, AO6, INC + LFSDUX c31, AO6, INC + + LFDUX c04, AO7, INC + LFDUX c08, AO7, INC + LFDUX c12, AO7, INC + LFDUX c16, AO7, INC + + LFDUX c20, AO7, INC + LFDUX c24, AO7, INC + LFDUX c28, AO7, INC + LFDUX c32, AO7, INC + + LFSDUX c04, AO8, INC + LFSDUX c08, AO8, INC + LFSDUX c12, AO8, INC + LFSDUX c16, AO8, INC + + LFSDUX c20, AO8, INC + LFSDUX c24, AO8, INC + LFSDUX c28, AO8, INC + LFSDUX c32, AO8, INC + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c06, B, INC2 + STFPDUX c07, B, INC2 + STFPDUX c08, B, INC2 + + STFPDUX c09, B, INC2 + STFPDUX c10, B, INC2 + STFPDUX c11, B, INC2 + STFPDUX c12, B, INC2 + STFPDUX c13, B, INC2 + STFPDUX c14, B, INC2 + STFPDUX c15, B, INC2 + STFPDUX c16, B, INC2 + + STFPDUX c17, B, INC2 + STFPDUX c18, B, INC2 + STFPDUX c19, B, INC2 + STFPDUX c20, B, INC2 + STFPDUX c21, B, INC2 + STFPDUX c22, B, INC2 + STFPDUX c23, B, INC2 + STFPDUX c24, B, INC2 + + STFPDUX c25, B, INC2 + STFPDUX c26, B, INC2 + STFPDUX c27, B, INC2 + STFPDUX c28, B, INC2 + STFPDUX c29, B, INC2 + STFPDUX c30, B, INC2 + STFPDUX c31, B, INC2 + STFPDUX c32, B, INC2 + bdnz .L112 + .align 4 + +.L115: + andi. r0, M, 7 + ble .L119 + + andi. r0, M, 4 + beq .L116 + + LFDUX c01, AO1, INC + LFDUX c05, AO1, INC + LFDUX c09, AO1, INC + LFDUX c13, AO1, INC + + LFSDUX c01, AO2, INC + LFSDUX c05, AO2, INC + LFSDUX c09, AO2, INC + LFSDUX c13, AO2, INC + + LFDUX c02, AO3, INC + LFDUX c06, AO3, INC + LFDUX c10, AO3, INC + LFDUX c14, AO3, INC + + LFSDUX c02, AO4, INC + LFSDUX c06, AO4, INC + LFSDUX c10, AO4, INC + LFSDUX c14, AO4, INC + + LFDUX c03, AO5, INC + LFDUX c07, AO5, INC + LFDUX c11, AO5, INC + LFDUX c15, AO5, INC + + LFSDUX c03, AO6, INC + LFSDUX c07, AO6, INC + LFSDUX c11, AO6, INC + LFSDUX c15, AO6, INC + + LFDUX c04, AO7, INC + LFDUX c08, AO7, INC + LFDUX c12, AO7, INC + LFDUX c16, AO7, INC + + LFSDUX c04, AO8, INC + LFSDUX c08, AO8, INC + LFSDUX c12, AO8, INC + LFSDUX c16, AO8, INC + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c06, B, INC2 + STFPDUX c07, B, INC2 + STFPDUX c08, B, INC2 + + STFPDUX c09, B, INC2 + STFPDUX c10, B, INC2 + STFPDUX c11, B, INC2 + STFPDUX c12, B, INC2 + STFPDUX c13, B, INC2 + STFPDUX c14, B, INC2 + STFPDUX c15, B, INC2 + STFPDUX c16, B, INC2 + .align 4 + +.L116: + andi. r0, M, 2 + beq .L117 + + LFDUX c01, AO1, INC + LFDUX c05, AO1, INC + LFDUX c02, AO3, INC + LFDUX c06, AO3, INC + + LFSDUX c01, AO2, INC + LFSDUX c05, AO2, INC + LFSDUX c02, AO4, INC + LFSDUX c06, AO4, INC + + LFDUX c03, AO5, INC + LFDUX c07, AO5, INC + LFDUX c04, AO7, INC + LFDUX c08, AO7, INC + + LFSDUX c03, AO6, INC + LFSDUX c07, AO6, INC + LFSDUX c04, AO8, INC + LFSDUX c08, AO8, INC + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c06, B, INC2 + STFPDUX c07, B, INC2 + STFPDUX c08, B, INC2 + .align 4 + +.L117: + andi. r0, M, 1 + beq .L119 + + LFDUX c01, AO1, INC + LFDUX c02, AO3, INC + LFDUX c03, AO5, INC + LFDUX c04, AO7, INC + + LFSDUX c01, AO2, INC + LFSDUX c02, AO4, INC + LFSDUX c03, AO6, INC + LFSDUX c04, AO8, INC + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + .align 4 + +.L119: + addic. J, J, -1 + bgt .L111 + .align 4 + +.L120: + andi. J, N, 4 + ble .L130 + .align 4 +.L121: + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L125 + .align 4 + +.L122: + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFDUX c09, AO1, INC + LFDUX c10, AO1, INC + LFDUX c11, AO1, INC + LFDUX c12, AO1, INC + + LFSDUX c01, AO2, INC + LFSDUX c02, AO2, INC + LFSDUX c03, AO2, INC + LFSDUX c04, AO2, INC + + LFSDUX c09, AO2, INC + LFSDUX c10, AO2, INC + LFSDUX c11, AO2, INC + LFSDUX c12, AO2, INC + + LFDUX c05, AO3, INC + LFDUX c06, AO3, INC + LFDUX c07, AO3, INC + LFDUX c08, AO3, INC + + LFDUX c13, AO3, INC + LFDUX c14, AO3, INC + LFDUX c15, AO3, INC + LFDUX c16, AO3, INC + + LFSDUX c05, AO4, INC + LFSDUX c06, AO4, INC + LFSDUX c07, AO4, INC + LFSDUX c08, AO4, INC + + LFSDUX c13, AO4, INC + LFSDUX c14, AO4, INC + LFSDUX c15, AO4, INC + LFSDUX c16, AO4, INC + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c06, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c07, B, INC2 + STFPDUX c04, B, INC2 + STFPDUX c08, B, INC2 + + STFPDUX c09, B, INC2 + STFPDUX c13, B, INC2 + STFPDUX c10, B, INC2 + STFPDUX c14, B, INC2 + STFPDUX c11, B, INC2 + STFPDUX c15, B, INC2 + STFPDUX c12, B, INC2 + STFPDUX c16, B, INC2 + bdnz .L122 + .align 4 + +.L125: + andi. r0, M, 7 + ble .L130 + + andi. r0, M, 4 + beq .L126 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFSDUX c01, AO2, INC + LFSDUX c02, AO2, INC + LFSDUX c03, AO2, INC + LFSDUX c04, AO2, INC + + LFDUX c05, AO3, INC + LFDUX c06, AO3, INC + LFDUX c07, AO3, INC + LFDUX c08, AO3, INC + + LFSDUX c05, AO4, INC + LFSDUX c06, AO4, INC + LFSDUX c07, AO4, INC + LFSDUX c08, AO4, INC + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c06, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c07, B, INC2 + STFPDUX c04, B, INC2 + STFPDUX c08, B, INC2 + .align 4 + +.L126: + andi. r0, M, 2 + beq .L127 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + + LFSDUX c01, AO2, INC + LFSDUX c02, AO2, INC + + LFDUX c05, AO3, INC + LFDUX c06, AO3, INC + + LFSDUX c05, AO4, INC + LFSDUX c06, AO4, INC + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c06, B, INC2 + .align 4 + +.L127: + andi. r0, M, 1 + beq .L130 + + LFDUX c01, AO1, INC + LFDUX c05, AO3, INC + + nop + nop + + LFSDUX c01, AO2, INC + LFSDUX c05, AO4, INC + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + .align 4 + + +.L130: + andi. J, N, 2 + ble .L140 + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L135 + .align 4 + +.L132: + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFDUX c09, AO1, INC + LFDUX c10, AO1, INC + LFDUX c11, AO1, INC + LFDUX c12, AO1, INC + + LFSDUX c01, AO2, INC + LFSDUX c02, AO2, INC + LFSDUX c03, AO2, INC + LFSDUX c04, AO2, INC + + LFSDUX c09, AO2, INC + LFSDUX c10, AO2, INC + LFSDUX c11, AO2, INC + LFSDUX c12, AO2, INC + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + + STFPDUX c09, B, INC2 + STFPDUX c10, B, INC2 + STFPDUX c11, B, INC2 + STFPDUX c12, B, INC2 + bdnz .L132 + .align 4 + +.L135: + andi. r0, M, 7 + ble .L140 + + andi. r0, M, 4 + beq .L136 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFSDUX c01, AO2, INC + LFSDUX c02, AO2, INC + LFSDUX c03, AO2, INC + LFSDUX c04, AO2, INC + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + .align 4 + +.L136: + andi. r0, M, 2 + beq .L137 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + + LFSDUX c01, AO2, INC + LFSDUX c02, AO2, INC + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + .align 4 + +.L137: + andi. r0, M, 1 + beq .L140 + + LFDUX c01, AO1, INC + LFDUX c02, AO2, INC + + fsmfp c01, c02 + STFPDUX c01, B, INC2 + .align 4 + +.L140: + andi. J, N, 1 + ble .L999 + + mr AO1, A + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L145 + .align 4 + +.L142: + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFDUX c05, AO1, INC + LFDUX c06, AO1, INC + LFDUX c07, AO1, INC + LFDUX c08, AO1, INC + + fsmfp c01, c02 + fsmfp c03, c04 + fsmfp c05, c06 + fsmfp c07, c08 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c07, B, INC2 + bdnz .L142 + .align 4 + +.L145: + andi. r0, M, 7 + ble .L999 + + andi. r0, M, 4 + beq .L146 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + .align 4 + +.L146: + andi. r0, M, 2 + beq .L147 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + + fsmfp c01, c02 + STFPDUX c01, B, INC2 + .align 4 + +.L147: + andi. r0, M, 1 + beq .L999 + + LFDX c01, AO1, INC + STFDX c01, B, INC2 + .align 4 + +.L999: + addi SP, SP, 4 + + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f31, SP, r0 + lfpdux f30, SP, r0 + lfpdux f29, SP, r0 + lfpdux f28, SP, r0 + lfpdux f27, SP, r0 + lfpdux f26, SP, r0 + lfpdux f25, SP, r0 + lfpdux f24, SP, r0 + lfpdux f23, SP, r0 + lfpdux f22, SP, r0 + lfpdux f21, SP, r0 + lfpdux f20, SP, r0 + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + EPILOGUE diff --git a/kernel/power/gemm_tcopy_4.S b/kernel/power/gemm_tcopy_4.S new file mode 100644 index 0000000..712420f --- /dev/null +++ b/kernel/power/gemm_tcopy_4.S @@ -0,0 +1,452 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define AO1 r8 +#define AO2 r9 +#define AO3 r10 +#define AO4 r11 + +#define J r12 + +#define PREA r14 +#define PREB1 r15 +#define B1 r16 +#define B2 r17 +#define B3 r18 +#define M4 r19 + +#define c01 f0 +#define c02 f1 +#define c03 f2 +#define c04 f3 +#define c05 f4 +#define c06 f5 +#define c07 f6 +#define c08 f7 +#define c09 f8 +#define c10 f9 +#define c11 f10 +#define c12 f11 +#define c13 f12 +#define c14 f13 +#define c15 f14 +#define c16 f15 + +#define STACKSIZE 64 + +#ifdef CELL +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 48 +#endif + +#ifdef PPC970 +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 48 +#endif + +#ifdef PPC440 +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 48 +#endif + +#ifdef POWER4 +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 48 +#endif + +#ifdef POWER5 +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 48 +#endif + +#ifdef POWER6 +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 48 +#endif + +#ifdef PPCG4 +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 48 +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + +#ifdef __64BIT__ + std r14, 16(SP) + std r15, 24(SP) + std r16, 32(SP) + std r17, 40(SP) + std r18, 48(SP) + std r19, 56(SP) +#else + stw r14, 16(SP) + stw r15, 20(SP) + stw r16, 24(SP) + stw r17, 28(SP) + stw r18, 32(SP) + stw r19, 36(SP) +#endif + + slwi LDA, LDA, BASE_SHIFT + slwi M4, M, 2 + BASE_SHIFT + + li PREA, -4 + li PREB1, -2 + + and B2, N, PREA + and B3, N, PREB1 + + mullw B2, B2, M + mullw B3, B3, M + + slwi B2, B2, BASE_SHIFT + slwi B3, B3, BASE_SHIFT + + add B2, B2, B + add B3, B3, B + + li PREA, PREFETCHSIZE * SIZE + li PREB1, (PREFETCHWSIZE + 0) * SIZE + + cmpwi cr0, M, 0 + ble- LL(999) + cmpwi cr0, N, 0 + ble- LL(999) + + srawi. J, M, 2 + ble LL(20) + .align 4 + +LL(10): + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + mr B1, B + addi B, B, 16 * SIZE + + srawi. r0, N, 2 + mtspr CTR, r0 + ble LL(13) + .align 4 + +LL(12): + LFD c01, 0 * SIZE(AO1) + LFD c02, 1 * SIZE(AO1) + LFD c03, 2 * SIZE(AO1) + LFD c04, 3 * SIZE(AO1) + + LFD c05, 0 * SIZE(AO2) + LFD c06, 1 * SIZE(AO2) + LFD c07, 2 * SIZE(AO2) + LFD c08, 3 * SIZE(AO2) + + LFD c09, 0 * SIZE(AO3) + LFD c10, 1 * SIZE(AO3) + LFD c11, 2 * SIZE(AO3) + LFD c12, 3 * SIZE(AO3) + + LFD c13, 0 * SIZE(AO4) + LFD c14, 1 * SIZE(AO4) + LFD c15, 2 * SIZE(AO4) + LFD c16, 3 * SIZE(AO4) + + STFD c01, 0 * SIZE(B1) + STFD c02, 1 * SIZE(B1) + STFD c03, 2 * SIZE(B1) + STFD c04, 3 * SIZE(B1) + + STFD c05, 4 * SIZE(B1) + STFD c06, 5 * SIZE(B1) + STFD c07, 6 * SIZE(B1) + STFD c08, 7 * SIZE(B1) + + STFD c09, 8 * SIZE(B1) + STFD c10, 9 * SIZE(B1) + STFD c11, 10 * SIZE(B1) + STFD c12, 11 * SIZE(B1) + + STFD c13, 12 * SIZE(B1) + STFD c14, 13 * SIZE(B1) + STFD c15, 14 * SIZE(B1) + STFD c16, 15 * SIZE(B1) + +#ifdef POWER6 + dcbtst PREA, AO1 + dcbtst PREA, AO2 + dcbtst PREA, AO3 + dcbtst PREA, AO4 +#else + dcbt PREA, AO1 + dcbt PREA, AO2 + dcbt PREA, AO3 + dcbt PREA, AO4 +#endif + + dcbtst PREB1, B + + addi AO1, AO1, 4 * SIZE + addi AO2, AO2, 4 * SIZE + addi AO3, AO3, 4 * SIZE + addi AO4, AO4, 4 * SIZE + add B1, B1, M4 + bdnz LL(12) + .align 4 + +LL(13): + andi. r0, N, 2 + ble LL(14) + + LFD c01, 0 * SIZE(AO1) + LFD c02, 1 * SIZE(AO1) + LFD c03, 0 * SIZE(AO2) + LFD c04, 1 * SIZE(AO2) + + LFD c05, 0 * SIZE(AO3) + LFD c06, 1 * SIZE(AO3) + LFD c07, 0 * SIZE(AO4) + LFD c08, 1 * SIZE(AO4) + + STFD c01, 0 * SIZE(B2) + STFD c02, 1 * SIZE(B2) + STFD c03, 2 * SIZE(B2) + STFD c04, 3 * SIZE(B2) + + STFD c05, 4 * SIZE(B2) + STFD c06, 5 * SIZE(B2) + STFD c07, 6 * SIZE(B2) + STFD c08, 7 * SIZE(B2) + + addi AO1, AO1, 2 * SIZE + addi AO2, AO2, 2 * SIZE + addi AO3, AO3, 2 * SIZE + addi AO4, AO4, 2 * SIZE + addi B2, B2, 8 * SIZE + .align 4 + +LL(14): + andi. r0, N, 1 + ble LL(17) + + LFD c01, 0 * SIZE(AO1) + LFD c02, 0 * SIZE(AO2) + LFD c03, 0 * SIZE(AO3) + LFD c04, 0 * SIZE(AO4) + + STFD c01, 0 * SIZE(B3) + STFD c02, 1 * SIZE(B3) + STFD c03, 2 * SIZE(B3) + STFD c04, 3 * SIZE(B3) + + addi B3, B3, 4 * SIZE + .align 4 + +LL(17): + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(20): + andi. J, M, 2 + ble LL(30) + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + mr B1, B + addi B, B, 8 * SIZE + + srawi. r0, N, 2 + mtspr CTR, r0 + ble LL(23) + .align 4 + +LL(22): + LFD c01, 0 * SIZE(AO1) + LFD c02, 1 * SIZE(AO1) + LFD c03, 2 * SIZE(AO1) + LFD c04, 3 * SIZE(AO1) + + LFD c05, 0 * SIZE(AO2) + LFD c06, 1 * SIZE(AO2) + LFD c07, 2 * SIZE(AO2) + LFD c08, 3 * SIZE(AO2) + + STFD c01, 0 * SIZE(B1) + STFD c02, 1 * SIZE(B1) + STFD c03, 2 * SIZE(B1) + STFD c04, 3 * SIZE(B1) + + STFD c05, 4 * SIZE(B1) + STFD c06, 5 * SIZE(B1) + STFD c07, 6 * SIZE(B1) + STFD c08, 7 * SIZE(B1) + + addi AO1, AO1, 4 * SIZE + addi AO2, AO2, 4 * SIZE + add B1, B1, M4 + bdnz LL(22) + .align 4 + +LL(23): + andi. r0, N, 2 + ble LL(24) + + LFD c01, 0 * SIZE(AO1) + LFD c02, 1 * SIZE(AO1) + LFD c03, 0 * SIZE(AO2) + LFD c04, 1 * SIZE(AO2) + + STFD c01, 0 * SIZE(B2) + STFD c02, 1 * SIZE(B2) + STFD c03, 2 * SIZE(B2) + STFD c04, 3 * SIZE(B2) + + addi AO1, AO1, 2 * SIZE + addi AO2, AO2, 2 * SIZE + addi B2, B2, 4 * SIZE + .align 4 + +LL(24): + andi. r0, N, 1 + ble LL(30) + + LFD c01, 0 * SIZE(AO1) + LFD c02, 0 * SIZE(AO2) + + STFD c01, 0 * SIZE(B3) + STFD c02, 1 * SIZE(B3) + + addi B3, B3, 2 * SIZE + .align 4 + +LL(30): + andi. J, M, 1 + ble LL(999) + + mr AO1, A + + mr B1, B + + srawi. r0, N, 2 + mtspr CTR, r0 + ble LL(33) + .align 4 + +LL(32): + LFD c01, 0 * SIZE(AO1) + LFD c02, 1 * SIZE(AO1) + LFD c03, 2 * SIZE(AO1) + LFD c04, 3 * SIZE(AO1) + + STFD c01, 0 * SIZE(B1) + STFD c02, 1 * SIZE(B1) + STFD c03, 2 * SIZE(B1) + STFD c04, 3 * SIZE(B1) + + addi AO1, AO1, 4 * SIZE + add B1, B1, M4 + bdnz LL(32) + .align 4 + +LL(33): + andi. r0, N, 2 + ble LL(34) + + LFD c01, 0 * SIZE(AO1) + LFD c02, 1 * SIZE(AO1) + + STFD c01, 0 * SIZE(B2) + STFD c02, 1 * SIZE(B2) + + addi AO1, AO1, 2 * SIZE + addi B2, B2, 2 * SIZE + .align 4 + +LL(34): + andi. r0, N, 1 + ble LL(999) + + LFD c01, 0 * SIZE(AO1) + STFD c01, 0 * SIZE(B3) + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + +#ifdef __64BIT__ + ld r14, 16(SP) + ld r15, 24(SP) + ld r16, 32(SP) + ld r17, 40(SP) + ld r18, 48(SP) + ld r19, 56(SP) +#else + lwz r14, 16(SP) + lwz r15, 20(SP) + lwz r16, 24(SP) + lwz r17, 28(SP) + lwz r18, 32(SP) + lwz r19, 36(SP) +#endif + addi SP, SP, STACKSIZE + + blr + EPILOGUE diff --git a/kernel/power/gemm_tcopy_hummer_4.S b/kernel/power/gemm_tcopy_hummer_4.S new file mode 100644 index 0000000..dc94b04 --- /dev/null +++ b/kernel/power/gemm_tcopy_hummer_4.S @@ -0,0 +1,521 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define AO1 r8 +#define AO2 r9 +#define AO3 r10 +#define AO4 r11 + +#define J r25 +#define B1 r26 +#define B2 r27 +#define B3 r28 +#define M4 r29 +#define INC r30 +#define INC2 r31 + +#define c01 f0 +#define c02 f1 +#define c03 f2 +#define c04 f3 +#define c05 f4 +#define c06 f5 +#define c07 f6 +#define c08 f7 + + PROLOGUE + PROFCODE + + stwu r31, -4(SP) + stwu r30, -4(SP) + stwu r29, -4(SP) + stwu r28, -4(SP) + + stwu r27, -4(SP) + stwu r26, -4(SP) + stwu r25, -4(SP) + + slwi LDA, LDA, BASE_SHIFT + slwi M4, M, 2 + BASE_SHIFT + + li r8, -4 + li r9, -2 + + and B2, N, r8 + and B3, N, r9 + + mullw B2, B2, M + mullw B3, B3, M + + slwi B2, B2, BASE_SHIFT + slwi B3, B3, BASE_SHIFT + + add B2, B2, B + add B3, B3, B + + cmpwi cr0, M, 0 + ble- .L99 + cmpwi cr0, N, 0 + ble- .L99 + + subi B2, B2, 2 * SIZE + subi B3, B3, 2 * SIZE + subi M4, M4, 14 * SIZE + + li INC, 1 * SIZE + li INC2, 2 * SIZE + + andi. r0, A, 2 * SIZE - 1 + bne .L100 + andi. r0, LDA, 2 * SIZE - 1 + bne .L100 + + subi A, A, 2 * SIZE + srawi. J, M, 2 + ble .L20 + .align 4 + +.L10: + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + sub B1, B, M4 + addi B, B, 16 * SIZE + + srawi. r0, N, 2 + mtspr CTR, r0 + ble .L15 + .align 4 + +.L12: + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO2, INC2 + LFPDUX c04, AO2, INC2 + LFPDUX c05, AO3, INC2 + LFPDUX c06, AO3, INC2 + LFPDUX c07, AO4, INC2 + LFPDUX c08, AO4, INC2 + + STFPDUX c01, B1, M4 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + STFPDUX c05, B1, INC2 + STFPDUX c06, B1, INC2 + STFPDUX c07, B1, INC2 + STFPDUX c08, B1, INC2 + bdnz .L12 + .align 4 + +.L15: + andi. r0, N, 3 + ble .L19 + + andi. r0, N, 2 + ble .L17 + + LFPDUX c01, AO1, INC2 + LFPDUX c03, AO2, INC2 + LFPDUX c05, AO3, INC2 + LFPDUX c07, AO4, INC2 + + STFPDUX c01, B2, INC2 + STFPDUX c03, B2, INC2 + STFPDUX c05, B2, INC2 + STFPDUX c07, B2, INC2 + .align 4 + +.L17: + andi. r0, N, 1 + ble .L19 + + LFDUX c01, AO1, INC2 + LFDUX c02, AO2, INC2 + LFDUX c03, AO3, INC2 + LFDUX c04, AO4, INC2 + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B3, INC2 + STFPDUX c03, B3, INC2 + .align 4 + +.L19: + addic. J, J, -1 + bgt .L10 + .align 4 + +.L20: + andi. J, M, 2 + addi M4, M4, 8 * SIZE + + ble .L30 + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + sub B1, B, M4 + addi B, B, 8 * SIZE + + srawi. r0, N, 2 + mtspr CTR, r0 + ble .L23 + .align 4 + +.L22: + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO2, INC2 + LFPDUX c04, AO2, INC2 + + STFPDUX c01, B1, M4 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + bdnz .L22 + .align 4 + +.L23: + andi. r0, N, 2 + ble .L24 + + LFPDUX c01, AO1, INC2 + LFPDUX c03, AO2, INC2 + + STFPDUX c01, B2, INC2 + STFPDUX c03, B2, INC2 + .align 4 + +.L24: + andi. r0, N, 1 + ble .L30 + + LFDUX c01, AO1, INC2 + LFDUX c02, AO2, INC2 + + fsmfp c01, c02 + STFPDUX c01, B3, INC2 + .align 4 + +.L30: + andi. J, M, 1 + addi M4, M4, 4 * SIZE + ble .L99 + + mr AO1, A + sub B1, B, M4 + + srawi. r0, N, 2 + mtspr CTR, r0 + ble .L33 + .align 4 + +.L32: + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + + STFPDUX c01, B1, M4 + STFPDUX c02, B1, INC2 + bdnz .L32 + .align 4 + +.L33: + andi. r0, N, 2 + ble .L34 + + LFPDUX c01, AO1, INC2 + + STFPDUX c01, B2, INC2 + .align 4 + +.L34: + andi. r0, N, 1 + ble .L99 + + LFDX c01, AO1, INC2 + STFDX c01, B3, INC2 + .align 4 + +.L99: + addi SP, SP, -4 + + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + addi SP, SP, 4 + blr + +.L100: + subi A, A, SIZE + srawi. J, M, 2 + ble .L120 + .align 4 + +.L110: + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + sub B1, B, M4 + addi B, B, 16 * SIZE + + srawi. r0, N, 2 + mtspr CTR, r0 + ble .L115 + .align 4 + +.L112: + LFDUX c01, AO1, INC + LFDUX c03, AO2, INC + LFDUX c05, AO3, INC + LFDUX c07, AO4, INC + + LFSDUX c01, AO1, INC + LFSDUX c03, AO2, INC + LFSDUX c05, AO3, INC + LFSDUX c07, AO4, INC + + LFDUX c02, AO1, INC + LFDUX c04, AO2, INC + LFDUX c06, AO3, INC + LFDUX c08, AO4, INC + + LFSDUX c02, AO1, INC + LFSDUX c04, AO2, INC + LFSDUX c06, AO3, INC + LFSDUX c08, AO4, INC + + STFPDUX c01, B1, M4 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + STFPDUX c05, B1, INC2 + STFPDUX c06, B1, INC2 + STFPDUX c07, B1, INC2 + STFPDUX c08, B1, INC2 + bdnz .L112 + .align 4 + +.L115: + andi. r0, N, 3 + ble .L119 + + andi. r0, N, 2 + ble .L117 + + LFDUX c01, AO1, INC + LFDUX c03, AO2, INC + LFDUX c05, AO3, INC + LFDUX c07, AO4, INC + + LFSDUX c01, AO1, INC + LFSDUX c03, AO2, INC + LFSDUX c05, AO3, INC + LFSDUX c07, AO4, INC + + STFPDUX c01, B2, INC2 + STFPDUX c03, B2, INC2 + STFPDUX c05, B2, INC2 + STFPDUX c07, B2, INC2 + .align 4 + +.L117: + andi. r0, N, 1 + ble .L119 + + LFDUX c01, AO1, INC + LFDUX c02, AO2, INC + LFDUX c03, AO3, INC + LFDUX c04, AO4, INC + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B3, INC2 + STFPDUX c03, B3, INC2 + .align 4 + +.L119: + addic. J, J, -1 + bgt .L110 + .align 4 + +.L120: + andi. J, M, 2 + addi M4, M4, 8 * SIZE + + ble .L130 + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + sub B1, B, M4 + addi B, B, 8 * SIZE + + srawi. r0, N, 2 + mtspr CTR, r0 + ble .L123 + .align 4 + +.L122: + LFDUX c01, AO1, INC + LFDUX c03, AO2, INC + LFSDUX c01, AO1, INC + LFSDUX c03, AO2, INC + + LFDUX c02, AO1, INC + LFDUX c04, AO2, INC + LFSDUX c02, AO1, INC + LFSDUX c04, AO2, INC + + STFPDUX c01, B1, M4 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + bdnz .L122 + .align 4 + +.L123: + andi. r0, N, 2 + ble .L124 + + LFDUX c01, AO1, INC + LFDUX c03, AO2, INC + LFSDUX c01, AO1, INC + LFSDUX c03, AO2, INC + + STFPDUX c01, B2, INC2 + STFPDUX c03, B2, INC2 + .align 4 + +.L124: + andi. r0, N, 1 + ble .L130 + + LFDUX c01, AO1, INC + LFDUX c02, AO2, INC + + fsmfp c01, c02 + STFPDUX c01, B3, INC2 + .align 4 + +.L130: + andi. J, M, 1 + addi M4, M4, 4 * SIZE + ble .L999 + + mr AO1, A + sub B1, B, M4 + + srawi. r0, N, 2 + mtspr CTR, r0 + ble .L133 + .align 4 + +.L132: + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B1, M4 + STFPDUX c03, B1, INC2 + bdnz .L132 + .align 4 + +.L133: + andi. r0, N, 2 + ble .L134 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + + fsmfp c01, c02 + STFPDUX c01, B2, INC2 + .align 4 + +.L134: + andi. r0, N, 1 + ble .L999 + + LFDX c01, AO1, INC + STFDX c01, B3, INC2 + .align 4 + +.L999: + addi SP, SP, -4 + + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + addi SP, SP, 4 + blr + EPILOGUE diff --git a/kernel/power/gemm_tcopy_hummer_8.S b/kernel/power/gemm_tcopy_hummer_8.S new file mode 100644 index 0000000..5062f65 --- /dev/null +++ b/kernel/power/gemm_tcopy_hummer_8.S @@ -0,0 +1,1285 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define AO1 r8 +#define AO2 r9 +#define AO3 r10 +#define AO4 r11 + +#define J r12 + +#define B1 r21 +#define B2 r22 +#define B3 r23 +#define B4 r24 +#define M8 r25 + +#define AO5 r26 +#define AO6 r27 +#define AO7 r28 +#define AO8 r29 +#define INC r30 +#define INC2 r31 + +#define c01 f0 +#define c02 f1 +#define c03 f2 +#define c04 f3 +#define c05 f4 +#define c06 f5 +#define c07 f6 +#define c08 f7 +#define c09 f8 +#define c10 f9 +#define c11 f10 +#define c12 f11 +#define c13 f12 +#define c14 f13 +#define c15 f14 +#define c16 f15 + +#define c17 f16 +#define c18 f17 +#define c19 f18 +#define c20 f19 +#define c21 f20 +#define c22 f21 +#define c23 f22 +#define c24 f23 +#define c25 f24 +#define c26 f25 +#define c27 f26 +#define c28 f27 +#define c29 f28 +#define c30 f29 +#define c31 f30 +#define c32 f31 + +#define STACKSIZE 64 + + PROLOGUE + PROFCODE + + li r0, -16 + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + stfpdux f16, SP, r0 + stfpdux f17, SP, r0 + stfpdux f18, SP, r0 + stfpdux f19, SP, r0 + stfpdux f20, SP, r0 + stfpdux f21, SP, r0 + stfpdux f22, SP, r0 + stfpdux f23, SP, r0 + stfpdux f24, SP, r0 + stfpdux f25, SP, r0 + stfpdux f26, SP, r0 + stfpdux f27, SP, r0 + stfpdux f28, SP, r0 + stfpdux f29, SP, r0 + stfpdux f30, SP, r0 + stfpdux f31, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + stwu r29, -4(SP) + stwu r28, -4(SP) + + stwu r27, -4(SP) + stwu r26, -4(SP) + stwu r25, -4(SP) + stwu r24, -4(SP) + + stwu r23, -4(SP) + stwu r22, -4(SP) + stwu r21, -4(SP) + + slwi LDA, LDA, BASE_SHIFT + slwi M8, M, 3 + BASE_SHIFT + + li r8, -8 + li r9, -4 + li r10, -2 + + and B2, N, r8 + and B3, N, r9 + and B4, N, r10 + + mullw B2, B2, M + mullw B3, B3, M + mullw B4, B4, M + + slwi B2, B2, BASE_SHIFT + slwi B3, B3, BASE_SHIFT + slwi B4, B4, BASE_SHIFT + + add B2, B2, B + add B3, B3, B + add B4, B4, B + + cmpwi cr0, M, 0 + ble- .L999 + cmpwi cr0, N, 0 + ble- .L999 + + subi B2, B2, 2 * SIZE + subi B3, B3, 2 * SIZE + subi B4, B4, 2 * SIZE + + subi M8, M8, 62 * SIZE + + li INC, 1 * SIZE + li INC2, 2 * SIZE + + andi. r0, A, 2 * SIZE - 1 + bne .L100 + andi. r0, LDA, 2 * SIZE - 1 + bne .L100 + + subi A, A, 2 * SIZE + srawi. J, M, 3 + ble .L20 + .align 4 + +.L10: + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add AO5, AO4, LDA + add AO6, AO5, LDA + add AO7, AO6, LDA + add AO8, AO7, LDA + add A, AO8, LDA + + sub B1, B, M8 + addi B, B, 64 * SIZE + + srawi. r0, N, 3 + mtspr CTR, r0 + ble .L15 + .align 4 + +.L12: + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO1, INC2 + LFPDUX c04, AO1, INC2 + + LFPDUX c05, AO2, INC2 + LFPDUX c06, AO2, INC2 + LFPDUX c07, AO2, INC2 + LFPDUX c08, AO2, INC2 + + LFPDUX c09, AO3, INC2 + LFPDUX c10, AO3, INC2 + LFPDUX c11, AO3, INC2 + LFPDUX c12, AO3, INC2 + + LFPDUX c13, AO4, INC2 + LFPDUX c14, AO4, INC2 + LFPDUX c15, AO4, INC2 + LFPDUX c16, AO4, INC2 + + LFPDUX c17, AO5, INC2 + LFPDUX c18, AO5, INC2 + LFPDUX c19, AO5, INC2 + LFPDUX c20, AO5, INC2 + + LFPDUX c21, AO6, INC2 + LFPDUX c22, AO6, INC2 + LFPDUX c23, AO6, INC2 + LFPDUX c24, AO6, INC2 + + LFPDUX c25, AO7, INC2 + LFPDUX c26, AO7, INC2 + LFPDUX c27, AO7, INC2 + LFPDUX c28, AO7, INC2 + + LFPDUX c29, AO8, INC2 + LFPDUX c30, AO8, INC2 + LFPDUX c31, AO8, INC2 + LFPDUX c32, AO8, INC2 + + STFPDUX c01, B1, M8 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + STFPDUX c05, B1, INC2 + STFPDUX c06, B1, INC2 + STFPDUX c07, B1, INC2 + STFPDUX c08, B1, INC2 + + STFPDUX c09, B1, INC2 + STFPDUX c10, B1, INC2 + STFPDUX c11, B1, INC2 + STFPDUX c12, B1, INC2 + STFPDUX c13, B1, INC2 + STFPDUX c14, B1, INC2 + STFPDUX c15, B1, INC2 + STFPDUX c16, B1, INC2 + + STFPDUX c17, B1, INC2 + STFPDUX c18, B1, INC2 + STFPDUX c19, B1, INC2 + STFPDUX c20, B1, INC2 + STFPDUX c21, B1, INC2 + STFPDUX c22, B1, INC2 + STFPDUX c23, B1, INC2 + STFPDUX c24, B1, INC2 + + STFPDUX c25, B1, INC2 + STFPDUX c26, B1, INC2 + STFPDUX c27, B1, INC2 + STFPDUX c28, B1, INC2 + STFPDUX c29, B1, INC2 + STFPDUX c30, B1, INC2 + STFPDUX c31, B1, INC2 + STFPDUX c32, B1, INC2 + bdnz .L12 + .align 4 + +.L15: + andi. r0, N, 7 + ble .L19 + + andi. r0, N, 4 + ble .L16 + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO2, INC2 + LFPDUX c04, AO2, INC2 + + LFPDUX c05, AO3, INC2 + LFPDUX c06, AO3, INC2 + LFPDUX c07, AO4, INC2 + LFPDUX c08, AO4, INC2 + + LFPDUX c09, AO5, INC2 + LFPDUX c10, AO5, INC2 + LFPDUX c11, AO6, INC2 + LFPDUX c12, AO6, INC2 + + LFPDUX c13, AO7, INC2 + LFPDUX c14, AO7, INC2 + LFPDUX c15, AO8, INC2 + LFPDUX c16, AO8, INC2 + + STFPDUX c01, B2, INC2 + STFPDUX c02, B2, INC2 + STFPDUX c03, B2, INC2 + STFPDUX c04, B2, INC2 + STFPDUX c05, B2, INC2 + STFPDUX c06, B2, INC2 + STFPDUX c07, B2, INC2 + STFPDUX c08, B2, INC2 + STFPDUX c09, B2, INC2 + STFPDUX c10, B2, INC2 + STFPDUX c11, B2, INC2 + STFPDUX c12, B2, INC2 + STFPDUX c13, B2, INC2 + STFPDUX c14, B2, INC2 + STFPDUX c15, B2, INC2 + STFPDUX c16, B2, INC2 + .align 4 + +.L16: + andi. r0, N, 2 + ble .L17 + + LFPDUX c01, AO1, INC2 + LFPDUX c03, AO2, INC2 + LFPDUX c05, AO3, INC2 + LFPDUX c07, AO4, INC2 + + LFPDUX c09, AO5, INC2 + LFPDUX c11, AO6, INC2 + LFPDUX c13, AO7, INC2 + LFPDUX c15, AO8, INC2 + + STFPDUX c01, B3, INC2 + STFPDUX c03, B3, INC2 + STFPDUX c05, B3, INC2 + STFPDUX c07, B3, INC2 + STFPDUX c09, B3, INC2 + STFPDUX c11, B3, INC2 + STFPDUX c13, B3, INC2 + STFPDUX c15, B3, INC2 + .align 4 + +.L17: + andi. r0, N, 1 + ble .L19 + + LFDUX c01, AO1, INC2 + LFDUX c02, AO3, INC2 + LFDUX c03, AO5, INC2 + LFDUX c04, AO7, INC2 + + LFSDUX c01, AO2, INC2 + LFSDUX c02, AO4, INC2 + LFSDUX c03, AO6, INC2 + LFSDUX c04, AO8, INC2 + + STFPDUX c01, B4, INC2 + STFPDUX c02, B4, INC2 + STFPDUX c03, B4, INC2 + STFPDUX c04, B4, INC2 + .align 4 + +.L19: + addic. J, J, -1 + bgt .L10 + .align 4 + +.L20: + andi. J, M, 4 + addi M8, M8, 32 * SIZE + ble .L30 + + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + sub B1, B, M8 + addi B, B, 32 * SIZE + + srawi. r0, N, 3 + mtspr CTR, r0 + ble .L25 + .align 4 + +.L22: + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO1, INC2 + LFPDUX c04, AO1, INC2 + + LFPDUX c05, AO2, INC2 + LFPDUX c06, AO2, INC2 + LFPDUX c07, AO2, INC2 + LFPDUX c08, AO2, INC2 + + LFPDUX c09, AO3, INC2 + LFPDUX c10, AO3, INC2 + LFPDUX c11, AO3, INC2 + LFPDUX c12, AO3, INC2 + + LFPDUX c13, AO4, INC2 + LFPDUX c14, AO4, INC2 + LFPDUX c15, AO4, INC2 + LFPDUX c16, AO4, INC2 + + STFPDUX c01, B1, M8 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + STFPDUX c05, B1, INC2 + STFPDUX c06, B1, INC2 + STFPDUX c07, B1, INC2 + STFPDUX c08, B1, INC2 + + STFPDUX c09, B1, INC2 + STFPDUX c10, B1, INC2 + STFPDUX c11, B1, INC2 + STFPDUX c12, B1, INC2 + STFPDUX c13, B1, INC2 + STFPDUX c14, B1, INC2 + STFPDUX c15, B1, INC2 + STFPDUX c16, B1, INC2 + bdnz .L22 + .align 4 + +.L25: + andi. r0, N, 7 + ble .L30 + + andi. r0, N, 4 + ble .L26 + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO2, INC2 + LFPDUX c04, AO2, INC2 + + LFPDUX c05, AO3, INC2 + LFPDUX c06, AO3, INC2 + LFPDUX c07, AO4, INC2 + LFPDUX c08, AO4, INC2 + + STFPDUX c01, B2, INC2 + STFPDUX c02, B2, INC2 + STFPDUX c03, B2, INC2 + STFPDUX c04, B2, INC2 + STFPDUX c05, B2, INC2 + STFPDUX c06, B2, INC2 + STFPDUX c07, B2, INC2 + STFPDUX c08, B2, INC2 + .align 4 + +.L26: + andi. r0, N, 2 + ble .L27 + + LFPDUX c01, AO1, INC2 + LFPDUX c03, AO2, INC2 + LFPDUX c05, AO3, INC2 + LFPDUX c07, AO4, INC2 + + STFPDUX c01, B3, INC2 + STFPDUX c03, B3, INC2 + STFPDUX c05, B3, INC2 + STFPDUX c07, B3, INC2 + .align 4 + +.L27: + andi. r0, N, 1 + ble .L30 + + LFDUX c01, AO1, INC2 + LFDUX c02, AO2, INC2 + LFDUX c03, AO3, INC2 + LFDUX c04, AO4, INC2 + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B4, INC2 + STFPDUX c03, B4, INC2 + .align 4 + +.L30: + andi. J, M, 2 + addi M8, M8, 16 * SIZE + ble .L40 + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + sub B1, B, M8 + addi B, B, 16 * SIZE + + srawi. r0, N, 3 + mtspr CTR, r0 + ble .L35 + .align 4 + +.L32: + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO1, INC2 + LFPDUX c04, AO1, INC2 + + LFPDUX c05, AO2, INC2 + LFPDUX c06, AO2, INC2 + LFPDUX c07, AO2, INC2 + LFPDUX c08, AO2, INC2 + + STFPDUX c01, B1, M8 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + STFPDUX c05, B1, INC2 + STFPDUX c06, B1, INC2 + STFPDUX c07, B1, INC2 + STFPDUX c08, B1, INC2 + bdnz .L32 + .align 4 + +.L35: + andi. r0, N, 7 + ble .L40 + + andi. r0, N, 4 + ble .L36 + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO2, INC2 + LFPDUX c04, AO2, INC2 + + STFPDUX c01, B2, INC2 + STFPDUX c02, B2, INC2 + STFPDUX c03, B2, INC2 + STFPDUX c04, B2, INC2 + .align 4 + +.L36: + andi. r0, N, 2 + ble .L37 + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO2, INC2 + + STFPDUX c01, B3, INC2 + STFPDUX c02, B3, INC2 + .align 4 + +.L37: + andi. r0, N, 1 + ble .L40 + + LFDUX c01, AO1, INC2 + LFDUX c02, AO2, INC2 + + fsmfp c01, c02 + STFPDUX c01, B4, INC2 + .align 4 + +.L40: + andi. J, M, 1 + addi M8, M8, 8 * SIZE + ble .L999 + + mr AO1, A + + sub B1, B, M8 + + srawi. r0, N, 3 + mtspr CTR, r0 + ble .L45 + .align 4 + +.L42: + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO1, INC2 + LFPDUX c04, AO1, INC2 + + STFPDUX c01, B1, M8 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + bdnz .L42 + .align 4 + +.L45: + andi. r0, N, 7 + ble .L999 + + andi. r0, N, 4 + ble .L46 + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + + STFPDUX c01, B2, INC2 + STFPDUX c02, B2, INC2 + .align 4 + +.L46: + andi. r0, N, 2 + ble .L47 + + LFPDUX c01, AO1, INC2 + STFPDUX c01, B3, INC2 + .align 4 + +.L47: + andi. r0, N, 1 + ble .L999 + + LFDX c01, AO1, INC2 + STFDX c01, B4, INC2 + b .L999 + .align 4 + + +.L100: + subi A, A, SIZE + srawi. J, M, 3 + ble .L120 + .align 4 + +.L110: + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add AO5, AO4, LDA + add AO6, AO5, LDA + add AO7, AO6, LDA + add AO8, AO7, LDA + add A, AO8, LDA + + sub B1, B, M8 + addi B, B, 64 * SIZE + + srawi. r0, N, 3 + mtspr CTR, r0 + ble .L115 + .align 4 + +.L112: + LFDUX c01, AO1, INC + LFDUX c05, AO2, INC + LFDUX c09, AO3, INC + LFDUX c13, AO4, INC + + LFSDUX c01, AO1, INC + LFSDUX c05, AO2, INC + LFSDUX c09, AO3, INC + LFSDUX c13, AO4, INC + + LFDUX c02, AO1, INC + LFDUX c06, AO2, INC + LFDUX c10, AO3, INC + LFDUX c14, AO4, INC + + LFSDUX c02, AO1, INC + LFSDUX c06, AO2, INC + LFSDUX c10, AO3, INC + LFSDUX c14, AO4, INC + + LFDUX c03, AO1, INC + LFDUX c07, AO2, INC + LFDUX c11, AO3, INC + LFDUX c15, AO4, INC + + LFSDUX c03, AO1, INC + LFSDUX c07, AO2, INC + LFSDUX c11, AO3, INC + LFSDUX c15, AO4, INC + + LFDUX c04, AO1, INC + LFDUX c08, AO2, INC + LFDUX c12, AO3, INC + LFDUX c16, AO4, INC + + LFSDUX c04, AO1, INC + LFSDUX c08, AO2, INC + LFSDUX c12, AO3, INC + LFSDUX c16, AO4, INC + + + LFDUX c17, AO5, INC + LFDUX c21, AO6, INC + LFDUX c25, AO7, INC + LFDUX c29, AO8, INC + + LFSDUX c17, AO5, INC + LFSDUX c21, AO6, INC + LFSDUX c25, AO7, INC + LFSDUX c29, AO8, INC + + LFDUX c18, AO5, INC + LFDUX c22, AO6, INC + LFDUX c26, AO7, INC + LFDUX c30, AO8, INC + + LFSDUX c18, AO5, INC + LFSDUX c22, AO6, INC + LFSDUX c26, AO7, INC + LFSDUX c30, AO8, INC + + LFDUX c19, AO5, INC + LFDUX c23, AO6, INC + LFDUX c27, AO7, INC + LFDUX c31, AO8, INC + + LFSDUX c19, AO5, INC + LFSDUX c23, AO6, INC + LFSDUX c27, AO7, INC + LFSDUX c31, AO8, INC + + LFDUX c20, AO5, INC + LFDUX c24, AO6, INC + LFDUX c28, AO7, INC + LFDUX c32, AO8, INC + + LFSDUX c20, AO5, INC + LFSDUX c24, AO6, INC + LFSDUX c28, AO7, INC + LFSDUX c32, AO8, INC + + STFPDUX c01, B1, M8 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + STFPDUX c05, B1, INC2 + STFPDUX c06, B1, INC2 + STFPDUX c07, B1, INC2 + STFPDUX c08, B1, INC2 + + STFPDUX c09, B1, INC2 + STFPDUX c10, B1, INC2 + STFPDUX c11, B1, INC2 + STFPDUX c12, B1, INC2 + STFPDUX c13, B1, INC2 + STFPDUX c14, B1, INC2 + STFPDUX c15, B1, INC2 + STFPDUX c16, B1, INC2 + + STFPDUX c17, B1, INC2 + STFPDUX c18, B1, INC2 + STFPDUX c19, B1, INC2 + STFPDUX c20, B1, INC2 + STFPDUX c21, B1, INC2 + STFPDUX c22, B1, INC2 + STFPDUX c23, B1, INC2 + STFPDUX c24, B1, INC2 + + STFPDUX c25, B1, INC2 + STFPDUX c26, B1, INC2 + STFPDUX c27, B1, INC2 + STFPDUX c28, B1, INC2 + STFPDUX c29, B1, INC2 + STFPDUX c30, B1, INC2 + STFPDUX c31, B1, INC2 + STFPDUX c32, B1, INC2 + bdnz .L112 + .align 4 + +.L115: + andi. r0, N, 7 + ble .L119 + + andi. r0, N, 4 + ble .L116 + + LFDUX c01, AO1, INC + LFDUX c03, AO2, INC + LFDUX c05, AO3, INC + LFDUX c07, AO4, INC + + LFSDUX c01, AO1, INC + LFSDUX c03, AO2, INC + LFSDUX c05, AO3, INC + LFSDUX c07, AO4, INC + + LFDUX c02, AO1, INC + LFDUX c04, AO2, INC + LFDUX c06, AO3, INC + LFDUX c08, AO4, INC + + LFSDUX c02, AO1, INC + LFSDUX c04, AO2, INC + LFSDUX c06, AO3, INC + LFSDUX c08, AO4, INC + + LFDUX c09, AO5, INC + LFDUX c11, AO6, INC + LFDUX c13, AO7, INC + LFDUX c15, AO8, INC + + LFSDUX c09, AO5, INC + LFSDUX c11, AO6, INC + LFSDUX c13, AO7, INC + LFSDUX c15, AO8, INC + + LFDUX c10, AO5, INC + LFDUX c12, AO6, INC + LFDUX c14, AO7, INC + LFDUX c16, AO8, INC + + LFSDUX c10, AO5, INC + LFSDUX c12, AO6, INC + LFSDUX c14, AO7, INC + LFSDUX c16, AO8, INC + + STFPDUX c01, B2, INC2 + STFPDUX c02, B2, INC2 + STFPDUX c03, B2, INC2 + STFPDUX c04, B2, INC2 + STFPDUX c05, B2, INC2 + STFPDUX c06, B2, INC2 + STFPDUX c07, B2, INC2 + STFPDUX c08, B2, INC2 + STFPDUX c09, B2, INC2 + STFPDUX c10, B2, INC2 + STFPDUX c11, B2, INC2 + STFPDUX c12, B2, INC2 + STFPDUX c13, B2, INC2 + STFPDUX c14, B2, INC2 + STFPDUX c15, B2, INC2 + STFPDUX c16, B2, INC2 + .align 4 + +.L116: + andi. r0, N, 2 + ble .L117 + + LFDUX c01, AO1, INC + LFDUX c03, AO2, INC + LFDUX c05, AO3, INC + LFDUX c07, AO4, INC + + LFSDUX c01, AO1, INC + LFSDUX c03, AO2, INC + LFSDUX c05, AO3, INC + LFSDUX c07, AO4, INC + + LFDUX c09, AO5, INC + LFDUX c11, AO6, INC + LFDUX c13, AO7, INC + LFDUX c15, AO8, INC + + LFSDUX c09, AO5, INC + LFSDUX c11, AO6, INC + LFSDUX c13, AO7, INC + LFSDUX c15, AO8, INC + + STFPDUX c01, B3, INC2 + STFPDUX c03, B3, INC2 + STFPDUX c05, B3, INC2 + STFPDUX c07, B3, INC2 + STFPDUX c09, B3, INC2 + STFPDUX c11, B3, INC2 + STFPDUX c13, B3, INC2 + STFPDUX c15, B3, INC2 + .align 4 + +.L117: + andi. r0, N, 1 + ble .L119 + + LFDUX c01, AO1, INC + LFDUX c02, AO3, INC + LFDUX c03, AO5, INC + LFDUX c04, AO7, INC + + LFSDUX c01, AO2, INC + LFSDUX c02, AO4, INC + LFSDUX c03, AO6, INC + LFSDUX c04, AO8, INC + + STFPDUX c01, B4, INC2 + STFPDUX c02, B4, INC2 + STFPDUX c03, B4, INC2 + STFPDUX c04, B4, INC2 + .align 4 + +.L119: + addic. J, J, -1 + bgt .L110 + .align 4 + +.L120: + andi. J, M, 4 + addi M8, M8, 32 * SIZE + ble .L130 + + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + sub B1, B, M8 + addi B, B, 32 * SIZE + + srawi. r0, N, 3 + mtspr CTR, r0 + ble .L125 + .align 4 + +.L122: + LFDUX c01, AO1, INC + LFDUX c05, AO2, INC + LFDUX c09, AO3, INC + LFDUX c13, AO4, INC + + LFSDUX c01, AO1, INC + LFSDUX c05, AO2, INC + LFSDUX c09, AO3, INC + LFSDUX c13, AO4, INC + + LFDUX c02, AO1, INC + LFDUX c06, AO2, INC + LFDUX c10, AO3, INC + LFDUX c14, AO4, INC + + LFSDUX c02, AO1, INC + LFSDUX c06, AO2, INC + LFSDUX c10, AO3, INC + LFSDUX c14, AO4, INC + + LFDUX c03, AO1, INC + LFDUX c07, AO2, INC + LFDUX c11, AO3, INC + LFDUX c15, AO4, INC + + LFSDUX c03, AO1, INC + LFSDUX c07, AO2, INC + LFSDUX c11, AO3, INC + LFSDUX c15, AO4, INC + + LFDUX c04, AO1, INC + LFDUX c08, AO2, INC + LFDUX c12, AO3, INC + LFDUX c16, AO4, INC + + LFSDUX c04, AO1, INC + LFSDUX c08, AO2, INC + LFSDUX c12, AO3, INC + LFSDUX c16, AO4, INC + + STFPDUX c01, B1, M8 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + STFPDUX c05, B1, INC2 + STFPDUX c06, B1, INC2 + STFPDUX c07, B1, INC2 + STFPDUX c08, B1, INC2 + + STFPDUX c09, B1, INC2 + STFPDUX c10, B1, INC2 + STFPDUX c11, B1, INC2 + STFPDUX c12, B1, INC2 + STFPDUX c13, B1, INC2 + STFPDUX c14, B1, INC2 + STFPDUX c15, B1, INC2 + STFPDUX c16, B1, INC2 + bdnz .L122 + .align 4 + +.L125: + andi. r0, N, 7 + ble .L130 + + andi. r0, N, 4 + ble .L126 + + LFDUX c01, AO1, INC + LFDUX c03, AO2, INC + LFDUX c05, AO3, INC + LFDUX c07, AO4, INC + + LFSDUX c01, AO1, INC + LFSDUX c03, AO2, INC + LFSDUX c05, AO3, INC + LFSDUX c07, AO4, INC + + LFDUX c02, AO1, INC + LFDUX c04, AO2, INC + LFDUX c06, AO3, INC + LFDUX c08, AO4, INC + + LFSDUX c02, AO1, INC + LFSDUX c04, AO2, INC + LFSDUX c06, AO3, INC + LFSDUX c08, AO4, INC + + STFPDUX c01, B2, INC2 + STFPDUX c02, B2, INC2 + STFPDUX c03, B2, INC2 + STFPDUX c04, B2, INC2 + STFPDUX c05, B2, INC2 + STFPDUX c06, B2, INC2 + STFPDUX c07, B2, INC2 + STFPDUX c08, B2, INC2 + .align 4 + +.L126: + andi. r0, N, 2 + ble .L127 + + LFDUX c01, AO1, INC + LFDUX c03, AO2, INC + LFDUX c05, AO3, INC + LFDUX c07, AO4, INC + + LFSDUX c01, AO1, INC + LFSDUX c03, AO2, INC + LFSDUX c05, AO3, INC + LFSDUX c07, AO4, INC + + STFPDUX c01, B3, INC2 + STFPDUX c03, B3, INC2 + STFPDUX c05, B3, INC2 + STFPDUX c07, B3, INC2 + .align 4 + +.L127: + andi. r0, N, 1 + ble .L130 + + LFDUX c01, AO1, INC + LFDUX c02, AO2, INC + LFDUX c03, AO3, INC + LFDUX c04, AO4, INC + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B4, INC2 + STFPDUX c03, B4, INC2 + .align 4 + +.L130: + andi. J, M, 2 + addi M8, M8, 16 * SIZE + ble .L140 + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + sub B1, B, M8 + addi B, B, 16 * SIZE + + srawi. r0, N, 3 + mtspr CTR, r0 + ble .L135 + .align 4 + +.L132: + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + LFDUX c05, AO1, INC + LFDUX c06, AO1, INC + LFDUX c07, AO1, INC + LFDUX c08, AO1, INC + + LFDUX c09, AO2, INC + LFDUX c10, AO2, INC + LFDUX c11, AO2, INC + LFDUX c12, AO2, INC + LFDUX c13, AO2, INC + LFDUX c14, AO2, INC + LFDUX c15, AO2, INC + LFDUX c16, AO2, INC + + fsmfp c01, c02 + fsmfp c03, c04 + fsmfp c05, c06 + fsmfp c07, c08 + fsmfp c09, c10 + fsmfp c11, c12 + fsmfp c13, c14 + fsmfp c15, c16 + + STFPDUX c01, B1, M8 + STFPDUX c03, B1, INC2 + STFPDUX c05, B1, INC2 + STFPDUX c07, B1, INC2 + STFPDUX c09, B1, INC2 + STFPDUX c11, B1, INC2 + STFPDUX c13, B1, INC2 + STFPDUX c15, B1, INC2 + bdnz .L132 + .align 4 + +.L135: + andi. r0, N, 7 + ble .L140 + + andi. r0, N, 4 + ble .L136 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFDUX c09, AO2, INC + LFDUX c10, AO2, INC + LFDUX c11, AO2, INC + LFDUX c12, AO2, INC + + fsmfp c01, c02 + fsmfp c03, c04 + fsmfp c09, c10 + fsmfp c11, c12 + + STFPDUX c01, B2, INC2 + STFPDUX c03, B2, INC2 + STFPDUX c09, B2, INC2 + STFPDUX c11, B2, INC2 + .align 4 + +.L136: + andi. r0, N, 2 + ble .L137 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c09, AO2, INC + LFDUX c10, AO2, INC + + fsmfp c01, c02 + fsmfp c09, c10 + + STFPDUX c01, B3, INC2 + STFPDUX c09, B3, INC2 + .align 4 + +.L137: + andi. r0, N, 1 + ble .L140 + + LFDUX c01, AO1, INC + LFDUX c02, AO2, INC + + fsmfp c01, c02 + STFPDUX c01, B4, INC2 + .align 4 + +.L140: + andi. J, M, 1 + addi M8, M8, 8 * SIZE + ble .L999 + + mr AO1, A + + sub B1, B, M8 + + srawi. r0, N, 3 + mtspr CTR, r0 + ble .L145 + .align 4 + +.L142: + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + LFDUX c05, AO1, INC + LFDUX c06, AO1, INC + LFDUX c07, AO1, INC + LFDUX c08, AO1, INC + + fsmfp c01, c02 + fsmfp c03, c04 + fsmfp c05, c06 + fsmfp c07, c08 + + STFPDUX c01, B1, M8 + STFPDUX c03, B1, INC2 + STFPDUX c05, B1, INC2 + STFPDUX c07, B1, INC2 + bdnz .L142 + .align 4 + +.L145: + andi. r0, N, 7 + ble .L999 + + andi. r0, N, 4 + ble .L146 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B2, INC2 + STFPDUX c03, B2, INC2 + .align 4 + +.L146: + andi. r0, N, 2 + ble .L147 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + + fsmfp c01, c02 + + STFPDUX c01, B3, INC2 + .align 4 + +.L147: + andi. r0, N, 1 + ble .L999 + + LFDX c01, AO1, INC + STFDX c01, B4, INC2 + .align 4 + +.L999: + addi SP, SP, -4 + + lwzu r21, 4(SP) + lwzu r22, 4(SP) + lwzu r23, 4(SP) + + lwzu r24, 4(SP) + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f31, SP, r0 + lfpdux f30, SP, r0 + lfpdux f29, SP, r0 + lfpdux f28, SP, r0 + lfpdux f27, SP, r0 + lfpdux f26, SP, r0 + lfpdux f25, SP, r0 + lfpdux f24, SP, r0 + lfpdux f23, SP, r0 + lfpdux f22, SP, r0 + lfpdux f21, SP, r0 + lfpdux f20, SP, r0 + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + EPILOGUE diff --git a/kernel/power/gemv_hummer_n.S b/kernel/power/gemv_hummer_n.S new file mode 100644 index 0000000..a9340be --- /dev/null +++ b/kernel/power/gemv_hummer_n.S @@ -0,0 +1,1780 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M r3 +#define N r4 +#define A r6 +#define LDA r7 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 + +#define I r11 +#define J r12 + +#define INCY2 r24 +#define A1 r25 +#define A2 r26 +#define A3 r27 +#define A4 r28 + +#define YL r29 +#define YS r30 +#define INC2 r31 + +#define yl1 f0 +#define yl2 f2 +#define yl3 f3 +#define yl4 f4 +#define ys1 f5 +#define ys2 f6 +#define ys3 f7 +#define ys4 f8 +#define yl5 f27 +#define ys5 f28 + +#define alpha1 f9 +#define alpha2 f10 + +#define a1 f11 +#define a2 f12 +#define a3 f13 +#define a4 f14 +#define a5 f15 +#define a6 f16 +#define a7 f17 +#define a8 f18 + +#define a9 f19 +#define a10 f20 +#define a11 f21 +#define a12 f22 +#define a13 f23 +#define a14 f24 +#define a15 f25 +#define a16 f26 + +#define alpha f1 + + PROLOGUE + PROFCODE + + li r0, -16 + lwz INCY, 8(SP) + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + stfpdux f16, SP, r0 + stfpdux f17, SP, r0 + stfpdux f18, SP, r0 + stfpdux f19, SP, r0 + stfpdux f20, SP, r0 + stfpdux f21, SP, r0 + stfpdux f22, SP, r0 + stfpdux f23, SP, r0 + stfpdux f24, SP, r0 + stfpdux f25, SP, r0 + stfpdux f26, SP, r0 + stfpdux f27, SP, r0 + stfpdux f28, SP, r0 + stfpdux f29, SP, r0 + stfpdux f30, SP, r0 + stfpdux f31, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + stwu r29, -4(SP) + stwu r28, -4(SP) + + stwu r27, -4(SP) + stwu r26, -4(SP) + stwu r25, -4(SP) + stwu r24, -4(SP) + + stwu r23, -4(SP) + stwu r22, -4(SP) + stwu r21, -4(SP) + stwu r20, -4(SP) + + stwu r19, -4(SP) + stwu r18, -4(SP) + stwu r17, -4(SP) + stwu r16, -4(SP) + + slwi LDA, LDA, BASE_SHIFT + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + fsmfp alpha, alpha + + cmpwi cr0, M, 0 + ble- .L999 + cmpwi cr0, N, 0 + ble- .L999 + + add INCY2, INCY, INCY + li INC2, 2 * SIZE + sub X, X, INCX + + andi. r0, A, 2 * SIZE - 1 +# bne .L100 + +# All cases for aligned A, even LDA + + cmpwi cr0, INCY, SIZE + bne .L70 + + andi. r0, Y, 2 * SIZE - 1 + bne .L40 + +# A : aligned LDA : even Y : Unit Aligned + + sub A, A, INC2 + sub Y, Y, INCY2 + + srawi. J, N, 2 + ble .L20 + .align 4 + +.L11: + LFDUX alpha1, X, INCX + mr A1, A + add A2, A, LDA + add A3, A2, LDA + LFSDUX alpha1, X, INCX + LFDUX alpha2, X, INCX + add A4, A3, LDA + add A, A4, LDA + mr YL, Y + LFSDUX alpha2, X, INCX + fpmul alpha1, alpha, alpha1 + mr YS, Y + srawi. r0, M, 3 + mtspr CTR, r0 + fpmul alpha2, alpha, alpha2 + ble .L15 + + LFPDUX yl1, YL, INCY2 + LFPDUX yl2, YL, INCY2 + LFPDUX yl3, YL, INCY2 + LFPDUX yl4, YL, INCY2 + + LFPDUX a1, A1, INC2 + LFPDUX a5, A1, INC2 + LFPDUX a9, A1, INC2 + LFPDUX a13, A1, INC2 + + LFPDUX a2, A2, INC2 + LFPDUX a6, A2, INC2 + LFPDUX a10, A2, INC2 + LFPDUX a14, A2, INC2 + + LFPDUX a3, A3, INC2 + LFPDUX a7, A3, INC2 + LFPDUX a11, A3, INC2 + LFPDUX a15, A3, INC2 + + LFPDUX a4, A4, INC2 + fxcpmadd ys1, alpha1, a1, yl1 + LFPDUX a8, A4, INC2 + fxcpmadd ys2, alpha1, a5, yl2 + LFPDUX a12, A4, INC2 + fxcpmadd ys3, alpha1, a9, yl3 + LFPDUX a16, A4, INC2 + fxcpmadd ys4, alpha1, a13, yl4 + bdz .L13 + .align 4 + +.L12: + LFPDUX yl1, YL, INCY2 + + fxcsmadd ys1, alpha1, a2, ys1 + LFPDUX a1, A1, INC2 + fxcsmadd ys2, alpha1, a6, ys2 + LFPDUX a5, A1, INC2 + fxcsmadd ys3, alpha1, a10, ys3 + LFPDUX a9, A1, INC2 + fxcsmadd ys4, alpha1, a14, ys4 + LFPDUX a13, A1, INC2 + + LFPDUX yl2, YL, INCY2 + + fxcpmadd ys1, alpha2, a3, ys1 + LFPDUX a2, A2, INC2 + fxcpmadd ys2, alpha2, a7, ys2 + LFPDUX a6, A2, INC2 + fxcpmadd ys3, alpha2, a11, ys3 + LFPDUX a10, A2, INC2 + fxcpmadd ys4, alpha2, a15, ys4 + LFPDUX a14, A2, INC2 + + LFPDUX yl3, YL, INCY2 + + fxcsmadd ys1, alpha2, a4, ys1 + LFPDUX a3, A3, INC2 + fxcsmadd ys2, alpha2, a8, ys2 + LFPDUX a7, A3, INC2 + fxcsmadd ys3, alpha2, a12, ys3 + LFPDUX a11, A3, INC2 + fxcsmadd ys4, alpha2, a16, ys4 + LFPDUX a15, A3, INC2 + + LFPDUX yl4, YL, INCY2 + + STFPDUX ys1, YS, INCY2 + STFPDUX ys2, YS, INCY2 + STFPDUX ys3, YS, INCY2 + STFPDUX ys4, YS, INCY2 + + LFPDUX a4, A4, INC2 + fxcpmadd ys1, alpha1, a1, yl1 + LFPDUX a8, A4, INC2 + fxcpmadd ys2, alpha1, a5, yl2 + LFPDUX a12, A4, INC2 + fxcpmadd ys3, alpha1, a9, yl3 + LFPDUX a16, A4, INC2 + fxcpmadd ys4, alpha1, a13, yl4 + bdnz .L12 + .align 4 + +.L13: + fxcsmadd ys1, alpha1, a2, ys1 + fxcsmadd ys2, alpha1, a6, ys2 + fxcsmadd ys3, alpha1, a10, ys3 + fxcsmadd ys4, alpha1, a14, ys4 + + fxcpmadd ys1, alpha2, a3, ys1 + fxcpmadd ys2, alpha2, a7, ys2 + fxcpmadd ys3, alpha2, a11, ys3 + fxcpmadd ys4, alpha2, a15, ys4 + + fxcsmadd ys1, alpha2, a4, ys1 + fxcsmadd ys2, alpha2, a8, ys2 + fxcsmadd ys3, alpha2, a12, ys3 + fxcsmadd ys4, alpha2, a16, ys4 + + STFPDUX ys1, YS, INCY2 + STFPDUX ys2, YS, INCY2 + STFPDUX ys3, YS, INCY2 + STFPDUX ys4, YS, INCY2 + .align 4 + +.L15: + andi. r0, M, 7 + ble .L19 + + andi. r0, M, 4 + ble .L17 + + LFPDUX yl1, YL, INCY2 + LFPDUX a1, A1, INC2 + LFPDUX yl2, YL, INCY2 + LFPDUX a5, A1, INC2 + + LFPDUX a2, A2, INC2 + LFPDUX a6, A2, INC2 + LFPDUX a3, A3, INC2 + LFPDUX a7, A3, INC2 + + LFPDUX a4, A4, INC2 + LFPDUX a8, A4, INC2 + + fxcpmadd ys1, alpha1, a1, yl1 + fxcpmadd ys2, alpha1, a5, yl2 + fxcsmadd ys1, alpha1, a2, ys1 + fxcsmadd ys2, alpha1, a6, ys2 + + fxcpmadd ys1, alpha2, a3, ys1 + fxcpmadd ys2, alpha2, a7, ys2 + fxcsmadd ys1, alpha2, a4, ys1 + fxcsmadd ys2, alpha2, a8, ys2 + + STFPDUX ys1, YS, INCY2 + STFPDUX ys2, YS, INCY2 + .align 4 + +.L17: + andi. r0, M, 2 + ble .L18 + + LFPDUX yl1, YL, INCY2 + + LFPDUX a1, A1, INC2 + LFPDUX a2, A2, INC2 + LFPDUX a3, A3, INC2 + LFPDUX a4, A4, INC2 + + fxcpmadd ys1, alpha1, a1, yl1 + fxcsmadd ys1, alpha1, a2, ys1 + fxcpmadd ys1, alpha2, a3, ys1 + fxcsmadd ys1, alpha2, a4, ys1 + + STFPDUX ys1, YS, INCY2 + .align 4 + +.L18: + andi. r0, M, 1 + ble .L19 + + LFDUX yl1, YL, INCY2 + + LFDUX a1, A1, INC2 + LFDUX a2, A2, INC2 + LFDUX a3, A3, INC2 + LFDUX a4, A4, INC2 + + fxcpmadd ys1, alpha1, a1, yl1 + fxcsmadd ys1, alpha1, a2, ys1 + fxcpmadd ys1, alpha2, a3, ys1 + fxcsmadd ys1, alpha2, a4, ys1 + + STFDUX ys1, YS, INCY2 + .align 4 + +.L19: + addi J, J, -1 + cmpi cr0, 0, J, 0 + bgt .L11 + .align 4 + +.L20: + andi. J, N, 2 + ble .L30 + + LFDUX alpha1, X, INCX + + mr A1, A + add A2, A, LDA + add A, A2, LDA + LFSDUX alpha1, X, INCX + + mr YL, Y + mr YS, Y + fpmul alpha1, alpha, alpha1 + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L25 + + LFPDUX yl1, YL, INCY2 + LFPDUX a1, A1, INC2 + LFPDUX yl2, YL, INCY2 + LFPDUX a5, A1, INC2 + + LFPDUX yl3, YL, INCY2 + LFPDUX a9, A1, INC2 + LFPDUX yl4, YL, INCY2 + LFPDUX a13, A1, INC2 + + LFPDUX a2, A2, INC2 + LFPDUX a6, A2, INC2 + LFPDUX a10, A2, INC2 + LFPDUX a14, A2, INC2 + bdz .L23 + .align 4 + +.L22: + fxcpmadd ys1, alpha1, a1, yl1 + LFPDUX a1, A1, INC2 + LFPDUX yl1, YL, INCY2 + fxcpmadd ys2, alpha1, a5, yl2 + LFPDUX a5, A1, INC2 + LFPDUX yl2, YL, INCY2 + fxcpmadd ys3, alpha1, a9, yl3 + LFPDUX a9, A1, INC2 + LFPDUX yl3, YL, INCY2 + fxcpmadd ys4, alpha1, a13, yl4 + LFPDUX a13, A1, INC2 + LFPDUX yl4, YL, INCY2 + + fxcsmadd ys1, alpha1, a2, ys1 + LFPDUX a2, A2, INC2 + fxcsmadd ys2, alpha1, a6, ys2 + LFPDUX a6, A2, INC2 + fxcsmadd ys3, alpha1, a10, ys3 + LFPDUX a10, A2, INC2 + fxcsmadd ys4, alpha1, a14, ys4 + LFPDUX a14, A2, INC2 + + STFPDUX ys1, YS, INCY2 + STFPDUX ys2, YS, INCY2 + STFPDUX ys3, YS, INCY2 + STFPDUX ys4, YS, INCY2 + bdnz .L22 + .align 4 + +.L23: + fxcpmadd ys1, alpha1, a1, yl1 + fxcpmadd ys2, alpha1, a5, yl2 + fxcpmadd ys3, alpha1, a9, yl3 + fxcpmadd ys4, alpha1, a13, yl4 + + fxcsmadd ys1, alpha1, a2, ys1 + fxcsmadd ys2, alpha1, a6, ys2 + fxcsmadd ys3, alpha1, a10, ys3 + fxcsmadd ys4, alpha1, a14, ys4 + + STFPDUX ys1, YS, INCY2 + STFPDUX ys2, YS, INCY2 + STFPDUX ys3, YS, INCY2 + STFPDUX ys4, YS, INCY2 + .align 4 + +.L25: + andi. r0, M, 7 + ble .L30 + + andi. r0, M, 4 + ble .L27 + + LFPDUX yl1, YL, INCY2 + LFPDUX a1, A1, INC2 + LFPDUX a2, A2, INC2 + + LFPDUX yl2, YL, INCY2 + LFPDUX a5, A1, INC2 + LFPDUX a6, A2, INC2 + + fxcpmadd ys1, alpha1, a1, yl1 + fxcsmadd ys1, alpha1, a2, ys1 + fxcpmadd ys2, alpha1, a5, yl2 + fxcsmadd ys2, alpha1, a6, ys2 + + STFPDUX ys1, YS, INCY2 + STFPDUX ys2, YS, INCY2 + .align 4 + +.L27: + andi. r0, M, 2 + ble .L28 + + LFPDUX yl1, YL, INCY2 + LFPDUX a1, A1, INC2 + LFPDUX a2, A2, INC2 + + fxcpmadd ys1, alpha1, a1, yl1 + fxcsmadd ys1, alpha1, a2, ys1 + + STFPDUX ys1, YS, INCY2 + .align 4 + +.L28: + andi. r0, M, 1 + ble .L30 + + LFDUX yl1, YL, INCY2 + LFDUX a1, A1, INC2 + LFDUX a2, A2, INC2 + + fxcpmadd ys1, alpha1, a1, yl1 + fxcsmadd ys1, alpha1, a2, ys1 + + STFDUX ys1, YS, INCY2 + .align 4 + +.L30: + andi. J, N, 1 + ble .L999 + + LFDUX alpha1, X, INCX + + mr A1, A + mr YL, Y + mr YS, Y + fmul alpha1, alpha, alpha1 + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L35 + + LFPDUX yl1, YL, INCY2 + LFPDUX a1, A1, INC2 + LFPDUX yl2, YL, INCY2 + LFPDUX a5, A1, INC2 + + LFPDUX yl3, YL, INCY2 + LFPDUX a9, A1, INC2 + LFPDUX yl4, YL, INCY2 + LFPDUX a13, A1, INC2 + bdz .L33 + .align 4 + +.L32: + fxcpmadd ys1, alpha1, a1, yl1 + LFPDUX yl1, YL, INCY2 + LFPDUX a1, A1, INC2 + fxcpmadd ys2, alpha1, a5, yl2 + LFPDUX yl2, YL, INCY2 + LFPDUX a5, A1, INC2 + fxcpmadd ys3, alpha1, a9, yl3 + LFPDUX yl3, YL, INCY2 + LFPDUX a9, A1, INC2 + fxcpmadd ys4, alpha1, a13, yl4 + LFPDUX yl4, YL, INCY2 + LFPDUX a13, A1, INC2 + + STFPDUX ys1, YS, INCY2 + STFPDUX ys2, YS, INCY2 + STFPDUX ys3, YS, INCY2 + STFPDUX ys4, YS, INCY2 + bdnz .L32 + .align 4 + +.L33: + fxcpmadd ys1, alpha1, a1, yl1 + fxcpmadd ys2, alpha1, a5, yl2 + fxcpmadd ys3, alpha1, a9, yl3 + fxcpmadd ys4, alpha1, a13, yl4 + + STFPDUX ys1, YS, INCY2 + STFPDUX ys2, YS, INCY2 + STFPDUX ys3, YS, INCY2 + STFPDUX ys4, YS, INCY2 + .align 4 + +.L35: + andi. r0, M, 7 + ble .L999 + + andi. r0, M, 4 + ble .L37 + + LFPDUX yl1, YL, INCY2 + LFPDUX a1, A1, INC2 + + LFPDUX yl2, YL, INCY2 + LFPDUX a5, A1, INC2 + + fxcpmadd ys1, alpha1, a1, yl1 + fxcpmadd ys2, alpha1, a5, yl2 + + STFPDUX ys1, YS, INCY2 + STFPDUX ys2, YS, INCY2 + .align 4 + +.L37: + andi. r0, M, 2 + ble .L38 + + LFPDUX yl1, YL, INCY2 + LFPDUX a1, A1, INC2 + + fxcpmadd ys1, alpha1, a1, yl1 + + STFPDUX ys1, YS, INCY2 + .align 4 + +.L38: + andi. r0, M, 1 + ble .L999 + + LFDUX yl1, YL, INCY2 + LFDUX a1, A1, INC2 + + fxcpmadd ys1, alpha1, a1, yl1 + + STFDUX ys1, YS, INCY2 + b .L999 + .align 4 + +.L40: +# A : aligned LDA : even Y : Unaligned + + sub A, A, INC2 + sub Y, Y, INCY + + srawi. J, N, 2 + ble .L50 + .align 4 + +.L41: + LFDUX alpha1, X, INCX + LFSDUX alpha1, X, INCX + LFDUX alpha2, X, INCX + LFSDUX alpha2, X, INCX + + fpmul alpha1, alpha, alpha1 + fpmul alpha2, alpha, alpha2 + + mr A1, A + add A2, A, LDA + add A3, A2, LDA + add A4, A3, LDA + add A, A4, LDA + + mr YL, Y + sub YS, Y, INCY2 + + LFSDX ys1, YS, INCY2 + LFDX yl1, YL, INCY + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L45 + + LFPDUX a1, A1, INC2 + LFPDUX a5, A1, INC2 + LFPDUX a9, A1, INC2 + LFPDUX a13, A1, INC2 + + LFXDUX yl2, YL, INCY2 + LFXDUX yl3, YL, INCY2 + LFXDUX yl4, YL, INCY2 + LFXDUX yl5, YL, INCY2 + + LFPDUX a2, A2, INC2 + LFPDUX a6, A2, INC2 + LFPDUX a10, A2, INC2 + LFPDUX a14, A2, INC2 + + LFPDUX a3, A3, INC2 + LFPDUX a7, A3, INC2 + LFPDUX a11, A3, INC2 + LFPDUX a15, A3, INC2 + + LFPDUX a4, A4, INC2 + fsmr yl1, yl2 + LFPDUX a8, A4, INC2 + fsmr yl2, yl3 + LFPDUX a12, A4, INC2 + fsmr yl3, yl4 + LFPDUX a16, A4, INC2 + fsmr yl4, yl5 + bdz .L43 + .align 4 + +.L42: + fxcpmadd ys2, alpha1, a1, yl1 + LFPDUX a1, A1, INC2 + fxcpmadd ys3, alpha1, a5, yl2 + LFPDUX a5, A1, INC2 + fxcpmadd ys4, alpha1, a9, yl3 + LFPDUX a9, A1, INC2 + fxcpmadd ys5, alpha1, a13, yl4 + LFPDUX a13, A1, INC2 + + fxcsmadd ys2, alpha1, a2, ys2 + LFPDUX a2, A2, INC2 + fxcsmadd ys3, alpha1, a6, ys3 + LFPDUX a6, A2, INC2 + fxcsmadd ys4, alpha1, a10, ys4 + LFPDUX a10, A2, INC2 + fxcsmadd ys5, alpha1, a14, ys5 + LFPDUX a14, A2, INC2 + + fxcpmadd ys2, alpha2, a3, ys2 + LFPDUX a3, A3, INC2 + fxcpmadd ys3, alpha2, a7, ys3 + LFPDUX a7, A3, INC2 + fxcpmadd ys4, alpha2, a11, ys4 + LFPDUX a11, A3, INC2 + fxcpmadd ys5, alpha2, a15, ys5 + LFPDUX a15, A3, INC2 + + fxcsmadd ys2, alpha2, a4, ys2 + LFPDUX a4, A4, INC2 + fxcsmadd ys3, alpha2, a8, ys3 + LFPDUX a8, A4, INC2 + fxcsmadd ys4, alpha2, a12, ys4 + LFPDUX a12, A4, INC2 + fxcsmadd ys5, alpha2, a16, ys5 + LFPDUX a16, A4, INC2 + + fmr yl1, yl5 + LFXDUX yl2, YL, INCY2 + fmr ys1, ys2 + LFXDUX yl3, YL, INCY2 + fmr ys2, ys3 + LFXDUX yl4, YL, INCY2 + fmr ys3, ys4 + LFXDUX yl5, YL, INCY2 + fmr ys4, ys5 + + STFXDUX ys1, YS, INCY2 + fsmr ys1, ys5 + STFXDUX ys2, YS, INCY2 + fsmr yl1, yl2 + STFXDUX ys3, YS, INCY2 + fsmr yl2, yl3 + STFXDUX ys4, YS, INCY2 + fsmr yl3, yl4 + + fsmr yl4, yl5 + bdnz .L42 + .align 4 + +.L43: + fxcpmadd ys2, alpha1, a1, yl1 + fxcpmadd ys3, alpha1, a5, yl2 + fxcpmadd ys4, alpha1, a9, yl3 + fxcpmadd ys5, alpha1, a13, yl4 + + fxcsmadd ys2, alpha1, a2, ys2 + fxcsmadd ys3, alpha1, a6, ys3 + fxcsmadd ys4, alpha1, a10, ys4 + fxcsmadd ys5, alpha1, a14, ys5 + + fxcpmadd ys2, alpha2, a3, ys2 + fxcpmadd ys3, alpha2, a7, ys3 + fxcpmadd ys4, alpha2, a11, ys4 + fxcpmadd ys5, alpha2, a15, ys5 + + fxcsmadd ys2, alpha2, a4, ys2 + fxcsmadd ys3, alpha2, a8, ys3 + fxcsmadd ys4, alpha2, a12, ys4 + fxcsmadd ys5, alpha2, a16, ys5 + + fmr ys1, ys2 + fmr ys2, ys3 + fmr ys3, ys4 + fmr ys4, ys5 + fmr yl1, yl5 + + STFXDUX ys1, YS, INCY2 + fsmr ys1, ys5 + STFXDUX ys2, YS, INCY2 + STFXDUX ys3, YS, INCY2 + STFXDUX ys4, YS, INCY2 + .align 4 + +.L45: + andi. r0, M, 7 + ble .L48 + + andi. r0, M, 4 + ble .L46 + + LFXDUX yl2, YL, INCY2 + LFXDUX yl3, YL, INCY2 + + LFPDUX a1, A1, INC2 + LFPDUX a5, A1, INC2 + + LFPDUX a2, A2, INC2 + LFPDUX a6, A2, INC2 + LFPDUX a3, A3, INC2 + LFPDUX a7, A3, INC2 + + LFPDUX a4, A4, INC2 + fsmr yl1, yl2 + LFPDUX a8, A4, INC2 + fsmr yl2, yl3 + + fxcpmadd ys2, alpha1, a1, yl1 + fxcpmadd ys3, alpha1, a5, yl2 + fxcsmadd ys2, alpha1, a2, ys2 + fxcsmadd ys3, alpha1, a6, ys3 + + fxcpmadd ys2, alpha2, a3, ys2 + fxcpmadd ys3, alpha2, a7, ys3 + fxcsmadd ys2, alpha2, a4, ys2 + fxcsmadd ys3, alpha2, a8, ys3 + + fmr yl1, yl3 + fmr ys1, ys2 + fmr ys2, ys3 + + STFXDUX ys1, YS, INCY2 + fsmr ys1, ys3 + STFXDUX ys2, YS, INCY2 + .align 4 + +.L46: + andi. r0, M, 2 + ble .L47 + + LFXDUX yl2, YL, INCY2 + + LFPDUX a1, A1, INC2 + LFPDUX a2, A2, INC2 + LFPDUX a3, A3, INC2 + LFPDUX a4, A4, INC2 + + fsmr yl1, yl2 + fxcpmadd ys2, alpha1, a1, yl1 + fxcsmadd ys2, alpha1, a2, ys2 + fxcpmadd ys2, alpha2, a3, ys2 + fxcsmadd ys2, alpha2, a4, ys2 + fmr yl1, yl2 + + fmr ys1, ys2 + STFXDUX ys1, YS, INCY2 + fsmr ys1, ys2 + .align 4 + +.L47: + andi. r0, M, 1 + ble .L48 + + LFDUX a1, A1, INC2 + LFDUX a2, A2, INC2 + LFDUX a3, A3, INC2 + LFDUX a4, A4, INC2 + + fxcpmadd ys2, alpha1, a1, yl1 + fxcsmadd ys2, alpha1, a2, ys2 + fxcpmadd ys2, alpha2, a3, ys2 + fxcsmadd ys2, alpha2, a4, ys2 + + STFSDX ys1, YS, INCY2 + add YS, YS, INCY + STFDX ys2, YS, INCY2 + b .L49 + .align 4 + +.L48: + STFSDUX ys1, YS, INCY2 + .align 4 + +.L49: + addi J, J, -1 + cmpi cr0, 0, J, 0 + bgt .L41 + .align 4 + +.L50: + andi. J, N, 2 + ble .L60 + + LFDUX alpha1, X, INCX + + mr A1, A + add A2, A, LDA + add A, A2, LDA + LFSDUX alpha1, X, INCX + + mr YL, Y + sub YS, Y, INCY2 + fpmul alpha1, alpha, alpha1 + + LFSDX ys1, YS, INCY2 + LFDX yl1, YL, INCY + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L55 + + LFPDUX a1, A1, INC2 + LFPDUX a5, A1, INC2 + LFPDUX a9, A1, INC2 + LFPDUX a13, A1, INC2 + + LFXDUX yl2, YL, INCY2 + LFXDUX yl3, YL, INCY2 + LFXDUX yl4, YL, INCY2 + LFXDUX yl5, YL, INCY2 + + LFPDUX a2, A2, INC2 + fsmr yl1, yl2 + LFPDUX a6, A2, INC2 + fsmr yl2, yl3 + LFPDUX a10, A2, INC2 + fsmr yl3, yl4 + LFPDUX a14, A2, INC2 + fsmr yl4, yl5 + bdz .L53 + .align 4 + +.L52: + fxcpmadd ys2, alpha1, a1, yl1 + LFPDUX a1, A1, INC2 + fxcpmadd ys3, alpha1, a5, yl2 + LFPDUX a5, A1, INC2 + fxcpmadd ys4, alpha1, a9, yl3 + LFPDUX a9, A1, INC2 + fxcpmadd ys5, alpha1, a13, yl4 + LFPDUX a13, A1, INC2 + + fxcsmadd ys2, alpha1, a2, ys2 + LFPDUX a2, A2, INC2 + fxcsmadd ys3, alpha1, a6, ys3 + LFPDUX a6, A2, INC2 + fxcsmadd ys4, alpha1, a10, ys4 + LFPDUX a10, A2, INC2 + fxcsmadd ys5, alpha1, a14, ys5 + LFPDUX a14, A2, INC2 + + fmr yl1, yl5 + LFXDUX yl2, YL, INCY2 + fmr ys1, ys2 + LFXDUX yl3, YL, INCY2 + fmr ys2, ys3 + LFXDUX yl4, YL, INCY2 + fmr ys3, ys4 + LFXDUX yl5, YL, INCY2 + fmr ys4, ys5 + + STFXDUX ys1, YS, INCY2 + fsmr ys1, ys5 + STFXDUX ys2, YS, INCY2 + fsmr yl1, yl2 + STFXDUX ys3, YS, INCY2 + fsmr yl2, yl3 + STFXDUX ys4, YS, INCY2 + fsmr yl3, yl4 + + fsmr yl4, yl5 + bdnz .L52 + .align 4 + +.L53: + fxcpmadd ys2, alpha1, a1, yl1 + fxcpmadd ys3, alpha1, a5, yl2 + fxcpmadd ys4, alpha1, a9, yl3 + fxcpmadd ys5, alpha1, a13, yl4 + + fxcsmadd ys2, alpha1, a2, ys2 + fxcsmadd ys3, alpha1, a6, ys3 + fxcsmadd ys4, alpha1, a10, ys4 + fxcsmadd ys5, alpha1, a14, ys5 + + fmr yl1, yl5 + fmr ys1, ys2 + fmr ys2, ys3 + fmr ys3, ys4 + fmr ys4, ys5 + + STFXDUX ys1, YS, INCY2 + fsmr ys1, ys5 + STFXDUX ys2, YS, INCY2 + STFXDUX ys3, YS, INCY2 + STFXDUX ys4, YS, INCY2 + .align 4 + +.L55: + andi. r0, M, 7 + ble .L59 + + andi. r0, M, 4 + ble .L57 + + LFXDUX yl2, YL, INCY2 + LFXDUX yl3, YL, INCY2 + + LFPDUX a1, A1, INC2 + LFPDUX a2, A2, INC2 + + LFPDUX a5, A1, INC2 + LFPDUX a6, A2, INC2 + + fsmr yl1, yl2 + fsmr yl2, yl3 + + fxcpmadd ys2, alpha1, a1, yl1 + fxcsmadd ys2, alpha1, a2, ys2 + fxcpmadd ys3, alpha1, a5, yl2 + fxcsmadd ys3, alpha1, a6, ys3 + + fmr yl1, yl3 + fmr ys1, ys2 + fmr ys2, ys3 + + STFXDUX ys1, YS, INCY2 + STFXDUX ys2, YS, INCY2 + fsmr ys1, ys3 + .align 4 + +.L57: + andi. r0, M, 2 + ble .L58 + + LFXDUX yl2, YL, INCY2 + LFPDUX a1, A1, INC2 + LFPDUX a2, A2, INC2 + + fsmr yl1, yl2 + fxcpmadd ys2, alpha1, a1, yl1 + fxcsmadd ys2, alpha1, a2, ys2 + fmr yl1, yl2 + + fmr ys1, ys2 + STFXDUX ys1, YS, INCY2 + fsmr ys1, ys2 + .align 4 + +.L58: + andi. r0, M, 1 + ble .L59 + + LFDUX a1, A1, INC2 + LFDUX a2, A2, INC2 + + fxmr alpha2, alpha1 + fmadd ys1, alpha1, a1, yl1 + fmadd ys1, alpha2, a2, ys1 + + STFXDUX ys1, YS, INCY2 + b .L60 + .align 4 + +.L59: + STFSDUX ys1, YS, INCY2 + .align 4 + +.L60: + andi. J, N, 1 + ble .L999 + + LFDUX alpha1, X, INCX + mr A1, A + + mr YL, Y + sub YS, Y, INCY2 + + fmul alpha1, alpha, alpha1 + + LFSDX ys1, YS, INCY2 + LFDX yl1, YL, INCY + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L65 + + LFXDUX yl2, YL, INCY2 + LFXDUX yl3, YL, INCY2 + LFXDUX yl4, YL, INCY2 + LFXDUX yl5, YL, INCY2 + + LFPDUX a1, A1, INC2 + LFPDUX a5, A1, INC2 + LFPDUX a9, A1, INC2 + LFPDUX a13, A1, INC2 + + fsmr yl1, yl2 + fsmr yl2, yl3 + fsmr yl3, yl4 + fsmr yl4, yl5 + bdz .L63 + .align 4 + +.L62: + fxcpmadd ys2, alpha1, a1, yl1 + LFPDUX a1, A1, INC2 + fxcpmadd ys3, alpha1, a5, yl2 + LFXDUX yl2, YL, INCY2 + fxcpmadd ys4, alpha1, a9, yl3 + LFXDUX yl3, YL, INCY2 + fxcpmadd ys5, alpha1, a13, yl4 + LFXDUX yl4, YL, INCY2 + + fmr yl1, yl5 + LFXDUX yl5, YL, INCY2 + fmr ys1, ys2 + LFPDUX a5, A1, INC2 + fmr ys2, ys3 + LFPDUX a9, A1, INC2 + fmr ys3, ys4 + LFPDUX a13, A1, INC2 + fmr ys4, ys5 + + STFXDUX ys1, YS, INCY2 + fsmr ys1, ys5 + STFXDUX ys2, YS, INCY2 + fsmr yl1, yl2 + STFXDUX ys3, YS, INCY2 + fsmr yl2, yl3 + STFXDUX ys4, YS, INCY2 + fsmr yl3, yl4 + + fsmr yl4, yl5 + bdnz .L62 + .align 4 + +.L63: + fxcpmadd ys2, alpha1, a1, yl1 + fxcpmadd ys3, alpha1, a5, yl2 + fxcpmadd ys4, alpha1, a9, yl3 + fxcpmadd ys5, alpha1, a13, yl4 + + fmr yl1, yl5 + fmr ys1, ys2 + fmr ys2, ys3 + fmr ys3, ys4 + fmr ys4, ys5 + + STFXDUX ys1, YS, INCY2 + fsmr ys1, ys5 + STFXDUX ys2, YS, INCY2 + STFXDUX ys3, YS, INCY2 + STFXDUX ys4, YS, INCY2 + .align 4 + +.L65: + andi. r0, M, 7 + ble .L69 + + andi. r0, M, 4 + ble .L67 + + LFXDUX yl2, YL, INCY2 + LFXDUX yl3, YL, INCY2 + + LFPDUX a1, A1, INC2 + LFPDUX a5, A1, INC2 + + fsmr yl1, yl2 + fsmr yl2, yl3 + + fxcpmadd ys2, alpha1, a1, yl1 + fxcpmadd ys3, alpha1, a5, yl2 + + fmr yl1, yl3 + fmr ys1, ys2 + fmr ys2, ys3 + + STFXDUX ys1, YS, INCY2 + fsmr ys1, ys3 + STFXDUX ys2, YS, INCY2 + .align 4 + +.L67: + andi. r0, M, 2 + ble .L68 + + LFPDUX a1, A1, INC2 + LFXDUX yl2, YL, INCY2 + + fsmr yl1, yl2 + fxcpmadd ys2, alpha1, a1, yl1 + fmr yl1, yl2 + fmr ys1, ys2 + STFXDUX ys1, YS, INCY2 + fsmr ys1, ys2 + .align 4 + +.L68: + andi. r0, M, 1 + ble .L69 + + LFDUX a1, A1, INC2 + fmadd ys1, alpha1, a1, yl1 + STFXDUX ys1, YS, INCY2 + b .L999 + .align 4 + +.L69: + STFSDUX ys1, YS, INCY2 + b .L999 + .align 4 + +.L70: + sub A, A, INC2 + sub Y, Y, INCY + srawi. J, N, 2 + ble .L80 + .align 4 + +.L71: + LFDUX alpha1, X, INCX + mr A1, A + add A2, A, LDA + add A3, A2, LDA + LFSDUX alpha1, X, INCX + LFDUX alpha2, X, INCX + add A4, A3, LDA + add A, A4, LDA + mr YL, Y + LFSDUX alpha2, X, INCX + fpmul alpha1, alpha, alpha1 + mr YS, Y + srawi. r0, M, 3 + mtspr CTR, r0 + fpmul alpha2, alpha, alpha2 + ble .L75 + + LFDUX yl1, YL, INCY + LFPDUX a1, A1, INC2 + LFPDUX a5, A1, INC2 + LFPDUX a9, A1, INC2 + LFPDUX a13, A1, INC2 + LFSDUX yl1, YL, INCY + + LFDUX yl2, YL, INCY + LFPDUX a2, A2, INC2 + LFPDUX a6, A2, INC2 + LFPDUX a10, A2, INC2 + LFPDUX a14, A2, INC2 + LFSDUX yl2, YL, INCY + + LFDUX yl3, YL, INCY + LFPDUX a3, A3, INC2 + LFPDUX a7, A3, INC2 + LFPDUX a11, A3, INC2 + LFPDUX a15, A3, INC2 + LFSDUX yl3, YL, INCY + + LFDUX yl4, YL, INCY + LFPDUX a4, A4, INC2 + LFPDUX a8, A4, INC2 + LFPDUX a12, A4, INC2 + LFPDUX a16, A4, INC2 + LFSDUX yl4, YL, INCY + bdz .L73 + .align 4 + +.L72: + fxcpmadd ys1, alpha1, a1, yl1 + LFPDUX a1, A1, INC2 + LFDUX yl1, YL, INCY + fxcpmadd ys2, alpha1, a5, yl2 + LFPDUX a5, A1, INC2 + fxcpmadd ys3, alpha1, a9, yl3 + LFPDUX a9, A1, INC2 + fxcpmadd ys4, alpha1, a13, yl4 + LFPDUX a13, A1, INC2 + LFSDUX yl1, YL, INCY + + fxcsmadd ys1, alpha1, a2, ys1 + LFPDUX a2, A2, INC2 + LFDUX yl2, YL, INCY + fxcsmadd ys2, alpha1, a6, ys2 + LFPDUX a6, A2, INC2 + fxcsmadd ys3, alpha1, a10, ys3 + LFPDUX a10, A2, INC2 + fxcsmadd ys4, alpha1, a14, ys4 + LFPDUX a14, A2, INC2 + LFSDUX yl2, YL, INCY + + fxcpmadd ys1, alpha2, a3, ys1 + LFPDUX a3, A3, INC2 + LFDUX yl3, YL, INCY + fxcpmadd ys2, alpha2, a7, ys2 + LFPDUX a7, A3, INC2 + fxcpmadd ys3, alpha2, a11, ys3 + LFPDUX a11, A3, INC2 + fxcpmadd ys4, alpha2, a15, ys4 + LFPDUX a15, A3, INC2 + LFSDUX yl3, YL, INCY + + fxcsmadd ys1, alpha2, a4, ys1 + LFPDUX a4, A4, INC2 + LFDUX yl4, YL, INCY + fxcsmadd ys2, alpha2, a8, ys2 + LFPDUX a8, A4, INC2 + fxcsmadd ys3, alpha2, a12, ys3 + LFPDUX a12, A4, INC2 + fxcsmadd ys4, alpha2, a16, ys4 + LFPDUX a16, A4, INC2 + LFSDUX yl4, YL, INCY + + STFDUX ys1, YS, INCY + STFSDUX ys1, YS, INCY + STFDUX ys2, YS, INCY + STFSDUX ys2, YS, INCY + STFDUX ys3, YS, INCY + STFSDUX ys3, YS, INCY + STFDUX ys4, YS, INCY + STFSDUX ys4, YS, INCY + bdnz .L72 + .align 4 + +.L73: + fxcpmadd ys1, alpha1, a1, yl1 + fxcpmadd ys2, alpha1, a5, yl2 + fxcpmadd ys3, alpha1, a9, yl3 + fxcpmadd ys4, alpha1, a13, yl4 + + fxcsmadd ys1, alpha1, a2, ys1 + fxcsmadd ys2, alpha1, a6, ys2 + fxcsmadd ys3, alpha1, a10, ys3 + fxcsmadd ys4, alpha1, a14, ys4 + + fxcpmadd ys1, alpha2, a3, ys1 + fxcpmadd ys2, alpha2, a7, ys2 + fxcpmadd ys3, alpha2, a11, ys3 + fxcpmadd ys4, alpha2, a15, ys4 + + fxcsmadd ys1, alpha2, a4, ys1 + fxcsmadd ys2, alpha2, a8, ys2 + fxcsmadd ys3, alpha2, a12, ys3 + fxcsmadd ys4, alpha2, a16, ys4 + + STFDUX ys1, YS, INCY + STFSDUX ys1, YS, INCY + STFDUX ys2, YS, INCY + STFSDUX ys2, YS, INCY + STFDUX ys3, YS, INCY + STFSDUX ys3, YS, INCY + STFDUX ys4, YS, INCY + STFSDUX ys4, YS, INCY + .align 4 + +.L75: + andi. r0, M, 7 + ble .L79 + + andi. r0, M, 4 + ble .L77 + + LFDUX yl1, YL, INCY + LFPDUX a1, A1, INC2 + LFPDUX a5, A1, INC2 + LFSDUX yl1, YL, INCY + LFPDUX a2, A2, INC2 + LFPDUX a6, A2, INC2 + + LFDUX yl2, YL, INCY + LFPDUX a3, A3, INC2 + LFPDUX a7, A3, INC2 + LFSDUX yl2, YL, INCY + LFPDUX a4, A4, INC2 + LFPDUX a8, A4, INC2 + + fxcpmadd ys1, alpha1, a1, yl1 + fxcpmadd ys2, alpha1, a5, yl2 + fxcsmadd ys1, alpha1, a2, ys1 + fxcsmadd ys2, alpha1, a6, ys2 + + fxcpmadd ys1, alpha2, a3, ys1 + fxcpmadd ys2, alpha2, a7, ys2 + fxcsmadd ys1, alpha2, a4, ys1 + fxcsmadd ys2, alpha2, a8, ys2 + + STFDUX ys1, YS, INCY + STFSDUX ys1, YS, INCY + STFDUX ys2, YS, INCY + STFSDUX ys2, YS, INCY + .align 4 + +.L77: + andi. r0, M, 2 + ble .L78 + + LFDUX yl1, YL, INCY + LFPDUX a1, A1, INC2 + LFPDUX a2, A2, INC2 + LFSDUX yl1, YL, INCY + LFPDUX a3, A3, INC2 + LFPDUX a4, A4, INC2 + + fxcpmadd ys1, alpha1, a1, yl1 + fxcsmadd ys1, alpha1, a2, ys1 + fxcpmadd ys1, alpha2, a3, ys1 + fxcsmadd ys1, alpha2, a4, ys1 + + STFDUX ys1, YS, INCY + STFSDUX ys1, YS, INCY + .align 4 + +.L78: + andi. r0, M, 1 + ble .L79 + + LFDUX yl1, YL, INCY + + LFDUX a1, A1, INC2 + LFDUX a2, A2, INC2 + LFDUX a3, A3, INC2 + LFDUX a4, A4, INC2 + + fxcpmadd ys1, alpha1, a1, yl1 + fxcsmadd ys1, alpha1, a2, ys1 + fxcpmadd ys1, alpha2, a3, ys1 + fxcsmadd ys1, alpha2, a4, ys1 + + STFDUX ys1, YS, INCY + .align 4 + +.L79: + addi J, J, -1 + cmpi cr0, 0, J, 0 + bgt .L71 + .align 4 + +.L80: + andi. J, N, 2 + ble .L90 + + LFDUX alpha1, X, INCX + + mr A1, A + add A2, A, LDA + add A, A2, LDA + LFSDUX alpha1, X, INCX + + mr YL, Y + mr YS, Y + fpmul alpha1, alpha, alpha1 + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L85 + + LFDUX yl1, YL, INCY + LFDUX a9, YL, INCY + LFDUX yl2, YL, INCY + LFDUX a10, YL, INCY + + LFPDUX a1, A1, INC2 + LFPDUX a5, A1, INC2 + LFPDUX a3, A1, INC2 + LFPDUX a7, A1, INC2 + + LFDUX yl3, YL, INCY + LFDUX a11, YL, INCY + LFDUX yl4, YL, INCY + LFDUX a12, YL, INCY + + LFPDUX a2, A2, INC2 + LFPDUX a6, A2, INC2 + LFPDUX a4, A2, INC2 + LFPDUX a8, A2, INC2 + + bdz .L83 + .align 4 + +.L82: + fsmfp yl1, a9 + fsmfp yl2, a10 + fsmfp yl3, a11 + fsmfp yl4, a12 + + fxcpmadd ys1, alpha1, a1, yl1 + LFDUX yl1, YL, INCY + LFDUX a9, YL, INCY + LFPDUX a1, A1, INC2 + fxcpmadd ys2, alpha1, a5, yl2 + LFDUX yl2, YL, INCY + LFDUX a10, YL, INCY + LFPDUX a5, A1, INC2 + fxcpmadd ys3, alpha1, a3, yl3 + LFDUX yl3, YL, INCY + LFDUX a11, YL, INCY + LFPDUX a3, A1, INC2 + fxcpmadd ys4, alpha1, a7, yl4 + LFDUX yl4, YL, INCY + LFDUX a12, YL, INCY + LFPDUX a7, A1, INC2 + + fxcsmadd ys1, alpha1, a2, ys1 + LFPDUX a2, A2, INC2 + fxcsmadd ys2, alpha1, a6, ys2 + LFPDUX a6, A2, INC2 + fxcsmadd ys3, alpha1, a4, ys3 + LFPDUX a4, A2, INC2 + fxcsmadd ys4, alpha1, a8, ys4 + LFPDUX a8, A2, INC2 + + STFDUX ys1, YS, INCY + STFSDUX ys1, YS, INCY + STFDUX ys2, YS, INCY + STFSDUX ys2, YS, INCY + + STFDUX ys3, YS, INCY + STFSDUX ys3, YS, INCY + STFDUX ys4, YS, INCY + STFSDUX ys4, YS, INCY + bdnz .L82 + .align 4 + +.L83: + fsmfp yl1, a9 + fsmfp yl2, a10 + fsmfp yl3, a11 + fsmfp yl4, a12 + + fxcpmadd ys1, alpha1, a1, yl1 + fxcpmadd ys2, alpha1, a5, yl2 + fxcpmadd ys3, alpha1, a3, yl3 + fxcpmadd ys4, alpha1, a7, yl4 + + fxcsmadd ys1, alpha1, a2, ys1 + fxcsmadd ys2, alpha1, a6, ys2 + fxcsmadd ys3, alpha1, a4, ys3 + fxcsmadd ys4, alpha1, a8, ys4 + + STFDUX ys1, YS, INCY + STFSDUX ys1, YS, INCY + STFDUX ys2, YS, INCY + STFSDUX ys2, YS, INCY + STFDUX ys3, YS, INCY + STFSDUX ys3, YS, INCY + STFDUX ys4, YS, INCY + STFSDUX ys4, YS, INCY + .align 4 + +.L85: + andi. r0, M, 7 + ble .L90 + + andi. r0, M, 4 + ble .L87 + + LFDUX yl1, YL, INCY + LFPDUX a1, A1, INC2 + LFPDUX a2, A2, INC2 + LFSDUX yl1, YL, INCY + LFDUX yl2, YL, INCY + LFPDUX a5, A1, INC2 + LFPDUX a6, A2, INC2 + LFSDUX yl2, YL, INCY + + fxcpmadd ys1, alpha1, a1, yl1 + fxcpmadd ys2, alpha1, a5, yl2 + fxcsmadd ys1, alpha1, a2, ys1 + fxcsmadd ys2, alpha1, a6, ys2 + + STFDUX ys1, YS, INCY + STFSDUX ys1, YS, INCY + STFDUX ys2, YS, INCY + STFSDUX ys2, YS, INCY + .align 4 + +.L87: + andi. r0, M, 2 + ble .L88 + + LFDUX yl1, YL, INCY + LFPDUX a1, A1, INC2 + LFPDUX a2, A2, INC2 + LFSDUX yl1, YL, INCY + + fxcpmadd ys1, alpha1, a1, yl1 + fxcsmadd ys1, alpha1, a2, ys1 + + STFDUX ys1, YS, INCY + STFSDUX ys1, YS, INCY + .align 4 + +.L88: + andi. r0, M, 1 + ble .L90 + + LFDUX yl1, YL, INCY + LFDUX a1, A1, INC2 + LFDUX a2, A2, INC2 + + fxcpmadd ys1, alpha1, a1, yl1 + fxcsmadd ys1, alpha1, a2, ys1 + + STFDUX ys1, YS, INCY + .align 4 + +.L90: + andi. J, N, 1 + ble .L999 + + LFDUX alpha1, X, INCX + + mr A1, A + mr YL, Y + mr YS, Y + fmul alpha1, alpha, alpha1 + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L95 + + LFDUX yl1, YL, INCY + LFSDUX a2, YL, INCY + LFDUX yl2, YL, INCY + LFSDUX a4, YL, INCY + LFDUX yl3, YL, INCY + LFSDUX a6, YL, INCY + LFDUX yl4, YL, INCY + LFSDUX a8, YL, INCY + + LFPDUX a1, A1, INC2 + LFPDUX a5, A1, INC2 + LFPDUX a9, A1, INC2 + LFPDUX a13, A1, INC2 + bdz .L93 + .align 4 + +.L92: + fmr a2, yl1 + fmr a4, yl2 + fmr a6, yl3 + fmr a8, yl4 + + fxcpmadd ys1, alpha1, a1, a2 + LFDUX yl1, YL, INCY + LFSDUX a2, YL, INCY + fxcpmadd ys2, alpha1, a5, a4 + LFDUX yl2, YL, INCY + LFSDUX a4, YL, INCY + fxcpmadd ys3, alpha1, a9, a6 + LFDUX yl3, YL, INCY + LFSDUX a6, YL, INCY + fxcpmadd ys4, alpha1, a13, a8 + LFDUX yl4, YL, INCY + LFSDUX a8, YL, INCY + + LFPDUX a1, A1, INC2 + LFPDUX a5, A1, INC2 + LFPDUX a9, A1, INC2 + LFPDUX a13, A1, INC2 + + STFDUX ys1, YS, INCY + STFSDUX ys1, YS, INCY + STFDUX ys2, YS, INCY + STFSDUX ys2, YS, INCY + STFDUX ys3, YS, INCY + STFSDUX ys3, YS, INCY + STFDUX ys4, YS, INCY + STFSDUX ys4, YS, INCY + bdnz .L92 + .align 4 + +.L93: + fmr a2, yl1 + fmr a4, yl2 + fmr a6, yl3 + fmr a8, yl4 + + fxcpmadd ys1, alpha1, a1, a2 + fxcpmadd ys2, alpha1, a5, a4 + fxcpmadd ys3, alpha1, a9, a6 + fxcpmadd ys4, alpha1, a13, a8 + + STFDUX ys1, YS, INCY + STFSDUX ys1, YS, INCY + STFDUX ys2, YS, INCY + STFSDUX ys2, YS, INCY + STFDUX ys3, YS, INCY + STFSDUX ys3, YS, INCY + STFDUX ys4, YS, INCY + STFSDUX ys4, YS, INCY + .align 4 + +.L95: + andi. r0, M, 7 + ble .L999 + + andi. r0, M, 4 + ble .L97 + + LFPDUX a1, A1, INC2 + LFDUX yl1, YL, INCY + LFDUX yl2, YL, INCY + LFPDUX a2, A1, INC2 + LFDUX yl3, YL, INCY + LFDUX yl4, YL, INCY + + fxcpmadd ys1, a1, alpha1, yl1 + fxcsmadd ys2, a1, alpha1, yl2 + fxcpmadd ys3, a2, alpha1, yl3 + fxcsmadd ys4, a2, alpha1, yl4 + + STFDUX ys1, YS, INCY + STFDUX ys2, YS, INCY + STFDUX ys3, YS, INCY + STFDUX ys4, YS, INCY + .align 4 + +.L97: + andi. r0, M, 2 + ble .L98 + + LFPDUX a1, A1, INC2 + LFDUX yl1, YL, INCY + LFDUX yl2, YL, INCY + + fxcpmadd ys1, a1, alpha1, yl1 + fxcsmadd ys2, a1, alpha1, yl2 + + STFDUX ys1, YS, INCY + STFDUX ys2, YS, INCY + .align 4 + +.L98: + andi. r0, M, 1 + ble .L999 + + LFDUX yl1, YL, INCY + LFDUX a1, A1, INC2 + + fxcpmadd ys1, alpha1, a1, yl1 + + STFDUX ys1, YS, INCY + b .L999 + .align 4 + + +.L999: + addi SP, SP, -4 + + lwzu r16, 4(SP) + lwzu r17, 4(SP) + lwzu r18, 4(SP) + lwzu r19, 4(SP) + + lwzu r20, 4(SP) + lwzu r21, 4(SP) + lwzu r22, 4(SP) + lwzu r23, 4(SP) + + lwzu r24, 4(SP) + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f31, SP, r0 + lfpdux f30, SP, r0 + lfpdux f29, SP, r0 + lfpdux f28, SP, r0 + lfpdux f27, SP, r0 + lfpdux f26, SP, r0 + lfpdux f25, SP, r0 + lfpdux f24, SP, r0 + lfpdux f23, SP, r0 + lfpdux f22, SP, r0 + lfpdux f21, SP, r0 + lfpdux f20, SP, r0 + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/gemv_n.S b/kernel/power/gemv_n.S new file mode 100644 index 0000000..b66caa7 --- /dev/null +++ b/kernel/power/gemv_n.S @@ -0,0 +1,3090 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define M r3 +#define N r4 +#define A r6 +#define LDA r7 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#else +#define M r3 +#define N r4 +#define A r7 +#define LDA r8 +#define X r9 +#define INCX r10 +#define Y r5 +#define INCY r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define M r3 +#define N r4 +#define A r8 +#define LDA r9 +#define X r10 +#define INCX r5 +#define Y r6 +#define INCY r7 +#else +#define M r3 +#define N r4 +#define A r7 +#define LDA r8 +#define X r9 +#define INCX r10 +#define Y r5 +#define INCY r6 +#endif +#endif + +#define I r11 +#define J r12 + +#define AO1 r14 +#define AO2 r15 +#define AO3 r16 +#define AO4 r17 +#define AO5 r18 +#define AO6 r19 +#define AO7 r20 +#define AO8 r21 +#define LDA8 r22 + +#define Y1 r23 +#define PREA r24 +#define PREC r25 +#define YY r26 +#define BUFFER r27 + +#define y01 f0 +#define y02 f1 +#define y03 f2 +#define y04 f3 +#define y05 f4 +#define y06 f5 +#define y07 f6 +#define y08 f7 +#define y09 f8 +#define y10 f9 +#define y11 f10 +#define y12 f11 +#define y13 f12 +#define y14 f13 +#define y15 f14 +#define y16 f15 + +#define alpha1 f16 +#define alpha2 f17 +#define alpha3 f18 +#define alpha4 f19 +#define alpha5 f20 +#define alpha6 f21 +#define alpha7 f22 +#define alpha8 f23 + +#define a1 f24 +#define a2 f25 +#define a3 f26 +#define a4 f27 +#define a5 f28 +#define a6 f29 +#define a7 f30 +#define a8 f31 + +#define alpha f31 + +#if defined(PPCG4) +#define PREFETCHSIZE_A 24 +#define PREFETCHSIZE_C 16 +#endif + +#if defined(PPC440) || defined(PPC440FP2) +#define PREFETCHSIZE_A 24 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef PPC970 +#define PREFETCHSIZE_A 16 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef CELL +#define PREFETCHSIZE_A 16 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef POWER4 +#define PREFETCHSIZE_A 16 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef POWER5 +#define PREFETCHSIZE_A 40 +#define PREFETCHSIZE_C 24 +#endif + +#ifdef POWER6 +#define PREFETCHSIZE_A 96 +#define PREFETCHSIZE_C 40 +#endif + +#ifndef NEEDPARAM + +#ifndef __64BIT__ +#define STACKSIZE 224 +#define ALPHA 200(SP) +#define FZERO 208(SP) +#else +#define STACKSIZE 280 +#define ALPHA 256(SP) +#define FZERO 264(SP) +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r0, FZERO + std r14, 144(SP) + std r15, 152(SP) + std r16, 160(SP) + std r17, 168(SP) + std r18, 176(SP) + std r19, 184(SP) + std r20, 192(SP) + std r21, 200(SP) + std r22, 208(SP) + std r23, 216(SP) + std r24, 224(SP) + std r25, 232(SP) + std r26, 240(SP) + std r27, 248(SP) +#else + stw r0, 0 + FZERO + stw r0, 4 + FZERO + stw r14, 144(SP) + stw r15, 148(SP) + stw r16, 152(SP) + stw r17, 156(SP) + stw r18, 160(SP) + stw r19, 164(SP) + stw r20, 168(SP) + stw r21, 172(SP) + stw r22, 176(SP) + stw r23, 180(SP) + stw r24, 184(SP) + stw r25, 188(SP) + stw r26, 192(SP) + stw r27, 196(SP) +#endif + +#ifdef linux +#ifndef __64BIT__ + lwz INCY, 8 + STACKSIZE(SP) + lwz BUFFER, 12 + STACKSIZE(SP) +#else + ld Y, 112 + STACKSIZE(SP) + ld INCY, 120 + STACKSIZE(SP) + ld BUFFER, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifndef __64BIT__ +#ifdef DOUBLE + lwz INCX, 56 + STACKSIZE(SP) + lwz Y, 60 + STACKSIZE(SP) + lwz INCY, 64 + STACKSIZE(SP) + lwz BUFFER, 68 + STACKSIZE(SP) +#else + lwz Y, 56 + STACKSIZE(SP) + lwz INCY, 60 + STACKSIZE(SP) + lwz BUFFER, 64 + STACKSIZE(SP) +#endif +#else + ld Y, 112 + STACKSIZE(SP) + ld INCY, 120 + STACKSIZE(SP) + ld BUFFER, 128 + STACKSIZE(SP) +#endif +#endif + + stfd f1, ALPHA + fmr alpha, f1 + + slwi LDA8, LDA, BASE_SHIFT + 3 + slwi LDA, LDA, BASE_SHIFT + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + li PREA, PREFETCHSIZE_A * SIZE + li PREC, PREFETCHSIZE_C * SIZE + + cmpwi cr0, M, 0 + ble- LL(999) + + cmpwi cr0, N, 0 + ble- LL(999) + + mr YY, Y + lfd f0, FZERO + + cmpi cr0, 0, INCY, SIZE + beq LL(10) + + mr YY, BUFFER + mr Y1, BUFFER + + addi r0, M, 7 + srawi. r0, r0, 3 + mtspr CTR, r0 + .align 4 + +LL(02): + STFD f0, 0 * SIZE(Y1) + STFD f0, 1 * SIZE(Y1) + STFD f0, 2 * SIZE(Y1) + STFD f0, 3 * SIZE(Y1) + STFD f0, 4 * SIZE(Y1) + STFD f0, 5 * SIZE(Y1) + STFD f0, 6 * SIZE(Y1) + STFD f0, 7 * SIZE(Y1) + addi Y1, Y1, 8 * SIZE + bdnz LL(02) + .align 4 + +LL(10): + srawi. J, N, 3 + ble LL(20) + .align 4 + +LL(11): + LFD alpha1, 0 * SIZE(X) + add X, X, INCX + LFD alpha2, 0 * SIZE(X) + add X, X, INCX + LFD alpha3, 0 * SIZE(X) + add X, X, INCX + LFD alpha4, 0 * SIZE(X) + add X, X, INCX + LFD alpha5, 0 * SIZE(X) + add X, X, INCX + LFD alpha6, 0 * SIZE(X) + add X, X, INCX + LFD alpha7, 0 * SIZE(X) + add X, X, INCX + LFD alpha8, 0 * SIZE(X) + add X, X, INCX + + FMUL alpha1, alpha, alpha1 + FMUL alpha2, alpha, alpha2 + FMUL alpha3, alpha, alpha3 + FMUL alpha4, alpha, alpha4 + FMUL alpha5, alpha, alpha5 + FMUL alpha6, alpha, alpha6 + FMUL alpha7, alpha, alpha7 + FMUL alpha8, alpha, alpha8 + + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add AO5, AO4, LDA + add AO6, AO5, LDA + add AO7, AO6, LDA + add AO8, AO7, LDA + add A, AO8, LDA + + mr Y1, YY + + srawi. r0, M, 4 + mtspr CTR, r0 + ble LL(15) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + LFD y09, 8 * SIZE(Y1) + LFD y10, 9 * SIZE(Y1) + LFD y11, 10 * SIZE(Y1) + LFD y12, 11 * SIZE(Y1) + LFD y13, 12 * SIZE(Y1) + LFD y14, 13 * SIZE(Y1) + LFD y15, 14 * SIZE(Y1) + LFD y16, 15 * SIZE(Y1) + + FMADD y01, alpha1, a1, y01 + FMADD y02, alpha1, a2, y02 + FMADD y03, alpha1, a3, y03 + FMADD y04, alpha1, a4, y04 + + LFD a1, 8 * SIZE(AO1) + LFD a2, 9 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + + FMADD y05, alpha1, a5, y05 + FMADD y06, alpha1, a6, y06 + FMADD y07, alpha1, a7, y07 + FMADD y08, alpha1, a8, y08 + + LFD a5, 12 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + addi AO1, AO1, 16 * SIZE + nop + nop + DCBT(AO1, PREA) + + FMADD y09, alpha1, a1, y09 + FMADD y10, alpha1, a2, y10 + FMADD y11, alpha1, a3, y11 + FMADD y12, alpha1, a4, y12 + + LFD a1, 0 * SIZE(AO2) + LFD a2, 1 * SIZE(AO2) + LFD a3, 2 * SIZE(AO2) + LFD a4, 3 * SIZE(AO2) + + FMADD y13, alpha1, a5, y13 + FMADD y14, alpha1, a6, y14 + FMADD y15, alpha1, a7, y15 + FMADD y16, alpha1, a8, y16 + + LFD a5, 4 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2, a1, y01 + FMADD y02, alpha2, a2, y02 + FMADD y03, alpha2, a3, y03 + FMADD y04, alpha2, a4, y04 + + LFD a1, 8 * SIZE(AO2) + LFD a2, 9 * SIZE(AO2) + LFD a3, 10 * SIZE(AO2) + LFD a4, 11 * SIZE(AO2) + + FMADD y05, alpha2, a5, y05 + FMADD y06, alpha2, a6, y06 + FMADD y07, alpha2, a7, y07 + FMADD y08, alpha2, a8, y08 + + LFD a5, 12 * SIZE(AO2) + LFD a6, 13 * SIZE(AO2) + LFD a7, 14 * SIZE(AO2) + LFD a8, 15 * SIZE(AO2) + + addi AO2, AO2, 16 * SIZE + nop + nop + DCBT(AO2, PREA) + + FMADD y09, alpha2, a1, y09 + FMADD y10, alpha2, a2, y10 + FMADD y11, alpha2, a3, y11 + FMADD y12, alpha2, a4, y12 + + LFD a1, 0 * SIZE(AO3) + LFD a2, 1 * SIZE(AO3) + LFD a3, 2 * SIZE(AO3) + LFD a4, 3 * SIZE(AO3) + + FMADD y13, alpha2, a5, y13 + FMADD y14, alpha2, a6, y14 + FMADD y15, alpha2, a7, y15 + FMADD y16, alpha2, a8, y16 + + LFD a5, 4 * SIZE(AO3) + LFD a6, 5 * SIZE(AO3) + LFD a7, 6 * SIZE(AO3) + LFD a8, 7 * SIZE(AO3) + + FMADD y01, alpha3, a1, y01 + FMADD y02, alpha3, a2, y02 + FMADD y03, alpha3, a3, y03 + FMADD y04, alpha3, a4, y04 + + LFD a1, 8 * SIZE(AO3) + LFD a2, 9 * SIZE(AO3) + LFD a3, 10 * SIZE(AO3) + LFD a4, 11 * SIZE(AO3) + + FMADD y05, alpha3, a5, y05 + FMADD y06, alpha3, a6, y06 + FMADD y07, alpha3, a7, y07 + FMADD y08, alpha3, a8, y08 + + LFD a5, 12 * SIZE(AO3) + LFD a6, 13 * SIZE(AO3) + LFD a7, 14 * SIZE(AO3) + LFD a8, 15 * SIZE(AO3) + + addi AO3, AO3, 16 * SIZE + nop + nop + DCBT(AO3, PREA) + + FMADD y09, alpha3, a1, y09 + FMADD y10, alpha3, a2, y10 + FMADD y11, alpha3, a3, y11 + FMADD y12, alpha3, a4, y12 + + LFD a1, 0 * SIZE(AO4) + LFD a2, 1 * SIZE(AO4) + LFD a3, 2 * SIZE(AO4) + LFD a4, 3 * SIZE(AO4) + + FMADD y13, alpha3, a5, y13 + FMADD y14, alpha3, a6, y14 + FMADD y15, alpha3, a7, y15 + FMADD y16, alpha3, a8, y16 + + LFD a5, 4 * SIZE(AO4) + LFD a6, 5 * SIZE(AO4) + LFD a7, 6 * SIZE(AO4) + LFD a8, 7 * SIZE(AO4) + + FMADD y01, alpha4, a1, y01 + FMADD y02, alpha4, a2, y02 + FMADD y03, alpha4, a3, y03 + FMADD y04, alpha4, a4, y04 + + LFD a1, 8 * SIZE(AO4) + LFD a2, 9 * SIZE(AO4) + LFD a3, 10 * SIZE(AO4) + LFD a4, 11 * SIZE(AO4) + + FMADD y05, alpha4, a5, y05 + FMADD y06, alpha4, a6, y06 + FMADD y07, alpha4, a7, y07 + FMADD y08, alpha4, a8, y08 + + LFD a5, 12 * SIZE(AO4) + LFD a6, 13 * SIZE(AO4) + LFD a7, 14 * SIZE(AO4) + LFD a8, 15 * SIZE(AO4) + + addi AO4, AO4, 16 * SIZE + nop + nop + DCBT(AO4, PREA) + + FMADD y09, alpha4, a1, y09 + FMADD y10, alpha4, a2, y10 + FMADD y11, alpha4, a3, y11 + FMADD y12, alpha4, a4, y12 + + LFD a1, 0 * SIZE(AO5) + LFD a2, 1 * SIZE(AO5) + LFD a3, 2 * SIZE(AO5) + LFD a4, 3 * SIZE(AO5) + + FMADD y13, alpha4, a5, y13 + FMADD y14, alpha4, a6, y14 + FMADD y15, alpha4, a7, y15 + FMADD y16, alpha4, a8, y16 + + LFD a5, 4 * SIZE(AO5) + LFD a6, 5 * SIZE(AO5) + LFD a7, 6 * SIZE(AO5) + LFD a8, 7 * SIZE(AO5) + + FMADD y01, alpha5, a1, y01 + FMADD y02, alpha5, a2, y02 + FMADD y03, alpha5, a3, y03 + FMADD y04, alpha5, a4, y04 + + LFD a1, 8 * SIZE(AO5) + LFD a2, 9 * SIZE(AO5) + LFD a3, 10 * SIZE(AO5) + LFD a4, 11 * SIZE(AO5) + + FMADD y05, alpha5, a5, y05 + FMADD y06, alpha5, a6, y06 + FMADD y07, alpha5, a7, y07 + FMADD y08, alpha5, a8, y08 + + LFD a5, 12 * SIZE(AO5) + LFD a6, 13 * SIZE(AO5) + LFD a7, 14 * SIZE(AO5) + LFD a8, 15 * SIZE(AO5) + + addi AO5, AO5, 16 * SIZE + nop + nop + DCBT(AO5, PREA) + + FMADD y09, alpha5, a1, y09 + FMADD y10, alpha5, a2, y10 + FMADD y11, alpha5, a3, y11 + FMADD y12, alpha5, a4, y12 + + LFD a1, 0 * SIZE(AO6) + LFD a2, 1 * SIZE(AO6) + LFD a3, 2 * SIZE(AO6) + LFD a4, 3 * SIZE(AO6) + + FMADD y13, alpha5, a5, y13 + FMADD y14, alpha5, a6, y14 + FMADD y15, alpha5, a7, y15 + FMADD y16, alpha5, a8, y16 + + LFD a5, 4 * SIZE(AO6) + LFD a6, 5 * SIZE(AO6) + LFD a7, 6 * SIZE(AO6) + LFD a8, 7 * SIZE(AO6) + + FMADD y01, alpha6, a1, y01 + FMADD y02, alpha6, a2, y02 + FMADD y03, alpha6, a3, y03 + FMADD y04, alpha6, a4, y04 + + LFD a1, 8 * SIZE(AO6) + LFD a2, 9 * SIZE(AO6) + LFD a3, 10 * SIZE(AO6) + LFD a4, 11 * SIZE(AO6) + + FMADD y05, alpha6, a5, y05 + FMADD y06, alpha6, a6, y06 + FMADD y07, alpha6, a7, y07 + FMADD y08, alpha6, a8, y08 + + LFD a5, 12 * SIZE(AO6) + LFD a6, 13 * SIZE(AO6) + LFD a7, 14 * SIZE(AO6) + LFD a8, 15 * SIZE(AO6) + + addi AO6, AO6, 16 * SIZE + nop + nop + DCBT(AO6, PREA) + + FMADD y09, alpha6, a1, y09 + FMADD y10, alpha6, a2, y10 + FMADD y11, alpha6, a3, y11 + FMADD y12, alpha6, a4, y12 + + LFD a1, 0 * SIZE(AO7) + LFD a2, 1 * SIZE(AO7) + LFD a3, 2 * SIZE(AO7) + LFD a4, 3 * SIZE(AO7) + + FMADD y13, alpha6, a5, y13 + FMADD y14, alpha6, a6, y14 + FMADD y15, alpha6, a7, y15 + FMADD y16, alpha6, a8, y16 + + LFD a5, 4 * SIZE(AO7) + LFD a6, 5 * SIZE(AO7) + LFD a7, 6 * SIZE(AO7) + LFD a8, 7 * SIZE(AO7) + + FMADD y01, alpha7, a1, y01 + FMADD y02, alpha7, a2, y02 + FMADD y03, alpha7, a3, y03 + FMADD y04, alpha7, a4, y04 + + LFD a1, 8 * SIZE(AO7) + LFD a2, 9 * SIZE(AO7) + LFD a3, 10 * SIZE(AO7) + LFD a4, 11 * SIZE(AO7) + + FMADD y05, alpha7, a5, y05 + FMADD y06, alpha7, a6, y06 + FMADD y07, alpha7, a7, y07 + FMADD y08, alpha7, a8, y08 + + LFD a5, 12 * SIZE(AO7) + LFD a6, 13 * SIZE(AO7) + LFD a7, 14 * SIZE(AO7) + LFD a8, 15 * SIZE(AO7) + + addi AO7, AO7, 16 * SIZE + nop + nop + DCBT(AO7, PREA) + + FMADD y09, alpha7, a1, y09 + FMADD y10, alpha7, a2, y10 + FMADD y11, alpha7, a3, y11 + FMADD y12, alpha7, a4, y12 + + LFD a1, 0 * SIZE(AO8) + LFD a2, 1 * SIZE(AO8) + LFD a3, 2 * SIZE(AO8) + LFD a4, 3 * SIZE(AO8) + + FMADD y13, alpha7, a5, y13 + FMADD y14, alpha7, a6, y14 + FMADD y15, alpha7, a7, y15 + FMADD y16, alpha7, a8, y16 + + LFD a5, 4 * SIZE(AO8) + LFD a6, 5 * SIZE(AO8) + LFD a7, 6 * SIZE(AO8) + LFD a8, 7 * SIZE(AO8) + + FMADD y01, alpha8, a1, y01 + FMADD y02, alpha8, a2, y02 + FMADD y03, alpha8, a3, y03 + FMADD y04, alpha8, a4, y04 + + LFD a1, 8 * SIZE(AO8) + LFD a2, 9 * SIZE(AO8) + LFD a3, 10 * SIZE(AO8) + LFD a4, 11 * SIZE(AO8) + + FMADD y05, alpha8, a5, y05 + FMADD y06, alpha8, a6, y06 + FMADD y07, alpha8, a7, y07 + FMADD y08, alpha8, a8, y08 + + LFD a5, 12 * SIZE(AO8) + LFD a6, 13 * SIZE(AO8) + LFD a7, 14 * SIZE(AO8) + LFD a8, 15 * SIZE(AO8) + + addi AO8, AO8, 16 * SIZE + nop + nop + DCBT(AO8, PREA) + + FMADD y09, alpha8, a1, y09 + FMADD y10, alpha8, a2, y10 + FMADD y11, alpha8, a3, y11 + FMADD y12, alpha8, a4, y12 + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + FMADD y13, alpha8, a5, y13 + FMADD y14, alpha8, a6, y14 + FMADD y15, alpha8, a7, y15 + FMADD y16, alpha8, a8, y16 + + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + STFD y03, 2 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + + LFD y01, 16 * SIZE(Y1) + LFD y02, 17 * SIZE(Y1) + LFD y03, 18 * SIZE(Y1) + LFD y04, 19 * SIZE(Y1) + + DCBT(Y1, PREC) + bdz LL(13) + .align 4 + +LL(12): + FMADD y01, alpha1, a1, y01 + FMADD y02, alpha1, a2, y02 + FMADD y03, alpha1, a3, y03 + FMADD y04, alpha1, a4, y04 + + LFD a1, 8 * SIZE(AO1) + LFD a2, 9 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + + STFD y05, 4 * SIZE(Y1) + STFD y06, 5 * SIZE(Y1) + STFD y07, 6 * SIZE(Y1) + STFD y08, 7 * SIZE(Y1) + + LFD y05, 20 * SIZE(Y1) + LFD y06, 21 * SIZE(Y1) + LFD y07, 22 * SIZE(Y1) + LFD y08, 23 * SIZE(Y1) + + FMADD y05, alpha1, a5, y05 + FMADD y06, alpha1, a6, y06 + FMADD y07, alpha1, a7, y07 + FMADD y08, alpha1, a8, y08 + + LFD a5, 12 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + STFD y09, 8 * SIZE(Y1) + STFD y10, 9 * SIZE(Y1) + STFD y11, 10 * SIZE(Y1) + STFD y12, 11 * SIZE(Y1) + + LFD y09, 24 * SIZE(Y1) + LFD y10, 25 * SIZE(Y1) + LFD y11, 26 * SIZE(Y1) + LFD y12, 27 * SIZE(Y1) + + FMADD y09, alpha1, a1, y09 + FMADD y10, alpha1, a2, y10 + FMADD y11, alpha1, a3, y11 + FMADD y12, alpha1, a4, y12 + + LFD a1, 0 * SIZE(AO2) + LFD a2, 1 * SIZE(AO2) + LFD a3, 2 * SIZE(AO2) + LFD a4, 3 * SIZE(AO2) + + STFD y13, 12 * SIZE(Y1) + STFD y14, 13 * SIZE(Y1) + STFD y15, 14 * SIZE(Y1) + STFD y16, 15 * SIZE(Y1) + + LFD y13, 28 * SIZE(Y1) + LFD y14, 29 * SIZE(Y1) + LFD y15, 30 * SIZE(Y1) + LFD y16, 31 * SIZE(Y1) + + FMADD y13, alpha1, a5, y13 + FMADD y14, alpha1, a6, y14 + FMADD y15, alpha1, a7, y15 + FMADD y16, alpha1, a8, y16 + + LFD a5, 4 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2, a1, y01 + FMADD y02, alpha2, a2, y02 + FMADD y03, alpha2, a3, y03 + FMADD y04, alpha2, a4, y04 + + LFD a1, 8 * SIZE(AO2) + LFD a2, 9 * SIZE(AO2) + LFD a3, 10 * SIZE(AO2) + LFD a4, 11 * SIZE(AO2) + + FMADD y05, alpha2, a5, y05 + FMADD y06, alpha2, a6, y06 + FMADD y07, alpha2, a7, y07 + FMADD y08, alpha2, a8, y08 + + LFD a5, 12 * SIZE(AO2) + LFD a6, 13 * SIZE(AO2) + LFD a7, 14 * SIZE(AO2) + LFD a8, 15 * SIZE(AO2) + + FMADD y09, alpha2, a1, y09 + FMADD y10, alpha2, a2, y10 + FMADD y11, alpha2, a3, y11 + FMADD y12, alpha2, a4, y12 + + LFD a1, 0 * SIZE(AO3) + LFD a2, 1 * SIZE(AO3) + LFD a3, 2 * SIZE(AO3) + LFD a4, 3 * SIZE(AO3) + + FMADD y13, alpha2, a5, y13 + FMADD y14, alpha2, a6, y14 + FMADD y15, alpha2, a7, y15 + FMADD y16, alpha2, a8, y16 + + LFD a5, 4 * SIZE(AO3) + LFD a6, 5 * SIZE(AO3) + LFD a7, 6 * SIZE(AO3) + LFD a8, 7 * SIZE(AO3) + + FMADD y01, alpha3, a1, y01 + FMADD y02, alpha3, a2, y02 + FMADD y03, alpha3, a3, y03 + FMADD y04, alpha3, a4, y04 + + LFD a1, 8 * SIZE(AO3) + LFD a2, 9 * SIZE(AO3) + LFD a3, 10 * SIZE(AO3) + LFD a4, 11 * SIZE(AO3) + + FMADD y05, alpha3, a5, y05 + FMADD y06, alpha3, a6, y06 + FMADD y07, alpha3, a7, y07 + FMADD y08, alpha3, a8, y08 + + LFD a5, 12 * SIZE(AO3) + LFD a6, 13 * SIZE(AO3) + LFD a7, 14 * SIZE(AO3) + LFD a8, 15 * SIZE(AO3) + + FMADD y09, alpha3, a1, y09 + FMADD y10, alpha3, a2, y10 + FMADD y11, alpha3, a3, y11 + FMADD y12, alpha3, a4, y12 + + LFD a1, 0 * SIZE(AO4) + LFD a2, 1 * SIZE(AO4) + LFD a3, 2 * SIZE(AO4) + LFD a4, 3 * SIZE(AO4) + + FMADD y13, alpha3, a5, y13 + FMADD y14, alpha3, a6, y14 + FMADD y15, alpha3, a7, y15 + FMADD y16, alpha3, a8, y16 + + LFD a5, 4 * SIZE(AO4) + LFD a6, 5 * SIZE(AO4) + LFD a7, 6 * SIZE(AO4) + LFD a8, 7 * SIZE(AO4) + + FMADD y01, alpha4, a1, y01 + FMADD y02, alpha4, a2, y02 + FMADD y03, alpha4, a3, y03 + FMADD y04, alpha4, a4, y04 + + LFD a1, 8 * SIZE(AO4) + LFD a2, 9 * SIZE(AO4) + LFD a3, 10 * SIZE(AO4) + LFD a4, 11 * SIZE(AO4) + + FMADD y05, alpha4, a5, y05 + FMADD y06, alpha4, a6, y06 + FMADD y07, alpha4, a7, y07 + FMADD y08, alpha4, a8, y08 + + LFD a5, 12 * SIZE(AO4) + LFD a6, 13 * SIZE(AO4) + LFD a7, 14 * SIZE(AO4) + LFD a8, 15 * SIZE(AO4) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi AO3, AO3, 16 * SIZE + addi AO4, AO4, 16 * SIZE + + DCBT(AO1, PREA) + DCBT(AO2, PREA) + DCBT(AO3, PREA) + DCBT(AO4, PREA) + + FMADD y09, alpha4, a1, y09 + FMADD y10, alpha4, a2, y10 + FMADD y11, alpha4, a3, y11 + FMADD y12, alpha4, a4, y12 + + LFD a1, 0 * SIZE(AO5) + LFD a2, 1 * SIZE(AO5) + LFD a3, 2 * SIZE(AO5) + LFD a4, 3 * SIZE(AO5) + + FMADD y13, alpha4, a5, y13 + FMADD y14, alpha4, a6, y14 + FMADD y15, alpha4, a7, y15 + FMADD y16, alpha4, a8, y16 + + LFD a5, 4 * SIZE(AO5) + LFD a6, 5 * SIZE(AO5) + LFD a7, 6 * SIZE(AO5) + LFD a8, 7 * SIZE(AO5) + + FMADD y01, alpha5, a1, y01 + FMADD y02, alpha5, a2, y02 + FMADD y03, alpha5, a3, y03 + FMADD y04, alpha5, a4, y04 + + LFD a1, 8 * SIZE(AO5) + LFD a2, 9 * SIZE(AO5) + LFD a3, 10 * SIZE(AO5) + LFD a4, 11 * SIZE(AO5) + + FMADD y05, alpha5, a5, y05 + FMADD y06, alpha5, a6, y06 + FMADD y07, alpha5, a7, y07 + FMADD y08, alpha5, a8, y08 + + LFD a5, 12 * SIZE(AO5) + LFD a6, 13 * SIZE(AO5) + LFD a7, 14 * SIZE(AO5) + LFD a8, 15 * SIZE(AO5) + + FMADD y09, alpha5, a1, y09 + FMADD y10, alpha5, a2, y10 + FMADD y11, alpha5, a3, y11 + FMADD y12, alpha5, a4, y12 + + LFD a1, 0 * SIZE(AO6) + LFD a2, 1 * SIZE(AO6) + LFD a3, 2 * SIZE(AO6) + LFD a4, 3 * SIZE(AO6) + + FMADD y13, alpha5, a5, y13 + FMADD y14, alpha5, a6, y14 + FMADD y15, alpha5, a7, y15 + FMADD y16, alpha5, a8, y16 + + LFD a5, 4 * SIZE(AO6) + LFD a6, 5 * SIZE(AO6) + LFD a7, 6 * SIZE(AO6) + LFD a8, 7 * SIZE(AO6) + + FMADD y01, alpha6, a1, y01 + FMADD y02, alpha6, a2, y02 + FMADD y03, alpha6, a3, y03 + FMADD y04, alpha6, a4, y04 + + LFD a1, 8 * SIZE(AO6) + LFD a2, 9 * SIZE(AO6) + LFD a3, 10 * SIZE(AO6) + LFD a4, 11 * SIZE(AO6) + + FMADD y05, alpha6, a5, y05 + FMADD y06, alpha6, a6, y06 + FMADD y07, alpha6, a7, y07 + FMADD y08, alpha6, a8, y08 + + LFD a5, 12 * SIZE(AO6) + LFD a6, 13 * SIZE(AO6) + LFD a7, 14 * SIZE(AO6) + LFD a8, 15 * SIZE(AO6) + + FMADD y09, alpha6, a1, y09 + FMADD y10, alpha6, a2, y10 + FMADD y11, alpha6, a3, y11 + FMADD y12, alpha6, a4, y12 + + LFD a1, 0 * SIZE(AO7) + LFD a2, 1 * SIZE(AO7) + LFD a3, 2 * SIZE(AO7) + LFD a4, 3 * SIZE(AO7) + + FMADD y13, alpha6, a5, y13 + FMADD y14, alpha6, a6, y14 + FMADD y15, alpha6, a7, y15 + FMADD y16, alpha6, a8, y16 + + LFD a5, 4 * SIZE(AO7) + LFD a6, 5 * SIZE(AO7) + LFD a7, 6 * SIZE(AO7) + LFD a8, 7 * SIZE(AO7) + + FMADD y01, alpha7, a1, y01 + FMADD y02, alpha7, a2, y02 + FMADD y03, alpha7, a3, y03 + FMADD y04, alpha7, a4, y04 + + LFD a1, 8 * SIZE(AO7) + LFD a2, 9 * SIZE(AO7) + LFD a3, 10 * SIZE(AO7) + LFD a4, 11 * SIZE(AO7) + + FMADD y05, alpha7, a5, y05 + FMADD y06, alpha7, a6, y06 + FMADD y07, alpha7, a7, y07 + FMADD y08, alpha7, a8, y08 + + LFD a5, 12 * SIZE(AO7) + LFD a6, 13 * SIZE(AO7) + LFD a7, 14 * SIZE(AO7) + LFD a8, 15 * SIZE(AO7) + + FMADD y09, alpha7, a1, y09 + FMADD y10, alpha7, a2, y10 + FMADD y11, alpha7, a3, y11 + FMADD y12, alpha7, a4, y12 + + LFD a1, 0 * SIZE(AO8) + LFD a2, 1 * SIZE(AO8) + LFD a3, 2 * SIZE(AO8) + LFD a4, 3 * SIZE(AO8) + + FMADD y13, alpha7, a5, y13 + FMADD y14, alpha7, a6, y14 + FMADD y15, alpha7, a7, y15 + FMADD y16, alpha7, a8, y16 + + LFD a5, 4 * SIZE(AO8) + LFD a6, 5 * SIZE(AO8) + LFD a7, 6 * SIZE(AO8) + LFD a8, 7 * SIZE(AO8) + + FMADD y01, alpha8, a1, y01 + FMADD y02, alpha8, a2, y02 + FMADD y03, alpha8, a3, y03 + FMADD y04, alpha8, a4, y04 + + LFD a1, 8 * SIZE(AO8) + LFD a2, 9 * SIZE(AO8) + LFD a3, 10 * SIZE(AO8) + LFD a4, 11 * SIZE(AO8) + + FMADD y05, alpha8, a5, y05 + FMADD y06, alpha8, a6, y06 + FMADD y07, alpha8, a7, y07 + FMADD y08, alpha8, a8, y08 + + LFD a5, 12 * SIZE(AO8) + LFD a6, 13 * SIZE(AO8) + LFD a7, 14 * SIZE(AO8) + LFD a8, 15 * SIZE(AO8) + + addi AO5, AO5, 16 * SIZE + addi AO6, AO6, 16 * SIZE + addi AO7, AO7, 16 * SIZE + addi AO8, AO8, 16 * SIZE + + DCBT(AO5, PREA) + DCBT(AO6, PREA) + DCBT(AO7, PREA) + DCBT(AO8, PREA) + + FMADD y09, alpha8, a1, y09 + FMADD y10, alpha8, a2, y10 + FMADD y11, alpha8, a3, y11 + FMADD y12, alpha8, a4, y12 + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + FMADD y13, alpha8, a5, y13 + FMADD y14, alpha8, a6, y14 + FMADD y15, alpha8, a7, y15 + FMADD y16, alpha8, a8, y16 + + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + STFD y01, 16 * SIZE(Y1) + STFD y02, 17 * SIZE(Y1) + STFD y03, 18 * SIZE(Y1) + STFD y04, 19 * SIZE(Y1) + + LFD y01, 32 * SIZE(Y1) + LFD y02, 33 * SIZE(Y1) + LFD y03, 34 * SIZE(Y1) + LFD y04, 35 * SIZE(Y1) + + DCBT(Y1, PREC) + addi Y1, Y1, 16 * SIZE + bdnz LL(12) + .align 4 + +LL(13): + STFD y05, 4 * SIZE(Y1) + STFD y06, 5 * SIZE(Y1) + STFD y07, 6 * SIZE(Y1) + STFD y08, 7 * SIZE(Y1) + + STFD y09, 8 * SIZE(Y1) + STFD y10, 9 * SIZE(Y1) + STFD y11, 10 * SIZE(Y1) + STFD y12, 11 * SIZE(Y1) + + STFD y13, 12 * SIZE(Y1) + STFD y14, 13 * SIZE(Y1) + STFD y15, 14 * SIZE(Y1) + STFD y16, 15 * SIZE(Y1) + + addi Y1, Y1, 16 * SIZE + .align 4 + +LL(15): + andi. r0, M, 15 + ble LL(19) + + andi. r0, M, 8 + ble LL(16) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + FMADD y01, alpha1, a1, y01 + LFD a1, 0 * SIZE(AO2) + FMADD y02, alpha1, a2, y02 + LFD a2, 1 * SIZE(AO2) + FMADD y03, alpha1, a3, y03 + LFD a3, 2 * SIZE(AO2) + FMADD y04, alpha1, a4, y04 + LFD a4, 3 * SIZE(AO2) + + FMADD y05, alpha1, a5, y05 + LFD a5, 4 * SIZE(AO2) + FMADD y06, alpha1, a6, y06 + LFD a6, 5 * SIZE(AO2) + FMADD y07, alpha1, a7, y07 + LFD a7, 6 * SIZE(AO2) + FMADD y08, alpha1, a8, y08 + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2, a1, y01 + LFD a1, 0 * SIZE(AO3) + FMADD y02, alpha2, a2, y02 + LFD a2, 1 * SIZE(AO3) + + FMADD y03, alpha2, a3, y03 + LFD a3, 2 * SIZE(AO3) + FMADD y04, alpha2, a4, y04 + LFD a4, 3 * SIZE(AO3) + + FMADD y05, alpha2, a5, y05 + LFD a5, 4 * SIZE(AO3) + FMADD y06, alpha2, a6, y06 + LFD a6, 5 * SIZE(AO3) + + FMADD y07, alpha2, a7, y07 + LFD a7, 6 * SIZE(AO3) + FMADD y08, alpha2, a8, y08 + LFD a8, 7 * SIZE(AO3) + + FMADD y01, alpha3, a1, y01 + LFD a1, 0 * SIZE(AO4) + FMADD y02, alpha3, a2, y02 + LFD a2, 1 * SIZE(AO4) + + FMADD y03, alpha3, a3, y03 + LFD a3, 2 * SIZE(AO4) + FMADD y04, alpha3, a4, y04 + LFD a4, 3 * SIZE(AO4) + + FMADD y05, alpha3, a5, y05 + LFD a5, 4 * SIZE(AO4) + FMADD y06, alpha3, a6, y06 + LFD a6, 5 * SIZE(AO4) + + FMADD y07, alpha3, a7, y07 + LFD a7, 6 * SIZE(AO4) + FMADD y08, alpha3, a8, y08 + LFD a8, 7 * SIZE(AO4) + + FMADD y01, alpha4, a1, y01 + LFD a1, 0 * SIZE(AO5) + FMADD y02, alpha4, a2, y02 + LFD a2, 1 * SIZE(AO5) + + FMADD y03, alpha4, a3, y03 + LFD a3, 2 * SIZE(AO5) + FMADD y04, alpha4, a4, y04 + LFD a4, 3 * SIZE(AO5) + + FMADD y05, alpha4, a5, y05 + LFD a5, 4 * SIZE(AO5) + FMADD y06, alpha4, a6, y06 + LFD a6, 5 * SIZE(AO5) + + FMADD y07, alpha4, a7, y07 + LFD a7, 6 * SIZE(AO5) + FMADD y08, alpha4, a8, y08 + LFD a8, 7 * SIZE(AO5) + + FMADD y01, alpha5, a1, y01 + LFD a1, 0 * SIZE(AO6) + FMADD y02, alpha5, a2, y02 + LFD a2, 1 * SIZE(AO6) + + FMADD y03, alpha5, a3, y03 + LFD a3, 2 * SIZE(AO6) + FMADD y04, alpha5, a4, y04 + LFD a4, 3 * SIZE(AO6) + + FMADD y05, alpha5, a5, y05 + LFD a5, 4 * SIZE(AO6) + FMADD y06, alpha5, a6, y06 + LFD a6, 5 * SIZE(AO6) + + FMADD y07, alpha5, a7, y07 + LFD a7, 6 * SIZE(AO6) + FMADD y08, alpha5, a8, y08 + LFD a8, 7 * SIZE(AO6) + + FMADD y01, alpha6, a1, y01 + LFD a1, 0 * SIZE(AO7) + FMADD y02, alpha6, a2, y02 + LFD a2, 1 * SIZE(AO7) + + FMADD y03, alpha6, a3, y03 + LFD a3, 2 * SIZE(AO7) + FMADD y04, alpha6, a4, y04 + LFD a4, 3 * SIZE(AO7) + + FMADD y05, alpha6, a5, y05 + LFD a5, 4 * SIZE(AO7) + FMADD y06, alpha6, a6, y06 + LFD a6, 5 * SIZE(AO7) + + FMADD y07, alpha6, a7, y07 + LFD a7, 6 * SIZE(AO7) + FMADD y08, alpha6, a8, y08 + LFD a8, 7 * SIZE(AO7) + + FMADD y01, alpha7, a1, y01 + LFD a1, 0 * SIZE(AO8) + FMADD y02, alpha7, a2, y02 + LFD a2, 1 * SIZE(AO8) + + FMADD y03, alpha7, a3, y03 + LFD a3, 2 * SIZE(AO8) + FMADD y04, alpha7, a4, y04 + LFD a4, 3 * SIZE(AO8) + + FMADD y05, alpha7, a5, y05 + LFD a5, 4 * SIZE(AO8) + FMADD y06, alpha7, a6, y06 + LFD a6, 5 * SIZE(AO8) + + FMADD y07, alpha7, a7, y07 + LFD a7, 6 * SIZE(AO8) + FMADD y08, alpha7, a8, y08 + LFD a8, 7 * SIZE(AO8) + + FMADD y01, alpha8, a1, y01 + addi AO1, AO1, 8 * SIZE + FMADD y02, alpha8, a2, y02 + addi AO2, AO2, 8 * SIZE + FMADD y03, alpha8, a3, y03 + addi AO3, AO3, 8 * SIZE + FMADD y04, alpha8, a4, y04 + addi AO4, AO4, 8 * SIZE + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + STFD y03, 2 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + + FMADD y05, alpha8, a5, y05 + addi AO5, AO5, 8 * SIZE + FMADD y06, alpha8, a6, y06 + addi AO6, AO6, 8 * SIZE + FMADD y07, alpha8, a7, y07 + addi AO7, AO7, 8 * SIZE + FMADD y08, alpha8, a8, y08 + addi AO8, AO8, 8 * SIZE + + STFD y05, 4 * SIZE(Y1) + STFD y06, 5 * SIZE(Y1) + STFD y07, 6 * SIZE(Y1) + STFD y08, 7 * SIZE(Y1) + addi Y1, Y1, 8 * SIZE + .align 4 + +LL(16): + andi. r0, M, 4 + ble LL(17) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD a5, 0 * SIZE(AO2) + LFD a6, 1 * SIZE(AO2) + LFD a7, 2 * SIZE(AO2) + LFD a8, 3 * SIZE(AO2) + + FMADD y01, alpha1, a1, y01 + LFD a1, 0 * SIZE(AO3) + FMADD y02, alpha1, a2, y02 + LFD a2, 1 * SIZE(AO3) + FMADD y03, alpha1, a3, y03 + LFD a3, 2 * SIZE(AO3) + FMADD y04, alpha1, a4, y04 + LFD a4, 3 * SIZE(AO3) + + FMADD y01, alpha2, a5, y01 + LFD a5, 0 * SIZE(AO4) + FMADD y02, alpha2, a6, y02 + LFD a6, 1 * SIZE(AO4) + FMADD y03, alpha2, a7, y03 + LFD a7, 2 * SIZE(AO4) + FMADD y04, alpha2, a8, y04 + LFD a8, 3 * SIZE(AO4) + + FMADD y01, alpha3, a1, y01 + LFD a1, 0 * SIZE(AO5) + FMADD y02, alpha3, a2, y02 + LFD a2, 1 * SIZE(AO5) + FMADD y03, alpha3, a3, y03 + LFD a3, 2 * SIZE(AO5) + FMADD y04, alpha3, a4, y04 + LFD a4, 3 * SIZE(AO5) + + FMADD y01, alpha4, a5, y01 + LFD a5, 0 * SIZE(AO6) + FMADD y02, alpha4, a6, y02 + LFD a6, 1 * SIZE(AO6) + FMADD y03, alpha4, a7, y03 + LFD a7, 2 * SIZE(AO6) + FMADD y04, alpha4, a8, y04 + LFD a8, 3 * SIZE(AO6) + + FMADD y01, alpha5, a1, y01 + LFD a1, 0 * SIZE(AO7) + FMADD y02, alpha5, a2, y02 + LFD a2, 1 * SIZE(AO7) + FMADD y03, alpha5, a3, y03 + LFD a3, 2 * SIZE(AO7) + FMADD y04, alpha5, a4, y04 + LFD a4, 3 * SIZE(AO7) + + FMADD y01, alpha6, a5, y01 + LFD a5, 0 * SIZE(AO8) + FMADD y02, alpha6, a6, y02 + LFD a6, 1 * SIZE(AO8) + FMADD y03, alpha6, a7, y03 + LFD a7, 2 * SIZE(AO8) + FMADD y04, alpha6, a8, y04 + LFD a8, 3 * SIZE(AO8) + + FMADD y01, alpha7, a1, y01 + addi AO1, AO1, 4 * SIZE + FMADD y02, alpha7, a2, y02 + addi AO2, AO2, 4 * SIZE + FMADD y03, alpha7, a3, y03 + addi AO3, AO3, 4 * SIZE + FMADD y04, alpha7, a4, y04 + addi AO4, AO4, 4 * SIZE + + FMADD y01, alpha8, a5, y01 + addi AO5, AO5, 4 * SIZE + FMADD y02, alpha8, a6, y02 + addi AO6, AO6, 4 * SIZE + FMADD y03, alpha8, a7, y03 + addi AO7, AO7, 4 * SIZE + FMADD y04, alpha8, a8, y04 + addi AO8, AO8, 4 * SIZE + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + STFD y03, 2 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + addi Y1, Y1, 4 * SIZE + .align 4 + +LL(17): + andi. r0, M, 2 + ble LL(18) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 0 * SIZE(AO2) + LFD a4, 1 * SIZE(AO2) + + LFD a5, 0 * SIZE(AO3) + LFD a6, 1 * SIZE(AO3) + LFD a7, 0 * SIZE(AO4) + LFD a8, 1 * SIZE(AO4) + + FMADD y01, alpha1, a1, y01 + LFD a1, 0 * SIZE(AO5) + FMADD y02, alpha1, a2, y02 + LFD a2, 1 * SIZE(AO5) + FMADD y01, alpha2, a3, y01 + LFD a3, 0 * SIZE(AO6) + FMADD y02, alpha2, a4, y02 + LFD a4, 1 * SIZE(AO6) + + FMADD y01, alpha3, a5, y01 + LFD a5, 0 * SIZE(AO7) + FMADD y02, alpha3, a6, y02 + LFD a6, 1 * SIZE(AO7) + FMADD y01, alpha4, a7, y01 + LFD a7, 0 * SIZE(AO8) + FMADD y02, alpha4, a8, y02 + LFD a8, 1 * SIZE(AO8) + + FMADD y01, alpha5, a1, y01 + addi AO1, AO1, 2 * SIZE + FMADD y02, alpha5, a2, y02 + addi AO2, AO2, 2 * SIZE + FMADD y01, alpha6, a3, y01 + addi AO3, AO3, 2 * SIZE + FMADD y02, alpha6, a4, y02 + addi AO4, AO4, 2 * SIZE + + FMADD y01, alpha7, a5, y01 + addi AO5, AO5, 2 * SIZE + FMADD y02, alpha7, a6, y02 + addi AO6, AO6, 2 * SIZE + FMADD y01, alpha8, a7, y01 + addi AO7, AO7, 2 * SIZE + FMADD y02, alpha8, a8, y02 + addi AO8, AO8, 2 * SIZE + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + addi Y1, Y1, 2 * SIZE + .align 4 + +LL(18): + andi. r0, M, 1 + ble LL(19) + + LFD y01, 0 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 0 * SIZE(AO2) + LFD a3, 0 * SIZE(AO3) + LFD a4, 0 * SIZE(AO4) + LFD a5, 0 * SIZE(AO5) + LFD a6, 0 * SIZE(AO6) + LFD a7, 0 * SIZE(AO7) + LFD a8, 0 * SIZE(AO8) + + FMADD y01, alpha1, a1, y01 + FMADD y01, alpha2, a2, y01 + FMADD y01, alpha3, a3, y01 + FMADD y01, alpha4, a4, y01 + + FMADD y01, alpha5, a5, y01 + FMADD y01, alpha6, a6, y01 + FMADD y01, alpha7, a7, y01 + FMADD y01, alpha8, a8, y01 + + STFD y01, 0 * SIZE(Y1) + .align 4 + +LL(19): + addi J, J, -1 + lfd alpha, ALPHA + cmpi cr0, 0, J, 0 + bgt LL(11) + .align 4 + +LL(20): + andi. J, N, 4 + mr AO1, A + add AO2, A, LDA + ble LL(30) + .align 4 + + LFD alpha1, 0 * SIZE(X) + add X, X, INCX + LFD alpha2, 0 * SIZE(X) + add X, X, INCX + LFD alpha3, 0 * SIZE(X) + add X, X, INCX + LFD alpha4, 0 * SIZE(X) + add X, X, INCX + + FMUL alpha1, alpha, alpha1 + add AO3, AO2, LDA + FMUL alpha2, alpha, alpha2 + add AO4, AO3, LDA + FMUL alpha3, alpha, alpha3 + add A, AO4, LDA + FMUL alpha4, alpha, alpha4 + mr Y1, YY + + srawi. r0, M, 4 + mtspr CTR, r0 + ble LL(25) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + LFD y09, 8 * SIZE(Y1) + LFD y10, 9 * SIZE(Y1) + LFD y11, 10 * SIZE(Y1) + LFD y12, 11 * SIZE(Y1) + LFD y13, 12 * SIZE(Y1) + LFD y14, 13 * SIZE(Y1) + LFD y15, 14 * SIZE(Y1) + LFD y16, 15 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + bdz LL(23) + .align 4 + +LL(22): + FMADD y01, alpha1, a1, y01 + LFD a1, 8 * SIZE(AO1) + FMADD y02, alpha1, a2, y02 + LFD a2, 9 * SIZE(AO1) + FMADD y03, alpha1, a3, y03 + LFD a3, 10 * SIZE(AO1) + FMADD y04, alpha1, a4, y04 + LFD a4, 11 * SIZE(AO1) + + FMADD y05, alpha1, a5, y05 + LFD a5, 12 * SIZE(AO1) + FMADD y06, alpha1, a6, y06 + LFD a6, 13 * SIZE(AO1) + FMADD y07, alpha1, a7, y07 + LFD a7, 14 * SIZE(AO1) + FMADD y08, alpha1, a8, y08 + LFD a8, 15 * SIZE(AO1) + + FMADD y09, alpha1, a1, y09 + LFD a1, 0 * SIZE(AO2) + FMADD y10, alpha1, a2, y10 + LFD a2, 1 * SIZE(AO2) + FMADD y11, alpha1, a3, y11 + LFD a3, 2 * SIZE(AO2) + FMADD y12, alpha1, a4, y12 + LFD a4, 3 * SIZE(AO2) + + FMADD y13, alpha1, a5, y13 + LFD a5, 4 * SIZE(AO2) + FMADD y14, alpha1, a6, y14 + LFD a6, 5 * SIZE(AO2) + FMADD y15, alpha1, a7, y15 + LFD a7, 6 * SIZE(AO2) + FMADD y16, alpha1, a8, y16 + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2, a1, y01 + LFD a1, 8 * SIZE(AO2) + FMADD y02, alpha2, a2, y02 + LFD a2, 9 * SIZE(AO2) + FMADD y03, alpha2, a3, y03 + LFD a3, 10 * SIZE(AO2) + FMADD y04, alpha2, a4, y04 + LFD a4, 11 * SIZE(AO2) + + FMADD y05, alpha2, a5, y05 + LFD a5, 12 * SIZE(AO2) + FMADD y06, alpha2, a6, y06 + LFD a6, 13 * SIZE(AO2) + FMADD y07, alpha2, a7, y07 + LFD a7, 14 * SIZE(AO2) + FMADD y08, alpha2, a8, y08 + LFD a8, 15 * SIZE(AO2) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + DCBT(AO1, PREA) + DCBT(AO2, PREA) + + FMADD y09, alpha2, a1, y09 + LFD a1, 0 * SIZE(AO3) + FMADD y10, alpha2, a2, y10 + LFD a2, 1 * SIZE(AO3) + FMADD y11, alpha2, a3, y11 + LFD a3, 2 * SIZE(AO3) + FMADD y12, alpha2, a4, y12 + LFD a4, 3 * SIZE(AO3) + + FMADD y13, alpha2, a5, y13 + LFD a5, 4 * SIZE(AO3) + FMADD y14, alpha2, a6, y14 + LFD a6, 5 * SIZE(AO3) + FMADD y15, alpha2, a7, y15 + LFD a7, 6 * SIZE(AO3) + FMADD y16, alpha2, a8, y16 + LFD a8, 7 * SIZE(AO3) + + FMADD y01, alpha3, a1, y01 + LFD a1, 8 * SIZE(AO3) + FMADD y02, alpha3, a2, y02 + LFD a2, 9 * SIZE(AO3) + FMADD y03, alpha3, a3, y03 + LFD a3, 10 * SIZE(AO3) + FMADD y04, alpha3, a4, y04 + LFD a4, 11 * SIZE(AO3) + + FMADD y05, alpha3, a5, y05 + LFD a5, 12 * SIZE(AO3) + FMADD y06, alpha3, a6, y06 + LFD a6, 13 * SIZE(AO3) + FMADD y07, alpha3, a7, y07 + LFD a7, 14 * SIZE(AO3) + FMADD y08, alpha3, a8, y08 + LFD a8, 15 * SIZE(AO3) + + FMADD y09, alpha3, a1, y09 + LFD a1, 0 * SIZE(AO4) + FMADD y10, alpha3, a2, y10 + LFD a2, 1 * SIZE(AO4) + FMADD y11, alpha3, a3, y11 + LFD a3, 2 * SIZE(AO4) + FMADD y12, alpha3, a4, y12 + LFD a4, 3 * SIZE(AO4) + + FMADD y13, alpha3, a5, y13 + LFD a5, 4 * SIZE(AO4) + FMADD y14, alpha3, a6, y14 + LFD a6, 5 * SIZE(AO4) + FMADD y15, alpha3, a7, y15 + LFD a7, 6 * SIZE(AO4) + FMADD y16, alpha3, a8, y16 + LFD a8, 7 * SIZE(AO4) + + FMADD y01, alpha4, a1, y01 + LFD a1, 8 * SIZE(AO4) + FMADD y02, alpha4, a2, y02 + LFD a2, 9 * SIZE(AO4) + FMADD y03, alpha4, a3, y03 + LFD a3, 10 * SIZE(AO4) + FMADD y04, alpha4, a4, y04 + LFD a4, 11 * SIZE(AO4) + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + STFD y03, 2 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + + LFD y01, 16 * SIZE(Y1) + LFD y02, 17 * SIZE(Y1) + LFD y03, 18 * SIZE(Y1) + LFD y04, 19 * SIZE(Y1) + + FMADD y05, alpha4, a5, y05 + LFD a5, 12 * SIZE(AO4) + FMADD y06, alpha4, a6, y06 + LFD a6, 13 * SIZE(AO4) + FMADD y07, alpha4, a7, y07 + LFD a7, 14 * SIZE(AO4) + FMADD y08, alpha4, a8, y08 + LFD a8, 15 * SIZE(AO4) + + STFD y05, 4 * SIZE(Y1) + STFD y06, 5 * SIZE(Y1) + STFD y07, 6 * SIZE(Y1) + STFD y08, 7 * SIZE(Y1) + + LFD y05, 20 * SIZE(Y1) + LFD y06, 21 * SIZE(Y1) + LFD y07, 22 * SIZE(Y1) + LFD y08, 23 * SIZE(Y1) + + addi AO3, AO3, 16 * SIZE + addi AO4, AO4, 16 * SIZE + DCBT(AO3, PREA) + DCBT(AO4, PREA) + + FMADD y09, alpha4, a1, y09 + LFD a1, 0 * SIZE(AO1) + FMADD y10, alpha4, a2, y10 + LFD a2, 1 * SIZE(AO1) + FMADD y11, alpha4, a3, y11 + LFD a3, 2 * SIZE(AO1) + FMADD y12, alpha4, a4, y12 + LFD a4, 3 * SIZE(AO1) + + STFD y09, 8 * SIZE(Y1) + STFD y10, 9 * SIZE(Y1) + STFD y11, 10 * SIZE(Y1) + STFD y12, 11 * SIZE(Y1) + + LFD y09, 24 * SIZE(Y1) + LFD y10, 25 * SIZE(Y1) + LFD y11, 26 * SIZE(Y1) + LFD y12, 27 * SIZE(Y1) + + FMADD y13, alpha4, a5, y13 + LFD a5, 4 * SIZE(AO1) + FMADD y14, alpha4, a6, y14 + LFD a6, 5 * SIZE(AO1) + FMADD y15, alpha4, a7, y15 + LFD a7, 6 * SIZE(AO1) + FMADD y16, alpha4, a8, y16 + LFD a8, 7 * SIZE(AO1) + + STFD y13, 12 * SIZE(Y1) + STFD y14, 13 * SIZE(Y1) + STFD y15, 14 * SIZE(Y1) + STFD y16, 15 * SIZE(Y1) + + LFD y13, 28 * SIZE(Y1) + LFD y14, 29 * SIZE(Y1) + LFD y15, 30 * SIZE(Y1) + LFD y16, 31 * SIZE(Y1) + + addi Y1, Y1, 16 * SIZE + DCBT(Y1, PREC) + bdnz LL(22) + .align 4 + +LL(23): + FMADD y01, alpha1, a1, y01 + LFD a1, 8 * SIZE(AO1) + FMADD y02, alpha1, a2, y02 + LFD a2, 9 * SIZE(AO1) + FMADD y03, alpha1, a3, y03 + LFD a3, 10 * SIZE(AO1) + FMADD y04, alpha1, a4, y04 + LFD a4, 11 * SIZE(AO1) + + FMADD y05, alpha1, a5, y05 + LFD a5, 12 * SIZE(AO1) + FMADD y06, alpha1, a6, y06 + LFD a6, 13 * SIZE(AO1) + FMADD y07, alpha1, a7, y07 + LFD a7, 14 * SIZE(AO1) + FMADD y08, alpha1, a8, y08 + LFD a8, 15 * SIZE(AO1) + + FMADD y09, alpha1, a1, y09 + LFD a1, 0 * SIZE(AO2) + FMADD y10, alpha1, a2, y10 + LFD a2, 1 * SIZE(AO2) + FMADD y11, alpha1, a3, y11 + LFD a3, 2 * SIZE(AO2) + FMADD y12, alpha1, a4, y12 + LFD a4, 3 * SIZE(AO2) + + FMADD y13, alpha1, a5, y13 + LFD a5, 4 * SIZE(AO2) + FMADD y14, alpha1, a6, y14 + LFD a6, 5 * SIZE(AO2) + FMADD y15, alpha1, a7, y15 + LFD a7, 6 * SIZE(AO2) + FMADD y16, alpha1, a8, y16 + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2, a1, y01 + LFD a1, 8 * SIZE(AO2) + FMADD y02, alpha2, a2, y02 + LFD a2, 9 * SIZE(AO2) + FMADD y03, alpha2, a3, y03 + LFD a3, 10 * SIZE(AO2) + FMADD y04, alpha2, a4, y04 + LFD a4, 11 * SIZE(AO2) + + FMADD y05, alpha2, a5, y05 + LFD a5, 12 * SIZE(AO2) + FMADD y06, alpha2, a6, y06 + LFD a6, 13 * SIZE(AO2) + FMADD y07, alpha2, a7, y07 + LFD a7, 14 * SIZE(AO2) + FMADD y08, alpha2, a8, y08 + LFD a8, 15 * SIZE(AO2) + + FMADD y09, alpha2, a1, y09 + LFD a1, 0 * SIZE(AO3) + FMADD y10, alpha2, a2, y10 + LFD a2, 1 * SIZE(AO3) + FMADD y11, alpha2, a3, y11 + LFD a3, 2 * SIZE(AO3) + FMADD y12, alpha2, a4, y12 + LFD a4, 3 * SIZE(AO3) + + FMADD y13, alpha2, a5, y13 + LFD a5, 4 * SIZE(AO3) + FMADD y14, alpha2, a6, y14 + LFD a6, 5 * SIZE(AO3) + FMADD y15, alpha2, a7, y15 + LFD a7, 6 * SIZE(AO3) + FMADD y16, alpha2, a8, y16 + LFD a8, 7 * SIZE(AO3) + + FMADD y01, alpha3, a1, y01 + LFD a1, 8 * SIZE(AO3) + FMADD y02, alpha3, a2, y02 + LFD a2, 9 * SIZE(AO3) + FMADD y03, alpha3, a3, y03 + LFD a3, 10 * SIZE(AO3) + FMADD y04, alpha3, a4, y04 + LFD a4, 11 * SIZE(AO3) + + FMADD y05, alpha3, a5, y05 + LFD a5, 12 * SIZE(AO3) + FMADD y06, alpha3, a6, y06 + LFD a6, 13 * SIZE(AO3) + FMADD y07, alpha3, a7, y07 + LFD a7, 14 * SIZE(AO3) + FMADD y08, alpha3, a8, y08 + LFD a8, 15 * SIZE(AO3) + + FMADD y09, alpha3, a1, y09 + LFD a1, 0 * SIZE(AO4) + FMADD y10, alpha3, a2, y10 + LFD a2, 1 * SIZE(AO4) + FMADD y11, alpha3, a3, y11 + LFD a3, 2 * SIZE(AO4) + FMADD y12, alpha3, a4, y12 + LFD a4, 3 * SIZE(AO4) + + FMADD y13, alpha3, a5, y13 + LFD a5, 4 * SIZE(AO4) + FMADD y14, alpha3, a6, y14 + LFD a6, 5 * SIZE(AO4) + FMADD y15, alpha3, a7, y15 + LFD a7, 6 * SIZE(AO4) + FMADD y16, alpha3, a8, y16 + LFD a8, 7 * SIZE(AO4) + + FMADD y01, alpha4, a1, y01 + LFD a1, 8 * SIZE(AO4) + FMADD y02, alpha4, a2, y02 + LFD a2, 9 * SIZE(AO4) + FMADD y03, alpha4, a3, y03 + LFD a3, 10 * SIZE(AO4) + FMADD y04, alpha4, a4, y04 + LFD a4, 11 * SIZE(AO4) + + FMADD y05, alpha4, a5, y05 + LFD a5, 12 * SIZE(AO4) + FMADD y06, alpha4, a6, y06 + LFD a6, 13 * SIZE(AO4) + FMADD y07, alpha4, a7, y07 + LFD a7, 14 * SIZE(AO4) + FMADD y08, alpha4, a8, y08 + LFD a8, 15 * SIZE(AO4) + + FMADD y09, alpha4, a1, y09 + addi AO1, AO1, 16 * SIZE + FMADD y10, alpha4, a2, y10 + addi AO2, AO2, 16 * SIZE + FMADD y11, alpha4, a3, y11 + addi AO3, AO3, 16 * SIZE + FMADD y12, alpha4, a4, y12 + addi AO4, AO4, 16 * SIZE + + FMADD y13, alpha4, a5, y13 + FMADD y14, alpha4, a6, y14 + FMADD y15, alpha4, a7, y15 + FMADD y16, alpha4, a8, y16 + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + STFD y03, 2 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + STFD y05, 4 * SIZE(Y1) + STFD y06, 5 * SIZE(Y1) + STFD y07, 6 * SIZE(Y1) + STFD y08, 7 * SIZE(Y1) + STFD y09, 8 * SIZE(Y1) + STFD y10, 9 * SIZE(Y1) + STFD y11, 10 * SIZE(Y1) + STFD y12, 11 * SIZE(Y1) + STFD y13, 12 * SIZE(Y1) + STFD y14, 13 * SIZE(Y1) + STFD y15, 14 * SIZE(Y1) + STFD y16, 15 * SIZE(Y1) + addi Y1, Y1, 16 * SIZE + .align 4 + +LL(25): + andi. r0, M, 15 + ble LL(30) + + andi. r0, M, 8 + ble LL(26) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + FMADD y01, alpha1, a1, y01 + LFD a1, 0 * SIZE(AO2) + FMADD y02, alpha1, a2, y02 + LFD a2, 1 * SIZE(AO2) + FMADD y03, alpha1, a3, y03 + LFD a3, 2 * SIZE(AO2) + FMADD y04, alpha1, a4, y04 + LFD a4, 3 * SIZE(AO2) + + FMADD y05, alpha1, a5, y05 + LFD a5, 4 * SIZE(AO2) + FMADD y06, alpha1, a6, y06 + LFD a6, 5 * SIZE(AO2) + FMADD y07, alpha1, a7, y07 + LFD a7, 6 * SIZE(AO2) + FMADD y08, alpha1, a8, y08 + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2, a1, y01 + LFD a1, 0 * SIZE(AO3) + FMADD y02, alpha2, a2, y02 + LFD a2, 1 * SIZE(AO3) + FMADD y03, alpha2, a3, y03 + LFD a3, 2 * SIZE(AO3) + FMADD y04, alpha2, a4, y04 + LFD a4, 3 * SIZE(AO3) + FMADD y05, alpha2, a5, y05 + LFD a5, 4 * SIZE(AO3) + FMADD y06, alpha2, a6, y06 + LFD a6, 5 * SIZE(AO3) + FMADD y07, alpha2, a7, y07 + LFD a7, 6 * SIZE(AO3) + FMADD y08, alpha2, a8, y08 + LFD a8, 7 * SIZE(AO3) + + FMADD y01, alpha3, a1, y01 + LFD a1, 0 * SIZE(AO4) + FMADD y02, alpha3, a2, y02 + LFD a2, 1 * SIZE(AO4) + FMADD y03, alpha3, a3, y03 + LFD a3, 2 * SIZE(AO4) + FMADD y04, alpha3, a4, y04 + LFD a4, 3 * SIZE(AO4) + + FMADD y05, alpha3, a5, y05 + LFD a5, 4 * SIZE(AO4) + FMADD y06, alpha3, a6, y06 + LFD a6, 5 * SIZE(AO4) + FMADD y07, alpha3, a7, y07 + LFD a7, 6 * SIZE(AO4) + FMADD y08, alpha3, a8, y08 + LFD a8, 7 * SIZE(AO4) + + FMADD y01, alpha4, a1, y01 + addi AO1, AO1, 8 * SIZE + FMADD y02, alpha4, a2, y02 + addi AO2, AO2, 8 * SIZE + FMADD y03, alpha4, a3, y03 + addi AO3, AO3, 8 * SIZE + FMADD y04, alpha4, a4, y04 + addi AO4, AO4, 8 * SIZE + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + STFD y03, 2 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + + FMADD y05, alpha4, a5, y05 + FMADD y06, alpha4, a6, y06 + FMADD y07, alpha4, a7, y07 + FMADD y08, alpha4, a8, y08 + + STFD y05, 4 * SIZE(Y1) + STFD y06, 5 * SIZE(Y1) + STFD y07, 6 * SIZE(Y1) + STFD y08, 7 * SIZE(Y1) + addi Y1, Y1, 8 * SIZE + .align 4 + +LL(26): + andi. r0, M, 4 + ble LL(27) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD a5, 0 * SIZE(AO2) + LFD a6, 1 * SIZE(AO2) + LFD a7, 2 * SIZE(AO2) + LFD a8, 3 * SIZE(AO2) + + FMADD y01, alpha1, a1, y01 + LFD a1, 0 * SIZE(AO3) + FMADD y02, alpha1, a2, y02 + LFD a2, 1 * SIZE(AO3) + FMADD y03, alpha1, a3, y03 + LFD a3, 2 * SIZE(AO3) + FMADD y04, alpha1, a4, y04 + LFD a4, 3 * SIZE(AO3) + + FMADD y01, alpha2, a5, y01 + LFD a5, 0 * SIZE(AO4) + FMADD y02, alpha2, a6, y02 + LFD a6, 1 * SIZE(AO4) + FMADD y03, alpha2, a7, y03 + LFD a7, 2 * SIZE(AO4) + FMADD y04, alpha2, a8, y04 + LFD a8, 3 * SIZE(AO4) + + FMADD y01, alpha3, a1, y01 + addi AO1, AO1, 4 * SIZE + FMADD y02, alpha3, a2, y02 + addi AO2, AO2, 4 * SIZE + FMADD y03, alpha3, a3, y03 + addi AO3, AO3, 4 * SIZE + FMADD y04, alpha3, a4, y04 + addi AO4, AO4, 4 * SIZE + + FMADD y01, alpha4, a5, y01 + FMADD y02, alpha4, a6, y02 + FMADD y03, alpha4, a7, y03 + FMADD y04, alpha4, a8, y04 + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + STFD y03, 2 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + addi Y1, Y1, 4 * SIZE + .align 4 + +LL(27): + andi. r0, M, 2 + ble LL(28) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 0 * SIZE(AO2) + LFD a4, 1 * SIZE(AO2) + + LFD a5, 0 * SIZE(AO3) + LFD a6, 1 * SIZE(AO3) + LFD a7, 0 * SIZE(AO4) + LFD a8, 1 * SIZE(AO4) + + FMADD y01, alpha1, a1, y01 + addi AO1, AO1, 2 * SIZE + FMADD y02, alpha1, a2, y02 + addi AO2, AO2, 2 * SIZE + FMADD y01, alpha2, a3, y01 + addi AO3, AO3, 2 * SIZE + FMADD y02, alpha2, a4, y02 + addi AO4, AO4, 2 * SIZE + + FMADD y01, alpha3, a5, y01 + FMADD y02, alpha3, a6, y02 + FMADD y01, alpha4, a7, y01 + FMADD y02, alpha4, a8, y02 + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + addi Y1, Y1, 2 * SIZE + .align 4 + +LL(28): + andi. r0, M, 1 + ble LL(30) + + LFD y01, 0 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 0 * SIZE(AO2) + LFD a3, 0 * SIZE(AO3) + LFD a4, 0 * SIZE(AO4) + + FMADD y01, alpha1, a1, y01 + FMADD y01, alpha2, a2, y01 + FMADD y01, alpha3, a3, y01 + FMADD y01, alpha4, a4, y01 + + STFD y01, 0 * SIZE(Y1) + .align 4 + +LL(30): + andi. J, N, 2 + lfd alpha, ALPHA + ble LL(40) + .align 4 + + LFD alpha1, 0 * SIZE(X) + add X, X, INCX + LFD alpha2, 0 * SIZE(X) + add X, X, INCX + + FMUL alpha1, alpha, alpha1 + FMUL alpha2, alpha, alpha2 + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + mr Y1, YY + + srawi. r0, M, 4 + mtspr CTR, r0 + ble LL(35) + + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + LFD y09, 8 * SIZE(Y1) + LFD y10, 9 * SIZE(Y1) + LFD y11, 10 * SIZE(Y1) + LFD y12, 11 * SIZE(Y1) + LFD y13, 12 * SIZE(Y1) + LFD y14, 13 * SIZE(Y1) + LFD y15, 14 * SIZE(Y1) + LFD y16, 15 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + bdz LL(33) + .align 4 + +LL(32): + FMADD y01, alpha1, a1, y01 + LFD a1, 8 * SIZE(AO1) + FMADD y02, alpha1, a2, y02 + LFD a2, 9 * SIZE(AO1) + FMADD y03, alpha1, a3, y03 + LFD a3, 10 * SIZE(AO1) + FMADD y04, alpha1, a4, y04 + LFD a4, 11 * SIZE(AO1) + + FMADD y05, alpha1, a5, y05 + LFD a5, 12 * SIZE(AO1) + FMADD y06, alpha1, a6, y06 + LFD a6, 13 * SIZE(AO1) + FMADD y07, alpha1, a7, y07 + LFD a7, 14 * SIZE(AO1) + FMADD y08, alpha1, a8, y08 + LFD a8, 15 * SIZE(AO1) + + FMADD y09, alpha1, a1, y09 + LFD a1, 0 * SIZE(AO2) + FMADD y10, alpha1, a2, y10 + LFD a2, 1 * SIZE(AO2) + FMADD y11, alpha1, a3, y11 + LFD a3, 2 * SIZE(AO2) + FMADD y12, alpha1, a4, y12 + LFD a4, 3 * SIZE(AO2) + + FMADD y13, alpha1, a5, y13 + LFD a5, 4 * SIZE(AO2) + FMADD y14, alpha1, a6, y14 + LFD a6, 5 * SIZE(AO2) + FMADD y15, alpha1, a7, y15 + LFD a7, 6 * SIZE(AO2) + FMADD y16, alpha1, a8, y16 + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2, a1, y01 + LFD a1, 8 * SIZE(AO2) + FMADD y02, alpha2, a2, y02 + LFD a2, 9 * SIZE(AO2) + FMADD y03, alpha2, a3, y03 + LFD a3, 10 * SIZE(AO2) + FMADD y04, alpha2, a4, y04 + LFD a4, 11 * SIZE(AO2) + + FMADD y05, alpha2, a5, y05 + LFD a5, 12 * SIZE(AO2) + FMADD y06, alpha2, a6, y06 + LFD a6, 13 * SIZE(AO2) + FMADD y07, alpha2, a7, y07 + LFD a7, 14 * SIZE(AO2) + FMADD y08, alpha2, a8, y08 + LFD a8, 15 * SIZE(AO2) + + FMADD y09, alpha2, a1, y09 + LFD a1, 16 * SIZE(AO1) + FMADD y10, alpha2, a2, y10 + LFD a2, 17 * SIZE(AO1) + FMADD y11, alpha2, a3, y11 + LFD a3, 18 * SIZE(AO1) + FMADD y12, alpha2, a4, y12 + LFD a4, 19 * SIZE(AO1) + + FMADD y13, alpha2, a5, y13 + LFD a5, 20 * SIZE(AO1) + FMADD y14, alpha2, a6, y14 + LFD a6, 21 * SIZE(AO1) + FMADD y15, alpha2, a7, y15 + LFD a7, 22 * SIZE(AO1) + FMADD y16, alpha2, a8, y16 + LFD a8, 23 * SIZE(AO1) + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + STFD y03, 2 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + + LFD y01, 16 * SIZE(Y1) + LFD y02, 17 * SIZE(Y1) + LFD y03, 18 * SIZE(Y1) + LFD y04, 19 * SIZE(Y1) + + STFD y05, 4 * SIZE(Y1) + STFD y06, 5 * SIZE(Y1) + STFD y07, 6 * SIZE(Y1) + STFD y08, 7 * SIZE(Y1) + + LFD y05, 20 * SIZE(Y1) + LFD y06, 21 * SIZE(Y1) + LFD y07, 22 * SIZE(Y1) + LFD y08, 23 * SIZE(Y1) + + STFD y09, 8 * SIZE(Y1) + STFD y10, 9 * SIZE(Y1) + STFD y11, 10 * SIZE(Y1) + STFD y12, 11 * SIZE(Y1) + + LFD y09, 24 * SIZE(Y1) + LFD y10, 25 * SIZE(Y1) + LFD y11, 26 * SIZE(Y1) + LFD y12, 27 * SIZE(Y1) + + STFD y13, 12 * SIZE(Y1) + STFD y14, 13 * SIZE(Y1) + STFD y15, 14 * SIZE(Y1) + STFD y16, 15 * SIZE(Y1) + + LFD y13, 28 * SIZE(Y1) + LFD y14, 29 * SIZE(Y1) + LFD y15, 30 * SIZE(Y1) + LFD y16, 31 * SIZE(Y1) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi Y1, Y1, 16 * SIZE + + DCBT(AO1, PREA) + DCBT(AO2, PREA) + DCBT(Y1, PREC) + + bdnz LL(32) + .align 4 + +LL(33): + FMADD y01, alpha1, a1, y01 + LFD a1, 8 * SIZE(AO1) + FMADD y02, alpha1, a2, y02 + LFD a2, 9 * SIZE(AO1) + FMADD y03, alpha1, a3, y03 + LFD a3, 10 * SIZE(AO1) + FMADD y04, alpha1, a4, y04 + LFD a4, 11 * SIZE(AO1) + + FMADD y05, alpha1, a5, y05 + LFD a5, 12 * SIZE(AO1) + FMADD y06, alpha1, a6, y06 + LFD a6, 13 * SIZE(AO1) + FMADD y07, alpha1, a7, y07 + LFD a7, 14 * SIZE(AO1) + FMADD y08, alpha1, a8, y08 + LFD a8, 15 * SIZE(AO1) + + FMADD y09, alpha1, a1, y09 + LFD a1, 0 * SIZE(AO2) + FMADD y10, alpha1, a2, y10 + LFD a2, 1 * SIZE(AO2) + FMADD y11, alpha1, a3, y11 + LFD a3, 2 * SIZE(AO2) + FMADD y12, alpha1, a4, y12 + LFD a4, 3 * SIZE(AO2) + + FMADD y13, alpha1, a5, y13 + LFD a5, 4 * SIZE(AO2) + FMADD y14, alpha1, a6, y14 + LFD a6, 5 * SIZE(AO2) + FMADD y15, alpha1, a7, y15 + LFD a7, 6 * SIZE(AO2) + FMADD y16, alpha1, a8, y16 + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2, a1, y01 + LFD a1, 8 * SIZE(AO2) + FMADD y02, alpha2, a2, y02 + LFD a2, 9 * SIZE(AO2) + FMADD y03, alpha2, a3, y03 + LFD a3, 10 * SIZE(AO2) + FMADD y04, alpha2, a4, y04 + LFD a4, 11 * SIZE(AO2) + + FMADD y05, alpha2, a5, y05 + LFD a5, 12 * SIZE(AO2) + FMADD y06, alpha2, a6, y06 + LFD a6, 13 * SIZE(AO2) + FMADD y07, alpha2, a7, y07 + LFD a7, 14 * SIZE(AO2) + FMADD y08, alpha2, a8, y08 + LFD a8, 15 * SIZE(AO2) + + FMADD y09, alpha2, a1, y09 + FMADD y10, alpha2, a2, y10 + FMADD y11, alpha2, a3, y11 + FMADD y12, alpha2, a4, y12 + FMADD y13, alpha2, a5, y13 + FMADD y14, alpha2, a6, y14 + FMADD y15, alpha2, a7, y15 + FMADD y16, alpha2, a8, y16 + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + STFD y03, 2 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + STFD y05, 4 * SIZE(Y1) + STFD y06, 5 * SIZE(Y1) + STFD y07, 6 * SIZE(Y1) + STFD y08, 7 * SIZE(Y1) + STFD y09, 8 * SIZE(Y1) + STFD y10, 9 * SIZE(Y1) + STFD y11, 10 * SIZE(Y1) + STFD y12, 11 * SIZE(Y1) + STFD y13, 12 * SIZE(Y1) + STFD y14, 13 * SIZE(Y1) + STFD y15, 14 * SIZE(Y1) + STFD y16, 15 * SIZE(Y1) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi Y1, Y1, 16 * SIZE + .align 4 + +LL(35): + andi. r0, M, 15 + ble LL(40) + + andi. r0, M, 8 + ble LL(36) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + FMADD y01, alpha1, a1, y01 + LFD a1, 0 * SIZE(AO2) + FMADD y02, alpha1, a2, y02 + LFD a2, 1 * SIZE(AO2) + FMADD y03, alpha1, a3, y03 + LFD a3, 2 * SIZE(AO2) + FMADD y04, alpha1, a4, y04 + LFD a4, 3 * SIZE(AO2) + FMADD y05, alpha1, a5, y05 + LFD a5, 4 * SIZE(AO2) + FMADD y06, alpha1, a6, y06 + LFD a6, 5 * SIZE(AO2) + FMADD y07, alpha1, a7, y07 + LFD a7, 6 * SIZE(AO2) + FMADD y08, alpha1, a8, y08 + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2, a1, y01 + FMADD y02, alpha2, a2, y02 + FMADD y03, alpha2, a3, y03 + FMADD y04, alpha2, a4, y04 + FMADD y05, alpha2, a5, y05 + FMADD y06, alpha2, a6, y06 + FMADD y07, alpha2, a7, y07 + FMADD y08, alpha2, a8, y08 + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + STFD y03, 2 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + STFD y05, 4 * SIZE(Y1) + STFD y06, 5 * SIZE(Y1) + STFD y07, 6 * SIZE(Y1) + STFD y08, 7 * SIZE(Y1) + + addi AO1, AO1, 8 * SIZE + addi AO2, AO2, 8 * SIZE + addi Y1, Y1, 8 * SIZE + .align 4 + +LL(36): + andi. r0, M, 4 + ble LL(37) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD a5, 0 * SIZE(AO2) + LFD a6, 1 * SIZE(AO2) + LFD a7, 2 * SIZE(AO2) + LFD a8, 3 * SIZE(AO2) + + FMADD y01, alpha1, a1, y01 + FMADD y02, alpha1, a2, y02 + FMADD y03, alpha1, a3, y03 + FMADD y04, alpha1, a4, y04 + + FMADD y01, alpha2, a5, y01 + FMADD y02, alpha2, a6, y02 + FMADD y03, alpha2, a7, y03 + FMADD y04, alpha2, a8, y04 + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + STFD y03, 2 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + + addi AO1, AO1, 4 * SIZE + addi AO2, AO2, 4 * SIZE + addi Y1, Y1, 4 * SIZE + .align 4 + +LL(37): + andi. r0, M, 2 + ble LL(38) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 0 * SIZE(AO2) + LFD a4, 1 * SIZE(AO2) + + FMADD y01, alpha1, a1, y01 + FMADD y02, alpha1, a2, y02 + FMADD y01, alpha2, a3, y01 + FMADD y02, alpha2, a4, y02 + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + + addi AO1, AO1, 2 * SIZE + addi AO2, AO2, 2 * SIZE + addi Y1, Y1, 2 * SIZE + .align 4 + +LL(38): + andi. r0, M, 1 + ble LL(40) + + LFD y01, 0 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 0 * SIZE(AO2) + + FMADD y01, alpha1, a1, y01 + FMADD y01, alpha2, a2, y01 + + STFD y01, 0 * SIZE(Y1) + .align 4 + +LL(40): + andi. J, N, 1 + lfd alpha, ALPHA + ble LL(990) + .align 4 + + LFD alpha1, 0 * SIZE(X) + FMUL alpha1, alpha, alpha1 + + mr AO1, A + mr Y1, YY + + srawi. r0, M, 4 + mtspr CTR, r0 + ble LL(45) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + LFD y09, 8 * SIZE(Y1) + LFD y10, 9 * SIZE(Y1) + LFD y11, 10 * SIZE(Y1) + LFD y12, 11 * SIZE(Y1) + LFD y13, 12 * SIZE(Y1) + LFD y14, 13 * SIZE(Y1) + LFD y15, 14 * SIZE(Y1) + LFD y16, 15 * SIZE(Y1) + + bdz LL(43) + .align 4 + +LL(42): + FMADD y01, alpha1, a1, y01 + LFD a1, 8 * SIZE(AO1) + FMADD y02, alpha1, a2, y02 + LFD a2, 9 * SIZE(AO1) + FMADD y03, alpha1, a3, y03 + LFD a3, 10 * SIZE(AO1) + FMADD y04, alpha1, a4, y04 + LFD a4, 11 * SIZE(AO1) + + FMADD y05, alpha1, a5, y05 + LFD a5, 12 * SIZE(AO1) + FMADD y06, alpha1, a6, y06 + LFD a6, 13 * SIZE(AO1) + FMADD y07, alpha1, a7, y07 + LFD a7, 14 * SIZE(AO1) + FMADD y08, alpha1, a8, y08 + LFD a8, 15 * SIZE(AO1) + + FMADD y09, alpha1, a1, y09 + LFD a1, 16 * SIZE(AO1) + FMADD y10, alpha1, a2, y10 + LFD a2, 17 * SIZE(AO1) + FMADD y11, alpha1, a3, y11 + LFD a3, 18 * SIZE(AO1) + FMADD y12, alpha1, a4, y12 + LFD a4, 19 * SIZE(AO1) + + FMADD y13, alpha1, a5, y13 + LFD a5, 20 * SIZE(AO1) + FMADD y14, alpha1, a6, y14 + LFD a6, 21 * SIZE(AO1) + FMADD y15, alpha1, a7, y15 + LFD a7, 22 * SIZE(AO1) + FMADD y16, alpha1, a8, y16 + LFD a8, 23 * SIZE(AO1) + + STFD y01, 0 * SIZE(Y1) + LFD y01, 16 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + LFD y02, 17 * SIZE(Y1) + + STFD y03, 2 * SIZE(Y1) + LFD y03, 18 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + LFD y04, 19 * SIZE(Y1) + + STFD y05, 4 * SIZE(Y1) + LFD y05, 20 * SIZE(Y1) + STFD y06, 5 * SIZE(Y1) + LFD y06, 21 * SIZE(Y1) + + STFD y07, 6 * SIZE(Y1) + LFD y07, 22 * SIZE(Y1) + STFD y08, 7 * SIZE(Y1) + LFD y08, 23 * SIZE(Y1) + + STFD y09, 8 * SIZE(Y1) + LFD y09, 24 * SIZE(Y1) + STFD y10, 9 * SIZE(Y1) + LFD y10, 25 * SIZE(Y1) + + STFD y11, 10 * SIZE(Y1) + LFD y11, 26 * SIZE(Y1) + STFD y12, 11 * SIZE(Y1) + LFD y12, 27 * SIZE(Y1) + + STFD y13, 12 * SIZE(Y1) + LFD y13, 28 * SIZE(Y1) + STFD y14, 13 * SIZE(Y1) + LFD y14, 29 * SIZE(Y1) + + STFD y15, 14 * SIZE(Y1) + LFD y15, 30 * SIZE(Y1) + STFD y16, 15 * SIZE(Y1) + LFD y16, 31 * SIZE(Y1) + + addi AO1, AO1, 16 * SIZE + addi Y1, Y1, 16 * SIZE + + DCBT(AO1, PREA) + DCBT(Y1, PREC) + + bdnz LL(42) + .align 4 + +LL(43): + FMADD y01, alpha1, a1, y01 + LFD a1, 8 * SIZE(AO1) + FMADD y02, alpha1, a2, y02 + LFD a2, 9 * SIZE(AO1) + FMADD y03, alpha1, a3, y03 + LFD a3, 10 * SIZE(AO1) + FMADD y04, alpha1, a4, y04 + LFD a4, 11 * SIZE(AO1) + + FMADD y05, alpha1, a5, y05 + LFD a5, 12 * SIZE(AO1) + FMADD y06, alpha1, a6, y06 + LFD a6, 13 * SIZE(AO1) + FMADD y07, alpha1, a7, y07 + LFD a7, 14 * SIZE(AO1) + FMADD y08, alpha1, a8, y08 + LFD a8, 15 * SIZE(AO1) + + FMADD y09, alpha1, a1, y09 + FMADD y10, alpha1, a2, y10 + FMADD y11, alpha1, a3, y11 + FMADD y12, alpha1, a4, y12 + FMADD y13, alpha1, a5, y13 + FMADD y14, alpha1, a6, y14 + FMADD y15, alpha1, a7, y15 + FMADD y16, alpha1, a8, y16 + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + STFD y03, 2 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + STFD y05, 4 * SIZE(Y1) + STFD y06, 5 * SIZE(Y1) + STFD y07, 6 * SIZE(Y1) + STFD y08, 7 * SIZE(Y1) + + STFD y09, 8 * SIZE(Y1) + STFD y10, 9 * SIZE(Y1) + STFD y11, 10 * SIZE(Y1) + STFD y12, 11 * SIZE(Y1) + STFD y13, 12 * SIZE(Y1) + STFD y14, 13 * SIZE(Y1) + STFD y15, 14 * SIZE(Y1) + STFD y16, 15 * SIZE(Y1) + + addi AO1, AO1, 16 * SIZE + addi Y1, Y1, 16 * SIZE + .align 4 + +LL(45): + andi. r0, M, 15 + ble LL(990) + + andi. r0, M, 8 + ble LL(46) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + FMADD y01, alpha1, a1, y01 + FMADD y02, alpha1, a2, y02 + FMADD y03, alpha1, a3, y03 + FMADD y04, alpha1, a4, y04 + + FMADD y05, alpha1, a5, y05 + FMADD y06, alpha1, a6, y06 + FMADD y07, alpha1, a7, y07 + FMADD y08, alpha1, a8, y08 + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + STFD y03, 2 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + + STFD y05, 4 * SIZE(Y1) + STFD y06, 5 * SIZE(Y1) + STFD y07, 6 * SIZE(Y1) + STFD y08, 7 * SIZE(Y1) + + addi AO1, AO1, 8 * SIZE + addi Y1, Y1, 8 * SIZE + .align 4 + +LL(46): + andi. r0, M, 4 + ble LL(47) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + FMADD y01, alpha1, a1, y01 + FMADD y02, alpha1, a2, y02 + FMADD y03, alpha1, a3, y03 + FMADD y04, alpha1, a4, y04 + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + STFD y03, 2 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + + addi AO1, AO1, 4 * SIZE + addi Y1, Y1, 4 * SIZE + .align 4 + +LL(47): + andi. r0, M, 2 + ble LL(48) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + + FMADD y01, alpha1, a1, y01 + FMADD y02, alpha1, a2, y02 + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + + addi AO1, AO1, 2 * SIZE + addi Y1, Y1, 2 * SIZE + .align 4 + +LL(48): + andi. r0, M, 1 + ble LL(990) + + LFD y01, 0 * SIZE(Y1) + LFD a1, 0 * SIZE(AO1) + + FMADD y01, alpha1, a1, y01 + STFD y01, 0 * SIZE(Y1) + .align 4 + +LL(990): + cmpi cr0, 0, INCY, SIZE + beq LL(999) + + mr YY, BUFFER + mr Y1, Y + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(995) + .align 4 + +LL(991): + LFD f0, 0 * SIZE(Y) + add Y, Y, INCY + LFD f1, 0 * SIZE(Y) + add Y, Y, INCY + LFD f2, 0 * SIZE(Y) + add Y, Y, INCY + LFD f3, 0 * SIZE(Y) + add Y, Y, INCY + LFD f4, 0 * SIZE(Y) + add Y, Y, INCY + LFD f5, 0 * SIZE(Y) + add Y, Y, INCY + LFD f6, 0 * SIZE(Y) + add Y, Y, INCY + LFD f7, 0 * SIZE(Y) + add Y, Y, INCY + + LFD f8, 0 * SIZE(YY) + LFD f9, 1 * SIZE(YY) + LFD f10, 2 * SIZE(YY) + LFD f11, 3 * SIZE(YY) + LFD f12, 4 * SIZE(YY) + LFD f13, 5 * SIZE(YY) + LFD f14, 6 * SIZE(YY) + LFD f15, 7 * SIZE(YY) + addi YY, YY, 8 * SIZE + + FADD f8, f8, f0 + FADD f9, f9, f1 + FADD f10, f10, f2 + FADD f11, f11, f3 + FADD f12, f12, f4 + FADD f13, f13, f5 + FADD f14, f14, f6 + FADD f15, f15, f7 + + STFD f8, 0 * SIZE(Y1) + add Y1, Y1, INCY + STFD f9, 0 * SIZE(Y1) + add Y1, Y1, INCY + STFD f10, 0 * SIZE(Y1) + add Y1, Y1, INCY + STFD f11, 0 * SIZE(Y1) + add Y1, Y1, INCY + STFD f12, 0 * SIZE(Y1) + add Y1, Y1, INCY + STFD f13, 0 * SIZE(Y1) + add Y1, Y1, INCY + STFD f14, 0 * SIZE(Y1) + add Y1, Y1, INCY + STFD f15, 0 * SIZE(Y1) + add Y1, Y1, INCY + bdnz LL(991) + .align 4 + +LL(995): + andi. J, M, 4 + ble LL(996) + + LFD f0, 0 * SIZE(Y) + add Y, Y, INCY + LFD f1, 0 * SIZE(Y) + add Y, Y, INCY + LFD f2, 0 * SIZE(Y) + add Y, Y, INCY + LFD f3, 0 * SIZE(Y) + add Y, Y, INCY + + LFD f8, 0 * SIZE(YY) + LFD f9, 1 * SIZE(YY) + LFD f10, 2 * SIZE(YY) + LFD f11, 3 * SIZE(YY) + addi YY, YY, 4 * SIZE + + FADD f8, f8, f0 + FADD f9, f9, f1 + FADD f10, f10, f2 + FADD f11, f11, f3 + + STFD f8, 0 * SIZE(Y1) + add Y1, Y1, INCY + STFD f9, 0 * SIZE(Y1) + add Y1, Y1, INCY + STFD f10, 0 * SIZE(Y1) + add Y1, Y1, INCY + STFD f11, 0 * SIZE(Y1) + add Y1, Y1, INCY + .align 4 + +LL(996): + andi. J, M, 2 + ble LL(997) + + LFD f0, 0 * SIZE(Y) + add Y, Y, INCY + LFD f1, 0 * SIZE(Y) + add Y, Y, INCY + + LFD f8, 0 * SIZE(YY) + LFD f9, 1 * SIZE(YY) + addi YY, YY, 2 * SIZE + + FADD f8, f8, f0 + FADD f9, f9, f1 + + STFD f8, 0 * SIZE(Y1) + add Y1, Y1, INCY + STFD f9, 0 * SIZE(Y1) + add Y1, Y1, INCY + .align 4 + +LL(997): + andi. J, M, 1 + ble LL(999) + + LFD f0, 0 * SIZE(Y) + LFD f8, 0 * SIZE(YY) + + FADD f8, f8, f0 + + STFD f8, 0 * SIZE(Y1) + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r14, 144(SP) + ld r15, 152(SP) + ld r16, 160(SP) + ld r17, 168(SP) + ld r18, 176(SP) + ld r19, 184(SP) + ld r20, 192(SP) + ld r21, 200(SP) + ld r22, 208(SP) + ld r23, 216(SP) + ld r24, 224(SP) + ld r25, 232(SP) + ld r26, 240(SP) + ld r27, 248(SP) +#else + lwz r14, 144(SP) + lwz r15, 148(SP) + lwz r16, 152(SP) + lwz r17, 156(SP) + lwz r18, 160(SP) + lwz r19, 164(SP) + lwz r20, 168(SP) + lwz r21, 172(SP) + lwz r22, 176(SP) + lwz r23, 180(SP) + lwz r24, 184(SP) + lwz r25, 188(SP) + lwz r26, 192(SP) + lwz r27, 196(SP) +#endif + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/gemv_n_ppc440.S b/kernel/power/gemv_n_ppc440.S new file mode 100644 index 0000000..baedebc --- /dev/null +++ b/kernel/power/gemv_n_ppc440.S @@ -0,0 +1,1185 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define M r3 +#define N r4 +#define A r6 +#define LDA r7 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#else +#define M r3 +#define N r4 +#define A r7 +#define LDA r8 +#define X r9 +#define INCX r10 +#define Y r5 +#define INCY r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define M r3 +#define N r4 +#define A r8 +#define LDA r9 +#define X r10 +#define INCX r5 +#define Y r6 +#define INCY r7 +#else +#define M r3 +#define N r4 +#define A r7 +#define LDA r8 +#define X r9 +#define INCX r10 +#define Y r5 +#define INCY r6 +#endif +#endif + +#define I r11 +#define J r12 + +#define AO1 r14 +#define AO2 r15 +#define AO3 r16 +#define AO4 r17 +#define LDA8 r18 + +#define Y1 r19 +#define Y2 r20 +#define PREA r21 +#define YY r22 +#define BUFFER r23 + +#define y01 f0 +#define y02 f1 +#define y03 f2 +#define y04 f3 +#define y05 f4 +#define y06 f5 +#define y07 f6 +#define y08 f7 +#define y09 f8 +#define y10 f9 +#define y11 f10 +#define y12 f11 +#define y13 f12 +#define y14 f13 +#define y15 f14 +#define y16 f15 + +#define alpha1 f16 +#define alpha2 f17 +#define alpha3 f18 +#define alpha4 f19 + +#define a1 f20 +#define a2 f21 +#define a3 f22 +#define a4 f23 +#define a5 f24 +#define a6 f25 +#define a7 f26 +#define a8 f27 + +#define alpha f27 + +#if defined(PPCG4) +#define PREFETCHSIZE_A (3 * 4) +#endif + +#if defined(POWER6) +#define PREFETCHSIZE_A (3 * 4) +#endif + +#ifndef NEEDPARAM + +#ifndef __64BIT__ +#define STACKSIZE 224 +#define ALPHA 200(SP) +#define FZERO 208(SP) +#else +#define STACKSIZE 280 +#define ALPHA 256(SP) +#define FZERO 264(SP) +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + +#ifdef __64BIT__ + std r0, FZERO + std r14, 144(SP) + std r15, 152(SP) + std r16, 160(SP) + std r17, 168(SP) + std r18, 176(SP) + std r19, 184(SP) + std r20, 192(SP) + std r21, 200(SP) + std r22, 208(SP) + std r23, 216(SP) +#else + stw r0, 0 + FZERO + stw r0, 4 + FZERO + stw r14, 144(SP) + stw r15, 148(SP) + stw r16, 152(SP) + stw r17, 156(SP) + stw r18, 160(SP) + stw r19, 164(SP) + stw r20, 168(SP) + stw r21, 172(SP) + stw r22, 176(SP) + stw r23, 180(SP) +#endif + +#ifdef linux +#ifndef __64BIT__ + lwz INCY, 8 + STACKSIZE(SP) + lwz BUFFER, 12 + STACKSIZE(SP) +#else + ld Y, 112 + STACKSIZE(SP) + ld INCY, 120 + STACKSIZE(SP) + ld BUFFER, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifndef __64BIT__ +#ifdef DOUBLE + lwz INCX, 56 + STACKSIZE(SP) + lwz Y, 60 + STACKSIZE(SP) + lwz INCY, 64 + STACKSIZE(SP) + lwz BUFFER, 68 + STACKSIZE(SP) +#else + lwz Y, 56 + STACKSIZE(SP) + lwz INCY, 60 + STACKSIZE(SP) + lwz BUFFER, 64 + STACKSIZE(SP) +#endif +#else + ld Y, 112 + STACKSIZE(SP) + ld INCY, 120 + STACKSIZE(SP) + ld BUFFER, 128 + STACKSIZE(SP) +#endif +#endif + + stfd f1, ALPHA + fmr alpha, f1 + + slwi LDA, LDA, BASE_SHIFT + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + li PREA, PREFETCHSIZE_A * SIZE + + cmpwi cr0, M, 0 + ble- LL(999) + + cmpwi cr0, N, 0 + ble- LL(999) + + addi A, A, -SIZE + sub X, X, INCX + sub Y, Y, INCY + + mr YY, Y + lfd f0, FZERO + + cmpi cr0, 0, INCY, SIZE + beq LL(10) + + addi YY, BUFFER, -SIZE + addi Y1, BUFFER, -SIZE + + addi r0, M, 7 + srawi. r0, r0, 3 + mtspr CTR, r0 + .align 4 + +LL(02): + STFDU f0, 1 * SIZE(Y1) + STFDU f0, 1 * SIZE(Y1) + STFDU f0, 1 * SIZE(Y1) + STFDU f0, 1 * SIZE(Y1) + STFDU f0, 1 * SIZE(Y1) + STFDU f0, 1 * SIZE(Y1) + STFDU f0, 1 * SIZE(Y1) + STFDU f0, 1 * SIZE(Y1) + bdnz LL(02) + .align 4 + +LL(10): + srawi. J, N, 2 + ble LL(30) + .align 4 + +LL(21): + mr AO1, A + add AO2, A, LDA + + LFDUX alpha1, X, INCX + LFDUX alpha2, X, INCX + LFDUX alpha3, X, INCX + LFDUX alpha4, X, INCX + + FMUL alpha1, alpha, alpha1 + add AO3, AO2, LDA + FMUL alpha2, alpha, alpha2 + add AO4, AO3, LDA + FMUL alpha3, alpha, alpha3 + add A, AO4, LDA + FMUL alpha4, alpha, alpha4 + mr Y1, YY + mr Y2, YY + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(25) + + LFDU y01, 1 * SIZE(Y1) + LFDU a1, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y03, 1 * SIZE(Y1) + LFDU a3, 1 * SIZE(AO1) + LFDU y04, 1 * SIZE(Y1) + LFDU a4, 1 * SIZE(AO1) + LFDU y05, 1 * SIZE(Y1) + LFDU a5, 1 * SIZE(AO1) + LFDU y06, 1 * SIZE(Y1) + LFDU a6, 1 * SIZE(AO1) + LFDU y07, 1 * SIZE(Y1) + LFDU a7, 1 * SIZE(AO1) + LFDU y08, 1 * SIZE(Y1) + LFDU a8, 1 * SIZE(AO1) + bdz LL(23) + .align 4 + +LL(22): +#ifdef PPCG4 + dcbtst Y1, PREA +#endif + + FMADD y09, alpha1, a1, y01 + LFDU a1, 1 * SIZE(AO2) + FMADD y10, alpha1, a2, y02 + LFDU a2, 1 * SIZE(AO2) + FMADD y11, alpha1, a3, y03 + LFDU a3, 1 * SIZE(AO2) + FMADD y12, alpha1, a4, y04 + LFDU a4, 1 * SIZE(AO2) + + LFDU y01, 1 * SIZE(Y1) +#ifdef PPCG4 + dcbt AO2, PREA +#endif + + FMADD y13, alpha1, a5, y05 + LFDU a5, 1 * SIZE(AO2) + FMADD y14, alpha1, a6, y06 + LFDU a6, 1 * SIZE(AO2) + FMADD y15, alpha1, a7, y07 + LFDU a7, 1 * SIZE(AO2) + FMADD y16, alpha1, a8, y08 + LFDU a8, 1 * SIZE(AO2) + + LFDU y02, 1 * SIZE(Y1) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO2, PREA +#endif + + FMADD y09, alpha2, a1, y09 + LFDU a1, 1 * SIZE(AO3) + FMADD y10, alpha2, a2, y10 + LFDU a2, 1 * SIZE(AO3) + FMADD y11, alpha2, a3, y11 + LFDU a3, 1 * SIZE(AO3) + FMADD y12, alpha2, a4, y12 + LFDU a4, 1 * SIZE(AO3) + + LFDU y03, 1 * SIZE(Y1) +#ifdef PPCG4 + dcbt AO3, PREA +#endif + + FMADD y13, alpha2, a5, y13 + LFDU a5, 1 * SIZE(AO3) + FMADD y14, alpha2, a6, y14 + LFDU a6, 1 * SIZE(AO3) + FMADD y15, alpha2, a7, y15 + LFDU a7, 1 * SIZE(AO3) + FMADD y16, alpha2, a8, y16 + LFDU a8, 1 * SIZE(AO3) + + LFDU y04, 1 * SIZE(Y1) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO3, PREA +#endif + + FMADD y09, alpha3, a1, y09 + LFDU a1, 1 * SIZE(AO4) + FMADD y10, alpha3, a2, y10 + LFDU a2, 1 * SIZE(AO4) + FMADD y11, alpha3, a3, y11 + LFDU a3, 1 * SIZE(AO4) + FMADD y12, alpha3, a4, y12 + LFDU a4, 1 * SIZE(AO4) + +#if defined(PPCG4) && defined(DOUBLE) + dcbtst Y1, PREA +#endif + LFDU y05, 1 * SIZE(Y1) +#ifdef PPCG4 + dcbt AO4, PREA +#endif + + FMADD y13, alpha3, a5, y13 + LFDU a5, 1 * SIZE(AO4) + FMADD y14, alpha3, a6, y14 + LFDU a6, 1 * SIZE(AO4) + FMADD y15, alpha3, a7, y15 + LFDU a7, 1 * SIZE(AO4) + FMADD y16, alpha3, a8, y16 + LFDU a8, 1 * SIZE(AO4) + + LFDU y06, 1 * SIZE(Y1) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO4, PREA +#endif + + FMADD y09, alpha4, a1, y09 + LFDU a1, 1 * SIZE(AO1) + FMADD y10, alpha4, a2, y10 + LFDU a2, 1 * SIZE(AO1) + FMADD y11, alpha4, a3, y11 + LFDU a3, 1 * SIZE(AO1) + FMADD y12, alpha4, a4, y12 + LFDU a4, 1 * SIZE(AO1) + + LFDU y07, 1 * SIZE(Y1) +#ifdef PPCG4 + dcbt AO1, PREA +#endif + + STFDU y09, 1 * SIZE(Y2) + STFDU y10, 1 * SIZE(Y2) + STFDU y11, 1 * SIZE(Y2) + STFDU y12, 1 * SIZE(Y2) + + FMADD y13, alpha4, a5, y13 + LFDU a5, 1 * SIZE(AO1) + FMADD y14, alpha4, a6, y14 + LFDU a6, 1 * SIZE(AO1) + FMADD y15, alpha4, a7, y15 + LFDU a7, 1 * SIZE(AO1) + FMADD y16, alpha4, a8, y16 + LFDU a8, 1 * SIZE(AO1) + + LFDU y08, 1 * SIZE(Y1) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO1, PREA +#endif + + STFDU y13, 1 * SIZE(Y2) + STFDU y14, 1 * SIZE(Y2) + STFDU y15, 1 * SIZE(Y2) + STFDU y16, 1 * SIZE(Y2) + bdnz LL(22) + .align 4 + +LL(23): + FMADD y01, alpha1, a1, y01 + LFDU a1, 1 * SIZE(AO2) + FMADD y02, alpha1, a2, y02 + LFDU a2, 1 * SIZE(AO2) + FMADD y03, alpha1, a3, y03 + LFDU a3, 1 * SIZE(AO2) + FMADD y04, alpha1, a4, y04 + LFDU a4, 1 * SIZE(AO2) + + FMADD y05, alpha1, a5, y05 + LFDU a5, 1 * SIZE(AO2) + FMADD y06, alpha1, a6, y06 + LFDU a6, 1 * SIZE(AO2) + FMADD y07, alpha1, a7, y07 + LFDU a7, 1 * SIZE(AO2) + FMADD y08, alpha1, a8, y08 + LFDU a8, 1 * SIZE(AO2) + + FMADD y01, alpha2, a1, y01 + LFDU a1, 1 * SIZE(AO3) + FMADD y02, alpha2, a2, y02 + LFDU a2, 1 * SIZE(AO3) + FMADD y03, alpha2, a3, y03 + LFDU a3, 1 * SIZE(AO3) + FMADD y04, alpha2, a4, y04 + LFDU a4, 1 * SIZE(AO3) + + FMADD y05, alpha2, a5, y05 + LFDU a5, 1 * SIZE(AO3) + FMADD y06, alpha2, a6, y06 + LFDU a6, 1 * SIZE(AO3) + FMADD y07, alpha2, a7, y07 + LFDU a7, 1 * SIZE(AO3) + FMADD y08, alpha2, a8, y08 + LFDU a8, 1 * SIZE(AO3) + + FMADD y01, alpha3, a1, y01 + LFDU a1, 1 * SIZE(AO4) + FMADD y02, alpha3, a2, y02 + LFDU a2, 1 * SIZE(AO4) + FMADD y03, alpha3, a3, y03 + LFDU a3, 1 * SIZE(AO4) + FMADD y04, alpha3, a4, y04 + LFDU a4, 1 * SIZE(AO4) + + FMADD y05, alpha3, a5, y05 + LFDU a5, 1 * SIZE(AO4) + FMADD y06, alpha3, a6, y06 + LFDU a6, 1 * SIZE(AO4) + FMADD y07, alpha3, a7, y07 + LFDU a7, 1 * SIZE(AO4) + FMADD y08, alpha3, a8, y08 + LFDU a8, 1 * SIZE(AO4) + + FMADD y01, alpha4, a1, y01 + FMADD y02, alpha4, a2, y02 + FMADD y03, alpha4, a3, y03 + FMADD y04, alpha4, a4, y04 + + FMADD y05, alpha4, a5, y05 + STFDU y01, 1 * SIZE(Y2) + FMADD y06, alpha4, a6, y06 + STFDU y02, 1 * SIZE(Y2) + FMADD y07, alpha4, a7, y07 + STFDU y03, 1 * SIZE(Y2) + FMADD y08, alpha4, a8, y08 + STFDU y04, 1 * SIZE(Y2) + + STFDU y05, 1 * SIZE(Y2) + STFDU y06, 1 * SIZE(Y2) + STFDU y07, 1 * SIZE(Y2) + STFDU y08, 1 * SIZE(Y2) + .align 4 + +LL(25): + andi. r0, M, 7 + ble LL(29) + + andi. r0, M, 4 + ble LL(27) + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a3, 1 * SIZE(AO1) + LFDU y03, 1 * SIZE(Y1) + LFDU a4, 1 * SIZE(AO1) + LFDU y04, 1 * SIZE(Y1) + + FMADD y01, alpha1, a1, y01 + LFDU a5, 1 * SIZE(AO2) + FMADD y02, alpha1, a2, y02 + LFDU a6, 1 * SIZE(AO2) + FMADD y03, alpha1, a3, y03 + LFDU a7, 1 * SIZE(AO2) + FMADD y04, alpha1, a4, y04 + LFDU a8, 1 * SIZE(AO2) + + FMADD y01, alpha2, a5, y01 + LFDU a1, 1 * SIZE(AO3) + FMADD y02, alpha2, a6, y02 + LFDU a2, 1 * SIZE(AO3) + FMADD y03, alpha2, a7, y03 + LFDU a3, 1 * SIZE(AO3) + FMADD y04, alpha2, a8, y04 + LFDU a4, 1 * SIZE(AO3) + + FMADD y01, alpha3, a1, y01 + LFDU a5, 1 * SIZE(AO4) + FMADD y02, alpha3, a2, y02 + LFDU a6, 1 * SIZE(AO4) + FMADD y03, alpha3, a3, y03 + LFDU a7, 1 * SIZE(AO4) + FMADD y04, alpha3, a4, y04 + LFDU a8, 1 * SIZE(AO4) + + FMADD y01, alpha4, a5, y01 + FMADD y02, alpha4, a6, y02 + FMADD y03, alpha4, a7, y03 + FMADD y04, alpha4, a8, y04 + + STFDU y01, 1 * SIZE(Y2) + STFDU y02, 1 * SIZE(Y2) + STFDU y03, 1 * SIZE(Y2) + STFDU y04, 1 * SIZE(Y2) + .align 4 + +LL(27): + andi. r0, M, 2 + ble LL(28) + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a3, 1 * SIZE(AO2) + LFDU a4, 1 * SIZE(AO2) + + FMADD y01, alpha1, a1, y01 + LFDU a5, 1 * SIZE(AO3) + FMADD y02, alpha1, a2, y02 + LFDU a6, 1 * SIZE(AO3) + FMADD y01, alpha2, a3, y01 + LFDU a7, 1 * SIZE(AO4) + FMADD y02, alpha2, a4, y02 + LFDU a8, 1 * SIZE(AO4) + + FMADD y01, alpha3, a5, y01 + FMADD y02, alpha3, a6, y02 + FMADD y01, alpha4, a7, y01 + FMADD y02, alpha4, a8, y02 + + STFDU y01, 1 * SIZE(Y2) + STFDU y02, 1 * SIZE(Y2) + .align 4 + +LL(28): + andi. r0, M, 1 + ble LL(29) + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO2) + LFDU a3, 1 * SIZE(AO3) + LFDU a4, 1 * SIZE(AO4) + + FMADD y01, alpha1, a1, y01 + FMADD y01, alpha2, a2, y01 + FMADD y01, alpha3, a3, y01 + FMADD y01, alpha4, a4, y01 + + STFDU y01, 1 * SIZE(Y2) + .align 4 + +LL(29): + addi J, J, -1 + lfd alpha, ALPHA + cmpi cr0, 0, J, 0 + bgt LL(21) + .align 4 + +LL(30): + andi. J, N, 2 + ble LL(40) + + LFDUX alpha1, X, INCX + LFDUX alpha2, X, INCX + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + FMUL alpha1, alpha, alpha1 + mr Y1, YY + FMUL alpha2, alpha, alpha2 + mr Y2, YY + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(35) + + LFDU y01, 1 * SIZE(Y1) + LFDU a1, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y03, 1 * SIZE(Y1) + LFDU a3, 1 * SIZE(AO1) + LFDU y04, 1 * SIZE(Y1) + LFDU a4, 1 * SIZE(AO1) + LFDU y05, 1 * SIZE(Y1) + LFDU a5, 1 * SIZE(AO1) + LFDU y06, 1 * SIZE(Y1) + LFDU a6, 1 * SIZE(AO1) + LFDU y07, 1 * SIZE(Y1) + LFDU a7, 1 * SIZE(AO1) + LFDU y08, 1 * SIZE(Y1) + LFDU a8, 1 * SIZE(AO1) + bdz LL(33) + .align 4 + +LL(32): +#ifdef PPCG4 + dcbtst Y1, PREA +#endif + + FMADD y09, alpha1, a1, y01 + LFDU a1, 1 * SIZE(AO2) + FMADD y10, alpha1, a2, y02 + LFDU a2, 1 * SIZE(AO2) + FMADD y11, alpha1, a3, y03 + LFDU a3, 1 * SIZE(AO2) + FMADD y12, alpha1, a4, y04 + LFDU a4, 1 * SIZE(AO2) + + LFDU y01, 1 * SIZE(Y1) + LFDU y02, 1 * SIZE(Y1) +#ifdef PPCG4 + dcbt AO2, PREA +#endif + + FMADD y13, alpha1, a5, y05 + LFDU a5, 1 * SIZE(AO2) + FMADD y14, alpha1, a6, y06 + LFDU a6, 1 * SIZE(AO2) + FMADD y15, alpha1, a7, y07 + LFDU a7, 1 * SIZE(AO2) + FMADD y16, alpha1, a8, y08 + LFDU a8, 1 * SIZE(AO2) + + LFDU y03, 1 * SIZE(Y1) + LFDU y04, 1 * SIZE(Y1) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO2, PREA +#endif + + FMADD y09, alpha2, a1, y09 + LFDU a1, 1 * SIZE(AO1) + FMADD y10, alpha2, a2, y10 + LFDU a2, 1 * SIZE(AO1) + FMADD y11, alpha2, a3, y11 + LFDU a3, 1 * SIZE(AO1) + FMADD y12, alpha2, a4, y12 + LFDU a4, 1 * SIZE(AO1) + +#if defined(PPCG4) && defined(DOUBLE) + dcbtst Y1, PREA +#endif + LFDU y05, 1 * SIZE(Y1) + LFDU y06, 1 * SIZE(Y1) +#ifdef PPCG4 + dcbt AO1, PREA +#endif + + FMADD y13, alpha2, a5, y13 + LFDU a5, 1 * SIZE(AO1) + FMADD y14, alpha2, a6, y14 + LFDU a6, 1 * SIZE(AO1) + FMADD y15, alpha2, a7, y15 + LFDU a7, 1 * SIZE(AO1) + FMADD y16, alpha2, a8, y16 + LFDU a8, 1 * SIZE(AO1) + + LFDU y07, 1 * SIZE(Y1) + LFDU y08, 1 * SIZE(Y1) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO1, PREA +#endif + + STFDU y09, 1 * SIZE(Y2) + STFDU y10, 1 * SIZE(Y2) + STFDU y11, 1 * SIZE(Y2) + STFDU y12, 1 * SIZE(Y2) + + STFDU y13, 1 * SIZE(Y2) + STFDU y14, 1 * SIZE(Y2) + STFDU y15, 1 * SIZE(Y2) + STFDU y16, 1 * SIZE(Y2) + bdnz LL(32) + .align 4 + +LL(33): + FMADD y01, alpha1, a1, y01 + LFDU a1, 1 * SIZE(AO2) + FMADD y02, alpha1, a2, y02 + LFDU a2, 1 * SIZE(AO2) + FMADD y03, alpha1, a3, y03 + LFDU a3, 1 * SIZE(AO2) + FMADD y04, alpha1, a4, y04 + LFDU a4, 1 * SIZE(AO2) + + FMADD y05, alpha1, a5, y05 + LFDU a5, 1 * SIZE(AO2) + FMADD y06, alpha1, a6, y06 + LFDU a6, 1 * SIZE(AO2) + FMADD y07, alpha1, a7, y07 + LFDU a7, 1 * SIZE(AO2) + FMADD y08, alpha1, a8, y08 + LFDU a8, 1 * SIZE(AO2) + + FMADD y01, alpha2, a1, y01 + FMADD y02, alpha2, a2, y02 + FMADD y03, alpha2, a3, y03 + FMADD y04, alpha2, a4, y04 + + FMADD y05, alpha2, a5, y05 + STFDU y01, 1 * SIZE(Y2) + FMADD y06, alpha2, a6, y06 + STFDU y02, 1 * SIZE(Y2) + FMADD y07, alpha2, a7, y07 + STFDU y03, 1 * SIZE(Y2) + FMADD y08, alpha2, a8, y08 + STFDU y04, 1 * SIZE(Y2) + + STFDU y05, 1 * SIZE(Y2) + STFDU y06, 1 * SIZE(Y2) + STFDU y07, 1 * SIZE(Y2) + STFDU y08, 1 * SIZE(Y2) + .align 4 + +LL(35): + andi. r0, M, 7 + ble LL(40) + + andi. r0, M, 4 + ble LL(37) + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a3, 1 * SIZE(AO1) + LFDU y03, 1 * SIZE(Y1) + LFDU a4, 1 * SIZE(AO1) + LFDU y04, 1 * SIZE(Y1) + + FMADD y01, alpha1, a1, y01 + LFDU a5, 1 * SIZE(AO2) + FMADD y02, alpha1, a2, y02 + LFDU a6, 1 * SIZE(AO2) + FMADD y03, alpha1, a3, y03 + LFDU a7, 1 * SIZE(AO2) + FMADD y04, alpha1, a4, y04 + LFDU a8, 1 * SIZE(AO2) + + FMADD y01, alpha2, a5, y01 + FMADD y02, alpha2, a6, y02 + FMADD y03, alpha2, a7, y03 + FMADD y04, alpha2, a8, y04 + + STFDU y01, 1 * SIZE(Y2) + STFDU y02, 1 * SIZE(Y2) + STFDU y03, 1 * SIZE(Y2) + STFDU y04, 1 * SIZE(Y2) + .align 4 + +LL(37): + andi. r0, M, 2 + ble LL(38) + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a3, 1 * SIZE(AO2) + LFDU a4, 1 * SIZE(AO2) + + FMADD y01, alpha1, a1, y01 + FMADD y02, alpha1, a2, y02 + FMADD y01, alpha2, a3, y01 + FMADD y02, alpha2, a4, y02 + + STFDU y01, 1 * SIZE(Y2) + STFDU y02, 1 * SIZE(Y2) + .align 4 + +LL(38): + andi. r0, M, 1 + ble LL(40) + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO2) + + FMADD y01, alpha1, a1, y01 + FMADD y01, alpha2, a2, y01 + + STFDU y01, 1 * SIZE(Y2) + .align 4 + +LL(40): + andi. J, N, 1 + lfd alpha, ALPHA + ble LL(990) + + LFDUX alpha1, X, INCX + + mr AO1, A + add A, A, LDA + + FMUL alpha1, alpha, alpha1 + mr Y1, YY + mr Y2, YY + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(45) + + LFDU y01, 1 * SIZE(Y1) + LFDU a1, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y03, 1 * SIZE(Y1) + LFDU a3, 1 * SIZE(AO1) + LFDU y04, 1 * SIZE(Y1) + LFDU a4, 1 * SIZE(AO1) + LFDU y05, 1 * SIZE(Y1) + LFDU a5, 1 * SIZE(AO1) + LFDU y06, 1 * SIZE(Y1) + LFDU a6, 1 * SIZE(AO1) + LFDU y07, 1 * SIZE(Y1) + LFDU a7, 1 * SIZE(AO1) + LFDU y08, 1 * SIZE(Y1) + LFDU a8, 1 * SIZE(AO1) + bdz LL(43) + .align 4 + +LL(42): +#ifdef PPCG4 + dcbtst Y1, PREA +#endif + + FMADD y09, alpha1, a1, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y10, alpha1, a2, y02 + LFDU a2, 1 * SIZE(AO1) + FMADD y11, alpha1, a3, y03 + LFDU a3, 1 * SIZE(AO1) + FMADD y12, alpha1, a4, y04 + LFDU a4, 1 * SIZE(AO1) + + LFDU y01, 1 * SIZE(Y1) + LFDU y02, 1 * SIZE(Y1) + LFDU y03, 1 * SIZE(Y1) + LFDU y04, 1 * SIZE(Y1) +#ifdef PPCG4 + dcbt AO1, PREA +#endif + + FMADD y13, alpha1, a5, y05 + LFDU a5, 1 * SIZE(AO1) + FMADD y14, alpha1, a6, y06 + LFDU a6, 1 * SIZE(AO1) + FMADD y15, alpha1, a7, y07 + LFDU a7, 1 * SIZE(AO1) + FMADD y16, alpha1, a8, y08 + LFDU a8, 1 * SIZE(AO1) + +#if defined(PPCG4) && defined(DOUBLE) + dcbtst Y1, PREA +#endif + LFDU y05, 1 * SIZE(Y1) + LFDU y06, 1 * SIZE(Y1) + LFDU y07, 1 * SIZE(Y1) + LFDU y08, 1 * SIZE(Y1) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO1, PREA +#endif + + STFDU y09, 1 * SIZE(Y2) + STFDU y10, 1 * SIZE(Y2) + STFDU y11, 1 * SIZE(Y2) + STFDU y12, 1 * SIZE(Y2) + + STFDU y13, 1 * SIZE(Y2) + STFDU y14, 1 * SIZE(Y2) + STFDU y15, 1 * SIZE(Y2) + STFDU y16, 1 * SIZE(Y2) + bdnz LL(42) + .align 4 + +LL(43): + FMADD y01, alpha1, a1, y01 + FMADD y02, alpha1, a2, y02 + FMADD y03, alpha1, a3, y03 + FMADD y04, alpha1, a4, y04 + + FMADD y05, alpha1, a5, y05 + STFDU y01, 1 * SIZE(Y2) + FMADD y06, alpha1, a6, y06 + STFDU y02, 1 * SIZE(Y2) + FMADD y07, alpha1, a7, y07 + STFDU y03, 1 * SIZE(Y2) + FMADD y08, alpha1, a8, y08 + STFDU y04, 1 * SIZE(Y2) + + STFDU y05, 1 * SIZE(Y2) + STFDU y06, 1 * SIZE(Y2) + STFDU y07, 1 * SIZE(Y2) + STFDU y08, 1 * SIZE(Y2) + .align 4 + +LL(45): + andi. r0, M, 7 + ble LL(990) + + andi. r0, M, 4 + ble LL(47) + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a3, 1 * SIZE(AO1) + LFDU y03, 1 * SIZE(Y1) + LFDU a4, 1 * SIZE(AO1) + LFDU y04, 1 * SIZE(Y1) + + FMADD y01, alpha1, a1, y01 + FMADD y02, alpha1, a2, y02 + FMADD y03, alpha1, a3, y03 + FMADD y04, alpha1, a4, y04 + + STFDU y01, 1 * SIZE(Y2) + STFDU y02, 1 * SIZE(Y2) + STFDU y03, 1 * SIZE(Y2) + STFDU y04, 1 * SIZE(Y2) + .align 4 + +LL(47): + andi. r0, M, 2 + ble LL(48) + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + + FMADD y01, alpha1, a1, y01 + FMADD y02, alpha1, a2, y02 + + STFDU y01, 1 * SIZE(Y2) + STFDU y02, 1 * SIZE(Y2) + .align 4 + +LL(48): + andi. r0, M, 1 + ble LL(990) + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + + FMADD y01, alpha1, a1, y01 + + STFDU y01, 1 * SIZE(Y2) + .align 4 + +LL(990): + cmpi cr0, 0, INCY, SIZE + beq LL(999) + + addi YY, BUFFER, -SIZE + mr Y1, Y + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(995) + .align 4 + +LL(991): + LFDUX f0, Y, INCY + LFDUX f1, Y, INCY + LFDUX f2, Y, INCY + LFDUX f3, Y, INCY + LFDUX f4, Y, INCY + LFDUX f5, Y, INCY + LFDUX f6, Y, INCY + LFDUX f7, Y, INCY + + LFDU f8, 1 * SIZE(YY) + LFDU f9, 1 * SIZE(YY) + LFDU f10, 1 * SIZE(YY) + LFDU f11, 1 * SIZE(YY) + LFDU f12, 1 * SIZE(YY) + LFDU f13, 1 * SIZE(YY) + LFDU f14, 1 * SIZE(YY) + LFDU f15, 1 * SIZE(YY) + + FADD f8, f8, f0 + FADD f9, f9, f1 + FADD f10, f10, f2 + FADD f11, f11, f3 + FADD f12, f12, f4 + FADD f13, f13, f5 + FADD f14, f14, f6 + FADD f15, f15, f7 + + STFDUX f8, Y1, INCY + STFDUX f9, Y1, INCY + STFDUX f10, Y1, INCY + STFDUX f11, Y1, INCY + STFDUX f12, Y1, INCY + STFDUX f13, Y1, INCY + STFDUX f14, Y1, INCY + STFDUX f15, Y1, INCY + bdnz LL(991) + .align 4 + +LL(995): + andi. J, M, 4 + ble LL(996) + + LFDUX f0, Y, INCY + LFDUX f1, Y, INCY + LFDUX f2, Y, INCY + LFDUX f3, Y, INCY + + LFDU f8, 1 * SIZE(YY) + LFDU f9, 1 * SIZE(YY) + LFDU f10, 1 * SIZE(YY) + LFDU f11, 1 * SIZE(YY) + + FADD f8, f8, f0 + FADD f9, f9, f1 + FADD f10, f10, f2 + FADD f11, f11, f3 + + STFDUX f8, Y1, INCY + STFDUX f9, Y1, INCY + STFDUX f10, Y1, INCY + STFDUX f11, Y1, INCY + .align 4 + +LL(996): + andi. J, M, 2 + ble LL(997) + + LFDUX f0, Y, INCY + LFDUX f1, Y, INCY + + LFDU f8, 1 * SIZE(YY) + LFDU f9, 1 * SIZE(YY) + + FADD f8, f8, f0 + FADD f9, f9, f1 + + STFDUX f8, Y1, INCY + STFDUX f9, Y1, INCY + .align 4 + +LL(997): + andi. J, M, 1 + ble LL(999) + + LFDUX f0, Y, INCY + LFDU f8, 1 * SIZE(YY) + + FADD f8, f8, f0 + + STFDUX f8, Y1, INCY + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + +#ifdef __64BIT__ + ld r14, 144(SP) + ld r15, 152(SP) + ld r16, 160(SP) + ld r17, 168(SP) + ld r18, 176(SP) + ld r19, 184(SP) + ld r20, 192(SP) + ld r21, 200(SP) + ld r22, 208(SP) + ld r23, 216(SP) +#else + lwz r14, 144(SP) + lwz r15, 148(SP) + lwz r16, 152(SP) + lwz r17, 156(SP) + lwz r18, 160(SP) + lwz r19, 164(SP) + lwz r20, 168(SP) + lwz r21, 172(SP) + lwz r22, 176(SP) + lwz r23, 180(SP) +#endif + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/gemv_t.S b/kernel/power/gemv_t.S new file mode 100644 index 0000000..a70e8b8 --- /dev/null +++ b/kernel/power/gemv_t.S @@ -0,0 +1,2964 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define M r3 +#define N r4 +#define A r6 +#define LDA r7 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#else +#define M r3 +#define N r4 +#define A r7 +#define LDA r8 +#define X r9 +#define INCX r10 +#define Y r5 +#define INCY r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define M r3 +#define N r4 +#define A r8 +#define LDA r9 +#define X r10 +#define INCX r5 +#define Y r6 +#define INCY r7 +#else +#define M r3 +#define N r4 +#define A r7 +#define LDA r8 +#define X r9 +#define INCX r10 +#define Y r5 +#define INCY r6 +#endif +#endif + +#define BUFFER r11 +#define XP r12 +#define AO1 r14 +#define AO2 r15 +#define AO3 r16 +#define AO4 r17 +#define AO5 r18 +#define AO6 r19 +#define AO7 r20 +#define AO8 r21 +#define MIN_N r22 +#define J r23 +#define CO r24 +#define PREA r25 +#define PREC r26 +#define BO r27 +#define PLDA_M r28 +#define IS r29 + +#define Y1 CO + +#if defined(PPCG4) +#define PREFETCHSIZE_A 42 +#define PREFETCHSIZE_C 16 +#endif + +#if defined(PPC440) || defined(PPC440FP2) +#define PREFETCHSIZE_A 42 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef PPC970 +#define PREFETCHSIZE_A 42 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef CELL +#define PREFETCHSIZE_A 42 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef POWER4 +#define PREFETCHSIZE_A 48 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef POWER5 +#define PREFETCHSIZE_A 40 +#define PREFETCHSIZE_C 8 +#endif + +#ifdef POWER6 +#define PREFETCHSIZE_A 96 +#define PREFETCHSIZE_C 8 +#endif + +#define y01 f0 +#define y02 f1 +#define y03 f2 +#define y04 f3 +#define y05 f4 +#define y06 f5 +#define y07 f6 +#define y08 f7 +#define y09 f8 +#define y10 f9 +#define y11 f10 +#define y12 f11 +#define y13 f12 +#define y14 f13 +#define y15 f14 +#define y16 f15 + +#define a1 f16 +#define a2 f17 +#define a3 f18 +#define a4 f19 +#define a5 f20 +#define a6 f21 +#define a7 f22 +#define a8 f23 + +#define b1 f24 +#define b2 f25 +#define b3 f26 +#define b4 f27 +#define b5 f28 +#define b6 f29 +#define b7 f30 +#define b8 f31 + +#define alpha f31 + +#ifndef NEEDPARAM + +#define P 2048 + +#ifndef __64BIT__ +#define STACKSIZE 224 +#else +#define STACKSIZE 288 +#endif + +#define FZERO 144(SP) +#define ALPHA 152(SP) + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r0, FZERO + stfd f1, ALPHA + std r14, 160(SP) + std r15, 168(SP) + std r16, 176(SP) + std r17, 184(SP) + std r18, 192(SP) + std r19, 200(SP) + std r20, 208(SP) + std r21, 216(SP) + std r22, 224(SP) + std r23, 232(SP) + std r24, 240(SP) + std r25, 248(SP) + std r26, 256(SP) + std r27, 264(SP) + std r28, 272(SP) + std r29, 280(SP) +#else + stw r0, 0 + FZERO + stw r0, 4 + FZERO + stfd f1, ALPHA + stw r14, 160(SP) + stw r15, 164(SP) + stw r16, 168(SP) + stw r17, 172(SP) + stw r18, 176(SP) + stw r19, 180(SP) + stw r20, 184(SP) + stw r21, 188(SP) + stw r22, 192(SP) + stw r23, 196(SP) + stw r24, 200(SP) + stw r25, 204(SP) + stw r26, 208(SP) + stw r27, 212(SP) + stw r28, 216(SP) + stw r29, 220(SP) +#endif + +#ifdef linux +#ifndef __64BIT__ + lwz INCY, 8 + STACKSIZE(SP) + lwz BUFFER, 12 + STACKSIZE(SP) +#else + ld Y, 112 + STACKSIZE(SP) + ld INCY, 120 + STACKSIZE(SP) + ld BUFFER, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifndef __64BIT__ +#ifdef DOUBLE + lwz INCX, 56 + STACKSIZE(SP) + lwz Y, 60 + STACKSIZE(SP) + lwz INCY, 64 + STACKSIZE(SP) + lwz BUFFER, 68 + STACKSIZE(SP) +#else + lwz Y, 56 + STACKSIZE(SP) + lwz INCY, 60 + STACKSIZE(SP) + lwz BUFFER, 64 + STACKSIZE(SP) +#endif +#else + ld Y, 112 + STACKSIZE(SP) + ld INCY, 120 + STACKSIZE(SP) + ld BUFFER, 128 + STACKSIZE(SP) +#endif +#endif + + mullw PLDA_M, LDA, N + li XP, P + subf PLDA_M, XP, PLDA_M + slwi PLDA_M, PLDA_M, BASE_SHIFT + + slwi LDA, LDA, BASE_SHIFT + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + subf Y, INCY, Y + + li IS, 0 + + addi A, A, -SIZE + + li PREA, PREFETCHSIZE_A * SIZE + li PREC, PREFETCHSIZE_C * SIZE + + cmpi cr0, 0, M, 0 + ble LL(999) + + cmpi cr0, 0, N, 0 + ble LL(999) + .align 4 + +LL(ISLoop): + subf MIN_N, IS, M + slwi r0, IS, BASE_SHIFT + cmpi cr0, 0, MIN_N, P + ble+ LL(min_nP) + li MIN_N, P +LL(min_nP): + add XP, X, r0 + cmpi cr0, 0, INCX, SIZE + beq LL(10) + + mr XP, BUFFER + addi CO, BUFFER, -SIZE + + srawi. r0, MIN_N, 3 + mtspr CTR, r0 + ble LL(CopyRemain) + .align 4 + +LL(CopyKernel): + LFD f0, 0 * SIZE(X) + add X, X, INCX + LFD f1, 0 * SIZE(X) + add X, X, INCX + LFD f2, 0 * SIZE(X) + add X, X, INCX + LFD f3, 0 * SIZE(X) + add X, X, INCX + LFD f4, 0 * SIZE(X) + add X, X, INCX + LFD f5, 0 * SIZE(X) + add X, X, INCX + LFD f6, 0 * SIZE(X) + add X, X, INCX + LFD f7, 0 * SIZE(X) + add X, X, INCX + + STFD f0, 1 * SIZE(CO) + STFD f1, 2 * SIZE(CO) + STFD f2, 3 * SIZE(CO) + STFD f3, 4 * SIZE(CO) + STFD f4, 5 * SIZE(CO) + STFD f5, 6 * SIZE(CO) + STFD f6, 7 * SIZE(CO) + STFDU f7, 8 * SIZE(CO) + bdnz LL(CopyKernel) + .align 4 + +LL(CopyRemain): + andi. r0, MIN_N, 7 + mtspr CTR, r0 + ble LL(10) + .align 4 + +LL(CopySub): + LFD f0, 0 * SIZE(X) + add X, X, INCX + STFDU f0, 1 * SIZE(CO) + bdnz LL(CopySub) + .align 4 + +LL(10): + mr CO, Y + addi XP, XP, -SIZE + srawi. J, N, 3 + ble LL(20) + .align 4 + +LL(11): + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add AO5, AO4, LDA + add AO6, AO5, LDA + add AO7, AO6, LDA + add AO8, AO7, LDA + add A, AO8, LDA + + mr BO, XP + + lfd y01, FZERO + fmr y02, y01 + fmr y03, y01 + fmr y04, y01 + fmr y05, y01 + fmr y06, y01 + fmr y07, y01 + fmr y08, y01 + fmr y09, y01 + fmr y10, y01 + fmr y11, y01 + fmr y12, y01 + fmr y13, y01 + fmr y14, y01 + fmr y15, y01 + fmr y16, y01 + + DCBT(Y1, PREC) + + srawi. r0, MIN_N, 4 + mtspr CTR, r0 + ble LL(14) + + LFD a1, 1 * SIZE(AO1) + LFD a2, 1 * SIZE(AO2) + LFD a3, 1 * SIZE(AO3) + LFD a4, 1 * SIZE(AO4) + LFD a5, 1 * SIZE(AO5) + LFD a6, 1 * SIZE(AO6) + LFD a7, 1 * SIZE(AO7) + LFD a8, 1 * SIZE(AO8) + + LFD b1, 1 * SIZE(BO) + LFD b2, 2 * SIZE(BO) + LFD b3, 3 * SIZE(BO) + LFD b4, 4 * SIZE(BO) + LFD b5, 5 * SIZE(BO) + LFD b6, 6 * SIZE(BO) + LFD b7, 7 * SIZE(BO) + LFD b8, 8 * SIZE(BO) + bdz LL(13) + .align 4 + +LL(12): + FMADD y01, a1, b1, y01 + LFD a1, 2 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 2 * SIZE(AO2) + + FMADD y03, a3, b1, y03 + LFD a3, 2 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFD a4, 2 * SIZE(AO4) + + FMADD y05, a5, b1, y05 + LFD a5, 2 * SIZE(AO5) + FMADD y06, a6, b1, y06 + LFD a6, 2 * SIZE(AO6) + + FMADD y07, a7, b1, y07 + LFD a7, 2 * SIZE(AO7) + FMADD y08, a8, b1, y08 + LFD a8, 2 * SIZE(AO8) + + FMADD y09, a1, b2, y09 + LFD a1, 3 * SIZE(AO1) + FMADD y10, a2, b2, y10 + LFD a2, 3 * SIZE(AO2) + + FMADD y11, a3, b2, y11 + LFD a3, 3 * SIZE(AO3) + FMADD y12, a4, b2, y12 + LFD a4, 3 * SIZE(AO4) + + FMADD y13, a5, b2, y13 + LFD a5, 3 * SIZE(AO5) + FMADD y14, a6, b2, y14 + LFD a6, 3 * SIZE(AO6) + + FMADD y15, a7, b2, y15 + LFD a7, 3 * SIZE(AO7) + FMADD y16, a8, b2, y16 + LFD a8, 3 * SIZE(AO8) + + FMADD y01, a1, b3, y01 + LFD a1, 4 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFD a2, 4 * SIZE(AO2) + + FMADD y03, a3, b3, y03 + LFD a3, 4 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFD a4, 4 * SIZE(AO4) + + FMADD y05, a5, b3, y05 + LFD a5, 4 * SIZE(AO5) + FMADD y06, a6, b3, y06 + LFD a6, 4 * SIZE(AO6) + + FMADD y07, a7, b3, y07 + LFD a7, 4 * SIZE(AO7) + FMADD y08, a8, b3, y08 + LFD a8, 4 * SIZE(AO8) + + FMADD y09, a1, b4, y09 + LFD a1, 5 * SIZE(AO1) + FMADD y10, a2, b4, y10 + LFD a2, 5 * SIZE(AO2) + + FMADD y11, a3, b4, y11 + LFD a3, 5 * SIZE(AO3) + FMADD y12, a4, b4, y12 + LFD a4, 5 * SIZE(AO4) + + FMADD y13, a5, b4, y13 + LFD a5, 5 * SIZE(AO5) + FMADD y14, a6, b4, y14 + LFD a6, 5 * SIZE(AO6) + + FMADD y15, a7, b4, y15 + LFD a7, 5 * SIZE(AO7) + FMADD y16, a8, b4, y16 + LFD a8, 5 * SIZE(AO8) + + LFD b1, 9 * SIZE(BO) + LFD b2, 10 * SIZE(BO) + LFD b3, 11 * SIZE(BO) + LFD b4, 12 * SIZE(BO) + + FMADD y01, a1, b5, y01 + LFD a1, 6 * SIZE(AO1) + FMADD y02, a2, b5, y02 + LFD a2, 6 * SIZE(AO2) + + FMADD y03, a3, b5, y03 + LFD a3, 6 * SIZE(AO3) + FMADD y04, a4, b5, y04 + LFD a4, 6 * SIZE(AO4) + + FMADD y05, a5, b5, y05 + LFD a5, 6 * SIZE(AO5) + FMADD y06, a6, b5, y06 + LFD a6, 6 * SIZE(AO6) + + FMADD y07, a7, b5, y07 + LFD a7, 6 * SIZE(AO7) + FMADD y08, a8, b5, y08 + LFD a8, 6 * SIZE(AO8) + + FMADD y09, a1, b6, y09 + LFD a1, 7 * SIZE(AO1) + FMADD y10, a2, b6, y10 + LFD a2, 7 * SIZE(AO2) + + FMADD y11, a3, b6, y11 + LFD a3, 7 * SIZE(AO3) + FMADD y12, a4, b6, y12 + LFD a4, 7 * SIZE(AO4) + + FMADD y13, a5, b6, y13 + LFD a5, 7 * SIZE(AO5) + FMADD y14, a6, b6, y14 + LFD a6, 7 * SIZE(AO6) + + FMADD y15, a7, b6, y15 + LFD a7, 7 * SIZE(AO7) + FMADD y16, a8, b6, y16 + LFD a8, 7 * SIZE(AO8) + + FMADD y01, a1, b7, y01 + LFD a1, 8 * SIZE(AO1) + FMADD y02, a2, b7, y02 + LFD a2, 8 * SIZE(AO2) + + FMADD y03, a3, b7, y03 + LFD a3, 8 * SIZE(AO3) + FMADD y04, a4, b7, y04 + LFD a4, 8 * SIZE(AO4) + + FMADD y05, a5, b7, y05 + LFD a5, 8 * SIZE(AO5) + FMADD y06, a6, b7, y06 + LFD a6, 8 * SIZE(AO6) + + FMADD y07, a7, b7, y07 + LFD a7, 8 * SIZE(AO7) + FMADD y08, a8, b7, y08 + LFD a8, 8 * SIZE(AO8) + + FMADD y09, a1, b8, y09 + LFD a1, 9 * SIZE(AO1) + FMADD y10, a2, b8, y10 + LFD a2, 9 * SIZE(AO2) + + FMADD y11, a3, b8, y11 + LFD a3, 9 * SIZE(AO3) + FMADD y12, a4, b8, y12 + LFD a4, 9 * SIZE(AO4) + + FMADD y13, a5, b8, y13 + LFD a5, 9 * SIZE(AO5) + FMADD y14, a6, b8, y14 + LFD a6, 9 * SIZE(AO6) + + FMADD y15, a7, b8, y15 + LFD a7, 9 * SIZE(AO7) + FMADD y16, a8, b8, y16 + LFD a8, 9 * SIZE(AO8) + + LFD b5, 13 * SIZE(BO) + LFD b6, 14 * SIZE(BO) + LFD b7, 15 * SIZE(BO) + LFD b8, 16 * SIZE(BO) + + DCBT(AO1, PREA) + DCBT(AO2, PREA) + DCBT(AO3, PREA) + DCBT(AO4, PREA) + + FMADD y01, a1, b1, y01 + LFD a1, 10 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 10 * SIZE(AO2) + + FMADD y03, a3, b1, y03 + LFD a3, 10 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFD a4, 10 * SIZE(AO4) + + FMADD y05, a5, b1, y05 + LFD a5, 10 * SIZE(AO5) + FMADD y06, a6, b1, y06 + LFD a6, 10 * SIZE(AO6) + + FMADD y07, a7, b1, y07 + LFD a7, 10 * SIZE(AO7) + FMADD y08, a8, b1, y08 + LFD a8, 10 * SIZE(AO8) + + FMADD y09, a1, b2, y09 + LFD a1, 11 * SIZE(AO1) + FMADD y10, a2, b2, y10 + LFD a2, 11 * SIZE(AO2) + + FMADD y11, a3, b2, y11 + LFD a3, 11 * SIZE(AO3) + FMADD y12, a4, b2, y12 + LFD a4, 11 * SIZE(AO4) + + FMADD y13, a5, b2, y13 + LFD a5, 11 * SIZE(AO5) + FMADD y14, a6, b2, y14 + LFD a6, 11 * SIZE(AO6) + + FMADD y15, a7, b2, y15 + LFD a7, 11 * SIZE(AO7) + FMADD y16, a8, b2, y16 + LFD a8, 11 * SIZE(AO8) + + FMADD y01, a1, b3, y01 + LFD a1, 12 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFD a2, 12 * SIZE(AO2) + + FMADD y03, a3, b3, y03 + LFD a3, 12 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFD a4, 12 * SIZE(AO4) + + FMADD y05, a5, b3, y05 + LFD a5, 12 * SIZE(AO5) + FMADD y06, a6, b3, y06 + LFD a6, 12 * SIZE(AO6) + + FMADD y07, a7, b3, y07 + LFD a7, 12 * SIZE(AO7) + FMADD y08, a8, b3, y08 + LFD a8, 12 * SIZE(AO8) + + FMADD y09, a1, b4, y09 + LFD a1, 13 * SIZE(AO1) + FMADD y10, a2, b4, y10 + LFD a2, 13 * SIZE(AO2) + + FMADD y11, a3, b4, y11 + LFD a3, 13 * SIZE(AO3) + FMADD y12, a4, b4, y12 + LFD a4, 13 * SIZE(AO4) + + FMADD y13, a5, b4, y13 + LFD a5, 13 * SIZE(AO5) + FMADD y14, a6, b4, y14 + LFD a6, 13 * SIZE(AO6) + + FMADD y15, a7, b4, y15 + LFD a7, 13 * SIZE(AO7) + FMADD y16, a8, b4, y16 + LFD a8, 13 * SIZE(AO8) + + LFD b1, 17 * SIZE(BO) + LFD b2, 18 * SIZE(BO) + LFD b3, 19 * SIZE(BO) + LFD b4, 20 * SIZE(BO) + + FMADD y01, a1, b5, y01 + LFD a1, 14 * SIZE(AO1) + FMADD y02, a2, b5, y02 + LFD a2, 14 * SIZE(AO2) + + FMADD y03, a3, b5, y03 + LFD a3, 14 * SIZE(AO3) + FMADD y04, a4, b5, y04 + LFD a4, 14 * SIZE(AO4) + + FMADD y05, a5, b5, y05 + LFD a5, 14 * SIZE(AO5) + FMADD y06, a6, b5, y06 + LFD a6, 14 * SIZE(AO6) + + FMADD y07, a7, b5, y07 + LFD a7, 14 * SIZE(AO7) + FMADD y08, a8, b5, y08 + LFD a8, 14 * SIZE(AO8) + + FMADD y09, a1, b6, y09 + LFD a1, 15 * SIZE(AO1) + FMADD y10, a2, b6, y10 + LFD a2, 15 * SIZE(AO2) + + FMADD y11, a3, b6, y11 + LFD a3, 15 * SIZE(AO3) + FMADD y12, a4, b6, y12 + LFD a4, 15 * SIZE(AO4) + + FMADD y13, a5, b6, y13 + LFD a5, 15 * SIZE(AO5) + FMADD y14, a6, b6, y14 + LFD a6, 15 * SIZE(AO6) + + FMADD y15, a7, b6, y15 + LFD a7, 15 * SIZE(AO7) + FMADD y16, a8, b6, y16 + LFD a8, 15 * SIZE(AO8) + + FMADD y01, a1, b7, y01 + LFD a1, 16 * SIZE(AO1) + FMADD y02, a2, b7, y02 + LFD a2, 16 * SIZE(AO2) + + FMADD y03, a3, b7, y03 + LFD a3, 16 * SIZE(AO3) + FMADD y04, a4, b7, y04 + LFD a4, 16 * SIZE(AO4) + + FMADD y05, a5, b7, y05 + LFD a5, 16 * SIZE(AO5) + FMADD y06, a6, b7, y06 + LFD a6, 16 * SIZE(AO6) + + FMADD y07, a7, b7, y07 + LFD a7, 16 * SIZE(AO7) + FMADD y08, a8, b7, y08 + LFD a8, 16 * SIZE(AO8) + + FMADD y09, a1, b8, y09 + LFD a1, 17 * SIZE(AO1) + FMADD y10, a2, b8, y10 + LFD a2, 17 * SIZE(AO2) + + FMADD y11, a3, b8, y11 + LFD a3, 17 * SIZE(AO3) + FMADD y12, a4, b8, y12 + LFD a4, 17 * SIZE(AO4) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi AO3, AO3, 16 * SIZE + addi AO4, AO4, 16 * SIZE + + FMADD y13, a5, b8, y13 + LFD a5, 17 * SIZE(AO5) + FMADD y14, a6, b8, y14 + LFD a6, 17 * SIZE(AO6) + + FMADD y15, a7, b8, y15 + LFD a7, 17 * SIZE(AO7) + FMADD y16, a8, b8, y16 + LFD a8, 17 * SIZE(AO8) + + LFD b5, 21 * SIZE(BO) + LFD b6, 22 * SIZE(BO) + LFD b7, 23 * SIZE(BO) + LFD b8, 24 * SIZE(BO) + + addi AO5, AO5, 16 * SIZE + addi AO6, AO6, 16 * SIZE + DCBT(AO5, PREA) + DCBT(AO6, PREA) + + addi AO7, AO7, 16 * SIZE + addi AO8, AO8, 16 * SIZE + DCBT(AO7, PREA) + DCBT(AO8, PREA) + + addi BO, BO, 16 * SIZE + bdnz LL(12) + .align 4 + +LL(13): + FMADD y01, a1, b1, y01 + LFD a1, 2 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 2 * SIZE(AO2) + + FMADD y03, a3, b1, y03 + LFD a3, 2 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFD a4, 2 * SIZE(AO4) + + FMADD y05, a5, b1, y05 + LFD a5, 2 * SIZE(AO5) + FMADD y06, a6, b1, y06 + LFD a6, 2 * SIZE(AO6) + + FMADD y07, a7, b1, y07 + LFD a7, 2 * SIZE(AO7) + FMADD y08, a8, b1, y08 + LFD a8, 2 * SIZE(AO8) + + FMADD y09, a1, b2, y09 + LFD a1, 3 * SIZE(AO1) + FMADD y10, a2, b2, y10 + LFD a2, 3 * SIZE(AO2) + + FMADD y11, a3, b2, y11 + LFD a3, 3 * SIZE(AO3) + FMADD y12, a4, b2, y12 + LFD a4, 3 * SIZE(AO4) + + FMADD y13, a5, b2, y13 + LFD a5, 3 * SIZE(AO5) + FMADD y14, a6, b2, y14 + LFD a6, 3 * SIZE(AO6) + + FMADD y15, a7, b2, y15 + LFD a7, 3 * SIZE(AO7) + FMADD y16, a8, b2, y16 + LFD a8, 3 * SIZE(AO8) + + FMADD y01, a1, b3, y01 + LFD a1, 4 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFD a2, 4 * SIZE(AO2) + + FMADD y03, a3, b3, y03 + LFD a3, 4 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFD a4, 4 * SIZE(AO4) + + FMADD y05, a5, b3, y05 + LFD a5, 4 * SIZE(AO5) + FMADD y06, a6, b3, y06 + LFD a6, 4 * SIZE(AO6) + + FMADD y07, a7, b3, y07 + LFD a7, 4 * SIZE(AO7) + FMADD y08, a8, b3, y08 + LFD a8, 4 * SIZE(AO8) + + FMADD y09, a1, b4, y09 + LFD a1, 5 * SIZE(AO1) + FMADD y10, a2, b4, y10 + LFD a2, 5 * SIZE(AO2) + + FMADD y11, a3, b4, y11 + LFD a3, 5 * SIZE(AO3) + FMADD y12, a4, b4, y12 + LFD a4, 5 * SIZE(AO4) + + FMADD y13, a5, b4, y13 + LFD a5, 5 * SIZE(AO5) + FMADD y14, a6, b4, y14 + LFD a6, 5 * SIZE(AO6) + + FMADD y15, a7, b4, y15 + LFD a7, 5 * SIZE(AO7) + FMADD y16, a8, b4, y16 + LFD a8, 5 * SIZE(AO8) + + LFD b1, 9 * SIZE(BO) + LFD b2, 10 * SIZE(BO) + LFD b3, 11 * SIZE(BO) + LFD b4, 12 * SIZE(BO) + + FMADD y01, a1, b5, y01 + LFD a1, 6 * SIZE(AO1) + FMADD y02, a2, b5, y02 + LFD a2, 6 * SIZE(AO2) + + FMADD y03, a3, b5, y03 + LFD a3, 6 * SIZE(AO3) + FMADD y04, a4, b5, y04 + LFD a4, 6 * SIZE(AO4) + + FMADD y05, a5, b5, y05 + LFD a5, 6 * SIZE(AO5) + FMADD y06, a6, b5, y06 + LFD a6, 6 * SIZE(AO6) + + FMADD y07, a7, b5, y07 + LFD a7, 6 * SIZE(AO7) + FMADD y08, a8, b5, y08 + LFD a8, 6 * SIZE(AO8) + + FMADD y09, a1, b6, y09 + LFD a1, 7 * SIZE(AO1) + FMADD y10, a2, b6, y10 + LFD a2, 7 * SIZE(AO2) + + FMADD y11, a3, b6, y11 + LFD a3, 7 * SIZE(AO3) + FMADD y12, a4, b6, y12 + LFD a4, 7 * SIZE(AO4) + + FMADD y13, a5, b6, y13 + LFD a5, 7 * SIZE(AO5) + FMADD y14, a6, b6, y14 + LFD a6, 7 * SIZE(AO6) + + FMADD y15, a7, b6, y15 + LFD a7, 7 * SIZE(AO7) + FMADD y16, a8, b6, y16 + LFD a8, 7 * SIZE(AO8) + + FMADD y01, a1, b7, y01 + LFD a1, 8 * SIZE(AO1) + FMADD y02, a2, b7, y02 + LFD a2, 8 * SIZE(AO2) + + FMADD y03, a3, b7, y03 + LFD a3, 8 * SIZE(AO3) + FMADD y04, a4, b7, y04 + LFD a4, 8 * SIZE(AO4) + + FMADD y05, a5, b7, y05 + LFD a5, 8 * SIZE(AO5) + FMADD y06, a6, b7, y06 + LFD a6, 8 * SIZE(AO6) + + FMADD y07, a7, b7, y07 + LFD a7, 8 * SIZE(AO7) + FMADD y08, a8, b7, y08 + LFD a8, 8 * SIZE(AO8) + + FMADD y09, a1, b8, y09 + LFD a1, 9 * SIZE(AO1) + FMADD y10, a2, b8, y10 + LFD a2, 9 * SIZE(AO2) + + FMADD y11, a3, b8, y11 + LFD a3, 9 * SIZE(AO3) + FMADD y12, a4, b8, y12 + LFD a4, 9 * SIZE(AO4) + + FMADD y13, a5, b8, y13 + LFD a5, 9 * SIZE(AO5) + FMADD y14, a6, b8, y14 + LFD a6, 9 * SIZE(AO6) + + FMADD y15, a7, b8, y15 + LFD a7, 9 * SIZE(AO7) + FMADD y16, a8, b8, y16 + LFD a8, 9 * SIZE(AO8) + + LFD b5, 13 * SIZE(BO) + LFD b6, 14 * SIZE(BO) + LFD b7, 15 * SIZE(BO) + LFD b8, 16 * SIZE(BO) + + FMADD y01, a1, b1, y01 + LFD a1, 10 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 10 * SIZE(AO2) + + FMADD y03, a3, b1, y03 + LFD a3, 10 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFD a4, 10 * SIZE(AO4) + + FMADD y05, a5, b1, y05 + LFD a5, 10 * SIZE(AO5) + FMADD y06, a6, b1, y06 + LFD a6, 10 * SIZE(AO6) + + FMADD y07, a7, b1, y07 + LFD a7, 10 * SIZE(AO7) + FMADD y08, a8, b1, y08 + LFD a8, 10 * SIZE(AO8) + + FMADD y09, a1, b2, y09 + LFD a1, 11 * SIZE(AO1) + FMADD y10, a2, b2, y10 + LFD a2, 11 * SIZE(AO2) + + FMADD y11, a3, b2, y11 + LFD a3, 11 * SIZE(AO3) + FMADD y12, a4, b2, y12 + LFD a4, 11 * SIZE(AO4) + + FMADD y13, a5, b2, y13 + LFD a5, 11 * SIZE(AO5) + FMADD y14, a6, b2, y14 + LFD a6, 11 * SIZE(AO6) + + FMADD y15, a7, b2, y15 + LFD a7, 11 * SIZE(AO7) + FMADD y16, a8, b2, y16 + LFD a8, 11 * SIZE(AO8) + + FMADD y01, a1, b3, y01 + LFD a1, 12 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFD a2, 12 * SIZE(AO2) + + FMADD y03, a3, b3, y03 + LFD a3, 12 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFD a4, 12 * SIZE(AO4) + + FMADD y05, a5, b3, y05 + LFD a5, 12 * SIZE(AO5) + FMADD y06, a6, b3, y06 + LFD a6, 12 * SIZE(AO6) + + FMADD y07, a7, b3, y07 + LFD a7, 12 * SIZE(AO7) + FMADD y08, a8, b3, y08 + LFD a8, 12 * SIZE(AO8) + + FMADD y09, a1, b4, y09 + LFD a1, 13 * SIZE(AO1) + FMADD y10, a2, b4, y10 + LFD a2, 13 * SIZE(AO2) + + FMADD y11, a3, b4, y11 + LFD a3, 13 * SIZE(AO3) + FMADD y12, a4, b4, y12 + LFD a4, 13 * SIZE(AO4) + + FMADD y13, a5, b4, y13 + LFD a5, 13 * SIZE(AO5) + FMADD y14, a6, b4, y14 + LFD a6, 13 * SIZE(AO6) + + FMADD y15, a7, b4, y15 + LFD a7, 13 * SIZE(AO7) + FMADD y16, a8, b4, y16 + LFD a8, 13 * SIZE(AO8) + + FMADD y01, a1, b5, y01 + LFD a1, 14 * SIZE(AO1) + FMADD y02, a2, b5, y02 + LFD a2, 14 * SIZE(AO2) + + FMADD y03, a3, b5, y03 + LFD a3, 14 * SIZE(AO3) + FMADD y04, a4, b5, y04 + LFD a4, 14 * SIZE(AO4) + + FMADD y05, a5, b5, y05 + LFD a5, 14 * SIZE(AO5) + FMADD y06, a6, b5, y06 + LFD a6, 14 * SIZE(AO6) + + FMADD y07, a7, b5, y07 + LFD a7, 14 * SIZE(AO7) + FMADD y08, a8, b5, y08 + LFD a8, 14 * SIZE(AO8) + + FMADD y09, a1, b6, y09 + LFD a1, 15 * SIZE(AO1) + FMADD y10, a2, b6, y10 + LFD a2, 15 * SIZE(AO2) + + FMADD y11, a3, b6, y11 + LFD a3, 15 * SIZE(AO3) + FMADD y12, a4, b6, y12 + LFD a4, 15 * SIZE(AO4) + + FMADD y13, a5, b6, y13 + LFD a5, 15 * SIZE(AO5) + FMADD y14, a6, b6, y14 + LFD a6, 15 * SIZE(AO6) + + FMADD y15, a7, b6, y15 + LFD a7, 15 * SIZE(AO7) + FMADD y16, a8, b6, y16 + LFD a8, 15 * SIZE(AO8) + + FMADD y01, a1, b7, y01 + LFD a1, 16 * SIZE(AO1) + FMADD y02, a2, b7, y02 + LFD a2, 16 * SIZE(AO2) + + FMADD y03, a3, b7, y03 + LFD a3, 16 * SIZE(AO3) + FMADD y04, a4, b7, y04 + LFD a4, 16 * SIZE(AO4) + + FMADD y05, a5, b7, y05 + LFD a5, 16 * SIZE(AO5) + FMADD y06, a6, b7, y06 + LFD a6, 16 * SIZE(AO6) + + FMADD y07, a7, b7, y07 + LFD a7, 16 * SIZE(AO7) + FMADD y08, a8, b7, y08 + LFD a8, 16 * SIZE(AO8) + + FMADD y09, a1, b8, y09 + FMADD y10, a2, b8, y10 + FMADD y11, a3, b8, y11 + FMADD y12, a4, b8, y12 + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi AO3, AO3, 16 * SIZE + addi AO4, AO4, 16 * SIZE + + FMADD y13, a5, b8, y13 + FMADD y14, a6, b8, y14 + FMADD y15, a7, b8, y15 + FMADD y16, a8, b8, y16 + + addi AO5, AO5, 16 * SIZE + addi AO6, AO6, 16 * SIZE + addi AO7, AO7, 16 * SIZE + addi AO8, AO8, 16 * SIZE + addi BO, BO, 16 * SIZE + .align 4 + +LL(14): + andi. r0, MIN_N, 15 + ble LL(18) + + andi. r0, MIN_N, 8 + ble LL(15) + + LFD a1, 1 * SIZE(AO1) + LFD b1, 1 * SIZE(BO) + LFD a2, 1 * SIZE(AO2) + LFD a3, 1 * SIZE(AO3) + LFD a4, 1 * SIZE(AO4) + LFD a5, 1 * SIZE(AO5) + LFD a6, 1 * SIZE(AO6) + LFD a7, 1 * SIZE(AO7) + LFD a8, 1 * SIZE(AO8) + + LFD b2, 2 * SIZE(BO) + LFD b3, 3 * SIZE(BO) + LFD b4, 4 * SIZE(BO) + + FMADD y01, a1, b1, y01 + LFD a1, 2 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 2 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFD a3, 2 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFD a4, 2 * SIZE(AO4) + FMADD y05, a5, b1, y05 + LFD a5, 2 * SIZE(AO5) + FMADD y06, a6, b1, y06 + LFD a6, 2 * SIZE(AO6) + FMADD y07, a7, b1, y07 + LFD a7, 2 * SIZE(AO7) + FMADD y08, a8, b1, y08 + LFD a8, 2 * SIZE(AO8) + + FMADD y09, a1, b2, y09 + LFD a1, 3 * SIZE(AO1) + FMADD y10, a2, b2, y10 + LFD a2, 3 * SIZE(AO2) + FMADD y11, a3, b2, y11 + LFD a3, 3 * SIZE(AO3) + FMADD y12, a4, b2, y12 + LFD a4, 3 * SIZE(AO4) + FMADD y13, a5, b2, y13 + LFD a5, 3 * SIZE(AO5) + FMADD y14, a6, b2, y14 + LFD a6, 3 * SIZE(AO6) + FMADD y15, a7, b2, y15 + LFD a7, 3 * SIZE(AO7) + FMADD y16, a8, b2, y16 + LFD a8, 3 * SIZE(AO8) + + LFD b5, 5 * SIZE(BO) + LFD b6, 6 * SIZE(BO) + LFD b7, 7 * SIZE(BO) + LFD b8, 8 * SIZE(BO) + + FMADD y01, a1, b3, y01 + LFD a1, 4 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFD a2, 4 * SIZE(AO2) + FMADD y03, a3, b3, y03 + LFD a3, 4 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFD a4, 4 * SIZE(AO4) + FMADD y05, a5, b3, y05 + LFD a5, 4 * SIZE(AO5) + FMADD y06, a6, b3, y06 + LFD a6, 4 * SIZE(AO6) + FMADD y07, a7, b3, y07 + LFD a7, 4 * SIZE(AO7) + FMADD y08, a8, b3, y08 + LFD a8, 4 * SIZE(AO8) + + FMADD y09, a1, b4, y09 + LFD a1, 5 * SIZE(AO1) + FMADD y10, a2, b4, y10 + LFD a2, 5 * SIZE(AO2) + FMADD y11, a3, b4, y11 + LFD a3, 5 * SIZE(AO3) + FMADD y12, a4, b4, y12 + LFD a4, 5 * SIZE(AO4) + FMADD y13, a5, b4, y13 + LFD a5, 5 * SIZE(AO5) + FMADD y14, a6, b4, y14 + LFD a6, 5 * SIZE(AO6) + FMADD y15, a7, b4, y15 + LFD a7, 5 * SIZE(AO7) + FMADD y16, a8, b4, y16 + LFD a8, 5 * SIZE(AO8) + + FMADD y01, a1, b5, y01 + LFD a1, 6 * SIZE(AO1) + FMADD y02, a2, b5, y02 + LFD a2, 6 * SIZE(AO2) + FMADD y03, a3, b5, y03 + LFD a3, 6 * SIZE(AO3) + FMADD y04, a4, b5, y04 + LFD a4, 6 * SIZE(AO4) + FMADD y05, a5, b5, y05 + LFD a5, 6 * SIZE(AO5) + FMADD y06, a6, b5, y06 + LFD a6, 6 * SIZE(AO6) + FMADD y07, a7, b5, y07 + LFD a7, 6 * SIZE(AO7) + FMADD y08, a8, b5, y08 + LFD a8, 6 * SIZE(AO8) + + FMADD y09, a1, b6, y09 + LFD a1, 7 * SIZE(AO1) + FMADD y10, a2, b6, y10 + LFD a2, 7 * SIZE(AO2) + FMADD y11, a3, b6, y11 + LFD a3, 7 * SIZE(AO3) + FMADD y12, a4, b6, y12 + LFD a4, 7 * SIZE(AO4) + FMADD y13, a5, b6, y13 + LFD a5, 7 * SIZE(AO5) + FMADD y14, a6, b6, y14 + LFD a6, 7 * SIZE(AO6) + FMADD y15, a7, b6, y15 + LFD a7, 7 * SIZE(AO7) + FMADD y16, a8, b6, y16 + LFD a8, 7 * SIZE(AO8) + + FMADD y01, a1, b7, y01 + LFD a1, 8 * SIZE(AO1) + FMADD y02, a2, b7, y02 + LFD a2, 8 * SIZE(AO2) + FMADD y03, a3, b7, y03 + LFD a3, 8 * SIZE(AO3) + FMADD y04, a4, b7, y04 + LFD a4, 8 * SIZE(AO4) + FMADD y05, a5, b7, y05 + LFD a5, 8 * SIZE(AO5) + FMADD y06, a6, b7, y06 + LFD a6, 8 * SIZE(AO6) + FMADD y07, a7, b7, y07 + LFD a7, 8 * SIZE(AO7) + FMADD y08, a8, b7, y08 + LFD a8, 8 * SIZE(AO8) + + FMADD y09, a1, b8, y09 + addi AO1, AO1, 8 * SIZE + FMADD y10, a2, b8, y10 + addi AO2, AO2, 8 * SIZE + FMADD y11, a3, b8, y11 + addi AO3, AO3, 8 * SIZE + FMADD y12, a4, b8, y12 + addi AO4, AO4, 8 * SIZE + FMADD y13, a5, b8, y13 + addi AO5, AO5, 8 * SIZE + FMADD y14, a6, b8, y14 + addi AO6, AO6, 8 * SIZE + FMADD y15, a7, b8, y15 + addi AO7, AO7, 8 * SIZE + FMADD y16, a8, b8, y16 + addi AO8, AO8, 8 * SIZE + addi BO, BO, 8 * SIZE + .align 4 + +LL(15): + andi. r0, MIN_N, 4 + ble LL(16) + + LFD a1, 1 * SIZE(AO1) + LFD b1, 1 * SIZE(BO) + LFD a2, 1 * SIZE(AO2) + LFD a3, 1 * SIZE(AO3) + LFD a4, 1 * SIZE(AO4) + LFD a5, 1 * SIZE(AO5) + LFD a6, 1 * SIZE(AO6) + LFD a7, 1 * SIZE(AO7) + LFD a8, 1 * SIZE(AO8) + + LFD b2, 2 * SIZE(BO) + LFD b3, 3 * SIZE(BO) + LFD b4, 4 * SIZE(BO) + + FMADD y01, a1, b1, y01 + LFD a1, 2 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 2 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFD a3, 2 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFD a4, 2 * SIZE(AO4) + FMADD y05, a5, b1, y05 + LFD a5, 2 * SIZE(AO5) + FMADD y06, a6, b1, y06 + LFD a6, 2 * SIZE(AO6) + FMADD y07, a7, b1, y07 + LFD a7, 2 * SIZE(AO7) + FMADD y08, a8, b1, y08 + LFD a8, 2 * SIZE(AO8) + + FMADD y09, a1, b2, y09 + LFD a1, 3 * SIZE(AO1) + FMADD y10, a2, b2, y10 + LFD a2, 3 * SIZE(AO2) + FMADD y11, a3, b2, y11 + LFD a3, 3 * SIZE(AO3) + FMADD y12, a4, b2, y12 + LFD a4, 3 * SIZE(AO4) + FMADD y13, a5, b2, y13 + LFD a5, 3 * SIZE(AO5) + FMADD y14, a6, b2, y14 + LFD a6, 3 * SIZE(AO6) + FMADD y15, a7, b2, y15 + LFD a7, 3 * SIZE(AO7) + FMADD y16, a8, b2, y16 + LFD a8, 3 * SIZE(AO8) + + FMADD y01, a1, b3, y01 + LFD a1, 4 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFD a2, 4 * SIZE(AO2) + FMADD y03, a3, b3, y03 + LFD a3, 4 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFD a4, 4 * SIZE(AO4) + + FMADD y05, a5, b3, y05 + LFD a5, 4 * SIZE(AO5) + FMADD y06, a6, b3, y06 + LFD a6, 4 * SIZE(AO6) + FMADD y07, a7, b3, y07 + LFD a7, 4 * SIZE(AO7) + FMADD y08, a8, b3, y08 + LFD a8, 4 * SIZE(AO8) + + FMADD y09, a1, b4, y09 + addi AO1, AO1, 4 * SIZE + FMADD y10, a2, b4, y10 + addi AO2, AO2, 4 * SIZE + FMADD y11, a3, b4, y11 + addi AO3, AO3, 4 * SIZE + FMADD y12, a4, b4, y12 + addi AO4, AO4, 4 * SIZE + FMADD y13, a5, b4, y13 + addi AO5, AO5, 4 * SIZE + FMADD y14, a6, b4, y14 + addi AO6, AO6, 4 * SIZE + FMADD y15, a7, b4, y15 + addi AO7, AO7, 4 * SIZE + FMADD y16, a8, b4, y16 + addi AO8, AO8, 4 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(16): + andi. r0, MIN_N, 2 + ble LL(17) + + LFD a1, 1 * SIZE(AO1) + LFD b1, 1 * SIZE(BO) + LFD a2, 1 * SIZE(AO2) + LFD a3, 1 * SIZE(AO3) + LFD a4, 1 * SIZE(AO4) + LFD a5, 1 * SIZE(AO5) + LFD a6, 1 * SIZE(AO6) + LFD a7, 1 * SIZE(AO7) + LFD a8, 1 * SIZE(AO8) + + LFD b2, 2 * SIZE(BO) + + FMADD y01, a1, b1, y01 + LFD a1, 2 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 2 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFD a3, 2 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFD a4, 2 * SIZE(AO4) + FMADD y05, a5, b1, y05 + LFD a5, 2 * SIZE(AO5) + FMADD y06, a6, b1, y06 + LFD a6, 2 * SIZE(AO6) + FMADD y07, a7, b1, y07 + LFD a7, 2 * SIZE(AO7) + FMADD y08, a8, b1, y08 + LFD a8, 2 * SIZE(AO8) + + FMADD y09, a1, b2, y09 + addi AO1, AO1, 2 * SIZE + addi AO2, AO2, 2 * SIZE + FMADD y10, a2, b2, y10 + addi AO3, AO3, 2 * SIZE + addi AO4, AO4, 2 * SIZE + FMADD y11, a3, b2, y11 + FMADD y12, a4, b2, y12 + addi AO5, AO5, 2 * SIZE + addi AO6, AO6, 2 * SIZE + FMADD y13, a5, b2, y13 + FMADD y14, a6, b2, y14 + addi AO7, AO7, 2 * SIZE + addi AO8, AO8, 2 * SIZE + FMADD y15, a7, b2, y15 + FMADD y16, a8, b2, y16 + addi BO, BO, 2 * SIZE + .align 4 + +LL(17): + andi. r0, MIN_N, 1 + ble LL(18) + + LFD a1, 1 * SIZE(AO1) + LFD b1, 1 * SIZE(BO) + LFD a2, 1 * SIZE(AO2) + LFD a3, 1 * SIZE(AO3) + LFD a4, 1 * SIZE(AO4) + LFD a5, 1 * SIZE(AO5) + LFD a6, 1 * SIZE(AO6) + LFD a7, 1 * SIZE(AO7) + LFD a8, 1 * SIZE(AO8) + + FMADD y01, a1, b1, y01 + FMADD y02, a2, b1, y02 + FMADD y03, a3, b1, y03 + FMADD y04, a4, b1, y04 + FMADD y05, a5, b1, y05 + FMADD y06, a6, b1, y06 + FMADD y07, a7, b1, y07 + FMADD y08, a8, b1, y08 + .align 4 + +LL(18): + mr BO, CO + lfd alpha, ALPHA + cmpi cr0, 0, INCY, SIZE + bne LL(19) + + LFD a1, 1 * SIZE(CO) + LFD a2, 2 * SIZE(CO) + LFD a3, 3 * SIZE(CO) + LFD a4, 4 * SIZE(CO) + LFD a5, 5 * SIZE(CO) + LFD a6, 6 * SIZE(CO) + LFD a7, 7 * SIZE(CO) + LFD a8, 8 * SIZE(CO) + + FADD y01, y09, y01 + FADD y02, y10, y02 + FADD y03, y11, y03 + FADD y04, y12, y04 + FADD y05, y13, y05 + FADD y06, y14, y06 + FADD y07, y15, y07 + FADD y08, y16, y08 + + FMADD a1, alpha, y01, a1 + FMADD a2, alpha, y02, a2 + FMADD a3, alpha, y03, a3 + FMADD a4, alpha, y04, a4 + FMADD a5, alpha, y05, a5 + FMADD a6, alpha, y06, a6 + FMADD a7, alpha, y07, a7 + FMADD a8, alpha, y08, a8 + + STFD a1, 1 * SIZE(CO) + STFD a2, 2 * SIZE(CO) + STFD a3, 3 * SIZE(CO) + STFD a4, 4 * SIZE(CO) + STFD a5, 5 * SIZE(CO) + STFD a6, 6 * SIZE(CO) + STFD a7, 7 * SIZE(CO) + STFD a8, 8 * SIZE(CO) + + addi J, J, -1 + addi CO, CO, 8 * SIZE + cmpi cr0, 0, J, 0 + bgt LL(11) + b LL(20) + .align 4 + +LL(19): + LFDUX a1, CO, INCY + LFDUX a2, CO, INCY + LFDUX a3, CO, INCY + LFDUX a4, CO, INCY + LFDUX a5, CO, INCY + LFDUX a6, CO, INCY + LFDUX a7, CO, INCY + LFDUX a8, CO, INCY + + FADD y01, y09, y01 + FADD y02, y10, y02 + FADD y03, y11, y03 + FADD y04, y12, y04 + FADD y05, y13, y05 + FADD y06, y14, y06 + FADD y07, y15, y07 + FADD y08, y16, y08 + + FMADD a1, alpha, f0, a1 + FMADD a2, alpha, f1, a2 + FMADD a3, alpha, f2, a3 + FMADD a4, alpha, f3, a4 + FMADD a5, alpha, f4, a5 + FMADD a6, alpha, f5, a6 + FMADD a7, alpha, f6, a7 + FMADD a8, alpha, f7, a8 + + STFDUX a1, BO, INCY + STFDUX a2, BO, INCY + STFDUX a3, BO, INCY + STFDUX a4, BO, INCY + STFDUX a5, BO, INCY + STFDUX a6, BO, INCY + STFDUX a7, BO, INCY + STFDUX a8, BO, INCY + + addi J, J, -1 + cmpi cr0, 0, J, 0 + bgt LL(11) + .align 4 + +LL(20): + andi. J, N, 7 + ble LL(99) + andi. J, N, 4 + ble LL(30) + + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + mr BO, XP + + lfd y01, FZERO + fmr y02, y01 + fmr y03, y01 + fmr y04, y01 + fmr y09, y01 + fmr y10, y01 + fmr y11, y01 + fmr y12, y01 + + DCBT(Y1, PREC) + + srawi. r0, MIN_N, 4 + mtspr CTR, r0 + ble LL(24) + + LFD a1, 1 * SIZE(AO1) + LFD a2, 1 * SIZE(AO2) + LFD a3, 1 * SIZE(AO3) + LFD a4, 1 * SIZE(AO4) + LFD a5, 2 * SIZE(AO1) + LFD a6, 2 * SIZE(AO2) + LFD a7, 2 * SIZE(AO3) + LFD a8, 2 * SIZE(AO4) + + LFD b1, 1 * SIZE(BO) + LFD b2, 2 * SIZE(BO) + LFD b3, 3 * SIZE(BO) + LFD b4, 4 * SIZE(BO) + LFD b5, 5 * SIZE(BO) + LFD b6, 6 * SIZE(BO) + LFD b7, 7 * SIZE(BO) + LFD b8, 8 * SIZE(BO) + bdz LL(23) + .align 4 + +LL(22): + FMADD y01, a1, b1, y01 + LFD a1, 3 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 3 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFD a3, 3 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFD a4, 3 * SIZE(AO4) + + FMADD y09, a5, b2, y09 + LFD a5, 4 * SIZE(AO1) + FMADD y10, a6, b2, y10 + LFD a6, 4 * SIZE(AO2) + FMADD y11, a7, b2, y11 + LFD a7, 4 * SIZE(AO3) + FMADD y12, a8, b2, y12 + LFD a8, 4 * SIZE(AO4) + + FMADD y01, a1, b3, y01 + LFD a1, 5 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFD a2, 5 * SIZE(AO2) + FMADD y03, a3, b3, y03 + LFD a3, 5 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFD a4, 5 * SIZE(AO4) + + FMADD y09, a5, b4, y09 + LFD a5, 6 * SIZE(AO1) + FMADD y10, a6, b4, y10 + LFD a6, 6 * SIZE(AO2) + FMADD y11, a7, b4, y11 + LFD a7, 6 * SIZE(AO3) + FMADD y12, a8, b4, y12 + LFD a8, 6 * SIZE(AO4) + + LFD b1, 9 * SIZE(BO) + LFD b2, 10 * SIZE(BO) + LFD b3, 11 * SIZE(BO) + LFD b4, 12 * SIZE(BO) + + FMADD y01, a1, b5, y01 + LFD a1, 7 * SIZE(AO1) + FMADD y02, a2, b5, y02 + LFD a2, 7 * SIZE(AO2) + FMADD y03, a3, b5, y03 + LFD a3, 7 * SIZE(AO3) + FMADD y04, a4, b5, y04 + LFD a4, 7 * SIZE(AO4) + + FMADD y09, a5, b6, y09 + LFD a5, 8 * SIZE(AO1) + FMADD y10, a6, b6, y10 + LFD a6, 8 * SIZE(AO2) + FMADD y11, a7, b6, y11 + LFD a7, 8 * SIZE(AO3) + FMADD y12, a8, b6, y12 + LFD a8, 8 * SIZE(AO4) + + FMADD y01, a1, b7, y01 + LFD a1, 9 * SIZE(AO1) + FMADD y02, a2, b7, y02 + LFD a2, 9 * SIZE(AO2) + FMADD y03, a3, b7, y03 + LFD a3, 9 * SIZE(AO3) + FMADD y04, a4, b7, y04 + LFD a4, 9 * SIZE(AO4) + + FMADD y09, a5, b8, y09 + LFD a5, 10 * SIZE(AO1) + FMADD y10, a6, b8, y10 + LFD a6, 10 * SIZE(AO2) + FMADD y11, a7, b8, y11 + LFD a7, 10 * SIZE(AO3) + FMADD y12, a8, b8, y12 + LFD a8, 10 * SIZE(AO4) + + LFD b5, 13 * SIZE(BO) + LFD b6, 14 * SIZE(BO) + LFD b7, 15 * SIZE(BO) + LFD b8, 16 * SIZE(BO) + + FMADD y01, a1, b1, y01 + LFD a1, 11 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 11 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFD a3, 11 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFD a4, 11 * SIZE(AO4) + + FMADD y09, a5, b2, y09 + LFD a5, 12 * SIZE(AO1) + FMADD y10, a6, b2, y10 + LFD a6, 12 * SIZE(AO2) + FMADD y11, a7, b2, y11 + LFD a7, 12 * SIZE(AO3) + FMADD y12, a8, b2, y12 + LFD a8, 12 * SIZE(AO4) + + FMADD y01, a1, b3, y01 + LFD a1, 13 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFD a2, 13 * SIZE(AO2) + FMADD y03, a3, b3, y03 + LFD a3, 13 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFD a4, 13 * SIZE(AO4) + + FMADD y09, a5, b4, y09 + LFD a5, 14 * SIZE(AO1) + FMADD y10, a6, b4, y10 + LFD a6, 14 * SIZE(AO2) + FMADD y11, a7, b4, y11 + LFD a7, 14 * SIZE(AO3) + FMADD y12, a8, b4, y12 + LFD a8, 14 * SIZE(AO4) + + LFD b1, 17 * SIZE(BO) + LFD b2, 18 * SIZE(BO) + LFD b3, 19 * SIZE(BO) + LFD b4, 20 * SIZE(BO) + + FMADD y01, a1, b5, y01 + LFD a1, 15 * SIZE(AO1) + FMADD y02, a2, b5, y02 + LFD a2, 15 * SIZE(AO2) + FMADD y03, a3, b5, y03 + LFD a3, 15 * SIZE(AO3) + FMADD y04, a4, b5, y04 + LFD a4, 15 * SIZE(AO4) + + FMADD y09, a5, b6, y09 + LFD a5, 16 * SIZE(AO1) + FMADD y10, a6, b6, y10 + LFD a6, 16 * SIZE(AO2) + FMADD y11, a7, b6, y11 + LFD a7, 16 * SIZE(AO3) + FMADD y12, a8, b6, y12 + LFD a8, 16 * SIZE(AO4) + + FMADD y01, a1, b7, y01 + LFD a1, 17 * SIZE(AO1) + FMADD y02, a2, b7, y02 + LFD a2, 17 * SIZE(AO2) + FMADD y03, a3, b7, y03 + LFD a3, 17 * SIZE(AO3) + FMADD y04, a4, b7, y04 + LFD a4, 17 * SIZE(AO4) + + FMADD y09, a5, b8, y09 + LFD a5, 18 * SIZE(AO1) + FMADD y10, a6, b8, y10 + LFD a6, 18 * SIZE(AO2) + FMADD y11, a7, b8, y11 + LFD a7, 18 * SIZE(AO3) + FMADD y12, a8, b8, y12 + LFD a8, 18 * SIZE(AO4) + + LFD b5, 21 * SIZE(BO) + LFD b6, 22 * SIZE(BO) + LFD b7, 23 * SIZE(BO) + LFD b8, 24 * SIZE(BO) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + DCBT(AO1, PREA) + DCBT(AO2, PREA) + + addi AO3, AO3, 16 * SIZE + addi AO4, AO4, 16 * SIZE + DCBT(AO3, PREA) + DCBT(AO4, PREA) + + addi BO, BO, 16 * SIZE + bdnz LL(22) + .align 4 + +LL(23): + FMADD y01, a1, b1, y01 + LFD a1, 3 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 3 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFD a3, 3 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFD a4, 3 * SIZE(AO4) + + FMADD y09, a5, b2, y09 + LFD a5, 4 * SIZE(AO1) + FMADD y10, a6, b2, y10 + LFD a6, 4 * SIZE(AO2) + FMADD y11, a7, b2, y11 + LFD a7, 4 * SIZE(AO3) + FMADD y12, a8, b2, y12 + LFD a8, 4 * SIZE(AO4) + + FMADD y01, a1, b3, y01 + LFD a1, 5 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFD a2, 5 * SIZE(AO2) + FMADD y03, a3, b3, y03 + LFD a3, 5 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFD a4, 5 * SIZE(AO4) + + FMADD y09, a5, b4, y09 + LFD a5, 6 * SIZE(AO1) + FMADD y10, a6, b4, y10 + LFD a6, 6 * SIZE(AO2) + FMADD y11, a7, b4, y11 + LFD a7, 6 * SIZE(AO3) + FMADD y12, a8, b4, y12 + LFD a8, 6 * SIZE(AO4) + + LFD b1, 9 * SIZE(BO) + LFD b2, 10 * SIZE(BO) + LFD b3, 11 * SIZE(BO) + LFD b4, 12 * SIZE(BO) + + FMADD y01, a1, b5, y01 + LFD a1, 7 * SIZE(AO1) + FMADD y02, a2, b5, y02 + LFD a2, 7 * SIZE(AO2) + FMADD y03, a3, b5, y03 + LFD a3, 7 * SIZE(AO3) + FMADD y04, a4, b5, y04 + LFD a4, 7 * SIZE(AO4) + + FMADD y09, a5, b6, y09 + LFD a5, 8 * SIZE(AO1) + FMADD y10, a6, b6, y10 + LFD a6, 8 * SIZE(AO2) + FMADD y11, a7, b6, y11 + LFD a7, 8 * SIZE(AO3) + FMADD y12, a8, b6, y12 + LFD a8, 8 * SIZE(AO4) + + FMADD y01, a1, b7, y01 + LFD a1, 9 * SIZE(AO1) + FMADD y02, a2, b7, y02 + LFD a2, 9 * SIZE(AO2) + FMADD y03, a3, b7, y03 + LFD a3, 9 * SIZE(AO3) + FMADD y04, a4, b7, y04 + LFD a4, 9 * SIZE(AO4) + + FMADD y09, a5, b8, y09 + LFD a5, 10 * SIZE(AO1) + FMADD y10, a6, b8, y10 + LFD a6, 10 * SIZE(AO2) + FMADD y11, a7, b8, y11 + LFD a7, 10 * SIZE(AO3) + FMADD y12, a8, b8, y12 + LFD a8, 10 * SIZE(AO4) + + LFD b5, 13 * SIZE(BO) + LFD b6, 14 * SIZE(BO) + LFD b7, 15 * SIZE(BO) + LFD b8, 16 * SIZE(BO) + + FMADD y01, a1, b1, y01 + LFD a1, 11 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 11 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFD a3, 11 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFD a4, 11 * SIZE(AO4) + + FMADD y09, a5, b2, y09 + LFD a5, 12 * SIZE(AO1) + FMADD y10, a6, b2, y10 + LFD a6, 12 * SIZE(AO2) + FMADD y11, a7, b2, y11 + LFD a7, 12 * SIZE(AO3) + FMADD y12, a8, b2, y12 + LFD a8, 12 * SIZE(AO4) + + FMADD y01, a1, b3, y01 + LFD a1, 13 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFD a2, 13 * SIZE(AO2) + FMADD y03, a3, b3, y03 + LFD a3, 13 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFD a4, 13 * SIZE(AO4) + + FMADD y09, a5, b4, y09 + LFD a5, 14 * SIZE(AO1) + FMADD y10, a6, b4, y10 + LFD a6, 14 * SIZE(AO2) + FMADD y11, a7, b4, y11 + LFD a7, 14 * SIZE(AO3) + FMADD y12, a8, b4, y12 + LFD a8, 14 * SIZE(AO4) + + FMADD y01, a1, b5, y01 + LFD a1, 15 * SIZE(AO1) + FMADD y02, a2, b5, y02 + LFD a2, 15 * SIZE(AO2) + FMADD y03, a3, b5, y03 + LFD a3, 15 * SIZE(AO3) + FMADD y04, a4, b5, y04 + LFD a4, 15 * SIZE(AO4) + + FMADD y09, a5, b6, y09 + LFD a5, 16 * SIZE(AO1) + FMADD y10, a6, b6, y10 + LFD a6, 16 * SIZE(AO2) + FMADD y11, a7, b6, y11 + LFD a7, 16 * SIZE(AO3) + FMADD y12, a8, b6, y12 + LFD a8, 16 * SIZE(AO4) + + FMADD y01, a1, b7, y01 + FMADD y02, a2, b7, y02 + FMADD y03, a3, b7, y03 + FMADD y04, a4, b7, y04 + + FMADD y09, a5, b8, y09 + FMADD y10, a6, b8, y10 + FMADD y11, a7, b8, y11 + FMADD y12, a8, b8, y12 + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi AO3, AO3, 16 * SIZE + addi AO4, AO4, 16 * SIZE + addi BO, BO, 16 * SIZE + .align 4 + +LL(24): + andi. r0, MIN_N, 15 + ble LL(28) + + andi. r0, MIN_N, 8 + ble LL(25) + + LFD a1, 1 * SIZE(AO1) + LFD a2, 1 * SIZE(AO2) + LFD a3, 1 * SIZE(AO3) + LFD a4, 1 * SIZE(AO4) + + LFD b1, 1 * SIZE(BO) + LFD b2, 2 * SIZE(BO) + LFD b3, 3 * SIZE(BO) + LFD b4, 4 * SIZE(BO) + + LFD a5, 2 * SIZE(AO1) + LFD a6, 2 * SIZE(AO2) + LFD a7, 2 * SIZE(AO3) + LFD a8, 2 * SIZE(AO4) + + FMADD y01, a1, b1, y01 + LFD a1, 3 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 3 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFD a3, 3 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFD a4, 3 * SIZE(AO4) + + FMADD y09, a5, b2, y09 + LFD a5, 4 * SIZE(AO1) + FMADD y10, a6, b2, y10 + LFD a6, 4 * SIZE(AO2) + FMADD y11, a7, b2, y11 + LFD a7, 4 * SIZE(AO3) + FMADD y12, a8, b2, y12 + LFD a8, 4 * SIZE(AO4) + + FMADD y01, a1, b3, y01 + LFD a1, 5 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFD a2, 5 * SIZE(AO2) + FMADD y03, a3, b3, y03 + LFD a3, 5 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFD a4, 5 * SIZE(AO4) + + FMADD y09, a5, b4, y09 + LFD a5, 6 * SIZE(AO1) + FMADD y10, a6, b4, y10 + LFD a6, 6 * SIZE(AO2) + FMADD y11, a7, b4, y11 + LFD a7, 6 * SIZE(AO3) + FMADD y12, a8, b4, y12 + LFD a8, 6 * SIZE(AO4) + + LFD b1, 5 * SIZE(BO) + LFD b2, 6 * SIZE(BO) + LFD b3, 7 * SIZE(BO) + LFD b4, 8 * SIZE(BO) + + FMADD y01, a1, b1, y01 + LFD a1, 7 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 7 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFD a3, 7 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFD a4, 7 * SIZE(AO4) + + FMADD y09, a5, b2, y09 + LFD a5, 8 * SIZE(AO1) + FMADD y10, a6, b2, y10 + LFD a6, 8 * SIZE(AO2) + FMADD y11, a7, b2, y11 + LFD a7, 8 * SIZE(AO3) + FMADD y12, a8, b2, y12 + LFD a8, 8 * SIZE(AO4) + + FMADD y01, a1, b3, y01 + FMADD y02, a2, b3, y02 + FMADD y03, a3, b3, y03 + FMADD y04, a4, b3, y04 + + FMADD y09, a5, b4, y09 + addi AO1, AO1, 8 * SIZE + FMADD y10, a6, b4, y10 + addi AO2, AO2, 8 * SIZE + FMADD y11, a7, b4, y11 + addi AO3, AO3, 8 * SIZE + FMADD y12, a8, b4, y12 + addi AO4, AO4, 8 * SIZE + + addi BO, BO, 8 * SIZE + .align 4 + +LL(25): + andi. r0, MIN_N, 4 + ble LL(26) + + LFD a1, 1 * SIZE(AO1) + LFD a2, 1 * SIZE(AO2) + LFD a3, 1 * SIZE(AO3) + LFD a4, 1 * SIZE(AO4) + + LFD b1, 1 * SIZE(BO) + LFD b2, 2 * SIZE(BO) + LFD b3, 3 * SIZE(BO) + LFD b4, 4 * SIZE(BO) + + LFD a5, 2 * SIZE(AO1) + LFD a6, 2 * SIZE(AO2) + LFD a7, 2 * SIZE(AO3) + LFD a8, 2 * SIZE(AO4) + + FMADD y01, a1, b1, y01 + LFD a1, 3 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 3 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFD a3, 3 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFD a4, 3 * SIZE(AO4) + + FMADD y09, a5, b2, y09 + LFD a5, 4 * SIZE(AO1) + FMADD y10, a6, b2, y10 + LFD a6, 4 * SIZE(AO2) + FMADD y11, a7, b2, y11 + LFD a7, 4 * SIZE(AO3) + FMADD y12, a8, b2, y12 + LFD a8, 4 * SIZE(AO4) + + FMADD y01, a1, b3, y01 + FMADD y02, a2, b3, y02 + FMADD y03, a3, b3, y03 + FMADD y04, a4, b3, y04 + + FMADD y09, a5, b4, y09 + addi AO1, AO1, 4 * SIZE + FMADD y10, a6, b4, y10 + addi AO2, AO2, 4 * SIZE + FMADD y11, a7, b4, y11 + addi AO3, AO3, 4 * SIZE + FMADD y12, a8, b4, y12 + addi AO4, AO4, 4 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(26): + andi. r0, MIN_N, 2 + ble LL(27) + + LFD a1, 1 * SIZE(AO1) + LFD a2, 1 * SIZE(AO2) + LFD b1, 1 * SIZE(BO) + LFD b2, 2 * SIZE(BO) + + LFD a3, 1 * SIZE(AO3) + LFD a4, 1 * SIZE(AO4) + + LFD a5, 2 * SIZE(AO1) + LFD a6, 2 * SIZE(AO2) + LFD a7, 2 * SIZE(AO3) + LFD a8, 2 * SIZE(AO4) + + FMADD y01, a1, b1, y01 + FMADD y02, a2, b1, y02 + FMADD y03, a3, b1, y03 + FMADD y04, a4, b1, y04 + + FMADD y09, a5, b2, y09 + addi AO1, AO1, 2 * SIZE + FMADD y10, a6, b2, y10 + addi AO2, AO2, 2 * SIZE + FMADD y11, a7, b2, y11 + addi AO3, AO3, 2 * SIZE + FMADD y12, a8, b2, y12 + addi AO4, AO4, 2 * SIZE + addi BO, BO, 2 * SIZE + .align 4 + +LL(27): + andi. r0, MIN_N, 1 + ble LL(28) + + LFD a1, 1 * SIZE(AO1) + LFD b1, 1 * SIZE(BO) + + LFD a2, 1 * SIZE(AO2) + LFD a3, 1 * SIZE(AO3) + LFD a4, 1 * SIZE(AO4) + + FMADD y01, a1, b1, y01 + FMADD y02, a2, b1, y02 + FMADD y03, a3, b1, y03 + FMADD y04, a4, b1, y04 + .align 4 + +LL(28): + mr BO, CO + lfd alpha, ALPHA + cmpi cr0, 0, INCY, SIZE + bne LL(29) + + LFD a1, 1 * SIZE(CO) + LFD a2, 2 * SIZE(CO) + LFD a3, 3 * SIZE(CO) + LFD a4, 4 * SIZE(CO) + + FADD y01, y09, y01 + FADD y02, y10, y02 + FADD y03, y11, y03 + FADD y04, y12, y04 + + FMADD a1, alpha, y01, a1 + FMADD a2, alpha, y02, a2 + FMADD a3, alpha, y03, a3 + FMADD a4, alpha, y04, a4 + + STFD a1, 1 * SIZE(CO) + STFD a2, 2 * SIZE(CO) + STFD a3, 3 * SIZE(CO) + STFD a4, 4 * SIZE(CO) + + addi CO, CO, 4 * SIZE + b LL(30) + .align 4 + +LL(29): + LFDUX a1, CO, INCY + LFDUX a2, CO, INCY + LFDUX a3, CO, INCY + LFDUX a4, CO, INCY + + FADD y01, y09, y01 + FADD y02, y10, y02 + FADD y03, y11, y03 + FADD y04, y12, y04 + + FMADD a1, alpha, f0, a1 + FMADD a2, alpha, f1, a2 + FMADD a3, alpha, f2, a3 + FMADD a4, alpha, f3, a4 + + STFDUX a1, BO, INCY + STFDUX a2, BO, INCY + STFDUX a3, BO, INCY + STFDUX a4, BO, INCY + .align 4 + +LL(30): + andi. J, N, 2 + ble LL(40) + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + mr BO, XP + + lfd y01, FZERO + fmr y02, y01 + fmr y03, y01 + fmr y04, y01 + fmr y09, y01 + fmr y10, y01 + fmr y11, y01 + fmr y12, y01 + + DCBT(Y1, PREC) + + srawi. r0, MIN_N, 4 + mtspr CTR, r0 + ble LL(34) + + LFD a1, 1 * SIZE(AO1) + LFD a2, 1 * SIZE(AO2) + LFD a3, 2 * SIZE(AO1) + LFD a4, 2 * SIZE(AO2) + LFD a5, 3 * SIZE(AO1) + LFD a6, 3 * SIZE(AO2) + LFD a7, 4 * SIZE(AO1) + LFD a8, 4 * SIZE(AO2) + + LFD b1, 1 * SIZE(BO) + LFD b2, 2 * SIZE(BO) + LFD b3, 3 * SIZE(BO) + LFD b4, 4 * SIZE(BO) + LFD b5, 5 * SIZE(BO) + LFD b6, 6 * SIZE(BO) + LFD b7, 7 * SIZE(BO) + LFD b8, 8 * SIZE(BO) + bdz LL(33) + .align 4 + +LL(32): + FMADD y01, a1, b1, y01 + LFD a1, 5 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 5 * SIZE(AO2) + FMADD y03, a3, b2, y03 + LFD a3, 6 * SIZE(AO1) + FMADD y04, a4, b2, y04 + LFD a4, 6 * SIZE(AO2) + + FMADD y09, a5, b3, y09 + LFD a5, 7 * SIZE(AO1) + FMADD y10, a6, b3, y10 + LFD a6, 7 * SIZE(AO2) + FMADD y11, a7, b4, y11 + LFD a7, 8 * SIZE(AO1) + FMADD y12, a8, b4, y12 + LFD a8, 8 * SIZE(AO2) + + LFD b1, 9 * SIZE(BO) + LFD b2, 10 * SIZE(BO) + LFD b3, 11 * SIZE(BO) + LFD b4, 12 * SIZE(BO) + + FMADD y01, a1, b5, y01 + LFD a1, 9 * SIZE(AO1) + FMADD y02, a2, b5, y02 + LFD a2, 9 * SIZE(AO2) + FMADD y03, a3, b6, y03 + LFD a3, 10 * SIZE(AO1) + FMADD y04, a4, b6, y04 + LFD a4, 10 * SIZE(AO2) + + FMADD y09, a5, b7, y09 + LFD a5, 11 * SIZE(AO1) + FMADD y10, a6, b7, y10 + LFD a6, 11 * SIZE(AO2) + FMADD y11, a7, b8, y11 + LFD a7, 12 * SIZE(AO1) + FMADD y12, a8, b8, y12 + LFD a8, 12 * SIZE(AO2) + + LFD b5, 13 * SIZE(BO) + LFD b6, 14 * SIZE(BO) + LFD b7, 15 * SIZE(BO) + LFD b8, 16 * SIZE(BO) + + FMADD y01, a1, b1, y01 + LFD a1, 13 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 13 * SIZE(AO2) + FMADD y03, a3, b2, y03 + LFD a3, 14 * SIZE(AO1) + FMADD y04, a4, b2, y04 + LFD a4, 14 * SIZE(AO2) + + FMADD y09, a5, b3, y09 + LFD a5, 15 * SIZE(AO1) + FMADD y10, a6, b3, y10 + LFD a6, 15 * SIZE(AO2) + FMADD y11, a7, b4, y11 + LFD a7, 16 * SIZE(AO1) + FMADD y12, a8, b4, y12 + LFD a8, 16 * SIZE(AO2) + + LFD b1, 17 * SIZE(BO) + LFD b2, 18 * SIZE(BO) + LFD b3, 19 * SIZE(BO) + LFD b4, 20 * SIZE(BO) + + FMADD y01, a1, b5, y01 + LFD a1, 17 * SIZE(AO1) + FMADD y02, a2, b5, y02 + LFD a2, 17 * SIZE(AO2) + FMADD y03, a3, b6, y03 + LFD a3, 18 * SIZE(AO1) + FMADD y04, a4, b6, y04 + LFD a4, 18 * SIZE(AO2) + + FMADD y09, a5, b7, y09 + LFD a5, 19 * SIZE(AO1) + FMADD y10, a6, b7, y10 + LFD a6, 19 * SIZE(AO2) + FMADD y11, a7, b8, y11 + LFD a7, 20 * SIZE(AO1) + FMADD y12, a8, b8, y12 + LFD a8, 20 * SIZE(AO2) + + LFD b5, 21 * SIZE(BO) + LFD b6, 22 * SIZE(BO) + LFD b7, 23 * SIZE(BO) + LFD b8, 24 * SIZE(BO) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + DCBT(AO1, PREA) + DCBT(AO2, PREA) + + addi BO, BO, 16 * SIZE + bdnz LL(32) + .align 4 + +LL(33): + FMADD y01, a1, b1, y01 + LFD a1, 5 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 5 * SIZE(AO2) + FMADD y03, a3, b2, y03 + LFD a3, 6 * SIZE(AO1) + FMADD y04, a4, b2, y04 + LFD a4, 6 * SIZE(AO2) + + FMADD y09, a5, b3, y09 + LFD a5, 7 * SIZE(AO1) + FMADD y10, a6, b3, y10 + LFD a6, 7 * SIZE(AO2) + FMADD y11, a7, b4, y11 + LFD a7, 8 * SIZE(AO1) + FMADD y12, a8, b4, y12 + LFD a8, 8 * SIZE(AO2) + + LFD b1, 9 * SIZE(BO) + LFD b2, 10 * SIZE(BO) + LFD b3, 11 * SIZE(BO) + LFD b4, 12 * SIZE(BO) + + FMADD y01, a1, b5, y01 + LFD a1, 9 * SIZE(AO1) + FMADD y02, a2, b5, y02 + LFD a2, 9 * SIZE(AO2) + FMADD y03, a3, b6, y03 + LFD a3, 10 * SIZE(AO1) + FMADD y04, a4, b6, y04 + LFD a4, 10 * SIZE(AO2) + + FMADD y09, a5, b7, y09 + LFD a5, 11 * SIZE(AO1) + FMADD y10, a6, b7, y10 + LFD a6, 11 * SIZE(AO2) + FMADD y11, a7, b8, y11 + LFD a7, 12 * SIZE(AO1) + FMADD y12, a8, b8, y12 + LFD a8, 12 * SIZE(AO2) + + LFD b5, 13 * SIZE(BO) + LFD b6, 14 * SIZE(BO) + LFD b7, 15 * SIZE(BO) + LFD b8, 16 * SIZE(BO) + + FMADD y01, a1, b1, y01 + LFD a1, 13 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 13 * SIZE(AO2) + FMADD y03, a3, b2, y03 + LFD a3, 14 * SIZE(AO1) + FMADD y04, a4, b2, y04 + LFD a4, 14 * SIZE(AO2) + + FMADD y09, a5, b3, y09 + LFD a5, 15 * SIZE(AO1) + FMADD y10, a6, b3, y10 + LFD a6, 15 * SIZE(AO2) + FMADD y11, a7, b4, y11 + LFD a7, 16 * SIZE(AO1) + FMADD y12, a8, b4, y12 + LFD a8, 16 * SIZE(AO2) + + FMADD y01, a1, b5, y01 + FMADD y02, a2, b5, y02 + FMADD y03, a3, b6, y03 + FMADD y04, a4, b6, y04 + + FMADD y09, a5, b7, y09 + FMADD y10, a6, b7, y10 + FMADD y11, a7, b8, y11 + FMADD y12, a8, b8, y12 + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi BO, BO, 16 * SIZE + .align 4 + +LL(34): + andi. r0, MIN_N, 15 + ble LL(38) + andi. r0, MIN_N, 8 + ble LL(35) + + LFD a1, 1 * SIZE(AO1) + LFD a2, 1 * SIZE(AO2) + LFD a3, 2 * SIZE(AO1) + LFD a4, 2 * SIZE(AO2) + + LFD b1, 1 * SIZE(BO) + LFD b2, 2 * SIZE(BO) + LFD b3, 3 * SIZE(BO) + LFD b4, 4 * SIZE(BO) + + LFD a5, 3 * SIZE(AO1) + LFD a6, 3 * SIZE(AO2) + LFD a7, 4 * SIZE(AO1) + LFD a8, 4 * SIZE(AO2) + + LFD b5, 5 * SIZE(BO) + LFD b6, 6 * SIZE(BO) + LFD b7, 7 * SIZE(BO) + LFD b8, 8 * SIZE(BO) + + FMADD y01, a1, b1, y01 + LFD a1, 5 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 5 * SIZE(AO2) + FMADD y09, a3, b2, y09 + LFD a3, 6 * SIZE(AO1) + FMADD y10, a4, b2, y10 + LFD a4, 6 * SIZE(AO2) + + FMADD y01, a5, b3, y01 + LFD a5, 7 * SIZE(AO1) + FMADD y02, a6, b3, y02 + LFD a6, 7 * SIZE(AO2) + FMADD y09, a7, b4, y09 + LFD a7, 8 * SIZE(AO1) + FMADD y10, a8, b4, y10 + LFD a8, 8 * SIZE(AO2) + + FMADD y01, a1, b5, y01 + FMADD y02, a2, b5, y02 + FMADD y09, a3, b6, y09 + FMADD y10, a4, b6, y10 + + FMADD y01, a5, b7, y01 + addi AO1, AO1, 8 * SIZE + FMADD y02, a6, b7, y02 + addi AO2, AO2, 8 * SIZE + FMADD y09, a7, b8, y09 + addi BO, BO, 8 * SIZE + FMADD y10, a8, b8, y10 + nop + .align 4 + +LL(35): + andi. r0, MIN_N, 4 + ble LL(36) + + LFD a1, 1 * SIZE(AO1) + LFD a2, 1 * SIZE(AO2) + LFD a3, 2 * SIZE(AO1) + LFD a4, 2 * SIZE(AO2) + + LFD a5, 3 * SIZE(AO1) + LFD a6, 3 * SIZE(AO2) + LFD a7, 4 * SIZE(AO1) + LFD a8, 4 * SIZE(AO2) + + LFD b1, 1 * SIZE(BO) + LFD b2, 2 * SIZE(BO) + LFD b3, 3 * SIZE(BO) + LFD b4, 4 * SIZE(BO) + + FMADD y01, a1, b1, y01 + FMADD y02, a2, b1, y02 + FMADD y09, a3, b2, y09 + FMADD y10, a4, b2, y10 + + FMADD y01, a5, b3, y01 + addi AO1, AO1, 4 * SIZE + FMADD y02, a6, b3, y02 + addi AO2, AO2, 4 * SIZE + + FMADD y09, a7, b4, y09 + addi BO, BO, 4 * SIZE + FMADD y10, a8, b4, y10 + .align 4 + +LL(36): + andi. r0, MIN_N, 2 + ble LL(37) + + LFD a1, 1 * SIZE(AO1) + LFD a2, 1 * SIZE(AO2) + LFD b1, 1 * SIZE(BO) + LFD b2, 2 * SIZE(BO) + + LFD a3, 2 * SIZE(AO1) + LFD a4, 2 * SIZE(AO2) + + FMADD y01, a1, b1, y01 + FMADD y02, a2, b1, y02 + FMADD y09, a3, b2, y09 + FMADD y10, a4, b2, y10 + + addi AO1, AO1, 2 * SIZE + addi AO2, AO2, 2 * SIZE + addi BO, BO, 2 * SIZE + .align 4 + +LL(37): + andi. r0, MIN_N, 1 + ble LL(38) + + LFD a1, 1 * SIZE(AO1) + LFD b1, 1 * SIZE(BO) + LFD a2, 1 * SIZE(AO2) + + FMADD y01, a1, b1, y01 + FMADD y02, a2, b1, y02 + .align 4 + +LL(38): + mr BO, CO + lfd alpha, ALPHA + cmpi cr0, 0, INCY, SIZE + bne LL(39) + + LFD a1, 1 * SIZE(CO) + LFD a2, 2 * SIZE(CO) + + FADD y01, y03, y01 + FADD y02, y04, y02 + FADD y09, y11, y09 + FADD y10, y12, y10 + + FADD y01, y09, y01 + FADD y02, y10, y02 + + FMADD a1, alpha, y01, a1 + FMADD a2, alpha, y02, a2 + + STFD a1, 1 * SIZE(CO) + STFD a2, 2 * SIZE(CO) + + addi CO, CO, 2 * SIZE + b LL(40) + .align 4 + +LL(39): + LFDUX a1, CO, INCY + LFDUX a2, CO, INCY + + FADD y01, y03, y01 + FADD y02, y04, y02 + FADD y09, y11, y09 + FADD y10, y12, y10 + + FADD y01, y09, y01 + FADD y02, y10, y02 + + FMADD a1, alpha, f0, a1 + FMADD a2, alpha, f1, a2 + + STFDUX a1, BO, INCY + STFDUX a2, BO, INCY + .align 4 + +LL(40): + andi. J, N, 1 + ble LL(99) + + mr AO1, A + add A, A, LDA + mr BO, XP + + lfd y01, FZERO + fmr y02, y01 + fmr y03, y01 + fmr y04, y01 + fmr y09, y01 + fmr y10, y01 + fmr y11, y01 + fmr y12, y01 + + DCBT(Y1, PREC) + + srawi. r0, MIN_N, 4 + mtspr CTR, r0 + ble LL(44) + + LFD a1, 1 * SIZE(AO1) + LFD a2, 2 * SIZE(AO1) + LFD a3, 3 * SIZE(AO1) + LFD a4, 4 * SIZE(AO1) + LFD a5, 5 * SIZE(AO1) + LFD a6, 6 * SIZE(AO1) + LFD a7, 7 * SIZE(AO1) + LFD a8, 8 * SIZE(AO1) + + LFD b1, 1 * SIZE(BO) + LFD b2, 2 * SIZE(BO) + LFD b3, 3 * SIZE(BO) + LFD b4, 4 * SIZE(BO) + LFD b5, 5 * SIZE(BO) + LFD b6, 6 * SIZE(BO) + LFD b7, 7 * SIZE(BO) + LFD b8, 8 * SIZE(BO) + bdz LL(43) + .align 4 + +LL(42): + FMADD y01, a1, b1, y01 + nop + LFD a1, 9 * SIZE(AO1) + LFD b1, 9 * SIZE(BO) + + FMADD y02, a2, b2, y02 + nop + LFD a2, 10 * SIZE(AO1) + LFD b2, 10 * SIZE(BO) + + FMADD y03, a3, b3, y03 + nop + LFD a3, 11 * SIZE(AO1) + LFD b3, 11 * SIZE(BO) + + FMADD y04, a4, b4, y04 + nop + LFD a4, 12 * SIZE(AO1) + LFD b4, 12 * SIZE(BO) + + FMADD y01, a5, b5, y01 + nop + LFD a5, 13 * SIZE(AO1) + LFD b5, 13 * SIZE(BO) + + FMADD y02, a6, b6, y02 + nop + LFD a6, 14 * SIZE(AO1) + LFD b6, 14 * SIZE(BO) + + FMADD y03, a7, b7, y03 + nop + LFD a7, 15 * SIZE(AO1) + LFD b7, 15 * SIZE(BO) + + FMADD y04, a8, b8, y04 + nop + LFD a8, 16 * SIZE(AO1) + LFD b8, 16 * SIZE(BO) + + FMADD y01, a1, b1, y01 + nop + LFD a1, 17 * SIZE(AO1) + LFD b1, 17 * SIZE(BO) + + FMADD y02, a2, b2, y02 + nop + LFD a2, 18 * SIZE(AO1) + LFD b2, 18 * SIZE(BO) + + FMADD y03, a3, b3, y03 + nop + LFD a3, 19 * SIZE(AO1) + LFD b3, 19 * SIZE(BO) + + FMADD y04, a4, b4, y04 + nop + LFD a4, 20 * SIZE(AO1) + LFD b4, 20 * SIZE(BO) + + FMADD y01, a5, b5, y01 + nop + LFD a5, 21 * SIZE(AO1) + LFD b5, 21 * SIZE(BO) + + FMADD y02, a6, b6, y02 + nop + LFD a6, 22 * SIZE(AO1) + LFD b6, 22 * SIZE(BO) + + FMADD y03, a7, b7, y03 + nop + LFD a7, 23 * SIZE(AO1) + LFD b7, 23 * SIZE(BO) + + FMADD y04, a8, b8, y04 + nop + LFD a8, 24 * SIZE(AO1) + LFD b8, 24 * SIZE(BO) + + addi AO1, AO1, 16 * SIZE + addi BO, BO, 16 * SIZE + DCBT(AO1, PREA) + bdnz LL(42) + .align 4 + +LL(43): + FMADD y01, a1, b1, y01 + nop + LFD a1, 9 * SIZE(AO1) + LFD b1, 9 * SIZE(BO) + + FMADD y02, a2, b2, y02 + nop + LFD a2, 10 * SIZE(AO1) + LFD b2, 10 * SIZE(BO) + + FMADD y03, a3, b3, y03 + nop + LFD a3, 11 * SIZE(AO1) + LFD b3, 11 * SIZE(BO) + + FMADD y04, a4, b4, y04 + nop + LFD a4, 12 * SIZE(AO1) + LFD b4, 12 * SIZE(BO) + + FMADD y01, a5, b5, y01 + nop + LFD a5, 13 * SIZE(AO1) + LFD b5, 13 * SIZE(BO) + + FMADD y02, a6, b6, y02 + nop + LFD a6, 14 * SIZE(AO1) + LFD b6, 14 * SIZE(BO) + + FMADD y03, a7, b7, y03 + nop + LFD a7, 15 * SIZE(AO1) + LFD b7, 15 * SIZE(BO) + + FMADD y04, a8, b8, y04 + nop + LFD a8, 16 * SIZE(AO1) + LFD b8, 16 * SIZE(BO) + + FMADD y01, a1, b1, y01 + FMADD y02, a2, b2, y02 + FMADD y03, a3, b3, y03 + FMADD y04, a4, b4, y04 + + FMADD y01, a5, b5, y01 + addi AO1, AO1, 16 * SIZE + FMADD y02, a6, b6, y02 + addi BO, BO, 16 * SIZE + + FMADD y03, a7, b7, y03 + nop + FMADD y04, a8, b8, y04 + nop + .align 4 + +LL(44): + andi. r0, MIN_N, 15 + ble LL(48) + andi. r0, MIN_N, 8 + ble LL(45) + + LFD a1, 1 * SIZE(AO1) + LFD a2, 2 * SIZE(AO1) + LFD a3, 3 * SIZE(AO1) + LFD a4, 4 * SIZE(AO1) + + LFD b1, 1 * SIZE(BO) + LFD b2, 2 * SIZE(BO) + LFD b3, 3 * SIZE(BO) + LFD b4, 4 * SIZE(BO) + + LFD a5, 5 * SIZE(AO1) + LFD a6, 6 * SIZE(AO1) + LFD a7, 7 * SIZE(AO1) + LFD a8, 8 * SIZE(AO1) + + LFD b5, 5 * SIZE(BO) + LFD b6, 6 * SIZE(BO) + LFD b7, 7 * SIZE(BO) + LFD b8, 8 * SIZE(BO) + + FMADD y01, a1, b1, y01 + FMADD y02, a2, b2, y02 + FMADD y03, a3, b3, y03 + FMADD y04, a4, b4, y04 + + FMADD y01, a5, b5, y01 + addi AO1, AO1, 8 * SIZE + FMADD y02, a6, b6, y02 + addi BO, BO, 8 * SIZE + FMADD y03, a7, b7, y03 + nop + FMADD y04, a8, b8, y04 + nop + .align 4 + +LL(45): + andi. r0, MIN_N, 4 + ble LL(46) + + LFD a1, 1 * SIZE(AO1) + LFD b1, 1 * SIZE(BO) + LFD a2, 2 * SIZE(AO1) + LFD b2, 2 * SIZE(BO) + + LFD a3, 3 * SIZE(AO1) + LFD b3, 3 * SIZE(BO) + LFD a4, 4 * SIZE(AO1) + LFD b4, 4 * SIZE(BO) + + FMADD y01, a1, b1, y01 + addi AO1, AO1, 4 * SIZE + FMADD y02, a2, b2, y02 + addi AO2, AO2, 4 * SIZE + + FMADD y03, a3, b3, y03 + addi BO, BO, 4 * SIZE + FMADD y04, a4, b4, y04 + nop + .align 4 + +LL(46): + andi. r0, MIN_N, 2 + ble LL(47) + + LFD a1, 1 * SIZE(AO1) + LFD b1, 1 * SIZE(BO) + LFD a2, 2 * SIZE(AO1) + LFD b2, 2 * SIZE(BO) + + FMADD y01, a1, b1, y01 + addi AO1, AO1, 2 * SIZE + FMADD y02, a2, b2, y02 + addi BO, BO, 2 * SIZE + .align 4 + +LL(47): + andi. r0, MIN_N, 1 + ble LL(48) + + LFD a1, 1 * SIZE(AO1) + LFD b1, 1 * SIZE(BO) + FMADD y01, a1, b1, y01 + .align 4 + +LL(48): + mr BO, CO + lfd alpha, ALPHA + cmpi cr0, 0, INCY, SIZE + bne LL(49) + + LFD a1, 1 * SIZE(CO) + + FADD y01, y02, y01 + FADD y03, y04, y03 + FADD y01, y03, y01 + + FMADD a1, alpha, y01, a1 + STFD a1, 1 * SIZE(CO) + b LL(99) + .align 4 + +LL(49): + LFDUX a1, CO, INCY + FADD y01, y02, y01 + FADD y03, y04, y03 + FADD y01, y03, y01 + FMADD a1, alpha, f0, a1 + STFDUX a1, BO, INCY + .align 4 + +LL(99): + subf A, PLDA_M, A + addi IS, IS, P + cmp cr0, 0, IS, M + blt LL(ISLoop) + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r14, 160(SP) + ld r15, 168(SP) + ld r16, 176(SP) + ld r17, 184(SP) + ld r18, 192(SP) + ld r19, 200(SP) + ld r20, 208(SP) + ld r21, 216(SP) + ld r22, 224(SP) + ld r23, 232(SP) + ld r24, 240(SP) + ld r25, 248(SP) + ld r26, 256(SP) + ld r27, 264(SP) + ld r28, 272(SP) + ld r29, 280(SP) +#else + lwz r14, 160(SP) + lwz r15, 164(SP) + lwz r16, 168(SP) + lwz r17, 172(SP) + lwz r18, 176(SP) + lwz r19, 180(SP) + lwz r20, 184(SP) + lwz r21, 188(SP) + lwz r22, 192(SP) + lwz r23, 196(SP) + lwz r24, 200(SP) + lwz r25, 204(SP) + lwz r26, 208(SP) + lwz r27, 212(SP) + lwz r28, 216(SP) + lwz r29, 220(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE + +#endif diff --git a/kernel/power/gemv_t_ppc440.S b/kernel/power/gemv_t_ppc440.S new file mode 100644 index 0000000..1aa59b2 --- /dev/null +++ b/kernel/power/gemv_t_ppc440.S @@ -0,0 +1,1089 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define M r3 +#define N r4 +#define A r6 +#define LDA r7 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#else +#define M r3 +#define N r4 +#define A r7 +#define LDA r8 +#define X r9 +#define INCX r10 +#define Y r5 +#define INCY r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define M r3 +#define N r4 +#define A r8 +#define LDA r9 +#define X r10 +#define INCX r5 +#define Y r6 +#define INCY r7 +#else +#define M r3 +#define N r4 +#define A r7 +#define LDA r8 +#define X r9 +#define INCX r10 +#define Y r5 +#define INCY r6 +#endif +#endif + +#define BUFFER r11 +#define XP r12 +#define AO1 r14 +#define AO2 r15 +#define AO3 r16 +#define AO4 r17 +#define J r18 +#define YY r19 +#define PREA r20 +#define PREC r21 +#define X1 r22 + + +#if defined(PPCG4) +#define PREFETCHSIZE_A 42 +#define PREFETCHSIZE_C 7 +#endif + +#if defined(POWER6) +#define PREFETCHSIZE_A 42 +#define PREFETCHSIZE_C 7 +#endif + +#define y01 f0 +#define y02 f1 +#define y03 f2 +#define y04 f3 +#define y05 f4 +#define y06 f5 +#define y07 f6 +#define y08 f7 + +#define a1 f8 +#define a2 f9 +#define a3 f10 +#define a4 f11 +#define a5 f12 +#define a6 f13 +#define a7 f14 +#define a8 f15 + +#define b1 f16 +#define b2 f17 +#define b3 f18 +#define b4 f19 +#define b5 f20 +#define b6 f21 +#define b7 f22 +#define b8 f23 + +#define alpha f23 + +#ifndef NEEDPARAM + +#ifndef __64BIT__ +#define STACKSIZE 224 +#else +#define STACKSIZE 288 +#endif + +#define FZERO 144(SP) +#define ALPHA 152(SP) + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + +#ifdef __64BIT__ + std r0, FZERO + stfd f1, ALPHA + std r14, 160(SP) + std r15, 168(SP) + std r16, 176(SP) + std r17, 184(SP) + std r18, 192(SP) + std r19, 200(SP) + std r20, 208(SP) + std r21, 216(SP) + std r22, 224(SP) +#else + stw r0, 0 + FZERO + stw r0, 4 + FZERO + stfd f1, ALPHA + stw r14, 160(SP) + stw r15, 164(SP) + stw r16, 168(SP) + stw r17, 172(SP) + stw r18, 176(SP) + stw r19, 180(SP) + stw r20, 184(SP) + stw r21, 188(SP) + stw r22, 192(SP) +#endif + +#ifdef linux +#ifndef __64BIT__ + lwz INCY, 8 + STACKSIZE(SP) + lwz BUFFER, 12 + STACKSIZE(SP) +#else + ld Y, 112 + STACKSIZE(SP) + ld INCY, 120 + STACKSIZE(SP) + ld BUFFER, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifndef __64BIT__ +#ifdef DOUBLE + lwz INCX, 56 + STACKSIZE(SP) + lwz Y, 60 + STACKSIZE(SP) + lwz INCY, 64 + STACKSIZE(SP) + lwz BUFFER, 68 + STACKSIZE(SP) +#else + lwz Y, 56 + STACKSIZE(SP) + lwz INCY, 60 + STACKSIZE(SP) + lwz BUFFER, 64 + STACKSIZE(SP) +#endif +#else + ld Y, 112 + STACKSIZE(SP) + ld INCY, 120 + STACKSIZE(SP) + ld BUFFER, 128 + STACKSIZE(SP) +#endif +#endif + + slwi LDA, LDA, BASE_SHIFT + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + addi A, A, -SIZE + sub X, X, INCX + sub Y, Y, INCY + + li PREA, PREFETCHSIZE_A * SIZE + li PREC, PREFETCHSIZE_C * SIZE + + cmpi cr0, 0, M, 0 + ble LL(999) + cmpi cr0, 0, N, 0 + ble LL(999) + + mr XP, X + + cmpi cr0, 0, INCX, SIZE + beq LL(10) + + addi XP, BUFFER, -SIZE + addi X1, BUFFER, -SIZE + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(CopyRemain) + .align 4 + +LL(CopyKernel): + LFDUX f0, X, INCX + LFDUX f1, X, INCX + LFDUX f2, X, INCX + LFDUX f3, X, INCX + LFDUX f4, X, INCX + LFDUX f5, X, INCX + LFDUX f6, X, INCX + LFDUX f7, X, INCX + + STFDU f0, 1 * SIZE(X1) + STFDU f1, 1 * SIZE(X1) + STFDU f2, 1 * SIZE(X1) + STFDU f3, 1 * SIZE(X1) + STFDU f4, 1 * SIZE(X1) + STFDU f5, 1 * SIZE(X1) + STFDU f6, 1 * SIZE(X1) + STFDU f7, 1 * SIZE(X1) + bdnz LL(CopyKernel) + .align 4 + +LL(CopyRemain): + andi. r0, M, 7 + mtspr CTR, r0 + ble LL(10) + .align 4 + +LL(CopySub): + LFDUX f0, X, INCX + STFDU f0, 1 * SIZE(X1) + bdnz LL(CopySub) + .align 4 + +LL(10): + mr YY, Y + + srawi. J, N, 2 + ble LL(30) + .align 4 + +LL(21): + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + mr X1, XP + + lfd y01, FZERO + fmr y02, y01 + fmr y03, y01 + fmr y04, y01 + fmr y05, y01 + fmr y06, y01 + fmr y07, y01 + fmr y08, y01 + + dcbtst Y, PREC + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(24) + + LFDU a1, 1 * SIZE(AO1) + LFDU a2, 1 * SIZE(AO2) + LFDU a3, 1 * SIZE(AO3) + LFDU a4, 1 * SIZE(AO4) + + LFDU b1, 1 * SIZE(X1) + LFDU b2, 1 * SIZE(X1) + + LFDU a5, 1 * SIZE(AO1) + LFDU a6, 1 * SIZE(AO2) + LFDU a7, 1 * SIZE(AO3) + LFDU a8, 1 * SIZE(AO4) + + LFDU b3, 1 * SIZE(X1) + LFDU b4, 1 * SIZE(X1) + bdz LL(23) + .align 4 + +LL(22): +#ifdef PPCG4 + dcbt X1, PREA +#endif + + FMADD y01, a1, b1, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFDU a2, 1 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFDU a3, 1 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFDU a4, 1 * SIZE(AO4) + + LFDU b1, 1 * SIZE(X1) +#ifdef PPCG4 + dcbt AO1, PREA +#endif + + FMADD y05, a5, b2, y05 + LFDU a5, 1 * SIZE(AO1) + FMADD y06, a6, b2, y06 + LFDU a6, 1 * SIZE(AO2) + FMADD y07, a7, b2, y07 + LFDU a7, 1 * SIZE(AO3) + FMADD y08, a8, b2, y08 + LFDU a8, 1 * SIZE(AO4) + + LFDU b2, 1 * SIZE(X1) +#ifdef PPCG4 + dcbt AO2, PREA +#endif + + FMADD y01, a1, b3, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFDU a2, 1 * SIZE(AO2) + FMADD y03, a3, b3, y03 + LFDU a3, 1 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFDU a4, 1 * SIZE(AO4) + + LFDU b3, 1 * SIZE(X1) +#ifdef PPCG4 + dcbt AO3, PREA +#endif + + FMADD y05, a5, b4, y05 + LFDU a5, 1 * SIZE(AO1) + FMADD y06, a6, b4, y06 + LFDU a6, 1 * SIZE(AO2) + FMADD y07, a7, b4, y07 + LFDU a7, 1 * SIZE(AO3) + FMADD y08, a8, b4, y08 + LFDU a8, 1 * SIZE(AO4) + +#ifdef PPCG4 + dcbt AO4, PREA +#endif + LFDU b4, 1 * SIZE(X1) +#if defined(PPCG4) && defined(DOUBLE) + dcbt X1, PREA +#endif + + FMADD y01, a1, b1, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFDU a2, 1 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFDU a3, 1 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFDU a4, 1 * SIZE(AO4) + + LFDU b1, 1 * SIZE(X1) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO1, PREA +#endif + + FMADD y05, a5, b2, y05 + LFDU a5, 1 * SIZE(AO1) + FMADD y06, a6, b2, y06 + LFDU a6, 1 * SIZE(AO2) + FMADD y07, a7, b2, y07 + LFDU a7, 1 * SIZE(AO3) + FMADD y08, a8, b2, y08 + LFDU a8, 1 * SIZE(AO4) + + LFDU b2, 1 * SIZE(X1) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO2, PREA +#endif + + FMADD y01, a1, b3, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFDU a2, 1 * SIZE(AO2) + FMADD y03, a3, b3, y03 + LFDU a3, 1 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFDU a4, 1 * SIZE(AO4) + + LFDU b3, 1 * SIZE(X1) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO3, PREA +#endif + + FMADD y05, a5, b4, y05 + LFDU a5, 1 * SIZE(AO1) + FMADD y06, a6, b4, y06 + LFDU a6, 1 * SIZE(AO2) + FMADD y07, a7, b4, y07 + LFDU a7, 1 * SIZE(AO3) + FMADD y08, a8, b4, y08 + LFDU a8, 1 * SIZE(AO4) + + LFDU b4, 1 * SIZE(X1) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO4, PREA +#endif + + bdnz LL(22) + .align 4 + +LL(23): + FMADD y01, a1, b1, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFDU a2, 1 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFDU a3, 1 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFDU a4, 1 * SIZE(AO4) + + LFDU b1, 1 * SIZE(X1) + + FMADD y05, a5, b2, y05 + LFDU a5, 1 * SIZE(AO1) + FMADD y06, a6, b2, y06 + LFDU a6, 1 * SIZE(AO2) + FMADD y07, a7, b2, y07 + LFDU a7, 1 * SIZE(AO3) + FMADD y08, a8, b2, y08 + LFDU a8, 1 * SIZE(AO4) + + LFDU b2, 1 * SIZE(X1) + + FMADD y01, a1, b3, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFDU a2, 1 * SIZE(AO2) + FMADD y03, a3, b3, y03 + LFDU a3, 1 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFDU a4, 1 * SIZE(AO4) + + LFDU b3, 1 * SIZE(X1) + + FMADD y05, a5, b4, y05 + LFDU a5, 1 * SIZE(AO1) + FMADD y06, a6, b4, y06 + LFDU a6, 1 * SIZE(AO2) + FMADD y07, a7, b4, y07 + LFDU a7, 1 * SIZE(AO3) + FMADD y08, a8, b4, y08 + LFDU a8, 1 * SIZE(AO4) + + LFDU b4, 1 * SIZE(X1) + + FMADD y01, a1, b1, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFDU a2, 1 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFDU a3, 1 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFDU a4, 1 * SIZE(AO4) + + FMADD y05, a5, b2, y05 + LFDU a5, 1 * SIZE(AO1) + FMADD y06, a6, b2, y06 + LFDU a6, 1 * SIZE(AO2) + FMADD y07, a7, b2, y07 + LFDU a7, 1 * SIZE(AO3) + FMADD y08, a8, b2, y08 + LFDU a8, 1 * SIZE(AO4) + + FMADD y01, a1, b3, y01 + FMADD y02, a2, b3, y02 + FMADD y03, a3, b3, y03 + FMADD y04, a4, b3, y04 + + FMADD y05, a5, b4, y05 + FMADD y06, a6, b4, y06 + FMADD y07, a7, b4, y07 + FMADD y08, a8, b4, y08 + .align 4 + +LL(24): + andi. r0, M, 7 + ble LL(28) + + andi. r0, M, 4 + ble LL(26) + + LFDU a1, 1 * SIZE(AO1) + LFDU a2, 1 * SIZE(AO2) + LFDU b1, 1 * SIZE(X1) + LFDU a3, 1 * SIZE(AO3) + LFDU a4, 1 * SIZE(AO4) + LFDU b2, 1 * SIZE(X1) + + FMADD y01, a1, b1, y01 + LFDU a5, 1 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFDU a6, 1 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFDU a7, 1 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFDU a8, 1 * SIZE(AO4) + + LFDU b3, 1 * SIZE(X1) + + FMADD y05, a5, b2, y05 + LFDU a1, 1 * SIZE(AO1) + FMADD y06, a6, b2, y06 + LFDU a2, 1 * SIZE(AO2) + FMADD y07, a7, b2, y07 + LFDU a3, 1 * SIZE(AO3) + FMADD y08, a8, b2, y08 + LFDU a4, 1 * SIZE(AO4) + + LFDU b4, 1 * SIZE(X1) + + FMADD y01, a1, b3, y01 + LFDU a5, 1 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFDU a6, 1 * SIZE(AO2) + FMADD y03, a3, b3, y03 + LFDU a7, 1 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFDU a8, 1 * SIZE(AO4) + + FMADD y05, a5, b4, y05 + FMADD y06, a6, b4, y06 + FMADD y07, a7, b4, y07 + FMADD y08, a8, b4, y08 + .align 4 + +LL(26): + andi. r0, M, 2 + ble LL(27) + + LFDU b1, 1 * SIZE(X1) + LFDU a1, 1 * SIZE(AO1) + LFDU a2, 1 * SIZE(AO2) + LFDU a3, 1 * SIZE(AO3) + LFDU a4, 1 * SIZE(AO4) + LFDU b2, 1 * SIZE(X1) + + FMADD y01, a1, b1, y01 + LFDU a5, 1 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFDU a6, 1 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFDU a7, 1 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFDU a8, 1 * SIZE(AO4) + + FMADD y05, a5, b2, y05 + FMADD y06, a6, b2, y06 + FMADD y07, a7, b2, y07 + FMADD y08, a8, b2, y08 + .align 4 + +LL(27): + andi. r0, M, 1 + ble LL(28) + + LFDU a1, 1 * SIZE(AO1) + LFDU b1, 1 * SIZE(X1) + + LFDU a2, 1 * SIZE(AO2) + LFDU a3, 1 * SIZE(AO3) + LFDU a4, 1 * SIZE(AO4) + + FMADD y01, a1, b1, y01 + FMADD y02, a2, b1, y02 + FMADD y03, a3, b1, y03 + FMADD y04, a4, b1, y04 + .align 4 + +LL(28): + lfd alpha, ALPHA + + LFDUX a1, Y, INCY + LFDUX a2, Y, INCY + LFDUX a3, Y, INCY + LFDUX a4, Y, INCY + + FADD y01, y05, y01 + FADD y02, y06, y02 + FADD y03, y07, y03 + FADD y04, y08, y04 + + FMADD a1, alpha, f0, a1 + FMADD a2, alpha, f1, a2 + FMADD a3, alpha, f2, a3 + FMADD a4, alpha, f3, a4 + + STFDUX a1, YY, INCY + addi J, J, -1 + STFDUX a2, YY, INCY + cmpi cr0, 0, J, 0 + STFDUX a3, YY, INCY + STFDUX a4, YY, INCY + bgt LL(21) + .align 4 + +LL(30): + andi. J, N, 2 + ble LL(40) + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + mr X1, XP + + lfd y01, FZERO + fmr y02, y01 + fmr y03, y01 + fmr y04, y01 + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(34) + + LFDU a1, 1 * SIZE(AO1) + LFDU a2, 1 * SIZE(AO2) + LFDU b1, 1 * SIZE(X1) + LFDU b2, 1 * SIZE(X1) + + LFDU a5, 1 * SIZE(AO1) + LFDU a6, 1 * SIZE(AO2) + LFDU b3, 1 * SIZE(X1) + LFDU b4, 1 * SIZE(X1) + bdz LL(33) + .align 4 + +LL(32): +#ifdef PPCG4 + dcbt X1, PREA +#endif + + FMADD y01, a1, b1, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFDU a2, 1 * SIZE(AO2) + + LFDU b1, 1 * SIZE(X1) +#ifdef PPCG4 + dcbt AO1, PREA +#endif + + FMADD y03, a5, b2, y03 + LFDU a5, 1 * SIZE(AO1) + FMADD y04, a6, b2, y04 + LFDU a6, 1 * SIZE(AO2) + + LFDU b2, 1 * SIZE(X1) + + FMADD y01, a1, b3, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFDU a2, 1 * SIZE(AO2) + + LFDU b3, 1 * SIZE(X1) +#ifdef PPCG4 + dcbt AO2, PREA +#endif + + FMADD y03, a5, b4, y03 + LFDU a5, 1 * SIZE(AO1) + FMADD y04, a6, b4, y04 + LFDU a6, 1 * SIZE(AO2) + + LFDU b4, 1 * SIZE(X1) + + FMADD y01, a1, b1, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFDU a2, 1 * SIZE(AO2) + +#if defined(PPCG4) && defined(DOUBLE) + dcbt X1, PREA +#endif + LFDU b1, 1 * SIZE(X1) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO1, PREA +#endif + + FMADD y03, a5, b2, y03 + LFDU a5, 1 * SIZE(AO1) + FMADD y04, a6, b2, y04 + LFDU a6, 1 * SIZE(AO2) + + LFDU b2, 1 * SIZE(X1) + + FMADD y01, a1, b3, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFDU a2, 1 * SIZE(AO2) + + LFDU b3, 1 * SIZE(X1) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO2, PREA +#endif + + FMADD y03, a5, b4, y03 + LFDU a5, 1 * SIZE(AO1) + FMADD y04, a6, b4, y04 + LFDU a6, 1 * SIZE(AO2) + + LFDU b4, 1 * SIZE(X1) + bdnz LL(32) + .align 4 + +LL(33): + FMADD y01, a1, b1, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFDU a2, 1 * SIZE(AO2) + + LFDU b1, 1 * SIZE(X1) + + FMADD y03, a5, b2, y03 + LFDU a5, 1 * SIZE(AO1) + FMADD y04, a6, b2, y04 + LFDU a6, 1 * SIZE(AO2) + + LFDU b2, 1 * SIZE(X1) + + FMADD y01, a1, b3, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFDU a2, 1 * SIZE(AO2) + + LFDU b3, 1 * SIZE(X1) + + FMADD y03, a5, b4, y03 + LFDU a5, 1 * SIZE(AO1) + FMADD y04, a6, b4, y04 + LFDU a6, 1 * SIZE(AO2) + + LFDU b4, 1 * SIZE(X1) + + FMADD y01, a1, b1, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFDU a2, 1 * SIZE(AO2) + + FMADD y03, a5, b2, y03 + LFDU a5, 1 * SIZE(AO1) + FMADD y04, a6, b2, y04 + LFDU a6, 1 * SIZE(AO2) + + FMADD y01, a1, b3, y01 + FMADD y02, a2, b3, y02 + + FMADD y03, a5, b4, y03 + FMADD y04, a6, b4, y04 + .align 4 + +LL(34): + andi. r0, M, 7 + ble LL(38) + + andi. r0, M, 4 + ble LL(36) + + LFDU a1, 1 * SIZE(AO1) + LFDU a2, 1 * SIZE(AO2) + LFDU b1, 1 * SIZE(X1) + + LFDU b2, 1 * SIZE(X1) + FMADD y01, a1, b1, y01 + LFDU a5, 1 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFDU a6, 1 * SIZE(AO2) + + LFDU b3, 1 * SIZE(X1) + FMADD y03, a5, b2, y03 + LFDU a1, 1 * SIZE(AO1) + FMADD y04, a6, b2, y04 + LFDU a2, 1 * SIZE(AO2) + + LFDU b4, 1 * SIZE(X1) + FMADD y01, a1, b3, y01 + LFDU a5, 1 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFDU a6, 1 * SIZE(AO2) + + FMADD y03, a5, b4, y03 + FMADD y04, a6, b4, y04 + .align 4 + +LL(36): + andi. r0, M, 2 + ble LL(37) + + LFDU b1, 1 * SIZE(X1) + LFDU a1, 1 * SIZE(AO1) + LFDU a2, 1 * SIZE(AO2) + LFDU b2, 1 * SIZE(X1) + LFDU a3, 1 * SIZE(AO1) + LFDU a4, 1 * SIZE(AO2) + + FMADD y01, a1, b1, y01 + FMADD y02, a2, b1, y02 + FMADD y03, a3, b2, y03 + FMADD y04, a4, b2, y04 + .align 4 + +LL(37): + andi. r0, M, 1 + ble LL(38) + + LFDU a1, 1 * SIZE(AO1) + LFDU b1, 1 * SIZE(X1) + LFDU a2, 1 * SIZE(AO2) + + FMADD y01, a1, b1, y01 + FMADD y02, a2, b1, y02 + .align 4 + +LL(38): + lfd alpha, ALPHA + + LFDUX a1, Y, INCY + LFDUX a2, Y, INCY + + FADD y01, y03, y01 + FADD y02, y04, y02 + + FMADD a1, alpha, f0, a1 + FMADD a2, alpha, f1, a2 + + STFDUX a1, YY, INCY + STFDUX a2, YY, INCY + .align 4 + +LL(40): + andi. J, N, 1 + ble LL(999) + + mr AO1, A + add A, A, LDA + + mr X1, XP + + lfd y01, FZERO + fmr y02, y01 + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(44) + + LFDU a1, 1 * SIZE(AO1) + LFDU a2, 1 * SIZE(AO1) + LFDU a3, 1 * SIZE(AO1) + LFDU a4, 1 * SIZE(AO1) + + LFDU b1, 1 * SIZE(X1) + LFDU b2, 1 * SIZE(X1) + LFDU b3, 1 * SIZE(X1) + LFDU b4, 1 * SIZE(X1) + bdz LL(43) + .align 4 + +LL(42): + FMADD y01, a1, b1, y01 + LFDU a1, 1 * SIZE(AO1) + LFDU b1, 1 * SIZE(X1) + +#ifdef PPCG4 + dcbt X1, PREA +#endif + + FMADD y02, a2, b2, y02 + LFDU a2, 1 * SIZE(AO1) + LFDU b2, 1 * SIZE(X1) + +#ifdef PPCG4 + dcbt AO1, PREA +#endif + + FMADD y01, a3, b3, y01 + LFDU a3, 1 * SIZE(AO1) + LFDU b3, 1 * SIZE(X1) + + FMADD y02, a4, b4, y02 + LFDU a4, 1 * SIZE(AO1) + LFDU b4, 1 * SIZE(X1) + + FMADD y01, a1, b1, y01 + LFDU a1, 1 * SIZE(AO1) + LFDU b1, 1 * SIZE(X1) + + FMADD y02, a2, b2, y02 + LFDU a2, 1 * SIZE(AO1) + LFDU b2, 1 * SIZE(X1) + +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO1, PREA +#endif + + FMADD y01, a3, b3, y01 + LFDU a3, 1 * SIZE(AO1) + LFDU b3, 1 * SIZE(X1) + +#if defined(PPCG4) && defined(DOUBLE) + dcbt X1, PREA +#endif + + FMADD y02, a4, b4, y02 + LFDU a4, 1 * SIZE(AO1) + LFDU b4, 1 * SIZE(X1) + + bdnz LL(42) + .align 4 + +LL(43): + FMADD y01, a1, b1, y01 + LFDU a1, 1 * SIZE(AO1) + LFDU b1, 1 * SIZE(X1) + + FMADD y02, a2, b2, y02 + LFDU a2, 1 * SIZE(AO1) + LFDU b2, 1 * SIZE(X1) + + FMADD y01, a3, b3, y01 + LFDU a3, 1 * SIZE(AO1) + LFDU b3, 1 * SIZE(X1) + + FMADD y02, a4, b4, y02 + LFDU a4, 1 * SIZE(AO1) + LFDU b4, 1 * SIZE(X1) + + FMADD y01, a1, b1, y01 + FMADD y02, a2, b2, y02 + FMADD y01, a3, b3, y01 + FMADD y02, a4, b4, y02 + .align 4 + +LL(44): + andi. r0, M, 7 + ble LL(48) + + andi. r0, M, 4 + ble LL(46) + + LFDU a1, 1 * SIZE(AO1) + LFDU b1, 1 * SIZE(X1) + LFDU a2, 1 * SIZE(AO1) + LFDU b2, 1 * SIZE(X1) + + FMADD y01, a1, b1, y01 + LFDU a3, 1 * SIZE(AO1) + LFDU b3, 1 * SIZE(X1) + + FMADD y02, a2, b2, y02 + LFDU a4, 1 * SIZE(AO1) + LFDU b4, 1 * SIZE(X1) + + FMADD y01, a3, b3, y01 + FMADD y02, a4, b4, y02 + .align 4 + +LL(46): + andi. r0, M, 2 + ble LL(47) + + LFDU b1, 1 * SIZE(X1) + LFDU a1, 1 * SIZE(AO1) + LFDU b2, 1 * SIZE(X1) + LFDU a2, 1 * SIZE(AO1) + + FMADD y01, a1, b1, y01 + FMADD y02, a2, b2, y02 + .align 4 + +LL(47): + andi. r0, M, 1 + ble LL(48) + + LFDU a1, 1 * SIZE(AO1) + LFDU b1, 1 * SIZE(X1) + + FMADD y01, a1, b1, y01 + .align 4 + +LL(48): + lfd alpha, ALPHA + + LFDUX a1, Y, INCY + + FADD y01, y02, y01 + + FMADD a1, alpha, f0, a1 + + STFDUX a1, YY, INCY + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + +#ifdef __64BIT__ + ld r14, 160(SP) + ld r15, 168(SP) + ld r16, 176(SP) + ld r17, 184(SP) + ld r18, 192(SP) + ld r19, 200(SP) + ld r20, 208(SP) + ld r21, 216(SP) + ld r22, 224(SP) +#else + lwz r14, 160(SP) + lwz r15, 164(SP) + lwz r16, 168(SP) + lwz r17, 172(SP) + lwz r18, 176(SP) + lwz r19, 180(SP) + lwz r20, 184(SP) + lwz r21, 188(SP) + lwz r22, 192(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE + +#endif diff --git a/kernel/power/ger.S b/kernel/power/ger.S new file mode 100644 index 0000000..0068569 --- /dev/null +++ b/kernel/power/ger.S @@ -0,0 +1,1209 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef NEEDPARAM +#ifndef DOUBLE +#include "sparam.h" +#else +#include "dparam.h" +#endif +#endif + +#ifdef linux +#ifndef __64BIT__ +#define M r3 +#define N r4 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 +#define A r10 +#define LDA r5 +#else +#define M r3 +#define N r4 +#define X r7 +#define INCX r8 +#define Y r9 +#define INCY r10 +#define A r5 +#define LDA r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define M r3 +#define N r4 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#define A r6 +#define LDA r7 +#else +#define M r3 +#define N r4 +#define X r7 +#define INCX r8 +#define Y r9 +#define INCY r10 +#define A r5 +#define LDA r6 +#endif +#endif + +#define I r11 +#define J r12 + +#define AO1 r14 +#define AO2 r15 +#define AO3 r16 +#define AO4 r17 +#define AO5 r18 +#define AO6 r19 +#define AO7 r20 +#define AO8 r21 + +#define X1 r22 +#define PREA r23 +#define PREC r24 +#define XX r25 +#define BUFFER r26 + +#define y01 f0 +#define y02 f1 +#define y03 f2 +#define y04 f3 +#define y05 f4 +#define y06 f5 +#define y07 f6 +#define y08 f7 + +#define alpha1 f8 +#define alpha2 f9 + +#define a1 f12 +#define a2 f13 +#define a3 f14 +#define a4 f15 +#define a5 f16 +#define a6 f17 +#define a7 f18 +#define a8 f19 +#define a9 f20 +#define a10 f21 +#define a11 f22 +#define a12 f23 +#define a13 f24 +#define a14 f25 +#define a15 f26 +#define a16 f27 + +#define alpha f31 + +#if defined(PPC440) || defined(PPC440FP2) +#define PREFETCHSIZE_A 24 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef PPC970 +#define PREFETCHSIZE_A 16 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef POWER4 +#define PREFETCHSIZE_A 16 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef POWER5 +#define PREFETCHSIZE_A 16 +#define PREFETCHSIZE_C 16 +#endif + +#ifndef NEEDPARAM + +#ifndef __64BIT__ +#define STACKSIZE 224 +#else +#define STACKSIZE 280 +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r14, 144(SP) + std r15, 152(SP) + std r16, 160(SP) + std r17, 168(SP) + std r18, 176(SP) + std r19, 184(SP) + std r20, 192(SP) + std r21, 200(SP) + std r22, 208(SP) + std r23, 216(SP) + std r24, 224(SP) + std r25, 232(SP) + std r26, 240(SP) + std r27, 248(SP) +#else + stw r14, 144(SP) + stw r15, 148(SP) + stw r16, 152(SP) + stw r17, 156(SP) + stw r18, 160(SP) + stw r19, 164(SP) + stw r20, 168(SP) + stw r21, 172(SP) + stw r22, 176(SP) + stw r23, 180(SP) + stw r24, 184(SP) + stw r25, 188(SP) + stw r26, 192(SP) + stw r27, 196(SP) +#endif + +#ifdef linux +#ifndef __64BIT__ + lwz LDA, 8 + STACKSIZE(SP) + lwz BUFFER, 12 + STACKSIZE(SP) +#else + ld A, 112 + STACKSIZE(SP) + ld LDA, 120 + STACKSIZE(SP) + ld BUFFER, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifndef __64BIT__ +#ifdef DOUBLE + lwz INCY, 56 + STACKSIZE(SP) + lwz A, 60 + STACKSIZE(SP) + lwz LDA, 64 + STACKSIZE(SP) + lwz BUFFER, 68 + STACKSIZE(SP) +#else + lwz A, 56 + STACKSIZE(SP) + lwz LDA, 60 + STACKSIZE(SP) + lwz BUFFER, 64 + STACKSIZE(SP) +#endif +#else + ld A, 112 + STACKSIZE(SP) + ld LDA, 120 + STACKSIZE(SP) + ld BUFFER, 128 + STACKSIZE(SP) +#endif +#endif + + fmr alpha, f1 + + slwi LDA, LDA, BASE_SHIFT + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + li PREA, PREFETCHSIZE_A * SIZE + li PREC, PREFETCHSIZE_C * SIZE + + cmpwi cr0, M, 0 + ble- LL(999) + + cmpwi cr0, N, 0 + ble- LL(999) + + mr XX, X + + cmpi cr0, 0, INCX, SIZE + beq LL(10) + + mr XX, BUFFER + mr X1, BUFFER + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(05) + .align 4 + +LL(01): + LFD a1, 0 * SIZE(X) + add X, X, INCX + LFD a2, 0 * SIZE(X) + add X, X, INCX + LFD a3, 0 * SIZE(X) + add X, X, INCX + LFD a4, 0 * SIZE(X) + add X, X, INCX + LFD a5, 0 * SIZE(X) + add X, X, INCX + LFD a6, 0 * SIZE(X) + add X, X, INCX + LFD a7, 0 * SIZE(X) + add X, X, INCX + LFD a8, 0 * SIZE(X) + add X, X, INCX + + STFD a1, 0 * SIZE(X1) + STFD a2, 1 * SIZE(X1) + STFD a3, 2 * SIZE(X1) + STFD a4, 3 * SIZE(X1) + STFD a5, 4 * SIZE(X1) + STFD a6, 5 * SIZE(X1) + STFD a7, 6 * SIZE(X1) + STFD a8, 7 * SIZE(X1) + + addi X1, X1, 8 * SIZE + bdnz+ LL(01) + .align 4 + +LL(05): + andi. r0, M, 7 + mtspr CTR, r0 + ble LL(10) + .align 4 + +LL(06): + LFD a1, 0 * SIZE(X) + add X, X, INCX + STFD a1, 0 * SIZE(X1) + addi X1, X1, SIZE + bdnz+ LL(06) + .align 4 + +LL(10): + srawi. J, N, 1 + ble LL(20) + .align 4 + +LL(11): + LFD alpha1, 0 * SIZE(Y) + add Y, Y, INCY + LFD alpha2, 0 * SIZE(Y) + add Y, Y, INCY + + FMUL alpha1, alpha, alpha1 + FMUL alpha2, alpha, alpha2 + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + mr X1, XX + + srawi. r0, M, 4 + mtspr CTR, r0 + ble LL(15) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + LFD y03, 2 * SIZE(X1) + LFD y04, 3 * SIZE(X1) + + LFD y05, 4 * SIZE(X1) + LFD y06, 5 * SIZE(X1) + LFD y07, 6 * SIZE(X1) + LFD y08, 7 * SIZE(X1) + + LFD a9, 0 * SIZE(AO2) + LFD a10, 1 * SIZE(AO2) + LFD a11, 2 * SIZE(AO2) + LFD a12, 3 * SIZE(AO2) + + LFD a13, 4 * SIZE(AO2) + LFD a14, 5 * SIZE(AO2) + LFD a15, 6 * SIZE(AO2) + LFD a16, 7 * SIZE(AO2) + bdz LL(13) + .align 4 + +LL(12): + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha1, y02, a2 + FMADD a3, alpha1, y03, a3 + FMADD a4, alpha1, y04, a4 + + FMADD a5, alpha1, y05, a5 + FMADD a6, alpha1, y06, a6 + FMADD a7, alpha1, y07, a7 + FMADD a8, alpha1, y08, a8 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + + LFD a1, 8 * SIZE(AO1) + LFD a2, 9 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + + STFD a5, 4 * SIZE(AO1) + STFD a6, 5 * SIZE(AO1) + STFD a7, 6 * SIZE(AO1) + STFD a8, 7 * SIZE(AO1) + + LFD a5, 12 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + FMADD a9, alpha2, y01, a9 + FMADD a10, alpha2, y02, a10 + FMADD a11, alpha2, y03, a11 + FMADD a12, alpha2, y04, a12 + + LFD y01, 8 * SIZE(X1) + LFD y02, 9 * SIZE(X1) + LFD y03, 10 * SIZE(X1) + LFD y04, 11 * SIZE(X1) + + FMADD a13, alpha2, y05, a13 + FMADD a14, alpha2, y06, a14 + FMADD a15, alpha2, y07, a15 + FMADD a16, alpha2, y08, a16 + + LFD y05, 12 * SIZE(X1) + LFD y06, 13 * SIZE(X1) + LFD y07, 14 * SIZE(X1) + LFD y08, 15 * SIZE(X1) + + STFD a9, 0 * SIZE(AO2) + STFD a10, 1 * SIZE(AO2) + STFD a11, 2 * SIZE(AO2) + STFD a12, 3 * SIZE(AO2) + + LFD a9, 8 * SIZE(AO2) + LFD a10, 9 * SIZE(AO2) + LFD a11, 10 * SIZE(AO2) + LFD a12, 11 * SIZE(AO2) + + STFD a13, 4 * SIZE(AO2) + STFD a14, 5 * SIZE(AO2) + STFD a15, 6 * SIZE(AO2) + STFD a16, 7 * SIZE(AO2) + + LFD a13, 12 * SIZE(AO2) + LFD a14, 13 * SIZE(AO2) + LFD a15, 14 * SIZE(AO2) + LFD a16, 15 * SIZE(AO2) + + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha1, y02, a2 + FMADD a3, alpha1, y03, a3 + FMADD a4, alpha1, y04, a4 + + FMADD a5, alpha1, y05, a5 + FMADD a6, alpha1, y06, a6 + FMADD a7, alpha1, y07, a7 + FMADD a8, alpha1, y08, a8 + + STFD a1, 8 * SIZE(AO1) + STFD a2, 9 * SIZE(AO1) + STFD a3, 10 * SIZE(AO1) + STFD a4, 11 * SIZE(AO1) + + LFD a1, 16 * SIZE(AO1) + LFD a2, 17 * SIZE(AO1) + LFD a3, 18 * SIZE(AO1) + LFD a4, 19 * SIZE(AO1) + + STFD a5, 12 * SIZE(AO1) + STFD a6, 13 * SIZE(AO1) + STFD a7, 14 * SIZE(AO1) + STFD a8, 15 * SIZE(AO1) + + LFD a5, 20 * SIZE(AO1) + LFD a6, 21 * SIZE(AO1) + LFD a7, 22 * SIZE(AO1) + LFD a8, 23 * SIZE(AO1) + + FMADD a9, alpha2, y01, a9 + FMADD a10, alpha2, y02, a10 + FMADD a11, alpha2, y03, a11 + FMADD a12, alpha2, y04, a12 + + LFD y01, 16 * SIZE(X1) + LFD y02, 17 * SIZE(X1) + LFD y03, 18 * SIZE(X1) + LFD y04, 19 * SIZE(X1) + + FMADD a13, alpha2, y05, a13 + FMADD a14, alpha2, y06, a14 + FMADD a15, alpha2, y07, a15 + FMADD a16, alpha2, y08, a16 + + LFD y05, 20 * SIZE(X1) + LFD y06, 21 * SIZE(X1) + LFD y07, 22 * SIZE(X1) + LFD y08, 23 * SIZE(X1) + + STFD a9, 8 * SIZE(AO2) + STFD a10, 9 * SIZE(AO2) + STFD a11, 10 * SIZE(AO2) + STFD a12, 11 * SIZE(AO2) + + LFD a9, 16 * SIZE(AO2) + LFD a10, 17 * SIZE(AO2) + LFD a11, 18 * SIZE(AO2) + LFD a12, 19 * SIZE(AO2) + + STFD a13, 12 * SIZE(AO2) + STFD a14, 13 * SIZE(AO2) + STFD a15, 14 * SIZE(AO2) + STFD a16, 15 * SIZE(AO2) + + LFD a13, 20 * SIZE(AO2) + LFD a14, 21 * SIZE(AO2) + LFD a15, 22 * SIZE(AO2) + LFD a16, 23 * SIZE(AO2) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi X1, X1, 16 * SIZE + + DCBT(AO1, PREA) + DCBT(AO2, PREA) + DCBT(Y1, PREY) + + bdnz+ LL(12) + .align 4 + +LL(13): + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha1, y02, a2 + FMADD a3, alpha1, y03, a3 + FMADD a4, alpha1, y04, a4 + + FMADD a5, alpha1, y05, a5 + FMADD a6, alpha1, y06, a6 + FMADD a7, alpha1, y07, a7 + FMADD a8, alpha1, y08, a8 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + + LFD a1, 8 * SIZE(AO1) + LFD a2, 9 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + + STFD a5, 4 * SIZE(AO1) + STFD a6, 5 * SIZE(AO1) + STFD a7, 6 * SIZE(AO1) + STFD a8, 7 * SIZE(AO1) + + LFD a5, 12 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + FMADD a9, alpha2, y01, a9 + FMADD a10, alpha2, y02, a10 + FMADD a11, alpha2, y03, a11 + FMADD a12, alpha2, y04, a12 + + LFD y01, 8 * SIZE(X1) + LFD y02, 9 * SIZE(X1) + LFD y03, 10 * SIZE(X1) + LFD y04, 11 * SIZE(X1) + + FMADD a13, alpha2, y05, a13 + FMADD a14, alpha2, y06, a14 + FMADD a15, alpha2, y07, a15 + FMADD a16, alpha2, y08, a16 + + LFD y05, 12 * SIZE(X1) + LFD y06, 13 * SIZE(X1) + LFD y07, 14 * SIZE(X1) + LFD y08, 15 * SIZE(X1) + + STFD a9, 0 * SIZE(AO2) + STFD a10, 1 * SIZE(AO2) + STFD a11, 2 * SIZE(AO2) + STFD a12, 3 * SIZE(AO2) + + LFD a9, 8 * SIZE(AO2) + LFD a10, 9 * SIZE(AO2) + LFD a11, 10 * SIZE(AO2) + LFD a12, 11 * SIZE(AO2) + + STFD a13, 4 * SIZE(AO2) + STFD a14, 5 * SIZE(AO2) + STFD a15, 6 * SIZE(AO2) + STFD a16, 7 * SIZE(AO2) + + LFD a13, 12 * SIZE(AO2) + LFD a14, 13 * SIZE(AO2) + LFD a15, 14 * SIZE(AO2) + LFD a16, 15 * SIZE(AO2) + + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha1, y02, a2 + FMADD a3, alpha1, y03, a3 + FMADD a4, alpha1, y04, a4 + + FMADD a5, alpha1, y05, a5 + FMADD a6, alpha1, y06, a6 + FMADD a7, alpha1, y07, a7 + FMADD a8, alpha1, y08, a8 + + STFD a1, 8 * SIZE(AO1) + STFD a2, 9 * SIZE(AO1) + STFD a3, 10 * SIZE(AO1) + STFD a4, 11 * SIZE(AO1) + + LFD a1, 16 * SIZE(AO1) + LFD a2, 17 * SIZE(AO1) + LFD a3, 18 * SIZE(AO1) + LFD a4, 19 * SIZE(AO1) + + STFD a5, 12 * SIZE(AO1) + STFD a6, 13 * SIZE(AO1) + STFD a7, 14 * SIZE(AO1) + STFD a8, 15 * SIZE(AO1) + + LFD a5, 20 * SIZE(AO1) + LFD a6, 21 * SIZE(AO1) + LFD a7, 22 * SIZE(AO1) + LFD a8, 23 * SIZE(AO1) + + FMADD a9, alpha2, y01, a9 + FMADD a10, alpha2, y02, a10 + FMADD a11, alpha2, y03, a11 + FMADD a12, alpha2, y04, a12 + + FMADD a13, alpha2, y05, a13 + FMADD a14, alpha2, y06, a14 + FMADD a15, alpha2, y07, a15 + FMADD a16, alpha2, y08, a16 + + STFD a9, 8 * SIZE(AO2) + STFD a10, 9 * SIZE(AO2) + STFD a11, 10 * SIZE(AO2) + STFD a12, 11 * SIZE(AO2) + + STFD a13, 12 * SIZE(AO2) + STFD a14, 13 * SIZE(AO2) + STFD a15, 14 * SIZE(AO2) + STFD a16, 15 * SIZE(AO2) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi X1, X1, 16 * SIZE + .align 4 + + +LL(15): + andi. r0, M, 15 + ble LL(19) + + andi. r0, M, 8 + ble LL(16) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + LFD y03, 2 * SIZE(X1) + LFD y04, 3 * SIZE(X1) + LFD y05, 4 * SIZE(X1) + LFD y06, 5 * SIZE(X1) + LFD y07, 6 * SIZE(X1) + LFD y08, 7 * SIZE(X1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + LFD a9, 0 * SIZE(AO2) + LFD a10, 1 * SIZE(AO2) + LFD a11, 2 * SIZE(AO2) + LFD a12, 3 * SIZE(AO2) + LFD a13, 4 * SIZE(AO2) + LFD a14, 5 * SIZE(AO2) + LFD a15, 6 * SIZE(AO2) + LFD a16, 7 * SIZE(AO2) + + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha1, y02, a2 + FMADD a3, alpha1, y03, a3 + FMADD a4, alpha1, y04, a4 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + + FMADD a5, alpha1, y05, a5 + FMADD a6, alpha1, y06, a6 + FMADD a7, alpha1, y07, a7 + FMADD a8, alpha1, y08, a8 + + STFD a5, 4 * SIZE(AO1) + STFD a6, 5 * SIZE(AO1) + STFD a7, 6 * SIZE(AO1) + STFD a8, 7 * SIZE(AO1) + + FMADD a9, alpha2, y01, a9 + FMADD a10, alpha2, y02, a10 + FMADD a11, alpha2, y03, a11 + FMADD a12, alpha2, y04, a12 + + STFD a9, 0 * SIZE(AO2) + STFD a10, 1 * SIZE(AO2) + STFD a11, 2 * SIZE(AO2) + STFD a12, 3 * SIZE(AO2) + + FMADD a13, alpha2, y05, a13 + FMADD a14, alpha2, y06, a14 + FMADD a15, alpha2, y07, a15 + FMADD a16, alpha2, y08, a16 + + STFD a13, 4 * SIZE(AO2) + STFD a14, 5 * SIZE(AO2) + STFD a15, 6 * SIZE(AO2) + STFD a16, 7 * SIZE(AO2) + + addi AO1, AO1, 8 * SIZE + addi AO2, AO2, 8 * SIZE + addi X1, X1, 8 * SIZE + .align 4 + +LL(16): + andi. r0, M, 4 + ble LL(17) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + LFD y03, 2 * SIZE(X1) + LFD y04, 3 * SIZE(X1) + + LFD a5, 0 * SIZE(AO2) + LFD a6, 1 * SIZE(AO2) + LFD a7, 2 * SIZE(AO2) + LFD a8, 3 * SIZE(AO2) + + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha1, y02, a2 + FMADD a3, alpha1, y03, a3 + FMADD a4, alpha1, y04, a4 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + + FMADD a5, alpha2, y01, a5 + FMADD a6, alpha2, y02, a6 + FMADD a7, alpha2, y03, a7 + FMADD a8, alpha2, y04, a8 + + STFD a5, 0 * SIZE(AO2) + STFD a6, 1 * SIZE(AO2) + STFD a7, 2 * SIZE(AO2) + STFD a8, 3 * SIZE(AO2) + + addi AO1, AO1, 4 * SIZE + addi AO2, AO2, 4 * SIZE + addi X1, X1, 4 * SIZE + .align 4 + +LL(17): + andi. r0, M, 2 + ble LL(18) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 0 * SIZE(AO2) + LFD a4, 1 * SIZE(AO2) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha1, y02, a2 + FMADD a3, alpha2, y01, a3 + FMADD a4, alpha2, y02, a4 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 0 * SIZE(AO2) + STFD a4, 1 * SIZE(AO2) + + addi AO1, AO1, 2 * SIZE + addi AO2, AO2, 2 * SIZE + + addi X1, X1, 2 * SIZE + .align 4 + +LL(18): + andi. r0, M, 1 + ble LL(19) + + LFD y01, 0 * SIZE(X1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 0 * SIZE(AO2) + + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha2, y01, a2 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 0 * SIZE(AO2) + .align 4 + +LL(19): + addi J, J, -1 + cmpi cr0, 0, J, 0 + bgt LL(11) + .align 4 + +LL(20): + andi. J, N, 1 + ble LL(999) + .align 4 + +LL(21): + LFD alpha1, 0 * SIZE(Y) + FMUL alpha1, alpha, alpha1 + + mr AO1, A + mr X1, XX + + srawi. r0, M, 4 + mtspr CTR, r0 + ble LL(25) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + LFD y03, 2 * SIZE(X1) + LFD y04, 3 * SIZE(X1) + + LFD y05, 4 * SIZE(X1) + LFD y06, 5 * SIZE(X1) + LFD y07, 6 * SIZE(X1) + LFD y08, 7 * SIZE(X1) + + bdz LL(23) + .align 4 + +LL(22): + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha1, y02, a2 + FMADD a3, alpha1, y03, a3 + FMADD a4, alpha1, y04, a4 + + FMADD a5, alpha1, y05, a5 + FMADD a6, alpha1, y06, a6 + FMADD a7, alpha1, y07, a7 + FMADD a8, alpha1, y08, a8 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + + LFD a1, 8 * SIZE(AO1) + LFD a2, 9 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + + STFD a5, 4 * SIZE(AO1) + STFD a6, 5 * SIZE(AO1) + STFD a7, 6 * SIZE(AO1) + STFD a8, 7 * SIZE(AO1) + + LFD a5, 12 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + LFD y01, 8 * SIZE(X1) + LFD y02, 9 * SIZE(X1) + LFD y03, 10 * SIZE(X1) + LFD y04, 11 * SIZE(X1) + + LFD y05, 12 * SIZE(X1) + LFD y06, 13 * SIZE(X1) + LFD y07, 14 * SIZE(X1) + LFD y08, 15 * SIZE(X1) + + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha1, y02, a2 + FMADD a3, alpha1, y03, a3 + FMADD a4, alpha1, y04, a4 + + FMADD a5, alpha1, y05, a5 + FMADD a6, alpha1, y06, a6 + FMADD a7, alpha1, y07, a7 + FMADD a8, alpha1, y08, a8 + + STFD a1, 8 * SIZE(AO1) + STFD a2, 9 * SIZE(AO1) + STFD a3, 10 * SIZE(AO1) + STFD a4, 11 * SIZE(AO1) + + LFD a1, 16 * SIZE(AO1) + LFD a2, 17 * SIZE(AO1) + LFD a3, 18 * SIZE(AO1) + LFD a4, 19 * SIZE(AO1) + + STFD a5, 12 * SIZE(AO1) + STFD a6, 13 * SIZE(AO1) + STFD a7, 14 * SIZE(AO1) + STFD a8, 15 * SIZE(AO1) + + LFD a5, 20 * SIZE(AO1) + LFD a6, 21 * SIZE(AO1) + LFD a7, 22 * SIZE(AO1) + LFD a8, 23 * SIZE(AO1) + + LFD y01, 16 * SIZE(X1) + LFD y02, 17 * SIZE(X1) + LFD y03, 18 * SIZE(X1) + LFD y04, 19 * SIZE(X1) + + LFD y05, 20 * SIZE(X1) + LFD y06, 21 * SIZE(X1) + LFD y07, 22 * SIZE(X1) + LFD y08, 23 * SIZE(X1) + + addi AO1, AO1, 16 * SIZE + addi X1, X1, 16 * SIZE + + DCBT(AO1, PREA) + DCBT(Y1, PREY) + + bdnz+ LL(22) + .align 4 + +LL(23): + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha1, y02, a2 + FMADD a3, alpha1, y03, a3 + FMADD a4, alpha1, y04, a4 + + FMADD a5, alpha1, y05, a5 + FMADD a6, alpha1, y06, a6 + FMADD a7, alpha1, y07, a7 + FMADD a8, alpha1, y08, a8 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + + LFD a1, 8 * SIZE(AO1) + LFD a2, 9 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + + STFD a5, 4 * SIZE(AO1) + STFD a6, 5 * SIZE(AO1) + STFD a7, 6 * SIZE(AO1) + STFD a8, 7 * SIZE(AO1) + + LFD a5, 12 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + LFD y01, 8 * SIZE(X1) + LFD y02, 9 * SIZE(X1) + LFD y03, 10 * SIZE(X1) + LFD y04, 11 * SIZE(X1) + + LFD y05, 12 * SIZE(X1) + LFD y06, 13 * SIZE(X1) + LFD y07, 14 * SIZE(X1) + LFD y08, 15 * SIZE(X1) + + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha1, y02, a2 + FMADD a3, alpha1, y03, a3 + FMADD a4, alpha1, y04, a4 + + FMADD a5, alpha1, y05, a5 + FMADD a6, alpha1, y06, a6 + FMADD a7, alpha1, y07, a7 + FMADD a8, alpha1, y08, a8 + + STFD a1, 8 * SIZE(AO1) + STFD a2, 9 * SIZE(AO1) + STFD a3, 10 * SIZE(AO1) + STFD a4, 11 * SIZE(AO1) + + LFD a1, 16 * SIZE(AO1) + LFD a2, 17 * SIZE(AO1) + LFD a3, 18 * SIZE(AO1) + LFD a4, 19 * SIZE(AO1) + + STFD a5, 12 * SIZE(AO1) + STFD a6, 13 * SIZE(AO1) + STFD a7, 14 * SIZE(AO1) + STFD a8, 15 * SIZE(AO1) + + LFD a5, 20 * SIZE(AO1) + LFD a6, 21 * SIZE(AO1) + LFD a7, 22 * SIZE(AO1) + LFD a8, 23 * SIZE(AO1) + + addi AO1, AO1, 16 * SIZE + addi X1, X1, 16 * SIZE + .align 4 + +LL(25): + andi. r0, M, 15 + ble LL(999) + + andi. r0, M, 8 + ble LL(26) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + LFD y03, 2 * SIZE(X1) + LFD y04, 3 * SIZE(X1) + LFD y05, 4 * SIZE(X1) + LFD y06, 5 * SIZE(X1) + LFD y07, 6 * SIZE(X1) + LFD y08, 7 * SIZE(X1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha1, y02, a2 + FMADD a3, alpha1, y03, a3 + FMADD a4, alpha1, y04, a4 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + + FMADD a5, alpha1, y05, a5 + FMADD a6, alpha1, y06, a6 + FMADD a7, alpha1, y07, a7 + FMADD a8, alpha1, y08, a8 + + STFD a5, 4 * SIZE(AO1) + STFD a6, 5 * SIZE(AO1) + STFD a7, 6 * SIZE(AO1) + STFD a8, 7 * SIZE(AO1) + + addi AO1, AO1, 8 * SIZE + addi X1, X1, 8 * SIZE + .align 4 + +LL(26): + andi. r0, M, 4 + ble LL(27) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + LFD y03, 2 * SIZE(X1) + LFD y04, 3 * SIZE(X1) + + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha1, y02, a2 + FMADD a3, alpha1, y03, a3 + FMADD a4, alpha1, y04, a4 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + + addi AO1, AO1, 4 * SIZE + addi X1, X1, 4 * SIZE + .align 4 + +LL(27): + andi. r0, M, 2 + ble LL(28) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha1, y02, a2 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + + addi AO1, AO1, 2 * SIZE + addi X1, X1, 2 * SIZE + .align 4 + +LL(28): + andi. r0, M, 1 + ble LL(999) + + LFD y01, 0 * SIZE(X1) + LFD a1, 0 * SIZE(AO1) + + FMADD a1, alpha1, y01, a1 + + STFD a1, 0 * SIZE(AO1) + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r14, 144(SP) + ld r15, 152(SP) + ld r16, 160(SP) + ld r17, 168(SP) + ld r18, 176(SP) + ld r19, 184(SP) + ld r20, 192(SP) + ld r21, 200(SP) + ld r22, 208(SP) + ld r23, 216(SP) + ld r24, 224(SP) + ld r25, 232(SP) + ld r26, 240(SP) + ld r27, 248(SP) +#else + lwz r14, 144(SP) + lwz r15, 148(SP) + lwz r16, 152(SP) + lwz r17, 156(SP) + lwz r18, 160(SP) + lwz r19, 164(SP) + lwz r20, 168(SP) + lwz r21, 172(SP) + lwz r22, 176(SP) + lwz r23, 180(SP) + lwz r24, 184(SP) + lwz r25, 188(SP) + lwz r26, 192(SP) + lwz r27, 196(SP) +#endif + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/iamax.S b/kernel/power/iamax.S new file mode 100644 index 0000000..cdc57fa --- /dev/null +++ b/kernel/power/iamax.S @@ -0,0 +1,802 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RET r3 +#define X r4 +#define INCX r5 + +#define N r6 +#define NN r7 +#define XX r8 +#define PREA r9 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(r3) + LDINT INCX, 0(INCX) +#else + mr N, r3 +#endif + li RET, 0 + mr NN, N + mr XX, X + + slwi INCX, INCX, BASE_SHIFT + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + add X, X, INCX + + fabs f0, f1 + fabs f2, f1 + fabs f3, f1 + fabs f4, f1 + fabs f5, f1 + fabs f6, f1 + fabs f7, f1 + fabs f1, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + + LFD f24, 0 * SIZE(X) + LFD f25, 1 * SIZE(X) + LFD f26, 2 * SIZE(X) + LFD f27, 3 * SIZE(X) + LFD f28, 4 * SIZE(X) + LFD f29, 5 * SIZE(X) + LFD f30, 6 * SIZE(X) + LFD f31, 7 * SIZE(X) + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + bdz LL(20) + .align 4 + +LL(10): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFD f24, 16 * SIZE(X) + LFD f25, 17 * SIZE(X) + LFD f26, 18 * SIZE(X) + LFD f27, 19 * SIZE(X) + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFD f28, 20 * SIZE(X) + LFD f29, 21 * SIZE(X) + LFD f30, 22 * SIZE(X) + LFD f31, 23 * SIZE(X) + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + bdnz LL(10) + .align 4 + +LL(20): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + addi X, X, 1 * SIZE + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdz LL(120) + .align 4 + +LL(110): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f1, f8, f0, f4 + .align 4 + +LL(1000): + cmpwi cr0, INCX, SIZE + bne- cr0, LL(1100) + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- cr0, LL(1050) + + LFD f24, 0 * SIZE(XX) + LFD f25, 1 * SIZE(XX) + LFD f26, 2 * SIZE(XX) + LFD f27, 3 * SIZE(XX) + LFD f28, 4 * SIZE(XX) + LFD f29, 5 * SIZE(XX) + LFD f30, 6 * SIZE(XX) + LFD f31, 7 * SIZE(XX) + bdz LL(1020) + .align 4 + +LL(1010): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 8 * SIZE(XX) + LFD f25, 9 * SIZE(XX) + LFD f26, 10 * SIZE(XX) + LFD f27, 11 * SIZE(XX) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 12 * SIZE(XX) + LFD f29, 13 * SIZE(XX) + LFD f30, 14 * SIZE(XX) + LFD f31, 15 * SIZE(XX) + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + + addi XX, XX, 8 * SIZE + bdnz LL(1010) + .align 4 + +LL(1020): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + addi XX, XX, 8 * SIZE + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + .align 4 + +LL(1050): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1060): + LFD f8, 0 * SIZE(XX) + addi XX, XX, 1 * SIZE + fabs f8, f8 + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1060) + b LL(9999) + .align 4 + +LL(1100): + sub XX, XX, INCX + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- LL(1150) + + LFDUX f24, XX, INCX + LFDUX f25, XX, INCX + LFDUX f26, XX, INCX + LFDUX f27, XX, INCX + LFDUX f28, XX, INCX + LFDUX f29, XX, INCX + LFDUX f30, XX, INCX + LFDUX f31, XX, INCX + bdz LL(1120) + .align 4 + +LL(1110): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDUX f24, XX, INCX + LFDUX f25, XX, INCX + LFDUX f26, XX, INCX + LFDUX f27, XX, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDUX f28, XX, INCX + LFDUX f29, XX, INCX + LFDUX f30, XX, INCX + LFDUX f31, XX, INCX + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + + bdnz LL(1110) + .align 4 + +LL(1120): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + .align 4 + +LL(1150): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1160): + LFDUX f8, XX, INCX + fabs f8, f8 + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1160) + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/iamax_hummer.S b/kernel/power/iamax_hummer.S new file mode 100644 index 0000000..9b23709 --- /dev/null +++ b/kernel/power/iamax_hummer.S @@ -0,0 +1,1015 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define XX r8 +#define RET r9 +#define NN r10 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 + +#define F1 f12 +#define F2 f13 +#define F3 f14 +#define F4 f15 +#define F5 f16 +#define F6 f17 +#define F7 f18 +#define F8 f19 + +#define T1 f20 +#define T2 f21 +#define T3 f22 +#define T4 f23 +#define T5 f24 +#define T6 f25 +#define T7 f26 +#define T8 f27 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + stfpdux f24, SP, r10 + stfpdux f25, SP, r10 + stfpdux f26, SP, r10 + stfpdux f27, SP, r10 + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + li RET, 0 + cmpwi cr0, N, 0 + ble LL(999) + mr NN, N + cmpwi cr0, INCX, 0 + ble LL(999) + + LFD C1, 0 * SIZE(X) + + addi N, N, -1 + cmpwi cr0, N, 0 + li RET, 1 + fabs C1, C1 + ble LL(999) + + fsmfp C1, C1 + mr XX, X + fpmr C2, C1 + add X, X, INCX + fpmr C3, C1 + fpmr C4, C1 + + cmpwi cr0, INCX, SIZE + bne LL(100) + + andi. r0, X, 2 * SIZE - 1 + beq LL(05) + + LFD C2, 0 * SIZE(X) + add X, X, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + fabs C2, C2 + ble LL(20) + .align 4 + +LL(05): + sub X, X, INCX2 + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + LFPDUX A5, X, INCX2 + fpabs T1, A1 + LFPDUX A6, X, INCX2 + fpabs T2, A2 + LFPDUX A7, X, INCX2 + fpabs T3, A3 + LFPDUX A8, X, INCX2 + fpabs T4, A4 + bdz LL(13) + .align 4 + +LL(12): + fpsub F1, C1, T1 + LFPDUX A1, X, INCX2 + fpsub F2, C2, T2 + LFPDUX A2, X, INCX2 + fpsub F3, C3, T3 + LFPDUX A3, X, INCX2 + fpsub F4, C4, T4 + LFPDUX A4, X, INCX2 + + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsel C1, F1, C1, T1 + LFPDUX A5, X, INCX2 + fpsel C2, F2, C2, T2 + LFPDUX A6, X, INCX2 + fpsel C3, F3, C3, T3 + LFPDUX A7, X, INCX2 + fpsel C4, F4, C4, T4 + LFPDUX A8, X, INCX2 + + fpsub F5, C1, T5 + fpsub F6, C2, T6 + fpsub F7, C3, T7 + fpsub F8, C4, T8 + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + bdnz LL(12) + .align 4 + +LL(13): + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsub F1, C1, T1 + fpsub F2, C2, T2 + fpsub F3, C3, T3 + fpsub F4, C4, T4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + + fpsub F5, C1, T5 + fpsub F6, C2, T6 + fpsub F7, C3, T7 + fpsub F8, C4, T8 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(20) + + andi. r0, N, 8 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + fpabs A1, A1 + fpabs A2, A2 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFPDUX A1, X, INCX2 + fpabs A1, A1 + fpsub F1, C1, A1 + fpsel C1, F1, C1, A1 + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(20) + + LFDUX A1, X, INCX2 + fabs A1, A1 + fsub F1, C1, A1 + fsel C1, F1, C1, A1 + .align 4 + +LL(20): + fpsub F1, C1, C2 + fpsub F2, C3, C4 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C1, C3 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C1, C2 + fsel C1, F1, C1, C2 + li RET, 0 + + fsmfp C1, C1 + andi. r0, XX, 2 * SIZE - 1 + beq LL(21) + + LFD A1, 0 * SIZE(XX) + add XX, XX, INCX + + addi NN, NN, -1 + addi RET, RET, 1 + + fabs A1, A1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + .align 4 + +LL(21): + sub XX, XX, INCX2 + + srawi. r0, NN, 4 + mtspr CTR, r0 + beq- LL(25) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + LFPDUX A3, XX, INCX2 + LFPDUX A4, XX, INCX2 + + LFPDUX A5, XX, INCX2 + LFPDUX A6, XX, INCX2 + LFPDUX A7, XX, INCX2 + LFPDUX A8, XX, INCX2 + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + bdz LL(23) + .align 4 + +LL(22): + addi RET, RET, 1 + fcmpu cr0, C1, T1 + LFPDUX A1, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T1 + LFPDUX A2, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T2 + LFPDUX A3, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T2 + LFPDUX A4, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T3 + LFPDUX A5, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T3 + LFPDUX A6, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T4 + LFPDUX A7, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T4 + LFPDUX A8, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T5 + fpabs T1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T5 + fpabs T2, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T6 + fpabs T3, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T6 + fpabs T4, A4 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T7 + fpabs T5, A5 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T7 + fpabs T6, A6 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T8 + fpabs T7, A7 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T8 + fpabs T8, A8 + beq cr0, LL(999) + bdnz LL(22) + .align 4 + +LL(23): + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T4 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T4 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T5 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T5 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T6 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T6 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T7 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T7 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T8 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T8 + beq cr0, LL(999) + .align 4 + +LL(25): + andi. r0, NN, 8 + beq LL(26) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + LFPDUX A3, XX, INCX2 + LFPDUX A4, XX, INCX2 + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T4 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T4 + beq cr0, LL(999) + .align 4 + +LL(26): + andi. r0, NN, 4 + beq LL(27) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + + fpabs T1, A1 + fpabs T2, A2 + + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T2 + beq cr0, LL(999) + .align 4 + +LL(27): + andi. r0, NN, 2 + beq LL(28) + + LFPDUX A1, XX, INCX2 + + fpabs T1, A1 + + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T1 + beq cr0, LL(999) + .align 4 + +LL(28): + andi. r0, NN, 1 + beq LL(999) + addi RET, RET, 1 + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + LFDUX A5, X, INCX + LFDUX A6, X, INCX + LFDUX A7, X, INCX + LFDUX A8, X, INCX + LFSDUX A5, X, INCX + fpabs T1, A1 + LFSDUX A6, X, INCX + fpabs T2, A2 + LFSDUX A7, X, INCX + fpabs T3, A3 + LFSDUX A8, X, INCX + fpabs T4, A4 + bdz LL(103) + .align 4 + +LL(102): + fpsub F1, C1, T1 + LFDUX A1, X, INCX + fpsub F2, C2, T2 + LFDUX A2, X, INCX + fpsub F3, C3, T3 + LFDUX A3, X, INCX + fpsub F4, C4, T4 + LFDUX A4, X, INCX + + fpabs T5, A5 + LFSDUX A1, X, INCX + fpabs T6, A6 + LFSDUX A2, X, INCX + fpabs T7, A7 + LFSDUX A3, X, INCX + fpabs T8, A8 + LFSDUX A4, X, INCX + + fpsel C1, F1, C1, T1 + LFDUX A5, X, INCX + fpsel C2, F2, C2, T2 + LFDUX A6, X, INCX + fpsel C3, F3, C3, T3 + LFDUX A7, X, INCX + fpsel C4, F4, C4, T4 + LFDUX A8, X, INCX + + fpsub F5, C1, T5 + LFSDUX A5, X, INCX + fpsub F6, C2, T6 + LFSDUX A6, X, INCX + fpsub F7, C3, T7 + LFSDUX A7, X, INCX + fpsub F8, C4, T8 + LFSDUX A8, X, INCX + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + bdnz LL(102) + .align 4 + +LL(103): + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsub F1, C1, T1 + fpsub F2, C2, T2 + fpsub F3, C3, T3 + fpsub F4, C4, T4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + + fpsub F5, C1, T5 + fpsub F6, C2, T6 + fpsub F7, C3, T7 + fpsub F8, C4, T8 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + .align 4 + +LL(105): + andi. r0, N, 15 + beq LL(120) + + andi. r0, N, 8 + beq LL(106) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(106): + andi. r0, N, 4 + beq LL(107) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + + fabs A1, A1 + fabs A2, A2 + fabs A3, A3 + fabs A4, A4 + + fsub F1, C1, A1 + fsub F2, C2, A2 + fsub F3, C3, A3 + fsub F4, C4, A4 + + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + fsel C3, F3, C3, A3 + fsel C4, F4, C4, A4 + .align 4 + +LL(107): + andi. r0, N, 2 + beq LL(108) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + fabs A1, A1 + fabs A2, A2 + fsub F1, C1, A1 + fsub F2, C2, A2 + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + .align 4 + +LL(108): + andi. r0, N, 1 + beq LL(120) + + LFDUX A1, X, INCX + fabs A1, A1 + fsub F1, C1, A1 + fsel C1, F1, C1, A1 + .align 4 + +LL(120): + fpsub F1, C1, C2 + fpsub F2, C3, C4 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C1, C3 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C1, C2 + fsel C1, F1, C1, C2 + + li RET, 0 + + sub XX, XX, INCX + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- LL(126) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX + LFDUX A4, XX, INCX + + fabs T1, A1 + fabs T2, A2 + + LFDUX A5, XX, INCX + LFDUX A6, XX, INCX + LFDUX A7, XX, INCX + LFDUX A8, XX, INCX + bdz LL(123) + .align 4 + +LL(122): + LFDUX A1, XX, INCX + fabs T3, A3 + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + LFDUX A2, XX, INCX + fabs T4, A4 + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + LFDUX A3, XX, INCX + fabs T1, A5 + addi RET, RET, 1 + fcmpu cr0, C1, T3 + beq cr0, LL(999) + + LFDUX A4, XX, INCX + fabs T2, A6 + addi RET, RET, 1 + fcmpu cr0, C1, T4 + beq cr0, LL(999) + + LFDUX A5, XX, INCX + fabs T3, A7 + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + LFDUX A6, XX, INCX + fabs T4, A8 + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + LFDUX A7, XX, INCX + fabs T1, A1 + addi RET, RET, 1 + fcmpu cr0, C1, T3 + beq cr0, LL(999) + + LFDUX A8, XX, INCX + fabs T2, A2 + addi RET, RET, 1 + fcmpu cr0, C1, T4 + beq cr0, LL(999) + bdnz LL(122) + .align 4 + +LL(123): + fabs T3, A3 + fabs T4, A4 + + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + fabs T1, A5 + addi RET, RET, 1 + fcmpu cr0, C1, T3 + beq cr0, LL(999) + + fabs T2, A6 + addi RET, RET, 1 + fcmpu cr0, C1, T4 + beq cr0, LL(999) + + fabs T3, A7 + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + fabs T4, A8 + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T4 + beq cr0, LL(999) + .align 4 + +LL(126): + andi. r0, NN, 4 + beq LL(127) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX + LFDUX A4, XX, INCX + + fabs T1, A1 + fabs T2, A2 + fabs T3, A3 + fabs T4, A4 + + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T4 + beq cr0, LL(999) + .align 4 + +LL(127): + andi. r0, NN, 2 + beq LL(128) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + + fabs T1, A1 + fabs T2, A2 + + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + .align 4 + +LL(128): + addi RET, RET, 1 + .align 4 + +LL(999): + li r10, 16 + addi SP, SP, -16 + mr r3, RET + + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/iamax_ppc440.S b/kernel/power/iamax_ppc440.S new file mode 100644 index 0000000..11ea4cb --- /dev/null +++ b/kernel/power/iamax_ppc440.S @@ -0,0 +1,482 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RET r3 +#define X r4 +#define INCX r5 + +#define N r6 +#define NN r7 +#define XX r8 +#define PRE r9 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(r3) + LDINT INCX, 0(INCX) +#else + mr N, r3 +#endif + li RET, 0 + + slwi INCX, INCX, BASE_SHIFT + sub X, X, INCX + li PRE, 3 * 16 * SIZE + + mr NN, N + mr XX, X + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFDUX f1, X, INCX + + fabs f0, f1 + fabs f2, f1 + fabs f3, f1 + fabs f4, f1 + fabs f5, f1 + subi N, N, 1 + fabs f6, f1 + srawi. r0, N, 4 + fabs f7, f1 + mtspr CTR, r0 + fabs f1, f1 + beq- LL(150) + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDUX f25, X, INCX + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDUX f27, X, INCX + + fabs f12, f28 + LFDUX f28, X, INCX + fabs f13, f29 + LFDUX f29, X, INCX + fabs f14, f30 + LFDUX f30, X, INCX + fabs f15, f31 + LFDUX f31, X, INCX + bdz LL(120) + .align 4 + +LL(110): + fsub f16, f0, f8 +#ifdef PPCG4 + dcbt X, PRE +#endif + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + LFDUX f24, X, INCX + fsel f1, f17, f1, f9 + fabs f9, f25 + LFDUX f25, X, INCX + fsel f2, f18, f2, f10 + fabs f10, f26 + LFDUX f26, X, INCX + fsel f3, f19, f3, f11 + fabs f11, f27 + LFDUX f27, X, INCX + + fsel f4, f20, f4, f12 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fabs f12, f28 + LFDUX f28, X, INCX + fsel f5, f21, f5, f13 + fabs f13, f29 + LFDUX f29, X, INCX + fsel f6, f22, f6, f14 + fabs f14, f30 + LFDUX f30, X, INCX + fsel f7, f23, f7, f15 + fabs f15, f31 + LFDUX f31, X, INCX + + fsub f16, f0, f8 +#ifdef PPCG4 + dcbt X, PRE +#endif + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + LFDUX f24, X, INCX + fsel f1, f17, f1, f9 + fabs f9, f25 + LFDUX f25, X, INCX + fsel f2, f18, f2, f10 + fabs f10, f26 + LFDUX f26, X, INCX + fsel f3, f19, f3, f11 + fabs f11, f27 + LFDUX f27, X, INCX + + fsel f4, f20, f4, f12 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fabs f12, f28 + LFDUX f28, X, INCX + fsel f5, f21, f5, f13 + fabs f13, f29 + LFDUX f29, X, INCX + fsel f6, f22, f6, f14 + fabs f14, f30 + LFDUX f30, X, INCX + fsel f7, f23, f7, f15 + fabs f15, f31 + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f1, f8, f0, f4 + .align 4 + +LL(1000): + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- LL(1150) + + LFDUX f24, XX, INCX + LFDUX f25, XX, INCX + LFDUX f26, XX, INCX + LFDUX f27, XX, INCX + LFDUX f28, XX, INCX + LFDUX f29, XX, INCX + LFDUX f30, XX, INCX + LFDUX f31, XX, INCX + bdz LL(1120) + .align 4 + +LL(1110): + fabs f8, f24 + LFDUX f24, XX, INCX + fabs f9, f25 + LFDUX f25, XX, INCX + fabs f10, f26 + LFDUX f26, XX, INCX + fabs f11, f27 + LFDUX f27, XX, INCX + +#ifdef PPCG4 + dcbt XX, PRE +#endif + + fabs f12, f28 + LFDUX f28, XX, INCX + fabs f13, f29 + LFDUX f29, XX, INCX + + fabs f14, f30 + LFDUX f30, XX, INCX + fabs f15, f31 + LFDUX f31, XX, INCX + +#if defined(PPCG4) && defined(DOUBLE) + dcbt XX, PRE +#endif + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + + bdnz LL(1110) + .align 4 + +LL(1120): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + .align 4 + +LL(1150): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1160): + LFDUX f8, XX, INCX + fabs f8, f8 + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1160) + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/iamin.S b/kernel/power/iamin.S new file mode 100644 index 0000000..c3dbb84 --- /dev/null +++ b/kernel/power/iamin.S @@ -0,0 +1,803 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RET r3 +#define X r4 +#define INCX r5 + +#define N r6 +#define NN r7 +#define XX r8 +#define PREA r9 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(r3) + LDINT INCX, 0(INCX) +#else + mr N, r3 +#endif + li RET, 0 + mr NN, N + mr XX, X + + slwi INCX, INCX, BASE_SHIFT + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + add X, X, INCX + + fabs f0, f1 + fabs f2, f1 + fabs f3, f1 + fabs f4, f1 + fabs f5, f1 + fabs f6, f1 + fabs f7, f1 + fabs f1, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + + LFD f24, 0 * SIZE(X) + LFD f25, 1 * SIZE(X) + LFD f26, 2 * SIZE(X) + LFD f27, 3 * SIZE(X) + LFD f28, 4 * SIZE(X) + LFD f29, 5 * SIZE(X) + LFD f30, 6 * SIZE(X) + LFD f31, 7 * SIZE(X) + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + bdz LL(20) + .align 4 + +LL(10): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + fsel f1, f17, f9, f1 + fabs f9, f25 + fsel f2, f18, f10, f2 + fabs f10, f26 + fsel f3, f19, f11, f3 + fabs f11, f27 + + LFD f24, 16 * SIZE(X) + LFD f25, 17 * SIZE(X) + LFD f26, 18 * SIZE(X) + LFD f27, 19 * SIZE(X) + + fsel f4, f20, f12, f4 + fabs f12, f28 + fsel f5, f21, f13, f5 + fabs f13, f29 + fsel f6, f22, f14, f6 + fabs f14, f30 + fsel f7, f23, f15, f7 + fabs f15, f31 + + LFD f28, 20 * SIZE(X) + LFD f29, 21 * SIZE(X) + LFD f30, 22 * SIZE(X) + LFD f31, 23 * SIZE(X) + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + fsel f1, f17, f9, f1 + fabs f9, f25 + fsel f2, f18, f10, f2 + fabs f10, f26 + fsel f3, f19, f11, f3 + fabs f11, f27 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fsel f4, f20, f12, f4 + fabs f12, f28 + fsel f5, f21, f13, f5 + fabs f13, f29 + fsel f6, f22, f14, f6 + fabs f14, f30 + fsel f7, f23, f15, f7 + fabs f15, f31 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + fsel f1, f17, f9, f1 + fabs f9, f25 + fsel f2, f18, f10, f2 + fabs f10, f26 + fsel f3, f19, f11, f3 + fabs f11, f27 + + fsel f4, f20, f12, f4 + fabs f12, f28 + fsel f5, f21, f13, f5 + fabs f13, f29 + fsel f6, f22, f14, f6 + fabs f14, f30 + fsel f7, f23, f15, f7 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + fsel f6, f22, f14, f6 + fsel f7, f23, f15, f7 + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + addi X, X, 1 * SIZE + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdz LL(120) + .align 4 + +LL(110): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + fsel f1, f17, f9, f1 + fabs f9, f25 + fsel f2, f18, f10, f2 + fabs f10, f26 + fsel f3, f19, f11, f3 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fsel f4, f20, f12, f4 + fabs f12, f28 + fsel f5, f21, f13, f5 + fabs f13, f29 + fsel f6, f22, f14, f6 + fabs f14, f30 + fsel f7, f23, f15, f7 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + fsel f1, f17, f9, f1 + fabs f9, f25 + fsel f2, f18, f10, f2 + fabs f10, f26 + fsel f3, f19, f11, f3 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fsel f4, f20, f12, f4 + fabs f12, f28 + fsel f5, f21, f13, f5 + fabs f13, f29 + fsel f6, f22, f14, f6 + fabs f14, f30 + fsel f7, f23, f15, f7 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + fsel f1, f17, f9, f1 + fabs f9, f25 + fsel f2, f18, f10, f2 + fabs f10, f26 + fsel f3, f19, f11, f3 + fabs f11, f27 + + fsel f4, f20, f12, f4 + fabs f12, f28 + fsel f5, f21, f13, f5 + fabs f13, f29 + fsel f6, f22, f14, f6 + fabs f14, f30 + fsel f7, f23, f15, f7 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + fsel f6, f22, f14, f6 + fsel f7, f23, f15, f7 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f1, f0 + fsel f2, f9, f3, f2 + fsel f4, f10, f5, f4 + fsel f6, f11, f7, f6 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f2, f0 + fsel f4, f9, f6, f4 + + fsub f8, f0, f4 + fsel f1, f8, f4, f0 + .align 4 + +LL(1000): + cmpwi cr0, INCX, SIZE + bne- cr0, LL(1100) + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- cr0, LL(1050) + + LFD f24, 0 * SIZE(XX) + LFD f25, 1 * SIZE(XX) + LFD f26, 2 * SIZE(XX) + LFD f27, 3 * SIZE(XX) + LFD f28, 4 * SIZE(XX) + LFD f29, 5 * SIZE(XX) + LFD f30, 6 * SIZE(XX) + LFD f31, 7 * SIZE(XX) + bdz LL(1020) + .align 4 + +LL(1010): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 8 * SIZE(XX) + LFD f25, 9 * SIZE(XX) + LFD f26, 10 * SIZE(XX) + LFD f27, 11 * SIZE(XX) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 12 * SIZE(XX) + LFD f29, 13 * SIZE(XX) + LFD f30, 14 * SIZE(XX) + LFD f31, 15 * SIZE(XX) + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + + addi XX, XX, 8 * SIZE + bdnz LL(1010) + .align 4 + +LL(1020): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + addi XX, XX, 8 * SIZE + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + .align 4 + +LL(1050): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1060): + LFD f8, 0 * SIZE(XX) + addi XX, XX, 1 * SIZE + fabs f8, f8 + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1060) + b LL(9999) + .align 4 + +LL(1100): + sub XX, XX, INCX + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- LL(1150) + + LFDUX f24, XX, INCX + LFDUX f25, XX, INCX + LFDUX f26, XX, INCX + LFDUX f27, XX, INCX + LFDUX f28, XX, INCX + LFDUX f29, XX, INCX + LFDUX f30, XX, INCX + LFDUX f31, XX, INCX + bdz LL(1120) + .align 4 + +LL(1110): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDUX f24, XX, INCX + LFDUX f25, XX, INCX + LFDUX f26, XX, INCX + LFDUX f27, XX, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDUX f28, XX, INCX + LFDUX f29, XX, INCX + LFDUX f30, XX, INCX + LFDUX f31, XX, INCX + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + + bdnz LL(1110) + .align 4 + +LL(1120): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + .align 4 + +LL(1150): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1160): + LFDUX f8, XX, INCX + fabs f8, f8 + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1160) + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/iamin_hummer.S b/kernel/power/iamin_hummer.S new file mode 100644 index 0000000..6dad3be --- /dev/null +++ b/kernel/power/iamin_hummer.S @@ -0,0 +1,1016 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define XX r8 +#define RET r9 +#define NN r10 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 + +#define F1 f12 +#define F2 f13 +#define F3 f14 +#define F4 f15 +#define F5 f16 +#define F6 f17 +#define F7 f18 +#define F8 f19 + +#define T1 f20 +#define T2 f21 +#define T3 f22 +#define T4 f23 +#define T5 f24 +#define T6 f25 +#define T7 f26 +#define T8 f27 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + stfpdux f24, SP, r10 + stfpdux f25, SP, r10 + stfpdux f26, SP, r10 + stfpdux f27, SP, r10 + + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + li RET, 0 + cmpwi cr0, N, 0 + ble LL(999) + mr NN, N + cmpwi cr0, INCX, 0 + ble LL(999) + + LFD C1, 0 * SIZE(X) + + addi N, N, -1 + cmpwi cr0, N, 0 + li RET, 1 + fabs C1, C1 + ble LL(999) + + fsmfp C1, C1 + mr XX, X + fpmr C2, C1 + add X, X, INCX + fpmr C3, C1 + fpmr C4, C1 + + cmpwi cr0, INCX, SIZE + bne LL(100) + + andi. r0, X, 2 * SIZE - 1 + beq LL(05) + + LFD C2, 0 * SIZE(X) + add X, X, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + fabs C2, C2 + ble LL(20) + .align 4 + +LL(05): + sub X, X, INCX2 + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + LFPDUX A5, X, INCX2 + fpabs T1, A1 + LFPDUX A6, X, INCX2 + fpabs T2, A2 + LFPDUX A7, X, INCX2 + fpabs T3, A3 + LFPDUX A8, X, INCX2 + fpabs T4, A4 + bdz LL(13) + .align 4 + +LL(12): + fpsub F1, T1, C1 + LFPDUX A1, X, INCX2 + fpsub F2, T2, C2 + LFPDUX A2, X, INCX2 + fpsub F3, T3, C3 + LFPDUX A3, X, INCX2 + fpsub F4, T4, C4 + LFPDUX A4, X, INCX2 + + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsel C1, F1, C1, T1 + LFPDUX A5, X, INCX2 + fpsel C2, F2, C2, T2 + LFPDUX A6, X, INCX2 + fpsel C3, F3, C3, T3 + LFPDUX A7, X, INCX2 + fpsel C4, F4, C4, T4 + LFPDUX A8, X, INCX2 + + fpsub F5, T5, C1 + fpsub F6, T6, C2 + fpsub F7, T7, C3 + fpsub F8, T8, C4 + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + bdnz LL(12) + .align 4 + +LL(13): + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsub F1, T1, C1 + fpsub F2, T2, C2 + fpsub F3, T3, C3 + fpsub F4, T4, C4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + + fpsub F5, T5, C1 + fpsub F6, T6, C2 + fpsub F7, T7, C3 + fpsub F8, T8, C4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(20) + + andi. r0, N, 8 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpsub F1, A1, C1 + fpsub F2, A2, C2 + fpsub F3, A3, C3 + fpsub F4, A4, C4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + fpabs A1, A1 + fpabs A2, A2 + + fpsub F1, A1, C1 + fpsub F2, A2, C2 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFPDUX A1, X, INCX2 + fpabs A1, A1 + fpsub F1, A1, C1 + fpsel C1, F1, C1, A1 + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(20) + + LFDUX A1, X, INCX2 + fabs A1, A1 + fsub F1, A1, C1 + fsel C1, F1, C1, A1 + .align 4 + +LL(20): + fpsub F1, C2, C1 + fpsub F2, C4, C3 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C3, C1 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C2, C1 + fsel C1, F1, C1, C2 + li RET, 0 + + fsmfp C1, C1 + andi. r0, XX, 2 * SIZE - 1 + beq LL(21) + + LFD A1, 0 * SIZE(XX) + add XX, XX, INCX + + addi NN, NN, -1 + addi RET, RET, 1 + + fabs A1, A1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + .align 4 + +LL(21): + sub XX, XX, INCX2 + + srawi. r0, NN, 4 + mtspr CTR, r0 + beq- LL(25) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + LFPDUX A3, XX, INCX2 + LFPDUX A4, XX, INCX2 + + LFPDUX A5, XX, INCX2 + LFPDUX A6, XX, INCX2 + LFPDUX A7, XX, INCX2 + LFPDUX A8, XX, INCX2 + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + bdz LL(23) + .align 4 + +LL(22): + addi RET, RET, 1 + fcmpu cr0, C1, T1 + LFPDUX A1, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T1 + LFPDUX A2, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T2 + LFPDUX A3, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T2 + LFPDUX A4, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T3 + LFPDUX A5, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T3 + LFPDUX A6, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T4 + LFPDUX A7, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T4 + LFPDUX A8, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T5 + fpabs T1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T5 + fpabs T2, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T6 + fpabs T3, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T6 + fpabs T4, A4 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T7 + fpabs T5, A5 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T7 + fpabs T6, A6 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T8 + fpabs T7, A7 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T8 + fpabs T8, A8 + beq cr0, LL(999) + bdnz LL(22) + .align 4 + +LL(23): + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T4 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T4 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T5 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T5 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T6 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T6 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T7 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T7 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T8 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T8 + beq cr0, LL(999) + .align 4 + +LL(25): + andi. r0, NN, 8 + beq LL(26) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + LFPDUX A3, XX, INCX2 + LFPDUX A4, XX, INCX2 + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T4 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T4 + beq cr0, LL(999) + .align 4 + +LL(26): + andi. r0, NN, 4 + beq LL(27) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + + fpabs T1, A1 + fpabs T2, A2 + + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T2 + beq cr0, LL(999) + .align 4 + +LL(27): + andi. r0, NN, 2 + beq LL(28) + + LFPDUX A1, XX, INCX2 + + fpabs T1, A1 + + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T1 + beq cr0, LL(999) + .align 4 + +LL(28): + andi. r0, NN, 1 + beq LL(999) + addi RET, RET, 1 + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + LFDUX A5, X, INCX + LFDUX A6, X, INCX + LFDUX A7, X, INCX + LFDUX A8, X, INCX + LFSDUX A5, X, INCX + fpabs T1, A1 + LFSDUX A6, X, INCX + fpabs T2, A2 + LFSDUX A7, X, INCX + fpabs T3, A3 + LFSDUX A8, X, INCX + fpabs T4, A4 + bdz LL(103) + .align 4 + +LL(102): + fpsub F1, T1, C1 + LFDUX A1, X, INCX + fpsub F2, T2, C2 + LFDUX A2, X, INCX + fpsub F3, T3, C3 + LFDUX A3, X, INCX + fpsub F4, T4, C4 + LFDUX A4, X, INCX + + fpabs T5, A5 + LFSDUX A1, X, INCX + fpabs T6, A6 + LFSDUX A2, X, INCX + fpabs T7, A7 + LFSDUX A3, X, INCX + fpabs T8, A8 + LFSDUX A4, X, INCX + + fpsel C1, F1, C1, T1 + LFDUX A5, X, INCX + fpsel C2, F2, C2, T2 + LFDUX A6, X, INCX + fpsel C3, F3, C3, T3 + LFDUX A7, X, INCX + fpsel C4, F4, C4, T4 + LFDUX A8, X, INCX + + fpsub F5, T5, C1 + LFSDUX A5, X, INCX + fpsub F6, T6, C2 + LFSDUX A6, X, INCX + fpsub F7, T7, C3 + LFSDUX A7, X, INCX + fpsub F8, T8, C4 + LFSDUX A8, X, INCX + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + bdnz LL(102) + .align 4 + +LL(103): + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsub F1, T1, C1 + fpsub F2, T2, C2 + fpsub F3, T3, C3 + fpsub F4, T4, C4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + + fpsub F5, T5, C1 + fpsub F6, T6, C2 + fpsub F7, T7, C3 + fpsub F8, T8, C4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + .align 4 + +LL(105): + andi. r0, N, 15 + beq LL(120) + + andi. r0, N, 8 + beq LL(106) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpsub F1, A1, C1 + fpsub F2, A2, C2 + fpsub F3, A3, C3 + fpsub F4, A4, C4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(106): + andi. r0, N, 4 + beq LL(107) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + + fabs A1, A1 + fabs A2, A2 + fabs A3, A3 + fabs A4, A4 + + fsub F1, A1, C1 + fsub F2, A2, C2 + fsub F3, A3, C3 + fsub F4, A4, C4 + + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + fsel C3, F3, C3, A3 + fsel C4, F4, C4, A4 + .align 4 + +LL(107): + andi. r0, N, 2 + beq LL(108) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + fabs A1, A1 + fabs A2, A2 + fsub F1, A1, C1 + fsub F2, A2, C2 + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + .align 4 + +LL(108): + andi. r0, N, 1 + beq LL(120) + + LFDUX A1, X, INCX + fabs A1, A1 + fsub F1, A1, C1 + fsel C1, F1, C1, A1 + .align 4 + +LL(120): + fpsub F1, C2, C1 + fpsub F2, C4, C3 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C3, C1 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C2, C1 + fsel C1, F1, C1, C2 + + li RET, 0 + + sub XX, XX, INCX + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- LL(126) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX + LFDUX A4, XX, INCX + + fabs T1, A1 + fabs T2, A2 + + LFDUX A5, XX, INCX + LFDUX A6, XX, INCX + LFDUX A7, XX, INCX + LFDUX A8, XX, INCX + bdz LL(123) + .align 4 + +LL(122): + LFDUX A1, XX, INCX + fabs T3, A3 + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + LFDUX A2, XX, INCX + fabs T4, A4 + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + LFDUX A3, XX, INCX + fabs T1, A5 + addi RET, RET, 1 + fcmpu cr0, C1, T3 + beq cr0, LL(999) + + LFDUX A4, XX, INCX + fabs T2, A6 + addi RET, RET, 1 + fcmpu cr0, C1, T4 + beq cr0, LL(999) + + LFDUX A5, XX, INCX + fabs T3, A7 + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + LFDUX A6, XX, INCX + fabs T4, A8 + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + LFDUX A7, XX, INCX + fabs T1, A1 + addi RET, RET, 1 + fcmpu cr0, C1, T3 + beq cr0, LL(999) + + LFDUX A8, XX, INCX + fabs T2, A2 + addi RET, RET, 1 + fcmpu cr0, C1, T4 + beq cr0, LL(999) + bdnz LL(122) + .align 4 + +LL(123): + fabs T3, A3 + fabs T4, A4 + + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + fabs T1, A5 + addi RET, RET, 1 + fcmpu cr0, C1, T3 + beq cr0, LL(999) + + fabs T2, A6 + addi RET, RET, 1 + fcmpu cr0, C1, T4 + beq cr0, LL(999) + + fabs T3, A7 + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + fabs T4, A8 + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T4 + beq cr0, LL(999) + .align 4 + +LL(126): + andi. r0, NN, 4 + beq LL(127) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX + LFDUX A4, XX, INCX + + fabs T1, A1 + fabs T2, A2 + fabs T3, A3 + fabs T4, A4 + + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T4 + beq cr0, LL(999) + .align 4 + +LL(127): + andi. r0, NN, 2 + beq LL(128) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + + fabs T1, A1 + fabs T2, A2 + + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + .align 4 + +LL(128): + addi RET, RET, 1 + .align 4 + +LL(999): + li r10, 16 + addi SP, SP, -16 + mr r3, RET + + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/iamin_ppc440.S b/kernel/power/iamin_ppc440.S new file mode 100644 index 0000000..888e74a --- /dev/null +++ b/kernel/power/iamin_ppc440.S @@ -0,0 +1,482 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RET r3 +#define X r4 +#define INCX r5 + +#define N r6 +#define NN r7 +#define XX r8 +#define PRE r9 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(r3) + LDINT INCX, 0(INCX) +#else + mr N, r3 +#endif + li RET, 0 + + slwi INCX, INCX, BASE_SHIFT + sub X, X, INCX + li PRE, 3 * 16 * SIZE + + mr NN, N + mr XX, X + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFDUX f1, X, INCX + + fabs f0, f1 + fabs f2, f1 + fabs f3, f1 + fabs f4, f1 + fabs f5, f1 + subi N, N, 1 + fabs f6, f1 + srawi. r0, N, 4 + fabs f7, f1 + mtspr CTR, r0 + fabs f1, f1 + beq- LL(150) + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDUX f25, X, INCX + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDUX f27, X, INCX + + fabs f12, f28 + LFDUX f28, X, INCX + fabs f13, f29 + LFDUX f29, X, INCX + fabs f14, f30 + LFDUX f30, X, INCX + fabs f15, f31 + LFDUX f31, X, INCX + bdz LL(120) + .align 4 + +LL(110): + fsub f16, f0, f8 +#ifdef PPCG4 + dcbt X, PRE +#endif + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + LFDUX f24, X, INCX + fsel f1, f17, f9, f1 + fabs f9, f25 + LFDUX f25, X, INCX + fsel f2, f18, f10, f2 + fabs f10, f26 + LFDUX f26, X, INCX + fsel f3, f19, f11, f3 + fabs f11, f27 + LFDUX f27, X, INCX + + fsel f4, f20, f12, f4 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fabs f12, f28 + LFDUX f28, X, INCX + fsel f5, f21, f13, f5 + fabs f13, f29 + LFDUX f29, X, INCX + fsel f6, f22, f14, f6 + fabs f14, f30 + LFDUX f30, X, INCX + fsel f7, f23, f15, f7 + fabs f15, f31 + LFDUX f31, X, INCX + + fsub f16, f0, f8 +#ifdef PPCG4 + dcbt X, PRE +#endif + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + LFDUX f24, X, INCX + fsel f1, f17, f9, f1 + fabs f9, f25 + LFDUX f25, X, INCX + fsel f2, f18, f10, f2 + fabs f10, f26 + LFDUX f26, X, INCX + fsel f3, f19, f11, f3 + fabs f11, f27 + LFDUX f27, X, INCX + + fsel f4, f20, f12, f4 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fabs f12, f28 + LFDUX f28, X, INCX + fsel f5, f21, f13, f5 + fabs f13, f29 + LFDUX f29, X, INCX + fsel f6, f22, f14, f6 + fabs f14, f30 + LFDUX f30, X, INCX + fsel f7, f23, f15, f7 + fabs f15, f31 + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + fsel f1, f17, f9, f1 + fabs f9, f25 + fsel f2, f18, f10, f2 + fabs f10, f26 + fsel f3, f19, f11, f3 + fabs f11, f27 + + fsel f4, f20, f12, f4 + fabs f12, f28 + fsel f5, f21, f13, f5 + fabs f13, f29 + fsel f6, f22, f14, f6 + fabs f14, f30 + fsel f7, f23, f15, f7 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + fsel f6, f22, f14, f6 + fsel f7, f23, f15, f7 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f1, f0 + fsel f2, f9, f3, f2 + fsel f4, f10, f5, f4 + fsel f6, f11, f7, f6 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f2, f0 + fsel f4, f9, f6, f4 + + fsub f8, f0, f4 + fsel f1, f8, f4, f0 + .align 4 + +LL(1000): + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- LL(1150) + + LFDUX f24, XX, INCX + LFDUX f25, XX, INCX + LFDUX f26, XX, INCX + LFDUX f27, XX, INCX + LFDUX f28, XX, INCX + LFDUX f29, XX, INCX + LFDUX f30, XX, INCX + LFDUX f31, XX, INCX + bdz LL(1120) + .align 4 + +LL(1110): + fabs f8, f24 + LFDUX f24, XX, INCX + fabs f9, f25 + LFDUX f25, XX, INCX + fabs f10, f26 + LFDUX f26, XX, INCX + fabs f11, f27 + LFDUX f27, XX, INCX + +#ifdef PPCG4 + dcbt XX, PRE +#endif + + fabs f12, f28 + LFDUX f28, XX, INCX + fabs f13, f29 + LFDUX f29, XX, INCX + + fabs f14, f30 + LFDUX f30, XX, INCX + fabs f15, f31 + LFDUX f31, XX, INCX + +#if defined(PPCG4) && defined(DOUBLE) + dcbt XX, PRE +#endif + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + + bdnz LL(1110) + .align 4 + +LL(1120): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + .align 4 + +LL(1150): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1160): + LFDUX f8, XX, INCX + fabs f8, f8 + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1160) + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/imax.S b/kernel/power/imax.S new file mode 100644 index 0000000..6b6cd45 --- /dev/null +++ b/kernel/power/imax.S @@ -0,0 +1,684 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RET r3 +#define X r4 +#define INCX r5 + +#define N r6 +#define NN r7 +#define XX r8 +#define PREA r9 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(r3) + LDINT INCX, 0(INCX) +#else + mr N, r3 +#endif + + li RET, 0 + mr NN, N + mr XX, X + + slwi INCX, INCX, BASE_SHIFT + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + add X, X, INCX + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + fmr f6, f1 + fmr f7, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + + LFD f16, 0 * SIZE(X) + LFD f17, 1 * SIZE(X) + LFD f18, 2 * SIZE(X) + LFD f19, 3 * SIZE(X) + LFD f20, 4 * SIZE(X) + LFD f21, 5 * SIZE(X) + LFD f22, 6 * SIZE(X) + LFD f23, 7 * SIZE(X) + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + + fsub f8, f0, f16 + fsub f9, f1, f17 + fsub f10, f2, f18 + fsub f11, f3, f19 + fsub f12, f4, f20 + fsub f13, f5, f21 + fsub f14, f6, f22 + fsub f15, f7, f23 + bdz LL(20) + .align 4 + +LL(10): + fsel f0, f8, f0, f16 + fsub f8, f0, f24 + fsel f1, f9, f1, f17 + fsub f9, f1, f25 + fsel f2, f10, f2, f18 + fsub f10, f2, f26 + fsel f3, f11, f3, f19 + fsub f11, f3, f27 + + LFD f16, 16 * SIZE(X) + LFD f17, 17 * SIZE(X) + LFD f18, 18 * SIZE(X) + LFD f19, 19 * SIZE(X) + + fsel f4, f12, f4, f20 + fsub f12, f4, f28 + fsel f5, f13, f5, f21 + fsub f13, f5, f29 + fsel f6, f14, f6, f22 + fsub f14, f6, f30 + fsel f7, f15, f7, f23 + fsub f15, f7, f31 + + LFD f20, 20 * SIZE(X) + LFD f21, 21 * SIZE(X) + LFD f22, 22 * SIZE(X) + LFD f23, 23 * SIZE(X) + + fsel f0, f8, f0, f24 + fsub f8, f0, f16 + fsel f1, f9, f1, f25 + fsub f9, f1, f17 + fsel f2, f10, f2, f26 + fsub f10, f2, f18 + fsel f3, f11, f3, f27 + fsub f11, f3, f19 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fsel f4, f12, f4, f28 + fsub f12, f4, f20 + fsel f5, f13, f5, f29 + fsub f13, f5, f21 + fsel f6, f14, f6, f30 + fsub f14, f6, f22 + fsel f7, f15, f7, f31 + fsub f15, f7, f23 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + fsel f0, f8, f0, f16 + fsub f8, f0, f24 + fsel f1, f9, f1, f17 + fsub f9, f1, f25 + fsel f2, f10, f2, f18 + fsub f10, f2, f26 + fsel f3, f11, f3, f19 + fsub f11, f3, f27 + + fsel f4, f12, f4, f20 + fsub f12, f4, f28 + fsel f5, f13, f5, f21 + fsub f13, f5, f29 + fsel f6, f14, f6, f22 + fsub f14, f6, f30 + fsel f7, f15, f7, f23 + fsub f15, f7, f31 + + fsel f0, f8, f0, f24 + fsel f1, f9, f1, f25 + fsel f2, f10, f2, f26 + fsel f3, f11, f3, f27 + fsel f4, f12, f4, f28 + fsel f5, f13, f5, f29 + fsel f6, f14, f6, f30 + fsel f7, f15, f7, f31 + + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + addi X, X, 1 * SIZE + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fsub f8, f0, f16 + fsub f9, f1, f17 + fsub f10, f2, f18 + fsub f11, f3, f19 + fsub f12, f4, f20 + fsub f13, f5, f21 + fsub f14, f6, f22 + fsub f15, f7, f23 + bdz LL(120) + .align 4 + +LL(110): + fsel f0, f8, f0, f16 + fsub f8, f0, f24 + fsel f1, f9, f1, f17 + fsub f9, f1, f25 + fsel f2, f10, f2, f18 + fsub f10, f2, f26 + fsel f3, f11, f3, f19 + fsub f11, f3, f27 + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + + fsel f4, f12, f4, f20 + fsub f12, f4, f28 + fsel f5, f13, f5, f21 + fsub f13, f5, f29 + fsel f6, f14, f6, f22 + fsub f14, f6, f30 + fsel f7, f15, f7, f23 + fsub f15, f7, f31 + + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + fsel f0, f8, f0, f24 + fsub f8, f0, f16 + fsel f1, f9, f1, f25 + fsub f9, f1, f17 + fsel f2, f10, f2, f26 + fsub f10, f2, f18 + fsel f3, f11, f3, f27 + fsub f11, f3, f19 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fsel f4, f12, f4, f28 + fsub f12, f4, f20 + fsel f5, f13, f5, f29 + fsub f13, f5, f21 + fsel f6, f14, f6, f30 + fsub f14, f6, f22 + fsel f7, f15, f7, f31 + fsub f15, f7, f23 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + fsel f0, f8, f0, f16 + fsub f8, f0, f24 + fsel f1, f9, f1, f17 + fsub f9, f1, f25 + fsel f2, f10, f2, f18 + fsub f10, f2, f26 + fsel f3, f11, f3, f19 + fsub f11, f3, f27 + + fsel f4, f12, f4, f20 + fsub f12, f4, f28 + fsel f5, f13, f5, f21 + fsub f13, f5, f29 + fsel f6, f14, f6, f22 + fsub f14, f6, f30 + fsel f7, f15, f7, f23 + fsub f15, f7, f31 + + fsel f0, f8, f0, f24 + fsel f1, f9, f1, f25 + fsel f2, f10, f2, f26 + fsel f3, f11, f3, f27 + fsel f4, f12, f4, f28 + fsel f5, f13, f5, f29 + fsel f6, f14, f6, f30 + fsel f7, f15, f7, f31 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f1, f8, f0, f4 + .align 4 + +LL(1000): + cmpwi cr0, INCX, SIZE + bne- cr0, LL(1100) + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- cr0, LL(1050) + + LFD f8, 0 * SIZE(XX) + LFD f9, 1 * SIZE(XX) + LFD f10, 2 * SIZE(XX) + LFD f11, 3 * SIZE(XX) + LFD f12, 4 * SIZE(XX) + LFD f13, 5 * SIZE(XX) + LFD f14, 6 * SIZE(XX) + LFD f15, 7 * SIZE(XX) + bdz LL(1020) + .align 4 + +LL(1010): + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + LFD f8, 8 * SIZE(XX) + LFD f9, 9 * SIZE(XX) + LFD f10, 10 * SIZE(XX) + LFD f11, 11 * SIZE(XX) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + + LFD f12, 12 * SIZE(XX) + LFD f13, 13 * SIZE(XX) + LFD f14, 14 * SIZE(XX) + LFD f15, 15 * SIZE(XX) + + addi XX, XX, 8 * SIZE + bdnz LL(1010) + .align 4 + +LL(1020): + addi XX, XX, 8 * SIZE + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + .align 4 + +LL(1050): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1060): + LFD f8, 0 * SIZE(XX) + addi XX, XX, 1 * SIZE + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1060) + b LL(9999) + .align 4 + +LL(1100): + sub XX, XX, INCX + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- LL(1150) + + LFDUX f8, XX, INCX + LFDUX f9, XX, INCX + LFDUX f10, XX, INCX + LFDUX f11, XX, INCX + LFDUX f12, XX, INCX + LFDUX f13, XX, INCX + LFDUX f14, XX, INCX + LFDUX f15, XX, INCX + bdz LL(1120) + .align 4 + +LL(1110): + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + LFDUX f8, XX, INCX + LFDUX f9, XX, INCX + LFDUX f10, XX, INCX + LFDUX f11, XX, INCX + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + + LFDUX f12, XX, INCX + LFDUX f13, XX, INCX + LFDUX f14, XX, INCX + LFDUX f15, XX, INCX + + bdnz LL(1110) + .align 4 + +LL(1120): + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + .align 4 + +LL(1150): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1160): + LFDUX f8, XX, INCX + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1160) + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/imax_hummer.S b/kernel/power/imax_hummer.S new file mode 100644 index 0000000..110dc18 --- /dev/null +++ b/kernel/power/imax_hummer.S @@ -0,0 +1,867 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define XX r8 +#define RET r9 +#define NN r10 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 + +#define F1 f12 +#define F2 f13 +#define F3 f14 +#define F4 f15 +#define F5 f16 +#define F6 f17 +#define F7 f18 +#define F8 f19 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + li RET, 0 + cmpwi cr0, N, 0 + ble LL(999) + mr NN, N + cmpwi cr0, INCX, 0 + ble LL(999) + + LFD C1, 0 * SIZE(X) + + addi N, N, -1 + cmpwi cr0, N, 0 + li RET, 1 + ble LL(999) + + fsmfp C1, C1 + mr XX, X + fpmr C2, C1 + add X, X, INCX + fpmr C3, C1 + fpmr C4, C1 + + cmpwi cr0, INCX, SIZE + bne LL(100) + + andi. r0, X, 2 * SIZE - 1 + beq LL(05) + + LFD C2, 0 * SIZE(X) + add X, X, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + ble LL(20) + .align 4 + +LL(05): + sub X, X, INCX2 + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + bdz LL(13) + .align 4 + +LL(12): + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + LFPDUX A1, X, INCX2 + fpsel C2, F2, C2, A2 + LFPDUX A2, X, INCX2 + fpsel C3, F3, C3, A3 + LFPDUX A3, X, INCX2 + fpsel C4, F4, C4, A4 + LFPDUX A4, X, INCX2 + + fpsub F5, C1, A5 + fpsub F6, C2, A6 + fpsub F7, C3, A7 + fpsub F8, C4, A8 + + fpsel C1, F5, C1, A5 + LFPDUX A5, X, INCX2 + fpsel C2, F6, C2, A6 + LFPDUX A6, X, INCX2 + fpsel C3, F7, C3, A7 + LFPDUX A7, X, INCX2 + fpsel C4, F8, C4, A8 + LFPDUX A8, X, INCX2 + + bdnz LL(12) + .align 4 + +LL(13): + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + + fpsub F5, C1, A5 + fpsub F6, C2, A6 + fpsub F7, C3, A7 + fpsub F8, C4, A8 + + fpsel C1, F5, C1, A5 + fpsel C2, F6, C2, A6 + fpsel C3, F7, C3, A7 + fpsel C4, F8, C4, A8 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(20) + + andi. r0, N, 8 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFPDUX A1, X, INCX2 + fpsub F1, C1, A1 + fpsel C1, F1, C1, A1 + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(20) + + LFDUX A1, X, INCX2 + fsub F1, C1, A1 + fsel C1, F1, C1, A1 + .align 4 + +LL(20): + fpsub F1, C1, C2 + fpsub F2, C3, C4 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C1, C3 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C1, C2 + fsel C1, F1, C1, C2 + li RET, 0 + + fsmfp C1, C1 + andi. r0, XX, 2 * SIZE - 1 + beq LL(21) + + LFD A1, 0 * SIZE(XX) + add XX, XX, INCX + + addi NN, NN, -1 + addi RET, RET, 1 + + fcmpu cr0, C1, A1 + beq cr0, LL(999) + .align 4 + +LL(21): + sub XX, XX, INCX2 + + srawi. r0, NN, 4 + mtspr CTR, r0 + beq- LL(25) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + LFPDUX A3, XX, INCX2 + LFPDUX A4, XX, INCX2 + + LFPDUX A5, XX, INCX2 + LFPDUX A6, XX, INCX2 + LFPDUX A7, XX, INCX2 + LFPDUX A8, XX, INCX2 + bdz LL(23) + .align 4 + +LL(22): + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A1 + LFPDUX A1, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A2 + LFPDUX A2, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A3 + LFPDUX A3, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A4 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A4 + LFPDUX A4, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A5 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A5 + LFPDUX A5, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A6 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A6 + LFPDUX A6, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A7 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A7 + LFPDUX A7, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A8 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A8 + LFPDUX A8, XX, INCX2 + beq cr0, LL(999) + bdnz LL(22) + .align 4 + +LL(23): + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A4 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A4 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A5 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A5 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A6 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A6 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A7 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A7 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A8 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A8 + beq cr0, LL(999) + .align 4 + +LL(25): + andi. r0, NN, 8 + beq LL(26) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + LFPDUX A3, XX, INCX2 + LFPDUX A4, XX, INCX2 + + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A4 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A4 + beq cr0, LL(999) + .align 4 + +LL(26): + andi. r0, NN, 4 + beq LL(27) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A2 + beq cr0, LL(999) + .align 4 + +LL(27): + andi. r0, NN, 2 + beq LL(28) + + LFPDUX A1, XX, INCX2 + + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A1 + beq cr0, LL(999) + .align 4 + +LL(28): + addi RET, RET, 1 + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + LFDUX A5, X, INCX + LFDUX A6, X, INCX + LFDUX A7, X, INCX + LFDUX A8, X, INCX + bdz LL(103) + .align 4 + +LL(102): + fpsub F1, C1, A1 + LFSDUX A5, X, INCX + fpsub F2, C2, A2 + LFSDUX A6, X, INCX + fpsub F3, C3, A3 + LFSDUX A7, X, INCX + fpsub F4, C4, A4 + LFSDUX A8, X, INCX + + fpsel C1, F1, C1, A1 + LFDUX A1, X, INCX + fpsel C2, F2, C2, A2 + LFDUX A2, X, INCX + fpsel C3, F3, C3, A3 + LFDUX A3, X, INCX + fpsel C4, F4, C4, A4 + LFDUX A4, X, INCX + + fpsub F5, C1, A5 + LFSDUX A1, X, INCX + fpsub F6, C2, A6 + LFSDUX A2, X, INCX + fpsub F7, C3, A7 + LFSDUX A3, X, INCX + fpsub F8, C4, A8 + LFSDUX A4, X, INCX + + fpsel C1, F5, C1, A5 + LFDUX A5, X, INCX + fpsel C2, F6, C2, A6 + LFDUX A6, X, INCX + fpsel C3, F7, C3, A7 + LFDUX A7, X, INCX + fpsel C4, F8, C4, A8 + LFDUX A8, X, INCX + bdnz LL(102) + .align 4 + +LL(103): + fpsub F1, C1, A1 + LFSDUX A5, X, INCX + fpsub F2, C2, A2 + LFSDUX A6, X, INCX + fpsub F3, C3, A3 + LFSDUX A7, X, INCX + fpsub F4, C4, A4 + LFSDUX A8, X, INCX + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + + fpsub F5, C1, A5 + fpsub F6, C2, A6 + fpsub F7, C3, A7 + fpsub F8, C4, A8 + + fpsel C1, F5, C1, A5 + fpsel C2, F6, C2, A6 + fpsel C3, F7, C3, A7 + fpsel C4, F8, C4, A8 + .align 4 + +LL(105): + andi. r0, N, 15 + beq LL(120) + + andi. r0, N, 8 + beq LL(106) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(106): + andi. r0, N, 4 + beq LL(107) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + + fsub F1, C1, A1 + fsub F2, C2, A2 + fsub F3, C3, A3 + fsub F4, C4, A4 + + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + fsel C3, F3, C3, A3 + fsel C4, F4, C4, A4 + .align 4 + +LL(107): + andi. r0, N, 2 + beq LL(108) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + + fsub F1, C1, A1 + fsub F2, C2, A2 + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + .align 4 + +LL(108): + andi. r0, N, 1 + beq LL(120) + + LFDUX A1, X, INCX + fsub F1, C1, A1 + fsel C1, F1, C1, A1 + .align 4 + +LL(120): + fpsub F1, C1, C2 + fpsub F2, C3, C4 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C1, C3 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C1, C2 + fsel C1, F1, C1, C2 + + li RET, 0 + + sub XX, XX, INCX + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- LL(126) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX + LFDUX A4, XX, INCX + + LFDUX A5, XX, INCX + LFDUX A6, XX, INCX + LFDUX A7, XX, INCX + LFDUX A8, XX, INCX + bdz LL(123) + .align 4 + +LL(122): + addi RET, RET, 1 + fcmpu cr0, C1, A1 + LFDUX A1, XX, INCX + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + LFDUX A2, XX, INCX + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + LFDUX A3, XX, INCX + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A4 + LFDUX A4, XX, INCX + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A5 + LFDUX A5, XX, INCX + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A6 + LFDUX A6, XX, INCX + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A7 + LFDUX A7, XX, INCX + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A8 + LFDUX A8, XX, INCX + beq cr0, LL(999) + bdnz LL(122) + .align 4 + +LL(123): + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A4 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A5 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A6 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A7 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A8 + beq cr0, LL(999) + .align 4 + +LL(126): + andi. r0, NN, 4 + beq LL(127) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX + LFDUX A4, XX, INCX + + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A4 + beq cr0, LL(999) + .align 4 + +LL(127): + andi. r0, NN, 2 + beq LL(128) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + beq cr0, LL(999) + .align 4 + +LL(128): + addi RET, RET, 1 + .align 4 + +LL(999): + li r10, 16 + addi SP, SP, -16 + mr r3, RET + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/imax_ppc440.S b/kernel/power/imax_ppc440.S new file mode 100644 index 0000000..b4a6449 --- /dev/null +++ b/kernel/power/imax_ppc440.S @@ -0,0 +1,429 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RET r3 +#define X r4 +#define INCX r5 + +#define N r6 +#define NN r7 +#define XX r8 +#define PRE r9 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(r3) + LDINT INCX, 0(INCX) +#else + mr N, r3 +#endif + + li RET, 0 + li PRE, 3 * 16 * SIZE + + slwi INCX, INCX, BASE_SHIFT + sub X, X, INCX + + mr NN, N + mr XX, X + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFDUX f1, X, INCX + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + subi N, N, 1 + fmr f5, f1 + srawi. r0, N, 4 + fmr f6, f1 + mtspr CTR, r0 + fmr f7, f1 + beq- LL(150) + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + LFDUX f24, X, INCX + fsub f8, f0, f16 + LFDUX f25, X, INCX + fsub f9, f1, f17 + LFDUX f26, X, INCX + fsub f10, f2, f18 + LFDUX f27, X, INCX + fsub f11, f3, f19 + LFDUX f28, X, INCX + fsub f12, f4, f20 + LFDUX f29, X, INCX + fsub f13, f5, f21 + LFDUX f30, X, INCX + fsub f14, f6, f22 + LFDUX f31, X, INCX + fsub f15, f7, f23 + bdz LL(120) + .align 4 + +LL(110): + fsel f0, f8, f0, f16 + LFDUX f16, X, INCX + fsub f8, f0, f24 +#ifdef PPCG4 + dcbt X, PRE +#endif + fsel f1, f9, f1, f17 + LFDUX f17, X, INCX + fsub f9, f1, f25 + fsel f2, f10, f2, f18 + LFDUX f18, X, INCX + fsub f10, f2, f26 + fsel f3, f11, f3, f19 + LFDUX f19, X, INCX + fsub f11, f3, f27 + + fsel f4, f12, f4, f20 + LFDUX f20, X, INCX + fsub f12, f4, f28 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fsel f5, f13, f5, f21 + LFDUX f21, X, INCX + fsub f13, f5, f29 + fsel f6, f14, f6, f22 + LFDUX f22, X, INCX + fsub f14, f6, f30 + fsel f7, f15, f7, f23 + LFDUX f23, X, INCX + fsub f15, f7, f31 + + fsel f0, f8, f0, f24 + LFDUX f24, X, INCX + fsub f8, f0, f16 +#ifdef PPCG4 + dcbt X, PRE +#endif + fsel f1, f9, f1, f25 + LFDUX f25, X, INCX + fsub f9, f1, f17 + fsel f2, f10, f2, f26 + LFDUX f26, X, INCX + fsub f10, f2, f18 + fsel f3, f11, f3, f27 + LFDUX f27, X, INCX + fsub f11, f3, f19 + + fsel f4, f12, f4, f28 + LFDUX f28, X, INCX + fsub f12, f4, f20 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fsel f5, f13, f5, f29 + LFDUX f29, X, INCX + fsub f13, f5, f21 + fsel f6, f14, f6, f30 + LFDUX f30, X, INCX + fsub f14, f6, f22 + fsel f7, f15, f7, f31 + LFDUX f31, X, INCX + fsub f15, f7, f23 + bdnz LL(110) + .align 4 + +LL(120): + fsel f0, f8, f0, f16 + fsub f8, f0, f24 + fsel f1, f9, f1, f17 + fsub f9, f1, f25 + fsel f2, f10, f2, f18 + fsub f10, f2, f26 + fsel f3, f11, f3, f19 + fsub f11, f3, f27 + + fsel f4, f12, f4, f20 + fsub f12, f4, f28 + fsel f5, f13, f5, f21 + fsub f13, f5, f29 + fsel f6, f14, f6, f22 + fsub f14, f6, f30 + fsel f7, f15, f7, f23 + fsub f15, f7, f31 + + fsel f0, f8, f0, f24 + fsel f1, f9, f1, f25 + fsel f2, f10, f2, f26 + fsel f3, f11, f3, f27 + fsel f4, f12, f4, f28 + fsel f5, f13, f5, f29 + fsel f6, f14, f6, f30 + fsel f7, f15, f7, f31 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f1, f8, f0, f4 + .align 4 + +LL(1000): + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- LL(1150) + + LFDUX f8, XX, INCX + LFDUX f9, XX, INCX + LFDUX f10, XX, INCX + LFDUX f11, XX, INCX + LFDUX f12, XX, INCX + LFDUX f13, XX, INCX + LFDUX f14, XX, INCX + LFDUX f15, XX, INCX + bdz LL(1120) + .align 4 + +LL(1110): + addi RET, RET, 1 + fcmpu cr0, f1, f8 + LFDUX f8, XX, INCX + beq cr0, LL(9999) + +#ifdef PPCG4 + dcbt XX, PRE +#endif + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + LFDUX f9, XX, INCX + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + LFDUX f10, XX, INCX + beq cr0, LL(9999) + +#ifdef PPCG4 + dcbt XX, PRE +#endif + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + LFDUX f11, XX, INCX + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + LFDUX f12, XX, INCX + beq cr0, LL(9999) + +#ifdef PPCG4 + dcbt XX, PRE +#endif + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + LFDUX f13, XX, INCX + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + LFDUX f14, XX, INCX + beq cr0, LL(9999) + +#ifdef PPCG4 + dcbt XX, PRE +#endif + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + LFDUX f15, XX, INCX + beq cr0, LL(9999) + bdnz LL(1110) + .align 4 + +LL(1120): + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + .align 4 + +LL(1150): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1160): + LFDUX f8, XX, INCX + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1160) + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/imin.S b/kernel/power/imin.S new file mode 100644 index 0000000..2dd774d --- /dev/null +++ b/kernel/power/imin.S @@ -0,0 +1,684 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RET r3 +#define X r4 +#define INCX r5 + +#define N r6 +#define NN r7 +#define XX r8 +#define PREA r9 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(r3) + LDINT INCX, 0(INCX) +#else + mr N, r3 +#endif + + li RET, 0 + mr NN, N + mr XX, X + + slwi INCX, INCX, BASE_SHIFT + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + add X, X, INCX + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + fmr f6, f1 + fmr f7, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + + LFD f16, 0 * SIZE(X) + LFD f17, 1 * SIZE(X) + LFD f18, 2 * SIZE(X) + LFD f19, 3 * SIZE(X) + LFD f20, 4 * SIZE(X) + LFD f21, 5 * SIZE(X) + LFD f22, 6 * SIZE(X) + LFD f23, 7 * SIZE(X) + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + + fsub f8, f0, f16 + fsub f9, f1, f17 + fsub f10, f2, f18 + fsub f11, f3, f19 + fsub f12, f4, f20 + fsub f13, f5, f21 + fsub f14, f6, f22 + fsub f15, f7, f23 + bdz LL(20) + .align 4 + +LL(10): + fsel f0, f8, f16, f0 + fsub f8, f0, f24 + fsel f1, f9, f17, f1 + fsub f9, f1, f25 + fsel f2, f10, f18, f2 + fsub f10, f2, f26 + fsel f3, f11, f19, f3 + fsub f11, f3, f27 + + LFD f16, 16 * SIZE(X) + LFD f17, 17 * SIZE(X) + LFD f18, 18 * SIZE(X) + LFD f19, 19 * SIZE(X) + + fsel f4, f12, f20, f4 + fsub f12, f4, f28 + fsel f5, f13, f21, f5 + fsub f13, f5, f29 + fsel f6, f14, f22, f6 + fsub f14, f6, f30 + fsel f7, f15, f23, f7 + fsub f15, f7, f31 + + LFD f20, 20 * SIZE(X) + LFD f21, 21 * SIZE(X) + LFD f22, 22 * SIZE(X) + LFD f23, 23 * SIZE(X) + + fsel f0, f8, f24, f0 + fsub f8, f0, f16 + fsel f1, f9, f25, f1 + fsub f9, f1, f17 + fsel f2, f10, f26, f2 + fsub f10, f2, f18 + fsel f3, f11, f27, f3 + fsub f11, f3, f19 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fsel f4, f12, f28, f4 + fsub f12, f4, f20 + fsel f5, f13, f29, f5 + fsub f13, f5, f21 + fsel f6, f14, f30, f6 + fsub f14, f6, f22 + fsel f7, f15, f31, f7 + fsub f15, f7, f23 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + fsel f0, f8, f16, f0 + fsub f8, f0, f24 + fsel f1, f9, f17, f1 + fsub f9, f1, f25 + fsel f2, f10, f18, f2 + fsub f10, f2, f26 + fsel f3, f11, f19, f3 + fsub f11, f3, f27 + + fsel f4, f12, f20, f4 + fsub f12, f4, f28 + fsel f5, f13, f21, f5 + fsub f13, f5, f29 + fsel f6, f14, f22, f6 + fsub f14, f6, f30 + fsel f7, f15, f23, f7 + fsub f15, f7, f31 + + fsel f0, f8, f24, f0 + fsel f1, f9, f25, f1 + fsel f2, f10, f26, f2 + fsel f3, f11, f27, f3 + fsel f4, f12, f28, f4 + fsel f5, f13, f29, f5 + fsel f6, f14, f30, f6 + fsel f7, f15, f31, f7 + + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + addi X, X, 1 * SIZE + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fsub f8, f0, f16 + fsub f9, f1, f17 + fsub f10, f2, f18 + fsub f11, f3, f19 + fsub f12, f4, f20 + fsub f13, f5, f21 + fsub f14, f6, f22 + fsub f15, f7, f23 + bdz LL(120) + .align 4 + +LL(110): + fsel f0, f8, f16, f0 + fsub f8, f0, f24 + fsel f1, f9, f17, f1 + fsub f9, f1, f25 + fsel f2, f10, f18, f2 + fsub f10, f2, f26 + fsel f3, f11, f19, f3 + fsub f11, f3, f27 + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + + fsel f4, f12, f20, f4 + fsub f12, f4, f28 + fsel f5, f13, f21, f5 + fsub f13, f5, f29 + fsel f6, f14, f22, f6 + fsub f14, f6, f30 + fsel f7, f15, f23, f7 + fsub f15, f7, f31 + + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + fsel f0, f8, f24, f0 + fsub f8, f0, f16 + fsel f1, f9, f25, f1 + fsub f9, f1, f17 + fsel f2, f10, f26, f2 + fsub f10, f2, f18 + fsel f3, f11, f27, f3 + fsub f11, f3, f19 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fsel f4, f12, f28, f4 + fsub f12, f4, f20 + fsel f5, f13, f29, f5 + fsub f13, f5, f21 + fsel f6, f14, f30, f6 + fsub f14, f6, f22 + fsel f7, f15, f31, f7 + fsub f15, f7, f23 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + fsel f0, f8, f16, f0 + fsub f8, f0, f24 + fsel f1, f9, f17, f1 + fsub f9, f1, f25 + fsel f2, f10, f18, f2 + fsub f10, f2, f26 + fsel f3, f11, f19, f3 + fsub f11, f3, f27 + + fsel f4, f12, f20, f4 + fsub f12, f4, f28 + fsel f5, f13, f21, f5 + fsub f13, f5, f29 + fsel f6, f14, f22, f6 + fsub f14, f6, f30 + fsel f7, f15, f23, f7 + fsub f15, f7, f31 + + fsel f0, f8, f24, f0 + fsel f1, f9, f25, f1 + fsel f2, f10, f26, f2 + fsel f3, f11, f27, f3 + fsel f4, f12, f28, f4 + fsel f5, f13, f29, f5 + fsel f6, f14, f30, f6 + fsel f7, f15, f31, f7 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f1, f0 + fsel f2, f9, f3, f2 + fsel f4, f10, f5, f4 + fsel f6, f11, f7, f6 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f2, f0 + fsel f4, f9, f6, f4 + + fsub f8, f0, f4 + fsel f1, f8, f4, f0 + .align 4 + +LL(1000): + cmpwi cr0, INCX, SIZE + bne- cr0, LL(1100) + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- cr0, LL(1050) + + LFD f8, 0 * SIZE(XX) + LFD f9, 1 * SIZE(XX) + LFD f10, 2 * SIZE(XX) + LFD f11, 3 * SIZE(XX) + LFD f12, 4 * SIZE(XX) + LFD f13, 5 * SIZE(XX) + LFD f14, 6 * SIZE(XX) + LFD f15, 7 * SIZE(XX) + bdz LL(1020) + .align 4 + +LL(1010): + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + LFD f8, 8 * SIZE(XX) + LFD f9, 9 * SIZE(XX) + LFD f10, 10 * SIZE(XX) + LFD f11, 11 * SIZE(XX) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + + LFD f12, 12 * SIZE(XX) + LFD f13, 13 * SIZE(XX) + LFD f14, 14 * SIZE(XX) + LFD f15, 15 * SIZE(XX) + + addi XX, XX, 8 * SIZE + bdnz LL(1010) + .align 4 + +LL(1020): + addi XX, XX, 8 * SIZE + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + .align 4 + +LL(1050): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1060): + LFD f8, 0 * SIZE(XX) + addi XX, XX, 1 * SIZE + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1060) + b LL(9999) + .align 4 + +LL(1100): + sub XX, XX, INCX + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- LL(1150) + + LFDUX f8, XX, INCX + LFDUX f9, XX, INCX + LFDUX f10, XX, INCX + LFDUX f11, XX, INCX + LFDUX f12, XX, INCX + LFDUX f13, XX, INCX + LFDUX f14, XX, INCX + LFDUX f15, XX, INCX + bdz LL(1120) + .align 4 + +LL(1110): + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + LFDUX f8, XX, INCX + LFDUX f9, XX, INCX + LFDUX f10, XX, INCX + LFDUX f11, XX, INCX + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + + LFDUX f12, XX, INCX + LFDUX f13, XX, INCX + LFDUX f14, XX, INCX + LFDUX f15, XX, INCX + + bdnz LL(1110) + .align 4 + +LL(1120): + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + .align 4 + +LL(1150): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1160): + LFDUX f8, XX, INCX + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1160) + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/imin_hummer.S b/kernel/power/imin_hummer.S new file mode 100644 index 0000000..d333329 --- /dev/null +++ b/kernel/power/imin_hummer.S @@ -0,0 +1,867 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define XX r8 +#define RET r9 +#define NN r10 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 + +#define F1 f12 +#define F2 f13 +#define F3 f14 +#define F4 f15 +#define F5 f16 +#define F6 f17 +#define F7 f18 +#define F8 f19 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + li RET, 0 + cmpwi cr0, N, 0 + ble LL(999) + mr NN, N + cmpwi cr0, INCX, 0 + ble LL(999) + + LFD C1, 0 * SIZE(X) + + addi N, N, -1 + cmpwi cr0, N, 0 + li RET, 1 + ble LL(999) + + fsmfp C1, C1 + mr XX, X + fpmr C2, C1 + add X, X, INCX + fpmr C3, C1 + fpmr C4, C1 + + cmpwi cr0, INCX, SIZE + bne LL(100) + + andi. r0, X, 2 * SIZE - 1 + beq LL(05) + + LFD C2, 0 * SIZE(X) + add X, X, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + ble LL(20) + .align 4 + +LL(05): + sub X, X, INCX2 + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + bdz LL(13) + .align 4 + +LL(12): + fpsub F1, A1, C1 + fpsub F2, A2, C2 + fpsub F3, A3, C3 + fpsub F4, A4, C4 + + fpsel C1, F1, C1, A1 + LFPDUX A1, X, INCX2 + fpsel C2, F2, C2, A2 + LFPDUX A2, X, INCX2 + fpsel C3, F3, C3, A3 + LFPDUX A3, X, INCX2 + fpsel C4, F4, C4, A4 + LFPDUX A4, X, INCX2 + + fpsub F5, A5, C1 + fpsub F6, A6, C2 + fpsub F7, A7, C3 + fpsub F8, A8, C4 + + fpsel C1, F5, C1, A5 + LFPDUX A5, X, INCX2 + fpsel C2, F6, C2, A6 + LFPDUX A6, X, INCX2 + fpsel C3, F7, C3, A7 + LFPDUX A7, X, INCX2 + fpsel C4, F8, C4, A8 + LFPDUX A8, X, INCX2 + + bdnz LL(12) + .align 4 + +LL(13): + fpsub F1, A1, C1 + fpsub F2, A2, C2 + fpsub F3, A3, C3 + fpsub F4, A4, C4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + + fpsub F5, A5, C1 + fpsub F6, A6, C2 + fpsub F7, A7, C3 + fpsub F8, A8, C4 + + fpsel C1, F5, C1, A5 + fpsel C2, F6, C2, A6 + fpsel C3, F7, C3, A7 + fpsel C4, F8, C4, A8 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(20) + + andi. r0, N, 8 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpsub F1, A1, C1 + fpsub F2, A2, C2 + fpsub F3, A3, C3 + fpsub F4, A4, C4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + fpsub F1, A1, C1 + fpsub F2, A2, C2 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFPDUX A1, X, INCX2 + fpsub F1, A1, C1 + fpsel C1, F1, C1, A1 + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(20) + + LFDUX A1, X, INCX2 + fsub F1, A1, C1 + fsel C1, F1, C1, A1 + .align 4 + +LL(20): + fpsub F1, C2, C1 + fpsub F2, C4, C3 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C3, C1 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C2, C1 + fsel C1, F1, C1, C2 + li RET, 0 + + fsmfp C1, C1 + andi. r0, XX, 2 * SIZE - 1 + beq LL(21) + + LFD A1, 0 * SIZE(XX) + add XX, XX, INCX + + addi NN, NN, -1 + addi RET, RET, 1 + + fcmpu cr0, C1, A1 + beq cr0, LL(999) + .align 4 + +LL(21): + sub XX, XX, INCX2 + + srawi. r0, NN, 4 + mtspr CTR, r0 + beq- LL(25) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + LFPDUX A3, XX, INCX2 + LFPDUX A4, XX, INCX2 + + LFPDUX A5, XX, INCX2 + LFPDUX A6, XX, INCX2 + LFPDUX A7, XX, INCX2 + LFPDUX A8, XX, INCX2 + bdz LL(23) + .align 4 + +LL(22): + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A1 + LFPDUX A1, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A2 + LFPDUX A2, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A3 + LFPDUX A3, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A4 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A4 + LFPDUX A4, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A5 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A5 + LFPDUX A5, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A6 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A6 + LFPDUX A6, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A7 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A7 + LFPDUX A7, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A8 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A8 + LFPDUX A8, XX, INCX2 + beq cr0, LL(999) + bdnz LL(22) + .align 4 + +LL(23): + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A4 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A4 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A5 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A5 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A6 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A6 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A7 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A7 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A8 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A8 + beq cr0, LL(999) + .align 4 + +LL(25): + andi. r0, NN, 8 + beq LL(26) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + LFPDUX A3, XX, INCX2 + LFPDUX A4, XX, INCX2 + + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A4 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A4 + beq cr0, LL(999) + .align 4 + +LL(26): + andi. r0, NN, 4 + beq LL(27) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A2 + beq cr0, LL(999) + .align 4 + +LL(27): + andi. r0, NN, 2 + beq LL(28) + + LFPDUX A1, XX, INCX2 + + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A1 + beq cr0, LL(999) + .align 4 + +LL(28): + addi RET, RET, 1 + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + LFDUX A5, X, INCX + LFDUX A6, X, INCX + LFDUX A7, X, INCX + LFDUX A8, X, INCX + bdz LL(103) + .align 4 + +LL(102): + fpsub F1, A1, C1 + LFSDUX A5, X, INCX + fpsub F2, A2, C2 + LFSDUX A6, X, INCX + fpsub F3, A3, C3 + LFSDUX A7, X, INCX + fpsub F4, A4, C4 + LFSDUX A8, X, INCX + + fpsel C1, F1, C1, A1 + LFDUX A1, X, INCX + fpsel C2, F2, C2, A2 + LFDUX A2, X, INCX + fpsel C3, F3, C3, A3 + LFDUX A3, X, INCX + fpsel C4, F4, C4, A4 + LFDUX A4, X, INCX + + fpsub F5, A5, C1 + LFSDUX A1, X, INCX + fpsub F6, A6, C2 + LFSDUX A2, X, INCX + fpsub F7, A7, C3 + LFSDUX A3, X, INCX + fpsub F8, A8, C4 + LFSDUX A4, X, INCX + + fpsel C1, F5, C1, A5 + LFDUX A5, X, INCX + fpsel C2, F6, C2, A6 + LFDUX A6, X, INCX + fpsel C3, F7, C3, A7 + LFDUX A7, X, INCX + fpsel C4, F8, C4, A8 + LFDUX A8, X, INCX + bdnz LL(102) + .align 4 + +LL(103): + fpsub F1, A1, C1 + LFSDUX A5, X, INCX + fpsub F2, A2, C2 + LFSDUX A6, X, INCX + fpsub F3, A3, C3 + LFSDUX A7, X, INCX + fpsub F4, A4, C4 + LFSDUX A8, X, INCX + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + + fpsub F5, A5, C1 + fpsub F6, A6, C2 + fpsub F7, A7, C3 + fpsub F8, A8, C4 + + fpsel C1, F5, C1, A5 + fpsel C2, F6, C2, A6 + fpsel C3, F7, C3, A7 + fpsel C4, F8, C4, A8 + .align 4 + +LL(105): + andi. r0, N, 15 + beq LL(120) + + andi. r0, N, 8 + beq LL(106) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + fpsub F1, A1, C1 + fpsub F2, A2, C2 + fpsub F3, A3, C3 + fpsub F4, A4, C4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(106): + andi. r0, N, 4 + beq LL(107) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + + fsub F1, A1, C1 + fsub F2, A2, C2 + fsub F3, A3, C3 + fsub F4, A4, C4 + + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + fsel C3, F3, C3, A3 + fsel C4, F4, C4, A4 + .align 4 + +LL(107): + andi. r0, N, 2 + beq LL(108) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + + fsub F1, A1, C1 + fsub F2, A2, C2 + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + .align 4 + +LL(108): + andi. r0, N, 1 + beq LL(120) + + LFDUX A1, X, INCX + fsub F1, A1, C1 + fsel C1, F1, C1, A1 + .align 4 + +LL(120): + fpsub F1, C2, C1 + fpsub F2, C4, C3 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C3, C1 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C2, C1 + fsel C1, F1, C1, C2 + + li RET, 0 + + sub XX, XX, INCX + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- LL(126) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX + LFDUX A4, XX, INCX + + LFDUX A5, XX, INCX + LFDUX A6, XX, INCX + LFDUX A7, XX, INCX + LFDUX A8, XX, INCX + bdz LL(123) + .align 4 + +LL(122): + addi RET, RET, 1 + fcmpu cr0, C1, A1 + LFDUX A1, XX, INCX + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + LFDUX A2, XX, INCX + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + LFDUX A3, XX, INCX + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A4 + LFDUX A4, XX, INCX + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A5 + LFDUX A5, XX, INCX + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A6 + LFDUX A6, XX, INCX + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A7 + LFDUX A7, XX, INCX + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A8 + LFDUX A8, XX, INCX + beq cr0, LL(999) + bdnz LL(122) + .align 4 + +LL(123): + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A4 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A5 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A6 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A7 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A8 + beq cr0, LL(999) + .align 4 + +LL(126): + andi. r0, NN, 4 + beq LL(127) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX + LFDUX A4, XX, INCX + + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A4 + beq cr0, LL(999) + .align 4 + +LL(127): + andi. r0, NN, 2 + beq LL(128) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + beq cr0, LL(999) + .align 4 + +LL(128): + addi RET, RET, 1 + .align 4 + +LL(999): + li r10, 16 + addi SP, SP, -16 + mr r3, RET + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/imin_ppc440.S b/kernel/power/imin_ppc440.S new file mode 100644 index 0000000..4e1185d --- /dev/null +++ b/kernel/power/imin_ppc440.S @@ -0,0 +1,414 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RET r3 +#define X r4 +#define INCX r5 + +#define N r6 +#define NN r7 +#define XX r8 +#define PRE r9 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(r3) + LDINT INCX, 0(INCX) +#else + mr N, r3 +#endif + + li RET, 0 + li PRE, 3 * 16 * SIZE + + slwi INCX, INCX, BASE_SHIFT + sub X, X, INCX + + mr NN, N + mr XX, X + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFDUX f1, X, INCX + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + subi N, N, 1 + fmr f5, f1 + srawi. r0, N, 4 + fmr f6, f1 + mtspr CTR, r0 + fmr f7, f1 + beq- LL(150) + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + LFDUX f24, X, INCX + fsub f8, f0, f16 + LFDUX f25, X, INCX + fsub f9, f1, f17 + LFDUX f26, X, INCX + fsub f10, f2, f18 + LFDUX f27, X, INCX + fsub f11, f3, f19 + LFDUX f28, X, INCX + fsub f12, f4, f20 + LFDUX f29, X, INCX + fsub f13, f5, f21 + LFDUX f30, X, INCX + fsub f14, f6, f22 + LFDUX f31, X, INCX + fsub f15, f7, f23 + bdz LL(120) + .align 4 + +LL(110): + fsel f0, f8, f16, f0 + LFDUX f16, X, INCX + fsub f8, f0, f24 +#ifdef PPCG4 + dcbt X, PRE +#endif + fsel f1, f9, f17, f1 + LFDUX f17, X, INCX + fsub f9, f1, f25 + fsel f2, f10, f18, f2 + LFDUX f18, X, INCX + fsub f10, f2, f26 + fsel f3, f11, f19, f3 + LFDUX f19, X, INCX + fsub f11, f3, f27 + + fsel f4, f12, f20, f4 + LFDUX f20, X, INCX + fsub f12, f4, f28 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fsel f5, f13, f21, f5 + LFDUX f21, X, INCX + fsub f13, f5, f29 + fsel f6, f14, f22, f6 + LFDUX f22, X, INCX + fsub f14, f6, f30 + fsel f7, f15, f23, f7 + LFDUX f23, X, INCX + fsub f15, f7, f31 + + fsel f0, f8, f24, f0 + LFDUX f24, X, INCX + fsub f8, f0, f16 +#ifdef PPCG4 + dcbt X, PRE +#endif + fsel f1, f9, f25, f1 + LFDUX f25, X, INCX + fsub f9, f1, f17 + fsel f2, f10, f26, f2 + LFDUX f26, X, INCX + fsub f10, f2, f18 + fsel f3, f11, f27, f3 + LFDUX f27, X, INCX + fsub f11, f3, f19 + + fsel f4, f12, f28, f4 + LFDUX f28, X, INCX + fsub f12, f4, f20 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fsel f5, f13, f29, f5 + LFDUX f29, X, INCX + fsub f13, f5, f21 + fsel f6, f14, f30, f6 + LFDUX f30, X, INCX + fsub f14, f6, f22 + fsel f7, f15, f31, f7 + LFDUX f31, X, INCX + fsub f15, f7, f23 + bdnz LL(110) + .align 4 + +LL(120): + fsel f0, f8, f16, f0 + fsub f8, f0, f24 + fsel f1, f9, f17, f1 + fsub f9, f1, f25 + fsel f2, f10, f18, f2 + fsub f10, f2, f26 + fsel f3, f11, f19, f3 + fsub f11, f3, f27 + + fsel f4, f12, f20, f4 + fsub f12, f4, f28 + fsel f5, f13, f21, f5 + fsub f13, f5, f29 + fsel f6, f14, f22, f6 + fsub f14, f6, f30 + fsel f7, f15, f23, f7 + fsub f15, f7, f31 + + fsel f0, f8, f24, f0 + fsel f1, f9, f25, f1 + fsel f2, f10, f26, f2 + fsel f3, f11, f27, f3 + fsel f4, f12, f28, f4 + fsel f5, f13, f29, f5 + fsel f6, f14, f30, f6 + fsel f7, f15, f31, f7 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f1, f0 + fsel f2, f9, f3, f2 + fsel f4, f10, f5, f4 + fsel f6, f11, f7, f6 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f2, f0 + fsel f4, f9, f6, f4 + + fsub f8, f0, f4 + fsel f1, f8, f4, f0 + .align 4 + +LL(1000): + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- LL(1150) + + LFDUX f8, XX, INCX + LFDUX f9, XX, INCX + LFDUX f10, XX, INCX + LFDUX f11, XX, INCX + LFDUX f12, XX, INCX + LFDUX f13, XX, INCX + LFDUX f14, XX, INCX + LFDUX f15, XX, INCX + bdz LL(1120) + .align 4 + +LL(1110): + addi RET, RET, 1 + fcmpu cr0, f1, f8 + LFDUX f8, XX, INCX + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + LFDUX f9, XX, INCX + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + LFDUX f10, XX, INCX + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + LFDUX f11, XX, INCX + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + LFDUX f12, XX, INCX + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + LFDUX f13, XX, INCX + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + LFDUX f14, XX, INCX + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + LFDUX f15, XX, INCX + beq cr0, LL(9999) + + bdnz LL(1110) + .align 4 + +LL(1120): + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + .align 4 + +LL(1150): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1160): + LFDUX f8, XX, INCX + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1160) + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/izamax.S b/kernel/power/izamax.S new file mode 100644 index 0000000..4851047 --- /dev/null +++ b/kernel/power/izamax.S @@ -0,0 +1,919 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RET r3 +#define X r4 +#define INCX r5 + +#define N r6 +#define NN r7 +#define XX r8 +#define PREA r9 +#define INCXM1 r10 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(r3) + LDINT INCX, 0(INCX) +#else + mr N, r3 +#endif + + li RET, 0 + mr NN, N + mr XX, X + + slwi INCX, INCX, ZBASE_SHIFT + subi INCXM1, INCX, SIZE + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + LFD f2, 1 * SIZE(X) + add X, X, INCX + + fabs f1, f1 + fabs f2, f2 + fadd f1, f1, f2 + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(100) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f24, 0 * SIZE(X) + LFD f25, 1 * SIZE(X) + LFD f26, 2 * SIZE(X) + LFD f27, 3 * SIZE(X) + LFD f28, 4 * SIZE(X) + LFD f29, 5 * SIZE(X) + LFD f30, 6 * SIZE(X) + LFD f31, 7 * SIZE(X) + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + bdz LL(20) + .align 4 + +LL(10): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 16 * SIZE(X) + LFD f25, 17 * SIZE(X) + LFD f26, 18 * SIZE(X) + LFD f27, 19 * SIZE(X) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 20 * SIZE(X) + LFD f29, 21 * SIZE(X) + LFD f30, 22 * SIZE(X) + LFD f31, 23 * SIZE(X) + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fsel f0, f16, f0, f4 + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + bdnz LL(10) + .align 4 + +LL(20): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsel f0, f16, f0, f4 + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + addi X, X, 2 * SIZE + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCXM1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + bdz LL(120) + .align 4 + +LL(110): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fsel f0, f16, f0, f4 + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + bdnz LL(110) + .align 4 + +LL(120): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsel f0, f16, f0, f4 + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsub f8, f0, f2 + fsel f1, f8, f0, f2 + .align 4 + + +LL(1000): + cmpwi cr0, INCX, SIZE * 2 + bne- cr0, LL(1100) + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- cr0, LL(1050) + + LFD f24, 0 * SIZE(XX) + LFD f25, 1 * SIZE(XX) + LFD f26, 2 * SIZE(XX) + LFD f27, 3 * SIZE(XX) + LFD f28, 4 * SIZE(XX) + LFD f29, 5 * SIZE(XX) + LFD f30, 6 * SIZE(XX) + LFD f31, 7 * SIZE(XX) + bdz LL(1020) + .align 4 + +LL(1010): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 8 * SIZE(XX) + LFD f25, 9 * SIZE(XX) + LFD f26, 10 * SIZE(XX) + LFD f27, 11 * SIZE(XX) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 12 * SIZE(XX) + LFD f29, 13 * SIZE(XX) + LFD f30, 14 * SIZE(XX) + LFD f31, 15 * SIZE(XX) + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 16 * SIZE(XX) + LFD f25, 17 * SIZE(XX) + LFD f26, 18 * SIZE(XX) + LFD f27, 19 * SIZE(XX) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 20 * SIZE(XX) + LFD f29, 21 * SIZE(XX) + LFD f30, 22 * SIZE(XX) + LFD f31, 23 * SIZE(XX) + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + addi XX, XX, 16 * SIZE + bdnz LL(1010) + .align 4 + +LL(1020): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 8 * SIZE(XX) + LFD f25, 9 * SIZE(XX) + LFD f26, 10 * SIZE(XX) + LFD f27, 11 * SIZE(XX) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 12 * SIZE(XX) + LFD f29, 13 * SIZE(XX) + LFD f30, 14 * SIZE(XX) + LFD f31, 15 * SIZE(XX) + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + addi XX, XX, 16 * SIZE + .align 4 + +LL(1050): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1060): + LFD f8, 0 * SIZE(XX) + LFD f9, 1 * SIZE(XX) + addi XX, XX, 2 * SIZE + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1060) + b LL(9999) + .align 4 + +LL(1100): + sub XX, XX, INCXM1 + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- LL(1150) + + LFDX f24, XX, INCXM1 + LFDUX f25, XX, INCX + LFDX f26, XX, INCXM1 + LFDUX f27, XX, INCX + LFDX f28, XX, INCXM1 + LFDUX f29, XX, INCX + LFDX f30, XX, INCXM1 + LFDUX f31, XX, INCX + bdz LL(1120) + .align 4 + +LL(1110): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, XX, INCXM1 + LFDUX f25, XX, INCX + LFDX f26, XX, INCXM1 + LFDUX f27, XX, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, XX, INCXM1 + LFDUX f29, XX, INCX + LFDX f30, XX, INCXM1 + LFDUX f31, XX, INCX + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, XX, INCXM1 + LFDUX f25, XX, INCX + LFDX f26, XX, INCXM1 + LFDUX f27, XX, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, XX, INCXM1 + LFDUX f29, XX, INCX + LFDX f30, XX, INCXM1 + LFDUX f31, XX, INCX + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + bdnz LL(1110) + .align 4 + +LL(1120): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, XX, INCXM1 + LFDUX f25, XX, INCX + LFDX f26, XX, INCXM1 + LFDUX f27, XX, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, XX, INCXM1 + LFDUX f29, XX, INCX + LFDX f30, XX, INCXM1 + LFDUX f31, XX, INCX + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + .align 4 + +LL(1150): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1160): + LFDX f8, XX, INCXM1 + LFDUX f9, XX, INCX + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1160) + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/izamax_hummer.S b/kernel/power/izamax_hummer.S new file mode 100644 index 0000000..8dffa0c --- /dev/null +++ b/kernel/power/izamax_hummer.S @@ -0,0 +1,566 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define XX r8 +#define RET r9 +#define NN r10 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 + +#define F1 f12 +#define F2 f13 +#define F3 f14 +#define F4 f15 + +#define T1 f16 +#define T2 f17 +#define T3 f18 +#define T4 f19 + +#define B1 f20 +#define B2 f21 +#define B3 f22 +#define B4 f23 +#define B5 f24 +#define B6 f25 +#define B7 f26 +#define B8 f27 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + stfpdux f24, SP, r10 + stfpdux f25, SP, r10 + stfpdux f26, SP, r10 + stfpdux f27, SP, r10 + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + li RET, 0 + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, INCX, 0 + mr NN, N + ble LL(999) + + mr XX, X + + LFD A1, 0 * SIZE(X) + LFD A2, 1 * SIZE(X) + add X, X, INCX2 + li RET, 1 + + fabs A1, A1 + fabs A2, A2 + + subi INCX2, INCX2, SIZE + + addi N, N, -1 + cmpwi cr0, N, 0 + fadd C1, A1, A2 + ble LL(999) + + fsmfp C1, C1 + li INCX, SIZE + fpmr C2, C1 + sub X, X, INCX2 + fpmr C3, C1 + srawi. r0, N, 3 + fpmr C4, C1 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + LFDUX A3, X, INCX2 + LFDUX A4, X, INCX + + LFSDUX A1, X, INCX2 + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX2 + LFSDUX A4, X, INCX + + LFDUX A5, X, INCX2 + LFDUX A6, X, INCX + LFDUX A7, X, INCX2 + LFDUX A8, X, INCX + + LFSDUX A5, X, INCX2 + LFSDUX A6, X, INCX + LFSDUX A7, X, INCX2 + LFSDUX A8, X, INCX + bdz LL(103) + .align 4 + +LL(102): + fpabs B1, A1 + LFDUX A1, X, INCX2 + fpabs B2, A2 + LFDUX A2, X, INCX + fpabs B3, A3 + LFDUX A3, X, INCX2 + fpabs B4, A4 + LFDUX A4, X, INCX + + fpabs B5, A5 + LFSDUX A1, X, INCX2 + fpabs B6, A6 + LFSDUX A2, X, INCX + fpabs B7, A7 + LFSDUX A3, X, INCX2 + fpabs B8, A8 + LFSDUX A4, X, INCX + + fpadd T1, B1, B2 + LFDUX A5, X, INCX2 + fpadd T2, B3, B4 + LFDUX A6, X, INCX + fpadd T3, B5, B6 + LFDUX A7, X, INCX2 + fpadd T4, B7, B8 + LFDUX A8, X, INCX + + fpsub F1, C1, T1 + LFSDUX A5, X, INCX2 + fpsub F2, C2, T2 + LFSDUX A6, X, INCX + fpsub F3, C3, T3 + LFSDUX A7, X, INCX2 + fpsub F4, C4, T4 + LFSDUX A8, X, INCX + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + bdnz LL(102) + .align 4 + +LL(103): + fpabs B1, A1 + fpabs B2, A2 + fpabs B3, A3 + fpabs B4, A4 + + fpabs B5, A5 + fpabs B6, A6 + fpabs B7, A7 + fpabs B8, A8 + + fpadd T1, B1, B2 + fpadd T2, B3, B4 + fpadd T3, B5, B6 + fpadd T4, B7, B8 + + fpsub F1, C1, T1 + fpsub F2, C2, T2 + fpsub F3, C3, T3 + fpsub F4, C4, T4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + .align 4 + +LL(105): + andi. r0, N, 7 + beq LL(120) + + andi. r0, N, 4 + beq LL(106) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + LFDUX A3, X, INCX2 + LFDUX A4, X, INCX + + LFSDUX A1, X, INCX2 + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX2 + LFSDUX A4, X, INCX + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpadd A1, A1, A2 + fpadd A3, A3, A4 + + fpsub F1, C1, A1 + fpsub F2, C2, A3 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A3 + .align 4 + +LL(106): + andi. r0, N, 2 + beq LL(107) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + LFSDUX A1, X, INCX2 + LFSDUX A2, X, INCX + + fpabs A1, A1 + fpabs A2, A2 + + fpadd A1, A1, A2 + + fpsub F1, C1, A1 + fpsel C1, F1, C1, A1 + .align 4 + +LL(107): + andi. r0, N, 1 + beq LL(120) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + + fabs A1, A1 + fabs A2, A2 + + fadd A1, A1, A2 + + fsub F1, C1, A1 + fsel C1, F1, C1, A1 + .align 4 + +LL(120): + fpsub F1, C1, C2 + fpsub F2, C3, C4 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C1, C3 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + li RET, 0 + fsub F1, C1, C2 + fsel C1, F1, C1, C2 + + fsmfp C1, C1 + + sub XX, XX, INCX2 + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- LL(125) + + LFDUX A1, XX, INCX2 + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX2 + LFDUX A4, XX, INCX + + LFSDUX A1, XX, INCX2 + LFSDUX A2, XX, INCX + LFSDUX A3, XX, INCX2 + LFSDUX A4, XX, INCX + + LFDUX A5, XX, INCX2 + LFDUX A6, XX, INCX + LFDUX A7, XX, INCX2 + LFDUX A8, XX, INCX + + LFSDUX A5, XX, INCX2 + LFSDUX A6, XX, INCX + LFSDUX A7, XX, INCX2 + LFSDUX A8, XX, INCX + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpadd B1, T1, T2 + fpadd B2, T3, T4 + + bdz LL(123) + .align 4 + +LL(122): + LFDUX A1, XX, INCX2 + fpabs T1, A5 + addi RET, RET, 1 + fcmpu cr0, C1, B1 + LFDUX A2, XX, INCX + beq cr0, LL(999) + + LFDUX A3, XX, INCX2 + fpabs T2, A6 + addi RET, RET, 1 + fcmpu cr0, C1, B2 + LFDUX A4, XX, INCX + beq cr0, LL(999) + + LFSDUX A1, XX, INCX2 + fpabs T3, A7 + addi RET, RET, 1 + fscmp cr0, C1, B1 + LFSDUX A2, XX, INCX + beq cr0, LL(999) + + LFSDUX A3, XX, INCX2 + fpabs T4, A8 + addi RET, RET, 1 + fscmp cr0, C1, B2 + LFSDUX A4, XX, INCX + beq cr0, LL(999) + + fpadd B3, T1, T2 + fpadd B4, T3, T4 + + LFDUX A5, XX, INCX2 + fpabs T1, A1 + addi RET, RET, 1 + fcmpu cr0, C1, B3 + LFDUX A6, XX, INCX + beq cr0, LL(999) + + LFDUX A7, XX, INCX2 + fpabs T2, A2 + addi RET, RET, 1 + fcmpu cr0, C1, B4 + LFDUX A8, XX, INCX + beq cr0, LL(999) + + LFSDUX A5, XX, INCX2 + fpabs T3, A3 + addi RET, RET, 1 + fscmp cr0, C1, B3 + LFSDUX A6, XX, INCX + beq cr0, LL(999) + + LFSDUX A7, XX, INCX2 + fpabs T4, A4 + addi RET, RET, 1 + fscmp cr0, C1, B4 + LFSDUX A8, XX, INCX + beq cr0, LL(999) + + fpadd B1, T1, T2 + fpadd B2, T3, T4 + bdnz LL(122) + .align 4 + +LL(123): + fpabs T1, A5 + addi RET, RET, 1 + fcmpu cr0, C1, B1 + beq cr0, LL(999) + + fpabs T2, A6 + addi RET, RET, 1 + fcmpu cr0, C1, B2 + beq cr0, LL(999) + + fpabs T3, A7 + addi RET, RET, 1 + fscmp cr0, C1, B1 + beq cr0, LL(999) + + fpabs T4, A8 + addi RET, RET, 1 + fscmp cr0, C1, B2 + beq cr0, LL(999) + + fpadd B3, T1, T2 + fpadd B4, T3, T4 + + addi RET, RET, 1 + fcmpu cr0, C1, B3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, B4 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, B3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, B4 + beq cr0, LL(999) + .align 4 + +LL(125): + andi. r0, NN, 4 + beq LL(126) + + LFDUX A1, XX, INCX2 + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX2 + LFDUX A4, XX, INCX + + LFSDUX A1, XX, INCX2 + LFSDUX A2, XX, INCX + LFSDUX A3, XX, INCX2 + LFSDUX A4, XX, INCX + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpadd A1, A1, A2 + fpadd A3, A3, A4 + + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A3 + beq cr0, LL(999) + .align 4 + +LL(126): + andi. r0, NN, 2 + beq LL(127) + + LFDUX A1, XX, INCX2 + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX2 + LFDUX A4, XX, INCX + + fabs A1, A1 + fabs A2, A2 + fabs A3, A3 + fabs A4, A4 + + fadd A1, A1, A2 + fadd A3, A3, A4 + + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + beq cr0, LL(999) + .align 4 + +LL(127): + addi RET, RET, 1 + .align 4 + +LL(999): + li r10, 16 + addi SP, SP, -16 + mr r3, RET + + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/izamax_ppc440.S b/kernel/power/izamax_ppc440.S new file mode 100644 index 0000000..f80c9ad --- /dev/null +++ b/kernel/power/izamax_ppc440.S @@ -0,0 +1,538 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RET r3 +#define X r4 +#define INCX r5 + +#define N r6 +#define NN r7 +#define XX r8 +#define PRE r9 +#define INC1 r10 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(r3) + LDINT INCX, 0(INCX) +#else + mr N, r3 +#endif + + li RET, 0 + + slwi INCX, INCX, ZBASE_SHIFT + sub X, X, INCX + li INC1, SIZE + li PRE, 3 * 16 * SIZE + + mr NN, N + mr XX, X + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFDUX f1, X, INCX + LFDX f2, X, INC1 + + fabs f1, f1 + fabs f2, f2 + fadd f1, f1, f2 + + subi N, N, 1 + fmr f0, f1 + srawi. r0, N, 3 + fmr f2, f1 + mtspr CTR, r0 + fmr f3, f1 + beq- LL(150) + + LFDUX f24, X, INCX + LFDX f25, X, INC1 + LFDUX f26, X, INCX + LFDX f27, X, INC1 + LFDUX f28, X, INCX + LFDX f29, X, INC1 + LFDUX f30, X, INCX + LFDX f31, X, INC1 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDUX f24, X, INCX + LFDX f25, X, INC1 + LFDUX f26, X, INCX + LFDX f27, X, INC1 + LFDUX f28, X, INCX + LFDX f29, X, INC1 + LFDUX f30, X, INCX + LFDX f31, X, INC1 + + bdz LL(120) + .align 4 + +LL(110): + fadd f4, f8, f9 +#ifdef PPCG4 + dcbt X, PRE +#endif + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDX f25, X, INC1 + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDX f27, X, INC1 + + fabs f12, f28 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fabs f13, f29 + LFDUX f28, X, INCX + fabs f14, f30 + LFDX f29, X, INC1 + fabs f15, f31 + LFDUX f30, X, INCX + + fsub f16, f0, f4 + LFDX f31, X, INC1 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 +#ifdef PPCG4 + dcbt X, PRE +#endif + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDX f25, X, INC1 + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDX f27, X, INC1 + + fsel f0, f16, f0, f4 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fabs f12, f28 + LFDUX f28, X, INCX + fabs f13, f29 + LFDX f29, X, INC1 + fabs f14, f30 + LFDUX f30, X, INCX + fabs f15, f31 + LFDX f31, X, INC1 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + bdnz LL(110) + .align 4 + +LL(120): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsel f0, f16, f0, f4 + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + LFDX f9, X, INC1 + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsub f8, f0, f2 + fsel f1, f8, f0, f2 + .align 4 + +LL(1000): + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- LL(1150) + + LFDUX f24, XX, INCX + LFDX f25, XX, INC1 + LFDUX f26, XX, INCX + LFDX f27, XX, INC1 + LFDUX f28, XX, INCX + LFDX f29, XX, INC1 + LFDUX f30, XX, INCX + LFDX f31, XX, INC1 + bdz LL(1120) + .align 4 + +LL(1110): + fabs f8, f24 + LFDUX f24, XX, INCX + fabs f9, f25 + LFDX f25, XX, INC1 + fabs f10, f26 + LFDUX f26, XX, INCX + fabs f11, f27 + LFDX f27, XX, INC1 + +#ifdef PPCG4 + dcbt XX, PRE +#endif + + fabs f12, f28 + LFDUX f28, XX, INCX + fabs f13, f29 + LFDX f29, XX, INC1 + fabs f14, f30 + LFDUX f30, XX, INCX + fabs f15, f31 + LFDX f31, XX, INC1 + + fadd f4, f8, f9 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + + fabs f8, f24 + LFDUX f24, XX, INCX + fabs f9, f25 + LFDX f25, XX, INC1 + fabs f10, f26 + LFDUX f26, XX, INCX + fabs f11, f27 + LFDX f27, XX, INC1 + +#ifdef PPCG4 + dcbt XX, PRE +#endif + + fabs f12, f28 + LFDUX f28, XX, INCX + fabs f13, f29 + LFDX f29, XX, INC1 + fabs f14, f30 + LFDUX f30, XX, INCX + fabs f15, f31 + LFDX f31, XX, INC1 + + fadd f4, f8, f9 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + bdnz LL(1110) + .align 4 + +LL(1120): + fabs f8, f24 + LFDUX f24, XX, INCX + fabs f9, f25 + LFDX f25, XX, INC1 + fabs f10, f26 + LFDUX f26, XX, INCX + fabs f11, f27 + LFDX f27, XX, INC1 + + fabs f12, f28 + LFDUX f28, XX, INCX + fabs f13, f29 + LFDX f29, XX, INC1 + fabs f14, f30 + LFDUX f30, XX, INCX + fabs f15, f31 + LFDX f31, XX, INC1 + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + .align 4 + +LL(1150): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1160): + LFDUX f8, XX, INCX + LFDX f9, XX, INC1 + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1160) + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/izamin.S b/kernel/power/izamin.S new file mode 100644 index 0000000..17275fc --- /dev/null +++ b/kernel/power/izamin.S @@ -0,0 +1,920 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RET r3 +#define X r4 +#define INCX r5 + +#define N r6 +#define NN r7 +#define XX r8 +#define PREA r9 +#define INCXM1 r10 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(r3) + LDINT INCX, 0(INCX) +#else + mr N, r3 +#endif + + li RET, 0 + mr NN, N + mr XX, X + + slwi INCX, INCX, ZBASE_SHIFT + subi INCXM1, INCX, SIZE + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + LFD f2, 1 * SIZE(X) + add X, X, INCX + + fabs f1, f1 + fabs f2, f2 + fadd f1, f1, f2 + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(100) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f24, 0 * SIZE(X) + LFD f25, 1 * SIZE(X) + LFD f26, 2 * SIZE(X) + LFD f27, 3 * SIZE(X) + LFD f28, 4 * SIZE(X) + LFD f29, 5 * SIZE(X) + LFD f30, 6 * SIZE(X) + LFD f31, 7 * SIZE(X) + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + bdz LL(20) + .align 4 + +LL(10): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 16 * SIZE(X) + LFD f25, 17 * SIZE(X) + LFD f26, 18 * SIZE(X) + LFD f27, 19 * SIZE(X) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 20 * SIZE(X) + LFD f29, 21 * SIZE(X) + LFD f30, 22 * SIZE(X) + LFD f31, 23 * SIZE(X) + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fsel f0, f16, f4, f0 + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsel f0, f16, f4, f0 + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + addi X, X, 2 * SIZE + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCXM1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + bdz LL(120) + .align 4 + +LL(110): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fsel f0, f16, f4, f0 + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + bdnz LL(110) + .align 4 + +LL(120): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsel f0, f16, f4, f0 + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + + fsel f0, f8, f1, f0 + fsel f2, f9, f3, f2 + fsub f8, f0, f2 + fsel f1, f8, f2, f0 + .align 4 + + +LL(1000): + cmpwi cr0, INCX, SIZE * 2 + bne- cr0, LL(1100) + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- cr0, LL(1050) + + LFD f24, 0 * SIZE(XX) + LFD f25, 1 * SIZE(XX) + LFD f26, 2 * SIZE(XX) + LFD f27, 3 * SIZE(XX) + LFD f28, 4 * SIZE(XX) + LFD f29, 5 * SIZE(XX) + LFD f30, 6 * SIZE(XX) + LFD f31, 7 * SIZE(XX) + bdz LL(1020) + .align 4 + +LL(1010): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 8 * SIZE(XX) + LFD f25, 9 * SIZE(XX) + LFD f26, 10 * SIZE(XX) + LFD f27, 11 * SIZE(XX) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 12 * SIZE(XX) + LFD f29, 13 * SIZE(XX) + LFD f30, 14 * SIZE(XX) + LFD f31, 15 * SIZE(XX) + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 16 * SIZE(XX) + LFD f25, 17 * SIZE(XX) + LFD f26, 18 * SIZE(XX) + LFD f27, 19 * SIZE(XX) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 20 * SIZE(XX) + LFD f29, 21 * SIZE(XX) + LFD f30, 22 * SIZE(XX) + LFD f31, 23 * SIZE(XX) + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + addi XX, XX, 16 * SIZE + bdnz LL(1010) + .align 4 + +LL(1020): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 8 * SIZE(XX) + LFD f25, 9 * SIZE(XX) + LFD f26, 10 * SIZE(XX) + LFD f27, 11 * SIZE(XX) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 12 * SIZE(XX) + LFD f29, 13 * SIZE(XX) + LFD f30, 14 * SIZE(XX) + LFD f31, 15 * SIZE(XX) + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + addi XX, XX, 16 * SIZE + .align 4 + +LL(1050): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1060): + LFD f8, 0 * SIZE(XX) + LFD f9, 1 * SIZE(XX) + addi XX, XX, 2 * SIZE + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1060) + b LL(9999) + .align 4 + +LL(1100): + sub XX, XX, INCXM1 + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- LL(1150) + + LFDX f24, XX, INCXM1 + LFDUX f25, XX, INCX + LFDX f26, XX, INCXM1 + LFDUX f27, XX, INCX + LFDX f28, XX, INCXM1 + LFDUX f29, XX, INCX + LFDX f30, XX, INCXM1 + LFDUX f31, XX, INCX + bdz LL(1120) + .align 4 + +LL(1110): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, XX, INCXM1 + LFDUX f25, XX, INCX + LFDX f26, XX, INCXM1 + LFDUX f27, XX, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, XX, INCXM1 + LFDUX f29, XX, INCX + LFDX f30, XX, INCXM1 + LFDUX f31, XX, INCX + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, XX, INCXM1 + LFDUX f25, XX, INCX + LFDX f26, XX, INCXM1 + LFDUX f27, XX, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, XX, INCXM1 + LFDUX f29, XX, INCX + LFDX f30, XX, INCXM1 + LFDUX f31, XX, INCX + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + bdnz LL(1110) + .align 4 + +LL(1120): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, XX, INCXM1 + LFDUX f25, XX, INCX + LFDX f26, XX, INCXM1 + LFDUX f27, XX, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, XX, INCXM1 + LFDUX f29, XX, INCX + LFDX f30, XX, INCXM1 + LFDUX f31, XX, INCX + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + .align 4 + +LL(1150): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1160): + LFDX f8, XX, INCXM1 + LFDUX f9, XX, INCX + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1160) + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/izamin_hummer.S b/kernel/power/izamin_hummer.S new file mode 100644 index 0000000..75145ab --- /dev/null +++ b/kernel/power/izamin_hummer.S @@ -0,0 +1,566 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define XX r8 +#define RET r9 +#define NN r10 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 + +#define F1 f12 +#define F2 f13 +#define F3 f14 +#define F4 f15 + +#define T1 f16 +#define T2 f17 +#define T3 f18 +#define T4 f19 + +#define B1 f20 +#define B2 f21 +#define B3 f22 +#define B4 f23 +#define B5 f24 +#define B6 f25 +#define B7 f26 +#define B8 f27 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + stfpdux f24, SP, r10 + stfpdux f25, SP, r10 + stfpdux f26, SP, r10 + stfpdux f27, SP, r10 + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + li RET, 0 + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, INCX, 0 + mr NN, N + ble LL(999) + + mr XX, X + + LFD A1, 0 * SIZE(X) + LFD A2, 1 * SIZE(X) + add X, X, INCX2 + li RET, 1 + + fabs A1, A1 + fabs A2, A2 + + subi INCX2, INCX2, SIZE + + addi N, N, -1 + cmpwi cr0, N, 0 + fadd C1, A1, A2 + ble LL(999) + + fsmfp C1, C1 + li INCX, SIZE + fpmr C2, C1 + sub X, X, INCX2 + fpmr C3, C1 + srawi. r0, N, 3 + fpmr C4, C1 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + LFDUX A3, X, INCX2 + LFDUX A4, X, INCX + + LFSDUX A1, X, INCX2 + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX2 + LFSDUX A4, X, INCX + + LFDUX A5, X, INCX2 + LFDUX A6, X, INCX + LFDUX A7, X, INCX2 + LFDUX A8, X, INCX + + LFSDUX A5, X, INCX2 + LFSDUX A6, X, INCX + LFSDUX A7, X, INCX2 + LFSDUX A8, X, INCX + bdz LL(103) + .align 4 + +LL(102): + fpabs B1, A1 + LFDUX A1, X, INCX2 + fpabs B2, A2 + LFDUX A2, X, INCX + fpabs B3, A3 + LFDUX A3, X, INCX2 + fpabs B4, A4 + LFDUX A4, X, INCX + + fpabs B5, A5 + LFSDUX A1, X, INCX2 + fpabs B6, A6 + LFSDUX A2, X, INCX + fpabs B7, A7 + LFSDUX A3, X, INCX2 + fpabs B8, A8 + LFSDUX A4, X, INCX + + fpadd T1, B1, B2 + LFDUX A5, X, INCX2 + fpadd T2, B3, B4 + LFDUX A6, X, INCX + fpadd T3, B5, B6 + LFDUX A7, X, INCX2 + fpadd T4, B7, B8 + LFDUX A8, X, INCX + + fpsub F1, T1, C1 + LFSDUX A5, X, INCX2 + fpsub F2, T2, C2 + LFSDUX A6, X, INCX + fpsub F3, T3, C3 + LFSDUX A7, X, INCX2 + fpsub F4, T4, C4 + LFSDUX A8, X, INCX + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + bdnz LL(102) + .align 4 + +LL(103): + fpabs B1, A1 + fpabs B2, A2 + fpabs B3, A3 + fpabs B4, A4 + + fpabs B5, A5 + fpabs B6, A6 + fpabs B7, A7 + fpabs B8, A8 + + fpadd T1, B1, B2 + fpadd T2, B3, B4 + fpadd T3, B5, B6 + fpadd T4, B7, B8 + + fpsub F1, T1, C1 + fpsub F2, T2, C2 + fpsub F3, T3, C3 + fpsub F4, T4, C4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + .align 4 + +LL(105): + andi. r0, N, 7 + beq LL(120) + + andi. r0, N, 4 + beq LL(106) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + LFDUX A3, X, INCX2 + LFDUX A4, X, INCX + + LFSDUX A1, X, INCX2 + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX2 + LFSDUX A4, X, INCX + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpadd A1, A1, A2 + fpadd A3, A3, A4 + + fpsub F1, A1, C1 + fpsub F2, A3, C2 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A3 + .align 4 + +LL(106): + andi. r0, N, 2 + beq LL(107) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + LFSDUX A1, X, INCX2 + LFSDUX A2, X, INCX + + fpabs A1, A1 + fpabs A2, A2 + + fpadd A1, A1, A2 + + fpsub F1, A1, C1 + fpsel C1, F1, C1, A1 + .align 4 + +LL(107): + andi. r0, N, 1 + beq LL(120) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + + fabs A1, A1 + fabs A2, A2 + + fadd A1, A1, A2 + + fsub F1, A1, C1 + fsel C1, F1, C1, A1 + .align 4 + +LL(120): + fpsub F1, C2, C1 + fpsub F2, C4, C3 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C3, C1 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + li RET, 0 + fsub F1, C2, C1 + fsel C1, F1, C1, C2 + + fsmfp C1, C1 + + sub XX, XX, INCX2 + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- LL(125) + + LFDUX A1, XX, INCX2 + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX2 + LFDUX A4, XX, INCX + + LFSDUX A1, XX, INCX2 + LFSDUX A2, XX, INCX + LFSDUX A3, XX, INCX2 + LFSDUX A4, XX, INCX + + LFDUX A5, XX, INCX2 + LFDUX A6, XX, INCX + LFDUX A7, XX, INCX2 + LFDUX A8, XX, INCX + + LFSDUX A5, XX, INCX2 + LFSDUX A6, XX, INCX + LFSDUX A7, XX, INCX2 + LFSDUX A8, XX, INCX + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpadd B1, T1, T2 + fpadd B2, T3, T4 + + bdz LL(123) + .align 4 + +LL(122): + LFDUX A1, XX, INCX2 + fpabs T1, A5 + addi RET, RET, 1 + fcmpu cr0, C1, B1 + LFDUX A2, XX, INCX + beq cr0, LL(999) + + LFDUX A3, XX, INCX2 + fpabs T2, A6 + addi RET, RET, 1 + fcmpu cr0, C1, B2 + LFDUX A4, XX, INCX + beq cr0, LL(999) + + LFSDUX A1, XX, INCX2 + fpabs T3, A7 + addi RET, RET, 1 + fscmp cr0, C1, B1 + LFSDUX A2, XX, INCX + beq cr0, LL(999) + + LFSDUX A3, XX, INCX2 + fpabs T4, A8 + addi RET, RET, 1 + fscmp cr0, C1, B2 + LFSDUX A4, XX, INCX + beq cr0, LL(999) + + fpadd B3, T1, T2 + fpadd B4, T3, T4 + + LFDUX A5, XX, INCX2 + fpabs T1, A1 + addi RET, RET, 1 + fcmpu cr0, C1, B3 + LFDUX A6, XX, INCX + beq cr0, LL(999) + + LFDUX A7, XX, INCX2 + fpabs T2, A2 + addi RET, RET, 1 + fcmpu cr0, C1, B4 + LFDUX A8, XX, INCX + beq cr0, LL(999) + + LFSDUX A5, XX, INCX2 + fpabs T3, A3 + addi RET, RET, 1 + fscmp cr0, C1, B3 + LFSDUX A6, XX, INCX + beq cr0, LL(999) + + LFSDUX A7, XX, INCX2 + fpabs T4, A4 + addi RET, RET, 1 + fscmp cr0, C1, B4 + LFSDUX A8, XX, INCX + beq cr0, LL(999) + + fpadd B1, T1, T2 + fpadd B2, T3, T4 + bdnz LL(122) + .align 4 + +LL(123): + fpabs T1, A5 + addi RET, RET, 1 + fcmpu cr0, C1, B1 + beq cr0, LL(999) + + fpabs T2, A6 + addi RET, RET, 1 + fcmpu cr0, C1, B2 + beq cr0, LL(999) + + fpabs T3, A7 + addi RET, RET, 1 + fscmp cr0, C1, B1 + beq cr0, LL(999) + + fpabs T4, A8 + addi RET, RET, 1 + fscmp cr0, C1, B2 + beq cr0, LL(999) + + fpadd B3, T1, T2 + fpadd B4, T3, T4 + + addi RET, RET, 1 + fcmpu cr0, C1, B3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, B4 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, B3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, B4 + beq cr0, LL(999) + .align 4 + +LL(125): + andi. r0, NN, 4 + beq LL(126) + + LFDUX A1, XX, INCX2 + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX2 + LFDUX A4, XX, INCX + + LFSDUX A1, XX, INCX2 + LFSDUX A2, XX, INCX + LFSDUX A3, XX, INCX2 + LFSDUX A4, XX, INCX + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpadd A1, A1, A2 + fpadd A3, A3, A4 + + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A3 + beq cr0, LL(999) + .align 4 + +LL(126): + andi. r0, NN, 2 + beq LL(127) + + LFDUX A1, XX, INCX2 + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX2 + LFDUX A4, XX, INCX + + fabs A1, A1 + fabs A2, A2 + fabs A3, A3 + fabs A4, A4 + + fadd A1, A1, A2 + fadd A3, A3, A4 + + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + beq cr0, LL(999) + .align 4 + +LL(127): + addi RET, RET, 1 + .align 4 + +LL(999): + li r10, 16 + addi SP, SP, -16 + mr r3, RET + + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/izamin_ppc440.S b/kernel/power/izamin_ppc440.S new file mode 100644 index 0000000..2cdb8bf --- /dev/null +++ b/kernel/power/izamin_ppc440.S @@ -0,0 +1,538 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RET r3 +#define X r4 +#define INCX r5 + +#define N r6 +#define NN r7 +#define XX r8 +#define PRE r9 +#define INC1 r10 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(r3) + LDINT INCX, 0(INCX) +#else + mr N, r3 +#endif + + li RET, 0 + + slwi INCX, INCX, ZBASE_SHIFT + sub X, X, INCX + li INC1, SIZE + li PRE, 3 * 16 * SIZE + + mr NN, N + mr XX, X + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFDUX f1, X, INCX + LFDX f2, X, INC1 + + fabs f1, f1 + fabs f2, f2 + fadd f1, f1, f2 + + subi N, N, 1 + fmr f0, f1 + srawi. r0, N, 3 + fmr f2, f1 + mtspr CTR, r0 + fmr f3, f1 + beq- LL(150) + + LFDUX f24, X, INCX + LFDX f25, X, INC1 + LFDUX f26, X, INCX + LFDX f27, X, INC1 + LFDUX f28, X, INCX + LFDX f29, X, INC1 + LFDUX f30, X, INCX + LFDX f31, X, INC1 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDUX f24, X, INCX + LFDX f25, X, INC1 + LFDUX f26, X, INCX + LFDX f27, X, INC1 + LFDUX f28, X, INCX + LFDX f29, X, INC1 + LFDUX f30, X, INCX + LFDX f31, X, INC1 + + bdz LL(120) + .align 4 + +LL(110): + fadd f4, f8, f9 +#ifdef PPCG4 + dcbt X, PRE +#endif + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDX f25, X, INC1 + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDX f27, X, INC1 + + fabs f12, f28 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fabs f13, f29 + LFDUX f28, X, INCX + fabs f14, f30 + LFDX f29, X, INC1 + fabs f15, f31 + LFDUX f30, X, INCX + + fsub f16, f0, f4 + LFDX f31, X, INC1 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 +#ifdef PPCG4 + dcbt X, PRE +#endif + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDX f25, X, INC1 + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDX f27, X, INC1 + + fsel f0, f16, f4, f0 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fabs f12, f28 + LFDUX f28, X, INCX + fabs f13, f29 + LFDX f29, X, INC1 + fabs f14, f30 + LFDUX f30, X, INCX + fabs f15, f31 + LFDX f31, X, INC1 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + bdnz LL(110) + .align 4 + +LL(120): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsel f0, f16, f4, f0 + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + LFDX f9, X, INC1 + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + + fsel f0, f8, f1, f0 + fsel f2, f9, f3, f2 + fsub f8, f0, f2 + fsel f1, f8, f2, f0 + .align 4 + +LL(1000): + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- LL(1150) + + LFDUX f24, XX, INCX + LFDX f25, XX, INC1 + LFDUX f26, XX, INCX + LFDX f27, XX, INC1 + LFDUX f28, XX, INCX + LFDX f29, XX, INC1 + LFDUX f30, XX, INCX + LFDX f31, XX, INC1 + bdz LL(1120) + .align 4 + +LL(1110): + fabs f8, f24 + LFDUX f24, XX, INCX + fabs f9, f25 + LFDX f25, XX, INC1 + fabs f10, f26 + LFDUX f26, XX, INCX + fabs f11, f27 + LFDX f27, XX, INC1 + +#ifdef PPCG4 + dcbt XX, PRE +#endif + + fabs f12, f28 + LFDUX f28, XX, INCX + fabs f13, f29 + LFDX f29, XX, INC1 + fabs f14, f30 + LFDUX f30, XX, INCX + fabs f15, f31 + LFDX f31, XX, INC1 + + fadd f4, f8, f9 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + + fabs f8, f24 + LFDUX f24, XX, INCX + fabs f9, f25 + LFDX f25, XX, INC1 + fabs f10, f26 + LFDUX f26, XX, INCX + fabs f11, f27 + LFDX f27, XX, INC1 + +#ifdef PPCG4 + dcbt XX, PRE +#endif + + fabs f12, f28 + LFDUX f28, XX, INCX + fabs f13, f29 + LFDX f29, XX, INC1 + fabs f14, f30 + LFDUX f30, XX, INCX + fabs f15, f31 + LFDX f31, XX, INC1 + + fadd f4, f8, f9 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + bdnz LL(1110) + .align 4 + +LL(1120): + fabs f8, f24 + LFDUX f24, XX, INCX + fabs f9, f25 + LFDX f25, XX, INC1 + fabs f10, f26 + LFDUX f26, XX, INCX + fabs f11, f27 + LFDX f27, XX, INC1 + + fabs f12, f28 + LFDUX f28, XX, INCX + fabs f13, f29 + LFDX f29, XX, INC1 + fabs f14, f30 + LFDUX f30, XX, INCX + fabs f15, f31 + LFDX f31, XX, INC1 + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + .align 4 + +LL(1150): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1160): + LFDUX f8, XX, INCX + LFDX f9, XX, INC1 + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1160) + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/lock.c b/kernel/power/lock.c new file mode 100644 index 0000000..51348d6 --- /dev/null +++ b/kernel/power/lock.c @@ -0,0 +1,61 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +static void __inline blas_lock(volatile BLASULONG *address){ + +#ifdef __GNUC__ + + BLASLONG int ret, val = 1; + + __asm__ __volatile__ ( + " .machine \"any\" ;" + "0: lwarx %0,0, %1 ;" + " cmpwi 0,%0,0;" + " bne 1f;" + " stwcx. %2,0, %1 ;" + " bne- 0b;" + "1: " + : "=&r"(ret) + : "r"(address), "r" (val) + : "cr0", "memory"); + +#else + while (*address) {}; + *address = 1; +#endif +} diff --git a/kernel/power/lsame.S b/kernel/power/lsame.S new file mode 100644 index 0000000..51d21b0 --- /dev/null +++ b/kernel/power/lsame.S @@ -0,0 +1,72 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + PROFCODE + + lbz r3, 0(r3) + lbz r4, 0(r4) + + cmplwi cr0, r3, 96 + cmplwi cr6, r4, 96 + addi r0, r3, -32 + addi r11,r4, -32 + + ble- cr0, LL(2) +#ifdef __64BIT__ + rldicl r3, r0, 0, 56 +#else + rlwinm r3, r0, 0, 0xff +#endif +LL(2): + ble- cr6, LL(3) +#ifdef __64BIT__ + rldicl r4, r11, 0, 56 +#else + rlwinm r4, r11, 0, 0xff +#endif +LL(3): + xor r3, r3, r4 + subfic r0, r3, 0 + adde r3, r0, r3 + blr + + EPILOGUE diff --git a/kernel/power/max.S b/kernel/power/max.S new file mode 100644 index 0000000..5862bc9 --- /dev/null +++ b/kernel/power/max.S @@ -0,0 +1,445 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + add X, X, INCX + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + fmr f6, f1 + fmr f7, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + + LFD f16, 0 * SIZE(X) + LFD f17, 1 * SIZE(X) + LFD f18, 2 * SIZE(X) + LFD f19, 3 * SIZE(X) + LFD f20, 4 * SIZE(X) + LFD f21, 5 * SIZE(X) + LFD f22, 6 * SIZE(X) + LFD f23, 7 * SIZE(X) + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + + fsub f8, f0, f16 + fsub f9, f1, f17 + fsub f10, f2, f18 + fsub f11, f3, f19 + fsub f12, f4, f20 + fsub f13, f5, f21 + fsub f14, f6, f22 + fsub f15, f7, f23 + bdz LL(20) + .align 4 + +LL(10): + fsel f0, f8, f0, f16 + fsub f8, f0, f24 + fsel f1, f9, f1, f17 + fsub f9, f1, f25 + fsel f2, f10, f2, f18 + fsub f10, f2, f26 + fsel f3, f11, f3, f19 + fsub f11, f3, f27 + + LFD f16, 16 * SIZE(X) + LFD f17, 17 * SIZE(X) + LFD f18, 18 * SIZE(X) + LFD f19, 19 * SIZE(X) + + fsel f4, f12, f4, f20 + fsub f12, f4, f28 + fsel f5, f13, f5, f21 + fsub f13, f5, f29 + fsel f6, f14, f6, f22 + fsub f14, f6, f30 + fsel f7, f15, f7, f23 + fsub f15, f7, f31 + + LFD f20, 20 * SIZE(X) + LFD f21, 21 * SIZE(X) + LFD f22, 22 * SIZE(X) + LFD f23, 23 * SIZE(X) + + fsel f0, f8, f0, f24 + fsub f8, f0, f16 + fsel f1, f9, f1, f25 + fsub f9, f1, f17 + fsel f2, f10, f2, f26 + fsub f10, f2, f18 + fsel f3, f11, f3, f27 + fsub f11, f3, f19 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fsel f4, f12, f4, f28 + fsub f12, f4, f20 + fsel f5, f13, f5, f29 + fsub f13, f5, f21 + fsel f6, f14, f6, f30 + fsub f14, f6, f22 + fsel f7, f15, f7, f31 + fsub f15, f7, f23 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + fsel f0, f8, f0, f16 + fsub f8, f0, f24 + fsel f1, f9, f1, f17 + fsub f9, f1, f25 + fsel f2, f10, f2, f18 + fsub f10, f2, f26 + fsel f3, f11, f3, f19 + fsub f11, f3, f27 + + fsel f4, f12, f4, f20 + fsub f12, f4, f28 + fsel f5, f13, f5, f21 + fsub f13, f5, f29 + fsel f6, f14, f6, f22 + fsub f14, f6, f30 + fsel f7, f15, f7, f23 + fsub f15, f7, f31 + + fsel f0, f8, f0, f24 + fsel f1, f9, f1, f25 + fsel f2, f10, f2, f26 + fsel f3, f11, f3, f27 + fsel f4, f12, f4, f28 + fsel f5, f13, f5, f29 + fsel f6, f14, f6, f30 + fsel f7, f15, f7, f31 + + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + addi X, X, 1 * SIZE + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fsub f8, f0, f16 + fsub f9, f1, f17 + fsub f10, f2, f18 + fsub f11, f3, f19 + fsub f12, f4, f20 + fsub f13, f5, f21 + fsub f14, f6, f22 + fsub f15, f7, f23 + bdz LL(120) + .align 4 + +LL(110): + fsel f0, f8, f0, f16 + fsub f8, f0, f24 + fsel f1, f9, f1, f17 + fsub f9, f1, f25 + fsel f2, f10, f2, f18 + fsub f10, f2, f26 + fsel f3, f11, f3, f19 + fsub f11, f3, f27 + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + + fsel f4, f12, f4, f20 + fsub f12, f4, f28 + fsel f5, f13, f5, f21 + fsub f13, f5, f29 + fsel f6, f14, f6, f22 + fsub f14, f6, f30 + fsel f7, f15, f7, f23 + fsub f15, f7, f31 + + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + fsel f0, f8, f0, f24 + fsub f8, f0, f16 + fsel f1, f9, f1, f25 + fsub f9, f1, f17 + fsel f2, f10, f2, f26 + fsub f10, f2, f18 + fsel f3, f11, f3, f27 + fsub f11, f3, f19 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fsel f4, f12, f4, f28 + fsub f12, f4, f20 + fsel f5, f13, f5, f29 + fsub f13, f5, f21 + fsel f6, f14, f6, f30 + fsub f14, f6, f22 + fsel f7, f15, f7, f31 + fsub f15, f7, f23 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + fsel f0, f8, f0, f16 + fsub f8, f0, f24 + fsel f1, f9, f1, f17 + fsub f9, f1, f25 + fsel f2, f10, f2, f18 + fsub f10, f2, f26 + fsel f3, f11, f3, f19 + fsub f11, f3, f27 + + fsel f4, f12, f4, f20 + fsub f12, f4, f28 + fsel f5, f13, f5, f21 + fsub f13, f5, f29 + fsel f6, f14, f6, f22 + fsub f14, f6, f30 + fsel f7, f15, f7, f23 + fsub f15, f7, f31 + + fsel f0, f8, f0, f24 + fsel f1, f9, f1, f25 + fsel f2, f10, f2, f26 + fsel f3, f11, f3, f27 + fsel f4, f12, f4, f28 + fsel f5, f13, f5, f29 + fsel f6, f14, f6, f30 + fsel f7, f15, f7, f31 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f1, f8, f0, f4 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/max_hummer.S b/kernel/power/max_hummer.S new file mode 100644 index 0000000..01ff907 --- /dev/null +++ b/kernel/power/max_hummer.S @@ -0,0 +1,477 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 + +#define F1 f12 +#define F2 f13 +#define F3 f14 +#define F4 f15 +#define F5 f16 +#define F6 f17 +#define F7 f18 +#define F8 f19 + +#define T1 f20 +#define T2 f21 +#define T3 f22 +#define T4 f23 +#define T5 f24 +#define T6 f25 +#define T7 f26 +#define T8 f27 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + stfpdux f24, SP, r10 + stfpdux f25, SP, r10 + stfpdux f26, SP, r10 + stfpdux f27, SP, r10 + + li r10, 0 + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + lfpdx C1, SP, r10 # Zero clear + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, INCX, 0 + ble LL(999) + + LFD C1, 0 * SIZE(X) + add X, X, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + ble LL(999) + + fsmfp C1, C1 + fpmr C2, C1 + fpmr C3, C1 + fpmr C4, C1 + + cmpwi cr0, INCX, SIZE + bne LL(100) + + andi. r0, X, 2 * SIZE - 1 + beq LL(05) + + LFD C2, 0 * SIZE(X) + add X, X, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + ble LL(998) + .align 4 + +LL(05): + sub X, X, INCX2 + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + bdz LL(13) + .align 4 + +LL(12): + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + LFPDUX A1, X, INCX2 + fpsel C2, F2, C2, A2 + LFPDUX A2, X, INCX2 + fpsel C3, F3, C3, A3 + LFPDUX A3, X, INCX2 + fpsel C4, F4, C4, A4 + LFPDUX A4, X, INCX2 + + fpsub F5, C1, A5 + fpsub F6, C2, A6 + fpsub F7, C3, A7 + fpsub F8, C4, A8 + + fpsel C1, F5, C1, A5 + LFPDUX A5, X, INCX2 + fpsel C2, F6, C2, A6 + LFPDUX A6, X, INCX2 + fpsel C3, F7, C3, A7 + LFPDUX A7, X, INCX2 + fpsel C4, F8, C4, A8 + LFPDUX A8, X, INCX2 + bdnz LL(12) + .align 4 + +LL(13): + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + + fpsub F5, C1, A5 + fpsub F6, C2, A6 + fpsub F7, C3, A7 + fpsub F8, C4, A8 + + fpsel C1, F5, C1, A5 + fpsel C2, F6, C2, A6 + fpsel C3, F7, C3, A7 + fpsel C4, F8, C4, A8 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(998) + + andi. r0, N, 8 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFPDUX A1, X, INCX2 + fpsub F1, C1, A1 + fpsel C1, F1, C1, A1 + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(998) + + LFDUX A1, X, INCX2 + fsub F1, C1, A1 + fsel C1, F1, C1, A1 + b LL(998) + .align 4 + + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + LFDUX A5, X, INCX + LFDUX A6, X, INCX + LFDUX A7, X, INCX + LFDUX A8, X, INCX + LFSDUX A5, X, INCX + LFSDUX A6, X, INCX + LFSDUX A7, X, INCX + LFSDUX A8, X, INCX + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + bdz LL(103) + .align 4 + +LL(102): + fpsel C1, F1, C1, A1 + LFDUX A1, X, INCX + fpsel C2, F2, C2, A2 + LFDUX A2, X, INCX + fpsel C3, F3, C3, A3 + LFDUX A3, X, INCX + fpsel C4, F4, C4, A4 + LFDUX A4, X, INCX + + fpsub F5, C1, A5 + LFSDUX A1, X, INCX + fpsub F6, C2, A6 + LFSDUX A2, X, INCX + fpsub F7, C3, A7 + LFSDUX A3, X, INCX + fpsub F8, C4, A8 + LFSDUX A4, X, INCX + + fpsel C1, F5, C1, A5 + LFDUX A5, X, INCX + fpsel C2, F6, C2, A6 + LFDUX A6, X, INCX + fpsel C3, F7, C3, A7 + LFDUX A7, X, INCX + fpsel C4, F8, C4, A8 + LFDUX A8, X, INCX + + fpsub F1, C1, A1 + LFSDUX A5, X, INCX + fpsub F2, C2, A2 + LFSDUX A6, X, INCX + fpsub F3, C3, A3 + LFSDUX A7, X, INCX + fpsub F4, C4, A4 + LFSDUX A8, X, INCX + bdnz LL(102) + .align 4 + +LL(103): + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + + fpsub F5, C1, A5 + fpsub F6, C2, A6 + fpsub F7, C3, A7 + fpsub F8, C4, A8 + + fpsel C1, F5, C1, A5 + fpsel C2, F6, C2, A6 + fpsel C3, F7, C3, A7 + fpsel C4, F8, C4, A8 + .align 4 + +LL(105): + andi. r0, N, 15 + beq LL(998) + + andi. r0, N, 8 + beq LL(106) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(106): + andi. r0, N, 4 + beq LL(107) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + + fsub F1, C1, A1 + fsub F2, C2, A2 + fsub F3, C3, A3 + fsub F4, C4, A4 + + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + fsel C3, F3, C3, A3 + fsel C4, F4, C4, A4 + .align 4 + +LL(107): + andi. r0, N, 2 + beq LL(108) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + fsub F1, C1, A1 + fsub F2, C2, A2 + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + .align 4 + +LL(108): + andi. r0, N, 1 + beq LL(998) + + LFDUX A1, X, INCX + fsub F1, C1, A1 + fsel C1, F1, C1, A1 + .align 4 + + +LL(998): + fpsub F1, C1, C2 + fpsub F2, C3, C4 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C1, C3 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C1, C2 + fsel C1, F1, C1, C2 + .align 4 + +LL(999): + li r10, 16 + + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/max_ppc440.S b/kernel/power/max_ppc440.S new file mode 100644 index 0000000..7afdf56 --- /dev/null +++ b/kernel/power/max_ppc440.S @@ -0,0 +1,284 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + sub X, X, INCX + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFDUX f1, X, INCX + + fmr f0, f1 + fmr f2, f1 + subi N, N, 1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + srawi. r0, N, 4 + fmr f6, f1 + mtspr CTR, r0 + fmr f7, f1 + beq- LL(150) + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + LFDUX f24, X, INCX + fsub f8, f0, f16 + LFDUX f25, X, INCX + fsub f9, f1, f17 + LFDUX f26, X, INCX + fsub f10, f2, f18 + LFDUX f27, X, INCX + fsub f11, f3, f19 + LFDUX f28, X, INCX + fsub f12, f4, f20 + LFDUX f29, X, INCX + fsub f13, f5, f21 + LFDUX f30, X, INCX + fsub f14, f6, f22 + LFDUX f31, X, INCX + fsub f15, f7, f23 + bdz LL(120) + .align 4 + +LL(110): + fsel f0, f8, f0, f16 + LFDUX f16, X, INCX + fsub f8, f0, f24 + fsel f1, f9, f1, f17 + LFDUX f17, X, INCX + fsub f9, f1, f25 + fsel f2, f10, f2, f18 + LFDUX f18, X, INCX + fsub f10, f2, f26 + fsel f3, f11, f3, f19 + LFDUX f19, X, INCX + fsub f11, f3, f27 + + fsel f4, f12, f4, f20 + LFDUX f20, X, INCX + fsub f12, f4, f28 + fsel f5, f13, f5, f21 + LFDUX f21, X, INCX + fsub f13, f5, f29 + fsel f6, f14, f6, f22 + LFDUX f22, X, INCX + fsub f14, f6, f30 + fsel f7, f15, f7, f23 + LFDUX f23, X, INCX + fsub f15, f7, f31 + + fsel f0, f8, f0, f24 + LFDUX f24, X, INCX + fsub f8, f0, f16 + fsel f1, f9, f1, f25 + LFDUX f25, X, INCX + fsub f9, f1, f17 + fsel f2, f10, f2, f26 + LFDUX f26, X, INCX + fsub f10, f2, f18 + fsel f3, f11, f3, f27 + LFDUX f27, X, INCX + fsub f11, f3, f19 + + fsel f4, f12, f4, f28 + LFDUX f28, X, INCX + fsub f12, f4, f20 + fsel f5, f13, f5, f29 + LFDUX f29, X, INCX + fsub f13, f5, f21 + fsel f6, f14, f6, f30 + LFDUX f30, X, INCX + fsub f14, f6, f22 + fsel f7, f15, f7, f31 + LFDUX f31, X, INCX + fsub f15, f7, f23 + bdnz LL(110) + .align 4 + +LL(120): + fsel f0, f8, f0, f16 + fsub f8, f0, f24 + fsel f1, f9, f1, f17 + fsub f9, f1, f25 + fsel f2, f10, f2, f18 + fsub f10, f2, f26 + fsel f3, f11, f3, f19 + fsub f11, f3, f27 + + fsel f4, f12, f4, f20 + fsub f12, f4, f28 + fsel f5, f13, f5, f21 + fsub f13, f5, f29 + fsel f6, f14, f6, f22 + fsub f14, f6, f30 + fsel f7, f15, f7, f23 + fsub f15, f7, f31 + + fsel f0, f8, f0, f24 + fsel f1, f9, f1, f25 + fsel f2, f10, f2, f26 + fsel f3, f11, f3, f27 + fsel f4, f12, f4, f28 + fsel f5, f13, f5, f29 + fsel f6, f14, f6, f30 + fsel f7, f15, f7, f31 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f1, f8, f0, f4 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/min.S b/kernel/power/min.S new file mode 100644 index 0000000..727a6a7 --- /dev/null +++ b/kernel/power/min.S @@ -0,0 +1,445 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + add X, X, INCX + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + fmr f6, f1 + fmr f7, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + + LFD f16, 0 * SIZE(X) + LFD f17, 1 * SIZE(X) + LFD f18, 2 * SIZE(X) + LFD f19, 3 * SIZE(X) + LFD f20, 4 * SIZE(X) + LFD f21, 5 * SIZE(X) + LFD f22, 6 * SIZE(X) + LFD f23, 7 * SIZE(X) + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + + fsub f8, f0, f16 + fsub f9, f1, f17 + fsub f10, f2, f18 + fsub f11, f3, f19 + fsub f12, f4, f20 + fsub f13, f5, f21 + fsub f14, f6, f22 + fsub f15, f7, f23 + bdz LL(20) + .align 4 + +LL(10): + fsel f0, f8, f16, f0 + fsub f8, f0, f24 + fsel f1, f9, f17, f1 + fsub f9, f1, f25 + fsel f2, f10, f18, f2 + fsub f10, f2, f26 + fsel f3, f11, f19, f3 + fsub f11, f3, f27 + + LFD f16, 16 * SIZE(X) + LFD f17, 17 * SIZE(X) + LFD f18, 18 * SIZE(X) + LFD f19, 19 * SIZE(X) + + fsel f4, f12, f20, f4 + fsub f12, f4, f28 + fsel f5, f13, f21, f5 + fsub f13, f5, f29 + fsel f6, f14, f22, f6 + fsub f14, f6, f30 + fsel f7, f15, f23, f7 + fsub f15, f7, f31 + + LFD f20, 20 * SIZE(X) + LFD f21, 21 * SIZE(X) + LFD f22, 22 * SIZE(X) + LFD f23, 23 * SIZE(X) + + fsel f0, f8, f24, f0 + fsub f8, f0, f16 + fsel f1, f9, f25, f1 + fsub f9, f1, f17 + fsel f2, f10, f26, f2 + fsub f10, f2, f18 + fsel f3, f11, f27, f3 + fsub f11, f3, f19 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fsel f4, f12, f28, f4 + fsub f12, f4, f20 + fsel f5, f13, f29, f5 + fsub f13, f5, f21 + fsel f6, f14, f30, f6 + fsub f14, f6, f22 + fsel f7, f15, f31, f7 + fsub f15, f7, f23 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + fsel f0, f8, f16, f0 + fsub f8, f0, f24 + fsel f1, f9, f17, f1 + fsub f9, f1, f25 + fsel f2, f10, f18, f2 + fsub f10, f2, f26 + fsel f3, f11, f19, f3 + fsub f11, f3, f27 + + fsel f4, f12, f20, f4 + fsub f12, f4, f28 + fsel f5, f13, f21, f5 + fsub f13, f5, f29 + fsel f6, f14, f22, f6 + fsub f14, f6, f30 + fsel f7, f15, f23, f7 + fsub f15, f7, f31 + + fsel f0, f8, f24, f0 + fsel f1, f9, f25, f1 + fsel f2, f10, f26, f2 + fsel f3, f11, f27, f3 + fsel f4, f12, f28, f4 + fsel f5, f13, f29, f5 + fsel f6, f14, f30, f6 + fsel f7, f15, f31, f7 + + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + addi X, X, 1 * SIZE + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fsub f8, f0, f16 + fsub f9, f1, f17 + fsub f10, f2, f18 + fsub f11, f3, f19 + fsub f12, f4, f20 + fsub f13, f5, f21 + fsub f14, f6, f22 + fsub f15, f7, f23 + bdz LL(120) + .align 4 + +LL(110): + fsel f0, f8, f16, f0 + fsub f8, f0, f24 + fsel f1, f9, f17, f1 + fsub f9, f1, f25 + fsel f2, f10, f18, f2 + fsub f10, f2, f26 + fsel f3, f11, f19, f3 + fsub f11, f3, f27 + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + + fsel f4, f12, f20, f4 + fsub f12, f4, f28 + fsel f5, f13, f21, f5 + fsub f13, f5, f29 + fsel f6, f14, f22, f6 + fsub f14, f6, f30 + fsel f7, f15, f23, f7 + fsub f15, f7, f31 + + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + fsel f0, f8, f24, f0 + fsub f8, f0, f16 + fsel f1, f9, f25, f1 + fsub f9, f1, f17 + fsel f2, f10, f26, f2 + fsub f10, f2, f18 + fsel f3, f11, f27, f3 + fsub f11, f3, f19 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fsel f4, f12, f28, f4 + fsub f12, f4, f20 + fsel f5, f13, f29, f5 + fsub f13, f5, f21 + fsel f6, f14, f30, f6 + fsub f14, f6, f22 + fsel f7, f15, f31, f7 + fsub f15, f7, f23 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + fsel f0, f8, f16, f0 + fsub f8, f0, f24 + fsel f1, f9, f17, f1 + fsub f9, f1, f25 + fsel f2, f10, f18, f2 + fsub f10, f2, f26 + fsel f3, f11, f19, f3 + fsub f11, f3, f27 + + fsel f4, f12, f20, f4 + fsub f12, f4, f28 + fsel f5, f13, f21, f5 + fsub f13, f5, f29 + fsel f6, f14, f22, f6 + fsub f14, f6, f30 + fsel f7, f15, f23, f7 + fsub f15, f7, f31 + + fsel f0, f8, f24, f0 + fsel f1, f9, f25, f1 + fsel f2, f10, f26, f2 + fsel f3, f11, f27, f3 + fsel f4, f12, f28, f4 + fsel f5, f13, f29, f5 + fsel f6, f14, f30, f6 + fsel f7, f15, f31, f7 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f1, f0 + fsel f2, f9, f3, f2 + fsel f4, f10, f5, f4 + fsel f6, f11, f7, f6 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f2, f0 + fsel f4, f9, f6, f4 + + fsub f8, f0, f4 + fsel f1, f8, f4, f0 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/min_hummer.S b/kernel/power/min_hummer.S new file mode 100644 index 0000000..bd82687 --- /dev/null +++ b/kernel/power/min_hummer.S @@ -0,0 +1,477 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 + +#define F1 f12 +#define F2 f13 +#define F3 f14 +#define F4 f15 +#define F5 f16 +#define F6 f17 +#define F7 f18 +#define F8 f19 + +#define T1 f20 +#define T2 f21 +#define T3 f22 +#define T4 f23 +#define T5 f24 +#define T6 f25 +#define T7 f26 +#define T8 f27 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + stfpdux f24, SP, r10 + stfpdux f25, SP, r10 + stfpdux f26, SP, r10 + stfpdux f27, SP, r10 + + li r10, 0 + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + lfpdx C1, SP, r10 # Zero clear + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, INCX, 0 + ble LL(999) + + LFD C1, 0 * SIZE(X) + add X, X, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + ble LL(999) + + fsmfp C1, C1 + fpmr C2, C1 + fpmr C3, C1 + fpmr C4, C1 + + cmpwi cr0, INCX, SIZE + bne LL(100) + + andi. r0, X, 2 * SIZE - 1 + beq LL(05) + + LFD C2, 0 * SIZE(X) + add X, X, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + ble LL(998) + .align 4 + +LL(05): + sub X, X, INCX2 + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + bdz LL(13) + .align 4 + +LL(12): + fpsub F1, A1, C1 + fpsub F2, A2, C2 + fpsub F3, A3, C3 + fpsub F4, A4, C4 + + fpsel C1, F1, C1, A1 + LFPDUX A1, X, INCX2 + fpsel C2, F2, C2, A2 + LFPDUX A2, X, INCX2 + fpsel C3, F3, C3, A3 + LFPDUX A3, X, INCX2 + fpsel C4, F4, C4, A4 + LFPDUX A4, X, INCX2 + + fpsub F5, A5, C1 + fpsub F6, A6, C2 + fpsub F7, A7, C3 + fpsub F8, A8, C4 + + fpsel C1, F5, C1, A5 + LFPDUX A5, X, INCX2 + fpsel C2, F6, C2, A6 + LFPDUX A6, X, INCX2 + fpsel C3, F7, C3, A7 + LFPDUX A7, X, INCX2 + fpsel C4, F8, C4, A8 + LFPDUX A8, X, INCX2 + bdnz LL(12) + .align 4 + +LL(13): + fpsub F1, A1, C1 + fpsub F2, A2, C2 + fpsub F3, A3, C3 + fpsub F4, A4, C4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + + fpsub F5, A5, C1 + fpsub F6, A6, C2 + fpsub F7, A7, C3 + fpsub F8, A8, C4 + + fpsel C1, F5, C1, A5 + fpsel C2, F6, C2, A6 + fpsel C3, F7, C3, A7 + fpsel C4, F8, C4, A8 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(998) + + andi. r0, N, 8 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpsub F1, A1, C1 + fpsub F2, A2, C2 + fpsub F3, A3, C3 + fpsub F4, A4, C4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + fpsub F1, A1, C1 + fpsub F2, A2, C2 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFPDUX A1, X, INCX2 + fpsub F1, A1, C1 + fpsel C1, F1, C1, A1 + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(998) + + LFDUX A1, X, INCX2 + fsub F1, A1, C1 + fsel C1, F1, C1, A1 + b LL(998) + .align 4 + + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + LFDUX A5, X, INCX + LFDUX A6, X, INCX + LFDUX A7, X, INCX + LFDUX A8, X, INCX + LFSDUX A5, X, INCX + LFSDUX A6, X, INCX + LFSDUX A7, X, INCX + LFSDUX A8, X, INCX + fpsub F1, A1, C1 + fpsub F2, A2, C2 + fpsub F3, A3, C3 + fpsub F4, A4, C4 + + bdz LL(103) + .align 4 + +LL(102): + fpsel C1, F1, C1, A1 + LFDUX A1, X, INCX + fpsel C2, F2, C2, A2 + LFDUX A2, X, INCX + fpsel C3, F3, C3, A3 + LFDUX A3, X, INCX + fpsel C4, F4, C4, A4 + LFDUX A4, X, INCX + + fpsub F5, A5, C1 + LFSDUX A1, X, INCX + fpsub F6, A6, C2 + LFSDUX A2, X, INCX + fpsub F7, A7, C3 + LFSDUX A3, X, INCX + fpsub F8, A8, C4 + LFSDUX A4, X, INCX + + fpsel C1, F5, C1, A5 + LFDUX A5, X, INCX + fpsel C2, F6, C2, A6 + LFDUX A6, X, INCX + fpsel C3, F7, C3, A7 + LFDUX A7, X, INCX + fpsel C4, F8, C4, A8 + LFDUX A8, X, INCX + + fpsub F1, A1, C1 + LFSDUX A5, X, INCX + fpsub F2, A2, C2 + LFSDUX A6, X, INCX + fpsub F3, A3, C3 + LFSDUX A7, X, INCX + fpsub F4, A4, C4 + LFSDUX A8, X, INCX + bdnz LL(102) + .align 4 + +LL(103): + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + + fpsub F5, A5, C1 + fpsub F6, A6, C2 + fpsub F7, A7, C3 + fpsub F8, A8, C4 + + fpsel C1, F5, C1, A5 + fpsel C2, F6, C2, A6 + fpsel C3, F7, C3, A7 + fpsel C4, F8, C4, A8 + .align 4 + +LL(105): + andi. r0, N, 15 + beq LL(998) + + andi. r0, N, 8 + beq LL(106) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + fpsub F1, A1, C1 + fpsub F2, A2, C2 + fpsub F3, A3, C3 + fpsub F4, A4, C4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(106): + andi. r0, N, 4 + beq LL(107) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + + fsub F1, A1, C1 + fsub F2, A2, C2 + fsub F3, A3, C3 + fsub F4, A4, C4 + + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + fsel C3, F3, C3, A3 + fsel C4, F4, C4, A4 + .align 4 + +LL(107): + andi. r0, N, 2 + beq LL(108) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + fsub F1, A1, C1 + fsub F2, A2, C2 + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + .align 4 + +LL(108): + andi. r0, N, 1 + beq LL(998) + + LFDUX A1, X, INCX + fsub F1, A1, C1 + fsel C1, F1, C1, A1 + .align 4 + + +LL(998): + fpsub F1, C2, C1 + fpsub F2, C4, C3 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C3, C1 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C2, C1 + fsel C1, F1, C1, C2 + .align 4 + +LL(999): + li r10, 16 + + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/min_ppc440.S b/kernel/power/min_ppc440.S new file mode 100644 index 0000000..ab67bbc --- /dev/null +++ b/kernel/power/min_ppc440.S @@ -0,0 +1,284 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + sub X, X, INCX + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFDUX f1, X, INCX + + fmr f0, f1 + subi N, N, 1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + srawi. r0, N, 4 + fmr f6, f1 + mtspr CTR, r0 + fmr f7, f1 + beq- LL(150) + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + LFDUX f24, X, INCX + fsub f8, f0, f16 + LFDUX f25, X, INCX + fsub f9, f1, f17 + LFDUX f26, X, INCX + fsub f10, f2, f18 + LFDUX f27, X, INCX + fsub f11, f3, f19 + LFDUX f28, X, INCX + fsub f12, f4, f20 + LFDUX f29, X, INCX + fsub f13, f5, f21 + LFDUX f30, X, INCX + fsub f14, f6, f22 + LFDUX f31, X, INCX + fsub f15, f7, f23 + bdz LL(120) + .align 4 + +LL(110): + fsel f0, f8, f16, f0 + LFDUX f16, X, INCX + fsub f8, f0, f24 + fsel f1, f9, f17, f1 + LFDUX f17, X, INCX + fsub f9, f1, f25 + fsel f2, f10, f18, f2 + LFDUX f18, X, INCX + fsub f10, f2, f26 + fsel f3, f11, f19, f3 + LFDUX f19, X, INCX + fsub f11, f3, f27 + + fsel f4, f12, f20, f4 + LFDUX f20, X, INCX + fsub f12, f4, f28 + fsel f5, f13, f21, f5 + LFDUX f21, X, INCX + fsub f13, f5, f29 + fsel f6, f14, f22, f6 + LFDUX f22, X, INCX + fsub f14, f6, f30 + fsel f7, f15, f23, f7 + LFDUX f23, X, INCX + fsub f15, f7, f31 + + fsel f0, f8, f24, f0 + LFDUX f24, X, INCX + fsub f8, f0, f16 + fsel f1, f9, f25, f1 + LFDUX f25, X, INCX + fsub f9, f1, f17 + fsel f2, f10, f26, f2 + LFDUX f26, X, INCX + fsub f10, f2, f18 + fsel f3, f11, f27, f3 + LFDUX f27, X, INCX + fsub f11, f3, f19 + + fsel f4, f12, f28, f4 + LFDUX f28, X, INCX + fsub f12, f4, f20 + fsel f5, f13, f29, f5 + LFDUX f29, X, INCX + fsub f13, f5, f21 + fsel f6, f14, f30, f6 + LFDUX f30, X, INCX + fsub f14, f6, f22 + fsel f7, f15, f31, f7 + LFDUX f31, X, INCX + fsub f15, f7, f23 + bdnz LL(110) + .align 4 + +LL(120): + fsel f0, f8, f16, f0 + fsub f8, f0, f24 + fsel f1, f9, f17, f1 + fsub f9, f1, f25 + fsel f2, f10, f18, f2 + fsub f10, f2, f26 + fsel f3, f11, f19, f3 + fsub f11, f3, f27 + + fsel f4, f12, f20, f4 + fsub f12, f4, f28 + fsel f5, f13, f21, f5 + fsub f13, f5, f29 + fsel f6, f14, f22, f6 + fsub f14, f6, f30 + fsel f7, f15, f23, f7 + fsub f15, f7, f31 + + fsel f0, f8, f24, f0 + fsel f1, f9, f25, f1 + fsel f2, f10, f26, f2 + fsel f3, f11, f27, f3 + fsel f4, f12, f28, f4 + fsel f5, f13, f29, f5 + fsel f6, f14, f30, f6 + fsel f7, f15, f31, f7 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f1, f0 + fsel f2, f9, f3, f2 + fsel f4, f10, f5, f4 + fsel f6, f11, f7, f6 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f2, f0 + fsel f4, f9, f6, f4 + + fsub f8, f0, f4 + fsel f1, f8, f4, f0 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/nrm2.S b/kernel/power/nrm2.S new file mode 100644 index 0000000..e2b635e --- /dev/null +++ b/kernel/power/nrm2.S @@ -0,0 +1,908 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define NN r6 +#define XX r7 +#define PREA r8 + +#define FZERO 144(SP) +#define FONE 148(SP) +#define FMAX 152(SP) + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r10, 0 + lis r11, 0x3f80 + lis r12, 0x5fe0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r10, FZERO + stw r11, FONE + stw r12, FMAX + stw r10, 4 + FMAX + + lfs f1, FZERO + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + mr NN, N + mr XX, X + + LFD f1, 0 * SIZE(X) + add X, X, INCX + + fabs f0, f1 + fabs f2, f1 + fabs f3, f1 + fabs f4, f1 + fabs f5, f1 + fabs f6, f1 + fabs f7, f1 + fabs f1, f1 + subi N, N, 1 + + cmpwi cr0, N, 0 + ble- LL(9999) + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(1000) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + + LFD f24, 0 * SIZE(X) + LFD f25, 1 * SIZE(X) + LFD f26, 2 * SIZE(X) + LFD f27, 3 * SIZE(X) + LFD f28, 4 * SIZE(X) + LFD f29, 5 * SIZE(X) + LFD f30, 6 * SIZE(X) + LFD f31, 7 * SIZE(X) + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + bdz LL(20) + .align 4 + +LL(10): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFD f24, 16 * SIZE(X) + LFD f25, 17 * SIZE(X) + LFD f26, 18 * SIZE(X) + LFD f27, 19 * SIZE(X) + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFD f28, 20 * SIZE(X) + LFD f29, 21 * SIZE(X) + LFD f30, 22 * SIZE(X) + LFD f31, 23 * SIZE(X) + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(100) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + addi X, X, 1 * SIZE + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(60) + .align 4 + +LL(100): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f31, f8, f0, f4 + + lfs f1, FZERO + lfs f0, FONE + + fcmpu cr0, f1, f31 + beq- cr0, LL(9999) + + fdiv f30, f0, f31 + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + fmr f6, f1 + fmr f7, f1 + + srawi. r0, NN, 4 + mtspr CTR, r0 + beq- cr0, LL(250) + + LFD f8, 0 * SIZE(XX) + LFD f9, 1 * SIZE(XX) + LFD f10, 2 * SIZE(XX) + LFD f11, 3 * SIZE(XX) + LFD f12, 4 * SIZE(XX) + LFD f13, 5 * SIZE(XX) + LFD f14, 6 * SIZE(XX) + LFD f15, 7 * SIZE(XX) + + fmul f16, f30, f8 + fmul f17, f30, f9 + fmul f18, f30, f10 + fmul f19, f30, f11 + + LFD f8, 8 * SIZE(XX) + LFD f9, 9 * SIZE(XX) + LFD f10, 10 * SIZE(XX) + LFD f11, 11 * SIZE(XX) + + fmul f20, f30, f12 + fmul f21, f30, f13 + fmul f22, f30, f14 + fmul f23, f30, f15 + + LFD f12, 12 * SIZE(XX) + LFD f13, 13 * SIZE(XX) + LFD f14, 14 * SIZE(XX) + LFD f15, 15 * SIZE(XX) + bdz LL(220) + .align 4 + +LL(210): + fmadd f0, f16, f16, f0 + fmul f16, f30, f8 + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + + LFD f8, 16 * SIZE(XX) + LFD f9, 17 * SIZE(XX) + LFD f10, 18 * SIZE(XX) + LFD f11, 19 * SIZE(XX) + + fmadd f4, f20, f20, f4 + fmul f20, f30, f12 + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + + LFD f12, 20 * SIZE(XX) + LFD f13, 21 * SIZE(XX) + LFD f14, 22 * SIZE(XX) + LFD f15, 23 * SIZE(XX) + + fmadd f0, f16, f16, f0 + fmul f16, f30, f8 + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + + LFD f8, 24 * SIZE(XX) + LFD f9, 25 * SIZE(XX) + LFD f10, 26 * SIZE(XX) + LFD f11, 27 * SIZE(XX) + + fmadd f4, f20, f20, f4 + fmul f20, f30, f12 + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + + LFD f12, 28 * SIZE(XX) + LFD f13, 29 * SIZE(XX) + LFD f14, 30 * SIZE(XX) + LFD f15, 31 * SIZE(XX) + +#ifndef POWER6 + L1_PREFETCH XX, PREA +#endif + addi XX, XX, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH XX, PREA +#endif + + bdnz LL(210) + .align 4 + +LL(220): + fmadd f0, f16, f16, f0 + fmul f16, f30, f8 + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + + fmadd f4, f20, f20, f4 + fmul f20, f30, f12 + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + + addi XX, XX, 16 * SIZE + .align 4 + +LL(250): + andi. r0, NN, 15 + mtspr CTR, r0 + beq- cr0, LL(270) + .align 4 + +LL(260): + LFD f8, 0 * SIZE(XX) + addi XX, XX, 1 * SIZE + + fmul f16, f30, f8 + fmadd f0, f16, f16, f0 + bdnz LL(260) + .align 4 + +LL(270): + fadd f0, f0, f1 + fadd f2, f2, f3 + fadd f4, f4, f5 + fadd f6, f6, f7 + + fadd f0, f0, f2 + fadd f4, f4, f6 + + fadd f0, f0, f4 + + fsqrt f0, f0 + fmul f1, f31, f0 + b LL(9999) + .align 4 + +LL(1000): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(1050) + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdz LL(1020) + .align 4 + +LL(1010): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdnz LL(1010) + .align 4 + +LL(1020): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + .align 4 + +LL(1050): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(1999) + .align 4 + +LL(1060): + LFDUX f8, X, INCX + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(1060) + .align 4 + +LL(1999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f31, f8, f0, f4 + + lfs f1, FZERO + lfs f0, FONE + lfd f2, FMAX + + fcmpu cr0, f1, f31 + beq- cr0, LL(9999) + + fdiv f30, f0, f31 + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + fmr f6, f1 + fmr f7, f1 + + sub XX, XX, INCX + + srawi. r0, NN, 4 + mtspr CTR, r0 + beq- cr0, LL(2150) + + LFDUX f8, XX, INCX + LFDUX f9, XX, INCX + LFDUX f10, XX, INCX + LFDUX f11, XX, INCX + LFDUX f12, XX, INCX + LFDUX f13, XX, INCX + LFDUX f14, XX, INCX + LFDUX f15, XX, INCX + + fmul f16, f30, f8 + fmul f17, f30, f9 + fmul f18, f30, f10 + fmul f19, f30, f11 + + LFDUX f8, XX, INCX + LFDUX f9, XX, INCX + LFDUX f10, XX, INCX + LFDUX f11, XX, INCX + + fmul f20, f30, f12 + fmul f21, f30, f13 + fmul f22, f30, f14 + fmul f23, f30, f15 + + LFDUX f12, XX, INCX + LFDUX f13, XX, INCX + LFDUX f14, XX, INCX + LFDUX f15, XX, INCX + bdz LL(2120) + .align 4 + +LL(2110): + fmadd f0, f16, f16, f0 + fmul f16, f30, f8 + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + + LFDUX f8, XX, INCX + LFDUX f9, XX, INCX + LFDUX f10, XX, INCX + LFDUX f11, XX, INCX + + fmadd f4, f20, f20, f4 + fmul f20, f30, f12 + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + + LFDUX f12, XX, INCX + LFDUX f13, XX, INCX + LFDUX f14, XX, INCX + LFDUX f15, XX, INCX + + fmadd f0, f16, f16, f0 + fmul f16, f30, f8 + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + + LFDUX f8, XX, INCX + LFDUX f9, XX, INCX + LFDUX f10, XX, INCX + LFDUX f11, XX, INCX + + fmadd f4, f20, f20, f4 + fmul f20, f30, f12 + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + + LFDUX f12, XX, INCX + LFDUX f13, XX, INCX + LFDUX f14, XX, INCX + LFDUX f15, XX, INCX + + bdnz LL(2110) + .align 4 + +LL(2120): + fmadd f0, f16, f16, f0 + fmul f16, f30, f8 + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + + fmadd f4, f20, f20, f4 + fmul f20, f30, f12 + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + .align 4 + +LL(2150): + andi. r0, NN, 15 + mtspr CTR, r0 + beq- cr0, LL(2170) + .align 4 + +LL(2160): + LFDUX f8, XX, INCX + + fmul f16, f30, f8 + fmadd f0, f16, f16, f0 + bdnz LL(2160) + .align 4 + +LL(2170): + fadd f0, f0, f1 + fadd f2, f2, f3 + fadd f4, f4, f5 + fadd f6, f6, f7 + + fadd f0, f0, f2 + fadd f4, f4, f6 + + fadd f0, f0, f4 + + fsqrt f0, f0 + fmul f1, f31, f0 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/rot.S b/kernel/power/rot.S new file mode 100644 index 0000000..b9e9338 --- /dev/null +++ b/kernel/power/rot.S @@ -0,0 +1,571 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 +#define PREA r8 +#define XX r9 +#define YY r10 + +#define C f1 +#define S f2 + +#define STACKSIZE 32 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + cmpwi cr0, INCY, SIZE + bne- cr0, LL(100) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + + LFD f0, 0 * SIZE(X) + LFD f4, 1 * SIZE(X) + LFD f6, 2 * SIZE(X) + LFD f8, 3 * SIZE(X) + + LFD f3, 0 * SIZE(Y) + LFD f5, 1 * SIZE(Y) + LFD f7, 2 * SIZE(Y) + LFD f9, 3 * SIZE(Y) + bdz LL(12) + .align 4 + +LL(10): + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + LFD f0, 4 * SIZE(X) + LFD f4, 5 * SIZE(X) + LFD f6, 6 * SIZE(X) + LFD f8, 7 * SIZE(X) + + LFD f3, 4 * SIZE(Y) + LFD f5, 5 * SIZE(Y) + LFD f7, 6 * SIZE(Y) + LFD f9, 7 * SIZE(Y) + + STFD f10, 0 * SIZE(X) + STFD f12, 1 * SIZE(X) + STFD f14, 2 * SIZE(X) + STFD f16, 3 * SIZE(X) + + STFD f11, 0 * SIZE(Y) + STFD f13, 1 * SIZE(Y) + STFD f15, 2 * SIZE(Y) + STFD f17, 3 * SIZE(Y) + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + LFD f0, 8 * SIZE(X) + LFD f4, 9 * SIZE(X) + LFD f6, 10 * SIZE(X) + LFD f8, 11 * SIZE(X) + + LFD f3, 8 * SIZE(Y) + LFD f5, 9 * SIZE(Y) + LFD f7, 10 * SIZE(Y) + LFD f9, 11 * SIZE(Y) + + STFD f10, 4 * SIZE(X) + STFD f12, 5 * SIZE(X) + STFD f14, 6 * SIZE(X) + STFD f16, 7 * SIZE(X) + + STFD f11, 4 * SIZE(Y) + STFD f13, 5 * SIZE(Y) + STFD f15, 6 * SIZE(Y) + STFD f17, 7 * SIZE(Y) + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + LFD f0, 12 * SIZE(X) + LFD f4, 13 * SIZE(X) + LFD f6, 14 * SIZE(X) + LFD f8, 15 * SIZE(X) + + LFD f3, 12 * SIZE(Y) + LFD f5, 13 * SIZE(Y) + LFD f7, 14 * SIZE(Y) + LFD f9, 15 * SIZE(Y) + + STFD f10, 8 * SIZE(X) + STFD f12, 9 * SIZE(X) + STFD f14, 10 * SIZE(X) + STFD f16, 11 * SIZE(X) + + STFD f11, 8 * SIZE(Y) + STFD f13, 9 * SIZE(Y) + STFD f15, 10 * SIZE(Y) + STFD f17, 11 * SIZE(Y) + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + LFD f0, 16 * SIZE(X) + LFD f4, 17 * SIZE(X) + LFD f6, 18 * SIZE(X) + LFD f8, 19 * SIZE(X) + + LFD f3, 16 * SIZE(Y) + LFD f5, 17 * SIZE(Y) + LFD f7, 18 * SIZE(Y) + LFD f9, 19 * SIZE(Y) + + STFD f10, 12 * SIZE(X) + STFD f12, 13 * SIZE(X) + STFD f14, 14 * SIZE(X) + STFD f16, 15 * SIZE(X) + + STFD f11, 12 * SIZE(Y) + STFD f13, 13 * SIZE(Y) + STFD f15, 14 * SIZE(Y) + STFD f17, 15 * SIZE(Y) + +#ifndef POWER6 + dcbtst X, PREA +#endif + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + +#ifdef POWER6 + dcbtst X, PREA + dcbtst X, PREA +#endif + bdnz LL(10) + .align 4 + +LL(12): + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + STFD f10, 0 * SIZE(X) + STFD f12, 1 * SIZE(X) + STFD f14, 2 * SIZE(X) + STFD f16, 3 * SIZE(X) + + STFD f11, 0 * SIZE(Y) + STFD f13, 1 * SIZE(Y) + STFD f15, 2 * SIZE(Y) + STFD f17, 3 * SIZE(Y) + + LFD f0, 4 * SIZE(X) + LFD f4, 5 * SIZE(X) + LFD f6, 6 * SIZE(X) + LFD f8, 7 * SIZE(X) + + LFD f3, 4 * SIZE(Y) + LFD f5, 5 * SIZE(Y) + LFD f7, 6 * SIZE(Y) + LFD f9, 7 * SIZE(Y) + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + STFD f10, 4 * SIZE(X) + STFD f12, 5 * SIZE(X) + STFD f14, 6 * SIZE(X) + STFD f16, 7 * SIZE(X) + + STFD f11, 4 * SIZE(Y) + STFD f13, 5 * SIZE(Y) + STFD f15, 6 * SIZE(Y) + STFD f17, 7 * SIZE(Y) + + LFD f0, 8 * SIZE(X) + LFD f4, 9 * SIZE(X) + LFD f6, 10 * SIZE(X) + LFD f8, 11 * SIZE(X) + + LFD f3, 8 * SIZE(Y) + LFD f5, 9 * SIZE(Y) + LFD f7, 10 * SIZE(Y) + LFD f9, 11 * SIZE(Y) + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + STFD f10, 8 * SIZE(X) + STFD f12, 9 * SIZE(X) + STFD f14, 10 * SIZE(X) + STFD f16, 11 * SIZE(X) + + STFD f11, 8 * SIZE(Y) + STFD f13, 9 * SIZE(Y) + STFD f15, 10 * SIZE(Y) + STFD f17, 11 * SIZE(Y) + + LFD f0, 12 * SIZE(X) + LFD f4, 13 * SIZE(X) + LFD f6, 14 * SIZE(X) + LFD f8, 15 * SIZE(X) + + LFD f3, 12 * SIZE(Y) + LFD f5, 13 * SIZE(Y) + LFD f7, 14 * SIZE(Y) + LFD f9, 15 * SIZE(Y) + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + STFD f10, 12 * SIZE(X) + STFD f12, 13 * SIZE(X) + STFD f14, 14 * SIZE(X) + STFD f16, 15 * SIZE(X) + + STFD f11, 12 * SIZE(Y) + STFD f13, 13 * SIZE(Y) + STFD f15, 14 * SIZE(Y) + STFD f17, 15 * SIZE(Y) + + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f3, 0 * SIZE(X) + LFD f4, 0 * SIZE(Y) + + FMUL f10, C, f3 + FMUL f11, C, f4 + + FMADD f10, S, f4, f10 + FNMSUB f11, S, f3, f11 + + STFD f10, 0 * SIZE(X) + STFD f11, 0 * SIZE(Y) + + addi X, X, 1 * SIZE + addi Y, Y, 1 * SIZE + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + sub Y, Y, INCY + + mr XX, X + mr YY, Y + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + .align 4 + +LL(110): + LFDUX f0, X, INCX + LFDUX f3, Y, INCY + LFDUX f4, X, INCX + LFDUX f5, Y, INCY + LFDUX f6, X, INCX + LFDUX f7, Y, INCY + LFDUX f8, X, INCX + LFDUX f9, Y, INCY + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + STFDUX f10, XX, INCX + STFDUX f11, YY, INCY + STFDUX f12, XX, INCX + STFDUX f13, YY, INCY + STFDUX f14, XX, INCX + STFDUX f15, YY, INCY + STFDUX f16, XX, INCX + STFDUX f17, YY, INCY + + LFDUX f0, X, INCX + LFDUX f3, Y, INCY + LFDUX f4, X, INCX + LFDUX f5, Y, INCY + LFDUX f6, X, INCX + LFDUX f7, Y, INCY + LFDUX f8, X, INCX + LFDUX f9, Y, INCY + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + STFDUX f10, XX, INCX + STFDUX f11, YY, INCY + STFDUX f12, XX, INCX + STFDUX f13, YY, INCY + STFDUX f14, XX, INCX + STFDUX f15, YY, INCY + STFDUX f16, XX, INCX + STFDUX f17, YY, INCY + + bdnz LL(110) + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f0, X, INCX + LFDUX f3, Y, INCY + + FMUL f10, C, f0 + FMUL f11, C, f3 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + + STFDUX f10, XX, INCX + STFDUX f11, YY, INCY + bdnz LL(160) + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/rot_ppc440.S b/kernel/power/rot_ppc440.S new file mode 100644 index 0000000..bb19583 --- /dev/null +++ b/kernel/power/rot_ppc440.S @@ -0,0 +1,286 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 +#define PRE r8 +#define XX r9 +#define YY r10 + +#define C f1 +#define S f2 + +#define STACKSIZE 32 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + li PRE, 2 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(999) + + sub X, X, INCX + sub Y, Y, INCY + + mr XX, X + mr YY, Y + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f0, X, INCX + LFDUX f3, Y, INCY + LFDUX f4, X, INCX + + FMUL f10, C, f0 + LFDUX f5, Y, INCY + FMUL f11, C, f3 + LFDUX f6, X, INCX + FMUL f12, C, f4 + LFDUX f7, Y, INCY + FMUL f13, C, f5 + LFDUX f8, X, INCX + + FMADD f10, S, f3, f10 + LFDUX f9, Y, INCY + FNMSUB f11, S, f0, f11 + LFDUX f0, X, INCX + FMADD f12, S, f5, f12 + LFDUX f3, Y, INCY + FNMSUB f13, S, f4, f13 + LFDUX f4, X, INCX + + bdz LL(111) + .align 4 + +LL(110): + FMUL f14, C, f6 + LFDUX f5, Y, INCY + FMUL f15, C, f7 + STFDUX f10, XX, INCX + FMUL f16, C, f8 + STFDUX f11, YY, INCY + FMUL f17, C, f9 + STFDUX f12, XX, INCX + +#ifdef PPCG4 + dcbtst X, PRE +#endif + + FMADD f14, S, f7, f14 + STFDUX f13, YY, INCY + FNMSUB f15, S, f6, f15 + LFDUX f6, X, INCX + FMADD f16, S, f9, f16 + LFDUX f7, Y, INCY + FNMSUB f17, S, f8, f17 + LFDUX f8, X, INCX + + FMUL f10, C, f0 + LFDUX f9, Y, INCY + FMUL f11, C, f3 + STFDUX f14, XX, INCX + FMUL f12, C, f4 + STFDUX f15, YY, INCY + FMUL f13, C, f5 + STFDUX f16, XX, INCX + +#ifdef PPCG4 + dcbtst Y, PRE +#endif + + FMADD f10, S, f3, f10 + STFDUX f17, YY, INCY + FNMSUB f11, S, f0, f11 + LFDUX f0, X, INCX + FMADD f12, S, f5, f12 + LFDUX f3, Y, INCY + FNMSUB f13, S, f4, f13 + LFDUX f4, X, INCX + + FMUL f14, C, f6 + LFDUX f5, Y, INCY + FMUL f15, C, f7 + STFDUX f10, XX, INCX + FMUL f16, C, f8 + STFDUX f11, YY, INCY + FMUL f17, C, f9 + STFDUX f12, XX, INCX + +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + + FMADD f14, S, f7, f14 + STFDUX f13, YY, INCY + FNMSUB f15, S, f6, f15 + LFDUX f6, X, INCX + FMADD f16, S, f9, f16 + LFDUX f7, Y, INCY + FNMSUB f17, S, f8, f17 + LFDUX f8, X, INCX + + FMUL f10, C, f0 + LFDUX f9, Y, INCY + FMUL f11, C, f3 + STFDUX f14, XX, INCX + FMUL f12, C, f4 + STFDUX f15, YY, INCY + FMUL f13, C, f5 + STFDUX f16, XX, INCX + +#if defined(PPCG4) && defined(DOUBLE) + dcbtst Y, PRE +#endif + + FMADD f10, S, f3, f10 + STFDUX f17, YY, INCY + FNMSUB f11, S, f0, f11 + LFDUX f0, X, INCX + FMADD f12, S, f5, f12 + LFDUX f3, Y, INCY + FNMSUB f13, S, f4, f13 + LFDUX f4, X, INCX + + bdnz LL(110) + .align 4 + +LL(111): + FMUL f14, C, f6 + LFDUX f5, Y, INCY + FMUL f15, C, f7 + STFDUX f10, XX, INCX + FMUL f16, C, f8 + STFDUX f11, YY, INCY + FMUL f17, C, f9 + STFDUX f12, XX, INCX + + FMADD f14, S, f7, f14 + STFDUX f13, YY, INCY + FNMSUB f15, S, f6, f15 + LFDUX f6, X, INCX + FMADD f16, S, f9, f16 + LFDUX f7, Y, INCY + FNMSUB f17, S, f8, f17 + LFDUX f8, X, INCX + + FMUL f10, C, f0 + LFDUX f9, Y, INCY + FMUL f11, C, f3 + STFDUX f14, XX, INCX + FMUL f12, C, f4 + STFDUX f15, YY, INCY + FMUL f13, C, f5 + STFDUX f16, XX, INCX + + FMUL f14, C, f6 + STFDUX f17, YY, INCY + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + STFDUX f10, XX, INCX + FNMSUB f15, S, f6, f15 + STFDUX f11, YY, INCY + FMADD f16, S, f9, f16 + STFDUX f12, XX, INCX + FNMSUB f17, S, f8, f17 + STFDUX f13, YY, INCY + + STFDUX f14, XX, INCX + STFDUX f15, YY, INCY + STFDUX f16, XX, INCX + STFDUX f17, YY, INCY + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f0, X, INCX + LFDUX f3, Y, INCY + + FMUL f10, C, f0 + FMUL f11, C, f3 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + + STFDUX f10, XX, INCX + STFDUX f11, YY, INCY + bdnz LL(160) + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/scal.S b/kernel/power/scal.S new file mode 100644 index 0000000..f242f08 --- /dev/null +++ b/kernel/power/scal.S @@ -0,0 +1,401 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define XX r4 +#define PREA r5 + +#ifdef linux +#ifndef __64BIT__ +#define X r6 +#define INCX r7 +#else +#define X r7 +#define INCX r8 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define X r8 +#define INCX r9 +#else +#define X r7 +#define INCX r8 +#endif +#endif + +#define FZERO f0 +#define ALPHA f1 + + PROLOGUE + PROFCODE + + addi SP, SP, -8 + li r0, 0 + + stw r0, 0(SP) + lfs FZERO, 0(SP) + + addi SP, SP, 8 + + slwi INCX, INCX, BASE_SHIFT + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + blelr- cr0 + + fcmpu cr0, FZERO, ALPHA + bne- cr0, LL(A1I1) + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(A0IN) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(A0I1_Remain) + .align 4 + +LL(A0I1_kernel): + STFD FZERO, 0 * SIZE(X) + STFD FZERO, 1 * SIZE(X) + STFD FZERO, 2 * SIZE(X) + STFD FZERO, 3 * SIZE(X) + STFD FZERO, 4 * SIZE(X) + STFD FZERO, 5 * SIZE(X) + STFD FZERO, 6 * SIZE(X) + STFD FZERO, 7 * SIZE(X) + + STFD FZERO, 8 * SIZE(X) + STFD FZERO, 9 * SIZE(X) + STFD FZERO, 10 * SIZE(X) + STFD FZERO, 11 * SIZE(X) + STFD FZERO, 12 * SIZE(X) + STFD FZERO, 13 * SIZE(X) + STFD FZERO, 14 * SIZE(X) + STFD FZERO, 15 * SIZE(X) + + addi X, X, 16 * SIZE + bdnz LL(A0I1_kernel) + .align 4 + +LL(A0I1_Remain): + andi. r0, N, 15 + mtspr CTR, r0 + beqlr+ + .align 4 + +LL(A0I1_RemainKernel): + STFD FZERO, 0 * SIZE(X) + addi X, X, 1 * SIZE + bdnz LL(A0I1_RemainKernel) + blr + .align 4 + +LL(A0IN): + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(A0IN_Remain) + .align 4 + +LL(A0IN_Kernel): + dcbtst X, PREA + STFD FZERO, 0 * SIZE(X) + add X, X, INCX + STFD FZERO, 0 * SIZE(X) + add X, X, INCX + STFD FZERO, 0 * SIZE(X) + add X, X, INCX + STFD FZERO, 0 * SIZE(X) + add X, X, INCX + STFD FZERO, 0 * SIZE(X) + add X, X, INCX + STFD FZERO, 0 * SIZE(X) + add X, X, INCX + STFD FZERO, 0 * SIZE(X) + add X, X, INCX + STFD FZERO, 0 * SIZE(X) + add X, X, INCX + bdnz LL(A0IN_Kernel) + .align 4 + +LL(A0IN_Remain): + andi. r0, N, 7 + mtspr CTR, r0 + beqlr+ + .align 4 + +LL(A0IN_RemainKernel): + STFD FZERO, 0 * SIZE(X) + add X, X, INCX + bdnz LL(A0IN_RemainKernel) + blr + .align 4 + +LL(A1I1): + cmpwi cr0, INCX, SIZE + bne- LL(A1IN) + + mr XX, X + + srawi. r0, N, 4 + mtspr CTR, r0 + beq+ LL(A1I1_Remain) + + LFD f2, 0 * SIZE(X) + LFD f3, 1 * SIZE(X) + LFD f4, 2 * SIZE(X) + LFD f5, 3 * SIZE(X) + LFD f6, 4 * SIZE(X) + LFD f7, 5 * SIZE(X) + LFD f8, 6 * SIZE(X) + LFD f9, 7 * SIZE(X) + bdz LL(13) + .align 4 + +LL(A1I1_kernel): + FMUL f10, ALPHA, f2 + FMUL f11, ALPHA, f3 + FMUL f12, ALPHA, f4 + FMUL f13, ALPHA, f5 + + LFD f2, 8 * SIZE(X) + LFD f3, 9 * SIZE(X) + LFD f4, 10 * SIZE(X) + LFD f5, 11 * SIZE(X) + + STFD f10, 0 * SIZE(X) + STFD f11, 1 * SIZE(X) + STFD f12, 2 * SIZE(X) + STFD f13, 3 * SIZE(X) + + FMUL f10, ALPHA, f6 + FMUL f11, ALPHA, f7 + FMUL f12, ALPHA, f8 + FMUL f13, ALPHA, f9 + + LFD f6, 12 * SIZE(X) + LFD f7, 13 * SIZE(X) + LFD f8, 14 * SIZE(X) + LFD f9, 15 * SIZE(X) + + STFD f10, 4 * SIZE(X) + STFD f11, 5 * SIZE(X) + STFD f12, 6 * SIZE(X) + STFD f13, 7 * SIZE(X) + + FMUL f10, ALPHA, f2 + FMUL f11, ALPHA, f3 + FMUL f12, ALPHA, f4 + FMUL f13, ALPHA, f5 + + LFD f2, 16 * SIZE(X) + LFD f3, 17 * SIZE(X) + LFD f4, 18 * SIZE(X) + LFD f5, 19 * SIZE(X) + + STFD f10, 8 * SIZE(X) + STFD f11, 9 * SIZE(X) + STFD f12, 10 * SIZE(X) + STFD f13, 11 * SIZE(X) + + FMUL f10, ALPHA, f6 + FMUL f11, ALPHA, f7 + FMUL f12, ALPHA, f8 + FMUL f13, ALPHA, f9 + + LFD f6, 20 * SIZE(X) + LFD f7, 21 * SIZE(X) + LFD f8, 22 * SIZE(X) + LFD f9, 23 * SIZE(X) + + STFD f10, 12 * SIZE(X) + STFD f11, 13 * SIZE(X) + STFD f12, 14 * SIZE(X) + STFD f13, 15 * SIZE(X) + + addi X, X, 16 * SIZE + dcbtst X, PREA + bdnz LL(A1I1_kernel) + .align 4 + +LL(13): + FMUL f10, ALPHA, f2 + FMUL f11, ALPHA, f3 + FMUL f12, ALPHA, f4 + FMUL f13, ALPHA, f5 + + LFD f2, 8 * SIZE(X) + LFD f3, 9 * SIZE(X) + LFD f4, 10 * SIZE(X) + LFD f5, 11 * SIZE(X) + + STFD f10, 0 * SIZE(X) + STFD f11, 1 * SIZE(X) + STFD f12, 2 * SIZE(X) + STFD f13, 3 * SIZE(X) + + FMUL f10, ALPHA, f6 + FMUL f11, ALPHA, f7 + FMUL f12, ALPHA, f8 + FMUL f13, ALPHA, f9 + + LFD f6, 12 * SIZE(X) + LFD f7, 13 * SIZE(X) + LFD f8, 14 * SIZE(X) + LFD f9, 15 * SIZE(X) + + STFD f10, 4 * SIZE(X) + STFD f11, 5 * SIZE(X) + STFD f12, 6 * SIZE(X) + STFD f13, 7 * SIZE(X) + + FMUL f10, ALPHA, f2 + FMUL f11, ALPHA, f3 + FMUL f12, ALPHA, f4 + FMUL f13, ALPHA, f5 + + STFD f10, 8 * SIZE(X) + STFD f11, 9 * SIZE(X) + STFD f12, 10 * SIZE(X) + STFD f13, 11 * SIZE(X) + + FMUL f10, ALPHA, f6 + FMUL f11, ALPHA, f7 + FMUL f12, ALPHA, f8 + FMUL f13, ALPHA, f9 + + STFD f10, 12 * SIZE(X) + STFD f11, 13 * SIZE(X) + STFD f12, 14 * SIZE(X) + STFD f13, 15 * SIZE(X) + + addi X, X, 16 * SIZE + .align 4 + +LL(A1I1_Remain): + andi. r0, N, 15 + mtspr CTR, r0 + beqlr+ + .align 4 + +LL(A1I1_RemainKernel): + LFD f2, 0 * SIZE(X) + FMUL f2, ALPHA, f2 + STFD f2, 0 * SIZE(X) + addi X, X, 1 * SIZE + bdnz LL(A1I1_RemainKernel) + blr + .align 4 + +LL(A1IN): + mr XX, X + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(A1IN_Remain) + .align 4 + +LL(A1IN_Kernel): + LFD f2, 0 * SIZE(XX) + add XX, XX, INCX + LFD f3, 0 * SIZE(XX) + add XX, XX, INCX + LFD f4, 0 * SIZE(XX) + add XX, XX, INCX + LFD f5, 0 * SIZE(XX) + add XX, XX, INCX + + FMUL f2, ALPHA, f2 + FMUL f3, ALPHA, f3 + FMUL f4, ALPHA, f4 + FMUL f5, ALPHA, f5 + + LFD f6, 0 * SIZE(XX) + add XX, XX, INCX + LFD f7, 0 * SIZE(XX) + add XX, XX, INCX + LFD f8, 0 * SIZE(XX) + add XX, XX, INCX + LFD f9, 0 * SIZE(XX) + add XX, XX, INCX + + FMUL f6, ALPHA, f6 + FMUL f7, ALPHA, f7 + FMUL f8, ALPHA, f8 + FMUL f9, ALPHA, f9 + + STFD f2, 0 * SIZE(X) + add X, X, INCX + STFD f3, 0 * SIZE(X) + add X, X, INCX + STFD f4, 0 * SIZE(X) + add X, X, INCX + STFD f5, 0 * SIZE(X) + add X, X, INCX + STFD f6, 0 * SIZE(X) + add X, X, INCX + STFD f7, 0 * SIZE(X) + add X, X, INCX + STFD f8, 0 * SIZE(X) + add X, X, INCX + STFD f9, 0 * SIZE(X) + add X, X, INCX + bdnz LL(A1IN_Kernel) + .align 4 + +LL(A1IN_Remain): + andi. r0, N, 7 + mtspr CTR, r0 + beqlr+ + .align 4 + +LL(A1IN_RemainKernel): + LFD f2, 0 * SIZE(XX) + add XX, XX, INCX + FMUL f2, ALPHA, f2 + STFD f2, 0 * SIZE(X) + add X, X, INCX + bdnz LL(A1IN_RemainKernel) + blr + + EPILOGUE diff --git a/kernel/power/scal_hummer.S b/kernel/power/scal_hummer.S new file mode 100644 index 0000000..0b58486 --- /dev/null +++ b/kernel/power/scal_hummer.S @@ -0,0 +1,477 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r6 +#define INCX r7 + +#define INCX2 r4 +#define X2 r5 + +#define ALPHA f1 + +#define A1 f0 +#define A2 f16 +#define A3 f2 +#define A4 f3 +#define A5 f4 +#define A6 f5 +#define A7 f6 +#define A8 f7 + +#define B1 f8 +#define B2 f9 +#define B3 f10 +#define B4 f11 +#define B5 f12 +#define B6 f13 +#define B7 f14 +#define B8 f15 + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + stfpdux f16, SP, r10 + + li r10, 0 + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + + lfpdx A1, SP, r10 # Zero clear + fsmfp ALPHA, ALPHA + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + cmpwi cr0, N, 0 + ble LL(999) + + cmpwi cr0, INCX, SIZE + bne LL(100) + + fcmpu cr7, ALPHA, A1 + bne cr7, LL(50) + + sub X, X, INCX2 + + andi. r0, X, 2 * SIZE - 1 + beq LL(11) + + STFDX A1, X, INCX2 + addi X, X, 1 * SIZE + addi N, N, -1 + cmpwi cr0, N, 0 + ble LL(999) + .align 4 + +LL(11): + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(15) + .align 4 + +LL(12): + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + bdnz LL(12) + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(999) + andi. r0, N, 8 + beq LL(16) + + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + STFPDUX A1, X, INCX2 + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(999) + STFDUX A1, X, INCX2 + b LL(999) + .align 4 + +LL(50): + sub X2, X, INCX2 + sub X, X, INCX2 + + andi. r0, X, 2 * SIZE - 1 + beq LL(51) + + LFDX A1, X, INCX2 + addi X, X, 1 * SIZE + + fmul B1, ALPHA, A1 + addi N, N, -1 + cmpwi cr0, N, 0 + + STFDX B1, X2, INCX2 + addi X2, X2, 1 * SIZE + ble LL(999) + .align 4 + +LL(51): + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(55) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + bdz LL(53) + .align 4 + +LL(52): + fpmul B1, ALPHA, A1 + LFPDUX A1, X, INCX2 + fpmul B2, ALPHA, A2 + LFPDUX A2, X, INCX2 + fpmul B3, ALPHA, A3 + LFPDUX A3, X, INCX2 + fpmul B4, ALPHA, A4 + LFPDUX A4, X, INCX2 + fpmul B5, ALPHA, A5 + LFPDUX A5, X, INCX2 + fpmul B6, ALPHA, A6 + LFPDUX A6, X, INCX2 + fpmul B7, ALPHA, A7 + LFPDUX A7, X, INCX2 + fpmul B8, ALPHA, A8 + LFPDUX A8, X, INCX2 + + STFPDUX B1, X2, INCX2 + STFPDUX B2, X2, INCX2 + STFPDUX B3, X2, INCX2 + STFPDUX B4, X2, INCX2 + STFPDUX B5, X2, INCX2 + STFPDUX B6, X2, INCX2 + STFPDUX B7, X2, INCX2 + STFPDUX B8, X2, INCX2 + bdnz LL(52) + .align 4 + +LL(53): + fpmul B1, ALPHA, A1 + fpmul B2, ALPHA, A2 + fpmul B3, ALPHA, A3 + fpmul B4, ALPHA, A4 + fpmul B5, ALPHA, A5 + fpmul B6, ALPHA, A6 + STFPDUX B1, X2, INCX2 + fpmul B7, ALPHA, A7 + STFPDUX B2, X2, INCX2 + fpmul B8, ALPHA, A8 + STFPDUX B3, X2, INCX2 + + STFPDUX B4, X2, INCX2 + STFPDUX B5, X2, INCX2 + STFPDUX B6, X2, INCX2 + STFPDUX B7, X2, INCX2 + STFPDUX B8, X2, INCX2 + .align 4 + +LL(55): + andi. r0, N, 15 + beq LL(999) + andi. r0, N, 8 + beq LL(56) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpmul B1, ALPHA, A1 + fpmul B2, ALPHA, A2 + fpmul B3, ALPHA, A3 + fpmul B4, ALPHA, A4 + + STFPDUX B1, X2, INCX2 + STFPDUX B2, X2, INCX2 + STFPDUX B3, X2, INCX2 + STFPDUX B4, X2, INCX2 + .align 4 + +LL(56): + andi. r0, N, 4 + beq LL(57) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + fpmul B1, ALPHA, A1 + fpmul B2, ALPHA, A2 + STFPDUX B1, X2, INCX2 + STFPDUX B2, X2, INCX2 + .align 4 + +LL(57): + andi. r0, N, 2 + beq LL(58) + + LFPDUX A1, X, INCX2 + fpmul B1, ALPHA, A1 + STFPDUX B1, X2, INCX2 + .align 4 + +LL(58): + andi. r0, N, 1 + beq LL(999) + + LFDX A1, X, INCX2 + fmul B1, ALPHA, A1 + STFDX B1, X2, INCX2 + b LL(999) + .align 4 + + +LL(100): + fcmpu cr7, ALPHA, A1 + bne cr7, LL(200) + + sub X, X, INCX + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(115) + .align 4 + +LL(112): + STFDUX A1, X, INCX + STFDUX A1, X, INCX + STFDUX A1, X, INCX + STFDUX A1, X, INCX + STFDUX A1, X, INCX + STFDUX A1, X, INCX + STFDUX A1, X, INCX + STFDUX A1, X, INCX + bdnz LL(112) + .align 4 + +LL(115): + andi. r0, N, 7 + beq LL(999) + andi. r0, N, 4 + beq LL(117) + + STFDUX A1, X, INCX + STFDUX A1, X, INCX + STFDUX A1, X, INCX + STFDUX A1, X, INCX + .align 4 + +LL(117): + andi. r0, N, 2 + beq LL(118) + + STFDUX A1, X, INCX + STFDUX A1, X, INCX + .align 4 + +LL(118): + andi. r0, N, 1 + beq LL(999) + STFDUX A1, X, INCX + b LL(999) + .align 4 + +LL(200): + sub X2, X, INCX + sub X, X, INCX + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(215) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFDUX A5, X, INCX + LFDUX A6, X, INCX + LFDUX A7, X, INCX + LFDUX A8, X, INCX + bdz LL(213) + .align 4 + +LL(212): + fmul B1, ALPHA, A1 + LFDUX A1, X, INCX + fmul B2, ALPHA, A2 + LFDUX A2, X, INCX + + fmul B3, ALPHA, A3 + LFDUX A3, X, INCX + fmul B4, ALPHA, A4 + LFDUX A4, X, INCX + + fmul B5, ALPHA, A5 + LFDUX A5, X, INCX + fmul B6, ALPHA, A6 + LFDUX A6, X, INCX + + fmul B7, ALPHA, A7 + LFDUX A7, X, INCX + fmul B8, ALPHA, A8 + LFDUX A8, X, INCX + + STFDUX B1, X2, INCX + STFDUX B2, X2, INCX + STFDUX B3, X2, INCX + STFDUX B4, X2, INCX + STFDUX B5, X2, INCX + STFDUX B6, X2, INCX + STFDUX B7, X2, INCX + STFDUX B8, X2, INCX + bdnz LL(212) + .align 4 + +LL(213): + fmul B1, ALPHA, A1 + fmul B2, ALPHA, A2 + fmul B3, ALPHA, A3 + fmul B4, ALPHA, A4 + fmul B5, ALPHA, A5 + + fmul B6, ALPHA, A6 + STFDUX B1, X2, INCX + fmul B7, ALPHA, A7 + STFDUX B2, X2, INCX + fmul B8, ALPHA, A8 + STFDUX B3, X2, INCX + STFDUX B4, X2, INCX + STFDUX B5, X2, INCX + STFDUX B6, X2, INCX + STFDUX B7, X2, INCX + STFDUX B8, X2, INCX + .align 4 + +LL(215): + andi. r0, N, 7 + beq LL(999) + andi. r0, N, 4 + beq LL(217) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + + fmul B1, ALPHA, A1 + fmul B2, ALPHA, A2 + fmul B3, ALPHA, A3 + fmul B4, ALPHA, A4 + + STFDUX B1, X2, INCX + STFDUX B2, X2, INCX + STFDUX B3, X2, INCX + STFDUX B4, X2, INCX + .align 4 + +LL(217): + andi. r0, N, 2 + beq LL(218) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + + fmul B1, ALPHA, A1 + fmul B2, ALPHA, A2 + + STFDUX B1, X2, INCX + STFDUX B2, X2, INCX + .align 4 + +LL(218): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX + fmul B1, ALPHA, A1 + STFDUX B1, X2, INCX + .align 4 + +LL(999): + li r10, 16 + + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/scal_ppc440.S b/kernel/power/scal_ppc440.S new file mode 100644 index 0000000..8b9e271 --- /dev/null +++ b/kernel/power/scal_ppc440.S @@ -0,0 +1,239 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define XX r4 +#define PRE r5 + +#ifdef linux +#ifndef __64BIT__ +#define X r6 +#define INCX r7 +#else +#define X r7 +#define INCX r8 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define X r8 +#define INCX r9 +#else +#define X r7 +#define INCX r8 +#endif +#endif + +#define FZERO f0 +#define ALPHA f1 + + PROLOGUE + PROFCODE + + addi SP, SP, -8 + li r0, 0 + + stw r0, 0(SP) + lfs FZERO, 0(SP) + + addi SP, SP, 8 + + slwi INCX, INCX, BASE_SHIFT + li PRE, 3 * 16 * SIZE + + cmpwi cr0, N, 0 + blelr- cr0 + + sub X, X, INCX + + fcmpu cr0, FZERO, ALPHA + bne- cr0, LL(A1I1) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(A0I1_Remain) + .align 4 + +LL(A0I1_kernel): +#ifdef PPCG4 + dcbtst X, PRE +#endif + + STFDUX FZERO, X, INCX + STFDUX FZERO, X, INCX + STFDUX FZERO, X, INCX + STFDUX FZERO, X, INCX + +#if defined(PPCG4) && defined(DOUBLE) + dcbtst X, PRE +#endif + + STFDUX FZERO, X, INCX + STFDUX FZERO, X, INCX + STFDUX FZERO, X, INCX + STFDUX FZERO, X, INCX + +#ifdef PPCG4 + dcbtst X, PRE +#endif + + STFDUX FZERO, X, INCX + STFDUX FZERO, X, INCX + STFDUX FZERO, X, INCX + STFDUX FZERO, X, INCX + +#if defined(PPCG4) && defined(DOUBLE) + dcbtst X, PRE +#endif + + STFDUX FZERO, X, INCX + STFDUX FZERO, X, INCX + STFDUX FZERO, X, INCX + STFDUX FZERO, X, INCX + bdnz LL(A0I1_kernel) + .align 4 + +LL(A0I1_Remain): + andi. r0, N, 15 + mtspr CTR, r0 + beqlr+ + .align 4 + +LL(A0I1_RemainKernel): + STFDUX FZERO, X, INCX + bdnz LL(A0I1_RemainKernel) + blr + .align 4 + +LL(A1I1): + mr XX, X + + srawi. r0, N, 3 + mtspr CTR, r0 + beq+ LL(A1I1_Remain) + + LFDUX f2, X, INCX + LFDUX f3, X, INCX + LFDUX f4, X, INCX + LFDUX f5, X, INCX + bdz LL(12) + .align 4 + +LL(11): + LFDUX f6, X, INCX + FMUL f2, ALPHA, f2 + LFDUX f7, X, INCX + FMUL f3, ALPHA, f3 + LFDUX f8, X, INCX + FMUL f4, ALPHA, f4 + LFDUX f9, X, INCX + FMUL f5, ALPHA, f5 + +#ifdef PPCG4 + dcbtst X, PRE +#endif + STFDUX f2, XX, INCX + STFDUX f3, XX, INCX + STFDUX f4, XX, INCX + STFDUX f5, XX, INCX + + LFDUX f2, X, INCX + FMUL f6, ALPHA, f6 + LFDUX f3, X, INCX + FMUL f7, ALPHA, f7 + LFDUX f4, X, INCX + FMUL f8, ALPHA, f8 + LFDUX f5, X, INCX + FMUL f9, ALPHA, f9 + + STFDUX f6, XX, INCX + STFDUX f7, XX, INCX + STFDUX f8, XX, INCX + STFDUX f9, XX, INCX + +#if defined(PPCG4) && defined(DOUBLE) + dcbtst X, PRE +#endif + + bdnz LL(11) + .align 4 + +LL(12): + LFDUX f6, X, INCX + FMUL f2, ALPHA, f2 + LFDUX f7, X, INCX + FMUL f3, ALPHA, f3 + LFDUX f8, X, INCX + FMUL f4, ALPHA, f4 + LFDUX f9, X, INCX + FMUL f5, ALPHA, f5 + + STFDUX f2, XX, INCX + FMUL f6, ALPHA, f6 + STFDUX f3, XX, INCX + FMUL f7, ALPHA, f7 + STFDUX f4, XX, INCX + FMUL f8, ALPHA, f8 + STFDUX f5, XX, INCX + FMUL f9, ALPHA, f9 + + STFDUX f6, XX, INCX + STFDUX f7, XX, INCX + STFDUX f8, XX, INCX + STFDUX f9, XX, INCX + .align 4 + +LL(A1I1_Remain): + andi. r0, N, 7 + mtspr CTR, r0 + beqlr+ + .align 4 + +LL(A1I1_RemainKernel): + LFDUX f2, X, INCX + FMUL f2, ALPHA, f2 + STFDUX f2, XX, INCX + bdnz LL(A1I1_RemainKernel) + blr + .align 4 + + EPILOGUE diff --git a/kernel/power/snrm2.S b/kernel/power/snrm2.S new file mode 100644 index 0000000..f235c67 --- /dev/null +++ b/kernel/power/snrm2.S @@ -0,0 +1,412 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 + +#define FZERO 144(SP) +#define FONE 148(SP) + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r10, 0 + lis r11, 0x3f80 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r10, FZERO + stw r11, FONE + + lfs f1, FZERO + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + li PREA, 4 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + fmr f6, f1 + fmr f7, f1 + fmr f8, f1 + fmr f9, f1 + fmr f10, f1 + fmr f11, f1 + fmr f12, f1 + fmr f13, f1 + fmr f14, f1 + fmr f15, f1 + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(1000) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(150) + + LFD f16, 0 * SIZE(X) + LFD f17, 1 * SIZE(X) + LFD f18, 2 * SIZE(X) + LFD f19, 3 * SIZE(X) + LFD f20, 4 * SIZE(X) + LFD f21, 5 * SIZE(X) + LFD f22, 6 * SIZE(X) + LFD f23, 7 * SIZE(X) + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + + bdz LL(120) + .align 4 + +LL(110): + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + + LFD f16, 16 * SIZE(X) + LFD f17, 17 * SIZE(X) + LFD f18, 18 * SIZE(X) + LFD f19, 19 * SIZE(X) + + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + + LFD f20, 20 * SIZE(X) + LFD f21, 21 * SIZE(X) + LFD f22, 22 * SIZE(X) + LFD f23, 23 * SIZE(X) + + fmadd f8, f24, f24, f8 + fmadd f9, f25, f25, f9 + fmadd f10, f26, f26, f10 + fmadd f11, f27, f27, f11 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fmadd f12, f28, f28, f12 + fmadd f13, f29, f29, f13 + fmadd f14, f30, f30, f14 + fmadd f15, f31, f31, f15 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(110) + .align 4 + +LL(120): + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + fmadd f8, f24, f24, f8 + fmadd f9, f25, f25, f9 + fmadd f10, f26, f26, f10 + fmadd f11, f27, f27, f11 + fmadd f12, f28, f28, f12 + fmadd f13, f29, f29, f13 + fmadd f14, f30, f30, f14 + fmadd f15, f31, f31, f15 + addi X, X, 16 * SIZE + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq- cr0, LL(170) + .align 4 + +LL(160): + LFD f16, 0 * SIZE(X) + addi X, X, 1 * SIZE + fmadd f0, f16, f16, f0 + bdnz LL(160) + .align 4 + +LL(170): + fadd f0, f0, f1 + fadd f2, f2, f3 + fadd f4, f4, f5 + fadd f6, f6, f7 + + fadd f8, f8, f9 + fadd f10, f10, f11 + fadd f12, f12, f13 + fadd f14, f14, f15 + + fadd f0, f0, f2 + fadd f4, f4, f6 + fadd f8, f8, f10 + fadd f12, f12, f14 + + fadd f0, f0, f4 + fadd f8, f8, f12 + + fadd f0, f0, f8 + + fsqrts f1, f0 + b LL(9999) + .align 4 + +LL(1000): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(1150) + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdz LL(1120) + .align 4 + +LL(1110): + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + fmadd f8, f24, f24, f8 + fmadd f9, f25, f25, f9 + fmadd f10, f26, f26, f10 + fmadd f11, f27, f27, f11 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fmadd f12, f28, f28, f12 + fmadd f13, f29, f29, f13 + fmadd f14, f30, f30, f14 + fmadd f15, f31, f31, f15 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdnz LL(1110) + .align 4 + +LL(1120): + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + + fmadd f8, f24, f24, f8 + fmadd f9, f25, f25, f9 + fmadd f10, f26, f26, f10 + fmadd f11, f27, f27, f11 + + fmadd f12, f28, f28, f12 + fmadd f13, f29, f29, f13 + fmadd f14, f30, f30, f14 + fmadd f15, f31, f31, f15 + .align 4 + +LL(1150): + andi. r0, N, 15 + mtspr CTR, r0 + beq- cr0, LL(1170) + .align 4 + +LL(1160): + LFDUX f16, X, INCX + fmadd f0, f16, f16, f0 + bdnz LL(1160) + .align 4 + +LL(1170): + fadd f0, f0, f1 + fadd f2, f2, f3 + fadd f4, f4, f5 + fadd f6, f6, f7 + + fadd f8, f8, f9 + fadd f10, f10, f11 + fadd f12, f12, f13 + fadd f14, f14, f15 + + fadd f0, f0, f2 + fadd f4, f4, f6 + fadd f8, f8, f10 + fadd f12, f12, f14 + + fadd f0, f0, f4 + fadd f8, f8, f12 + + fadd f0, f0, f8 + + fsqrts f1, f0 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/snrm2_hummer.S b/kernel/power/snrm2_hummer.S new file mode 100644 index 0000000..a002492 --- /dev/null +++ b/kernel/power/snrm2_hummer.S @@ -0,0 +1,614 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 +#define C5 f4 +#define C6 f5 +#define C7 f6 +#define C8 f7 + +#define A1 f8 +#define A2 f9 +#define A3 f10 +#define A4 f11 +#define A5 f12 +#define A6 f13 +#define A7 f14 +#define A8 f15 + +#define A9 f16 +#define A10 f17 +#define A11 f18 +#define A12 f19 +#define A13 f20 +#define A14 f21 +#define A15 f22 +#define A16 f23 + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + li r10, 0 + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + lfpdx C1, SP, r10 # Zero clear + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + fpmr C2, C1 + fpmr C3, C1 + fpmr C4, C1 + + fpmr C5, C1 + fpmr C6, C1 + fpmr C7, C1 + fpmr C8, C1 + + cmpwi cr0, N, 0 + ble LL(99) + cmpwi cr0, INCX, 0 + ble LL(99) + + cmpwi cr0, INCX, SIZE + bne LL(100) + + andi. r0, X, 2 * SIZE - 1 + beq LL(05) + + LFD C1, 0(X) + addi X, X, 1 * SIZE + addi N, N, -1 + cmpwi cr0, N, 0 + fmul C1, C1, C1 + ble LL(998) + .align 4 + +LL(05): + srawi. r0, N, 5 + sub X, X, INCX2 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + + LFPDUX A9, X, INCX2 + LFPDUX A10, X, INCX2 + LFPDUX A11, X, INCX2 + LFPDUX A12, X, INCX2 + LFPDUX A13, X, INCX2 + LFPDUX A14, X, INCX2 + LFPDUX A15, X, INCX2 + LFPDUX A16, X, INCX2 + bdz LL(13) + .align 4 + +LL(12): + fpmadd C1, A1, A1, C1 + LFPDUX A1, X, INCX2 + fpmadd C2, A2, A2, C2 + LFPDUX A2, X, INCX2 + fpmadd C3, A3, A3, C3 + LFPDUX A3, X, INCX2 + fpmadd C4, A4, A4, C4 + LFPDUX A4, X, INCX2 + + fpmadd C5, A5, A5, C5 + LFPDUX A5, X, INCX2 + fpmadd C6, A6, A6, C6 + LFPDUX A6, X, INCX2 + fpmadd C7, A7, A7, C7 + LFPDUX A7, X, INCX2 + fpmadd C8, A8, A8, C8 + LFPDUX A8, X, INCX2 + + fpmadd C1, A9, A9, C1 + LFPDUX A9, X, INCX2 + fpmadd C2, A10, A10, C2 + LFPDUX A10, X, INCX2 + fpmadd C3, A11, A11, C3 + LFPDUX A11, X, INCX2 + fpmadd C4, A12, A12, C4 + LFPDUX A12, X, INCX2 + + fpmadd C5, A13, A13, C5 + LFPDUX A13, X, INCX2 + fpmadd C6, A14, A14, C6 + LFPDUX A14, X, INCX2 + fpmadd C7, A15, A15, C7 + LFPDUX A15, X, INCX2 + fpmadd C8, A16, A16, C8 + LFPDUX A16, X, INCX2 + + bdnz LL(12) + .align 4 + +LL(13): + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + fpmadd C3, A3, A3, C3 + fpmadd C4, A4, A4, C4 + + fpmadd C5, A5, A5, C5 + fpmadd C6, A6, A6, C6 + fpmadd C7, A7, A7, C7 + fpmadd C8, A8, A8, C8 + + fpmadd C1, A9, A9, C1 + fpmadd C2, A10, A10, C2 + fpmadd C3, A11, A11, C3 + fpmadd C4, A12, A12, C4 + + fpmadd C5, A13, A13, C5 + fpmadd C6, A14, A14, C6 + fpmadd C7, A15, A15, C7 + fpmadd C8, A16, A16, C8 + .align 4 + +LL(15): + andi. r0, N, 31 + beq LL(98) + + andi. r0, N, 16 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + fpmadd C3, A3, A3, C3 + fpmadd C4, A4, A4, C4 + + fpmadd C5, A5, A5, C5 + fpmadd C6, A6, A6, C6 + fpmadd C7, A7, A7, C7 + fpmadd C8, A8, A8, C8 + .align 4 + +LL(16): + andi. r0, N, 8 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + fpmadd C3, A3, A3, C3 + fpmadd C4, A4, A4, C4 + .align 4 + +LL(17): + andi. r0, N, 4 + beq LL(18) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + .align 4 + +LL(18): + andi. r0, N, 2 + beq LL(19) + + LFPDUX A1, X, INCX2 + fpmadd C3, A1, A1, C3 + .align 4 + +LL(19): + andi. r0, N, 1 + beq LL(98) + + LFDX A1, X, INCX2 + fmadd C4, A1, A1, C4 + .align 4 + +LL(98): + fpadd C1, C1, C5 + lis r3, 0x3f00 + fpadd C2, C2, C6 + lis r4, 0x4040 + fpadd C3, C3, C7 + stw r3, 4(SP) + fpadd C4, C4, C8 + stw r4, 8(SP) + + fpadd C1, C1, C2 + fpadd C3, C3, C4 + lfs f10, 4(SP) + + fpadd C1, C1, C3 + lfs f11, 4(SP) + lfs f12, 8(SP) + + fsmtp C2, C1 + fadd C1, C2, C1 + + fcmpu cr0, f10, C1 + beq cr0, LL(99) + +#ifndef HUMMER_EMULATOR + frsqrte f9, f1 + li r10, 16 + + fmul f2, f1, f9 + lfpdux f23, SP, r10 + fmul f3, f9, f11 + lfpdux f22, SP, r10 + fnmsub f4, f2, f9, f12 + lfpdux f21, SP, r10 + fmul f9, f3, f4 + lfpdux f20, SP, r10 + fadd f13, f11, f11 + lfpdux f19, SP, r10 + fmul f12, f1, f9 + lfpdux f18, SP, r10 + fmul f11, f12, f11 + lfpdux f17, SP, r10 + fnmsub f1, f12, f9, f13 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + fmadd f1, f11, f1, f12 + blr +#else + fsqrt f1, f1 + li r10, 16 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr +#endif + + .align 4 + +LL(99): + li r10, 16 + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + .align 4 + +LL(100): + sub X2, X, INCX + sub X, X, INCX2 + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(115) + + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + + LFDUX A5, X, INCX2 + LFDUX A6, X2, INCX2 + LFDUX A7, X, INCX2 + LFDUX A8, X2, INCX2 + + LFDUX A9, X, INCX2 + LFDUX A10, X2, INCX2 + LFDUX A11, X, INCX2 + LFDUX A12, X2, INCX2 + + LFDUX A13, X, INCX2 + LFDUX A14, X2, INCX2 + LFDUX A15, X, INCX2 + LFDUX A16, X2, INCX2 + bdz LL(113) + .align 4 + +LL(112): + fmadd C1, A1, A1, C1 + LFDUX A1, X, INCX2 + fmadd C2, A2, A2, C2 + LFDUX A2, X2, INCX2 + fmadd C3, A3, A3, C3 + LFDUX A3, X, INCX2 + fmadd C4, A4, A4, C4 + LFDUX A4, X2, INCX2 + + fmadd C5, A5, A5, C5 + LFDUX A5, X, INCX2 + fmadd C6, A6, A6, C6 + LFDUX A6, X2, INCX2 + fmadd C7, A7, A7, C7 + LFDUX A7, X, INCX2 + fmadd C8, A8, A8, C8 + LFDUX A8, X2, INCX2 + + fmadd C1, A9, A9, C1 + LFDUX A9, X, INCX2 + fmadd C2, A10, A10, C2 + LFDUX A10, X2, INCX2 + fmadd C3, A11, A11, C3 + LFDUX A11, X, INCX2 + fmadd C4, A12, A12, C4 + LFDUX A12, X2, INCX2 + + fmadd C5, A13, A13, C5 + LFDUX A13, X, INCX2 + fmadd C6, A14, A14, C6 + LFDUX A14, X2, INCX2 + fmadd C7, A15, A15, C7 + LFDUX A15, X, INCX2 + fmadd C8, A16, A16, C8 + LFDUX A16, X2, INCX2 + + bdnz LL(112) + .align 4 + +LL(113): + fmadd C1, A1, A1, C1 + fmadd C2, A2, A2, C2 + fmadd C3, A3, A3, C3 + fmadd C4, A4, A4, C4 + + fmadd C5, A5, A5, C5 + fmadd C6, A6, A6, C6 + fmadd C7, A7, A7, C7 + fmadd C8, A8, A8, C8 + + fmadd C1, A9, A9, C1 + fmadd C2, A10, A10, C2 + fmadd C3, A11, A11, C3 + fmadd C4, A12, A12, C4 + + fmadd C5, A13, A13, C5 + fmadd C6, A14, A14, C6 + fmadd C7, A15, A15, C7 + fmadd C8, A16, A16, C8 + .align 4 + +LL(115): + andi. r0, N, 15 + beq LL(998) + andi. r0, N, 8 + beq LL(116) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + + LFDUX A5, X, INCX2 + LFDUX A6, X2, INCX2 + LFDUX A7, X, INCX2 + LFDUX A8, X2, INCX2 + + fmadd C1, A1, A1, C1 + fmadd C2, A2, A2, C2 + fmadd C3, A3, A3, C3 + fmadd C4, A4, A4, C4 + + fmadd C5, A5, A5, C5 + fmadd C6, A6, A6, C6 + fmadd C7, A7, A7, C7 + fmadd C8, A8, A8, C8 + .align 4 + +LL(116): + andi. r0, N, 4 + beq LL(117) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + + fmadd C1, A1, A1, C1 + fmadd C2, A2, A2, C2 + fmadd C3, A3, A3, C3 + fmadd C4, A4, A4, C4 + .align 4 + +LL(117): + andi. r0, N, 2 + beq LL(118) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + + fmadd C1, A1, A1, C1 + fmadd C2, A2, A2, C2 + .align 4 + +LL(118): + andi. r0, N, 1 + beq LL(998) + + LFDX A1, X, INCX2 + fmadd C1, A1, A1, C1 + .align 4 + +LL(998): + fadd C1, C1, C5 + lis r3, 0x3f00 + fadd C2, C2, C6 + lis r4, 0x4040 + fadd C3, C3, C7 + stw r3, 4(SP) + fadd C4, C4, C8 + stw r4, 8(SP) + + fadd C1, C1, C2 + lfs f10, 0(SP) + fadd C3, C3, C4 + lfs f11, 4(SP) + lfs f12, 8(SP) + + fadd C1, C1, C3 + + fcmpu cr0, f10, C1 + beq cr0, LL(999) + +#ifndef HUMMER_EMULATOR + frsqrte f9, f1 + li r10, 16 + + fmul f2, f1, f9 + lfpdux f23, SP, r10 + fmul f3, f9, f11 + lfpdux f22, SP, r10 + fnmsub f4, f2, f9, f12 + lfpdux f21, SP, r10 + fmul f9, f3, f4 + lfpdux f20, SP, r10 + fadd f13, f11, f11 + lfpdux f19, SP, r10 + fmul f12, f1, f9 + lfpdux f18, SP, r10 + fmul f11, f12, f11 + lfpdux f17, SP, r10 + fnmsub f1, f12, f9, f13 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + fmadd f1, f11, f1, f12 + blr +#else + fsqrt f1, f1 + + li r10, 16 + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr +#endif + .align 4 + +LL(999): + li r10, 16 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/snrm2_ppc440.S b/kernel/power/snrm2_ppc440.S new file mode 100644 index 0000000..ffda99e --- /dev/null +++ b/kernel/power/snrm2_ppc440.S @@ -0,0 +1,301 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PRE r8 + +#define FZERO 144(SP) +#define FONE 148(SP) +#define C1 152(SP) +#define C2 156(SP) + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + + li r10, 0 + lis r11, 0x3f80 + lis r6, 0x3f00 + lis r7, 0x4040 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r10, FZERO + stw r11, FONE + stw r6, C1 + stw r7, C2 + + lfs f1, FZERO + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + li PRE, 3 * 16 * SIZE + + sub X, X, INCX + + cmpwi cr0, N, 0 + ble- LL(999) + cmpwi cr0, INCX, 0 + ble- LL(999) + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + fmr f6, f1 + fmr f7, f1 + fmr f8, f1 + fmr f9, f1 + fmr f10, f1 + fmr f11, f1 + fmr f12, f1 + fmr f13, f1 + fmr f14, f1 + fmr f15, f1 + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdz LL(20) + .align 4 + +LL(10): + fmadd f0, f16, f16, f0 + LFDUX f16, X, INCX + fmadd f1, f17, f17, f1 + LFDUX f17, X, INCX + fmadd f2, f18, f18, f2 + LFDUX f18, X, INCX + fmadd f3, f19, f19, f3 + LFDUX f19, X, INCX + +#ifdef PPCG4 + dcbt X, PRE +#endif + + fmadd f4, f20, f20, f4 + LFDUX f20, X, INCX + fmadd f5, f21, f21, f5 + LFDUX f21, X, INCX + fmadd f6, f22, f22, f6 + LFDUX f22, X, INCX + fmadd f7, f23, f23, f7 + LFDUX f23, X, INCX + + fmadd f8, f24, f24, f8 + LFDUX f24, X, INCX + fmadd f9, f25, f25, f9 + LFDUX f25, X, INCX + fmadd f10, f26, f26, f10 + LFDUX f26, X, INCX + fmadd f11, f27, f27, f11 + LFDUX f27, X, INCX + +#ifdef PPCG4 + dcbt X, PRE +#endif + + fmadd f12, f28, f28, f12 + LFDUX f28, X, INCX + fmadd f13, f29, f29, f13 + LFDUX f29, X, INCX + fmadd f14, f30, f30, f14 + LFDUX f30, X, INCX + fmadd f15, f31, f31, f15 + LFDUX f31, X, INCX + + bdnz LL(10) + .align 4 + +LL(20): + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + + fmadd f8, f24, f24, f8 + fmadd f9, f25, f25, f9 + fmadd f10, f26, f26, f10 + fmadd f11, f27, f27, f11 + + fmadd f12, f28, f28, f12 + fmadd f13, f29, f29, f13 + fmadd f14, f30, f30, f14 + fmadd f15, f31, f31, f15 + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq- cr0, LL(70) + .align 4 + +LL(60): + LFDUX f16, X, INCX + fmadd f0, f16, f16, f0 + bdnz LL(60) + .align 4 + +LL(70): + fadd f0, f0, f1 + fadd f2, f2, f3 + fadd f4, f4, f5 + fadd f6, f6, f7 + + fadd f8, f8, f9 + fadd f10, f10, f11 + fadd f12, f12, f13 + fadd f14, f14, f15 + + fadd f0, f0, f2 + fadd f4, f4, f6 + fadd f8, f8, f10 + fadd f12, f12, f14 + + fadd f0, f0, f4 + fadd f8, f8, f12 + + fadd f1, f0, f8 + lfs f4, FZERO + + fcmpu cr0, f1, f4 + beq cr0, LL(999) + + frsqrte f0, f1 + lfs f8, C1 + lfs f9, C2 + + fmul f2, f1, f0 + fadd f7, f8, f8 + fmul f3, f0, f8 + fnmsub f4, f2, f0, f9 + fmul f0, f3, f4 + + fmul f5, f1, f0 + fmul f2, f5, f8 + fnmsub f3, f5, f0, f7 + fmadd f1, f2, f3, f5 + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/staticbuffer.S b/kernel/power/staticbuffer.S new file mode 100644 index 0000000..7bbd23d --- /dev/null +++ b/kernel/power/staticbuffer.S @@ -0,0 +1,45 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ALLOC_STATIC + .align 8 + .comm alloc_area, (NUM_BUFFERS * BUFFER_SIZE), 16384 +#endif diff --git a/kernel/power/swap.S b/kernel/power/swap.S new file mode 100644 index 0000000..a0d150f --- /dev/null +++ b/kernel/power/swap.S @@ -0,0 +1,387 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define N r3 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 +#define PREA r4 +#define XX r10 +#define YY r11 +#else +#define N r3 +#define X r7 +#define INCX r8 +#define Y r9 +#define INCY r10 +#define PREA r4 +#define XX r5 +#define YY r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define N r3 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r4 +#define PREA r5 +#define XX r6 +#define YY r11 +#else +#define N r3 +#define X r7 +#define INCX r8 +#define Y r9 +#define INCY r10 +#define PREA r4 +#define XX r5 +#define YY r6 +#endif +#endif + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#if (defined(_AIX) || defined(__APPLE__)) && !defined(__64BIT__) && defined(DOUBLE) + lwz INCY, 56 + STACKSIZE(SP) +#endif + + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + +#ifdef L1_DUALFETCH + li PREA, (L1_PREFETCHSIZE) / 2 +#else + li PREA, (L1_PREFETCHSIZE) +#endif + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + cmpwi cr0, INCY, SIZE + bne- cr0, LL(100) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + +LL(10): + LFD f0, 0 * SIZE(X) + LFD f1, 1 * SIZE(X) + LFD f2, 2 * SIZE(X) + LFD f3, 3 * SIZE(X) + + LFD f16, 0 * SIZE(Y) + LFD f17, 1 * SIZE(Y) + LFD f18, 2 * SIZE(Y) + LFD f19, 3 * SIZE(Y) + + LFD f4, 4 * SIZE(X) + LFD f5, 5 * SIZE(X) + LFD f6, 6 * SIZE(X) + LFD f7, 7 * SIZE(X) + + LFD f20, 4 * SIZE(Y) + LFD f21, 5 * SIZE(Y) + LFD f22, 6 * SIZE(Y) + LFD f23, 7 * SIZE(Y) + + LFD f8, 8 * SIZE(X) + LFD f9, 9 * SIZE(X) + LFD f10, 10 * SIZE(X) + LFD f11, 11 * SIZE(X) + + LFD f24, 8 * SIZE(Y) + LFD f25, 9 * SIZE(Y) + LFD f26, 10 * SIZE(Y) + LFD f27, 11 * SIZE(Y) + + LFD f12, 12 * SIZE(X) + LFD f13, 13 * SIZE(X) + LFD f14, 14 * SIZE(X) + LFD f15, 15 * SIZE(X) + + LFD f28, 12 * SIZE(Y) + LFD f29, 13 * SIZE(Y) + LFD f30, 14 * SIZE(Y) + LFD f31, 15 * SIZE(Y) + + STFD f16, 0 * SIZE(X) + STFD f17, 1 * SIZE(X) + STFD f18, 2 * SIZE(X) + STFD f19, 3 * SIZE(X) + + STFD f0, 0 * SIZE(Y) + STFD f1, 1 * SIZE(Y) + STFD f2, 2 * SIZE(Y) + STFD f3, 3 * SIZE(Y) + + STFD f20, 4 * SIZE(X) + STFD f21, 5 * SIZE(X) + STFD f22, 6 * SIZE(X) + STFD f23, 7 * SIZE(X) + + STFD f4, 4 * SIZE(Y) + STFD f5, 5 * SIZE(Y) + STFD f6, 6 * SIZE(Y) + STFD f7, 7 * SIZE(Y) + + STFD f24, 8 * SIZE(X) + STFD f25, 9 * SIZE(X) + STFD f26, 10 * SIZE(X) + STFD f27, 11 * SIZE(X) + + STFD f8, 8 * SIZE(Y) + STFD f9, 9 * SIZE(Y) + STFD f10, 10 * SIZE(Y) + STFD f11, 11 * SIZE(Y) + + STFD f28, 12 * SIZE(X) + STFD f29, 13 * SIZE(X) + STFD f30, 14 * SIZE(X) + STFD f31, 15 * SIZE(X) + + STFD f12, 12 * SIZE(Y) + STFD f13, 13 * SIZE(Y) + STFD f14, 14 * SIZE(Y) + STFD f15, 15 * SIZE(Y) + + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + dcbtst X, PREA +#ifdef L1_DUALFETCH + dcbtst Y, PREA +#endif + bdnz LL(10) + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f9, 0 * SIZE(Y) + + STFD f9, 0 * SIZE(X) + STFD f8, 0 * SIZE(Y) + + addi X, X, 1 * SIZE + addi Y, Y, 1 * SIZE + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + sub Y, Y, INCY + + mr XX, X + mr YY, Y + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + .align 4 + +LL(110): + LFDUX f0, X, INCX + LFDUX f1, X, INCX + LFDUX f2, X, INCX + LFDUX f3, X, INCX + + LFDUX f16, Y, INCY + LFDUX f17, Y, INCY + LFDUX f18, Y, INCY + LFDUX f19, Y, INCY + + LFDUX f4, X, INCX + LFDUX f5, X, INCX + LFDUX f6, X, INCX + LFDUX f7, X, INCX + + LFDUX f20, Y, INCY + LFDUX f21, Y, INCY + LFDUX f22, Y, INCY + LFDUX f23, Y, INCY + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + LFDUX f10, X, INCX + LFDUX f11, X, INCX + + LFDUX f24, Y, INCY + LFDUX f25, Y, INCY + LFDUX f26, Y, INCY + LFDUX f27, Y, INCY + + LFDUX f12, X, INCX + LFDUX f13, X, INCX + LFDUX f14, X, INCX + LFDUX f15, X, INCX + + LFDUX f28, Y, INCY + LFDUX f29, Y, INCY + LFDUX f30, Y, INCY + LFDUX f31, Y, INCY + + STFDUX f16, XX, INCX + STFDUX f17, XX, INCX + STFDUX f18, XX, INCX + STFDUX f19, XX, INCX + + STFDUX f0, YY, INCY + STFDUX f1, YY, INCY + STFDUX f2, YY, INCY + STFDUX f3, YY, INCY + + STFDUX f20, XX, INCX + STFDUX f21, XX, INCX + STFDUX f22, XX, INCX + STFDUX f23, XX, INCX + + STFDUX f4, YY, INCY + STFDUX f5, YY, INCY + STFDUX f6, YY, INCY + STFDUX f7, YY, INCY + + STFDUX f24, XX, INCX + STFDUX f25, XX, INCX + STFDUX f26, XX, INCX + STFDUX f27, XX, INCX + + STFDUX f8, YY, INCY + STFDUX f9, YY, INCY + STFDUX f10, YY, INCY + STFDUX f11, YY, INCY + + STFDUX f28, XX, INCX + STFDUX f29, XX, INCX + STFDUX f30, XX, INCX + STFDUX f31, XX, INCX + + STFDUX f12, YY, INCY + STFDUX f13, YY, INCY + STFDUX f14, YY, INCY + STFDUX f15, YY, INCY + bdnz LL(110) + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + LFDUX f9, Y, INCY + STFDUX f9, XX, INCX + STFDUX f8, YY, INCY + bdnz LL(160) + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/swap_hummer.S b/kernel/power/swap_hummer.S new file mode 100644 index 0000000..293a28b --- /dev/null +++ b/kernel/power/swap_hummer.S @@ -0,0 +1,703 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 + +#define INCX2 r4 +#define INCY2 r5 +#define X2 r10 +#define Y2 r11 + +#define A1 f0 +#define A2 f1 +#define A3 f2 +#define A4 f3 +#define A5 f4 + +#define B1 f5 +#define B2 f6 +#define B3 f7 +#define B4 f8 +#define B5 f9 + +#define T1 f10 +#define T2 f11 +#define T3 f12 +#define T4 f13 +#define T5 f14 +#define T6 f15 +#define T7 f16 + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + stfpdux f16, SP, r10 + + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + add INCX2, INCX, INCX + add INCY2, INCY, INCY + + cmpwi cr0, N, 0 + ble LL(999) + + cmpwi cr0, INCX, SIZE + bne LL(100) + cmpwi cr0, INCY, SIZE + bne LL(100) + + sub X, X, INCX2 + sub Y, Y, INCY2 + + mr X2, X + mr Y2, Y + + andi. r0, X, 2 * SIZE - 1 + bne LL(30) + andi. r0, Y, 2 * SIZE - 1 + bne LL(20) + .align 4 + +LL(10): /* X : aligned Y : aligned */ + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + bdz LL(13) + .align 4 + +LL(12): + STFPDUX B1, X2, INCY2 + LFPDUX B1, Y, INCY2 + STFPDUX A1, Y2, INCY2 + LFPDUX A1, X, INCX2 + + STFPDUX B2, X2, INCY2 + LFPDUX B2, Y, INCY2 + STFPDUX A2, Y2, INCY2 + LFPDUX A2, X, INCX2 + + STFPDUX B3, X2, INCY2 + LFPDUX B3, Y, INCY2 + STFPDUX A3, Y2, INCY2 + LFPDUX A3, X, INCX2 + + STFPDUX B4, X2, INCY2 + LFPDUX B4, Y, INCY2 + STFPDUX A4, Y2, INCY2 + LFPDUX A4, X, INCX2 + bdnz LL(12) + .align 4 + +LL(13): + STFPDUX B1, X2, INCY2 + STFPDUX A1, Y2, INCY2 + STFPDUX B2, X2, INCY2 + STFPDUX A2, Y2, INCY2 + STFPDUX B3, X2, INCY2 + STFPDUX A3, Y2, INCY2 + STFPDUX B4, X2, INCY2 + STFPDUX A4, Y2, INCY2 + .align 4 + +LL(15): + andi. r0, N, 7 + beq LL(999) + + andi. r0, N, 4 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + + STFPDUX B1, X2, INCY2 + STFPDUX A1, Y2, INCY2 + STFPDUX B2, X2, INCY2 + STFPDUX A2, Y2, INCY2 + .align 4 + +LL(16): + andi. r0, N, 2 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + + STFPDUX B1, X2, INCY2 + STFPDUX A1, Y2, INCY2 + .align 4 + +LL(17): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + LFDUX B1, Y, INCY2 + + STFDUX B1, X2, INCY2 + STFDUX A1, Y2, INCY2 + b LL(999) + .align 4 + +LL(20): /* X : aligned Y : unaligned */ + + LFXDUX A1, X, INCX2 + LFDX B1, Y, INCY2 + + STFSDX A1, Y2, INCY2 + + add Y, Y, INCY + add Y2, Y2, INCY + + addi N, N, -1 + cmpwi cr0, N, 0 + ble LL(29) + .align 4 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(25) + + LFXDUX T1, X, INCX2 + LFXDUX T2, Y, INCY2 + LFXDUX T3, X, INCX2 + LFXDUX T4, Y, INCY2 + + LFPDUX A4, X, INCX2 + fsmr A1, T1 + LFPDUX B4, Y, INCY2 + fsmr B1, T2 + LFPDUX A5, X, INCX2 + fsmr T1, T3 + LFPDUX B5, Y, INCY2 + fsmr T2, T4 + bdz LL(23) + .align 4 + +LL(22): + fxmr T5, A4 + STFPDUX A1, Y2, INCY2 + fxmr T6, B4 + STFPDUX B1, X2, INCX2 + fxmr A1, A5 + STFPDUX T1, Y2, INCY2 + fxmr B1, B5 + STFPDUX T2, X2, INCX2 + + fsmr T3, T5 + LFPDUX A2, X, INCX2 + fsmr T4, T6 + LFPDUX B2, Y, INCY2 + fsmr T5, A1 + LFPDUX A3, X, INCX2 + fsmr T6, B1 + LFPDUX B3, Y, INCY2 + + fxmr T1, A2 + STFPDUX T3, Y2, INCY2 + fxmr T2, B2 + STFPDUX T4, X2, INCX2 + fxmr T3, A3 + STFPDUX T5, Y2, INCY2 + fxmr T4, B3 + STFPDUX T6, X2, INCX2 + + fsmr A1, T1 + LFPDUX A4, X, INCX2 + fsmr B1, T2 + LFPDUX B4, Y, INCY2 + fsmr T1, T3 + LFPDUX A5, X, INCX2 + fsmr T2, T4 + LFPDUX B5, Y, INCY2 + bdnz LL(22) + .align 4 + +LL(23): + fxmr T5, A4 + STFPDUX A1, Y2, INCY2 + fxmr T6, B4 + STFPDUX B1, X2, INCX2 + fxmr A1, A5 + STFPDUX T1, Y2, INCY2 + fxmr B1, B5 + STFPDUX T2, X2, INCX2 + + fsmr T3, T5 + fsmr T4, T6 + fsmr T5, A1 + fsmr T6, B1 + + STFPDUX T3, Y2, INCY2 + STFPDUX T4, X2, INCX2 + STFPDUX T5, Y2, INCY2 + STFPDUX T6, X2, INCX2 + .align 4 + +LL(25): + andi. r0, N, 7 + beq LL(29) + + andi. r0, N, 4 + beq LL(27) + + LFXDUX A2, X, INCX2 + LFXDUX B2, Y, INCY2 + LFXDUX A3, X, INCX2 + LFXDUX B3, Y, INCY2 + + fsmr A1, A2 + fsmr B1, B2 + fsmr A2, A3 + fsmr B2, B3 + + STFPDUX A1, Y2, INCY2 + STFPDUX B1, X2, INCX2 + STFPDUX A2, Y2, INCY2 + fpmr A1, A3 + STFPDUX B2, X2, INCX2 + fpmr B1, B3 + .align 4 + +LL(27): + andi. r0, N, 2 + beq LL(28) + + LFXDUX A2, X, INCX2 + LFXDUX B2, Y, INCY2 + fsmr A1, A2 + fsmr B1, B2 + STFPDUX A1, Y2, INCY2 + fpmr A1, A2 + STFPDUX B1, X2, INCX2 + fpmr B1, B2 + .align 4 + +LL(28): + andi. r0, N, 1 + beq LL(29) + + LFSDX B1, Y, INCY2 + STFDX A1, Y2, INCY2 + STFDX B1, X2, INCX2 + add X2, X2, INCX + fsmtp B1, B1 + .align 4 + +LL(29): + STFDX B1, X2, INCX2 + b LL(999) + .align 4 + + +LL(30): /* X : unaligned Y : aligned */ + + andi. r0, Y, 2 * SIZE - 1 + bne LL(40) + + LFXDUX A1, Y, INCY2 + LFDX B1, X, INCX2 + + STFSDX A1, X2, INCX2 + + add X, X, INCX + add X2, X2, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + ble LL(39) + .align 4 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(35) + + LFXDUX T1, Y, INCY2 + LFXDUX T2, X, INCX2 + LFXDUX T3, Y, INCY2 + LFXDUX T4, X, INCX2 + + LFPDUX A4, Y, INCY2 + fsmr A1, T1 + LFPDUX B4, X, INCX2 + fsmr B1, T2 + LFPDUX A5, Y, INCY2 + fsmr T1, T3 + LFPDUX B5, X, INCX2 + fsmr T2, T4 + bdz LL(33) + .align 4 + +LL(32): + fxmr T5, A4 + STFPDUX A1, X2, INCX2 + fxmr T6, B4 + STFPDUX B1, Y2, INCY2 + fxmr A1, A5 + STFPDUX T1, X2, INCX2 + fxmr B1, B5 + STFPDUX T2, Y2, INCY2 + + fsmr T3, T5 + LFPDUX A2, Y, INCY2 + fsmr T4, T6 + LFPDUX B2, X, INCX2 + fsmr T5, A1 + LFPDUX A3, Y, INCY2 + fsmr T6, B1 + LFPDUX B3, X, INCX2 + + fxmr T1, A2 + STFPDUX T3, X2, INCX2 + fxmr T2, B2 + STFPDUX T4, Y2, INCY2 + fxmr T3, A3 + STFPDUX T5, X2, INCX2 + fxmr T4, B3 + STFPDUX T6, Y2, INCY2 + + fsmr A1, T1 + LFPDUX A4, Y, INCY2 + fsmr B1, T2 + LFPDUX B4, X, INCX2 + fsmr T1, T3 + LFPDUX A5, Y, INCY2 + fsmr T2, T4 + LFPDUX B5, X, INCX2 + bdnz LL(32) + .align 4 + +LL(33): + fxmr T5, A4 + STFPDUX A1, X2, INCX2 + fxmr T6, B4 + STFPDUX B1, Y2, INCY2 + fxmr A1, A5 + STFPDUX T1, X2, INCX2 + fxmr B1, B5 + STFPDUX T2, Y2, INCY2 + + fsmr T3, T5 + fsmr T4, T6 + fsmr T5, A1 + fsmr T6, B1 + + STFPDUX T3, X2, INCX2 + STFPDUX T4, Y2, INCY2 + STFPDUX T5, X2, INCX2 + STFPDUX T6, Y2, INCY2 + .align 4 + +LL(35): + andi. r0, N, 7 + beq LL(39) + + andi. r0, N, 4 + beq LL(37) + + LFXDUX A2, Y, INCY2 + LFXDUX B2, X, INCX2 + LFXDUX A3, Y, INCY2 + LFXDUX B3, X, INCX2 + + fsmr A1, A2 + fsmr B1, B2 + fsmr A2, A3 + fsmr B2, B3 + + STFPDUX A1, X2, INCX2 + STFPDUX B1, Y2, INCY2 + STFPDUX A2, X2, INCX2 + fpmr A1, A3 + STFPDUX B2, Y2, INCY2 + fpmr B1, B3 + .align 4 + +LL(37): + andi. r0, N, 2 + beq LL(38) + + LFXDUX A2, Y, INCY2 + LFXDUX B2, X, INCX2 + fsmr A1, A2 + fsmr B1, B2 + STFPDUX A1, X2, INCX2 + fpmr A1, A2 + STFPDUX B1, Y2, INCY2 + fpmr B1, B2 + .align 4 + +LL(38): + andi. r0, N, 1 + beq LL(39) + + LFSDX B1, X, INCX2 + STFDX A1, X2, INCX2 + STFDX B1, Y2, INCY2 + add Y2, Y2, INCY + fsmtp B1, B1 + .align 4 + +LL(39): + STFDX B1, Y2, INCY2 + b LL(999) + .align 4 + +LL(40): /* X : unaligned Y : unaligned */ + + LFDX A1, Y, INCY2 + LFDX B1, X, INCX2 + add X, X, INCX + add Y, Y, INCY + + addi N, N, -1 + cmpwi cr0, N, 0 + + STFDX A1, X2, INCX2 + STFDX B1, Y2, INCY2 + add X2, X2, INCX + add Y2, Y2, INCY + ble LL(999) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(45) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + bdz LL(43) + .align 4 + +LL(42): + STFPDUX B1, X2, INCY2 + LFPDUX B1, Y, INCY2 + STFPDUX A1, Y2, INCY2 + LFPDUX A1, X, INCX2 + + STFPDUX B2, X2, INCY2 + LFPDUX B2, Y, INCY2 + STFPDUX A2, Y2, INCY2 + LFPDUX A2, X, INCX2 + + STFPDUX B3, X2, INCY2 + LFPDUX B3, Y, INCY2 + STFPDUX A3, Y2, INCY2 + LFPDUX A3, X, INCX2 + + STFPDUX B4, X2, INCY2 + LFPDUX B4, Y, INCY2 + STFPDUX A4, Y2, INCY2 + LFPDUX A4, X, INCX2 + bdnz LL(42) + .align 4 + +LL(43): + STFPDUX B1, X2, INCY2 + STFPDUX A1, Y2, INCY2 + STFPDUX B2, X2, INCY2 + STFPDUX A2, Y2, INCY2 + STFPDUX B3, X2, INCY2 + STFPDUX A3, Y2, INCY2 + STFPDUX B4, X2, INCY2 + STFPDUX A4, Y2, INCY2 + .align 4 + +LL(45): + andi. r0, N, 7 + beq LL(999) + + andi. r0, N, 4 + beq LL(46) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + + STFPDUX B1, X2, INCY2 + STFPDUX A1, Y2, INCY2 + STFPDUX B2, X2, INCY2 + STFPDUX A2, Y2, INCY2 + .align 4 + +LL(46): + andi. r0, N, 2 + beq LL(47) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + + STFPDUX B1, X2, INCY2 + STFPDUX A1, Y2, INCY2 + .align 4 + +LL(47): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + LFDUX B1, Y, INCY2 + + STFDUX B1, X2, INCY2 + STFDUX A1, Y2, INCY2 + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + sub Y, Y, INCY + + mr X2, X + mr Y2, Y + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(115) + + LFDUX A1, X, INCX + LFDUX B1, Y, INCY + LFDUX A2, X, INCX + LFDUX B2, Y, INCY + LFDUX A3, X, INCX + LFDUX B3, Y, INCY + LFDUX A4, X, INCX + LFDUX B4, Y, INCY + bdz LL(113) + .align 4 + +LL(112): + STFDUX B1, X2, INCX + LFDUX B1, Y, INCY + STFDUX A1, Y2, INCY + LFDUX A1, X, INCX + + STFDUX B2, X2, INCX + LFDUX B2, Y, INCY + STFDUX A2, Y2, INCY + LFDUX A2, X, INCX + + STFDUX B3, X2, INCX + LFDUX B3, Y, INCY + STFDUX A3, Y2, INCY + LFDUX A3, X, INCX + + STFDUX B4, X2, INCX + LFDUX B4, Y, INCY + STFDUX A4, Y2, INCY + LFDUX A4, X, INCX + bdnz LL(112) + .align 4 + +LL(113): + STFDUX B1, X2, INCX + STFDUX A1, Y2, INCY + STFDUX B2, X2, INCX + STFDUX A2, Y2, INCY + + STFDUX B3, X2, INCX + STFDUX A3, Y2, INCY + STFDUX B4, X2, INCX + STFDUX A4, Y2, INCY + .align 4 + +LL(115): + andi. r0, N, 3 + beq LL(999) + andi. r0, N, 2 + beq LL(117) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX B1, Y, INCY + LFDUX B2, Y, INCY + + STFDUX B1, X2, INCX + STFDUX B2, X2, INCX + STFDUX A1, Y2, INCY + STFDUX A2, Y2, INCY + .align 4 + +LL(117): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX + LFDUX B1, Y, INCY + STFDUX B1, X2, INCX + STFDUX A1, Y2, INCY + .align 4 + +LL(999): + li r10, 16 + addi SP, SP, -16 + + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/symv_L.S b/kernel/power/symv_L.S new file mode 100644 index 0000000..91bfb5e --- /dev/null +++ b/kernel/power/symv_L.S @@ -0,0 +1,1521 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define X r7 +#define INCX r8 +#define Y r9 +#define INCY r10 +#define BUFFER r14 +#else +#define M r3 +#define N r4 +#define A r6 +#define LDA r7 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#define BUFFER r14 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define M r3 +#define N r4 +#define A r7 +#define LDA r8 +#define X r9 +#define INCX r10 +#define Y r5 +#define INCY r6 +#define BUFFER r14 +#else +#define M r3 +#define N r4 +#define A r6 +#define LDA r7 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#define BUFFER r14 +#endif +#endif + +#define I r11 +#define J r12 + +#define AO1 r15 +#define AO2 r16 +#define AO3 r17 +#define AO4 r18 +#define XX r19 +#define YY r20 +#define NEW_Y r21 +#define TEMP r22 +#define PREA r24 +#define IS r25 + +#define y01 f0 +#define y02 f1 +#define y03 f2 +#define y04 f3 + +#define atemp1 f4 +#define atemp2 f5 +#define atemp3 f6 +#define atemp4 f7 + +#define xtemp1 f8 +#define xtemp2 f9 +#define xtemp3 f10 +#define xtemp4 f11 + +#define xsum1 f12 +#define xsum2 f13 +#define xsum3 f14 +#define xsum4 f15 + +#define a1 f16 +#define a2 f17 +#define a3 f18 +#define a4 f19 +#define a5 f20 +#define a6 f21 +#define a7 f22 +#define a8 f23 +#define a9 f24 +#define a10 f25 +#define a11 f26 +#define a12 f27 +#define a13 f28 +#define a14 f29 +#define a15 f30 +#define a16 f31 + +#define alpha f1 + +#if defined(PPCG4) +#define PREFETCHSIZE_A 24 +#endif + +#if defined(PPC440) || defined(PPC440FP2) +#define PREFETCHSIZE_A 24 +#endif + +#ifdef PPC970 +#define PREFETCHSIZE_A 64 +#endif + +#ifdef CELL +#define PREFETCHSIZE_A 72 +#endif + +#ifdef POWER4 +#define PREFETCHSIZE_A 16 +#endif + +#ifdef POWER5 +#define PREFETCHSIZE_A 96 +#endif + +#ifdef POWER6 +#define PREFETCHSIZE_A 40 +#endif + +#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) +#define NOP1 +#define NOP2 +#else +#define NOP1 mr LDA, LDA +#define NOP2 mr INCX, INCX +#endif + +#ifndef NEEDPARAM + +#ifndef __64BIT__ +#define STACKSIZE 224 +#define ALPHA 200(SP) +#define FZERO 208(SP) +#else +#define STACKSIZE 280 +#define ALPHA 256(SP) +#define FZERO 264(SP) +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r0, FZERO + std r14, 144(SP) + std r15, 152(SP) + std r16, 160(SP) + std r17, 168(SP) + std r18, 176(SP) + std r19, 184(SP) + std r20, 192(SP) + std r21, 200(SP) + std r22, 208(SP) + std r23, 216(SP) + std r24, 224(SP) + std r25, 232(SP) + std r26, 240(SP) + std r27, 248(SP) +#else + stw r0, 0 + FZERO + stw r0, 4 + FZERO + stw r14, 144(SP) + stw r15, 148(SP) + stw r16, 152(SP) + stw r17, 156(SP) + stw r18, 160(SP) + stw r19, 164(SP) + stw r20, 168(SP) + stw r21, 172(SP) + stw r22, 176(SP) + stw r23, 180(SP) + stw r24, 184(SP) + stw r25, 188(SP) + stw r26, 192(SP) + stw r27, 196(SP) +#endif + +#ifdef linux +#ifndef __64BIT__ + lwz BUFFER, 56 + STACKSIZE(SP) +#else + ld INCY, 112 + STACKSIZE(SP) + ld BUFFER, 120 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifndef __64BIT__ +#ifdef DOUBLE + lwz Y, 56 + STACKSIZE(SP) + lwz INCY, 60 + STACKSIZE(SP) + lwz BUFFER, 64 + STACKSIZE(SP) +#else + lwz INCY, 56 + STACKSIZE(SP) + lwz BUFFER, 60 + STACKSIZE(SP) +#endif +#else + ld INCY, 112 + STACKSIZE(SP) + ld BUFFER, 120 + STACKSIZE(SP) +#endif +#endif + + STFD alpha, ALPHA + + slwi LDA, LDA, BASE_SHIFT + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + li PREA, PREFETCHSIZE_A * SIZE + + cmpwi cr0, M, 0 + ble- LL(999) + + cmpwi cr0, INCX, SIZE + beq LL(05) + + mr XX, X + mr X, BUFFER + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(03) + .align 4 + +LL(01): + LFD a1, 0 * SIZE(XX) + add XX, XX, INCX + LFD a2, 0 * SIZE(XX) + add XX, XX, INCX + LFD a3, 0 * SIZE(XX) + add XX, XX, INCX + LFD a4, 0 * SIZE(XX) + add XX, XX, INCX + LFD a5, 0 * SIZE(XX) + add XX, XX, INCX + LFD a6, 0 * SIZE(XX) + add XX, XX, INCX + LFD a7, 0 * SIZE(XX) + add XX, XX, INCX + LFD a8, 0 * SIZE(XX) + add XX, XX, INCX + + dcbt XX, PREA + dcbtst BUFFER, PREA + + STFD a1, 0 * SIZE(BUFFER) + STFD a2, 1 * SIZE(BUFFER) + STFD a3, 2 * SIZE(BUFFER) + STFD a4, 3 * SIZE(BUFFER) + STFD a5, 4 * SIZE(BUFFER) + STFD a6, 5 * SIZE(BUFFER) + STFD a7, 6 * SIZE(BUFFER) + STFD a8, 7 * SIZE(BUFFER) + + addi BUFFER, BUFFER, 8 * SIZE + bdnz LL(01) + .align 4 + +LL(03): + andi. r0, M, 7 + mtspr CTR, r0 + ble LL(05) + .align 4 + +LL(04): + LFD a1, 0 * SIZE(XX) + add XX, XX, INCX + + STFD a1, 0 * SIZE(BUFFER) + addi BUFFER, BUFFER, 1 * SIZE + bdnz LL(04) + .align 4 + +LL(05): + mr NEW_Y, Y + lfd f0, FZERO + + cmpwi cr0, INCY, SIZE + beq LL(10) + + mr NEW_Y, BUFFER + + addi r0, M, 7 + srawi. r0, r0, 3 + mtspr CTR, r0 + .align 4 + +LL(06): + STFD f0, 0 * SIZE(BUFFER) + STFD f0, 1 * SIZE(BUFFER) + STFD f0, 2 * SIZE(BUFFER) + STFD f0, 3 * SIZE(BUFFER) + STFD f0, 4 * SIZE(BUFFER) + STFD f0, 5 * SIZE(BUFFER) + STFD f0, 6 * SIZE(BUFFER) + STFD f0, 7 * SIZE(BUFFER) + addi BUFFER, BUFFER, 8 * SIZE + bdnz LL(06) + .align 4 + +LL(10): + li IS, 0 + + cmpwi cr0, N, 4 + blt LL(20) + .align 4 + +LL(11): + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + addi A, A, 4 * SIZE + + slwi TEMP, IS, BASE_SHIFT + add XX, X, TEMP + add YY, NEW_Y, TEMP + + LFD atemp1, 0 * SIZE(XX) + LFD atemp2, 1 * SIZE(XX) + LFD atemp3, 2 * SIZE(XX) + LFD atemp4, 3 * SIZE(XX) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD a6, 1 * SIZE(AO2) + LFD a7, 2 * SIZE(AO2) + LFD a8, 3 * SIZE(AO2) + + LFD a11, 2 * SIZE(AO3) + LFD a12, 3 * SIZE(AO3) + + LFD a16, 3 * SIZE(AO4) + + LFD a5, ALPHA + + FMUL xsum1, atemp1, a1 + FMUL xsum2, atemp1, a2 + FMUL xsum3, atemp1, a3 + FMUL xsum4, atemp1, a4 + + FMADD xsum1, atemp2, a2, xsum1 + FMADD xsum2, atemp2, a6, xsum2 + FMADD xsum3, atemp2, a7, xsum3 + FMADD xsum4, atemp2, a8, xsum4 + + FMADD xsum1, atemp3, a3, xsum1 + FMADD xsum2, atemp3, a7, xsum2 + FMADD xsum3, atemp3, a11, xsum3 + FMADD xsum4, atemp3, a12, xsum4 + + FMADD xsum1, atemp4, a4, xsum1 + FMADD xsum2, atemp4, a8, xsum2 + FMADD xsum3, atemp4, a12, xsum3 + FMADD xsum4, atemp4, a16, xsum4 + + FMUL atemp1, a5, atemp1 + FMUL atemp2, a5, atemp2 + FMUL atemp3, a5, atemp3 + FMUL atemp4, a5, atemp4 + + LFD xtemp1, 4 * SIZE(XX) + LFD xtemp2, 5 * SIZE(XX) + LFD xtemp3, 6 * SIZE(XX) + LFD xtemp4, 7 * SIZE(XX) + + LFD y01, 4 * SIZE(YY) + LFD y02, 5 * SIZE(YY) + LFD y03, 6 * SIZE(YY) + LFD y04, 7 * SIZE(YY) + + LFD a1, 4 * SIZE(AO1) + LFD a2, 5 * SIZE(AO1) + LFD a3, 6 * SIZE(AO1) + LFD a4, 7 * SIZE(AO1) + + LFD a5, 4 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + LFD a9, 4 * SIZE(AO3) + LFD a10, 5 * SIZE(AO3) + LFD a11, 6 * SIZE(AO3) + LFD a12, 7 * SIZE(AO3) + + LFD a13, 4 * SIZE(AO4) + LFD a14, 5 * SIZE(AO4) + LFD a15, 6 * SIZE(AO4) + LFD a16, 7 * SIZE(AO4) + + addi AO1, AO1, 4 * SIZE + addi AO2, AO2, 4 * SIZE + addi AO3, AO3, 4 * SIZE + addi AO4, AO4, 4 * SIZE + + addi XX, XX, 4 * SIZE + addi YY, YY, 4 * SIZE + + sub TEMP, M, IS + addi TEMP, TEMP, -4 + srawi. r0, TEMP, 4 + mtspr CTR, r0 + ble LL(14) + .align 4 + +LL(12): + FMADD xsum1, xtemp1, a1, xsum1 + DCBT(AO1, PREA) + FMADD y01, atemp1, a1, y01 + LFD a1, 4 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + NOP2 + + FMADD xsum3, xtemp1, a9, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 4 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp2, a2, xsum1 + LFD a2, 5 * SIZE(AO1) + FMADD y01, atemp2, a5, y01 + LFD a5, 4 * SIZE(AO2) + + FMADD xsum2, xtemp2, a6, xsum2 + NOP1 + FMADD y02, atemp2, a6, y02 + LFD a6, 5 * SIZE(AO2) + + FMADD xsum3, xtemp2, a10, xsum3 + NOP1 + FMADD y03, atemp2, a7, y03 + NOP2 + + FMADD xsum4, xtemp2, a14, xsum4 + LFD xtemp2, 5 * SIZE(XX) + FMADD y04, atemp2, a8, y04 +# DCBT(X, PREX) + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD a3, 6 * SIZE(AO1) + FMADD y01, atemp3, a9, y01 + LFD a9, 4 * SIZE(AO3) + + FMADD xsum2, xtemp3, a7, xsum2 + LFD a7, 6 * SIZE(AO2) + FMADD y02, atemp3, a10, y02 + LFD a10, 5 * SIZE(AO3) + + FMADD xsum3, xtemp3, a11, xsum3 + NOP1 + FMADD y03, atemp3, a11, y03 + LFD a11, 6 * SIZE(AO3) + + FMADD xsum4, xtemp3, a15, xsum4 + LFD xtemp3, 6 * SIZE(XX) + FMADD y04, atemp3, a12, y04 + NOP2 + + FMADD xsum1, xtemp4, a4, xsum1 + LFD a4, 7 * SIZE(AO1) + FMADD y01, atemp4, a13, y01 + LFD a13, 4 * SIZE(AO4) + + FMADD xsum2, xtemp4, a8, xsum2 + LFD a8, 7 * SIZE(AO2) + FMADD y02, atemp4, a14, y02 + LFD a14, 5 * SIZE(AO4) + + FMADD xsum3, xtemp4, a12, xsum3 + LFD a12, 7 * SIZE(AO3) + FMADD y03, atemp4, a15, y03 + LFD a15, 6 * SIZE(AO4) + + FMADD xsum4, xtemp4, a16, xsum4 + LFD xtemp4, 7 * SIZE(XX) + FMADD y04, atemp4, a16, y04 + LFD a16, 7 * SIZE(AO4) + + STFD y01, 0 * SIZE(YY) + LFD y01, 4 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + LFD y02, 5 * SIZE(YY) + + STFD y03, 2 * SIZE(YY) + LFD y03, 6 * SIZE(YY) + STFD y04, 3 * SIZE(YY) + LFD y04, 7 * SIZE(YY) + + FMADD xsum1, xtemp1, a1, xsum1 + DCBT(AO2, PREA) + FMADD y01, atemp1, a1, y01 + LFD a1, 8 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + NOP2 + + FMADD xsum3, xtemp1, a9, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 8 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp2, a2, xsum1 + LFD a2, 9 * SIZE(AO1) + FMADD y01, atemp2, a5, y01 + LFD a5, 8 * SIZE(AO2) + + FMADD xsum2, xtemp2, a6, xsum2 + NOP1 + FMADD y02, atemp2, a6, y02 + LFD a6, 9 * SIZE(AO2) + + FMADD xsum3, xtemp2, a10, xsum3 + NOP1 + FMADD y03, atemp2, a7, y03 + NOP2 + + FMADD xsum4, xtemp2, a14, xsum4 + LFD xtemp2, 9 * SIZE(XX) + FMADD y04, atemp2, a8, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD a3, 10 * SIZE(AO1) + FMADD y01, atemp3, a9, y01 + LFD a9, 8 * SIZE(AO3) + + FMADD xsum2, xtemp3, a7, xsum2 + LFD a7, 10 * SIZE(AO2) + FMADD y02, atemp3, a10, y02 + LFD a10, 9 * SIZE(AO3) + + FMADD xsum3, xtemp3, a11, xsum3 + NOP1 + FMADD y03, atemp3, a11, y03 + LFD a11, 10 * SIZE(AO3) + + FMADD xsum4, xtemp3, a15, xsum4 + LFD xtemp3, 10 * SIZE(XX) + FMADD y04, atemp3, a12, y04 + NOP2 + + FMADD xsum1, xtemp4, a4, xsum1 + LFD a4, 11 * SIZE(AO1) + FMADD y01, atemp4, a13, y01 + LFD a13, 8 * SIZE(AO4) + + FMADD xsum2, xtemp4, a8, xsum2 + LFD a8, 11 * SIZE(AO2) + FMADD y02, atemp4, a14, y02 + LFD a14, 9 * SIZE(AO4) + + FMADD xsum3, xtemp4, a12, xsum3 + LFD a12, 11 * SIZE(AO3) + FMADD y03, atemp4, a15, y03 + LFD a15, 10 * SIZE(AO4) + + FMADD xsum4, xtemp4, a16, xsum4 + LFD xtemp4, 11 * SIZE(XX) + FMADD y04, atemp4, a16, y04 + LFD a16, 11 * SIZE(AO4) + + STFD y01, 4 * SIZE(YY) + LFD y01, 8 * SIZE(YY) + STFD y02, 5 * SIZE(YY) + LFD y02, 9 * SIZE(YY) + + STFD y03, 6 * SIZE(YY) + LFD y03, 10 * SIZE(YY) + STFD y04, 7 * SIZE(YY) + LFD y04, 11 * SIZE(YY) + + + FMADD xsum1, xtemp1, a1, xsum1 + DCBT(AO3, PREA) + FMADD y01, atemp1, a1, y01 + LFD a1, 12 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + NOP2 + + FMADD xsum3, xtemp1, a9, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 12 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp2, a2, xsum1 + LFD a2, 13 * SIZE(AO1) + FMADD y01, atemp2, a5, y01 + LFD a5, 12 * SIZE(AO2) + + FMADD xsum2, xtemp2, a6, xsum2 + NOP1 + FMADD y02, atemp2, a6, y02 + LFD a6, 13 * SIZE(AO2) + + FMADD xsum3, xtemp2, a10, xsum3 + NOP1 + FMADD y03, atemp2, a7, y03 +# DCBT(Y1, PREY) + NOP2 + + FMADD xsum4, xtemp2, a14, xsum4 + LFD xtemp2, 13 * SIZE(XX) + FMADD y04, atemp2, a8, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD a3, 14 * SIZE(AO1) + FMADD y01, atemp3, a9, y01 + LFD a9, 12 * SIZE(AO3) + + FMADD xsum2, xtemp3, a7, xsum2 + LFD a7, 14 * SIZE(AO2) + FMADD y02, atemp3, a10, y02 + LFD a10,13 * SIZE(AO3) + + FMADD xsum3, xtemp3, a11, xsum3 + NOP1 + FMADD y03, atemp3, a11, y03 + LFD a11, 14 * SIZE(AO3) + + FMADD xsum4, xtemp3, a15, xsum4 + LFD xtemp3, 14 * SIZE(XX) + FMADD y04, atemp3, a12, y04 + NOP2 + + FMADD xsum1, xtemp4, a4, xsum1 + LFD a4, 15 * SIZE(AO1) + FMADD y01, atemp4, a13, y01 + LFD a13,12 * SIZE(AO4) + + FMADD xsum2, xtemp4, a8, xsum2 + LFD a8, 15 * SIZE(AO2) + FMADD y02, atemp4, a14, y02 + LFD a14, 13 * SIZE(AO4) + + FMADD xsum3, xtemp4, a12, xsum3 + LFD a12, 15 * SIZE(AO3) + FMADD y03, atemp4, a15, y03 + LFD a15, 14 * SIZE(AO4) + + FMADD xsum4, xtemp4, a16, xsum4 + LFD xtemp4, 15 * SIZE(XX) + FMADD y04, atemp4, a16, y04 + LFD a16, 15 * SIZE(AO4) + + STFD y01, 8 * SIZE(YY) + LFD y01, 12 * SIZE(YY) + STFD y02, 9 * SIZE(YY) + LFD y02, 13 * SIZE(YY) + + STFD y03, 10 * SIZE(YY) + LFD y03, 14 * SIZE(YY) + STFD y04, 11 * SIZE(YY) + LFD y04, 15 * SIZE(YY) + + FMADD xsum1, xtemp1, a1, xsum1 + DCBT(AO4, PREA) + FMADD y01, atemp1, a1, y01 + LFD a1, 16 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + NOP2 + + FMADD xsum3, xtemp1, a9, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 16 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + addi YY, YY, 16 * SIZE + + FMADD xsum1, xtemp2, a2, xsum1 + LFD a2, 17 * SIZE(AO1) + FMADD y01, atemp2, a5, y01 + LFD a5, 16 * SIZE(AO2) + + FMADD xsum2, xtemp2, a6, xsum2 + addi AO3, AO3, 16 * SIZE + FMADD y02, atemp2, a6, y02 + LFD a6, 17 * SIZE(AO2) + + FMADD xsum3, xtemp2, a10, xsum3 + addi AO1, AO1, 16 * SIZE + FMADD y03, atemp2, a7, y03 + addi AO2, AO2, 16 * SIZE + + FMADD xsum4, xtemp2, a14, xsum4 + LFD xtemp2, 17 * SIZE(XX) + FMADD y04, atemp2, a8, y04 + addi AO4, AO4, 16 * SIZE + + FMADD xsum1, xtemp3, a3, xsum1 + LFD a3, 2 * SIZE(AO1) + FMADD y01, atemp3, a9, y01 + LFD a9, 0 * SIZE(AO3) + + FMADD xsum2, xtemp3, a7, xsum2 + LFD a7, 2 * SIZE(AO2) + FMADD y02, atemp3, a10, y02 + LFD a10, 1 * SIZE(AO3) + + FMADD xsum3, xtemp3, a11, xsum3 + NOP1 + FMADD y03, atemp3, a11, y03 + LFD a11, 2 * SIZE(AO3) + + FMADD xsum4, xtemp3, a15, xsum4 + LFD xtemp3, 18 * SIZE(XX) + FMADD y04, atemp3, a12, y04 + addi XX, XX, 16 * SIZE + + FMADD xsum1, xtemp4, a4, xsum1 + LFD a4, 3 * SIZE(AO1) + FMADD y01, atemp4, a13, y01 + LFD a13, 0 * SIZE(AO4) + + FMADD xsum2, xtemp4, a8, xsum2 + LFD a8, 3 * SIZE(AO2) + FMADD y02, atemp4, a14, y02 + LFD a14, 1 * SIZE(AO4) + + FMADD xsum3, xtemp4, a12, xsum3 + LFD a12, 3 * SIZE(AO3) + FMADD y03, atemp4, a15, y03 + LFD a15, 2 * SIZE(AO4) + + FMADD xsum4, xtemp4, a16, xsum4 + LFD xtemp4, 3 * SIZE(XX) + FMADD y04, atemp4, a16, y04 + LFD a16, 3 * SIZE(AO4) + + STFD y01, -4 * SIZE(YY) + LFD y01, 0 * SIZE(YY) + STFD y02, -3 * SIZE(YY) + LFD y02, 1 * SIZE(YY) + + STFD y03, -2 * SIZE(YY) + LFD y03, 2 * SIZE(YY) + STFD y04, -1 * SIZE(YY) + LFD y04, 3 * SIZE(YY) + bdnz LL(12) + .align 4 + +LL(14): + sub TEMP, M, IS + addi TEMP, TEMP, -4 + andi. r0, TEMP, 8 + ble LL(15) + + FMADD xsum1, xtemp1, a1, xsum1 + NOP1 + FMADD y01, atemp1, a1, y01 + LFD a1, 4 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + NOP2 + + FMADD xsum3, xtemp1, a9, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 4 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp2, a2, xsum1 + LFD a2, 5 * SIZE(AO1) + FMADD y01, atemp2, a5, y01 + LFD a5, 4 * SIZE(AO2) + + FMADD xsum2, xtemp2, a6, xsum2 + NOP1 + FMADD y02, atemp2, a6, y02 + LFD a6, 5 * SIZE(AO2) + + FMADD xsum3, xtemp2, a10, xsum3 + NOP1 + FMADD y03, atemp2, a7, y03 + NOP2 + + FMADD xsum4, xtemp2, a14, xsum4 + LFD xtemp2, 5 * SIZE(XX) + FMADD y04, atemp2, a8, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD a3, 6 * SIZE(AO1) + FMADD y01, atemp3, a9, y01 + LFD a9, 4 * SIZE(AO3) + + FMADD xsum2, xtemp3, a7, xsum2 + LFD a7, 6 * SIZE(AO2) + FMADD y02, atemp3, a10, y02 + LFD a10, 5 * SIZE(AO3) + + FMADD xsum3, xtemp3, a11, xsum3 + NOP1 + FMADD y03, atemp3, a11, y03 + LFD a11, 6 * SIZE(AO3) + + FMADD xsum4, xtemp3, a15, xsum4 + LFD xtemp3, 6 * SIZE(XX) + FMADD y04, atemp3, a12, y04 + NOP2 + + FMADD xsum1, xtemp4, a4, xsum1 + LFD a4, 7 * SIZE(AO1) + FMADD y01, atemp4, a13, y01 + LFD a13, 4 * SIZE(AO4) + + FMADD xsum2, xtemp4, a8, xsum2 + LFD a8, 7 * SIZE(AO2) + FMADD y02, atemp4, a14, y02 + LFD a14, 5 * SIZE(AO4) + + FMADD xsum3, xtemp4, a12, xsum3 + LFD a12, 7 * SIZE(AO3) + FMADD y03, atemp4, a15, y03 + LFD a15, 6 * SIZE(AO4) + + FMADD xsum4, xtemp4, a16, xsum4 + LFD xtemp4, 7 * SIZE(XX) + FMADD y04, atemp4, a16, y04 + LFD a16, 7 * SIZE(AO4) + + STFD y01, 0 * SIZE(YY) + LFD y01, 4 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + LFD y02, 5 * SIZE(YY) + + STFD y03, 2 * SIZE(YY) + LFD y03, 6 * SIZE(YY) + STFD y04, 3 * SIZE(YY) + LFD y04, 7 * SIZE(YY) + + FMADD xsum1, xtemp1, a1, xsum1 + NOP1 + FMADD y01, atemp1, a1, y01 + LFD a1, 8 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + NOP2 + + FMADD xsum3, xtemp1, a9, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 8 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp2, a2, xsum1 + LFD a2, 9 * SIZE(AO1) + FMADD y01, atemp2, a5, y01 + LFD a5, 8 * SIZE(AO2) + + FMADD xsum2, xtemp2, a6, xsum2 + NOP1 + FMADD y02, atemp2, a6, y02 + LFD a6, 9 * SIZE(AO2) + + FMADD xsum3, xtemp2, a10, xsum3 + NOP1 + FMADD y03, atemp2, a7, y03 + NOP2 + + FMADD xsum4, xtemp2, a14, xsum4 + LFD xtemp2, 9 * SIZE(XX) + FMADD y04, atemp2, a8, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD a3, 10 * SIZE(AO1) + FMADD y01, atemp3, a9, y01 + LFD a9, 8 * SIZE(AO3) + + FMADD xsum2, xtemp3, a7, xsum2 + LFD a7, 10 * SIZE(AO2) + FMADD y02, atemp3, a10, y02 + LFD a10, 9 * SIZE(AO3) + + FMADD xsum3, xtemp3, a11, xsum3 + NOP1 + FMADD y03, atemp3, a11, y03 + LFD a11, 10 * SIZE(AO3) + + FMADD xsum4, xtemp3, a15, xsum4 + LFD xtemp3, 10 * SIZE(XX) + FMADD y04, atemp3, a12, y04 + NOP2 + + FMADD xsum1, xtemp4, a4, xsum1 + LFD a4, 11 * SIZE(AO1) + FMADD y01, atemp4, a13, y01 + LFD a13, 8 * SIZE(AO4) + + FMADD xsum2, xtemp4, a8, xsum2 + LFD a8, 11 * SIZE(AO2) + FMADD y02, atemp4, a14, y02 + LFD a14, 9 * SIZE(AO4) + + FMADD xsum3, xtemp4, a12, xsum3 + LFD a12, 11 * SIZE(AO3) + FMADD y03, atemp4, a15, y03 + LFD a15, 10 * SIZE(AO4) + + FMADD xsum4, xtemp4, a16, xsum4 + LFD xtemp4, 11 * SIZE(XX) + FMADD y04, atemp4, a16, y04 + LFD a16, 11 * SIZE(AO4) + + addi AO1, AO1, 8 * SIZE + addi AO2, AO2, 8 * SIZE + addi AO3, AO3, 8 * SIZE + addi AO4, AO4, 8 * SIZE + + STFD y01, 4 * SIZE(YY) + LFD y01, 8 * SIZE(YY) + STFD y02, 5 * SIZE(YY) + LFD y02, 9 * SIZE(YY) + + STFD y03, 6 * SIZE(YY) + LFD y03, 10 * SIZE(YY) + STFD y04, 7 * SIZE(YY) + LFD y04, 11 * SIZE(YY) + + addi XX, XX, 8 * SIZE + addi YY, YY, 8 * SIZE + .align 4 + +LL(15): + sub TEMP, M, IS + addi TEMP, TEMP, -4 + andi. r0, TEMP, 4 + ble LL(16) + + FMADD xsum1, xtemp1, a1, xsum1 + NOP1 + FMADD y01, atemp1, a1, y01 + LFD a1, 4 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + NOP2 + + FMADD xsum3, xtemp1, a9, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 4 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp2, a2, xsum1 + LFD a2, 5 * SIZE(AO1) + FMADD y01, atemp2, a5, y01 + LFD a5, 4 * SIZE(AO2) + + FMADD xsum2, xtemp2, a6, xsum2 + NOP1 + FMADD y02, atemp2, a6, y02 + LFD a6, 5 * SIZE(AO2) + + FMADD xsum3, xtemp2, a10, xsum3 + NOP1 + FMADD y03, atemp2, a7, y03 + NOP2 + + FMADD xsum4, xtemp2, a14, xsum4 + LFD xtemp2, 5 * SIZE(XX) + FMADD y04, atemp2, a8, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD a3, 6 * SIZE(AO1) + FMADD y01, atemp3, a9, y01 + LFD a9, 4 * SIZE(AO3) + + FMADD xsum2, xtemp3, a7, xsum2 + LFD a7, 6 * SIZE(AO2) + FMADD y02, atemp3, a10, y02 + LFD a10, 5 * SIZE(AO3) + + FMADD xsum3, xtemp3, a11, xsum3 + NOP1 + FMADD y03, atemp3, a11, y03 + LFD a11, 6 * SIZE(AO3) + + FMADD xsum4, xtemp3, a15, xsum4 + LFD xtemp3, 6 * SIZE(XX) + FMADD y04, atemp3, a12, y04 + NOP2 + + FMADD xsum1, xtemp4, a4, xsum1 + LFD a4, 7 * SIZE(AO1) + FMADD y01, atemp4, a13, y01 + LFD a13, 4 * SIZE(AO4) + + FMADD xsum2, xtemp4, a8, xsum2 + LFD a8, 7 * SIZE(AO2) + FMADD y02, atemp4, a14, y02 + LFD a14, 5 * SIZE(AO4) + + FMADD xsum3, xtemp4, a12, xsum3 + LFD a12, 7 * SIZE(AO3) + FMADD y03, atemp4, a15, y03 + LFD a15, 6 * SIZE(AO4) + + FMADD xsum4, xtemp4, a16, xsum4 + LFD xtemp4, 7 * SIZE(XX) + FMADD y04, atemp4, a16, y04 + LFD a16, 7 * SIZE(AO4) + + addi AO1, AO1, 4 * SIZE + addi AO2, AO2, 4 * SIZE + addi AO3, AO3, 4 * SIZE + addi AO4, AO4, 4 * SIZE + + STFD y01, 0 * SIZE(YY) + LFD y01, 4 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + LFD y02, 5 * SIZE(YY) + + STFD y03, 2 * SIZE(YY) + LFD y03, 6 * SIZE(YY) + STFD y04, 3 * SIZE(YY) + LFD y04, 7 * SIZE(YY) + + addi XX, XX, 4 * SIZE + addi YY, YY, 4 * SIZE + .align 4 + +LL(16): + andi. r0, M, 2 + ble LL(17) + + FMADD xsum1, xtemp1, a1, xsum1 + FMADD y01, atemp1, a1, y01 + LFD a1, 2 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + FMADD y02, atemp1, a2, y02 + + FMADD xsum3, xtemp1, a9, xsum3 + FMADD y01, atemp2, a5, y01 + LFD a5, 2 * SIZE(AO2) + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 2 * SIZE(XX) + FMADD y02, atemp2, a6, y02 + + FMADD xsum1, xtemp2, a2, xsum1 + FMADD y01, atemp3, a9, y01 + LFD a9, 2 * SIZE(AO3) + + FMADD xsum2, xtemp2, a6, xsum2 + FMADD y02, atemp3, a10, y02 + + FMADD xsum3, xtemp2, a10, xsum3 + FMADD y01, atemp4, a13, y01 + LFD a13, 2 * SIZE(AO4) + + FMADD xsum4, xtemp2, a14, xsum4 + FMADD y02, atemp4, a14, y02 + + STFD y01, 0 * SIZE(YY) + LFD y01, 2 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + addi YY, YY, 2 * SIZE + .align 4 + +LL(17): + andi. r0, M, 1 + ble LL(18) + + FMADD xsum1, xtemp1, a1, xsum1 + FMADD y01, atemp1, a1, y01 + FMADD xsum2, xtemp1, a5, xsum2 + FMADD y01, atemp2, a5, y01 + FMADD xsum3, xtemp1, a9, xsum3 + FMADD y01, atemp3, a9, y01 + FMADD xsum4, xtemp1, a13, xsum4 + FMADD y01, atemp4, a13, y01 + + STFD y01, 0 * SIZE(YY) + .align 4 + +LL(18): + slwi TEMP, IS, BASE_SHIFT + add YY, NEW_Y, TEMP + + LFD y01, 0 * SIZE(YY) + LFD y02, 1 * SIZE(YY) + LFD y03, 2 * SIZE(YY) + LFD y04, 3 * SIZE(YY) + + LFD xtemp1, ALPHA + + FMUL xsum1, xtemp1, xsum1 + FMUL xsum2, xtemp1, xsum2 + FMUL xsum3, xtemp1, xsum3 + FMUL xsum4, xtemp1, xsum4 + + FADD y01, y01, xsum1 + FADD y02, y02, xsum2 + FADD y03, y03, xsum3 + FADD y04, y04, xsum4 + + STFD y01, 0 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + STFD y03, 2 * SIZE(YY) + STFD y04, 3 * SIZE(YY) + + addi TEMP, IS, 8 + addi IS, IS, 4 + cmpw cr0, TEMP, N + ble LL(11) + .align 4 + +LL(20): + andi. TEMP, N, 2 + ble LL(30) + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + addi A, A, 2 * SIZE + + slwi TEMP, IS, BASE_SHIFT + add XX, X, TEMP + add YY, NEW_Y, TEMP + + LFD atemp1, 0 * SIZE(XX) + LFD atemp2, 1 * SIZE(XX) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a6, 1 * SIZE(AO2) + + LFD a5, ALPHA + + FMUL xsum1, atemp1, a1 + FMUL xsum2, atemp1, a2 + + FMADD xsum1, atemp2, a2, xsum1 + FMADD xsum2, atemp2, a6, xsum2 + + FMUL atemp1, a5, atemp1 + FMUL atemp2, a5, atemp2 + + LFD xtemp1, 2 * SIZE(XX) + LFD y01, 2 * SIZE(YY) + LFD a1, 2 * SIZE(AO1) + LFD a5, 2 * SIZE(AO2) + + andi. r0, M, 1 + ble LL(28) + + FMADD xsum1, xtemp1, a1, xsum1 + FMADD y01, atemp1, a1, y01 + FMADD xsum2, xtemp1, a5, xsum2 + FMADD y01, atemp2, a5, y01 + + STFD y01, 2 * SIZE(YY) + .align 4 + +LL(28): + slwi TEMP, IS, BASE_SHIFT + add YY, NEW_Y, TEMP + + LFD y01, 0 * SIZE(YY) + LFD y02, 1 * SIZE(YY) + + LFD xtemp1, ALPHA + + FMUL xsum1, xtemp1, xsum1 + FMUL xsum2, xtemp1, xsum2 + + FADD y01, y01, xsum1 + FADD y02, y02, xsum2 + + STFD y01, 0 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + + addi IS, IS, 2 + .align 4 + +LL(30): + andi. TEMP, N, 1 + ble LL(990) + + mr AO1, A + + slwi TEMP, IS, BASE_SHIFT + add XX, X, TEMP + add YY, NEW_Y, TEMP + + LFD atemp1, 0 * SIZE(XX) + LFD a1, 0 * SIZE(AO1) + LFD xtemp1, ALPHA + LFD y01, 0 * SIZE(YY) + + FMUL xsum1, atemp1, a1 + FMUL xsum1, xtemp1, xsum1 + + FADD y01, y01, xsum1 + + STFD y01, 0 * SIZE(YY) + .align 4 + +LL(990): + cmpwi cr0, INCY, SIZE + beq LL(999) + + mr YY, Y + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(995) + .align 4 + +LL(991): + LFD f0, 0 * SIZE(Y) + add Y, Y, INCY + LFD f1, 0 * SIZE(Y) + add Y, Y, INCY + LFD f2, 0 * SIZE(Y) + add Y, Y, INCY + LFD f3, 0 * SIZE(Y) + add Y, Y, INCY + LFD f4, 0 * SIZE(Y) + add Y, Y, INCY + LFD f5, 0 * SIZE(Y) + add Y, Y, INCY + LFD f6, 0 * SIZE(Y) + add Y, Y, INCY + LFD f7, 0 * SIZE(Y) + add Y, Y, INCY + + LFD f8, 0 * SIZE(NEW_Y) + LFD f9, 1 * SIZE(NEW_Y) + LFD f10, 2 * SIZE(NEW_Y) + LFD f11, 3 * SIZE(NEW_Y) + LFD f12, 4 * SIZE(NEW_Y) + LFD f13, 5 * SIZE(NEW_Y) + LFD f14, 6 * SIZE(NEW_Y) + LFD f15, 7 * SIZE(NEW_Y) + addi NEW_Y, NEW_Y, 8 * SIZE + + FADD f8, f8, f0 + FADD f9, f9, f1 + FADD f10, f10, f2 + FADD f11, f11, f3 + FADD f12, f12, f4 + FADD f13, f13, f5 + FADD f14, f14, f6 + FADD f15, f15, f7 + + STFD f8, 0 * SIZE(YY) + add YY, YY, INCY + STFD f9, 0 * SIZE(YY) + add YY, YY, INCY + STFD f10, 0 * SIZE(YY) + add YY, YY, INCY + STFD f11, 0 * SIZE(YY) + add YY, YY, INCY + STFD f12, 0 * SIZE(YY) + add YY, YY, INCY + STFD f13, 0 * SIZE(YY) + add YY, YY, INCY + STFD f14, 0 * SIZE(YY) + add YY, YY, INCY + STFD f15, 0 * SIZE(YY) + add YY, YY, INCY + bdnz LL(991) + .align 4 + +LL(995): + andi. J, M, 4 + ble LL(996) + + LFD f0, 0 * SIZE(Y) + add Y, Y, INCY + LFD f1, 0 * SIZE(Y) + add Y, Y, INCY + LFD f2, 0 * SIZE(Y) + add Y, Y, INCY + LFD f3, 0 * SIZE(Y) + add Y, Y, INCY + + LFD f8, 0 * SIZE(NEW_Y) + LFD f9, 1 * SIZE(NEW_Y) + LFD f10, 2 * SIZE(NEW_Y) + LFD f11, 3 * SIZE(NEW_Y) + addi NEW_Y, NEW_Y, 4 * SIZE + + FADD f8, f8, f0 + FADD f9, f9, f1 + FADD f10, f10, f2 + FADD f11, f11, f3 + + STFD f8, 0 * SIZE(YY) + add YY, YY, INCY + STFD f9, 0 * SIZE(YY) + add YY, YY, INCY + STFD f10, 0 * SIZE(YY) + add YY, YY, INCY + STFD f11, 0 * SIZE(YY) + add YY, YY, INCY + .align 4 + +LL(996): + andi. J, M, 2 + ble LL(997) + + LFD f0, 0 * SIZE(Y) + add Y, Y, INCY + LFD f1, 0 * SIZE(Y) + add Y, Y, INCY + + LFD f8, 0 * SIZE(NEW_Y) + LFD f9, 1 * SIZE(NEW_Y) + addi NEW_Y, NEW_Y, 2 * SIZE + + FADD f8, f8, f0 + FADD f9, f9, f1 + + STFD f8, 0 * SIZE(YY) + add YY, YY, INCY + STFD f9, 0 * SIZE(YY) + add YY, YY, INCY + .align 4 + +LL(997): + andi. J, M, 1 + ble LL(999) + + LFD f0, 0 * SIZE(Y) + LFD f8, 0 * SIZE(NEW_Y) + + FADD f8, f8, f0 + + STFD f8, 0 * SIZE(YY) + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r14, 144(SP) + ld r15, 152(SP) + ld r16, 160(SP) + ld r17, 168(SP) + ld r18, 176(SP) + ld r19, 184(SP) + ld r20, 192(SP) + ld r21, 200(SP) + ld r22, 208(SP) + ld r23, 216(SP) + ld r24, 224(SP) + ld r25, 232(SP) + ld r26, 240(SP) + ld r27, 248(SP) +#else + lwz r14, 144(SP) + lwz r15, 148(SP) + lwz r16, 152(SP) + lwz r17, 156(SP) + lwz r18, 160(SP) + lwz r19, 164(SP) + lwz r20, 168(SP) + lwz r21, 172(SP) + lwz r22, 176(SP) + lwz r23, 180(SP) + lwz r24, 184(SP) + lwz r25, 188(SP) + lwz r26, 192(SP) + lwz r27, 196(SP) +#endif + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/symv_U.S b/kernel/power/symv_U.S new file mode 100644 index 0000000..76cbd64 --- /dev/null +++ b/kernel/power/symv_U.S @@ -0,0 +1,1506 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define M r3 +#define IS r4 +#define A r5 +#define LDA r6 +#define X r7 +#define INCX r8 +#define Y r9 +#define INCY r10 +#define BUFFER r14 +#else +#define M r3 +#define IS r4 +#define A r6 +#define LDA r7 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#define BUFFER r14 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define M r3 +#define IS r4 +#define A r7 +#define LDA r8 +#define X r9 +#define INCX r10 +#define Y r5 +#define INCY r6 +#define BUFFER r14 +#else +#define M r3 +#define IS r4 +#define A r6 +#define LDA r7 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#define BUFFER r14 +#endif +#endif + +#define I r11 +#define J r12 + +#define AO1 r15 +#define AO2 r16 +#define AO3 r17 +#define AO4 r18 +#define XX r19 +#define YY r20 +#define NEW_Y r21 +#define TEMP r22 +#define PREA r24 + +#define y01 f0 +#define y02 f1 +#define y03 f2 +#define y04 f3 + +#define atemp1 f4 +#define atemp2 f5 +#define atemp3 f6 +#define atemp4 f7 + +#define xtemp1 f8 +#define xtemp2 f9 +#define xtemp3 f10 +#define xtemp4 f11 + +#define xsum1 f12 +#define xsum2 f13 +#define xsum3 f14 +#define xsum4 f15 + +#define a1 f16 +#define a2 f17 +#define a3 f18 +#define a4 f19 +#define a5 f20 +#define a6 f21 +#define a7 f22 +#define a8 f23 +#define a9 f24 +#define a10 f25 +#define a11 f26 +#define a12 f27 +#define a13 f28 +#define a14 f29 +#define a15 f30 +#define a16 f31 + +#define alpha f1 + +#if defined(PPCG4) +#define PREFETCHSIZE_A 24 +#endif + +#if defined(PPC440) || defined(PPC440FP2) +#define PREFETCHSIZE_A 24 +#endif + +#ifdef PPC970 +#define PREFETCHSIZE_A 64 +#endif + +#ifdef CELL +#define PREFETCHSIZE_A 72 +#endif + +#ifdef POWER4 +#define PREFETCHSIZE_A 16 +#endif + +#ifdef POWER5 +#define PREFETCHSIZE_A 96 +#endif + +#ifdef POWER6 +#define PREFETCHSIZE_A 40 +#endif + +#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) +#define NOP1 +#define NOP2 +#else +#define NOP1 mr LDA, LDA +#define NOP2 mr INCX, INCX +#endif + +#ifndef NEEDPARAM + +#ifndef __64BIT__ +#define STACKSIZE 224 +#define ALPHA 200(SP) +#define FZERO 208(SP) +#else +#define STACKSIZE 280 +#define ALPHA 256(SP) +#define FZERO 264(SP) +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r0, FZERO + std r14, 144(SP) + std r15, 152(SP) + std r16, 160(SP) + std r17, 168(SP) + std r18, 176(SP) + std r19, 184(SP) + std r20, 192(SP) + std r21, 200(SP) + std r22, 208(SP) + std r23, 216(SP) + std r24, 224(SP) + std r25, 232(SP) + std r26, 240(SP) + std r27, 248(SP) +#else + stw r0, 0 + FZERO + stw r0, 4 + FZERO + stw r14, 144(SP) + stw r15, 148(SP) + stw r16, 152(SP) + stw r17, 156(SP) + stw r18, 160(SP) + stw r19, 164(SP) + stw r20, 168(SP) + stw r21, 172(SP) + stw r22, 176(SP) + stw r23, 180(SP) + stw r24, 184(SP) + stw r25, 188(SP) + stw r26, 192(SP) + stw r27, 196(SP) +#endif + +#ifdef linux +#ifndef __64BIT__ + lwz BUFFER, 56 + STACKSIZE(SP) +#else + ld INCY, 112 + STACKSIZE(SP) + ld BUFFER, 120 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifndef __64BIT__ +#ifdef DOUBLE + lwz Y, 56 + STACKSIZE(SP) + lwz INCY, 60 + STACKSIZE(SP) + lwz BUFFER, 64 + STACKSIZE(SP) +#else + lwz INCY, 56 + STACKSIZE(SP) + lwz BUFFER, 60 + STACKSIZE(SP) +#endif +#else + ld INCY, 112 + STACKSIZE(SP) + ld BUFFER, 120 + STACKSIZE(SP) +#endif +#endif + + STFD alpha, ALPHA + + slwi LDA, LDA, BASE_SHIFT + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + li PREA, PREFETCHSIZE_A * SIZE + sub IS, M, IS + + cmpwi cr0, M, 0 + ble- LL(999) + + mullw TEMP, IS, LDA + add A, A, TEMP + + cmpwi cr0, INCX, SIZE + beq LL(05) + + mr XX, X + mr X, BUFFER + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(03) + .align 4 + +LL(01): + LFD a1, 0 * SIZE(XX) + add XX, XX, INCX + LFD a2, 0 * SIZE(XX) + add XX, XX, INCX + LFD a3, 0 * SIZE(XX) + add XX, XX, INCX + LFD a4, 0 * SIZE(XX) + add XX, XX, INCX + LFD a5, 0 * SIZE(XX) + add XX, XX, INCX + LFD a6, 0 * SIZE(XX) + add XX, XX, INCX + LFD a7, 0 * SIZE(XX) + add XX, XX, INCX + LFD a8, 0 * SIZE(XX) + add XX, XX, INCX + + dcbt XX, PREA + dcbtst BUFFER, PREA + + STFD a1, 0 * SIZE(BUFFER) + STFD a2, 1 * SIZE(BUFFER) + STFD a3, 2 * SIZE(BUFFER) + STFD a4, 3 * SIZE(BUFFER) + STFD a5, 4 * SIZE(BUFFER) + STFD a6, 5 * SIZE(BUFFER) + STFD a7, 6 * SIZE(BUFFER) + STFD a8, 7 * SIZE(BUFFER) + + addi BUFFER, BUFFER, 8 * SIZE + bdnz LL(01) + .align 4 + +LL(03): + andi. r0, M, 7 + mtspr CTR, r0 + ble LL(05) + .align 4 + +LL(04): + LFD a1, 0 * SIZE(XX) + add XX, XX, INCX + + STFD a1, 0 * SIZE(BUFFER) + addi BUFFER, BUFFER, 1 * SIZE + bdnz LL(04) + .align 4 + +LL(05): + mr NEW_Y, Y + lfd f0, FZERO + + cmpwi cr0, INCY, SIZE + beq LL(10) + + mr NEW_Y, BUFFER + + addi r0, M, 7 + srawi. r0, r0, 3 + mtspr CTR, r0 + .align 4 + +LL(06): + STFD f0, 0 * SIZE(BUFFER) + STFD f0, 1 * SIZE(BUFFER) + STFD f0, 2 * SIZE(BUFFER) + STFD f0, 3 * SIZE(BUFFER) + STFD f0, 4 * SIZE(BUFFER) + STFD f0, 5 * SIZE(BUFFER) + STFD f0, 6 * SIZE(BUFFER) + STFD f0, 7 * SIZE(BUFFER) + addi BUFFER, BUFFER, 8 * SIZE + bdnz LL(06) + .align 4 + +LL(10): + addi TEMP, IS, 4 + cmpw cr0, TEMP, M + bgt LL(20) + .align 4 + +LL(11): + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + slwi TEMP, IS, BASE_SHIFT + add TEMP, X, TEMP + + LFD a16, ALPHA + lfd xsum1, FZERO + + LFD atemp1, 0 * SIZE(TEMP) + LFD atemp2, 1 * SIZE(TEMP) + LFD atemp3, 2 * SIZE(TEMP) + LFD atemp4, 3 * SIZE(TEMP) + + LFD xtemp1, 0 * SIZE(X) + LFD xtemp2, 1 * SIZE(X) + LFD xtemp3, 2 * SIZE(X) + LFD xtemp4, 3 * SIZE(X) + + LFD y01, 0 * SIZE(NEW_Y) + LFD y02, 1 * SIZE(NEW_Y) + LFD y03, 2 * SIZE(NEW_Y) + LFD y04, 3 * SIZE(NEW_Y) + + LFD a1, 0 * SIZE(AO1) + FMUL atemp1, a16, atemp1 + LFD a2, 1 * SIZE(AO1) + FMUL atemp2, a16, atemp2 + LFD a3, 2 * SIZE(AO1) + FMUL atemp3, a16, atemp3 + LFD a4, 3 * SIZE(AO1) + FMUL atemp4, a16, atemp4 + + LFD a5, 0 * SIZE(AO2) + fmr xsum2, xsum1 + LFD a6, 1 * SIZE(AO2) + fmr xsum3, xsum1 + LFD a7, 2 * SIZE(AO2) + fmr xsum4, xsum1 + LFD a8, 3 * SIZE(AO2) + + LFD a9, 0 * SIZE(AO3) + LFD a10, 1 * SIZE(AO3) + LFD a11, 2 * SIZE(AO3) + LFD a12, 3 * SIZE(AO3) + + LFD a13, 0 * SIZE(AO4) + LFD a14, 1 * SIZE(AO4) + LFD a15, 2 * SIZE(AO4) + LFD a16, 3 * SIZE(AO4) + + mr XX, X + mr YY, NEW_Y + + srawi. r0, IS, 4 + mtspr CTR, r0 + ble LL(14) + .align 4 + +LL(12): + FMADD xsum1, xtemp1, a1, xsum1 + DCBT(AO1, PREA) + FMADD y01, atemp1, a1, y01 + LFD a1, 4 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + NOP2 + + FMADD xsum3, xtemp1, a9, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 4 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp2, a2, xsum1 + LFD a2, 5 * SIZE(AO1) + FMADD y01, atemp2, a5, y01 + LFD a5, 4 * SIZE(AO2) + + FMADD xsum2, xtemp2, a6, xsum2 + NOP1 + FMADD y02, atemp2, a6, y02 + LFD a6, 5 * SIZE(AO2) + + FMADD xsum3, xtemp2, a10, xsum3 + NOP1 + FMADD y03, atemp2, a7, y03 + NOP2 + + FMADD xsum4, xtemp2, a14, xsum4 + LFD xtemp2, 5 * SIZE(XX) + FMADD y04, atemp2, a8, y04 +# DCBT(X, PREX) + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD a3, 6 * SIZE(AO1) + FMADD y01, atemp3, a9, y01 + LFD a9, 4 * SIZE(AO3) + + FMADD xsum2, xtemp3, a7, xsum2 + LFD a7, 6 * SIZE(AO2) + FMADD y02, atemp3, a10, y02 + LFD a10, 5 * SIZE(AO3) + + FMADD xsum3, xtemp3, a11, xsum3 + NOP1 + FMADD y03, atemp3, a11, y03 + LFD a11, 6 * SIZE(AO3) + + FMADD xsum4, xtemp3, a15, xsum4 + LFD xtemp3, 6 * SIZE(XX) + FMADD y04, atemp3, a12, y04 + NOP2 + + FMADD xsum1, xtemp4, a4, xsum1 + LFD a4, 7 * SIZE(AO1) + FMADD y01, atemp4, a13, y01 + LFD a13, 4 * SIZE(AO4) + + FMADD xsum2, xtemp4, a8, xsum2 + LFD a8, 7 * SIZE(AO2) + FMADD y02, atemp4, a14, y02 + LFD a14, 5 * SIZE(AO4) + + FMADD xsum3, xtemp4, a12, xsum3 + LFD a12, 7 * SIZE(AO3) + FMADD y03, atemp4, a15, y03 + LFD a15, 6 * SIZE(AO4) + + FMADD xsum4, xtemp4, a16, xsum4 + LFD xtemp4, 7 * SIZE(XX) + FMADD y04, atemp4, a16, y04 + LFD a16, 7 * SIZE(AO4) + + STFD y01, 0 * SIZE(YY) + LFD y01, 4 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + LFD y02, 5 * SIZE(YY) + + STFD y03, 2 * SIZE(YY) + LFD y03, 6 * SIZE(YY) + STFD y04, 3 * SIZE(YY) + LFD y04, 7 * SIZE(YY) + + FMADD xsum1, xtemp1, a1, xsum1 + DCBT(AO2, PREA) + FMADD y01, atemp1, a1, y01 + LFD a1, 8 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + NOP2 + + FMADD xsum3, xtemp1, a9, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 8 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp2, a2, xsum1 + LFD a2, 9 * SIZE(AO1) + FMADD y01, atemp2, a5, y01 + LFD a5, 8 * SIZE(AO2) + + FMADD xsum2, xtemp2, a6, xsum2 + NOP1 + FMADD y02, atemp2, a6, y02 + LFD a6, 9 * SIZE(AO2) + + FMADD xsum3, xtemp2, a10, xsum3 + NOP1 + FMADD y03, atemp2, a7, y03 + NOP2 + + FMADD xsum4, xtemp2, a14, xsum4 + LFD xtemp2, 9 * SIZE(XX) + FMADD y04, atemp2, a8, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD a3, 10 * SIZE(AO1) + FMADD y01, atemp3, a9, y01 + LFD a9, 8 * SIZE(AO3) + + FMADD xsum2, xtemp3, a7, xsum2 + LFD a7, 10 * SIZE(AO2) + FMADD y02, atemp3, a10, y02 + LFD a10, 9 * SIZE(AO3) + + FMADD xsum3, xtemp3, a11, xsum3 + NOP1 + FMADD y03, atemp3, a11, y03 + LFD a11, 10 * SIZE(AO3) + + FMADD xsum4, xtemp3, a15, xsum4 + LFD xtemp3, 10 * SIZE(XX) + FMADD y04, atemp3, a12, y04 + NOP2 + + FMADD xsum1, xtemp4, a4, xsum1 + LFD a4, 11 * SIZE(AO1) + FMADD y01, atemp4, a13, y01 + LFD a13, 8 * SIZE(AO4) + + FMADD xsum2, xtemp4, a8, xsum2 + LFD a8, 11 * SIZE(AO2) + FMADD y02, atemp4, a14, y02 + LFD a14, 9 * SIZE(AO4) + + FMADD xsum3, xtemp4, a12, xsum3 + LFD a12, 11 * SIZE(AO3) + FMADD y03, atemp4, a15, y03 + LFD a15, 10 * SIZE(AO4) + + FMADD xsum4, xtemp4, a16, xsum4 + LFD xtemp4, 11 * SIZE(XX) + FMADD y04, atemp4, a16, y04 + LFD a16, 11 * SIZE(AO4) + + STFD y01, 4 * SIZE(YY) + LFD y01, 8 * SIZE(YY) + STFD y02, 5 * SIZE(YY) + LFD y02, 9 * SIZE(YY) + + STFD y03, 6 * SIZE(YY) + LFD y03, 10 * SIZE(YY) + STFD y04, 7 * SIZE(YY) + LFD y04, 11 * SIZE(YY) + + + FMADD xsum1, xtemp1, a1, xsum1 + DCBT(AO3, PREA) + FMADD y01, atemp1, a1, y01 + LFD a1, 12 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + NOP2 + + FMADD xsum3, xtemp1, a9, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 12 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp2, a2, xsum1 + LFD a2, 13 * SIZE(AO1) + FMADD y01, atemp2, a5, y01 + LFD a5, 12 * SIZE(AO2) + + FMADD xsum2, xtemp2, a6, xsum2 + NOP1 + FMADD y02, atemp2, a6, y02 + LFD a6, 13 * SIZE(AO2) + + FMADD xsum3, xtemp2, a10, xsum3 + NOP1 + FMADD y03, atemp2, a7, y03 +# DCBT(Y1, PREY) + NOP2 + + FMADD xsum4, xtemp2, a14, xsum4 + LFD xtemp2, 13 * SIZE(XX) + FMADD y04, atemp2, a8, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD a3, 14 * SIZE(AO1) + FMADD y01, atemp3, a9, y01 + LFD a9, 12 * SIZE(AO3) + + FMADD xsum2, xtemp3, a7, xsum2 + LFD a7, 14 * SIZE(AO2) + FMADD y02, atemp3, a10, y02 + LFD a10,13 * SIZE(AO3) + + FMADD xsum3, xtemp3, a11, xsum3 + NOP1 + FMADD y03, atemp3, a11, y03 + LFD a11, 14 * SIZE(AO3) + + FMADD xsum4, xtemp3, a15, xsum4 + LFD xtemp3, 14 * SIZE(XX) + FMADD y04, atemp3, a12, y04 + NOP2 + + FMADD xsum1, xtemp4, a4, xsum1 + LFD a4, 15 * SIZE(AO1) + FMADD y01, atemp4, a13, y01 + LFD a13,12 * SIZE(AO4) + + FMADD xsum2, xtemp4, a8, xsum2 + LFD a8, 15 * SIZE(AO2) + FMADD y02, atemp4, a14, y02 + LFD a14, 13 * SIZE(AO4) + + FMADD xsum3, xtemp4, a12, xsum3 + LFD a12, 15 * SIZE(AO3) + FMADD y03, atemp4, a15, y03 + LFD a15, 14 * SIZE(AO4) + + FMADD xsum4, xtemp4, a16, xsum4 + LFD xtemp4, 15 * SIZE(XX) + FMADD y04, atemp4, a16, y04 + LFD a16, 15 * SIZE(AO4) + + STFD y01, 8 * SIZE(YY) + LFD y01, 12 * SIZE(YY) + STFD y02, 9 * SIZE(YY) + LFD y02, 13 * SIZE(YY) + + STFD y03, 10 * SIZE(YY) + LFD y03, 14 * SIZE(YY) + STFD y04, 11 * SIZE(YY) + LFD y04, 15 * SIZE(YY) + + FMADD xsum1, xtemp1, a1, xsum1 + DCBT(AO4, PREA) + FMADD y01, atemp1, a1, y01 + LFD a1, 16 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + NOP2 + + FMADD xsum3, xtemp1, a9, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 16 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + addi YY, YY, 16 * SIZE + + FMADD xsum1, xtemp2, a2, xsum1 + LFD a2, 17 * SIZE(AO1) + FMADD y01, atemp2, a5, y01 + LFD a5, 16 * SIZE(AO2) + + FMADD xsum2, xtemp2, a6, xsum2 + addi AO3, AO3, 16 * SIZE + FMADD y02, atemp2, a6, y02 + LFD a6, 17 * SIZE(AO2) + + FMADD xsum3, xtemp2, a10, xsum3 + addi AO1, AO1, 16 * SIZE + FMADD y03, atemp2, a7, y03 + addi AO2, AO2, 16 * SIZE + + FMADD xsum4, xtemp2, a14, xsum4 + LFD xtemp2, 17 * SIZE(XX) + FMADD y04, atemp2, a8, y04 + addi AO4, AO4, 16 * SIZE + + FMADD xsum1, xtemp3, a3, xsum1 + LFD a3, 2 * SIZE(AO1) + FMADD y01, atemp3, a9, y01 + LFD a9, 0 * SIZE(AO3) + + FMADD xsum2, xtemp3, a7, xsum2 + LFD a7, 2 * SIZE(AO2) + FMADD y02, atemp3, a10, y02 + LFD a10, 1 * SIZE(AO3) + + FMADD xsum3, xtemp3, a11, xsum3 + NOP1 + FMADD y03, atemp3, a11, y03 + LFD a11, 2 * SIZE(AO3) + + FMADD xsum4, xtemp3, a15, xsum4 + LFD xtemp3, 18 * SIZE(XX) + FMADD y04, atemp3, a12, y04 + addi XX, XX, 16 * SIZE + + FMADD xsum1, xtemp4, a4, xsum1 + LFD a4, 3 * SIZE(AO1) + FMADD y01, atemp4, a13, y01 + LFD a13, 0 * SIZE(AO4) + + FMADD xsum2, xtemp4, a8, xsum2 + LFD a8, 3 * SIZE(AO2) + FMADD y02, atemp4, a14, y02 + LFD a14, 1 * SIZE(AO4) + + FMADD xsum3, xtemp4, a12, xsum3 + LFD a12, 3 * SIZE(AO3) + FMADD y03, atemp4, a15, y03 + LFD a15, 2 * SIZE(AO4) + + FMADD xsum4, xtemp4, a16, xsum4 + LFD xtemp4, 3 * SIZE(XX) + FMADD y04, atemp4, a16, y04 + LFD a16, 3 * SIZE(AO4) + + STFD y01, -4 * SIZE(YY) + LFD y01, 0 * SIZE(YY) + STFD y02, -3 * SIZE(YY) + LFD y02, 1 * SIZE(YY) + + STFD y03, -2 * SIZE(YY) + LFD y03, 2 * SIZE(YY) + STFD y04, -1 * SIZE(YY) + LFD y04, 3 * SIZE(YY) + bdnz LL(12) + .align 4 + +LL(14): + andi. r0, IS, 8 + ble LL(15) + + FMADD xsum1, xtemp1, a1, xsum1 + NOP1 + FMADD y01, atemp1, a1, y01 + LFD a1, 4 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + NOP2 + + FMADD xsum3, xtemp1, a9, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 4 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp2, a2, xsum1 + LFD a2, 5 * SIZE(AO1) + FMADD y01, atemp2, a5, y01 + LFD a5, 4 * SIZE(AO2) + + FMADD xsum2, xtemp2, a6, xsum2 + NOP1 + FMADD y02, atemp2, a6, y02 + LFD a6, 5 * SIZE(AO2) + + FMADD xsum3, xtemp2, a10, xsum3 + NOP1 + FMADD y03, atemp2, a7, y03 + NOP2 + + FMADD xsum4, xtemp2, a14, xsum4 + LFD xtemp2, 5 * SIZE(XX) + FMADD y04, atemp2, a8, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD a3, 6 * SIZE(AO1) + FMADD y01, atemp3, a9, y01 + LFD a9, 4 * SIZE(AO3) + + FMADD xsum2, xtemp3, a7, xsum2 + LFD a7, 6 * SIZE(AO2) + FMADD y02, atemp3, a10, y02 + LFD a10, 5 * SIZE(AO3) + + FMADD xsum3, xtemp3, a11, xsum3 + NOP1 + FMADD y03, atemp3, a11, y03 + LFD a11, 6 * SIZE(AO3) + + FMADD xsum4, xtemp3, a15, xsum4 + LFD xtemp3, 6 * SIZE(XX) + FMADD y04, atemp3, a12, y04 + NOP2 + + FMADD xsum1, xtemp4, a4, xsum1 + LFD a4, 7 * SIZE(AO1) + FMADD y01, atemp4, a13, y01 + LFD a13, 4 * SIZE(AO4) + + FMADD xsum2, xtemp4, a8, xsum2 + LFD a8, 7 * SIZE(AO2) + FMADD y02, atemp4, a14, y02 + LFD a14, 5 * SIZE(AO4) + + FMADD xsum3, xtemp4, a12, xsum3 + LFD a12, 7 * SIZE(AO3) + FMADD y03, atemp4, a15, y03 + LFD a15, 6 * SIZE(AO4) + + FMADD xsum4, xtemp4, a16, xsum4 + LFD xtemp4, 7 * SIZE(XX) + FMADD y04, atemp4, a16, y04 + LFD a16, 7 * SIZE(AO4) + + STFD y01, 0 * SIZE(YY) + LFD y01, 4 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + LFD y02, 5 * SIZE(YY) + + STFD y03, 2 * SIZE(YY) + LFD y03, 6 * SIZE(YY) + STFD y04, 3 * SIZE(YY) + LFD y04, 7 * SIZE(YY) + + FMADD xsum1, xtemp1, a1, xsum1 + NOP1 + FMADD y01, atemp1, a1, y01 + LFD a1, 8 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + NOP2 + + FMADD xsum3, xtemp1, a9, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 8 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp2, a2, xsum1 + LFD a2, 9 * SIZE(AO1) + FMADD y01, atemp2, a5, y01 + LFD a5, 8 * SIZE(AO2) + + FMADD xsum2, xtemp2, a6, xsum2 + NOP1 + FMADD y02, atemp2, a6, y02 + LFD a6, 9 * SIZE(AO2) + + FMADD xsum3, xtemp2, a10, xsum3 + NOP1 + FMADD y03, atemp2, a7, y03 + NOP2 + + FMADD xsum4, xtemp2, a14, xsum4 + LFD xtemp2, 9 * SIZE(XX) + FMADD y04, atemp2, a8, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD a3, 10 * SIZE(AO1) + FMADD y01, atemp3, a9, y01 + LFD a9, 8 * SIZE(AO3) + + FMADD xsum2, xtemp3, a7, xsum2 + LFD a7, 10 * SIZE(AO2) + FMADD y02, atemp3, a10, y02 + LFD a10, 9 * SIZE(AO3) + + FMADD xsum3, xtemp3, a11, xsum3 + NOP1 + FMADD y03, atemp3, a11, y03 + LFD a11, 10 * SIZE(AO3) + + FMADD xsum4, xtemp3, a15, xsum4 + LFD xtemp3, 10 * SIZE(XX) + FMADD y04, atemp3, a12, y04 + NOP2 + + FMADD xsum1, xtemp4, a4, xsum1 + LFD a4, 11 * SIZE(AO1) + FMADD y01, atemp4, a13, y01 + LFD a13, 8 * SIZE(AO4) + + FMADD xsum2, xtemp4, a8, xsum2 + LFD a8, 11 * SIZE(AO2) + FMADD y02, atemp4, a14, y02 + LFD a14, 9 * SIZE(AO4) + + FMADD xsum3, xtemp4, a12, xsum3 + LFD a12, 11 * SIZE(AO3) + FMADD y03, atemp4, a15, y03 + LFD a15, 10 * SIZE(AO4) + + FMADD xsum4, xtemp4, a16, xsum4 + LFD xtemp4, 11 * SIZE(XX) + FMADD y04, atemp4, a16, y04 + LFD a16, 11 * SIZE(AO4) + + addi AO1, AO1, 8 * SIZE + addi AO2, AO2, 8 * SIZE + addi AO3, AO3, 8 * SIZE + addi AO4, AO4, 8 * SIZE + + STFD y01, 4 * SIZE(YY) + LFD y01, 8 * SIZE(YY) + STFD y02, 5 * SIZE(YY) + LFD y02, 9 * SIZE(YY) + + STFD y03, 6 * SIZE(YY) + LFD y03, 10 * SIZE(YY) + STFD y04, 7 * SIZE(YY) + LFD y04, 11 * SIZE(YY) + + addi XX, XX, 8 * SIZE + addi YY, YY, 8 * SIZE + .align 4 + +LL(15): + andi. r0, IS, 4 + ble LL(18) + + FMADD xsum1, xtemp1, a1, xsum1 + NOP1 + FMADD y01, atemp1, a1, y01 + LFD a1, 4 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + NOP2 + + FMADD xsum3, xtemp1, a9, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 4 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp2, a2, xsum1 + LFD a2, 5 * SIZE(AO1) + FMADD y01, atemp2, a5, y01 + LFD a5, 4 * SIZE(AO2) + + FMADD xsum2, xtemp2, a6, xsum2 + NOP1 + FMADD y02, atemp2, a6, y02 + LFD a6, 5 * SIZE(AO2) + + FMADD xsum3, xtemp2, a10, xsum3 + NOP1 + FMADD y03, atemp2, a7, y03 + NOP2 + + FMADD xsum4, xtemp2, a14, xsum4 + LFD xtemp2, 5 * SIZE(XX) + FMADD y04, atemp2, a8, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD a3, 6 * SIZE(AO1) + FMADD y01, atemp3, a9, y01 + LFD a9, 4 * SIZE(AO3) + + FMADD xsum2, xtemp3, a7, xsum2 + LFD a7, 6 * SIZE(AO2) + FMADD y02, atemp3, a10, y02 + LFD a10, 5 * SIZE(AO3) + + FMADD xsum3, xtemp3, a11, xsum3 + NOP1 + FMADD y03, atemp3, a11, y03 + LFD a11, 6 * SIZE(AO3) + + FMADD xsum4, xtemp3, a15, xsum4 + LFD xtemp3, 6 * SIZE(XX) + FMADD y04, atemp3, a12, y04 + NOP2 + + FMADD xsum1, xtemp4, a4, xsum1 + LFD a4, 7 * SIZE(AO1) + FMADD y01, atemp4, a13, y01 + LFD a13, 4 * SIZE(AO4) + + FMADD xsum2, xtemp4, a8, xsum2 + LFD a8, 7 * SIZE(AO2) + FMADD y02, atemp4, a14, y02 + LFD a14, 5 * SIZE(AO4) + + FMADD xsum3, xtemp4, a12, xsum3 + LFD a12, 7 * SIZE(AO3) + FMADD y03, atemp4, a15, y03 + LFD a15, 6 * SIZE(AO4) + + FMADD xsum4, xtemp4, a16, xsum4 + LFD xtemp4, 7 * SIZE(XX) + FMADD y04, atemp4, a16, y04 + LFD a16, 7 * SIZE(AO4) + + addi AO1, AO1, 4 * SIZE + addi AO2, AO2, 4 * SIZE + addi AO3, AO3, 4 * SIZE + addi AO4, AO4, 4 * SIZE + + STFD y01, 0 * SIZE(YY) + LFD y01, 4 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + LFD y02, 5 * SIZE(YY) + + STFD y03, 2 * SIZE(YY) + LFD y03, 6 * SIZE(YY) + STFD y04, 3 * SIZE(YY) + LFD y04, 7 * SIZE(YY) + + addi XX, XX, 4 * SIZE + addi YY, YY, 4 * SIZE + .align 4 + +LL(18): + LFD xtemp1, ALPHA + + FMUL xsum1, xtemp1, xsum1 + FMUL xsum2, xtemp1, xsum2 + FMUL xsum3, xtemp1, xsum3 + FMUL xsum4, xtemp1, xsum4 + + FMADD xsum1, atemp1, a1, xsum1 + FMADD xsum2, atemp1, a5, xsum2 + FMADD xsum3, atemp1, a9, xsum3 + FMADD xsum4, atemp1, a13, xsum4 + + FMADD xsum1, atemp2, a5, xsum1 + FMADD xsum2, atemp2, a6, xsum2 + FMADD xsum3, atemp2, a10, xsum3 + FMADD xsum4, atemp2, a14, xsum4 + + FMADD xsum1, atemp3, a9, xsum1 + FMADD xsum2, atemp3, a10, xsum2 + FMADD xsum3, atemp3, a11, xsum3 + FMADD xsum4, atemp3, a15, xsum4 + + FMADD xsum1, atemp4, a13, xsum1 + FMADD xsum2, atemp4, a14, xsum2 + FMADD xsum3, atemp4, a15, xsum3 + FMADD xsum4, atemp4, a16, xsum4 + + FADD y01, y01, xsum1 + FADD y02, y02, xsum2 + FADD y03, y03, xsum3 + FADD y04, y04, xsum4 + + STFD y01, 0 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + STFD y03, 2 * SIZE(YY) + STFD y04, 3 * SIZE(YY) + + addi TEMP, IS, 8 + addi IS, IS, 4 + cmpw cr0, TEMP, M + ble LL(11) + .align 4 + +LL(20): + andi. TEMP, M, 2 + ble LL(30) + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + slwi TEMP, IS, BASE_SHIFT + add TEMP, X, TEMP + + LFD atemp1, 0 * SIZE(TEMP) + LFD atemp2, 1 * SIZE(TEMP) + + LFD a1, ALPHA + + FMUL atemp1, a1, atemp1 + FMUL atemp2, a1, atemp2 + + lfd xsum1, FZERO + fmr xsum2, xsum1 + + mr XX, X + mr YY, NEW_Y + + LFD xtemp1, 0 * SIZE(XX) + LFD xtemp2, 1 * SIZE(XX) + + LFD y01, 0 * SIZE(YY) + LFD y02, 1 * SIZE(YY) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + + LFD a5, 0 * SIZE(AO2) + LFD a6, 1 * SIZE(AO2) + + srawi. r0, IS, 1 + mtspr CTR, r0 + ble LL(28) + .align 4 + +LL(22): + FMADD xsum1, xtemp1, a1, xsum1 + FMADD xsum2, xtemp1, a5, xsum2 + + FMADD xsum1, xtemp2, a2, xsum1 + FMADD xsum2, xtemp2, a6, xsum2 + + FMADD y01, atemp1, a1, y01 + FMADD y02, atemp1, a2, y02 + FMADD y01, atemp2, a5, y01 + FMADD y02, atemp2, a6, y02 + + LFD xtemp1, 2 * SIZE(XX) + LFD xtemp2, 3 * SIZE(XX) + + LFD a1, 2 * SIZE(AO1) + LFD a2, 3 * SIZE(AO1) + + LFD a5, 2 * SIZE(AO2) + LFD a6, 3 * SIZE(AO2) + + STFD y01, 0 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + + LFD y01, 2 * SIZE(YY) + LFD y02, 3 * SIZE(YY) + + addi AO1, AO1, 2 * SIZE + addi AO2, AO2, 2 * SIZE + + addi XX, XX, 2 * SIZE + addi YY, YY, 2 * SIZE + + bdnz LL(22) + .align 4 + +LL(28): + LFD xtemp1, ALPHA + + FMUL xsum1, xtemp1, xsum1 + FMUL xsum2, xtemp1, xsum2 + + FMADD xsum1, atemp1, a1, xsum1 + FMADD xsum2, atemp1, a5, xsum2 + FMADD xsum1, atemp2, a5, xsum1 + FMADD xsum2, atemp2, a6, xsum2 + + FADD y01, y01, xsum1 + FADD y02, y02, xsum2 + + STFD y01, 0 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + + addi IS, IS, 2 + .align 4 + +LL(30): + andi. TEMP, M, 1 + ble LL(990) + + mr AO1, A + + slwi TEMP, IS, BASE_SHIFT + add TEMP, X, TEMP + + LFD atemp1, 0 * SIZE(TEMP) + + LFD a1, ALPHA + + FMUL atemp1, a1, atemp1 + + lfd xsum1, FZERO + + mr XX, X + mr YY, NEW_Y + + LFD xtemp1, 0 * SIZE(XX) + LFD y01, 0 * SIZE(YY) + + LFD a1, 0 * SIZE(AO1) + + mtspr CTR, IS + cmpwi cr0, IS, 0 + ble LL(38) + .align 4 + +LL(32): + FMADD xsum1, xtemp1, a1, xsum1 + + FMADD y01, atemp1, a1, y01 + + LFD xtemp1, 1 * SIZE(XX) + + LFD a1, 1 * SIZE(AO1) + + STFD y01, 0 * SIZE(YY) + + LFD y01, 1 * SIZE(YY) + + addi AO1, AO1, 1 * SIZE + + addi XX, XX, 1 * SIZE + addi YY, YY, 1 * SIZE + + bdnz LL(32) + .align 4 + +LL(38): + LFD xtemp1, ALPHA + + FMUL xsum1, xtemp1, xsum1 + + FMADD xsum1, atemp1, a1, xsum1 + + FADD y01, y01, xsum1 + + STFD y01, 0 * SIZE(YY) + .align 4 + +LL(990): + cmpwi cr0, INCY, SIZE + beq LL(999) + + mr YY, Y + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(995) + .align 4 + +LL(991): + LFD f0, 0 * SIZE(Y) + add Y, Y, INCY + LFD f1, 0 * SIZE(Y) + add Y, Y, INCY + LFD f2, 0 * SIZE(Y) + add Y, Y, INCY + LFD f3, 0 * SIZE(Y) + add Y, Y, INCY + LFD f4, 0 * SIZE(Y) + add Y, Y, INCY + LFD f5, 0 * SIZE(Y) + add Y, Y, INCY + LFD f6, 0 * SIZE(Y) + add Y, Y, INCY + LFD f7, 0 * SIZE(Y) + add Y, Y, INCY + + LFD f8, 0 * SIZE(NEW_Y) + LFD f9, 1 * SIZE(NEW_Y) + LFD f10, 2 * SIZE(NEW_Y) + LFD f11, 3 * SIZE(NEW_Y) + LFD f12, 4 * SIZE(NEW_Y) + LFD f13, 5 * SIZE(NEW_Y) + LFD f14, 6 * SIZE(NEW_Y) + LFD f15, 7 * SIZE(NEW_Y) + addi NEW_Y, NEW_Y, 8 * SIZE + + FADD f8, f8, f0 + FADD f9, f9, f1 + FADD f10, f10, f2 + FADD f11, f11, f3 + FADD f12, f12, f4 + FADD f13, f13, f5 + FADD f14, f14, f6 + FADD f15, f15, f7 + + STFD f8, 0 * SIZE(YY) + add YY, YY, INCY + STFD f9, 0 * SIZE(YY) + add YY, YY, INCY + STFD f10, 0 * SIZE(YY) + add YY, YY, INCY + STFD f11, 0 * SIZE(YY) + add YY, YY, INCY + STFD f12, 0 * SIZE(YY) + add YY, YY, INCY + STFD f13, 0 * SIZE(YY) + add YY, YY, INCY + STFD f14, 0 * SIZE(YY) + add YY, YY, INCY + STFD f15, 0 * SIZE(YY) + add YY, YY, INCY + bdnz LL(991) + .align 4 + +LL(995): + andi. J, M, 4 + ble LL(996) + + LFD f0, 0 * SIZE(Y) + add Y, Y, INCY + LFD f1, 0 * SIZE(Y) + add Y, Y, INCY + LFD f2, 0 * SIZE(Y) + add Y, Y, INCY + LFD f3, 0 * SIZE(Y) + add Y, Y, INCY + + LFD f8, 0 * SIZE(NEW_Y) + LFD f9, 1 * SIZE(NEW_Y) + LFD f10, 2 * SIZE(NEW_Y) + LFD f11, 3 * SIZE(NEW_Y) + addi NEW_Y, NEW_Y, 4 * SIZE + + FADD f8, f8, f0 + FADD f9, f9, f1 + FADD f10, f10, f2 + FADD f11, f11, f3 + + STFD f8, 0 * SIZE(YY) + add YY, YY, INCY + STFD f9, 0 * SIZE(YY) + add YY, YY, INCY + STFD f10, 0 * SIZE(YY) + add YY, YY, INCY + STFD f11, 0 * SIZE(YY) + add YY, YY, INCY + .align 4 + +LL(996): + andi. J, M, 2 + ble LL(997) + + LFD f0, 0 * SIZE(Y) + add Y, Y, INCY + LFD f1, 0 * SIZE(Y) + add Y, Y, INCY + + LFD f8, 0 * SIZE(NEW_Y) + LFD f9, 1 * SIZE(NEW_Y) + addi NEW_Y, NEW_Y, 2 * SIZE + + FADD f8, f8, f0 + FADD f9, f9, f1 + + STFD f8, 0 * SIZE(YY) + add YY, YY, INCY + STFD f9, 0 * SIZE(YY) + add YY, YY, INCY + .align 4 + +LL(997): + andi. J, M, 1 + ble LL(999) + + LFD f0, 0 * SIZE(Y) + LFD f8, 0 * SIZE(NEW_Y) + + FADD f8, f8, f0 + + STFD f8, 0 * SIZE(YY) + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r14, 144(SP) + ld r15, 152(SP) + ld r16, 160(SP) + ld r17, 168(SP) + ld r18, 176(SP) + ld r19, 184(SP) + ld r20, 192(SP) + ld r21, 200(SP) + ld r22, 208(SP) + ld r23, 216(SP) + ld r24, 224(SP) + ld r25, 232(SP) + ld r26, 240(SP) + ld r27, 248(SP) +#else + lwz r14, 144(SP) + lwz r15, 148(SP) + lwz r16, 152(SP) + lwz r17, 156(SP) + lwz r18, 160(SP) + lwz r19, 164(SP) + lwz r20, 168(SP) + lwz r21, 172(SP) + lwz r22, 176(SP) + lwz r23, 180(SP) + lwz r24, 184(SP) + lwz r25, 188(SP) + lwz r26, 192(SP) + lwz r27, 196(SP) +#endif + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/trsm_kernel_LN.S b/kernel/power/trsm_kernel_LN.S new file mode 100644 index 0000000..6be8e28 --- /dev/null +++ b/kernel/power/trsm_kernel_LN.S @@ -0,0 +1,3652 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREB r30 +#define PREC r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) +#endif + + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST +#ifdef LN + li PREC, -4 * SIZE +#else + li PREC, 4 * SIZE +#endif + +#else + +#ifdef linux +#ifndef __64BIT__ + mr PREA, r10 + lwz PREB, 8 + STACKSIZE(SP) + lwz PREC, 12 + STACKSIZE(SP) +#else + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 60 + STACKSIZE(SP) + lwz PREB, 64 + STACKSIZE(SP) + lwz PREC, 68 + STACKSIZE(SP) +#else + lwz PREA, 56 + STACKSIZE(SP) + lwz PREB, 60 + STACKSIZE(SP) + lwz PREC, 64 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST +#ifdef PPC970 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 5 * SIZE | 1) + li PREB, (16 * 5 * SIZE | 3) +#else + li PREA, (16 * 14 * SIZE | 1) + li PREB, (16 * 8 * SIZE | 3) +#endif +#endif +#ifdef POWER4 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 1 * SIZE + 16) + li PREB, (16 * 1 * SIZE + 16) +#else + li PREA, (16 * 2 * SIZE + 16) + li PREB, (16 * 2 * SIZE + 16) +#endif +#endif +#ifdef POWER5 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 7 * SIZE | 1) + li PREB, (16 * 7 * SIZE | 3) +#else + li PREA, (16 * 12 * SIZE | 1) + li PREB, (16 * 6 * SIZE | 3) +#endif +#endif +#endif + + lfs f0, FZERO + + srawi. J, N, 2 + ble LL(40) + .align 4 + +LL(10): + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + +LL(30): + andi. I, M, 1 + ble LL(20) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 5 + +LL(32): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f1, f17, f24, f1 + FMADD f5, f17, f25, f5 + FMADD f9, f17, f26, f9 + FMADD f13, f17, f27, f13 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f0, f18, f20, f0 + FMADD f4, f18, f21, f4 + FMADD f8, f18, f22, f8 + FMADD f12, f18, f23, f12 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f1, f19, f24, f1 + FMADD f5, f19, f25, f5 + FMADD f9, f19, f26, f9 + FMADD f13, f19, f27, f13 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(32) + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(36) + .align 4 + +LL(38): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + LFD f24, 2 * SIZE(AO) + LFD f28, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f4, f20, f4 + FSUB f8, f24, f8 + FSUB f12, f28, f12 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f4, f17, f0, f4 + FNMSUB f8, f18, f0, f8 + FNMSUB f12, f19, f0, f12 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FNMSUB f8, f17, f4, f8 + FNMSUB f12, f18, f4, f12 + FMUL f8, f19, f8 + FNMSUB f12, f20, f8, f12 + FMUL f12, f21, f12 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FNMSUB f8, f17, f12, f8 + FNMSUB f4, f18, f12, f4 + FNMSUB f0, f19, f12, f0 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f0, f18, f8, f0 + + FMUL f4, f19, f4 + FNMSUB f0, f20, f4, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f4, 1 * SIZE(AO) + STFD f8, 2 * SIZE(AO) + STFD f12, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE + addi CO3, CO3, 1 * SIZE + addi CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(20): + andi. I, M, 2 + ble LL(09) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 5 + +LL(22): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(22) + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(28) + .align 4 + +LL(26): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(28): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + FMUL f9, f17, f9 + FMUL f13, f17, f13 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FMUL f12, f21, f12 + FMUL f13, f21, f13 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(09): + srawi. I, M, 2 + ble LL(39) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + dcbt CO2, PREC + dcbt CO3, PREC + dcbt CO4, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + dcbt CO2, PREC + dcbt CO3, PREC + dcbt CO4, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +LL(12): + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f24, f28, f0 + FMADD f5, f25, f29, f5 + FMADD f10, f26, f30, f10 + FMADD f15, f27, f31, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f1, f25, f28, f1 + FMADD f2, f26, f28, f2 + FMADD f3, f27, f28, f3 + FMADD f4, f24, f29, f4 + + FMADD f6, f26, f29, f6 + FMADD f7, f27, f29, f7 + FMADD f8, f24, f30, f8 + FMADD f9, f25, f30, f9 + + FMADD f11, f27, f30, f11 + FMADD f12, f24, f31, f12 + FMADD f13, f25, f31, f13 + FMADD f14, f26, f31, f14 + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f0, f24, f28, f0 + FMADD f5, f25, f29, f5 + FMADD f10, f26, f30, f10 + FMADD f15, f27, f31, f15 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + FMADD f1, f25, f28, f1 + FMADD f2, f26, f28, f2 + FMADD f3, f27, f28, f3 + FMADD f4, f24, f29, f4 + + FMADD f6, f26, f29, f6 + FMADD f7, f27, f29, f7 + FMADD f8, f24, f30, f8 + FMADD f9, f25, f30, f9 + + FMADD f11, f27, f30, f11 + FMADD f12, f24, f31, f12 + FMADD f13, f25, f31, f13 + FMADD f14, f26, f31, f14 + + addi AO, AO, 16 * SIZE + addi BO, BO, 16 * SIZE + +#ifdef PPC970 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER4 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER5 + DCBT(AO, PREA) + DCBT(BO, PREB) +#endif + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(18) + .align 4 + +LL(16): + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(18): +#if defined(LN) || defined(RT) + subi r0, KK, 4 + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f10, f26, f10 + FSUB f14, f27, f14 + + FSUB f3, f28, f3 + FSUB f7, f29, f7 + FSUB f11, f30, f11 + FSUB f15, f31, f15 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FMUL f11, f16, f11 + FMUL f15, f16, f15 + + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f10, f17, f11, f10 + FNMSUB f14, f17, f15, f14 + + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f9, f18, f11, f9 + FNMSUB f13, f18, f15, f13 + + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + FNMSUB f8, f19, f11, f8 + FNMSUB f12, f19, f15, f12 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FMUL f10, f16, f10 + FMUL f14, f16, f14 + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f9, f17, f10, f9 + FNMSUB f13, f17, f14, f13 + + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + FNMSUB f8, f18, f10, f8 + FNMSUB f12, f18, f14, f12 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f10, f18, f8, f10 + FNMSUB f14, f18, f12, f14 + + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + FNMSUB f11, f19, f8, f11 + FNMSUB f15, f19, f12, f15 + + LFD f16, 5 * SIZE(AO) + LFD f17, 6 * SIZE(AO) + LFD f18, 7 * SIZE(AO) + LFD f19, 10 * SIZE(AO) + + FMUL f1, f16, f1 + FMUL f5, f16, f5 + FMUL f9, f16, f9 + FMUL f13, f16, f13 + + LFD f20, 11 * SIZE(AO) + LFD f21, 15 * SIZE(AO) + + FNMSUB f2, f17, f1, f2 + FNMSUB f6, f17, f5, f6 + FNMSUB f10, f17, f9, f10 + FNMSUB f14, f17, f13, f14 + + FNMSUB f3, f18, f1, f3 + FNMSUB f7, f18, f5, f7 + FNMSUB f11, f18, f9, f11 + FNMSUB f15, f18, f13, f15 + + FMUL f2, f19, f2 + FMUL f6, f19, f6 + FMUL f10, f19, f10 + FMUL f14, f19, f14 + + FNMSUB f3, f20, f2, f3 + FNMSUB f7, f20, f6, f7 + FNMSUB f11, f20, f10, f11 + FNMSUB f15, f20, f14, f15 + + FMUL f3, f21, f3 + FMUL f7, f21, f7 + FMUL f11, f21, f11 + FMUL f15, f21, f15 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + FNMSUB f14, f19, f2, f14 + FNMSUB f15, f19, f3, f15 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FMUL f6, f16, f6 + FMUL f7, f16, f7 + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f10, f17, f6, f10 + FNMSUB f11, f17, f7, f11 + + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + FNMSUB f14, f18, f6, f14 + FNMSUB f15, f18, f7, f15 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FMUL f10, f19, f10 + FMUL f11, f19, f11 + + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FNMSUB f14, f20, f10, f14 + FNMSUB f15, f20, f11, f15 + + FMUL f12, f21, f12 + FMUL f13, f21, f13 + FMUL f14, f21, f14 + FMUL f15, f21, f15 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FMUL f14, f16, f14 + FMUL f15, f16, f15 + + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f10, f17, f14, f10 + FNMSUB f11, f17, f15, f11 + + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f6, f18, f14, f6 + FNMSUB f7, f18, f15, f7 + + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + FNMSUB f2, f19, f14, f2 + FNMSUB f3, f19, f15, f3 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FMUL f10, f16, f10 + FMUL f11, f16, f11 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f6, f17, f10, f6 + FNMSUB f7, f17, f11, f7 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f6, 9 * SIZE(BO) + STFD f10, 10 * SIZE(BO) + STFD f14, 11 * SIZE(BO) + + STFD f3, 12 * SIZE(BO) + STFD f7, 13 * SIZE(BO) + STFD f11, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + + +LL(39): +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + lfs f0, FZERO + bgt LL(10) + .align 4 + +LL(40): + andi. J, N, 2 + ble LL(70) + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + +LL(60): + andi. I, M, 1 + ble LL(50) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(65) + .align 5 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f17, f22, f2 + FMADD f3, f17, f23, f3 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f19, f26, f2 + FMADD f3, f19, f27, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(68) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(66) + .align 4 + +LL(68): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FMUL f1, f18, f1 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(50): + andi. I, M, 2 + ble LL(41) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(55) + .align 5 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f22, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f17, f24, f1 + FMADD f2, f16, f25, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f19, f26, f5 + FMADD f6, f18, f27, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(52) + .align 4 + +LL(55): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(58) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(56) + .align 4 + +LL(58): + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f2, f17, f2 + FSUB f1, f20, f1 + FSUB f3, f21, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f3, f19, f3 + + FNMSUB f0, f20, f1, f0 + FNMSUB f2, f20, f3, f2 + + FMUL f0, f21, f0 + FMUL f2, f21, f2 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f2, f16, f2 + FNMSUB f1, f17, f0, f1 + FNMSUB f3, f17, f2, f3 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f3, f17, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + + FNMSUB f2, f17, f0, f2 + FNMSUB f3, f17, f1, f3 + FMUL f2, f18, f2 + FMUL f3, f18, f3 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f2, f19, f2 + FMUL f3, f19, f3 + FNMSUB f0, f20, f2, f0 + FNMSUB f1, f20, f3, f1 + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f2, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(41): + srawi. I, M, 2 + ble LL(69) + .align 4 + +LL(42): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 5 + +LL(43): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(43) + .align 4 + +LL(45): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(48) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(48): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f28, 6 * SIZE(BO) + LFD f29, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f1, f20, f1 + FSUB f5, f21, f5 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f3, f28, f3 + FSUB f7, f29, f7 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FMUL f0, f21, f0 + FMUL f4, f21, f4 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + + FNMSUB f2, f18, f1, f2 + FNMSUB f6, f18, f5, f6 + + FNMSUB f3, f19, f1, f3 + FNMSUB f7, f19, f5, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FMUL f6, f18, f6 + + FNMSUB f3, f19, f2, f3 + FNMSUB f7, f19, f6, f7 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 + FMUL f7, f19, f7 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FMUL f4, f18, f4 + FMUL f5, f18, f5 + FMUL f6, f18, f6 + FMUL f7, f18, f7 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f6, 5 * SIZE(BO) + STFD f3, 6 * SIZE(BO) + STFD f7, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(42) + .align 4 + +LL(69): +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + lfs f0, FZERO + .align 4 + +LL(70): + andi. J, N, 1 + ble LL(999) + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO1, LDC +#endif + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(80) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(95) + .align 5 + +LL(92): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(92) + .align 4 + +LL(95): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ LL(98) + .align 4 + +LL(96): + FMADD f0, f16, f20, f0 + LFD f16, 1 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + addi BO, BO, 1 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(96) + .align 4 + +LL(98): + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + FSUB f0, f16, f0 +#else + LFD f16, 0 * SIZE(AO) + FSUB f0, f16, f0 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + FMUL f0, f16, f0 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + FMUL f0, f16, f0 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(71) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(85) + .align 5 + +LL(82): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(82) + .align 4 + +LL(85): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(88) + .align 4 + +LL(86): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(86) + .align 4 + +LL(88): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + + LFD f17, 3 * SIZE(AO) + FMUL f1, f17, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(71): + srawi. I, M, 2 + ble LL(999) + .align 4 + +LL(72): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(75) + .align 5 + +LL(73): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f21, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f23, f0 + FMADD f1, f17, f23, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(73) + .align 4 + +LL(75): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(78) + .align 4 + +LL(76): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(76) + .align 4 + +LL(78): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + LFD f24, 2 * SIZE(BO) + LFD f28, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 + FSUB f2, f24, f2 + FSUB f3, f28, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FNMSUB f2, f17, f3, f2 + FNMSUB f1, f18, f3, f1 + FNMSUB f0, f19, f3, f0 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FNMSUB f1, f17, f2, f1 + FNMSUB f0, f18, f2, f0 + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f19, f0, f3 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FNMSUB f2, f18, f1, f2 + FNMSUB f3, f19, f1, f3 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FNMSUB f3, f19, f2, f3 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(72) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/trsm_kernel_LT.S b/kernel/power/trsm_kernel_LT.S new file mode 100644 index 0000000..0d28744 --- /dev/null +++ b/kernel/power/trsm_kernel_LT.S @@ -0,0 +1,3665 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREB r30 +#define PREC r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) +#endif + + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST +#if defined(TRSMKERNEL) && defined(LN) +/* Direction is special */ +#ifdef PPC970 + li PREC, -4 * SIZE +#endif +#ifdef POWER4 + li PREC, -4 * SIZE +#endif +#ifdef POWER5 + li PREC, -4 * SIZE +#endif +#ifdef CELL + li PREC, -4 * SIZE +#endif +#else +/* Normal prefetch */ +#ifdef PPC970 + li PREC, 4 * SIZE +#endif +#ifdef POWER4 + li PREC, 4 * SIZE /* is 12 best? */ +#endif +#ifdef POWER5 + li PREC, 3 * SIZE +#endif +#endif + +#else + +#ifdef linux +#ifndef __64BIT__ + mr PREA, r10 + lwz PREB, 8 + STACKSIZE(SP) + lwz PREC, 12 + STACKSIZE(SP) +#else + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 60 + STACKSIZE(SP) + lwz PREB, 64 + STACKSIZE(SP) + lwz PREC, 68 + STACKSIZE(SP) +#else + lwz PREA, 56 + STACKSIZE(SP) + lwz PREB, 60 + STACKSIZE(SP) + lwz PREC, 64 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST +#ifdef PPC970 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 5 * SIZE | 1) + li PREB, (16 * 5 * SIZE | 3) +#else + li PREA, (16 * 14 * SIZE | 1) + li PREB, (16 * 8 * SIZE | 3) +#endif +#endif +#ifdef POWER4 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 1 * SIZE + 16) + li PREB, (16 * 1 * SIZE + 16) +#else + li PREA, (16 * 2 * SIZE + 16) + li PREB, (16 * 2 * SIZE + 16) +#endif +#endif +#ifdef POWER5 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 7 * SIZE | 1) + li PREB, (16 * 7 * SIZE | 3) +#else + li PREA, (16 * 12 * SIZE | 1) + li PREB, (16 * 6 * SIZE | 3) +#endif +#endif +#ifdef CELL + li PREA, (16 * 12 * SIZE) + li PREB, (16 * 12 * SIZE) +#endif +#endif + + lfs f0, FZERO + + srawi. J, N, 2 + ble LL(40) + .align 4 + +LL(10): + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 2 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + ble LL(20) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + dcbt CO2, PREC + dcbt CO3, PREC + dcbt CO4, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + dcbt CO2, PREC + dcbt CO3, PREC + dcbt CO4, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +LL(12): + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f24, f28, f0 + FMADD f5, f25, f29, f5 + FMADD f10, f26, f30, f10 + FMADD f15, f27, f31, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f1, f25, f28, f1 + FMADD f2, f26, f28, f2 + FMADD f3, f27, f28, f3 + FMADD f4, f24, f29, f4 + + FMADD f6, f26, f29, f6 + FMADD f7, f27, f29, f7 + FMADD f8, f24, f30, f8 + FMADD f9, f25, f30, f9 + + FMADD f11, f27, f30, f11 + FMADD f12, f24, f31, f12 + FMADD f13, f25, f31, f13 + FMADD f14, f26, f31, f14 + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f0, f24, f28, f0 + FMADD f5, f25, f29, f5 + FMADD f10, f26, f30, f10 + FMADD f15, f27, f31, f15 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + FMADD f1, f25, f28, f1 + FMADD f2, f26, f28, f2 + FMADD f3, f27, f28, f3 + FMADD f4, f24, f29, f4 + + FMADD f6, f26, f29, f6 + FMADD f7, f27, f29, f7 + FMADD f8, f24, f30, f8 + FMADD f9, f25, f30, f9 + + FMADD f11, f27, f30, f11 + FMADD f12, f24, f31, f12 + FMADD f13, f25, f31, f13 + FMADD f14, f26, f31, f14 + + addi AO, AO, 16 * SIZE + addi BO, BO, 16 * SIZE + +#ifdef PPC970 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER4 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER5 + DCBT(AO, PREA) + DCBT(BO, PREB) +#endif + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(18) + .align 4 + +LL(16): + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(18): +#if defined(LN) || defined(RT) + subi r0, KK, 4 + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f10, f26, f10 + FSUB f14, f27, f14 + + FSUB f3, f28, f3 + FSUB f7, f29, f7 + FSUB f11, f30, f11 + FSUB f15, f31, f15 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FMUL f11, f16, f11 + FMUL f15, f16, f15 + + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f10, f17, f11, f10 + FNMSUB f14, f17, f15, f14 + + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f9, f18, f11, f9 + FNMSUB f13, f18, f15, f13 + + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + FNMSUB f8, f19, f11, f8 + FNMSUB f12, f19, f15, f12 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FMUL f10, f16, f10 + FMUL f14, f16, f14 + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f9, f17, f10, f9 + FNMSUB f13, f17, f14, f13 + + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + FNMSUB f8, f18, f10, f8 + FNMSUB f12, f18, f14, f12 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f10, f18, f8, f10 + FNMSUB f14, f18, f12, f14 + + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + FNMSUB f11, f19, f8, f11 + FNMSUB f15, f19, f12, f15 + + LFD f16, 5 * SIZE(AO) + LFD f17, 6 * SIZE(AO) + LFD f18, 7 * SIZE(AO) + LFD f19, 10 * SIZE(AO) + + FMUL f1, f16, f1 + FMUL f5, f16, f5 + FMUL f9, f16, f9 + FMUL f13, f16, f13 + + LFD f20, 11 * SIZE(AO) + LFD f21, 15 * SIZE(AO) + + FNMSUB f2, f17, f1, f2 + FNMSUB f6, f17, f5, f6 + FNMSUB f10, f17, f9, f10 + FNMSUB f14, f17, f13, f14 + + FNMSUB f3, f18, f1, f3 + FNMSUB f7, f18, f5, f7 + FNMSUB f11, f18, f9, f11 + FNMSUB f15, f18, f13, f15 + + FMUL f2, f19, f2 + FMUL f6, f19, f6 + FMUL f10, f19, f10 + FMUL f14, f19, f14 + + FNMSUB f3, f20, f2, f3 + FNMSUB f7, f20, f6, f7 + FNMSUB f11, f20, f10, f11 + FNMSUB f15, f20, f14, f15 + + FMUL f3, f21, f3 + FMUL f7, f21, f7 + FMUL f11, f21, f11 + FMUL f15, f21, f15 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + FNMSUB f14, f19, f2, f14 + FNMSUB f15, f19, f3, f15 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FMUL f6, f16, f6 + FMUL f7, f16, f7 + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f10, f17, f6, f10 + FNMSUB f11, f17, f7, f11 + + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + FNMSUB f14, f18, f6, f14 + FNMSUB f15, f18, f7, f15 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FMUL f10, f19, f10 + FMUL f11, f19, f11 + + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FNMSUB f14, f20, f10, f14 + FNMSUB f15, f20, f11, f15 + + FMUL f12, f21, f12 + FMUL f13, f21, f13 + FMUL f14, f21, f14 + FMUL f15, f21, f15 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FMUL f14, f16, f14 + FMUL f15, f16, f15 + + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f10, f17, f14, f10 + FNMSUB f11, f17, f15, f11 + + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f6, f18, f14, f6 + FNMSUB f7, f18, f15, f7 + + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + FNMSUB f2, f19, f14, f2 + FNMSUB f3, f19, f15, f3 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FMUL f10, f16, f10 + FMUL f11, f16, f11 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f6, f17, f10, f6 + FNMSUB f7, f17, f11, f7 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f6, 9 * SIZE(BO) + STFD f10, 10 * SIZE(BO) + STFD f14, 11 * SIZE(BO) + + STFD f3, 12 * SIZE(BO) + STFD f7, 13 * SIZE(BO) + STFD f11, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. I, M, 2 + ble LL(30) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 5 + +LL(22): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(22) + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(28) + .align 4 + +LL(26): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(28): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + FMUL f9, f17, f9 + FMUL f13, f17, f13 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FMUL f12, f21, f12 + FMUL f13, f21, f13 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(30): + andi. I, M, 1 + ble LL(39) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 5 + +LL(32): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f1, f17, f24, f1 + FMADD f5, f17, f25, f5 + FMADD f9, f17, f26, f9 + FMADD f13, f17, f27, f13 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f0, f18, f20, f0 + FMADD f4, f18, f21, f4 + FMADD f8, f18, f22, f8 + FMADD f12, f18, f23, f12 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f1, f19, f24, f1 + FMADD f5, f19, f25, f5 + FMADD f9, f19, f26, f9 + FMADD f13, f19, f27, f13 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(32) + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(36) + .align 4 + +LL(38): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + LFD f24, 2 * SIZE(AO) + LFD f28, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f4, f20, f4 + FSUB f8, f24, f8 + FSUB f12, f28, f12 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f4, f17, f0, f4 + FNMSUB f8, f18, f0, f8 + FNMSUB f12, f19, f0, f12 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FNMSUB f8, f17, f4, f8 + FNMSUB f12, f18, f4, f12 + FMUL f8, f19, f8 + FNMSUB f12, f20, f8, f12 + FMUL f12, f21, f12 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FNMSUB f8, f17, f12, f8 + FNMSUB f4, f18, f12, f4 + FNMSUB f0, f19, f12, f0 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f0, f18, f8, f0 + + FMUL f4, f19, f4 + FNMSUB f0, f20, f4, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f4, 1 * SIZE(AO) + STFD f8, 2 * SIZE(AO) + STFD f12, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE + addi CO3, CO3, 1 * SIZE + addi CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(39): +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + lfs f0, FZERO + bgt LL(10) + .align 4 + +LL(40): + andi. J, N, 2 + ble LL(70) + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble LL(50) + .align 4 + +LL(41): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 5 + +LL(42): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(42) + .align 4 + +LL(45): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(48) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(48): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f28, 6 * SIZE(BO) + LFD f29, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f1, f20, f1 + FSUB f5, f21, f5 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f3, f28, f3 + FSUB f7, f29, f7 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FMUL f0, f21, f0 + FMUL f4, f21, f4 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + + FNMSUB f2, f18, f1, f2 + FNMSUB f6, f18, f5, f6 + + FNMSUB f3, f19, f1, f3 + FNMSUB f7, f19, f5, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FMUL f6, f18, f6 + + FNMSUB f3, f19, f2, f3 + FNMSUB f7, f19, f6, f7 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 + FMUL f7, f19, f7 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FMUL f4, f18, f4 + FMUL f5, f18, f5 + FMUL f6, f18, f6 + FMUL f7, f18, f7 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f6, 5 * SIZE(BO) + STFD f3, 6 * SIZE(BO) + STFD f7, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(41) + .align 4 + +LL(50): + andi. I, M, 2 + ble LL(60) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(55) + .align 5 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f22, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f17, f24, f1 + FMADD f2, f16, f25, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f19, f26, f5 + FMADD f6, f18, f27, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(52) + .align 4 + +LL(55): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(58) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(56) + .align 4 + +LL(58): + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f2, f17, f2 + FSUB f1, f20, f1 + FSUB f3, f21, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f3, f19, f3 + + FNMSUB f0, f20, f1, f0 + FNMSUB f2, f20, f3, f2 + + FMUL f0, f21, f0 + FMUL f2, f21, f2 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f2, f16, f2 + FNMSUB f1, f17, f0, f1 + FNMSUB f3, f17, f2, f3 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f3, f17, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + + FNMSUB f2, f17, f0, f2 + FNMSUB f3, f17, f1, f3 + FMUL f2, f18, f2 + FMUL f3, f18, f3 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f2, f19, f2 + FMUL f3, f19, f3 + FNMSUB f0, f20, f2, f0 + FNMSUB f1, f20, f3, f1 + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f2, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(60): + andi. I, M, 1 + ble LL(69) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(65) + .align 5 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f17, f22, f2 + FMADD f3, f17, f23, f3 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f19, f26, f2 + FMADD f3, f19, f27, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(68) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(66) + .align 4 + +LL(68): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FMUL f1, f18, f1 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(69): +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + lfs f0, FZERO + .align 4 + +LL(70): + andi. J, N, 1 + ble LL(999) + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO1, LDC +#endif + ble LL(80) + .align 4 + +LL(71): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(75) + .align 5 + +LL(72): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f21, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f23, f0 + FMADD f1, f17, f23, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(72) + .align 4 + +LL(75): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(78) + .align 4 + +LL(76): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(76) + .align 4 + +LL(78): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + LFD f24, 2 * SIZE(BO) + LFD f28, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 + FSUB f2, f24, f2 + FSUB f3, f28, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FNMSUB f2, f17, f3, f2 + FNMSUB f1, f18, f3, f1 + FNMSUB f0, f19, f3, f0 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FNMSUB f1, f17, f2, f1 + FNMSUB f0, f18, f2, f0 + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f19, f0, f3 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FNMSUB f2, f18, f1, f2 + FNMSUB f3, f19, f1, f3 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FNMSUB f3, f19, f2, f3 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(71) + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(90) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(85) + .align 5 + +LL(82): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(82) + .align 4 + +LL(85): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(88) + .align 4 + +LL(86): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(86) + .align 4 + +LL(88): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + + LFD f17, 3 * SIZE(AO) + FMUL f1, f17, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(999) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(95) + .align 5 + +LL(92): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(92) + .align 4 + +LL(95): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ LL(98) + .align 4 + +LL(96): + FMADD f0, f16, f20, f0 + LFD f16, 1 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + addi BO, BO, 1 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(96) + .align 4 + +LL(98): + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + FSUB f0, f16, f0 +#else + LFD f16, 0 * SIZE(AO) + FSUB f0, f16, f0 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + FMUL f0, f16, f0 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + FMUL f0, f16, f0 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/trsm_kernel_RT.S b/kernel/power/trsm_kernel_RT.S new file mode 100644 index 0000000..533f299 --- /dev/null +++ b/kernel/power/trsm_kernel_RT.S @@ -0,0 +1,3679 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREB r30 +#define PREC r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) +#endif + + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST +#if defined(TRSMKERNEL) && defined(LN) +/* Direction is special */ +#ifdef PPC970 + li PREC, -4 * SIZE +#endif +#ifdef POWER4 + li PREC, -4 * SIZE +#endif +#ifdef POWER5 + li PREC, -4 * SIZE +#endif +#else +/* Normal prefetch */ +#ifdef PPC970 + li PREC, 4 * SIZE +#endif +#ifdef POWER4 + li PREC, 4 * SIZE /* is 12 best? */ +#endif +#ifdef POWER5 + li PREC, 3 * SIZE +#endif +#endif + +#else + +#ifdef linux +#ifndef __64BIT__ + mr PREA, r10 + lwz PREB, 8 + STACKSIZE(SP) + lwz PREC, 12 + STACKSIZE(SP) +#else + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 60 + STACKSIZE(SP) + lwz PREB, 64 + STACKSIZE(SP) + lwz PREC, 68 + STACKSIZE(SP) +#else + lwz PREA, 56 + STACKSIZE(SP) + lwz PREB, 60 + STACKSIZE(SP) + lwz PREC, 64 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST +#ifdef PPC970 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 5 * SIZE | 1) + li PREB, (16 * 5 * SIZE | 3) +#else + li PREA, (16 * 14 * SIZE | 1) + li PREB, (16 * 8 * SIZE | 3) +#endif +#endif +#ifdef POWER4 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 1 * SIZE + 16) + li PREB, (16 * 1 * SIZE + 16) +#else + li PREA, (16 * 2 * SIZE + 16) + li PREB, (16 * 2 * SIZE + 16) +#endif +#endif +#ifdef POWER5 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 7 * SIZE | 1) + li PREB, (16 * 7 * SIZE | 3) +#else + li PREA, (16 * 12 * SIZE | 1) + li PREB, (16 * 6 * SIZE | 3) +#endif +#endif +#endif + lfs f0, FZERO + +LL(70): + andi. J, N, 1 + ble LL(40) + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO1, LDC +#endif + ble LL(80) + .align 4 + +LL(71): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(75) + .align 5 + +LL(72): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f21, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f23, f0 + FMADD f1, f17, f23, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(72) + .align 4 + +LL(75): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(78) + .align 4 + +LL(76): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(76) + .align 4 + +LL(78): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + LFD f24, 2 * SIZE(BO) + LFD f28, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 + FSUB f2, f24, f2 + FSUB f3, f28, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FNMSUB f2, f17, f3, f2 + FNMSUB f1, f18, f3, f1 + FNMSUB f0, f19, f3, f0 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FNMSUB f1, f17, f2, f1 + FNMSUB f0, f18, f2, f0 + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f19, f0, f3 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FNMSUB f2, f18, f1, f2 + FNMSUB f3, f19, f1, f3 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FNMSUB f3, f19, f2, f3 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(71) + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(90) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(85) + .align 5 + +LL(82): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(82) + .align 4 + +LL(85): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(88) + .align 4 + +LL(86): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(86) + .align 4 + +LL(88): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + + LFD f17, 3 * SIZE(AO) + FMUL f1, f17, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(99) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(95) + .align 5 + +LL(92): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(92) + .align 4 + +LL(95): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ LL(98) + .align 4 + +LL(96): + FMADD f0, f16, f20, f0 + LFD f16, 1 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + addi BO, BO, 1 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(96) + .align 4 + +LL(98): + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + FSUB f0, f16, f0 +#else + LFD f16, 0 * SIZE(AO) + FSUB f0, f16, f0 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + FMUL f0, f16, f0 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + FMUL f0, f16, f0 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + + lfs f0, FZERO + +#ifndef LN + addi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(99): +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +LL(40): + andi. J, N, 2 + ble LL(09) + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble LL(50) + .align 4 + +LL(41): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 5 + +LL(42): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(42) + .align 4 + +LL(45): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(48) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(48): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f28, 6 * SIZE(BO) + LFD f29, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f1, f20, f1 + FSUB f5, f21, f5 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f3, f28, f3 + FSUB f7, f29, f7 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FMUL f0, f21, f0 + FMUL f4, f21, f4 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + + FNMSUB f2, f18, f1, f2 + FNMSUB f6, f18, f5, f6 + + FNMSUB f3, f19, f1, f3 + FNMSUB f7, f19, f5, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FMUL f6, f18, f6 + + FNMSUB f3, f19, f2, f3 + FNMSUB f7, f19, f6, f7 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 + FMUL f7, f19, f7 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FMUL f4, f18, f4 + FMUL f5, f18, f5 + FMUL f6, f18, f6 + FMUL f7, f18, f7 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f6, 5 * SIZE(BO) + STFD f3, 6 * SIZE(BO) + STFD f7, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(41) + .align 4 + +LL(50): + andi. I, M, 2 + ble LL(60) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(55) + .align 5 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f22, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f17, f24, f1 + FMADD f2, f16, f25, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f19, f26, f5 + FMADD f6, f18, f27, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(52) + .align 4 + +LL(55): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(58) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(56) + .align 4 + +LL(58): + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f2, f17, f2 + FSUB f1, f20, f1 + FSUB f3, f21, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f3, f19, f3 + + FNMSUB f0, f20, f1, f0 + FNMSUB f2, f20, f3, f2 + + FMUL f0, f21, f0 + FMUL f2, f21, f2 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f2, f16, f2 + FNMSUB f1, f17, f0, f1 + FNMSUB f3, f17, f2, f3 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f3, f17, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + + FNMSUB f2, f17, f0, f2 + FNMSUB f3, f17, f1, f3 + FMUL f2, f18, f2 + FMUL f3, f18, f3 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f2, f19, f2 + FMUL f3, f19, f3 + FNMSUB f0, f20, f2, f0 + FNMSUB f1, f20, f3, f1 + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f2, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(60): + andi. I, M, 1 + ble LL(69) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(65) + .align 5 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f17, f22, f2 + FMADD f3, f17, f23, f3 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f19, f26, f2 + FMADD f3, f19, f27, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(68) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(66) + .align 4 + +LL(68): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FMUL f1, f18, f1 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(69): +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + lfs f0, FZERO + .align 4 + +LL(09): + srawi. J, N, 2 + ble LL(999) + .align 4 + +LL(10): + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 2 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + ble LL(20) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + dcbt CO2, PREC + dcbt CO3, PREC + dcbt CO4, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + dcbt CO2, PREC + dcbt CO3, PREC + dcbt CO4, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +LL(12): + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f24, f28, f0 + FMADD f5, f25, f29, f5 + FMADD f10, f26, f30, f10 + FMADD f15, f27, f31, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f1, f25, f28, f1 + FMADD f2, f26, f28, f2 + FMADD f3, f27, f28, f3 + FMADD f4, f24, f29, f4 + + FMADD f6, f26, f29, f6 + FMADD f7, f27, f29, f7 + FMADD f8, f24, f30, f8 + FMADD f9, f25, f30, f9 + + FMADD f11, f27, f30, f11 + FMADD f12, f24, f31, f12 + FMADD f13, f25, f31, f13 + FMADD f14, f26, f31, f14 + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f0, f24, f28, f0 + FMADD f5, f25, f29, f5 + FMADD f10, f26, f30, f10 + FMADD f15, f27, f31, f15 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + FMADD f1, f25, f28, f1 + FMADD f2, f26, f28, f2 + FMADD f3, f27, f28, f3 + FMADD f4, f24, f29, f4 + + FMADD f6, f26, f29, f6 + FMADD f7, f27, f29, f7 + FMADD f8, f24, f30, f8 + FMADD f9, f25, f30, f9 + + FMADD f11, f27, f30, f11 + FMADD f12, f24, f31, f12 + FMADD f13, f25, f31, f13 + FMADD f14, f26, f31, f14 + + addi AO, AO, 16 * SIZE + addi BO, BO, 16 * SIZE + +#ifdef PPC970 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER4 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER5 + DCBT(AO, PREA) + DCBT(BO, PREB) +#endif + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(18) + .align 4 + +LL(16): + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(18): +#if defined(LN) || defined(RT) + subi r0, KK, 4 + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f10, f26, f10 + FSUB f14, f27, f14 + + FSUB f3, f28, f3 + FSUB f7, f29, f7 + FSUB f11, f30, f11 + FSUB f15, f31, f15 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FMUL f11, f16, f11 + FMUL f15, f16, f15 + + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f10, f17, f11, f10 + FNMSUB f14, f17, f15, f14 + + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f9, f18, f11, f9 + FNMSUB f13, f18, f15, f13 + + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + FNMSUB f8, f19, f11, f8 + FNMSUB f12, f19, f15, f12 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FMUL f10, f16, f10 + FMUL f14, f16, f14 + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f9, f17, f10, f9 + FNMSUB f13, f17, f14, f13 + + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + FNMSUB f8, f18, f10, f8 + FNMSUB f12, f18, f14, f12 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f10, f18, f8, f10 + FNMSUB f14, f18, f12, f14 + + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + FNMSUB f11, f19, f8, f11 + FNMSUB f15, f19, f12, f15 + + LFD f16, 5 * SIZE(AO) + LFD f17, 6 * SIZE(AO) + LFD f18, 7 * SIZE(AO) + LFD f19, 10 * SIZE(AO) + + FMUL f1, f16, f1 + FMUL f5, f16, f5 + FMUL f9, f16, f9 + FMUL f13, f16, f13 + + LFD f20, 11 * SIZE(AO) + LFD f21, 15 * SIZE(AO) + + FNMSUB f2, f17, f1, f2 + FNMSUB f6, f17, f5, f6 + FNMSUB f10, f17, f9, f10 + FNMSUB f14, f17, f13, f14 + + FNMSUB f3, f18, f1, f3 + FNMSUB f7, f18, f5, f7 + FNMSUB f11, f18, f9, f11 + FNMSUB f15, f18, f13, f15 + + FMUL f2, f19, f2 + FMUL f6, f19, f6 + FMUL f10, f19, f10 + FMUL f14, f19, f14 + + FNMSUB f3, f20, f2, f3 + FNMSUB f7, f20, f6, f7 + FNMSUB f11, f20, f10, f11 + FNMSUB f15, f20, f14, f15 + + FMUL f3, f21, f3 + FMUL f7, f21, f7 + FMUL f11, f21, f11 + FMUL f15, f21, f15 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + FNMSUB f14, f19, f2, f14 + FNMSUB f15, f19, f3, f15 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FMUL f6, f16, f6 + FMUL f7, f16, f7 + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f10, f17, f6, f10 + FNMSUB f11, f17, f7, f11 + + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + FNMSUB f14, f18, f6, f14 + FNMSUB f15, f18, f7, f15 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FMUL f10, f19, f10 + FMUL f11, f19, f11 + + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FNMSUB f14, f20, f10, f14 + FNMSUB f15, f20, f11, f15 + + FMUL f12, f21, f12 + FMUL f13, f21, f13 + FMUL f14, f21, f14 + FMUL f15, f21, f15 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FMUL f14, f16, f14 + FMUL f15, f16, f15 + + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f10, f17, f14, f10 + FNMSUB f11, f17, f15, f11 + + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f6, f18, f14, f6 + FNMSUB f7, f18, f15, f7 + + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + FNMSUB f2, f19, f14, f2 + FNMSUB f3, f19, f15, f3 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FMUL f10, f16, f10 + FMUL f11, f16, f11 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f6, f17, f10, f6 + FNMSUB f7, f17, f11, f7 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f6, 9 * SIZE(BO) + STFD f10, 10 * SIZE(BO) + STFD f14, 11 * SIZE(BO) + + STFD f3, 12 * SIZE(BO) + STFD f7, 13 * SIZE(BO) + STFD f11, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. I, M, 2 + ble LL(30) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 5 + +LL(22): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(22) + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(28) + .align 4 + +LL(26): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(28): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + FMUL f9, f17, f9 + FMUL f13, f17, f13 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FMUL f12, f21, f12 + FMUL f13, f21, f13 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(30): + andi. I, M, 1 + ble LL(39) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 5 + +LL(32): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f1, f17, f24, f1 + FMADD f5, f17, f25, f5 + FMADD f9, f17, f26, f9 + FMADD f13, f17, f27, f13 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f0, f18, f20, f0 + FMADD f4, f18, f21, f4 + FMADD f8, f18, f22, f8 + FMADD f12, f18, f23, f12 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f1, f19, f24, f1 + FMADD f5, f19, f25, f5 + FMADD f9, f19, f26, f9 + FMADD f13, f19, f27, f13 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(32) + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(36) + .align 4 + +LL(38): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + LFD f24, 2 * SIZE(AO) + LFD f28, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f4, f20, f4 + FSUB f8, f24, f8 + FSUB f12, f28, f12 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f4, f17, f0, f4 + FNMSUB f8, f18, f0, f8 + FNMSUB f12, f19, f0, f12 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FNMSUB f8, f17, f4, f8 + FNMSUB f12, f18, f4, f12 + FMUL f8, f19, f8 + FNMSUB f12, f20, f8, f12 + FMUL f12, f21, f12 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FNMSUB f8, f17, f12, f8 + FNMSUB f4, f18, f12, f4 + FNMSUB f0, f19, f12, f0 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f0, f18, f8, f0 + + FMUL f4, f19, f4 + FNMSUB f0, f20, f4, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f4, 1 * SIZE(AO) + STFD f8, 2 * SIZE(AO) + STFD f12, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE + addi CO3, CO3, 1 * SIZE + addi CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(39): +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + lfs f0, FZERO + bgt LL(10) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/trsm_kernel_cell_LN.S b/kernel/power/trsm_kernel_cell_LN.S new file mode 100644 index 0000000..179db31 --- /dev/null +++ b/kernel/power/trsm_kernel_cell_LN.S @@ -0,0 +1,3666 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREB r30 +#define PREC r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) +#endif + + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST + li PREC, -4 * SIZE +#else + +#ifdef linux +#ifndef __64BIT__ + mr PREA, r10 + lwz PREB, 8 + STACKSIZE(SP) + lwz PREC, 12 + STACKSIZE(SP) +#else + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 60 + STACKSIZE(SP) + lwz PREB, 64 + STACKSIZE(SP) + lwz PREC, 68 + STACKSIZE(SP) +#else + lwz PREA, 56 + STACKSIZE(SP) + lwz PREB, 60 + STACKSIZE(SP) + lwz PREC, 64 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST +#ifdef PPC970 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 5 * SIZE | 1) + li PREB, (16 * 5 * SIZE | 3) +#else + li PREA, (16 * 14 * SIZE | 1) + li PREB, (16 * 8 * SIZE | 3) +#endif +#endif +#ifdef POWER4 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 1 * SIZE + 16) + li PREB, (16 * 1 * SIZE + 16) +#else + li PREA, (16 * 2 * SIZE + 16) + li PREB, (16 * 2 * SIZE + 16) +#endif +#endif +#ifdef POWER5 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 7 * SIZE | 1) + li PREB, (16 * 7 * SIZE | 3) +#else + li PREA, (16 * 12 * SIZE | 1) + li PREB, (16 * 6 * SIZE | 3) +#endif +#endif +#ifdef CELL + li PREA, (16 * 12 * SIZE) + li PREB, (16 * 12 * SIZE) +#endif +#endif + + lfs f0, FZERO + + srawi. J, N, 2 + ble LL(40) + .align 4 + +LL(10): + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + +LL(30): + andi. I, M, 1 + ble LL(20) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 5 + +LL(32): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f1, f17, f24, f1 + FMADD f5, f17, f25, f5 + FMADD f9, f17, f26, f9 + FMADD f13, f17, f27, f13 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f0, f18, f20, f0 + FMADD f4, f18, f21, f4 + FMADD f8, f18, f22, f8 + FMADD f12, f18, f23, f12 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f1, f19, f24, f1 + FMADD f5, f19, f25, f5 + FMADD f9, f19, f26, f9 + FMADD f13, f19, f27, f13 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(32) + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(36) + .align 4 + +LL(38): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + LFD f24, 2 * SIZE(AO) + LFD f28, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f4, f20, f4 + FSUB f8, f24, f8 + FSUB f12, f28, f12 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f4, f17, f0, f4 + FNMSUB f8, f18, f0, f8 + FNMSUB f12, f19, f0, f12 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FNMSUB f8, f17, f4, f8 + FNMSUB f12, f18, f4, f12 + FMUL f8, f19, f8 + FNMSUB f12, f20, f8, f12 + FMUL f12, f21, f12 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FNMSUB f8, f17, f12, f8 + FNMSUB f4, f18, f12, f4 + FNMSUB f0, f19, f12, f0 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f0, f18, f8, f0 + + FMUL f4, f19, f4 + FNMSUB f0, f20, f4, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f4, 1 * SIZE(AO) + STFD f8, 2 * SIZE(AO) + STFD f12, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE + addi CO3, CO3, 1 * SIZE + addi CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(20): + andi. I, M, 2 + ble LL(09) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 5 + +LL(22): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(22) + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(28) + .align 4 + +LL(26): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(28): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + FMUL f9, f17, f9 + FMUL f13, f17, f13 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FMUL f12, f21, f12 + FMUL f13, f21, f13 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(09): + srawi. I, M, 2 + ble LL(39) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + dcbt CO2, PREC + dcbt CO3, PREC + dcbt CO4, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +#define NOP1 mr r18, r18 +#define NOP2 mr r19, r19 + +LL(12): + FMADD f0, f16, f20, f0 + dcbt AO, PREA + FMADD f4, f16, f21, f4 + dcbt BO, PREB + FMADD f8, f16, f22, f8 + LFD f31, 7 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f27, 7 * SIZE(AO) + + FMADD f1, f17, f20, f1 + LFD f16, 8 * SIZE(AO) + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 9 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 10 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 8 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 9 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 10 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 11 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 11 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 12 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 13 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 14 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 12 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 13 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 14 * SIZE(BO) + FMADD f15, f27, f31, f15 + LFD f27, 15 * SIZE(AO) + + FMADD f0, f16, f20, f0 + LFD f31, 15 * SIZE(BO) + FMADD f4, f16, f21, f4 + NOP2 + FMADD f8, f16, f22, f8 + NOP1 + FMADD f12, f16, f23, f12 + LFD f16, 16 * SIZE(AO) + + FMADD f1, f17, f20, f1 + NOP1 + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 17 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 18 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 16 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 17 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 18 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 19 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 19 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 20 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 21 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 22 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 20 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 21 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 22 * SIZE(BO) + FMADD f15, f27, f31, f15 + addi AO, AO, 16 * SIZE + + addi BO, BO, 16 * SIZE + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(18) + .align 4 + +LL(16): + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(18): +#if defined(LN) || defined(RT) + subi r0, KK, 4 + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f10, f26, f10 + FSUB f14, f27, f14 + + FSUB f3, f28, f3 + FSUB f7, f29, f7 + FSUB f11, f30, f11 + FSUB f15, f31, f15 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FMUL f11, f16, f11 + FMUL f15, f16, f15 + + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f10, f17, f11, f10 + FNMSUB f14, f17, f15, f14 + + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f9, f18, f11, f9 + FNMSUB f13, f18, f15, f13 + + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + FNMSUB f8, f19, f11, f8 + FNMSUB f12, f19, f15, f12 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FMUL f10, f16, f10 + FMUL f14, f16, f14 + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f9, f17, f10, f9 + FNMSUB f13, f17, f14, f13 + + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + FNMSUB f8, f18, f10, f8 + FNMSUB f12, f18, f14, f12 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f10, f18, f8, f10 + FNMSUB f14, f18, f12, f14 + + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + FNMSUB f11, f19, f8, f11 + FNMSUB f15, f19, f12, f15 + + LFD f16, 5 * SIZE(AO) + LFD f17, 6 * SIZE(AO) + LFD f18, 7 * SIZE(AO) + LFD f19, 10 * SIZE(AO) + + FMUL f1, f16, f1 + FMUL f5, f16, f5 + FMUL f9, f16, f9 + FMUL f13, f16, f13 + + LFD f20, 11 * SIZE(AO) + LFD f21, 15 * SIZE(AO) + + FNMSUB f2, f17, f1, f2 + FNMSUB f6, f17, f5, f6 + FNMSUB f10, f17, f9, f10 + FNMSUB f14, f17, f13, f14 + + FNMSUB f3, f18, f1, f3 + FNMSUB f7, f18, f5, f7 + FNMSUB f11, f18, f9, f11 + FNMSUB f15, f18, f13, f15 + + FMUL f2, f19, f2 + FMUL f6, f19, f6 + FMUL f10, f19, f10 + FMUL f14, f19, f14 + + FNMSUB f3, f20, f2, f3 + FNMSUB f7, f20, f6, f7 + FNMSUB f11, f20, f10, f11 + FNMSUB f15, f20, f14, f15 + + FMUL f3, f21, f3 + FMUL f7, f21, f7 + FMUL f11, f21, f11 + FMUL f15, f21, f15 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + FNMSUB f14, f19, f2, f14 + FNMSUB f15, f19, f3, f15 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FMUL f6, f16, f6 + FMUL f7, f16, f7 + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f10, f17, f6, f10 + FNMSUB f11, f17, f7, f11 + + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + FNMSUB f14, f18, f6, f14 + FNMSUB f15, f18, f7, f15 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FMUL f10, f19, f10 + FMUL f11, f19, f11 + + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FNMSUB f14, f20, f10, f14 + FNMSUB f15, f20, f11, f15 + + FMUL f12, f21, f12 + FMUL f13, f21, f13 + FMUL f14, f21, f14 + FMUL f15, f21, f15 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FMUL f14, f16, f14 + FMUL f15, f16, f15 + + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f10, f17, f14, f10 + FNMSUB f11, f17, f15, f11 + + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f6, f18, f14, f6 + FNMSUB f7, f18, f15, f7 + + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + FNMSUB f2, f19, f14, f2 + FNMSUB f3, f19, f15, f3 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FMUL f10, f16, f10 + FMUL f11, f16, f11 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f6, f17, f10, f6 + FNMSUB f7, f17, f11, f7 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f6, 9 * SIZE(BO) + STFD f10, 10 * SIZE(BO) + STFD f14, 11 * SIZE(BO) + + STFD f3, 12 * SIZE(BO) + STFD f7, 13 * SIZE(BO) + STFD f11, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + + +LL(39): +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + lfs f0, FZERO + bgt LL(10) + .align 4 + +LL(40): + andi. J, N, 2 + ble LL(70) + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + +LL(60): + andi. I, M, 1 + ble LL(50) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(65) + .align 5 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f17, f22, f2 + FMADD f3, f17, f23, f3 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f19, f26, f2 + FMADD f3, f19, f27, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(68) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(66) + .align 4 + +LL(68): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FMUL f1, f18, f1 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(50): + andi. I, M, 2 + ble LL(41) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(55) + .align 5 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f22, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f17, f24, f1 + FMADD f2, f16, f25, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f19, f26, f5 + FMADD f6, f18, f27, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(52) + .align 4 + +LL(55): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(58) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(56) + .align 4 + +LL(58): + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f2, f17, f2 + FSUB f1, f20, f1 + FSUB f3, f21, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f3, f19, f3 + + FNMSUB f0, f20, f1, f0 + FNMSUB f2, f20, f3, f2 + + FMUL f0, f21, f0 + FMUL f2, f21, f2 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f2, f16, f2 + FNMSUB f1, f17, f0, f1 + FNMSUB f3, f17, f2, f3 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f3, f17, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + + FNMSUB f2, f17, f0, f2 + FNMSUB f3, f17, f1, f3 + FMUL f2, f18, f2 + FMUL f3, f18, f3 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f2, f19, f2 + FMUL f3, f19, f3 + FNMSUB f0, f20, f2, f0 + FNMSUB f1, f20, f3, f1 + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f2, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(41): + srawi. I, M, 2 + ble LL(69) + .align 4 + +LL(42): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 5 + +LL(43): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(43) + .align 4 + +LL(45): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(48) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(48): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f28, 6 * SIZE(BO) + LFD f29, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f1, f20, f1 + FSUB f5, f21, f5 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f3, f28, f3 + FSUB f7, f29, f7 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FMUL f0, f21, f0 + FMUL f4, f21, f4 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + + FNMSUB f2, f18, f1, f2 + FNMSUB f6, f18, f5, f6 + + FNMSUB f3, f19, f1, f3 + FNMSUB f7, f19, f5, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FMUL f6, f18, f6 + + FNMSUB f3, f19, f2, f3 + FNMSUB f7, f19, f6, f7 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 + FMUL f7, f19, f7 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FMUL f4, f18, f4 + FMUL f5, f18, f5 + FMUL f6, f18, f6 + FMUL f7, f18, f7 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f6, 5 * SIZE(BO) + STFD f3, 6 * SIZE(BO) + STFD f7, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(42) + .align 4 + +LL(69): +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + lfs f0, FZERO + .align 4 + +LL(70): + andi. J, N, 1 + ble LL(999) + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO1, LDC +#endif + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(80) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(95) + .align 5 + +LL(92): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(92) + .align 4 + +LL(95): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ LL(98) + .align 4 + +LL(96): + FMADD f0, f16, f20, f0 + LFD f16, 1 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + addi BO, BO, 1 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(96) + .align 4 + +LL(98): + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + FSUB f0, f16, f0 +#else + LFD f16, 0 * SIZE(AO) + FSUB f0, f16, f0 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + FMUL f0, f16, f0 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + FMUL f0, f16, f0 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(71) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(85) + .align 5 + +LL(82): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(82) + .align 4 + +LL(85): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(88) + .align 4 + +LL(86): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(86) + .align 4 + +LL(88): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + + LFD f17, 3 * SIZE(AO) + FMUL f1, f17, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(71): + srawi. I, M, 2 + ble LL(999) + .align 4 + +LL(72): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(75) + .align 5 + +LL(73): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f21, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f23, f0 + FMADD f1, f17, f23, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(73) + .align 4 + +LL(75): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(78) + .align 4 + +LL(76): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(76) + .align 4 + +LL(78): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + LFD f24, 2 * SIZE(BO) + LFD f28, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 + FSUB f2, f24, f2 + FSUB f3, f28, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FNMSUB f2, f17, f3, f2 + FNMSUB f1, f18, f3, f1 + FNMSUB f0, f19, f3, f0 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FNMSUB f1, f17, f2, f1 + FNMSUB f0, f18, f2, f0 + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f19, f0, f3 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FNMSUB f2, f18, f1, f2 + FNMSUB f3, f19, f1, f3 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FNMSUB f3, f19, f2, f3 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(72) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/trsm_kernel_cell_LT.S b/kernel/power/trsm_kernel_cell_LT.S new file mode 100644 index 0000000..06b3d9e --- /dev/null +++ b/kernel/power/trsm_kernel_cell_LT.S @@ -0,0 +1,3680 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREB r30 +#define PREC r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) +#endif + + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST +#if defined(TRSMKERNEL) && defined(LN) +/* Direction is special */ +#ifdef PPC970 + li PREC, -4 * SIZE +#endif +#ifdef POWER4 + li PREC, -4 * SIZE +#endif +#ifdef POWER5 + li PREC, -4 * SIZE +#endif +#ifdef CELL + li PREC, -4 * SIZE +#endif +#else +/* Normal prefetch */ +#ifdef PPC970 + li PREC, 4 * SIZE +#endif +#ifdef POWER4 + li PREC, 4 * SIZE /* is 12 best? */ +#endif +#ifdef POWER5 + li PREC, 3 * SIZE +#endif +#endif + +#else + +#ifdef linux +#ifndef __64BIT__ + mr PREA, r10 + lwz PREB, 8 + STACKSIZE(SP) + lwz PREC, 12 + STACKSIZE(SP) +#else + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 60 + STACKSIZE(SP) + lwz PREB, 64 + STACKSIZE(SP) + lwz PREC, 68 + STACKSIZE(SP) +#else + lwz PREA, 56 + STACKSIZE(SP) + lwz PREB, 60 + STACKSIZE(SP) + lwz PREC, 64 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST +#ifdef PPC970 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 5 * SIZE | 1) + li PREB, (16 * 5 * SIZE | 3) +#else + li PREA, (16 * 14 * SIZE | 1) + li PREB, (16 * 8 * SIZE | 3) +#endif +#endif +#ifdef POWER4 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 1 * SIZE + 16) + li PREB, (16 * 1 * SIZE + 16) +#else + li PREA, (16 * 2 * SIZE + 16) + li PREB, (16 * 2 * SIZE + 16) +#endif +#endif +#ifdef POWER5 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 7 * SIZE | 1) + li PREB, (16 * 7 * SIZE | 3) +#else + li PREA, (16 * 12 * SIZE | 1) + li PREB, (16 * 6 * SIZE | 3) +#endif +#endif +#ifdef CELL + li PREA, (16 * 12 * SIZE) + li PREB, (16 * 12 * SIZE) +#endif +#endif + + lfs f0, FZERO + + srawi. J, N, 2 + ble LL(40) + .align 4 + +LL(10): + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 2 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + ble LL(20) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + + LFD f28, 4 * SIZE(B) + LFD f29, 5 * SIZE(B) + LFD f30, 6 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + dcbt CO2, PREC + dcbt CO3, PREC + dcbt CO4, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +#define NOP1 mr r18, r18 +#define NOP2 mr r19, r19 + +LL(12): + FMADD f0, f16, f20, f0 + dcbt AO, PREA + FMADD f4, f16, f21, f4 + dcbt BO, PREB + FMADD f8, f16, f22, f8 + LFD f31, 7 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f27, 7 * SIZE(AO) + + FMADD f1, f17, f20, f1 + LFD f16, 8 * SIZE(AO) + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 9 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 10 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 8 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 9 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 10 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 11 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 11 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 12 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 13 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 14 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 12 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 13 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 14 * SIZE(BO) + FMADD f15, f27, f31, f15 + LFD f27, 15 * SIZE(AO) + + FMADD f0, f16, f20, f0 + LFD f31, 15 * SIZE(BO) + FMADD f4, f16, f21, f4 + NOP2 + FMADD f8, f16, f22, f8 + NOP1 + FMADD f12, f16, f23, f12 + LFD f16, 16 * SIZE(AO) + + FMADD f1, f17, f20, f1 + NOP1 + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 17 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 18 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 16 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 17 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 18 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 19 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 19 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 20 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 21 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 22 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 20 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 21 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 22 * SIZE(BO) + FMADD f15, f27, f31, f15 + addi AO, AO, 16 * SIZE + + addi BO, BO, 16 * SIZE + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(18) + .align 4 + +LL(16): + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(18): +#if defined(LN) || defined(RT) + subi r0, KK, 4 + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f10, f26, f10 + FSUB f14, f27, f14 + + FSUB f3, f28, f3 + FSUB f7, f29, f7 + FSUB f11, f30, f11 + FSUB f15, f31, f15 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FMUL f11, f16, f11 + FMUL f15, f16, f15 + + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f10, f17, f11, f10 + FNMSUB f14, f17, f15, f14 + + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f9, f18, f11, f9 + FNMSUB f13, f18, f15, f13 + + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + FNMSUB f8, f19, f11, f8 + FNMSUB f12, f19, f15, f12 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FMUL f10, f16, f10 + FMUL f14, f16, f14 + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f9, f17, f10, f9 + FNMSUB f13, f17, f14, f13 + + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + FNMSUB f8, f18, f10, f8 + FNMSUB f12, f18, f14, f12 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f10, f18, f8, f10 + FNMSUB f14, f18, f12, f14 + + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + FNMSUB f11, f19, f8, f11 + FNMSUB f15, f19, f12, f15 + + LFD f16, 5 * SIZE(AO) + LFD f17, 6 * SIZE(AO) + LFD f18, 7 * SIZE(AO) + LFD f19, 10 * SIZE(AO) + + FMUL f1, f16, f1 + FMUL f5, f16, f5 + FMUL f9, f16, f9 + FMUL f13, f16, f13 + + LFD f20, 11 * SIZE(AO) + LFD f21, 15 * SIZE(AO) + + FNMSUB f2, f17, f1, f2 + FNMSUB f6, f17, f5, f6 + FNMSUB f10, f17, f9, f10 + FNMSUB f14, f17, f13, f14 + + FNMSUB f3, f18, f1, f3 + FNMSUB f7, f18, f5, f7 + FNMSUB f11, f18, f9, f11 + FNMSUB f15, f18, f13, f15 + + FMUL f2, f19, f2 + FMUL f6, f19, f6 + FMUL f10, f19, f10 + FMUL f14, f19, f14 + + FNMSUB f3, f20, f2, f3 + FNMSUB f7, f20, f6, f7 + FNMSUB f11, f20, f10, f11 + FNMSUB f15, f20, f14, f15 + + FMUL f3, f21, f3 + FMUL f7, f21, f7 + FMUL f11, f21, f11 + FMUL f15, f21, f15 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + FNMSUB f14, f19, f2, f14 + FNMSUB f15, f19, f3, f15 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FMUL f6, f16, f6 + FMUL f7, f16, f7 + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f10, f17, f6, f10 + FNMSUB f11, f17, f7, f11 + + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + FNMSUB f14, f18, f6, f14 + FNMSUB f15, f18, f7, f15 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FMUL f10, f19, f10 + FMUL f11, f19, f11 + + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FNMSUB f14, f20, f10, f14 + FNMSUB f15, f20, f11, f15 + + FMUL f12, f21, f12 + FMUL f13, f21, f13 + FMUL f14, f21, f14 + FMUL f15, f21, f15 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FMUL f14, f16, f14 + FMUL f15, f16, f15 + + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f10, f17, f14, f10 + FNMSUB f11, f17, f15, f11 + + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f6, f18, f14, f6 + FNMSUB f7, f18, f15, f7 + + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + FNMSUB f2, f19, f14, f2 + FNMSUB f3, f19, f15, f3 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FMUL f10, f16, f10 + FMUL f11, f16, f11 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f6, f17, f10, f6 + FNMSUB f7, f17, f11, f7 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f6, 9 * SIZE(BO) + STFD f10, 10 * SIZE(BO) + STFD f14, 11 * SIZE(BO) + + STFD f3, 12 * SIZE(BO) + STFD f7, 13 * SIZE(BO) + STFD f11, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. I, M, 2 + ble LL(30) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 5 + +LL(22): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(22) + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(28) + .align 4 + +LL(26): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(28): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + FMUL f9, f17, f9 + FMUL f13, f17, f13 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FMUL f12, f21, f12 + FMUL f13, f21, f13 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(30): + andi. I, M, 1 + ble LL(39) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 5 + +LL(32): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f1, f17, f24, f1 + FMADD f5, f17, f25, f5 + FMADD f9, f17, f26, f9 + FMADD f13, f17, f27, f13 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f0, f18, f20, f0 + FMADD f4, f18, f21, f4 + FMADD f8, f18, f22, f8 + FMADD f12, f18, f23, f12 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f1, f19, f24, f1 + FMADD f5, f19, f25, f5 + FMADD f9, f19, f26, f9 + FMADD f13, f19, f27, f13 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(32) + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(36) + .align 4 + +LL(38): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + LFD f24, 2 * SIZE(AO) + LFD f28, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f4, f20, f4 + FSUB f8, f24, f8 + FSUB f12, f28, f12 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f4, f17, f0, f4 + FNMSUB f8, f18, f0, f8 + FNMSUB f12, f19, f0, f12 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FNMSUB f8, f17, f4, f8 + FNMSUB f12, f18, f4, f12 + FMUL f8, f19, f8 + FNMSUB f12, f20, f8, f12 + FMUL f12, f21, f12 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FNMSUB f8, f17, f12, f8 + FNMSUB f4, f18, f12, f4 + FNMSUB f0, f19, f12, f0 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f0, f18, f8, f0 + + FMUL f4, f19, f4 + FNMSUB f0, f20, f4, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f4, 1 * SIZE(AO) + STFD f8, 2 * SIZE(AO) + STFD f12, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE + addi CO3, CO3, 1 * SIZE + addi CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(39): +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + lfs f0, FZERO + bgt LL(10) + .align 4 + +LL(40): + andi. J, N, 2 + ble LL(70) + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble LL(50) + .align 4 + +LL(41): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 5 + +LL(42): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(42) + .align 4 + +LL(45): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(48) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(48): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f28, 6 * SIZE(BO) + LFD f29, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f1, f20, f1 + FSUB f5, f21, f5 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f3, f28, f3 + FSUB f7, f29, f7 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FMUL f0, f21, f0 + FMUL f4, f21, f4 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + + FNMSUB f2, f18, f1, f2 + FNMSUB f6, f18, f5, f6 + + FNMSUB f3, f19, f1, f3 + FNMSUB f7, f19, f5, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FMUL f6, f18, f6 + + FNMSUB f3, f19, f2, f3 + FNMSUB f7, f19, f6, f7 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 + FMUL f7, f19, f7 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FMUL f4, f18, f4 + FMUL f5, f18, f5 + FMUL f6, f18, f6 + FMUL f7, f18, f7 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f6, 5 * SIZE(BO) + STFD f3, 6 * SIZE(BO) + STFD f7, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(41) + .align 4 + +LL(50): + andi. I, M, 2 + ble LL(60) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(55) + .align 5 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f22, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f17, f24, f1 + FMADD f2, f16, f25, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f19, f26, f5 + FMADD f6, f18, f27, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(52) + .align 4 + +LL(55): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(58) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(56) + .align 4 + +LL(58): + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f2, f17, f2 + FSUB f1, f20, f1 + FSUB f3, f21, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f3, f19, f3 + + FNMSUB f0, f20, f1, f0 + FNMSUB f2, f20, f3, f2 + + FMUL f0, f21, f0 + FMUL f2, f21, f2 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f2, f16, f2 + FNMSUB f1, f17, f0, f1 + FNMSUB f3, f17, f2, f3 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f3, f17, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + + FNMSUB f2, f17, f0, f2 + FNMSUB f3, f17, f1, f3 + FMUL f2, f18, f2 + FMUL f3, f18, f3 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f2, f19, f2 + FMUL f3, f19, f3 + FNMSUB f0, f20, f2, f0 + FNMSUB f1, f20, f3, f1 + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f2, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(60): + andi. I, M, 1 + ble LL(69) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(65) + .align 5 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f17, f22, f2 + FMADD f3, f17, f23, f3 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f19, f26, f2 + FMADD f3, f19, f27, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(68) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(66) + .align 4 + +LL(68): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FMUL f1, f18, f1 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(69): +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + lfs f0, FZERO + .align 4 + +LL(70): + andi. J, N, 1 + ble LL(999) + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO1, LDC +#endif + ble LL(80) + .align 4 + +LL(71): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(75) + .align 5 + +LL(72): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f21, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f23, f0 + FMADD f1, f17, f23, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(72) + .align 4 + +LL(75): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(78) + .align 4 + +LL(76): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(76) + .align 4 + +LL(78): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + LFD f24, 2 * SIZE(BO) + LFD f28, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 + FSUB f2, f24, f2 + FSUB f3, f28, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FNMSUB f2, f17, f3, f2 + FNMSUB f1, f18, f3, f1 + FNMSUB f0, f19, f3, f0 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FNMSUB f1, f17, f2, f1 + FNMSUB f0, f18, f2, f0 + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f19, f0, f3 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FNMSUB f2, f18, f1, f2 + FNMSUB f3, f19, f1, f3 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FNMSUB f3, f19, f2, f3 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(71) + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(90) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(85) + .align 5 + +LL(82): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(82) + .align 4 + +LL(85): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(88) + .align 4 + +LL(86): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(86) + .align 4 + +LL(88): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + + LFD f17, 3 * SIZE(AO) + FMUL f1, f17, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(999) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(95) + .align 5 + +LL(92): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(92) + .align 4 + +LL(95): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ LL(98) + .align 4 + +LL(96): + FMADD f0, f16, f20, f0 + LFD f16, 1 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + addi BO, BO, 1 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(96) + .align 4 + +LL(98): + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + FSUB f0, f16, f0 +#else + LFD f16, 0 * SIZE(AO) + FSUB f0, f16, f0 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + FMUL f0, f16, f0 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + FMUL f0, f16, f0 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/trsm_kernel_cell_RT.S b/kernel/power/trsm_kernel_cell_RT.S new file mode 100644 index 0000000..51e7bc4 --- /dev/null +++ b/kernel/power/trsm_kernel_cell_RT.S @@ -0,0 +1,3675 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREB r30 +#define PREC r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) +#endif + + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST + li PREC, -4 * SIZE +#else + +#ifdef linux +#ifndef __64BIT__ + mr PREA, r10 + lwz PREB, 8 + STACKSIZE(SP) + lwz PREC, 12 + STACKSIZE(SP) +#else + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 60 + STACKSIZE(SP) + lwz PREB, 64 + STACKSIZE(SP) + lwz PREC, 68 + STACKSIZE(SP) +#else + lwz PREA, 56 + STACKSIZE(SP) + lwz PREB, 60 + STACKSIZE(SP) + lwz PREC, 64 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST +#ifdef PPC970 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 5 * SIZE | 1) + li PREB, (16 * 5 * SIZE | 3) +#else + li PREA, (16 * 14 * SIZE | 1) + li PREB, (16 * 8 * SIZE | 3) +#endif +#endif +#ifdef POWER4 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 1 * SIZE + 16) + li PREB, (16 * 1 * SIZE + 16) +#else + li PREA, (16 * 2 * SIZE + 16) + li PREB, (16 * 2 * SIZE + 16) +#endif +#endif +#ifdef POWER5 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 7 * SIZE | 1) + li PREB, (16 * 7 * SIZE | 3) +#else + li PREA, (16 * 12 * SIZE | 1) + li PREB, (16 * 6 * SIZE | 3) +#endif +#endif +#ifdef CELL + li PREA, (16 * 12 * SIZE) + li PREB, (16 * 12 * SIZE) +#endif +#endif + lfs f0, FZERO + +LL(70): + andi. J, N, 1 + ble LL(40) + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO1, LDC +#endif + ble LL(80) + .align 4 + +LL(71): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(75) + .align 5 + +LL(72): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f21, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f23, f0 + FMADD f1, f17, f23, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(72) + .align 4 + +LL(75): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(78) + .align 4 + +LL(76): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(76) + .align 4 + +LL(78): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + LFD f24, 2 * SIZE(BO) + LFD f28, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 + FSUB f2, f24, f2 + FSUB f3, f28, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FNMSUB f2, f17, f3, f2 + FNMSUB f1, f18, f3, f1 + FNMSUB f0, f19, f3, f0 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FNMSUB f1, f17, f2, f1 + FNMSUB f0, f18, f2, f0 + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f19, f0, f3 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FNMSUB f2, f18, f1, f2 + FNMSUB f3, f19, f1, f3 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FNMSUB f3, f19, f2, f3 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(71) + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(90) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(85) + .align 5 + +LL(82): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(82) + .align 4 + +LL(85): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(88) + .align 4 + +LL(86): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(86) + .align 4 + +LL(88): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + + LFD f17, 3 * SIZE(AO) + FMUL f1, f17, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(99) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(95) + .align 5 + +LL(92): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(92) + .align 4 + +LL(95): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ LL(98) + .align 4 + +LL(96): + FMADD f0, f16, f20, f0 + LFD f16, 1 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + addi BO, BO, 1 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(96) + .align 4 + +LL(98): + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + FSUB f0, f16, f0 +#else + LFD f16, 0 * SIZE(AO) + FSUB f0, f16, f0 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + FMUL f0, f16, f0 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + FMUL f0, f16, f0 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + + lfs f0, FZERO + +#ifndef LN + addi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(99): +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +LL(40): + andi. J, N, 2 + ble LL(09) + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble LL(50) + .align 4 + +LL(41): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 5 + +LL(42): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(42) + .align 4 + +LL(45): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(48) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(48): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f28, 6 * SIZE(BO) + LFD f29, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f1, f20, f1 + FSUB f5, f21, f5 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f3, f28, f3 + FSUB f7, f29, f7 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FMUL f0, f21, f0 + FMUL f4, f21, f4 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + + FNMSUB f2, f18, f1, f2 + FNMSUB f6, f18, f5, f6 + + FNMSUB f3, f19, f1, f3 + FNMSUB f7, f19, f5, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FMUL f6, f18, f6 + + FNMSUB f3, f19, f2, f3 + FNMSUB f7, f19, f6, f7 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 + FMUL f7, f19, f7 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FMUL f4, f18, f4 + FMUL f5, f18, f5 + FMUL f6, f18, f6 + FMUL f7, f18, f7 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f6, 5 * SIZE(BO) + STFD f3, 6 * SIZE(BO) + STFD f7, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(41) + .align 4 + +LL(50): + andi. I, M, 2 + ble LL(60) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(55) + .align 5 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f22, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f17, f24, f1 + FMADD f2, f16, f25, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f19, f26, f5 + FMADD f6, f18, f27, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(52) + .align 4 + +LL(55): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(58) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(56) + .align 4 + +LL(58): + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f2, f17, f2 + FSUB f1, f20, f1 + FSUB f3, f21, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f3, f19, f3 + + FNMSUB f0, f20, f1, f0 + FNMSUB f2, f20, f3, f2 + + FMUL f0, f21, f0 + FMUL f2, f21, f2 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f2, f16, f2 + FNMSUB f1, f17, f0, f1 + FNMSUB f3, f17, f2, f3 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f3, f17, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + + FNMSUB f2, f17, f0, f2 + FNMSUB f3, f17, f1, f3 + FMUL f2, f18, f2 + FMUL f3, f18, f3 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f2, f19, f2 + FMUL f3, f19, f3 + FNMSUB f0, f20, f2, f0 + FNMSUB f1, f20, f3, f1 + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f2, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(60): + andi. I, M, 1 + ble LL(69) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(65) + .align 5 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f17, f22, f2 + FMADD f3, f17, f23, f3 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f19, f26, f2 + FMADD f3, f19, f27, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(68) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(66) + .align 4 + +LL(68): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FMUL f1, f18, f1 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(69): +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + lfs f0, FZERO + .align 4 + +LL(09): + srawi. J, N, 2 + ble LL(999) + .align 4 + +LL(10): + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 2 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + ble LL(20) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + dcbt CO2, PREC + dcbt CO3, PREC + dcbt CO4, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + + dcbt CO1, PREC + dcbt CO2, PREC + dcbt CO3, PREC + dcbt CO4, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +#define NOP1 mr r18, r18 +#define NOP2 mr r19, r19 + +LL(12): + FMADD f0, f16, f20, f0 + dcbt AO, PREA + FMADD f4, f16, f21, f4 + dcbt BO, PREB + FMADD f8, f16, f22, f8 + LFD f31, 7 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f27, 7 * SIZE(AO) + + FMADD f1, f17, f20, f1 + LFD f16, 8 * SIZE(AO) + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 9 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 10 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 8 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 9 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 10 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 11 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 11 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 12 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 13 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 14 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 12 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 13 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 14 * SIZE(BO) + FMADD f15, f27, f31, f15 + LFD f27, 15 * SIZE(AO) + + FMADD f0, f16, f20, f0 + LFD f31, 15 * SIZE(BO) + FMADD f4, f16, f21, f4 + NOP2 + FMADD f8, f16, f22, f8 + NOP1 + FMADD f12, f16, f23, f12 + LFD f16, 16 * SIZE(AO) + + FMADD f1, f17, f20, f1 + NOP1 + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 17 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 18 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 16 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 17 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 18 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 19 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 19 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 20 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 21 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 22 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 20 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 21 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 22 * SIZE(BO) + FMADD f15, f27, f31, f15 + addi AO, AO, 16 * SIZE + + addi BO, BO, 16 * SIZE + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(18) + .align 4 + +LL(16): + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(18): +#if defined(LN) || defined(RT) + subi r0, KK, 4 + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f10, f26, f10 + FSUB f14, f27, f14 + + FSUB f3, f28, f3 + FSUB f7, f29, f7 + FSUB f11, f30, f11 + FSUB f15, f31, f15 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FMUL f11, f16, f11 + FMUL f15, f16, f15 + + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f10, f17, f11, f10 + FNMSUB f14, f17, f15, f14 + + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f9, f18, f11, f9 + FNMSUB f13, f18, f15, f13 + + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + FNMSUB f8, f19, f11, f8 + FNMSUB f12, f19, f15, f12 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FMUL f10, f16, f10 + FMUL f14, f16, f14 + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f9, f17, f10, f9 + FNMSUB f13, f17, f14, f13 + + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + FNMSUB f8, f18, f10, f8 + FNMSUB f12, f18, f14, f12 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f10, f18, f8, f10 + FNMSUB f14, f18, f12, f14 + + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + FNMSUB f11, f19, f8, f11 + FNMSUB f15, f19, f12, f15 + + LFD f16, 5 * SIZE(AO) + LFD f17, 6 * SIZE(AO) + LFD f18, 7 * SIZE(AO) + LFD f19, 10 * SIZE(AO) + + FMUL f1, f16, f1 + FMUL f5, f16, f5 + FMUL f9, f16, f9 + FMUL f13, f16, f13 + + LFD f20, 11 * SIZE(AO) + LFD f21, 15 * SIZE(AO) + + FNMSUB f2, f17, f1, f2 + FNMSUB f6, f17, f5, f6 + FNMSUB f10, f17, f9, f10 + FNMSUB f14, f17, f13, f14 + + FNMSUB f3, f18, f1, f3 + FNMSUB f7, f18, f5, f7 + FNMSUB f11, f18, f9, f11 + FNMSUB f15, f18, f13, f15 + + FMUL f2, f19, f2 + FMUL f6, f19, f6 + FMUL f10, f19, f10 + FMUL f14, f19, f14 + + FNMSUB f3, f20, f2, f3 + FNMSUB f7, f20, f6, f7 + FNMSUB f11, f20, f10, f11 + FNMSUB f15, f20, f14, f15 + + FMUL f3, f21, f3 + FMUL f7, f21, f7 + FMUL f11, f21, f11 + FMUL f15, f21, f15 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + FNMSUB f14, f19, f2, f14 + FNMSUB f15, f19, f3, f15 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FMUL f6, f16, f6 + FMUL f7, f16, f7 + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f10, f17, f6, f10 + FNMSUB f11, f17, f7, f11 + + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + FNMSUB f14, f18, f6, f14 + FNMSUB f15, f18, f7, f15 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FMUL f10, f19, f10 + FMUL f11, f19, f11 + + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FNMSUB f14, f20, f10, f14 + FNMSUB f15, f20, f11, f15 + + FMUL f12, f21, f12 + FMUL f13, f21, f13 + FMUL f14, f21, f14 + FMUL f15, f21, f15 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FMUL f14, f16, f14 + FMUL f15, f16, f15 + + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f10, f17, f14, f10 + FNMSUB f11, f17, f15, f11 + + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f6, f18, f14, f6 + FNMSUB f7, f18, f15, f7 + + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + FNMSUB f2, f19, f14, f2 + FNMSUB f3, f19, f15, f3 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FMUL f10, f16, f10 + FMUL f11, f16, f11 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f6, f17, f10, f6 + FNMSUB f7, f17, f11, f7 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f6, 9 * SIZE(BO) + STFD f10, 10 * SIZE(BO) + STFD f14, 11 * SIZE(BO) + + STFD f3, 12 * SIZE(BO) + STFD f7, 13 * SIZE(BO) + STFD f11, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. I, M, 2 + ble LL(30) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 5 + +LL(22): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(22) + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(28) + .align 4 + +LL(26): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(28): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + FMUL f9, f17, f9 + FMUL f13, f17, f13 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FMUL f12, f21, f12 + FMUL f13, f21, f13 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(30): + andi. I, M, 1 + ble LL(39) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 5 + +LL(32): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f1, f17, f24, f1 + FMADD f5, f17, f25, f5 + FMADD f9, f17, f26, f9 + FMADD f13, f17, f27, f13 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f0, f18, f20, f0 + FMADD f4, f18, f21, f4 + FMADD f8, f18, f22, f8 + FMADD f12, f18, f23, f12 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f1, f19, f24, f1 + FMADD f5, f19, f25, f5 + FMADD f9, f19, f26, f9 + FMADD f13, f19, f27, f13 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(32) + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(36) + .align 4 + +LL(38): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + LFD f24, 2 * SIZE(AO) + LFD f28, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f4, f20, f4 + FSUB f8, f24, f8 + FSUB f12, f28, f12 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f4, f17, f0, f4 + FNMSUB f8, f18, f0, f8 + FNMSUB f12, f19, f0, f12 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FNMSUB f8, f17, f4, f8 + FNMSUB f12, f18, f4, f12 + FMUL f8, f19, f8 + FNMSUB f12, f20, f8, f12 + FMUL f12, f21, f12 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FNMSUB f8, f17, f12, f8 + FNMSUB f4, f18, f12, f4 + FNMSUB f0, f19, f12, f0 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f0, f18, f8, f0 + + FMUL f4, f19, f4 + FNMSUB f0, f20, f4, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f4, 1 * SIZE(AO) + STFD f8, 2 * SIZE(AO) + STFD f12, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE + addi CO3, CO3, 1 * SIZE + addi CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(39): +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + lfs f0, FZERO + bgt LL(10) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/trsm_kernel_hummer_LN.S b/kernel/power/trsm_kernel_hummer_LN.S new file mode 100644 index 0000000..32f4d0d --- /dev/null +++ b/kernel/power/trsm_kernel_hummer_LN.S @@ -0,0 +1,5695 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define ALPHA 0 +#define FZERO 8 + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#endif + +#define TEMP r11 +#define AORIG r12 +#define KK r14 +#define INCM1 r15 +#define INCM4 r16 +#define INCM2 r17 +#define INC2 r19 +#define INC r20 +#define INC4 r21 + +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define AO2 r26 +#define BO2 r27 + +#define CO1 r28 +#define CO2 r29 +#define CO3 r30 +#define CO4 r31 + +#ifndef NEEDPARAM + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define A7 f22 +#define A8 f23 +#define A9 f24 +#define A10 f25 + +#define B1 f26 +#define B2 f27 +#define B3 f28 +#define B4 f29 +#define B5 f30 +#define B6 f31 + +#define AP B6 + + + PROLOGUE + PROFCODE + + li r0, -16 + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + stfpdux f16, SP, r0 + stfpdux f17, SP, r0 + stfpdux f18, SP, r0 + stfpdux f19, SP, r0 + stfpdux f20, SP, r0 + stfpdux f21, SP, r0 + stfpdux f22, SP, r0 + stfpdux f23, SP, r0 + stfpdux f24, SP, r0 + stfpdux f25, SP, r0 + stfpdux f26, SP, r0 + stfpdux f27, SP, r0 + stfpdux f28, SP, r0 + stfpdux f29, SP, r0 + stfpdux f30, SP, r0 + stfpdux f31, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + stwu r29, -4(SP) + stwu r28, -4(SP) + + stwu r27, -4(SP) + stwu r26, -4(SP) + stwu r25, -4(SP) + stwu r24, -4(SP) + + stwu r23, -4(SP) + stwu r22, -4(SP) + stwu r21, -4(SP) + stwu r20, -4(SP) + + stwu r19, -4(SP) + stwu r18, -4(SP) + stwu r17, -4(SP) + stwu r16, -4(SP) + + stwu r15, -4(SP) + stwu r14, -4(SP) # dummy + + li r0, 0 + + stwu r0, -4(SP) + stwu r0, -4(SP) + stfdu f1, -8(SP) + + slwi LDC, LDC, BASE_SHIFT + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + li INC, 1 * SIZE + li INC2, 2 * SIZE + li INC4, 4 * SIZE + + li INCM1, -1 * SIZE + li INCM2, -2 * SIZE + li INCM4, -4 * SIZE + + addi C, C, - 1 * SIZE + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + srawi. J, N, 2 + ble .L50 + .align 4 + +.L10: +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + addi AORIG, A, -4 * SIZE +#else + addi AO, A, -4 * SIZE +#endif +#ifndef RT + add C, CO4, LDC +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + + andi. I, M, 1 + beq .L20 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L44 +#else + +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, BO, - 4 * SIZE + fpmr f2, f0 + addi BO2, BO, 2 * SIZE + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L44 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L43 + .align 4 + +.L42: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A1, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A1, B4, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A1, AO, INC4 + + fxcpmadd f0, A2, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A2, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A2, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A2, A8, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A2, AO2, INC4 + + fxcpmadd f0, A3, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A3, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A3, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A3, B4, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A3, AO, INC4 + + fxcpmadd f0, A4, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A4, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A4, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A4, A8, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L42 + .align 4 + +.L43: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A1, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A1, B4, f3 + LFPDUX B4, BO2, INC4 + + fxcpmadd f0, A2, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A2, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A2, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A2, A8, f3 + LFPDUX A8, BO2, INC4 + + fxcpmadd f0, A3, B1, f0 + fxcpmadd f1, A3, B2, f1 + fxcsmadd f2, A3, B3, f2 + fxcsmadd f3, A3, B4, f3 + + fxcpmadd f0, A4, A5, f0 + fxcpmadd f1, A4, A6, f1 + fxcsmadd f2, A4, A7, f2 + fxcsmadd f3, A4, A8, f3 + .align 4 + +.L44: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L48 +#else + andi. r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L48 +#endif + + LFDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC + bdz- .L47 + .align 4 + +.L46: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC + bdnz+ .L46 + .align 4 + +.L47: + fxcpmadd f0, A1, B1, f0 + fxcpmadd f1, A1, B2, f1 + addi AO2, AO, 2 * SIZE + .align 4 + +.L48: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC4 + LFPDX f17, BO2, INC4 + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#else + LFPDX f16, AO, INC4 + LFPDX f17, AO2, INC4 + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#endif + +#if defined(LN) || defined(LT) + LFPDX A1, AO, INC4 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 +#endif + +#ifdef RN + LFD A1, (4 + 0) * SIZE(BO) + LFD A2, (4 + 1) * SIZE(BO) + LFD A3, (4 + 2) * SIZE(BO) + LFD A4, (4 + 3) * SIZE(BO) + + LFD A5, (4 + 5) * SIZE(BO) + LFD A6, (4 + 6) * SIZE(BO) + LFD A7, (4 + 7) * SIZE(BO) + LFD A8, (4 + 10) * SIZE(BO) + + LFD A9, (4 + 11) * SIZE(BO) + LFD A10, (4 + 15) * SIZE(BO) + + fsmtp f2, f0 + fsmtp f3, f1 + + fmul f0, A1, f0 + fnmsub f2, A2, f0, f2 + fnmsub f1, A3, f0, f1 + fnmsub f3, A4, f0, f3 + + fmul f2, A5, f2 + fnmsub f1, A6, f2, f1 + fnmsub f3, A7, f2, f3 + + fmul f1, A8, f1 + fnmsub f3, A9, f1, f3 + + fmul f3, A10, f3 + + fsmfp f0, f2 + fsmfp f1, f3 +#endif + +#ifdef RT + LFD A1, (4 + 15) * SIZE(BO) + LFD A2, (4 + 14) * SIZE(BO) + LFD A3, (4 + 13) * SIZE(BO) + LFD A4, (4 + 12) * SIZE(BO) + + LFD A5, (4 + 10) * SIZE(BO) + LFD A6, (4 + 9) * SIZE(BO) + LFD A7, (4 + 8) * SIZE(BO) + LFD A8, (4 + 5) * SIZE(BO) + + LFD A9, (4 + 4) * SIZE(BO) + LFD A10, (4 + 0) * SIZE(BO) + + fsmtp f2, f0 + fsmtp f3, f1 + + fmul f3, A1, f3 + fnmsub f1, A2, f3, f1 + fnmsub f2, A3, f3, f2 + fnmsub f0, A4, f3, f0 + + fmul f1, A5, f1 + fnmsub f2, A6, f1, f2 + fnmsub f0, A7, f1, f0 + + fmul f2, A8, f2 + fnmsub f0, A9, f2, f0 + + fmul f0, A10, f0 + + fsmfp f0, f2 + fsmfp f1, f3 +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC4 + STFPDX f1, BO2, INC4 +#else + STFPDX f0, AO, INC4 + STFPDX f1, AO2, INC4 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + + STFDX f0, CO1, INC + STFSDX f0, CO2, INC + STFDX f1, CO3, INC + STFSDX f1, CO4, INC + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L20: + andi. I, M, 2 + beq .L30 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L34 +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L34 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L33 + .align 4 + +.L32: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX B1, BO, INC4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + LFPDUX B2, BO2, INC4 + LFPDUX A1, AO, INC4 + + fxcpmadd f0, B3, A2, f0 + fxcsmadd f4, B3, A2, f4 + LFPDUX B3, BO, INC4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + LFPDUX B4, BO2, INC4 + LFPDUX A2, AO2, INC4 + + fxcpmadd f0, A5, A3, f0 + fxcsmadd f4, A5, A3, f4 + LFPDUX A5, BO, INC4 + fxcpmadd f8, A6, A3, f8 + fxcsmadd f12, A6, A3, f12 + LFPDUX A6, BO2, INC4 + LFPDUX A3, AO, INC4 + + fxcpmadd f0, A7, A4, f0 + fxcsmadd f4, A7, A4, f4 + LFPDUX A7, BO, INC4 + fxcpmadd f8, A8, A4, f8 + fxcsmadd f12, A8, A4, f12 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L32 + .align 4 + +.L33: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + + fxcpmadd f0, B3, A2, f0 + fxcsmadd f4, B3, A2, f4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + + fxcpmadd f0, A5, A3, f0 + fxcsmadd f4, A5, A3, f4 + fxcpmadd f8, A6, A3, f8 + fxcsmadd f12, A6, A3, f12 + + fxcpmadd f0, A7, A4, f0 + fxcsmadd f4, A7, A4, f4 + fxcpmadd f8, A8, A4, f8 + fxcsmadd f12, A8, A4, f12 + .align 4 + +.L34: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L38 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L38 +#endif + + LFPDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdz- .L37 + .align 4 + +.L36: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX B1, BO, INC4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + LFPDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdnz+ .L36 + .align 4 + +.L37: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + .align 4 + +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fpmr f28, f8 + + fsmfp f0, f4 + fsmfp f8, f12 + fsmtp f4, f24 + fsmtp f12, f28 + + LFPDUX f16, BO, INC4 + LFPDUX f17, BO2, INC4 + LFPDUX f18, BO, INC4 + LFPDUX f19, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f8, f17, f8 + fpsub f4, f18, f4 + fpsub f12, f19, f12 +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f4, f17, f4 + fpsub f8, f18, f8 + fpsub f12, f19, f12 +#endif + +#ifdef LN + addi AO, AO, 8 * SIZE + addi AO2, AO2, 8 * SIZE + + LFPDUX A1, AO2, INCM4 + LFPDUX A2, AO, INCM4 + + addi AO, AO, -4 * SIZE + addi AO2, AO2, -4 * SIZE + + fxsmul f4, A1, f4 + fxsmul f12, A1, f12 + + fxcpnmsub f0, A1, f4, f0 + fxcpnmsub f8, A1, f12, f8 + + fxpmul f0, A2, f0 + fxpmul f8, A2, f8 +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + + subi AO, AO, 4 * SIZE + subi AO2, AO2, 4 * SIZE + + fxpmul f0, A1, f0 + fxpmul f8, A1, f8 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f12, A1, f8, f12 + + fxsmul f4, A2, f4 + fxsmul f12, A2, f12 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + LFPDUX A3, BO, INC4 + LFPDUX A4, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A5, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A6, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + fxpmul f0, A1, f0 + fxcsnmsub f4, A1, f0, f4 + fxcpnmsub f8, A2, f0, f8 + fxcsnmsub f12, A2, f0, f12 + + fxsmul f4, A3, f4 + fxcpnmsub f8, A4, f4, f8 + fxcsnmsub f12, A4, f4, f12 + + fxpmul f8, A5, f8 + fxcsnmsub f12, A5, f8, f12 + fxsmul f12, A6, f12 +#endif + +#ifdef RT + addi BO, BO, 20 * SIZE + addi BO2, BO2, 20 * SIZE + + LFPDUX A1, BO2, INCM4 + LFPDUX A2, BO, INCM4 + + LFPDUX A3, BO2, INCM4 + LFPDUX A4, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A5, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A6, BO, INCM4 + subi BO, BO, 4 * SIZE + subi BO2, BO2, 4 * SIZE + + fxsmul f12, A1, f12 + fxcpnmsub f8, A1, f12, f8 + fxcsnmsub f4, A2, f12, f4 + fxcpnmsub f0, A2, f12, f0 + + fxpmul f8, A3, f8 + fxcsnmsub f4, A4, f8, f4 + fxcpnmsub f0, A4, f8, f0 + + fxsmul f4, A5, f4 + fxcpnmsub f0, A5, f4, f0 + fxpmul f0, A6, f0 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f4, BO, INC4 + STFPDUX f12, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + STFDUX f0, CO1, INC + STFDUX f4, CO1, INC + STFSDUX f0, CO2, INC + STFSDUX f4, CO2, INC + + STFDUX f8, CO3, INC + STFDUX f12, CO3, INC + STFSDUX f8, CO4, INC + STFSDUX f12, CO4, INC + +#else + STFPDUX f0, AO, INC4 + STFPDUX f4, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f12, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + + STFDUX f8, CO3, INC + STFSDUX f8, CO3, INC + STFDUX f12, CO4, INC + STFSDUX f12, CO4, INC +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L30: + andi. I, M, 4 + beq .L40 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, KK, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L24 +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + srawi. r0, TEMP, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L24 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX B3, BO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A5, AO, INC4 + LFPDUX B5, BO, INC4 + LFPDUX A6, AO2, INC4 + LFPDUX B6, BO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A9, BO, INC4 + LFPDUX A10, BO2, INC4 + bdz- .L23 + .align 4 + +.L22: + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + fxcpmadd f8, B2, A1, f8 + nop + fxcsmadd f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + fxcpmadd f9, B2, A2, f9 + nop + fxcsmadd f13, B2, A2, f13 + LFPDUX B2, BO2, INC4 + + fxcpmadd f0, B3, A3, f0 + nop + fxcsmadd f4, B3, A3, f4 + LFPDUX A2, AO2, INC4 + fxcpmadd f8, B4, A3, f8 + nop + fxcsmadd f12, B4, A3, f12 + LFPDUX A3, AO, INC4 + + fxcpmadd f1, B3, A4, f1 + nop + fxcsmadd f5, B3, A4, f5 + LFPDUX B3, BO, INC4 + fxcpmadd f9, B4, A4, f9 + nop + fxcsmadd f13, B4, A4, f13 + LFPDUX B4, BO2, INC4 + + fxcpmadd f0, B5, A5, f0 + nop + fxcsmadd f4, B5, A5, f4 + LFPDUX A4, AO2, INC4 + fxcpmadd f8, B6, A5, f8 + nop + fxcsmadd f12, B6, A5, f12 + LFPDUX A5, AO, INC4 + + fxcpmadd f1, B5, A6, f1 + nop + fxcsmadd f5, B5, A6, f5 + LFPDUX B5, BO, INC4 + fxcpmadd f9, B6, A6, f9 + nop + fxcsmadd f13, B6, A6, f13 + LFPDUX B6, BO2, INC4 + + fxcpmadd f0, A9, A7, f0 + nop + fxcsmadd f4, A9, A7, f4 + LFPDUX A6, AO2, INC4 + fxcpmadd f8, A10, A7, f8 + nop + fxcsmadd f12, A10, A7, f12 + LFPDUX A7, AO, INC4 + + fxcpmadd f1, A9, A8, f1 + nop + fxcsmadd f5, A9, A8, f5 + LFPDUX A9, BO, INC4 + fxcpmadd f9, A10, A8, f9 + nop + fxcsmadd f13, A10, A8, f13 + LFPDUX A10, BO2, INC4 + bdnz+ .L22 + .align 4 + +.L23: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + + fxcpmadd f0, B3, A3, f0 + fxcsmadd f4, B3, A3, f4 + fxcpmadd f8, B4, A3, f8 + fxcsmadd f12, B4, A3, f12 + + fxcpmadd f1, B3, A4, f1 + fxcsmadd f5, B3, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, A4, f13 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f4, B5, A5, f4 + fxcpmadd f8, B6, A5, f8 + fxcsmadd f12, B6, A5, f12 + + fxcpmadd f1, B5, A6, f1 + fxcsmadd f5, B5, A6, f5 + fxcpmadd f9, B6, A6, f9 + fxcsmadd f13, B6, A6, f13 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f4, A9, A7, f4 + fxcpmadd f8, A10, A7, f8 + fxcsmadd f12, A10, A7, f12 + + fxcpmadd f1, A9, A8, f1 + fxcsmadd f5, A9, A8, f5 + fxcpmadd f9, A10, A8, f9 + fxcsmadd f13, A10, A8, f13 + .align 4 + +.L24: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L28 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L28 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + bdz- .L27 + .align 4 + +.L26: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + bdnz+ .L26 + .align 4 + +.L27: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + .align 4 + +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fpmr f25, f1 + fpmr f28, f8 + fpmr f29, f9 + + fsmfp f0, f4 + fsmfp f1, f5 + fsmfp f8, f12 + fsmfp f9, f13 + + fsmtp f4, f24 + fsmtp f5, f25 + fsmtp f12, f28 + fsmtp f13, f29 + + LFPDUX f16, BO, INC4 + LFPDUX f17, BO2, INC4 + LFPDUX f18, BO, INC4 + LFPDUX f19, BO2, INC4 + + LFPDUX f20, BO, INC4 + LFPDUX f21, BO2, INC4 + LFPDUX f22, BO, INC4 + LFPDUX f23, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + fpsub f0, f16, f0 + fpsub f8, f17, f8 + fpsub f4, f18, f4 + fpsub f12, f19, f12 + + fpsub f1, f20, f1 + fpsub f9, f21, f9 + fpsub f5, f22, f5 + fpsub f13, f23, f13 +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + LFPDUX f20, AO, INC4 + LFPDUX f21, AO2, INC4 + LFPDUX f22, AO, INC4 + LFPDUX f23, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f4, f18, f4 + fpsub f5, f19, f5 + + fpsub f8, f20, f8 + fpsub f9, f21, f9 + fpsub f12, f22, f12 + fpsub f13, f23, f13 +#endif + +#ifdef LN + addi AO, AO, 20 * SIZE + addi AO2, AO2, 20 * SIZE + + LFPDUX A1, AO2, INCM4 + LFPDUX A2, AO, INCM4 + LFPDUX A3, AO2, INCM4 + LFPDUX A4, AO, INCM4 + + add AO2, AO2, INCM4 + LFPDUX A5, AO, INCM4 + add AO2, AO2, INCM4 + LFPDUX A6, AO, INCM4 + + addi AO, AO, -4 * SIZE + addi AO2, AO2, -4 * SIZE + + fxsmul f5, A1, f5 + fxsmul f13, A1, f13 + + fxcpnmsub f1, A1, f5, f1 + fxcpnmsub f9, A1, f13, f9 + + fxcsnmsub f4, A2, f5, f4 + fxcsnmsub f12, A2, f13, f12 + + fxcpnmsub f0, A2, f5, f0 + fxcpnmsub f8, A2, f13, f8 + + fxpmul f1, A3, f1 + fxpmul f9, A3, f9 + + fxcsnmsub f4, A4, f1, f4 + fxcsnmsub f12, A4, f9, f12 + + fxcpnmsub f0, A4, f1, f0 + fxcpnmsub f8, A4, f9, f8 + + fxsmul f4, A5, f4 + fxsmul f12, A5, f12 + + fxcpnmsub f0, A5, f4, f0 + fxcpnmsub f8, A5, f12, f8 + + fxpmul f0, A6, f0 + fxpmul f8, A6, f8 +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX A4, AO2, INC4 + + add AO, AO, INC4 + LFPDUX A5, AO2, INC4 + add AO, AO, INC4 + LFPDUX A6, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE + + fxpmul f0, A1, f0 + fxpmul f8, A1, f8 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f12, A1, f8, f12 + + fxcpnmsub f1, A2, f0, f1 + fxcpnmsub f9, A2, f8, f9 + + fxcsnmsub f5, A2, f0, f5 + fxcsnmsub f13, A2, f8, f13 + + fxsmul f4, A3, f4 + fxsmul f12, A3, f12 + + fxcpnmsub f1, A4, f4, f1 + fxcpnmsub f9, A4, f12, f9 + + fxcsnmsub f5, A4, f4, f5 + fxcsnmsub f13, A4, f12, f13 + + fxpmul f1, A5, f1 + fxpmul f9, A5, f9 + + fxcsnmsub f5, A5, f1, f5 + fxcsnmsub f13, A5, f9, f13 + + fxsmul f5, A6, f5 + fxsmul f13, A6, f13 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + LFPDUX A3, BO, INC4 + LFPDUX A4, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A5, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A6, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f5, A1, f1, f5 + + fxcpnmsub f8, A2, f0, f8 + fxcpnmsub f9, A2, f1, f9 + fxcsnmsub f12, A2, f0, f12 + fxcsnmsub f13, A2, f1, f13 + + fxsmul f4, A3, f4 + fxsmul f5, A3, f5 + fxcpnmsub f8, A4, f4, f8 + fxcpnmsub f9, A4, f5, f9 + + fxcsnmsub f12, A4, f4, f12 + fxcsnmsub f13, A4, f5, f13 + + fxpmul f8, A5, f8 + fxpmul f9, A5, f9 + fxcsnmsub f12, A5, f8, f12 + fxcsnmsub f13, A5, f9, f13 + + fxsmul f12, A6, f12 + fxsmul f13, A6, f13 +#endif + +#ifdef RT + addi BO, BO, 20 * SIZE + addi BO2, BO2, 20 * SIZE + + LFPDUX A1, BO2, INCM4 + LFPDUX A2, BO, INCM4 + + LFPDUX A3, BO2, INCM4 + LFPDUX A4, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A5, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A6, BO, INCM4 + subi BO, BO, 4 * SIZE + subi BO2, BO2, 4 * SIZE + + fxsmul f12, A1, f12 + fxsmul f13, A1, f13 + fxcpnmsub f8, A1, f12, f8 + fxcpnmsub f9, A1, f13, f9 + + fxcsnmsub f4, A2, f12, f4 + fxcsnmsub f5, A2, f13, f5 + fxcpnmsub f0, A2, f12, f0 + fxcpnmsub f1, A2, f13, f1 + + fxpmul f8, A3, f8 + fxpmul f9, A3, f9 + fxcsnmsub f4, A4, f8, f4 + fxcsnmsub f5, A4, f9, f5 + + fxcpnmsub f0, A4, f8, f0 + fxcpnmsub f1, A4, f9, f1 + + fxsmul f4, A5, f4 + fxsmul f5, A5, f5 + fxcpnmsub f0, A5, f4, f0 + fxcpnmsub f1, A5, f5, f1 + + fxpmul f0, A6, f0 + fxpmul f1, A6, f1 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f4, BO, INC4 + STFPDUX f12, BO2, INC4 + STFPDUX f1, BO, INC4 + STFPDUX f9, BO2, INC4 + STFPDUX f5, BO, INC4 + STFPDUX f13, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + STFDUX f0, CO1, INC + STFDUX f4, CO1, INC + STFDUX f1, CO1, INC + STFDUX f5, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f4, CO2, INC + STFSDUX f1, CO2, INC + STFSDUX f5, CO2, INC + + STFDUX f8, CO3, INC + STFDUX f12, CO3, INC + STFDUX f9, CO3, INC + STFDUX f13, CO3, INC + + STFSDUX f8, CO4, INC + STFSDUX f12, CO4, INC + STFSDUX f9, CO4, INC + STFSDUX f13, CO4, INC +#else + STFPDUX f0, AO, INC4 + STFPDUX f1, AO2, INC4 + STFPDUX f4, AO, INC4 + STFPDUX f5, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f9, AO2, INC4 + STFPDUX f12, AO, INC4 + STFPDUX f13, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + STFDUX f5, CO2, INC + STFSDUX f5, CO2, INC + + STFDUX f8, CO3, INC + STFSDUX f8, CO3, INC + STFDUX f9, CO3, INC + STFSDUX f9, CO3, INC + STFDUX f12, CO4, INC + STFSDUX f12, CO4, INC + STFDUX f13, CO4, INC + STFSDUX f13, CO4, INC +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L40: + srawi. I, M, 3 + ble .L49 + .align 4 + +.L11: +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + nop + + srawi. r0, KK, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#else + +#ifdef LN + slwi r0, K, 3 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 3 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + nop + + srawi. r0, TEMP, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#endif + + LFPDUX A1, AO, INC4 + fpmr f5, f0 + LFPDUX A3, AO, INC4 + fpmr f9, f0 + LFPDUX B1, BO, INC4 + fpmr f13, f0 + + LFPDUX A5, AO, INC4 + fpmr f2, f0 + LFPDUX A6, AO, INC4 + fpmr f6, f0 + LFPDUX B3, BO, INC4 + fpmr f10, f0 + LFPDUX A7, AO, INC4 + fpmr f14, f0 + + LFPDUX A8, AO, INC4 + fpmr f3, f0 + LFPDUX B5, BO, INC4 + fpmr f7, f0 + LFPDUX A9, AO, INC4 + fpmr f11, f0 + LFPDUX A2, AO2, INC4 + fpmr f15, f0 + LFPDUX B2, BO2, INC4 + bdz- .L13 + .align 4 + +.L12: + +## 1 ## + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + nop + fxcpmadd f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + nop + fxcpmadd f10, B2, A3, f10 + nop + fxcsmadd f14, B2, A3, f14 + nop + + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + LFPDUX A1, AO, INC4 + fxcsmadd f15, B2, A4, f15 + nop + +## 2 ## + + fxcpmadd f0, B3, A5, f0 + nop + fxcsmadd f4, B3, A5, f4 + nop + fxcpmadd f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A5, f12 + LFPDUX B1, BO, INC4 + + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + LFPDUX A3, AO, INC4 + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B3, A6, f2 + nop + fxcsmadd f6, B3, A6, f6 + nop + fxcpmadd f10, B4, A6, f10 + nop + fxcsmadd f14, B4, A6, f14 + nop + + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B4, A4, f11 + LFPDUX A5, AO, INC4 + fxcsmadd f15, B4, A4, f15 + nop + +## 3 ## + + fxcpmadd f0, B5, A7, f0 + nop + fxcsmadd f4, B5, A7, f4 + nop + fxcpmadd f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A7, f12 + LFPDUX B3, BO, INC4 + + fxcpmadd f1, B5, A2, f1 + nop + fxcsmadd f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A6, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B5, A8, f2 + nop + fxcsmadd f6, B5, A8, f6 + nop + fxcpmadd f10, B2, A8, f10 + nop + fxcsmadd f14, B2, A8, f14 + nop + + fxcpmadd f3, B5, A4, f3 + nop + fxcsmadd f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + LFPDUX A7, AO, INC4 + fxcsmadd f15, B2, A4, f15 + nop + +## 4 ## + fxcpmadd f0, B6, A9, f0 + nop + fxcsmadd f4, B6, A9, f4 + nop + fxcpmadd f8, B4, A9, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A9, f12 + LFPDUX B5, BO, INC4 + + fxcpmadd f1, B6, A2, f1 + nop + fxcsmadd f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + LFPDUX A8, AO, INC4 + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B6, A10, f2 + nop + fxcsmadd f6, B6, A10, f6 + nop + fxcpmadd f10, B4, A10, f10 + nop + fxcsmadd f14, B4, A10, f14 + nop + + fxcpmadd f3, B6, A4, f3 + LFPDUX A2, AO2, INC4 + fxcsmadd f7, B6, A4, f7 + LFPDUX A9, AO, INC4 + fxcpmadd f11, B4, A4, f11 + nop + fxcsmadd f15, B4, A4, f15 + bdnz+ .L12 + .align 4 + +.L13: +## 1 ## + + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + nop + fxcpmadd f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + nop + fxcpmadd f10, B2, A3, f10 + nop + fxcsmadd f14, B2, A3, f14 + nop + + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + nop + fxcsmadd f15, B2, A4, f15 + nop + +## 2 ## + + fxcpmadd f0, B3, A5, f0 + nop + fxcsmadd f4, B3, A5, f4 + nop + fxcpmadd f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A5, f12 + nop + + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + nop + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B3, A6, f2 + nop + fxcsmadd f6, B3, A6, f6 + nop + fxcpmadd f10, B4, A6, f10 + nop + fxcsmadd f14, B4, A6, f14 + nop + + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B4, A4, f11 + nop + fxcsmadd f15, B4, A4, f15 + nop + +## 3 ## + + fxcpmadd f0, B5, A7, f0 + nop + fxcsmadd f4, B5, A7, f4 + nop + fxcpmadd f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A7, f12 + nop + + fxcpmadd f1, B5, A2, f1 + nop + fxcsmadd f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + nop + + fxcsmadd f13, B2, A2, f13 + + fxcpmadd f2, B5, A8, f2 + nop + fxcsmadd f6, B5, A8, f6 + nop + fxcpmadd f10, B2, A8, f10 + nop + fxcsmadd f14, B2, A8, f14 + nop + + fxcpmadd f3, B5, A4, f3 + nop + fxcsmadd f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + nop + fxcsmadd f15, B2, A4, f15 + nop + +## 4 ## + + fxcpmadd f0, B6, A9, f0 + nop + fxcsmadd f4, B6, A9, f4 + nop + fxcpmadd f8, B4, A9, f8 + nop + fxcsmadd f12, B4, A9, f12 + nop + + fxcpmadd f1, B6, A2, f1 + nop + fxcsmadd f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + nop + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B6, A10, f2 + nop + fxcsmadd f6, B6, A10, f6 + nop + fxcpmadd f10, B4, A10, f10 + nop + fxcsmadd f14, B4, A10, f14 + nop + + fxcpmadd f3, B6, A4, f3 + nop + fxcsmadd f7, B6, A4, f7 + nop + fxcpmadd f11, B4, A4, f11 + nop + fxcsmadd f15, B4, A4, f15 + nop + .align 4 + +.L14: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L18 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L18 +#endif + .align 4 + +.L15: + LFPDUX A2, AO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A10, BO, INC4 + LFPDUX B4, BO2, INC4 + bdz- .L17 + .align 4 + +.L16: + fxcpmadd f0, A10, A2, f0 + fxcsmadd f4, A10, A2, f4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + fxcpmadd f1, A10, A4, f1 + fxcsmadd f5, A10, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + fxcpmadd f2, A10, A2, f2 + fxcsmadd f6, A10, A2, f6 + fxcpmadd f10, B4, A2, f10 + fxcsmadd f14, B4, A2, f14 + LFPDUX A2, AO, INC4 + + fxcpmadd f3, A10, A4, f3 + fxcsmadd f7, A10, A4, f7 + LFPDUX A10, BO, INC4 + fxcpmadd f11, B4, A4, f11 + fxcsmadd f15, B4, A4, f15 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + bdnz+ .L16 + .align 4 + +.L17: + fxcpmadd f0, A10, A2, f0 + fxcsmadd f4, A10, A2, f4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + fxcpmadd f1, A10, A4, f1 + fxcsmadd f5, A10, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + fxcpmadd f2, A10, A2, f2 + fxcsmadd f6, A10, A2, f6 + fxcpmadd f10, B4, A2, f10 + fxcsmadd f14, B4, A2, f14 + + fxcpmadd f3, A10, A4, f3 + fxcsmadd f7, A10, A4, f7 + fxcpmadd f11, B4, A4, f11 + fxcsmadd f15, B4, A4, f15 + .align 4 + +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 8 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 3 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + LFPDUX f16, BO, INC4 + fpmr f25, f1 + nop + fpmr f26, f2 + LFPDUX f17, BO2, INC4 + fpmr f27, f3 + nop + + fpmr f28, f8 + LFPDUX f18, BO, INC4 + fpmr f29, f9 + nop + fpmr f30, f10 + LFPDUX f19, BO2, INC4 + fpmr f31, f11 + nop + + fsmfp f0, f4 + LFPDUX f20, BO, INC4 + fsmfp f1, f5 + nop + fsmfp f2, f6 + LFPDUX f21, BO2, INC4 + fsmfp f3, f7 + nop + + fsmfp f8, f12 + LFPDUX f22, BO, INC4 + fsmfp f9, f13 + nop + fsmfp f10, f14 + LFPDUX f23, BO2, INC4 + fsmfp f11, f15 + nop + + fsmtp f4, f24 + LFPDUX f24, BO, INC4 + fsmtp f5, f25 + nop + fsmtp f6, f26 + LFPDUX f25, BO2, INC4 + fsmtp f7, f27 + nop + + fsmtp f12, f28 + LFPDUX f26, BO, INC4 + fsmtp f13, f29 + nop + fsmtp f14, f30 + LFPDUX f27, BO2, INC4 + fsmtp f15, f31 + nop + + fpsub f0, f16, f0 + LFPDUX f28, BO, INC4 + fpsub f8, f17, f8 + nop + fpsub f4, f18, f4 + LFPDUX f29, BO2, INC4 + fpsub f12, f19, f12 + nop + + fpsub f1, f20, f1 + LFPDUX f30, BO, INC4 + fpsub f9, f21, f9 + subi BO, BO, 32 * SIZE + fpsub f5, f22, f5 + LFPDUX f31, BO2, INC4 + fpsub f13, f23, f13 + subi BO2, BO2, 32 * SIZE + + fpsub f2, f24, f2 + fpsub f10, f25, f10 + fpsub f6, f26, f6 + fpsub f14, f27, f14 + fpsub f3, f28, f3 + fpsub f11, f29, f11 + fpsub f7, f30, f7 + fpsub f15, f31, f15 + +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + LFPDUX f20, AO, INC4 + LFPDUX f21, AO2, INC4 + LFPDUX f22, AO, INC4 + LFPDUX f23, AO2, INC4 + + fpsub f0, f16, f0 + LFPDUX f24, AO, INC4 + fpsub f1, f17, f1 + LFPDUX f25, AO2, INC4 + fpsub f2, f18, f2 + LFPDUX f26, AO, INC4 + fpsub f3, f19, f3 + LFPDUX f27, AO2, INC4 + fpsub f4, f20, f4 + LFPDUX f28, AO, INC4 + fpsub f5, f21, f5 + LFPDUX f29, AO2, INC4 + fpsub f6, f22, f6 + LFPDUX f30, AO, INC4 + fpsub f7, f23, f7 + LFPDUX f31, AO2, INC4 + + fpsub f8, f24, f8 + subi AO, AO, 32 * SIZE + fpsub f9, f25, f9 + subi AO2, AO2, 32 * SIZE + fpsub f10, f26, f10 + fpsub f11, f27, f11 + fpsub f12, f28, f12 + fpsub f13, f29, f13 + fpsub f14, f30, f14 + fpsub f15, f31, f15 +#endif + +#ifdef LN + addi AO, AO, 68 * SIZE + addi AO2, AO2, 68 * SIZE + + LFPDUX A1, AO2, INCM4 + LFPDUX A2, AO, INCM4 + LFPDUX A3, AO2, INCM4 + LFPDUX A4, AO, INCM4 + LFPDUX A5, AO2, INCM4 + LFPDUX A6, AO, INCM4 + LFPDUX A7, AO2, INCM4 + LFPDUX A8, AO, INCM4 + + fxsmul f7, A1, f7 + fxsmul f15, A1, f15 + + fxcpnmsub f3, A1, f7, f3 + fxcpnmsub f11, A1, f15, f11 + + fxcsnmsub f6, A2, f7, f6 + fxcsnmsub f14, A2, f15, f14 + + fxcpnmsub f2, A2, f7, f2 + fxcpnmsub f10, A2, f15, f10 + + fxcsnmsub f5, A3, f7, f5 + fxcsnmsub f13, A3, f15, f13 + + fxcpnmsub f1, A3, f7, f1 + fxcpnmsub f9, A3, f15, f9 + + fxcsnmsub f4, A4, f7, f4 + fxcsnmsub f12, A4, f15, f12 + + fxcpnmsub f0, A4, f7, f0 + fxcpnmsub f8, A4, f15, f8 + + fxpmul f3, A5, f3 + fxpmul f11, A5, f11 + + fxcsnmsub f6, A6, f3, f6 + fxcsnmsub f14, A6, f11, f14 + + fxcpnmsub f2, A6, f3, f2 + fxcpnmsub f10, A6, f11, f10 + + fxcsnmsub f5, A7, f3, f5 + fxcsnmsub f13, A7, f11, f13 + + fxcpnmsub f1, A7, f3, f1 + fxcpnmsub f9, A7, f11, f9 + + fxcsnmsub f4, A8, f3, f4 + fxcsnmsub f12, A8, f11, f12 + + fxcpnmsub f0, A8, f3, f0 + fxcpnmsub f8, A8, f11, f8 + + add AO2, AO2, INCM4 + LFPDUX A1, AO, INCM4 + LFPDUX A2, AO2, INCM4 + LFPDUX A3, AO, INCM4 + + add AO2, AO2, INCM4 + LFPDUX A4, AO, INCM4 + LFPDUX A5, AO2, INCM4 + LFPDUX A6, AO, INCM4 + + add AO2, AO2, INCM4 + add AO, AO, INCM4 + LFPDUX A7, AO2, INCM4 + LFPDUX A8, AO, INCM4 + + + fxsmul f6, A1, f6 + fxsmul f14, A1, f14 + + fxcpnmsub f2, A1, f6, f2 + fxcpnmsub f10, A1, f14, f10 + + fxcsnmsub f5, A2, f6, f5 + fxcsnmsub f13, A2, f14, f13 + + fxcpnmsub f1, A2, f6, f1 + fxcpnmsub f9, A2, f14, f9 + + fxcsnmsub f4, A3, f6, f4 + fxcsnmsub f12, A3, f14, f12 + + fxcpnmsub f0, A3, f6, f0 + fxcpnmsub f8, A3, f14, f8 + + fxpmul f2, A4, f2 + fxpmul f10, A4, f10 + + fxcsnmsub f5, A5, f2, f5 + fxcsnmsub f13, A5, f10, f13 + + fxcpnmsub f1, A5, f2, f1 + fxcpnmsub f9, A5, f10, f9 + + fxcsnmsub f4, A6, f2, f4 + fxcsnmsub f12, A6, f10, f12 + + fxcpnmsub f0, A6, f2, f0 + fxcpnmsub f8, A6, f10, f8 + + fxsmul f5, A7, f5 + fxsmul f13, A7, f13 + + fxcpnmsub f1, A7, f5, f1 + fxcpnmsub f9, A7, f13, f9 + + fxcsnmsub f4, A8, f5, f4 + fxcsnmsub f12, A8, f13, f12 + + fxcpnmsub f0, A8, f5, f0 + fxcpnmsub f8, A8, f13, f8 + + add AO2, AO2, INCM4 + add AO, AO, INCM4 + LFPDUX A1, AO2, INCM4 + LFPDUX A2, AO, INCM4 + + subi AO2, AO2, 8 * SIZE + add AO, AO, INCM4 + LFPDUX A3, AO, INCM4 + + subi AO2, AO2, 8 * SIZE + add AO, AO, INCM4 + LFPDUX A4, AO, INCM4 + + addi AO, AO, -4 * SIZE + addi AO2, AO2, -4 * SIZE + + fxpmul f1, A1, f1 + fxpmul f9, A1, f9 + + fxcsnmsub f4, A2, f1, f4 + fxcsnmsub f12, A2, f9, f12 + + fxcpnmsub f0, A2, f1, f0 + fxcpnmsub f8, A2, f9, f8 + + fxsmul f4, A3, f4 + fxsmul f12, A3, f12 + + fxcpnmsub f0, A3, f4, f0 + fxcpnmsub f8, A3, f12, f8 + + fxpmul f0, A4, f0 + fxpmul f8, A4, f8 + +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX A4, AO2, INC4 + + LFPDUX A5, AO, INC4 + LFPDUX A6, AO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A8, AO2, INC4 + + fxpmul f0, A1, f0 + fxpmul f8, A1, f8 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f12, A1, f8, f12 + + fxcpnmsub f1, A2, f0, f1 + fxcpnmsub f9, A2, f8, f9 + + fxcsnmsub f5, A2, f0, f5 + fxcsnmsub f13, A2, f8, f13 + + fxcpnmsub f2, A3, f0, f2 + fxcpnmsub f10, A3, f8, f10 + + fxcsnmsub f6, A3, f0, f6 + fxcsnmsub f14, A3, f8, f14 + + fxcpnmsub f3, A4, f0, f3 + fxcpnmsub f11, A4, f8, f11 + + fxcsnmsub f7, A4, f0, f7 + fxcsnmsub f15, A4, f8, f15 + + fxsmul f4, A5, f4 + fxsmul f12, A5, f12 + + fxcpnmsub f1, A6, f4, f1 + fxcpnmsub f9, A6, f12, f9 + + fxcsnmsub f5, A6, f4, f5 + fxcsnmsub f13, A6, f12, f13 + + fxcpnmsub f2, A7, f4, f2 + fxcpnmsub f10, A7, f12, f10 + + fxcsnmsub f6, A7, f4, f6 + fxcsnmsub f14, A7, f12, f14 + + fxcpnmsub f3, A8, f4, f3 + fxcpnmsub f11, A8, f12, f11 + + fxcsnmsub f7, A8, f4, f7 + fxcsnmsub f15, A8, f12, f15 + + add AO, AO, INC4 + LFPDUX A1, AO2, INC4 + LFPDUX A2, AO, INC4 + LFPDUX A3, AO2, INC4 + + add AO, AO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A5, AO, INC4 + LFPDUX A6, AO2, INC4 + + add AO, AO, INC4 + add AO2, AO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A8, AO2, INC4 + + fxpmul f1, A1, f1 + fxpmul f9, A1, f9 + + fxcsnmsub f5, A1, f1, f5 + fxcsnmsub f13, A1, f9, f13 + + fxcpnmsub f2, A2, f1, f2 + fxcpnmsub f10, A2, f9, f10 + + fxcsnmsub f6, A2, f1, f6 + fxcsnmsub f14, A2, f9, f14 + + fxcpnmsub f3, A3, f1, f3 + fxcpnmsub f11, A3, f9, f11 + + fxcsnmsub f7, A3, f1, f7 + fxcsnmsub f15, A3, f9, f15 + + fxsmul f5, A4, f5 + fxsmul f13, A4, f13 + + fxcpnmsub f2, A5, f5, f2 + fxcpnmsub f10, A5, f13, f10 + + fxcsnmsub f6, A5, f5, f6 + fxcsnmsub f14, A5, f13, f14 + + fxcpnmsub f3, A6, f5, f3 + fxcpnmsub f11, A6, f13, f11 + + fxcsnmsub f7, A6, f5, f7 + fxcsnmsub f15, A6, f13, f15 + + fxpmul f2, A7, f2 + fxpmul f10, A7, f10 + + fxcsnmsub f6, A7, f2, f6 + fxcsnmsub f14, A7, f10, f14 + + fxcpnmsub f3, A8, f2, f3 + fxcpnmsub f11, A8, f10, f11 + + fxcsnmsub f7, A8, f2, f7 + fxcsnmsub f15, A8, f10, f15 + + add AO, AO, INC4 + add AO2, AO2, INC4 + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + + addi AO, AO, 8 * SIZE + addi AO2, AO2, 4 * SIZE + LFPDUX A3, AO2, INC4 + + addi AO, AO, 8 * SIZE + addi AO2, AO2, 4 * SIZE + LFPDUX A4, AO2, INC4 + + subi AO, AO, 64 * SIZE + subi AO2, AO2, 64 * SIZE + + fxsmul f6, A1, f6 + fxsmul f14, A1, f14 + + fxcpnmsub f3, A2, f6, f3 + fxcpnmsub f11, A2, f14, f11 + + fxcsnmsub f7, A2, f6, f7 + fxcsnmsub f15, A2, f14, f15 + + fxpmul f3, A3, f3 + fxpmul f11, A3, f11 + + fxcsnmsub f7, A3, f3, f7 + fxcsnmsub f15, A3, f11, f15 + + fxsmul f7, A4, f7 + fxsmul f15, A4, f15 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + LFPDUX A3, BO, INC4 + LFPDUX A4, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A5, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A6, BO2, INC4 + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxpmul f2, A1, f2 + fxpmul f3, A1, f3 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f5, A1, f1, f5 + fxcsnmsub f6, A1, f2, f6 + fxcsnmsub f7, A1, f3, f7 + + fxcpnmsub f8, A2, f0, f8 + fxcpnmsub f9, A2, f1, f9 + fxcpnmsub f10, A2, f2, f10 + fxcpnmsub f11, A2, f3, f11 + + fxcsnmsub f12, A2, f0, f12 + fxcsnmsub f13, A2, f1, f13 + fxcsnmsub f14, A2, f2, f14 + fxcsnmsub f15, A2, f3, f15 + + fxsmul f4, A3, f4 + fxsmul f5, A3, f5 + fxsmul f6, A3, f6 + fxsmul f7, A3, f7 + + fxcpnmsub f8, A4, f4, f8 + fxcpnmsub f9, A4, f5, f9 + fxcpnmsub f10, A4, f6, f10 + fxcpnmsub f11, A4, f7, f11 + + fxcsnmsub f12, A4, f4, f12 + fxcsnmsub f13, A4, f5, f13 + fxcsnmsub f14, A4, f6, f14 + fxcsnmsub f15, A4, f7, f15 + + fxpmul f8, A5, f8 + fxpmul f9, A5, f9 + fxpmul f10, A5, f10 + fxpmul f11, A5, f11 + + fxcsnmsub f12, A5, f8, f12 + fxcsnmsub f13, A5, f9, f13 + fxcsnmsub f14, A5, f10, f14 + fxcsnmsub f15, A5, f11, f15 + + fxsmul f12, A6, f12 + fxsmul f13, A6, f13 + fxsmul f14, A6, f14 + fxsmul f15, A6, f15 + +#endif + +#ifdef RT + addi BO, BO, 20 * SIZE + addi BO2, BO2, 20 * SIZE + + LFPDUX A1, BO2, INCM4 + LFPDUX A2, BO, INCM4 + + LFPDUX A3, BO2, INCM4 + LFPDUX A4, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A5, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A6, BO, INCM4 + subi BO, BO, 4 * SIZE + subi BO2, BO2, 4 * SIZE + + fxsmul f12, A1, f12 + fxsmul f13, A1, f13 + fxsmul f14, A1, f14 + fxsmul f15, A1, f15 + + fxcpnmsub f8, A1, f12, f8 + fxcpnmsub f9, A1, f13, f9 + fxcpnmsub f10, A1, f14, f10 + fxcpnmsub f11, A1, f15, f11 + + fxcsnmsub f4, A2, f12, f4 + fxcsnmsub f5, A2, f13, f5 + fxcsnmsub f6, A2, f14, f6 + fxcsnmsub f7, A2, f15, f7 + + fxcpnmsub f0, A2, f12, f0 + fxcpnmsub f1, A2, f13, f1 + fxcpnmsub f2, A2, f14, f2 + fxcpnmsub f3, A2, f15, f3 + + fxpmul f8, A3, f8 + fxpmul f9, A3, f9 + fxpmul f10, A3, f10 + fxpmul f11, A3, f11 + + fxcsnmsub f4, A4, f8, f4 + fxcsnmsub f5, A4, f9, f5 + fxcsnmsub f6, A4, f10, f6 + fxcsnmsub f7, A4, f11, f7 + + fxcpnmsub f0, A4, f8, f0 + fxcpnmsub f1, A4, f9, f1 + fxcpnmsub f2, A4, f10, f2 + fxcpnmsub f3, A4, f11, f3 + + fxsmul f4, A5, f4 + fxsmul f5, A5, f5 + fxsmul f6, A5, f6 + fxsmul f7, A5, f7 + + fxcpnmsub f0, A5, f4, f0 + fxcpnmsub f1, A5, f5, f1 + fxcpnmsub f2, A5, f6, f2 + fxcpnmsub f3, A5, f7, f3 + + fxpmul f0, A6, f0 + fxpmul f1, A6, f1 + fxpmul f2, A6, f2 + fxpmul f3, A6, f3 + +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE + subi CO3, CO3, 8 * SIZE + subi CO4, CO4, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f4, BO, INC4 + STFPDUX f12, BO2, INC4 + STFPDUX f1, BO, INC4 + STFPDUX f9, BO2, INC4 + STFPDUX f5, BO, INC4 + STFPDUX f13, BO2, INC4 + STFPDUX f2, BO, INC4 + STFPDUX f10, BO2, INC4 + STFPDUX f6, BO, INC4 + STFPDUX f14, BO2, INC4 + STFPDUX f3, BO, INC4 + STFPDUX f11, BO2, INC4 + STFPDUX f7, BO, INC4 + STFPDUX f15, BO2, INC4 + + subi BO, BO, 32 * SIZE + subi BO2, BO2, 32 * SIZE + + STFDUX f0, CO1, INC + STFDUX f4, CO1, INC + STFDUX f1, CO1, INC + STFDUX f5, CO1, INC + STFDUX f2, CO1, INC + STFDUX f6, CO1, INC + STFDUX f3, CO1, INC + STFDUX f7, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f4, CO2, INC + STFSDUX f1, CO2, INC + STFSDUX f5, CO2, INC + STFSDUX f2, CO2, INC + STFSDUX f6, CO2, INC + STFSDUX f3, CO2, INC + STFSDUX f7, CO2, INC + + STFDUX f8, CO3, INC + STFDUX f12, CO3, INC + STFDUX f9, CO3, INC + STFDUX f13, CO3, INC + STFDUX f10, CO3, INC + STFDUX f14, CO3, INC + STFDUX f11, CO3, INC + STFDUX f15, CO3, INC + + STFSDUX f8, CO4, INC + STFSDUX f12, CO4, INC + STFSDUX f9, CO4, INC + STFSDUX f13, CO4, INC + STFSDUX f10, CO4, INC + STFSDUX f14, CO4, INC + STFSDUX f11, CO4, INC + STFSDUX f15, CO4, INC + +#else + STFPDUX f0, AO, INC4 + STFPDUX f1, AO2, INC4 + STFPDUX f2, AO, INC4 + STFPDUX f3, AO2, INC4 + STFPDUX f4, AO, INC4 + STFPDUX f5, AO2, INC4 + STFPDUX f6, AO, INC4 + STFPDUX f7, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f9, AO2, INC4 + STFPDUX f10, AO, INC4 + STFPDUX f11, AO2, INC4 + STFPDUX f12, AO, INC4 + STFPDUX f13, AO2, INC4 + STFPDUX f14, AO, INC4 + STFPDUX f15, AO2, INC4 + + subi AO, AO, 32 * SIZE + subi AO2, AO2, 32 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + STFDUX f5, CO2, INC + STFSDUX f5, CO2, INC + STFDUX f6, CO2, INC + STFSDUX f6, CO2, INC + STFDUX f7, CO2, INC + STFSDUX f7, CO2, INC + + STFDUX f8, CO3, INC + STFSDUX f8, CO3, INC + STFDUX f9, CO3, INC + STFSDUX f9, CO3, INC + STFDUX f10, CO3, INC + STFSDUX f10, CO3, INC + STFDUX f11, CO3, INC + STFSDUX f11, CO3, INC + + STFDUX f12, CO4, INC + STFSDUX f12, CO4, INC + STFDUX f13, CO4, INC + STFSDUX f13, CO4, INC + STFDUX f14, CO4, INC + STFSDUX f14, CO4, INC + STFDUX f15, CO4, INC + STFSDUX f15, CO4, INC +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE + subi CO3, CO3, 8 * SIZE + subi CO4, CO4, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 3 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 8 +#endif + +#ifdef LN + subi KK, KK, 8 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L11 + .align 4 + +.L49: +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 4 * SIZE +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + bgt+ .L10 + .align 4 + +.L50: + andi. J, N, 2 + beq .L90 + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + +#if defined(LN) || defined(RT) + addi AORIG, A, -2 * SIZE +#else + addi AO, A, -2 * SIZE +#endif +#ifndef RT + add C, CO2, LDC +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + + andi. I, M, 1 + beq .L60 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L84 +#else + +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L84 + +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L83 + .align 4 + +.L82: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A1, B2, f1 + LFPDUX B2, BO, INC2 + LFPDUX A1, AO, INC2 + fxcpmadd f2, A2, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A2, B4, f3 + LFPDUX B4, BO, INC2 + LFPDUX A2, AO, INC2 + + fxcpmadd f0, A3, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A3, B2, f1 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + fxcpmadd f2, A4, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A4, B4, f3 + LFPDUX B4, BO, INC2 + LFPDUX A4, AO, INC2 + bdnz+ .L82 + .align 4 + +.L83: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A1, B2, f1 + LFPDUX B2, BO, INC2 + fxcpmadd f2, A2, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A2, B4, f3 + LFPDUX B4, BO, INC2 + + fxcpmadd f0, A3, B1, f0 + fxcsmadd f1, A3, B2, f1 + fxcpmadd f2, A4, B3, f2 + fxcsmadd f3, A4, B4, f3 + .align 4 + +.L84: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L88 +#else + andi. r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L88 +#endif + + LFDX A1, AO, INC2 + LFPDUX B1, BO, INC2 + add AO, AO, INC + bdz- .L87 + .align 4 + +.L86: + fxcpmadd f0, A1, B1, f0 + LFDX A1, AO, INC2 + LFPDUX B1, BO, INC2 + add AO, AO, INC + bdnz+ .L86 + .align 4 + +.L87: + fxcpmadd f0, A1, B1, f0 + .align 4 + +.L88: + fpadd f0, f0, f1 + fpadd f2, f2, f3 + fpadd f0, f0, f2 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC2 + + fpsub f0, f16, f0 +#else + LFPDX f16, AO, INC2 + + fpsub f0, f16, f0 +#endif + +#ifdef LN + LFPDX A1, AO, INC2 + + fxpmul f0, A1, f0 +#endif + +#ifdef LT + LFPDX A1, AO, INC2 + + fxpmul f0, A1, f0 +#endif + +#ifdef RN + LFD A1, (2 + 0) * SIZE(BO) + LFD A2, (2 + 1) * SIZE(BO) + LFD A3, (2 + 3) * SIZE(BO) + + fsmtp f1, f0 + + fmul f0, A1, f0 + fnmsub f1, A2, f0, f1 + + fmul f1, A3, f1 + fsmfp f0, f1 +#endif + +#ifdef RT + LFD A1, (2 + 3) * SIZE(BO) + LFD A2, (2 + 2) * SIZE(BO) + LFD A3, (2 + 0) * SIZE(BO) + + fsmtp f1, f0 + + fmul f1, A1, f1 + fnmsub f0, A2, f1, f0 + + fmul f0, A3, f0 + fsmfp f0, f1 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC2 + + STFDUX f0, CO1, INC + STFSDUX f0, CO2, INC +#else + STFPDX f0, AO, INC2 + + STFDUX f0, CO1, INC + STFDUX f1, CO2, INC +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L60: + andi. I, M, 2 + beq .L70 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L74 +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L74 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdz- .L73 + .align 4 + +.L72: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + fxcpmadd f2, B2, A2, f2 + fxcsmadd f3, B2, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A3, f0 + fxcsmadd f1, B3, A3, f1 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + fxcpmadd f2, B4, A4, f2 + fxcsmadd f3, B4, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f1, B5, A5, f1 + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + fxcpmadd f2, B6, A6, f2 + fxcsmadd f3, B6, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f1, A9, A7, f1 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + fxcpmadd f2, A10, A8, f2 + fxcsmadd f3, A10, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdnz+ .L72 + .align 4 + +.L73: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + fxcpmadd f2, B2, A2, f2 + fxcsmadd f3, B2, A2, f3 + + fxcpmadd f0, B3, A3, f0 + fxcsmadd f1, B3, A3, f1 + fxcpmadd f2, B4, A4, f2 + fxcsmadd f3, B4, A4, f3 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f1, B5, A5, f1 + fxcpmadd f2, B6, A6, f2 + fxcsmadd f3, B6, A6, f3 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f1, A9, A7, f1 + fxcpmadd f2, A10, A8, f2 + fxcsmadd f3, A10, A8, f3 + .align 4 + +.L74: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L78 +#else + andi. r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L78 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdz- .L77 + .align 4 + +.L76: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L76 + .align 4 + +.L77: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + .align 4 + +.L78: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fsmfp f0, f1 + fsmtp f1, f24 + + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + + subi BO, BO, 4 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + + subi AO, AO, 4 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#endif + +#ifdef LN + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + + addi AO, AO, -4 * SIZE + + fxsmul f1, A2, f1 + fxcpnmsub f0, A2, f1, f0 + fxpmul f0, A1, f0 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + + addi AO, AO, -4 * SIZE + + fxpmul f0, A1, f0 + fxcsnmsub f1, A1, f0, f1 + + fxsmul f1, A2, f1 +#endif + +#ifdef RN + LFPDUX A1, BO, INC2 + LFPDUX A2, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxpmul f0, A1, f0 + fxcsnmsub f1, A1, f0, f1 + + fxsmul f1, A2, f1 +#endif + +#ifdef RT + LFPDUX A2, BO, INC2 + LFPDUX A1, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxsmul f1, A1, f1 + fxcpnmsub f0, A1, f1, f0 + fxpmul f0, A2, f0 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + + subi BO, BO, 4 * SIZE + + STFDUX f0, CO1, INC + STFDUX f1, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f1, CO2, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + + subi AO, AO, 4 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + + STFDUX f1, CO2, INC + STFSDUX f1, CO2, INC +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L70: + andi. I, M, 4 + beq .L80 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L64 +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f1, f0 + addi BO, BO, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L64 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B4, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L63 + .align 4 + +.L62: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A3, f0 + fxcsmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f1, B2, A4, f1 + fxcsmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A5, f0 + fxcsmadd f2, B3, A5, f2 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B3, A6, f1 + fxcsmadd f3, B3, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + fxcpmadd f0, B4, A7, f0 + fxcsmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcpmadd f1, B4, A8, f1 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L62 + .align 4 + +.L63: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + + fxcpmadd f0, B2, A3, f0 + fxcsmadd f2, B2, A3, f2 + fxcpmadd f1, B2, A4, f1 + fxcsmadd f3, B2, A4, f3 + + fxcpmadd f0, B3, A5, f0 + fxcsmadd f2, B3, A5, f2 + fxcpmadd f1, B3, A6, f1 + fxcsmadd f3, B3, A6, f3 + + fxcpmadd f0, B4, A7, f0 + fxcsmadd f2, B4, A7, f2 + fxcpmadd f1, B4, A8, f1 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L64: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L68 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L68 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdz- .L67 + .align 4 + +.L66: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdnz+ .L66 + .align 4 + +.L67: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + .align 4 + +.L68: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fpmr f25, f1 + + fsmfp f0, f2 + fsmfp f1, f3 + fsmtp f2, f24 + fsmtp f3, f25 + + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + LFPDUX f18, BO, INC2 + LFPDUX f19, BO, INC2 + + subi BO, BO, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f2, f17, f2 + fpsub f1, f18, f1 + fpsub f3, f19, f3 +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + LFPDUX f18, AO, INC2 + LFPDUX f19, AO, INC2 + + subi AO, AO, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 +#endif + +#ifdef LN + addi AO, AO, 18 * SIZE + + LFPDUX A1, AO, INCM2 + LFPDUX A2, AO, INCM2 + LFPDUX A3, AO, INCM2 + LFPDUX A4, AO, INCM2 + add AO, AO, INCM2 + LFPDUX A5, AO, INCM2 + add AO, AO, INCM2 + LFPDUX A6, AO, INCM2 + + subi AO, AO, 2 * SIZE + + fxsmul f3, A1, f3 + fxcpnmsub f1, A1, f3, f1 + fxcsnmsub f2, A2, f3, f2 + fxcpnmsub f0, A2, f3, f0 + + fxpmul f1, A3, f1 + fxcsnmsub f2, A4, f1, f2 + fxcpnmsub f0, A4, f1, f0 + + fxsmul f2, A5, f2 + fxcpnmsub f0, A5, f2, f0 + + fxpmul f0, A6, f0 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + add AO, AO, INC2 + LFPDUX A5, AO, INC2 + add AO, AO, INC2 + LFPDUX A6, AO, INC2 + + subi AO, AO, 16 * SIZE + + fxpmul f0, A1, f0 + fxcsnmsub f2, A1, f0, f2 + fxcpnmsub f1, A2, f0, f1 + fxcsnmsub f3, A2, f0, f3 + + fxsmul f2, A3, f2 + fxcpnmsub f1, A4, f2, f1 + fxcsnmsub f3, A4, f2, f3 + + fxpmul f1, A5, f1 + fxcsnmsub f3, A5, f1, f3 + + fxsmul f3, A6, f3 +#endif + +#ifdef RN + LFPDUX A1, BO, INC2 + LFPDUX A2, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + + fxcsnmsub f2, A1, f0, f2 + fxcsnmsub f3, A1, f1, f3 + + fxsmul f2, A2, f2 + fxsmul f3, A2, f3 +#endif + +#ifdef RT + LFPDUX A2, BO, INC2 + LFPDUX A1, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxsmul f2, A1, f2 + fxsmul f3, A1, f3 + + fxcpnmsub f0, A1, f2, f0 + fxcpnmsub f1, A1, f3, f1 + + fxpmul f0, A2, f0 + fxpmul f1, A2, f1 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f2, BO, INC2 + STFPDUX f1, BO, INC2 + STFPDUX f3, BO, INC2 + + subi BO, BO, 8 * SIZE + + STFDUX f0, CO1, INC + STFDUX f2, CO1, INC + STFDUX f1, CO1, INC + STFDUX f3, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f2, CO2, INC + STFSDUX f1, CO2, INC + STFSDUX f3, CO2, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + STFPDUX f2, AO, INC2 + STFPDUX f3, AO, INC2 + + subi AO, AO, 8 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + STFDUX f2, CO2, INC + STFSDUX f2, CO2, INC + STFDUX f3, CO2, INC + STFSDUX f3, CO2, INC +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L80: + srawi. I, M, 3 + ble .L89 + .align 4 + +.L51: +#if defined(LT) || defined(RN) + fpmr f4, f0 + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + + srawi. r0, KK, 2 + fpmr f3, f0 + mtspr CTR, r0 + fpmr f7, f0 + ble .L54 +#else + +#ifdef LN + slwi r0, K, 3 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 3 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f4, f0 + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + + srawi. r0, TEMP, 2 + fpmr f3, f0 + mtspr CTR, r0 + fpmr f7, f0 + ble .L54 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L53 + .align 4 + +.L52: + fxcpmadd f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + nop + fxcsmadd f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B2, A7, f2 + nop + fxcsmadd f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B2, A8, f3 + nop + fxcsmadd f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + LFPDUX B2, BO, INC2 + fxcsmadd f4, B3, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B3, A3, f2 + nop + fxcsmadd f6, B3, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B4, A5, f0 + LFPDUX B3, BO, INC2 + fxcsmadd f4, B4, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B4, A6, f1 + nop + fxcsmadd f5, B4, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B4, A7, f2 + nop + fxcsmadd f6, B4, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B4, A8, f3 + nop + fxcsmadd f7, B4, A8, f7 + LFPDUX A8, AO, INC2 + bdnz+ .L52 + .align 4 + +.L53: + fxcpmadd f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + nop + fxcsmadd f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + nop + fxcsmadd f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B2, A7, f2 + nop + fxcsmadd f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B2, A8, f3 + nop + fxcsmadd f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + fxcsmadd f4, B3, A1, f4 + fxcpmadd f1, B3, A2, f1 + fxcsmadd f5, B3, A2, f5 + + fxcpmadd f2, B3, A3, f2 + fxcsmadd f6, B3, A3, f6 + fxcpmadd f3, B3, A4, f3 + fxcsmadd f7, B3, A4, f7 + + fxcpmadd f0, B4, A5, f0 + fxcsmadd f4, B4, A5, f4 + fxcpmadd f1, B4, A6, f1 + fxcsmadd f5, B4, A6, f5 + + fxcpmadd f2, B4, A7, f2 + fxcsmadd f6, B4, A7, f6 + fxcpmadd f3, B4, A8, f3 + fxcsmadd f7, B4, A8, f7 + .align 4 + +.L54: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L58 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L58 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + bdz- .L57 + .align 4 + +.L56: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L56 + .align 4 + +.L57: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + + fxcpmadd f2, B1, A3, f2 + fxcsmadd f6, B1, A3, f6 + fxcpmadd f3, B1, A4, f3 + fxcsmadd f7, B1, A4, f7 + .align 4 + +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 8 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 3 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fpmr f25, f1 + fpmr f26, f2 + fpmr f27, f3 + + fsmfp f0, f4 + fsmfp f1, f5 + fsmfp f2, f6 + fsmfp f3, f7 + + fsmtp f4, f24 + fsmtp f5, f25 + fsmtp f6, f26 + fsmtp f7, f27 + + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + LFPDUX f18, BO, INC2 + LFPDUX f19, BO, INC2 + + LFPDUX f20, BO, INC2 + LFPDUX f21, BO, INC2 + LFPDUX f22, BO, INC2 + LFPDUX f23, BO, INC2 + + subi BO, BO, 16 * SIZE + + fpsub f0, f16, f0 + fpsub f4, f17, f4 + fpsub f1, f18, f1 + fpsub f5, f19, f5 + + fpsub f2, f20, f2 + fpsub f6, f21, f6 + fpsub f3, f22, f3 + fpsub f7, f23, f7 + +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + LFPDUX f18, AO, INC2 + LFPDUX f19, AO, INC2 + + LFPDUX f20, AO, INC2 + LFPDUX f21, AO, INC2 + LFPDUX f22, AO, INC2 + LFPDUX f23, AO, INC2 + + subi AO, AO, 16 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 + fpsub f4, f20, f4 + fpsub f5, f21, f5 + fpsub f6, f22, f6 + fpsub f7, f23, f7 +#endif + +#ifdef LN + addi AO, AO, 66 * SIZE + + LFPDUX A1, AO, INCM2 + LFPDUX A2, AO, INCM2 + LFPDUX A3, AO, INCM2 + LFPDUX A4, AO, INCM2 + LFPDUX A5, AO, INCM2 + LFPDUX A6, AO, INCM2 + LFPDUX A7, AO, INCM2 + LFPDUX A8, AO, INCM2 + + fxsmul f7, A1, f7 + fxcpnmsub f3, A1, f7, f3 + fxcsnmsub f6, A2, f7, f6 + fxcpnmsub f2, A2, f7, f2 + + fxcsnmsub f5, A3, f7, f5 + fxcpnmsub f1, A3, f7, f1 + fxcsnmsub f4, A4, f7, f4 + fxcpnmsub f0, A4, f7, f0 + + fxpmul f3, A5, f3 + fxcsnmsub f6, A6, f3, f6 + fxcpnmsub f2, A6, f3, f2 + + fxcsnmsub f5, A7, f3, f5 + fxcpnmsub f1, A7, f3, f1 + fxcsnmsub f4, A8, f3, f4 + fxcpnmsub f0, A8, f3, f0 + + add AO, AO, INCM2 + LFPDUX A1, AO, INCM2 + LFPDUX A2, AO, INCM2 + LFPDUX A3, AO, INCM2 + + add AO, AO, INCM2 + LFPDUX A4, AO, INCM2 + LFPDUX A5, AO, INCM2 + LFPDUX A6, AO, INCM2 + + add AO, AO, INCM2 + add AO, AO, INCM2 + LFPDUX A7, AO, INCM2 + LFPDUX A8, AO, INCM2 + + fxsmul f6, A1, f6 + fxcpnmsub f2, A1, f6, f2 + fxcsnmsub f5, A2, f6, f5 + fxcpnmsub f1, A2, f6, f1 + fxcsnmsub f4, A3, f6, f4 + fxcpnmsub f0, A3, f6, f0 + + fxpmul f2, A4, f2 + fxcsnmsub f5, A5, f2, f5 + fxcpnmsub f1, A5, f2, f1 + fxcsnmsub f4, A6, f2, f4 + fxcpnmsub f0, A6, f2, f0 + + fxsmul f5, A7, f5 + fxcpnmsub f1, A7, f5, f1 + fxcsnmsub f4, A8, f5, f4 + fxcpnmsub f0, A8, f5, f0 + + add AO, AO, INCM2 + add AO, AO, INCM2 + LFPDUX A1, AO, INCM2 + LFPDUX A2, AO, INCM2 + + subi AO, AO, 6 * SIZE + LFPDUX A3, AO, INCM2 + subi AO, AO, 6 * SIZE + LFPDUX A4, AO, INCM2 + + addi AO, AO, -2 * SIZE + + fxpmul f1, A1, f1 + fxcsnmsub f4, A2, f1, f4 + fxcpnmsub f0, A2, f1, f0 + + fxsmul f4, A3, f4 + fxcpnmsub f0, A3, f4, f0 + + fxpmul f0, A4, f0 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + + fxpmul f0, A1, f0 + fxcsnmsub f4, A1, f0, f4 + fxcpnmsub f1, A2, f0, f1 + fxcsnmsub f5, A2, f0, f5 + fxcpnmsub f2, A3, f0, f2 + fxcsnmsub f6, A3, f0, f6 + fxcpnmsub f3, A4, f0, f3 + fxcsnmsub f7, A4, f0, f7 + + fxsmul f4, A5, f4 + fxcpnmsub f1, A6, f4, f1 + fxcsnmsub f5, A6, f4, f5 + fxcpnmsub f2, A7, f4, f2 + fxcsnmsub f6, A7, f4, f6 + fxcpnmsub f3, A8, f4, f3 + fxcsnmsub f7, A8, f4, f7 + + add AO, AO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + + add AO, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + + add AO, AO, INC2 + add AO, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + + fxpmul f1, A1, f1 + fxcsnmsub f5, A1, f1, f5 + fxcpnmsub f2, A2, f1, f2 + fxcsnmsub f6, A2, f1, f6 + fxcpnmsub f3, A3, f1, f3 + fxcsnmsub f7, A3, f1, f7 + + fxsmul f5, A4, f5 + fxcpnmsub f2, A5, f5, f2 + fxcsnmsub f6, A5, f5, f6 + fxcpnmsub f3, A6, f5, f3 + fxcsnmsub f7, A6, f5, f7 + + fxpmul f2, A7, f2 + fxcsnmsub f6, A7, f2, f6 + fxcpnmsub f3, A8, f2, f3 + fxcsnmsub f7, A8, f2, f7 + + add AO, AO, INC2 + add AO, AO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + + addi AO, AO, 6 * SIZE + LFPDUX A3, AO, INC2 + addi AO, AO, 6 * SIZE + LFPDUX A4, AO, INC2 + + subi AO, AO, 64 * SIZE + + fxsmul f6, A1, f6 + fxcpnmsub f3, A2, f6, f3 + fxcsnmsub f7, A2, f6, f7 + + fxpmul f3, A3, f3 + fxcsnmsub f7, A3, f3, f7 + + fxsmul f7, A4, f7 +#endif + +#ifdef RN + LFPDUX A1, BO, INC2 + LFPDUX A2, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxpmul f2, A1, f2 + fxpmul f3, A1, f3 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f5, A1, f1, f5 + fxcsnmsub f6, A1, f2, f6 + fxcsnmsub f7, A1, f3, f7 + + fxsmul f4, A2, f4 + fxsmul f5, A2, f5 + fxsmul f6, A2, f6 + fxsmul f7, A2, f7 +#endif + +#ifdef RT + LFPDUX A2, BO, INC2 + LFPDUX A1, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxsmul f4, A1, f4 + fxsmul f5, A1, f5 + fxsmul f6, A1, f6 + fxsmul f7, A1, f7 + + fxcpnmsub f0, A1, f4, f0 + fxcpnmsub f1, A1, f5, f1 + fxcpnmsub f2, A1, f6, f2 + fxcpnmsub f3, A1, f7, f3 + + fxpmul f0, A2, f0 + fxpmul f1, A2, f1 + fxpmul f2, A2, f2 + fxpmul f3, A2, f3 + +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f4, BO, INC2 + STFPDUX f1, BO, INC2 + STFPDUX f5, BO, INC2 + STFPDUX f2, BO, INC2 + STFPDUX f6, BO, INC2 + STFPDUX f3, BO, INC2 + STFPDUX f7, BO, INC2 + + subi BO, BO, 16 * SIZE + + STFDUX f0, CO1, INC + STFDUX f4, CO1, INC + STFDUX f1, CO1, INC + STFDUX f5, CO1, INC + STFDUX f2, CO1, INC + STFDUX f6, CO1, INC + STFDUX f3, CO1, INC + STFDUX f7, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f4, CO2, INC + STFSDUX f1, CO2, INC + STFSDUX f5, CO2, INC + STFSDUX f2, CO2, INC + STFSDUX f6, CO2, INC + STFSDUX f3, CO2, INC + STFSDUX f7, CO2, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + STFPDUX f2, AO, INC2 + STFPDUX f3, AO, INC2 + STFPDUX f4, AO, INC2 + STFPDUX f5, AO, INC2 + STFPDUX f6, AO, INC2 + STFPDUX f7, AO, INC2 + + subi AO, AO, 16 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + STFDUX f5, CO2, INC + STFSDUX f5, CO2, INC + STFDUX f6, CO2, INC + STFSDUX f6, CO2, INC + STFDUX f7, CO2, INC + STFSDUX f7, CO2, INC +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 3 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 8 +#endif + +#ifdef LN + subi KK, KK, 8 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L51 + .align 4 + +.L89: +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 2 * SIZE +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + .align 4 + +.L90: + andi. J, N, 1 + beq .L999 + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + addi AORIG, A, -2 * SIZE +#else + addi AO, A, -2 * SIZE +#endif +#ifndef RT + add C, CO1, LDC +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + + andi. I, M, 1 + beq .L100 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L124 +#else + +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L124 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L123 + .align 4 + +.L122: + fpmadd f0, A1, B1, f0 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + fpmadd f1, A2, B2, f1 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + fpmadd f2, A3, B3, f2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + fpmadd f3, A4, B4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L122 + .align 4 + +.L123: + fpmadd f0, A1, B1, f0 + fpmadd f1, A2, B2, f1 + fpmadd f2, A3, B3, f2 + fpmadd f3, A4, B4, f3 + .align 4 + +.L124: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L128 +#else + andi. r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L128 +#endif + + LFDX A1, AO, INC2 + LFDX B1, BO, INC2 + add AO, AO, INC + add BO, BO, INC + bdz- .L127 + .align 4 + +.L126: + fmadd f0, A1, B1, f0 + LFDX A1, AO, INC2 + LFDX B1, BO, INC2 + add AO, AO, INC + add BO, BO, INC + bdnz+ .L126 + .align 4 + +.L127: + fmadd f0, A1, B1, f0 + .align 4 + +.L128: + fpadd f0, f0, f1 + fpadd f2, f2, f3 + fpadd f0, f0, f2 + fsmtp f1, f0 + + fadd f0, f0, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFDX f16, BO, INC2 + + fsub f0, f16, f0 +#else + LFDX f16, AO, INC2 + + fsub f0, f16, f0 +#endif + +#ifdef LN + LFD A1, (2 + 0) * SIZE(AO) + + fmul f0, A1, f0 +#endif + +#ifdef LT + LFD A1, (2 + 0) * SIZE(AO) + + fmul f0, A1, f0 +#endif + +#ifdef RN + LFDX A1, BO, INC2 + + fmul f0, A1, f0 +#endif + +#ifdef RT + LFDX A1, BO, INC2 + + fmul f0, A1, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFDX f0, BO, INC2 + + STFDUX f0, CO1, INC +#else + STFDX f0, AO, INC2 + + STFDUX f0, CO1, INC +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L100: + andi. I, M, 2 + beq .L110 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L114 +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L114 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L113 + .align 4 + +.L112: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcsmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + fxcpmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + fxcpmadd f0, B3, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B3, A6, f1 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + fxcpmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L112 + .align 4 + +.L113: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A2, f1 + fxcpmadd f2, B2, A3, f2 + fxcsmadd f3, B2, A4, f3 + fxcpmadd f0, B3, A5, f0 + fxcsmadd f1, B3, A6, f1 + fxcpmadd f2, B4, A7, f2 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L114: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L118 +#else + andi. r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L118 +#endif + + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + add BO, BO, INC + bdz- .L117 + .align 4 + +.L116: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + add BO, BO, INC + bdnz+ .L116 + .align 4 + +.L117: + fxcpmadd f0, B1, A1, f0 + .align 4 + +.L118: + fpadd f0, f0, f1 + fpadd f2, f3, f2 + fpadd f0, f0, f2 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC2 + + fpsub f0, f16, f0 +#else + LFPDX f16, AO, INC2 + + fpsub f0, f16, f0 +#endif + +#ifdef LN + fsmtp f4, f0 + + LFD A1, (2 + 3) * SIZE(AO) + LFD A2, (2 + 2) * SIZE(AO) + LFD A3, (2 + 0) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f0, A2, f4, f0 + fmul f0, A3, f0 + fsmfp f0, f4 +#endif + +#ifdef LT + fsmtp f4, f0 + + LFD A1, (2 + 0) * SIZE(AO) + LFD A2, (2 + 1) * SIZE(AO) + LFD A3, (2 + 3) * SIZE(AO) + + fmul f0, A1, f0 + fnmsub f4, A2, f0, f4 + fmul f4, A3, f4 + + fsmfp f0, f4 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC2 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC +#else + STFPDX f0, AO, INC2 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L110: + andi. I, M, 4 + beq .L120 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L104 +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L104 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX B4, BO, INC2 + + bdz- .L103 + .align 4 + +.L102: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B3, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B3, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B3, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B3, BO, INC2 + + fxcpmadd f0, B4, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B4, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L102 + .align 4 + +.L103: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + fxcpmadd f1, B3, A2, f1 + fxcsmadd f2, B3, A3, f2 + fxcsmadd f3, B3, A4, f3 + + fxcpmadd f0, B4, A5, f0 + fxcpmadd f1, B4, A6, f1 + fxcsmadd f2, B4, A7, f2 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L104: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L108 +#else + andi. r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L108 +#endif + + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + LFPDUX A2, AO, INC2 + add BO, BO, INC + bdz- .L107 + .align 4 + +.L106: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFDX B1, BO, INC2 + LFPDUX A2, AO, INC2 + add BO, BO, INC + bdnz+ .L106 + .align 4 + +.L107: + fxcpmadd f0, B1, A1, f0 + fxcpmadd f1, B1, A2, f1 + .align 4 + +.L108: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + + subi BO, BO, 4 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + + subi AO, AO, 4 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#endif + +#ifdef LN + fsmtp f4, f0 + fsmtp f5, f1 + + LFD A1, (2 + 15) * SIZE(AO) + LFD A2, (2 + 14) * SIZE(AO) + LFD A3, (2 + 13) * SIZE(AO) + LFD A4, (2 + 12) * SIZE(AO) + + fmul f5, A1, f5 + fnmsub f1, A2, f5, f1 + fnmsub f4, A3, f5, f4 + fnmsub f0, A4, f5, f0 + + LFD A1, (2 + 10) * SIZE(AO) + LFD A2, (2 + 9) * SIZE(AO) + LFD A3, (2 + 8) * SIZE(AO) + + fmul f1, A1, f1 + fnmsub f4, A2, f1, f4 + fnmsub f0, A3, f1, f0 + + LFD A1, (2 + 5) * SIZE(AO) + LFD A2, (2 + 4) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f0, A2, f4, f0 + + LFD A1, (2 + 0) * SIZE(AO) + + fmul f0, A1, f0 + + fsmfp f0, f4 + fsmfp f1, f5 +#endif + +#ifdef LT + fsmtp f4, f0 + fsmtp f5, f1 + + LFD A1, (2 + 0) * SIZE(AO) + LFD A2, (2 + 1) * SIZE(AO) + LFD A3, (2 + 2) * SIZE(AO) + LFD A4, (2 + 3) * SIZE(AO) + + fmul f0, A1, f0 + fnmsub f4, A2, f0, f4 + fnmsub f1, A3, f0, f1 + fnmsub f5, A4, f0, f5 + + LFD A1, (2 + 5) * SIZE(AO) + LFD A2, (2 + 6) * SIZE(AO) + LFD A3, (2 + 7) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f1, A2, f4, f1 + fnmsub f5, A3, f4, f5 + + LFD A1, (2 + 10) * SIZE(AO) + LFD A2, (2 + 11) * SIZE(AO) + + fmul f1, A1, f1 + fnmsub f5, A2, f1, f5 + + LFD A1, (2 + 15) * SIZE(AO) + + fmul f5, A1, f5 + + fsmfp f0, f4 + fsmfp f1, f5 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + + subi BO, BO, 4 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + + subi AO, AO, 4 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L120: + srawi. I, M, 3 + ble .L129 + .align 4 + +.L91: +#if defined(LT) || defined(RN) + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L94 +#else + +#ifdef LN + slwi r0, K, 3 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 3 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f1, f0 + addi BO, BO, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L94 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L93 + .align 4 + +.L92: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B1, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B1, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B1, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B1, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B2, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B2, BO, INC2 + bdnz+ .L92 + .align 4 + +.L93: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B1, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B1, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B1, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B1, A8, f3 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B2, A1, f0 + fxcpmadd f1, B2, A2, f1 + fxcpmadd f2, B2, A3, f2 + fxcpmadd f3, B2, A4, f3 + + fxcsmadd f0, B2, A5, f0 + fxcsmadd f1, B2, A6, f1 + fxcsmadd f2, B2, A7, f2 + fxcsmadd f3, B2, A8, f3 + .align 4 + +.L94: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L98 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L98 +#endif + + LFDX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + add BO, BO, INC + bdz- .L97 + .align 4 + +.L96: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFDX B1, BO, INC2 + LFPDUX A4, AO, INC2 + add BO, BO, INC + bdnz+ .L96 + .align 4 + +.L97: + fxcpmadd f0, B1, A1, f0 + fxcpmadd f1, B1, A2, f1 + fxcpmadd f2, B1, A3, f2 + fxcpmadd f3, B1, A4, f3 + .align 4 + +.L98: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 8 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 3 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + LFPDUX f18, BO, INC2 + LFPDUX f19, BO, INC2 + + subi BO, BO, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + LFPDUX f18, AO, INC2 + LFPDUX f19, AO, INC2 + + subi AO, AO, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 +#endif + +#ifdef LN + fsmtp f4, f0 + fsmtp f5, f1 + fsmtp f6, f2 + fsmtp f7, f3 + + LFD A1, (2 + 63) * SIZE(AO) + LFD A2, (2 + 62) * SIZE(AO) + LFD A3, (2 + 61) * SIZE(AO) + LFD A4, (2 + 60) * SIZE(AO) + LFD A5, (2 + 59) * SIZE(AO) + LFD A6, (2 + 58) * SIZE(AO) + LFD A7, (2 + 57) * SIZE(AO) + LFD A8, (2 + 56) * SIZE(AO) + + fmul f7, A1, f7 + fnmsub f3, A2, f7, f3 + fnmsub f6, A3, f7, f6 + fnmsub f2, A4, f7, f2 + fnmsub f5, A5, f7, f5 + fnmsub f1, A6, f7, f1 + fnmsub f4, A7, f7, f4 + fnmsub f0, A8, f7, f0 + + LFD A1, (2 + 54) * SIZE(AO) + LFD A2, (2 + 53) * SIZE(AO) + LFD A3, (2 + 52) * SIZE(AO) + LFD A4, (2 + 51) * SIZE(AO) + LFD A5, (2 + 50) * SIZE(AO) + LFD A6, (2 + 49) * SIZE(AO) + LFD A7, (2 + 48) * SIZE(AO) + + fmul f3, A1, f3 + fnmsub f6, A2, f3, f6 + fnmsub f2, A3, f3, f2 + fnmsub f5, A4, f3, f5 + fnmsub f1, A5, f3, f1 + fnmsub f4, A6, f3, f4 + fnmsub f0, A7, f3, f0 + + LFD A1, (2 + 45) * SIZE(AO) + LFD A2, (2 + 44) * SIZE(AO) + LFD A3, (2 + 43) * SIZE(AO) + LFD A4, (2 + 42) * SIZE(AO) + LFD A5, (2 + 41) * SIZE(AO) + LFD A6, (2 + 40) * SIZE(AO) + + fmul f6, A1, f6 + fnmsub f2, A2, f6, f2 + fnmsub f5, A3, f6, f5 + fnmsub f1, A4, f6, f1 + fnmsub f4, A5, f6, f4 + fnmsub f0, A6, f6, f0 + + LFD A1, (2 + 36) * SIZE(AO) + LFD A2, (2 + 35) * SIZE(AO) + LFD A3, (2 + 34) * SIZE(AO) + LFD A4, (2 + 33) * SIZE(AO) + LFD A5, (2 + 32) * SIZE(AO) + + fmul f2, A1, f2 + fnmsub f5, A2, f2, f5 + fnmsub f1, A3, f2, f1 + fnmsub f4, A4, f2, f4 + fnmsub f0, A5, f2, f0 + + LFD A1, (2 + 27) * SIZE(AO) + LFD A2, (2 + 26) * SIZE(AO) + LFD A3, (2 + 25) * SIZE(AO) + LFD A4, (2 + 24) * SIZE(AO) + + fmul f5, A1, f5 + fnmsub f1, A2, f5, f1 + fnmsub f4, A3, f5, f4 + fnmsub f0, A4, f5, f0 + + LFD A1, (2 + 18) * SIZE(AO) + LFD A2, (2 + 17) * SIZE(AO) + LFD A3, (2 + 16) * SIZE(AO) + + fmul f1, A1, f1 + fnmsub f4, A2, f1, f4 + fnmsub f0, A3, f1, f0 + + LFD A1, (2 + 9) * SIZE(AO) + LFD A2, (2 + 8) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f0, A2, f4, f0 + + LFD A1, (2 + 0) * SIZE(AO) + + fmul f0, A1, f0 + + fsmfp f0, f4 + fsmfp f1, f5 + fsmfp f2, f6 + fsmfp f3, f7 +#endif + +#ifdef LT + fsmtp f4, f0 + fsmtp f5, f1 + fsmtp f6, f2 + fsmtp f7, f3 + + LFD A1, (2 + 0) * SIZE(AO) + LFD A2, (2 + 1) * SIZE(AO) + LFD A3, (2 + 2) * SIZE(AO) + LFD A4, (2 + 3) * SIZE(AO) + LFD A5, (2 + 4) * SIZE(AO) + LFD A6, (2 + 5) * SIZE(AO) + LFD A7, (2 + 6) * SIZE(AO) + LFD A8, (2 + 7) * SIZE(AO) + + fmul f0, A1, f0 + fnmsub f4, A2, f0, f4 + fnmsub f1, A3, f0, f1 + fnmsub f5, A4, f0, f5 + fnmsub f2, A5, f0, f2 + fnmsub f6, A6, f0, f6 + fnmsub f3, A7, f0, f3 + fnmsub f7, A8, f0, f7 + + LFD A1, (2 + 9) * SIZE(AO) + LFD A2, (2 + 10) * SIZE(AO) + LFD A3, (2 + 11) * SIZE(AO) + LFD A4, (2 + 12) * SIZE(AO) + LFD A5, (2 + 13) * SIZE(AO) + LFD A6, (2 + 14) * SIZE(AO) + LFD A7, (2 + 15) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f1, A2, f4, f1 + fnmsub f5, A3, f4, f5 + fnmsub f2, A4, f4, f2 + fnmsub f6, A5, f4, f6 + fnmsub f3, A6, f4, f3 + fnmsub f7, A7, f4, f7 + + LFD A1, (2 + 18) * SIZE(AO) + LFD A2, (2 + 19) * SIZE(AO) + LFD A3, (2 + 20) * SIZE(AO) + LFD A4, (2 + 21) * SIZE(AO) + LFD A5, (2 + 22) * SIZE(AO) + LFD A6, (2 + 23) * SIZE(AO) + + fmul f1, A1, f1 + fnmsub f5, A2, f1, f5 + fnmsub f2, A3, f1, f2 + fnmsub f6, A4, f1, f6 + fnmsub f3, A5, f1, f3 + fnmsub f7, A6, f1, f7 + + LFD A1, (2 + 27) * SIZE(AO) + LFD A2, (2 + 28) * SIZE(AO) + LFD A3, (2 + 29) * SIZE(AO) + LFD A4, (2 + 30) * SIZE(AO) + LFD A5, (2 + 31) * SIZE(AO) + + fmul f5, A1, f5 + fnmsub f2, A2, f5, f2 + fnmsub f6, A3, f5, f6 + fnmsub f3, A4, f5, f3 + fnmsub f7, A5, f5, f7 + + LFD A1, (2 + 36) * SIZE(AO) + LFD A2, (2 + 37) * SIZE(AO) + LFD A3, (2 + 38) * SIZE(AO) + LFD A4, (2 + 39) * SIZE(AO) + + fmul f2, A1, f2 + fnmsub f6, A2, f2, f6 + fnmsub f3, A3, f2, f3 + fnmsub f7, A4, f2, f7 + + LFD A1, (2 + 45) * SIZE(AO) + LFD A2, (2 + 46) * SIZE(AO) + LFD A3, (2 + 47) * SIZE(AO) + + fmul f6, A1, f6 + fnmsub f3, A2, f6, f3 + fnmsub f7, A3, f6, f7 + + LFD A1, (2 + 54) * SIZE(AO) + LFD A2, (2 + 55) * SIZE(AO) + + fmul f3, A1, f3 + fnmsub f7, A2, f3, f7 + + LFD A1, (2 + 63) * SIZE(AO) + + fmul f7, A1, f7 + + fsmfp f0, f4 + fsmfp f1, f5 + fsmfp f2, f6 + fsmfp f3, f7 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxpmul f2, A1, f2 + fxpmul f3, A1, f3 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxpmul f2, A1, f2 + fxpmul f3, A1, f3 + +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + STFPDUX f2, BO, INC2 + STFPDUX f3, BO, INC2 + + subi BO, BO, 8 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + STFPDUX f2, AO, INC2 + STFPDUX f3, AO, INC2 + + subi AO, AO, 8 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 3 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 8 +#endif + +#ifdef LN + subi KK, KK, 8 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L91 + .align 4 + +.L129: +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 2 * SIZE +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + + +.L999: + addi SP, SP, 12 + + lwzu r14, 4(SP) + lwzu r15, 4(SP) + + lwzu r16, 4(SP) + lwzu r17, 4(SP) + lwzu r18, 4(SP) + lwzu r19, 4(SP) + + lwzu r20, 4(SP) + lwzu r21, 4(SP) + lwzu r22, 4(SP) + lwzu r23, 4(SP) + + lwzu r24, 4(SP) + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f31, SP, r0 + lfpdux f30, SP, r0 + lfpdux f29, SP, r0 + lfpdux f28, SP, r0 + lfpdux f27, SP, r0 + lfpdux f26, SP, r0 + lfpdux f25, SP, r0 + lfpdux f24, SP, r0 + lfpdux f23, SP, r0 + lfpdux f22, SP, r0 + lfpdux f21, SP, r0 + lfpdux f20, SP, r0 + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + + + EPILOGUE +#endif diff --git a/kernel/power/trsm_kernel_hummer_LT.S b/kernel/power/trsm_kernel_hummer_LT.S new file mode 100644 index 0000000..027fcf0 --- /dev/null +++ b/kernel/power/trsm_kernel_hummer_LT.S @@ -0,0 +1,5697 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define ALPHA 0 +#define FZERO 8 + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#endif + +#define TEMP r11 +#define AORIG r12 +#define KK r14 +#define INCM1 r15 +#define INCM4 r16 +#define INCM2 r17 +#define INC2 r19 +#define INC r20 +#define INC4 r21 + +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define AO2 r26 +#define BO2 r27 + +#define CO1 r28 +#define CO2 r29 +#define CO3 r30 +#define CO4 r31 + +#ifndef NEEDPARAM + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define A7 f22 +#define A8 f23 +#define A9 f24 +#define A10 f25 + +#define B1 f26 +#define B2 f27 +#define B3 f28 +#define B4 f29 +#define B5 f30 +#define B6 f31 + +#define AP B6 + + + PROLOGUE + PROFCODE + + li r0, -16 + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + stfpdux f16, SP, r0 + stfpdux f17, SP, r0 + stfpdux f18, SP, r0 + stfpdux f19, SP, r0 + stfpdux f20, SP, r0 + stfpdux f21, SP, r0 + stfpdux f22, SP, r0 + stfpdux f23, SP, r0 + stfpdux f24, SP, r0 + stfpdux f25, SP, r0 + stfpdux f26, SP, r0 + stfpdux f27, SP, r0 + stfpdux f28, SP, r0 + stfpdux f29, SP, r0 + stfpdux f30, SP, r0 + stfpdux f31, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + stwu r29, -4(SP) + stwu r28, -4(SP) + + stwu r27, -4(SP) + stwu r26, -4(SP) + stwu r25, -4(SP) + stwu r24, -4(SP) + + stwu r23, -4(SP) + stwu r22, -4(SP) + stwu r21, -4(SP) + stwu r20, -4(SP) + + stwu r19, -4(SP) + stwu r18, -4(SP) + stwu r17, -4(SP) + stwu r16, -4(SP) + + stwu r15, -4(SP) + stwu r14, -4(SP) # dummy + + li r0, 0 + + stwu r0, -4(SP) + stwu r0, -4(SP) + stfdu f1, -8(SP) + + slwi LDC, LDC, BASE_SHIFT + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + li INC, 1 * SIZE + li INC2, 2 * SIZE + li INC4, 4 * SIZE + + li INCM1, -1 * SIZE + li INCM2, -2 * SIZE + li INCM4, -4 * SIZE + + addi C, C, - 1 * SIZE + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + srawi. J, N, 2 + ble .L50 + .align 4 + +.L10: +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + +#if defined(LN) || defined(RT) + addi AORIG, A, -4 * SIZE +#else + addi AO, A, -4 * SIZE +#endif +#ifndef RT + add C, CO4, LDC +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 3 + ble .L20 + .align 4 + +.L11: +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + nop + + srawi. r0, KK, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#else + +#ifdef LN + slwi r0, K, 3 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 3 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + nop + + srawi. r0, TEMP, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#endif + + LFPDUX A1, AO, INC4 + fpmr f5, f0 + LFPDUX A3, AO, INC4 + fpmr f9, f0 + LFPDUX B1, BO, INC4 + fpmr f13, f0 + + LFPDUX A5, AO, INC4 + fpmr f2, f0 + LFPDUX A6, AO, INC4 + fpmr f6, f0 + LFPDUX B3, BO, INC4 + fpmr f10, f0 + LFPDUX A7, AO, INC4 + fpmr f14, f0 + + LFPDUX A8, AO, INC4 + fpmr f3, f0 + LFPDUX B5, BO, INC4 + fpmr f7, f0 + LFPDUX A9, AO, INC4 + fpmr f11, f0 + LFPDUX A2, AO2, INC4 + fpmr f15, f0 + LFPDUX B2, BO2, INC4 + bdz- .L13 + .align 4 + +.L12: + +## 1 ## + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + nop + fxcpmadd f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + nop + fxcpmadd f10, B2, A3, f10 + nop + fxcsmadd f14, B2, A3, f14 + nop + + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + LFPDUX A1, AO, INC4 + fxcsmadd f15, B2, A4, f15 + nop + +## 2 ## + + fxcpmadd f0, B3, A5, f0 + nop + fxcsmadd f4, B3, A5, f4 + nop + fxcpmadd f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A5, f12 + LFPDUX B1, BO, INC4 + + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + LFPDUX A3, AO, INC4 + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B3, A6, f2 + nop + fxcsmadd f6, B3, A6, f6 + nop + fxcpmadd f10, B4, A6, f10 + nop + fxcsmadd f14, B4, A6, f14 + nop + + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B4, A4, f11 + LFPDUX A5, AO, INC4 + fxcsmadd f15, B4, A4, f15 + nop + +## 3 ## + + fxcpmadd f0, B5, A7, f0 + nop + fxcsmadd f4, B5, A7, f4 + nop + fxcpmadd f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A7, f12 + LFPDUX B3, BO, INC4 + + fxcpmadd f1, B5, A2, f1 + nop + fxcsmadd f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A6, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B5, A8, f2 + nop + fxcsmadd f6, B5, A8, f6 + nop + fxcpmadd f10, B2, A8, f10 + nop + fxcsmadd f14, B2, A8, f14 + nop + + fxcpmadd f3, B5, A4, f3 + nop + fxcsmadd f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + LFPDUX A7, AO, INC4 + fxcsmadd f15, B2, A4, f15 + nop + +## 4 ## + fxcpmadd f0, B6, A9, f0 + nop + fxcsmadd f4, B6, A9, f4 + nop + fxcpmadd f8, B4, A9, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A9, f12 + LFPDUX B5, BO, INC4 + + fxcpmadd f1, B6, A2, f1 + nop + fxcsmadd f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + LFPDUX A8, AO, INC4 + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B6, A10, f2 + nop + fxcsmadd f6, B6, A10, f6 + nop + fxcpmadd f10, B4, A10, f10 + nop + fxcsmadd f14, B4, A10, f14 + nop + + fxcpmadd f3, B6, A4, f3 + LFPDUX A2, AO2, INC4 + fxcsmadd f7, B6, A4, f7 + LFPDUX A9, AO, INC4 + fxcpmadd f11, B4, A4, f11 + nop + fxcsmadd f15, B4, A4, f15 + bdnz+ .L12 + .align 4 + +.L13: +## 1 ## + + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + nop + fxcpmadd f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + nop + fxcpmadd f10, B2, A3, f10 + nop + fxcsmadd f14, B2, A3, f14 + nop + + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + nop + fxcsmadd f15, B2, A4, f15 + nop + +## 2 ## + + fxcpmadd f0, B3, A5, f0 + nop + fxcsmadd f4, B3, A5, f4 + nop + fxcpmadd f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A5, f12 + nop + + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + nop + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B3, A6, f2 + nop + fxcsmadd f6, B3, A6, f6 + nop + fxcpmadd f10, B4, A6, f10 + nop + fxcsmadd f14, B4, A6, f14 + nop + + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B4, A4, f11 + nop + fxcsmadd f15, B4, A4, f15 + nop + +## 3 ## + + fxcpmadd f0, B5, A7, f0 + nop + fxcsmadd f4, B5, A7, f4 + nop + fxcpmadd f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A7, f12 + nop + + fxcpmadd f1, B5, A2, f1 + nop + fxcsmadd f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + nop + + fxcsmadd f13, B2, A2, f13 + + fxcpmadd f2, B5, A8, f2 + nop + fxcsmadd f6, B5, A8, f6 + nop + fxcpmadd f10, B2, A8, f10 + nop + fxcsmadd f14, B2, A8, f14 + nop + + fxcpmadd f3, B5, A4, f3 + nop + fxcsmadd f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + nop + fxcsmadd f15, B2, A4, f15 + nop + +## 4 ## + + fxcpmadd f0, B6, A9, f0 + nop + fxcsmadd f4, B6, A9, f4 + nop + fxcpmadd f8, B4, A9, f8 + nop + fxcsmadd f12, B4, A9, f12 + nop + + fxcpmadd f1, B6, A2, f1 + nop + fxcsmadd f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + nop + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B6, A10, f2 + nop + fxcsmadd f6, B6, A10, f6 + nop + fxcpmadd f10, B4, A10, f10 + nop + fxcsmadd f14, B4, A10, f14 + nop + + fxcpmadd f3, B6, A4, f3 + nop + fxcsmadd f7, B6, A4, f7 + nop + fxcpmadd f11, B4, A4, f11 + nop + fxcsmadd f15, B4, A4, f15 + nop + .align 4 + +.L14: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L18 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L18 +#endif + .align 4 + +.L15: + LFPDUX A2, AO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A10, BO, INC4 + LFPDUX B4, BO2, INC4 + bdz- .L17 + .align 4 + +.L16: + fxcpmadd f0, A10, A2, f0 + fxcsmadd f4, A10, A2, f4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + fxcpmadd f1, A10, A4, f1 + fxcsmadd f5, A10, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + fxcpmadd f2, A10, A2, f2 + fxcsmadd f6, A10, A2, f6 + fxcpmadd f10, B4, A2, f10 + fxcsmadd f14, B4, A2, f14 + LFPDUX A2, AO, INC4 + + fxcpmadd f3, A10, A4, f3 + fxcsmadd f7, A10, A4, f7 + LFPDUX A10, BO, INC4 + fxcpmadd f11, B4, A4, f11 + fxcsmadd f15, B4, A4, f15 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + bdnz+ .L16 + .align 4 + +.L17: + fxcpmadd f0, A10, A2, f0 + fxcsmadd f4, A10, A2, f4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + fxcpmadd f1, A10, A4, f1 + fxcsmadd f5, A10, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + fxcpmadd f2, A10, A2, f2 + fxcsmadd f6, A10, A2, f6 + fxcpmadd f10, B4, A2, f10 + fxcsmadd f14, B4, A2, f14 + + fxcpmadd f3, A10, A4, f3 + fxcsmadd f7, A10, A4, f7 + fxcpmadd f11, B4, A4, f11 + fxcsmadd f15, B4, A4, f15 + .align 4 + +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 8 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 3 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + LFPDUX f16, BO, INC4 + fpmr f25, f1 + nop + fpmr f26, f2 + LFPDUX f17, BO2, INC4 + fpmr f27, f3 + nop + + fpmr f28, f8 + LFPDUX f18, BO, INC4 + fpmr f29, f9 + nop + fpmr f30, f10 + LFPDUX f19, BO2, INC4 + fpmr f31, f11 + nop + + fsmfp f0, f4 + LFPDUX f20, BO, INC4 + fsmfp f1, f5 + nop + fsmfp f2, f6 + LFPDUX f21, BO2, INC4 + fsmfp f3, f7 + nop + + fsmfp f8, f12 + LFPDUX f22, BO, INC4 + fsmfp f9, f13 + nop + fsmfp f10, f14 + LFPDUX f23, BO2, INC4 + fsmfp f11, f15 + nop + + fsmtp f4, f24 + LFPDUX f24, BO, INC4 + fsmtp f5, f25 + nop + fsmtp f6, f26 + LFPDUX f25, BO2, INC4 + fsmtp f7, f27 + nop + + fsmtp f12, f28 + LFPDUX f26, BO, INC4 + fsmtp f13, f29 + nop + fsmtp f14, f30 + LFPDUX f27, BO2, INC4 + fsmtp f15, f31 + nop + + fpsub f0, f16, f0 + LFPDUX f28, BO, INC4 + fpsub f8, f17, f8 + nop + fpsub f4, f18, f4 + LFPDUX f29, BO2, INC4 + fpsub f12, f19, f12 + nop + + fpsub f1, f20, f1 + LFPDUX f30, BO, INC4 + fpsub f9, f21, f9 + subi BO, BO, 32 * SIZE + fpsub f5, f22, f5 + LFPDUX f31, BO2, INC4 + fpsub f13, f23, f13 + subi BO2, BO2, 32 * SIZE + + fpsub f2, f24, f2 + fpsub f10, f25, f10 + fpsub f6, f26, f6 + fpsub f14, f27, f14 + fpsub f3, f28, f3 + fpsub f11, f29, f11 + fpsub f7, f30, f7 + fpsub f15, f31, f15 + +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + LFPDUX f20, AO, INC4 + LFPDUX f21, AO2, INC4 + LFPDUX f22, AO, INC4 + LFPDUX f23, AO2, INC4 + + fpsub f0, f16, f0 + LFPDUX f24, AO, INC4 + fpsub f1, f17, f1 + LFPDUX f25, AO2, INC4 + fpsub f2, f18, f2 + LFPDUX f26, AO, INC4 + fpsub f3, f19, f3 + LFPDUX f27, AO2, INC4 + fpsub f4, f20, f4 + LFPDUX f28, AO, INC4 + fpsub f5, f21, f5 + LFPDUX f29, AO2, INC4 + fpsub f6, f22, f6 + LFPDUX f30, AO, INC4 + fpsub f7, f23, f7 + LFPDUX f31, AO2, INC4 + + fpsub f8, f24, f8 + subi AO, AO, 32 * SIZE + fpsub f9, f25, f9 + subi AO2, AO2, 32 * SIZE + fpsub f10, f26, f10 + fpsub f11, f27, f11 + fpsub f12, f28, f12 + fpsub f13, f29, f13 + fpsub f14, f30, f14 + fpsub f15, f31, f15 +#endif + +#ifdef LN + addi AO, AO, 68 * SIZE + addi AO2, AO2, 68 * SIZE + + LFPDUX A1, AO2, INCM4 + LFPDUX A2, AO, INCM4 + LFPDUX A3, AO2, INCM4 + LFPDUX A4, AO, INCM4 + LFPDUX A5, AO2, INCM4 + LFPDUX A6, AO, INCM4 + LFPDUX A7, AO2, INCM4 + LFPDUX A8, AO, INCM4 + + fxsmul f7, A1, f7 + fxsmul f15, A1, f15 + + fxcpnmsub f3, A1, f7, f3 + fxcpnmsub f11, A1, f15, f11 + + fxcsnmsub f6, A2, f7, f6 + fxcsnmsub f14, A2, f15, f14 + + fxcpnmsub f2, A2, f7, f2 + fxcpnmsub f10, A2, f15, f10 + + fxcsnmsub f5, A3, f7, f5 + fxcsnmsub f13, A3, f15, f13 + + fxcpnmsub f1, A3, f7, f1 + fxcpnmsub f9, A3, f15, f9 + + fxcsnmsub f4, A4, f7, f4 + fxcsnmsub f12, A4, f15, f12 + + fxcpnmsub f0, A4, f7, f0 + fxcpnmsub f8, A4, f15, f8 + + fxpmul f3, A5, f3 + fxpmul f11, A5, f11 + + fxcsnmsub f6, A6, f3, f6 + fxcsnmsub f14, A6, f11, f14 + + fxcpnmsub f2, A6, f3, f2 + fxcpnmsub f10, A6, f11, f10 + + fxcsnmsub f5, A7, f3, f5 + fxcsnmsub f13, A7, f11, f13 + + fxcpnmsub f1, A7, f3, f1 + fxcpnmsub f9, A7, f11, f9 + + fxcsnmsub f4, A8, f3, f4 + fxcsnmsub f12, A8, f11, f12 + + fxcpnmsub f0, A8, f3, f0 + fxcpnmsub f8, A8, f11, f8 + + add AO2, AO2, INCM4 + LFPDUX A1, AO, INCM4 + LFPDUX A2, AO2, INCM4 + LFPDUX A3, AO, INCM4 + + add AO2, AO2, INCM4 + LFPDUX A4, AO, INCM4 + LFPDUX A5, AO2, INCM4 + LFPDUX A6, AO, INCM4 + + add AO2, AO2, INCM4 + add AO, AO, INCM4 + LFPDUX A7, AO2, INCM4 + LFPDUX A8, AO, INCM4 + + + fxsmul f6, A1, f6 + fxsmul f14, A1, f14 + + fxcpnmsub f2, A1, f6, f2 + fxcpnmsub f10, A1, f14, f10 + + fxcsnmsub f5, A2, f6, f5 + fxcsnmsub f13, A2, f14, f13 + + fxcpnmsub f1, A2, f6, f1 + fxcpnmsub f9, A2, f14, f9 + + fxcsnmsub f4, A3, f6, f4 + fxcsnmsub f12, A3, f14, f12 + + fxcpnmsub f0, A3, f6, f0 + fxcpnmsub f8, A3, f14, f8 + + fxpmul f2, A4, f2 + fxpmul f10, A4, f10 + + fxcsnmsub f5, A5, f2, f5 + fxcsnmsub f13, A5, f10, f13 + + fxcpnmsub f1, A5, f2, f1 + fxcpnmsub f9, A5, f10, f9 + + fxcsnmsub f4, A6, f2, f4 + fxcsnmsub f12, A6, f10, f12 + + fxcpnmsub f0, A6, f2, f0 + fxcpnmsub f8, A6, f10, f8 + + fxsmul f5, A7, f5 + fxsmul f13, A7, f13 + + fxcpnmsub f1, A7, f5, f1 + fxcpnmsub f9, A7, f13, f9 + + fxcsnmsub f4, A8, f5, f4 + fxcsnmsub f12, A8, f13, f12 + + fxcpnmsub f0, A8, f5, f0 + fxcpnmsub f8, A8, f13, f8 + + add AO2, AO2, INCM4 + add AO, AO, INCM4 + LFPDUX A1, AO2, INCM4 + LFPDUX A2, AO, INCM4 + + subi AO2, AO2, 8 * SIZE + add AO, AO, INCM4 + LFPDUX A3, AO, INCM4 + + subi AO2, AO2, 8 * SIZE + add AO, AO, INCM4 + LFPDUX A4, AO, INCM4 + + addi AO, AO, -4 * SIZE + addi AO2, AO2, -4 * SIZE + + fxpmul f1, A1, f1 + fxpmul f9, A1, f9 + + fxcsnmsub f4, A2, f1, f4 + fxcsnmsub f12, A2, f9, f12 + + fxcpnmsub f0, A2, f1, f0 + fxcpnmsub f8, A2, f9, f8 + + fxsmul f4, A3, f4 + fxsmul f12, A3, f12 + + fxcpnmsub f0, A3, f4, f0 + fxcpnmsub f8, A3, f12, f8 + + fxpmul f0, A4, f0 + fxpmul f8, A4, f8 + +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX A4, AO2, INC4 + + LFPDUX A5, AO, INC4 + LFPDUX A6, AO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A8, AO2, INC4 + + fxpmul f0, A1, f0 + fxpmul f8, A1, f8 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f12, A1, f8, f12 + + fxcpnmsub f1, A2, f0, f1 + fxcpnmsub f9, A2, f8, f9 + + fxcsnmsub f5, A2, f0, f5 + fxcsnmsub f13, A2, f8, f13 + + fxcpnmsub f2, A3, f0, f2 + fxcpnmsub f10, A3, f8, f10 + + fxcsnmsub f6, A3, f0, f6 + fxcsnmsub f14, A3, f8, f14 + + fxcpnmsub f3, A4, f0, f3 + fxcpnmsub f11, A4, f8, f11 + + fxcsnmsub f7, A4, f0, f7 + fxcsnmsub f15, A4, f8, f15 + + fxsmul f4, A5, f4 + fxsmul f12, A5, f12 + + fxcpnmsub f1, A6, f4, f1 + fxcpnmsub f9, A6, f12, f9 + + fxcsnmsub f5, A6, f4, f5 + fxcsnmsub f13, A6, f12, f13 + + fxcpnmsub f2, A7, f4, f2 + fxcpnmsub f10, A7, f12, f10 + + fxcsnmsub f6, A7, f4, f6 + fxcsnmsub f14, A7, f12, f14 + + fxcpnmsub f3, A8, f4, f3 + fxcpnmsub f11, A8, f12, f11 + + fxcsnmsub f7, A8, f4, f7 + fxcsnmsub f15, A8, f12, f15 + + add AO, AO, INC4 + LFPDUX A1, AO2, INC4 + LFPDUX A2, AO, INC4 + LFPDUX A3, AO2, INC4 + + add AO, AO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A5, AO, INC4 + LFPDUX A6, AO2, INC4 + + add AO, AO, INC4 + add AO2, AO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A8, AO2, INC4 + + fxpmul f1, A1, f1 + fxpmul f9, A1, f9 + + fxcsnmsub f5, A1, f1, f5 + fxcsnmsub f13, A1, f9, f13 + + fxcpnmsub f2, A2, f1, f2 + fxcpnmsub f10, A2, f9, f10 + + fxcsnmsub f6, A2, f1, f6 + fxcsnmsub f14, A2, f9, f14 + + fxcpnmsub f3, A3, f1, f3 + fxcpnmsub f11, A3, f9, f11 + + fxcsnmsub f7, A3, f1, f7 + fxcsnmsub f15, A3, f9, f15 + + fxsmul f5, A4, f5 + fxsmul f13, A4, f13 + + fxcpnmsub f2, A5, f5, f2 + fxcpnmsub f10, A5, f13, f10 + + fxcsnmsub f6, A5, f5, f6 + fxcsnmsub f14, A5, f13, f14 + + fxcpnmsub f3, A6, f5, f3 + fxcpnmsub f11, A6, f13, f11 + + fxcsnmsub f7, A6, f5, f7 + fxcsnmsub f15, A6, f13, f15 + + fxpmul f2, A7, f2 + fxpmul f10, A7, f10 + + fxcsnmsub f6, A7, f2, f6 + fxcsnmsub f14, A7, f10, f14 + + fxcpnmsub f3, A8, f2, f3 + fxcpnmsub f11, A8, f10, f11 + + fxcsnmsub f7, A8, f2, f7 + fxcsnmsub f15, A8, f10, f15 + + add AO, AO, INC4 + add AO2, AO2, INC4 + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + + addi AO, AO, 8 * SIZE + addi AO2, AO2, 4 * SIZE + LFPDUX A3, AO2, INC4 + + addi AO, AO, 8 * SIZE + addi AO2, AO2, 4 * SIZE + LFPDUX A4, AO2, INC4 + + subi AO, AO, 64 * SIZE + subi AO2, AO2, 64 * SIZE + + fxsmul f6, A1, f6 + fxsmul f14, A1, f14 + + fxcpnmsub f3, A2, f6, f3 + fxcpnmsub f11, A2, f14, f11 + + fxcsnmsub f7, A2, f6, f7 + fxcsnmsub f15, A2, f14, f15 + + fxpmul f3, A3, f3 + fxpmul f11, A3, f11 + + fxcsnmsub f7, A3, f3, f7 + fxcsnmsub f15, A3, f11, f15 + + fxsmul f7, A4, f7 + fxsmul f15, A4, f15 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + LFPDUX A3, BO, INC4 + LFPDUX A4, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A5, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A6, BO2, INC4 + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxpmul f2, A1, f2 + fxpmul f3, A1, f3 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f5, A1, f1, f5 + fxcsnmsub f6, A1, f2, f6 + fxcsnmsub f7, A1, f3, f7 + + fxcpnmsub f8, A2, f0, f8 + fxcpnmsub f9, A2, f1, f9 + fxcpnmsub f10, A2, f2, f10 + fxcpnmsub f11, A2, f3, f11 + + fxcsnmsub f12, A2, f0, f12 + fxcsnmsub f13, A2, f1, f13 + fxcsnmsub f14, A2, f2, f14 + fxcsnmsub f15, A2, f3, f15 + + fxsmul f4, A3, f4 + fxsmul f5, A3, f5 + fxsmul f6, A3, f6 + fxsmul f7, A3, f7 + + fxcpnmsub f8, A4, f4, f8 + fxcpnmsub f9, A4, f5, f9 + fxcpnmsub f10, A4, f6, f10 + fxcpnmsub f11, A4, f7, f11 + + fxcsnmsub f12, A4, f4, f12 + fxcsnmsub f13, A4, f5, f13 + fxcsnmsub f14, A4, f6, f14 + fxcsnmsub f15, A4, f7, f15 + + fxpmul f8, A5, f8 + fxpmul f9, A5, f9 + fxpmul f10, A5, f10 + fxpmul f11, A5, f11 + + fxcsnmsub f12, A5, f8, f12 + fxcsnmsub f13, A5, f9, f13 + fxcsnmsub f14, A5, f10, f14 + fxcsnmsub f15, A5, f11, f15 + + fxsmul f12, A6, f12 + fxsmul f13, A6, f13 + fxsmul f14, A6, f14 + fxsmul f15, A6, f15 + +#endif + +#ifdef RT + addi BO, BO, 20 * SIZE + addi BO2, BO2, 20 * SIZE + + LFPDUX A1, BO2, INCM4 + LFPDUX A2, BO, INCM4 + + LFPDUX A3, BO2, INCM4 + LFPDUX A4, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A5, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A6, BO, INCM4 + subi BO, BO, 4 * SIZE + subi BO2, BO2, 4 * SIZE + + fxsmul f12, A1, f12 + fxsmul f13, A1, f13 + fxsmul f14, A1, f14 + fxsmul f15, A1, f15 + + fxcpnmsub f8, A1, f12, f8 + fxcpnmsub f9, A1, f13, f9 + fxcpnmsub f10, A1, f14, f10 + fxcpnmsub f11, A1, f15, f11 + + fxcsnmsub f4, A2, f12, f4 + fxcsnmsub f5, A2, f13, f5 + fxcsnmsub f6, A2, f14, f6 + fxcsnmsub f7, A2, f15, f7 + + fxcpnmsub f0, A2, f12, f0 + fxcpnmsub f1, A2, f13, f1 + fxcpnmsub f2, A2, f14, f2 + fxcpnmsub f3, A2, f15, f3 + + fxpmul f8, A3, f8 + fxpmul f9, A3, f9 + fxpmul f10, A3, f10 + fxpmul f11, A3, f11 + + fxcsnmsub f4, A4, f8, f4 + fxcsnmsub f5, A4, f9, f5 + fxcsnmsub f6, A4, f10, f6 + fxcsnmsub f7, A4, f11, f7 + + fxcpnmsub f0, A4, f8, f0 + fxcpnmsub f1, A4, f9, f1 + fxcpnmsub f2, A4, f10, f2 + fxcpnmsub f3, A4, f11, f3 + + fxsmul f4, A5, f4 + fxsmul f5, A5, f5 + fxsmul f6, A5, f6 + fxsmul f7, A5, f7 + + fxcpnmsub f0, A5, f4, f0 + fxcpnmsub f1, A5, f5, f1 + fxcpnmsub f2, A5, f6, f2 + fxcpnmsub f3, A5, f7, f3 + + fxpmul f0, A6, f0 + fxpmul f1, A6, f1 + fxpmul f2, A6, f2 + fxpmul f3, A6, f3 + +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE + subi CO3, CO3, 8 * SIZE + subi CO4, CO4, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f4, BO, INC4 + STFPDUX f12, BO2, INC4 + STFPDUX f1, BO, INC4 + STFPDUX f9, BO2, INC4 + STFPDUX f5, BO, INC4 + STFPDUX f13, BO2, INC4 + STFPDUX f2, BO, INC4 + STFPDUX f10, BO2, INC4 + STFPDUX f6, BO, INC4 + STFPDUX f14, BO2, INC4 + STFPDUX f3, BO, INC4 + STFPDUX f11, BO2, INC4 + STFPDUX f7, BO, INC4 + STFPDUX f15, BO2, INC4 + + subi BO, BO, 32 * SIZE + subi BO2, BO2, 32 * SIZE + + STFDUX f0, CO1, INC + STFDUX f4, CO1, INC + STFDUX f1, CO1, INC + STFDUX f5, CO1, INC + STFDUX f2, CO1, INC + STFDUX f6, CO1, INC + STFDUX f3, CO1, INC + STFDUX f7, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f4, CO2, INC + STFSDUX f1, CO2, INC + STFSDUX f5, CO2, INC + STFSDUX f2, CO2, INC + STFSDUX f6, CO2, INC + STFSDUX f3, CO2, INC + STFSDUX f7, CO2, INC + + STFDUX f8, CO3, INC + STFDUX f12, CO3, INC + STFDUX f9, CO3, INC + STFDUX f13, CO3, INC + STFDUX f10, CO3, INC + STFDUX f14, CO3, INC + STFDUX f11, CO3, INC + STFDUX f15, CO3, INC + + STFSDUX f8, CO4, INC + STFSDUX f12, CO4, INC + STFSDUX f9, CO4, INC + STFSDUX f13, CO4, INC + STFSDUX f10, CO4, INC + STFSDUX f14, CO4, INC + STFSDUX f11, CO4, INC + STFSDUX f15, CO4, INC + +#else + STFPDUX f0, AO, INC4 + STFPDUX f1, AO2, INC4 + STFPDUX f2, AO, INC4 + STFPDUX f3, AO2, INC4 + STFPDUX f4, AO, INC4 + STFPDUX f5, AO2, INC4 + STFPDUX f6, AO, INC4 + STFPDUX f7, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f9, AO2, INC4 + STFPDUX f10, AO, INC4 + STFPDUX f11, AO2, INC4 + STFPDUX f12, AO, INC4 + STFPDUX f13, AO2, INC4 + STFPDUX f14, AO, INC4 + STFPDUX f15, AO2, INC4 + + subi AO, AO, 32 * SIZE + subi AO2, AO2, 32 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + STFDUX f5, CO2, INC + STFSDUX f5, CO2, INC + STFDUX f6, CO2, INC + STFSDUX f6, CO2, INC + STFDUX f7, CO2, INC + STFSDUX f7, CO2, INC + + STFDUX f8, CO3, INC + STFSDUX f8, CO3, INC + STFDUX f9, CO3, INC + STFSDUX f9, CO3, INC + STFDUX f10, CO3, INC + STFSDUX f10, CO3, INC + STFDUX f11, CO3, INC + STFSDUX f11, CO3, INC + + STFDUX f12, CO4, INC + STFSDUX f12, CO4, INC + STFDUX f13, CO4, INC + STFSDUX f13, CO4, INC + STFDUX f14, CO4, INC + STFSDUX f14, CO4, INC + STFDUX f15, CO4, INC + STFSDUX f15, CO4, INC +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE + subi CO3, CO3, 8 * SIZE + subi CO4, CO4, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 3 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 8 +#endif + +#ifdef LN + subi KK, KK, 8 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L11 + .align 4 + +.L20: + andi. I, M, 4 + beq .L30 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, KK, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L24 +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + srawi. r0, TEMP, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L24 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX B3, BO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A5, AO, INC4 + LFPDUX B5, BO, INC4 + LFPDUX A6, AO2, INC4 + LFPDUX B6, BO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A9, BO, INC4 + LFPDUX A10, BO2, INC4 + bdz- .L23 + .align 4 + +.L22: + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + fxcpmadd f8, B2, A1, f8 + nop + fxcsmadd f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + fxcpmadd f9, B2, A2, f9 + nop + fxcsmadd f13, B2, A2, f13 + LFPDUX B2, BO2, INC4 + + fxcpmadd f0, B3, A3, f0 + nop + fxcsmadd f4, B3, A3, f4 + LFPDUX A2, AO2, INC4 + fxcpmadd f8, B4, A3, f8 + nop + fxcsmadd f12, B4, A3, f12 + LFPDUX A3, AO, INC4 + + fxcpmadd f1, B3, A4, f1 + nop + fxcsmadd f5, B3, A4, f5 + LFPDUX B3, BO, INC4 + fxcpmadd f9, B4, A4, f9 + nop + fxcsmadd f13, B4, A4, f13 + LFPDUX B4, BO2, INC4 + + fxcpmadd f0, B5, A5, f0 + nop + fxcsmadd f4, B5, A5, f4 + LFPDUX A4, AO2, INC4 + fxcpmadd f8, B6, A5, f8 + nop + fxcsmadd f12, B6, A5, f12 + LFPDUX A5, AO, INC4 + + fxcpmadd f1, B5, A6, f1 + nop + fxcsmadd f5, B5, A6, f5 + LFPDUX B5, BO, INC4 + fxcpmadd f9, B6, A6, f9 + nop + fxcsmadd f13, B6, A6, f13 + LFPDUX B6, BO2, INC4 + + fxcpmadd f0, A9, A7, f0 + nop + fxcsmadd f4, A9, A7, f4 + LFPDUX A6, AO2, INC4 + fxcpmadd f8, A10, A7, f8 + nop + fxcsmadd f12, A10, A7, f12 + LFPDUX A7, AO, INC4 + + fxcpmadd f1, A9, A8, f1 + nop + fxcsmadd f5, A9, A8, f5 + LFPDUX A9, BO, INC4 + fxcpmadd f9, A10, A8, f9 + nop + fxcsmadd f13, A10, A8, f13 + LFPDUX A10, BO2, INC4 + bdnz+ .L22 + .align 4 + +.L23: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + + fxcpmadd f0, B3, A3, f0 + fxcsmadd f4, B3, A3, f4 + fxcpmadd f8, B4, A3, f8 + fxcsmadd f12, B4, A3, f12 + + fxcpmadd f1, B3, A4, f1 + fxcsmadd f5, B3, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, A4, f13 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f4, B5, A5, f4 + fxcpmadd f8, B6, A5, f8 + fxcsmadd f12, B6, A5, f12 + + fxcpmadd f1, B5, A6, f1 + fxcsmadd f5, B5, A6, f5 + fxcpmadd f9, B6, A6, f9 + fxcsmadd f13, B6, A6, f13 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f4, A9, A7, f4 + fxcpmadd f8, A10, A7, f8 + fxcsmadd f12, A10, A7, f12 + + fxcpmadd f1, A9, A8, f1 + fxcsmadd f5, A9, A8, f5 + fxcpmadd f9, A10, A8, f9 + fxcsmadd f13, A10, A8, f13 + .align 4 + +.L24: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L28 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L28 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + bdz- .L27 + .align 4 + +.L26: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + bdnz+ .L26 + .align 4 + +.L27: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + .align 4 + +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fpmr f25, f1 + fpmr f28, f8 + fpmr f29, f9 + + fsmfp f0, f4 + fsmfp f1, f5 + fsmfp f8, f12 + fsmfp f9, f13 + + fsmtp f4, f24 + fsmtp f5, f25 + fsmtp f12, f28 + fsmtp f13, f29 + + LFPDUX f16, BO, INC4 + LFPDUX f17, BO2, INC4 + LFPDUX f18, BO, INC4 + LFPDUX f19, BO2, INC4 + + LFPDUX f20, BO, INC4 + LFPDUX f21, BO2, INC4 + LFPDUX f22, BO, INC4 + LFPDUX f23, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + fpsub f0, f16, f0 + fpsub f8, f17, f8 + fpsub f4, f18, f4 + fpsub f12, f19, f12 + + fpsub f1, f20, f1 + fpsub f9, f21, f9 + fpsub f5, f22, f5 + fpsub f13, f23, f13 +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + LFPDUX f20, AO, INC4 + LFPDUX f21, AO2, INC4 + LFPDUX f22, AO, INC4 + LFPDUX f23, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f4, f18, f4 + fpsub f5, f19, f5 + + fpsub f8, f20, f8 + fpsub f9, f21, f9 + fpsub f12, f22, f12 + fpsub f13, f23, f13 +#endif + +#ifdef LN + addi AO, AO, 20 * SIZE + addi AO2, AO2, 20 * SIZE + + LFPDUX A1, AO2, INCM4 + LFPDUX A2, AO, INCM4 + LFPDUX A3, AO2, INCM4 + LFPDUX A4, AO, INCM4 + + add AO2, AO2, INCM4 + LFPDUX A5, AO, INCM4 + add AO2, AO2, INCM4 + LFPDUX A6, AO, INCM4 + + addi AO, AO, -4 * SIZE + addi AO2, AO2, -4 * SIZE + + fxsmul f5, A1, f5 + fxsmul f13, A1, f13 + + fxcpnmsub f1, A1, f5, f1 + fxcpnmsub f9, A1, f13, f9 + + fxcsnmsub f4, A2, f5, f4 + fxcsnmsub f12, A2, f13, f12 + + fxcpnmsub f0, A2, f5, f0 + fxcpnmsub f8, A2, f13, f8 + + fxpmul f1, A3, f1 + fxpmul f9, A3, f9 + + fxcsnmsub f4, A4, f1, f4 + fxcsnmsub f12, A4, f9, f12 + + fxcpnmsub f0, A4, f1, f0 + fxcpnmsub f8, A4, f9, f8 + + fxsmul f4, A5, f4 + fxsmul f12, A5, f12 + + fxcpnmsub f0, A5, f4, f0 + fxcpnmsub f8, A5, f12, f8 + + fxpmul f0, A6, f0 + fxpmul f8, A6, f8 +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX A4, AO2, INC4 + + add AO, AO, INC4 + LFPDUX A5, AO2, INC4 + add AO, AO, INC4 + LFPDUX A6, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE + + fxpmul f0, A1, f0 + fxpmul f8, A1, f8 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f12, A1, f8, f12 + + fxcpnmsub f1, A2, f0, f1 + fxcpnmsub f9, A2, f8, f9 + + fxcsnmsub f5, A2, f0, f5 + fxcsnmsub f13, A2, f8, f13 + + fxsmul f4, A3, f4 + fxsmul f12, A3, f12 + + fxcpnmsub f1, A4, f4, f1 + fxcpnmsub f9, A4, f12, f9 + + fxcsnmsub f5, A4, f4, f5 + fxcsnmsub f13, A4, f12, f13 + + fxpmul f1, A5, f1 + fxpmul f9, A5, f9 + + fxcsnmsub f5, A5, f1, f5 + fxcsnmsub f13, A5, f9, f13 + + fxsmul f5, A6, f5 + fxsmul f13, A6, f13 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + LFPDUX A3, BO, INC4 + LFPDUX A4, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A5, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A6, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f5, A1, f1, f5 + + fxcpnmsub f8, A2, f0, f8 + fxcpnmsub f9, A2, f1, f9 + fxcsnmsub f12, A2, f0, f12 + fxcsnmsub f13, A2, f1, f13 + + fxsmul f4, A3, f4 + fxsmul f5, A3, f5 + fxcpnmsub f8, A4, f4, f8 + fxcpnmsub f9, A4, f5, f9 + + fxcsnmsub f12, A4, f4, f12 + fxcsnmsub f13, A4, f5, f13 + + fxpmul f8, A5, f8 + fxpmul f9, A5, f9 + fxcsnmsub f12, A5, f8, f12 + fxcsnmsub f13, A5, f9, f13 + + fxsmul f12, A6, f12 + fxsmul f13, A6, f13 +#endif + +#ifdef RT + addi BO, BO, 20 * SIZE + addi BO2, BO2, 20 * SIZE + + LFPDUX A1, BO2, INCM4 + LFPDUX A2, BO, INCM4 + + LFPDUX A3, BO2, INCM4 + LFPDUX A4, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A5, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A6, BO, INCM4 + subi BO, BO, 4 * SIZE + subi BO2, BO2, 4 * SIZE + + fxsmul f12, A1, f12 + fxsmul f13, A1, f13 + fxcpnmsub f8, A1, f12, f8 + fxcpnmsub f9, A1, f13, f9 + + fxcsnmsub f4, A2, f12, f4 + fxcsnmsub f5, A2, f13, f5 + fxcpnmsub f0, A2, f12, f0 + fxcpnmsub f1, A2, f13, f1 + + fxpmul f8, A3, f8 + fxpmul f9, A3, f9 + fxcsnmsub f4, A4, f8, f4 + fxcsnmsub f5, A4, f9, f5 + + fxcpnmsub f0, A4, f8, f0 + fxcpnmsub f1, A4, f9, f1 + + fxsmul f4, A5, f4 + fxsmul f5, A5, f5 + fxcpnmsub f0, A5, f4, f0 + fxcpnmsub f1, A5, f5, f1 + + fxpmul f0, A6, f0 + fxpmul f1, A6, f1 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f4, BO, INC4 + STFPDUX f12, BO2, INC4 + STFPDUX f1, BO, INC4 + STFPDUX f9, BO2, INC4 + STFPDUX f5, BO, INC4 + STFPDUX f13, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + STFDUX f0, CO1, INC + STFDUX f4, CO1, INC + STFDUX f1, CO1, INC + STFDUX f5, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f4, CO2, INC + STFSDUX f1, CO2, INC + STFSDUX f5, CO2, INC + + STFDUX f8, CO3, INC + STFDUX f12, CO3, INC + STFDUX f9, CO3, INC + STFDUX f13, CO3, INC + + STFSDUX f8, CO4, INC + STFSDUX f12, CO4, INC + STFSDUX f9, CO4, INC + STFSDUX f13, CO4, INC +#else + STFPDUX f0, AO, INC4 + STFPDUX f1, AO2, INC4 + STFPDUX f4, AO, INC4 + STFPDUX f5, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f9, AO2, INC4 + STFPDUX f12, AO, INC4 + STFPDUX f13, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + STFDUX f5, CO2, INC + STFSDUX f5, CO2, INC + + STFDUX f8, CO3, INC + STFSDUX f8, CO3, INC + STFDUX f9, CO3, INC + STFSDUX f9, CO3, INC + STFDUX f12, CO4, INC + STFSDUX f12, CO4, INC + STFDUX f13, CO4, INC + STFSDUX f13, CO4, INC +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L30: + andi. I, M, 2 + beq .L40 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L34 +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L34 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L33 + .align 4 + +.L32: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX B1, BO, INC4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + LFPDUX B2, BO2, INC4 + LFPDUX A1, AO, INC4 + + fxcpmadd f0, B3, A2, f0 + fxcsmadd f4, B3, A2, f4 + LFPDUX B3, BO, INC4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + LFPDUX B4, BO2, INC4 + LFPDUX A2, AO2, INC4 + + fxcpmadd f0, A5, A3, f0 + fxcsmadd f4, A5, A3, f4 + LFPDUX A5, BO, INC4 + fxcpmadd f8, A6, A3, f8 + fxcsmadd f12, A6, A3, f12 + LFPDUX A6, BO2, INC4 + LFPDUX A3, AO, INC4 + + fxcpmadd f0, A7, A4, f0 + fxcsmadd f4, A7, A4, f4 + LFPDUX A7, BO, INC4 + fxcpmadd f8, A8, A4, f8 + fxcsmadd f12, A8, A4, f12 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L32 + .align 4 + +.L33: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + + fxcpmadd f0, B3, A2, f0 + fxcsmadd f4, B3, A2, f4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + + fxcpmadd f0, A5, A3, f0 + fxcsmadd f4, A5, A3, f4 + fxcpmadd f8, A6, A3, f8 + fxcsmadd f12, A6, A3, f12 + + fxcpmadd f0, A7, A4, f0 + fxcsmadd f4, A7, A4, f4 + fxcpmadd f8, A8, A4, f8 + fxcsmadd f12, A8, A4, f12 + .align 4 + +.L34: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L38 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L38 +#endif + + LFPDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdz- .L37 + .align 4 + +.L36: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX B1, BO, INC4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + LFPDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdnz+ .L36 + .align 4 + +.L37: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + .align 4 + +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fpmr f28, f8 + + fsmfp f0, f4 + fsmfp f8, f12 + fsmtp f4, f24 + fsmtp f12, f28 + + LFPDUX f16, BO, INC4 + LFPDUX f17, BO2, INC4 + LFPDUX f18, BO, INC4 + LFPDUX f19, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f8, f17, f8 + fpsub f4, f18, f4 + fpsub f12, f19, f12 +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f4, f17, f4 + fpsub f8, f18, f8 + fpsub f12, f19, f12 +#endif + +#ifdef LN + addi AO, AO, 8 * SIZE + addi AO2, AO2, 8 * SIZE + + LFPDUX A1, AO2, INCM4 + LFPDUX A2, AO, INCM4 + + addi AO, AO, -4 * SIZE + addi AO2, AO2, -4 * SIZE + + fxsmul f4, A1, f4 + fxsmul f12, A1, f12 + + fxcpnmsub f0, A1, f4, f0 + fxcpnmsub f8, A1, f12, f8 + + fxpmul f0, A2, f0 + fxpmul f8, A2, f8 +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + + subi AO, AO, 4 * SIZE + subi AO2, AO2, 4 * SIZE + + fxpmul f0, A1, f0 + fxpmul f8, A1, f8 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f12, A1, f8, f12 + + fxsmul f4, A2, f4 + fxsmul f12, A2, f12 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + LFPDUX A3, BO, INC4 + LFPDUX A4, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A5, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A6, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + fxpmul f0, A1, f0 + fxcsnmsub f4, A1, f0, f4 + fxcpnmsub f8, A2, f0, f8 + fxcsnmsub f12, A2, f0, f12 + + fxsmul f4, A3, f4 + fxcpnmsub f8, A4, f4, f8 + fxcsnmsub f12, A4, f4, f12 + + fxpmul f8, A5, f8 + fxcsnmsub f12, A5, f8, f12 + fxsmul f12, A6, f12 +#endif + +#ifdef RT + addi BO, BO, 20 * SIZE + addi BO2, BO2, 20 * SIZE + + LFPDUX A1, BO2, INCM4 + LFPDUX A2, BO, INCM4 + + LFPDUX A3, BO2, INCM4 + LFPDUX A4, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A5, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A6, BO, INCM4 + subi BO, BO, 4 * SIZE + subi BO2, BO2, 4 * SIZE + + fxsmul f12, A1, f12 + fxcpnmsub f8, A1, f12, f8 + fxcsnmsub f4, A2, f12, f4 + fxcpnmsub f0, A2, f12, f0 + + fxpmul f8, A3, f8 + fxcsnmsub f4, A4, f8, f4 + fxcpnmsub f0, A4, f8, f0 + + fxsmul f4, A5, f4 + fxcpnmsub f0, A5, f4, f0 + fxpmul f0, A6, f0 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f4, BO, INC4 + STFPDUX f12, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + STFDUX f0, CO1, INC + STFDUX f4, CO1, INC + STFSDUX f0, CO2, INC + STFSDUX f4, CO2, INC + + STFDUX f8, CO3, INC + STFDUX f12, CO3, INC + STFSDUX f8, CO4, INC + STFSDUX f12, CO4, INC + +#else + STFPDUX f0, AO, INC4 + STFPDUX f4, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f12, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + + STFDUX f8, CO3, INC + STFSDUX f8, CO3, INC + STFDUX f12, CO4, INC + STFSDUX f12, CO4, INC +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L40: + andi. I, M, 1 + beq .L49 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L44 +#else + +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, BO, - 4 * SIZE + fpmr f2, f0 + addi BO2, BO, 2 * SIZE + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L44 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L43 + .align 4 + +.L42: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A1, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A1, B4, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A1, AO, INC4 + + fxcpmadd f0, A2, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A2, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A2, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A2, A8, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A2, AO2, INC4 + + fxcpmadd f0, A3, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A3, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A3, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A3, B4, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A3, AO, INC4 + + fxcpmadd f0, A4, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A4, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A4, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A4, A8, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L42 + .align 4 + +.L43: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A1, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A1, B4, f3 + LFPDUX B4, BO2, INC4 + + fxcpmadd f0, A2, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A2, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A2, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A2, A8, f3 + LFPDUX A8, BO2, INC4 + + fxcpmadd f0, A3, B1, f0 + fxcpmadd f1, A3, B2, f1 + fxcsmadd f2, A3, B3, f2 + fxcsmadd f3, A3, B4, f3 + + fxcpmadd f0, A4, A5, f0 + fxcpmadd f1, A4, A6, f1 + fxcsmadd f2, A4, A7, f2 + fxcsmadd f3, A4, A8, f3 + .align 4 + +.L44: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L48 +#else + andi. r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L48 +#endif + + LFDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC + bdz- .L47 + .align 4 + +.L46: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC + bdnz+ .L46 + .align 4 + +.L47: + fxcpmadd f0, A1, B1, f0 + fxcpmadd f1, A1, B2, f1 + addi AO2, AO, 2 * SIZE + .align 4 + +.L48: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC4 + LFPDX f17, BO2, INC4 + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#else + LFPDX f16, AO, INC4 + LFPDX f17, AO2, INC4 + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#endif + +#if defined(LN) || defined(LT) + LFPDX A1, AO, INC4 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 +#endif + +#ifdef RN + LFD A1, (4 + 0) * SIZE(BO) + LFD A2, (4 + 1) * SIZE(BO) + LFD A3, (4 + 2) * SIZE(BO) + LFD A4, (4 + 3) * SIZE(BO) + + LFD A5, (4 + 5) * SIZE(BO) + LFD A6, (4 + 6) * SIZE(BO) + LFD A7, (4 + 7) * SIZE(BO) + LFD A8, (4 + 10) * SIZE(BO) + + LFD A9, (4 + 11) * SIZE(BO) + LFD A10, (4 + 15) * SIZE(BO) + + fsmtp f2, f0 + fsmtp f3, f1 + + fmul f0, A1, f0 + fnmsub f2, A2, f0, f2 + fnmsub f1, A3, f0, f1 + fnmsub f3, A4, f0, f3 + + fmul f2, A5, f2 + fnmsub f1, A6, f2, f1 + fnmsub f3, A7, f2, f3 + + fmul f1, A8, f1 + fnmsub f3, A9, f1, f3 + + fmul f3, A10, f3 + + fsmfp f0, f2 + fsmfp f1, f3 +#endif + +#ifdef RT + LFD A1, (4 + 15) * SIZE(BO) + LFD A2, (4 + 14) * SIZE(BO) + LFD A3, (4 + 13) * SIZE(BO) + LFD A4, (4 + 12) * SIZE(BO) + + LFD A5, (4 + 10) * SIZE(BO) + LFD A6, (4 + 9) * SIZE(BO) + LFD A7, (4 + 8) * SIZE(BO) + LFD A8, (4 + 5) * SIZE(BO) + + LFD A9, (4 + 4) * SIZE(BO) + LFD A10, (4 + 0) * SIZE(BO) + + fsmtp f2, f0 + fsmtp f3, f1 + + fmul f3, A1, f3 + fnmsub f1, A2, f3, f1 + fnmsub f2, A3, f3, f2 + fnmsub f0, A4, f3, f0 + + fmul f1, A5, f1 + fnmsub f2, A6, f1, f2 + fnmsub f0, A7, f1, f0 + + fmul f2, A8, f2 + fnmsub f0, A9, f2, f0 + + fmul f0, A10, f0 + + fsmfp f0, f2 + fsmfp f1, f3 +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC4 + STFPDX f1, BO2, INC4 +#else + STFPDX f0, AO, INC4 + STFPDX f1, AO2, INC4 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO2, INC + STFDUX f1, CO3, INC + STFSDUX f1, CO4, INC + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +.L49: +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 4 * SIZE +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + bgt+ .L10 + .align 4 + +.L50: + andi. J, N, 2 + beq .L90 + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + +#if defined(LN) || defined(RT) + addi AORIG, A, -2 * SIZE +#else + addi AO, A, -2 * SIZE +#endif +#ifndef RT + add C, CO2, LDC +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 3 + ble .L60 + .align 4 + +.L51: +#if defined(LT) || defined(RN) + fpmr f4, f0 + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + + srawi. r0, KK, 2 + fpmr f3, f0 + mtspr CTR, r0 + fpmr f7, f0 + ble .L54 +#else + +#ifdef LN + slwi r0, K, 3 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 3 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f4, f0 + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + + srawi. r0, TEMP, 2 + fpmr f3, f0 + mtspr CTR, r0 + fpmr f7, f0 + ble .L54 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L53 + .align 4 + +.L52: + fxcpmadd f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + nop + fxcsmadd f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B2, A7, f2 + nop + fxcsmadd f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B2, A8, f3 + nop + fxcsmadd f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + LFPDUX B2, BO, INC2 + fxcsmadd f4, B3, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B3, A3, f2 + nop + fxcsmadd f6, B3, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B4, A5, f0 + LFPDUX B3, BO, INC2 + fxcsmadd f4, B4, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B4, A6, f1 + nop + fxcsmadd f5, B4, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B4, A7, f2 + nop + fxcsmadd f6, B4, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B4, A8, f3 + nop + fxcsmadd f7, B4, A8, f7 + LFPDUX A8, AO, INC2 + bdnz+ .L52 + .align 4 + +.L53: + fxcpmadd f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + nop + fxcsmadd f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + nop + fxcsmadd f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B2, A7, f2 + nop + fxcsmadd f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B2, A8, f3 + nop + fxcsmadd f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + fxcsmadd f4, B3, A1, f4 + fxcpmadd f1, B3, A2, f1 + fxcsmadd f5, B3, A2, f5 + + fxcpmadd f2, B3, A3, f2 + fxcsmadd f6, B3, A3, f6 + fxcpmadd f3, B3, A4, f3 + fxcsmadd f7, B3, A4, f7 + + fxcpmadd f0, B4, A5, f0 + fxcsmadd f4, B4, A5, f4 + fxcpmadd f1, B4, A6, f1 + fxcsmadd f5, B4, A6, f5 + + fxcpmadd f2, B4, A7, f2 + fxcsmadd f6, B4, A7, f6 + fxcpmadd f3, B4, A8, f3 + fxcsmadd f7, B4, A8, f7 + .align 4 + +.L54: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L58 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L58 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + bdz- .L57 + .align 4 + +.L56: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L56 + .align 4 + +.L57: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + + fxcpmadd f2, B1, A3, f2 + fxcsmadd f6, B1, A3, f6 + fxcpmadd f3, B1, A4, f3 + fxcsmadd f7, B1, A4, f7 + .align 4 + +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 8 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 3 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fpmr f25, f1 + fpmr f26, f2 + fpmr f27, f3 + + fsmfp f0, f4 + fsmfp f1, f5 + fsmfp f2, f6 + fsmfp f3, f7 + + fsmtp f4, f24 + fsmtp f5, f25 + fsmtp f6, f26 + fsmtp f7, f27 + + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + LFPDUX f18, BO, INC2 + LFPDUX f19, BO, INC2 + + LFPDUX f20, BO, INC2 + LFPDUX f21, BO, INC2 + LFPDUX f22, BO, INC2 + LFPDUX f23, BO, INC2 + + subi BO, BO, 16 * SIZE + + fpsub f0, f16, f0 + fpsub f4, f17, f4 + fpsub f1, f18, f1 + fpsub f5, f19, f5 + + fpsub f2, f20, f2 + fpsub f6, f21, f6 + fpsub f3, f22, f3 + fpsub f7, f23, f7 + +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + LFPDUX f18, AO, INC2 + LFPDUX f19, AO, INC2 + + LFPDUX f20, AO, INC2 + LFPDUX f21, AO, INC2 + LFPDUX f22, AO, INC2 + LFPDUX f23, AO, INC2 + + subi AO, AO, 16 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 + fpsub f4, f20, f4 + fpsub f5, f21, f5 + fpsub f6, f22, f6 + fpsub f7, f23, f7 +#endif + +#ifdef LN + addi AO, AO, 66 * SIZE + + LFPDUX A1, AO, INCM2 + LFPDUX A2, AO, INCM2 + LFPDUX A3, AO, INCM2 + LFPDUX A4, AO, INCM2 + LFPDUX A5, AO, INCM2 + LFPDUX A6, AO, INCM2 + LFPDUX A7, AO, INCM2 + LFPDUX A8, AO, INCM2 + + fxsmul f7, A1, f7 + fxcpnmsub f3, A1, f7, f3 + fxcsnmsub f6, A2, f7, f6 + fxcpnmsub f2, A2, f7, f2 + + fxcsnmsub f5, A3, f7, f5 + fxcpnmsub f1, A3, f7, f1 + fxcsnmsub f4, A4, f7, f4 + fxcpnmsub f0, A4, f7, f0 + + fxpmul f3, A5, f3 + fxcsnmsub f6, A6, f3, f6 + fxcpnmsub f2, A6, f3, f2 + + fxcsnmsub f5, A7, f3, f5 + fxcpnmsub f1, A7, f3, f1 + fxcsnmsub f4, A8, f3, f4 + fxcpnmsub f0, A8, f3, f0 + + add AO, AO, INCM2 + LFPDUX A1, AO, INCM2 + LFPDUX A2, AO, INCM2 + LFPDUX A3, AO, INCM2 + + add AO, AO, INCM2 + LFPDUX A4, AO, INCM2 + LFPDUX A5, AO, INCM2 + LFPDUX A6, AO, INCM2 + + add AO, AO, INCM2 + add AO, AO, INCM2 + LFPDUX A7, AO, INCM2 + LFPDUX A8, AO, INCM2 + + fxsmul f6, A1, f6 + fxcpnmsub f2, A1, f6, f2 + fxcsnmsub f5, A2, f6, f5 + fxcpnmsub f1, A2, f6, f1 + fxcsnmsub f4, A3, f6, f4 + fxcpnmsub f0, A3, f6, f0 + + fxpmul f2, A4, f2 + fxcsnmsub f5, A5, f2, f5 + fxcpnmsub f1, A5, f2, f1 + fxcsnmsub f4, A6, f2, f4 + fxcpnmsub f0, A6, f2, f0 + + fxsmul f5, A7, f5 + fxcpnmsub f1, A7, f5, f1 + fxcsnmsub f4, A8, f5, f4 + fxcpnmsub f0, A8, f5, f0 + + add AO, AO, INCM2 + add AO, AO, INCM2 + LFPDUX A1, AO, INCM2 + LFPDUX A2, AO, INCM2 + + subi AO, AO, 6 * SIZE + LFPDUX A3, AO, INCM2 + subi AO, AO, 6 * SIZE + LFPDUX A4, AO, INCM2 + + addi AO, AO, -2 * SIZE + + fxpmul f1, A1, f1 + fxcsnmsub f4, A2, f1, f4 + fxcpnmsub f0, A2, f1, f0 + + fxsmul f4, A3, f4 + fxcpnmsub f0, A3, f4, f0 + + fxpmul f0, A4, f0 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + + fxpmul f0, A1, f0 + fxcsnmsub f4, A1, f0, f4 + fxcpnmsub f1, A2, f0, f1 + fxcsnmsub f5, A2, f0, f5 + fxcpnmsub f2, A3, f0, f2 + fxcsnmsub f6, A3, f0, f6 + fxcpnmsub f3, A4, f0, f3 + fxcsnmsub f7, A4, f0, f7 + + fxsmul f4, A5, f4 + fxcpnmsub f1, A6, f4, f1 + fxcsnmsub f5, A6, f4, f5 + fxcpnmsub f2, A7, f4, f2 + fxcsnmsub f6, A7, f4, f6 + fxcpnmsub f3, A8, f4, f3 + fxcsnmsub f7, A8, f4, f7 + + add AO, AO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + + add AO, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + + add AO, AO, INC2 + add AO, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + + fxpmul f1, A1, f1 + fxcsnmsub f5, A1, f1, f5 + fxcpnmsub f2, A2, f1, f2 + fxcsnmsub f6, A2, f1, f6 + fxcpnmsub f3, A3, f1, f3 + fxcsnmsub f7, A3, f1, f7 + + fxsmul f5, A4, f5 + fxcpnmsub f2, A5, f5, f2 + fxcsnmsub f6, A5, f5, f6 + fxcpnmsub f3, A6, f5, f3 + fxcsnmsub f7, A6, f5, f7 + + fxpmul f2, A7, f2 + fxcsnmsub f6, A7, f2, f6 + fxcpnmsub f3, A8, f2, f3 + fxcsnmsub f7, A8, f2, f7 + + add AO, AO, INC2 + add AO, AO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + + addi AO, AO, 6 * SIZE + LFPDUX A3, AO, INC2 + addi AO, AO, 6 * SIZE + LFPDUX A4, AO, INC2 + + subi AO, AO, 64 * SIZE + + fxsmul f6, A1, f6 + fxcpnmsub f3, A2, f6, f3 + fxcsnmsub f7, A2, f6, f7 + + fxpmul f3, A3, f3 + fxcsnmsub f7, A3, f3, f7 + + fxsmul f7, A4, f7 +#endif + +#ifdef RN + LFPDUX A1, BO, INC2 + LFPDUX A2, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxpmul f2, A1, f2 + fxpmul f3, A1, f3 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f5, A1, f1, f5 + fxcsnmsub f6, A1, f2, f6 + fxcsnmsub f7, A1, f3, f7 + + fxsmul f4, A2, f4 + fxsmul f5, A2, f5 + fxsmul f6, A2, f6 + fxsmul f7, A2, f7 +#endif + +#ifdef RT + LFPDUX A2, BO, INC2 + LFPDUX A1, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxsmul f4, A1, f4 + fxsmul f5, A1, f5 + fxsmul f6, A1, f6 + fxsmul f7, A1, f7 + + fxcpnmsub f0, A1, f4, f0 + fxcpnmsub f1, A1, f5, f1 + fxcpnmsub f2, A1, f6, f2 + fxcpnmsub f3, A1, f7, f3 + + fxpmul f0, A2, f0 + fxpmul f1, A2, f1 + fxpmul f2, A2, f2 + fxpmul f3, A2, f3 + +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f4, BO, INC2 + STFPDUX f1, BO, INC2 + STFPDUX f5, BO, INC2 + STFPDUX f2, BO, INC2 + STFPDUX f6, BO, INC2 + STFPDUX f3, BO, INC2 + STFPDUX f7, BO, INC2 + + subi BO, BO, 16 * SIZE + + STFDUX f0, CO1, INC + STFDUX f4, CO1, INC + STFDUX f1, CO1, INC + STFDUX f5, CO1, INC + STFDUX f2, CO1, INC + STFDUX f6, CO1, INC + STFDUX f3, CO1, INC + STFDUX f7, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f4, CO2, INC + STFSDUX f1, CO2, INC + STFSDUX f5, CO2, INC + STFSDUX f2, CO2, INC + STFSDUX f6, CO2, INC + STFSDUX f3, CO2, INC + STFSDUX f7, CO2, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + STFPDUX f2, AO, INC2 + STFPDUX f3, AO, INC2 + STFPDUX f4, AO, INC2 + STFPDUX f5, AO, INC2 + STFPDUX f6, AO, INC2 + STFPDUX f7, AO, INC2 + + subi AO, AO, 16 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + STFDUX f5, CO2, INC + STFSDUX f5, CO2, INC + STFDUX f6, CO2, INC + STFSDUX f6, CO2, INC + STFDUX f7, CO2, INC + STFSDUX f7, CO2, INC +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 3 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 8 +#endif + +#ifdef LN + subi KK, KK, 8 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L51 + .align 4 + +.L60: + andi. I, M, 4 + beq .L70 + +#if defined(LT) || defined(RN) + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L64 +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L64 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B4, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L63 + .align 4 + +.L62: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A3, f0 + fxcsmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f1, B2, A4, f1 + fxcsmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A5, f0 + fxcsmadd f2, B3, A5, f2 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B3, A6, f1 + fxcsmadd f3, B3, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + fxcpmadd f0, B4, A7, f0 + fxcsmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcpmadd f1, B4, A8, f1 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L62 + .align 4 + +.L63: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + + fxcpmadd f0, B2, A3, f0 + fxcsmadd f2, B2, A3, f2 + fxcpmadd f1, B2, A4, f1 + fxcsmadd f3, B2, A4, f3 + + fxcpmadd f0, B3, A5, f0 + fxcsmadd f2, B3, A5, f2 + fxcpmadd f1, B3, A6, f1 + fxcsmadd f3, B3, A6, f3 + + fxcpmadd f0, B4, A7, f0 + fxcsmadd f2, B4, A7, f2 + fxcpmadd f1, B4, A8, f1 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L64: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L68 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L68 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdz- .L67 + .align 4 + +.L66: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdnz+ .L66 + .align 4 + +.L67: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + .align 4 + +.L68: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fpmr f25, f1 + + fsmfp f0, f2 + fsmfp f1, f3 + fsmtp f2, f24 + fsmtp f3, f25 + + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + LFPDUX f18, BO, INC2 + LFPDUX f19, BO, INC2 + + subi BO, BO, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f2, f17, f2 + fpsub f1, f18, f1 + fpsub f3, f19, f3 +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + LFPDUX f18, AO, INC2 + LFPDUX f19, AO, INC2 + + subi AO, AO, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 +#endif + +#ifdef LN + addi AO, AO, 18 * SIZE + + LFPDUX A1, AO, INCM2 + LFPDUX A2, AO, INCM2 + LFPDUX A3, AO, INCM2 + LFPDUX A4, AO, INCM2 + add AO, AO, INCM2 + LFPDUX A5, AO, INCM2 + add AO, AO, INCM2 + LFPDUX A6, AO, INCM2 + + subi AO, AO, 2 * SIZE + + fxsmul f3, A1, f3 + fxcpnmsub f1, A1, f3, f1 + fxcsnmsub f2, A2, f3, f2 + fxcpnmsub f0, A2, f3, f0 + + fxpmul f1, A3, f1 + fxcsnmsub f2, A4, f1, f2 + fxcpnmsub f0, A4, f1, f0 + + fxsmul f2, A5, f2 + fxcpnmsub f0, A5, f2, f0 + + fxpmul f0, A6, f0 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + add AO, AO, INC2 + LFPDUX A5, AO, INC2 + add AO, AO, INC2 + LFPDUX A6, AO, INC2 + + subi AO, AO, 16 * SIZE + + fxpmul f0, A1, f0 + fxcsnmsub f2, A1, f0, f2 + fxcpnmsub f1, A2, f0, f1 + fxcsnmsub f3, A2, f0, f3 + + fxsmul f2, A3, f2 + fxcpnmsub f1, A4, f2, f1 + fxcsnmsub f3, A4, f2, f3 + + fxpmul f1, A5, f1 + fxcsnmsub f3, A5, f1, f3 + + fxsmul f3, A6, f3 +#endif + +#ifdef RN + LFPDUX A1, BO, INC2 + LFPDUX A2, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + + fxcsnmsub f2, A1, f0, f2 + fxcsnmsub f3, A1, f1, f3 + + fxsmul f2, A2, f2 + fxsmul f3, A2, f3 +#endif + +#ifdef RT + LFPDUX A2, BO, INC2 + LFPDUX A1, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxsmul f2, A1, f2 + fxsmul f3, A1, f3 + + fxcpnmsub f0, A1, f2, f0 + fxcpnmsub f1, A1, f3, f1 + + fxpmul f0, A2, f0 + fxpmul f1, A2, f1 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f2, BO, INC2 + STFPDUX f1, BO, INC2 + STFPDUX f3, BO, INC2 + + subi BO, BO, 8 * SIZE + + STFDUX f0, CO1, INC + STFDUX f2, CO1, INC + STFDUX f1, CO1, INC + STFDUX f3, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f2, CO2, INC + STFSDUX f1, CO2, INC + STFSDUX f3, CO2, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + STFPDUX f2, AO, INC2 + STFPDUX f3, AO, INC2 + + subi AO, AO, 8 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + STFDUX f2, CO2, INC + STFSDUX f2, CO2, INC + STFDUX f3, CO2, INC + STFSDUX f3, CO2, INC +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L70: + andi. I, M, 2 + beq .L80 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L74 +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, B, - 2 * SIZE + fpmr f1, f0 + + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L74 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdz- .L73 + .align 4 + +.L72: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + fxcpmadd f2, B2, A2, f2 + fxcsmadd f3, B2, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A3, f0 + fxcsmadd f1, B3, A3, f1 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + fxcpmadd f2, B4, A4, f2 + fxcsmadd f3, B4, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f1, B5, A5, f1 + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + fxcpmadd f2, B6, A6, f2 + fxcsmadd f3, B6, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f1, A9, A7, f1 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + fxcpmadd f2, A10, A8, f2 + fxcsmadd f3, A10, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdnz+ .L72 + .align 4 + +.L73: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + fxcpmadd f2, B2, A2, f2 + fxcsmadd f3, B2, A2, f3 + + fxcpmadd f0, B3, A3, f0 + fxcsmadd f1, B3, A3, f1 + fxcpmadd f2, B4, A4, f2 + fxcsmadd f3, B4, A4, f3 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f1, B5, A5, f1 + fxcpmadd f2, B6, A6, f2 + fxcsmadd f3, B6, A6, f3 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f1, A9, A7, f1 + fxcpmadd f2, A10, A8, f2 + fxcsmadd f3, A10, A8, f3 + .align 4 + +.L74: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L78 +#else + andi. r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L78 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdz- .L77 + .align 4 + +.L76: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L76 + .align 4 + +.L77: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + .align 4 + +.L78: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fsmfp f0, f1 + fsmtp f1, f24 + + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + + subi BO, BO, 4 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + + subi AO, AO, 4 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#endif + +#ifdef LN + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + + addi AO, AO, -4 * SIZE + + fxsmul f1, A2, f1 + fxcpnmsub f0, A2, f1, f0 + fxpmul f0, A1, f0 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + + addi AO, AO, -4 * SIZE + + fxpmul f0, A1, f0 + fxcsnmsub f1, A1, f0, f1 + + fxsmul f1, A2, f1 +#endif + +#ifdef RN + LFPDUX A1, BO, INC2 + LFPDUX A2, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxpmul f0, A1, f0 + fxcsnmsub f1, A1, f0, f1 + + fxsmul f1, A2, f1 +#endif + +#ifdef RT + LFPDUX A2, BO, INC2 + LFPDUX A1, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxsmul f1, A1, f1 + fxcpnmsub f0, A1, f1, f0 + fxpmul f0, A2, f0 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + + subi BO, BO, 4 * SIZE + + STFDUX f0, CO1, INC + STFDUX f1, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f1, CO2, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + + subi AO, AO, 4 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + + STFDUX f1, CO2, INC + STFSDUX f1, CO2, INC +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L80: + andi. I, M, 1 + beq .L89 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L84 +#else + +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L84 + +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L83 + .align 4 + +.L82: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A1, B2, f1 + LFPDUX B2, BO, INC2 + LFPDUX A1, AO, INC2 + fxcpmadd f2, A2, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A2, B4, f3 + LFPDUX B4, BO, INC2 + LFPDUX A2, AO, INC2 + + fxcpmadd f0, A3, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A3, B2, f1 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + fxcpmadd f2, A4, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A4, B4, f3 + LFPDUX B4, BO, INC2 + LFPDUX A4, AO, INC2 + bdnz+ .L82 + .align 4 + +.L83: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A1, B2, f1 + LFPDUX B2, BO, INC2 + fxcpmadd f2, A2, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A2, B4, f3 + LFPDUX B4, BO, INC2 + + fxcpmadd f0, A3, B1, f0 + fxcsmadd f1, A3, B2, f1 + fxcpmadd f2, A4, B3, f2 + fxcsmadd f3, A4, B4, f3 + .align 4 + +.L84: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L88 +#else + andi. r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L88 +#endif + + LFDX A1, AO, INC2 + LFPDUX B1, BO, INC2 + add AO, AO, INC + bdz- .L87 + .align 4 + +.L86: + fxcpmadd f0, A1, B1, f0 + LFDX A1, AO, INC2 + LFPDUX B1, BO, INC2 + add AO, AO, INC + bdnz+ .L86 + .align 4 + +.L87: + fxcpmadd f0, A1, B1, f0 + .align 4 + +.L88: + fpadd f0, f0, f1 + fpadd f2, f2, f3 + fpadd f0, f0, f2 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC2 + + fpsub f0, f16, f0 +#else + LFPDX f16, AO, INC2 + + fpsub f0, f16, f0 +#endif + +#ifdef LN + LFPDX A1, AO, INC2 + + fxpmul f0, A1, f0 +#endif + +#ifdef LT + LFPDX A1, AO, INC2 + + fxpmul f0, A1, f0 +#endif + +#ifdef RN + LFD A1, (2 + 0) * SIZE(BO) + LFD A2, (2 + 1) * SIZE(BO) + LFD A3, (2 + 3) * SIZE(BO) + + fsmtp f1, f0 + + fmul f0, A1, f0 + fnmsub f1, A2, f0, f1 + + fmul f1, A3, f1 + fsmfp f0, f1 +#endif + +#ifdef RT + LFD A1, (2 + 3) * SIZE(BO) + LFD A2, (2 + 2) * SIZE(BO) + LFD A3, (2 + 0) * SIZE(BO) + + fsmtp f1, f0 + + fmul f1, A1, f1 + fnmsub f0, A2, f1, f0 + + fmul f0, A3, f0 + fsmfp f0, f1 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC2 + + STFDUX f0, CO1, INC + STFSDUX f0, CO2, INC +#else + STFPDX f0, AO, INC2 + + STFDUX f0, CO1, INC + STFDUX f1, CO2, INC +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +.L89: +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 2 * SIZE +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + .align 4 + +.L90: + andi. J, N, 1 + beq .L999 + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + addi AORIG, A, -2 * SIZE +#else + addi AO, A, -2 * SIZE +#endif +#ifndef RT + add C, CO1, LDC +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 3 + ble .L100 + .align 4 + +.L91: +#if defined(LT) || defined(RN) + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L94 +#else + +#ifdef LN + slwi r0, K, 3 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 3 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L94 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L93 + .align 4 + +.L92: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B1, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B1, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B1, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B1, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B2, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B2, BO, INC2 + bdnz+ .L92 + .align 4 + +.L93: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B1, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B1, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B1, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B1, A8, f3 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B2, A1, f0 + fxcpmadd f1, B2, A2, f1 + fxcpmadd f2, B2, A3, f2 + fxcpmadd f3, B2, A4, f3 + + fxcsmadd f0, B2, A5, f0 + fxcsmadd f1, B2, A6, f1 + fxcsmadd f2, B2, A7, f2 + fxcsmadd f3, B2, A8, f3 + .align 4 + +.L94: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L98 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L98 +#endif + + LFDX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + add BO, BO, INC + bdz- .L97 + .align 4 + +.L96: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFDX B1, BO, INC2 + LFPDUX A4, AO, INC2 + add BO, BO, INC + bdnz+ .L96 + .align 4 + +.L97: + fxcpmadd f0, B1, A1, f0 + fxcpmadd f1, B1, A2, f1 + fxcpmadd f2, B1, A3, f2 + fxcpmadd f3, B1, A4, f3 + .align 4 + +.L98: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 8 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 3 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + LFPDUX f18, BO, INC2 + LFPDUX f19, BO, INC2 + + subi BO, BO, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + LFPDUX f18, AO, INC2 + LFPDUX f19, AO, INC2 + + subi AO, AO, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 +#endif + +#ifdef LN + fsmtp f4, f0 + fsmtp f5, f1 + fsmtp f6, f2 + fsmtp f7, f3 + + LFD A1, (2 + 63) * SIZE(AO) + LFD A2, (2 + 62) * SIZE(AO) + LFD A3, (2 + 61) * SIZE(AO) + LFD A4, (2 + 60) * SIZE(AO) + LFD A5, (2 + 59) * SIZE(AO) + LFD A6, (2 + 58) * SIZE(AO) + LFD A7, (2 + 57) * SIZE(AO) + LFD A8, (2 + 56) * SIZE(AO) + + fmul f7, A1, f7 + fnmsub f3, A2, f7, f3 + fnmsub f6, A3, f7, f6 + fnmsub f2, A4, f7, f2 + fnmsub f5, A5, f7, f5 + fnmsub f1, A6, f7, f1 + fnmsub f4, A7, f7, f4 + fnmsub f0, A8, f7, f0 + + LFD A1, (2 + 54) * SIZE(AO) + LFD A2, (2 + 53) * SIZE(AO) + LFD A3, (2 + 52) * SIZE(AO) + LFD A4, (2 + 51) * SIZE(AO) + LFD A5, (2 + 50) * SIZE(AO) + LFD A6, (2 + 49) * SIZE(AO) + LFD A7, (2 + 48) * SIZE(AO) + + fmul f3, A1, f3 + fnmsub f6, A2, f3, f6 + fnmsub f2, A3, f3, f2 + fnmsub f5, A4, f3, f5 + fnmsub f1, A5, f3, f1 + fnmsub f4, A6, f3, f4 + fnmsub f0, A7, f3, f0 + + LFD A1, (2 + 45) * SIZE(AO) + LFD A2, (2 + 44) * SIZE(AO) + LFD A3, (2 + 43) * SIZE(AO) + LFD A4, (2 + 42) * SIZE(AO) + LFD A5, (2 + 41) * SIZE(AO) + LFD A6, (2 + 40) * SIZE(AO) + + fmul f6, A1, f6 + fnmsub f2, A2, f6, f2 + fnmsub f5, A3, f6, f5 + fnmsub f1, A4, f6, f1 + fnmsub f4, A5, f6, f4 + fnmsub f0, A6, f6, f0 + + LFD A1, (2 + 36) * SIZE(AO) + LFD A2, (2 + 35) * SIZE(AO) + LFD A3, (2 + 34) * SIZE(AO) + LFD A4, (2 + 33) * SIZE(AO) + LFD A5, (2 + 32) * SIZE(AO) + + fmul f2, A1, f2 + fnmsub f5, A2, f2, f5 + fnmsub f1, A3, f2, f1 + fnmsub f4, A4, f2, f4 + fnmsub f0, A5, f2, f0 + + LFD A1, (2 + 27) * SIZE(AO) + LFD A2, (2 + 26) * SIZE(AO) + LFD A3, (2 + 25) * SIZE(AO) + LFD A4, (2 + 24) * SIZE(AO) + + fmul f5, A1, f5 + fnmsub f1, A2, f5, f1 + fnmsub f4, A3, f5, f4 + fnmsub f0, A4, f5, f0 + + LFD A1, (2 + 18) * SIZE(AO) + LFD A2, (2 + 17) * SIZE(AO) + LFD A3, (2 + 16) * SIZE(AO) + + fmul f1, A1, f1 + fnmsub f4, A2, f1, f4 + fnmsub f0, A3, f1, f0 + + LFD A1, (2 + 9) * SIZE(AO) + LFD A2, (2 + 8) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f0, A2, f4, f0 + + LFD A1, (2 + 0) * SIZE(AO) + + fmul f0, A1, f0 + + fsmfp f0, f4 + fsmfp f1, f5 + fsmfp f2, f6 + fsmfp f3, f7 +#endif + +#ifdef LT + fsmtp f4, f0 + fsmtp f5, f1 + fsmtp f6, f2 + fsmtp f7, f3 + + LFD A1, (2 + 0) * SIZE(AO) + LFD A2, (2 + 1) * SIZE(AO) + LFD A3, (2 + 2) * SIZE(AO) + LFD A4, (2 + 3) * SIZE(AO) + LFD A5, (2 + 4) * SIZE(AO) + LFD A6, (2 + 5) * SIZE(AO) + LFD A7, (2 + 6) * SIZE(AO) + LFD A8, (2 + 7) * SIZE(AO) + + fmul f0, A1, f0 + fnmsub f4, A2, f0, f4 + fnmsub f1, A3, f0, f1 + fnmsub f5, A4, f0, f5 + fnmsub f2, A5, f0, f2 + fnmsub f6, A6, f0, f6 + fnmsub f3, A7, f0, f3 + fnmsub f7, A8, f0, f7 + + LFD A1, (2 + 9) * SIZE(AO) + LFD A2, (2 + 10) * SIZE(AO) + LFD A3, (2 + 11) * SIZE(AO) + LFD A4, (2 + 12) * SIZE(AO) + LFD A5, (2 + 13) * SIZE(AO) + LFD A6, (2 + 14) * SIZE(AO) + LFD A7, (2 + 15) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f1, A2, f4, f1 + fnmsub f5, A3, f4, f5 + fnmsub f2, A4, f4, f2 + fnmsub f6, A5, f4, f6 + fnmsub f3, A6, f4, f3 + fnmsub f7, A7, f4, f7 + + LFD A1, (2 + 18) * SIZE(AO) + LFD A2, (2 + 19) * SIZE(AO) + LFD A3, (2 + 20) * SIZE(AO) + LFD A4, (2 + 21) * SIZE(AO) + LFD A5, (2 + 22) * SIZE(AO) + LFD A6, (2 + 23) * SIZE(AO) + + fmul f1, A1, f1 + fnmsub f5, A2, f1, f5 + fnmsub f2, A3, f1, f2 + fnmsub f6, A4, f1, f6 + fnmsub f3, A5, f1, f3 + fnmsub f7, A6, f1, f7 + + LFD A1, (2 + 27) * SIZE(AO) + LFD A2, (2 + 28) * SIZE(AO) + LFD A3, (2 + 29) * SIZE(AO) + LFD A4, (2 + 30) * SIZE(AO) + LFD A5, (2 + 31) * SIZE(AO) + + fmul f5, A1, f5 + fnmsub f2, A2, f5, f2 + fnmsub f6, A3, f5, f6 + fnmsub f3, A4, f5, f3 + fnmsub f7, A5, f5, f7 + + LFD A1, (2 + 36) * SIZE(AO) + LFD A2, (2 + 37) * SIZE(AO) + LFD A3, (2 + 38) * SIZE(AO) + LFD A4, (2 + 39) * SIZE(AO) + + fmul f2, A1, f2 + fnmsub f6, A2, f2, f6 + fnmsub f3, A3, f2, f3 + fnmsub f7, A4, f2, f7 + + LFD A1, (2 + 45) * SIZE(AO) + LFD A2, (2 + 46) * SIZE(AO) + LFD A3, (2 + 47) * SIZE(AO) + + fmul f6, A1, f6 + fnmsub f3, A2, f6, f3 + fnmsub f7, A3, f6, f7 + + LFD A1, (2 + 54) * SIZE(AO) + LFD A2, (2 + 55) * SIZE(AO) + + fmul f3, A1, f3 + fnmsub f7, A2, f3, f7 + + LFD A1, (2 + 63) * SIZE(AO) + + fmul f7, A1, f7 + + fsmfp f0, f4 + fsmfp f1, f5 + fsmfp f2, f6 + fsmfp f3, f7 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxpmul f2, A1, f2 + fxpmul f3, A1, f3 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxpmul f2, A1, f2 + fxpmul f3, A1, f3 + +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + STFPDUX f2, BO, INC2 + STFPDUX f3, BO, INC2 + + subi BO, BO, 8 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + STFPDUX f2, AO, INC2 + STFPDUX f3, AO, INC2 + + subi AO, AO, 8 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 3 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 8 +#endif + +#ifdef LN + subi KK, KK, 8 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L91 + .align 4 + +.L100: + andi. I, M, 4 + beq .L110 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L104 +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L104 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX B4, BO, INC2 + + bdz- .L103 + .align 4 + +.L102: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B3, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B3, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B3, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B3, BO, INC2 + + fxcpmadd f0, B4, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B4, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L102 + .align 4 + +.L103: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + fxcpmadd f1, B3, A2, f1 + fxcsmadd f2, B3, A3, f2 + fxcsmadd f3, B3, A4, f3 + + fxcpmadd f0, B4, A5, f0 + fxcpmadd f1, B4, A6, f1 + fxcsmadd f2, B4, A7, f2 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L104: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L108 +#else + andi. r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L108 +#endif + + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + LFPDUX A2, AO, INC2 + add BO, BO, INC + bdz- .L107 + .align 4 + +.L106: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFDX B1, BO, INC2 + LFPDUX A2, AO, INC2 + add BO, BO, INC + bdnz+ .L106 + .align 4 + +.L107: + fxcpmadd f0, B1, A1, f0 + fxcpmadd f1, B1, A2, f1 + .align 4 + +.L108: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + + subi BO, BO, 4 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + + subi AO, AO, 4 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#endif + +#ifdef LN + fsmtp f4, f0 + fsmtp f5, f1 + + LFD A1, (2 + 15) * SIZE(AO) + LFD A2, (2 + 14) * SIZE(AO) + LFD A3, (2 + 13) * SIZE(AO) + LFD A4, (2 + 12) * SIZE(AO) + + fmul f5, A1, f5 + fnmsub f1, A2, f5, f1 + fnmsub f4, A3, f5, f4 + fnmsub f0, A4, f5, f0 + + LFD A1, (2 + 10) * SIZE(AO) + LFD A2, (2 + 9) * SIZE(AO) + LFD A3, (2 + 8) * SIZE(AO) + + fmul f1, A1, f1 + fnmsub f4, A2, f1, f4 + fnmsub f0, A3, f1, f0 + + LFD A1, (2 + 5) * SIZE(AO) + LFD A2, (2 + 4) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f0, A2, f4, f0 + + LFD A1, (2 + 0) * SIZE(AO) + + fmul f0, A1, f0 + + fsmfp f0, f4 + fsmfp f1, f5 +#endif + +#ifdef LT + fsmtp f4, f0 + fsmtp f5, f1 + + LFD A1, (2 + 0) * SIZE(AO) + LFD A2, (2 + 1) * SIZE(AO) + LFD A3, (2 + 2) * SIZE(AO) + LFD A4, (2 + 3) * SIZE(AO) + + fmul f0, A1, f0 + fnmsub f4, A2, f0, f4 + fnmsub f1, A3, f0, f1 + fnmsub f5, A4, f0, f5 + + LFD A1, (2 + 5) * SIZE(AO) + LFD A2, (2 + 6) * SIZE(AO) + LFD A3, (2 + 7) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f1, A2, f4, f1 + fnmsub f5, A3, f4, f5 + + LFD A1, (2 + 10) * SIZE(AO) + LFD A2, (2 + 11) * SIZE(AO) + + fmul f1, A1, f1 + fnmsub f5, A2, f1, f5 + + LFD A1, (2 + 15) * SIZE(AO) + + fmul f5, A1, f5 + + fsmfp f0, f4 + fsmfp f1, f5 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + + subi BO, BO, 4 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + + subi AO, AO, 4 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L110: + andi. I, M, 2 + beq .L120 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L114 +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L114 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L113 + .align 4 + +.L112: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcsmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + fxcpmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + fxcpmadd f0, B3, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B3, A6, f1 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + fxcpmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L112 + .align 4 + +.L113: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A2, f1 + fxcpmadd f2, B2, A3, f2 + fxcsmadd f3, B2, A4, f3 + fxcpmadd f0, B3, A5, f0 + fxcsmadd f1, B3, A6, f1 + fxcpmadd f2, B4, A7, f2 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L114: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L118 +#else + andi. r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L118 +#endif + + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + add BO, BO, INC + bdz- .L117 + .align 4 + +.L116: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + add BO, BO, INC + bdnz+ .L116 + .align 4 + +.L117: + fxcpmadd f0, B1, A1, f0 + .align 4 + +.L118: + fpadd f0, f0, f1 + fpadd f2, f3, f2 + fpadd f0, f0, f2 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC2 + + fpsub f0, f16, f0 +#else + LFPDX f16, AO, INC2 + + fpsub f0, f16, f0 +#endif + +#ifdef LN + fsmtp f4, f0 + + LFD A1, (2 + 3) * SIZE(AO) + LFD A2, (2 + 2) * SIZE(AO) + LFD A3, (2 + 0) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f0, A2, f4, f0 + fmul f0, A3, f0 + fsmfp f0, f4 +#endif + +#ifdef LT + fsmtp f4, f0 + + LFD A1, (2 + 0) * SIZE(AO) + LFD A2, (2 + 1) * SIZE(AO) + LFD A3, (2 + 3) * SIZE(AO) + + fmul f0, A1, f0 + fnmsub f4, A2, f0, f4 + fmul f4, A3, f4 + + fsmfp f0, f4 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC2 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC +#else + STFPDX f0, AO, INC2 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L120: + andi. I, M, 1 + beq .L129 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L124 +#else + +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L124 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L123 + .align 4 + +.L122: + fpmadd f0, A1, B1, f0 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + fpmadd f1, A2, B2, f1 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + fpmadd f2, A3, B3, f2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + fpmadd f3, A4, B4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L122 + .align 4 + +.L123: + fpmadd f0, A1, B1, f0 + fpmadd f1, A2, B2, f1 + fpmadd f2, A3, B3, f2 + fpmadd f3, A4, B4, f3 + .align 4 + +.L124: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L128 +#else + andi. r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L128 +#endif + + LFDX A1, AO, INC2 + LFDX B1, BO, INC2 + add AO, AO, INC + add BO, BO, INC + bdz- .L127 + .align 4 + +.L126: + fmadd f0, A1, B1, f0 + LFDX A1, AO, INC2 + LFDX B1, BO, INC2 + add AO, AO, INC + add BO, BO, INC + bdnz+ .L126 + .align 4 + +.L127: + fmadd f0, A1, B1, f0 + .align 4 + +.L128: + fpadd f0, f0, f1 + fpadd f2, f2, f3 + fpadd f0, f0, f2 + fsmtp f1, f0 + + fadd f0, f0, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFDX f16, BO, INC2 + + fsub f0, f16, f0 +#else + LFDX f16, AO, INC2 + + fsub f0, f16, f0 +#endif + +#ifdef LN + LFD A1, (2 + 0) * SIZE(AO) + + fmul f0, A1, f0 +#endif + +#ifdef LT + LFD A1, (2 + 0) * SIZE(AO) + + fmul f0, A1, f0 +#endif + +#ifdef RN + LFDX A1, BO, INC2 + + fmul f0, A1, f0 +#endif + +#ifdef RT + LFDX A1, BO, INC2 + + fmul f0, A1, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFDX f0, BO, INC2 + + STFDUX f0, CO1, INC +#else + STFDX f0, AO, INC2 + + STFDUX f0, CO1, INC +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +.L129: +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 2 * SIZE +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + + +.L999: + addi SP, SP, 12 + + lwzu r14, 4(SP) + lwzu r15, 4(SP) + + lwzu r16, 4(SP) + lwzu r17, 4(SP) + lwzu r18, 4(SP) + lwzu r19, 4(SP) + + lwzu r20, 4(SP) + lwzu r21, 4(SP) + lwzu r22, 4(SP) + lwzu r23, 4(SP) + + lwzu r24, 4(SP) + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f31, SP, r0 + lfpdux f30, SP, r0 + lfpdux f29, SP, r0 + lfpdux f28, SP, r0 + lfpdux f27, SP, r0 + lfpdux f26, SP, r0 + lfpdux f25, SP, r0 + lfpdux f24, SP, r0 + lfpdux f23, SP, r0 + lfpdux f22, SP, r0 + lfpdux f21, SP, r0 + lfpdux f20, SP, r0 + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + + + EPILOGUE +#endif diff --git a/kernel/power/trsm_kernel_hummer_RT.S b/kernel/power/trsm_kernel_hummer_RT.S new file mode 100644 index 0000000..e0b5d21 --- /dev/null +++ b/kernel/power/trsm_kernel_hummer_RT.S @@ -0,0 +1,5696 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define ALPHA 0 +#define FZERO 8 + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#endif + +#define TEMP r11 +#define AORIG r12 +#define KK r14 +#define INCM1 r15 +#define INCM4 r16 +#define INCM2 r17 +#define INC2 r19 +#define INC r20 +#define INC4 r21 + +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define AO2 r26 +#define BO2 r27 + +#define CO1 r28 +#define CO2 r29 +#define CO3 r30 +#define CO4 r31 + +#ifndef NEEDPARAM + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define A7 f22 +#define A8 f23 +#define A9 f24 +#define A10 f25 + +#define B1 f26 +#define B2 f27 +#define B3 f28 +#define B4 f29 +#define B5 f30 +#define B6 f31 + +#define AP B6 + + + PROLOGUE + PROFCODE + + li r0, -16 + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + stfpdux f16, SP, r0 + stfpdux f17, SP, r0 + stfpdux f18, SP, r0 + stfpdux f19, SP, r0 + stfpdux f20, SP, r0 + stfpdux f21, SP, r0 + stfpdux f22, SP, r0 + stfpdux f23, SP, r0 + stfpdux f24, SP, r0 + stfpdux f25, SP, r0 + stfpdux f26, SP, r0 + stfpdux f27, SP, r0 + stfpdux f28, SP, r0 + stfpdux f29, SP, r0 + stfpdux f30, SP, r0 + stfpdux f31, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + stwu r29, -4(SP) + stwu r28, -4(SP) + + stwu r27, -4(SP) + stwu r26, -4(SP) + stwu r25, -4(SP) + stwu r24, -4(SP) + + stwu r23, -4(SP) + stwu r22, -4(SP) + stwu r21, -4(SP) + stwu r20, -4(SP) + + stwu r19, -4(SP) + stwu r18, -4(SP) + stwu r17, -4(SP) + stwu r16, -4(SP) + + stwu r15, -4(SP) + stwu r14, -4(SP) # dummy + + li r0, 0 + + stwu r0, -4(SP) + stwu r0, -4(SP) + stfdu f1, -8(SP) + + slwi LDC, LDC, BASE_SHIFT + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + li INC, 1 * SIZE + li INC2, 2 * SIZE + li INC4, 4 * SIZE + + li INCM1, -1 * SIZE + li INCM2, -2 * SIZE + li INCM4, -4 * SIZE + + addi C, C, - 1 * SIZE + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + andi. J, N, 1 + beq .L50 + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + addi AORIG, A, -2 * SIZE +#else + addi AO, A, -2 * SIZE +#endif +#ifndef RT + add C, CO1, LDC +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 3 + ble .L100 + .align 4 + +.L91: +#if defined(LT) || defined(RN) + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L94 +#else + +#ifdef LN + slwi r0, K, 3 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 3 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f1, f0 + addi BO, BO, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L94 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L93 + .align 4 + +.L92: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B1, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B1, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B1, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B1, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B2, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B2, BO, INC2 + bdnz+ .L92 + .align 4 + +.L93: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B1, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B1, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B1, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B1, A8, f3 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B2, A1, f0 + fxcpmadd f1, B2, A2, f1 + fxcpmadd f2, B2, A3, f2 + fxcpmadd f3, B2, A4, f3 + + fxcsmadd f0, B2, A5, f0 + fxcsmadd f1, B2, A6, f1 + fxcsmadd f2, B2, A7, f2 + fxcsmadd f3, B2, A8, f3 + .align 4 + +.L94: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L98 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L98 +#endif + + LFDX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + add BO, BO, INC + bdz- .L97 + .align 4 + +.L96: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFDX B1, BO, INC2 + LFPDUX A4, AO, INC2 + add BO, BO, INC + bdnz+ .L96 + .align 4 + +.L97: + fxcpmadd f0, B1, A1, f0 + fxcpmadd f1, B1, A2, f1 + fxcpmadd f2, B1, A3, f2 + fxcpmadd f3, B1, A4, f3 + .align 4 + +.L98: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 8 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 3 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + LFPDUX f18, BO, INC2 + LFPDUX f19, BO, INC2 + + subi BO, BO, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + LFPDUX f18, AO, INC2 + LFPDUX f19, AO, INC2 + + subi AO, AO, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 +#endif + +#ifdef LN + fsmtp f4, f0 + fsmtp f5, f1 + fsmtp f6, f2 + fsmtp f7, f3 + + LFD A1, (2 + 63) * SIZE(AO) + LFD A2, (2 + 62) * SIZE(AO) + LFD A3, (2 + 61) * SIZE(AO) + LFD A4, (2 + 60) * SIZE(AO) + LFD A5, (2 + 59) * SIZE(AO) + LFD A6, (2 + 58) * SIZE(AO) + LFD A7, (2 + 57) * SIZE(AO) + LFD A8, (2 + 56) * SIZE(AO) + + fmul f7, A1, f7 + fnmsub f3, A2, f7, f3 + fnmsub f6, A3, f7, f6 + fnmsub f2, A4, f7, f2 + fnmsub f5, A5, f7, f5 + fnmsub f1, A6, f7, f1 + fnmsub f4, A7, f7, f4 + fnmsub f0, A8, f7, f0 + + LFD A1, (2 + 54) * SIZE(AO) + LFD A2, (2 + 53) * SIZE(AO) + LFD A3, (2 + 52) * SIZE(AO) + LFD A4, (2 + 51) * SIZE(AO) + LFD A5, (2 + 50) * SIZE(AO) + LFD A6, (2 + 49) * SIZE(AO) + LFD A7, (2 + 48) * SIZE(AO) + + fmul f3, A1, f3 + fnmsub f6, A2, f3, f6 + fnmsub f2, A3, f3, f2 + fnmsub f5, A4, f3, f5 + fnmsub f1, A5, f3, f1 + fnmsub f4, A6, f3, f4 + fnmsub f0, A7, f3, f0 + + LFD A1, (2 + 45) * SIZE(AO) + LFD A2, (2 + 44) * SIZE(AO) + LFD A3, (2 + 43) * SIZE(AO) + LFD A4, (2 + 42) * SIZE(AO) + LFD A5, (2 + 41) * SIZE(AO) + LFD A6, (2 + 40) * SIZE(AO) + + fmul f6, A1, f6 + fnmsub f2, A2, f6, f2 + fnmsub f5, A3, f6, f5 + fnmsub f1, A4, f6, f1 + fnmsub f4, A5, f6, f4 + fnmsub f0, A6, f6, f0 + + LFD A1, (2 + 36) * SIZE(AO) + LFD A2, (2 + 35) * SIZE(AO) + LFD A3, (2 + 34) * SIZE(AO) + LFD A4, (2 + 33) * SIZE(AO) + LFD A5, (2 + 32) * SIZE(AO) + + fmul f2, A1, f2 + fnmsub f5, A2, f2, f5 + fnmsub f1, A3, f2, f1 + fnmsub f4, A4, f2, f4 + fnmsub f0, A5, f2, f0 + + LFD A1, (2 + 27) * SIZE(AO) + LFD A2, (2 + 26) * SIZE(AO) + LFD A3, (2 + 25) * SIZE(AO) + LFD A4, (2 + 24) * SIZE(AO) + + fmul f5, A1, f5 + fnmsub f1, A2, f5, f1 + fnmsub f4, A3, f5, f4 + fnmsub f0, A4, f5, f0 + + LFD A1, (2 + 18) * SIZE(AO) + LFD A2, (2 + 17) * SIZE(AO) + LFD A3, (2 + 16) * SIZE(AO) + + fmul f1, A1, f1 + fnmsub f4, A2, f1, f4 + fnmsub f0, A3, f1, f0 + + LFD A1, (2 + 9) * SIZE(AO) + LFD A2, (2 + 8) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f0, A2, f4, f0 + + LFD A1, (2 + 0) * SIZE(AO) + + fmul f0, A1, f0 + + fsmfp f0, f4 + fsmfp f1, f5 + fsmfp f2, f6 + fsmfp f3, f7 +#endif + +#ifdef LT + fsmtp f4, f0 + fsmtp f5, f1 + fsmtp f6, f2 + fsmtp f7, f3 + + LFD A1, (2 + 0) * SIZE(AO) + LFD A2, (2 + 1) * SIZE(AO) + LFD A3, (2 + 2) * SIZE(AO) + LFD A4, (2 + 3) * SIZE(AO) + LFD A5, (2 + 4) * SIZE(AO) + LFD A6, (2 + 5) * SIZE(AO) + LFD A7, (2 + 6) * SIZE(AO) + LFD A8, (2 + 7) * SIZE(AO) + + fmul f0, A1, f0 + fnmsub f4, A2, f0, f4 + fnmsub f1, A3, f0, f1 + fnmsub f5, A4, f0, f5 + fnmsub f2, A5, f0, f2 + fnmsub f6, A6, f0, f6 + fnmsub f3, A7, f0, f3 + fnmsub f7, A8, f0, f7 + + LFD A1, (2 + 9) * SIZE(AO) + LFD A2, (2 + 10) * SIZE(AO) + LFD A3, (2 + 11) * SIZE(AO) + LFD A4, (2 + 12) * SIZE(AO) + LFD A5, (2 + 13) * SIZE(AO) + LFD A6, (2 + 14) * SIZE(AO) + LFD A7, (2 + 15) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f1, A2, f4, f1 + fnmsub f5, A3, f4, f5 + fnmsub f2, A4, f4, f2 + fnmsub f6, A5, f4, f6 + fnmsub f3, A6, f4, f3 + fnmsub f7, A7, f4, f7 + + LFD A1, (2 + 18) * SIZE(AO) + LFD A2, (2 + 19) * SIZE(AO) + LFD A3, (2 + 20) * SIZE(AO) + LFD A4, (2 + 21) * SIZE(AO) + LFD A5, (2 + 22) * SIZE(AO) + LFD A6, (2 + 23) * SIZE(AO) + + fmul f1, A1, f1 + fnmsub f5, A2, f1, f5 + fnmsub f2, A3, f1, f2 + fnmsub f6, A4, f1, f6 + fnmsub f3, A5, f1, f3 + fnmsub f7, A6, f1, f7 + + LFD A1, (2 + 27) * SIZE(AO) + LFD A2, (2 + 28) * SIZE(AO) + LFD A3, (2 + 29) * SIZE(AO) + LFD A4, (2 + 30) * SIZE(AO) + LFD A5, (2 + 31) * SIZE(AO) + + fmul f5, A1, f5 + fnmsub f2, A2, f5, f2 + fnmsub f6, A3, f5, f6 + fnmsub f3, A4, f5, f3 + fnmsub f7, A5, f5, f7 + + LFD A1, (2 + 36) * SIZE(AO) + LFD A2, (2 + 37) * SIZE(AO) + LFD A3, (2 + 38) * SIZE(AO) + LFD A4, (2 + 39) * SIZE(AO) + + fmul f2, A1, f2 + fnmsub f6, A2, f2, f6 + fnmsub f3, A3, f2, f3 + fnmsub f7, A4, f2, f7 + + LFD A1, (2 + 45) * SIZE(AO) + LFD A2, (2 + 46) * SIZE(AO) + LFD A3, (2 + 47) * SIZE(AO) + + fmul f6, A1, f6 + fnmsub f3, A2, f6, f3 + fnmsub f7, A3, f6, f7 + + LFD A1, (2 + 54) * SIZE(AO) + LFD A2, (2 + 55) * SIZE(AO) + + fmul f3, A1, f3 + fnmsub f7, A2, f3, f7 + + LFD A1, (2 + 63) * SIZE(AO) + + fmul f7, A1, f7 + + fsmfp f0, f4 + fsmfp f1, f5 + fsmfp f2, f6 + fsmfp f3, f7 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxpmul f2, A1, f2 + fxpmul f3, A1, f3 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxpmul f2, A1, f2 + fxpmul f3, A1, f3 + +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + STFPDUX f2, BO, INC2 + STFPDUX f3, BO, INC2 + + subi BO, BO, 8 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + STFPDUX f2, AO, INC2 + STFPDUX f3, AO, INC2 + + subi AO, AO, 8 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 3 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 8 +#endif + +#ifdef LN + subi KK, KK, 8 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L91 + .align 4 + +.L100: + andi. I, M, 4 + beq .L110 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L104 +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L104 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX B4, BO, INC2 + + bdz- .L103 + .align 4 + +.L102: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B3, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B3, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B3, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B3, BO, INC2 + + fxcpmadd f0, B4, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B4, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L102 + .align 4 + +.L103: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + fxcpmadd f1, B3, A2, f1 + fxcsmadd f2, B3, A3, f2 + fxcsmadd f3, B3, A4, f3 + + fxcpmadd f0, B4, A5, f0 + fxcpmadd f1, B4, A6, f1 + fxcsmadd f2, B4, A7, f2 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L104: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L108 +#else + andi. r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L108 +#endif + + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + LFPDUX A2, AO, INC2 + add BO, BO, INC + bdz- .L107 + .align 4 + +.L106: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFDX B1, BO, INC2 + LFPDUX A2, AO, INC2 + add BO, BO, INC + bdnz+ .L106 + .align 4 + +.L107: + fxcpmadd f0, B1, A1, f0 + fxcpmadd f1, B1, A2, f1 + .align 4 + +.L108: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + + subi BO, BO, 4 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + + subi AO, AO, 4 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#endif + +#ifdef LN + fsmtp f4, f0 + fsmtp f5, f1 + + LFD A1, (2 + 15) * SIZE(AO) + LFD A2, (2 + 14) * SIZE(AO) + LFD A3, (2 + 13) * SIZE(AO) + LFD A4, (2 + 12) * SIZE(AO) + + fmul f5, A1, f5 + fnmsub f1, A2, f5, f1 + fnmsub f4, A3, f5, f4 + fnmsub f0, A4, f5, f0 + + LFD A1, (2 + 10) * SIZE(AO) + LFD A2, (2 + 9) * SIZE(AO) + LFD A3, (2 + 8) * SIZE(AO) + + fmul f1, A1, f1 + fnmsub f4, A2, f1, f4 + fnmsub f0, A3, f1, f0 + + LFD A1, (2 + 5) * SIZE(AO) + LFD A2, (2 + 4) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f0, A2, f4, f0 + + LFD A1, (2 + 0) * SIZE(AO) + + fmul f0, A1, f0 + + fsmfp f0, f4 + fsmfp f1, f5 +#endif + +#ifdef LT + fsmtp f4, f0 + fsmtp f5, f1 + + LFD A1, (2 + 0) * SIZE(AO) + LFD A2, (2 + 1) * SIZE(AO) + LFD A3, (2 + 2) * SIZE(AO) + LFD A4, (2 + 3) * SIZE(AO) + + fmul f0, A1, f0 + fnmsub f4, A2, f0, f4 + fnmsub f1, A3, f0, f1 + fnmsub f5, A4, f0, f5 + + LFD A1, (2 + 5) * SIZE(AO) + LFD A2, (2 + 6) * SIZE(AO) + LFD A3, (2 + 7) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f1, A2, f4, f1 + fnmsub f5, A3, f4, f5 + + LFD A1, (2 + 10) * SIZE(AO) + LFD A2, (2 + 11) * SIZE(AO) + + fmul f1, A1, f1 + fnmsub f5, A2, f1, f5 + + LFD A1, (2 + 15) * SIZE(AO) + + fmul f5, A1, f5 + + fsmfp f0, f4 + fsmfp f1, f5 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + + subi BO, BO, 4 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + + subi AO, AO, 4 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L110: + andi. I, M, 2 + beq .L120 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L114 +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L114 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L113 + .align 4 + +.L112: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcsmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + fxcpmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + fxcpmadd f0, B3, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B3, A6, f1 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + fxcpmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L112 + .align 4 + +.L113: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A2, f1 + fxcpmadd f2, B2, A3, f2 + fxcsmadd f3, B2, A4, f3 + fxcpmadd f0, B3, A5, f0 + fxcsmadd f1, B3, A6, f1 + fxcpmadd f2, B4, A7, f2 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L114: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L118 +#else + andi. r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L118 +#endif + + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + add BO, BO, INC + bdz- .L117 + .align 4 + +.L116: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + add BO, BO, INC + bdnz+ .L116 + .align 4 + +.L117: + fxcpmadd f0, B1, A1, f0 + .align 4 + +.L118: + fpadd f0, f0, f1 + fpadd f2, f3, f2 + fpadd f0, f0, f2 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC2 + + fpsub f0, f16, f0 +#else + LFPDX f16, AO, INC2 + + fpsub f0, f16, f0 +#endif + +#ifdef LN + fsmtp f4, f0 + + LFD A1, (2 + 3) * SIZE(AO) + LFD A2, (2 + 2) * SIZE(AO) + LFD A3, (2 + 0) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f0, A2, f4, f0 + fmul f0, A3, f0 + fsmfp f0, f4 +#endif + +#ifdef LT + fsmtp f4, f0 + + LFD A1, (2 + 0) * SIZE(AO) + LFD A2, (2 + 1) * SIZE(AO) + LFD A3, (2 + 3) * SIZE(AO) + + fmul f0, A1, f0 + fnmsub f4, A2, f0, f4 + fmul f4, A3, f4 + + fsmfp f0, f4 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC2 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC +#else + STFPDX f0, AO, INC2 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L120: + andi. I, M, 1 + beq .L129 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L124 +#else + +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L124 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L123 + .align 4 + +.L122: + fpmadd f0, A1, B1, f0 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + fpmadd f1, A2, B2, f1 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + fpmadd f2, A3, B3, f2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + fpmadd f3, A4, B4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L122 + .align 4 + +.L123: + fpmadd f0, A1, B1, f0 + fpmadd f1, A2, B2, f1 + fpmadd f2, A3, B3, f2 + fpmadd f3, A4, B4, f3 + .align 4 + +.L124: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L128 +#else + andi. r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L128 +#endif + + LFDX A1, AO, INC2 + LFDX B1, BO, INC2 + add AO, AO, INC + add BO, BO, INC + bdz- .L127 + .align 4 + +.L126: + fmadd f0, A1, B1, f0 + LFDX A1, AO, INC2 + LFDX B1, BO, INC2 + add AO, AO, INC + add BO, BO, INC + bdnz+ .L126 + .align 4 + +.L127: + fmadd f0, A1, B1, f0 + .align 4 + +.L128: + fpadd f0, f0, f1 + fpadd f2, f2, f3 + fpadd f0, f0, f2 + fsmtp f1, f0 + + fadd f0, f0, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFDX f16, BO, INC2 + + fsub f0, f16, f0 +#else + LFDX f16, AO, INC2 + + fsub f0, f16, f0 +#endif + +#ifdef LN + LFD A1, (2 + 0) * SIZE(AO) + + fmul f0, A1, f0 +#endif + +#ifdef LT + LFD A1, (2 + 0) * SIZE(AO) + + fmul f0, A1, f0 +#endif + +#ifdef RN + LFDX A1, BO, INC2 + + fmul f0, A1, f0 +#endif + +#ifdef RT + LFDX A1, BO, INC2 + + fmul f0, A1, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFDX f0, BO, INC2 + + STFDUX f0, CO1, INC +#else + STFDX f0, AO, INC2 + + STFDUX f0, CO1, INC +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +.L129: +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 2 * SIZE +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +.L50: + andi. J, N, 2 + beq .L90 + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + +#if defined(LN) || defined(RT) + addi AORIG, A, -2 * SIZE +#else + addi AO, A, -2 * SIZE +#endif +#ifndef RT + add C, CO2, LDC +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 3 + ble .L60 + .align 4 + +.L51: +#if defined(LT) || defined(RN) + fpmr f4, f0 + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + + srawi. r0, KK, 2 + fpmr f3, f0 + mtspr CTR, r0 + fpmr f7, f0 + ble .L54 +#else + +#ifdef LN + slwi r0, K, 3 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 3 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f4, f0 + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + + srawi. r0, TEMP, 2 + fpmr f3, f0 + mtspr CTR, r0 + fpmr f7, f0 + ble .L54 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L53 + .align 4 + +.L52: + fxcpmadd f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + nop + fxcsmadd f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B2, A7, f2 + nop + fxcsmadd f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B2, A8, f3 + nop + fxcsmadd f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + LFPDUX B2, BO, INC2 + fxcsmadd f4, B3, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B3, A3, f2 + nop + fxcsmadd f6, B3, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B4, A5, f0 + LFPDUX B3, BO, INC2 + fxcsmadd f4, B4, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B4, A6, f1 + nop + fxcsmadd f5, B4, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B4, A7, f2 + nop + fxcsmadd f6, B4, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B4, A8, f3 + nop + fxcsmadd f7, B4, A8, f7 + LFPDUX A8, AO, INC2 + bdnz+ .L52 + .align 4 + +.L53: + fxcpmadd f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + nop + fxcsmadd f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + nop + fxcsmadd f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B2, A7, f2 + nop + fxcsmadd f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B2, A8, f3 + nop + fxcsmadd f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + fxcsmadd f4, B3, A1, f4 + fxcpmadd f1, B3, A2, f1 + fxcsmadd f5, B3, A2, f5 + + fxcpmadd f2, B3, A3, f2 + fxcsmadd f6, B3, A3, f6 + fxcpmadd f3, B3, A4, f3 + fxcsmadd f7, B3, A4, f7 + + fxcpmadd f0, B4, A5, f0 + fxcsmadd f4, B4, A5, f4 + fxcpmadd f1, B4, A6, f1 + fxcsmadd f5, B4, A6, f5 + + fxcpmadd f2, B4, A7, f2 + fxcsmadd f6, B4, A7, f6 + fxcpmadd f3, B4, A8, f3 + fxcsmadd f7, B4, A8, f7 + .align 4 + +.L54: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L58 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L58 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + bdz- .L57 + .align 4 + +.L56: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L56 + .align 4 + +.L57: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + + fxcpmadd f2, B1, A3, f2 + fxcsmadd f6, B1, A3, f6 + fxcpmadd f3, B1, A4, f3 + fxcsmadd f7, B1, A4, f7 + .align 4 + +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 8 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 3 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fpmr f25, f1 + fpmr f26, f2 + fpmr f27, f3 + + fsmfp f0, f4 + fsmfp f1, f5 + fsmfp f2, f6 + fsmfp f3, f7 + + fsmtp f4, f24 + fsmtp f5, f25 + fsmtp f6, f26 + fsmtp f7, f27 + + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + LFPDUX f18, BO, INC2 + LFPDUX f19, BO, INC2 + + LFPDUX f20, BO, INC2 + LFPDUX f21, BO, INC2 + LFPDUX f22, BO, INC2 + LFPDUX f23, BO, INC2 + + subi BO, BO, 16 * SIZE + + fpsub f0, f16, f0 + fpsub f4, f17, f4 + fpsub f1, f18, f1 + fpsub f5, f19, f5 + + fpsub f2, f20, f2 + fpsub f6, f21, f6 + fpsub f3, f22, f3 + fpsub f7, f23, f7 + +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + LFPDUX f18, AO, INC2 + LFPDUX f19, AO, INC2 + + LFPDUX f20, AO, INC2 + LFPDUX f21, AO, INC2 + LFPDUX f22, AO, INC2 + LFPDUX f23, AO, INC2 + + subi AO, AO, 16 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 + fpsub f4, f20, f4 + fpsub f5, f21, f5 + fpsub f6, f22, f6 + fpsub f7, f23, f7 +#endif + +#ifdef LN + addi AO, AO, 66 * SIZE + + LFPDUX A1, AO, INCM2 + LFPDUX A2, AO, INCM2 + LFPDUX A3, AO, INCM2 + LFPDUX A4, AO, INCM2 + LFPDUX A5, AO, INCM2 + LFPDUX A6, AO, INCM2 + LFPDUX A7, AO, INCM2 + LFPDUX A8, AO, INCM2 + + fxsmul f7, A1, f7 + fxcpnmsub f3, A1, f7, f3 + fxcsnmsub f6, A2, f7, f6 + fxcpnmsub f2, A2, f7, f2 + + fxcsnmsub f5, A3, f7, f5 + fxcpnmsub f1, A3, f7, f1 + fxcsnmsub f4, A4, f7, f4 + fxcpnmsub f0, A4, f7, f0 + + fxpmul f3, A5, f3 + fxcsnmsub f6, A6, f3, f6 + fxcpnmsub f2, A6, f3, f2 + + fxcsnmsub f5, A7, f3, f5 + fxcpnmsub f1, A7, f3, f1 + fxcsnmsub f4, A8, f3, f4 + fxcpnmsub f0, A8, f3, f0 + + add AO, AO, INCM2 + LFPDUX A1, AO, INCM2 + LFPDUX A2, AO, INCM2 + LFPDUX A3, AO, INCM2 + + add AO, AO, INCM2 + LFPDUX A4, AO, INCM2 + LFPDUX A5, AO, INCM2 + LFPDUX A6, AO, INCM2 + + add AO, AO, INCM2 + add AO, AO, INCM2 + LFPDUX A7, AO, INCM2 + LFPDUX A8, AO, INCM2 + + fxsmul f6, A1, f6 + fxcpnmsub f2, A1, f6, f2 + fxcsnmsub f5, A2, f6, f5 + fxcpnmsub f1, A2, f6, f1 + fxcsnmsub f4, A3, f6, f4 + fxcpnmsub f0, A3, f6, f0 + + fxpmul f2, A4, f2 + fxcsnmsub f5, A5, f2, f5 + fxcpnmsub f1, A5, f2, f1 + fxcsnmsub f4, A6, f2, f4 + fxcpnmsub f0, A6, f2, f0 + + fxsmul f5, A7, f5 + fxcpnmsub f1, A7, f5, f1 + fxcsnmsub f4, A8, f5, f4 + fxcpnmsub f0, A8, f5, f0 + + add AO, AO, INCM2 + add AO, AO, INCM2 + LFPDUX A1, AO, INCM2 + LFPDUX A2, AO, INCM2 + + subi AO, AO, 6 * SIZE + LFPDUX A3, AO, INCM2 + subi AO, AO, 6 * SIZE + LFPDUX A4, AO, INCM2 + + addi AO, AO, -2 * SIZE + + fxpmul f1, A1, f1 + fxcsnmsub f4, A2, f1, f4 + fxcpnmsub f0, A2, f1, f0 + + fxsmul f4, A3, f4 + fxcpnmsub f0, A3, f4, f0 + + fxpmul f0, A4, f0 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + + fxpmul f0, A1, f0 + fxcsnmsub f4, A1, f0, f4 + fxcpnmsub f1, A2, f0, f1 + fxcsnmsub f5, A2, f0, f5 + fxcpnmsub f2, A3, f0, f2 + fxcsnmsub f6, A3, f0, f6 + fxcpnmsub f3, A4, f0, f3 + fxcsnmsub f7, A4, f0, f7 + + fxsmul f4, A5, f4 + fxcpnmsub f1, A6, f4, f1 + fxcsnmsub f5, A6, f4, f5 + fxcpnmsub f2, A7, f4, f2 + fxcsnmsub f6, A7, f4, f6 + fxcpnmsub f3, A8, f4, f3 + fxcsnmsub f7, A8, f4, f7 + + add AO, AO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + + add AO, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + + add AO, AO, INC2 + add AO, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + + fxpmul f1, A1, f1 + fxcsnmsub f5, A1, f1, f5 + fxcpnmsub f2, A2, f1, f2 + fxcsnmsub f6, A2, f1, f6 + fxcpnmsub f3, A3, f1, f3 + fxcsnmsub f7, A3, f1, f7 + + fxsmul f5, A4, f5 + fxcpnmsub f2, A5, f5, f2 + fxcsnmsub f6, A5, f5, f6 + fxcpnmsub f3, A6, f5, f3 + fxcsnmsub f7, A6, f5, f7 + + fxpmul f2, A7, f2 + fxcsnmsub f6, A7, f2, f6 + fxcpnmsub f3, A8, f2, f3 + fxcsnmsub f7, A8, f2, f7 + + add AO, AO, INC2 + add AO, AO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + + addi AO, AO, 6 * SIZE + LFPDUX A3, AO, INC2 + addi AO, AO, 6 * SIZE + LFPDUX A4, AO, INC2 + + subi AO, AO, 64 * SIZE + + fxsmul f6, A1, f6 + fxcpnmsub f3, A2, f6, f3 + fxcsnmsub f7, A2, f6, f7 + + fxpmul f3, A3, f3 + fxcsnmsub f7, A3, f3, f7 + + fxsmul f7, A4, f7 +#endif + +#ifdef RN + LFPDUX A1, BO, INC2 + LFPDUX A2, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxpmul f2, A1, f2 + fxpmul f3, A1, f3 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f5, A1, f1, f5 + fxcsnmsub f6, A1, f2, f6 + fxcsnmsub f7, A1, f3, f7 + + fxsmul f4, A2, f4 + fxsmul f5, A2, f5 + fxsmul f6, A2, f6 + fxsmul f7, A2, f7 +#endif + +#ifdef RT + LFPDUX A2, BO, INC2 + LFPDUX A1, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxsmul f4, A1, f4 + fxsmul f5, A1, f5 + fxsmul f6, A1, f6 + fxsmul f7, A1, f7 + + fxcpnmsub f0, A1, f4, f0 + fxcpnmsub f1, A1, f5, f1 + fxcpnmsub f2, A1, f6, f2 + fxcpnmsub f3, A1, f7, f3 + + fxpmul f0, A2, f0 + fxpmul f1, A2, f1 + fxpmul f2, A2, f2 + fxpmul f3, A2, f3 + +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f4, BO, INC2 + STFPDUX f1, BO, INC2 + STFPDUX f5, BO, INC2 + STFPDUX f2, BO, INC2 + STFPDUX f6, BO, INC2 + STFPDUX f3, BO, INC2 + STFPDUX f7, BO, INC2 + + subi BO, BO, 16 * SIZE + + STFDUX f0, CO1, INC + STFDUX f4, CO1, INC + STFDUX f1, CO1, INC + STFDUX f5, CO1, INC + STFDUX f2, CO1, INC + STFDUX f6, CO1, INC + STFDUX f3, CO1, INC + STFDUX f7, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f4, CO2, INC + STFSDUX f1, CO2, INC + STFSDUX f5, CO2, INC + STFSDUX f2, CO2, INC + STFSDUX f6, CO2, INC + STFSDUX f3, CO2, INC + STFSDUX f7, CO2, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + STFPDUX f2, AO, INC2 + STFPDUX f3, AO, INC2 + STFPDUX f4, AO, INC2 + STFPDUX f5, AO, INC2 + STFPDUX f6, AO, INC2 + STFPDUX f7, AO, INC2 + + subi AO, AO, 16 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + STFDUX f5, CO2, INC + STFSDUX f5, CO2, INC + STFDUX f6, CO2, INC + STFSDUX f6, CO2, INC + STFDUX f7, CO2, INC + STFSDUX f7, CO2, INC +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 3 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 8 +#endif + +#ifdef LN + subi KK, KK, 8 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L51 + .align 4 + +.L60: + andi. I, M, 4 + beq .L70 + +#if defined(LT) || defined(RN) + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L64 +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f1, f0 + addi BO, BO, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L64 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B4, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L63 + .align 4 + +.L62: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A3, f0 + fxcsmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f1, B2, A4, f1 + fxcsmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A5, f0 + fxcsmadd f2, B3, A5, f2 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B3, A6, f1 + fxcsmadd f3, B3, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + fxcpmadd f0, B4, A7, f0 + fxcsmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcpmadd f1, B4, A8, f1 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L62 + .align 4 + +.L63: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + + fxcpmadd f0, B2, A3, f0 + fxcsmadd f2, B2, A3, f2 + fxcpmadd f1, B2, A4, f1 + fxcsmadd f3, B2, A4, f3 + + fxcpmadd f0, B3, A5, f0 + fxcsmadd f2, B3, A5, f2 + fxcpmadd f1, B3, A6, f1 + fxcsmadd f3, B3, A6, f3 + + fxcpmadd f0, B4, A7, f0 + fxcsmadd f2, B4, A7, f2 + fxcpmadd f1, B4, A8, f1 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L64: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L68 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L68 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdz- .L67 + .align 4 + +.L66: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdnz+ .L66 + .align 4 + +.L67: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + .align 4 + +.L68: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fpmr f25, f1 + + fsmfp f0, f2 + fsmfp f1, f3 + fsmtp f2, f24 + fsmtp f3, f25 + + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + LFPDUX f18, BO, INC2 + LFPDUX f19, BO, INC2 + + subi BO, BO, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f2, f17, f2 + fpsub f1, f18, f1 + fpsub f3, f19, f3 +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + LFPDUX f18, AO, INC2 + LFPDUX f19, AO, INC2 + + subi AO, AO, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 +#endif + +#ifdef LN + addi AO, AO, 18 * SIZE + + LFPDUX A1, AO, INCM2 + LFPDUX A2, AO, INCM2 + LFPDUX A3, AO, INCM2 + LFPDUX A4, AO, INCM2 + add AO, AO, INCM2 + LFPDUX A5, AO, INCM2 + add AO, AO, INCM2 + LFPDUX A6, AO, INCM2 + + subi AO, AO, 2 * SIZE + + fxsmul f3, A1, f3 + fxcpnmsub f1, A1, f3, f1 + fxcsnmsub f2, A2, f3, f2 + fxcpnmsub f0, A2, f3, f0 + + fxpmul f1, A3, f1 + fxcsnmsub f2, A4, f1, f2 + fxcpnmsub f0, A4, f1, f0 + + fxsmul f2, A5, f2 + fxcpnmsub f0, A5, f2, f0 + + fxpmul f0, A6, f0 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + add AO, AO, INC2 + LFPDUX A5, AO, INC2 + add AO, AO, INC2 + LFPDUX A6, AO, INC2 + + subi AO, AO, 16 * SIZE + + fxpmul f0, A1, f0 + fxcsnmsub f2, A1, f0, f2 + fxcpnmsub f1, A2, f0, f1 + fxcsnmsub f3, A2, f0, f3 + + fxsmul f2, A3, f2 + fxcpnmsub f1, A4, f2, f1 + fxcsnmsub f3, A4, f2, f3 + + fxpmul f1, A5, f1 + fxcsnmsub f3, A5, f1, f3 + + fxsmul f3, A6, f3 +#endif + +#ifdef RN + LFPDUX A1, BO, INC2 + LFPDUX A2, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + + fxcsnmsub f2, A1, f0, f2 + fxcsnmsub f3, A1, f1, f3 + + fxsmul f2, A2, f2 + fxsmul f3, A2, f3 +#endif + +#ifdef RT + LFPDUX A2, BO, INC2 + LFPDUX A1, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxsmul f2, A1, f2 + fxsmul f3, A1, f3 + + fxcpnmsub f0, A1, f2, f0 + fxcpnmsub f1, A1, f3, f1 + + fxpmul f0, A2, f0 + fxpmul f1, A2, f1 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f2, BO, INC2 + STFPDUX f1, BO, INC2 + STFPDUX f3, BO, INC2 + + subi BO, BO, 8 * SIZE + + STFDUX f0, CO1, INC + STFDUX f2, CO1, INC + STFDUX f1, CO1, INC + STFDUX f3, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f2, CO2, INC + STFSDUX f1, CO2, INC + STFSDUX f3, CO2, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + STFPDUX f2, AO, INC2 + STFPDUX f3, AO, INC2 + + subi AO, AO, 8 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + STFDUX f2, CO2, INC + STFSDUX f2, CO2, INC + STFDUX f3, CO2, INC + STFSDUX f3, CO2, INC +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L70: + andi. I, M, 2 + beq .L80 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L74 +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L74 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdz- .L73 + .align 4 + +.L72: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + fxcpmadd f2, B2, A2, f2 + fxcsmadd f3, B2, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A3, f0 + fxcsmadd f1, B3, A3, f1 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + fxcpmadd f2, B4, A4, f2 + fxcsmadd f3, B4, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f1, B5, A5, f1 + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + fxcpmadd f2, B6, A6, f2 + fxcsmadd f3, B6, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f1, A9, A7, f1 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + fxcpmadd f2, A10, A8, f2 + fxcsmadd f3, A10, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdnz+ .L72 + .align 4 + +.L73: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + fxcpmadd f2, B2, A2, f2 + fxcsmadd f3, B2, A2, f3 + + fxcpmadd f0, B3, A3, f0 + fxcsmadd f1, B3, A3, f1 + fxcpmadd f2, B4, A4, f2 + fxcsmadd f3, B4, A4, f3 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f1, B5, A5, f1 + fxcpmadd f2, B6, A6, f2 + fxcsmadd f3, B6, A6, f3 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f1, A9, A7, f1 + fxcpmadd f2, A10, A8, f2 + fxcsmadd f3, A10, A8, f3 + .align 4 + +.L74: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L78 +#else + andi. r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L78 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdz- .L77 + .align 4 + +.L76: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L76 + .align 4 + +.L77: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + .align 4 + +.L78: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fsmfp f0, f1 + fsmtp f1, f24 + + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + + subi BO, BO, 4 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + + subi AO, AO, 4 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#endif + +#ifdef LN + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + + addi AO, AO, -4 * SIZE + + fxsmul f1, A2, f1 + fxcpnmsub f0, A2, f1, f0 + fxpmul f0, A1, f0 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + + addi AO, AO, -4 * SIZE + + fxpmul f0, A1, f0 + fxcsnmsub f1, A1, f0, f1 + + fxsmul f1, A2, f1 +#endif + +#ifdef RN + LFPDUX A1, BO, INC2 + LFPDUX A2, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxpmul f0, A1, f0 + fxcsnmsub f1, A1, f0, f1 + + fxsmul f1, A2, f1 +#endif + +#ifdef RT + LFPDUX A2, BO, INC2 + LFPDUX A1, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxsmul f1, A1, f1 + fxcpnmsub f0, A1, f1, f0 + fxpmul f0, A2, f0 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + + subi BO, BO, 4 * SIZE + + STFDUX f0, CO1, INC + STFDUX f1, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f1, CO2, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + + subi AO, AO, 4 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + + STFDUX f1, CO2, INC + STFSDUX f1, CO2, INC +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L80: + andi. I, M, 1 + beq .L89 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L84 +#else + +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L84 + +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L83 + .align 4 + +.L82: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A1, B2, f1 + LFPDUX B2, BO, INC2 + LFPDUX A1, AO, INC2 + fxcpmadd f2, A2, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A2, B4, f3 + LFPDUX B4, BO, INC2 + LFPDUX A2, AO, INC2 + + fxcpmadd f0, A3, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A3, B2, f1 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + fxcpmadd f2, A4, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A4, B4, f3 + LFPDUX B4, BO, INC2 + LFPDUX A4, AO, INC2 + bdnz+ .L82 + .align 4 + +.L83: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A1, B2, f1 + LFPDUX B2, BO, INC2 + fxcpmadd f2, A2, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A2, B4, f3 + LFPDUX B4, BO, INC2 + + fxcpmadd f0, A3, B1, f0 + fxcsmadd f1, A3, B2, f1 + fxcpmadd f2, A4, B3, f2 + fxcsmadd f3, A4, B4, f3 + .align 4 + +.L84: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L88 +#else + andi. r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L88 +#endif + + LFDX A1, AO, INC2 + LFPDUX B1, BO, INC2 + add AO, AO, INC + bdz- .L87 + .align 4 + +.L86: + fxcpmadd f0, A1, B1, f0 + LFDX A1, AO, INC2 + LFPDUX B1, BO, INC2 + add AO, AO, INC + bdnz+ .L86 + .align 4 + +.L87: + fxcpmadd f0, A1, B1, f0 + .align 4 + +.L88: + fpadd f0, f0, f1 + fpadd f2, f2, f3 + fpadd f0, f0, f2 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC2 + + fpsub f0, f16, f0 +#else + LFPDX f16, AO, INC2 + + fpsub f0, f16, f0 +#endif + +#ifdef LN + LFPDX A1, AO, INC2 + + fxpmul f0, A1, f0 +#endif + +#ifdef LT + LFPDX A1, AO, INC2 + + fxpmul f0, A1, f0 +#endif + +#ifdef RN + LFD A1, (2 + 0) * SIZE(BO) + LFD A2, (2 + 1) * SIZE(BO) + LFD A3, (2 + 3) * SIZE(BO) + + fsmtp f1, f0 + + fmul f0, A1, f0 + fnmsub f1, A2, f0, f1 + + fmul f1, A3, f1 + fsmfp f0, f1 +#endif + +#ifdef RT + LFD A1, (2 + 3) * SIZE(BO) + LFD A2, (2 + 2) * SIZE(BO) + LFD A3, (2 + 0) * SIZE(BO) + + fsmtp f1, f0 + + fmul f1, A1, f1 + fnmsub f0, A2, f1, f0 + + fmul f0, A3, f0 + fsmfp f0, f1 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC2 + + STFDUX f0, CO1, INC + STFSDUX f0, CO2, INC +#else + STFPDX f0, AO, INC2 + + STFDUX f0, CO1, INC + STFDUX f1, CO2, INC +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +.L89: +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 2 * SIZE +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + .align 4 + +.L90: + srawi. J, N, 2 + ble .L999 + .align 4 + +.L10: +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + +#if defined(LN) || defined(RT) + addi AORIG, A, -4 * SIZE +#else + addi AO, A, -4 * SIZE +#endif +#ifndef RT + add C, CO4, LDC +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 3 + ble .L20 + .align 4 + +.L11: +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + nop + + srawi. r0, KK, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#else + +#ifdef LN + slwi r0, K, 3 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 3 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + nop + + srawi. r0, TEMP, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#endif + + LFPDUX A1, AO, INC4 + fpmr f5, f0 + LFPDUX A3, AO, INC4 + fpmr f9, f0 + LFPDUX B1, BO, INC4 + fpmr f13, f0 + + LFPDUX A5, AO, INC4 + fpmr f2, f0 + LFPDUX A6, AO, INC4 + fpmr f6, f0 + LFPDUX B3, BO, INC4 + fpmr f10, f0 + LFPDUX A7, AO, INC4 + fpmr f14, f0 + + LFPDUX A8, AO, INC4 + fpmr f3, f0 + LFPDUX B5, BO, INC4 + fpmr f7, f0 + LFPDUX A9, AO, INC4 + fpmr f11, f0 + LFPDUX A2, AO2, INC4 + fpmr f15, f0 + LFPDUX B2, BO2, INC4 + bdz- .L13 + .align 4 + +.L12: + +## 1 ## + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + nop + fxcpmadd f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + nop + fxcpmadd f10, B2, A3, f10 + nop + fxcsmadd f14, B2, A3, f14 + nop + + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + LFPDUX A1, AO, INC4 + fxcsmadd f15, B2, A4, f15 + nop + +## 2 ## + + fxcpmadd f0, B3, A5, f0 + nop + fxcsmadd f4, B3, A5, f4 + nop + fxcpmadd f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A5, f12 + LFPDUX B1, BO, INC4 + + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + LFPDUX A3, AO, INC4 + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B3, A6, f2 + nop + fxcsmadd f6, B3, A6, f6 + nop + fxcpmadd f10, B4, A6, f10 + nop + fxcsmadd f14, B4, A6, f14 + nop + + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B4, A4, f11 + LFPDUX A5, AO, INC4 + fxcsmadd f15, B4, A4, f15 + nop + +## 3 ## + + fxcpmadd f0, B5, A7, f0 + nop + fxcsmadd f4, B5, A7, f4 + nop + fxcpmadd f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A7, f12 + LFPDUX B3, BO, INC4 + + fxcpmadd f1, B5, A2, f1 + nop + fxcsmadd f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A6, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B5, A8, f2 + nop + fxcsmadd f6, B5, A8, f6 + nop + fxcpmadd f10, B2, A8, f10 + nop + fxcsmadd f14, B2, A8, f14 + nop + + fxcpmadd f3, B5, A4, f3 + nop + fxcsmadd f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + LFPDUX A7, AO, INC4 + fxcsmadd f15, B2, A4, f15 + nop + +## 4 ## + fxcpmadd f0, B6, A9, f0 + nop + fxcsmadd f4, B6, A9, f4 + nop + fxcpmadd f8, B4, A9, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A9, f12 + LFPDUX B5, BO, INC4 + + fxcpmadd f1, B6, A2, f1 + nop + fxcsmadd f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + LFPDUX A8, AO, INC4 + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B6, A10, f2 + nop + fxcsmadd f6, B6, A10, f6 + nop + fxcpmadd f10, B4, A10, f10 + nop + fxcsmadd f14, B4, A10, f14 + nop + + fxcpmadd f3, B6, A4, f3 + LFPDUX A2, AO2, INC4 + fxcsmadd f7, B6, A4, f7 + LFPDUX A9, AO, INC4 + fxcpmadd f11, B4, A4, f11 + nop + fxcsmadd f15, B4, A4, f15 + bdnz+ .L12 + .align 4 + +.L13: +## 1 ## + + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + nop + fxcpmadd f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + nop + fxcpmadd f10, B2, A3, f10 + nop + fxcsmadd f14, B2, A3, f14 + nop + + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + nop + fxcsmadd f15, B2, A4, f15 + nop + +## 2 ## + + fxcpmadd f0, B3, A5, f0 + nop + fxcsmadd f4, B3, A5, f4 + nop + fxcpmadd f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A5, f12 + nop + + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + nop + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B3, A6, f2 + nop + fxcsmadd f6, B3, A6, f6 + nop + fxcpmadd f10, B4, A6, f10 + nop + fxcsmadd f14, B4, A6, f14 + nop + + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B4, A4, f11 + nop + fxcsmadd f15, B4, A4, f15 + nop + +## 3 ## + + fxcpmadd f0, B5, A7, f0 + nop + fxcsmadd f4, B5, A7, f4 + nop + fxcpmadd f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A7, f12 + nop + + fxcpmadd f1, B5, A2, f1 + nop + fxcsmadd f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + nop + + fxcsmadd f13, B2, A2, f13 + + fxcpmadd f2, B5, A8, f2 + nop + fxcsmadd f6, B5, A8, f6 + nop + fxcpmadd f10, B2, A8, f10 + nop + fxcsmadd f14, B2, A8, f14 + nop + + fxcpmadd f3, B5, A4, f3 + nop + fxcsmadd f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + nop + fxcsmadd f15, B2, A4, f15 + nop + +## 4 ## + + fxcpmadd f0, B6, A9, f0 + nop + fxcsmadd f4, B6, A9, f4 + nop + fxcpmadd f8, B4, A9, f8 + nop + fxcsmadd f12, B4, A9, f12 + nop + + fxcpmadd f1, B6, A2, f1 + nop + fxcsmadd f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + nop + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B6, A10, f2 + nop + fxcsmadd f6, B6, A10, f6 + nop + fxcpmadd f10, B4, A10, f10 + nop + fxcsmadd f14, B4, A10, f14 + nop + + fxcpmadd f3, B6, A4, f3 + nop + fxcsmadd f7, B6, A4, f7 + nop + fxcpmadd f11, B4, A4, f11 + nop + fxcsmadd f15, B4, A4, f15 + nop + .align 4 + +.L14: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L18 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L18 +#endif + .align 4 + +.L15: + LFPDUX A2, AO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A10, BO, INC4 + LFPDUX B4, BO2, INC4 + bdz- .L17 + .align 4 + +.L16: + fxcpmadd f0, A10, A2, f0 + fxcsmadd f4, A10, A2, f4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + fxcpmadd f1, A10, A4, f1 + fxcsmadd f5, A10, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + fxcpmadd f2, A10, A2, f2 + fxcsmadd f6, A10, A2, f6 + fxcpmadd f10, B4, A2, f10 + fxcsmadd f14, B4, A2, f14 + LFPDUX A2, AO, INC4 + + fxcpmadd f3, A10, A4, f3 + fxcsmadd f7, A10, A4, f7 + LFPDUX A10, BO, INC4 + fxcpmadd f11, B4, A4, f11 + fxcsmadd f15, B4, A4, f15 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + bdnz+ .L16 + .align 4 + +.L17: + fxcpmadd f0, A10, A2, f0 + fxcsmadd f4, A10, A2, f4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + fxcpmadd f1, A10, A4, f1 + fxcsmadd f5, A10, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + fxcpmadd f2, A10, A2, f2 + fxcsmadd f6, A10, A2, f6 + fxcpmadd f10, B4, A2, f10 + fxcsmadd f14, B4, A2, f14 + + fxcpmadd f3, A10, A4, f3 + fxcsmadd f7, A10, A4, f7 + fxcpmadd f11, B4, A4, f11 + fxcsmadd f15, B4, A4, f15 + .align 4 + +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 8 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 3 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + LFPDUX f16, BO, INC4 + fpmr f25, f1 + nop + fpmr f26, f2 + LFPDUX f17, BO2, INC4 + fpmr f27, f3 + nop + + fpmr f28, f8 + LFPDUX f18, BO, INC4 + fpmr f29, f9 + nop + fpmr f30, f10 + LFPDUX f19, BO2, INC4 + fpmr f31, f11 + nop + + fsmfp f0, f4 + LFPDUX f20, BO, INC4 + fsmfp f1, f5 + nop + fsmfp f2, f6 + LFPDUX f21, BO2, INC4 + fsmfp f3, f7 + nop + + fsmfp f8, f12 + LFPDUX f22, BO, INC4 + fsmfp f9, f13 + nop + fsmfp f10, f14 + LFPDUX f23, BO2, INC4 + fsmfp f11, f15 + nop + + fsmtp f4, f24 + LFPDUX f24, BO, INC4 + fsmtp f5, f25 + nop + fsmtp f6, f26 + LFPDUX f25, BO2, INC4 + fsmtp f7, f27 + nop + + fsmtp f12, f28 + LFPDUX f26, BO, INC4 + fsmtp f13, f29 + nop + fsmtp f14, f30 + LFPDUX f27, BO2, INC4 + fsmtp f15, f31 + nop + + fpsub f0, f16, f0 + LFPDUX f28, BO, INC4 + fpsub f8, f17, f8 + nop + fpsub f4, f18, f4 + LFPDUX f29, BO2, INC4 + fpsub f12, f19, f12 + nop + + fpsub f1, f20, f1 + LFPDUX f30, BO, INC4 + fpsub f9, f21, f9 + subi BO, BO, 32 * SIZE + fpsub f5, f22, f5 + LFPDUX f31, BO2, INC4 + fpsub f13, f23, f13 + subi BO2, BO2, 32 * SIZE + + fpsub f2, f24, f2 + fpsub f10, f25, f10 + fpsub f6, f26, f6 + fpsub f14, f27, f14 + fpsub f3, f28, f3 + fpsub f11, f29, f11 + fpsub f7, f30, f7 + fpsub f15, f31, f15 + +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + LFPDUX f20, AO, INC4 + LFPDUX f21, AO2, INC4 + LFPDUX f22, AO, INC4 + LFPDUX f23, AO2, INC4 + + fpsub f0, f16, f0 + LFPDUX f24, AO, INC4 + fpsub f1, f17, f1 + LFPDUX f25, AO2, INC4 + fpsub f2, f18, f2 + LFPDUX f26, AO, INC4 + fpsub f3, f19, f3 + LFPDUX f27, AO2, INC4 + fpsub f4, f20, f4 + LFPDUX f28, AO, INC4 + fpsub f5, f21, f5 + LFPDUX f29, AO2, INC4 + fpsub f6, f22, f6 + LFPDUX f30, AO, INC4 + fpsub f7, f23, f7 + LFPDUX f31, AO2, INC4 + + fpsub f8, f24, f8 + subi AO, AO, 32 * SIZE + fpsub f9, f25, f9 + subi AO2, AO2, 32 * SIZE + fpsub f10, f26, f10 + fpsub f11, f27, f11 + fpsub f12, f28, f12 + fpsub f13, f29, f13 + fpsub f14, f30, f14 + fpsub f15, f31, f15 +#endif + +#ifdef LN + addi AO, AO, 68 * SIZE + addi AO2, AO2, 68 * SIZE + + LFPDUX A1, AO2, INCM4 + LFPDUX A2, AO, INCM4 + LFPDUX A3, AO2, INCM4 + LFPDUX A4, AO, INCM4 + LFPDUX A5, AO2, INCM4 + LFPDUX A6, AO, INCM4 + LFPDUX A7, AO2, INCM4 + LFPDUX A8, AO, INCM4 + + fxsmul f7, A1, f7 + fxsmul f15, A1, f15 + + fxcpnmsub f3, A1, f7, f3 + fxcpnmsub f11, A1, f15, f11 + + fxcsnmsub f6, A2, f7, f6 + fxcsnmsub f14, A2, f15, f14 + + fxcpnmsub f2, A2, f7, f2 + fxcpnmsub f10, A2, f15, f10 + + fxcsnmsub f5, A3, f7, f5 + fxcsnmsub f13, A3, f15, f13 + + fxcpnmsub f1, A3, f7, f1 + fxcpnmsub f9, A3, f15, f9 + + fxcsnmsub f4, A4, f7, f4 + fxcsnmsub f12, A4, f15, f12 + + fxcpnmsub f0, A4, f7, f0 + fxcpnmsub f8, A4, f15, f8 + + fxpmul f3, A5, f3 + fxpmul f11, A5, f11 + + fxcsnmsub f6, A6, f3, f6 + fxcsnmsub f14, A6, f11, f14 + + fxcpnmsub f2, A6, f3, f2 + fxcpnmsub f10, A6, f11, f10 + + fxcsnmsub f5, A7, f3, f5 + fxcsnmsub f13, A7, f11, f13 + + fxcpnmsub f1, A7, f3, f1 + fxcpnmsub f9, A7, f11, f9 + + fxcsnmsub f4, A8, f3, f4 + fxcsnmsub f12, A8, f11, f12 + + fxcpnmsub f0, A8, f3, f0 + fxcpnmsub f8, A8, f11, f8 + + add AO2, AO2, INCM4 + LFPDUX A1, AO, INCM4 + LFPDUX A2, AO2, INCM4 + LFPDUX A3, AO, INCM4 + + add AO2, AO2, INCM4 + LFPDUX A4, AO, INCM4 + LFPDUX A5, AO2, INCM4 + LFPDUX A6, AO, INCM4 + + add AO2, AO2, INCM4 + add AO, AO, INCM4 + LFPDUX A7, AO2, INCM4 + LFPDUX A8, AO, INCM4 + + + fxsmul f6, A1, f6 + fxsmul f14, A1, f14 + + fxcpnmsub f2, A1, f6, f2 + fxcpnmsub f10, A1, f14, f10 + + fxcsnmsub f5, A2, f6, f5 + fxcsnmsub f13, A2, f14, f13 + + fxcpnmsub f1, A2, f6, f1 + fxcpnmsub f9, A2, f14, f9 + + fxcsnmsub f4, A3, f6, f4 + fxcsnmsub f12, A3, f14, f12 + + fxcpnmsub f0, A3, f6, f0 + fxcpnmsub f8, A3, f14, f8 + + fxpmul f2, A4, f2 + fxpmul f10, A4, f10 + + fxcsnmsub f5, A5, f2, f5 + fxcsnmsub f13, A5, f10, f13 + + fxcpnmsub f1, A5, f2, f1 + fxcpnmsub f9, A5, f10, f9 + + fxcsnmsub f4, A6, f2, f4 + fxcsnmsub f12, A6, f10, f12 + + fxcpnmsub f0, A6, f2, f0 + fxcpnmsub f8, A6, f10, f8 + + fxsmul f5, A7, f5 + fxsmul f13, A7, f13 + + fxcpnmsub f1, A7, f5, f1 + fxcpnmsub f9, A7, f13, f9 + + fxcsnmsub f4, A8, f5, f4 + fxcsnmsub f12, A8, f13, f12 + + fxcpnmsub f0, A8, f5, f0 + fxcpnmsub f8, A8, f13, f8 + + add AO2, AO2, INCM4 + add AO, AO, INCM4 + LFPDUX A1, AO2, INCM4 + LFPDUX A2, AO, INCM4 + + subi AO2, AO2, 8 * SIZE + add AO, AO, INCM4 + LFPDUX A3, AO, INCM4 + + subi AO2, AO2, 8 * SIZE + add AO, AO, INCM4 + LFPDUX A4, AO, INCM4 + + addi AO, AO, -4 * SIZE + addi AO2, AO2, -4 * SIZE + + fxpmul f1, A1, f1 + fxpmul f9, A1, f9 + + fxcsnmsub f4, A2, f1, f4 + fxcsnmsub f12, A2, f9, f12 + + fxcpnmsub f0, A2, f1, f0 + fxcpnmsub f8, A2, f9, f8 + + fxsmul f4, A3, f4 + fxsmul f12, A3, f12 + + fxcpnmsub f0, A3, f4, f0 + fxcpnmsub f8, A3, f12, f8 + + fxpmul f0, A4, f0 + fxpmul f8, A4, f8 + +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX A4, AO2, INC4 + + LFPDUX A5, AO, INC4 + LFPDUX A6, AO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A8, AO2, INC4 + + fxpmul f0, A1, f0 + fxpmul f8, A1, f8 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f12, A1, f8, f12 + + fxcpnmsub f1, A2, f0, f1 + fxcpnmsub f9, A2, f8, f9 + + fxcsnmsub f5, A2, f0, f5 + fxcsnmsub f13, A2, f8, f13 + + fxcpnmsub f2, A3, f0, f2 + fxcpnmsub f10, A3, f8, f10 + + fxcsnmsub f6, A3, f0, f6 + fxcsnmsub f14, A3, f8, f14 + + fxcpnmsub f3, A4, f0, f3 + fxcpnmsub f11, A4, f8, f11 + + fxcsnmsub f7, A4, f0, f7 + fxcsnmsub f15, A4, f8, f15 + + fxsmul f4, A5, f4 + fxsmul f12, A5, f12 + + fxcpnmsub f1, A6, f4, f1 + fxcpnmsub f9, A6, f12, f9 + + fxcsnmsub f5, A6, f4, f5 + fxcsnmsub f13, A6, f12, f13 + + fxcpnmsub f2, A7, f4, f2 + fxcpnmsub f10, A7, f12, f10 + + fxcsnmsub f6, A7, f4, f6 + fxcsnmsub f14, A7, f12, f14 + + fxcpnmsub f3, A8, f4, f3 + fxcpnmsub f11, A8, f12, f11 + + fxcsnmsub f7, A8, f4, f7 + fxcsnmsub f15, A8, f12, f15 + + add AO, AO, INC4 + LFPDUX A1, AO2, INC4 + LFPDUX A2, AO, INC4 + LFPDUX A3, AO2, INC4 + + add AO, AO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A5, AO, INC4 + LFPDUX A6, AO2, INC4 + + add AO, AO, INC4 + add AO2, AO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A8, AO2, INC4 + + fxpmul f1, A1, f1 + fxpmul f9, A1, f9 + + fxcsnmsub f5, A1, f1, f5 + fxcsnmsub f13, A1, f9, f13 + + fxcpnmsub f2, A2, f1, f2 + fxcpnmsub f10, A2, f9, f10 + + fxcsnmsub f6, A2, f1, f6 + fxcsnmsub f14, A2, f9, f14 + + fxcpnmsub f3, A3, f1, f3 + fxcpnmsub f11, A3, f9, f11 + + fxcsnmsub f7, A3, f1, f7 + fxcsnmsub f15, A3, f9, f15 + + fxsmul f5, A4, f5 + fxsmul f13, A4, f13 + + fxcpnmsub f2, A5, f5, f2 + fxcpnmsub f10, A5, f13, f10 + + fxcsnmsub f6, A5, f5, f6 + fxcsnmsub f14, A5, f13, f14 + + fxcpnmsub f3, A6, f5, f3 + fxcpnmsub f11, A6, f13, f11 + + fxcsnmsub f7, A6, f5, f7 + fxcsnmsub f15, A6, f13, f15 + + fxpmul f2, A7, f2 + fxpmul f10, A7, f10 + + fxcsnmsub f6, A7, f2, f6 + fxcsnmsub f14, A7, f10, f14 + + fxcpnmsub f3, A8, f2, f3 + fxcpnmsub f11, A8, f10, f11 + + fxcsnmsub f7, A8, f2, f7 + fxcsnmsub f15, A8, f10, f15 + + add AO, AO, INC4 + add AO2, AO2, INC4 + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + + addi AO, AO, 8 * SIZE + addi AO2, AO2, 4 * SIZE + LFPDUX A3, AO2, INC4 + + addi AO, AO, 8 * SIZE + addi AO2, AO2, 4 * SIZE + LFPDUX A4, AO2, INC4 + + subi AO, AO, 64 * SIZE + subi AO2, AO2, 64 * SIZE + + fxsmul f6, A1, f6 + fxsmul f14, A1, f14 + + fxcpnmsub f3, A2, f6, f3 + fxcpnmsub f11, A2, f14, f11 + + fxcsnmsub f7, A2, f6, f7 + fxcsnmsub f15, A2, f14, f15 + + fxpmul f3, A3, f3 + fxpmul f11, A3, f11 + + fxcsnmsub f7, A3, f3, f7 + fxcsnmsub f15, A3, f11, f15 + + fxsmul f7, A4, f7 + fxsmul f15, A4, f15 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + LFPDUX A3, BO, INC4 + LFPDUX A4, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A5, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A6, BO2, INC4 + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxpmul f2, A1, f2 + fxpmul f3, A1, f3 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f5, A1, f1, f5 + fxcsnmsub f6, A1, f2, f6 + fxcsnmsub f7, A1, f3, f7 + + fxcpnmsub f8, A2, f0, f8 + fxcpnmsub f9, A2, f1, f9 + fxcpnmsub f10, A2, f2, f10 + fxcpnmsub f11, A2, f3, f11 + + fxcsnmsub f12, A2, f0, f12 + fxcsnmsub f13, A2, f1, f13 + fxcsnmsub f14, A2, f2, f14 + fxcsnmsub f15, A2, f3, f15 + + fxsmul f4, A3, f4 + fxsmul f5, A3, f5 + fxsmul f6, A3, f6 + fxsmul f7, A3, f7 + + fxcpnmsub f8, A4, f4, f8 + fxcpnmsub f9, A4, f5, f9 + fxcpnmsub f10, A4, f6, f10 + fxcpnmsub f11, A4, f7, f11 + + fxcsnmsub f12, A4, f4, f12 + fxcsnmsub f13, A4, f5, f13 + fxcsnmsub f14, A4, f6, f14 + fxcsnmsub f15, A4, f7, f15 + + fxpmul f8, A5, f8 + fxpmul f9, A5, f9 + fxpmul f10, A5, f10 + fxpmul f11, A5, f11 + + fxcsnmsub f12, A5, f8, f12 + fxcsnmsub f13, A5, f9, f13 + fxcsnmsub f14, A5, f10, f14 + fxcsnmsub f15, A5, f11, f15 + + fxsmul f12, A6, f12 + fxsmul f13, A6, f13 + fxsmul f14, A6, f14 + fxsmul f15, A6, f15 + +#endif + +#ifdef RT + addi BO, BO, 20 * SIZE + addi BO2, BO2, 20 * SIZE + + LFPDUX A1, BO2, INCM4 + LFPDUX A2, BO, INCM4 + + LFPDUX A3, BO2, INCM4 + LFPDUX A4, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A5, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A6, BO, INCM4 + subi BO, BO, 4 * SIZE + subi BO2, BO2, 4 * SIZE + + fxsmul f12, A1, f12 + fxsmul f13, A1, f13 + fxsmul f14, A1, f14 + fxsmul f15, A1, f15 + + fxcpnmsub f8, A1, f12, f8 + fxcpnmsub f9, A1, f13, f9 + fxcpnmsub f10, A1, f14, f10 + fxcpnmsub f11, A1, f15, f11 + + fxcsnmsub f4, A2, f12, f4 + fxcsnmsub f5, A2, f13, f5 + fxcsnmsub f6, A2, f14, f6 + fxcsnmsub f7, A2, f15, f7 + + fxcpnmsub f0, A2, f12, f0 + fxcpnmsub f1, A2, f13, f1 + fxcpnmsub f2, A2, f14, f2 + fxcpnmsub f3, A2, f15, f3 + + fxpmul f8, A3, f8 + fxpmul f9, A3, f9 + fxpmul f10, A3, f10 + fxpmul f11, A3, f11 + + fxcsnmsub f4, A4, f8, f4 + fxcsnmsub f5, A4, f9, f5 + fxcsnmsub f6, A4, f10, f6 + fxcsnmsub f7, A4, f11, f7 + + fxcpnmsub f0, A4, f8, f0 + fxcpnmsub f1, A4, f9, f1 + fxcpnmsub f2, A4, f10, f2 + fxcpnmsub f3, A4, f11, f3 + + fxsmul f4, A5, f4 + fxsmul f5, A5, f5 + fxsmul f6, A5, f6 + fxsmul f7, A5, f7 + + fxcpnmsub f0, A5, f4, f0 + fxcpnmsub f1, A5, f5, f1 + fxcpnmsub f2, A5, f6, f2 + fxcpnmsub f3, A5, f7, f3 + + fxpmul f0, A6, f0 + fxpmul f1, A6, f1 + fxpmul f2, A6, f2 + fxpmul f3, A6, f3 + +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE + subi CO3, CO3, 8 * SIZE + subi CO4, CO4, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f4, BO, INC4 + STFPDUX f12, BO2, INC4 + STFPDUX f1, BO, INC4 + STFPDUX f9, BO2, INC4 + STFPDUX f5, BO, INC4 + STFPDUX f13, BO2, INC4 + STFPDUX f2, BO, INC4 + STFPDUX f10, BO2, INC4 + STFPDUX f6, BO, INC4 + STFPDUX f14, BO2, INC4 + STFPDUX f3, BO, INC4 + STFPDUX f11, BO2, INC4 + STFPDUX f7, BO, INC4 + STFPDUX f15, BO2, INC4 + + subi BO, BO, 32 * SIZE + subi BO2, BO2, 32 * SIZE + + STFDUX f0, CO1, INC + STFDUX f4, CO1, INC + STFDUX f1, CO1, INC + STFDUX f5, CO1, INC + STFDUX f2, CO1, INC + STFDUX f6, CO1, INC + STFDUX f3, CO1, INC + STFDUX f7, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f4, CO2, INC + STFSDUX f1, CO2, INC + STFSDUX f5, CO2, INC + STFSDUX f2, CO2, INC + STFSDUX f6, CO2, INC + STFSDUX f3, CO2, INC + STFSDUX f7, CO2, INC + + STFDUX f8, CO3, INC + STFDUX f12, CO3, INC + STFDUX f9, CO3, INC + STFDUX f13, CO3, INC + STFDUX f10, CO3, INC + STFDUX f14, CO3, INC + STFDUX f11, CO3, INC + STFDUX f15, CO3, INC + + STFSDUX f8, CO4, INC + STFSDUX f12, CO4, INC + STFSDUX f9, CO4, INC + STFSDUX f13, CO4, INC + STFSDUX f10, CO4, INC + STFSDUX f14, CO4, INC + STFSDUX f11, CO4, INC + STFSDUX f15, CO4, INC + +#else + STFPDUX f0, AO, INC4 + STFPDUX f1, AO2, INC4 + STFPDUX f2, AO, INC4 + STFPDUX f3, AO2, INC4 + STFPDUX f4, AO, INC4 + STFPDUX f5, AO2, INC4 + STFPDUX f6, AO, INC4 + STFPDUX f7, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f9, AO2, INC4 + STFPDUX f10, AO, INC4 + STFPDUX f11, AO2, INC4 + STFPDUX f12, AO, INC4 + STFPDUX f13, AO2, INC4 + STFPDUX f14, AO, INC4 + STFPDUX f15, AO2, INC4 + + subi AO, AO, 32 * SIZE + subi AO2, AO2, 32 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + STFDUX f5, CO2, INC + STFSDUX f5, CO2, INC + STFDUX f6, CO2, INC + STFSDUX f6, CO2, INC + STFDUX f7, CO2, INC + STFSDUX f7, CO2, INC + + STFDUX f8, CO3, INC + STFSDUX f8, CO3, INC + STFDUX f9, CO3, INC + STFSDUX f9, CO3, INC + STFDUX f10, CO3, INC + STFSDUX f10, CO3, INC + STFDUX f11, CO3, INC + STFSDUX f11, CO3, INC + + STFDUX f12, CO4, INC + STFSDUX f12, CO4, INC + STFDUX f13, CO4, INC + STFSDUX f13, CO4, INC + STFDUX f14, CO4, INC + STFSDUX f14, CO4, INC + STFDUX f15, CO4, INC + STFSDUX f15, CO4, INC +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE + subi CO3, CO3, 8 * SIZE + subi CO4, CO4, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 3 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 8 +#endif + +#ifdef LN + subi KK, KK, 8 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L11 + .align 4 + +.L20: + andi. I, M, 4 + beq .L30 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, KK, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L24 +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + srawi. r0, TEMP, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L24 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX B3, BO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A5, AO, INC4 + LFPDUX B5, BO, INC4 + LFPDUX A6, AO2, INC4 + LFPDUX B6, BO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A9, BO, INC4 + LFPDUX A10, BO2, INC4 + bdz- .L23 + .align 4 + +.L22: + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + fxcpmadd f8, B2, A1, f8 + nop + fxcsmadd f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + fxcpmadd f9, B2, A2, f9 + nop + fxcsmadd f13, B2, A2, f13 + LFPDUX B2, BO2, INC4 + + fxcpmadd f0, B3, A3, f0 + nop + fxcsmadd f4, B3, A3, f4 + LFPDUX A2, AO2, INC4 + fxcpmadd f8, B4, A3, f8 + nop + fxcsmadd f12, B4, A3, f12 + LFPDUX A3, AO, INC4 + + fxcpmadd f1, B3, A4, f1 + nop + fxcsmadd f5, B3, A4, f5 + LFPDUX B3, BO, INC4 + fxcpmadd f9, B4, A4, f9 + nop + fxcsmadd f13, B4, A4, f13 + LFPDUX B4, BO2, INC4 + + fxcpmadd f0, B5, A5, f0 + nop + fxcsmadd f4, B5, A5, f4 + LFPDUX A4, AO2, INC4 + fxcpmadd f8, B6, A5, f8 + nop + fxcsmadd f12, B6, A5, f12 + LFPDUX A5, AO, INC4 + + fxcpmadd f1, B5, A6, f1 + nop + fxcsmadd f5, B5, A6, f5 + LFPDUX B5, BO, INC4 + fxcpmadd f9, B6, A6, f9 + nop + fxcsmadd f13, B6, A6, f13 + LFPDUX B6, BO2, INC4 + + fxcpmadd f0, A9, A7, f0 + nop + fxcsmadd f4, A9, A7, f4 + LFPDUX A6, AO2, INC4 + fxcpmadd f8, A10, A7, f8 + nop + fxcsmadd f12, A10, A7, f12 + LFPDUX A7, AO, INC4 + + fxcpmadd f1, A9, A8, f1 + nop + fxcsmadd f5, A9, A8, f5 + LFPDUX A9, BO, INC4 + fxcpmadd f9, A10, A8, f9 + nop + fxcsmadd f13, A10, A8, f13 + LFPDUX A10, BO2, INC4 + bdnz+ .L22 + .align 4 + +.L23: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + + fxcpmadd f0, B3, A3, f0 + fxcsmadd f4, B3, A3, f4 + fxcpmadd f8, B4, A3, f8 + fxcsmadd f12, B4, A3, f12 + + fxcpmadd f1, B3, A4, f1 + fxcsmadd f5, B3, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, A4, f13 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f4, B5, A5, f4 + fxcpmadd f8, B6, A5, f8 + fxcsmadd f12, B6, A5, f12 + + fxcpmadd f1, B5, A6, f1 + fxcsmadd f5, B5, A6, f5 + fxcpmadd f9, B6, A6, f9 + fxcsmadd f13, B6, A6, f13 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f4, A9, A7, f4 + fxcpmadd f8, A10, A7, f8 + fxcsmadd f12, A10, A7, f12 + + fxcpmadd f1, A9, A8, f1 + fxcsmadd f5, A9, A8, f5 + fxcpmadd f9, A10, A8, f9 + fxcsmadd f13, A10, A8, f13 + .align 4 + +.L24: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L28 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L28 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + bdz- .L27 + .align 4 + +.L26: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + bdnz+ .L26 + .align 4 + +.L27: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + .align 4 + +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fpmr f25, f1 + fpmr f28, f8 + fpmr f29, f9 + + fsmfp f0, f4 + fsmfp f1, f5 + fsmfp f8, f12 + fsmfp f9, f13 + + fsmtp f4, f24 + fsmtp f5, f25 + fsmtp f12, f28 + fsmtp f13, f29 + + LFPDUX f16, BO, INC4 + LFPDUX f17, BO2, INC4 + LFPDUX f18, BO, INC4 + LFPDUX f19, BO2, INC4 + + LFPDUX f20, BO, INC4 + LFPDUX f21, BO2, INC4 + LFPDUX f22, BO, INC4 + LFPDUX f23, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + fpsub f0, f16, f0 + fpsub f8, f17, f8 + fpsub f4, f18, f4 + fpsub f12, f19, f12 + + fpsub f1, f20, f1 + fpsub f9, f21, f9 + fpsub f5, f22, f5 + fpsub f13, f23, f13 +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + LFPDUX f20, AO, INC4 + LFPDUX f21, AO2, INC4 + LFPDUX f22, AO, INC4 + LFPDUX f23, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f4, f18, f4 + fpsub f5, f19, f5 + + fpsub f8, f20, f8 + fpsub f9, f21, f9 + fpsub f12, f22, f12 + fpsub f13, f23, f13 +#endif + +#ifdef LN + addi AO, AO, 20 * SIZE + addi AO2, AO2, 20 * SIZE + + LFPDUX A1, AO2, INCM4 + LFPDUX A2, AO, INCM4 + LFPDUX A3, AO2, INCM4 + LFPDUX A4, AO, INCM4 + + add AO2, AO2, INCM4 + LFPDUX A5, AO, INCM4 + add AO2, AO2, INCM4 + LFPDUX A6, AO, INCM4 + + addi AO, AO, -4 * SIZE + addi AO2, AO2, -4 * SIZE + + fxsmul f5, A1, f5 + fxsmul f13, A1, f13 + + fxcpnmsub f1, A1, f5, f1 + fxcpnmsub f9, A1, f13, f9 + + fxcsnmsub f4, A2, f5, f4 + fxcsnmsub f12, A2, f13, f12 + + fxcpnmsub f0, A2, f5, f0 + fxcpnmsub f8, A2, f13, f8 + + fxpmul f1, A3, f1 + fxpmul f9, A3, f9 + + fxcsnmsub f4, A4, f1, f4 + fxcsnmsub f12, A4, f9, f12 + + fxcpnmsub f0, A4, f1, f0 + fxcpnmsub f8, A4, f9, f8 + + fxsmul f4, A5, f4 + fxsmul f12, A5, f12 + + fxcpnmsub f0, A5, f4, f0 + fxcpnmsub f8, A5, f12, f8 + + fxpmul f0, A6, f0 + fxpmul f8, A6, f8 +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX A4, AO2, INC4 + + add AO, AO, INC4 + LFPDUX A5, AO2, INC4 + add AO, AO, INC4 + LFPDUX A6, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE + + fxpmul f0, A1, f0 + fxpmul f8, A1, f8 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f12, A1, f8, f12 + + fxcpnmsub f1, A2, f0, f1 + fxcpnmsub f9, A2, f8, f9 + + fxcsnmsub f5, A2, f0, f5 + fxcsnmsub f13, A2, f8, f13 + + fxsmul f4, A3, f4 + fxsmul f12, A3, f12 + + fxcpnmsub f1, A4, f4, f1 + fxcpnmsub f9, A4, f12, f9 + + fxcsnmsub f5, A4, f4, f5 + fxcsnmsub f13, A4, f12, f13 + + fxpmul f1, A5, f1 + fxpmul f9, A5, f9 + + fxcsnmsub f5, A5, f1, f5 + fxcsnmsub f13, A5, f9, f13 + + fxsmul f5, A6, f5 + fxsmul f13, A6, f13 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + LFPDUX A3, BO, INC4 + LFPDUX A4, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A5, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A6, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f5, A1, f1, f5 + + fxcpnmsub f8, A2, f0, f8 + fxcpnmsub f9, A2, f1, f9 + fxcsnmsub f12, A2, f0, f12 + fxcsnmsub f13, A2, f1, f13 + + fxsmul f4, A3, f4 + fxsmul f5, A3, f5 + fxcpnmsub f8, A4, f4, f8 + fxcpnmsub f9, A4, f5, f9 + + fxcsnmsub f12, A4, f4, f12 + fxcsnmsub f13, A4, f5, f13 + + fxpmul f8, A5, f8 + fxpmul f9, A5, f9 + fxcsnmsub f12, A5, f8, f12 + fxcsnmsub f13, A5, f9, f13 + + fxsmul f12, A6, f12 + fxsmul f13, A6, f13 +#endif + +#ifdef RT + addi BO, BO, 20 * SIZE + addi BO2, BO2, 20 * SIZE + + LFPDUX A1, BO2, INCM4 + LFPDUX A2, BO, INCM4 + + LFPDUX A3, BO2, INCM4 + LFPDUX A4, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A5, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A6, BO, INCM4 + subi BO, BO, 4 * SIZE + subi BO2, BO2, 4 * SIZE + + fxsmul f12, A1, f12 + fxsmul f13, A1, f13 + fxcpnmsub f8, A1, f12, f8 + fxcpnmsub f9, A1, f13, f9 + + fxcsnmsub f4, A2, f12, f4 + fxcsnmsub f5, A2, f13, f5 + fxcpnmsub f0, A2, f12, f0 + fxcpnmsub f1, A2, f13, f1 + + fxpmul f8, A3, f8 + fxpmul f9, A3, f9 + fxcsnmsub f4, A4, f8, f4 + fxcsnmsub f5, A4, f9, f5 + + fxcpnmsub f0, A4, f8, f0 + fxcpnmsub f1, A4, f9, f1 + + fxsmul f4, A5, f4 + fxsmul f5, A5, f5 + fxcpnmsub f0, A5, f4, f0 + fxcpnmsub f1, A5, f5, f1 + + fxpmul f0, A6, f0 + fxpmul f1, A6, f1 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f4, BO, INC4 + STFPDUX f12, BO2, INC4 + STFPDUX f1, BO, INC4 + STFPDUX f9, BO2, INC4 + STFPDUX f5, BO, INC4 + STFPDUX f13, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + STFDUX f0, CO1, INC + STFDUX f4, CO1, INC + STFDUX f1, CO1, INC + STFDUX f5, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f4, CO2, INC + STFSDUX f1, CO2, INC + STFSDUX f5, CO2, INC + + STFDUX f8, CO3, INC + STFDUX f12, CO3, INC + STFDUX f9, CO3, INC + STFDUX f13, CO3, INC + + STFSDUX f8, CO4, INC + STFSDUX f12, CO4, INC + STFSDUX f9, CO4, INC + STFSDUX f13, CO4, INC +#else + STFPDUX f0, AO, INC4 + STFPDUX f1, AO2, INC4 + STFPDUX f4, AO, INC4 + STFPDUX f5, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f9, AO2, INC4 + STFPDUX f12, AO, INC4 + STFPDUX f13, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + STFDUX f5, CO2, INC + STFSDUX f5, CO2, INC + + STFDUX f8, CO3, INC + STFSDUX f8, CO3, INC + STFDUX f9, CO3, INC + STFSDUX f9, CO3, INC + STFDUX f12, CO4, INC + STFSDUX f12, CO4, INC + STFDUX f13, CO4, INC + STFSDUX f13, CO4, INC +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L30: + andi. I, M, 2 + beq .L40 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L34 +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L34 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L33 + .align 4 + +.L32: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX B1, BO, INC4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + LFPDUX B2, BO2, INC4 + LFPDUX A1, AO, INC4 + + fxcpmadd f0, B3, A2, f0 + fxcsmadd f4, B3, A2, f4 + LFPDUX B3, BO, INC4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + LFPDUX B4, BO2, INC4 + LFPDUX A2, AO2, INC4 + + fxcpmadd f0, A5, A3, f0 + fxcsmadd f4, A5, A3, f4 + LFPDUX A5, BO, INC4 + fxcpmadd f8, A6, A3, f8 + fxcsmadd f12, A6, A3, f12 + LFPDUX A6, BO2, INC4 + LFPDUX A3, AO, INC4 + + fxcpmadd f0, A7, A4, f0 + fxcsmadd f4, A7, A4, f4 + LFPDUX A7, BO, INC4 + fxcpmadd f8, A8, A4, f8 + fxcsmadd f12, A8, A4, f12 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L32 + .align 4 + +.L33: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + + fxcpmadd f0, B3, A2, f0 + fxcsmadd f4, B3, A2, f4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + + fxcpmadd f0, A5, A3, f0 + fxcsmadd f4, A5, A3, f4 + fxcpmadd f8, A6, A3, f8 + fxcsmadd f12, A6, A3, f12 + + fxcpmadd f0, A7, A4, f0 + fxcsmadd f4, A7, A4, f4 + fxcpmadd f8, A8, A4, f8 + fxcsmadd f12, A8, A4, f12 + .align 4 + +.L34: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L38 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L38 +#endif + + LFPDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdz- .L37 + .align 4 + +.L36: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX B1, BO, INC4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + LFPDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdnz+ .L36 + .align 4 + +.L37: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + .align 4 + +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fpmr f28, f8 + + fsmfp f0, f4 + fsmfp f8, f12 + fsmtp f4, f24 + fsmtp f12, f28 + + LFPDUX f16, BO, INC4 + LFPDUX f17, BO2, INC4 + LFPDUX f18, BO, INC4 + LFPDUX f19, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f8, f17, f8 + fpsub f4, f18, f4 + fpsub f12, f19, f12 +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f4, f17, f4 + fpsub f8, f18, f8 + fpsub f12, f19, f12 +#endif + +#ifdef LN + addi AO, AO, 8 * SIZE + addi AO2, AO2, 8 * SIZE + + LFPDUX A1, AO2, INCM4 + LFPDUX A2, AO, INCM4 + + addi AO, AO, -4 * SIZE + addi AO2, AO2, -4 * SIZE + + fxsmul f4, A1, f4 + fxsmul f12, A1, f12 + + fxcpnmsub f0, A1, f4, f0 + fxcpnmsub f8, A1, f12, f8 + + fxpmul f0, A2, f0 + fxpmul f8, A2, f8 +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + + subi AO, AO, 4 * SIZE + subi AO2, AO2, 4 * SIZE + + fxpmul f0, A1, f0 + fxpmul f8, A1, f8 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f12, A1, f8, f12 + + fxsmul f4, A2, f4 + fxsmul f12, A2, f12 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + LFPDUX A3, BO, INC4 + LFPDUX A4, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A5, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A6, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + fxpmul f0, A1, f0 + fxcsnmsub f4, A1, f0, f4 + fxcpnmsub f8, A2, f0, f8 + fxcsnmsub f12, A2, f0, f12 + + fxsmul f4, A3, f4 + fxcpnmsub f8, A4, f4, f8 + fxcsnmsub f12, A4, f4, f12 + + fxpmul f8, A5, f8 + fxcsnmsub f12, A5, f8, f12 + fxsmul f12, A6, f12 +#endif + +#ifdef RT + addi BO, BO, 20 * SIZE + addi BO2, BO2, 20 * SIZE + + LFPDUX A1, BO2, INCM4 + LFPDUX A2, BO, INCM4 + + LFPDUX A3, BO2, INCM4 + LFPDUX A4, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A5, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A6, BO, INCM4 + subi BO, BO, 4 * SIZE + subi BO2, BO2, 4 * SIZE + + fxsmul f12, A1, f12 + fxcpnmsub f8, A1, f12, f8 + fxcsnmsub f4, A2, f12, f4 + fxcpnmsub f0, A2, f12, f0 + + fxpmul f8, A3, f8 + fxcsnmsub f4, A4, f8, f4 + fxcpnmsub f0, A4, f8, f0 + + fxsmul f4, A5, f4 + fxcpnmsub f0, A5, f4, f0 + fxpmul f0, A6, f0 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f4, BO, INC4 + STFPDUX f12, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + STFDUX f0, CO1, INC + STFDUX f4, CO1, INC + STFSDUX f0, CO2, INC + STFSDUX f4, CO2, INC + + STFDUX f8, CO3, INC + STFDUX f12, CO3, INC + STFSDUX f8, CO4, INC + STFSDUX f12, CO4, INC + +#else + STFPDUX f0, AO, INC4 + STFPDUX f4, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f12, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + + STFDUX f8, CO3, INC + STFSDUX f8, CO3, INC + STFDUX f12, CO4, INC + STFSDUX f12, CO4, INC +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L40: + andi. I, M, 1 + beq .L49 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L44 +#else + +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, BO, - 4 * SIZE + fpmr f2, f0 + addi BO2, BO, 2 * SIZE + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L44 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L43 + .align 4 + +.L42: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A1, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A1, B4, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A1, AO, INC4 + + fxcpmadd f0, A2, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A2, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A2, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A2, A8, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A2, AO2, INC4 + + fxcpmadd f0, A3, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A3, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A3, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A3, B4, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A3, AO, INC4 + + fxcpmadd f0, A4, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A4, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A4, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A4, A8, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L42 + .align 4 + +.L43: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A1, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A1, B4, f3 + LFPDUX B4, BO2, INC4 + + fxcpmadd f0, A2, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A2, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A2, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A2, A8, f3 + LFPDUX A8, BO2, INC4 + + fxcpmadd f0, A3, B1, f0 + fxcpmadd f1, A3, B2, f1 + fxcsmadd f2, A3, B3, f2 + fxcsmadd f3, A3, B4, f3 + + fxcpmadd f0, A4, A5, f0 + fxcpmadd f1, A4, A6, f1 + fxcsmadd f2, A4, A7, f2 + fxcsmadd f3, A4, A8, f3 + .align 4 + +.L44: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L48 +#else + andi. r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L48 +#endif + + LFDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC + bdz- .L47 + .align 4 + +.L46: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC + bdnz+ .L46 + .align 4 + +.L47: + fxcpmadd f0, A1, B1, f0 + fxcpmadd f1, A1, B2, f1 + addi AO2, AO, 2 * SIZE + .align 4 + +.L48: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC4 + LFPDX f17, BO2, INC4 + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#else + LFPDX f16, AO, INC4 + LFPDX f17, AO2, INC4 + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#endif + +#if defined(LN) || defined(LT) + LFPDX A1, AO, INC4 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 +#endif + +#ifdef RN + LFD A1, (4 + 0) * SIZE(BO) + LFD A2, (4 + 1) * SIZE(BO) + LFD A3, (4 + 2) * SIZE(BO) + LFD A4, (4 + 3) * SIZE(BO) + + LFD A5, (4 + 5) * SIZE(BO) + LFD A6, (4 + 6) * SIZE(BO) + LFD A7, (4 + 7) * SIZE(BO) + LFD A8, (4 + 10) * SIZE(BO) + + LFD A9, (4 + 11) * SIZE(BO) + LFD A10, (4 + 15) * SIZE(BO) + + fsmtp f2, f0 + fsmtp f3, f1 + + fmul f0, A1, f0 + fnmsub f2, A2, f0, f2 + fnmsub f1, A3, f0, f1 + fnmsub f3, A4, f0, f3 + + fmul f2, A5, f2 + fnmsub f1, A6, f2, f1 + fnmsub f3, A7, f2, f3 + + fmul f1, A8, f1 + fnmsub f3, A9, f1, f3 + + fmul f3, A10, f3 + + fsmfp f0, f2 + fsmfp f1, f3 +#endif + +#ifdef RT + LFD A1, (4 + 15) * SIZE(BO) + LFD A2, (4 + 14) * SIZE(BO) + LFD A3, (4 + 13) * SIZE(BO) + LFD A4, (4 + 12) * SIZE(BO) + + LFD A5, (4 + 10) * SIZE(BO) + LFD A6, (4 + 9) * SIZE(BO) + LFD A7, (4 + 8) * SIZE(BO) + LFD A8, (4 + 5) * SIZE(BO) + + LFD A9, (4 + 4) * SIZE(BO) + LFD A10, (4 + 0) * SIZE(BO) + + fsmtp f2, f0 + fsmtp f3, f1 + + fmul f3, A1, f3 + fnmsub f1, A2, f3, f1 + fnmsub f2, A3, f3, f2 + fnmsub f0, A4, f3, f0 + + fmul f1, A5, f1 + fnmsub f2, A6, f1, f2 + fnmsub f0, A7, f1, f0 + + fmul f2, A8, f2 + fnmsub f0, A9, f2, f0 + + fmul f0, A10, f0 + + fsmfp f0, f2 + fsmfp f1, f3 +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC4 + STFPDX f1, BO2, INC4 +#else + STFPDX f0, AO, INC4 + STFPDX f1, AO2, INC4 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO2, INC + STFDUX f1, CO3, INC + STFSDUX f1, CO4, INC + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +.L49: +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 4 * SIZE +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + bgt+ .L10 + .align 4 + +.L999: + addi SP, SP, 12 + + lwzu r14, 4(SP) + lwzu r15, 4(SP) + + lwzu r16, 4(SP) + lwzu r17, 4(SP) + lwzu r18, 4(SP) + lwzu r19, 4(SP) + + lwzu r20, 4(SP) + lwzu r21, 4(SP) + lwzu r22, 4(SP) + lwzu r23, 4(SP) + + lwzu r24, 4(SP) + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f31, SP, r0 + lfpdux f30, SP, r0 + lfpdux f29, SP, r0 + lfpdux f28, SP, r0 + lfpdux f27, SP, r0 + lfpdux f26, SP, r0 + lfpdux f25, SP, r0 + lfpdux f24, SP, r0 + lfpdux f23, SP, r0 + lfpdux f22, SP, r0 + lfpdux f21, SP, r0 + lfpdux f20, SP, r0 + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + + + EPILOGUE +#endif diff --git a/kernel/power/trsm_kernel_power6_LN.S b/kernel/power/trsm_kernel_power6_LN.S new file mode 100644 index 0000000..60ba587 --- /dev/null +++ b/kernel/power/trsm_kernel_power6_LN.S @@ -0,0 +1,3688 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREC r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) +#endif + + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + li PREA, (16 * 3 * SIZE) + li PREC, -4 * SIZE + + lfs f0, FZERO + + srawi. J, N, 2 + ble LL(40) + .align 4 + +LL(10): + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + +LL(30): + andi. I, M, 1 + ble LL(20) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 5 + +LL(32): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f1, f17, f24, f1 + FMADD f5, f17, f25, f5 + FMADD f9, f17, f26, f9 + FMADD f13, f17, f27, f13 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f0, f18, f20, f0 + FMADD f4, f18, f21, f4 + FMADD f8, f18, f22, f8 + FMADD f12, f18, f23, f12 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f1, f19, f24, f1 + FMADD f5, f19, f25, f5 + FMADD f9, f19, f26, f9 + FMADD f13, f19, f27, f13 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 16 * SIZE + dcbtst AO, PREA + bdnz LL(32) + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(36) + .align 4 + +LL(38): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + LFD f24, 2 * SIZE(AO) + LFD f28, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f4, f20, f4 + FSUB f8, f24, f8 + FSUB f12, f28, f12 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f4, f17, f0, f4 + FNMSUB f8, f18, f0, f8 + FNMSUB f12, f19, f0, f12 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FNMSUB f8, f17, f4, f8 + FNMSUB f12, f18, f4, f12 + FMUL f8, f19, f8 + FNMSUB f12, f20, f8, f12 + FMUL f12, f21, f12 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FNMSUB f8, f17, f12, f8 + FNMSUB f4, f18, f12, f4 + FNMSUB f0, f19, f12, f0 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f0, f18, f8, f0 + + FMUL f4, f19, f4 + FNMSUB f0, f20, f4, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f4, 1 * SIZE(AO) + STFD f8, 2 * SIZE(AO) + STFD f12, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE + addi CO3, CO3, 1 * SIZE + addi CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(20): + andi. I, M, 2 + ble LL(09) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 5 + +LL(22): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 16 * SIZE + dcbtst AO, PREA + bdnz LL(22) + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(28) + .align 4 + +LL(26): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(28): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + FMUL f9, f17, f9 + FMUL f13, f17, f13 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FMUL f12, f21, f12 + FMUL f13, f21, f13 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(09): + srawi. I, M, 2 + ble LL(39) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +LL(12): + dcbt AO, PREA + dcbtst BO, PREA + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 4 * SIZE(AO) + LFD f28, 4 * SIZE(BO) + LFD f25, 5 * SIZE(AO) + LFD f29, 5 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 6 * SIZE(AO) + LFD f30, 6 * SIZE(BO) + LFD f27, 7 * SIZE(AO) + LFD f31, 7 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 8 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f17, 9 * SIZE(AO) + LFD f21, 9 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 10 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f19, 11 * SIZE(AO) + LFD f23, 11 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 12 * SIZE(AO) + LFD f28, 12 * SIZE(BO) + LFD f25, 13 * SIZE(AO) + LFD f29, 13 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 14 * SIZE(AO) + LFD f30, 14 * SIZE(BO) + LFD f27, 15 * SIZE(AO) + LFD f31, 15 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 16 * SIZE(AO) + LFD f20, 16 * SIZE(BO) + LFD f17, 17 * SIZE(AO) + LFD f21, 17 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 18 * SIZE(AO) + LFD f22, 18 * SIZE(BO) + LFD f19, 19 * SIZE(AO) + LFD f23, 19 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 20 * SIZE(AO) + LFD f28, 20 * SIZE(BO) + LFD f25, 21 * SIZE(AO) + LFD f29, 21 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 22 * SIZE(AO) + LFD f30, 22 * SIZE(BO) + LFD f27, 23 * SIZE(AO) + LFD f31, 23 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 24 * SIZE(AO) + LFD f20, 24 * SIZE(BO) + LFD f17, 25 * SIZE(AO) + LFD f21, 25 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 26 * SIZE(AO) + LFD f22, 26 * SIZE(BO) + LFD f19, 27 * SIZE(AO) + LFD f23, 27 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 28 * SIZE(AO) + LFD f28, 28 * SIZE(BO) + LFD f25, 29 * SIZE(AO) + LFD f29, 29 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 30 * SIZE(AO) + LFD f30, 30 * SIZE(BO) + LFD f27, 31 * SIZE(AO) + LFD f31, 31 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 32 * SIZE(AO) + LFD f20, 32 * SIZE(BO) + LFD f17, 33 * SIZE(AO) + LFD f21, 33 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 34 * SIZE(AO) + LFD f22, 34 * SIZE(BO) + LFD f19, 35 * SIZE(AO) + LFD f23, 35 * SIZE(BO) + + addi AO, AO, 32 * SIZE + addi BO, BO, 32 * SIZE + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ LL(18) + .align 4 + +LL(16): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(18): +#if defined(LN) || defined(RT) + subi r0, KK, 4 + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f10, f26, f10 + FSUB f14, f27, f14 + + FSUB f3, f28, f3 + FSUB f7, f29, f7 + FSUB f11, f30, f11 + FSUB f15, f31, f15 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FMUL f11, f16, f11 + FMUL f15, f16, f15 + + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f10, f17, f11, f10 + FNMSUB f14, f17, f15, f14 + + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f9, f18, f11, f9 + FNMSUB f13, f18, f15, f13 + + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + FNMSUB f8, f19, f11, f8 + FNMSUB f12, f19, f15, f12 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FMUL f10, f16, f10 + FMUL f14, f16, f14 + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f9, f17, f10, f9 + FNMSUB f13, f17, f14, f13 + + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + FNMSUB f8, f18, f10, f8 + FNMSUB f12, f18, f14, f12 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f10, f18, f8, f10 + FNMSUB f14, f18, f12, f14 + + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + FNMSUB f11, f19, f8, f11 + FNMSUB f15, f19, f12, f15 + + LFD f16, 5 * SIZE(AO) + LFD f17, 6 * SIZE(AO) + LFD f18, 7 * SIZE(AO) + LFD f19, 10 * SIZE(AO) + + FMUL f1, f16, f1 + FMUL f5, f16, f5 + FMUL f9, f16, f9 + FMUL f13, f16, f13 + + LFD f20, 11 * SIZE(AO) + LFD f21, 15 * SIZE(AO) + + FNMSUB f2, f17, f1, f2 + FNMSUB f6, f17, f5, f6 + FNMSUB f10, f17, f9, f10 + FNMSUB f14, f17, f13, f14 + + FNMSUB f3, f18, f1, f3 + FNMSUB f7, f18, f5, f7 + FNMSUB f11, f18, f9, f11 + FNMSUB f15, f18, f13, f15 + + FMUL f2, f19, f2 + FMUL f6, f19, f6 + FMUL f10, f19, f10 + FMUL f14, f19, f14 + + FNMSUB f3, f20, f2, f3 + FNMSUB f7, f20, f6, f7 + FNMSUB f11, f20, f10, f11 + FNMSUB f15, f20, f14, f15 + + FMUL f3, f21, f3 + FMUL f7, f21, f7 + FMUL f11, f21, f11 + FMUL f15, f21, f15 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + FNMSUB f14, f19, f2, f14 + FNMSUB f15, f19, f3, f15 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FMUL f6, f16, f6 + FMUL f7, f16, f7 + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f10, f17, f6, f10 + FNMSUB f11, f17, f7, f11 + + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + FNMSUB f14, f18, f6, f14 + FNMSUB f15, f18, f7, f15 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FMUL f10, f19, f10 + FMUL f11, f19, f11 + + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FNMSUB f14, f20, f10, f14 + FNMSUB f15, f20, f11, f15 + + FMUL f12, f21, f12 + FMUL f13, f21, f13 + FMUL f14, f21, f14 + FMUL f15, f21, f15 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FMUL f14, f16, f14 + FMUL f15, f16, f15 + + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f10, f17, f14, f10 + FNMSUB f11, f17, f15, f11 + + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f6, f18, f14, f6 + FNMSUB f7, f18, f15, f7 + + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + FNMSUB f2, f19, f14, f2 + FNMSUB f3, f19, f15, f3 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FMUL f10, f16, f10 + FMUL f11, f16, f11 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f6, f17, f10, f6 + FNMSUB f7, f17, f11, f7 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f6, 9 * SIZE(BO) + STFD f10, 10 * SIZE(BO) + STFD f14, 11 * SIZE(BO) + + STFD f3, 12 * SIZE(BO) + STFD f7, 13 * SIZE(BO) + STFD f11, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + + +LL(39): +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + lfs f0, FZERO + bgt LL(10) + .align 4 + +LL(40): + andi. J, N, 2 + ble LL(70) + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + +LL(60): + andi. I, M, 1 + ble LL(50) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(65) + .align 5 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f17, f22, f2 + FMADD f3, f17, f23, f3 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f19, f26, f2 + FMADD f3, f19, f27, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(68) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(66) + .align 4 + +LL(68): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FMUL f1, f18, f1 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(50): + andi. I, M, 2 + ble LL(41) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(55) + .align 5 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f22, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f17, f24, f1 + FMADD f2, f16, f25, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f19, f26, f5 + FMADD f6, f18, f27, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + dcbtst AO, PREA + bdnz LL(52) + .align 4 + +LL(55): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(58) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(56) + .align 4 + +LL(58): + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f2, f17, f2 + FSUB f1, f20, f1 + FSUB f3, f21, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f3, f19, f3 + + FNMSUB f0, f20, f1, f0 + FNMSUB f2, f20, f3, f2 + + FMUL f0, f21, f0 + FMUL f2, f21, f2 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f2, f16, f2 + FNMSUB f1, f17, f0, f1 + FNMSUB f3, f17, f2, f3 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f3, f17, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + + FNMSUB f2, f17, f0, f2 + FNMSUB f3, f17, f1, f3 + FMUL f2, f18, f2 + FMUL f3, f18, f3 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f2, f19, f2 + FMUL f3, f19, f3 + FNMSUB f0, f20, f2, f0 + FNMSUB f1, f20, f3, f1 + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f2, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(41): + srawi. I, M, 2 + ble LL(69) + .align 4 + +LL(42): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 5 + +LL(43): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbtst AO, PREA + bdnz LL(43) + .align 4 + +LL(45): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(48) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(48): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f28, 6 * SIZE(BO) + LFD f29, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f1, f20, f1 + FSUB f5, f21, f5 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f3, f28, f3 + FSUB f7, f29, f7 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FMUL f0, f21, f0 + FMUL f4, f21, f4 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + + FNMSUB f2, f18, f1, f2 + FNMSUB f6, f18, f5, f6 + + FNMSUB f3, f19, f1, f3 + FNMSUB f7, f19, f5, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FMUL f6, f18, f6 + + FNMSUB f3, f19, f2, f3 + FNMSUB f7, f19, f6, f7 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 + FMUL f7, f19, f7 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FMUL f4, f18, f4 + FMUL f5, f18, f5 + FMUL f6, f18, f6 + FMUL f7, f18, f7 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f6, 5 * SIZE(BO) + STFD f3, 6 * SIZE(BO) + STFD f7, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(42) + .align 4 + +LL(69): +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + lfs f0, FZERO + .align 4 + +LL(70): + andi. J, N, 1 + ble LL(999) + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO1, LDC +#endif + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(80) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(95) + .align 5 + +LL(92): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(92) + .align 4 + +LL(95): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ LL(98) + .align 4 + +LL(96): + FMADD f0, f16, f20, f0 + LFD f16, 1 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + addi BO, BO, 1 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(96) + .align 4 + +LL(98): + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + FSUB f0, f16, f0 +#else + LFD f16, 0 * SIZE(AO) + FSUB f0, f16, f0 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + FMUL f0, f16, f0 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + FMUL f0, f16, f0 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(71) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(85) + .align 5 + +LL(82): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + dcbtst AO, PREA + bdnz LL(82) + .align 4 + +LL(85): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(88) + .align 4 + +LL(86): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(86) + .align 4 + +LL(88): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + + LFD f17, 3 * SIZE(AO) + FMUL f1, f17, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(71): + srawi. I, M, 2 + ble LL(999) + .align 4 + +LL(72): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(75) + .align 5 + +LL(73): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f21, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f23, f0 + FMADD f1, f17, f23, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + dcbtst AO, PREA + bdnz LL(73) + .align 4 + +LL(75): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(78) + .align 4 + +LL(76): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(76) + .align 4 + +LL(78): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + LFD f24, 2 * SIZE(BO) + LFD f28, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 + FSUB f2, f24, f2 + FSUB f3, f28, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FNMSUB f2, f17, f3, f2 + FNMSUB f1, f18, f3, f1 + FNMSUB f0, f19, f3, f0 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FNMSUB f1, f17, f2, f1 + FNMSUB f0, f18, f2, f0 + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f19, f0, f3 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FNMSUB f2, f18, f1, f2 + FNMSUB f3, f19, f1, f3 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FNMSUB f3, f19, f2, f3 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(72) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/trsm_kernel_power6_LT.S b/kernel/power/trsm_kernel_power6_LT.S new file mode 100644 index 0000000..448b163 --- /dev/null +++ b/kernel/power/trsm_kernel_power6_LT.S @@ -0,0 +1,3676 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define BB r20 +#define KK r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO1 r26 +#define CO2 r27 +#define CO3 r28 +#define CO4 r29 + +#define PREA r30 +#define PREC r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) +#endif + + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + li PREA, (16 * 3 * SIZE) + li PREC, 4 * SIZE + lfs f0, FZERO + + srawi. J, N, 2 + ble LL(40) + .align 4 + +LL(10): + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 2 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + ble LL(20) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +LL(12): + dcbt AO, PREA + dcbtst BO, PREA + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 4 * SIZE(AO) + LFD f28, 4 * SIZE(BO) + LFD f25, 5 * SIZE(AO) + LFD f29, 5 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 6 * SIZE(AO) + LFD f30, 6 * SIZE(BO) + LFD f27, 7 * SIZE(AO) + LFD f31, 7 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 8 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f17, 9 * SIZE(AO) + LFD f21, 9 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 10 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f19, 11 * SIZE(AO) + LFD f23, 11 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 12 * SIZE(AO) + LFD f28, 12 * SIZE(BO) + LFD f25, 13 * SIZE(AO) + LFD f29, 13 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 14 * SIZE(AO) + LFD f30, 14 * SIZE(BO) + LFD f27, 15 * SIZE(AO) + LFD f31, 15 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 16 * SIZE(AO) + LFD f20, 16 * SIZE(BO) + LFD f17, 17 * SIZE(AO) + LFD f21, 17 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 18 * SIZE(AO) + LFD f22, 18 * SIZE(BO) + LFD f19, 19 * SIZE(AO) + LFD f23, 19 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 20 * SIZE(AO) + LFD f28, 20 * SIZE(BO) + LFD f25, 21 * SIZE(AO) + LFD f29, 21 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 22 * SIZE(AO) + LFD f30, 22 * SIZE(BO) + LFD f27, 23 * SIZE(AO) + LFD f31, 23 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 24 * SIZE(AO) + LFD f20, 24 * SIZE(BO) + LFD f17, 25 * SIZE(AO) + LFD f21, 25 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 26 * SIZE(AO) + LFD f22, 26 * SIZE(BO) + LFD f19, 27 * SIZE(AO) + LFD f23, 27 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 28 * SIZE(AO) + LFD f28, 28 * SIZE(BO) + LFD f25, 29 * SIZE(AO) + LFD f29, 29 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 30 * SIZE(AO) + LFD f30, 30 * SIZE(BO) + LFD f27, 31 * SIZE(AO) + LFD f31, 31 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 32 * SIZE(AO) + LFD f20, 32 * SIZE(BO) + LFD f17, 33 * SIZE(AO) + LFD f21, 33 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 34 * SIZE(AO) + LFD f22, 34 * SIZE(BO) + LFD f19, 35 * SIZE(AO) + LFD f23, 35 * SIZE(BO) + + addi AO, AO, 32 * SIZE + addi BO, BO, 32 * SIZE + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ LL(18) + .align 4 + +LL(16): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(18): +#if defined(LN) || defined(RT) + subi r0, KK, 4 + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f10, f26, f10 + FSUB f14, f27, f14 + + FSUB f3, f28, f3 + FSUB f7, f29, f7 + FSUB f11, f30, f11 + FSUB f15, f31, f15 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FMUL f11, f16, f11 + FMUL f15, f16, f15 + + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f10, f17, f11, f10 + FNMSUB f14, f17, f15, f14 + + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f9, f18, f11, f9 + FNMSUB f13, f18, f15, f13 + + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + FNMSUB f8, f19, f11, f8 + FNMSUB f12, f19, f15, f12 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FMUL f10, f16, f10 + FMUL f14, f16, f14 + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f9, f17, f10, f9 + FNMSUB f13, f17, f14, f13 + + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + FNMSUB f8, f18, f10, f8 + FNMSUB f12, f18, f14, f12 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f10, f18, f8, f10 + FNMSUB f14, f18, f12, f14 + + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + FNMSUB f11, f19, f8, f11 + FNMSUB f15, f19, f12, f15 + + LFD f16, 5 * SIZE(AO) + LFD f17, 6 * SIZE(AO) + LFD f18, 7 * SIZE(AO) + LFD f19, 10 * SIZE(AO) + + FMUL f1, f16, f1 + FMUL f5, f16, f5 + FMUL f9, f16, f9 + FMUL f13, f16, f13 + + LFD f20, 11 * SIZE(AO) + LFD f21, 15 * SIZE(AO) + + FNMSUB f2, f17, f1, f2 + FNMSUB f6, f17, f5, f6 + FNMSUB f10, f17, f9, f10 + FNMSUB f14, f17, f13, f14 + + FNMSUB f3, f18, f1, f3 + FNMSUB f7, f18, f5, f7 + FNMSUB f11, f18, f9, f11 + FNMSUB f15, f18, f13, f15 + + FMUL f2, f19, f2 + FMUL f6, f19, f6 + FMUL f10, f19, f10 + FMUL f14, f19, f14 + + FNMSUB f3, f20, f2, f3 + FNMSUB f7, f20, f6, f7 + FNMSUB f11, f20, f10, f11 + FNMSUB f15, f20, f14, f15 + + FMUL f3, f21, f3 + FMUL f7, f21, f7 + FMUL f11, f21, f11 + FMUL f15, f21, f15 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + FNMSUB f14, f19, f2, f14 + FNMSUB f15, f19, f3, f15 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FMUL f6, f16, f6 + FMUL f7, f16, f7 + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f10, f17, f6, f10 + FNMSUB f11, f17, f7, f11 + + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + FNMSUB f14, f18, f6, f14 + FNMSUB f15, f18, f7, f15 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FMUL f10, f19, f10 + FMUL f11, f19, f11 + + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FNMSUB f14, f20, f10, f14 + FNMSUB f15, f20, f11, f15 + + FMUL f12, f21, f12 + FMUL f13, f21, f13 + FMUL f14, f21, f14 + FMUL f15, f21, f15 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FMUL f14, f16, f14 + FMUL f15, f16, f15 + + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f10, f17, f14, f10 + FNMSUB f11, f17, f15, f11 + + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f6, f18, f14, f6 + FNMSUB f7, f18, f15, f7 + + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + FNMSUB f2, f19, f14, f2 + FNMSUB f3, f19, f15, f3 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FMUL f10, f16, f10 + FMUL f11, f16, f11 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f6, f17, f10, f6 + FNMSUB f7, f17, f11, f7 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f6, 9 * SIZE(BO) + STFD f10, 10 * SIZE(BO) + STFD f14, 11 * SIZE(BO) + + STFD f3, 12 * SIZE(BO) + STFD f7, 13 * SIZE(BO) + STFD f11, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. I, M, 2 + ble LL(30) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 5 + +LL(22): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 16 * SIZE + dcbtst AO, PREA + bdnz LL(22) + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(28) + .align 4 + +LL(26): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(28): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + FMUL f9, f17, f9 + FMUL f13, f17, f13 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FMUL f12, f21, f12 + FMUL f13, f21, f13 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(30): + andi. I, M, 1 + ble LL(39) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 5 + +LL(32): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f1, f17, f24, f1 + FMADD f5, f17, f25, f5 + FMADD f9, f17, f26, f9 + FMADD f13, f17, f27, f13 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f0, f18, f20, f0 + FMADD f4, f18, f21, f4 + FMADD f8, f18, f22, f8 + FMADD f12, f18, f23, f12 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f1, f19, f24, f1 + FMADD f5, f19, f25, f5 + FMADD f9, f19, f26, f9 + FMADD f13, f19, f27, f13 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 16 * SIZE + dcbtst AO, PREA + bdnz LL(32) + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(36) + .align 4 + +LL(38): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + LFD f24, 2 * SIZE(AO) + LFD f28, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f4, f20, f4 + FSUB f8, f24, f8 + FSUB f12, f28, f12 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f4, f17, f0, f4 + FNMSUB f8, f18, f0, f8 + FNMSUB f12, f19, f0, f12 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FNMSUB f8, f17, f4, f8 + FNMSUB f12, f18, f4, f12 + FMUL f8, f19, f8 + FNMSUB f12, f20, f8, f12 + FMUL f12, f21, f12 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FNMSUB f8, f17, f12, f8 + FNMSUB f4, f18, f12, f4 + FNMSUB f0, f19, f12, f0 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f0, f18, f8, f0 + + FMUL f4, f19, f4 + FNMSUB f0, f20, f4, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f4, 1 * SIZE(AO) + STFD f8, 2 * SIZE(AO) + STFD f12, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE + addi CO3, CO3, 1 * SIZE + addi CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(39): +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + lfs f0, FZERO + bgt LL(10) + .align 4 + +LL(40): + andi. J, N, 2 + ble LL(70) + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble LL(50) + .align 4 + +LL(41): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 5 + +LL(42): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbtst AO, PREA + bdnz LL(42) + .align 4 + +LL(45): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(48) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(48): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f28, 6 * SIZE(BO) + LFD f29, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f1, f20, f1 + FSUB f5, f21, f5 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f3, f28, f3 + FSUB f7, f29, f7 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FMUL f0, f21, f0 + FMUL f4, f21, f4 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + + FNMSUB f2, f18, f1, f2 + FNMSUB f6, f18, f5, f6 + + FNMSUB f3, f19, f1, f3 + FNMSUB f7, f19, f5, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FMUL f6, f18, f6 + + FNMSUB f3, f19, f2, f3 + FNMSUB f7, f19, f6, f7 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 + FMUL f7, f19, f7 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FMUL f4, f18, f4 + FMUL f5, f18, f5 + FMUL f6, f18, f6 + FMUL f7, f18, f7 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f6, 5 * SIZE(BO) + STFD f3, 6 * SIZE(BO) + STFD f7, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(41) + .align 4 + +LL(50): + andi. I, M, 2 + ble LL(60) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(55) + .align 5 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f22, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f17, f24, f1 + FMADD f2, f16, f25, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f19, f26, f5 + FMADD f6, f18, f27, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + dcbtst AO, PREA + bdnz LL(52) + .align 4 + +LL(55): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(58) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(56) + .align 4 + +LL(58): + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f2, f17, f2 + FSUB f1, f20, f1 + FSUB f3, f21, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f3, f19, f3 + + FNMSUB f0, f20, f1, f0 + FNMSUB f2, f20, f3, f2 + + FMUL f0, f21, f0 + FMUL f2, f21, f2 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f2, f16, f2 + FNMSUB f1, f17, f0, f1 + FNMSUB f3, f17, f2, f3 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f3, f17, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + + FNMSUB f2, f17, f0, f2 + FNMSUB f3, f17, f1, f3 + FMUL f2, f18, f2 + FMUL f3, f18, f3 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f2, f19, f2 + FMUL f3, f19, f3 + FNMSUB f0, f20, f2, f0 + FNMSUB f1, f20, f3, f1 + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f2, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(60): + andi. I, M, 1 + ble LL(69) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(65) + .align 5 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f17, f22, f2 + FMADD f3, f17, f23, f3 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f19, f26, f2 + FMADD f3, f19, f27, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(68) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(66) + .align 4 + +LL(68): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FMUL f1, f18, f1 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(69): +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + lfs f0, FZERO + .align 4 + +LL(70): + andi. J, N, 1 + ble LL(999) + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO1, LDC +#endif + ble LL(80) + .align 4 + +LL(71): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(75) + .align 5 + +LL(72): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f21, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f23, f0 + FMADD f1, f17, f23, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + dcbtst AO, PREA + bdnz LL(72) + .align 4 + +LL(75): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(78) + .align 4 + +LL(76): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(76) + .align 4 + +LL(78): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + LFD f24, 2 * SIZE(BO) + LFD f28, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 + FSUB f2, f24, f2 + FSUB f3, f28, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FNMSUB f2, f17, f3, f2 + FNMSUB f1, f18, f3, f1 + FNMSUB f0, f19, f3, f0 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FNMSUB f1, f17, f2, f1 + FNMSUB f0, f18, f2, f0 + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f19, f0, f3 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FNMSUB f2, f18, f1, f2 + FNMSUB f3, f19, f1, f3 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FNMSUB f3, f19, f2, f3 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(71) + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(90) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(85) + .align 5 + +LL(82): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + dcbtst AO, PREA + bdnz LL(82) + .align 4 + +LL(85): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(88) + .align 4 + +LL(86): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(86) + .align 4 + +LL(88): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + + LFD f17, 3 * SIZE(AO) + FMUL f1, f17, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(999) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(95) + .align 5 + +LL(92): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(92) + .align 4 + +LL(95): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ LL(98) + .align 4 + +LL(96): + FMADD f0, f16, f20, f0 + LFD f16, 1 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + addi BO, BO, 1 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(96) + .align 4 + +LL(98): + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + FSUB f0, f16, f0 +#else + LFD f16, 0 * SIZE(AO) + FSUB f0, f16, f0 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + FMUL f0, f16, f0 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + FMUL f0, f16, f0 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/trsm_kernel_power6_RT.S b/kernel/power/trsm_kernel_power6_RT.S new file mode 100644 index 0000000..1f36d17 --- /dev/null +++ b/kernel/power/trsm_kernel_power6_RT.S @@ -0,0 +1,3696 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREC r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) +#endif + + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + li PREA, (16 * 3 * SIZE) + li PREC, 4 * SIZE + lfs f0, FZERO + + andi. J, N, 1 + ble LL(40) + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO1, LDC +#endif + ble LL(80) + .align 4 + +LL(71): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbtst CO1, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbtst CO1, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(75) + .align 5 + +LL(72): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f21, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f23, f0 + FMADD f1, f17, f23, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + dcbtst AO, PREA + bdnz LL(72) + .align 4 + +LL(75): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(78) + .align 4 + +LL(76): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(76) + .align 4 + +LL(78): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + LFD f24, 2 * SIZE(BO) + LFD f28, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 + FSUB f2, f24, f2 + FSUB f3, f28, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FNMSUB f2, f17, f3, f2 + FNMSUB f1, f18, f3, f1 + FNMSUB f0, f19, f3, f0 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FNMSUB f1, f17, f2, f1 + FNMSUB f0, f18, f2, f0 + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f19, f0, f3 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FNMSUB f2, f18, f1, f2 + FNMSUB f3, f19, f1, f3 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FNMSUB f3, f19, f2, f3 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(71) + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(90) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(85) + .align 5 + +LL(82): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + dcbt AO, PREA + bdnz LL(82) + .align 4 + +LL(85): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(88) + .align 4 + +LL(86): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(86) + .align 4 + +LL(88): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + + LFD f17, 3 * SIZE(AO) + FMUL f1, f17, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(99) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(95) + .align 5 + +LL(92): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(92) + .align 4 + +LL(95): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ LL(98) + .align 4 + +LL(96): + FMADD f0, f16, f20, f0 + LFD f16, 1 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + addi BO, BO, 1 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(96) + .align 4 + +LL(98): + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + FSUB f0, f16, f0 +#else + LFD f16, 0 * SIZE(AO) + FSUB f0, f16, f0 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + FMUL f0, f16, f0 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + FMUL f0, f16, f0 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + + lfs f0, FZERO + +#ifndef LN + addi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(99): +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +LL(40): + andi. J, N, 2 + ble LL(09) + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble LL(50) + .align 4 + +LL(41): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbtst CO1, PREC + dcbtst CO2, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 5 + +LL(42): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbtst AO, PREA + bdnz LL(42) + .align 4 + +LL(45): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(48) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(48): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f28, 6 * SIZE(BO) + LFD f29, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f1, f20, f1 + FSUB f5, f21, f5 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f3, f28, f3 + FSUB f7, f29, f7 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FMUL f0, f21, f0 + FMUL f4, f21, f4 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + + FNMSUB f2, f18, f1, f2 + FNMSUB f6, f18, f5, f6 + + FNMSUB f3, f19, f1, f3 + FNMSUB f7, f19, f5, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FMUL f6, f18, f6 + + FNMSUB f3, f19, f2, f3 + FNMSUB f7, f19, f6, f7 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 + FMUL f7, f19, f7 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FMUL f4, f18, f4 + FMUL f5, f18, f5 + FMUL f6, f18, f6 + FMUL f7, f18, f7 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f6, 5 * SIZE(BO) + STFD f3, 6 * SIZE(BO) + STFD f7, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(41) + .align 4 + +LL(50): + andi. I, M, 2 + ble LL(60) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(55) + .align 5 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f22, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f17, f24, f1 + FMADD f2, f16, f25, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f19, f26, f5 + FMADD f6, f18, f27, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + dcbt AO, PREA + bdnz LL(52) + .align 4 + +LL(55): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(58) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(56) + .align 4 + +LL(58): + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f2, f17, f2 + FSUB f1, f20, f1 + FSUB f3, f21, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f3, f19, f3 + + FNMSUB f0, f20, f1, f0 + FNMSUB f2, f20, f3, f2 + + FMUL f0, f21, f0 + FMUL f2, f21, f2 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f2, f16, f2 + FNMSUB f1, f17, f0, f1 + FNMSUB f3, f17, f2, f3 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f3, f17, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + + FNMSUB f2, f17, f0, f2 + FNMSUB f3, f17, f1, f3 + FMUL f2, f18, f2 + FMUL f3, f18, f3 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f2, f19, f2 + FMUL f3, f19, f3 + FNMSUB f0, f20, f2, f0 + FNMSUB f1, f20, f3, f1 + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f2, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(60): + andi. I, M, 1 + ble LL(69) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(65) + .align 5 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f17, f22, f2 + FMADD f3, f17, f23, f3 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f19, f26, f2 + FMADD f3, f19, f27, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(68) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(66) + .align 4 + +LL(68): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FMUL f1, f18, f1 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(69): +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + lfs f0, FZERO + .align 4 + +LL(09): + srawi. J, N, 2 + ble LL(999) + .align 4 + +LL(10): + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 2 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + ble LL(20) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +LL(12): + dcbt AO, PREA + dcbtst BO, PREA + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 4 * SIZE(AO) + LFD f28, 4 * SIZE(BO) + LFD f25, 5 * SIZE(AO) + LFD f29, 5 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 6 * SIZE(AO) + LFD f30, 6 * SIZE(BO) + LFD f27, 7 * SIZE(AO) + LFD f31, 7 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 8 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f17, 9 * SIZE(AO) + LFD f21, 9 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 10 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f19, 11 * SIZE(AO) + LFD f23, 11 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 12 * SIZE(AO) + LFD f28, 12 * SIZE(BO) + LFD f25, 13 * SIZE(AO) + LFD f29, 13 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 14 * SIZE(AO) + LFD f30, 14 * SIZE(BO) + LFD f27, 15 * SIZE(AO) + LFD f31, 15 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 16 * SIZE(AO) + LFD f20, 16 * SIZE(BO) + LFD f17, 17 * SIZE(AO) + LFD f21, 17 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 18 * SIZE(AO) + LFD f22, 18 * SIZE(BO) + LFD f19, 19 * SIZE(AO) + LFD f23, 19 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 20 * SIZE(AO) + LFD f28, 20 * SIZE(BO) + LFD f25, 21 * SIZE(AO) + LFD f29, 21 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 22 * SIZE(AO) + LFD f30, 22 * SIZE(BO) + LFD f27, 23 * SIZE(AO) + LFD f31, 23 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 24 * SIZE(AO) + LFD f20, 24 * SIZE(BO) + LFD f17, 25 * SIZE(AO) + LFD f21, 25 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 26 * SIZE(AO) + LFD f22, 26 * SIZE(BO) + LFD f19, 27 * SIZE(AO) + LFD f23, 27 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 28 * SIZE(AO) + LFD f28, 28 * SIZE(BO) + LFD f25, 29 * SIZE(AO) + LFD f29, 29 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 30 * SIZE(AO) + LFD f30, 30 * SIZE(BO) + LFD f27, 31 * SIZE(AO) + LFD f31, 31 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 32 * SIZE(AO) + LFD f20, 32 * SIZE(BO) + LFD f17, 33 * SIZE(AO) + LFD f21, 33 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 34 * SIZE(AO) + LFD f22, 34 * SIZE(BO) + LFD f19, 35 * SIZE(AO) + LFD f23, 35 * SIZE(BO) + + addi AO, AO, 32 * SIZE + addi BO, BO, 32 * SIZE + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ LL(18) + .align 4 + +LL(16): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(18): +#if defined(LN) || defined(RT) + subi r0, KK, 4 + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f10, f26, f10 + FSUB f14, f27, f14 + + FSUB f3, f28, f3 + FSUB f7, f29, f7 + FSUB f11, f30, f11 + FSUB f15, f31, f15 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FMUL f11, f16, f11 + FMUL f15, f16, f15 + + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f10, f17, f11, f10 + FNMSUB f14, f17, f15, f14 + + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f9, f18, f11, f9 + FNMSUB f13, f18, f15, f13 + + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + FNMSUB f8, f19, f11, f8 + FNMSUB f12, f19, f15, f12 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FMUL f10, f16, f10 + FMUL f14, f16, f14 + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f9, f17, f10, f9 + FNMSUB f13, f17, f14, f13 + + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + FNMSUB f8, f18, f10, f8 + FNMSUB f12, f18, f14, f12 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f10, f18, f8, f10 + FNMSUB f14, f18, f12, f14 + + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + FNMSUB f11, f19, f8, f11 + FNMSUB f15, f19, f12, f15 + + LFD f16, 5 * SIZE(AO) + LFD f17, 6 * SIZE(AO) + LFD f18, 7 * SIZE(AO) + LFD f19, 10 * SIZE(AO) + + FMUL f1, f16, f1 + FMUL f5, f16, f5 + FMUL f9, f16, f9 + FMUL f13, f16, f13 + + LFD f20, 11 * SIZE(AO) + LFD f21, 15 * SIZE(AO) + + FNMSUB f2, f17, f1, f2 + FNMSUB f6, f17, f5, f6 + FNMSUB f10, f17, f9, f10 + FNMSUB f14, f17, f13, f14 + + FNMSUB f3, f18, f1, f3 + FNMSUB f7, f18, f5, f7 + FNMSUB f11, f18, f9, f11 + FNMSUB f15, f18, f13, f15 + + FMUL f2, f19, f2 + FMUL f6, f19, f6 + FMUL f10, f19, f10 + FMUL f14, f19, f14 + + FNMSUB f3, f20, f2, f3 + FNMSUB f7, f20, f6, f7 + FNMSUB f11, f20, f10, f11 + FNMSUB f15, f20, f14, f15 + + FMUL f3, f21, f3 + FMUL f7, f21, f7 + FMUL f11, f21, f11 + FMUL f15, f21, f15 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + FNMSUB f14, f19, f2, f14 + FNMSUB f15, f19, f3, f15 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FMUL f6, f16, f6 + FMUL f7, f16, f7 + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f10, f17, f6, f10 + FNMSUB f11, f17, f7, f11 + + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + FNMSUB f14, f18, f6, f14 + FNMSUB f15, f18, f7, f15 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FMUL f10, f19, f10 + FMUL f11, f19, f11 + + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FNMSUB f14, f20, f10, f14 + FNMSUB f15, f20, f11, f15 + + FMUL f12, f21, f12 + FMUL f13, f21, f13 + FMUL f14, f21, f14 + FMUL f15, f21, f15 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FMUL f14, f16, f14 + FMUL f15, f16, f15 + + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f10, f17, f14, f10 + FNMSUB f11, f17, f15, f11 + + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f6, f18, f14, f6 + FNMSUB f7, f18, f15, f7 + + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + FNMSUB f2, f19, f14, f2 + FNMSUB f3, f19, f15, f3 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FMUL f10, f16, f10 + FMUL f11, f16, f11 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f6, f17, f10, f6 + FNMSUB f7, f17, f11, f7 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f6, 9 * SIZE(BO) + STFD f10, 10 * SIZE(BO) + STFD f14, 11 * SIZE(BO) + + STFD f3, 12 * SIZE(BO) + STFD f7, 13 * SIZE(BO) + STFD f11, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. I, M, 2 + ble LL(30) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 5 + +LL(22): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 16 * SIZE + dcbtst AO, PREA + bdnz LL(22) + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(28) + .align 4 + +LL(26): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(28): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + FMUL f9, f17, f9 + FMUL f13, f17, f13 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FMUL f12, f21, f12 + FMUL f13, f21, f13 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(30): + andi. I, M, 1 + ble LL(39) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 5 + +LL(32): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f1, f17, f24, f1 + FMADD f5, f17, f25, f5 + FMADD f9, f17, f26, f9 + FMADD f13, f17, f27, f13 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f0, f18, f20, f0 + FMADD f4, f18, f21, f4 + FMADD f8, f18, f22, f8 + FMADD f12, f18, f23, f12 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f1, f19, f24, f1 + FMADD f5, f19, f25, f5 + FMADD f9, f19, f26, f9 + FMADD f13, f19, f27, f13 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 16 * SIZE + dcbtst AO, PREA + bdnz LL(32) + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(36) + .align 4 + +LL(38): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + LFD f24, 2 * SIZE(AO) + LFD f28, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f4, f20, f4 + FSUB f8, f24, f8 + FSUB f12, f28, f12 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f4, f17, f0, f4 + FNMSUB f8, f18, f0, f8 + FNMSUB f12, f19, f0, f12 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FNMSUB f8, f17, f4, f8 + FNMSUB f12, f18, f4, f12 + FMUL f8, f19, f8 + FNMSUB f12, f20, f8, f12 + FMUL f12, f21, f12 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FNMSUB f8, f17, f12, f8 + FNMSUB f4, f18, f12, f4 + FNMSUB f0, f19, f12, f0 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f0, f18, f8, f0 + + FMUL f4, f19, f4 + FNMSUB f0, f20, f4, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f4, 1 * SIZE(AO) + STFD f8, 2 * SIZE(AO) + STFD f12, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE + addi CO3, CO3, 1 * SIZE + addi CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(39): +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + lfs f0, FZERO + bgt LL(10) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/trsm_kernel_ppc440_LN.S b/kernel/power/trsm_kernel_ppc440_LN.S new file mode 100644 index 0000000..43354c6 --- /dev/null +++ b/kernel/power/trsm_kernel_ppc440_LN.S @@ -0,0 +1,3487 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define B1 f22 +#define B2 f23 +#define B3 f24 +#define B4 f25 +#define B5 f26 +#define B6 f27 +#define B7 f28 +#define B8 f29 +#define B9 f30 +#define B10 f31 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) +#endif + + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + lfs f0, FZERO + + srawi. J, N, 2 + ble .L40 + .align 4 + +.L10: + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + +.L30: + andi. I, M, 1 + ble .L20 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L35 + .align 5 + +.L32: + FMADD f0, f16, f20, f0 + LFD f20, 8 * SIZE(BO) + FMADD f4, f16, f21, f4 + LFD f21, 9 * SIZE(BO) + FMADD f8, f16, f22, f8 + LFD f22, 10 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f23, 11 * SIZE(BO) + LFDU f16, 4 * SIZE(AO) + + FMADD f1, f17, f24, f1 + LFD f24, 12 * SIZE(BO) + FMADD f5, f17, f25, f5 + LFD f25, 13 * SIZE(BO) + FMADD f9, f17, f26, f9 + LFD f26, 14 * SIZE(BO) + FMADD f13, f17, f27, f13 + LFD f27, 15 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + + FMADD f0, f18, f20, f0 + LFDU f20, 16 * SIZE(BO) + FMADD f4, f18, f21, f4 + LFD f21, 1 * SIZE(BO) + FMADD f8, f18, f22, f8 + LFD f22, 2 * SIZE(BO) + FMADD f12, f18, f23, f12 + LFD f23, 3 * SIZE(BO) + LFD f18, 2 * SIZE(AO) + + FMADD f1, f19, f24, f1 + LFD f24, 4 * SIZE(BO) + FMADD f5, f19, f25, f5 + LFD f25, 5 * SIZE(BO) + FMADD f9, f19, f26, f9 + LFD f26, 6 * SIZE(BO) + FMADD f13, f19, f27, f13 + LFD f27, 7 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L32 + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +.L35: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L38 + .align 4 + +.L36: + FMADD f0, f16, f20, f0 + LFDU f20, 4 * SIZE(BO) + FMADD f4, f16, f21, f4 + LFD f21, 1 * SIZE(BO) + FMADD f8, f16, f22, f8 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L36 + .align 4 + +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + LFD f24, 2 * SIZE(AO) + LFD f28, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f4, f20, f4 + FSUB f8, f24, f8 + FSUB f12, f28, f12 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f4, f17, f0, f4 + FNMSUB f8, f18, f0, f8 + FNMSUB f12, f19, f0, f12 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FNMSUB f8, f17, f4, f8 + FNMSUB f12, f18, f4, f12 + FMUL f8, f19, f8 + FNMSUB f12, f20, f8, f12 + FMUL f12, f21, f12 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FNMSUB f8, f17, f12, f8 + FNMSUB f4, f18, f12, f4 + FNMSUB f0, f19, f12, f0 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f0, f18, f8, f0 + + FMUL f4, f19, f4 + FNMSUB f0, f20, f4, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f4, 1 * SIZE(AO) + STFD f8, 2 * SIZE(AO) + STFD f12, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE + addi CO3, CO3, 1 * SIZE + addi CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +.L20: + andi. I, M, 2 + ble .L09 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L25 + .align 5 + +.L22: + FMADD f0, f16, f20, f0 + nop + FMADD f1, f17, f20, f1 + LFD f20, 8 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 9 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 10 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f16, 4 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + LFD f17, 5 * SIZE(AO) + FMADD f3, f19, f24, f3 + LFD f24, 12 * SIZE(BO) + FMADD f6, f18, f25, f6 + nop + FMADD f7, f19, f25, f7 + LFD f25, 13 * SIZE(BO) + + FMADD f10, f18, f26, f10 + nop + FMADD f11, f19, f26, f11 + LFD f26, 14 * SIZE(BO) + FMADD f14, f18, f27, f14 + LFD f18, 6 * SIZE(AO) + FMADD f15, f19, f27, f15 + LFD f27, 15 * SIZE(BO) + + FMADD f0, f16, f20, f0 + LFD f19, 7 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 16 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 8 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f23, 3 * SIZE(BO) + + FMADD f2, f18, f24, f2 + LFD f17, 1 * SIZE(AO) + FMADD f3, f19, f24, f3 + LFD f24, 4 * SIZE(BO) + FMADD f6, f18, f25, f6 + nop + FMADD f7, f19, f25, f7 + LFD f25, 5 * SIZE(BO) + + FMADD f10, f18, f26, f10 + nop + FMADD f11, f19, f26, f11 + LFD f26, 6 * SIZE(BO) + FMADD f14, f18, f27, f14 + LFD f18, 2 * SIZE(AO) + FMADD f15, f19, f27, f15 + LFD f19, 3 * SIZE(AO) + LFD f27, 7 * SIZE(BO) + bdnz .L22 + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +.L25: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L28 + .align 4 + +.L26: + FMADD f0, f16, f20, f0 + nop + FMADD f1, f17, f20, f1 + LFDU f20, 4 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 2 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f17, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L26 + .align 4 + +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + FMUL f9, f17, f9 + FMUL f13, f17, f13 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FMUL f12, f21, f12 + FMUL f13, f21, f13 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +.L09: + srawi. I, M, 2 + ble .L39 + .align 4 + +.L11: +#if defined(LT) || defined(RN) + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(B) + LFD B2, 1 * SIZE(B) + LFD B3, 2 * SIZE(B) + LFD B4, 3 * SIZE(B) + LFD B5, 4 * SIZE(B) + LFD B6, 8 * SIZE(B) + LFD B7, 12 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(BO) + LFD B2, 1 * SIZE(BO) + LFD B3, 2 * SIZE(BO) + LFD B4, 3 * SIZE(BO) + LFD B5, 4 * SIZE(BO) + LFD B6, 8 * SIZE(BO) + LFD B7, 12 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L15 + .align 4 + +.L12: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + LFD A6, 12 * SIZE(AO) + FMADD f8, A1, B3, f8 + nop + FMADD f12, A1, B4, f12 + nop + + FMADD f1, A2, B1, f1 + LFD A1, 3 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B1, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 5 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 6 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10, 7 * SIZE(BO) + + FMADD f3, A1, B1, f3 + LFD A2, 5 * SIZE(AO) + FMADD f7, A1, B2, f7 + LFD B1, 16 * SIZE(BO) + FMADD f11, A1, B3, f11 + nop + FMADD f15, A1, B4, f15 + nop + + FMADD f0, A4, B5, f0 + LFD A3, 6 * SIZE(AO) + FMADD f4, A4, B8, f4 + LFD A1, 16 * SIZE(AO) + FMADD f8, A4, B9, f8 + nop + FMADD f12, A4, B10, f12 + nop + + FMADD f1, A2, B5, f1 + LFD A4, 7 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B5, f2 + nop + FMADD f6, A3, B8, f6 + LFD B2, 9 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 10 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 11 * SIZE(BO) + + FMADD f3, A4, B5, f3 + LFD A2, 9 * SIZE(AO) + FMADD f7, A4, B8, f7 + LFD B5, 20 * SIZE(BO) + FMADD f11, A4, B9, f11 + nop + FMADD f15, A4, B10, f15 + nop + + FMADD f0, A5, B6, f0 + LFD A3, 10 * SIZE(AO) + FMADD f4, A5, B2, f4 + LFD A4, 20 * SIZE(AO) + FMADD f8, A5, B3, f8 + nop + FMADD f12, A5, B4, f12 + nop + + FMADD f1, A2, B6, f1 + LFD A5, 11 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B6, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 13 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 14 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10,15 * SIZE(BO) + + FMADD f3, A5, B6, f3 + LFD A2, 13 * SIZE(AO) + FMADD f7, A5, B2, f7 + LFD B6, 24 * SIZE(BO) + FMADD f11, A5, B3, f11 + nop + FMADD f15, A5, B4, f15 + nop + + + FMADD f0, A6, B7, f0 + LFD A3, 14 * SIZE(AO) + FMADD f4, A6, B8, f4 + LFD A5, 24 * SIZE(AO) + FMADD f8, A6, B9, f8 + nop + FMADD f12, A6, B10, f12 + nop + + FMADD f1, A2, B7, f1 + LFD A6, 15 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B7, f2 + addi AO, AO, 16 * SIZE + FMADD f6, A3, B8, f6 + LFD B2, 17 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 18 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 19 * SIZE(BO) + + FMADD f3, A6, B7, f3 + LFD A2, 1 * SIZE(AO) + FMADD f7, A6, B8, f7 + LFD B7, 28 * SIZE(BO) + FMADD f11, A6, B9, f11 + addi BO, BO, 16 * SIZE + FMADD f15, A6, B10, f15 + bdnz .L12 + .align 4 + +.L15: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L18 + .align 4 + +.L16: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + FMADD f8, A1, B3, f8 + FMADD f12, A1, B4, f12 + LFD A4, 3 * SIZE(AO) + + FMADD f1, A2, B1, f1 + FMADD f5, A2, B2, f5 + FMADD f9, A2, B3, f9 + FMADD f13, A2, B4, f13 + LFDU A1, 4 * SIZE(AO) + + FMADD f2, A3, B1, f2 + FMADD f6, A3, B2, f6 + FMADD f10, A3, B3, f10 + FMADD f14, A3, B4, f14 + LFD A2, 1 * SIZE(AO) + + FMADD f3, A4, B1, f3 + LFDU B1, 4 * SIZE(BO) + FMADD f7, A4, B2, f7 + LFD B2, 1 * SIZE(BO) + FMADD f11, A4, B3, f11 + LFD B3, 2 * SIZE(BO) + FMADD f15, A4, B4, f15 + LFD B4, 3 * SIZE(BO) + bdnz .L16 + .align 4 + +.L18: +#if defined(LN) || defined(RT) + subi r0, KK, 4 + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f10, f26, f10 + FSUB f14, f27, f14 + + FSUB f3, f28, f3 + FSUB f7, f29, f7 + FSUB f11, f30, f11 + FSUB f15, f31, f15 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FMUL f11, f16, f11 + FMUL f15, f16, f15 + + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f10, f17, f11, f10 + FNMSUB f14, f17, f15, f14 + + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f9, f18, f11, f9 + FNMSUB f13, f18, f15, f13 + + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + FNMSUB f8, f19, f11, f8 + FNMSUB f12, f19, f15, f12 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FMUL f10, f16, f10 + FMUL f14, f16, f14 + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f9, f17, f10, f9 + FNMSUB f13, f17, f14, f13 + + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + FNMSUB f8, f18, f10, f8 + FNMSUB f12, f18, f14, f12 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f10, f18, f8, f10 + FNMSUB f14, f18, f12, f14 + + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + FNMSUB f11, f19, f8, f11 + FNMSUB f15, f19, f12, f15 + + LFD f16, 5 * SIZE(AO) + LFD f17, 6 * SIZE(AO) + LFD f18, 7 * SIZE(AO) + LFD f19, 10 * SIZE(AO) + + FMUL f1, f16, f1 + FMUL f5, f16, f5 + FMUL f9, f16, f9 + FMUL f13, f16, f13 + + LFD f20, 11 * SIZE(AO) + LFD f21, 15 * SIZE(AO) + + FNMSUB f2, f17, f1, f2 + FNMSUB f6, f17, f5, f6 + FNMSUB f10, f17, f9, f10 + FNMSUB f14, f17, f13, f14 + + FNMSUB f3, f18, f1, f3 + FNMSUB f7, f18, f5, f7 + FNMSUB f11, f18, f9, f11 + FNMSUB f15, f18, f13, f15 + + FMUL f2, f19, f2 + FMUL f6, f19, f6 + FMUL f10, f19, f10 + FMUL f14, f19, f14 + + FNMSUB f3, f20, f2, f3 + FNMSUB f7, f20, f6, f7 + FNMSUB f11, f20, f10, f11 + FNMSUB f15, f20, f14, f15 + + FMUL f3, f21, f3 + FMUL f7, f21, f7 + FMUL f11, f21, f11 + FMUL f15, f21, f15 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + FNMSUB f14, f19, f2, f14 + FNMSUB f15, f19, f3, f15 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FMUL f6, f16, f6 + FMUL f7, f16, f7 + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f10, f17, f6, f10 + FNMSUB f11, f17, f7, f11 + + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + FNMSUB f14, f18, f6, f14 + FNMSUB f15, f18, f7, f15 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FMUL f10, f19, f10 + FMUL f11, f19, f11 + + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FNMSUB f14, f20, f10, f14 + FNMSUB f15, f20, f11, f15 + + FMUL f12, f21, f12 + FMUL f13, f21, f13 + FMUL f14, f21, f14 + FMUL f15, f21, f15 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FMUL f14, f16, f14 + FMUL f15, f16, f15 + + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f10, f17, f14, f10 + FNMSUB f11, f17, f15, f11 + + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f6, f18, f14, f6 + FNMSUB f7, f18, f15, f7 + + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + FNMSUB f2, f19, f14, f2 + FNMSUB f3, f19, f15, f3 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FMUL f10, f16, f10 + FMUL f11, f16, f11 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f6, f17, f10, f6 + FNMSUB f7, f17, f11, f7 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f6, 9 * SIZE(BO) + STFD f10, 10 * SIZE(BO) + STFD f14, 11 * SIZE(BO) + + STFD f3, 12 * SIZE(BO) + STFD f7, 13 * SIZE(BO) + STFD f11, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ .L11 + .align 4 + + +.L39: +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + lfs f0, FZERO + bgt .L10 + .align 4 + +.L40: + andi. J, N, 2 + ble .L70 + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + +.L60: + andi. I, M, 1 + ble .L50 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L65 + .align 5 + +.L62: + FMADD f0, f16, f20, f0 + LFDU f20, 8 * SIZE(BO) + FMADD f1, f16, f21, f1 + LFDU f16, 4 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + FMADD f2, f17, f22, f2 + LFD f22, 2 * SIZE(BO) + FMADD f3, f17, f23, f3 + LFD f17, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + + FMADD f0, f18, f24, f0 + LFD f24, 4 * SIZE(BO) + FMADD f1, f18, f25, f1 + LFD f18, 2 * SIZE(AO) + LFD f25, 5 * SIZE(BO) + FMADD f2, f19, f26, f2 + LFD f26, 6 * SIZE(BO) + FMADD f3, f19, f27, f3 + LFD f19, 3 * SIZE(AO) + LFD f27, 7 * SIZE(BO) + bdnz .L62 + .align 4 + +.L65: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L68 + .align 4 + +.L66: + FMADD f0, f16, f20, f0 + LFDU f20, 2 * SIZE(BO) + FMADD f1, f16, f21, f1 + LFDU f16, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L66 + .align 4 + +.L68: + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FMUL f1, f18, f1 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +.L50: + andi. I, M, 2 + ble .L41 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L55 + .align 5 + +.L52: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + LFDU f20, 8 * SIZE(BO) + FMADD f2, f16, f21, f2 + LFD f16, 4 * SIZE(AO) + FMADD f3, f17, f21, f3 + LFD f17, 5 * SIZE(AO) + + FMADD f4, f18, f22, f4 + LFD f21, 1 * SIZE(BO) + FMADD f5, f19, f22, f5 + LFD f22, 2 * SIZE(BO) + FMADD f6, f18, f23, f6 + LFD f18, 6 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f24, f0 + LFD f23, 3 * SIZE(BO) + FMADD f1, f17, f24, f1 + LFD f24, 4 * SIZE(BO) + FMADD f2, f16, f25, f2 + LFDU f16, 8 * SIZE(AO) + FMADD f3, f17, f25, f3 + LFD f17, 1 * SIZE(AO) + + FMADD f4, f18, f26, f4 + LFD f25, 5 * SIZE(BO) + FMADD f5, f19, f26, f5 + LFD f26, 6 * SIZE(BO) + FMADD f6, f18, f27, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f27, f7 + LFD f19, 3 * SIZE(AO) + + LFD f27, 7 * SIZE(BO) + bdnz .L52 + .align 4 + +.L55: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L58 + .align 4 + +.L56: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + LFDU f20, 2 * SIZE(BO) + FMADD f2, f16, f21, f2 + LFDU f16, 2 * SIZE(AO) + FMADD f3, f17, f21, f3 + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L56 + .align 4 + +.L58: + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f2, f17, f2 + FSUB f1, f20, f1 + FSUB f3, f21, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f3, f19, f3 + + FNMSUB f0, f20, f1, f0 + FNMSUB f2, f20, f3, f2 + + FMUL f0, f21, f0 + FMUL f2, f21, f2 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f2, f16, f2 + FNMSUB f1, f17, f0, f1 + FNMSUB f3, f17, f2, f3 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f3, f17, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + + FNMSUB f2, f17, f0, f2 + FNMSUB f3, f17, f1, f3 + FMUL f2, f18, f2 + FMUL f3, f18, f3 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f2, f19, f2 + FMUL f3, f19, f3 + FNMSUB f0, f20, f2, f0 + FNMSUB f1, f20, f3, f1 + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f2, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +.L41: + srawi. I, M, 2 + ble .L69 + .align 4 + +.L42: +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L45 + .align 5 + +.L43: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFD f20, 4 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFD f16, 4 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 5 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 6 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFD f21, 5 * SIZE(BO) + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + LFD f22, 6 * SIZE(BO) + + FMADD f4, f16, f23, f4 + LFD f16, 8 * SIZE(AO) + FMADD f5, f17, f23, f5 + LFD f17, 9 * SIZE(AO) + FMADD f6, f18, f23, f6 + LFD f18, 10 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f20, f0 + LFD f23, 7 * SIZE(BO) + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFDU f20, 8 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFD f16, 12 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 13 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 14 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFD f21, 1 * SIZE(BO) + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + LFD f22, 2 * SIZE(BO) + + FMADD f4, f16, f23, f4 + LFDU f16, 16 * SIZE(AO) + FMADD f5, f17, f23, f5 + LFD f17, 1 * SIZE(AO) + FMADD f6, f18, f23, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 3 * SIZE(AO) + + LFD f23, 3 * SIZE(BO) + bdnz .L43 + .align 4 + +.L45: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L48 + .align 4 + +.L46: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFDU f20, 2 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFDU f16, 4 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 1 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 3 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L46 + .align 4 + +.L48: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f28, 6 * SIZE(BO) + LFD f29, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f1, f20, f1 + FSUB f5, f21, f5 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f3, f28, f3 + FSUB f7, f29, f7 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FMUL f0, f21, f0 + FMUL f4, f21, f4 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + + FNMSUB f2, f18, f1, f2 + FNMSUB f6, f18, f5, f6 + + FNMSUB f3, f19, f1, f3 + FNMSUB f7, f19, f5, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FMUL f6, f18, f6 + + FNMSUB f3, f19, f2, f3 + FNMSUB f7, f19, f6, f7 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 + FMUL f7, f19, f7 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FMUL f4, f18, f4 + FMUL f5, f18, f5 + FMUL f6, f18, f6 + FMUL f7, f18, f7 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f6, 5 * SIZE(BO) + STFD f3, 6 * SIZE(BO) + STFD f7, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ .L42 + .align 4 + +.L69: +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + lfs f0, FZERO + .align 4 + +.L70: + andi. J, N, 1 + ble .L999 + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO1, LDC +#endif + .align 4 + +.L90: + andi. I, M, 1 + ble .L80 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble .L95 + .align 5 + +.L92: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + FMADD f1, f17, f21, f1 + LFD f17, 5 * SIZE(AO) + LFD f21, 5 * SIZE(BO) + FMADD f2, f18, f22, f2 + LFD f18, 6 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + FMADD f3, f19, f23, f3 + LFD f19, 7 * SIZE(AO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + LFDU f16, 8 * SIZE(AO) + LFDU f20, 8 * SIZE(BO) + FMADD f1, f17, f21, f1 + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + FMADD f2, f18, f22, f2 + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + FMADD f3, f19, f23, f3 + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L92 + .align 4 + +.L95: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ .L98 + .align 4 + +.L96: + FMADD f0, f16, f20, f0 + LFDU f16, 1 * SIZE(AO) + LFDU f20, 1 * SIZE(BO) + bdnz .L96 + .align 4 + +.L98: + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + FSUB f0, f16, f0 +#else + LFD f16, 0 * SIZE(AO) + FSUB f0, f16, f0 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + FMUL f0, f16, f0 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + FMUL f0, f16, f0 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +.L80: + andi. I, M, 2 + ble .L71 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L85 + .align 5 + +.L82: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 4 * SIZE(BO) + LFD f17, 5 * SIZE(AO) + FMADD f2, f18, f21, f2 + LFD f18, 6 * SIZE(AO) + FMADD f3, f19, f21, f3 + LFD f21, 1 * SIZE(BO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFDU f16, 8 * SIZE(AO) + FMADD f1, f17, f22, f1 + LFD f22, 2 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f23, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f23, f3 + LFD f23, 3 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L82 + .align 4 + +.L85: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L88 + .align 4 + +.L86: + FMADD f0, f16, f20, f0 + LFDU f16, 2 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 1 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + bdnz .L86 + .align 4 + +.L88: + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + + LFD f17, 3 * SIZE(AO) + FMUL f1, f17, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +.L71: + srawi. I, M, 2 + ble .L999 + .align 4 + +.L72: +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L75 + .align 5 + +.L73: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFD f17, 5 * SIZE(AO) + FMADD f2, f18, f20, f2 + LFD f18, 6 * SIZE(AO) + FMADD f3, f19, f20, f3 + LFD f19, 7 * SIZE(AO) + LFDU f20, 4 * SIZE(BO) + + FMADD f0, f16, f21, f0 + LFD f16, 8 * SIZE(AO) + FMADD f1, f17, f21, f1 + LFD f17, 9 * SIZE(AO) + FMADD f2, f18, f21, f2 + LFD f18, 10 * SIZE(AO) + FMADD f3, f19, f21, f3 + LFD f19, 11 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + + FMADD f0, f16, f22, f0 + LFD f16, 12 * SIZE(AO) + FMADD f1, f17, f22, f1 + LFD f17, 13 * SIZE(AO) + FMADD f2, f18, f22, f2 + LFD f18, 14 * SIZE(AO) + FMADD f3, f19, f22, f3 + LFD f19, 15 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + + FMADD f0, f16, f23, f0 + LFDU f16, 16 * SIZE(AO) + FMADD f1, f17, f23, f1 + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f23, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f23, f3 + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L73 + .align 4 + +.L75: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L78 + .align 4 + +.L76: + FMADD f0, f16, f20, f0 + LFDU f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f20, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f20, f3 + LFDU f20, 1 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L76 + .align 4 + +.L78: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + LFD f24, 2 * SIZE(BO) + LFD f28, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 + FSUB f2, f24, f2 + FSUB f3, f28, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FNMSUB f2, f17, f3, f2 + FNMSUB f1, f18, f3, f1 + FNMSUB f0, f19, f3, f0 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FNMSUB f1, f17, f2, f1 + FNMSUB f0, f18, f2, f0 + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f19, f0, f3 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FNMSUB f2, f18, f1, f2 + FNMSUB f3, f19, f1, f3 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FNMSUB f3, f19, f2, f3 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ .L72 + .align 4 + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE diff --git a/kernel/power/trsm_kernel_ppc440_LT.S b/kernel/power/trsm_kernel_ppc440_LT.S new file mode 100644 index 0000000..eb0d4e4 --- /dev/null +++ b/kernel/power/trsm_kernel_ppc440_LT.S @@ -0,0 +1,3477 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) +#endif + + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define B1 f22 +#define B2 f23 +#define B3 f24 +#define B4 f25 +#define B5 f26 +#define B6 f27 +#define B7 f28 +#define B8 f29 +#define B9 f30 +#define B10 f31 + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + lfs f0, FZERO + + srawi. J, N, 2 + ble .L40 + .align 4 + +.L10: + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 2 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + ble .L20 + .align 4 + +.L11: +#if defined(LT) || defined(RN) + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(B) + LFD B2, 1 * SIZE(B) + LFD B3, 2 * SIZE(B) + LFD B4, 3 * SIZE(B) + LFD B5, 4 * SIZE(B) + LFD B6, 8 * SIZE(B) + LFD B7, 12 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(BO) + LFD B2, 1 * SIZE(BO) + LFD B3, 2 * SIZE(BO) + LFD B4, 3 * SIZE(BO) + LFD B5, 4 * SIZE(BO) + LFD B6, 8 * SIZE(BO) + LFD B7, 12 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L15 + .align 4 + +.L12: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + LFD A6, 12 * SIZE(AO) + FMADD f8, A1, B3, f8 + nop + FMADD f12, A1, B4, f12 + nop + + FMADD f1, A2, B1, f1 + LFD A1, 3 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B1, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 5 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 6 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10, 7 * SIZE(BO) + + FMADD f3, A1, B1, f3 + LFD A2, 5 * SIZE(AO) + FMADD f7, A1, B2, f7 + LFD B1, 16 * SIZE(BO) + FMADD f11, A1, B3, f11 + nop + FMADD f15, A1, B4, f15 + nop + + FMADD f0, A4, B5, f0 + LFD A3, 6 * SIZE(AO) + FMADD f4, A4, B8, f4 + LFD A1, 16 * SIZE(AO) + FMADD f8, A4, B9, f8 + nop + FMADD f12, A4, B10, f12 + nop + + FMADD f1, A2, B5, f1 + LFD A4, 7 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B5, f2 + nop + FMADD f6, A3, B8, f6 + LFD B2, 9 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 10 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 11 * SIZE(BO) + + FMADD f3, A4, B5, f3 + LFD A2, 9 * SIZE(AO) + FMADD f7, A4, B8, f7 + LFD B5, 20 * SIZE(BO) + FMADD f11, A4, B9, f11 + nop + FMADD f15, A4, B10, f15 + nop + + FMADD f0, A5, B6, f0 + LFD A3, 10 * SIZE(AO) + FMADD f4, A5, B2, f4 + LFD A4, 20 * SIZE(AO) + FMADD f8, A5, B3, f8 + nop + FMADD f12, A5, B4, f12 + nop + + FMADD f1, A2, B6, f1 + LFD A5, 11 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B6, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 13 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 14 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10,15 * SIZE(BO) + + FMADD f3, A5, B6, f3 + LFD A2, 13 * SIZE(AO) + FMADD f7, A5, B2, f7 + LFD B6, 24 * SIZE(BO) + FMADD f11, A5, B3, f11 + nop + FMADD f15, A5, B4, f15 + nop + + FMADD f0, A6, B7, f0 + LFD A3, 14 * SIZE(AO) + FMADD f4, A6, B8, f4 + LFD A5, 24 * SIZE(AO) + FMADD f8, A6, B9, f8 + nop + FMADD f12, A6, B10, f12 + nop + + FMADD f1, A2, B7, f1 + LFD A6, 15 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B7, f2 + addi AO, AO, 16 * SIZE + FMADD f6, A3, B8, f6 + LFD B2, 17 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 18 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 19 * SIZE(BO) + + FMADD f3, A6, B7, f3 + LFD A2, 1 * SIZE(AO) + FMADD f7, A6, B8, f7 + LFD B7, 28 * SIZE(BO) + FMADD f11, A6, B9, f11 + addi BO, BO, 16 * SIZE + FMADD f15, A6, B10, f15 + bdnz .L12 + .align 4 + +.L15: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L18 + .align 4 + +.L16: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + FMADD f8, A1, B3, f8 + FMADD f12, A1, B4, f12 + LFD A4, 3 * SIZE(AO) + + FMADD f1, A2, B1, f1 + FMADD f5, A2, B2, f5 + FMADD f9, A2, B3, f9 + FMADD f13, A2, B4, f13 + LFDU A1, 4 * SIZE(AO) + + FMADD f2, A3, B1, f2 + FMADD f6, A3, B2, f6 + FMADD f10, A3, B3, f10 + FMADD f14, A3, B4, f14 + LFD A2, 1 * SIZE(AO) + + FMADD f3, A4, B1, f3 + LFDU B1, 4 * SIZE(BO) + FMADD f7, A4, B2, f7 + LFD B2, 1 * SIZE(BO) + FMADD f11, A4, B3, f11 + LFD B3, 2 * SIZE(BO) + FMADD f15, A4, B4, f15 + LFD B4, 3 * SIZE(BO) + bdnz .L16 + .align 4 + +.L18: +#if defined(LN) || defined(RT) + subi r0, KK, 4 + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f10, f26, f10 + FSUB f14, f27, f14 + + FSUB f3, f28, f3 + FSUB f7, f29, f7 + FSUB f11, f30, f11 + FSUB f15, f31, f15 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FMUL f11, f16, f11 + FMUL f15, f16, f15 + + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f10, f17, f11, f10 + FNMSUB f14, f17, f15, f14 + + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f9, f18, f11, f9 + FNMSUB f13, f18, f15, f13 + + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + FNMSUB f8, f19, f11, f8 + FNMSUB f12, f19, f15, f12 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FMUL f10, f16, f10 + FMUL f14, f16, f14 + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f9, f17, f10, f9 + FNMSUB f13, f17, f14, f13 + + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + FNMSUB f8, f18, f10, f8 + FNMSUB f12, f18, f14, f12 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f10, f18, f8, f10 + FNMSUB f14, f18, f12, f14 + + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + FNMSUB f11, f19, f8, f11 + FNMSUB f15, f19, f12, f15 + + LFD f16, 5 * SIZE(AO) + LFD f17, 6 * SIZE(AO) + LFD f18, 7 * SIZE(AO) + LFD f19, 10 * SIZE(AO) + + FMUL f1, f16, f1 + FMUL f5, f16, f5 + FMUL f9, f16, f9 + FMUL f13, f16, f13 + + LFD f20, 11 * SIZE(AO) + LFD f21, 15 * SIZE(AO) + + FNMSUB f2, f17, f1, f2 + FNMSUB f6, f17, f5, f6 + FNMSUB f10, f17, f9, f10 + FNMSUB f14, f17, f13, f14 + + FNMSUB f3, f18, f1, f3 + FNMSUB f7, f18, f5, f7 + FNMSUB f11, f18, f9, f11 + FNMSUB f15, f18, f13, f15 + + FMUL f2, f19, f2 + FMUL f6, f19, f6 + FMUL f10, f19, f10 + FMUL f14, f19, f14 + + FNMSUB f3, f20, f2, f3 + FNMSUB f7, f20, f6, f7 + FNMSUB f11, f20, f10, f11 + FNMSUB f15, f20, f14, f15 + + FMUL f3, f21, f3 + FMUL f7, f21, f7 + FMUL f11, f21, f11 + FMUL f15, f21, f15 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + FNMSUB f14, f19, f2, f14 + FNMSUB f15, f19, f3, f15 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FMUL f6, f16, f6 + FMUL f7, f16, f7 + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f10, f17, f6, f10 + FNMSUB f11, f17, f7, f11 + + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + FNMSUB f14, f18, f6, f14 + FNMSUB f15, f18, f7, f15 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FMUL f10, f19, f10 + FMUL f11, f19, f11 + + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FNMSUB f14, f20, f10, f14 + FNMSUB f15, f20, f11, f15 + + FMUL f12, f21, f12 + FMUL f13, f21, f13 + FMUL f14, f21, f14 + FMUL f15, f21, f15 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FMUL f14, f16, f14 + FMUL f15, f16, f15 + + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f10, f17, f14, f10 + FNMSUB f11, f17, f15, f11 + + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f6, f18, f14, f6 + FNMSUB f7, f18, f15, f7 + + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + FNMSUB f2, f19, f14, f2 + FNMSUB f3, f19, f15, f3 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FMUL f10, f16, f10 + FMUL f11, f16, f11 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f6, f17, f10, f6 + FNMSUB f7, f17, f11, f7 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f6, 9 * SIZE(BO) + STFD f10, 10 * SIZE(BO) + STFD f14, 11 * SIZE(BO) + + STFD f3, 12 * SIZE(BO) + STFD f7, 13 * SIZE(BO) + STFD f11, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ .L11 + .align 4 + +.L20: + andi. I, M, 2 + ble .L30 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L25 + .align 5 + +.L22: + FMADD f0, f16, f20, f0 + nop + FMADD f1, f17, f20, f1 + LFD f20, 8 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 9 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 10 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f16, 4 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + LFD f17, 5 * SIZE(AO) + FMADD f3, f19, f24, f3 + LFD f24, 12 * SIZE(BO) + FMADD f6, f18, f25, f6 + nop + FMADD f7, f19, f25, f7 + LFD f25, 13 * SIZE(BO) + + FMADD f10, f18, f26, f10 + nop + FMADD f11, f19, f26, f11 + LFD f26, 14 * SIZE(BO) + FMADD f14, f18, f27, f14 + LFD f18, 6 * SIZE(AO) + FMADD f15, f19, f27, f15 + LFD f27, 15 * SIZE(BO) + + FMADD f0, f16, f20, f0 + LFD f19, 7 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 16 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 8 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f23, 3 * SIZE(BO) + + FMADD f2, f18, f24, f2 + LFD f17, 1 * SIZE(AO) + FMADD f3, f19, f24, f3 + LFD f24, 4 * SIZE(BO) + FMADD f6, f18, f25, f6 + nop + FMADD f7, f19, f25, f7 + LFD f25, 5 * SIZE(BO) + + FMADD f10, f18, f26, f10 + nop + FMADD f11, f19, f26, f11 + LFD f26, 6 * SIZE(BO) + FMADD f14, f18, f27, f14 + LFD f18, 2 * SIZE(AO) + FMADD f15, f19, f27, f15 + LFD f19, 3 * SIZE(AO) + LFD f27, 7 * SIZE(BO) + bdnz .L22 + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +.L25: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L28 + .align 4 + +.L26: + FMADD f0, f16, f20, f0 + nop + FMADD f1, f17, f20, f1 + LFDU f20, 4 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 2 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f17, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L26 + .align 4 + +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + FMUL f9, f17, f9 + FMUL f13, f17, f13 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FMUL f12, f21, f12 + FMUL f13, f21, f13 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +.L30: + andi. I, M, 1 + ble .L39 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L35 + .align 5 + +.L32: + FMADD f0, f16, f20, f0 + LFD f20, 8 * SIZE(BO) + FMADD f4, f16, f21, f4 + LFD f21, 9 * SIZE(BO) + FMADD f8, f16, f22, f8 + LFD f22, 10 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f23, 11 * SIZE(BO) + LFDU f16, 4 * SIZE(AO) + + FMADD f1, f17, f24, f1 + LFD f24, 12 * SIZE(BO) + FMADD f5, f17, f25, f5 + LFD f25, 13 * SIZE(BO) + FMADD f9, f17, f26, f9 + LFD f26, 14 * SIZE(BO) + FMADD f13, f17, f27, f13 + LFD f27, 15 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + + FMADD f0, f18, f20, f0 + LFDU f20, 16 * SIZE(BO) + FMADD f4, f18, f21, f4 + LFD f21, 1 * SIZE(BO) + FMADD f8, f18, f22, f8 + LFD f22, 2 * SIZE(BO) + FMADD f12, f18, f23, f12 + LFD f23, 3 * SIZE(BO) + LFD f18, 2 * SIZE(AO) + + FMADD f1, f19, f24, f1 + LFD f24, 4 * SIZE(BO) + FMADD f5, f19, f25, f5 + LFD f25, 5 * SIZE(BO) + FMADD f9, f19, f26, f9 + LFD f26, 6 * SIZE(BO) + FMADD f13, f19, f27, f13 + LFD f27, 7 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L32 + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +.L35: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L38 + .align 4 + +.L36: + FMADD f0, f16, f20, f0 + LFDU f20, 4 * SIZE(BO) + FMADD f4, f16, f21, f4 + LFD f21, 1 * SIZE(BO) + FMADD f8, f16, f22, f8 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L36 + .align 4 + +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + LFD f24, 2 * SIZE(AO) + LFD f28, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f4, f20, f4 + FSUB f8, f24, f8 + FSUB f12, f28, f12 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f4, f17, f0, f4 + FNMSUB f8, f18, f0, f8 + FNMSUB f12, f19, f0, f12 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FNMSUB f8, f17, f4, f8 + FNMSUB f12, f18, f4, f12 + FMUL f8, f19, f8 + FNMSUB f12, f20, f8, f12 + FMUL f12, f21, f12 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FNMSUB f8, f17, f12, f8 + FNMSUB f4, f18, f12, f4 + FNMSUB f0, f19, f12, f0 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f0, f18, f8, f0 + + FMUL f4, f19, f4 + FNMSUB f0, f20, f4, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f4, 1 * SIZE(AO) + STFD f8, 2 * SIZE(AO) + STFD f12, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE + addi CO3, CO3, 1 * SIZE + addi CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +.L39: +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + lfs f0, FZERO + bgt .L10 + .align 4 + +.L40: + andi. J, N, 2 + ble .L70 + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble .L50 + .align 4 + +.L41: +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L45 + .align 5 + +.L42: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFD f20, 4 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFD f16, 4 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 5 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 6 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFD f21, 5 * SIZE(BO) + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + LFD f22, 6 * SIZE(BO) + + FMADD f4, f16, f23, f4 + LFD f16, 8 * SIZE(AO) + FMADD f5, f17, f23, f5 + LFD f17, 9 * SIZE(AO) + FMADD f6, f18, f23, f6 + LFD f18, 10 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f20, f0 + LFD f23, 7 * SIZE(BO) + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFDU f20, 8 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFD f16, 12 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 13 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 14 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFD f21, 1 * SIZE(BO) + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + LFD f22, 2 * SIZE(BO) + + FMADD f4, f16, f23, f4 + LFDU f16, 16 * SIZE(AO) + FMADD f5, f17, f23, f5 + LFD f17, 1 * SIZE(AO) + FMADD f6, f18, f23, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 3 * SIZE(AO) + + LFD f23, 3 * SIZE(BO) + bdnz .L42 + .align 4 + +.L45: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L48 + .align 4 + +.L46: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFDU f20, 2 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFDU f16, 4 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 1 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 3 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L46 + .align 4 + +.L48: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f28, 6 * SIZE(BO) + LFD f29, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f1, f20, f1 + FSUB f5, f21, f5 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f3, f28, f3 + FSUB f7, f29, f7 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FMUL f0, f21, f0 + FMUL f4, f21, f4 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + + FNMSUB f2, f18, f1, f2 + FNMSUB f6, f18, f5, f6 + + FNMSUB f3, f19, f1, f3 + FNMSUB f7, f19, f5, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FMUL f6, f18, f6 + + FNMSUB f3, f19, f2, f3 + FNMSUB f7, f19, f6, f7 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 + FMUL f7, f19, f7 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FMUL f4, f18, f4 + FMUL f5, f18, f5 + FMUL f6, f18, f6 + FMUL f7, f18, f7 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f6, 5 * SIZE(BO) + STFD f3, 6 * SIZE(BO) + STFD f7, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ .L41 + .align 4 + +.L50: + andi. I, M, 2 + ble .L60 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L55 + .align 5 + +.L52: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + LFDU f20, 8 * SIZE(BO) + FMADD f2, f16, f21, f2 + LFD f16, 4 * SIZE(AO) + FMADD f3, f17, f21, f3 + LFD f17, 5 * SIZE(AO) + + FMADD f4, f18, f22, f4 + LFD f21, 1 * SIZE(BO) + FMADD f5, f19, f22, f5 + LFD f22, 2 * SIZE(BO) + FMADD f6, f18, f23, f6 + LFD f18, 6 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f24, f0 + LFD f23, 3 * SIZE(BO) + FMADD f1, f17, f24, f1 + LFD f24, 4 * SIZE(BO) + FMADD f2, f16, f25, f2 + LFDU f16, 8 * SIZE(AO) + FMADD f3, f17, f25, f3 + LFD f17, 1 * SIZE(AO) + + FMADD f4, f18, f26, f4 + LFD f25, 5 * SIZE(BO) + FMADD f5, f19, f26, f5 + LFD f26, 6 * SIZE(BO) + FMADD f6, f18, f27, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f27, f7 + LFD f19, 3 * SIZE(AO) + + LFD f27, 7 * SIZE(BO) + bdnz .L52 + .align 4 + +.L55: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L58 + .align 4 + +.L56: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + LFDU f20, 2 * SIZE(BO) + FMADD f2, f16, f21, f2 + LFDU f16, 2 * SIZE(AO) + FMADD f3, f17, f21, f3 + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L56 + .align 4 + +.L58: + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f2, f17, f2 + FSUB f1, f20, f1 + FSUB f3, f21, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f3, f19, f3 + + FNMSUB f0, f20, f1, f0 + FNMSUB f2, f20, f3, f2 + + FMUL f0, f21, f0 + FMUL f2, f21, f2 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f2, f16, f2 + FNMSUB f1, f17, f0, f1 + FNMSUB f3, f17, f2, f3 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f3, f17, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + + FNMSUB f2, f17, f0, f2 + FNMSUB f3, f17, f1, f3 + FMUL f2, f18, f2 + FMUL f3, f18, f3 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f2, f19, f2 + FMUL f3, f19, f3 + FNMSUB f0, f20, f2, f0 + FNMSUB f1, f20, f3, f1 + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f2, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +.L60: + andi. I, M, 1 + ble .L69 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L65 + .align 5 + +.L62: + FMADD f0, f16, f20, f0 + LFDU f20, 8 * SIZE(BO) + FMADD f1, f16, f21, f1 + LFDU f16, 4 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + FMADD f2, f17, f22, f2 + LFD f22, 2 * SIZE(BO) + FMADD f3, f17, f23, f3 + LFD f17, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + + FMADD f0, f18, f24, f0 + LFD f24, 4 * SIZE(BO) + FMADD f1, f18, f25, f1 + LFD f18, 2 * SIZE(AO) + LFD f25, 5 * SIZE(BO) + FMADD f2, f19, f26, f2 + LFD f26, 6 * SIZE(BO) + FMADD f3, f19, f27, f3 + LFD f19, 3 * SIZE(AO) + LFD f27, 7 * SIZE(BO) + bdnz .L62 + .align 4 + +.L65: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L68 + .align 4 + +.L66: + FMADD f0, f16, f20, f0 + LFDU f20, 2 * SIZE(BO) + FMADD f1, f16, f21, f1 + LFDU f16, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L66 + .align 4 + +.L68: + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FMUL f1, f18, f1 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +.L69: +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + lfs f0, FZERO + .align 4 + +.L70: + andi. J, N, 1 + ble .L999 + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO1, LDC +#endif + ble .L80 + .align 4 + +.L71: +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L75 + .align 5 + +.L72: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFD f17, 5 * SIZE(AO) + FMADD f2, f18, f20, f2 + LFD f18, 6 * SIZE(AO) + FMADD f3, f19, f20, f3 + LFD f19, 7 * SIZE(AO) + LFDU f20, 4 * SIZE(BO) + + FMADD f0, f16, f21, f0 + LFD f16, 8 * SIZE(AO) + FMADD f1, f17, f21, f1 + LFD f17, 9 * SIZE(AO) + FMADD f2, f18, f21, f2 + LFD f18, 10 * SIZE(AO) + FMADD f3, f19, f21, f3 + LFD f19, 11 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + + FMADD f0, f16, f22, f0 + LFD f16, 12 * SIZE(AO) + FMADD f1, f17, f22, f1 + LFD f17, 13 * SIZE(AO) + FMADD f2, f18, f22, f2 + LFD f18, 14 * SIZE(AO) + FMADD f3, f19, f22, f3 + LFD f19, 15 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + + FMADD f0, f16, f23, f0 + LFDU f16, 16 * SIZE(AO) + FMADD f1, f17, f23, f1 + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f23, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f23, f3 + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L72 + .align 4 + +.L75: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L78 + .align 4 + +.L76: + FMADD f0, f16, f20, f0 + LFDU f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f20, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f20, f3 + LFDU f20, 1 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L76 + .align 4 + +.L78: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + LFD f24, 2 * SIZE(BO) + LFD f28, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 + FSUB f2, f24, f2 + FSUB f3, f28, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FNMSUB f2, f17, f3, f2 + FNMSUB f1, f18, f3, f1 + FNMSUB f0, f19, f3, f0 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FNMSUB f1, f17, f2, f1 + FNMSUB f0, f18, f2, f0 + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f19, f0, f3 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FNMSUB f2, f18, f1, f2 + FNMSUB f3, f19, f1, f3 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FNMSUB f3, f19, f2, f3 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ .L71 + .align 4 + +.L80: + andi. I, M, 2 + ble .L90 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L85 + .align 5 + +.L82: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 4 * SIZE(BO) + LFD f17, 5 * SIZE(AO) + FMADD f2, f18, f21, f2 + LFD f18, 6 * SIZE(AO) + FMADD f3, f19, f21, f3 + LFD f21, 1 * SIZE(BO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFDU f16, 8 * SIZE(AO) + FMADD f1, f17, f22, f1 + LFD f22, 2 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f23, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f23, f3 + LFD f23, 3 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L82 + .align 4 + +.L85: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L88 + .align 4 + +.L86: + FMADD f0, f16, f20, f0 + LFDU f16, 2 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 1 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + bdnz .L86 + .align 4 + +.L88: + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + + LFD f17, 3 * SIZE(AO) + FMUL f1, f17, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +.L90: + andi. I, M, 1 + ble .L999 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble .L95 + .align 5 + +.L92: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + FMADD f1, f17, f21, f1 + LFD f17, 5 * SIZE(AO) + LFD f21, 5 * SIZE(BO) + FMADD f2, f18, f22, f2 + LFD f18, 6 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + FMADD f3, f19, f23, f3 + LFD f19, 7 * SIZE(AO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + LFDU f16, 8 * SIZE(AO) + LFDU f20, 8 * SIZE(BO) + FMADD f1, f17, f21, f1 + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + FMADD f2, f18, f22, f2 + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + FMADD f3, f19, f23, f3 + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L92 + .align 4 + +.L95: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ .L98 + .align 4 + +.L96: + FMADD f0, f16, f20, f0 + LFDU f16, 1 * SIZE(AO) + LFDU f20, 1 * SIZE(BO) + bdnz .L96 + .align 4 + +.L98: + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + FSUB f0, f16, f0 +#else + LFD f16, 0 * SIZE(AO) + FSUB f0, f16, f0 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + FMUL f0, f16, f0 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + FMUL f0, f16, f0 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/trsm_kernel_ppc440_RT.S b/kernel/power/trsm_kernel_ppc440_RT.S new file mode 100644 index 0000000..54c59c2 --- /dev/null +++ b/kernel/power/trsm_kernel_ppc440_RT.S @@ -0,0 +1,3496 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define B1 f22 +#define B2 f23 +#define B3 f24 +#define B4 f25 +#define B5 f26 +#define B6 f27 +#define B7 f28 +#define B8 f29 +#define B9 f30 +#define B10 f31 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) +#endif + + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + lfs f0, FZERO + +.L70: + andi. J, N, 1 + ble .L40 + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO1, LDC +#endif + ble .L80 + .align 4 + +.L71: +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L75 + .align 5 + +.L72: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFD f17, 5 * SIZE(AO) + FMADD f2, f18, f20, f2 + LFD f18, 6 * SIZE(AO) + FMADD f3, f19, f20, f3 + LFD f19, 7 * SIZE(AO) + LFDU f20, 4 * SIZE(BO) + + FMADD f0, f16, f21, f0 + LFD f16, 8 * SIZE(AO) + FMADD f1, f17, f21, f1 + LFD f17, 9 * SIZE(AO) + FMADD f2, f18, f21, f2 + LFD f18, 10 * SIZE(AO) + FMADD f3, f19, f21, f3 + LFD f19, 11 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + + FMADD f0, f16, f22, f0 + LFD f16, 12 * SIZE(AO) + FMADD f1, f17, f22, f1 + LFD f17, 13 * SIZE(AO) + FMADD f2, f18, f22, f2 + LFD f18, 14 * SIZE(AO) + FMADD f3, f19, f22, f3 + LFD f19, 15 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + + FMADD f0, f16, f23, f0 + LFDU f16, 16 * SIZE(AO) + FMADD f1, f17, f23, f1 + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f23, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f23, f3 + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L72 + .align 4 + +.L75: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L78 + .align 4 + +.L76: + FMADD f0, f16, f20, f0 + LFDU f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f20, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f20, f3 + LFDU f20, 1 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L76 + .align 4 + +.L78: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + LFD f24, 2 * SIZE(BO) + LFD f28, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 + FSUB f2, f24, f2 + FSUB f3, f28, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FNMSUB f2, f17, f3, f2 + FNMSUB f1, f18, f3, f1 + FNMSUB f0, f19, f3, f0 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FNMSUB f1, f17, f2, f1 + FNMSUB f0, f18, f2, f0 + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f19, f0, f3 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FNMSUB f2, f18, f1, f2 + FNMSUB f3, f19, f1, f3 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FNMSUB f3, f19, f2, f3 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ .L71 + .align 4 + +.L80: + andi. I, M, 2 + ble .L90 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L85 + .align 5 + +.L82: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 4 * SIZE(BO) + LFD f17, 5 * SIZE(AO) + FMADD f2, f18, f21, f2 + LFD f18, 6 * SIZE(AO) + FMADD f3, f19, f21, f3 + LFD f21, 1 * SIZE(BO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFDU f16, 8 * SIZE(AO) + FMADD f1, f17, f22, f1 + LFD f22, 2 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f23, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f23, f3 + LFD f23, 3 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L82 + .align 4 + +.L85: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L88 + .align 4 + +.L86: + FMADD f0, f16, f20, f0 + LFDU f16, 2 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 1 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + bdnz .L86 + .align 4 + +.L88: + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + + LFD f17, 3 * SIZE(AO) + FMUL f1, f17, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +.L90: + andi. I, M, 1 + ble .L99 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble .L95 + .align 5 + +.L92: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + FMADD f1, f17, f21, f1 + LFD f17, 5 * SIZE(AO) + LFD f21, 5 * SIZE(BO) + FMADD f2, f18, f22, f2 + LFD f18, 6 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + FMADD f3, f19, f23, f3 + LFD f19, 7 * SIZE(AO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + LFDU f16, 8 * SIZE(AO) + LFDU f20, 8 * SIZE(BO) + FMADD f1, f17, f21, f1 + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + FMADD f2, f18, f22, f2 + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + FMADD f3, f19, f23, f3 + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L92 + .align 4 + +.L95: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ .L98 + .align 4 + +.L96: + FMADD f0, f16, f20, f0 + LFDU f16, 1 * SIZE(AO) + LFDU f20, 1 * SIZE(BO) + bdnz .L96 + .align 4 + +.L98: + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + FSUB f0, f16, f0 +#else + LFD f16, 0 * SIZE(AO) + FSUB f0, f16, f0 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + FMUL f0, f16, f0 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + FMUL f0, f16, f0 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + + lfs f0, FZERO + +#ifndef LN + addi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +.L99: +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +.L40: + andi. J, N, 2 + ble .L09 + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble .L50 + .align 4 + +.L41: +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L45 + .align 5 + +.L42: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFD f20, 4 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFD f16, 4 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 5 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 6 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFD f21, 5 * SIZE(BO) + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + LFD f22, 6 * SIZE(BO) + + FMADD f4, f16, f23, f4 + LFD f16, 8 * SIZE(AO) + FMADD f5, f17, f23, f5 + LFD f17, 9 * SIZE(AO) + FMADD f6, f18, f23, f6 + LFD f18, 10 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f20, f0 + LFD f23, 7 * SIZE(BO) + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFDU f20, 8 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFD f16, 12 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 13 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 14 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFD f21, 1 * SIZE(BO) + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + LFD f22, 2 * SIZE(BO) + + FMADD f4, f16, f23, f4 + LFDU f16, 16 * SIZE(AO) + FMADD f5, f17, f23, f5 + LFD f17, 1 * SIZE(AO) + FMADD f6, f18, f23, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 3 * SIZE(AO) + + LFD f23, 3 * SIZE(BO) + bdnz .L42 + .align 4 + +.L45: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L48 + .align 4 + +.L46: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFDU f20, 2 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFDU f16, 4 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 1 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 3 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L46 + .align 4 + +.L48: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f28, 6 * SIZE(BO) + LFD f29, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f1, f20, f1 + FSUB f5, f21, f5 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f3, f28, f3 + FSUB f7, f29, f7 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FMUL f0, f21, f0 + FMUL f4, f21, f4 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + + FNMSUB f2, f18, f1, f2 + FNMSUB f6, f18, f5, f6 + + FNMSUB f3, f19, f1, f3 + FNMSUB f7, f19, f5, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FMUL f6, f18, f6 + + FNMSUB f3, f19, f2, f3 + FNMSUB f7, f19, f6, f7 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 + FMUL f7, f19, f7 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FMUL f4, f18, f4 + FMUL f5, f18, f5 + FMUL f6, f18, f6 + FMUL f7, f18, f7 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f6, 5 * SIZE(BO) + STFD f3, 6 * SIZE(BO) + STFD f7, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ .L41 + .align 4 + +.L50: + andi. I, M, 2 + ble .L60 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L55 + .align 5 + +.L52: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + LFDU f20, 8 * SIZE(BO) + FMADD f2, f16, f21, f2 + LFD f16, 4 * SIZE(AO) + FMADD f3, f17, f21, f3 + LFD f17, 5 * SIZE(AO) + + FMADD f4, f18, f22, f4 + LFD f21, 1 * SIZE(BO) + FMADD f5, f19, f22, f5 + LFD f22, 2 * SIZE(BO) + FMADD f6, f18, f23, f6 + LFD f18, 6 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f24, f0 + LFD f23, 3 * SIZE(BO) + FMADD f1, f17, f24, f1 + LFD f24, 4 * SIZE(BO) + FMADD f2, f16, f25, f2 + LFDU f16, 8 * SIZE(AO) + FMADD f3, f17, f25, f3 + LFD f17, 1 * SIZE(AO) + + FMADD f4, f18, f26, f4 + LFD f25, 5 * SIZE(BO) + FMADD f5, f19, f26, f5 + LFD f26, 6 * SIZE(BO) + FMADD f6, f18, f27, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f27, f7 + LFD f19, 3 * SIZE(AO) + + LFD f27, 7 * SIZE(BO) + bdnz .L52 + .align 4 + +.L55: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L58 + .align 4 + +.L56: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + LFDU f20, 2 * SIZE(BO) + FMADD f2, f16, f21, f2 + LFDU f16, 2 * SIZE(AO) + FMADD f3, f17, f21, f3 + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L56 + .align 4 + +.L58: + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f2, f17, f2 + FSUB f1, f20, f1 + FSUB f3, f21, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f3, f19, f3 + + FNMSUB f0, f20, f1, f0 + FNMSUB f2, f20, f3, f2 + + FMUL f0, f21, f0 + FMUL f2, f21, f2 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f2, f16, f2 + FNMSUB f1, f17, f0, f1 + FNMSUB f3, f17, f2, f3 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f3, f17, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + + FNMSUB f2, f17, f0, f2 + FNMSUB f3, f17, f1, f3 + FMUL f2, f18, f2 + FMUL f3, f18, f3 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f2, f19, f2 + FMUL f3, f19, f3 + FNMSUB f0, f20, f2, f0 + FNMSUB f1, f20, f3, f1 + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f2, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +.L60: + andi. I, M, 1 + ble .L69 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L65 + .align 5 + +.L62: + FMADD f0, f16, f20, f0 + LFDU f20, 8 * SIZE(BO) + FMADD f1, f16, f21, f1 + LFDU f16, 4 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + FMADD f2, f17, f22, f2 + LFD f22, 2 * SIZE(BO) + FMADD f3, f17, f23, f3 + LFD f17, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + + FMADD f0, f18, f24, f0 + LFD f24, 4 * SIZE(BO) + FMADD f1, f18, f25, f1 + LFD f18, 2 * SIZE(AO) + LFD f25, 5 * SIZE(BO) + FMADD f2, f19, f26, f2 + LFD f26, 6 * SIZE(BO) + FMADD f3, f19, f27, f3 + LFD f19, 3 * SIZE(AO) + LFD f27, 7 * SIZE(BO) + bdnz .L62 + .align 4 + +.L65: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L68 + .align 4 + +.L66: + FMADD f0, f16, f20, f0 + LFDU f20, 2 * SIZE(BO) + FMADD f1, f16, f21, f1 + LFDU f16, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L66 + .align 4 + +.L68: + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FMUL f1, f18, f1 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +.L69: +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + lfs f0, FZERO + .align 4 + +.L09: + srawi. J, N, 2 + ble .L999 + .align 4 + +.L10: + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 2 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + ble .L20 + .align 4 + +.L11: +#if defined(LT) || defined(RN) + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(B) + LFD B2, 1 * SIZE(B) + LFD B3, 2 * SIZE(B) + LFD B4, 3 * SIZE(B) + LFD B5, 4 * SIZE(B) + LFD B6, 8 * SIZE(B) + LFD B7, 12 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(BO) + LFD B2, 1 * SIZE(BO) + LFD B3, 2 * SIZE(BO) + LFD B4, 3 * SIZE(BO) + LFD B5, 4 * SIZE(BO) + LFD B6, 8 * SIZE(BO) + LFD B7, 12 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L15 + .align 4 + +.L12: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + LFD A6, 12 * SIZE(AO) + FMADD f8, A1, B3, f8 + nop + FMADD f12, A1, B4, f12 + nop + + FMADD f1, A2, B1, f1 + LFD A1, 3 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B1, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 5 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 6 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10, 7 * SIZE(BO) + + FMADD f3, A1, B1, f3 + LFD A2, 5 * SIZE(AO) + FMADD f7, A1, B2, f7 + LFD B1, 16 * SIZE(BO) + FMADD f11, A1, B3, f11 + nop + FMADD f15, A1, B4, f15 + nop + + FMADD f0, A4, B5, f0 + LFD A3, 6 * SIZE(AO) + FMADD f4, A4, B8, f4 + LFD A1, 16 * SIZE(AO) + FMADD f8, A4, B9, f8 + nop + FMADD f12, A4, B10, f12 + nop + + FMADD f1, A2, B5, f1 + LFD A4, 7 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B5, f2 + nop + FMADD f6, A3, B8, f6 + LFD B2, 9 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 10 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 11 * SIZE(BO) + + FMADD f3, A4, B5, f3 + LFD A2, 9 * SIZE(AO) + FMADD f7, A4, B8, f7 + LFD B5, 20 * SIZE(BO) + FMADD f11, A4, B9, f11 + nop + FMADD f15, A4, B10, f15 + nop + + FMADD f0, A5, B6, f0 + LFD A3, 10 * SIZE(AO) + FMADD f4, A5, B2, f4 + LFD A4, 20 * SIZE(AO) + FMADD f8, A5, B3, f8 + nop + FMADD f12, A5, B4, f12 + nop + + FMADD f1, A2, B6, f1 + LFD A5, 11 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B6, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 13 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 14 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10,15 * SIZE(BO) + + FMADD f3, A5, B6, f3 + LFD A2, 13 * SIZE(AO) + FMADD f7, A5, B2, f7 + LFD B6, 24 * SIZE(BO) + FMADD f11, A5, B3, f11 + nop + FMADD f15, A5, B4, f15 + nop + + FMADD f0, A6, B7, f0 + LFD A3, 14 * SIZE(AO) + FMADD f4, A6, B8, f4 + LFD A5, 24 * SIZE(AO) + FMADD f8, A6, B9, f8 + nop + FMADD f12, A6, B10, f12 + nop + + FMADD f1, A2, B7, f1 + LFD A6, 15 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B7, f2 + addi AO, AO, 16 * SIZE + FMADD f6, A3, B8, f6 + LFD B2, 17 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 18 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 19 * SIZE(BO) + + FMADD f3, A6, B7, f3 + LFD A2, 1 * SIZE(AO) + FMADD f7, A6, B8, f7 + LFD B7, 28 * SIZE(BO) + FMADD f11, A6, B9, f11 + addi BO, BO, 16 * SIZE + FMADD f15, A6, B10, f15 + bdnz .L12 + .align 4 + +.L15: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L18 + .align 4 + +.L16: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + FMADD f8, A1, B3, f8 + FMADD f12, A1, B4, f12 + LFD A4, 3 * SIZE(AO) + + FMADD f1, A2, B1, f1 + FMADD f5, A2, B2, f5 + FMADD f9, A2, B3, f9 + FMADD f13, A2, B4, f13 + LFDU A1, 4 * SIZE(AO) + + FMADD f2, A3, B1, f2 + FMADD f6, A3, B2, f6 + FMADD f10, A3, B3, f10 + FMADD f14, A3, B4, f14 + LFD A2, 1 * SIZE(AO) + + FMADD f3, A4, B1, f3 + LFDU B1, 4 * SIZE(BO) + FMADD f7, A4, B2, f7 + LFD B2, 1 * SIZE(BO) + FMADD f11, A4, B3, f11 + LFD B3, 2 * SIZE(BO) + FMADD f15, A4, B4, f15 + LFD B4, 3 * SIZE(BO) + bdnz .L16 + .align 4 + +.L18: +#if defined(LN) || defined(RT) + subi r0, KK, 4 + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f10, f26, f10 + FSUB f14, f27, f14 + + FSUB f3, f28, f3 + FSUB f7, f29, f7 + FSUB f11, f30, f11 + FSUB f15, f31, f15 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FMUL f11, f16, f11 + FMUL f15, f16, f15 + + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f10, f17, f11, f10 + FNMSUB f14, f17, f15, f14 + + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f9, f18, f11, f9 + FNMSUB f13, f18, f15, f13 + + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + FNMSUB f8, f19, f11, f8 + FNMSUB f12, f19, f15, f12 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FMUL f10, f16, f10 + FMUL f14, f16, f14 + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f9, f17, f10, f9 + FNMSUB f13, f17, f14, f13 + + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + FNMSUB f8, f18, f10, f8 + FNMSUB f12, f18, f14, f12 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f10, f18, f8, f10 + FNMSUB f14, f18, f12, f14 + + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + FNMSUB f11, f19, f8, f11 + FNMSUB f15, f19, f12, f15 + + LFD f16, 5 * SIZE(AO) + LFD f17, 6 * SIZE(AO) + LFD f18, 7 * SIZE(AO) + LFD f19, 10 * SIZE(AO) + + FMUL f1, f16, f1 + FMUL f5, f16, f5 + FMUL f9, f16, f9 + FMUL f13, f16, f13 + + LFD f20, 11 * SIZE(AO) + LFD f21, 15 * SIZE(AO) + + FNMSUB f2, f17, f1, f2 + FNMSUB f6, f17, f5, f6 + FNMSUB f10, f17, f9, f10 + FNMSUB f14, f17, f13, f14 + + FNMSUB f3, f18, f1, f3 + FNMSUB f7, f18, f5, f7 + FNMSUB f11, f18, f9, f11 + FNMSUB f15, f18, f13, f15 + + FMUL f2, f19, f2 + FMUL f6, f19, f6 + FMUL f10, f19, f10 + FMUL f14, f19, f14 + + FNMSUB f3, f20, f2, f3 + FNMSUB f7, f20, f6, f7 + FNMSUB f11, f20, f10, f11 + FNMSUB f15, f20, f14, f15 + + FMUL f3, f21, f3 + FMUL f7, f21, f7 + FMUL f11, f21, f11 + FMUL f15, f21, f15 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + FNMSUB f14, f19, f2, f14 + FNMSUB f15, f19, f3, f15 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FMUL f6, f16, f6 + FMUL f7, f16, f7 + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f10, f17, f6, f10 + FNMSUB f11, f17, f7, f11 + + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + FNMSUB f14, f18, f6, f14 + FNMSUB f15, f18, f7, f15 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FMUL f10, f19, f10 + FMUL f11, f19, f11 + + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FNMSUB f14, f20, f10, f14 + FNMSUB f15, f20, f11, f15 + + FMUL f12, f21, f12 + FMUL f13, f21, f13 + FMUL f14, f21, f14 + FMUL f15, f21, f15 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FMUL f14, f16, f14 + FMUL f15, f16, f15 + + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f10, f17, f14, f10 + FNMSUB f11, f17, f15, f11 + + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f6, f18, f14, f6 + FNMSUB f7, f18, f15, f7 + + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + FNMSUB f2, f19, f14, f2 + FNMSUB f3, f19, f15, f3 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FMUL f10, f16, f10 + FMUL f11, f16, f11 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f6, f17, f10, f6 + FNMSUB f7, f17, f11, f7 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f6, 9 * SIZE(BO) + STFD f10, 10 * SIZE(BO) + STFD f14, 11 * SIZE(BO) + + STFD f3, 12 * SIZE(BO) + STFD f7, 13 * SIZE(BO) + STFD f11, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ .L11 + .align 4 + +.L20: + andi. I, M, 2 + ble .L30 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L25 + .align 5 + +.L22: + FMADD f0, f16, f20, f0 + nop + FMADD f1, f17, f20, f1 + LFD f20, 8 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 9 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 10 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f16, 4 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + LFD f17, 5 * SIZE(AO) + FMADD f3, f19, f24, f3 + LFD f24, 12 * SIZE(BO) + FMADD f6, f18, f25, f6 + nop + FMADD f7, f19, f25, f7 + LFD f25, 13 * SIZE(BO) + + FMADD f10, f18, f26, f10 + nop + FMADD f11, f19, f26, f11 + LFD f26, 14 * SIZE(BO) + FMADD f14, f18, f27, f14 + LFD f18, 6 * SIZE(AO) + FMADD f15, f19, f27, f15 + LFD f27, 15 * SIZE(BO) + + FMADD f0, f16, f20, f0 + LFD f19, 7 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 16 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 8 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f23, 3 * SIZE(BO) + + FMADD f2, f18, f24, f2 + LFD f17, 1 * SIZE(AO) + FMADD f3, f19, f24, f3 + LFD f24, 4 * SIZE(BO) + FMADD f6, f18, f25, f6 + nop + FMADD f7, f19, f25, f7 + LFD f25, 5 * SIZE(BO) + + FMADD f10, f18, f26, f10 + nop + FMADD f11, f19, f26, f11 + LFD f26, 6 * SIZE(BO) + FMADD f14, f18, f27, f14 + LFD f18, 2 * SIZE(AO) + FMADD f15, f19, f27, f15 + LFD f19, 3 * SIZE(AO) + LFD f27, 7 * SIZE(BO) + bdnz .L22 + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +.L25: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L28 + .align 4 + +.L26: + FMADD f0, f16, f20, f0 + nop + FMADD f1, f17, f20, f1 + LFDU f20, 4 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 2 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f17, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L26 + .align 4 + +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + FMUL f9, f17, f9 + FMUL f13, f17, f13 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FMUL f12, f21, f12 + FMUL f13, f21, f13 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +.L30: + andi. I, M, 1 + ble .L39 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L35 + .align 5 + +.L32: + FMADD f0, f16, f20, f0 + LFD f20, 8 * SIZE(BO) + FMADD f4, f16, f21, f4 + LFD f21, 9 * SIZE(BO) + FMADD f8, f16, f22, f8 + LFD f22, 10 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f23, 11 * SIZE(BO) + LFDU f16, 4 * SIZE(AO) + + FMADD f1, f17, f24, f1 + LFD f24, 12 * SIZE(BO) + FMADD f5, f17, f25, f5 + LFD f25, 13 * SIZE(BO) + FMADD f9, f17, f26, f9 + LFD f26, 14 * SIZE(BO) + FMADD f13, f17, f27, f13 + LFD f27, 15 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + + FMADD f0, f18, f20, f0 + LFDU f20, 16 * SIZE(BO) + FMADD f4, f18, f21, f4 + LFD f21, 1 * SIZE(BO) + FMADD f8, f18, f22, f8 + LFD f22, 2 * SIZE(BO) + FMADD f12, f18, f23, f12 + LFD f23, 3 * SIZE(BO) + LFD f18, 2 * SIZE(AO) + + FMADD f1, f19, f24, f1 + LFD f24, 4 * SIZE(BO) + FMADD f5, f19, f25, f5 + LFD f25, 5 * SIZE(BO) + FMADD f9, f19, f26, f9 + LFD f26, 6 * SIZE(BO) + FMADD f13, f19, f27, f13 + LFD f27, 7 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L32 + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +.L35: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L38 + .align 4 + +.L36: + FMADD f0, f16, f20, f0 + LFDU f20, 4 * SIZE(BO) + FMADD f4, f16, f21, f4 + LFD f21, 1 * SIZE(BO) + FMADD f8, f16, f22, f8 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L36 + .align 4 + +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + LFD f24, 2 * SIZE(AO) + LFD f28, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f4, f20, f4 + FSUB f8, f24, f8 + FSUB f12, f28, f12 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f4, f17, f0, f4 + FNMSUB f8, f18, f0, f8 + FNMSUB f12, f19, f0, f12 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FNMSUB f8, f17, f4, f8 + FNMSUB f12, f18, f4, f12 + FMUL f8, f19, f8 + FNMSUB f12, f20, f8, f12 + FMUL f12, f21, f12 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FNMSUB f8, f17, f12, f8 + FNMSUB f4, f18, f12, f4 + FNMSUB f0, f19, f12, f0 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f0, f18, f8, f0 + + FMUL f4, f19, f4 + FNMSUB f0, f20, f4, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f4, 1 * SIZE(AO) + STFD f8, 2 * SIZE(AO) + STFD f12, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE + addi CO3, CO3, 1 * SIZE + addi CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +.L39: +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + lfs f0, FZERO + bgt .L10 + .align 4 + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE diff --git a/kernel/power/zamax.S b/kernel/power/zamax.S new file mode 100644 index 0000000..6acd96d --- /dev/null +++ b/kernel/power/zamax.S @@ -0,0 +1,505 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 +#define INCXM1 r9 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + subi INCXM1, INCX, SIZE + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + LFD f2, 1 * SIZE(X) + add X, X, INCX + + fabs f1, f1 + fabs f2, f2 + fadd f1, f1, f2 + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(100) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f24, 0 * SIZE(X) + LFD f25, 1 * SIZE(X) + LFD f26, 2 * SIZE(X) + LFD f27, 3 * SIZE(X) + LFD f28, 4 * SIZE(X) + LFD f29, 5 * SIZE(X) + LFD f30, 6 * SIZE(X) + LFD f31, 7 * SIZE(X) + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + bdz LL(20) + .align 4 + +LL(10): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 16 * SIZE(X) + LFD f25, 17 * SIZE(X) + LFD f26, 18 * SIZE(X) + LFD f27, 19 * SIZE(X) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 20 * SIZE(X) + LFD f29, 21 * SIZE(X) + LFD f30, 22 * SIZE(X) + LFD f31, 23 * SIZE(X) + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fsel f0, f16, f0, f4 + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsel f0, f16, f0, f4 + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + addi X, X, 2 * SIZE + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCXM1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + bdz LL(120) + .align 4 + +LL(110): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fsel f0, f16, f0, f4 + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + bdnz LL(110) + .align 4 + +LL(120): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsel f0, f16, f0, f4 + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsub f8, f0, f2 + fsel f1, f8, f0, f2 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zamax_cell.S b/kernel/power/zamax_cell.S new file mode 100644 index 0000000..2af3d24 --- /dev/null +++ b/kernel/power/zamax_cell.S @@ -0,0 +1,495 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 +#define INCXM1 r9 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + subi INCXM1, INCX, SIZE + + li PREA, 10 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + LFD f2, 1 * SIZE(X) + add X, X, INCX + + fabs f1, f1 + fabs f2, f2 + fadd f1, f1, f2 + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(100) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f24, 0 * SIZE(X) + LFD f25, 1 * SIZE(X) + + fabs f8, f24 + LFD f26, 2 * SIZE(X) + fabs f9, f25 + LFD f27, 3 * SIZE(X) + fabs f10, f26 + LFD f28, 4 * SIZE(X) + fabs f11, f27 + LFD f29, 5 * SIZE(X) + fabs f12, f28 + LFD f30, 6 * SIZE(X) + fabs f13, f29 + LFD f31, 7 * SIZE(X) + fabs f14, f30 + nop + fabs f15, f31 + bdz LL(20) + .align 4 + +LL(10): + fadd f4, f8, f9 + dcbt X, PREA + fadd f5, f10, f11 + nop + fadd f6, f12, f13 + LFD f24, 8 * SIZE(X) + fadd f7, f14, f15 + LFD f25, 9 * SIZE(X) + + fabs f8, f24 + LFD f26, 10 * SIZE(X) + fabs f9, f25 + LFD f27, 11 * SIZE(X) + fabs f10, f26 + fabs f11, f27 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + LFD f28, 12 * SIZE(X) + fsub f19, f3, f7 + LFD f29, 13 * SIZE(X) + + fabs f12, f28 + LFD f30, 14 * SIZE(X) + fabs f13, f29 + LFD f31, 15 * SIZE(X) + fabs f14, f30 + fabs f15, f31 + + fsel f0, f16, f0, f4 + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + LFD f24, 16 * SIZE(X) + fadd f23, f14, f15 + LFD f25, 17 * SIZE(X) + + fabs f8, f24 + LFD f26, 18 * SIZE(X) + fabs f9, f25 + LFD f27, 19 * SIZE(X) + fabs f10, f26 + fabs f11, f27 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + LFD f28, 20 * SIZE(X) + fsub f19, f3, f23 + LFD f29, 21 * SIZE(X) + + fabs f12, f28 + LFD f30, 22 * SIZE(X) + fabs f13, f29 + LFD f31, 23 * SIZE(X) + fabs f14, f30 + addi X, X, 16 * SIZE + fabs f15, f31 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + + bdnz LL(10) + .align 4 + +LL(20): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + LFD f24, 8 * SIZE(X) + fadd f7, f14, f15 + LFD f25, 9 * SIZE(X) + + fabs f8, f24 + LFD f26, 10 * SIZE(X) + fabs f9, f25 + LFD f27, 11 * SIZE(X) + fabs f10, f26 + fabs f11, f27 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + LFD f28, 12 * SIZE(X) + fsub f19, f3, f7 + LFD f29, 13 * SIZE(X) + + fabs f12, f28 + LFD f30, 14 * SIZE(X) + fabs f13, f29 + LFD f31, 15 * SIZE(X) + fabs f14, f30 + fabs f15, f31 + + fsel f0, f16, f0, f4 + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + addi X, X, 16 * SIZE + + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + addi X, X, 2 * SIZE + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCXM1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + bdz LL(120) + .align 4 + +LL(110): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fsel f0, f16, f0, f4 + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + bdnz LL(110) + .align 4 + +LL(120): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsel f0, f16, f0, f4 + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsub f8, f0, f2 + fsel f1, f8, f0, f2 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zamax_hummer.S b/kernel/power/zamax_hummer.S new file mode 100644 index 0000000..8431239 --- /dev/null +++ b/kernel/power/zamax_hummer.S @@ -0,0 +1,347 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 + +#define F1 f12 +#define F2 f13 +#define F3 f14 +#define F4 f15 + +#define T1 f16 +#define T2 f17 +#define T3 f18 +#define T4 f19 + +#define B1 f20 +#define B2 f21 +#define B3 f22 +#define B4 f23 +#define B5 f24 +#define B6 f25 +#define B7 f26 +#define B8 f27 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + stfpdux f24, SP, r10 + stfpdux f25, SP, r10 + stfpdux f26, SP, r10 + stfpdux f27, SP, r10 + + li r10, 0 + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + lfpdx C1, SP, r10 # Zero clear + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, INCX, 0 + ble LL(999) + + LFD A1, 0 * SIZE(X) + LFD A2, 1 * SIZE(X) + add X, X, INCX2 + + fabs A1, A1 + fabs A2, A2 + + addi N, N, -1 + cmpwi cr0, N, 0 + fadd C1, A1, A2 + ble LL(999) + + subi INCX2, INCX2, SIZE + fsmfp C1, C1 + li INCX, SIZE + fpmr C2, C1 + sub X, X, INCX2 + fpmr C3, C1 + srawi. r0, N, 3 + fpmr C4, C1 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + LFDUX A3, X, INCX2 + LFDUX A4, X, INCX + + LFSDUX A1, X, INCX2 + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX2 + LFSDUX A4, X, INCX + + LFDUX A5, X, INCX2 + LFDUX A6, X, INCX + LFDUX A7, X, INCX2 + LFDUX A8, X, INCX + + LFSDUX A5, X, INCX2 + LFSDUX A6, X, INCX + LFSDUX A7, X, INCX2 + LFSDUX A8, X, INCX + bdz LL(103) + .align 4 + +LL(102): + fpabs B1, A1 + LFDUX A1, X, INCX2 + fpabs B2, A2 + LFDUX A2, X, INCX + fpabs B3, A3 + LFDUX A3, X, INCX2 + fpabs B4, A4 + LFDUX A4, X, INCX + + fpabs B5, A5 + LFSDUX A1, X, INCX2 + fpabs B6, A6 + LFSDUX A2, X, INCX + fpabs B7, A7 + LFSDUX A3, X, INCX2 + fpabs B8, A8 + LFSDUX A4, X, INCX + + fpadd T1, B1, B2 + LFDUX A5, X, INCX2 + fpadd T2, B3, B4 + LFDUX A6, X, INCX + fpadd T3, B5, B6 + LFDUX A7, X, INCX2 + fpadd T4, B7, B8 + LFDUX A8, X, INCX + + fpsub F1, C1, T1 + LFSDUX A5, X, INCX2 + fpsub F2, C2, T2 + LFSDUX A6, X, INCX + fpsub F3, C3, T3 + LFSDUX A7, X, INCX2 + fpsub F4, C4, T4 + LFSDUX A8, X, INCX + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + bdnz LL(102) + .align 4 + +LL(103): + fpabs B1, A1 + fpabs B2, A2 + fpabs B3, A3 + fpabs B4, A4 + + fpabs B5, A5 + fpabs B6, A6 + fpabs B7, A7 + fpabs B8, A8 + + fpadd T1, B1, B2 + fpadd T2, B3, B4 + fpadd T3, B5, B6 + fpadd T4, B7, B8 + + fpsub F1, C1, T1 + fpsub F2, C2, T2 + fpsub F3, C3, T3 + fpsub F4, C4, T4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + .align 4 + +LL(105): + andi. r0, N, 7 + beq LL(998) + + andi. r0, N, 4 + beq LL(106) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + LFDUX A3, X, INCX2 + LFDUX A4, X, INCX + + LFSDUX A1, X, INCX2 + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX2 + LFSDUX A4, X, INCX + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpadd A1, A1, A2 + fpadd A3, A3, A4 + + fpsub F1, C1, A1 + fpsub F2, C2, A3 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A3 + .align 4 + +LL(106): + andi. r0, N, 2 + beq LL(107) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + LFSDUX A1, X, INCX2 + LFSDUX A2, X, INCX + + fpabs A1, A1 + fpabs A2, A2 + + fpadd A1, A1, A2 + + fpsub F1, C1, A1 + fpsel C1, F1, C1, A1 + .align 4 + +LL(107): + andi. r0, N, 1 + beq LL(998) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + + fabs A1, A1 + fabs A2, A2 + + fadd A1, A1, A2 + + fsub F1, C1, A1 + fsel C1, F1, C1, A1 + .align 4 + +LL(998): + fpsub F1, C1, C2 + fpsub F2, C3, C4 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C1, C3 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C1, C2 + fsel C1, F1, C1, C2 + .align 4 + +LL(999): + li r10, 16 + + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/zamax_ppc440.S b/kernel/power/zamax_ppc440.S new file mode 100644 index 0000000..17372bb --- /dev/null +++ b/kernel/power/zamax_ppc440.S @@ -0,0 +1,319 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREX r8 +#define INC1 r9 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + + sub X, X, INCX + li INC1, SIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFDUX f1, X, INCX + LFDX f2, X, INC1 + + fabs f1, f1 + li PREX, 4 * 8 * SIZE + fabs f2, f2 + fadd f1, f1, f2 + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + + subi N, N, 1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f24, X, INCX + LFDX f25, X, INC1 + LFDUX f26, X, INCX + LFDX f27, X, INC1 + LFDUX f28, X, INCX + LFDX f29, X, INC1 + LFDUX f30, X, INCX + LFDX f31, X, INC1 + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDX f25, X, INC1 + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDX f27, X, INC1 + fabs f12, f28 + LFDUX f28, X, INCX + fabs f13, f29 + LFDX f29, X, INC1 + fabs f14, f30 + LFDUX f30, X, INCX + fabs f15, f31 + LFDX f31, X, INC1 + bdz LL(120) + .align 4 + +LL(110): + fadd f4, f8, f9 +#ifdef PPCG4 + dcbt X, PREX +#endif + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDX f25, X, INC1 + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDX f27, X, INC1 + + fabs f12, f28 +#ifdef PPCG4 + dcbt X, PREX +#endif + fabs f13, f29 + LFDUX f28, X, INCX + fabs f14, f30 + LFDX f29, X, INC1 + fabs f15, f31 + LFDUX f30, X, INCX + + fsub f16, f0, f4 + LFDX f31, X, INC1 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 +#ifdef PPCG4 + dcbt X, PREX +#endif + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDX f25, X, INC1 + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDX f27, X, INC1 + + fsel f0, f16, f0, f4 +#ifdef PPCG4 + dcbt X, PREX +#endif + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fabs f12, f28 + LFDUX f28, X, INCX + fabs f13, f29 + LFDX f29, X, INC1 + fabs f14, f30 + LFDUX f30, X, INCX + fabs f15, f31 + LFDX f31, X, INC1 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + bdnz LL(110) + .align 4 + +LL(120): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsel f0, f16, f0, f4 + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + LFDX f9, X, INC1 + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsub f8, f0, f2 + fsel f1, f8, f0, f2 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zamin.S b/kernel/power/zamin.S new file mode 100644 index 0000000..1ab8b6b --- /dev/null +++ b/kernel/power/zamin.S @@ -0,0 +1,505 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 +#define INCXM1 r9 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + subi INCXM1, INCX, SIZE + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + LFD f2, 1 * SIZE(X) + add X, X, INCX + + fabs f1, f1 + fabs f2, f2 + fadd f1, f1, f2 + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(100) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f24, 0 * SIZE(X) + LFD f25, 1 * SIZE(X) + LFD f26, 2 * SIZE(X) + LFD f27, 3 * SIZE(X) + LFD f28, 4 * SIZE(X) + LFD f29, 5 * SIZE(X) + LFD f30, 6 * SIZE(X) + LFD f31, 7 * SIZE(X) + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + bdz LL(20) + .align 4 + +LL(10): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 16 * SIZE(X) + LFD f25, 17 * SIZE(X) + LFD f26, 18 * SIZE(X) + LFD f27, 19 * SIZE(X) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 20 * SIZE(X) + LFD f29, 21 * SIZE(X) + LFD f30, 22 * SIZE(X) + LFD f31, 23 * SIZE(X) + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fsel f0, f16, f4, f0 + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsel f0, f16, f4, f0 + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + addi X, X, 2 * SIZE + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCXM1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + bdz LL(120) + .align 4 + +LL(110): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fsel f0, f16, f4, f0 + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + bdnz LL(110) + .align 4 + +LL(120): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsel f0, f16, f4, f0 + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + + fsel f0, f8, f1, f0 + fsel f2, f9, f3, f2 + fsub f8, f0, f2 + fsel f1, f8, f2, f0 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zamin_cell.S b/kernel/power/zamin_cell.S new file mode 100644 index 0000000..6d32f60 --- /dev/null +++ b/kernel/power/zamin_cell.S @@ -0,0 +1,495 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 +#define INCXM1 r9 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + subi INCXM1, INCX, SIZE + + li PREA, 10 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + LFD f2, 1 * SIZE(X) + add X, X, INCX + + fabs f1, f1 + fabs f2, f2 + fadd f1, f1, f2 + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(100) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f24, 0 * SIZE(X) + LFD f25, 1 * SIZE(X) + + fabs f8, f24 + LFD f26, 2 * SIZE(X) + fabs f9, f25 + LFD f27, 3 * SIZE(X) + fabs f10, f26 + LFD f28, 4 * SIZE(X) + fabs f11, f27 + LFD f29, 5 * SIZE(X) + fabs f12, f28 + LFD f30, 6 * SIZE(X) + fabs f13, f29 + LFD f31, 7 * SIZE(X) + fabs f14, f30 + nop + fabs f15, f31 + bdz LL(20) + .align 4 + +LL(10): + fadd f4, f8, f9 + dcbt X, PREA + fadd f5, f10, f11 + nop + fadd f6, f12, f13 + LFD f24, 8 * SIZE(X) + fadd f7, f14, f15 + LFD f25, 9 * SIZE(X) + + fabs f8, f24 + LFD f26, 10 * SIZE(X) + fabs f9, f25 + LFD f27, 11 * SIZE(X) + fabs f10, f26 + fabs f11, f27 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + LFD f28, 12 * SIZE(X) + fsub f19, f3, f7 + LFD f29, 13 * SIZE(X) + + fabs f12, f28 + LFD f30, 14 * SIZE(X) + fabs f13, f29 + LFD f31, 15 * SIZE(X) + fabs f14, f30 + fabs f15, f31 + + fsel f0, f16, f4, f0 + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + LFD f24, 16 * SIZE(X) + fadd f23, f14, f15 + LFD f25, 17 * SIZE(X) + + fabs f8, f24 + LFD f26, 18 * SIZE(X) + fabs f9, f25 + LFD f27, 19 * SIZE(X) + fabs f10, f26 + fabs f11, f27 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + LFD f28, 20 * SIZE(X) + fsub f19, f3, f23 + LFD f29, 21 * SIZE(X) + + fabs f12, f28 + LFD f30, 22 * SIZE(X) + fabs f13, f29 + LFD f31, 23 * SIZE(X) + fabs f14, f30 + addi X, X, 16 * SIZE + fabs f15, f31 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + + bdnz LL(10) + .align 4 + +LL(20): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + LFD f24, 8 * SIZE(X) + fadd f7, f14, f15 + LFD f25, 9 * SIZE(X) + + fabs f8, f24 + LFD f26, 10 * SIZE(X) + fabs f9, f25 + LFD f27, 11 * SIZE(X) + fabs f10, f26 + fabs f11, f27 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + LFD f28, 12 * SIZE(X) + fsub f19, f3, f7 + LFD f29, 13 * SIZE(X) + + fabs f12, f28 + LFD f30, 14 * SIZE(X) + fabs f13, f29 + LFD f31, 15 * SIZE(X) + fabs f14, f30 + fabs f15, f31 + + fsel f0, f16, f4, f0 + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + addi X, X, 16 * SIZE + + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + addi X, X, 2 * SIZE + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCXM1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + bdz LL(120) + .align 4 + +LL(110): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fsel f0, f16, f4, f0 + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + bdnz LL(110) + .align 4 + +LL(120): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsel f0, f16, f4, f0 + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + + fsel f0, f8, f1, f0 + fsel f2, f9, f3, f2 + fsub f8, f0, f2 + fsel f1, f8, f2, f0 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zamin_hummer.S b/kernel/power/zamin_hummer.S new file mode 100644 index 0000000..5ac1b89 --- /dev/null +++ b/kernel/power/zamin_hummer.S @@ -0,0 +1,347 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 + +#define F1 f12 +#define F2 f13 +#define F3 f14 +#define F4 f15 + +#define T1 f16 +#define T2 f17 +#define T3 f18 +#define T4 f19 + +#define B1 f20 +#define B2 f21 +#define B3 f22 +#define B4 f23 +#define B5 f24 +#define B6 f25 +#define B7 f26 +#define B8 f27 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + stfpdux f24, SP, r10 + stfpdux f25, SP, r10 + stfpdux f26, SP, r10 + stfpdux f27, SP, r10 + + li r10, 0 + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + lfpdx C1, SP, r10 # Zero clear + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, INCX, 0 + ble LL(999) + + LFD A1, 0 * SIZE(X) + LFD A2, 1 * SIZE(X) + add X, X, INCX2 + + fabs A1, A1 + fabs A2, A2 + + addi N, N, -1 + cmpwi cr0, N, 0 + fadd C1, A1, A2 + ble LL(999) + + subi INCX2, INCX2, SIZE + fsmfp C1, C1 + li INCX, SIZE + fpmr C2, C1 + sub X, X, INCX2 + fpmr C3, C1 + srawi. r0, N, 3 + fpmr C4, C1 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + LFDUX A3, X, INCX2 + LFDUX A4, X, INCX + + LFSDUX A1, X, INCX2 + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX2 + LFSDUX A4, X, INCX + + LFDUX A5, X, INCX2 + LFDUX A6, X, INCX + LFDUX A7, X, INCX2 + LFDUX A8, X, INCX + + LFSDUX A5, X, INCX2 + LFSDUX A6, X, INCX + LFSDUX A7, X, INCX2 + LFSDUX A8, X, INCX + bdz LL(103) + .align 4 + +LL(102): + fpabs B1, A1 + LFDUX A1, X, INCX2 + fpabs B2, A2 + LFDUX A2, X, INCX + fpabs B3, A3 + LFDUX A3, X, INCX2 + fpabs B4, A4 + LFDUX A4, X, INCX + + fpabs B5, A5 + LFSDUX A1, X, INCX2 + fpabs B6, A6 + LFSDUX A2, X, INCX + fpabs B7, A7 + LFSDUX A3, X, INCX2 + fpabs B8, A8 + LFSDUX A4, X, INCX + + fpadd T1, B1, B2 + LFDUX A5, X, INCX2 + fpadd T2, B3, B4 + LFDUX A6, X, INCX + fpadd T3, B5, B6 + LFDUX A7, X, INCX2 + fpadd T4, B7, B8 + LFDUX A8, X, INCX + + fpsub F1, T1, C1 + LFSDUX A5, X, INCX2 + fpsub F2, T2, C2 + LFSDUX A6, X, INCX + fpsub F3, T3, C3 + LFSDUX A7, X, INCX2 + fpsub F4, T4, C4 + LFSDUX A8, X, INCX + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + bdnz LL(102) + .align 4 + +LL(103): + fpabs B1, A1 + fpabs B2, A2 + fpabs B3, A3 + fpabs B4, A4 + + fpabs B5, A5 + fpabs B6, A6 + fpabs B7, A7 + fpabs B8, A8 + + fpadd T1, B1, B2 + fpadd T2, B3, B4 + fpadd T3, B5, B6 + fpadd T4, B7, B8 + + fpsub F1, T1, C1 + fpsub F2, T2, C2 + fpsub F3, T3, C3 + fpsub F4, T4, C4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + .align 4 + +LL(105): + andi. r0, N, 7 + beq LL(998) + + andi. r0, N, 4 + beq LL(106) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + LFDUX A3, X, INCX2 + LFDUX A4, X, INCX + + LFSDUX A1, X, INCX2 + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX2 + LFSDUX A4, X, INCX + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpadd A1, A1, A2 + fpadd A3, A3, A4 + + fpsub F1, A1, C1 + fpsub F2, A3, C2 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A3 + .align 4 + +LL(106): + andi. r0, N, 2 + beq LL(107) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + LFSDUX A1, X, INCX2 + LFSDUX A2, X, INCX + + fpabs A1, A1 + fpabs A2, A2 + + fpadd A1, A1, A2 + + fpsub F1, A1, C1 + fpsel C1, F1, C1, A1 + .align 4 + +LL(107): + andi. r0, N, 1 + beq LL(998) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + + fabs A1, A1 + fabs A2, A2 + + fadd A1, A1, A2 + + fsub F1, A1, C1 + fsel C1, F1, C1, A1 + .align 4 + +LL(998): + fpsub F1, C2, C1 + fpsub F2, C4, C3 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C3, C1 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C2, C1 + fsel C1, F1, C1, C2 + .align 4 + +LL(999): + li r10, 16 + + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/zamin_ppc440.S b/kernel/power/zamin_ppc440.S new file mode 100644 index 0000000..9d70f76 --- /dev/null +++ b/kernel/power/zamin_ppc440.S @@ -0,0 +1,317 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREX r8 +#define INC1 r9 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + + sub X, X, INCX + li INC1, SIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFDUX f1, X, INCX + LFDX f2, X, INC1 + + fabs f1, f1 + li PREX, 4 * 8 * SIZE + fabs f2, f2 + subi N, N, 1 + fadd f1, f1, f2 + + fmr f0, f1 + srawi. r0, N, 3 + fmr f2, f1 + mtspr CTR, r0 + fmr f3, f1 + beq- LL(150) + + LFDUX f24, X, INCX + LFDX f25, X, INC1 + LFDUX f26, X, INCX + LFDX f27, X, INC1 + LFDUX f28, X, INCX + LFDX f29, X, INC1 + LFDUX f30, X, INCX + LFDX f31, X, INC1 + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDX f25, X, INC1 + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDX f27, X, INC1 + fabs f12, f28 + LFDUX f28, X, INCX + fabs f13, f29 + LFDX f29, X, INC1 + fabs f14, f30 + LFDUX f30, X, INCX + fabs f15, f31 + LFDX f31, X, INC1 + bdz LL(120) + .align 4 + +LL(110): + fadd f4, f8, f9 +#ifdef PPCG4 + dcbt X, PREX +#endif + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDX f25, X, INC1 + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDX f27, X, INC1 + + fabs f12, f28 +#ifdef PPCG4 + dcbt X, PREX +#endif + fabs f13, f29 + LFDUX f28, X, INCX + fabs f14, f30 + LFDX f29, X, INC1 + fabs f15, f31 + LFDUX f30, X, INCX + + fsub f16, f0, f4 + LFDX f31, X, INC1 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 +#ifdef PPCG4 + dcbt X, PREX +#endif + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDX f25, X, INC1 + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDX f27, X, INC1 + + fsel f0, f16, f4, f0 +#ifdef PPCG4 + dcbt X, PREX +#endif + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fabs f12, f28 + LFDUX f28, X, INCX + fabs f13, f29 + LFDX f29, X, INC1 + fabs f14, f30 + LFDUX f30, X, INCX + fabs f15, f31 + LFDX f31, X, INC1 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + bdnz LL(110) + .align 4 + +LL(120): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsel f0, f16, f4, f0 + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + LFDX f9, X, INC1 + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + + fsel f0, f8, f1, f0 + fsel f2, f9, f3, f2 + fsub f8, f0, f2 + fsel f1, f8, f2, f0 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zasum.S b/kernel/power/zasum.S new file mode 100644 index 0000000..14b58ce --- /dev/null +++ b/kernel/power/zasum.S @@ -0,0 +1,456 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCXM1 r9 +#define PREA r8 + +#define FZERO f0 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + subi INCXM1, INCX, SIZE + + fmr f1, FZERO + fmr f2, FZERO + fmr f3, FZERO + fmr f4, FZERO + fmr f5, FZERO + fmr f6, FZERO + fmr f7, FZERO + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, 0 + ble- LL(999) + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(100) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + LFD f10, 2 * SIZE(X) + LFD f11, 3 * SIZE(X) + LFD f12, 4 * SIZE(X) + LFD f13, 5 * SIZE(X) + LFD f14, 6 * SIZE(X) + LFD f15, 7 * SIZE(X) + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + + fabs f16, f8 + fabs f17, f9 + fabs f18, f10 + fabs f19, f11 + + fabs f20, f12 + fabs f21, f13 + fabs f22, f14 + fabs f23, f15 + bdz LL(20) + .align 4 + +LL(10): + FADD f0, f0, f16 + fabs f16, f24 + FADD f1, f1, f17 + fabs f17, f25 + + FADD f2, f2, f18 + fabs f18, f26 + FADD f3, f3, f19 + fabs f19, f27 + + LFD f8, 16 * SIZE(X) + LFD f9, 17 * SIZE(X) + LFD f10, 18 * SIZE(X) + LFD f11, 19 * SIZE(X) + + FADD f4, f4, f20 + fabs f20, f28 + FADD f5, f5, f21 + fabs f21, f29 + + FADD f6, f6, f22 + fabs f22, f30 + FADD f7, f7, f23 + fabs f23, f31 + + LFD f12, 20 * SIZE(X) + LFD f13, 21 * SIZE(X) + LFD f14, 22 * SIZE(X) + LFD f15, 23 * SIZE(X) + + FADD f0, f0, f16 + fabs f16, f8 + FADD f1, f1, f17 + fabs f17, f9 + + FADD f2, f2, f18 + fabs f18, f10 + FADD f3, f3, f19 + fabs f19, f11 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + FADD f4, f4, f20 + fabs f20, f12 + FADD f5, f5, f21 + fabs f21, f13 + + FADD f6, f6, f22 + fabs f22, f14 + FADD f7, f7, f23 + fabs f23, f15 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + FADD f0, f0, f16 + fabs f16, f24 + FADD f1, f1, f17 + fabs f17, f25 + + FADD f2, f2, f18 + fabs f18, f26 + FADD f3, f3, f19 + fabs f19, f27 + + FADD f4, f4, f20 + fabs f20, f28 + FADD f5, f5, f21 + fabs f21, f29 + + FADD f6, f6, f22 + fabs f22, f30 + FADD f7, f7, f23 + fabs f23, f31 + + FADD f0, f0, f16 + FADD f1, f1, f17 + FADD f2, f2, f18 + FADD f3, f3, f19 + + FADD f4, f4, f20 + FADD f5, f5, f21 + FADD f6, f6, f22 + FADD f7, f7, f23 + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + addi X, X, 2 * SIZE + + fabs f8, f8 + fabs f9, f9 + FADD f0, f0, f8 + FADD f1, f1, f9 + + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCXM1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + LFDX f10, X, INCXM1 + LFDUX f11, X, INCX + LFDX f12, X, INCXM1 + LFDUX f13, X, INCX + LFDX f14, X, INCXM1 + LFDUX f15, X, INCX + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fabs f16, f8 + fabs f17, f9 + fabs f18, f10 + fabs f19, f11 + + fabs f20, f12 + fabs f21, f13 + fabs f22, f14 + fabs f23, f15 + bdz LL(120) + .align 4 + +LL(110): + FADD f0, f0, f16 + fabs f16, f24 + FADD f1, f1, f17 + fabs f17, f25 + + FADD f2, f2, f18 + fabs f18, f26 + FADD f3, f3, f19 + fabs f19, f27 + + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + LFDX f10, X, INCXM1 + LFDUX f11, X, INCX + + FADD f4, f4, f20 + fabs f20, f28 + FADD f5, f5, f21 + fabs f21, f29 + + FADD f6, f6, f22 + fabs f22, f30 + FADD f7, f7, f23 + fabs f23, f31 + + LFDX f12, X, INCXM1 + LFDUX f13, X, INCX + LFDX f14, X, INCXM1 + LFDUX f15, X, INCX + + FADD f0, f0, f16 + fabs f16, f8 + FADD f1, f1, f17 + fabs f17, f9 + + FADD f2, f2, f18 + fabs f18, f10 + FADD f3, f3, f19 + fabs f19, f11 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + FADD f4, f4, f20 + fabs f20, f12 + FADD f5, f5, f21 + fabs f21, f13 + + FADD f6, f6, f22 + fabs f22, f14 + FADD f7, f7, f23 + fabs f23, f15 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + FADD f0, f0, f16 + fabs f16, f24 + FADD f1, f1, f17 + fabs f17, f25 + + FADD f2, f2, f18 + fabs f18, f26 + FADD f3, f3, f19 + fabs f19, f27 + + FADD f4, f4, f20 + fabs f20, f28 + FADD f5, f5, f21 + fabs f21, f29 + + FADD f6, f6, f22 + fabs f22, f30 + FADD f7, f7, f23 + fabs f23, f31 + + FADD f0, f0, f16 + FADD f1, f1, f17 + FADD f2, f2, f18 + FADD f3, f3, f19 + + FADD f4, f4, f20 + FADD f5, f5, f21 + FADD f6, f6, f22 + FADD f7, f7, f23 + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + fabs f8, f8 + fabs f9, f9 + FADD f0, f0, f8 + FADD f1, f1, f9 + bdnz LL(160) + .align 4 + +LL(999): + FADD f0, f0, f1 + FADD f2, f2, f3 + FADD f4, f4, f5 + FADD f6, f6, f7 + + FADD f0, f0, f2 + FADD f4, f4, f6 + FADD f1, f0, f4 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zasum_cell.S b/kernel/power/zasum_cell.S new file mode 100644 index 0000000..7389468 --- /dev/null +++ b/kernel/power/zasum_cell.S @@ -0,0 +1,581 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 +#define INCXM1 r9 + +#define FZERO f0 + +#define STACKSIZE 16 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stw r0, 0(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + lfs FZERO, 0(SP) + + slwi INCX, INCX, ZBASE_SHIFT + fmr f1, FZERO + li PREA, 8 * 16 * SIZE + fmr f2, FZERO + subi INCXM1, INCX, SIZE + + cmpwi cr0, N, 0 + fmr f3, FZERO + ble- LL(999) + + cmpwi cr0, INCX, 0 + ble- LL(999) + + cmpwi cr0, INCX, SIZE * 2 + bne- cr0, LL(20) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(15) + .align 4 + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + fabs f4, f8 + LFD f10, 2 * SIZE(X) + fabs f5, f9 + LFD f11, 3 * SIZE(X) + fabs f6, f10 + LFD f8, 4 * SIZE(X) + fabs f7, f11 + bdz LL(13) + .align 4 + +LL(12): + FADD f0, f0, f4 + dcbt X, PREA + fabs f4, f8 + LFD f9, 5 * SIZE(X) + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFD f10, 6 * SIZE(X) + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFD f11, 7 * SIZE(X) + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFD f8, 8 * SIZE(X) + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFD f9, 9 * SIZE(X) + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFD f10, 10 * SIZE(X) + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFD f11, 11 * SIZE(X) + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFD f8, 12 * SIZE(X) + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFD f9, 13 * SIZE(X) + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFD f10, 14 * SIZE(X) + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFD f11, 15 * SIZE(X) + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFD f8, 16 * SIZE(X) + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFD f9, 17 * SIZE(X) + + FADD f1, f1, f5 + addi X, X, 16 * SIZE + fabs f5, f9 + LFD f10, 2 * SIZE(X) + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFD f11, 3 * SIZE(X) + + FADD f3, f3, f7 + LFD f8, 4 * SIZE(X) + fabs f7, f11 + bdnz LL(12) + .align 4 + +LL(13): + FADD f0, f0, f4 + nop + fabs f4, f8 + LFD f9, 5 * SIZE(X) + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFD f10, 6 * SIZE(X) + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFD f11, 7 * SIZE(X) + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFD f8, 8 * SIZE(X) + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFD f9, 9 * SIZE(X) + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFD f10, 10 * SIZE(X) + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFD f11, 11 * SIZE(X) + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFD f8, 12 * SIZE(X) + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFD f9, 13 * SIZE(X) + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFD f10, 14 * SIZE(X) + + FADD f2, f2, f6 + addi X, X, 16 * SIZE + fabs f6, f10 + LFD f11, -1 * SIZE(X) + + FADD f3, f3, f7 + fabs f7, f11 + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + .align 4 + +LL(15): + andi. r0, N, 7 + beq LL(999) + + andi. r0, N, 4 + beq LL(16) + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + fabs f4, f8 + LFD f10, 2 * SIZE(X) + fabs f5, f9 + LFD f11, 3 * SIZE(X) + fabs f6, f10 + LFD f8, 4 * SIZE(X) + fabs f7, f11 + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFD f9, 5 * SIZE(X) + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFD f10, 6 * SIZE(X) + + FADD f2, f2, f6 + addi X, X, 8 * SIZE + fabs f6, f10 + LFD f11, -1 * SIZE(X) + + FADD f3, f3, f7 + fabs f7, f11 + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + .align 4 + +LL(16): + andi. r0, N, 2 + beq LL(17) + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + fabs f4, f8 + LFD f10, 2 * SIZE(X) + fabs f5, f9 + LFD f11, 3 * SIZE(X) + fabs f6, f10 + addi X, X, 4 * SIZE + fabs f7, f11 + nop + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + .align 4 + +LL(17): + andi. r0, N, 1 + beq LL(999) + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + fabs f4, f8 + fabs f5, f9 + + FADD f0, f0, f4 + addi X, X, 2 * SIZE + FADD f1, f1, f5 + b LL(999) + .align 4 + +LL(20): + sub X, X, INCXM1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(25) + + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + + fabs f4, f8 + LFDX f10, X, INCXM1 + fabs f5, f9 + LFDUX f11, X, INCX + fabs f6, f10 + LFDX f8, X, INCXM1 + fabs f7, f11 + bdz LL(23) + .align 4 + +LL(22): + FADD f0, f0, f4 + dcbt X, PREA + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDX f10, X, INCXM1 + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFDX f8, X, INCXM1 + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDX f10, X, INCXM1 + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFDX f8, X, INCXM1 + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDX f10, X, INCXM1 + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFDX f8, X, INCXM1 + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDX f10, X, INCXM1 + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + LFDX f8, X, INCXM1 + fabs f7, f11 + bdnz LL(22) + .align 4 + +LL(23): + FADD f0, f0, f4 + nop + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDX f10, X, INCXM1 + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFDX f8, X, INCXM1 + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDX f10, X, INCXM1 + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFDX f8, X, INCXM1 + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDX f10, X, INCXM1 + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + fabs f7, f11 + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + .align 4 + +LL(25): + andi. r0, N, 7 + beq LL(999) + + andi. r0, N, 4 + beq LL(26) + + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + + fabs f4, f8 + LFDX f10, X, INCXM1 + fabs f5, f9 + LFDUX f11, X, INCX + fabs f6, f10 + LFDX f8, X, INCXM1 + fabs f7, f11 + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDX f10, X, INCXM1 + + FADD f2, f2, f6 + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + fabs f7, f11 + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + .align 4 + +LL(26): + andi. r0, N, 2 + beq LL(27) + + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + + fabs f4, f8 + LFDX f10, X, INCXM1 + fabs f5, f9 + LFDUX f11, X, INCX + + fabs f6, f10 + fabs f7, f11 + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + .align 4 + +LL(27): + andi. r0, N, 1 + beq LL(999) + + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + + fabs f4, f8 + fabs f5, f9 + + FADD f0, f0, f4 + FADD f1, f1, f5 + .align 4 + +LL(999): + FADD f0, f0, f1 + FADD f2, f2, f3 + + FADD f1, f0, f2 + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zasum_hummer.S b/kernel/power/zasum_hummer.S new file mode 100644 index 0000000..f090e69 --- /dev/null +++ b/kernel/power/zasum_hummer.S @@ -0,0 +1,583 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 +#define FLAG r8 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 + +#define T1 f12 +#define T2 f13 +#define T3 f14 +#define T4 f15 + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + li r10, 0 + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + lfpdx C1, SP, r10 # Zero clear + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + fpmr C2, C1 + fpmr C3, C1 + li FLAG, 0 + fpmr C4, C1 + + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, INCX, 0 + ble LL(999) + + sub X, X, INCX2 + + cmpwi cr0, INCX, SIZE + bne LL(100) + + andi. r0, X, 2 * SIZE - 1 + beq LL(05) + + LFD C1, 2 * SIZE(X) + li FLAG, 1 + addi X, X, 1 * SIZE + addi N, N, -1 + cmpwi cr0, N, 0 + fabs C1, C1 + ble LL(99) + .align 4 + +LL(05): + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + fpmr T1, C2 + LFPDUX A2, X, INCX2 + fpmr T2, C2 + LFPDUX A3, X, INCX2 + fpmr T3, C2 + LFPDUX A4, X, INCX2 + fpmr T4, C2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + bdz LL(13) + .align 4 + +LL(12): + fpadd C1, C1, T1 + nop + fpabs T1, A1 + LFPDUX A1, X, INCX2 + + fpadd C2, C2, T2 + nop + fpabs T2, A2 + LFPDUX A2, X, INCX2 + + fpadd C3, C3, T3 + nop + fpabs T3, A3 + LFPDUX A3, X, INCX2 + + fpadd C4, C4, T4 + nop + fpabs T4, A4 + LFPDUX A4, X, INCX2 + + fpadd C1, C1, T1 + nop + fpabs T1, A5 + LFPDUX A5, X, INCX2 + + fpadd C2, C2, T2 + nop + fpabs T2, A6 + LFPDUX A6, X, INCX2 + + fpadd C3, C3, T3 + nop + fpabs T3, A7 + LFPDUX A7, X, INCX2 + + fpadd C4, C4, T4 + fpabs T4, A8 + LFPDUX A8, X, INCX2 + bdnz LL(12) + .align 4 + +LL(13): + fpadd C1, C1, T1 + fpabs T1, A1 + fpadd C2, C2, T2 + fpabs T2, A2 + fpadd C3, C3, T3 + fpabs T3, A3 + fpadd C4, C4, T4 + fpabs T4, A4 + + fpadd C1, C1, T1 + fpabs T1, A5 + fpadd C2, C2, T2 + fpabs T2, A6 + fpadd C3, C3, T3 + fpabs T3, A7 + fpadd C4, C4, T4 + fpabs T4, A8 + + fpadd C1, C1, T1 + fpadd C2, C2, T2 + fpadd C3, C3, T3 + fpadd C4, C4, T4 + .align 4 + +LL(15): + andi. r0, N, 7 + beq LL(99) + andi. r0, N, 4 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpadd C1, C1, T1 + fpadd C2, C2, T2 + fpadd C3, C3, T3 + fpadd C4, C4, T4 + .align 4 + +LL(16): + andi. r0, N, 2 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + fpabs T1, A1 + fpabs T2, A2 + + fpadd C1, C1, T1 + fpadd C2, C2, T2 + .align 4 + +LL(17): + andi. r0, N, 1 + beq LL(99) + + LFPDUX A1, X, INCX2 + fpabs T1, A1 + fpadd C1, C1, T1 + .align 4 + +LL(99): + cmpwi cr0, FLAG, 0 + beq LL(999) + + LFD A1, 2 * SIZE(X) + fabs T1, A1 + fadd C2, C2, T1 + b LL(999) + .align 4 + +LL(100): + addi X2, X, SIZE + andi. r0, X, 2 * SIZE - 1 + bne LL(200) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(115) + + LFPDUX A1, X, INCX2 + fpmr T1, C2 + LFPDUX A2, X, INCX2 + fpmr T2, C2 + LFPDUX A3, X, INCX2 + fpmr T3, C2 + LFPDUX A4, X, INCX2 + fpmr T4, C2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + bdz LL(113) + .align 4 + +LL(112): + fpadd C1, C1, T1 + nop + fpabs T1, A1 + LFPDUX A1, X, INCX2 + + fpadd C2, C2, T2 + nop + fpabs T2, A2 + LFPDUX A2, X, INCX2 + + fpadd C3, C3, T3 + nop + fpabs T3, A3 + LFPDUX A3, X, INCX2 + + fpadd C4, C4, T4 + nop + fpabs T4, A4 + LFPDUX A4, X, INCX2 + + fpadd C1, C1, T1 + nop + fpabs T1, A5 + LFPDUX A5, X, INCX2 + + fpadd C2, C2, T2 + nop + fpabs T2, A6 + LFPDUX A6, X, INCX2 + + fpadd C3, C3, T3 + nop + fpabs T3, A7 + LFPDUX A7, X, INCX2 + + fpadd C4, C4, T4 + fpabs T4, A8 + LFPDUX A8, X, INCX2 + bdnz LL(112) + .align 4 + +LL(113): + fpadd C1, C1, T1 + fpabs T1, A1 + fpadd C2, C2, T2 + fpabs T2, A2 + fpadd C3, C3, T3 + fpabs T3, A3 + fpadd C4, C4, T4 + fpabs T4, A4 + + fpadd C1, C1, T1 + fpabs T1, A5 + fpadd C2, C2, T2 + fpabs T2, A6 + fpadd C3, C3, T3 + fpabs T3, A7 + fpadd C4, C4, T4 + fpabs T4, A8 + + fpadd C1, C1, T1 + fpadd C2, C2, T2 + fpadd C3, C3, T3 + fpadd C4, C4, T4 + .align 4 + +LL(115): + andi. r0, N, 7 + beq LL(999) + andi. r0, N, 4 + beq LL(116) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpadd C1, C1, T1 + fpadd C2, C2, T2 + fpadd C3, C3, T3 + fpadd C4, C4, T4 + .align 4 + +LL(116): + andi. r0, N, 2 + beq LL(117) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + fpabs T1, A1 + fpabs T2, A2 + + fpadd C1, C1, T1 + fpadd C2, C2, T2 + .align 4 + +LL(117): + andi. r0, N, 1 + beq LL(999) + + LFPDUX A1, X, INCX2 + fpabs T1, A1 + fpadd C1, C1, T1 + b LL(999) + .align 4 + +LL(200): + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(215) + + + LFDUX A1, X, INCX2 + fpmr T1, C2 + LFDUX A2, X, INCX2 + fpmr T2, C2 + LFDUX A3, X, INCX2 + fpmr T3, C2 + LFDUX A4, X, INCX2 + fpmr T4, C2 + + LFDUX A5, X, INCX2 + LFSDUX A1, X2, INCX2 + + LFDUX A6, X, INCX2 + LFSDUX A2, X2, INCX2 + + LFDUX A7, X, INCX2 + LFSDUX A3, X2, INCX2 + + LFDUX A8, X, INCX2 + LFSDUX A4, X2, INCX2 + bdz LL(213) + .align 4 + +LL(212): + fpadd C1, C1, T1 + LFSDUX A5, X2, INCX2 + fpabs T1, A1 + LFDUX A1, X, INCX2 + + fpadd C2, C2, T2 + LFSDUX A6, X2, INCX2 + fpabs T2, A2 + LFDUX A2, X, INCX2 + + fpadd C3, C3, T3 + LFSDUX A7, X2, INCX2 + fpabs T3, A3 + LFDUX A3, X, INCX2 + + fpadd C4, C4, T4 + LFSDUX A8, X2, INCX2 + fpabs T4, A4 + LFDUX A4, X, INCX2 + + fpadd C1, C1, T1 + LFSDUX A1, X2, INCX2 + fpabs T1, A5 + LFDUX A5, X, INCX2 + fpadd C2, C2, T2 + LFSDUX A2, X2, INCX2 + fpabs T2, A6 + LFDUX A6, X, INCX2 + + fpadd C3, C3, T3 + LFSDUX A3, X2, INCX2 + fpabs T3, A7 + LFDUX A7, X, INCX2 + fpadd C4, C4, T4 + LFSDUX A4, X2, INCX2 + fpabs T4, A8 + LFDUX A8, X, INCX2 + + bdnz LL(212) + .align 4 + +LL(213): + fpadd C1, C1, T1 + nop + fpabs T1, A1 + LFSDUX A5, X2, INCX2 + fpadd C2, C2, T2 + nop + fpabs T2, A2 + LFSDUX A6, X2, INCX2 + fpadd C3, C3, T3 + + nop + fpabs T3, A3 + LFSDUX A7, X2, INCX2 + fpadd C4, C4, T4 + nop + fpabs T4, A4 + LFSDUX A8, X2, INCX2 + + fpadd C1, C1, T1 + fpabs T1, A5 + fpadd C2, C2, T2 + fpabs T2, A6 + fpadd C3, C3, T3 + fpabs T3, A7 + fpadd C4, C4, T4 + fpabs T4, A8 + + fpadd C1, C1, T1 + fpadd C2, C2, T2 + fpadd C3, C3, T3 + fpadd C4, C4, T4 + .align 4 + +LL(215): + andi. r0, N, 7 + beq LL(999) + andi. r0, N, 4 + beq LL(216) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + + fabs T1, A1 + LFDUX A5, X, INCX2 + fabs T2, A2 + LFDUX A6, X2, INCX2 + fabs T3, A3 + LFDUX A7, X, INCX2 + fabs T4, A4 + LFDUX A8, X2, INCX2 + + fadd C1, C1, T1 + fabs T1, A5 + fadd C2, C2, T2 + fabs T2, A6 + + fadd C3, C3, T3 + fabs T3, A7 + fadd C4, C4, T4 + fabs T4, A8 + + fadd C1, C1, T1 + fadd C2, C2, T2 + fadd C3, C3, T3 + fadd C4, C4, T4 + .align 4 + +LL(216): + andi. r0, N, 2 + beq LL(217) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + + fabs T1, A1 + fabs T2, A2 + fabs T3, A3 + fabs T4, A4 + + fadd C1, C1, T1 + fadd C2, C2, T2 + fadd C3, C3, T3 + fadd C4, C4, T4 + .align 4 + +LL(217): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + + fabs T1, A1 + fabs T2, A2 + fadd C1, C1, T1 + fadd C2, C2, T2 + .align 4 + +LL(999): + fpadd C1, C1, C2 + li r10, 16 + fpadd C3, C3, C4 + fpadd C1, C1, C3 + lfpdux f15, SP, r10 + fsmtp C2, C1 + lfpdux f14, SP, r10 + addi SP, SP, 16 + fadd C1, C2, C1 + blr + + EPILOGUE diff --git a/kernel/power/zasum_ppc440.S b/kernel/power/zasum_ppc440.S new file mode 100644 index 0000000..213c837 --- /dev/null +++ b/kernel/power/zasum_ppc440.S @@ -0,0 +1,321 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCXM1 r9 +#define PREX r8 + +#define FZERO f0 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + fmr f1, FZERO + slwi INCX, INCX, ZBASE_SHIFT + fmr f2, FZERO + fmr f3, FZERO + subi INCXM1, INCX, SIZE + fmr f4, FZERO + sub X, X, INCXM1 + fmr f5, FZERO + li PREX, 3 * 16 * SIZE + fmr f6, FZERO + cmpwi cr0, N, 0 + fmr f7, FZERO + ble- LL(999) + + cmpwi cr0, INCX, 0 + ble- LL(999) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + LFDX f10, X, INCXM1 + LFDUX f11, X, INCX + LFDX f12, X, INCXM1 + LFDUX f13, X, INCX + LFDX f14, X, INCXM1 + LFDUX f15, X, INCX + fabs f16, f8 + + LFDX f24, X, INCXM1 + fabs f17, f9 + LFDUX f25, X, INCX + fabs f18, f10 + LFDX f26, X, INCXM1 + fabs f19, f11 + LFDUX f27, X, INCX + fabs f20, f12 + LFDX f28, X, INCXM1 + fabs f21, f13 + LFDUX f29, X, INCX + fabs f22, f14 + LFDX f30, X, INCXM1 + fabs f23, f15 + LFDUX f31, X, INCX + bdz LL(120) + .align 4 + +LL(110): + LFDX f8, X, INCXM1 + FADD f0, f0, f16 +#ifdef PPCG4 + dcbt X, PREX +#else + nop +#endif + fabs f16, f24 + + LFDUX f9, X, INCX + FADD f1, f1, f17 + nop + fabs f17, f25 + + LFDX f10, X, INCXM1 + FADD f2, f2, f18 + nop + fabs f18, f26 + + LFDUX f11, X, INCX + FADD f3, f3, f19 + nop + fabs f19, f27 + + LFDX f12, X, INCXM1 + FADD f4, f4, f20 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PREX +#else + nop +#endif + fabs f20, f28 + + LFDUX f13, X, INCX + FADD f5, f5, f21 + nop + fabs f21, f29 + + LFDX f14, X, INCXM1 + FADD f6, f6, f22 + nop + fabs f22, f30 + + LFDUX f15, X, INCX + FADD f7, f7, f23 + nop + fabs f23, f31 + + LFDX f24, X, INCXM1 + FADD f0, f0, f16 +#ifdef PPCG4 + dcbt X, PREX +#else + nop +#endif + fabs f16, f8 + + LFDUX f25, X, INCX + FADD f1, f1, f17 + nop + fabs f17, f9 + + LFDX f26, X, INCXM1 + FADD f2, f2, f18 + nop + fabs f18, f10 + + LFDUX f27, X, INCX + FADD f3, f3, f19 + nop + fabs f19, f11 + + LFDX f28, X, INCXM1 + FADD f4, f4, f20 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PREX +#else + nop +#endif + fabs f20, f12 + + LFDUX f29, X, INCX + FADD f5, f5, f21 + nop + fabs f21, f13 + + LFDX f30, X, INCXM1 + FADD f6, f6, f22 + nop + fabs f22, f14 + + LFDUX f31, X, INCX + FADD f7, f7, f23 + fabs f23, f15 + bdnz LL(110) + .align 4 + +LL(120): + FADD f0, f0, f16 + fabs f16, f24 + FADD f1, f1, f17 + fabs f17, f25 + + FADD f2, f2, f18 + fabs f18, f26 + FADD f3, f3, f19 + fabs f19, f27 + + FADD f4, f4, f20 + fabs f20, f28 + FADD f5, f5, f21 + fabs f21, f29 + + FADD f6, f6, f22 + fabs f22, f30 + FADD f7, f7, f23 + fabs f23, f31 + + FADD f0, f0, f16 + FADD f1, f1, f17 + FADD f2, f2, f18 + FADD f3, f3, f19 + + FADD f4, f4, f20 + FADD f5, f5, f21 + FADD f6, f6, f22 + FADD f7, f7, f23 + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + fabs f8, f8 + fabs f9, f9 + FADD f0, f0, f8 + FADD f1, f1, f9 + bdnz LL(160) + .align 4 + +LL(999): + FADD f0, f0, f1 + FADD f2, f2, f3 + FADD f4, f4, f5 + FADD f6, f6, f7 + + FADD f0, f0, f2 + FADD f4, f4, f6 + FADD f1, f0, f4 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zaxpy.S b/kernel/power/zaxpy.S new file mode 100644 index 0000000..7eb591d --- /dev/null +++ b/kernel/power/zaxpy.S @@ -0,0 +1,683 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define N r3 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 +#define INCXM1 r4 +#define INCYM1 r5 +#define PREA r10 +#define YY r11 +#else +#define N r3 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r4 +#define INCXM1 r5 +#define INCYM1 r6 +#define PREA r7 +#define YY r11 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define N r3 +#define X r10 +#define INCX r4 +#define Y r5 +#define INCY r6 +#define INCXM1 r7 +#define INCYM1 r8 +#define PREA r9 +#define YY r11 +#else +#define N r3 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r4 +#define INCXM1 r5 +#define INCYM1 r6 +#define PREA r7 +#define YY r11 +#endif +#endif + +#define ALPHA_R f24 +#define ALPHA_I f25 + +#ifndef CONJ +#define ADD1 FNMSUB +#define ADD2 FMADD +#else +#define ADD1 FMADD +#define ADD2 FNMSUB +#endif + +#ifndef NEEDPARAM + +#define STACKSIZE 96 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + +#if defined(linux) && defined(__64BIT__) + ld INCY, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld INCY, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz INCX, 56 + STACKSIZE(SP) + lwz Y, 60 + STACKSIZE(SP) + lwz INCY, 64 + STACKSIZE(SP) +#else + lwz INCY, 56 + STACKSIZE(SP) +#endif +#endif +#endif + + fmr ALPHA_R, f1 + fmr ALPHA_I, f2 + + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + subi INCXM1, INCX, SIZE + subi INCYM1, INCY, SIZE + +#ifdef L1_DUALFETCH + li PREA, (L1_PREFETCHSIZE) / 2 +#else + li PREA, (L1_PREFETCHSIZE) +#endif + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(100) + cmpwi cr0, INCY, 2 * SIZE + bne- cr0, LL(100) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f0, 0 * SIZE(X) + LFD f1, 1 * SIZE(X) + LFD f2, 2 * SIZE(X) + LFD f3, 3 * SIZE(X) + + LFD f8, 0 * SIZE(Y) + LFD f9, 1 * SIZE(Y) + LFD f10, 2 * SIZE(Y) + LFD f11, 3 * SIZE(Y) + + LFD f4, 4 * SIZE(X) + LFD f5, 5 * SIZE(X) + LFD f6, 6 * SIZE(X) + LFD f7, 7 * SIZE(X) + + LFD f12, 4 * SIZE(Y) + LFD f13, 5 * SIZE(Y) + LFD f14, 6 * SIZE(Y) + LFD f15, 7 * SIZE(Y) + bdz LL(20) + .align 4 + +LL(10): + FMADD f16, ALPHA_R, f0, f8 + FMADD f17, ALPHA_I, f0, f9 + FMADD f18, ALPHA_R, f2, f10 + FMADD f19, ALPHA_I, f2, f11 + + ADD1 f16, ALPHA_I, f1, f16 + ADD2 f17, ALPHA_R, f1, f17 + ADD1 f18, ALPHA_I, f3, f18 + ADD2 f19, ALPHA_R, f3, f19 + + LFD f0, 8 * SIZE(X) + LFD f1, 9 * SIZE(X) + LFD f2, 10 * SIZE(X) + LFD f3, 11 * SIZE(X) + + LFD f8, 8 * SIZE(Y) + LFD f9, 9 * SIZE(Y) + LFD f10, 10 * SIZE(Y) + LFD f11, 11 * SIZE(Y) + + STFD f16, 0 * SIZE(Y) + STFD f17, 1 * SIZE(Y) + STFD f18, 2 * SIZE(Y) + STFD f19, 3 * SIZE(Y) + + FMADD f20, ALPHA_R, f4, f12 + FMADD f21, ALPHA_I, f4, f13 + FMADD f22, ALPHA_R, f6, f14 + FMADD f23, ALPHA_I, f6, f15 + + ADD1 f20, ALPHA_I, f5, f20 + ADD2 f21, ALPHA_R, f5, f21 + ADD1 f22, ALPHA_I, f7, f22 + ADD2 f23, ALPHA_R, f7, f23 + + LFD f4, 12 * SIZE(X) + LFD f5, 13 * SIZE(X) + LFD f6, 14 * SIZE(X) + LFD f7, 15 * SIZE(X) + + LFD f12, 12 * SIZE(Y) + LFD f13, 13 * SIZE(Y) + LFD f14, 14 * SIZE(Y) + LFD f15, 15 * SIZE(Y) + + STFD f20, 4 * SIZE(Y) + STFD f21, 5 * SIZE(Y) + STFD f22, 6 * SIZE(Y) + STFD f23, 7 * SIZE(Y) + + FMADD f16, ALPHA_R, f0, f8 + FMADD f17, ALPHA_I, f0, f9 + FMADD f18, ALPHA_R, f2, f10 + FMADD f19, ALPHA_I, f2, f11 + + ADD1 f16, ALPHA_I, f1, f16 + ADD2 f17, ALPHA_R, f1, f17 + ADD1 f18, ALPHA_I, f3, f18 + ADD2 f19, ALPHA_R, f3, f19 + + LFD f0, 16 * SIZE(X) + LFD f1, 17 * SIZE(X) + LFD f2, 18 * SIZE(X) + LFD f3, 19 * SIZE(X) + + LFD f8, 16 * SIZE(Y) + LFD f9, 17 * SIZE(Y) + LFD f10, 18 * SIZE(Y) + LFD f11, 19 * SIZE(Y) + + STFD f16, 8 * SIZE(Y) + STFD f17, 9 * SIZE(Y) + STFD f18, 10 * SIZE(Y) + STFD f19, 11 * SIZE(Y) + + FMADD f20, ALPHA_R, f4, f12 + FMADD f21, ALPHA_I, f4, f13 + FMADD f22, ALPHA_R, f6, f14 + FMADD f23, ALPHA_I, f6, f15 + + ADD1 f20, ALPHA_I, f5, f20 + ADD2 f21, ALPHA_R, f5, f21 + ADD1 f22, ALPHA_I, f7, f22 + ADD2 f23, ALPHA_R, f7, f23 + + LFD f4, 20 * SIZE(X) + LFD f5, 21 * SIZE(X) + LFD f6, 22 * SIZE(X) + LFD f7, 23 * SIZE(X) + + LFD f12, 20 * SIZE(Y) + LFD f13, 21 * SIZE(Y) + LFD f14, 22 * SIZE(Y) + LFD f15, 23 * SIZE(Y) + + STFD f20, 12 * SIZE(Y) + STFD f21, 13 * SIZE(Y) + STFD f22, 14 * SIZE(Y) + STFD f23, 15 * SIZE(Y) + +#ifndef POWER6 + dcbtst Y, PREA +#ifdef L1_DUALFETCH + dcbt X, PREA +#endif +#endif + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + +#ifdef POWER6 + dcbtst Y, PREA + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + FMADD f16, ALPHA_R, f0, f8 + FMADD f17, ALPHA_I, f0, f9 + FMADD f18, ALPHA_R, f2, f10 + FMADD f19, ALPHA_I, f2, f11 + + ADD1 f16, ALPHA_I, f1, f16 + ADD2 f17, ALPHA_R, f1, f17 + ADD1 f18, ALPHA_I, f3, f18 + ADD2 f19, ALPHA_R, f3, f19 + + LFD f0, 8 * SIZE(X) + LFD f1, 9 * SIZE(X) + LFD f2, 10 * SIZE(X) + LFD f3, 11 * SIZE(X) + + LFD f8, 8 * SIZE(Y) + LFD f9, 9 * SIZE(Y) + LFD f10, 10 * SIZE(Y) + LFD f11, 11 * SIZE(Y) + + FMADD f20, ALPHA_R, f4, f12 + FMADD f21, ALPHA_I, f4, f13 + FMADD f22, ALPHA_R, f6, f14 + FMADD f23, ALPHA_I, f6, f15 + + ADD1 f20, ALPHA_I, f5, f20 + ADD2 f21, ALPHA_R, f5, f21 + ADD1 f22, ALPHA_I, f7, f22 + ADD2 f23, ALPHA_R, f7, f23 + + LFD f4, 12 * SIZE(X) + LFD f5, 13 * SIZE(X) + LFD f6, 14 * SIZE(X) + LFD f7, 15 * SIZE(X) + + LFD f12, 12 * SIZE(Y) + LFD f13, 13 * SIZE(Y) + LFD f14, 14 * SIZE(Y) + LFD f15, 15 * SIZE(Y) + + STFD f16, 0 * SIZE(Y) + STFD f17, 1 * SIZE(Y) + STFD f18, 2 * SIZE(Y) + STFD f19, 3 * SIZE(Y) + + FMADD f16, ALPHA_R, f0, f8 + FMADD f17, ALPHA_I, f0, f9 + FMADD f18, ALPHA_R, f2, f10 + FMADD f19, ALPHA_I, f2, f11 + + ADD1 f16, ALPHA_I, f1, f16 + ADD2 f17, ALPHA_R, f1, f17 + ADD1 f18, ALPHA_I, f3, f18 + ADD2 f19, ALPHA_R, f3, f19 + + STFD f20, 4 * SIZE(Y) + STFD f21, 5 * SIZE(Y) + STFD f22, 6 * SIZE(Y) + STFD f23, 7 * SIZE(Y) + + FMADD f20, ALPHA_R, f4, f12 + FMADD f21, ALPHA_I, f4, f13 + FMADD f22, ALPHA_R, f6, f14 + FMADD f23, ALPHA_I, f6, f15 + + ADD1 f20, ALPHA_I, f5, f20 + ADD2 f21, ALPHA_R, f5, f21 + ADD1 f22, ALPHA_I, f7, f22 + ADD2 f23, ALPHA_R, f7, f23 + + STFD f16, 8 * SIZE(Y) + STFD f17, 9 * SIZE(Y) + STFD f18, 10 * SIZE(Y) + STFD f19, 11 * SIZE(Y) + + STFD f20, 12 * SIZE(Y) + STFD f21, 13 * SIZE(Y) + STFD f22, 14 * SIZE(Y) + STFD f23, 15 * SIZE(Y) + + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f0, 0 * SIZE(X) + LFD f1, 1 * SIZE(X) + LFD f8, 0 * SIZE(Y) + LFD f9, 1 * SIZE(Y) + + FMADD f16, ALPHA_R, f0, f8 + FMADD f17, ALPHA_I, f0, f9 + + ADD1 f16, ALPHA_I, f1, f16 + ADD2 f17, ALPHA_R, f1, f17 + + STFD f16, 0 * SIZE(Y) + STFD f17, 1 * SIZE(Y) + addi X, X, 2 * SIZE + addi Y, Y, 2 * SIZE + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCXM1 + sub Y, Y, INCYM1 + mr YY, Y + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + .align 4 + + LFDX f0, X, INCXM1 + LFDUX f1, X, INCX + LFDX f2, X, INCXM1 + LFDUX f3, X, INCX + + LFDX f8, Y, INCYM1 + LFDUX f9, Y, INCY + LFDX f10, Y, INCYM1 + LFDUX f11, Y, INCY + + LFDX f4, X, INCXM1 + LFDUX f5, X, INCX + LFDX f6, X, INCXM1 + LFDUX f7, X, INCX + + LFDX f12, Y, INCYM1 + LFDUX f13, Y, INCY + LFDX f14, Y, INCYM1 + LFDUX f15, Y, INCY + bdz LL(120) + .align 4 + +LL(110): + FMADD f16, ALPHA_R, f0, f8 + FMADD f17, ALPHA_I, f0, f9 + FMADD f18, ALPHA_R, f2, f10 + FMADD f19, ALPHA_I, f2, f11 + + ADD1 f16, ALPHA_I, f1, f16 + ADD2 f17, ALPHA_R, f1, f17 + ADD1 f18, ALPHA_I, f3, f18 + ADD2 f19, ALPHA_R, f3, f19 + + LFDX f0, X, INCXM1 + LFDUX f1, X, INCX + LFDX f2, X, INCXM1 + LFDUX f3, X, INCX + + LFDX f8, Y, INCYM1 + LFDUX f9, Y, INCY + LFDX f10, Y, INCYM1 + LFDUX f11, Y, INCY + + FMADD f20, ALPHA_R, f4, f12 + FMADD f21, ALPHA_I, f4, f13 + FMADD f22, ALPHA_R, f6, f14 + FMADD f23, ALPHA_I, f6, f15 + + ADD1 f20, ALPHA_I, f5, f20 + ADD2 f21, ALPHA_R, f5, f21 + ADD1 f22, ALPHA_I, f7, f22 + ADD2 f23, ALPHA_R, f7, f23 + + LFDX f4, X, INCXM1 + LFDUX f5, X, INCX + LFDX f6, X, INCXM1 + LFDUX f7, X, INCX + + LFDX f12, Y, INCYM1 + LFDUX f13, Y, INCY + LFDX f14, Y, INCYM1 + LFDUX f15, Y, INCY + + STFDX f16, YY, INCYM1 + STFDUX f17, YY, INCY + STFDX f18, YY, INCYM1 + STFDUX f19, YY, INCY + + FMADD f16, ALPHA_R, f0, f8 + FMADD f17, ALPHA_I, f0, f9 + FMADD f18, ALPHA_R, f2, f10 + FMADD f19, ALPHA_I, f2, f11 + + ADD1 f16, ALPHA_I, f1, f16 + ADD2 f17, ALPHA_R, f1, f17 + ADD1 f18, ALPHA_I, f3, f18 + ADD2 f19, ALPHA_R, f3, f19 + + LFDX f0, X, INCXM1 + LFDUX f1, X, INCX + LFDX f2, X, INCXM1 + LFDUX f3, X, INCX + + LFDX f8, Y, INCYM1 + LFDUX f9, Y, INCY + LFDX f10, Y, INCYM1 + LFDUX f11, Y, INCY + + STFDX f20, YY, INCYM1 + STFDUX f21, YY, INCY + STFDX f22, YY, INCYM1 + STFDUX f23, YY, INCY + + FMADD f20, ALPHA_R, f4, f12 + FMADD f21, ALPHA_I, f4, f13 + FMADD f22, ALPHA_R, f6, f14 + FMADD f23, ALPHA_I, f6, f15 + + ADD1 f20, ALPHA_I, f5, f20 + ADD2 f21, ALPHA_R, f5, f21 + ADD1 f22, ALPHA_I, f7, f22 + ADD2 f23, ALPHA_R, f7, f23 + + LFDX f4, X, INCXM1 + LFDUX f5, X, INCX + LFDX f6, X, INCXM1 + LFDUX f7, X, INCX + + LFDX f12, Y, INCYM1 + LFDUX f13, Y, INCY + LFDX f14, Y, INCYM1 + LFDUX f15, Y, INCY + + STFDX f16, YY, INCYM1 + STFDUX f17, YY, INCY + STFDX f18, YY, INCYM1 + STFDUX f19, YY, INCY + + STFDX f20, YY, INCYM1 + STFDUX f21, YY, INCY + STFDX f22, YY, INCYM1 + STFDUX f23, YY, INCY + bdnz LL(110) + .align 4 + +LL(120): + FMADD f16, ALPHA_R, f0, f8 + FMADD f17, ALPHA_I, f0, f9 + FMADD f18, ALPHA_R, f2, f10 + FMADD f19, ALPHA_I, f2, f11 + + ADD1 f16, ALPHA_I, f1, f16 + ADD2 f17, ALPHA_R, f1, f17 + ADD1 f18, ALPHA_I, f3, f18 + ADD2 f19, ALPHA_R, f3, f19 + + LFDX f0, X, INCXM1 + LFDUX f1, X, INCX + LFDX f2, X, INCXM1 + LFDUX f3, X, INCX + + LFDX f8, Y, INCYM1 + LFDUX f9, Y, INCY + LFDX f10, Y, INCYM1 + LFDUX f11, Y, INCY + + FMADD f20, ALPHA_R, f4, f12 + FMADD f21, ALPHA_I, f4, f13 + FMADD f22, ALPHA_R, f6, f14 + FMADD f23, ALPHA_I, f6, f15 + + ADD1 f20, ALPHA_I, f5, f20 + ADD2 f21, ALPHA_R, f5, f21 + ADD1 f22, ALPHA_I, f7, f22 + ADD2 f23, ALPHA_R, f7, f23 + + LFDX f4, X, INCXM1 + LFDUX f5, X, INCX + LFDX f6, X, INCXM1 + LFDUX f7, X, INCX + + LFDX f12, Y, INCYM1 + LFDUX f13, Y, INCY + LFDX f14, Y, INCYM1 + LFDUX f15, Y, INCY + + STFDX f16, YY, INCYM1 + STFDUX f17, YY, INCY + STFDX f18, YY, INCYM1 + STFDUX f19, YY, INCY + + FMADD f16, ALPHA_R, f0, f8 + FMADD f17, ALPHA_I, f0, f9 + FMADD f18, ALPHA_R, f2, f10 + FMADD f19, ALPHA_I, f2, f11 + + ADD1 f16, ALPHA_I, f1, f16 + ADD2 f17, ALPHA_R, f1, f17 + ADD1 f18, ALPHA_I, f3, f18 + ADD2 f19, ALPHA_R, f3, f19 + + STFDX f20, YY, INCYM1 + STFDUX f21, YY, INCY + STFDX f22, YY, INCYM1 + STFDUX f23, YY, INCY + + FMADD f20, ALPHA_R, f4, f12 + FMADD f21, ALPHA_I, f4, f13 + FMADD f22, ALPHA_R, f6, f14 + FMADD f23, ALPHA_I, f6, f15 + + ADD1 f20, ALPHA_I, f5, f20 + ADD2 f21, ALPHA_R, f5, f21 + ADD1 f22, ALPHA_I, f7, f22 + ADD2 f23, ALPHA_R, f7, f23 + + STFDX f16, YY, INCYM1 + STFDUX f17, YY, INCY + STFDX f18, YY, INCYM1 + STFDUX f19, YY, INCY + + STFDX f20, YY, INCYM1 + STFDUX f21, YY, INCY + STFDX f22, YY, INCYM1 + STFDUX f23, YY, INCY + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f0, X, INCXM1 + LFDUX f1, X, INCX + LFDX f8, Y, INCYM1 + LFDUX f9, Y, INCY + + FMADD f16, ALPHA_R, f0, f8 + FMADD f17, ALPHA_I, f0, f9 + + ADD1 f16, ALPHA_I, f1, f16 + ADD2 f17, ALPHA_R, f1, f17 + + STFDX f16, YY, INCYM1 + STFDUX f17, YY, INCY + bdnz LL(160) + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE + +#endif diff --git a/kernel/power/zaxpy_hummer.S b/kernel/power/zaxpy_hummer.S new file mode 100644 index 0000000..41b3495 --- /dev/null +++ b/kernel/power/zaxpy_hummer.S @@ -0,0 +1,503 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 + +#define YY r4 +#define INCX2 r5 +#define INCY2 r10 +#define X1 r11 +#define Y1 INCX +#define YY1 INCY + +#define ALPHA f1 + +#define A1 f0 +#define A2 f8 +#define A3 f2 +#define A4 f3 +#define A5 f4 +#define A6 f5 +#define A7 f6 +#define A8 f7 +#define A9 f25 + +#define B1 f9 +#define B2 f10 +#define B3 f11 +#define B4 f12 +#define B5 f13 +#define B6 f14 +#define B7 f15 +#define B8 f16 + +#define C1 f17 +#define C2 f18 +#define C3 f19 +#define C4 f20 +#define C5 f21 +#define C6 f22 +#define C7 f23 +#define C8 f24 + +#define ALPHA_R ALPHA +#define ALPHA_I A9 + +#ifndef CONJ +#define ADD1 FNMSUB +#define ADD2 FMADD +#else +#define ADD1 FMADD +#define ADD2 FNMSUB +#endif + +#ifndef CONJ +#define FXMADD1 fxcpmadd +#define FXMADD2 fxcxnpma +#else +#define FXMADD1 fxcpnsma +#define FXMADD2 fxcxma +#endif + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + stfpdux f24, SP, r10 + stfpdux f25, SP, r10 + + fsmfp ALPHA, f2 + + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + add INCX2, INCX, INCX + add INCY2, INCY, INCY + + cmpwi cr0, N, 0 + ble LL(999) + + andi. r0, X, 2 * SIZE - 1 + bne LL(100) + andi. r0, Y, 2 * SIZE - 1 + bne LL(100) + + sub X, X, INCX2 + sub Y, Y, INCY2 + mr YY, Y + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + + LFPDUX A5, X, INCX2 + LFPDUX B5, Y, INCY2 + LFPDUX A6, X, INCX2 + LFPDUX B6, Y, INCY2 + LFPDUX A7, X, INCX2 + LFPDUX B7, Y, INCY2 + LFPDUX A8, X, INCX2 + LFPDUX B8, Y, INCY2 + + bdz LL(13) + .align 4 + +LL(12): + FXMADD1 C1, ALPHA, A1, B1 + LFPDUX B1, Y, INCY2 + FXMADD1 C2, ALPHA, A2, B2 + LFPDUX B2, Y, INCY2 + FXMADD1 C3, ALPHA, A3, B3 + LFPDUX B3, Y, INCY2 + FXMADD1 C4, ALPHA, A4, B4 + LFPDUX B4, Y, INCY2 + + FXMADD1 C5, ALPHA, A5, B5 + LFPDUX B5, Y, INCY2 + FXMADD1 C6, ALPHA, A6, B6 + LFPDUX B6, Y, INCY2 + FXMADD1 C7, ALPHA, A7, B7 + LFPDUX B7, Y, INCY2 + FXMADD1 C8, ALPHA, A8, B8 + LFPDUX B8, Y, INCY2 + + FXMADD2 C1, ALPHA, A1, C1 + LFPDUX A1, X, INCX2 + FXMADD2 C2, ALPHA, A2, C2 + LFPDUX A2, X, INCX2 + FXMADD2 C3, ALPHA, A3, C3 + LFPDUX A3, X, INCX2 + FXMADD2 C4, ALPHA, A4, C4 + LFPDUX A4, X, INCX2 + + FXMADD2 C5, ALPHA, A5, C5 + LFPDUX A5, X, INCX2 + FXMADD2 C6, ALPHA, A6, C6 + LFPDUX A6, X, INCX2 + FXMADD2 C7, ALPHA, A7, C7 + LFPDUX A7, X, INCX2 + FXMADD2 C8, ALPHA, A8, C8 + LFPDUX A8, X, INCX2 + + STFPDUX C1, YY, INCY2 + STFPDUX C2, YY, INCY2 + STFPDUX C3, YY, INCY2 + STFPDUX C4, YY, INCY2 + + STFPDUX C5, YY, INCY2 + STFPDUX C6, YY, INCY2 + STFPDUX C7, YY, INCY2 + STFPDUX C8, YY, INCY2 + bdnz LL(12) + .align 4 + +LL(13): + FXMADD1 C1, ALPHA, A1, B1 + FXMADD1 C2, ALPHA, A2, B2 + FXMADD1 C3, ALPHA, A3, B3 + FXMADD1 C4, ALPHA, A4, B4 + + FXMADD1 C5, ALPHA, A5, B5 + FXMADD1 C6, ALPHA, A6, B6 + FXMADD1 C7, ALPHA, A7, B7 + FXMADD1 C8, ALPHA, A8, B8 + + FXMADD2 C1, ALPHA, A1, C1 + FXMADD2 C2, ALPHA, A2, C2 + FXMADD2 C3, ALPHA, A3, C3 + FXMADD2 C4, ALPHA, A4, C4 + + FXMADD2 C5, ALPHA, A5, C5 + FXMADD2 C6, ALPHA, A6, C6 + STFPDUX C1, YY, INCY2 + FXMADD2 C7, ALPHA, A7, C7 + STFPDUX C2, YY, INCY2 + FXMADD2 C8, ALPHA, A8, C8 + STFPDUX C3, YY, INCY2 + STFPDUX C4, YY, INCY2 + + STFPDUX C5, YY, INCY2 + STFPDUX C6, YY, INCY2 + STFPDUX C7, YY, INCY2 + STFPDUX C8, YY, INCY2 + .align 4 + +LL(15): + andi. r0, N, 7 + beq LL(999) + + andi. r0, N, 4 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + + FXMADD1 C1, ALPHA, A1, B1 + FXMADD1 C2, ALPHA, A2, B2 + FXMADD1 C3, ALPHA, A3, B3 + FXMADD1 C4, ALPHA, A4, B4 + + FXMADD2 C1, ALPHA, A1, C1 + FXMADD2 C2, ALPHA, A2, C2 + FXMADD2 C3, ALPHA, A3, C3 + FXMADD2 C4, ALPHA, A4, C4 + + STFPDUX C1, YY, INCY2 + STFPDUX C2, YY, INCY2 + STFPDUX C3, YY, INCY2 + STFPDUX C4, YY, INCY2 + .align 4 + +LL(16): + andi. r0, N, 2 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + + FXMADD1 C1, ALPHA, A1, B1 + FXMADD1 C2, ALPHA, A2, B2 + FXMADD2 C1, ALPHA, A1, C1 + FXMADD2 C2, ALPHA, A2, C2 + + STFPDUX C1, YY, INCY2 + STFPDUX C2, YY, INCY2 + .align 4 + +LL(17): + andi. r0, N, 1 + beq LL(999) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + + FXMADD1 C1, ALPHA, A1, B1 + FXMADD2 C1, ALPHA, A1, C1 + + STFPDUX C1, YY, INCY2 + b LL(999) + .align 4 + +LL(100): + fsmtp ALPHA_I, ALPHA_R + + sub X, X, INCX2 + sub Y, Y, INCY2 + + addi X1, X, SIZE + addi Y1, Y, SIZE + + mr YY, Y + mr YY1, Y1 + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(115) + + LFDUX A1, X, INCX2 + LFDUX A2, X1, INCX2 + LFDUX B1, Y, INCY2 + LFDUX B2, Y1, INCY2 + + LFDUX A3, X, INCX2 + LFDUX A4, X1, INCX2 + LFDUX B3, Y, INCY2 + LFDUX B4, Y1, INCY2 + + LFDUX A5, X, INCX2 + LFDUX A6, X1, INCX2 + LFDUX B5, Y, INCY2 + LFDUX B6, Y1, INCY2 + + LFDUX A7, X, INCX2 + LFDUX A8, X1, INCX2 + LFDUX B7, Y, INCY2 + LFDUX B8, Y1, INCY2 + bdz LL(113) + .align 4 + +LL(112): + FMADD C1, ALPHA_R, A1, B1 + LFDUX B1, Y, INCY2 + FMADD C2, ALPHA_I, A1, B2 + LFDUX A1, X, INCX2 + FMADD C3, ALPHA_R, A3, B3 + LFDUX B3, Y, INCY2 + FMADD C4, ALPHA_I, A3, B4 + LFDUX A3, X, INCX2 + + FMADD C5, ALPHA_R, A5, B5 + LFDUX B5, Y, INCY2 + FMADD C6, ALPHA_I, A5, B6 + LFDUX A5, X, INCX2 + FMADD C7, ALPHA_R, A7, B7 + LFDUX B7, Y, INCY2 + FMADD C8, ALPHA_I, A7, B8 + LFDUX A7, X, INCX2 + + ADD1 C1, ALPHA_I, A2, C1 + LFDUX B2, Y1, INCY2 + ADD2 C2, ALPHA_R, A2, C2 + LFDUX A2, X1, INCX2 + ADD1 C3, ALPHA_I, A4, C3 + LFDUX B4, Y1, INCY2 + ADD2 C4, ALPHA_R, A4, C4 + LFDUX A4, X1, INCX2 + + ADD1 C5, ALPHA_I, A6, C5 + LFDUX B6, Y1, INCY2 + ADD2 C6, ALPHA_R, A6, C6 + LFDUX A6, X1, INCX2 + ADD1 C7, ALPHA_I, A8, C7 + LFDUX B8, Y1, INCY2 + ADD2 C8, ALPHA_R, A8, C8 + LFDUX A8, X1, INCX2 + + STFDUX C1, YY, INCY2 + STFDUX C2, YY1, INCY2 + STFDUX C3, YY, INCY2 + STFDUX C4, YY1, INCY2 + + STFDUX C5, YY, INCY2 + STFDUX C6, YY1, INCY2 + STFDUX C7, YY, INCY2 + STFDUX C8, YY1, INCY2 + bdnz LL(112) + .align 4 + +LL(113): + FMADD C1, ALPHA_R, A1, B1 + FMADD C2, ALPHA_I, A1, B2 + FMADD C3, ALPHA_R, A3, B3 + FMADD C4, ALPHA_I, A3, B4 + + FMADD C5, ALPHA_R, A5, B5 + FMADD C6, ALPHA_I, A5, B6 + FMADD C7, ALPHA_R, A7, B7 + FMADD C8, ALPHA_I, A7, B8 + + ADD1 C1, ALPHA_I, A2, C1 + ADD2 C2, ALPHA_R, A2, C2 + ADD1 C3, ALPHA_I, A4, C3 + ADD2 C4, ALPHA_R, A4, C4 + + ADD1 C5, ALPHA_I, A6, C5 + ADD2 C6, ALPHA_R, A6, C6 + STFDUX C1, YY, INCY2 + ADD1 C7, ALPHA_I, A8, C7 + STFDUX C2, YY1, INCY2 + ADD2 C8, ALPHA_R, A8, C8 + STFDUX C3, YY, INCY2 + STFDUX C4, YY1, INCY2 + + STFDUX C5, YY, INCY2 + STFDUX C6, YY1, INCY2 + STFDUX C7, YY, INCY2 + STFDUX C8, YY1, INCY2 + .align 4 + +LL(115): + andi. r0, N, 3 + beq LL(999) + + andi. r0, N, 2 + beq LL(117) + + LFDUX A1, X, INCX2 + LFDUX A2, X1, INCX2 + LFDUX B1, Y, INCY2 + LFDUX B2, Y1, INCY2 + + LFDUX A3, X, INCX2 + FMADD C1, ALPHA_R, A1, B1 + LFDUX A4, X1, INCX2 + FMADD C2, ALPHA_I, A1, B2 + LFDUX B3, Y, INCY2 + FMADD C3, ALPHA_R, A3, B3 + LFDUX B4, Y1, INCY2 + FMADD C4, ALPHA_I, A3, B4 + + ADD1 C1, ALPHA_I, A2, C1 + ADD2 C2, ALPHA_R, A2, C2 + STFDUX C1, YY, INCY2 + ADD1 C3, ALPHA_I, A4, C3 + STFDUX C2, YY1, INCY2 + ADD2 C4, ALPHA_R, A4, C4 + STFDUX C3, YY, INCY2 + STFDUX C4, YY1, INCY2 + .align 4 + +LL(117): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + LFDUX A2, X1, INCX2 + LFDUX B1, Y, INCY2 + LFDUX B2, Y1, INCY2 + + FMADD C1, ALPHA_R, A1, B1 + FMADD C2, ALPHA_I, A1, B2 + + ADD1 C1, ALPHA_I, A2, C1 + ADD2 C2, ALPHA_R, A2, C2 + + STFDUX C1, YY, INCY2 + STFDUX C2, YY1, INCY2 + .align 4 + +LL(999): + li r10, 16 + subi SP, SP, 16 + + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/zaxpy_ppc440.S b/kernel/power/zaxpy_ppc440.S new file mode 100644 index 0000000..5100e94 --- /dev/null +++ b/kernel/power/zaxpy_ppc440.S @@ -0,0 +1,413 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define N r3 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 +#define YY r4 +#define PRE r5 +#else +#define N r3 +#define X r8 +#define INCX r9 +#define Y r5 +#define INCY r4 +#define YY r6 +#define PRE r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define N r3 +#define X r10 +#define INCX r4 +#define Y r5 +#define INCY r6 +#define YY r7 +#define PRE r8 +#else +#define N r3 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r4 +#define YY r5 +#define PRE r6 +#endif +#endif + +#define ALPHA_R f24 +#define ALPHA_I f25 + +#ifndef CONJ +#define ADD1 FNMSUB +#define ADD2 FMADD +#else +#define ADD1 FMADD +#define ADD2 FNMSUB +#endif + +#define STACKSIZE 96 + + PROLOGUE + PROFCODE + + subi SP, SP, STACKSIZE + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + +#if defined(linux) && defined(__64BIT__) + ld INCY, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld INCY, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz INCX, 56 + STACKSIZE(SP) + lwz Y, 60 + STACKSIZE(SP) + lwz INCY, 64 + STACKSIZE(SP) +#else + lwz INCY, 56 + STACKSIZE(SP) +#endif +#endif +#endif + + fmr ALPHA_R, f1 + slwi INCX, INCX, ZBASE_SHIFT + fmr ALPHA_I, f2 + slwi INCY, INCY, ZBASE_SHIFT + + subi INCX, INCX, SIZE + subi INCY, INCY, SIZE + + li PRE, 2 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(999) + + sub X, X, INCX + sub Y, Y, INCY + mr YY, Y + + srawi. r0, N, 3 + mtspr CTR, r0 + ble- LL(150) + .align 4 + + LFDUX f0, X, INCX + LFDU f1, 1 * SIZE(X) + LFDUX f2, X, INCX + LFDU f3, 1 * SIZE(X) + + LFDUX f8, Y, INCY + LFDU f9, 1 * SIZE(Y) + LFDUX f10, Y, INCY + LFDU f11, 1 * SIZE(Y) + + LFDUX f4, X, INCX + LFDU f5, 1 * SIZE(X) + LFDUX f6, X, INCX + LFDU f7, 1 * SIZE(X) + + LFDUX f12, Y, INCY + LFDU f13, 1 * SIZE(Y) + LFDUX f14, Y, INCY + LFDU f15, 1 * SIZE(Y) + bdz LL(120) + .align 4 + +LL(110): + FMADD f16, ALPHA_R, f0, f8 + LFDUX f8, Y, INCY + FMADD f17, ALPHA_I, f0, f9 + LFDU f9, 1 * SIZE(Y) + FMADD f18, ALPHA_R, f2, f10 + LFDUX f10, Y, INCY + FMADD f19, ALPHA_I, f2, f11 + LFDU f11, 1 * SIZE(Y) +#ifdef PPCG4 + dcbt X, PRE +#endif + + ADD1 f16, ALPHA_I, f1, f16 + LFDUX f0, X, INCX + ADD2 f17, ALPHA_R, f1, f17 + LFDU f1, 1 * SIZE(X) + ADD1 f18, ALPHA_I, f3, f18 + LFDUX f2, X, INCX + ADD2 f19, ALPHA_R, f3, f19 + LFDU f3, 1 * SIZE(X) +#ifdef PPCG4 + dcbtst Y, PRE +#endif + + FMADD f20, ALPHA_R, f4, f12 + LFDUX f12, Y, INCY + FMADD f21, ALPHA_I, f4, f13 + LFDU f13, 1 * SIZE(Y) + FMADD f22, ALPHA_R, f6, f14 + LFDUX f14, Y, INCY + FMADD f23, ALPHA_I, f6, f15 + LFDU f15, 1 * SIZE(Y) +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + + ADD1 f20, ALPHA_I, f5, f20 + LFDUX f4, X, INCX + ADD2 f21, ALPHA_R, f5, f21 + LFDU f5, 1 * SIZE(X) + ADD1 f22, ALPHA_I, f7, f22 + LFDUX f6, X, INCX + ADD2 f23, ALPHA_R, f7, f23 + LFDU f7, 1 * SIZE(X) +#if defined(PPCG4) && defined(DOUBLE) + dcbtst Y, PRE +#endif + + STFDUX f16, YY, INCY + STFDU f17, 1 * SIZE(YY) + STFDUX f18, YY, INCY + STFDU f19, 1 * SIZE(YY) + + FMADD f16, ALPHA_R, f0, f8 + LFDUX f8, Y, INCY + FMADD f17, ALPHA_I, f0, f9 + LFDU f9, 1 * SIZE(Y) + FMADD f18, ALPHA_R, f2, f10 + LFDUX f10, Y, INCY + FMADD f19, ALPHA_I, f2, f11 + LFDU f11, 1 * SIZE(Y) +#ifdef PPCG4 + dcbt X, PRE +#endif + + ADD1 f16, ALPHA_I, f1, f16 + LFDUX f0, X, INCX + ADD2 f17, ALPHA_R, f1, f17 + LFDU f1, 1 * SIZE(X) + ADD1 f18, ALPHA_I, f3, f18 + LFDUX f2, X, INCX + ADD2 f19, ALPHA_R, f3, f19 + LFDU f3, 1 * SIZE(X) +#ifdef PPCG4 + dcbtst Y, PRE +#endif + + STFDUX f20, YY, INCY + STFDU f21, 1 * SIZE(YY) + STFDUX f22, YY, INCY + STFDU f23, 1 * SIZE(YY) + + FMADD f20, ALPHA_R, f4, f12 + LFDUX f12, Y, INCY + FMADD f21, ALPHA_I, f4, f13 + LFDU f13, 1 * SIZE(Y) + FMADD f22, ALPHA_R, f6, f14 + LFDUX f14, Y, INCY + FMADD f23, ALPHA_I, f6, f15 + LFDU f15, 1 * SIZE(Y) +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + + ADD1 f20, ALPHA_I, f5, f20 + LFDUX f4, X, INCX + ADD2 f21, ALPHA_R, f5, f21 + LFDU f5, 1 * SIZE(X) + ADD1 f22, ALPHA_I, f7, f22 + LFDUX f6, X, INCX + ADD2 f23, ALPHA_R, f7, f23 + LFDU f7, 1 * SIZE(X) +#if defined(PPCG4) && defined(DOUBLE) + dcbtst Y, PRE +#endif + + STFDUX f16, YY, INCY + STFDU f17, 1 * SIZE(YY) + STFDUX f18, YY, INCY + STFDU f19, 1 * SIZE(YY) + + STFDUX f20, YY, INCY + STFDU f21, 1 * SIZE(YY) + STFDUX f22, YY, INCY + STFDU f23, 1 * SIZE(YY) + bdnz LL(110) + .align 4 + +LL(120): + FMADD f16, ALPHA_R, f0, f8 + LFDUX f8, Y, INCY + FMADD f17, ALPHA_I, f0, f9 + LFDU f9, 1 * SIZE(Y) + FMADD f18, ALPHA_R, f2, f10 + LFDUX f10, Y, INCY + FMADD f19, ALPHA_I, f2, f11 + LFDU f11, 1 * SIZE(Y) + + ADD1 f16, ALPHA_I, f1, f16 + LFDUX f0, X, INCX + ADD2 f17, ALPHA_R, f1, f17 + LFDU f1, 1 * SIZE(X) + ADD1 f18, ALPHA_I, f3, f18 + LFDUX f2, X, INCX + ADD2 f19, ALPHA_R, f3, f19 + LFDU f3, 1 * SIZE(X) + + FMADD f20, ALPHA_R, f4, f12 + LFDUX f12, Y, INCY + FMADD f21, ALPHA_I, f4, f13 + LFDU f13, 1 * SIZE(Y) + FMADD f22, ALPHA_R, f6, f14 + LFDUX f14, Y, INCY + FMADD f23, ALPHA_I, f6, f15 + LFDU f15, 1 * SIZE(Y) + + ADD1 f20, ALPHA_I, f5, f20 + LFDUX f4, X, INCX + ADD2 f21, ALPHA_R, f5, f21 + LFDU f5, 1 * SIZE(X) + ADD1 f22, ALPHA_I, f7, f22 + LFDUX f6, X, INCX + ADD2 f23, ALPHA_R, f7, f23 + LFDU f7, 1 * SIZE(X) + + STFDUX f16, YY, INCY + FMADD f16, ALPHA_R, f0, f8 + STFDU f17, 1 * SIZE(YY) + FMADD f17, ALPHA_I, f0, f9 + STFDUX f18, YY, INCY + FMADD f18, ALPHA_R, f2, f10 + STFDU f19, 1 * SIZE(YY) + FMADD f19, ALPHA_I, f2, f11 + + ADD1 f16, ALPHA_I, f1, f16 + ADD2 f17, ALPHA_R, f1, f17 + ADD1 f18, ALPHA_I, f3, f18 + ADD2 f19, ALPHA_R, f3, f19 + + STFDUX f20, YY, INCY + FMADD f20, ALPHA_R, f4, f12 + STFDU f21, 1 * SIZE(YY) + FMADD f21, ALPHA_I, f4, f13 + STFDUX f22, YY, INCY + FMADD f22, ALPHA_R, f6, f14 + STFDU f23, 1 * SIZE(YY) + FMADD f23, ALPHA_I, f6, f15 + + ADD1 f20, ALPHA_I, f5, f20 + STFDUX f16, YY, INCY + ADD2 f21, ALPHA_R, f5, f21 + STFDU f17, 1 * SIZE(YY) + ADD1 f22, ALPHA_I, f7, f22 + STFDUX f18, YY, INCY + ADD2 f23, ALPHA_R, f7, f23 + STFDU f19, 1 * SIZE(YY) + + STFDUX f20, YY, INCY + STFDU f21, 1 * SIZE(YY) + STFDUX f22, YY, INCY + STFDU f23, 1 * SIZE(YY) + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + ble LL(999) + .align 4 + +LL(160): + LFDUX f0, X, INCX + LFDU f1, 1 * SIZE(X) + LFDUX f8, Y, INCY + LFDU f9, 1 * SIZE(Y) + + FMADD f16, ALPHA_R, f0, f8 + FMADD f17, ALPHA_I, f0, f9 + + ADD1 f16, ALPHA_I, f1, f16 + ADD2 f17, ALPHA_R, f1, f17 + + STFDUX f16, YY, INCY + STFDU f17, 1 * SIZE(YY) + bdnz LL(160) + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + addi SP, SP, STACKSIZE + li r0, 0 + blr + EPILOGUE diff --git a/kernel/power/zcopy.S b/kernel/power/zcopy.S new file mode 100644 index 0000000..f5ed2f9 --- /dev/null +++ b/kernel/power/zcopy.S @@ -0,0 +1,237 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 +#define PREA r8 + +#define INCXM1 r9 +#define INCYM1 r10 + +#define STACKSIZE 16 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + + subi INCXM1, INCX, SIZE + subi INCYM1, INCY, SIZE + +#ifdef L1_DUALFETCH + li PREA, (L1_PREFETCHSIZE) / 2 +#else + li PREA, (L1_PREFETCHSIZE) +#endif + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(100) + cmpwi cr0, INCY, 2 * SIZE + bne- cr0, LL(100) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + +LL(10): + LFD f0, 0 * SIZE(X) + LFD f1, 1 * SIZE(X) + LFD f2, 2 * SIZE(X) + LFD f3, 3 * SIZE(X) + + STFD f0, 0 * SIZE(Y) + STFD f1, 1 * SIZE(Y) + STFD f2, 2 * SIZE(Y) + STFD f3, 3 * SIZE(Y) + + LFD f4, 4 * SIZE(X) + LFD f5, 5 * SIZE(X) + LFD f6, 6 * SIZE(X) + LFD f7, 7 * SIZE(X) + + STFD f4, 4 * SIZE(Y) + STFD f5, 5 * SIZE(Y) + STFD f6, 6 * SIZE(Y) + STFD f7, 7 * SIZE(Y) + + LFD f8, 8 * SIZE(X) + LFD f9, 9 * SIZE(X) + LFD f10, 10 * SIZE(X) + LFD f11, 11 * SIZE(X) + + STFD f8, 8 * SIZE(Y) + STFD f9, 9 * SIZE(Y) + STFD f10, 10 * SIZE(Y) + STFD f11, 11 * SIZE(Y) + + LFD f12, 12 * SIZE(X) + LFD f13, 13 * SIZE(X) + LFD f14, 14 * SIZE(X) + LFD f15, 15 * SIZE(X) + + STFD f12, 12 * SIZE(Y) + STFD f13, 13 * SIZE(Y) + STFD f14, 14 * SIZE(Y) + STFD f15, 15 * SIZE(Y) + +#ifndef POWER6 + dcbtst Y, PREA +#ifdef L1_DUALFETCH + dcbt X, PREA +#endif +#endif + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + +#ifdef POWER6 + dcbtst Y, PREA + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + addi X, X, 2 * SIZE + + STFD f8, 0 * SIZE(Y) + STFD f9, 1 * SIZE(Y) + addi Y, Y, 2 * SIZE + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCXM1 + sub Y, Y, INCYM1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + .align 4 + +LL(110): + LFDX f0, X, INCXM1 + LFDUX f1, X, INCX + LFDX f2, X, INCXM1 + LFDUX f3, X, INCX + + LFDX f4, X, INCXM1 + LFDUX f5, X, INCX + LFDX f6, X, INCXM1 + LFDUX f7, X, INCX + + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + LFDX f10, X, INCXM1 + LFDUX f11, X, INCX + + LFDX f12, X, INCXM1 + LFDUX f13, X, INCX + LFDX f14, X, INCXM1 + LFDUX f15, X, INCX + + STFDX f0, Y, INCYM1 + STFDUX f1, Y, INCY + STFDX f2, Y, INCYM1 + STFDUX f3, Y, INCY + + STFDX f4, Y, INCYM1 + STFDUX f5, Y, INCY + STFDX f6, Y, INCYM1 + STFDUX f7, Y, INCY + + STFDX f8, Y, INCYM1 + STFDUX f9, Y, INCY + STFDX f10, Y, INCYM1 + STFDUX f11, Y, INCY + + STFDX f12, Y, INCYM1 + STFDUX f13, Y, INCY + STFDX f14, Y, INCYM1 + STFDUX f15, Y, INCY + bdnz LL(110) + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + + STFDX f8, Y, INCYM1 + STFDUX f9, Y, INCY + bdnz LL(160) + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zcopy_hummer.S b/kernel/power/zcopy_hummer.S new file mode 100644 index 0000000..825b440 --- /dev/null +++ b/kernel/power/zcopy_hummer.S @@ -0,0 +1,652 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 + +#define INCX2 r8 +#define INCY2 r9 +#define X2 r10 +#define Y2 r11 + +#define A1 f0 +#define A2 f1 +#define A3 f2 +#define A4 f3 +#define A5 f4 +#define A6 f5 +#define A7 f6 +#define A8 f7 +#define A9 f8 + +#define T1 f9 +#define T2 f10 +#define T3 f11 +#define T4 f12 +#define T5 f13 +#define T6 f14 +#define T7 f15 + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + add INCX2, INCX, INCX + add INCY2, INCY, INCY + + cmpwi cr0, N, 0 + ble LL(999) + + sub X, X, INCX2 + sub Y, Y, INCY2 + + cmpwi cr0, INCX, SIZE + bne LL(100) + cmpwi cr0, INCY, SIZE + bne LL(100) + + andi. r0, X, 2 * SIZE - 1 + bne LL(30) + andi. r0, Y, 2 * SIZE - 1 + bne LL(20) + .align 4 + +LL(10): /* X ): aligned Y ): aligned */ + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + bdz LL(13) + .align 4 + +LL(12): + STFPDUX A1, Y, INCY2 + LFPDUX A1, X, INCX2 + STFPDUX A2, Y, INCY2 + LFPDUX A2, X, INCX2 + STFPDUX A3, Y, INCY2 + LFPDUX A3, X, INCX2 + STFPDUX A4, Y, INCY2 + LFPDUX A4, X, INCX2 + + STFPDUX A5, Y, INCY2 + LFPDUX A5, X, INCX2 + STFPDUX A6, Y, INCY2 + LFPDUX A6, X, INCX2 + STFPDUX A7, Y, INCY2 + LFPDUX A7, X, INCX2 + STFPDUX A8, Y, INCY2 + LFPDUX A8, X, INCX2 + bdnz LL(12) + .align 4 + +LL(13): + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + STFPDUX A3, Y, INCY2 + STFPDUX A4, Y, INCY2 + STFPDUX A5, Y, INCY2 + STFPDUX A6, Y, INCY2 + STFPDUX A7, Y, INCY2 + STFPDUX A8, Y, INCY2 + .align 4 + +LL(15): + andi. r0, N, 7 + beq LL(999) + + andi. r0, N, 4 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + STFPDUX A3, Y, INCY2 + STFPDUX A4, Y, INCY2 + .align 4 + +LL(16): + andi. r0, N, 2 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + .align 4 + +LL(17): + andi. r0, N, 1 + beq LL(999) + + LFPDUX A1, X, INCX2 + STFPDUX A1, Y, INCY2 + b LL(999) + .align 4 + +LL(20): /* X : aligned Y : unaligned */ + + LFXDUX A1, X, INCX2 + addi N, N, -1 + cmpwi cr0, N, 0 + STFSDX A1, Y, INCY2 + add Y, Y, INCY + ble LL(29) + .align 4 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(25) + + LFXDUX T1, X, INCX2 + LFXDUX T2, X, INCX2 + LFXDUX T3, X, INCX2 + LFXDUX T4, X, INCX2 + + LFPDUX A6, X, INCX2 + fsmr A1, T1 + LFPDUX A7, X, INCX2 + fsmr T1, T2 + LFPDUX A8, X, INCX2 + fsmr T2, T3 + LFPDUX A9, X, INCX2 + fsmr T3, T4 + bdz LL(23) + .align 4 + +LL(22): + STFPDUX A1, Y, INCY2 + fxmr T5, A6 + STFPDUX T1, Y, INCY2 + fxmr T6, A7 + STFPDUX T2, Y, INCY2 + fxmr T7, A8 + STFPDUX T3, Y, INCY2 + fxmr A1, A9 + + fsmr T4, T5 + LFPDUX A2, X, INCX2 + fsmr T5, T6 + LFPDUX A3, X, INCX2 + fsmr T6, T7 + LFPDUX A4, X, INCX2 + fsmr T7, A1 + LFPDUX A5, X, INCX2 + + STFPDUX T4, Y, INCY2 + fxmr T1, A2 + STFPDUX T5, Y, INCY2 + fxmr T2, A3 + STFPDUX T6, Y, INCY2 + fxmr T3, A4 + STFPDUX T7, Y, INCY2 + fxmr T4, A5 + + LFPDUX A6, X, INCX2 + fsmr A1, T1 + LFPDUX A7, X, INCX2 + fsmr T1, T2 + LFPDUX A8, X, INCX2 + fsmr T2, T3 + LFPDUX A9, X, INCX2 + fsmr T3, T4 + bdnz LL(22) + .align 4 + +LL(23): + STFPDUX A1, Y, INCY2 + fxmr T5, A6 + STFPDUX T1, Y, INCY2 + fxmr T6, A7 + STFPDUX T2, Y, INCY2 + fxmr T7, A8 + STFPDUX T3, Y, INCY2 + fxmr A1, A9 + + fsmr T4, T5 + fsmr T5, T6 + fsmr T6, T7 + fsmr T7, A1 + + STFPDUX T4, Y, INCY2 + STFPDUX T5, Y, INCY2 + STFPDUX T6, Y, INCY2 + STFPDUX T7, Y, INCY2 + .align 4 + +LL(25): + andi. r0, N, 7 + beq LL(29) + + andi. r0, N, 4 + beq LL(26) + + LFXDUX A2, X, INCX2 + LFXDUX A3, X, INCX2 + LFXDUX A4, X, INCX2 + LFXDUX A5, X, INCX2 + + fsmr A1, A2 + fsmr A2, A3 + fsmr A3, A4 + fsmr A4, A5 + + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + STFPDUX A3, Y, INCY2 + STFPDUX A4, Y, INCY2 + fpmr A1, A5 + .align 4 + +LL(26): + andi. r0, N, 2 + beq LL(27) + + LFXDUX A2, X, INCX2 + LFXDUX A3, X, INCX2 + fsmr A1, A2 + fsmr A2, A3 + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + fpmr A1, A3 + .align 4 + +LL(27): + andi. r0, N, 1 + beq LL(29) + + LFXDUX A2, X, INCX2 + fsmr A1, A2 + STFPDUX A1, Y, INCY2 + fpmr A1, A2 + .align 4 + +LL(29): + STFDUX A1, Y, INCY2 + b LL(999) + .align 4 + +LL(30): /* X ): unaligned Y ): aligned */ + andi. r0, Y, 2 * SIZE - 1 + bne LL(40) + + LFDX A1, X, INCX2 + add X, X, INCX + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(35) + + LFXDUX T1, X, INCX2 + LFXDUX T2, X, INCX2 + LFXDUX T3, X, INCX2 + LFXDUX T4, X, INCX2 + + LFPDUX A6, X, INCX2 + fsmr A1, T1 + LFPDUX A7, X, INCX2 + fsmr T1, T2 + LFPDUX A8, X, INCX2 + fsmr T2, T3 + LFPDUX A9, X, INCX2 + fsmr T3, T4 + bdz LL(33) + .align 4 + +LL(32): + fxmr T5, A6 + STFPDUX A1, Y, INCY2 + fxmr T6, A7 + STFPDUX T1, Y, INCY2 + fxmr T7, A8 + STFPDUX T2, Y, INCY2 + fxmr A1, A9 + STFPDUX T3, Y, INCY2 + + LFPDUX A2, X, INCX2 + fsmr T4, T5 + LFPDUX A3, X, INCX2 + fsmr T5, T6 + LFPDUX A4, X, INCX2 + fsmr T6, T7 + LFPDUX A5, X, INCX2 + fsmr T7, A1 + + fxmr T1, A2 + STFPDUX T4, Y, INCY2 + fxmr T2, A3 + STFPDUX T5, Y, INCY2 + fxmr T3, A4 + STFPDUX T6, Y, INCY2 + fxmr T4, A5 + STFPDUX T7, Y, INCY2 + + fsmr A1, T1 + LFPDUX A6, X, INCX2 + fsmr T1, T2 + LFPDUX A7, X, INCX2 + fsmr T2, T3 + LFPDUX A8, X, INCX2 + fsmr T3, T4 + LFPDUX A9, X, INCX2 + bdnz LL(32) + .align 4 + +LL(33): + STFPDUX A1, Y, INCY2 + fxmr T5, A6 + STFPDUX T1, Y, INCY2 + fxmr T6, A7 + STFPDUX T2, Y, INCY2 + fxmr T7, A8 + STFPDUX T3, Y, INCY2 + fxmr A1, A9 + + fsmr T4, T5 + fsmr T5, T6 + fsmr T6, T7 + fsmr T7, A1 + + STFPDUX T4, Y, INCY2 + STFPDUX T5, Y, INCY2 + STFPDUX T6, Y, INCY2 + STFPDUX T7, Y, INCY2 + .align 4 + +LL(35): + andi. r0, N, 7 + beq LL(999) + + andi. r0, N, 4 + beq LL(36) + + LFXDUX A2, X, INCX2 + LFXDUX A3, X, INCX2 + LFXDUX A4, X, INCX2 + LFXDUX A5, X, INCX2 + + fsmr A1, A2 + fsmr A2, A3 + fsmr A3, A4 + fsmr A4, A5 + + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + STFPDUX A3, Y, INCY2 + STFPDUX A4, Y, INCY2 + fpmr A1, A5 + .align 4 + +LL(36): + andi. r0, N, 2 + beq LL(37) + + LFXDUX A2, X, INCX2 + LFXDUX A3, X, INCX2 + fsmr A1, A2 + fsmr A2, A3 + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + fpmr A1, A3 + .align 4 + +LL(37): + andi. r0, N, 1 + beq LL(999) + + LFXDUX A2, X, INCX2 + fsmr A1, A2 + STFPDUX A1, Y, INCY2 + b LL(999) + .align 4 + +LL(40): /* X : unaligned Y : unaligned */ + + LFDX A1, X, INCX2 + add X, X, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + STFDX A1, Y, INCY2 + add Y, Y, INCY + ble LL(49) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(45) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + bdz LL(43) + .align 4 + +LL(42): + STFPDUX A1, Y, INCY2 + LFPDUX A1, X, INCX2 + STFPDUX A2, Y, INCY2 + LFPDUX A2, X, INCX2 + STFPDUX A3, Y, INCY2 + LFPDUX A3, X, INCX2 + STFPDUX A4, Y, INCY2 + LFPDUX A4, X, INCX2 + + STFPDUX A5, Y, INCY2 + LFPDUX A5, X, INCX2 + STFPDUX A6, Y, INCY2 + LFPDUX A6, X, INCX2 + STFPDUX A7, Y, INCY2 + LFPDUX A7, X, INCX2 + STFPDUX A8, Y, INCY2 + LFPDUX A8, X, INCX2 + bdnz LL(42) + .align 4 + +LL(43): + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + STFPDUX A3, Y, INCY2 + STFPDUX A4, Y, INCY2 + STFPDUX A5, Y, INCY2 + STFPDUX A6, Y, INCY2 + STFPDUX A7, Y, INCY2 + STFPDUX A8, Y, INCY2 + .align 4 + +LL(45): + andi. r0, N, 7 + beq LL(49) + + andi. r0, N, 4 + beq LL(46) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + STFPDUX A3, Y, INCY2 + STFPDUX A4, Y, INCY2 + .align 4 + +LL(46): + andi. r0, N, 2 + beq LL(47) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + .align 4 + +LL(47): + andi. r0, N, 1 + beq LL(49) + + LFPDUX A1, X, INCX2 + STFPDUX A1, Y, INCY2 + +LL(49): + LFDUX A1, X, INCX2 + STFDUX A1, Y, INCY2 + b LL(999) + .align 4 + +LL(100): + addi X2, X, SIZE + addi Y2, Y, SIZE + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(115) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + LFDUX A5, X, INCX2 + LFDUX A6, X2, INCX2 + LFDUX A7, X, INCX2 + LFDUX A8, X2, INCX2 + bdz LL(113) + .align 4 + +LL(112): + STFDUX A1, Y, INCY2 + LFDUX A1, X, INCX2 + STFDUX A2, Y2, INCY2 + LFDUX A2, X2, INCX2 + STFDUX A3, Y, INCY2 + LFDUX A3, X, INCX2 + STFDUX A4, Y2, INCY2 + LFDUX A4, X2, INCX2 + + STFDUX A5, Y, INCY2 + LFDUX A5, X, INCX2 + STFDUX A6, Y2, INCY2 + LFDUX A6, X2, INCX2 + STFDUX A7, Y, INCY2 + LFDUX A7, X, INCX2 + STFDUX A8, Y2, INCY2 + LFDUX A8, X2, INCX2 + bdnz LL(112) + .align 4 + +LL(113): + STFDUX A1, Y, INCY2 + STFDUX A2, Y2, INCY2 + STFDUX A3, Y, INCY2 + STFDUX A4, Y2, INCY2 + STFDUX A5, Y, INCY2 + STFDUX A6, Y2, INCY2 + STFDUX A7, Y, INCY2 + STFDUX A8, Y2, INCY2 + .align 4 + +LL(115): + andi. r0, N, 3 + beq LL(999) + andi. r0, N, 2 + beq LL(117) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + + STFDUX A1, Y, INCY2 + STFDUX A2, Y2, INCY2 + STFDUX A3, Y, INCY2 + STFDUX A4, Y2, INCY2 + .align 4 + +LL(117): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + + STFDUX A1, Y, INCY2 + STFDUX A2, Y2, INCY2 + .align 4 + +LL(999): + li r10, 16 + addi SP, SP, -16 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/zdot.S b/kernel/power/zdot.S new file mode 100644 index 0000000..dab7eaa --- /dev/null +++ b/kernel/power/zdot.S @@ -0,0 +1,654 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) +#define RESULT r3 +#define N r4 +#define X r5 +#define INCX r6 +#define Y r7 +#define INCY r8 +#define PREA r9 +#else +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 +#define PREA r8 +#endif + +#define INCXM1 r10 +#define INCYM1 r11 + +#define FZERO f0 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + + subi INCXM1, INCX, SIZE + subi INCYM1, INCY, SIZE + + fmr f1, FZERO + fmr f2, FZERO + fmr f3, FZERO + fmr f4, FZERO + fmr f5, FZERO + fmr f6, FZERO + fmr f7, FZERO + + fmr f24, FZERO + fmr f25, FZERO + fmr f26, FZERO + fmr f27, FZERO + fmr f28, FZERO + fmr f29, FZERO + fmr f30, FZERO + fmr f31, FZERO + +#ifdef L1_DUALFETCH + li PREA, (L1_PREFETCHSIZE) / 2 +#else + li PREA, (L1_PREFETCHSIZE) +#endif + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(100) + cmpwi cr0, INCY, 2 * SIZE + bne- cr0, LL(100) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + LFD f10, 2 * SIZE(X) + LFD f11, 3 * SIZE(X) + + LFD f16, 0 * SIZE(Y) + LFD f17, 1 * SIZE(Y) + LFD f18, 2 * SIZE(Y) + LFD f19, 3 * SIZE(Y) + + LFD f12, 4 * SIZE(X) + LFD f13, 5 * SIZE(X) + LFD f14, 6 * SIZE(X) + LFD f15, 7 * SIZE(X) + + LFD f20, 4 * SIZE(Y) + LFD f21, 5 * SIZE(Y) + LFD f22, 6 * SIZE(Y) + LFD f23, 7 * SIZE(Y) + bdz LL(20) + .align 4 + +LL(10): + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + FMADD f3, f8, f17, f3 + + FMADD f4, f10, f18, f4 + FMADD f5, f11, f19, f5 + FMADD f6, f11, f18, f6 + FMADD f7, f10, f19, f7 + + LFD f8, 8 * SIZE(X) + LFD f9, 9 * SIZE(X) + LFD f10, 10 * SIZE(X) + LFD f11, 11 * SIZE(X) + + LFD f16, 8 * SIZE(Y) + LFD f17, 9 * SIZE(Y) + LFD f18, 10 * SIZE(Y) + LFD f19, 11 * SIZE(Y) + + FMADD f24, f12, f20, f24 + FMADD f25, f13, f21, f25 + FMADD f26, f13, f20, f26 + FMADD f27, f12, f21, f27 + + FMADD f28, f14, f22, f28 + FMADD f29, f15, f23, f29 + FMADD f30, f15, f22, f30 + FMADD f31, f14, f23, f31 + + LFD f12, 12 * SIZE(X) + LFD f13, 13 * SIZE(X) + LFD f14, 14 * SIZE(X) + LFD f15, 15 * SIZE(X) + + LFD f20, 12 * SIZE(Y) + LFD f21, 13 * SIZE(Y) + LFD f22, 14 * SIZE(Y) + LFD f23, 15 * SIZE(Y) + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + FMADD f3, f8, f17, f3 + + FMADD f4, f10, f18, f4 + FMADD f5, f11, f19, f5 + FMADD f6, f11, f18, f6 + FMADD f7, f10, f19, f7 + + LFD f8, 16 * SIZE(X) + LFD f9, 17 * SIZE(X) + LFD f10, 18 * SIZE(X) + LFD f11, 19 * SIZE(X) + + LFD f16, 16 * SIZE(Y) + LFD f17, 17 * SIZE(Y) + LFD f18, 18 * SIZE(Y) + LFD f19, 19 * SIZE(Y) + + FMADD f24, f12, f20, f24 + FMADD f25, f13, f21, f25 + FMADD f26, f13, f20, f26 + FMADD f27, f12, f21, f27 + + FMADD f28, f14, f22, f28 + FMADD f29, f15, f23, f29 + FMADD f30, f15, f22, f30 + FMADD f31, f14, f23, f31 + + LFD f12, 20 * SIZE(X) + LFD f13, 21 * SIZE(X) + LFD f14, 22 * SIZE(X) + LFD f15, 23 * SIZE(X) + + LFD f20, 20 * SIZE(Y) + LFD f21, 21 * SIZE(Y) + LFD f22, 22 * SIZE(Y) + LFD f23, 23 * SIZE(Y) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#ifdef L1_DUALFETCH + L1_PREFETCH Y, PREA +#endif +#endif + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + +#ifdef POWER6 + L1_PREFETCH X, PREA +#ifdef L1_DUALFETCH + L1_PREFETCH Y, PREA +#endif +#endif + bdnz LL(10) + .align 4 + +LL(20): + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + FMADD f3, f8, f17, f3 + + FMADD f4, f10, f18, f4 + FMADD f5, f11, f19, f5 + FMADD f6, f11, f18, f6 + FMADD f7, f10, f19, f7 + + LFD f8, 8 * SIZE(X) + LFD f9, 9 * SIZE(X) + LFD f10, 10 * SIZE(X) + LFD f11, 11 * SIZE(X) + + LFD f16, 8 * SIZE(Y) + LFD f17, 9 * SIZE(Y) + LFD f18, 10 * SIZE(Y) + LFD f19, 11 * SIZE(Y) + + FMADD f24, f12, f20, f24 + FMADD f25, f13, f21, f25 + FMADD f26, f13, f20, f26 + FMADD f27, f12, f21, f27 + + FMADD f28, f14, f22, f28 + FMADD f29, f15, f23, f29 + FMADD f30, f15, f22, f30 + FMADD f31, f14, f23, f31 + + LFD f12, 12 * SIZE(X) + LFD f13, 13 * SIZE(X) + LFD f14, 14 * SIZE(X) + LFD f15, 15 * SIZE(X) + + LFD f20, 12 * SIZE(Y) + LFD f21, 13 * SIZE(Y) + LFD f22, 14 * SIZE(Y) + LFD f23, 15 * SIZE(Y) + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + FMADD f3, f8, f17, f3 + + FMADD f4, f10, f18, f4 + FMADD f5, f11, f19, f5 + FMADD f6, f11, f18, f6 + FMADD f7, f10, f19, f7 + + FMADD f24, f12, f20, f24 + FMADD f25, f13, f21, f25 + FMADD f26, f13, f20, f26 + FMADD f27, f12, f21, f27 + + FMADD f28, f14, f22, f28 + FMADD f29, f15, f23, f29 + FMADD f30, f15, f22, f30 + FMADD f31, f14, f23, f31 + + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + LFD f16, 0 * SIZE(Y) + LFD f17, 1 * SIZE(Y) + + addi X, X, 2 * SIZE + addi Y, Y, 2 * SIZE + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + FMADD f3, f8, f17, f3 + + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): +#ifdef F_INTERFACE + cmpwi cr0, INCX, 0 + bge+ LL(102) + + subi r0, N, 1 + mullw r0, r0, INCX + sub X, X, r0 + .align 4 + +LL(102): + cmpwi cr0, INCY, 0 + bge+ LL(104) + + subi r0, N, 1 + mullw r0, r0, INCY + sub Y, Y, r0 + .align 4 + +LL(104): +#endif + sub X, X, INCXM1 + sub Y, Y, INCYM1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDX f8, X, INCXM1 + LFDX f16, Y, INCYM1 + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDX f10, X, INCXM1 + LFDX f18, Y, INCYM1 + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + LFDX f12, X, INCXM1 + LFDX f20, Y, INCYM1 + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDX f14, X, INCXM1 + LFDX f22, Y, INCYM1 + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + bdz LL(120) + .align 4 + +LL(110): + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + FMADD f3, f8, f17, f3 + + FMADD f4, f10, f18, f4 + FMADD f5, f11, f19, f5 + FMADD f6, f11, f18, f6 + FMADD f7, f10, f19, f7 + + LFDX f8, X, INCXM1 + LFDX f16, Y, INCYM1 + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDX f10, X, INCXM1 + LFDX f18, Y, INCYM1 + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + FMADD f24, f12, f20, f24 + FMADD f25, f13, f21, f25 + FMADD f26, f13, f20, f26 + FMADD f27, f12, f21, f27 + + FMADD f28, f14, f22, f28 + FMADD f29, f15, f23, f29 + FMADD f30, f15, f22, f30 + FMADD f31, f14, f23, f31 + + LFDX f12, X, INCXM1 + LFDX f20, Y, INCYM1 + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDX f14, X, INCXM1 + LFDX f22, Y, INCYM1 + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + FMADD f3, f8, f17, f3 + + FMADD f4, f10, f18, f4 + FMADD f5, f11, f19, f5 + FMADD f6, f11, f18, f6 + FMADD f7, f10, f19, f7 + + LFDX f8, X, INCXM1 + LFDX f16, Y, INCYM1 + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDX f10, X, INCXM1 + LFDX f18, Y, INCYM1 + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + FMADD f24, f12, f20, f24 + FMADD f25, f13, f21, f25 + FMADD f26, f13, f20, f26 + FMADD f27, f12, f21, f27 + + FMADD f28, f14, f22, f28 + FMADD f29, f15, f23, f29 + FMADD f30, f15, f22, f30 + FMADD f31, f14, f23, f31 + + LFDX f12, X, INCXM1 + LFDX f20, Y, INCYM1 + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDX f14, X, INCXM1 + LFDX f22, Y, INCYM1 + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + bdnz LL(110) + .align 4 + +LL(120): + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + FMADD f3, f8, f17, f3 + + FMADD f4, f10, f18, f4 + FMADD f5, f11, f19, f5 + FMADD f6, f11, f18, f6 + FMADD f7, f10, f19, f7 + + LFDX f8, X, INCXM1 + LFDX f16, Y, INCYM1 + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDX f10, X, INCXM1 + LFDX f18, Y, INCYM1 + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + FMADD f24, f12, f20, f24 + FMADD f25, f13, f21, f25 + FMADD f26, f13, f20, f26 + FMADD f27, f12, f21, f27 + + FMADD f28, f14, f22, f28 + FMADD f29, f15, f23, f29 + FMADD f30, f15, f22, f30 + FMADD f31, f14, f23, f31 + + LFDX f12, X, INCXM1 + LFDX f20, Y, INCYM1 + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDX f14, X, INCXM1 + LFDX f22, Y, INCYM1 + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + FMADD f3, f8, f17, f3 + + FMADD f4, f10, f18, f4 + FMADD f5, f11, f19, f5 + FMADD f6, f11, f18, f6 + FMADD f7, f10, f19, f7 + + FMADD f24, f12, f20, f24 + FMADD f25, f13, f21, f25 + FMADD f26, f13, f20, f26 + FMADD f27, f12, f21, f27 + + FMADD f28, f14, f22, f28 + FMADD f29, f15, f23, f29 + FMADD f30, f15, f22, f30 + FMADD f31, f14, f23, f31 + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + LFDX f16, Y, INCYM1 + LFDUX f17, Y, INCY + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + FMADD f3, f8, f17, f3 + bdnz LL(160) + .align 4 + +LL(999): + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + + FADD f24, f28, f24 + FADD f25, f29, f25 + FADD f26, f30, f26 + FADD f27, f31, f27 + + FADD f0, f0, f24 + FADD f1, f1, f25 + FADD f2, f2, f26 + FADD f3, f3, f27 + +#ifndef CONJ + FSUB f1, f0, f1 + FADD f2, f2, f3 +#else + FADD f1, f0, f1 + FSUB f2, f3, f2 +#endif + +#if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) + STFD f1, 0 * SIZE(RESULT) + STFD f2, 1 * SIZE(RESULT) +#endif + +#if defined(F_INTERFACE) && defined(F_INTERFACE_GFORT) +#ifndef __64BIT__ +#ifndef DOUBLE + stfs f1, 144(SP) + stfs f2, 148(SP) + lwz r3, 144(SP) + lwz r4, 148(SP) +#else + stfd f1, 144(SP) + stfd f2, 152(SP) + lwz r3, 144(SP) + lwz r4, 148(SP) + lwz r5, 152(SP) + lwz r6, 156(SP) +#endif +#else +#ifndef DOUBLE + stfs f1, 144(SP) + stfs f2, 148(SP) + ld r3, 144(SP) +#else + stfd f1, 144(SP) + stfd f2, 152(SP) + ld r3, 144(SP) + ld r4, 152(SP) +#endif +#endif +#endif + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zdot_cell.S b/kernel/power/zdot_cell.S new file mode 100644 index 0000000..66b7dfa --- /dev/null +++ b/kernel/power/zdot_cell.S @@ -0,0 +1,617 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) +#define RESULT r3 +#define N r4 +#define X r5 +#define INCX r6 +#define Y r7 +#define INCY r8 +#define PREA r9 +#else +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 +#define PREA r8 +#endif + +#define INCXM1 r10 +#define INCYM1 r11 + +#define FZERO f0 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + + subi INCXM1, INCX, SIZE + subi INCYM1, INCY, SIZE + + fmr f1, FZERO + fmr f2, FZERO + fmr f3, FZERO + fmr f4, FZERO + fmr f5, FZERO + fmr f6, FZERO + fmr f7, FZERO + + fmr f24, FZERO + fmr f25, FZERO + fmr f26, FZERO + fmr f27, FZERO + fmr f28, FZERO + fmr f29, FZERO + fmr f30, FZERO + fmr f31, FZERO + + li PREA, 16 * 10 * SIZE + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(100) + cmpwi cr0, INCY, 2 * SIZE + bne- cr0, LL(100) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + LFD f10, 2 * SIZE(X) + LFD f11, 3 * SIZE(X) + + LFD f16, 0 * SIZE(Y) + LFD f17, 1 * SIZE(Y) + LFD f18, 2 * SIZE(Y) + LFD f19, 3 * SIZE(Y) + + LFD f12, 4 * SIZE(X) + LFD f13, 5 * SIZE(X) + LFD f14, 6 * SIZE(X) + LFD f15, 7 * SIZE(X) + + LFD f20, 4 * SIZE(Y) + LFD f21, 5 * SIZE(Y) + LFD f23, 7 * SIZE(Y) + bdz LL(20) + .align 4 + +LL(10): + FMADD f0, f8, f16, f0 + LFD f22, 6 * SIZE(Y) + FMADD f3, f8, f17, f3 + LFD f8, 8 * SIZE(X) + FMADD f1, f9, f17, f1 + LFD f17, 9 * SIZE(Y) + FMADD f2, f9, f16, f2 + LFD f9, 9 * SIZE(X) + + FMADD f4, f10, f18, f4 + LFD f16, 8 * SIZE(Y) + FMADD f7, f10, f19, f7 + LFD f10, 10 * SIZE(X) + FMADD f5, f11, f19, f5 + LFD f19, 11 * SIZE(Y) + FMADD f6, f11, f18, f6 + LFD f11, 11 * SIZE(X) + + + FMADD f24, f12, f20, f24 + LFD f18, 10 * SIZE(Y) + FMADD f27, f12, f21, f27 + LFD f12, 12 * SIZE(X) + FMADD f25, f13, f21, f25 + LFD f21, 13 * SIZE(Y) + FMADD f26, f13, f20, f26 + LFD f13, 13 * SIZE(X) + + FMADD f28, f14, f22, f28 + LFD f20, 12 * SIZE(Y) + FMADD f31, f14, f23, f31 + LFD f14, 14 * SIZE(X) + FMADD f29, f15, f23, f29 + LFD f23, 15 * SIZE(Y) + FMADD f30, f15, f22, f30 + LFD f15, 15 * SIZE(X) + + FMADD f0, f8, f16, f0 + LFD f22, 14 * SIZE(Y) + FMADD f3, f8, f17, f3 + LFD f8, 16 * SIZE(X) + FMADD f1, f9, f17, f1 + LFD f17, 17 * SIZE(Y) + FMADD f2, f9, f16, f2 + LFD f9, 17 * SIZE(X) + + FMADD f4, f10, f18, f4 + LFD f16, 16 * SIZE(Y) + FMADD f7, f10, f19, f7 + LFD f10, 18 * SIZE(X) + FMADD f5, f11, f19, f5 + LFD f19, 19 * SIZE(Y) + FMADD f6, f11, f18, f6 + LFD f11, 19 * SIZE(X) + + FMADD f24, f12, f20, f24 + LFD f18, 18 * SIZE(Y) + FMADD f27, f12, f21, f27 + LFD f12, 20 * SIZE(X) + FMADD f25, f13, f21, f25 + LFD f21, 21 * SIZE(Y) + FMADD f26, f13, f20, f26 + LFD f13, 21 * SIZE(X) + + FMADD f28, f14, f22, f28 + LFD f20, 20 * SIZE(Y) + FMADD f31, f14, f23, f31 + LFD f14, 22 * SIZE(X) + FMADD f29, f15, f23, f29 + LFD f23, 23 * SIZE(Y) + FMADD f30, f15, f22, f30 + LFD f15, 23 * SIZE(X) + + dcbt X, PREA + addi X, X, 16 * SIZE + dcbt Y, PREA + addi Y, Y, 16 * SIZE + bdnz LL(10) + .align 4 + +LL(20): + FMADD f0, f8, f16, f0 + LFD f22, 6 * SIZE(Y) + FMADD f3, f8, f17, f3 + LFD f8, 8 * SIZE(X) + FMADD f1, f9, f17, f1 + LFD f17, 9 * SIZE(Y) + FMADD f2, f9, f16, f2 + LFD f9, 9 * SIZE(X) + + FMADD f4, f10, f18, f4 + LFD f16, 8 * SIZE(Y) + FMADD f7, f10, f19, f7 + LFD f10, 10 * SIZE(X) + FMADD f5, f11, f19, f5 + LFD f19, 11 * SIZE(Y) + FMADD f6, f11, f18, f6 + LFD f11, 11 * SIZE(X) + + FMADD f24, f12, f20, f24 + LFD f18, 10 * SIZE(Y) + FMADD f27, f12, f21, f27 + LFD f12, 12 * SIZE(X) + FMADD f25, f13, f21, f25 + LFD f21, 13 * SIZE(Y) + FMADD f26, f13, f20, f26 + LFD f13, 13 * SIZE(X) + + FMADD f28, f14, f22, f28 + LFD f20, 12 * SIZE(Y) + FMADD f31, f14, f23, f31 + LFD f14, 14 * SIZE(X) + FMADD f29, f15, f23, f29 + LFD f23, 15 * SIZE(Y) + FMADD f30, f15, f22, f30 + LFD f15, 15 * SIZE(X) + + FMADD f0, f8, f16, f0 + LFD f22, 14 * SIZE(Y) + FMADD f3, f8, f17, f3 + addi X, X, 16 * SIZE + FMADD f1, f9, f17, f1 + addi Y, Y, 16 * SIZE + FMADD f2, f9, f16, f2 + nop + + FMADD f4, f10, f18, f4 + FMADD f7, f10, f19, f7 + FMADD f5, f11, f19, f5 + FMADD f6, f11, f18, f6 + + FMADD f24, f12, f20, f24 + FMADD f27, f12, f21, f27 + FMADD f25, f13, f21, f25 + FMADD f26, f13, f20, f26 + + FMADD f28, f14, f22, f28 + FMADD f31, f14, f23, f31 + FMADD f29, f15, f23, f29 + FMADD f30, f15, f22, f30 + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + LFD f16, 0 * SIZE(Y) + LFD f17, 1 * SIZE(Y) + + addi X, X, 2 * SIZE + addi Y, Y, 2 * SIZE + + FMADD f0, f8, f16, f0 + FMADD f3, f8, f17, f3 + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): +#ifdef F_INTERFACE + cmpwi cr0, INCX, 0 + bge+ LL(102) + + subi r0, N, 1 + mullw r0, r0, INCX + sub X, X, r0 + .align 4 + +LL(102): + cmpwi cr0, INCY, 0 + bge+ LL(104) + + subi r0, N, 1 + mullw r0, r0, INCY + sub Y, Y, r0 + .align 4 + +LL(104): +#endif + sub X, X, INCXM1 + sub Y, Y, INCYM1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDX f8, X, INCXM1 + LFDX f16, Y, INCYM1 + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDX f10, X, INCXM1 + LFDX f18, Y, INCYM1 + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + LFDX f12, X, INCXM1 + LFDX f20, Y, INCYM1 + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDX f14, X, INCXM1 + LFDUX f15, X, INCX + bdz LL(120) + .align 4 + +LL(110): + FMADD f0, f8, f16, f0 + LFDX f22, Y, INCYM1 + FMADD f3, f8, f17, f3 + LFDX f8, X, INCXM1 + FMADD f1, f9, f17, f1 + LFDUX f23, Y, INCY + FMADD f2, f9, f16, f2 + LFDUX f9, X, INCX + + FMADD f4, f10, f18, f4 + LFDX f16, Y, INCYM1 + FMADD f7, f10, f19, f7 + LFDX f10, X, INCXM1 + FMADD f5, f11, f19, f5 + LFDUX f17, Y, INCY + FMADD f6, f11, f18, f6 + LFDUX f11, X, INCX + + FMADD f24, f12, f20, f24 + LFDX f18, Y, INCYM1 + FMADD f27, f12, f21, f27 + LFDX f12, X, INCXM1 + FMADD f25, f13, f21, f25 + LFDUX f19, Y, INCY + FMADD f26, f13, f20, f26 + LFDUX f13, X, INCX + + FMADD f28, f14, f22, f28 + LFDX f20, Y, INCYM1 + FMADD f31, f14, f23, f31 + LFDX f14, X, INCXM1 + FMADD f29, f15, f23, f29 + LFDUX f21, Y, INCY + FMADD f30, f15, f22, f30 + LFDUX f15, X, INCX + + FMADD f0, f8, f16, f0 + LFDX f22, Y, INCYM1 + FMADD f3, f8, f17, f3 + LFDX f8, X, INCXM1 + FMADD f1, f9, f17, f1 + LFDUX f23, Y, INCY + FMADD f2, f9, f16, f2 + LFDUX f9, X, INCX + + FMADD f4, f10, f18, f4 + LFDX f16, Y, INCYM1 + FMADD f7, f10, f19, f7 + LFDX f10, X, INCXM1 + FMADD f5, f11, f19, f5 + LFDUX f17, Y, INCY + FMADD f6, f11, f18, f6 + LFDUX f11, X, INCX + + FMADD f24, f12, f20, f24 + LFDX f18, Y, INCYM1 + FMADD f27, f12, f21, f27 + LFDX f12, X, INCXM1 + FMADD f25, f13, f21, f25 + LFDUX f19, Y, INCY + FMADD f26, f13, f20, f26 + LFDUX f13, X, INCX + + FMADD f28, f14, f22, f28 + LFDX f20, Y, INCYM1 + FMADD f31, f14, f23, f31 + LFDX f14, X, INCXM1 + FMADD f29, f15, f23, f29 + LFDUX f21, Y, INCY + FMADD f30, f15, f22, f30 + LFDUX f15, X, INCX + + bdnz LL(110) + .align 4 + +LL(120): + FMADD f0, f8, f16, f0 + LFDX f22, Y, INCYM1 + FMADD f3, f8, f17, f3 + LFDX f8, X, INCXM1 + FMADD f1, f9, f17, f1 + LFDUX f23, Y, INCY + FMADD f2, f9, f16, f2 + LFDUX f9, X, INCX + + FMADD f4, f10, f18, f4 + LFDX f16, Y, INCYM1 + FMADD f7, f10, f19, f7 + LFDX f10, X, INCXM1 + FMADD f5, f11, f19, f5 + LFDUX f17, Y, INCY + FMADD f6, f11, f18, f6 + LFDUX f11, X, INCX + + FMADD f24, f12, f20, f24 + LFDX f18, Y, INCYM1 + FMADD f27, f12, f21, f27 + LFDX f12, X, INCXM1 + FMADD f25, f13, f21, f25 + LFDUX f19, Y, INCY + FMADD f26, f13, f20, f26 + LFDUX f13, X, INCX + + FMADD f28, f14, f22, f28 + LFDX f20, Y, INCYM1 + FMADD f31, f14, f23, f31 + LFDX f14, X, INCXM1 + FMADD f29, f15, f23, f29 + LFDUX f21, Y, INCY + FMADD f30, f15, f22, f30 + LFDUX f15, X, INCX + + FMADD f0, f8, f16, f0 + LFDX f22, Y, INCYM1 + FMADD f3, f8, f17, f3 + LFDUX f23, Y, INCY + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + + FMADD f4, f10, f18, f4 + FMADD f7, f10, f19, f7 + FMADD f5, f11, f19, f5 + FMADD f6, f11, f18, f6 + + FMADD f24, f12, f20, f24 + FMADD f27, f12, f21, f27 + FMADD f25, f13, f21, f25 + FMADD f26, f13, f20, f26 + + FMADD f28, f14, f22, f28 + FMADD f31, f14, f23, f31 + FMADD f29, f15, f23, f29 + FMADD f30, f15, f22, f30 + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + LFDX f16, Y, INCYM1 + LFDUX f17, Y, INCY + + FMADD f0, f8, f16, f0 + FMADD f3, f8, f17, f3 + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + bdnz LL(160) + .align 4 + +LL(999): + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + + FADD f24, f28, f24 + FADD f25, f29, f25 + FADD f26, f30, f26 + FADD f27, f31, f27 + + FADD f0, f0, f24 + FADD f1, f1, f25 + FADD f2, f2, f26 + FADD f3, f3, f27 + +#ifndef CONJ + FSUB f1, f0, f1 + FADD f2, f2, f3 +#else + FADD f1, f0, f1 + FSUB f2, f3, f2 +#endif + +#if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) + STFD f1, 0 * SIZE(RESULT) + STFD f2, 1 * SIZE(RESULT) +#endif + +#if defined(F_INTERFACE) && defined(F_INTERFACE_GFORT) +#ifndef __64BIT__ +#ifndef DOUBLE + stfs f1, 144(SP) + stfs f2, 148(SP) + lwz r3, 144(SP) + lwz r4, 148(SP) +#else + stfd f1, 144(SP) + stfd f2, 152(SP) + lwz r3, 144(SP) + lwz r4, 148(SP) + lwz r5, 152(SP) + lwz r6, 156(SP) +#endif +#else +#ifndef DOUBLE + stfs f1, 144(SP) + stfs f2, 148(SP) + ld r3, 144(SP) +#else + stfd f1, 144(SP) + stfd f2, 152(SP) + ld r3, 144(SP) + ld r4, 152(SP) +#endif +#endif +#endif + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zdot_hummer.S b/kernel/power/zdot_hummer.S new file mode 100644 index 0000000..83027cf --- /dev/null +++ b/kernel/power/zdot_hummer.S @@ -0,0 +1,529 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) +#define RESULT r3 +#define N r4 +#define X r5 +#define INCX r6 +#define Y r7 +#define INCY r8 +#else +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 +#endif + +#define INCX2 r9 +#define INCY2 r10 + +#define C1 f1 +#define C2 f2 +#define C3 f0 +#define C4 f3 +#define C5 f4 +#define C6 f5 +#define C7 f6 +#define C8 f7 + +#define A1 f8 +#define A2 f9 +#define A3 f10 +#define A4 f11 +#define A5 f12 +#define A6 f13 +#define A7 f14 +#define A8 f15 + +#define B1 f16 +#define B2 f17 +#define B3 f18 +#define B4 f19 +#define B5 f20 +#define B6 f21 +#define B7 f22 +#define B8 f23 + +#ifndef CONJ +#define FXCXNPMA fxcxnpma +#else +#define FXCXNPMA fxcxnsma +#endif + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + li r10, 0 + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + lfpdx C1, SP, r10 # Zero clear + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + fpmr C2, C1 + + slwi INCY, INCY, BASE_SHIFT + fpmr C3, C1 + add INCY2, INCY, INCY + fpmr C4, C1 + + fpmr C5, C1 + fpmr C6, C1 + fpmr C7, C1 + fpmr C8, C1 + + cmpwi cr0, N, 0 + ble LL(99) + +#ifdef F_INTERFACE + cmpwi cr0, INCX, 0 + bge+ LL(05) + + subi r0, N, 1 + mullw r0, r0, INCX2 + sub X, X, r0 + .align 4 + +LL(05): + cmpwi cr0, INCY, 0 + bge+ LL(06) + + subi r0, N, 1 + mullw r0, r0, INCY2 + sub Y, Y, r0 + .align 4 + +LL(06): +#endif + + andi. r0, X, 2 * SIZE - 1 + bne LL(100) + andi. r0, Y, 2 * SIZE - 1 + bne LL(100) + +/* X is aligned, Y is aligned */ +LL(10): + sub X, X, INCX2 + sub Y, Y, INCY2 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + + LFPDUX A5, X, INCX2 + LFPDUX B5, Y, INCY2 + LFPDUX A6, X, INCX2 + LFPDUX B6, Y, INCY2 + + LFPDUX A7, X, INCX2 + LFPDUX B7, Y, INCY2 + LFPDUX A8, X, INCX2 + bdz LL(14) + .align 4 + +LL(13): + fxcpmadd C1, A1, B1, C1 + LFPDUX B8, Y, INCY2 + FXCXNPMA C2, A1, B1, C2 + LFPDUX A1, X, INCX2 + fxcpmadd C3, A2, B2, C3 + LFPDUX B1, Y, INCY2 + FXCXNPMA C4, A2, B2, C4 + LFPDUX A2, X, INCX2 + + fxcpmadd C5, A3, B3, C5 + LFPDUX B2, Y, INCY2 + FXCXNPMA C6, A3, B3, C6 + LFPDUX A3, X, INCX2 + fxcpmadd C7, A4, B4, C7 + LFPDUX B3, Y, INCY2 + FXCXNPMA C8, A4, B4, C8 + LFPDUX A4, X, INCX2 + + fxcpmadd C1, A5, B5, C1 + LFPDUX B4, Y, INCY2 + FXCXNPMA C2, A5, B5, C2 + LFPDUX A5, X, INCX2 + fxcpmadd C3, A6, B6, C3 + LFPDUX B5, Y, INCY2 + FXCXNPMA C4, A6, B6, C4 + LFPDUX A6, X, INCX2 + + fxcpmadd C5, A7, B7, C5 + LFPDUX B6, Y, INCY2 + FXCXNPMA C6, A7, B7, C6 + LFPDUX A7, X, INCX2 + fxcpmadd C7, A8, B8, C7 + LFPDUX B7, Y, INCY2 + FXCXNPMA C8, A8, B8, C8 + LFPDUX A8, X, INCX2 + bdnz LL(13) + .align 4 + +LL(14): + LFPDUX B8, Y, INCY2 + fxcpmadd C1, A1, B1, C1 + FXCXNPMA C2, A1, B1, C2 + fxcpmadd C3, A2, B2, C3 + FXCXNPMA C4, A2, B2, C4 + + fxcpmadd C5, A3, B3, C5 + FXCXNPMA C6, A3, B3, C6 + fxcpmadd C7, A4, B4, C7 + FXCXNPMA C8, A4, B4, C8 + + fxcpmadd C1, A5, B5, C1 + FXCXNPMA C2, A5, B5, C2 + fxcpmadd C3, A6, B6, C3 + FXCXNPMA C4, A6, B6, C4 + + fxcpmadd C5, A7, B7, C5 + FXCXNPMA C6, A7, B7, C6 + fxcpmadd C7, A8, B8, C7 + FXCXNPMA C8, A8, B8, C8 + .align 4 + +LL(15): + andi. r0, N, 7 + beq LL(99) + + andi. r0, N, 4 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + + fxcpmadd C1, A1, B1, C1 + FXCXNPMA C2, A1, B1, C2 + fxcpmadd C3, A2, B2, C3 + FXCXNPMA C4, A2, B2, C4 + + fxcpmadd C5, A3, B3, C5 + FXCXNPMA C6, A3, B3, C6 + fxcpmadd C7, A4, B4, C7 + FXCXNPMA C8, A4, B4, C8 + .align 4 + +LL(16): + andi. r0, N, 2 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + + fxcpmadd C1, A1, B1, C1 + FXCXNPMA C2, A1, B1, C2 + fxcpmadd C3, A2, B2, C3 + FXCXNPMA C4, A2, B2, C4 + .align 4 + +LL(17): + andi. r0, N, 1 + beq LL(99) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + + fxcpmadd C1, A1, B1, C1 + FXCXNPMA C2, A1, B1, C2 + .align 4 + +LL(99): + li r10, 16 + + fpadd C1, C1, C5 + lfpdux f23, SP, r10 + fpadd C2, C2, C6 + lfpdux f22, SP, r10 + fpadd C3, C3, C7 + lfpdux f21, SP, r10 + fpadd C4, C4, C8 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + fpadd C1, C1, C3 + lfpdux f17, SP, r10 + fpadd C2, C2, C4 + lfpdux f16, SP, r10 + + fpadd C1, C1, C2 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + fsmtp C2, C1 + +#if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) + STFD C1, 0 * SIZE(RESULT) + STFD C2, 1 * SIZE(RESULT) +#endif + addi SP, SP, 16 + blr + .align 4 + + +/* X is aligned, Y is NOT aligned */ + +LL(100): + subi INCX2, INCX2, SIZE + subi INCY2, INCY2, SIZE + + li INCX, SIZE + li INCY, SIZE + + sub X, X, INCX2 + sub Y, Y, INCY2 + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX2 + LFDUX B1, Y, INCY2 + LFDUX A2, X, INCX + LFDUX B2, Y, INCY + + LFDUX A3, X, INCX2 + LFDUX B3, Y, INCY2 + LFDUX A4, X, INCX + LFDUX B4, Y, INCY + + LFDUX A5, X, INCX2 + LFDUX B5, Y, INCY2 + LFDUX A6, X, INCX + LFDUX B6, Y, INCY + + LFDUX A7, X, INCX2 + LFDUX B7, Y, INCY2 + LFDUX A8, X, INCX + bdz LL(104) + .align 4 + +LL(103): + fmadd C1, A1, B1, C1 + LFDUX B8, Y, INCY + fmadd C2, A1, B2, C2 + LFDUX A1, X, INCX2 + + fmadd C3, A2, B1, C3 + LFDUX B1, Y, INCY2 + fmadd C4, A2, B2, C4 + LFDUX A2, X, INCX + + fmadd C5, A3, B3, C5 + LFDUX B2, Y, INCY + fmadd C6, A3, B4, C6 + LFDUX A3, X, INCX2 + + fmadd C7, A4, B3, C7 + LFDUX B3, Y, INCY2 + fmadd C8, A4, B4, C8 + LFDUX A4, X, INCX + + fmadd C1, A5, B5, C1 + LFDUX B4, Y, INCY + fmadd C2, A5, B6, C2 + LFDUX A5, X, INCX2 + + fmadd C3, A6, B5, C3 + LFDUX B5, Y, INCY2 + fmadd C4, A6, B6, C4 + LFDUX A6, X, INCX + + fmadd C5, A7, B7, C5 + LFDUX B6, Y, INCY + fmadd C6, A7, B8, C6 + LFDUX A7, X, INCX2 + + fmadd C7, A8, B7, C7 + LFDUX B7, Y, INCY2 + fmadd C8, A8, B8, C8 + LFDUX A8, X, INCX + + bdnz LL(103) + .align 4 + +LL(104): + LFDUX B8, Y, INCY + fmadd C1, A1, B1, C1 + fmadd C2, A1, B2, C2 + fmadd C3, A2, B1, C3 + fmadd C4, A2, B2, C4 + + fmadd C5, A3, B3, C5 + fmadd C6, A3, B4, C6 + fmadd C7, A4, B3, C7 + fmadd C8, A4, B4, C8 + + fmadd C1, A5, B5, C1 + fmadd C2, A5, B6, C2 + fmadd C3, A6, B5, C3 + fmadd C4, A6, B6, C4 + + fmadd C5, A7, B7, C5 + fmadd C6, A7, B8, C6 + fmadd C7, A8, B7, C7 + fmadd C8, A8, B8, C8 + .align 4 + +LL(105): + andi. r0, N, 3 + beq LL(999) + + andi. r0, N, 2 + beq LL(107) + + LFDUX A1, X, INCX2 + LFDUX B1, Y, INCY2 + LFDUX A2, X, INCX + LFDUX B2, Y, INCY + + LFDUX A3, X, INCX2 + LFDUX B3, Y, INCY2 + LFDUX A4, X, INCX + LFDUX B4, Y, INCY + + fmadd C1, A1, B1, C1 + fmadd C2, A1, B2, C2 + fmadd C3, A2, B1, C3 + fmadd C4, A2, B2, C4 + + fmadd C5, A3, B3, C5 + fmadd C6, A3, B4, C6 + fmadd C7, A4, B3, C7 + fmadd C8, A4, B4, C8 + .align 4 + +LL(107): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + LFDUX B1, Y, INCY2 + + LFDUX A2, X, INCX + LFDUX B2, Y, INCY + + fmadd C1, A1, B1, C1 + fmadd C2, A1, B2, C2 + fmadd C3, A2, B1, C3 + fmadd C4, A2, B2, C4 + .align 4 + +LL(999): + li r10, 16 + + fadd C1, C1, C5 + lfpdux f23, SP, r10 + fadd C2, C2, C6 + lfpdux f22, SP, r10 + fadd C3, C3, C7 + lfpdux f21, SP, r10 + fadd C4, C4, C8 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + +#ifndef CONJ + FSUB C1, C1, C4 + FADD C2, C2, C3 +#else + FADD C1, C1, C4 + FSUB C2, C2, C3 +#endif + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + +#if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) + STFD C1, 0 * SIZE(RESULT) + STFD C2, 1 * SIZE(RESULT) +#endif + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/zdot_ppc440.S b/kernel/power/zdot_ppc440.S new file mode 100644 index 0000000..3340e65 --- /dev/null +++ b/kernel/power/zdot_ppc440.S @@ -0,0 +1,441 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) +#define RESULT r3 +#define N r4 +#define X r5 +#define INCX r6 +#define Y r7 +#define INCY r8 +#define PRE r9 +#else +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 +#define PRE r8 +#endif + +#define INCXM1 r10 +#define INCYM1 r11 + +#define FZERO f0 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stw r0, 144(SP) + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + + subi INCXM1, INCX, SIZE + subi INCYM1, INCY, SIZE + + fmr f1, FZERO + fmr f2, FZERO + fmr f3, FZERO + fmr f4, FZERO + fmr f5, FZERO + fmr f6, FZERO + fmr f7, FZERO + + fmr f24, FZERO + fmr f25, FZERO + fmr f26, FZERO + fmr f27, FZERO + fmr f28, FZERO + fmr f29, FZERO + fmr f30, FZERO + fmr f31, FZERO + + li PRE, 3 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(999) + +#ifdef F_INTERFACE + cmpwi cr0, INCX, 0 + bge+ LL(102) + + subi r0, N, 1 + mullw r0, r0, INCX + sub X, X, r0 + .align 4 + +LL(102): + cmpwi cr0, INCY, 0 + bge+ LL(104) + + subi r0, N, 1 + mullw r0, r0, INCY + sub Y, Y, r0 + .align 4 + +LL(104): +#endif + sub X, X, INCXM1 + sub Y, Y, INCYM1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDX f8, X, INCXM1 + LFDX f16, Y, INCYM1 + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDX f10, X, INCXM1 + LFDX f18, Y, INCYM1 + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + LFDX f12, X, INCXM1 + LFDX f20, Y, INCYM1 + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDX f14, X, INCXM1 + LFDUX f15, X, INCX + bdz LL(120) + .align 4 + +LL(110): + FMADD f0, f8, f16, f0 + LFDX f22, Y, INCYM1 +#ifdef PPCG4 + dcbt X, PRE +#endif + FMADD f3, f8, f17, f3 + LFDX f8, X, INCXM1 + FMADD f1, f9, f17, f1 + LFDUX f23, Y, INCY + FMADD f2, f9, f16, f2 + LFDUX f9, X, INCX + + FMADD f4, f10, f18, f4 + LFDX f16, Y, INCYM1 +#ifdef PPCG4 + dcbt Y, PRE +#endif + FMADD f7, f10, f19, f7 + LFDX f10, X, INCXM1 + FMADD f5, f11, f19, f5 + LFDUX f17, Y, INCY + FMADD f6, f11, f18, f6 + LFDUX f11, X, INCX + + FMADD f24, f12, f20, f24 + LFDX f18, Y, INCYM1 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + FMADD f27, f12, f21, f27 + LFDX f12, X, INCXM1 + FMADD f25, f13, f21, f25 + LFDUX f19, Y, INCY + FMADD f26, f13, f20, f26 + LFDUX f13, X, INCX + + FMADD f28, f14, f22, f28 + LFDX f20, Y, INCYM1 +#if defined(PPCG4) && defined(DOUBLE) + dcbt Y, PRE +#endif + FMADD f31, f14, f23, f31 + LFDX f14, X, INCXM1 + FMADD f29, f15, f23, f29 + LFDUX f21, Y, INCY + FMADD f30, f15, f22, f30 + LFDUX f15, X, INCX + + FMADD f0, f8, f16, f0 + LFDX f22, Y, INCYM1 +#ifdef PPCG4 + dcbt X, PRE +#endif + FMADD f3, f8, f17, f3 + LFDX f8, X, INCXM1 + FMADD f1, f9, f17, f1 + LFDUX f23, Y, INCY + FMADD f2, f9, f16, f2 + LFDUX f9, X, INCX + + FMADD f4, f10, f18, f4 + LFDX f16, Y, INCYM1 +#ifdef PPCG4 + dcbt Y, PRE +#endif + FMADD f7, f10, f19, f7 + LFDX f10, X, INCXM1 + FMADD f5, f11, f19, f5 + LFDUX f17, Y, INCY + FMADD f6, f11, f18, f6 + LFDUX f11, X, INCX + + FMADD f24, f12, f20, f24 + LFDX f18, Y, INCYM1 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + FMADD f27, f12, f21, f27 + LFDX f12, X, INCXM1 + FMADD f25, f13, f21, f25 + LFDUX f19, Y, INCY + FMADD f26, f13, f20, f26 + LFDUX f13, X, INCX + + FMADD f28, f14, f22, f28 + LFDX f20, Y, INCYM1 +#if defined(PPCG4) && defined(DOUBLE) + dcbt Y, PRE +#endif + FMADD f31, f14, f23, f31 + LFDX f14, X, INCXM1 + FMADD f29, f15, f23, f29 + LFDUX f21, Y, INCY + FMADD f30, f15, f22, f30 + LFDUX f15, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + FMADD f0, f8, f16, f0 + LFDX f22, Y, INCYM1 + FMADD f3, f8, f17, f3 + LFDX f8, X, INCXM1 + FMADD f1, f9, f17, f1 + LFDUX f23, Y, INCY + FMADD f2, f9, f16, f2 + LFDUX f9, X, INCX + + FMADD f4, f10, f18, f4 + LFDX f16, Y, INCYM1 + FMADD f7, f10, f19, f7 + LFDX f10, X, INCXM1 + FMADD f5, f11, f19, f5 + LFDUX f17, Y, INCY + FMADD f6, f11, f18, f6 + LFDUX f11, X, INCX + + FMADD f24, f12, f20, f24 + LFDX f18, Y, INCYM1 + FMADD f27, f12, f21, f27 + LFDX f12, X, INCXM1 + FMADD f25, f13, f21, f25 + LFDUX f19, Y, INCY + FMADD f26, f13, f20, f26 + LFDUX f13, X, INCX + + FMADD f28, f14, f22, f28 + LFDX f20, Y, INCYM1 + FMADD f31, f14, f23, f31 + LFDX f14, X, INCXM1 + FMADD f29, f15, f23, f29 + LFDUX f21, Y, INCY + FMADD f30, f15, f22, f30 + LFDUX f15, X, INCX + + LFDX f22, Y, INCYM1 + FMADD f0, f8, f16, f0 + LFDUX f23, Y, INCY + FMADD f3, f8, f17, f3 + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + + FMADD f4, f10, f18, f4 + FMADD f7, f10, f19, f7 + FMADD f5, f11, f19, f5 + FMADD f6, f11, f18, f6 + + FMADD f24, f12, f20, f24 + FMADD f27, f12, f21, f27 + FMADD f25, f13, f21, f25 + FMADD f26, f13, f20, f26 + + FMADD f28, f14, f22, f28 + FMADD f31, f14, f23, f31 + FMADD f29, f15, f23, f29 + FMADD f30, f15, f22, f30 + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + LFDX f16, Y, INCYM1 + LFDUX f17, Y, INCY + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + FMADD f3, f8, f17, f3 + bdnz LL(160) + .align 4 + +LL(999): + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + + FADD f24, f28, f24 + FADD f25, f29, f25 + FADD f26, f30, f26 + FADD f27, f31, f27 + + FADD f0, f0, f24 + FADD f1, f1, f25 + FADD f2, f2, f26 + FADD f3, f3, f27 + +#ifndef CONJ + FSUB f1, f0, f1 + FADD f2, f2, f3 +#else + FADD f1, f0, f1 + FSUB f2, f3, f2 +#endif + +#if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) + STFD f1, 0 * SIZE(RESULT) + STFD f2, 1 * SIZE(RESULT) +#endif + +#if defined(F_INTERFACE) && defined(F_INTERFACE_GFORT) +#ifndef __64BIT__ +#ifndef DOUBLE + stfs f1, 144(SP) + stfs f2, 148(SP) + lwz r3, 144(SP) + lwz r4, 148(SP) +#else + stfd f1, 144(SP) + stfd f2, 152(SP) + lwz r3, 144(SP) + lwz r4, 148(SP) + lwz r5, 152(SP) + lwz r6, 156(SP) +#endif +#else +#ifndef DOUBLE + stfs f1, 144(SP) + stfs f2, 148(SP) + ld r3, 144(SP) +#else + stfd f1, 144(SP) + stfd f2, 152(SP) + ld r3, 144(SP) + ld r4, 152(SP) +#endif +#endif +#endif + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zgemm_beta.S b/kernel/power/zgemm_beta.S new file mode 100644 index 0000000..c936a3d --- /dev/null +++ b/kernel/power/zgemm_beta.S @@ -0,0 +1,249 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M r3 +#define N r4 +#define C r10 +#define LDC r11 +#define J r5 +#define PRE r6 +#define CO1 r7 + +#define ALPHA_R f30 +#define ALPHA_I f31 + +#define STACKSIZE 32 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f30, 0(SP) + stfd f31, 8(SP) + stw r0, 16(SP) + +#ifdef linux +#ifndef __64BIT__ + lwz LDC, 8 + STACKSIZE(SP) +#else + ld C, 120 + STACKSIZE(SP) + ld LDC, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld C, 120 + STACKSIZE(SP) + ld LDC, 128 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz C, 68 + STACKSIZE(SP) + lwz LDC, 72 + STACKSIZE(SP) +#else + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#endif +#endif +#endif + + + slwi LDC, LDC, ZBASE_SHIFT + + lfs f0, 16(SP) + + fmr ALPHA_R, f1 + fmr ALPHA_I, f2 + + cmpwi cr0, M, 0 + ble- LL(999) + cmpwi cr0, N, 0 + ble- LL(999) + + mr J, N + fcmpu cr7, f1, f0 + bne cr7, LL(20) + fcmpu cr7, f2, f0 + bne cr7, LL(20) + .align 4 + +LL(10): + mr CO1, C + add C, C, LDC + addi PRE, 0, 32 * SIZE + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(15) + .align 4 + +LL(12): + STFD f0, 0 * SIZE(CO1) + STFD f0, 1 * SIZE(CO1) + STFD f0, 2 * SIZE(CO1) + STFD f0, 3 * SIZE(CO1) + STFD f0, 4 * SIZE(CO1) + STFD f0, 5 * SIZE(CO1) + STFD f0, 6 * SIZE(CO1) + STFD f0, 7 * SIZE(CO1) + STFD f0, 8 * SIZE(CO1) + STFD f0, 9 * SIZE(CO1) + STFD f0, 10 * SIZE(CO1) + STFD f0, 11 * SIZE(CO1) + STFD f0, 12 * SIZE(CO1) + STFD f0, 13 * SIZE(CO1) + STFD f0, 14 * SIZE(CO1) + STFD f0, 15 * SIZE(CO1) + + dcbst PRE, CO1 + addi CO1, CO1, 16 * SIZE + bdnz LL(12) + .align 4 + +LL(15): + andi. r0, M, 7 + mtspr CTR, r0 + beq LL(19) + .align 4 + +LL(16): + STFD f0, 0 * SIZE(CO1) + STFD f0, 1 * SIZE(CO1) + addi CO1, CO1, 2 * SIZE + bdnz LL(16) + .align 4 + +LL(19): + addic. J, J, -1 + bgt LL(10) + b LL(999) + .align 4 + +LL(20): + mr CO1, C + add C, C, LDC + addi PRE, 0, 16 * SIZE + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(25) + .align 4 + +LL(22): + LFD f3, 0 * SIZE(CO1) + LFD f4, 1 * SIZE(CO1) + LFD f5, 2 * SIZE(CO1) + LFD f6, 3 * SIZE(CO1) + LFD f7, 4 * SIZE(CO1) + LFD f8, 5 * SIZE(CO1) + LFD f9, 6 * SIZE(CO1) + LFD f10, 7 * SIZE(CO1) + + FMUL f0, ALPHA_I, f4 + FMUL f4, ALPHA_R, f4 + FMUL f11, ALPHA_I, f6 + FMUL f6, ALPHA_R, f6 + + FMUL f12, ALPHA_I, f8 + FMUL f8, ALPHA_R, f8 + FMUL f13, ALPHA_I, f10 + FMUL f10, ALPHA_R, f10 + + FMADD f4, ALPHA_I, f3, f4 + FMSUB f3, ALPHA_R, f3, f0 + FMADD f6, ALPHA_I, f5, f6 + FMSUB f5, ALPHA_R, f5, f11 + + FMADD f8, ALPHA_I, f7, f8 + FMSUB f7, ALPHA_R, f7, f12 + FMADD f10, ALPHA_I, f9, f10 + FMSUB f9, ALPHA_R, f9, f13 + + STFD f3, 0 * SIZE(CO1) + STFD f4, 1 * SIZE(CO1) + STFD f5, 2 * SIZE(CO1) + STFD f6, 3 * SIZE(CO1) + STFD f7, 4 * SIZE(CO1) + STFD f8, 5 * SIZE(CO1) + STFD f9, 6 * SIZE(CO1) + STFD f10, 7 * SIZE(CO1) + + addi CO1, CO1, 8 * SIZE + dcbtst PRE, CO1 + bdnz LL(22) + .align 4 + +LL(25): + andi. r0, M, 3 + mtspr CTR, r0 + ble LL(29) + .align 4 + +LL(26): + LFD f0, 0 * SIZE(CO1) + LFD f1, 1 * SIZE(CO1) + + FMUL f5, ALPHA_I, f1 + FMUL f1, ALPHA_R, f1 + FMADD f1, ALPHA_I, f0, f1 + FMSUB f0, ALPHA_R, f0, f5 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + addi CO1, CO1, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(29): + addic. J, J, -1 + bgt LL(20) + .align 4 + +LL(999): + li r3, 0 + lfd f30, 0(SP) + lfd f31, 8(SP) + addi SP, SP, STACKSIZE + + blr + EPILOGUE diff --git a/kernel/power/zgemm_kernel.S b/kernel/power/zgemm_kernel.S new file mode 100644 index 0000000..5fef0da --- /dev/null +++ b/kernel/power/zgemm_kernel.S @@ -0,0 +1,1837 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define TEMP r22 +#define KK r23 +#define I r24 +#define J r25 +#define AO r26 +#define BO r27 +#define CO1 r28 +#define CO2 r29 + +#define PREA r30 +#define PREC r31 +#define PREB PREA + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) +#ifdef TRMMKERNEL + std r23, 208(SP) + std r22, 216(SP) +#endif +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) +#ifdef TRMMKERNEL + stw r23, 176(SP) + stw r22, 180(SP) +#endif +#endif + + stfd f1, ALPHA_R + stfd f2, ALPHA_I + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST +#ifdef PPC970 + li PREC, 4 * SIZE +#endif +#ifdef POWER4 + li PREC, 4 * SIZE /* is 12 best? */ +#endif +#ifdef POWER5 + li PREC, 4 * SIZE /* is 12 best? */ +#endif +#else + +#ifdef linux +#ifndef __64BIT__ + lwz PREA, 16 + STACKSIZE(SP) + lwz PREC, 20 + STACKSIZE(SP) +#else + ld PREA, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 72 + STACKSIZE(SP) + lwz PREC, 76 + STACKSIZE(SP) +#else + lwz PREA, 68 + STACKSIZE(SP) + lwz PREC, 72 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST +#ifdef PPC970 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 1 * SIZE) + li PREB, (16 * 5 * SIZE) +#else + li PREA, (16 * 15 * SIZE) + li PREB, (16 * 8 * SIZE) +#endif +#endif +#ifdef POWER4 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 1 * SIZE) + li PREB, (16 * 1 * SIZE) +#else + li PREA, (16 * 2 * SIZE) + li PREB, (16 * 2 * SIZE) +#endif +#endif +#ifdef POWER5 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 7 * SIZE) + li PREB, (16 * 7 * SIZE) +#else + li PREA, (16 * 12 * SIZE) + li PREB, (16 * 6 * SIZE) +#endif +#endif +#endif + + lfs f0, FZERO + + srawi. J, N, 1 + ble LL(30) + .align 4 + +LL(10): + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + srawi. I, M, 1 + mr AO, A + ble LL(20) + .align 4 + +LL(11): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + +#ifdef POWER5 + LFD f28, 4 * SIZE(B) + LFD f29, 5 * SIZE(B) + LFD f30, 6 * SIZE(B) + LFD f31, 7 * SIZE(B) +#endif + + DCBTST(CO1, PREC) + nop + nop + DCBTST(CO2, PREC) + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(15) +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + +#ifdef POWER5 + LFD f28, 4 * SIZE(B) + LFD f29, 5 * SIZE(B) + LFD f30, 6 * SIZE(B) + LFD f31, 7 * SIZE(B) +#endif + mr BO, B +#else + slwi r0, KK, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, r0 + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + +#ifdef POWER5 + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) +#endif +#endif + + DCBTST(CO1, PREC) + nop + nop + DCBTST(CO2, PREC) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble LL(15) +#endif + .align 4 + +LL(12): + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + +#if defined(ALLOC_HUGETLB) && !defined(POWER5) + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) +#endif + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + +#if !defined(ALLOC_HUGETLB) && !defined(POWER5) + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) +#endif + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f24, f28, f0 + FMADD f5, f25, f29, f5 + FMADD f10, f26, f30, f10 + FMADD f15, f27, f31, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f1, f25, f28, f1 + FMADD f2, f26, f28, f2 + FMADD f3, f27, f28, f3 + FMADD f4, f24, f29, f4 + + FMADD f6, f26, f29, f6 + FMADD f7, f27, f29, f7 + FMADD f8, f24, f30, f8 + FMADD f9, f25, f30, f9 + + FMADD f11, f27, f30, f11 + FMADD f12, f24, f31, f12 + FMADD f13, f25, f31, f13 + FMADD f14, f26, f31, f14 + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + +#ifndef POWER5 + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) +#else + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) +#endif + + FMADD f0, f24, f28, f0 + FMADD f5, f25, f29, f5 + FMADD f10, f26, f30, f10 + FMADD f15, f27, f31, f15 + +#ifndef POWER5 + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) +#else + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) +#endif + + FMADD f1, f25, f28, f1 + FMADD f2, f26, f28, f2 + FMADD f3, f27, f28, f3 + FMADD f4, f24, f29, f4 + + FMADD f6, f26, f29, f6 + FMADD f7, f27, f29, f7 + FMADD f8, f24, f30, f8 + FMADD f9, f25, f30, f9 + + FMADD f11, f27, f30, f11 + FMADD f12, f24, f31, f12 + FMADD f13, f25, f31, f13 + FMADD f14, f26, f31, f14 + +#ifdef POWER5 + LFD f28, 20 * SIZE(BO) + LFD f29, 21 * SIZE(BO) + LFD f30, 22 * SIZE(BO) + LFD f31, 23 * SIZE(BO) +#endif + + addi AO, AO, 16 * SIZE + addi BO, BO, 16 * SIZE + +#ifdef PPC970 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER4 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER5 +#ifndef ALLOC_HUGETLB + DCBT(BO, PREB) + DCBT(AO, PREA) +#endif +#endif + bdnz LL(12) + .align 4 + +LL(15): +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, r0 + ble LL(KERNEL_MainFinish) +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, TEMP + ble LL(KERNEL_MainFinish) +#endif + .align 4 + +LL(16): + fmadd f0, f16, f20, f0 + fmadd f5, f17, f21, f5 + fmadd f10, f18, f22, f10 + fmadd f15, f19, f23, f15 + + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + fmadd f4, f16, f21, f4 + + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + + fmadd f11, f19, f22, f11 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + fmadd f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(KERNEL_MainFinish): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) +#endif + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) +#endif + + FADD f8, f8, f13 + FSUB f9, f9, f12 + FADD f10, f10, f15 + FSUB f11, f11, f14 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) +#endif + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 + +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 + + FMADD f20, f30, f8, f20 + FMADD f21, f30, f9, f21 + FMADD f22, f30, f10, f22 + FMADD f23, f30, f11, f23 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMUL f20, f30, f8 + FMUL f21, f30, f9 + FMUL f22, f30, f10 + FMUL f23, f30, f11 +#endif + + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + + FNMSUB f20, f31, f9, f20 + FMADD f21, f31, f8, f21 + FNMSUB f22, f31, f11, f22 + FMADD f23, f31, f10, f23 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f20, f30, f8, f20 + FNMSUB f21, f30, f9, f21 + FMADD f22, f30, f10, f22 + FNMSUB f23, f30, f11, f23 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + + FMADD f20, f31, f9, f20 + FMADD f21, f31, f8, f21 + FMADD f22, f31, f11, f22 + FMADD f23, f31, f10, f23 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMUL f20, f30, f8 + FMUL f21, f30, f9 + FMUL f22, f30, f10 + FMUL f23, f30, f11 + + FMADD f16, f31, f1, f16 + FNMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FNMADD f19, f31, f2, f19 + + FMADD f20, f31, f9, f20 + FNMADD f21, f31, f8, f21 + FMADD f22, f31, f11, f22 + FNMADD f23, f31, f10, f23 +#endif +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 2 * SIZE(CO1) + STFD f19, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f20, 0 * SIZE(CO2) + STFD f21, 1 * SIZE(CO2) + STFD f22, 2 * SIZE(CO2) + STFD f23, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -2 +#endif + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + addic. I, I, -1 + bgt LL(11) + .align 4 + +LL(20): + andi. I, M, 1 + ble LL(29) + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(25) +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble LL(25) +#endif + .align 4 + +LL(22): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi BO, BO, 16 * SIZE + addi AO, AO, 8 * SIZE + bdnz LL(22) + .align 4 + +LL(25): +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, r0 + ble LL(27) +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, TEMP + ble LL(27) +#endif + .align 4 + +LL(26): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(26) + .align 4 + +LL(27): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + +#endif + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 +#endif + + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMADD f16, f31, f1, f16 + FNMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FNMADD f19, f31, f2, f19 +#endif +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 0 * SIZE(CO2) + STFD f19, 1 * SIZE(CO2) + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +LL(29): +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 2 +#endif + + mr B, BO + addic. J, J, -1 + lfs f0, FZERO + bgt LL(10) + .align 4 + +LL(30): + andi. J, N, 1 + ble LL(999) + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + srawi. I, M, 1 + mr CO1, C + add C, C, LDC + mr AO, A + ble LL(40) + .align 4 + +LL(31): +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(35) +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + mr BO, B +#else + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 +#endif + + DCBTST(CO1, PREC) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble LL(35) +#endif + .align 4 + +LL(32): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(AO) + LFD f21, 9 * SIZE(AO) + LFD f22, 10 * SIZE(AO) + LFD f23, 11 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + LFD f16, 4 * SIZE(BO) + LFD f17, 5 * SIZE(BO) + LFD f18, 6 * SIZE(BO) + LFD f19, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(AO) + LFD f21, 17 * SIZE(AO) + LFD f22, 18 * SIZE(AO) + LFD f23, 19 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 20 * SIZE(AO) + LFD f25, 21 * SIZE(AO) + LFD f26, 22 * SIZE(AO) + LFD f27, 23 * SIZE(AO) + + LFD f16, 8 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 10 * SIZE(BO) + LFD f19, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + DCBT(AO, PREA) + DCBT(BO, PREB) + bdnz LL(32) + .align 4 + +LL(35): +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, r0 + ble LL(37) +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, TEMP + ble LL(37) +#endif + .align 4 + +LL(36): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f16, 2 * SIZE(BO) + LFD f17, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(36) + .align 4 + +LL(37): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + +#endif + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 +#endif + + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMADD f16, f31, f1, f16 + FNMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FNMADD f19, f31, f2, f19 +#endif + +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 2 * SIZE(CO1) + STFD f19, 3 * SIZE(CO1) + + addi CO1, CO1, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + addic. I, I, -1 + bgt LL(31) + .align 4 + +LL(40): + andi. I, M, 1 + ble LL(999) + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(45) +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + mr BO, B +#else + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble LL(45) +#endif + .align 4 + +LL(42): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR,r0 + ble LL(47) +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR,TEMP + ble LL(47) +#endif + .align 4 + +LL(46): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + + bdnz LL(46) + .align 4 + +LL(47): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + fsub f0, f0, f1 + fadd f2, f2, f3 +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + fadd f0, f0, f1 + fsub f2, f2, f3 +#else + fadd f0, f0, f1 + fsub f2, f3, f2 +#endif + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f2, f17 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f2 +#endif + + FNMSUB f16, f31, f2, f16 + FMADD f17, f31, f0, f17 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC) || defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f2, f17 + + FMADD f16, f31, f2, f16 + FMADD f17, f31, f0, f17 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f2 + + FMADD f16, f31, f2, f16 + FNMADD f17, f31, f0, f17 +#endif + +#endif + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) +#ifdef TRMMKERNEL + ld r23, 208(SP) + ld r22, 216(SP) +#endif +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) +#ifdef TRMMKERNEL + lwz r23, 176(SP) + lwz r22, 180(SP) +#endif +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemm_kernel_altivec.S b/kernel/power/zgemm_kernel_altivec.S new file mode 100644 index 0000000..b55300e --- /dev/null +++ b/kernel/power/zgemm_kernel_altivec.S @@ -0,0 +1,1703 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 360 +#else +#define STACKSIZE 272 +#endif + +#define ALIGN_SIZE 0xffff +#define SWAP 0 +#define NEG 16 +#define ALPHA_R 32 +#define ALPHA_I 48 +#define FZERO 64 + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#endif +#endif + +#define STACK r11 + +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 + +#define PREA r29 +#define PREB r29 +#define PREC r30 +#define VREG r31 + +#define LOAD_A lvx +#define LOAD_B lvx + +#define OFFSET_0 0 +#define OFFSET_1 r14 +#define OFFSET_2 r15 +#define OFFSET_3 r16 +#define OFFSET_4 r17 +#define OFFSET_5 r18 +#define OFFSET_6 r19 +#define OFFSET_7 r20 + +#define c01 v0 +#define c02 v1 +#define c03 v2 +#define c04 v3 +#define c05 v4 +#define c06 v5 +#define c07 v6 +#define c08 v7 +#define c09 v8 +#define c10 v9 +#define c11 v10 +#define c12 v11 +#define c13 v12 +#define c14 v13 +#define c15 v14 +#define c16 v15 + +#define a1 v16 +#define a2 v17 +#define a3 v18 +#define a4 v19 +#define a5 v20 +#define a6 v21 +#define a7 v22 +#define a8 v23 + +#define b1 v24 +#define b2 v25 +#define bp1 v26 +#define bp2 v27 + +#define C1 v16 +#define C2 v17 +#define C3 v18 +#define C4 v19 +#define C5 v20 + +#define c00 v24 + +#define VZERO v25 +#define PERMRSHIFT1 v26 +#define PERMRSHIFT2 v27 + +#define swap v28 +#define neg v29 +#define alpha_r v30 +#define alpha_i v31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + mr STACK, SP + + li r0, 0 * 16 + stvx v20, SP, r0 + li r0, 1 * 16 + stvx v21, SP, r0 + li r0, 2 * 16 + stvx v22, SP, r0 + li r0, 3 * 16 + stvx v23, SP, r0 + li r0, 4 * 16 + stvx v24, SP, r0 + li r0, 5 * 16 + stvx v25, SP, r0 + li r0, 6 * 16 + stvx v26, SP, r0 + li r0, 7 * 16 + stvx v27, SP, r0 + li r0, 8 * 16 + stvx v28, SP, r0 + li r0, 9 * 16 + stvx v29, SP, r0 + li r0, 10 * 16 + stvx v30, SP, r0 + li r0, 11 * 16 + stvx v31, SP, r0 + +#ifdef __64BIT__ + std r31, 192(SP) + std r30, 200(SP) + std r29, 208(SP) + std r28, 216(SP) + std r27, 224(SP) + std r26, 232(SP) + std r25, 240(SP) + std r24, 248(SP) + std r23, 256(SP) + std r22, 264(SP) + std r21, 272(SP) + std r20, 280(SP) + std r19, 288(SP) + std r18, 296(SP) + std r17, 304(SP) + std r16, 312(SP) + std r15, 320(SP) + std r14, 328(SP) +#else + stw r31, 192(SP) + stw r30, 196(SP) + stw r29, 200(SP) + stw r28, 204(SP) + stw r27, 208(SP) + stw r26, 212(SP) + stw r25, 216(SP) + stw r24, 220(SP) + stw r23, 224(SP) + stw r22, 228(SP) + stw r21, 232(SP) + stw r20, 236(SP) + stw r19, 240(SP) + stw r18, 244(SP) + stw r17, 248(SP) + stw r16, 252(SP) + stw r15, 256(SP) + stw r14, 260(SP) +#endif + + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifndef PREFETCHTEST +#ifdef PPC970 + li PREC, 16 * SIZE +#endif +#else + +#ifdef linux +#ifndef __64BIT__ + lwz PREB, 16 + STACKSIZE(SP) + lwz PREC, 20 + STACKSIZE(SP) +#else + ld PREB, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREB, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREB, 72 + STACKSIZE(SP) + lwz PREC, 76 + STACKSIZE(SP) +#else + lwz PREB, 68 + STACKSIZE(SP) + lwz PREC, 72 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST +#ifdef CELL + li PREB, (3 * 32 * SIZE) +#else + li PREB, (5 * 32 * SIZE) +#endif +#endif + + li r0, -1 + mfspr VREG, VRsave + + mtspr VRsave, r0 + + addi SP, SP, -128 + li r0, -8192 + + and SP, SP, r0 + + fneg f3, f1 + fneg f4, f2 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NC) || defined(TC) || defined(NR) || defined(TR) + stfs f1, ALPHA_R + 0(SP) + stfs f1, ALPHA_R + 4(SP) + stfs f1, ALPHA_R + 8(SP) + stfs f1, ALPHA_R + 12(SP) + + stfs f4, ALPHA_I + 0(SP) + stfs f2, ALPHA_I + 4(SP) + stfs f4, ALPHA_I + 8(SP) + stfs f2, ALPHA_I + 12(SP) +#else + stfs f1, ALPHA_R + 0(SP) + stfs f3, ALPHA_R + 4(SP) + stfs f1, ALPHA_R + 8(SP) + stfs f3, ALPHA_R + 12(SP) + + stfs f2, ALPHA_I + 0(SP) + stfs f2, ALPHA_I + 4(SP) + stfs f2, ALPHA_I + 8(SP) + stfs f2, ALPHA_I + 12(SP) +#endif + + li I, Address_L(0x04050607) + addis I, I, Address_H(0x04050607) + stw I, SWAP + 0(SP) + li I, Address_L(0x00010203) + addis I, I, Address_H(0x00010203) + stw I, SWAP + 4(SP) + li I, Address_L(0x0c0d0e0f) + addis I, I, Address_H(0x0c0d0e0f) + stw I, SWAP + 8(SP) + li I, Address_L(0x08090a0b) + addis I, I, Address_H(0x08090a0b) + stw I, SWAP + 12(SP) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + lis I, 0x8000 + stw I, NEG + 0(SP) + stw I, NEG + 8(SP) + li I, 0 + stw I, NEG + 4(SP) + stw I, NEG + 12(SP) +#else + li I, 0 + stw I, NEG + 0(SP) + stw I, NEG + 8(SP) + lis I, 0x8000 + stw I, NEG + 4(SP) + stw I, NEG + 12(SP) +#endif + + li r0, 0 + stw r0, FZERO(SP) + + slwi LDC, LDC, ZBASE_SHIFT + + li OFFSET_1, 4 * SIZE + li OFFSET_2, 8 * SIZE + li OFFSET_3, 12 * SIZE + li OFFSET_4, 16 * SIZE + li OFFSET_5, 20 * SIZE + li OFFSET_6, 24 * SIZE + li OFFSET_7, 28 * SIZE + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + srawi. J, N, 1 + ble LL(50) + .align 4 + +LL(01): + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + + mr AO, A + srawi. I, M, 3 + ble LL(20) + .align 4 + +LL(11): + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + LOAD_B b2, OFFSET_1, B + vxor c03, c03, c03 + LOAD_A a1, OFFSET_0, AO + vxor c04, c04, c04 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c07, c07, c07 + LOAD_A a5, OFFSET_4, AO + vxor c08, c08, c08 + + vxor c09, c09, c09 + dcbtst CO1, PREC + vxor c10, c10, c10 + dcbtst CO2, PREC + vxor c11, c11, c11 + vxor c12, c12, c12 + vxor c13, c13, c13 + mr BO, B + vxor c14, c14, c14 + srawi. r0, K, 1 + vxor c15, c15, c15 + mtspr CTR, r0 + vxor c16, c16, c16 + vspltw bp1, b1, 0 + ble LL(15) + .align 4 + +LL(12): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + DCBT(BO, PREB) + vmaddfp c03, a3, bp1, c03 + nop + vmaddfp c04, a4, bp1, c04 + LOAD_A a6, OFFSET_5, AO + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 +#ifdef CELL + DCBT(AO, PREA) +#else + nop +#endif + vmaddfp c07, a3, bp2, c07 + nop + vmaddfp c08, a4, bp2, c08 + LOAD_A a7, OFFSET_6, AO + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + LOAD_B b1, OFFSET_2, BO + vmaddfp c11, a3, bp1, c11 + nop + vmaddfp c12, a4, bp1, c12 + LOAD_A a8, OFFSET_7, AO + + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + addi AO, AO, 32 * SIZE + vmaddfp c15, a3, bp2, c15 + nop + vmaddfp c16, a4, bp2, c16 + LOAD_A a1, OFFSET_0, AO + + vmaddfp c01, a5, bp1, c01 + vspltw bp2, b2, 1 + vmaddfp c02, a6, bp1, c02 + nop + vmaddfp c03, a7, bp1, c03 + nop + vmaddfp c04, a8, bp1, c04 + LOAD_A a2, OFFSET_1, AO + + vmaddfp c05, a5, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a6, bp2, c06 + nop + vmaddfp c07, a7, bp2, c07 + nop + vmaddfp c08, a8, bp2, c08 + LOAD_A a3, OFFSET_2, AO + + vmaddfp c09, a5, bp1, c09 + vspltw bp2, b2, 3 + vmaddfp c10, a6, bp1, c10 + LOAD_B b2, OFFSET_3, BO + vmaddfp c11, a7, bp1, c11 + nop + vmaddfp c12, a8, bp1, c12 + LOAD_A a4, OFFSET_3, AO + + vmaddfp c13, a5, bp2, c13 + vspltw bp1, b1, 0 + vmaddfp c14, a6, bp2, c14 + addi BO, BO, 8 * SIZE + vmaddfp c15, a7, bp2, c15 + LOAD_A a5, OFFSET_4, AO + vmaddfp c16, a8, bp2, c16 + bdnz+ LL(12) + .align 4 + +LL(15): + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + andi. r0, K, 1 + ble+ LL(18) + .align 4 + +LL(16): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + nop + vmaddfp c03, a3, bp1, c03 + nop + vmaddfp c04, a4, bp1, c04 + nop + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + nop + vmaddfp c07, a3, bp2, c07 + nop + vmaddfp c08, a4, bp2, c08 + nop + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + addi AO, AO, 16 * SIZE + vmaddfp c11, a3, bp1, c11 + addi BO, BO, 4 * SIZE + vmaddfp c12, a4, bp1, c12 + nop + + vmaddfp c13, a1, bp2, c13 + vmaddfp c14, a2, bp2, c14 + vmaddfp c15, a3, bp2, c15 + vmaddfp c16, a4, bp2, c16 + .align 4 + +LL(18): + vxor VZERO, VZERO, VZERO + + vperm c05, c05, c05, swap + vperm c06, c06, c06, swap + vperm c07, c07, c07, swap + vperm c08, c08, c08, swap + + vperm c13, c13, c13, swap + vperm c14, c14, c14, swap + vperm c15, c15, c15, swap + vperm c16, c16, c16, swap + + vxor c05, c05, neg + vxor c06, c06, neg + vxor c07, c07, neg + vxor c08, c08, neg + + vxor c13, c13, neg + vxor c14, c14, neg + vxor c15, c15, neg + vxor c16, c16, neg + + vaddfp c01, c01, c05 + vaddfp c02, c02, c06 + vaddfp c03, c03, c07 + vaddfp c04, c04, c08 + + vaddfp c09, c09, c13 + vaddfp c10, c10, c14 + vaddfp c11, c11, c15 + vaddfp c12, c12, c16 + + vperm c05, c01, c01, swap + vperm c06, c02, c02, swap + vperm c07, c03, c03, swap + vperm c08, c04, c04, swap + + vperm c13, c09, c09, swap + vperm c14, c10, c10, swap + vperm c15, c11, c11, swap + vperm c16, c12, c12, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c02, alpha_r, c02, VZERO + vmaddfp c03, alpha_r, c03, VZERO + vmaddfp c04, alpha_r, c04, VZERO + + vmaddfp c01, alpha_i, c05, c01 + vmaddfp c02, alpha_i, c06, c02 + vmaddfp c03, alpha_i, c07, c03 + vmaddfp c04, alpha_i, c08, c04 + + vmaddfp c09, alpha_r, c09, VZERO + vmaddfp c10, alpha_r, c10, VZERO + vmaddfp c11, alpha_r, c11, VZERO + vmaddfp c12, alpha_r, c12, VZERO + + vmaddfp c09, alpha_i, c13, c09 + vmaddfp c10, alpha_i, c14, c10 + vmaddfp c11, alpha_i, c15, c11 + vmaddfp c12, alpha_i, c16, c12 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + lvx C4, OFFSET_3, CO1 + lvx C5, OFFSET_4, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + vaddfp c02, c02, C3 + vaddfp c03, c03, C4 + vaddfp c04, c04, C5 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + lvx C4, OFFSET_3, CO2 + lvx C5, OFFSET_4, CO2 + + vperm c00, VZERO, c09, PERMRSHIFT2 + vperm c09, c09, c10, PERMRSHIFT2 + vperm c10, c10, c11, PERMRSHIFT2 + vperm c11, c11, c12, PERMRSHIFT2 + vperm c12, c12, VZERO, PERMRSHIFT2 + + vaddfp c00, c00, C1 + vaddfp c09, c09, C2 + vaddfp c10, c10, C3 + vaddfp c11, c11, C4 + vaddfp c12, c12, C5 + + stvx c00, OFFSET_0, CO2 + stvx c09, OFFSET_1, CO2 + stvx c10, OFFSET_2, CO2 + stvx c11, OFFSET_3, CO2 + stvx c12, OFFSET_4, CO2 + + addi CO1, CO1, 16 * SIZE + addi CO2, CO2, 16 * SIZE + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. I, M, 4 + ble LL(30) + + vxor c01, c01, c01 + LOAD_A a1, OFFSET_0, AO + vxor c02, c02, c02 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c09, c09, c09 + LOAD_B b1, OFFSET_0, B + vxor c10, c10, c10 + LOAD_B b2, OFFSET_1, B + vxor c13, c13, c13 + vxor c14, c14, c14 + mr BO, B + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(25) + .align 4 + +LL(22): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + addi AO, AO, 16 * SIZE + vmaddfp c02, a2, bp1, c02 + addi BO, BO, 8 * SIZE + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + LOAD_B b1, OFFSET_0, BO + vmaddfp c10, a2, bp1, c10 + + vmaddfp c13, a1, bp2, c13 + LOAD_A a1, OFFSET_0, AO + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a2, OFFSET_1, AO + + vmaddfp c01, a3, bp1, c01 + vspltw bp2, b2, 1 + vmaddfp c02, a4, bp1, c02 + + vmaddfp c05, a3, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a4, bp2, c06 + + vmaddfp c09, a3, bp1, c09 + vspltw bp2, b2, 3 + LOAD_B b2, OFFSET_1, BO + vmaddfp c10, a4, bp1, c10 + + vmaddfp c13, a3, bp2, c13 + LOAD_A a3, OFFSET_2, AO + vmaddfp c14, a4, bp2, c14 + LOAD_A a4, OFFSET_3, AO + vspltw bp1, b1, 0 + bdnz LL(22) + .align 4 + +LL(25): + andi. r0, K, 1 + ble+ LL(28) + .align 4 + +LL(26): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + nop + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + nop + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + addi AO, AO, 8 * SIZE + + vmaddfp c13, a1, bp2, c13 + addi BO, BO, 4 * SIZE + vmaddfp c14, a2, bp2, c14 + nop + .align 4 + +LL(28): + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + vperm c06, c06, c06, swap + vperm c13, c13, c13, swap + vperm c14, c14, c14, swap + + vxor c05, c05, neg + vxor c06, c06, neg + vxor c13, c13, neg + vxor c14, c14, neg + + vaddfp c01, c01, c05 + vaddfp c02, c02, c06 + vaddfp c09, c09, c13 + vaddfp c10, c10, c14 + + vperm c05, c01, c01, swap + vperm c06, c02, c02, swap + vperm c13, c09, c09, swap + vperm c14, c10, c10, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c02, alpha_r, c02, VZERO + vmaddfp c01, alpha_i, c05, c01 + vmaddfp c02, alpha_i, c06, c02 + + vmaddfp c09, alpha_r, c09, VZERO + vmaddfp c10, alpha_r, c10, VZERO + vmaddfp c09, alpha_i, c13, c09 + vmaddfp c10, alpha_i, c14, c10 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + vaddfp c02, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + + vperm c00, VZERO, c09, PERMRSHIFT2 + vperm c09, c09, c10, PERMRSHIFT2 + vperm c10, c10, VZERO, PERMRSHIFT2 + + vaddfp c00, c00, C1 + vaddfp c09, c09, C2 + vaddfp c10, c10, C3 + + stvx c00, OFFSET_0, CO2 + stvx c09, OFFSET_1, CO2 + stvx c10, OFFSET_2, CO2 + + addi CO1, CO1, 8 * SIZE + addi CO2, CO2, 8 * SIZE + .align 4 + +LL(30): + andi. I, M, 2 + ble LL(40) + + vxor c01, c01, c01 + LOAD_A a1, OFFSET_0, AO + vxor c02, c02, c02 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_B b1, OFFSET_0, B + vxor c06, c06, c06 + LOAD_B b2, OFFSET_1, B + vxor c09, c09, c09 + vxor c10, c10, c10 + vxor c13, c13, c13 + vxor c14, c14, c14 + + vspltw bp1, b1, 0 + mr BO, B + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(35) + .align 4 + +LL(32): + vmaddfp c01, a1, bp1, c01 + addi AO, AO, 8 * SIZE + vspltw bp2, b1, 1 + vmaddfp c05, a1, bp2, c05 + addi BO, BO, 8 * SIZE + vspltw bp1, b1, 2 + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c13, a1, bp2, c13 + LOAD_A a1, OFFSET_0, AO + vspltw bp1, b2, 0 + LOAD_B b1, OFFSET_0, BO + + vmaddfp c02, a2, bp1, c02 + vspltw bp2, b2, 1 + vmaddfp c06, a2, bp2, c06 + vspltw bp1, b2, 2 + vmaddfp c10, a2, bp1, c10 + vspltw bp2, b2, 3 + LOAD_B b2, OFFSET_1, BO + vmaddfp c14, a2, bp2, c14 + LOAD_A a2, OFFSET_1, AO + + vspltw bp1, b1, 0 + bdnz LL(32) + .align 4 + +LL(35): + andi. r0, K, 1 + ble+ LL(38) + .align 4 + +LL(36): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c13, a1, bp2, c13 + addi AO, AO, 4 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(38): + vaddfp c01, c01, c02 + vaddfp c05, c05, c06 + vaddfp c09, c09, c10 + vaddfp c13, c13, c14 + + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + vperm c13, c13, c13, swap + + vxor c05, c05, neg + vxor c13, c13, neg + + vaddfp c01, c01, c05 + vaddfp c09, c09, c13 + + vperm c05, c01, c01, swap + vperm c13, c09, c09, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c01, alpha_i, c05, c01 + + vmaddfp c09, alpha_r, c09, VZERO + vmaddfp c09, alpha_i, c13, c09 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + + vperm c00, VZERO, c09, PERMRSHIFT2 + vperm c09, c09, VZERO, PERMRSHIFT2 + + vaddfp c00, c00, C1 + vaddfp c09, c09, C2 + + stvx c00, OFFSET_0, CO2 + stvx c09, OFFSET_1, CO2 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + .align 4 + +LL(40): + andi. I, M, 1 + ble LL(49) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(BO) + LFD f11, 1 * SIZE(BO) + LFD f12, 2 * SIZE(BO) + LFD f13, 3 * SIZE(BO) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(45) + .align 4 + +LL(42): + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f4, f8, f12, f4 + fmadd f6, f8, f13, f6 + + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + fmadd f5, f9, f12, f5 + fmadd f7, f9, f13, f7 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f4, f8, f12, f4 + fmadd f6, f8, f13, f6 + + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + fmadd f5, f9, f12, f5 + fmadd f7, f9, f13, f7 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + + LFD f10, 8 * SIZE(BO) + LFD f11, 9 * SIZE(BO) + LFD f12, 10 * SIZE(BO) + LFD f13, 11 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + andi. r0, K, 1 + ble LL(48) + .align 4 + +LL(46): + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f4, f8, f12, f4 + fmadd f6, f8, f13, f6 + + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + fmadd f5, f9, f12, f5 + fmadd f7, f9, f13, f7 + + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(48): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + fsub f0, f0, f3 + fadd f1, f1, f2 + fsub f4, f4, f7 + fadd f5, f5, f6 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + fadd f0, f0, f3 + fsub f1, f1, f2 + fadd f4, f4, f7 + fsub f5, f5, f6 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + fadd f0, f0, f3 + fsub f1, f2, f1 + fadd f4, f4, f7 + fsub f5, f6, f5 +#else /* RR, RC, CR, CC */ + fsub f0, f0, f3 + fadd f1, f1, f2 + fsub f4, f4, f7 + fadd f5, f5, f6 +#endif + + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + LFD f10, 0 * SIZE(CO2) + LFD f11, 1 * SIZE(CO2) + + lfs f12, ALPHA_R + 0(SP) + lfs f13, ALPHA_I + 4(SP) + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + fmadd f8, f12, f0, f8 + fnmsub f9, f12, f1, f9 + fmadd f10, f12, f4, f10 + fnmsub f11, f12, f5, f11 + + fmadd f8, f13, f1, f8 + fmadd f9, f13, f0, f9 + fmadd f10, f13, f5, f10 + fmadd f11, f13, f4, f11 +#else + fmadd f8, f12, f0, f8 + fmadd f9, f12, f1, f9 + fmadd f10, f12, f4, f10 + fmadd f11, f12, f5, f11 + + fnmsub f8, f13, f1, f8 + fmadd f9, f13, f0, f9 + fnmsub f10, f13, f5, f10 + fmadd f11, f13, f4, f11 +#endif + + STFD f8, 0 * SIZE(CO1) + STFD f9, 1 * SIZE(CO1) + STFD f10, 0 * SIZE(CO2) + STFD f11, 1 * SIZE(CO2) + +LL(49): + mr B, BO + + addic. J, J, -1 + bgt LL(01) + .align 4 + +LL(50): + andi. J, N, 1 + ble LL(999) + + mr CO1, C + mr AO, A + + srawi. I, M, 3 + ble LL(70) + .align 4 + +LL(61): + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + vxor c03, c03, c03 + LOAD_A a1, OFFSET_0, AO + vxor c04, c04, c04 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c07, c07, c07 + vxor c08, c08, c08 + + mr BO, B + dcbtst CO1, PREC + dcbtst CO2, PREC + + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(65) + .align 4 + +LL(62): + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + + vmaddfp c01, a5, bp1, c01 + vspltw bp2, b1, 3 + vmaddfp c02, a6, bp1, c02 + vmaddfp c03, a7, bp1, c03 + vmaddfp c04, a8, bp1, c04 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c05, a5, bp2, c05 + vmaddfp c06, a6, bp2, c06 + vmaddfp c07, a7, bp2, c07 + vmaddfp c08, a8, bp2, c08 + + addi AO, AO, 32 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + bdnz LL(62) + .align 4 + +LL(65): + andi. r0, K, 1 + ble+ LL(68) + .align 4 + +LL(66): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + addi AO, AO, 16 * SIZE + vmaddfp c03, a3, bp1, c03 + addi BO, BO, 2 * SIZE + vmaddfp c04, a4, bp1, c04 + nop + + vmaddfp c05, a1, bp2, c05 + vmaddfp c06, a2, bp2, c06 + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + .align 4 + +LL(68): + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + vperm c06, c06, c06, swap + vperm c07, c07, c07, swap + vperm c08, c08, c08, swap + + vxor c05, c05, neg + vxor c06, c06, neg + vxor c07, c07, neg + vxor c08, c08, neg + + vaddfp c01, c01, c05 + vaddfp c02, c02, c06 + vaddfp c03, c03, c07 + vaddfp c04, c04, c08 + + vperm c05, c01, c01, swap + vperm c06, c02, c02, swap + vperm c07, c03, c03, swap + vperm c08, c04, c04, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c02, alpha_r, c02, VZERO + vmaddfp c03, alpha_r, c03, VZERO + vmaddfp c04, alpha_r, c04, VZERO + + vmaddfp c01, alpha_i, c05, c01 + vmaddfp c02, alpha_i, c06, c02 + vmaddfp c03, alpha_i, c07, c03 + vmaddfp c04, alpha_i, c08, c04 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + lvx C4, OFFSET_3, CO1 + lvx C5, OFFSET_4, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + vaddfp c02, c02, C3 + vaddfp c03, c03, C4 + vaddfp c04, c04, C5 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + addi CO1, CO1, 16 * SIZE + addic. I, I, -1 + bgt+ LL(61) + .align 4 + +LL(70): + andi. I, M, 4 + ble LL(80) + + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + vxor c03, c03, c03 + LOAD_A a1, OFFSET_0, AO + vxor c04, c04, c04 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c07, c07, c07 + vxor c08, c08, c08 + + mr BO, B + + vspltw bp1, b1, 0 + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(75) + .align 4 + +LL(72): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + + vmaddfp c03, a3, bp1, c03 + vspltw bp2, b1, 3 + vmaddfp c04, a4, bp1, c04 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + bdnz LL(72) + .align 4 + +LL(75): + andi. r0, K, 1 + ble+ LL(78) + .align 4 + +LL(76): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + addi AO, AO, 8 * SIZE + vmaddfp c05, a1, bp2, c05 + addi BO, BO, 2 * SIZE + vmaddfp c06, a2, bp2, c06 + .align 4 + +LL(78): + vaddfp c01, c01, c03 + vaddfp c02, c02, c04 + vaddfp c05, c05, c07 + vaddfp c06, c06, c08 + + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + vperm c06, c06, c06, swap + + vxor c05, c05, neg + vxor c06, c06, neg + + vaddfp c01, c01, c05 + vaddfp c02, c02, c06 + + vperm c05, c01, c01, swap + vperm c06, c02, c02, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c02, alpha_r, c02, VZERO + vmaddfp c01, alpha_i, c05, c01 + vmaddfp c02, alpha_i, c06, c02 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + vaddfp c02, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + + addi CO1, CO1, 8 * SIZE + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(90) + + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + vxor c06, c06, c06 + + mr BO, B + + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(85) + .align 4 + +LL(82): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + + vmaddfp c02, a2, bp1, c02 + vspltw bp2, b1, 3 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c06, a2, bp2, c06 + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + bdnz LL(82) + .align 4 + +LL(85): + andi. r0, K, 1 + ble+ LL(88) + .align 4 + +LL(86): + vspltw bp2, b1, 1 + vmaddfp c01, a1, bp1, c01 + vmaddfp c05, a1, bp2, c05 + addi AO, AO, 4 * SIZE + addi BO, BO, 2 * SIZE + .align 4 + +LL(88): + vaddfp c01, c01, c02 + vaddfp c05, c05, c06 + vaddfp c09, c09, c10 + vaddfp c13, c13, c14 + + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + + vxor c05, c05, neg + + vaddfp c01, c01, c05 + + vperm c05, c01, c01, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c01, alpha_i, c05, c01 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + + addi CO1, CO1, 4 * SIZE + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(999) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(BO) + LFD f11, 1 * SIZE(BO) + LFD f12, 2 * SIZE(BO) + LFD f13, 3 * SIZE(BO) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(95) + .align 4 + +LL(92): + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + + fmadd f0, f8, f12, f0 + fmadd f2, f8, f13, f2 + fmadd f1, f9, f12, f1 + fmadd f3, f9, f13, f3 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(92) + .align 4 + +LL(95): + andi. r0, K, 1 + ble LL(98) + .align 4 + +LL(96): + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + .align 4 + +LL(98): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + fsub f0, f0, f3 + fadd f1, f1, f2 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + fadd f0, f0, f3 + fsub f1, f1, f2 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + fadd f0, f0, f3 + fsub f1, f2, f1 +#else /* RR, RC, CR, CC */ + fsub f0, f0, f3 + fadd f1, f1, f2 +#endif + + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + + lfs f12, ALPHA_R + 0(SP) + lfs f13, ALPHA_I + 4(SP) + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + fmadd f8, f12, f0, f8 + fnmsub f9, f12, f1, f9 + + fmadd f8, f13, f1, f8 + fmadd f9, f13, f0, f9 +#else + fmadd f8, f12, f0, f8 + fmadd f9, f12, f1, f9 + + fnmsub f8, f13, f1, f8 + fmadd f9, f13, f0, f9 +#endif + + STFD f8, 0 * SIZE(CO1) + STFD f9, 1 * SIZE(CO1) + .align 4 + +LL(999): + mr SP, STACK + + li r0, 0 * 16 + lvx v20, SP, r0 + li r0, 1 * 16 + lvx v21, SP, r0 + li r0, 2 * 16 + lvx v22, SP, r0 + li r0, 3 * 16 + lvx v23, SP, r0 + li r0, 4 * 16 + lvx v24, SP, r0 + li r0, 5 * 16 + lvx v25, SP, r0 + li r0, 6 * 16 + lvx v26, SP, r0 + li r0, 7 * 16 + lvx v27, SP, r0 + li r0, 8 * 16 + lvx v28, SP, r0 + li r0, 9 * 16 + lvx v29, SP, r0 + li r0, 10 * 16 + lvx v30, SP, r0 + li r0, 11 * 16 + lvx v31, SP, r0 + + mtspr VRsave, VREG + +#ifdef __64BIT__ + ld r31, 192(SP) + ld r30, 200(SP) + ld r29, 208(SP) + ld r28, 216(SP) + ld r27, 224(SP) + ld r26, 232(SP) + ld r25, 240(SP) + ld r24, 248(SP) + ld r23, 256(SP) + ld r22, 264(SP) + ld r21, 272(SP) + ld r20, 280(SP) + ld r19, 288(SP) + ld r18, 296(SP) + ld r17, 304(SP) + ld r16, 312(SP) + ld r15, 320(SP) + ld r14, 328(SP) +#else + lwz r31, 192(SP) + lwz r30, 196(SP) + lwz r29, 200(SP) + lwz r28, 204(SP) + lwz r27, 208(SP) + lwz r26, 212(SP) + lwz r25, 216(SP) + lwz r24, 220(SP) + lwz r23, 224(SP) + lwz r22, 228(SP) + lwz r21, 232(SP) + lwz r20, 236(SP) + lwz r19, 240(SP) + lwz r18, 244(SP) + lwz r17, 248(SP) + lwz r16, 252(SP) + lwz r15, 256(SP) + lwz r14, 260(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemm_kernel_altivec_cell.S b/kernel/power/zgemm_kernel_altivec_cell.S new file mode 100644 index 0000000..7b80e66 --- /dev/null +++ b/kernel/power/zgemm_kernel_altivec_cell.S @@ -0,0 +1,1858 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 360 +#else +#define STACKSIZE 272 +#endif + +#define ALIGN_SIZE 0xffff +#define SWAP 0 +#define NEG 16 +#define ALPHA_R 32 +#define ALPHA_I 48 +#define FZERO 64 + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#endif +#endif + +#define STACK r11 + +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 + +#define PREA r29 +#define PREB r29 +#define PREC r30 +#define VREG r31 + +#define LOAD_A lvx +#define LOAD_B lvx + +#define OFFSET_0 0 +#define OFFSET_1 r14 +#define OFFSET_2 r15 +#define OFFSET_3 r16 +#define OFFSET_4 r17 +#define OFFSET_5 r18 +#define OFFSET_6 r19 +#define OFFSET_7 r20 + +#define c01 v0 +#define c02 v1 +#define c03 v2 +#define c04 v3 +#define c05 v4 +#define c06 v5 +#define c07 v6 +#define c08 v7 +#define c09 v8 +#define c10 v9 +#define c11 v10 +#define c12 v11 +#define c13 v12 +#define c14 v13 +#define c15 v14 +#define c16 v15 + +#define a1 v16 +#define a2 v17 +#define a3 v18 +#define a4 v19 +#define a5 v20 +#define a6 v21 +#define a7 v22 +#define a8 v23 + +#define b1 v24 +#define b2 v25 +#define bp1 v26 +#define bp2 v27 + +#define C1 v16 +#define C2 v17 +#define C3 v18 +#define C4 v19 +#define C5 v20 + +#define c00 v24 + +#define VZERO v25 +#define PERMRSHIFT1 v26 +#define PERMRSHIFT2 v27 + +#define swap v28 +#define neg v29 +#define alpha_r v30 +#define alpha_i v31 + +#ifndef NEEDPARAM + +#ifndef DOUBLE +#include "../cparam.h" +#else +#include "../zparam.h" +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + mr STACK, SP + + li r0, 0 * 16 + stvx v20, SP, r0 + li r0, 1 * 16 + stvx v21, SP, r0 + li r0, 2 * 16 + stvx v22, SP, r0 + li r0, 3 * 16 + stvx v23, SP, r0 + li r0, 4 * 16 + stvx v24, SP, r0 + li r0, 5 * 16 + stvx v25, SP, r0 + li r0, 6 * 16 + stvx v26, SP, r0 + li r0, 7 * 16 + stvx v27, SP, r0 + li r0, 8 * 16 + stvx v28, SP, r0 + li r0, 9 * 16 + stvx v29, SP, r0 + li r0, 10 * 16 + stvx v30, SP, r0 + li r0, 11 * 16 + stvx v31, SP, r0 + +#ifdef __64BIT__ + std r31, 192(SP) + std r30, 200(SP) + std r29, 208(SP) + std r28, 216(SP) + std r27, 224(SP) + std r26, 232(SP) + std r25, 240(SP) + std r24, 248(SP) + std r23, 256(SP) + std r22, 264(SP) + std r21, 272(SP) + std r20, 280(SP) + std r19, 288(SP) + std r18, 296(SP) + std r17, 304(SP) + std r16, 312(SP) + std r15, 320(SP) + std r14, 328(SP) +#else + stw r31, 192(SP) + stw r30, 196(SP) + stw r29, 200(SP) + stw r28, 204(SP) + stw r27, 208(SP) + stw r26, 212(SP) + stw r25, 216(SP) + stw r24, 220(SP) + stw r23, 224(SP) + stw r22, 228(SP) + stw r21, 232(SP) + stw r20, 236(SP) + stw r19, 240(SP) + stw r18, 244(SP) + stw r17, 248(SP) + stw r16, 252(SP) + stw r15, 256(SP) + stw r14, 260(SP) +#endif + + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifndef PREFETCHTEST +#ifdef PPC970 + li PREC, 16 * SIZE +#endif +#else + +#ifdef linux +#ifndef __64BIT__ + lwz PREB, 16 + STACKSIZE(SP) + lwz PREC, 20 + STACKSIZE(SP) +#else + ld PREB, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREB, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREB, 72 + STACKSIZE(SP) + lwz PREC, 76 + STACKSIZE(SP) +#else + lwz PREB, 68 + STACKSIZE(SP) + lwz PREC, 72 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST +#ifdef CELL + li PREB, (3 * 32 * SIZE) +#else + li PREB, (5 * 32 * SIZE) +#endif +#endif + + li r0, -1 + mfspr VREG, VRsave + + mtspr VRsave, r0 + + addi SP, SP, -128 + li r0, -8192 + + and SP, SP, r0 + + fneg f3, f1 + fneg f4, f2 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NC) || defined(TC) || defined(NR) || defined(TR) + stfs f1, ALPHA_R + 0(SP) + stfs f1, ALPHA_R + 4(SP) + stfs f1, ALPHA_R + 8(SP) + stfs f1, ALPHA_R + 12(SP) + + stfs f4, ALPHA_I + 0(SP) + stfs f2, ALPHA_I + 4(SP) + stfs f4, ALPHA_I + 8(SP) + stfs f2, ALPHA_I + 12(SP) +#else + stfs f1, ALPHA_R + 0(SP) + stfs f3, ALPHA_R + 4(SP) + stfs f1, ALPHA_R + 8(SP) + stfs f3, ALPHA_R + 12(SP) + + stfs f2, ALPHA_I + 0(SP) + stfs f2, ALPHA_I + 4(SP) + stfs f2, ALPHA_I + 8(SP) + stfs f2, ALPHA_I + 12(SP) +#endif + + li I, Address_L(0x04050607) + addis I, I, Address_H(0x04050607) + stw I, SWAP + 0(SP) + li I, Address_L(0x00010203) + addis I, I, Address_H(0x00010203) + stw I, SWAP + 4(SP) + li I, Address_L(0x0c0d0e0f) + addis I, I, Address_H(0x0c0d0e0f) + stw I, SWAP + 8(SP) + li I, Address_L(0x08090a0b) + addis I, I, Address_H(0x08090a0b) + stw I, SWAP + 12(SP) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + lis I, 0x8000 + stw I, NEG + 0(SP) + stw I, NEG + 8(SP) + li I, 0 + stw I, NEG + 4(SP) + stw I, NEG + 12(SP) +#else + li I, 0 + stw I, NEG + 0(SP) + stw I, NEG + 8(SP) + lis I, 0x8000 + stw I, NEG + 4(SP) + stw I, NEG + 12(SP) +#endif + + li r0, 0 + stw r0, FZERO(SP) + + slwi LDC, LDC, ZBASE_SHIFT + + li OFFSET_1, 4 * SIZE + li OFFSET_2, 8 * SIZE + li OFFSET_3, 12 * SIZE + li OFFSET_4, 16 * SIZE + li OFFSET_5, 20 * SIZE + li OFFSET_6, 24 * SIZE + li OFFSET_7, 28 * SIZE + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + srawi. J, N, 1 + ble LL(50) + .align 4 + +LL(01): + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + + mr AO, A + srawi. I, M, 3 + ble LL(20) + .align 4 + +LL(11): + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + LOAD_A a1, OFFSET_0, AO + vxor c03, c03, c03 + LOAD_A a2, OFFSET_1, AO + vxor c04, c04, c04 + LOAD_A a3, OFFSET_2, AO + + vxor c04, c04, c04 + vxor c05, c05, c05 + vxor c06, c06, c06 + vxor c07, c07, c07 + vxor c08, c08, c08 + + vxor c09, c09, c09 + dcbtst CO1, PREC + vxor c10, c10, c10 + dcbtst CO2, PREC + vxor c11, c11, c11 + vxor c12, c12, c12 + vxor c13, c13, c13 + mr BO, B + vxor c14, c14, c14 + srawi. r0, K, 2 + vxor c15, c15, c15 + mtspr CTR, r0 + vxor c16, c16, c16 + vspltw bp1, b1, 0 + ble LL(13) + .align 4 + +#define NOP1 mr r3, r3 +#define NOP2 mr r4, r4 + +LL(12): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + LOAD_A a4, OFFSET_3, AO + vmaddfp c03, a3, bp1, c03 + dcbt AO, PREA + vmaddfp c04, a4, bp1, c04 + NOP2 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + NOP2 + vmaddfp c07, a3, bp2, c07 + NOP1 + vmaddfp c08, a4, bp2, c08 + dcbt BO, PREB + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + LOAD_B b2, OFFSET_1, BO + vmaddfp c11, a3, bp1, c11 + addi BO, BO, 8 * SIZE + vmaddfp c12, a4, bp1, c12 + NOP1 + + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a5, OFFSET_4, AO + vmaddfp c15, a3, bp2, c15 + LOAD_A a6, OFFSET_5, AO + vmaddfp c16, a4, bp2, c16 + vspltw bp2, b2, 1 + + vmaddfp c01, a5, bp1, c01 + LOAD_A a7, OFFSET_6, AO + vmaddfp c02, a6, bp1, c02 + LOAD_A a8, OFFSET_7, AO + vmaddfp c03, a7, bp1, c03 + NOP1 + vmaddfp c04, a8, bp1, c04 + NOP2 + + vmaddfp c05, a5, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a6, bp2, c06 + addi AO, AO, 32 * SIZE + vmaddfp c07, a7, bp2, c07 + LOAD_B b1, OFFSET_0, BO + vmaddfp c08, a8, bp2, c08 + NOP1 + + vmaddfp c09, a5, bp1, c09 + vspltw bp2, b2, 3 + vmaddfp c10, a6, bp1, c10 + NOP2 + vmaddfp c11, a7, bp1, c11 + NOP1 + vmaddfp c12, a8, bp1, c12 + dcbt AO, PREA + + vmaddfp c13, a5, bp2, c13 + vspltw bp1, b1, 0 + vmaddfp c14, a6, bp2, c14 + LOAD_A a1, OFFSET_0, AO // + vmaddfp c15, a7, bp2, c15 + LOAD_A a2, OFFSET_1, AO + vmaddfp c16, a8, bp2, c16 + vspltw bp2, b1, 1 + + vmaddfp c01, a1, bp1, c01 + LOAD_A a3, OFFSET_2, AO + vmaddfp c02, a2, bp1, c02 + LOAD_A a4, OFFSET_3, AO + vmaddfp c03, a3, bp1, c03 + NOP1 + vmaddfp c04, a4, bp1, c04 + NOP2 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + NOP2 + vmaddfp c07, a3, bp2, c07 + NOP1 + vmaddfp c08, a4, bp2, c08 + LOAD_B b2, OFFSET_1, BO + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + NOP2 + vmaddfp c11, a3, bp1, c11 + NOP1 + vmaddfp c12, a4, bp1, c12 + addi BO, BO, 8 * SIZE + + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a5, OFFSET_4, AO + vmaddfp c15, a3, bp2, c15 + LOAD_A a6, OFFSET_5, AO + vmaddfp c16, a4, bp2, c16 + vspltw bp2, b2, 1 + + vmaddfp c01, a5, bp1, c01 + LOAD_A a7, OFFSET_6, AO + vmaddfp c02, a6, bp1, c02 + LOAD_A a8, OFFSET_7, AO + vmaddfp c03, a7, bp1, c03 + addi AO, AO, 32 * SIZE + vmaddfp c04, a8, bp1, c04 + NOP2 + + vmaddfp c05, a5, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a6, bp2, c06 + NOP2 + vmaddfp c07, a7, bp2, c07 + NOP1 + vmaddfp c08, a8, bp2, c08 + LOAD_B b1, OFFSET_0, BO + + vmaddfp c09, a5, bp1, c09 + vspltw bp2, b2, 3 + vmaddfp c10, a6, bp1, c10 + LOAD_A a1, OFFSET_0, AO // + vmaddfp c11, a7, bp1, c11 + NOP2 + vmaddfp c12, a8, bp1, c12 + vspltw bp1, b1, 0 + + vmaddfp c13, a5, bp2, c13 + LOAD_A a2, OFFSET_1, AO + vmaddfp c14, a6, bp2, c14 + LOAD_A a3, OFFSET_2, AO + vmaddfp c15, a7, bp2, c15 + NOP1 + vmaddfp c16, a8, bp2, c16 + bdnz+ LL(12) + .align 4 + +LL(13): + andi. r0, K, 2 + nop + nop + ble+ LL(15) + .align 4 + + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + LOAD_A a4, OFFSET_3, AO + vmaddfp c03, a3, bp1, c03 + NOP1 + vmaddfp c04, a4, bp1, c04 + NOP2 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + NOP2 + vmaddfp c07, a3, bp2, c07 + NOP1 + vmaddfp c08, a4, bp2, c08 + LOAD_B b2, OFFSET_1, BO + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + LOAD_A a5, OFFSET_4, AO + vmaddfp c11, a3, bp1, c11 + LOAD_A a6, OFFSET_5, AO + vmaddfp c12, a4, bp1, c12 + addi BO, BO, 8 * SIZE + + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a7, OFFSET_6, AO + vmaddfp c15, a3, bp2, c15 + LOAD_A a8, OFFSET_7, AO + vmaddfp c16, a4, bp2, c16 + addi AO, AO, 32 * SIZE + + vmaddfp c01, a5, bp1, c01 + vspltw bp2, b2, 1 + vmaddfp c02, a6, bp1, c02 + NOP2 + vmaddfp c03, a7, bp1, c03 + NOP1 + vmaddfp c04, a8, bp1, c04 + NOP2 + + vmaddfp c05, a5, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a6, bp2, c06 + NOP2 + vmaddfp c07, a7, bp2, c07 + NOP1 + vmaddfp c08, a8, bp2, c08 + LOAD_B b1, OFFSET_0, BO + + vmaddfp c09, a5, bp1, c09 + vspltw bp2, b2, 3 + vmaddfp c10, a6, bp1, c10 + LOAD_A a1, OFFSET_0, AO + vmaddfp c11, a7, bp1, c11 + LOAD_A a2, OFFSET_1, AO + vmaddfp c12, a8, bp1, c12 + NOP2 + + vmaddfp c13, a5, bp2, c13 + vspltw bp1, b1, 0 + vmaddfp c14, a6, bp2, c14 + LOAD_A a3, OFFSET_2, AO + vmaddfp c15, a7, bp2, c15 + vmaddfp c16, a8, bp2, c16 + .align 4 + + +LL(15): + andi. r0, K, 1 + vxor VZERO, VZERO, VZERO + ble+ LL(18) + .align 4 + + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + LOAD_A a4, OFFSET_3, AO + vmaddfp c03, a3, bp1, c03 + nop + vmaddfp c04, a4, bp1, c04 + nop + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + nop + vmaddfp c07, a3, bp2, c07 + nop + vmaddfp c08, a4, bp2, c08 + nop + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + addi AO, AO, 16 * SIZE + vmaddfp c11, a3, bp1, c11 + addi BO, BO, 4 * SIZE + vmaddfp c12, a4, bp1, c12 + nop + + vmaddfp c13, a1, bp2, c13 + vmaddfp c14, a2, bp2, c14 + vmaddfp c15, a3, bp2, c15 + vmaddfp c16, a4, bp2, c16 + .align 4 + +LL(18): + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vxor VZERO, VZERO, VZERO + + vperm c05, c05, c05, swap + vperm c06, c06, c06, swap + vperm c07, c07, c07, swap + vperm c08, c08, c08, swap + + vperm c13, c13, c13, swap + vperm c14, c14, c14, swap + vperm c15, c15, c15, swap + vperm c16, c16, c16, swap + + vxor c05, c05, neg + vxor c06, c06, neg + vxor c07, c07, neg + vxor c08, c08, neg + + vxor c13, c13, neg + vxor c14, c14, neg + vxor c15, c15, neg + vxor c16, c16, neg + + vaddfp c01, c01, c05 + vaddfp c02, c02, c06 + vaddfp c03, c03, c07 + vaddfp c04, c04, c08 + + vaddfp c09, c09, c13 + vaddfp c10, c10, c14 + vaddfp c11, c11, c15 + vaddfp c12, c12, c16 + + vperm c05, c01, c01, swap + vperm c06, c02, c02, swap + vperm c07, c03, c03, swap + vperm c08, c04, c04, swap + + vperm c13, c09, c09, swap + vperm c14, c10, c10, swap + vperm c15, c11, c11, swap + vperm c16, c12, c12, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c02, alpha_r, c02, VZERO + vmaddfp c03, alpha_r, c03, VZERO + vmaddfp c04, alpha_r, c04, VZERO + + vmaddfp c01, alpha_i, c05, c01 + vmaddfp c02, alpha_i, c06, c02 + vmaddfp c03, alpha_i, c07, c03 + vmaddfp c04, alpha_i, c08, c04 + + vmaddfp c09, alpha_r, c09, VZERO + vmaddfp c10, alpha_r, c10, VZERO + vmaddfp c11, alpha_r, c11, VZERO + vmaddfp c12, alpha_r, c12, VZERO + + vmaddfp c09, alpha_i, c13, c09 + vmaddfp c10, alpha_i, c14, c10 + vmaddfp c11, alpha_i, c15, c11 + vmaddfp c12, alpha_i, c16, c12 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + lvx C4, OFFSET_3, CO1 + lvx C5, OFFSET_4, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + vaddfp c02, c02, C3 + vaddfp c03, c03, C4 + vaddfp c04, c04, C5 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + lvx C4, OFFSET_3, CO2 + lvx C5, OFFSET_4, CO2 + + vperm c00, VZERO, c09, PERMRSHIFT2 + vperm c09, c09, c10, PERMRSHIFT2 + vperm c10, c10, c11, PERMRSHIFT2 + vperm c11, c11, c12, PERMRSHIFT2 + vperm c12, c12, VZERO, PERMRSHIFT2 + + vaddfp c00, c00, C1 + vaddfp c09, c09, C2 + vaddfp c10, c10, C3 + vaddfp c11, c11, C4 + vaddfp c12, c12, C5 + + stvx c00, OFFSET_0, CO2 + stvx c09, OFFSET_1, CO2 + stvx c10, OFFSET_2, CO2 + stvx c11, OFFSET_3, CO2 + stvx c12, OFFSET_4, CO2 + + addi CO1, CO1, 16 * SIZE + addi CO2, CO2, 16 * SIZE + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. I, M, 4 + ble LL(30) + + vxor c01, c01, c01 + LOAD_A a1, OFFSET_0, AO + vxor c02, c02, c02 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c09, c09, c09 + LOAD_B b1, OFFSET_0, B + vxor c10, c10, c10 + LOAD_B b2, OFFSET_1, B + vxor c13, c13, c13 + vxor c14, c14, c14 + mr BO, B + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(25) + .align 4 + +LL(22): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + addi AO, AO, 16 * SIZE + vmaddfp c02, a2, bp1, c02 + addi BO, BO, 8 * SIZE + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + LOAD_B b1, OFFSET_0, BO + vmaddfp c10, a2, bp1, c10 + + vmaddfp c13, a1, bp2, c13 + LOAD_A a1, OFFSET_0, AO + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a2, OFFSET_1, AO + + vmaddfp c01, a3, bp1, c01 + vspltw bp2, b2, 1 + vmaddfp c02, a4, bp1, c02 + + vmaddfp c05, a3, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a4, bp2, c06 + + vmaddfp c09, a3, bp1, c09 + vspltw bp2, b2, 3 + LOAD_B b2, OFFSET_1, BO + vmaddfp c10, a4, bp1, c10 + + vmaddfp c13, a3, bp2, c13 + LOAD_A a3, OFFSET_2, AO + vmaddfp c14, a4, bp2, c14 + LOAD_A a4, OFFSET_3, AO + vspltw bp1, b1, 0 + bdnz LL(22) + .align 4 + +LL(25): + andi. r0, K, 1 + ble+ LL(28) + .align 4 + +LL(26): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + nop + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + nop + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + addi AO, AO, 8 * SIZE + + vmaddfp c13, a1, bp2, c13 + addi BO, BO, 4 * SIZE + vmaddfp c14, a2, bp2, c14 + nop + .align 4 + +LL(28): + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + vperm c06, c06, c06, swap + vperm c13, c13, c13, swap + vperm c14, c14, c14, swap + + vxor c05, c05, neg + vxor c06, c06, neg + vxor c13, c13, neg + vxor c14, c14, neg + + vaddfp c01, c01, c05 + vaddfp c02, c02, c06 + vaddfp c09, c09, c13 + vaddfp c10, c10, c14 + + vperm c05, c01, c01, swap + vperm c06, c02, c02, swap + vperm c13, c09, c09, swap + vperm c14, c10, c10, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c02, alpha_r, c02, VZERO + vmaddfp c01, alpha_i, c05, c01 + vmaddfp c02, alpha_i, c06, c02 + + vmaddfp c09, alpha_r, c09, VZERO + vmaddfp c10, alpha_r, c10, VZERO + vmaddfp c09, alpha_i, c13, c09 + vmaddfp c10, alpha_i, c14, c10 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + vaddfp c02, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + + vperm c00, VZERO, c09, PERMRSHIFT2 + vperm c09, c09, c10, PERMRSHIFT2 + vperm c10, c10, VZERO, PERMRSHIFT2 + + vaddfp c00, c00, C1 + vaddfp c09, c09, C2 + vaddfp c10, c10, C3 + + stvx c00, OFFSET_0, CO2 + stvx c09, OFFSET_1, CO2 + stvx c10, OFFSET_2, CO2 + + addi CO1, CO1, 8 * SIZE + addi CO2, CO2, 8 * SIZE + .align 4 + +LL(30): + andi. I, M, 2 + ble LL(40) + + vxor c01, c01, c01 + LOAD_A a1, OFFSET_0, AO + vxor c02, c02, c02 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_B b1, OFFSET_0, B + vxor c06, c06, c06 + LOAD_B b2, OFFSET_1, B + vxor c09, c09, c09 + vxor c10, c10, c10 + vxor c13, c13, c13 + vxor c14, c14, c14 + + vspltw bp1, b1, 0 + mr BO, B + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(35) + .align 4 + +LL(32): + vmaddfp c01, a1, bp1, c01 + addi AO, AO, 8 * SIZE + vspltw bp2, b1, 1 + vmaddfp c05, a1, bp2, c05 + addi BO, BO, 8 * SIZE + vspltw bp1, b1, 2 + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c13, a1, bp2, c13 + LOAD_A a1, OFFSET_0, AO + vspltw bp1, b2, 0 + LOAD_B b1, OFFSET_0, BO + + vmaddfp c02, a2, bp1, c02 + vspltw bp2, b2, 1 + vmaddfp c06, a2, bp2, c06 + vspltw bp1, b2, 2 + vmaddfp c10, a2, bp1, c10 + vspltw bp2, b2, 3 + LOAD_B b2, OFFSET_1, BO + vmaddfp c14, a2, bp2, c14 + LOAD_A a2, OFFSET_1, AO + + vspltw bp1, b1, 0 + bdnz LL(32) + .align 4 + +LL(35): + andi. r0, K, 1 + ble+ LL(38) + .align 4 + +LL(36): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c13, a1, bp2, c13 + addi AO, AO, 4 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(38): + vaddfp c01, c01, c02 + vaddfp c05, c05, c06 + vaddfp c09, c09, c10 + vaddfp c13, c13, c14 + + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + vperm c13, c13, c13, swap + + vxor c05, c05, neg + vxor c13, c13, neg + + vaddfp c01, c01, c05 + vaddfp c09, c09, c13 + + vperm c05, c01, c01, swap + vperm c13, c09, c09, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c01, alpha_i, c05, c01 + + vmaddfp c09, alpha_r, c09, VZERO + vmaddfp c09, alpha_i, c13, c09 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + + vperm c00, VZERO, c09, PERMRSHIFT2 + vperm c09, c09, VZERO, PERMRSHIFT2 + + vaddfp c00, c00, C1 + vaddfp c09, c09, C2 + + stvx c00, OFFSET_0, CO2 + stvx c09, OFFSET_1, CO2 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + .align 4 + +LL(40): + andi. I, M, 1 + ble LL(49) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(BO) + LFD f11, 1 * SIZE(BO) + LFD f12, 2 * SIZE(BO) + LFD f13, 3 * SIZE(BO) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(45) + .align 4 + +LL(42): + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f4, f8, f12, f4 + fmadd f6, f8, f13, f6 + + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + fmadd f5, f9, f12, f5 + fmadd f7, f9, f13, f7 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f4, f8, f12, f4 + fmadd f6, f8, f13, f6 + + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + fmadd f5, f9, f12, f5 + fmadd f7, f9, f13, f7 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + + LFD f10, 8 * SIZE(BO) + LFD f11, 9 * SIZE(BO) + LFD f12, 10 * SIZE(BO) + LFD f13, 11 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + andi. r0, K, 1 + ble LL(48) + .align 4 + +LL(46): + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f4, f8, f12, f4 + fmadd f6, f8, f13, f6 + + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + fmadd f5, f9, f12, f5 + fmadd f7, f9, f13, f7 + + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(48): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + fsub f0, f0, f3 + fadd f1, f1, f2 + fsub f4, f4, f7 + fadd f5, f5, f6 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + fadd f0, f0, f3 + fsub f1, f1, f2 + fadd f4, f4, f7 + fsub f5, f5, f6 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + fadd f0, f0, f3 + fsub f1, f2, f1 + fadd f4, f4, f7 + fsub f5, f6, f5 +#else /* RR, RC, CR, CC */ + fsub f0, f0, f3 + fadd f1, f1, f2 + fsub f4, f4, f7 + fadd f5, f5, f6 +#endif + + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + LFD f10, 0 * SIZE(CO2) + LFD f11, 1 * SIZE(CO2) + + lfs f12, ALPHA_R + 0(SP) + lfs f13, ALPHA_I + 4(SP) + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + fmadd f8, f12, f0, f8 + fnmsub f9, f12, f1, f9 + fmadd f10, f12, f4, f10 + fnmsub f11, f12, f5, f11 + + fmadd f8, f13, f1, f8 + fmadd f9, f13, f0, f9 + fmadd f10, f13, f5, f10 + fmadd f11, f13, f4, f11 +#else + fmadd f8, f12, f0, f8 + fmadd f9, f12, f1, f9 + fmadd f10, f12, f4, f10 + fmadd f11, f12, f5, f11 + + fnmsub f8, f13, f1, f8 + fmadd f9, f13, f0, f9 + fnmsub f10, f13, f5, f10 + fmadd f11, f13, f4, f11 +#endif + + STFD f8, 0 * SIZE(CO1) + STFD f9, 1 * SIZE(CO1) + STFD f10, 0 * SIZE(CO2) + STFD f11, 1 * SIZE(CO2) + +LL(49): + mr B, BO + + addic. J, J, -1 + bgt LL(01) + .align 4 + +LL(50): + andi. J, N, 1 + ble LL(999) + + mr CO1, C + mr AO, A + + srawi. I, M, 3 + ble LL(70) + .align 4 + +LL(61): + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + vxor c03, c03, c03 + LOAD_A a1, OFFSET_0, AO + vxor c04, c04, c04 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c07, c07, c07 + vxor c08, c08, c08 + + mr BO, B + dcbtst CO1, PREC + dcbtst CO2, PREC + + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(65) + .align 4 + +LL(62): + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + + vmaddfp c01, a5, bp1, c01 + vspltw bp2, b1, 3 + vmaddfp c02, a6, bp1, c02 + vmaddfp c03, a7, bp1, c03 + vmaddfp c04, a8, bp1, c04 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c05, a5, bp2, c05 + vmaddfp c06, a6, bp2, c06 + vmaddfp c07, a7, bp2, c07 + vmaddfp c08, a8, bp2, c08 + + addi AO, AO, 32 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + bdnz LL(62) + .align 4 + +LL(65): + andi. r0, K, 1 + ble+ LL(68) + .align 4 + +LL(66): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + addi AO, AO, 16 * SIZE + vmaddfp c03, a3, bp1, c03 + addi BO, BO, 2 * SIZE + vmaddfp c04, a4, bp1, c04 + nop + + vmaddfp c05, a1, bp2, c05 + vmaddfp c06, a2, bp2, c06 + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + .align 4 + +LL(68): + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + vperm c06, c06, c06, swap + vperm c07, c07, c07, swap + vperm c08, c08, c08, swap + + vxor c05, c05, neg + vxor c06, c06, neg + vxor c07, c07, neg + vxor c08, c08, neg + + vaddfp c01, c01, c05 + vaddfp c02, c02, c06 + vaddfp c03, c03, c07 + vaddfp c04, c04, c08 + + vperm c05, c01, c01, swap + vperm c06, c02, c02, swap + vperm c07, c03, c03, swap + vperm c08, c04, c04, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c02, alpha_r, c02, VZERO + vmaddfp c03, alpha_r, c03, VZERO + vmaddfp c04, alpha_r, c04, VZERO + + vmaddfp c01, alpha_i, c05, c01 + vmaddfp c02, alpha_i, c06, c02 + vmaddfp c03, alpha_i, c07, c03 + vmaddfp c04, alpha_i, c08, c04 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + lvx C4, OFFSET_3, CO1 + lvx C5, OFFSET_4, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + vaddfp c02, c02, C3 + vaddfp c03, c03, C4 + vaddfp c04, c04, C5 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + addi CO1, CO1, 16 * SIZE + addic. I, I, -1 + bgt+ LL(61) + .align 4 + +LL(70): + andi. I, M, 4 + ble LL(80) + + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + vxor c03, c03, c03 + LOAD_A a1, OFFSET_0, AO + vxor c04, c04, c04 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c07, c07, c07 + vxor c08, c08, c08 + + mr BO, B + + vspltw bp1, b1, 0 + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(75) + .align 4 + +LL(72): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + + vmaddfp c03, a3, bp1, c03 + vspltw bp2, b1, 3 + vmaddfp c04, a4, bp1, c04 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + bdnz LL(72) + .align 4 + +LL(75): + andi. r0, K, 1 + ble+ LL(78) + .align 4 + +LL(76): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + addi AO, AO, 8 * SIZE + vmaddfp c05, a1, bp2, c05 + addi BO, BO, 2 * SIZE + vmaddfp c06, a2, bp2, c06 + .align 4 + +LL(78): + vaddfp c01, c01, c03 + vaddfp c02, c02, c04 + vaddfp c05, c05, c07 + vaddfp c06, c06, c08 + + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + vperm c06, c06, c06, swap + + vxor c05, c05, neg + vxor c06, c06, neg + + vaddfp c01, c01, c05 + vaddfp c02, c02, c06 + + vperm c05, c01, c01, swap + vperm c06, c02, c02, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c02, alpha_r, c02, VZERO + vmaddfp c01, alpha_i, c05, c01 + vmaddfp c02, alpha_i, c06, c02 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + vaddfp c02, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + + addi CO1, CO1, 8 * SIZE + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(90) + + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + vxor c06, c06, c06 + + mr BO, B + + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(85) + .align 4 + +LL(82): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + + vmaddfp c02, a2, bp1, c02 + vspltw bp2, b1, 3 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c06, a2, bp2, c06 + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + bdnz LL(82) + .align 4 + +LL(85): + andi. r0, K, 1 + ble+ LL(88) + .align 4 + +LL(86): + vspltw bp2, b1, 1 + vmaddfp c01, a1, bp1, c01 + vmaddfp c05, a1, bp2, c05 + addi AO, AO, 4 * SIZE + addi BO, BO, 2 * SIZE + .align 4 + +LL(88): + vaddfp c01, c01, c02 + vaddfp c05, c05, c06 + vaddfp c09, c09, c10 + vaddfp c13, c13, c14 + + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + + vxor c05, c05, neg + + vaddfp c01, c01, c05 + + vperm c05, c01, c01, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c01, alpha_i, c05, c01 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + + addi CO1, CO1, 4 * SIZE + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(999) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(BO) + LFD f11, 1 * SIZE(BO) + LFD f12, 2 * SIZE(BO) + LFD f13, 3 * SIZE(BO) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(95) + .align 4 + +LL(92): + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + + fmadd f0, f8, f12, f0 + fmadd f2, f8, f13, f2 + fmadd f1, f9, f12, f1 + fmadd f3, f9, f13, f3 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(92) + .align 4 + +LL(95): + andi. r0, K, 1 + ble LL(98) + .align 4 + +LL(96): + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + .align 4 + +LL(98): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + fsub f0, f0, f3 + fadd f1, f1, f2 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + fadd f0, f0, f3 + fsub f1, f1, f2 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + fadd f0, f0, f3 + fsub f1, f2, f1 +#else /* RR, RC, CR, CC */ + fsub f0, f0, f3 + fadd f1, f1, f2 +#endif + + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + + lfs f12, ALPHA_R + 0(SP) + lfs f13, ALPHA_I + 4(SP) + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + fmadd f8, f12, f0, f8 + fnmsub f9, f12, f1, f9 + + fmadd f8, f13, f1, f8 + fmadd f9, f13, f0, f9 +#else + fmadd f8, f12, f0, f8 + fmadd f9, f12, f1, f9 + + fnmsub f8, f13, f1, f8 + fmadd f9, f13, f0, f9 +#endif + + STFD f8, 0 * SIZE(CO1) + STFD f9, 1 * SIZE(CO1) + .align 4 + +LL(999): + mr SP, STACK + + li r0, 0 * 16 + lvx v20, SP, r0 + li r0, 1 * 16 + lvx v21, SP, r0 + li r0, 2 * 16 + lvx v22, SP, r0 + li r0, 3 * 16 + lvx v23, SP, r0 + li r0, 4 * 16 + lvx v24, SP, r0 + li r0, 5 * 16 + lvx v25, SP, r0 + li r0, 6 * 16 + lvx v26, SP, r0 + li r0, 7 * 16 + lvx v27, SP, r0 + li r0, 8 * 16 + lvx v28, SP, r0 + li r0, 9 * 16 + lvx v29, SP, r0 + li r0, 10 * 16 + lvx v30, SP, r0 + li r0, 11 * 16 + lvx v31, SP, r0 + + mtspr VRsave, VREG + +#ifdef __64BIT__ + ld r31, 192(SP) + ld r30, 200(SP) + ld r29, 208(SP) + ld r28, 216(SP) + ld r27, 224(SP) + ld r26, 232(SP) + ld r25, 240(SP) + ld r24, 248(SP) + ld r23, 256(SP) + ld r22, 264(SP) + ld r21, 272(SP) + ld r20, 280(SP) + ld r19, 288(SP) + ld r18, 296(SP) + ld r17, 304(SP) + ld r16, 312(SP) + ld r15, 320(SP) + ld r14, 328(SP) +#else + lwz r31, 192(SP) + lwz r30, 196(SP) + lwz r29, 200(SP) + lwz r28, 204(SP) + lwz r27, 208(SP) + lwz r26, 212(SP) + lwz r25, 216(SP) + lwz r24, 220(SP) + lwz r23, 224(SP) + lwz r22, 228(SP) + lwz r21, 232(SP) + lwz r20, 236(SP) + lwz r19, 240(SP) + lwz r18, 244(SP) + lwz r17, 248(SP) + lwz r16, 252(SP) + lwz r15, 256(SP) + lwz r14, 260(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemm_kernel_altivec_g4.S b/kernel/power/zgemm_kernel_altivec_g4.S new file mode 100644 index 0000000..f827348 --- /dev/null +++ b/kernel/power/zgemm_kernel_altivec_g4.S @@ -0,0 +1,1757 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 360 +#else +#define STACKSIZE 272 +#endif + +#define ALIGN_SIZE 0xffff +#define SWAP 0 +#define NEG 16 +#define ALPHA_R 32 +#define ALPHA_I 48 +#define FZERO 64 + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#endif +#endif + +#define STACK r11 + +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 + +#define PREA r29 +#define PREB r29 +#define PREC r30 +#define VREG r31 + +#define LOAD_A lvx +#define LOAD_B lvx + +#define OFFSET_0 0 +#define OFFSET_1 r14 +#define OFFSET_2 r15 +#define OFFSET_3 r16 +#define OFFSET_4 r17 +#define OFFSET_5 r18 +#define OFFSET_6 r19 +#define OFFSET_7 r20 + +#define c01 v0 +#define c02 v1 +#define c03 v2 +#define c04 v3 +#define c05 v4 +#define c06 v5 +#define c07 v6 +#define c08 v7 +#define c09 v8 +#define c10 v9 +#define c11 v10 +#define c12 v11 +#define c13 v12 +#define c14 v13 +#define c15 v14 +#define c16 v15 + +#define a1 v16 +#define a2 v17 +#define a3 v18 +#define a4 v19 +#define a5 v20 +#define a6 v21 +#define a7 v22 +#define a8 v23 + +#define b1 v24 +#define b2 v25 +#define bp1 v26 +#define bp2 v27 + +#define C1 v16 +#define C2 v17 +#define C3 v18 +#define C4 v19 +#define C5 v20 + +#define c00 v24 + +#define VZERO v25 +#define PERMRSHIFT1 v26 +#define PERMRSHIFT2 v27 + +#define swap v28 +#define neg v29 +#define alpha_r v30 +#define alpha_i v31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + mr STACK, SP + + li r0, 0 * 16 + stvx v20, SP, r0 + li r0, 1 * 16 + stvx v21, SP, r0 + li r0, 2 * 16 + stvx v22, SP, r0 + li r0, 3 * 16 + stvx v23, SP, r0 + li r0, 4 * 16 + stvx v24, SP, r0 + li r0, 5 * 16 + stvx v25, SP, r0 + li r0, 6 * 16 + stvx v26, SP, r0 + li r0, 7 * 16 + stvx v27, SP, r0 + li r0, 8 * 16 + stvx v28, SP, r0 + li r0, 9 * 16 + stvx v29, SP, r0 + li r0, 10 * 16 + stvx v30, SP, r0 + li r0, 11 * 16 + stvx v31, SP, r0 + +#ifdef __64BIT__ + std r31, 192(SP) + std r30, 200(SP) + std r29, 208(SP) + std r28, 216(SP) + std r27, 224(SP) + std r26, 232(SP) + std r25, 240(SP) + std r24, 248(SP) + std r23, 256(SP) + std r22, 264(SP) + std r21, 272(SP) + std r20, 280(SP) + std r19, 288(SP) + std r18, 296(SP) + std r17, 304(SP) + std r16, 312(SP) + std r15, 320(SP) + std r14, 328(SP) +#else + stw r31, 192(SP) + stw r30, 196(SP) + stw r29, 200(SP) + stw r28, 204(SP) + stw r27, 208(SP) + stw r26, 212(SP) + stw r25, 216(SP) + stw r24, 220(SP) + stw r23, 224(SP) + stw r22, 228(SP) + stw r21, 232(SP) + stw r20, 236(SP) + stw r19, 240(SP) + stw r18, 244(SP) + stw r17, 248(SP) + stw r16, 252(SP) + stw r15, 256(SP) + stw r14, 260(SP) +#endif + + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + + li r0, -1 + mfspr VREG, VRsave + + mtspr VRsave, r0 + + addi SP, SP, -128 + li r0, -8192 + + and SP, SP, r0 + + fneg f3, f1 + fneg f4, f2 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NC) || defined(TC) || defined(NR) || defined(TR) + stfs f1, ALPHA_R + 0(SP) + stfs f1, ALPHA_R + 4(SP) + stfs f1, ALPHA_R + 8(SP) + stfs f1, ALPHA_R + 12(SP) + + stfs f4, ALPHA_I + 0(SP) + stfs f2, ALPHA_I + 4(SP) + stfs f4, ALPHA_I + 8(SP) + stfs f2, ALPHA_I + 12(SP) +#else + stfs f1, ALPHA_R + 0(SP) + stfs f3, ALPHA_R + 4(SP) + stfs f1, ALPHA_R + 8(SP) + stfs f3, ALPHA_R + 12(SP) + + stfs f2, ALPHA_I + 0(SP) + stfs f2, ALPHA_I + 4(SP) + stfs f2, ALPHA_I + 8(SP) + stfs f2, ALPHA_I + 12(SP) +#endif + + li I, Address_L(0x04050607) + addis I, I, Address_H(0x04050607) + stw I, SWAP + 0(SP) + li I, Address_L(0x00010203) + addis I, I, Address_H(0x00010203) + stw I, SWAP + 4(SP) + li I, Address_L(0x0c0d0e0f) + addis I, I, Address_H(0x0c0d0e0f) + stw I, SWAP + 8(SP) + li I, Address_L(0x08090a0b) + addis I, I, Address_H(0x08090a0b) + stw I, SWAP + 12(SP) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + lis I, 0x8000 + stw I, NEG + 0(SP) + stw I, NEG + 8(SP) + li I, 0 + stw I, NEG + 4(SP) + stw I, NEG + 12(SP) +#else + li I, 0 + stw I, NEG + 0(SP) + stw I, NEG + 8(SP) + lis I, 0x8000 + stw I, NEG + 4(SP) + stw I, NEG + 12(SP) +#endif + + li r0, 0 + stw r0, FZERO(SP) + + slwi LDC, LDC, ZBASE_SHIFT + + li PREC, (15 * SIZE) + li PREB, (25 * 8 * SIZE) + + li OFFSET_1, 4 * SIZE + li OFFSET_2, 8 * SIZE + li OFFSET_3, 12 * SIZE + li OFFSET_4, 16 * SIZE + li OFFSET_5, 20 * SIZE + li OFFSET_6, 24 * SIZE + li OFFSET_7, 28 * SIZE + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + srawi. J, N, 1 + ble LL(50) + .align 4 + +LL(01): + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + + mr AO, A + srawi. I, M, 3 + ble LL(20) + .align 4 + +LL(11): + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + LOAD_A a1, OFFSET_0, AO + vxor c03, c03, c03 + LOAD_A a2, OFFSET_1, AO + vxor c04, c04, c04 + LOAD_A a3, OFFSET_2, AO + vxor c05, c05, c05 + LOAD_A a4, OFFSET_3, AO + vxor c06, c06, c06 + LOAD_B b2, OFFSET_2, B + vxor c07, c07, c07 + LOAD_A a5, OFFSET_4, AO + vxor c08, c08, c08 + LOAD_A a6, OFFSET_5, AO + vxor c09, c09, c09 + dcbtst CO1, PREC + vxor c10, c10, c10 + dcbtst CO2, PREC + vxor c11, c11, c11 + vxor c12, c12, c12 + vxor c13, c13, c13 + mr BO, B + vxor c14, c14, c14 + srawi. r0, K, 2 + vxor c15, c15, c15 + mtspr CTR, r0 + vxor c16, c16, c16 + vspltw bp1, b1, 0 + ble LL(15) + .align 4 + +LL(12): +/* 1 */ + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + addi AO, AO, 8 * SIZE + vmaddfp c03, a3, bp1, c03 + LOAD_A a7, OFFSET_4, AO + vmaddfp c04, a4, bp1, c04 + LOAD_A a8, OFFSET_5, AO + +/* 2 */ + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + dcbt BO, PREB + vmaddfp c07, a3, bp2, c07 + dcbt AO, PREB + vmaddfp c08, a4, bp2, c08 + addi AO, AO, 8 * SIZE + +/* 3 */ + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + LOAD_B b1, OFFSET_1, BO + vmaddfp c11, a3, bp1, c11 + dcbt AO, PREB + vmaddfp c12, a4, bp1, c12 + addi AO, AO, 8 * SIZE + +/* 4 */ + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b1, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a1, OFFSET_2, AO + vmaddfp c15, a3, bp2, c15 + dcbt AO, PREB + vmaddfp c16, a4, bp2, c16 + addi AO, AO, 8 * SIZE + +/* 5 */ + vmaddfp c01, a5, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a6, bp1, c02 + LOAD_A a2, OFFSET_1, AO + vmaddfp c03, a7, bp1, c03 + LOAD_A a3, OFFSET_2, AO + vmaddfp c04, a8, bp1, c04 + LOAD_A a4, OFFSET_3, AO + +/* 6 */ + vmaddfp c05, a5, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a6, bp2, c06 + nop + vmaddfp c07, a7, bp2, c07 + dcbt AO, PREA + vmaddfp c08, a8, bp2, c08 + addi AO, AO, 8 * SIZE + +/* 7 */ + vmaddfp c09, a5, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a6, bp1, c10 + LOAD_B b1, OFFSET_4, BO + vmaddfp c11, a7, bp1, c11 + nop + vmaddfp c12, a8, bp1, c12 + nop + +/* 8 */ + vmaddfp c13, a5, bp2, c13 + vspltw bp1, b2, 0 + vmaddfp c14, a6, bp2, c14 + LOAD_A a5, OFFSET_2, AO + vmaddfp c15, a7, bp2, c15 + LOAD_A a6, OFFSET_3, AO + vmaddfp c16, a8, bp2, c16 + LOAD_A a7, OFFSET_4, AO + +/* 9 */ + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b2, 1 + vmaddfp c02, a2, bp1, c02 + LOAD_A a8, OFFSET_5, AO + vmaddfp c03, a3, bp1, c03 + addi BO, BO, 8 * SIZE + vmaddfp c04, a4, bp1, c04 + nop + +/* 10 */ + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a2, bp2, c06 + nop + vmaddfp c07, a3, bp2, c07 + nop + vmaddfp c08, a4, bp2, c08 + nop + +/* 11 */ + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b2, 3 + vmaddfp c10, a2, bp1, c10 + LOAD_B b2, OFFSET_1, BO + vmaddfp c11, a3, bp1, c11 + dcbt AO, PREA + vmaddfp c12, a4, bp1, c12 + addi AO, AO, 8 * SIZE + +/* 12 */ + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a1, OFFSET_4, AO + vmaddfp c15, a3, bp2, c15 + LOAD_A a2, OFFSET_5, AO + vmaddfp c16, a4, bp2, c16 + LOAD_A a3, OFFSET_6, AO + +/* 13 */ + vmaddfp c01, a5, bp1, c01 + vspltw bp2, b2, 1 + vmaddfp c02, a6, bp1, c02 + LOAD_A a4, OFFSET_7, AO + vmaddfp c03, a7, bp1, c03 + dcbt AO, PREA + vmaddfp c04, a8, bp1, c04 + addi AO, AO, 8 * SIZE + +/* 14 */ + vmaddfp c05, a5, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a6, bp2, c06 + nop + vmaddfp c07, a7, bp2, c07 + dcbt AO, PREA + vmaddfp c08, a8, bp2, c08 + addi AO, AO, 8 * SIZE + +/* 15 */ + vmaddfp c09, a5, bp1, c09 + vspltw bp2, b2, 3 + vmaddfp c10, a6, bp1, c10 + LOAD_B b2, OFFSET_4, BO + vmaddfp c11, a7, bp1, c11 + dcbt AO, PREA + vmaddfp c12, a8, bp1, c12 + addi BO, BO, 8 * SIZE + +/* 16 */ + vmaddfp c13, a5, bp2, c13 + vspltw bp1, b1, 0 + vmaddfp c14, a6, bp2, c14 + LOAD_A a5, OFFSET_4, AO + vmaddfp c15, a7, bp2, c15 + LOAD_A a6, OFFSET_5, AO + vmaddfp c16, a8, bp2, c16 + bdnz+ LL(12) + .align 4 + +LL(15): + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + andi. r0, K, 3 + mtspr CTR, r0 + ble+ LL(18) + .align 4 + +LL(16): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + nop + vmaddfp c03, a3, bp1, c03 + nop + vmaddfp c04, a4, bp1, c04 + nop + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + nop + vmaddfp c07, a3, bp2, c07 + nop + vmaddfp c08, a4, bp2, c08 + nop + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + LOAD_B b1, OFFSET_1, BO + vmaddfp c11, a3, bp1, c11 + addi AO, AO, 16 * SIZE + vmaddfp c12, a4, bp1, c12 + addi BO, BO, 4 * SIZE + + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b1, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a1, OFFSET_0, AO + vmaddfp c15, a3, bp2, c15 + LOAD_A a2, OFFSET_1, AO + vmaddfp c16, a4, bp2, c16 + LOAD_A a3, OFFSET_2, AO + + LOAD_A a4, OFFSET_3, AO + bdnz+ LL(16) + .align 4 + +LL(18): + vxor VZERO, VZERO, VZERO + + vperm c05, c05, c05, swap + vperm c06, c06, c06, swap + vperm c07, c07, c07, swap + vperm c08, c08, c08, swap + + vperm c13, c13, c13, swap + vperm c14, c14, c14, swap + vperm c15, c15, c15, swap + vperm c16, c16, c16, swap + + vxor c05, c05, neg + vxor c06, c06, neg + vxor c07, c07, neg + vxor c08, c08, neg + + vxor c13, c13, neg + vxor c14, c14, neg + vxor c15, c15, neg + vxor c16, c16, neg + + vaddfp c01, c01, c05 + vaddfp c02, c02, c06 + vaddfp c03, c03, c07 + vaddfp c04, c04, c08 + + vaddfp c09, c09, c13 + vaddfp c10, c10, c14 + vaddfp c11, c11, c15 + vaddfp c12, c12, c16 + + vperm c05, c01, c01, swap + vperm c06, c02, c02, swap + vperm c07, c03, c03, swap + vperm c08, c04, c04, swap + + vperm c13, c09, c09, swap + vperm c14, c10, c10, swap + vperm c15, c11, c11, swap + vperm c16, c12, c12, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c02, alpha_r, c02, VZERO + vmaddfp c03, alpha_r, c03, VZERO + vmaddfp c04, alpha_r, c04, VZERO + + vmaddfp c01, alpha_i, c05, c01 + vmaddfp c02, alpha_i, c06, c02 + vmaddfp c03, alpha_i, c07, c03 + vmaddfp c04, alpha_i, c08, c04 + + vmaddfp c09, alpha_r, c09, VZERO + vmaddfp c10, alpha_r, c10, VZERO + vmaddfp c11, alpha_r, c11, VZERO + vmaddfp c12, alpha_r, c12, VZERO + + vmaddfp c09, alpha_i, c13, c09 + vmaddfp c10, alpha_i, c14, c10 + vmaddfp c11, alpha_i, c15, c11 + vmaddfp c12, alpha_i, c16, c12 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + lvx C4, OFFSET_3, CO1 + lvx C5, OFFSET_4, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + vaddfp c02, c02, C3 + vaddfp c03, c03, C4 + vaddfp c04, c04, C5 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + lvx C4, OFFSET_3, CO2 + lvx C5, OFFSET_4, CO2 + + vperm c00, VZERO, c09, PERMRSHIFT2 + vperm c09, c09, c10, PERMRSHIFT2 + vperm c10, c10, c11, PERMRSHIFT2 + vperm c11, c11, c12, PERMRSHIFT2 + vperm c12, c12, VZERO, PERMRSHIFT2 + + vaddfp c00, c00, C1 + vaddfp c09, c09, C2 + vaddfp c10, c10, C3 + vaddfp c11, c11, C4 + vaddfp c12, c12, C5 + + stvx c00, OFFSET_0, CO2 + stvx c09, OFFSET_1, CO2 + stvx c10, OFFSET_2, CO2 + stvx c11, OFFSET_3, CO2 + stvx c12, OFFSET_4, CO2 + + addi CO1, CO1, 16 * SIZE + addi CO2, CO2, 16 * SIZE + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. I, M, 4 + ble LL(30) + + vxor c01, c01, c01 + LOAD_A a1, OFFSET_0, AO + vxor c02, c02, c02 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c09, c09, c09 + LOAD_B b1, OFFSET_0, B + vxor c10, c10, c10 + LOAD_B b2, OFFSET_1, B + vxor c13, c13, c13 + vxor c14, c14, c14 + mr BO, B + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(25) + .align 4 + +LL(22): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + addi AO, AO, 16 * SIZE + vmaddfp c02, a2, bp1, c02 + addi BO, BO, 8 * SIZE + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + LOAD_B b1, OFFSET_0, BO + vmaddfp c10, a2, bp1, c10 + + vmaddfp c13, a1, bp2, c13 + LOAD_A a1, OFFSET_0, AO + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a2, OFFSET_1, AO + + vmaddfp c01, a3, bp1, c01 + vspltw bp2, b2, 1 + vmaddfp c02, a4, bp1, c02 + + vmaddfp c05, a3, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a4, bp2, c06 + + vmaddfp c09, a3, bp1, c09 + vspltw bp2, b2, 3 + LOAD_B b2, OFFSET_1, BO + vmaddfp c10, a4, bp1, c10 + + vmaddfp c13, a3, bp2, c13 + LOAD_A a3, OFFSET_2, AO + vmaddfp c14, a4, bp2, c14 + LOAD_A a4, OFFSET_3, AO + vspltw bp1, b1, 0 + bdnz LL(22) + .align 4 + +LL(25): + andi. r0, K, 1 + ble+ LL(28) + .align 4 + +LL(26): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + nop + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + nop + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + addi AO, AO, 8 * SIZE + + vmaddfp c13, a1, bp2, c13 + addi BO, BO, 4 * SIZE + vmaddfp c14, a2, bp2, c14 + nop + .align 4 + +LL(28): + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + vperm c06, c06, c06, swap + vperm c13, c13, c13, swap + vperm c14, c14, c14, swap + + vxor c05, c05, neg + vxor c06, c06, neg + vxor c13, c13, neg + vxor c14, c14, neg + + vaddfp c01, c01, c05 + vaddfp c02, c02, c06 + vaddfp c09, c09, c13 + vaddfp c10, c10, c14 + + vperm c05, c01, c01, swap + vperm c06, c02, c02, swap + vperm c13, c09, c09, swap + vperm c14, c10, c10, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c02, alpha_r, c02, VZERO + vmaddfp c01, alpha_i, c05, c01 + vmaddfp c02, alpha_i, c06, c02 + + vmaddfp c09, alpha_r, c09, VZERO + vmaddfp c10, alpha_r, c10, VZERO + vmaddfp c09, alpha_i, c13, c09 + vmaddfp c10, alpha_i, c14, c10 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + vaddfp c02, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + + vperm c00, VZERO, c09, PERMRSHIFT2 + vperm c09, c09, c10, PERMRSHIFT2 + vperm c10, c10, VZERO, PERMRSHIFT2 + + vaddfp c00, c00, C1 + vaddfp c09, c09, C2 + vaddfp c10, c10, C3 + + stvx c00, OFFSET_0, CO2 + stvx c09, OFFSET_1, CO2 + stvx c10, OFFSET_2, CO2 + + addi CO1, CO1, 8 * SIZE + addi CO2, CO2, 8 * SIZE + .align 4 + +LL(30): + andi. I, M, 2 + ble LL(40) + + vxor c01, c01, c01 + LOAD_A a1, OFFSET_0, AO + vxor c02, c02, c02 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_B b1, OFFSET_0, B + vxor c06, c06, c06 + LOAD_B b2, OFFSET_1, B + vxor c09, c09, c09 + vxor c10, c10, c10 + vxor c13, c13, c13 + vxor c14, c14, c14 + + vspltw bp1, b1, 0 + mr BO, B + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(35) + .align 4 + +LL(32): + vmaddfp c01, a1, bp1, c01 + addi AO, AO, 8 * SIZE + vspltw bp2, b1, 1 + vmaddfp c05, a1, bp2, c05 + addi BO, BO, 8 * SIZE + vspltw bp1, b1, 2 + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c13, a1, bp2, c13 + LOAD_A a1, OFFSET_0, AO + vspltw bp1, b2, 0 + LOAD_B b1, OFFSET_0, BO + + vmaddfp c02, a2, bp1, c02 + vspltw bp2, b2, 1 + vmaddfp c06, a2, bp2, c06 + vspltw bp1, b2, 2 + vmaddfp c10, a2, bp1, c10 + vspltw bp2, b2, 3 + LOAD_B b2, OFFSET_1, BO + vmaddfp c14, a2, bp2, c14 + LOAD_A a2, OFFSET_1, AO + + vspltw bp1, b1, 0 + bdnz LL(32) + .align 4 + +LL(35): + andi. r0, K, 1 + ble+ LL(38) + .align 4 + +LL(36): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c13, a1, bp2, c13 + addi AO, AO, 4 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(38): + vaddfp c01, c01, c02 + vaddfp c05, c05, c06 + vaddfp c09, c09, c10 + vaddfp c13, c13, c14 + + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + vperm c13, c13, c13, swap + + vxor c05, c05, neg + vxor c13, c13, neg + + vaddfp c01, c01, c05 + vaddfp c09, c09, c13 + + vperm c05, c01, c01, swap + vperm c13, c09, c09, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c01, alpha_i, c05, c01 + + vmaddfp c09, alpha_r, c09, VZERO + vmaddfp c09, alpha_i, c13, c09 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + + vperm c00, VZERO, c09, PERMRSHIFT2 + vperm c09, c09, VZERO, PERMRSHIFT2 + + vaddfp c00, c00, C1 + vaddfp c09, c09, C2 + + stvx c00, OFFSET_0, CO2 + stvx c09, OFFSET_1, CO2 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + .align 4 + +LL(40): + andi. I, M, 1 + ble LL(49) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(BO) + LFD f11, 1 * SIZE(BO) + LFD f12, 2 * SIZE(BO) + LFD f13, 3 * SIZE(BO) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(45) + .align 4 + +LL(42): + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f4, f8, f12, f4 + fmadd f6, f8, f13, f6 + + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + fmadd f5, f9, f12, f5 + fmadd f7, f9, f13, f7 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f4, f8, f12, f4 + fmadd f6, f8, f13, f6 + + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + fmadd f5, f9, f12, f5 + fmadd f7, f9, f13, f7 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + + LFD f10, 8 * SIZE(BO) + LFD f11, 9 * SIZE(BO) + LFD f12, 10 * SIZE(BO) + LFD f13, 11 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + andi. r0, K, 1 + ble LL(48) + .align 4 + +LL(46): + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f4, f8, f12, f4 + fmadd f6, f8, f13, f6 + + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + fmadd f5, f9, f12, f5 + fmadd f7, f9, f13, f7 + + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(48): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + fsub f0, f0, f3 + fadd f1, f1, f2 + fsub f4, f4, f7 + fadd f5, f5, f6 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + fadd f0, f0, f3 + fsub f1, f1, f2 + fadd f4, f4, f7 + fsub f5, f5, f6 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + fadd f0, f0, f3 + fsub f1, f2, f1 + fadd f4, f4, f7 + fsub f5, f6, f5 +#else /* RR, RC, CR, CC */ + fsub f0, f0, f3 + fadd f1, f1, f2 + fsub f4, f4, f7 + fadd f5, f5, f6 +#endif + + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + LFD f10, 0 * SIZE(CO2) + LFD f11, 1 * SIZE(CO2) + + lfs f12, ALPHA_R + 0(SP) + lfs f13, ALPHA_I + 4(SP) + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + fmadd f8, f12, f0, f8 + fnmsub f9, f12, f1, f9 + fmadd f10, f12, f4, f10 + fnmsub f11, f12, f5, f11 + + fmadd f8, f13, f1, f8 + fmadd f9, f13, f0, f9 + fmadd f10, f13, f5, f10 + fmadd f11, f13, f4, f11 +#else + fmadd f8, f12, f0, f8 + fmadd f9, f12, f1, f9 + fmadd f10, f12, f4, f10 + fmadd f11, f12, f5, f11 + + fnmsub f8, f13, f1, f8 + fmadd f9, f13, f0, f9 + fnmsub f10, f13, f5, f10 + fmadd f11, f13, f4, f11 +#endif + + STFD f8, 0 * SIZE(CO1) + STFD f9, 1 * SIZE(CO1) + STFD f10, 0 * SIZE(CO2) + STFD f11, 1 * SIZE(CO2) + +LL(49): + mr B, BO + + addic. J, J, -1 + bgt LL(01) + .align 4 + +LL(50): + andi. J, N, 1 + ble LL(999) + + mr CO1, C + mr AO, A + + srawi. I, M, 3 + ble LL(70) + .align 4 + +LL(61): + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + vxor c03, c03, c03 + LOAD_A a1, OFFSET_0, AO + vxor c04, c04, c04 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c07, c07, c07 + vxor c08, c08, c08 + + mr BO, B + dcbtst CO1, PREC + dcbtst CO2, PREC + + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(65) + .align 4 + +LL(62): + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + + vmaddfp c01, a5, bp1, c01 + vspltw bp2, b1, 3 + vmaddfp c02, a6, bp1, c02 + vmaddfp c03, a7, bp1, c03 + vmaddfp c04, a8, bp1, c04 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c05, a5, bp2, c05 + vmaddfp c06, a6, bp2, c06 + vmaddfp c07, a7, bp2, c07 + vmaddfp c08, a8, bp2, c08 + + addi AO, AO, 32 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + bdnz LL(62) + .align 4 + +LL(65): + andi. r0, K, 1 + ble+ LL(68) + .align 4 + +LL(66): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + addi AO, AO, 16 * SIZE + vmaddfp c03, a3, bp1, c03 + addi BO, BO, 2 * SIZE + vmaddfp c04, a4, bp1, c04 + nop + + vmaddfp c05, a1, bp2, c05 + vmaddfp c06, a2, bp2, c06 + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + .align 4 + +LL(68): + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + vperm c06, c06, c06, swap + vperm c07, c07, c07, swap + vperm c08, c08, c08, swap + + vxor c05, c05, neg + vxor c06, c06, neg + vxor c07, c07, neg + vxor c08, c08, neg + + vaddfp c01, c01, c05 + vaddfp c02, c02, c06 + vaddfp c03, c03, c07 + vaddfp c04, c04, c08 + + vperm c05, c01, c01, swap + vperm c06, c02, c02, swap + vperm c07, c03, c03, swap + vperm c08, c04, c04, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c02, alpha_r, c02, VZERO + vmaddfp c03, alpha_r, c03, VZERO + vmaddfp c04, alpha_r, c04, VZERO + + vmaddfp c01, alpha_i, c05, c01 + vmaddfp c02, alpha_i, c06, c02 + vmaddfp c03, alpha_i, c07, c03 + vmaddfp c04, alpha_i, c08, c04 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + lvx C4, OFFSET_3, CO1 + lvx C5, OFFSET_4, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + vaddfp c02, c02, C3 + vaddfp c03, c03, C4 + vaddfp c04, c04, C5 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + addi CO1, CO1, 16 * SIZE + addic. I, I, -1 + bgt+ LL(61) + .align 4 + +LL(70): + andi. I, M, 4 + ble LL(80) + + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + vxor c03, c03, c03 + LOAD_A a1, OFFSET_0, AO + vxor c04, c04, c04 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c07, c07, c07 + vxor c08, c08, c08 + + mr BO, B + + vspltw bp1, b1, 0 + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(75) + .align 4 + +LL(72): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + + vmaddfp c03, a3, bp1, c03 + vspltw bp2, b1, 3 + vmaddfp c04, a4, bp1, c04 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + bdnz LL(72) + .align 4 + +LL(75): + andi. r0, K, 1 + ble+ LL(78) + .align 4 + +LL(76): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + addi AO, AO, 8 * SIZE + vmaddfp c05, a1, bp2, c05 + addi BO, BO, 2 * SIZE + vmaddfp c06, a2, bp2, c06 + .align 4 + +LL(78): + vaddfp c01, c01, c03 + vaddfp c02, c02, c04 + vaddfp c05, c05, c07 + vaddfp c06, c06, c08 + + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + vperm c06, c06, c06, swap + + vxor c05, c05, neg + vxor c06, c06, neg + + vaddfp c01, c01, c05 + vaddfp c02, c02, c06 + + vperm c05, c01, c01, swap + vperm c06, c02, c02, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c02, alpha_r, c02, VZERO + vmaddfp c01, alpha_i, c05, c01 + vmaddfp c02, alpha_i, c06, c02 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + vaddfp c02, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + + addi CO1, CO1, 8 * SIZE + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(90) + + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + vxor c06, c06, c06 + + mr BO, B + + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(85) + .align 4 + +LL(82): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + + vmaddfp c02, a2, bp1, c02 + vspltw bp2, b1, 3 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c06, a2, bp2, c06 + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + bdnz LL(82) + .align 4 + +LL(85): + andi. r0, K, 1 + ble+ LL(88) + .align 4 + +LL(86): + vspltw bp2, b1, 1 + vmaddfp c01, a1, bp1, c01 + vmaddfp c05, a1, bp2, c05 + addi AO, AO, 4 * SIZE + addi BO, BO, 2 * SIZE + .align 4 + +LL(88): + vaddfp c01, c01, c02 + vaddfp c05, c05, c06 + vaddfp c09, c09, c10 + vaddfp c13, c13, c14 + + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + + vxor c05, c05, neg + + vaddfp c01, c01, c05 + + vperm c05, c01, c01, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c01, alpha_i, c05, c01 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + + addi CO1, CO1, 4 * SIZE + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(999) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(BO) + LFD f11, 1 * SIZE(BO) + LFD f12, 2 * SIZE(BO) + LFD f13, 3 * SIZE(BO) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(95) + .align 4 + +LL(92): + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + + fmadd f0, f8, f12, f0 + fmadd f2, f8, f13, f2 + fmadd f1, f9, f12, f1 + fmadd f3, f9, f13, f3 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(92) + .align 4 + +LL(95): + andi. r0, K, 1 + ble LL(98) + .align 4 + +LL(96): + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + .align 4 + +LL(98): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + fsub f0, f0, f3 + fadd f1, f1, f2 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + fadd f0, f0, f3 + fsub f1, f1, f2 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + fadd f0, f0, f3 + fsub f1, f2, f1 +#else /* RR, RC, CR, CC */ + fsub f0, f0, f3 + fadd f1, f1, f2 +#endif + + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + + lfs f12, ALPHA_R + 0(SP) + lfs f13, ALPHA_I + 4(SP) + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + fmadd f8, f12, f0, f8 + fnmsub f9, f12, f1, f9 + + fmadd f8, f13, f1, f8 + fmadd f9, f13, f0, f9 +#else + fmadd f8, f12, f0, f8 + fmadd f9, f12, f1, f9 + + fnmsub f8, f13, f1, f8 + fmadd f9, f13, f0, f9 +#endif + + STFD f8, 0 * SIZE(CO1) + STFD f9, 1 * SIZE(CO1) + .align 4 + +LL(999): + mr SP, STACK + + li r0, 0 * 16 + lvx v20, SP, r0 + li r0, 1 * 16 + lvx v21, SP, r0 + li r0, 2 * 16 + lvx v22, SP, r0 + li r0, 3 * 16 + lvx v23, SP, r0 + li r0, 4 * 16 + lvx v24, SP, r0 + li r0, 5 * 16 + lvx v25, SP, r0 + li r0, 6 * 16 + lvx v26, SP, r0 + li r0, 7 * 16 + lvx v27, SP, r0 + li r0, 8 * 16 + lvx v28, SP, r0 + li r0, 9 * 16 + lvx v29, SP, r0 + li r0, 10 * 16 + lvx v30, SP, r0 + li r0, 11 * 16 + lvx v31, SP, r0 + + mtspr VRsave, VREG + +#ifdef __64BIT__ + ld r31, 192(SP) + ld r30, 200(SP) + ld r29, 208(SP) + ld r28, 216(SP) + ld r27, 224(SP) + ld r26, 232(SP) + ld r25, 240(SP) + ld r24, 248(SP) + ld r23, 256(SP) + ld r22, 264(SP) + ld r21, 272(SP) + ld r20, 280(SP) + ld r19, 288(SP) + ld r18, 296(SP) + ld r17, 304(SP) + ld r16, 312(SP) + ld r15, 320(SP) + ld r14, 328(SP) +#else + lwz r31, 192(SP) + lwz r30, 196(SP) + lwz r29, 200(SP) + lwz r28, 204(SP) + lwz r27, 208(SP) + lwz r26, 212(SP) + lwz r25, 216(SP) + lwz r24, 220(SP) + lwz r23, 224(SP) + lwz r22, 228(SP) + lwz r21, 232(SP) + lwz r20, 236(SP) + lwz r19, 240(SP) + lwz r18, 244(SP) + lwz r17, 248(SP) + lwz r16, 252(SP) + lwz r15, 256(SP) + lwz r14, 260(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemm_kernel_cell.S b/kernel/power/zgemm_kernel_cell.S new file mode 100644 index 0000000..f0d3204 --- /dev/null +++ b/kernel/power/zgemm_kernel_cell.S @@ -0,0 +1,1784 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define TEMP r22 +#define KK r23 +#define I r24 +#define J r25 +#define AO r26 +#define BO r27 +#define CO1 r28 +#define CO2 r29 + +#define PREA r30 +#define PREC r31 +#define PREB PREA + +#ifndef NEEDPARAM + +#ifndef DOUBLE +#include "../cparam.h" +#else +#include "../zparam.h" +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) +#ifdef TRMMKERNEL + std r23, 208(SP) + std r22, 216(SP) +#endif +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) +#ifdef TRMMKERNEL + stw r23, 176(SP) + stw r22, 180(SP) +#endif +#endif + + stfd f1, ALPHA_R + stfd f2, ALPHA_I + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST + li PREC, 3 * SIZE + li PREA, 16 * 12 * SIZE +#else + +#ifdef linux +#ifndef __64BIT__ + lwz PREA, 16 + STACKSIZE(SP) + lwz PREC, 20 + STACKSIZE(SP) +#else + ld PREA, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 72 + STACKSIZE(SP) + lwz PREC, 76 + STACKSIZE(SP) +#else + lwz PREA, 68 + STACKSIZE(SP) + lwz PREC, 72 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + + + lfs f0, FZERO + + srawi. J, N, 1 + ble LL(30) + .align 4 + +LL(10): + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + srawi. I, M, 1 + mr AO, A + ble LL(20) + .align 4 + +LL(11): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + + LFD f28, 4 * SIZE(B) + LFD f29, 5 * SIZE(B) + LFD f30, 6 * SIZE(B) + + PREFETCH_C1 + nop + nop + PREFETCH_C2 + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(15) +#else + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + + LFD f28, 4 * SIZE(B) + LFD f29, 5 * SIZE(B) + LFD f30, 6 * SIZE(B) + mr BO, B +#else + slwi r0, KK, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, r0 + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) +#endif + + PREFETCH_C1 + PREFETCH_C2 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble LL(15) +#endif + .align 4 + +#define NOP1 mr r18, r18 +#define NOP2 mr r19, r19 + +LL(12): + FMADD f0, f16, f20, f0 + dcbt AO, PREA + FMADD f4, f16, f21, f4 + dcbt BO, PREB + FMADD f8, f16, f22, f8 + LFD f31, 7 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f27, 7 * SIZE(AO) + + FMADD f1, f17, f20, f1 + LFD f16, 8 * SIZE(AO) + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 9 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 10 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 8 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 9 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 10 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 11 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 11 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 12 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 13 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 14 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 12 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 13 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 14 * SIZE(BO) + FMADD f15, f27, f31, f15 + LFD f27, 15 * SIZE(AO) + + FMADD f0, f16, f20, f0 + LFD f31, 15 * SIZE(BO) + FMADD f4, f16, f21, f4 + NOP2 + FMADD f8, f16, f22, f8 + NOP1 + FMADD f12, f16, f23, f12 + LFD f16, 16 * SIZE(AO) + + FMADD f1, f17, f20, f1 + NOP1 + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 17 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 18 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 16 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 17 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 18 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 19 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 19 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 20 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 21 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 22 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 20 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 21 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 22 * SIZE(BO) + FMADD f15, f27, f31, f15 + addi AO, AO, 16 * SIZE + + addi BO, BO, 16 * SIZE + bdnz LL(12) + .align 4 + +LL(15): +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, r0 + ble LL(KERNEL_MainFinish) +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, TEMP + ble LL(KERNEL_MainFinish) +#endif + .align 4 + +LL(16): + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(KERNEL_MainFinish): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) +#endif + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) +#endif + + FADD f8, f8, f13 + FSUB f9, f9, f12 + FADD f10, f10, f15 + FSUB f11, f11, f14 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) +#endif + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 + +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 + + FMADD f20, f30, f8, f20 + FMADD f21, f30, f9, f21 + FMADD f22, f30, f10, f22 + FMADD f23, f30, f11, f23 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMUL f20, f30, f8 + FMUL f21, f30, f9 + FMUL f22, f30, f10 + FMUL f23, f30, f11 +#endif + + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + + FNMSUB f20, f31, f9, f20 + FMADD f21, f31, f8, f21 + FNMSUB f22, f31, f11, f22 + FMADD f23, f31, f10, f23 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f20, f30, f8, f20 + FNMSUB f21, f30, f9, f21 + FMADD f22, f30, f10, f22 + FNMSUB f23, f30, f11, f23 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + + FMADD f20, f31, f9, f20 + FMADD f21, f31, f8, f21 + FMADD f22, f31, f11, f22 + FMADD f23, f31, f10, f23 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMUL f20, f30, f8 + FMUL f21, f30, f9 + FMUL f22, f30, f10 + FMUL f23, f30, f11 + + FMADD f16, f31, f1, f16 + FNMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FNMADD f19, f31, f2, f19 + + FMADD f20, f31, f9, f20 + FNMADD f21, f31, f8, f21 + FMADD f22, f31, f11, f22 + FNMADD f23, f31, f10, f23 +#endif +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 2 * SIZE(CO1) + STFD f19, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f20, 0 * SIZE(CO2) + STFD f21, 1 * SIZE(CO2) + STFD f22, 2 * SIZE(CO2) + STFD f23, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -2 +#endif + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + addic. I, I, -1 + bgt LL(11) + .align 4 + +LL(20): + andi. I, M, 1 + ble LL(29) + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(25) +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble LL(25) +#endif + .align 4 + +LL(22): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi BO, BO, 16 * SIZE + addi AO, AO, 8 * SIZE + bdnz LL(22) + .align 4 + +LL(25): +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, r0 + ble LL(27) +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, TEMP + ble LL(27) +#endif + .align 4 + +LL(26): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(26) + .align 4 + +LL(27): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + +#endif + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 +#endif + + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMADD f16, f31, f1, f16 + FNMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FNMADD f19, f31, f2, f19 +#endif +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 0 * SIZE(CO2) + STFD f19, 1 * SIZE(CO2) + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +LL(29): +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 2 +#endif + + mr B, BO + addic. J, J, -1 + lfs f0, FZERO + bgt LL(10) + .align 4 + +LL(30): + andi. J, N, 1 + ble LL(999) + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + srawi. I, M, 1 + mr CO1, C + add C, C, LDC + mr AO, A + ble LL(40) + .align 4 + +LL(31): +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(35) +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + mr BO, B +#else + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 +#endif + + PREFETCH_C1 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble LL(35) +#endif + .align 4 + +LL(32): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(AO) + LFD f21, 9 * SIZE(AO) + LFD f22, 10 * SIZE(AO) + LFD f23, 11 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + LFD f16, 4 * SIZE(BO) + LFD f17, 5 * SIZE(BO) + LFD f18, 6 * SIZE(BO) + LFD f19, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(AO) + LFD f21, 17 * SIZE(AO) + LFD f22, 18 * SIZE(AO) + LFD f23, 19 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 20 * SIZE(AO) + LFD f25, 21 * SIZE(AO) + LFD f26, 22 * SIZE(AO) + LFD f27, 23 * SIZE(AO) + + LFD f16, 8 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 10 * SIZE(BO) + LFD f19, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbt PREA, AO + dcbt PREA, BO + bdnz LL(32) + .align 4 + +LL(35): +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, r0 + ble LL(37) +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, TEMP + ble LL(37) +#endif + .align 4 + +LL(36): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f16, 2 * SIZE(BO) + LFD f17, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(36) + .align 4 + +LL(37): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + +#endif + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 +#endif + + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMADD f16, f31, f1, f16 + FNMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FNMADD f19, f31, f2, f19 +#endif + +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 2 * SIZE(CO1) + STFD f19, 3 * SIZE(CO1) + + addi CO1, CO1, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + addic. I, I, -1 + bgt LL(31) + .align 4 + +LL(40): + andi. I, M, 1 + ble LL(999) + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(45) +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + mr BO, B +#else + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble LL(45) +#endif + .align 4 + +LL(42): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR,r0 + ble LL(47) +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR,TEMP + ble LL(47) +#endif + .align 4 + +LL(46): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + + bdnz LL(46) + .align 4 + +LL(47): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + fsub f0, f0, f1 + fadd f2, f2, f3 +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + fadd f0, f0, f1 + fsub f2, f2, f3 +#else + fadd f0, f0, f1 + fsub f2, f3, f2 +#endif + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f2, f17 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f2 +#endif + + FNMSUB f16, f31, f2, f16 + FMADD f17, f31, f0, f17 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC) || defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f2, f17 + + FMADD f16, f31, f2, f16 + FMADD f17, f31, f0, f17 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f2 + + FMADD f16, f31, f2, f16 + FNMADD f17, f31, f0, f17 +#endif + +#endif + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) +#ifdef TRMMKERNEL + ld r23, 208(SP) + ld r22, 216(SP) +#endif +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) +#ifdef TRMMKERNEL + lwz r23, 176(SP) + lwz r22, 180(SP) +#endif +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemm_kernel_g4.S b/kernel/power/zgemm_kernel_g4.S new file mode 100644 index 0000000..c652adf --- /dev/null +++ b/kernel/power/zgemm_kernel_g4.S @@ -0,0 +1,1637 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define TEMP r22 +#define KK r23 +#define I r24 +#define J r25 +#define AO r26 +#define BO r27 +#define CO1 r28 +#define CO2 r29 + +#define PREA r30 +#define PREC r31 + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define B1 f22 +#define B2 f23 +#define B3 f24 +#define B4 f25 +#define B5 f26 +#define B6 f27 +#define B7 f28 +#define B8 f29 +#define B9 f30 +#define B10 f31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) +#ifdef TRMMKERNEL + std r23, 208(SP) + std r22, 216(SP) +#endif +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) +#ifdef TRMMKERNEL + stw r23, 176(SP) + stw r22, 180(SP) +#endif +#endif + + stfd f1, ALPHA_R + stfd f2, ALPHA_I + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + li PREA, 8 * 8 * SIZE + li PREC, 3 * SIZE + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + lfs f0, FZERO + + srawi. J, N, 1 + ble .L30 + .align 4 + +.L10: + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + srawi. I, M, 1 + mr AO, A + ble .L20 + .align 4 + +.L11: +#ifndef TRMMKERNEL + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A3, 2 * SIZE(AO) + LFDU A5, 4 * SIZE(AO) + + LFD B1, 0 * SIZE(B) + LFD B2, 1 * SIZE(B) + LFD B3, 2 * SIZE(B) + LFD B4, 3 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + + srawi. r0, K, 1 + mr BO, B + mtspr CTR, r0 + ble .L15 +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A3, 2 * SIZE(AO) + LFDU A5, 4 * SIZE(AO) + + LFD B1, 0 * SIZE(B) + LFD B2, 1 * SIZE(B) + LFD B3, 2 * SIZE(B) + LFD B4, 3 * SIZE(B) + mr BO, B + +#else + slwi r0, KK, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, r0 + + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A3, 2 * SIZE(AO) + LFDU A5, 4 * SIZE(AO) + + LFD B1, 0 * SIZE(BO) + LFD B2, 1 * SIZE(BO) + LFD B3, 2 * SIZE(BO) + LFD B4, 3 * SIZE(BO) + +#endif + + dcbtst CO1, PREC + dcbtst CO2, PREC + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 1 + mtspr CTR, TEMP + ble .L15 +#endif + .align 4 + +.L12: + FMADD f0, A1, B1, f0 + dcbt AO, PREA + FMADD f4, A1, B2, f4 + LFDU B5, 4 * SIZE(BO) + + FMADD f8, A1, B3, f8 + dcbt BO, PREA + FMADD f12, A1, B4, f12 + LFD A4, -1 * SIZE(AO) + + FMADD f1, A2, B1, f1 + nop + FMADD f5, A2, B2, f5 + LFD B6, 1 * SIZE(BO) + + FMADD f9, A2, B3, f9 + LFDU A1, 4 * SIZE(AO) + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B1, f2 + nop + FMADD f6, A3, B2, f6 + LFD B7, 2 * SIZE(BO) + + FMADD f10, A3, B3, f10 + LFD A2, -3 * SIZE(AO) + FMADD f14, A3, B4, f14 + nop + + FMADD f3, A4, B1, f3 + nop + FMADD f7, A4, B2, f7 + LFD B8, 3 * SIZE(BO) + + FMADD f11, A4, B3, f11 + LFD A3, -2 * SIZE(AO) + FMADD f15, A4, B4, f15 + nop + + FMADD f0, A5, B5, f0 +#ifdef DOUBLE + dcbt AO, PREA +#else + nop +#endif + FMADD f4, A5, B6, f4 + LFDU B1, 4 * SIZE(BO) + + FMADD f8, A5, B7, f8 +#ifdef DOUBLE + dcbt BO, PREA +#else + nop +#endif + FMADD f12, A5, B8, f12 + LFD A4, -1 * SIZE(AO) + + FMADD f1, A2, B5, f1 + nop + FMADD f5, A2, B6, f5 + LFD B2, 1 * SIZE(BO) + + FMADD f9, A2, B7, f9 + LFDU A5, 4 * SIZE(AO) + FMADD f13, A2, B8, f13 + nop + + FMADD f2, A3, B5, f2 + nop + FMADD f6, A3, B6, f6 + LFD B3, 2 * SIZE(BO) + + FMADD f10, A3, B7, f10 + LFD A2, -3 * SIZE(AO) + FMADD f14, A3, B8, f14 + nop + + FMADD f3, A4, B5, f3 + nop + FMADD f7, A4, B6, f7 + LFD B4, 3 * SIZE(BO) + + FMADD f11, A4, B7, f11 + LFD A3, -2 * SIZE(AO) + FMADD f15, A4, B8, f15 + bdnz .L12 + .align 4 + .align 4 + +.L15: + addi AO, AO, -4 * SIZE + +#ifndef TRMMKERNEL + andi. r0, K, 1 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + ble .LKERNEL_MainFinish +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 1 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + ble .LKERNEL_MainFinish +#endif + .align 4 + +.L16: + FMADD f0, A1, B1, f0 + LFD A4, 3 * SIZE(AO) + FMADD f4, A1, B2, f4 + FMADD f8, A1, B3, f8 + FMADD f12, A1, B4, f12 + + FMADD f1, A2, B1, f1 + FMADD f5, A2, B2, f5 + FMADD f9, A2, B3, f9 + FMADD f13, A2, B4, f13 + + FMADD f2, A3, B1, f2 + FMADD f6, A3, B2, f6 + FMADD f10, A3, B3, f10 + FMADD f14, A3, B4, f14 + + FMADD f3, A4, B1, f3 + FMADD f7, A4, B2, f7 + FMADD f11, A4, B3, f11 + addi AO, AO, 4 * SIZE + FMADD f15, A4, B4, f15 + addi BO, BO, 4 * SIZE + .align 4 + +.LKERNEL_MainFinish: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) +#endif + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) +#endif + + FADD f8, f8, f13 + FSUB f9, f9, f12 + FADD f10, f10, f15 + FSUB f11, f11, f14 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) +#endif + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 + +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 + + FMADD f20, f30, f8, f20 + FMADD f21, f30, f9, f21 + FMADD f22, f30, f10, f22 + FMADD f23, f30, f11, f23 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMUL f20, f30, f8 + FMUL f21, f30, f9 + FMUL f22, f30, f10 + FMUL f23, f30, f11 +#endif + + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + + FNMSUB f20, f31, f9, f20 + FMADD f21, f31, f8, f21 + FNMSUB f22, f31, f11, f22 + FMADD f23, f31, f10, f23 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f20, f30, f8, f20 + FNMSUB f21, f30, f9, f21 + FMADD f22, f30, f10, f22 + FNMSUB f23, f30, f11, f23 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + + FMADD f20, f31, f9, f20 + FMADD f21, f31, f8, f21 + FMADD f22, f31, f11, f22 + FMADD f23, f31, f10, f23 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMUL f20, f30, f8 + FMUL f21, f30, f9 + FMUL f22, f30, f10 + FMUL f23, f30, f11 + + FMADD f16, f31, f1, f16 + FNMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FNMADD f19, f31, f2, f19 + + FMADD f20, f31, f9, f20 + FNMADD f21, f31, f8, f21 + FMADD f22, f31, f11, f22 + FNMADD f23, f31, f10, f23 +#endif +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 2 * SIZE(CO1) + STFD f19, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f20, 0 * SIZE(CO2) + STFD f21, 1 * SIZE(CO2) + STFD f22, 2 * SIZE(CO2) + STFD f23, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -2 +#endif + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + addic. I, I, -1 + bgt .L11 + .align 4 + +.L20: + andi. I, M, 1 + ble .L29 + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble .L25 +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble .L25 +#endif + .align 4 + +.L22: + fmadd f0, f16, f20, f0 + LFD f27, 7 * SIZE(BO) + fmadd f1, f16, f21, f1 + LFD f19, 3 * SIZE(AO) + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFD f16, 4 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFD f20, 8 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 9 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 10 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + LFD f17, 5 * SIZE(AO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 6 * SIZE(AO) + + fmadd f4, f19, f24, f4 + LFD f24, 12 * SIZE(BO) + fmadd f5, f19, f25, f5 + LFD f25, 13 * SIZE(BO) + fmadd f6, f19, f26, f6 + LFD f26, 14 * SIZE(BO) + fmadd f7, f19, f27, f7 + LFD f27, 15 * SIZE(BO) + + fmadd f0, f16, f20, f0 + LFD f19, 7 * SIZE(AO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 8 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFDU f20, 16 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(BO) + + fmadd f0, f18, f24, f0 + LFD f17, 1 * SIZE(AO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 2 * SIZE(AO) + + fmadd f4, f19, f24, f4 + LFD f24, 4 * SIZE(BO) + fmadd f5, f19, f25, f5 + LFD f25, 5 * SIZE(BO) + fmadd f6, f19, f26, f6 + LFD f26, 6 * SIZE(BO) + fmadd f7, f19, f27, f7 + bdnz .L22 + .align 4 + +.L25: +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, r0 + ble .L27 +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, TEMP + ble .L27 +#endif + .align 4 + +.L26: + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + LFDU f16, 2 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFDU f20, 4 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + bdnz .L26 + .align 4 + +.L27: +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + +#endif + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 +#endif + + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMADD f16, f31, f1, f16 + FNMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FNMADD f19, f31, f2, f19 +#endif +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 0 * SIZE(CO2) + STFD f19, 1 * SIZE(CO2) + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 2 +#endif + + mr B, BO + addic. J, J, -1 + lfs f0, FZERO + bgt .L10 + .align 4 + +.L30: + andi. J, N, 1 + ble .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + srawi. I, M, 1 + mr CO1, C + add C, C, LDC + mr AO, A + ble .L40 + .align 4 + +.L31: +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble .L35 +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + mr BO, B +#else + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble .L35 +#endif + .align 4 + +.L32: + fmadd f0, f16, f20, f0 + LFD f27, 7 * SIZE(AO) + fmadd f1, f16, f21, f1 + LFD f19, 3 * SIZE(BO) + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFD f16, 4 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFD f20, 8 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 9 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 10 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 11 * SIZE(AO) + + fmadd f0, f18, f24, f0 + LFD f17, 5 * SIZE(BO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 6 * SIZE(BO) + + fmadd f4, f19, f24, f4 + LFD f24, 12 * SIZE(AO) + fmadd f5, f19, f25, f5 + LFD f25, 13 * SIZE(AO) + fmadd f6, f19, f26, f6 + LFD f26, 14 * SIZE(AO) + fmadd f7, f19, f27, f7 + LFD f27, 15 * SIZE(AO) + + fmadd f0, f16, f20, f0 + LFD f19, 7 * SIZE(BO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 8 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFDU f20, 16 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(AO) + + fmadd f0, f18, f24, f0 + LFD f17, 1 * SIZE(BO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 2 * SIZE(BO) + + fmadd f4, f19, f24, f4 + LFD f24, 4 * SIZE(AO) + fmadd f5, f19, f25, f5 + LFD f25, 5 * SIZE(AO) + fmadd f6, f19, f26, f6 + LFD f26, 6 * SIZE(AO) + fmadd f7, f19, f27, f7 + bdnz .L32 + .align 4 + +.L35: +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, r0 + ble .L37 +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, TEMP + ble .L37 +#endif + .align 4 + +.L36: + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + LFDU f16, 2 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFDU f20, 4 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(AO) + LFD f17, 1 * SIZE(BO) + bdnz .L36 + .align 4 + +.L37: +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + +#endif + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 +#endif + + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMADD f16, f31, f1, f16 + FNMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FNMADD f19, f31, f2, f19 +#endif + +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 2 * SIZE(CO1) + STFD f19, 3 * SIZE(CO1) + + addi CO1, CO1, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + addic. I, I, -1 + bgt .L31 + .align 4 + +.L40: + andi. I, M, 1 + ble .L999 + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble .L45 +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + mr BO, B +#else + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble .L45 +#endif + .align 4 + +.L42: + fmadd f0, f16, f20, f0 + LFD f23, 3 * SIZE(BO) + fmadd f3, f16, f21, f3 + LFD f16, 4 * SIZE(AO) + fmadd f2, f17, f20, f2 + LFD f20, 4 * SIZE(BO) + fmadd f1, f17, f21, f1 + LFD f17, 5 * SIZE(AO) + + fmadd f4, f18, f22, f4 + LFD f21, 5 * SIZE(BO) + fmadd f7, f18, f23, f7 + LFD f18, 6 * SIZE(AO) + fmadd f6, f19, f22, f6 + LFD f22, 6 * SIZE(BO) + fmadd f5, f19, f23, f5 + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f20, f0 + LFD f23, 7 * SIZE(BO) + fmadd f3, f16, f21, f3 + LFDU f16, 8 * SIZE(AO) + fmadd f2, f17, f20, f2 + LFDU f20, 8 * SIZE(BO) + fmadd f1, f17, f21, f1 + LFD f17, 1 * SIZE(AO) + + fmadd f4, f18, f22, f4 + LFD f21, 1 * SIZE(BO) + fmadd f7, f18, f23, f7 + LFD f18, 2 * SIZE(AO) + fmadd f6, f19, f22, f6 + LFD f22, 2 * SIZE(BO) + fmadd f5, f19, f23, f5 + LFD f19, 3 * SIZE(AO) + bdnz .L42 + .align 4 + +.L45: + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR,r0 + ble .L47 +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR,TEMP + ble .L47 +#endif + .align 4 + +.L46: + fmadd f0, f16, f20, f0 + fmadd f3, f16, f21, f3 + LFDU f16, 2 * SIZE(AO) + fmadd f2, f17, f20, f2 + LFDU f20, 2 * SIZE(BO) + fmadd f1, f17, f21, f1 + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L46 + .align 4 + +.L47: +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + fsub f0, f0, f1 + fadd f2, f2, f3 +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + fadd f0, f0, f1 + fsub f2, f2, f3 +#else + fadd f0, f0, f1 + fsub f2, f3, f2 +#endif + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f2, f17 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f2 +#endif + + FNMSUB f16, f31, f2, f16 + FMADD f17, f31, f0, f17 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC) || defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f2, f17 + + FMADD f16, f31, f2, f16 + FMADD f17, f31, f0, f17 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f2 + + FMADD f16, f31, f2, f16 + FNMADD f17, f31, f0, f17 +#endif + +#endif + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + .align 4 + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) +#ifdef TRMMKERNEL + ld r23, 208(SP) + ld r22, 216(SP) +#endif +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) +#ifdef TRMMKERNEL + lwz r23, 176(SP) + lwz r22, 180(SP) +#endif +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemm_kernel_hummer.S b/kernel/power/zgemm_kernel_hummer.S new file mode 100644 index 0000000..7378950 --- /dev/null +++ b/kernel/power/zgemm_kernel_hummer.S @@ -0,0 +1,4428 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#undef ZERO + +#define ALPHA 0 +#define FZERO 16 + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#endif + +#define TEMP r11 +#define KK r14 +#define INCM1 r15 +#define INCM3 r16 +#define INCM5 r17 +#define INCM7 r18 +#define INC2 r19 +#define INC r20 +#define INC4 r21 + +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define AO2 r26 +#define BO2 r27 + +#define CO1 r28 +#define CO2 r29 +#define ZERO r31 + +#ifndef NEEDPARAM + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define A7 f22 +#define A8 f23 +#define A9 f24 +#define A10 f25 + +#define B1 f26 +#define B2 f27 +#define B3 f28 +#define B4 f29 +#define B5 f30 +#define B6 f31 + +#define AP B6 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define FXCPMADD fxcpmadd +#define FXCSMADD fxcxnpma +#else +#define FXCPMADD fxcpnsma +#define FXCSMADD fxcxma +#endif + + PROLOGUE + PROFCODE + + li r0, -16 + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + stfpdux f16, SP, r0 + stfpdux f17, SP, r0 + stfpdux f18, SP, r0 + stfpdux f19, SP, r0 + stfpdux f20, SP, r0 + stfpdux f21, SP, r0 + stfpdux f22, SP, r0 + stfpdux f23, SP, r0 + stfpdux f24, SP, r0 + stfpdux f25, SP, r0 + stfpdux f26, SP, r0 + stfpdux f27, SP, r0 + stfpdux f28, SP, r0 + stfpdux f29, SP, r0 + stfpdux f30, SP, r0 + stfpdux f31, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + stwu r29, -4(SP) + stwu r28, -4(SP) + + stwu r27, -4(SP) + stwu r26, -4(SP) + stwu r25, -4(SP) + stwu r24, -4(SP) + + stwu r23, -4(SP) + stwu r22, -4(SP) + stwu r21, -4(SP) + stwu r20, -4(SP) + + stwu r19, -4(SP) + stwu r18, -4(SP) + stwu r17, -4(SP) + stwu r16, -4(SP) + + stwu r15, -4(SP) + stwu r14, -4(SP) + + li r0, 0 + stwu r0, -4(SP) + stwu r0, -4(SP) + + stfdu f2, -8(SP) + stfdu f1, -8(SP) + + slwi LDC, LDC, ZBASE_SHIFT + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + + andi. r0, C, 2 * SIZE - 1 + bne .L1000 + + li INC, 1 * SIZE + li INC2, 2 * SIZE + li INC4, 4 * SIZE + li INCM1, -1 * SIZE + li INCM3, -2 * SIZE + li INCM5, -4 * SIZE + li INCM7, -6 * SIZE + + addi C, C, - 2 * SIZE + srawi. J, N, 1 + ble .L50 + .align 4 + +.L10: + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + addi AO, A, -4 * SIZE + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 2 + ble .L20 + .align 4 + +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 +#else + slwi TEMP, KK, 2 + ZBASE_SHIFT + slwi r0, KK, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + srawi. r0, TEMP, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#else + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, K, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#endif + + LFPDUX A1, AO, INC4 + fpmr f5, f0 + LFPDUX A3, AO, INC4 + fpmr f9, f0 + LFPDUX B1, BO, INC4 + fpmr f13, f0 + + LFPDUX A5, AO, INC4 + fpmr f2, f0 + LFPDUX A6, AO, INC4 + fpmr f6, f0 + LFPDUX B3, BO, INC4 + fpmr f10, f0 + LFPDUX A7, AO, INC4 + fpmr f14, f0 + + LFPDUX A8, AO, INC4 + fpmr f3, f0 + LFPDUX B5, BO, INC4 + fpmr f7, f0 + LFPDUX A9, AO, INC4 + fpmr f11, f0 + LFPDUX A2, AO2, INC4 + fpmr f15, f0 + LFPDUX B2, BO2, INC4 + bdz- .L13 + .align 4 + +.L12: + +## 1 ## + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + nop + FXCPMADD f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + nop + FXCPMADD f10, B2, A3, f10 + nop + FXCSMADD f14, B2, A3, f14 + nop + + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + LFPDUX A1, AO, INC4 + FXCSMADD f15, B2, A4, f15 + nop + +## 2 ## + + FXCPMADD f0, B3, A5, f0 + nop + FXCSMADD f4, B3, A5, f4 + nop + FXCPMADD f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A5, f12 + LFPDUX B1, BO, INC4 + + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + LFPDUX A3, AO, INC4 + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B3, A6, f2 + nop + FXCSMADD f6, B3, A6, f6 + nop + FXCPMADD f10, B4, A6, f10 + nop + FXCSMADD f14, B4, A6, f14 + nop + + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B4, A4, f11 + LFPDUX A5, AO, INC4 + FXCSMADD f15, B4, A4, f15 + nop + +## 3 ## + + FXCPMADD f0, B5, A7, f0 + nop + FXCSMADD f4, B5, A7, f4 + nop + FXCPMADD f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A7, f12 + LFPDUX B3, BO, INC4 + + FXCPMADD f1, B5, A2, f1 + nop + FXCSMADD f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A6, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B5, A8, f2 + nop + FXCSMADD f6, B5, A8, f6 + nop + FXCPMADD f10, B2, A8, f10 + nop + FXCSMADD f14, B2, A8, f14 + nop + + FXCPMADD f3, B5, A4, f3 + nop + FXCSMADD f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + LFPDUX A7, AO, INC4 + FXCSMADD f15, B2, A4, f15 + nop + +## 4 ## + FXCPMADD f0, B6, A9, f0 + nop + FXCSMADD f4, B6, A9, f4 + nop + FXCPMADD f8, B4, A9, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A9, f12 + LFPDUX B5, BO, INC4 + + FXCPMADD f1, B6, A2, f1 + nop + FXCSMADD f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + LFPDUX A8, AO, INC4 + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B6, A10, f2 + nop + FXCSMADD f6, B6, A10, f6 + nop + FXCPMADD f10, B4, A10, f10 + nop + FXCSMADD f14, B4, A10, f14 + nop + + FXCPMADD f3, B6, A4, f3 + LFPDUX A2, AO2, INC4 + FXCSMADD f7, B6, A4, f7 + LFPDUX A9, AO, INC4 + FXCPMADD f11, B4, A4, f11 + nop + FXCSMADD f15, B4, A4, f15 + bdnz+ .L12 + .align 4 + +.L13: +## 1 ## + + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + nop + FXCPMADD f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + nop + FXCPMADD f10, B2, A3, f10 + nop + FXCSMADD f14, B2, A3, f14 + nop + + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 +#ifndef TRMMKERNEL + LFPDUX A1, CO1, INC2 +#else + nop +#endif + FXCSMADD f15, B2, A4, f15 + nop + +## 2 ## + + FXCPMADD f0, B3, A5, f0 + nop + FXCSMADD f4, B3, A5, f4 + nop + FXCPMADD f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A5, f12 +#ifndef TRMMKERNEL + LFPDUX B1, CO1, INC2 +#else + nop +#endif + + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 +#ifndef TRMMKERNEL + LFPDUX A3, CO1, INC2 +#else + nop +#endif + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B3, A6, f2 + nop + FXCSMADD f6, B3, A6, f6 + nop + FXCPMADD f10, B4, A6, f10 + nop + FXCSMADD f14, B4, A6, f14 + nop + + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B4, A4, f11 +#ifndef TRMMKERNEL + LFPDUX A5, CO1, INC2 +#else + nop +#endif + FXCSMADD f15, B4, A4, f15 + nop + +## 3 ## + + FXCPMADD f0, B5, A7, f0 + nop + FXCSMADD f4, B5, A7, f4 + nop + FXCPMADD f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A7, f12 +#ifndef TRMMKERNEL + LFPDUX B3, CO2, INC2 +#else + nop +#endif + + FXCPMADD f1, B5, A2, f1 + nop + FXCSMADD f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 +#ifndef TRMMKERNEL + LFPDUX A6, CO2, INC2 +#else + nop +#endif + FXCSMADD f13, B2, A2, f13 + + FXCPMADD f2, B5, A8, f2 + nop + FXCSMADD f6, B5, A8, f6 + nop + FXCPMADD f10, B2, A8, f10 + nop + FXCSMADD f14, B2, A8, f14 + nop + + FXCPMADD f3, B5, A4, f3 + nop + FXCSMADD f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 +#ifndef TRMMKERNEL + LFPDUX A7, CO2, INC2 +#else + nop +#endif + FXCSMADD f15, B2, A4, f15 + nop + +## 4 ## + + FXCPMADD f0, B6, A9, f0 + nop + FXCSMADD f4, B6, A9, f4 + nop + FXCPMADD f8, B4, A9, f8 +#ifndef TRMMKERNEL + LFPDUX B2, CO2, INC2 +#else + nop +#endif + + FXCSMADD f12, B4, A9, f12 + + FXCPMADD f1, B6, A2, f1 + nop + FXCSMADD f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + nop + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B6, A10, f2 + FXCSMADD f6, B6, A10, f6 + FXCPMADD f10, B4, A10, f10 + FXCSMADD f14, B4, A10, f14 + + FXCPMADD f3, B6, A4, f3 + FXCSMADD f7, B6, A4, f7 + FXCPMADD f11, B4, A4, f11 + FXCSMADD f15, B4, A4, f15 + .align 4 + +.L14: + li r0, ALPHA + lfpdx AP, SP, r0 +#ifdef TRMMKERNEL + li r0, FZERO + lfpsx f30, SP, r0 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L18 + + cmpwi cr0, TEMP, 3 + bgt+ .L15 +#else + andi. r0, K, 3 + mtspr CTR, r0 + ble+ .L18 + + cmpwi cr0, K, 3 + bgt+ .L15 +#endif + +#ifndef TRMMKERNEL + LFPDUX A1, CO1, INC2 + fpmr f5, f0 + LFPDUX B1, CO1, INC2 + fpmr f9, f0 + LFPDUX A3, CO1, INC2 + fpmr f13, f0 + LFPDUX A5, CO1, INC2 + fpmr f2, f0 + + LFPDUX B3, CO2, INC2 + fpmr f6, f0 + LFPDUX A6, CO2, INC2 + fpmr f10, f0 + LFPDUX A7, CO2, INC2 + fpmr f14, f0 + LFPDUX B2, CO2, INC2 + fpmr f3, f0 +#else + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 +#endif + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + .align 4 + +.L15: + LFPDUX A2, AO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A10, BO, INC4 + LFPDUX B4, BO2, INC4 + bdz- .L17 + .align 4 + +.L16: + FXCPMADD f0, A10, A2, f0 + FXCSMADD f4, A10, A2, f4 + FXCPMADD f8, B4, A2, f8 + FXCSMADD f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + FXCPMADD f1, A10, A4, f1 + FXCSMADD f5, A10, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + FXCPMADD f2, A10, A2, f2 + FXCSMADD f6, A10, A2, f6 + FXCPMADD f10, B4, A2, f10 + FXCSMADD f14, B4, A2, f14 + LFPDUX A2, AO, INC4 + + FXCPMADD f3, A10, A4, f3 + FXCSMADD f7, A10, A4, f7 + LFPDUX A10, BO, INC4 + FXCPMADD f11, B4, A4, f11 + FXCSMADD f15, B4, A4, f15 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + bdnz+ .L16 + .align 4 + +.L17: + FXCPMADD f0, A10, A2, f0 + FXCSMADD f4, A10, A2, f4 + FXCPMADD f8, B4, A2, f8 + FXCSMADD f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + FXCPMADD f1, A10, A4, f1 + FXCSMADD f5, A10, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + FXCPMADD f2, A10, A2, f2 + FXCSMADD f6, A10, A2, f6 + FXCPMADD f10, B4, A2, f10 + FXCSMADD f14, B4, A2, f14 + + FXCPMADD f3, A10, A4, f3 + FXCSMADD f7, A10, A4, f7 + FXCPMADD f11, B4, A4, f11 + FXCSMADD f15, B4, A4, f15 + .align 4 + +.L18: +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) + fpadd f0, f0, f4 + fpadd f8, f8, f12 + fpadd f1, f1, f5 + fpadd f9, f9, f13 + + fpadd f2, f2, f6 + fpadd f10, f10, f14 + fpadd f3, f3, f7 + fpadd f11, f11, f15 +#else + fpsub f0, f0, f4 + fpsub f8, f8, f12 + fpsub f1, f1, f5 + fpsub f9, f9, f13 + + fpsub f2, f2, f6 + fpsub f10, f10, f14 + fpsub f3, f3, f7 + fpsub f11, f11, f15 +#endif + +#ifndef TRMMKERNEL + fxcpmadd A1, f0, AP, A1 + fxcpmadd B1, f1, AP, B1 + fxcpmadd A3, f2, AP, A3 + fxcpmadd A5, f3, AP, A5 + + fxcxnpma f0, f0, AP, A1 + fxcpmadd B3, f8, AP, B3 + fxcxnpma f1, f1, AP, B1 + fxcpmadd A6, f9, AP, A6 + fxcxnpma f2, f2, AP, A3 + fxcpmadd A7, f10, AP, A7 + + fxcxnpma f3, f3, AP, A5 + fxcpmadd B2, f11, AP, B2 + fxcxnpma f8, f8, AP, B3 + STFPDUX f0, CO1, INCM7 + fxcxnpma f9, f9, AP, A6 + STFPDUX f1, CO1, INC2 + fxcxnpma f10, f10, AP, A7 + STFPDUX f2, CO1, INC2 + + fxcxnpma f11, f11, AP, B2 + STFPDUX f3, CO1, INC2 + STFPDUX f8, CO2, INCM7 + STFPDUX f9, CO2, INC2 + STFPDUX f10, CO2, INC2 + STFPDUX f11, CO2, INC2 +#else + fxcpmadd f12, f0, AP, f30 + fxcpmadd f13, f1, AP, f30 + fxcpmadd f14, f2, AP, f30 + fxcpmadd f15, f3, AP, f30 + + fxcxnpma f0, f0, AP, f12 + fxcxnpma f1, f1, AP, f13 + fxcxnpma f2, f2, AP, f14 + fxcxnpma f3, f3, AP, f15 + + fxcpmadd f16, f8, AP, f30 + fxcpmadd f17, f9, AP, f30 + fxcpmadd f18, f10, AP, f30 + fxcpmadd f19, f11, AP, f30 + + fxcxnpma f8, f8, AP, f16 + fxcxnpma f9, f9, AP, f17 + fxcxnpma f10, f10, AP, f18 + fxcxnpma f11, f11, AP, f19 + + STFPDUX f0, CO1, INC2 + STFPDUX f1, CO1, INC2 + STFPDUX f2, CO1, INC2 + STFPDUX f3, CO1, INC2 + + STFPDUX f8, CO2, INC2 + STFPDUX f9, CO2, INC2 + STFPDUX f10, CO2, INC2 + STFPDUX f11, CO2, INC2 + +#endif + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 2 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L11 + .align 4 + +.L20: + andi. I, M, 2 + beq .L30 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 +#else + slwi TEMP, KK, 1 + ZBASE_SHIFT + slwi r0, KK, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + srawi. r0, TEMP, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L24 +#else + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, K, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L24 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX B3, BO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A5, AO, INC4 + LFPDUX B5, BO, INC4 + LFPDUX A6, AO2, INC4 + LFPDUX B6, BO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A9, BO, INC4 + LFPDUX A10, BO2, INC4 + bdz- .L23 + .align 4 + +.L22: + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + FXCPMADD f8, B2, A1, f8 + nop + FXCSMADD f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + FXCPMADD f9, B2, A2, f9 + nop + FXCSMADD f13, B2, A2, f13 + LFPDUX B2, BO2, INC4 + + FXCPMADD f0, B3, A3, f0 + nop + FXCSMADD f4, B3, A3, f4 + LFPDUX A2, AO2, INC4 + FXCPMADD f8, B4, A3, f8 + nop + FXCSMADD f12, B4, A3, f12 + LFPDUX A3, AO, INC4 + + FXCPMADD f1, B3, A4, f1 + nop + FXCSMADD f5, B3, A4, f5 + LFPDUX B3, BO, INC4 + FXCPMADD f9, B4, A4, f9 + nop + FXCSMADD f13, B4, A4, f13 + LFPDUX B4, BO2, INC4 + + FXCPMADD f0, B5, A5, f0 + nop + FXCSMADD f4, B5, A5, f4 + LFPDUX A4, AO2, INC4 + FXCPMADD f8, B6, A5, f8 + nop + FXCSMADD f12, B6, A5, f12 + LFPDUX A5, AO, INC4 + + FXCPMADD f1, B5, A6, f1 + nop + FXCSMADD f5, B5, A6, f5 + LFPDUX B5, BO, INC4 + FXCPMADD f9, B6, A6, f9 + nop + FXCSMADD f13, B6, A6, f13 + LFPDUX B6, BO2, INC4 + + FXCPMADD f0, A9, A7, f0 + nop + FXCSMADD f4, A9, A7, f4 + LFPDUX A6, AO2, INC4 + FXCPMADD f8, A10, A7, f8 + nop + FXCSMADD f12, A10, A7, f12 + LFPDUX A7, AO, INC4 + + FXCPMADD f1, A9, A8, f1 + nop + FXCSMADD f5, A9, A8, f5 + LFPDUX A9, BO, INC4 + FXCPMADD f9, A10, A8, f9 + nop + FXCSMADD f13, A10, A8, f13 + LFPDUX A10, BO2, INC4 + bdnz+ .L22 + .align 4 + +.L23: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f4, B3, A3, f4 + FXCPMADD f8, B4, A3, f8 + FXCSMADD f12, B4, A3, f12 + + FXCPMADD f1, B3, A4, f1 + FXCSMADD f5, B3, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f4, B5, A5, f4 + FXCPMADD f8, B6, A5, f8 + FXCSMADD f12, B6, A5, f12 + + FXCPMADD f1, B5, A6, f1 + FXCSMADD f5, B5, A6, f5 + FXCPMADD f9, B6, A6, f9 + FXCSMADD f13, B6, A6, f13 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f4, A9, A7, f4 + FXCPMADD f8, A10, A7, f8 + FXCSMADD f12, A10, A7, f12 + + FXCPMADD f1, A9, A8, f1 + FXCSMADD f5, A9, A8, f5 + FXCPMADD f9, A10, A8, f9 + FXCSMADD f13, A10, A8, f13 + .align 4 + +.L24: + li r0, ALPHA + lfpdx AP, SP, r0 +#ifdef TRMMKERNEL + li r0, FZERO + lfpsx f30, SP, r0 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + andi. r0, TEMP, 3 + mtspr CTR, r0 +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L28 + + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + bdz- .L27 + .align 4 + +.L26: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + bdnz+ .L26 + .align 4 + +.L27: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + .align 4 + +.L28: +#ifndef TRMMKERNEL + LFPDUX A1, CO1, INC2 + LFPDUX A2, CO1, INC2 + LFPDUX A3, CO2, INC2 + LFPDUX A4, CO2, INC2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) + fpadd f0, f0, f4 + fpadd f8, f8, f12 + fpadd f1, f1, f5 + fpadd f9, f9, f13 +#else + fpsub f0, f0, f4 + fpsub f8, f8, f12 + fpsub f1, f1, f5 + fpsub f9, f9, f13 +#endif + +#ifndef TRMMKERNEL + fxcpmadd A1, f0, AP, A1 + fxcpmadd A2, f1, AP, A2 + fxcpmadd A3, f8, AP, A3 + fxcpmadd A4, f9, AP, A4 + + fxcxnpma f0, f0, AP, A1 + fxcxnpma f1, f1, AP, A2 + fxcxnpma f8, f8, AP, A3 + fxcxnpma f9, f9, AP, A4 + + STFPDUX f0, CO1, INCM3 + STFPDUX f1, CO1, INC2 + + STFPDUX f8, CO2, INCM3 + STFPDUX f9, CO2, INC2 +#else + fxcpmadd f12, f0, AP, f30 + fxcpmadd f13, f1, AP, f30 + fxcpmadd f14, f8, AP, f30 + fxcpmadd f15, f9, AP, f30 + + fxcxnpma f0, f0, AP, f12 + fxcxnpma f1, f1, AP, f13 + fxcxnpma f8, f8, AP, f14 + fxcxnpma f9, f9, AP, f15 + + STFPDUX f0, CO1, INC2 + STFPDUX f1, CO1, INC2 + + STFPDUX f8, CO2, INC2 + STFPDUX f9, CO2, INC2 +#endif + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L30: + andi. I, M, 1 + beq .L49 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 +#else + slwi TEMP, KK, 0 + ZBASE_SHIFT + slwi r0, KK, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, BO, - 4 * SIZE + fpmr f2, f0 + addi BO2, BO, 2 * SIZE + fpmr f3, f0 +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L34 +#else + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 + + srawi. r0, K, 2 + mtspr CTR, r0 + ble .L34 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L33 + .align 4 + +.L32: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX B1, BO, INC4 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + LFPDUX B2, BO2, INC4 + LFPDUX A1, AO, INC4 + + FXCPMADD f0, B3, A2, f0 + FXCSMADD f1, B3, A2, f1 + LFPDUX B3, BO, INC4 + FXCPMADD f2, B4, A2, f2 + FXCSMADD f3, B4, A2, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A2, AO2, INC4 + + FXCPMADD f0, A5, A3, f0 + FXCSMADD f1, A5, A3, f1 + LFPDUX A5, BO, INC4 + FXCPMADD f2, A6, A3, f2 + FXCSMADD f3, A6, A3, f3 + LFPDUX A6, BO2, INC4 + LFPDUX A3, AO, INC4 + + FXCPMADD f0, A7, A4, f0 + FXCSMADD f1, A7, A4, f1 + LFPDUX A7, BO, INC4 + FXCPMADD f2, A8, A4, f2 + FXCSMADD f3, A8, A4, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L32 + .align 4 + +.L33: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + + FXCPMADD f0, B3, A2, f0 + FXCSMADD f1, B3, A2, f1 + FXCPMADD f2, B4, A2, f2 + FXCSMADD f3, B4, A2, f3 + + FXCPMADD f0, A5, A3, f0 + FXCSMADD f1, A5, A3, f1 + FXCPMADD f2, A6, A3, f2 + FXCSMADD f3, A6, A3, f3 + + FXCPMADD f0, A7, A4, f0 + FXCSMADD f1, A7, A4, f1 + FXCPMADD f2, A8, A4, f2 + FXCSMADD f3, A8, A4, f3 + .align 4 + +.L34: + li r0, ALPHA + lfpdx AP, SP, r0 +#ifdef TRMMKERNEL + li r0, FZERO + lfpsx f30, SP, r0 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + andi. r0, TEMP, 3 + mtspr CTR, r0 +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L38 + + LFPDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdz- .L37 + .align 4 + +.L36: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX B1, BO, INC4 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + LFPDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdnz+ .L36 + .align 4 + +.L37: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + .align 4 + +.L38: +#ifndef TRMMKERNEL + LFPDX A1, CO1, INC2 + LFPDX A2, CO2, INC2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) + fpadd f0, f0, f1 + fpadd f2, f2, f3 +#else + fpsub f0, f0, f1 + fpsub f2, f2, f3 +#endif + +#ifndef TRMMKERNEL + fxcpmadd A1, f0, AP, A1 + fxcpmadd A2, f2, AP, A2 + fxcxnpma f0, f0, AP, A1 + fxcxnpma f2, f2, AP, A2 +#else + fxcpmadd f12, f0, AP, f30 + fxcpmadd f13, f2, AP, f30 + fxcxnpma f0, f0, AP, f12 + fxcxnpma f2, f2, AP, f13 +#endif + + STFPDUX f0, CO1, INC2 + STFPDUX f2, CO2, INC2 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 2 +#endif + + addi B, BO, 4 * SIZE + + addic. J, J, -1 + bgt+ .L10 + .align 4 + +.L50: + andi. J, N, 1 + beq .L999 + + mr CO1, C + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + addi AO, A, -2 * SIZE + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 2 + ble .L60 + .align 4 + +.L51: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + fpmr f4, f0 + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 +#else + slwi TEMP, KK, 2 + ZBASE_SHIFT + slwi r0, KK, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + fpmr f4, f0 + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + srawi. r0, TEMP, 2 + fpmr f3, f0 + mtspr CTR, r0 + fpmr f7, f0 + ble .L54 +#else + srawi. r0, K, 2 + fpmr f4, f0 + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + fpmr f3, f0 + mtspr CTR, r0 + fpmr f7, f0 + ble .L54 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L53 + .align 4 + +.L52: + FXCPMADD f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B2, A5, f0 + LFPDUX B1, BO, INC2 + FXCSMADD f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B2, A6, f1 + nop + FXCSMADD f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B2, A7, f2 + nop + FXCSMADD f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B2, A8, f3 + nop + FXCSMADD f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + FXCPMADD f0, B3, A1, f0 + LFPDUX B2, BO, INC2 + FXCSMADD f4, B3, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B3, A3, f2 + nop + FXCSMADD f6, B3, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B4, A5, f0 + LFPDUX B3, BO, INC2 + FXCSMADD f4, B4, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B4, A6, f1 + nop + FXCSMADD f5, B4, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B4, A7, f2 + nop + FXCSMADD f6, B4, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B4, A8, f3 + nop + FXCSMADD f7, B4, A8, f7 + LFPDUX A8, AO, INC2 + bdnz+ .L52 + .align 4 + +.L53: + FXCPMADD f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B2, A5, f0 + nop + FXCSMADD f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B2, A6, f1 + nop + FXCSMADD f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B2, A7, f2 + nop + FXCSMADD f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B2, A8, f3 + nop + FXCSMADD f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + FXCPMADD f0, B3, A1, f0 + FXCSMADD f4, B3, A1, f4 + FXCPMADD f1, B3, A2, f1 + FXCSMADD f5, B3, A2, f5 + + FXCPMADD f2, B3, A3, f2 + FXCSMADD f6, B3, A3, f6 + FXCPMADD f3, B3, A4, f3 + FXCSMADD f7, B3, A4, f7 + + FXCPMADD f0, B4, A5, f0 + FXCSMADD f4, B4, A5, f4 + FXCPMADD f1, B4, A6, f1 + FXCSMADD f5, B4, A6, f5 + + FXCPMADD f2, B4, A7, f2 + FXCSMADD f6, B4, A7, f6 + FXCPMADD f3, B4, A8, f3 + FXCSMADD f7, B4, A8, f7 + .align 4 + +.L54: + li r0, ALPHA + lfpdx AP, SP, r0 +#ifdef TRMMKERNEL + li r0, FZERO + lfpsx f30, SP, r0 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + andi. r0, TEMP, 3 + mtspr CTR, r0 +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L58 + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + bdz- .L57 + .align 4 + +.L56: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L56 + .align 4 + +.L57: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + + FXCPMADD f2, B1, A3, f2 + FXCSMADD f6, B1, A3, f6 + FXCPMADD f3, B1, A4, f3 + FXCSMADD f7, B1, A4, f7 + .align 4 + +.L58: +#ifndef TRMMKERNEL + LFPDUX A1, CO1, INC2 + LFPDUX A2, CO1, INC2 + LFPDUX A3, CO1, INC2 + LFPDUX A4, CO1, INC2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) + fpadd f0, f0, f4 + fpadd f1, f1, f5 + fpadd f2, f2, f6 + fpadd f3, f3, f7 +#else + fpsub f0, f0, f4 + fpsub f1, f1, f5 + fpsub f2, f2, f6 + fpsub f3, f3, f7 +#endif + +#ifndef TRMMKERNEL + fxcpmadd A1, f0, AP, A1 + fxcpmadd A2, f1, AP, A2 + fxcpmadd A3, f2, AP, A3 + fxcpmadd A4, f3, AP, A4 + + fxcxnpma f0, f0, AP, A1 + fxcxnpma f1, f1, AP, A2 + fxcxnpma f2, f2, AP, A3 + fxcxnpma f3, f3, AP, A4 + + STFPDUX f0, CO1, INCM7 + STFPDUX f1, CO1, INC2 + STFPDUX f2, CO1, INC2 + STFPDUX f3, CO1, INC2 +#else + fxcpmadd f12, f0, AP, f30 + fxcpmadd f13, f1, AP, f30 + fxcpmadd f14, f2, AP, f30 + fxcpmadd f15, f3, AP, f30 + + fxcxnpma f0, f0, AP, f12 + fxcxnpma f1, f1, AP, f13 + fxcxnpma f2, f2, AP, f14 + fxcxnpma f3, f3, AP, f15 + + STFPDUX f0, CO1, INC2 + STFPDUX f1, CO1, INC2 + STFPDUX f2, CO1, INC2 + STFPDUX f3, CO1, INC2 +#endif + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 2 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L51 + .align 4 + +.L60: + andi. I, M, 2 + beq .L70 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 +#else + slwi TEMP, KK, 1 + ZBASE_SHIFT + slwi r0, KK, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + + srawi. r0, TEMP, 2 + fpmr f2, f0 + mtspr CTR, r0 + fpmr f3, f0 + ble .L64 + +#else + srawi. r0, K, 2 + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + mtspr CTR, r0 + fpmr f3, f0 + ble .L64 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B4, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L63 + .align 4 + +.L62: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + FXCPMADD f0, B2, A3, f0 + FXCSMADD f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + FXCPMADD f1, B2, A4, f1 + FXCSMADD f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + FXCPMADD f0, B3, A5, f0 + FXCSMADD f2, B3, A5, f2 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B3, A6, f1 + FXCSMADD f3, B3, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + FXCPMADD f0, B4, A7, f0 + FXCSMADD f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + FXCPMADD f1, B4, A8, f1 + FXCSMADD f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L62 + .align 4 + +.L63: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + + FXCPMADD f0, B2, A3, f0 + FXCSMADD f2, B2, A3, f2 + FXCPMADD f1, B2, A4, f1 + FXCSMADD f3, B2, A4, f3 + + FXCPMADD f0, B3, A5, f0 + FXCSMADD f2, B3, A5, f2 + FXCPMADD f1, B3, A6, f1 + FXCSMADD f3, B3, A6, f3 + + FXCPMADD f0, B4, A7, f0 + FXCSMADD f2, B4, A7, f2 + FXCPMADD f1, B4, A8, f1 + FXCSMADD f3, B4, A8, f3 + .align 4 + +.L64: + li r0, ALPHA + lfpdx AP, SP, r0 +#ifdef TRMMKERNEL + li r0, FZERO + lfpsx f30, SP, r0 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + andi. r0, TEMP, 3 + mtspr CTR, r0 +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L68 + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdz- .L67 + .align 4 + +.L66: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdnz+ .L66 + .align 4 + +.L67: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + .align 4 + +.L68: +#ifndef TRMMKERNEL + LFPDUX A1, CO1, INC2 + LFPDUX A2, CO1, INC2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) + fpadd f0, f0, f2 + fpadd f1, f1, f3 +#else + fpsub f0, f0, f2 + fpsub f1, f1, f3 +#endif + +#ifndef TRMMKERNEL + fxcpmadd A1, f0, AP, A1 + fxcpmadd A2, f1, AP, A2 + fxcxnpma f0, f0, AP, A1 + fxcxnpma f1, f1, AP, A2 + + STFPDUX f0, CO1, INCM3 + STFPDUX f1, CO1, INC2 +#else + fxcpmadd f12, f0, AP, f30 + fxcpmadd f13, f1, AP, f30 + fxcxnpma f0, f0, AP, f12 + fxcxnpma f1, f1, AP, f13 + + STFPDUX f0, CO1, INC2 + STFPDUX f1, CO1, INC2 +#endif + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L70: + andi. I, M, 1 + beq .L89 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 +#else + slwi TEMP, KK, 0 + ZBASE_SHIFT + slwi r0, KK, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + srawi. r0, TEMP, 3 + fpmr f2, f0 + mtspr CTR, r0 + fpmr f3, f0 + ble .L74 +#else + addi BO, B, - 2 * SIZE + fpmr f1, f0 + srawi. r0, K, 3 + fpmr f2, f0 + mtspr CTR, r0 + fpmr f3, f0 + ble .L74 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdz- .L73 + .align 4 + +.L72: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + FXCPMADD f2, B2, A2, f2 + FXCSMADD f3, B2, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f1, B3, A3, f1 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + FXCPMADD f2, B4, A4, f2 + FXCSMADD f3, B4, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f1, B5, A5, f1 + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + FXCPMADD f2, B6, A6, f2 + FXCSMADD f3, B6, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f1, A9, A7, f1 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + FXCPMADD f2, A10, A8, f2 + FXCSMADD f3, A10, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + + bdnz+ .L72 + .align 4 + +.L73: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A2, f2 + FXCSMADD f3, B2, A2, f3 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f1, B3, A3, f1 + FXCPMADD f2, B4, A4, f2 + FXCSMADD f3, B4, A4, f3 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f1, B5, A5, f1 + FXCPMADD f2, B6, A6, f2 + FXCSMADD f3, B6, A6, f3 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f1, A9, A7, f1 + FXCPMADD f2, A10, A8, f2 + FXCSMADD f3, A10, A8, f3 + .align 4 + +.L74: + li r0, ALPHA + lfpdx AP, SP, r0 +#ifdef TRMMKERNEL + li r0, FZERO + lfpsx f30, SP, r0 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + andi. r0, TEMP, 7 + mtspr CTR, r0 +#else + andi. r0, K, 7 + mtspr CTR, r0 +#endif + ble+ .L78 + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdz- .L77 + .align 4 + +.L76: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L76 + .align 4 + +.L77: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + .align 4 + +.L78: +#ifndef TRMMKERNEL + LFPDX A1, CO1, INC2 +#endif + + fpadd f0, f0, f2 + fpadd f1, f1, f3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) + fpadd f0, f0, f1 +#else + fpsub f0, f0, f1 +#endif + +#ifndef TRMMKERNEL + fxcpmadd A1, f0, AP, A1 + fxcxnpma f0, f0, AP, A1 +#else + fxcpmadd f12, f0, AP, f30 + fxcxnpma f0, f0, AP, f12 +#endif + + STFPDUX f0, CO1, INC2 + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L89: + addi B, BO, 2 * SIZE + .align 4 + +.L999: + addi SP, SP, 20 + + lwzu r14, 4(SP) + lwzu r15, 4(SP) + + lwzu r16, 4(SP) + lwzu r17, 4(SP) + lwzu r18, 4(SP) + lwzu r19, 4(SP) + + lwzu r20, 4(SP) + lwzu r21, 4(SP) + lwzu r22, 4(SP) + lwzu r23, 4(SP) + + lwzu r24, 4(SP) + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f31, SP, r0 + lfpdux f30, SP, r0 + lfpdux f29, SP, r0 + lfpdux f28, SP, r0 + lfpdux f27, SP, r0 + lfpdux f26, SP, r0 + lfpdux f25, SP, r0 + lfpdux f24, SP, r0 + lfpdux f23, SP, r0 + lfpdux f22, SP, r0 + lfpdux f21, SP, r0 + lfpdux f20, SP, r0 + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + .align 4 + + +.L1000: + li INC, 1 * SIZE + li INC2, 2 * SIZE + li INC4, 4 * SIZE + li INCM1, -1 * SIZE + li INCM3, -3 * SIZE + li INCM5, -5 * SIZE + li INCM7, -7 * SIZE + + addi C, C, - 1 * SIZE + srawi. J, N, 1 + ble .L1050 + .align 4 + +.L1010: + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + addi AO, A, -4 * SIZE + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 2 + ble .L1020 + .align 4 + +.L1011: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 +#else + slwi TEMP, KK, 2 + ZBASE_SHIFT + slwi r0, KK, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + srawi. r0, TEMP, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L1014 +#else + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, K, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L1014 +#endif + + LFPDUX A1, AO, INC4 + fpmr f5, f0 + LFPDUX A3, AO, INC4 + fpmr f9, f0 + LFPDUX B1, BO, INC4 + fpmr f13, f0 + + LFPDUX A5, AO, INC4 + fpmr f2, f0 + LFPDUX A6, AO, INC4 + fpmr f6, f0 + LFPDUX B3, BO, INC4 + fpmr f10, f0 + LFPDUX A7, AO, INC4 + fpmr f14, f0 + + LFPDUX A8, AO, INC4 + fpmr f3, f0 + LFPDUX B5, BO, INC4 + fpmr f7, f0 + LFPDUX A9, AO, INC4 + fpmr f11, f0 + LFPDUX A2, AO2, INC4 + fpmr f15, f0 + LFPDUX B2, BO2, INC4 + bdz- .L1013 + .align 4 + +.L1012: + +## 1 ## + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + nop + FXCPMADD f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + nop + FXCPMADD f10, B2, A3, f10 + nop + FXCSMADD f14, B2, A3, f14 + nop + + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + LFPDUX A1, AO, INC4 + FXCSMADD f15, B2, A4, f15 + nop + +## 2 ## + + FXCPMADD f0, B3, A5, f0 + nop + FXCSMADD f4, B3, A5, f4 + nop + FXCPMADD f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A5, f12 + LFPDUX B1, BO, INC4 + + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + LFPDUX A3, AO, INC4 + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B3, A6, f2 + nop + FXCSMADD f6, B3, A6, f6 + nop + FXCPMADD f10, B4, A6, f10 + nop + FXCSMADD f14, B4, A6, f14 + nop + + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B4, A4, f11 + LFPDUX A5, AO, INC4 + FXCSMADD f15, B4, A4, f15 + nop + +## 3 ## + + FXCPMADD f0, B5, A7, f0 + nop + FXCSMADD f4, B5, A7, f4 + nop + FXCPMADD f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A7, f12 + LFPDUX B3, BO, INC4 + + FXCPMADD f1, B5, A2, f1 + nop + FXCSMADD f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A6, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B5, A8, f2 + nop + FXCSMADD f6, B5, A8, f6 + nop + FXCPMADD f10, B2, A8, f10 + nop + FXCSMADD f14, B2, A8, f14 + nop + + FXCPMADD f3, B5, A4, f3 + nop + FXCSMADD f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + LFPDUX A7, AO, INC4 + FXCSMADD f15, B2, A4, f15 + nop + +## 4 ## + FXCPMADD f0, B6, A9, f0 + nop + FXCSMADD f4, B6, A9, f4 + nop + FXCPMADD f8, B4, A9, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A9, f12 + LFPDUX B5, BO, INC4 + + FXCPMADD f1, B6, A2, f1 + nop + FXCSMADD f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + LFPDUX A8, AO, INC4 + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B6, A10, f2 + nop + FXCSMADD f6, B6, A10, f6 + nop + FXCPMADD f10, B4, A10, f10 + nop + FXCSMADD f14, B4, A10, f14 + nop + + FXCPMADD f3, B6, A4, f3 + LFPDUX A2, AO2, INC4 + FXCSMADD f7, B6, A4, f7 + LFPDUX A9, AO, INC4 + FXCPMADD f11, B4, A4, f11 + nop + FXCSMADD f15, B4, A4, f15 + bdnz+ .L1012 + .align 4 + +.L1013: +## 1 ## + + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + nop + FXCPMADD f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + nop + FXCPMADD f10, B2, A3, f10 + nop + FXCSMADD f14, B2, A3, f14 + nop + + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC +#else + nop +#endif + FXCSMADD f15, B2, A4, f15 + nop + +## 2 ## + + FXCPMADD f0, B3, A5, f0 + nop + FXCSMADD f4, B3, A5, f4 + nop + FXCPMADD f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A5, f12 +#ifndef TRMMKERNEL + LFDUX B1, CO1, INC2 +#else + nop +#endif + + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 +#ifndef TRMMKERNEL + LFDUX A3, CO1, INC2 +#else + nop +#endif + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B3, A6, f2 + nop + FXCSMADD f6, B3, A6, f6 + nop + FXCPMADD f10, B4, A6, f10 + nop + FXCSMADD f14, B4, A6, f14 + nop + + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B4, A4, f11 +#ifndef TRMMKERNEL + LFDUX A5, CO1, INC2 +#else + nop +#endif + FXCSMADD f15, B4, A4, f15 + nop + +## 3 ## + + FXCPMADD f0, B5, A7, f0 + nop + FXCSMADD f4, B5, A7, f4 + nop + FXCPMADD f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A7, f12 +#ifndef TRMMKERNEL + LFSDUX A1, CO1, INCM5 +#else + nop +#endif + + FXCPMADD f1, B5, A2, f1 + nop + FXCSMADD f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 +#ifndef TRMMKERNEL + LFSDUX B1, CO1, INC2 +#else + nop +#endif + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B5, A8, f2 + nop + FXCSMADD f6, B5, A8, f6 + nop + FXCPMADD f10, B2, A8, f10 + nop + FXCSMADD f14, B2, A8, f14 + nop + + FXCPMADD f3, B5, A4, f3 + nop + FXCSMADD f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 +#ifndef TRMMKERNEL + LFSDUX A3, CO1, INC2 +#else + nop +#endif + FXCSMADD f15, B2, A4, f15 + nop + +## 4 ## + + FXCPMADD f0, B6, A9, f0 + nop + FXCSMADD f4, B6, A9, f4 + nop + FXCPMADD f8, B4, A9, f8 +#ifndef TRMMKERNEL + LFSDUX A5, CO1, INC2 +#else + nop +#endif + FXCSMADD f12, B4, A9, f12 +#ifndef TRMMKERNEL + LFDUX B3, CO2, INC +#else + nop +#endif + + FXCPMADD f1, B6, A2, f1 + nop + FXCSMADD f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 +#ifndef TRMMKERNEL + LFDUX A6, CO2, INC2 +#else + nop +#endif + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B6, A10, f2 + nop + FXCSMADD f6, B6, A10, f6 + nop + FXCPMADD f10, B4, A10, f10 + nop + FXCSMADD f14, B4, A10, f14 +#ifndef TRMMKERNEL + LFDUX A7, CO2, INC2 +#else + nop +#endif + + FXCPMADD f3, B6, A4, f3 + nop + FXCSMADD f7, B6, A4, f7 + nop + FXCPMADD f11, B4, A4, f11 + nop + FXCSMADD f15, B4, A4, f15 +#ifndef TRMMKERNEL + LFDUX B2, CO2, INC2 +#else + nop +#endif + .align 4 + +.L1014: + li r0, ALPHA + lfpdx AP, SP, r0 +#ifdef TRMMKERNEL + li r0, FZERO + lfpsx f30, SP, r0 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L1018 + + cmpwi cr0, TEMP, 3 + bgt+ .L1015 +#else + andi. r0, K, 3 + mtspr CTR, r0 + ble+ .L1018 + + cmpwi cr0, K, 3 + bgt+ .L1015 +#endif + +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + fpmr f5, f0 + LFDUX B1, CO1, INC2 + fpmr f9, f0 + LFDUX A3, CO1, INC2 + fpmr f13, f0 + LFDUX A5, CO1, INC2 + fpmr f2, f0 + + LFSDUX A1, CO1, INCM5 + fpmr f6, f0 + LFSDUX B1, CO1, INC2 + fpmr f10, f0 + LFSDUX A3, CO1, INC2 + fpmr f14, f0 + LFSDUX A5, CO1, INC2 + fpmr f3, f0 + + LFDUX B3, CO2, INC + fpmr f7, f0 + LFDUX A6, CO2, INC2 + fpmr f11, f0 + LFDUX A7, CO2, INC2 + fpmr f15, f0 + LFDUX B2, CO2, INC2 +#else + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 +#endif + .align 4 + +.L1015: + LFPDUX A2, AO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A10, BO, INC4 + LFPDUX B4, BO2, INC4 + bdz- .L1017 + .align 4 + +.L1016: + FXCPMADD f0, A10, A2, f0 + FXCSMADD f4, A10, A2, f4 + FXCPMADD f8, B4, A2, f8 + FXCSMADD f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + FXCPMADD f1, A10, A4, f1 + FXCSMADD f5, A10, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + FXCPMADD f2, A10, A2, f2 + FXCSMADD f6, A10, A2, f6 + FXCPMADD f10, B4, A2, f10 + FXCSMADD f14, B4, A2, f14 + LFPDUX A2, AO, INC4 + + FXCPMADD f3, A10, A4, f3 + FXCSMADD f7, A10, A4, f7 + LFPDUX A10, BO, INC4 + FXCPMADD f11, B4, A4, f11 + FXCSMADD f15, B4, A4, f15 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + bdnz+ .L1016 + .align 4 + +.L1017: + FXCPMADD f0, A10, A2, f0 + FXCSMADD f4, A10, A2, f4 + FXCPMADD f8, B4, A2, f8 + FXCSMADD f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + FXCPMADD f1, A10, A4, f1 + FXCSMADD f5, A10, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + FXCPMADD f2, A10, A2, f2 + FXCSMADD f6, A10, A2, f6 + FXCPMADD f10, B4, A2, f10 + FXCSMADD f14, B4, A2, f14 + + FXCPMADD f3, A10, A4, f3 + FXCSMADD f7, A10, A4, f7 + FXCPMADD f11, B4, A4, f11 + FXCSMADD f15, B4, A4, f15 + .align 4 + +.L1018: +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) + fpadd f0, f0, f4 + fpadd f8, f8, f12 + fpadd f1, f1, f5 + fpadd f9, f9, f13 + + fpadd f2, f2, f6 + fpadd f10, f10, f14 + fpadd f3, f3, f7 + fpadd f11, f11, f15 +#else + fpsub f0, f0, f4 + fpsub f8, f8, f12 + fpsub f1, f1, f5 + fpsub f9, f9, f13 + + fpsub f2, f2, f6 + fpsub f10, f10, f14 + fpsub f3, f3, f7 + fpsub f11, f11, f15 +#endif + +#ifndef TRMMKERNEL + fxcpmadd A1, f0, AP, A1 + LFSDUX B3, CO2, INCM5 + fxcpmadd B1, f1, AP, B1 + LFSDUX A6, CO2, INC2 + fxcpmadd A3, f2, AP, A3 + LFSDUX A7, CO2, INC2 + fxcpmadd A5, f3, AP, A5 + LFSDUX B2, CO2, INC2 + + fxcxnpma f0, f0, AP, A1 + fxcpmadd B3, f8, AP, B3 + fxcxnpma f1, f1, AP, B1 + fxcpmadd A6, f9, AP, A6 + fxcxnpma f2, f2, AP, A3 + fxcpmadd A7, f10, AP, A7 + + fxcxnpma f3, f3, AP, A5 + STFDUX f0, CO1, INCM7 + fxcpmadd B2, f11, AP, B2 + STFSDUX f0, CO1, INC + fxcxnpma f8, f8, AP, B3 + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + fxcxnpma f9, f9, AP, A6 + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + fxcxnpma f10, f10, AP, A7 + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + + fxcxnpma f11, f11, AP, B2 + STFDUX f8, CO2, INCM7 +#else + fxcpmadd f12, f0, AP, f30 + fxcpmadd f13, f1, AP, f30 + fxcpmadd f14, f2, AP, f30 + fxcpmadd f15, f3, AP, f30 + + fxcxnpma f0, f0, AP, f12 + fxcxnpma f1, f1, AP, f13 + fxcxnpma f2, f2, AP, f14 + fxcxnpma f3, f3, AP, f15 + + fxcpmadd f16, f8, AP, f30 + fxcpmadd f17, f9, AP, f30 + fxcpmadd f18, f10, AP, f30 + fxcpmadd f19, f11, AP, f30 + + fxcxnpma f8, f8, AP, f16 + fxcxnpma f9, f9, AP, f17 + fxcxnpma f10, f10, AP, f18 + fxcxnpma f11, f11, AP, f19 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + STFDUX f8, CO2, INC +#endif + STFSDUX f8, CO2, INC + STFDUX f9, CO2, INC + STFSDUX f9, CO2, INC + STFDUX f10, CO2, INC + STFSDUX f10, CO2, INC + + STFDUX f11, CO2, INC + STFSDUX f11, CO2, INC + + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 2 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L1011 + .align 4 + +.L1020: + andi. I, M, 2 + beq .L1030 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 +#else + slwi TEMP, KK, 1 + ZBASE_SHIFT + slwi r0, KK, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + srawi. r0, TEMP, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L1024 +#else + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, K, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L1024 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX B3, BO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A5, AO, INC4 + LFPDUX B5, BO, INC4 + LFPDUX A6, AO2, INC4 + LFPDUX B6, BO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A9, BO, INC4 + LFPDUX A10, BO2, INC4 + bdz- .L1023 + .align 4 + +.L1022: + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + FXCPMADD f8, B2, A1, f8 + nop + FXCSMADD f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + FXCPMADD f9, B2, A2, f9 + nop + FXCSMADD f13, B2, A2, f13 + LFPDUX B2, BO2, INC4 + + FXCPMADD f0, B3, A3, f0 + nop + FXCSMADD f4, B3, A3, f4 + LFPDUX A2, AO2, INC4 + FXCPMADD f8, B4, A3, f8 + nop + FXCSMADD f12, B4, A3, f12 + LFPDUX A3, AO, INC4 + + FXCPMADD f1, B3, A4, f1 + nop + FXCSMADD f5, B3, A4, f5 + LFPDUX B3, BO, INC4 + FXCPMADD f9, B4, A4, f9 + nop + FXCSMADD f13, B4, A4, f13 + LFPDUX B4, BO2, INC4 + + FXCPMADD f0, B5, A5, f0 + nop + FXCSMADD f4, B5, A5, f4 + LFPDUX A4, AO2, INC4 + FXCPMADD f8, B6, A5, f8 + nop + FXCSMADD f12, B6, A5, f12 + LFPDUX A5, AO, INC4 + + FXCPMADD f1, B5, A6, f1 + nop + FXCSMADD f5, B5, A6, f5 + LFPDUX B5, BO, INC4 + FXCPMADD f9, B6, A6, f9 + nop + FXCSMADD f13, B6, A6, f13 + LFPDUX B6, BO2, INC4 + + FXCPMADD f0, A9, A7, f0 + nop + FXCSMADD f4, A9, A7, f4 + LFPDUX A6, AO2, INC4 + FXCPMADD f8, A10, A7, f8 + nop + FXCSMADD f12, A10, A7, f12 + LFPDUX A7, AO, INC4 + + FXCPMADD f1, A9, A8, f1 + nop + FXCSMADD f5, A9, A8, f5 + LFPDUX A9, BO, INC4 + FXCPMADD f9, A10, A8, f9 + nop + FXCSMADD f13, A10, A8, f13 + LFPDUX A10, BO2, INC4 + bdnz+ .L1022 + .align 4 + +.L1023: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f4, B3, A3, f4 + FXCPMADD f8, B4, A3, f8 + FXCSMADD f12, B4, A3, f12 + + FXCPMADD f1, B3, A4, f1 + FXCSMADD f5, B3, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f4, B5, A5, f4 + FXCPMADD f8, B6, A5, f8 + FXCSMADD f12, B6, A5, f12 + + FXCPMADD f1, B5, A6, f1 + FXCSMADD f5, B5, A6, f5 + FXCPMADD f9, B6, A6, f9 + FXCSMADD f13, B6, A6, f13 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f4, A9, A7, f4 + FXCPMADD f8, A10, A7, f8 + FXCSMADD f12, A10, A7, f12 + + FXCPMADD f1, A9, A8, f1 + FXCSMADD f5, A9, A8, f5 + FXCPMADD f9, A10, A8, f9 + FXCSMADD f13, A10, A8, f13 + .align 4 + +.L1024: + li r0, ALPHA + lfpdx AP, SP, r0 +#ifdef TRMMKERNEL + li r0, FZERO + lfpsx f30, SP, r0 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + andi. r0, TEMP, 3 + mtspr CTR, r0 +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L1028 + + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + bdz- .L1027 + .align 4 + +.L1026: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + bdnz+ .L1026 + .align 4 + +.L1027: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + .align 4 + +.L1028: +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + LFDUX A2, CO1, INC2 + LFDUX A3, CO2, INC + LFDUX A4, CO2, INC2 + + LFSDUX A1, CO1, INCM1 + LFSDUX A2, CO1, INC2 + LFSDUX A3, CO2, INCM1 + LFSDUX A4, CO2, INC2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) + fpadd f0, f0, f4 + fpadd f8, f8, f12 + fpadd f1, f1, f5 + fpadd f9, f9, f13 +#else + fpsub f0, f0, f4 + fpsub f8, f8, f12 + fpsub f1, f1, f5 + fpsub f9, f9, f13 +#endif + +#ifndef TRMMKERNEL + fxcpmadd A1, f0, AP, A1 + fxcpmadd A2, f1, AP, A2 + fxcpmadd A3, f8, AP, A3 + fxcpmadd A4, f9, AP, A4 + + fxcxnpma f0, f0, AP, A1 + fxcxnpma f1, f1, AP, A2 + fxcxnpma f8, f8, AP, A3 + fxcxnpma f9, f9, AP, A4 + + STFDUX f0, CO1, INCM3 + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + STFDUX f8, CO2, INCM3 + STFSDUX f8, CO2, INC + STFDUX f9, CO2, INC + STFSDUX f9, CO2, INC +#else + fxcpmadd f12, f0, AP, f30 + fxcpmadd f13, f1, AP, f30 + fxcpmadd f14, f8, AP, f30 + fxcpmadd f15, f9, AP, f30 + + fxcxnpma f0, f0, AP, f12 + fxcxnpma f1, f1, AP, f13 + fxcxnpma f8, f8, AP, f14 + fxcxnpma f9, f9, AP, f15 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + STFDUX f8, CO2, INC + STFSDUX f8, CO2, INC + STFDUX f9, CO2, INC + STFSDUX f9, CO2, INC +#endif + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L1030: + andi. I, M, 1 + beq .L1049 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 +#else + slwi TEMP, KK, 0 + ZBASE_SHIFT + slwi r0, KK, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, BO, - 4 * SIZE + fpmr f2, f0 + addi BO2, BO, 2 * SIZE + fpmr f3, f0 +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L1034 +#else + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 + + srawi. r0, K, 2 + mtspr CTR, r0 + ble .L1034 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L1033 + .align 4 + +.L1032: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX B1, BO, INC4 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + LFPDUX B2, BO2, INC4 + LFPDUX A1, AO, INC4 + + FXCPMADD f0, B3, A2, f0 + FXCSMADD f1, B3, A2, f1 + LFPDUX B3, BO, INC4 + FXCPMADD f2, B4, A2, f2 + FXCSMADD f3, B4, A2, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A2, AO2, INC4 + + FXCPMADD f0, A5, A3, f0 + FXCSMADD f1, A5, A3, f1 + LFPDUX A5, BO, INC4 + FXCPMADD f2, A6, A3, f2 + FXCSMADD f3, A6, A3, f3 + LFPDUX A6, BO2, INC4 + LFPDUX A3, AO, INC4 + + FXCPMADD f0, A7, A4, f0 + FXCSMADD f1, A7, A4, f1 + LFPDUX A7, BO, INC4 + FXCPMADD f2, A8, A4, f2 + FXCSMADD f3, A8, A4, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L1032 + .align 4 + +.L1033: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + + FXCPMADD f0, B3, A2, f0 + FXCSMADD f1, B3, A2, f1 + FXCPMADD f2, B4, A2, f2 + FXCSMADD f3, B4, A2, f3 + + FXCPMADD f0, A5, A3, f0 + FXCSMADD f1, A5, A3, f1 + FXCPMADD f2, A6, A3, f2 + FXCSMADD f3, A6, A3, f3 + + FXCPMADD f0, A7, A4, f0 + FXCSMADD f1, A7, A4, f1 + FXCPMADD f2, A8, A4, f2 + FXCSMADD f3, A8, A4, f3 + .align 4 + +.L1034: + li r0, ALPHA + lfpdx AP, SP, r0 +#ifdef TRMMKERNEL + li r0, FZERO + lfpsx f30, SP, r0 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + andi. r0, TEMP, 3 + mtspr CTR, r0 +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L1038 + + LFPDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdz- .L1037 + .align 4 + +.L1036: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX B1, BO, INC4 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + LFPDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdnz+ .L1036 + .align 4 + +.L1037: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + .align 4 + +.L1038: +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + LFDUX A2, CO2, INC + LFSDUX A1, CO1, INC + LFSDUX A2, CO2, INC +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) + fpadd f0, f0, f1 + fpadd f2, f2, f3 +#else + fpsub f0, f0, f1 + fpsub f2, f2, f3 +#endif + +#ifndef TRMMKERNEL + fxcpmadd A1, f0, AP, A1 + fxcpmadd A2, f2, AP, A2 + fxcxnpma f0, f0, AP, A1 + fxcxnpma f2, f2, AP, A2 + + STFDUX f0, CO1, INCM1 + STFSDUX f0, CO1, INC + + STFDUX f2, CO2, INCM1 + STFSDUX f2, CO2, INC +#else + fxcpmadd f12, f0, AP, f30 + fxcpmadd f13, f2, AP, f30 + fxcxnpma f0, f0, AP, f12 + fxcxnpma f2, f2, AP, f13 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + + STFDUX f2, CO2, INC + STFSDUX f2, CO2, INC +#endif + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L1049: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 2 +#endif + + addi B, BO, 4 * SIZE + + addic. J, J, -1 + bgt+ .L1010 + .align 4 + +.L1050: + andi. J, N, 1 + beq .L10999 + + mr CO1, C + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + addi AO, A, -2 * SIZE + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 2 + ble .L1060 + .align 4 + +.L1051: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + fpmr f4, f0 + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 +#else + slwi TEMP, KK, 2 + ZBASE_SHIFT + slwi r0, KK, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + fpmr f4, f0 + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + srawi. r0, TEMP, 2 + fpmr f3, f0 + mtspr CTR, r0 + fpmr f7, f0 + ble .L1054 +#else + srawi. r0, K, 2 + fpmr f4, f0 + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + fpmr f3, f0 + mtspr CTR, r0 + fpmr f7, f0 + ble .L1054 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L1053 + .align 4 + +.L1052: + FXCPMADD f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B2, A5, f0 + LFPDUX B1, BO, INC2 + FXCSMADD f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B2, A6, f1 + nop + FXCSMADD f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B2, A7, f2 + nop + FXCSMADD f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B2, A8, f3 + nop + FXCSMADD f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + FXCPMADD f0, B3, A1, f0 + LFPDUX B2, BO, INC2 + FXCSMADD f4, B3, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B3, A3, f2 + nop + FXCSMADD f6, B3, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B4, A5, f0 + LFPDUX B3, BO, INC2 + FXCSMADD f4, B4, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B4, A6, f1 + nop + FXCSMADD f5, B4, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B4, A7, f2 + nop + FXCSMADD f6, B4, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B4, A8, f3 + nop + FXCSMADD f7, B4, A8, f7 + LFPDUX A8, AO, INC2 + bdnz+ .L1052 + .align 4 + +.L1053: + FXCPMADD f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B2, A5, f0 + nop + FXCSMADD f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B2, A6, f1 + nop + FXCSMADD f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B2, A7, f2 + nop + FXCSMADD f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B2, A8, f3 + nop + FXCSMADD f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + FXCPMADD f0, B3, A1, f0 + FXCSMADD f4, B3, A1, f4 + FXCPMADD f1, B3, A2, f1 + FXCSMADD f5, B3, A2, f5 + + FXCPMADD f2, B3, A3, f2 + FXCSMADD f6, B3, A3, f6 + FXCPMADD f3, B3, A4, f3 + FXCSMADD f7, B3, A4, f7 + + FXCPMADD f0, B4, A5, f0 + FXCSMADD f4, B4, A5, f4 + FXCPMADD f1, B4, A6, f1 + FXCSMADD f5, B4, A6, f5 + + FXCPMADD f2, B4, A7, f2 + FXCSMADD f6, B4, A7, f6 + FXCPMADD f3, B4, A8, f3 + FXCSMADD f7, B4, A8, f7 + .align 4 + +.L1054: + li r0, ALPHA + lfpdx AP, SP, r0 +#ifdef TRMMKERNEL + li r0, FZERO + lfpsx f30, SP, r0 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + andi. r0, TEMP, 3 + mtspr CTR, r0 +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L1058 + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + bdz- .L1057 + .align 4 + +.L1056: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L1056 + .align 4 + +.L1057: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + + FXCPMADD f2, B1, A3, f2 + FXCSMADD f6, B1, A3, f6 + FXCPMADD f3, B1, A4, f3 + FXCSMADD f7, B1, A4, f7 + .align 4 + +.L1058: +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + LFDUX A2, CO1, INC2 + LFDUX A3, CO1, INC2 + LFDUX A4, CO1, INC2 + + LFSDUX A1, CO1, INCM5 + LFSDUX A2, CO1, INC2 + LFSDUX A3, CO1, INC2 + LFSDUX A4, CO1, INC2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) + fpadd f0, f0, f4 + fpadd f1, f1, f5 + fpadd f2, f2, f6 + fpadd f3, f3, f7 +#else + fpsub f0, f0, f4 + fpsub f1, f1, f5 + fpsub f2, f2, f6 + fpsub f3, f3, f7 +#endif + +#ifndef TRMMKERNEL + fxcpmadd A1, f0, AP, A1 + fxcpmadd A2, f1, AP, A2 + fxcpmadd A3, f2, AP, A3 + fxcpmadd A4, f3, AP, A4 + + fxcxnpma f0, f0, AP, A1 + fxcxnpma f1, f1, AP, A2 + fxcxnpma f2, f2, AP, A3 + fxcxnpma f3, f3, AP, A4 + + STFDUX f0, CO1, INCM7 + STFSDUX f0, CO1, INC + + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC +#else + fxcpmadd f12, f0, AP, f30 + fxcpmadd f13, f1, AP, f30 + fxcpmadd f14, f2, AP, f30 + fxcpmadd f15, f3, AP, f30 + + fxcxnpma f0, f0, AP, f12 + fxcxnpma f1, f1, AP, f13 + fxcxnpma f2, f2, AP, f14 + fxcxnpma f3, f3, AP, f15 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC +#endif + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 2 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L1051 + .align 4 + +.L1060: + andi. I, M, 2 + beq .L1070 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 +#else + slwi TEMP, KK, 1 + ZBASE_SHIFT + slwi r0, KK, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + + srawi. r0, TEMP, 2 + fpmr f2, f0 + mtspr CTR, r0 + fpmr f3, f0 + ble .L1064 + +#else + srawi. r0, K, 2 + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + mtspr CTR, r0 + fpmr f3, f0 + ble .L1064 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B4, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L1063 + .align 4 + +.L1062: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + FXCPMADD f0, B2, A3, f0 + FXCSMADD f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + FXCPMADD f1, B2, A4, f1 + FXCSMADD f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + FXCPMADD f0, B3, A5, f0 + FXCSMADD f2, B3, A5, f2 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B3, A6, f1 + FXCSMADD f3, B3, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + FXCPMADD f0, B4, A7, f0 + FXCSMADD f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + FXCPMADD f1, B4, A8, f1 + FXCSMADD f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L1062 + .align 4 + +.L1063: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + + FXCPMADD f0, B2, A3, f0 + FXCSMADD f2, B2, A3, f2 + FXCPMADD f1, B2, A4, f1 + FXCSMADD f3, B2, A4, f3 + + FXCPMADD f0, B3, A5, f0 + FXCSMADD f2, B3, A5, f2 + FXCPMADD f1, B3, A6, f1 + FXCSMADD f3, B3, A6, f3 + + FXCPMADD f0, B4, A7, f0 + FXCSMADD f2, B4, A7, f2 + FXCPMADD f1, B4, A8, f1 + FXCSMADD f3, B4, A8, f3 + .align 4 + +.L1064: + li r0, ALPHA + lfpdx AP, SP, r0 +#ifdef TRMMKERNEL + li r0, FZERO + lfpsx f30, SP, r0 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + andi. r0, TEMP, 3 + mtspr CTR, r0 +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L1068 + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdz- .L1067 + .align 4 + +.L1066: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdnz+ .L1066 + .align 4 + +.L1067: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + .align 4 + +.L1068: +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + LFDUX A2, CO1, INC2 + LFSDUX A1, CO1, INCM1 + LFSDUX A2, CO1, INC2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) + fpadd f0, f0, f2 + fpadd f1, f1, f3 +#else + fpsub f0, f0, f2 + fpsub f1, f1, f3 +#endif + +#ifndef TRMMKERNEL + fxcpmadd A1, f0, AP, A1 + fxcpmadd A2, f1, AP, A2 + fxcxnpma f0, f0, AP, A1 + fxcxnpma f1, f1, AP, A2 + + STFDUX f0, CO1, INCM3 + STFSDUX f0, CO1, INC + + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC +#else + fxcpmadd f12, f0, AP, f30 + fxcpmadd f13, f1, AP, f30 + fxcxnpma f0, f0, AP, f12 + fxcxnpma f1, f1, AP, f13 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC +#endif + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L1070: + andi. I, M, 1 + beq .L1089 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 +#else + slwi TEMP, KK, 0 + ZBASE_SHIFT + slwi r0, KK, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + srawi. r0, TEMP, 3 + fpmr f2, f0 + mtspr CTR, r0 + fpmr f3, f0 + ble .L1074 +#else + addi BO, B, - 2 * SIZE + fpmr f1, f0 + srawi. r0, K, 3 + fpmr f2, f0 + mtspr CTR, r0 + fpmr f3, f0 + ble .L1074 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdz- .L1073 + .align 4 + +.L1072: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + FXCPMADD f2, B2, A2, f2 + FXCSMADD f3, B2, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f1, B3, A3, f1 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + FXCPMADD f2, B4, A4, f2 + FXCSMADD f3, B4, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f1, B5, A5, f1 + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + FXCPMADD f2, B6, A6, f2 + FXCSMADD f3, B6, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f1, A9, A7, f1 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + FXCPMADD f2, A10, A8, f2 + FXCSMADD f3, A10, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + + bdnz+ .L1072 + .align 4 + +.L1073: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A2, f2 + FXCSMADD f3, B2, A2, f3 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f1, B3, A3, f1 + FXCPMADD f2, B4, A4, f2 + FXCSMADD f3, B4, A4, f3 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f1, B5, A5, f1 + FXCPMADD f2, B6, A6, f2 + FXCSMADD f3, B6, A6, f3 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f1, A9, A7, f1 + FXCPMADD f2, A10, A8, f2 + FXCSMADD f3, A10, A8, f3 + .align 4 + +.L1074: + li r0, ALPHA + lfpdx AP, SP, r0 +#ifdef TRMMKERNEL + li r0, FZERO + lfpsx f30, SP, r0 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + andi. r0, TEMP, 7 + mtspr CTR, r0 +#else + andi. r0, K, 7 + mtspr CTR, r0 +#endif + ble+ .L1078 + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdz- .L1077 + .align 4 + +.L1076: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L1076 + .align 4 + +.L1077: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + .align 4 + +.L1078: +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + LFDUX A2, CO1, INC +#endif + + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fsmfp A1, A2 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) + fpadd f0, f0, f1 +#else + fpsub f0, f0, f1 +#endif + +#ifndef TRMMKERNEL + fxcpmadd A1, f0, AP, A1 + fxcxnpma f0, f0, AP, A1 + + STFDUX f0, CO1, INCM1 + STFSDUX f0, CO1, INC +#else + fxcpmadd f12, f0, AP, f30 + fxcxnpma f0, f0, AP, f12 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L1089: + addi B, BO, 2 * SIZE + .align 4 + +.L10999: + addi SP, SP, 20 + + lwzu r14, 4(SP) + lwzu r15, 4(SP) + + lwzu r16, 4(SP) + lwzu r17, 4(SP) + lwzu r18, 4(SP) + lwzu r19, 4(SP) + + lwzu r20, 4(SP) + lwzu r21, 4(SP) + lwzu r22, 4(SP) + lwzu r23, 4(SP) + + lwzu r24, 4(SP) + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f31, SP, r0 + lfpdux f30, SP, r0 + lfpdux f29, SP, r0 + lfpdux f28, SP, r0 + lfpdux f27, SP, r0 + lfpdux f26, SP, r0 + lfpdux f25, SP, r0 + lfpdux f24, SP, r0 + lfpdux f23, SP, r0 + lfpdux f22, SP, r0 + lfpdux f21, SP, r0 + lfpdux f20, SP, r0 + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + .align 4 + + + EPILOGUE +#endif diff --git a/kernel/power/zgemm_kernel_power3.S b/kernel/power/zgemm_kernel_power3.S new file mode 100644 index 0000000..716fa88 --- /dev/null +++ b/kernel/power/zgemm_kernel_power3.S @@ -0,0 +1,1260 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#endif +#endif + +#define I r24 +#define J r25 +#define AO r26 +#define BO r27 +#define CO1 r28 +#define CO2 r29 + +#define PREA r30 +#define PREC r31 +#define PREB PREA + +#ifndef NEEDPARAM + +#ifndef DOUBLE +#include "../cparam.h" +#else +#include "../zparam.h" +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) +#endif + + stfd f1, ALPHA_R + stfd f2, ALPHA_I + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST +#ifdef PPC970 + li PREC, 4 * SIZE +#endif +#ifdef POWER4 + li PREC, 4 * SIZE /* is 12 best? */ +#endif +#ifdef POWER5 + li PREC, 4 * SIZE /* is 12 best? */ +#endif +#else + +#ifdef linux +#ifndef __64BIT__ + lwz PREA, 16 + STACKSIZE(SP) + lwz PREC, 20 + STACKSIZE(SP) +#else + ld PREA, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 72 + STACKSIZE(SP) + lwz PREC, 76 + STACKSIZE(SP) +#else + lwz PREA, 68 + STACKSIZE(SP) + lwz PREC, 72 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST +#ifdef PPC970 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 5 * SIZE + 16) +#else + li PREA, (16 * 9 * SIZE + 16) +#endif +#endif +#ifdef POWER4 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 1 * SIZE + 16) +#else + li PREA, (16 * 2 * SIZE + 16) +#endif +#endif +#ifdef POWER5 + li PREA, 16 * 9 * SIZE +#endif +#endif + + lfs f0, FZERO + + srawi. J, N, 1 + ble LL(KERNEL_N_AND_3_HEAD) + .align 4 + +LL(KERNEL_MainHead): + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + + srawi. I, M, 1 + mr AO, A + ble LL(KERNEL_M_AND_3) + .align 4 + +LL(KERNEL_MainSubHead): + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(KERNEL_K_AND_7) + .align 4 + +LL(KERNEL_MainLoop): + fmadd f0, f16, f20, f0 + fmadd f4, f16, f21, f4 + LFD f28, 4 * SIZE(BO) + fmadd f8, f16, f22, f8 + fmadd f12, f16, f23, f12 + LFD f16, 8 * SIZE(AO) + + fmadd f1, f17, f20, f1 + fmadd f5, f17, f21, f5 + LFD f29, 5 * SIZE(BO) + fmadd f9, f17, f22, f9 + fmadd f13, f17, f23, f13 + LFD f17, 9 * SIZE(AO) + + fmadd f2, f18, f20, f2 + fmadd f6, f18, f21, f6 + LFD f30, 6 * SIZE(BO) + fmadd f10, f18, f22, f10 + fmadd f14, f18, f23, f14 + LFD f18, 10 * SIZE(AO) + + fmadd f3, f19, f20, f3 + fmadd f7, f19, f21, f7 + LFD f31, 7 * SIZE(BO) + fmadd f11, f19, f22, f11 + fmadd f15, f19, f23, f15 + LFD f19, 11 * SIZE(AO) + + fmadd f0, f24, f28, f0 + fmadd f4, f24, f29, f4 + LFD f20, 8 * SIZE(BO) + fmadd f8, f24, f30, f8 + fmadd f12, f24, f31, f12 + LFD f24, 12 * SIZE(AO) + + fmadd f1, f25, f28, f1 + fmadd f5, f25, f29, f5 + LFD f21, 9 * SIZE(BO) + fmadd f9, f25, f30, f9 + fmadd f13, f25, f31, f13 + LFD f25, 13 * SIZE(AO) + + fmadd f2, f26, f28, f2 + fmadd f6, f26, f29, f6 + LFD f22, 10 * SIZE(BO) + fmadd f10, f26, f30, f10 + fmadd f14, f26, f31, f14 + LFD f26, 14 * SIZE(AO) + + fmadd f3, f27, f28, f3 + fmadd f7, f27, f29, f7 + LFD f23, 11 * SIZE(BO) + fmadd f11, f27, f30, f11 + fmadd f15, f27, f31, f15 + LFD f27, 15 * SIZE(AO) + + fmadd f0, f16, f20, f0 + fmadd f4, f16, f21, f4 + LFD f28, 12 * SIZE(BO) + fmadd f8, f16, f22, f8 + fmadd f12, f16, f23, f12 + LFDU f16, 16 * SIZE(AO) + + fmadd f1, f17, f20, f1 + fmadd f5, f17, f21, f5 + LFD f29, 13 * SIZE(BO) + fmadd f9, f17, f22, f9 + fmadd f13, f17, f23, f13 + LFD f17, 1 * SIZE(AO) + + fmadd f2, f18, f20, f2 + fmadd f6, f18, f21, f6 + LFD f30, 14 * SIZE(BO) + fmadd f10, f18, f22, f10 + fmadd f14, f18, f23, f14 + LFD f18, 2 * SIZE(AO) + + fmadd f3, f19, f20, f3 + fmadd f7, f19, f21, f7 + LFD f31, 15 * SIZE(BO) + fmadd f11, f19, f22, f11 + fmadd f15, f19, f23, f15 + LFD f19, 3 * SIZE(AO) + + fmadd f0, f24, f28, f0 + fmadd f4, f24, f29, f4 + LFDU f20, 16 * SIZE(BO) + fmadd f8, f24, f30, f8 + fmadd f12, f24, f31, f12 + LFD f24, 4 * SIZE(AO) + + fmadd f1, f25, f28, f1 + fmadd f5, f25, f29, f5 + LFD f21, 1 * SIZE(BO) + fmadd f9, f25, f30, f9 + fmadd f13, f25, f31, f13 + LFD f25, 5 * SIZE(AO) + + fmadd f2, f26, f28, f2 + fmadd f6, f26, f29, f6 + LFD f22, 2 * SIZE(BO) + fmadd f10, f26, f30, f10 + fmadd f14, f26, f31, f14 + LFD f26, 6 * SIZE(AO) + + fmadd f3, f27, f28, f3 + fmadd f7, f27, f29, f7 + LFD f23, 3 * SIZE(BO) + + fmadd f11, f27, f30, f11 + fmadd f15, f27, f31, f15 + LFD f27, 7 * SIZE(AO) + bdnz LL(KERNEL_MainLoop) + .align 4 + +LL(KERNEL_K_AND_7): + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, r0 + ble LL(KERNEL_MainFinish) + .align 4 + +LL(KERNEL_SubLoop): + fmadd f0, f16, f20, f0 + fmadd f4, f16, f21, f4 + fmadd f8, f16, f22, f8 + fmadd f12, f16, f23, f12 + LFD f16, 4 * SIZE(AO) + + fmadd f1, f17, f20, f1 + fmadd f5, f17, f21, f5 + fmadd f9, f17, f22, f9 + fmadd f13, f17, f23, f13 + LFD f17, 5 * SIZE(AO) + + fmadd f2, f18, f20, f2 + fmadd f6, f18, f21, f6 + fmadd f10, f18, f22, f10 + fmadd f14, f18, f23, f14 + LFD f18, 6 * SIZE(AO) + + fmadd f3, f19, f20, f3 + LFD f20, 4 * SIZE(BO) + fmadd f7, f19, f21, f7 + LFD f21, 5 * SIZE(BO) + fmadd f11, f19, f22, f11 + LFD f22, 6 * SIZE(BO) + fmadd f15, f19, f23, f15 + LFD f19, 7 * SIZE(AO) + + LFD f23, 7 * SIZE(BO) + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(KERNEL_SubLoop) + .align 4 + +LL(KERNEL_MainFinish): + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FADD f8, f8, f13 + FSUB f9, f9, f12 + FADD f10, f10, f15 + FSUB f11, f11, f14 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 + +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 + + FMADD f20, f30, f8, f20 + FMADD f21, f30, f9, f21 + FMADD f22, f30, f10, f22 + FMADD f23, f30, f11, f23 + + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + + FNMSUB f20, f31, f9, f20 + FMADD f21, f31, f8, f21 + FNMSUB f22, f31, f11, f22 + FMADD f23, f31, f10, f23 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f20, f30, f8, f20 + FNMSUB f21, f30, f9, f21 + FMADD f22, f30, f10, f22 + FNMSUB f23, f30, f11, f23 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + + FMADD f20, f31, f9, f20 + FMADD f21, f31, f8, f21 + FMADD f22, f31, f11, f22 + FMADD f23, f31, f10, f23 + +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 2 * SIZE(CO1) + STFD f19, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f20, 0 * SIZE(CO2) + STFD f21, 1 * SIZE(CO2) + STFD f22, 2 * SIZE(CO2) + STFD f23, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + + addic. I, I, -1 + bgt LL(KERNEL_MainSubHead) + .align 4 + +LL(KERNEL_M_AND_3): + andi. I, M, 1 + ble LL(KERNEL_MainTail) + .align 4 + +LL(KERNEL_M_AND_3_SubHead): + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(KERNEL_M_AND_3_K_AND_3) + .align 4 + +LL(KERNEL_M_AND_3_MainLoop): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi BO, BO, 16 * SIZE + addi AO, AO, 8 * SIZE + bdnz LL(KERNEL_M_AND_3_MainLoop) + .align 4 + +LL(KERNEL_M_AND_3_K_AND_3): + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, r0 + ble LL(KERNEL_M_AND3_Finish) + .align 4 + +LL(KERNEL_M_AND_3_SubLoop): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(KERNEL_M_AND_3_SubLoop) + .align 4 + +LL(KERNEL_M_AND3_Finish): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + +#endif + + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 0 * SIZE(CO2) + STFD f19, 1 * SIZE(CO2) + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + + addic. I, I, -1 + bgt LL(KERNEL_M_AND_3_SubHead) + .align 4 + +LL(KERNEL_MainTail): + mr B, BO + addic. J, J, -1 + lfs f0, FZERO + bgt LL(KERNEL_MainHead) + .align 4 + +LL(KERNEL_N_AND_3_HEAD): + andi. J, N, 1 + ble LL(999) + .align 4 + +LL(KERNEL_N_AND_3_MainHead): + srawi. I, M, 1 + mr CO1, C + add C, C, LDC + mr AO, A + + ble LL(KERNEL_MN_AND_3_Head) + .align 4 + +LL(KERNEL_N_AND_3_SubHead): + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(KERNEL_N_AND_3_K_AND_3) + .align 4 + +LL(KERNEL_N_AND_3_MainLoop): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(AO) + LFD f21, 9 * SIZE(AO) + LFD f22, 10 * SIZE(AO) + LFD f23, 11 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + LFD f16, 4 * SIZE(BO) + LFD f17, 5 * SIZE(BO) + LFD f18, 6 * SIZE(BO) + LFD f19, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(AO) + LFD f21, 17 * SIZE(AO) + LFD f22, 18 * SIZE(AO) + LFD f23, 19 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 20 * SIZE(AO) + LFD f25, 21 * SIZE(AO) + LFD f26, 22 * SIZE(AO) + LFD f27, 23 * SIZE(AO) + + LFD f16, 8 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 10 * SIZE(BO) + LFD f19, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbt PREA, AO + dcbt PREA, BO + bdnz LL(KERNEL_N_AND_3_MainLoop) + .align 4 + +LL(KERNEL_N_AND_3_K_AND_3): + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, r0 + ble LL(KERNEL_N_AND_3_Finish) + .align 4 + +LL(KERNEL_N_AND_3_SubLoop): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f16, 2 * SIZE(BO) + LFD f17, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(KERNEL_N_AND_3_SubLoop) + .align 4 + +LL(KERNEL_N_AND_3_Finish): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + +#endif + + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 2 * SIZE(CO1) + STFD f19, 3 * SIZE(CO1) + + addi CO1, CO1, 4 * SIZE + + addic. I, I, -1 + bgt LL(KERNEL_N_AND_3_SubHead) + .align 4 + +LL(KERNEL_MN_AND_3_Head): + andi. I, M, 1 + ble LL(KERNEL_SubEnd) + .align 4 + +LL(KERNEL_MN_AND_3_SubHead): + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(KERNEL_MN_AND_3_K_AND_3) + .align 4 + +LL(KERNEL_MN_AND_3_MainLoop): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(KERNEL_MN_AND_3_MainLoop) + .align 4 + +LL(KERNEL_MN_AND_3_K_AND_3): + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR,r0 + ble LL(KERNEL_MN_AND_3_Finish) + .align 4 + +LL(KERNEL_MN_AND_3_SubLoop): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + + bdnz LL(KERNEL_MN_AND_3_SubLoop) + .align 4 + +LL(KERNEL_MN_AND_3_Finish): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + fsub f0, f0, f1 + fadd f2, f2, f3 +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + fadd f0, f0, f1 + fsub f2, f2, f3 +#else + fadd f0, f0, f1 + fsub f2, f3, f2 +#endif + + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + FMADD f16, f30, f0, f16 + FMADD f17, f30, f2, f17 + + FNMSUB f16, f31, f2, f16 + FMADD f17, f31, f0, f17 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC) || defined(RR) */ + + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f2, f17 + + FMADD f16, f31, f2, f16 + FMADD f17, f31, f0, f17 + +#endif + + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + + addi CO1, CO1, 2 * SIZE + addic. I, I, -1 + bgt LL(KERNEL_MN_AND_3_SubHead) + .align 4 + +LL(KERNEL_SubEnd): + mr B, BO + addic. J, J, -1 + bgt LL(KERNEL_N_AND_3_MainHead) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemm_kernel_power6.S b/kernel/power/zgemm_kernel_power6.S new file mode 100644 index 0000000..7f677df --- /dev/null +++ b/kernel/power/zgemm_kernel_power6.S @@ -0,0 +1,2937 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define TEMP r19 +#define KK r20 +#define BB r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO1 r26 +#define CO2 r27 +#define CO3 r28 +#define CO4 r29 + +#define PREA r30 +#define PREC r31 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define FMA1 FMADD +#define FMA2 FMADD +#define FMA3 FNMSUB +#define FMA4 FMADD +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define FMA1 FMADD +#define FMA2 FNMSUB +#define FMA3 FMADD +#define FMA4 FMADD +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define FMA1 FMADD +#define FMA2 FMADD +#define FMA3 FMADD +#define FMA4 FNMSUB +#else +#define FMA1 FMADD +#define FMA2 FNMSUB +#define FMA3 FNMSUB +#define FMA4 FNMSUB +#endif + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) +#ifdef TRMMKERNEL + std r20, 232(SP) + std r19, 240(SP) +#endif +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) +#ifdef TRMMKERNEL + stw r20, 188(SP) + stw r19, 192(SP) +#endif +#endif + + stfd f1, ALPHA_R + stfd f2, ALPHA_I + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + li PREA, (16 * 3) * SIZE + li PREC, 3 * SIZE + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + srawi. J, N, 2 + ble LL(30) + .align 4 + +LL(10): + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + add C, CO4, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + slwi BB, K, ZBASE_SHIFT + 2 + mr AO, A + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 1 + ble LL(20) + .align 4 + +LL(11): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f20, 0 * SIZE(B) + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(B) + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(B) + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(B) + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, K, 3 + mr BO, B + mtspr CTR, r0 + ble LL(15) +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f20, 0 * SIZE(B) + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(B) + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(B) + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(B) + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 2 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f20, 0 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + srawi. TEMP, TEMP, 3 + mtspr CTR, TEMP + ble LL(15) +#endif + .align 4 + +LL(12): + dcbt AO, PREA + FMA1 f0, f16, f20, f0 + nop + FMA1 f2, f18, f20, f2 + + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + LFD f28, 4 * SIZE(AO) + LFD f29, 5 * SIZE(AO) + + LFD f30, 6 * SIZE(AO) + LFD f31, 7 * SIZE(AO) + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 24 * SIZE(BO) + LFD f21, 25 * SIZE(BO) + LFD f22, 26 * SIZE(BO) + LFD f23, 27 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 28 * SIZE(BO) + LFD f25, 29 * SIZE(BO) + LFD f26, 30 * SIZE(BO) + LFD f27, 31 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 32 * SIZE(BO) + LFD f21, 33 * SIZE(BO) + LFD f22, 34 * SIZE(BO) + LFD f23, 35 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 36 * SIZE(BO) + LFD f25, 37 * SIZE(BO) + LFD f26, 38 * SIZE(BO) + LFD f27, 39 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 20 * SIZE(AO) + LFD f29, 21 * SIZE(AO) + LFD f30, 22 * SIZE(AO) + LFD f31, 23 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 40 * SIZE(BO) + LFD f21, 41 * SIZE(BO) + LFD f22, 42 * SIZE(BO) + LFD f23, 43 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 44 * SIZE(BO) + LFD f25, 45 * SIZE(BO) + LFD f26, 46 * SIZE(BO) + LFD f27, 47 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 24 * SIZE(AO) + LFD f17, 25 * SIZE(AO) + LFD f18, 26 * SIZE(AO) + LFD f19, 27 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 48 * SIZE(BO) + LFD f21, 49 * SIZE(BO) + LFD f22, 50 * SIZE(BO) + LFD f23, 51 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 52 * SIZE(BO) + LFD f25, 53 * SIZE(BO) + LFD f26, 54 * SIZE(BO) + LFD f27, 55 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 28 * SIZE(AO) + LFD f29, 29 * SIZE(AO) + LFD f30, 30 * SIZE(AO) + LFD f31, 31 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 56 * SIZE(BO) + LFD f21, 57 * SIZE(BO) + LFD f22, 58 * SIZE(BO) + LFD f23, 59 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 60 * SIZE(BO) + LFD f25, 61 * SIZE(BO) + LFD f26, 62 * SIZE(BO) + LFD f27, 63 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 32 * SIZE(AO) + LFD f17, 33 * SIZE(AO) + LFD f18, 34 * SIZE(AO) + LFD f19, 35 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 64 * SIZE(BO) + LFD f21, 65 * SIZE(BO) + LFD f22, 66 * SIZE(BO) + LFD f23, 67 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 68 * SIZE(BO) + LFD f25, 69 * SIZE(BO) + LFD f26, 70 * SIZE(BO) + LFD f27, 71 * SIZE(BO) + + addi AO, AO, 32 * SIZE + addi BO, BO, 64 * SIZE + bdnz LL(12) + .align 4 + +LL(15): + lfd f30, ALPHA_R + lfd f31, ALPHA_I + + dcbtst B, BB + addi BB, BB, 16 * SIZE + dcbtst B, BB + addi BB, BB, 16 * SIZE + +#ifndef TRMMKERNEL + andi. r0, K, 7 + mtspr CTR, r0 + ble LL(18) +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP + ble LL(18) +#endif + .align 4 + +LL(16): + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(16) + .align 4 + +LL(18): +#ifndef TRMMKERNEL + + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FNMSUB f24, f31, f1, f16 + FMADD f25, f31, f0, f17 + FNMSUB f26, f31, f3, f18 + FMADD f27, f31, f2, f19 + + LFD f16, 0 * SIZE(CO3) + LFD f17, 1 * SIZE(CO3) + LFD f18, 2 * SIZE(CO3) + LFD f19, 3 * SIZE(CO3) + + FMADD f0, f30, f0, f24 + FMADD f1, f30, f1, f25 + FMADD f2, f30, f2, f26 + FMADD f3, f30, f3, f27 + + FNMSUB f24, f31, f5, f20 + FMADD f25, f31, f4, f21 + FNMSUB f26, f31, f7, f22 + FMADD f27, f31, f6, f23 + + LFD f20, 0 * SIZE(CO4) + LFD f21, 1 * SIZE(CO4) + LFD f22, 2 * SIZE(CO4) + LFD f23, 3 * SIZE(CO4) + + FMADD f4, f30, f4, f24 + FMADD f5, f30, f5, f25 + FMADD f6, f30, f6, f26 + FMADD f7, f30, f7, f27 + + FNMSUB f24, f31, f9, f16 + FMADD f25, f31, f8, f17 + FNMSUB f26, f31, f11, f18 + FMADD f27, f31, f10, f19 + + FMADD f8, f30, f8, f24 + FMADD f9, f30, f9, f25 + FMADD f10, f30, f10, f26 + FMADD f11, f30, f11, f27 + + FNMSUB f24, f31, f13, f20 + FMADD f25, f31, f12, f21 + FNMSUB f26, f31, f15, f22 + FMADD f27, f31, f14, f23 + + FMADD f12, f30, f12, f24 + FMADD f13, f30, f13, f25 + FMADD f14, f30, f14, f26 + FMADD f15, f30, f15, f27 + +#else + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMUL f18, f31, f3 + FMUL f19, f31, f2 + + FMUL f20, f31, f5 + FMUL f21, f31, f4 + FMUL f22, f31, f7 + FMUL f23, f31, f6 + + FMSUB f0, f30, f0, f16 + FMADD f1, f30, f1, f17 + FMADD f2, f30, f2, f18 + FMADD f3, f30, f3, f19 + + FMSUB f4, f30, f4, f20 + FMADD f5, f30, f5, f21 + FMADD f6, f30, f6, f22 + FMADD f7, f30, f7, f23 + + FMUL f16, f31, f9 + FMUL f17, f31, f8 + FMUL f18, f31, f11 + FMUL f19, f31, f10 + + FMUL f20, f31, f13 + FMUL f21, f31, f12 + FMUL f22, f31, f15 + FMUL f23, f31, f14 + + FMSUB f8, f30, f8, f16 + FMADD f9, f30, f9, f17 + FMADD f10, f30, f10, f18 + FMADD f11, f30, f11, f19 + + FMSUB f12, f30, f12, f20 + FMADD f13, f30, f13, f21 + FMADD f14, f30, f14, f22 + FMADD f15, f30, f15, f23 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 2 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + addic. I, I, -1 + bgt LL(11) + .align 4 + +LL(20): + andi. I, M, 1 + ble LL(29) + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(25) +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 2 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble LL(25) +#endif + .align 4 + +LL(22): + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + LFD f28, 4 * SIZE(AO) + LFD f29, 5 * SIZE(AO) + LFD f30, 6 * SIZE(AO) + LFD f31, 7 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA4 f7, f17, f22, f7 + FMA2 f5, f16, f23, f5 + FMA3 f6, f17, f23, f6 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA1 f8, f16, f24, f8 + FMA4 f11, f17, f24, f11 + FMA2 f9, f16, f25, f9 + FMA3 f10, f17, f25, f10 + + FMA1 f12, f16, f26, f12 + FMA4 f15, f17, f26, f15 + FMA2 f13, f16, f27, f13 + FMA3 f14, f17, f27, f14 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMA1 f0, f18, f20, f0 + FMA4 f3, f19, f20, f3 + FMA2 f1, f18, f21, f1 + FMA3 f2, f19, f21, f2 + + FMA1 f4, f18, f22, f4 + FMA4 f7, f19, f22, f7 + FMA2 f5, f18, f23, f5 + FMA3 f6, f19, f23, f6 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMA1 f8, f18, f24, f8 + FMA4 f11, f19, f24, f11 + FMA2 f9, f18, f25, f9 + FMA3 f10, f19, f25, f10 + + FMA1 f12, f18, f26, f12 + FMA4 f15, f19, f26, f15 + FMA2 f13, f18, f27, f13 + FMA3 f14, f19, f27, f14 + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA4 f3, f29, f20, f3 + FMA2 f1, f28, f21, f1 + FMA3 f2, f29, f21, f2 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA4 f7, f29, f22, f7 + FMA2 f5, f28, f23, f5 + FMA3 f6, f29, f23, f6 + + LFD f20, 24 * SIZE(BO) + LFD f21, 25 * SIZE(BO) + LFD f22, 26 * SIZE(BO) + LFD f23, 27 * SIZE(BO) + + FMA1 f8, f28, f24, f8 + FMA4 f11, f29, f24, f11 + FMA2 f9, f28, f25, f9 + FMA3 f10, f29, f25, f10 + + FMA1 f12, f28, f26, f12 + FMA4 f15, f29, f26, f15 + FMA2 f13, f28, f27, f13 + FMA3 f14, f29, f27, f14 + + LFD f24, 28 * SIZE(BO) + LFD f25, 29 * SIZE(BO) + LFD f26, 30 * SIZE(BO) + LFD f27, 31 * SIZE(BO) + + FMA1 f0, f30, f20, f0 + FMA4 f3, f31, f20, f3 + FMA2 f1, f30, f21, f1 + FMA3 f2, f31, f21, f2 + + FMA1 f4, f30, f22, f4 + FMA4 f7, f31, f22, f7 + FMA2 f5, f30, f23, f5 + FMA3 f6, f31, f23, f6 + + LFD f20, 32 * SIZE(BO) + LFD f21, 33 * SIZE(BO) + LFD f22, 34 * SIZE(BO) + LFD f23, 35 * SIZE(BO) + + FMA1 f8, f30, f24, f8 + FMA4 f11, f31, f24, f11 + FMA2 f9, f30, f25, f9 + FMA3 f10, f31, f25, f10 + + FMA1 f12, f30, f26, f12 + FMA4 f15, f31, f26, f15 + FMA2 f13, f30, f27, f13 + FMA3 f14, f31, f27, f14 + + LFD f24, 36 * SIZE(BO) + LFD f25, 37 * SIZE(BO) + LFD f26, 38 * SIZE(BO) + LFD f27, 39 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 32 * SIZE + + bdnz LL(22) + .align 4 + +LL(25): + lfd f30, ALPHA_R + lfd f31, ALPHA_I + +#ifndef TRMMKERNEL + andi. r0, K, 3 + mtspr CTR, r0 + ble LL(28) +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + ble LL(28) +#endif + .align 4 + +LL(26): + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + FMA1 f4, f16, f22, f4 + FMA4 f7, f17, f22, f7 + FMA2 f5, f16, f23, f5 + FMA3 f6, f17, f23, f6 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA1 f8, f16, f24, f8 + FMA4 f11, f17, f24, f11 + FMA2 f9, f16, f25, f9 + FMA3 f10, f17, f25, f10 + + FMA1 f12, f16, f26, f12 + FMA4 f15, f17, f26, f15 + FMA2 f13, f16, f27, f13 + FMA3 f14, f17, f27, f14 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(26) + .align 4 + +LL(28): +#ifndef TRMMKERNEL + + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) + + FADD f0, f0, f2 + FADD f1, f1, f3 + FADD f4, f4, f6 + FADD f5, f5, f7 + + LFD f20, 0 * SIZE(CO3) + LFD f21, 1 * SIZE(CO3) + LFD f22, 0 * SIZE(CO4) + LFD f23, 1 * SIZE(CO4) + + FADD f8, f8, f10 + FADD f9, f9, f11 + FADD f12, f12, f14 + FADD f13, f13, f15 + + FNMSUB f24, f31, f1, f16 + FMADD f25, f31, f0, f17 + FNMSUB f26, f31, f5, f18 + FMADD f27, f31, f4, f19 + + FMADD f0, f30, f0, f24 + FMADD f1, f30, f1, f25 + FMADD f4, f30, f4, f26 + FMADD f5, f30, f5, f27 + + FNMSUB f24, f31, f9, f20 + FMADD f25, f31, f8, f21 + FNMSUB f26, f31, f13, f22 + FMADD f27, f31, f12, f23 + + FMADD f8, f30, f8, f24 + FMADD f9, f30, f9, f25 + FMADD f12, f30, f12, f26 + FMADD f13, f30, f13, f27 + +#else + FADD f0, f0, f2 + FADD f1, f1, f3 + FADD f4, f4, f6 + FADD f5, f5, f7 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMUL f18, f31, f5 + FMUL f19, f31, f4 + + FMSUB f0, f30, f0, f16 + FMADD f1, f30, f1, f17 + FMSUB f4, f30, f4, f18 + FMADD f5, f30, f5, f19 + + FADD f8, f8, f10 + FADD f9, f9, f11 + FADD f12, f12, f14 + FADD f13, f13, f15 + + FMUL f20, f31, f9 + FMUL f21, f31, f8 + FMUL f22, f31, f13 + FMUL f23, f31, f12 + + FMSUB f8, f30, f8, f20 + FMADD f9, f30, f9, f21 + FMSUB f12, f30, f12, f22 + FMADD f13, f30, f13, f23 + +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 2 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +LL(29): +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 4 +#endif + + mr B, BO + + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(30): + andi. J, N, 2 + ble LL(50) + + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + slwi BB, K, ZBASE_SHIFT + 1 + mr AO, A + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 1 + ble LL(40) + .align 4 + +LL(31): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + + srawi. r0, K, 3 + mr BO, B + mtspr CTR, r0 + ble LL(35) +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + +#endif + + dcbtst CO1, PREC + dcbtst CO2, PREC + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 3 + mtspr CTR, TEMP + ble LL(35) +#endif + .align 4 + +LL(32): + dcbt AO, PREA + dcbtst BO, PREA + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 4 * SIZE(AO) + LFD f29, 5 * SIZE(AO) + LFD f30, 6 * SIZE(AO) + LFD f31, 7 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA4 f9, f17, f20, f9 + FMA4 f11, f19, f20, f11 + FMA3 f8, f17, f21, f8 + FMA3 f10, f19, f21, f10 + + FMA4 f13, f17, f22, f13 + FMA4 f15, f19, f22, f15 + FMA3 f12, f17, f23, f12 + FMA3 f14, f19, f23, f14 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA1 f0, f28, f24, f0 + FMA1 f2, f30, f24, f2 + FMA2 f1, f28, f25, f1 + FMA2 f3, f30, f25, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMA1 f4, f28, f26, f4 + FMA1 f6, f30, f26, f6 + FMA2 f5, f28, f27, f5 + FMA2 f7, f30, f27, f7 + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA4 f9, f17, f20, f9 + FMA4 f11, f19, f20, f11 + FMA3 f8, f17, f21, f8 + FMA3 f10, f19, f21, f10 + + FMA4 f13, f17, f22, f13 + FMA4 f15, f19, f22, f15 + FMA3 f12, f17, f23, f12 + FMA3 f14, f19, f23, f14 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMA1 f0, f28, f24, f0 + FMA1 f2, f30, f24, f2 + FMA2 f1, f28, f25, f1 + FMA2 f3, f30, f25, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + FMA1 f4, f28, f26, f4 + FMA1 f6, f30, f26, f6 + FMA2 f5, f28, f27, f5 + FMA2 f7, f30, f27, f7 + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 20 * SIZE(AO) + LFD f29, 21 * SIZE(AO) + LFD f30, 22 * SIZE(AO) + LFD f31, 23 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA4 f9, f17, f20, f9 + FMA4 f11, f19, f20, f11 + FMA3 f8, f17, f21, f8 + FMA3 f10, f19, f21, f10 + + FMA4 f13, f17, f22, f13 + FMA4 f15, f19, f22, f15 + FMA3 f12, f17, f23, f12 + FMA3 f14, f19, f23, f14 + + LFD f20, 24 * SIZE(BO) + LFD f21, 25 * SIZE(BO) + LFD f22, 26 * SIZE(BO) + LFD f23, 27 * SIZE(BO) + + FMA1 f0, f28, f24, f0 + FMA1 f2, f30, f24, f2 + FMA2 f1, f28, f25, f1 + FMA2 f3, f30, f25, f3 + + LFD f16, 24 * SIZE(AO) + LFD f17, 25 * SIZE(AO) + LFD f18, 26 * SIZE(AO) + LFD f19, 27 * SIZE(AO) + + FMA1 f4, f28, f26, f4 + FMA1 f6, f30, f26, f6 + FMA2 f5, f28, f27, f5 + FMA2 f7, f30, f27, f7 + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 28 * SIZE(BO) + LFD f25, 29 * SIZE(BO) + LFD f26, 30 * SIZE(BO) + LFD f27, 31 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 28 * SIZE(AO) + LFD f29, 29 * SIZE(AO) + LFD f30, 30 * SIZE(AO) + LFD f31, 31 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA4 f9, f17, f20, f9 + FMA4 f11, f19, f20, f11 + FMA3 f8, f17, f21, f8 + FMA3 f10, f19, f21, f10 + + FMA4 f13, f17, f22, f13 + FMA4 f15, f19, f22, f15 + FMA3 f12, f17, f23, f12 + FMA3 f14, f19, f23, f14 + + LFD f20, 32 * SIZE(BO) + LFD f21, 33 * SIZE(BO) + LFD f22, 34 * SIZE(BO) + LFD f23, 35 * SIZE(BO) + + FMA1 f0, f28, f24, f0 + FMA1 f2, f30, f24, f2 + FMA2 f1, f28, f25, f1 + FMA2 f3, f30, f25, f3 + + LFD f16, 32 * SIZE(AO) + LFD f17, 33 * SIZE(AO) + LFD f18, 34 * SIZE(AO) + LFD f19, 35 * SIZE(AO) + + FMA1 f4, f28, f26, f4 + FMA1 f6, f30, f26, f6 + FMA2 f5, f28, f27, f5 + FMA2 f7, f30, f27, f7 + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 36 * SIZE(BO) + LFD f25, 37 * SIZE(BO) + LFD f26, 38 * SIZE(BO) + LFD f27, 39 * SIZE(BO) + + addi AO, AO, 32 * SIZE + addi BO, BO, 32 * SIZE + + bdnz LL(32) + .align 4 + +LL(35): + lfd f30, ALPHA_R + lfd f31, ALPHA_I + + dcbtst B, BB + addi BB, BB, 16 * SIZE + +#ifndef TRMMKERNEL + andi. r0, K, 7 + mtspr CTR, r0 + ble LL(38) +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP + ble LL(38) +#endif + .align 4 + +LL(36): + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA4 f9, f17, f20, f9 + FMA4 f11, f19, f20, f11 + FMA3 f8, f17, f21, f8 + FMA3 f10, f19, f21, f10 + + LFD f16, 4 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + + FMA4 f13, f17, f22, f13 + FMA4 f15, f19, f22, f15 + FMA3 f12, f17, f23, f12 + FMA3 f14, f19, f23, f14 + + LFD f17, 5 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(36) + .align 4 + +LL(38): +#ifndef TRMMKERNEL + + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + FADD f0, f0, f8 + FADD f1, f1, f9 + FADD f2, f2, f10 + FADD f3, f3, f11 + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FADD f4, f4, f12 + FADD f5, f5, f13 + FADD f6, f6, f14 + FADD f7, f7, f15 + + FNMSUB f24, f31, f1, f16 + FMADD f25, f31, f0, f17 + FNMSUB f26, f31, f3, f18 + FMADD f27, f31, f2, f19 + + FMADD f0, f30, f0, f24 + FMADD f1, f30, f1, f25 + FMADD f2, f30, f2, f26 + FMADD f3, f30, f3, f27 + + FNMSUB f24, f31, f5, f20 + FMADD f25, f31, f4, f21 + FNMSUB f26, f31, f7, f22 + FMADD f27, f31, f6, f23 + + FMADD f4, f30, f4, f24 + FMADD f5, f30, f5, f25 + FMADD f6, f30, f6, f26 + FMADD f7, f30, f7, f27 + +#else + FADD f0, f0, f8 + FADD f1, f1, f9 + FADD f2, f2, f10 + FADD f3, f3, f11 + + FADD f4, f4, f12 + FADD f5, f5, f13 + FADD f6, f6, f14 + FADD f7, f7, f15 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMUL f18, f31, f3 + FMUL f19, f31, f2 + + FMUL f20, f31, f5 + FMUL f21, f31, f4 + FMUL f22, f31, f7 + FMUL f23, f31, f6 + + FMSUB f0, f30, f0, f16 + FMADD f1, f30, f1, f17 + FMADD f2, f30, f2, f18 + FMADD f3, f30, f3, f19 + + FMSUB f4, f30, f4, f20 + FMADD f5, f30, f5, f21 + FMADD f6, f30, f6, f22 + FMADD f7, f30, f7, f23 + +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + addic. I, I, -1 + bgt LL(31) + .align 4 + +LL(40): + andi. I, M, 1 + ble LL(49) + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(45) +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble LL(45) +#endif + .align 4 + +LL(42): + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + FMA1 f4, f16, f22, f4 + FMA4 f7, f17, f22, f7 + FMA2 f5, f16, f23, f5 + FMA3 f6, f17, f23, f6 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + FMA1 f4, f16, f22, f4 + FMA4 f7, f17, f22, f7 + FMA2 f5, f16, f23, f5 + FMA3 f6, f17, f23, f6 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + FMA1 f4, f16, f22, f4 + FMA4 f7, f17, f22, f7 + FMA2 f5, f16, f23, f5 + FMA3 f6, f17, f23, f6 + + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + + LFD f20, 12 * SIZE(BO) + LFD f21, 13 * SIZE(BO) + LFD f22, 14 * SIZE(BO) + LFD f23, 15 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + FMA1 f4, f16, f22, f4 + FMA4 f7, f17, f22, f7 + FMA2 f5, f16, f23, f5 + FMA3 f6, f17, f23, f6 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 16 * SIZE + + bdnz LL(42) + .align 4 + +LL(45): + lfd f30, ALPHA_R + lfd f31, ALPHA_I + +#ifndef TRMMKERNEL + andi. r0, K, 3 + mtspr CTR, r0 + ble LL(48) +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + ble LL(48) +#endif + .align 4 + +LL(46): + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + FMA1 f4, f16, f22, f4 + FMA4 f7, f17, f22, f7 + FMA2 f5, f16, f23, f5 + FMA3 f6, f17, f23, f6 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(48): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + + FADD f0, f0, f2 + FADD f1, f1, f3 + FADD f4, f4, f6 + FADD f5, f5, f7 + + FNMSUB f24, f31, f1, f16 + FMADD f25, f31, f0, f17 + FNMSUB f26, f31, f5, f20 + FMADD f27, f31, f4, f21 + + FMADD f0, f30, f0, f24 + FMADD f1, f30, f1, f25 + FMADD f4, f30, f4, f26 + FMADD f5, f30, f5, f27 + +#else + FADD f0, f0, f2 + FADD f1, f1, f3 + FADD f4, f4, f6 + FADD f5, f5, f7 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMUL f18, f31, f5 + FMUL f19, f31, f4 + + FMSUB f0, f30, f0, f16 + FMADD f1, f30, f1, f17 + FMSUB f4, f30, f4, f18 + FMADD f5, f30, f5, f19 + +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +LL(49): +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 2 +#endif + + mr B, BO + .align 4 + +LL(50): + andi. J, N, 1 + ble LL(999) + + mr CO1, C + add C, CO1, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + mr AO, A + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + srawi. I, M, 1 + ble LL(60) + .align 4 + +LL(51): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbtst CO1, PREC + + srawi. r0, K, 3 + mr BO, B + mtspr CTR, r0 + ble LL(55) +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + + dcbtst CO1, PREC + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 3 + mtspr CTR, TEMP + ble LL(55) +#endif + .align 4 + +LL(52): + dcbt AO, PREA + dcbtst BO, PREA + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + FMA4 f9, f17, f20, f9 + FMA4 f11, f19, f20, f11 + FMA3 f8, f17, f21, f8 + FMA3 f10, f19, f21, f10 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMA1 f0, f16, f22, f0 + FMA1 f2, f18, f22, f2 + FMA2 f1, f16, f23, f1 + FMA2 f3, f18, f23, f3 + + FMA4 f9, f17, f22, f9 + FMA4 f11, f19, f22, f11 + FMA3 f8, f17, f23, f8 + FMA3 f10, f19, f23, f10 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + FMA4 f9, f17, f20, f9 + FMA4 f11, f19, f20, f11 + FMA3 f8, f17, f21, f8 + FMA3 f10, f19, f21, f10 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMA1 f0, f16, f22, f0 + FMA1 f2, f18, f22, f2 + FMA2 f1, f16, f23, f1 + FMA2 f3, f18, f23, f3 + + FMA4 f9, f17, f22, f9 + FMA4 f11, f19, f22, f11 + FMA3 f8, f17, f23, f8 + FMA3 f10, f19, f23, f10 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + FMA4 f9, f17, f20, f9 + FMA4 f11, f19, f20, f11 + FMA3 f8, f17, f21, f8 + FMA3 f10, f19, f21, f10 + + LFD f16, 20 * SIZE(AO) + LFD f17, 21 * SIZE(AO) + LFD f18, 22 * SIZE(AO) + LFD f19, 23 * SIZE(AO) + + FMA1 f0, f16, f22, f0 + FMA1 f2, f18, f22, f2 + FMA2 f1, f16, f23, f1 + FMA2 f3, f18, f23, f3 + + FMA4 f9, f17, f22, f9 + FMA4 f11, f19, f22, f11 + FMA3 f8, f17, f23, f8 + FMA3 f10, f19, f23, f10 + + LFD f16, 24 * SIZE(AO) + LFD f17, 25 * SIZE(AO) + LFD f18, 26 * SIZE(AO) + LFD f19, 27 * SIZE(AO) + + LFD f20, 12 * SIZE(BO) + LFD f21, 13 * SIZE(BO) + LFD f22, 14 * SIZE(BO) + LFD f23, 15 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + FMA4 f9, f17, f20, f9 + FMA4 f11, f19, f20, f11 + FMA3 f8, f17, f21, f8 + FMA3 f10, f19, f21, f10 + + LFD f16, 28 * SIZE(AO) + LFD f17, 29 * SIZE(AO) + LFD f18, 30 * SIZE(AO) + LFD f19, 31 * SIZE(AO) + + FMA1 f0, f16, f22, f0 + FMA1 f2, f18, f22, f2 + FMA2 f1, f16, f23, f1 + FMA2 f3, f18, f23, f3 + + FMA4 f9, f17, f22, f9 + FMA4 f11, f19, f22, f11 + FMA3 f8, f17, f23, f8 + FMA3 f10, f19, f23, f10 + + LFD f16, 32 * SIZE(AO) + LFD f17, 33 * SIZE(AO) + LFD f18, 34 * SIZE(AO) + LFD f19, 35 * SIZE(AO) + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + addi AO, AO, 32 * SIZE + addi BO, BO, 16 * SIZE + + bdnz LL(52) + .align 4 + +LL(55): + lfd f30, ALPHA_R + lfd f31, ALPHA_I + +#ifndef TRMMKERNEL + andi. r0, K, 7 + mtspr CTR, r0 + ble LL(58) +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP + ble LL(58) +#endif + .align 4 + +LL(56): + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + + FMA4 f9, f17, f20, f9 + FMA4 f11, f19, f20, f11 + FMA3 f8, f17, f21, f8 + FMA3 f10, f19, f21, f10 + + LFD f17, 5 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 2 * SIZE + bdnz LL(56) + .align 4 + +LL(58): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + FADD f0, f0, f8 + FADD f1, f1, f9 + FADD f2, f2, f10 + FADD f3, f3, f11 + + FNMSUB f24, f31, f1, f16 + FMADD f25, f31, f0, f17 + FNMSUB f26, f31, f3, f18 + FMADD f27, f31, f2, f19 + + FMADD f0, f30, f0, f24 + FMADD f1, f30, f1, f25 + FMADD f2, f30, f2, f26 + FMADD f3, f30, f3, f27 + +#else + FADD f0, f0, f8 + FADD f1, f1, f9 + FADD f2, f2, f10 + FADD f3, f3, f11 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMUL f18, f31, f3 + FMUL f19, f31, f2 + + FMSUB f0, f30, f0, f16 + FMADD f1, f30, f1, f17 + FMADD f2, f30, f2, f18 + FMADD f3, f30, f3, f19 + +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + addi CO1, CO1, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + addic. I, I, -1 + bgt LL(51) + .align 4 + +LL(60): + andi. I, M, 1 + ble LL(999) + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(65) +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble LL(65) +#endif + .align 4 + +LL(62): + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + + FMA1 f0, f18, f22, f0 + FMA4 f3, f19, f22, f3 + FMA2 f1, f18, f23, f1 + FMA3 f2, f19, f23, f2 + + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + + FMA1 f0, f18, f22, f0 + FMA4 f3, f19, f22, f3 + FMA2 f1, f18, f23, f1 + FMA3 f2, f19, f23, f2 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): + lfd f30, ALPHA_R + lfd f31, ALPHA_I + +#ifndef TRMMKERNEL + andi. r0, K, 3 + mtspr CTR, r0 + ble LL(68) +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + ble LL(68) +#endif + .align 4 + +LL(66): + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + LFD f20, 2 * SIZE(BO) + FMA2 f1, f16, f21, f1 + LFD f16, 2 * SIZE(AO) + FMA3 f2, f17, f21, f2 + LFD f17, 3 * SIZE(AO) + + LFD f21, 3 * SIZE(BO) + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + bdnz LL(66) + .align 4 + +LL(68): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + + FADD f0, f0, f2 + FADD f1, f1, f3 + + FNMSUB f24, f31, f1, f16 + FMADD f25, f31, f0, f17 + + FMADD f0, f30, f0, f24 + FMADD f1, f30, f1, f25 + +#else + + FADD f0, f0, f2 + FADD f1, f1, f3 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + + FMSUB f0, f30, f0, f16 + FMADD f1, f30, f1, f17 + +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + addi CO1, CO1, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) +#ifdef TRMMKERNEL + ld r20, 232(SP) + ld r19, 240(SP) +#endif +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) +#ifdef TRMMKERNEL + lwz r20, 188(SP) + lwz r19, 192(SP) +#endif +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemm_kernel_ppc440.S b/kernel/power/zgemm_kernel_ppc440.S new file mode 100644 index 0000000..2a80c97 --- /dev/null +++ b/kernel/power/zgemm_kernel_ppc440.S @@ -0,0 +1,1700 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define TEMP r22 +#define KK r23 +#define I r24 +#define J r25 +#define AO r26 +#define BO r27 +#define CO1 r28 +#define CO2 r29 + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define B1 f22 +#define B2 f23 +#define B3 f24 +#define B4 f25 +#define B5 f26 +#define B6 f27 +#define B7 f28 +#define B8 f29 +#define B9 f30 +#define B10 f31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) +#ifdef TRMMKERNEL + std r23, 208(SP) + std r22, 216(SP) +#endif +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) +#ifdef TRMMKERNEL + stw r23, 176(SP) + stw r22, 180(SP) +#endif +#endif + + stfd f1, ALPHA_R + stfd f2, ALPHA_I + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + lfs f0, FZERO + + srawi. J, N, 1 + ble .L30 + .align 4 + +.L10: + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + srawi. I, M, 1 + mr AO, A + ble .L20 + .align 4 + +.L11: +#ifndef TRMMKERNEL + LFD A1, 0 * SIZE(AO) ### + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) ### + LFD A5, 8 * SIZE(AO) ### + + LFD B1, 0 * SIZE(B) ### + LFD B2, 1 * SIZE(B) + LFD B3, 2 * SIZE(B) + LFD B4, 3 * SIZE(B) + LFD B5, 4 * SIZE(B) ### + LFD B6, 8 * SIZE(B) ### + LFD B7, 12 * SIZE(B) ### + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble .L15 +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD A1, 0 * SIZE(AO) ### + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) ### + LFD A5, 8 * SIZE(AO) ### + + LFD B1, 0 * SIZE(B) ### + LFD B2, 1 * SIZE(B) + LFD B3, 2 * SIZE(B) + LFD B4, 3 * SIZE(B) + LFD B5, 4 * SIZE(B) ### + LFD B6, 8 * SIZE(B) ### + LFD B7, 12 * SIZE(B) ### + mr BO, B +#else + slwi r0, KK, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, r0 + + LFD A1, 0 * SIZE(AO) ### + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) ### + LFD A5, 8 * SIZE(AO) ### + + LFD B1, 0 * SIZE(BO) ### + LFD B2, 1 * SIZE(BO) + LFD B3, 2 * SIZE(BO) + LFD B4, 3 * SIZE(BO) + LFD B5, 4 * SIZE(BO) ### + LFD B6, 8 * SIZE(BO) ### + LFD B7, 12 * SIZE(BO) ### +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble .L15 +#endif + .align 4 + +.L12: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + LFD A6, 12 * SIZE(AO) ### + FMADD f8, A1, B3, f8 + nop + FMADD f12, A1, B4, f12 + nop + + FMADD f1, A2, B1, f1 + LFD A1, 3 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B1, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 5 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 6 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10, 7 * SIZE(BO) + + FMADD f3, A1, B1, f3 + LFD A2, 5 * SIZE(AO) + FMADD f7, A1, B2, f7 + LFD B1, 16 * SIZE(BO) ### + FMADD f11, A1, B3, f11 + nop + FMADD f15, A1, B4, f15 + nop + +############ + + FMADD f0, A4, B5, f0 + LFD A3, 6 * SIZE(AO) + FMADD f4, A4, B8, f4 + LFD A1, 16 * SIZE(AO) ### + FMADD f8, A4, B9, f8 + nop + FMADD f12, A4, B10, f12 + nop + + FMADD f1, A2, B5, f1 + LFD A4, 7 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B5, f2 + nop + FMADD f6, A3, B8, f6 + LFD B2, 9 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 10 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 11 * SIZE(BO) + + FMADD f3, A4, B5, f3 + LFD A2, 9 * SIZE(AO) + FMADD f7, A4, B8, f7 + LFD B5, 20 * SIZE(BO) ### + FMADD f11, A4, B9, f11 + nop + FMADD f15, A4, B10, f15 + nop + +############ + + FMADD f0, A5, B6, f0 + LFD A3, 10 * SIZE(AO) + FMADD f4, A5, B2, f4 + LFD A4, 20 * SIZE(AO) ### + FMADD f8, A5, B3, f8 + nop + FMADD f12, A5, B4, f12 + nop + + FMADD f1, A2, B6, f1 + LFD A5, 11 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B6, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 13 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 14 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10,15 * SIZE(BO) + + FMADD f3, A5, B6, f3 + LFD A2, 13 * SIZE(AO) + FMADD f7, A5, B2, f7 + LFD B6, 24 * SIZE(BO) ### + FMADD f11, A5, B3, f11 + nop + FMADD f15, A5, B4, f15 + nop + +############ + + FMADD f0, A6, B7, f0 + LFD A3, 14 * SIZE(AO) + FMADD f4, A6, B8, f4 + LFD A5, 24 * SIZE(AO) ### + FMADD f8, A6, B9, f8 + nop + FMADD f12, A6, B10, f12 + nop + + FMADD f1, A2, B7, f1 + LFD A6, 15 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B7, f2 + addi AO, AO, 16 * SIZE + FMADD f6, A3, B8, f6 + LFD B2, 17 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 18 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 19 * SIZE(BO) + + FMADD f3, A6, B7, f3 + LFD A2, 1 * SIZE(AO) + FMADD f7, A6, B8, f7 + LFD B7, 28 * SIZE(BO) ### + FMADD f11, A6, B9, f11 + addi BO, BO, 16 * SIZE + FMADD f15, A6, B10, f15 + bdnz .L12 + .align 4 + +.L15: +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, r0 + ble .LKERNEL_MainFinish +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, TEMP + ble .LKERNEL_MainFinish +#endif + .align 4 + +.L16: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + FMADD f8, A1, B3, f8 + FMADD f12, A1, B4, f12 + LFD A4, 3 * SIZE(AO) + + FMADD f1, A2, B1, f1 + FMADD f5, A2, B2, f5 + FMADD f9, A2, B3, f9 + FMADD f13, A2, B4, f13 + LFDU A1, 4 * SIZE(AO) + + FMADD f2, A3, B1, f2 + FMADD f6, A3, B2, f6 + FMADD f10, A3, B3, f10 + FMADD f14, A3, B4, f14 + LFD A2, 1 * SIZE(AO) + + FMADD f3, A4, B1, f3 + LFDU B1, 4 * SIZE(BO) + FMADD f7, A4, B2, f7 + LFD B2, 1 * SIZE(BO) + FMADD f11, A4, B3, f11 + LFD B3, 2 * SIZE(BO) + FMADD f15, A4, B4, f15 + LFD B4, 3 * SIZE(BO) + bdnz .L16 + .align 4 + +.LKERNEL_MainFinish: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) +#endif + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) +#endif + + FADD f8, f8, f13 + FSUB f9, f9, f12 + FADD f10, f10, f15 + FSUB f11, f11, f14 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) +#endif + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 + +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 + + FMADD f20, f30, f8, f20 + FMADD f21, f30, f9, f21 + FMADD f22, f30, f10, f22 + FMADD f23, f30, f11, f23 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMUL f20, f30, f8 + FMUL f21, f30, f9 + FMUL f22, f30, f10 + FMUL f23, f30, f11 +#endif + + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + + FNMSUB f20, f31, f9, f20 + FMADD f21, f31, f8, f21 + FNMSUB f22, f31, f11, f22 + FMADD f23, f31, f10, f23 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f20, f30, f8, f20 + FNMSUB f21, f30, f9, f21 + FMADD f22, f30, f10, f22 + FNMSUB f23, f30, f11, f23 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + + FMADD f20, f31, f9, f20 + FMADD f21, f31, f8, f21 + FMADD f22, f31, f11, f22 + FMADD f23, f31, f10, f23 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMUL f20, f30, f8 + FMUL f21, f30, f9 + FMUL f22, f30, f10 + FMUL f23, f30, f11 + + FMADD f16, f31, f1, f16 + FNMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FNMADD f19, f31, f2, f19 + + FMADD f20, f31, f9, f20 + FNMADD f21, f31, f8, f21 + FMADD f22, f31, f11, f22 + FNMADD f23, f31, f10, f23 +#endif +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 2 * SIZE(CO1) + STFD f19, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f20, 0 * SIZE(CO2) + STFD f21, 1 * SIZE(CO2) + STFD f22, 2 * SIZE(CO2) + STFD f23, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -2 +#endif + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + addic. I, I, -1 + bgt .L11 + .align 4 + +.L20: + andi. I, M, 1 + ble .L29 + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble .L25 +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble .L25 +#endif + .align 4 + +.L22: + fmadd f0, f16, f20, f0 + LFD f27, 7 * SIZE(BO) + fmadd f1, f16, f21, f1 + LFD f19, 3 * SIZE(AO) + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFD f16, 4 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFD f20, 8 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 9 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 10 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + LFD f17, 5 * SIZE(AO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 6 * SIZE(AO) + + fmadd f4, f19, f24, f4 + LFD f24, 12 * SIZE(BO) + fmadd f5, f19, f25, f5 + LFD f25, 13 * SIZE(BO) + fmadd f6, f19, f26, f6 + LFD f26, 14 * SIZE(BO) + fmadd f7, f19, f27, f7 + LFD f27, 15 * SIZE(BO) + + fmadd f0, f16, f20, f0 + LFD f19, 7 * SIZE(AO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 8 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFDU f20, 16 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(BO) + + fmadd f0, f18, f24, f0 + LFD f17, 1 * SIZE(AO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 2 * SIZE(AO) + + fmadd f4, f19, f24, f4 + LFD f24, 4 * SIZE(BO) + fmadd f5, f19, f25, f5 + LFD f25, 5 * SIZE(BO) + fmadd f6, f19, f26, f6 + LFD f26, 6 * SIZE(BO) + fmadd f7, f19, f27, f7 + bdnz .L22 + .align 4 + +.L25: +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, r0 + ble .L27 +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, TEMP + ble .L27 +#endif + .align 4 + +.L26: + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + LFDU f16, 2 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFDU f20, 4 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + bdnz .L26 + .align 4 + +.L27: +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + +#endif + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 +#endif + + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMADD f16, f31, f1, f16 + FNMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FNMADD f19, f31, f2, f19 +#endif +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 0 * SIZE(CO2) + STFD f19, 1 * SIZE(CO2) + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 2 +#endif + + mr B, BO + addic. J, J, -1 + lfs f0, FZERO + bgt .L10 + .align 4 + +.L30: + andi. J, N, 1 + ble .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + srawi. I, M, 1 + mr CO1, C + add C, C, LDC + mr AO, A + ble .L40 + .align 4 + +.L31: +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble .L35 +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + mr BO, B +#else + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble .L35 +#endif + .align 4 + +.L32: + fmadd f0, f16, f20, f0 + LFD f27, 7 * SIZE(AO) + fmadd f1, f16, f21, f1 + LFD f19, 3 * SIZE(BO) + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFD f16, 4 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFD f20, 8 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 9 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 10 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 11 * SIZE(AO) + + fmadd f0, f18, f24, f0 + LFD f17, 5 * SIZE(BO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 6 * SIZE(BO) + + fmadd f4, f19, f24, f4 + LFD f24, 12 * SIZE(AO) + fmadd f5, f19, f25, f5 + LFD f25, 13 * SIZE(AO) + fmadd f6, f19, f26, f6 + LFD f26, 14 * SIZE(AO) + fmadd f7, f19, f27, f7 + LFD f27, 15 * SIZE(AO) + + fmadd f0, f16, f20, f0 + LFD f19, 7 * SIZE(BO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 8 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFDU f20, 16 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(AO) + + fmadd f0, f18, f24, f0 + LFD f17, 1 * SIZE(BO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 2 * SIZE(BO) + + fmadd f4, f19, f24, f4 + LFD f24, 4 * SIZE(AO) + fmadd f5, f19, f25, f5 + LFD f25, 5 * SIZE(AO) + fmadd f6, f19, f26, f6 + LFD f26, 6 * SIZE(AO) + fmadd f7, f19, f27, f7 + bdnz .L32 + .align 4 + +.L35: +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, r0 + ble .L37 +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, TEMP + ble .L37 +#endif + .align 4 + +.L36: + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + LFDU f16, 2 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFDU f20, 4 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(AO) + LFD f17, 1 * SIZE(BO) + bdnz .L36 + .align 4 + +.L37: +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + +#endif + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 +#endif + + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMADD f16, f31, f1, f16 + FNMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FNMADD f19, f31, f2, f19 +#endif + +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 2 * SIZE(CO1) + STFD f19, 3 * SIZE(CO1) + + addi CO1, CO1, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + addic. I, I, -1 + bgt .L31 + .align 4 + +.L40: + andi. I, M, 1 + ble .L999 + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble .L45 +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + mr BO, B +#else + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble .L45 +#endif + .align 4 + +.L42: + fmadd f0, f16, f20, f0 + LFD f23, 3 * SIZE(BO) + fmadd f3, f16, f21, f3 + LFD f16, 4 * SIZE(AO) + fmadd f2, f17, f20, f2 + LFD f20, 4 * SIZE(BO) + fmadd f1, f17, f21, f1 + LFD f17, 5 * SIZE(AO) + + fmadd f4, f18, f22, f4 + LFD f21, 5 * SIZE(BO) + fmadd f7, f18, f23, f7 + LFD f18, 6 * SIZE(AO) + fmadd f6, f19, f22, f6 + LFD f22, 6 * SIZE(BO) + fmadd f5, f19, f23, f5 + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f20, f0 + LFD f23, 7 * SIZE(BO) + fmadd f3, f16, f21, f3 + LFDU f16, 8 * SIZE(AO) + fmadd f2, f17, f20, f2 + LFDU f20, 8 * SIZE(BO) + fmadd f1, f17, f21, f1 + LFD f17, 1 * SIZE(AO) + + fmadd f4, f18, f22, f4 + LFD f21, 1 * SIZE(BO) + fmadd f7, f18, f23, f7 + LFD f18, 2 * SIZE(AO) + fmadd f6, f19, f22, f6 + LFD f22, 2 * SIZE(BO) + fmadd f5, f19, f23, f5 + LFD f19, 3 * SIZE(AO) + bdnz .L42 + .align 4 + +.L45: + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR,r0 + ble .L47 +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR,TEMP + ble .L47 +#endif + .align 4 + +.L46: + fmadd f0, f16, f20, f0 + fmadd f3, f16, f21, f3 + LFDU f16, 2 * SIZE(AO) + fmadd f2, f17, f20, f2 + LFDU f20, 2 * SIZE(BO) + fmadd f1, f17, f21, f1 + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L46 + .align 4 + +.L47: +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + fsub f0, f0, f1 + fadd f2, f2, f3 +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + fadd f0, f0, f1 + fsub f2, f2, f3 +#else + fadd f0, f0, f1 + fsub f2, f3, f2 +#endif + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f2, f17 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f2 +#endif + + FNMSUB f16, f31, f2, f16 + FMADD f17, f31, f0, f17 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC) || defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f2, f17 + + FMADD f16, f31, f2, f16 + FMADD f17, f31, f0, f17 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f2 + + FMADD f16, f31, f2, f16 + FNMADD f17, f31, f0, f17 +#endif + +#endif + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + .align 4 + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) +#ifdef TRMMKERNEL + ld r23, 208(SP) + ld r22, 216(SP) +#endif +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) +#ifdef TRMMKERNEL + lwz r23, 176(SP) + lwz r22, 180(SP) +#endif +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemm_ncopy_hummer_2.S b/kernel/power/zgemm_ncopy_hummer_2.S new file mode 100644 index 0000000..9a6f802 --- /dev/null +++ b/kernel/power/zgemm_ncopy_hummer_2.S @@ -0,0 +1,451 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define AO1 r8 +#define AO2 r9 + +#define J r12 + +#define INC r30 +#define INC2 r31 + +#define c01 f0 +#define c02 f1 +#define c03 f2 +#define c04 f3 +#define c05 f4 +#define c06 f5 +#define c07 f6 +#define c08 f7 +#define c09 f8 +#define c10 f9 +#define c11 f10 +#define c12 f11 +#define c13 f12 +#define c14 f13 +#define c15 f14 +#define c16 f15 + + PROLOGUE + PROFCODE + + li r0, -16 + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + + slwi LDA, LDA, ZBASE_SHIFT + + cmpwi cr0, M, 0 + ble- LL(99) + cmpwi cr0, N, 0 + ble- LL(99) + + li INC, 1 * SIZE + li INC2, 2 * SIZE + subi B, B, 2 * SIZE + + andi. r0, A, 2 * SIZE - 1 + bne LL(100) + + subi A, A, 2 * SIZE + srawi. J, N, 1 + ble LL(20) + .align 4 +LL(11): + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(15) + .align 4 + +LL(12): + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO2, INC2 + LFPDUX c03, AO1, INC2 + LFPDUX c04, AO2, INC2 + LFPDUX c05, AO1, INC2 + LFPDUX c06, AO2, INC2 + LFPDUX c07, AO1, INC2 + LFPDUX c08, AO2, INC2 + + LFPDUX c09, AO1, INC2 + LFPDUX c10, AO2, INC2 + LFPDUX c11, AO1, INC2 + LFPDUX c12, AO2, INC2 + LFPDUX c13, AO1, INC2 + LFPDUX c14, AO2, INC2 + LFPDUX c15, AO1, INC2 + LFPDUX c16, AO2, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c06, B, INC2 + STFPDUX c07, B, INC2 + STFPDUX c08, B, INC2 + STFPDUX c09, B, INC2 + STFPDUX c10, B, INC2 + STFPDUX c11, B, INC2 + STFPDUX c12, B, INC2 + STFPDUX c13, B, INC2 + STFPDUX c14, B, INC2 + STFPDUX c15, B, INC2 + STFPDUX c16, B, INC2 + bdnz LL(12) + .align 4 + +LL(15): + andi. r0, M, 7 + ble LL(19) + + andi. r0, M, 4 + beq LL(16) + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO2, INC2 + LFPDUX c03, AO1, INC2 + LFPDUX c04, AO2, INC2 + LFPDUX c05, AO1, INC2 + LFPDUX c06, AO2, INC2 + LFPDUX c07, AO1, INC2 + LFPDUX c08, AO2, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c06, B, INC2 + STFPDUX c07, B, INC2 + STFPDUX c08, B, INC2 + .align 4 + +LL(16): + andi. r0, M, 2 + beq LL(17) + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO2, INC2 + LFPDUX c03, AO1, INC2 + LFPDUX c04, AO2, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + .align 4 + +LL(17): + andi. r0, M, 1 + beq LL(19) + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO2, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + .align 4 + +LL(19): + addic. J, J, -1 + bgt LL(11) + .align 4 + +LL(20): + andi. J, N, 1 + ble LL(99) + + mr AO1, A + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(25) + .align 4 + +LL(22): + LFPDUX c01, AO1, INC2 + LFPDUX c03, AO1, INC2 + LFPDUX c05, AO1, INC2 + LFPDUX c07, AO1, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c07, B, INC2 + bdnz LL(22) + .align 4 + +LL(25): + andi. r0, M, 3 + ble LL(99) + + andi. r0, M, 2 + beq LL(27) + + LFPDUX c01, AO1, INC2 + LFPDUX c03, AO1, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + .align 4 + +LL(27): + andi. r0, M, 1 + beq LL(99) + + LFPDUX c01, AO1, INC2 + + STFPDUX c01, B, INC2 + .align 4 + +LL(99): + addi SP, SP, -4 + + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + .align 4 + +LL(100): + subi A, A, 1 * SIZE + srawi. J, N, 1 + ble LL(120) + .align 4 +LL(111): + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(115) + .align 4 + +LL(112): + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO2, INC + LFDUX c04, AO2, INC + + LFDUX c05, AO1, INC + LFDUX c06, AO1, INC + LFDUX c07, AO2, INC + LFDUX c08, AO2, INC + + LFDUX c09, AO1, INC + LFDUX c10, AO1, INC + LFDUX c11, AO2, INC + LFDUX c12, AO2, INC + fsmfp c01, c02 + + LFDUX c13, AO1, INC + fsmfp c03, c04 + LFDUX c14, AO1, INC + fsmfp c05, c06 + LFDUX c15, AO2, INC + fsmfp c07, c08 + LFDUX c16, AO2, INC + fsmfp c09, c10 + + STFPDUX c01, B, INC2 + fsmfp c11, c12 + STFPDUX c03, B, INC2 + fsmfp c13, c14 + STFPDUX c05, B, INC2 + fsmfp c15, c16 + STFPDUX c07, B, INC2 + + STFPDUX c09, B, INC2 + STFPDUX c11, B, INC2 + STFPDUX c13, B, INC2 + STFPDUX c15, B, INC2 + bdnz LL(112) + .align 4 + +LL(115): + andi. r0, M, 3 + ble LL(119) + + andi. r0, M, 2 + beq LL(117) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO2, INC + LFDUX c04, AO2, INC + + LFDUX c05, AO1, INC + LFDUX c06, AO1, INC + LFDUX c07, AO2, INC + LFDUX c08, AO2, INC + + fsmfp c01, c02 + fsmfp c03, c04 + fsmfp c05, c06 + fsmfp c07, c08 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c07, B, INC2 + .align 4 + +LL(117): + andi. r0, M, 1 + beq LL(119) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO2, INC + LFDUX c04, AO2, INC + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + .align 4 + +LL(119): + addic. J, J, -1 + bgt LL(111) + .align 4 + +LL(120): + andi. J, N, 1 + ble LL(999) + + mr AO1, A + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(125) + .align 4 + +LL(122): + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFDUX c05, AO1, INC + LFDUX c06, AO1, INC + LFDUX c07, AO1, INC + LFDUX c08, AO1, INC + + fsmfp c01, c02 + fsmfp c03, c04 + fsmfp c05, c06 + fsmfp c07, c08 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c07, B, INC2 + bdnz LL(122) + .align 4 + +LL(125): + andi. r0, M, 3 + ble LL(999) + + andi. r0, M, 2 + beq LL(127) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + .align 4 + +LL(127): + andi. r0, M, 1 + beq LL(999) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + + fsmfp c01, c02 + STFPDUX c01, B, INC2 + .align 4 + +LL(999): + addi SP, SP, -4 + + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + EPILOGUE diff --git a/kernel/power/zgemm_ncopy_hummer_4.S b/kernel/power/zgemm_ncopy_hummer_4.S new file mode 100644 index 0000000..0a64d0d --- /dev/null +++ b/kernel/power/zgemm_ncopy_hummer_4.S @@ -0,0 +1,666 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define AO1 r8 +#define AO2 r9 +#define AO3 r10 +#define AO4 r11 + +#define J r12 + +#define INC r30 +#define INC2 r31 + +#define c01 f0 +#define c02 f1 +#define c03 f2 +#define c04 f3 +#define c05 f4 +#define c06 f5 +#define c07 f6 +#define c08 f7 +#define c09 f8 +#define c10 f9 +#define c11 f10 +#define c12 f11 +#define c13 f12 +#define c14 f13 +#define c15 f14 +#define c16 f15 + + PROLOGUE + PROFCODE + + li r0, -16 + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + + slwi LDA, LDA, ZBASE_SHIFT + + cmpwi cr0, M, 0 + ble- LL(99) + cmpwi cr0, N, 0 + ble- LL(99) + + li INC, 1 * SIZE + li INC2, 2 * SIZE + + subi B, B, 2 * SIZE + + andi. r0, A, 2 * SIZE - 1 + bne LL(100) + + subi A, A, 2 * SIZE + srawi. J, N, 2 + ble LL(20) + .align 4 +LL(11): + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(15) + .align 4 + +LL(12): + LFPDUX c01, AO1, INC2 + LFPDUX c05, AO2, INC2 + LFPDUX c09, AO3, INC2 + LFPDUX c13, AO4, INC2 + + LFPDUX c02, AO1, INC2 + LFPDUX c06, AO2, INC2 + LFPDUX c10, AO3, INC2 + LFPDUX c14, AO4, INC2 + + LFPDUX c03, AO1, INC2 + LFPDUX c07, AO2, INC2 + LFPDUX c11, AO3, INC2 + LFPDUX c15, AO4, INC2 + + LFPDUX c04, AO1, INC2 + LFPDUX c08, AO2, INC2 + LFPDUX c12, AO3, INC2 + LFPDUX c16, AO4, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c09, B, INC2 + STFPDUX c13, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c06, B, INC2 + STFPDUX c10, B, INC2 + STFPDUX c14, B, INC2 + + STFPDUX c03, B, INC2 + STFPDUX c07, B, INC2 + STFPDUX c11, B, INC2 + STFPDUX c15, B, INC2 + STFPDUX c04, B, INC2 + STFPDUX c08, B, INC2 + STFPDUX c12, B, INC2 + STFPDUX c16, B, INC2 + bdnz LL(12) + .align 4 + +LL(15): + andi. r0, M, 3 + ble LL(19) + + andi. r0, M, 2 + beq LL(17) + + LFPDUX c01, AO1, INC2 + LFPDUX c05, AO2, INC2 + LFPDUX c09, AO3, INC2 + LFPDUX c13, AO4, INC2 + + LFPDUX c02, AO1, INC2 + LFPDUX c06, AO2, INC2 + LFPDUX c10, AO3, INC2 + LFPDUX c14, AO4, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c09, B, INC2 + STFPDUX c13, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c06, B, INC2 + STFPDUX c10, B, INC2 + STFPDUX c14, B, INC2 + .align 4 + +LL(17): + andi. r0, M, 1 + beq LL(19) + + LFPDUX c01, AO1, INC2 + LFPDUX c05, AO2, INC2 + LFPDUX c09, AO3, INC2 + LFPDUX c13, AO4, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c09, B, INC2 + STFPDUX c13, B, INC2 + .align 4 + +LL(19): + addic. J, J, -1 + bgt LL(11) + .align 4 + +LL(20): + andi. J, N, 2 + ble LL(30) + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(25) + .align 4 + +LL(22): + LFPDUX c01, AO1, INC2 + LFPDUX c05, AO2, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c06, AO2, INC2 + + LFPDUX c03, AO1, INC2 + LFPDUX c07, AO2, INC2 + LFPDUX c04, AO1, INC2 + LFPDUX c08, AO2, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c06, B, INC2 + + STFPDUX c03, B, INC2 + STFPDUX c07, B, INC2 + STFPDUX c04, B, INC2 + STFPDUX c08, B, INC2 + bdnz LL(22) + .align 4 + +LL(25): + andi. r0, M, 3 + ble LL(30) + + andi. r0, M, 2 + beq LL(27) + + LFPDUX c01, AO1, INC2 + LFPDUX c05, AO2, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c06, AO2, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c06, B, INC2 + .align 4 + +LL(27): + andi. r0, M, 1 + beq LL(30) + + LFPDUX c01, AO1, INC2 + LFPDUX c05, AO2, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + .align 4 + +LL(30): + andi. J, N, 1 + ble LL(99) + + mr AO1, A + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(35) + .align 4 + +LL(32): + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO1, INC2 + LFPDUX c04, AO1, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + bdnz LL(32) + .align 4 + +LL(35): + andi. r0, M, 3 + ble LL(99) + + andi. r0, M, 2 + beq LL(37) + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + .align 4 + +LL(37): + andi. r0, M, 1 + beq LL(99) + + LFPDUX c01, AO1, INC2 + + STFPDUX c01, B, INC2 + .align 4 + +LL(99): + addi SP, SP, -4 + + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + .align 4 + +LL(100): + subi A, A, 1 * SIZE + srawi. J, N, 2 + ble LL(120) + .align 4 +LL(111): + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(115) + .align 4 + +LL(112): + LFDUX c01, AO1, INC + LFDUX c05, AO2, INC + LFDUX c09, AO3, INC + LFDUX c13, AO4, INC + + LFSDUX c01, AO1, INC + LFSDUX c05, AO2, INC + LFSDUX c09, AO3, INC + LFSDUX c13, AO4, INC + + LFDUX c02, AO1, INC + LFDUX c06, AO2, INC + LFDUX c10, AO3, INC + LFDUX c14, AO4, INC + + LFSDUX c02, AO1, INC + LFSDUX c06, AO2, INC + LFSDUX c10, AO3, INC + LFSDUX c14, AO4, INC + + LFDUX c03, AO1, INC + LFDUX c07, AO2, INC + LFDUX c11, AO3, INC + LFDUX c15, AO4, INC + + LFSDUX c03, AO1, INC + LFSDUX c07, AO2, INC + LFSDUX c11, AO3, INC + LFSDUX c15, AO4, INC + + LFDUX c04, AO1, INC + LFDUX c08, AO2, INC + LFDUX c12, AO3, INC + LFDUX c16, AO4, INC + + LFSDUX c04, AO1, INC + LFSDUX c08, AO2, INC + LFSDUX c12, AO3, INC + LFSDUX c16, AO4, INC + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c09, B, INC2 + STFPDUX c13, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c06, B, INC2 + STFPDUX c10, B, INC2 + STFPDUX c14, B, INC2 + + STFPDUX c03, B, INC2 + STFPDUX c07, B, INC2 + STFPDUX c11, B, INC2 + STFPDUX c15, B, INC2 + STFPDUX c04, B, INC2 + STFPDUX c08, B, INC2 + STFPDUX c12, B, INC2 + STFPDUX c16, B, INC2 + bdnz LL(112) + .align 4 + +LL(115): + andi. r0, M, 3 + ble LL(119) + + andi. r0, M, 2 + beq LL(117) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c05, AO2, INC + LFDUX c06, AO2, INC + + LFDUX c09, AO3, INC + LFDUX c10, AO3, INC + LFDUX c13, AO4, INC + LFDUX c14, AO4, INC + + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + LFDUX c07, AO2, INC + LFDUX c08, AO2, INC + fsmfp c01, c02 + + LFDUX c11, AO3, INC + fsmfp c05, c06 + LFDUX c12, AO3, INC + fsmfp c09, c10 + LFDUX c15, AO4, INC + fsmfp c13, c14 + LFDUX c16, AO4, INC + fsmfp c03, c04 + + STFPDUX c01, B, INC2 + fsmfp c07, c08 + STFPDUX c05, B, INC2 + fsmfp c11, c12 + STFPDUX c09, B, INC2 + fsmfp c15, c16 + STFPDUX c13, B, INC2 + + STFPDUX c03, B, INC2 + STFPDUX c07, B, INC2 + STFPDUX c11, B, INC2 + STFPDUX c15, B, INC2 + .align 4 + +LL(117): + andi. r0, M, 1 + beq LL(119) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO2, INC + LFDUX c04, AO2, INC + + LFDUX c05, AO3, INC + LFDUX c06, AO3, INC + LFDUX c07, AO4, INC + LFDUX c08, AO4, INC + + fsmfp c01, c02 + fsmfp c03, c04 + fsmfp c05, c06 + fsmfp c07, c08 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c07, B, INC2 + .align 4 + +LL(119): + addic. J, J, -1 + bgt LL(111) + .align 4 + +LL(120): + andi. J, N, 2 + ble LL(130) + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(125) + .align 4 + +LL(122): + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c09, AO2, INC + LFDUX c10, AO2, INC + + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + LFDUX c11, AO2, INC + LFDUX c12, AO2, INC + + LFDUX c05, AO1, INC + LFDUX c06, AO1, INC + LFDUX c13, AO2, INC + LFDUX c14, AO2, INC + fsmfp c01, c02 + + LFDUX c07, AO1, INC + fsmfp c09, c10 + LFDUX c08, AO1, INC + fsmfp c03, c04 + LFDUX c15, AO2, INC + fsmfp c11, c12 + LFDUX c16, AO2, INC + fsmfp c05, c06 + + STFPDUX c01, B, INC2 + fsmfp c13, c14 + STFPDUX c09, B, INC2 + fsmfp c07, c08 + STFPDUX c03, B, INC2 + fsmfp c15, c16 + STFPDUX c11, B, INC2 + + STFPDUX c05, B, INC2 + STFPDUX c13, B, INC2 + STFPDUX c07, B, INC2 + STFPDUX c15, B, INC2 + bdnz LL(122) + .align 4 + +LL(125): + andi. r0, M, 3 + ble LL(130) + + andi. r0, M, 2 + beq LL(127) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO2, INC + LFDUX c04, AO2, INC + + LFDUX c05, AO1, INC + LFDUX c06, AO1, INC + LFDUX c07, AO2, INC + LFDUX c08, AO2, INC + + fsmfp c01, c02 + fsmfp c03, c04 + fsmfp c05, c06 + fsmfp c07, c08 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c07, B, INC2 + .align 4 + +LL(127): + andi. r0, M, 1 + beq LL(130) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO2, INC + LFDUX c04, AO2, INC + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + .align 4 + +LL(130): + andi. J, N, 1 + ble LL(999) + + mr AO1, A + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(135) + .align 4 + +LL(132): + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFDUX c05, AO1, INC + LFDUX c06, AO1, INC + LFDUX c07, AO1, INC + LFDUX c08, AO1, INC + + fsmfp c01, c02 + fsmfp c03, c04 + fsmfp c05, c06 + fsmfp c07, c08 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c07, B, INC2 + bdnz LL(132) + .align 4 + +LL(135): + andi. r0, M, 3 + ble LL(999) + + andi. r0, M, 2 + beq LL(137) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + .align 4 + +LL(137): + andi. r0, M, 1 + beq LL(999) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + + fsmfp c01, c02 + STFPDUX c01, B, INC2 + .align 4 + +LL(999): + addi SP, SP, -4 + + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + .align 4 + + + + + EPILOGUE diff --git a/kernel/power/zgemm_tcopy_hummer_2.S b/kernel/power/zgemm_tcopy_hummer_2.S new file mode 100644 index 0000000..bc2a083 --- /dev/null +++ b/kernel/power/zgemm_tcopy_hummer_2.S @@ -0,0 +1,308 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define AO1 r8 +#define AO2 r9 + +#define J r10 +#define B1 r11 + +#define B2 r28 +#define M4 r29 +#define INC r30 +#define INC2 r31 + +#define c01 f0 +#define c02 f1 +#define c03 f2 +#define c04 f3 +#define c05 f4 +#define c06 f5 +#define c07 f6 +#define c08 f7 + + PROLOGUE + PROFCODE + + stwu r31, -4(SP) + stwu r30, -4(SP) + stwu r29, -4(SP) + stwu r28, -4(SP) + + slwi LDA, LDA, ZBASE_SHIFT + slwi M4, M, 1 + ZBASE_SHIFT + + li r9, -2 + + and B2, N, r9 + + mullw B2, B2, M + + slwi B2, B2, ZBASE_SHIFT + + add B2, B2, B + + cmpwi cr0, M, 0 + ble- LL(99) + cmpwi cr0, N, 0 + ble- LL(99) + + subi B2, B2, 2 * SIZE + subi M4, M4, 6 * SIZE + + li INC, 1 * SIZE + li INC2, 2 * SIZE + + andi. r0, A, 2 * SIZE - 1 + bne LL(100) + + subi A, A, 2 * SIZE + srawi. J, M, 1 + ble LL(20) + .align 4 + +LL(10): + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + sub B1, B, M4 + addi B, B, 8 * SIZE + + srawi. r0, N, 1 + mtspr CTR, r0 + ble LL(15) + .align 4 + +LL(12): + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO2, INC2 + LFPDUX c04, AO2, INC2 + + STFPDUX c01, B1, M4 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + bdnz LL(12) + .align 4 + +LL(15): + andi. r0, N, 1 + ble LL(19) + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO2, INC2 + + STFPDUX c01, B2, INC2 + STFPDUX c02, B2, INC2 + .align 4 + +LL(19): + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(20): + andi. J, M, 1 + addi M4, M4, 4 * SIZE + ble LL(99) + + mr AO1, A + sub B1, B, M4 + + srawi. r0, N, 1 + mtspr CTR, r0 + ble LL(23) + .align 4 + +LL(22): + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + + STFPDUX c01, B1, M4 + STFPDUX c02, B1, INC2 + bdnz LL(22) + .align 4 + +LL(23): + andi. r0, N, 1 + ble LL(99) + + LFPDUX c01, AO1, INC2 + + STFPDUX c01, B2, INC2 + .align 4 + +LL(99): + addi SP, SP, -4 + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + addi SP, SP, 4 + blr + .align 4 + +LL(100): + subi A, A, SIZE + srawi. J, M, 1 + ble LL(120) + .align 4 + +LL(110): + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + sub B1, B, M4 + addi B, B, 8 * SIZE + + srawi. r0, N, 1 + mtspr CTR, r0 + ble LL(115) + .align 4 + +LL(112): + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFDUX c05, AO2, INC + fsmfp c01, c02 + LFDUX c06, AO2, INC + fsmfp c03, c04 + LFDUX c07, AO2, INC + fsmfp c05, c06 + LFDUX c08, AO2, INC + fsmfp c07, c08 + + STFPDUX c01, B1, M4 + STFPDUX c03, B1, INC2 + STFPDUX c05, B1, INC2 + STFPDUX c07, B1, INC2 + bdnz LL(112) + .align 4 + +LL(115): + andi. r0, N, 1 + ble LL(119) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO2, INC + LFDUX c04, AO2, INC + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B2, INC2 + STFPDUX c03, B2, INC2 + .align 4 + +LL(119): + addic. J, J, -1 + bgt LL(110) + .align 4 + +LL(120): + andi. J, M, 1 + addi M4, M4, 4 * SIZE + ble LL(999) + + mr AO1, A + sub B1, B, M4 + + srawi. r0, N, 1 + mtspr CTR, r0 + ble LL(123) + .align 4 + +LL(122): + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B1, M4 + STFPDUX c03, B1, INC2 + bdnz LL(122) + .align 4 + +LL(123): + andi. r0, N, 1 + ble LL(999) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + + fsmfp c01, c02 + + STFPDUX c01, B2, INC2 + .align 4 + +LL(999): + addi SP, SP, -4 + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + addi SP, SP, 4 + blr + + + + + EPILOGUE diff --git a/kernel/power/zgemm_tcopy_hummer_4.S b/kernel/power/zgemm_tcopy_hummer_4.S new file mode 100644 index 0000000..7011dc2 --- /dev/null +++ b/kernel/power/zgemm_tcopy_hummer_4.S @@ -0,0 +1,705 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define AO1 r8 +#define AO2 r9 +#define AO3 r10 +#define AO4 r11 + +#define J r25 +#define B1 r26 +#define B2 r27 +#define B3 r28 +#define M4 r29 +#define INC r30 +#define INC2 r31 + +#define c01 f0 +#define c02 f1 +#define c03 f2 +#define c04 f3 +#define c05 f4 +#define c06 f5 +#define c07 f6 +#define c08 f7 +#define c09 f8 +#define c10 f9 +#define c11 f10 +#define c12 f11 +#define c13 f12 +#define c14 f13 +#define c15 f14 +#define c16 f15 + + PROLOGUE + PROFCODE + + li r0, -16 + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + stwu r29, -4(SP) + stwu r28, -4(SP) + + stwu r27, -4(SP) + stwu r26, -4(SP) + stwu r25, -4(SP) + + slwi LDA, LDA, ZBASE_SHIFT + slwi M4, M, 2 + ZBASE_SHIFT + + li r8, -4 + li r9, -2 + + and B2, N, r8 + and B3, N, r9 + + mullw B2, B2, M + mullw B3, B3, M + + slwi B2, B2, ZBASE_SHIFT + slwi B3, B3, ZBASE_SHIFT + + add B2, B2, B + add B3, B3, B + + cmpwi cr0, M, 0 + ble- LL(99) + cmpwi cr0, N, 0 + ble- LL(99) + + subi B2, B2, 2 * SIZE + subi B3, B3, 2 * SIZE + subi M4, M4, 30 * SIZE + + li INC, 1 * SIZE + li INC2, 2 * SIZE + + andi. r0, A, 2 * SIZE - 1 + bne LL(100) + + subi A, A, 2 * SIZE + srawi. J, M, 2 + ble LL(20) + .align 4 + +LL(10): + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + sub B1, B, M4 + addi B, B, 32 * SIZE + + srawi. r0, N, 2 + mtspr CTR, r0 + ble LL(15) + .align 4 + +LL(12): + LFPDUX c01, AO1, INC2 + LFPDUX c05, AO2, INC2 + LFPDUX c09, AO3, INC2 + LFPDUX c13, AO4, INC2 + + LFPDUX c02, AO1, INC2 + LFPDUX c06, AO2, INC2 + LFPDUX c10, AO3, INC2 + LFPDUX c14, AO4, INC2 + + LFPDUX c03, AO1, INC2 + LFPDUX c07, AO2, INC2 + LFPDUX c11, AO3, INC2 + LFPDUX c15, AO4, INC2 + + LFPDUX c04, AO1, INC2 + LFPDUX c08, AO2, INC2 + LFPDUX c12, AO3, INC2 + LFPDUX c16, AO4, INC2 + + STFPDUX c01, B1, M4 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + STFPDUX c05, B1, INC2 + STFPDUX c06, B1, INC2 + STFPDUX c07, B1, INC2 + STFPDUX c08, B1, INC2 + STFPDUX c09, B1, INC2 + STFPDUX c10, B1, INC2 + STFPDUX c11, B1, INC2 + STFPDUX c12, B1, INC2 + STFPDUX c13, B1, INC2 + STFPDUX c14, B1, INC2 + STFPDUX c15, B1, INC2 + STFPDUX c16, B1, INC2 + bdnz LL(12) + .align 4 + +LL(15): + andi. r0, N, 3 + ble LL(19) + + andi. r0, N, 2 + ble LL(17) + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO2, INC2 + LFPDUX c04, AO2, INC2 + + LFPDUX c05, AO3, INC2 + LFPDUX c06, AO3, INC2 + LFPDUX c07, AO4, INC2 + LFPDUX c08, AO4, INC2 + + STFPDUX c01, B2, INC2 + STFPDUX c02, B2, INC2 + STFPDUX c03, B2, INC2 + STFPDUX c04, B2, INC2 + STFPDUX c05, B2, INC2 + STFPDUX c06, B2, INC2 + STFPDUX c07, B2, INC2 + STFPDUX c08, B2, INC2 + .align 4 + +LL(17): + andi. r0, N, 1 + ble LL(19) + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO2, INC2 + LFPDUX c03, AO3, INC2 + LFPDUX c04, AO4, INC2 + + STFPDUX c01, B3, INC2 + STFPDUX c02, B3, INC2 + STFPDUX c03, B3, INC2 + STFPDUX c04, B3, INC2 + .align 4 + +LL(19): + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(20): + andi. J, M, 2 + addi M4, M4, 16 * SIZE + + ble LL(30) + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + sub B1, B, M4 + addi B, B, 16 * SIZE + + srawi. r0, N, 2 + mtspr CTR, r0 + ble LL(23) + .align 4 + +LL(22): + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO1, INC2 + LFPDUX c04, AO1, INC2 + + LFPDUX c05, AO2, INC2 + LFPDUX c06, AO2, INC2 + LFPDUX c07, AO2, INC2 + LFPDUX c08, AO2, INC2 + + STFPDUX c01, B1, M4 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + STFPDUX c05, B1, INC2 + STFPDUX c06, B1, INC2 + STFPDUX c07, B1, INC2 + STFPDUX c08, B1, INC2 + bdnz LL(22) + .align 4 + +LL(23): + andi. r0, N, 2 + ble LL(24) + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO2, INC2 + LFPDUX c04, AO2, INC2 + + STFPDUX c01, B2, INC2 + STFPDUX c02, B2, INC2 + STFPDUX c03, B2, INC2 + STFPDUX c04, B2, INC2 + .align 4 + +LL(24): + andi. r0, N, 1 + ble LL(30) + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO2, INC2 + + STFPDUX c01, B3, INC2 + STFPDUX c02, B3, INC2 + .align 4 + +LL(30): + andi. J, M, 1 + addi M4, M4, 8 * SIZE + ble LL(99) + + mr AO1, A + sub B1, B, M4 + + srawi. r0, N, 2 + mtspr CTR, r0 + ble LL(33) + .align 4 + +LL(32): + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO1, INC2 + LFPDUX c04, AO1, INC2 + + STFPDUX c01, B1, M4 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + bdnz LL(32) + .align 4 + +LL(33): + andi. r0, N, 2 + ble LL(34) + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + + + STFPDUX c01, B2, INC2 + STFPDUX c02, B2, INC2 + .align 4 + +LL(34): + andi. r0, N, 1 + ble LL(99) + + LFPDUX c01, AO1, INC2 + + STFPDX c01, B3, INC2 + .align 4 + +LL(99): + addi SP, SP, -4 + + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + + addi SP, SP, 16 + blr + .align 4 + +LL(100): + subi A, A, SIZE + srawi. J, M, 2 + ble LL(120) + .align 4 + +LL(110): + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + sub B1, B, M4 + addi B, B, 32 * SIZE + + srawi. r0, N, 2 + mtspr CTR, r0 + ble LL(115) + .align 4 + +LL(112): + LFDUX c01, AO1, INC + LFDUX c05, AO2, INC + LFDUX c09, AO3, INC + LFDUX c13, AO4, INC + + LFSDUX c01, AO1, INC + LFSDUX c05, AO2, INC + LFSDUX c09, AO3, INC + LFSDUX c13, AO4, INC + + LFDUX c02, AO1, INC + LFDUX c06, AO2, INC + LFDUX c10, AO3, INC + LFDUX c14, AO4, INC + + LFSDUX c02, AO1, INC + LFSDUX c06, AO2, INC + LFSDUX c10, AO3, INC + LFSDUX c14, AO4, INC + + LFDUX c03, AO1, INC + LFDUX c07, AO2, INC + LFDUX c11, AO3, INC + LFDUX c15, AO4, INC + + LFSDUX c03, AO1, INC + LFSDUX c07, AO2, INC + LFSDUX c11, AO3, INC + LFSDUX c15, AO4, INC + + LFDUX c04, AO1, INC + LFDUX c08, AO2, INC + LFDUX c12, AO3, INC + LFDUX c16, AO4, INC + + LFSDUX c04, AO1, INC + LFSDUX c08, AO2, INC + LFSDUX c12, AO3, INC + LFSDUX c16, AO4, INC + + STFPDUX c01, B1, M4 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + STFPDUX c05, B1, INC2 + STFPDUX c06, B1, INC2 + STFPDUX c07, B1, INC2 + STFPDUX c08, B1, INC2 + STFPDUX c09, B1, INC2 + STFPDUX c10, B1, INC2 + STFPDUX c11, B1, INC2 + STFPDUX c12, B1, INC2 + STFPDUX c13, B1, INC2 + STFPDUX c14, B1, INC2 + STFPDUX c15, B1, INC2 + STFPDUX c16, B1, INC2 + bdnz LL(112) + .align 4 + +LL(115): + andi. r0, N, 3 + ble LL(119) + + andi. r0, N, 2 + ble LL(117) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFDUX c05, AO2, INC + LFDUX c06, AO2, INC + LFDUX c07, AO2, INC + LFDUX c08, AO2, INC + + LFDUX c09, AO3, INC + LFDUX c10, AO3, INC + LFDUX c11, AO3, INC + LFDUX c12, AO3, INC + fsmfp c01, c02 + + LFDUX c13, AO4, INC + fsmfp c03, c04 + LFDUX c14, AO4, INC + fsmfp c05, c06 + LFDUX c15, AO4, INC + fsmfp c07, c08 + LFDUX c16, AO4, INC + fsmfp c09, c10 + + STFPDUX c01, B2, INC2 + fsmfp c11, c12 + STFPDUX c03, B2, INC2 + fsmfp c13, c14 + STFPDUX c05, B2, INC2 + fsmfp c15, c16 + STFPDUX c07, B2, INC2 + STFPDUX c09, B2, INC2 + STFPDUX c11, B2, INC2 + STFPDUX c13, B2, INC2 + STFPDUX c15, B2, INC2 + .align 4 + +LL(117): + andi. r0, N, 1 + ble LL(119) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO2, INC + LFDUX c04, AO2, INC + LFDUX c05, AO3, INC + fsmfp c01, c02 + LFDUX c06, AO3, INC + fsmfp c03, c04 + LFDUX c07, AO4, INC + fsmfp c05, c06 + LFDUX c08, AO4, INC + fsmfp c07, c08 + + STFPDUX c01, B3, INC2 + STFPDUX c03, B3, INC2 + STFPDUX c05, B3, INC2 + STFPDUX c07, B3, INC2 + .align 4 + +LL(119): + addic. J, J, -1 + bgt LL(110) + .align 4 + +LL(120): + andi. J, M, 2 + addi M4, M4, 16 * SIZE + + ble LL(130) + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + sub B1, B, M4 + addi B, B, 16 * SIZE + + srawi. r0, N, 2 + mtspr CTR, r0 + ble LL(123) + .align 4 + +LL(122): + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + LFDUX c05, AO1, INC + LFDUX c06, AO1, INC + LFDUX c07, AO1, INC + LFDUX c08, AO1, INC + + LFDUX c09, AO2, INC + LFDUX c10, AO2, INC + LFDUX c11, AO2, INC + LFDUX c12, AO2, INC + fsmfp c01, c02 + LFDUX c13, AO2, INC + fsmfp c03, c04 + LFDUX c14, AO2, INC + fsmfp c05, c06 + LFDUX c15, AO2, INC + fsmfp c07, c08 + LFDUX c16, AO2, INC + fsmfp c09, c10 + + STFPDUX c01, B1, M4 + fsmfp c11, c12 + STFPDUX c03, B1, INC2 + fsmfp c13, c14 + STFPDUX c05, B1, INC2 + fsmfp c15, c16 + STFPDUX c07, B1, INC2 + STFPDUX c09, B1, INC2 + STFPDUX c11, B1, INC2 + STFPDUX c13, B1, INC2 + STFPDUX c15, B1, INC2 + bdnz LL(122) + .align 4 + +LL(123): + andi. r0, N, 2 + ble LL(124) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFDUX c05, AO2, INC + fsmfp c01, c02 + LFDUX c06, AO2, INC + fsmfp c03, c04 + LFDUX c07, AO2, INC + fsmfp c05, c06 + LFDUX c08, AO2, INC + fsmfp c07, c08 + + STFPDUX c01, B2, INC2 + STFPDUX c03, B2, INC2 + STFPDUX c05, B2, INC2 + STFPDUX c07, B2, INC2 + .align 4 + +LL(124): + andi. r0, N, 1 + ble LL(130) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + + LFDUX c03, AO2, INC + LFDUX c04, AO2, INC + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B3, INC2 + STFPDUX c03, B3, INC2 + .align 4 + +LL(130): + andi. J, M, 1 + addi M4, M4, 8 * SIZE + ble LL(999) + + mr AO1, A + sub B1, B, M4 + + srawi. r0, N, 2 + mtspr CTR, r0 + ble LL(133) + .align 4 + +LL(132): + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + LFDUX c05, AO1, INC + fsmfp c01, c02 + LFDUX c06, AO1, INC + fsmfp c03, c04 + LFDUX c07, AO1, INC + fsmfp c05, c06 + LFDUX c08, AO1, INC + fsmfp c07, c08 + + STFPDUX c01, B1, M4 + STFPDUX c03, B1, INC2 + STFPDUX c05, B1, INC2 + STFPDUX c07, B1, INC2 + bdnz LL(132) + .align 4 + +LL(133): + andi. r0, N, 2 + ble LL(134) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B2, INC2 + STFPDUX c03, B2, INC2 + .align 4 + +LL(134): + andi. r0, N, 1 + ble LL(999) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + + fsmfp c01, c02 + STFPDX c01, B3, INC2 + .align 4 + +LL(999): + addi SP, SP, -4 + + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + + addi SP, SP, 16 + blr + + + + EPILOGUE diff --git a/kernel/power/zgemv_n.S b/kernel/power/zgemv_n.S new file mode 100644 index 0000000..00ba966 --- /dev/null +++ b/kernel/power/zgemv_n.S @@ -0,0 +1,4290 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define M r3 +#define N r4 +#define A r6 +#define LDA r7 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#else +#define M r3 +#define N r4 +#define A r8 +#define LDA r9 +#define X r10 +#define INCX r5 +#define Y r6 +#define INCY r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define M r3 +#define N r4 +#define A r10 +#define LDA r5 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 +#else +#define M r3 +#define N r4 +#define A r8 +#define LDA r9 +#define X r10 +#define INCX r5 +#define Y r6 +#define INCY r7 +#endif +#endif + +#define I r11 +#define J r12 + +#define AO1 r14 +#define AO2 r15 +#define AO3 r16 +#define AO4 r17 +#define LDA4 r18 + +#define Y1 r19 +#define Y2 r20 +#define PREA r21 +#define PREC r22 + +#define y01 f0 +#define y02 f1 +#define y03 f2 +#define y04 f3 +#define y05 f4 +#define y06 f5 +#define y07 f6 +#define y08 f7 +#define y09 f8 +#define y10 f9 +#define y11 f10 +#define y12 f11 +#define y13 f12 +#define y14 f13 +#define y15 f14 +#define y16 f15 + +#define alpha1r f16 +#define alpha1i f17 +#define alpha2r f18 +#define alpha2i f19 +#define alpha3r f20 +#define alpha3i f21 +#define alpha4r f22 +#define alpha4i f23 + +#define a1 f24 +#define a2 f25 +#define a3 f26 +#define a4 f27 +#define a5 f28 +#define a6 f29 +#define a7 f30 +#define a8 f31 + +#define alpha_r f14 +#define alpha_i f15 + +#if defined(PPCG4) +#define PREFETCHSIZE_A 34 +#define PREFETCHSIZE_C 16 +#endif + +#if defined(PPC440) || defined(PPC440FP2) +#define PREFETCHSIZE_A 34 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef PPC970 +#define PREFETCHSIZE_A 56 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef CELL +#define PREFETCHSIZE_A 56 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef POWER4 +#define PREFETCHSIZE_A 34 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef POWER5 +#define PREFETCHSIZE_A 40 +#define PREFETCHSIZE_C 24 +#endif + +#ifdef POWER6 +#define PREFETCHSIZE_A 24 +#define PREFETCHSIZE_C 24 +#endif + +#ifndef XCONJ +#define FMADDR FMADD +#define FMSUBR FNMSUB +#else +#define FMADDR FNMSUB +#define FMSUBR FMADD +#endif + +#ifndef CONJ +#define FMADDX FMADD +#define FMSUBX FNMSUB +#else +#define FMADDX FNMSUB +#define FMSUBX FMADD +#endif + +#ifndef NEEDPARAM + +#ifndef __64BIT__ +#define STACKSIZE 224 +#define ALPHA_R 208(SP) +#define ALPHA_I 216(SP) +#else +#define STACKSIZE 280 +#define ALPHA_R 256(SP) +#define ALPHA_I 264(SP) +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r14, 144(SP) + std r15, 152(SP) + std r16, 160(SP) + std r17, 168(SP) + std r18, 176(SP) + std r19, 184(SP) + std r20, 192(SP) + std r21, 200(SP) + std r22, 208(SP) +#else + stw r14, 144(SP) + stw r15, 148(SP) + stw r16, 152(SP) + stw r17, 156(SP) + stw r18, 160(SP) + stw r19, 164(SP) + stw r20, 168(SP) + stw r21, 172(SP) + stw r22, 176(SP) +#endif + +#ifdef linux +#ifndef __64BIT__ + lwz INCY, 8 + STACKSIZE(SP) +#else + ld INCX, 112 + STACKSIZE(SP) + ld Y, 120 + STACKSIZE(SP) + ld INCY, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifndef __64BIT__ +#ifdef DOUBLE + lwz LDA, 56 + STACKSIZE(SP) + lwz X, 60 + STACKSIZE(SP) + lwz INCX, 64 + STACKSIZE(SP) + lwz Y, 68 + STACKSIZE(SP) + lwz INCY, 72 + STACKSIZE(SP) +#else + lwz INCX, 56 + STACKSIZE(SP) + lwz Y, 60 + STACKSIZE(SP) + lwz INCY, 64 + STACKSIZE(SP) +#endif +#else + ld INCX, 112 + STACKSIZE(SP) + ld Y, 120 + STACKSIZE(SP) + ld INCY, 128 + STACKSIZE(SP) +#endif +#endif + + stfd f1, ALPHA_R + stfd f2, ALPHA_I + + slwi LDA4, LDA, ZBASE_SHIFT + 2 + slwi LDA, LDA, ZBASE_SHIFT + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + + li PREA, PREFETCHSIZE_A * SIZE + li PREC, PREFETCHSIZE_C * SIZE + + cmpwi cr0, M, 0 + ble- LL(999) + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpi cr0, 0, INCY, 2 * SIZE + bne LL(100) + + srawi. J, N, 2 + ble LL(20) + .align 4 + +LL(11): + lfd alpha_r, ALPHA_R + lfd alpha_i, ALPHA_I + + LFD a1, 0 * SIZE(X) + LFD a2, 1 * SIZE(X) + add X, X, INCX + LFD a3, 0 * SIZE(X) + LFD a4, 1 * SIZE(X) + add X, X, INCX + LFD a5, 0 * SIZE(X) + LFD a6, 1 * SIZE(X) + add X, X, INCX + LFD a7, 0 * SIZE(X) + LFD a8, 1 * SIZE(X) + add X, X, INCX + + FMUL alpha1r, alpha_r, a1 + FMUL alpha1i, alpha_i, a1 + FMUL alpha2r, alpha_r, a3 + FMUL alpha2i, alpha_i, a3 + FMUL alpha3r, alpha_r, a5 + FMUL alpha3i, alpha_i, a5 + FMUL alpha4r, alpha_r, a7 + FMUL alpha4i, alpha_i, a7 + + FMSUBR alpha1r, alpha_i, a2, alpha1r + FMADDR alpha1i, alpha_r, a2, alpha1i + FMSUBR alpha2r, alpha_i, a4, alpha2r + FMADDR alpha2i, alpha_r, a4, alpha2i + FMSUBR alpha3r, alpha_i, a6, alpha3r + FMADDR alpha3i, alpha_r, a6, alpha3i + FMSUBR alpha4r, alpha_i, a8, alpha4r + FMADDR alpha4i, alpha_r, a8, alpha4i + + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + mr Y1, Y + mr Y2, Y + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(15) + .align 4 + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + LFD y09, 8 * SIZE(Y1) + LFD y10, 9 * SIZE(Y1) + LFD y11, 10 * SIZE(Y1) + LFD y12, 11 * SIZE(Y1) + LFD y13, 12 * SIZE(Y1) + LFD y14, 13 * SIZE(Y1) + LFD y15, 14 * SIZE(Y1) + LFD y16, 15 * SIZE(Y1) + addi Y1, Y1, 16 * SIZE + + bdz LL(13) + .align 4 + +LL(12): + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 8 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a5, 12 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 9 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + addi AO1, AO1, 16 * SIZE + nop + DCBT(AO1, PREA) + nop + + FMADD y09, alpha1r, a1, y09 + FMADD y10, alpha1i, a1, y10 + FMADD y11, alpha1r, a3, y11 + FMADD y12, alpha1i, a3, y12 + + FMADD y13, alpha1r, a5, y13 + FMADD y14, alpha1i, a5, y14 + FMADD y15, alpha1r, a7, y15 + FMADD y16, alpha1i, a7, y16 + + LFD a1, 0 * SIZE(AO2) + LFD a3, 2 * SIZE(AO2) + LFD a5, 4 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + + FMSUBX y09, alpha1i, a2, y09 + FMADDX y10, alpha1r, a2, y10 + FMSUBX y11, alpha1i, a4, y11 + FMADDX y12, alpha1r, a4, y12 + + FMSUBX y13, alpha1i, a6, y13 + FMADDX y14, alpha1r, a6, y14 + FMSUBX y15, alpha1i, a8, y15 + FMADDX y16, alpha1r, a8, y16 + + LFD a2, 1 * SIZE(AO2) + LFD a4, 3 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2r, a1, y01 + FMADD y02, alpha2i, a1, y02 + FMADD y03, alpha2r, a3, y03 + FMADD y04, alpha2i, a3, y04 + + FMADD y05, alpha2r, a5, y05 + FMADD y06, alpha2i, a5, y06 + FMADD y07, alpha2r, a7, y07 + FMADD y08, alpha2i, a7, y08 + + LFD a1, 8 * SIZE(AO2) + LFD a3, 10 * SIZE(AO2) + LFD a5, 12 * SIZE(AO2) + LFD a7, 14 * SIZE(AO2) + + FMSUBX y01, alpha2i, a2, y01 + FMADDX y02, alpha2r, a2, y02 + FMSUBX y03, alpha2i, a4, y03 + FMADDX y04, alpha2r, a4, y04 + + FMSUBX y05, alpha2i, a6, y05 + FMADDX y06, alpha2r, a6, y06 + FMSUBX y07, alpha2i, a8, y07 + FMADDX y08, alpha2r, a8, y08 + + LFD a2, 9 * SIZE(AO2) + LFD a4, 11 * SIZE(AO2) + LFD a6, 13 * SIZE(AO2) + LFD a8, 15 * SIZE(AO2) + + addi AO2, AO2, 16 * SIZE + nop + DCBT(AO2, PREA) + nop + + FMADD y09, alpha2r, a1, y09 + FMADD y10, alpha2i, a1, y10 + FMADD y11, alpha2r, a3, y11 + FMADD y12, alpha2i, a3, y12 + + FMADD y13, alpha2r, a5, y13 + FMADD y14, alpha2i, a5, y14 + FMADD y15, alpha2r, a7, y15 + FMADD y16, alpha2i, a7, y16 + + LFD a1, 0 * SIZE(AO3) + LFD a3, 2 * SIZE(AO3) + LFD a5, 4 * SIZE(AO3) + LFD a7, 6 * SIZE(AO3) + + FMSUBX y09, alpha2i, a2, y09 + FMADDX y10, alpha2r, a2, y10 + FMSUBX y11, alpha2i, a4, y11 + FMADDX y12, alpha2r, a4, y12 + + FMSUBX y13, alpha2i, a6, y13 + FMADDX y14, alpha2r, a6, y14 + FMSUBX y15, alpha2i, a8, y15 + FMADDX y16, alpha2r, a8, y16 + + LFD a2, 1 * SIZE(AO3) + LFD a4, 3 * SIZE(AO3) + LFD a6, 5 * SIZE(AO3) + LFD a8, 7 * SIZE(AO3) + + FMADD y01, alpha3r, a1, y01 + FMADD y02, alpha3i, a1, y02 + FMADD y03, alpha3r, a3, y03 + FMADD y04, alpha3i, a3, y04 + + FMADD y05, alpha3r, a5, y05 + FMADD y06, alpha3i, a5, y06 + FMADD y07, alpha3r, a7, y07 + FMADD y08, alpha3i, a7, y08 + + LFD a1, 8 * SIZE(AO3) + LFD a3, 10 * SIZE(AO3) + LFD a5, 12 * SIZE(AO3) + LFD a7, 14 * SIZE(AO3) + + FMSUBX y01, alpha3i, a2, y01 + FMADDX y02, alpha3r, a2, y02 + FMSUBX y03, alpha3i, a4, y03 + FMADDX y04, alpha3r, a4, y04 + + FMSUBX y05, alpha3i, a6, y05 + FMADDX y06, alpha3r, a6, y06 + FMSUBX y07, alpha3i, a8, y07 + FMADDX y08, alpha3r, a8, y08 + + LFD a2, 9 * SIZE(AO3) + LFD a4, 11 * SIZE(AO3) + LFD a6, 13 * SIZE(AO3) + LFD a8, 15 * SIZE(AO3) + + addi AO3, AO3, 16 * SIZE + nop + DCBT(AO3, PREA) + nop + + FMADD y09, alpha3r, a1, y09 + FMADD y10, alpha3i, a1, y10 + FMADD y11, alpha3r, a3, y11 + FMADD y12, alpha3i, a3, y12 + + FMADD y13, alpha3r, a5, y13 + FMADD y14, alpha3i, a5, y14 + FMADD y15, alpha3r, a7, y15 + FMADD y16, alpha3i, a7, y16 + + LFD a1, 0 * SIZE(AO4) + LFD a3, 2 * SIZE(AO4) + LFD a5, 4 * SIZE(AO4) + LFD a7, 6 * SIZE(AO4) + + FMSUBX y09, alpha3i, a2, y09 + FMADDX y10, alpha3r, a2, y10 + FMSUBX y11, alpha3i, a4, y11 + FMADDX y12, alpha3r, a4, y12 + + FMSUBX y13, alpha3i, a6, y13 + FMADDX y14, alpha3r, a6, y14 + FMSUBX y15, alpha3i, a8, y15 + FMADDX y16, alpha3r, a8, y16 + + LFD a2, 1 * SIZE(AO4) + LFD a4, 3 * SIZE(AO4) + LFD a6, 5 * SIZE(AO4) + LFD a8, 7 * SIZE(AO4) + + FMADD y01, alpha4r, a1, y01 + FMADD y02, alpha4i, a1, y02 + FMADD y03, alpha4r, a3, y03 + FMADD y04, alpha4i, a3, y04 + + FMADD y05, alpha4r, a5, y05 + FMADD y06, alpha4i, a5, y06 + FMADD y07, alpha4r, a7, y07 + FMADD y08, alpha4i, a7, y08 + + LFD a1, 8 * SIZE(AO4) + LFD a3, 10 * SIZE(AO4) + LFD a5, 12 * SIZE(AO4) + LFD a7, 14 * SIZE(AO4) + + FMSUBX y01, alpha4i, a2, y01 + FMADDX y02, alpha4r, a2, y02 + FMSUBX y03, alpha4i, a4, y03 + FMADDX y04, alpha4r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + STFD y03, 2 * SIZE(Y2) + STFD y04, 3 * SIZE(Y2) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + FMSUBX y05, alpha4i, a6, y05 + FMADDX y06, alpha4r, a6, y06 + FMSUBX y07, alpha4i, a8, y07 + FMADDX y08, alpha4r, a8, y08 + + LFD a2, 9 * SIZE(AO4) + LFD a4, 11 * SIZE(AO4) + LFD a6, 13 * SIZE(AO4) + LFD a8, 15 * SIZE(AO4) + + addi AO4, AO4, 16 * SIZE + nop + DCBT(AO4, PREA) + nop + + STFD y05, 4 * SIZE(Y2) + STFD y06, 5 * SIZE(Y2) + STFD y07, 6 * SIZE(Y2) + STFD y08, 7 * SIZE(Y2) + + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + + FMADD y09, alpha4r, a1, y09 + FMADD y10, alpha4i, a1, y10 + FMADD y11, alpha4r, a3, y11 + FMADD y12, alpha4i, a3, y12 + + FMADD y13, alpha4r, a5, y13 + FMADD y14, alpha4i, a5, y14 + FMADD y15, alpha4r, a7, y15 + FMADD y16, alpha4i, a7, y16 + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + + FMSUBX y09, alpha4i, a2, y09 + FMADDX y10, alpha4r, a2, y10 + FMSUBX y11, alpha4i, a4, y11 + FMADDX y12, alpha4r, a4, y12 + + STFD y09, 8 * SIZE(Y2) + STFD y10, 9 * SIZE(Y2) + STFD y11, 10 * SIZE(Y2) + STFD y12, 11 * SIZE(Y2) + + LFD y09, 8 * SIZE(Y1) + LFD y10, 9 * SIZE(Y1) + LFD y11, 10 * SIZE(Y1) + LFD y12, 11 * SIZE(Y1) + + FMSUBX y13, alpha4i, a6, y13 + FMADDX y14, alpha4r, a6, y14 + FMSUBX y15, alpha4i, a8, y15 + FMADDX y16, alpha4r, a8, y16 + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + STFD y13, 12 * SIZE(Y2) + STFD y14, 13 * SIZE(Y2) + STFD y15, 14 * SIZE(Y2) + STFD y16, 15 * SIZE(Y2) + + LFD y13, 12 * SIZE(Y1) + LFD y14, 13 * SIZE(Y1) + LFD y15, 14 * SIZE(Y1) + LFD y16, 15 * SIZE(Y1) + + addi Y2, Y2, 16 * SIZE + addi Y1, Y1, 16 * SIZE + DCBT(Y1, PREC) + bdnz LL(12) + .align 4 + +LL(13): + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 8 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a5, 12 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 9 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + FMADD y09, alpha1r, a1, y09 + FMADD y10, alpha1i, a1, y10 + FMADD y11, alpha1r, a3, y11 + FMADD y12, alpha1i, a3, y12 + + FMADD y13, alpha1r, a5, y13 + FMADD y14, alpha1i, a5, y14 + FMADD y15, alpha1r, a7, y15 + FMADD y16, alpha1i, a7, y16 + + LFD a1, 0 * SIZE(AO2) + LFD a3, 2 * SIZE(AO2) + LFD a5, 4 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + + FMSUBX y09, alpha1i, a2, y09 + FMADDX y10, alpha1r, a2, y10 + FMSUBX y11, alpha1i, a4, y11 + FMADDX y12, alpha1r, a4, y12 + + FMSUBX y13, alpha1i, a6, y13 + FMADDX y14, alpha1r, a6, y14 + FMSUBX y15, alpha1i, a8, y15 + FMADDX y16, alpha1r, a8, y16 + + LFD a2, 1 * SIZE(AO2) + LFD a4, 3 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2r, a1, y01 + FMADD y02, alpha2i, a1, y02 + FMADD y03, alpha2r, a3, y03 + FMADD y04, alpha2i, a3, y04 + + FMADD y05, alpha2r, a5, y05 + FMADD y06, alpha2i, a5, y06 + FMADD y07, alpha2r, a7, y07 + FMADD y08, alpha2i, a7, y08 + + LFD a1, 8 * SIZE(AO2) + LFD a3, 10 * SIZE(AO2) + LFD a5, 12 * SIZE(AO2) + LFD a7, 14 * SIZE(AO2) + + FMSUBX y01, alpha2i, a2, y01 + FMADDX y02, alpha2r, a2, y02 + FMSUBX y03, alpha2i, a4, y03 + FMADDX y04, alpha2r, a4, y04 + + FMSUBX y05, alpha2i, a6, y05 + FMADDX y06, alpha2r, a6, y06 + FMSUBX y07, alpha2i, a8, y07 + FMADDX y08, alpha2r, a8, y08 + + LFD a2, 9 * SIZE(AO2) + LFD a4, 11 * SIZE(AO2) + LFD a6, 13 * SIZE(AO2) + LFD a8, 15 * SIZE(AO2) + + FMADD y09, alpha2r, a1, y09 + FMADD y10, alpha2i, a1, y10 + FMADD y11, alpha2r, a3, y11 + FMADD y12, alpha2i, a3, y12 + + FMADD y13, alpha2r, a5, y13 + FMADD y14, alpha2i, a5, y14 + FMADD y15, alpha2r, a7, y15 + FMADD y16, alpha2i, a7, y16 + + LFD a1, 0 * SIZE(AO3) + LFD a3, 2 * SIZE(AO3) + LFD a5, 4 * SIZE(AO3) + LFD a7, 6 * SIZE(AO3) + + FMSUBX y09, alpha2i, a2, y09 + FMADDX y10, alpha2r, a2, y10 + FMSUBX y11, alpha2i, a4, y11 + FMADDX y12, alpha2r, a4, y12 + + FMSUBX y13, alpha2i, a6, y13 + FMADDX y14, alpha2r, a6, y14 + FMSUBX y15, alpha2i, a8, y15 + FMADDX y16, alpha2r, a8, y16 + + LFD a2, 1 * SIZE(AO3) + LFD a4, 3 * SIZE(AO3) + LFD a6, 5 * SIZE(AO3) + LFD a8, 7 * SIZE(AO3) + + FMADD y01, alpha3r, a1, y01 + FMADD y02, alpha3i, a1, y02 + FMADD y03, alpha3r, a3, y03 + FMADD y04, alpha3i, a3, y04 + + FMADD y05, alpha3r, a5, y05 + FMADD y06, alpha3i, a5, y06 + FMADD y07, alpha3r, a7, y07 + FMADD y08, alpha3i, a7, y08 + + LFD a1, 8 * SIZE(AO3) + LFD a3, 10 * SIZE(AO3) + LFD a5, 12 * SIZE(AO3) + LFD a7, 14 * SIZE(AO3) + + FMSUBX y01, alpha3i, a2, y01 + FMADDX y02, alpha3r, a2, y02 + FMSUBX y03, alpha3i, a4, y03 + FMADDX y04, alpha3r, a4, y04 + + FMSUBX y05, alpha3i, a6, y05 + FMADDX y06, alpha3r, a6, y06 + FMSUBX y07, alpha3i, a8, y07 + FMADDX y08, alpha3r, a8, y08 + + LFD a2, 9 * SIZE(AO3) + LFD a4, 11 * SIZE(AO3) + LFD a6, 13 * SIZE(AO3) + LFD a8, 15 * SIZE(AO3) + + FMADD y09, alpha3r, a1, y09 + FMADD y10, alpha3i, a1, y10 + FMADD y11, alpha3r, a3, y11 + FMADD y12, alpha3i, a3, y12 + + FMADD y13, alpha3r, a5, y13 + FMADD y14, alpha3i, a5, y14 + FMADD y15, alpha3r, a7, y15 + FMADD y16, alpha3i, a7, y16 + + LFD a1, 0 * SIZE(AO4) + LFD a3, 2 * SIZE(AO4) + LFD a5, 4 * SIZE(AO4) + LFD a7, 6 * SIZE(AO4) + + FMSUBX y09, alpha3i, a2, y09 + FMADDX y10, alpha3r, a2, y10 + FMSUBX y11, alpha3i, a4, y11 + FMADDX y12, alpha3r, a4, y12 + + FMSUBX y13, alpha3i, a6, y13 + FMADDX y14, alpha3r, a6, y14 + FMSUBX y15, alpha3i, a8, y15 + FMADDX y16, alpha3r, a8, y16 + + LFD a2, 1 * SIZE(AO4) + LFD a4, 3 * SIZE(AO4) + LFD a6, 5 * SIZE(AO4) + LFD a8, 7 * SIZE(AO4) + + FMADD y01, alpha4r, a1, y01 + FMADD y02, alpha4i, a1, y02 + FMADD y03, alpha4r, a3, y03 + FMADD y04, alpha4i, a3, y04 + + FMADD y05, alpha4r, a5, y05 + FMADD y06, alpha4i, a5, y06 + FMADD y07, alpha4r, a7, y07 + FMADD y08, alpha4i, a7, y08 + + LFD a1, 8 * SIZE(AO4) + LFD a3, 10 * SIZE(AO4) + LFD a5, 12 * SIZE(AO4) + LFD a7, 14 * SIZE(AO4) + + FMSUBX y01, alpha4i, a2, y01 + FMADDX y02, alpha4r, a2, y02 + FMSUBX y03, alpha4i, a4, y03 + FMADDX y04, alpha4r, a4, y04 + + FMSUBX y05, alpha4i, a6, y05 + FMADDX y06, alpha4r, a6, y06 + FMSUBX y07, alpha4i, a8, y07 + FMADDX y08, alpha4r, a8, y08 + + LFD a2, 9 * SIZE(AO4) + LFD a4, 11 * SIZE(AO4) + LFD a6, 13 * SIZE(AO4) + LFD a8, 15 * SIZE(AO4) + + FMADD y09, alpha4r, a1, y09 + FMADD y10, alpha4i, a1, y10 + FMADD y11, alpha4r, a3, y11 + FMADD y12, alpha4i, a3, y12 + + FMADD y13, alpha4r, a5, y13 + FMADD y14, alpha4i, a5, y14 + FMADD y15, alpha4r, a7, y15 + FMADD y16, alpha4i, a7, y16 + + LFD a1, 16 * SIZE(AO1) + LFD a3, 18 * SIZE(AO1) + LFD a5, 20 * SIZE(AO1) + LFD a7, 22 * SIZE(AO1) + + FMSUBX y09, alpha4i, a2, y09 + FMADDX y10, alpha4r, a2, y10 + FMSUBX y11, alpha4i, a4, y11 + FMADDX y12, alpha4r, a4, y12 + + FMSUBX y13, alpha4i, a6, y13 + FMADDX y14, alpha4r, a6, y14 + FMSUBX y15, alpha4i, a8, y15 + FMADDX y16, alpha4r, a8, y16 + + LFD a2, 17 * SIZE(AO1) + LFD a4, 19 * SIZE(AO1) + LFD a6, 21 * SIZE(AO1) + LFD a8, 23 * SIZE(AO1) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi AO3, AO3, 16 * SIZE + addi AO4, AO4, 16 * SIZE + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + STFD y03, 2 * SIZE(Y2) + STFD y04, 3 * SIZE(Y2) + STFD y05, 4 * SIZE(Y2) + STFD y06, 5 * SIZE(Y2) + STFD y07, 6 * SIZE(Y2) + STFD y08, 7 * SIZE(Y2) + STFD y09, 8 * SIZE(Y2) + STFD y10, 9 * SIZE(Y2) + STFD y11, 10 * SIZE(Y2) + STFD y12, 11 * SIZE(Y2) + STFD y13, 12 * SIZE(Y2) + STFD y14, 13 * SIZE(Y2) + STFD y15, 14 * SIZE(Y2) + STFD y16, 15 * SIZE(Y2) + addi Y2, Y2, 16 * SIZE + .align 4 + +LL(15): + andi. r0, M, 7 + ble LL(19) + andi. r0, M, 4 + ble LL(16) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 0 * SIZE(AO2) + LFD a3, 2 * SIZE(AO2) + LFD a5, 4 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 1 * SIZE(AO2) + LFD a4, 3 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2r, a1, y01 + FMADD y02, alpha2i, a1, y02 + FMADD y03, alpha2r, a3, y03 + FMADD y04, alpha2i, a3, y04 + + FMADD y05, alpha2r, a5, y05 + FMADD y06, alpha2i, a5, y06 + FMADD y07, alpha2r, a7, y07 + FMADD y08, alpha2i, a7, y08 + + LFD a1, 0 * SIZE(AO3) + LFD a3, 2 * SIZE(AO3) + LFD a5, 4 * SIZE(AO3) + LFD a7, 6 * SIZE(AO3) + + FMSUBX y01, alpha2i, a2, y01 + FMADDX y02, alpha2r, a2, y02 + FMSUBX y03, alpha2i, a4, y03 + FMADDX y04, alpha2r, a4, y04 + + FMSUBX y05, alpha2i, a6, y05 + FMADDX y06, alpha2r, a6, y06 + FMSUBX y07, alpha2i, a8, y07 + FMADDX y08, alpha2r, a8, y08 + + LFD a2, 1 * SIZE(AO3) + LFD a4, 3 * SIZE(AO3) + LFD a6, 5 * SIZE(AO3) + LFD a8, 7 * SIZE(AO3) + + FMADD y01, alpha3r, a1, y01 + FMADD y02, alpha3i, a1, y02 + FMADD y03, alpha3r, a3, y03 + FMADD y04, alpha3i, a3, y04 + + FMADD y05, alpha3r, a5, y05 + FMADD y06, alpha3i, a5, y06 + FMADD y07, alpha3r, a7, y07 + FMADD y08, alpha3i, a7, y08 + + LFD a1, 0 * SIZE(AO4) + LFD a3, 2 * SIZE(AO4) + LFD a5, 4 * SIZE(AO4) + LFD a7, 6 * SIZE(AO4) + + FMSUBX y01, alpha3i, a2, y01 + FMADDX y02, alpha3r, a2, y02 + FMSUBX y03, alpha3i, a4, y03 + FMADDX y04, alpha3r, a4, y04 + + FMSUBX y05, alpha3i, a6, y05 + FMADDX y06, alpha3r, a6, y06 + FMSUBX y07, alpha3i, a8, y07 + FMADDX y08, alpha3r, a8, y08 + + LFD a2, 1 * SIZE(AO4) + LFD a4, 3 * SIZE(AO4) + LFD a6, 5 * SIZE(AO4) + LFD a8, 7 * SIZE(AO4) + + FMADD y01, alpha4r, a1, y01 + FMADD y02, alpha4i, a1, y02 + FMADD y03, alpha4r, a3, y03 + FMADD y04, alpha4i, a3, y04 + + FMADD y05, alpha4r, a5, y05 + FMADD y06, alpha4i, a5, y06 + FMADD y07, alpha4r, a7, y07 + FMADD y08, alpha4i, a7, y08 + + FMSUBX y01, alpha4i, a2, y01 + FMADDX y02, alpha4r, a2, y02 + FMSUBX y03, alpha4i, a4, y03 + FMADDX y04, alpha4r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + STFD y03, 2 * SIZE(Y2) + STFD y04, 3 * SIZE(Y2) + + FMSUBX y05, alpha4i, a6, y05 + FMADDX y06, alpha4r, a6, y06 + FMSUBX y07, alpha4i, a8, y07 + FMADDX y08, alpha4r, a8, y08 + + STFD y05, 4 * SIZE(Y2) + STFD y06, 5 * SIZE(Y2) + STFD y07, 6 * SIZE(Y2) + STFD y08, 7 * SIZE(Y2) + + addi AO1, AO1, 8 * SIZE + addi AO2, AO2, 8 * SIZE + addi AO3, AO3, 8 * SIZE + addi AO4, AO4, 8 * SIZE + + addi Y1, Y1, 8 * SIZE + addi Y2, Y2, 8 * SIZE + .align 4 + +LL(16): + andi. r0, M, 2 + nop + nop + ble LL(17) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + LFD a5, 0 * SIZE(AO2) + LFD a6, 1 * SIZE(AO2) + LFD a7, 2 * SIZE(AO2) + LFD a8, 3 * SIZE(AO2) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + LFD a1, 0 * SIZE(AO3) + LFD a2, 1 * SIZE(AO3) + LFD a3, 2 * SIZE(AO3) + LFD a4, 3 * SIZE(AO3) + + FMADD y01, alpha2r, a5, y01 + FMADD y02, alpha2i, a5, y02 + FMADD y03, alpha2r, a7, y03 + FMADD y04, alpha2i, a7, y04 + + FMSUBX y01, alpha2i, a6, y01 + FMADDX y02, alpha2r, a6, y02 + FMSUBX y03, alpha2i, a8, y03 + FMADDX y04, alpha2r, a8, y04 + + LFD a5, 0 * SIZE(AO4) + LFD a6, 1 * SIZE(AO4) + LFD a7, 2 * SIZE(AO4) + LFD a8, 3 * SIZE(AO4) + + FMADD y01, alpha3r, a1, y01 + FMADD y02, alpha3i, a1, y02 + FMADD y03, alpha3r, a3, y03 + FMADD y04, alpha3i, a3, y04 + + FMSUBX y01, alpha3i, a2, y01 + FMADDX y02, alpha3r, a2, y02 + FMSUBX y03, alpha3i, a4, y03 + FMADDX y04, alpha3r, a4, y04 + + FMADD y01, alpha4r, a5, y01 + FMADD y02, alpha4i, a5, y02 + FMADD y03, alpha4r, a7, y03 + FMADD y04, alpha4i, a7, y04 + + FMSUBX y01, alpha4i, a6, y01 + FMADDX y02, alpha4r, a6, y02 + FMSUBX y03, alpha4i, a8, y03 + FMADDX y04, alpha4r, a8, y04 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + STFD y03, 2 * SIZE(Y2) + STFD y04, 3 * SIZE(Y2) + + addi AO1, AO1, 4 * SIZE + addi AO2, AO2, 4 * SIZE + addi AO3, AO3, 4 * SIZE + addi AO4, AO4, 4 * SIZE + + addi Y1, Y1, 4 * SIZE + addi Y2, Y2, 4 * SIZE + .align 4 + +LL(17): + andi. r0, M, 1 + ble LL(19) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 0 * SIZE(AO2) + LFD a4, 1 * SIZE(AO2) + + LFD a5, 0 * SIZE(AO3) + LFD a6, 1 * SIZE(AO3) + LFD a7, 0 * SIZE(AO4) + LFD a8, 1 * SIZE(AO4) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + + FMADD y01, alpha2r, a3, y01 + FMADD y02, alpha2i, a3, y02 + FMSUBX y01, alpha2i, a4, y01 + FMADDX y02, alpha2r, a4, y02 + + FMADD y01, alpha3r, a5, y01 + FMADD y02, alpha3i, a5, y02 + FMSUBX y01, alpha3i, a6, y01 + FMADDX y02, alpha3r, a6, y02 + + FMADD y01, alpha4r, a7, y01 + FMADD y02, alpha4i, a7, y02 + FMSUBX y01, alpha4i, a8, y01 + FMADDX y02, alpha4r, a8, y02 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + + add Y1, Y1, INCY + add Y2, Y2, INCY + .align 4 + +LL(19): + addi J, J, -1 + cmpi cr0, 0, J, 0 + bgt LL(11) + .align 4 + +LL(20): + andi. J, N, 2 + ble LL(30) + .align 4 + +LL(21): + lfd alpha_r, ALPHA_R + lfd alpha_i, ALPHA_I + + LFD a1, 0 * SIZE(X) + LFD a2, 1 * SIZE(X) + add X, X, INCX + LFD a3, 0 * SIZE(X) + LFD a4, 1 * SIZE(X) + add X, X, INCX + + FMUL alpha1r, alpha_r, a1 + FMUL alpha1i, alpha_i, a1 + FMUL alpha2r, alpha_r, a3 + FMUL alpha2i, alpha_i, a3 + + FMSUBR alpha1r, alpha_i, a2, alpha1r + FMADDR alpha1i, alpha_r, a2, alpha1i + FMSUBR alpha2r, alpha_i, a4, alpha2r + FMADDR alpha2i, alpha_r, a4, alpha2i + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + mr Y1, Y + mr Y2, Y + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(25) + .align 4 + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + LFD y09, 8 * SIZE(Y1) + LFD y10, 9 * SIZE(Y1) + LFD y11, 10 * SIZE(Y1) + LFD y12, 11 * SIZE(Y1) + LFD y13, 12 * SIZE(Y1) + LFD y14, 13 * SIZE(Y1) + LFD y15, 14 * SIZE(Y1) + LFD y16, 15 * SIZE(Y1) + addi Y1, Y1, 16 * SIZE + bdz LL(23) + .align 4 + +LL(22): + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 8 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a5, 12 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 9 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + addi AO1, AO1, 16 * SIZE + nop + DCBT(AO1, PREA) + nop + + FMADD y09, alpha1r, a1, y09 + FMADD y10, alpha1i, a1, y10 + FMADD y11, alpha1r, a3, y11 + FMADD y12, alpha1i, a3, y12 + + FMADD y13, alpha1r, a5, y13 + FMADD y14, alpha1i, a5, y14 + FMADD y15, alpha1r, a7, y15 + FMADD y16, alpha1i, a7, y16 + + LFD a1, 0 * SIZE(AO2) + LFD a3, 2 * SIZE(AO2) + LFD a5, 4 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + + FMSUBX y09, alpha1i, a2, y09 + FMADDX y10, alpha1r, a2, y10 + FMSUBX y11, alpha1i, a4, y11 + FMADDX y12, alpha1r, a4, y12 + + FMSUBX y13, alpha1i, a6, y13 + FMADDX y14, alpha1r, a6, y14 + FMSUBX y15, alpha1i, a8, y15 + FMADDX y16, alpha1r, a8, y16 + + LFD a2, 1 * SIZE(AO2) + LFD a4, 3 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2r, a1, y01 + FMADD y02, alpha2i, a1, y02 + FMADD y03, alpha2r, a3, y03 + FMADD y04, alpha2i, a3, y04 + + FMADD y05, alpha2r, a5, y05 + FMADD y06, alpha2i, a5, y06 + FMADD y07, alpha2r, a7, y07 + FMADD y08, alpha2i, a7, y08 + + LFD a1, 8 * SIZE(AO2) + LFD a3, 10 * SIZE(AO2) + LFD a5, 12 * SIZE(AO2) + LFD a7, 14 * SIZE(AO2) + + FMSUBX y01, alpha2i, a2, y01 + FMADDX y02, alpha2r, a2, y02 + FMSUBX y03, alpha2i, a4, y03 + FMADDX y04, alpha2r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + STFD y03, 2 * SIZE(Y2) + STFD y04, 3 * SIZE(Y2) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + FMSUBX y05, alpha2i, a6, y05 + FMADDX y06, alpha2r, a6, y06 + FMSUBX y07, alpha2i, a8, y07 + FMADDX y08, alpha2r, a8, y08 + + LFD a2, 9 * SIZE(AO2) + LFD a4, 11 * SIZE(AO2) + LFD a6, 13 * SIZE(AO2) + LFD a8, 15 * SIZE(AO2) + + STFD y05, 4 * SIZE(Y2) + STFD y06, 5 * SIZE(Y2) + STFD y07, 6 * SIZE(Y2) + STFD y08, 7 * SIZE(Y2) + + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + + addi AO2, AO2, 16 * SIZE + nop + DCBT(AO2, PREA) + nop + + FMADD y09, alpha2r, a1, y09 + FMADD y10, alpha2i, a1, y10 + FMADD y11, alpha2r, a3, y11 + FMADD y12, alpha2i, a3, y12 + + FMADD y13, alpha2r, a5, y13 + FMADD y14, alpha2i, a5, y14 + FMADD y15, alpha2r, a7, y15 + FMADD y16, alpha2i, a7, y16 + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + + FMSUBX y09, alpha2i, a2, y09 + FMADDX y10, alpha2r, a2, y10 + FMSUBX y11, alpha2i, a4, y11 + FMADDX y12, alpha2r, a4, y12 + + STFD y09, 8 * SIZE(Y2) + STFD y10, 9 * SIZE(Y2) + STFD y11, 10 * SIZE(Y2) + STFD y12, 11 * SIZE(Y2) + + LFD y09, 8 * SIZE(Y1) + LFD y10, 9 * SIZE(Y1) + LFD y11, 10 * SIZE(Y1) + LFD y12, 11 * SIZE(Y1) + + FMSUBX y13, alpha2i, a6, y13 + FMADDX y14, alpha2r, a6, y14 + FMSUBX y15, alpha2i, a8, y15 + FMADDX y16, alpha2r, a8, y16 + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + STFD y13, 12 * SIZE(Y2) + STFD y14, 13 * SIZE(Y2) + STFD y15, 14 * SIZE(Y2) + STFD y16, 15 * SIZE(Y2) + + LFD y13, 12 * SIZE(Y1) + LFD y14, 13 * SIZE(Y1) + LFD y15, 14 * SIZE(Y1) + LFD y16, 15 * SIZE(Y1) + + addi Y2, Y2, 16 * SIZE + addi Y1, Y1, 16 * SIZE + DCBT(Y1, PREC) + bdnz LL(22) + .align 4 + +LL(23): + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 8 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a5, 12 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 9 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + FMADD y09, alpha1r, a1, y09 + FMADD y10, alpha1i, a1, y10 + FMADD y11, alpha1r, a3, y11 + FMADD y12, alpha1i, a3, y12 + + FMADD y13, alpha1r, a5, y13 + FMADD y14, alpha1i, a5, y14 + FMADD y15, alpha1r, a7, y15 + FMADD y16, alpha1i, a7, y16 + + LFD a1, 0 * SIZE(AO2) + LFD a3, 2 * SIZE(AO2) + LFD a5, 4 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + + FMSUBX y09, alpha1i, a2, y09 + FMADDX y10, alpha1r, a2, y10 + FMSUBX y11, alpha1i, a4, y11 + FMADDX y12, alpha1r, a4, y12 + + FMSUBX y13, alpha1i, a6, y13 + FMADDX y14, alpha1r, a6, y14 + FMSUBX y15, alpha1i, a8, y15 + FMADDX y16, alpha1r, a8, y16 + + LFD a2, 1 * SIZE(AO2) + LFD a4, 3 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2r, a1, y01 + FMADD y02, alpha2i, a1, y02 + FMADD y03, alpha2r, a3, y03 + FMADD y04, alpha2i, a3, y04 + + FMADD y05, alpha2r, a5, y05 + FMADD y06, alpha2i, a5, y06 + FMADD y07, alpha2r, a7, y07 + FMADD y08, alpha2i, a7, y08 + + LFD a1, 8 * SIZE(AO2) + LFD a3, 10 * SIZE(AO2) + LFD a5, 12 * SIZE(AO2) + LFD a7, 14 * SIZE(AO2) + + FMSUBX y01, alpha2i, a2, y01 + FMADDX y02, alpha2r, a2, y02 + FMSUBX y03, alpha2i, a4, y03 + FMADDX y04, alpha2r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + STFD y03, 2 * SIZE(Y2) + STFD y04, 3 * SIZE(Y2) + + FMSUBX y05, alpha2i, a6, y05 + FMADDX y06, alpha2r, a6, y06 + FMSUBX y07, alpha2i, a8, y07 + FMADDX y08, alpha2r, a8, y08 + + LFD a2, 9 * SIZE(AO2) + LFD a4, 11 * SIZE(AO2) + LFD a6, 13 * SIZE(AO2) + LFD a8, 15 * SIZE(AO2) + + STFD y05, 4 * SIZE(Y2) + STFD y06, 5 * SIZE(Y2) + STFD y07, 6 * SIZE(Y2) + STFD y08, 7 * SIZE(Y2) + + FMADD y09, alpha2r, a1, y09 + FMADD y10, alpha2i, a1, y10 + FMADD y11, alpha2r, a3, y11 + FMADD y12, alpha2i, a3, y12 + + FMADD y13, alpha2r, a5, y13 + FMADD y14, alpha2i, a5, y14 + FMADD y15, alpha2r, a7, y15 + FMADD y16, alpha2i, a7, y16 + + FMSUBX y09, alpha2i, a2, y09 + FMADDX y10, alpha2r, a2, y10 + FMSUBX y11, alpha2i, a4, y11 + FMADDX y12, alpha2r, a4, y12 + + FMSUBX y13, alpha2i, a6, y13 + FMADDX y14, alpha2r, a6, y14 + FMSUBX y15, alpha2i, a8, y15 + FMADDX y16, alpha2r, a8, y16 + + STFD y09, 8 * SIZE(Y2) + STFD y10, 9 * SIZE(Y2) + STFD y11, 10 * SIZE(Y2) + STFD y12, 11 * SIZE(Y2) + + STFD y13, 12 * SIZE(Y2) + STFD y14, 13 * SIZE(Y2) + STFD y15, 14 * SIZE(Y2) + STFD y16, 15 * SIZE(Y2) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi Y2, Y2, 16 * SIZE + .align 4 + +LL(25): + andi. r0, M, 7 + ble LL(30) + andi. r0, M, 4 + ble LL(26) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 0 * SIZE(AO2) + LFD a3, 2 * SIZE(AO2) + LFD a5, 4 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 1 * SIZE(AO2) + LFD a4, 3 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2r, a1, y01 + FMADD y02, alpha2i, a1, y02 + FMADD y03, alpha2r, a3, y03 + FMADD y04, alpha2i, a3, y04 + + FMADD y05, alpha2r, a5, y05 + FMADD y06, alpha2i, a5, y06 + FMADD y07, alpha2r, a7, y07 + FMADD y08, alpha2i, a7, y08 + + FMSUBX y01, alpha2i, a2, y01 + FMADDX y02, alpha2r, a2, y02 + FMSUBX y03, alpha2i, a4, y03 + FMADDX y04, alpha2r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + STFD y03, 2 * SIZE(Y2) + STFD y04, 3 * SIZE(Y2) + + FMSUBX y05, alpha2i, a6, y05 + FMADDX y06, alpha2r, a6, y06 + FMSUBX y07, alpha2i, a8, y07 + FMADDX y08, alpha2r, a8, y08 + + STFD y05, 4 * SIZE(Y2) + STFD y06, 5 * SIZE(Y2) + STFD y07, 6 * SIZE(Y2) + STFD y08, 7 * SIZE(Y2) + + addi AO1, AO1, 8 * SIZE + addi AO2, AO2, 8 * SIZE + addi Y1, Y1, 8 * SIZE + addi Y2, Y2, 8 * SIZE + .align 4 + +LL(26): + andi. r0, M, 2 + ble LL(27) + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 0 * SIZE(AO2) + LFD a7, 2 * SIZE(AO2) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 1 * SIZE(AO2) + LFD a8, 3 * SIZE(AO2) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMADD y01, alpha2r, a5, y01 + FMADD y02, alpha2i, a5, y02 + FMADD y03, alpha2r, a7, y03 + FMADD y04, alpha2i, a7, y04 + + FMSUBX y01, alpha2i, a6, y01 + FMADDX y02, alpha2r, a6, y02 + FMSUBX y03, alpha2i, a8, y03 + FMADDX y04, alpha2r, a8, y04 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + STFD y03, 2 * SIZE(Y2) + STFD y04, 3 * SIZE(Y2) + + addi AO1, AO1, 4 * SIZE + addi AO2, AO2, 4 * SIZE + addi Y1, Y1, 4 * SIZE + addi Y2, Y2, 4 * SIZE + .align 4 + +LL(27): + andi. r0, M, 1 + ble LL(30) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 0 * SIZE(AO2) + LFD a4, 1 * SIZE(AO2) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + + FMADD y01, alpha2r, a3, y01 + FMADD y02, alpha2i, a3, y02 + FMSUBX y01, alpha2i, a4, y01 + FMADDX y02, alpha2r, a4, y02 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + add Y1, Y1, INCY + add Y2, Y2, INCY + .align 4 + +LL(30): + andi. J, N, 1 + ble LL(999) + .align 4 + +LL(31): + lfd alpha_r, ALPHA_R + lfd alpha_i, ALPHA_I + + LFD a1, 0 * SIZE(X) + LFD a2, 1 * SIZE(X) + add X, X, INCX + + FMUL alpha1r, alpha_r, a1 + FMUL alpha1i, alpha_i, a1 + + FMSUBR alpha1r, alpha_i, a2, alpha1r + FMADDR alpha1i, alpha_r, a2, alpha1i + + mr AO1, A + add A, AO1, LDA + + mr Y1, Y + mr Y2, Y + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(35) + .align 4 + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + LFD y09, 8 * SIZE(Y1) + LFD y10, 9 * SIZE(Y1) + LFD y11, 10 * SIZE(Y1) + LFD y12, 11 * SIZE(Y1) + LFD y13, 12 * SIZE(Y1) + LFD y14, 13 * SIZE(Y1) + LFD y15, 14 * SIZE(Y1) + LFD y16, 15 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + addi Y1, Y1, 16 * SIZE + bdz LL(33) + .align 4 + +LL(32): + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 8 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a5, 12 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + STFD y03, 2 * SIZE(Y2) + STFD y04, 3 * SIZE(Y2) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 9 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + addi AO1, AO1, 16 * SIZE + nop + DCBT(AO1, PREA) + nop + + STFD y05, 4 * SIZE(Y2) + STFD y06, 5 * SIZE(Y2) + STFD y07, 6 * SIZE(Y2) + STFD y08, 7 * SIZE(Y2) + + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + + FMADD y09, alpha1r, a1, y09 + FMADD y10, alpha1i, a1, y10 + FMADD y11, alpha1r, a3, y11 + FMADD y12, alpha1i, a3, y12 + + FMADD y13, alpha1r, a5, y13 + FMADD y14, alpha1i, a5, y14 + FMADD y15, alpha1r, a7, y15 + FMADD y16, alpha1i, a7, y16 + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + + FMSUBX y09, alpha1i, a2, y09 + FMADDX y10, alpha1r, a2, y10 + FMSUBX y11, alpha1i, a4, y11 + FMADDX y12, alpha1r, a4, y12 + + STFD y09, 8 * SIZE(Y2) + STFD y10, 9 * SIZE(Y2) + STFD y11, 10 * SIZE(Y2) + STFD y12, 11 * SIZE(Y2) + + LFD y09, 8 * SIZE(Y1) + LFD y10, 9 * SIZE(Y1) + LFD y11, 10 * SIZE(Y1) + LFD y12, 11 * SIZE(Y1) + + FMSUBX y13, alpha1i, a6, y13 + FMADDX y14, alpha1r, a6, y14 + FMSUBX y15, alpha1i, a8, y15 + FMADDX y16, alpha1r, a8, y16 + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + STFD y13, 12 * SIZE(Y2) + STFD y14, 13 * SIZE(Y2) + STFD y15, 14 * SIZE(Y2) + STFD y16, 15 * SIZE(Y2) + + LFD y13, 12 * SIZE(Y1) + LFD y14, 13 * SIZE(Y1) + LFD y15, 14 * SIZE(Y1) + LFD y16, 15 * SIZE(Y1) + + addi Y1, Y1, 16 * SIZE + addi Y2, Y2, 16 * SIZE + DCBT(Y1, PREC) + bdnz LL(32) + .align 4 + +LL(33): + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 8 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a5, 12 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + STFD y03, 2 * SIZE(Y2) + STFD y04, 3 * SIZE(Y2) + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 9 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + STFD y05, 4 * SIZE(Y2) + STFD y06, 5 * SIZE(Y2) + STFD y07, 6 * SIZE(Y2) + STFD y08, 7 * SIZE(Y2) + + FMADD y09, alpha1r, a1, y09 + FMADD y10, alpha1i, a1, y10 + FMADD y11, alpha1r, a3, y11 + FMADD y12, alpha1i, a3, y12 + + FMADD y13, alpha1r, a5, y13 + FMADD y14, alpha1i, a5, y14 + FMADD y15, alpha1r, a7, y15 + FMADD y16, alpha1i, a7, y16 + + FMSUBX y09, alpha1i, a2, y09 + FMADDX y10, alpha1r, a2, y10 + FMSUBX y11, alpha1i, a4, y11 + FMADDX y12, alpha1r, a4, y12 + + STFD y09, 8 * SIZE(Y2) + STFD y10, 9 * SIZE(Y2) + STFD y11, 10 * SIZE(Y2) + STFD y12, 11 * SIZE(Y2) + + FMSUBX y13, alpha1i, a6, y13 + FMADDX y14, alpha1r, a6, y14 + FMSUBX y15, alpha1i, a8, y15 + FMADDX y16, alpha1r, a8, y16 + + STFD y13, 12 * SIZE(Y2) + STFD y14, 13 * SIZE(Y2) + STFD y15, 14 * SIZE(Y2) + STFD y16, 15 * SIZE(Y2) + + addi AO1, AO1, 16 * SIZE + addi Y2, Y2, 16 * SIZE + .align 4 + +LL(35): + andi. r0, M, 7 + ble LL(999) + andi. r0, M, 4 + ble LL(36) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + STFD y03, 2 * SIZE(Y2) + STFD y04, 3 * SIZE(Y2) + STFD y05, 4 * SIZE(Y2) + STFD y06, 5 * SIZE(Y2) + STFD y07, 6 * SIZE(Y2) + STFD y08, 7 * SIZE(Y2) + + addi AO1, AO1, 8 * SIZE + addi Y1, Y1, 8 * SIZE + addi Y2, Y2, 8 * SIZE + .align 4 + +LL(36): + andi. r0, M, 2 + ble LL(37) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + STFD y03, 2 * SIZE(Y2) + STFD y04, 3 * SIZE(Y2) + + addi AO1, AO1, 4 * SIZE + addi Y1, Y1, 4 * SIZE + addi Y2, Y2, 4 * SIZE + .align 4 + +LL(37): + andi. r0, M, 1 + ble LL(999) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + add Y1, Y1, INCY + add Y2, Y2, INCY + b LL(999) + .align 4 + +LL(100): + srawi. J, N, 2 + ble LL(120) + .align 4 + +LL(111): + lfd alpha_r, ALPHA_R + lfd alpha_i, ALPHA_I + + LFD a1, 0 * SIZE(X) + LFD a2, 1 * SIZE(X) + add X, X, INCX + LFD a3, 0 * SIZE(X) + LFD a4, 1 * SIZE(X) + add X, X, INCX + LFD a5, 0 * SIZE(X) + LFD a6, 1 * SIZE(X) + add X, X, INCX + LFD a7, 0 * SIZE(X) + LFD a8, 1 * SIZE(X) + add X, X, INCX + + FMUL alpha1r, alpha_r, a1 + FMUL alpha1i, alpha_i, a1 + FMUL alpha2r, alpha_r, a3 + FMUL alpha2i, alpha_i, a3 + FMUL alpha3r, alpha_r, a5 + FMUL alpha3i, alpha_i, a5 + FMUL alpha4r, alpha_r, a7 + FMUL alpha4i, alpha_i, a7 + + FMSUBR alpha1r, alpha_i, a2, alpha1r + FMADDR alpha1i, alpha_r, a2, alpha1i + FMSUBR alpha2r, alpha_i, a4, alpha2r + FMADDR alpha2i, alpha_r, a4, alpha2i + FMSUBR alpha3r, alpha_i, a6, alpha3r + FMADDR alpha3i, alpha_r, a6, alpha3i + FMSUBR alpha4r, alpha_i, a8, alpha4r + FMADDR alpha4i, alpha_r, a8, alpha4i + + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + mr Y1, Y + mr Y2, Y + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(115) + .align 4 + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y03, 0 * SIZE(Y1) + LFD y04, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y05, 0 * SIZE(Y1) + LFD y06, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y07, 0 * SIZE(Y1) + LFD y08, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y09, 0 * SIZE(Y1) + LFD y10, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y11, 0 * SIZE(Y1) + LFD y12, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y13, 0 * SIZE(Y1) + LFD y14, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y15, 0 * SIZE(Y1) + LFD y16, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + bdz LL(113) + .align 4 + +LL(112): + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 8 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a5, 12 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 9 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + addi AO1, AO1, 16 * SIZE + nop + DCBT(AO1, PREA) + nop + + FMADD y09, alpha1r, a1, y09 + FMADD y10, alpha1i, a1, y10 + FMADD y11, alpha1r, a3, y11 + FMADD y12, alpha1i, a3, y12 + + FMADD y13, alpha1r, a5, y13 + FMADD y14, alpha1i, a5, y14 + FMADD y15, alpha1r, a7, y15 + FMADD y16, alpha1i, a7, y16 + + LFD a1, 0 * SIZE(AO2) + LFD a3, 2 * SIZE(AO2) + LFD a5, 4 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + + FMSUBX y09, alpha1i, a2, y09 + FMADDX y10, alpha1r, a2, y10 + FMSUBX y11, alpha1i, a4, y11 + FMADDX y12, alpha1r, a4, y12 + + FMSUBX y13, alpha1i, a6, y13 + FMADDX y14, alpha1r, a6, y14 + FMSUBX y15, alpha1i, a8, y15 + FMADDX y16, alpha1r, a8, y16 + + LFD a2, 1 * SIZE(AO2) + LFD a4, 3 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2r, a1, y01 + FMADD y02, alpha2i, a1, y02 + FMADD y03, alpha2r, a3, y03 + FMADD y04, alpha2i, a3, y04 + + FMADD y05, alpha2r, a5, y05 + FMADD y06, alpha2i, a5, y06 + FMADD y07, alpha2r, a7, y07 + FMADD y08, alpha2i, a7, y08 + + LFD a1, 8 * SIZE(AO2) + LFD a3, 10 * SIZE(AO2) + LFD a5, 12 * SIZE(AO2) + LFD a7, 14 * SIZE(AO2) + + FMSUBX y01, alpha2i, a2, y01 + FMADDX y02, alpha2r, a2, y02 + FMSUBX y03, alpha2i, a4, y03 + FMADDX y04, alpha2r, a4, y04 + + FMSUBX y05, alpha2i, a6, y05 + FMADDX y06, alpha2r, a6, y06 + FMSUBX y07, alpha2i, a8, y07 + FMADDX y08, alpha2r, a8, y08 + + LFD a2, 9 * SIZE(AO2) + LFD a4, 11 * SIZE(AO2) + LFD a6, 13 * SIZE(AO2) + LFD a8, 15 * SIZE(AO2) + + addi AO2, AO2, 16 * SIZE + nop + DCBT(AO2, PREA) + nop + + FMADD y09, alpha2r, a1, y09 + FMADD y10, alpha2i, a1, y10 + FMADD y11, alpha2r, a3, y11 + FMADD y12, alpha2i, a3, y12 + + FMADD y13, alpha2r, a5, y13 + FMADD y14, alpha2i, a5, y14 + FMADD y15, alpha2r, a7, y15 + FMADD y16, alpha2i, a7, y16 + + LFD a1, 0 * SIZE(AO3) + LFD a3, 2 * SIZE(AO3) + LFD a5, 4 * SIZE(AO3) + LFD a7, 6 * SIZE(AO3) + + FMSUBX y09, alpha2i, a2, y09 + FMADDX y10, alpha2r, a2, y10 + FMSUBX y11, alpha2i, a4, y11 + FMADDX y12, alpha2r, a4, y12 + + FMSUBX y13, alpha2i, a6, y13 + FMADDX y14, alpha2r, a6, y14 + FMSUBX y15, alpha2i, a8, y15 + FMADDX y16, alpha2r, a8, y16 + + LFD a2, 1 * SIZE(AO3) + LFD a4, 3 * SIZE(AO3) + LFD a6, 5 * SIZE(AO3) + LFD a8, 7 * SIZE(AO3) + + FMADD y01, alpha3r, a1, y01 + FMADD y02, alpha3i, a1, y02 + FMADD y03, alpha3r, a3, y03 + FMADD y04, alpha3i, a3, y04 + + FMADD y05, alpha3r, a5, y05 + FMADD y06, alpha3i, a5, y06 + FMADD y07, alpha3r, a7, y07 + FMADD y08, alpha3i, a7, y08 + + LFD a1, 8 * SIZE(AO3) + LFD a3, 10 * SIZE(AO3) + LFD a5, 12 * SIZE(AO3) + LFD a7, 14 * SIZE(AO3) + + FMSUBX y01, alpha3i, a2, y01 + FMADDX y02, alpha3r, a2, y02 + FMSUBX y03, alpha3i, a4, y03 + FMADDX y04, alpha3r, a4, y04 + + FMSUBX y05, alpha3i, a6, y05 + FMADDX y06, alpha3r, a6, y06 + FMSUBX y07, alpha3i, a8, y07 + FMADDX y08, alpha3r, a8, y08 + + LFD a2, 9 * SIZE(AO3) + LFD a4, 11 * SIZE(AO3) + LFD a6, 13 * SIZE(AO3) + LFD a8, 15 * SIZE(AO3) + + addi AO3, AO3, 16 * SIZE + nop + DCBT(AO3, PREA) + nop + + FMADD y09, alpha3r, a1, y09 + FMADD y10, alpha3i, a1, y10 + FMADD y11, alpha3r, a3, y11 + FMADD y12, alpha3i, a3, y12 + + FMADD y13, alpha3r, a5, y13 + FMADD y14, alpha3i, a5, y14 + FMADD y15, alpha3r, a7, y15 + FMADD y16, alpha3i, a7, y16 + + LFD a1, 0 * SIZE(AO4) + LFD a3, 2 * SIZE(AO4) + LFD a5, 4 * SIZE(AO4) + LFD a7, 6 * SIZE(AO4) + + FMSUBX y09, alpha3i, a2, y09 + FMADDX y10, alpha3r, a2, y10 + FMSUBX y11, alpha3i, a4, y11 + FMADDX y12, alpha3r, a4, y12 + + FMSUBX y13, alpha3i, a6, y13 + FMADDX y14, alpha3r, a6, y14 + FMSUBX y15, alpha3i, a8, y15 + FMADDX y16, alpha3r, a8, y16 + + LFD a2, 1 * SIZE(AO4) + LFD a4, 3 * SIZE(AO4) + LFD a6, 5 * SIZE(AO4) + LFD a8, 7 * SIZE(AO4) + + FMADD y01, alpha4r, a1, y01 + FMADD y02, alpha4i, a1, y02 + FMADD y03, alpha4r, a3, y03 + FMADD y04, alpha4i, a3, y04 + + FMADD y05, alpha4r, a5, y05 + FMADD y06, alpha4i, a5, y06 + FMADD y07, alpha4r, a7, y07 + FMADD y08, alpha4i, a7, y08 + + LFD a1, 8 * SIZE(AO4) + LFD a3, 10 * SIZE(AO4) + LFD a5, 12 * SIZE(AO4) + LFD a7, 14 * SIZE(AO4) + + FMSUBX y01, alpha4i, a2, y01 + FMADDX y02, alpha4r, a2, y02 + FMSUBX y03, alpha4i, a4, y03 + FMADDX y04, alpha4r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + nop + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y01, 0 * SIZE(Y1) + nop + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + + STFD y03, 0 * SIZE(Y2) + nop + STFD y04, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y03, 0 * SIZE(Y1) + nop + LFD y04, 1 * SIZE(Y1) + add Y1, Y1, INCY + + FMSUBX y05, alpha4i, a6, y05 + FMADDX y06, alpha4r, a6, y06 + FMSUBX y07, alpha4i, a8, y07 + FMADDX y08, alpha4r, a8, y08 + + LFD a2, 9 * SIZE(AO4) + LFD a4, 11 * SIZE(AO4) + LFD a6, 13 * SIZE(AO4) + LFD a8, 15 * SIZE(AO4) + + addi AO4, AO4, 16 * SIZE + nop + DCBT(AO4, PREA) + nop + + STFD y05, 0 * SIZE(Y2) + nop + STFD y06, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y05, 0 * SIZE(Y1) + nop + LFD y06, 1 * SIZE(Y1) + add Y1, Y1, INCY + + STFD y07, 0 * SIZE(Y2) + nop + STFD y08, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y07, 0 * SIZE(Y1) + nop + LFD y08, 1 * SIZE(Y1) + add Y1, Y1, INCY + + FMADD y09, alpha4r, a1, y09 + FMADD y10, alpha4i, a1, y10 + FMADD y11, alpha4r, a3, y11 + FMADD y12, alpha4i, a3, y12 + + FMADD y13, alpha4r, a5, y13 + FMADD y14, alpha4i, a5, y14 + FMADD y15, alpha4r, a7, y15 + FMADD y16, alpha4i, a7, y16 + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + + FMSUBX y09, alpha4i, a2, y09 + FMADDX y10, alpha4r, a2, y10 + FMSUBX y11, alpha4i, a4, y11 + FMADDX y12, alpha4r, a4, y12 + + STFD y09, 0 * SIZE(Y2) + nop + STFD y10, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y09, 0 * SIZE(Y1) + nop + LFD y10, 1 * SIZE(Y1) + add Y1, Y1, INCY + + STFD y11, 0 * SIZE(Y2) + nop + STFD y12, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y11, 0 * SIZE(Y1) + nop + LFD y12, 1 * SIZE(Y1) + add Y1, Y1, INCY + + FMSUBX y13, alpha4i, a6, y13 + FMADDX y14, alpha4r, a6, y14 + FMSUBX y15, alpha4i, a8, y15 + FMADDX y16, alpha4r, a8, y16 + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + STFD y13, 0 * SIZE(Y2) + nop + STFD y14, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y13, 0 * SIZE(Y1) + nop + LFD y14, 1 * SIZE(Y1) + add Y1, Y1, INCY + + STFD y15, 0 * SIZE(Y2) + nop + STFD y16, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y15, 0 * SIZE(Y1) + nop + LFD y16, 1 * SIZE(Y1) + add Y1, Y1, INCY + + DCBT(Y1, PREC) + bdnz LL(112) + .align 4 + +LL(113): + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 8 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a5, 12 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 9 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + FMADD y09, alpha1r, a1, y09 + FMADD y10, alpha1i, a1, y10 + FMADD y11, alpha1r, a3, y11 + FMADD y12, alpha1i, a3, y12 + + FMADD y13, alpha1r, a5, y13 + FMADD y14, alpha1i, a5, y14 + FMADD y15, alpha1r, a7, y15 + FMADD y16, alpha1i, a7, y16 + + LFD a1, 0 * SIZE(AO2) + LFD a3, 2 * SIZE(AO2) + LFD a5, 4 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + + FMSUBX y09, alpha1i, a2, y09 + FMADDX y10, alpha1r, a2, y10 + FMSUBX y11, alpha1i, a4, y11 + FMADDX y12, alpha1r, a4, y12 + + FMSUBX y13, alpha1i, a6, y13 + FMADDX y14, alpha1r, a6, y14 + FMSUBX y15, alpha1i, a8, y15 + FMADDX y16, alpha1r, a8, y16 + + LFD a2, 1 * SIZE(AO2) + LFD a4, 3 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2r, a1, y01 + FMADD y02, alpha2i, a1, y02 + FMADD y03, alpha2r, a3, y03 + FMADD y04, alpha2i, a3, y04 + + FMADD y05, alpha2r, a5, y05 + FMADD y06, alpha2i, a5, y06 + FMADD y07, alpha2r, a7, y07 + FMADD y08, alpha2i, a7, y08 + + LFD a1, 8 * SIZE(AO2) + LFD a3, 10 * SIZE(AO2) + LFD a5, 12 * SIZE(AO2) + LFD a7, 14 * SIZE(AO2) + + FMSUBX y01, alpha2i, a2, y01 + FMADDX y02, alpha2r, a2, y02 + FMSUBX y03, alpha2i, a4, y03 + FMADDX y04, alpha2r, a4, y04 + + FMSUBX y05, alpha2i, a6, y05 + FMADDX y06, alpha2r, a6, y06 + FMSUBX y07, alpha2i, a8, y07 + FMADDX y08, alpha2r, a8, y08 + + LFD a2, 9 * SIZE(AO2) + LFD a4, 11 * SIZE(AO2) + LFD a6, 13 * SIZE(AO2) + LFD a8, 15 * SIZE(AO2) + + FMADD y09, alpha2r, a1, y09 + FMADD y10, alpha2i, a1, y10 + FMADD y11, alpha2r, a3, y11 + FMADD y12, alpha2i, a3, y12 + + FMADD y13, alpha2r, a5, y13 + FMADD y14, alpha2i, a5, y14 + FMADD y15, alpha2r, a7, y15 + FMADD y16, alpha2i, a7, y16 + + LFD a1, 0 * SIZE(AO3) + LFD a3, 2 * SIZE(AO3) + LFD a5, 4 * SIZE(AO3) + LFD a7, 6 * SIZE(AO3) + + FMSUBX y09, alpha2i, a2, y09 + FMADDX y10, alpha2r, a2, y10 + FMSUBX y11, alpha2i, a4, y11 + FMADDX y12, alpha2r, a4, y12 + + FMSUBX y13, alpha2i, a6, y13 + FMADDX y14, alpha2r, a6, y14 + FMSUBX y15, alpha2i, a8, y15 + FMADDX y16, alpha2r, a8, y16 + + LFD a2, 1 * SIZE(AO3) + LFD a4, 3 * SIZE(AO3) + LFD a6, 5 * SIZE(AO3) + LFD a8, 7 * SIZE(AO3) + + FMADD y01, alpha3r, a1, y01 + FMADD y02, alpha3i, a1, y02 + FMADD y03, alpha3r, a3, y03 + FMADD y04, alpha3i, a3, y04 + + FMADD y05, alpha3r, a5, y05 + FMADD y06, alpha3i, a5, y06 + FMADD y07, alpha3r, a7, y07 + FMADD y08, alpha3i, a7, y08 + + LFD a1, 8 * SIZE(AO3) + LFD a3, 10 * SIZE(AO3) + LFD a5, 12 * SIZE(AO3) + LFD a7, 14 * SIZE(AO3) + + FMSUBX y01, alpha3i, a2, y01 + FMADDX y02, alpha3r, a2, y02 + FMSUBX y03, alpha3i, a4, y03 + FMADDX y04, alpha3r, a4, y04 + + FMSUBX y05, alpha3i, a6, y05 + FMADDX y06, alpha3r, a6, y06 + FMSUBX y07, alpha3i, a8, y07 + FMADDX y08, alpha3r, a8, y08 + + LFD a2, 9 * SIZE(AO3) + LFD a4, 11 * SIZE(AO3) + LFD a6, 13 * SIZE(AO3) + LFD a8, 15 * SIZE(AO3) + + FMADD y09, alpha3r, a1, y09 + FMADD y10, alpha3i, a1, y10 + FMADD y11, alpha3r, a3, y11 + FMADD y12, alpha3i, a3, y12 + + FMADD y13, alpha3r, a5, y13 + FMADD y14, alpha3i, a5, y14 + FMADD y15, alpha3r, a7, y15 + FMADD y16, alpha3i, a7, y16 + + LFD a1, 0 * SIZE(AO4) + LFD a3, 2 * SIZE(AO4) + LFD a5, 4 * SIZE(AO4) + LFD a7, 6 * SIZE(AO4) + + FMSUBX y09, alpha3i, a2, y09 + FMADDX y10, alpha3r, a2, y10 + FMSUBX y11, alpha3i, a4, y11 + FMADDX y12, alpha3r, a4, y12 + + FMSUBX y13, alpha3i, a6, y13 + FMADDX y14, alpha3r, a6, y14 + FMSUBX y15, alpha3i, a8, y15 + FMADDX y16, alpha3r, a8, y16 + + LFD a2, 1 * SIZE(AO4) + LFD a4, 3 * SIZE(AO4) + LFD a6, 5 * SIZE(AO4) + LFD a8, 7 * SIZE(AO4) + + FMADD y01, alpha4r, a1, y01 + FMADD y02, alpha4i, a1, y02 + FMADD y03, alpha4r, a3, y03 + FMADD y04, alpha4i, a3, y04 + + FMADD y05, alpha4r, a5, y05 + FMADD y06, alpha4i, a5, y06 + FMADD y07, alpha4r, a7, y07 + FMADD y08, alpha4i, a7, y08 + + LFD a1, 8 * SIZE(AO4) + LFD a3, 10 * SIZE(AO4) + LFD a5, 12 * SIZE(AO4) + LFD a7, 14 * SIZE(AO4) + + FMSUBX y01, alpha4i, a2, y01 + FMADDX y02, alpha4r, a2, y02 + FMSUBX y03, alpha4i, a4, y03 + FMADDX y04, alpha4r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + nop + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y03, 0 * SIZE(Y2) + nop + STFD y04, 1 * SIZE(Y2) + add Y2, Y2, INCY + + FMSUBX y05, alpha4i, a6, y05 + FMADDX y06, alpha4r, a6, y06 + FMSUBX y07, alpha4i, a8, y07 + FMADDX y08, alpha4r, a8, y08 + + LFD a2, 9 * SIZE(AO4) + LFD a4, 11 * SIZE(AO4) + LFD a6, 13 * SIZE(AO4) + LFD a8, 15 * SIZE(AO4) + + STFD y05, 0 * SIZE(Y2) + nop + STFD y06, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y07, 0 * SIZE(Y2) + nop + STFD y08, 1 * SIZE(Y2) + add Y2, Y2, INCY + + FMADD y09, alpha4r, a1, y09 + FMADD y10, alpha4i, a1, y10 + FMADD y11, alpha4r, a3, y11 + FMADD y12, alpha4i, a3, y12 + + FMADD y13, alpha4r, a5, y13 + FMADD y14, alpha4i, a5, y14 + FMADD y15, alpha4r, a7, y15 + FMADD y16, alpha4i, a7, y16 + + FMSUBX y09, alpha4i, a2, y09 + FMADDX y10, alpha4r, a2, y10 + FMSUBX y11, alpha4i, a4, y11 + FMADDX y12, alpha4r, a4, y12 + + STFD y09, 0 * SIZE(Y2) + nop + STFD y10, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y11, 0 * SIZE(Y2) + nop + STFD y12, 1 * SIZE(Y2) + add Y2, Y2, INCY + + FMSUBX y13, alpha4i, a6, y13 + FMADDX y14, alpha4r, a6, y14 + FMSUBX y15, alpha4i, a8, y15 + FMADDX y16, alpha4r, a8, y16 + + STFD y13, 0 * SIZE(Y2) + nop + STFD y14, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y15, 0 * SIZE(Y2) + nop + STFD y16, 1 * SIZE(Y2) + add Y2, Y2, INCY + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi AO3, AO3, 16 * SIZE + addi AO4, AO4, 16 * SIZE + .align 4 + +LL(115): + andi. r0, M, 7 + ble LL(119) + andi. r0, M, 4 + ble LL(116) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y03, 0 * SIZE(Y1) + LFD y04, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + + LFD y05, 0 * SIZE(Y1) + LFD y06, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y07, 0 * SIZE(Y1) + LFD y08, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 0 * SIZE(AO2) + LFD a3, 2 * SIZE(AO2) + LFD a5, 4 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 1 * SIZE(AO2) + LFD a4, 3 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2r, a1, y01 + FMADD y02, alpha2i, a1, y02 + FMADD y03, alpha2r, a3, y03 + FMADD y04, alpha2i, a3, y04 + + FMADD y05, alpha2r, a5, y05 + FMADD y06, alpha2i, a5, y06 + FMADD y07, alpha2r, a7, y07 + FMADD y08, alpha2i, a7, y08 + + LFD a1, 0 * SIZE(AO3) + LFD a3, 2 * SIZE(AO3) + LFD a5, 4 * SIZE(AO3) + LFD a7, 6 * SIZE(AO3) + + FMSUBX y01, alpha2i, a2, y01 + FMADDX y02, alpha2r, a2, y02 + FMSUBX y03, alpha2i, a4, y03 + FMADDX y04, alpha2r, a4, y04 + + FMSUBX y05, alpha2i, a6, y05 + FMADDX y06, alpha2r, a6, y06 + FMSUBX y07, alpha2i, a8, y07 + FMADDX y08, alpha2r, a8, y08 + + LFD a2, 1 * SIZE(AO3) + LFD a4, 3 * SIZE(AO3) + LFD a6, 5 * SIZE(AO3) + LFD a8, 7 * SIZE(AO3) + + FMADD y01, alpha3r, a1, y01 + FMADD y02, alpha3i, a1, y02 + FMADD y03, alpha3r, a3, y03 + FMADD y04, alpha3i, a3, y04 + + FMADD y05, alpha3r, a5, y05 + FMADD y06, alpha3i, a5, y06 + FMADD y07, alpha3r, a7, y07 + FMADD y08, alpha3i, a7, y08 + + LFD a1, 0 * SIZE(AO4) + LFD a3, 2 * SIZE(AO4) + LFD a5, 4 * SIZE(AO4) + LFD a7, 6 * SIZE(AO4) + + FMSUBX y01, alpha3i, a2, y01 + FMADDX y02, alpha3r, a2, y02 + FMSUBX y03, alpha3i, a4, y03 + FMADDX y04, alpha3r, a4, y04 + + FMSUBX y05, alpha3i, a6, y05 + FMADDX y06, alpha3r, a6, y06 + FMSUBX y07, alpha3i, a8, y07 + FMADDX y08, alpha3r, a8, y08 + + LFD a2, 1 * SIZE(AO4) + LFD a4, 3 * SIZE(AO4) + LFD a6, 5 * SIZE(AO4) + LFD a8, 7 * SIZE(AO4) + + FMADD y01, alpha4r, a1, y01 + FMADD y02, alpha4i, a1, y02 + FMADD y03, alpha4r, a3, y03 + FMADD y04, alpha4i, a3, y04 + + FMADD y05, alpha4r, a5, y05 + FMADD y06, alpha4i, a5, y06 + FMADD y07, alpha4r, a7, y07 + FMADD y08, alpha4i, a7, y08 + + FMSUBX y01, alpha4i, a2, y01 + FMADDX y02, alpha4r, a2, y02 + FMSUBX y03, alpha4i, a4, y03 + FMADDX y04, alpha4r, a4, y04 + + FMSUBX y05, alpha4i, a6, y05 + FMADDX y06, alpha4r, a6, y06 + FMSUBX y07, alpha4i, a8, y07 + FMADDX y08, alpha4r, a8, y08 + + STFD y01, 0 * SIZE(Y2) + addi AO1, AO1, 8 * SIZE + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y03, 0 * SIZE(Y2) + addi AO2, AO2, 8 * SIZE + STFD y04, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y05, 0 * SIZE(Y2) + addi AO3, AO3, 8 * SIZE + STFD y06, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y07, 0 * SIZE(Y2) + addi AO4, AO4, 8 * SIZE + STFD y08, 1 * SIZE(Y2) + add Y2, Y2, INCY + .align 4 + +LL(116): + andi. r0, M, 2 + ble LL(117) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y03, 0 * SIZE(Y1) + LFD y04, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a5, 0 * SIZE(AO2) + LFD a6, 1 * SIZE(AO2) + LFD a7, 2 * SIZE(AO2) + LFD a8, 3 * SIZE(AO2) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + LFD a1, 0 * SIZE(AO3) + LFD a2, 1 * SIZE(AO3) + LFD a3, 2 * SIZE(AO3) + LFD a4, 3 * SIZE(AO3) + + FMADD y01, alpha2r, a5, y01 + FMADD y02, alpha2i, a5, y02 + FMADD y03, alpha2r, a7, y03 + FMADD y04, alpha2i, a7, y04 + + FMSUBX y01, alpha2i, a6, y01 + FMADDX y02, alpha2r, a6, y02 + FMSUBX y03, alpha2i, a8, y03 + FMADDX y04, alpha2r, a8, y04 + + LFD a5, 0 * SIZE(AO4) + LFD a6, 1 * SIZE(AO4) + LFD a7, 2 * SIZE(AO4) + LFD a8, 3 * SIZE(AO4) + + FMADD y01, alpha3r, a1, y01 + FMADD y02, alpha3i, a1, y02 + FMADD y03, alpha3r, a3, y03 + FMADD y04, alpha3i, a3, y04 + + FMSUBX y01, alpha3i, a2, y01 + FMADDX y02, alpha3r, a2, y02 + FMSUBX y03, alpha3i, a4, y03 + FMADDX y04, alpha3r, a4, y04 + + FMADD y01, alpha4r, a5, y01 + FMADD y02, alpha4i, a5, y02 + FMADD y03, alpha4r, a7, y03 + FMADD y04, alpha4i, a7, y04 + + FMSUBX y01, alpha4i, a6, y01 + FMADDX y02, alpha4r, a6, y02 + FMSUBX y03, alpha4i, a8, y03 + FMADDX y04, alpha4r, a8, y04 + + STFD y01, 0 * SIZE(Y2) + addi AO1, AO1, 4 * SIZE + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y03, 0 * SIZE(Y2) + addi AO2, AO2, 4 * SIZE + STFD y04, 1 * SIZE(Y2) + add Y2, Y2, INCY + + addi AO3, AO3, 4 * SIZE + addi AO4, AO4, 4 * SIZE + .align 4 + +LL(117): + andi. r0, M, 1 + ble LL(119) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 0 * SIZE(AO2) + LFD a4, 1 * SIZE(AO2) + LFD a5, 0 * SIZE(AO3) + LFD a6, 1 * SIZE(AO3) + LFD a7, 0 * SIZE(AO4) + LFD a8, 1 * SIZE(AO4) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + + FMADD y01, alpha2r, a3, y01 + FMADD y02, alpha2i, a3, y02 + FMSUBX y01, alpha2i, a4, y01 + FMADDX y02, alpha2r, a4, y02 + + FMADD y01, alpha3r, a5, y01 + FMADD y02, alpha3i, a5, y02 + FMSUBX y01, alpha3i, a6, y01 + FMADDX y02, alpha3r, a6, y02 + + FMADD y01, alpha4r, a7, y01 + FMADD y02, alpha4i, a7, y02 + FMSUBX y01, alpha4i, a8, y01 + FMADDX y02, alpha4r, a8, y02 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + .align 4 + +LL(119): + addi J, J, -1 + cmpi cr0, 0, J, 0 + bgt LL(111) + .align 4 + +LL(120): + andi. J, N, 2 + ble LL(130) + .align 4 + +LL(121): + lfd alpha_r, ALPHA_R + lfd alpha_i, ALPHA_I + + LFD a1, 0 * SIZE(X) + LFD a2, 1 * SIZE(X) + add X, X, INCX + LFD a3, 0 * SIZE(X) + LFD a4, 1 * SIZE(X) + add X, X, INCX + + FMUL alpha1r, alpha_r, a1 + FMUL alpha1i, alpha_i, a1 + FMUL alpha2r, alpha_r, a3 + FMUL alpha2i, alpha_i, a3 + + FMSUBR alpha1r, alpha_i, a2, alpha1r + FMADDR alpha1i, alpha_r, a2, alpha1i + FMSUBR alpha2r, alpha_i, a4, alpha2r + FMADDR alpha2i, alpha_r, a4, alpha2i + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + mr Y1, Y + mr Y2, Y + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(125) + .align 4 + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y03, 0 * SIZE(Y1) + LFD y04, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + + LFD y05, 0 * SIZE(Y1) + LFD y06, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y07, 0 * SIZE(Y1) + LFD y08, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + LFD y09, 0 * SIZE(Y1) + LFD y10, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y11, 0 * SIZE(Y1) + LFD y12, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y13, 0 * SIZE(Y1) + LFD y14, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y15, 0 * SIZE(Y1) + LFD y16, 1 * SIZE(Y1) + add Y1, Y1, INCY + + bdz LL(123) + .align 4 + +LL(122): + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 8 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a5, 12 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 9 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + addi AO1, AO1, 16 * SIZE + nop + DCBT(AO1, PREA) + nop + + FMADD y09, alpha1r, a1, y09 + FMADD y10, alpha1i, a1, y10 + FMADD y11, alpha1r, a3, y11 + FMADD y12, alpha1i, a3, y12 + + FMADD y13, alpha1r, a5, y13 + FMADD y14, alpha1i, a5, y14 + FMADD y15, alpha1r, a7, y15 + FMADD y16, alpha1i, a7, y16 + + LFD a1, 0 * SIZE(AO2) + LFD a3, 2 * SIZE(AO2) + LFD a5, 4 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + + FMSUBX y09, alpha1i, a2, y09 + FMADDX y10, alpha1r, a2, y10 + FMSUBX y11, alpha1i, a4, y11 + FMADDX y12, alpha1r, a4, y12 + + FMSUBX y13, alpha1i, a6, y13 + FMADDX y14, alpha1r, a6, y14 + FMSUBX y15, alpha1i, a8, y15 + FMADDX y16, alpha1r, a8, y16 + + LFD a2, 1 * SIZE(AO2) + LFD a4, 3 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2r, a1, y01 + FMADD y02, alpha2i, a1, y02 + FMADD y03, alpha2r, a3, y03 + FMADD y04, alpha2i, a3, y04 + + FMADD y05, alpha2r, a5, y05 + FMADD y06, alpha2i, a5, y06 + FMADD y07, alpha2r, a7, y07 + FMADD y08, alpha2i, a7, y08 + + LFD a1, 8 * SIZE(AO2) + LFD a3, 10 * SIZE(AO2) + LFD a5, 12 * SIZE(AO2) + LFD a7, 14 * SIZE(AO2) + + FMSUBX y01, alpha2i, a2, y01 + FMADDX y02, alpha2r, a2, y02 + FMSUBX y03, alpha2i, a4, y03 + FMADDX y04, alpha2r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + nop + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y01, 0 * SIZE(Y1) + nop + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + + STFD y03, 0 * SIZE(Y2) + nop + STFD y04, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y03, 0 * SIZE(Y1) + nop + LFD y04, 1 * SIZE(Y1) + add Y1, Y1, INCY + + FMSUBX y05, alpha2i, a6, y05 + FMADDX y06, alpha2r, a6, y06 + FMSUBX y07, alpha2i, a8, y07 + FMADDX y08, alpha2r, a8, y08 + + LFD a2, 9 * SIZE(AO2) + LFD a4, 11 * SIZE(AO2) + LFD a6, 13 * SIZE(AO2) + LFD a8, 15 * SIZE(AO2) + + addi AO2, AO2, 16 * SIZE + nop + DCBT(AO2, PREA) + nop + + STFD y05, 0 * SIZE(Y2) + nop + STFD y06, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y05, 0 * SIZE(Y1) + nop + LFD y06, 1 * SIZE(Y1) + add Y1, Y1, INCY + + STFD y07, 0 * SIZE(Y2) + nop + STFD y08, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y07, 0 * SIZE(Y1) + nop + LFD y08, 1 * SIZE(Y1) + add Y1, Y1, INCY + + FMADD y09, alpha2r, a1, y09 + FMADD y10, alpha2i, a1, y10 + FMADD y11, alpha2r, a3, y11 + FMADD y12, alpha2i, a3, y12 + + FMADD y13, alpha2r, a5, y13 + FMADD y14, alpha2i, a5, y14 + FMADD y15, alpha2r, a7, y15 + FMADD y16, alpha2i, a7, y16 + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + + FMSUBX y09, alpha2i, a2, y09 + FMADDX y10, alpha2r, a2, y10 + FMSUBX y11, alpha2i, a4, y11 + FMADDX y12, alpha2r, a4, y12 + + STFD y09, 0 * SIZE(Y2) + nop + STFD y10, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y09, 0 * SIZE(Y1) + nop + LFD y10, 1 * SIZE(Y1) + add Y1, Y1, INCY + + STFD y11, 0 * SIZE(Y2) + nop + STFD y12, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y11, 0 * SIZE(Y1) + nop + LFD y12, 1 * SIZE(Y1) + add Y1, Y1, INCY + + FMSUBX y13, alpha2i, a6, y13 + FMADDX y14, alpha2r, a6, y14 + FMSUBX y15, alpha2i, a8, y15 + FMADDX y16, alpha2r, a8, y16 + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + STFD y13, 0 * SIZE(Y2) + nop + STFD y14, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y15, 0 * SIZE(Y2) + nop + STFD y16, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y13, 0 * SIZE(Y1) + nop + LFD y14, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD y15, 0 * SIZE(Y1) + nop + LFD y16, 1 * SIZE(Y1) + add Y1, Y1, INCY + + DCBT(Y1, PREC) + bdnz LL(122) + .align 4 + +LL(123): + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 8 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a5, 12 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 9 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + FMADD y09, alpha1r, a1, y09 + FMADD y10, alpha1i, a1, y10 + FMADD y11, alpha1r, a3, y11 + FMADD y12, alpha1i, a3, y12 + + FMADD y13, alpha1r, a5, y13 + FMADD y14, alpha1i, a5, y14 + FMADD y15, alpha1r, a7, y15 + FMADD y16, alpha1i, a7, y16 + + LFD a1, 0 * SIZE(AO2) + LFD a3, 2 * SIZE(AO2) + LFD a5, 4 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + + FMSUBX y09, alpha1i, a2, y09 + FMADDX y10, alpha1r, a2, y10 + FMSUBX y11, alpha1i, a4, y11 + FMADDX y12, alpha1r, a4, y12 + + FMSUBX y13, alpha1i, a6, y13 + FMADDX y14, alpha1r, a6, y14 + FMSUBX y15, alpha1i, a8, y15 + FMADDX y16, alpha1r, a8, y16 + + LFD a2, 1 * SIZE(AO2) + LFD a4, 3 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2r, a1, y01 + FMADD y02, alpha2i, a1, y02 + FMADD y03, alpha2r, a3, y03 + FMADD y04, alpha2i, a3, y04 + + FMADD y05, alpha2r, a5, y05 + FMADD y06, alpha2i, a5, y06 + FMADD y07, alpha2r, a7, y07 + FMADD y08, alpha2i, a7, y08 + + LFD a1, 8 * SIZE(AO2) + LFD a3, 10 * SIZE(AO2) + LFD a5, 12 * SIZE(AO2) + LFD a7, 14 * SIZE(AO2) + + FMSUBX y01, alpha2i, a2, y01 + FMADDX y02, alpha2r, a2, y02 + FMSUBX y03, alpha2i, a4, y03 + FMADDX y04, alpha2r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + addi AO1, AO1, 16 * SIZE + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y03, 0 * SIZE(Y2) + nop + STFD y04, 1 * SIZE(Y2) + add Y2, Y2, INCY + + FMSUBX y05, alpha2i, a6, y05 + FMADDX y06, alpha2r, a6, y06 + FMSUBX y07, alpha2i, a8, y07 + FMADDX y08, alpha2r, a8, y08 + + LFD a2, 9 * SIZE(AO2) + LFD a4, 11 * SIZE(AO2) + LFD a6, 13 * SIZE(AO2) + LFD a8, 15 * SIZE(AO2) + + STFD y05, 0 * SIZE(Y2) + addi AO2, AO2, 16 * SIZE + STFD y06, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y07, 0 * SIZE(Y2) + nop + STFD y08, 1 * SIZE(Y2) + add Y2, Y2, INCY + + FMADD y09, alpha2r, a1, y09 + FMADD y10, alpha2i, a1, y10 + FMADD y11, alpha2r, a3, y11 + FMADD y12, alpha2i, a3, y12 + + FMADD y13, alpha2r, a5, y13 + FMADD y14, alpha2i, a5, y14 + FMADD y15, alpha2r, a7, y15 + FMADD y16, alpha2i, a7, y16 + + FMSUBX y09, alpha2i, a2, y09 + FMADDX y10, alpha2r, a2, y10 + FMSUBX y11, alpha2i, a4, y11 + FMADDX y12, alpha2r, a4, y12 + + STFD y09, 0 * SIZE(Y2) + nop + STFD y10, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y11, 0 * SIZE(Y2) + nop + STFD y12, 1 * SIZE(Y2) + add Y2, Y2, INCY + + FMSUBX y13, alpha2i, a6, y13 + FMADDX y14, alpha2r, a6, y14 + FMSUBX y15, alpha2i, a8, y15 + FMADDX y16, alpha2r, a8, y16 + + STFD y13, 0 * SIZE(Y2) + nop + STFD y14, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y15, 0 * SIZE(Y2) + nop + STFD y16, 1 * SIZE(Y2) + add Y2, Y2, INCY + .align 4 + +LL(125): + andi. r0, M, 7 + ble LL(130) + andi. r0, M, 4 + ble LL(126) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y03, 0 * SIZE(Y1) + LFD y04, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + + LFD y05, 0 * SIZE(Y1) + LFD y06, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y07, 0 * SIZE(Y1) + LFD y08, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 0 * SIZE(AO2) + LFD a3, 2 * SIZE(AO2) + LFD a5, 4 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 1 * SIZE(AO2) + LFD a4, 3 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2r, a1, y01 + FMADD y02, alpha2i, a1, y02 + FMADD y03, alpha2r, a3, y03 + FMADD y04, alpha2i, a3, y04 + + FMADD y05, alpha2r, a5, y05 + FMADD y06, alpha2i, a5, y06 + FMADD y07, alpha2r, a7, y07 + FMADD y08, alpha2i, a7, y08 + + FMSUBX y01, alpha2i, a2, y01 + FMADDX y02, alpha2r, a2, y02 + FMSUBX y03, alpha2i, a4, y03 + FMADDX y04, alpha2r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + addi AO1, AO1, 8 * SIZE + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y03, 0 * SIZE(Y2) + addi AO2, AO2, 8 * SIZE + STFD y04, 1 * SIZE(Y2) + add Y2, Y2, INCY + + FMSUBX y05, alpha2i, a6, y05 + FMADDX y06, alpha2r, a6, y06 + FMSUBX y07, alpha2i, a8, y07 + FMADDX y08, alpha2r, a8, y08 + + STFD y05, 0 * SIZE(Y2) + nop + STFD y06, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y07, 0 * SIZE(Y2) + nop + STFD y08, 1 * SIZE(Y2) + add Y2, Y2, INCY + .align 4 + +LL(126): + andi. r0, M, 2 + ble LL(127) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y03, 0 * SIZE(Y1) + LFD y04, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a5, 0 * SIZE(AO2) + LFD a6, 1 * SIZE(AO2) + LFD a7, 2 * SIZE(AO2) + LFD a8, 3 * SIZE(AO2) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMADD y01, alpha2r, a5, y01 + FMADD y02, alpha2i, a5, y02 + FMADD y03, alpha2r, a7, y03 + FMADD y04, alpha2i, a7, y04 + + FMSUBX y01, alpha2i, a6, y01 + FMADDX y02, alpha2r, a6, y02 + FMSUBX y03, alpha2i, a8, y03 + FMADDX y04, alpha2r, a8, y04 + + STFD y01, 0 * SIZE(Y2) + addi AO1, AO1, 4 * SIZE + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y03, 0 * SIZE(Y2) + addi AO2, AO2, 4 * SIZE + STFD y04, 1 * SIZE(Y2) + add Y2, Y2, INCY + .align 4 + +LL(127): + andi. r0, M, 1 + ble LL(130) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 0 * SIZE(AO2) + LFD a4, 1 * SIZE(AO2) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + + FMADD y01, alpha2r, a3, y01 + FMADD y02, alpha2i, a3, y02 + FMSUBX y01, alpha2i, a4, y01 + FMADDX y02, alpha2r, a4, y02 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + .align 4 + +LL(130): + andi. J, N, 1 + ble LL(999) + .align 4 + +LL(131): + lfd alpha_r, ALPHA_R + lfd alpha_i, ALPHA_I + + LFD a1, 0 * SIZE(X) + LFD a2, 1 * SIZE(X) + add X, X, INCX + + FMUL alpha1r, alpha_r, a1 + FMUL alpha1i, alpha_i, a1 + + FMSUBR alpha1r, alpha_i, a2, alpha1r + FMADDR alpha1i, alpha_r, a2, alpha1i + + mr AO1, A + add A, AO1, LDA + + mr Y1, Y + mr Y2, Y + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(135) + .align 4 + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y03, 0 * SIZE(Y1) + LFD y04, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + + LFD y05, 0 * SIZE(Y1) + LFD y06, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y07, 0 * SIZE(Y1) + LFD y08, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + LFD y09, 0 * SIZE(Y1) + LFD y10, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y11, 0 * SIZE(Y1) + LFD y12, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y13, 0 * SIZE(Y1) + LFD y14, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y15, 0 * SIZE(Y1) + LFD y16, 1 * SIZE(Y1) + add Y1, Y1, INCY + + bdz LL(133) + .align 4 + +LL(132): + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 8 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a5, 12 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + nop + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y01, 0 * SIZE(Y1) + nop + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + + STFD y03, 0 * SIZE(Y2) + nop + STFD y04, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y03, 0 * SIZE(Y1) + nop + LFD y04, 1 * SIZE(Y1) + add Y1, Y1, INCY + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 9 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + addi AO1, AO1, 16 * SIZE + nop + DCBT(AO1, PREA) + nop + + STFD y05, 0 * SIZE(Y2) + nop + STFD y06, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y05, 0 * SIZE(Y1) + nop + LFD y06, 1 * SIZE(Y1) + add Y1, Y1, INCY + + STFD y07, 0 * SIZE(Y2) + nop + STFD y08, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y07, 0 * SIZE(Y1) + nop + LFD y08, 1 * SIZE(Y1) + add Y1, Y1, INCY + + FMADD y09, alpha1r, a1, y09 + FMADD y10, alpha1i, a1, y10 + FMADD y11, alpha1r, a3, y11 + FMADD y12, alpha1i, a3, y12 + + FMADD y13, alpha1r, a5, y13 + FMADD y14, alpha1i, a5, y14 + FMADD y15, alpha1r, a7, y15 + FMADD y16, alpha1i, a7, y16 + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + + FMSUBX y09, alpha1i, a2, y09 + FMADDX y10, alpha1r, a2, y10 + FMSUBX y11, alpha1i, a4, y11 + FMADDX y12, alpha1r, a4, y12 + + STFD y09, 0 * SIZE(Y2) + nop + STFD y10, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y09, 0 * SIZE(Y1) + nop + LFD y10, 1 * SIZE(Y1) + add Y1, Y1, INCY + + STFD y11, 0 * SIZE(Y2) + nop + STFD y12, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y11, 0 * SIZE(Y1) + nop + LFD y12, 1 * SIZE(Y1) + add Y1, Y1, INCY + + FMSUBX y13, alpha1i, a6, y13 + FMADDX y14, alpha1r, a6, y14 + FMSUBX y15, alpha1i, a8, y15 + FMADDX y16, alpha1r, a8, y16 + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + STFD y13, 0 * SIZE(Y2) + nop + STFD y14, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y15, 0 * SIZE(Y2) + nop + STFD y16, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y13, 0 * SIZE(Y1) + nop + LFD y14, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD y15, 0 * SIZE(Y1) + nop + LFD y16, 1 * SIZE(Y1) + add Y1, Y1, INCY + + DCBT(Y1, PREC) + bdnz LL(132) + .align 4 + +LL(133): + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 8 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a5, 12 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 9 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + FMADD y09, alpha1r, a1, y09 + FMADD y10, alpha1i, a1, y10 + FMADD y11, alpha1r, a3, y11 + FMADD y12, alpha1i, a3, y12 + + FMADD y13, alpha1r, a5, y13 + FMADD y14, alpha1i, a5, y14 + FMADD y15, alpha1r, a7, y15 + FMADD y16, alpha1i, a7, y16 + + FMSUBX y09, alpha1i, a2, y09 + FMADDX y10, alpha1r, a2, y10 + FMSUBX y11, alpha1i, a4, y11 + FMADDX y12, alpha1r, a4, y12 + + FMSUBX y13, alpha1i, a6, y13 + FMADDX y14, alpha1r, a6, y14 + FMSUBX y15, alpha1i, a8, y15 + FMADDX y16, alpha1r, a8, y16 + + STFD y01, 0 * SIZE(Y2) + addi AO1, AO1, 16 * SIZE + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y03, 0 * SIZE(Y2) + nop + STFD y04, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y05, 0 * SIZE(Y2) + nop + STFD y06, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y07, 0 * SIZE(Y2) + nop + STFD y08, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y09, 0 * SIZE(Y2) + nop + STFD y10, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y11, 0 * SIZE(Y2) + nop + STFD y12, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y13, 0 * SIZE(Y2) + nop + STFD y14, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y15, 0 * SIZE(Y2) + nop + STFD y16, 1 * SIZE(Y2) + add Y2, Y2, INCY + .align 4 + +LL(135): + andi. r0, M, 7 + ble LL(999) + andi. r0, M, 4 + ble LL(136) + + LFD y01, 0 * SIZE(Y1) + nop + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD y03, 0 * SIZE(Y1) + nop + LFD y04, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD y05, 0 * SIZE(Y1) + nop + LFD y06, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD y07, 0 * SIZE(Y1) + nop + LFD y08, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + STFD y01, 0 * SIZE(Y2) + addi AO1, AO1, 8 * SIZE + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y03, 0 * SIZE(Y2) + nop + STFD y04, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y05, 0 * SIZE(Y2) + nop + STFD y06, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y07, 0 * SIZE(Y2) + nop + STFD y08, 1 * SIZE(Y2) + add Y2, Y2, INCY + .align 4 + +LL(136): + andi. r0, M, 2 + ble LL(137) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD y01, 0 * SIZE(Y1) + nop + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y03, 0 * SIZE(Y1) + nop + LFD y04, 1 * SIZE(Y1) + add Y1, Y1, INCY + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + addi AO1, AO1, 4 * SIZE + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y03, 0 * SIZE(Y2) + nop + STFD y04, 1 * SIZE(Y2) + add Y2, Y2, INCY + .align 4 + +LL(137): + andi. r0, M, 1 + ble LL(999) + + LFD y01, 0 * SIZE(Y1) + nop + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + + STFD y01, 0 * SIZE(Y2) + nop + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r14, 144(SP) + ld r15, 152(SP) + ld r16, 160(SP) + ld r17, 168(SP) + ld r18, 176(SP) + ld r19, 184(SP) + ld r20, 192(SP) + ld r21, 200(SP) + ld r22, 208(SP) +#else + lwz r14, 144(SP) + lwz r15, 148(SP) + lwz r16, 152(SP) + lwz r17, 156(SP) + lwz r18, 160(SP) + lwz r19, 164(SP) + lwz r20, 168(SP) + lwz r21, 172(SP) + lwz r22, 176(SP) +#endif + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemv_n_ppc440.S b/kernel/power/zgemv_n_ppc440.S new file mode 100644 index 0000000..690eb0d --- /dev/null +++ b/kernel/power/zgemv_n_ppc440.S @@ -0,0 +1,1386 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define M r3 +#define N r4 +#define A r6 +#define LDA r7 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#else +#define M r3 +#define N r4 +#define A r8 +#define LDA r9 +#define X r10 +#define INCX r5 +#define Y r6 +#define INCY r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define M r3 +#define N r4 +#define A r10 +#define LDA r5 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 +#else +#define M r3 +#define N r4 +#define A r8 +#define LDA r9 +#define X r10 +#define INCX r5 +#define Y r6 +#define INCY r7 +#endif +#endif + +#define I r11 +#define J r12 + +#define AO1 r14 +#define AO2 r15 +#define AO3 r16 +#define AO4 r17 + +#define Y1 r18 +#define Y2 r19 +#define PREA r20 +#define YY r21 +#define BUFFER r22 + +#define y01 f0 +#define y02 f1 +#define y03 f2 +#define y04 f3 +#define y05 f4 +#define y06 f5 +#define y07 f6 +#define y08 f7 +#define y09 f8 +#define y10 f9 +#define y11 f10 +#define y12 f11 +#define y13 f12 +#define y14 f13 +#define y15 f14 +#define y16 f15 + +#define alpha1r f16 +#define alpha1i f17 +#define alpha2r f18 +#define alpha2i f19 +#define alpha3r f20 +#define alpha3i f21 +#define alpha4r f22 +#define alpha4i f23 + +#define a1 f24 +#define a2 f25 +#define a3 f26 +#define a4 f27 +#define a5 f28 +#define a6 f29 +#define a7 f30 +#define a8 f31 + +#define alpha_r f14 +#define alpha_i f15 + +#if defined(PPCG4) +#define PREFETCHSIZE_A (3 * 4) +#endif + +#if defined(POWER6) +#define PREFETCHSIZE_A (3 * 4) +#endif + +#ifndef XCONJ +#define FMADDR FMADD +#define FMSUBR FNMSUB +#else +#define FMADDR FNMSUB +#define FMSUBR FMADD +#endif + +#ifndef CONJ +#define FMADDX FMADD +#define FMSUBX FNMSUB +#else +#define FMADDX FNMSUB +#define FMSUBX FMADD +#endif + +#ifndef NEEDPARAM + +#ifndef __64BIT__ +#define STACKSIZE 232 +#define ALPHA_R 208(SP) +#define ALPHA_I 216(SP) +#define FZERO 224(SP) +#else +#define STACKSIZE 280 +#define ALPHA_R 256(SP) +#define ALPHA_I 264(SP) +#define FZERO 272(SP) +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r0, FZERO + std r14, 144(SP) + std r15, 152(SP) + std r16, 160(SP) + std r17, 168(SP) + std r18, 176(SP) + std r19, 184(SP) + std r20, 192(SP) + std r21, 200(SP) + std r22, 208(SP) +#else + stw r0, 0 + FZERO + stw r0, 4 + FZERO + stw r14, 144(SP) + stw r15, 148(SP) + stw r16, 152(SP) + stw r17, 156(SP) + stw r18, 160(SP) + stw r19, 164(SP) + stw r20, 168(SP) + stw r21, 172(SP) + stw r22, 176(SP) +#endif + +#ifdef linux +#ifndef __64BIT__ + lwz INCY, 8 + STACKSIZE(SP) + lwz BUFFER, 12 + STACKSIZE(SP) +#else + ld INCX, 112 + STACKSIZE(SP) + ld Y, 120 + STACKSIZE(SP) + ld INCY, 128 + STACKSIZE(SP) + ld BUFFER, 136 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifndef __64BIT__ +#ifdef DOUBLE + lwz LDA, 56 + STACKSIZE(SP) + lwz X, 60 + STACKSIZE(SP) + lwz INCX, 64 + STACKSIZE(SP) + lwz Y, 68 + STACKSIZE(SP) + lwz INCY, 72 + STACKSIZE(SP) + lwz BUFFER, 76 + STACKSIZE(SP) +#else + lwz INCX, 56 + STACKSIZE(SP) + lwz Y, 60 + STACKSIZE(SP) + lwz INCY, 64 + STACKSIZE(SP) + lwz BUFFER, 68 + STACKSIZE(SP) +#endif +#else + ld INCX, 112 + STACKSIZE(SP) + ld Y, 120 + STACKSIZE(SP) + ld INCY, 128 + STACKSIZE(SP) + ld BUFFER, 136 + STACKSIZE(SP) +#endif +#endif + + stfd f1, ALPHA_R + stfd f2, ALPHA_I + + slwi LDA, LDA, ZBASE_SHIFT + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + + addi INCX, INCX, -SIZE + addi INCY, INCY, -SIZE + addi A, A, -SIZE + + cmpwi cr0, M, 0 + ble- LL(999) + + sub X, X, INCX + cmpwi cr0, N, 0 + sub Y, Y, INCY + ble- LL(999) + + li PREA, PREFETCHSIZE_A * SIZE + + mr YY, Y + lfd f0, FZERO + + cmpi cr0, 0, INCY, SIZE + beq LL(10) + + addi YY, BUFFER, -SIZE + addi Y1, BUFFER, -SIZE + + addi r0, M, 3 + srawi. r0, r0, 2 + mtspr CTR, r0 + .align 4 + +LL(02): + STFDU f0, 1 * SIZE(Y1) + STFDU f0, 1 * SIZE(Y1) + STFDU f0, 1 * SIZE(Y1) + STFDU f0, 1 * SIZE(Y1) + STFDU f0, 1 * SIZE(Y1) + STFDU f0, 1 * SIZE(Y1) + STFDU f0, 1 * SIZE(Y1) + STFDU f0, 1 * SIZE(Y1) + bdnz LL(02) + .align 4 + +LL(10): + srawi. J, N, 2 + ble LL(20) + .align 4 + +LL(11): + lfd alpha_r, ALPHA_R + lfd alpha_i, ALPHA_I + + LFDUX a1, X, INCX + LFDU a2, 1 * SIZE(X) + LFDUX a3, X, INCX + LFDU a4, 1 * SIZE(X) + LFDUX a5, X, INCX + LFDU a6, 1 * SIZE(X) + LFDUX a7, X, INCX + LFDU a8, 1 * SIZE(X) + + FMUL alpha1r, alpha_r, a1 + FMUL alpha1i, alpha_i, a1 + FMUL alpha2r, alpha_r, a3 + FMUL alpha2i, alpha_i, a3 + + FMUL alpha3r, alpha_r, a5 + mr Y1, YY + FMUL alpha3i, alpha_i, a5 + mr Y2, YY + FMUL alpha4r, alpha_r, a7 + mr AO1, A + FMUL alpha4i, alpha_i, a7 + add AO2, A, LDA + + FMSUBR alpha1r, alpha_i, a2, alpha1r + add AO3, AO2, LDA + FMADDR alpha1i, alpha_r, a2, alpha1i + add AO4, AO3, LDA + FMSUBR alpha2r, alpha_i, a4, alpha2r + add A, AO4, LDA + FMADDR alpha2i, alpha_r, a4, alpha2i + + FMSUBR alpha3r, alpha_i, a6, alpha3r + srawi. r0, M, 2 + FMADDR alpha3i, alpha_r, a6, alpha3i + FMSUBR alpha4r, alpha_i, a8, alpha4r + mtspr CTR, r0 + FMADDR alpha4i, alpha_r, a8, alpha4i + ble LL(15) + .align 4 + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a3, 1 * SIZE(AO1) + LFDU y03, 1 * SIZE(Y1) + LFDU a4, 1 * SIZE(AO1) + LFDU y04, 1 * SIZE(Y1) + + LFDU a5, 1 * SIZE(AO1) + LFDU y05, 1 * SIZE(Y1) + LFDU a6, 1 * SIZE(AO1) + LFDU y06, 1 * SIZE(Y1) + LFDU a7, 1 * SIZE(AO1) + LFDU y07, 1 * SIZE(Y1) + LFDU a8, 1 * SIZE(AO1) + LFDU y08, 1 * SIZE(Y1) + + FMADD y09, alpha1r, a1, y01 + FMADD y10, alpha1i, a1, y02 + FMADD y11, alpha1r, a3, y03 + FMADD y12, alpha1i, a3, y04 + + FMADD y13, alpha1r, a5, y05 + FMADD y14, alpha1i, a5, y06 + FMADD y15, alpha1r, a7, y07 + FMADD y16, alpha1i, a7, y08 + + bdz LL(13) + .align 4 + +LL(12): + FMSUBX y09, alpha1i, a2, y09 + LFDU a1, 1 * SIZE(AO2) + FMADDX y10, alpha1r, a2, y10 + LFDU a2, 1 * SIZE(AO2) + FMSUBX y11, alpha1i, a4, y11 + LFDU a3, 1 * SIZE(AO2) + FMADDX y12, alpha1r, a4, y12 + LFDU a4, 1 * SIZE(AO2) +#ifdef PPCG4 + dcbt AO2, PREA +#endif + + FMSUBX y13, alpha1i, a6, y13 + LFDU a5, 1 * SIZE(AO2) + FMADDX y14, alpha1r, a6, y14 + LFDU a6, 1 * SIZE(AO2) + FMSUBX y15, alpha1i, a8, y15 + LFDU a7, 1 * SIZE(AO2) + FMADDX y16, alpha1r, a8, y16 + LFDU a8, 1 * SIZE(AO2) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO2, PREA +#endif + + FMADD y09, alpha2r, a1, y09 + LFDU y01, 1 * SIZE(Y1) + FMADD y10, alpha2i, a1, y10 + LFDU y02, 1 * SIZE(Y1) + FMADD y11, alpha2r, a3, y11 + LFDU y03, 1 * SIZE(Y1) + FMADD y12, alpha2i, a3, y12 + LFDU y04, 1 * SIZE(Y1) + +#ifdef PPCG4 + dcbtst Y1, PREA +#endif + + FMADD y13, alpha2r, a5, y13 + FMADD y14, alpha2i, a5, y14 + FMADD y15, alpha2r, a7, y15 + FMADD y16, alpha2i, a7, y16 + + FMSUBX y09, alpha2i, a2, y09 + LFDU a1, 1 * SIZE(AO3) + FMADDX y10, alpha2r, a2, y10 + LFDU a2, 1 * SIZE(AO3) + FMSUBX y11, alpha2i, a4, y11 + LFDU a3, 1 * SIZE(AO3) + FMADDX y12, alpha2r, a4, y12 + LFDU a4, 1 * SIZE(AO3) + +#ifdef PPCG4 + dcbt AO3, PREA +#endif + + FMSUBX y13, alpha2i, a6, y13 + LFDU a5, 1 * SIZE(AO3) + FMADDX y14, alpha2r, a6, y14 + LFDU a6, 1 * SIZE(AO3) + FMSUBX y15, alpha2i, a8, y15 + LFDU a7, 1 * SIZE(AO3) + FMADDX y16, alpha2r, a8, y16 + LFDU a8, 1 * SIZE(AO3) + +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO3, PREA +#endif + + FMADD y09, alpha3r, a1, y09 + LFDU y05, 1 * SIZE(Y1) + FMADD y10, alpha3i, a1, y10 + LFDU y06, 1 * SIZE(Y1) + FMADD y11, alpha3r, a3, y11 + LFDU y07, 1 * SIZE(Y1) + FMADD y12, alpha3i, a3, y12 + LFDU y08, 1 * SIZE(Y1) + +#if defined(PPCG4) && defined(DOUBLE) + dcbtst Y1, PREA +#endif + + FMADD y13, alpha3r, a5, y13 + FMADD y14, alpha3i, a5, y14 + FMADD y15, alpha3r, a7, y15 + FMADD y16, alpha3i, a7, y16 + + FMSUBX y09, alpha3i, a2, y09 + LFDU a1, 1 * SIZE(AO4) + FMADDX y10, alpha3r, a2, y10 + LFDU a2, 1 * SIZE(AO4) + FMSUBX y11, alpha3i, a4, y11 + LFDU a3, 1 * SIZE(AO4) + FMADDX y12, alpha3r, a4, y12 + LFDU a4, 1 * SIZE(AO4) + +#ifdef PPCG4 + dcbt AO4, PREA +#endif + + FMSUBX y13, alpha3i, a6, y13 + LFDU a5, 1 * SIZE(AO4) + FMADDX y14, alpha3r, a6, y14 + LFDU a6, 1 * SIZE(AO4) + FMSUBX y15, alpha3i, a8, y15 + LFDU a7, 1 * SIZE(AO4) + FMADDX y16, alpha3r, a8, y16 + LFDU a8, 1 * SIZE(AO4) + +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO4, PREA +#endif + + FMADD y09, alpha4r, a1, y09 + FMADD y10, alpha4i, a1, y10 + FMADD y11, alpha4r, a3, y11 + FMADD y12, alpha4i, a3, y12 + + FMADD y13, alpha4r, a5, y13 + FMADD y14, alpha4i, a5, y14 + FMADD y15, alpha4r, a7, y15 + FMADD y16, alpha4i, a7, y16 + + FMSUBX y09, alpha4i, a2, y09 + LFDU a1, 1 * SIZE(AO1) + FMADDX y10, alpha4r, a2, y10 + LFDU a2, 1 * SIZE(AO1) + FMSUBX y11, alpha4i, a4, y11 + LFDU a3, 1 * SIZE(AO1) + FMADDX y12, alpha4r, a4, y12 + LFDU a4, 1 * SIZE(AO1) + +#ifdef PPCG4 + dcbt AO1, PREA +#endif + + FMSUBX y13, alpha4i, a6, y13 + LFDU a5, 1 * SIZE(AO1) + FMADDX y14, alpha4r, a6, y14 + LFDU a6, 1 * SIZE(AO1) + FMSUBX y15, alpha4i, a8, y15 + LFDU a7, 1 * SIZE(AO1) + FMADDX y16, alpha4r, a8, y16 + LFDU a8, 1 * SIZE(AO1) + +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO1, PREA +#endif + + STFDU y09, 1 * SIZE(Y2) + FMADD y09, alpha1r, a1, y01 + STFDU y10, 1 * SIZE(Y2) + FMADD y10, alpha1i, a1, y02 + STFDU y11, 1 * SIZE(Y2) + FMADD y11, alpha1r, a3, y03 + STFDU y12, 1 * SIZE(Y2) + FMADD y12, alpha1i, a3, y04 + + STFDU y13, 1 * SIZE(Y2) + FMADD y13, alpha1r, a5, y05 + STFDU y14, 1 * SIZE(Y2) + FMADD y14, alpha1i, a5, y06 + STFDU y15, 1 * SIZE(Y2) + FMADD y15, alpha1r, a7, y07 + STFDU y16, 1 * SIZE(Y2) + FMADD y16, alpha1i, a7, y08 + bdnz LL(12) + .align 4 + +LL(13): + FMSUBX y09, alpha1i, a2, y09 + LFDU a1, 1 * SIZE(AO2) + FMADDX y10, alpha1r, a2, y10 + LFDU a2, 1 * SIZE(AO2) + FMSUBX y11, alpha1i, a4, y11 + LFDU a3, 1 * SIZE(AO2) + FMADDX y12, alpha1r, a4, y12 + LFDU a4, 1 * SIZE(AO2) + + FMSUBX y13, alpha1i, a6, y13 + LFDU a5, 1 * SIZE(AO2) + FMADDX y14, alpha1r, a6, y14 + LFDU a6, 1 * SIZE(AO2) + FMSUBX y15, alpha1i, a8, y15 + LFDU a7, 1 * SIZE(AO2) + FMADDX y16, alpha1r, a8, y16 + LFDU a8, 1 * SIZE(AO2) + + FMADD y09, alpha2r, a1, y09 + FMADD y10, alpha2i, a1, y10 + FMADD y11, alpha2r, a3, y11 + FMADD y12, alpha2i, a3, y12 + + FMADD y13, alpha2r, a5, y13 + FMADD y14, alpha2i, a5, y14 + FMADD y15, alpha2r, a7, y15 + FMADD y16, alpha2i, a7, y16 + + FMSUBX y09, alpha2i, a2, y09 + LFDU a1, 1 * SIZE(AO3) + FMADDX y10, alpha2r, a2, y10 + LFDU a2, 1 * SIZE(AO3) + FMSUBX y11, alpha2i, a4, y11 + LFDU a3, 1 * SIZE(AO3) + FMADDX y12, alpha2r, a4, y12 + LFDU a4, 1 * SIZE(AO3) + + FMSUBX y13, alpha2i, a6, y13 + LFDU a5, 1 * SIZE(AO3) + FMADDX y14, alpha2r, a6, y14 + LFDU a6, 1 * SIZE(AO3) + FMSUBX y15, alpha2i, a8, y15 + LFDU a7, 1 * SIZE(AO3) + FMADDX y16, alpha2r, a8, y16 + LFDU a8, 1 * SIZE(AO3) + + FMADD y09, alpha3r, a1, y09 + FMADD y10, alpha3i, a1, y10 + FMADD y11, alpha3r, a3, y11 + FMADD y12, alpha3i, a3, y12 + + FMADD y13, alpha3r, a5, y13 + FMADD y14, alpha3i, a5, y14 + FMADD y15, alpha3r, a7, y15 + FMADD y16, alpha3i, a7, y16 + + FMSUBX y09, alpha3i, a2, y09 + LFDU a1, 1 * SIZE(AO4) + FMADDX y10, alpha3r, a2, y10 + LFDU a2, 1 * SIZE(AO4) + FMSUBX y11, alpha3i, a4, y11 + LFDU a3, 1 * SIZE(AO4) + FMADDX y12, alpha3r, a4, y12 + LFDU a4, 1 * SIZE(AO4) + + FMSUBX y13, alpha3i, a6, y13 + LFDU a5, 1 * SIZE(AO4) + FMADDX y14, alpha3r, a6, y14 + LFDU a6, 1 * SIZE(AO4) + FMSUBX y15, alpha3i, a8, y15 + LFDU a7, 1 * SIZE(AO4) + FMADDX y16, alpha3r, a8, y16 + LFDU a8, 1 * SIZE(AO4) + + FMADD y09, alpha4r, a1, y09 + FMADD y10, alpha4i, a1, y10 + FMADD y11, alpha4r, a3, y11 + FMADD y12, alpha4i, a3, y12 + + FMADD y13, alpha4r, a5, y13 + FMADD y14, alpha4i, a5, y14 + FMADD y15, alpha4r, a7, y15 + FMADD y16, alpha4i, a7, y16 + + FMSUBX y09, alpha4i, a2, y09 + FMADDX y10, alpha4r, a2, y10 + FMSUBX y11, alpha4i, a4, y11 + FMADDX y12, alpha4r, a4, y12 + + FMSUBX y13, alpha4i, a6, y13 + STFDU y09, 1 * SIZE(Y2) + FMADDX y14, alpha4r, a6, y14 + STFDU y10, 1 * SIZE(Y2) + FMSUBX y15, alpha4i, a8, y15 + STFDU y11, 1 * SIZE(Y2) + FMADDX y16, alpha4r, a8, y16 + STFDU y12, 1 * SIZE(Y2) + + STFDU y13, 1 * SIZE(Y2) + STFDU y14, 1 * SIZE(Y2) + STFDU y15, 1 * SIZE(Y2) + STFDU y16, 1 * SIZE(Y2) + .align 4 + +LL(15): + andi. r0, M, 2 + ble LL(17) + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a3, 1 * SIZE(AO1) + LFDU y03, 1 * SIZE(Y1) + LFDU a4, 1 * SIZE(AO1) + LFDU y04, 1 * SIZE(Y1) + + FMADD y01, alpha1r, a1, y01 + LFDU a5, 1 * SIZE(AO2) + FMADD y02, alpha1i, a1, y02 + LFDU a6, 1 * SIZE(AO2) + FMADD y03, alpha1r, a3, y03 + LFDU a7, 1 * SIZE(AO2) + FMADD y04, alpha1i, a3, y04 + LFDU a8, 1 * SIZE(AO2) + + FMSUBX y01, alpha1i, a2, y01 + LFDU a1, 1 * SIZE(AO3) + FMADDX y02, alpha1r, a2, y02 + LFDU a2, 1 * SIZE(AO3) + FMSUBX y03, alpha1i, a4, y03 + LFDU a3, 1 * SIZE(AO3) + FMADDX y04, alpha1r, a4, y04 + LFDU a4, 1 * SIZE(AO3) + + FMADD y01, alpha2r, a5, y01 + FMADD y02, alpha2i, a5, y02 + FMADD y03, alpha2r, a7, y03 + FMADD y04, alpha2i, a7, y04 + + FMSUBX y01, alpha2i, a6, y01 + LFDU a5, 1 * SIZE(AO4) + FMADDX y02, alpha2r, a6, y02 + LFDU a6, 1 * SIZE(AO4) + FMSUBX y03, alpha2i, a8, y03 + LFDU a7, 1 * SIZE(AO4) + FMADDX y04, alpha2r, a8, y04 + LFDU a8, 1 * SIZE(AO4) + + FMADD y01, alpha3r, a1, y01 + FMADD y02, alpha3i, a1, y02 + FMADD y03, alpha3r, a3, y03 + FMADD y04, alpha3i, a3, y04 + + FMSUBX y01, alpha3i, a2, y01 + FMADDX y02, alpha3r, a2, y02 + FMSUBX y03, alpha3i, a4, y03 + FMADDX y04, alpha3r, a4, y04 + + FMADD y01, alpha4r, a5, y01 + FMADD y02, alpha4i, a5, y02 + FMADD y03, alpha4r, a7, y03 + FMADD y04, alpha4i, a7, y04 + + FMSUBX y01, alpha4i, a6, y01 + FMADDX y02, alpha4r, a6, y02 + FMSUBX y03, alpha4i, a8, y03 + FMADDX y04, alpha4r, a8, y04 + + STFDU y01, 1 * SIZE(Y2) + STFDU y02, 1 * SIZE(Y2) + STFDU y03, 1 * SIZE(Y2) + STFDU y04, 1 * SIZE(Y2) + .align 4 + +LL(17): + andi. r0, M, 1 + ble LL(19) + + LFDU y01, 1 * SIZE(Y1) + LFDU y02, 1 * SIZE(Y1) + + LFDU a1, 1 * SIZE(AO1) + LFDU a2, 1 * SIZE(AO1) + LFDU a3, 1 * SIZE(AO2) + LFDU a4, 1 * SIZE(AO2) + + FMADD y01, alpha1r, a1, y01 + LFDU a5, 1 * SIZE(AO3) + FMADD y02, alpha1i, a1, y02 + LFDU a6, 1 * SIZE(AO3) + FMSUBX y01, alpha1i, a2, y01 + LFDU a7, 1 * SIZE(AO4) + FMADDX y02, alpha1r, a2, y02 + LFDU a8, 1 * SIZE(AO4) + + FMADD y01, alpha2r, a3, y01 + FMADD y02, alpha2i, a3, y02 + FMSUBX y01, alpha2i, a4, y01 + FMADDX y02, alpha2r, a4, y02 + + FMADD y01, alpha3r, a5, y01 + FMADD y02, alpha3i, a5, y02 + FMSUBX y01, alpha3i, a6, y01 + FMADDX y02, alpha3r, a6, y02 + + FMADD y01, alpha4r, a7, y01 + FMADD y02, alpha4i, a7, y02 + FMSUBX y01, alpha4i, a8, y01 + FMADDX y02, alpha4r, a8, y02 + + STFDU y01, 1 * SIZE(Y2) + STFDU y02, 1 * SIZE(Y2) + .align 4 + +LL(19): + addi J, J, -1 + cmpi cr0, 0, J, 0 + bgt LL(11) + .align 4 + +LL(20): + andi. J, N, 2 + ble LL(30) + + lfd alpha_r, ALPHA_R + lfd alpha_i, ALPHA_I + + LFDUX a1, X, INCX + LFDU a2, 1 * SIZE(X) + LFDUX a3, X, INCX + LFDU a4, 1 * SIZE(X) + + FMUL alpha1r, alpha_r, a1 + mr Y1, YY + FMUL alpha1i, alpha_i, a1 + mr Y2, YY + FMUL alpha2r, alpha_r, a3 + mr AO1, A + FMUL alpha2i, alpha_i, a3 + add AO2, A, LDA + + FMSUBR alpha1r, alpha_i, a2, alpha1r + add A, AO2, LDA + FMADDR alpha1i, alpha_r, a2, alpha1i + srawi. r0, M, 2 + FMSUBR alpha2r, alpha_i, a4, alpha2r + mtspr CTR, r0 + FMADDR alpha2i, alpha_r, a4, alpha2i + ble LL(25) + .align 4 + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a3, 1 * SIZE(AO1) + LFDU y03, 1 * SIZE(Y1) + LFDU a4, 1 * SIZE(AO1) + LFDU y04, 1 * SIZE(Y1) + + LFDU a5, 1 * SIZE(AO1) + LFDU y05, 1 * SIZE(Y1) + LFDU a6, 1 * SIZE(AO1) + LFDU y06, 1 * SIZE(Y1) + LFDU a7, 1 * SIZE(AO1) + LFDU y07, 1 * SIZE(Y1) + LFDU a8, 1 * SIZE(AO1) + LFDU y08, 1 * SIZE(Y1) + + FMADD y09, alpha1r, a1, y01 + FMADD y10, alpha1i, a1, y02 + FMADD y11, alpha1r, a3, y03 + FMADD y12, alpha1i, a3, y04 + + FMADD y13, alpha1r, a5, y05 + FMADD y14, alpha1i, a5, y06 + FMADD y15, alpha1r, a7, y07 + FMADD y16, alpha1i, a7, y08 + + bdz LL(23) + .align 4 + +LL(22): + FMSUBX y09, alpha1i, a2, y09 + LFDU a1, 1 * SIZE(AO2) + FMADDX y10, alpha1r, a2, y10 + LFDU a2, 1 * SIZE(AO2) + FMSUBX y11, alpha1i, a4, y11 + LFDU a3, 1 * SIZE(AO2) + FMADDX y12, alpha1r, a4, y12 + LFDU a4, 1 * SIZE(AO2) +#ifdef PPCG4 + dcbt AO2, PREA +#endif + + FMSUBX y13, alpha1i, a6, y13 + LFDU a5, 1 * SIZE(AO2) + FMADDX y14, alpha1r, a6, y14 + LFDU a6, 1 * SIZE(AO2) + FMSUBX y15, alpha1i, a8, y15 + LFDU a7, 1 * SIZE(AO2) + FMADDX y16, alpha1r, a8, y16 + LFDU a8, 1 * SIZE(AO2) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO2, PREA +#endif + + FMADD y09, alpha2r, a1, y09 + LFDU y01, 1 * SIZE(Y1) + FMADD y10, alpha2i, a1, y10 + LFDU y02, 1 * SIZE(Y1) + FMADD y11, alpha2r, a3, y11 + LFDU y03, 1 * SIZE(Y1) + FMADD y12, alpha2i, a3, y12 + LFDU y04, 1 * SIZE(Y1) + +#ifdef PPCG4 + dcbtst Y1, PREA +#endif + + FMADD y13, alpha2r, a5, y13 + LFDU y05, 1 * SIZE(Y1) + FMADD y14, alpha2i, a5, y14 + LFDU y06, 1 * SIZE(Y1) + FMADD y15, alpha2r, a7, y15 + LFDU y07, 1 * SIZE(Y1) + FMADD y16, alpha2i, a7, y16 + LFDU y08, 1 * SIZE(Y1) + +#if defined(PPCG4) && defined(DOUBLE) + dcbtst Y1, PREA +#endif + + FMSUBX y09, alpha2i, a2, y09 + LFDU a1, 1 * SIZE(AO1) + FMADDX y10, alpha2r, a2, y10 + LFDU a2, 1 * SIZE(AO1) + FMSUBX y11, alpha2i, a4, y11 + LFDU a3, 1 * SIZE(AO1) + FMADDX y12, alpha2r, a4, y12 + LFDU a4, 1 * SIZE(AO1) + +#ifdef PPCG4 + dcbt AO1, PREA +#endif + + FMSUBX y13, alpha2i, a6, y13 + LFDU a5, 1 * SIZE(AO1) + FMADDX y14, alpha2r, a6, y14 + LFDU a6, 1 * SIZE(AO1) + FMSUBX y15, alpha2i, a8, y15 + LFDU a7, 1 * SIZE(AO1) + FMADDX y16, alpha2r, a8, y16 + LFDU a8, 1 * SIZE(AO1) + +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO1, PREA +#endif + + STFDU y09, 1 * SIZE(Y2) + FMADD y09, alpha1r, a1, y01 + STFDU y10, 1 * SIZE(Y2) + FMADD y10, alpha1i, a1, y02 + STFDU y11, 1 * SIZE(Y2) + FMADD y11, alpha1r, a3, y03 + STFDU y12, 1 * SIZE(Y2) + FMADD y12, alpha1i, a3, y04 + + STFDU y13, 1 * SIZE(Y2) + FMADD y13, alpha1r, a5, y05 + STFDU y14, 1 * SIZE(Y2) + FMADD y14, alpha1i, a5, y06 + STFDU y15, 1 * SIZE(Y2) + FMADD y15, alpha1r, a7, y07 + STFDU y16, 1 * SIZE(Y2) + FMADD y16, alpha1i, a7, y08 + bdnz LL(22) + .align 4 + +LL(23): + FMSUBX y09, alpha1i, a2, y09 + LFDU a1, 1 * SIZE(AO2) + FMADDX y10, alpha1r, a2, y10 + LFDU a2, 1 * SIZE(AO2) + FMSUBX y11, alpha1i, a4, y11 + LFDU a3, 1 * SIZE(AO2) + FMADDX y12, alpha1r, a4, y12 + LFDU a4, 1 * SIZE(AO2) + + FMSUBX y13, alpha1i, a6, y13 + LFDU a5, 1 * SIZE(AO2) + FMADDX y14, alpha1r, a6, y14 + LFDU a6, 1 * SIZE(AO2) + FMSUBX y15, alpha1i, a8, y15 + LFDU a7, 1 * SIZE(AO2) + FMADDX y16, alpha1r, a8, y16 + LFDU a8, 1 * SIZE(AO2) + + FMADD y09, alpha2r, a1, y09 + FMADD y10, alpha2i, a1, y10 + FMADD y11, alpha2r, a3, y11 + FMADD y12, alpha2i, a3, y12 + + FMADD y13, alpha2r, a5, y13 + FMADD y14, alpha2i, a5, y14 + FMADD y15, alpha2r, a7, y15 + FMADD y16, alpha2i, a7, y16 + + FMSUBX y09, alpha2i, a2, y09 + FMADDX y10, alpha2r, a2, y10 + FMSUBX y11, alpha2i, a4, y11 + FMADDX y12, alpha2r, a4, y12 + + FMSUBX y13, alpha2i, a6, y13 + STFDU y09, 1 * SIZE(Y2) + FMADDX y14, alpha2r, a6, y14 + STFDU y10, 1 * SIZE(Y2) + FMSUBX y15, alpha2i, a8, y15 + STFDU y11, 1 * SIZE(Y2) + FMADDX y16, alpha2r, a8, y16 + STFDU y12, 1 * SIZE(Y2) + + STFDU y13, 1 * SIZE(Y2) + STFDU y14, 1 * SIZE(Y2) + STFDU y15, 1 * SIZE(Y2) + STFDU y16, 1 * SIZE(Y2) + .align 4 + +LL(25): + andi. r0, M, 2 + ble LL(27) + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a3, 1 * SIZE(AO1) + LFDU y03, 1 * SIZE(Y1) + LFDU a4, 1 * SIZE(AO1) + LFDU y04, 1 * SIZE(Y1) + + FMADD y01, alpha1r, a1, y01 + LFDU a5, 1 * SIZE(AO2) + FMADD y02, alpha1i, a1, y02 + LFDU a6, 1 * SIZE(AO2) + FMADD y03, alpha1r, a3, y03 + LFDU a7, 1 * SIZE(AO2) + FMADD y04, alpha1i, a3, y04 + LFDU a8, 1 * SIZE(AO2) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMADD y01, alpha2r, a5, y01 + FMADD y02, alpha2i, a5, y02 + FMADD y03, alpha2r, a7, y03 + FMADD y04, alpha2i, a7, y04 + + FMSUBX y01, alpha2i, a6, y01 + FMADDX y02, alpha2r, a6, y02 + FMSUBX y03, alpha2i, a8, y03 + FMADDX y04, alpha2r, a8, y04 + + STFDU y01, 1 * SIZE(Y2) + STFDU y02, 1 * SIZE(Y2) + STFDU y03, 1 * SIZE(Y2) + STFDU y04, 1 * SIZE(Y2) + .align 4 + +LL(27): + andi. r0, M, 1 + ble LL(30) + + LFDU y01, 1 * SIZE(Y1) + LFDU y02, 1 * SIZE(Y1) + + LFDU a1, 1 * SIZE(AO1) + LFDU a2, 1 * SIZE(AO1) + LFDU a3, 1 * SIZE(AO2) + LFDU a4, 1 * SIZE(AO2) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + + FMADD y01, alpha2r, a3, y01 + FMADD y02, alpha2i, a3, y02 + FMSUBX y01, alpha2i, a4, y01 + FMADDX y02, alpha2r, a4, y02 + + STFDU y01, 1 * SIZE(Y2) + STFDU y02, 1 * SIZE(Y2) + .align 4 + +LL(30): + andi. J, N, 1 + ble LL(990) + .align 4 + + lfd alpha_r, ALPHA_R + lfd alpha_i, ALPHA_I + + LFDUX a1, X, INCX + LFDU a2, 1 * SIZE(X) + + FMUL alpha1r, alpha_r, a1 + mr Y1, YY + mr Y2, YY + FMUL alpha1i, alpha_i, a1 + mr AO1, A + add A, A, LDA + + FMSUBR alpha1r, alpha_i, a2, alpha1r + srawi. r0, M, 2 + mtspr CTR, r0 + FMADDR alpha1i, alpha_r, a2, alpha1i + ble LL(35) + .align 4 + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a3, 1 * SIZE(AO1) + LFDU y03, 1 * SIZE(Y1) + LFDU a4, 1 * SIZE(AO1) + LFDU y04, 1 * SIZE(Y1) + + LFDU a5, 1 * SIZE(AO1) + LFDU y05, 1 * SIZE(Y1) + LFDU a6, 1 * SIZE(AO1) + LFDU y06, 1 * SIZE(Y1) + LFDU a7, 1 * SIZE(AO1) + LFDU y07, 1 * SIZE(Y1) + LFDU a8, 1 * SIZE(AO1) + LFDU y08, 1 * SIZE(Y1) + + FMADD y09, alpha1r, a1, y01 + FMADD y10, alpha1i, a1, y02 + FMADD y11, alpha1r, a3, y03 + FMADD y12, alpha1i, a3, y04 + + FMADD y13, alpha1r, a5, y05 + FMADD y14, alpha1i, a5, y06 + FMADD y15, alpha1r, a7, y07 + FMADD y16, alpha1i, a7, y08 + + bdz LL(33) + .align 4 + +LL(32): + FMSUBX y09, alpha1i, a2, y09 + LFDU a1, 1 * SIZE(AO1) + FMADDX y10, alpha1r, a2, y10 + LFDU a2, 1 * SIZE(AO1) + FMSUBX y11, alpha1i, a4, y11 + LFDU a3, 1 * SIZE(AO1) + FMADDX y12, alpha1r, a4, y12 + LFDU a4, 1 * SIZE(AO1) + +#ifdef PPCG4 + dcbt AO1, PREA +#endif + + LFDU y01, 1 * SIZE(Y1) + LFDU y02, 1 * SIZE(Y1) + LFDU y03, 1 * SIZE(Y1) + LFDU y04, 1 * SIZE(Y1) + +#ifdef PPCG4 + dcbtst Y1, PREA +#endif + + FMSUBX y13, alpha1i, a6, y13 + LFDU a5, 1 * SIZE(AO1) + FMADDX y14, alpha1r, a6, y14 + LFDU a6, 1 * SIZE(AO1) + FMSUBX y15, alpha1i, a8, y15 + LFDU a7, 1 * SIZE(AO1) + FMADDX y16, alpha1r, a8, y16 + LFDU a8, 1 * SIZE(AO1) + +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO1, PREA +#endif + + LFDU y05, 1 * SIZE(Y1) + LFDU y06, 1 * SIZE(Y1) + LFDU y07, 1 * SIZE(Y1) + LFDU y08, 1 * SIZE(Y1) + +#if defined(PPCG4) && defined(DOUBLE) + dcbtst Y1, PREA +#endif + + STFDU y09, 1 * SIZE(Y2) + FMADD y09, alpha1r, a1, y01 + STFDU y10, 1 * SIZE(Y2) + FMADD y10, alpha1i, a1, y02 + STFDU y11, 1 * SIZE(Y2) + FMADD y11, alpha1r, a3, y03 + STFDU y12, 1 * SIZE(Y2) + FMADD y12, alpha1i, a3, y04 + + STFDU y13, 1 * SIZE(Y2) + FMADD y13, alpha1r, a5, y05 + STFDU y14, 1 * SIZE(Y2) + FMADD y14, alpha1i, a5, y06 + STFDU y15, 1 * SIZE(Y2) + FMADD y15, alpha1r, a7, y07 + STFDU y16, 1 * SIZE(Y2) + FMADD y16, alpha1i, a7, y08 + bdnz LL(32) + .align 4 + +LL(33): + FMSUBX y09, alpha1i, a2, y09 + FMADDX y10, alpha1r, a2, y10 + FMSUBX y11, alpha1i, a4, y11 + FMADDX y12, alpha1r, a4, y12 + + FMSUBX y13, alpha1i, a6, y13 + STFDU y09, 1 * SIZE(Y2) + FMADDX y14, alpha1r, a6, y14 + STFDU y10, 1 * SIZE(Y2) + FMSUBX y15, alpha1i, a8, y15 + STFDU y11, 1 * SIZE(Y2) + FMADDX y16, alpha1r, a8, y16 + STFDU y12, 1 * SIZE(Y2) + + STFDU y13, 1 * SIZE(Y2) + STFDU y14, 1 * SIZE(Y2) + STFDU y15, 1 * SIZE(Y2) + STFDU y16, 1 * SIZE(Y2) + .align 4 + +LL(35): + andi. r0, M, 2 + ble LL(37) + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a3, 1 * SIZE(AO1) + LFDU y03, 1 * SIZE(Y1) + LFDU a4, 1 * SIZE(AO1) + LFDU y04, 1 * SIZE(Y1) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + STFDU y01, 1 * SIZE(Y2) + STFDU y02, 1 * SIZE(Y2) + STFDU y03, 1 * SIZE(Y2) + STFDU y04, 1 * SIZE(Y2) + .align 4 + +LL(37): + andi. r0, M, 1 + ble LL(990) + + LFDU y01, 1 * SIZE(Y1) + LFDU a1, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + + STFDU y01, 1 * SIZE(Y2) + STFDU y02, 1 * SIZE(Y2) + .align 4 + +LL(990): + cmpi cr0, 0, INCY, SIZE + beq LL(999) + + addi YY, BUFFER, -SIZE + mr Y1, Y + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(995) + .align 4 + +LL(991): + LFDUX f0, Y, INCY + LFDU f1, 1 * SIZE(Y) + LFDUX f2, Y, INCY + LFDU f3, 1 * SIZE(Y) + LFDUX f4, Y, INCY + LFDU f5, 1 * SIZE(Y) + LFDUX f6, Y, INCY + LFDU f7, 1 * SIZE(Y) + + LFDU f8, 1 * SIZE(YY) + LFDU f9, 1 * SIZE(YY) + LFDU f10, 1 * SIZE(YY) + LFDU f11, 1 * SIZE(YY) + LFDU f12, 1 * SIZE(YY) + LFDU f13, 1 * SIZE(YY) + LFDU f14, 1 * SIZE(YY) + LFDU f15, 1 * SIZE(YY) + + FADD f8, f8, f0 + FADD f9, f9, f1 + FADD f10, f10, f2 + FADD f11, f11, f3 + FADD f12, f12, f4 + FADD f13, f13, f5 + FADD f14, f14, f6 + FADD f15, f15, f7 + + STFDUX f8, Y1, INCY + STFDU f9, 1 * SIZE(Y1) + STFDUX f10, Y1, INCY + STFDU f11, 1 * SIZE(Y1) + STFDUX f12, Y1, INCY + STFDU f13, 1 * SIZE(Y1) + STFDUX f14, Y1, INCY + STFDU f15, 1 * SIZE(Y1) + bdnz LL(991) + .align 4 + +LL(995): + andi. J, M, 2 + ble LL(996) + + LFDUX f0, Y, INCY + LFDU f1, 1 * SIZE(Y) + LFDUX f2, Y, INCY + LFDU f3, 1 * SIZE(Y) + + LFDU f8, 1 * SIZE(YY) + LFDU f9, 1 * SIZE(YY) + LFDU f10, 1 * SIZE(YY) + LFDU f11, 1 * SIZE(YY) + + FADD f8, f8, f0 + FADD f9, f9, f1 + FADD f10, f10, f2 + FADD f11, f11, f3 + + STFDUX f8, Y1, INCY + STFDU f9, 1 * SIZE(Y1) + STFDUX f10, Y1, INCY + STFDU f11, 1 * SIZE(Y1) + .align 4 + +LL(996): + andi. J, M, 1 + ble LL(999) + + LFDUX f0, Y, INCY + LFDU f1, 1 * SIZE(Y) + + LFDU f8, 1 * SIZE(YY) + LFDU f9, 1 * SIZE(YY) + + FADD f8, f8, f0 + FADD f9, f9, f1 + + STFDUX f8, Y1, INCY + STFDU f9, 1 * SIZE(Y1) + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r14, 144(SP) + ld r15, 152(SP) + ld r16, 160(SP) + ld r17, 168(SP) + ld r18, 176(SP) + ld r19, 184(SP) + ld r20, 192(SP) + ld r21, 200(SP) + ld r22, 208(SP) +#else + lwz r14, 144(SP) + lwz r15, 148(SP) + lwz r16, 152(SP) + lwz r17, 156(SP) + lwz r18, 160(SP) + lwz r19, 164(SP) + lwz r20, 168(SP) + lwz r21, 172(SP) + lwz r22, 176(SP) +#endif + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemv_t.S b/kernel/power/zgemv_t.S new file mode 100644 index 0000000..057c04d --- /dev/null +++ b/kernel/power/zgemv_t.S @@ -0,0 +1,1522 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define P 2048 + +#ifndef __64BIT__ +#define STACKSIZE 224 +#else +#define STACKSIZE 304 +#endif + +#ifdef linux +#ifndef __64BIT__ +#define M r3 +#define N r4 +#define A r6 +#define LDA r7 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#else +#define M r3 +#define N r4 +#define A r8 +#define LDA r9 +#define X r10 +#define INCX r5 +#define Y r6 +#define INCY r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define M r3 +#define N r4 +#define A r10 +#define LDA r5 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 +#else +#define M r3 +#define N r4 +#define A r8 +#define LDA r9 +#define X r10 +#define INCX r5 +#define Y r6 +#define INCY r7 +#endif +#endif + +#define BUFFER r11 +#define XP r12 +#define MIN_N r14 +#define J r15 +#define CO r16 +#define BO r17 +#define PLDA_M r18 +#define AO1 r19 +#define AO2 r20 +#define AO3 r21 +#define AO4 r22 +#define IS r23 +#define PREA r24 +#define PREC r25 + +#define Y1 r23 /* dummy; should be same as gemv_n.S */ +#define Y2 r24 /* dummy; should be same as gemv_n.S */ + +#if defined(PPCG4) +#define PREFETCHSIZE_A 34 +#define PREFETCHSIZE_C 16 +#endif + +#if defined(PPC440) || defined(PPC440FP2) +#define PREFETCHSIZE_A 34 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef PPC970 +#define PREFETCHSIZE_A 56 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef CELL +#define PREFETCHSIZE_A 56 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef POWER4 +#define PREFETCHSIZE_A 34 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef POWER5 +#define PREFETCHSIZE_A 40 +#define PREFETCHSIZE_C 8 +#endif + +#ifdef POWER6 +#define PREFETCHSIZE_A 24 +#define PREFETCHSIZE_C 8 +#endif + +#if !(defined(CONJ) && defined(XCONJ)) +#define FMADDR FMADD +#define FMSUBR FNMSUB +#else +#define FMADDR FNMSUB +#define FMSUBR FMADD +#endif + +#ifndef NEEDPARAM + +#ifndef __64BIT__ +#define FZERO 200(SP) +#define ALPHA_R 208(SP) +#define ALPHA_I 216(SP) +#else +#define FZERO 256(SP) +#define ALPHA_R 264(SP) +#define ALPHA_I 272(SP) +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r14, 144(SP) + std r15, 152(SP) + std r16, 160(SP) + std r17, 168(SP) + std r18, 176(SP) + std r19, 184(SP) + std r20, 192(SP) + std r21, 200(SP) + std r22, 208(SP) + std r23, 216(SP) + std r24, 224(SP) + std r25, 232(SP) + std r0, FZERO +#else + stw r14, 144(SP) + stw r15, 148(SP) + stw r16, 152(SP) + stw r17, 156(SP) + stw r18, 160(SP) + stw r19, 164(SP) + stw r20, 168(SP) + stw r21, 172(SP) + stw r22, 176(SP) + stw r23, 180(SP) + stw r24, 184(SP) + stw r25, 188(SP) + stw r0, FZERO + stw r0, 4 + FZERO +#endif + +#ifdef linux +#ifndef __64BIT__ + lwz INCY, 8 + STACKSIZE(SP) + lwz BUFFER, 12 + STACKSIZE(SP) +#else + ld INCX, 112 + STACKSIZE(SP) + ld Y, 120 + STACKSIZE(SP) + ld INCY, 128 + STACKSIZE(SP) + ld BUFFER, 136 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifndef __64BIT__ +#ifdef DOUBLE + lwz LDA, 56 + STACKSIZE(SP) + lwz X, 60 + STACKSIZE(SP) + lwz INCX, 64 + STACKSIZE(SP) + lwz Y, 68 + STACKSIZE(SP) + lwz INCY, 72 + STACKSIZE(SP) + lwz BUFFER, 76 + STACKSIZE(SP) +#else + lwz INCX, 56 + STACKSIZE(SP) + lwz Y, 60 + STACKSIZE(SP) + lwz INCY, 64 + STACKSIZE(SP) + lwz BUFFER, 68 + STACKSIZE(SP) +#endif +#else + ld INCX, 112 + STACKSIZE(SP) + ld Y, 120 + STACKSIZE(SP) + ld INCY, 128 + STACKSIZE(SP) + ld BUFFER, 136 + STACKSIZE(SP) +#endif +#endif + + stfd f1, ALPHA_R + stfd f2, ALPHA_I + + mullw PLDA_M, LDA, N + li XP, P + subf PLDA_M, XP, PLDA_M + slwi PLDA_M, PLDA_M, ZBASE_SHIFT + + slwi LDA, LDA, ZBASE_SHIFT + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + + li IS, 0 + + li PREA, PREFETCHSIZE_A * SIZE + li PREC, PREFETCHSIZE_C * SIZE + + cmpwi cr0, M, 0 + ble LL(End) + cmpwi cr0, N, 0 + ble LL(End) + .align 4 + +LL(ISLoop): + subf MIN_N, IS, M + slwi r0, IS, ZBASE_SHIFT + cmpi cr0, 0, MIN_N, P + ble+ LL(min_nP) + li MIN_N, P +LL(min_nP): + add XP, X, r0 + cmpwi cr0, INCX, 2 * SIZE + beq LL(Main) + + mr XP, BUFFER + addi CO, BUFFER, -SIZE + + srawi. r0, MIN_N, 2 + mtspr CTR, r0 + ble LL(CopyRemain) + .align 4 + +LL(CopyKernel): + LFD f0, 0 * SIZE(X) + LFD f1, 1 * SIZE(X) + add X, X, INCX + LFD f2, 0 * SIZE(X) + LFD f3, 1 * SIZE(X) + add X, X, INCX + LFD f4, 0 * SIZE(X) + LFD f5, 1 * SIZE(X) + add X, X, INCX + LFD f6, 0 * SIZE(X) + LFD f7, 1 * SIZE(X) + add X, X, INCX + + STFD f0, 1 * SIZE(CO) + STFD f1, 2 * SIZE(CO) + STFD f2, 3 * SIZE(CO) + STFD f3, 4 * SIZE(CO) + STFD f4, 5 * SIZE(CO) + STFD f5, 6 * SIZE(CO) + STFD f6, 7 * SIZE(CO) + STFDU f7, 8 * SIZE(CO) + bdnz LL(CopyKernel) + .align 4 + +LL(CopyRemain): + andi. r0, MIN_N, 3 + mtspr CTR, r0 + ble LL(Main) + .align 4 + +LL(CopySub): + LFD f0, 0 * SIZE(X) + LFD f1, 1 * SIZE(X) + add X, X, INCX + STFD f0, 1 * SIZE(CO) + STFDU f1, 2 * SIZE(CO) + bdnz LL(CopySub) + .align 4 + +LL(Main): + mr CO, Y + addi XP, XP, -SIZE + srawi. J, N, 2 + ble LL(Remain) + .align 4 + +LL(MainHead): + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + mr BO, XP + + lfd f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbtst PREC, CO + srawi. r0, MIN_N, 3 + mtspr CTR, r0 + ble LL(MainN3) + + LFD f16, 0 * SIZE(AO1) + LFD f17, 1 * SIZE(AO1) + LFD f18, 0 * SIZE(AO2) + LFD f19, 1 * SIZE(AO2) + LFD f20, 0 * SIZE(AO3) + LFD f21, 1 * SIZE(AO3) + LFD f22, 0 * SIZE(AO4) + LFD f23, 1 * SIZE(AO4) + + LFD f24, 1 * SIZE(BO) + LFD f25, 2 * SIZE(BO) + LFD f26, 3 * SIZE(BO) + LFD f27, 4 * SIZE(BO) + LFD f28, 5 * SIZE(BO) + LFD f29, 6 * SIZE(BO) + LFD f30, 7 * SIZE(BO) + LFD f31, 8 * SIZE(BO) + + bdz LL(MainKernelSkip) + .align 5 + +LL(MainKernel): + FMADD f0, f16, f24, f0 + FMADD f1, f16, f25, f1 + FMADD f2, f17, f24, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f24, f4 + FMADD f5, f18, f25, f5 + FMADD f6, f19, f24, f6 + FMADD f7, f19, f25, f7 + + LFD f16, 2 * SIZE(AO1) + LFD f17, 3 * SIZE(AO1) + LFD f18, 2 * SIZE(AO2) + LFD f19, 3 * SIZE(AO2) + + FMADD f8, f20, f24, f8 + FMADD f9, f20, f25, f9 + FMADD f10, f21, f24, f10 + FMADD f11, f21, f25, f11 + + FMADD f12, f22, f24, f12 + FMADD f13, f22, f25, f13 + FMADD f14, f23, f24, f14 + FMADD f15, f23, f25, f15 + + LFD f20, 2 * SIZE(AO3) + LFD f21, 3 * SIZE(AO3) + LFD f22, 2 * SIZE(AO4) + LFD f23, 3 * SIZE(AO4) + + FMADD f0, f16, f26, f0 + FMADD f1, f16, f27, f1 + FMADD f2, f17, f26, f2 + FMADD f3, f17, f27, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f18, f27, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 4 * SIZE(AO1) + LFD f17, 5 * SIZE(AO1) + LFD f18, 4 * SIZE(AO2) + LFD f19, 5 * SIZE(AO2) + + FMADD f8, f20, f26, f8 + FMADD f9, f20, f27, f9 + FMADD f10, f21, f26, f10 + FMADD f11, f21, f27, f11 + + FMADD f12, f22, f26, f12 + FMADD f13, f22, f27, f13 + FMADD f14, f23, f26, f14 + FMADD f15, f23, f27, f15 + + LFD f20, 4 * SIZE(AO3) + LFD f21, 5 * SIZE(AO3) + LFD f22, 4 * SIZE(AO4) + LFD f23, 5 * SIZE(AO4) + + LFD f24, 9 * SIZE(BO) + LFD f25, 10 * SIZE(BO) + LFD f26, 11 * SIZE(BO) + LFD f27, 12 * SIZE(BO) + + FMADD f0, f16, f28, f0 + FMADD f1, f16, f29, f1 + FMADD f2, f17, f28, f2 + FMADD f3, f17, f29, f3 + + FMADD f4, f18, f28, f4 + FMADD f5, f18, f29, f5 + FMADD f6, f19, f28, f6 + FMADD f7, f19, f29, f7 + + LFD f16, 6 * SIZE(AO1) + LFD f17, 7 * SIZE(AO1) + LFD f18, 6 * SIZE(AO2) + LFD f19, 7 * SIZE(AO2) + + FMADD f8, f20, f28, f8 + FMADD f9, f20, f29, f9 + FMADD f10, f21, f28, f10 + FMADD f11, f21, f29, f11 + + FMADD f12, f22, f28, f12 + FMADD f13, f22, f29, f13 + FMADD f14, f23, f28, f14 + FMADD f15, f23, f29, f15 + + LFD f20, 6 * SIZE(AO3) + LFD f21, 7 * SIZE(AO3) + LFD f22, 6 * SIZE(AO4) + LFD f23, 7 * SIZE(AO4) + + FMADD f0, f16, f30, f0 + FMADD f1, f16, f31, f1 + FMADD f2, f17, f30, f2 + FMADD f3, f17, f31, f3 + + FMADD f4, f18, f30, f4 + FMADD f5, f18, f31, f5 + FMADD f6, f19, f30, f6 + FMADD f7, f19, f31, f7 + + LFD f16, 8 * SIZE(AO1) + LFD f17, 9 * SIZE(AO1) + LFD f18, 8 * SIZE(AO2) + LFD f19, 9 * SIZE(AO2) + + FMADD f8, f20, f30, f8 + FMADD f9, f20, f31, f9 + FMADD f10, f21, f30, f10 + FMADD f11, f21, f31, f11 + + FMADD f12, f22, f30, f12 + FMADD f13, f22, f31, f13 + FMADD f14, f23, f30, f14 + FMADD f15, f23, f31, f15 + + LFD f20, 8 * SIZE(AO3) + LFD f21, 9 * SIZE(AO3) + LFD f22, 8 * SIZE(AO4) + LFD f23, 9 * SIZE(AO4) + + LFD f28, 13 * SIZE(BO) + LFD f29, 14 * SIZE(BO) + LFD f30, 15 * SIZE(BO) + LFD f31, 16 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f16, f25, f1 + FMADD f2, f17, f24, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f24, f4 + FMADD f5, f18, f25, f5 + FMADD f6, f19, f24, f6 + FMADD f7, f19, f25, f7 + + LFD f16, 10 * SIZE(AO1) + LFD f17, 11 * SIZE(AO1) + LFD f18, 10 * SIZE(AO2) + LFD f19, 11 * SIZE(AO2) + + FMADD f8, f20, f24, f8 + FMADD f9, f20, f25, f9 + FMADD f10, f21, f24, f10 + FMADD f11, f21, f25, f11 + + FMADD f12, f22, f24, f12 + FMADD f13, f22, f25, f13 + FMADD f14, f23, f24, f14 + FMADD f15, f23, f25, f15 + + LFD f20, 10 * SIZE(AO3) + LFD f21, 11 * SIZE(AO3) + LFD f22, 10 * SIZE(AO4) + LFD f23, 11 * SIZE(AO4) + + FMADD f0, f16, f26, f0 + FMADD f1, f16, f27, f1 + FMADD f2, f17, f26, f2 + FMADD f3, f17, f27, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f18, f27, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 12 * SIZE(AO1) + LFD f17, 13 * SIZE(AO1) + LFD f18, 12 * SIZE(AO2) + LFD f19, 13 * SIZE(AO2) + + FMADD f8, f20, f26, f8 + FMADD f9, f20, f27, f9 + FMADD f10, f21, f26, f10 + FMADD f11, f21, f27, f11 + + FMADD f12, f22, f26, f12 + FMADD f13, f22, f27, f13 + FMADD f14, f23, f26, f14 + FMADD f15, f23, f27, f15 + + LFD f20, 12 * SIZE(AO3) + LFD f21, 13 * SIZE(AO3) + LFD f22, 12 * SIZE(AO4) + LFD f23, 13 * SIZE(AO4) + + LFD f24, 17 * SIZE(BO) + LFD f25, 18 * SIZE(BO) + LFD f26, 19 * SIZE(BO) + LFD f27, 20 * SIZE(BO) + + FMADD f0, f16, f28, f0 + FMADD f1, f16, f29, f1 + FMADD f2, f17, f28, f2 + FMADD f3, f17, f29, f3 + + FMADD f4, f18, f28, f4 + FMADD f5, f18, f29, f5 + FMADD f6, f19, f28, f6 + FMADD f7, f19, f29, f7 + + LFD f16, 14 * SIZE(AO1) + LFD f17, 15 * SIZE(AO1) + LFD f18, 14 * SIZE(AO2) + LFD f19, 15 * SIZE(AO2) + + FMADD f8, f20, f28, f8 + FMADD f9, f20, f29, f9 + FMADD f10, f21, f28, f10 + FMADD f11, f21, f29, f11 + + FMADD f12, f22, f28, f12 + FMADD f13, f22, f29, f13 + FMADD f14, f23, f28, f14 + FMADD f15, f23, f29, f15 + + LFD f20, 14 * SIZE(AO3) + LFD f21, 15 * SIZE(AO3) + LFD f22, 14 * SIZE(AO4) + LFD f23, 15 * SIZE(AO4) + + FMADD f0, f16, f30, f0 + FMADD f1, f16, f31, f1 + FMADD f2, f17, f30, f2 + FMADD f3, f17, f31, f3 + + FMADD f4, f18, f30, f4 + FMADD f5, f18, f31, f5 + FMADD f6, f19, f30, f6 + FMADD f7, f19, f31, f7 + + LFD f16, 16 * SIZE(AO1) + LFD f17, 17 * SIZE(AO1) + LFD f18, 16 * SIZE(AO2) + LFD f19, 17 * SIZE(AO2) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + DCBT(AO1, PREA) + DCBT(AO2, PREA) + + FMADD f8, f20, f30, f8 + FMADD f9, f20, f31, f9 + FMADD f10, f21, f30, f10 + FMADD f11, f21, f31, f11 + + FMADD f12, f22, f30, f12 + FMADD f13, f22, f31, f13 + FMADD f14, f23, f30, f14 + FMADD f15, f23, f31, f15 + + LFD f20, 16 * SIZE(AO3) + LFD f21, 17 * SIZE(AO3) + LFD f22, 16 * SIZE(AO4) + LFD f23, 17 * SIZE(AO4) + + LFD f28, 21 * SIZE(BO) + LFD f29, 22 * SIZE(BO) + LFD f30, 23 * SIZE(BO) + LFD f31, 24 * SIZE(BO) + + addi AO3, AO3, 16 * SIZE + addi AO4, AO4, 16 * SIZE + DCBT(AO3, PREA) + DCBT(AO4, PREA) + + addi BO, BO, 16 * SIZE + bdnz LL(MainKernel) + .align 4 + +LL(MainKernelSkip): + FMADD f0, f16, f24, f0 + FMADD f1, f16, f25, f1 + FMADD f2, f17, f24, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f24, f4 + FMADD f5, f18, f25, f5 + FMADD f6, f19, f24, f6 + FMADD f7, f19, f25, f7 + + LFD f16, 2 * SIZE(AO1) + LFD f17, 3 * SIZE(AO1) + LFD f18, 2 * SIZE(AO2) + LFD f19, 3 * SIZE(AO2) + + FMADD f8, f20, f24, f8 + FMADD f9, f20, f25, f9 + FMADD f10, f21, f24, f10 + FMADD f11, f21, f25, f11 + + FMADD f12, f22, f24, f12 + FMADD f13, f22, f25, f13 + FMADD f14, f23, f24, f14 + FMADD f15, f23, f25, f15 + + LFD f20, 2 * SIZE(AO3) + LFD f21, 3 * SIZE(AO3) + LFD f22, 2 * SIZE(AO4) + LFD f23, 3 * SIZE(AO4) + + FMADD f0, f16, f26, f0 + FMADD f1, f16, f27, f1 + FMADD f2, f17, f26, f2 + FMADD f3, f17, f27, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f18, f27, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 4 * SIZE(AO1) + LFD f17, 5 * SIZE(AO1) + LFD f18, 4 * SIZE(AO2) + LFD f19, 5 * SIZE(AO2) + + FMADD f8, f20, f26, f8 + FMADD f9, f20, f27, f9 + FMADD f10, f21, f26, f10 + FMADD f11, f21, f27, f11 + + FMADD f12, f22, f26, f12 + FMADD f13, f22, f27, f13 + FMADD f14, f23, f26, f14 + FMADD f15, f23, f27, f15 + + LFD f20, 4 * SIZE(AO3) + LFD f21, 5 * SIZE(AO3) + LFD f22, 4 * SIZE(AO4) + LFD f23, 5 * SIZE(AO4) + + FMADD f0, f16, f28, f0 + FMADD f1, f16, f29, f1 + FMADD f2, f17, f28, f2 + FMADD f3, f17, f29, f3 + + FMADD f4, f18, f28, f4 + FMADD f5, f18, f29, f5 + FMADD f6, f19, f28, f6 + FMADD f7, f19, f29, f7 + + LFD f16, 6 * SIZE(AO1) + LFD f17, 7 * SIZE(AO1) + LFD f18, 6 * SIZE(AO2) + LFD f19, 7 * SIZE(AO2) + + FMADD f8, f20, f28, f8 + FMADD f9, f20, f29, f9 + FMADD f10, f21, f28, f10 + FMADD f11, f21, f29, f11 + + FMADD f12, f22, f28, f12 + FMADD f13, f22, f29, f13 + FMADD f14, f23, f28, f14 + FMADD f15, f23, f29, f15 + + LFD f20, 6 * SIZE(AO3) + LFD f21, 7 * SIZE(AO3) + LFD f22, 6 * SIZE(AO4) + LFD f23, 7 * SIZE(AO4) + + FMADD f0, f16, f30, f0 + FMADD f1, f16, f31, f1 + FMADD f2, f17, f30, f2 + FMADD f3, f17, f31, f3 + + FMADD f4, f18, f30, f4 + FMADD f5, f18, f31, f5 + FMADD f6, f19, f30, f6 + FMADD f7, f19, f31, f7 + + LFD f16, 8 * SIZE(AO1) + LFD f17, 9 * SIZE(AO1) + LFD f18, 8 * SIZE(AO2) + LFD f19, 9 * SIZE(AO2) + + FMADD f8, f20, f30, f8 + FMADD f9, f20, f31, f9 + FMADD f10, f21, f30, f10 + FMADD f11, f21, f31, f11 + + FMADD f12, f22, f30, f12 + FMADD f13, f22, f31, f13 + FMADD f14, f23, f30, f14 + FMADD f15, f23, f31, f15 + + LFD f20, 8 * SIZE(AO3) + LFD f21, 9 * SIZE(AO3) + LFD f22, 8 * SIZE(AO4) + LFD f23, 9 * SIZE(AO4) + + LFD f24, 9 * SIZE(BO) + LFD f25, 10 * SIZE(BO) + LFD f26, 11 * SIZE(BO) + LFD f27, 12 * SIZE(BO) + + LFD f28, 13 * SIZE(BO) + LFD f29, 14 * SIZE(BO) + LFD f30, 15 * SIZE(BO) + LFDU f31, 16 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f16, f25, f1 + FMADD f2, f17, f24, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f24, f4 + FMADD f5, f18, f25, f5 + FMADD f6, f19, f24, f6 + FMADD f7, f19, f25, f7 + + LFD f16, 10 * SIZE(AO1) + LFD f17, 11 * SIZE(AO1) + LFD f18, 10 * SIZE(AO2) + LFD f19, 11 * SIZE(AO2) + + FMADD f8, f20, f24, f8 + FMADD f9, f20, f25, f9 + FMADD f10, f21, f24, f10 + FMADD f11, f21, f25, f11 + + FMADD f12, f22, f24, f12 + FMADD f13, f22, f25, f13 + FMADD f14, f23, f24, f14 + FMADD f15, f23, f25, f15 + + LFD f20, 10 * SIZE(AO3) + LFD f21, 11 * SIZE(AO3) + LFD f22, 10 * SIZE(AO4) + LFD f23, 11 * SIZE(AO4) + + FMADD f0, f16, f26, f0 + FMADD f1, f16, f27, f1 + FMADD f2, f17, f26, f2 + FMADD f3, f17, f27, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f18, f27, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 12 * SIZE(AO1) + LFD f17, 13 * SIZE(AO1) + LFD f18, 12 * SIZE(AO2) + LFD f19, 13 * SIZE(AO2) + + FMADD f8, f20, f26, f8 + FMADD f9, f20, f27, f9 + FMADD f10, f21, f26, f10 + FMADD f11, f21, f27, f11 + + FMADD f12, f22, f26, f12 + FMADD f13, f22, f27, f13 + FMADD f14, f23, f26, f14 + FMADD f15, f23, f27, f15 + + LFD f20, 12 * SIZE(AO3) + LFD f21, 13 * SIZE(AO3) + LFD f22, 12 * SIZE(AO4) + LFD f23, 13 * SIZE(AO4) + + FMADD f0, f16, f28, f0 + FMADD f1, f16, f29, f1 + FMADD f2, f17, f28, f2 + FMADD f3, f17, f29, f3 + + FMADD f4, f18, f28, f4 + FMADD f5, f18, f29, f5 + FMADD f6, f19, f28, f6 + FMADD f7, f19, f29, f7 + + LFD f16, 14 * SIZE(AO1) + LFD f17, 15 * SIZE(AO1) + LFD f18, 14 * SIZE(AO2) + LFD f19, 15 * SIZE(AO2) + + FMADD f8, f20, f28, f8 + FMADD f9, f20, f29, f9 + FMADD f10, f21, f28, f10 + FMADD f11, f21, f29, f11 + + FMADD f12, f22, f28, f12 + FMADD f13, f22, f29, f13 + FMADD f14, f23, f28, f14 + FMADD f15, f23, f29, f15 + + LFD f20, 14 * SIZE(AO3) + LFD f21, 15 * SIZE(AO3) + LFD f22, 14 * SIZE(AO4) + LFD f23, 15 * SIZE(AO4) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi AO3, AO3, 16 * SIZE + addi AO4, AO4, 16 * SIZE + + FMADD f0, f16, f30, f0 + FMADD f1, f16, f31, f1 + FMADD f2, f17, f30, f2 + FMADD f3, f17, f31, f3 + + FMADD f4, f18, f30, f4 + FMADD f5, f18, f31, f5 + FMADD f6, f19, f30, f6 + FMADD f7, f19, f31, f7 + + FMADD f8, f20, f30, f8 + FMADD f9, f20, f31, f9 + FMADD f10, f21, f30, f10 + FMADD f11, f21, f31, f11 + + FMADD f12, f22, f30, f12 + FMADD f13, f22, f31, f13 + FMADD f14, f23, f30, f14 + FMADD f15, f23, f31, f15 + .align 4 + +LL(MainN3): + andi. r0, MIN_N, 7 + mtspr CTR, r0 + ble LL(MainFinish) + .align 4 + + LFD f16, 0 * SIZE(AO1) + LFD f17, 1 * SIZE(AO1) + LFD f18, 0 * SIZE(AO2) + LFD f19, 1 * SIZE(AO2) + LFD f20, 0 * SIZE(AO3) + LFD f21, 1 * SIZE(AO3) + LFD f22, 0 * SIZE(AO4) + LFD f23, 1 * SIZE(AO4) + + LFD f24, 1 * SIZE(BO) + LFDU f25, 2 * SIZE(BO) + + addi AO1, AO1, 2 * SIZE + addi AO2, AO2, 2 * SIZE + addi AO3, AO3, 2 * SIZE + addi AO4, AO4, 2 * SIZE + + bdz LL(MainN3KernelSkip) + .align 4 + +LL(MainN3Kernel): + FMADD f0, f16, f24, f0 + FMADD f1, f16, f25, f1 + FMADD f2, f17, f24, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f24, f4 + FMADD f5, f18, f25, f5 + FMADD f6, f19, f24, f6 + FMADD f7, f19, f25, f7 + + LFD f16, 0 * SIZE(AO1) + LFD f17, 1 * SIZE(AO1) + LFD f18, 0 * SIZE(AO2) + LFD f19, 1 * SIZE(AO2) + + FMADD f8, f20, f24, f8 + FMADD f9, f20, f25, f9 + FMADD f10, f21, f24, f10 + FMADD f11, f21, f25, f11 + + FMADD f12, f22, f24, f12 + FMADD f13, f22, f25, f13 + FMADD f14, f23, f24, f14 + FMADD f15, f23, f25, f15 + + LFD f20, 0 * SIZE(AO3) + LFD f21, 1 * SIZE(AO3) + LFD f22, 0 * SIZE(AO4) + LFD f23, 1 * SIZE(AO4) + + LFD f24, 1 * SIZE(BO) + LFDU f25, 2 * SIZE(BO) + + addi AO1, AO1, 2 * SIZE + addi AO2, AO2, 2 * SIZE + addi AO3, AO3, 2 * SIZE + addi AO4, AO4, 2 * SIZE + + bdnz LL(MainN3Kernel) + .align 4 + +LL(MainN3KernelSkip): + FMADD f0, f16, f24, f0 + FMADD f1, f16, f25, f1 + FMADD f2, f17, f24, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f24, f4 + FMADD f5, f18, f25, f5 + FMADD f6, f19, f24, f6 + FMADD f7, f19, f25, f7 + + FMADD f8, f20, f24, f8 + FMADD f9, f20, f25, f9 + FMADD f10, f21, f24, f10 + FMADD f11, f21, f25, f11 + + FMADD f12, f22, f24, f12 + FMADD f13, f22, f25, f13 + FMADD f14, f23, f24, f14 + FMADD f15, f23, f25, f15 + .align 4 + +LL(MainFinish): + lfd f30, ALPHA_R + lfd f31, ALPHA_I + +#ifndef XCONJ +#ifndef CONJ + FSUB f0, f0, f3 + FADD f1, f1, f2 + FSUB f4, f4, f7 + FADD f5, f5, f6 + FSUB f8, f8, f11 + FADD f9, f9, f10 + FSUB f12, f12, f15 + FADD f13, f13, f14 +#else + FADD f0, f0, f3 + FSUB f1, f1, f2 + FADD f4, f4, f7 + FSUB f5, f5, f6 + FADD f8, f8, f11 + FSUB f9, f9, f10 + FADD f12, f12, f15 + FSUB f13, f13, f14 +#endif +#else +#ifndef CONJ + FADD f0, f0, f3 + FSUB f1, f2, f1 + FADD f4, f4, f7 + FSUB f5, f6, f5 + FADD f8, f8, f11 + FSUB f9, f10, f9 + FADD f12, f12, f15 + FSUB f13, f14, f13 +#else + FSUB f0, f0, f3 + FADD f1, f1, f2 + FSUB f4, f4, f7 + FADD f5, f5, f6 + FSUB f8, f8, f11 + FADD f9, f9, f10 + FSUB f12, f12, f15 + FADD f13, f13, f14 +#endif +#endif + + mr BO, CO + cmpwi cr0, INCY, 2 * SIZE + bne LL(FinishN1) + + LFD f16, 0 * SIZE(CO) + LFD f17, 1 * SIZE(CO) + LFD f18, 2 * SIZE(CO) + LFD f19, 3 * SIZE(CO) + LFD f20, 4 * SIZE(CO) + LFD f21, 5 * SIZE(CO) + LFD f22, 6 * SIZE(CO) + LFD f23, 7 * SIZE(CO) + + FMADD f16, f30, f0, f16 + FMADDR f17, f30, f1, f17 + FMADD f18, f30, f4, f18 + FMADDR f19, f30, f5, f19 + + FMADD f20, f30, f8, f20 + FMADDR f21, f30, f9, f21 + FMADD f22, f30, f12, f22 + FMADDR f23, f30, f13, f23 + + FMSUBR f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMSUBR f18, f31, f5, f18 + FMADD f19, f31, f4, f19 + + FMSUBR f20, f31, f9, f20 + FMADD f21, f31, f8, f21 + FMSUBR f22, f31, f13, f22 + FMADD f23, f31, f12, f23 + + STFD f16, 0 * SIZE(CO) + STFD f17, 1 * SIZE(CO) + STFD f18, 2 * SIZE(CO) + STFD f19, 3 * SIZE(CO) + + STFD f20, 4 * SIZE(CO) + STFD f21, 5 * SIZE(CO) + STFD f22, 6 * SIZE(CO) + STFD f23, 7 * SIZE(CO) + + addi CO, CO, 8 * SIZE + + addi J, J, -1 + cmpwi cr0, J, 0 + bgt LL(MainHead) + b LL(Remain) + .align 4 + +LL(FinishN1): + LFD f16, 0 * SIZE(CO) + LFD f17, 1 * SIZE(CO) + add CO, CO, INCY + + LFD f18, 0 * SIZE(CO) + LFD f19, 1 * SIZE(CO) + add CO, CO, INCY + + LFD f20, 0 * SIZE(CO) + LFD f21, 1 * SIZE(CO) + add CO, CO, INCY + + LFD f22, 0 * SIZE(CO) + LFD f23, 1 * SIZE(CO) + add CO, CO, INCY + + FMADD f16, f30, f0, f16 + FMADDR f17, f30, f1, f17 + FMADD f18, f30, f4, f18 + FMADDR f19, f30, f5, f19 + + FMADD f20, f30, f8, f20 + FMADDR f21, f30, f9, f21 + FMADD f22, f30, f12, f22 + FMADDR f23, f30, f13, f23 + + FMSUBR f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMSUBR f18, f31, f5, f18 + FMADD f19, f31, f4, f19 + + FMSUBR f20, f31, f9, f20 + FMADD f21, f31, f8, f21 + FMSUBR f22, f31, f13, f22 + FMADD f23, f31, f12, f23 + + STFD f16, 0 * SIZE(BO) + STFD f17, 1 * SIZE(BO) + add BO, BO, INCY + STFD f18, 0 * SIZE(BO) + STFD f19, 1 * SIZE(BO) + add BO, BO, INCY + + STFD f20, 0 * SIZE(BO) + STFD f21, 1 * SIZE(BO) + add BO, BO, INCY + STFD f22, 0 * SIZE(BO) + STFD f23, 1 * SIZE(BO) + + addi J, J, -1 + cmpwi cr0, J, 0 + bgt LL(MainHead) + .align 4 + +LL(Remain): + andi. J, N, 3 + ble LL(ISEnd) + .align 4 + +LL(RemainHead): + mr AO1, A + add A, A, LDA + mr BO, XP + lfd f0, FZERO + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. r0 , MIN_N, 3 + mtspr CTR, r0 + ble LL(RemainN3) + + LFD f16, 0 * SIZE(AO1) + LFD f17, 1 * SIZE(AO1) + LFD f18, 2 * SIZE(AO1) + LFD f19, 3 * SIZE(AO1) + + LFD f20, 4 * SIZE(AO1) + LFD f21, 5 * SIZE(AO1) + LFD f22, 6 * SIZE(AO1) + LFD f23, 7 * SIZE(AO1) + + LFD f24, 1 * SIZE(BO) + LFD f25, 2 * SIZE(BO) + LFD f26, 3 * SIZE(BO) + LFD f27, 4 * SIZE(BO) + + LFD f28, 5 * SIZE(BO) + LFD f29, 6 * SIZE(BO) + LFD f30, 7 * SIZE(BO) + LFD f31, 8 * SIZE(BO) + + bdz LL(RemainKernelSkip) + .align 4 + +LL(RemainKernel): + FMADD f0, f16, f24, f0 + FMADD f1, f16, f25, f1 + FMADD f2, f17, f24, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f18, f27, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO1) + LFD f17, 9 * SIZE(AO1) + LFD f18, 10 * SIZE(AO1) + LFD f19, 11 * SIZE(AO1) + + LFD f24, 9 * SIZE(BO) + LFD f25, 10 * SIZE(BO) + LFD f26, 11 * SIZE(BO) + LFD f27, 12 * SIZE(BO) + + FMADD f8, f20, f28, f8 + FMADD f9, f20, f29, f9 + FMADD f10, f21, f28, f10 + FMADD f11, f21, f29, f11 + + FMADD f12, f22, f30, f12 + FMADD f13, f22, f31, f13 + FMADD f14, f23, f30, f14 + FMADD f15, f23, f31, f15 + + LFD f20, 12 * SIZE(AO1) + LFD f21, 13 * SIZE(AO1) + LFD f22, 14 * SIZE(AO1) + LFD f23, 15 * SIZE(AO1) + + LFD f28, 13 * SIZE(BO) + LFD f29, 14 * SIZE(BO) + LFD f30, 15 * SIZE(BO) + LFD f31, 16 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f16, f25, f1 + FMADD f2, f17, f24, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f18, f27, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 16 * SIZE(AO1) + LFD f17, 17 * SIZE(AO1) + LFD f18, 18 * SIZE(AO1) + LFD f19, 19 * SIZE(AO1) + + LFD f24, 17 * SIZE(BO) + LFD f25, 18 * SIZE(BO) + LFD f26, 19 * SIZE(BO) + LFD f27, 20 * SIZE(BO) + + FMADD f8, f20, f28, f8 + FMADD f9, f20, f29, f9 + FMADD f10, f21, f28, f10 + FMADD f11, f21, f29, f11 + + FMADD f12, f22, f30, f12 + FMADD f13, f22, f31, f13 + FMADD f14, f23, f30, f14 + FMADD f15, f23, f31, f15 + + LFD f20, 20 * SIZE(AO1) + LFD f21, 21 * SIZE(AO1) + LFD f22, 22 * SIZE(AO1) + LFD f23, 23 * SIZE(AO1) + + LFD f28, 21 * SIZE(BO) + LFD f29, 22 * SIZE(BO) + LFD f30, 23 * SIZE(BO) + LFD f31, 24 * SIZE(BO) + + addi AO1, AO1, 16 * SIZE + addi BO, BO, 16 * SIZE + + DCBT(AO1, PREA) + + bdnz LL(RemainKernel) + .align 4 + +LL(RemainKernelSkip): + FMADD f0, f16, f24, f0 + FMADD f1, f16, f25, f1 + FMADD f2, f17, f24, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f18, f27, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO1) + LFD f17, 9 * SIZE(AO1) + LFD f18, 10 * SIZE(AO1) + LFD f19, 11 * SIZE(AO1) + + LFD f24, 9 * SIZE(BO) + LFD f25, 10 * SIZE(BO) + LFD f26, 11 * SIZE(BO) + LFD f27, 12 * SIZE(BO) + + FMADD f8, f20, f28, f8 + FMADD f9, f20, f29, f9 + FMADD f10, f21, f28, f10 + FMADD f11, f21, f29, f11 + + FMADD f12, f22, f30, f12 + FMADD f13, f22, f31, f13 + FMADD f14, f23, f30, f14 + FMADD f15, f23, f31, f15 + + LFD f20, 12 * SIZE(AO1) + LFD f21, 13 * SIZE(AO1) + LFD f22, 14 * SIZE(AO1) + LFD f23, 15 * SIZE(AO1) + + LFD f28, 13 * SIZE(BO) + LFD f29, 14 * SIZE(BO) + LFD f30, 15 * SIZE(BO) + LFDU f31, 16 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f16, f25, f1 + FMADD f2, f17, f24, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f18, f27, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + FMADD f8, f20, f28, f8 + FMADD f9, f20, f29, f9 + FMADD f10, f21, f28, f10 + FMADD f11, f21, f29, f11 + + FMADD f12, f22, f30, f12 + FMADD f13, f22, f31, f13 + FMADD f14, f23, f30, f14 + FMADD f15, f23, f31, f15 + + addi AO1, AO1, 16 * SIZE + .align 4 + +LL(RemainN3): + andi. r0, MIN_N, 7 + mtspr CTR, r0 + ble LL(RemainFinish) + .align 4 + + LFD f16, 0 * SIZE(AO1) + LFD f17, 1 * SIZE(AO1) + LFD f24, 1 * SIZE(BO) + LFDU f25, 2 * SIZE(BO) + addi AO1, AO1, 2 * SIZE + bdz LL(RemainN3KernelSkip) + .align 4 + +LL(RemainN3Kernel): + FMADD f0, f16, f24, f0 + FMADD f1, f16, f25, f1 + FMADD f2, f17, f24, f2 + FMADD f3, f17, f25, f3 + + LFD f16, 0 * SIZE(AO1) + LFD f17, 1 * SIZE(AO1) + LFD f24, 1 * SIZE(BO) + LFDU f25, 2 * SIZE(BO) + addi AO1, AO1, 2 * SIZE + bdnz LL(RemainN3Kernel) + .align 4 + +LL(RemainN3KernelSkip): + FMADD f0, f16, f24, f0 + FMADD f1, f16, f25, f1 + FMADD f2, f17, f24, f2 + FMADD f3, f17, f25, f3 + .align 4 + +LL(RemainFinish): + lfd f30, ALPHA_R + lfd f31, ALPHA_I + LFD f16, 0 * SIZE(CO) + LFD f17, 1 * SIZE(CO) + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + + FADD f8, f8, f12 + FADD f9, f9, f13 + FADD f10, f10, f14 + FADD f11, f11, f15 + + FADD f0, f0, f8 + FADD f1, f1, f9 + FADD f2, f2, f10 + FADD f3, f3, f11 + +#ifndef XCONJ +#ifndef CONJ + FSUB f0, f0, f3 + FADD f1, f1, f2 +#else + FADD f0, f0, f3 + FSUB f1, f1, f2 +#endif +#else +#ifndef CONJ + FADD f0, f0, f3 + FSUB f1, f2, f1 +#else + FSUB f0, f0, f3 + FADD f1, f1, f2 +#endif +#endif + + FMADD f16, f30, f0, f16 + FMADDR f17, f30, f1, f17 + FMSUBR f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + + STFD f16, 0 * SIZE(CO) + STFD f17, 1 * SIZE(CO) + add CO, CO, INCY + + addi J, J, -1 + cmpi cr0, 0, J, 0 + bgt LL(RemainHead) + .align 4 + +LL(ISEnd): + subf A, PLDA_M, A + addi IS, IS, P + + cmp cr0, 0, IS, M + blt LL(ISLoop) + .align 4 + +LL(End): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r14, 144(SP) + ld r15, 152(SP) + ld r16, 160(SP) + ld r17, 168(SP) + ld r18, 176(SP) + ld r19, 184(SP) + ld r20, 192(SP) + ld r21, 200(SP) + ld r22, 208(SP) + ld r23, 216(SP) + ld r24, 224(SP) + ld r25, 232(SP) +#else + lwz r14, 144(SP) + lwz r15, 148(SP) + lwz r16, 152(SP) + lwz r17, 156(SP) + lwz r18, 160(SP) + lwz r19, 164(SP) + lwz r20, 168(SP) + lwz r21, 172(SP) + lwz r22, 176(SP) + lwz r23, 180(SP) + lwz r24, 184(SP) + lwz r25, 188(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE + +#endif diff --git a/kernel/power/zgemv_t_ppc440.S b/kernel/power/zgemv_t_ppc440.S new file mode 100644 index 0000000..edb5183 --- /dev/null +++ b/kernel/power/zgemv_t_ppc440.S @@ -0,0 +1,1294 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define P 1024 + +#ifndef __64BIT__ +#define STACKSIZE 224 +#else +#define STACKSIZE 304 +#endif + +#ifdef linux +#ifndef __64BIT__ +#define M r3 +#define N r4 +#define A r6 +#define LDA r7 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#else +#define M r3 +#define N r4 +#define A r8 +#define LDA r9 +#define X r10 +#define INCX r5 +#define Y r6 +#define INCY r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define M r3 +#define N r4 +#define A r10 +#define LDA r5 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 +#else +#define M r3 +#define N r4 +#define A r8 +#define LDA r9 +#define X r10 +#define INCX r5 +#define Y r6 +#define INCY r7 +#endif +#endif + +#define BUFFER r11 +#define XP r12 +#define X1 r14 +#define J r15 +#define AO1 r16 +#define AO2 r17 +#define AO3 r18 +#define AO4 r19 +#define PREA r20 +#define PREC r21 +#define YY r22 + +#if defined(PPCG4) +#define PREFETCHSIZE_A (3 * 8) +#define PREFETCHSIZE_C 7 +#endif + +#if defined(POWER6) +#define PREFETCHSIZE_A (3 * 8) +#define PREFETCHSIZE_C 7 +#endif + +#if !(defined(CONJ) && defined(XCONJ)) +#define FMADDR FMADD +#define FMSUBR FNMSUB +#else +#define FMADDR FNMSUB +#define FMSUBR FMADD +#endif + +#ifndef NEEDPARAM + +#ifndef __64BIT__ +#define FZERO 200(SP) +#else +#define FZERO 256(SP) +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r14, 144(SP) + std r15, 152(SP) + std r16, 160(SP) + std r17, 168(SP) + std r18, 176(SP) + std r19, 184(SP) + std r20, 192(SP) + std r21, 200(SP) + std r22, 208(SP) + std r0, FZERO +#else + stw r14, 144(SP) + stw r15, 148(SP) + stw r16, 152(SP) + stw r17, 156(SP) + stw r18, 160(SP) + stw r19, 164(SP) + stw r20, 168(SP) + stw r21, 172(SP) + stw r22, 176(SP) + stw r0, FZERO + stw r0, 4 + FZERO +#endif + +#ifdef linux +#ifndef __64BIT__ + lwz INCY, 8 + STACKSIZE(SP) + lwz BUFFER, 12 + STACKSIZE(SP) +#else + ld INCX, 112 + STACKSIZE(SP) + ld Y, 120 + STACKSIZE(SP) + ld INCY, 128 + STACKSIZE(SP) + ld BUFFER, 136 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifndef __64BIT__ +#ifdef DOUBLE + lwz LDA, 56 + STACKSIZE(SP) + lwz X, 60 + STACKSIZE(SP) + lwz INCX, 64 + STACKSIZE(SP) + lwz Y, 68 + STACKSIZE(SP) + lwz INCY, 72 + STACKSIZE(SP) + lwz BUFFER, 76 + STACKSIZE(SP) +#else + lwz INCX, 56 + STACKSIZE(SP) + lwz Y, 60 + STACKSIZE(SP) + lwz INCY, 64 + STACKSIZE(SP) + lwz BUFFER, 68 + STACKSIZE(SP) +#endif +#else + ld INCX, 112 + STACKSIZE(SP) + ld Y, 120 + STACKSIZE(SP) + ld INCY, 128 + STACKSIZE(SP) + ld BUFFER, 136 + STACKSIZE(SP) +#endif +#endif + +#ifndef XCONJ +#ifndef CONJ +#define FMADD1 FMADD +#define FMADD2 FMADD +#define FMADD3 FNMSUB +#define FMADD4 FMADD +#else +#define FMADD1 FMADD +#define FMADD2 FMADD +#define FMADD3 FMADD +#define FMADD4 FNMSUB +#endif +#else +#ifndef CONJ +#define FMADD1 FMADD +#define FMADD2 FNMSUB +#define FMADD3 FMADD +#define FMADD4 FMADD +#else +#define FMADD1 FMADD +#define FMADD2 FMADD +#define FMADD3 FNMSUB +#define FMADD4 FMADD +#endif +#endif + +#define y1 f0 +#define y2 f1 +#define y3 f2 +#define y4 f3 +#define y5 f4 +#define y6 f5 +#define y7 f6 +#define y8 f7 + +#define a1 f8 +#define a2 f9 +#define a3 f10 +#define a4 f11 +#define a5 f12 +#define a6 f13 +#define a7 f14 +#define a8 f15 + +#define b1 f16 +#define b2 f17 +#define b3 f18 +#define b4 f19 +#define b5 f20 +#define b6 f21 +#define b7 f22 +#define b8 f23 + +#define alpha_r f24 +#define alpha_i f25 + + fmr alpha_r, f1 + fmr alpha_i, f2 + + slwi LDA, LDA, ZBASE_SHIFT + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + + li PREA, PREFETCHSIZE_A * SIZE + li PREC, PREFETCHSIZE_C * SIZE + + addi A, A, -SIZE + addi INCX, INCX, -SIZE + addi INCY, INCY, -SIZE + + sub X, X, INCX + sub Y, Y, INCY + + mr YY, Y + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + + mr XP, X + cmpwi cr0, INCX, SIZE + beq LL(10) + + addi XP, BUFFER, -SIZE + addi X1, BUFFER, -SIZE + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(05) + .align 4 + +LL(02): + LFDUX f0, X, INCX + LFDU f1, 1 * SIZE(X) + LFDUX f2, X, INCX + LFDU f3, 1 * SIZE(X) + LFDUX f4, X, INCX + LFDU f5, 1 * SIZE(X) + LFDUX f6, X, INCX + LFDU f7, 1 * SIZE(X) + + STFDU f0, 1 * SIZE(X1) + STFDU f1, 1 * SIZE(X1) + STFDU f2, 1 * SIZE(X1) + STFDU f3, 1 * SIZE(X1) + STFDU f4, 1 * SIZE(X1) + STFDU f5, 1 * SIZE(X1) + STFDU f6, 1 * SIZE(X1) + STFDU f7, 1 * SIZE(X1) + bdnz LL(02) + .align 4 + +LL(05): + andi. r0, M, 3 + mtspr CTR, r0 + ble LL(10) + .align 4 + +LL(06): + LFDUX f0, X, INCX + LFDU f1, 1 * SIZE(X) + STFDU f0, 1 * SIZE(X1) + STFDU f1, 1 * SIZE(X1) + bdnz LL(06) + .align 4 + +LL(10): + srawi. J, N, 2 + ble LL(20) + .align 4 + +LL(11): + lfd y1, FZERO + mr AO1, A + fmr y2, y1 + mr X1, XP + fmr y3, y1 + add AO2, A, LDA + fmr y4, y1 + add AO3, AO2, LDA + fmr y5, y1 + add AO4, AO3, LDA + fmr y6, y1 + add A, AO4, LDA + fmr y7, y1 + + dcbtst PREC, Y + fmr y8, y1 + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(15) + + LFDU a1, 1 * SIZE(AO1) + LFDU b1, 1 * SIZE(X1) + LFDU a2, 1 * SIZE(AO1) + LFDU b2, 1 * SIZE(X1) + LFDU a3, 1 * SIZE(AO2) + LFDU a4, 1 * SIZE(AO2) + LFDU a5, 1 * SIZE(AO3) + LFDU a6, 1 * SIZE(AO3) + LFDU a7, 1 * SIZE(AO4) + bdz LL(13) + .align 5 + +LL(12): + FMADD1 y1, a1, b1, y1 + LFDU a8, 1 * SIZE(AO4) + FMADD2 y2, a1, b2, y2 + LFDU b3, 1 * SIZE(X1) + FMADD1 y3, a3, b1, y3 + LFDU b4, 1 * SIZE(X1) + FMADD2 y4, a3, b2, y4 + +#ifdef PPCG4 + dcbt AO1, PREA +#endif + + FMADD3 y1, a2, b2, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b1, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b2, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b1, y4 + LFDU a4, 1 * SIZE(AO2) + +#ifdef PPCG4 + dcbt X1, PREA +#endif + + FMADD1 y5, a5, b1, y5 + FMADD2 y6, a5, b2, y6 + FMADD1 y7, a7, b1, y7 + FMADD2 y8, a7, b2, y8 + +#ifdef PPCG4 + dcbt AO2, PREA +#endif + + FMADD3 y5, a6, b2, y5 + LFDU a5, 1 * SIZE(AO3) + FMADD4 y6, a6, b1, y6 + LFDU a6, 1 * SIZE(AO3) + FMADD3 y7, a8, b2, y7 + LFDU a7, 1 * SIZE(AO4) + FMADD4 y8, a8, b1, y8 + LFDU a8, 1 * SIZE(AO4) + + + FMADD1 y1, a1, b3, y1 + LFDU b1, 1 * SIZE(X1) + FMADD2 y2, a1, b4, y2 + LFDU b2, 1 * SIZE(X1) + FMADD1 y3, a3, b3, y3 + FMADD2 y4, a3, b4, y4 + +#ifdef PPCG4 + dcbt AO3, PREA +#endif + + FMADD3 y1, a2, b4, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b3, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b4, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b3, y4 + LFDU a4, 1 * SIZE(AO2) + + FMADD1 y5, a5, b3, y5 + FMADD2 y6, a5, b4, y6 + FMADD1 y7, a7, b3, y7 + FMADD2 y8, a7, b4, y8 + +#ifdef PPCG4 + dcbt AO4, PREA +#endif + + FMADD3 y5, a6, b4, y5 + LFDU a5, 1 * SIZE(AO3) + FMADD4 y6, a6, b3, y6 + LFDU a6, 1 * SIZE(AO3) + FMADD3 y7, a8, b4, y7 + LFDU a7, 1 * SIZE(AO4) + FMADD4 y8, a8, b3, y8 + LFDU a8, 1 * SIZE(AO4) + + FMADD1 y1, a1, b1, y1 + LFDU b3, 1 * SIZE(X1) + FMADD2 y2, a1, b2, y2 + LFDU b4, 1 * SIZE(X1) + FMADD1 y3, a3, b1, y3 + FMADD2 y4, a3, b2, y4 + +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO1, PREA +#endif + + FMADD3 y1, a2, b2, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b1, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b2, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b1, y4 + LFDU a4, 1 * SIZE(AO2) + +#if defined(PPCG4) && defined(DOUBLE) + dcbt X1, PREA +#endif + + FMADD1 y5, a5, b1, y5 + FMADD2 y6, a5, b2, y6 + FMADD1 y7, a7, b1, y7 + FMADD2 y8, a7, b2, y8 + +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO2, PREA +#endif + + FMADD3 y5, a6, b2, y5 + LFDU a5, 1 * SIZE(AO3) + FMADD4 y6, a6, b1, y6 + LFDU a6, 1 * SIZE(AO3) + FMADD3 y7, a8, b2, y7 + LFDU a7, 1 * SIZE(AO4) + FMADD4 y8, a8, b1, y8 + LFDU a8, 1 * SIZE(AO4) + + FMADD1 y1, a1, b3, y1 + FMADD2 y2, a1, b4, y2 + FMADD1 y3, a3, b3, y3 + FMADD2 y4, a3, b4, y4 + +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO3, PREA +#endif + + FMADD3 y1, a2, b4, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b3, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b4, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b3, y4 + LFDU a4, 1 * SIZE(AO2) + + FMADD1 y5, a5, b3, y5 + LFDU b1, 1 * SIZE(X1) + FMADD2 y6, a5, b4, y6 + LFDU b2, 1 * SIZE(X1) + FMADD1 y7, a7, b3, y7 + FMADD2 y8, a7, b4, y8 + +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO4, PREA +#endif + + FMADD3 y5, a6, b4, y5 + LFDU a5, 1 * SIZE(AO3) + FMADD4 y6, a6, b3, y6 + LFDU a6, 1 * SIZE(AO3) + FMADD3 y7, a8, b4, y7 + LFDU a7, 1 * SIZE(AO4) + FMADD4 y8, a8, b3, y8 + bdnz LL(12) + .align 4 + +LL(13): + FMADD1 y1, a1, b1, y1 + LFDU a8, 1 * SIZE(AO4) + FMADD2 y2, a1, b2, y2 + LFDU b3, 1 * SIZE(X1) + FMADD1 y3, a3, b1, y3 + LFDU b4, 1 * SIZE(X1) + FMADD2 y4, a3, b2, y4 + + FMADD3 y1, a2, b2, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b1, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b2, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b1, y4 + LFDU a4, 1 * SIZE(AO2) + + FMADD1 y5, a5, b1, y5 + FMADD2 y6, a5, b2, y6 + FMADD1 y7, a7, b1, y7 + FMADD2 y8, a7, b2, y8 + + FMADD3 y5, a6, b2, y5 + LFDU a5, 1 * SIZE(AO3) + FMADD4 y6, a6, b1, y6 + LFDU a6, 1 * SIZE(AO3) + FMADD3 y7, a8, b2, y7 + LFDU a7, 1 * SIZE(AO4) + FMADD4 y8, a8, b1, y8 + LFDU a8, 1 * SIZE(AO4) + + FMADD1 y1, a1, b3, y1 + LFDU b1, 1 * SIZE(X1) + FMADD2 y2, a1, b4, y2 + LFDU b2, 1 * SIZE(X1) + FMADD1 y3, a3, b3, y3 + FMADD2 y4, a3, b4, y4 + + FMADD3 y1, a2, b4, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b3, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b4, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b3, y4 + LFDU a4, 1 * SIZE(AO2) + + FMADD1 y5, a5, b3, y5 + FMADD2 y6, a5, b4, y6 + FMADD1 y7, a7, b3, y7 + FMADD2 y8, a7, b4, y8 + + FMADD3 y5, a6, b4, y5 + LFDU a5, 1 * SIZE(AO3) + FMADD4 y6, a6, b3, y6 + LFDU a6, 1 * SIZE(AO3) + FMADD3 y7, a8, b4, y7 + LFDU a7, 1 * SIZE(AO4) + FMADD4 y8, a8, b3, y8 + LFDU a8, 1 * SIZE(AO4) + + FMADD1 y1, a1, b1, y1 + LFDU b3, 1 * SIZE(X1) + FMADD2 y2, a1, b2, y2 + LFDU b4, 1 * SIZE(X1) + FMADD1 y3, a3, b1, y3 + FMADD2 y4, a3, b2, y4 + + FMADD3 y1, a2, b2, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b1, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b2, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b1, y4 + LFDU a4, 1 * SIZE(AO2) + + FMADD1 y5, a5, b1, y5 + FMADD2 y6, a5, b2, y6 + FMADD1 y7, a7, b1, y7 + FMADD2 y8, a7, b2, y8 + + FMADD3 y5, a6, b2, y5 + LFDU a5, 1 * SIZE(AO3) + FMADD4 y6, a6, b1, y6 + LFDU a6, 1 * SIZE(AO3) + FMADD3 y7, a8, b2, y7 + LFDU a7, 1 * SIZE(AO4) + FMADD4 y8, a8, b1, y8 + LFDU a8, 1 * SIZE(AO4) + + FMADD1 y1, a1, b3, y1 + FMADD2 y2, a1, b4, y2 + FMADD1 y3, a3, b3, y3 + FMADD2 y4, a3, b4, y4 + + FMADD3 y1, a2, b4, y1 + FMADD4 y2, a2, b3, y2 + FMADD3 y3, a4, b4, y3 + FMADD4 y4, a4, b3, y4 + + FMADD1 y5, a5, b3, y5 + FMADD2 y6, a5, b4, y6 + FMADD1 y7, a7, b3, y7 + FMADD2 y8, a7, b4, y8 + + FMADD3 y5, a6, b4, y5 + FMADD4 y6, a6, b3, y6 + FMADD3 y7, a8, b4, y7 + FMADD4 y8, a8, b3, y8 + .align 4 + +LL(15): + andi. r0, M, 2 + ble LL(17) + + LFDU a1, 1 * SIZE(AO1) + LFDU b1, 1 * SIZE(X1) + LFDU a2, 1 * SIZE(AO1) + LFDU b2, 1 * SIZE(X1) + LFDU a3, 1 * SIZE(AO2) + LFDU b3, 1 * SIZE(X1) + LFDU a4, 1 * SIZE(AO2) + LFDU b4, 1 * SIZE(X1) + + FMADD1 y1, a1, b1, y1 + LFDU a5, 1 * SIZE(AO3) + FMADD2 y2, a1, b2, y2 + LFDU a6, 1 * SIZE(AO3) + FMADD1 y3, a3, b1, y3 + LFDU a7, 1 * SIZE(AO4) + FMADD2 y4, a3, b2, y4 + LFDU a8, 1 * SIZE(AO4) + + FMADD3 y1, a2, b2, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b1, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b2, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b1, y4 + LFDU a4, 1 * SIZE(AO2) + + FMADD1 y5, a5, b1, y5 + FMADD2 y6, a5, b2, y6 + FMADD1 y7, a7, b1, y7 + FMADD2 y8, a7, b2, y8 + + FMADD3 y5, a6, b2, y5 + LFDU a5, 1 * SIZE(AO3) + FMADD4 y6, a6, b1, y6 + LFDU a6, 1 * SIZE(AO3) + FMADD3 y7, a8, b2, y7 + LFDU a7, 1 * SIZE(AO4) + FMADD4 y8, a8, b1, y8 + LFDU a8, 1 * SIZE(AO4) + + FMADD1 y1, a1, b3, y1 + FMADD2 y2, a1, b4, y2 + FMADD1 y3, a3, b3, y3 + FMADD2 y4, a3, b4, y4 + + FMADD3 y1, a2, b4, y1 + FMADD4 y2, a2, b3, y2 + FMADD3 y3, a4, b4, y3 + FMADD4 y4, a4, b3, y4 + + FMADD1 y5, a5, b3, y5 + FMADD2 y6, a5, b4, y6 + FMADD1 y7, a7, b3, y7 + FMADD2 y8, a7, b4, y8 + + FMADD3 y5, a6, b4, y5 + FMADD4 y6, a6, b3, y6 + FMADD3 y7, a8, b4, y7 + FMADD4 y8, a8, b3, y8 + .align 4 + +LL(17): + andi. r0, M, 1 + ble LL(19) + + LFDU a1, 1 * SIZE(AO1) + LFDU a2, 1 * SIZE(AO1) + LFDU a3, 1 * SIZE(AO2) + LFDU a4, 1 * SIZE(AO2) + LFDU a5, 1 * SIZE(AO3) + LFDU a6, 1 * SIZE(AO3) + LFDU a7, 1 * SIZE(AO4) + LFDU a8, 1 * SIZE(AO4) + + LFDU b1, 1 * SIZE(X1) + LFDU b2, 1 * SIZE(X1) + + FMADD1 y1, a1, b1, y1 + FMADD2 y2, a1, b2, y2 + FMADD1 y3, a3, b1, y3 + FMADD2 y4, a3, b2, y4 + + FMADD3 y1, a2, b2, y1 + FMADD4 y2, a2, b1, y2 + FMADD3 y3, a4, b2, y3 + FMADD4 y4, a4, b1, y4 + + FMADD1 y5, a5, b1, y5 + FMADD2 y6, a5, b2, y6 + FMADD1 y7, a7, b1, y7 + FMADD2 y8, a7, b2, y8 + + FMADD3 y5, a6, b2, y5 + FMADD4 y6, a6, b1, y6 + FMADD3 y7, a8, b2, y7 + FMADD4 y8, a8, b1, y8 + .align 4 + +LL(19): + LFDUX b1, Y, INCY + LFDU b2, 1 * SIZE(Y) + LFDUX b3, Y, INCY + LFDU b4, 1 * SIZE(Y) + LFDUX b5, Y, INCY + LFDU b6, 1 * SIZE(Y) + LFDUX b7, Y, INCY + LFDU b8, 1 * SIZE(Y) + + FMADD b1, alpha_r, y1, b1 + FMADDR b2, alpha_r, y2, b2 + FMADD b3, alpha_r, y3, b3 + FMADDR b4, alpha_r, y4, b4 + + FMADD b5, alpha_r, y5, b5 + FMADDR b6, alpha_r, y6, b6 + FMADD b7, alpha_r, y7, b7 + FMADDR b8, alpha_r, y8, b8 + + FMSUBR b1, alpha_i, y2, b1 + FMADD b2, alpha_i, y1, b2 + FMSUBR b3, alpha_i, y4, b3 + FMADD b4, alpha_i, y3, b4 + + FMSUBR b5, alpha_i, y6, b5 + FMADD b6, alpha_i, y5, b6 + FMSUBR b7, alpha_i, y8, b7 + FMADD b8, alpha_i, y7, b8 + + STFDUX b1, YY, INCY + STFDU b2, 1 * SIZE(YY) + STFDUX b3, YY, INCY + STFDU b4, 1 * SIZE(YY) + + STFDUX b5, YY, INCY + STFDU b6, 1 * SIZE(YY) + STFDUX b7, YY, INCY + STFDU b8, 1 * SIZE(YY) + + addi J, J, -1 + cmpwi cr0, J, 0 + bgt LL(11) + .align 4 + +LL(20): + andi. J, N, 2 + ble LL(30) + + lfd y1, FZERO + mr AO1, A + fmr y2, y1 + mr X1, XP + fmr y3, y1 + add AO2, A, LDA + fmr y4, y1 + add A, AO2, LDA + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(25) + + LFDU a1, 1 * SIZE(AO1) + LFDU b1, 1 * SIZE(X1) + LFDU a2, 1 * SIZE(AO1) + LFDU b2, 1 * SIZE(X1) + LFDU a3, 1 * SIZE(AO2) + bdz LL(23) + .align 5 + +LL(22): + FMADD1 y1, a1, b1, y1 + LFDU a4, 1 * SIZE(AO2) + FMADD2 y2, a1, b2, y2 + LFDU b3, 1 * SIZE(X1) + FMADD1 y3, a3, b1, y3 + LFDU b4, 1 * SIZE(X1) + FMADD2 y4, a3, b2, y4 + +#ifdef PPCG4 + dcbt AO1, PREA +#endif + + FMADD3 y1, a2, b2, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b1, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b2, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b1, y4 + LFDU a4, 1 * SIZE(AO2) + +#ifdef PPCG4 + dcbt AO2, PREA +#endif + + FMADD1 y1, a1, b3, y1 + LFDU b1, 1 * SIZE(X1) + FMADD2 y2, a1, b4, y2 + LFDU b2, 1 * SIZE(X1) + FMADD1 y3, a3, b3, y3 + FMADD2 y4, a3, b4, y4 + +#ifdef PPCG4 + dcbt X1, PREA +#endif + + FMADD3 y1, a2, b4, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b3, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b4, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b3, y4 + LFDU a4, 1 * SIZE(AO2) + + FMADD1 y1, a1, b1, y1 + LFDU b3, 1 * SIZE(X1) + FMADD2 y2, a1, b2, y2 + LFDU b4, 1 * SIZE(X1) + FMADD1 y3, a3, b1, y3 + FMADD2 y4, a3, b2, y4 + +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO1, PREA +#endif + + FMADD3 y1, a2, b2, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b1, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b2, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b1, y4 + LFDU a4, 1 * SIZE(AO2) + +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO2, PREA +#endif + + FMADD1 y1, a1, b3, y1 + LFDU b1, 1 * SIZE(X1) + FMADD2 y2, a1, b4, y2 + LFDU b2, 1 * SIZE(X1) + FMADD1 y3, a3, b3, y3 + FMADD2 y4, a3, b4, y4 + +#if defined(PPCG4) && defined(DOUBLE) + dcbt X1, PREA +#endif + + FMADD3 y1, a2, b4, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b3, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b4, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b3, y4 + + bdnz LL(22) + .align 4 + +LL(23): + FMADD1 y1, a1, b1, y1 + LFDU a4, 1 * SIZE(AO2) + FMADD2 y2, a1, b2, y2 + LFDU b3, 1 * SIZE(X1) + FMADD1 y3, a3, b1, y3 + LFDU b4, 1 * SIZE(X1) + FMADD2 y4, a3, b2, y4 + + FMADD3 y1, a2, b2, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b1, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b2, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b1, y4 + LFDU a4, 1 * SIZE(AO2) + + FMADD1 y1, a1, b3, y1 + LFDU b1, 1 * SIZE(X1) + FMADD2 y2, a1, b4, y2 + LFDU b2, 1 * SIZE(X1) + FMADD1 y3, a3, b3, y3 + FMADD2 y4, a3, b4, y4 + + FMADD3 y1, a2, b4, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b3, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b4, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b3, y4 + LFDU a4, 1 * SIZE(AO2) + + FMADD1 y1, a1, b1, y1 + LFDU b3, 1 * SIZE(X1) + FMADD2 y2, a1, b2, y2 + LFDU b4, 1 * SIZE(X1) + FMADD1 y3, a3, b1, y3 + FMADD2 y4, a3, b2, y4 + + FMADD3 y1, a2, b2, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b1, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b2, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b1, y4 + LFDU a4, 1 * SIZE(AO2) + + FMADD1 y1, a1, b3, y1 + FMADD2 y2, a1, b4, y2 + FMADD1 y3, a3, b3, y3 + FMADD2 y4, a3, b4, y4 + + FMADD3 y1, a2, b4, y1 + FMADD4 y2, a2, b3, y2 + FMADD3 y3, a4, b4, y3 + FMADD4 y4, a4, b3, y4 + .align 4 + +LL(25): + andi. r0, M, 2 + ble LL(27) + + LFDU a1, 1 * SIZE(AO1) + LFDU b1, 1 * SIZE(X1) + LFDU a2, 1 * SIZE(AO1) + LFDU b2, 1 * SIZE(X1) + LFDU a3, 1 * SIZE(AO2) + LFDU b3, 1 * SIZE(X1) + LFDU a4, 1 * SIZE(AO2) + LFDU b4, 1 * SIZE(X1) + + FMADD1 y1, a1, b1, y1 + FMADD2 y2, a1, b2, y2 + FMADD1 y3, a3, b1, y3 + FMADD2 y4, a3, b2, y4 + + FMADD3 y1, a2, b2, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b1, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b2, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b1, y4 + LFDU a4, 1 * SIZE(AO2) + + FMADD1 y1, a1, b3, y1 + FMADD2 y2, a1, b4, y2 + FMADD1 y3, a3, b3, y3 + FMADD2 y4, a3, b4, y4 + + FMADD3 y1, a2, b4, y1 + FMADD4 y2, a2, b3, y2 + FMADD3 y3, a4, b4, y3 + FMADD4 y4, a4, b3, y4 + .align 4 + +LL(27): + andi. r0, M, 1 + ble LL(29) + + LFDU a1, 1 * SIZE(AO1) + LFDU a2, 1 * SIZE(AO1) + LFDU a3, 1 * SIZE(AO2) + LFDU a4, 1 * SIZE(AO2) + + LFDU b1, 1 * SIZE(X1) + LFDU b2, 1 * SIZE(X1) + + FMADD1 y1, a1, b1, y1 + FMADD2 y2, a1, b2, y2 + FMADD1 y3, a3, b1, y3 + FMADD2 y4, a3, b2, y4 + + FMADD3 y1, a2, b2, y1 + FMADD4 y2, a2, b1, y2 + FMADD3 y3, a4, b2, y3 + FMADD4 y4, a4, b1, y4 + .align 4 + +LL(29): + LFDUX b1, Y, INCY + LFDU b2, 1 * SIZE(Y) + LFDUX b3, Y, INCY + LFDU b4, 1 * SIZE(Y) + + FMADD b1, alpha_r, y1, b1 + FMADDR b2, alpha_r, y2, b2 + FMADD b3, alpha_r, y3, b3 + FMADDR b4, alpha_r, y4, b4 + + FMSUBR b1, alpha_i, y2, b1 + FMADD b2, alpha_i, y1, b2 + FMSUBR b3, alpha_i, y4, b3 + FMADD b4, alpha_i, y3, b4 + + STFDUX b1, YY, INCY + STFDU b2, 1 * SIZE(YY) + STFDUX b3, YY, INCY + STFDU b4, 1 * SIZE(YY) + .align 4 + +LL(30): + andi. J, N, 1 + ble LL(999) + + lfd y1, FZERO + mr AO1, A + fmr y2, y1 + mr X1, XP + fmr y3, y1 + fmr y4, y1 + add A, A, LDA + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(35) + + LFDU a1, 1 * SIZE(AO1) + LFDU b1, 1 * SIZE(X1) + LFDU a2, 1 * SIZE(AO1) + LFDU b2, 1 * SIZE(X1) + bdz LL(33) + .align 5 + +LL(32): + FMADD1 y1, a1, b1, y1 + LFDU b3, 1 * SIZE(X1) + FMADD2 y2, a1, b2, y2 + LFDU b4, 1 * SIZE(X1) + +#ifdef PPCG4 + dcbt AO1, PREA +#endif + + FMADD3 y3, a2, b2, y3 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y4, a2, b1, y4 + LFDU a2, 1 * SIZE(AO1) + + FMADD1 y1, a1, b3, y1 + LFDU b1, 1 * SIZE(X1) + FMADD2 y2, a1, b4, y2 + LFDU b2, 1 * SIZE(X1) + +#ifdef PPCG4 + dcbt X1, PREA +#endif + + FMADD3 y3, a2, b4, y3 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y4, a2, b3, y4 + LFDU a2, 1 * SIZE(AO1) + + FMADD1 y1, a1, b1, y1 + LFDU b3, 1 * SIZE(X1) + FMADD2 y2, a1, b2, y2 + LFDU b4, 1 * SIZE(X1) + +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO1, PREA +#endif + + FMADD3 y3, a2, b2, y3 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y4, a2, b1, y4 + LFDU a2, 1 * SIZE(AO1) + + FMADD1 y1, a1, b3, y1 + LFDU b1, 1 * SIZE(X1) + FMADD2 y2, a1, b4, y2 + LFDU b2, 1 * SIZE(X1) + +#if defined(PPCG4) && defined(DOUBLE) + dcbt X1, PREA +#endif + + FMADD3 y3, a2, b4, y3 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y4, a2, b3, y4 + LFDU a2, 1 * SIZE(AO1) + + bdnz LL(32) + .align 4 + +LL(33): + FMADD1 y1, a1, b1, y1 + LFDU b3, 1 * SIZE(X1) + FMADD2 y2, a1, b2, y2 + LFDU b4, 1 * SIZE(X1) + + FMADD3 y3, a2, b2, y3 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y4, a2, b1, y4 + LFDU a2, 1 * SIZE(AO1) + + FMADD1 y1, a1, b3, y1 + LFDU b1, 1 * SIZE(X1) + FMADD2 y2, a1, b4, y2 + LFDU b2, 1 * SIZE(X1) + + FMADD3 y3, a2, b4, y3 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y4, a2, b3, y4 + LFDU a2, 1 * SIZE(AO1) + + FMADD1 y1, a1, b1, y1 + LFDU b3, 1 * SIZE(X1) + FMADD2 y2, a1, b2, y2 + LFDU b4, 1 * SIZE(X1) + + FMADD3 y3, a2, b2, y3 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y4, a2, b1, y4 + LFDU a2, 1 * SIZE(AO1) + + FMADD1 y1, a1, b3, y1 + FMADD2 y2, a1, b4, y2 + FMADD3 y3, a2, b4, y3 + FMADD4 y4, a2, b3, y4 + .align 4 + +LL(35): + andi. r0, M, 2 + ble LL(37) + + LFDU a1, 1 * SIZE(AO1) + LFDU b1, 1 * SIZE(X1) + LFDU a2, 1 * SIZE(AO1) + LFDU b2, 1 * SIZE(X1) + + FMADD1 y1, a1, b1, y1 + LFDU b3, 1 * SIZE(X1) + FMADD2 y2, a1, b2, y2 + LFDU a3, 1 * SIZE(AO1) + FMADD3 y3, a2, b2, y3 + LFDU b4, 1 * SIZE(X1) + FMADD4 y4, a2, b1, y4 + LFDU a4, 1 * SIZE(AO1) + + FMADD1 y1, a3, b3, y1 + FMADD2 y2, a3, b4, y2 + FMADD3 y3, a4, b4, y3 + FMADD4 y4, a4, b3, y4 + .align 4 + +LL(37): + andi. r0, M, 1 + ble LL(39) + + LFDU a1, 1 * SIZE(AO1) + LFDU b1, 1 * SIZE(X1) + LFDU a2, 1 * SIZE(AO1) + LFDU b2, 1 * SIZE(X1) + + FMADD1 y1, a1, b1, y1 + FMADD2 y2, a1, b2, y2 + FMADD3 y3, a2, b2, y3 + FMADD4 y4, a2, b1, y4 + .align 4 + +LL(39): + LFDUX b1, Y, INCY + LFDU b2, 1 * SIZE(Y) + + FADD y1, y1, y3 + FADD y2, y2, y4 + + FMADD b1, alpha_r, y1, b1 + FMADDR b2, alpha_r, y2, b2 + FMSUBR b1, alpha_i, y2, b1 + FMADD b2, alpha_i, y1, b2 + + STFDUX b1, YY, INCY + STFDU b2, 1 * SIZE(YY) + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r14, 144(SP) + ld r15, 152(SP) + ld r16, 160(SP) + ld r17, 168(SP) + ld r18, 176(SP) + ld r19, 184(SP) + ld r20, 192(SP) + ld r21, 200(SP) + ld r22, 208(SP) +#else + lwz r14, 144(SP) + lwz r15, 148(SP) + lwz r16, 152(SP) + lwz r17, 156(SP) + lwz r18, 160(SP) + lwz r19, 164(SP) + lwz r20, 168(SP) + lwz r21, 172(SP) + lwz r22, 176(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE + +#endif diff --git a/kernel/power/zger.S b/kernel/power/zger.S new file mode 100644 index 0000000..03d0bca --- /dev/null +++ b/kernel/power/zger.S @@ -0,0 +1,1357 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef NEEDPARAM +#ifndef DOUBLE +#include "cparam.h" +#else +#include "zparam.h" +#endif +#endif + +#ifdef linux +#ifndef __64BIT__ +#define M r3 +#define N r4 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 +#define A r10 +#define LDA r5 +#else +#define M r3 +#define N r4 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#define A r6 +#define LDA r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define M r3 +#define N r4 +#define X r10 +#define INCX r5 +#define Y r6 +#define INCY r7 +#define A r8 +#define LDA r9 +#else +#define M r3 +#define N r4 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#define A r6 +#define LDA r7 +#endif +#endif + +#define I r11 +#define J r12 + +#define AO1 r14 +#define AO2 r15 +#define AO3 r16 +#define AO4 r17 +#define AO5 r18 +#define AO6 r19 +#define AO7 r20 +#define AO8 r21 + +#define X1 r22 +#define PREA r23 +#define PREC r24 +#define XX r25 +#define BUFFER r26 + +#define y01 f0 +#define y02 f1 +#define y03 f2 +#define y04 f3 +#define y05 f4 +#define y06 f5 +#define y07 f6 +#define y08 f7 + +#define alpha1_r f8 +#define alpha1_i f9 +#define alpha2_r f10 +#define alpha2_i f11 + +#define a1 f12 +#define a2 f13 +#define a3 f14 +#define a4 f15 +#define a5 f16 +#define a6 f17 +#define a7 f18 +#define a8 f19 +#define a9 f20 +#define a10 f21 +#define a11 f22 +#define a12 f23 +#define a13 f24 +#define a14 f25 +#define a15 f26 +#define a16 f27 + +#define alpha_r f30 +#define alpha_i f31 + +#ifndef CONJ +#define FMA1 FNMSUB +#define FMA2 FMADD +#else +#define FMA1 FMADD +#define FMA2 FNMSUB +#endif + +#if defined(PPC440) || defined(PPC440FP2) +#define PREFETCHSIZE_A 24 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef PPC970 +#define PREFETCHSIZE_A 16 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef POWER4 +#define PREFETCHSIZE_A 16 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef POWER5 +#define PREFETCHSIZE_A 16 +#define PREFETCHSIZE_C 16 +#endif + +#ifndef NEEDPARAM + +#ifndef __64BIT__ +#define STACKSIZE 224 +#else +#define STACKSIZE 280 +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r14, 144(SP) + std r15, 152(SP) + std r16, 160(SP) + std r17, 168(SP) + std r18, 176(SP) + std r19, 184(SP) + std r20, 192(SP) + std r21, 200(SP) + std r22, 208(SP) + std r23, 216(SP) + std r24, 224(SP) + std r25, 232(SP) + std r26, 240(SP) + std r27, 248(SP) +#else + stw r14, 144(SP) + stw r15, 148(SP) + stw r16, 152(SP) + stw r17, 156(SP) + stw r18, 160(SP) + stw r19, 164(SP) + stw r20, 168(SP) + stw r21, 172(SP) + stw r22, 176(SP) + stw r23, 180(SP) + stw r24, 184(SP) + stw r25, 188(SP) + stw r26, 192(SP) + stw r27, 196(SP) +#endif + +#ifdef linux +#ifndef __64BIT__ + lwz LDA, 8 + STACKSIZE(SP) + lwz BUFFER, 12 + STACKSIZE(SP) +#else + ld INCY, 112 + STACKSIZE(SP) + ld A, 120 + STACKSIZE(SP) + ld LDA, 128 + STACKSIZE(SP) + ld BUFFER, 136 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifndef __64BIT__ +#ifdef DOUBLE + lwz INCX, 56 + STACKSIZE(SP) + lwz Y, 60 + STACKSIZE(SP) + lwz INCY, 64 + STACKSIZE(SP) + lwz A, 68 + STACKSIZE(SP) + lwz LDA, 72 + STACKSIZE(SP) + lwz BUFFER, 76 + STACKSIZE(SP) +#else + lwz INCY, 56 + STACKSIZE(SP) + lwz A, 60 + STACKSIZE(SP) + lwz LDA, 64 + STACKSIZE(SP) + lwz BUFFER, 68 + STACKSIZE(SP) +#endif +#else + ld INCY, 112 + STACKSIZE(SP) + ld A, 120 + STACKSIZE(SP) + ld LDA, 128 + STACKSIZE(SP) + ld BUFFER, 136 + STACKSIZE(SP) +#endif +#endif + + fmr alpha_r, f1 + fmr alpha_i, f2 + + slwi LDA, LDA, ZBASE_SHIFT + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + + li PREA, PREFETCHSIZE_A * SIZE + li PREC, PREFETCHSIZE_C * SIZE + + cmpwi cr0, M, 0 + ble- LL(999) + + cmpwi cr0, N, 0 + ble- LL(999) + + mr XX, X + + cmpi cr0, 0, INCX, 2 * SIZE + beq LL(10) + + mr XX, BUFFER + mr X1, BUFFER + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(05) + .align 4 + +LL(01): + LFD a1, 0 * SIZE(X) + LFD a2, 1 * SIZE(X) + add X, X, INCX + LFD a3, 0 * SIZE(X) + LFD a4, 1 * SIZE(X) + add X, X, INCX + LFD a5, 0 * SIZE(X) + LFD a6, 1 * SIZE(X) + add X, X, INCX + LFD a7, 0 * SIZE(X) + LFD a8, 1 * SIZE(X) + add X, X, INCX + + STFD a1, 0 * SIZE(X1) + STFD a2, 1 * SIZE(X1) + STFD a3, 2 * SIZE(X1) + STFD a4, 3 * SIZE(X1) + STFD a5, 4 * SIZE(X1) + STFD a6, 5 * SIZE(X1) + STFD a7, 6 * SIZE(X1) + STFD a8, 7 * SIZE(X1) + + addi X1, X1, 8 * SIZE + bdnz+ LL(01) + .align 4 + +LL(05): + andi. r0, M, 7 + mtspr CTR, r0 + ble LL(10) + .align 4 + +LL(06): + LFD a1, 0 * SIZE(X) + LFD a2, 1 * SIZE(X) + STFD a1, 0 * SIZE(X1) + STFD a2, 1 * SIZE(X1) + + add X, X, INCX + addi X1, X1, 2 * SIZE + bdnz+ LL(06) + .align 4 + +LL(10): + srawi. J, N, 1 + ble LL(20) + .align 4 + +LL(11): + LFD alpha1_r, 0 * SIZE(Y) + LFD alpha1_i, 1 * SIZE(Y) + add Y, Y, INCY + LFD alpha2_r, 0 * SIZE(Y) + LFD alpha2_i, 1 * SIZE(Y) + add Y, Y, INCY + + FMUL a1, alpha_r, alpha1_r + FMUL a2, alpha_i, alpha1_r + FMUL a3, alpha_r, alpha2_r + FMUL a4, alpha_i, alpha2_r + + FMA1 alpha1_r, alpha_i, alpha1_i, a1 + FMA2 alpha1_i, alpha_r, alpha1_i, a2 + FMA1 alpha2_r, alpha_i, alpha2_i, a3 + FMA2 alpha2_i, alpha_r, alpha2_i, a4 + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + mr X1, XX + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(15) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + LFD y03, 2 * SIZE(X1) + LFD y04, 3 * SIZE(X1) + LFD y05, 4 * SIZE(X1) + LFD y06, 5 * SIZE(X1) + LFD y07, 6 * SIZE(X1) + LFD y08, 7 * SIZE(X1) + + LFD a9, 0 * SIZE(AO2) + LFD a10, 1 * SIZE(AO2) + LFD a11, 2 * SIZE(AO2) + LFD a12, 3 * SIZE(AO2) + LFD a13, 4 * SIZE(AO2) + LFD a14, 5 * SIZE(AO2) + LFD a15, 6 * SIZE(AO2) + LFD a16, 7 * SIZE(AO2) + + bdz LL(13) + .align 4 + +LL(12): + FMADD a1, alpha1_r, y01, a1 + FMADD a2, alpha1_r, y02, a2 + FMADD a3, alpha1_r, y03, a3 + FMADD a4, alpha1_r, y04, a4 + + FMADD a5, alpha1_r, y05, a5 + FMADD a6, alpha1_r, y06, a6 + FMADD a7, alpha1_r, y07, a7 + FMADD a8, alpha1_r, y08, a8 + + FMADD a9, alpha2_r, y01, a9 + FMADD a10, alpha2_r, y02, a10 + FMADD a11, alpha2_r, y03, a11 + FMADD a12, alpha2_r, y04, a12 + + FMADD a13, alpha2_r, y05, a13 + FMADD a14, alpha2_r, y06, a14 + FMADD a15, alpha2_r, y07, a15 + FMADD a16, alpha2_r, y08, a16 + + FNMSUB a1, alpha1_i, y02, a1 + FMADD a2, alpha1_i, y01, a2 + FNMSUB a3, alpha1_i, y04, a3 + FMADD a4, alpha1_i, y03, a4 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + + LFD a1, 8 * SIZE(AO1) + LFD a2, 9 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + + FNMSUB a5, alpha1_i, y06, a5 + FMADD a6, alpha1_i, y05, a6 + FNMSUB a7, alpha1_i, y08, a7 + FMADD a8, alpha1_i, y07, a8 + + STFD a5, 4 * SIZE(AO1) + STFD a6, 5 * SIZE(AO1) + STFD a7, 6 * SIZE(AO1) + STFD a8, 7 * SIZE(AO1) + + LFD a5, 12 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + FNMSUB a9, alpha2_i, y02, a9 + FMADD a10, alpha2_i, y01, a10 + FNMSUB a11, alpha2_i, y04, a11 + FMADD a12, alpha2_i, y03, a12 + + LFD y01, 8 * SIZE(X1) + LFD y02, 9 * SIZE(X1) + LFD y03, 10 * SIZE(X1) + LFD y04, 11 * SIZE(X1) + + STFD a9, 0 * SIZE(AO2) + STFD a10, 1 * SIZE(AO2) + STFD a11, 2 * SIZE(AO2) + STFD a12, 3 * SIZE(AO2) + + LFD a9, 8 * SIZE(AO2) + LFD a10, 9 * SIZE(AO2) + LFD a11, 10 * SIZE(AO2) + LFD a12, 11 * SIZE(AO2) + + FNMSUB a13, alpha2_i, y06, a13 + FMADD a14, alpha2_i, y05, a14 + FNMSUB a15, alpha2_i, y08, a15 + FMADD a16, alpha2_i, y07, a16 + + LFD y05, 12 * SIZE(X1) + LFD y06, 13 * SIZE(X1) + LFD y07, 14 * SIZE(X1) + LFD y08, 15 * SIZE(X1) + + STFD a13, 4 * SIZE(AO2) + STFD a14, 5 * SIZE(AO2) + STFD a15, 6 * SIZE(AO2) + STFD a16, 7 * SIZE(AO2) + + LFD a13, 12 * SIZE(AO2) + LFD a14, 13 * SIZE(AO2) + LFD a15, 14 * SIZE(AO2) + LFD a16, 15 * SIZE(AO2) + + FMADD a1, alpha1_r, y01, a1 + FMADD a2, alpha1_r, y02, a2 + FMADD a3, alpha1_r, y03, a3 + FMADD a4, alpha1_r, y04, a4 + + FMADD a5, alpha1_r, y05, a5 + FMADD a6, alpha1_r, y06, a6 + FMADD a7, alpha1_r, y07, a7 + FMADD a8, alpha1_r, y08, a8 + + FMADD a9, alpha2_r, y01, a9 + FMADD a10, alpha2_r, y02, a10 + FMADD a11, alpha2_r, y03, a11 + FMADD a12, alpha2_r, y04, a12 + + FMADD a13, alpha2_r, y05, a13 + FMADD a14, alpha2_r, y06, a14 + FMADD a15, alpha2_r, y07, a15 + FMADD a16, alpha2_r, y08, a16 + + FNMSUB a1, alpha1_i, y02, a1 + FMADD a2, alpha1_i, y01, a2 + FNMSUB a3, alpha1_i, y04, a3 + FMADD a4, alpha1_i, y03, a4 + + STFD a1, 8 * SIZE(AO1) + STFD a2, 9 * SIZE(AO1) + STFD a3, 10 * SIZE(AO1) + STFD a4, 11 * SIZE(AO1) + + LFD a1, 16 * SIZE(AO1) + LFD a2, 17 * SIZE(AO1) + LFD a3, 18 * SIZE(AO1) + LFD a4, 19 * SIZE(AO1) + + FNMSUB a5, alpha1_i, y06, a5 + FMADD a6, alpha1_i, y05, a6 + FNMSUB a7, alpha1_i, y08, a7 + FMADD a8, alpha1_i, y07, a8 + + STFD a5, 12 * SIZE(AO1) + STFD a6, 13 * SIZE(AO1) + STFD a7, 14 * SIZE(AO1) + STFD a8, 15 * SIZE(AO1) + + LFD a5, 20 * SIZE(AO1) + LFD a6, 21 * SIZE(AO1) + LFD a7, 22 * SIZE(AO1) + LFD a8, 23 * SIZE(AO1) + + FNMSUB a9, alpha2_i, y02, a9 + FMADD a10, alpha2_i, y01, a10 + FNMSUB a11, alpha2_i, y04, a11 + FMADD a12, alpha2_i, y03, a12 + + LFD y01, 16 * SIZE(X1) + LFD y02, 17 * SIZE(X1) + LFD y03, 18 * SIZE(X1) + LFD y04, 19 * SIZE(X1) + + STFD a9, 8 * SIZE(AO2) + STFD a10, 9 * SIZE(AO2) + STFD a11, 10 * SIZE(AO2) + STFD a12, 11 * SIZE(AO2) + + LFD a9, 16 * SIZE(AO2) + LFD a10, 17 * SIZE(AO2) + LFD a11, 18 * SIZE(AO2) + LFD a12, 19 * SIZE(AO2) + + FNMSUB a13, alpha2_i, y06, a13 + FMADD a14, alpha2_i, y05, a14 + FNMSUB a15, alpha2_i, y08, a15 + FMADD a16, alpha2_i, y07, a16 + + LFD y05, 20 * SIZE(X1) + LFD y06, 21 * SIZE(X1) + LFD y07, 22 * SIZE(X1) + LFD y08, 23 * SIZE(X1) + + STFD a13, 12 * SIZE(AO2) + STFD a14, 13 * SIZE(AO2) + STFD a15, 14 * SIZE(AO2) + STFD a16, 15 * SIZE(AO2) + + LFD a13, 20 * SIZE(AO2) + LFD a14, 21 * SIZE(AO2) + LFD a15, 22 * SIZE(AO2) + LFD a16, 23 * SIZE(AO2) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi X1, X1, 16 * SIZE + + DCBT(AO1, PREA) + DCBT(AO2, PREA) + DCBT(Y1, PREY) + + bdnz+ LL(12) + .align 4 + +LL(13): + FMADD a1, alpha1_r, y01, a1 + FMADD a2, alpha1_r, y02, a2 + FMADD a3, alpha1_r, y03, a3 + FMADD a4, alpha1_r, y04, a4 + + FMADD a5, alpha1_r, y05, a5 + FMADD a6, alpha1_r, y06, a6 + FMADD a7, alpha1_r, y07, a7 + FMADD a8, alpha1_r, y08, a8 + + FMADD a9, alpha2_r, y01, a9 + FMADD a10, alpha2_r, y02, a10 + FMADD a11, alpha2_r, y03, a11 + FMADD a12, alpha2_r, y04, a12 + + FMADD a13, alpha2_r, y05, a13 + FMADD a14, alpha2_r, y06, a14 + FMADD a15, alpha2_r, y07, a15 + FMADD a16, alpha2_r, y08, a16 + + FNMSUB a1, alpha1_i, y02, a1 + FMADD a2, alpha1_i, y01, a2 + FNMSUB a3, alpha1_i, y04, a3 + FMADD a4, alpha1_i, y03, a4 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + + LFD a1, 8 * SIZE(AO1) + LFD a2, 9 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + + FNMSUB a5, alpha1_i, y06, a5 + FMADD a6, alpha1_i, y05, a6 + FNMSUB a7, alpha1_i, y08, a7 + FMADD a8, alpha1_i, y07, a8 + + STFD a5, 4 * SIZE(AO1) + STFD a6, 5 * SIZE(AO1) + STFD a7, 6 * SIZE(AO1) + STFD a8, 7 * SIZE(AO1) + + LFD a5, 12 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + FNMSUB a9, alpha2_i, y02, a9 + FMADD a10, alpha2_i, y01, a10 + FNMSUB a11, alpha2_i, y04, a11 + FMADD a12, alpha2_i, y03, a12 + + LFD y01, 8 * SIZE(X1) + LFD y02, 9 * SIZE(X1) + LFD y03, 10 * SIZE(X1) + LFD y04, 11 * SIZE(X1) + + STFD a9, 0 * SIZE(AO2) + STFD a10, 1 * SIZE(AO2) + STFD a11, 2 * SIZE(AO2) + STFD a12, 3 * SIZE(AO2) + + LFD a9, 8 * SIZE(AO2) + LFD a10, 9 * SIZE(AO2) + LFD a11, 10 * SIZE(AO2) + LFD a12, 11 * SIZE(AO2) + + FNMSUB a13, alpha2_i, y06, a13 + FMADD a14, alpha2_i, y05, a14 + FNMSUB a15, alpha2_i, y08, a15 + FMADD a16, alpha2_i, y07, a16 + + LFD y05, 12 * SIZE(X1) + LFD y06, 13 * SIZE(X1) + LFD y07, 14 * SIZE(X1) + LFD y08, 15 * SIZE(X1) + + STFD a13, 4 * SIZE(AO2) + STFD a14, 5 * SIZE(AO2) + STFD a15, 6 * SIZE(AO2) + STFD a16, 7 * SIZE(AO2) + + LFD a13, 12 * SIZE(AO2) + LFD a14, 13 * SIZE(AO2) + LFD a15, 14 * SIZE(AO2) + LFD a16, 15 * SIZE(AO2) + + FMADD a1, alpha1_r, y01, a1 + FMADD a2, alpha1_r, y02, a2 + FMADD a3, alpha1_r, y03, a3 + FMADD a4, alpha1_r, y04, a4 + + FMADD a5, alpha1_r, y05, a5 + FMADD a6, alpha1_r, y06, a6 + FMADD a7, alpha1_r, y07, a7 + FMADD a8, alpha1_r, y08, a8 + + FMADD a9, alpha2_r, y01, a9 + FMADD a10, alpha2_r, y02, a10 + FMADD a11, alpha2_r, y03, a11 + FMADD a12, alpha2_r, y04, a12 + + FMADD a13, alpha2_r, y05, a13 + FMADD a14, alpha2_r, y06, a14 + FMADD a15, alpha2_r, y07, a15 + FMADD a16, alpha2_r, y08, a16 + + FNMSUB a1, alpha1_i, y02, a1 + FMADD a2, alpha1_i, y01, a2 + FNMSUB a3, alpha1_i, y04, a3 + FMADD a4, alpha1_i, y03, a4 + + STFD a1, 8 * SIZE(AO1) + STFD a2, 9 * SIZE(AO1) + STFD a3, 10 * SIZE(AO1) + STFD a4, 11 * SIZE(AO1) + + FNMSUB a5, alpha1_i, y06, a5 + FMADD a6, alpha1_i, y05, a6 + FNMSUB a7, alpha1_i, y08, a7 + FMADD a8, alpha1_i, y07, a8 + + STFD a5, 12 * SIZE(AO1) + STFD a6, 13 * SIZE(AO1) + STFD a7, 14 * SIZE(AO1) + STFD a8, 15 * SIZE(AO1) + + FNMSUB a9, alpha2_i, y02, a9 + FMADD a10, alpha2_i, y01, a10 + FNMSUB a11, alpha2_i, y04, a11 + FMADD a12, alpha2_i, y03, a12 + + STFD a9, 8 * SIZE(AO2) + STFD a10, 9 * SIZE(AO2) + STFD a11, 10 * SIZE(AO2) + STFD a12, 11 * SIZE(AO2) + + FNMSUB a13, alpha2_i, y06, a13 + FMADD a14, alpha2_i, y05, a14 + FNMSUB a15, alpha2_i, y08, a15 + FMADD a16, alpha2_i, y07, a16 + + STFD a13, 12 * SIZE(AO2) + STFD a14, 13 * SIZE(AO2) + STFD a15, 14 * SIZE(AO2) + STFD a16, 15 * SIZE(AO2) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi X1, X1, 16 * SIZE + .align 4 + +LL(15): + andi. r0, M, 7 + ble LL(19) + + andi. r0, M, 4 + ble LL(17) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + LFD y03, 2 * SIZE(X1) + LFD y04, 3 * SIZE(X1) + LFD y05, 4 * SIZE(X1) + LFD y06, 5 * SIZE(X1) + LFD y07, 6 * SIZE(X1) + LFD y08, 7 * SIZE(X1) + + LFD a9, 0 * SIZE(AO2) + LFD a10, 1 * SIZE(AO2) + LFD a11, 2 * SIZE(AO2) + LFD a12, 3 * SIZE(AO2) + LFD a13, 4 * SIZE(AO2) + LFD a14, 5 * SIZE(AO2) + LFD a15, 6 * SIZE(AO2) + LFD a16, 7 * SIZE(AO2) + + FMADD a1, alpha1_r, y01, a1 + FMADD a2, alpha1_r, y02, a2 + FMADD a3, alpha1_r, y03, a3 + FMADD a4, alpha1_r, y04, a4 + + FMADD a5, alpha1_r, y05, a5 + FMADD a6, alpha1_r, y06, a6 + FMADD a7, alpha1_r, y07, a7 + FMADD a8, alpha1_r, y08, a8 + + FMADD a9, alpha2_r, y01, a9 + FMADD a10, alpha2_r, y02, a10 + FMADD a11, alpha2_r, y03, a11 + FMADD a12, alpha2_r, y04, a12 + + FMADD a13, alpha2_r, y05, a13 + FMADD a14, alpha2_r, y06, a14 + FMADD a15, alpha2_r, y07, a15 + FMADD a16, alpha2_r, y08, a16 + + FNMSUB a1, alpha1_i, y02, a1 + FMADD a2, alpha1_i, y01, a2 + FNMSUB a3, alpha1_i, y04, a3 + FMADD a4, alpha1_i, y03, a4 + + FNMSUB a5, alpha1_i, y06, a5 + FMADD a6, alpha1_i, y05, a6 + FNMSUB a7, alpha1_i, y08, a7 + FMADD a8, alpha1_i, y07, a8 + + FNMSUB a9, alpha2_i, y02, a9 + FMADD a10, alpha2_i, y01, a10 + FNMSUB a11, alpha2_i, y04, a11 + FMADD a12, alpha2_i, y03, a12 + + FNMSUB a13, alpha2_i, y06, a13 + FMADD a14, alpha2_i, y05, a14 + FNMSUB a15, alpha2_i, y08, a15 + FMADD a16, alpha2_i, y07, a16 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + STFD a5, 4 * SIZE(AO1) + STFD a6, 5 * SIZE(AO1) + STFD a7, 6 * SIZE(AO1) + STFD a8, 7 * SIZE(AO1) + + STFD a9, 0 * SIZE(AO2) + STFD a10, 1 * SIZE(AO2) + STFD a11, 2 * SIZE(AO2) + STFD a12, 3 * SIZE(AO2) + STFD a13, 4 * SIZE(AO2) + STFD a14, 5 * SIZE(AO2) + STFD a15, 6 * SIZE(AO2) + STFD a16, 7 * SIZE(AO2) + + addi AO1, AO1, 8 * SIZE + addi AO2, AO2, 8 * SIZE + addi X1, X1, 8 * SIZE + .align 4 + +LL(17): + andi. r0, M, 2 + ble LL(18) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + LFD y03, 2 * SIZE(X1) + LFD y04, 3 * SIZE(X1) + + LFD a5, 0 * SIZE(AO2) + LFD a6, 1 * SIZE(AO2) + LFD a7, 2 * SIZE(AO2) + LFD a8, 3 * SIZE(AO2) + + FMADD a1, alpha1_r, y01, a1 + FMADD a2, alpha1_r, y02, a2 + FMADD a3, alpha1_r, y03, a3 + FMADD a4, alpha1_r, y04, a4 + + FMADD a5, alpha2_r, y01, a5 + FMADD a6, alpha2_r, y02, a6 + FMADD a7, alpha2_r, y03, a7 + FMADD a8, alpha2_r, y04, a8 + + FNMSUB a1, alpha1_i, y02, a1 + FMADD a2, alpha1_i, y01, a2 + FNMSUB a3, alpha1_i, y04, a3 + FMADD a4, alpha1_i, y03, a4 + + FNMSUB a5, alpha2_i, y02, a5 + FMADD a6, alpha2_i, y01, a6 + FNMSUB a7, alpha2_i, y04, a7 + FMADD a8, alpha2_i, y03, a8 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + STFD a5, 0 * SIZE(AO2) + STFD a6, 1 * SIZE(AO2) + STFD a7, 2 * SIZE(AO2) + STFD a8, 3 * SIZE(AO2) + + addi AO1, AO1, 4 * SIZE + addi AO2, AO2, 4 * SIZE + addi X1, X1, 4 * SIZE + .align 4 + +LL(18): + andi. r0, M, 1 + ble LL(19) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 0 * SIZE(AO2) + LFD a4, 1 * SIZE(AO2) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + + FMADD a1, alpha1_r, y01, a1 + FMADD a2, alpha1_r, y02, a2 + FMADD a3, alpha2_r, y01, a3 + FMADD a4, alpha2_r, y02, a4 + + FNMSUB a1, alpha1_i, y02, a1 + FMADD a2, alpha1_i, y01, a2 + FNMSUB a3, alpha2_i, y02, a3 + FMADD a4, alpha2_i, y01, a4 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 0 * SIZE(AO2) + STFD a4, 1 * SIZE(AO2) + .align 4 + +LL(19): + addi J, J, -1 + cmpi cr0, 0, J, 0 + bgt LL(11) + .align 4 + +LL(20): + andi. J, N, 1 + ble LL(999) + + LFD alpha1_r, 0 * SIZE(Y) + LFD alpha1_i, 1 * SIZE(Y) + + FMUL a1, alpha_r, alpha1_r + FMUL a2, alpha_i, alpha1_r + + FMA1 alpha1_r, alpha_i, alpha1_i, a1 + FMA2 alpha1_i, alpha_r, alpha1_i, a2 + + mr AO1, A + + mr X1, XX + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(25) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + LFD y03, 2 * SIZE(X1) + LFD y04, 3 * SIZE(X1) + LFD y05, 4 * SIZE(X1) + LFD y06, 5 * SIZE(X1) + LFD y07, 6 * SIZE(X1) + LFD y08, 7 * SIZE(X1) + + bdz LL(23) + .align 4 + +LL(22): + FMADD a1, alpha1_r, y01, a1 + FMADD a2, alpha1_r, y02, a2 + FMADD a3, alpha1_r, y03, a3 + FMADD a4, alpha1_r, y04, a4 + + FMADD a5, alpha1_r, y05, a5 + FMADD a6, alpha1_r, y06, a6 + FMADD a7, alpha1_r, y07, a7 + FMADD a8, alpha1_r, y08, a8 + + FNMSUB a1, alpha1_i, y02, a1 + FMADD a2, alpha1_i, y01, a2 + FNMSUB a3, alpha1_i, y04, a3 + FMADD a4, alpha1_i, y03, a4 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + + LFD a1, 8 * SIZE(AO1) + LFD a2, 9 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + + FNMSUB a5, alpha1_i, y06, a5 + FMADD a6, alpha1_i, y05, a6 + FNMSUB a7, alpha1_i, y08, a7 + FMADD a8, alpha1_i, y07, a8 + + STFD a5, 4 * SIZE(AO1) + STFD a6, 5 * SIZE(AO1) + STFD a7, 6 * SIZE(AO1) + STFD a8, 7 * SIZE(AO1) + + LFD a5, 12 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + LFD y01, 8 * SIZE(X1) + LFD y02, 9 * SIZE(X1) + LFD y03, 10 * SIZE(X1) + LFD y04, 11 * SIZE(X1) + + LFD y05, 12 * SIZE(X1) + LFD y06, 13 * SIZE(X1) + LFD y07, 14 * SIZE(X1) + LFD y08, 15 * SIZE(X1) + + FMADD a1, alpha1_r, y01, a1 + FMADD a2, alpha1_r, y02, a2 + FMADD a3, alpha1_r, y03, a3 + FMADD a4, alpha1_r, y04, a4 + + FMADD a5, alpha1_r, y05, a5 + FMADD a6, alpha1_r, y06, a6 + FMADD a7, alpha1_r, y07, a7 + FMADD a8, alpha1_r, y08, a8 + + FMADD a9, alpha2_r, y01, a9 + FMADD a10, alpha2_r, y02, a10 + FMADD a11, alpha2_r, y03, a11 + FMADD a12, alpha2_r, y04, a12 + + FMADD a13, alpha2_r, y05, a13 + FMADD a14, alpha2_r, y06, a14 + FMADD a15, alpha2_r, y07, a15 + FMADD a16, alpha2_r, y08, a16 + + FNMSUB a1, alpha1_i, y02, a1 + FMADD a2, alpha1_i, y01, a2 + FNMSUB a3, alpha1_i, y04, a3 + FMADD a4, alpha1_i, y03, a4 + + STFD a1, 8 * SIZE(AO1) + STFD a2, 9 * SIZE(AO1) + STFD a3, 10 * SIZE(AO1) + STFD a4, 11 * SIZE(AO1) + + LFD a1, 16 * SIZE(AO1) + LFD a2, 17 * SIZE(AO1) + LFD a3, 18 * SIZE(AO1) + LFD a4, 19 * SIZE(AO1) + + FNMSUB a5, alpha1_i, y06, a5 + FMADD a6, alpha1_i, y05, a6 + FNMSUB a7, alpha1_i, y08, a7 + FMADD a8, alpha1_i, y07, a8 + + STFD a5, 12 * SIZE(AO1) + STFD a6, 13 * SIZE(AO1) + STFD a7, 14 * SIZE(AO1) + STFD a8, 15 * SIZE(AO1) + + LFD a5, 20 * SIZE(AO1) + LFD a6, 21 * SIZE(AO1) + LFD a7, 22 * SIZE(AO1) + LFD a8, 23 * SIZE(AO1) + + LFD y01, 16 * SIZE(X1) + LFD y02, 17 * SIZE(X1) + LFD y03, 18 * SIZE(X1) + LFD y04, 19 * SIZE(X1) + + LFD y05, 20 * SIZE(X1) + LFD y06, 21 * SIZE(X1) + LFD y07, 22 * SIZE(X1) + LFD y08, 23 * SIZE(X1) + + addi AO1, AO1, 16 * SIZE + addi X1, X1, 16 * SIZE + + DCBT(AO1, PREA) + DCBT(Y1, PREY) + + bdnz+ LL(22) + .align 4 + +LL(23): + FMADD a1, alpha1_r, y01, a1 + FMADD a2, alpha1_r, y02, a2 + FMADD a3, alpha1_r, y03, a3 + FMADD a4, alpha1_r, y04, a4 + + FMADD a5, alpha1_r, y05, a5 + FMADD a6, alpha1_r, y06, a6 + FMADD a7, alpha1_r, y07, a7 + FMADD a8, alpha1_r, y08, a8 + + FNMSUB a1, alpha1_i, y02, a1 + FMADD a2, alpha1_i, y01, a2 + FNMSUB a3, alpha1_i, y04, a3 + FMADD a4, alpha1_i, y03, a4 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + + LFD a1, 8 * SIZE(AO1) + LFD a2, 9 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + + FNMSUB a5, alpha1_i, y06, a5 + FMADD a6, alpha1_i, y05, a6 + FNMSUB a7, alpha1_i, y08, a7 + FMADD a8, alpha1_i, y07, a8 + + STFD a5, 4 * SIZE(AO1) + STFD a6, 5 * SIZE(AO1) + STFD a7, 6 * SIZE(AO1) + STFD a8, 7 * SIZE(AO1) + + LFD a5, 12 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + LFD y01, 8 * SIZE(X1) + LFD y02, 9 * SIZE(X1) + LFD y03, 10 * SIZE(X1) + LFD y04, 11 * SIZE(X1) + + LFD y05, 12 * SIZE(X1) + LFD y06, 13 * SIZE(X1) + LFD y07, 14 * SIZE(X1) + LFD y08, 15 * SIZE(X1) + + FMADD a1, alpha1_r, y01, a1 + FMADD a2, alpha1_r, y02, a2 + FMADD a3, alpha1_r, y03, a3 + FMADD a4, alpha1_r, y04, a4 + + FMADD a5, alpha1_r, y05, a5 + FMADD a6, alpha1_r, y06, a6 + FMADD a7, alpha1_r, y07, a7 + FMADD a8, alpha1_r, y08, a8 + + FNMSUB a1, alpha1_i, y02, a1 + FMADD a2, alpha1_i, y01, a2 + FNMSUB a3, alpha1_i, y04, a3 + FMADD a4, alpha1_i, y03, a4 + + STFD a1, 8 * SIZE(AO1) + STFD a2, 9 * SIZE(AO1) + STFD a3, 10 * SIZE(AO1) + STFD a4, 11 * SIZE(AO1) + + FNMSUB a5, alpha1_i, y06, a5 + FMADD a6, alpha1_i, y05, a6 + FNMSUB a7, alpha1_i, y08, a7 + FMADD a8, alpha1_i, y07, a8 + + STFD a5, 12 * SIZE(AO1) + STFD a6, 13 * SIZE(AO1) + STFD a7, 14 * SIZE(AO1) + STFD a8, 15 * SIZE(AO1) + + addi AO1, AO1, 16 * SIZE + addi X1, X1, 16 * SIZE + .align 4 + +LL(25): + andi. r0, M, 7 + ble LL(999) + + andi. r0, M, 4 + ble LL(27) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + LFD y03, 2 * SIZE(X1) + LFD y04, 3 * SIZE(X1) + LFD y05, 4 * SIZE(X1) + LFD y06, 5 * SIZE(X1) + LFD y07, 6 * SIZE(X1) + LFD y08, 7 * SIZE(X1) + + FMADD a1, alpha1_r, y01, a1 + FMADD a2, alpha1_r, y02, a2 + FMADD a3, alpha1_r, y03, a3 + FMADD a4, alpha1_r, y04, a4 + + FMADD a5, alpha1_r, y05, a5 + FMADD a6, alpha1_r, y06, a6 + FMADD a7, alpha1_r, y07, a7 + FMADD a8, alpha1_r, y08, a8 + + FNMSUB a1, alpha1_i, y02, a1 + FMADD a2, alpha1_i, y01, a2 + FNMSUB a3, alpha1_i, y04, a3 + FMADD a4, alpha1_i, y03, a4 + + FNMSUB a5, alpha1_i, y06, a5 + FMADD a6, alpha1_i, y05, a6 + FNMSUB a7, alpha1_i, y08, a7 + FMADD a8, alpha1_i, y07, a8 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + STFD a5, 4 * SIZE(AO1) + STFD a6, 5 * SIZE(AO1) + STFD a7, 6 * SIZE(AO1) + STFD a8, 7 * SIZE(AO1) + + addi AO1, AO1, 8 * SIZE + addi X1, X1, 8 * SIZE + .align 4 + +LL(27): + andi. r0, M, 2 + ble LL(28) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + LFD y03, 2 * SIZE(X1) + LFD y04, 3 * SIZE(X1) + + FMADD a1, alpha1_r, y01, a1 + FMADD a2, alpha1_r, y02, a2 + FMADD a3, alpha1_r, y03, a3 + FMADD a4, alpha1_r, y04, a4 + + FNMSUB a1, alpha1_i, y02, a1 + FMADD a2, alpha1_i, y01, a2 + FNMSUB a3, alpha1_i, y04, a3 + FMADD a4, alpha1_i, y03, a4 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + + addi AO1, AO1, 4 * SIZE + addi X1, X1, 4 * SIZE + .align 4 + +LL(28): + andi. r0, M, 1 + ble LL(999) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + + FMADD a1, alpha1_r, y01, a1 + FMADD a2, alpha1_r, y02, a2 + + FNMSUB a1, alpha1_i, y02, a1 + FMADD a2, alpha1_i, y01, a2 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r14, 144(SP) + ld r15, 152(SP) + ld r16, 160(SP) + ld r17, 168(SP) + ld r18, 176(SP) + ld r19, 184(SP) + ld r20, 192(SP) + ld r21, 200(SP) + ld r22, 208(SP) + ld r23, 216(SP) + ld r24, 224(SP) + ld r25, 232(SP) + ld r26, 240(SP) + ld r27, 248(SP) +#else + lwz r14, 144(SP) + lwz r15, 148(SP) + lwz r16, 152(SP) + lwz r17, 156(SP) + lwz r18, 160(SP) + lwz r19, 164(SP) + lwz r20, 168(SP) + lwz r21, 172(SP) + lwz r22, 176(SP) + lwz r23, 180(SP) + lwz r24, 184(SP) + lwz r25, 188(SP) + lwz r26, 192(SP) + lwz r27, 196(SP) +#endif + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/znrm2.S b/kernel/power/znrm2.S new file mode 100644 index 0000000..ded25fd --- /dev/null +++ b/kernel/power/znrm2.S @@ -0,0 +1,924 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define NN r6 +#define XX r7 +#define PREA r8 +#define INCXM1 r9 + +#define FZERO 144(SP) +#define FONE 148(SP) +#define FMAX 152(SP) + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r10, 0 + lis r11, 0x3f80 + lis r12, 0x5fe0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r10, FZERO + stw r11, FONE + stw r12, FMAX + stw r10, 4 + FMAX + + lfs f1, FZERO + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + subi INCXM1, INCX, SIZE + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + mr NN, N + mr XX, X + + LFD f0, 0 * SIZE(X) + LFD f1, 1 * SIZE(X) + + add X, X, INCX + + fabs f2, f0 + fabs f3, f1 + fabs f4, f0 + fabs f5, f1 + fabs f6, f0 + fabs f7, f1 + fabs f0, f0 + fabs f1, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(1000) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + + LFD f24, 0 * SIZE(X) + LFD f25, 1 * SIZE(X) + LFD f26, 2 * SIZE(X) + LFD f27, 3 * SIZE(X) + LFD f28, 4 * SIZE(X) + LFD f29, 5 * SIZE(X) + LFD f30, 6 * SIZE(X) + LFD f31, 7 * SIZE(X) + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + bdz LL(20) + .align 4 + +LL(10): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFD f24, 16 * SIZE(X) + LFD f25, 17 * SIZE(X) + LFD f26, 18 * SIZE(X) + LFD f27, 19 * SIZE(X) + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFD f28, 20 * SIZE(X) + LFD f29, 21 * SIZE(X) + LFD f30, 22 * SIZE(X) + LFD f31, 23 * SIZE(X) + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(100) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + addi X, X, 2 * SIZE + fabs f8, f8 + fabs f9, f9 + fsub f16, f0, f8 + fsub f17, f1, f9 + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + bdnz LL(60) + .align 4 + +LL(100): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f31, f8, f0, f4 + + lfs f1, FZERO + lfs f0, FONE + + fcmpu cr0, f1, f31 + beq- cr0, LL(9999) + + fdiv f30, f0, f31 + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + fmr f6, f1 + fmr f7, f1 + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- cr0, LL(150) + + LFD f8, 0 * SIZE(XX) + LFD f9, 1 * SIZE(XX) + LFD f10, 2 * SIZE(XX) + LFD f11, 3 * SIZE(XX) + LFD f12, 4 * SIZE(XX) + LFD f13, 5 * SIZE(XX) + LFD f14, 6 * SIZE(XX) + LFD f15, 7 * SIZE(XX) + + fmul f16, f30, f8 + fmul f17, f30, f9 + fmul f18, f30, f10 + fmul f19, f30, f11 + + LFD f8, 8 * SIZE(XX) + LFD f9, 9 * SIZE(XX) + LFD f10, 10 * SIZE(XX) + LFD f11, 11 * SIZE(XX) + + fmul f20, f30, f12 + fmul f21, f30, f13 + fmul f22, f30, f14 + fmul f23, f30, f15 + + LFD f12, 12 * SIZE(XX) + LFD f13, 13 * SIZE(XX) + LFD f14, 14 * SIZE(XX) + LFD f15, 15 * SIZE(XX) + bdz LL(120) + .align 4 + +LL(110): + fmadd f0, f16, f16, f0 + fmul f16, f30, f8 + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + + LFD f8, 16 * SIZE(XX) + LFD f9, 17 * SIZE(XX) + LFD f10, 18 * SIZE(XX) + LFD f11, 19 * SIZE(XX) + + fmadd f4, f20, f20, f4 + fmul f20, f30, f12 + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + + LFD f12, 20 * SIZE(XX) + LFD f13, 21 * SIZE(XX) + LFD f14, 22 * SIZE(XX) + LFD f15, 23 * SIZE(XX) + + fmadd f0, f16, f16, f0 + fmul f16, f30, f8 + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + + LFD f8, 24 * SIZE(XX) + LFD f9, 25 * SIZE(XX) + LFD f10, 26 * SIZE(XX) + LFD f11, 27 * SIZE(XX) + + fmadd f4, f20, f20, f4 + fmul f20, f30, f12 + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + + LFD f12, 28 * SIZE(XX) + LFD f13, 29 * SIZE(XX) + LFD f14, 30 * SIZE(XX) + LFD f15, 31 * SIZE(XX) + +#ifndef POWER6 + L1_PREFETCH XX, PREA +#endif + addi XX, XX, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH XX, PREA +#endif + + bdnz LL(110) + .align 4 + +LL(120): + fmadd f0, f16, f16, f0 + fmul f16, f30, f8 + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + + fmadd f4, f20, f20, f4 + fmul f20, f30, f12 + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + + addi XX, XX, 16 * SIZE + .align 4 + +LL(150): + andi. r0, NN, 7 + mtspr CTR, r0 + beq- cr0, LL(170) + .align 4 + +LL(160): + LFD f8, 0 * SIZE(XX) + LFD f9, 1 * SIZE(XX) + addi XX, XX, 2 * SIZE + + fmul f16, f30, f8 + fmul f17, f30, f9 + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + bdnz LL(160) + .align 4 + +LL(170): + fadd f0, f0, f1 + fadd f2, f2, f3 + fadd f4, f4, f5 + fadd f6, f6, f7 + + fadd f0, f0, f2 + fadd f4, f4, f6 + + fadd f0, f0, f4 + + fsqrt f0, f0 + fmul f1, f31, f0 + b LL(9999) + .align 4 + +LL(1000): + sub X, X, INCXM1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(1050) + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + bdz LL(1020) + .align 4 + +LL(1010): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + bdnz LL(1010) + .align 4 + +LL(1020): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + .align 4 + +LL(1050): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(1999) + .align 4 + +LL(1060): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + + fabs f8, f8 + fabs f9, f9 + fsub f16, f0, f8 + fsub f17, f1, f9 + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + bdnz LL(1060) + .align 4 + +LL(1999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f31, f8, f0, f4 + + lfs f1, FZERO + lfs f0, FONE + + fcmpu cr0, f1, f31 + beq- cr0, LL(9999) + + fdiv f30, f0, f31 + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + fmr f6, f1 + fmr f7, f1 + + sub XX, XX, INCXM1 + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- cr0, LL(1150) + + LFDX f8, XX, INCXM1 + LFDUX f9, XX, INCX + LFDX f10, XX, INCXM1 + LFDUX f11, XX, INCX + LFDX f12, XX, INCXM1 + LFDUX f13, XX, INCX + LFDX f14, XX, INCXM1 + LFDUX f15, XX, INCX + + fmul f16, f30, f8 + fmul f17, f30, f9 + fmul f18, f30, f10 + fmul f19, f30, f11 + + LFDX f8, XX, INCXM1 + LFDUX f9, XX, INCX + LFDX f10, XX, INCXM1 + LFDUX f11, XX, INCX + + fmul f20, f30, f12 + fmul f21, f30, f13 + fmul f22, f30, f14 + fmul f23, f30, f15 + + LFDX f12, XX, INCXM1 + LFDUX f13, XX, INCX + LFDX f14, XX, INCXM1 + LFDUX f15, XX, INCX + bdz LL(1120) + .align 4 + +LL(1110): + fmadd f0, f16, f16, f0 + fmul f16, f30, f8 + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + + LFDX f8, XX, INCXM1 + LFDUX f9, XX, INCX + LFDX f10, XX, INCXM1 + LFDUX f11, XX, INCX + + fmadd f4, f20, f20, f4 + fmul f20, f30, f12 + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + + LFDX f12, XX, INCXM1 + LFDUX f13, XX, INCX + LFDX f14, XX, INCXM1 + LFDUX f15, XX, INCX + + fmadd f0, f16, f16, f0 + fmul f16, f30, f8 + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + + LFDX f8, XX, INCXM1 + LFDUX f9, XX, INCX + LFDX f10, XX, INCXM1 + LFDUX f11, XX, INCX + + fmadd f4, f20, f20, f4 + fmul f20, f30, f12 + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + + LFDX f12, XX, INCXM1 + LFDUX f13, XX, INCX + LFDX f14, XX, INCXM1 + LFDUX f15, XX, INCX + + bdnz LL(1110) + .align 4 + +LL(1120): + fmadd f0, f16, f16, f0 + fmul f16, f30, f8 + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + + fmadd f4, f20, f20, f4 + fmul f20, f30, f12 + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + .align 4 + +LL(1150): + andi. r0, NN, 7 + mtspr CTR, r0 + beq- cr0, LL(1170) + .align 4 + +LL(1160): + LFDX f8, XX, INCXM1 + LFDUX f9, XX, INCX + + fmul f16, f30, f8 + fmul f17, f30, f9 + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + bdnz LL(1160) + .align 4 + +LL(1170): + fadd f0, f0, f1 + fadd f2, f2, f3 + fadd f4, f4, f5 + fadd f6, f6, f7 + + fadd f0, f0, f2 + fadd f4, f4, f6 + + fadd f0, f0, f4 + + fsqrt f0, f0 + fmul f1, f31, f0 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/znrm2_hummer.S b/kernel/power/znrm2_hummer.S new file mode 100644 index 0000000..b6deb94 --- /dev/null +++ b/kernel/power/znrm2_hummer.S @@ -0,0 +1,1018 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define XX r8 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define ALPHA f4 +#define ALPHA_R f5 + +#define A1 f6 +#define A2 f7 +#define A3 f8 +#define A4 f9 +#define A5 f10 +#define A6 f11 +#define A7 f12 +#define A8 f13 + +#define F1 f14 +#define F2 f15 +#define F3 f16 +#define F4 f17 +#define F5 f18 +#define F6 f19 +#define F7 f20 +#define F8 f21 + +#define T1 f22 +#define T2 f23 +#define T3 f24 +#define T4 f25 +#define T5 f26 +#define T6 f27 +#define T7 f28 +#define T8 f29 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + stfpdux f24, SP, r10 + stfpdux f25, SP, r10 + stfpdux f26, SP, r10 + stfpdux f27, SP, r10 + + stfpdux f28, SP, r10 + stfpdux f29, SP, r10 + + li r10, 0 + lis r11, 0x3f80 + stwu r11, -4(SP) + stwu r11, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + lfpsx C1, SP, r10 # Zero clear + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + fpmr C2, C1 + fpmr C3, C1 + fpmr C4, C1 + + cmpwi cr0, N, 0 + ble LL(99) + cmpwi cr0, INCX, 0 + ble LL(99) + + mr XX, X + + andi. r0, X, 2 * SIZE - 1 + bne LL(100) + +/* aligned */ + + sub X, X, INCX2 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + LFPDUX A5, X, INCX2 + fpabs T1, A1 + LFPDUX A6, X, INCX2 + fpabs T2, A2 + LFPDUX A7, X, INCX2 + fpabs T3, A3 + LFPDUX A8, X, INCX2 + fpabs T4, A4 + bdz LL(13) + .align 4 + +LL(12): + fpsub F1, C1, T1 + LFPDUX A1, X, INCX2 + fpsub F2, C2, T2 + LFPDUX A2, X, INCX2 + fpsub F3, C3, T3 + LFPDUX A3, X, INCX2 + fpsub F4, C4, T4 + LFPDUX A4, X, INCX2 + + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsel C1, F1, C1, T1 + LFPDUX A5, X, INCX2 + fpsel C2, F2, C2, T2 + LFPDUX A6, X, INCX2 + fpsel C3, F3, C3, T3 + LFPDUX A7, X, INCX2 + fpsel C4, F4, C4, T4 + LFPDUX A8, X, INCX2 + + fpsub F5, C1, T5 + fpsub F6, C2, T6 + fpsub F7, C3, T7 + fpsub F8, C4, T8 + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + bdnz LL(12) + .align 4 + +LL(13): + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsub F1, C1, T1 + fpsub F2, C2, T2 + fpsub F3, C3, T3 + fpsub F4, C4, T4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + + fpsub F5, C1, T5 + fpsub F6, C2, T6 + fpsub F7, C3, T7 + fpsub F8, C4, T8 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + .align 4 + +LL(15): + andi. r0, N, 7 + beq LL(20) + + andi. r0, N, 4 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(16): + andi. r0, N, 2 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + fpabs A1, A1 + fpabs A2, A2 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + .align 4 + +LL(17): + andi. r0, N, 1 + beq LL(20) + + LFPDUX A1, X, INCX2 + fpabs A1, A1 + fpsub F1, C1, A1 + fpsel C1, F1, C1, A1 + .align 4 + +LL(20): + fpsub F1, C1, C2 + fpsub F2, C3, C4 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C1, C3 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C1, C2 + fsel ALPHA, F1, C1, C2 + + li r10, 0 + + lfs ALPHA_R, 8(SP) # load 1.0 + fdiv ALPHA_R, ALPHA_R, ALPHA + + lfpsx C1, SP, r10 # Zero clear + + fpmr C2, C1 + fpmr C3, C1 + fpmr C4, C1 + fsmfp ALPHA_R, ALPHA_R + + andi. r0, XX, 2 * SIZE - 1 + beq LL(21) + + LFD C1, 0 * SIZE(XX) + add XX, XX, INCX + + cmpwi cr0, N, 0 + fmul C1, ALPHA_R, C1 + fmul C1, C1, C1 + ble LL(98) + .align 4 + +LL(21): + sub XX, XX, INCX2 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(25) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + LFPDUX A3, XX, INCX2 + LFPDUX A4, XX, INCX2 + + LFPDUX A5, XX, INCX2 + LFPDUX A6, XX, INCX2 + LFPDUX A7, XX, INCX2 + LFPDUX A8, XX, INCX2 + + fpmul T1, ALPHA_R, A1 + fpmul T2, ALPHA_R, A2 + fpmul T3, ALPHA_R, A3 + fpmul T4, ALPHA_R, A4 + + bdz LL(23) + .align 4 + +LL(22): + fpmadd C1, T1, T1, C1 + LFPDUX A1, XX, INCX2 + fpmul T1, ALPHA_R, A5 + LFPDUX A2, XX, INCX2 + + fpmadd C2, T2, T2, C2 + LFPDUX A3, XX, INCX2 + fpmul T2, ALPHA_R, A6 + LFPDUX A4, XX, INCX2 + + fpmadd C3, T3, T3, C3 + fpmul T3, ALPHA_R, A7 + fpmadd C4, T4, T4, C4 + fpmul T4, ALPHA_R, A8 + + fpmadd C1, T1, T1, C1 + LFPDUX A5, XX, INCX2 + fpmul T1, ALPHA_R, A1 + LFPDUX A6, XX, INCX2 + + fpmadd C2, T2, T2, C2 + LFPDUX A7, XX, INCX2 + fpmul T2, ALPHA_R, A2 + LFPDUX A8, XX, INCX2 + + fpmadd C3, T3, T3, C3 + fpmul T3, ALPHA_R, A3 + fpmadd C4, T4, T4, C4 + fpmul T4, ALPHA_R, A4 + bdnz LL(22) + .align 4 + +LL(23): + fpmadd C1, T1, T1, C1 + fpmul T1, ALPHA_R, A5 + fpmadd C2, T2, T2, C2 + fpmul T2, ALPHA_R, A6 + + fpmadd C3, T3, T3, C3 + fpmul T3, ALPHA_R, A7 + fpmadd C4, T4, T4, C4 + fpmul T4, ALPHA_R, A8 + + fpmadd C1, T1, T1, C1 + fpmadd C2, T2, T2, C2 + fpmadd C3, T3, T3, C3 + fpmadd C4, T4, T4, C4 + .align 4 + +LL(25): + andi. r0, N, 7 + beq LL(98) + + andi. r0, N, 4 + beq LL(26) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + LFPDUX A3, XX, INCX2 + LFPDUX A4, XX, INCX2 + + fpmul A1, ALPHA_R, A1 + fpmul A2, ALPHA_R, A2 + fpmul A3, ALPHA_R, A3 + fpmul A4, ALPHA_R, A4 + + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + fpmadd C3, A3, A3, C3 + fpmadd C4, A4, A4, C4 + .align 4 + +LL(26): + andi. r0, N, 2 + beq LL(27) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + fpmul A1, ALPHA_R, A1 + fpmul A2, ALPHA_R, A2 + + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + .align 4 + +LL(27): + andi. r0, N, 1 + beq LL(98) + + LFPDUX A1, XX, INCX2 + fpmul A1, ALPHA_R, A1 + fpmadd C1, A1, A1, C1 + .align 4 + +LL(98): + fpadd C1, C1, C2 + lis r3, 0x3f00 + fpadd C3, C3, C4 + lis r4, 0x4040 + + stw r3, 4(SP) + stw r4, 8(SP) + + fpadd C1, C1, C3 + lfs f10, 0(SP) + + fsmtp C2, C1 + lfs f11, 4(SP) + fadd C1, C2, C1 + lfs f12, 8(SP) + + fcmpu cr0, f10, C1 + beq cr0, LL(99) + +#ifndef HUMMER_EMULATOR + frsqrte f9, C1 + li r10, 16 + + fmul f2, f1, f9 + lfpdux f29, SP, r10 + fmul f3, f9, f11 + lfpdux f28, SP, r10 + fnmsub f7, f2, f9, f12 + lfpdux f27, SP, r10 + fmul f9, f3, f7 + lfpdux f26, SP, r10 + fadd f13, f11, f11 + lfpdux f25, SP, r10 + fmul f12, f1, f9 + lfpdux f24, SP, r10 + fmul f11, f12, f11 + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + fnmsub f1, f12, f9, f13 + + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + fmadd f1, f11, f1, f12 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + fmul C1, ALPHA, C1 + blr +#else + fsqrt C1, C1 + + li r10, 16 + lfpdux f29, SP, r10 + lfpdux f28, SP, r10 + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + + fmul C1, ALPHA, C1 + addi SP, SP, 16 + blr +#endif + .align 4 + +LL(99): + li r10, 16 + + lfpdux f29, SP, r10 + lfpdux f28, SP, r10 + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + .align 4 + +LL(100): + sub X, X, INCX2 + addi X2, X, SIZE + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + LFSDUX A1, X, INCX2 + LFSDUX A2, X2, INCX2 + LFSDUX A3, X, INCX2 + LFSDUX A4, X2, INCX2 + + LFDUX A5, X, INCX2 + LFDUX A6, X2, INCX2 + LFDUX A7, X, INCX2 + LFDUX A8, X2, INCX2 + LFSDUX A5, X, INCX2 + fpabs T1, A1 + LFSDUX A6, X2, INCX2 + fpabs T2, A2 + LFSDUX A7, X, INCX2 + fpabs T3, A3 + LFSDUX A8, X2, INCX2 + fpabs T4, A4 + bdz LL(103) + .align 4 + +LL(102): + fpsub F1, C1, T1 + LFDUX A1, X, INCX2 + fpsub F2, C2, T2 + LFDUX A2, X2, INCX2 + fpsub F3, C3, T3 + LFDUX A3, X, INCX2 + fpsub F4, C4, T4 + LFDUX A4, X2, INCX2 + + fpabs T5, A5 + LFSDUX A1, X, INCX2 + fpabs T6, A6 + LFSDUX A2, X2, INCX2 + fpabs T7, A7 + LFSDUX A3, X, INCX2 + fpabs T8, A8 + LFSDUX A4, X2, INCX2 + + fpsel C1, F1, C1, T1 + LFDUX A5, X, INCX2 + fpsel C2, F2, C2, T2 + LFDUX A6, X2, INCX2 + fpsel C3, F3, C3, T3 + LFDUX A7, X, INCX2 + fpsel C4, F4, C4, T4 + LFDUX A8, X2, INCX2 + + fpsub F5, C1, T5 + LFSDUX A5, X, INCX2 + fpsub F6, C2, T6 + LFSDUX A6, X2, INCX2 + fpsub F7, C3, T7 + LFSDUX A7, X, INCX2 + fpsub F8, C4, T8 + LFSDUX A8, X2, INCX2 + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + bdnz LL(102) + .align 4 + +LL(103): + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsub F1, C1, T1 + fpsub F2, C2, T2 + fpsub F3, C3, T3 + fpsub F4, C4, T4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + + fpsub F5, C1, T5 + fpsub F6, C2, T6 + fpsub F7, C3, T7 + fpsub F8, C4, T8 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + .align 4 + +LL(105): + andi. r0, N, 7 + beq LL(120) + + andi. r0, N, 4 + beq LL(106) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + LFSDUX A1, X, INCX2 + LFSDUX A2, X2, INCX2 + LFSDUX A3, X, INCX2 + LFSDUX A4, X2, INCX2 + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(106): + andi. r0, N, 2 + beq LL(107) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + + fabs A1, A1 + fabs A2, A2 + fabs A3, A3 + fabs A4, A4 + + fsub F1, C1, A1 + fsub F2, C2, A2 + fsub F3, C3, A3 + fsub F4, C4, A4 + + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + fsel C3, F3, C3, A3 + fsel C4, F4, C4, A4 + .align 4 + +LL(107): + andi. r0, N, 1 + beq LL(120) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + fabs A1, A1 + fabs A2, A2 + fsub F1, C1, A1 + fsub F2, C2, A2 + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + .align 4 + +LL(120): + fpsub F1, C1, C2 + fpsub F2, C3, C4 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C1, C3 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C1, C2 + fsel ALPHA, F1, C1, C2 + + li r10, 0 + + lfs ALPHA_R, 8(SP) # load 1.0 + fdiv ALPHA_R, ALPHA_R, ALPHA + + lfpsx C1, SP, r10 # Zero clear + + fpmr C2, C1 + fpmr C3, C1 + fpmr C4, C1 + fsmfp ALPHA_R, ALPHA_R + + sub XX, XX, INCX2 + addi X2, XX, SIZE + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(125) + + LFDUX A1, XX, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, XX, INCX2 + LFDUX A4, X2, INCX2 + LFSDUX A1, XX, INCX2 + LFSDUX A2, X2, INCX2 + LFSDUX A3, XX, INCX2 + LFSDUX A4, X2, INCX2 + + LFDUX A5, XX, INCX2 + LFDUX A6, X2, INCX2 + LFDUX A7, XX, INCX2 + LFDUX A8, X2, INCX2 + LFSDUX A5, XX, INCX2 + fpmul T1, ALPHA_R, A1 + LFSDUX A6, X2, INCX2 + fpmul T2, ALPHA_R, A2 + LFSDUX A7, XX, INCX2 + fpmul T3, ALPHA_R, A3 + LFSDUX A8, X2, INCX2 + fpmul T4, ALPHA_R, A4 + bdz LL(123) + .align 4 + +LL(122): + fpmadd C1, T1, T1, C1 + LFDUX A1, XX, INCX2 + fpmul T1, ALPHA_R, A5 + LFDUX A2, X2, INCX2 + + fpmadd C2, T2, T2, C2 + LFDUX A3, XX, INCX2 + fpmul T2, ALPHA_R, A6 + LFDUX A4, X2, INCX2 + + fpmadd C3, T3, T3, C3 + LFSDUX A1, XX, INCX2 + fpmul T3, ALPHA_R, A7 + LFSDUX A2, X2, INCX2 + + fpmadd C4, T4, T4, C4 + LFSDUX A3, XX, INCX2 + fpmul T4, ALPHA_R, A8 + LFSDUX A4, X2, INCX2 + + fpmadd C1, T1, T1, C1 + LFDUX A5, XX, INCX2 + fpmul T1, ALPHA_R, A1 + LFDUX A6, X2, INCX2 + + fpmadd C2, T2, T2, C2 + LFDUX A7, XX, INCX2 + fpmul T2, ALPHA_R, A2 + LFDUX A8, X2, INCX2 + + fpmadd C3, T3, T3, C3 + LFSDUX A5, XX, INCX2 + fpmul T3, ALPHA_R, A3 + LFSDUX A6, X2, INCX2 + fpmadd C4, T4, T4, C4 + LFSDUX A7, XX, INCX2 + fpmul T4, ALPHA_R, A4 + LFSDUX A8, X2, INCX2 + bdnz LL(122) + .align 4 + +LL(123): + fpmadd C1, T1, T1, C1 + fpmul T1, ALPHA_R, A5 + fpmadd C2, T2, T2, C2 + fpmul T2, ALPHA_R, A6 + fpmadd C3, T3, T3, C3 + fpmul T3, ALPHA_R, A7 + fpmadd C4, T4, T4, C4 + fpmul T4, ALPHA_R, A8 + + fpmadd C1, T1, T1, C1 + fpmadd C2, T2, T2, C2 + fpmadd C3, T3, T3, C3 + fpmadd C4, T4, T4, C4 + .align 4 + +LL(125): + andi. r0, N, 7 + beq LL(998) + + andi. r0, N, 4 + beq LL(126) + + LFDUX A1, XX, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, XX, INCX2 + LFDUX A4, X2, INCX2 + LFSDUX A1, XX, INCX2 + LFSDUX A2, X2, INCX2 + LFSDUX A3, XX, INCX2 + LFSDUX A4, X2, INCX2 + + fpmul A1, ALPHA_R, A1 + fpmul A2, ALPHA_R, A2 + fpmul A3, ALPHA_R, A3 + fpmul A4, ALPHA_R, A4 + + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + fpmadd C3, A3, A3, C3 + fpmadd C4, A4, A4, C4 + .align 4 + +LL(126): + andi. r0, N, 2 + beq LL(127) + + LFDUX A1, XX, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, XX, INCX2 + LFDUX A4, X2, INCX2 + + fmul A1, ALPHA_R, A1 + fmul A2, ALPHA_R, A2 + fmul A3, ALPHA_R, A3 + fmul A4, ALPHA_R, A4 + + fmadd C1, A1, A1, C1 + fmadd C2, A2, A2, C2 + fmadd C3, A3, A3, C3 + fmadd C4, A4, A4, C4 + .align 4 + +LL(127): + andi. r0, N, 1 + beq LL(998) + + LFDUX A1, XX, INCX2 + LFDUX A2, X2, INCX2 + + fmul A1, ALPHA_R, A1 + fmul A2, ALPHA_R, A2 + fmadd C1, A1, A1, C1 + fmadd C2, A2, A2, C2 + .align 4 + +LL(998): + fpadd C1, C1, C2 + lis r3, 0x3f00 + fpadd C3, C3, C4 + lis r4, 0x4040 + + stw r3, 4(SP) + stw r4, 8(SP) + + fpadd C1, C1, C3 + lfs f10, 0(SP) + fsmtp C2, C1 + lfs f11, 4(SP) + fadd C1, C2, C1 + lfs f12, 8(SP) + + fcmpu cr0, f10, C1 + beq cr0, LL(99) + +#ifndef HUMMER_EMULATOR + frsqrte f9, C1 + li r10, 16 + + fmul f2, f1, f9 + lfpdux f29, SP, r10 + fmul f3, f9, f11 + lfpdux f28, SP, r10 + fnmsub f7, f2, f9, f12 + lfpdux f27, SP, r10 + fmul f9, f3, f7 + lfpdux f26, SP, r10 + fadd f13, f11, f11 + lfpdux f25, SP, r10 + fmul f12, f1, f9 + lfpdux f24, SP, r10 + fmul f11, f12, f11 + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + fnmsub f1, f12, f9, f13 + lfpdux f21, SP, r10 + + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + fmadd f1, f11, f1, f12 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + fmul C1, ALPHA, C1 + blr +#else + fsqrt C1, C1 + li r10, 16 + + lfpdux f29, SP, r10 + lfpdux f28, SP, r10 + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + fmul C1, ALPHA, C1 + addi SP, SP, 16 + blr +#endif + .align 4 + +LL(999): + li r10, 16 + + lfpdux f29, SP, r10 + lfpdux f28, SP, r10 + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + EPILOGUE diff --git a/kernel/power/znrm2_ppc440.S b/kernel/power/znrm2_ppc440.S new file mode 100644 index 0000000..3542279 --- /dev/null +++ b/kernel/power/znrm2_ppc440.S @@ -0,0 +1,564 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define NN r6 +#define XX r7 +#define INC1 r9 +#define PRE r10 + +#define FZERO 144(SP) +#define FONE 148(SP) +#define FMAX 152(SP) +#define C1 156(SP) +#define C2 160(SP) + +#define STACKSIZE 168 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r10, 0 + lis r11, 0x3f80 + lis r12, 0x5fe0 + lis r6, 0x3f00 + lis r7, 0x4040 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r10, FZERO + stw r11, FONE + stw r12, FMAX + stw r10, 4 + FMAX + stw r6, C1 + stw r7, C2 + + lfs f1, FZERO + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + sub X, X, INCX + li INC1, SIZE + + li PRE, 3 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(999) + cmpwi cr0, INCX, 0 + ble- LL(999) + + mr NN, N + mr XX, X + + LFDUX f0, X, INCX + LFDX f1, X, INC1 + + fabs f2, f0 + fabs f3, f1 + fabs f4, f0 + fabs f5, f1 + fabs f6, f0 + fabs f7, f1 + fabs f0, f0 + fabs f1, f1 + + subi N, N, 1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(50) + + LFDUX f24, X, INCX + LFDX f25, X, INC1 + LFDUX f26, X, INCX + LFDX f27, X, INC1 + LFDUX f28, X, INCX + LFDX f29, X, INC1 + LFDUX f30, X, INCX + LFDX f31, X, INC1 + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDX f25, X, INC1 + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDX f27, X, INC1 + + fabs f12, f28 + LFDUX f28, X, INCX + fabs f13, f29 + LFDX f29, X, INC1 + fabs f14, f30 + LFDUX f30, X, INCX + fabs f15, f31 + LFDX f31, X, INC1 + bdz LL(20) + .align 4 + +LL(10): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 +#ifdef PPCG4 + dcbt X, PRE +#endif + fabs f8, f24 + LFDUX f24, X, INCX + fsel f1, f17, f1, f9 + fabs f9, f25 + LFDX f25, X, INC1 + fsel f2, f18, f2, f10 + fabs f10, f26 + LFDUX f26, X, INCX + fsel f3, f19, f3, f11 + fabs f11, f27 + LFDX f27, X, INC1 + + fsel f4, f20, f4, f12 +#ifdef PPCG4 + dcbt X, PRE +#endif + fabs f12, f28 + LFDUX f28, X, INCX + fsel f5, f21, f5, f13 + fabs f13, f29 + LFDX f29, X, INC1 + fsel f6, f22, f6, f14 + fabs f14, f30 + LFDUX f30, X, INCX + fsel f7, f23, f7, f15 + fabs f15, f31 + LFDX f31, X, INC1 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 +#ifdef PPCG4 + dcbt X, PRE +#endif + fabs f8, f24 + LFDUX f24, X, INCX + fsel f1, f17, f1, f9 + fabs f9, f25 + LFDX f25, X, INC1 + fsel f2, f18, f2, f10 + fabs f10, f26 + LFDUX f26, X, INCX + fsel f3, f19, f3, f11 + fabs f11, f27 + LFDX f27, X, INC1 + + fsel f4, f20, f4, f12 +#ifdef PPCG4 + dcbt X, PRE +#endif + fabs f12, f28 + LFDUX f28, X, INCX + fsel f5, f21, f5, f13 + fabs f13, f29 + LFDX f29, X, INC1 + fsel f6, f22, f6, f14 + fabs f14, f30 + LFDUX f30, X, INCX + fsel f7, f23, f7, f15 + fabs f15, f31 + LFDX f31, X, INC1 + bdnz LL(10) + .align 4 + +LL(20): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(99) + .align 4 + +LL(60): + LFDUX f8, X, INCX + LFDX f9, X, INC1 + + fabs f8, f8 + fabs f9, f9 + fsub f16, f0, f8 + fsub f17, f1, f9 + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + bdnz LL(60) + .align 4 + +LL(99): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f31, f8, f0, f4 + + lfs f1, FZERO + lfs f0, FONE + + fcmpu cr0, f1, f31 + beq- cr0, LL(999) + + fdiv f30, f0, f31 + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + fmr f6, f1 + fmr f7, f1 + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- cr0, LL(150) + + LFDUX f8, XX, INCX + LFDX f9, XX, INC1 + LFDUX f10, XX, INCX + LFDX f11, XX, INC1 + LFDUX f12, XX, INCX + LFDX f13, XX, INC1 + LFDUX f14, XX, INCX + LFDX f15, XX, INC1 + + fmul f16, f30, f8 + LFDUX f8, XX, INCX + fmul f17, f30, f9 + LFDX f9, XX, INC1 + fmul f18, f30, f10 + LFDUX f10, XX, INCX + fmul f19, f30, f11 + LFDX f11, XX, INC1 + + fmul f20, f30, f12 + LFDUX f12, XX, INCX + fmul f21, f30, f13 + LFDX f13, XX, INC1 + fmul f22, f30, f14 + LFDUX f14, XX, INCX + fmul f23, f30, f15 + LFDX f15, XX, INC1 + bdz LL(120) + .align 4 + +LL(110): + fmadd f0, f16, f16, f0 +#ifdef PPCG4 + dcbt XX, PRE +#endif + fmul f16, f30, f8 + LFDUX f8, XX, INCX + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + LFDX f9, XX, INC1 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + LFDUX f10, XX, INCX + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + LFDX f11, XX, INC1 + + fmadd f4, f20, f20, f4 +#ifdef PPCG4 + dcbt XX, PRE +#endif + fmul f20, f30, f12 + LFDUX f12, XX, INCX + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + LFDX f13, XX, INC1 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + LFDUX f14, XX, INCX + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + LFDX f15, XX, INC1 + + fmadd f0, f16, f16, f0 +#ifdef PPCG4 + dcbt XX, PRE +#endif + fmul f16, f30, f8 + LFDUX f8, XX, INCX + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + LFDX f9, XX, INC1 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + LFDUX f10, XX, INCX + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + LFDX f11, XX, INC1 + + fmadd f4, f20, f20, f4 +#ifdef PPCG4 + dcbt XX, PRE +#endif + fmul f20, f30, f12 + LFDUX f12, XX, INCX + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + LFDX f13, XX, INC1 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + LFDUX f14, XX, INCX + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + LFDX f15, XX, INC1 + bdnz LL(110) + .align 4 + +LL(120): + fmadd f0, f16, f16, f0 + fmul f16, f30, f8 + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + + fmadd f4, f20, f20, f4 + fmul f20, f30, f12 + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + .align 4 + +LL(150): + andi. r0, NN, 7 + mtspr CTR, r0 + beq- cr0, LL(170) + .align 4 + +LL(160): + LFDUX f8, XX, INCX + LFDX f9, XX, INC1 + + fmul f16, f30, f8 + fmul f17, f30, f9 + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + bdnz LL(160) + .align 4 + +LL(170): + fadd f0, f0, f1 + fadd f2, f2, f3 + fadd f4, f4, f5 + fadd f6, f6, f7 + + fadd f0, f0, f2 + fadd f4, f4, f6 + + fadd f1, f0, f4 + + frsqrte f0, f1 + lfs f8, C1 + lfs f9, C2 + + fmul f2, f1, f0 + fadd f7, f8, f8 + fmul f3, f0, f8 + fnmsub f4, f2, f0, f9 + fmul f0, f3, f4 + + fmul f2, f1, f0 + fmul f3, f0, f8 + fnmsub f4, f2, f0, f9 + fmul f0, f3, f4 + + fmul f2, f1, f0 + fmul f3, f0, f8 + fnmsub f4, f2, f0, f9 + fmul f0, f3, f4 + + fmul f5, f1, f0 + fmul f2, f5, f8 + fnmsub f3, f5, f0, f7 + fmadd f1, f2, f3, f5 + fmul f1, f31, f1 + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zrot.S b/kernel/power/zrot.S new file mode 100644 index 0000000..aad28af --- /dev/null +++ b/kernel/power/zrot.S @@ -0,0 +1,595 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 +#define PREA r8 +#define XX r9 +#define YY r10 + +#define INCXM1 r11 +#define INCYM1 r12 + +#define C f1 +#define S f2 + +#define STACKSIZE 32 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + + subi INCXM1, INCX, SIZE + subi INCYM1, INCY, SIZE + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(100) + cmpwi cr0, INCY, 2 * SIZE + bne- cr0, LL(100) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + + LFD f0, 0 * SIZE(X) + LFD f4, 1 * SIZE(X) + LFD f6, 2 * SIZE(X) + LFD f8, 3 * SIZE(X) + + LFD f3, 0 * SIZE(Y) + LFD f5, 1 * SIZE(Y) + LFD f7, 2 * SIZE(Y) + LFD f9, 3 * SIZE(Y) + bdz LL(12) + .align 4 + +LL(10): + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + LFD f0, 4 * SIZE(X) + LFD f4, 5 * SIZE(X) + LFD f6, 6 * SIZE(X) + LFD f8, 7 * SIZE(X) + + LFD f3, 4 * SIZE(Y) + LFD f5, 5 * SIZE(Y) + LFD f7, 6 * SIZE(Y) + LFD f9, 7 * SIZE(Y) + + STFD f10, 0 * SIZE(X) + STFD f12, 1 * SIZE(X) + STFD f14, 2 * SIZE(X) + STFD f16, 3 * SIZE(X) + + STFD f11, 0 * SIZE(Y) + STFD f13, 1 * SIZE(Y) + STFD f15, 2 * SIZE(Y) + STFD f17, 3 * SIZE(Y) + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + LFD f0, 8 * SIZE(X) + LFD f4, 9 * SIZE(X) + LFD f6, 10 * SIZE(X) + LFD f8, 11 * SIZE(X) + + LFD f3, 8 * SIZE(Y) + LFD f5, 9 * SIZE(Y) + LFD f7, 10 * SIZE(Y) + LFD f9, 11 * SIZE(Y) + + STFD f10, 4 * SIZE(X) + STFD f12, 5 * SIZE(X) + STFD f14, 6 * SIZE(X) + STFD f16, 7 * SIZE(X) + + STFD f11, 4 * SIZE(Y) + STFD f13, 5 * SIZE(Y) + STFD f15, 6 * SIZE(Y) + STFD f17, 7 * SIZE(Y) + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + LFD f0, 12 * SIZE(X) + LFD f4, 13 * SIZE(X) + LFD f6, 14 * SIZE(X) + LFD f8, 15 * SIZE(X) + + LFD f3, 12 * SIZE(Y) + LFD f5, 13 * SIZE(Y) + LFD f7, 14 * SIZE(Y) + LFD f9, 15 * SIZE(Y) + + STFD f10, 8 * SIZE(X) + STFD f12, 9 * SIZE(X) + STFD f14, 10 * SIZE(X) + STFD f16, 11 * SIZE(X) + + STFD f11, 8 * SIZE(Y) + STFD f13, 9 * SIZE(Y) + STFD f15, 10 * SIZE(Y) + STFD f17, 11 * SIZE(Y) + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + LFD f0, 16 * SIZE(X) + LFD f4, 17 * SIZE(X) + LFD f6, 18 * SIZE(X) + LFD f8, 19 * SIZE(X) + + LFD f3, 16 * SIZE(Y) + LFD f5, 17 * SIZE(Y) + LFD f7, 18 * SIZE(Y) + LFD f9, 19 * SIZE(Y) + + STFD f10, 12 * SIZE(X) + STFD f12, 13 * SIZE(X) + STFD f14, 14 * SIZE(X) + STFD f16, 15 * SIZE(X) + + STFD f11, 12 * SIZE(Y) + STFD f13, 13 * SIZE(Y) + STFD f15, 14 * SIZE(Y) + STFD f17, 15 * SIZE(Y) + +#ifndef POWER6 + dcbtst X, PREA +#endif + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + +#ifdef POWER6 + dcbtst X, PREA + dcbtst X, PREA +#endif + bdnz LL(10) + .align 4 + +LL(12): + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + STFD f10, 0 * SIZE(X) + STFD f12, 1 * SIZE(X) + STFD f14, 2 * SIZE(X) + STFD f16, 3 * SIZE(X) + + STFD f11, 0 * SIZE(Y) + STFD f13, 1 * SIZE(Y) + STFD f15, 2 * SIZE(Y) + STFD f17, 3 * SIZE(Y) + + LFD f0, 4 * SIZE(X) + LFD f4, 5 * SIZE(X) + LFD f6, 6 * SIZE(X) + LFD f8, 7 * SIZE(X) + + LFD f3, 4 * SIZE(Y) + LFD f5, 5 * SIZE(Y) + LFD f7, 6 * SIZE(Y) + LFD f9, 7 * SIZE(Y) + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + STFD f10, 4 * SIZE(X) + STFD f12, 5 * SIZE(X) + STFD f14, 6 * SIZE(X) + STFD f16, 7 * SIZE(X) + + STFD f11, 4 * SIZE(Y) + STFD f13, 5 * SIZE(Y) + STFD f15, 6 * SIZE(Y) + STFD f17, 7 * SIZE(Y) + + LFD f0, 8 * SIZE(X) + LFD f4, 9 * SIZE(X) + LFD f6, 10 * SIZE(X) + LFD f8, 11 * SIZE(X) + + LFD f3, 8 * SIZE(Y) + LFD f5, 9 * SIZE(Y) + LFD f7, 10 * SIZE(Y) + LFD f9, 11 * SIZE(Y) + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + STFD f10, 8 * SIZE(X) + STFD f12, 9 * SIZE(X) + STFD f14, 10 * SIZE(X) + STFD f16, 11 * SIZE(X) + + STFD f11, 8 * SIZE(Y) + STFD f13, 9 * SIZE(Y) + STFD f15, 10 * SIZE(Y) + STFD f17, 11 * SIZE(Y) + + LFD f0, 12 * SIZE(X) + LFD f4, 13 * SIZE(X) + LFD f6, 14 * SIZE(X) + LFD f8, 15 * SIZE(X) + + LFD f3, 12 * SIZE(Y) + LFD f5, 13 * SIZE(Y) + LFD f7, 14 * SIZE(Y) + LFD f9, 15 * SIZE(Y) + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + STFD f10, 12 * SIZE(X) + STFD f12, 13 * SIZE(X) + STFD f14, 14 * SIZE(X) + STFD f16, 15 * SIZE(X) + + STFD f11, 12 * SIZE(Y) + STFD f13, 13 * SIZE(Y) + STFD f15, 14 * SIZE(Y) + STFD f17, 15 * SIZE(Y) + + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f3, 0 * SIZE(X) + LFD f4, 0 * SIZE(Y) + LFD f5, 1 * SIZE(X) + LFD f6, 1 * SIZE(Y) + + FMUL f10, C, f3 + FMUL f11, C, f4 + FMUL f12, C, f5 + FMUL f13, C, f6 + + FMADD f10, S, f4, f10 + FNMSUB f11, S, f3, f11 + FMADD f12, S, f6, f12 + FNMSUB f13, S, f5, f13 + + STFD f10, 0 * SIZE(X) + STFD f11, 0 * SIZE(Y) + STFD f12, 1 * SIZE(X) + STFD f13, 1 * SIZE(Y) + + addi X, X, 2 * SIZE + addi Y, Y, 2 * SIZE + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCXM1 + sub Y, Y, INCYM1 + + mr XX, X + mr YY, Y + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(150) + .align 4 + +LL(110): + LFDX f0, X, INCXM1 + LFDX f3, Y, INCYM1 + LFDUX f4, X, INCX + LFDUX f5, Y, INCY + LFDX f6, X, INCXM1 + LFDX f7, Y, INCYM1 + LFDUX f8, X, INCX + LFDUX f9, Y, INCY + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + STFDX f10, XX, INCXM1 + STFDX f11, YY, INCYM1 + STFDUX f12, XX, INCX + STFDUX f13, YY, INCY + STFDX f14, XX, INCXM1 + STFDX f15, YY, INCYM1 + STFDUX f16, XX, INCX + STFDUX f17, YY, INCY + + LFDX f0, X, INCXM1 + LFDX f3, Y, INCYM1 + LFDUX f4, X, INCX + LFDUX f5, Y, INCY + LFDX f6, X, INCXM1 + LFDX f7, Y, INCYM1 + LFDUX f8, X, INCX + LFDUX f9, Y, INCY + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + STFDX f10, XX, INCXM1 + STFDX f11, YY, INCYM1 + STFDUX f12, XX, INCX + STFDUX f13, YY, INCY + STFDX f14, XX, INCXM1 + STFDX f15, YY, INCYM1 + STFDUX f16, XX, INCX + STFDUX f17, YY, INCY + + bdnz LL(110) + .align 4 + +LL(150): + andi. r0, N, 3 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f0, X, INCXM1 + LFDX f3, Y, INCYM1 + LFDUX f4, X, INCX + LFDUX f5, Y, INCY + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + STFDX f10, XX, INCXM1 + STFDX f11, YY, INCYM1 + STFDUX f12, XX, INCX + STFDUX f13, YY, INCY + + bdnz LL(160) + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE + +#endif diff --git a/kernel/power/zrot_ppc440.S b/kernel/power/zrot_ppc440.S new file mode 100644 index 0000000..fe1a99d --- /dev/null +++ b/kernel/power/zrot_ppc440.S @@ -0,0 +1,301 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 +#define PRE r8 +#define XX r9 +#define YY r10 + +#define INCXM1 r11 +#define INCYM1 r12 + +#define C f1 +#define S f2 + +#define STACKSIZE 32 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + + subi INCXM1, INCX, SIZE + subi INCYM1, INCY, SIZE + + li PRE, 2 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(999) + + sub X, X, INCXM1 + sub Y, Y, INCYM1 + + mr XX, X + mr YY, Y + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(150) + + LFDX f0, X, INCXM1 + LFDX f3, Y, INCYM1 + LFDUX f4, X, INCX + + FMUL f10, C, f0 + LFDUX f5, Y, INCY + FMUL f11, C, f3 + LFDX f6, X, INCXM1 + FMUL f12, C, f4 + LFDX f7, Y, INCYM1 + FMUL f13, C, f5 + LFDUX f8, X, INCX + + FMADD f10, S, f3, f10 + LFDUX f9, Y, INCY + FNMSUB f11, S, f0, f11 + LFDX f0, X, INCXM1 + FMADD f12, S, f5, f12 + LFDX f3, Y, INCYM1 + FNMSUB f13, S, f4, f13 + LFDUX f4, X, INCX + bdz LL(111) + .align 4 + +LL(110): + FMUL f14, C, f6 + LFDUX f5, Y, INCY + FMUL f15, C, f7 + STFDX f10, XX, INCXM1 + FMUL f16, C, f8 + STFDX f11, YY, INCYM1 + FMUL f17, C, f9 + STFDUX f12, XX, INCX + +#ifdef PPCG4 + dcbtst X, PRE +#endif + + FMADD f14, S, f7, f14 + STFDUX f13, YY, INCY + FNMSUB f15, S, f6, f15 + LFDX f6, X, INCXM1 + FMADD f16, S, f9, f16 + LFDX f7, Y, INCYM1 + FNMSUB f17, S, f8, f17 + LFDUX f8, X, INCX + + FMUL f10, C, f0 + LFDUX f9, Y, INCY + FMUL f11, C, f3 + STFDX f14, XX, INCXM1 + FMUL f12, C, f4 + STFDX f15, YY, INCYM1 + FMUL f13, C, f5 + STFDUX f16, XX, INCX + +#ifdef PPCG4 + dcbtst Y, PRE +#endif + + FMADD f10, S, f3, f10 + STFDUX f17, YY, INCY + FNMSUB f11, S, f0, f11 + LFDX f0, X, INCXM1 + FMADD f12, S, f5, f12 + LFDX f3, Y, INCYM1 + FNMSUB f13, S, f4, f13 + LFDUX f4, X, INCX + + FMUL f14, C, f6 + LFDUX f5, Y, INCY + FMUL f15, C, f7 + STFDX f10, XX, INCXM1 + FMUL f16, C, f8 + STFDX f11, YY, INCYM1 + FMUL f17, C, f9 + STFDUX f12, XX, INCX + +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + + FMADD f14, S, f7, f14 + STFDUX f13, YY, INCY + FNMSUB f15, S, f6, f15 + LFDX f6, X, INCXM1 + FMADD f16, S, f9, f16 + LFDX f7, Y, INCYM1 + FNMSUB f17, S, f8, f17 + LFDUX f8, X, INCX + + FMUL f10, C, f0 + STFDX f14, XX, INCXM1 + FMUL f11, C, f3 + STFDX f15, YY, INCYM1 + FMUL f12, C, f4 + STFDUX f16, XX, INCX + FMUL f13, C, f5 + STFDUX f17, YY, INCY + +#if defined(PPCG4) && defined(DOUBLE) + dcbtst Y, PRE +#endif + + FMADD f10, S, f3, f10 + LFDUX f9, Y, INCY + FNMSUB f11, S, f0, f11 + LFDX f0, X, INCXM1 + FMADD f12, S, f5, f12 + LFDX f3, Y, INCYM1 + FNMSUB f13, S, f4, f13 + LFDUX f4, X, INCX + bdnz LL(110) + .align 4 + + +LL(111): + FMUL f14, C, f6 + LFDUX f5, Y, INCY + FMUL f15, C, f7 + STFDX f10, XX, INCXM1 + FMUL f16, C, f8 + STFDX f11, YY, INCYM1 + FMUL f17, C, f9 + STFDUX f12, XX, INCX + + FMADD f14, S, f7, f14 + STFDUX f13, YY, INCY + FNMSUB f15, S, f6, f15 + LFDX f6, X, INCXM1 + FMADD f16, S, f9, f16 + LFDX f7, Y, INCYM1 + FNMSUB f17, S, f8, f17 + LFDUX f8, X, INCX + + FMUL f10, C, f0 + LFDUX f9, Y, INCY + FMUL f11, C, f3 + STFDX f14, XX, INCXM1 + FMUL f12, C, f4 + STFDX f15, YY, INCYM1 + FMUL f13, C, f5 + STFDUX f16, XX, INCX + + FMADD f10, S, f3, f10 + STFDUX f17, YY, INCY + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMUL f14, C, f6 + STFDX f10, XX, INCXM1 + FMUL f15, C, f7 + STFDX f11, YY, INCYM1 + FMUL f16, C, f8 + STFDUX f12, XX, INCX + FMUL f17, C, f9 + STFDUX f13, YY, INCY + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + STFDX f14, XX, INCXM1 + STFDX f15, YY, INCYM1 + STFDUX f16, XX, INCX + STFDUX f17, YY, INCY + .align 4 + + +LL(150): + andi. r0, N, 3 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f0, X, INCXM1 + LFDX f3, Y, INCYM1 + LFDUX f4, X, INCX + LFDUX f5, Y, INCY + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + STFDX f10, XX, INCXM1 + STFDX f11, YY, INCYM1 + STFDUX f12, XX, INCX + STFDUX f13, YY, INCY + + bdnz LL(160) + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zscal.S b/kernel/power/zscal.S new file mode 100644 index 0000000..7ffa80f --- /dev/null +++ b/kernel/power/zscal.S @@ -0,0 +1,385 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define XX r4 +#define PREA r5 + +#ifdef linux +#ifndef __64BIT__ +#define X r6 +#define INCX r7 +#else +#define X r8 +#define INCX r9 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define X r10 +#define INCX r8 +#else +#define X r8 +#define INCX r9 +#endif +#endif + +#define FZERO f0 +#define ALPHA_R f1 +#define ALPHA_I f2 + + PROLOGUE + PROFCODE + + addi SP, SP, -8 + li r0, 0 + + stw r0, 0(SP) + lfs FZERO, 0(SP) + addi SP, SP, 8 + +#if (defined(_AIX) || defined(__APPLE__)) && !defined(__64BIT__) && defined(DOUBLE) + lwz INCX, 56(SP) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + blelr- cr0 + + fcmpu cr0, FZERO, ALPHA_R + bne- cr0, LL(A1I1) + + fcmpu cr0, FZERO, ALPHA_I + bne- cr0, LL(A1I1) + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(A0IN) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(A0I1_Remain) + .align 4 + +LL(A0I1_kernel): + STFD FZERO, 0 * SIZE(X) + STFD FZERO, 1 * SIZE(X) + STFD FZERO, 2 * SIZE(X) + STFD FZERO, 3 * SIZE(X) + STFD FZERO, 4 * SIZE(X) + STFD FZERO, 5 * SIZE(X) + STFD FZERO, 6 * SIZE(X) + STFD FZERO, 7 * SIZE(X) + + STFD FZERO, 8 * SIZE(X) + STFD FZERO, 9 * SIZE(X) + STFD FZERO, 10 * SIZE(X) + STFD FZERO, 11 * SIZE(X) + STFD FZERO, 12 * SIZE(X) + STFD FZERO, 13 * SIZE(X) + STFD FZERO, 14 * SIZE(X) + STFD FZERO, 15 * SIZE(X) + + addi X, X, 16 * SIZE + bdnz LL(A0I1_kernel) + .align 4 + +LL(A0I1_Remain): + andi. r0, N, 7 + mtspr CTR, r0 + beqlr+ + .align 4 + +LL(A0I1_RemainKernel): + STFD FZERO, 0 * SIZE(X) + STFD FZERO, 1 * SIZE(X) + addi X, X, 2 * SIZE + bdnz LL(A0I1_RemainKernel) + blr + .align 4 + +LL(A0IN): + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(A0IN_Remain) + .align 4 + +LL(A0IN_Kernel): + dcbtst X, PREA + STFD FZERO, 0 * SIZE(X) + STFD FZERO, 1 * SIZE(X) + add X, X, INCX + STFD FZERO, 0 * SIZE(X) + STFD FZERO, 1 * SIZE(X) + add X, X, INCX + STFD FZERO, 0 * SIZE(X) + STFD FZERO, 1 * SIZE(X) + add X, X, INCX + STFD FZERO, 0 * SIZE(X) + STFD FZERO, 1 * SIZE(X) + add X, X, INCX + STFD FZERO, 0 * SIZE(X) + STFD FZERO, 1 * SIZE(X) + add X, X, INCX + STFD FZERO, 0 * SIZE(X) + STFD FZERO, 1 * SIZE(X) + add X, X, INCX + STFD FZERO, 0 * SIZE(X) + STFD FZERO, 1 * SIZE(X) + add X, X, INCX + STFD FZERO, 0 * SIZE(X) + STFD FZERO, 1 * SIZE(X) + add X, X, INCX + bdnz LL(A0IN_Kernel) + .align 4 + +LL(A0IN_Remain): + andi. r0, N, 7 + mtspr CTR, r0 + beqlr+ + .align 4 + +LL(A0IN_RemainKernel): + STFD FZERO, 0 * SIZE(X) + STFD FZERO, 1 * SIZE(X) + add X, X, INCX + bdnz LL(A0IN_RemainKernel) + blr + .align 4 + +LL(A1I1): + cmpwi cr0, INCX, 2 * SIZE + bne- LL(A1IN) + + mr XX, X + srawi. r0, N, 3 + mtspr CTR, r0 + beq+ LL(A1I1_Remain) + .align 4 + +LL(A1I1_kernel): + LFD f3, 0 * SIZE(X) + LFD f4, 1 * SIZE(X) + LFD f5, 2 * SIZE(X) + LFD f6, 3 * SIZE(X) + LFD f7, 4 * SIZE(X) + LFD f8, 5 * SIZE(X) + LFD f9, 6 * SIZE(X) + LFD f10, 7 * SIZE(X) + + FMUL f0, ALPHA_I, f4 + FMUL f4, ALPHA_R, f4 + FMUL f11, ALPHA_I, f6 + FMUL f6, ALPHA_R, f6 + + FMUL f12, ALPHA_I, f8 + FMUL f8, ALPHA_R, f8 + FMUL f13, ALPHA_I, f10 + FMUL f10, ALPHA_R, f10 + + FMADD f4, ALPHA_I, f3, f4 + FMSUB f3, ALPHA_R, f3, f0 + FMADD f6, ALPHA_I, f5, f6 + FMSUB f5, ALPHA_R, f5, f11 + + FMADD f8, ALPHA_I, f7, f8 + FMSUB f7, ALPHA_R, f7, f12 + FMADD f10, ALPHA_I, f9, f10 + FMSUB f9, ALPHA_R, f9, f13 + + STFD f3, 0 * SIZE(X) + STFD f4, 1 * SIZE(X) + STFD f5, 2 * SIZE(X) + STFD f6, 3 * SIZE(X) + STFD f7, 4 * SIZE(X) + STFD f8, 5 * SIZE(X) + STFD f9, 6 * SIZE(X) + STFD f10, 7 * SIZE(X) + + LFD f3, 8 * SIZE(X) + LFD f4, 9 * SIZE(X) + LFD f5, 10 * SIZE(X) + LFD f6, 11 * SIZE(X) + LFD f7, 12 * SIZE(X) + LFD f8, 13 * SIZE(X) + LFD f9, 14 * SIZE(X) + LFD f10,15 * SIZE(X) + + FMUL f0, ALPHA_I, f4 + FMUL f4, ALPHA_R, f4 + FMUL f11, ALPHA_I, f6 + FMUL f6, ALPHA_R, f6 + + FMUL f12, ALPHA_I, f8 + FMUL f8, ALPHA_R, f8 + FMUL f13, ALPHA_I, f10 + FMUL f10, ALPHA_R, f10 + + FMADD f4, ALPHA_I, f3, f4 + FMSUB f3, ALPHA_R, f3, f0 + FMADD f6, ALPHA_I, f5, f6 + FMSUB f5, ALPHA_R, f5, f11 + + FMADD f8, ALPHA_I, f7, f8 + FMSUB f7, ALPHA_R, f7, f12 + FMADD f10, ALPHA_I, f9, f10 + FMSUB f9, ALPHA_R, f9, f13 + + STFD f3, 8 * SIZE(X) + STFD f4, 9 * SIZE(X) + STFD f5, 10 * SIZE(X) + STFD f6, 11 * SIZE(X) + STFD f7, 12 * SIZE(X) + STFD f8, 13 * SIZE(X) + STFD f9, 14 * SIZE(X) + STFD f10,15 * SIZE(X) + + addi X, X, 16 * SIZE + dcbtst X, PREA + bdnz LL(A1I1_kernel) + .align 4 + +LL(A1I1_Remain): + andi. r0, N, 7 + mtspr CTR, r0 + beqlr+ + .align 4 + +LL(A1I1_RemainKernel): + LFD f3, 0 * SIZE(X) + LFD f4, 1 * SIZE(X) + + FMUL f5, ALPHA_I, f4 + FMUL f4, ALPHA_R, f4 + FMADD f4, ALPHA_I, f3, f4 + FMSUB f3, ALPHA_R, f3, f5 + + STFD f3, 0 * SIZE(X) + STFD f4, 1 * SIZE(X) + addi X, X, 2 * SIZE + bdnz LL(A1I1_RemainKernel) + blr + .align 4 + +LL(A1IN): + mr XX, X + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(A1IN_Remain) + .align 4 + +LL(A1IN_Kernel): + LFD f3, 0 * SIZE(XX) + LFD f4, 1 * SIZE(XX) + add XX, XX, INCX + LFD f5, 0 * SIZE(XX) + LFD f6, 1 * SIZE(XX) + add XX, XX, INCX + LFD f7, 0 * SIZE(XX) + LFD f8, 1 * SIZE(XX) + add XX, XX, INCX + LFD f9, 0 * SIZE(XX) + LFD f10, 1 * SIZE(XX) + add XX, XX, INCX + + FMUL f0, ALPHA_I, f4 + FMUL f4, ALPHA_R, f4 + FMUL f11, ALPHA_I, f6 + FMUL f6, ALPHA_R, f6 + + FMUL f12, ALPHA_I, f8 + FMUL f8, ALPHA_R, f8 + FMUL f13, ALPHA_I, f10 + FMUL f10, ALPHA_R, f10 + + FMADD f4, ALPHA_I, f3, f4 + FMSUB f3, ALPHA_R, f3, f0 + FMADD f6, ALPHA_I, f5, f6 + FMSUB f5, ALPHA_R, f5, f11 + + FMADD f8, ALPHA_I, f7, f8 + FMSUB f7, ALPHA_R, f7, f12 + FMADD f10, ALPHA_I, f9, f10 + FMSUB f9, ALPHA_R, f9, f13 + + STFD f3, 0 * SIZE(X) + STFD f4, 1 * SIZE(X) + add X, X, INCX + STFD f5, 0 * SIZE(X) + STFD f6, 1 * SIZE(X) + add X, X, INCX + STFD f7, 0 * SIZE(X) + STFD f8, 1 * SIZE(X) + add X, X, INCX + STFD f9, 0 * SIZE(X) + STFD f10, 1 * SIZE(X) + add X, X, INCX + dcbtst X, PREA + bdnz LL(A1IN_Kernel) + .align 4 + +LL(A1IN_Remain): + andi. r0, N, 3 + mtspr CTR, r0 + beqlr+ + .align 4 + +LL(A1IN_RemainKernel): + LFD f3, 0 * SIZE(XX) + LFD f4, 1 * SIZE(XX) + add XX, XX, INCX + + FMUL f5, ALPHA_I, f4 + FMUL f4, ALPHA_R, f4 + FMADD f4, ALPHA_I, f3, f4 + FMSUB f3, ALPHA_R, f3, f5 + + STFD f3, 0 * SIZE(X) + STFD f4, 1 * SIZE(X) + add X, X, INCX + bdnz LL(A1IN_RemainKernel) + blr + + EPILOGUE diff --git a/kernel/power/zscal_hummer.S b/kernel/power/zscal_hummer.S new file mode 100644 index 0000000..6c559f3 --- /dev/null +++ b/kernel/power/zscal_hummer.S @@ -0,0 +1,871 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r6 +#define INCX r7 + +#define INCX2 r4 +#define XX r5 +#define Y r8 +#define YY r9 + +#define ALPHA f1 +#define ALPHA_I f2 + +#define A1 f0 +#define A2 f16 +#define A3 f17 +#define A4 f3 +#define A5 f4 +#define A6 f5 +#define A7 f6 +#define A8 f7 + +#define B1 f8 +#define B2 f9 +#define B3 f10 +#define B4 f11 +#define B5 f12 +#define B6 f13 +#define B7 f14 +#define B8 f15 + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + + li r10, 0 + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + + lfpdx A1, SP, r10 # Zero clear + fsmfp ALPHA, ALPHA_I + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + cmpwi cr0, N, 0 + ble LL(999) + + cmpwi cr0, INCX, SIZE + bne LL(100) + + fcmpu cr7, ALPHA, A1 + bne cr7, LL(50) + + fscmp cr7, ALPHA, A1 + bne cr7, LL(50) + + andi. r0, X, 2 * SIZE - 1 + bne LL(20) + + sub X, X, INCX2 + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(15) + .align 4 + +LL(12): + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + bdnz LL(12) + .align 4 + +LL(15): + andi. r0, N, 3 + beq LL(999) + andi. r0, N, 2 + beq LL(17) + + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + .align 4 + +LL(17): + andi. r0, N, 1 + beq LL(999) + + STFPDUX A1, X, INCX2 + b LL(999) + .align 4 + +LL(20): + sub X, X, INCX2 + + STFDX A1, X, INCX2 + addi X, X, SIZE + addi N, N, -1 + cmpwi cr0, N, 0 + ble LL(29) + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(25) + .align 4 + +LL(22): + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + bdnz LL(22) + .align 4 + +LL(25): + andi. r0, N, 3 + beq LL(29) + andi. r0, N, 2 + beq LL(27) + + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + .align 4 + +LL(27): + andi. r0, N, 1 + beq LL(29) + + STFPDUX A1, X, INCX2 + .align 4 + +LL(29): + STFDX A1, X, INCX2 + b LL(999) + .align 4 + +LL(50): + sub Y, X, INCX2 + sub X, X, INCX2 + + andi. r0, X, 2 * SIZE - 1 + bne LL(60) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(55) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + LFPDUX A5, X, INCX2 + fxpmul B1, ALPHA, A1 + LFPDUX A6, X, INCX2 + fxpmul B2, ALPHA, A2 + LFPDUX A7, X, INCX2 + fxpmul B3, ALPHA, A3 + LFPDUX A8, X, INCX2 + fxpmul B4, ALPHA, A4 + fxpmul B5, ALPHA, A5 + + fxcxnpma B1, ALPHA, A1, B1 + fxcxnpma B2, ALPHA, A2, B2 + bdz LL(53) + .align 4 + +LL(52): + fxcxnpma B3, ALPHA, A3, B3 + LFPDUX A1, X, INCX2 + fxpmul B6, ALPHA, A6 + STFPDUX B1, Y, INCX2 + + fxcxnpma B4, ALPHA, A4, B4 + LFPDUX A2, X, INCX2 + fxpmul B7, ALPHA, A7 + STFPDUX B2, Y, INCX2 + + fxcxnpma B5, ALPHA, A5, B5 + LFPDUX A3, X, INCX2 + fxpmul B8, ALPHA, A8 + STFPDUX B3, Y, INCX2 + + fxcxnpma B6, ALPHA, A6, B6 + LFPDUX A4, X, INCX2 + fxpmul B1, ALPHA, A1 + STFPDUX B4, Y, INCX2 + + fxcxnpma B7, ALPHA, A7, B7 + LFPDUX A5, X, INCX2 + fxpmul B2, ALPHA, A2 + STFPDUX B5, Y, INCX2 + + fxcxnpma B8, ALPHA, A8, B8 + LFPDUX A6, X, INCX2 + fxpmul B3, ALPHA, A3 + STFPDUX B6, Y, INCX2 + + fxcxnpma B1, ALPHA, A1, B1 + LFPDUX A7, X, INCX2 + fxpmul B4, ALPHA, A4 + STFPDUX B7, Y, INCX2 + + fxcxnpma B2, ALPHA, A2, B2 + LFPDUX A8, X, INCX2 + fxpmul B5, ALPHA, A5 + STFPDUX B8, Y, INCX2 + bdnz LL(52) + .align 4 + +LL(53): + fxcxnpma B3, ALPHA, A3, B3 + fxpmul B6, ALPHA, A6 + STFPDUX B1, Y, INCX2 + + fxcxnpma B4, ALPHA, A4, B4 + fxpmul B7, ALPHA, A7 + STFPDUX B2, Y, INCX2 + + fxcxnpma B5, ALPHA, A5, B5 + fxpmul B8, ALPHA, A8 + STFPDUX B3, Y, INCX2 + + fxcxnpma B6, ALPHA, A6, B6 + STFPDUX B4, Y, INCX2 + fxcxnpma B7, ALPHA, A7, B7 + STFPDUX B5, Y, INCX2 + fxcxnpma B8, ALPHA, A8, B8 + STFPDUX B6, Y, INCX2 + STFPDUX B7, Y, INCX2 + STFPDUX B8, Y, INCX2 + .align 4 + +LL(55): + andi. r0, N, 7 + beq LL(999) + + andi. r0, N, 4 + beq LL(56) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fxpmul B1, ALPHA, A1 + fxpmul B2, ALPHA, A2 + fxpmul B3, ALPHA, A3 + fxpmul B4, ALPHA, A4 + + fxcxnpma B1, ALPHA, A1, B1 + fxcxnpma B2, ALPHA, A2, B2 + fxcxnpma B3, ALPHA, A3, B3 + fxcxnpma B4, ALPHA, A4, B4 + + STFPDUX B1, Y, INCX2 + STFPDUX B2, Y, INCX2 + STFPDUX B3, Y, INCX2 + STFPDUX B4, Y, INCX2 + .align 4 + +LL(56): + andi. r0, N, 2 + beq LL(57) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + fxpmul B1, ALPHA, A1 + fxpmul B2, ALPHA, A2 + + fxcxnpma B1, ALPHA, A1, B1 + fxcxnpma B2, ALPHA, A2, B2 + + STFPDUX B1, Y, INCX2 + STFPDUX B2, Y, INCX2 + .align 4 + +LL(57): + andi. r0, N, 1 + beq LL(999) + + LFPDUX A1, X, INCX2 + + fxpmul B1, ALPHA, A1 + fxcxnpma B1, ALPHA, A1, B1 + + STFPDUX B1, Y, INCX2 + b LL(999) + .align 4 + +LL(60): + addi XX, X, SIZE + addi YY, Y, SIZE + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(65) + + LFDUX A1, X, INCX2 + LFDUX A2, XX, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, XX, INCX2 + + LFDUX A5, X, INCX2 + fmul B1, ALPHA, A1 + LFDUX A6, XX, INCX2 + fmul B2, ALPHA_I, A1 + LFDUX A7, X, INCX2 + fmul B3, ALPHA, A3 + LFDUX A8, XX, INCX2 + fmul B4, ALPHA_I, A3 + + fmul B5, ALPHA, A5 + fnmsub B1, ALPHA_I, A2, B1 + fmadd B2, ALPHA , A2, B2 + bdz LL(63) + .align 4 + +LL(62): + fnmsub B3, ALPHA_I, A4, B3 + LFDUX A1, X, INCX2 + fmul B6, ALPHA_I, A5 + STFDUX B1, Y, INCX2 + + fmadd B4, ALPHA , A4, B4 + LFDUX A2, XX, INCX2 + fmul B7, ALPHA, A7 + STFDUX B2, YY, INCX2 + + fnmsub B5, ALPHA_I, A6, B5 + LFDUX A3, X, INCX2 + fmul B8, ALPHA_I, A7 + STFDUX B3, Y, INCX2 + + fmadd B6, ALPHA , A6, B6 + LFDUX A4, XX, INCX2 + fmul B1, ALPHA, A1 + STFDUX B4, YY, INCX2 + + fnmsub B7, ALPHA_I, A8, B7 + LFDUX A5, X, INCX2 + fmul B2, ALPHA_I, A1 + STFDUX B5, Y, INCX2 + + fmadd B8, ALPHA , A8, B8 + LFDUX A6, XX, INCX2 + fmul B3, ALPHA, A3 + STFDUX B6, YY, INCX2 + + fnmsub B1, ALPHA_I, A2, B1 + LFDUX A7, X, INCX2 + fmul B4, ALPHA_I, A3 + STFDUX B7, Y, INCX2 + + fmadd B2, ALPHA , A2, B2 + LFDUX A8, XX, INCX2 + fmul B5, ALPHA, A5 + STFDUX B8, YY, INCX2 + bdnz LL(62) + .align 4 + +LL(63): + fnmsub B3, ALPHA_I, A4, B3 + fmul B6, ALPHA_I, A5 + STFDUX B1, Y, INCX2 + + fmadd B4, ALPHA , A4, B4 + fmul B7, ALPHA, A7 + STFDUX B2, YY, INCX2 + + fnmsub B5, ALPHA_I, A6, B5 + fmul B8, ALPHA_I, A7 + STFDUX B3, Y, INCX2 + + fmadd B6, ALPHA , A6, B6 + STFDUX B4, YY, INCX2 + fnmsub B7, ALPHA_I, A8, B7 + STFDUX B5, Y, INCX2 + fmadd B8, ALPHA , A8, B8 + STFDUX B6, YY, INCX2 + STFDUX B7, Y, INCX2 + STFDUX B8, YY, INCX2 + .align 4 + +LL(65): + andi. r0, N, 3 + beq LL(999) + andi. r0, N, 2 + beq LL(67) + + LFDUX A1, X, INCX2 + LFDUX A2, XX, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, XX, INCX2 + + fmul B1, ALPHA, A1 + fmul B2, ALPHA, A2 + fmul B3, ALPHA, A3 + fmul B4, ALPHA, A4 + + fnmsub B1, ALPHA_I, A2, B1 + fmadd B2, ALPHA_I, A1, B2 + fnmsub B3, ALPHA_I, A4, B3 + fmadd B4, ALPHA_I, A3, B4 + + STFDUX B1, Y, INCX2 + STFDUX B2, YY, INCX2 + STFDUX B3, Y, INCX2 + STFDUX B4, YY, INCX2 + .align 4 + +LL(67): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + LFDUX A2, XX, INCX2 + + fmul B1, ALPHA, A1 + fmul B2, ALPHA, A2 + fnmsub B1, ALPHA_I, A2, B1 + fmadd B2, ALPHA_I, A1, B2 + + STFDUX B1, Y, INCX2 + STFDUX B2, YY, INCX2 + b LL(999) + .align 4 + + +LL(100): + fcmpu cr7, ALPHA, A1 + bne cr7, LL(150) + + fscmp cr7, ALPHA, A1 + bne cr7, LL(150) + + andi. r0, X, 2 * SIZE - 1 + bne LL(120) + + sub X, X, INCX2 + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(115) + .align 4 + +LL(112): + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + bdnz LL(112) + .align 4 + +LL(115): + andi. r0, N, 3 + beq LL(999) + andi. r0, N, 2 + beq LL(117) + + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + .align 4 + +LL(117): + andi. r0, N, 1 + beq LL(999) + + STFPDUX A1, X, INCX2 + b LL(999) + .align 4 + +LL(120): + subi INCX2, INCX2, SIZE + li INCX, SIZE + + sub X, X, INCX2 + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(125) + .align 4 + +LL(122): + STFDUX A1, X, INCX2 + STFDUX A1, X, INCX + STFDUX A1, X, INCX2 + STFDUX A1, X, INCX + STFDUX A1, X, INCX2 + STFDUX A1, X, INCX + STFDUX A1, X, INCX2 + STFDUX A1, X, INCX + bdnz LL(122) + .align 4 + +LL(125): + andi. r0, N, 3 + beq LL(999) + andi. r0, N, 2 + beq LL(127) + + STFDUX A1, X, INCX2 + STFDUX A1, X, INCX + STFDUX A1, X, INCX2 + STFDUX A1, X, INCX + .align 4 + +LL(127): + andi. r0, N, 1 + beq LL(999) + + STFDUX A1, X, INCX2 + STFDUX A1, X, INCX + b LL(999) + .align 4 + +LL(150): + sub Y, X, INCX2 + sub X, X, INCX2 + + andi. r0, X, 2 * SIZE - 1 + bne LL(160) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(155) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + LFPDUX A5, X, INCX2 + fxpmul B1, ALPHA, A1 + LFPDUX A6, X, INCX2 + fxpmul B2, ALPHA, A2 + LFPDUX A7, X, INCX2 + fxpmul B3, ALPHA, A3 + LFPDUX A8, X, INCX2 + fxpmul B4, ALPHA, A4 + fxpmul B5, ALPHA, A5 + + fxcxnpma B1, ALPHA, A1, B1 + fxcxnpma B2, ALPHA, A2, B2 + bdz LL(153) + .align 4 + +LL(152): + fxcxnpma B3, ALPHA, A3, B3 + LFPDUX A1, X, INCX2 + fxpmul B6, ALPHA, A6 + STFPDUX B1, Y, INCX2 + + fxcxnpma B4, ALPHA, A4, B4 + LFPDUX A2, X, INCX2 + fxpmul B7, ALPHA, A7 + STFPDUX B2, Y, INCX2 + + fxcxnpma B5, ALPHA, A5, B5 + LFPDUX A3, X, INCX2 + fxpmul B8, ALPHA, A8 + STFPDUX B3, Y, INCX2 + + fxcxnpma B6, ALPHA, A6, B6 + LFPDUX A4, X, INCX2 + fxpmul B1, ALPHA, A1 + STFPDUX B4, Y, INCX2 + + fxcxnpma B7, ALPHA, A7, B7 + LFPDUX A5, X, INCX2 + fxpmul B2, ALPHA, A2 + STFPDUX B5, Y, INCX2 + + fxcxnpma B8, ALPHA, A8, B8 + LFPDUX A6, X, INCX2 + fxpmul B3, ALPHA, A3 + STFPDUX B6, Y, INCX2 + + fxcxnpma B1, ALPHA, A1, B1 + LFPDUX A7, X, INCX2 + fxpmul B4, ALPHA, A4 + STFPDUX B7, Y, INCX2 + + fxcxnpma B2, ALPHA, A2, B2 + LFPDUX A8, X, INCX2 + fxpmul B5, ALPHA, A5 + STFPDUX B8, Y, INCX2 + bdnz LL(152) + .align 4 + +LL(153): + fxcxnpma B3, ALPHA, A3, B3 + fxpmul B6, ALPHA, A6 + STFPDUX B1, Y, INCX2 + + fxcxnpma B4, ALPHA, A4, B4 + fxpmul B7, ALPHA, A7 + STFPDUX B2, Y, INCX2 + + fxcxnpma B5, ALPHA, A5, B5 + fxpmul B8, ALPHA, A8 + STFPDUX B3, Y, INCX2 + + fxcxnpma B6, ALPHA, A6, B6 + STFPDUX B4, Y, INCX2 + fxcxnpma B7, ALPHA, A7, B7 + STFPDUX B5, Y, INCX2 + fxcxnpma B8, ALPHA, A8, B8 + STFPDUX B6, Y, INCX2 + STFPDUX B7, Y, INCX2 + STFPDUX B8, Y, INCX2 + .align 4 + +LL(155): + andi. r0, N, 7 + beq LL(999) + + andi. r0, N, 4 + beq LL(156) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fxpmul B1, ALPHA, A1 + fxpmul B2, ALPHA, A2 + fxpmul B3, ALPHA, A3 + fxpmul B4, ALPHA, A4 + + fxcxnpma B1, ALPHA, A1, B1 + fxcxnpma B2, ALPHA, A2, B2 + fxcxnpma B3, ALPHA, A3, B3 + fxcxnpma B4, ALPHA, A4, B4 + + STFPDUX B1, Y, INCX2 + STFPDUX B2, Y, INCX2 + STFPDUX B3, Y, INCX2 + STFPDUX B4, Y, INCX2 + .align 4 + +LL(156): + andi. r0, N, 2 + beq LL(157) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + fxpmul B1, ALPHA, A1 + fxpmul B2, ALPHA, A2 + + fxcxnpma B1, ALPHA, A1, B1 + fxcxnpma B2, ALPHA, A2, B2 + + STFPDUX B1, Y, INCX2 + STFPDUX B2, Y, INCX2 + .align 4 + +LL(157): + andi. r0, N, 1 + beq LL(999) + + LFPDUX A1, X, INCX2 + + fxpmul B1, ALPHA, A1 + fxcxnpma B1, ALPHA, A1, B1 + + STFPDUX B1, Y, INCX2 + b LL(999) + .align 4 + +LL(160): + addi XX, X, SIZE + addi YY, Y, SIZE + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(165) + + LFDUX A1, X, INCX2 + LFDUX A2, XX, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, XX, INCX2 + + LFDUX A5, X, INCX2 + fmul B1, ALPHA, A1 + LFDUX A6, XX, INCX2 + fmul B2, ALPHA_I, A1 + LFDUX A7, X, INCX2 + fmul B3, ALPHA, A3 + LFDUX A8, XX, INCX2 + fmul B4, ALPHA_I, A3 + + fmul B5, ALPHA, A5 + fnmsub B1, ALPHA_I, A2, B1 + fmadd B2, ALPHA , A2, B2 + bdz LL(163) + + .align 4 + +LL(162): + fnmsub B3, ALPHA_I, A4, B3 + LFDUX A1, X, INCX2 + fmul B6, ALPHA_I, A5 + STFDUX B1, Y, INCX2 + + fmadd B4, ALPHA , A4, B4 + LFDUX A2, XX, INCX2 + fmul B7, ALPHA, A7 + STFDUX B2, YY, INCX2 + + fnmsub B5, ALPHA_I, A6, B5 + LFDUX A3, X, INCX2 + fmul B8, ALPHA_I, A7 + STFDUX B3, Y, INCX2 + + fmadd B6, ALPHA , A6, B6 + LFDUX A4, XX, INCX2 + fmul B1, ALPHA, A1 + STFDUX B4, YY, INCX2 + + fnmsub B7, ALPHA_I, A8, B7 + LFDUX A5, X, INCX2 + fmul B2, ALPHA_I, A1 + STFDUX B5, Y, INCX2 + + fmadd B8, ALPHA , A8, B8 + LFDUX A6, XX, INCX2 + fmul B3, ALPHA, A3 + STFDUX B6, YY, INCX2 + + fnmsub B1, ALPHA_I, A2, B1 + LFDUX A7, X, INCX2 + fmul B4, ALPHA_I, A3 + STFDUX B7, Y, INCX2 + + fmadd B2, ALPHA , A2, B2 + LFDUX A8, XX, INCX2 + fmul B5, ALPHA, A5 + STFDUX B8, YY, INCX2 + bdnz LL(162) + .align 4 + +LL(163): + fnmsub B3, ALPHA_I, A4, B3 + fmul B6, ALPHA_I, A5 + STFDUX B1, Y, INCX2 + + fmadd B4, ALPHA , A4, B4 + fmul B7, ALPHA, A7 + STFDUX B2, YY, INCX2 + + fnmsub B5, ALPHA_I, A6, B5 + fmul B8, ALPHA_I, A7 + STFDUX B3, Y, INCX2 + + fmadd B6, ALPHA , A6, B6 + STFDUX B4, YY, INCX2 + fnmsub B7, ALPHA_I, A8, B7 + STFDUX B5, Y, INCX2 + fmadd B8, ALPHA , A8, B8 + STFDUX B6, YY, INCX2 + STFDUX B7, Y, INCX2 + STFDUX B8, YY, INCX2 + .align 4 + +LL(165): + andi. r0, N, 3 + beq LL(999) + andi. r0, N, 2 + beq LL(167) + + LFDUX A1, X, INCX2 + LFDUX A2, XX, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, XX, INCX2 + + fmul B1, ALPHA, A1 + fmul B2, ALPHA, A2 + fmul B3, ALPHA, A3 + fmul B4, ALPHA, A4 + + fnmsub B1, ALPHA_I, A2, B1 + fmadd B2, ALPHA_I, A1, B2 + fnmsub B3, ALPHA_I, A4, B3 + fmadd B4, ALPHA_I, A3, B4 + + STFDUX B1, Y, INCX2 + STFDUX B2, YY, INCX2 + STFDUX B3, Y, INCX2 + STFDUX B4, YY, INCX2 + .align 4 + +LL(167): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + LFDUX A2, XX, INCX2 + + fmul B1, ALPHA, A1 + fmul B2, ALPHA, A2 + fnmsub B1, ALPHA_I, A2, B1 + fmadd B2, ALPHA_I, A1, B2 + + STFDUX B1, Y, INCX2 + STFDUX B2, YY, INCX2 + .align 4 + +LL(999): + li r10, 16 + + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/zscal_ppc440.S b/kernel/power/zscal_ppc440.S new file mode 100644 index 0000000..9f120ac --- /dev/null +++ b/kernel/power/zscal_ppc440.S @@ -0,0 +1,276 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define XX r4 +#define PRE r5 + +#ifdef linux +#ifndef __64BIT__ +#define X r6 +#define INCX r7 +#else +#define X r8 +#define INCX r9 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define X r10 +#define INCX r8 +#else +#define X r8 +#define INCX r9 +#endif +#endif + +#define INC1 r11 + +#define FZERO f0 +#define ALPHA_R f1 +#define ALPHA_I f2 + + PROLOGUE + PROFCODE + + addi SP, SP, -8 + li r0, 0 + + stw r0, 0(SP) + lfs FZERO, 0(SP) + addi SP, SP, 8 + +#if (defined(_AIX) || defined(__APPLE__)) && !defined(__64BIT__) && defined(DOUBLE) + lwz INCX, 56(SP) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + li INC1, SIZE + sub X, X, INCX + li PRE, 3 * 16 * SIZE + + cmpwi cr0, N, 0 + blelr- cr0 + + fcmpu cr0, FZERO, ALPHA_R + bne- cr0, LL(A1I1) + + fcmpu cr0, FZERO, ALPHA_I + bne- cr0, LL(A1I1) + +LL(A0IN): + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(A0IN_Remain) + .align 4 + +LL(A0IN_Kernel): +#ifdef PPCG4 + dcbtst X, PRE +#endif + STFDUX FZERO, X, INCX + STFDX FZERO, X, INC1 + STFDUX FZERO, X, INCX + STFDX FZERO, X, INC1 +#if defined(PPCG4) && defined(DOUBLE) + dcbtst X, PRE +#endif + STFDUX FZERO, X, INCX + STFDX FZERO, X, INC1 + STFDUX FZERO, X, INCX + STFDX FZERO, X, INC1 +#ifdef PPCG4 + dcbtst X, PRE +#endif + STFDUX FZERO, X, INCX + STFDX FZERO, X, INC1 + STFDUX FZERO, X, INCX + STFDX FZERO, X, INC1 +#if defined(PPCG4) && defined(DOUBLE) + dcbtst X, PRE +#endif + STFDUX FZERO, X, INCX + STFDX FZERO, X, INC1 + STFDUX FZERO, X, INCX + STFDX FZERO, X, INC1 + bdnz LL(A0IN_Kernel) + .align 4 + +LL(A0IN_Remain): + andi. r0, N, 7 + mtspr CTR, r0 + beqlr+ + .align 4 + +LL(A0IN_RemainKernel): + STFDUX FZERO, X, INCX + STFDX FZERO, X, INC1 + bdnz LL(A0IN_RemainKernel) + blr + .align 4 + +LL(A1I1): + mr XX, X + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(15) + + LFDUX f0, X, INCX + LFDX f3, X, INC1 + LFDUX f4, X, INCX + LFDX f5, X, INC1 + + LFDUX f6, X, INCX + FMUL f10, ALPHA_R, f0 + LFDX f7, X, INC1 + FMUL f11, ALPHA_R, f3 + LFDUX f8, X, INCX + FMUL f12, ALPHA_R, f4 + FMUL f13, ALPHA_R, f5 + bdz LL(13) + .align 4 + +LL(12): +#ifdef PPCG4 + dcbtst X, PRE +#endif + + FNMSUB f10, ALPHA_I, f3, f10 + LFDX f9, X, INC1 + FMADD f11, ALPHA_I, f0, f11 + LFDUX f0, X, INCX + FNMSUB f12, ALPHA_I, f5, f12 + LFDX f3, X, INC1 + FMADD f13, ALPHA_I, f4, f13 + LFDUX f4, X, INCX + +#if defined(PPCG4) && defined(DOUBLE) + dcbtst X, PRE +#endif + + STFDUX f10, XX, INCX + FMUL f10, ALPHA_R, f6 + STFDX f11, XX, INC1 + FMUL f11, ALPHA_R, f7 + STFDUX f12, XX, INCX + FMUL f12, ALPHA_R, f8 + STFDX f13, XX, INC1 + FMUL f13, ALPHA_R, f9 + +#ifdef PPCG4 + dcbtst X, PRE +#endif + + FNMSUB f10, ALPHA_I, f7, f10 + LFDX f5, X, INC1 + FMADD f11, ALPHA_I, f6, f11 + LFDUX f6, X, INCX + FNMSUB f12, ALPHA_I, f9, f12 + LFDX f7, X, INC1 + FMADD f13, ALPHA_I, f8, f13 + LFDUX f8, X, INCX + +#if defined(PPCG4) && defined(DOUBLE) + dcbtst X, PRE +#endif + + STFDUX f10, XX, INCX + FMUL f10, ALPHA_R, f0 + STFDX f11, XX, INC1 + FMUL f11, ALPHA_R, f3 + STFDUX f12, XX, INCX + FMUL f12, ALPHA_R, f4 + STFDX f13, XX, INC1 + FMUL f13, ALPHA_R, f5 + bdnz LL(12) + .align 4 + +LL(13): + FNMSUB f10, ALPHA_I, f3, f10 + LFDX f9, X, INC1 + FMADD f11, ALPHA_I, f0, f11 + FNMSUB f12, ALPHA_I, f5, f12 + FMADD f13, ALPHA_I, f4, f13 + + STFDUX f10, XX, INCX + FMUL f10, ALPHA_R, f6 + STFDX f11, XX, INC1 + FMUL f11, ALPHA_R, f7 + STFDUX f12, XX, INCX + FMUL f12, ALPHA_R, f8 + STFDX f13, XX, INC1 + FMUL f13, ALPHA_R, f9 + + FNMSUB f10, ALPHA_I, f7, f10 + FMADD f11, ALPHA_I, f6, f11 + FNMSUB f12, ALPHA_I, f9, f12 + FMADD f13, ALPHA_I, f8, f13 + + STFDUX f10, XX, INCX + STFDX f11, XX, INC1 + STFDUX f12, XX, INCX + STFDX f13, XX, INC1 + .align 4 + +LL(15): + andi. r0, N, 3 + mtspr CTR, r0 + beqlr+ + .align 4 + +LL(A1IN_RemainKernel): + LFDUX f3, X, INCX + LFDX f4, X, INC1 + + FMUL f5, ALPHA_R, f3 + FMUL f6, ALPHA_R, f4 + + FNMSUB f5, ALPHA_I, f4, f5 + FMADD f6, ALPHA_I, f3, f6 + + STFDUX f5, XX, INCX + STFDX f6, XX, INC1 + bdnz LL(A1IN_RemainKernel) + blr + + EPILOGUE diff --git a/kernel/power/zswap.S b/kernel/power/zswap.S new file mode 100644 index 0000000..4c23c1d --- /dev/null +++ b/kernel/power/zswap.S @@ -0,0 +1,414 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define N r3 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 +#define PREA r4 +#define XX r5 +#define YY r10 +#else +#define N r3 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r4 +#define PREA r5 +#define XX r6 +#define YY r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define N r3 +#define X r10 +#define INCX r4 +#define Y r5 +#define INCY r6 +#define PREA r7 +#define XX r8 +#define YY r9 +#else +#define N r3 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r4 +#define PREA r5 +#define XX r6 +#define YY r7 +#endif +#endif + +#define INCXM1 r11 +#define INCYM1 r12 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#if defined(linux) && defined(__64BIT__) + ld INCY, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld INCY, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz INCX, 56 + STACKSIZE(SP) + lwz Y, 60 + STACKSIZE(SP) + lwz INCY, 64 + STACKSIZE(SP) +#else + lwz INCY, 56 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + subi INCXM1, INCX, SIZE + subi INCYM1, INCY, SIZE + +#ifdef L1_DUALFETCH + li PREA, (L1_PREFETCHSIZE) / 2 +#else + li PREA, (L1_PREFETCHSIZE) +#endif + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(100) + cmpwi cr0, INCY, 2 * SIZE + bne- cr0, LL(100) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + +LL(10): + LFD f0, 0 * SIZE(X) + LFD f1, 1 * SIZE(X) + LFD f2, 2 * SIZE(X) + LFD f3, 3 * SIZE(X) + + LFD f16, 0 * SIZE(Y) + LFD f17, 1 * SIZE(Y) + LFD f18, 2 * SIZE(Y) + LFD f19, 3 * SIZE(Y) + + LFD f4, 4 * SIZE(X) + LFD f5, 5 * SIZE(X) + LFD f6, 6 * SIZE(X) + LFD f7, 7 * SIZE(X) + + LFD f20, 4 * SIZE(Y) + LFD f21, 5 * SIZE(Y) + LFD f22, 6 * SIZE(Y) + LFD f23, 7 * SIZE(Y) + + LFD f8, 8 * SIZE(X) + LFD f9, 9 * SIZE(X) + LFD f10, 10 * SIZE(X) + LFD f11, 11 * SIZE(X) + + LFD f24, 8 * SIZE(Y) + LFD f25, 9 * SIZE(Y) + LFD f26, 10 * SIZE(Y) + LFD f27, 11 * SIZE(Y) + + LFD f12, 12 * SIZE(X) + LFD f13, 13 * SIZE(X) + LFD f14, 14 * SIZE(X) + LFD f15, 15 * SIZE(X) + + LFD f28, 12 * SIZE(Y) + LFD f29, 13 * SIZE(Y) + LFD f30, 14 * SIZE(Y) + LFD f31, 15 * SIZE(Y) + + STFD f16, 0 * SIZE(X) + STFD f17, 1 * SIZE(X) + STFD f18, 2 * SIZE(X) + STFD f19, 3 * SIZE(X) + + STFD f0, 0 * SIZE(Y) + STFD f1, 1 * SIZE(Y) + STFD f2, 2 * SIZE(Y) + STFD f3, 3 * SIZE(Y) + + STFD f20, 4 * SIZE(X) + STFD f21, 5 * SIZE(X) + STFD f22, 6 * SIZE(X) + STFD f23, 7 * SIZE(X) + + STFD f4, 4 * SIZE(Y) + STFD f5, 5 * SIZE(Y) + STFD f6, 6 * SIZE(Y) + STFD f7, 7 * SIZE(Y) + + STFD f24, 8 * SIZE(X) + STFD f25, 9 * SIZE(X) + STFD f26, 10 * SIZE(X) + STFD f27, 11 * SIZE(X) + + STFD f8, 8 * SIZE(Y) + STFD f9, 9 * SIZE(Y) + STFD f10, 10 * SIZE(Y) + STFD f11, 11 * SIZE(Y) + + STFD f28, 12 * SIZE(X) + STFD f29, 13 * SIZE(X) + STFD f30, 14 * SIZE(X) + STFD f31, 15 * SIZE(X) + + STFD f12, 12 * SIZE(Y) + STFD f13, 13 * SIZE(Y) + STFD f14, 14 * SIZE(Y) + STFD f15, 15 * SIZE(Y) + + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + dcbtst X, PREA +#ifdef L1_DUALFETCH + dcbtst Y, PREA +#endif + bdnz LL(10) + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f0, 0 * SIZE(X) + LFD f1, 1 * SIZE(X) + LFD f2, 0 * SIZE(Y) + LFD f3, 1 * SIZE(Y) + + STFD f2, 0 * SIZE(X) + STFD f3, 1 * SIZE(X) + STFD f0, 0 * SIZE(Y) + STFD f1, 1 * SIZE(Y) + + addi X, X, 2 * SIZE + addi Y, Y, 2 * SIZE + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCXM1 + sub Y, Y, INCYM1 + + mr XX, X + mr YY, Y + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + .align 4 + +LL(110): + LFDX f0, X, INCXM1 + LFDUX f1, X, INCX + LFDX f2, X, INCXM1 + LFDUX f3, X, INCX + + LFDX f16, Y, INCYM1 + LFDUX f17, Y, INCY + LFDX f18, Y, INCYM1 + LFDUX f19, Y, INCY + + LFDX f4, X, INCXM1 + LFDUX f5, X, INCX + LFDX f6, X, INCXM1 + LFDUX f7, X, INCX + + LFDX f20, Y, INCYM1 + LFDUX f21, Y, INCY + LFDX f22, Y, INCYM1 + LFDUX f23, Y, INCY + + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + LFDX f10, X, INCXM1 + LFDUX f11, X, INCX + + LFDX f24, Y, INCYM1 + LFDUX f25, Y, INCY + LFDX f26, Y, INCYM1 + LFDUX f27, Y, INCY + + LFDX f12, X, INCXM1 + LFDUX f13, X, INCX + LFDX f14, X, INCXM1 + LFDUX f15, X, INCX + + LFDX f28, Y, INCYM1 + LFDUX f29, Y, INCY + LFDX f30, Y, INCYM1 + LFDUX f31, Y, INCY + + STFDX f16, XX, INCXM1 + STFDUX f17, XX, INCX + STFDX f18, XX, INCXM1 + STFDUX f19, XX, INCX + + STFDX f0, YY, INCYM1 + STFDUX f1, YY, INCY + STFDX f2, YY, INCYM1 + STFDUX f3, YY, INCY + + STFDX f20, XX, INCXM1 + STFDUX f21, XX, INCX + STFDX f22, XX, INCXM1 + STFDUX f23, XX, INCX + + STFDX f4, YY, INCYM1 + STFDUX f5, YY, INCY + STFDX f6, YY, INCYM1 + STFDUX f7, YY, INCY + + STFDX f24, XX, INCXM1 + STFDUX f25, XX, INCX + STFDX f26, XX, INCXM1 + STFDUX f27, XX, INCX + + STFDX f8, YY, INCYM1 + STFDUX f9, YY, INCY + STFDX f10, YY, INCYM1 + STFDUX f11, YY, INCY + + STFDX f28, XX, INCXM1 + STFDUX f29, XX, INCX + STFDX f30, XX, INCXM1 + STFDUX f31, XX, INCX + + STFDX f12, YY, INCYM1 + STFDUX f13, YY, INCY + STFDX f14, YY, INCYM1 + STFDUX f15, YY, INCY + bdnz LL(110) + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f0, X, INCXM1 + LFDUX f1, X, INCX + LFDX f2, Y, INCYM1 + LFDUX f3, Y, INCY + STFDX f2, XX, INCXM1 + STFDUX f3, XX, INCX + STFDX f0, YY, INCYM1 + STFDUX f1, YY, INCY + bdnz LL(160) + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zswap_hummer.S b/kernel/power/zswap_hummer.S new file mode 100644 index 0000000..335eaa1 --- /dev/null +++ b/kernel/power/zswap_hummer.S @@ -0,0 +1,665 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 + +#define INCX2 r4 +#define INCY2 r5 +#define X2 r10 +#define Y2 r11 + +#define A1 f0 +#define A2 f1 +#define A3 f2 +#define A4 f3 +#define A5 f4 + +#define B1 f5 +#define B2 f6 +#define B3 f7 +#define B4 f8 +#define B5 f9 + +#define T1 f10 +#define T2 f11 +#define T3 f12 +#define T4 f13 +#define T5 f14 +#define T6 f15 +#define T7 f16 + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + stfpdux f16, SP, r10 + + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + add INCX2, INCX, INCX + add INCY2, INCY, INCY + + cmpwi cr0, N, 0 + ble LL(999) + + cmpwi cr0, INCX, SIZE + bne LL(100) + cmpwi cr0, INCY, SIZE + bne LL(100) + + sub X, X, INCX2 + sub Y, Y, INCY2 + + mr X2, X + mr Y2, Y + + andi. r0, X, 2 * SIZE - 1 + bne LL(30) + andi. r0, Y, 2 * SIZE - 1 + bne LL(20) + .align 4 + +LL(10): /* X : aligned Y : aligned */ + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + bdz LL(13) + .align 4 + +LL(12): + STFPDUX B1, X2, INCY2 + LFPDUX B1, Y, INCY2 + STFPDUX A1, Y2, INCY2 + LFPDUX A1, X, INCX2 + + STFPDUX B2, X2, INCY2 + LFPDUX B2, Y, INCY2 + STFPDUX A2, Y2, INCY2 + LFPDUX A2, X, INCX2 + + STFPDUX B3, X2, INCY2 + LFPDUX B3, Y, INCY2 + STFPDUX A3, Y2, INCY2 + LFPDUX A3, X, INCX2 + + STFPDUX B4, X2, INCY2 + LFPDUX B4, Y, INCY2 + STFPDUX A4, Y2, INCY2 + LFPDUX A4, X, INCX2 + bdnz LL(12) + .align 4 + +LL(13): + STFPDUX B1, X2, INCY2 + STFPDUX A1, Y2, INCY2 + STFPDUX B2, X2, INCY2 + STFPDUX A2, Y2, INCY2 + STFPDUX B3, X2, INCY2 + STFPDUX A3, Y2, INCY2 + STFPDUX B4, X2, INCY2 + STFPDUX A4, Y2, INCY2 + .align 4 + +LL(15): + andi. r0, N, 3 + beq LL(999) + + andi. r0, N, 2 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + + STFPDUX B1, X2, INCY2 + STFPDUX A1, Y2, INCY2 + STFPDUX B2, X2, INCY2 + STFPDUX A2, Y2, INCY2 + .align 4 + +LL(16): + andi. r0, N, 1 + beq LL(999) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + + STFPDUX B1, X2, INCY2 + STFPDUX A1, Y2, INCY2 + b LL(999) + .align 4 + +LL(20): /* X : aligned Y : unaligned */ + + LFXDUX A1, X, INCX2 + LFDX B1, Y, INCY2 + + STFSDX A1, Y2, INCY2 + + add Y, Y, INCY + add Y2, Y2, INCY + + addi N, N, -1 + cmpwi cr0, N, 0 + ble LL(29) + .align 4 + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(25) + + LFXDUX T1, X, INCX2 + LFXDUX T2, Y, INCY2 + LFXDUX T3, X, INCX2 + LFXDUX T4, Y, INCY2 + + LFPDUX A4, X, INCX2 + fsmr A1, T1 + LFPDUX B4, Y, INCY2 + fsmr B1, T2 + LFPDUX A5, X, INCX2 + fsmr T1, T3 + LFPDUX B5, Y, INCY2 + fsmr T2, T4 + bdz LL(23) + .align 4 + +LL(22): + fxmr T5, A4 + STFPDUX A1, Y2, INCY2 + fxmr T6, B4 + STFPDUX B1, X2, INCX2 + fxmr A1, A5 + STFPDUX T1, Y2, INCY2 + fxmr B1, B5 + STFPDUX T2, X2, INCX2 + + fsmr T3, T5 + LFPDUX A2, X, INCX2 + fsmr T4, T6 + LFPDUX B2, Y, INCY2 + fsmr T5, A1 + LFPDUX A3, X, INCX2 + fsmr T6, B1 + LFPDUX B3, Y, INCY2 + + fxmr T1, A2 + STFPDUX T3, Y2, INCY2 + fxmr T2, B2 + STFPDUX T4, X2, INCX2 + fxmr T3, A3 + STFPDUX T5, Y2, INCY2 + fxmr T4, B3 + STFPDUX T6, X2, INCX2 + + fsmr A1, T1 + LFPDUX A4, X, INCX2 + fsmr B1, T2 + LFPDUX B4, Y, INCY2 + fsmr T1, T3 + LFPDUX A5, X, INCX2 + fsmr T2, T4 + LFPDUX B5, Y, INCY2 + bdnz LL(22) + .align 4 + +LL(23): + fxmr T5, A4 + STFPDUX A1, Y2, INCY2 + fxmr T6, B4 + STFPDUX B1, X2, INCX2 + fxmr A1, A5 + STFPDUX T1, Y2, INCY2 + fxmr B1, B5 + STFPDUX T2, X2, INCX2 + + fsmr T3, T5 + fsmr T4, T6 + fsmr T5, A1 + fsmr T6, B1 + + STFPDUX T3, Y2, INCY2 + STFPDUX T4, X2, INCX2 + STFPDUX T5, Y2, INCY2 + STFPDUX T6, X2, INCX2 + .align 4 + +LL(25): + andi. r0, N, 3 + beq LL(29) + + andi. r0, N, 2 + beq LL(27) + + LFXDUX A2, X, INCX2 + LFXDUX B2, Y, INCY2 + LFXDUX A3, X, INCX2 + LFXDUX B3, Y, INCY2 + + fsmr A1, A2 + fsmr B1, B2 + fsmr A2, A3 + fsmr B2, B3 + + STFPDUX A1, Y2, INCY2 + STFPDUX B1, X2, INCX2 + STFPDUX A2, Y2, INCY2 + fpmr A1, A3 + STFPDUX B2, X2, INCX2 + fpmr B1, B3 + .align 4 + +LL(27): + andi. r0, N, 1 + beq LL(29) + + LFXDUX A2, X, INCX2 + LFXDUX B2, Y, INCY2 + fsmr A1, A2 + fsmr B1, B2 + STFPDUX A1, Y2, INCY2 + fpmr A1, A2 + STFPDUX B1, X2, INCX2 + fpmr B1, B2 + .align 4 + +LL(29): + LFSDX B1, Y, INCY2 + STFDX A1, Y2, INCY2 + STFPDX B1, X2, INCX2 + b LL(999) + .align 4 + + +LL(30): /* X : unaligned Y : aligned */ + + andi. r0, Y, 2 * SIZE - 1 + bne LL(40) + + LFXDUX A1, Y, INCY2 + LFDX B1, X, INCX2 + + STFSDX A1, X2, INCX2 + + add X, X, INCX + add X2, X2, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + ble LL(39) + .align 4 + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(35) + + LFXDUX T1, Y, INCY2 + LFXDUX T2, X, INCX2 + LFXDUX T3, Y, INCY2 + LFXDUX T4, X, INCX2 + + LFPDUX A4, Y, INCY2 + fsmr A1, T1 + LFPDUX B4, X, INCX2 + fsmr B1, T2 + LFPDUX A5, Y, INCY2 + fsmr T1, T3 + LFPDUX B5, X, INCX2 + fsmr T2, T4 + bdz LL(33) + .align 4 + +LL(32): + fxmr T5, A4 + STFPDUX A1, X2, INCX2 + fxmr T6, B4 + STFPDUX B1, Y2, INCY2 + fxmr A1, A5 + STFPDUX T1, X2, INCX2 + fxmr B1, B5 + STFPDUX T2, Y2, INCY2 + + fsmr T3, T5 + LFPDUX A2, Y, INCY2 + fsmr T4, T6 + LFPDUX B2, X, INCX2 + fsmr T5, A1 + LFPDUX A3, Y, INCY2 + fsmr T6, B1 + LFPDUX B3, X, INCX2 + + fxmr T1, A2 + STFPDUX T3, X2, INCX2 + fxmr T2, B2 + STFPDUX T4, Y2, INCY2 + fxmr T3, A3 + STFPDUX T5, X2, INCX2 + fxmr T4, B3 + STFPDUX T6, Y2, INCY2 + + fsmr A1, T1 + LFPDUX A4, Y, INCY2 + fsmr B1, T2 + LFPDUX B4, X, INCX2 + fsmr T1, T3 + LFPDUX A5, Y, INCY2 + fsmr T2, T4 + LFPDUX B5, X, INCX2 + bdnz LL(32) + .align 4 + +LL(33): + fxmr T5, A4 + STFPDUX A1, X2, INCX2 + fxmr T6, B4 + STFPDUX B1, Y2, INCY2 + fxmr A1, A5 + STFPDUX T1, X2, INCX2 + fxmr B1, B5 + STFPDUX T2, Y2, INCY2 + + fsmr T3, T5 + fsmr T4, T6 + fsmr T5, A1 + fsmr T6, B1 + + STFPDUX T3, X2, INCX2 + STFPDUX T4, Y2, INCY2 + STFPDUX T5, X2, INCX2 + STFPDUX T6, Y2, INCY2 + .align 4 + +LL(35): + andi. r0, N, 3 + beq LL(39) + + andi. r0, N, 2 + beq LL(37) + + LFXDUX A2, Y, INCY2 + LFXDUX B2, X, INCX2 + LFXDUX A3, Y, INCY2 + LFXDUX B3, X, INCX2 + + fsmr A1, A2 + fsmr B1, B2 + fsmr A2, A3 + fsmr B2, B3 + + STFPDUX A1, X2, INCX2 + STFPDUX B1, Y2, INCY2 + STFPDUX A2, X2, INCX2 + fpmr A1, A3 + STFPDUX B2, Y2, INCY2 + fpmr B1, B3 + .align 4 + +LL(37): + andi. r0, N, 1 + beq LL(39) + + LFXDUX A2, Y, INCY2 + LFXDUX B2, X, INCX2 + fsmr A1, A2 + fsmr B1, B2 + STFPDUX A1, X2, INCX2 + fpmr A1, A2 + STFPDUX B1, Y2, INCY2 + fpmr B1, B2 + .align 4 + +LL(39): + LFSDX B1, X, INCX2 + STFDX A1, X2, INCX2 + STFPDX B1, Y2, INCY2 + b LL(999) + .align 4 + +LL(40): /* X : unaligned Y : unaligned */ + + LFDX A1, Y, INCY2 + LFDX B1, X, INCX2 + add X, X, INCX + add Y, Y, INCY + + addi N, N, -1 + cmpwi cr0, N, 0 + + STFDX A1, X2, INCX2 + STFDX B1, Y2, INCY2 + add X2, X2, INCX + add Y2, Y2, INCY + ble LL(49) + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(45) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + bdz LL(43) + .align 4 + +LL(42): + STFPDUX B1, X2, INCY2 + LFPDUX B1, Y, INCY2 + STFPDUX A1, Y2, INCY2 + LFPDUX A1, X, INCX2 + + STFPDUX B2, X2, INCY2 + LFPDUX B2, Y, INCY2 + STFPDUX A2, Y2, INCY2 + LFPDUX A2, X, INCX2 + + STFPDUX B3, X2, INCY2 + LFPDUX B3, Y, INCY2 + STFPDUX A3, Y2, INCY2 + LFPDUX A3, X, INCX2 + + STFPDUX B4, X2, INCY2 + LFPDUX B4, Y, INCY2 + STFPDUX A4, Y2, INCY2 + LFPDUX A4, X, INCX2 + bdnz LL(42) + .align 4 + +LL(43): + STFPDUX B1, X2, INCY2 + STFPDUX A1, Y2, INCY2 + STFPDUX B2, X2, INCY2 + STFPDUX A2, Y2, INCY2 + STFPDUX B3, X2, INCY2 + STFPDUX A3, Y2, INCY2 + STFPDUX B4, X2, INCY2 + STFPDUX A4, Y2, INCY2 + .align 4 + +LL(45): + andi. r0, N, 3 + beq LL(49) + + andi. r0, N, 2 + beq LL(46) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + + STFPDUX B1, X2, INCY2 + STFPDUX A1, Y2, INCY2 + STFPDUX B2, X2, INCY2 + STFPDUX A2, Y2, INCY2 + .align 4 + +LL(46): + andi. r0, N, 1 + beq LL(49) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + + STFPDUX B1, X2, INCY2 + STFPDUX A1, Y2, INCY2 + .align 4 + +LL(49): + LFDX A1, Y, INCY2 + LFDX B1, X, INCX2 + STFDX A1, X2, INCX2 + STFDX B1, Y2, INCY2 + b LL(999) + .align 4 + +LL(100): + subi INCX2, INCX2, SIZE + subi INCY2, INCY2, SIZE + + li INCX, SIZE + li INCY, SIZE + + sub X, X, INCX2 + sub Y, Y, INCY2 + + mr X2, X + mr Y2, Y + + srawi. r0, N, 1 + mtspr CTR, r0 + beq- LL(115) + + LFDUX A1, X, INCX2 + LFDUX B1, Y, INCY2 + LFDUX A2, X, INCX + LFDUX B2, Y, INCY + LFDUX A3, X, INCX2 + LFDUX B3, Y, INCY2 + LFDUX A4, X, INCX + LFDUX B4, Y, INCY + bdz LL(113) + .align 4 + +LL(112): + STFDUX B1, X2, INCX2 + LFDUX B1, Y, INCY2 + STFDUX A1, Y2, INCY2 + LFDUX A1, X, INCX2 + + STFDUX B2, X2, INCX + LFDUX B2, Y, INCY + STFDUX A2, Y2, INCY + LFDUX A2, X, INCX + + STFDUX B3, X2, INCX2 + LFDUX B3, Y, INCY2 + STFDUX A3, Y2, INCY2 + LFDUX A3, X, INCX2 + + STFDUX B4, X2, INCX + LFDUX B4, Y, INCY + STFDUX A4, Y2, INCY + LFDUX A4, X, INCX + bdnz LL(112) + .align 4 + +LL(113): + STFDUX B1, X2, INCX2 + STFDUX A1, Y2, INCY2 + STFDUX B2, X2, INCX + STFDUX A2, Y2, INCY + + STFDUX B3, X2, INCX2 + STFDUX A3, Y2, INCY2 + STFDUX B4, X2, INCX + STFDUX A4, Y2, INCY + .align 4 + +LL(115): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + LFDUX B1, Y, INCY2 + LFDUX B2, Y, INCY + + STFDUX B1, X2, INCX2 + STFDUX B2, X2, INCX + STFDUX A1, Y2, INCY2 + STFDUX A2, Y2, INCY + .align 4 + +LL(999): + li r10, 16 + addi SP, SP, -16 + + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/zsymv_L.S b/kernel/power/zsymv_L.S new file mode 100644 index 0000000..0dca84d --- /dev/null +++ b/kernel/power/zsymv_L.S @@ -0,0 +1,1673 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define X r7 +#define INCX r8 +#define Y r9 +#define INCY r10 +#define BUFFER r14 +#else +#define M r3 +#define N r4 +#define A r7 +#define LDA r8 +#define X r9 +#define INCX r10 +#define Y r5 +#define INCY r6 +#define BUFFER r14 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define M r3 +#define N r4 +#define A r9 +#define LDA r10 +#define X r5 +#define INCX r6 +#define Y r7 +#define INCY r8 +#define BUFFER r14 +#else +#define M r3 +#define N r4 +#define A r7 +#define LDA r8 +#define X r9 +#define INCX r10 +#define Y r5 +#define INCY r6 +#define BUFFER r14 +#endif +#endif + +#define I r11 +#define J r12 + +#define AO1 r15 +#define AO2 r16 +#define AO3 r17 +#define AO4 r18 +#define XX r19 +#define YY r20 +#define NEW_Y r21 +#define TEMP r22 +#define PREA r24 +#define IS r25 + +#define y01 f0 +#define y02 f1 +#define y03 f2 +#define y04 f3 +#define y05 f4 +#define y06 f5 +#define y07 f6 +#define y08 f7 + +#define xtemp1 f8 +#define xtemp2 f9 +#define xtemp3 f10 +#define xtemp4 f11 +#define xtemp5 f12 +#define xtemp6 f13 +#define xtemp7 f14 +#define xtemp8 f15 + +#define atemp1 f16 +#define atemp2 f17 +#define atemp3 f18 +#define atemp4 f19 + +#define xsum1 f20 +#define xsum2 f21 +#define xsum3 f22 +#define xsum4 f23 + +#define a1 f24 +#define a2 f25 +#define a3 f26 +#define a4 f27 +#define a5 f28 +#define a6 f29 +#define a7 f30 +#define a8 f31 + +#define alpha_r f1 +#define alpha_i f2 + +#if defined(PPCG4) +#define PREFETCHSIZE_A 24 +#endif + +#if defined(PPC440) || defined(PPC440FP2) +#define PREFETCHSIZE_A 24 +#endif + +#ifdef PPC970 +#define PREFETCHSIZE_A 32 +#endif + +#ifdef CELL +#define PREFETCHSIZE_A 72 +#endif + +#ifdef POWER4 +#define PREFETCHSIZE_A 16 +#endif + +#ifdef POWER5 +#define PREFETCHSIZE_A 96 +#endif + +#ifdef POWER6 +#define PREFETCHSIZE_A 112 +#endif + +#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) +#define NOP1 +#define NOP2 +#else +#define NOP1 mr LDA, LDA +#define NOP2 mr INCX, INCX +#endif + +#ifndef NEEDPARAM + +#ifndef __64BIT__ +#define STACKSIZE 224 +#define ALPHA_R 200(SP) +#define ALPHA_I 208(SP) +#define FZERO 216(SP) +#else +#define STACKSIZE 280 +#define ALPHA_R 256(SP) +#define ALPHA_I 264(SP) +#define FZERO 272(SP) +#endif + +#ifndef HEMV +#define FMADD1 FNMSUB +#define FMADD2 FMADD +#else +#define FMADD1 FMADD +#define FMADD2 FNMSUB +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r0, FZERO + std r14, 144(SP) + std r15, 152(SP) + std r16, 160(SP) + std r17, 168(SP) + std r18, 176(SP) + std r19, 184(SP) + std r20, 192(SP) + std r21, 200(SP) + std r22, 208(SP) + std r23, 216(SP) + std r24, 224(SP) + std r25, 232(SP) + std r26, 240(SP) + std r27, 248(SP) +#else + stw r0, 0 + FZERO + stw r0, 4 + FZERO + stw r14, 144(SP) + stw r15, 148(SP) + stw r16, 152(SP) + stw r17, 156(SP) + stw r18, 160(SP) + stw r19, 164(SP) + stw r20, 168(SP) + stw r21, 172(SP) + stw r22, 176(SP) + stw r23, 180(SP) + stw r24, 184(SP) + stw r25, 188(SP) + stw r26, 192(SP) + stw r27, 196(SP) +#endif + +#ifdef linux +#ifndef __64BIT__ + lwz BUFFER, 56 + STACKSIZE(SP) +#else + ld Y, 112 + STACKSIZE(SP) + ld INCY, 120 + STACKSIZE(SP) + ld BUFFER, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifndef __64BIT__ +#ifdef DOUBLE + lwz X, 56 + STACKSIZE(SP) + lwz INCX, 60 + STACKSIZE(SP) + lwz Y, 64 + STACKSIZE(SP) + lwz INCY, 68 + STACKSIZE(SP) + lwz BUFFER, 72 + STACKSIZE(SP) +#else + lwz Y, 56 + STACKSIZE(SP) + lwz INCY, 60 + STACKSIZE(SP) + lwz BUFFER, 64 + STACKSIZE(SP) +#endif +#else + ld Y, 112 + STACKSIZE(SP) + ld INCY, 120 + STACKSIZE(SP) + ld BUFFER, 128 + STACKSIZE(SP) +#endif +#endif + + STFD alpha_r, ALPHA_R + STFD alpha_i, ALPHA_I + + slwi LDA, LDA, ZBASE_SHIFT + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + + li PREA, PREFETCHSIZE_A * SIZE + + cmpwi cr0, M, 0 + ble- LL(999) + + cmpwi cr0, INCX, 2 * SIZE + beq LL(05) + + mr XX, X + mr X, BUFFER + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(03) + .align 4 + +LL(01): + LFD a1, 0 * SIZE(XX) + LFD a2, 1 * SIZE(XX) + add XX, XX, INCX + LFD a3, 0 * SIZE(XX) + LFD a4, 1 * SIZE(XX) + add XX, XX, INCX + LFD a5, 0 * SIZE(XX) + LFD a6, 1 * SIZE(XX) + add XX, XX, INCX + LFD a7, 0 * SIZE(XX) + LFD a8, 1 * SIZE(XX) + add XX, XX, INCX + + dcbt XX, PREA + dcbtst BUFFER, PREA + + STFD a1, 0 * SIZE(BUFFER) + STFD a2, 1 * SIZE(BUFFER) + STFD a3, 2 * SIZE(BUFFER) + STFD a4, 3 * SIZE(BUFFER) + STFD a5, 4 * SIZE(BUFFER) + STFD a6, 5 * SIZE(BUFFER) + STFD a7, 6 * SIZE(BUFFER) + STFD a8, 7 * SIZE(BUFFER) + + addi BUFFER, BUFFER, 8 * SIZE + bdnz LL(01) + .align 4 + +LL(03): + andi. r0, M, 3 + mtspr CTR, r0 + ble LL(05) + .align 4 + +LL(04): + LFD a1, 0 * SIZE(XX) + LFD a2, 1 * SIZE(XX) + add XX, XX, INCX + + STFD a1, 0 * SIZE(BUFFER) + STFD a2, 1 * SIZE(BUFFER) + + addi BUFFER, BUFFER, 2 * SIZE + bdnz LL(04) + .align 4 + +LL(05): + mr NEW_Y, Y + lfd f0, FZERO + + cmpwi cr0, INCY, 2 * SIZE + beq LL(10) + + mr NEW_Y, BUFFER + + addi r0, M, 3 + srawi. r0, r0, 2 + mtspr CTR, r0 + .align 4 + +LL(06): + STFD f0, 0 * SIZE(BUFFER) + STFD f0, 1 * SIZE(BUFFER) + STFD f0, 2 * SIZE(BUFFER) + STFD f0, 3 * SIZE(BUFFER) + STFD f0, 4 * SIZE(BUFFER) + STFD f0, 5 * SIZE(BUFFER) + STFD f0, 6 * SIZE(BUFFER) + STFD f0, 7 * SIZE(BUFFER) + addi BUFFER, BUFFER, 8 * SIZE + bdnz LL(06) + .align 4 + +LL(10): + li IS, 0 + + cmpwi cr0, N, 2 + blt LL(20) + .align 4 + +LL(11): + mr AO1, A + slwi TEMP, IS, ZBASE_SHIFT + add AO2, A, LDA + add XX, X, TEMP + + add A, AO2, LDA + add YY, NEW_Y, TEMP + addi A, A, 4 * SIZE + NOP2 + + LFD y05, ALPHA_R + LFD y06, ALPHA_I + + LFD atemp1, 0 * SIZE(XX) + LFD atemp2, 1 * SIZE(XX) + LFD atemp3, 2 * SIZE(XX) + LFD atemp4, 3 * SIZE(XX) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD a7, 2 * SIZE(AO2) + LFD a8, 3 * SIZE(AO2) + + FMUL xsum1, atemp1, a1 + addi AO2, AO2, 4 * SIZE + FMUL xsum2, atemp2, a1 + LFD a1, 4 * SIZE(AO1) + FMUL xsum3, atemp1, a3 + addi AO1, AO1, 4 * SIZE + FMUL xsum4, atemp2, a3 + LFD a5, 0 * SIZE(AO2) + +#ifndef HEMV + FNMSUB xsum1, atemp2, a2, xsum1 +#endif + addi XX, XX, 4 * SIZE +#ifndef HEMV + FMADD xsum2, atemp1, a2, xsum2 +#endif + LFD a2, 1 * SIZE(AO1) + FNMSUB xsum3, atemp2, a4, xsum3 + addi YY, YY, 4 * SIZE + FMADD xsum4, atemp1, a4, xsum4 + LFD a6, 1 * SIZE(AO2) + + FMADD xsum1, atemp3, a3, xsum1 + sub TEMP, M, IS + FMADD xsum2, atemp4, a3, xsum2 + LFD a3, 2 * SIZE(AO1) + FMADD xsum3, atemp3, a7, xsum3 + addi TEMP, TEMP, -2 + FMADD xsum4, atemp4, a7, xsum4 + LFD a7, 2 * SIZE(AO2) + + FMADD1 xsum1, atemp4, a4, xsum1 + srawi. r0, TEMP, 3 + FMADD2 xsum2, atemp3, a4, xsum2 + LFD a4, 3 * SIZE(AO1) +#ifndef HEMV + FMADD1 xsum3, atemp4, a8, xsum3 +#endif + mtspr CTR, r0 +#ifndef HEMV + FMADD2 xsum4, atemp3, a8, xsum4 +#endif + LFD a8, 3 * SIZE(AO2) + + FMUL xtemp1, y05, atemp1 + LFD y01, 0 * SIZE(YY) + FMUL xtemp2, y06, atemp1 + LFD y02, 1 * SIZE(YY) + FMUL xtemp3, y05, atemp3 + LFD y03, 2 * SIZE(YY) + FMUL xtemp4, y06, atemp3 + LFD y04, 3 * SIZE(YY) + + FNMSUB atemp1, y06, atemp2, xtemp1 + LFD xtemp1, 0 * SIZE(XX) + FMADD atemp2, y05, atemp2, xtemp2 + LFD xtemp2, 1 * SIZE(XX) + FNMSUB atemp3, y06, atemp4, xtemp3 + LFD xtemp3, 2 * SIZE(XX) + FMADD atemp4, y05, atemp4, xtemp4 + LFD xtemp4, 3 * SIZE(XX) + + NOP1 + ble LL(15) + + FMADD xsum1, xtemp1, a1, xsum1 + DCBT(AO1, PREA) + FMADD y01, atemp1, a1, y01 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + NOP1 + FMADD y02, atemp2, a1, y02 + LFD a1, 4 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + NOP1 + FMADD y04, atemp2, a3, y04 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y05, 4 * SIZE(YY) + FNMSUB y01, atemp2, a2, y01 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y06, 5 * SIZE(YY) + FMADD y02, atemp1, a2, y02 + LFD a2, 5 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 5 * SIZE(XX) + FNMSUB y03, atemp2, a4, y03 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 4 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y07, 6 * SIZE(YY) + FMADD y01, atemp3, a5, y01 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 6 * SIZE(AO1) + FMADD y02, atemp4, a5, y02 + LFD a5, 4 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y08, 7 * SIZE(YY) + FMADD y03, atemp3, a7, y03 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y04, atemp4, a7, y04 + LFD a7, 6 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y01, atemp4, a6, y01 +# DCBT(X, PREX) + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 7 * SIZE(AO1) + FMADD y02, atemp3, a6, y02 + LFD a6, 5 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 7 * SIZE(XX) + FNMSUB y03, atemp4, a8, y03 + NOP2 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 6 * SIZE(XX) + FMADD y04, atemp3, a8, y04 + LFD a8, 7 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y01, 0 * SIZE(YY) + FMADD y05, atemp1, a1, y05 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y02, 1 * SIZE(YY) + FMADD y06, atemp2, a1, y06 + LFD a1, 8 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y03, 2 * SIZE(YY) + FMADD y07, atemp1, a3, y07 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y04, 3 * SIZE(YY) + FMADD y08, atemp2, a3, y08 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y01, 8 * SIZE(YY) + FNMSUB y05, atemp2, a2, y05 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y02, 9 * SIZE(YY) + FMADD y06, atemp1, a2, y06 + LFD a2, 9 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 9 * SIZE(XX) + FNMSUB y07, atemp2, a4, y07 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 8 * SIZE(XX) + FMADD y08, atemp1, a4, y08 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y03, 10 * SIZE(YY) + FMADD y05, atemp3, a5, y05 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 10 * SIZE(AO1) + FMADD y06, atemp4, a5, y06 + LFD a5, 8 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y04, 11 * SIZE(YY) + FMADD y07, atemp3, a7, y07 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y08, atemp4, a7, y08 + LFD a7, 10 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y05, atemp4, a6, y05 + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 11 * SIZE(AO1) + FMADD y06, atemp3, a6, y06 + LFD a6, 9 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 11 * SIZE(XX) + FNMSUB y07, atemp4, a8, y07 + bdz LL(13) + .align 4 + +LL(12): + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 10 * SIZE(XX) + FMADD y08, atemp3, a8, y08 + LFD a8, 11 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y05, 4 * SIZE(YY) + FMADD y01, atemp1, a1, y01 + DCBT(AO2, PREA) + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y06, 5 * SIZE(YY) + FMADD y02, atemp2, a1, y02 + LFD a1, 12 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y07, 6 * SIZE(YY) + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y08, 7 * SIZE(YY) + FMADD y04, atemp2, a3, y04 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y05, 12 * SIZE(YY) + FNMSUB y01, atemp2, a2, y01 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y06, 13 * SIZE(YY) + FMADD y02, atemp1, a2, y02 + LFD a2, 13 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 13 * SIZE(XX) + FNMSUB y03, atemp2, a4, y03 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 12 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y07, 14 * SIZE(YY) + FMADD y01, atemp3, a5, y01 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 14 * SIZE(AO1) + FMADD y02, atemp4, a5, y02 + LFD a5, 12 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y08, 15 * SIZE(YY) + FMADD y03, atemp3, a7, y03 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y04, atemp4, a7, y04 + LFD a7, 14 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y01, atemp4, a6, y01 +# DCBT(Y1, PREY) + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 15 * SIZE(AO1) + FMADD y02, atemp3, a6, y02 + LFD a6, 13 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 15 * SIZE(XX) + FNMSUB y03, atemp4, a8, y03 + NOP2 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 14 * SIZE(XX) + FMADD y04, atemp3, a8, y04 + LFD a8, 15 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y01, 8 * SIZE(YY) + FMADD y05, atemp1, a1, y05 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y02, 9 * SIZE(YY) + FMADD y06, atemp2, a1, y06 + LFD a1, 16 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y03, 10 * SIZE(YY) + FMADD y07, atemp1, a3, y07 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y04, 11 * SIZE(YY) + FMADD y08, atemp2, a3, y08 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y01, 16 * SIZE(YY) + FNMSUB y05, atemp2, a2, y05 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y02, 17 * SIZE(YY) + FMADD y06, atemp1, a2, y06 + LFD a2, 17 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 17 * SIZE(XX) + FNMSUB y07, atemp2, a4, y07 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 16 * SIZE(XX) + FMADD y08, atemp1, a4, y08 + addi AO2, AO2, 16 * SIZE + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y03, 18 * SIZE(YY) + FMADD y05, atemp3, a5, y05 + addi XX, XX, 16 * SIZE + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 18 * SIZE(AO1) + FMADD y06, atemp4, a5, y06 + LFD a5, 0 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y04, 19 * SIZE(YY) + FMADD y07, atemp3, a7, y07 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + addi AO1, AO1, 16 * SIZE + FMADD y08, atemp4, a7, y08 + LFD a7, 2 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + addi YY, YY, 16 * SIZE + FNMSUB y05, atemp4, a6, y05 + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 3 * SIZE(AO1) + FMADD y06, atemp3, a6, y06 + LFD a6, 1 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 3 * SIZE(XX) + FNMSUB y07, atemp4, a8, y07 + NOP2 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 2 * SIZE(XX) + FMADD y08, atemp3, a8, y08 + LFD a8, 3 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y05, -4 * SIZE(YY) + FMADD y01, atemp1, a1, y01 + DCBT(AO1, PREA) + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y06, -3 * SIZE(YY) + FMADD y02, atemp2, a1, y02 + LFD a1, 4 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y07, -2 * SIZE(YY) + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y08, -1 * SIZE(YY) + FMADD y04, atemp2, a3, y04 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y05, 4 * SIZE(YY) + FNMSUB y01, atemp2, a2, y01 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y06, 5 * SIZE(YY) + FMADD y02, atemp1, a2, y02 + LFD a2, 5 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 5 * SIZE(XX) + FNMSUB y03, atemp2, a4, y03 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 4 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y07, 6 * SIZE(YY) + FMADD y01, atemp3, a5, y01 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 6 * SIZE(AO1) + FMADD y02, atemp4, a5, y02 + LFD a5, 4 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y08, 7 * SIZE(YY) + FMADD y03, atemp3, a7, y03 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y04, atemp4, a7, y04 + LFD a7, 6 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y01, atemp4, a6, y01 +# DCBT(X, PREX) + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 7 * SIZE(AO1) + FMADD y02, atemp3, a6, y02 + LFD a6, 5 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 7 * SIZE(XX) + FNMSUB y03, atemp4, a8, y03 + NOP2 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 6 * SIZE(XX) + FMADD y04, atemp3, a8, y04 + LFD a8, 7 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y01, 0 * SIZE(YY) + FMADD y05, atemp1, a1, y05 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y02, 1 * SIZE(YY) + FMADD y06, atemp2, a1, y06 + LFD a1, 8 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y03, 2 * SIZE(YY) + FMADD y07, atemp1, a3, y07 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y04, 3 * SIZE(YY) + FMADD y08, atemp2, a3, y08 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y01, 8 * SIZE(YY) + FNMSUB y05, atemp2, a2, y05 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y02, 9 * SIZE(YY) + FMADD y06, atemp1, a2, y06 + LFD a2, 9 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 9 * SIZE(XX) + FNMSUB y07, atemp2, a4, y07 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 8 * SIZE(XX) + FMADD y08, atemp1, a4, y08 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y03, 10 * SIZE(YY) + FMADD y05, atemp3, a5, y05 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 10 * SIZE(AO1) + FMADD y06, atemp4, a5, y06 + LFD a5, 8 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y04, 11 * SIZE(YY) + FMADD y07, atemp3, a7, y07 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y08, atemp4, a7, y08 + LFD a7, 10 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y05, atemp4, a6, y05 + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 11 * SIZE(AO1) + FMADD y06, atemp3, a6, y06 + LFD a6, 9 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 11 * SIZE(XX) + FNMSUB y07, atemp4, a8, y07 + bdnz LL(12) + .align 4 + +LL(13): + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 10 * SIZE(XX) + FMADD y08, atemp3, a8, y08 + LFD a8, 11 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y05, 4 * SIZE(YY) + FMADD y01, atemp1, a1, y01 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y06, 5 * SIZE(YY) + FMADD y02, atemp2, a1, y02 + LFD a1, 12 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y07, 6 * SIZE(YY) + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y08, 7 * SIZE(YY) + FMADD y04, atemp2, a3, y04 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y05, 12 * SIZE(YY) + FNMSUB y01, atemp2, a2, y01 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y06, 13 * SIZE(YY) + FMADD y02, atemp1, a2, y02 + LFD a2, 13 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 13 * SIZE(XX) + FNMSUB y03, atemp2, a4, y03 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 12 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y07, 14 * SIZE(YY) + FMADD y01, atemp3, a5, y01 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 14 * SIZE(AO1) + FMADD y02, atemp4, a5, y02 + LFD a5, 12 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y08, 15 * SIZE(YY) + FMADD y03, atemp3, a7, y03 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y04, atemp4, a7, y04 + LFD a7, 14 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y01, atemp4, a6, y01 + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 15 * SIZE(AO1) + FMADD y02, atemp3, a6, y02 + LFD a6, 13 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 15 * SIZE(XX) + FNMSUB y03, atemp4, a8, y03 + NOP2 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 14 * SIZE(XX) + FMADD y04, atemp3, a8, y04 + LFD a8, 15 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y01, 8 * SIZE(YY) + FMADD y05, atemp1, a1, y05 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y02, 9 * SIZE(YY) + FMADD y06, atemp2, a1, y06 + LFD a1, 16 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y03, 10 * SIZE(YY) + FMADD y07, atemp1, a3, y07 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y04, 11 * SIZE(YY) + FMADD y08, atemp2, a3, y08 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y01, 16 * SIZE(YY) + FNMSUB y05, atemp2, a2, y05 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y02, 17 * SIZE(YY) + FMADD y06, atemp1, a2, y06 + LFD a2, 17 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 17 * SIZE(XX) + FNMSUB y07, atemp2, a4, y07 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 16 * SIZE(XX) + FMADD y08, atemp1, a4, y08 + addi AO2, AO2, 16 * SIZE + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y03, 18 * SIZE(YY) + FMADD y05, atemp3, a5, y05 + addi XX, XX, 16 * SIZE + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 18 * SIZE(AO1) + FMADD y06, atemp4, a5, y06 + LFD a5, 0 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y04, 19 * SIZE(YY) + FMADD y07, atemp3, a7, y07 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + addi AO1, AO1, 16 * SIZE + FMADD y08, atemp4, a7, y08 + LFD a7, 2 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + addi YY, YY, 16 * SIZE + FNMSUB y05, atemp4, a6, y05 + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 3 * SIZE(AO1) + FMADD y06, atemp3, a6, y06 + LFD a6, 1 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 3 * SIZE(XX) + FNMSUB y07, atemp4, a8, y07 + NOP2 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 2 * SIZE(XX) + FMADD y08, atemp3, a8, y08 + LFD a8, 3 * SIZE(AO2) + + STFD y05, -4 * SIZE(YY) + STFD y06, -3 * SIZE(YY) + STFD y07, -2 * SIZE(YY) + STFD y08, -1 * SIZE(YY) + .align 4 + +LL(15): + andi. r0, TEMP, 4 + ble LL(16) + + FMADD xsum1, xtemp1, a1, xsum1 + NOP1 + FMADD y01, atemp1, a1, y01 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + NOP1 + FMADD y02, atemp2, a1, y02 + LFD a1, 4 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + NOP1 + FMADD y04, atemp2, a3, y04 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y05, 4 * SIZE(YY) + FNMSUB y01, atemp2, a2, y01 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y06, 5 * SIZE(YY) + FMADD y02, atemp1, a2, y02 + LFD a2, 5 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 5 * SIZE(XX) + FNMSUB y03, atemp2, a4, y03 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 4 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y07, 6 * SIZE(YY) + FMADD y01, atemp3, a5, y01 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 6 * SIZE(AO1) + FMADD y02, atemp4, a5, y02 + LFD a5, 4 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y08, 7 * SIZE(YY) + FMADD y03, atemp3, a7, y03 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y04, atemp4, a7, y04 + LFD a7, 6 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y01, atemp4, a6, y01 + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 7 * SIZE(AO1) + FMADD y02, atemp3, a6, y02 + LFD a6, 5 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 7 * SIZE(XX) + FNMSUB y03, atemp4, a8, y03 + NOP2 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 6 * SIZE(XX) + FMADD y04, atemp3, a8, y04 + LFD a8, 7 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y01, 0 * SIZE(YY) + FMADD y05, atemp1, a1, y05 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y02, 1 * SIZE(YY) + FMADD y06, atemp2, a1, y06 + LFD a1, 8 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y03, 2 * SIZE(YY) + FMADD y07, atemp1, a3, y07 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y04, 3 * SIZE(YY) + FMADD y08, atemp2, a3, y08 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y01, 8 * SIZE(YY) + FNMSUB y05, atemp2, a2, y05 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y02, 9 * SIZE(YY) + FMADD y06, atemp1, a2, y06 + LFD a2, 9 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 9 * SIZE(XX) + FNMSUB y07, atemp2, a4, y07 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 8 * SIZE(XX) + FMADD y08, atemp1, a4, y08 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y03, 10 * SIZE(YY) + FMADD y05, atemp3, a5, y05 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 10 * SIZE(AO1) + FMADD y06, atemp4, a5, y06 + LFD a5, 8 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y04, 11 * SIZE(YY) + FMADD y07, atemp3, a7, y07 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + addi YY, YY, 8 * SIZE + FMADD y08, atemp4, a7, y08 + LFD a7, 10 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + addi AO2, AO2, 8 * SIZE + FNMSUB y05, atemp4, a6, y05 + addi XX, XX, 8 * SIZE + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 11 * SIZE(AO1) + FMADD y06, atemp3, a6, y06 + LFD a6, 1 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 3 * SIZE(XX) + FNMSUB y07, atemp4, a8, y07 + addi AO1, AO1, 8 * SIZE + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 2 * SIZE(XX) + FMADD y08, atemp3, a8, y08 + LFD a8, 3 * SIZE(AO2) + + STFD y05, -4 * SIZE(YY) + STFD y06, -3 * SIZE(YY) + STFD y07, -2 * SIZE(YY) + STFD y08, -1 * SIZE(YY) + .align 4 + +LL(16): + andi. r0, TEMP, 2 + ble LL(17) + + FMADD xsum1, xtemp1, a1, xsum1 + NOP1 + FMADD y01, atemp1, a1, y01 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + NOP1 + FMADD y02, atemp2, a1, y02 + LFD a1, 4 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + FMADD y03, atemp1, a3, y03 + FMADD xsum4, xtemp2, a5, xsum4 + FMADD y04, atemp2, a3, y04 + + FMADD1 xsum1, xtemp2, a2, xsum1 + NOP1 + FNMSUB y01, atemp2, a2, y01 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + LFD a2, 5 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 5 * SIZE(XX) + FNMSUB y03, atemp2, a4, y03 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 4 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + NOP1 + FMADD y01, atemp3, a5, y01 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + NOP1 + FMADD y02, atemp4, a5, y02 + LFD a5, 4 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + FMADD y03, atemp3, a7, y03 + FMADD xsum4, xtemp4, a7, xsum4 + FMADD y04, atemp4, a7, y04 + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y01, atemp4, a6, y01 + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + NOP1 + FMADD y02, atemp3, a6, y02 + LFD a6, 5 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + addi AO1, AO1, 4 * SIZE + FNMSUB y03, atemp4, a8, y03 + addi AO2, AO2, 4 * SIZE + FMADD2 xsum4, xtemp3, a8, xsum4 + addi YY, YY, 4 * SIZE + FMADD y04, atemp3, a8, y04 + NOP2 + + STFD y01, -4 * SIZE(YY) + LFD y01, 0 * SIZE(YY) + STFD y02, -3 * SIZE(YY) + LFD y02, 1 * SIZE(YY) + + STFD y03, -2 * SIZE(YY) + STFD y04, -1 * SIZE(YY) + .align 4 + +LL(17): + andi. r0, M, 1 + ble LL(18) + + FMADD xsum1, xtemp1, a1, xsum1 + FMADD y01, atemp1, a1, y01 + FMADD xsum2, xtemp2, a1, xsum2 + FMADD y02, atemp2, a1, y02 + FMADD xsum3, xtemp1, a5, xsum3 + FNMSUB y01, atemp2, a2, y01 + FMADD xsum4, xtemp2, a5, xsum4 + FMADD y02, atemp1, a2, y02 + + FMADD1 xsum1, xtemp2, a2, xsum1 + FMADD y01, atemp3, a5, y01 + FMADD2 xsum2, xtemp1, a2, xsum2 + FMADD y02, atemp4, a5, y02 + FMADD1 xsum3, xtemp2, a6, xsum3 + FNMSUB y01, atemp4, a6, y01 + FMADD2 xsum4, xtemp1, a6, xsum4 + FMADD y02, atemp3, a6, y02 + + STFD y01, 0 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + STFD y03, 2 * SIZE(YY) + STFD y04, 3 * SIZE(YY) + .align 4 + +LL(18): + LFD y05, ALPHA_R + LFD y06, ALPHA_I + + slwi TEMP, IS, ZBASE_SHIFT + add YY, NEW_Y, TEMP + + LFD y01, 0 * SIZE(YY) + LFD y02, 1 * SIZE(YY) + LFD y03, 2 * SIZE(YY) + LFD y04, 3 * SIZE(YY) + + FMUL xtemp1, y05, xsum1 + FMUL xtemp2, y06, xsum1 + FMUL xtemp3, y05, xsum3 + FMUL xtemp4, y06, xsum3 + + FNMSUB xsum1, y06, xsum2, xtemp1 + FMADD xsum2, y05, xsum2, xtemp2 + FNMSUB xsum3, y06, xsum4, xtemp3 + FMADD xsum4, y05, xsum4, xtemp4 + + FADD y01, y01, xsum1 + FADD y02, y02, xsum2 + FADD y03, y03, xsum3 + FADD y04, y04, xsum4 + + STFD y01, 0 * SIZE(YY) + addi TEMP, IS, 4 + STFD y02, 1 * SIZE(YY) + addi IS, IS, 2 + STFD y03, 2 * SIZE(YY) + cmpw cr0, TEMP, N + STFD y04, 3 * SIZE(YY) + ble LL(11) + .align 4 + +LL(20): + andi. TEMP, N, 1 + ble LL(990) + + slwi TEMP, IS, ZBASE_SHIFT + add XX, X, TEMP + add YY, NEW_Y, TEMP + + LFD y05, ALPHA_R + LFD y06, ALPHA_I + + LFD atemp1, 0 * SIZE(XX) + LFD atemp2, 1 * SIZE(XX) + + LFD a1, 0 * SIZE(A) + LFD a2, 1 * SIZE(A) + + FMUL xsum1, atemp1, a1 + FMUL xsum2, atemp2, a1 + +#ifndef HEMV + FNMSUB xsum1, atemp2, a2, xsum1 + FMADD xsum2, atemp1, a2, xsum2 +#endif + + FMUL xtemp1, y05, atemp1 + FMUL xtemp2, y06, atemp1 + + FNMSUB atemp1, y06, atemp2, xtemp1 + FMADD atemp2, y05, atemp2, xtemp2 + + LFD y05, ALPHA_R + LFD y06, ALPHA_I + + LFD y01, 0 * SIZE(YY) + LFD y02, 1 * SIZE(YY) + + FMUL xtemp1, y05, xsum1 + FMUL xtemp2, y06, xsum1 + + FNMSUB xsum1, y06, xsum2, xtemp1 + FMADD xsum2, y05, xsum2, xtemp2 + + FADD y01, y01, xsum1 + FADD y02, y02, xsum2 + + STFD y01, 0 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + .align 4 + +LL(990): + cmpwi cr0, INCY, 2 * SIZE + beq LL(999) + + mr YY, Y + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(995) + .align 4 + +LL(991): + LFD f0, 0 * SIZE(Y) + LFD f1, 1 * SIZE(Y) + add Y, Y, INCY + LFD f2, 0 * SIZE(Y) + LFD f3, 1 * SIZE(Y) + add Y, Y, INCY + LFD f4, 0 * SIZE(Y) + LFD f5, 1 * SIZE(Y) + add Y, Y, INCY + LFD f6, 0 * SIZE(Y) + LFD f7, 1 * SIZE(Y) + add Y, Y, INCY + + LFD f8, 0 * SIZE(NEW_Y) + LFD f9, 1 * SIZE(NEW_Y) + LFD f10, 2 * SIZE(NEW_Y) + LFD f11, 3 * SIZE(NEW_Y) + LFD f12, 4 * SIZE(NEW_Y) + LFD f13, 5 * SIZE(NEW_Y) + LFD f14, 6 * SIZE(NEW_Y) + LFD f15, 7 * SIZE(NEW_Y) + addi NEW_Y, NEW_Y, 8 * SIZE + + FADD f8, f8, f0 + FADD f9, f9, f1 + FADD f10, f10, f2 + FADD f11, f11, f3 + FADD f12, f12, f4 + FADD f13, f13, f5 + FADD f14, f14, f6 + FADD f15, f15, f7 + + STFD f8, 0 * SIZE(YY) + STFD f9, 1 * SIZE(YY) + add YY, YY, INCY + STFD f10, 0 * SIZE(YY) + STFD f11, 1 * SIZE(YY) + add YY, YY, INCY + STFD f12, 0 * SIZE(YY) + STFD f13, 1 * SIZE(YY) + add YY, YY, INCY + STFD f14, 0 * SIZE(YY) + STFD f15, 1 * SIZE(YY) + add YY, YY, INCY + bdnz LL(991) + .align 4 + +LL(995): + andi. J, M, 2 + ble LL(996) + + LFD f0, 0 * SIZE(Y) + LFD f1, 1 * SIZE(Y) + add Y, Y, INCY + LFD f2, 0 * SIZE(Y) + LFD f3, 1 * SIZE(Y) + add Y, Y, INCY + + LFD f8, 0 * SIZE(NEW_Y) + LFD f9, 1 * SIZE(NEW_Y) + LFD f10, 2 * SIZE(NEW_Y) + LFD f11, 3 * SIZE(NEW_Y) + addi NEW_Y, NEW_Y, 4 * SIZE + + FADD f8, f8, f0 + FADD f9, f9, f1 + FADD f10, f10, f2 + FADD f11, f11, f3 + + STFD f8, 0 * SIZE(YY) + STFD f9, 1 * SIZE(YY) + add YY, YY, INCY + STFD f10, 0 * SIZE(YY) + STFD f11, 1 * SIZE(YY) + add YY, YY, INCY + .align 4 + +LL(996): + andi. J, M, 1 + ble LL(999) + + LFD f0, 0 * SIZE(Y) + LFD f1, 1 * SIZE(Y) + + LFD f8, 0 * SIZE(NEW_Y) + LFD f9, 1 * SIZE(NEW_Y) + + FADD f8, f8, f0 + FADD f9, f9, f1 + + STFD f8, 0 * SIZE(YY) + STFD f9, 1 * SIZE(YY) + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r14, 144(SP) + ld r15, 152(SP) + ld r16, 160(SP) + ld r17, 168(SP) + ld r18, 176(SP) + ld r19, 184(SP) + ld r20, 192(SP) + ld r21, 200(SP) + ld r22, 208(SP) + ld r23, 216(SP) + ld r24, 224(SP) + ld r25, 232(SP) + ld r26, 240(SP) + ld r27, 248(SP) +#else + lwz r14, 144(SP) + lwz r15, 148(SP) + lwz r16, 152(SP) + lwz r17, 156(SP) + lwz r18, 160(SP) + lwz r19, 164(SP) + lwz r20, 168(SP) + lwz r21, 172(SP) + lwz r22, 176(SP) + lwz r23, 180(SP) + lwz r24, 184(SP) + lwz r25, 188(SP) + lwz r26, 192(SP) + lwz r27, 196(SP) +#endif + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/zsymv_U.S b/kernel/power/zsymv_U.S new file mode 100644 index 0000000..dbf6ebb --- /dev/null +++ b/kernel/power/zsymv_U.S @@ -0,0 +1,1653 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define M r3 +#define IS r4 +#define A r5 +#define LDA r6 +#define X r7 +#define INCX r8 +#define Y r9 +#define INCY r10 +#define BUFFER r14 +#else +#define M r3 +#define IS r4 +#define A r7 +#define LDA r8 +#define X r9 +#define INCX r10 +#define Y r5 +#define INCY r6 +#define BUFFER r14 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define M r3 +#define IS r4 +#define A r9 +#define LDA r10 +#define X r5 +#define INCX r6 +#define Y r7 +#define INCY r8 +#define BUFFER r14 +#else +#define M r3 +#define IS r4 +#define A r7 +#define LDA r8 +#define X r9 +#define INCX r10 +#define Y r5 +#define INCY r6 +#define BUFFER r14 +#endif +#endif + +#define I r11 +#define J r12 + +#define AO1 r15 +#define AO2 r16 +#define XX r19 +#define YY r20 +#define NEW_Y r21 +#define TEMP r22 +#define PREA r24 + +#define y01 f0 +#define y02 f1 +#define y03 f2 +#define y04 f3 +#define y05 f4 +#define y06 f5 +#define y07 f6 +#define y08 f7 + +#define xtemp1 f8 +#define xtemp2 f9 +#define xtemp3 f10 +#define xtemp4 f11 +#define xtemp5 f12 +#define xtemp6 f13 +#define xtemp7 f14 +#define xtemp8 f15 + +#define atemp1 f16 +#define atemp2 f17 +#define atemp3 f18 +#define atemp4 f19 + +#define xsum1 f20 +#define xsum2 f21 +#define xsum3 f22 +#define xsum4 f23 + +#define a1 f24 +#define a2 f25 +#define a3 f26 +#define a4 f27 +#define a5 f28 +#define a6 f29 +#define a7 f30 +#define a8 f31 + +#define alpha_r f1 +#define alpha_i f2 + +#if defined(PPCG4) +#define PREFETCHSIZE_A 24 +#endif + +#if defined(PPC440) || defined(PPC440FP2) +#define PREFETCHSIZE_A 24 +#endif + +#ifdef PPC970 +#define PREFETCHSIZE_A 32 +#endif + +#ifdef CELL +#define PREFETCHSIZE_A 72 +#endif + +#ifdef POWER4 +#define PREFETCHSIZE_A 16 +#endif + +#ifdef POWER5 +#define PREFETCHSIZE_A 96 +#endif + +#ifdef POWER6 +#define PREFETCHSIZE_A 112 +#endif + +#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) +#define NOP1 +#define NOP2 +#else +#define NOP1 mr LDA, LDA +#define NOP2 mr INCX, INCX +#endif + +#ifndef NEEDPARAM + +#ifndef __64BIT__ +#define STACKSIZE 224 +#define ALPHA_R 200(SP) +#define ALPHA_I 208(SP) +#define FZERO 216(SP) +#else +#define STACKSIZE 280 +#define ALPHA_R 256(SP) +#define ALPHA_I 264(SP) +#define FZERO 272(SP) +#endif + +#ifndef HEMV +#define FMADD1 FNMSUB +#define FMADD2 FMADD +#else +#define FMADD1 FMADD +#define FMADD2 FNMSUB +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r0, FZERO + std r14, 144(SP) + std r15, 152(SP) + std r16, 160(SP) + std r17, 168(SP) + std r18, 176(SP) + std r19, 184(SP) + std r20, 192(SP) + std r21, 200(SP) + std r22, 208(SP) + std r23, 216(SP) + std r24, 224(SP) + std r25, 232(SP) + std r26, 240(SP) + std r27, 248(SP) +#else + stw r0, 0 + FZERO + stw r0, 4 + FZERO + stw r14, 144(SP) + stw r15, 148(SP) + stw r16, 152(SP) + stw r17, 156(SP) + stw r18, 160(SP) + stw r19, 164(SP) + stw r20, 168(SP) + stw r21, 172(SP) + stw r22, 176(SP) + stw r23, 180(SP) + stw r24, 184(SP) + stw r25, 188(SP) + stw r26, 192(SP) + stw r27, 196(SP) +#endif + +#ifdef linux +#ifndef __64BIT__ + lwz BUFFER, 56 + STACKSIZE(SP) +#else + ld Y, 112 + STACKSIZE(SP) + ld INCY, 120 + STACKSIZE(SP) + ld BUFFER, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifndef __64BIT__ +#ifdef DOUBLE + lwz X, 56 + STACKSIZE(SP) + lwz INCX, 60 + STACKSIZE(SP) + lwz Y, 64 + STACKSIZE(SP) + lwz INCY, 68 + STACKSIZE(SP) + lwz BUFFER, 72 + STACKSIZE(SP) +#else + lwz Y, 56 + STACKSIZE(SP) + lwz INCY, 60 + STACKSIZE(SP) + lwz BUFFER, 64 + STACKSIZE(SP) +#endif +#else + ld Y, 112 + STACKSIZE(SP) + ld INCY, 120 + STACKSIZE(SP) + ld BUFFER, 128 + STACKSIZE(SP) +#endif +#endif + + STFD alpha_r, ALPHA_R + STFD alpha_i, ALPHA_I + + slwi LDA, LDA, ZBASE_SHIFT + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + + li PREA, PREFETCHSIZE_A * SIZE + sub IS, M, IS + + cmpwi cr0, M, 0 + ble- LL(999) + + mullw TEMP, IS, LDA + add A, A, TEMP + + cmpwi cr0, INCX, 2 * SIZE + beq LL(05) + + mr XX, X + mr X, BUFFER + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(03) + .align 4 + +LL(01): + LFD a1, 0 * SIZE(XX) + LFD a2, 1 * SIZE(XX) + add XX, XX, INCX + LFD a3, 0 * SIZE(XX) + LFD a4, 1 * SIZE(XX) + add XX, XX, INCX + LFD a5, 0 * SIZE(XX) + LFD a6, 1 * SIZE(XX) + add XX, XX, INCX + LFD a7, 0 * SIZE(XX) + LFD a8, 1 * SIZE(XX) + add XX, XX, INCX + + dcbt XX, PREA + dcbtst BUFFER, PREA + + STFD a1, 0 * SIZE(BUFFER) + STFD a2, 1 * SIZE(BUFFER) + STFD a3, 2 * SIZE(BUFFER) + STFD a4, 3 * SIZE(BUFFER) + STFD a5, 4 * SIZE(BUFFER) + STFD a6, 5 * SIZE(BUFFER) + STFD a7, 6 * SIZE(BUFFER) + STFD a8, 7 * SIZE(BUFFER) + + addi BUFFER, BUFFER, 8 * SIZE + bdnz LL(01) + .align 4 + +LL(03): + andi. r0, M, 3 + mtspr CTR, r0 + ble LL(05) + .align 4 + +LL(04): + LFD a1, 0 * SIZE(XX) + LFD a2, 1 * SIZE(XX) + add XX, XX, INCX + + STFD a1, 0 * SIZE(BUFFER) + STFD a2, 1 * SIZE(BUFFER) + + addi BUFFER, BUFFER, 2 * SIZE + bdnz LL(04) + .align 4 + +LL(05): + mr NEW_Y, Y + lfd f0, FZERO + + cmpwi cr0, INCY, 2 * SIZE + beq LL(10) + + mr NEW_Y, BUFFER + + addi r0, M, 3 + srawi. r0, r0, 2 + mtspr CTR, r0 + .align 4 + +LL(06): + STFD f0, 0 * SIZE(BUFFER) + STFD f0, 1 * SIZE(BUFFER) + STFD f0, 2 * SIZE(BUFFER) + STFD f0, 3 * SIZE(BUFFER) + STFD f0, 4 * SIZE(BUFFER) + STFD f0, 5 * SIZE(BUFFER) + STFD f0, 6 * SIZE(BUFFER) + STFD f0, 7 * SIZE(BUFFER) + addi BUFFER, BUFFER, 8 * SIZE + bdnz LL(06) + .align 4 + +LL(10): + addi TEMP, IS, 2 + cmpw cr0, TEMP, M + bgt LL(20) + .align 4 + +LL(11): + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + slwi TEMP, IS, ZBASE_SHIFT + add TEMP, X, TEMP + + LFD y05, ALPHA_R + LFD y06, ALPHA_I + + LFD xtemp1, 0 * SIZE(TEMP) + LFD xtemp2, 1 * SIZE(TEMP) + LFD xtemp3, 2 * SIZE(TEMP) + LFD xtemp4, 3 * SIZE(TEMP) + + FMUL atemp1, y05, xtemp1 + FMUL atemp2, y06, xtemp1 + FMUL atemp3, y05, xtemp3 + FMUL atemp4, y06, xtemp3 + + FNMSUB atemp1, y06, xtemp2, atemp1 + FMADD atemp2, y05, xtemp2, atemp2 + FNMSUB atemp3, y06, xtemp4, atemp3 + FMADD atemp4, y05, xtemp4, atemp4 + + lfd xsum1, FZERO + fmr xsum2, xsum1 + fmr xsum3, xsum1 + fmr xsum4, xsum1 + + mr XX, X + mr YY, NEW_Y + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD a5, 0 * SIZE(AO2) + LFD a6, 1 * SIZE(AO2) + LFD a7, 2 * SIZE(AO2) + LFD a8, 3 * SIZE(AO2) + + LFD xtemp1, 0 * SIZE(XX) + LFD xtemp2, 1 * SIZE(XX) + LFD xtemp3, 2 * SIZE(XX) + LFD xtemp4, 3 * SIZE(XX) + + LFD y01, 0 * SIZE(YY) + LFD y02, 1 * SIZE(YY) + LFD y03, 2 * SIZE(YY) + LFD y04, 3 * SIZE(YY) + + srawi. r0, IS, 3 + mtspr CTR, r0 + ble LL(15) + + FMADD xsum1, xtemp1, a1, xsum1 + DCBT(AO1, PREA) + FMADD y01, atemp1, a1, y01 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + NOP1 + FMADD y02, atemp2, a1, y02 + LFD a1, 4 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + NOP1 + FMADD y04, atemp2, a3, y04 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y05, 4 * SIZE(YY) + FNMSUB y01, atemp2, a2, y01 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y06, 5 * SIZE(YY) + FMADD y02, atemp1, a2, y02 + LFD a2, 5 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 5 * SIZE(XX) + FNMSUB y03, atemp2, a4, y03 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 4 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y07, 6 * SIZE(YY) + FMADD y01, atemp3, a5, y01 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 6 * SIZE(AO1) + FMADD y02, atemp4, a5, y02 + LFD a5, 4 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y08, 7 * SIZE(YY) + FMADD y03, atemp3, a7, y03 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y04, atemp4, a7, y04 + LFD a7, 6 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y01, atemp4, a6, y01 +# DCBT(X, PREX) + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 7 * SIZE(AO1) + FMADD y02, atemp3, a6, y02 + LFD a6, 5 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 7 * SIZE(XX) + FNMSUB y03, atemp4, a8, y03 + NOP2 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 6 * SIZE(XX) + FMADD y04, atemp3, a8, y04 + LFD a8, 7 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y01, 0 * SIZE(YY) + FMADD y05, atemp1, a1, y05 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y02, 1 * SIZE(YY) + FMADD y06, atemp2, a1, y06 + LFD a1, 8 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y03, 2 * SIZE(YY) + FMADD y07, atemp1, a3, y07 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y04, 3 * SIZE(YY) + FMADD y08, atemp2, a3, y08 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y01, 8 * SIZE(YY) + FNMSUB y05, atemp2, a2, y05 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y02, 9 * SIZE(YY) + FMADD y06, atemp1, a2, y06 + LFD a2, 9 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 9 * SIZE(XX) + FNMSUB y07, atemp2, a4, y07 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 8 * SIZE(XX) + FMADD y08, atemp1, a4, y08 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y03, 10 * SIZE(YY) + FMADD y05, atemp3, a5, y05 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 10 * SIZE(AO1) + FMADD y06, atemp4, a5, y06 + LFD a5, 8 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y04, 11 * SIZE(YY) + FMADD y07, atemp3, a7, y07 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y08, atemp4, a7, y08 + LFD a7, 10 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y05, atemp4, a6, y05 + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 11 * SIZE(AO1) + FMADD y06, atemp3, a6, y06 + LFD a6, 9 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 11 * SIZE(XX) + FNMSUB y07, atemp4, a8, y07 + bdz LL(13) + .align 4 + +LL(12): + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 10 * SIZE(XX) + FMADD y08, atemp3, a8, y08 + LFD a8, 11 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y05, 4 * SIZE(YY) + FMADD y01, atemp1, a1, y01 + DCBT(AO2, PREA) + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y06, 5 * SIZE(YY) + FMADD y02, atemp2, a1, y02 + LFD a1, 12 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y07, 6 * SIZE(YY) + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y08, 7 * SIZE(YY) + FMADD y04, atemp2, a3, y04 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y05, 12 * SIZE(YY) + FNMSUB y01, atemp2, a2, y01 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y06, 13 * SIZE(YY) + FMADD y02, atemp1, a2, y02 + LFD a2, 13 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 13 * SIZE(XX) + FNMSUB y03, atemp2, a4, y03 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 12 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y07, 14 * SIZE(YY) + FMADD y01, atemp3, a5, y01 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 14 * SIZE(AO1) + FMADD y02, atemp4, a5, y02 + LFD a5, 12 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y08, 15 * SIZE(YY) + FMADD y03, atemp3, a7, y03 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y04, atemp4, a7, y04 + LFD a7, 14 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y01, atemp4, a6, y01 +# DCBT(Y1, PREY) + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 15 * SIZE(AO1) + FMADD y02, atemp3, a6, y02 + LFD a6, 13 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 15 * SIZE(XX) + FNMSUB y03, atemp4, a8, y03 + NOP2 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 14 * SIZE(XX) + FMADD y04, atemp3, a8, y04 + LFD a8, 15 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y01, 8 * SIZE(YY) + FMADD y05, atemp1, a1, y05 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y02, 9 * SIZE(YY) + FMADD y06, atemp2, a1, y06 + LFD a1, 16 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y03, 10 * SIZE(YY) + FMADD y07, atemp1, a3, y07 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y04, 11 * SIZE(YY) + FMADD y08, atemp2, a3, y08 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y01, 16 * SIZE(YY) + FNMSUB y05, atemp2, a2, y05 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y02, 17 * SIZE(YY) + FMADD y06, atemp1, a2, y06 + LFD a2, 17 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 17 * SIZE(XX) + FNMSUB y07, atemp2, a4, y07 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 16 * SIZE(XX) + FMADD y08, atemp1, a4, y08 + addi AO2, AO2, 16 * SIZE + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y03, 18 * SIZE(YY) + FMADD y05, atemp3, a5, y05 + addi XX, XX, 16 * SIZE + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 18 * SIZE(AO1) + FMADD y06, atemp4, a5, y06 + LFD a5, 0 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y04, 19 * SIZE(YY) + FMADD y07, atemp3, a7, y07 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + addi AO1, AO1, 16 * SIZE + FMADD y08, atemp4, a7, y08 + LFD a7, 2 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + addi YY, YY, 16 * SIZE + FNMSUB y05, atemp4, a6, y05 + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 3 * SIZE(AO1) + FMADD y06, atemp3, a6, y06 + LFD a6, 1 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 3 * SIZE(XX) + FNMSUB y07, atemp4, a8, y07 + NOP2 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 2 * SIZE(XX) + FMADD y08, atemp3, a8, y08 + LFD a8, 3 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y05, -4 * SIZE(YY) + FMADD y01, atemp1, a1, y01 + DCBT(AO1, PREA) + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y06, -3 * SIZE(YY) + FMADD y02, atemp2, a1, y02 + LFD a1, 4 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y07, -2 * SIZE(YY) + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y08, -1 * SIZE(YY) + FMADD y04, atemp2, a3, y04 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y05, 4 * SIZE(YY) + FNMSUB y01, atemp2, a2, y01 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y06, 5 * SIZE(YY) + FMADD y02, atemp1, a2, y02 + LFD a2, 5 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 5 * SIZE(XX) + FNMSUB y03, atemp2, a4, y03 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 4 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y07, 6 * SIZE(YY) + FMADD y01, atemp3, a5, y01 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 6 * SIZE(AO1) + FMADD y02, atemp4, a5, y02 + LFD a5, 4 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y08, 7 * SIZE(YY) + FMADD y03, atemp3, a7, y03 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y04, atemp4, a7, y04 + LFD a7, 6 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y01, atemp4, a6, y01 +# DCBT(X, PREX) + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 7 * SIZE(AO1) + FMADD y02, atemp3, a6, y02 + LFD a6, 5 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 7 * SIZE(XX) + FNMSUB y03, atemp4, a8, y03 + NOP2 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 6 * SIZE(XX) + FMADD y04, atemp3, a8, y04 + LFD a8, 7 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y01, 0 * SIZE(YY) + FMADD y05, atemp1, a1, y05 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y02, 1 * SIZE(YY) + FMADD y06, atemp2, a1, y06 + LFD a1, 8 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y03, 2 * SIZE(YY) + FMADD y07, atemp1, a3, y07 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y04, 3 * SIZE(YY) + FMADD y08, atemp2, a3, y08 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y01, 8 * SIZE(YY) + FNMSUB y05, atemp2, a2, y05 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y02, 9 * SIZE(YY) + FMADD y06, atemp1, a2, y06 + LFD a2, 9 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 9 * SIZE(XX) + FNMSUB y07, atemp2, a4, y07 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 8 * SIZE(XX) + FMADD y08, atemp1, a4, y08 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y03, 10 * SIZE(YY) + FMADD y05, atemp3, a5, y05 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 10 * SIZE(AO1) + FMADD y06, atemp4, a5, y06 + LFD a5, 8 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y04, 11 * SIZE(YY) + FMADD y07, atemp3, a7, y07 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y08, atemp4, a7, y08 + LFD a7, 10 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y05, atemp4, a6, y05 + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 11 * SIZE(AO1) + FMADD y06, atemp3, a6, y06 + LFD a6, 9 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 11 * SIZE(XX) + FNMSUB y07, atemp4, a8, y07 + bdnz LL(12) + .align 4 + +LL(13): + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 10 * SIZE(XX) + FMADD y08, atemp3, a8, y08 + LFD a8, 11 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y05, 4 * SIZE(YY) + FMADD y01, atemp1, a1, y01 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y06, 5 * SIZE(YY) + FMADD y02, atemp2, a1, y02 + LFD a1, 12 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y07, 6 * SIZE(YY) + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y08, 7 * SIZE(YY) + FMADD y04, atemp2, a3, y04 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y05, 12 * SIZE(YY) + FNMSUB y01, atemp2, a2, y01 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y06, 13 * SIZE(YY) + FMADD y02, atemp1, a2, y02 + LFD a2, 13 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 13 * SIZE(XX) + FNMSUB y03, atemp2, a4, y03 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 12 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y07, 14 * SIZE(YY) + FMADD y01, atemp3, a5, y01 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 14 * SIZE(AO1) + FMADD y02, atemp4, a5, y02 + LFD a5, 12 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y08, 15 * SIZE(YY) + FMADD y03, atemp3, a7, y03 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y04, atemp4, a7, y04 + LFD a7, 14 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y01, atemp4, a6, y01 + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 15 * SIZE(AO1) + FMADD y02, atemp3, a6, y02 + LFD a6, 13 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 15 * SIZE(XX) + FNMSUB y03, atemp4, a8, y03 + NOP2 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 14 * SIZE(XX) + FMADD y04, atemp3, a8, y04 + LFD a8, 15 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y01, 8 * SIZE(YY) + FMADD y05, atemp1, a1, y05 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y02, 9 * SIZE(YY) + FMADD y06, atemp2, a1, y06 + LFD a1, 16 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y03, 10 * SIZE(YY) + FMADD y07, atemp1, a3, y07 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y04, 11 * SIZE(YY) + FMADD y08, atemp2, a3, y08 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y01, 16 * SIZE(YY) + FNMSUB y05, atemp2, a2, y05 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y02, 17 * SIZE(YY) + FMADD y06, atemp1, a2, y06 + LFD a2, 17 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 17 * SIZE(XX) + FNMSUB y07, atemp2, a4, y07 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 16 * SIZE(XX) + FMADD y08, atemp1, a4, y08 + addi AO2, AO2, 16 * SIZE + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y03, 18 * SIZE(YY) + FMADD y05, atemp3, a5, y05 + addi XX, XX, 16 * SIZE + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 18 * SIZE(AO1) + FMADD y06, atemp4, a5, y06 + LFD a5, 0 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y04, 19 * SIZE(YY) + FMADD y07, atemp3, a7, y07 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + addi AO1, AO1, 16 * SIZE + FMADD y08, atemp4, a7, y08 + LFD a7, 2 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + addi YY, YY, 16 * SIZE + FNMSUB y05, atemp4, a6, y05 + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 3 * SIZE(AO1) + FMADD y06, atemp3, a6, y06 + LFD a6, 1 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 3 * SIZE(XX) + FNMSUB y07, atemp4, a8, y07 + NOP2 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 2 * SIZE(XX) + FMADD y08, atemp3, a8, y08 + LFD a8, 3 * SIZE(AO2) + + STFD y05, -4 * SIZE(YY) + STFD y06, -3 * SIZE(YY) + STFD y07, -2 * SIZE(YY) + STFD y08, -1 * SIZE(YY) + .align 4 + +LL(15): + andi. r0, IS, 4 + ble LL(16) + + FMADD xsum1, xtemp1, a1, xsum1 + NOP1 + FMADD y01, atemp1, a1, y01 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + NOP1 + FMADD y02, atemp2, a1, y02 + LFD a1, 4 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + NOP1 + FMADD y04, atemp2, a3, y04 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y05, 4 * SIZE(YY) + FNMSUB y01, atemp2, a2, y01 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y06, 5 * SIZE(YY) + FMADD y02, atemp1, a2, y02 + LFD a2, 5 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 5 * SIZE(XX) + FNMSUB y03, atemp2, a4, y03 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 4 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y07, 6 * SIZE(YY) + FMADD y01, atemp3, a5, y01 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 6 * SIZE(AO1) + FMADD y02, atemp4, a5, y02 + LFD a5, 4 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y08, 7 * SIZE(YY) + FMADD y03, atemp3, a7, y03 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y04, atemp4, a7, y04 + LFD a7, 6 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y01, atemp4, a6, y01 + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 7 * SIZE(AO1) + FMADD y02, atemp3, a6, y02 + LFD a6, 5 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 7 * SIZE(XX) + FNMSUB y03, atemp4, a8, y03 + NOP2 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 6 * SIZE(XX) + FMADD y04, atemp3, a8, y04 + LFD a8, 7 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y01, 0 * SIZE(YY) + FMADD y05, atemp1, a1, y05 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y02, 1 * SIZE(YY) + FMADD y06, atemp2, a1, y06 + LFD a1, 8 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y03, 2 * SIZE(YY) + FMADD y07, atemp1, a3, y07 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y04, 3 * SIZE(YY) + FMADD y08, atemp2, a3, y08 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y01, 8 * SIZE(YY) + FNMSUB y05, atemp2, a2, y05 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y02, 9 * SIZE(YY) + FMADD y06, atemp1, a2, y06 + LFD a2, 9 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 9 * SIZE(XX) + FNMSUB y07, atemp2, a4, y07 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 8 * SIZE(XX) + FMADD y08, atemp1, a4, y08 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y03, 10 * SIZE(YY) + FMADD y05, atemp3, a5, y05 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 10 * SIZE(AO1) + FMADD y06, atemp4, a5, y06 + LFD a5, 8 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y04, 11 * SIZE(YY) + FMADD y07, atemp3, a7, y07 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y08, atemp4, a7, y08 + LFD a7, 10 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y05, atemp4, a6, y05 + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 11 * SIZE(AO1) + FMADD y06, atemp3, a6, y06 + LFD a6, 9 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 11 * SIZE(XX) + FNMSUB y07, atemp4, a8, y07 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 10 * SIZE(XX) + FMADD y08, atemp3, a8, y08 + LFD a8, 11 * SIZE(AO2) + + STFD y05, 4 * SIZE(YY) + STFD y06, 5 * SIZE(YY) + STFD y07, 6 * SIZE(YY) + STFD y08, 7 * SIZE(YY) + + addi AO1, AO1, 8 * SIZE + addi AO2, AO2, 8 * SIZE + + addi XX, XX, 8 * SIZE + addi YY, YY, 8 * SIZE + .align 4 + +LL(16): + andi. r0, IS, 2 + ble LL(18) + + FMADD xsum1, xtemp1, a1, xsum1 + FMADD y01, atemp1, a1, y01 + FMADD xsum2, xtemp2, a1, xsum2 + FMADD y02, atemp2, a1, y02 + FMADD xsum3, xtemp1, a5, xsum3 + FMADD y03, atemp1, a3, y03 + FMADD xsum4, xtemp2, a5, xsum4 + FMADD y04, atemp2, a3, y04 + + FMADD1 xsum1, xtemp2, a2, xsum1 + FNMSUB y01, atemp2, a2, y01 + FMADD2 xsum2, xtemp1, a2, xsum2 + FMADD y02, atemp1, a2, y02 + FMADD1 xsum3, xtemp2, a6, xsum3 + FNMSUB y03, atemp2, a4, y03 + FMADD2 xsum4, xtemp1, a6, xsum4 + FMADD y04, atemp1, a4, y04 + + FMADD xsum1, xtemp3, a3, xsum1 + FMADD y01, atemp3, a5, y01 + FMADD xsum2, xtemp4, a3, xsum2 + FMADD y02, atemp4, a5, y02 + FMADD xsum3, xtemp3, a7, xsum3 + FMADD y03, atemp3, a7, y03 + FMADD xsum4, xtemp4, a7, xsum4 + FMADD y04, atemp4, a7, y04 + + FMADD1 xsum1, xtemp4, a4, xsum1 + FNMSUB y01, atemp4, a6, y01 + FMADD2 xsum2, xtemp3, a4, xsum2 + FMADD y02, atemp3, a6, y02 + FMADD1 xsum3, xtemp4, a8, xsum3 + FNMSUB y03, atemp4, a8, y03 + FMADD2 xsum4, xtemp3, a8, xsum4 + FMADD y04, atemp3, a8, y04 + + STFD y01, 0 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + STFD y03, 2 * SIZE(YY) + STFD y04, 3 * SIZE(YY) + + LFD a1, 4 * SIZE(AO1) + LFD a2, 5 * SIZE(AO1) + + LFD a5, 4 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + LFD y01, 4 * SIZE(YY) + LFD y02, 5 * SIZE(YY) + LFD y03, 6 * SIZE(YY) + LFD y04, 7 * SIZE(YY) + + addi YY, YY, 4 * SIZE + .align 4 + +LL(18): + LFD y05, ALPHA_R + LFD y06, ALPHA_I + + FMUL xtemp1, y05, xsum1 + FMUL xtemp2, y06, xsum1 + FMUL xtemp3, y05, xsum3 + FMUL xtemp4, y06, xsum3 + + FNMSUB xsum1, y06, xsum2, xtemp1 + FMADD xsum2, y05, xsum2, xtemp2 + FNMSUB xsum3, y06, xsum4, xtemp3 + FMADD xsum4, y05, xsum4, xtemp4 + + FMADD xsum1, atemp1, a1, xsum1 + FMADD xsum2, atemp2, a1, xsum2 + FMADD xsum3, atemp1, a5, xsum3 + FMADD xsum4, atemp2, a5, xsum4 + +#ifndef HEMV + FMADD1 xsum1, atemp2, a2, xsum1 + FMADD2 xsum2, atemp1, a2, xsum2 +#endif + FMADD1 xsum3, atemp2, a6, xsum3 + FMADD2 xsum4, atemp1, a6, xsum4 + + FMADD xsum1, atemp3, a5, xsum1 + FMADD xsum2, atemp4, a5, xsum2 + FMADD xsum3, atemp3, a7, xsum3 + FMADD xsum4, atemp4, a7, xsum4 + + FNMSUB xsum1, atemp4, a6, xsum1 + FMADD xsum2, atemp3, a6, xsum2 +#ifndef HEMV + FNMSUB xsum3, atemp4, a8, xsum3 + FMADD xsum4, atemp3, a8, xsum4 +#endif + + FADD y01, y01, xsum1 + FADD y02, y02, xsum2 + FADD y03, y03, xsum3 + FADD y04, y04, xsum4 + + STFD y01, 0 * SIZE(YY) + addi TEMP, IS, 4 + STFD y02, 1 * SIZE(YY) + addi IS, IS, 2 + STFD y03, 2 * SIZE(YY) + cmpw cr0, TEMP, M + STFD y04, 3 * SIZE(YY) + ble LL(11) + .align 4 + +LL(20): + andi. TEMP, M, 1 + ble LL(990) + + mr AO1, A + + slwi TEMP, IS, ZBASE_SHIFT + add TEMP, X, TEMP + + LFD y05, ALPHA_R + LFD y06, ALPHA_I + + LFD xtemp1, 0 * SIZE(TEMP) + LFD xtemp2, 1 * SIZE(TEMP) + + FMUL atemp1, y05, xtemp1 + FMUL atemp2, y06, xtemp1 + + FNMSUB atemp1, y06, xtemp2, atemp1 + FMADD atemp2, y05, xtemp2, atemp2 + + lfd xsum1, FZERO + fmr xsum2, xsum1 + + mr XX, X + mr YY, NEW_Y + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + + LFD xtemp1, 0 * SIZE(XX) + LFD xtemp2, 1 * SIZE(XX) + + LFD y01, 0 * SIZE(YY) + LFD y02, 1 * SIZE(YY) + + mtspr CTR, IS + cmpwi cr0, IS, 0 + ble LL(28) + .align 4 + +LL(22): + FMADD xsum1, xtemp1, a1, xsum1 + FMADD y01, atemp1, a1, y01 + FMADD xsum2, xtemp2, a1, xsum2 + FMADD y02, atemp2, a1, y02 + LFD a1, 2 * SIZE(AO1) + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD xtemp2, 3 * SIZE(XX) + FNMSUB y01, atemp2, a2, y01 + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD xtemp1, 2 * SIZE(XX) + FMADD y02, atemp1, a2, y02 + LFD a2, 3 * SIZE(AO1) + + addi AO1, AO1, 2 * SIZE + addi XX, XX, 2 * SIZE + addi YY, YY, 2 * SIZE + + STFD y01, -2 * SIZE(YY) + LFD y01, 0 * SIZE(YY) + STFD y02, -1 * SIZE(YY) + LFD y02, 1 * SIZE(YY) + bdnz LL(22) + .align 4 + +LL(28): + LFD y05, ALPHA_R + LFD y06, ALPHA_I + + FMUL xtemp1, y05, xsum1 + FMUL xtemp2, y06, xsum1 + + FNMSUB xsum1, y06, xsum2, xtemp1 + FMADD xsum2, y05, xsum2, xtemp2 + + FMADD xsum1, atemp1, a1, xsum1 + FMADD xsum2, atemp2, a1, xsum2 + +#ifndef HEMV + FNMSUB xsum1, atemp2, a2, xsum1 + FMADD xsum2, atemp1, a2, xsum2 +#endif + + FADD y01, y01, xsum1 + FADD y02, y02, xsum2 + + STFD y01, 0 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + .align 4 + +LL(990): + cmpwi cr0, INCY, 2 * SIZE + beq LL(999) + + mr YY, Y + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(995) + .align 4 + +LL(991): + LFD f0, 0 * SIZE(Y) + LFD f1, 1 * SIZE(Y) + add Y, Y, INCY + LFD f2, 0 * SIZE(Y) + LFD f3, 1 * SIZE(Y) + add Y, Y, INCY + LFD f4, 0 * SIZE(Y) + LFD f5, 1 * SIZE(Y) + add Y, Y, INCY + LFD f6, 0 * SIZE(Y) + LFD f7, 1 * SIZE(Y) + add Y, Y, INCY + + LFD f8, 0 * SIZE(NEW_Y) + LFD f9, 1 * SIZE(NEW_Y) + LFD f10, 2 * SIZE(NEW_Y) + LFD f11, 3 * SIZE(NEW_Y) + LFD f12, 4 * SIZE(NEW_Y) + LFD f13, 5 * SIZE(NEW_Y) + LFD f14, 6 * SIZE(NEW_Y) + LFD f15, 7 * SIZE(NEW_Y) + addi NEW_Y, NEW_Y, 8 * SIZE + + FADD f8, f8, f0 + FADD f9, f9, f1 + FADD f10, f10, f2 + FADD f11, f11, f3 + FADD f12, f12, f4 + FADD f13, f13, f5 + FADD f14, f14, f6 + FADD f15, f15, f7 + + STFD f8, 0 * SIZE(YY) + STFD f9, 1 * SIZE(YY) + add YY, YY, INCY + STFD f10, 0 * SIZE(YY) + STFD f11, 1 * SIZE(YY) + add YY, YY, INCY + STFD f12, 0 * SIZE(YY) + STFD f13, 1 * SIZE(YY) + add YY, YY, INCY + STFD f14, 0 * SIZE(YY) + STFD f15, 1 * SIZE(YY) + add YY, YY, INCY + bdnz LL(991) + .align 4 + +LL(995): + andi. J, M, 2 + ble LL(996) + + LFD f0, 0 * SIZE(Y) + LFD f1, 1 * SIZE(Y) + add Y, Y, INCY + LFD f2, 0 * SIZE(Y) + LFD f3, 1 * SIZE(Y) + add Y, Y, INCY + + LFD f8, 0 * SIZE(NEW_Y) + LFD f9, 1 * SIZE(NEW_Y) + LFD f10, 2 * SIZE(NEW_Y) + LFD f11, 3 * SIZE(NEW_Y) + addi NEW_Y, NEW_Y, 4 * SIZE + + FADD f8, f8, f0 + FADD f9, f9, f1 + FADD f10, f10, f2 + FADD f11, f11, f3 + + STFD f8, 0 * SIZE(YY) + STFD f9, 1 * SIZE(YY) + add YY, YY, INCY + STFD f10, 0 * SIZE(YY) + STFD f11, 1 * SIZE(YY) + add YY, YY, INCY + .align 4 + +LL(996): + andi. J, M, 1 + ble LL(999) + + LFD f0, 0 * SIZE(Y) + LFD f1, 1 * SIZE(Y) + + LFD f8, 0 * SIZE(NEW_Y) + LFD f9, 1 * SIZE(NEW_Y) + + FADD f8, f8, f0 + FADD f9, f9, f1 + + STFD f8, 0 * SIZE(YY) + STFD f9, 1 * SIZE(YY) + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r14, 144(SP) + ld r15, 152(SP) + ld r16, 160(SP) + ld r17, 168(SP) + ld r18, 176(SP) + ld r19, 184(SP) + ld r20, 192(SP) + ld r21, 200(SP) + ld r22, 208(SP) + ld r23, 216(SP) + ld r24, 224(SP) + ld r25, 232(SP) + ld r26, 240(SP) + ld r27, 248(SP) +#else + lwz r14, 144(SP) + lwz r15, 148(SP) + lwz r16, 152(SP) + lwz r17, 156(SP) + lwz r18, 160(SP) + lwz r19, 164(SP) + lwz r20, 168(SP) + lwz r21, 172(SP) + lwz r22, 176(SP) + lwz r23, 180(SP) + lwz r24, 184(SP) + lwz r25, 188(SP) + lwz r26, 192(SP) + lwz r27, 196(SP) +#endif + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/ztrsm_kernel_LN.S b/kernel/power/ztrsm_kernel_LN.S new file mode 100644 index 0000000..e31a887 --- /dev/null +++ b/kernel/power/ztrsm_kernel_LN.S @@ -0,0 +1,2288 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define AORIG r21 +#define TEMP r22 +#define KK r23 +#define I r24 +#define J r25 +#define AO r26 +#define BO r27 +#define CO1 r28 +#define CO2 r29 + +#define PREA r30 +#define PREC r31 +#define PREB PREA + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) +#endif + + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + +#ifndef PREFETCHTEST +#ifdef LN + li PREC, -4 * SIZE +#else + li PREC, 4 * SIZE +#endif +#else + +#ifdef linux +#ifndef __64BIT__ + lwz PREA, 16 + STACKSIZE(SP) + lwz PREC, 20 + STACKSIZE(SP) +#else + ld PREA, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 72 + STACKSIZE(SP) + lwz PREC, 76 + STACKSIZE(SP) +#else + lwz PREA, 68 + STACKSIZE(SP) + lwz PREC, 72 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST +#ifdef PPC970 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 5 * SIZE + 16) +#else + li PREA, (16 * 9 * SIZE + 16) +#endif +#endif +#ifdef POWER4 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 1 * SIZE + 16) +#else + li PREA, (16 * 2 * SIZE + 16) +#endif +#endif +#ifdef POWER5 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 7 * SIZE | 1) + li PREB, (16 * 7 * SIZE | 3) +#else + li PREA, (16 * 12 * SIZE | 1) + li PREB, (16 * 6 * SIZE | 3) +#endif +#endif +#endif + + srawi. J, N, 1 + ble LL(30) + .align 4 + +LL(10): +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + +LL(20): + andi. I, M, 1 + ble LL(09) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 4 + +LL(22): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi BO, BO, 16 * SIZE + addi AO, AO, 8 * SIZE + bdnz LL(22) + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(27) + .align 4 + +LL(26): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(26) + .align 4 + +LL(27): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f20, f2 + FADD f3, f21, f3 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f3 + FMUL f13, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f12 + FMADD f3, f20, f3, f13 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f12 + FMSUB f3, f20, f3, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(09): + srawi. I, M, 1 + ble LL(29) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +LL(12): + fmadd f0, f16, f20, f0 + fmadd f5, f17, f21, f5 + fmadd f10, f18, f22, f10 + fmadd f15, f19, f23, f15 + + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) + + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + fmadd f4, f16, f21, f4 + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + + fmadd f11, f19, f22, f11 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + fmadd f14, f18, f23, f14 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f0, f24, f28, f0 + fmadd f5, f25, f29, f5 + fmadd f10, f26, f30, f10 + fmadd f15, f27, f31, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + fmadd f1, f25, f28, f1 + fmadd f2, f26, f28, f2 + fmadd f3, f27, f28, f3 + fmadd f4, f24, f29, f4 + + fmadd f6, f26, f29, f6 + fmadd f7, f27, f29, f7 + fmadd f8, f24, f30, f8 + fmadd f9, f25, f30, f9 + + fmadd f11, f27, f30, f11 + fmadd f12, f24, f31, f12 + fmadd f13, f25, f31, f13 + fmadd f14, f26, f31, f14 + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f5, f17, f21, f5 + fmadd f10, f18, f22, f10 + fmadd f15, f19, f23, f15 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + fmadd f4, f16, f21, f4 + + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + + fmadd f11, f19, f22, f11 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + fmadd f14, f18, f23, f14 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + fmadd f0, f24, f28, f0 + fmadd f5, f25, f29, f5 + fmadd f10, f26, f30, f10 + fmadd f15, f27, f31, f15 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + fmadd f1, f25, f28, f1 + fmadd f2, f26, f28, f2 + fmadd f3, f27, f28, f3 + fmadd f4, f24, f29, f4 + + fmadd f6, f26, f29, f6 + fmadd f7, f27, f29, f7 + fmadd f8, f24, f30, f8 + fmadd f9, f25, f30, f9 + + fmadd f11, f27, f30, f11 + fmadd f12, f24, f31, f12 + fmadd f13, f25, f31, f13 + fmadd f14, f26, f31, f14 + + addi AO, AO, 16 * SIZE + addi BO, BO, 16 * SIZE +#ifdef PPC970 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER4 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER5 + DCBT(AO, PREA) + DCBT(BO, PREB) +#endif + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(KERNEL_MainFinish) + .align 4 + +LL(16): + fmadd f0, f16, f20, f0 + fmadd f5, f17, f21, f5 + fmadd f10, f18, f22, f10 + fmadd f15, f19, f23, f15 + + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + fmadd f4, f16, f21, f4 + + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + + fmadd f11, f19, f22, f11 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + fmadd f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(KERNEL_MainFinish): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 2 + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f8, f18, f8 + FSUB f9, f19, f9 + + FSUB f2, f20, f2 + FSUB f3, f21, f3 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 + + FSUB f8, f20, f8 + FADD f9, f21, f9 + FSUB f10, f22, f10 + FADD f11, f23, f11 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FMADD f8, f19, f11, f8 + FNMSUB f9, f19, f10, f9 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + FNMSUB f8, f18, f10, f8 + FNMSUB f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f8, f20, f8, f12 + FMADD f9, f20, f9, f13 + +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FMSUB f8, f19, f11, f8 + FNMADD f9, f19, f10, f9 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + FNMADD f8, f18, f10, f8 + FNMADD f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f8, f20, f8, f12 + FMSUB f9, f20, f9, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FMADD f10, f19, f9, f10 + FNMSUB f11, f19, f8, f11 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + FNMSUB f10, f18, f8, f10 + FNMSUB f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 + FMSUB f10, f20, f10, f12 + FMADD f11, f20, f11, f13 + +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FMSUB f10, f19, f9, f10 + FNMADD f11, f19, f8, f11 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + FNMADD f10, f18, f8, f10 + FNMADD f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 + FMADD f10, f20, f10, f12 + FMSUB f11, f20, f11, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f8, f19, f1, f8 + FNMSUB f9, f19, f0, f9 + FMADD f10, f19, f3, f10 + FNMSUB f11, f19, f2, f11 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMSUB f8, f20, f8, f4 + FMADD f9, f20, f9, f5 + FMSUB f10, f20, f10, f6 + FMADD f11, f20, f11, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f8, f19, f1, f8 + FNMADD f9, f19, f0, f9 + FMSUB f10, f19, f3, f10 + FNMADD f11, f19, f2, f11 + + FNMADD f8, f18, f0, f8 + FNMADD f9, f18, f1, f9 + FNMADD f10, f18, f2, f10 + FNMADD f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMADD f8, f20, f8, f4 + FMSUB f9, f20, f9, f5 + FMADD f10, f20, f10, f6 + FMSUB f11, f20, f11, f7 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f9, f0 + FNMSUB f1, f19, f8, f1 + FMADD f2, f19, f11, f2 + FNMSUB f3, f19, f10, f3 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f9, f0 + FNMADD f1, f19, f8, f1 + FMSUB f2, f19, f11, f2 + FNMADD f3, f19, f10, f3 + + FNMADD f0, f18, f8, f0 + FNMADD f1, f18, f9, f1 + FNMADD f2, f18, f10, f2 + FNMADD f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f9, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f3, 5 * SIZE(BO) + STFD f10, 6 * SIZE(BO) + STFD f11, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f10, 6 * SIZE(AO) + STFD f11, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f8, 0 * SIZE(CO2) + STFD f9, 1 * SIZE(CO2) + STFD f10, 2 * SIZE(CO2) + STFD f11, 3 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(11) + .align 4 + +LL(29): +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(30): + andi. J, N, 1 + ble LL(999) + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, C, LDC +#endif + + andi. I, M, 1 + ble LL(40) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 4 + +LL(42): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR,r0 + ble LL(47) + .align 4 + +LL(46): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + + bdnz LL(46) + .align 4 + +LL(47): +#ifndef CONJ + FSUB f0, f0, f1 + FADD f1, f2, f3 +#else + FADD f0, f0, f1 + FSUB f1, f3, f2 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 1 + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(40): + srawi. I, M, 1 + ble LL(49) + .align 4 + +LL(31): +#if defined(LT) || defined(RN) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 4 + +LL(32): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(AO) + LFD f21, 9 * SIZE(AO) + LFD f22, 10 * SIZE(AO) + LFD f23, 11 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + LFD f16, 4 * SIZE(BO) + LFD f17, 5 * SIZE(BO) + LFD f18, 6 * SIZE(BO) + LFD f19, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(AO) + LFD f21, 17 * SIZE(AO) + LFD f22, 18 * SIZE(AO) + LFD f23, 19 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 20 * SIZE(AO) + LFD f25, 21 * SIZE(AO) + LFD f26, 22 * SIZE(AO) + LFD f27, 23 * SIZE(AO) + + LFD f16, 8 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 10 * SIZE(BO) + LFD f19, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbt PREA, AO + dcbt PREA, BO + bdnz LL(32) + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(37) + .align 4 + +LL(36): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f16, 2 * SIZE(BO) + LFD f17, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(36) + .align 4 + +LL(37): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(31) + .align 4 + +LL(49): +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/ztrsm_kernel_LT.S b/kernel/power/ztrsm_kernel_LT.S new file mode 100644 index 0000000..f7153b7 --- /dev/null +++ b/kernel/power/ztrsm_kernel_LT.S @@ -0,0 +1,2288 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define AORIG r21 +#define TEMP r22 +#define KK r23 +#define I r24 +#define J r25 +#define AO r26 +#define BO r27 +#define CO1 r28 +#define CO2 r29 + +#define PREA r30 +#define PREC r31 +#define PREB PREA + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) +#endif + + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST +#ifdef PPC970 + li PREC, 4 * SIZE +#endif +#ifdef POWER4 + li PREC, 4 * SIZE /* is 12 best? */ +#endif +#ifdef POWER5 + li PREC, 4 * SIZE /* is 12 best? */ +#endif +#else + +#ifdef linux +#ifndef __64BIT__ + lwz PREA, 16 + STACKSIZE(SP) + lwz PREC, 20 + STACKSIZE(SP) +#else + ld PREA, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 72 + STACKSIZE(SP) + lwz PREC, 76 + STACKSIZE(SP) +#else + lwz PREA, 68 + STACKSIZE(SP) + lwz PREC, 72 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST +#ifdef PPC970 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 5 * SIZE + 16) +#else + li PREA, (16 * 9 * SIZE + 16) +#endif +#endif +#ifdef POWER4 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 1 * SIZE + 16) +#else + li PREA, (16 * 2 * SIZE + 16) +#endif +#endif +#ifdef POWER5 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 7 * SIZE | 1) + li PREB, (16 * 7 * SIZE | 3) +#else + li PREA, (16 * 12 * SIZE | 1) + li PREB, (16 * 6 * SIZE | 3) +#endif +#endif +#endif + + srawi. J, N, 1 + ble LL(30) + .align 4 + +LL(10): +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble LL(20) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +LL(12): + fmadd f0, f16, f20, f0 + fmadd f5, f17, f21, f5 + fmadd f10, f18, f22, f10 + fmadd f15, f19, f23, f15 + + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) + + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + fmadd f4, f16, f21, f4 + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + + fmadd f11, f19, f22, f11 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + fmadd f14, f18, f23, f14 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f0, f24, f28, f0 + fmadd f5, f25, f29, f5 + fmadd f10, f26, f30, f10 + fmadd f15, f27, f31, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + fmadd f1, f25, f28, f1 + fmadd f2, f26, f28, f2 + fmadd f3, f27, f28, f3 + fmadd f4, f24, f29, f4 + + fmadd f6, f26, f29, f6 + fmadd f7, f27, f29, f7 + fmadd f8, f24, f30, f8 + fmadd f9, f25, f30, f9 + + fmadd f11, f27, f30, f11 + fmadd f12, f24, f31, f12 + fmadd f13, f25, f31, f13 + fmadd f14, f26, f31, f14 + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f5, f17, f21, f5 + fmadd f10, f18, f22, f10 + fmadd f15, f19, f23, f15 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + fmadd f4, f16, f21, f4 + + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + + fmadd f11, f19, f22, f11 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + fmadd f14, f18, f23, f14 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + fmadd f0, f24, f28, f0 + fmadd f5, f25, f29, f5 + fmadd f10, f26, f30, f10 + fmadd f15, f27, f31, f15 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + fmadd f1, f25, f28, f1 + fmadd f2, f26, f28, f2 + fmadd f3, f27, f28, f3 + fmadd f4, f24, f29, f4 + + fmadd f6, f26, f29, f6 + fmadd f7, f27, f29, f7 + fmadd f8, f24, f30, f8 + fmadd f9, f25, f30, f9 + + fmadd f11, f27, f30, f11 + fmadd f12, f24, f31, f12 + fmadd f13, f25, f31, f13 + fmadd f14, f26, f31, f14 + + addi AO, AO, 16 * SIZE + addi BO, BO, 16 * SIZE +#ifdef PPC970 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER4 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER5 + DCBT(AO, PREA) + DCBT(BO, PREB) +#endif + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(KERNEL_MainFinish) + .align 4 + +LL(16): + fmadd f0, f16, f20, f0 + fmadd f5, f17, f21, f5 + fmadd f10, f18, f22, f10 + fmadd f15, f19, f23, f15 + + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + fmadd f4, f16, f21, f4 + + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + + fmadd f11, f19, f22, f11 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + fmadd f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(KERNEL_MainFinish): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 2 + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f8, f18, f8 + FSUB f9, f19, f9 + + FSUB f2, f20, f2 + FSUB f3, f21, f3 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 + + FSUB f8, f20, f8 + FADD f9, f21, f9 + FSUB f10, f22, f10 + FADD f11, f23, f11 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FMADD f8, f19, f11, f8 + FNMSUB f9, f19, f10, f9 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + FNMSUB f8, f18, f10, f8 + FNMSUB f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f8, f20, f8, f12 + FMADD f9, f20, f9, f13 + +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FMSUB f8, f19, f11, f8 + FNMADD f9, f19, f10, f9 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + FNMADD f8, f18, f10, f8 + FNMADD f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f8, f20, f8, f12 + FMSUB f9, f20, f9, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FMADD f10, f19, f9, f10 + FNMSUB f11, f19, f8, f11 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + FNMSUB f10, f18, f8, f10 + FNMSUB f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 + FMSUB f10, f20, f10, f12 + FMADD f11, f20, f11, f13 + +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FMSUB f10, f19, f9, f10 + FNMADD f11, f19, f8, f11 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + FNMADD f10, f18, f8, f10 + FNMADD f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 + FMADD f10, f20, f10, f12 + FMSUB f11, f20, f11, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f8, f19, f1, f8 + FNMSUB f9, f19, f0, f9 + FMADD f10, f19, f3, f10 + FNMSUB f11, f19, f2, f11 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMSUB f8, f20, f8, f4 + FMADD f9, f20, f9, f5 + FMSUB f10, f20, f10, f6 + FMADD f11, f20, f11, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f8, f19, f1, f8 + FNMADD f9, f19, f0, f9 + FMSUB f10, f19, f3, f10 + FNMADD f11, f19, f2, f11 + + FNMADD f8, f18, f0, f8 + FNMADD f9, f18, f1, f9 + FNMADD f10, f18, f2, f10 + FNMADD f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMADD f8, f20, f8, f4 + FMSUB f9, f20, f9, f5 + FMADD f10, f20, f10, f6 + FMSUB f11, f20, f11, f7 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f9, f0 + FNMSUB f1, f19, f8, f1 + FMADD f2, f19, f11, f2 + FNMSUB f3, f19, f10, f3 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f9, f0 + FNMADD f1, f19, f8, f1 + FMSUB f2, f19, f11, f2 + FNMADD f3, f19, f10, f3 + + FNMADD f0, f18, f8, f0 + FNMADD f1, f18, f9, f1 + FNMADD f2, f18, f10, f2 + FNMADD f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f9, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f3, 5 * SIZE(BO) + STFD f10, 6 * SIZE(BO) + STFD f11, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f10, 6 * SIZE(AO) + STFD f11, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f8, 0 * SIZE(CO2) + STFD f9, 1 * SIZE(CO2) + STFD f10, 2 * SIZE(CO2) + STFD f11, 3 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(11) + .align 4 + +LL(20): + andi. I, M, 1 + ble LL(29) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 4 + +LL(22): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi BO, BO, 16 * SIZE + addi AO, AO, 8 * SIZE + bdnz LL(22) + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(27) + .align 4 + +LL(26): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(26) + .align 4 + +LL(27): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else +#if defined(LN) || defined(LT) + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f3 + FMUL f13, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f12 + FMADD f3, f20, f3, f13 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f12 + FMSUB f3, f20, f3, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(29): +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(30): + andi. J, N, 1 + ble LL(999) + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, C, LDC +#endif + ble LL(40) + .align 4 + +LL(31): +#if defined(LT) || defined(RN) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 4 + +LL(32): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(AO) + LFD f21, 9 * SIZE(AO) + LFD f22, 10 * SIZE(AO) + LFD f23, 11 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + LFD f16, 4 * SIZE(BO) + LFD f17, 5 * SIZE(BO) + LFD f18, 6 * SIZE(BO) + LFD f19, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(AO) + LFD f21, 17 * SIZE(AO) + LFD f22, 18 * SIZE(AO) + LFD f23, 19 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 20 * SIZE(AO) + LFD f25, 21 * SIZE(AO) + LFD f26, 22 * SIZE(AO) + LFD f27, 23 * SIZE(AO) + + LFD f16, 8 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 10 * SIZE(BO) + LFD f19, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbt PREA, AO + dcbt PREA, BO + bdnz LL(32) + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(37) + .align 4 + +LL(36): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f16, 2 * SIZE(BO) + LFD f17, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(36) + .align 4 + +LL(37): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(31) + .align 4 + +LL(40): + andi. I, M, 1 + ble LL(49) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 4 + +LL(42): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR,r0 + ble LL(47) + .align 4 + +LL(46): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + + bdnz LL(46) + .align 4 + +LL(47): +#ifndef CONJ + FSUB f0, f0, f1 + FADD f1, f2, f3 +#else + FADD f0, f0, f1 + FSUB f1, f3, f2 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 1 + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(49): +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/ztrsm_kernel_RT.S b/kernel/power/ztrsm_kernel_RT.S new file mode 100644 index 0000000..55bc29b --- /dev/null +++ b/kernel/power/ztrsm_kernel_RT.S @@ -0,0 +1,2289 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define AORIG r21 +#define TEMP r22 +#define KK r23 +#define I r24 +#define J r25 +#define AO r26 +#define BO r27 +#define CO1 r28 +#define CO2 r29 + +#define PREA r30 +#define PREC r31 +#define PREB PREA + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) +#endif + + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST +#ifdef PPC970 + li PREC, 4 * SIZE +#endif +#ifdef POWER4 + li PREC, 4 * SIZE /* is 12 best? */ +#endif +#ifdef POWER5 + li PREC, 4 * SIZE /* is 12 best? */ +#endif +#else + +#ifdef linux +#ifndef __64BIT__ + lwz PREA, 16 + STACKSIZE(SP) + lwz PREC, 20 + STACKSIZE(SP) +#else + ld PREA, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 72 + STACKSIZE(SP) + lwz PREC, 76 + STACKSIZE(SP) +#else + lwz PREA, 68 + STACKSIZE(SP) + lwz PREC, 72 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST +#ifdef PPC970 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 5 * SIZE + 16) +#else + li PREA, (16 * 9 * SIZE + 16) +#endif +#endif +#ifdef POWER4 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 1 * SIZE + 16) +#else + li PREA, (16 * 2 * SIZE + 16) +#endif +#endif +#ifdef POWER5 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 7 * SIZE | 1) + li PREB, (16 * 7 * SIZE | 3) +#else + li PREA, (16 * 12 * SIZE | 1) + li PREB, (16 * 6 * SIZE | 3) +#endif +#endif +#endif + + andi. J, N, 1 + ble LL(30) + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, C, LDC +#endif + ble LL(40) + .align 4 + +LL(31): +#if defined(LT) || defined(RN) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 4 + +LL(32): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(AO) + LFD f21, 9 * SIZE(AO) + LFD f22, 10 * SIZE(AO) + LFD f23, 11 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + LFD f16, 4 * SIZE(BO) + LFD f17, 5 * SIZE(BO) + LFD f18, 6 * SIZE(BO) + LFD f19, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(AO) + LFD f21, 17 * SIZE(AO) + LFD f22, 18 * SIZE(AO) + LFD f23, 19 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 20 * SIZE(AO) + LFD f25, 21 * SIZE(AO) + LFD f26, 22 * SIZE(AO) + LFD f27, 23 * SIZE(AO) + + LFD f16, 8 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 10 * SIZE(BO) + LFD f19, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbt PREA, AO + dcbt PREA, BO + bdnz LL(32) + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(37) + .align 4 + +LL(36): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f16, 2 * SIZE(BO) + LFD f17, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(36) + .align 4 + +LL(37): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(31) + .align 4 + +LL(40): + andi. I, M, 1 + ble LL(49) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 4 + +LL(42): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR,r0 + ble LL(47) + .align 4 + +LL(46): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + + bdnz LL(46) + .align 4 + +LL(47): +#ifndef CONJ + FSUB f0, f0, f1 + FADD f1, f2, f3 +#else + FADD f0, f0, f1 + FSUB f1, f3, f2 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 1 + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(49): +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + + +LL(30): + srawi. J, N, 1 + ble LL(999) + .align 4 + +LL(10): +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble LL(20) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +LL(12): + fmadd f0, f16, f20, f0 + fmadd f5, f17, f21, f5 + fmadd f10, f18, f22, f10 + fmadd f15, f19, f23, f15 + + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) + + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + fmadd f4, f16, f21, f4 + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + + fmadd f11, f19, f22, f11 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + fmadd f14, f18, f23, f14 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f0, f24, f28, f0 + fmadd f5, f25, f29, f5 + fmadd f10, f26, f30, f10 + fmadd f15, f27, f31, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + fmadd f1, f25, f28, f1 + fmadd f2, f26, f28, f2 + fmadd f3, f27, f28, f3 + fmadd f4, f24, f29, f4 + + fmadd f6, f26, f29, f6 + fmadd f7, f27, f29, f7 + fmadd f8, f24, f30, f8 + fmadd f9, f25, f30, f9 + + fmadd f11, f27, f30, f11 + fmadd f12, f24, f31, f12 + fmadd f13, f25, f31, f13 + fmadd f14, f26, f31, f14 + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f5, f17, f21, f5 + fmadd f10, f18, f22, f10 + fmadd f15, f19, f23, f15 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + fmadd f4, f16, f21, f4 + + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + + fmadd f11, f19, f22, f11 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + fmadd f14, f18, f23, f14 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + fmadd f0, f24, f28, f0 + fmadd f5, f25, f29, f5 + fmadd f10, f26, f30, f10 + fmadd f15, f27, f31, f15 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + fmadd f1, f25, f28, f1 + fmadd f2, f26, f28, f2 + fmadd f3, f27, f28, f3 + fmadd f4, f24, f29, f4 + + fmadd f6, f26, f29, f6 + fmadd f7, f27, f29, f7 + fmadd f8, f24, f30, f8 + fmadd f9, f25, f30, f9 + + fmadd f11, f27, f30, f11 + fmadd f12, f24, f31, f12 + fmadd f13, f25, f31, f13 + fmadd f14, f26, f31, f14 + + addi AO, AO, 16 * SIZE + addi BO, BO, 16 * SIZE +#ifdef PPC970 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER4 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER5 + DCBT(AO, PREA) + DCBT(BO, PREB) +#endif + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(KERNEL_MainFinish) + .align 4 + +LL(16): + fmadd f0, f16, f20, f0 + fmadd f5, f17, f21, f5 + fmadd f10, f18, f22, f10 + fmadd f15, f19, f23, f15 + + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + fmadd f4, f16, f21, f4 + + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + + fmadd f11, f19, f22, f11 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + fmadd f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(KERNEL_MainFinish): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 2 + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f8, f18, f8 + FSUB f9, f19, f9 + + FSUB f2, f20, f2 + FSUB f3, f21, f3 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 + + FSUB f8, f20, f8 + FADD f9, f21, f9 + FSUB f10, f22, f10 + FADD f11, f23, f11 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FMADD f8, f19, f11, f8 + FNMSUB f9, f19, f10, f9 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + FNMSUB f8, f18, f10, f8 + FNMSUB f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f8, f20, f8, f12 + FMADD f9, f20, f9, f13 + +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FMSUB f8, f19, f11, f8 + FNMADD f9, f19, f10, f9 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + FNMADD f8, f18, f10, f8 + FNMADD f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f8, f20, f8, f12 + FMSUB f9, f20, f9, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FMADD f10, f19, f9, f10 + FNMSUB f11, f19, f8, f11 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + FNMSUB f10, f18, f8, f10 + FNMSUB f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 + FMSUB f10, f20, f10, f12 + FMADD f11, f20, f11, f13 + +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FMSUB f10, f19, f9, f10 + FNMADD f11, f19, f8, f11 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + FNMADD f10, f18, f8, f10 + FNMADD f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 + FMADD f10, f20, f10, f12 + FMSUB f11, f20, f11, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f8, f19, f1, f8 + FNMSUB f9, f19, f0, f9 + FMADD f10, f19, f3, f10 + FNMSUB f11, f19, f2, f11 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMSUB f8, f20, f8, f4 + FMADD f9, f20, f9, f5 + FMSUB f10, f20, f10, f6 + FMADD f11, f20, f11, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f8, f19, f1, f8 + FNMADD f9, f19, f0, f9 + FMSUB f10, f19, f3, f10 + FNMADD f11, f19, f2, f11 + + FNMADD f8, f18, f0, f8 + FNMADD f9, f18, f1, f9 + FNMADD f10, f18, f2, f10 + FNMADD f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMADD f8, f20, f8, f4 + FMSUB f9, f20, f9, f5 + FMADD f10, f20, f10, f6 + FMSUB f11, f20, f11, f7 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f9, f0 + FNMSUB f1, f19, f8, f1 + FMADD f2, f19, f11, f2 + FNMSUB f3, f19, f10, f3 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f9, f0 + FNMADD f1, f19, f8, f1 + FMSUB f2, f19, f11, f2 + FNMADD f3, f19, f10, f3 + + FNMADD f0, f18, f8, f0 + FNMADD f1, f18, f9, f1 + FNMADD f2, f18, f10, f2 + FNMADD f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f9, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f3, 5 * SIZE(BO) + STFD f10, 6 * SIZE(BO) + STFD f11, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f10, 6 * SIZE(AO) + STFD f11, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f8, 0 * SIZE(CO2) + STFD f9, 1 * SIZE(CO2) + STFD f10, 2 * SIZE(CO2) + STFD f11, 3 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(11) + .align 4 + +LL(20): + andi. I, M, 1 + ble LL(29) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 4 + +LL(22): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi BO, BO, 16 * SIZE + addi AO, AO, 8 * SIZE + bdnz LL(22) + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(27) + .align 4 + +LL(26): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(26) + .align 4 + +LL(27): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else +#if defined(LN) || defined(LT) + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f3 + FMUL f13, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f12 + FMADD f3, f20, f3, f13 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f12 + FMSUB f3, f20, f3, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(29): +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/ztrsm_kernel_cell_LN.S b/kernel/power/ztrsm_kernel_cell_LN.S new file mode 100644 index 0000000..c284a0e --- /dev/null +++ b/kernel/power/ztrsm_kernel_cell_LN.S @@ -0,0 +1,2252 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define AORIG r21 +#define TEMP r22 +#define KK r23 +#define I r24 +#define J r25 +#define AO r26 +#define BO r27 +#define CO1 r28 +#define CO2 r29 + +#define PREA r30 +#define PREC r31 +#define PREB PREA + +#ifndef NEEDPARAM + +#ifndef DOUBLE +#include "cparam.h" +#else +#include "zparam.h" +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) +#endif + + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + + li PREC, -4 * SIZE + li PREA, 16 * 12 * SIZE + + srawi. J, N, 1 + ble LL(30) + .align 4 + +LL(10): +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + +LL(20): + andi. I, M, 1 + ble LL(09) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 4 + +LL(22): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi BO, BO, 16 * SIZE + addi AO, AO, 8 * SIZE + bdnz LL(22) + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(27) + .align 4 + +LL(26): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(26) + .align 4 + +LL(27): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f20, f2 + FADD f3, f21, f3 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f3 + FMUL f13, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f12 + FMADD f3, f20, f3, f13 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f12 + FMSUB f3, f20, f3, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(09): + srawi. I, M, 1 + ble LL(29) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbtst CO1, PREC + dcbtst CO2, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbtst CO1, PREC + dcbtst CO2, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +#define NOP1 mr r18, r18 +#define NOP2 mr r19, r19 + +LL(12): + FMADD f0, f16, f20, f0 + dcbt AO, PREA + FMADD f4, f16, f21, f4 + dcbt BO, PREB + FMADD f8, f16, f22, f8 + LFD f31, 7 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f27, 7 * SIZE(AO) + + FMADD f1, f17, f20, f1 + LFD f16, 8 * SIZE(AO) + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 9 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 10 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 8 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 9 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 10 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 11 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 11 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 12 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 13 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 14 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 12 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 13 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 14 * SIZE(BO) + FMADD f15, f27, f31, f15 + LFD f27, 15 * SIZE(AO) + + FMADD f0, f16, f20, f0 + LFD f31, 15 * SIZE(BO) + FMADD f4, f16, f21, f4 + NOP2 + FMADD f8, f16, f22, f8 + NOP1 + FMADD f12, f16, f23, f12 + LFD f16, 16 * SIZE(AO) + + FMADD f1, f17, f20, f1 + NOP1 + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 17 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 18 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 16 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 17 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 18 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 19 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 19 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 20 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 21 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 22 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 20 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 21 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 22 * SIZE(BO) + FMADD f15, f27, f31, f15 + addi AO, AO, 16 * SIZE + + addi BO, BO, 16 * SIZE + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(KERNEL_MainFinish) + .align 4 + +LL(16): + fmadd f0, f16, f20, f0 + fmadd f5, f17, f21, f5 + fmadd f10, f18, f22, f10 + fmadd f15, f19, f23, f15 + + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + fmadd f4, f16, f21, f4 + + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + + fmadd f11, f19, f22, f11 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + fmadd f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(KERNEL_MainFinish): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 2 + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f8, f18, f8 + FSUB f9, f19, f9 + + FSUB f2, f20, f2 + FSUB f3, f21, f3 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 + + FSUB f8, f20, f8 + FADD f9, f21, f9 + FSUB f10, f22, f10 + FADD f11, f23, f11 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FMADD f8, f19, f11, f8 + FNMSUB f9, f19, f10, f9 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + FNMSUB f8, f18, f10, f8 + FNMSUB f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f8, f20, f8, f12 + FMADD f9, f20, f9, f13 + +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FMSUB f8, f19, f11, f8 + FNMADD f9, f19, f10, f9 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + FNMADD f8, f18, f10, f8 + FNMADD f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f8, f20, f8, f12 + FMSUB f9, f20, f9, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FMADD f10, f19, f9, f10 + FNMSUB f11, f19, f8, f11 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + FNMSUB f10, f18, f8, f10 + FNMSUB f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 + FMSUB f10, f20, f10, f12 + FMADD f11, f20, f11, f13 + +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FMSUB f10, f19, f9, f10 + FNMADD f11, f19, f8, f11 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + FNMADD f10, f18, f8, f10 + FNMADD f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 + FMADD f10, f20, f10, f12 + FMSUB f11, f20, f11, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f8, f19, f1, f8 + FNMSUB f9, f19, f0, f9 + FMADD f10, f19, f3, f10 + FNMSUB f11, f19, f2, f11 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMSUB f8, f20, f8, f4 + FMADD f9, f20, f9, f5 + FMSUB f10, f20, f10, f6 + FMADD f11, f20, f11, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f8, f19, f1, f8 + FNMADD f9, f19, f0, f9 + FMSUB f10, f19, f3, f10 + FNMADD f11, f19, f2, f11 + + FNMADD f8, f18, f0, f8 + FNMADD f9, f18, f1, f9 + FNMADD f10, f18, f2, f10 + FNMADD f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMADD f8, f20, f8, f4 + FMSUB f9, f20, f9, f5 + FMADD f10, f20, f10, f6 + FMSUB f11, f20, f11, f7 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f9, f0 + FNMSUB f1, f19, f8, f1 + FMADD f2, f19, f11, f2 + FNMSUB f3, f19, f10, f3 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f9, f0 + FNMADD f1, f19, f8, f1 + FMSUB f2, f19, f11, f2 + FNMADD f3, f19, f10, f3 + + FNMADD f0, f18, f8, f0 + FNMADD f1, f18, f9, f1 + FNMADD f2, f18, f10, f2 + FNMADD f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f9, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f3, 5 * SIZE(BO) + STFD f10, 6 * SIZE(BO) + STFD f11, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f10, 6 * SIZE(AO) + STFD f11, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f8, 0 * SIZE(CO2) + STFD f9, 1 * SIZE(CO2) + STFD f10, 2 * SIZE(CO2) + STFD f11, 3 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(11) + .align 4 + +LL(29): +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(30): + andi. J, N, 1 + ble LL(999) + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, C, LDC +#endif + + andi. I, M, 1 + ble LL(40) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 4 + +LL(42): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR,r0 + ble LL(47) + .align 4 + +LL(46): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + + bdnz LL(46) + .align 4 + +LL(47): +#ifndef CONJ + FSUB f0, f0, f1 + FADD f1, f2, f3 +#else + FADD f0, f0, f1 + FSUB f1, f3, f2 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 1 + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(40): + srawi. I, M, 1 + ble LL(49) + .align 4 + +LL(31): +#if defined(LT) || defined(RN) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + dcbtst CO1, PREC + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 4 + +LL(32): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(AO) + LFD f21, 9 * SIZE(AO) + LFD f22, 10 * SIZE(AO) + LFD f23, 11 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + LFD f16, 4 * SIZE(BO) + LFD f17, 5 * SIZE(BO) + LFD f18, 6 * SIZE(BO) + LFD f19, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(AO) + LFD f21, 17 * SIZE(AO) + LFD f22, 18 * SIZE(AO) + LFD f23, 19 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 20 * SIZE(AO) + LFD f25, 21 * SIZE(AO) + LFD f26, 22 * SIZE(AO) + LFD f27, 23 * SIZE(AO) + + LFD f16, 8 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 10 * SIZE(BO) + LFD f19, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbt PREA, AO + dcbt PREA, BO + bdnz LL(32) + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(37) + .align 4 + +LL(36): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f16, 2 * SIZE(BO) + LFD f17, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(36) + .align 4 + +LL(37): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(31) + .align 4 + +LL(49): +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/ztrsm_kernel_cell_LT.S b/kernel/power/ztrsm_kernel_cell_LT.S new file mode 100644 index 0000000..ca80100 --- /dev/null +++ b/kernel/power/ztrsm_kernel_cell_LT.S @@ -0,0 +1,2277 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define AORIG r21 +#define TEMP r22 +#define KK r23 +#define I r24 +#define J r25 +#define AO r26 +#define BO r27 +#define CO1 r28 +#define CO2 r29 + +#define PREA r30 +#define PREC r31 +#define PREB PREA + +#ifndef NEEDPARAM + +#ifndef DOUBLE +#include "cparam.h" +#else +#include "zparam.h" +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) +#endif + + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST + li PREC, 3 * SIZE + li PREA, 16 * 12 * SIZE +#else + +#ifdef linux +#ifndef __64BIT__ + lwz PREA, 16 + STACKSIZE(SP) + lwz PREC, 20 + STACKSIZE(SP) +#else + ld PREA, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 72 + STACKSIZE(SP) + lwz PREC, 76 + STACKSIZE(SP) +#else + lwz PREA, 68 + STACKSIZE(SP) + lwz PREC, 72 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + + srawi. J, N, 1 + ble LL(30) + .align 4 + +LL(10): +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble LL(20) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + + LFD f28, 4 * SIZE(B) + LFD f29, 5 * SIZE(B) + LFD f30, 6 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +#define NOP1 mr r18, r18 +#define NOP2 mr r19, r19 + +LL(12): + FMADD f0, f16, f20, f0 + dcbt AO, PREA + FMADD f4, f16, f21, f4 + dcbt BO, PREB + FMADD f8, f16, f22, f8 + LFD f31, 7 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f27, 7 * SIZE(AO) + + FMADD f1, f17, f20, f1 + LFD f16, 8 * SIZE(AO) + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 9 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 10 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 8 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 9 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 10 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 11 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 11 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 12 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 13 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 14 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 12 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 13 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 14 * SIZE(BO) + FMADD f15, f27, f31, f15 + LFD f27, 15 * SIZE(AO) + + FMADD f0, f16, f20, f0 + LFD f31, 15 * SIZE(BO) + FMADD f4, f16, f21, f4 + NOP2 + FMADD f8, f16, f22, f8 + NOP1 + FMADD f12, f16, f23, f12 + LFD f16, 16 * SIZE(AO) + + FMADD f1, f17, f20, f1 + NOP1 + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 17 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 18 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 16 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 17 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 18 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 19 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 19 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 20 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 21 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 22 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 20 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 21 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 22 * SIZE(BO) + FMADD f15, f27, f31, f15 + addi AO, AO, 16 * SIZE + + addi BO, BO, 16 * SIZE + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(KERNEL_MainFinish) + .align 4 + +LL(16): + fmadd f0, f16, f20, f0 + fmadd f5, f17, f21, f5 + fmadd f10, f18, f22, f10 + fmadd f15, f19, f23, f15 + + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + fmadd f4, f16, f21, f4 + + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + + fmadd f11, f19, f22, f11 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + fmadd f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(KERNEL_MainFinish): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 2 + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f8, f18, f8 + FSUB f9, f19, f9 + + FSUB f2, f20, f2 + FSUB f3, f21, f3 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 + + FSUB f8, f20, f8 + FADD f9, f21, f9 + FSUB f10, f22, f10 + FADD f11, f23, f11 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FMADD f8, f19, f11, f8 + FNMSUB f9, f19, f10, f9 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + FNMSUB f8, f18, f10, f8 + FNMSUB f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f8, f20, f8, f12 + FMADD f9, f20, f9, f13 + +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FMSUB f8, f19, f11, f8 + FNMADD f9, f19, f10, f9 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + FNMADD f8, f18, f10, f8 + FNMADD f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f8, f20, f8, f12 + FMSUB f9, f20, f9, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FMADD f10, f19, f9, f10 + FNMSUB f11, f19, f8, f11 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + FNMSUB f10, f18, f8, f10 + FNMSUB f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 + FMSUB f10, f20, f10, f12 + FMADD f11, f20, f11, f13 + +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FMSUB f10, f19, f9, f10 + FNMADD f11, f19, f8, f11 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + FNMADD f10, f18, f8, f10 + FNMADD f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 + FMADD f10, f20, f10, f12 + FMSUB f11, f20, f11, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f8, f19, f1, f8 + FNMSUB f9, f19, f0, f9 + FMADD f10, f19, f3, f10 + FNMSUB f11, f19, f2, f11 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMSUB f8, f20, f8, f4 + FMADD f9, f20, f9, f5 + FMSUB f10, f20, f10, f6 + FMADD f11, f20, f11, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f8, f19, f1, f8 + FNMADD f9, f19, f0, f9 + FMSUB f10, f19, f3, f10 + FNMADD f11, f19, f2, f11 + + FNMADD f8, f18, f0, f8 + FNMADD f9, f18, f1, f9 + FNMADD f10, f18, f2, f10 + FNMADD f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMADD f8, f20, f8, f4 + FMSUB f9, f20, f9, f5 + FMADD f10, f20, f10, f6 + FMSUB f11, f20, f11, f7 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f9, f0 + FNMSUB f1, f19, f8, f1 + FMADD f2, f19, f11, f2 + FNMSUB f3, f19, f10, f3 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f9, f0 + FNMADD f1, f19, f8, f1 + FMSUB f2, f19, f11, f2 + FNMADD f3, f19, f10, f3 + + FNMADD f0, f18, f8, f0 + FNMADD f1, f18, f9, f1 + FNMADD f2, f18, f10, f2 + FNMADD f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f9, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f3, 5 * SIZE(BO) + STFD f10, 6 * SIZE(BO) + STFD f11, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f10, 6 * SIZE(AO) + STFD f11, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f8, 0 * SIZE(CO2) + STFD f9, 1 * SIZE(CO2) + STFD f10, 2 * SIZE(CO2) + STFD f11, 3 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(11) + .align 4 + +LL(20): + andi. I, M, 1 + ble LL(29) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 4 + +LL(22): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi BO, BO, 16 * SIZE + addi AO, AO, 8 * SIZE + bdnz LL(22) + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(27) + .align 4 + +LL(26): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(26) + .align 4 + +LL(27): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else +#if defined(LN) || defined(LT) + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f3 + FMUL f13, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f12 + FMADD f3, f20, f3, f13 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f12 + FMSUB f3, f20, f3, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(29): +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(30): + andi. J, N, 1 + ble LL(999) + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, C, LDC +#endif + ble LL(40) + .align 4 + +LL(31): +#if defined(LT) || defined(RN) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 4 + +LL(32): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(AO) + LFD f21, 9 * SIZE(AO) + LFD f22, 10 * SIZE(AO) + LFD f23, 11 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + LFD f16, 4 * SIZE(BO) + LFD f17, 5 * SIZE(BO) + LFD f18, 6 * SIZE(BO) + LFD f19, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(AO) + LFD f21, 17 * SIZE(AO) + LFD f22, 18 * SIZE(AO) + LFD f23, 19 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 20 * SIZE(AO) + LFD f25, 21 * SIZE(AO) + LFD f26, 22 * SIZE(AO) + LFD f27, 23 * SIZE(AO) + + LFD f16, 8 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 10 * SIZE(BO) + LFD f19, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbt PREA, AO + dcbt PREA, BO + bdnz LL(32) + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(37) + .align 4 + +LL(36): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f16, 2 * SIZE(BO) + LFD f17, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(36) + .align 4 + +LL(37): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(31) + .align 4 + +LL(40): + andi. I, M, 1 + ble LL(49) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 4 + +LL(42): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR,r0 + ble LL(47) + .align 4 + +LL(46): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + + bdnz LL(46) + .align 4 + +LL(47): +#ifndef CONJ + FSUB f0, f0, f1 + FADD f1, f2, f3 +#else + FADD f0, f0, f1 + FSUB f1, f3, f2 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 1 + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(49): +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/ztrsm_kernel_cell_RT.S b/kernel/power/ztrsm_kernel_cell_RT.S new file mode 100644 index 0000000..f1139fd --- /dev/null +++ b/kernel/power/ztrsm_kernel_cell_RT.S @@ -0,0 +1,2249 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define AORIG r21 +#define TEMP r22 +#define KK r23 +#define I r24 +#define J r25 +#define AO r26 +#define BO r27 +#define CO1 r28 +#define CO2 r29 + +#define PREA r30 +#define PREC r31 +#define PREB PREA + +#ifndef NEEDPARAM + +#ifndef DOUBLE +#include "cparam.h" +#else +#include "zparam.h" +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) +#endif + + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + li PREC, 3 * SIZE + li PREA, 16 * 12 * SIZE + + andi. J, N, 1 + ble LL(30) + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, C, LDC +#endif + ble LL(40) + .align 4 + +LL(31): +#if defined(LT) || defined(RN) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 4 + +LL(32): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(AO) + LFD f21, 9 * SIZE(AO) + LFD f22, 10 * SIZE(AO) + LFD f23, 11 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + LFD f16, 4 * SIZE(BO) + LFD f17, 5 * SIZE(BO) + LFD f18, 6 * SIZE(BO) + LFD f19, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(AO) + LFD f21, 17 * SIZE(AO) + LFD f22, 18 * SIZE(AO) + LFD f23, 19 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 20 * SIZE(AO) + LFD f25, 21 * SIZE(AO) + LFD f26, 22 * SIZE(AO) + LFD f27, 23 * SIZE(AO) + + LFD f16, 8 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 10 * SIZE(BO) + LFD f19, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbt PREA, AO + dcbt PREA, BO + bdnz LL(32) + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(37) + .align 4 + +LL(36): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f16, 2 * SIZE(BO) + LFD f17, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(36) + .align 4 + +LL(37): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(31) + .align 4 + +LL(40): + andi. I, M, 1 + ble LL(49) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 4 + +LL(42): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR,r0 + ble LL(47) + .align 4 + +LL(46): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + + bdnz LL(46) + .align 4 + +LL(47): +#ifndef CONJ + FSUB f0, f0, f1 + FADD f1, f2, f3 +#else + FADD f0, f0, f1 + FSUB f1, f3, f2 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 1 + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(49): +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + + +LL(30): + srawi. J, N, 1 + ble LL(999) + .align 4 + +LL(10): +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble LL(20) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +#define NOP1 mr r18, r18 +#define NOP2 mr r19, r19 + +LL(12): + FMADD f0, f16, f20, f0 + dcbt AO, PREA + FMADD f4, f16, f21, f4 + dcbt BO, PREB + FMADD f8, f16, f22, f8 + LFD f31, 7 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f27, 7 * SIZE(AO) + + FMADD f1, f17, f20, f1 + LFD f16, 8 * SIZE(AO) + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 9 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 10 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 8 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 9 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 10 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 11 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 11 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 12 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 13 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 14 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 12 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 13 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 14 * SIZE(BO) + FMADD f15, f27, f31, f15 + LFD f27, 15 * SIZE(AO) + + FMADD f0, f16, f20, f0 + LFD f31, 15 * SIZE(BO) + FMADD f4, f16, f21, f4 + NOP2 + FMADD f8, f16, f22, f8 + NOP1 + FMADD f12, f16, f23, f12 + LFD f16, 16 * SIZE(AO) + + FMADD f1, f17, f20, f1 + NOP1 + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 17 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 18 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 16 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 17 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 18 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 19 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 19 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 20 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 21 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 22 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 20 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 21 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 22 * SIZE(BO) + FMADD f15, f27, f31, f15 + addi AO, AO, 16 * SIZE + + addi BO, BO, 16 * SIZE + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(KERNEL_MainFinish) + .align 4 + +LL(16): + fmadd f0, f16, f20, f0 + fmadd f5, f17, f21, f5 + fmadd f10, f18, f22, f10 + fmadd f15, f19, f23, f15 + + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + fmadd f4, f16, f21, f4 + + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + + fmadd f11, f19, f22, f11 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + fmadd f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(KERNEL_MainFinish): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 2 + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f8, f18, f8 + FSUB f9, f19, f9 + + FSUB f2, f20, f2 + FSUB f3, f21, f3 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 + + FSUB f8, f20, f8 + FADD f9, f21, f9 + FSUB f10, f22, f10 + FADD f11, f23, f11 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FMADD f8, f19, f11, f8 + FNMSUB f9, f19, f10, f9 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + FNMSUB f8, f18, f10, f8 + FNMSUB f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f8, f20, f8, f12 + FMADD f9, f20, f9, f13 + +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FMSUB f8, f19, f11, f8 + FNMADD f9, f19, f10, f9 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + FNMADD f8, f18, f10, f8 + FNMADD f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f8, f20, f8, f12 + FMSUB f9, f20, f9, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FMADD f10, f19, f9, f10 + FNMSUB f11, f19, f8, f11 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + FNMSUB f10, f18, f8, f10 + FNMSUB f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 + FMSUB f10, f20, f10, f12 + FMADD f11, f20, f11, f13 + +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FMSUB f10, f19, f9, f10 + FNMADD f11, f19, f8, f11 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + FNMADD f10, f18, f8, f10 + FNMADD f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 + FMADD f10, f20, f10, f12 + FMSUB f11, f20, f11, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f8, f19, f1, f8 + FNMSUB f9, f19, f0, f9 + FMADD f10, f19, f3, f10 + FNMSUB f11, f19, f2, f11 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMSUB f8, f20, f8, f4 + FMADD f9, f20, f9, f5 + FMSUB f10, f20, f10, f6 + FMADD f11, f20, f11, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f8, f19, f1, f8 + FNMADD f9, f19, f0, f9 + FMSUB f10, f19, f3, f10 + FNMADD f11, f19, f2, f11 + + FNMADD f8, f18, f0, f8 + FNMADD f9, f18, f1, f9 + FNMADD f10, f18, f2, f10 + FNMADD f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMADD f8, f20, f8, f4 + FMSUB f9, f20, f9, f5 + FMADD f10, f20, f10, f6 + FMSUB f11, f20, f11, f7 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f9, f0 + FNMSUB f1, f19, f8, f1 + FMADD f2, f19, f11, f2 + FNMSUB f3, f19, f10, f3 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f9, f0 + FNMADD f1, f19, f8, f1 + FMSUB f2, f19, f11, f2 + FNMADD f3, f19, f10, f3 + + FNMADD f0, f18, f8, f0 + FNMADD f1, f18, f9, f1 + FNMADD f2, f18, f10, f2 + FNMADD f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f9, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f3, 5 * SIZE(BO) + STFD f10, 6 * SIZE(BO) + STFD f11, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f10, 6 * SIZE(AO) + STFD f11, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f8, 0 * SIZE(CO2) + STFD f9, 1 * SIZE(CO2) + STFD f10, 2 * SIZE(CO2) + STFD f11, 3 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(11) + .align 4 + +LL(20): + andi. I, M, 1 + ble LL(29) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 4 + +LL(22): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi BO, BO, 16 * SIZE + addi AO, AO, 8 * SIZE + bdnz LL(22) + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(27) + .align 4 + +LL(26): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(26) + .align 4 + +LL(27): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else +#if defined(LN) || defined(LT) + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f3 + FMUL f13, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f12 + FMADD f3, f20, f3, f13 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f12 + FMSUB f3, f20, f3, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(29): +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/ztrsm_kernel_hummer_LN.S b/kernel/power/ztrsm_kernel_hummer_LN.S new file mode 100644 index 0000000..9e9697d --- /dev/null +++ b/kernel/power/ztrsm_kernel_hummer_LN.S @@ -0,0 +1,2963 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#undef ZERO + +#define ALPHA 0 +#define FZERO 16 + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#endif + +#define TEMP r11 +#define AORIG r12 +#define KK r14 +#define INCM1 r15 +#define INCM3 r16 +#define INCM5 r17 +#define INCM7 r18 +#define INC2 r19 +#define INC r20 +#define INC4 r21 + +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define AO2 r26 +#define BO2 r27 + +#define CO1 r28 +#define CO2 r29 +#define ZERO r31 + +#ifndef NEEDPARAM + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define A7 f22 +#define A8 f23 +#define A9 f24 +#define A10 f25 + +#define B1 f26 +#define B2 f27 +#define B3 f28 +#define B4 f29 +#define B5 f30 +#define B6 f31 + +#define AP B6 + +#ifndef CONJ +#define FXCPMADD fxcpmadd +#define FXCSMADD fxcxnpma +#else +#if defined(LN) || defined(LT) +#define FXCPMADD fxcpnsma +#define FXCSMADD fxcxma +#else +#define FXCPMADD fxcpmadd +#define FXCSMADD fxcxnsma +#endif +#endif + +#ifndef CONJ +#define FXCXNPMA fxcxnpma +#define FXCXNSMA fxcxnsma +#else +#define FXCXNPMA fxcxnsma +#define FXCXNSMA fxcxnpma +#endif + + + PROLOGUE + PROFCODE + + li r0, -16 + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + stfpdux f16, SP, r0 + stfpdux f17, SP, r0 + stfpdux f18, SP, r0 + stfpdux f19, SP, r0 + stfpdux f20, SP, r0 + stfpdux f21, SP, r0 + stfpdux f22, SP, r0 + stfpdux f23, SP, r0 + stfpdux f24, SP, r0 + stfpdux f25, SP, r0 + stfpdux f26, SP, r0 + stfpdux f27, SP, r0 + stfpdux f28, SP, r0 + stfpdux f29, SP, r0 + stfpdux f30, SP, r0 + stfpdux f31, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + stwu r29, -4(SP) + stwu r28, -4(SP) + + stwu r27, -4(SP) + stwu r26, -4(SP) + stwu r25, -4(SP) + stwu r24, -4(SP) + + stwu r23, -4(SP) + stwu r22, -4(SP) + stwu r21, -4(SP) + stwu r20, -4(SP) + + stwu r19, -4(SP) + stwu r18, -4(SP) + stwu r17, -4(SP) + stwu r16, -4(SP) + + stwu r15, -4(SP) + stwu r14, -4(SP) + + li r0, 0 + stwu r0, -4(SP) + stwu r0, -4(SP) + + stfdu f2, -8(SP) + stfdu f1, -8(SP) + + slwi LDC, LDC, ZBASE_SHIFT + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + li INC, 1 * SIZE + li INC2, 2 * SIZE + li INC4, 4 * SIZE + li INCM1, -1 * SIZE + li INCM3, -3 * SIZE + li INCM5, -5 * SIZE + li INCM7, -7 * SIZE + + addi C, C, - 1 * SIZE + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + srawi. J, N, 1 + ble .L50 + .align 4 + +.L10: +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + addi AORIG, A, -4 * SIZE +#else + addi AO, A, -4 * SIZE +#endif +#ifndef RT + add C, CO2, LDC +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + + + andi. I, M, 1 + beq .L20 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 + + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L34 +#else +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, BO, - 4 * SIZE + fpmr f2, f0 + addi BO2, BO, 2 * SIZE + fpmr f3, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L34 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L33 + .align 4 + +.L32: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX B1, BO, INC4 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + LFPDUX B2, BO2, INC4 + LFPDUX A1, AO, INC4 + + FXCPMADD f0, B3, A2, f0 + FXCSMADD f1, B3, A2, f1 + LFPDUX B3, BO, INC4 + FXCPMADD f2, B4, A2, f2 + FXCSMADD f3, B4, A2, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A2, AO2, INC4 + + FXCPMADD f0, A5, A3, f0 + FXCSMADD f1, A5, A3, f1 + LFPDUX A5, BO, INC4 + FXCPMADD f2, A6, A3, f2 + FXCSMADD f3, A6, A3, f3 + LFPDUX A6, BO2, INC4 + LFPDUX A3, AO, INC4 + + FXCPMADD f0, A7, A4, f0 + FXCSMADD f1, A7, A4, f1 + LFPDUX A7, BO, INC4 + FXCPMADD f2, A8, A4, f2 + FXCSMADD f3, A8, A4, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L32 + .align 4 + +.L33: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + + FXCPMADD f0, B3, A2, f0 + FXCSMADD f1, B3, A2, f1 + FXCPMADD f2, B4, A2, f2 + FXCSMADD f3, B4, A2, f3 + + FXCPMADD f0, A5, A3, f0 + FXCSMADD f1, A5, A3, f1 + FXCPMADD f2, A6, A3, f2 + FXCSMADD f3, A6, A3, f3 + + FXCPMADD f0, A7, A4, f0 + FXCSMADD f1, A7, A4, f1 + FXCPMADD f2, A8, A4, f2 + FXCSMADD f3, A8, A4, f3 + .align 4 + +.L34: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L38 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L38 +#endif + + LFPDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdz- .L37 + .align 4 + +.L36: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX B1, BO, INC4 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + LFPDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdnz+ .L36 + .align 4 + +.L37: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + .align 4 + +.L38: + fpadd f0, f0, f1 + fpadd f2, f2, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 4 * SIZE +#endif + + addi AO2, AO, 2 * SIZE + addi BO2, BO, 2 * SIZE + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC4 + LFPDX f17, BO2, INC4 +#else + LFPDX f16, AO, INC4 + LFPDX f17, AO2, INC4 +#endif + + fpsub f0, f16, f0 + fpsub f2, f17, f2 + +#ifdef LN + LFPDX A1, AO, INC4 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f2 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f2, A1, f2, f5 +#endif + +#ifdef LT + LFPDX A1, AO, INC4 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f2 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f2, A1, f2, f5 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + add BO, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 + + fxcpnmsub f2, A2, f0, f2 + FXCXNSMA f2, A2, f0, f2 + + fxpmul f4, A3, f2 + FXCXNPMA f2, A3, f2, f4 +#endif + +#ifdef RT + LFPDUX A1, BO, INC4 + add BO2, BO2, INC4 + LFPDUX A2, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A3, f2 + FXCXNPMA f2, A3, f2, f4 + + fxcpnmsub f0, A2, f2, f0 + FXCXNSMA f0, A2, f2, f0 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC4 + STFPDX f2, BO2, INC4 +#else + STFPDX f0, AO, INC4 + STFPDX f2, AO2, INC4 +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f2, CO2, INC + STFSDUX f2, CO2, INC + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L20: + andi. I, M, 2 + beq .L30 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, KK, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L24 +#else +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L24 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX B3, BO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A5, AO, INC4 + LFPDUX B5, BO, INC4 + LFPDUX A6, AO2, INC4 + LFPDUX B6, BO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A9, BO, INC4 + LFPDUX A10, BO2, INC4 + bdz- .L23 + .align 4 + +.L22: + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + FXCPMADD f8, B2, A1, f8 + nop + FXCSMADD f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + FXCPMADD f9, B2, A2, f9 + nop + FXCSMADD f13, B2, A2, f13 + LFPDUX B2, BO2, INC4 + + FXCPMADD f0, B3, A3, f0 + nop + FXCSMADD f4, B3, A3, f4 + LFPDUX A2, AO2, INC4 + FXCPMADD f8, B4, A3, f8 + nop + FXCSMADD f12, B4, A3, f12 + LFPDUX A3, AO, INC4 + + FXCPMADD f1, B3, A4, f1 + nop + FXCSMADD f5, B3, A4, f5 + LFPDUX B3, BO, INC4 + FXCPMADD f9, B4, A4, f9 + nop + FXCSMADD f13, B4, A4, f13 + LFPDUX B4, BO2, INC4 + + FXCPMADD f0, B5, A5, f0 + nop + FXCSMADD f4, B5, A5, f4 + LFPDUX A4, AO2, INC4 + FXCPMADD f8, B6, A5, f8 + nop + FXCSMADD f12, B6, A5, f12 + LFPDUX A5, AO, INC4 + + FXCPMADD f1, B5, A6, f1 + nop + FXCSMADD f5, B5, A6, f5 + LFPDUX B5, BO, INC4 + FXCPMADD f9, B6, A6, f9 + nop + FXCSMADD f13, B6, A6, f13 + LFPDUX B6, BO2, INC4 + + FXCPMADD f0, A9, A7, f0 + nop + FXCSMADD f4, A9, A7, f4 + LFPDUX A6, AO2, INC4 + FXCPMADD f8, A10, A7, f8 + nop + FXCSMADD f12, A10, A7, f12 + LFPDUX A7, AO, INC4 + + FXCPMADD f1, A9, A8, f1 + nop + FXCSMADD f5, A9, A8, f5 + LFPDUX A9, BO, INC4 + FXCPMADD f9, A10, A8, f9 + nop + FXCSMADD f13, A10, A8, f13 + LFPDUX A10, BO2, INC4 + bdnz+ .L22 + .align 4 + +.L23: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f4, B3, A3, f4 + FXCPMADD f8, B4, A3, f8 + FXCSMADD f12, B4, A3, f12 + + FXCPMADD f1, B3, A4, f1 + FXCSMADD f5, B3, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f4, B5, A5, f4 + FXCPMADD f8, B6, A5, f8 + FXCSMADD f12, B6, A5, f12 + + FXCPMADD f1, B5, A6, f1 + FXCSMADD f5, B5, A6, f5 + FXCPMADD f9, B6, A6, f9 + FXCSMADD f13, B6, A6, f13 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f4, A9, A7, f4 + FXCPMADD f8, A10, A7, f8 + FXCSMADD f12, A10, A7, f12 + + FXCPMADD f1, A9, A8, f1 + FXCSMADD f5, A9, A8, f5 + FXCPMADD f9, A10, A8, f9 + FXCSMADD f13, A10, A8, f13 + .align 4 + +.L24: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L28 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L28 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + bdz- .L27 + .align 4 + +.L26: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + bdnz+ .L26 + .align 4 + +.L27: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + .align 4 + +.L28: + fpadd f0, f0, f4 + fpadd f8, f8, f12 + fpadd f1, f1, f5 + fpadd f9, f9, f13 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC4 + LFPDUX f18, BO2, INC4 + LFPDUX f17, BO, INC4 + LFPDUX f19, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE +#endif + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f8, f18, f8 + fpsub f9, f19, f9 + +#ifdef LN + LFPDUX A1, AO, INC4 + add AO2, AO2, INC4 + LFPDUX A2, AO, INC4 + LFPDUX A3, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE + + fxpmul f4, A3, f1 + fxpmul f5, A3, f9 + FXCXNPMA f1, A3, f1, f4 + FXCXNPMA f9, A3, f9, f5 + + fxcpnmsub f0, A2, f1, f0 + fxcpnmsub f8, A2, f9, f8 + FXCXNSMA f0, A2, f1, f0 + FXCXNSMA f8, A2, f9, f8 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f8 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f8, A1, f8, f5 +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + add AO, AO, INC4 + LFPDUX A3, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE + + fxpmul f4, A1, f0 + fxpmul f5, A1, f8 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f8, A1, f8, f5 + + fxcpnmsub f1, A2, f0, f1 + fxcpnmsub f9, A2, f8, f9 + FXCXNSMA f1, A2, f0, f1 + FXCXNSMA f9, A2, f8, f9 + + fxpmul f6, A3, f1 + fxpmul f7, A3, f9 + FXCXNPMA f1, A3, f1, f6 + FXCXNPMA f9, A3, f9, f7 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + add BO, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + + fxcpnmsub f8, A2, f0, f8 + fxcpnmsub f9, A2, f1, f9 + + FXCXNSMA f8, A2, f0, f8 + FXCXNSMA f9, A2, f1, f9 + + fxpmul f4, A3, f8 + fxpmul f5, A3, f9 + + FXCXNPMA f8, A3, f8, f4 + FXCXNPMA f9, A3, f9, f5 +#endif + +#ifdef RT + LFPDUX A1, BO, INC4 + add BO2, BO2, INC4 + LFPDUX A2, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A3, f8 + fxpmul f5, A3, f9 + + FXCXNPMA f8, A3, f8, f4 + FXCXNPMA f9, A3, f9, f5 + + fxcpnmsub f0, A2, f8, f0 + fxcpnmsub f1, A2, f9, f1 + + FXCXNSMA f0, A2, f8, f0 + FXCXNSMA f1, A2, f9, f1 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f1, BO, INC4 + STFPDUX f9, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE +#else + STFPDUX f0, AO, INC4 + STFPDUX f1, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f9, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + STFDUX f8, CO2, INC + STFSDUX f8, CO2, INC + STFDUX f9, CO2, INC + STFSDUX f9, CO2, INC + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, r0 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L30: + srawi. I, M, 2 + ble .L49 + .align 4 + +.L11: +#if defined(LT) || defined(RN) + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + + srawi. r0, KK, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#else + +#ifdef LN + slwi r0, K, 2 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + srawi. r0, TEMP, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#endif + + LFPDUX A1, AO, INC4 + fpmr f5, f0 + LFPDUX A3, AO, INC4 + fpmr f9, f0 + LFPDUX B1, BO, INC4 + fpmr f13, f0 + + LFPDUX A5, AO, INC4 + fpmr f2, f0 + LFPDUX A6, AO, INC4 + fpmr f6, f0 + LFPDUX B3, BO, INC4 + fpmr f10, f0 + LFPDUX A7, AO, INC4 + fpmr f14, f0 + + LFPDUX A8, AO, INC4 + fpmr f3, f0 + LFPDUX B5, BO, INC4 + fpmr f7, f0 + LFPDUX A9, AO, INC4 + fpmr f11, f0 + LFPDUX A2, AO2, INC4 + fpmr f15, f0 + LFPDUX B2, BO2, INC4 + bdz- .L13 + .align 4 + +.L12: + +## 1 ## + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + nop + FXCPMADD f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + nop + FXCPMADD f10, B2, A3, f10 + nop + FXCSMADD f14, B2, A3, f14 + nop + + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + LFPDUX A1, AO, INC4 + FXCSMADD f15, B2, A4, f15 + nop + +## 2 ## + + FXCPMADD f0, B3, A5, f0 + nop + FXCSMADD f4, B3, A5, f4 + nop + FXCPMADD f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A5, f12 + LFPDUX B1, BO, INC4 + + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + LFPDUX A3, AO, INC4 + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B3, A6, f2 + nop + FXCSMADD f6, B3, A6, f6 + nop + FXCPMADD f10, B4, A6, f10 + nop + FXCSMADD f14, B4, A6, f14 + nop + + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B4, A4, f11 + LFPDUX A5, AO, INC4 + FXCSMADD f15, B4, A4, f15 + nop + +## 3 ## + + FXCPMADD f0, B5, A7, f0 + nop + FXCSMADD f4, B5, A7, f4 + nop + FXCPMADD f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A7, f12 + LFPDUX B3, BO, INC4 + + FXCPMADD f1, B5, A2, f1 + nop + FXCSMADD f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A6, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B5, A8, f2 + nop + FXCSMADD f6, B5, A8, f6 + nop + FXCPMADD f10, B2, A8, f10 + nop + FXCSMADD f14, B2, A8, f14 + nop + + FXCPMADD f3, B5, A4, f3 + nop + FXCSMADD f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + LFPDUX A7, AO, INC4 + FXCSMADD f15, B2, A4, f15 + nop + +## 4 ## + FXCPMADD f0, B6, A9, f0 + nop + FXCSMADD f4, B6, A9, f4 + nop + FXCPMADD f8, B4, A9, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A9, f12 + LFPDUX B5, BO, INC4 + + FXCPMADD f1, B6, A2, f1 + nop + FXCSMADD f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + LFPDUX A8, AO, INC4 + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B6, A10, f2 + nop + FXCSMADD f6, B6, A10, f6 + nop + FXCPMADD f10, B4, A10, f10 + nop + FXCSMADD f14, B4, A10, f14 + nop + + FXCPMADD f3, B6, A4, f3 + LFPDUX A2, AO2, INC4 + FXCSMADD f7, B6, A4, f7 + LFPDUX A9, AO, INC4 + FXCPMADD f11, B4, A4, f11 + nop + FXCSMADD f15, B4, A4, f15 + bdnz+ .L12 + .align 4 + +.L13: +## 1 ## + + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + nop + FXCPMADD f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + nop + FXCPMADD f10, B2, A3, f10 + nop + FXCSMADD f14, B2, A3, f14 + nop + + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + nop + FXCSMADD f15, B2, A4, f15 + nop + +## 2 ## + + FXCPMADD f0, B3, A5, f0 + nop + FXCSMADD f4, B3, A5, f4 + nop + FXCPMADD f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A5, f12 + nop + + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + nop + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B3, A6, f2 + nop + FXCSMADD f6, B3, A6, f6 + nop + FXCPMADD f10, B4, A6, f10 + nop + FXCSMADD f14, B4, A6, f14 + nop + + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B4, A4, f11 + nop + FXCSMADD f15, B4, A4, f15 + nop + +## 3 ## + + FXCPMADD f0, B5, A7, f0 + nop + FXCSMADD f4, B5, A7, f4 + nop + FXCPMADD f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A7, f12 + nop + + FXCPMADD f1, B5, A2, f1 + nop + FXCSMADD f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + nop + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B5, A8, f2 + nop + FXCSMADD f6, B5, A8, f6 + nop + FXCPMADD f10, B2, A8, f10 + nop + FXCSMADD f14, B2, A8, f14 + nop + + FXCPMADD f3, B5, A4, f3 + nop + FXCSMADD f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + nop + FXCSMADD f15, B2, A4, f15 + nop + +## 4 ## + + FXCPMADD f0, B6, A9, f0 + nop + FXCSMADD f4, B6, A9, f4 + nop + FXCPMADD f8, B4, A9, f8 + nop + FXCSMADD f12, B4, A9, f12 + nop + + FXCPMADD f1, B6, A2, f1 + nop + FXCSMADD f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + nop + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B6, A10, f2 + nop + FXCSMADD f6, B6, A10, f6 + nop + FXCPMADD f10, B4, A10, f10 + nop + FXCSMADD f14, B4, A10, f14 + nop + + FXCPMADD f3, B6, A4, f3 + nop + FXCSMADD f7, B6, A4, f7 + nop + FXCPMADD f11, B4, A4, f11 + nop + FXCSMADD f15, B4, A4, f15 + nop + .align 4 + +.L14: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L18 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L18 +#endif + +.L15: + LFPDUX A2, AO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A10, BO, INC4 + LFPDUX B4, BO2, INC4 + bdz- .L17 + .align 4 + +.L16: + FXCPMADD f0, A10, A2, f0 + FXCSMADD f4, A10, A2, f4 + FXCPMADD f8, B4, A2, f8 + FXCSMADD f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + FXCPMADD f1, A10, A4, f1 + FXCSMADD f5, A10, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + FXCPMADD f2, A10, A2, f2 + FXCSMADD f6, A10, A2, f6 + FXCPMADD f10, B4, A2, f10 + FXCSMADD f14, B4, A2, f14 + LFPDUX A2, AO, INC4 + + FXCPMADD f3, A10, A4, f3 + FXCSMADD f7, A10, A4, f7 + LFPDUX A10, BO, INC4 + FXCPMADD f11, B4, A4, f11 + FXCSMADD f15, B4, A4, f15 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + bdnz+ .L16 + .align 4 + +.L17: + FXCPMADD f0, A10, A2, f0 + FXCSMADD f4, A10, A2, f4 + FXCPMADD f8, B4, A2, f8 + FXCSMADD f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + FXCPMADD f1, A10, A4, f1 + FXCSMADD f5, A10, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + FXCPMADD f2, A10, A2, f2 + FXCSMADD f6, A10, A2, f6 + FXCPMADD f10, B4, A2, f10 + FXCSMADD f14, B4, A2, f14 + + FXCPMADD f3, A10, A4, f3 + FXCSMADD f7, A10, A4, f7 + FXCPMADD f11, B4, A4, f11 + FXCSMADD f15, B4, A4, f15 + .align 4 + +.L18: + fpadd f0, f0, f4 + fpadd f8, f8, f12 + fpadd f1, f1, f5 + fpadd f9, f9, f13 + + fpadd f2, f2, f6 + fpadd f10, f10, f14 + fpadd f3, f3, f7 + fpadd f11, f11, f15 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC4 + LFPDUX f20, BO2, INC4 + LFPDUX f17, BO, INC4 + LFPDUX f21, BO2, INC4 + LFPDUX f18, BO, INC4 + LFPDUX f22, BO2, INC4 + LFPDUX f19, BO, INC4 + LFPDUX f23, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + LFPDUX f20, AO, INC4 + LFPDUX f21, AO2, INC4 + LFPDUX f22, AO, INC4 + LFPDUX f23, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE +#endif + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 + + fpsub f8, f20, f8 + fpsub f9, f21, f9 + fpsub f10, f22, f10 + fpsub f11, f23, f11 + +#ifdef LN + LFPDUX A1, AO, INC4 + add AO2, AO2, INC4 + add AO, AO, INC4 + add AO2, AO2, INC4 + + LFPDUX A2, AO, INC4 + LFPDUX A3, AO2, INC4 + add AO, AO, INC4 + add AO2, AO2, INC4 + + LFPDUX A4, AO, INC4 + LFPDUX A5, AO2, INC4 + LFPDUX A6, AO, INC4 + add AO2, AO2, INC4 + + LFPDUX A7, AO, INC4 + LFPDUX A8, AO2, INC4 + LFPDUX A9, AO, INC4 + LFPDUX A10, AO2, INC4 + + subi AO, AO, 32 * SIZE + subi AO2, AO2, 32 * SIZE + + fxpmul f4, A10, f3 + fxpmul f5, A10, f11 + FXCXNPMA f3, A10, f3, f4 + FXCXNPMA f11, A10, f11, f5 + + fxcpnmsub f2, A9, f3, f2 + fxcpnmsub f10, A9, f11, f10 + FXCXNSMA f2, A9, f3, f2 + FXCXNSMA f10, A9, f11, f10 + + fxcpnmsub f1, A8, f3, f1 + fxcpnmsub f9, A8, f11, f9 + FXCXNSMA f1, A8, f3, f1 + FXCXNSMA f9, A8, f11, f9 + + fxcpnmsub f0, A7, f3, f0 + fxcpnmsub f8, A7, f11, f8 + FXCXNSMA f0, A7, f3, f0 + FXCXNSMA f8, A7, f11, f8 + + fxpmul f4, A6, f2 + fxpmul f5, A6, f10 + FXCXNPMA f2, A6, f2, f4 + FXCXNPMA f10, A6, f10, f5 + + fxcpnmsub f1, A5, f2, f1 + fxcpnmsub f9, A5, f10, f9 + FXCXNSMA f1, A5, f2, f1 + FXCXNSMA f9, A5, f10, f9 + + fxcpnmsub f0, A4, f2, f0 + fxcpnmsub f8, A4, f10, f8 + FXCXNSMA f0, A4, f2, f0 + FXCXNSMA f8, A4, f10, f8 + + fxpmul f4, A3, f1 + fxpmul f5, A3, f9 + FXCXNPMA f1, A3, f1, f4 + FXCXNPMA f9, A3, f9, f5 + + fxcpnmsub f0, A2, f1, f0 + fxcpnmsub f8, A2, f9, f8 + FXCXNSMA f0, A2, f1, f0 + FXCXNSMA f8, A2, f9, f8 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f8 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f8, A1, f8, f5 +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX A4, AO2, INC4 + + add AO, AO, INC4 + LFPDUX A5, AO2, INC4 + LFPDUX A6, AO, INC4 + LFPDUX A7, AO2, INC4 + + add AO, AO, INC4 + add AO2, AO2, INC4 + LFPDUX A8, AO, INC4 + LFPDUX A9, AO2, INC4 + + add AO, AO, INC4 + add AO2, AO2, INC4 + add AO, AO, INC4 + LFPDUX A10, AO2, INC4 + + subi AO, AO, 32 * SIZE + subi AO2, AO2, 32 * SIZE + + fxpmul f4, A1, f0 + fxpmul f5, A1, f8 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f8, A1, f8, f5 + + fxcpnmsub f1, A2, f0, f1 + fxcpnmsub f9, A2, f8, f9 + FXCXNSMA f1, A2, f0, f1 + FXCXNSMA f9, A2, f8, f9 + + fxcpnmsub f2, A3, f0, f2 + fxcpnmsub f10, A3, f8, f10 + FXCXNSMA f2, A3, f0, f2 + FXCXNSMA f10, A3, f8, f10 + + fxcpnmsub f3, A4, f0, f3 + fxcpnmsub f11, A4, f8, f11 + FXCXNSMA f3, A4, f0, f3 + FXCXNSMA f11, A4, f8, f11 + + fxpmul f6, A5, f1 + fxpmul f7, A5, f9 + FXCXNPMA f1, A5, f1, f6 + FXCXNPMA f9, A5, f9, f7 + + fxcpnmsub f2, A6, f1, f2 + fxcpnmsub f10, A6, f9, f10 + FXCXNSMA f2, A6, f1, f2 + FXCXNSMA f10, A6, f9, f10 + + fxcpnmsub f3, A7, f1, f3 + fxcpnmsub f11, A7, f9, f11 + FXCXNSMA f3, A7, f1, f3 + FXCXNSMA f11, A7, f9, f11 + + fxpmul f4, A8, f2 + fxpmul f5, A8, f10 + FXCXNPMA f2, A8, f2, f4 + FXCXNPMA f10, A8, f10, f5 + + fxcpnmsub f3, A9, f2, f3 + fxcpnmsub f11, A9, f10, f11 + FXCXNSMA f3, A9, f2, f3 + FXCXNSMA f11, A9, f10, f11 + + fxpmul f6, A10, f3 + fxpmul f7, A10, f11 + FXCXNPMA f3, A10, f3, f6 + FXCXNPMA f11, A10, f11, f7 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + add BO, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + fxpmul f6, A1, f2 + fxpmul f7, A1, f3 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + FXCXNPMA f2, A1, f2, f6 + FXCXNPMA f3, A1, f3, f7 + + fxcpnmsub f8, A2, f0, f8 + fxcpnmsub f9, A2, f1, f9 + fxcpnmsub f10, A2, f2, f10 + fxcpnmsub f11, A2, f3, f11 + + FXCXNSMA f8, A2, f0, f8 + FXCXNSMA f9, A2, f1, f9 + FXCXNSMA f10, A2, f2, f10 + FXCXNSMA f11, A2, f3, f11 + + fxpmul f4, A3, f8 + fxpmul f5, A3, f9 + fxpmul f6, A3, f10 + fxpmul f7, A3, f11 + + FXCXNPMA f8, A3, f8, f4 + FXCXNPMA f9, A3, f9, f5 + FXCXNPMA f10, A3, f10, f6 + FXCXNPMA f11, A3, f11, f7 +#endif + +#ifdef RT + LFPDUX A1, BO, INC4 + add BO2, BO2, INC4 + LFPDUX A2, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A3, f8 + fxpmul f5, A3, f9 + fxpmul f6, A3, f10 + fxpmul f7, A3, f11 + + FXCXNPMA f8, A3, f8, f4 + FXCXNPMA f9, A3, f9, f5 + FXCXNPMA f10, A3, f10, f6 + FXCXNPMA f11, A3, f11, f7 + + fxcpnmsub f0, A2, f8, f0 + fxcpnmsub f1, A2, f9, f1 + fxcpnmsub f2, A2, f10, f2 + fxcpnmsub f3, A2, f11, f3 + + FXCXNSMA f0, A2, f8, f0 + FXCXNSMA f1, A2, f9, f1 + FXCXNSMA f2, A2, f10, f2 + FXCXNSMA f3, A2, f11, f3 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + fxpmul f6, A1, f2 + fxpmul f7, A1, f3 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + FXCXNPMA f2, A1, f2, f6 + FXCXNPMA f3, A1, f3, f7 +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f1, BO, INC4 + STFPDUX f9, BO2, INC4 + STFPDUX f2, BO, INC4 + STFPDUX f10, BO2, INC4 + STFPDUX f3, BO, INC4 + STFPDUX f11, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE +#else + STFPDUX f0, AO, INC4 + STFPDUX f1, AO2, INC4 + STFPDUX f2, AO, INC4 + STFPDUX f3, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f9, AO2, INC4 + STFPDUX f10, AO, INC4 + STFPDUX f11, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + + STFDUX f8, CO2, INC + STFSDUX f8, CO2, INC + STFDUX f9, CO2, INC + STFSDUX f9, CO2, INC + STFDUX f10, CO2, INC + STFSDUX f10, CO2, INC + STFDUX f11, CO2, INC + STFSDUX f11, CO2, INC + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L11 + .align 4 + +.L49: +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 4 * SIZE +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + + addic. J, J, -1 + bgt+ .L10 + .align 4 + +.L50: + andi. J, N, 1 + beq .L999 + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + addi AORIG, A, -2 * SIZE +#else + addi AO, A, -2 * SIZE +#endif +#ifndef RT + add C, CO2, LDC +#endif + li r0, FZERO + lfpsx f0, SP, r0 + + andi. I, M, 1 + beq .L60 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L74 +#else +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L74 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdz- .L73 + .align 4 + +.L72: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + FXCPMADD f2, B2, A2, f2 + FXCSMADD f3, B2, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f1, B3, A3, f1 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + FXCPMADD f2, B4, A4, f2 + FXCSMADD f3, B4, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f1, B5, A5, f1 + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + FXCPMADD f2, B6, A6, f2 + FXCSMADD f3, B6, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f1, A9, A7, f1 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + FXCPMADD f2, A10, A8, f2 + FXCSMADD f3, A10, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + + bdnz+ .L72 + .align 4 + +.L73: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A2, f2 + FXCSMADD f3, B2, A2, f3 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f1, B3, A3, f1 + FXCPMADD f2, B4, A4, f2 + FXCSMADD f3, B4, A4, f3 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f1, B5, A5, f1 + FXCPMADD f2, B6, A6, f2 + FXCSMADD f3, B6, A6, f3 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f1, A9, A7, f1 + FXCPMADD f2, A10, A8, f2 + FXCSMADD f3, A10, A8, f3 + .align 4 + +.L74: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L78 +#else + andi. r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L78 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdz- .L77 + .align 4 + +.L76: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L76 + .align 4 + +.L77: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + .align 4 + +.L78: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fpadd f0, f0, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC2 +#else + LFPDX f16, AO, INC2 +#endif + + fpsub f0, f16, f0 + +#ifdef LN + LFPDX A1, AO, INC2 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LT + LFPDX A1, AO, INC2 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC2 +#else + STFPDX f0, AO, INC2 +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L60: + andi. I, M, 2 + beq .L70 + +#if defined(LT) || defined(RN) + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L64 +#else +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f1, f0 + addi BO, BO, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L64 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B4, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L63 + .align 4 + +.L62: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + FXCPMADD f0, B2, A3, f0 + FXCSMADD f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + FXCPMADD f1, B2, A4, f1 + FXCSMADD f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + FXCPMADD f0, B3, A5, f0 + FXCSMADD f2, B3, A5, f2 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B3, A6, f1 + FXCSMADD f3, B3, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + FXCPMADD f0, B4, A7, f0 + FXCSMADD f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + FXCPMADD f1, B4, A8, f1 + FXCSMADD f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L62 + .align 4 + +.L63: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + + FXCPMADD f0, B2, A3, f0 + FXCSMADD f2, B2, A3, f2 + FXCPMADD f1, B2, A4, f1 + FXCSMADD f3, B2, A4, f3 + + FXCPMADD f0, B3, A5, f0 + FXCSMADD f2, B3, A5, f2 + FXCPMADD f1, B3, A6, f1 + FXCSMADD f3, B3, A6, f3 + + FXCPMADD f0, B4, A7, f0 + FXCSMADD f2, B4, A7, f2 + FXCPMADD f1, B4, A8, f1 + FXCSMADD f3, B4, A8, f3 + .align 4 + +.L64: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L68 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L68 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdz- .L67 + .align 4 + +.L66: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdnz+ .L66 + .align 4 + +.L67: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + .align 4 + +.L68: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + + subi BO, BO, 4 * SIZE +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + + subi AO, AO, 4 * SIZE +#endif + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + +#ifdef LN + LFPDUX A1, AO, INC2 + add AO, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + + subi AO, AO, 8 * SIZE + + fxpmul f4, A3, f1 + FXCXNPMA f1, A3, f1, f4 + + fxcpnmsub f0, A2, f1, f0 + FXCXNSMA f0, A2, f1, f0 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + add AO, AO, INC2 + LFPDUX A3, AO, INC2 + + subi AO, AO, 8 * SIZE + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 + + fxcpnmsub f1, A2, f0, f1 + FXCXNSMA f1, A2, f0, f1 + + fxpmul f6, A3, f1 + FXCXNPMA f1, A3, f1, f6 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + + subi BO, BO, 4 * SIZE +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + + subi AO, AO, 4 * SIZE +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L70: + srawi. I, M, 2 + ble .L89 + .align 4 + +.L51: +#if defined(LT) || defined(RN) + fpmr f4, f0 + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + fpmr f3, f0 + fpmr f7, f0 + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L54 +#else + +#ifdef LN + slwi r0, K, 2 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f4, f0 + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + fpmr f3, f0 + fpmr f7, f0 + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L54 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L53 + .align 4 + +.L52: + FXCPMADD f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B2, A5, f0 + LFPDUX B1, BO, INC2 + FXCSMADD f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B2, A6, f1 + nop + FXCSMADD f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B2, A7, f2 + nop + FXCSMADD f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B2, A8, f3 + nop + FXCSMADD f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + FXCPMADD f0, B3, A1, f0 + LFPDUX B2, BO, INC2 + FXCSMADD f4, B3, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B3, A3, f2 + nop + FXCSMADD f6, B3, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B4, A5, f0 + LFPDUX B3, BO, INC2 + FXCSMADD f4, B4, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B4, A6, f1 + nop + FXCSMADD f5, B4, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B4, A7, f2 + nop + FXCSMADD f6, B4, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B4, A8, f3 + nop + FXCSMADD f7, B4, A8, f7 + LFPDUX A8, AO, INC2 + bdnz+ .L52 + .align 4 + +.L53: + FXCPMADD f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B2, A5, f0 + nop + FXCSMADD f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B2, A6, f1 + nop + FXCSMADD f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B2, A7, f2 + nop + FXCSMADD f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B2, A8, f3 + nop + FXCSMADD f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + FXCPMADD f0, B3, A1, f0 + FXCSMADD f4, B3, A1, f4 + FXCPMADD f1, B3, A2, f1 + FXCSMADD f5, B3, A2, f5 + + FXCPMADD f2, B3, A3, f2 + FXCSMADD f6, B3, A3, f6 + FXCPMADD f3, B3, A4, f3 + FXCSMADD f7, B3, A4, f7 + + FXCPMADD f0, B4, A5, f0 + FXCSMADD f4, B4, A5, f4 + FXCPMADD f1, B4, A6, f1 + FXCSMADD f5, B4, A6, f5 + + FXCPMADD f2, B4, A7, f2 + FXCSMADD f6, B4, A7, f6 + FXCPMADD f3, B4, A8, f3 + FXCSMADD f7, B4, A8, f7 + .align 4 + +.L54: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L58 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L58 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + bdz- .L57 + .align 4 + +.L56: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L56 + .align 4 + +.L57: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + + FXCPMADD f2, B1, A3, f2 + FXCSMADD f6, B1, A3, f6 + FXCPMADD f3, B1, A4, f3 + FXCSMADD f7, B1, A4, f7 + .align 4 + +.L58: + fpadd f0, f0, f4 + fpadd f1, f1, f5 + fpadd f2, f2, f6 + fpadd f3, f3, f7 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + LFPDUX f18, BO, INC2 + LFPDUX f19, BO, INC2 + + subi BO, BO, 8 * SIZE +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + LFPDUX f18, AO, INC2 + LFPDUX f19, AO, INC2 + + subi AO, AO, 8 * SIZE +#endif + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 + +#ifdef LN + LFPDUX A1, AO, INC2 + add AO, AO, INC2 + add AO, AO, INC2 + add AO, AO, INC2 + + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + add AO, AO, INC2 + add AO, AO, INC2 + + LFPDUX A4, AO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + add AO, AO, INC2 + + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A9, AO, INC2 + LFPDUX A10, AO, INC2 + + subi AO, AO, 32 * SIZE + + fxpmul f4, A10, f3 + FXCXNPMA f3, A10, f3, f4 + + fxcpnmsub f2, A9, f3, f2 + FXCXNSMA f2, A9, f3, f2 + + fxcpnmsub f1, A8, f3, f1 + FXCXNSMA f1, A8, f3, f1 + + fxcpnmsub f0, A7, f3, f0 + FXCXNSMA f0, A7, f3, f0 + + fxpmul f4, A6, f2 + FXCXNPMA f2, A6, f2, f4 + + fxcpnmsub f1, A5, f2, f1 + FXCXNSMA f1, A5, f2, f1 + + fxcpnmsub f0, A4, f2, f0 + FXCXNSMA f0, A4, f2, f0 + + fxpmul f4, A3, f1 + FXCXNPMA f1, A3, f1, f4 + + fxcpnmsub f0, A2, f1, f0 + FXCXNSMA f0, A2, f1, f0 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + add AO, AO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + + add AO, AO, INC2 + add AO, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A9, AO, INC2 + + add AO, AO, INC2 + add AO, AO, INC2 + add AO, AO, INC2 + LFPDUX A10, AO, INC2 + + subi AO, AO, 32 * SIZE + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 + + fxcpnmsub f1, A2, f0, f1 + FXCXNSMA f1, A2, f0, f1 + + fxcpnmsub f2, A3, f0, f2 + FXCXNSMA f2, A3, f0, f2 + + fxcpnmsub f3, A4, f0, f3 + FXCXNSMA f3, A4, f0, f3 + + fxpmul f6, A5, f1 + FXCXNPMA f1, A5, f1, f6 + + fxcpnmsub f2, A6, f1, f2 + FXCXNSMA f2, A6, f1, f2 + + fxcpnmsub f3, A7, f1, f3 + FXCXNSMA f3, A7, f1, f3 + + fxpmul f4, A8, f2 + FXCXNPMA f2, A8, f2, f4 + + fxcpnmsub f3, A9, f2, f3 + FXCXNSMA f3, A9, f2, f3 + + fxpmul f6, A10, f3 + FXCXNPMA f3, A10, f3, f6 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + fxpmul f6, A1, f2 + fxpmul f7, A1, f3 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + FXCXNPMA f2, A1, f2, f6 + FXCXNPMA f3, A1, f3, f7 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + fxpmul f6, A1, f2 + fxpmul f7, A1, f3 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + FXCXNPMA f2, A1, f2, f6 + FXCXNPMA f3, A1, f3, f7 +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + STFPDUX f2, BO, INC2 + STFPDUX f3, BO, INC2 + + subi BO, BO, 8 * SIZE +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + STFPDUX f2, AO, INC2 + STFPDUX f3, AO, INC2 + + subi AO, AO, 8 * SIZE +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + +#ifdef LN + subi CO1, CO1, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L51 + .align 4 + +.L89: +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 2 * SIZE +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +.L999: + addi SP, SP, 20 + + lwzu r14, 4(SP) + lwzu r15, 4(SP) + + lwzu r16, 4(SP) + lwzu r17, 4(SP) + lwzu r18, 4(SP) + lwzu r19, 4(SP) + + lwzu r20, 4(SP) + lwzu r21, 4(SP) + lwzu r22, 4(SP) + lwzu r23, 4(SP) + + lwzu r24, 4(SP) + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f31, SP, r0 + lfpdux f30, SP, r0 + lfpdux f29, SP, r0 + lfpdux f28, SP, r0 + lfpdux f27, SP, r0 + lfpdux f26, SP, r0 + lfpdux f25, SP, r0 + lfpdux f24, SP, r0 + lfpdux f23, SP, r0 + lfpdux f22, SP, r0 + lfpdux f21, SP, r0 + lfpdux f20, SP, r0 + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + .align 4 + + + EPILOGUE +#endif diff --git a/kernel/power/ztrsm_kernel_hummer_LT.S b/kernel/power/ztrsm_kernel_hummer_LT.S new file mode 100644 index 0000000..6da6c72 --- /dev/null +++ b/kernel/power/ztrsm_kernel_hummer_LT.S @@ -0,0 +1,2962 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#undef ZERO + +#define ALPHA 0 +#define FZERO 16 + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#endif + +#define TEMP r11 +#define AORIG r12 +#define KK r14 +#define INCM1 r15 +#define INCM3 r16 +#define INCM5 r17 +#define INCM7 r18 +#define INC2 r19 +#define INC r20 +#define INC4 r21 + +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define AO2 r26 +#define BO2 r27 + +#define CO1 r28 +#define CO2 r29 +#define ZERO r31 + +#ifndef NEEDPARAM + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define A7 f22 +#define A8 f23 +#define A9 f24 +#define A10 f25 + +#define B1 f26 +#define B2 f27 +#define B3 f28 +#define B4 f29 +#define B5 f30 +#define B6 f31 + +#define AP B6 + +#ifndef CONJ +#define FXCPMADD fxcpmadd +#define FXCSMADD fxcxnpma +#else +#if defined(LN) || defined(LT) +#define FXCPMADD fxcpnsma +#define FXCSMADD fxcxma +#else +#define FXCPMADD fxcpmadd +#define FXCSMADD fxcxnsma +#endif +#endif + +#ifndef CONJ +#define FXCXNPMA fxcxnpma +#define FXCXNSMA fxcxnsma +#else +#define FXCXNPMA fxcxnsma +#define FXCXNSMA fxcxnpma +#endif + + + PROLOGUE + PROFCODE + + li r0, -16 + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + stfpdux f16, SP, r0 + stfpdux f17, SP, r0 + stfpdux f18, SP, r0 + stfpdux f19, SP, r0 + stfpdux f20, SP, r0 + stfpdux f21, SP, r0 + stfpdux f22, SP, r0 + stfpdux f23, SP, r0 + stfpdux f24, SP, r0 + stfpdux f25, SP, r0 + stfpdux f26, SP, r0 + stfpdux f27, SP, r0 + stfpdux f28, SP, r0 + stfpdux f29, SP, r0 + stfpdux f30, SP, r0 + stfpdux f31, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + stwu r29, -4(SP) + stwu r28, -4(SP) + + stwu r27, -4(SP) + stwu r26, -4(SP) + stwu r25, -4(SP) + stwu r24, -4(SP) + + stwu r23, -4(SP) + stwu r22, -4(SP) + stwu r21, -4(SP) + stwu r20, -4(SP) + + stwu r19, -4(SP) + stwu r18, -4(SP) + stwu r17, -4(SP) + stwu r16, -4(SP) + + stwu r15, -4(SP) + stwu r14, -4(SP) + + li r0, 0 + stwu r0, -4(SP) + stwu r0, -4(SP) + + stfdu f2, -8(SP) + stfdu f1, -8(SP) + + slwi LDC, LDC, ZBASE_SHIFT + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + li INC, 1 * SIZE + li INC2, 2 * SIZE + li INC4, 4 * SIZE + li INCM1, -1 * SIZE + li INCM3, -3 * SIZE + li INCM5, -5 * SIZE + li INCM7, -7 * SIZE + + addi C, C, - 1 * SIZE + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + srawi. J, N, 1 + ble .L50 + .align 4 + +.L10: +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + addi AORIG, A, -4 * SIZE +#else + addi AO, A, -4 * SIZE +#endif +#ifndef RT + add C, CO2, LDC +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 2 + ble .L20 + .align 4 + +.L11: +#if defined(LT) || defined(RN) + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + + srawi. r0, KK, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#else + +#ifdef LN + slwi r0, K, 2 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + srawi. r0, TEMP, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#endif + + LFPDUX A1, AO, INC4 + fpmr f5, f0 + LFPDUX A3, AO, INC4 + fpmr f9, f0 + LFPDUX B1, BO, INC4 + fpmr f13, f0 + + LFPDUX A5, AO, INC4 + fpmr f2, f0 + LFPDUX A6, AO, INC4 + fpmr f6, f0 + LFPDUX B3, BO, INC4 + fpmr f10, f0 + LFPDUX A7, AO, INC4 + fpmr f14, f0 + + LFPDUX A8, AO, INC4 + fpmr f3, f0 + LFPDUX B5, BO, INC4 + fpmr f7, f0 + LFPDUX A9, AO, INC4 + fpmr f11, f0 + LFPDUX A2, AO2, INC4 + fpmr f15, f0 + LFPDUX B2, BO2, INC4 + bdz- .L13 + .align 4 + +.L12: + +## 1 ## + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + nop + FXCPMADD f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + nop + FXCPMADD f10, B2, A3, f10 + nop + FXCSMADD f14, B2, A3, f14 + nop + + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + LFPDUX A1, AO, INC4 + FXCSMADD f15, B2, A4, f15 + nop + +## 2 ## + + FXCPMADD f0, B3, A5, f0 + nop + FXCSMADD f4, B3, A5, f4 + nop + FXCPMADD f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A5, f12 + LFPDUX B1, BO, INC4 + + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + LFPDUX A3, AO, INC4 + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B3, A6, f2 + nop + FXCSMADD f6, B3, A6, f6 + nop + FXCPMADD f10, B4, A6, f10 + nop + FXCSMADD f14, B4, A6, f14 + nop + + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B4, A4, f11 + LFPDUX A5, AO, INC4 + FXCSMADD f15, B4, A4, f15 + nop + +## 3 ## + + FXCPMADD f0, B5, A7, f0 + nop + FXCSMADD f4, B5, A7, f4 + nop + FXCPMADD f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A7, f12 + LFPDUX B3, BO, INC4 + + FXCPMADD f1, B5, A2, f1 + nop + FXCSMADD f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A6, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B5, A8, f2 + nop + FXCSMADD f6, B5, A8, f6 + nop + FXCPMADD f10, B2, A8, f10 + nop + FXCSMADD f14, B2, A8, f14 + nop + + FXCPMADD f3, B5, A4, f3 + nop + FXCSMADD f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + LFPDUX A7, AO, INC4 + FXCSMADD f15, B2, A4, f15 + nop + +## 4 ## + FXCPMADD f0, B6, A9, f0 + nop + FXCSMADD f4, B6, A9, f4 + nop + FXCPMADD f8, B4, A9, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A9, f12 + LFPDUX B5, BO, INC4 + + FXCPMADD f1, B6, A2, f1 + nop + FXCSMADD f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + LFPDUX A8, AO, INC4 + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B6, A10, f2 + nop + FXCSMADD f6, B6, A10, f6 + nop + FXCPMADD f10, B4, A10, f10 + nop + FXCSMADD f14, B4, A10, f14 + nop + + FXCPMADD f3, B6, A4, f3 + LFPDUX A2, AO2, INC4 + FXCSMADD f7, B6, A4, f7 + LFPDUX A9, AO, INC4 + FXCPMADD f11, B4, A4, f11 + nop + FXCSMADD f15, B4, A4, f15 + bdnz+ .L12 + .align 4 + +.L13: +## 1 ## + + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + nop + FXCPMADD f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + nop + FXCPMADD f10, B2, A3, f10 + nop + FXCSMADD f14, B2, A3, f14 + nop + + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + nop + FXCSMADD f15, B2, A4, f15 + nop + +## 2 ## + + FXCPMADD f0, B3, A5, f0 + nop + FXCSMADD f4, B3, A5, f4 + nop + FXCPMADD f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A5, f12 + nop + + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + nop + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B3, A6, f2 + nop + FXCSMADD f6, B3, A6, f6 + nop + FXCPMADD f10, B4, A6, f10 + nop + FXCSMADD f14, B4, A6, f14 + nop + + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B4, A4, f11 + nop + FXCSMADD f15, B4, A4, f15 + nop + +## 3 ## + + FXCPMADD f0, B5, A7, f0 + nop + FXCSMADD f4, B5, A7, f4 + nop + FXCPMADD f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A7, f12 + nop + + FXCPMADD f1, B5, A2, f1 + nop + FXCSMADD f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + nop + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B5, A8, f2 + nop + FXCSMADD f6, B5, A8, f6 + nop + FXCPMADD f10, B2, A8, f10 + nop + FXCSMADD f14, B2, A8, f14 + nop + + FXCPMADD f3, B5, A4, f3 + nop + FXCSMADD f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + nop + FXCSMADD f15, B2, A4, f15 + nop + +## 4 ## + + FXCPMADD f0, B6, A9, f0 + nop + FXCSMADD f4, B6, A9, f4 + nop + FXCPMADD f8, B4, A9, f8 + nop + FXCSMADD f12, B4, A9, f12 + nop + + FXCPMADD f1, B6, A2, f1 + nop + FXCSMADD f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + nop + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B6, A10, f2 + nop + FXCSMADD f6, B6, A10, f6 + nop + FXCPMADD f10, B4, A10, f10 + nop + FXCSMADD f14, B4, A10, f14 + nop + + FXCPMADD f3, B6, A4, f3 + nop + FXCSMADD f7, B6, A4, f7 + nop + FXCPMADD f11, B4, A4, f11 + nop + FXCSMADD f15, B4, A4, f15 + nop + .align 4 + +.L14: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L18 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L18 +#endif + +.L15: + LFPDUX A2, AO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A10, BO, INC4 + LFPDUX B4, BO2, INC4 + bdz- .L17 + .align 4 + +.L16: + FXCPMADD f0, A10, A2, f0 + FXCSMADD f4, A10, A2, f4 + FXCPMADD f8, B4, A2, f8 + FXCSMADD f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + FXCPMADD f1, A10, A4, f1 + FXCSMADD f5, A10, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + FXCPMADD f2, A10, A2, f2 + FXCSMADD f6, A10, A2, f6 + FXCPMADD f10, B4, A2, f10 + FXCSMADD f14, B4, A2, f14 + LFPDUX A2, AO, INC4 + + FXCPMADD f3, A10, A4, f3 + FXCSMADD f7, A10, A4, f7 + LFPDUX A10, BO, INC4 + FXCPMADD f11, B4, A4, f11 + FXCSMADD f15, B4, A4, f15 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + bdnz+ .L16 + .align 4 + +.L17: + FXCPMADD f0, A10, A2, f0 + FXCSMADD f4, A10, A2, f4 + FXCPMADD f8, B4, A2, f8 + FXCSMADD f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + FXCPMADD f1, A10, A4, f1 + FXCSMADD f5, A10, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + FXCPMADD f2, A10, A2, f2 + FXCSMADD f6, A10, A2, f6 + FXCPMADD f10, B4, A2, f10 + FXCSMADD f14, B4, A2, f14 + + FXCPMADD f3, A10, A4, f3 + FXCSMADD f7, A10, A4, f7 + FXCPMADD f11, B4, A4, f11 + FXCSMADD f15, B4, A4, f15 + .align 4 + +.L18: + fpadd f0, f0, f4 + fpadd f8, f8, f12 + fpadd f1, f1, f5 + fpadd f9, f9, f13 + + fpadd f2, f2, f6 + fpadd f10, f10, f14 + fpadd f3, f3, f7 + fpadd f11, f11, f15 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC4 + LFPDUX f20, BO2, INC4 + LFPDUX f17, BO, INC4 + LFPDUX f21, BO2, INC4 + LFPDUX f18, BO, INC4 + LFPDUX f22, BO2, INC4 + LFPDUX f19, BO, INC4 + LFPDUX f23, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + LFPDUX f20, AO, INC4 + LFPDUX f21, AO2, INC4 + LFPDUX f22, AO, INC4 + LFPDUX f23, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE +#endif + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 + + fpsub f8, f20, f8 + fpsub f9, f21, f9 + fpsub f10, f22, f10 + fpsub f11, f23, f11 + +#ifdef LN + LFPDUX A1, AO, INC4 + add AO2, AO2, INC4 + add AO, AO, INC4 + add AO2, AO2, INC4 + + LFPDUX A2, AO, INC4 + LFPDUX A3, AO2, INC4 + add AO, AO, INC4 + add AO2, AO2, INC4 + + LFPDUX A4, AO, INC4 + LFPDUX A5, AO2, INC4 + LFPDUX A6, AO, INC4 + add AO2, AO2, INC4 + + LFPDUX A7, AO, INC4 + LFPDUX A8, AO2, INC4 + LFPDUX A9, AO, INC4 + LFPDUX A10, AO2, INC4 + + subi AO, AO, 32 * SIZE + subi AO2, AO2, 32 * SIZE + + fxpmul f4, A10, f3 + fxpmul f5, A10, f11 + FXCXNPMA f3, A10, f3, f4 + FXCXNPMA f11, A10, f11, f5 + + fxcpnmsub f2, A9, f3, f2 + fxcpnmsub f10, A9, f11, f10 + FXCXNSMA f2, A9, f3, f2 + FXCXNSMA f10, A9, f11, f10 + + fxcpnmsub f1, A8, f3, f1 + fxcpnmsub f9, A8, f11, f9 + FXCXNSMA f1, A8, f3, f1 + FXCXNSMA f9, A8, f11, f9 + + fxcpnmsub f0, A7, f3, f0 + fxcpnmsub f8, A7, f11, f8 + FXCXNSMA f0, A7, f3, f0 + FXCXNSMA f8, A7, f11, f8 + + fxpmul f4, A6, f2 + fxpmul f5, A6, f10 + FXCXNPMA f2, A6, f2, f4 + FXCXNPMA f10, A6, f10, f5 + + fxcpnmsub f1, A5, f2, f1 + fxcpnmsub f9, A5, f10, f9 + FXCXNSMA f1, A5, f2, f1 + FXCXNSMA f9, A5, f10, f9 + + fxcpnmsub f0, A4, f2, f0 + fxcpnmsub f8, A4, f10, f8 + FXCXNSMA f0, A4, f2, f0 + FXCXNSMA f8, A4, f10, f8 + + fxpmul f4, A3, f1 + fxpmul f5, A3, f9 + FXCXNPMA f1, A3, f1, f4 + FXCXNPMA f9, A3, f9, f5 + + fxcpnmsub f0, A2, f1, f0 + fxcpnmsub f8, A2, f9, f8 + FXCXNSMA f0, A2, f1, f0 + FXCXNSMA f8, A2, f9, f8 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f8 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f8, A1, f8, f5 +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX A4, AO2, INC4 + + add AO, AO, INC4 + LFPDUX A5, AO2, INC4 + LFPDUX A6, AO, INC4 + LFPDUX A7, AO2, INC4 + + add AO, AO, INC4 + add AO2, AO2, INC4 + LFPDUX A8, AO, INC4 + LFPDUX A9, AO2, INC4 + + add AO, AO, INC4 + add AO2, AO2, INC4 + add AO, AO, INC4 + LFPDUX A10, AO2, INC4 + + subi AO, AO, 32 * SIZE + subi AO2, AO2, 32 * SIZE + + fxpmul f4, A1, f0 + fxpmul f5, A1, f8 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f8, A1, f8, f5 + + fxcpnmsub f1, A2, f0, f1 + fxcpnmsub f9, A2, f8, f9 + FXCXNSMA f1, A2, f0, f1 + FXCXNSMA f9, A2, f8, f9 + + fxcpnmsub f2, A3, f0, f2 + fxcpnmsub f10, A3, f8, f10 + FXCXNSMA f2, A3, f0, f2 + FXCXNSMA f10, A3, f8, f10 + + fxcpnmsub f3, A4, f0, f3 + fxcpnmsub f11, A4, f8, f11 + FXCXNSMA f3, A4, f0, f3 + FXCXNSMA f11, A4, f8, f11 + + fxpmul f6, A5, f1 + fxpmul f7, A5, f9 + FXCXNPMA f1, A5, f1, f6 + FXCXNPMA f9, A5, f9, f7 + + fxcpnmsub f2, A6, f1, f2 + fxcpnmsub f10, A6, f9, f10 + FXCXNSMA f2, A6, f1, f2 + FXCXNSMA f10, A6, f9, f10 + + fxcpnmsub f3, A7, f1, f3 + fxcpnmsub f11, A7, f9, f11 + FXCXNSMA f3, A7, f1, f3 + FXCXNSMA f11, A7, f9, f11 + + fxpmul f4, A8, f2 + fxpmul f5, A8, f10 + FXCXNPMA f2, A8, f2, f4 + FXCXNPMA f10, A8, f10, f5 + + fxcpnmsub f3, A9, f2, f3 + fxcpnmsub f11, A9, f10, f11 + FXCXNSMA f3, A9, f2, f3 + FXCXNSMA f11, A9, f10, f11 + + fxpmul f6, A10, f3 + fxpmul f7, A10, f11 + FXCXNPMA f3, A10, f3, f6 + FXCXNPMA f11, A10, f11, f7 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + add BO, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + fxpmul f6, A1, f2 + fxpmul f7, A1, f3 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + FXCXNPMA f2, A1, f2, f6 + FXCXNPMA f3, A1, f3, f7 + + fxcpnmsub f8, A2, f0, f8 + fxcpnmsub f9, A2, f1, f9 + fxcpnmsub f10, A2, f2, f10 + fxcpnmsub f11, A2, f3, f11 + + FXCXNSMA f8, A2, f0, f8 + FXCXNSMA f9, A2, f1, f9 + FXCXNSMA f10, A2, f2, f10 + FXCXNSMA f11, A2, f3, f11 + + fxpmul f4, A3, f8 + fxpmul f5, A3, f9 + fxpmul f6, A3, f10 + fxpmul f7, A3, f11 + + FXCXNPMA f8, A3, f8, f4 + FXCXNPMA f9, A3, f9, f5 + FXCXNPMA f10, A3, f10, f6 + FXCXNPMA f11, A3, f11, f7 +#endif + +#ifdef RT + LFPDUX A1, BO, INC4 + add BO2, BO2, INC4 + LFPDUX A2, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A3, f8 + fxpmul f5, A3, f9 + fxpmul f6, A3, f10 + fxpmul f7, A3, f11 + + FXCXNPMA f8, A3, f8, f4 + FXCXNPMA f9, A3, f9, f5 + FXCXNPMA f10, A3, f10, f6 + FXCXNPMA f11, A3, f11, f7 + + fxcpnmsub f0, A2, f8, f0 + fxcpnmsub f1, A2, f9, f1 + fxcpnmsub f2, A2, f10, f2 + fxcpnmsub f3, A2, f11, f3 + + FXCXNSMA f0, A2, f8, f0 + FXCXNSMA f1, A2, f9, f1 + FXCXNSMA f2, A2, f10, f2 + FXCXNSMA f3, A2, f11, f3 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + fxpmul f6, A1, f2 + fxpmul f7, A1, f3 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + FXCXNPMA f2, A1, f2, f6 + FXCXNPMA f3, A1, f3, f7 +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f1, BO, INC4 + STFPDUX f9, BO2, INC4 + STFPDUX f2, BO, INC4 + STFPDUX f10, BO2, INC4 + STFPDUX f3, BO, INC4 + STFPDUX f11, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE +#else + STFPDUX f0, AO, INC4 + STFPDUX f1, AO2, INC4 + STFPDUX f2, AO, INC4 + STFPDUX f3, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f9, AO2, INC4 + STFPDUX f10, AO, INC4 + STFPDUX f11, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + + STFDUX f8, CO2, INC + STFSDUX f8, CO2, INC + STFDUX f9, CO2, INC + STFSDUX f9, CO2, INC + STFDUX f10, CO2, INC + STFSDUX f10, CO2, INC + STFDUX f11, CO2, INC + STFSDUX f11, CO2, INC + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L11 + .align 4 + +.L20: + andi. I, M, 2 + beq .L30 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, KK, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L24 +#else +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L24 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX B3, BO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A5, AO, INC4 + LFPDUX B5, BO, INC4 + LFPDUX A6, AO2, INC4 + LFPDUX B6, BO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A9, BO, INC4 + LFPDUX A10, BO2, INC4 + bdz- .L23 + .align 4 + +.L22: + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + FXCPMADD f8, B2, A1, f8 + nop + FXCSMADD f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + FXCPMADD f9, B2, A2, f9 + nop + FXCSMADD f13, B2, A2, f13 + LFPDUX B2, BO2, INC4 + + FXCPMADD f0, B3, A3, f0 + nop + FXCSMADD f4, B3, A3, f4 + LFPDUX A2, AO2, INC4 + FXCPMADD f8, B4, A3, f8 + nop + FXCSMADD f12, B4, A3, f12 + LFPDUX A3, AO, INC4 + + FXCPMADD f1, B3, A4, f1 + nop + FXCSMADD f5, B3, A4, f5 + LFPDUX B3, BO, INC4 + FXCPMADD f9, B4, A4, f9 + nop + FXCSMADD f13, B4, A4, f13 + LFPDUX B4, BO2, INC4 + + FXCPMADD f0, B5, A5, f0 + nop + FXCSMADD f4, B5, A5, f4 + LFPDUX A4, AO2, INC4 + FXCPMADD f8, B6, A5, f8 + nop + FXCSMADD f12, B6, A5, f12 + LFPDUX A5, AO, INC4 + + FXCPMADD f1, B5, A6, f1 + nop + FXCSMADD f5, B5, A6, f5 + LFPDUX B5, BO, INC4 + FXCPMADD f9, B6, A6, f9 + nop + FXCSMADD f13, B6, A6, f13 + LFPDUX B6, BO2, INC4 + + FXCPMADD f0, A9, A7, f0 + nop + FXCSMADD f4, A9, A7, f4 + LFPDUX A6, AO2, INC4 + FXCPMADD f8, A10, A7, f8 + nop + FXCSMADD f12, A10, A7, f12 + LFPDUX A7, AO, INC4 + + FXCPMADD f1, A9, A8, f1 + nop + FXCSMADD f5, A9, A8, f5 + LFPDUX A9, BO, INC4 + FXCPMADD f9, A10, A8, f9 + nop + FXCSMADD f13, A10, A8, f13 + LFPDUX A10, BO2, INC4 + bdnz+ .L22 + .align 4 + +.L23: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f4, B3, A3, f4 + FXCPMADD f8, B4, A3, f8 + FXCSMADD f12, B4, A3, f12 + + FXCPMADD f1, B3, A4, f1 + FXCSMADD f5, B3, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f4, B5, A5, f4 + FXCPMADD f8, B6, A5, f8 + FXCSMADD f12, B6, A5, f12 + + FXCPMADD f1, B5, A6, f1 + FXCSMADD f5, B5, A6, f5 + FXCPMADD f9, B6, A6, f9 + FXCSMADD f13, B6, A6, f13 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f4, A9, A7, f4 + FXCPMADD f8, A10, A7, f8 + FXCSMADD f12, A10, A7, f12 + + FXCPMADD f1, A9, A8, f1 + FXCSMADD f5, A9, A8, f5 + FXCPMADD f9, A10, A8, f9 + FXCSMADD f13, A10, A8, f13 + .align 4 + +.L24: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L28 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L28 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + bdz- .L27 + .align 4 + +.L26: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + bdnz+ .L26 + .align 4 + +.L27: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + .align 4 + +.L28: + fpadd f0, f0, f4 + fpadd f8, f8, f12 + fpadd f1, f1, f5 + fpadd f9, f9, f13 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC4 + LFPDUX f18, BO2, INC4 + LFPDUX f17, BO, INC4 + LFPDUX f19, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE +#endif + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f8, f18, f8 + fpsub f9, f19, f9 + +#ifdef LN + LFPDUX A1, AO, INC4 + add AO2, AO2, INC4 + LFPDUX A2, AO, INC4 + LFPDUX A3, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE + + fxpmul f4, A3, f1 + fxpmul f5, A3, f9 + FXCXNPMA f1, A3, f1, f4 + FXCXNPMA f9, A3, f9, f5 + + fxcpnmsub f0, A2, f1, f0 + fxcpnmsub f8, A2, f9, f8 + FXCXNSMA f0, A2, f1, f0 + FXCXNSMA f8, A2, f9, f8 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f8 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f8, A1, f8, f5 +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + add AO, AO, INC4 + LFPDUX A3, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE + + fxpmul f4, A1, f0 + fxpmul f5, A1, f8 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f8, A1, f8, f5 + + fxcpnmsub f1, A2, f0, f1 + fxcpnmsub f9, A2, f8, f9 + FXCXNSMA f1, A2, f0, f1 + FXCXNSMA f9, A2, f8, f9 + + fxpmul f6, A3, f1 + fxpmul f7, A3, f9 + FXCXNPMA f1, A3, f1, f6 + FXCXNPMA f9, A3, f9, f7 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + add BO, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + + fxcpnmsub f8, A2, f0, f8 + fxcpnmsub f9, A2, f1, f9 + + FXCXNSMA f8, A2, f0, f8 + FXCXNSMA f9, A2, f1, f9 + + fxpmul f4, A3, f8 + fxpmul f5, A3, f9 + + FXCXNPMA f8, A3, f8, f4 + FXCXNPMA f9, A3, f9, f5 +#endif + +#ifdef RT + LFPDUX A1, BO, INC4 + add BO2, BO2, INC4 + LFPDUX A2, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A3, f8 + fxpmul f5, A3, f9 + + FXCXNPMA f8, A3, f8, f4 + FXCXNPMA f9, A3, f9, f5 + + fxcpnmsub f0, A2, f8, f0 + fxcpnmsub f1, A2, f9, f1 + + FXCXNSMA f0, A2, f8, f0 + FXCXNSMA f1, A2, f9, f1 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f1, BO, INC4 + STFPDUX f9, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE +#else + STFPDUX f0, AO, INC4 + STFPDUX f1, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f9, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + STFDUX f8, CO2, INC + STFSDUX f8, CO2, INC + STFDUX f9, CO2, INC + STFSDUX f9, CO2, INC + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, r0 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L30: + andi. I, M, 1 + beq .L49 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 + + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L34 +#else +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, BO, - 4 * SIZE + fpmr f2, f0 + addi BO2, BO, 2 * SIZE + fpmr f3, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L34 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L33 + .align 4 + +.L32: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX B1, BO, INC4 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + LFPDUX B2, BO2, INC4 + LFPDUX A1, AO, INC4 + + FXCPMADD f0, B3, A2, f0 + FXCSMADD f1, B3, A2, f1 + LFPDUX B3, BO, INC4 + FXCPMADD f2, B4, A2, f2 + FXCSMADD f3, B4, A2, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A2, AO2, INC4 + + FXCPMADD f0, A5, A3, f0 + FXCSMADD f1, A5, A3, f1 + LFPDUX A5, BO, INC4 + FXCPMADD f2, A6, A3, f2 + FXCSMADD f3, A6, A3, f3 + LFPDUX A6, BO2, INC4 + LFPDUX A3, AO, INC4 + + FXCPMADD f0, A7, A4, f0 + FXCSMADD f1, A7, A4, f1 + LFPDUX A7, BO, INC4 + FXCPMADD f2, A8, A4, f2 + FXCSMADD f3, A8, A4, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L32 + .align 4 + +.L33: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + + FXCPMADD f0, B3, A2, f0 + FXCSMADD f1, B3, A2, f1 + FXCPMADD f2, B4, A2, f2 + FXCSMADD f3, B4, A2, f3 + + FXCPMADD f0, A5, A3, f0 + FXCSMADD f1, A5, A3, f1 + FXCPMADD f2, A6, A3, f2 + FXCSMADD f3, A6, A3, f3 + + FXCPMADD f0, A7, A4, f0 + FXCSMADD f1, A7, A4, f1 + FXCPMADD f2, A8, A4, f2 + FXCSMADD f3, A8, A4, f3 + .align 4 + +.L34: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L38 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L38 +#endif + + LFPDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdz- .L37 + .align 4 + +.L36: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX B1, BO, INC4 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + LFPDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdnz+ .L36 + .align 4 + +.L37: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + .align 4 + +.L38: + fpadd f0, f0, f1 + fpadd f2, f2, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 4 * SIZE +#endif + + addi AO2, AO, 2 * SIZE + addi BO2, BO, 2 * SIZE + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC4 + LFPDX f17, BO2, INC4 +#else + LFPDX f16, AO, INC4 + LFPDX f17, AO2, INC4 +#endif + + fpsub f0, f16, f0 + fpsub f2, f17, f2 + +#ifdef LN + LFPDX A1, AO, INC4 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f2 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f2, A1, f2, f5 +#endif + +#ifdef LT + LFPDX A1, AO, INC4 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f2 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f2, A1, f2, f5 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + add BO, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 + + fxcpnmsub f2, A2, f0, f2 + FXCXNSMA f2, A2, f0, f2 + + fxpmul f4, A3, f2 + FXCXNPMA f2, A3, f2, f4 +#endif + +#ifdef RT + LFPDUX A1, BO, INC4 + add BO2, BO2, INC4 + LFPDUX A2, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A3, f2 + FXCXNPMA f2, A3, f2, f4 + + fxcpnmsub f0, A2, f2, f0 + FXCXNSMA f0, A2, f2, f0 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC4 + STFPDX f2, BO2, INC4 +#else + STFPDX f0, AO, INC4 + STFPDX f2, AO2, INC4 +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f2, CO2, INC + STFSDUX f2, CO2, INC + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L49: +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 4 * SIZE +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + + addic. J, J, -1 + bgt+ .L10 + .align 4 + +.L50: + andi. J, N, 1 + beq .L999 + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + addi AORIG, A, -2 * SIZE +#else + addi AO, A, -2 * SIZE +#endif +#ifndef RT + add C, CO2, LDC +#endif + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 2 + ble .L60 + .align 4 + +.L51: +#if defined(LT) || defined(RN) + fpmr f4, f0 + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + fpmr f3, f0 + fpmr f7, f0 + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L54 +#else + +#ifdef LN + slwi r0, K, 2 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f4, f0 + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + fpmr f3, f0 + fpmr f7, f0 + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L54 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L53 + .align 4 + +.L52: + FXCPMADD f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B2, A5, f0 + LFPDUX B1, BO, INC2 + FXCSMADD f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B2, A6, f1 + nop + FXCSMADD f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B2, A7, f2 + nop + FXCSMADD f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B2, A8, f3 + nop + FXCSMADD f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + FXCPMADD f0, B3, A1, f0 + LFPDUX B2, BO, INC2 + FXCSMADD f4, B3, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B3, A3, f2 + nop + FXCSMADD f6, B3, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B4, A5, f0 + LFPDUX B3, BO, INC2 + FXCSMADD f4, B4, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B4, A6, f1 + nop + FXCSMADD f5, B4, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B4, A7, f2 + nop + FXCSMADD f6, B4, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B4, A8, f3 + nop + FXCSMADD f7, B4, A8, f7 + LFPDUX A8, AO, INC2 + bdnz+ .L52 + .align 4 + +.L53: + FXCPMADD f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B2, A5, f0 + nop + FXCSMADD f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B2, A6, f1 + nop + FXCSMADD f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B2, A7, f2 + nop + FXCSMADD f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B2, A8, f3 + nop + FXCSMADD f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + FXCPMADD f0, B3, A1, f0 + FXCSMADD f4, B3, A1, f4 + FXCPMADD f1, B3, A2, f1 + FXCSMADD f5, B3, A2, f5 + + FXCPMADD f2, B3, A3, f2 + FXCSMADD f6, B3, A3, f6 + FXCPMADD f3, B3, A4, f3 + FXCSMADD f7, B3, A4, f7 + + FXCPMADD f0, B4, A5, f0 + FXCSMADD f4, B4, A5, f4 + FXCPMADD f1, B4, A6, f1 + FXCSMADD f5, B4, A6, f5 + + FXCPMADD f2, B4, A7, f2 + FXCSMADD f6, B4, A7, f6 + FXCPMADD f3, B4, A8, f3 + FXCSMADD f7, B4, A8, f7 + .align 4 + +.L54: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L58 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L58 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + bdz- .L57 + .align 4 + +.L56: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L56 + .align 4 + +.L57: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + + FXCPMADD f2, B1, A3, f2 + FXCSMADD f6, B1, A3, f6 + FXCPMADD f3, B1, A4, f3 + FXCSMADD f7, B1, A4, f7 + .align 4 + +.L58: + fpadd f0, f0, f4 + fpadd f1, f1, f5 + fpadd f2, f2, f6 + fpadd f3, f3, f7 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + LFPDUX f18, BO, INC2 + LFPDUX f19, BO, INC2 + + subi BO, BO, 8 * SIZE +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + LFPDUX f18, AO, INC2 + LFPDUX f19, AO, INC2 + + subi AO, AO, 8 * SIZE +#endif + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 + +#ifdef LN + LFPDUX A1, AO, INC2 + add AO, AO, INC2 + add AO, AO, INC2 + add AO, AO, INC2 + + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + add AO, AO, INC2 + add AO, AO, INC2 + + LFPDUX A4, AO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + add AO, AO, INC2 + + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A9, AO, INC2 + LFPDUX A10, AO, INC2 + + subi AO, AO, 32 * SIZE + + fxpmul f4, A10, f3 + FXCXNPMA f3, A10, f3, f4 + + fxcpnmsub f2, A9, f3, f2 + FXCXNSMA f2, A9, f3, f2 + + fxcpnmsub f1, A8, f3, f1 + FXCXNSMA f1, A8, f3, f1 + + fxcpnmsub f0, A7, f3, f0 + FXCXNSMA f0, A7, f3, f0 + + fxpmul f4, A6, f2 + FXCXNPMA f2, A6, f2, f4 + + fxcpnmsub f1, A5, f2, f1 + FXCXNSMA f1, A5, f2, f1 + + fxcpnmsub f0, A4, f2, f0 + FXCXNSMA f0, A4, f2, f0 + + fxpmul f4, A3, f1 + FXCXNPMA f1, A3, f1, f4 + + fxcpnmsub f0, A2, f1, f0 + FXCXNSMA f0, A2, f1, f0 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + add AO, AO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + + add AO, AO, INC2 + add AO, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A9, AO, INC2 + + add AO, AO, INC2 + add AO, AO, INC2 + add AO, AO, INC2 + LFPDUX A10, AO, INC2 + + subi AO, AO, 32 * SIZE + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 + + fxcpnmsub f1, A2, f0, f1 + FXCXNSMA f1, A2, f0, f1 + + fxcpnmsub f2, A3, f0, f2 + FXCXNSMA f2, A3, f0, f2 + + fxcpnmsub f3, A4, f0, f3 + FXCXNSMA f3, A4, f0, f3 + + fxpmul f6, A5, f1 + FXCXNPMA f1, A5, f1, f6 + + fxcpnmsub f2, A6, f1, f2 + FXCXNSMA f2, A6, f1, f2 + + fxcpnmsub f3, A7, f1, f3 + FXCXNSMA f3, A7, f1, f3 + + fxpmul f4, A8, f2 + FXCXNPMA f2, A8, f2, f4 + + fxcpnmsub f3, A9, f2, f3 + FXCXNSMA f3, A9, f2, f3 + + fxpmul f6, A10, f3 + FXCXNPMA f3, A10, f3, f6 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + fxpmul f6, A1, f2 + fxpmul f7, A1, f3 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + FXCXNPMA f2, A1, f2, f6 + FXCXNPMA f3, A1, f3, f7 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + fxpmul f6, A1, f2 + fxpmul f7, A1, f3 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + FXCXNPMA f2, A1, f2, f6 + FXCXNPMA f3, A1, f3, f7 +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + STFPDUX f2, BO, INC2 + STFPDUX f3, BO, INC2 + + subi BO, BO, 8 * SIZE +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + STFPDUX f2, AO, INC2 + STFPDUX f3, AO, INC2 + + subi AO, AO, 8 * SIZE +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + +#ifdef LN + subi CO1, CO1, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L51 + .align 4 + +.L60: + andi. I, M, 2 + beq .L70 + +#if defined(LT) || defined(RN) + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L64 +#else +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f1, f0 + addi BO, BO, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L64 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B4, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L63 + .align 4 + +.L62: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + FXCPMADD f0, B2, A3, f0 + FXCSMADD f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + FXCPMADD f1, B2, A4, f1 + FXCSMADD f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + FXCPMADD f0, B3, A5, f0 + FXCSMADD f2, B3, A5, f2 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B3, A6, f1 + FXCSMADD f3, B3, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + FXCPMADD f0, B4, A7, f0 + FXCSMADD f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + FXCPMADD f1, B4, A8, f1 + FXCSMADD f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L62 + .align 4 + +.L63: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + + FXCPMADD f0, B2, A3, f0 + FXCSMADD f2, B2, A3, f2 + FXCPMADD f1, B2, A4, f1 + FXCSMADD f3, B2, A4, f3 + + FXCPMADD f0, B3, A5, f0 + FXCSMADD f2, B3, A5, f2 + FXCPMADD f1, B3, A6, f1 + FXCSMADD f3, B3, A6, f3 + + FXCPMADD f0, B4, A7, f0 + FXCSMADD f2, B4, A7, f2 + FXCPMADD f1, B4, A8, f1 + FXCSMADD f3, B4, A8, f3 + .align 4 + +.L64: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L68 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L68 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdz- .L67 + .align 4 + +.L66: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdnz+ .L66 + .align 4 + +.L67: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + .align 4 + +.L68: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + + subi BO, BO, 4 * SIZE +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + + subi AO, AO, 4 * SIZE +#endif + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + +#ifdef LN + LFPDUX A1, AO, INC2 + add AO, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + + subi AO, AO, 8 * SIZE + + fxpmul f4, A3, f1 + FXCXNPMA f1, A3, f1, f4 + + fxcpnmsub f0, A2, f1, f0 + FXCXNSMA f0, A2, f1, f0 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + add AO, AO, INC2 + LFPDUX A3, AO, INC2 + + subi AO, AO, 8 * SIZE + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 + + fxcpnmsub f1, A2, f0, f1 + FXCXNSMA f1, A2, f0, f1 + + fxpmul f6, A3, f1 + FXCXNPMA f1, A3, f1, f6 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + + subi BO, BO, 4 * SIZE +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + + subi AO, AO, 4 * SIZE +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L70: + andi. I, M, 1 + beq .L89 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L74 +#else +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L74 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdz- .L73 + .align 4 + +.L72: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + FXCPMADD f2, B2, A2, f2 + FXCSMADD f3, B2, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f1, B3, A3, f1 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + FXCPMADD f2, B4, A4, f2 + FXCSMADD f3, B4, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f1, B5, A5, f1 + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + FXCPMADD f2, B6, A6, f2 + FXCSMADD f3, B6, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f1, A9, A7, f1 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + FXCPMADD f2, A10, A8, f2 + FXCSMADD f3, A10, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + + bdnz+ .L72 + .align 4 + +.L73: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A2, f2 + FXCSMADD f3, B2, A2, f3 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f1, B3, A3, f1 + FXCPMADD f2, B4, A4, f2 + FXCSMADD f3, B4, A4, f3 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f1, B5, A5, f1 + FXCPMADD f2, B6, A6, f2 + FXCSMADD f3, B6, A6, f3 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f1, A9, A7, f1 + FXCPMADD f2, A10, A8, f2 + FXCSMADD f3, A10, A8, f3 + .align 4 + +.L74: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L78 +#else + andi. r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L78 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdz- .L77 + .align 4 + +.L76: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L76 + .align 4 + +.L77: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + .align 4 + +.L78: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fpadd f0, f0, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC2 +#else + LFPDX f16, AO, INC2 +#endif + + fpsub f0, f16, f0 + +#ifdef LN + LFPDX A1, AO, INC2 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LT + LFPDX A1, AO, INC2 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC2 +#else + STFPDX f0, AO, INC2 +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L89: +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 2 * SIZE +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +.L999: + addi SP, SP, 20 + + lwzu r14, 4(SP) + lwzu r15, 4(SP) + + lwzu r16, 4(SP) + lwzu r17, 4(SP) + lwzu r18, 4(SP) + lwzu r19, 4(SP) + + lwzu r20, 4(SP) + lwzu r21, 4(SP) + lwzu r22, 4(SP) + lwzu r23, 4(SP) + + lwzu r24, 4(SP) + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f31, SP, r0 + lfpdux f30, SP, r0 + lfpdux f29, SP, r0 + lfpdux f28, SP, r0 + lfpdux f27, SP, r0 + lfpdux f26, SP, r0 + lfpdux f25, SP, r0 + lfpdux f24, SP, r0 + lfpdux f23, SP, r0 + lfpdux f22, SP, r0 + lfpdux f21, SP, r0 + lfpdux f20, SP, r0 + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + .align 4 + + + EPILOGUE +#endif diff --git a/kernel/power/ztrsm_kernel_hummer_RT.S b/kernel/power/ztrsm_kernel_hummer_RT.S new file mode 100644 index 0000000..8670cea --- /dev/null +++ b/kernel/power/ztrsm_kernel_hummer_RT.S @@ -0,0 +1,2962 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#undef ZERO + +#define ALPHA 0 +#define FZERO 16 + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#endif + +#define TEMP r11 +#define AORIG r12 +#define KK r14 +#define INCM1 r15 +#define INCM3 r16 +#define INCM5 r17 +#define INCM7 r18 +#define INC2 r19 +#define INC r20 +#define INC4 r21 + +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define AO2 r26 +#define BO2 r27 + +#define CO1 r28 +#define CO2 r29 +#define ZERO r31 + +#ifndef NEEDPARAM + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define A7 f22 +#define A8 f23 +#define A9 f24 +#define A10 f25 + +#define B1 f26 +#define B2 f27 +#define B3 f28 +#define B4 f29 +#define B5 f30 +#define B6 f31 + +#define AP B6 + +#ifndef CONJ +#define FXCPMADD fxcpmadd +#define FXCSMADD fxcxnpma +#else +#if defined(LN) || defined(LT) +#define FXCPMADD fxcpnsma +#define FXCSMADD fxcxma +#else +#define FXCPMADD fxcpmadd +#define FXCSMADD fxcxnsma +#endif +#endif + +#ifndef CONJ +#define FXCXNPMA fxcxnpma +#define FXCXNSMA fxcxnsma +#else +#define FXCXNPMA fxcxnsma +#define FXCXNSMA fxcxnpma +#endif + + + PROLOGUE + PROFCODE + + li r0, -16 + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + stfpdux f16, SP, r0 + stfpdux f17, SP, r0 + stfpdux f18, SP, r0 + stfpdux f19, SP, r0 + stfpdux f20, SP, r0 + stfpdux f21, SP, r0 + stfpdux f22, SP, r0 + stfpdux f23, SP, r0 + stfpdux f24, SP, r0 + stfpdux f25, SP, r0 + stfpdux f26, SP, r0 + stfpdux f27, SP, r0 + stfpdux f28, SP, r0 + stfpdux f29, SP, r0 + stfpdux f30, SP, r0 + stfpdux f31, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + stwu r29, -4(SP) + stwu r28, -4(SP) + + stwu r27, -4(SP) + stwu r26, -4(SP) + stwu r25, -4(SP) + stwu r24, -4(SP) + + stwu r23, -4(SP) + stwu r22, -4(SP) + stwu r21, -4(SP) + stwu r20, -4(SP) + + stwu r19, -4(SP) + stwu r18, -4(SP) + stwu r17, -4(SP) + stwu r16, -4(SP) + + stwu r15, -4(SP) + stwu r14, -4(SP) + + li r0, 0 + stwu r0, -4(SP) + stwu r0, -4(SP) + + stfdu f2, -8(SP) + stfdu f1, -8(SP) + + slwi LDC, LDC, ZBASE_SHIFT + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + li INC, 1 * SIZE + li INC2, 2 * SIZE + li INC4, 4 * SIZE + li INCM1, -1 * SIZE + li INCM3, -3 * SIZE + li INCM5, -5 * SIZE + li INCM7, -7 * SIZE + + addi C, C, - 1 * SIZE + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + andi. J, N, 1 + beq .L50 + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + addi AORIG, A, -2 * SIZE +#else + addi AO, A, -2 * SIZE +#endif +#ifndef RT + add C, CO2, LDC +#endif + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 2 + ble .L60 + .align 4 + +.L51: +#if defined(LT) || defined(RN) + fpmr f4, f0 + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + fpmr f3, f0 + fpmr f7, f0 + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L54 +#else + +#ifdef LN + slwi r0, K, 2 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f4, f0 + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + fpmr f3, f0 + fpmr f7, f0 + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L54 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L53 + .align 4 + +.L52: + FXCPMADD f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B2, A5, f0 + LFPDUX B1, BO, INC2 + FXCSMADD f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B2, A6, f1 + nop + FXCSMADD f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B2, A7, f2 + nop + FXCSMADD f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B2, A8, f3 + nop + FXCSMADD f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + FXCPMADD f0, B3, A1, f0 + LFPDUX B2, BO, INC2 + FXCSMADD f4, B3, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B3, A3, f2 + nop + FXCSMADD f6, B3, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B4, A5, f0 + LFPDUX B3, BO, INC2 + FXCSMADD f4, B4, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B4, A6, f1 + nop + FXCSMADD f5, B4, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B4, A7, f2 + nop + FXCSMADD f6, B4, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B4, A8, f3 + nop + FXCSMADD f7, B4, A8, f7 + LFPDUX A8, AO, INC2 + bdnz+ .L52 + .align 4 + +.L53: + FXCPMADD f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B2, A5, f0 + nop + FXCSMADD f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B2, A6, f1 + nop + FXCSMADD f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B2, A7, f2 + nop + FXCSMADD f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B2, A8, f3 + nop + FXCSMADD f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + FXCPMADD f0, B3, A1, f0 + FXCSMADD f4, B3, A1, f4 + FXCPMADD f1, B3, A2, f1 + FXCSMADD f5, B3, A2, f5 + + FXCPMADD f2, B3, A3, f2 + FXCSMADD f6, B3, A3, f6 + FXCPMADD f3, B3, A4, f3 + FXCSMADD f7, B3, A4, f7 + + FXCPMADD f0, B4, A5, f0 + FXCSMADD f4, B4, A5, f4 + FXCPMADD f1, B4, A6, f1 + FXCSMADD f5, B4, A6, f5 + + FXCPMADD f2, B4, A7, f2 + FXCSMADD f6, B4, A7, f6 + FXCPMADD f3, B4, A8, f3 + FXCSMADD f7, B4, A8, f7 + .align 4 + +.L54: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L58 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L58 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + bdz- .L57 + .align 4 + +.L56: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L56 + .align 4 + +.L57: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + + FXCPMADD f2, B1, A3, f2 + FXCSMADD f6, B1, A3, f6 + FXCPMADD f3, B1, A4, f3 + FXCSMADD f7, B1, A4, f7 + .align 4 + +.L58: + fpadd f0, f0, f4 + fpadd f1, f1, f5 + fpadd f2, f2, f6 + fpadd f3, f3, f7 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + LFPDUX f18, BO, INC2 + LFPDUX f19, BO, INC2 + + subi BO, BO, 8 * SIZE +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + LFPDUX f18, AO, INC2 + LFPDUX f19, AO, INC2 + + subi AO, AO, 8 * SIZE +#endif + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 + +#ifdef LN + LFPDUX A1, AO, INC2 + add AO, AO, INC2 + add AO, AO, INC2 + add AO, AO, INC2 + + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + add AO, AO, INC2 + add AO, AO, INC2 + + LFPDUX A4, AO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + add AO, AO, INC2 + + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A9, AO, INC2 + LFPDUX A10, AO, INC2 + + subi AO, AO, 32 * SIZE + + fxpmul f4, A10, f3 + FXCXNPMA f3, A10, f3, f4 + + fxcpnmsub f2, A9, f3, f2 + FXCXNSMA f2, A9, f3, f2 + + fxcpnmsub f1, A8, f3, f1 + FXCXNSMA f1, A8, f3, f1 + + fxcpnmsub f0, A7, f3, f0 + FXCXNSMA f0, A7, f3, f0 + + fxpmul f4, A6, f2 + FXCXNPMA f2, A6, f2, f4 + + fxcpnmsub f1, A5, f2, f1 + FXCXNSMA f1, A5, f2, f1 + + fxcpnmsub f0, A4, f2, f0 + FXCXNSMA f0, A4, f2, f0 + + fxpmul f4, A3, f1 + FXCXNPMA f1, A3, f1, f4 + + fxcpnmsub f0, A2, f1, f0 + FXCXNSMA f0, A2, f1, f0 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + add AO, AO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + + add AO, AO, INC2 + add AO, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A9, AO, INC2 + + add AO, AO, INC2 + add AO, AO, INC2 + add AO, AO, INC2 + LFPDUX A10, AO, INC2 + + subi AO, AO, 32 * SIZE + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 + + fxcpnmsub f1, A2, f0, f1 + FXCXNSMA f1, A2, f0, f1 + + fxcpnmsub f2, A3, f0, f2 + FXCXNSMA f2, A3, f0, f2 + + fxcpnmsub f3, A4, f0, f3 + FXCXNSMA f3, A4, f0, f3 + + fxpmul f6, A5, f1 + FXCXNPMA f1, A5, f1, f6 + + fxcpnmsub f2, A6, f1, f2 + FXCXNSMA f2, A6, f1, f2 + + fxcpnmsub f3, A7, f1, f3 + FXCXNSMA f3, A7, f1, f3 + + fxpmul f4, A8, f2 + FXCXNPMA f2, A8, f2, f4 + + fxcpnmsub f3, A9, f2, f3 + FXCXNSMA f3, A9, f2, f3 + + fxpmul f6, A10, f3 + FXCXNPMA f3, A10, f3, f6 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + fxpmul f6, A1, f2 + fxpmul f7, A1, f3 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + FXCXNPMA f2, A1, f2, f6 + FXCXNPMA f3, A1, f3, f7 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + fxpmul f6, A1, f2 + fxpmul f7, A1, f3 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + FXCXNPMA f2, A1, f2, f6 + FXCXNPMA f3, A1, f3, f7 +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + STFPDUX f2, BO, INC2 + STFPDUX f3, BO, INC2 + + subi BO, BO, 8 * SIZE +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + STFPDUX f2, AO, INC2 + STFPDUX f3, AO, INC2 + + subi AO, AO, 8 * SIZE +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + +#ifdef LN + subi CO1, CO1, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L51 + .align 4 + +.L60: + andi. I, M, 2 + beq .L70 + +#if defined(LT) || defined(RN) + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L64 +#else +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f1, f0 + addi BO, BO, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L64 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B4, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L63 + .align 4 + +.L62: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + FXCPMADD f0, B2, A3, f0 + FXCSMADD f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + FXCPMADD f1, B2, A4, f1 + FXCSMADD f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + FXCPMADD f0, B3, A5, f0 + FXCSMADD f2, B3, A5, f2 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B3, A6, f1 + FXCSMADD f3, B3, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + FXCPMADD f0, B4, A7, f0 + FXCSMADD f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + FXCPMADD f1, B4, A8, f1 + FXCSMADD f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L62 + .align 4 + +.L63: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + + FXCPMADD f0, B2, A3, f0 + FXCSMADD f2, B2, A3, f2 + FXCPMADD f1, B2, A4, f1 + FXCSMADD f3, B2, A4, f3 + + FXCPMADD f0, B3, A5, f0 + FXCSMADD f2, B3, A5, f2 + FXCPMADD f1, B3, A6, f1 + FXCSMADD f3, B3, A6, f3 + + FXCPMADD f0, B4, A7, f0 + FXCSMADD f2, B4, A7, f2 + FXCPMADD f1, B4, A8, f1 + FXCSMADD f3, B4, A8, f3 + .align 4 + +.L64: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L68 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L68 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdz- .L67 + .align 4 + +.L66: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdnz+ .L66 + .align 4 + +.L67: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + .align 4 + +.L68: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + + subi BO, BO, 4 * SIZE +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + + subi AO, AO, 4 * SIZE +#endif + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + +#ifdef LN + LFPDUX A1, AO, INC2 + add AO, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + + subi AO, AO, 8 * SIZE + + fxpmul f4, A3, f1 + FXCXNPMA f1, A3, f1, f4 + + fxcpnmsub f0, A2, f1, f0 + FXCXNSMA f0, A2, f1, f0 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + add AO, AO, INC2 + LFPDUX A3, AO, INC2 + + subi AO, AO, 8 * SIZE + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 + + fxcpnmsub f1, A2, f0, f1 + FXCXNSMA f1, A2, f0, f1 + + fxpmul f6, A3, f1 + FXCXNPMA f1, A3, f1, f6 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + + subi BO, BO, 4 * SIZE +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + + subi AO, AO, 4 * SIZE +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L70: + andi. I, M, 1 + beq .L89 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L74 +#else +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L74 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdz- .L73 + .align 4 + +.L72: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + FXCPMADD f2, B2, A2, f2 + FXCSMADD f3, B2, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f1, B3, A3, f1 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + FXCPMADD f2, B4, A4, f2 + FXCSMADD f3, B4, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f1, B5, A5, f1 + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + FXCPMADD f2, B6, A6, f2 + FXCSMADD f3, B6, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f1, A9, A7, f1 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + FXCPMADD f2, A10, A8, f2 + FXCSMADD f3, A10, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + + bdnz+ .L72 + .align 4 + +.L73: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A2, f2 + FXCSMADD f3, B2, A2, f3 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f1, B3, A3, f1 + FXCPMADD f2, B4, A4, f2 + FXCSMADD f3, B4, A4, f3 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f1, B5, A5, f1 + FXCPMADD f2, B6, A6, f2 + FXCSMADD f3, B6, A6, f3 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f1, A9, A7, f1 + FXCPMADD f2, A10, A8, f2 + FXCSMADD f3, A10, A8, f3 + .align 4 + +.L74: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L78 +#else + andi. r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L78 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdz- .L77 + .align 4 + +.L76: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L76 + .align 4 + +.L77: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + .align 4 + +.L78: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fpadd f0, f0, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC2 +#else + LFPDX f16, AO, INC2 +#endif + + fpsub f0, f16, f0 + +#ifdef LN + LFPDX A1, AO, INC2 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LT + LFPDX A1, AO, INC2 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC2 +#else + STFPDX f0, AO, INC2 +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L89: +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 2 * SIZE +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +.L50: + srawi. J, N, 1 + ble .L999 + .align 4 + +.L10: +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + addi AORIG, A, -4 * SIZE +#else + addi AO, A, -4 * SIZE +#endif +#ifndef RT + add C, CO2, LDC +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 2 + ble .L20 + .align 4 + +.L11: +#if defined(LT) || defined(RN) + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + + srawi. r0, KK, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#else + +#ifdef LN + slwi r0, K, 2 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + srawi. r0, TEMP, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#endif + + LFPDUX A1, AO, INC4 + fpmr f5, f0 + LFPDUX A3, AO, INC4 + fpmr f9, f0 + LFPDUX B1, BO, INC4 + fpmr f13, f0 + + LFPDUX A5, AO, INC4 + fpmr f2, f0 + LFPDUX A6, AO, INC4 + fpmr f6, f0 + LFPDUX B3, BO, INC4 + fpmr f10, f0 + LFPDUX A7, AO, INC4 + fpmr f14, f0 + + LFPDUX A8, AO, INC4 + fpmr f3, f0 + LFPDUX B5, BO, INC4 + fpmr f7, f0 + LFPDUX A9, AO, INC4 + fpmr f11, f0 + LFPDUX A2, AO2, INC4 + fpmr f15, f0 + LFPDUX B2, BO2, INC4 + bdz- .L13 + .align 4 + +.L12: + +## 1 ## + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + nop + FXCPMADD f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + nop + FXCPMADD f10, B2, A3, f10 + nop + FXCSMADD f14, B2, A3, f14 + nop + + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + LFPDUX A1, AO, INC4 + FXCSMADD f15, B2, A4, f15 + nop + +## 2 ## + + FXCPMADD f0, B3, A5, f0 + nop + FXCSMADD f4, B3, A5, f4 + nop + FXCPMADD f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A5, f12 + LFPDUX B1, BO, INC4 + + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + LFPDUX A3, AO, INC4 + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B3, A6, f2 + nop + FXCSMADD f6, B3, A6, f6 + nop + FXCPMADD f10, B4, A6, f10 + nop + FXCSMADD f14, B4, A6, f14 + nop + + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B4, A4, f11 + LFPDUX A5, AO, INC4 + FXCSMADD f15, B4, A4, f15 + nop + +## 3 ## + + FXCPMADD f0, B5, A7, f0 + nop + FXCSMADD f4, B5, A7, f4 + nop + FXCPMADD f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A7, f12 + LFPDUX B3, BO, INC4 + + FXCPMADD f1, B5, A2, f1 + nop + FXCSMADD f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A6, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B5, A8, f2 + nop + FXCSMADD f6, B5, A8, f6 + nop + FXCPMADD f10, B2, A8, f10 + nop + FXCSMADD f14, B2, A8, f14 + nop + + FXCPMADD f3, B5, A4, f3 + nop + FXCSMADD f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + LFPDUX A7, AO, INC4 + FXCSMADD f15, B2, A4, f15 + nop + +## 4 ## + FXCPMADD f0, B6, A9, f0 + nop + FXCSMADD f4, B6, A9, f4 + nop + FXCPMADD f8, B4, A9, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A9, f12 + LFPDUX B5, BO, INC4 + + FXCPMADD f1, B6, A2, f1 + nop + FXCSMADD f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + LFPDUX A8, AO, INC4 + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B6, A10, f2 + nop + FXCSMADD f6, B6, A10, f6 + nop + FXCPMADD f10, B4, A10, f10 + nop + FXCSMADD f14, B4, A10, f14 + nop + + FXCPMADD f3, B6, A4, f3 + LFPDUX A2, AO2, INC4 + FXCSMADD f7, B6, A4, f7 + LFPDUX A9, AO, INC4 + FXCPMADD f11, B4, A4, f11 + nop + FXCSMADD f15, B4, A4, f15 + bdnz+ .L12 + .align 4 + +.L13: +## 1 ## + + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + nop + FXCPMADD f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + nop + FXCPMADD f10, B2, A3, f10 + nop + FXCSMADD f14, B2, A3, f14 + nop + + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + nop + FXCSMADD f15, B2, A4, f15 + nop + +## 2 ## + + FXCPMADD f0, B3, A5, f0 + nop + FXCSMADD f4, B3, A5, f4 + nop + FXCPMADD f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A5, f12 + nop + + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + nop + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B3, A6, f2 + nop + FXCSMADD f6, B3, A6, f6 + nop + FXCPMADD f10, B4, A6, f10 + nop + FXCSMADD f14, B4, A6, f14 + nop + + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B4, A4, f11 + nop + FXCSMADD f15, B4, A4, f15 + nop + +## 3 ## + + FXCPMADD f0, B5, A7, f0 + nop + FXCSMADD f4, B5, A7, f4 + nop + FXCPMADD f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A7, f12 + nop + + FXCPMADD f1, B5, A2, f1 + nop + FXCSMADD f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + nop + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B5, A8, f2 + nop + FXCSMADD f6, B5, A8, f6 + nop + FXCPMADD f10, B2, A8, f10 + nop + FXCSMADD f14, B2, A8, f14 + nop + + FXCPMADD f3, B5, A4, f3 + nop + FXCSMADD f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + nop + FXCSMADD f15, B2, A4, f15 + nop + +## 4 ## + + FXCPMADD f0, B6, A9, f0 + nop + FXCSMADD f4, B6, A9, f4 + nop + FXCPMADD f8, B4, A9, f8 + nop + FXCSMADD f12, B4, A9, f12 + nop + + FXCPMADD f1, B6, A2, f1 + nop + FXCSMADD f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + nop + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B6, A10, f2 + nop + FXCSMADD f6, B6, A10, f6 + nop + FXCPMADD f10, B4, A10, f10 + nop + FXCSMADD f14, B4, A10, f14 + nop + + FXCPMADD f3, B6, A4, f3 + nop + FXCSMADD f7, B6, A4, f7 + nop + FXCPMADD f11, B4, A4, f11 + nop + FXCSMADD f15, B4, A4, f15 + nop + .align 4 + +.L14: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L18 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L18 +#endif + +.L15: + LFPDUX A2, AO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A10, BO, INC4 + LFPDUX B4, BO2, INC4 + bdz- .L17 + .align 4 + +.L16: + FXCPMADD f0, A10, A2, f0 + FXCSMADD f4, A10, A2, f4 + FXCPMADD f8, B4, A2, f8 + FXCSMADD f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + FXCPMADD f1, A10, A4, f1 + FXCSMADD f5, A10, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + FXCPMADD f2, A10, A2, f2 + FXCSMADD f6, A10, A2, f6 + FXCPMADD f10, B4, A2, f10 + FXCSMADD f14, B4, A2, f14 + LFPDUX A2, AO, INC4 + + FXCPMADD f3, A10, A4, f3 + FXCSMADD f7, A10, A4, f7 + LFPDUX A10, BO, INC4 + FXCPMADD f11, B4, A4, f11 + FXCSMADD f15, B4, A4, f15 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + bdnz+ .L16 + .align 4 + +.L17: + FXCPMADD f0, A10, A2, f0 + FXCSMADD f4, A10, A2, f4 + FXCPMADD f8, B4, A2, f8 + FXCSMADD f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + FXCPMADD f1, A10, A4, f1 + FXCSMADD f5, A10, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + FXCPMADD f2, A10, A2, f2 + FXCSMADD f6, A10, A2, f6 + FXCPMADD f10, B4, A2, f10 + FXCSMADD f14, B4, A2, f14 + + FXCPMADD f3, A10, A4, f3 + FXCSMADD f7, A10, A4, f7 + FXCPMADD f11, B4, A4, f11 + FXCSMADD f15, B4, A4, f15 + .align 4 + +.L18: + fpadd f0, f0, f4 + fpadd f8, f8, f12 + fpadd f1, f1, f5 + fpadd f9, f9, f13 + + fpadd f2, f2, f6 + fpadd f10, f10, f14 + fpadd f3, f3, f7 + fpadd f11, f11, f15 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC4 + LFPDUX f20, BO2, INC4 + LFPDUX f17, BO, INC4 + LFPDUX f21, BO2, INC4 + LFPDUX f18, BO, INC4 + LFPDUX f22, BO2, INC4 + LFPDUX f19, BO, INC4 + LFPDUX f23, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + LFPDUX f20, AO, INC4 + LFPDUX f21, AO2, INC4 + LFPDUX f22, AO, INC4 + LFPDUX f23, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE +#endif + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 + + fpsub f8, f20, f8 + fpsub f9, f21, f9 + fpsub f10, f22, f10 + fpsub f11, f23, f11 + +#ifdef LN + LFPDUX A1, AO, INC4 + add AO2, AO2, INC4 + add AO, AO, INC4 + add AO2, AO2, INC4 + + LFPDUX A2, AO, INC4 + LFPDUX A3, AO2, INC4 + add AO, AO, INC4 + add AO2, AO2, INC4 + + LFPDUX A4, AO, INC4 + LFPDUX A5, AO2, INC4 + LFPDUX A6, AO, INC4 + add AO2, AO2, INC4 + + LFPDUX A7, AO, INC4 + LFPDUX A8, AO2, INC4 + LFPDUX A9, AO, INC4 + LFPDUX A10, AO2, INC4 + + subi AO, AO, 32 * SIZE + subi AO2, AO2, 32 * SIZE + + fxpmul f4, A10, f3 + fxpmul f5, A10, f11 + FXCXNPMA f3, A10, f3, f4 + FXCXNPMA f11, A10, f11, f5 + + fxcpnmsub f2, A9, f3, f2 + fxcpnmsub f10, A9, f11, f10 + FXCXNSMA f2, A9, f3, f2 + FXCXNSMA f10, A9, f11, f10 + + fxcpnmsub f1, A8, f3, f1 + fxcpnmsub f9, A8, f11, f9 + FXCXNSMA f1, A8, f3, f1 + FXCXNSMA f9, A8, f11, f9 + + fxcpnmsub f0, A7, f3, f0 + fxcpnmsub f8, A7, f11, f8 + FXCXNSMA f0, A7, f3, f0 + FXCXNSMA f8, A7, f11, f8 + + fxpmul f4, A6, f2 + fxpmul f5, A6, f10 + FXCXNPMA f2, A6, f2, f4 + FXCXNPMA f10, A6, f10, f5 + + fxcpnmsub f1, A5, f2, f1 + fxcpnmsub f9, A5, f10, f9 + FXCXNSMA f1, A5, f2, f1 + FXCXNSMA f9, A5, f10, f9 + + fxcpnmsub f0, A4, f2, f0 + fxcpnmsub f8, A4, f10, f8 + FXCXNSMA f0, A4, f2, f0 + FXCXNSMA f8, A4, f10, f8 + + fxpmul f4, A3, f1 + fxpmul f5, A3, f9 + FXCXNPMA f1, A3, f1, f4 + FXCXNPMA f9, A3, f9, f5 + + fxcpnmsub f0, A2, f1, f0 + fxcpnmsub f8, A2, f9, f8 + FXCXNSMA f0, A2, f1, f0 + FXCXNSMA f8, A2, f9, f8 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f8 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f8, A1, f8, f5 +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX A4, AO2, INC4 + + add AO, AO, INC4 + LFPDUX A5, AO2, INC4 + LFPDUX A6, AO, INC4 + LFPDUX A7, AO2, INC4 + + add AO, AO, INC4 + add AO2, AO2, INC4 + LFPDUX A8, AO, INC4 + LFPDUX A9, AO2, INC4 + + add AO, AO, INC4 + add AO2, AO2, INC4 + add AO, AO, INC4 + LFPDUX A10, AO2, INC4 + + subi AO, AO, 32 * SIZE + subi AO2, AO2, 32 * SIZE + + fxpmul f4, A1, f0 + fxpmul f5, A1, f8 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f8, A1, f8, f5 + + fxcpnmsub f1, A2, f0, f1 + fxcpnmsub f9, A2, f8, f9 + FXCXNSMA f1, A2, f0, f1 + FXCXNSMA f9, A2, f8, f9 + + fxcpnmsub f2, A3, f0, f2 + fxcpnmsub f10, A3, f8, f10 + FXCXNSMA f2, A3, f0, f2 + FXCXNSMA f10, A3, f8, f10 + + fxcpnmsub f3, A4, f0, f3 + fxcpnmsub f11, A4, f8, f11 + FXCXNSMA f3, A4, f0, f3 + FXCXNSMA f11, A4, f8, f11 + + fxpmul f6, A5, f1 + fxpmul f7, A5, f9 + FXCXNPMA f1, A5, f1, f6 + FXCXNPMA f9, A5, f9, f7 + + fxcpnmsub f2, A6, f1, f2 + fxcpnmsub f10, A6, f9, f10 + FXCXNSMA f2, A6, f1, f2 + FXCXNSMA f10, A6, f9, f10 + + fxcpnmsub f3, A7, f1, f3 + fxcpnmsub f11, A7, f9, f11 + FXCXNSMA f3, A7, f1, f3 + FXCXNSMA f11, A7, f9, f11 + + fxpmul f4, A8, f2 + fxpmul f5, A8, f10 + FXCXNPMA f2, A8, f2, f4 + FXCXNPMA f10, A8, f10, f5 + + fxcpnmsub f3, A9, f2, f3 + fxcpnmsub f11, A9, f10, f11 + FXCXNSMA f3, A9, f2, f3 + FXCXNSMA f11, A9, f10, f11 + + fxpmul f6, A10, f3 + fxpmul f7, A10, f11 + FXCXNPMA f3, A10, f3, f6 + FXCXNPMA f11, A10, f11, f7 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + add BO, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + fxpmul f6, A1, f2 + fxpmul f7, A1, f3 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + FXCXNPMA f2, A1, f2, f6 + FXCXNPMA f3, A1, f3, f7 + + fxcpnmsub f8, A2, f0, f8 + fxcpnmsub f9, A2, f1, f9 + fxcpnmsub f10, A2, f2, f10 + fxcpnmsub f11, A2, f3, f11 + + FXCXNSMA f8, A2, f0, f8 + FXCXNSMA f9, A2, f1, f9 + FXCXNSMA f10, A2, f2, f10 + FXCXNSMA f11, A2, f3, f11 + + fxpmul f4, A3, f8 + fxpmul f5, A3, f9 + fxpmul f6, A3, f10 + fxpmul f7, A3, f11 + + FXCXNPMA f8, A3, f8, f4 + FXCXNPMA f9, A3, f9, f5 + FXCXNPMA f10, A3, f10, f6 + FXCXNPMA f11, A3, f11, f7 +#endif + +#ifdef RT + LFPDUX A1, BO, INC4 + add BO2, BO2, INC4 + LFPDUX A2, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A3, f8 + fxpmul f5, A3, f9 + fxpmul f6, A3, f10 + fxpmul f7, A3, f11 + + FXCXNPMA f8, A3, f8, f4 + FXCXNPMA f9, A3, f9, f5 + FXCXNPMA f10, A3, f10, f6 + FXCXNPMA f11, A3, f11, f7 + + fxcpnmsub f0, A2, f8, f0 + fxcpnmsub f1, A2, f9, f1 + fxcpnmsub f2, A2, f10, f2 + fxcpnmsub f3, A2, f11, f3 + + FXCXNSMA f0, A2, f8, f0 + FXCXNSMA f1, A2, f9, f1 + FXCXNSMA f2, A2, f10, f2 + FXCXNSMA f3, A2, f11, f3 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + fxpmul f6, A1, f2 + fxpmul f7, A1, f3 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + FXCXNPMA f2, A1, f2, f6 + FXCXNPMA f3, A1, f3, f7 +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f1, BO, INC4 + STFPDUX f9, BO2, INC4 + STFPDUX f2, BO, INC4 + STFPDUX f10, BO2, INC4 + STFPDUX f3, BO, INC4 + STFPDUX f11, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE +#else + STFPDUX f0, AO, INC4 + STFPDUX f1, AO2, INC4 + STFPDUX f2, AO, INC4 + STFPDUX f3, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f9, AO2, INC4 + STFPDUX f10, AO, INC4 + STFPDUX f11, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + + STFDUX f8, CO2, INC + STFSDUX f8, CO2, INC + STFDUX f9, CO2, INC + STFSDUX f9, CO2, INC + STFDUX f10, CO2, INC + STFSDUX f10, CO2, INC + STFDUX f11, CO2, INC + STFSDUX f11, CO2, INC + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L11 + .align 4 + +.L20: + andi. I, M, 2 + beq .L30 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, KK, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L24 +#else +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L24 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX B3, BO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A5, AO, INC4 + LFPDUX B5, BO, INC4 + LFPDUX A6, AO2, INC4 + LFPDUX B6, BO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A9, BO, INC4 + LFPDUX A10, BO2, INC4 + bdz- .L23 + .align 4 + +.L22: + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + FXCPMADD f8, B2, A1, f8 + nop + FXCSMADD f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + FXCPMADD f9, B2, A2, f9 + nop + FXCSMADD f13, B2, A2, f13 + LFPDUX B2, BO2, INC4 + + FXCPMADD f0, B3, A3, f0 + nop + FXCSMADD f4, B3, A3, f4 + LFPDUX A2, AO2, INC4 + FXCPMADD f8, B4, A3, f8 + nop + FXCSMADD f12, B4, A3, f12 + LFPDUX A3, AO, INC4 + + FXCPMADD f1, B3, A4, f1 + nop + FXCSMADD f5, B3, A4, f5 + LFPDUX B3, BO, INC4 + FXCPMADD f9, B4, A4, f9 + nop + FXCSMADD f13, B4, A4, f13 + LFPDUX B4, BO2, INC4 + + FXCPMADD f0, B5, A5, f0 + nop + FXCSMADD f4, B5, A5, f4 + LFPDUX A4, AO2, INC4 + FXCPMADD f8, B6, A5, f8 + nop + FXCSMADD f12, B6, A5, f12 + LFPDUX A5, AO, INC4 + + FXCPMADD f1, B5, A6, f1 + nop + FXCSMADD f5, B5, A6, f5 + LFPDUX B5, BO, INC4 + FXCPMADD f9, B6, A6, f9 + nop + FXCSMADD f13, B6, A6, f13 + LFPDUX B6, BO2, INC4 + + FXCPMADD f0, A9, A7, f0 + nop + FXCSMADD f4, A9, A7, f4 + LFPDUX A6, AO2, INC4 + FXCPMADD f8, A10, A7, f8 + nop + FXCSMADD f12, A10, A7, f12 + LFPDUX A7, AO, INC4 + + FXCPMADD f1, A9, A8, f1 + nop + FXCSMADD f5, A9, A8, f5 + LFPDUX A9, BO, INC4 + FXCPMADD f9, A10, A8, f9 + nop + FXCSMADD f13, A10, A8, f13 + LFPDUX A10, BO2, INC4 + bdnz+ .L22 + .align 4 + +.L23: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f4, B3, A3, f4 + FXCPMADD f8, B4, A3, f8 + FXCSMADD f12, B4, A3, f12 + + FXCPMADD f1, B3, A4, f1 + FXCSMADD f5, B3, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f4, B5, A5, f4 + FXCPMADD f8, B6, A5, f8 + FXCSMADD f12, B6, A5, f12 + + FXCPMADD f1, B5, A6, f1 + FXCSMADD f5, B5, A6, f5 + FXCPMADD f9, B6, A6, f9 + FXCSMADD f13, B6, A6, f13 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f4, A9, A7, f4 + FXCPMADD f8, A10, A7, f8 + FXCSMADD f12, A10, A7, f12 + + FXCPMADD f1, A9, A8, f1 + FXCSMADD f5, A9, A8, f5 + FXCPMADD f9, A10, A8, f9 + FXCSMADD f13, A10, A8, f13 + .align 4 + +.L24: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L28 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L28 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + bdz- .L27 + .align 4 + +.L26: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + bdnz+ .L26 + .align 4 + +.L27: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + .align 4 + +.L28: + fpadd f0, f0, f4 + fpadd f8, f8, f12 + fpadd f1, f1, f5 + fpadd f9, f9, f13 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC4 + LFPDUX f18, BO2, INC4 + LFPDUX f17, BO, INC4 + LFPDUX f19, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE +#endif + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f8, f18, f8 + fpsub f9, f19, f9 + +#ifdef LN + LFPDUX A1, AO, INC4 + add AO2, AO2, INC4 + LFPDUX A2, AO, INC4 + LFPDUX A3, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE + + fxpmul f4, A3, f1 + fxpmul f5, A3, f9 + FXCXNPMA f1, A3, f1, f4 + FXCXNPMA f9, A3, f9, f5 + + fxcpnmsub f0, A2, f1, f0 + fxcpnmsub f8, A2, f9, f8 + FXCXNSMA f0, A2, f1, f0 + FXCXNSMA f8, A2, f9, f8 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f8 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f8, A1, f8, f5 +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + add AO, AO, INC4 + LFPDUX A3, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE + + fxpmul f4, A1, f0 + fxpmul f5, A1, f8 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f8, A1, f8, f5 + + fxcpnmsub f1, A2, f0, f1 + fxcpnmsub f9, A2, f8, f9 + FXCXNSMA f1, A2, f0, f1 + FXCXNSMA f9, A2, f8, f9 + + fxpmul f6, A3, f1 + fxpmul f7, A3, f9 + FXCXNPMA f1, A3, f1, f6 + FXCXNPMA f9, A3, f9, f7 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + add BO, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + + fxcpnmsub f8, A2, f0, f8 + fxcpnmsub f9, A2, f1, f9 + + FXCXNSMA f8, A2, f0, f8 + FXCXNSMA f9, A2, f1, f9 + + fxpmul f4, A3, f8 + fxpmul f5, A3, f9 + + FXCXNPMA f8, A3, f8, f4 + FXCXNPMA f9, A3, f9, f5 +#endif + +#ifdef RT + LFPDUX A1, BO, INC4 + add BO2, BO2, INC4 + LFPDUX A2, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A3, f8 + fxpmul f5, A3, f9 + + FXCXNPMA f8, A3, f8, f4 + FXCXNPMA f9, A3, f9, f5 + + fxcpnmsub f0, A2, f8, f0 + fxcpnmsub f1, A2, f9, f1 + + FXCXNSMA f0, A2, f8, f0 + FXCXNSMA f1, A2, f9, f1 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f1, BO, INC4 + STFPDUX f9, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE +#else + STFPDUX f0, AO, INC4 + STFPDUX f1, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f9, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + STFDUX f8, CO2, INC + STFSDUX f8, CO2, INC + STFDUX f9, CO2, INC + STFSDUX f9, CO2, INC + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, r0 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L30: + andi. I, M, 1 + beq .L49 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 + + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L34 +#else +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, BO, - 4 * SIZE + fpmr f2, f0 + addi BO2, BO, 2 * SIZE + fpmr f3, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L34 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L33 + .align 4 + +.L32: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX B1, BO, INC4 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + LFPDUX B2, BO2, INC4 + LFPDUX A1, AO, INC4 + + FXCPMADD f0, B3, A2, f0 + FXCSMADD f1, B3, A2, f1 + LFPDUX B3, BO, INC4 + FXCPMADD f2, B4, A2, f2 + FXCSMADD f3, B4, A2, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A2, AO2, INC4 + + FXCPMADD f0, A5, A3, f0 + FXCSMADD f1, A5, A3, f1 + LFPDUX A5, BO, INC4 + FXCPMADD f2, A6, A3, f2 + FXCSMADD f3, A6, A3, f3 + LFPDUX A6, BO2, INC4 + LFPDUX A3, AO, INC4 + + FXCPMADD f0, A7, A4, f0 + FXCSMADD f1, A7, A4, f1 + LFPDUX A7, BO, INC4 + FXCPMADD f2, A8, A4, f2 + FXCSMADD f3, A8, A4, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L32 + .align 4 + +.L33: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + + FXCPMADD f0, B3, A2, f0 + FXCSMADD f1, B3, A2, f1 + FXCPMADD f2, B4, A2, f2 + FXCSMADD f3, B4, A2, f3 + + FXCPMADD f0, A5, A3, f0 + FXCSMADD f1, A5, A3, f1 + FXCPMADD f2, A6, A3, f2 + FXCSMADD f3, A6, A3, f3 + + FXCPMADD f0, A7, A4, f0 + FXCSMADD f1, A7, A4, f1 + FXCPMADD f2, A8, A4, f2 + FXCSMADD f3, A8, A4, f3 + .align 4 + +.L34: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L38 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L38 +#endif + + LFPDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdz- .L37 + .align 4 + +.L36: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX B1, BO, INC4 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + LFPDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdnz+ .L36 + .align 4 + +.L37: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + .align 4 + +.L38: + fpadd f0, f0, f1 + fpadd f2, f2, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 4 * SIZE +#endif + + addi AO2, AO, 2 * SIZE + addi BO2, BO, 2 * SIZE + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC4 + LFPDX f17, BO2, INC4 +#else + LFPDX f16, AO, INC4 + LFPDX f17, AO2, INC4 +#endif + + fpsub f0, f16, f0 + fpsub f2, f17, f2 + +#ifdef LN + LFPDX A1, AO, INC4 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f2 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f2, A1, f2, f5 +#endif + +#ifdef LT + LFPDX A1, AO, INC4 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f2 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f2, A1, f2, f5 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + add BO, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 + + fxcpnmsub f2, A2, f0, f2 + FXCXNSMA f2, A2, f0, f2 + + fxpmul f4, A3, f2 + FXCXNPMA f2, A3, f2, f4 +#endif + +#ifdef RT + LFPDUX A1, BO, INC4 + add BO2, BO2, INC4 + LFPDUX A2, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A3, f2 + FXCXNPMA f2, A3, f2, f4 + + fxcpnmsub f0, A2, f2, f0 + FXCXNSMA f0, A2, f2, f0 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC4 + STFPDX f2, BO2, INC4 +#else + STFPDX f0, AO, INC4 + STFPDX f2, AO2, INC4 +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f2, CO2, INC + STFSDUX f2, CO2, INC + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L49: +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 4 * SIZE +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + + addic. J, J, -1 + bgt+ .L10 + .align 4 + +.L999: + addi SP, SP, 20 + + lwzu r14, 4(SP) + lwzu r15, 4(SP) + + lwzu r16, 4(SP) + lwzu r17, 4(SP) + lwzu r18, 4(SP) + lwzu r19, 4(SP) + + lwzu r20, 4(SP) + lwzu r21, 4(SP) + lwzu r22, 4(SP) + lwzu r23, 4(SP) + + lwzu r24, 4(SP) + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f31, SP, r0 + lfpdux f30, SP, r0 + lfpdux f29, SP, r0 + lfpdux f28, SP, r0 + lfpdux f27, SP, r0 + lfpdux f26, SP, r0 + lfpdux f25, SP, r0 + lfpdux f24, SP, r0 + lfpdux f23, SP, r0 + lfpdux f22, SP, r0 + lfpdux f21, SP, r0 + lfpdux f20, SP, r0 + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + .align 4 + + + EPILOGUE +#endif diff --git a/kernel/power/ztrsm_kernel_power6_LN.S b/kernel/power/ztrsm_kernel_power6_LN.S new file mode 100644 index 0000000..7a3b286 --- /dev/null +++ b/kernel/power/ztrsm_kernel_power6_LN.S @@ -0,0 +1,4720 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define AORIG r19 +#define TEMP r20 +#define KK r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO1 r26 +#define CO2 r27 +#define CO3 r28 +#define CO4 r29 + +#define PREA r30 +#define PREC r31 + +#ifndef CONJ +#define FMA1 FMADD +#define FMA2 FMADD +#define FMA3 FNMSUB +#define FMA4 FMADD +#elif defined(LN) || defined(LT) +#define FMA1 FMADD +#define FMA2 FMADD +#define FMA3 FMADD +#define FMA4 FNMSUB +#else +#define FMA1 FMADD +#define FMA2 FNMSUB +#define FMA3 FMADD +#define FMA4 FMADD +#endif + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) +#endif + + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + li PREA, 48 * SIZE + li PREC, -4 * SIZE + + srawi. J, N, 2 + ble LL(30) + .align 4 + +LL(10): +#ifdef RT + slwi r0, K, 2 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + + andi. I, M, 1 + ble LL(20) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 2 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 4 + +LL(22): + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + LFD f28, 4 * SIZE(AO) + LFD f29, 5 * SIZE(AO) + LFD f30, 6 * SIZE(AO) + LFD f31, 7 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA4 f7, f17, f22, f7 + FMA2 f5, f16, f23, f5 + FMA3 f6, f17, f23, f6 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA1 f8, f16, f24, f8 + FMA4 f11, f17, f24, f11 + FMA2 f9, f16, f25, f9 + FMA3 f10, f17, f25, f10 + + FMA1 f12, f16, f26, f12 + FMA4 f15, f17, f26, f15 + FMA2 f13, f16, f27, f13 + FMA3 f14, f17, f27, f14 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMA1 f0, f18, f20, f0 + FMA4 f3, f19, f20, f3 + FMA2 f1, f18, f21, f1 + FMA3 f2, f19, f21, f2 + + FMA1 f4, f18, f22, f4 + FMA4 f7, f19, f22, f7 + FMA2 f5, f18, f23, f5 + FMA3 f6, f19, f23, f6 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMA1 f8, f18, f24, f8 + FMA4 f11, f19, f24, f11 + FMA2 f9, f18, f25, f9 + FMA3 f10, f19, f25, f10 + + FMA1 f12, f18, f26, f12 + FMA4 f15, f19, f26, f15 + FMA2 f13, f18, f27, f13 + FMA3 f14, f19, f27, f14 + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA4 f3, f29, f20, f3 + FMA2 f1, f28, f21, f1 + FMA3 f2, f29, f21, f2 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA4 f7, f29, f22, f7 + FMA2 f5, f28, f23, f5 + FMA3 f6, f29, f23, f6 + + LFD f20, 24 * SIZE(BO) + LFD f21, 25 * SIZE(BO) + LFD f22, 26 * SIZE(BO) + LFD f23, 27 * SIZE(BO) + + FMA1 f8, f28, f24, f8 + FMA4 f11, f29, f24, f11 + FMA2 f9, f28, f25, f9 + FMA3 f10, f29, f25, f10 + + FMA1 f12, f28, f26, f12 + FMA4 f15, f29, f26, f15 + FMA2 f13, f28, f27, f13 + FMA3 f14, f29, f27, f14 + + LFD f24, 28 * SIZE(BO) + LFD f25, 29 * SIZE(BO) + LFD f26, 30 * SIZE(BO) + LFD f27, 31 * SIZE(BO) + + FMA1 f0, f30, f20, f0 + FMA4 f3, f31, f20, f3 + FMA2 f1, f30, f21, f1 + FMA3 f2, f31, f21, f2 + + FMA1 f4, f30, f22, f4 + FMA4 f7, f31, f22, f7 + FMA2 f5, f30, f23, f5 + FMA3 f6, f31, f23, f6 + + LFD f20, 32 * SIZE(BO) + LFD f21, 33 * SIZE(BO) + LFD f22, 34 * SIZE(BO) + LFD f23, 35 * SIZE(BO) + + FMA1 f8, f30, f24, f8 + FMA4 f11, f31, f24, f11 + FMA2 f9, f30, f25, f9 + FMA3 f10, f31, f25, f10 + + FMA1 f12, f30, f26, f12 + FMA4 f15, f31, f26, f15 + FMA2 f13, f30, f27, f13 + FMA3 f14, f31, f27, f14 + + LFD f24, 36 * SIZE(BO) + LFD f25, 37 * SIZE(BO) + LFD f26, 38 * SIZE(BO) + LFD f27, 39 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 32 * SIZE + bdnz LL(22) + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(27) + .align 4 + +LL(26): + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + FMA1 f4, f16, f22, f4 + FMA4 f7, f17, f22, f7 + FMA2 f5, f16, f23, f5 + FMA3 f6, f17, f23, f6 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA1 f8, f16, f24, f8 + FMA4 f11, f17, f24, f11 + FMA2 f9, f16, f25, f9 + FMA3 f10, f17, f25, f10 + + FMA1 f12, f16, f26, f12 + FMA4 f15, f17, f26, f15 + FMA2 f13, f16, f27, f13 + FMA3 f14, f17, f27, f14 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(26) + .align 4 + +LL(27): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 2 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + FADD f0, f0, f2 + FADD f1, f1, f3 + FADD f4, f4, f6 + FADD f5, f5, f7 + + FADD f8, f8, f10 + FADD f9, f9, f11 + FADD f12, f12, f14 + FADD f13, f13, f15 + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f18, f4 + FSUB f5, f19, f5 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f12, f22, f12 + FSUB f13, f23, f13 + +#else + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f28, 0 * SIZE(AO) + LFD f29, 1 * SIZE(AO) + + FMUL f16, f29, f1 + FMUL f17, f29, f0 + FMUL f18, f29, f5 + FMUL f19, f29, f4 + + FMUL f20, f29, f9 + FMUL f21, f29, f8 + FMUL f22, f29, f13 + FMUL f23, f29, f12 + +#ifndef CONJ + FMSUB f0, f28, f0, f16 + FMADD f1, f28, f1, f17 + FMSUB f4, f28, f4, f18 + FMADD f5, f28, f5, f19 + + FMSUB f8, f28, f8, f20 + FMADD f9, f28, f9, f21 + FMSUB f12, f28, f12, f22 + FMADD f13, f28, f13, f23 +#else + + FMADD f0, f28, f0, f16 + FMSUB f1, f28, f1, f17 + FMADD f4, f28, f4, f18 + FMSUB f5, f28, f5, f19 + + FMADD f8, f28, f8, f20 + FMSUB f9, f28, f9, f21 + FMADD f12, f28, f12, f22 + FMSUB f13, f28, f13, f23 +#endif +#endif + +#ifdef LT + LFD f24, 0 * SIZE(AO) + LFD f25, 1 * SIZE(AO) + + FMUL f16, f25, f1 + FMUL f17, f25, f0 + FMUL f18, f25, f5 + FMUL f19, f25, f4 + + FMUL f20, f25, f9 + FMUL f21, f25, f8 + FMUL f22, f25, f13 + FMUL f23, f25, f12 + +#ifndef CONJ + + FMSUB f0, f24, f0, f16 + FMADD f1, f24, f1, f17 + FMSUB f4, f24, f4, f18 + FMADD f5, f24, f5, f19 + + FMSUB f8, f24, f8, f20 + FMADD f9, f24, f9, f21 + FMSUB f12, f24, f12, f22 + FMADD f13, f24, f13, f23 + +#else + + + FMADD f0, f24, f0, f16 + FMSUB f1, f24, f1, f17 + FMADD f4, f24, f4, f18 + FMSUB f5, f24, f5, f19 + + FMADD f8, f24, f8, f20 + FMSUB f9, f24, f9, f21 + FMADD f12, f24, f12, f22 + FMSUB f13, f24, f13, f23 + +#endif +#endif + +#ifdef RN + LFD f24, 0 * SIZE(BO) + LFD f25, 1 * SIZE(BO) + LFD f26, 2 * SIZE(BO) + LFD f27, 3 * SIZE(BO) + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) + + FMUL f16, f25, f1 + FMUL f17, f25, f0 + +#ifndef CONJ + + FMSUB f0, f24, f0, f16 + FMADD f1, f24, f1, f17 + + FMADD f4, f27, f1, f4 + FNMSUB f5, f27, f0, f5 + FNMSUB f4, f26, f0, f4 + FNMSUB f5, f26, f1, f5 + + FMADD f8, f29, f1, f8 + FNMSUB f9, f29, f0, f9 + FNMSUB f8, f28, f0, f8 + FNMSUB f9, f28, f1, f9 + + FMADD f12, f31, f1, f12 + FNMSUB f13, f31, f0, f13 + FNMSUB f12, f30, f0, f12 + FNMSUB f13, f30, f1, f13 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMSUB f4, f26, f4, f16 + FMADD f5, f26, f5, f17 + + FMADD f8, f29, f5, f8 + FNMSUB f9, f29, f4, f9 + FNMSUB f8, f28, f4, f8 + FNMSUB f9, f28, f5, f9 + + FMADD f12, f31, f5, f12 + FNMSUB f13, f31, f4, f13 + FNMSUB f12, f30, f4, f12 + FNMSUB f13, f30, f5, f13 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 22 * SIZE(BO) + LFD f29, 23 * SIZE(BO) + LFD f30, 30 * SIZE(BO) + LFD f31, 31 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMSUB f8, f26, f8, f16 + FMADD f9, f26, f9, f17 + + FMADD f12, f29, f9, f12 + FNMSUB f13, f29, f8, f13 + FNMSUB f12, f28, f8, f12 + FNMSUB f13, f28, f9, f13 + + FMUL f16, f31, f13 + FMUL f17, f31, f12 + FMSUB f12, f30, f12, f16 + FMADD f13, f30, f13, f17 + +#else + + FMADD f0, f24, f0, f16 + FMSUB f1, f24, f1, f17 + + FMSUB f4, f27, f1, f4 + FNMADD f5, f27, f0, f5 + FNMADD f4, f26, f0, f4 + FNMADD f5, f26, f1, f5 + + FMSUB f8, f29, f1, f8 + FNMADD f9, f29, f0, f9 + FNMADD f8, f28, f0, f8 + FNMADD f9, f28, f1, f9 + + FMSUB f12, f31, f1, f12 + FNMADD f13, f31, f0, f13 + FNMADD f12, f30, f0, f12 + FNMADD f13, f30, f1, f13 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMADD f4, f26, f4, f16 + FMSUB f5, f26, f5, f17 + + FMSUB f8, f29, f5, f8 + FNMADD f9, f29, f4, f9 + FNMADD f8, f28, f4, f8 + FNMADD f9, f28, f5, f9 + + FMSUB f12, f31, f5, f12 + FNMADD f13, f31, f4, f13 + FNMADD f12, f30, f4, f12 + FNMADD f13, f30, f5, f13 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 22 * SIZE(BO) + LFD f29, 23 * SIZE(BO) + LFD f30, 30 * SIZE(BO) + LFD f31, 31 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMADD f8, f26, f8, f16 + FMSUB f9, f26, f9, f17 + + FMSUB f12, f29, f9, f12 + FNMADD f13, f29, f8, f13 + FNMADD f12, f28, f8, f12 + FNMADD f13, f28, f9, f13 + + FMUL f16, f31, f13 + FMUL f17, f31, f12 + FMADD f12, f30, f12, f16 + FMSUB f13, f30, f13, f17 +#endif + +#endif + +#ifdef RT + LFD f24, 30 * SIZE(BO) + LFD f25, 31 * SIZE(BO) + LFD f26, 28 * SIZE(BO) + LFD f27, 29 * SIZE(BO) + LFD f28, 26 * SIZE(BO) + LFD f29, 27 * SIZE(BO) + LFD f30, 24 * SIZE(BO) + LFD f31, 25 * SIZE(BO) + + FMUL f16, f25, f13 + FMUL f17, f25, f12 + +#ifndef CONJ + + FMSUB f12, f24, f12, f16 + FMADD f13, f24, f13, f17 + + FMADD f8, f27, f13, f8 + FNMSUB f9, f27, f12, f9 + FNMSUB f8, f26, f12, f8 + FNMSUB f9, f26, f13, f9 + + FMADD f4, f29, f13, f4 + FNMSUB f5, f29, f12, f5 + FNMSUB f4, f28, f12, f4 + FNMSUB f5, f28, f13, f5 + + FMADD f0, f31, f13, f0 + FNMSUB f1, f31, f12, f1 + FNMSUB f0, f30, f12, f0 + FNMSUB f1, f30, f13, f1 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 18 * SIZE(BO) + LFD f29, 19 * SIZE(BO) + LFD f30, 16 * SIZE(BO) + LFD f31, 17 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMSUB f8, f26, f8, f16 + FMADD f9, f26, f9, f17 + + FMADD f4, f29, f9, f4 + FNMSUB f5, f29, f8, f5 + FNMSUB f4, f28, f8, f4 + FNMSUB f5, f28, f9, f5 + + FMADD f0, f31, f9, f0 + FNMSUB f1, f31, f8, f1 + FNMSUB f0, f30, f8, f0 + FNMSUB f1, f30, f9, f1 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 8 * SIZE(BO) + LFD f29, 9 * SIZE(BO) + LFD f30, 0 * SIZE(BO) + LFD f31, 1 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMSUB f4, f26, f4, f16 + FMADD f5, f26, f5, f17 + + FMADD f0, f29, f5, f0 + FNMSUB f1, f29, f4, f1 + FNMSUB f0, f28, f4, f0 + FNMSUB f1, f28, f5, f1 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMSUB f0, f30, f0, f16 + FMADD f1, f30, f1, f17 + +#else + FMADD f12, f24, f12, f16 + FMSUB f13, f24, f13, f17 + + FMSUB f8, f27, f13, f8 + FNMADD f9, f27, f12, f9 + FNMADD f8, f26, f12, f8 + FNMADD f9, f26, f13, f9 + + FMSUB f4, f29, f13, f4 + FNMADD f5, f29, f12, f5 + FNMADD f4, f28, f12, f4 + FNMADD f5, f28, f13, f5 + + FMSUB f0, f31, f13, f0 + FNMADD f1, f31, f12, f1 + FNMADD f0, f30, f12, f0 + FNMADD f1, f30, f13, f1 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 18 * SIZE(BO) + LFD f29, 19 * SIZE(BO) + LFD f30, 16 * SIZE(BO) + LFD f31, 17 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMADD f8, f26, f8, f16 + FMSUB f9, f26, f9, f17 + + FMSUB f4, f29, f9, f4 + FNMADD f5, f29, f8, f5 + FNMADD f4, f28, f8, f4 + FNMADD f5, f28, f9, f5 + + FMSUB f0, f31, f9, f0 + FNMADD f1, f31, f8, f1 + FNMADD f0, f30, f8, f0 + FNMADD f1, f30, f9, f1 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 8 * SIZE(BO) + LFD f29, 9 * SIZE(BO) + LFD f30, 0 * SIZE(BO) + LFD f31, 1 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMADD f4, f26, f4, f16 + FMSUB f5, f26, f5, f17 + + FMSUB f0, f29, f5, f0 + FNMADD f1, f29, f4, f1 + FNMADD f0, f28, f4, f0 + FNMADD f1, f28, f5, f1 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMADD f0, f30, f0, f16 + FMSUB f1, f30, f1, f17 + +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f4, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + STFD f8, 4 * SIZE(BO) + STFD f9, 5 * SIZE(BO) + STFD f12, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 2 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(20): + srawi. I, M, 1 + ble LL(29) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f20, 0 * SIZE(B) + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(B) + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(B) + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(B) + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 2 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f20, 0 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +LL(12): + dcbt AO, PREA + dcbtst BO, PREA + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 4 * SIZE(AO) + LFD f29, 5 * SIZE(AO) + LFD f30, 6 * SIZE(AO) + LFD f31, 7 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 24 * SIZE(BO) + LFD f21, 25 * SIZE(BO) + LFD f22, 26 * SIZE(BO) + LFD f23, 27 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 28 * SIZE(BO) + LFD f25, 29 * SIZE(BO) + LFD f26, 30 * SIZE(BO) + LFD f27, 31 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 32 * SIZE(BO) + LFD f21, 33 * SIZE(BO) + LFD f22, 34 * SIZE(BO) + LFD f23, 35 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 36 * SIZE(BO) + LFD f25, 37 * SIZE(BO) + LFD f26, 38 * SIZE(BO) + LFD f27, 39 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 20 * SIZE(AO) + LFD f29, 21 * SIZE(AO) + LFD f30, 22 * SIZE(AO) + LFD f31, 23 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 40 * SIZE(BO) + LFD f21, 41 * SIZE(BO) + LFD f22, 42 * SIZE(BO) + LFD f23, 43 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 44 * SIZE(BO) + LFD f25, 45 * SIZE(BO) + LFD f26, 46 * SIZE(BO) + LFD f27, 47 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 24 * SIZE(AO) + LFD f17, 25 * SIZE(AO) + LFD f18, 26 * SIZE(AO) + LFD f19, 27 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 48 * SIZE(BO) + LFD f21, 49 * SIZE(BO) + LFD f22, 50 * SIZE(BO) + LFD f23, 51 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 52 * SIZE(BO) + LFD f25, 53 * SIZE(BO) + LFD f26, 54 * SIZE(BO) + LFD f27, 55 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 28 * SIZE(AO) + LFD f29, 29 * SIZE(AO) + LFD f30, 30 * SIZE(AO) + LFD f31, 31 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 56 * SIZE(BO) + LFD f21, 57 * SIZE(BO) + LFD f22, 58 * SIZE(BO) + LFD f23, 59 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 60 * SIZE(BO) + LFD f25, 61 * SIZE(BO) + LFD f26, 62 * SIZE(BO) + LFD f27, 63 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 32 * SIZE(AO) + LFD f17, 33 * SIZE(AO) + LFD f18, 34 * SIZE(AO) + LFD f19, 35 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 64 * SIZE(BO) + LFD f21, 65 * SIZE(BO) + LFD f22, 66 * SIZE(BO) + LFD f23, 67 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 68 * SIZE(BO) + LFD f25, 69 * SIZE(BO) + LFD f26, 70 * SIZE(BO) + LFD f27, 71 * SIZE(BO) + + addi AO, AO, 32 * SIZE + addi BO, BO, 64 * SIZE + + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble LL(18) + .align 4 + +LL(16): + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + + bdnz LL(16) + .align 4 + +LL(18): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 2 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f18, f4 + FSUB f5, f19, f5 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f12, f22, f12 + FSUB f13, f23, f13 + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + FSUB f2, f24, f2 + FSUB f3, f25, f3 + FSUB f6, f26, f6 + FSUB f7, f27, f7 + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f10, f28, f10 + FSUB f11, f29, f11 + FSUB f14, f30, f14 + FSUB f15, f31, f15 + +#else + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f24, 6 * SIZE(AO) + LFD f25, 7 * SIZE(AO) + LFD f26, 4 * SIZE(AO) + LFD f27, 5 * SIZE(AO) + LFD f28, 0 * SIZE(AO) + LFD f29, 1 * SIZE(AO) + + FMUL f16, f25, f3 + FMUL f17, f25, f2 + FMUL f18, f25, f7 + FMUL f19, f25, f6 + + FMUL f20, f25, f11 + FMUL f21, f25, f10 + FMUL f22, f25, f15 + FMUL f23, f25, f14 + +#ifndef CONJ + + FMSUB f2, f24, f2, f16 + FMADD f3, f24, f3, f17 + FMSUB f6, f24, f6, f18 + FMADD f7, f24, f7, f19 + + FMSUB f10, f24, f10, f20 + FMADD f11, f24, f11, f21 + FMSUB f14, f24, f14, f22 + FMADD f15, f24, f15, f23 + + FMADD f0, f27, f3, f0 + FNMSUB f1, f27, f2, f1 + FMADD f4, f27, f7, f4 + FNMSUB f5, f27, f6, f5 + + FMADD f8, f27, f11, f8 + FNMSUB f9, f27, f10, f9 + FMADD f12, f27, f15, f12 + FNMSUB f13, f27, f14, f13 + + FNMSUB f0, f26, f2, f0 + FNMSUB f1, f26, f3, f1 + FNMSUB f4, f26, f6, f4 + FNMSUB f5, f26, f7, f5 + + FNMSUB f8, f26, f10, f8 + FNMSUB f9, f26, f11, f9 + FNMSUB f12, f26, f14, f12 + FNMSUB f13, f26, f15, f13 + + FMUL f16, f29, f1 + FMUL f17, f29, f0 + FMUL f18, f29, f5 + FMUL f19, f29, f4 + + FMUL f20, f29, f9 + FMUL f21, f29, f8 + FMUL f22, f29, f13 + FMUL f23, f29, f12 + + FMSUB f0, f28, f0, f16 + FMADD f1, f28, f1, f17 + FMSUB f4, f28, f4, f18 + FMADD f5, f28, f5, f19 + + FMSUB f8, f28, f8, f20 + FMADD f9, f28, f9, f21 + FMSUB f12, f28, f12, f22 + FMADD f13, f28, f13, f23 +#else + + FMADD f2, f24, f2, f16 + FMSUB f3, f24, f3, f17 + FMADD f6, f24, f6, f18 + FMSUB f7, f24, f7, f19 + + FMADD f10, f24, f10, f20 + FMSUB f11, f24, f11, f21 + FMADD f14, f24, f14, f22 + FMSUB f15, f24, f15, f23 + + FMSUB f0, f27, f3, f0 + FNMADD f1, f27, f2, f1 + FMSUB f4, f27, f7, f4 + FNMADD f5, f27, f6, f5 + + FMSUB f8, f27, f11, f8 + FNMADD f9, f27, f10, f9 + FMSUB f12, f27, f15, f12 + FNMADD f13, f27, f14, f13 + + FNMADD f0, f26, f2, f0 + FNMADD f1, f26, f3, f1 + FNMADD f4, f26, f6, f4 + FNMADD f5, f26, f7, f5 + + FNMADD f8, f26, f10, f8 + FNMADD f9, f26, f11, f9 + FNMADD f12, f26, f14, f12 + FNMADD f13, f26, f15, f13 + + FMUL f16, f29, f1 + FMUL f17, f29, f0 + FMUL f18, f29, f5 + FMUL f19, f29, f4 + + FMUL f20, f29, f9 + FMUL f21, f29, f8 + FMUL f22, f29, f13 + FMUL f23, f29, f12 + + FMADD f0, f28, f0, f16 + FMSUB f1, f28, f1, f17 + FMADD f4, f28, f4, f18 + FMSUB f5, f28, f5, f19 + + FMADD f8, f28, f8, f20 + FMSUB f9, f28, f9, f21 + FMADD f12, f28, f12, f22 + FMSUB f13, f28, f13, f23 +#endif +#endif + +#ifdef LT + LFD f24, 0 * SIZE(AO) + LFD f25, 1 * SIZE(AO) + LFD f26, 2 * SIZE(AO) + LFD f27, 3 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FMUL f16, f25, f1 + FMUL f17, f25, f0 + FMUL f18, f25, f5 + FMUL f19, f25, f4 + + FMUL f20, f25, f9 + FMUL f21, f25, f8 + FMUL f22, f25, f13 + FMUL f23, f25, f12 + +#ifndef CONJ + FMSUB f0, f24, f0, f16 + FMADD f1, f24, f1, f17 + FMSUB f4, f24, f4, f18 + FMADD f5, f24, f5, f19 + + FMSUB f8, f24, f8, f20 + FMADD f9, f24, f9, f21 + FMSUB f12, f24, f12, f22 + FMADD f13, f24, f13, f23 + + FMADD f2, f27, f1, f2 + FNMSUB f3, f27, f0, f3 + FMADD f6, f27, f5, f6 + FNMSUB f7, f27, f4, f7 + + FMADD f10, f27, f9, f10 + FNMSUB f11, f27, f8, f11 + FMADD f14, f27, f13, f14 + FNMSUB f15, f27, f12, f15 + + FNMSUB f2, f26, f0, f2 + FNMSUB f3, f26, f1, f3 + FNMSUB f6, f26, f4, f6 + FNMSUB f7, f26, f5, f7 + + FNMSUB f10, f26, f8, f10 + FNMSUB f11, f26, f9, f11 + FNMSUB f14, f26, f12, f14 + FNMSUB f15, f26, f13, f15 + + FMUL f16, f29, f3 + FMUL f17, f29, f2 + FMUL f18, f29, f7 + FMUL f19, f29, f6 + + FMUL f20, f29, f11 + FMUL f21, f29, f10 + FMUL f22, f29, f15 + FMUL f23, f29, f14 + + FMSUB f2, f28, f2, f16 + FMADD f3, f28, f3, f17 + FMSUB f6, f28, f6, f18 + FMADD f7, f28, f7, f19 + + FMSUB f10, f28, f10, f20 + FMADD f11, f28, f11, f21 + FMSUB f14, f28, f14, f22 + FMADD f15, f28, f15, f23 +#else + + FMADD f0, f24, f0, f16 + FMSUB f1, f24, f1, f17 + FMADD f4, f24, f4, f18 + FMSUB f5, f24, f5, f19 + + FMADD f8, f24, f8, f20 + FMSUB f9, f24, f9, f21 + FMADD f12, f24, f12, f22 + FMSUB f13, f24, f13, f23 + + FMSUB f2, f27, f1, f2 + FNMADD f3, f27, f0, f3 + FMSUB f6, f27, f5, f6 + FNMADD f7, f27, f4, f7 + + FMSUB f10, f27, f9, f10 + FNMADD f11, f27, f8, f11 + FMSUB f14, f27, f13, f14 + FNMADD f15, f27, f12, f15 + + FNMADD f2, f26, f0, f2 + FNMADD f3, f26, f1, f3 + FNMADD f6, f26, f4, f6 + FNMADD f7, f26, f5, f7 + + FNMADD f10, f26, f8, f10 + FNMADD f11, f26, f9, f11 + FNMADD f14, f26, f12, f14 + FNMADD f15, f26, f13, f15 + + FMUL f16, f29, f3 + FMUL f17, f29, f2 + FMUL f18, f29, f7 + FMUL f19, f29, f6 + + FMUL f20, f29, f11 + FMUL f21, f29, f10 + FMUL f22, f29, f15 + FMUL f23, f29, f14 + + FMADD f2, f28, f2, f16 + FMSUB f3, f28, f3, f17 + FMADD f6, f28, f6, f18 + FMSUB f7, f28, f7, f19 + + FMADD f10, f28, f10, f20 + FMSUB f11, f28, f11, f21 + FMADD f14, f28, f14, f22 + FMSUB f15, f28, f15, f23 +#endif +#endif + +#ifdef RN + LFD f24, 0 * SIZE(BO) + LFD f25, 1 * SIZE(BO) + LFD f26, 2 * SIZE(BO) + LFD f27, 3 * SIZE(BO) + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) + + FMUL f16, f25, f1 + FMUL f17, f25, f0 + FMUL f18, f25, f3 + FMUL f19, f25, f2 + +#ifndef CONJ + + FMSUB f0, f24, f0, f16 + FMADD f1, f24, f1, f17 + FMSUB f2, f24, f2, f18 + FMADD f3, f24, f3, f19 + + FMADD f4, f27, f1, f4 + FNMSUB f5, f27, f0, f5 + FMADD f6, f27, f3, f6 + FNMSUB f7, f27, f2, f7 + + FNMSUB f4, f26, f0, f4 + FNMSUB f5, f26, f1, f5 + FNMSUB f6, f26, f2, f6 + FNMSUB f7, f26, f3, f7 + + FMADD f8, f29, f1, f8 + FNMSUB f9, f29, f0, f9 + FMADD f10, f29, f3, f10 + FNMSUB f11, f29, f2, f11 + + FNMSUB f8, f28, f0, f8 + FNMSUB f9, f28, f1, f9 + FNMSUB f10, f28, f2, f10 + FNMSUB f11, f28, f3, f11 + + FMADD f12, f31, f1, f12 + FNMSUB f13, f31, f0, f13 + FMADD f14, f31, f3, f14 + FNMSUB f15, f31, f2, f15 + + FNMSUB f12, f30, f0, f12 + FNMSUB f13, f30, f1, f13 + FNMSUB f14, f30, f2, f14 + FNMSUB f15, f30, f3, f15 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMUL f18, f27, f7 + FMUL f19, f27, f6 + + FMSUB f4, f26, f4, f16 + FMADD f5, f26, f5, f17 + FMSUB f6, f26, f6, f18 + FMADD f7, f26, f7, f19 + + FMADD f8, f29, f5, f8 + FNMSUB f9, f29, f4, f9 + FMADD f10, f29, f7, f10 + FNMSUB f11, f29, f6, f11 + + FNMSUB f8, f28, f4, f8 + FNMSUB f9, f28, f5, f9 + FNMSUB f10, f28, f6, f10 + FNMSUB f11, f28, f7, f11 + + FMADD f12, f31, f5, f12 + FNMSUB f13, f31, f4, f13 + FMADD f14, f31, f7, f14 + FNMSUB f15, f31, f6, f15 + + FNMSUB f12, f30, f4, f12 + FNMSUB f13, f30, f5, f13 + FNMSUB f14, f30, f6, f14 + FNMSUB f15, f30, f7, f15 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 22 * SIZE(BO) + LFD f29, 23 * SIZE(BO) + LFD f30, 30 * SIZE(BO) + LFD f31, 31 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMUL f18, f27, f11 + FMUL f19, f27, f10 + + FMSUB f8, f26, f8, f16 + FMADD f9, f26, f9, f17 + FMSUB f10, f26, f10, f18 + FMADD f11, f26, f11, f19 + + FMADD f12, f29, f9, f12 + FNMSUB f13, f29, f8, f13 + FMADD f14, f29, f11, f14 + FNMSUB f15, f29, f10, f15 + + FNMSUB f12, f28, f8, f12 + FNMSUB f13, f28, f9, f13 + FNMSUB f14, f28, f10, f14 + FNMSUB f15, f28, f11, f15 + + FMUL f16, f31, f13 + FMUL f17, f31, f12 + FMUL f18, f31, f15 + FMUL f19, f31, f14 + + FMSUB f12, f30, f12, f16 + FMADD f13, f30, f13, f17 + FMSUB f14, f30, f14, f18 + FMADD f15, f30, f15, f19 + +#else + + FMADD f0, f24, f0, f16 + FMSUB f1, f24, f1, f17 + FMADD f2, f24, f2, f18 + FMSUB f3, f24, f3, f19 + + FMSUB f4, f27, f1, f4 + FNMADD f5, f27, f0, f5 + FMSUB f6, f27, f3, f6 + FNMADD f7, f27, f2, f7 + + FNMADD f4, f26, f0, f4 + FNMADD f5, f26, f1, f5 + FNMADD f6, f26, f2, f6 + FNMADD f7, f26, f3, f7 + + FMSUB f8, f29, f1, f8 + FNMADD f9, f29, f0, f9 + FMSUB f10, f29, f3, f10 + FNMADD f11, f29, f2, f11 + + FNMADD f8, f28, f0, f8 + FNMADD f9, f28, f1, f9 + FNMADD f10, f28, f2, f10 + FNMADD f11, f28, f3, f11 + + FMSUB f12, f31, f1, f12 + FNMADD f13, f31, f0, f13 + FMSUB f14, f31, f3, f14 + FNMADD f15, f31, f2, f15 + + FNMADD f12, f30, f0, f12 + FNMADD f13, f30, f1, f13 + FNMADD f14, f30, f2, f14 + FNMADD f15, f30, f3, f15 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMUL f18, f27, f7 + FMUL f19, f27, f6 + + FMADD f4, f26, f4, f16 + FMSUB f5, f26, f5, f17 + FMADD f6, f26, f6, f18 + FMSUB f7, f26, f7, f19 + + FMSUB f8, f29, f5, f8 + FNMADD f9, f29, f4, f9 + FMSUB f10, f29, f7, f10 + FNMADD f11, f29, f6, f11 + + FNMADD f8, f28, f4, f8 + FNMADD f9, f28, f5, f9 + FNMADD f10, f28, f6, f10 + FNMADD f11, f28, f7, f11 + + FMSUB f12, f31, f5, f12 + FNMADD f13, f31, f4, f13 + FMSUB f14, f31, f7, f14 + FNMADD f15, f31, f6, f15 + + FNMADD f12, f30, f4, f12 + FNMADD f13, f30, f5, f13 + FNMADD f14, f30, f6, f14 + FNMADD f15, f30, f7, f15 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 22 * SIZE(BO) + LFD f29, 23 * SIZE(BO) + LFD f30, 30 * SIZE(BO) + LFD f31, 31 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMUL f18, f27, f11 + FMUL f19, f27, f10 + + FMADD f8, f26, f8, f16 + FMSUB f9, f26, f9, f17 + FMADD f10, f26, f10, f18 + FMSUB f11, f26, f11, f19 + + FMSUB f12, f29, f9, f12 + FNMADD f13, f29, f8, f13 + FMSUB f14, f29, f11, f14 + FNMADD f15, f29, f10, f15 + + FNMADD f12, f28, f8, f12 + FNMADD f13, f28, f9, f13 + FNMADD f14, f28, f10, f14 + FNMADD f15, f28, f11, f15 + + FMUL f16, f31, f13 + FMUL f17, f31, f12 + FMUL f18, f31, f15 + FMUL f19, f31, f14 + + FMADD f12, f30, f12, f16 + FMSUB f13, f30, f13, f17 + FMADD f14, f30, f14, f18 + FMSUB f15, f30, f15, f19 +#endif + +#endif + +#ifdef RT + LFD f24, 30 * SIZE(BO) + LFD f25, 31 * SIZE(BO) + LFD f26, 28 * SIZE(BO) + LFD f27, 29 * SIZE(BO) + LFD f28, 26 * SIZE(BO) + LFD f29, 27 * SIZE(BO) + LFD f30, 24 * SIZE(BO) + LFD f31, 25 * SIZE(BO) + + FMUL f16, f25, f13 + FMUL f17, f25, f12 + FMUL f18, f25, f15 + FMUL f19, f25, f14 + +#ifndef CONJ + + FMSUB f12, f24, f12, f16 + FMADD f13, f24, f13, f17 + FMSUB f14, f24, f14, f18 + FMADD f15, f24, f15, f19 + + FMADD f8, f27, f13, f8 + FNMSUB f9, f27, f12, f9 + FMADD f10, f27, f15, f10 + FNMSUB f11, f27, f14, f11 + + FNMSUB f8, f26, f12, f8 + FNMSUB f9, f26, f13, f9 + FNMSUB f10, f26, f14, f10 + FNMSUB f11, f26, f15, f11 + + FMADD f4, f29, f13, f4 + FNMSUB f5, f29, f12, f5 + FMADD f6, f29, f15, f6 + FNMSUB f7, f29, f14, f7 + + FNMSUB f4, f28, f12, f4 + FNMSUB f5, f28, f13, f5 + FNMSUB f6, f28, f14, f6 + FNMSUB f7, f28, f15, f7 + + FMADD f0, f31, f13, f0 + FNMSUB f1, f31, f12, f1 + FMADD f2, f31, f15, f2 + FNMSUB f3, f31, f14, f3 + + FNMSUB f0, f30, f12, f0 + FNMSUB f1, f30, f13, f1 + FNMSUB f2, f30, f14, f2 + FNMSUB f3, f30, f15, f3 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 18 * SIZE(BO) + LFD f29, 19 * SIZE(BO) + LFD f30, 16 * SIZE(BO) + LFD f31, 17 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMUL f18, f27, f11 + FMUL f19, f27, f10 + + FMSUB f8, f26, f8, f16 + FMADD f9, f26, f9, f17 + FMSUB f10, f26, f10, f18 + FMADD f11, f26, f11, f19 + + FMADD f4, f29, f9, f4 + FNMSUB f5, f29, f8, f5 + FMADD f6, f29, f11, f6 + FNMSUB f7, f29, f10, f7 + + FNMSUB f4, f28, f8, f4 + FNMSUB f5, f28, f9, f5 + FNMSUB f6, f28, f10, f6 + FNMSUB f7, f28, f11, f7 + + FMADD f0, f31, f9, f0 + FNMSUB f1, f31, f8, f1 + FMADD f2, f31, f11, f2 + FNMSUB f3, f31, f10, f3 + + FNMSUB f0, f30, f8, f0 + FNMSUB f1, f30, f9, f1 + FNMSUB f2, f30, f10, f2 + FNMSUB f3, f30, f11, f3 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 8 * SIZE(BO) + LFD f29, 9 * SIZE(BO) + LFD f30, 0 * SIZE(BO) + LFD f31, 1 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMUL f18, f27, f7 + FMUL f19, f27, f6 + + FMSUB f4, f26, f4, f16 + FMADD f5, f26, f5, f17 + FMSUB f6, f26, f6, f18 + FMADD f7, f26, f7, f19 + + FMADD f0, f29, f5, f0 + FNMSUB f1, f29, f4, f1 + FMADD f2, f29, f7, f2 + FNMSUB f3, f29, f6, f3 + + FNMSUB f0, f28, f4, f0 + FNMSUB f1, f28, f5, f1 + FNMSUB f2, f28, f6, f2 + FNMSUB f3, f28, f7, f3 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMUL f18, f31, f3 + FMUL f19, f31, f2 + + FMSUB f0, f30, f0, f16 + FMADD f1, f30, f1, f17 + FMSUB f2, f30, f2, f18 + FMADD f3, f30, f3, f19 + +#else + + FMADD f12, f24, f12, f16 + FMSUB f13, f24, f13, f17 + FMADD f14, f24, f14, f18 + FMSUB f15, f24, f15, f19 + + FMSUB f8, f27, f13, f8 + FNMADD f9, f27, f12, f9 + FMSUB f10, f27, f15, f10 + FNMADD f11, f27, f14, f11 + + FNMADD f8, f26, f12, f8 + FNMADD f9, f26, f13, f9 + FNMADD f10, f26, f14, f10 + FNMADD f11, f26, f15, f11 + + FMSUB f4, f29, f13, f4 + FNMADD f5, f29, f12, f5 + FMSUB f6, f29, f15, f6 + FNMADD f7, f29, f14, f7 + + FNMADD f4, f28, f12, f4 + FNMADD f5, f28, f13, f5 + FNMADD f6, f28, f14, f6 + FNMADD f7, f28, f15, f7 + + FMSUB f0, f31, f13, f0 + FNMADD f1, f31, f12, f1 + FMSUB f2, f31, f15, f2 + FNMADD f3, f31, f14, f3 + + FNMADD f0, f30, f12, f0 + FNMADD f1, f30, f13, f1 + FNMADD f2, f30, f14, f2 + FNMADD f3, f30, f15, f3 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 18 * SIZE(BO) + LFD f29, 19 * SIZE(BO) + LFD f30, 16 * SIZE(BO) + LFD f31, 17 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMUL f18, f27, f11 + FMUL f19, f27, f10 + + FMADD f8, f26, f8, f16 + FMSUB f9, f26, f9, f17 + FMADD f10, f26, f10, f18 + FMSUB f11, f26, f11, f19 + + FMSUB f4, f29, f9, f4 + FNMADD f5, f29, f8, f5 + FMSUB f6, f29, f11, f6 + FNMADD f7, f29, f10, f7 + + FNMADD f4, f28, f8, f4 + FNMADD f5, f28, f9, f5 + FNMADD f6, f28, f10, f6 + FNMADD f7, f28, f11, f7 + + FMSUB f0, f31, f9, f0 + FNMADD f1, f31, f8, f1 + FMSUB f2, f31, f11, f2 + FNMADD f3, f31, f10, f3 + + FNMADD f0, f30, f8, f0 + FNMADD f1, f30, f9, f1 + FNMADD f2, f30, f10, f2 + FNMADD f3, f30, f11, f3 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 8 * SIZE(BO) + LFD f29, 9 * SIZE(BO) + LFD f30, 0 * SIZE(BO) + LFD f31, 1 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMUL f18, f27, f7 + FMUL f19, f27, f6 + + FMADD f4, f26, f4, f16 + FMSUB f5, f26, f5, f17 + FMADD f6, f26, f6, f18 + FMSUB f7, f26, f7, f19 + + FMSUB f0, f29, f5, f0 + FNMADD f1, f29, f4, f1 + FMSUB f2, f29, f7, f2 + FNMADD f3, f29, f6, f3 + + FNMADD f0, f28, f4, f0 + FNMADD f1, f28, f5, f1 + FNMADD f2, f28, f6, f2 + FNMADD f3, f28, f7, f3 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMUL f18, f31, f3 + FMUL f19, f31, f2 + + FMADD f0, f30, f0, f16 + FMSUB f1, f30, f1, f17 + FMADD f2, f30, f2, f18 + FMSUB f3, f30, f3, f19 + +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f4, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + STFD f8, 4 * SIZE(BO) + STFD f9, 5 * SIZE(BO) + STFD f12, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f3, 9 * SIZE(BO) + STFD f6, 10 * SIZE(BO) + STFD f7, 11 * SIZE(BO) + STFD f10, 12 * SIZE(BO) + STFD f11, 13 * SIZE(BO) + STFD f14, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) + +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 2 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(11) + .align 4 + +LL(29): +#ifdef LN + slwi r0, K, 2 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(30): + andi. J, N, 2 + ble LL(50) + .align 4 + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + + andi. I, M, 1 + ble LL(40) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 4 + +LL(42): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f18, f26, f2 + FMADD f3, f18, f27, f3 + + FMADD f4, f19, f24, f4 + FMADD f5, f19, f25, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f18, f26, f2 + FMADD f3, f18, f27, f3 + + FMADD f4, f19, f24, f4 + FMADD f5, f19, f25, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi BO, BO, 16 * SIZE + addi AO, AO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(47) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(47): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else +#if defined(LN) || defined(LT) + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f3 + FMUL f13, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f12 + FMADD f3, f20, f3, f13 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f12 + FMSUB f3, f20, f3, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(40): + srawi. I, M, 1 + ble LL(49) + .align 4 + +LL(31): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbtst CO1, PREC + dcbtst CO2, PREC + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbtst CO1, PREC + dcbtst CO2, PREC + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(35) + .align 4 + +LL(32): + dcbt AO, PREA + dcbtst BO, PREA + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 4 * SIZE(AO) + LFD f28, 4 * SIZE(BO) + LFD f25, 5 * SIZE(AO) + LFD f29, 5 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 6 * SIZE(AO) + LFD f30, 6 * SIZE(BO) + LFD f27, 7 * SIZE(AO) + LFD f31, 7 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 8 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f17, 9 * SIZE(AO) + LFD f21, 9 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 10 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f19, 11 * SIZE(AO) + LFD f23, 11 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 12 * SIZE(AO) + LFD f28, 12 * SIZE(BO) + LFD f25, 13 * SIZE(AO) + LFD f29, 13 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 14 * SIZE(AO) + LFD f30, 14 * SIZE(BO) + LFD f27, 15 * SIZE(AO) + LFD f31, 15 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 16 * SIZE(AO) + LFD f20, 16 * SIZE(BO) + LFD f17, 17 * SIZE(AO) + LFD f21, 17 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 18 * SIZE(AO) + LFD f22, 18 * SIZE(BO) + LFD f19, 19 * SIZE(AO) + LFD f23, 19 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 20 * SIZE(AO) + LFD f28, 20 * SIZE(BO) + LFD f25, 21 * SIZE(AO) + LFD f29, 21 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 22 * SIZE(AO) + LFD f30, 22 * SIZE(BO) + LFD f27, 23 * SIZE(AO) + LFD f31, 23 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 24 * SIZE(AO) + LFD f20, 24 * SIZE(BO) + LFD f17, 25 * SIZE(AO) + LFD f21, 25 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 26 * SIZE(AO) + LFD f22, 26 * SIZE(BO) + LFD f19, 27 * SIZE(AO) + LFD f23, 27 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 28 * SIZE(AO) + LFD f28, 28 * SIZE(BO) + LFD f25, 29 * SIZE(AO) + LFD f29, 29 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 30 * SIZE(AO) + LFD f30, 30 * SIZE(BO) + LFD f27, 31 * SIZE(AO) + LFD f31, 31 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 32 * SIZE(AO) + LFD f20, 32 * SIZE(BO) + LFD f17, 33 * SIZE(AO) + LFD f21, 33 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 34 * SIZE(AO) + LFD f22, 34 * SIZE(BO) + LFD f19, 35 * SIZE(AO) + LFD f23, 35 * SIZE(BO) + + addi AO, AO, 32 * SIZE + addi BO, BO, 32 * SIZE + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + bdnz LL(32) + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + + bdnz LL(36) + .align 4 + +LL(38): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 2 + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f8, f18, f8 + FSUB f9, f19, f9 + + FSUB f2, f20, f2 + FSUB f3, f21, f3 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 + + FSUB f8, f20, f8 + FADD f9, f21, f9 + FSUB f10, f22, f10 + FADD f11, f23, f11 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FMADD f8, f19, f11, f8 + FNMSUB f9, f19, f10, f9 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + FNMSUB f8, f18, f10, f8 + FNMSUB f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f8, f20, f8, f12 + FMADD f9, f20, f9, f13 + +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FMSUB f8, f19, f11, f8 + FNMADD f9, f19, f10, f9 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + FNMADD f8, f18, f10, f8 + FNMADD f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f8, f20, f8, f12 + FMSUB f9, f20, f9, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FMADD f10, f19, f9, f10 + FNMSUB f11, f19, f8, f11 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + FNMSUB f10, f18, f8, f10 + FNMSUB f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 + FMSUB f10, f20, f10, f12 + FMADD f11, f20, f11, f13 + +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FMSUB f10, f19, f9, f10 + FNMADD f11, f19, f8, f11 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + FNMADD f10, f18, f8, f10 + FNMADD f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 + FMADD f10, f20, f10, f12 + FMSUB f11, f20, f11, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f8, f19, f1, f8 + FNMSUB f9, f19, f0, f9 + FMADD f10, f19, f3, f10 + FNMSUB f11, f19, f2, f11 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMSUB f8, f20, f8, f4 + FMADD f9, f20, f9, f5 + FMSUB f10, f20, f10, f6 + FMADD f11, f20, f11, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f8, f19, f1, f8 + FNMADD f9, f19, f0, f9 + FMSUB f10, f19, f3, f10 + FNMADD f11, f19, f2, f11 + + FNMADD f8, f18, f0, f8 + FNMADD f9, f18, f1, f9 + FNMADD f10, f18, f2, f10 + FNMADD f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMADD f8, f20, f8, f4 + FMSUB f9, f20, f9, f5 + FMADD f10, f20, f10, f6 + FMSUB f11, f20, f11, f7 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f9, f0 + FNMSUB f1, f19, f8, f1 + FMADD f2, f19, f11, f2 + FNMSUB f3, f19, f10, f3 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f9, f0 + FNMADD f1, f19, f8, f1 + FMSUB f2, f19, f11, f2 + FNMADD f3, f19, f10, f3 + + FNMADD f0, f18, f8, f0 + FNMADD f1, f18, f9, f1 + FNMADD f2, f18, f10, f2 + FNMADD f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f9, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f3, 5 * SIZE(BO) + STFD f10, 6 * SIZE(BO) + STFD f11, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f10, 6 * SIZE(AO) + STFD f11, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f8, 0 * SIZE(CO2) + STFD f9, 1 * SIZE(CO2) + STFD f10, 2 * SIZE(CO2) + STFD f11, 3 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(31) + .align 4 + +LL(49): +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + .align 4 + +LL(50): + andi. J, N, 1 + ble LL(999) + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, C, LDC +#endif + + andi. I, M, 1 + ble LL(60) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(65) + .align 4 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f17, f20, f2 + FMADD f3, f16, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f23, f5 + FMADD f6, f19, f22, f6 + FMADD f7, f18, f23, f7 + + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f17, f20, f2 + FMADD f3, f16, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f23, f5 + FMADD f6, f19, f22, f6 + FMADD f7, f18, f23, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR,r0 + ble LL(67) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f17, f20, f2 + FMADD f3, f16, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + + bdnz LL(66) + .align 4 + +LL(67): +#ifndef CONJ + FSUB f0, f0, f1 + FADD f1, f2, f3 +#else + FADD f0, f0, f1 + FSUB f1, f3, f2 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 1 + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(60): + srawi. I, M, 1 + ble LL(69) + .align 4 + +LL(51): +#if defined(LT) || defined(RN) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(55) + .align 4 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 8 * SIZE(AO) + LFD f21, 9 * SIZE(AO) + LFD f22, 10 * SIZE(AO) + LFD f23, 11 * SIZE(AO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f18, f26, f2 + FMADD f3, f18, f27, f3 + + FMADD f4, f19, f24, f4 + FMADD f5, f19, f25, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + LFD f16, 4 * SIZE(BO) + LFD f17, 5 * SIZE(BO) + LFD f18, 6 * SIZE(BO) + LFD f19, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 16 * SIZE(AO) + LFD f21, 17 * SIZE(AO) + LFD f22, 18 * SIZE(AO) + LFD f23, 19 * SIZE(AO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f18, f26, f2 + FMADD f3, f18, f27, f3 + + FMADD f4, f19, f24, f4 + FMADD f5, f19, f25, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f24, 20 * SIZE(AO) + LFD f25, 21 * SIZE(AO) + LFD f26, 22 * SIZE(AO) + LFD f27, 23 * SIZE(AO) + + LFD f16, 8 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 10 * SIZE(BO) + LFD f19, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbt PREA, AO + dcbt PREA, BO + bdnz LL(52) + .align 4 + +LL(55): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(57) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f16, 2 * SIZE(BO) + LFD f17, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(56) + .align 4 + +LL(57): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(51) + .align 4 + +LL(69): +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/ztrsm_kernel_power6_LT.S b/kernel/power/ztrsm_kernel_power6_LT.S new file mode 100644 index 0000000..b7c3441 --- /dev/null +++ b/kernel/power/ztrsm_kernel_power6_LT.S @@ -0,0 +1,4697 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define AORIG r19 +#define TEMP r20 +#define KK r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO1 r26 +#define CO2 r27 +#define CO3 r28 +#define CO4 r29 + +#define PREA r30 +#define PREC r31 + +#ifndef CONJ +#define FMA1 FMADD +#define FMA2 FMADD +#define FMA3 FNMSUB +#define FMA4 FMADD +#elif defined(LN) || defined(LT) +#define FMA1 FMADD +#define FMA2 FMADD +#define FMA3 FMADD +#define FMA4 FNMSUB +#else +#define FMA1 FMADD +#define FMA2 FNMSUB +#define FMA3 FMADD +#define FMA4 FMADD +#endif + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) +#endif + + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + li PREA, 48 * SIZE + li PREC, 4 * SIZE + + srawi. J, N, 2 + ble LL(30) + .align 4 + +LL(10): +#ifdef RT + slwi r0, K, 2 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + ble LL(20) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f20, 0 * SIZE(B) + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(B) + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(B) + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(B) + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 2 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f20, 0 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +LL(12): + dcbt AO, PREA + dcbtst BO, PREA + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 4 * SIZE(AO) + LFD f29, 5 * SIZE(AO) + LFD f30, 6 * SIZE(AO) + LFD f31, 7 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 24 * SIZE(BO) + LFD f21, 25 * SIZE(BO) + LFD f22, 26 * SIZE(BO) + LFD f23, 27 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 28 * SIZE(BO) + LFD f25, 29 * SIZE(BO) + LFD f26, 30 * SIZE(BO) + LFD f27, 31 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 32 * SIZE(BO) + LFD f21, 33 * SIZE(BO) + LFD f22, 34 * SIZE(BO) + LFD f23, 35 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 36 * SIZE(BO) + LFD f25, 37 * SIZE(BO) + LFD f26, 38 * SIZE(BO) + LFD f27, 39 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 20 * SIZE(AO) + LFD f29, 21 * SIZE(AO) + LFD f30, 22 * SIZE(AO) + LFD f31, 23 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 40 * SIZE(BO) + LFD f21, 41 * SIZE(BO) + LFD f22, 42 * SIZE(BO) + LFD f23, 43 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 44 * SIZE(BO) + LFD f25, 45 * SIZE(BO) + LFD f26, 46 * SIZE(BO) + LFD f27, 47 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 24 * SIZE(AO) + LFD f17, 25 * SIZE(AO) + LFD f18, 26 * SIZE(AO) + LFD f19, 27 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 48 * SIZE(BO) + LFD f21, 49 * SIZE(BO) + LFD f22, 50 * SIZE(BO) + LFD f23, 51 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 52 * SIZE(BO) + LFD f25, 53 * SIZE(BO) + LFD f26, 54 * SIZE(BO) + LFD f27, 55 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 28 * SIZE(AO) + LFD f29, 29 * SIZE(AO) + LFD f30, 30 * SIZE(AO) + LFD f31, 31 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 56 * SIZE(BO) + LFD f21, 57 * SIZE(BO) + LFD f22, 58 * SIZE(BO) + LFD f23, 59 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 60 * SIZE(BO) + LFD f25, 61 * SIZE(BO) + LFD f26, 62 * SIZE(BO) + LFD f27, 63 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 32 * SIZE(AO) + LFD f17, 33 * SIZE(AO) + LFD f18, 34 * SIZE(AO) + LFD f19, 35 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 64 * SIZE(BO) + LFD f21, 65 * SIZE(BO) + LFD f22, 66 * SIZE(BO) + LFD f23, 67 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 68 * SIZE(BO) + LFD f25, 69 * SIZE(BO) + LFD f26, 70 * SIZE(BO) + LFD f27, 71 * SIZE(BO) + + addi AO, AO, 32 * SIZE + addi BO, BO, 64 * SIZE + + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble LL(18) + .align 4 + +LL(16): + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + + bdnz LL(16) + .align 4 + +LL(18): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 2 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f18, f4 + FSUB f5, f19, f5 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f12, f22, f12 + FSUB f13, f23, f13 + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + FSUB f2, f24, f2 + FSUB f3, f25, f3 + FSUB f6, f26, f6 + FSUB f7, f27, f7 + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f10, f28, f10 + FSUB f11, f29, f11 + FSUB f14, f30, f14 + FSUB f15, f31, f15 + +#else + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f24, 6 * SIZE(AO) + LFD f25, 7 * SIZE(AO) + LFD f26, 4 * SIZE(AO) + LFD f27, 5 * SIZE(AO) + LFD f28, 0 * SIZE(AO) + LFD f29, 1 * SIZE(AO) + + FMUL f16, f25, f3 + FMUL f17, f25, f2 + FMUL f18, f25, f7 + FMUL f19, f25, f6 + + FMUL f20, f25, f11 + FMUL f21, f25, f10 + FMUL f22, f25, f15 + FMUL f23, f25, f14 + +#ifndef CONJ + + FMSUB f2, f24, f2, f16 + FMADD f3, f24, f3, f17 + FMSUB f6, f24, f6, f18 + FMADD f7, f24, f7, f19 + + FMSUB f10, f24, f10, f20 + FMADD f11, f24, f11, f21 + FMSUB f14, f24, f14, f22 + FMADD f15, f24, f15, f23 + + FMADD f0, f27, f3, f0 + FNMSUB f1, f27, f2, f1 + FMADD f4, f27, f7, f4 + FNMSUB f5, f27, f6, f5 + + FMADD f8, f27, f11, f8 + FNMSUB f9, f27, f10, f9 + FMADD f12, f27, f15, f12 + FNMSUB f13, f27, f14, f13 + + FNMSUB f0, f26, f2, f0 + FNMSUB f1, f26, f3, f1 + FNMSUB f4, f26, f6, f4 + FNMSUB f5, f26, f7, f5 + + FNMSUB f8, f26, f10, f8 + FNMSUB f9, f26, f11, f9 + FNMSUB f12, f26, f14, f12 + FNMSUB f13, f26, f15, f13 + + FMUL f16, f29, f1 + FMUL f17, f29, f0 + FMUL f18, f29, f5 + FMUL f19, f29, f4 + + FMUL f20, f29, f9 + FMUL f21, f29, f8 + FMUL f22, f29, f13 + FMUL f23, f29, f12 + + FMSUB f0, f28, f0, f16 + FMADD f1, f28, f1, f17 + FMSUB f4, f28, f4, f18 + FMADD f5, f28, f5, f19 + + FMSUB f8, f28, f8, f20 + FMADD f9, f28, f9, f21 + FMSUB f12, f28, f12, f22 + FMADD f13, f28, f13, f23 +#else + + FMADD f2, f24, f2, f16 + FMSUB f3, f24, f3, f17 + FMADD f6, f24, f6, f18 + FMSUB f7, f24, f7, f19 + + FMADD f10, f24, f10, f20 + FMSUB f11, f24, f11, f21 + FMADD f14, f24, f14, f22 + FMSUB f15, f24, f15, f23 + + FMSUB f0, f27, f3, f0 + FNMADD f1, f27, f2, f1 + FMSUB f4, f27, f7, f4 + FNMADD f5, f27, f6, f5 + + FMSUB f8, f27, f11, f8 + FNMADD f9, f27, f10, f9 + FMSUB f12, f27, f15, f12 + FNMADD f13, f27, f14, f13 + + FNMADD f0, f26, f2, f0 + FNMADD f1, f26, f3, f1 + FNMADD f4, f26, f6, f4 + FNMADD f5, f26, f7, f5 + + FNMADD f8, f26, f10, f8 + FNMADD f9, f26, f11, f9 + FNMADD f12, f26, f14, f12 + FNMADD f13, f26, f15, f13 + + FMUL f16, f29, f1 + FMUL f17, f29, f0 + FMUL f18, f29, f5 + FMUL f19, f29, f4 + + FMUL f20, f29, f9 + FMUL f21, f29, f8 + FMUL f22, f29, f13 + FMUL f23, f29, f12 + + FMADD f0, f28, f0, f16 + FMSUB f1, f28, f1, f17 + FMADD f4, f28, f4, f18 + FMSUB f5, f28, f5, f19 + + FMADD f8, f28, f8, f20 + FMSUB f9, f28, f9, f21 + FMADD f12, f28, f12, f22 + FMSUB f13, f28, f13, f23 +#endif +#endif + +#ifdef LT + LFD f24, 0 * SIZE(AO) + LFD f25, 1 * SIZE(AO) + LFD f26, 2 * SIZE(AO) + LFD f27, 3 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FMUL f16, f25, f1 + FMUL f17, f25, f0 + FMUL f18, f25, f5 + FMUL f19, f25, f4 + + FMUL f20, f25, f9 + FMUL f21, f25, f8 + FMUL f22, f25, f13 + FMUL f23, f25, f12 + +#ifndef CONJ + FMSUB f0, f24, f0, f16 + FMADD f1, f24, f1, f17 + FMSUB f4, f24, f4, f18 + FMADD f5, f24, f5, f19 + + FMSUB f8, f24, f8, f20 + FMADD f9, f24, f9, f21 + FMSUB f12, f24, f12, f22 + FMADD f13, f24, f13, f23 + + FMADD f2, f27, f1, f2 + FNMSUB f3, f27, f0, f3 + FMADD f6, f27, f5, f6 + FNMSUB f7, f27, f4, f7 + + FMADD f10, f27, f9, f10 + FNMSUB f11, f27, f8, f11 + FMADD f14, f27, f13, f14 + FNMSUB f15, f27, f12, f15 + + FNMSUB f2, f26, f0, f2 + FNMSUB f3, f26, f1, f3 + FNMSUB f6, f26, f4, f6 + FNMSUB f7, f26, f5, f7 + + FNMSUB f10, f26, f8, f10 + FNMSUB f11, f26, f9, f11 + FNMSUB f14, f26, f12, f14 + FNMSUB f15, f26, f13, f15 + + FMUL f16, f29, f3 + FMUL f17, f29, f2 + FMUL f18, f29, f7 + FMUL f19, f29, f6 + + FMUL f20, f29, f11 + FMUL f21, f29, f10 + FMUL f22, f29, f15 + FMUL f23, f29, f14 + + FMSUB f2, f28, f2, f16 + FMADD f3, f28, f3, f17 + FMSUB f6, f28, f6, f18 + FMADD f7, f28, f7, f19 + + FMSUB f10, f28, f10, f20 + FMADD f11, f28, f11, f21 + FMSUB f14, f28, f14, f22 + FMADD f15, f28, f15, f23 +#else + + FMADD f0, f24, f0, f16 + FMSUB f1, f24, f1, f17 + FMADD f4, f24, f4, f18 + FMSUB f5, f24, f5, f19 + + FMADD f8, f24, f8, f20 + FMSUB f9, f24, f9, f21 + FMADD f12, f24, f12, f22 + FMSUB f13, f24, f13, f23 + + FMSUB f2, f27, f1, f2 + FNMADD f3, f27, f0, f3 + FMSUB f6, f27, f5, f6 + FNMADD f7, f27, f4, f7 + + FMSUB f10, f27, f9, f10 + FNMADD f11, f27, f8, f11 + FMSUB f14, f27, f13, f14 + FNMADD f15, f27, f12, f15 + + FNMADD f2, f26, f0, f2 + FNMADD f3, f26, f1, f3 + FNMADD f6, f26, f4, f6 + FNMADD f7, f26, f5, f7 + + FNMADD f10, f26, f8, f10 + FNMADD f11, f26, f9, f11 + FNMADD f14, f26, f12, f14 + FNMADD f15, f26, f13, f15 + + FMUL f16, f29, f3 + FMUL f17, f29, f2 + FMUL f18, f29, f7 + FMUL f19, f29, f6 + + FMUL f20, f29, f11 + FMUL f21, f29, f10 + FMUL f22, f29, f15 + FMUL f23, f29, f14 + + FMADD f2, f28, f2, f16 + FMSUB f3, f28, f3, f17 + FMADD f6, f28, f6, f18 + FMSUB f7, f28, f7, f19 + + FMADD f10, f28, f10, f20 + FMSUB f11, f28, f11, f21 + FMADD f14, f28, f14, f22 + FMSUB f15, f28, f15, f23 +#endif +#endif + +#ifdef RN + LFD f24, 0 * SIZE(BO) + LFD f25, 1 * SIZE(BO) + LFD f26, 2 * SIZE(BO) + LFD f27, 3 * SIZE(BO) + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) + + FMUL f16, f25, f1 + FMUL f17, f25, f0 + FMUL f18, f25, f3 + FMUL f19, f25, f2 + +#ifndef CONJ + + FMSUB f0, f24, f0, f16 + FMADD f1, f24, f1, f17 + FMSUB f2, f24, f2, f18 + FMADD f3, f24, f3, f19 + + FMADD f4, f27, f1, f4 + FNMSUB f5, f27, f0, f5 + FMADD f6, f27, f3, f6 + FNMSUB f7, f27, f2, f7 + + FNMSUB f4, f26, f0, f4 + FNMSUB f5, f26, f1, f5 + FNMSUB f6, f26, f2, f6 + FNMSUB f7, f26, f3, f7 + + FMADD f8, f29, f1, f8 + FNMSUB f9, f29, f0, f9 + FMADD f10, f29, f3, f10 + FNMSUB f11, f29, f2, f11 + + FNMSUB f8, f28, f0, f8 + FNMSUB f9, f28, f1, f9 + FNMSUB f10, f28, f2, f10 + FNMSUB f11, f28, f3, f11 + + FMADD f12, f31, f1, f12 + FNMSUB f13, f31, f0, f13 + FMADD f14, f31, f3, f14 + FNMSUB f15, f31, f2, f15 + + FNMSUB f12, f30, f0, f12 + FNMSUB f13, f30, f1, f13 + FNMSUB f14, f30, f2, f14 + FNMSUB f15, f30, f3, f15 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMUL f18, f27, f7 + FMUL f19, f27, f6 + + FMSUB f4, f26, f4, f16 + FMADD f5, f26, f5, f17 + FMSUB f6, f26, f6, f18 + FMADD f7, f26, f7, f19 + + FMADD f8, f29, f5, f8 + FNMSUB f9, f29, f4, f9 + FMADD f10, f29, f7, f10 + FNMSUB f11, f29, f6, f11 + + FNMSUB f8, f28, f4, f8 + FNMSUB f9, f28, f5, f9 + FNMSUB f10, f28, f6, f10 + FNMSUB f11, f28, f7, f11 + + FMADD f12, f31, f5, f12 + FNMSUB f13, f31, f4, f13 + FMADD f14, f31, f7, f14 + FNMSUB f15, f31, f6, f15 + + FNMSUB f12, f30, f4, f12 + FNMSUB f13, f30, f5, f13 + FNMSUB f14, f30, f6, f14 + FNMSUB f15, f30, f7, f15 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 22 * SIZE(BO) + LFD f29, 23 * SIZE(BO) + LFD f30, 30 * SIZE(BO) + LFD f31, 31 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMUL f18, f27, f11 + FMUL f19, f27, f10 + + FMSUB f8, f26, f8, f16 + FMADD f9, f26, f9, f17 + FMSUB f10, f26, f10, f18 + FMADD f11, f26, f11, f19 + + FMADD f12, f29, f9, f12 + FNMSUB f13, f29, f8, f13 + FMADD f14, f29, f11, f14 + FNMSUB f15, f29, f10, f15 + + FNMSUB f12, f28, f8, f12 + FNMSUB f13, f28, f9, f13 + FNMSUB f14, f28, f10, f14 + FNMSUB f15, f28, f11, f15 + + FMUL f16, f31, f13 + FMUL f17, f31, f12 + FMUL f18, f31, f15 + FMUL f19, f31, f14 + + FMSUB f12, f30, f12, f16 + FMADD f13, f30, f13, f17 + FMSUB f14, f30, f14, f18 + FMADD f15, f30, f15, f19 + +#else + + FMADD f0, f24, f0, f16 + FMSUB f1, f24, f1, f17 + FMADD f2, f24, f2, f18 + FMSUB f3, f24, f3, f19 + + FMSUB f4, f27, f1, f4 + FNMADD f5, f27, f0, f5 + FMSUB f6, f27, f3, f6 + FNMADD f7, f27, f2, f7 + + FNMADD f4, f26, f0, f4 + FNMADD f5, f26, f1, f5 + FNMADD f6, f26, f2, f6 + FNMADD f7, f26, f3, f7 + + FMSUB f8, f29, f1, f8 + FNMADD f9, f29, f0, f9 + FMSUB f10, f29, f3, f10 + FNMADD f11, f29, f2, f11 + + FNMADD f8, f28, f0, f8 + FNMADD f9, f28, f1, f9 + FNMADD f10, f28, f2, f10 + FNMADD f11, f28, f3, f11 + + FMSUB f12, f31, f1, f12 + FNMADD f13, f31, f0, f13 + FMSUB f14, f31, f3, f14 + FNMADD f15, f31, f2, f15 + + FNMADD f12, f30, f0, f12 + FNMADD f13, f30, f1, f13 + FNMADD f14, f30, f2, f14 + FNMADD f15, f30, f3, f15 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMUL f18, f27, f7 + FMUL f19, f27, f6 + + FMADD f4, f26, f4, f16 + FMSUB f5, f26, f5, f17 + FMADD f6, f26, f6, f18 + FMSUB f7, f26, f7, f19 + + FMSUB f8, f29, f5, f8 + FNMADD f9, f29, f4, f9 + FMSUB f10, f29, f7, f10 + FNMADD f11, f29, f6, f11 + + FNMADD f8, f28, f4, f8 + FNMADD f9, f28, f5, f9 + FNMADD f10, f28, f6, f10 + FNMADD f11, f28, f7, f11 + + FMSUB f12, f31, f5, f12 + FNMADD f13, f31, f4, f13 + FMSUB f14, f31, f7, f14 + FNMADD f15, f31, f6, f15 + + FNMADD f12, f30, f4, f12 + FNMADD f13, f30, f5, f13 + FNMADD f14, f30, f6, f14 + FNMADD f15, f30, f7, f15 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 22 * SIZE(BO) + LFD f29, 23 * SIZE(BO) + LFD f30, 30 * SIZE(BO) + LFD f31, 31 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMUL f18, f27, f11 + FMUL f19, f27, f10 + + FMADD f8, f26, f8, f16 + FMSUB f9, f26, f9, f17 + FMADD f10, f26, f10, f18 + FMSUB f11, f26, f11, f19 + + FMSUB f12, f29, f9, f12 + FNMADD f13, f29, f8, f13 + FMSUB f14, f29, f11, f14 + FNMADD f15, f29, f10, f15 + + FNMADD f12, f28, f8, f12 + FNMADD f13, f28, f9, f13 + FNMADD f14, f28, f10, f14 + FNMADD f15, f28, f11, f15 + + FMUL f16, f31, f13 + FMUL f17, f31, f12 + FMUL f18, f31, f15 + FMUL f19, f31, f14 + + FMADD f12, f30, f12, f16 + FMSUB f13, f30, f13, f17 + FMADD f14, f30, f14, f18 + FMSUB f15, f30, f15, f19 +#endif + +#endif + +#ifdef RT + LFD f24, 30 * SIZE(BO) + LFD f25, 31 * SIZE(BO) + LFD f26, 28 * SIZE(BO) + LFD f27, 29 * SIZE(BO) + LFD f28, 26 * SIZE(BO) + LFD f29, 27 * SIZE(BO) + LFD f30, 24 * SIZE(BO) + LFD f31, 25 * SIZE(BO) + + FMUL f16, f25, f13 + FMUL f17, f25, f12 + FMUL f18, f25, f15 + FMUL f19, f25, f14 + +#ifndef CONJ + + FMSUB f12, f24, f12, f16 + FMADD f13, f24, f13, f17 + FMSUB f14, f24, f14, f18 + FMADD f15, f24, f15, f19 + + FMADD f8, f27, f13, f8 + FNMSUB f9, f27, f12, f9 + FMADD f10, f27, f15, f10 + FNMSUB f11, f27, f14, f11 + + FNMSUB f8, f26, f12, f8 + FNMSUB f9, f26, f13, f9 + FNMSUB f10, f26, f14, f10 + FNMSUB f11, f26, f15, f11 + + FMADD f4, f29, f13, f4 + FNMSUB f5, f29, f12, f5 + FMADD f6, f29, f15, f6 + FNMSUB f7, f29, f14, f7 + + FNMSUB f4, f28, f12, f4 + FNMSUB f5, f28, f13, f5 + FNMSUB f6, f28, f14, f6 + FNMSUB f7, f28, f15, f7 + + FMADD f0, f31, f13, f0 + FNMSUB f1, f31, f12, f1 + FMADD f2, f31, f15, f2 + FNMSUB f3, f31, f14, f3 + + FNMSUB f0, f30, f12, f0 + FNMSUB f1, f30, f13, f1 + FNMSUB f2, f30, f14, f2 + FNMSUB f3, f30, f15, f3 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 18 * SIZE(BO) + LFD f29, 19 * SIZE(BO) + LFD f30, 16 * SIZE(BO) + LFD f31, 17 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMUL f18, f27, f11 + FMUL f19, f27, f10 + + FMSUB f8, f26, f8, f16 + FMADD f9, f26, f9, f17 + FMSUB f10, f26, f10, f18 + FMADD f11, f26, f11, f19 + + FMADD f4, f29, f9, f4 + FNMSUB f5, f29, f8, f5 + FMADD f6, f29, f11, f6 + FNMSUB f7, f29, f10, f7 + + FNMSUB f4, f28, f8, f4 + FNMSUB f5, f28, f9, f5 + FNMSUB f6, f28, f10, f6 + FNMSUB f7, f28, f11, f7 + + FMADD f0, f31, f9, f0 + FNMSUB f1, f31, f8, f1 + FMADD f2, f31, f11, f2 + FNMSUB f3, f31, f10, f3 + + FNMSUB f0, f30, f8, f0 + FNMSUB f1, f30, f9, f1 + FNMSUB f2, f30, f10, f2 + FNMSUB f3, f30, f11, f3 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 8 * SIZE(BO) + LFD f29, 9 * SIZE(BO) + LFD f30, 0 * SIZE(BO) + LFD f31, 1 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMUL f18, f27, f7 + FMUL f19, f27, f6 + + FMSUB f4, f26, f4, f16 + FMADD f5, f26, f5, f17 + FMSUB f6, f26, f6, f18 + FMADD f7, f26, f7, f19 + + FMADD f0, f29, f5, f0 + FNMSUB f1, f29, f4, f1 + FMADD f2, f29, f7, f2 + FNMSUB f3, f29, f6, f3 + + FNMSUB f0, f28, f4, f0 + FNMSUB f1, f28, f5, f1 + FNMSUB f2, f28, f6, f2 + FNMSUB f3, f28, f7, f3 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMUL f18, f31, f3 + FMUL f19, f31, f2 + + FMSUB f0, f30, f0, f16 + FMADD f1, f30, f1, f17 + FMSUB f2, f30, f2, f18 + FMADD f3, f30, f3, f19 + +#else + + FMADD f12, f24, f12, f16 + FMSUB f13, f24, f13, f17 + FMADD f14, f24, f14, f18 + FMSUB f15, f24, f15, f19 + + FMSUB f8, f27, f13, f8 + FNMADD f9, f27, f12, f9 + FMSUB f10, f27, f15, f10 + FNMADD f11, f27, f14, f11 + + FNMADD f8, f26, f12, f8 + FNMADD f9, f26, f13, f9 + FNMADD f10, f26, f14, f10 + FNMADD f11, f26, f15, f11 + + FMSUB f4, f29, f13, f4 + FNMADD f5, f29, f12, f5 + FMSUB f6, f29, f15, f6 + FNMADD f7, f29, f14, f7 + + FNMADD f4, f28, f12, f4 + FNMADD f5, f28, f13, f5 + FNMADD f6, f28, f14, f6 + FNMADD f7, f28, f15, f7 + + FMSUB f0, f31, f13, f0 + FNMADD f1, f31, f12, f1 + FMSUB f2, f31, f15, f2 + FNMADD f3, f31, f14, f3 + + FNMADD f0, f30, f12, f0 + FNMADD f1, f30, f13, f1 + FNMADD f2, f30, f14, f2 + FNMADD f3, f30, f15, f3 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 18 * SIZE(BO) + LFD f29, 19 * SIZE(BO) + LFD f30, 16 * SIZE(BO) + LFD f31, 17 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMUL f18, f27, f11 + FMUL f19, f27, f10 + + FMADD f8, f26, f8, f16 + FMSUB f9, f26, f9, f17 + FMADD f10, f26, f10, f18 + FMSUB f11, f26, f11, f19 + + FMSUB f4, f29, f9, f4 + FNMADD f5, f29, f8, f5 + FMSUB f6, f29, f11, f6 + FNMADD f7, f29, f10, f7 + + FNMADD f4, f28, f8, f4 + FNMADD f5, f28, f9, f5 + FNMADD f6, f28, f10, f6 + FNMADD f7, f28, f11, f7 + + FMSUB f0, f31, f9, f0 + FNMADD f1, f31, f8, f1 + FMSUB f2, f31, f11, f2 + FNMADD f3, f31, f10, f3 + + FNMADD f0, f30, f8, f0 + FNMADD f1, f30, f9, f1 + FNMADD f2, f30, f10, f2 + FNMADD f3, f30, f11, f3 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 8 * SIZE(BO) + LFD f29, 9 * SIZE(BO) + LFD f30, 0 * SIZE(BO) + LFD f31, 1 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMUL f18, f27, f7 + FMUL f19, f27, f6 + + FMADD f4, f26, f4, f16 + FMSUB f5, f26, f5, f17 + FMADD f6, f26, f6, f18 + FMSUB f7, f26, f7, f19 + + FMSUB f0, f29, f5, f0 + FNMADD f1, f29, f4, f1 + FMSUB f2, f29, f7, f2 + FNMADD f3, f29, f6, f3 + + FNMADD f0, f28, f4, f0 + FNMADD f1, f28, f5, f1 + FNMADD f2, f28, f6, f2 + FNMADD f3, f28, f7, f3 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMUL f18, f31, f3 + FMUL f19, f31, f2 + + FMADD f0, f30, f0, f16 + FMSUB f1, f30, f1, f17 + FMADD f2, f30, f2, f18 + FMSUB f3, f30, f3, f19 + +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f4, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + STFD f8, 4 * SIZE(BO) + STFD f9, 5 * SIZE(BO) + STFD f12, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f3, 9 * SIZE(BO) + STFD f6, 10 * SIZE(BO) + STFD f7, 11 * SIZE(BO) + STFD f10, 12 * SIZE(BO) + STFD f11, 13 * SIZE(BO) + STFD f14, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) + +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 2 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(11) + .align 4 + +LL(20): + andi. I, M, 1 + ble LL(29) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 2 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 4 + +LL(22): + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + LFD f28, 4 * SIZE(AO) + LFD f29, 5 * SIZE(AO) + LFD f30, 6 * SIZE(AO) + LFD f31, 7 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA4 f7, f17, f22, f7 + FMA2 f5, f16, f23, f5 + FMA3 f6, f17, f23, f6 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA1 f8, f16, f24, f8 + FMA4 f11, f17, f24, f11 + FMA2 f9, f16, f25, f9 + FMA3 f10, f17, f25, f10 + + FMA1 f12, f16, f26, f12 + FMA4 f15, f17, f26, f15 + FMA2 f13, f16, f27, f13 + FMA3 f14, f17, f27, f14 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMA1 f0, f18, f20, f0 + FMA4 f3, f19, f20, f3 + FMA2 f1, f18, f21, f1 + FMA3 f2, f19, f21, f2 + + FMA1 f4, f18, f22, f4 + FMA4 f7, f19, f22, f7 + FMA2 f5, f18, f23, f5 + FMA3 f6, f19, f23, f6 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMA1 f8, f18, f24, f8 + FMA4 f11, f19, f24, f11 + FMA2 f9, f18, f25, f9 + FMA3 f10, f19, f25, f10 + + FMA1 f12, f18, f26, f12 + FMA4 f15, f19, f26, f15 + FMA2 f13, f18, f27, f13 + FMA3 f14, f19, f27, f14 + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA4 f3, f29, f20, f3 + FMA2 f1, f28, f21, f1 + FMA3 f2, f29, f21, f2 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA4 f7, f29, f22, f7 + FMA2 f5, f28, f23, f5 + FMA3 f6, f29, f23, f6 + + LFD f20, 24 * SIZE(BO) + LFD f21, 25 * SIZE(BO) + LFD f22, 26 * SIZE(BO) + LFD f23, 27 * SIZE(BO) + + FMA1 f8, f28, f24, f8 + FMA4 f11, f29, f24, f11 + FMA2 f9, f28, f25, f9 + FMA3 f10, f29, f25, f10 + + FMA1 f12, f28, f26, f12 + FMA4 f15, f29, f26, f15 + FMA2 f13, f28, f27, f13 + FMA3 f14, f29, f27, f14 + + LFD f24, 28 * SIZE(BO) + LFD f25, 29 * SIZE(BO) + LFD f26, 30 * SIZE(BO) + LFD f27, 31 * SIZE(BO) + + FMA1 f0, f30, f20, f0 + FMA4 f3, f31, f20, f3 + FMA2 f1, f30, f21, f1 + FMA3 f2, f31, f21, f2 + + FMA1 f4, f30, f22, f4 + FMA4 f7, f31, f22, f7 + FMA2 f5, f30, f23, f5 + FMA3 f6, f31, f23, f6 + + LFD f20, 32 * SIZE(BO) + LFD f21, 33 * SIZE(BO) + LFD f22, 34 * SIZE(BO) + LFD f23, 35 * SIZE(BO) + + FMA1 f8, f30, f24, f8 + FMA4 f11, f31, f24, f11 + FMA2 f9, f30, f25, f9 + FMA3 f10, f31, f25, f10 + + FMA1 f12, f30, f26, f12 + FMA4 f15, f31, f26, f15 + FMA2 f13, f30, f27, f13 + FMA3 f14, f31, f27, f14 + + LFD f24, 36 * SIZE(BO) + LFD f25, 37 * SIZE(BO) + LFD f26, 38 * SIZE(BO) + LFD f27, 39 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 32 * SIZE + bdnz LL(22) + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(27) + .align 4 + +LL(26): + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + FMA1 f4, f16, f22, f4 + FMA4 f7, f17, f22, f7 + FMA2 f5, f16, f23, f5 + FMA3 f6, f17, f23, f6 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA1 f8, f16, f24, f8 + FMA4 f11, f17, f24, f11 + FMA2 f9, f16, f25, f9 + FMA3 f10, f17, f25, f10 + + FMA1 f12, f16, f26, f12 + FMA4 f15, f17, f26, f15 + FMA2 f13, f16, f27, f13 + FMA3 f14, f17, f27, f14 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(26) + .align 4 + +LL(27): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 2 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + FADD f0, f0, f2 + FADD f1, f1, f3 + FADD f4, f4, f6 + FADD f5, f5, f7 + + FADD f8, f8, f10 + FADD f9, f9, f11 + FADD f12, f12, f14 + FADD f13, f13, f15 + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f18, f4 + FSUB f5, f19, f5 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f12, f22, f12 + FSUB f13, f23, f13 + +#else + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f28, 0 * SIZE(AO) + LFD f29, 1 * SIZE(AO) + + FMUL f16, f29, f1 + FMUL f17, f29, f0 + FMUL f18, f29, f5 + FMUL f19, f29, f4 + + FMUL f20, f29, f9 + FMUL f21, f29, f8 + FMUL f22, f29, f13 + FMUL f23, f29, f12 + +#ifndef CONJ + FMSUB f0, f28, f0, f16 + FMADD f1, f28, f1, f17 + FMSUB f4, f28, f4, f18 + FMADD f5, f28, f5, f19 + + FMSUB f8, f28, f8, f20 + FMADD f9, f28, f9, f21 + FMSUB f12, f28, f12, f22 + FMADD f13, f28, f13, f23 +#else + + FMADD f0, f28, f0, f16 + FMSUB f1, f28, f1, f17 + FMADD f4, f28, f4, f18 + FMSUB f5, f28, f5, f19 + + FMADD f8, f28, f8, f20 + FMSUB f9, f28, f9, f21 + FMADD f12, f28, f12, f22 + FMSUB f13, f28, f13, f23 +#endif +#endif + +#ifdef LT + LFD f24, 0 * SIZE(AO) + LFD f25, 1 * SIZE(AO) + + FMUL f16, f25, f1 + FMUL f17, f25, f0 + FMUL f18, f25, f5 + FMUL f19, f25, f4 + + FMUL f20, f25, f9 + FMUL f21, f25, f8 + FMUL f22, f25, f13 + FMUL f23, f25, f12 + +#ifndef CONJ + + FMSUB f0, f24, f0, f16 + FMADD f1, f24, f1, f17 + FMSUB f4, f24, f4, f18 + FMADD f5, f24, f5, f19 + + FMSUB f8, f24, f8, f20 + FMADD f9, f24, f9, f21 + FMSUB f12, f24, f12, f22 + FMADD f13, f24, f13, f23 + +#else + + + FMADD f0, f24, f0, f16 + FMSUB f1, f24, f1, f17 + FMADD f4, f24, f4, f18 + FMSUB f5, f24, f5, f19 + + FMADD f8, f24, f8, f20 + FMSUB f9, f24, f9, f21 + FMADD f12, f24, f12, f22 + FMSUB f13, f24, f13, f23 + +#endif +#endif + +#ifdef RN + LFD f24, 0 * SIZE(BO) + LFD f25, 1 * SIZE(BO) + LFD f26, 2 * SIZE(BO) + LFD f27, 3 * SIZE(BO) + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) + + FMUL f16, f25, f1 + FMUL f17, f25, f0 + +#ifndef CONJ + + FMSUB f0, f24, f0, f16 + FMADD f1, f24, f1, f17 + + FMADD f4, f27, f1, f4 + FNMSUB f5, f27, f0, f5 + FNMSUB f4, f26, f0, f4 + FNMSUB f5, f26, f1, f5 + + FMADD f8, f29, f1, f8 + FNMSUB f9, f29, f0, f9 + FNMSUB f8, f28, f0, f8 + FNMSUB f9, f28, f1, f9 + + FMADD f12, f31, f1, f12 + FNMSUB f13, f31, f0, f13 + FNMSUB f12, f30, f0, f12 + FNMSUB f13, f30, f1, f13 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMSUB f4, f26, f4, f16 + FMADD f5, f26, f5, f17 + + FMADD f8, f29, f5, f8 + FNMSUB f9, f29, f4, f9 + FNMSUB f8, f28, f4, f8 + FNMSUB f9, f28, f5, f9 + + FMADD f12, f31, f5, f12 + FNMSUB f13, f31, f4, f13 + FNMSUB f12, f30, f4, f12 + FNMSUB f13, f30, f5, f13 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 22 * SIZE(BO) + LFD f29, 23 * SIZE(BO) + LFD f30, 30 * SIZE(BO) + LFD f31, 31 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMSUB f8, f26, f8, f16 + FMADD f9, f26, f9, f17 + + FMADD f12, f29, f9, f12 + FNMSUB f13, f29, f8, f13 + FNMSUB f12, f28, f8, f12 + FNMSUB f13, f28, f9, f13 + + FMUL f16, f31, f13 + FMUL f17, f31, f12 + FMSUB f12, f30, f12, f16 + FMADD f13, f30, f13, f17 + +#else + + FMADD f0, f24, f0, f16 + FMSUB f1, f24, f1, f17 + + FMSUB f4, f27, f1, f4 + FNMADD f5, f27, f0, f5 + FNMADD f4, f26, f0, f4 + FNMADD f5, f26, f1, f5 + + FMSUB f8, f29, f1, f8 + FNMADD f9, f29, f0, f9 + FNMADD f8, f28, f0, f8 + FNMADD f9, f28, f1, f9 + + FMSUB f12, f31, f1, f12 + FNMADD f13, f31, f0, f13 + FNMADD f12, f30, f0, f12 + FNMADD f13, f30, f1, f13 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMADD f4, f26, f4, f16 + FMSUB f5, f26, f5, f17 + + FMSUB f8, f29, f5, f8 + FNMADD f9, f29, f4, f9 + FNMADD f8, f28, f4, f8 + FNMADD f9, f28, f5, f9 + + FMSUB f12, f31, f5, f12 + FNMADD f13, f31, f4, f13 + FNMADD f12, f30, f4, f12 + FNMADD f13, f30, f5, f13 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 22 * SIZE(BO) + LFD f29, 23 * SIZE(BO) + LFD f30, 30 * SIZE(BO) + LFD f31, 31 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMADD f8, f26, f8, f16 + FMSUB f9, f26, f9, f17 + + FMSUB f12, f29, f9, f12 + FNMADD f13, f29, f8, f13 + FNMADD f12, f28, f8, f12 + FNMADD f13, f28, f9, f13 + + FMUL f16, f31, f13 + FMUL f17, f31, f12 + FMADD f12, f30, f12, f16 + FMSUB f13, f30, f13, f17 +#endif + +#endif + +#ifdef RT + LFD f24, 30 * SIZE(BO) + LFD f25, 31 * SIZE(BO) + LFD f26, 28 * SIZE(BO) + LFD f27, 29 * SIZE(BO) + LFD f28, 26 * SIZE(BO) + LFD f29, 27 * SIZE(BO) + LFD f30, 24 * SIZE(BO) + LFD f31, 25 * SIZE(BO) + + FMUL f16, f25, f13 + FMUL f17, f25, f12 + +#ifndef CONJ + + FMSUB f12, f24, f12, f16 + FMADD f13, f24, f13, f17 + + FMADD f8, f27, f13, f8 + FNMSUB f9, f27, f12, f9 + FNMSUB f8, f26, f12, f8 + FNMSUB f9, f26, f13, f9 + + FMADD f4, f29, f13, f4 + FNMSUB f5, f29, f12, f5 + FNMSUB f4, f28, f12, f4 + FNMSUB f5, f28, f13, f5 + + FMADD f0, f31, f13, f0 + FNMSUB f1, f31, f12, f1 + FNMSUB f0, f30, f12, f0 + FNMSUB f1, f30, f13, f1 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 18 * SIZE(BO) + LFD f29, 19 * SIZE(BO) + LFD f30, 16 * SIZE(BO) + LFD f31, 17 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMSUB f8, f26, f8, f16 + FMADD f9, f26, f9, f17 + + FMADD f4, f29, f9, f4 + FNMSUB f5, f29, f8, f5 + FNMSUB f4, f28, f8, f4 + FNMSUB f5, f28, f9, f5 + + FMADD f0, f31, f9, f0 + FNMSUB f1, f31, f8, f1 + FNMSUB f0, f30, f8, f0 + FNMSUB f1, f30, f9, f1 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 8 * SIZE(BO) + LFD f29, 9 * SIZE(BO) + LFD f30, 0 * SIZE(BO) + LFD f31, 1 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMSUB f4, f26, f4, f16 + FMADD f5, f26, f5, f17 + + FMADD f0, f29, f5, f0 + FNMSUB f1, f29, f4, f1 + FNMSUB f0, f28, f4, f0 + FNMSUB f1, f28, f5, f1 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMSUB f0, f30, f0, f16 + FMADD f1, f30, f1, f17 + +#else + FMADD f12, f24, f12, f16 + FMSUB f13, f24, f13, f17 + + FMSUB f8, f27, f13, f8 + FNMADD f9, f27, f12, f9 + FNMADD f8, f26, f12, f8 + FNMADD f9, f26, f13, f9 + + FMSUB f4, f29, f13, f4 + FNMADD f5, f29, f12, f5 + FNMADD f4, f28, f12, f4 + FNMADD f5, f28, f13, f5 + + FMSUB f0, f31, f13, f0 + FNMADD f1, f31, f12, f1 + FNMADD f0, f30, f12, f0 + FNMADD f1, f30, f13, f1 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 18 * SIZE(BO) + LFD f29, 19 * SIZE(BO) + LFD f30, 16 * SIZE(BO) + LFD f31, 17 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMADD f8, f26, f8, f16 + FMSUB f9, f26, f9, f17 + + FMSUB f4, f29, f9, f4 + FNMADD f5, f29, f8, f5 + FNMADD f4, f28, f8, f4 + FNMADD f5, f28, f9, f5 + + FMSUB f0, f31, f9, f0 + FNMADD f1, f31, f8, f1 + FNMADD f0, f30, f8, f0 + FNMADD f1, f30, f9, f1 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 8 * SIZE(BO) + LFD f29, 9 * SIZE(BO) + LFD f30, 0 * SIZE(BO) + LFD f31, 1 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMADD f4, f26, f4, f16 + FMSUB f5, f26, f5, f17 + + FMSUB f0, f29, f5, f0 + FNMADD f1, f29, f4, f1 + FNMADD f0, f28, f4, f0 + FNMADD f1, f28, f5, f1 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMADD f0, f30, f0, f16 + FMSUB f1, f30, f1, f17 + +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f4, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + STFD f8, 4 * SIZE(BO) + STFD f9, 5 * SIZE(BO) + STFD f12, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 2 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(29): +#ifdef LN + slwi r0, K, 2 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(30): + andi. J, N, 2 + ble LL(50) + .align 4 + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble LL(40) + .align 4 + +LL(31): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbtst CO1, PREC + dcbtst CO2, PREC + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbtst CO1, PREC + dcbtst CO2, PREC + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(35) + .align 4 + +LL(32): + dcbt AO, PREA + dcbtst BO, PREA + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 4 * SIZE(AO) + LFD f28, 4 * SIZE(BO) + LFD f25, 5 * SIZE(AO) + LFD f29, 5 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 6 * SIZE(AO) + LFD f30, 6 * SIZE(BO) + LFD f27, 7 * SIZE(AO) + LFD f31, 7 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 8 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f17, 9 * SIZE(AO) + LFD f21, 9 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 10 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f19, 11 * SIZE(AO) + LFD f23, 11 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 12 * SIZE(AO) + LFD f28, 12 * SIZE(BO) + LFD f25, 13 * SIZE(AO) + LFD f29, 13 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 14 * SIZE(AO) + LFD f30, 14 * SIZE(BO) + LFD f27, 15 * SIZE(AO) + LFD f31, 15 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 16 * SIZE(AO) + LFD f20, 16 * SIZE(BO) + LFD f17, 17 * SIZE(AO) + LFD f21, 17 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 18 * SIZE(AO) + LFD f22, 18 * SIZE(BO) + LFD f19, 19 * SIZE(AO) + LFD f23, 19 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 20 * SIZE(AO) + LFD f28, 20 * SIZE(BO) + LFD f25, 21 * SIZE(AO) + LFD f29, 21 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 22 * SIZE(AO) + LFD f30, 22 * SIZE(BO) + LFD f27, 23 * SIZE(AO) + LFD f31, 23 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 24 * SIZE(AO) + LFD f20, 24 * SIZE(BO) + LFD f17, 25 * SIZE(AO) + LFD f21, 25 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 26 * SIZE(AO) + LFD f22, 26 * SIZE(BO) + LFD f19, 27 * SIZE(AO) + LFD f23, 27 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 28 * SIZE(AO) + LFD f28, 28 * SIZE(BO) + LFD f25, 29 * SIZE(AO) + LFD f29, 29 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 30 * SIZE(AO) + LFD f30, 30 * SIZE(BO) + LFD f27, 31 * SIZE(AO) + LFD f31, 31 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 32 * SIZE(AO) + LFD f20, 32 * SIZE(BO) + LFD f17, 33 * SIZE(AO) + LFD f21, 33 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 34 * SIZE(AO) + LFD f22, 34 * SIZE(BO) + LFD f19, 35 * SIZE(AO) + LFD f23, 35 * SIZE(BO) + + addi AO, AO, 32 * SIZE + addi BO, BO, 32 * SIZE + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + bdnz LL(32) + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + + bdnz LL(36) + .align 4 + +LL(38): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 2 + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f8, f18, f8 + FSUB f9, f19, f9 + + FSUB f2, f20, f2 + FSUB f3, f21, f3 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 + + FSUB f8, f20, f8 + FADD f9, f21, f9 + FSUB f10, f22, f10 + FADD f11, f23, f11 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FMADD f8, f19, f11, f8 + FNMSUB f9, f19, f10, f9 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + FNMSUB f8, f18, f10, f8 + FNMSUB f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f8, f20, f8, f12 + FMADD f9, f20, f9, f13 + +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FMSUB f8, f19, f11, f8 + FNMADD f9, f19, f10, f9 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + FNMADD f8, f18, f10, f8 + FNMADD f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f8, f20, f8, f12 + FMSUB f9, f20, f9, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FMADD f10, f19, f9, f10 + FNMSUB f11, f19, f8, f11 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + FNMSUB f10, f18, f8, f10 + FNMSUB f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 + FMSUB f10, f20, f10, f12 + FMADD f11, f20, f11, f13 + +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FMSUB f10, f19, f9, f10 + FNMADD f11, f19, f8, f11 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + FNMADD f10, f18, f8, f10 + FNMADD f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 + FMADD f10, f20, f10, f12 + FMSUB f11, f20, f11, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f8, f19, f1, f8 + FNMSUB f9, f19, f0, f9 + FMADD f10, f19, f3, f10 + FNMSUB f11, f19, f2, f11 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMSUB f8, f20, f8, f4 + FMADD f9, f20, f9, f5 + FMSUB f10, f20, f10, f6 + FMADD f11, f20, f11, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f8, f19, f1, f8 + FNMADD f9, f19, f0, f9 + FMSUB f10, f19, f3, f10 + FNMADD f11, f19, f2, f11 + + FNMADD f8, f18, f0, f8 + FNMADD f9, f18, f1, f9 + FNMADD f10, f18, f2, f10 + FNMADD f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMADD f8, f20, f8, f4 + FMSUB f9, f20, f9, f5 + FMADD f10, f20, f10, f6 + FMSUB f11, f20, f11, f7 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f9, f0 + FNMSUB f1, f19, f8, f1 + FMADD f2, f19, f11, f2 + FNMSUB f3, f19, f10, f3 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f9, f0 + FNMADD f1, f19, f8, f1 + FMSUB f2, f19, f11, f2 + FNMADD f3, f19, f10, f3 + + FNMADD f0, f18, f8, f0 + FNMADD f1, f18, f9, f1 + FNMADD f2, f18, f10, f2 + FNMADD f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f9, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f3, 5 * SIZE(BO) + STFD f10, 6 * SIZE(BO) + STFD f11, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f10, 6 * SIZE(AO) + STFD f11, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f8, 0 * SIZE(CO2) + STFD f9, 1 * SIZE(CO2) + STFD f10, 2 * SIZE(CO2) + STFD f11, 3 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(31) + .align 4 + +LL(40): + andi. I, M, 1 + ble LL(49) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 4 + +LL(42): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f18, f26, f2 + FMADD f3, f18, f27, f3 + + FMADD f4, f19, f24, f4 + FMADD f5, f19, f25, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f18, f26, f2 + FMADD f3, f18, f27, f3 + + FMADD f4, f19, f24, f4 + FMADD f5, f19, f25, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi BO, BO, 16 * SIZE + addi AO, AO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(47) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(47): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else +#if defined(LN) || defined(LT) + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f3 + FMUL f13, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f12 + FMADD f3, f20, f3, f13 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f12 + FMSUB f3, f20, f3, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(49): +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + .align 4 + +LL(50): + andi. J, N, 1 + ble LL(999) + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, C, LDC +#endif + ble LL(60) + .align 4 + +LL(51): +#if defined(LT) || defined(RN) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(55) + .align 4 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 8 * SIZE(AO) + LFD f21, 9 * SIZE(AO) + LFD f22, 10 * SIZE(AO) + LFD f23, 11 * SIZE(AO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f18, f26, f2 + FMADD f3, f18, f27, f3 + + FMADD f4, f19, f24, f4 + FMADD f5, f19, f25, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + LFD f16, 4 * SIZE(BO) + LFD f17, 5 * SIZE(BO) + LFD f18, 6 * SIZE(BO) + LFD f19, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 16 * SIZE(AO) + LFD f21, 17 * SIZE(AO) + LFD f22, 18 * SIZE(AO) + LFD f23, 19 * SIZE(AO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f18, f26, f2 + FMADD f3, f18, f27, f3 + + FMADD f4, f19, f24, f4 + FMADD f5, f19, f25, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f24, 20 * SIZE(AO) + LFD f25, 21 * SIZE(AO) + LFD f26, 22 * SIZE(AO) + LFD f27, 23 * SIZE(AO) + + LFD f16, 8 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 10 * SIZE(BO) + LFD f19, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbt PREA, AO + dcbt PREA, BO + bdnz LL(52) + .align 4 + +LL(55): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(57) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f16, 2 * SIZE(BO) + LFD f17, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(56) + .align 4 + +LL(57): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(51) + .align 4 + +LL(60): + andi. I, M, 1 + ble LL(69) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(65) + .align 4 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f17, f20, f2 + FMADD f3, f16, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f23, f5 + FMADD f6, f19, f22, f6 + FMADD f7, f18, f23, f7 + + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f17, f20, f2 + FMADD f3, f16, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f23, f5 + FMADD f6, f19, f22, f6 + FMADD f7, f18, f23, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR,r0 + ble LL(67) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f17, f20, f2 + FMADD f3, f16, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + + bdnz LL(66) + .align 4 + +LL(67): +#ifndef CONJ + FSUB f0, f0, f1 + FADD f1, f2, f3 +#else + FADD f0, f0, f1 + FSUB f1, f3, f2 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 1 + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(69): +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/ztrsm_kernel_power6_RT.S b/kernel/power/ztrsm_kernel_power6_RT.S new file mode 100644 index 0000000..069a73c --- /dev/null +++ b/kernel/power/ztrsm_kernel_power6_RT.S @@ -0,0 +1,4696 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define AORIG r19 +#define TEMP r20 +#define KK r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO1 r26 +#define CO2 r27 +#define CO3 r28 +#define CO4 r29 + +#define PREA r30 +#define PREC r31 + +#ifndef CONJ +#define FMA1 FMADD +#define FMA2 FMADD +#define FMA3 FNMSUB +#define FMA4 FMADD +#elif defined(LN) || defined(LT) +#define FMA1 FMADD +#define FMA2 FMADD +#define FMA3 FMADD +#define FMA4 FNMSUB +#else +#define FMA1 FMADD +#define FMA2 FNMSUB +#define FMA3 FMADD +#define FMA4 FMADD +#endif + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) +#endif + + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + li PREA, 48 * SIZE + li PREC, 4 * SIZE + + andi. J, N, 1 + ble LL(30) + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, C, LDC +#endif + ble LL(60) + .align 4 + +LL(51): +#if defined(LT) || defined(RN) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(55) + .align 4 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 8 * SIZE(AO) + LFD f21, 9 * SIZE(AO) + LFD f22, 10 * SIZE(AO) + LFD f23, 11 * SIZE(AO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f18, f26, f2 + FMADD f3, f18, f27, f3 + + FMADD f4, f19, f24, f4 + FMADD f5, f19, f25, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + LFD f16, 4 * SIZE(BO) + LFD f17, 5 * SIZE(BO) + LFD f18, 6 * SIZE(BO) + LFD f19, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 16 * SIZE(AO) + LFD f21, 17 * SIZE(AO) + LFD f22, 18 * SIZE(AO) + LFD f23, 19 * SIZE(AO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f18, f26, f2 + FMADD f3, f18, f27, f3 + + FMADD f4, f19, f24, f4 + FMADD f5, f19, f25, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f24, 20 * SIZE(AO) + LFD f25, 21 * SIZE(AO) + LFD f26, 22 * SIZE(AO) + LFD f27, 23 * SIZE(AO) + + LFD f16, 8 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 10 * SIZE(BO) + LFD f19, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbt PREA, AO + dcbt PREA, BO + bdnz LL(52) + .align 4 + +LL(55): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(57) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f16, 2 * SIZE(BO) + LFD f17, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(56) + .align 4 + +LL(57): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(51) + .align 4 + +LL(60): + andi. I, M, 1 + ble LL(69) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(65) + .align 4 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f17, f20, f2 + FMADD f3, f16, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f23, f5 + FMADD f6, f19, f22, f6 + FMADD f7, f18, f23, f7 + + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f17, f20, f2 + FMADD f3, f16, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f23, f5 + FMADD f6, f19, f22, f6 + FMADD f7, f18, f23, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR,r0 + ble LL(67) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f17, f20, f2 + FMADD f3, f16, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + + bdnz LL(66) + .align 4 + +LL(67): +#ifndef CONJ + FSUB f0, f0, f1 + FADD f1, f2, f3 +#else + FADD f0, f0, f1 + FSUB f1, f3, f2 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 1 + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(69): +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +LL(30): + andi. J, N, 2 + ble LL(50) + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble LL(40) + .align 4 + +LL(31): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbtst CO1, PREC + dcbtst CO2, PREC + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbtst CO1, PREC + dcbtst CO2, PREC + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(35) + .align 4 + +LL(32): + dcbt AO, PREA + dcbtst BO, PREA + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 4 * SIZE(AO) + LFD f28, 4 * SIZE(BO) + LFD f25, 5 * SIZE(AO) + LFD f29, 5 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 6 * SIZE(AO) + LFD f30, 6 * SIZE(BO) + LFD f27, 7 * SIZE(AO) + LFD f31, 7 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 8 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f17, 9 * SIZE(AO) + LFD f21, 9 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 10 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f19, 11 * SIZE(AO) + LFD f23, 11 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 12 * SIZE(AO) + LFD f28, 12 * SIZE(BO) + LFD f25, 13 * SIZE(AO) + LFD f29, 13 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 14 * SIZE(AO) + LFD f30, 14 * SIZE(BO) + LFD f27, 15 * SIZE(AO) + LFD f31, 15 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 16 * SIZE(AO) + LFD f20, 16 * SIZE(BO) + LFD f17, 17 * SIZE(AO) + LFD f21, 17 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 18 * SIZE(AO) + LFD f22, 18 * SIZE(BO) + LFD f19, 19 * SIZE(AO) + LFD f23, 19 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 20 * SIZE(AO) + LFD f28, 20 * SIZE(BO) + LFD f25, 21 * SIZE(AO) + LFD f29, 21 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 22 * SIZE(AO) + LFD f30, 22 * SIZE(BO) + LFD f27, 23 * SIZE(AO) + LFD f31, 23 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 24 * SIZE(AO) + LFD f20, 24 * SIZE(BO) + LFD f17, 25 * SIZE(AO) + LFD f21, 25 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 26 * SIZE(AO) + LFD f22, 26 * SIZE(BO) + LFD f19, 27 * SIZE(AO) + LFD f23, 27 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 28 * SIZE(AO) + LFD f28, 28 * SIZE(BO) + LFD f25, 29 * SIZE(AO) + LFD f29, 29 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 30 * SIZE(AO) + LFD f30, 30 * SIZE(BO) + LFD f27, 31 * SIZE(AO) + LFD f31, 31 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 32 * SIZE(AO) + LFD f20, 32 * SIZE(BO) + LFD f17, 33 * SIZE(AO) + LFD f21, 33 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 34 * SIZE(AO) + LFD f22, 34 * SIZE(BO) + LFD f19, 35 * SIZE(AO) + LFD f23, 35 * SIZE(BO) + + addi AO, AO, 32 * SIZE + addi BO, BO, 32 * SIZE + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + bdnz LL(32) + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + + bdnz LL(36) + .align 4 + +LL(38): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 2 + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f8, f18, f8 + FSUB f9, f19, f9 + + FSUB f2, f20, f2 + FSUB f3, f21, f3 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 + + FSUB f8, f20, f8 + FADD f9, f21, f9 + FSUB f10, f22, f10 + FADD f11, f23, f11 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FMADD f8, f19, f11, f8 + FNMSUB f9, f19, f10, f9 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + FNMSUB f8, f18, f10, f8 + FNMSUB f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f8, f20, f8, f12 + FMADD f9, f20, f9, f13 + +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FMSUB f8, f19, f11, f8 + FNMADD f9, f19, f10, f9 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + FNMADD f8, f18, f10, f8 + FNMADD f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f8, f20, f8, f12 + FMSUB f9, f20, f9, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FMADD f10, f19, f9, f10 + FNMSUB f11, f19, f8, f11 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + FNMSUB f10, f18, f8, f10 + FNMSUB f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 + FMSUB f10, f20, f10, f12 + FMADD f11, f20, f11, f13 + +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FMSUB f10, f19, f9, f10 + FNMADD f11, f19, f8, f11 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + FNMADD f10, f18, f8, f10 + FNMADD f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 + FMADD f10, f20, f10, f12 + FMSUB f11, f20, f11, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f8, f19, f1, f8 + FNMSUB f9, f19, f0, f9 + FMADD f10, f19, f3, f10 + FNMSUB f11, f19, f2, f11 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMSUB f8, f20, f8, f4 + FMADD f9, f20, f9, f5 + FMSUB f10, f20, f10, f6 + FMADD f11, f20, f11, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f8, f19, f1, f8 + FNMADD f9, f19, f0, f9 + FMSUB f10, f19, f3, f10 + FNMADD f11, f19, f2, f11 + + FNMADD f8, f18, f0, f8 + FNMADD f9, f18, f1, f9 + FNMADD f10, f18, f2, f10 + FNMADD f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMADD f8, f20, f8, f4 + FMSUB f9, f20, f9, f5 + FMADD f10, f20, f10, f6 + FMSUB f11, f20, f11, f7 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f9, f0 + FNMSUB f1, f19, f8, f1 + FMADD f2, f19, f11, f2 + FNMSUB f3, f19, f10, f3 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f9, f0 + FNMADD f1, f19, f8, f1 + FMSUB f2, f19, f11, f2 + FNMADD f3, f19, f10, f3 + + FNMADD f0, f18, f8, f0 + FNMADD f1, f18, f9, f1 + FNMADD f2, f18, f10, f2 + FNMADD f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f9, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f3, 5 * SIZE(BO) + STFD f10, 6 * SIZE(BO) + STFD f11, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f10, 6 * SIZE(AO) + STFD f11, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f8, 0 * SIZE(CO2) + STFD f9, 1 * SIZE(CO2) + STFD f10, 2 * SIZE(CO2) + STFD f11, 3 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(31) + .align 4 + +LL(40): + andi. I, M, 1 + ble LL(49) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 4 + +LL(42): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f18, f26, f2 + FMADD f3, f18, f27, f3 + + FMADD f4, f19, f24, f4 + FMADD f5, f19, f25, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f18, f26, f2 + FMADD f3, f18, f27, f3 + + FMADD f4, f19, f24, f4 + FMADD f5, f19, f25, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi BO, BO, 16 * SIZE + addi AO, AO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(47) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(47): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else +#if defined(LN) || defined(LT) + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f3 + FMUL f13, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f12 + FMADD f3, f20, f3, f13 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f12 + FMSUB f3, f20, f3, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(49): +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + .align 4 + +LL(50): + srawi. J, N, 2 + ble LL(999) + .align 4 + +LL(10): +#ifdef RT + slwi r0, K, 2 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + ble LL(20) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f20, 0 * SIZE(B) + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(B) + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(B) + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(B) + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 2 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f20, 0 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +LL(12): + dcbt AO, PREA + dcbtst BO, PREA + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 4 * SIZE(AO) + LFD f29, 5 * SIZE(AO) + LFD f30, 6 * SIZE(AO) + LFD f31, 7 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 24 * SIZE(BO) + LFD f21, 25 * SIZE(BO) + LFD f22, 26 * SIZE(BO) + LFD f23, 27 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 28 * SIZE(BO) + LFD f25, 29 * SIZE(BO) + LFD f26, 30 * SIZE(BO) + LFD f27, 31 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 32 * SIZE(BO) + LFD f21, 33 * SIZE(BO) + LFD f22, 34 * SIZE(BO) + LFD f23, 35 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 36 * SIZE(BO) + LFD f25, 37 * SIZE(BO) + LFD f26, 38 * SIZE(BO) + LFD f27, 39 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 20 * SIZE(AO) + LFD f29, 21 * SIZE(AO) + LFD f30, 22 * SIZE(AO) + LFD f31, 23 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 40 * SIZE(BO) + LFD f21, 41 * SIZE(BO) + LFD f22, 42 * SIZE(BO) + LFD f23, 43 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 44 * SIZE(BO) + LFD f25, 45 * SIZE(BO) + LFD f26, 46 * SIZE(BO) + LFD f27, 47 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 24 * SIZE(AO) + LFD f17, 25 * SIZE(AO) + LFD f18, 26 * SIZE(AO) + LFD f19, 27 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 48 * SIZE(BO) + LFD f21, 49 * SIZE(BO) + LFD f22, 50 * SIZE(BO) + LFD f23, 51 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 52 * SIZE(BO) + LFD f25, 53 * SIZE(BO) + LFD f26, 54 * SIZE(BO) + LFD f27, 55 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 28 * SIZE(AO) + LFD f29, 29 * SIZE(AO) + LFD f30, 30 * SIZE(AO) + LFD f31, 31 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 56 * SIZE(BO) + LFD f21, 57 * SIZE(BO) + LFD f22, 58 * SIZE(BO) + LFD f23, 59 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 60 * SIZE(BO) + LFD f25, 61 * SIZE(BO) + LFD f26, 62 * SIZE(BO) + LFD f27, 63 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 32 * SIZE(AO) + LFD f17, 33 * SIZE(AO) + LFD f18, 34 * SIZE(AO) + LFD f19, 35 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 64 * SIZE(BO) + LFD f21, 65 * SIZE(BO) + LFD f22, 66 * SIZE(BO) + LFD f23, 67 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 68 * SIZE(BO) + LFD f25, 69 * SIZE(BO) + LFD f26, 70 * SIZE(BO) + LFD f27, 71 * SIZE(BO) + + addi AO, AO, 32 * SIZE + addi BO, BO, 64 * SIZE + + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble LL(18) + .align 4 + +LL(16): + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + + bdnz LL(16) + .align 4 + +LL(18): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 2 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f18, f4 + FSUB f5, f19, f5 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f12, f22, f12 + FSUB f13, f23, f13 + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + FSUB f2, f24, f2 + FSUB f3, f25, f3 + FSUB f6, f26, f6 + FSUB f7, f27, f7 + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f10, f28, f10 + FSUB f11, f29, f11 + FSUB f14, f30, f14 + FSUB f15, f31, f15 + +#else + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f24, 6 * SIZE(AO) + LFD f25, 7 * SIZE(AO) + LFD f26, 4 * SIZE(AO) + LFD f27, 5 * SIZE(AO) + LFD f28, 0 * SIZE(AO) + LFD f29, 1 * SIZE(AO) + + FMUL f16, f25, f3 + FMUL f17, f25, f2 + FMUL f18, f25, f7 + FMUL f19, f25, f6 + + FMUL f20, f25, f11 + FMUL f21, f25, f10 + FMUL f22, f25, f15 + FMUL f23, f25, f14 + +#ifndef CONJ + + FMSUB f2, f24, f2, f16 + FMADD f3, f24, f3, f17 + FMSUB f6, f24, f6, f18 + FMADD f7, f24, f7, f19 + + FMSUB f10, f24, f10, f20 + FMADD f11, f24, f11, f21 + FMSUB f14, f24, f14, f22 + FMADD f15, f24, f15, f23 + + FMADD f0, f27, f3, f0 + FNMSUB f1, f27, f2, f1 + FMADD f4, f27, f7, f4 + FNMSUB f5, f27, f6, f5 + + FMADD f8, f27, f11, f8 + FNMSUB f9, f27, f10, f9 + FMADD f12, f27, f15, f12 + FNMSUB f13, f27, f14, f13 + + FNMSUB f0, f26, f2, f0 + FNMSUB f1, f26, f3, f1 + FNMSUB f4, f26, f6, f4 + FNMSUB f5, f26, f7, f5 + + FNMSUB f8, f26, f10, f8 + FNMSUB f9, f26, f11, f9 + FNMSUB f12, f26, f14, f12 + FNMSUB f13, f26, f15, f13 + + FMUL f16, f29, f1 + FMUL f17, f29, f0 + FMUL f18, f29, f5 + FMUL f19, f29, f4 + + FMUL f20, f29, f9 + FMUL f21, f29, f8 + FMUL f22, f29, f13 + FMUL f23, f29, f12 + + FMSUB f0, f28, f0, f16 + FMADD f1, f28, f1, f17 + FMSUB f4, f28, f4, f18 + FMADD f5, f28, f5, f19 + + FMSUB f8, f28, f8, f20 + FMADD f9, f28, f9, f21 + FMSUB f12, f28, f12, f22 + FMADD f13, f28, f13, f23 +#else + + FMADD f2, f24, f2, f16 + FMSUB f3, f24, f3, f17 + FMADD f6, f24, f6, f18 + FMSUB f7, f24, f7, f19 + + FMADD f10, f24, f10, f20 + FMSUB f11, f24, f11, f21 + FMADD f14, f24, f14, f22 + FMSUB f15, f24, f15, f23 + + FMSUB f0, f27, f3, f0 + FNMADD f1, f27, f2, f1 + FMSUB f4, f27, f7, f4 + FNMADD f5, f27, f6, f5 + + FMSUB f8, f27, f11, f8 + FNMADD f9, f27, f10, f9 + FMSUB f12, f27, f15, f12 + FNMADD f13, f27, f14, f13 + + FNMADD f0, f26, f2, f0 + FNMADD f1, f26, f3, f1 + FNMADD f4, f26, f6, f4 + FNMADD f5, f26, f7, f5 + + FNMADD f8, f26, f10, f8 + FNMADD f9, f26, f11, f9 + FNMADD f12, f26, f14, f12 + FNMADD f13, f26, f15, f13 + + FMUL f16, f29, f1 + FMUL f17, f29, f0 + FMUL f18, f29, f5 + FMUL f19, f29, f4 + + FMUL f20, f29, f9 + FMUL f21, f29, f8 + FMUL f22, f29, f13 + FMUL f23, f29, f12 + + FMADD f0, f28, f0, f16 + FMSUB f1, f28, f1, f17 + FMADD f4, f28, f4, f18 + FMSUB f5, f28, f5, f19 + + FMADD f8, f28, f8, f20 + FMSUB f9, f28, f9, f21 + FMADD f12, f28, f12, f22 + FMSUB f13, f28, f13, f23 +#endif +#endif + +#ifdef LT + LFD f24, 0 * SIZE(AO) + LFD f25, 1 * SIZE(AO) + LFD f26, 2 * SIZE(AO) + LFD f27, 3 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FMUL f16, f25, f1 + FMUL f17, f25, f0 + FMUL f18, f25, f5 + FMUL f19, f25, f4 + + FMUL f20, f25, f9 + FMUL f21, f25, f8 + FMUL f22, f25, f13 + FMUL f23, f25, f12 + +#ifndef CONJ + FMSUB f0, f24, f0, f16 + FMADD f1, f24, f1, f17 + FMSUB f4, f24, f4, f18 + FMADD f5, f24, f5, f19 + + FMSUB f8, f24, f8, f20 + FMADD f9, f24, f9, f21 + FMSUB f12, f24, f12, f22 + FMADD f13, f24, f13, f23 + + FMADD f2, f27, f1, f2 + FNMSUB f3, f27, f0, f3 + FMADD f6, f27, f5, f6 + FNMSUB f7, f27, f4, f7 + + FMADD f10, f27, f9, f10 + FNMSUB f11, f27, f8, f11 + FMADD f14, f27, f13, f14 + FNMSUB f15, f27, f12, f15 + + FNMSUB f2, f26, f0, f2 + FNMSUB f3, f26, f1, f3 + FNMSUB f6, f26, f4, f6 + FNMSUB f7, f26, f5, f7 + + FNMSUB f10, f26, f8, f10 + FNMSUB f11, f26, f9, f11 + FNMSUB f14, f26, f12, f14 + FNMSUB f15, f26, f13, f15 + + FMUL f16, f29, f3 + FMUL f17, f29, f2 + FMUL f18, f29, f7 + FMUL f19, f29, f6 + + FMUL f20, f29, f11 + FMUL f21, f29, f10 + FMUL f22, f29, f15 + FMUL f23, f29, f14 + + FMSUB f2, f28, f2, f16 + FMADD f3, f28, f3, f17 + FMSUB f6, f28, f6, f18 + FMADD f7, f28, f7, f19 + + FMSUB f10, f28, f10, f20 + FMADD f11, f28, f11, f21 + FMSUB f14, f28, f14, f22 + FMADD f15, f28, f15, f23 +#else + + FMADD f0, f24, f0, f16 + FMSUB f1, f24, f1, f17 + FMADD f4, f24, f4, f18 + FMSUB f5, f24, f5, f19 + + FMADD f8, f24, f8, f20 + FMSUB f9, f24, f9, f21 + FMADD f12, f24, f12, f22 + FMSUB f13, f24, f13, f23 + + FMSUB f2, f27, f1, f2 + FNMADD f3, f27, f0, f3 + FMSUB f6, f27, f5, f6 + FNMADD f7, f27, f4, f7 + + FMSUB f10, f27, f9, f10 + FNMADD f11, f27, f8, f11 + FMSUB f14, f27, f13, f14 + FNMADD f15, f27, f12, f15 + + FNMADD f2, f26, f0, f2 + FNMADD f3, f26, f1, f3 + FNMADD f6, f26, f4, f6 + FNMADD f7, f26, f5, f7 + + FNMADD f10, f26, f8, f10 + FNMADD f11, f26, f9, f11 + FNMADD f14, f26, f12, f14 + FNMADD f15, f26, f13, f15 + + FMUL f16, f29, f3 + FMUL f17, f29, f2 + FMUL f18, f29, f7 + FMUL f19, f29, f6 + + FMUL f20, f29, f11 + FMUL f21, f29, f10 + FMUL f22, f29, f15 + FMUL f23, f29, f14 + + FMADD f2, f28, f2, f16 + FMSUB f3, f28, f3, f17 + FMADD f6, f28, f6, f18 + FMSUB f7, f28, f7, f19 + + FMADD f10, f28, f10, f20 + FMSUB f11, f28, f11, f21 + FMADD f14, f28, f14, f22 + FMSUB f15, f28, f15, f23 +#endif +#endif + +#ifdef RN + LFD f24, 0 * SIZE(BO) + LFD f25, 1 * SIZE(BO) + LFD f26, 2 * SIZE(BO) + LFD f27, 3 * SIZE(BO) + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) + + FMUL f16, f25, f1 + FMUL f17, f25, f0 + FMUL f18, f25, f3 + FMUL f19, f25, f2 + +#ifndef CONJ + + FMSUB f0, f24, f0, f16 + FMADD f1, f24, f1, f17 + FMSUB f2, f24, f2, f18 + FMADD f3, f24, f3, f19 + + FMADD f4, f27, f1, f4 + FNMSUB f5, f27, f0, f5 + FMADD f6, f27, f3, f6 + FNMSUB f7, f27, f2, f7 + + FNMSUB f4, f26, f0, f4 + FNMSUB f5, f26, f1, f5 + FNMSUB f6, f26, f2, f6 + FNMSUB f7, f26, f3, f7 + + FMADD f8, f29, f1, f8 + FNMSUB f9, f29, f0, f9 + FMADD f10, f29, f3, f10 + FNMSUB f11, f29, f2, f11 + + FNMSUB f8, f28, f0, f8 + FNMSUB f9, f28, f1, f9 + FNMSUB f10, f28, f2, f10 + FNMSUB f11, f28, f3, f11 + + FMADD f12, f31, f1, f12 + FNMSUB f13, f31, f0, f13 + FMADD f14, f31, f3, f14 + FNMSUB f15, f31, f2, f15 + + FNMSUB f12, f30, f0, f12 + FNMSUB f13, f30, f1, f13 + FNMSUB f14, f30, f2, f14 + FNMSUB f15, f30, f3, f15 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMUL f18, f27, f7 + FMUL f19, f27, f6 + + FMSUB f4, f26, f4, f16 + FMADD f5, f26, f5, f17 + FMSUB f6, f26, f6, f18 + FMADD f7, f26, f7, f19 + + FMADD f8, f29, f5, f8 + FNMSUB f9, f29, f4, f9 + FMADD f10, f29, f7, f10 + FNMSUB f11, f29, f6, f11 + + FNMSUB f8, f28, f4, f8 + FNMSUB f9, f28, f5, f9 + FNMSUB f10, f28, f6, f10 + FNMSUB f11, f28, f7, f11 + + FMADD f12, f31, f5, f12 + FNMSUB f13, f31, f4, f13 + FMADD f14, f31, f7, f14 + FNMSUB f15, f31, f6, f15 + + FNMSUB f12, f30, f4, f12 + FNMSUB f13, f30, f5, f13 + FNMSUB f14, f30, f6, f14 + FNMSUB f15, f30, f7, f15 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 22 * SIZE(BO) + LFD f29, 23 * SIZE(BO) + LFD f30, 30 * SIZE(BO) + LFD f31, 31 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMUL f18, f27, f11 + FMUL f19, f27, f10 + + FMSUB f8, f26, f8, f16 + FMADD f9, f26, f9, f17 + FMSUB f10, f26, f10, f18 + FMADD f11, f26, f11, f19 + + FMADD f12, f29, f9, f12 + FNMSUB f13, f29, f8, f13 + FMADD f14, f29, f11, f14 + FNMSUB f15, f29, f10, f15 + + FNMSUB f12, f28, f8, f12 + FNMSUB f13, f28, f9, f13 + FNMSUB f14, f28, f10, f14 + FNMSUB f15, f28, f11, f15 + + FMUL f16, f31, f13 + FMUL f17, f31, f12 + FMUL f18, f31, f15 + FMUL f19, f31, f14 + + FMSUB f12, f30, f12, f16 + FMADD f13, f30, f13, f17 + FMSUB f14, f30, f14, f18 + FMADD f15, f30, f15, f19 + +#else + + FMADD f0, f24, f0, f16 + FMSUB f1, f24, f1, f17 + FMADD f2, f24, f2, f18 + FMSUB f3, f24, f3, f19 + + FMSUB f4, f27, f1, f4 + FNMADD f5, f27, f0, f5 + FMSUB f6, f27, f3, f6 + FNMADD f7, f27, f2, f7 + + FNMADD f4, f26, f0, f4 + FNMADD f5, f26, f1, f5 + FNMADD f6, f26, f2, f6 + FNMADD f7, f26, f3, f7 + + FMSUB f8, f29, f1, f8 + FNMADD f9, f29, f0, f9 + FMSUB f10, f29, f3, f10 + FNMADD f11, f29, f2, f11 + + FNMADD f8, f28, f0, f8 + FNMADD f9, f28, f1, f9 + FNMADD f10, f28, f2, f10 + FNMADD f11, f28, f3, f11 + + FMSUB f12, f31, f1, f12 + FNMADD f13, f31, f0, f13 + FMSUB f14, f31, f3, f14 + FNMADD f15, f31, f2, f15 + + FNMADD f12, f30, f0, f12 + FNMADD f13, f30, f1, f13 + FNMADD f14, f30, f2, f14 + FNMADD f15, f30, f3, f15 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMUL f18, f27, f7 + FMUL f19, f27, f6 + + FMADD f4, f26, f4, f16 + FMSUB f5, f26, f5, f17 + FMADD f6, f26, f6, f18 + FMSUB f7, f26, f7, f19 + + FMSUB f8, f29, f5, f8 + FNMADD f9, f29, f4, f9 + FMSUB f10, f29, f7, f10 + FNMADD f11, f29, f6, f11 + + FNMADD f8, f28, f4, f8 + FNMADD f9, f28, f5, f9 + FNMADD f10, f28, f6, f10 + FNMADD f11, f28, f7, f11 + + FMSUB f12, f31, f5, f12 + FNMADD f13, f31, f4, f13 + FMSUB f14, f31, f7, f14 + FNMADD f15, f31, f6, f15 + + FNMADD f12, f30, f4, f12 + FNMADD f13, f30, f5, f13 + FNMADD f14, f30, f6, f14 + FNMADD f15, f30, f7, f15 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 22 * SIZE(BO) + LFD f29, 23 * SIZE(BO) + LFD f30, 30 * SIZE(BO) + LFD f31, 31 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMUL f18, f27, f11 + FMUL f19, f27, f10 + + FMADD f8, f26, f8, f16 + FMSUB f9, f26, f9, f17 + FMADD f10, f26, f10, f18 + FMSUB f11, f26, f11, f19 + + FMSUB f12, f29, f9, f12 + FNMADD f13, f29, f8, f13 + FMSUB f14, f29, f11, f14 + FNMADD f15, f29, f10, f15 + + FNMADD f12, f28, f8, f12 + FNMADD f13, f28, f9, f13 + FNMADD f14, f28, f10, f14 + FNMADD f15, f28, f11, f15 + + FMUL f16, f31, f13 + FMUL f17, f31, f12 + FMUL f18, f31, f15 + FMUL f19, f31, f14 + + FMADD f12, f30, f12, f16 + FMSUB f13, f30, f13, f17 + FMADD f14, f30, f14, f18 + FMSUB f15, f30, f15, f19 +#endif + +#endif + +#ifdef RT + LFD f24, 30 * SIZE(BO) + LFD f25, 31 * SIZE(BO) + LFD f26, 28 * SIZE(BO) + LFD f27, 29 * SIZE(BO) + LFD f28, 26 * SIZE(BO) + LFD f29, 27 * SIZE(BO) + LFD f30, 24 * SIZE(BO) + LFD f31, 25 * SIZE(BO) + + FMUL f16, f25, f13 + FMUL f17, f25, f12 + FMUL f18, f25, f15 + FMUL f19, f25, f14 + +#ifndef CONJ + + FMSUB f12, f24, f12, f16 + FMADD f13, f24, f13, f17 + FMSUB f14, f24, f14, f18 + FMADD f15, f24, f15, f19 + + FMADD f8, f27, f13, f8 + FNMSUB f9, f27, f12, f9 + FMADD f10, f27, f15, f10 + FNMSUB f11, f27, f14, f11 + + FNMSUB f8, f26, f12, f8 + FNMSUB f9, f26, f13, f9 + FNMSUB f10, f26, f14, f10 + FNMSUB f11, f26, f15, f11 + + FMADD f4, f29, f13, f4 + FNMSUB f5, f29, f12, f5 + FMADD f6, f29, f15, f6 + FNMSUB f7, f29, f14, f7 + + FNMSUB f4, f28, f12, f4 + FNMSUB f5, f28, f13, f5 + FNMSUB f6, f28, f14, f6 + FNMSUB f7, f28, f15, f7 + + FMADD f0, f31, f13, f0 + FNMSUB f1, f31, f12, f1 + FMADD f2, f31, f15, f2 + FNMSUB f3, f31, f14, f3 + + FNMSUB f0, f30, f12, f0 + FNMSUB f1, f30, f13, f1 + FNMSUB f2, f30, f14, f2 + FNMSUB f3, f30, f15, f3 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 18 * SIZE(BO) + LFD f29, 19 * SIZE(BO) + LFD f30, 16 * SIZE(BO) + LFD f31, 17 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMUL f18, f27, f11 + FMUL f19, f27, f10 + + FMSUB f8, f26, f8, f16 + FMADD f9, f26, f9, f17 + FMSUB f10, f26, f10, f18 + FMADD f11, f26, f11, f19 + + FMADD f4, f29, f9, f4 + FNMSUB f5, f29, f8, f5 + FMADD f6, f29, f11, f6 + FNMSUB f7, f29, f10, f7 + + FNMSUB f4, f28, f8, f4 + FNMSUB f5, f28, f9, f5 + FNMSUB f6, f28, f10, f6 + FNMSUB f7, f28, f11, f7 + + FMADD f0, f31, f9, f0 + FNMSUB f1, f31, f8, f1 + FMADD f2, f31, f11, f2 + FNMSUB f3, f31, f10, f3 + + FNMSUB f0, f30, f8, f0 + FNMSUB f1, f30, f9, f1 + FNMSUB f2, f30, f10, f2 + FNMSUB f3, f30, f11, f3 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 8 * SIZE(BO) + LFD f29, 9 * SIZE(BO) + LFD f30, 0 * SIZE(BO) + LFD f31, 1 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMUL f18, f27, f7 + FMUL f19, f27, f6 + + FMSUB f4, f26, f4, f16 + FMADD f5, f26, f5, f17 + FMSUB f6, f26, f6, f18 + FMADD f7, f26, f7, f19 + + FMADD f0, f29, f5, f0 + FNMSUB f1, f29, f4, f1 + FMADD f2, f29, f7, f2 + FNMSUB f3, f29, f6, f3 + + FNMSUB f0, f28, f4, f0 + FNMSUB f1, f28, f5, f1 + FNMSUB f2, f28, f6, f2 + FNMSUB f3, f28, f7, f3 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMUL f18, f31, f3 + FMUL f19, f31, f2 + + FMSUB f0, f30, f0, f16 + FMADD f1, f30, f1, f17 + FMSUB f2, f30, f2, f18 + FMADD f3, f30, f3, f19 + +#else + + FMADD f12, f24, f12, f16 + FMSUB f13, f24, f13, f17 + FMADD f14, f24, f14, f18 + FMSUB f15, f24, f15, f19 + + FMSUB f8, f27, f13, f8 + FNMADD f9, f27, f12, f9 + FMSUB f10, f27, f15, f10 + FNMADD f11, f27, f14, f11 + + FNMADD f8, f26, f12, f8 + FNMADD f9, f26, f13, f9 + FNMADD f10, f26, f14, f10 + FNMADD f11, f26, f15, f11 + + FMSUB f4, f29, f13, f4 + FNMADD f5, f29, f12, f5 + FMSUB f6, f29, f15, f6 + FNMADD f7, f29, f14, f7 + + FNMADD f4, f28, f12, f4 + FNMADD f5, f28, f13, f5 + FNMADD f6, f28, f14, f6 + FNMADD f7, f28, f15, f7 + + FMSUB f0, f31, f13, f0 + FNMADD f1, f31, f12, f1 + FMSUB f2, f31, f15, f2 + FNMADD f3, f31, f14, f3 + + FNMADD f0, f30, f12, f0 + FNMADD f1, f30, f13, f1 + FNMADD f2, f30, f14, f2 + FNMADD f3, f30, f15, f3 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 18 * SIZE(BO) + LFD f29, 19 * SIZE(BO) + LFD f30, 16 * SIZE(BO) + LFD f31, 17 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMUL f18, f27, f11 + FMUL f19, f27, f10 + + FMADD f8, f26, f8, f16 + FMSUB f9, f26, f9, f17 + FMADD f10, f26, f10, f18 + FMSUB f11, f26, f11, f19 + + FMSUB f4, f29, f9, f4 + FNMADD f5, f29, f8, f5 + FMSUB f6, f29, f11, f6 + FNMADD f7, f29, f10, f7 + + FNMADD f4, f28, f8, f4 + FNMADD f5, f28, f9, f5 + FNMADD f6, f28, f10, f6 + FNMADD f7, f28, f11, f7 + + FMSUB f0, f31, f9, f0 + FNMADD f1, f31, f8, f1 + FMSUB f2, f31, f11, f2 + FNMADD f3, f31, f10, f3 + + FNMADD f0, f30, f8, f0 + FNMADD f1, f30, f9, f1 + FNMADD f2, f30, f10, f2 + FNMADD f3, f30, f11, f3 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 8 * SIZE(BO) + LFD f29, 9 * SIZE(BO) + LFD f30, 0 * SIZE(BO) + LFD f31, 1 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMUL f18, f27, f7 + FMUL f19, f27, f6 + + FMADD f4, f26, f4, f16 + FMSUB f5, f26, f5, f17 + FMADD f6, f26, f6, f18 + FMSUB f7, f26, f7, f19 + + FMSUB f0, f29, f5, f0 + FNMADD f1, f29, f4, f1 + FMSUB f2, f29, f7, f2 + FNMADD f3, f29, f6, f3 + + FNMADD f0, f28, f4, f0 + FNMADD f1, f28, f5, f1 + FNMADD f2, f28, f6, f2 + FNMADD f3, f28, f7, f3 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMUL f18, f31, f3 + FMUL f19, f31, f2 + + FMADD f0, f30, f0, f16 + FMSUB f1, f30, f1, f17 + FMADD f2, f30, f2, f18 + FMSUB f3, f30, f3, f19 + +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f4, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + STFD f8, 4 * SIZE(BO) + STFD f9, 5 * SIZE(BO) + STFD f12, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f3, 9 * SIZE(BO) + STFD f6, 10 * SIZE(BO) + STFD f7, 11 * SIZE(BO) + STFD f10, 12 * SIZE(BO) + STFD f11, 13 * SIZE(BO) + STFD f14, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) + +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 2 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(11) + .align 4 + +LL(20): + andi. I, M, 1 + ble LL(29) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 2 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 4 + +LL(22): + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + LFD f28, 4 * SIZE(AO) + LFD f29, 5 * SIZE(AO) + LFD f30, 6 * SIZE(AO) + LFD f31, 7 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA4 f7, f17, f22, f7 + FMA2 f5, f16, f23, f5 + FMA3 f6, f17, f23, f6 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA1 f8, f16, f24, f8 + FMA4 f11, f17, f24, f11 + FMA2 f9, f16, f25, f9 + FMA3 f10, f17, f25, f10 + + FMA1 f12, f16, f26, f12 + FMA4 f15, f17, f26, f15 + FMA2 f13, f16, f27, f13 + FMA3 f14, f17, f27, f14 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMA1 f0, f18, f20, f0 + FMA4 f3, f19, f20, f3 + FMA2 f1, f18, f21, f1 + FMA3 f2, f19, f21, f2 + + FMA1 f4, f18, f22, f4 + FMA4 f7, f19, f22, f7 + FMA2 f5, f18, f23, f5 + FMA3 f6, f19, f23, f6 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMA1 f8, f18, f24, f8 + FMA4 f11, f19, f24, f11 + FMA2 f9, f18, f25, f9 + FMA3 f10, f19, f25, f10 + + FMA1 f12, f18, f26, f12 + FMA4 f15, f19, f26, f15 + FMA2 f13, f18, f27, f13 + FMA3 f14, f19, f27, f14 + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA4 f3, f29, f20, f3 + FMA2 f1, f28, f21, f1 + FMA3 f2, f29, f21, f2 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA4 f7, f29, f22, f7 + FMA2 f5, f28, f23, f5 + FMA3 f6, f29, f23, f6 + + LFD f20, 24 * SIZE(BO) + LFD f21, 25 * SIZE(BO) + LFD f22, 26 * SIZE(BO) + LFD f23, 27 * SIZE(BO) + + FMA1 f8, f28, f24, f8 + FMA4 f11, f29, f24, f11 + FMA2 f9, f28, f25, f9 + FMA3 f10, f29, f25, f10 + + FMA1 f12, f28, f26, f12 + FMA4 f15, f29, f26, f15 + FMA2 f13, f28, f27, f13 + FMA3 f14, f29, f27, f14 + + LFD f24, 28 * SIZE(BO) + LFD f25, 29 * SIZE(BO) + LFD f26, 30 * SIZE(BO) + LFD f27, 31 * SIZE(BO) + + FMA1 f0, f30, f20, f0 + FMA4 f3, f31, f20, f3 + FMA2 f1, f30, f21, f1 + FMA3 f2, f31, f21, f2 + + FMA1 f4, f30, f22, f4 + FMA4 f7, f31, f22, f7 + FMA2 f5, f30, f23, f5 + FMA3 f6, f31, f23, f6 + + LFD f20, 32 * SIZE(BO) + LFD f21, 33 * SIZE(BO) + LFD f22, 34 * SIZE(BO) + LFD f23, 35 * SIZE(BO) + + FMA1 f8, f30, f24, f8 + FMA4 f11, f31, f24, f11 + FMA2 f9, f30, f25, f9 + FMA3 f10, f31, f25, f10 + + FMA1 f12, f30, f26, f12 + FMA4 f15, f31, f26, f15 + FMA2 f13, f30, f27, f13 + FMA3 f14, f31, f27, f14 + + LFD f24, 36 * SIZE(BO) + LFD f25, 37 * SIZE(BO) + LFD f26, 38 * SIZE(BO) + LFD f27, 39 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 32 * SIZE + bdnz LL(22) + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(27) + .align 4 + +LL(26): + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + FMA1 f4, f16, f22, f4 + FMA4 f7, f17, f22, f7 + FMA2 f5, f16, f23, f5 + FMA3 f6, f17, f23, f6 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA1 f8, f16, f24, f8 + FMA4 f11, f17, f24, f11 + FMA2 f9, f16, f25, f9 + FMA3 f10, f17, f25, f10 + + FMA1 f12, f16, f26, f12 + FMA4 f15, f17, f26, f15 + FMA2 f13, f16, f27, f13 + FMA3 f14, f17, f27, f14 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(26) + .align 4 + +LL(27): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 2 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + FADD f0, f0, f2 + FADD f1, f1, f3 + FADD f4, f4, f6 + FADD f5, f5, f7 + + FADD f8, f8, f10 + FADD f9, f9, f11 + FADD f12, f12, f14 + FADD f13, f13, f15 + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f18, f4 + FSUB f5, f19, f5 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f12, f22, f12 + FSUB f13, f23, f13 + +#else + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f28, 0 * SIZE(AO) + LFD f29, 1 * SIZE(AO) + + FMUL f16, f29, f1 + FMUL f17, f29, f0 + FMUL f18, f29, f5 + FMUL f19, f29, f4 + + FMUL f20, f29, f9 + FMUL f21, f29, f8 + FMUL f22, f29, f13 + FMUL f23, f29, f12 + +#ifndef CONJ + FMSUB f0, f28, f0, f16 + FMADD f1, f28, f1, f17 + FMSUB f4, f28, f4, f18 + FMADD f5, f28, f5, f19 + + FMSUB f8, f28, f8, f20 + FMADD f9, f28, f9, f21 + FMSUB f12, f28, f12, f22 + FMADD f13, f28, f13, f23 +#else + + FMADD f0, f28, f0, f16 + FMSUB f1, f28, f1, f17 + FMADD f4, f28, f4, f18 + FMSUB f5, f28, f5, f19 + + FMADD f8, f28, f8, f20 + FMSUB f9, f28, f9, f21 + FMADD f12, f28, f12, f22 + FMSUB f13, f28, f13, f23 +#endif +#endif + +#ifdef LT + LFD f24, 0 * SIZE(AO) + LFD f25, 1 * SIZE(AO) + + FMUL f16, f25, f1 + FMUL f17, f25, f0 + FMUL f18, f25, f5 + FMUL f19, f25, f4 + + FMUL f20, f25, f9 + FMUL f21, f25, f8 + FMUL f22, f25, f13 + FMUL f23, f25, f12 + +#ifndef CONJ + + FMSUB f0, f24, f0, f16 + FMADD f1, f24, f1, f17 + FMSUB f4, f24, f4, f18 + FMADD f5, f24, f5, f19 + + FMSUB f8, f24, f8, f20 + FMADD f9, f24, f9, f21 + FMSUB f12, f24, f12, f22 + FMADD f13, f24, f13, f23 + +#else + + + FMADD f0, f24, f0, f16 + FMSUB f1, f24, f1, f17 + FMADD f4, f24, f4, f18 + FMSUB f5, f24, f5, f19 + + FMADD f8, f24, f8, f20 + FMSUB f9, f24, f9, f21 + FMADD f12, f24, f12, f22 + FMSUB f13, f24, f13, f23 + +#endif +#endif + +#ifdef RN + LFD f24, 0 * SIZE(BO) + LFD f25, 1 * SIZE(BO) + LFD f26, 2 * SIZE(BO) + LFD f27, 3 * SIZE(BO) + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) + + FMUL f16, f25, f1 + FMUL f17, f25, f0 + +#ifndef CONJ + + FMSUB f0, f24, f0, f16 + FMADD f1, f24, f1, f17 + + FMADD f4, f27, f1, f4 + FNMSUB f5, f27, f0, f5 + FNMSUB f4, f26, f0, f4 + FNMSUB f5, f26, f1, f5 + + FMADD f8, f29, f1, f8 + FNMSUB f9, f29, f0, f9 + FNMSUB f8, f28, f0, f8 + FNMSUB f9, f28, f1, f9 + + FMADD f12, f31, f1, f12 + FNMSUB f13, f31, f0, f13 + FNMSUB f12, f30, f0, f12 + FNMSUB f13, f30, f1, f13 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMSUB f4, f26, f4, f16 + FMADD f5, f26, f5, f17 + + FMADD f8, f29, f5, f8 + FNMSUB f9, f29, f4, f9 + FNMSUB f8, f28, f4, f8 + FNMSUB f9, f28, f5, f9 + + FMADD f12, f31, f5, f12 + FNMSUB f13, f31, f4, f13 + FNMSUB f12, f30, f4, f12 + FNMSUB f13, f30, f5, f13 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 22 * SIZE(BO) + LFD f29, 23 * SIZE(BO) + LFD f30, 30 * SIZE(BO) + LFD f31, 31 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMSUB f8, f26, f8, f16 + FMADD f9, f26, f9, f17 + + FMADD f12, f29, f9, f12 + FNMSUB f13, f29, f8, f13 + FNMSUB f12, f28, f8, f12 + FNMSUB f13, f28, f9, f13 + + FMUL f16, f31, f13 + FMUL f17, f31, f12 + FMSUB f12, f30, f12, f16 + FMADD f13, f30, f13, f17 + +#else + + FMADD f0, f24, f0, f16 + FMSUB f1, f24, f1, f17 + + FMSUB f4, f27, f1, f4 + FNMADD f5, f27, f0, f5 + FNMADD f4, f26, f0, f4 + FNMADD f5, f26, f1, f5 + + FMSUB f8, f29, f1, f8 + FNMADD f9, f29, f0, f9 + FNMADD f8, f28, f0, f8 + FNMADD f9, f28, f1, f9 + + FMSUB f12, f31, f1, f12 + FNMADD f13, f31, f0, f13 + FNMADD f12, f30, f0, f12 + FNMADD f13, f30, f1, f13 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMADD f4, f26, f4, f16 + FMSUB f5, f26, f5, f17 + + FMSUB f8, f29, f5, f8 + FNMADD f9, f29, f4, f9 + FNMADD f8, f28, f4, f8 + FNMADD f9, f28, f5, f9 + + FMSUB f12, f31, f5, f12 + FNMADD f13, f31, f4, f13 + FNMADD f12, f30, f4, f12 + FNMADD f13, f30, f5, f13 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 22 * SIZE(BO) + LFD f29, 23 * SIZE(BO) + LFD f30, 30 * SIZE(BO) + LFD f31, 31 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMADD f8, f26, f8, f16 + FMSUB f9, f26, f9, f17 + + FMSUB f12, f29, f9, f12 + FNMADD f13, f29, f8, f13 + FNMADD f12, f28, f8, f12 + FNMADD f13, f28, f9, f13 + + FMUL f16, f31, f13 + FMUL f17, f31, f12 + FMADD f12, f30, f12, f16 + FMSUB f13, f30, f13, f17 +#endif + +#endif + +#ifdef RT + LFD f24, 30 * SIZE(BO) + LFD f25, 31 * SIZE(BO) + LFD f26, 28 * SIZE(BO) + LFD f27, 29 * SIZE(BO) + LFD f28, 26 * SIZE(BO) + LFD f29, 27 * SIZE(BO) + LFD f30, 24 * SIZE(BO) + LFD f31, 25 * SIZE(BO) + + FMUL f16, f25, f13 + FMUL f17, f25, f12 + +#ifndef CONJ + + FMSUB f12, f24, f12, f16 + FMADD f13, f24, f13, f17 + + FMADD f8, f27, f13, f8 + FNMSUB f9, f27, f12, f9 + FNMSUB f8, f26, f12, f8 + FNMSUB f9, f26, f13, f9 + + FMADD f4, f29, f13, f4 + FNMSUB f5, f29, f12, f5 + FNMSUB f4, f28, f12, f4 + FNMSUB f5, f28, f13, f5 + + FMADD f0, f31, f13, f0 + FNMSUB f1, f31, f12, f1 + FNMSUB f0, f30, f12, f0 + FNMSUB f1, f30, f13, f1 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 18 * SIZE(BO) + LFD f29, 19 * SIZE(BO) + LFD f30, 16 * SIZE(BO) + LFD f31, 17 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMSUB f8, f26, f8, f16 + FMADD f9, f26, f9, f17 + + FMADD f4, f29, f9, f4 + FNMSUB f5, f29, f8, f5 + FNMSUB f4, f28, f8, f4 + FNMSUB f5, f28, f9, f5 + + FMADD f0, f31, f9, f0 + FNMSUB f1, f31, f8, f1 + FNMSUB f0, f30, f8, f0 + FNMSUB f1, f30, f9, f1 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 8 * SIZE(BO) + LFD f29, 9 * SIZE(BO) + LFD f30, 0 * SIZE(BO) + LFD f31, 1 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMSUB f4, f26, f4, f16 + FMADD f5, f26, f5, f17 + + FMADD f0, f29, f5, f0 + FNMSUB f1, f29, f4, f1 + FNMSUB f0, f28, f4, f0 + FNMSUB f1, f28, f5, f1 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMSUB f0, f30, f0, f16 + FMADD f1, f30, f1, f17 + +#else + FMADD f12, f24, f12, f16 + FMSUB f13, f24, f13, f17 + + FMSUB f8, f27, f13, f8 + FNMADD f9, f27, f12, f9 + FNMADD f8, f26, f12, f8 + FNMADD f9, f26, f13, f9 + + FMSUB f4, f29, f13, f4 + FNMADD f5, f29, f12, f5 + FNMADD f4, f28, f12, f4 + FNMADD f5, f28, f13, f5 + + FMSUB f0, f31, f13, f0 + FNMADD f1, f31, f12, f1 + FNMADD f0, f30, f12, f0 + FNMADD f1, f30, f13, f1 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 18 * SIZE(BO) + LFD f29, 19 * SIZE(BO) + LFD f30, 16 * SIZE(BO) + LFD f31, 17 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMADD f8, f26, f8, f16 + FMSUB f9, f26, f9, f17 + + FMSUB f4, f29, f9, f4 + FNMADD f5, f29, f8, f5 + FNMADD f4, f28, f8, f4 + FNMADD f5, f28, f9, f5 + + FMSUB f0, f31, f9, f0 + FNMADD f1, f31, f8, f1 + FNMADD f0, f30, f8, f0 + FNMADD f1, f30, f9, f1 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 8 * SIZE(BO) + LFD f29, 9 * SIZE(BO) + LFD f30, 0 * SIZE(BO) + LFD f31, 1 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMADD f4, f26, f4, f16 + FMSUB f5, f26, f5, f17 + + FMSUB f0, f29, f5, f0 + FNMADD f1, f29, f4, f1 + FNMADD f0, f28, f4, f0 + FNMADD f1, f28, f5, f1 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMADD f0, f30, f0, f16 + FMSUB f1, f30, f1, f17 + +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f4, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + STFD f8, 4 * SIZE(BO) + STFD f9, 5 * SIZE(BO) + STFD f12, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 2 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(29): +#ifdef LN + slwi r0, K, 2 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/ztrsm_kernel_ppc440_LN.S b/kernel/power/ztrsm_kernel_ppc440_LN.S new file mode 100644 index 0000000..fdcf5be --- /dev/null +++ b/kernel/power/ztrsm_kernel_ppc440_LN.S @@ -0,0 +1,2256 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define AORIG r21 +#define TEMP r22 +#define KK r23 +#define I r24 +#define J r25 +#define AO r26 +#define BO r27 +#define CO1 r28 +#define CO2 r29 + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define B1 f22 +#define B2 f23 +#define B3 f24 +#define B4 f25 +#define B5 f26 +#define B6 f27 +#define B7 f28 +#define B8 f29 +#define B9 f30 +#define B10 f31 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) +#endif + + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + srawi. J, N, 1 + ble .L30 + .align 4 + +.L10: +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + +.L20: + andi. I, M, 1 + ble .L09 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L25 + .align 4 + +.L22: + fmadd f0, f16, f20, f0 + LFD f19, 3 * SIZE(AO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFD f16, 4 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFD f20, 8 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 9 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 10 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + LFD f17, 5 * SIZE(AO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 6 * SIZE(AO) + + fmadd f4, f19, f24, f4 + LFD f24, 12 * SIZE(BO) + fmadd f5, f19, f25, f5 + LFD f25, 13 * SIZE(BO) + fmadd f6, f19, f26, f6 + LFD f26, 14 * SIZE(BO) + fmadd f7, f19, f27, f7 + LFD f27, 15 * SIZE(BO) + + fmadd f0, f16, f20, f0 + LFD f19, 7 * SIZE(AO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 8 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFDU f20, 16 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(BO) + + fmadd f0, f18, f24, f0 + LFD f17, 1 * SIZE(AO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 2 * SIZE(AO) + + fmadd f4, f19, f24, f4 + LFD f24, 4 * SIZE(BO) + fmadd f5, f19, f25, f5 + LFD f25, 5 * SIZE(BO) + fmadd f6, f19, f26, f6 + LFD f26, 6 * SIZE(BO) + fmadd f7, f19, f27, f7 + LFD f27, 7 * SIZE(BO) + bdnz .L22 + .align 4 + +.L25: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble .L27 + .align 4 + +.L26: + fmadd f0, f16, f20, f0 + LFD f17, 1 * SIZE(AO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 2 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFDU f20, 4 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(BO) + bdnz .L26 + .align 4 + +.L27: +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f20, f2 + FADD f3, f21, f3 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f3 + FMUL f13, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f12 + FMADD f3, f20, f3, f13 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f12 + FMSUB f3, f20, f3, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +.L09: + srawi. I, M, 1 + ble .L29 + .align 4 + +.L11: +#if defined(LT) || defined(RN) + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(B) + LFD B2, 1 * SIZE(B) + LFD B3, 2 * SIZE(B) + LFD B4, 3 * SIZE(B) + LFD B5, 4 * SIZE(B) + LFD B6, 8 * SIZE(B) + LFD B7, 12 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(BO) + LFD B2, 1 * SIZE(BO) + LFD B3, 2 * SIZE(BO) + LFD B4, 3 * SIZE(BO) + LFD B5, 4 * SIZE(BO) + LFD B6, 8 * SIZE(BO) + LFD B7, 12 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L15 + .align 4 + +.L12: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + LFD A6, 12 * SIZE(AO) + FMADD f8, A1, B3, f8 + nop + FMADD f12, A1, B4, f12 + nop + + FMADD f1, A2, B1, f1 + LFD A1, 3 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B1, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 5 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 6 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10, 7 * SIZE(BO) + + FMADD f3, A1, B1, f3 + LFD A2, 5 * SIZE(AO) + FMADD f7, A1, B2, f7 + LFD B1, 16 * SIZE(BO) + FMADD f11, A1, B3, f11 + nop + FMADD f15, A1, B4, f15 + nop + + FMADD f0, A4, B5, f0 + LFD A3, 6 * SIZE(AO) + FMADD f4, A4, B8, f4 + LFD A1, 16 * SIZE(AO) + FMADD f8, A4, B9, f8 + nop + FMADD f12, A4, B10, f12 + nop + + FMADD f1, A2, B5, f1 + LFD A4, 7 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B5, f2 + nop + FMADD f6, A3, B8, f6 + LFD B2, 9 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 10 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 11 * SIZE(BO) + + FMADD f3, A4, B5, f3 + LFD A2, 9 * SIZE(AO) + FMADD f7, A4, B8, f7 + LFD B5, 20 * SIZE(BO) + FMADD f11, A4, B9, f11 + nop + FMADD f15, A4, B10, f15 + nop + + FMADD f0, A5, B6, f0 + LFD A3, 10 * SIZE(AO) + FMADD f4, A5, B2, f4 + LFD A4, 20 * SIZE(AO) + FMADD f8, A5, B3, f8 + nop + FMADD f12, A5, B4, f12 + nop + + FMADD f1, A2, B6, f1 + LFD A5, 11 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B6, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 13 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 14 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10,15 * SIZE(BO) + + FMADD f3, A5, B6, f3 + LFD A2, 13 * SIZE(AO) + FMADD f7, A5, B2, f7 + LFD B6, 24 * SIZE(BO) + FMADD f11, A5, B3, f11 + nop + FMADD f15, A5, B4, f15 + nop + + FMADD f0, A6, B7, f0 + LFD A3, 14 * SIZE(AO) + FMADD f4, A6, B8, f4 + LFD A5, 24 * SIZE(AO) + FMADD f8, A6, B9, f8 + nop + FMADD f12, A6, B10, f12 + nop + + FMADD f1, A2, B7, f1 + LFD A6, 15 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B7, f2 + addi AO, AO, 16 * SIZE + FMADD f6, A3, B8, f6 + LFD B2, 17 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 18 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 19 * SIZE(BO) + + FMADD f3, A6, B7, f3 + LFD A2, 1 * SIZE(AO) + FMADD f7, A6, B8, f7 + LFD B7, 28 * SIZE(BO) + FMADD f11, A6, B9, f11 + addi BO, BO, 16 * SIZE + FMADD f15, A6, B10, f15 + bdnz .L12 + .align 4 + +.L15: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble .LKERNEL_MainFinish + .align 4 + +.L16: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + FMADD f8, A1, B3, f8 + FMADD f12, A1, B4, f12 + LFD A4, 3 * SIZE(AO) + + FMADD f1, A2, B1, f1 + FMADD f5, A2, B2, f5 + FMADD f9, A2, B3, f9 + FMADD f13, A2, B4, f13 + LFDU A1, 4 * SIZE(AO) + + FMADD f2, A3, B1, f2 + FMADD f6, A3, B2, f6 + FMADD f10, A3, B3, f10 + FMADD f14, A3, B4, f14 + LFD A2, 1 * SIZE(AO) + + FMADD f3, A4, B1, f3 + LFDU B1, 4 * SIZE(BO) + FMADD f7, A4, B2, f7 + LFD B2, 1 * SIZE(BO) + FMADD f11, A4, B3, f11 + LFD B3, 2 * SIZE(BO) + FMADD f15, A4, B4, f15 + LFD B4, 3 * SIZE(BO) + bdnz .L16 + .align 4 + +.LKERNEL_MainFinish: +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 2 + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f8, f18, f8 + FSUB f9, f19, f9 + + FSUB f2, f20, f2 + FSUB f3, f21, f3 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 + + FSUB f8, f20, f8 + FADD f9, f21, f9 + FSUB f10, f22, f10 + FADD f11, f23, f11 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FMADD f8, f19, f11, f8 + FNMSUB f9, f19, f10, f9 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + FNMSUB f8, f18, f10, f8 + FNMSUB f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f8, f20, f8, f12 + FMADD f9, f20, f9, f13 + +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FMSUB f8, f19, f11, f8 + FNMADD f9, f19, f10, f9 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + FNMADD f8, f18, f10, f8 + FNMADD f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f8, f20, f8, f12 + FMSUB f9, f20, f9, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FMADD f10, f19, f9, f10 + FNMSUB f11, f19, f8, f11 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + FNMSUB f10, f18, f8, f10 + FNMSUB f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 + FMSUB f10, f20, f10, f12 + FMADD f11, f20, f11, f13 + +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FMSUB f10, f19, f9, f10 + FNMADD f11, f19, f8, f11 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + FNMADD f10, f18, f8, f10 + FNMADD f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 + FMADD f10, f20, f10, f12 + FMSUB f11, f20, f11, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f8, f19, f1, f8 + FNMSUB f9, f19, f0, f9 + FMADD f10, f19, f3, f10 + FNMSUB f11, f19, f2, f11 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMSUB f8, f20, f8, f4 + FMADD f9, f20, f9, f5 + FMSUB f10, f20, f10, f6 + FMADD f11, f20, f11, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f8, f19, f1, f8 + FNMADD f9, f19, f0, f9 + FMSUB f10, f19, f3, f10 + FNMADD f11, f19, f2, f11 + + FNMADD f8, f18, f0, f8 + FNMADD f9, f18, f1, f9 + FNMADD f10, f18, f2, f10 + FNMADD f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMADD f8, f20, f8, f4 + FMSUB f9, f20, f9, f5 + FMADD f10, f20, f10, f6 + FMSUB f11, f20, f11, f7 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f9, f0 + FNMSUB f1, f19, f8, f1 + FMADD f2, f19, f11, f2 + FNMSUB f3, f19, f10, f3 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f9, f0 + FNMADD f1, f19, f8, f1 + FMSUB f2, f19, f11, f2 + FNMADD f3, f19, f10, f3 + + FNMADD f0, f18, f8, f0 + FNMADD f1, f18, f9, f1 + FNMADD f2, f18, f10, f2 + FNMADD f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f9, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f3, 5 * SIZE(BO) + STFD f10, 6 * SIZE(BO) + STFD f11, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f10, 6 * SIZE(AO) + STFD f11, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f8, 0 * SIZE(CO2) + STFD f9, 1 * SIZE(CO2) + STFD f10, 2 * SIZE(CO2) + STFD f11, 3 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt .L11 + .align 4 + +.L29: +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + + addic. J, J, -1 + bgt .L10 + .align 4 + +.L30: + andi. J, N, 1 + ble .L999 + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, C, LDC +#endif + + andi. I, M, 1 + ble .L40 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L45 + .align 4 + +.L42: + FMADD f0, f16, f20, f0 + LFD f23, 3 * SIZE(BO) + FMADD f1, f17, f20, f1 + nop + FMADD f2, f18, f20, f2 + nop + FMADD f3, f19, f20, f3 + LFD f20, 4 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFD f16, 4 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 5 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 6 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFD f21, 5 * SIZE(BO) + FMADD f1, f17, f22, f1 + nop + FMADD f2, f18, f22, f2 + nop + FMADD f3, f19, f22, f3 + LFD f22, 6 * SIZE(BO) + + FMADD f4, f16, f23, f4 + LFD f16, 8 * SIZE(AO) + FMADD f5, f17, f23, f5 + LFD f17, 9 * SIZE(AO) + FMADD f6, f18, f23, f6 + LFD f18, 10 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f20, f0 + LFD f23, 7 * SIZE(BO) + FMADD f1, f17, f20, f1 + nop + FMADD f2, f18, f20, f2 + nop + FMADD f3, f19, f20, f3 + LFDU f20, 8 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFD f16, 12 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 13 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 14 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFD f21, 1 * SIZE(BO) + FMADD f1, f17, f22, f1 + nop + FMADD f2, f18, f22, f2 + nop + FMADD f3, f19, f22, f3 + LFD f22, 2 * SIZE(BO) + + FMADD f4, f16, f23, f4 + LFDU f16, 16 * SIZE(AO) + FMADD f5, f17, f23, f5 + LFD f17, 1 * SIZE(AO) + FMADD f6, f18, f23, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 3 * SIZE(AO) + bdnz .L42 + .align 4 + +.L45: + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR,r0 + ble .L47 + .align 4 + +.L46: + FMADD f0, f16, f20, f0 + LFD f21, 1 * SIZE(BO) + FMADD f1, f17, f20, f1 + nop + FMADD f2, f18, f20, f2 + nop + FMADD f3, f19, f20, f3 + LFDU f20, 2 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFDU f16, 4 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 1 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 3 * SIZE(AO) + bdnz .L46 + .align 4 + +.L47: +#ifndef CONJ + FSUB f0, f0, f1 + FADD f1, f2, f3 +#else + FADD f0, f0, f1 + FSUB f1, f3, f2 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 1 + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +.L40: + srawi. I, M, 1 + ble .L49 + .align 4 + +.L31: +#if defined(LT) || defined(RN) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L35 + .align 4 + +.L32: + fmadd f0, f16, f20, f0 + LFD f19, 3 * SIZE(BO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFD f16, 4 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFD f20, 8 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 9 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 10 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 11 * SIZE(AO) + + fmadd f0, f18, f24, f0 + LFD f17, 5 * SIZE(BO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 6 * SIZE(BO) + + fmadd f4, f19, f24, f4 + LFD f24, 12 * SIZE(AO) + fmadd f5, f19, f25, f5 + LFD f25, 13 * SIZE(AO) + fmadd f6, f19, f26, f6 + LFD f26, 14 * SIZE(AO) + fmadd f7, f19, f27, f7 + LFD f27, 15 * SIZE(AO) + + fmadd f0, f16, f20, f0 + LFD f19, 7 * SIZE(BO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 8 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFDU f20, 16 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(AO) + + fmadd f0, f18, f24, f0 + LFD f17, 1 * SIZE(BO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 2 * SIZE(BO) + + fmadd f4, f19, f24, f4 + LFD f24, 4 * SIZE(AO) + fmadd f5, f19, f25, f5 + LFD f25, 5 * SIZE(AO) + fmadd f6, f19, f26, f6 + LFD f26, 6 * SIZE(AO) + fmadd f7, f19, f27, f7 + LFD f27, 7 * SIZE(AO) + bdnz .L32 + .align 4 + +.L35: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble .L37 + .align 4 + +.L36: + fmadd f0, f16, f20, f0 + LFD f17, 1 * SIZE(BO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 2 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFDU f20, 4 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(AO) + bdnz .L36 + .align 4 + +.L37: +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt .L31 + .align 4 + +.L49: +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE diff --git a/kernel/power/ztrsm_kernel_ppc440_LT.S b/kernel/power/ztrsm_kernel_ppc440_LT.S new file mode 100644 index 0000000..a9c98dd --- /dev/null +++ b/kernel/power/ztrsm_kernel_ppc440_LT.S @@ -0,0 +1,2208 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define AORIG r21 +#define TEMP r22 +#define KK r23 +#define I r24 +#define J r25 +#define AO r26 +#define BO r27 +#define CO1 r28 +#define CO2 r29 + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define B1 f22 +#define B2 f23 +#define B3 f24 +#define B4 f25 +#define B5 f26 +#define B6 f27 +#define B7 f28 +#define B8 f29 +#define B9 f30 +#define B10 f31 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) +#endif + + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + srawi. J, N, 1 + ble .L30 + .align 4 + +.L10: +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble .L20 + .align 4 + +.L11: +#if defined(LT) || defined(RN) + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(B) + LFD B2, 1 * SIZE(B) + LFD B3, 2 * SIZE(B) + LFD B4, 3 * SIZE(B) + LFD B5, 4 * SIZE(B) + LFD B6, 8 * SIZE(B) + LFD B7, 12 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(BO) + LFD B2, 1 * SIZE(BO) + LFD B3, 2 * SIZE(BO) + LFD B4, 3 * SIZE(BO) + LFD B5, 4 * SIZE(BO) + LFD B6, 8 * SIZE(BO) + LFD B7, 12 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L15 + .align 4 + +.L12: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + LFD A6, 12 * SIZE(AO) + FMADD f8, A1, B3, f8 + nop + FMADD f12, A1, B4, f12 + nop + + FMADD f1, A2, B1, f1 + LFD A1, 3 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B1, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 5 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 6 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10, 7 * SIZE(BO) + + FMADD f3, A1, B1, f3 + LFD A2, 5 * SIZE(AO) + FMADD f7, A1, B2, f7 + LFD B1, 16 * SIZE(BO) + FMADD f11, A1, B3, f11 + nop + FMADD f15, A1, B4, f15 + nop + + FMADD f0, A4, B5, f0 + LFD A3, 6 * SIZE(AO) + FMADD f4, A4, B8, f4 + LFD A1, 16 * SIZE(AO) + FMADD f8, A4, B9, f8 + nop + FMADD f12, A4, B10, f12 + nop + + FMADD f1, A2, B5, f1 + LFD A4, 7 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B5, f2 + nop + FMADD f6, A3, B8, f6 + LFD B2, 9 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 10 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 11 * SIZE(BO) + + FMADD f3, A4, B5, f3 + LFD A2, 9 * SIZE(AO) + FMADD f7, A4, B8, f7 + LFD B5, 20 * SIZE(BO) + FMADD f11, A4, B9, f11 + nop + FMADD f15, A4, B10, f15 + nop + + FMADD f0, A5, B6, f0 + LFD A3, 10 * SIZE(AO) + FMADD f4, A5, B2, f4 + LFD A4, 20 * SIZE(AO) + FMADD f8, A5, B3, f8 + nop + FMADD f12, A5, B4, f12 + nop + + FMADD f1, A2, B6, f1 + LFD A5, 11 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B6, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 13 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 14 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10,15 * SIZE(BO) + + FMADD f3, A5, B6, f3 + LFD A2, 13 * SIZE(AO) + FMADD f7, A5, B2, f7 + LFD B6, 24 * SIZE(BO) + FMADD f11, A5, B3, f11 + nop + FMADD f15, A5, B4, f15 + nop + + FMADD f0, A6, B7, f0 + LFD A3, 14 * SIZE(AO) + FMADD f4, A6, B8, f4 + LFD A5, 24 * SIZE(AO) + FMADD f8, A6, B9, f8 + nop + FMADD f12, A6, B10, f12 + nop + + FMADD f1, A2, B7, f1 + LFD A6, 15 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B7, f2 + addi AO, AO, 16 * SIZE + FMADD f6, A3, B8, f6 + LFD B2, 17 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 18 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 19 * SIZE(BO) + + FMADD f3, A6, B7, f3 + LFD A2, 1 * SIZE(AO) + FMADD f7, A6, B8, f7 + LFD B7, 28 * SIZE(BO) + FMADD f11, A6, B9, f11 + addi BO, BO, 16 * SIZE + FMADD f15, A6, B10, f15 + bdnz .L12 + .align 4 + +.L15: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble .LKERNEL_MainFinish + .align 4 + +.L16: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + FMADD f8, A1, B3, f8 + FMADD f12, A1, B4, f12 + LFD A4, 3 * SIZE(AO) + + FMADD f1, A2, B1, f1 + FMADD f5, A2, B2, f5 + FMADD f9, A2, B3, f9 + FMADD f13, A2, B4, f13 + LFDU A1, 4 * SIZE(AO) + + FMADD f2, A3, B1, f2 + FMADD f6, A3, B2, f6 + FMADD f10, A3, B3, f10 + FMADD f14, A3, B4, f14 + LFD A2, 1 * SIZE(AO) + + FMADD f3, A4, B1, f3 + LFDU B1, 4 * SIZE(BO) + FMADD f7, A4, B2, f7 + LFD B2, 1 * SIZE(BO) + FMADD f11, A4, B3, f11 + LFD B3, 2 * SIZE(BO) + FMADD f15, A4, B4, f15 + LFD B4, 3 * SIZE(BO) + bdnz .L16 + .align 4 + +.LKERNEL_MainFinish: +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 2 + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f8, f18, f8 + FSUB f9, f19, f9 + + FSUB f2, f20, f2 + FSUB f3, f21, f3 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 + + FSUB f8, f20, f8 + FADD f9, f21, f9 + FSUB f10, f22, f10 + FADD f11, f23, f11 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FMADD f8, f19, f11, f8 + FNMSUB f9, f19, f10, f9 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + FNMSUB f8, f18, f10, f8 + FNMSUB f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f8, f20, f8, f12 + FMADD f9, f20, f9, f13 + +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FMSUB f8, f19, f11, f8 + FNMADD f9, f19, f10, f9 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + FNMADD f8, f18, f10, f8 + FNMADD f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f8, f20, f8, f12 + FMSUB f9, f20, f9, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FMADD f10, f19, f9, f10 + FNMSUB f11, f19, f8, f11 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + FNMSUB f10, f18, f8, f10 + FNMSUB f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 + FMSUB f10, f20, f10, f12 + FMADD f11, f20, f11, f13 + +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FMSUB f10, f19, f9, f10 + FNMADD f11, f19, f8, f11 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + FNMADD f10, f18, f8, f10 + FNMADD f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 + FMADD f10, f20, f10, f12 + FMSUB f11, f20, f11, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f8, f19, f1, f8 + FNMSUB f9, f19, f0, f9 + FMADD f10, f19, f3, f10 + FNMSUB f11, f19, f2, f11 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMSUB f8, f20, f8, f4 + FMADD f9, f20, f9, f5 + FMSUB f10, f20, f10, f6 + FMADD f11, f20, f11, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f8, f19, f1, f8 + FNMADD f9, f19, f0, f9 + FMSUB f10, f19, f3, f10 + FNMADD f11, f19, f2, f11 + + FNMADD f8, f18, f0, f8 + FNMADD f9, f18, f1, f9 + FNMADD f10, f18, f2, f10 + FNMADD f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMADD f8, f20, f8, f4 + FMSUB f9, f20, f9, f5 + FMADD f10, f20, f10, f6 + FMSUB f11, f20, f11, f7 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f9, f0 + FNMSUB f1, f19, f8, f1 + FMADD f2, f19, f11, f2 + FNMSUB f3, f19, f10, f3 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f9, f0 + FNMADD f1, f19, f8, f1 + FMSUB f2, f19, f11, f2 + FNMADD f3, f19, f10, f3 + + FNMADD f0, f18, f8, f0 + FNMADD f1, f18, f9, f1 + FNMADD f2, f18, f10, f2 + FNMADD f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f9, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f3, 5 * SIZE(BO) + STFD f10, 6 * SIZE(BO) + STFD f11, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f10, 6 * SIZE(AO) + STFD f11, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f8, 0 * SIZE(CO2) + STFD f9, 1 * SIZE(CO2) + STFD f10, 2 * SIZE(CO2) + STFD f11, 3 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt .L11 + .align 4 + +.L20: + andi. I, M, 1 + ble .L29 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L25 + .align 4 + +.L22: + fmadd f0, f16, f20, f0 + LFD f19, 3 * SIZE(AO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFD f16, 4 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFD f20, 8 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 9 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 10 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + LFD f17, 5 * SIZE(AO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 6 * SIZE(AO) + + fmadd f4, f19, f24, f4 + LFD f24, 12 * SIZE(BO) + fmadd f5, f19, f25, f5 + LFD f25, 13 * SIZE(BO) + fmadd f6, f19, f26, f6 + LFD f26, 14 * SIZE(BO) + fmadd f7, f19, f27, f7 + LFD f27, 15 * SIZE(BO) + + fmadd f0, f16, f20, f0 + LFD f19, 7 * SIZE(AO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 8 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFDU f20, 16 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(BO) + + fmadd f0, f18, f24, f0 + LFD f17, 1 * SIZE(AO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 2 * SIZE(AO) + + fmadd f4, f19, f24, f4 + LFD f24, 4 * SIZE(BO) + fmadd f5, f19, f25, f5 + LFD f25, 5 * SIZE(BO) + fmadd f6, f19, f26, f6 + LFD f26, 6 * SIZE(BO) + fmadd f7, f19, f27, f7 + LFD f27, 7 * SIZE(BO) + bdnz .L22 + .align 4 + +.L25: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble .L27 + .align 4 + +.L26: + fmadd f0, f16, f20, f0 + LFD f17, 1 * SIZE(AO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 2 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFDU f20, 4 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(BO) + bdnz .L26 + .align 4 + +.L27: +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else +#if defined(LN) || defined(LT) + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f3 + FMUL f13, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f12 + FMADD f3, f20, f3, f13 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f12 + FMSUB f3, f20, f3, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +.L29: +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + + addic. J, J, -1 + bgt .L10 + .align 4 + +.L30: + andi. J, N, 1 + ble .L999 + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, C, LDC +#endif + ble .L40 + .align 4 + +.L31: +#if defined(LT) || defined(RN) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L35 + .align 4 + +.L32: + fmadd f0, f16, f20, f0 + LFD f19, 3 * SIZE(BO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFD f16, 4 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFD f20, 8 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 9 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 10 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 11 * SIZE(AO) + + fmadd f0, f18, f24, f0 + LFD f17, 5 * SIZE(BO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 6 * SIZE(BO) + + fmadd f4, f19, f24, f4 + LFD f24, 12 * SIZE(AO) + fmadd f5, f19, f25, f5 + LFD f25, 13 * SIZE(AO) + fmadd f6, f19, f26, f6 + LFD f26, 14 * SIZE(AO) + fmadd f7, f19, f27, f7 + LFD f27, 15 * SIZE(AO) + + fmadd f0, f16, f20, f0 + LFD f19, 7 * SIZE(BO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 8 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFDU f20, 16 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(AO) + + fmadd f0, f18, f24, f0 + LFD f17, 1 * SIZE(BO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 2 * SIZE(BO) + + fmadd f4, f19, f24, f4 + LFD f24, 4 * SIZE(AO) + fmadd f5, f19, f25, f5 + LFD f25, 5 * SIZE(AO) + fmadd f6, f19, f26, f6 + LFD f26, 6 * SIZE(AO) + fmadd f7, f19, f27, f7 + LFD f27, 7 * SIZE(AO) + bdnz .L32 + .align 4 + +.L35: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble .L37 + .align 4 + +.L36: + fmadd f0, f16, f20, f0 + LFD f17, 1 * SIZE(BO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 2 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFDU f20, 4 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(AO) + bdnz .L36 + .align 4 + +.L37: +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt .L31 + .align 4 + +.L40: + andi. I, M, 1 + ble .L49 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L45 + .align 4 + +.L42: + fmadd f0, f16, f20, f0 + LFD f23, 3 * SIZE(BO) + fmadd f3, f16, f21, f3 + LFD f16, 4 * SIZE(AO) + fmadd f2, f17, f20, f2 + LFD f20, 4 * SIZE(BO) + fmadd f1, f17, f21, f1 + LFD f17, 5 * SIZE(AO) + + fmadd f4, f18, f22, f4 + LFD f21, 5 * SIZE(BO) + fmadd f7, f18, f23, f7 + LFD f18, 6 * SIZE(AO) + fmadd f6, f19, f22, f6 + LFD f22, 6 * SIZE(BO) + fmadd f5, f19, f23, f5 + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f20, f0 + LFD f23, 7 * SIZE(BO) + fmadd f3, f16, f21, f3 + LFDU f16, 8 * SIZE(AO) + fmadd f2, f17, f20, f2 + LFDU f20, 8 * SIZE(BO) + fmadd f1, f17, f21, f1 + LFD f17, 1 * SIZE(AO) + + fmadd f4, f18, f22, f4 + LFD f21, 1 * SIZE(BO) + fmadd f7, f18, f23, f7 + LFD f18, 2 * SIZE(AO) + fmadd f6, f19, f22, f6 + LFD f22, 2 * SIZE(BO) + fmadd f5, f19, f23, f5 + LFD f19, 3 * SIZE(AO) + bdnz .L42 + .align 4 + +.L45: + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR,r0 + ble .L47 + .align 4 + +.L46: + fmadd f0, f16, f20, f0 + LFD f21, 1 * SIZE(BO) + fmadd f3, f16, f21, f3 + LFDU f16, 2 * SIZE(AO) + fmadd f2, f17, f20, f2 + LFDU f20, 2 * SIZE(BO) + fmadd f1, f17, f21, f1 + LFD f17, 1 * SIZE(AO) + bdnz .L46 + .align 4 + +.L47: +#ifndef CONJ + FSUB f0, f0, f1 + FADD f1, f2, f3 +#else + FADD f0, f0, f1 + FSUB f1, f3, f2 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 1 + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +.L49: +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE diff --git a/kernel/power/ztrsm_kernel_ppc440_RT.S b/kernel/power/ztrsm_kernel_ppc440_RT.S new file mode 100644 index 0000000..c9b794e --- /dev/null +++ b/kernel/power/ztrsm_kernel_ppc440_RT.S @@ -0,0 +1,2209 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define AORIG r21 +#define TEMP r22 +#define KK r23 +#define I r24 +#define J r25 +#define AO r26 +#define BO r27 +#define CO1 r28 +#define CO2 r29 + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define B1 f22 +#define B2 f23 +#define B3 f24 +#define B4 f25 +#define B5 f26 +#define B6 f27 +#define B7 f28 +#define B8 f29 +#define B9 f30 +#define B10 f31 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) +#endif + + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + andi. J, N, 1 + ble .L30 + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, C, LDC +#endif + ble .L40 + .align 4 + +.L31: +#if defined(LT) || defined(RN) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L35 + .align 4 + +.L32: + fmadd f0, f16, f20, f0 + LFD f19, 3 * SIZE(BO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFD f16, 4 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFD f20, 8 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 9 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 10 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 11 * SIZE(AO) + + fmadd f0, f18, f24, f0 + LFD f17, 5 * SIZE(BO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 6 * SIZE(BO) + + fmadd f4, f19, f24, f4 + LFD f24, 12 * SIZE(AO) + fmadd f5, f19, f25, f5 + LFD f25, 13 * SIZE(AO) + fmadd f6, f19, f26, f6 + LFD f26, 14 * SIZE(AO) + fmadd f7, f19, f27, f7 + LFD f27, 15 * SIZE(AO) + + fmadd f0, f16, f20, f0 + LFD f19, 7 * SIZE(BO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 8 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFDU f20, 16 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(AO) + + fmadd f0, f18, f24, f0 + LFD f17, 1 * SIZE(BO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 2 * SIZE(BO) + + fmadd f4, f19, f24, f4 + LFD f24, 4 * SIZE(AO) + fmadd f5, f19, f25, f5 + LFD f25, 5 * SIZE(AO) + fmadd f6, f19, f26, f6 + LFD f26, 6 * SIZE(AO) + fmadd f7, f19, f27, f7 + LFD f27, 7 * SIZE(AO) + bdnz .L32 + .align 4 + +.L35: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble .L37 + .align 4 + +.L36: + fmadd f0, f16, f20, f0 + LFD f17, 1 * SIZE(BO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 2 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFDU f20, 4 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(AO) + bdnz .L36 + .align 4 + +.L37: +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt .L31 + .align 4 + +.L40: + andi. I, M, 1 + ble .L49 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L45 + .align 4 + +.L42: + fmadd f0, f16, f20, f0 + LFD f23, 3 * SIZE(BO) + fmadd f3, f16, f21, f3 + LFD f16, 4 * SIZE(AO) + fmadd f2, f17, f20, f2 + LFD f20, 4 * SIZE(BO) + fmadd f1, f17, f21, f1 + LFD f17, 5 * SIZE(AO) + + fmadd f4, f18, f22, f4 + LFD f21, 5 * SIZE(BO) + fmadd f7, f18, f23, f7 + LFD f18, 6 * SIZE(AO) + fmadd f6, f19, f22, f6 + LFD f22, 6 * SIZE(BO) + fmadd f5, f19, f23, f5 + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f20, f0 + LFD f23, 7 * SIZE(BO) + fmadd f3, f16, f21, f3 + LFDU f16, 8 * SIZE(AO) + fmadd f2, f17, f20, f2 + LFDU f20, 8 * SIZE(BO) + fmadd f1, f17, f21, f1 + LFD f17, 1 * SIZE(AO) + + fmadd f4, f18, f22, f4 + LFD f21, 1 * SIZE(BO) + fmadd f7, f18, f23, f7 + LFD f18, 2 * SIZE(AO) + fmadd f6, f19, f22, f6 + LFD f22, 2 * SIZE(BO) + fmadd f5, f19, f23, f5 + LFD f19, 3 * SIZE(AO) + bdnz .L42 + .align 4 + +.L45: + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR,r0 + ble .L47 + .align 4 + +.L46: + fmadd f0, f16, f20, f0 + LFD f21, 1 * SIZE(BO) + fmadd f3, f16, f21, f3 + LFDU f16, 2 * SIZE(AO) + fmadd f2, f17, f20, f2 + LFDU f20, 2 * SIZE(BO) + fmadd f1, f17, f21, f1 + LFD f17, 1 * SIZE(AO) + bdnz .L46 + .align 4 + +.L47: +#ifndef CONJ + FSUB f0, f0, f1 + FADD f1, f2, f3 +#else + FADD f0, f0, f1 + FSUB f1, f3, f2 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 1 + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +.L49: +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + + +.L30: + srawi. J, N, 1 + ble .L999 + .align 4 + +.L10: +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble .L20 + .align 4 + +.L11: +#if defined(LT) || defined(RN) + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(B) + LFD B2, 1 * SIZE(B) + LFD B3, 2 * SIZE(B) + LFD B4, 3 * SIZE(B) + LFD B5, 4 * SIZE(B) + LFD B6, 8 * SIZE(B) + LFD B7, 12 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(BO) + LFD B2, 1 * SIZE(BO) + LFD B3, 2 * SIZE(BO) + LFD B4, 3 * SIZE(BO) + LFD B5, 4 * SIZE(BO) + LFD B6, 8 * SIZE(BO) + LFD B7, 12 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L15 + .align 4 + +.L12: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + LFD A6, 12 * SIZE(AO) + FMADD f8, A1, B3, f8 + nop + FMADD f12, A1, B4, f12 + nop + + FMADD f1, A2, B1, f1 + LFD A1, 3 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B1, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 5 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 6 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10, 7 * SIZE(BO) + + FMADD f3, A1, B1, f3 + LFD A2, 5 * SIZE(AO) + FMADD f7, A1, B2, f7 + LFD B1, 16 * SIZE(BO) + FMADD f11, A1, B3, f11 + nop + FMADD f15, A1, B4, f15 + nop + + FMADD f0, A4, B5, f0 + LFD A3, 6 * SIZE(AO) + FMADD f4, A4, B8, f4 + LFD A1, 16 * SIZE(AO) + FMADD f8, A4, B9, f8 + nop + FMADD f12, A4, B10, f12 + nop + + FMADD f1, A2, B5, f1 + LFD A4, 7 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B5, f2 + nop + FMADD f6, A3, B8, f6 + LFD B2, 9 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 10 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 11 * SIZE(BO) + + FMADD f3, A4, B5, f3 + LFD A2, 9 * SIZE(AO) + FMADD f7, A4, B8, f7 + LFD B5, 20 * SIZE(BO) + FMADD f11, A4, B9, f11 + nop + FMADD f15, A4, B10, f15 + nop + + FMADD f0, A5, B6, f0 + LFD A3, 10 * SIZE(AO) + FMADD f4, A5, B2, f4 + LFD A4, 20 * SIZE(AO) + FMADD f8, A5, B3, f8 + nop + FMADD f12, A5, B4, f12 + nop + + FMADD f1, A2, B6, f1 + LFD A5, 11 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B6, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 13 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 14 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10,15 * SIZE(BO) + + FMADD f3, A5, B6, f3 + LFD A2, 13 * SIZE(AO) + FMADD f7, A5, B2, f7 + LFD B6, 24 * SIZE(BO) + FMADD f11, A5, B3, f11 + nop + FMADD f15, A5, B4, f15 + nop + + FMADD f0, A6, B7, f0 + LFD A3, 14 * SIZE(AO) + FMADD f4, A6, B8, f4 + LFD A5, 24 * SIZE(AO) + FMADD f8, A6, B9, f8 + nop + FMADD f12, A6, B10, f12 + nop + + FMADD f1, A2, B7, f1 + LFD A6, 15 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B7, f2 + addi AO, AO, 16 * SIZE + FMADD f6, A3, B8, f6 + LFD B2, 17 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 18 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 19 * SIZE(BO) + + FMADD f3, A6, B7, f3 + LFD A2, 1 * SIZE(AO) + FMADD f7, A6, B8, f7 + LFD B7, 28 * SIZE(BO) + FMADD f11, A6, B9, f11 + addi BO, BO, 16 * SIZE + FMADD f15, A6, B10, f15 + bdnz .L12 + .align 4 + +.L15: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble .LKERNEL_MainFinish + .align 4 + +.L16: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + FMADD f8, A1, B3, f8 + FMADD f12, A1, B4, f12 + LFD A4, 3 * SIZE(AO) + + FMADD f1, A2, B1, f1 + FMADD f5, A2, B2, f5 + FMADD f9, A2, B3, f9 + FMADD f13, A2, B4, f13 + LFDU A1, 4 * SIZE(AO) + + FMADD f2, A3, B1, f2 + FMADD f6, A3, B2, f6 + FMADD f10, A3, B3, f10 + FMADD f14, A3, B4, f14 + LFD A2, 1 * SIZE(AO) + + FMADD f3, A4, B1, f3 + LFDU B1, 4 * SIZE(BO) + FMADD f7, A4, B2, f7 + LFD B2, 1 * SIZE(BO) + FMADD f11, A4, B3, f11 + LFD B3, 2 * SIZE(BO) + FMADD f15, A4, B4, f15 + LFD B4, 3 * SIZE(BO) + bdnz .L16 + .align 4 + +.LKERNEL_MainFinish: +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 2 + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f8, f18, f8 + FSUB f9, f19, f9 + + FSUB f2, f20, f2 + FSUB f3, f21, f3 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 + + FSUB f8, f20, f8 + FADD f9, f21, f9 + FSUB f10, f22, f10 + FADD f11, f23, f11 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FMADD f8, f19, f11, f8 + FNMSUB f9, f19, f10, f9 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + FNMSUB f8, f18, f10, f8 + FNMSUB f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f8, f20, f8, f12 + FMADD f9, f20, f9, f13 + +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FMSUB f8, f19, f11, f8 + FNMADD f9, f19, f10, f9 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + FNMADD f8, f18, f10, f8 + FNMADD f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f8, f20, f8, f12 + FMSUB f9, f20, f9, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FMADD f10, f19, f9, f10 + FNMSUB f11, f19, f8, f11 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + FNMSUB f10, f18, f8, f10 + FNMSUB f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 + FMSUB f10, f20, f10, f12 + FMADD f11, f20, f11, f13 + +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FMSUB f10, f19, f9, f10 + FNMADD f11, f19, f8, f11 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + FNMADD f10, f18, f8, f10 + FNMADD f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 + FMADD f10, f20, f10, f12 + FMSUB f11, f20, f11, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f8, f19, f1, f8 + FNMSUB f9, f19, f0, f9 + FMADD f10, f19, f3, f10 + FNMSUB f11, f19, f2, f11 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMSUB f8, f20, f8, f4 + FMADD f9, f20, f9, f5 + FMSUB f10, f20, f10, f6 + FMADD f11, f20, f11, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f8, f19, f1, f8 + FNMADD f9, f19, f0, f9 + FMSUB f10, f19, f3, f10 + FNMADD f11, f19, f2, f11 + + FNMADD f8, f18, f0, f8 + FNMADD f9, f18, f1, f9 + FNMADD f10, f18, f2, f10 + FNMADD f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMADD f8, f20, f8, f4 + FMSUB f9, f20, f9, f5 + FMADD f10, f20, f10, f6 + FMSUB f11, f20, f11, f7 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f9, f0 + FNMSUB f1, f19, f8, f1 + FMADD f2, f19, f11, f2 + FNMSUB f3, f19, f10, f3 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f9, f0 + FNMADD f1, f19, f8, f1 + FMSUB f2, f19, f11, f2 + FNMADD f3, f19, f10, f3 + + FNMADD f0, f18, f8, f0 + FNMADD f1, f18, f9, f1 + FNMADD f2, f18, f10, f2 + FNMADD f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f9, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f3, 5 * SIZE(BO) + STFD f10, 6 * SIZE(BO) + STFD f11, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f10, 6 * SIZE(AO) + STFD f11, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f8, 0 * SIZE(CO2) + STFD f9, 1 * SIZE(CO2) + STFD f10, 2 * SIZE(CO2) + STFD f11, 3 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt .L11 + .align 4 + +.L20: + andi. I, M, 1 + ble .L29 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L25 + .align 4 + +.L22: + fmadd f0, f16, f20, f0 + LFD f19, 3 * SIZE(AO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFD f16, 4 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFD f20, 8 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 9 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 10 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + LFD f17, 5 * SIZE(AO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 6 * SIZE(AO) + + fmadd f4, f19, f24, f4 + LFD f24, 12 * SIZE(BO) + fmadd f5, f19, f25, f5 + LFD f25, 13 * SIZE(BO) + fmadd f6, f19, f26, f6 + LFD f26, 14 * SIZE(BO) + fmadd f7, f19, f27, f7 + LFD f27, 15 * SIZE(BO) + + fmadd f0, f16, f20, f0 + LFD f19, 7 * SIZE(AO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 8 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFDU f20, 16 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(BO) + + fmadd f0, f18, f24, f0 + LFD f17, 1 * SIZE(AO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 2 * SIZE(AO) + + fmadd f4, f19, f24, f4 + LFD f24, 4 * SIZE(BO) + fmadd f5, f19, f25, f5 + LFD f25, 5 * SIZE(BO) + fmadd f6, f19, f26, f6 + LFD f26, 6 * SIZE(BO) + fmadd f7, f19, f27, f7 + LFD f27, 7 * SIZE(BO) + bdnz .L22 + .align 4 + +.L25: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble .L27 + .align 4 + +.L26: + fmadd f0, f16, f20, f0 + LFD f17, 1 * SIZE(AO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 2 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFDU f20, 4 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(BO) + bdnz .L26 + .align 4 + +.L27: +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else +#if defined(LN) || defined(LT) + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f3 + FMUL f13, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f12 + FMADD f3, f20, f3, f13 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f12 + FMSUB f3, f20, f3, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +.L29: +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + + addic. J, J, -1 + bgt .L10 + .align 4 + + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c new file mode 100644 index 0000000..0ab57f3 --- /dev/null +++ b/kernel/setparam-ref.c @@ -0,0 +1,819 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +#ifdef BUILD_KERNEL +#include "kernelTS.h" +#endif + +#undef DEBUG + +static void init_parameter(void); + +gotoblas_t TABLE_NAME = { + GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN, + + 0, 0, 0, + SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N, MAX(SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N), +#ifdef HAVE_EXCLUSIVE_CACHE + 1, +#else + 0, +#endif + + samax_kTS, samin_kTS, smax_kTS, smin_kTS, + isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, + snrm2_kTS, sasum_kTS, scopy_kTS, sdot_kTS, + dsdot_kTS, + srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, + sgemv_nTS, sgemv_tTS, sger_kTS, + ssymv_LTS, ssymv_UTS, + + sgemm_kernelTS, sgemm_betaTS, +#if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N + sgemm_incopyTS, sgemm_itcopyTS, +#else + sgemm_oncopyTS, sgemm_otcopyTS, +#endif + sgemm_oncopyTS, sgemm_otcopyTS, + strsm_kernel_LNTS, strsm_kernel_LTTS, strsm_kernel_RNTS, strsm_kernel_RTTS, +#if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N + strsm_iunucopyTS, strsm_iunncopyTS, strsm_iutucopyTS, strsm_iutncopyTS, + strsm_ilnucopyTS, strsm_ilnncopyTS, strsm_iltucopyTS, strsm_iltncopyTS, +#else + strsm_ounucopyTS, strsm_ounncopyTS, strsm_outucopyTS, strsm_outncopyTS, + strsm_olnucopyTS, strsm_olnncopyTS, strsm_oltucopyTS, strsm_oltncopyTS, +#endif + strsm_ounucopyTS, strsm_ounncopyTS, strsm_outucopyTS, strsm_outncopyTS, + strsm_olnucopyTS, strsm_olnncopyTS, strsm_oltucopyTS, strsm_oltncopyTS, + strmm_kernel_RNTS, strmm_kernel_RTTS, strmm_kernel_LNTS, strmm_kernel_LTTS, +#if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N + strmm_iunucopyTS, strmm_iunncopyTS, strmm_iutucopyTS, strmm_iutncopyTS, + strmm_ilnucopyTS, strmm_ilnncopyTS, strmm_iltucopyTS, strmm_iltncopyTS, +#else + strmm_ounucopyTS, strmm_ounncopyTS, strmm_outucopyTS, strmm_outncopyTS, + strmm_olnucopyTS, strmm_olnncopyTS, strmm_oltucopyTS, strmm_oltncopyTS, +#endif + strmm_ounucopyTS, strmm_ounncopyTS, strmm_outucopyTS, strmm_outncopyTS, + strmm_olnucopyTS, strmm_olnncopyTS, strmm_oltucopyTS, strmm_oltncopyTS, +#if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N + ssymm_iutcopyTS, ssymm_iltcopyTS, +#else + ssymm_outcopyTS, ssymm_oltcopyTS, +#endif + ssymm_outcopyTS, ssymm_oltcopyTS, + + sneg_tcopyTS, slaswp_ncopyTS, + + 0, 0, 0, + DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N, MAX(DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N), + + damax_kTS, damin_kTS, dmax_kTS, dmin_kTS, + idamax_kTS, idamin_kTS, idmax_kTS, idmin_kTS, + dnrm2_kTS, dasum_kTS, dcopy_kTS, ddot_kTS, + drot_kTS, daxpy_kTS, dscal_kTS, dswap_kTS, + dgemv_nTS, dgemv_tTS, dger_kTS, + dsymv_LTS, dsymv_UTS, + + dgemm_kernelTS, dgemm_betaTS, +#if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N + dgemm_incopyTS, dgemm_itcopyTS, +#else + dgemm_oncopyTS, dgemm_otcopyTS, +#endif + dgemm_oncopyTS, dgemm_otcopyTS, + dtrsm_kernel_LNTS, dtrsm_kernel_LTTS, dtrsm_kernel_RNTS, dtrsm_kernel_RTTS, +#if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N + dtrsm_iunucopyTS, dtrsm_iunncopyTS, dtrsm_iutucopyTS, dtrsm_iutncopyTS, + dtrsm_ilnucopyTS, dtrsm_ilnncopyTS, dtrsm_iltucopyTS, dtrsm_iltncopyTS, +#else + dtrsm_ounucopyTS, dtrsm_ounncopyTS, dtrsm_outucopyTS, dtrsm_outncopyTS, + dtrsm_olnucopyTS, dtrsm_olnncopyTS, dtrsm_oltucopyTS, dtrsm_oltncopyTS, +#endif + dtrsm_ounucopyTS, dtrsm_ounncopyTS, dtrsm_outucopyTS, dtrsm_outncopyTS, + dtrsm_olnucopyTS, dtrsm_olnncopyTS, dtrsm_oltucopyTS, dtrsm_oltncopyTS, + dtrmm_kernel_RNTS, dtrmm_kernel_RTTS, dtrmm_kernel_LNTS, dtrmm_kernel_LTTS, +#if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N + dtrmm_iunucopyTS, dtrmm_iunncopyTS, dtrmm_iutucopyTS, dtrmm_iutncopyTS, + dtrmm_ilnucopyTS, dtrmm_ilnncopyTS, dtrmm_iltucopyTS, dtrmm_iltncopyTS, +#else + dtrmm_ounucopyTS, dtrmm_ounncopyTS, dtrmm_outucopyTS, dtrmm_outncopyTS, + dtrmm_olnucopyTS, dtrmm_olnncopyTS, dtrmm_oltucopyTS, dtrmm_oltncopyTS, +#endif + dtrmm_ounucopyTS, dtrmm_ounncopyTS, dtrmm_outucopyTS, dtrmm_outncopyTS, + dtrmm_olnucopyTS, dtrmm_olnncopyTS, dtrmm_oltucopyTS, dtrmm_oltncopyTS, +#if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N + dsymm_iutcopyTS, dsymm_iltcopyTS, +#else + dsymm_outcopyTS, dsymm_oltcopyTS, +#endif + dsymm_outcopyTS, dsymm_oltcopyTS, + + dneg_tcopyTS, dlaswp_ncopyTS, + +#ifdef EXPRECISION + + 0, 0, 0, + QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N, MAX(QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N), + + qamax_kTS, qamin_kTS, qmax_kTS, qmin_kTS, + iqamax_kTS, iqamin_kTS, iqmax_kTS, iqmin_kTS, + qnrm2_kTS, qasum_kTS, qcopy_kTS, qdot_kTS, + qrot_kTS, qaxpy_kTS, qscal_kTS, qswap_kTS, + qgemv_nTS, qgemv_tTS, qger_kTS, + qsymv_LTS, qsymv_UTS, + + qgemm_kernelTS, qgemm_betaTS, +#if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N + qgemm_incopyTS, qgemm_itcopyTS, +#else + qgemm_oncopyTS, qgemm_otcopyTS, +#endif + qgemm_oncopyTS, qgemm_otcopyTS, + qtrsm_kernel_LNTS, qtrsm_kernel_LTTS, qtrsm_kernel_RNTS, qtrsm_kernel_RTTS, +#if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N + qtrsm_iunucopyTS, qtrsm_iunncopyTS, qtrsm_iutucopyTS, qtrsm_iutncopyTS, + qtrsm_ilnucopyTS, qtrsm_ilnncopyTS, qtrsm_iltucopyTS, qtrsm_iltncopyTS, +#else + qtrsm_ounucopyTS, qtrsm_ounncopyTS, qtrsm_outucopyTS, qtrsm_outncopyTS, + qtrsm_olnucopyTS, qtrsm_olnncopyTS, qtrsm_oltucopyTS, qtrsm_oltncopyTS, +#endif + qtrsm_ounucopyTS, qtrsm_ounncopyTS, qtrsm_outucopyTS, qtrsm_outncopyTS, + qtrsm_olnucopyTS, qtrsm_olnncopyTS, qtrsm_oltucopyTS, qtrsm_oltncopyTS, + qtrmm_kernel_RNTS, qtrmm_kernel_RTTS, qtrmm_kernel_LNTS, qtrmm_kernel_LTTS, +#if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N + qtrmm_iunucopyTS, qtrmm_iunncopyTS, qtrmm_iutucopyTS, qtrmm_iutncopyTS, + qtrmm_ilnucopyTS, qtrmm_ilnncopyTS, qtrmm_iltucopyTS, qtrmm_iltncopyTS, +#else + qtrmm_ounucopyTS, qtrmm_ounncopyTS, qtrmm_outucopyTS, qtrmm_outncopyTS, + qtrmm_olnucopyTS, qtrmm_olnncopyTS, qtrmm_oltucopyTS, qtrmm_oltncopyTS, +#endif + qtrmm_ounucopyTS, qtrmm_ounncopyTS, qtrmm_outucopyTS, qtrmm_outncopyTS, + qtrmm_olnucopyTS, qtrmm_olnncopyTS, qtrmm_oltucopyTS, qtrmm_oltncopyTS, +#if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N + qsymm_iutcopyTS, qsymm_iltcopyTS, +#else + qsymm_outcopyTS, qsymm_oltcopyTS, +#endif + qsymm_outcopyTS, qsymm_oltcopyTS, + + qneg_tcopyTS, qlaswp_ncopyTS, + +#endif + + 0, 0, 0, + CGEMM_DEFAULT_UNROLL_M, CGEMM_DEFAULT_UNROLL_N, MAX(CGEMM_DEFAULT_UNROLL_M, CGEMM_DEFAULT_UNROLL_N), + + camax_kTS, camin_kTS, icamax_kTS, icamin_kTS, + cnrm2_kTS, casum_kTS, ccopy_kTS, + cdotu_kTS, cdotc_kTS, csrot_kTS, + caxpy_kTS, caxpyc_kTS, cscal_kTS, cswap_kTS, + + cgemv_nTS, cgemv_tTS, cgemv_rTS, cgemv_cTS, + cgemv_oTS, cgemv_uTS, cgemv_sTS, cgemv_dTS, + cgeru_kTS, cgerc_kTS, cgerv_kTS, cgerd_kTS, + csymv_LTS, csymv_UTS, + chemv_LTS, chemv_UTS, chemv_MTS, chemv_VTS, + + cgemm_kernel_nTS, cgemm_kernel_lTS, cgemm_kernel_rTS, cgemm_kernel_bTS, + cgemm_betaTS, + +#if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N + cgemm_incopyTS, cgemm_itcopyTS, +#else + cgemm_oncopyTS, cgemm_otcopyTS, +#endif + cgemm_oncopyTS, cgemm_otcopyTS, + + ctrsm_kernel_LNTS, ctrsm_kernel_LTTS, ctrsm_kernel_LRTS, ctrsm_kernel_LCTS, + ctrsm_kernel_RNTS, ctrsm_kernel_RTTS, ctrsm_kernel_RRTS, ctrsm_kernel_RCTS, + +#if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N + ctrsm_iunucopyTS, ctrsm_iunncopyTS, ctrsm_iutucopyTS, ctrsm_iutncopyTS, + ctrsm_ilnucopyTS, ctrsm_ilnncopyTS, ctrsm_iltucopyTS, ctrsm_iltncopyTS, +#else + ctrsm_ounucopyTS, ctrsm_ounncopyTS, ctrsm_outucopyTS, ctrsm_outncopyTS, + ctrsm_olnucopyTS, ctrsm_olnncopyTS, ctrsm_oltucopyTS, ctrsm_oltncopyTS, +#endif + ctrsm_ounucopyTS, ctrsm_ounncopyTS, ctrsm_outucopyTS, ctrsm_outncopyTS, + ctrsm_olnucopyTS, ctrsm_olnncopyTS, ctrsm_oltucopyTS, ctrsm_oltncopyTS, + + ctrmm_kernel_RNTS, ctrmm_kernel_RTTS, ctrmm_kernel_RRTS, ctrmm_kernel_RCTS, + ctrmm_kernel_LNTS, ctrmm_kernel_LTTS, ctrmm_kernel_LRTS, ctrmm_kernel_LCTS, + +#if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N + ctrmm_iunucopyTS, ctrmm_iunncopyTS, ctrmm_iutucopyTS, ctrmm_iutncopyTS, + ctrmm_ilnucopyTS, ctrmm_ilnncopyTS, ctrmm_iltucopyTS, ctrmm_iltncopyTS, +#else + ctrmm_ounucopyTS, ctrmm_ounncopyTS, ctrmm_outucopyTS, ctrmm_outncopyTS, + ctrmm_olnucopyTS, ctrmm_olnncopyTS, ctrmm_oltucopyTS, ctrmm_oltncopyTS, +#endif + ctrmm_ounucopyTS, ctrmm_ounncopyTS, ctrmm_outucopyTS, ctrmm_outncopyTS, + ctrmm_olnucopyTS, ctrmm_olnncopyTS, ctrmm_oltucopyTS, ctrmm_oltncopyTS, + +#if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N + csymm_iutcopyTS, csymm_iltcopyTS, +#else + csymm_outcopyTS, csymm_oltcopyTS, +#endif + csymm_outcopyTS, csymm_oltcopyTS, +#if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N + chemm_iutcopyTS, chemm_iltcopyTS, +#else + chemm_outcopyTS, chemm_oltcopyTS, +#endif + chemm_outcopyTS, chemm_oltcopyTS, + + cgemm3m_kernelTS, + + cgemm3m_incopybTS, cgemm3m_incopyrTS, + cgemm3m_incopyiTS, cgemm3m_itcopybTS, + cgemm3m_itcopyrTS, cgemm3m_itcopyiTS, + cgemm3m_oncopybTS, cgemm3m_oncopyrTS, + cgemm3m_oncopyiTS, cgemm3m_otcopybTS, + cgemm3m_otcopyrTS, cgemm3m_otcopyiTS, + + csymm3m_iucopybTS, csymm3m_ilcopybTS, + csymm3m_iucopyrTS, csymm3m_ilcopyrTS, + csymm3m_iucopyiTS, csymm3m_ilcopyiTS, + csymm3m_oucopybTS, csymm3m_olcopybTS, + csymm3m_oucopyrTS, csymm3m_olcopyrTS, + csymm3m_oucopyiTS, csymm3m_olcopyiTS, + + chemm3m_iucopybTS, chemm3m_ilcopybTS, + chemm3m_iucopyrTS, chemm3m_ilcopyrTS, + chemm3m_iucopyiTS, chemm3m_ilcopyiTS, + + chemm3m_oucopybTS, chemm3m_olcopybTS, + chemm3m_oucopyrTS, chemm3m_olcopyrTS, + chemm3m_oucopyiTS, chemm3m_olcopyiTS, + + cneg_tcopyTS, claswp_ncopyTS, + + 0, 0, 0, + ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N, MAX(ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N), + + zamax_kTS, zamin_kTS, izamax_kTS, izamin_kTS, + znrm2_kTS, zasum_kTS, zcopy_kTS, + zdotu_kTS, zdotc_kTS, zdrot_kTS, + zaxpy_kTS, zaxpyc_kTS, zscal_kTS, zswap_kTS, + + zgemv_nTS, zgemv_tTS, zgemv_rTS, zgemv_cTS, + zgemv_oTS, zgemv_uTS, zgemv_sTS, zgemv_dTS, + zgeru_kTS, zgerc_kTS, zgerv_kTS, zgerd_kTS, + zsymv_LTS, zsymv_UTS, + zhemv_LTS, zhemv_UTS, zhemv_MTS, zhemv_VTS, + + zgemm_kernel_nTS, zgemm_kernel_lTS, zgemm_kernel_rTS, zgemm_kernel_bTS, + zgemm_betaTS, + +#if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N + zgemm_incopyTS, zgemm_itcopyTS, +#else + zgemm_oncopyTS, zgemm_otcopyTS, +#endif + zgemm_oncopyTS, zgemm_otcopyTS, + + ztrsm_kernel_LNTS, ztrsm_kernel_LTTS, ztrsm_kernel_LRTS, ztrsm_kernel_LCTS, + ztrsm_kernel_RNTS, ztrsm_kernel_RTTS, ztrsm_kernel_RRTS, ztrsm_kernel_RCTS, + +#if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N + ztrsm_iunucopyTS, ztrsm_iunncopyTS, ztrsm_iutucopyTS, ztrsm_iutncopyTS, + ztrsm_ilnucopyTS, ztrsm_ilnncopyTS, ztrsm_iltucopyTS, ztrsm_iltncopyTS, +#else + ztrsm_ounucopyTS, ztrsm_ounncopyTS, ztrsm_outucopyTS, ztrsm_outncopyTS, + ztrsm_olnucopyTS, ztrsm_olnncopyTS, ztrsm_oltucopyTS, ztrsm_oltncopyTS, +#endif + ztrsm_ounucopyTS, ztrsm_ounncopyTS, ztrsm_outucopyTS, ztrsm_outncopyTS, + ztrsm_olnucopyTS, ztrsm_olnncopyTS, ztrsm_oltucopyTS, ztrsm_oltncopyTS, + + ztrmm_kernel_RNTS, ztrmm_kernel_RTTS, ztrmm_kernel_RRTS, ztrmm_kernel_RCTS, + ztrmm_kernel_LNTS, ztrmm_kernel_LTTS, ztrmm_kernel_LRTS, ztrmm_kernel_LCTS, + +#if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N + ztrmm_iunucopyTS, ztrmm_iunncopyTS, ztrmm_iutucopyTS, ztrmm_iutncopyTS, + ztrmm_ilnucopyTS, ztrmm_ilnncopyTS, ztrmm_iltucopyTS, ztrmm_iltncopyTS, +#else + ztrmm_ounucopyTS, ztrmm_ounncopyTS, ztrmm_outucopyTS, ztrmm_outncopyTS, + ztrmm_olnucopyTS, ztrmm_olnncopyTS, ztrmm_oltucopyTS, ztrmm_oltncopyTS, +#endif + ztrmm_ounucopyTS, ztrmm_ounncopyTS, ztrmm_outucopyTS, ztrmm_outncopyTS, + ztrmm_olnucopyTS, ztrmm_olnncopyTS, ztrmm_oltucopyTS, ztrmm_oltncopyTS, + +#if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N + zsymm_iutcopyTS, zsymm_iltcopyTS, +#else + zsymm_outcopyTS, zsymm_oltcopyTS, +#endif + zsymm_outcopyTS, zsymm_oltcopyTS, +#if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N + zhemm_iutcopyTS, zhemm_iltcopyTS, +#else + zhemm_outcopyTS, zhemm_oltcopyTS, +#endif + zhemm_outcopyTS, zhemm_oltcopyTS, + + zgemm3m_kernelTS, + + zgemm3m_incopybTS, zgemm3m_incopyrTS, + zgemm3m_incopyiTS, zgemm3m_itcopybTS, + zgemm3m_itcopyrTS, zgemm3m_itcopyiTS, + zgemm3m_oncopybTS, zgemm3m_oncopyrTS, + zgemm3m_oncopyiTS, zgemm3m_otcopybTS, + zgemm3m_otcopyrTS, zgemm3m_otcopyiTS, + + zsymm3m_iucopybTS, zsymm3m_ilcopybTS, + zsymm3m_iucopyrTS, zsymm3m_ilcopyrTS, + zsymm3m_iucopyiTS, zsymm3m_ilcopyiTS, + zsymm3m_oucopybTS, zsymm3m_olcopybTS, + zsymm3m_oucopyrTS, zsymm3m_olcopyrTS, + zsymm3m_oucopyiTS, zsymm3m_olcopyiTS, + + zhemm3m_iucopybTS, zhemm3m_ilcopybTS, + zhemm3m_iucopyrTS, zhemm3m_ilcopyrTS, + zhemm3m_iucopyiTS, zhemm3m_ilcopyiTS, + + zhemm3m_oucopybTS, zhemm3m_olcopybTS, + zhemm3m_oucopyrTS, zhemm3m_olcopyrTS, + zhemm3m_oucopyiTS, zhemm3m_olcopyiTS, + + zneg_tcopyTS, zlaswp_ncopyTS, + +#ifdef EXPRECISION + + 0, 0, 0, + XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N, MAX(XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N), + + xamax_kTS, xamin_kTS, ixamax_kTS, ixamin_kTS, + xnrm2_kTS, xasum_kTS, xcopy_kTS, + xdotu_kTS, xdotc_kTS, xqrot_kTS, + xaxpy_kTS, xaxpyc_kTS, xscal_kTS, xswap_kTS, + + xgemv_nTS, xgemv_tTS, xgemv_rTS, xgemv_cTS, + xgemv_oTS, xgemv_uTS, xgemv_sTS, xgemv_dTS, + xgeru_kTS, xgerc_kTS, xgerv_kTS, xgerd_kTS, + xsymv_LTS, xsymv_UTS, + xhemv_LTS, xhemv_UTS, xhemv_MTS, xhemv_VTS, + + xgemm_kernel_nTS, xgemm_kernel_lTS, xgemm_kernel_rTS, xgemm_kernel_bTS, + xgemm_betaTS, + +#if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N + xgemm_incopyTS, xgemm_itcopyTS, +#else + xgemm_oncopyTS, xgemm_otcopyTS, +#endif + xgemm_oncopyTS, xgemm_otcopyTS, + + xtrsm_kernel_LNTS, xtrsm_kernel_LTTS, xtrsm_kernel_LRTS, xtrsm_kernel_LCTS, + xtrsm_kernel_RNTS, xtrsm_kernel_RTTS, xtrsm_kernel_RRTS, xtrsm_kernel_RCTS, + +#if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N + xtrsm_iunucopyTS, xtrsm_iunncopyTS, xtrsm_iutucopyTS, xtrsm_iutncopyTS, + xtrsm_ilnucopyTS, xtrsm_ilnncopyTS, xtrsm_iltucopyTS, xtrsm_iltncopyTS, +#else + xtrsm_ounucopyTS, xtrsm_ounncopyTS, xtrsm_outucopyTS, xtrsm_outncopyTS, + xtrsm_olnucopyTS, xtrsm_olnncopyTS, xtrsm_oltucopyTS, xtrsm_oltncopyTS, +#endif + xtrsm_ounucopyTS, xtrsm_ounncopyTS, xtrsm_outucopyTS, xtrsm_outncopyTS, + xtrsm_olnucopyTS, xtrsm_olnncopyTS, xtrsm_oltucopyTS, xtrsm_oltncopyTS, + + xtrmm_kernel_RNTS, xtrmm_kernel_RTTS, xtrmm_kernel_RRTS, xtrmm_kernel_RCTS, + xtrmm_kernel_LNTS, xtrmm_kernel_LTTS, xtrmm_kernel_LRTS, xtrmm_kernel_LCTS, + +#if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N + xtrmm_iunucopyTS, xtrmm_iunncopyTS, xtrmm_iutucopyTS, xtrmm_iutncopyTS, + xtrmm_ilnucopyTS, xtrmm_ilnncopyTS, xtrmm_iltucopyTS, xtrmm_iltncopyTS, +#else + xtrmm_ounucopyTS, xtrmm_ounncopyTS, xtrmm_outucopyTS, xtrmm_outncopyTS, + xtrmm_olnucopyTS, xtrmm_olnncopyTS, xtrmm_oltucopyTS, xtrmm_oltncopyTS, +#endif + xtrmm_ounucopyTS, xtrmm_ounncopyTS, xtrmm_outucopyTS, xtrmm_outncopyTS, + xtrmm_olnucopyTS, xtrmm_olnncopyTS, xtrmm_oltucopyTS, xtrmm_oltncopyTS, + +#if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N + xsymm_iutcopyTS, xsymm_iltcopyTS, +#else + xsymm_outcopyTS, xsymm_oltcopyTS, +#endif + xsymm_outcopyTS, xsymm_oltcopyTS, +#if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N + xhemm_iutcopyTS, xhemm_iltcopyTS, +#else + xhemm_outcopyTS, xhemm_oltcopyTS, +#endif + xhemm_outcopyTS, xhemm_oltcopyTS, + + xgemm3m_kernelTS, + + xgemm3m_incopybTS, xgemm3m_incopyrTS, + xgemm3m_incopyiTS, xgemm3m_itcopybTS, + xgemm3m_itcopyrTS, xgemm3m_itcopyiTS, + xgemm3m_oncopybTS, xgemm3m_oncopyrTS, + xgemm3m_oncopyiTS, xgemm3m_otcopybTS, + xgemm3m_otcopyrTS, xgemm3m_otcopyiTS, + + xsymm3m_iucopybTS, xsymm3m_ilcopybTS, + xsymm3m_iucopyrTS, xsymm3m_ilcopyrTS, + xsymm3m_iucopyiTS, xsymm3m_ilcopyiTS, + xsymm3m_oucopybTS, xsymm3m_olcopybTS, + xsymm3m_oucopyrTS, xsymm3m_olcopyrTS, + xsymm3m_oucopyiTS, xsymm3m_olcopyiTS, + + xhemm3m_iucopybTS, xhemm3m_ilcopybTS, + xhemm3m_iucopyrTS, xhemm3m_ilcopyrTS, + xhemm3m_iucopyiTS, xhemm3m_ilcopyiTS, + + xhemm3m_oucopybTS, xhemm3m_olcopybTS, + xhemm3m_oucopyrTS, xhemm3m_olcopyrTS, + xhemm3m_oucopyiTS, xhemm3m_olcopyiTS, + + xneg_tcopyTS, xlaswp_ncopyTS, + +#endif + + init_parameter, + + SNUMOPT, DNUMOPT, QNUMOPT, + +}; + +#ifdef ARCH_X86 +static int get_l2_size_old(void){ + int i, eax, ebx, ecx, edx, cpuid_level; + int info[15]; + + cpuid(2, &eax, &ebx, &ecx, &edx); + + info[ 0] = BITMASK(eax, 8, 0xff); + info[ 1] = BITMASK(eax, 16, 0xff); + info[ 2] = BITMASK(eax, 24, 0xff); + + info[ 3] = BITMASK(ebx, 0, 0xff); + info[ 4] = BITMASK(ebx, 8, 0xff); + info[ 5] = BITMASK(ebx, 16, 0xff); + info[ 6] = BITMASK(ebx, 24, 0xff); + + info[ 7] = BITMASK(ecx, 0, 0xff); + info[ 8] = BITMASK(ecx, 8, 0xff); + info[ 9] = BITMASK(ecx, 16, 0xff); + info[10] = BITMASK(ecx, 24, 0xff); + + info[11] = BITMASK(edx, 0, 0xff); + info[12] = BITMASK(edx, 8, 0xff); + info[13] = BITMASK(edx, 16, 0xff); + info[14] = BITMASK(edx, 24, 0xff); + + for (i = 0; i < 15; i++){ + + switch (info[i]){ + + /* This table is from http://www.sandpile.org/ia32/cpuid.htm */ + + case 0x1a : + return 96; + + case 0x39 : + case 0x3b : + case 0x41 : + case 0x79 : + case 0x81 : + return 128; + + case 0x3a : + return 192; + + case 0x21 : + case 0x3c : + case 0x42 : + case 0x7a : + case 0x7e : + case 0x82 : + return 256; + + case 0x3d : + return 384; + + case 0x3e : + case 0x43 : + case 0x7b : + case 0x7f : + case 0x83 : + case 0x86 : + return 512; + + case 0x44 : + case 0x78 : + case 0x7c : + case 0x84 : + case 0x87 : + return 1024; + + case 0x45 : + case 0x7d : + case 0x85 : + return 2048; + + case 0x48 : + return 3184; + + case 0x49 : + return 4096; + + case 0x4e : + return 6144; + } + } + return 0; +} +#endif + +static __inline__ int get_l2_size(void){ + + int eax, ebx, ecx, edx, l2; + + cpuid(0x80000006, &eax, &ebx, &ecx, &edx); + + l2 = BITMASK(ecx, 16, 0xffff); + +#ifndef ARCH_X86 + return l2; + +#else + + if (l2 > 0) return l2; + + return get_l2_size_old(); +#endif +} + +static __inline__ int get_l3_size(void){ + + int eax, ebx, ecx, edx; + + cpuid(0x80000006, &eax, &ebx, &ecx, &edx); + + return BITMASK(edx, 18, 0x3fff) * 512; +} + + +static void init_parameter(void) { + + int l2 = get_l2_size(); + + TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; + TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; + TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; + TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; +#ifdef EXPRECISION + TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q; + TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q; +#endif + +#if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH) + +#ifdef DEBUG + fprintf(stderr, "Katmai, Coppermine, Banias\n"); +#endif + + TABLE_NAME.sgemm_p = 64 * (l2 >> 7); + TABLE_NAME.dgemm_p = 32 * (l2 >> 7); + TABLE_NAME.cgemm_p = 32 * (l2 >> 7); + TABLE_NAME.zgemm_p = 16 * (l2 >> 7); +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = 16 * (l2 >> 7); + TABLE_NAME.xgemm_p = 8 * (l2 >> 7); +#endif +#endif + +#ifdef CORE_NORTHWOOD + +#ifdef DEBUG + fprintf(stderr, "Northwood\n"); +#endif + + TABLE_NAME.sgemm_p = 96 * (l2 >> 7); + TABLE_NAME.dgemm_p = 48 * (l2 >> 7); + TABLE_NAME.cgemm_p = 48 * (l2 >> 7); + TABLE_NAME.zgemm_p = 24 * (l2 >> 7); +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = 24 * (l2 >> 7); + TABLE_NAME.xgemm_p = 12 * (l2 >> 7); +#endif +#endif + +#ifdef ATOM + +#ifdef DEBUG + fprintf(stderr, "Atom\n"); +#endif + + TABLE_NAME.sgemm_p = 256; + TABLE_NAME.dgemm_p = 128; + TABLE_NAME.cgemm_p = 128; + TABLE_NAME.zgemm_p = 64; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = 64; + TABLE_NAME.xgemm_p = 32; +#endif +#endif + +#ifdef CORE_PRESCOTT + +#ifdef DEBUG + fprintf(stderr, "Prescott\n"); +#endif + + TABLE_NAME.sgemm_p = 56 * (l2 >> 7); + TABLE_NAME.dgemm_p = 28 * (l2 >> 7); + TABLE_NAME.cgemm_p = 28 * (l2 >> 7); + TABLE_NAME.zgemm_p = 14 * (l2 >> 7); +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = 14 * (l2 >> 7); + TABLE_NAME.xgemm_p = 7 * (l2 >> 7); +#endif +#endif + +#ifdef CORE2 + +#ifdef DEBUG + fprintf(stderr, "Core2\n"); +#endif + + TABLE_NAME.sgemm_p = 92 * (l2 >> 9); + TABLE_NAME.dgemm_p = 46 * (l2 >> 9); + TABLE_NAME.cgemm_p = 46 * (l2 >> 9); + TABLE_NAME.zgemm_p = 23 * (l2 >> 9); +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = 92 * (l2 >> 9); + TABLE_NAME.xgemm_p = 46 * (l2 >> 9); +#endif +#endif + +#ifdef PENRYN + +#ifdef DEBUG + fprintf(stderr, "Penryn\n"); +#endif + + TABLE_NAME.sgemm_p = 42 * (l2 >> 9) + 8; + TABLE_NAME.dgemm_p = 42 * (l2 >> 9) + 8; + TABLE_NAME.cgemm_p = 21 * (l2 >> 9) + 4; + TABLE_NAME.zgemm_p = 21 * (l2 >> 9) + 4; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = 42 * (l2 >> 9) + 8; + TABLE_NAME.xgemm_p = 21 * (l2 >> 9) + 4; +#endif +#endif + +#ifdef NEHALEM + +#ifdef DEBUG + fprintf(stderr, "Nehalem\n"); +#endif + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; +#endif +#endif + +#ifdef OPTERON + +#ifdef DEBUG + fprintf(stderr, "Opteron\n"); +#endif + + TABLE_NAME.sgemm_p = 224 + 56 * (l2 >> 7); + TABLE_NAME.dgemm_p = 112 + 28 * (l2 >> 7); + TABLE_NAME.cgemm_p = 112 + 28 * (l2 >> 7); + TABLE_NAME.zgemm_p = 56 + 14 * (l2 >> 7); +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = 56 + 14 * (l2 >> 7); + TABLE_NAME.xgemm_p = 28 + 7 * (l2 >> 7); +#endif +#endif + +#ifdef BARCELONA + +#ifdef DEBUG + fprintf(stderr, "Barcelona\n"); +#endif + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; +#endif +#endif + +#ifdef NANO + +#ifdef DEBUG + fprintf(stderr, "NANO\n"); +#endif + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; +#endif +#endif + + + TABLE_NAME.sgemm_p = (TABLE_NAME.sgemm_p + SGEMM_DEFAULT_UNROLL_M - 1) & ~(SGEMM_DEFAULT_UNROLL_M - 1); + TABLE_NAME.dgemm_p = (TABLE_NAME.dgemm_p + DGEMM_DEFAULT_UNROLL_M - 1) & ~(DGEMM_DEFAULT_UNROLL_M - 1); + TABLE_NAME.cgemm_p = (TABLE_NAME.cgemm_p + CGEMM_DEFAULT_UNROLL_M - 1) & ~(CGEMM_DEFAULT_UNROLL_M - 1); + TABLE_NAME.zgemm_p = (TABLE_NAME.zgemm_p + ZGEMM_DEFAULT_UNROLL_M - 1) & ~(ZGEMM_DEFAULT_UNROLL_M - 1); +#ifdef QUAD_PRECISION + TABLE_NAME.qgemm_p = (TABLE_NAME.qgemm_p + QGEMM_DEFAULT_UNROLL_M - 1) & ~(QGEMM_DEFAULT_UNROLL_M - 1); + TABLE_NAME.xgemm_p = (TABLE_NAME.xgemm_p + XGEMM_DEFAULT_UNROLL_M - 1) & ~(XGEMM_DEFAULT_UNROLL_M - 1); +#endif + +#ifdef DEBUG + fprintf(stderr, "L2 = %8d DGEMM_P .. %d\n", l2, TABLE_NAME.dgemm_p); +#endif + + TABLE_NAME.sgemm_r = (((BUFFER_SIZE - + ((TABLE_NAME.sgemm_p * TABLE_NAME.sgemm_q * 4 + TABLE_NAME.offsetA + + TABLE_NAME.align) & ~TABLE_NAME.align) + ) / (TABLE_NAME.sgemm_q * 4) - 15) & ~15); + + TABLE_NAME.dgemm_r = (((BUFFER_SIZE - + ((TABLE_NAME.dgemm_p * TABLE_NAME.dgemm_q * 8 + TABLE_NAME.offsetA + + TABLE_NAME.align) & ~TABLE_NAME.align) + ) / (TABLE_NAME.dgemm_q * 8) - 15) & ~15); + +#ifdef EXPRECISION + TABLE_NAME.qgemm_r = (((BUFFER_SIZE - + ((TABLE_NAME.qgemm_p * TABLE_NAME.qgemm_q * 16 + TABLE_NAME.offsetA + + TABLE_NAME.align) & ~TABLE_NAME.align) + ) / (TABLE_NAME.qgemm_q * 16) - 15) & ~15); +#endif + + TABLE_NAME.cgemm_r = (((BUFFER_SIZE - + ((TABLE_NAME.cgemm_p * TABLE_NAME.cgemm_q * 8 + TABLE_NAME.offsetA + + TABLE_NAME.align) & ~TABLE_NAME.align) + ) / (TABLE_NAME.cgemm_q * 8) - 15) & ~15); + + TABLE_NAME.zgemm_r = (((BUFFER_SIZE - + ((TABLE_NAME.zgemm_p * TABLE_NAME.zgemm_q * 16 + TABLE_NAME.offsetA + + TABLE_NAME.align) & ~TABLE_NAME.align) + ) / (TABLE_NAME.zgemm_q * 16) - 15) & ~15); + +#ifdef EXPRECISION + TABLE_NAME.xgemm_r = (((BUFFER_SIZE - + ((TABLE_NAME.xgemm_p * TABLE_NAME.xgemm_q * 32 + TABLE_NAME.offsetA + + TABLE_NAME.align) & ~TABLE_NAME.align) + ) / (TABLE_NAME.xgemm_q * 32) - 15) & ~15); +#endif + +} diff --git a/kernel/sparc/._KERNEL b/kernel/sparc/._KERNEL new file mode 100644 index 0000000..c06b6e7 Binary files /dev/null and b/kernel/sparc/._KERNEL differ diff --git a/kernel/sparc/._KERNEL.sparc b/kernel/sparc/._KERNEL.sparc new file mode 100644 index 0000000..b2276a6 Binary files /dev/null and b/kernel/sparc/._KERNEL.sparc differ diff --git a/kernel/sparc/._KERNEL.sparcv7 b/kernel/sparc/._KERNEL.sparcv7 new file mode 100644 index 0000000..bef60da Binary files /dev/null and b/kernel/sparc/._KERNEL.sparcv7 differ diff --git a/kernel/sparc/._Makefile b/kernel/sparc/._Makefile new file mode 100644 index 0000000..1428159 Binary files /dev/null and b/kernel/sparc/._Makefile differ diff --git a/kernel/sparc/._amax.S b/kernel/sparc/._amax.S new file mode 100644 index 0000000..840c536 Binary files /dev/null and b/kernel/sparc/._amax.S differ diff --git a/kernel/sparc/._asum.S b/kernel/sparc/._asum.S new file mode 100644 index 0000000..b73e3d2 Binary files /dev/null and b/kernel/sparc/._asum.S differ diff --git a/kernel/sparc/._axpy.S b/kernel/sparc/._axpy.S new file mode 100644 index 0000000..66e299a Binary files /dev/null and b/kernel/sparc/._axpy.S differ diff --git a/kernel/sparc/._cabs.S b/kernel/sparc/._cabs.S new file mode 100644 index 0000000..6e60566 Binary files /dev/null and b/kernel/sparc/._cabs.S differ diff --git a/kernel/sparc/._cnrm2.S b/kernel/sparc/._cnrm2.S new file mode 100644 index 0000000..69efc2e Binary files /dev/null and b/kernel/sparc/._cnrm2.S differ diff --git a/kernel/sparc/._copy.S b/kernel/sparc/._copy.S new file mode 100644 index 0000000..9a5226f Binary files /dev/null and b/kernel/sparc/._copy.S differ diff --git a/kernel/sparc/._dnrm2.S b/kernel/sparc/._dnrm2.S new file mode 100644 index 0000000..5d3b3da Binary files /dev/null and b/kernel/sparc/._dnrm2.S differ diff --git a/kernel/sparc/._dot.S b/kernel/sparc/._dot.S new file mode 100644 index 0000000..20418f8 Binary files /dev/null and b/kernel/sparc/._dot.S differ diff --git a/kernel/sparc/._gemm_kernel.S b/kernel/sparc/._gemm_kernel.S new file mode 100644 index 0000000..941f8b2 Binary files /dev/null and b/kernel/sparc/._gemm_kernel.S differ diff --git a/kernel/sparc/._gemm_kernel_2x8.S b/kernel/sparc/._gemm_kernel_2x8.S new file mode 100644 index 0000000..ca0e684 Binary files /dev/null and b/kernel/sparc/._gemm_kernel_2x8.S differ diff --git a/kernel/sparc/._gemm_ncopy.S b/kernel/sparc/._gemm_ncopy.S new file mode 100644 index 0000000..834049a Binary files /dev/null and b/kernel/sparc/._gemm_ncopy.S differ diff --git a/kernel/sparc/._gemm_ncopy_2.S b/kernel/sparc/._gemm_ncopy_2.S new file mode 100644 index 0000000..558d831 Binary files /dev/null and b/kernel/sparc/._gemm_ncopy_2.S differ diff --git a/kernel/sparc/._gemm_ncopy_8.S b/kernel/sparc/._gemm_ncopy_8.S new file mode 100644 index 0000000..e322b86 Binary files /dev/null and b/kernel/sparc/._gemm_ncopy_8.S differ diff --git a/kernel/sparc/._gemm_tcopy.S b/kernel/sparc/._gemm_tcopy.S new file mode 100644 index 0000000..52f5afe Binary files /dev/null and b/kernel/sparc/._gemm_tcopy.S differ diff --git a/kernel/sparc/._gemm_tcopy_2.S b/kernel/sparc/._gemm_tcopy_2.S new file mode 100644 index 0000000..111aad1 Binary files /dev/null and b/kernel/sparc/._gemm_tcopy_2.S differ diff --git a/kernel/sparc/._gemv_n.S b/kernel/sparc/._gemv_n.S new file mode 100644 index 0000000..9791175 Binary files /dev/null and b/kernel/sparc/._gemv_n.S differ diff --git a/kernel/sparc/._gemv_t.S b/kernel/sparc/._gemv_t.S new file mode 100644 index 0000000..2357b0c Binary files /dev/null and b/kernel/sparc/._gemv_t.S differ diff --git a/kernel/sparc/._ger.S b/kernel/sparc/._ger.S new file mode 100644 index 0000000..aafe074 Binary files /dev/null and b/kernel/sparc/._ger.S differ diff --git a/kernel/sparc/._iamax.S b/kernel/sparc/._iamax.S new file mode 100644 index 0000000..a0ab72b Binary files /dev/null and b/kernel/sparc/._iamax.S differ diff --git a/kernel/sparc/._imax.S b/kernel/sparc/._imax.S new file mode 100644 index 0000000..590787b Binary files /dev/null and b/kernel/sparc/._imax.S differ diff --git a/kernel/sparc/._izamax.S b/kernel/sparc/._izamax.S new file mode 100644 index 0000000..0790143 Binary files /dev/null and b/kernel/sparc/._izamax.S differ diff --git a/kernel/sparc/._lsame.S b/kernel/sparc/._lsame.S new file mode 100644 index 0000000..17800ae Binary files /dev/null and b/kernel/sparc/._lsame.S differ diff --git a/kernel/sparc/._max.S b/kernel/sparc/._max.S new file mode 100644 index 0000000..8b9836f Binary files /dev/null and b/kernel/sparc/._max.S differ diff --git a/kernel/sparc/._rot.S b/kernel/sparc/._rot.S new file mode 100644 index 0000000..6583c76 Binary files /dev/null and b/kernel/sparc/._rot.S differ diff --git a/kernel/sparc/._scal.S b/kernel/sparc/._scal.S new file mode 100644 index 0000000..b16bc9c Binary files /dev/null and b/kernel/sparc/._scal.S differ diff --git a/kernel/sparc/._snrm2.S b/kernel/sparc/._snrm2.S new file mode 100644 index 0000000..3d6c7ea Binary files /dev/null and b/kernel/sparc/._snrm2.S differ diff --git a/kernel/sparc/._staticbuffer.S b/kernel/sparc/._staticbuffer.S new file mode 100644 index 0000000..b9ef556 Binary files /dev/null and b/kernel/sparc/._staticbuffer.S differ diff --git a/kernel/sparc/._swap.S b/kernel/sparc/._swap.S new file mode 100644 index 0000000..ee6b654 Binary files /dev/null and b/kernel/sparc/._swap.S differ diff --git a/kernel/sparc/._trsm_kernel_LN.S b/kernel/sparc/._trsm_kernel_LN.S new file mode 100644 index 0000000..b6db754 Binary files /dev/null and b/kernel/sparc/._trsm_kernel_LN.S differ diff --git a/kernel/sparc/._trsm_kernel_LN_2x8.S b/kernel/sparc/._trsm_kernel_LN_2x8.S new file mode 100644 index 0000000..2bbce07 Binary files /dev/null and b/kernel/sparc/._trsm_kernel_LN_2x8.S differ diff --git a/kernel/sparc/._trsm_kernel_LT.S b/kernel/sparc/._trsm_kernel_LT.S new file mode 100644 index 0000000..4fe17b2 Binary files /dev/null and b/kernel/sparc/._trsm_kernel_LT.S differ diff --git a/kernel/sparc/._trsm_kernel_LT_2x8.S b/kernel/sparc/._trsm_kernel_LT_2x8.S new file mode 100644 index 0000000..63b4149 Binary files /dev/null and b/kernel/sparc/._trsm_kernel_LT_2x8.S differ diff --git a/kernel/sparc/._trsm_kernel_RT.S b/kernel/sparc/._trsm_kernel_RT.S new file mode 100644 index 0000000..27fad5f Binary files /dev/null and b/kernel/sparc/._trsm_kernel_RT.S differ diff --git a/kernel/sparc/._trsm_kernel_RT_2x8.S b/kernel/sparc/._trsm_kernel_RT_2x8.S new file mode 100644 index 0000000..7381ab3 Binary files /dev/null and b/kernel/sparc/._trsm_kernel_RT_2x8.S differ diff --git a/kernel/sparc/._zamax.S b/kernel/sparc/._zamax.S new file mode 100644 index 0000000..5102409 Binary files /dev/null and b/kernel/sparc/._zamax.S differ diff --git a/kernel/sparc/._zasum.S b/kernel/sparc/._zasum.S new file mode 100644 index 0000000..5a143de Binary files /dev/null and b/kernel/sparc/._zasum.S differ diff --git a/kernel/sparc/._zaxpy.S b/kernel/sparc/._zaxpy.S new file mode 100644 index 0000000..b0b22e6 Binary files /dev/null and b/kernel/sparc/._zaxpy.S differ diff --git a/kernel/sparc/._zcopy.S b/kernel/sparc/._zcopy.S new file mode 100644 index 0000000..d3cee48 Binary files /dev/null and b/kernel/sparc/._zcopy.S differ diff --git a/kernel/sparc/._zdot.S b/kernel/sparc/._zdot.S new file mode 100644 index 0000000..5df9490 Binary files /dev/null and b/kernel/sparc/._zdot.S differ diff --git a/kernel/sparc/._zgemm_kernel.S b/kernel/sparc/._zgemm_kernel.S new file mode 100644 index 0000000..ac09109 Binary files /dev/null and b/kernel/sparc/._zgemm_kernel.S differ diff --git a/kernel/sparc/._zgemm_kernel_1x4.S b/kernel/sparc/._zgemm_kernel_1x4.S new file mode 100644 index 0000000..dc8d81a Binary files /dev/null and b/kernel/sparc/._zgemm_kernel_1x4.S differ diff --git a/kernel/sparc/._zgemm_ncopy.S b/kernel/sparc/._zgemm_ncopy.S new file mode 100644 index 0000000..23cefd3 Binary files /dev/null and b/kernel/sparc/._zgemm_ncopy.S differ diff --git a/kernel/sparc/._zgemm_tcopy.S b/kernel/sparc/._zgemm_tcopy.S new file mode 100644 index 0000000..c826e2b Binary files /dev/null and b/kernel/sparc/._zgemm_tcopy.S differ diff --git a/kernel/sparc/._zgemv_n.S b/kernel/sparc/._zgemv_n.S new file mode 100644 index 0000000..2536588 Binary files /dev/null and b/kernel/sparc/._zgemv_n.S differ diff --git a/kernel/sparc/._zgemv_t.S b/kernel/sparc/._zgemv_t.S new file mode 100644 index 0000000..b8d6eeb Binary files /dev/null and b/kernel/sparc/._zgemv_t.S differ diff --git a/kernel/sparc/._znrm2.S b/kernel/sparc/._znrm2.S new file mode 100644 index 0000000..6e0fd26 Binary files /dev/null and b/kernel/sparc/._znrm2.S differ diff --git a/kernel/sparc/._zrot.S b/kernel/sparc/._zrot.S new file mode 100644 index 0000000..7c181bc Binary files /dev/null and b/kernel/sparc/._zrot.S differ diff --git a/kernel/sparc/._zscal.S b/kernel/sparc/._zscal.S new file mode 100644 index 0000000..e43475c Binary files /dev/null and b/kernel/sparc/._zscal.S differ diff --git a/kernel/sparc/._zswap.S b/kernel/sparc/._zswap.S new file mode 100644 index 0000000..47fdd1c Binary files /dev/null and b/kernel/sparc/._zswap.S differ diff --git a/kernel/sparc/._ztrsm_kernel_LN.S b/kernel/sparc/._ztrsm_kernel_LN.S new file mode 100644 index 0000000..fed8577 Binary files /dev/null and b/kernel/sparc/._ztrsm_kernel_LN.S differ diff --git a/kernel/sparc/._ztrsm_kernel_LT.S b/kernel/sparc/._ztrsm_kernel_LT.S new file mode 100644 index 0000000..8b6d5fc Binary files /dev/null and b/kernel/sparc/._ztrsm_kernel_LT.S differ diff --git a/kernel/sparc/._ztrsm_kernel_LT_1x4.S b/kernel/sparc/._ztrsm_kernel_LT_1x4.S new file mode 100644 index 0000000..59bba04 Binary files /dev/null and b/kernel/sparc/._ztrsm_kernel_LT_1x4.S differ diff --git a/kernel/sparc/._ztrsm_kernel_RT.S b/kernel/sparc/._ztrsm_kernel_RT.S new file mode 100644 index 0000000..df9a307 Binary files /dev/null and b/kernel/sparc/._ztrsm_kernel_RT.S differ diff --git a/kernel/sparc/._ztrsm_kernel_RT_1x4.S b/kernel/sparc/._ztrsm_kernel_RT_1x4.S new file mode 100644 index 0000000..1d3d7b4 Binary files /dev/null and b/kernel/sparc/._ztrsm_kernel_RT_1x4.S differ diff --git a/kernel/sparc/KERNEL b/kernel/sparc/KERNEL new file mode 100644 index 0000000..594fd05 --- /dev/null +++ b/kernel/sparc/KERNEL @@ -0,0 +1,69 @@ +ifndef SAMINKERNEL +SAMINKERNEL = amax.S +endif + +ifndef DAMINKERNEL +DAMINKERNEL = amax.S +endif + +ifndef CAMINKERNEL +CAMINKERNEL = zamax.S +endif + +ifndef ZAMINKERNEL +ZAMINKERNEL = zamax.S +endif + +ifndef SMINKERNEL +SMINKERNEL = max.S +endif + +ifndef DMINKERNEL +DMINKERNEL = max.S +endif + +ifndef ISAMINKERNEL +ISAMINKERNEL = iamax.S +endif + +ifndef IDAMINKERNEL +IDAMINKERNEL = iamax.S +endif + +ifndef ICAMINKERNEL +ICAMINKERNEL = izamax.S +endif + +ifndef IZAMINKERNEL +IZAMINKERNEL = izamax.S +endif + +ifndef ISMINKERNEL +ISMINKERNEL = iamax.S +endif + +ifndef IDMINKERNEL +IDMINKERNEL = iamax.S +endif + +ifndef SNRM2KERNEL +SNRM2KERNEL = snrm2.S +endif + +ifndef DNRM2KERNEL +DNRM2KERNEL = dnrm2.S +endif + +ifndef CNRM2KERNEL +CNRM2KERNEL = cnrm2.S +endif + +ifndef ZNRM2KERNEL +ZNRM2KERNEL = znrm2.S +endif + +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + diff --git a/kernel/sparc/KERNEL.sparc b/kernel/sparc/KERNEL.sparc new file mode 100644 index 0000000..fb6cc2b --- /dev/null +++ b/kernel/sparc/KERNEL.sparc @@ -0,0 +1,56 @@ +SGEMMKERNEL = gemm_kernel.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = gemm_ncopy.S +SGEMMOTCOPY = gemm_tcopy.S +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy.$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy.$(SUFFIX) +DGEMMKERNEL = gemm_kernel.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy.S +DGEMMOTCOPY = gemm_tcopy.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy.$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy.$(SUFFIX) +CGEMMKERNEL = zgemm_kernel.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = zgemm_ncopy.S +CGEMMOTCOPY = zgemm_tcopy.S +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy.$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy.$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = zgemm_ncopy.S +ZGEMMOTCOPY = zgemm_tcopy.S +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy.$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy.$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN.S +STRSMKERNEL_LT = trsm_kernel_LT.S +STRSMKERNEL_RN = trsm_kernel_LT.S +STRSMKERNEL_RT = trsm_kernel_RT.S + +DTRSMKERNEL_LN = trsm_kernel_LN.S +DTRSMKERNEL_LT = trsm_kernel_LT.S +DTRSMKERNEL_RN = trsm_kernel_LT.S +DTRSMKERNEL_RT = trsm_kernel_RT.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN.S +CTRSMKERNEL_LT = ztrsm_kernel_LT.S +CTRSMKERNEL_RN = ztrsm_kernel_LT.S +CTRSMKERNEL_RT = ztrsm_kernel_RT.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT.S diff --git a/kernel/sparc/KERNEL.sparcv7 b/kernel/sparc/KERNEL.sparcv7 new file mode 100644 index 0000000..dfda684 --- /dev/null +++ b/kernel/sparc/KERNEL.sparcv7 @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_2x8.S +SGEMMINCOPY = gemm_ncopy_2.S +SGEMMITCOPY = gemm_tcopy_2.S +SGEMMONCOPY = gemm_ncopy_8.S +SGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SGEMMINCOPYOBJ = sgemm_incopy.$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy.$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy.$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy.$(SUFFIX) + +DGEMMKERNEL = gemm_kernel_2x8.S +DGEMMINCOPY = gemm_ncopy_2.S +DGEMMITCOPY = gemm_tcopy_2.S +DGEMMONCOPY = gemm_ncopy_8.S +DGEMMOTCOPY = ../generic/gemm_tcopy_8.c +DGEMMINCOPYOBJ = dgemm_incopy.$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy.$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy.$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy.$(SUFFIX) + +CGEMMKERNEL = zgemm_kernel_1x4.S +CGEMMINCOPY = ../generic/zgemm_ncopy_1.c +CGEMMITCOPY = ../generic/zgemm_tcopy_1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMINCOPYOBJ = cgemm_incopy.$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy.$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy.$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy.$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_1x4.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMINCOPYOBJ = zgemm_incopy.$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy.$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy.$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy.$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_2x8.S +STRSMKERNEL_LT = trsm_kernel_LT_2x8.S +STRSMKERNEL_RN = trsm_kernel_LT_2x8.S +STRSMKERNEL_RT = trsm_kernel_RT_2x8.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x8.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x8.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x8.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x8.S + +CTRSMKERNEL_LN = ztrsm_kernel_LT_1x4.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_1x4.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_1x4.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_1x4.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4.S diff --git a/kernel/sparc/Makefile b/kernel/sparc/Makefile new file mode 100644 index 0000000..efae70d --- /dev/null +++ b/kernel/sparc/Makefile @@ -0,0 +1,2 @@ +clean :: + diff --git a/kernel/sparc/amax.S b/kernel/sparc/amax.S new file mode 100644 index 0000000..7729e5c --- /dev/null +++ b/kernel/sparc/amax.S @@ -0,0 +1,380 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define c3 %f4 +#define c4 %f6 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 +#else +#define c1 %f0 +#define c2 %f1 +#define c3 %f2 +#define c4 %f3 +#define t1 %f4 +#define t2 %f5 +#define t3 %f6 +#define t4 %f7 + +#define a1 %f8 +#define a2 %f9 +#define a3 %f10 +#define a4 %f11 +#define a5 %f12 +#define a6 %f13 +#define a7 %f14 +#define a8 %f15 +#endif + +#ifndef USE_MIN +#define FCMOV FMOVG +#else +#define FCMOV FMOVL +#endif + + PROLOGUE + SAVESP + + FCLR(0) + + cmp N, 0 + ble .LL20 + nop + + cmp INCX, 0 + ble .LL20 + sll INCX, BASE_SHIFT, INCX + + add N, -1, N + LDF [X], c4 + add X, INCX, X + cmp N, 0 + ble .LL20 + FABS c4, c1 + + FABS c4, c2 + FABS c4, c3 + FABS c4, c4 + + cmp INCX, SIZE + bne .LL50 + nop + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + + LDF [X + 4 * SIZE], a5 + add I, -1, I + LDF [X + 5 * SIZE], a6 + cmp I, 0 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + ble,pt %icc, .LL12 + add X, 8 * SIZE, X + +#define PREFETCHSIZE 40 + +.LL11: + prefetch [X + PREFETCHSIZE * SIZE], 0 + FABS a1, t1 + LDF [X + 0 * SIZE], a1 + FABS a2, t2 + LDF [X + 1 * SIZE], a2 + FABS a3, t3 + LDF [X + 2 * SIZE], a3 + FABS a4, t4 + LDF [X + 3 * SIZE], a4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + FCMOV %fcc1, t2, c2 + FCMOV %fcc2, t3, c3 + FCMOV %fcc3, t4, c4 + + FABS a5, t1 + LDF [X + 4 * SIZE], a5 + FABS a6, t2 + LDF [X + 5 * SIZE], a6 + FABS a7, t3 + LDF [X + 6 * SIZE], a7 + FABS a8, t4 + LDF [X + 7 * SIZE], a8 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + add I, -1, I + FCMOV %fcc1, t2, c2 + cmp I, 0 + FCMOV %fcc2, t3, c3 + FCMOV %fcc3, t4, c4 + + bg,pt %icc, .LL11 + add X, 8 * SIZE, X + +.LL12: + FABS a1, t1 + FABS a2, t2 + FABS a3, t3 + FABS a4, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + FCMOV %fcc1, t2, c2 + FCMOV %fcc2, t3, c3 + FCMOV %fcc3, t4, c4 + + FABS a5, t1 + FABS a6, t2 + FABS a7, t3 + FABS a8, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + FCMOV %fcc1, t2, c2 + FCMOV %fcc2, t3, c3 + FCMOV %fcc3, t4, c4 + +.LL15: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + FABS a1, t1 + FCMP %fcc0, t1, c1 + FCMOV %fcc0, t1, c1 + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL16 + add X, 1 * SIZE, X + +.LL19: + FCMP %fcc0, c2, c1 + FCMP %fcc1, c4, c3 + FCMOV %fcc0, c2, c1 + FCMOV %fcc1, c4, c3 + FCMP %fcc0, c3, c1 + FCMOV %fcc0, c3, c1 + +.LL20: + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + LDF [X + 0 * SIZE], a1 + add X, INCX, X + LDF [X + 0 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + add X, INCX, X + LDF [X + 0 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + add X, INCX, X + LDF [X + 0 * SIZE], a6 + add X, INCX, X + add I, -1, I + LDF [X + 0 * SIZE], a7 + cmp I, 0 + add X, INCX, X + LDF [X + 0 * SIZE], a8 + ble,pt %icc, .LL52 + add X, INCX, X + +.LL51: + FABS a1, t1 + LDF [X + 0 * SIZE], a1 + add X, INCX, X + FABS a2, t2 + LDF [X + 0 * SIZE], a2 + add X, INCX, X + FABS a3, t3 + LDF [X + 0 * SIZE], a3 + add X, INCX, X + FABS a4, t4 + LDF [X + 0 * SIZE], a4 + add X, INCX, X + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + FCMOV %fcc1, t2, c2 + FCMOV %fcc2, t3, c3 + FCMOV %fcc3, t4, c4 + + FABS a5, t1 + LDF [X + 0 * SIZE], a5 + add X, INCX, X + FABS a6, t2 + LDF [X + 0 * SIZE], a6 + add X, INCX, X + FABS a7, t3 + LDF [X + 0 * SIZE], a7 + add X, INCX, X + FABS a8, t4 + LDF [X + 0 * SIZE], a8 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + add I, -1, I + FCMOV %fcc1, t2, c2 + cmp I, 0 + FCMOV %fcc2, t3, c3 + FCMOV %fcc3, t4, c4 + + bg,pt %icc, .LL51 + add X, INCX, X + +.LL52: + FABS a1, t1 + FABS a2, t2 + FABS a3, t3 + FABS a4, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + FCMOV %fcc1, t2, c2 + FCMOV %fcc2, t3, c3 + FCMOV %fcc3, t4, c4 + + FABS a5, t1 + FABS a6, t2 + FABS a7, t3 + FABS a8, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + FCMOV %fcc1, t2, c2 + FCMOV %fcc2, t3, c3 + FCMOV %fcc3, t4, c4 + +.LL55: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + FABS a1, t1 + FCMP %fcc0, t1, c1 + FCMOV %fcc0, t1, c1 + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + FCMP %fcc0, c2, c1 + FCMP %fcc1, c4, c3 + FCMOV %fcc0, c2, c1 + FCMOV %fcc1, c4, c3 + FCMP %fcc0, c3, c1 + FCMOV %fcc0, c3, c1 + + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/asum.S b/kernel/sparc/asum.S new file mode 100644 index 0000000..7205fa6 --- /dev/null +++ b/kernel/sparc/asum.S @@ -0,0 +1,325 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 +#else +#define c1 %f0 +#define c2 %f1 +#define t1 %f4 +#define t2 %f5 +#define t3 %f6 +#define t4 %f7 + +#define a1 %f8 +#define a2 %f9 +#define a3 %f10 +#define a4 %f11 +#define a5 %f12 +#define a6 %f13 +#define a7 %f14 +#define a8 %f15 +#endif + + PROLOGUE + SAVESP + + FCLR(0) + + sll INCX, BASE_SHIFT, INCX + + FMOV c1, c2 + FMOV c1, t1 + FMOV c1, t2 + FMOV c1, t3 + FMOV c1, t4 + + cmp INCX, 0 + ble .LL19 + cmp INCX, SIZE + bne .LL50 + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + add I, -1, I + LDF [X + 1 * SIZE], a2 + cmp I, 0 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + LDF [X + 4 * SIZE], a5 + LDF [X + 5 * SIZE], a6 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + ble,pt %icc, .LL12 + add X, 8 * SIZE, X + +#define PREFETCHSIZE 128 + +.LL11: + FADD c1, t1, c1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + FABS a1, t1 + LDF [X + 0 * SIZE], a1 + + FADD c2, t2, c2 + add I, -1, I + FABS a2, t2 + LDF [X + 1 * SIZE], a2 + + FADD c1, t3, c1 + cmp I, 0 + FABS a3, t3 + LDF [X + 2 * SIZE], a3 + + FADD c2, t4, c2 + nop + FABS a4, t4 + LDF [X + 3 * SIZE], a4 + + FADD c1, t1, c1 + nop + FABS a5, t1 + LDF [X + 4 * SIZE], a5 + + FADD c2, t2, c2 + nop + FABS a6, t2 + LDF [X + 5 * SIZE], a6 + + FADD c1, t3, c1 + FABS a7, t3 + LDF [X + 6 * SIZE], a7 + add X, 8 * SIZE, X + + FADD c2, t4, c2 + FABS a8, t4 + bg,pt %icc, .LL11 + LDF [X - 1 * SIZE], a8 + +.LL12: + FADD c1, t1, c1 + FABS a1, t1 + FADD c2, t2, c2 + FABS a2, t2 + + FADD c1, t3, c1 + FABS a3, t3 + FADD c2, t4, c2 + FABS a4, t4 + + FADD c1, t1, c1 + FABS a5, t1 + FADD c2, t2, c2 + FABS a6, t2 + + FADD c1, t3, c1 + FABS a7, t3 + FADD c2, t4, c2 + FABS a8, t4 + +.LL15: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + add I, -1, I + cmp I, 0 + FADD c1, t1, c1 + FABS a1, t1 + bg,pt %icc, .LL16 + add X, 1 * SIZE, X + +.LL19: + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c1, t3, c1 + FADD c2, t4, c2 + + FADD c1, c2, c1 + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + LDF [X + 0 * SIZE], a1 + add X, INCX, X + LDF [X + 0 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + add X, INCX, X + LDF [X + 0 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + add X, INCX, X + LDF [X + 0 * SIZE], a6 + add X, INCX, X + add I, -1, I + LDF [X + 0 * SIZE], a7 + cmp I, 0 + add X, INCX, X + LDF [X + 0 * SIZE], a8 + + ble,pt %icc, .LL52 + add X, INCX, X + +.LL51: + FADD c1, t1, c1 + add I, -1, I + FABS a1, t1 + LDF [X + 0 * SIZE], a1 + add X, INCX, X + + FADD c2, t2, c2 + cmp I, 0 + FABS a2, t2 + LDF [X + 0 * SIZE], a2 + add X, INCX, X + + FADD c1, t3, c1 + FABS a3, t3 + LDF [X + 0 * SIZE], a3 + add X, INCX, X + + FADD c2, t4, c2 + FABS a4, t4 + LDF [X + 0 * SIZE], a4 + add X, INCX, X + + FADD c1, t1, c1 + FABS a5, t1 + LDF [X + 0 * SIZE], a5 + add X, INCX, X + + FADD c2, t2, c2 + FABS a6, t2 + LDF [X + 0 * SIZE], a6 + add X, INCX, X + + FADD c1, t3, c1 + FABS a7, t3 + LDF [X + 0 * SIZE], a7 + add X, INCX, X + + FADD c2, t4, c2 + FABS a8, t4 + LDF [X + 0 * SIZE], a8 + + bg,pt %icc, .LL51 + add X, INCX, X + +.LL52: + FADD c1, t1, c1 + FABS a1, t1 + FADD c2, t2, c2 + FABS a2, t2 + + FADD c1, t3, c1 + FABS a3, t3 + FADD c2, t4, c2 + FABS a4, t4 + + FADD c1, t1, c1 + FABS a5, t1 + FADD c2, t2, c2 + FABS a6, t2 + + FADD c1, t3, c1 + FABS a7, t3 + FADD c2, t4, c2 + FABS a8, t4 + +.LL55: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + FADD c1, t1, c1 + add I, -1, I + FABS a1, t1 + cmp I, 0 + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c1, t3, c1 + FADD c2, t4, c2 + + FADD c1, c2, c1 + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/axpy.S b/kernel/sparc/axpy.S new file mode 100644 index 0000000..997f9e0 --- /dev/null +++ b/kernel/sparc/axpy.S @@ -0,0 +1,503 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(DOUBLE) && !defined(__64BIT__) +#define N %i0 +#define X %i5 +#define INCX %i1 +#define Y %i2 +#define INCY %i3 +#define I %i4 +#else +#define N %i0 +#define X %i4 +#define INCX %i5 +#define Y %i1 +#define INCY %i2 +#define I %i3 +#endif + +#define YY %l1 + +#ifdef DOUBLE +#define a1 %f0 +#define a2 %f2 +#define a3 %f4 +#define a4 %f6 +#define a5 %f8 +#define a6 %f10 +#define a7 %f12 +#define a8 %f14 +#define b1 %f16 +#define b2 %f18 +#define b3 %f20 +#define b4 %f22 +#define b5 %f24 +#define b6 %f26 +#define b7 %f28 +#define b8 %f30 + +#define t1 %f32 +#define t2 %f34 +#define t3 %f36 +#define t4 %f38 +#define c1 %f40 +#define c2 %f42 +#define c3 %f44 +#define c4 %f46 + +#define c5 %f48 +#define c6 %f50 +#define c7 %f52 +#define c8 %f54 + +#define ALPHA %f62 +#else +#define a1 %f0 +#define a2 %f1 +#define a3 %f2 +#define a4 %f3 +#define a5 %f4 +#define a6 %f5 +#define a7 %f6 +#define a8 %f7 +#define b1 %f8 +#define b2 %f9 +#define b3 %f10 +#define b4 %f11 +#define b5 %f12 +#define b6 %f13 +#define b7 %f14 +#define b8 %f15 + +#define t1 %f16 +#define t2 %f17 +#define t3 %f18 +#define t4 %f19 +#define c1 %f20 +#define c2 %f21 +#define c3 %f22 +#define c4 %f23 + +#define c5 %f24 +#define c6 %f25 +#define c7 %f26 +#define c8 %f27 + +#define ALPHA %f31 +#endif + + PROLOGUE + SAVESP + +#ifndef __64BIT__ + +#ifdef DOUBLE + st %i3, [%sp + STACK_START + 16] + st %i4, [%sp + STACK_START + 20] + + ld [%sp + STACK_START + 28], INCX + ld [%sp + STACK_START + 32], Y + ld [%sp + STACK_START + 36], INCY +#else + st %i3, [%sp + STACK_START + 16] + ld [%sp + STACK_START + 28], Y + ld [%sp + STACK_START + 32], INCY +#endif + LDF [%sp + STACK_START + 16], ALPHA +#else + ldx [%sp + STACK_START + 56], Y + ldx [%sp + STACK_START + 64], INCY +#ifdef DOUBLE + FMOV %f6, ALPHA +#else + FMOV %f7, ALPHA +#endif +#endif + + sll INCX, BASE_SHIFT, INCX + sll INCY, BASE_SHIFT, INCY + + cmp INCX, SIZE + bne .LL50 + nop + cmp INCY, SIZE + bne .LL50 + nop + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + LDF [X + 1 * SIZE], a2 + LDF [Y + 1 * SIZE], b2 + LDF [X + 2 * SIZE], a3 + LDF [Y + 2 * SIZE], b3 + LDF [X + 3 * SIZE], a4 + LDF [Y + 3 * SIZE], b4 + LDF [X + 4 * SIZE], a5 + LDF [Y + 4 * SIZE], b5 + LDF [X + 5 * SIZE], a6 + LDF [Y + 5 * SIZE], b6 + LDF [X + 6 * SIZE], a7 + LDF [Y + 6 * SIZE], b7 + LDF [X + 7 * SIZE], a8 + LDF [Y + 7 * SIZE], b8 + + FMUL ALPHA, a1, t1 + FMUL ALPHA, a2, t2 + FMUL ALPHA, a3, t3 + FMUL ALPHA, a4, t4 + + FADD b1, t1, c1 + FMUL ALPHA, a5, t1 + FADD b2, t2, c2 + FMUL ALPHA, a6, t2 + + add I, -1, I + cmp I, 0 + ble,pt %icc, .LL12 + nop + +#ifdef DOUBLE +#define PREFETCHSIZE 54 +#else +#define PREFETCHSIZE 108 +#endif + +.LL11: + prefetch [Y + PREFETCHSIZE * SIZE], 0 + + LDF [X + 8 * SIZE], a1 + LDF [X + 9 * SIZE], a2 + LDF [X + 10 * SIZE], a3 + LDF [X + 11 * SIZE], a4 + + FADD b3, t3, c3 + STF c1, [Y + 0 * SIZE] + FMUL ALPHA, a7, t3 + + FADD b4, t4, c4 + STF c2, [Y + 1 * SIZE] + FMUL ALPHA, a8, t4 + + LDF [Y + 8 * SIZE], b1 + LDF [Y + 9 * SIZE], b2 + LDF [Y + 10 * SIZE], b3 + LDF [Y + 11 * SIZE], b4 + + FADD b5, t1, c5 + STF c3, [Y + 2 * SIZE] + FMUL ALPHA, a1, t1 + + FADD b6, t2, c6 + STF c4, [Y + 3 * SIZE] + FMUL ALPHA, a2, t2 + + prefetch [X + PREFETCHSIZE * SIZE], 0 + + LDF [X + 12 * SIZE], a5 + LDF [X + 13 * SIZE], a6 + LDF [X + 14 * SIZE], a7 + LDF [X + 15 * SIZE], a8 + + FADD b7, t3, c7 + STF c5, [Y + 4 * SIZE] + FMUL ALPHA, a3, t3 + + FADD b8, t4, c8 + STF c6, [Y + 5 * SIZE] + FMUL ALPHA, a4, t4 + + LDF [Y + 12 * SIZE], b5 + LDF [Y + 13 * SIZE], b6 + LDF [Y + 14 * SIZE], b7 + LDF [Y + 15 * SIZE], b8 + + FADD b1, t1, c1 + STF c7, [Y + 6 * SIZE] + FMUL ALPHA, a5, t1 + deccc I + + FADD b2, t2, c2 + STF c8, [Y + 7 * SIZE] + FMUL ALPHA, a6, t2 + add Y, 8 * SIZE, Y + + bg,pt %icc, .LL11 + add X, 8 * SIZE, X + +.LL12: + FADD b3, t3, c3 + FMUL ALPHA, a7, t3 + FADD b4, t4, c4 + FMUL ALPHA, a8, t4 + + FADD b5, t1, c5 + FADD b6, t2, c6 + FADD b7, t3, c7 + FADD b8, t4, c8 + + STF c1, [Y + 0 * SIZE] + STF c2, [Y + 1 * SIZE] + STF c3, [Y + 2 * SIZE] + STF c4, [Y + 3 * SIZE] + + STF c5, [Y + 4 * SIZE] + STF c6, [Y + 5 * SIZE] + STF c7, [Y + 6 * SIZE] + STF c8, [Y + 7 * SIZE] + + add Y, 8 * SIZE, Y + add X, 8 * SIZE, X + + +.LL15: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + + FMUL ALPHA, a1, t1 + FADD b1, t1, c1 + + add I, -1, I + cmp I, 0 + STF c1, [Y + 0 * SIZE] + add Y, 1 * SIZE, Y + bg,pt %icc, .LL16 + add X, 1 * SIZE, X + +.LL19: + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL55 + mov Y, YY + + LDF [X + 0 * SIZE], a1 + add I, -1, I + add X, INCX, X + LDF [Y + 0 * SIZE], b1 + cmp I, 0 + add Y, INCY, Y + LDF [X + 0 * SIZE], a2 + add X, INCX, X + LDF [Y + 0 * SIZE], b2 + add Y, INCY, Y + LDF [X + 0 * SIZE], a3 + add X, INCX, X + LDF [Y + 0 * SIZE], b3 + add Y, INCY, Y + LDF [X + 0 * SIZE], a4 + add X, INCX, X + LDF [Y + 0 * SIZE], b4 + add Y, INCY, Y + LDF [X + 0 * SIZE], a5 + add X, INCX, X + LDF [Y + 0 * SIZE], b5 + add Y, INCY, Y + LDF [X + 0 * SIZE], a6 + add X, INCX, X + LDF [Y + 0 * SIZE], b6 + add Y, INCY, Y + LDF [X + 0 * SIZE], a7 + add X, INCX, X + LDF [Y + 0 * SIZE], b7 + add Y, INCY, Y + LDF [X + 0 * SIZE], a8 + add X, INCX, X + LDF [Y + 0 * SIZE], b8 + ble,pt %icc, .LL52 + add Y, INCY, Y + + +.LL51: + FMUL ALPHA, a1, t1 + LDF [X + 0 * SIZE], a1 + add X, INCX, X + + FMUL ALPHA, a2, t2 + LDF [X + 0 * SIZE], a2 + add X, INCX, X + + FMUL ALPHA, a3, t3 + LDF [X + 0 * SIZE], a3 + add X, INCX, X + FMUL ALPHA, a4, t4 + LDF [X + 0 * SIZE], a4 + add X, INCX, X + + FADD b1, t1, c1 + LDF [Y + 0 * SIZE], b1 + add Y, INCY, Y + + FMUL ALPHA, a5, t1 + LDF [X + 0 * SIZE], a5 + add X, INCX, X + FADD b2, t2, c2 + LDF [Y + 0 * SIZE], b2 + add Y, INCY, Y + + FMUL ALPHA, a6, t2 + LDF [X + 0 * SIZE], a6 + add X, INCX, X + FADD b3, t3, c3 + LDF [Y + 0 * SIZE], b3 + add Y, INCY, Y + + FMUL ALPHA, a7, t3 + LDF [X + 0 * SIZE], a7 + add X, INCX, X + FADD b4, t4, c4 + LDF [Y + 0 * SIZE], b4 + add Y, INCY, Y + FMUL ALPHA, a8, t4 + LDF [X + 0 * SIZE], a8 + add X, INCX, X + + STF c1, [YY + 0 * SIZE] + add YY, INCY, YY + FADD b5, t1, c1 + STF c2, [YY + 0 * SIZE] + add YY, INCY, YY + FADD b6, t2, c2 + STF c3, [YY + 0 * SIZE] + add YY, INCY, YY + FADD b7, t3, c3 + STF c4, [YY + 0 * SIZE] + add YY, INCY, YY + FADD b8, t4, c4 + + LDF [Y + 0 * SIZE], b5 + add I, -1, I + add Y, INCY, Y + LDF [Y + 0 * SIZE], b6 + cmp I, 0 + add Y, INCY, Y + LDF [Y + 0 * SIZE], b7 + add Y, INCY, Y + LDF [Y + 0 * SIZE], b8 + add Y, INCY, Y + + STF c1, [YY + 0 * SIZE] + add YY, INCY, YY + STF c2, [YY + 0 * SIZE] + add YY, INCY, YY + STF c3, [YY + 0 * SIZE] + add YY, INCY, YY + STF c4, [YY + 0 * SIZE] + + bg,pt %icc, .LL51 + add YY, INCY, YY + +.LL52: + FMUL ALPHA, a1, t1 + FMUL ALPHA, a2, t2 + FMUL ALPHA, a3, t3 + FMUL ALPHA, a4, t4 + + FADD b1, t1, c1 + FMUL ALPHA, a5, t1 + FADD b2, t2, c2 + FMUL ALPHA, a6, t2 + FADD b3, t3, c3 + FMUL ALPHA, a7, t3 + FADD b4, t4, c4 + FMUL ALPHA, a8, t4 + + STF c1, [YY + 0 * SIZE] + add YY, INCY, YY + FADD b5, t1, c1 + STF c2, [YY + 0 * SIZE] + add YY, INCY, YY + FADD b6, t2, c2 + STF c3, [YY + 0 * SIZE] + add YY, INCY, YY + FADD b7, t3, c3 + STF c4, [YY + 0 * SIZE] + add YY, INCY, YY + FADD b8, t4, c4 + + STF c1, [YY + 0 * SIZE] + add YY, INCY, YY + STF c2, [YY + 0 * SIZE] + add YY, INCY, YY + STF c3, [YY + 0 * SIZE] + add YY, INCY, YY + STF c4, [YY + 0 * SIZE] + add YY, INCY, YY + +.LL55: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + + FMUL ALPHA, a1, t1 + FADD b1, t1, c1 + + add I, -1, I + cmp I, 0 + STF c1, [Y + 0 * SIZE] + add Y, INCY, Y + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/cabs.S b/kernel/sparc/cabs.S new file mode 100644 index 0000000..119293e --- /dev/null +++ b/kernel/sparc/cabs.S @@ -0,0 +1,58 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + + add %sp, -128, %sp + + LDF [%o0 + 0 * SIZE], %f0 + LDF [%o0 + 1 * SIZE], %f8 + FABS %f8, %f8 + FABS %f0, %f0 + FADD %f0, %f8, %f0 +#if !defined(DOUBLE) && defined(F2CCONV) + fstod %f0, %f0 +#endif + retl + sub %sp, -128, %sp + + EPILOGUE + diff --git a/kernel/sparc/cnrm2.S b/kernel/sparc/cnrm2.S new file mode 100644 index 0000000..8dc4b56 --- /dev/null +++ b/kernel/sparc/cnrm2.S @@ -0,0 +1,329 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 + +#define c1 %f0 +#define c2 %f2 +#define c3 %f4 +#define c4 %f6 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 + + PROLOGUE + SAVESP + + FCLR(0) + + FMOV c1, c2 + FMOV c1, c3 + FMOV c1, c4 + FMOV c1, t1 + FMOV c1, t2 + FMOV c1, t3 + FMOV c1, t4 + + cmp INCX, 0 + ble .LL20 + sll INCX, ZBASE_SHIFT, INCX + + cmp N, 0 + ble .LL20 + nop + + cmp INCX, 2 * SIZE + bne .LL50 + nop + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + ld [X + 0 * SIZE], a1 + add I, -1, I + ld [X + 1 * SIZE], a2 + cmp I, 0 + ld [X + 2 * SIZE], a3 + ld [X + 3 * SIZE], a4 + ld [X + 4 * SIZE], a5 + ld [X + 5 * SIZE], a6 + ld [X + 6 * SIZE], a7 + ld [X + 7 * SIZE], a8 + + ble,pt %icc, .LL12 + add X, 8 * SIZE, X + +#define PREFETCHSIZE 40 + +.LL11: + faddd c1, t1, c1 + fsmuld a1, a1, t1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + + faddd c2, t2, c2 + add I, -1, I + fsmuld a2, a2, t2 + ld [X + 0 * SIZE], a1 + + faddd c3, t3, c3 + cmp I, 0 + fsmuld a3, a3, t3 + ld [X + 1 * SIZE], a2 + + faddd c4, t4, c4 + fsmuld a4, a4, t4 + ld [X + 2 * SIZE], a3 + + faddd c1, t1, c1 + fsmuld a5, a5, t1 + ld [X + 3 * SIZE], a4 + + faddd c2, t2, c2 + fsmuld a6, a6, t2 + ld [X + 4 * SIZE], a5 + + faddd c3, t3, c3 + fsmuld a7, a7, t3 + ld [X + 5 * SIZE], a6 + + faddd c4, t4, c4 + ld [X + 6 * SIZE], a7 + fsmuld a8, a8, t4 + add X, 8 * SIZE, X + + bg,pt %icc, .LL11 + ld [X - 1 * SIZE], a8 + +.LL12: + faddd c1, t1, c1 + fsmuld a1, a1, t1 + faddd c2, t2, c2 + fsmuld a2, a2, t2 + + faddd c3, t3, c3 + fsmuld a3, a3, t3 + faddd c4, t4, c4 + fsmuld a4, a4, t4 + + faddd c1, t1, c1 + fsmuld a5, a5, t1 + faddd c2, t2, c2 + fsmuld a6, a6, t2 + + faddd c3, t3, c3 + fsmuld a7, a7, t3 + faddd c4, t4, c4 + fsmuld a8, a8, t4 + +.LL15: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + ld [X + 0 * SIZE], a1 + add I, -1, I + ld [X + 1 * SIZE], a2 + cmp I, 0 + faddd c1, t1, c1 + faddd c2, t2, c2 + fsmuld a1, a1, t1 + fsmuld a2, a2, t2 + bg,pt %icc, .LL16 + add X, 2 * SIZE, X + +.LL19: + faddd c1, t1, c1 + faddd c2, t2, c2 + faddd c3, t3, c3 + faddd c4, t4, c4 + + faddd c1, c2, c1 + faddd c3, c4, c3 + faddd c1, c3, c1 + + fsqrtd c1, c1 + +#if !defined(NEED_F2CCONV) || !defined(F_INTERFACE_F2C) + fdtos c1, c1 +#endif +.LL20: + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + ld [X + 0 * SIZE], a1 + ld [X + 1 * SIZE], a2 + add X, INCX, X + ld [X + 0 * SIZE], a3 + ld [X + 1 * SIZE], a4 + add X, INCX, X + ld [X + 0 * SIZE], a5 + ld [X + 1 * SIZE], a6 + add X, INCX, X + add I, -1, I + ld [X + 0 * SIZE], a7 + cmp I, 0 + ld [X + 1 * SIZE], a8 + + ble,pt %icc, .LL52 + add X, INCX, X + +.LL51: + faddd c1, t1, c1 + add I, -1, I + fsmuld a1, a1, t1 + ld [X + 0 * SIZE], a1 + + faddd c2, t2, c2 + cmp I, 0 + fsmuld a2, a2, t2 + ld [X + 1 * SIZE], a2 + add X, INCX, X + + faddd c3, t3, c3 + fsmuld a3, a3, t3 + ld [X + 0 * SIZE], a3 + + faddd c4, t4, c4 + fsmuld a4, a4, t4 + ld [X + 1 * SIZE], a4 + add X, INCX, X + + faddd c1, t1, c1 + fsmuld a5, a5, t1 + ld [X + 0 * SIZE], a5 + + faddd c2, t2, c2 + fsmuld a6, a6, t2 + ld [X + 1 * SIZE], a6 + add X, INCX, X + + faddd c3, t3, c3 + fsmuld a7, a7, t3 + ld [X + 0 * SIZE], a7 + + faddd c4, t4, c4 + fsmuld a8, a8, t4 + ld [X + 1 * SIZE], a8 + bg,pt %icc, .LL51 + add X, INCX, X + +.LL52: + faddd c1, t1, c1 + fsmuld a1, a1, t1 + faddd c2, t2, c2 + fsmuld a2, a2, t2 + + faddd c3, t3, c3 + fsmuld a3, a3, t3 + faddd c4, t4, c4 + fsmuld a4, a4, t4 + + faddd c1, t1, c1 + fsmuld a5, a5, t1 + faddd c2, t2, c2 + fsmuld a6, a6, t2 + + faddd c3, t3, c3 + fsmuld a7, a7, t3 + faddd c4, t4, c4 + fsmuld a8, a8, t4 + +.LL55: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + ld [X + 0 * SIZE], a1 + add I, -1, I + ld [X + 1 * SIZE], a2 + cmp I, 0 + faddd c1, t1, c1 + faddd c2, t2, c2 + fsmuld a1, a1, t1 + fsmuld a2, a2, t2 + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + faddd c1, t1, c1 + faddd c2, t2, c2 + faddd c3, t3, c3 + faddd c4, t4, c4 + + faddd c1, c2, c1 + faddd c3, c4, c3 + faddd c1, c3, c1 + + fsqrtd c1, c1 + +#if !defined(NEED_F2CCONV) || !defined(F_INTERFACE_F2C) + fdtos c1, c1 +#endif + + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/copy.S b/kernel/sparc/copy.S new file mode 100644 index 0000000..959d2ff --- /dev/null +++ b/kernel/sparc/copy.S @@ -0,0 +1,218 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define Y %i3 +#define INCY %i4 +#define I %i5 + +#ifdef DOUBLE +#define a1 %f0 +#define a2 %f2 +#define a3 %f4 +#define a4 %f6 +#define a5 %f8 +#define a6 %f10 +#define a7 %f12 +#define a8 %f14 +#define a9 %f16 +#define a10 %f18 +#define a11 %f20 +#define a12 %f22 +#define a13 %f24 +#define a14 %f26 +#define a15 %f28 +#define a16 %f30 +#else +#define a1 %f0 +#define a2 %f1 +#define a3 %f2 +#define a4 %f3 +#define a5 %f4 +#define a6 %f5 +#define a7 %f6 +#define a8 %f7 +#define a9 %f8 +#define a10 %f9 +#define a11 %f10 +#define a12 %f11 +#define a13 %f12 +#define a14 %f13 +#define a15 %f14 +#define a16 %f15 +#endif + + PROLOGUE + SAVESP + + sll INCX, BASE_SHIFT, INCX + sll INCY, BASE_SHIFT, INCY + + cmp INCX, SIZE + bne .LL50 + nop + cmp INCY, SIZE + bne .LL50 + nop + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + +#define PREFETCHSIZE 32 + +.LL11: + LDF [X + 0 * SIZE], a1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + LDF [X + 1 * SIZE], a2 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + LDF [X + 4 * SIZE], a5 + LDF [X + 5 * SIZE], a6 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + STF a1, [Y + 0 * SIZE] + prefetch [Y + PREFETCHSIZE * SIZE], 0 + STF a2, [Y + 1 * SIZE] + STF a3, [Y + 2 * SIZE] + STF a4, [Y + 3 * SIZE] + STF a5, [Y + 4 * SIZE] + STF a6, [Y + 5 * SIZE] + STF a7, [Y + 6 * SIZE] + STF a8, [Y + 7 * SIZE] + + add I, -1, I + cmp I, 0 + add Y, 8 * SIZE, Y + add X, 8 * SIZE, X + + bg,pt %icc, .LL11 + nop + + +.LL15: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + add I, -1, I + cmp I, 0 + add X, 1 * SIZE, X + STF a1, [Y + 0 * SIZE] + bg,pt %icc, .LL16 + add Y, 1 * SIZE, Y + +.LL19: + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + +.LL51: + LDF [X + 0 * SIZE], a1 + add X, INCX, X + LDF [X + 0 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + add X, INCX, X + LDF [X + 0 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + add X, INCX, X + LDF [X + 0 * SIZE], a6 + add X, INCX, X + LDF [X + 0 * SIZE], a7 + add X, INCX, X + LDF [X + 0 * SIZE], a8 + add X, INCX, X + + STF a1, [Y + 0 * SIZE] + add Y, INCY, Y + add I, -1, I + STF a2, [Y + 0 * SIZE] + add Y, INCY, Y + cmp I, 0 + STF a3, [Y + 0 * SIZE] + add Y, INCY, Y + STF a4, [Y + 0 * SIZE] + add Y, INCY, Y + STF a5, [Y + 0 * SIZE] + add Y, INCY, Y + STF a6, [Y + 0 * SIZE] + add Y, INCY, Y + STF a7, [Y + 0 * SIZE] + add Y, INCY, Y + STF a8, [Y + 0 * SIZE] + + bg,pt %icc, .LL51 + add Y, INCY, Y + +.LL55: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + add I, -1, I + cmp I, 0 + add X, INCX, X + STF a1, [Y + 0 * SIZE] + bg,pt %icc, .LL56 + add Y, INCY, Y + +.LL59: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/dnrm2.S b/kernel/sparc/dnrm2.S new file mode 100644 index 0000000..8063e23 --- /dev/null +++ b/kernel/sparc/dnrm2.S @@ -0,0 +1,675 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 +#define XX %i4 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define c3 %f4 +#define c4 %f6 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 +#define fmax %f32 +#define fzero %f34 +#define fone %f36 +#else +#define c1 %f0 +#define c2 %f1 +#define c3 %f2 +#define c4 %f3 +#define t1 %f4 +#define t2 %f5 +#define t3 %f6 +#define t4 %f7 + +#define a1 %f8 +#define a2 %f9 +#define a3 %f10 +#define a4 %f11 +#define a5 %f12 +#define a6 %f13 +#define a7 %f14 +#define a8 %f15 +#define fmax %f16 +#define fzero %f17 +#define fone %f18 +#endif + + PROLOGUE + SAVESP + +#ifdef DOUBLE + FCLR(3) +#else + FCLR(17) +#endif + + mov X, XX + mov 0x3ff, %g1 + sll %g1, 20, %g1 + + cmp N, 0 + ble .LL99 + FMOV fzero, c1 + + cmp INCX, 0 + ble .LL99 + sll INCX, BASE_SHIFT, INCX + + add %sp, -8, %sp + st %g1, [%sp + STACK_START + 0] + st %g0, [%sp + STACK_START + 4] + + add N, -1, N + LDF [X], c4 + add X, INCX, X + + LDF [%sp + STACK_START], fone + add %sp, 8, %sp + + FABS c4, c1 + FABS c4, c2 + FABS c4, c3 + FABS c4, c4 + + cmp INCX, SIZE + bne .LL100 + nop + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + + LDF [X + 4 * SIZE], a5 + add I, -1, I + LDF [X + 5 * SIZE], a6 + cmp I, 0 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + ble,pt %icc, .LL12 + add X, 8 * SIZE, X + +#define PREFETCHSIZE 40 + +.LL11: + FABS a1, t1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + FABS a2, t2 + LDF [X + 0 * SIZE], a1 + FABS a3, t3 + LDF [X + 1 * SIZE], a2 + FABS a4, t4 + LDF [X + 2 * SIZE], a3 + + FCMP %fcc0, t1, c1 + LDF [X + 3 * SIZE], a4 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + FMOVG %fcc1, t2, c2 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + + FABS a5, t1 + LDF [X + 4 * SIZE], a5 + FABS a6, t2 + LDF [X + 5 * SIZE], a6 + FABS a7, t3 + LDF [X + 6 * SIZE], a7 + FABS a8, t4 + LDF [X + 7 * SIZE], a8 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + add I, -1, I + FMOVG %fcc1, t2, c2 + cmp I, 0 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + + bg,pt %icc, .LL11 + add X, 8 * SIZE, X + +.LL12: + FABS a1, t1 + FABS a2, t2 + FABS a3, t3 + FABS a4, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + FMOVG %fcc1, t2, c2 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + + FABS a5, t1 + FABS a6, t2 + FABS a7, t3 + FABS a8, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + FMOVG %fcc1, t2, c2 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + +.LL15: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + FABS a1, t1 + + FCMP %fcc0, t1, c1 + FMOVG %fcc0, t1, c1 + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL16 + add X, 1 * SIZE, X + +.LL19: + + FCMP %fcc0, c2, c1 + FCMP %fcc1, c4, c3 + mov XX, X + FMOVG %fcc0, c2, c1 + FMOVG %fcc1, c4, c3 + FCMP %fcc0, c3, c1 + FMOVG %fcc0, c3, c1 + + FCMP c1, fzero + fbe .LL99 + nop + + FMOV c1, fmax + add N, 1, N + FDIV fone, c1, fone + + FMOV fzero, c1 + FMOV fzero, c2 + FMOV fzero, c3 + FMOV fzero, c4 + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL35 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + + LDF [X + 4 * SIZE], a5 + add I, -1, I + LDF [X + 5 * SIZE], a6 + cmp I, 0 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + ble,pt %icc, .LL32 + add X, 8 * SIZE, X + +.LL31: + FMUL fone, a1, t1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + FMUL fone, a2, t2 + LDF [X + 0 * SIZE], a1 + FMUL fone, a3, t3 + LDF [X + 1 * SIZE], a2 + FMUL fone, a4, t4 + LDF [X + 2 * SIZE], a3 + + FMUL t1, t1, t1 + LDF [X + 3 * SIZE], a4 + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + FMUL fone, a5, t1 + LDF [X + 4 * SIZE], a5 + FADD c2, t2, c2 + FMUL fone, a6, t2 + LDF [X + 5 * SIZE], a6 + FADD c3, t3, c3 + FMUL fone, a7, t3 + LDF [X + 6 * SIZE], a7 + FADD c4, t4, c4 + FMUL fone, a8, t4 + LDF [X + 7 * SIZE], a8 + + FMUL t1, t1, t1 + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + add I, -1, I + FADD c2, t2, c2 + cmp I, 0 + FADD c3, t3, c3 + FADD c4, t4, c4 + + bg,pt %icc, .LL31 + add X, 8 * SIZE, X + +.LL32: + FMUL fone, a1, t1 + FMUL fone, a2, t2 + FMUL fone, a3, t3 + FMUL fone, a4, t4 + + FMUL t1, t1, t1 + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + FMUL fone, a5, t1 + FADD c2, t2, c2 + FMUL fone, a6, t2 + FADD c3, t3, c3 + FMUL fone, a7, t3 + FADD c4, t4, c4 + FMUL fone, a8, t4 + + FMUL t1, t1, t1 + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c3, t3, c3 + FADD c4, t4, c4 + +.LL35: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL39 + nop + +.LL36: + LDF [X + 0 * SIZE], a1 + FMUL fone, a1, t1 + FMUL t1, t1, t1 + FADD c1, t1, c1 + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL36 + add X, 1 * SIZE, X + +.LL39: + FADD c1, c2, c1 + FADD c3, c4, c3 + FADD c1, c3, c1 + + FSQRT c1, c1 + FMUL fmax, c1, c1 + +.LL99: + return %i7 + 8 + clr %g0 + +.LL100: + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL105 + nop + + LDF [X + 0 * SIZE], a1 + add X, INCX, X + LDF [X + 0 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + add X, INCX, X + LDF [X + 0 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + add X, INCX, X + LDF [X + 0 * SIZE], a6 + add X, INCX, X + add I, -1, I + LDF [X + 0 * SIZE], a7 + cmp I, 0 + add X, INCX, X + LDF [X + 0 * SIZE], a8 + ble,pt %icc, .LL102 + add X, INCX, X + +.LL101: + FABS a1, t1 + LDF [X + 0 * SIZE], a1 + add X, INCX, X + FABS a2, t2 + LDF [X + 0 * SIZE], a2 + add X, INCX, X + FABS a3, t3 + LDF [X + 0 * SIZE], a3 + add X, INCX, X + FABS a4, t4 + LDF [X + 0 * SIZE], a4 + add X, INCX, X + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + FMOVG %fcc1, t2, c2 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + + FABS a5, t1 + LDF [X + 0 * SIZE], a5 + add X, INCX, X + FABS a6, t2 + LDF [X + 0 * SIZE], a6 + add X, INCX, X + FABS a7, t3 + LDF [X + 0 * SIZE], a7 + add X, INCX, X + FABS a8, t4 + LDF [X + 0 * SIZE], a8 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + add I, -1, I + FMOVG %fcc1, t2, c2 + cmp I, 0 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + + bg,pt %icc, .LL101 + add X, INCX, X + +.LL102: + FABS a1, t1 + FABS a2, t2 + FABS a3, t3 + FABS a4, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + FMOVG %fcc1, t2, c2 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + + FABS a5, t1 + FABS a6, t2 + FABS a7, t3 + FABS a8, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + FMOVG %fcc1, t2, c2 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + +.LL105: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL109 + nop + +.LL106: + LDF [X + 0 * SIZE], a1 + FABS a1, t1 + FCMP %fcc0, t1, c1 + FMOVG %fcc0, t1, c1 + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL106 + add X, INCX, X + +.LL109: + FCMP %fcc0, c2, c1 + FCMP %fcc1, c4, c3 + mov XX, X + FMOVG %fcc0, c2, c1 + FMOVG %fcc1, c4, c3 + FCMP %fcc0, c3, c1 + FMOVG %fcc0, c3, c1 + + FCMP c1, fzero + fbe .LL99 + nop + + FMOV c1, fmax + FDIV fone, c1, fone + + FMOV fzero, c1 + FMOV fzero, c2 + FMOV fzero, c3 + FMOV fzero, c4 + + add N, 1, N + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL135 + nop + + LDF [X + 0 * SIZE], a1 + add X, INCX, X + LDF [X + 0 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + add X, INCX, X + LDF [X + 0 * SIZE], a4 + add X, INCX, X + + LDF [X + 0 * SIZE], a5 + add X, INCX, X + add I, -1, I + LDF [X + 0 * SIZE], a6 + add X, INCX, X + cmp I, 0 + LDF [X + 0 * SIZE], a7 + add X, INCX, X + LDF [X + 0 * SIZE], a8 + + ble,pt %icc, .LL132 + add X, INCX, X + +.LL131: + FMUL fone, a1, t1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + FMUL fone, a2, t2 + LDF [X + 0 * SIZE], a1 + add X, INCX, X + FMUL fone, a3, t3 + LDF [X + 0 * SIZE], a2 + add X, INCX, X + FMUL fone, a4, t4 + LDF [X + 0 * SIZE], a3 + add X, INCX, X + + FMUL t1, t1, t1 + LDF [X + 0 * SIZE], a4 + add X, INCX, X + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + FMUL fone, a5, t1 + LDF [X + 0 * SIZE], a5 + add X, INCX, X + FADD c2, t2, c2 + FMUL fone, a6, t2 + LDF [X + 0 * SIZE], a6 + add X, INCX, X + FADD c3, t3, c3 + FMUL fone, a7, t3 + LDF [X + 0 * SIZE], a7 + add X, INCX, X + FADD c4, t4, c4 + FMUL fone, a8, t4 + LDF [X + 0 * SIZE], a8 + + FMUL t1, t1, t1 + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + add I, -1, I + FADD c2, t2, c2 + cmp I, 0 + FADD c3, t3, c3 + FADD c4, t4, c4 + + bg,pt %icc, .LL131 + add X, INCX, X + +.LL132: + FMUL fone, a1, t1 + FMUL fone, a2, t2 + FMUL fone, a3, t3 + FMUL fone, a4, t4 + + FMUL t1, t1, t1 + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + FMUL fone, a5, t1 + FADD c2, t2, c2 + FMUL fone, a6, t2 + FADD c3, t3, c3 + FMUL fone, a7, t3 + FADD c4, t4, c4 + FMUL fone, a8, t4 + + FMUL t1, t1, t1 + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c3, t3, c3 + FADD c4, t4, c4 + +.LL135: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL139 + nop + +.LL136: + LDF [X + 0 * SIZE], a1 + FMUL fone, a1, t1 + FMUL t1, t1, t1 + FADD c1, t1, c1 + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL136 + add X, INCX, X + +.LL139: + FADD c1, c2, c1 + FADD c3, c4, c3 + FADD c1, c3, c1 + + FSQRT c1, c1 + FMUL fmax, c1, c1 + + return %i7 + 8 + clr %g0 + + EPILOGUE diff --git a/kernel/sparc/dot.S b/kernel/sparc/dot.S new file mode 100644 index 0000000..f89d5f9 --- /dev/null +++ b/kernel/sparc/dot.S @@ -0,0 +1,423 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define Y %i3 +#define INCY %i4 +#define I %i5 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define t1 %f4 +#define t2 %f6 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 + +#define b1 %f32 +#define b2 %f34 +#define b3 %f36 +#define b4 %f38 +#define b5 %f40 +#define b6 %f42 +#define b7 %f44 +#define b8 %f46 +#else +#define c1 %f0 +#define c2 %f1 +#define t1 %f4 +#define t2 %f5 + +#define a1 %f8 +#define a2 %f9 +#define a3 %f10 +#define a4 %f11 +#define a5 %f12 +#define a6 %f13 +#define a7 %f14 +#define a8 %f15 + +#define b1 %f16 +#define b2 %f17 +#define b3 %f18 +#define b4 %f19 +#define b5 %f20 +#define b6 %f21 +#define b7 %f22 +#define b8 %f23 +#endif + + PROLOGUE + SAVESP + +#ifdef DOUBLE + FCLR(0) + FCLR(2) + FCLR(4) + FCLR(6) +#else + FCLR(0) + FCLR(1) + FCLR(4) + FCLR(5) +#endif + + cmp N, 0 + ble .LL19 + nop + + sll INCX, BASE_SHIFT, INCX + sll INCY, BASE_SHIFT, INCY + + cmp INCX, SIZE + bne .LL50 + nop + + cmp INCY, SIZE + bne .LL50 + nop + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + + LDF [X + 1 * SIZE], a2 + LDF [Y + 1 * SIZE], b2 + + LDF [X + 2 * SIZE], a3 + LDF [Y + 2 * SIZE], b3 + + LDF [X + 3 * SIZE], a4 + LDF [Y + 3 * SIZE], b4 + + LDF [X + 4 * SIZE], a5 + LDF [Y + 4 * SIZE], b5 + + LDF [X + 5 * SIZE], a6 + LDF [Y + 5 * SIZE], b6 + + LDF [X + 6 * SIZE], a7 + add I, -1, I + LDF [Y + 6 * SIZE], b7 + cmp I, 0 + + LDF [X + 7 * SIZE], a8 + add X, 8 * SIZE, X + LDF [Y + 7 * SIZE], b8 + add Y, 8 * SIZE, Y + + ble,pt %icc, .LL12 + nop + +#define PREFETCHSIZE 40 + +.LL11: + prefetch [X + PREFETCHSIZE * SIZE], 0 + FADD c1, t1, c1 + prefetch [Y + PREFETCHSIZE * SIZE], 0 + FMUL a1, b1, t1 + + LDF [X + 0 * SIZE], a1 + FADD c2, t2, c2 + FMUL a2, b2, t2 + LDF [Y + 0 * SIZE], b1 + add I, -1, I + + LDF [X + 1 * SIZE], a2 + FADD c1, t1, c1 + FMUL a3, b3, t1 + LDF [Y + 1 * SIZE], b2 + cmp I, 0 + + LDF [X + 2 * SIZE], a3 + FADD c2, t2, c2 + FMUL a4, b4, t2 + LDF [Y + 2 * SIZE], b3 + + LDF [X + 3 * SIZE], a4 + FADD c1, t1, c1 + FMUL a5, b5, t1 + LDF [Y + 3 * SIZE], b4 + + LDF [X + 4 * SIZE], a5 + FADD c2, t2, c2 + FMUL a6, b6, t2 + LDF [Y + 4 * SIZE], b5 + + LDF [X + 5 * SIZE], a6 + FADD c1, t1, c1 + FMUL a7, b7, t1 + LDF [Y + 5 * SIZE], b6 + + LDF [X + 6 * SIZE], a7 + FADD c2, t2, c2 + FMUL a8, b8, t2 + LDF [Y + 6 * SIZE], b7 + add Y, 8 * SIZE, Y + + LDF [X + 7 * SIZE], a8 + add X, 8 * SIZE, X + bg,pt %icc, .LL11 + LDF [Y - 1 * SIZE], b8 + +.LL12: + FADD c1, t1, c1 + FMUL a1, b1, t1 + + FADD c2, t2, c2 + FMUL a2, b2, t2 + + FADD c1, t1, c1 + FMUL a3, b3, t1 + FADD c2, t2, c2 + FMUL a4, b4, t2 + + FADD c1, t1, c1 + FMUL a5, b5, t1 + FADD c2, t2, c2 + FMUL a6, b6, t2 + + FADD c1, t1, c1 + FMUL a7, b7, t1 + FADD c2, t2, c2 + FMUL a8, b8, t2 + +.LL15: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + add I, -1, I + LDF [Y + 0 * SIZE], b1 + cmp I, 0 + add X, 1 * SIZE, X + FADD c1, t1, c1 + FMUL a1, b1, t1 + bg,pt %icc, .LL16 + add Y, 1 * SIZE, Y + +.LL19: + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c1, c2, c1 + + return %i7 + 8 + nop + +.LL50: + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + LDF [X + 0 * SIZE], a1 + add X, INCX, X + LDF [Y + 0 * SIZE], b1 + add Y, INCY, Y + + LDF [X + 0 * SIZE], a2 + add X, INCX, X + LDF [Y + 0 * SIZE], b2 + add Y, INCY, Y + + LDF [X + 0 * SIZE], a3 + add X, INCX, X + LDF [Y + 0 * SIZE], b3 + add Y, INCY, Y + + LDF [X + 0 * SIZE], a4 + add X, INCX, X + LDF [Y + 0 * SIZE], b4 + add Y, INCY, Y + + LDF [X + 0 * SIZE], a5 + add X, INCX, X + LDF [Y + 0 * SIZE], b5 + add Y, INCY, Y + + LDF [X + 0 * SIZE], a6 + add X, INCX, X + LDF [Y + 0 * SIZE], b6 + add Y, INCY, Y + + LDF [X + 0 * SIZE], a7 + add X, INCX, X + LDF [Y + 0 * SIZE], b7 + add Y, INCY, Y + + LDF [X + 0 * SIZE], a8 + add X, INCX, X + LDF [Y + 0 * SIZE], b8 + add Y, INCY, Y + + add I, -1, I + cmp I, 0 + ble,pt %icc, .LL52 + nop + +.LL51: + FADD c1, t1, c1 + FMUL a1, b1, t1 + + LDF [X + 0 * SIZE], a1 + FADD c2, t2, c2 + add X, INCX, X + FMUL a2, b2, t2 + LDF [Y + 0 * SIZE], b1 + add Y, INCY, Y + + LDF [X + 0 * SIZE], a2 + FADD c1, t1, c1 + add X, INCX, X + FMUL a3, b3, t1 + LDF [Y + 0 * SIZE], b2 + add Y, INCY, Y + add I, -1, I + + LDF [X + 0 * SIZE], a3 + add X, INCX, X + FADD c2, t2, c2 + FMUL a4, b4, t2 + LDF [Y + 0 * SIZE], b3 + add Y, INCY, Y + cmp I, 0 + + LDF [X + 0 * SIZE], a4 + add X, INCX, X + FADD c1, t1, c1 + FMUL a5, b5, t1 + LDF [Y + 0 * SIZE], b4 + add Y, INCY, Y + + LDF [X + 0 * SIZE], a5 + add X, INCX, X + FADD c2, t2, c2 + FMUL a6, b6, t2 + LDF [Y + 0 * SIZE], b5 + add Y, INCY, Y + + LDF [X + 0 * SIZE], a6 + add X, INCX, X + FADD c1, t1, c1 + FMUL a7, b7, t1 + LDF [Y + 0 * SIZE], b6 + add Y, INCY, Y + + LDF [X + 0 * SIZE], a7 + add X, INCX, X + FADD c2, t2, c2 + FMUL a8, b8, t2 + LDF [Y + 0 * SIZE], b7 + add Y, INCY, Y + + LDF [X + 0 * SIZE], a8 + add X, INCX, X + LDF [Y + 0 * SIZE], b8 + bg,pt %icc, .LL51 + add Y, INCY, Y + +.LL52: + FADD c1, t1, c1 + FMUL a1, b1, t1 + FADD c2, t2, c2 + FMUL a2, b2, t2 + + FADD c1, t1, c1 + FMUL a3, b3, t1 + FADD c2, t2, c2 + FMUL a4, b4, t2 + + FADD c1, t1, c1 + FMUL a5, b5, t1 + FADD c2, t2, c2 + FMUL a6, b6, t2 + + FADD c1, t1, c1 + FMUL a7, b7, t1 + FADD c2, t2, c2 + FMUL a8, b8, t2 + +.LL55: + and N, 7, I + cmp I, 0 + ble %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + add X, INCX, X + add Y, INCY, Y + + FADD c1, t1, c1 + FMUL a1, b1, t1 + + addcc I, -1, I + bg %icc, .LL56 + nop + + +.LL59: + FADD c1, t1, c1 + FADD c2, t2, c2 + + FADD c1, c2, c1 + + return %i7 + 8 + nop + + EPILOGUE diff --git a/kernel/sparc/gemm_kernel.S b/kernel/sparc/gemm_kernel.S new file mode 100644 index 0000000..b663243 --- /dev/null +++ b/kernel/sparc/gemm_kernel.S @@ -0,0 +1,3054 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 +#define K %i2 + +#if defined(DOUBLE) && !defined(__64BIT__) +#define A %i5 +#define B %i4 +#else +#define A %i4 +#define B %i5 +#endif + +#define C %o4 +#define LDC %o5 + +#define AO %l0 +#define BO %l1 +#define I %l2 +#define J %l3 +#define L %l4 + +#define C1 %o0 +#define C2 %o1 +#define C3 %o2 +#define C4 %o3 + +#define OFFSET %l5 +#define KK %l6 +#define TEMP1 %l7 +#define TEMP2 %i3 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define t1 %f32 +#define t2 %f34 +#define t3 %f36 +#define t4 %f38 + +#define a1 %f40 +#define a2 %f42 +#define a3 %f44 +#define a4 %f46 +#define a5 %f58 + +#define b1 %f48 +#define b2 %f50 +#define b3 %f52 +#define b4 %f54 +#define b5 %f56 + +#define FZERO %f60 +#define ALPHA %f62 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define t1 %f16 +#define t2 %f17 +#define t3 %f18 +#define t4 %f19 + +#define a1 %f20 +#define a2 %f21 +#define a3 %f22 +#define a4 %f23 +#define a5 %f31 + +#define b1 %f24 +#define b2 %f25 +#define b3 %f26 +#define b4 %f27 +#define b5 %f28 + +#define FZERO %f29 +#define ALPHA %f30 +#endif + + PROLOGUE + SAVESP + nop + +#ifndef __64BIT__ + +#ifdef DOUBLE + st %i3, [%sp + STACK_START + 16] /* ALPHA */ + st %i4, [%sp + STACK_START + 20] + + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC +#ifdef TRMMKERNEL + ld [%sp + STACK_START + 40], OFFSET +#endif +#else + st %i3, [%sp + STACK_START + 16] /* ALPHA */ + + ld [%sp + STACK_START + 28], C + ld [%sp + STACK_START + 32], LDC +#ifdef TRMMKERNEL + ld [%sp + STACK_START + 36], OFFSET +#endif +#endif + LDF [%sp + STACK_START + 16], ALPHA +#else + ldx [%sp+ STACK_START + 56], C + ldx [%sp+ STACK_START + 64], LDC +#ifdef TRMMKERNEL + ldx [%sp+ STACK_START + 72], OFFSET +#endif +#ifdef DOUBLE + FMOV %f6, ALPHA +#else + FMOV %f7, ALPHA +#endif +#endif + + FCLR(29) + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg OFFSET, KK +#endif + + sra N, 2, J + cmp J, 0 + ble,pn %icc, .LL100 + sll LDC, BASE_SHIFT, LDC + +.LL11: + add C, LDC, C2 + FMOV FZERO, t1 + nop + mov C, C1 + + add C2, LDC, C3 + FMOV FZERO, t2 + sra K, 2, L + mov A, AO + + sra M, 2, I + add C3, LDC, C4 + FMOV FZERO, t3 + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + cmp I, 0 + add C4, LDC, C + FMOV FZERO, t4 + + ble,pn %icc, .LL50 + FMOV FZERO, c01 + +.LL21: +#if !defined(TRMMKERNEL) + FMOV FZERO, c02 + mov B, BO + + FMOV FZERO, c03 + cmp L, 0 +#else + FMOV FZERO, c02 + FMOV FZERO, c03 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO +#else + sll KK, 2 + BASE_SHIFT, TEMP1 + + add AO, TEMP1, AO + add B, TEMP1, BO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 4, L +#else + add KK, 4, L +#endif + sra L, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c04 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c05 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c06 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c07 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c08 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c09 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c10 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c11 + LDF [BO + 4 * SIZE], b5 /* ***** */ + + LDF [AO + 4 * SIZE], a5 /* ***** */ + + prefetch [C1 + 3 * SIZE], 3 + FMOV FZERO, c12 + prefetch [C2 + 3 * SIZE], 3 + FMOV FZERO, c13 + prefetch [C3 + 3 * SIZE], 3 + FMOV FZERO, c14 + prefetch [C4 + 3 * SIZE], 3 + FMOV FZERO, c15 + + ble,pn %icc, .LL25 + FMOV FZERO, c16 + + +#define APREFETCHSIZE 40 +#define BPREFETCHSIZE 40 + +#define APREFETCH_CATEGORY 0 +#define BPREFETCH_CATEGORY 0 + +.LL22: + FADD c04, t1, c04 + prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY + FMUL a1, b1, t1 + nop + + FADD c08, t2, c08 + prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY + FMUL a1, b2, t2 + add AO, 16 * SIZE, AO + + FADD c12, t3, c12 + LDF [AO - 13 * SIZE], a4 + FMUL a1, b3, t3 + add BO, 16 * SIZE, BO + + FADD c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 8 * SIZE], a1 + + FADD c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + add L, -1, L + FMUL a2, b4, t4 + LDF [AO - 11 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 10 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 10 * SIZE], b3 + + FADD c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 9 * SIZE], a4 + + FADD c08, t2, c08 + nop + FMUL a5, b2, t2 + nop + + FADD c12, t3, c12 + nop + FMUL a5, b3, t3 + nop + + FADD c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO - 4 * SIZE], a5 + + FADD c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 6 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b5, t1 + LDF [BO - 4 * SIZE], b5 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + + FADD c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD c04, t1, c04 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD c08, t2, c08 + nop + FMUL a1, b2, t2 + nop + + FADD c12, t3, c12 + nop + FMUL a1, b3, t3 + nop + + FADD c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 0 * SIZE], a1 + + FADD c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + +#ifdef DOUBLE + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY +#else + nop +#endif + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + nop + + FADD c02, t1, c02 + nop + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + + FADD c06, t2, c06 +#ifdef DOUBLE + prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY +#else + nop +#endif + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 0 * SIZE], b1 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 1 * SIZE], a4 + + FADD c08, t2, c08 + FMUL a5, b2, t2 + FADD c12, t3, c12 + FMUL a5, b3, t3 + + FADD c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO + 4 * SIZE], a5 + + FADD c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD c03, t1, c03 + cmp L, 0 + FMUL a4, b5, t1 + LDF [BO + 4 * SIZE], b5 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL22 + LDF [BO + 3 * SIZE], b4 + +.LL25: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 4, L +#else + add KK, 4, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL29 + nop + +.LL26: + FADD c04, t1, c04 + LDF [AO + 3 * SIZE], a4 + FMUL a1, b1, t1 + add AO, 4 * SIZE, AO + + FADD c08, t2, c08 + add BO, 4 * SIZE, BO + FMUL a1, b2, t2 + add L, -1, L + + FADD c12, t3, c12 + nop + FMUL a1, b3, t3 + cmp L, 0 + + FADD c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL26 + LDF [BO + 3 * SIZE], b4 + +.LL29: +#ifndef TRMMKERNEL + FADD c04, t1, c04 + add I, -1, I + FMUL c01, ALPHA, c01 + LDF [C1 + 0 * SIZE], a1 + + FADD c08, t2, c08 + cmp I, 0 + FMUL c02, ALPHA, c02 + LDF [C1 + 1 * SIZE], a2 + + FADD c12, t3, c12 + nop + FMUL c03, ALPHA, c03 + LDF [C1 + 2 * SIZE], a3 + + FADD c16, t4, c16 + nop + FMUL c04, ALPHA, c04 + LDF [C1 + 3 * SIZE], a4 + + FMUL c05, ALPHA, c05 + LDF [C2 + 0 * SIZE], b1 + FMUL c06, ALPHA, c06 + LDF [C2 + 1 * SIZE], b2 + + FMUL c07, ALPHA, c07 + LDF [C2 + 2 * SIZE], b3 + FMUL c08, ALPHA, c08 + LDF [C2 + 3 * SIZE], b4 + + FMUL c09, ALPHA, c09 + LDF [C3 + 0 * SIZE], t1 + FMUL c10, ALPHA, c10 + LDF [C3 + 1 * SIZE], t2 + + FMUL c11, ALPHA, c11 + LDF [C3 + 2 * SIZE], t3 + FMUL c12, ALPHA, c12 + LDF [C3 + 3 * SIZE], t4 + + FMUL c13, ALPHA, c13 + add C1, 4 * SIZE, C1 + FADD c01, a1, c01 + LDF [C4 + 0 * SIZE], a1 + + FMUL c14, ALPHA, c14 + add C2, 4 * SIZE, C2 + FADD c02, a2, c02 + LDF [C4 + 1 * SIZE], a2 + + FMUL c15, ALPHA, c15 + add C3, 4 * SIZE, C3 + FADD c03, a3, c03 + LDF [C4 + 2 * SIZE], a3 + + FMUL c16, ALPHA, c16 + nop + FADD c04, a4, c04 + LDF [C4 + 3 * SIZE], a4 + + STF c01, [C1 - 4 * SIZE] + FADD c05, b1, c05 + STF c02, [C1 - 3 * SIZE] + FADD c06, b2, c06 + + STF c03, [C1 - 2 * SIZE] + FADD c07, b3, c07 + STF c04, [C1 - 1 * SIZE] + FADD c08, b4, c08 + + STF c05, [C2 - 4 * SIZE] + FADD c09, t1, c09 + STF c06, [C2 - 3 * SIZE] + FADD c10, t2, c10 + + STF c07, [C2 - 2 * SIZE] + FADD c11, t3, c11 + STF c08, [C2 - 1 * SIZE] + FADD c12, t4, c12 + + STF c09, [C3 - 4 * SIZE] + FADD c13, a1, c13 + STF c10, [C3 - 3 * SIZE] + FADD c14, a2, c14 + + STF c11, [C3 - 2 * SIZE] + FADD c15, a3, c15 + STF c12, [C3 - 1 * SIZE] + FADD c16, a4, c16 + + STF c13, [C4 + 0 * SIZE] + FMOV FZERO, t1 + STF c14, [C4 + 1 * SIZE] + FMOV FZERO, t2 + + STF c15, [C4 + 2 * SIZE] + FMOV FZERO, t3 + STF c16, [C4 + 3 * SIZE] + FMOV FZERO, t4 + + add C4, 4 * SIZE, C4 +#else + + FADD c04, t1, c04 + FMUL c01, ALPHA, c01 + FADD c08, t2, c08 + FMUL c02, ALPHA, c02 + FADD c12, t3, c12 + FMUL c03, ALPHA, c03 + FADD c16, t4, c16 + FMUL c04, ALPHA, c04 + + STF c01, [C1 + 0 * SIZE] + FMUL c05, ALPHA, c05 + STF c02, [C1 + 1 * SIZE] + FMUL c06, ALPHA, c06 + STF c03, [C1 + 2 * SIZE] + FMUL c07, ALPHA, c07 + STF c04, [C1 + 3 * SIZE] + FMUL c08, ALPHA, c08 + + STF c05, [C2 + 0 * SIZE] + FMUL c09, ALPHA, c09 + STF c06, [C2 + 1 * SIZE] + FMUL c10, ALPHA, c10 + STF c07, [C2 + 2 * SIZE] + FMUL c11, ALPHA, c11 + STF c08, [C2 + 3 * SIZE] + FMUL c12, ALPHA, c12 + + STF c09, [C3 + 0 * SIZE] + FMUL c13, ALPHA, c13 + STF c10, [C3 + 1 * SIZE] + FMUL c14, ALPHA, c14 + STF c11, [C3 + 2 * SIZE] + FMUL c15, ALPHA, c15 + STF c12, [C3 + 3 * SIZE] + FMUL c16, ALPHA, c16 + + STF c13, [C4 + 0 * SIZE] + STF c14, [C4 + 1 * SIZE] + STF c15, [C4 + 2 * SIZE] + STF c16, [C4 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + + add C1, 4 * SIZE, C1 + add C2, 4 * SIZE, C2 + add C3, 4 * SIZE, C3 + add C4, 4 * SIZE, C4 + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -4, TEMP1 +#else + add TEMP1, -4, TEMP1 +#endif + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 4, KK +#endif + + add I, -1, I + cmp I, 0 + +#endif + + sra K, 2, L + bg,pt %icc, .LL21 + FMOV FZERO, c01 + +.LL50: + and M, 2, I + FMOV FZERO, c02 + cmp I, 0 + + FMOV FZERO, t1 + ble,pn %icc, .LL70 + FMOV FZERO, c04 + +#if !defined(TRMMKERNEL) + LDF [AO + 0 * SIZE], a1 + sra K, 2, L + FMOV FZERO, t2 + LDF [B + 0 * SIZE], b1 + mov B, BO + FMOV FZERO, c06 + LDF [AO + 1 * SIZE], a2 + cmp L, 0 + FMOV FZERO, t3 + LDF [B + 1 * SIZE], b2 + FMOV FZERO, c08 + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t4 + LDF [B + 2 * SIZE], b3 + FMOV FZERO, c01 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c03 + LDF [B + 3 * SIZE], b4 + FMOV FZERO, c05 +#else + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO +#else + sll KK, 1 + BASE_SHIFT, TEMP1 + sll KK, 2 + BASE_SHIFT, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 4, L +#endif + sra L, 2, L + cmp L, 0 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t2 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c06 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t3 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c08 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t4 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c01 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c03 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c05 + +#endif + ble,pn %icc, .LL55 + FMOV FZERO, c07 + +.LL52: + FADD c02, t1, c02 + add AO, 8 * SIZE, AO + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FMUL a1, b1, t1 + add BO, 16 * SIZE, BO + + FADD c04, t2, c04 + add L, -1, L + FMUL a1, b2, t2 + + FADD c06, t3, c06 + cmp L, 0 + FMUL a1, b3, t3 + + FADD c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO - 4 * SIZE], a1 + + FADD c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 12 * SIZE], b1 + FADD c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 10 * SIZE], b3 + FADD c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + FADD c04, t2, c04 + FMUL a3, b2, t2 + + FADD c06, t3, c06 + FMUL a3, b3, t3 + FADD c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + FADD c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + FADD c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD c02, t1, c02 + FMUL a1, b1, t1 + LDF [AO - 1 * SIZE], a4 + FADD c04, t2, c04 + FMUL a1, b2, t2 + + FADD c06, t3, c06 + FMUL a1, b3, t3 + FADD c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 4 * SIZE], b1 + + FADD c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 2 * SIZE], b3 + FADD c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO + 1 * SIZE], a2 + FADD c04, t2, c04 + FMUL a3, b2, t2 + + FADD c06, t3, c06 + FMUL a3, b3, t3 + FADD c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL52 + LDF [AO + 3 * SIZE], a4 + +.LL55: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 4, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + FADD c02, t1, c02 + add AO, 2 * SIZE, AO + FMUL a1, b1, t1 + add L, -1, L + + add BO, 4 * SIZE, BO + FADD c04, t2, c04 + cmp L, 0 + FMUL a1, b2, t2 + + FADD c06, t3, c06 + FMUL a1, b3, t3 + FADD c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL56 + LDF [AO + 1 * SIZE], a2 + +.LL59: +#ifndef TRMMKERNEL + FADD c02, t1, c02 + FMUL c01, ALPHA, c01 + LDF [C1 + 0 * SIZE], a1 + FADD c04, t2, c04 + FMUL c03, ALPHA, c03 + LDF [C1 + 1 * SIZE], a2 + FADD c06, t3, c06 + FMUL c05, ALPHA, c05 + LDF [C2 + 0 * SIZE], a3 + FADD c08, t4, c08 + FMUL c07, ALPHA, c07 + LDF [C2 + 1 * SIZE], a4 + + FMUL c02, ALPHA, c02 + FADD c01, a1, c01 + LDF [C3 + 0 * SIZE], b1 + + FMUL c04, ALPHA, c04 + FADD c02, a2, c02 + LDF [C3 + 1 * SIZE], b2 + + FMUL c06, ALPHA, c06 + FADD c03, a3, c03 + LDF [C4 + 0 * SIZE], b3 + + FMUL c08, ALPHA, c08 + FADD c04, a4, c04 + LDF [C4 + 1 * SIZE], b4 + + STF c01, [C1 + 0 * SIZE] + FADD c05, b1, c05 + STF c02, [C1 + 1 * SIZE] + FADD c06, b2, c06 + add C1, 2 * SIZE, C1 + + STF c03, [C2 + 0 * SIZE] + FADD c07, b3, c07 + STF c04, [C2 + 1 * SIZE] + FADD c08, b4, c08 + add C2, 2 * SIZE, C2 + + STF c05, [C3 + 0 * SIZE] + STF c06, [C3 + 1 * SIZE] + add C3, 2 * SIZE, C3 + + STF c07, [C4 + 0 * SIZE] + STF c08, [C4 + 1 * SIZE] + add C4, 2 * SIZE, C4 +#else + + FADD c02, t1, c02 + FADD c04, t2, c04 + FADD c06, t3, c06 + FADD c08, t4, c08 + + FMUL c01, ALPHA, c01 + FMUL c03, ALPHA, c03 + FMUL c05, ALPHA, c05 + FMUL c07, ALPHA, c07 + + FMUL c02, ALPHA, c02 + FMUL c04, ALPHA, c04 + FMUL c06, ALPHA, c06 + FMUL c08, ALPHA, c08 + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + + STF c03, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + + STF c05, [C3 + 0 * SIZE] + STF c06, [C3 + 1 * SIZE] + + STF c07, [C4 + 0 * SIZE] + STF c08, [C4 + 1 * SIZE] + + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 + add C3, 2 * SIZE, C3 + add C4, 2 * SIZE, C4 + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -2, TEMP1 +#else + add TEMP1, -4, TEMP1 +#endif + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 2, KK +#endif +#endif + +.LL70: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL99 + nop + +.LL71: +#if !defined(TRMMKERNEL) + LDF [AO + 0 * SIZE], a1 + sra K, 2, L + FMOV FZERO, c01 + LDF [B + 0 * SIZE], b1 + mov B, BO + FMOV FZERO, t1 + LDF [AO + 1 * SIZE], a2 + cmp L, 0 + FMOV FZERO, c02 + LDF [B + 1 * SIZE], b2 + FMOV FZERO, t2 + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [B + 2 * SIZE], b3 + FMOV FZERO, t3 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [B + 3 * SIZE], b4 + FMOV FZERO, t4 +#else + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO +#else + sll KK, 0 + BASE_SHIFT, TEMP1 + sll KK, 2 + BASE_SHIFT, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 4, L +#endif + sra L, 2, L + cmp L, 0 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 +#endif + + ble,pn %icc, .LL75 + nop + +.LL72: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [BO + 4 * SIZE], b1 + + FADD c02, t2, c02 + cmp L, 0 + FMUL a1, b2, t2 + LDF [BO + 5 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a1, b3, t3 + LDF [BO + 6 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a1, b4, t4 + LDF [BO + 7 * SIZE], b4 + LDF [AO + 4 * SIZE], a1 + + FADD c01, t1, c01 + add AO, 4 * SIZE, AO + FMUL a2, b1, t1 + LDF [BO + 8 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a2, b2, t2 + LDF [BO + 9 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a2, b3, t3 + LDF [BO + 10 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a2, b4, t4 + LDF [BO + 11 * SIZE], b4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b1, t1 + LDF [BO + 12 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a3, b2, t2 + LDF [BO + 13 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a3, b3, t3 + LDF [BO + 14 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a3, b4, t4 + LDF [BO + 15 * SIZE], b4 + LDF [AO + 2 * SIZE], a3 + + FADD c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO + 16 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a4, b2, t2 + LDF [BO + 17 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 18 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [BO + 19 * SIZE], b4 + + add BO, 16 * SIZE, BO + bg,pt %icc, .LL72 + LDF [AO + 3 * SIZE], a4 + +.LL75: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 4, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL79 + nop + +.LL76: + FADD c01, t1, c01 + add AO, 1 * SIZE, AO + FMUL a1, b1, t1 + LDF [BO + 4 * SIZE], b1 + + FADD c02, t2, c02 + add L, -1, L + FMUL a1, b2, t2 + LDF [BO + 5 * SIZE], b2 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a1, b3, t3 + LDF [BO + 6 * SIZE], b3 + + FADD c04, t4, c04 + add BO, 4 * SIZE, BO + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + bg,pt %icc, .LL76 + LDF [BO + 3 * SIZE], b4 + + +.LL79: +#ifndef TRMMKERNEL + FADD c01, t1, c01 + LDF [C1 + 0 * SIZE], a1 + FADD c02, t2, c02 + LDF [C2 + 0 * SIZE], a2 + FADD c03, t3, c03 + LDF [C3 + 0 * SIZE], a3 + FADD c04, t4, c04 + LDF [C4 + 0 * SIZE], a4 + + FMUL c01, ALPHA, c01 + FMUL c02, ALPHA, c02 + FMUL c03, ALPHA, c03 + FMUL c04, ALPHA, c04 + + FADD c01, a1, c01 + FADD c02, a2, c02 + FADD c03, a3, c03 + FADD c04, a4, c04 + + STF c01, [C1 + 0 * SIZE] + STF c02, [C2 + 0 * SIZE] + STF c03, [C3 + 0 * SIZE] + STF c04, [C4 + 0 * SIZE] +#else + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FMUL c01, ALPHA, c01 + FMUL c02, ALPHA, c02 + FMUL c03, ALPHA, c03 + FMUL c04, ALPHA, c04 + + STF c01, [C1 + 0 * SIZE] + STF c02, [C2 + 0 * SIZE] + STF c03, [C3 + 0 * SIZE] + STF c04, [C4 + 0 * SIZE] + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -1, TEMP1 +#else + add TEMP1, -4, TEMP1 +#endif + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 1, KK +#endif +#endif + +.LL99: + add J, -1, J + mov BO, B + cmp J, 0 + bg,pt %icc, .LL11 +#if defined(TRMMKERNEL) && !defined(LEFT) + add KK, 4, KK +#else + nop +#endif + +.LL100: /* n & 2 */ + sra M, 2, I + and N, 2, J + + cmp J, 0 + add C, LDC, C2 + ble,pn %icc, .LL200 + mov A, AO + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + mov C, C1 + add C2, LDC, C + + cmp I, 0 + ble,pn %icc, .LL150 + FMOV FZERO, c03 + +.LL121: +#if !defined(TRMMKERNEL) + LDF [AO + 0 * SIZE], a1 + sra K, 2, L + FMOV FZERO, t1 + LDF [B + 0 * SIZE], b1 + mov B, BO + FMOV FZERO, c07 + + LDF [AO + 1 * SIZE], a2 + cmp L, 0 + FMOV FZERO, t2 + LDF [B + 1 * SIZE], b2 + FMOV FZERO, c04 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t3 + LDF [B + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, t4 + LDF [B + 3 * SIZE], b4 + FMOV FZERO, c01 + + prefetch [C1 + 3 * SIZE], 2 + FMOV FZERO, c05 + prefetch [C2 + 3 * SIZE], 2 + FMOV FZERO, c02 +#else + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO +#else + sll KK, 2 + BASE_SHIFT, TEMP1 + sll KK, 1 + BASE_SHIFT, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 4, L +#else + add KK, 2, L +#endif + sra L, 2, L + cmp L, 0 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t1 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c07 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t2 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c04 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t3 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, t4 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c01 + + prefetch [C1 + 3 * SIZE], 2 + FMOV FZERO, c05 + prefetch [C2 + 3 * SIZE], 2 + FMOV FZERO, c02 +#endif + + ble,pn %icc, .LL125 + FMOV FZERO, c06 + +.LL122: + FADD c03, t1, c03 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD c07, t2, c07 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD c04, t3, c04 + add AO, 16 * SIZE, AO + FMUL a2, b1, t3 + cmp L, 0 + + FADD c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 11 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 10 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO - 3 * SIZE], b2 + + FADD c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 9 * SIZE], a4 + + FADD c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO - 8 * SIZE], a1 + + FADD c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO - 6 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD c06, t4, c06 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD c03, t1, c03 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD c07, t2, c07 + nop + FMUL a1, b2, t2 + LDF [AO - 4 * SIZE], a1 + + FADD c04, t3, c04 + nop + FMUL a2, b1, t3 + nop + + FADD c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 3 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 2 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + + FADD c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 1 * SIZE], a4 + + FADD c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO + 0 * SIZE], a1 + + FADD c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO + 2 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c06, t4, c06 + FMUL a4, b4, t4 + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL122 + LDF [BO + 3 * SIZE], b4 + +.LL125: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 4, L +#else + add KK, 2, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL129 + nop + +.LL126: + FADD c03, t1, c03 + add AO, 4 * SIZE, AO + FMUL a1, b1, t1 + add BO, 2 * SIZE, BO + + FADD c07, t2, c07 + add L, -1, L + FMUL a1, b2, t2 + LDF [AO + 0 * SIZE], a1 + + FADD c04, t3, c04 + cmp L, 0 + FMUL a2, b1, t3 + + FADD c08, t4, c08 + FMUL a2, b2, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b1, t1 + FADD c05, t2, c05 + FMUL a3, b2, t2 + LDF [AO + 2 * SIZE], a3 + + FADD c02, t3, c02 + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + FADD c06, t4, c06 + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + bg,pt %icc, .LL126 + LDF [AO + 3 * SIZE], a4 + +.LL129: +#ifndef TRMMKERNEL + FADD c03, t1, c03 + add I, -1, I + LDF [C1 + 0 * SIZE], a1 + FADD c07, t2, c07 + cmp I, 0 + LDF [C1 + 1 * SIZE], a2 + FADD c04, t3, c04 + LDF [C1 + 2 * SIZE], a3 + FADD c08, t4, c08 + LDF [C1 + 3 * SIZE], a4 + + LDF [C2 + 0 * SIZE], b1 + FMUL c01, ALPHA, c01 + LDF [C2 + 1 * SIZE], b2 + FMUL c02, ALPHA, c02 + LDF [C2 + 2 * SIZE], b3 + FMUL c03, ALPHA, c03 + LDF [C2 + 3 * SIZE], b4 + FMUL c04, ALPHA, c04 + + FMUL c05, ALPHA, c05 + FADD c01, a1, c01 + FMUL c06, ALPHA, c06 + FADD c02, a2, c02 + FMUL c07, ALPHA, c07 + FADD c03, a3, c03 + FMUL c08, ALPHA, c08 + FADD c04, a4, c04 + + STF c01, [C1 + 0 * SIZE] + FADD c05, b1, c05 + STF c02, [C1 + 1 * SIZE] + FADD c06, b2, c06 + STF c03, [C1 + 2 * SIZE] + FADD c07, b3, c07 + STF c04, [C1 + 3 * SIZE] + add C1, 4 * SIZE, C1 + FADD c08, b4, c08 + + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + STF c07, [C2 + 2 * SIZE] + STF c08, [C2 + 3 * SIZE] + add C2, 4 * SIZE, C2 +#else + FADD c03, t1, c03 + FADD c07, t2, c07 + FADD c04, t3, c04 + FADD c08, t4, c08 + + FMUL c01, ALPHA, c01 + FMUL c02, ALPHA, c02 + FMUL c03, ALPHA, c03 + FMUL c04, ALPHA, c04 + + FMUL c05, ALPHA, c05 + FMUL c06, ALPHA, c06 + FMUL c07, ALPHA, c07 + FMUL c08, ALPHA, c08 + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + STF c07, [C2 + 2 * SIZE] + STF c08, [C2 + 3 * SIZE] + add C1, 4 * SIZE, C1 + add C2, 4 * SIZE, C2 + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -4, TEMP1 +#else + add TEMP1, -2, TEMP1 +#endif + sll TEMP1, 2 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 4, KK +#endif + + add I, -1, I + cmp I, 0 +#endif + + bg,pt %icc, .LL121 + FMOV FZERO, c03 + +.LL150: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL170 + nop + +.LL151: +#if !defined(TRMMKERNEL) + LDF [AO + 0 * SIZE], a1 + sra K, 2, L + FMOV FZERO, c01 + + LDF [B + 0 * SIZE], b1 + mov B, BO + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + cmp L, 0 + FMOV FZERO, c02 + LDF [B + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [B + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [B + 3 * SIZE], b4 + FMOV FZERO, t4 +#else + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO +#else + sll KK, 1 + BASE_SHIFT, TEMP1 + sll KK, 1 + BASE_SHIFT, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 2, L +#endif + sra L, 2, L + cmp L, 0 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 +#endif + + ble,pn %icc, .LL155 + nop + +.LL152: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD c02, t2, c02 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a2, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD c04, t4, c04 + nop + FMUL a2, b2, t4 + LDF [AO + 5 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b3, t1 + LDF [BO - 3 * SIZE], b2 + + FADD c02, t2, c02 + nop + FMUL a3, b4, t2 + LDF [AO + 6 * SIZE], a3 + + FADD c03, t3, c03 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD c04, t4, c04 + nop + FMUL a4, b4, t4 + LDF [AO + 7 * SIZE], a4 + + FADD c01, t1, c01 + nop + FMUL a1, b1, t1 + LDF [BO - 1 * SIZE], b4 + + FADD c02, t2, c02 + FMUL a1, b2, t2 + LDF [AO + 8 * SIZE], a1 + + FADD c03, t3, c03 + FMUL a2, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD c04, t4, c04 + FMUL a2, b2, t4 + LDF [AO + 9 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b3, t1 + LDF [BO + 1 * SIZE], b2 + + FADD c02, t2, c02 + FMUL a3, b4, t2 + LDF [AO + 10 * SIZE], a3 + + FADD c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + bg,pt %icc, .LL152 + LDF [BO + 3 * SIZE], b4 + +.LL155: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 2, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL159 + nop + +.LL156: + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FMUL a1, b1, t1 + FMUL a1, b2, t2 + FMUL a2, b1, t3 + FMUL a2, b2, t4 + + add AO, 2 * SIZE, AO + add BO, 2 * SIZE, BO + + add L, -1, L + cmp L, 0 + bg,pt %icc, .LL156 + nop + +.LL159: +#ifndef TRMMKERNEL + LDF [C1 + 0 * SIZE], a1 + LDF [C2 + 0 * SIZE], a2 + LDF [C1 + 1 * SIZE], a3 + LDF [C2 + 1 * SIZE], a4 + + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FMUL c01, ALPHA, c01 + FMUL c02, ALPHA, c02 + FMUL c03, ALPHA, c03 + FMUL c04, ALPHA, c04 + + FADD c01, a1, c01 + FADD c02, a2, c02 + FADD c03, a3, c03 + FADD c04, a4, c04 + + STF c01, [C1 + 0 * SIZE] + STF c02, [C2 + 0 * SIZE] + STF c03, [C1 + 1 * SIZE] + add C1, 2 * SIZE, C1 + STF c04, [C2 + 1 * SIZE] + add C2, 2 * SIZE, C2 +#else + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FMUL c01, ALPHA, c01 + FMUL c02, ALPHA, c02 + FMUL c03, ALPHA, c03 + FMUL c04, ALPHA, c04 + + STF c01, [C1 + 0 * SIZE] + STF c02, [C2 + 0 * SIZE] + STF c03, [C1 + 1 * SIZE] + STF c04, [C2 + 1 * SIZE] + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -2, TEMP1 +#else + add TEMP1, -2, TEMP1 +#endif + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 2, KK +#endif +#endif + +.LL170: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL199 + nop + +.LL171: +#if !defined(TRMMKERNEL) + LDF [AO + 0 * SIZE], a1 + sra K, 2, L + FMOV FZERO, c01 + LDF [B + 0 * SIZE], b1 + mov B, BO + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + cmp L, 0 + FMOV FZERO, c02 + LDF [B + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + + LDF [B + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [B + 3 * SIZE], b4 + FMOV FZERO, t4 +#else +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO +#else + sll KK, 0 + BASE_SHIFT, TEMP1 + sll KK, 1 + BASE_SHIFT, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 2, L +#endif + sra L, 2, L + cmp L, 0 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 +#endif + + ble,pn %icc, .LL175 + nop + +.LL172: + FADD c01, t1, c01 + add AO, 4 * SIZE, AO + FMUL a1, b1, t1 + LDF [BO + 4 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a1, b2, t2 + LDF [BO + 5 * SIZE], b2 + + add L, -1, L + LDF [AO + 0 * SIZE], a1 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a2, b3, t3 + LDF [BO + 6 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a2, b4, t4 + LDF [BO + 7 * SIZE], b4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b1, t1 + LDF [BO + 8 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a3, b2, t2 + LDF [BO + 9 * SIZE], b2 + LDF [AO + 2 * SIZE], a3 + + FADD c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 10 * SIZE], b3 + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [BO + 11 * SIZE], b4 + add BO, 8 * SIZE, BO + + bg,pt %icc, .LL172 + LDF [AO + 3 * SIZE], a4 + +.LL175: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 2, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL179 + nop + +.LL176: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + add AO, 1 * SIZE, AO + LDF [BO + 2 * SIZE], b1 + FADD c02, t2, c02 + cmp L, 0 + FMUL a1, b2, t2 + LDF [BO + 3 * SIZE], b2 + + add BO, 2 * SIZE, BO + bg,pt %icc, .LL176 + LDF [AO + 0 * SIZE], a1 + +.LL179: +#ifndef TRMMKERNEL + FADD c01, t1, c01 + LDF [C1 + 0 * SIZE], a1 + FADD c02, t2, c02 + LDF [C2 + 0 * SIZE], a2 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FADD c01, c03, c01 + FADD c02, c04, c02 + + FMUL c01, ALPHA, c01 + FMUL c02, ALPHA, c02 + + FADD c01, a1, c01 + FADD c02, a2, c02 + + STF c01, [C1 + 0 * SIZE] + STF c02, [C2 + 0 * SIZE] +#else + + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FADD c01, c03, c01 + FADD c02, c04, c02 + + FMUL c01, ALPHA, c01 + FMUL c02, ALPHA, c02 + + STF c01, [C1 + 0 * SIZE] + STF c02, [C2 + 0 * SIZE] + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -1, TEMP1 +#else + add TEMP1, -2, TEMP1 +#endif + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 1, KK +#endif +#endif + +.LL199: + mov BO, B +#if defined(TRMMKERNEL) && !defined(LEFT) + add KK, 2, KK +#else + nop +#endif + +.LL200: + and N, 1, J + sra M, 2, I + + cmp J, 0 + ble,pn %icc, .LL999 + mov A, AO + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + cmp I, 0 + ble,pn %icc, .LL250 + mov C, C1 + +.LL221: +#if !defined(TRMMKERNEL) + LDF [AO + 0 * SIZE], a1 + sra K, 2, L + FMOV FZERO, c01 + LDF [B + 0 * SIZE], b1 + mov B, BO + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + cmp L, 0 + FMOV FZERO, c02 + LDF [B + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [B + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [B + 3 * SIZE], b4 + FMOV FZERO, t4 +#else +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO +#else + sll KK, 2 + BASE_SHIFT, TEMP1 + sll KK, 0 + BASE_SHIFT, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 4, L +#else + add KK, 1, L +#endif + sra L, 2, L + cmp L, 0 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 +#endif + + ble,pn %icc, .LL225 + prefetch [C1 + 4 * SIZE], 2 + +.LL222: + FADD c01, t1, c01 + add BO, 4 * SIZE, BO + FMUL a1, b1, t1 + LDF [AO + 4 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b1, t2 + LDF [AO + 5 * SIZE], a2 + + FADD c03, t3, c03 + add L, -1, L + FMUL a3, b1, t3 + LDF [AO + 6 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b1, t4 + LDF [AO + 7 * SIZE], a4 + LDF [BO + 0 * SIZE], b1 + + FADD c01, t1, c01 + cmp L, 0 + FMUL a1, b2, t1 + LDF [AO + 8 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b2, t2 + LDF [AO + 9 * SIZE], a2 + + FADD c03, t3, c03 + FMUL a3, b2, t3 + LDF [AO + 10 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b2, t4 + LDF [AO + 11 * SIZE], a4 + LDF [BO + 1 * SIZE], b2 + + FADD c01, t1, c01 + FMUL a1, b3, t1 + LDF [AO + 12 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b3, t2 + LDF [AO + 13 * SIZE], a2 + + FADD c03, t3, c03 + FMUL a3, b3, t3 + LDF [AO + 14 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b3, t4 + LDF [AO + 15 * SIZE], a4 + LDF [BO + 2 * SIZE], b3 + + FADD c01, t1, c01 + FMUL a1, b4, t1 + LDF [AO + 16 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b4, t2 + LDF [AO + 17 * SIZE], a2 + + FADD c03, t3, c03 + FMUL a3, b4, t3 + LDF [AO + 18 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 19 * SIZE], a4 + add AO, 16 * SIZE, AO + + bg,pt %icc, .LL222 + LDF [BO + 3 * SIZE], b4 + +.LL225: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 4, L +#else + add KK, 1, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL229 + nop + +.LL226: + FADD c01, t1, c01 + add BO, 1 * SIZE, BO + FMUL a1, b1, t1 + LDF [AO + 4 * SIZE], a1 + + FADD c02, t2, c02 + add L, -1, L + FMUL a2, b1, t2 + LDF [AO + 5 * SIZE], a2 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a3, b1, t3 + LDF [AO + 6 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b1, t4 + LDF [AO + 7 * SIZE], a4 + add AO, 4 * SIZE, AO + + bg,pt %icc, .LL226 + LDF [BO + 0 * SIZE], b1 + +.LL229: +#ifndef TRMMKERNEL + FADD c01, t1, c01 + add I, -1, I + FADD c02, t2, c02 + cmp I, 0 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FMUL c01, ALPHA, c01 + FMUL c02, ALPHA, c02 + FMUL c03, ALPHA, c03 + FMUL c04, ALPHA, c04 + + LDF [C1 + 0 * SIZE], a1 + LDF [C1 + 1 * SIZE], a2 + LDF [C1 + 2 * SIZE], a3 + LDF [C1 + 3 * SIZE], a4 + + FADD c01, a1, c01 + FADD c02, a2, c02 + FADD c03, a3, c03 + FADD c04, a4, c04 + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + add C1, 4 * SIZE, C1 +#else + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FMUL c01, ALPHA, c01 + FMUL c02, ALPHA, c02 + FMUL c03, ALPHA, c03 + FMUL c04, ALPHA, c04 + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + add C1, 4 * SIZE, C1 + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -4, TEMP1 +#else + add TEMP1, -1, TEMP1 +#endif + sll TEMP1, 2 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 4, KK +#endif + + add I, -1, I + cmp I, 0 +#endif + + bg,pt %icc, .LL221 + nop + +.LL250: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL270 + nop + +.LL251: +#if !defined(TRMMKERNEL) + LDF [AO + 0 * SIZE], a1 + sra K, 2, L + FMOV FZERO, c01 + LDF [B + 0 * SIZE], b1 + mov B, BO + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + cmp L, 0 + FMOV FZERO, c02 + LDF [B + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [B + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [B + 3 * SIZE], b4 + FMOV FZERO, t4 +#else +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO +#else + sll KK, 1 + BASE_SHIFT, TEMP1 + sll KK, 0 + BASE_SHIFT, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 1, L +#endif + sra L, 2, L + cmp L, 0 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 +#endif + + ble,pn %icc, .LL255 + nop + +.LL252: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [AO + 4 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b1, t2 + LDF [AO + 5 * SIZE], a2 + LDF [BO + 4 * SIZE], b1 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a3, b2, t3 + LDF [AO + 6 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b2, t4 + LDF [AO + 7 * SIZE], a4 + LDF [BO + 5 * SIZE], b2 + + FADD c01, t1, c01 + FMUL a1, b3, t1 + LDF [AO + 8 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b3, t2 + LDF [AO + 9 * SIZE], a2 + LDF [BO + 6 * SIZE], b3 + + FADD c03, t3, c03 + FMUL a3, b4, t3 + LDF [AO + 10 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 11 * SIZE], a4 + add AO, 8 * SIZE, AO + + LDF [BO + 7 * SIZE], b4 + bg,pt %icc, .LL252 + add BO, 4 * SIZE, BO + +.LL255: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 1, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL259 + nop + +.LL256: + + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [AO + 2 * SIZE], a1 + + FADD c02, t2, c02 + cmp L, 0 + FMUL a2, b1, t2 + LDF [AO + 3 * SIZE], a2 + + LDF [BO + 1 * SIZE], b1 + add AO, 2 * SIZE, AO + + bg,pt %icc, .LL256 + add BO, 1 * SIZE, BO + +.LL259: +#ifndef TRMMKERNEL + FADD c01, t1, c01 + LDF [C1 + 0 * SIZE], a1 + FADD c02, t2, c02 + LDF [C1 + 1 * SIZE], a2 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FADD c01, c03, c01 + FADD c02, c04, c02 + FMUL c01, ALPHA, c01 + FMUL c02, ALPHA, c02 + FADD c01, a1, c01 + FADD c02, a2, c02 + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + add C1, 2 * SIZE, C1 +#else + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FADD c01, c03, c01 + FADD c02, c04, c02 + FMUL c01, ALPHA, c01 + FMUL c02, ALPHA, c02 + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + add C1, 2 * SIZE, C1 + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -2, TEMP1 +#else + add TEMP1, -1, TEMP1 +#endif + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 2, KK +#endif +#endif + +.LL270: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL999 + nop + +.LL271: +#if !defined(TRMMKERNEL) + LDF [AO + 0 * SIZE], a1 + sra K, 2, L + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + mov B, BO + FMOV FZERO, c01 + + LDF [AO + 2 * SIZE], a3 + cmp L, 0 + FMOV FZERO, t2 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c02 + + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t3 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t4 + LDF [BO + 2 * SIZE], b3 +#else +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO +#else + sll KK, 0 + BASE_SHIFT, TEMP1 + sll KK, 0 + BASE_SHIFT, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 1, L +#endif + sra L, 2, L + cmp L, 0 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t1 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c01 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t2 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c02 + + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t3 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t4 + LDF [BO + 2 * SIZE], b3 +#endif + + ble,pn %icc, .LL275 + LDF [BO + 3 * SIZE], b4 + +.LL272: + FADD c01, t1, c01 + add L, -1, L + add AO, 4 * SIZE, AO + + FMUL a1, b1, t1 + add BO, 4 * SIZE, BO + LDF [AO + 0 * SIZE], a1 + + FADD c02, t2, c02 + cmp L, 0 + LDF [BO + 0 * SIZE], b1 + FMUL a2, b2, t2 + + LDF [AO + 1 * SIZE], a2 + FADD c01, t3, c01 + LDF [BO + 1 * SIZE], b2 + FMUL a3, b3, t3 + + LDF [AO + 2 * SIZE], a3 + FADD c02, t4, c02 + LDF [BO + 2 * SIZE], b3 + FMUL a4, b4, t4 + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL272 + LDF [BO + 3 * SIZE], b4 + +.LL275: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 1, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL279 + nop + +.LL276: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [AO + 1 * SIZE], a1 + + LDF [BO + 1 * SIZE], b1 + add BO, 1 * SIZE, BO + cmp L, 0 + bg,pt %icc, .LL276 + add AO, 1 * SIZE, AO + +.LL279: +#ifndef TRMMKERNEL + FADD c01, t1, c01 + + LDF [C1 + 0 * SIZE], a1 + FADD c02, t2, c02 + FADD c01, t3, c01 + FADD c02, t4, c02 + FADD c01, c02, c01 + + FMUL c01, ALPHA, c01 + FADD c01, a1, c01 + STF c01, [C1 + 0 * SIZE] +#else + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c01, t3, c01 + FADD c02, t4, c02 + FADD c01, c02, c01 + + FMUL c01, ALPHA, c01 + STF c01, [C1 + 0 * SIZE] + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -1, TEMP1 +#else + add TEMP1, -1, TEMP1 +#endif + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 1, KK +#endif +#endif + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/gemm_kernel_2x8.S b/kernel/sparc/gemm_kernel_2x8.S new file mode 100644 index 0000000..c0d257a --- /dev/null +++ b/kernel/sparc/gemm_kernel_2x8.S @@ -0,0 +1,2561 @@ +/*********************************************************************/ +/* Copyright 2005-2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define APREFETCHSIZE 24 +#define APREFETCH_CATEGORY 0 + +#define M %i0 +#define N %i1 +#define K %i2 + +#if defined(DOUBLE) && !defined(__64BIT__) +#define A %i5 +#define B %i4 +#else +#define A %i4 +#define B %i5 +#endif + +#define C %o4 +#define LDC %o5 + +#define AO %l0 +#define BO %l1 +#define I %l2 +#define J %l3 +#define L %l4 + +#define BB %o7 + +#define C1 %o0 +#define C2 %o1 +#define C3 %o2 +#define C4 %o3 + +#define C5 %l5 +#define C6 %l6 +#define C7 %l7 +#define C8 %i3 + +#define OFFSET %g1 +#define KK %g2 +#define TEMP1 %g3 +#define TEMP2 %g4 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define a1 %f32 +#define a2 %f34 +#define a3 %f36 +#define a4 %f38 +#define a5 %f40 + +#define b1 %f42 +#define b2 %f44 +#define b3 %f46 +#define b4 %f48 +#define b5 %f50 +#define b6 %f52 +#define b7 %f54 +#define b8 %f56 +#define b9 %f58 + +#define ALPHA %f62 + +#define cc01 0 +#define cc02 2 +#define cc03 4 +#define cc04 6 +#define cc05 8 +#define cc06 10 +#define cc07 12 +#define cc08 14 +#define cc09 16 +#define cc10 18 +#define cc11 20 +#define cc12 22 +#define cc13 24 +#define cc14 26 +#define cc15 28 +#define cc16 30 + +#define aa1 1 +#define aa2 3 +#define aa3 5 +#define aa4 7 +#define aa5 9 + +#define bb1 11 +#define bb2 13 +#define bb3 15 +#define bb4 17 +#define bb5 19 +#define bb6 21 +#define bb7 23 +#define bb8 25 +#define bb9 27 + +#define alpha 31 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define a1 %f16 +#define a2 %f17 +#define a3 %f18 +#define a4 %f19 +#define a5 %f20 + +#define b1 %f21 +#define b2 %f22 +#define b3 %f23 +#define b4 %f24 +#define b5 %f25 +#define b6 %f26 +#define b7 %f27 +#define b8 %f28 +#define b9 %f29 + +#define ALPHA %f31 + +#define cc01 0 +#define cc02 1 +#define cc03 2 +#define cc04 3 +#define cc05 4 +#define cc06 5 +#define cc07 6 +#define cc08 7 +#define cc09 8 +#define cc10 9 +#define cc11 10 +#define cc12 11 +#define cc13 12 +#define cc14 13 +#define cc15 14 +#define cc16 15 + +#define aa1 16 +#define aa2 17 +#define aa3 18 +#define aa4 19 +#define aa5 20 + +#define bb1 21 +#define bb2 22 +#define bb3 23 +#define bb4 24 +#define bb5 25 +#define bb6 26 +#define bb7 27 +#define bb8 28 +#define bb9 29 + +#define alpha 31 + +#endif + + .register %g2, #scratch + .register %g3, #scratch + + PROLOGUE + SAVESP + nop + +#ifndef __64BIT__ + +#ifdef DOUBLE + st %i3, [%sp + STACK_START + 16] + st %i4, [%sp + STACK_START + 20] + + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC +#ifdef TRMMKERNEL + ld [%sp + STACK_START + 40], OFFSET +#endif +#else + st %i3, [%sp + STACK_START + 16] + + ld [%sp + STACK_START + 28], C + ld [%sp + STACK_START + 32], LDC +#ifdef TRMMKERNEL + ld [%sp + STACK_START + 36], OFFSET +#endif +#endif + LDF [%sp + STACK_START + 16], ALPHA +#ifdef TRMMKERNEL + st %g1, [%sp + STACK_START + 8] + st %g2, [%sp + STACK_START + 12] + st %g3, [%sp + STACK_START + 16] + st %g4, [%sp + STACK_START + 20] +#endif +#else + + ldx [%sp+ STACK_START + 56], C + ldx [%sp+ STACK_START + 64], LDC +#ifdef TRMMKERNEL + ldx [%sp+ STACK_START + 72], OFFSET +#endif + +#ifdef DOUBLE + FMOV %f6, ALPHA +#else + FMOV %f7, ALPHA +#endif + +#ifdef TRMMKERNEL + stx %g1, [%sp + STACK_START + 32] + stx %g2, [%sp + STACK_START + 40] + stx %g3, [%sp + STACK_START + 48] + stx %g4, [%sp + STACK_START + 56] +#endif + +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg OFFSET, KK +#endif + + sra N, 3, J + cmp J, 0 + ble,pn %icc, .LL30 + sll LDC, BASE_SHIFT, LDC + +.LL11: + mov C, C1 + add C, LDC, C2 + add C2, LDC, C3 + add C3, LDC, C4 + add C4, LDC, C5 + add C5, LDC, C6 + add C6, LDC, C7 + add C7, LDC, C8 + add C8, LDC, C + + sll K, BASE_SHIFT + 3, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + mov A, AO + + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL20 + add B, BB, BB + .align 4 + +.LL12: + prefetch [BB + 0 * SIZE], 1 + +#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) + mov B, BO +#else + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 3, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 8 * SIZE], a5 + + LDF [BO + 0 * SIZE], b1 + + LDF [BO + 1 * SIZE], b2 + FCLR (cc01) + LDF [BO + 2 * SIZE], b3 + FCLR (cc05) + LDF [BO + 3 * SIZE], b4 + FCLR (cc09) + LDF [BO + 4 * SIZE], b5 + FCLR (cc13) + + LDF [BO + 5 * SIZE], b6 + FCLR (cc02) + LDF [BO + 6 * SIZE], b7 + FCLR (cc06) + LDF [BO + 7 * SIZE], b8 + FCLR (cc10) + LDF [BO + 8 * SIZE], b9 + FCLR (cc14) + + prefetch [C1 + 1 * SIZE], 3 + FCLR (cc03) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc07) + prefetch [C3 + 1 * SIZE], 3 + FCLR (cc11) + prefetch [C4 + 2 * SIZE], 3 + FCLR (cc15) + + prefetch [C5 + 1 * SIZE], 3 + FCLR (cc04) + prefetch [C6 + 2 * SIZE], 3 + FCLR (cc08) + prefetch [C7 + 1 * SIZE], 3 + FCLR (cc12) + prefetch [C8 + 2 * SIZE], 3 + FCLR (cc16) + +#ifndef TRMMKERNEL + sra K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 8, L +#endif + sra L, 3, L +#endif + cmp L, 0 + ble,pn %icc, .LL15 + add BB, 32 * SIZE, BB + .align 4 + +.LL13: + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 16 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 2 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 3 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 14 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 24 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 18 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 19 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 4 * SIZE], a1 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + add L, -1, L + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 20 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 21 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 22 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 23 * SIZE], b8 + + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 32 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 25 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 26 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 27 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 6 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 7 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + nop + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 28 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 29 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 30 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 31 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 40 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 33 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 34 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 35 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 16 * SIZE], a1 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 9 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + nop + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 36 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 37 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 38 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 39 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 48 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 41 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 42 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 43 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 10 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 11 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO + 44 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 45 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO + 46 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 47 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 56 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 49 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 50 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 51 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 12 * SIZE], a5 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 13 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + cmp L, 0 + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 52 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 53 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 54 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 55 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 64 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 57 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 58 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 59 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 14 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 15 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + add BO, 64 * SIZE, BO + FMADD (aa2, bb6, cc12, cc12) + add AO, 16 * SIZE, AO + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO - 4 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO - 3 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO - 2 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO - 1 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 8 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 1 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 8 * SIZE], a5 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 1 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + FMADD (aa4, bb6, cc12, cc12) + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 5 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + ble,pn %icc, .LL15 + LDF [BO + 7 * SIZE], b8 + + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 16 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 2 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 3 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 14 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 24 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 18 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 19 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 4 * SIZE], a1 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + add L, -1, L + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 20 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 21 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 22 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 23 * SIZE], b8 + + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 32 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 25 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 26 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 27 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 6 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 7 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + nop + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 28 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 29 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 30 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 31 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 40 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 33 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 34 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 35 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 16 * SIZE], a1 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 9 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + nop + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 36 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 37 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 38 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 39 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 48 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 41 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 42 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 43 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 10 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 11 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO + 44 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 45 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO + 46 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 47 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 56 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 49 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 50 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 51 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 12 * SIZE], a5 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 13 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + cmp L, 0 + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 52 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 53 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 54 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 55 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 64 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 57 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 58 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 59 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 14 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 15 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + add BO, 64 * SIZE, BO + FMADD (aa2, bb6, cc12, cc12) + add AO, 16 * SIZE, AO + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO - 4 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO - 3 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO - 2 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO - 1 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 8 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 1 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 8 * SIZE], a5 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 1 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + FMADD (aa4, bb6, cc12, cc12) + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 5 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + bg,pt %icc, .LL13 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL15: +#ifndef TRMMKERNEL + and K, 7, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 8, L +#endif + and L, 7, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL18 + nop + .align 4 + +.LL17: + FMADD (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD (aa2, bb1, cc02, cc02) + nop + + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 8 * SIZE], b1 + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + cmp L, 0 + FMADD (aa2, bb3, cc06, cc06) + nop + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + nop + FMADD (aa2, bb5, cc10, cc10) + nop + + FMADD (aa1, bb6, cc11, cc11) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb6, cc12, cc12) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb7, cc13, cc13) + add AO, 2 * SIZE, AO + FMADD (aa2, bb7, cc14, cc14) + add BO, 8 * SIZE, BO + + FMADD (aa1, bb8, cc15, cc15) + LDF [AO + 0 * SIZE], a1 + FMADD (aa2, bb8, cc16, cc16) + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 6 * SIZE], b7 + bg,pt %icc, .LL17 + LDF [BO + 7 * SIZE], b8 + nop + .align 4 + +.LL18: +#ifndef TRMMKERNEL + LDF [C1 + 0 * SIZE], a1 + LDF [C1 + 1 * SIZE], a2 + LDF [C2 + 0 * SIZE], a3 + LDF [C2 + 1 * SIZE], a4 + + LDF [C3 + 0 * SIZE], b1 + LDF [C3 + 1 * SIZE], b2 + LDF [C4 + 0 * SIZE], b3 + LDF [C4 + 1 * SIZE], b4 + + FMADD (alpha, cc01, aa1, cc01) + LDF [C5 + 0 * SIZE], a1 + FMADD (alpha, cc02, aa2, cc02) + LDF [C5 + 1 * SIZE], a2 + FMADD (alpha, cc03, aa3, cc03) + LDF [C6 + 0 * SIZE], a3 + FMADD (alpha, cc04, aa4, cc04) + LDF [C6 + 1 * SIZE], a4 + + FMADD (alpha, cc05, bb1, cc05) + LDF [C7 + 0 * SIZE], b1 + FMADD (alpha, cc06, bb2, cc06) + LDF [C7 + 1 * SIZE], b2 + FMADD (alpha, cc07, bb3, cc07) + LDF [C8 + 0 * SIZE], b3 + FMADD (alpha, cc08, bb4, cc08) + LDF [C8 + 1 * SIZE], b4 + + FMADD (alpha, cc09, aa1, cc09) + STF c01, [C1 + 0 * SIZE] + FMADD (alpha, cc10, aa2, cc10) + STF c02, [C1 + 1 * SIZE] + FMADD (alpha, cc11, aa3, cc11) + STF c03, [C2 + 0 * SIZE] + FMADD (alpha, cc12, aa4, cc12) + STF c04, [C2 + 1 * SIZE] + + FMADD (alpha, cc13, bb1, cc13) + STF c05, [C3 + 0 * SIZE] + FMADD (alpha, cc14, bb2, cc14) + STF c06, [C3 + 1 * SIZE] + FMADD (alpha, cc15, bb3, cc15) + STF c07, [C4 + 0 * SIZE] + FMADD (alpha, cc16, bb4, cc16) + STF c08, [C4 + 1 * SIZE] + +#else + FMUL ALPHA, c01, c01 + FMUL ALPHA, c02, c02 + FMUL ALPHA, c03, c03 + FMUL ALPHA, c04, c04 + + FMUL ALPHA, c05, c05 + FMUL ALPHA, c06, c06 + FMUL ALPHA, c07, c07 + FMUL ALPHA, c08, c08 + + FMUL ALPHA, c09, c09 + STF c01, [C1 + 0 * SIZE] + FMUL ALPHA, c10, c10 + STF c02, [C1 + 1 * SIZE] + FMUL ALPHA, c11, c11 + STF c03, [C2 + 0 * SIZE] + FMUL ALPHA, c12, c12 + STF c04, [C2 + 1 * SIZE] + + FMUL ALPHA, c13, c13 + STF c05, [C3 + 0 * SIZE] + FMUL ALPHA, c14, c14 + STF c06, [C3 + 1 * SIZE] + FMUL ALPHA, c15, c15 + STF c07, [C4 + 0 * SIZE] + FMUL ALPHA, c16, c16 + STF c08, [C4 + 1 * SIZE] +#endif + + STF c09, [C5 + 0 * SIZE] + add C1, 2 * SIZE, C1 + STF c10, [C5 + 1 * SIZE] + add C2, 2 * SIZE, C2 + STF c11, [C6 + 0 * SIZE] + add C3, 2 * SIZE, C3 + STF c12, [C6 + 1 * SIZE] + add C4, 2 * SIZE, C4 + + STF c13, [C7 + 0 * SIZE] + add C5, 2 * SIZE, C5 + STF c14, [C7 + 1 * SIZE] + add C6, 2 * SIZE, C6 + STF c15, [C8 + 0 * SIZE] + add C7, 2 * SIZE, C7 + STF c16, [C8 + 1 * SIZE] + add C8, 2 * SIZE, C8 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -2, TEMP1 +#else + add TEMP1, -8, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 3, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 2, KK +#endif +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL12 + nop + .align 4 + +.LL20: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL29 + nop + +#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) + mov B, BO +#else + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 3, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + FCLR (cc01) + LDF [BO + 1 * SIZE], b2 + FCLR (cc03) + LDF [BO + 2 * SIZE], b3 + FCLR (cc05) + LDF [BO + 3 * SIZE], b4 + FCLR (cc07) + LDF [BO + 4 * SIZE], b5 + FCLR (cc09) + LDF [BO + 5 * SIZE], b6 + FCLR (cc11) + LDF [BO + 6 * SIZE], b7 + FCLR (cc13) + LDF [BO + 7 * SIZE], b8 + FCLR (cc15) + +#ifndef TRMMKERNEL + sra K, 2, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 8, L +#endif + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL25 + LDF [BO + 8 * SIZE], b9 + .align 4 + +.LL23: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 16 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 10 * SIZE], b3 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [BO + 12 * SIZE], b5 + FMADD (aa1, bb6, cc11, cc11) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 14 * SIZE], b7 + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa2, bb9, cc01, cc01) + LDF [BO + 24 * SIZE], b9 + FMADD (aa2, bb2, cc03, cc03) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa2, bb3, cc05, cc05) + LDF [BO + 18 * SIZE], b3 + FMADD (aa2, bb4, cc07, cc07) + LDF [BO + 19 * SIZE], b4 + + FMADD (aa2, bb5, cc09, cc09) + LDF [BO + 20 * SIZE], b5 + FMADD (aa2, bb6, cc11, cc11) + LDF [BO + 21 * SIZE], b6 + + FMADD (aa2, bb7, cc13, cc13) + LDF [BO + 22 * SIZE], b7 + FMADD (aa2, bb8, cc15, cc15) + LDF [BO + 23 * SIZE], b8 + + LDF [AO + 4 * SIZE], a1 + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb1, cc01, cc01) + LDF [BO + 32 * SIZE], b1 + FMADD (aa3, bb2, cc03, cc03) + LDF [BO + 25 * SIZE], b2 + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 26 * SIZE], b3 + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 27 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [BO + 28 * SIZE], b5 + FMADD (aa3, bb6, cc11, cc11) + LDF [BO + 29 * SIZE], b6 + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 30 * SIZE], b7 + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 31 * SIZE], b8 + + FMADD (aa4, bb9, cc01, cc01) + LDF [BO + 40 * SIZE], b9 + FMADD (aa4, bb2, cc03, cc03) + LDF [BO + 33 * SIZE], b2 + + FMADD (aa4, bb3, cc05, cc05) + LDF [BO + 34 * SIZE], b3 + FMADD (aa4, bb4, cc07, cc07) + LDF [BO + 35 * SIZE], b4 + + FMADD (aa4, bb5, cc09, cc09) + LDF [BO + 36 * SIZE], b5 + FMADD (aa4, bb6, cc11, cc11) + LDF [BO + 37 * SIZE], b6 + + FMADD (aa4, bb7, cc13, cc13) + LDF [BO + 38 * SIZE], b7 + FMADD (aa4, bb8, cc15, cc15) + LDF [BO + 39 * SIZE], b8 + + LDF [AO + 6 * SIZE], a3 + LDF [AO + 7 * SIZE], a4 + + add AO, 4 * SIZE, AO + cmp L, 0 + bg,pt %icc, .LL23 + add BO, 32 * SIZE, BO + .align 4 + +.LL25: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 8, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL28 + nop + .align 4 + +.LL27: + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 8 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 10 * SIZE], b3 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [BO + 12 * SIZE], b5 + FMADD (aa1, bb6, cc11, cc11) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 14 * SIZE], b7 + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 15 * SIZE], b8 + + LDF [AO + 1 * SIZE], a1 + add AO, 1 * SIZE, AO + + add L, -1, L + cmp L, 0 + bg,pt %icc, .LL27 + add BO, 8 * SIZE, BO + .align 4 + +.LL28: +#ifndef TRMMKERNEL + LDF [C1 + 0 * SIZE], a1 + LDF [C2 + 0 * SIZE], a2 + LDF [C3 + 0 * SIZE], a3 + LDF [C4 + 0 * SIZE], a4 + + FMADD (alpha, cc01, aa1, cc01) + LDF [C5 + 0 * SIZE], b1 + FMADD (alpha, cc03, aa2, cc03) + LDF [C6 + 0 * SIZE], b2 + + FMADD (alpha, cc05, aa3, cc05) + LDF [C7 + 0 * SIZE], b3 + FMADD (alpha, cc07, aa4, cc07) + LDF [C8 + 0 * SIZE], b4 + + FMADD (alpha, cc09, bb1, cc09) + STF c01, [C1 + 0 * SIZE] + FMADD (alpha, cc11, bb2, cc11) + STF c03, [C2 + 0 * SIZE] + FMADD (alpha, cc13, bb3, cc13) + STF c05, [C3 + 0 * SIZE] + FMADD (alpha, cc15, bb4, cc15) + STF c07, [C4 + 0 * SIZE] +#else + FMUL ALPHA, c01, c01 + FMUL ALPHA, c03, c03 + FMUL ALPHA, c05, c05 + FMUL ALPHA, c07, c07 + + FMUL ALPHA, c09, c09 + STF c01, [C1 + 0 * SIZE] + FMUL ALPHA, c11, c11 + STF c03, [C2 + 0 * SIZE] + + FMUL ALPHA, c13, c13 + STF c05, [C3 + 0 * SIZE] + FMUL ALPHA, c15, c15 + STF c07, [C4 + 0 * SIZE] +#endif + + STF c09, [C5 + 0 * SIZE] + STF c11, [C6 + 0 * SIZE] + STF c13, [C7 + 0 * SIZE] + STF c15, [C8 + 0 * SIZE] + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -1, TEMP1 +#else + add TEMP1, -8, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 3, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 1, KK +#endif +#endif + .align 4 + +.LL29: +#if defined(TRMMKERNEL) && !defined(LEFT) + add KK, 8, KK +#endif + + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + mov BO, B + .align 4 + +.LL30: + and N, 4, J + cmp J, 0 + ble,pn %icc, .LL50 + mov C, C1 + + add C, LDC, C2 + add C2, LDC, C3 + add C3, LDC, C4 + add C4, LDC, C + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL40 + mov A, AO + .align 4 + +.LL32: +#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) + mov B, BO +#else + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 2, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + + LDF [BO + 5 * SIZE], b6 + FCLR (cc01) + LDF [BO + 6 * SIZE], b7 + FCLR (cc02) + LDF [BO + 7 * SIZE], b8 + FCLR (cc03) + LDF [BO + 8 * SIZE], b9 + FCLR (cc04) + + prefetch [C1 + 2 * SIZE], 3 + FCLR (cc05) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc06) + prefetch [C3 + 2 * SIZE], 3 + FCLR (cc07) + prefetch [C4 + 2 * SIZE], 3 + FCLR (cc08) + +#ifndef TRMMKERNEL + sra K, 2, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 4, L +#endif + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL35 + nop + .align 4 + +.LL33: + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 2 * SIZE], a3 + FMADD (aa2, bb1, cc02, cc02) + LDF [AO + 3 * SIZE], a4 + + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 16 * SIZE], b1 + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb3, cc06, cc06) + add L, -1, L + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa3, bb5, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + FMADD (aa4, bb5, cc02, cc02) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb6, cc03, cc03) + LDF [BO + 12 * SIZE], b5 + FMADD (aa4, bb6, cc04, cc04) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa3, bb7, cc05, cc05) + cmp L, 0 + FMADD (aa4, bb7, cc06, cc06) + add AO, 8 * SIZE, AO + + FMADD (aa3, bb8, cc07, cc07) + LDF [BO + 14 * SIZE], b7 + FMADD (aa4, bb8, cc08, cc08) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa1, bb9, cc01, cc01) + LDF [AO - 2 * SIZE], a3 + FMADD (aa2, bb9, cc02, cc02) + LDF [AO - 1 * SIZE], a4 + + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 24 * SIZE], b9 + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + add BO, 16 * SIZE, BO + FMADD (aa2, bb3, cc06, cc06) + nop + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD (aa3, bb5, cc01, cc01) + LDF [AO + 0 * SIZE], a1 + FMADD (aa4, bb5, cc02, cc02) + LDF [AO + 1 * SIZE], a2 + FMADD (aa3, bb6, cc03, cc03) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb6, cc04, cc04) + LDF [BO + 5 * SIZE], b6 + + FMADD (aa3, bb7, cc05, cc05) + nop + FMADD (aa4, bb7, cc06, cc06) + LDF [BO + 6 * SIZE], b7 + + FMADD (aa3, bb8, cc07, cc07) + FMADD (aa4, bb8, cc08, cc08) + bg,pt %icc, .LL33 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL35: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 4, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL38 + nop + .align 4 + +.LL37: + + FMADD (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD (aa2, bb1, cc02, cc02) + LDF [BO + 4 * SIZE], b1 + + FMADD (aa1, bb2, cc03, cc03) + add AO, 2 * SIZE, AO + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 5 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + cmp L, 0 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 6 * SIZE], b3 + + FMADD (aa1, bb4, cc07, cc07) + LDF [AO + 0 * SIZE], a1 + FMADD (aa2, bb4, cc08, cc08) + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 7 * SIZE], b4 + bg,pt %icc, .LL37 + add BO, 4 * SIZE, BO + .align 4 + +.LL38: +#ifndef TRMMKERNEL + LDF [C1 + 0 * SIZE], a1 + LDF [C1 + 1 * SIZE], a2 + LDF [C2 + 0 * SIZE], a3 + LDF [C2 + 1 * SIZE], a4 + + FMADD (alpha, cc01, aa1, cc01) + LDF [C3 + 0 * SIZE], b1 + FMADD (alpha, cc02, aa2, cc02) + LDF [C3 + 1 * SIZE], b2 + FMADD (alpha, cc03, aa3, cc03) + LDF [C4 + 0 * SIZE], b3 + FMADD (alpha, cc04, aa4, cc04) + LDF [C4 + 1 * SIZE], b4 + + FMADD (alpha, cc05, bb1, cc05) + STF c01, [C1 + 0 * SIZE] + FMADD (alpha, cc06, bb2, cc06) + STF c02, [C1 + 1 * SIZE] + FMADD (alpha, cc07, bb3, cc07) + STF c03, [C2 + 0 * SIZE] + FMADD (alpha, cc08, bb4, cc08) + STF c04, [C2 + 1 * SIZE] +#else + + FMUL ALPHA, c01, c01 + FMUL ALPHA, c02, c02 + FMUL ALPHA, c03, c03 + FMUL ALPHA, c04, c04 + + FMUL ALPHA, c05, c05 + STF c01, [C1 + 0 * SIZE] + FMUL ALPHA, c06, c06 + STF c02, [C1 + 1 * SIZE] + FMUL ALPHA, c07, c07 + STF c03, [C2 + 0 * SIZE] + FMUL ALPHA, c08, c08 + STF c04, [C2 + 1 * SIZE] +#endif + + STF c05, [C3 + 0 * SIZE] + add C1, 2 * SIZE, C1 + STF c06, [C3 + 1 * SIZE] + add C2, 2 * SIZE, C2 + STF c07, [C4 + 0 * SIZE] + add C3, 2 * SIZE, C3 + STF c08, [C4 + 1 * SIZE] + add C4, 2 * SIZE, C4 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -2, TEMP1 +#else + add TEMP1, -4, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 2, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 2, KK +#endif +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL32 + nop + +.LL40: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL49 + nop + +#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) + mov B, BO +#else + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 2, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + LDF [BO + 5 * SIZE], b6 + FCLR (cc01) + LDF [BO + 6 * SIZE], b7 + FCLR (cc03) + LDF [BO + 7 * SIZE], b8 + FCLR (cc05) + LDF [BO + 8 * SIZE], b9 + FCLR (cc07) + +#ifndef TRMMKERNEL + sra K, 2, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 4, L +#endif + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL45 + nop + +.LL43: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 16 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 10 * SIZE], b3 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 11 * SIZE], b4 + + LDF [AO + 4 * SIZE], a1 + cmp L, 0 + + FMADD (aa2, bb5, cc01, cc01) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb6, cc03, cc03) + LDF [BO + 13 * SIZE], b6 + FMADD (aa2, bb7, cc05, cc05) + LDF [BO + 14 * SIZE], b7 + FMADD (aa2, bb8, cc07, cc07) + LDF [BO + 15 * SIZE], b8 + + LDF [AO + 5 * SIZE], a2 + add AO, 4 * SIZE, AO + + FMADD (aa3, bb9, cc01, cc01) + LDF [BO + 24 * SIZE], b9 + FMADD (aa3, bb2, cc03, cc03) + LDF [BO + 17 * SIZE], b2 + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 18 * SIZE], b3 + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 19 * SIZE], b4 + + LDF [AO + 2 * SIZE], a3 + add BO, 16 * SIZE, BO + + FMADD (aa4, bb5, cc01, cc01) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb6, cc03, cc03) + LDF [BO + 5 * SIZE], b6 + FMADD (aa4, bb7, cc05, cc05) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc07, cc07) + LDF [BO + 7 * SIZE], b8 + + bg,pt %icc, .LL43 + LDF [AO + 3 * SIZE], a4 + .align 4 + +.LL45: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 4, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL48 + nop + .align 4 + +.LL47: + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 4 * SIZE], b1 + add L, -1, L + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 5 * SIZE], b2 + add AO, 1 * SIZE, AO + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 6 * SIZE], b3 + cmp L, 0 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 7 * SIZE], b4 + add BO, 4 * SIZE, BO + + bg,pt %icc, .LL47 + LDF [AO + 0 * SIZE], a1 + .align 4 + +.LL48: +#ifndef TRMMKERNEL + LDF [C1 + 0 * SIZE], a1 + LDF [C2 + 0 * SIZE], a2 + LDF [C3 + 0 * SIZE], a3 + LDF [C4 + 0 * SIZE], a4 + + FMADD (alpha, cc01, aa1, cc01) + FMADD (alpha, cc03, aa2, cc03) + FMADD (alpha, cc05, aa3, cc05) + FMADD (alpha, cc07, aa4, cc07) +#else + FMUL ALPHA, c01, c01 + FMUL ALPHA, c03, c03 + FMUL ALPHA, c05, c05 + FMUL ALPHA, c07, c07 +#endif + + STF c01, [C1 + 0 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c05, [C3 + 0 * SIZE] + STF c07, [C4 + 0 * SIZE] + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -1, TEMP1 +#else + add TEMP1, -4, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 2, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 1, KK +#endif +#endif + .align 4 + +.LL49: +#if defined(TRMMKERNEL) && !defined(LEFT) + add KK, 4, KK +#endif + mov BO, B + .align 4 + +.LL50: + and N, 2, J + cmp J, 0 + ble,pn %icc, .LL70 + mov C, C1 + + add C, LDC, C2 + add C2, LDC, C + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL60 + mov A, AO + .align 4 + +.LL52: +#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) + mov B, BO +#else + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 1, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + FCLR (cc01) + LDF [BO + 3 * SIZE], b4 + FCLR (cc02) + + LDF [BO + 4 * SIZE], b5 + FCLR (cc03) + LDF [BO + 5 * SIZE], b6 + FCLR (cc04) + LDF [BO + 6 * SIZE], b7 + FCLR (cc05) + LDF [BO + 7 * SIZE], b8 + FCLR (cc06) + + prefetch [C1 + 2 * SIZE], 3 + FCLR (cc07) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc08) + +#ifndef TRMMKERNEL + sra K, 2, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 2, L +#endif + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL55 + nop + .align 4 + +.LL53: + FMADD (aa1, bb1, cc01, cc01) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb1, cc02, cc02) + LDF [BO + 8 * SIZE], b1 + + FMADD (aa1, bb2, cc03, cc03) + LDF [AO + 4 * SIZE], a1 + FMADD (aa2, bb2, cc04, cc04) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb3, cc01, cc01) + LDF [BO + 9 * SIZE], b2 + FMADD (aa4, bb3, cc02, cc02) + LDF [BO + 10 * SIZE], b3 + + FMADD (aa3, bb4, cc03, cc03) + LDF [AO + 6 * SIZE], a3 + FMADD (aa4, bb4, cc04, cc04) + LDF [AO + 7 * SIZE], a4 + + FMADD (aa1, bb5, cc01, cc01) + LDF [BO + 11 * SIZE], b4 + FMADD (aa2, bb5, cc02, cc02) + LDF [BO + 12 * SIZE], b5 + + FMADD (aa1, bb6, cc03, cc03) + LDF [AO + 8 * SIZE], a1 + FMADD (aa2, bb6, cc04, cc04) + LDF [AO + 9 * SIZE], a2 + + FMADD (aa3, bb7, cc01, cc01) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa4, bb7, cc02, cc02) + LDF [BO + 14 * SIZE], b7 + + FMADD (aa3, bb8, cc03, cc03) + LDF [AO + 10 * SIZE], a3 + FMADD (aa4, bb8, cc04, cc04) + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + add L, -1, L + add BO, 8 * SIZE, BO + cmp L, 0 + + bg,pt %icc, .LL53 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL55: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 2, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL58 + nop + .align 4 + +.LL57: + FMADD (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD (aa2, bb1, cc02, cc02) + LDF [BO + 2 * SIZE], b1 + + FMADD (aa1, bb2, cc03, cc03) + LDF [AO + 2 * SIZE], a1 + FMADD (aa2, bb2, cc04, cc04) + LDF [AO + 3 * SIZE], a2 + + add AO, 2 * SIZE, AO + cmp L, 0 + add BO, 2 * SIZE, BO + bg,pt %icc, .LL57 + LDF [BO + 1 * SIZE], b2 + .align 4 + +.LL58: +#ifndef TRMMKERNEL + LDF [C1 + 0 * SIZE], a1 + LDF [C1 + 1 * SIZE], a2 + LDF [C2 + 0 * SIZE], a3 + LDF [C2 + 1 * SIZE], a4 + + FMADD (alpha, cc01, aa1, cc01) + FMADD (alpha, cc02, aa2, cc02) + FMADD (alpha, cc03, aa3, cc03) + FMADD (alpha, cc04, aa4, cc04) +#else + FMUL ALPHA, c01, c01 + FMUL ALPHA, c02, c02 + FMUL ALPHA, c03, c03 + FMUL ALPHA, c04, c04 +#endif + + STF c01, [C1 + 0 * SIZE] + add I, -1, I + STF c02, [C1 + 1 * SIZE] + add C1, 2 * SIZE, C1 + + STF c03, [C2 + 0 * SIZE] + cmp I, 0 + STF c04, [C2 + 1 * SIZE] + add C2, 2 * SIZE, C2 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -2, TEMP1 +#else + add TEMP1, -2, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 1, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 2, KK +#endif +#endif + + bg,pt %icc, .LL52 + nop + .align 4 + +.LL60: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL69 + nop + +#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) + mov B, BO +#else + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 1, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + LDF [BO + 5 * SIZE], b6 + LDF [BO + 6 * SIZE], b7 + FCLR (cc01) + LDF [BO + 7 * SIZE], b8 + FCLR (cc03) + +#ifndef TRMMKERNEL + sra K, 2, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 2, L +#endif + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL65 + nop + .align 4 + +.LL63: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 8 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + + LDF [AO + 4 * SIZE], a1 + cmp L, 0 + + FMADD (aa2, bb3, cc01, cc01) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc03, cc03) + LDF [BO + 11 * SIZE], b4 + + LDF [AO + 5 * SIZE], a2 + add AO, 4 * SIZE, AO + + FMADD (aa3, bb5, cc01, cc01) + LDF [BO + 12 * SIZE], b5 + FMADD (aa3, bb6, cc03, cc03) + LDF [BO + 13 * SIZE], b6 + + LDF [AO + 2 * SIZE], a3 + add BO, 8 * SIZE, BO + + FMADD (aa4, bb7, cc01, cc01) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc03, cc03) + LDF [BO + 7 * SIZE], b8 + + bg,pt %icc, .LL63 + LDF [AO + 3 * SIZE], a4 + .align 4 + +.LL65: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 2, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL68 + nop + .align 4 + +.LL67: + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 2 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 3 * SIZE], b2 + + LDF [AO + 1 * SIZE], a1 + add L, -1, L + add AO, 1 * SIZE, AO + cmp L, 0 + + bg,pt %icc, .LL67 + add BO, 2 * SIZE, BO + .align 4 + +.LL68: +#ifndef TRMMKERNEL + LDF [C1 + 0 * SIZE], a1 + LDF [C2 + 0 * SIZE], a2 + + FMADD (alpha, cc01, aa1, cc01) + FMADD (alpha, cc03, aa2, cc03) +#else + FMUL ALPHA, c01, c01 + FMUL ALPHA, c03, c03 +#endif + + STF c01, [C1 + 0 * SIZE] + STF c03, [C2 + 0 * SIZE] + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -1, TEMP1 +#else + add TEMP1, -2, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 1, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 1, KK +#endif +#endif + .align 4 + +.LL69: +#if defined(TRMMKERNEL) && !defined(LEFT) + add KK, 2, KK +#endif + mov BO, B + .align 4 + +.LL70: + and N, 1, J + cmp J, 0 + ble,pn %icc, .LL999 + mov C, C1 + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL80 + mov A, AO + .align 4 + +.LL72: +#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) + mov B, BO +#else + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 0, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + FCLR (cc01) + LDF [BO + 3 * SIZE], b4 + FCLR (cc02) + + prefetch [C1 + 2 * SIZE], 3 + +#ifndef TRMMKERNEL + sra K, 2, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 1, L +#endif + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL75 + nop + +.LL73: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + FMADD (aa2, bb1, cc02, cc02) + LDF [AO + 5 * SIZE], a2 + + LDF [BO + 4 * SIZE], b1 + cmp L, 0 + + FMADD (aa3, bb2, cc01, cc01) + LDF [AO + 6 * SIZE], a3 + FMADD (aa4, bb2, cc02, cc02) + LDF [AO + 7 * SIZE], a4 + + LDF [BO + 5 * SIZE], b2 + add BO, 4 * SIZE, BO + + FMADD (aa1, bb3, cc01, cc01) + LDF [AO + 8 * SIZE], a1 + FMADD (aa2, bb3, cc02, cc02) + LDF [AO + 9 * SIZE], a2 + + LDF [BO + 2 * SIZE], b3 + add AO, 8 * SIZE, AO + + FMADD (aa3, bb4, cc01, cc01) + LDF [AO + 2 * SIZE], a3 + FMADD (aa4, bb4, cc02, cc02) + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL73 + LDF [BO + 3 * SIZE], b4 + .align 4 + +.LL75: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 1, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL78 + nop + .align 4 + +.LL77: + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 2 * SIZE], a1 + FMADD (aa2, bb1, cc02, cc02) + LDF [AO + 3 * SIZE], a2 + + LDF [BO + 1 * SIZE], b1 + add L, -1, L + add AO, 2 * SIZE, AO + cmp L, 0 + bg,pt %icc, .LL77 + add BO, 1 * SIZE, BO + .align 4 + +.LL78: +#ifndef TRMMKERNEL + LDF [C1 + 0 * SIZE], a1 + LDF [C1 + 1 * SIZE], a2 + + FMADD (alpha, cc01, aa1, cc01) + FMADD (alpha, cc02, aa2, cc02) +#else + FMUL ALPHA, c01, c01 + FMUL ALPHA, c02, c02 +#endif + + STF c01, [C1 + 0 * SIZE] + add I, -1, I + STF c02, [C1 + 1 * SIZE] + cmp I, 0 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -2, TEMP1 +#else + add TEMP1, -1, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 0, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 2, KK +#endif +#endif + + bg,pt %icc, .LL72 + add C1, 2 * SIZE, C1 + .align 4 + +.LL80: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL999 + nop + +#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) + mov B, BO +#else + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 0, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [BO + 0 * SIZE], b1 + LDF [AO + 1 * SIZE], a2 + LDF [BO + 1 * SIZE], b2 + LDF [AO + 2 * SIZE], a3 + LDF [BO + 2 * SIZE], b3 + LDF [AO + 3 * SIZE], a4 + LDF [BO + 3 * SIZE], b4 + +#ifndef TRMMKERNEL + sra K, 2, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 1, L +#endif + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL85 + FCLR (cc01) + .align 4 + +.LL83: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + LDF [BO + 4 * SIZE], b1 + + FMADD (aa2, bb2, cc01, cc01) + LDF [AO + 5 * SIZE], a2 + LDF [BO + 5 * SIZE], b2 + + FMADD (aa3, bb3, cc01, cc01) + LDF [AO + 6 * SIZE], a3 + LDF [BO + 6 * SIZE], b3 + + FMADD (aa4, bb4, cc01, cc01) + LDF [AO + 7 * SIZE], a4 + LDF [BO + 7 * SIZE], b4 + + add AO, 4 * SIZE, AO + cmp L, 0 + + bg,pt %icc, .LL83 + add BO, 4 * SIZE, BO + .align 4 + +.LL85: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 1, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL88 + nop + .align 4 + +.LL87: + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 1 * SIZE], a1 + LDF [BO + 1 * SIZE], b1 + + add AO, 1 * SIZE, AO + add L, -1, L + cmp L, 0 + bg,pt %icc, .LL87 + add BO, 1 * SIZE, BO + .align 4 + +.LL88: +#ifndef TRMMKERNEL + LDF [C1 + 0 * SIZE], a1 + + FMADD (alpha, cc01, aa1, cc01) +#else + FMUL ALPHA, c01, c01 +#endif + + STF c01, [C1 + 0 * SIZE] + .align 4 + +.LL999: +#ifdef TRMMKERNEL +#ifndef __64BIT__ + ld [%sp + STACK_START + 8], %g1 + ld [%sp + STACK_START + 12], %g2 + ld [%sp + STACK_START + 16], %g3 + ld [%sp + STACK_START + 20], %g4 +#else + ldx [%sp + STACK_START + 32], %g1 + ldx [%sp + STACK_START + 40], %g2 + ldx [%sp + STACK_START + 48], %g3 + ldx [%sp + STACK_START + 56], %g4 +#endif +#endif + + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/gemm_ncopy.S b/kernel/sparc/gemm_ncopy.S new file mode 100644 index 0000000..880d39c --- /dev/null +++ b/kernel/sparc/gemm_ncopy.S @@ -0,0 +1,309 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 +#define A %i2 +#define LDA %i3 +#define B %i4 + +#define A1 %l0 +#define A2 %l1 +#define A3 %l2 +#define A4 %l3 + +#define I %l4 +#define J %l5 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 +#endif + + PROLOGUE + SAVESP + + sra N, 2, J + cmp J, 0 + ble,pn %icc, .LL100 + sll LDA, BASE_SHIFT, LDA + +.LL11: + add A, LDA, A2 + mov A, A1 + add A2, LDA, A3 + sra M, 2, I + add A3, LDA, A4 + cmp I, 0 + + ble,pn %icc, .LL15 + add A4, LDA, A + +#define PREFETCHSIZE 36 +#define WPREFETCHSIZE 20 + +.LL12: + prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c05 + LDF [A3 + 0 * SIZE], c09 + LDF [A4 + 0 * SIZE], c13 + + prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 1 * SIZE], c02 + LDF [A2 + 1 * SIZE], c06 + LDF [A3 + 1 * SIZE], c10 + LDF [A4 + 1 * SIZE], c14 + + prefetch [A3 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 2 * SIZE], c03 + LDF [A2 + 2 * SIZE], c07 + LDF [A3 + 2 * SIZE], c11 + LDF [A4 + 2 * SIZE], c15 + + prefetch [A4 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 3 * SIZE], c04 + LDF [A2 + 3 * SIZE], c08 + LDF [A3 + 3 * SIZE], c12 + LDF [A4 + 3 * SIZE], c16 + + prefetch [B + (WPREFETCHSIZE + 0) * SIZE], 2 + STF c01, [B + 0 * SIZE] + add A1, 4 * SIZE, A1 + STF c05, [B + 1 * SIZE] + add A2, 4 * SIZE, A2 + STF c09, [B + 2 * SIZE] + add A3, 4 * SIZE, A3 + STF c13, [B + 3 * SIZE] + add A4, 4 * SIZE, A4 + STF c02, [B + 4 * SIZE] + add I, -1, I + STF c06, [B + 5 * SIZE] + cmp I, 0 + STF c10, [B + 6 * SIZE] + STF c14, [B + 7 * SIZE] +#ifdef DOUBLE + prefetch [B + (WPREFETCHSIZE + 8) * SIZE], 2 +#endif + STF c03, [B + 8 * SIZE] + STF c07, [B + 9 * SIZE] + STF c11, [B + 10 * SIZE] + STF c15, [B + 11 * SIZE] + STF c04, [B + 12 * SIZE] + STF c08, [B + 13 * SIZE] + STF c12, [B + 14 * SIZE] + STF c16, [B + 15 * SIZE] + bg,pt %icc, .LL12 + add B, 16 * SIZE, B + +.LL15: + and M, 3, I + cmp I, 0 + ble,pn %icc, .LL99 + nop + +.LL16: + LDF [A1 + 0 * SIZE], c01 + add A1, 1 * SIZE, A1 + LDF [A2 + 0 * SIZE], c05 + add A2, 1 * SIZE, A2 + LDF [A3 + 0 * SIZE], c09 + add A3, 1 * SIZE, A3 + LDF [A4 + 0 * SIZE], c13 + add A4, 1 * SIZE, A4 + + STF c01, [B + 0 * SIZE] + add I, -1, I + STF c05, [B + 1 * SIZE] + cmp I, 0 + STF c09, [B + 2 * SIZE] + STF c13, [B + 3 * SIZE] + bg,pt %icc, .LL16 + add B, 4 * SIZE, B + +.LL99: + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + +.LL100: + and N, 2, J + cmp J, 0 + ble,pn %icc, .LL200 + nop + +.LL111: + sra M, 2, I + add A, LDA, A2 + cmp I, 0 + mov A, A1 + + ble,pn %icc, .LL115 + add A2, LDA, A + +.LL112: + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c05 + LDF [A1 + 1 * SIZE], c02 + LDF [A2 + 1 * SIZE], c06 + + LDF [A1 + 2 * SIZE], c03 + LDF [A2 + 2 * SIZE], c07 + LDF [A1 + 3 * SIZE], c04 + LDF [A2 + 3 * SIZE], c08 + + STF c01, [B + 0 * SIZE] + add A1, 4 * SIZE, A1 + STF c05, [B + 1 * SIZE] + add A2, 4 * SIZE, A2 + STF c02, [B + 2 * SIZE] + add I, -1, I + STF c06, [B + 3 * SIZE] + cmp I, 0 + STF c03, [B + 4 * SIZE] + STF c07, [B + 5 * SIZE] + STF c04, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] + + bg,pt %icc, .LL112 + add B, 8 * SIZE, B + +.LL115: + and M, 3, I + cmp I, 0 + ble,pn %icc, .LL200 + nop + +.LL116: + LDF [A1 + 0 * SIZE], c01 + add A1, 1 * SIZE, A1 + add I, -1, I + LDF [A2 + 0 * SIZE], c05 + add A2, 1 * SIZE, A2 + cmp I, 0 + + STF c01, [B + 0 * SIZE] + STF c05, [B + 1 * SIZE] + bg,pt %icc, .LL116 + add B, 2 * SIZE, B + +.LL200: + and N, 1, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + +.LL211: + sra M, 2, I + cmp I, 0 + ble,pn %icc, .LL215 + mov A, A1 + +.LL212: + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A1 + 2 * SIZE], c03 + LDF [A1 + 3 * SIZE], c04 + + STF c01, [B + 0 * SIZE] + add I, -1, I + STF c02, [B + 1 * SIZE] + cmp I, 0 + STF c03, [B + 2 * SIZE] + add A1, 4 * SIZE, A1 + STF c04, [B + 3 * SIZE] + + bg,pt %icc, .LL212 + add B, 4 * SIZE, B + +.LL215: + and M, 3, I + cmp I, 0 + ble,pn %icc, .LL999 + nop + +.LL216: + LDF [A1 + 0 * SIZE], c01 + add A1, 1 * SIZE, A1 + add I, -1, I + cmp I, 0 + + STF c01, [B + 0 * SIZE] + bg,pt %icc, .LL216 + add B, 1 * SIZE, B + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/gemm_ncopy_2.S b/kernel/sparc/gemm_ncopy_2.S new file mode 100644 index 0000000..b52e71d --- /dev/null +++ b/kernel/sparc/gemm_ncopy_2.S @@ -0,0 +1,235 @@ +/*********************************************************************/ +/* Copyright 2005-2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE 72 +#define WPREFETCHSIZE 20 + +#define M %i0 +#define N %i1 +#define A %i2 +#define LDA %i3 +#define B %i4 + +#define A1 %l0 +#define A2 %l1 +#define A3 %l2 +#define A4 %l3 + +#define I %l4 +#define J %l5 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 +#endif + + PROLOGUE + SAVESP + + sra N, 1, J + cmp J, 0 + ble,pn %icc, .LL100 + sll LDA, BASE_SHIFT, LDA + +.LL11: + add A, LDA, A2 + mov A, A1 + sra M, 3, I + cmp I, 0 + + ble,pn %icc, .LL15 + add A2, LDA, A + +.LL12: + prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + LDF [A1 + 1 * SIZE], c03 + LDF [A2 + 1 * SIZE], c04 + LDF [A1 + 2 * SIZE], c05 + LDF [A2 + 2 * SIZE], c06 + LDF [A1 + 3 * SIZE], c07 + LDF [A2 + 3 * SIZE], c08 + + prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 4 * SIZE], c09 + LDF [A2 + 4 * SIZE], c10 + LDF [A1 + 5 * SIZE], c11 + LDF [A2 + 5 * SIZE], c12 + LDF [A1 + 6 * SIZE], c13 + LDF [A2 + 6 * SIZE], c14 + LDF [A1 + 7 * SIZE], c15 + LDF [A2 + 7 * SIZE], c16 + + add A1, 8 * SIZE, A1 + add I, -1, I + add A2, 8 * SIZE, A2 + cmp I, 0 + + prefetch [B + (WPREFETCHSIZE + 0) * SIZE], 2 + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + STF c05, [B + 4 * SIZE] + STF c06, [B + 5 * SIZE] + STF c07, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] + + prefetch [B + (WPREFETCHSIZE + 8) * SIZE], 2 + STF c09, [B + 8 * SIZE] + STF c10, [B + 9 * SIZE] + STF c11, [B + 10 * SIZE] + STF c12, [B + 11 * SIZE] + STF c13, [B + 12 * SIZE] + STF c14, [B + 13 * SIZE] + STF c15, [B + 14 * SIZE] + STF c16, [B + 15 * SIZE] + + bg,pt %icc, .LL12 + add B, 16 * SIZE, B + +.LL15: + and M, 7, I + cmp I, 0 + ble,pn %icc, .LL99 + nop + +.LL16: + LDF [A1 + 0 * SIZE], c01 + add A1, 1 * SIZE, A1 + LDF [A2 + 0 * SIZE], c02 + add A2, 1 * SIZE, A2 + + STF c01, [B + 0 * SIZE] + add I, -1, I + STF c02, [B + 1 * SIZE] + cmp I, 0 + bg,pt %icc, .LL16 + add B, 2 * SIZE, B + +.LL99: + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + +.LL100: + and N, 1, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + +.LL111: + sra M, 2, I + cmp I, 0 + ble,pn %icc, .LL115 + mov A, A1 + +.LL112: + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A1 + 2 * SIZE], c03 + LDF [A1 + 3 * SIZE], c04 + + STF c01, [B + 0 * SIZE] + add I, -1, I + STF c02, [B + 1 * SIZE] + cmp I, 0 + STF c03, [B + 2 * SIZE] + add A1, 4 * SIZE, A1 + STF c04, [B + 3 * SIZE] + + bg,pt %icc, .LL112 + add B, 4 * SIZE, B + +.LL115: + and M, 3, I + cmp I, 0 + ble,pn %icc, .LL999 + nop + +.LL116: + LDF [A1 + 0 * SIZE], c01 + add A1, 1 * SIZE, A1 + add I, -1, I + cmp I, 0 + + STF c01, [B + 0 * SIZE] + bg,pt %icc, .LL116 + add B, 1 * SIZE, B + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/gemm_ncopy_8.S b/kernel/sparc/gemm_ncopy_8.S new file mode 100644 index 0000000..f55195f --- /dev/null +++ b/kernel/sparc/gemm_ncopy_8.S @@ -0,0 +1,921 @@ +/*********************************************************************/ +/* Copyright 2005-2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE 42 +#define WPREFETCHSIZE 20 + +#define M %i0 +#define N %i1 +#define A %i2 +#define LDA %i3 +#define B %i4 + +#define A1 %l0 +#define A2 %l1 +#define A3 %l2 +#define A4 %l3 +#define A5 %o0 +#define A6 %o1 +#define A7 %o2 +#define A8 %o3 + +#define I %l4 +#define J %l5 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 +#endif + + PROLOGUE + SAVESP + + sra N, 3, J + cmp J, 0 + ble,pn %icc, .LL20 + sll LDA, BASE_SHIFT, LDA + +.LL11: + add A, LDA, A2 + mov A, A1 + add A2, LDA, A3 + sra M, 3, I + add A3, LDA, A4 + cmp I, 0 + + add A4, LDA, A5 + add A5, LDA, A6 + add A6, LDA, A7 + add A7, LDA, A8 + + ble,pn %icc, .LL13 + add A8, LDA, A + .align 4 + +.LL12: + prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + LDF [A3 + 0 * SIZE], c03 + LDF [A4 + 0 * SIZE], c04 + LDF [A5 + 0 * SIZE], c05 + LDF [A6 + 0 * SIZE], c06 + LDF [A7 + 0 * SIZE], c07 + LDF [A8 + 0 * SIZE], c08 + + prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 1 * SIZE], c09 + LDF [A2 + 1 * SIZE], c10 + LDF [A3 + 1 * SIZE], c11 + LDF [A4 + 1 * SIZE], c12 + LDF [A5 + 1 * SIZE], c13 + LDF [A6 + 1 * SIZE], c14 + LDF [A7 + 1 * SIZE], c15 + LDF [A8 + 1 * SIZE], c16 + + prefetch [B + (WPREFETCHSIZE + 0) * SIZE], 2 + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + STF c05, [B + 4 * SIZE] + STF c06, [B + 5 * SIZE] + STF c07, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] + + prefetch [B + (WPREFETCHSIZE + 8) * SIZE], 2 + STF c09, [B + 8 * SIZE] + STF c10, [B + 9 * SIZE] + STF c11, [B + 10 * SIZE] + STF c12, [B + 11 * SIZE] + STF c13, [B + 12 * SIZE] + STF c14, [B + 13 * SIZE] + STF c15, [B + 14 * SIZE] + STF c16, [B + 15 * SIZE] + + prefetch [A3 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 2 * SIZE], c01 + LDF [A2 + 2 * SIZE], c02 + LDF [A3 + 2 * SIZE], c03 + LDF [A4 + 2 * SIZE], c04 + LDF [A5 + 2 * SIZE], c05 + LDF [A6 + 2 * SIZE], c06 + LDF [A7 + 2 * SIZE], c07 + LDF [A8 + 2 * SIZE], c08 + + prefetch [A4 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 3 * SIZE], c09 + LDF [A2 + 3 * SIZE], c10 + LDF [A3 + 3 * SIZE], c11 + LDF [A4 + 3 * SIZE], c12 + LDF [A5 + 3 * SIZE], c13 + LDF [A6 + 3 * SIZE], c14 + LDF [A7 + 3 * SIZE], c15 + LDF [A8 + 3 * SIZE], c16 + + prefetch [B + (WPREFETCHSIZE + 16) * SIZE], 2 + STF c01, [B + 16 * SIZE] + STF c02, [B + 17 * SIZE] + STF c03, [B + 18 * SIZE] + STF c04, [B + 19 * SIZE] + STF c05, [B + 20 * SIZE] + STF c06, [B + 21 * SIZE] + STF c07, [B + 22 * SIZE] + STF c08, [B + 23 * SIZE] + + prefetch [B + (WPREFETCHSIZE + 24) * SIZE], 2 + STF c09, [B + 24 * SIZE] + STF c10, [B + 25 * SIZE] + STF c11, [B + 26 * SIZE] + STF c12, [B + 27 * SIZE] + STF c13, [B + 28 * SIZE] + STF c14, [B + 29 * SIZE] + STF c15, [B + 30 * SIZE] + STF c16, [B + 31 * SIZE] + + prefetch [A5 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 4 * SIZE], c01 + LDF [A2 + 4 * SIZE], c02 + LDF [A3 + 4 * SIZE], c03 + LDF [A4 + 4 * SIZE], c04 + LDF [A5 + 4 * SIZE], c05 + LDF [A6 + 4 * SIZE], c06 + LDF [A7 + 4 * SIZE], c07 + LDF [A8 + 4 * SIZE], c08 + + prefetch [A6 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 5 * SIZE], c09 + LDF [A2 + 5 * SIZE], c10 + LDF [A3 + 5 * SIZE], c11 + LDF [A4 + 5 * SIZE], c12 + LDF [A5 + 5 * SIZE], c13 + LDF [A6 + 5 * SIZE], c14 + LDF [A7 + 5 * SIZE], c15 + LDF [A8 + 5 * SIZE], c16 + + prefetch [B + (WPREFETCHSIZE + 32) * SIZE], 2 + STF c01, [B + 32 * SIZE] + STF c02, [B + 33 * SIZE] + STF c03, [B + 34 * SIZE] + STF c04, [B + 35 * SIZE] + STF c05, [B + 36 * SIZE] + STF c06, [B + 37 * SIZE] + STF c07, [B + 38 * SIZE] + STF c08, [B + 39 * SIZE] + + prefetch [B + (WPREFETCHSIZE + 40) * SIZE], 2 + STF c09, [B + 40 * SIZE] + STF c10, [B + 41 * SIZE] + STF c11, [B + 42 * SIZE] + STF c12, [B + 43 * SIZE] + STF c13, [B + 44 * SIZE] + STF c14, [B + 45 * SIZE] + STF c15, [B + 46 * SIZE] + STF c16, [B + 47 * SIZE] + + prefetch [A7 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 6 * SIZE], c01 + LDF [A2 + 6 * SIZE], c02 + LDF [A3 + 6 * SIZE], c03 + LDF [A4 + 6 * SIZE], c04 + LDF [A5 + 6 * SIZE], c05 + LDF [A6 + 6 * SIZE], c06 + LDF [A7 + 6 * SIZE], c07 + LDF [A8 + 6 * SIZE], c08 + + prefetch [A8 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 7 * SIZE], c09 + LDF [A2 + 7 * SIZE], c10 + LDF [A3 + 7 * SIZE], c11 + LDF [A4 + 7 * SIZE], c12 + LDF [A5 + 7 * SIZE], c13 + LDF [A6 + 7 * SIZE], c14 + LDF [A7 + 7 * SIZE], c15 + LDF [A8 + 7 * SIZE], c16 + + add A1, 8 * SIZE, A1 + add A2, 8 * SIZE, A2 + add A3, 8 * SIZE, A3 + add A4, 8 * SIZE, A4 + + prefetch [B + (WPREFETCHSIZE + 48) * SIZE], 2 + STF c01, [B + 48 * SIZE] + STF c02, [B + 49 * SIZE] + STF c03, [B + 50 * SIZE] + STF c04, [B + 51 * SIZE] + STF c05, [B + 52 * SIZE] + STF c06, [B + 53 * SIZE] + STF c07, [B + 54 * SIZE] + STF c08, [B + 55 * SIZE] + + add A5, 8 * SIZE, A5 + add A6, 8 * SIZE, A6 + add A7, 8 * SIZE, A7 + add A8, 8 * SIZE, A8 + + prefetch [B + (WPREFETCHSIZE + 56) * SIZE], 2 + STF c09, [B + 56 * SIZE] + STF c10, [B + 57 * SIZE] + STF c11, [B + 58 * SIZE] + STF c12, [B + 59 * SIZE] + STF c13, [B + 60 * SIZE] + STF c14, [B + 61 * SIZE] + STF c15, [B + 62 * SIZE] + STF c16, [B + 63 * SIZE] + + add I, -1, I + cmp I, 0 + + bg,pt %icc, .LL12 + add B, 64 * SIZE, B + .align 4 + +.LL13: + and M, 4, I + cmp I, 0 + ble,pn %icc, .LL14 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + LDF [A3 + 0 * SIZE], c03 + LDF [A4 + 0 * SIZE], c04 + LDF [A5 + 0 * SIZE], c05 + LDF [A6 + 0 * SIZE], c06 + LDF [A7 + 0 * SIZE], c07 + LDF [A8 + 0 * SIZE], c08 + + LDF [A1 + 1 * SIZE], c09 + LDF [A2 + 1 * SIZE], c10 + LDF [A3 + 1 * SIZE], c11 + LDF [A4 + 1 * SIZE], c12 + LDF [A5 + 1 * SIZE], c13 + LDF [A6 + 1 * SIZE], c14 + LDF [A7 + 1 * SIZE], c15 + LDF [A8 + 1 * SIZE], c16 + + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + STF c05, [B + 4 * SIZE] + STF c06, [B + 5 * SIZE] + STF c07, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] + + STF c09, [B + 8 * SIZE] + STF c10, [B + 9 * SIZE] + STF c11, [B + 10 * SIZE] + STF c12, [B + 11 * SIZE] + STF c13, [B + 12 * SIZE] + STF c14, [B + 13 * SIZE] + STF c15, [B + 14 * SIZE] + STF c16, [B + 15 * SIZE] + + LDF [A1 + 2 * SIZE], c01 + LDF [A2 + 2 * SIZE], c02 + LDF [A3 + 2 * SIZE], c03 + LDF [A4 + 2 * SIZE], c04 + LDF [A5 + 2 * SIZE], c05 + LDF [A6 + 2 * SIZE], c06 + LDF [A7 + 2 * SIZE], c07 + LDF [A8 + 2 * SIZE], c08 + + LDF [A1 + 3 * SIZE], c09 + LDF [A2 + 3 * SIZE], c10 + LDF [A3 + 3 * SIZE], c11 + LDF [A4 + 3 * SIZE], c12 + LDF [A5 + 3 * SIZE], c13 + LDF [A6 + 3 * SIZE], c14 + LDF [A7 + 3 * SIZE], c15 + LDF [A8 + 3 * SIZE], c16 + + STF c01, [B + 16 * SIZE] + STF c02, [B + 17 * SIZE] + STF c03, [B + 18 * SIZE] + STF c04, [B + 19 * SIZE] + STF c05, [B + 20 * SIZE] + STF c06, [B + 21 * SIZE] + STF c07, [B + 22 * SIZE] + STF c08, [B + 23 * SIZE] + + STF c09, [B + 24 * SIZE] + STF c10, [B + 25 * SIZE] + STF c11, [B + 26 * SIZE] + STF c12, [B + 27 * SIZE] + STF c13, [B + 28 * SIZE] + STF c14, [B + 29 * SIZE] + STF c15, [B + 30 * SIZE] + STF c16, [B + 31 * SIZE] + + add A1, 4 * SIZE, A1 + add A2, 4 * SIZE, A2 + add A3, 4 * SIZE, A3 + add A4, 4 * SIZE, A4 + + add A5, 4 * SIZE, A5 + add A6, 4 * SIZE, A6 + add A7, 4 * SIZE, A7 + add A8, 4 * SIZE, A8 + + add B, 32 * SIZE, B + .align 4 + +.LL14: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + LDF [A3 + 0 * SIZE], c03 + LDF [A4 + 0 * SIZE], c04 + LDF [A5 + 0 * SIZE], c05 + LDF [A6 + 0 * SIZE], c06 + LDF [A7 + 0 * SIZE], c07 + LDF [A8 + 0 * SIZE], c08 + + LDF [A1 + 1 * SIZE], c09 + LDF [A2 + 1 * SIZE], c10 + LDF [A3 + 1 * SIZE], c11 + LDF [A4 + 1 * SIZE], c12 + LDF [A5 + 1 * SIZE], c13 + LDF [A6 + 1 * SIZE], c14 + LDF [A7 + 1 * SIZE], c15 + LDF [A8 + 1 * SIZE], c16 + + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + STF c05, [B + 4 * SIZE] + STF c06, [B + 5 * SIZE] + STF c07, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] + + STF c09, [B + 8 * SIZE] + STF c10, [B + 9 * SIZE] + STF c11, [B + 10 * SIZE] + STF c12, [B + 11 * SIZE] + STF c13, [B + 12 * SIZE] + STF c14, [B + 13 * SIZE] + STF c15, [B + 14 * SIZE] + STF c16, [B + 15 * SIZE] + + add A1, 2 * SIZE, A1 + add A2, 2 * SIZE, A2 + add A3, 2 * SIZE, A3 + add A4, 2 * SIZE, A4 + + add A5, 2 * SIZE, A5 + add A6, 2 * SIZE, A6 + add A7, 2 * SIZE, A7 + add A8, 2 * SIZE, A8 + + add B, 16 * SIZE, B + .align 4 + +.LL15: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL19 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + LDF [A3 + 0 * SIZE], c03 + LDF [A4 + 0 * SIZE], c04 + LDF [A5 + 0 * SIZE], c05 + LDF [A6 + 0 * SIZE], c06 + LDF [A7 + 0 * SIZE], c07 + LDF [A8 + 0 * SIZE], c08 + + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + STF c05, [B + 4 * SIZE] + STF c06, [B + 5 * SIZE] + STF c07, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] + + add B, 8 * SIZE, B + .align 4 + +.LL19: + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + .align 4 + +.LL20: + and N, 4, J + cmp J, 0 + ble,pn %icc, .LL30 + nop + + add A, LDA, A2 + mov A, A1 + add A2, LDA, A3 + sra M, 3, I + add A3, LDA, A4 + cmp I, 0 + + ble,pn %icc, .LL23 + add A4, LDA, A + .align 4 + +.LL22: + prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + LDF [A3 + 0 * SIZE], c03 + LDF [A4 + 0 * SIZE], c04 + LDF [A1 + 1 * SIZE], c05 + LDF [A2 + 1 * SIZE], c06 + LDF [A3 + 1 * SIZE], c07 + LDF [A4 + 1 * SIZE], c08 + + prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 2 * SIZE], c09 + LDF [A2 + 2 * SIZE], c10 + LDF [A3 + 2 * SIZE], c11 + LDF [A4 + 2 * SIZE], c12 + LDF [A1 + 3 * SIZE], c13 + LDF [A2 + 3 * SIZE], c14 + LDF [A3 + 3 * SIZE], c15 + LDF [A4 + 3 * SIZE], c16 + + prefetch [B + (WPREFETCHSIZE + 0) * SIZE], 2 + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + STF c05, [B + 4 * SIZE] + STF c06, [B + 5 * SIZE] + STF c07, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] + + prefetch [B + (WPREFETCHSIZE + 8) * SIZE], 2 + STF c09, [B + 8 * SIZE] + STF c10, [B + 9 * SIZE] + STF c11, [B + 10 * SIZE] + STF c12, [B + 11 * SIZE] + STF c13, [B + 12 * SIZE] + STF c14, [B + 13 * SIZE] + STF c15, [B + 14 * SIZE] + STF c16, [B + 15 * SIZE] + + prefetch [A3 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 4 * SIZE], c01 + LDF [A2 + 4 * SIZE], c02 + LDF [A3 + 4 * SIZE], c03 + LDF [A4 + 4 * SIZE], c04 + LDF [A1 + 5 * SIZE], c05 + LDF [A2 + 5 * SIZE], c06 + LDF [A3 + 5 * SIZE], c07 + LDF [A4 + 5 * SIZE], c08 + + prefetch [A4 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 6 * SIZE], c09 + LDF [A2 + 6 * SIZE], c10 + LDF [A3 + 6 * SIZE], c11 + LDF [A4 + 6 * SIZE], c12 + LDF [A1 + 7 * SIZE], c13 + LDF [A2 + 7 * SIZE], c14 + LDF [A3 + 7 * SIZE], c15 + LDF [A4 + 7 * SIZE], c16 + + prefetch [B + (WPREFETCHSIZE + 16) * SIZE], 2 + STF c01, [B + 16 * SIZE] + STF c02, [B + 17 * SIZE] + STF c03, [B + 18 * SIZE] + STF c04, [B + 19 * SIZE] + STF c05, [B + 20 * SIZE] + STF c06, [B + 21 * SIZE] + STF c07, [B + 22 * SIZE] + STF c08, [B + 23 * SIZE] + + prefetch [B + (WPREFETCHSIZE + 24) * SIZE], 2 + STF c09, [B + 24 * SIZE] + STF c10, [B + 25 * SIZE] + STF c11, [B + 26 * SIZE] + STF c12, [B + 27 * SIZE] + STF c13, [B + 28 * SIZE] + STF c14, [B + 29 * SIZE] + STF c15, [B + 30 * SIZE] + STF c16, [B + 31 * SIZE] + + add A1, 8 * SIZE, A1 + add A2, 8 * SIZE, A2 + add A3, 8 * SIZE, A3 + add A4, 8 * SIZE, A4 + + add I, -1, I + cmp I, 0 + + bg,pt %icc, .LL22 + add B, 32 * SIZE, B + .align 4 + +.LL23: + and M, 4, I + cmp I, 0 + ble,pn %icc, .LL24 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + LDF [A3 + 0 * SIZE], c03 + LDF [A4 + 0 * SIZE], c04 + LDF [A1 + 1 * SIZE], c05 + LDF [A2 + 1 * SIZE], c06 + LDF [A3 + 1 * SIZE], c07 + LDF [A4 + 1 * SIZE], c08 + + LDF [A1 + 2 * SIZE], c09 + LDF [A2 + 2 * SIZE], c10 + LDF [A3 + 2 * SIZE], c11 + LDF [A4 + 2 * SIZE], c12 + LDF [A1 + 3 * SIZE], c13 + LDF [A2 + 3 * SIZE], c14 + LDF [A3 + 3 * SIZE], c15 + LDF [A4 + 3 * SIZE], c16 + + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + STF c05, [B + 4 * SIZE] + STF c06, [B + 5 * SIZE] + STF c07, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] + + STF c09, [B + 8 * SIZE] + STF c10, [B + 9 * SIZE] + STF c11, [B + 10 * SIZE] + STF c12, [B + 11 * SIZE] + STF c13, [B + 12 * SIZE] + STF c14, [B + 13 * SIZE] + STF c15, [B + 14 * SIZE] + STF c16, [B + 15 * SIZE] + + add A1, 4 * SIZE, A1 + add A2, 4 * SIZE, A2 + add A3, 4 * SIZE, A3 + add A4, 4 * SIZE, A4 + + add B, 16 * SIZE, B + .align 4 + +.LL24: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL25 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + LDF [A3 + 0 * SIZE], c03 + LDF [A4 + 0 * SIZE], c04 + LDF [A1 + 1 * SIZE], c05 + LDF [A2 + 1 * SIZE], c06 + LDF [A3 + 1 * SIZE], c07 + LDF [A4 + 1 * SIZE], c08 + + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + STF c05, [B + 4 * SIZE] + STF c06, [B + 5 * SIZE] + STF c07, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] + + add A1, 2 * SIZE, A1 + add A2, 2 * SIZE, A2 + add A3, 2 * SIZE, A3 + add A4, 2 * SIZE, A4 + + add B, 8 * SIZE, B + .align 4 + +.LL25: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL30 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + LDF [A3 + 0 * SIZE], c03 + LDF [A4 + 0 * SIZE], c04 + + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + + add B, 4 * SIZE, B + .align 4 + +.LL30: + and N, 2, J + cmp J, 0 + ble,pn %icc, .LL40 + nop + + add A, LDA, A2 + mov A, A1 + sra M, 3, I + cmp I, 0 + + ble,pn %icc, .LL33 + add A2, LDA, A + .align 4 + +.LL32: + prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + LDF [A1 + 1 * SIZE], c03 + LDF [A2 + 1 * SIZE], c04 + LDF [A1 + 2 * SIZE], c05 + LDF [A2 + 2 * SIZE], c06 + LDF [A1 + 3 * SIZE], c07 + LDF [A2 + 3 * SIZE], c08 + + prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 4 * SIZE], c09 + LDF [A2 + 4 * SIZE], c10 + LDF [A1 + 5 * SIZE], c11 + LDF [A2 + 5 * SIZE], c12 + LDF [A1 + 6 * SIZE], c13 + LDF [A2 + 6 * SIZE], c14 + LDF [A1 + 7 * SIZE], c15 + LDF [A2 + 7 * SIZE], c16 + + prefetch [B + (WPREFETCHSIZE + 0) * SIZE], 2 + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + STF c05, [B + 4 * SIZE] + STF c06, [B + 5 * SIZE] + STF c07, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] + + prefetch [B + (WPREFETCHSIZE + 8) * SIZE], 2 + STF c09, [B + 8 * SIZE] + STF c10, [B + 9 * SIZE] + STF c11, [B + 10 * SIZE] + STF c12, [B + 11 * SIZE] + STF c13, [B + 12 * SIZE] + STF c14, [B + 13 * SIZE] + STF c15, [B + 14 * SIZE] + STF c16, [B + 15 * SIZE] + + add A1, 8 * SIZE, A1 + add A2, 8 * SIZE, A2 + + add I, -1, I + cmp I, 0 + + bg,pt %icc, .LL32 + add B, 16 * SIZE, B + .align 4 + +.LL33: + and M, 4, I + cmp I, 0 + ble,pn %icc, .LL34 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + LDF [A1 + 1 * SIZE], c03 + LDF [A2 + 1 * SIZE], c04 + LDF [A1 + 2 * SIZE], c05 + LDF [A2 + 2 * SIZE], c06 + LDF [A1 + 3 * SIZE], c07 + LDF [A2 + 3 * SIZE], c08 + + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + STF c05, [B + 4 * SIZE] + STF c06, [B + 5 * SIZE] + STF c07, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] + + add A1, 4 * SIZE, A1 + add A2, 4 * SIZE, A2 + + add B, 8 * SIZE, B + .align 4 + +.LL34: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL35 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + LDF [A1 + 1 * SIZE], c03 + LDF [A2 + 1 * SIZE], c04 + + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + + add A1, 2 * SIZE, A1 + add A2, 2 * SIZE, A2 + + add B, 4 * SIZE, B + .align 4 + +.LL35: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL40 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + + add B, 2 * SIZE, B + .align 4 + +.LL40: + and N, 1, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + + sra M, 3, I + cmp I, 0 + + ble,pn %icc, .LL43 + mov A, A1 + .align 4 + +.LL42: + prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A1 + 2 * SIZE], c03 + LDF [A1 + 3 * SIZE], c04 + LDF [A1 + 4 * SIZE], c05 + LDF [A1 + 5 * SIZE], c06 + LDF [A1 + 6 * SIZE], c07 + LDF [A1 + 7 * SIZE], c08 + + prefetch [B + (WPREFETCHSIZE + 0) * SIZE], 2 + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + STF c05, [B + 4 * SIZE] + STF c06, [B + 5 * SIZE] + STF c07, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] + + add A1, 8 * SIZE, A1 + + add I, -1, I + cmp I, 0 + + bg,pt %icc, .LL42 + add B, 8 * SIZE, B + .align 4 + +.LL43: + and M, 4, I + cmp I, 0 + ble,pn %icc, .LL44 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A1 + 2 * SIZE], c03 + LDF [A1 + 3 * SIZE], c04 + + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + + add A1, 4 * SIZE, A1 + + add B, 4 * SIZE, B + .align 4 + +.LL44: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL45 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + + add A1, 2 * SIZE, A1 + + add B, 2 * SIZE, B + .align 4 + +.LL45: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL999 + nop + + LDF [A1 + 0 * SIZE], c01 + STF c01, [B + 0 * SIZE] + .align 4 + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/gemm_tcopy.S b/kernel/sparc/gemm_tcopy.S new file mode 100644 index 0000000..9838a53 --- /dev/null +++ b/kernel/sparc/gemm_tcopy.S @@ -0,0 +1,376 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 +#define A %i2 +#define LDA %i3 +#define B %i4 + +#define A1 %l0 +#define A2 %l1 +#define A3 %l2 +#define A4 %l3 + +#define I %l4 +#define J %l5 + +#define B1 %o0 +#define B2 %o1 +#define B3 %o3 +#define M4 %o4 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 +#endif + + PROLOGUE + SAVESP + + sll M, BASE_SHIFT + 2, M4 + + and N, -4, B2 + and N, -2, B3 + sll M, BASE_SHIFT, B1 + smul B1, B2, B2 + smul B1, B3, B3 + add B, B2, B2 + add B, B3, B3 + + sra M, 2, J + cmp J, 0 + ble,pn %icc, .LL100 + sll LDA, BASE_SHIFT, LDA + +.LL11: + add A, LDA, A2 + mov A, A1 + add A2, LDA, A3 + sra N, 2, I + add A3, LDA, A4 + cmp I, 0 + + mov B, B1 + add B, 16 * SIZE, B + + ble,pn %icc, .LL15 + add A4, LDA, A + +#define PREFETCHSIZE 8 + +.LL12: + prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A1 + 2 * SIZE], c03 + LDF [A1 + 3 * SIZE], c04 + + prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A2 + 0 * SIZE], c05 + LDF [A2 + 1 * SIZE], c06 + LDF [A2 + 2 * SIZE], c07 + LDF [A2 + 3 * SIZE], c08 + + prefetch [A3 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A3 + 0 * SIZE], c09 + LDF [A3 + 1 * SIZE], c10 + LDF [A3 + 2 * SIZE], c11 + LDF [A3 + 3 * SIZE], c12 + + prefetch [A4 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A4 + 0 * SIZE], c13 + LDF [A4 + 1 * SIZE], c14 + LDF [A4 + 2 * SIZE], c15 + LDF [A4 + 3 * SIZE], c16 + + prefetch [B1 + (PREFETCHSIZE + 0) * SIZE], 0 + STF c01, [B1 + 0 * SIZE] + add A1, 4 * SIZE, A1 + STF c02, [B1 + 1 * SIZE] + add A2, 4 * SIZE, A2 + STF c03, [B1 + 2 * SIZE] + add A3, 4 * SIZE, A3 + STF c04, [B1 + 3 * SIZE] + add A4, 4 * SIZE, A4 + STF c05, [B1 + 4 * SIZE] + add I, -1, I + STF c06, [B1 + 5 * SIZE] + cmp I, 0 + STF c07, [B1 + 6 * SIZE] + STF c08, [B1 + 7 * SIZE] + +#ifdef DOUBLE + prefetch [B1 + (PREFETCHSIZE + 8) * SIZE], 0 +#endif + STF c09, [B1 + 8 * SIZE] + STF c10, [B1 + 9 * SIZE] + STF c11, [B1 + 10 * SIZE] + STF c12, [B1 + 11 * SIZE] + STF c13, [B1 + 12 * SIZE] + STF c14, [B1 + 13 * SIZE] + STF c15, [B1 + 14 * SIZE] + STF c16, [B1 + 15 * SIZE] + bg,pt %icc, .LL12 + add B1, M4, B1 + +.LL15: + and N, 2, I + cmp I, 0 + ble,pn %icc, .LL17 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A2 + 0 * SIZE], c03 + LDF [A2 + 1 * SIZE], c04 + + LDF [A3 + 0 * SIZE], c05 + LDF [A3 + 1 * SIZE], c06 + LDF [A4 + 0 * SIZE], c07 + LDF [A4 + 1 * SIZE], c08 + + STF c01, [B2 + 0 * SIZE] + add A1, 2 * SIZE, A1 + STF c02, [B2 + 1 * SIZE] + add A2, 2 * SIZE, A2 + STF c03, [B2 + 2 * SIZE] + add A3, 2 * SIZE, A3 + STF c04, [B2 + 3 * SIZE] + add A4, 2 * SIZE, A4 + STF c05, [B2 + 4 * SIZE] + STF c06, [B2 + 5 * SIZE] + STF c07, [B2 + 6 * SIZE] + STF c08, [B2 + 7 * SIZE] + add B2, 8 * SIZE, B2 + +.LL17: + and N, 1, I + cmp I, 0 + ble,pn %icc, .LL99 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + LDF [A3 + 0 * SIZE], c03 + LDF [A4 + 0 * SIZE], c04 + + STF c01, [B3 + 0 * SIZE] + STF c02, [B3 + 1 * SIZE] + STF c03, [B3 + 2 * SIZE] + STF c04, [B3 + 3 * SIZE] + add B3, 4 * SIZE, B3 + +.LL99: + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + +.LL100: + and M, 2, J + cmp J, 0 + ble,pn %icc, .LL200 + nop + +.LL111: + sra N, 2, I + add A, LDA, A2 + cmp I, 0 + mov A, A1 + + mov B, B1 + add B, 8 * SIZE, B + + ble,pn %icc, .LL115 + add A2, LDA, A + +.LL112: + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A1 + 2 * SIZE], c03 + LDF [A1 + 3 * SIZE], c04 + + LDF [A2 + 0 * SIZE], c05 + LDF [A2 + 1 * SIZE], c06 + LDF [A2 + 2 * SIZE], c07 + LDF [A2 + 3 * SIZE], c08 + + STF c01, [B1 + 0 * SIZE] + add A1, 4 * SIZE, A1 + STF c02, [B1 + 1 * SIZE] + add A2, 4 * SIZE, A2 + STF c03, [B1 + 2 * SIZE] + add I, -1, I + STF c04, [B1 + 3 * SIZE] + cmp I, 0 + STF c05, [B1 + 4 * SIZE] + STF c06, [B1 + 5 * SIZE] + STF c07, [B1 + 6 * SIZE] + STF c08, [B1 + 7 * SIZE] + + bg,pt %icc, .LL112 + add B1, M4, B1 + +.LL115: + and N, 2, I + cmp I, 0 + ble,pn %icc, .LL117 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A2 + 0 * SIZE], c03 + LDF [A2 + 1 * SIZE], c04 + + STF c01, [B2 + 0 * SIZE] + add A1, 2 * SIZE, A1 + STF c02, [B2 + 1 * SIZE] + add A2, 2 * SIZE, A2 + STF c03, [B2 + 2 * SIZE] + add I, -1, I + STF c04, [B2 + 3 * SIZE] + cmp I, 0 + add B2, 4 * SIZE, B2 + +.LL117: + and N, 1, I + cmp I, 0 + ble,pn %icc, .LL200 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + + STF c01, [B3 + 0 * SIZE] + STF c02, [B3 + 1 * SIZE] + add B3, 2 * SIZE, B3 + +.LL200: + and M, 1, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + +.LL211: + sra N, 2, I + cmp I, 0 + + mov B, B1 + + ble,pn %icc, .LL215 + mov A, A1 + +.LL212: + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A1 + 2 * SIZE], c03 + LDF [A1 + 3 * SIZE], c04 + + STF c01, [B + 0 * SIZE] + add I, -1, I + STF c02, [B + 1 * SIZE] + cmp I, 0 + STF c03, [B + 2 * SIZE] + add A1, 4 * SIZE, A1 + STF c04, [B + 3 * SIZE] + + bg,pt %icc, .LL212 + add B, M4, B + +.LL215: + and N, 2, I + cmp I, 0 + ble,pn %icc, .LL217 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + + STF c01, [B2 + 0 * SIZE] + STF c02, [B2 + 1 * SIZE] + add A1, 2 * SIZE, A1 + +.LL217: + and N, 1, I + cmp I, 0 + ble,pn %icc, .LL999 + nop + + LDF [A1 + 0 * SIZE], c01 + STF c01, [B3 + 0 * SIZE] + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/gemm_tcopy_2.S b/kernel/sparc/gemm_tcopy_2.S new file mode 100644 index 0000000..aed95f9 --- /dev/null +++ b/kernel/sparc/gemm_tcopy_2.S @@ -0,0 +1,298 @@ +/*********************************************************************/ +/* Copyright 2005-2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE 72 +#define WPREFETCHSIZE 16 + +#define M %i0 +#define N %i1 +#define A %i2 +#define LDA %i3 +#define B %i4 + +#define A1 %l0 +#define A2 %l1 +#define A3 %l2 +#define A4 %l3 + +#define I %l4 +#define J %l5 + +#define B1 %o0 +#define B2 %o1 +#define B3 %o3 +#define M2 %o4 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 +#endif + + PROLOGUE + SAVESP + + sll M, BASE_SHIFT + 1, M2 + + and N, -2, B2 + sll M, BASE_SHIFT, B1 + smul B1, B2, B2 + add B, B2, B2 + + sra M, 1, J + cmp J, 0 + ble,pn %icc, .LL100 + sll LDA, BASE_SHIFT, LDA + +.LL11: + add A, LDA, A2 + mov A, A1 + sra N, 3, I + cmp I, 0 + + mov B, B1 + add B, 4 * SIZE, B + + ble,pn %icc, .LL13 + add A2, LDA, A + .align 4 + +.LL12: + prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A2 + 0 * SIZE], c09 + LDF [A2 + 1 * SIZE], c10 + LDF [A1 + 2 * SIZE], c03 + LDF [A1 + 3 * SIZE], c04 + LDF [A2 + 2 * SIZE], c11 + LDF [A2 + 3 * SIZE], c12 + + prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 4 * SIZE], c05 + LDF [A1 + 5 * SIZE], c06 + LDF [A2 + 4 * SIZE], c13 + LDF [A2 + 5 * SIZE], c14 + LDF [A1 + 6 * SIZE], c07 + LDF [A1 + 7 * SIZE], c08 + LDF [A2 + 6 * SIZE], c15 + LDF [A2 + 7 * SIZE], c16 + + add A1, 8 * SIZE, A1 + add A2, 8 * SIZE, A2 + add I, -1, I + cmp I, 0 + + prefetch [B1 + (WPREFETCHSIZE + 0) * SIZE], 0 + STF c01, [B1 + 0 * SIZE] + STF c02, [B1 + 1 * SIZE] + STF c09, [B1 + 2 * SIZE] + STF c10, [B1 + 3 * SIZE] + add B1, M2, B1 + + prefetch [B1 + (WPREFETCHSIZE + 0) * SIZE], 0 + STF c03, [B1 + 0 * SIZE] + STF c04, [B1 + 1 * SIZE] + STF c11, [B1 + 2 * SIZE] + STF c12, [B1 + 3 * SIZE] + add B1, M2, B1 + + prefetch [B1 + (WPREFETCHSIZE + 0) * SIZE], 0 + STF c05, [B1 + 0 * SIZE] + STF c06, [B1 + 1 * SIZE] + STF c13, [B1 + 2 * SIZE] + STF c14, [B1 + 3 * SIZE] + add B1, M2, B1 + + prefetch [B1 + (WPREFETCHSIZE + 0) * SIZE], 0 + STF c07, [B1 + 0 * SIZE] + STF c08, [B1 + 1 * SIZE] + STF c15, [B1 + 2 * SIZE] + STF c16, [B1 + 3 * SIZE] + + bg,pt %icc, .LL12 + add B1, M2, B1 + +.LL13: + and N, 4, I + cmp I, 0 + ble,pn %icc, .LL14 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A2 + 0 * SIZE], c03 + LDF [A2 + 1 * SIZE], c04 + + LDF [A1 + 2 * SIZE], c05 + LDF [A1 + 3 * SIZE], c06 + LDF [A2 + 2 * SIZE], c07 + LDF [A2 + 3 * SIZE], c08 + + add A1, 4 * SIZE, A1 + add A2, 4 * SIZE, A2 + + STF c01, [B1 + 0 * SIZE] + STF c02, [B1 + 1 * SIZE] + STF c03, [B1 + 2 * SIZE] + STF c04, [B1 + 3 * SIZE] + add B1, M2, B1 + STF c05, [B1 + 0 * SIZE] + STF c06, [B1 + 1 * SIZE] + STF c07, [B1 + 2 * SIZE] + STF c08, [B1 + 3 * SIZE] + add B1, M2, B1 + .align 4 + +.LL14: + and N, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + + LDF [A2 + 0 * SIZE], c03 + LDF [A2 + 1 * SIZE], c04 + + add A1, 2 * SIZE, A1 + add A2, 2 * SIZE, A2 + + STF c01, [B1 + 0 * SIZE] + STF c02, [B1 + 1 * SIZE] + STF c03, [B1 + 2 * SIZE] + STF c04, [B1 + 3 * SIZE] + add B1, M2, B1 + .align 4 + +.LL15: + and N, 1, I + cmp I, 0 + ble,pn %icc, .LL99 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + + STF c01, [B2 + 0 * SIZE] + STF c02, [B2 + 1 * SIZE] + add B2, 2 * SIZE, B2 + +.LL99: + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + +.LL100: + and M, 1, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + +.LL211: + sra N, 1, I + cmp I, 0 + + mov B, B1 + + ble,pn %icc, .LL215 + mov A, A1 + +.LL212: + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + + add A1, 2 * SIZE, A1 + add I, -1, I + cmp I, 0 + + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + + bg,pt %icc, .LL212 + add B, M2, B + +.LL215: + and N, 1, I + cmp I, 0 + ble,pn %icc, .LL999 + nop + + LDF [A1 + 0 * SIZE], c01 + STF c01, [B2 + 0 * SIZE] + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/gemv_n.S b/kernel/sparc/gemv_n.S new file mode 100644 index 0000000..649ef16 --- /dev/null +++ b/kernel/sparc/gemv_n.S @@ -0,0 +1,1400 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 + +#if defined(DOUBLE) && !defined(__64BIT__) +#define A %i5 +#define LDA %i2 +#define X %i3 +#define INCX %i4 +#else +#define A %i4 +#define LDA %i5 +#define X %i2 +#define INCX %i3 +#endif + +#define Y %l0 +#define INCY %l1 +#define BUFFER %l2 + +#define I %l3 +#define J %l5 + +#define A1 %o0 +#define A2 %o1 +#define A3 %o2 +#define A4 %o3 + +#define Y1 %l4 +#define YY %l6 + +#ifdef DOUBLE +#define t1 %f0 +#define t2 %f2 +#define t3 %f4 +#define t4 %f6 + +#define y1 %f8 +#define y2 %f10 +#define y3 %f12 +#define y4 %f14 +#define y5 %f16 +#define y6 %f18 +#define y7 %f20 +#define y8 %f22 + +#define a1 %f24 +#define a2 %f26 +#define a3 %f28 +#define a4 %f30 +#define a5 %f32 +#define a6 %f34 +#define a7 %f36 +#define a8 %f38 + +#define a9 %f40 +#define a10 %f42 +#define a11 %f44 +#define a12 %f46 +#define a13 %f48 +#define a14 %f50 +#define a15 %f52 +#define a16 %f54 + +#define x1 %f56 +#define x2 %f58 +#define x3 %f60 +#define x4 %f62 + +#define FZERO %f52 +#define ALPHA %f54 +#else +#define t1 %f0 +#define t2 %f1 +#define t3 %f2 +#define t4 %f3 + +#define y1 %f4 +#define y2 %f5 +#define y3 %f6 +#define y4 %f7 +#define y5 %f8 +#define y6 %f9 +#define y7 %f10 +#define y8 %f11 + +#define a1 %f12 +#define a2 %f13 +#define a3 %f14 +#define a4 %f15 +#define a5 %f16 +#define a6 %f17 +#define a7 %f18 +#define a8 %f19 + +#define a9 %f20 +#define a10 %f21 +#define a11 %f22 +#define a12 %f23 +#define a13 %f24 +#define a14 %f25 +#define a15 %f26 +#define a16 %f27 + +#define x1 %f28 +#define x2 %f29 +#define x3 %f30 +#define x4 %f31 + +#define FZERO %f26 +#define ALPHA %f27 +#endif + +#ifndef __64BIT__ +#define STACK_FZERO [%sp + STACK_START + 8] +#define STACK_ALPHA [%sp + STACK_START + 16] +#else +#define STACK_FZERO [%sp + STACK_START + 32] +#define STACK_ALPHA [%sp + STACK_START + 40] +#endif + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE + st %i3, [%sp + STACK_START + 16] /* ALPHA */ + st %i4, [%sp + STACK_START + 20] + + ld [%sp + STACK_START + 28], LDA + ld [%sp + STACK_START + 32], X + ld [%sp + STACK_START + 36], INCX + ld [%sp + STACK_START + 40], Y + ld [%sp + STACK_START + 44], INCY + ld [%sp + STACK_START + 48], BUFFER +#else + st %i3, [%sp + STACK_START + 16] /* ALPHA */ + + ld [%sp + STACK_START + 28], X + ld [%sp + STACK_START + 32], INCX + ld [%sp + STACK_START + 36], Y + ld [%sp + STACK_START + 40], INCY + ld [%sp + STACK_START + 44], BUFFER +#endif + LDF [%sp + STACK_START + 16], ALPHA +#else + ldx [%sp + STACK_START + 56], X + ldx [%sp + STACK_START + 64], INCX + ldx [%sp + STACK_START + 72], Y + ldx [%sp + STACK_START + 80], INCY + ldx [%sp + STACK_START + 88], BUFFER +#ifdef DOUBLE + FMOV %f6, ALPHA + STF %f6, STACK_ALPHA +#else + FMOV %f7, ALPHA + STF %f7, STACK_ALPHA +#endif +#endif + + sll LDA, BASE_SHIFT, LDA + + cmp M, 0 + ble %icc, .LL999 + sll INCX, BASE_SHIFT, INCX + cmp N, 0 + ble %icc, .LL999 + sll INCY, BASE_SHIFT, INCY + +#ifdef DOUBLE + FCLR(21) +#else + FCLR(26) +#endif + + cmp INCY, SIZE + be %icc, .LL10 + mov Y, YY + + add M, 7, J + sra J, 3, J + mov BUFFER, YY + mov BUFFER, Y1 + +.LL01: + STF FZERO, [Y1 + 0 * SIZE] + STF FZERO, [Y1 + 1 * SIZE] + STF FZERO, [Y1 + 2 * SIZE] + STF FZERO, [Y1 + 3 * SIZE] + STF FZERO, [Y1 + 4 * SIZE] + STF FZERO, [Y1 + 5 * SIZE] + STF FZERO, [Y1 + 6 * SIZE] + deccc J + STF FZERO, [Y1 + 7 * SIZE] + bg,pn %icc, .LL01 + add Y1, 8 * SIZE, Y1 + +.LL10: + sra N, 2, J + cmp J, 0 + ble,pn %icc, .LL20 + nop + +.LL11: + mov YY, Y1 + + mov A, A1 + add A, LDA, A2 + add A2, LDA, A3 + add A3, LDA, A4 + add A4, LDA, A + + LDF STACK_ALPHA, ALPHA + + LDF [X], x1 + add X, INCX, X + LDF [X], x2 + add X, INCX, X + LDF [X], x3 + add X, INCX, X + LDF [X], x4 + add X, INCX, X + + FMUL ALPHA, x1, x1 + FMUL ALPHA, x2, x2 + FMUL ALPHA, x3, x3 + FMUL ALPHA, x4, x4 + + sra M, 3, I + cmp I, 0 + ble,pn %icc, .LL16 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [A1 + 2 * SIZE], a3 + LDF [A1 + 3 * SIZE], a4 + LDF [A1 + 4 * SIZE], a5 + LDF [A1 + 5 * SIZE], a6 + LDF [A1 + 6 * SIZE], a7 + LDF [A1 + 7 * SIZE], a8 + + LDF [A2 + 0 * SIZE], a9 + LDF [A2 + 1 * SIZE], a10 + LDF [A2 + 2 * SIZE], a11 + LDF [A2 + 3 * SIZE], a12 + LDF [A2 + 4 * SIZE], a13 + LDF [A2 + 5 * SIZE], a14 + LDF [A2 + 6 * SIZE], a15 + LDF [A2 + 7 * SIZE], a16 + + FMUL a1, x1, t1 + LDF [A3 + 0 * SIZE], a1 + FMUL a2, x1, t2 + LDF [A3 + 1 * SIZE], a2 + FMUL a3, x1, t3 + LDF [A3 + 2 * SIZE], a3 + FMUL a4, x1, t4 + LDF [A3 + 3 * SIZE], a4 + + deccc I + ble,pn %icc, .LL13 + nop + nop + nop + nop + +#ifdef DOUBLE +#define PREFETCHSIZE 20 +#else +#define PREFETCHSIZE 40 +#endif + +.LL12: + LDF [Y1 + 0 * SIZE], y1 + LDF [Y1 + 1 * SIZE], y2 + LDF [Y1 + 2 * SIZE], y3 + LDF [Y1 + 3 * SIZE], y4 + LDF [Y1 + 4 * SIZE], y5 + LDF [Y1 + 5 * SIZE], y6 + LDF [Y1 + 6 * SIZE], y7 + LDF [Y1 + 7 * SIZE], y8 + + FADD y1, t1, y1 + prefetch [A1 + PREFETCHSIZE * SIZE], 1 + FMUL a5, x1, t1 + LDF [A3 + 4 * SIZE], a5 + + FADD y2, t2, y2 + nop + FMUL a6, x1, t2 + LDF [A3 + 5 * SIZE], a6 + + FADD y3, t3, y3 + nop + FMUL a7, x1, t3 + LDF [A3 + 6 * SIZE], a7 + + FADD y4, t4, y4 + nop + FMUL a8, x1, t4 + LDF [A3 + 7 * SIZE], a8 + + FADD y5, t1, y5 + nop + FMUL a9, x2, t1 + LDF [A4 + 0 * SIZE], a9 + + FADD y6, t2, y6 + nop + FMUL a10, x2, t2 + LDF [A4 + 1 * SIZE], a10 + + FADD y7, t3, y7 + nop + FMUL a11, x2, t3 + LDF [A4 + 2 * SIZE], a11 + + FADD y8, t4, y8 + nop + FMUL a12, x2, t4 + LDF [A4 + 3 * SIZE], a12 + + FADD y1, t1, y1 + prefetch [A2 + PREFETCHSIZE * SIZE], 1 + FMUL a13, x2, t1 + LDF [A4 + 4 * SIZE], a13 + + FADD y2, t2, y2 + nop + FMUL a14, x2, t2 + LDF [A4 + 5 * SIZE], a14 + + FADD y3, t3, y3 + nop + FMUL a15, x2, t3 + LDF [A4 + 6 * SIZE], a15 + + FADD y4, t4, y4 + nop + FMUL a16, x2, t4 + LDF [A4 + 7 * SIZE], a16 + + FADD y5, t1, y5 + nop + FMUL a1, x3, t1 + LDF [A1 + 8 * SIZE], a1 + + FADD y6, t2, y6 + nop + FMUL a2, x3, t2 + LDF [A1 + 9 * SIZE], a2 + + FADD y7, t3, y7 + nop + FMUL a3, x3, t3 + LDF [A1 + 10 * SIZE], a3 + + FADD y8, t4, y8 + nop + FMUL a4, x3, t4 + LDF [A1 + 11 * SIZE], a4 + + FADD y1, t1, y1 + prefetch [A3 + PREFETCHSIZE * SIZE], 1 + FMUL a5, x3, t1 + LDF [A1 + 12 * SIZE], a5 + + FADD y2, t2, y2 + nop + FMUL a6, x3, t2 + LDF [A1 + 13 * SIZE], a6 + + FADD y3, t3, y3 + nop + FMUL a7, x3, t3 + LDF [A1 + 14 * SIZE], a7 + + FADD y4, t4, y4 + nop + FMUL a8, x3, t4 + LDF [A1 + 15 * SIZE], a8 + + FADD y5, t1, y5 + nop + FMUL a9, x4, t1 + LDF [A2 + 8 * SIZE], a9 + + FADD y6, t2, y6 + nop + FMUL a10, x4, t2 + LDF [A2 + 9 * SIZE], a10 + + FADD y7, t3, y7 + nop + FMUL a11, x4, t3 + LDF [A2 + 10 * SIZE], a11 + + FADD y8, t4, y8 + nop + FMUL a12, x4, t4 + LDF [A2 + 11 * SIZE], a12 + + FADD y1, t1, y1 + prefetch [A4 + PREFETCHSIZE * SIZE], 1 + FMUL a13, x4, t1 + LDF [A2 + 12 * SIZE], a13 + + FADD y2, t2, y2 + add A3, 8 * SIZE, A3 + FMUL a14, x4, t2 + LDF [A2 + 13 * SIZE], a14 + + FADD y3, t3, y3 + add Y1, 8 * SIZE, Y1 + FMUL a15, x4, t3 + LDF [A2 + 14 * SIZE], a15 + + FADD y4, t4, y4 + deccc I + FMUL a16, x4, t4 + LDF [A2 + 15 * SIZE], a16 + + FADD y5, t1, y5 + add A1, 8 * SIZE, A1 + FMUL a1, x1, t1 + LDF [A3 + 0 * SIZE], a1 + + FADD y6, t2, y6 + add A2, 8 * SIZE, A2 + FMUL a2, x1, t2 + LDF [A3 + 1 * SIZE], a2 + + FADD y7, t3, y7 + add A4, 8 * SIZE, A4 + FMUL a3, x1, t3 + LDF [A3 + 2 * SIZE], a3 + + FADD y8, t4, y8 + nop + FMUL a4, x1, t4 + LDF [A3 + 3 * SIZE], a4 + + STF y1, [Y1 - 8 * SIZE] + STF y2, [Y1 - 7 * SIZE] + STF y3, [Y1 - 6 * SIZE] + STF y4, [Y1 - 5 * SIZE] + + STF y5, [Y1 - 4 * SIZE] + STF y6, [Y1 - 3 * SIZE] + STF y7, [Y1 - 2 * SIZE] + + bg,pn %icc, .LL12 + STF y8, [Y1 - 1 * SIZE] + +.LL13: + LDF [Y1 + 0 * SIZE], y1 + LDF [Y1 + 1 * SIZE], y2 + LDF [Y1 + 2 * SIZE], y3 + LDF [Y1 + 3 * SIZE], y4 + LDF [Y1 + 4 * SIZE], y5 + LDF [Y1 + 5 * SIZE], y6 + LDF [Y1 + 6 * SIZE], y7 + LDF [Y1 + 7 * SIZE], y8 + + FADD y1, t1, y1 + FMUL a5, x1, t1 + LDF [A3 + 0 * SIZE], a1 + FADD y2, t2, y2 + FMUL a6, x1, t2 + LDF [A3 + 1 * SIZE], a2 + + FADD y3, t3, y3 + FMUL a7, x1, t3 + LDF [A3 + 2 * SIZE], a3 + FADD y4, t4, y4 + FMUL a8, x1, t4 + LDF [A3 + 3 * SIZE], a4 + + FADD y5, t1, y5 + FMUL a9, x2, t1 + LDF [A3 + 4 * SIZE], a5 + FADD y6, t2, y6 + FMUL a10, x2, t2 + LDF [A3 + 5 * SIZE], a6 + + FADD y7, t3, y7 + FMUL a11, x2, t3 + LDF [A3 + 6 * SIZE], a7 + FADD y8, t4, y8 + FMUL a12, x2, t4 + LDF [A3 + 7 * SIZE], a8 + + FADD y1, t1, y1 + FMUL a13, x2, t1 + LDF [A4 + 0 * SIZE], a9 + FADD y2, t2, y2 + FMUL a14, x2, t2 + LDF [A4 + 1 * SIZE], a10 + + FADD y3, t3, y3 + FMUL a15, x2, t3 + LDF [A4 + 2 * SIZE], a11 + FADD y4, t4, y4 + FMUL a16, x2, t4 + LDF [A4 + 3 * SIZE], a12 + + FADD y5, t1, y5 + FMUL a1, x3, t1 + LDF [A4 + 4 * SIZE], a13 + FADD y6, t2, y6 + FMUL a2, x3, t2 + LDF [A4 + 5 * SIZE], a14 + + FADD y7, t3, y7 + FMUL a3, x3, t3 + LDF [A4 + 6 * SIZE], a15 + FADD y8, t4, y8 + FMUL a4, x3, t4 + LDF [A4 + 7 * SIZE], a16 + + FADD y1, t1, y1 + FMUL a5, x3, t1 + FADD y2, t2, y2 + FMUL a6, x3, t2 + + FADD y3, t3, y3 + FMUL a7, x3, t3 + FADD y4, t4, y4 + FMUL a8, x3, t4 + + FADD y5, t1, y5 + FMUL a9, x4, t1 + FADD y6, t2, y6 + FMUL a10, x4, t2 + + FADD y7, t3, y7 + FMUL a11, x4, t3 + FADD y8, t4, y8 + FMUL a12, x4, t4 + + FADD y1, t1, y1 + FMUL a13, x4, t1 + FADD y2, t2, y2 + FMUL a14, x4, t2 + + FADD y3, t3, y3 + FMUL a15, x4, t3 + FADD y4, t4, y4 + FMUL a16, x4, t4 + add A4, 8 * SIZE, A4 + + STF y1, [Y1 + 0 * SIZE] + FADD y5, t1, y5 + STF y2, [Y1 + 1 * SIZE] + FADD y6, t2, y6 + STF y3, [Y1 + 2 * SIZE] + FADD y7, t3, y7 + STF y4, [Y1 + 3 * SIZE] + FADD y8, t4, y8 + + STF y5, [Y1 + 4 * SIZE] + add A1, 8 * SIZE, A1 + STF y6, [Y1 + 5 * SIZE] + add A2, 8 * SIZE, A2 + STF y7, [Y1 + 6 * SIZE] + add A3, 8 * SIZE, A3 + STF y8, [Y1 + 7 * SIZE] + add Y1, 8 * SIZE, Y1 + +.LL16: + andcc M, 4, I + ble,pn %icc, .LL17 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [A1 + 2 * SIZE], a3 + LDF [A1 + 3 * SIZE], a4 + + LDF [A2 + 0 * SIZE], a5 + LDF [A2 + 1 * SIZE], a6 + LDF [A2 + 2 * SIZE], a7 + LDF [A2 + 3 * SIZE], a8 + + LDF [A3 + 0 * SIZE], a9 + LDF [A3 + 1 * SIZE], a10 + LDF [A3 + 2 * SIZE], a11 + LDF [A3 + 3 * SIZE], a12 + + LDF [A4 + 0 * SIZE], a13 + LDF [A4 + 1 * SIZE], a14 + LDF [A4 + 2 * SIZE], a15 + LDF [A4 + 3 * SIZE], a16 + + LDF [Y1 + 0 * SIZE], y1 + add A1, 4 * SIZE, A1 + LDF [Y1 + 1 * SIZE], y2 + add A2, 4 * SIZE, A2 + LDF [Y1 + 2 * SIZE], y3 + add A3, 4 * SIZE, A3 + LDF [Y1 + 3 * SIZE], y4 + add A4, 4 * SIZE, A4 + + FMUL a1, x1, t1 + FMUL a2, x1, t2 + FMUL a3, x1, t3 + FMUL a4, x1, t4 + + FADD y1, t1, y1 + FMUL a5, x2, t1 + FADD y2, t2, y2 + FMUL a6, x2, t2 + FADD y3, t3, y3 + FMUL a7, x2, t3 + FADD y4, t4, y4 + FMUL a8, x2, t4 + + FADD y1, t1, y1 + FMUL a9, x3, t1 + FADD y2, t2, y2 + FMUL a10, x3, t2 + + FADD y3, t3, y3 + FMUL a11, x3, t3 + FADD y4, t4, y4 + FMUL a12, x3, t4 + + FADD y1, t1, y1 + FMUL a13, x4, t1 + FADD y2, t2, y2 + FMUL a14, x4, t2 + + FADD y3, t3, y3 + FMUL a15, x4, t3 + FADD y4, t4, y4 + FMUL a16, x4, t4 + + FADD y1, t1, y1 + FADD y2, t2, y2 + FADD y3, t3, y3 + FADD y4, t4, y4 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + STF y3, [Y1 + 2 * SIZE] + STF y4, [Y1 + 3 * SIZE] + + add Y1, 4 * SIZE, Y1 + +.LL17: + andcc M, 2, I + ble,pn %icc, .LL18 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [A2 + 0 * SIZE], a2 + LDF [A3 + 0 * SIZE], a3 + LDF [A4 + 0 * SIZE], a4 + LDF [Y1 + 0 * SIZE], y1 + + LDF [A1 + 1 * SIZE], a5 + LDF [A2 + 1 * SIZE], a6 + LDF [A3 + 1 * SIZE], a7 + LDF [A4 + 1 * SIZE], a8 + LDF [Y1 + 1 * SIZE], y2 + + add A1, 2 * SIZE, A1 + add A2, 2 * SIZE, A2 + add A3, 2 * SIZE, A3 + add A4, 2 * SIZE, A4 + + FMUL a1, x1, t1 + FMUL a2, x2, t2 + FMUL a3, x3, t3 + FMUL a4, x4, t4 + + FADD y1, t1, y1 + FMUL a5, x1, t1 + FADD y1, t2, y1 + FMUL a6, x2, t2 + FADD y1, t3, y1 + FMUL a7, x3, t3 + FADD y1, t4, y1 + FMUL a8, x4, t4 + + FADD y2, t1, y2 + FADD y2, t2, y2 + FADD y2, t3, y2 + FADD y2, t4, y2 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + add Y1, 2 * SIZE, Y1 + +.LL18: + andcc M, 1, I + ble,pn %icc, .LL19 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [A2 + 0 * SIZE], a2 + LDF [A3 + 0 * SIZE], a3 + LDF [A4 + 0 * SIZE], a4 + LDF [Y1 + 0 * SIZE], y1 + + FMUL a1, x1, t1 + FMUL a2, x2, t2 + FMUL a3, x3, t3 + FMUL a4, x4, t4 + + FADD y1, t1, y1 + FADD y1, t2, y1 + FADD y1, t3, y1 + FADD y1, t4, y1 + + STF y1, [Y1] + +.LL19: + deccc J + bg %icc, .LL11 + nop + +.LL20: + andcc N, 2, J + ble,pn %icc, .LL30 + nop + +.LL21: + mov YY, Y1 + + mov A, A1 + add A, LDA, A2 + add A2, LDA, A + + LDF STACK_ALPHA, ALPHA + + LDF [X], x1 + add X, INCX, X + LDF [X], x2 + add X, INCX, X + + FMUL ALPHA, x1, x1 + FMUL ALPHA, x2, x2 + + sra M, 3, I + cmp I, 0 + ble,pn %icc, .LL26 + nop + + LDF [Y1 + 0 * SIZE], y1 + LDF [Y1 + 1 * SIZE], y2 + LDF [Y1 + 2 * SIZE], y3 + LDF [Y1 + 3 * SIZE], y4 + LDF [Y1 + 4 * SIZE], y5 + LDF [Y1 + 5 * SIZE], y6 + LDF [Y1 + 6 * SIZE], y7 + LDF [Y1 + 7 * SIZE], y8 + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [A1 + 2 * SIZE], a3 + LDF [A1 + 3 * SIZE], a4 + LDF [A1 + 4 * SIZE], a5 + LDF [A1 + 5 * SIZE], a6 + LDF [A1 + 6 * SIZE], a7 + LDF [A1 + 7 * SIZE], a8 + + LDF [A2 + 0 * SIZE], a9 + LDF [A2 + 1 * SIZE], a10 + LDF [A2 + 2 * SIZE], a11 + LDF [A2 + 3 * SIZE], a12 + LDF [A2 + 4 * SIZE], a13 + LDF [A2 + 5 * SIZE], a14 + LDF [A2 + 6 * SIZE], a15 + LDF [A2 + 7 * SIZE], a16 + + FMUL a1, x1, t1 + deccc I + LDF [A1 + 8 * SIZE], a1 + FMUL a2, x1, t2 + LDF [A1 + 9 * SIZE], a2 + FMUL a3, x1, t3 + LDF [A1 + 10 * SIZE], a3 + FMUL a4, x1, t4 + ble,pn %icc, .LL23 + LDF [A1 + 11 * SIZE], a4 + +.LL22: + FADD y1, t1, y1 + prefetch [A1 + PREFETCHSIZE * SIZE], 1 + FMUL a5, x1, t1 + LDF [A1 + 12 * SIZE], a5 + FADD y2, t2, y2 + FMUL a6, x1, t2 + LDF [A1 + 13 * SIZE], a6 + + FADD y3, t3, y3 + FMUL a7, x1, t3 + LDF [A1 + 14 * SIZE], a7 + FADD y4, t4, y4 + FMUL a8, x1, t4 + LDF [A1 + 15 * SIZE], a8 + + FADD y5, t1, y5 + FMUL a9, x2, t1 + LDF [A2 + 8 * SIZE], a9 + FADD y6, t2, y6 + FMUL a10, x2, t2 + LDF [A2 + 9 * SIZE], a10 + + FADD y7, t3, y7 + FMUL a11, x2, t3 + LDF [A2 + 10 * SIZE], a11 + FADD y8, t4, y8 + FMUL a12, x2, t4 + LDF [A2 + 11 * SIZE], a12 + + FADD y1, t1, y1 + prefetch [A2 + PREFETCHSIZE * SIZE], 1 + FMUL a13, x2, t1 + LDF [A2 + 12 * SIZE], a13 + FADD y2, t2, y2 + FMUL a14, x2, t2 + LDF [A2 + 13 * SIZE], a14 + + FADD y3, t3, y3 + FMUL a15, x2, t3 + LDF [A2 + 14 * SIZE], a15 + FADD y4, t4, y4 + FMUL a16, x2, t4 + LDF [A2 + 15 * SIZE], a16 + + FADD y5, t1, y5 + FMUL a1, x1, t1 + LDF [A1 + 16 * SIZE], a1 + FADD y6, t2, y6 + FMUL a2, x1, t2 + LDF [A1 + 17 * SIZE], a2 + + FADD y7, t3, y7 + FMUL a3, x1, t3 + LDF [A1 + 18 * SIZE], a3 + FADD y8, t4, y8 + FMUL a4, x1, t4 + LDF [A1 + 19 * SIZE], a4 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + STF y3, [Y1 + 2 * SIZE] + STF y4, [Y1 + 3 * SIZE] + STF y5, [Y1 + 4 * SIZE] + STF y6, [Y1 + 5 * SIZE] + STF y7, [Y1 + 6 * SIZE] + STF y8, [Y1 + 7 * SIZE] + + LDF [Y1 + 8 * SIZE], y1 + add A1, 8 * SIZE, A1 + LDF [Y1 + 9 * SIZE], y2 + add A2, 8 * SIZE, A2 + LDF [Y1 + 10 * SIZE], y3 + deccc I + LDF [Y1 + 11 * SIZE], y4 + LDF [Y1 + 12 * SIZE], y5 + LDF [Y1 + 13 * SIZE], y6 + LDF [Y1 + 14 * SIZE], y7 + LDF [Y1 + 15 * SIZE], y8 + bg,pn %icc, .LL22 + add Y1, 8 * SIZE, Y1 + +.LL23: + FADD y1, t1, y1 + FMUL a5, x1, t1 + FADD y2, t2, y2 + FMUL a6, x1, t2 + + FADD y3, t3, y3 + FMUL a7, x1, t3 + FADD y4, t4, y4 + FMUL a8, x1, t4 + + FADD y5, t1, y5 + FMUL a9, x2, t1 + FADD y6, t2, y6 + FMUL a10, x2, t2 + + FADD y7, t3, y7 + FMUL a11, x2, t3 + FADD y8, t4, y8 + FMUL a12, x2, t4 + + FADD y1, t1, y1 + FMUL a13, x2, t1 + FADD y2, t2, y2 + FMUL a14, x2, t2 + + FADD y3, t3, y3 + FMUL a15, x2, t3 + FADD y4, t4, y4 + FMUL a16, x2, t4 + + STF y1, [Y1 + 0 * SIZE] + FADD y5, t1, y5 + STF y2, [Y1 + 1 * SIZE] + FADD y6, t2, y6 + STF y3, [Y1 + 2 * SIZE] + FADD y7, t3, y7 + STF y4, [Y1 + 3 * SIZE] + FADD y8, t4, y8 + + STF y5, [Y1 + 4 * SIZE] + add A1, 8 * SIZE, A1 + STF y6, [Y1 + 5 * SIZE] + add A2, 8 * SIZE, A2 + STF y7, [Y1 + 6 * SIZE] + nop + STF y8, [Y1 + 7 * SIZE] + add Y1, 8 * SIZE, Y1 + +.LL26: + andcc M, 4, I + ble,pn %icc, .LL27 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [A1 + 2 * SIZE], a3 + LDF [A1 + 3 * SIZE], a4 + + LDF [A2 + 0 * SIZE], a5 + LDF [A2 + 1 * SIZE], a6 + LDF [A2 + 2 * SIZE], a7 + LDF [A2 + 3 * SIZE], a8 + + LDF [Y1 + 0 * SIZE], y1 + add A1, 4 * SIZE, A1 + LDF [Y1 + 1 * SIZE], y2 + add A2, 4 * SIZE, A2 + LDF [Y1 + 2 * SIZE], y3 + LDF [Y1 + 3 * SIZE], y4 + + FMUL a1, x1, t1 + FMUL a2, x1, t2 + FMUL a3, x1, t3 + FMUL a4, x1, t4 + + FADD y1, t1, y1 + FMUL a5, x2, t1 + FADD y2, t2, y2 + FMUL a6, x2, t2 + FADD y3, t3, y3 + FMUL a7, x2, t3 + FADD y4, t4, y4 + FMUL a8, x2, t4 + + FADD y1, t1, y1 + FADD y2, t2, y2 + FADD y3, t3, y3 + FADD y4, t4, y4 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + STF y3, [Y1 + 2 * SIZE] + STF y4, [Y1 + 3 * SIZE] + + add Y1, 4 * SIZE, Y1 + +.LL27: + andcc M, 2, I + ble,pn %icc, .LL28 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [A2 + 0 * SIZE], a2 + LDF [Y1 + 0 * SIZE], y1 + LDF [A1 + 1 * SIZE], a5 + LDF [A2 + 1 * SIZE], a6 + add A1, 2 * SIZE, A1 + LDF [Y1 + 1 * SIZE], y2 + add A2, 2 * SIZE, A2 + + FMUL a1, x1, t1 + FMUL a2, x2, t2 + + FADD y1, t1, y1 + FMUL a5, x1, t1 + FADD y1, t2, y1 + FMUL a6, x2, t2 + + FADD y2, t1, y2 + FADD y2, t2, y2 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + add Y1, 2 * SIZE, Y1 + +.LL28: + andcc M, 1, I + ble,pn %icc, .LL30 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [A2 + 0 * SIZE], a2 + LDF [Y1 + 0 * SIZE], y1 + + FMUL a1, x1, t1 + FMUL a2, x2, t2 + + FADD y1, t1, y1 + FADD y1, t2, y1 + + STF y1, [Y1] + +.LL30: + andcc N, 1, J + ble,pn %icc, .LL990 + nop + +.LL31: + mov YY, Y1 + mov A, A1 + + LDF STACK_ALPHA, ALPHA + + LDF [X], x1 + add X, INCX, X + + FMUL ALPHA, x1, x1 + + sra M, 3, I + cmp I, 0 + ble,pn %icc, .LL36 + nop + + LDF [Y1 + 0 * SIZE], y1 + LDF [Y1 + 1 * SIZE], y2 + LDF [Y1 + 2 * SIZE], y3 + LDF [Y1 + 3 * SIZE], y4 + LDF [Y1 + 4 * SIZE], y5 + LDF [Y1 + 5 * SIZE], y6 + LDF [Y1 + 6 * SIZE], y7 + LDF [Y1 + 7 * SIZE], y8 + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [A1 + 2 * SIZE], a3 + LDF [A1 + 3 * SIZE], a4 + LDF [A1 + 4 * SIZE], a5 + LDF [A1 + 5 * SIZE], a6 + LDF [A1 + 6 * SIZE], a7 + LDF [A1 + 7 * SIZE], a8 + + FMUL a1, x1, t1 + deccc I + LDF [A1 + 8 * SIZE], a1 + FMUL a2, x1, t2 + LDF [A1 + 9 * SIZE], a2 + FMUL a3, x1, t3 + LDF [A1 + 10 * SIZE], a3 + FMUL a4, x1, t4 + ble,pn %icc, .LL33 + LDF [A1 + 11 * SIZE], a4 + +.LL32: + FADD y1, t1, y1 + prefetch [A1 + PREFETCHSIZE * SIZE], 1 + FMUL a5, x1, t1 + LDF [A1 + 12 * SIZE], a5 + FADD y2, t2, y2 + FMUL a6, x1, t2 + LDF [A1 + 13 * SIZE], a6 + + FADD y3, t3, y3 + FMUL a7, x1, t3 + LDF [A1 + 14 * SIZE], a7 + FADD y4, t4, y4 + FMUL a8, x1, t4 + LDF [A1 + 15 * SIZE], a8 + + FADD y5, t1, y5 + FMUL a1, x1, t1 + LDF [A1 + 16 * SIZE], a1 + FADD y6, t2, y6 + FMUL a2, x1, t2 + LDF [A1 + 17 * SIZE], a2 + + FADD y7, t3, y7 + FMUL a3, x1, t3 + LDF [A1 + 18 * SIZE], a3 + FADD y8, t4, y8 + FMUL a4, x1, t4 + LDF [A1 + 19 * SIZE], a4 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + STF y3, [Y1 + 2 * SIZE] + STF y4, [Y1 + 3 * SIZE] + STF y5, [Y1 + 4 * SIZE] + STF y6, [Y1 + 5 * SIZE] + STF y7, [Y1 + 6 * SIZE] + STF y8, [Y1 + 7 * SIZE] + + LDF [Y1 + 8 * SIZE], y1 + LDF [Y1 + 9 * SIZE], y2 + LDF [Y1 + 10 * SIZE], y3 + LDF [Y1 + 11 * SIZE], y4 + LDF [Y1 + 12 * SIZE], y5 + deccc I + LDF [Y1 + 13 * SIZE], y6 + add A1, 8 * SIZE, A1 + LDF [Y1 + 14 * SIZE], y7 + add Y1, 8 * SIZE, Y1 + bg,pn %icc, .LL32 + LDF [Y1 + 7 * SIZE], y8 + +.LL33: + FADD y1, t1, y1 + FMUL a5, x1, t1 + FADD y2, t2, y2 + FMUL a6, x1, t2 + + FADD y3, t3, y3 + FMUL a7, x1, t3 + FADD y4, t4, y4 + FMUL a8, x1, t4 + + STF y1, [Y1 + 0 * SIZE] + FADD y5, t1, y5 + STF y2, [Y1 + 1 * SIZE] + FADD y6, t2, y6 + STF y3, [Y1 + 2 * SIZE] + FADD y7, t3, y7 + STF y4, [Y1 + 3 * SIZE] + FADD y8, t4, y8 + + STF y5, [Y1 + 4 * SIZE] + STF y6, [Y1 + 5 * SIZE] + STF y7, [Y1 + 6 * SIZE] + add A1, 8 * SIZE, A1 + STF y8, [Y1 + 7 * SIZE] + add Y1, 8 * SIZE, Y1 + +.LL36: + andcc M, 4, I + ble,pn %icc, .LL37 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [A1 + 2 * SIZE], a3 + LDF [A1 + 3 * SIZE], a4 + + LDF [Y1 + 0 * SIZE], y1 + add A1, 4 * SIZE, A1 + LDF [Y1 + 1 * SIZE], y2 + LDF [Y1 + 2 * SIZE], y3 + LDF [Y1 + 3 * SIZE], y4 + + FMUL a1, x1, t1 + FMUL a2, x1, t2 + FMUL a3, x1, t3 + FMUL a4, x1, t4 + + FADD y1, t1, y1 + FADD y2, t2, y2 + FADD y3, t3, y3 + FADD y4, t4, y4 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + STF y3, [Y1 + 2 * SIZE] + STF y4, [Y1 + 3 * SIZE] + + add Y1, 4 * SIZE, Y1 + +.LL37: + andcc M, 2, I + ble,pn %icc, .LL38 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [Y1 + 0 * SIZE], y1 + + LDF [A1 + 1 * SIZE], a5 + LDF [Y1 + 1 * SIZE], y2 + add A1, 2 * SIZE, A1 + + FMUL a1, x1, t1 + FADD y1, t1, y1 + FMUL a5, x1, t1 + FADD y2, t1, y2 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + add Y1, 2 * SIZE, Y1 + +.LL38: + andcc M, 1, I + ble,pn %icc, .LL990 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [Y1 + 0 * SIZE], y1 + + FMUL a1, x1, t1 + FADD y1, t1, y1 + + STF y1, [Y1] + +.LL990: + cmp INCY, SIZE + be %icc, .LL999 + mov Y, Y1 + + sra M, 3, I + cmp I, 0 + ble,pn %icc, .LL995 + nop + +.LL991: + LDF [BUFFER + 0 * SIZE], a1 + LDF [Y], y1 + add Y, INCY, Y + + LDF [BUFFER + 1 * SIZE], a2 + LDF [Y], y2 + add Y, INCY, Y + + LDF [BUFFER + 2 * SIZE], a3 + LDF [Y], y3 + add Y, INCY, Y + + LDF [BUFFER + 3 * SIZE], a4 + LDF [Y], y4 + add Y, INCY, Y + + LDF [BUFFER + 4 * SIZE], a5 + FADD y1, a1, y1 + LDF [Y], y5 + add Y, INCY, Y + + LDF [BUFFER + 5 * SIZE], a6 + FADD y2, a2, y2 + LDF [Y], y6 + add Y, INCY, Y + + LDF [BUFFER + 6 * SIZE], a7 + FADD y3, a3, y3 + LDF [Y], y7 + add Y, INCY, Y + + LDF [BUFFER + 7 * SIZE], a8 + FADD y4, a4, y4 + LDF [Y], y8 + add Y, INCY, Y + + STF y1, [Y1] + FADD y5, a5, y5 + add Y1, INCY, Y1 + STF y2, [Y1] + FADD y6, a6, y6 + add Y1, INCY, Y1 + STF y3, [Y1] + FADD y7, a7, y7 + add Y1, INCY, Y1 + STF y4, [Y1] + FADD y8, a8, y8 + add Y1, INCY, Y1 + STF y5, [Y1] + add Y1, INCY, Y1 + STF y6, [Y1] + add Y1, INCY, Y1 + STF y7, [Y1] + add Y1, INCY, Y1 + STF y8, [Y1] + add Y1, INCY, Y1 + + deccc I + bg,pn %icc, .LL991 + add BUFFER, 8 * SIZE, BUFFER + +.LL995: + andcc M, 7, I + ble,pn %icc, .LL999 + nop + + andcc M, 4, I + ble,pn %icc, .LL996 + nop + + LDF [BUFFER + 0 * SIZE], a1 + LDF [BUFFER + 1 * SIZE], a2 + LDF [BUFFER + 2 * SIZE], a3 + LDF [BUFFER + 3 * SIZE], a4 + add BUFFER, 4 * SIZE, BUFFER + + LDF [Y], y1 + add Y, INCY, Y + LDF [Y], y2 + add Y, INCY, Y + LDF [Y], y3 + add Y, INCY, Y + LDF [Y], y4 + add Y, INCY, Y + + FADD y1, a1, y1 + FADD y2, a2, y2 + FADD y3, a3, y3 + FADD y4, a4, y4 + + STF y1, [Y1] + add Y1, INCY, Y1 + STF y2, [Y1] + add Y1, INCY, Y1 + STF y3, [Y1] + add Y1, INCY, Y1 + STF y4, [Y1] + add Y1, INCY, Y1 + +.LL996: + andcc M, 2, I + ble,pn %icc, .LL997 + nop + + LDF [BUFFER + 0 * SIZE], a1 + LDF [BUFFER + 1 * SIZE], a2 + add BUFFER, 2 * SIZE, BUFFER + + LDF [Y], y1 + add Y, INCY, Y + LDF [Y], y2 + add Y, INCY, Y + + FADD y1, a1, y1 + FADD y2, a2, y2 + + STF y1, [Y1] + add Y1, INCY, Y1 + STF y2, [Y1] + add Y1, INCY, Y1 + +.LL997: + andcc M, 1, I + ble,pn %icc, .LL999 + nop + + LDF [BUFFER + 0 * SIZE], a1 + + LDF [Y], y1 + + FADD y1, a1, y1 + + STF y1, [Y1] + + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/gemv_t.S b/kernel/sparc/gemv_t.S new file mode 100644 index 0000000..fad006a --- /dev/null +++ b/kernel/sparc/gemv_t.S @@ -0,0 +1,705 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define P 1020 + +#define M %i0 +#define N %i1 + +#if defined(DOUBLE) && !defined(__64BIT__) +#define A %i5 +#define LDA %i2 +#define X %i3 +#define INCX %i4 +#else +#define A %i4 +#define LDA %i5 +#define X %i2 +#define INCX %i3 +#endif + +#define Y %l0 +#define INCY %l1 +#define BUFFER %l2 + +#define I %l3 +#define IS %l4 +#define J %l5 +#define MIN_M %l6 +#define XP %l7 + +#define A1 %o0 +#define A2 %o1 +#define A3 %o2 +#define A4 %o3 +#define X1 %o4 +#define Y1 %o5 +#define PNLDA %g1 +#define Y2 %o7 /* Danger? */ + +#ifdef DOUBLE +#define t1 %f0 +#define t2 %f2 +#define t3 %f4 +#define t4 %f6 + +#define c1 %f8 +#define c2 %f10 +#define c3 %f12 +#define c4 %f14 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 + +#define a9 %f32 +#define a10 %f34 +#define a11 %f36 +#define a12 %f38 +#define a13 %f40 +#define a14 %f42 +#define a15 %f44 +#define a16 %f46 + +#define b1 %f48 +#define b2 %f50 +#define b3 %f52 +#define b4 %f54 +#define b5 %f56 +#define b6 %f58 +#define b7 %f60 +#define b8 %f62 + +#define FZERO %f60 +#define ALPHA %f62 + +#else +#define t1 %f0 +#define t2 %f1 +#define t3 %f2 +#define t4 %f3 + +#define c1 %f4 +#define c2 %f5 +#define c3 %f6 +#define c4 %f7 + +#define a1 %f8 +#define a2 %f9 +#define a3 %f10 +#define a4 %f11 +#define a5 %f12 +#define a6 %f13 +#define a7 %f14 +#define a8 %f15 + +#define a9 %f16 +#define a10 %f17 +#define a11 %f18 +#define a12 %f19 +#define a13 %f20 +#define a14 %f21 +#define a15 %f22 +#define a16 %f23 + +#define b1 %f24 +#define b2 %f25 +#define b3 %f26 +#define b4 %f27 +#define b5 %f28 +#define b6 %f29 +#define b7 %f30 +#define b8 %f31 + +#define FZERO %f30 +#define ALPHA %f31 +#endif + +#ifndef __64BIT__ +#define STACK_FZERO [%sp + STACK_START + 8] +#define STACK_ALPHA [%sp + STACK_START + 16] +#else +#define STACK_FZERO [%sp + STACK_START + 32] +#define STACK_ALPHA [%sp + STACK_START + 40] +#endif + +#ifdef DOUBLE +#define PREFETCHSIZE 36 +#else +#define PREFETCHSIZE 72 +#endif + + PROLOGUE + SAVESP + nop + +#ifndef __64BIT__ + +#ifdef DOUBLE + st %i3, [%sp + STACK_START + 16] /* ALPHA */ + st %i4, [%sp + STACK_START + 20] + + ld [%sp + STACK_START + 28], LDA + ld [%sp + STACK_START + 32], X + ld [%sp + STACK_START + 36], INCX + ld [%sp + STACK_START + 40], Y + ld [%sp + STACK_START + 44], INCY + ld [%sp + STACK_START + 48], BUFFER +#else + st %i3, [%sp + STACK_START + 16] /* ALPHA */ + + ld [%sp + STACK_START + 28], X + ld [%sp + STACK_START + 32], INCX + ld [%sp + STACK_START + 36], Y + ld [%sp + STACK_START + 40], INCY + ld [%sp + STACK_START + 44], BUFFER +#endif + LDF [%sp + STACK_START + 16], ALPHA +#else + ldx [%sp+ STACK_START + 56], X + ldx [%sp+ STACK_START + 64], INCX + ldx [%sp+ STACK_START + 72], Y + ldx [%sp+ STACK_START + 80], INCY + ldx [%sp+ STACK_START + 88], BUFFER +#ifdef DOUBLE + FMOV %f6, ALPHA + STF %f6, STACK_ALPHA +#else + FMOV %f7, ALPHA + STF %f7, STACK_ALPHA +#endif +#endif + +#ifdef DOUBLE + FCLR(29) +#else + FCLR(30) +#endif + + clr IS + mov P, I + sll LDA, BASE_SHIFT, LDA + sll I, BASE_SHIFT, I + smul LDA, N, PNLDA + sll INCX, BASE_SHIFT, INCX + sll INCY, BASE_SHIFT, INCY + sub I, PNLDA, PNLDA + +.LL10: + sll IS, BASE_SHIFT, I + sub M, IS, MIN_M + cmp MIN_M, P + nop + movg %icc, P, MIN_M + nop + cmp INCX, SIZE + beq .LL100 + add X, I, XP + + sra MIN_M, 2, I + mov BUFFER, XP + cmp I, 0 + ble,pn %icc, .LL15 + mov BUFFER, Y1 + +.LL11: + LDF [X], a1 + add X, INCX, X + LDF [X], a2 + add X, INCX, X + LDF [X], a3 + add X, INCX, X + LDF [X], a4 + add X, INCX, X + + STF a1, [Y1 + 0 * SIZE] + add I, -1, I + STF a2, [Y1 + 1 * SIZE] + cmp I, 0 + STF a3, [Y1 + 2 * SIZE] + STF a4, [Y1 + 3 * SIZE] + bg,pn %icc, .LL11 + add Y1, 4 * SIZE, Y1 + +.LL15: + and MIN_M, 3, I + cmp I, 0 + ble,pn %icc, .LL100 + nop + +.LL16: + LDF [X], a1 + add X, INCX, X + add I, -1, I + cmp I, 0 + nop + STF a1, [Y1] + bg,pn %icc, .LL16 + add Y1, 1 * SIZE, Y1 + +.LL100: + sra N, 1, J + cmp J, 0 + ble %icc, .LL200 + mov Y, Y1 + +.LL110: +#ifdef DOUBLE + FCLR(29) +#else + FCLR(30) +#endif + + FMOV FZERO, c1 + FMOV FZERO, c2 + FMOV FZERO, c3 + FMOV FZERO, c4 + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + + mov A, A1 + add A, LDA, A2 + add A2, LDA, A + + mov XP, X1 + + sra MIN_M, 3, I + cmp I, 0 + ble %icc, .LL115 + prefetch [Y1 + 2 * SIZE], 0 + + LDF [A1 + 0 * SIZE], a1 + deccc I + LDF [A1 + 1 * SIZE], a2 + LDF [A1 + 2 * SIZE], a3 + LDF [A1 + 3 * SIZE], a4 + LDF [A1 + 4 * SIZE], a5 + LDF [A1 + 5 * SIZE], a6 + LDF [A1 + 6 * SIZE], a7 + LDF [A1 + 7 * SIZE], a8 + + LDF [A2 + 0 * SIZE], a9 + LDF [A2 + 1 * SIZE], a10 + LDF [A2 + 2 * SIZE], a11 + LDF [A2 + 3 * SIZE], a12 + LDF [A2 + 4 * SIZE], a13 + LDF [A2 + 5 * SIZE], a14 + LDF [A2 + 6 * SIZE], a15 + LDF [A2 + 7 * SIZE], a16 + + LDF [X1 + 0 * SIZE], b1 + LDF [X1 + 1 * SIZE], b2 + LDF [X1 + 2 * SIZE], b3 + LDF [X1 + 3 * SIZE], b4 + LDF [X1 + 4 * SIZE], b5 + LDF [X1 + 5 * SIZE], b6 + + ble %icc, .LL112 + LDF [X1 + 6 * SIZE], b7 + +.LL111: + FADD c1, t1, c1 + prefetch [A1 + PREFETCHSIZE * SIZE], 1 + FMUL a1, b1, t1 + LDF [A1 + 8 * SIZE], a1 + + FADD c2, t2, c2 + LDF [X1 + 7 * SIZE], b8 + FMUL a9, b1, t2 + LDF [A2 + 8 * SIZE], a9 + + FADD c3, t3, c3 + LDF [X1 + 8 * SIZE], b1 + FMUL a2, b2, t3 + LDF [A1 + 9 * SIZE], a2 + + FADD c4, t4, c4 + deccc I + FMUL a10, b2, t4 + LDF [A2 + 9 * SIZE], a10 + + FADD c1, t1, c1 + LDF [X1 + 9 * SIZE], b2 + FMUL a3, b3, t1 + LDF [A1 + 10 * SIZE], a3 + + FADD c2, t2, c2 + nop + FMUL a11, b3, t2 + LDF [A2 + 10 * SIZE], a11 + + FADD c3, t3, c3 + LDF [X1 + 10 * SIZE], b3 + FMUL a4, b4, t3 + LDF [A1 + 11 * SIZE], a4 + + FADD c4, t4, c4 + nop + FMUL a12, b4, t4 + LDF [A2 + 11 * SIZE], a12 + + FADD c1, t1, c1 + LDF [X1 + 11 * SIZE], b4 + FMUL a5, b5, t1 + LDF [A1 + 12 * SIZE], a5 + + FADD c2, t2, c2 + prefetch [A2 + (PREFETCHSIZE + 4) * SIZE], 1 + FMUL a13, b5, t2 + LDF [A2 + 12 * SIZE], a13 + + FADD c3, t3, c3 + LDF [X1 + 12 * SIZE], b5 + FMUL a6, b6, t3 + LDF [A1 + 13 * SIZE], a6 + + FADD c4, t4, c4 + FMUL a14, b6, t4 + LDF [A2 + 13 * SIZE], a14 + + FADD c1, t1, c1 + LDF [X1 + 13 * SIZE], b6 + FMUL a7, b7, t1 + LDF [A1 + 14 * SIZE], a7 + + FADD c2, t2, c2 + add X1, 8 * SIZE, X1 + FMUL a15, b7, t2 + LDF [A2 + 14 * SIZE], a15 + + FADD c3, t3, c3 + LDF [X1 + 6 * SIZE], b7 + FMUL a8, b8, t3 + LDF [A1 + 15 * SIZE], a8 + + FADD c4, t4, c4 + add A1, 8 * SIZE, A1 + FMUL a16, b8, t4 + LDF [A2 + 15 * SIZE], a16 + + bg,pn %icc, .LL111 + add A2, 8 * SIZE, A2 + +.LL112: + FADD c1, t1, c1 + LDF [X1 + 7 * SIZE], b8 + FMUL a1, b1, t1 + add A1, 8 * SIZE, A1 + + FADD c2, t2, c2 + add A2, 8 * SIZE, A2 + FMUL a9, b1, t2 + add X1, 8 * SIZE, X1 + + FADD c3, t3, c3 + FMUL a2, b2, t3 + FADD c4, t4, c4 + FMUL a10, b2, t4 + + FADD c1, t1, c1 + FMUL a3, b3, t1 + FADD c2, t2, c2 + FMUL a11, b3, t2 + + FADD c3, t3, c3 + FMUL a4, b4, t3 + FADD c4, t4, c4 + FMUL a12, b4, t4 + + FADD c1, t1, c1 + FMUL a5, b5, t1 + FADD c2, t2, c2 + FMUL a13, b5, t2 + + FADD c3, t3, c3 + FMUL a6, b6, t3 + FADD c4, t4, c4 + FMUL a14, b6, t4 + + FADD c1, t1, c1 + FMUL a7, b7, t1 + FADD c2, t2, c2 + FMUL a15, b7, t2 + + FADD c3, t3, c3 + FMUL a8, b8, t3 + FADD c4, t4, c4 + FMUL a16, b8, t4 + +.LL115: + andcc MIN_M, 7, I + ble %icc, .LL119 + mov Y1, Y2 + + LDF [X1 + 0 * SIZE], b1 + deccc I + LDF [A1 + 0 * SIZE], a1 + ble %icc, .LL117 + LDF [A2 + 0 * SIZE], a2 + +.LL116: + FADD c1, t1, c1 + add X1, 1 * SIZE, X1 + FMUL a1, b1, t1 + LDF [A1 + 1 * SIZE], a1 + + FADD c2, t2, c2 + add A1, 1 * SIZE, A1 + FMUL a2, b1, t2 + LDF [X1 + 0 * SIZE], b1 + + add A2, 1 * SIZE, A2 + deccc I + bg,pn %icc, .LL116 + LDF [A2 + 0 * SIZE], a2 + +.LL117: + FADD c1, t1, c1 + add X1, 1 * SIZE, X1 + FADD c2, t2, c2 + add A1, 1 * SIZE, A1 + + FMUL a1, b1, t1 + add A2, 1 * SIZE, A2 + FMUL a2, b1, t2 + nop + +.LL119: + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c3, t3, c3 + FADD c4, t4, c4 + + FADD c1, c3, c1 + FADD c2, c4, c2 + + + LDF [Y1], a1 + LDF [Y1 + INCY], a2 + + add Y1, INCY, Y1 + add Y1, INCY, Y1 + + LDF STACK_ALPHA, ALPHA + + FMUL ALPHA, c1, c1 + FMUL ALPHA, c2, c2 + FADD a1, c1, a1 + FADD a2, c2, a2 + + STF a1, [Y2] + STF a2, [Y2 + INCY] + + deccc J + bg %icc, .LL110 +#ifdef DOUBLE + FCLR(29) +#else + FCLR(30) +#endif + +.LL200: + andcc N, 1, J + nop + ble %icc, .LL400 + FMOV FZERO, c1 + +.LL310: + FMOV FZERO, t1 + sra MIN_M, 3, I + FMOV FZERO, c2 + mov A, A1 + FMOV FZERO, t2 + add A, LDA, A + FMOV FZERO, t3 + cmp I, 0 + FMOV FZERO, t4 + ble %icc, .LL315 + mov XP, X1 + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [A1 + 2 * SIZE], a3 + LDF [A1 + 3 * SIZE], a4 + LDF [A1 + 4 * SIZE], a5 + LDF [A1 + 5 * SIZE], a6 + LDF [A1 + 6 * SIZE], a7 + LDF [A1 + 7 * SIZE], a8 + add A1, 8 * SIZE, A1 + + LDF [X1 + 0 * SIZE], a9 + add I, -1, I + LDF [X1 + 1 * SIZE], a10 + cmp I, 0 + LDF [X1 + 2 * SIZE], a11 + LDF [X1 + 3 * SIZE], a12 + LDF [X1 + 4 * SIZE], a13 + LDF [X1 + 5 * SIZE], a14 + LDF [X1 + 6 * SIZE], a15 + LDF [X1 + 7 * SIZE], a16 + ble %icc, .LL312 + add X1, 8 * SIZE, X1 + +.LL311: + prefetch [A1 + PREFETCHSIZE * SIZE], 1 + + FADD c1, t1, c1 + FMUL a1, a9, t1 + LDF [A1 + 0 * SIZE], a1 + LDF [X1 + 0 * SIZE], a9 + + FADD c2, t2, c2 + FMUL a2, a10, t2 + LDF [A1 + 1 * SIZE], a2 + LDF [X1 + 1 * SIZE], a10 + + FADD c1, t3, c1 + add I, -1, I + FMUL a3, a11, t3 + LDF [A1 + 2 * SIZE], a3 + LDF [X1 + 2 * SIZE], a11 + + FADD c2, t4, c2 + cmp I, 0 + FMUL a4, a12, t4 + LDF [A1 + 3 * SIZE], a4 + LDF [X1 + 3 * SIZE], a12 + + FADD c1, t1, c1 + nop + FMUL a5, a13, t1 + LDF [A1 + 4 * SIZE], a5 + LDF [X1 + 4 * SIZE], a13 + + FADD c2, t2, c2 + nop + FMUL a6, a14, t2 + LDF [A1 + 5 * SIZE], a6 + LDF [X1 + 5 * SIZE], a14 + + FADD c1, t3, c1 + FMUL a7, a15, t3 + LDF [A1 + 6 * SIZE], a7 + LDF [X1 + 6 * SIZE], a15 + + FADD c2, t4, c2 + add X1, 8 * SIZE, X1 + FMUL a8, a16, t4 + LDF [A1 + 7 * SIZE], a8 + add A1, 8 * SIZE, A1 + bg,pn %icc, .LL311 + LDF [X1 - 1 * SIZE], a16 + +.LL312: + FADD c1, t1, c1 + FMUL a1, a9, t1 + FADD c2, t2, c2 + FMUL a2, a10, t2 + FADD c1, t3, c1 + FMUL a3, a11, t3 + FADD c2, t4, c2 + FMUL a4, a12, t4 + + FADD c1, t1, c1 + FMUL a5, a13, t1 + FADD c2, t2, c2 + FMUL a6, a14, t2 + FADD c1, t3, c1 + FMUL a7, a15, t3 + FADD c2, t4, c2 + FMUL a8, a16, t4 + +.LL315: + and MIN_M, 7, I + cmp I, 0 + ble %icc, .LL319 + nop + +.LL316: + LDF [A1 + 0 * SIZE], a1 + add A1, 1 * SIZE, A1 + LDF [X1 + 0 * SIZE], b1 + nop + + FADD c1, t1, c1 + nop + add I, -1, I + FMUL a1, b1, t1 + nop + cmp I, 0 + bg,pn %icc, .LL316 + add X1, 1 * SIZE, X1 + +.LL319: + FADD c1, t1, c1 + nop + FADD c2, t2, c2 + nop + FADD c1, t3, c1 + FADD c2, t4, c2 + + FADD c1, c2, c1 + + FMUL ALPHA, c1, c1 + LDF [Y1 + 0 * SIZE], a1 + FADD a1, c1, a1 + STF a1, [Y1 + 0 * SIZE] + add Y1, INCY, Y1 + +.LL400: + add IS, P, IS + cmp IS, M + bl %icc, .LL10 + add A, PNLDA, A + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/ger.S b/kernel/sparc/ger.S new file mode 100644 index 0000000..84cd525 --- /dev/null +++ b/kernel/sparc/ger.S @@ -0,0 +1,464 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 + +#if defined(DOUBLE) && !defined(__64BIT__) +#define X %i5 +#define INCX %i2 +#define Y %i3 +#define INCY %i4 +#else +#define X %i4 +#define INCX %i5 +#define Y %i2 +#define INCY %i3 +#endif + +#define A %l0 +#define LDA %l1 +#define BUFFER %l2 + +#define I %l3 +#define J %l4 + +#define A1 %o0 +#define X1 %o2 +#define XX %o3 + +#ifdef DOUBLE +#define t1 %f0 +#define t2 %f2 +#define t3 %f4 +#define t4 %f6 + +#define x1 %f8 +#define x2 %f10 +#define x3 %f12 +#define x4 %f14 +#define x5 %f16 +#define x6 %f18 +#define x7 %f20 +#define x8 %f22 + +#define a1 %f24 +#define a2 %f26 +#define a3 %f28 +#define a4 %f30 +#define a5 %f32 +#define a6 %f34 +#define a7 %f36 +#define a8 %f38 + +#define a9 %f40 +#define a10 %f42 +#define a11 %f44 +#define a12 %f46 +#define a13 %f48 +#define a14 %f50 +#define a15 %f52 +#define a16 %f54 + +#define y1 %f56 +#define y2 %f58 + +#define ALPHA %f60 + +#else +#define t1 %f0 +#define t2 %f1 +#define t3 %f2 +#define t4 %f3 + +#define x1 %f4 +#define x2 %f5 +#define x3 %f6 +#define x4 %f7 +#define x5 %f8 +#define x6 %f9 +#define x7 %f10 +#define x8 %f11 + +#define a1 %f12 +#define a2 %f13 +#define a3 %f14 +#define a4 %f15 +#define a5 %f16 +#define a6 %f17 +#define a7 %f18 +#define a8 %f19 + +#define a9 %f20 +#define a10 %f21 +#define a11 %f22 +#define a12 %f23 +#define a13 %f24 +#define a14 %f25 +#define a15 %f26 +#define a16 %f27 + +#define y1 %f28 +#define y2 %f29 +#define ALPHA %f30 +#endif + +#define PREFETCHSIZE 60 + + PROLOGUE + SAVESP + nop + +#ifndef __64BIT__ + +#ifdef DOUBLE + st %i3, [%sp + STACK_START + 16] + st %i4, [%sp + STACK_START + 20] + + ld [%sp + STACK_START + 28], INCX + ld [%sp + STACK_START + 32], Y + ld [%sp + STACK_START + 36], INCY + ld [%sp + STACK_START + 40], A + ld [%sp + STACK_START + 44], LDA + ld [%sp + STACK_START + 48], BUFFER +#else + st %i3, [%sp + STACK_START + 16] + + ld [%sp + STACK_START + 28], Y + ld [%sp + STACK_START + 32], INCY + ld [%sp + STACK_START + 36], A + ld [%sp + STACK_START + 40], LDA + ld [%sp + STACK_START + 44], BUFFER +#endif + LDF [%sp + STACK_START + 16], ALPHA +#else + ldx [%sp + STACK_START + 56], Y + ldx [%sp + STACK_START + 64], INCY + ldx [%sp + STACK_START + 72], A + ldx [%sp + STACK_START + 80], LDA + ldx [%sp + STACK_START + 88], BUFFER +#ifdef DOUBLE + FMOV %f6, ALPHA +#else + FMOV %f7, ALPHA +#endif +#endif + + sll LDA, BASE_SHIFT, LDA + + cmp M, 0 + ble %icc, .LL999 + sll INCX, BASE_SHIFT, INCX + cmp N, 0 + ble %icc, .LL999 + sll INCY, BASE_SHIFT, INCY + + cmp INCX, SIZE + be %icc, .LL10 + mov X, XX + + mov BUFFER, XX + mov BUFFER, X1 + + sra M, 3, J + cmp J, 0 + ble,pn %icc, .LL05 + nop + +.LL01: + LDF [X], a1 + add X, INCX, X + LDF [X], a2 + add X, INCX, X + LDF [X], a3 + add X, INCX, X + LDF [X], a4 + add X, INCX, X + LDF [X], a5 + add X, INCX, X + LDF [X], a6 + add X, INCX, X + LDF [X], a7 + add X, INCX, X + LDF [X], a8 + add X, INCX, X + + STF a1, [X1 + 0 * SIZE] + STF a2, [X1 + 1 * SIZE] + STF a3, [X1 + 2 * SIZE] + STF a4, [X1 + 3 * SIZE] + STF a5, [X1 + 4 * SIZE] + STF a6, [X1 + 5 * SIZE] + STF a7, [X1 + 6 * SIZE] + STF a8, [X1 + 7 * SIZE] + + add X1, 8 * SIZE, X1 + + deccc J + bg,pn %icc, .LL01 + nop + +.LL05: + andcc M, 7, J + ble,pn %icc, .LL10 + nop + +.LL06: + LDF [X], a1 + add X, INCX, X + + STF a1, [X1 + 0 * SIZE] + add X1, 1 * SIZE, X1 + + deccc J + bg,pn %icc, .LL06 + nop + +.LL10: + mov N, J + cmp N, 0 + ble,pn %icc, .LL999 + nop + +.LL11: + mov XX, X1 + + mov A, A1 + add A, LDA, A + + LDF [Y], y1 + add Y, INCY, Y + + FMUL ALPHA, y1, y1 + + sra M, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X1 + 0 * SIZE], x1 + LDF [A1 + 0 * SIZE], a1 + LDF [X1 + 1 * SIZE], x2 + LDF [A1 + 1 * SIZE], a2 + LDF [X1 + 2 * SIZE], x3 + LDF [A1 + 2 * SIZE], a3 + LDF [X1 + 3 * SIZE], x4 + LDF [A1 + 3 * SIZE], a4 + + LDF [X1 + 4 * SIZE], x5 + LDF [A1 + 4 * SIZE], a5 + LDF [X1 + 5 * SIZE], x6 + LDF [A1 + 5 * SIZE], a6 + LDF [X1 + 6 * SIZE], x7 + LDF [A1 + 6 * SIZE], a7 + LDF [X1 + 7 * SIZE], x8 + LDF [A1 + 7 * SIZE], a8 + + FMUL x1, y1, t1 + FMUL x2, y1, t2 + FMUL x3, y1, t3 + FMUL x4, y1, t4 + + FADD a1, t1, a1 + FMUL x5, y1, t1 + FADD a2, t2, a2 + FMUL x6, y1, t2 + + deccc I + ble,pn %icc, .LL13 + nop + +.LL12: + prefetch [A1 + PREFETCHSIZE * SIZE], 0 + + FADD a3, t3, a3 + LDF [X1 + 8 * SIZE], x1 + FMUL x7, y1, t3 + LDF [X1 + 9 * SIZE], x2 + FADD a4, t4, a4 + LDF [X1 + 10 * SIZE], x3 + FMUL x8, y1, t4 + LDF [X1 + 11 * SIZE], x4 + + FADD a5, t1, a5 + STF a1, [A1 + 0 * SIZE] + LDF [A1 + 8 * SIZE], a1 + FMUL x1, y1, t1 + STF a2, [A1 + 1 * SIZE] + LDF [A1 + 9 * SIZE], a2 + + FADD a6, t2, a6 + STF a3, [A1 + 2 * SIZE] + LDF [A1 + 10 * SIZE], a3 + FMUL x2, y1, t2 + STF a4, [A1 + 3 * SIZE] + LDF [A1 + 11 * SIZE], a4 + + FADD a7, t3, a7 + LDF [X1 + 12 * SIZE], x5 + FMUL x3, y1, t3 + LDF [X1 + 13 * SIZE], x6 + FADD a8, t4, a8 + LDF [X1 + 14 * SIZE], x7 + FMUL x4, y1, t4 + LDF [X1 + 15 * SIZE], x8 + + FADD a1, t1, a1 + STF a5, [A1 + 4 * SIZE] + deccc I + LDF [A1 + 12 * SIZE], a5 + FMUL x5, y1, t1 + STF a6, [A1 + 5 * SIZE] + LDF [A1 + 13 * SIZE], a6 + FADD a2, t2, a2 + STF a7, [A1 + 6 * SIZE] + LDF [A1 + 14 * SIZE], a7 + FMUL x6, y1, t2 + STF a8, [A1 + 7 * SIZE] + LDF [A1 + 15 * SIZE], a8 + add A1, 8 * SIZE, A1 + + bg,pn %icc, .LL12 + add X1, 8 * SIZE, X1 + +.LL13: + FADD a3, t3, a3 + FMUL x7, y1, t3 + FADD a4, t4, a4 + FMUL x8, y1, t4 + + FADD a5, t1, a5 + FADD a6, t2, a6 + FADD a7, t3, a7 + FADD a8, t4, a8 + + STF a1, [A1 + 0 * SIZE] + STF a2, [A1 + 1 * SIZE] + STF a3, [A1 + 2 * SIZE] + STF a4, [A1 + 3 * SIZE] + + STF a5, [A1 + 4 * SIZE] + STF a6, [A1 + 5 * SIZE] + STF a7, [A1 + 6 * SIZE] + STF a8, [A1 + 7 * SIZE] + + add A1, 8 * SIZE, A1 + add X1, 8 * SIZE, X1 + +.LL15: + andcc M, 4, I + ble,pn %icc, .LL16 + nop + + LDF [X1 + 0 * SIZE], x1 + LDF [A1 + 0 * SIZE], a1 + LDF [X1 + 1 * SIZE], x2 + LDF [A1 + 1 * SIZE], a2 + + LDF [X1 + 2 * SIZE], x3 + LDF [A1 + 2 * SIZE], a3 + LDF [X1 + 3 * SIZE], x4 + LDF [A1 + 3 * SIZE], a4 + + FMUL x1, y1, t1 + FMUL x2, y1, t2 + FMUL x3, y1, t3 + FMUL x4, y1, t4 + + FADD a1, t1, a1 + FADD a2, t2, a2 + FADD a3, t3, a3 + FADD a4, t4, a4 + + STF a1, [A1 + 0 * SIZE] + STF a2, [A1 + 1 * SIZE] + STF a3, [A1 + 2 * SIZE] + add X1, 4 * SIZE, X1 + STF a4, [A1 + 3 * SIZE] + add A1, 4 * SIZE, A1 + +.LL16: + andcc M, 2, I + ble,pn %icc, .LL17 + nop + + LDF [X1 + 0 * SIZE], x1 + LDF [X1 + 1 * SIZE], x2 + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + + FMUL x1, y1, t1 + FMUL x2, y1, t2 + + FADD a1, t1, a1 + FADD a2, t2, a2 + + STF a1, [A1 + 0 * SIZE] + add X1, 2 * SIZE, X1 + STF a2, [A1 + 1 * SIZE] + add A1, 2 * SIZE, A1 + +.LL17: + andcc M, 1, I + ble,pn %icc, .LL19 + nop + + LDF [X1 + 0 * SIZE], x1 + add X1, 1 * SIZE, X1 + + LDF [A1 + 0 * SIZE], a1 + + FMUL x1, y1, t1 + FADD a1, t1, a1 + + STF a1, [A1 + 0 * SIZE] + add A1, 1 * SIZE, A1 + +.LL19: + deccc J + bg %icc, .LL11 + nop + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/iamax.S b/kernel/sparc/iamax.S new file mode 100644 index 0000000..eb4a131 --- /dev/null +++ b/kernel/sparc/iamax.S @@ -0,0 +1,456 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 + +#define v1 %o0 +#define v2 %o1 +#define v3 %o2 +#define v4 %o3 +#define count %o4 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define c3 %f4 +#define c4 %f6 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 +#else +#define c1 %f0 +#define c2 %f1 +#define c3 %f2 +#define c4 %f3 +#define t1 %f4 +#define t2 %f5 +#define t3 %f6 +#define t4 %f7 + +#define a1 %f8 +#define a2 %f9 +#define a3 %f10 +#define a4 %f11 +#define a5 %f12 +#define a6 %f13 +#define a7 %f14 +#define a8 %f15 +#endif + +#ifndef USE_MIN +#define FCMOV FMOVG +#define CMOV movg +#else +#define FCMOV FMOVL +#define CMOV movl +#endif + + PROLOGUE + SAVESP + + FCLR(0) + + cmp N, 0 + ble .LL20 + clr v1 + + cmp INCX, 0 + ble .LL20 + sll INCX, BASE_SHIFT, INCX + + mov 1, v1 + + add N, -1, N + LDF [X], c4 + add X, INCX, X + cmp N, 0 + ble .LL20 + FABS c4, c1 + + FABS c4, c2 + mov 1, v2 + FABS c4, c3 + mov 1, v3 + FABS c4, c4 + mov 1, v4 + mov 2, count + + cmp INCX, SIZE + bne .LL50 + nop + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + + LDF [X + 4 * SIZE], a5 + add I, -1, I + LDF [X + 5 * SIZE], a6 + cmp I, 0 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + ble,pt %icc, .LL12 + add X, 8 * SIZE, X + +#define PREFETCHSIZE 40 + +.LL11: + FABS a1, t1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + FABS a2, t2 + LDF [X + 0 * SIZE], a1 + FABS a3, t3 + LDF [X + 1 * SIZE], a2 + FABS a4, t4 + LDF [X + 2 * SIZE], a3 + + FCMP %fcc0, t1, c1 + LDF [X + 3 * SIZE], a4 + FCMP %fcc1, t2, c2 + nop + + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + FCMOV %fcc1, t2, c2 + CMOV %fcc1, count, v2 + FCMOV %fcc2, t3, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, t4, c4 + CMOV %fcc3, count, v4 + add count, 4, count + + FABS a5, t1 + LDF [X + 4 * SIZE], a5 + FABS a6, t2 + LDF [X + 5 * SIZE], a6 + FABS a7, t3 + LDF [X + 6 * SIZE], a7 + FABS a8, t4 + LDF [X + 7 * SIZE], a8 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + nop + CMOV %fcc0, count, v1 + add I, -1, I + + FCMOV %fcc1, t2, c2 + cmp I, 0 + CMOV %fcc1, count, v2 + add X, 8 * SIZE, X + + FCMOV %fcc2, t3, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, t4, c4 + CMOV %fcc3, count, v4 + bg,pt %icc, .LL11 + add count, 4, count + +.LL12: + FABS a1, t1 + FABS a2, t2 + FABS a3, t3 + FABS a4, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + FCMOV %fcc1, t2, c2 + CMOV %fcc1, count, v2 + FCMOV %fcc2, t3, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, t4, c4 + CMOV %fcc3, count, v4 + add count, 4, count + + FABS a5, t1 + FABS a6, t2 + FABS a7, t3 + FABS a8, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + FCMOV %fcc1, t2, c2 + CMOV %fcc1, count, v2 + FCMOV %fcc2, t3, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, t4, c4 + CMOV %fcc3, count, v4 + add count, 4, count + +.LL15: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + FABS a1, t1 + FCMP %fcc0, t1, c1 + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + add I, -1, I + add count, 1, count + cmp I, 0 + bg,pt %icc, .LL16 + add X, 1 * SIZE, X + +.LL19: + FCMP %fcc0, c2, c1 + add v2, 1, v2 + FCMP %fcc1, c4, c3 + add v3, 2, v3 + add v4, 3, v4 + + FCMOV %fcc0, c2, c1 + CMOV %fcc0, v2, v1 + FCMOV %fcc1, c4, c3 + CMOV %fcc1, v4, v3 + FCMP %fcc0, c3, c1 + CMOV %fcc0, v3, v1 + +.LL20: + mov v1, %i0 + return %i7 + 8 + nop + +.LL50: + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + LDF [X + 0 * SIZE], a1 + add X, INCX, X + LDF [X + 0 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + add X, INCX, X + LDF [X + 0 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + add X, INCX, X + LDF [X + 0 * SIZE], a6 + add X, INCX, X + add I, -1, I + LDF [X + 0 * SIZE], a7 + cmp I, 0 + add X, INCX, X + LDF [X + 0 * SIZE], a8 + ble,pt %icc, .LL52 + add X, INCX, X + +.LL51: + FABS a1, t1 + LDF [X + 0 * SIZE], a1 + add X, INCX, X + FABS a2, t2 + LDF [X + 0 * SIZE], a2 + add X, INCX, X + FABS a3, t3 + LDF [X + 0 * SIZE], a3 + add X, INCX, X + FABS a4, t4 + LDF [X + 0 * SIZE], a4 + add X, INCX, X + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + FCMOV %fcc1, t2, c2 + CMOV %fcc1, count, v2 + FCMOV %fcc2, t3, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, t4, c4 + CMOV %fcc3, count, v4 + add count, 4, count + + FABS a5, t1 + LDF [X + 0 * SIZE], a5 + add X, INCX, X + FABS a6, t2 + LDF [X + 0 * SIZE], a6 + add X, INCX, X + FABS a7, t3 + LDF [X + 0 * SIZE], a7 + add X, INCX, X + FABS a8, t4 + LDF [X + 0 * SIZE], a8 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + add I, -1, I + FCMOV %fcc1, t2, c2 + CMOV %fcc1, count, v2 + cmp I, 0 + FCMOV %fcc2, t3, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, t4, c4 + CMOV %fcc3, count, v4 + add count, 4, count + + bg,pt %icc, .LL51 + add X, INCX, X + +.LL52: + FABS a1, t1 + FABS a2, t2 + FABS a3, t3 + FABS a4, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + FCMOV %fcc1, t2, c2 + CMOV %fcc1, count, v2 + FCMOV %fcc2, t3, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, t4, c4 + CMOV %fcc3, count, v4 + add count, 4, count + + FABS a5, t1 + FABS a6, t2 + FABS a7, t3 + FABS a8, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + FCMOV %fcc1, t2, c2 + CMOV %fcc1, count, v2 + FCMOV %fcc2, t3, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, t4, c4 + CMOV %fcc3, count, v4 + add count, 4, count + +.LL55: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + FABS a1, t1 + FCMP %fcc0, t1, c1 + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + add I, -1, I + add count, 1, count + cmp I, 0 + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + FCMP %fcc0, c2, c1 + add v2, 1, v2 + FCMP %fcc1, c4, c3 + add v3, 2, v3 + add v4, 3, v4 + + FCMOV %fcc0, c2, c1 + CMOV %fcc0, v2, v1 + FCMOV %fcc1, c4, c3 + CMOV %fcc1, v4, v3 + FCMP %fcc0, c3, c1 + CMOV %fcc0, v3, v1 + + mov v1, %i0 + return %i7 + 8 + nop + + EPILOGUE diff --git a/kernel/sparc/imax.S b/kernel/sparc/imax.S new file mode 100644 index 0000000..c24e182 --- /dev/null +++ b/kernel/sparc/imax.S @@ -0,0 +1,419 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 + +#define v1 %o0 +#define v2 %o1 +#define v3 %o2 +#define v4 %o3 +#define count %o4 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define c3 %f4 +#define c4 %f6 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 +#else +#define c1 %f0 +#define c2 %f1 +#define c3 %f2 +#define c4 %f3 +#define t1 %f4 +#define t2 %f5 +#define t3 %f6 +#define t4 %f7 + +#define a1 %f8 +#define a2 %f9 +#define a3 %f10 +#define a4 %f11 +#define a5 %f12 +#define a6 %f13 +#define a7 %f14 +#define a8 %f15 +#endif + +#ifndef USE_MIN +#define FCMOV FMOVG +#define CMOV movg +#else +#define FCMOV FMOVL +#define CMOV movl +#endif + + PROLOGUE + SAVESP + + FCLR(0) + + cmp N, 0 + ble .LL20 + clr v1 + + cmp INCX, 0 + ble .LL20 + sll INCX, BASE_SHIFT, INCX + + mov 1, v1 + + add N, -1, N + LDF [X], c1 + add X, INCX, X + cmp N, 0 + ble .LL20 + nop + + FMOV c1, c2 + mov 1, v2 + FMOV c1, c3 + mov 1, v3 + FMOV c1, c4 + mov 1, v4 + mov 2, count + + cmp INCX, SIZE + bne .LL50 + nop + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + + LDF [X + 4 * SIZE], a5 + LDF [X + 5 * SIZE], a6 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + add X, 8 * SIZE, X + + add I, -1, I + cmp I, 0 + ble,pt %icc, .LL12 + nop + +#define PREFETCHSIZE 40 + +.LL11: + FCMP %fcc0, a1, c1 + FCMP %fcc1, a2, c2 + FCMP %fcc2, a3, c3 + FCMP %fcc3, a4, c4 + + FCMOV %fcc0, a1, c1 + CMOV %fcc0, count, v1 + LDF [X + 0 * SIZE], a1 + FCMOV %fcc1, a2, c2 + CMOV %fcc1, count, v2 + LDF [X + 1 * SIZE], a2 + FCMOV %fcc2, a3, c3 + CMOV %fcc2, count, v3 + LDF [X + 2 * SIZE], a3 + FCMOV %fcc3, a4, c4 + CMOV %fcc3, count, v4 + LDF [X + 3 * SIZE], a4 + add count, 4, count + + FCMP %fcc0, a5, c1 + FCMP %fcc1, a6, c2 + FCMP %fcc2, a7, c3 + FCMP %fcc3, a8, c4 + + FCMOV %fcc0, a5, c1 + CMOV %fcc0, count, v1 + LDF [X + 4 * SIZE], a5 + add I, -1, I + FCMOV %fcc1, a6, c2 + CMOV %fcc1, count, v2 + LDF [X + 5 * SIZE], a6 + cmp I, 0 + FCMOV %fcc2, a7, c3 + CMOV %fcc2, count, v3 + LDF [X + 6 * SIZE], a7 + FCMOV %fcc3, a8, c4 + CMOV %fcc3, count, v4 + LDF [X + 7 * SIZE], a8 + add count, 4, count + + bg,pt %icc, .LL11 + add X, 8 * SIZE, X + +.LL12: + FCMP %fcc0, a1, c1 + FCMP %fcc1, a2, c2 + FCMP %fcc2, a3, c3 + FCMP %fcc3, a4, c4 + + FCMOV %fcc0, a1, c1 + CMOV %fcc0, count, v1 + FCMOV %fcc1, a2, c2 + CMOV %fcc1, count, v2 + FCMOV %fcc2, a3, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, a4, c4 + CMOV %fcc3, count, v4 + add count, 4, count + + FCMP %fcc0, a5, c1 + FCMP %fcc1, a6, c2 + FCMP %fcc2, a7, c3 + FCMP %fcc3, a8, c4 + + FCMOV %fcc0, a5, c1 + CMOV %fcc0, count, v1 + FCMOV %fcc1, a6, c2 + CMOV %fcc1, count, v2 + FCMOV %fcc2, a7, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, a8, c4 + CMOV %fcc3, count, v4 + add count, 4, count + +.LL15: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + FCMP %fcc0, a1, c1 + FCMOV %fcc0, a1, c1 + CMOV %fcc0, count, v1 + add I, -1, I + cmp I, 0 + add count, 1, count + bg,pt %icc, .LL16 + add X, 1 * SIZE, X + +.LL19: + FCMP %fcc0, c2, c1 + add v2, 1, v2 + FCMP %fcc1, c4, c3 + add v3, 2, v3 + add v4, 3, v4 + + FCMOV %fcc0, c2, c1 + CMOV %fcc0, v2, v1 + FCMOV %fcc1, c4, c3 + CMOV %fcc1, v4, v3 + FCMP %fcc0, c3, c1 + CMOV %fcc0, v3, v1 + +.LL20: + mov v1, %i0 + return %i7 + 8 + nop + + +.LL50: + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + LDF [X + 0 * SIZE], a1 + add X, INCX, X + LDF [X + 0 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + add X, INCX, X + LDF [X + 0 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + add X, INCX, X + LDF [X + 0 * SIZE], a6 + add X, INCX, X + add I, -1, I + LDF [X + 0 * SIZE], a7 + cmp I, 0 + add X, INCX, X + LDF [X + 0 * SIZE], a8 + + ble,pt %icc, .LL52 + add X, INCX, X + +.LL51: + FCMP %fcc0, a1, c1 + FCMP %fcc1, a2, c2 + FCMP %fcc2, a3, c3 + FCMP %fcc3, a4, c4 + + FCMOV %fcc0, a1, c1 + CMOV %fcc0, count, v1 + LDF [X + 0 * SIZE], a1 + add X, INCX, X + + FCMOV %fcc1, a2, c2 + CMOV %fcc1, count, v2 + LDF [X + 0 * SIZE], a2 + add X, INCX, X + + FCMOV %fcc2, a3, c3 + CMOV %fcc2, count, v3 + LDF [X + 0 * SIZE], a3 + add X, INCX, X + + FCMOV %fcc3, a4, c4 + CMOV %fcc3, count, v4 + LDF [X + 0 * SIZE], a4 + add X, INCX, X + add count, 4, count + + FCMP %fcc0, a5, c1 + FCMP %fcc1, a6, c2 + FCMP %fcc2, a7, c3 + FCMP %fcc3, a8, c4 + + FCMOV %fcc0, a5, c1 + CMOV %fcc0, count, v1 + LDF [X + 0 * SIZE], a5 + add X, INCX, X + + FCMOV %fcc1, a6, c2 + add I, -1, I + CMOV %fcc1, count, v2 + LDF [X + 0 * SIZE], a6 + add X, INCX, X + + FCMOV %fcc2, a7, c3 + CMOV %fcc2, count, v3 + LDF [X + 0 * SIZE], a7 + add X, INCX, X + + cmp I, 0 + FCMOV %fcc3, a8, c4 + CMOV %fcc3, count, v4 + LDF [X + 0 * SIZE], a8 + add count, 4, count + + bg,pt %icc, .LL51 + add X, INCX, X + +.LL52: + FCMP %fcc0, a1, c1 + FCMP %fcc1, a2, c2 + FCMP %fcc2, a3, c3 + FCMP %fcc3, a4, c4 + + FCMOV %fcc0, a1, c1 + CMOV %fcc0, count, v1 + FCMOV %fcc1, a2, c2 + CMOV %fcc1, count, v2 + FCMOV %fcc2, a3, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, a4, c4 + CMOV %fcc3, count, v4 + add count, 4, count + + FCMP %fcc0, a5, c1 + FCMP %fcc1, a6, c2 + FCMP %fcc2, a7, c3 + FCMP %fcc3, a8, c4 + + FCMOV %fcc0, a5, c1 + CMOV %fcc0, count, v1 + FCMOV %fcc1, a6, c2 + CMOV %fcc1, count, v2 + FCMOV %fcc2, a7, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, a8, c4 + CMOV %fcc3, count, v4 + add count, 4, count + +.LL55: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + FCMP %fcc0, a1, c1 + FCMOV %fcc0, a1, c1 + CMOV %fcc0, count, v1 + add I, -1, I + add count, 1, count + cmp I, 0 + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + FCMP %fcc0, c2, c1 + add v2, 1, v2 + FCMP %fcc1, c4, c3 + add v3, 2, v3 + add v4, 3, v4 + + FCMOV %fcc0, c2, c1 + CMOV %fcc0, v2, v1 + FCMOV %fcc1, c4, c3 + CMOV %fcc1, v4, v3 + FCMP %fcc0, c3, c1 + CMOV %fcc0, v3, v1 + + mov v1, %i0 + return %i7 + 8 + nop + + + EPILOGUE diff --git a/kernel/sparc/izamax.S b/kernel/sparc/izamax.S new file mode 100644 index 0000000..3d0a48e --- /dev/null +++ b/kernel/sparc/izamax.S @@ -0,0 +1,425 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 + +#define v1 %o0 +#define v2 %o1 +#define v3 %o2 +#define v4 %o3 +#define count %o4 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define c3 %f4 +#define c4 %f6 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 +#define t5 %f16 +#define t6 %f18 +#define t7 %f20 +#define t8 %f22 + +#define a1 %f24 +#define a2 %f26 +#define a3 %f28 +#define a4 %f30 +#define a5 %f32 +#define a6 %f34 +#define a7 %f36 +#define a8 %f38 +#else +#define c1 %f0 +#define c2 %f1 +#define c3 %f2 +#define c4 %f3 +#define t1 %f4 +#define t2 %f5 +#define t3 %f6 +#define t4 %f7 +#define t5 %f8 +#define t6 %f9 +#define t7 %f10 +#define t8 %f11 + +#define a1 %f12 +#define a2 %f13 +#define a3 %f14 +#define a4 %f15 +#define a5 %f16 +#define a6 %f17 +#define a7 %f18 +#define a8 %f19 +#endif + +#ifndef USE_MIN +#define FCMOV FMOVG +#define CMOV movg +#else +#define FCMOV FMOVL +#define CMOV movl +#endif + + + PROLOGUE + SAVESP + + FCLR(0) + + cmp N, 0 + ble .LL20 + clr v1 + + cmp INCX, 0 + ble .LL20 + sll INCX, ZBASE_SHIFT, INCX + + mov 1, v1 + + LDF [X + 0 * SIZE], c1 + LDF [X + 1 * SIZE], c2 + add N, -1, N + FABS c1, c1 + add X, INCX, X + FABS c2, c2 + cmp N, 0 + ble .LL20 + FADD c1, c2, c1 + + FMOV c1, c2 + mov 1, v2 + FMOV c1, c3 + mov 1, v3 + FMOV c1, c4 + mov 1, v4 + mov 2, count + + cmp INCX, 2 * SIZE + bne .LL50 + nop + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + + LDF [X + 4 * SIZE], a5 + add I, -1, I + LDF [X + 5 * SIZE], a6 + cmp I, 0 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + ble,pt %icc, .LL12 + add X, 8 * SIZE, X + +#define PREFETCHSIZE 32 + +.LL11: + prefetch [X + PREFETCHSIZE * SIZE], 0 + + FABS a1, t1 + LDF [X + 0 * SIZE], a1 + FABS a2, t2 + LDF [X + 1 * SIZE], a2 + FABS a3, t3 + LDF [X + 2 * SIZE], a3 + FABS a4, t4 + LDF [X + 3 * SIZE], a4 + + FABS a5, t5 + LDF [X + 4 * SIZE], a5 + FABS a6, t6 + LDF [X + 5 * SIZE], a6 + FABS a7, t7 + LDF [X + 6 * SIZE], a7 + FABS a8, t8 + LDF [X + 7 * SIZE], a8 + + FADD t1, t2, t1 + FADD t3, t4, t3 + FADD t5, t6, t5 + FADD t7, t8, t7 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t3, c2 + FCMP %fcc2, t5, c3 + FCMP %fcc3, t7, c4 + + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + add I, -1, I + FCMOV %fcc1, t3, c2 + CMOV %fcc1, count, v2 + cmp I, 0 + FCMOV %fcc2, t5, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, t7, c4 + CMOV %fcc3, count, v4 + add count, 4, count + + bg,pt %icc, .LL11 + add X, 8 * SIZE, X + +.LL12: + FABS a1, t1 + FABS a2, t2 + FABS a3, t3 + FABS a4, t4 + + FABS a5, t5 + FABS a6, t6 + FABS a7, t7 + FABS a8, t8 + + FADD t1, t2, t1 + FADD t3, t4, t3 + FADD t5, t6, t5 + FADD t7, t8, t7 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t3, c2 + FCMP %fcc2, t5, c3 + FCMP %fcc3, t7, c4 + + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + FCMOV %fcc1, t3, c2 + CMOV %fcc1, count, v2 + FCMOV %fcc2, t5, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, t7, c4 + CMOV %fcc3, count, v4 + add count, 4, count + +.LL15: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + + FABS a1, t1 + FABS a2, t2 + FADD t1, t2, t1 + FCMP %fcc0, t1, c1 + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + add count, 1, count + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL16 + add X, 2 * SIZE, X + +.LL19: + FCMP %fcc0, c2, c1 + add v2, 1, v2 + FCMP %fcc1, c4, c3 + add v3, 2, v3 + add v4, 3, v4 + + FCMOV %fcc0, c2, c1 + CMOV %fcc0, v2, v1 + FCMOV %fcc1, c4, c3 + CMOV %fcc1, v4, v3 + FCMP %fcc0, c3, c1 + CMOV %fcc0, v3, v1 + +.LL20: + mov v1, %i0 + return %i7 + 8 + nop + +.LL50: + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + add I, -1, I + LDF [X + 0 * SIZE], a7 + cmp I, 0 + LDF [X + 1 * SIZE], a8 + ble,pt %icc, .LL52 + add X, INCX, X + +.LL51: + FABS a1, t1 + LDF [X + 0 * SIZE], a1 + FABS a2, t2 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + FABS a3, t3 + LDF [X + 0 * SIZE], a3 + FABS a4, t4 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + + FABS a5, t5 + LDF [X + 0 * SIZE], a5 + FABS a6, t6 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + FABS a7, t7 + LDF [X + 0 * SIZE], a7 + FABS a8, t8 + LDF [X + 1 * SIZE], a8 + + FADD t1, t2, t1 + FADD t3, t4, t3 + FADD t5, t6, t5 + FADD t7, t8, t7 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t3, c2 + FCMP %fcc2, t5, c3 + FCMP %fcc3, t7, c4 + + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + add I, -1, I + FCMOV %fcc1, t3, c2 + CMOV %fcc1, count, v2 + cmp I, 0 + FCMOV %fcc2, t5, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, t7, c4 + CMOV %fcc3, count, v4 + add count, 4, count + + bg,pt %icc, .LL51 + add X, INCX, X + +.LL52: + FABS a1, t1 + FABS a2, t2 + FABS a3, t3 + FABS a4, t4 + + FABS a5, t5 + FABS a6, t6 + FABS a7, t7 + FABS a8, t8 + + FADD t1, t2, t1 + FADD t3, t4, t3 + FADD t5, t6, t5 + FADD t7, t8, t7 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t3, c2 + FCMP %fcc2, t5, c3 + FCMP %fcc3, t7, c4 + + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + FCMOV %fcc1, t3, c2 + CMOV %fcc1, count, v2 + FCMOV %fcc2, t5, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, t7, c4 + CMOV %fcc3, count, v4 + add count, 4, count + +.LL55: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + + FABS a1, t1 + add I, -1, I + FABS a2, t2 + cmp I, 0 + FADD t1, t2, t1 + FCMP %fcc0, t1, c1 + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + add count, 1, count + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + FCMP %fcc0, c2, c1 + add v2, 1, v2 + FCMP %fcc1, c4, c3 + add v3, 2, v3 + add v4, 3, v4 + + FCMOV %fcc0, c2, c1 + CMOV %fcc0, v2, v1 + FCMOV %fcc1, c4, c3 + CMOV %fcc1, v4, v3 + FCMP %fcc0, c3, c1 + CMOV %fcc0, v3, v1 + + mov v1, %i0 + return %i7 + 8 + nop + + EPILOGUE diff --git a/kernel/sparc/lsame.S b/kernel/sparc/lsame.S new file mode 100644 index 0000000..778301f --- /dev/null +++ b/kernel/sparc/lsame.S @@ -0,0 +1,66 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define A %o0 +#define B %o1 +#define AA %o4 +#define BB %o3 + + PROLOGUE + + ldub [A], A + ldub [B], B + add A, -32, AA + add B, -32, BB + + cmp A, 96 + movge %icc, AA, A + + cmp B, 96 + movge %icc, BB, B + + clr %g1 + cmp A, B + move %icc, 1, %g1 + retl + mov %g1, %o0 + + EPILOGUE diff --git a/kernel/sparc/max.S b/kernel/sparc/max.S new file mode 100644 index 0000000..1a4bc44 --- /dev/null +++ b/kernel/sparc/max.S @@ -0,0 +1,339 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define c3 %f4 +#define c4 %f6 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 +#else +#define c1 %f0 +#define c2 %f1 +#define c3 %f2 +#define c4 %f3 +#define t1 %f4 +#define t2 %f5 +#define t3 %f6 +#define t4 %f7 + +#define a1 %f8 +#define a2 %f9 +#define a3 %f10 +#define a4 %f11 +#define a5 %f12 +#define a6 %f13 +#define a7 %f14 +#define a8 %f15 +#endif + +#ifndef USE_MIN +#define FCMOV FMOVG +#else +#define FCMOV FMOVL +#endif + + PROLOGUE + SAVESP + + FCLR(0) + + cmp N, 0 + ble .LL20 + nop + + cmp INCX, 0 + ble .LL20 + sll INCX, BASE_SHIFT, INCX + + add N, -1, N + LDF [X], c1 + add X, INCX, X + cmp N, 0 + ble .LL20 + nop + + FMOV c1, c2 + FMOV c1, c3 + FMOV c1, c4 + + cmp INCX, SIZE + bne .LL50 + nop + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + + LDF [X + 4 * SIZE], a5 + LDF [X + 5 * SIZE], a6 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + add X, 8 * SIZE, X + + add I, -1, I + cmp I, 0 + ble,pt %icc, .LL12 + nop + +#define PREFETCHSIZE 40 + +.LL11: + FCMP %fcc0, a1, c1 + FCMP %fcc1, a2, c2 + FCMP %fcc2, a3, c3 + FCMP %fcc3, a4, c4 + + FCMOV %fcc0, a1, c1 + LDF [X + 0 * SIZE], a1 + FCMOV %fcc1, a2, c2 + LDF [X + 1 * SIZE], a2 + FCMOV %fcc2, a3, c3 + LDF [X + 2 * SIZE], a3 + FCMOV %fcc3, a4, c4 + LDF [X + 3 * SIZE], a4 + + FCMP %fcc0, a5, c1 + FCMP %fcc1, a6, c2 + FCMP %fcc2, a7, c3 + FCMP %fcc3, a8, c4 + + FCMOV %fcc0, a5, c1 + LDF [X + 4 * SIZE], a5 + add I, -1, I + FCMOV %fcc1, a6, c2 + LDF [X + 5 * SIZE], a6 + cmp I, 0 + FCMOV %fcc2, a7, c3 + LDF [X + 6 * SIZE], a7 + FCMOV %fcc3, a8, c4 + LDF [X + 7 * SIZE], a8 + + bg,pt %icc, .LL11 + add X, 8 * SIZE, X + +.LL12: + FCMP %fcc0, a1, c1 + FCMP %fcc1, a2, c2 + FCMP %fcc2, a3, c3 + FCMP %fcc3, a4, c4 + + FCMOV %fcc0, a1, c1 + FCMOV %fcc1, a2, c2 + FCMOV %fcc2, a3, c3 + FCMOV %fcc3, a4, c4 + + FCMP %fcc0, a5, c1 + FCMP %fcc1, a6, c2 + FCMP %fcc2, a7, c3 + FCMP %fcc3, a8, c4 + + FCMOV %fcc0, a5, c1 + FCMOV %fcc1, a6, c2 + FCMOV %fcc2, a7, c3 + FCMOV %fcc3, a8, c4 + +.LL15: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + FCMP %fcc0, a1, c1 + FCMOV %fcc0, a1, c1 + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL16 + add X, 1 * SIZE, X + +.LL19: + FCMP %fcc0, c2, c1 + FCMP %fcc1, c4, c3 + FCMOV %fcc0, c2, c1 + FCMOV %fcc1, c4, c3 + FCMP %fcc0, c3, c1 + FCMOV %fcc0, c3, c1 + +.LL20: + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + LDF [X + 0 * SIZE], a1 + add X, INCX, X + LDF [X + 0 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + add X, INCX, X + LDF [X + 0 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + add X, INCX, X + LDF [X + 0 * SIZE], a6 + add X, INCX, X + add I, -1, I + LDF [X + 0 * SIZE], a7 + cmp I, 0 + add X, INCX, X + LDF [X + 0 * SIZE], a8 + + ble,pt %icc, .LL52 + add X, INCX, X + +.LL51: + FCMP %fcc0, a1, c1 + FCMP %fcc1, a2, c2 + FCMP %fcc2, a3, c3 + FCMP %fcc3, a4, c4 + + FCMOV %fcc0, a1, c1 + LDF [X + 0 * SIZE], a1 + add X, INCX, X + FCMOV %fcc1, a2, c2 + LDF [X + 0 * SIZE], a2 + add X, INCX, X + FCMOV %fcc2, a3, c3 + LDF [X + 0 * SIZE], a3 + add X, INCX, X + FCMOV %fcc3, a4, c4 + LDF [X + 0 * SIZE], a4 + add X, INCX, X + + FCMP %fcc0, a5, c1 + add I, -1, I + FCMP %fcc1, a6, c2 + cmp I, 0 + FCMP %fcc2, a7, c3 + FCMP %fcc3, a8, c4 + + FCMOV %fcc0, a5, c1 + LDF [X + 0 * SIZE], a5 + add X, INCX, X + FCMOV %fcc1, a6, c2 + LDF [X + 0 * SIZE], a6 + add X, INCX, X + FCMOV %fcc2, a7, c3 + LDF [X + 0 * SIZE], a7 + add X, INCX, X + FCMOV %fcc3, a8, c4 + LDF [X + 0 * SIZE], a8 + + bg,pt %icc, .LL51 + add X, INCX, X + +.LL52: + FCMP %fcc0, a1, c1 + FCMP %fcc1, a2, c2 + FCMP %fcc2, a3, c3 + FCMP %fcc3, a4, c4 + + FCMOV %fcc0, a1, c1 + FCMOV %fcc1, a2, c2 + FCMOV %fcc2, a3, c3 + FCMOV %fcc3, a4, c4 + + FCMP %fcc0, a5, c1 + FCMP %fcc1, a6, c2 + FCMP %fcc2, a7, c3 + FCMP %fcc3, a8, c4 + + FCMOV %fcc0, a5, c1 + FCMOV %fcc1, a6, c2 + FCMOV %fcc2, a7, c3 + FCMOV %fcc3, a8, c4 + +.LL55: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + FCMP %fcc0, a1, c1 + FCMOV %fcc0, a1, c1 + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + FCMP %fcc0, c2, c1 + FCMP %fcc1, c4, c3 + FCMOV %fcc0, c2, c1 + FCMOV %fcc1, c4, c3 + FCMP %fcc0, c3, c1 + FCMOV %fcc0, c3, c1 + + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/rot.S b/kernel/sparc/rot.S new file mode 100644 index 0000000..f5c5770 --- /dev/null +++ b/kernel/sparc/rot.S @@ -0,0 +1,668 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define Y %i3 +#define INCY %i4 +#define I %i5 + +#define XX %l0 +#define YY %l1 + +#ifdef DOUBLE +#define a1 %f4 +#define a2 %f6 +#define a3 %f8 +#define a4 %f10 +#define a5 %f12 +#define a6 %f14 +#define a7 %f16 +#define a8 %f18 +#define b1 %f20 +#define b2 %f22 +#define b3 %f24 +#define b4 %f26 +#define b5 %f28 +#define b6 %f30 +#define b7 %f32 +#define b8 %f34 + +#define c1 %f36 +#define c2 %f38 +#define c3 %f40 +#define c4 %f42 +#define c5 %f44 +#define c6 %f46 +#define c7 %f48 +#define c8 %f50 + +#define t1 %f52 +#define t2 %f54 +#define t3 %f56 +#define t4 %f58 +#else +#define a1 %f2 +#define a2 %f3 +#define a3 %f4 +#define a4 %f5 +#define a5 %f6 +#define a6 %f7 +#define a7 %f8 +#define a8 %f9 +#define b1 %f10 +#define b2 %f11 +#define b3 %f12 +#define b4 %f13 +#define b5 %f14 +#define b6 %f15 +#define b7 %f16 +#define b8 %f17 + +#define c1 %f18 +#define c2 %f19 +#define c3 %f20 +#define c4 %f21 +#define c5 %f22 +#define c6 %f23 +#define c7 %f24 +#define c8 %f25 + +#define t1 %f26 +#define t2 %f27 +#define t3 %f28 +#define t4 %f29 +#endif + +#ifdef DOUBLE +#define C %f0 +#define S %f2 +#else +#define C %f0 +#define S %f1 +#endif + + PROLOGUE + SAVESP + +#ifndef __64BIT__ + +#ifdef DOUBLE + st %i5, [%sp + STACK_START + 24] + + LDF [%sp + STACK_START + 24], C + LDF [%sp + STACK_START + 32], S +#else + st %i5, [%sp + STACK_START + 24] + + LDF [%sp + STACK_START + 24], C + LDF [%sp + STACK_START + 28], S +#endif +#else +#ifdef DOUBLE + FMOV %f10, C + FMOV %f12, S +#else + FMOV %f11, C + FMOV %f13, S +#endif +#endif + + cmp N, 0 + ble .LL19 + nop + + sll INCX, BASE_SHIFT, INCX + sll INCY, BASE_SHIFT, INCY + + cmp INCX, SIZE + bne .LL50 + nop + + cmp INCY, SIZE + bne .LL50 + nop + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + LDF [X + 1 * SIZE], a2 + LDF [Y + 1 * SIZE], b2 + LDF [X + 2 * SIZE], a3 + LDF [Y + 2 * SIZE], b3 + LDF [X + 3 * SIZE], a4 + LDF [Y + 3 * SIZE], b4 + + LDF [X + 4 * SIZE], a5 + LDF [Y + 4 * SIZE], b5 + LDF [X + 5 * SIZE], a6 + LDF [Y + 5 * SIZE], b6 + LDF [X + 6 * SIZE], a7 + LDF [Y + 6 * SIZE], b7 + LDF [X + 7 * SIZE], a8 + LDF [Y + 7 * SIZE], b8 + + FMUL C, a1, c1 + FMUL S, b1, c2 + FMUL C, b1, c3 + LDF [Y + 8 * SIZE], b1 + FMUL S, a1, c4 + LDF [X + 8 * SIZE], a1 + + FMUL C, a2, c5 + FMUL S, b2, c6 + FADD c1, c2, t1 + + FMUL C, b2, c7 + LDF [Y + 9 * SIZE], b2 + FMUL S, a2, c8 + LDF [X + 9 * SIZE], a2 + FSUB c3, c4, t2 + + addcc I, -1, I + ble,pt %icc, .LL12 + nop + +#define PREFETCHSIZE 64 + +.LL11: + FMUL C, a3, c1 + nop + prefetch [Y + PREFETCHSIZE * SIZE], 1 + nop + + FMUL S, b3, c2 + STF t1, [X + 0 * SIZE] + FADD c5, c6, t3 + nop + + FMUL C, b3, c3 + LDF [Y + 10 * SIZE], b3 + nop + nop + + FMUL S, a3, c4 + STF t2, [Y + 0 * SIZE] + FSUB c7, c8, t4 + nop + + FMUL C, a4, c5 + LDF [X + 10 * SIZE], a3 + nop + nop + + FMUL S, b4, c6 + STF t3, [X + 1 * SIZE] + FADD c1, c2, t1 + nop + + FMUL C, b4, c7 + LDF [Y + 11 * SIZE], b4 + nop + nop + + FMUL S, a4, c8 + STF t4, [Y + 1 * SIZE] + FSUB c3, c4, t2 + nop + + FMUL C, a5, c1 + LDF [X + 11 * SIZE], a4 + nop + nop + + FMUL S, b5, c2 + STF t1, [X + 2 * SIZE] + FADD c5, c6, t3 + nop + + FMUL C, b5, c3 + LDF [Y + 12 * SIZE], b5 + nop + nop + + FMUL S, a5, c4 + STF t2, [Y + 2 * SIZE] + FSUB c7, c8, t4 + nop + + FMUL C, a6, c5 + LDF [X + 12 * SIZE], a5 + nop + nop + + FMUL S, b6, c6 + STF t3, [X + 3 * SIZE] + FADD c1, c2, t1 + nop + + FMUL C, b6, c7 + LDF [Y + 13 * SIZE], b6 + nop + nop + + FMUL S, a6, c8 + STF t4, [Y + 3 * SIZE] + FSUB c3, c4, t2 + nop + + FMUL C, a7, c1 + LDF [X + 13 * SIZE], a6 + nop + nop + + FMUL S, b7, c2 + STF t1, [X + 4 * SIZE] + FADD c5, c6, t3 + nop + + FMUL C, b7, c3 + LDF [Y + 14 * SIZE], b7 + nop + nop + + FMUL S, a7, c4 + STF t2, [Y + 4 * SIZE] + FSUB c7, c8, t4 + nop + + FMUL C, a8, c5 + LDF [X + 14 * SIZE], a7 + nop + nop + + FMUL S, b8, c6 + STF t3, [X + 5 * SIZE] + FADD c1, c2, t1 + nop + + FMUL C, b8, c7 + LDF [Y + 15 * SIZE], b8 + nop + nop + + FMUL S, a8, c8 + STF t4, [Y + 5 * SIZE] + FSUB c3, c4, t2 + nop + + FMUL C, a1, c1 + LDF [X + 15 * SIZE], a8 + addcc I, -1, I + nop + + FMUL S, b1, c2 + STF t1, [X + 6 * SIZE] + FADD c5, c6, t3 + nop + + FMUL C, b1, c3 + LDF [Y + 16 * SIZE], b1 + nop + nop + + FMUL S, a1, c4 + STF t2, [Y + 6 * SIZE] + FSUB c7, c8, t4 + nop + + FMUL C, a2, c5 + LDF [X + 16 * SIZE], a1 + add Y, 8 * SIZE, Y + nop + + FMUL S, b2, c6 + STF t3, [X + 7 * SIZE] + FADD c1, c2, t1 + nop + + FMUL C, b2, c7 + LDF [Y + 9 * SIZE], b2 + add X, 8 * SIZE, X + nop + + FMUL S, a2, c8 + STF t4, [Y - 1 * SIZE] + FSUB c3, c4, t2 + nop + + bg,pt %icc, .LL11 + LDF [X + 9 * SIZE], a2 + + +.LL12: + FMUL C, a3, c1 + FMUL S, b3, c2 + STF t1, [X + 0 * SIZE] + FADD c5, c6, t3 + + FMUL C, b3, c3 + FMUL S, a3, c4 + STF t2, [Y + 0 * SIZE] + FSUB c7, c8, t4 + + + FMUL C, a4, c5 + FMUL S, b4, c6 + STF t3, [X + 1 * SIZE] + FADD c1, c2, t1 + + FMUL C, b4, c7 + FMUL S, a4, c8 + STF t4, [Y + 1 * SIZE] + FSUB c3, c4, t2 + + + FMUL C, a5, c1 + FMUL S, b5, c2 + STF t1, [X + 2 * SIZE] + FADD c5, c6, t3 + + FMUL C, b5, c3 + FMUL S, a5, c4 + STF t2, [Y + 2 * SIZE] + FSUB c7, c8, t4 + + FMUL C, a6, c5 + FMUL S, b6, c6 + STF t3, [X + 3 * SIZE] + FADD c1, c2, t1 + + FMUL C, b6, c7 + FMUL S, a6, c8 + STF t4, [Y + 3 * SIZE] + FSUB c3, c4, t2 + + FMUL C, a7, c1 + FMUL S, b7, c2 + STF t1, [X + 4 * SIZE] + FADD c5, c6, t3 + + FMUL C, b7, c3 + FMUL S, a7, c4 + STF t2, [Y + 4 * SIZE] + FSUB c7, c8, t4 + + FMUL C, a8, c5 + FMUL S, b8, c6 + STF t3, [X + 5 * SIZE] + FADD c1, c2, t1 + + FMUL C, b8, c7 + FMUL S, a8, c8 + STF t4, [Y + 5 * SIZE] + FSUB c3, c4, t2 + + FADD c5, c6, t3 + STF t1, [X + 6 * SIZE] + + FSUB c7, c8, t4 + STF t2, [Y + 6 * SIZE] + + STF t3, [X + 7 * SIZE] + STF t4, [Y + 7 * SIZE] + + add X, 8 * SIZE, X + add Y, 8 * SIZE, Y + + +.LL15: + andcc N, 7, I + nop + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + add X, 1 * SIZE, X + LDF [Y + 0 * SIZE], b1 + add Y, 1 * SIZE, Y + + FMUL C, a1, c1 + FMUL S, b1, c2 + FMUL C, b1, c3 + FMUL S, a1, c4 + + FADD c1, c2, c2 + addcc I, -1, I + FSUB c3, c4, c4 + nop + + STF c2, [X - 1 * SIZE] + STF c4, [Y - 1 * SIZE] + bg,pt %icc, .LL16 + nop + +.LL19: + return %i7 + 8 + nop + +.LL50: + mov X, XX + mov Y, YY + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + +.LL51: + LDF [X + 0 * SIZE], a1 + add X, INCX, X + LDF [Y + 0 * SIZE], b1 + add Y, INCY, Y + LDF [X + 0 * SIZE], a2 + add X, INCX, X + LDF [Y + 0 * SIZE], b2 + add Y, INCY, Y + LDF [X + 0 * SIZE], a3 + add X, INCX, X + LDF [Y + 0 * SIZE], b3 + add Y, INCY, Y + LDF [X + 0 * SIZE], a4 + add X, INCX, X + LDF [Y + 0 * SIZE], b4 + add Y, INCY, Y + + LDF [X + 0 * SIZE], a5 + add X, INCX, X + LDF [Y + 0 * SIZE], b5 + add Y, INCY, Y + LDF [X + 0 * SIZE], a6 + add X, INCX, X + LDF [Y + 0 * SIZE], b6 + add Y, INCY, Y + LDF [X + 0 * SIZE], a7 + add X, INCX, X + LDF [Y + 0 * SIZE], b7 + add Y, INCY, Y + LDF [X + 0 * SIZE], a8 + add X, INCX, X + LDF [Y + 0 * SIZE], b8 + add Y, INCY, Y + + FMUL C, a1, c1 + FMUL S, b1, c2 + FMUL C, b1, c3 + FMUL S, a1, c4 + + FADD c1, c2, t1 + FSUB c3, c4, t2 + + STF t1, [XX + 0 * SIZE] + add XX, INCX, XX + STF t2, [YY + 0 * SIZE] + add YY, INCY, YY + + FMUL C, a2, c5 + FMUL S, b2, c6 + FMUL C, b2, c7 + FMUL S, a2, c8 + + FADD c5, c6, t3 + FSUB c7, c8, t4 + + STF t3, [XX + 0 * SIZE] + add XX, INCX, XX + STF t4, [YY + 0 * SIZE] + add YY, INCY, YY + + FMUL C, a3, c1 + FMUL S, b3, c2 + FMUL C, b3, c3 + FMUL S, a3, c4 + + FADD c1, c2, t1 + FSUB c3, c4, t2 + + STF t1, [XX + 0 * SIZE] + add XX, INCX, XX + STF t2, [YY + 0 * SIZE] + add YY, INCY, YY + + FMUL C, a4, c5 + FMUL S, b4, c6 + FMUL C, b4, c7 + FMUL S, a4, c8 + + FADD c5, c6, t3 + FSUB c7, c8, t4 + + STF t3, [XX + 0 * SIZE] + add XX, INCX, XX + STF t4, [YY + 0 * SIZE] + add YY, INCY, YY + + FMUL C, a5, c1 + FMUL S, b5, c2 + FMUL C, b5, c3 + FMUL S, a5, c4 + + FADD c1, c2, t1 + FSUB c3, c4, t2 + + STF t1, [XX + 0 * SIZE] + add XX, INCX, XX + STF t2, [YY + 0 * SIZE] + add YY, INCY, YY + + FMUL C, a6, c5 + FMUL S, b6, c6 + FMUL C, b6, c7 + FMUL S, a6, c8 + + FADD c5, c6, t3 + FSUB c7, c8, t4 + + STF t3, [XX + 0 * SIZE] + add XX, INCX, XX + STF t4, [YY + 0 * SIZE] + add YY, INCY, YY + + FMUL C, a7, c1 + FMUL S, b7, c2 + FMUL C, b7, c3 + FMUL S, a7, c4 + + FADD c1, c2, t1 + FSUB c3, c4, t2 + + STF t1, [XX + 0 * SIZE] + add XX, INCX, XX + STF t2, [YY + 0 * SIZE] + add YY, INCY, YY + + FMUL C, a8, c5 + FMUL S, b8, c6 + FMUL C, b8, c7 + FMUL S, a8, c8 + + FADD c5, c6, t3 + FSUB c7, c8, t4 + + STF t3, [XX + 0 * SIZE] + add XX, INCX, XX + STF t4, [YY + 0 * SIZE] + add YY, INCY, YY + + addcc I, -1, I + bg,pt %icc, .LL51 + nop + + +.LL55: + andcc N, 7, I + nop + ble %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + + FMUL C, a1, c1 + FMUL S, b1, c2 + FMUL C, b1, c3 + FMUL S, a1, c4 + + FADD c1, c2, c2 + FSUB c3, c4, c4 + + STF c2, [X + 0 * SIZE] + add X, INCX, X + STF c4, [Y + 0 * SIZE] + addcc I, -1, I + + bg %icc, .LL56 + add Y, INCY, Y + + +.LL59: + return %i7 + 8 + nop + + EPILOGUE diff --git a/kernel/sparc/scal.S b/kernel/sparc/scal.S new file mode 100644 index 0000000..1414a09 --- /dev/null +++ b/kernel/sparc/scal.S @@ -0,0 +1,398 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#if defined(DOUBLE) && !defined(__64BIT__) +#define X %i5 +#define INCX %i1 +#else +#define X %i4 +#define INCX %i5 +#endif + +#define I %i2 +#define XX %i3 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define c3 %f4 +#define c4 %f6 +#define c5 %f8 +#define c6 %f10 +#define c7 %f12 +#define c8 %f14 + +#define t1 %f16 +#define t2 %f18 +#define t3 %f20 +#define t4 %f22 +#define t5 %f24 +#define t6 %f26 +#define t7 %f28 +#define t8 %f30 + +#define FZERO %f60 +#define ALPHA %f62 +#else +#define c1 %f0 +#define c2 %f1 +#define c3 %f2 +#define c4 %f3 +#define c5 %f4 +#define c6 %f5 +#define c7 %f6 +#define c8 %f7 + +#define t1 %f8 +#define t2 %f9 +#define t3 %f10 +#define t4 %f11 +#define t5 %f12 +#define t6 %f13 +#define t7 %f14 +#define t8 %f15 + +#define FZERO %f29 +#define ALPHA %f30 +#endif + +#define PREFETCHSIZE 168 + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE + st %i3, [%sp + STACK_START + 16] + st %i4, [%sp + STACK_START + 20] + ld [%sp + STACK_START + 28], INCX +#else + st %i3, [%sp + STACK_START + 16] +#endif + + LDF [%sp + STACK_START + 16], ALPHA +#else +#ifdef DOUBLE + FMOV %f6, ALPHA +#else + FMOV %f7, ALPHA +#endif +#endif + + FCLR(29) + + FCMP ALPHA, FZERO + fbne .LL100 + sll INCX, BASE_SHIFT, INCX + + cmp INCX, SIZE + bne .LL50 + nop + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + +.LL11: + prefetch [X + PREFETCHSIZE * SIZE], 0 + + STF FZERO, [X + 0 * SIZE] + add I, -1, I + STF FZERO, [X + 1 * SIZE] + cmp I, 0 + STF FZERO, [X + 2 * SIZE] + STF FZERO, [X + 3 * SIZE] + STF FZERO, [X + 4 * SIZE] + STF FZERO, [X + 5 * SIZE] + add X, 8 * SIZE, X + STF FZERO, [X - 2 * SIZE] + bg,pt %icc, .LL11 + STF FZERO, [X - 1 * SIZE] + +.LL15: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + STF FZERO, [X + 0 * SIZE] + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL16 + add X, 1 * SIZE, X + +.LL19: + return %i7 + 8 + clr %o0 + +.LL50: + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + +.LL51: + STF FZERO, [X + 0 * SIZE] + add X, INCX, X + add I, -1, I + STF FZERO, [X + 0 * SIZE] + add X, INCX, X + cmp I, 0 + STF FZERO, [X + 0 * SIZE] + add X, INCX, X + STF FZERO, [X + 0 * SIZE] + add X, INCX, X + STF FZERO, [X + 0 * SIZE] + add X, INCX, X + STF FZERO, [X + 0 * SIZE] + add X, INCX, X + STF FZERO, [X + 0 * SIZE] + add X, INCX, X + STF FZERO, [X + 0 * SIZE] + bg,pt %icc, .LL51 + add X, INCX, X + +.LL55: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + STF FZERO, [X + 0 * SIZE] + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + return %i7 + 8 + clr %o0 + +.LL100: + cmp INCX, SIZE + bne .LL150 + sra N, 3, I + + cmp I, 0 + ble,pn %icc, .LL115 + nop + + LDF [X + 0 * SIZE], c1 + LDF [X + 1 * SIZE], c2 + LDF [X + 2 * SIZE], c3 + LDF [X + 3 * SIZE], c4 + LDF [X + 4 * SIZE], c5 + LDF [X + 5 * SIZE], c6 + LDF [X + 6 * SIZE], c7 + LDF [X + 7 * SIZE], c8 + FMUL ALPHA, c1, t1 + LDF [X + 8 * SIZE], c1 + FMUL ALPHA, c2, t2 + LDF [X + 9 * SIZE], c2 + + deccc I + ble,pt %icc, .LL112 + nop + +.LL111: + prefetch [X + PREFETCHSIZE * SIZE], 0 + deccc I + + FMUL ALPHA, c3, t3 + LDF [X + 10 * SIZE], c3 + nop + STF t1, [X + 0 * SIZE] + + FMUL ALPHA, c4, t4 + LDF [X + 11 * SIZE], c4 + nop + STF t2, [X + 1 * SIZE] + + FMUL ALPHA, c5, t5 + LDF [X + 12 * SIZE], c5 + nop + STF t3, [X + 2 * SIZE] + + FMUL ALPHA, c6, t6 + LDF [X + 13 * SIZE], c6 + nop + STF t4, [X + 3 * SIZE] + + FMUL ALPHA, c7, t7 + LDF [X + 14 * SIZE], c7 + nop + STF t5, [X + 4 * SIZE] + + FMUL ALPHA, c8, t8 + LDF [X + 15 * SIZE], c8 + nop + STF t6, [X + 5 * SIZE] + + FMUL ALPHA, c1, t1 + STF t7, [X + 6 * SIZE] + nop + LDF [X + 16 * SIZE], c1 + + FMUL ALPHA, c2, t2 + STF t8, [X + 7 * SIZE] + nop + LDF [X + 17 * SIZE], c2 + + bg,pt %icc, .LL111 + add X, 8 * SIZE, X + +.LL112: + FMUL ALPHA, c3, t3 + STF t1, [X + 0 * SIZE] + FMUL ALPHA, c4, t4 + STF t2, [X + 1 * SIZE] + + FMUL ALPHA, c5, t5 + STF t3, [X + 2 * SIZE] + FMUL ALPHA, c6, t6 + STF t4, [X + 3 * SIZE] + + FMUL ALPHA, c7, t7 + STF t5, [X + 4 * SIZE] + FMUL ALPHA, c8, t8 + STF t6, [X + 5 * SIZE] + STF t7, [X + 6 * SIZE] + STF t8, [X + 7 * SIZE] + + add X, 8 * SIZE, X + +.LL115: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL119 + nop + +.LL116: + LDF [X + 0 * SIZE], c1 + add I, -1, I + FMUL ALPHA, c1, c1 + cmp I, 0 + STF c1, [X + 0 * SIZE] + bg,pt %icc, .LL116 + add X, 1 * SIZE, X + +.LL119: + return %i7 + 8 + clr %o0 + +.LL150: + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL155 + mov X, XX + +.LL151: + LDF [X + 0 * SIZE], c1 + add X, INCX, X + LDF [X + 0 * SIZE], c2 + add X, INCX, X + LDF [X + 0 * SIZE], c3 + add X, INCX, X + LDF [X + 0 * SIZE], c4 + add X, INCX, X + LDF [X + 0 * SIZE], c5 + FMUL ALPHA, c1, c1 + add X, INCX, X + LDF [X + 0 * SIZE], c6 + FMUL ALPHA, c2, c2 + add X, INCX, X + LDF [X + 0 * SIZE], c7 + FMUL ALPHA, c3, c3 + add X, INCX, X + LDF [X + 0 * SIZE], c8 + FMUL ALPHA, c4, c4 + add X, INCX, X + + STF c1, [XX + 0 * SIZE] + FMUL ALPHA, c5, c5 + add XX, INCX, XX + STF c2, [XX + 0 * SIZE] + FMUL ALPHA, c6, c6 + add XX, INCX, XX + STF c3, [XX + 0 * SIZE] + FMUL ALPHA, c7, c7 + add XX, INCX, XX + STF c4, [XX + 0 * SIZE] + FMUL ALPHA, c8, c8 + add XX, INCX, XX + STF c5, [XX + 0 * SIZE] + add XX, INCX, XX + add I, -1, I + STF c6, [XX + 0 * SIZE] + add XX, INCX, XX + cmp I, 0 + STF c7, [XX + 0 * SIZE] + add XX, INCX, XX + STF c8, [XX + 0 * SIZE] + + bg,pt %icc, .LL151 + add XX, INCX, XX + +.LL155: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL159 + nop + +.LL156: + LDF [X + 0 * SIZE], c1 + add I, -1, I + FMUL ALPHA, c1, c1 + cmp I, 0 + STF c1, [X + 0 * SIZE] + bg,pt %icc, .LL156 + add X, INCX, X + +.LL159: + return %i7 + 8 + clr %o0 + + + EPILOGUE diff --git a/kernel/sparc/snrm2.S b/kernel/sparc/snrm2.S new file mode 100644 index 0000000..a802472 --- /dev/null +++ b/kernel/sparc/snrm2.S @@ -0,0 +1,334 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 + +#define c1 %f0 +#define c2 %f2 +#define c3 %f4 +#define c4 %f6 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 + + PROLOGUE + SAVESP + + FCLR(0) + + FMOV c1, c2 + FMOV c1, c3 + FMOV c1, c4 + FMOV c1, t1 + FMOV c1, t2 + FMOV c1, t3 + FMOV c1, t4 + + cmp INCX, 0 + ble .LL20 + sll INCX, BASE_SHIFT, INCX + + cmp N, 0 + ble .LL20 + nop + + cmp INCX, SIZE + bne .LL50 + nop + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + ld [X + 0 * SIZE], a1 + add I, -1, I + ld [X + 1 * SIZE], a2 + cmp I, 0 + ld [X + 2 * SIZE], a3 + ld [X + 3 * SIZE], a4 + ld [X + 4 * SIZE], a5 + ld [X + 5 * SIZE], a6 + ld [X + 6 * SIZE], a7 + ld [X + 7 * SIZE], a8 + + ble,pt %icc, .LL12 + add X, 8 * SIZE, X + +#define PREFETCHSIZE 40 + +.LL11: + faddd c1, t1, c1 + fsmuld a1, a1, t1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + + faddd c2, t2, c2 + add I, -1, I + fsmuld a2, a2, t2 + ld [X + 0 * SIZE], a1 + + faddd c3, t3, c3 + cmp I, 0 + fsmuld a3, a3, t3 + ld [X + 1 * SIZE], a2 + + faddd c4, t4, c4 + fsmuld a4, a4, t4 + ld [X + 2 * SIZE], a3 + + faddd c1, t1, c1 + fsmuld a5, a5, t1 + ld [X + 3 * SIZE], a4 + + faddd c2, t2, c2 + fsmuld a6, a6, t2 + ld [X + 4 * SIZE], a5 + + faddd c3, t3, c3 + fsmuld a7, a7, t3 + ld [X + 5 * SIZE], a6 + + faddd c4, t4, c4 + ld [X + 6 * SIZE], a7 + fsmuld a8, a8, t4 + add X, 8 * SIZE, X + + bg,pt %icc, .LL11 + ld [X - 1 * SIZE], a8 + +.LL12: + faddd c1, t1, c1 + fsmuld a1, a1, t1 + faddd c2, t2, c2 + fsmuld a2, a2, t2 + + faddd c3, t3, c3 + fsmuld a3, a3, t3 + faddd c4, t4, c4 + fsmuld a4, a4, t4 + + faddd c1, t1, c1 + fsmuld a5, a5, t1 + faddd c2, t2, c2 + fsmuld a6, a6, t2 + + faddd c3, t3, c3 + fsmuld a7, a7, t3 + faddd c4, t4, c4 + fsmuld a8, a8, t4 + +.LL15: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + ld [X + 0 * SIZE], a1 + + add I, -1, I + cmp I, 0 + faddd c1, t1, c1 + fsmuld a1, a1, t1 + + bg,pt %icc, .LL16 + add X, 1 * SIZE, X + +.LL19: + faddd c1, t1, c1 + faddd c2, t2, c2 + faddd c3, t3, c3 + faddd c4, t4, c4 + + faddd c1, c2, c1 + faddd c3, c4, c3 + faddd c1, c3, c1 + + fsqrtd c1, c1 + +#if !defined(NEED_F2CCONV) || !defined(F_INTERFACE_F2C) + fdtos c1, c1 +#endif +.LL20: + + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + ld [X + 0 * SIZE], a1 + add X, INCX, X + ld [X + 0 * SIZE], a2 + add X, INCX, X + ld [X + 0 * SIZE], a3 + add X, INCX, X + ld [X + 0 * SIZE], a4 + add X, INCX, X + ld [X + 0 * SIZE], a5 + add X, INCX, X + ld [X + 0 * SIZE], a6 + add X, INCX, X + add I, -1, I + ld [X + 0 * SIZE], a7 + cmp I, 0 + add X, INCX, X + ld [X + 0 * SIZE], a8 + + ble,pt %icc, .LL52 + add X, INCX, X + +.LL51: + faddd c1, t1, c1 + add I, -1, I + fsmuld a1, a1, t1 + ld [X + 0 * SIZE], a1 + add X, INCX, X + + faddd c2, t2, c2 + cmp I, 0 + fsmuld a2, a2, t2 + ld [X + 0 * SIZE], a2 + add X, INCX, X + + faddd c3, t3, c3 + fsmuld a3, a3, t3 + ld [X + 0 * SIZE], a3 + add X, INCX, X + + faddd c4, t4, c4 + fsmuld a4, a4, t4 + ld [X + 0 * SIZE], a4 + add X, INCX, X + + faddd c1, t1, c1 + fsmuld a5, a5, t1 + ld [X + 0 * SIZE], a5 + add X, INCX, X + + faddd c2, t2, c2 + fsmuld a6, a6, t2 + ld [X + 0 * SIZE], a6 + add X, INCX, X + + faddd c3, t3, c3 + fsmuld a7, a7, t3 + ld [X + 0 * SIZE], a7 + add X, INCX, X + + faddd c4, t4, c4 + fsmuld a8, a8, t4 + ld [X + 0 * SIZE], a8 + bg,pt %icc, .LL51 + add X, INCX, X + +.LL52: + faddd c1, t1, c1 + fsmuld a1, a1, t1 + faddd c2, t2, c2 + fsmuld a2, a2, t2 + + faddd c3, t3, c3 + fsmuld a3, a3, t3 + faddd c4, t4, c4 + fsmuld a4, a4, t4 + + faddd c1, t1, c1 + fsmuld a5, a5, t1 + faddd c2, t2, c2 + fsmuld a6, a6, t2 + + faddd c3, t3, c3 + fsmuld a7, a7, t3 + faddd c4, t4, c4 + fsmuld a8, a8, t4 + +.LL55: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + ld [X + 0 * SIZE], a1 + add I, -1, I + cmp I, 0 + faddd c1, t1, c1 + fsmuld a1, a1, t1 + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + faddd c1, t1, c1 + faddd c2, t2, c2 + faddd c3, t3, c3 + faddd c4, t4, c4 + + faddd c1, c2, c1 + faddd c3, c4, c3 + faddd c1, c3, c1 + + fsqrtd c1, c1 + +#if !defined(NEED_F2CCONV) || !defined(F_INTERFACE_F2C) + fdtos c1, c1 +#endif + + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/staticbuffer.S b/kernel/sparc/staticbuffer.S new file mode 100644 index 0000000..679ad56 --- /dev/null +++ b/kernel/sparc/staticbuffer.S @@ -0,0 +1,45 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ALLOC_STATIC + .align 256 + .comm alloc_area, (NUM_BUFFERS * BUFFER_SIZE), 4096 +#endif diff --git a/kernel/sparc/swap.S b/kernel/sparc/swap.S new file mode 100644 index 0000000..1d7950c --- /dev/null +++ b/kernel/sparc/swap.S @@ -0,0 +1,346 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(DOUBLE) && !defined(__64BIT__) +#define N %i0 +#define X %i5 +#define INCX %i1 +#define Y %i2 +#define INCY %i3 +#define I %i4 +#else +#define N %i0 +#define X %i4 +#define INCX %i5 +#define Y %i1 +#define INCY %i2 +#define I %i3 +#endif + +#define XX %l0 +#define YY %l1 + +#ifdef DOUBLE +#define a1 %f0 +#define a2 %f2 +#define a3 %f4 +#define a4 %f6 +#define a5 %f8 +#define a6 %f10 +#define a7 %f12 +#define a8 %f14 +#define b1 %f16 +#define b2 %f18 +#define b3 %f20 +#define b4 %f22 +#define b5 %f24 +#define b6 %f26 +#define b7 %f28 +#define b8 %f30 +#else +#define a1 %f0 +#define a2 %f1 +#define a3 %f2 +#define a4 %f3 +#define a5 %f4 +#define a6 %f5 +#define a7 %f6 +#define a8 %f7 +#define b1 %f8 +#define b2 %f9 +#define b3 %f10 +#define b4 %f11 +#define b5 %f12 +#define b6 %f13 +#define b7 %f14 +#define b8 %f15 +#endif + +#ifdef DOUBLE +#define PREFETCHSIZE 128 +#else +#define PREFETCHSIZE 256 +#endif + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE + ld [%sp + STACK_START + 28], INCX + ld [%sp + STACK_START + 32], Y + ld [%sp + STACK_START + 36], INCY +#else + ld [%sp+ STACK_START + 28], Y + ld [%sp+ STACK_START + 32], INCY +#endif +#else + ldx [%sp+ STACK_START + 56], Y + ldx [%sp+ STACK_START + 64], INCY +#endif + + sll INCX, BASE_SHIFT, INCX + sll INCY, BASE_SHIFT, INCY + + cmp INCX, SIZE + bne .LL50 + nop + cmp INCY, SIZE + bne .LL50 + nop + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + LDF [X + 1 * SIZE], a2 + LDF [Y + 1 * SIZE], b2 + LDF [X + 2 * SIZE], a3 + LDF [Y + 2 * SIZE], b3 + LDF [X + 3 * SIZE], a4 + LDF [Y + 3 * SIZE], b4 + LDF [X + 4 * SIZE], a5 + LDF [Y + 4 * SIZE], b5 + LDF [X + 5 * SIZE], a6 + LDF [Y + 5 * SIZE], b6 + LDF [X + 6 * SIZE], a7 + LDF [Y + 6 * SIZE], b7 + LDF [X + 7 * SIZE], a8 + LDF [Y + 7 * SIZE], b8 + + deccc I + ble,pn %icc, .LL12 + nop + +.LL11: + prefetch [X + PREFETCHSIZE * SIZE], 0 + deccc I + + STF a1, [Y + 0 * SIZE] + LDF [X + 8 * SIZE], a1 + STF b1, [X + 0 * SIZE] + LDF [Y + 8 * SIZE], b1 + + STF a2, [Y + 1 * SIZE] + LDF [X + 9 * SIZE], a2 + STF b2, [X + 1 * SIZE] + LDF [Y + 9 * SIZE], b2 + + STF a3, [Y + 2 * SIZE] + LDF [X + 10 * SIZE], a3 + STF b3, [X + 2 * SIZE] + LDF [Y + 10 * SIZE], b3 + + STF a4, [Y + 3 * SIZE] + LDF [X + 11 * SIZE], a4 + STF b4, [X + 3 * SIZE] + LDF [Y + 11 * SIZE], b4 + + prefetch [Y + PREFETCHSIZE * SIZE], 0 + add X, 8 * SIZE, X + + STF a5, [Y + 4 * SIZE] + LDF [X + 4 * SIZE], a5 + STF b5, [X - 4 * SIZE] + LDF [Y + 12 * SIZE], b5 + + STF a6, [Y + 5 * SIZE] + LDF [X + 5 * SIZE], a6 + STF b6, [X - 3 * SIZE] + LDF [Y + 13 * SIZE], b6 + + STF a7, [Y + 6 * SIZE] + LDF [X + 6 * SIZE], a7 + STF b7, [X - 2 * SIZE] + LDF [Y + 14 * SIZE], b7 + + STF a8, [Y + 7 * SIZE] + LDF [X + 7 * SIZE], a8 + STF b8, [X - 1 * SIZE] + LDF [Y + 15 * SIZE], b8 + + bg,pt %icc, .LL11 + add Y, 8 * SIZE, Y + +.LL12: + STF a1, [Y + 0 * SIZE] + STF b1, [X + 0 * SIZE] + STF a2, [Y + 1 * SIZE] + STF b2, [X + 1 * SIZE] + STF a3, [Y + 2 * SIZE] + STF b3, [X + 2 * SIZE] + STF a4, [Y + 3 * SIZE] + STF b4, [X + 3 * SIZE] + STF a5, [Y + 4 * SIZE] + STF b5, [X + 4 * SIZE] + STF a6, [Y + 5 * SIZE] + STF b6, [X + 5 * SIZE] + STF a7, [Y + 6 * SIZE] + STF b7, [X + 6 * SIZE] + STF a8, [Y + 7 * SIZE] + STF b8, [X + 7 * SIZE] + add X, 8 * SIZE, X + add Y, 8 * SIZE, Y + +.LL15: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + add I, -1, I + LDF [Y + 0 * SIZE], b1 + cmp I, 0 + STF a1, [Y + 0 * SIZE] + add Y, 1 * SIZE, Y + STF b1, [X + 0 * SIZE] + bg,pt %icc, .LL16 + add X, 1 * SIZE, X + +.LL19: + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 3, I + mov X, XX + cmp I, 0 + ble,pn %icc, .LL55 + mov Y, YY + +.LL51: + LDF [X + 0 * SIZE], a1 + add X, INCX, X + LDF [Y + 0 * SIZE], b1 + add Y, INCY, Y + LDF [X + 0 * SIZE], a2 + add X, INCX, X + LDF [Y + 0 * SIZE], b2 + add Y, INCY, Y + LDF [X + 0 * SIZE], a3 + add X, INCX, X + LDF [Y + 0 * SIZE], b3 + add Y, INCY, Y + LDF [X + 0 * SIZE], a4 + add X, INCX, X + LDF [Y + 0 * SIZE], b4 + add Y, INCY, Y + LDF [X + 0 * SIZE], a5 + add X, INCX, X + LDF [Y + 0 * SIZE], b5 + add Y, INCY, Y + LDF [X + 0 * SIZE], a6 + add X, INCX, X + LDF [Y + 0 * SIZE], b6 + add Y, INCY, Y + LDF [X + 0 * SIZE], a7 + add X, INCX, X + LDF [Y + 0 * SIZE], b7 + add Y, INCY, Y + LDF [X + 0 * SIZE], a8 + add X, INCX, X + LDF [Y + 0 * SIZE], b8 + add Y, INCY, Y + + STF a1, [YY + 0 * SIZE] + add I, -1, I + add YY, INCY, YY + STF b1, [XX + 0 * SIZE] + cmp I, 0 + add XX, INCX, XX + STF a2, [YY + 0 * SIZE] + add YY, INCY, YY + STF b2, [XX + 0 * SIZE] + add XX, INCX, XX + STF a3, [YY + 0 * SIZE] + add YY, INCY, YY + STF b3, [XX + 0 * SIZE] + add XX, INCX, XX + STF a4, [YY + 0 * SIZE] + add YY, INCY, YY + STF b4, [XX + 0 * SIZE] + add XX, INCX, XX + STF a5, [YY + 0 * SIZE] + add YY, INCY, YY + STF b5, [XX + 0 * SIZE] + add XX, INCX, XX + STF a6, [YY + 0 * SIZE] + add YY, INCY, YY + STF b6, [XX + 0 * SIZE] + add XX, INCX, XX + STF a7, [YY + 0 * SIZE] + add YY, INCY, YY + STF b7, [XX + 0 * SIZE] + add XX, INCX, XX + STF a8, [YY + 0 * SIZE] + add YY, INCY, YY + STF b8, [XX + 0 * SIZE] + + bg,pt %icc, .LL51 + add XX, INCX, XX + +.LL55: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + add I, -1, I + cmp I, 0 + STF b1, [X + 0 * SIZE] + add X, INCX, X + STF a1, [Y + 0 * SIZE] + bg,pt %icc, .LL56 + add Y, INCY, Y + +.LL59: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/trsm_kernel_LN.S b/kernel/sparc/trsm_kernel_LN.S new file mode 100644 index 0000000..4577a30 --- /dev/null +++ b/kernel/sparc/trsm_kernel_LN.S @@ -0,0 +1,4254 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 +#define K %i2 + +#if defined(DOUBLE) && !defined(__64BIT__) +#define A %i5 +#define B %i4 +#else +#define A %i4 +#define B %i5 +#endif + +#define C %o4 +#define LDC %o5 + +#define AO %l0 +#define BO %l1 +#define I %l2 +#define J %l3 +#define L %l4 + +#define C1 %o0 +#define C2 %o1 +#define C3 %o2 +#define C4 %o3 + +#define OFFSET %l5 +#define KK %l6 +#define TEMP1 %l7 +#define TEMP2 %i3 +#define AORIG %g1 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define t1 %f32 +#define t2 %f34 +#define t3 %f36 +#define t4 %f38 + +#define a1 %f40 +#define a2 %f42 +#define a3 %f44 +#define a4 %f46 +#define a5 %f58 + +#define b1 %f48 +#define b2 %f50 +#define b3 %f52 +#define b4 %f54 +#define b5 %f56 + +#define FZERO %f60 +#define ALPHA %f62 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define t1 %f16 +#define t2 %f17 +#define t3 %f18 +#define t4 %f19 + +#define a1 %f20 +#define a2 %f21 +#define a3 %f22 +#define a4 %f23 +#define a5 %f31 + +#define b1 %f24 +#define b2 %f25 +#define b3 %f26 +#define b4 %f27 +#define b5 %f28 + +#define FZERO %f29 +#define ALPHA %f30 +#endif + +#define APREFETCHSIZE 40 +#define BPREFETCHSIZE 40 + +#define APREFETCH_CATEGORY 0 +#define BPREFETCH_CATEGORY 0 + + PROLOGUE + SAVESP + nop + +#ifndef __64BIT__ +#ifdef DOUBLE + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC + ld [%sp + STACK_START + 40], OFFSET +#else + ld [%sp + STACK_START + 28], C + ld [%sp + STACK_START + 32], LDC + ld [%sp + STACK_START + 36], OFFSET +#endif +#else + ldx [%sp+ STACK_START + 56], C + ldx [%sp+ STACK_START + 64], LDC + ldx [%sp+ STACK_START + 72], OFFSET +#endif + + FCLR(29) + + sll LDC, BASE_SHIFT, LDC + +#ifdef LN + smul M, K, TEMP1 + sll TEMP1, BASE_SHIFT, TEMP1 + add A, TEMP1, A + + sll M, BASE_SHIFT, TEMP1 + add C, TEMP1, C +#endif + +#ifdef RN + neg OFFSET, KK +#endif + +#ifdef RT + smul N, K, TEMP1 + sll TEMP1, BASE_SHIFT, TEMP1 + add B, TEMP1, B + + smul N, LDC, TEMP1 + add C, TEMP1, C + + sub N, OFFSET, KK +#endif + + sra N, 2, J + cmp J, 0 + ble,pn %icc, .LL100 + nop + +.LL11: +#ifdef RT + sll K, 2 + BASE_SHIFT, TEMP1 + sub B, TEMP1, B + + sll LDC, 2, TEMP1 + sub C, TEMP1, C +#endif + + mov C, C1 + add C, LDC, C2 + add C2, LDC, C3 + add C3, LDC, C4 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + +#ifndef RT + add C4, LDC, C +#endif + + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL50 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 0 + BASE_SHIFT, TEMP1 + sll KK, 2 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + + ble,pn %icc, .LL75 + nop + +.LL72: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [BO + 4 * SIZE], b1 + + FADD c02, t2, c02 + cmp L, 0 + FMUL a1, b2, t2 + LDF [BO + 5 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a1, b3, t3 + LDF [BO + 6 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a1, b4, t4 + LDF [BO + 7 * SIZE], b4 + LDF [AO + 4 * SIZE], a1 + + FADD c01, t1, c01 + add AO, 4 * SIZE, AO + FMUL a2, b1, t1 + LDF [BO + 8 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a2, b2, t2 + LDF [BO + 9 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a2, b3, t3 + LDF [BO + 10 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a2, b4, t4 + LDF [BO + 11 * SIZE], b4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b1, t1 + LDF [BO + 12 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a3, b2, t2 + LDF [BO + 13 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a3, b3, t3 + LDF [BO + 14 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a3, b4, t4 + LDF [BO + 15 * SIZE], b4 + LDF [AO + 2 * SIZE], a3 + + FADD c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO + 16 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a4, b2, t2 + LDF [BO + 17 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 18 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [BO + 19 * SIZE], b4 + + add BO, 16 * SIZE, BO + bg,pt %icc, .LL72 + LDF [AO + 3 * SIZE], a4 + +.LL75: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL79 + nop + +.LL76: + FADD c01, t1, c01 + add AO, 1 * SIZE, AO + FMUL a1, b1, t1 + LDF [BO + 4 * SIZE], b1 + + FADD c02, t2, c02 + add L, -1, L + FMUL a1, b2, t2 + LDF [BO + 5 * SIZE], b2 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a1, b3, t3 + LDF [BO + 6 * SIZE], b3 + + FADD c04, t4, c04 + add BO, 4 * SIZE, BO + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + bg,pt %icc, .LL76 + LDF [BO + 3 * SIZE], b4 + + +.LL79: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 4, TEMP1 +#endif + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a2, c01, t1 + FSUB c02, t1, c02 + FMUL a3, c01, t1 + FSUB c03, t1, c03 + FMUL a4, c01, t1 + FSUB c04, t1, c04 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c03, t1, c03 + FMUL a3, c02, t1 + FSUB c04, t1, c04 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a2, c03, t1 + FSUB c04, t1, c04 + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c04, c04 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c04, c04 + FMUL a2, c04, t1 + FSUB c03, t1, c03 + FMUL a3, c04, t1 + FSUB c02, t1, c02 + FMUL a4, c04, t1 + FSUB c01, t1, c01 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a2, c03, t1 + FSUB c02, t1, c02 + FMUL a3, c03, t1 + FSUB c01, t1, c01 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c01, t1, c01 + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 + add C3, -1 * SIZE, C3 + add C4, -1 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c03, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C2 + 0 * SIZE] + STF c03, [C3 + 0 * SIZE] + STF c04, [C4 + 0 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 1 * SIZE, C1 + add C2, 1 * SIZE, C2 + add C3, 1 * SIZE, C3 + add C4, 1 * SIZE, C4 +#endif + +#ifdef RT + sll K, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + +.LL50: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL70 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + BASE_SHIFT, TEMP1 + sll KK, 2 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + FMOV FZERO, c02 + FMOV FZERO, t1 + FMOV FZERO, c04 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t2 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c06 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t3 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c08 + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t4 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c01 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c03 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c05 + + ble,pn %icc, .LL55 + FMOV FZERO, c07 + +.LL52: + FADD c02, t1, c02 + add AO, 8 * SIZE, AO + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FMUL a1, b1, t1 + add BO, 16 * SIZE, BO + + FADD c04, t2, c04 + add L, -1, L + FMUL a1, b2, t2 + + FADD c06, t3, c06 + cmp L, 0 + FMUL a1, b3, t3 + + FADD c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO - 4 * SIZE], a1 + + FADD c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 12 * SIZE], b1 + FADD c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 10 * SIZE], b3 + FADD c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + FADD c04, t2, c04 + FMUL a3, b2, t2 + + FADD c06, t3, c06 + FMUL a3, b3, t3 + FADD c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + FADD c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + FADD c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD c02, t1, c02 + FMUL a1, b1, t1 + LDF [AO - 1 * SIZE], a4 + FADD c04, t2, c04 + FMUL a1, b2, t2 + + FADD c06, t3, c06 + FMUL a1, b3, t3 + FADD c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 4 * SIZE], b1 + + FADD c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 2 * SIZE], b3 + FADD c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO + 1 * SIZE], a2 + FADD c04, t2, c04 + FMUL a3, b2, t2 + + FADD c06, t3, c06 + FMUL a3, b3, t3 + FADD c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL52 + LDF [AO + 3 * SIZE], a4 + +.LL55: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + FADD c02, t1, c02 + add AO, 2 * SIZE, AO + FMUL a1, b1, t1 + add L, -1, L + + add BO, 4 * SIZE, BO + FADD c04, t2, c04 + cmp L, 0 + FMUL a1, b2, t2 + + FADD c06, t3, c06 + FMUL a1, b3, t3 + FADD c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL56 + LDF [AO + 1 * SIZE], a2 + +.LL59: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 4, TEMP1 +#endif + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + + FADD c02, t1, c02 + FADD c04, t2, c04 + FADD c06, t3, c06 + FADD c08, t4, c08 + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c02, c02 + FSUB b2, c04, c04 + FSUB b3, c06, c06 + FSUB b4, c08, c08 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + FMUL a1, c06, c06 + FMUL a1, c08, c08 + + FMUL a2, c02, t1 + FMUL a2, c04, t2 + FMUL a2, c06, t3 + FMUL a2, c08, t4 + + FSUB c01, t1, c01 + FSUB c03, t2, c03 + FSUB c05, t3, c05 + FSUB c07, t4, c07 + + FMUL a3, c01, c01 + FMUL a3, c03, c03 + FMUL a3, c05, c05 + FMUL a3, c07, c07 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 + + FMUL a2, c01, t1 + FMUL a2, c03, t2 + FMUL a2, c05, t3 + FMUL a2, c07, t4 + + FSUB c02, t1, c02 + FSUB c04, t2, c04 + FSUB c06, t3, c06 + FSUB c08, t4, c08 + + FMUL a3, c02, c02 + FMUL a3, c04, c04 + FMUL a3, c06, c06 + FMUL a3, c08, c08 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FMUL a2, c01, t1 + FMUL a2, c02, t2 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + + FMUL a4, c01, t1 + FMUL a4, c02, t2 + + FSUB c07, t1, c07 + FSUB c08, t2, c08 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FMUL a2, c03, t1 + FMUL a2, c04, t2 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + + FMUL a3, c03, t1 + FMUL a3, c04, t2 + + FSUB c07, t1, c07 + FSUB c08, t2, c08 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + + FSUB c07, t1, c07 + FSUB c08, t2, c08 + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c07, c07 + FMUL a1, c08, c08 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FMUL a2, c07, t1 + FMUL a2, c08, t2 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + + FMUL a3, c07, t1 + FMUL a3, c08, t2 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + + FMUL a4, c07, t1 + FMUL a4, c08, t2 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + + FMUL a3, c05, t1 + FMUL a3, c06, t2 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FMUL a2, c03, t1 + FMUL a2, c04, t2 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 + add C3, -2 * SIZE, C3 + add C4, -2 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] + + STF c02, [BO + 4 * SIZE] + STF c04, [BO + 5 * SIZE] + STF c06, [BO + 6 * SIZE] + STF c08, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + + STF c05, [C3 + 0 * SIZE] + STF c06, [C3 + 1 * SIZE] + STF c07, [C4 + 0 * SIZE] + STF c08, [C4 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 + add C3, 2 * SIZE, C3 + add C4, 2 * SIZE, C4 +#endif + +#ifdef RT + sll K, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + +.LL70: + sra M, 2, I + cmp I, 0 + ble,pn %icc, .LL99 + nop + +.LL21: + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + + FMOV FZERO, c01 + FMOV FZERO, c02 + FMOV FZERO, c03 + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 2 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 2 + BASE_SHIFT, TEMP1 + + add AORIG, TEMP1, AO + add B, TEMP1, BO + + sub K, KK, TEMP1 + + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c04 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c05 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c06 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c07 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c08 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c09 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c10 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c11 + LDF [BO + 4 * SIZE], b5 /* ***** */ + + LDF [AO + 4 * SIZE], a5 /* ***** */ + +#ifdef LN + prefetch [C1 + 3 * SIZE], 3 + FMOV FZERO, c12 + prefetch [C2 + 3 * SIZE], 3 + FMOV FZERO, c13 + prefetch [C3 + 3 * SIZE], 3 + FMOV FZERO, c14 + prefetch [C4 + 3 * SIZE], 3 + FMOV FZERO, c15 +#else + prefetch [C1 - 3 * SIZE], 3 + FMOV FZERO, c12 + prefetch [C2 - 3 * SIZE], 3 + FMOV FZERO, c13 + prefetch [C3 - 3 * SIZE], 3 + FMOV FZERO, c14 + prefetch [C4 - 3 * SIZE], 3 + FMOV FZERO, c15 +#endif + + ble,pn %icc, .LL25 + FMOV FZERO, c16 + +.LL22: + FADD c04, t1, c04 + prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY + FMUL a1, b1, t1 + nop + + FADD c08, t2, c08 + prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY + FMUL a1, b2, t2 + add AO, 16 * SIZE, AO + + FADD c12, t3, c12 + LDF [AO - 13 * SIZE], a4 + FMUL a1, b3, t3 + add BO, 16 * SIZE, BO + + FADD c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 8 * SIZE], a1 + + FADD c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + add L, -1, L + FMUL a2, b4, t4 + LDF [AO - 11 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 10 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 10 * SIZE], b3 + + FADD c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 9 * SIZE], a4 + + FADD c08, t2, c08 + nop + FMUL a5, b2, t2 + nop + + FADD c12, t3, c12 + nop + FMUL a5, b3, t3 + nop + + FADD c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO - 4 * SIZE], a5 + + FADD c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 6 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b5, t1 + LDF [BO - 4 * SIZE], b5 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + + FADD c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD c04, t1, c04 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD c08, t2, c08 + nop + FMUL a1, b2, t2 + nop + + FADD c12, t3, c12 + nop + FMUL a1, b3, t3 + nop + + FADD c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 0 * SIZE], a1 + + FADD c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + +#ifdef DOUBLE + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY +#else + nop +#endif + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + nop + + FADD c02, t1, c02 + nop + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + + FADD c06, t2, c06 +#ifdef DOUBLE + prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY +#else + nop +#endif + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 0 * SIZE], b1 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 1 * SIZE], a4 + + FADD c08, t2, c08 + FMUL a5, b2, t2 + FADD c12, t3, c12 + FMUL a5, b3, t3 + + FADD c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO + 4 * SIZE], a5 + + FADD c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD c03, t1, c03 + cmp L, 0 + FMUL a4, b5, t1 + LDF [BO + 4 * SIZE], b5 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL22 + LDF [BO + 3 * SIZE], b4 + +.LL25: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL29 + nop + +.LL26: + FADD c04, t1, c04 + LDF [AO + 3 * SIZE], a4 + FMUL a1, b1, t1 + add AO, 4 * SIZE, AO + + FADD c08, t2, c08 + add BO, 4 * SIZE, BO + FMUL a1, b2, t2 + add L, -1, L + + FADD c12, t3, c12 + nop + FMUL a1, b3, t3 + cmp L, 0 + + FADD c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL26 + LDF [BO + 3 * SIZE], b4 + +.LL29: +#if defined(LN) || defined(RT) + sub KK, 4, TEMP1 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + + FADD c04, t1, c04 + FADD c08, t2, c08 + FADD c12, t3, c12 + FADD c16, t4, c16 + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c05, c05 + FSUB a3, c09, c09 + FSUB a4, c13, c13 + + FSUB b1, c02, c02 + FSUB b2, c06, c06 + FSUB b3, c10, c10 + FSUB b4, c14, c14 + + LDF [BO + 8 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 10 * SIZE], a3 + LDF [BO + 11 * SIZE], a4 + + LDF [BO + 12 * SIZE], b1 + LDF [BO + 13 * SIZE], b2 + LDF [BO + 14 * SIZE], b3 + LDF [BO + 15 * SIZE], b4 + + FSUB a1, c03, c03 + FSUB a2, c07, c07 + FSUB a3, c11, c11 + FSUB a4, c15, c15 + + FSUB b1, c04, c04 + FSUB b2, c08, c08 + FSUB b3, c12, c12 + FSUB b4, c16, c16 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 + + LDF [AO + 8 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 10 * SIZE], a3 + LDF [AO + 11 * SIZE], a4 + + LDF [AO + 12 * SIZE], b1 + LDF [AO + 13 * SIZE], b2 + LDF [AO + 14 * SIZE], b3 + LDF [AO + 15 * SIZE], b4 + + FSUB a1, c09, c09 + FSUB a2, c10, c10 + FSUB a3, c11, c11 + FSUB a4, c12, c12 + + FSUB b1, c13, c13 + FSUB b2, c14, c14 + FSUB b3, c15, c15 + FSUB b4, c16, c16 +#endif + +#ifdef LN + LDF [AO + 15 * SIZE], a1 + LDF [AO + 14 * SIZE], a2 + LDF [AO + 13 * SIZE], a3 + LDF [AO + 12 * SIZE], a4 + + FMUL a1, c04, c04 + FMUL a1, c08, c08 + FMUL a1, c12, c12 + FMUL a1, c16, c16 + + FMUL a2, c04, t1 + FMUL a2, c08, t2 + FMUL a2, c12, t3 + FMUL a2, c16, t4 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FSUB c11, t3, c11 + FSUB c15, t4, c15 + + FMUL a3, c04, t1 + FMUL a3, c08, t2 + FMUL a3, c12, t3 + FMUL a3, c16, t4 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FSUB c10, t3, c10 + FSUB c14, t4, c14 + + FMUL a4, c04, t1 + FMUL a4, c08, t2 + FMUL a4, c12, t3 + FMUL a4, c16, t4 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + FSUB c09, t3, c09 + FSUB c13, t4, c13 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 8 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c07, c07 + FMUL a1, c11, c11 + FMUL a1, c15, c15 + + FMUL a2, c03, t1 + FMUL a2, c07, t2 + FMUL a2, c11, t3 + FMUL a2, c15, t4 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FSUB c10, t3, c10 + FSUB c14, t4, c14 + + FMUL a3, c03, t1 + FMUL a3, c07, t2 + FMUL a3, c11, t3 + FMUL a3, c15, t4 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + FSUB c09, t3, c09 + FSUB c13, t4, c13 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 4 * SIZE], a2 + + FMUL a1, c02, c02 + FMUL a1, c06, c06 + FMUL a1, c10, c10 + FMUL a1, c14, c14 + + FMUL a2, c02, t1 + FMUL a2, c06, t2 + FMUL a2, c10, t3 + FMUL a2, c14, t4 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + FSUB c09, t3, c09 + FSUB c13, t4, c13 + + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c05, c05 + FMUL a1, c09, c09 + FMUL a1, c13, c13 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c05, c05 + FMUL a1, c09, c09 + FMUL a1, c13, c13 + + FMUL a2, c01, t1 + FMUL a2, c05, t2 + FMUL a2, c09, t3 + FMUL a2, c13, t4 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FSUB c10, t3, c10 + FSUB c14, t4, c14 + + FMUL a3, c01, t1 + FMUL a3, c05, t2 + FMUL a3, c09, t3 + FMUL a3, c13, t4 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FSUB c11, t3, c11 + FSUB c15, t4, c15 + + FMUL a4, c01, t1 + FMUL a4, c05, t2 + FMUL a4, c09, t3 + FMUL a4, c13, t4 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + FSUB c12, t3, c12 + FSUB c16, t4, c16 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 6 * SIZE], a2 + LDF [AO + 7 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c06, c06 + FMUL a1, c10, c10 + FMUL a1, c14, c14 + + FMUL a2, c02, t1 + FMUL a2, c06, t2 + FMUL a2, c10, t3 + FMUL a2, c14, t4 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FSUB c11, t3, c11 + FSUB c15, t4, c15 + + FMUL a3, c02, t1 + FMUL a3, c06, t2 + FMUL a3, c10, t3 + FMUL a3, c14, t4 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + FSUB c12, t3, c12 + FSUB c16, t4, c16 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 11 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a1, c07, c07 + FMUL a1, c11, c11 + FMUL a1, c15, c15 + + FMUL a2, c03, t1 + FMUL a2, c07, t2 + FMUL a2, c11, t3 + FMUL a2, c15, t4 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + FSUB c12, t3, c12 + FSUB c16, t4, c16 + + LDF [AO + 15 * SIZE], a1 + + FMUL a1, c04, c04 + FMUL a1, c08, c08 + FMUL a1, c12, c12 + FMUL a1, c16, c16 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FMUL a2, c01, t1 + FMUL a2, c02, t2 + FMUL a2, c03, t3 + FMUL a2, c04, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FSUB c07, t3, c07 + FSUB c08, t4, c08 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a3, c03, t3 + FMUL a3, c04, t4 + + FSUB c09, t1, c09 + FSUB c10, t2, c10 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FMUL a4, c01, t1 + FMUL a4, c02, t2 + FMUL a4, c03, t3 + FMUL a4, c04, t4 + + FSUB c13, t1, c13 + FSUB c14, t2, c14 + FSUB c15, t3, c15 + FSUB c16, t4, c16 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + FMUL a2, c07, t3 + FMUL a2, c08, t4 + + FSUB c09, t1, c09 + FSUB c10, t2, c10 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FMUL a3, c05, t1 + FMUL a3, c06, t2 + FMUL a3, c07, t3 + FMUL a3, c08, t4 + + FSUB c13, t1, c13 + FSUB c14, t2, c14 + FSUB c15, t3, c15 + FSUB c16, t4, c16 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c09, c09 + FMUL a1, c10, c10 + FMUL a1, c11, c11 + FMUL a1, c12, c12 + + FMUL a2, c09, t1 + FMUL a2, c10, t2 + FMUL a2, c11, t3 + FMUL a2, c12, t4 + + FSUB c13, t1, c13 + FSUB c14, t2, c14 + FSUB c15, t3, c15 + FSUB c16, t4, c16 + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c13, c13 + FMUL a1, c14, c14 + FMUL a1, c15, c15 + FMUL a1, c16, c16 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c13, c13 + FMUL a1, c14, c14 + FMUL a1, c15, c15 + FMUL a1, c16, c16 + + FMUL a2, c13, t1 + FMUL a2, c14, t2 + FMUL a2, c15, t3 + FMUL a2, c16, t4 + + FSUB c09, t1, c09 + FSUB c10, t2, c10 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FMUL a3, c13, t1 + FMUL a3, c14, t2 + FMUL a3, c15, t3 + FMUL a3, c16, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FSUB c07, t3, c07 + FSUB c08, t4, c08 + + FMUL a4, c13, t1 + FMUL a4, c14, t2 + FMUL a4, c15, t3 + FMUL a4, c16, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c09, c09 + FMUL a1, c10, c10 + FMUL a1, c11, c11 + FMUL a1, c12, c12 + + FMUL a2, c09, t1 + FMUL a2, c10, t2 + FMUL a2, c11, t3 + FMUL a2, c12, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FSUB c07, t3, c07 + FSUB c08, t4, c08 + + FMUL a3, c09, t1 + FMUL a3, c10, t2 + FMUL a3, c11, t3 + FMUL a3, c12, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + FMUL a2, c07, t3 + FMUL a2, c08, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 + add C2, -4 * SIZE, C2 + add C3, -4 * SIZE, C3 + add C4, -4 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c05, [BO + 1 * SIZE] + STF c09, [BO + 2 * SIZE] + STF c13, [BO + 3 * SIZE] + + STF c02, [BO + 4 * SIZE] + STF c06, [BO + 5 * SIZE] + STF c10, [BO + 6 * SIZE] + STF c14, [BO + 7 * SIZE] + + STF c03, [BO + 8 * SIZE] + STF c07, [BO + 9 * SIZE] + STF c11, [BO + 10 * SIZE] + STF c15, [BO + 11 * SIZE] + + STF c04, [BO + 12 * SIZE] + STF c08, [BO + 13 * SIZE] + STF c12, [BO + 14 * SIZE] + STF c16, [BO + 15 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] + + STF c09, [AO + 8 * SIZE] + STF c10, [AO + 9 * SIZE] + STF c11, [AO + 10 * SIZE] + STF c12, [AO + 11 * SIZE] + + STF c13, [AO + 12 * SIZE] + STF c14, [AO + 13 * SIZE] + STF c15, [AO + 14 * SIZE] + STF c16, [AO + 15 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + STF c07, [C2 + 2 * SIZE] + STF c08, [C2 + 3 * SIZE] + + STF c09, [C3 + 0 * SIZE] + STF c10, [C3 + 1 * SIZE] + STF c11, [C3 + 2 * SIZE] + STF c12, [C3 + 3 * SIZE] + + STF c13, [C4 + 0 * SIZE] + STF c14, [C4 + 1 * SIZE] + STF c15, [C4 + 2 * SIZE] + STF c16, [C4 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 4 * SIZE, C1 + add C2, 4 * SIZE, C2 + add C3, 4 * SIZE, C3 + add C4, 4 * SIZE, C4 +#endif + +#ifdef RT + sll K, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 4, KK +#endif + +#ifdef LN + sub KK, 4, KK +#endif + + add I, -1, I + cmp I, 0 + + sra K, 2, L + bg,pt %icc, .LL21 + FMOV FZERO, c01 + + + + + + + +.LL99: +#ifdef LN + sll K, 2 + BASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 4, KK +#endif + +#ifdef RT + sub KK, 4, KK +#endif + + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + +.LL100: /* n & 2 */ + and N, 2, J + cmp J, 0 + ble,pn %icc, .LL200 + nop + +#ifdef RT + sll K, 1 + BASE_SHIFT, TEMP1 + sub B, TEMP1, B + + sll LDC, 1, TEMP1 + sub C, TEMP1, C +#endif + + mov C, C1 + add C, LDC, C2 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + +#ifndef RT + add C2, LDC, C +#endif + + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL150 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 0 + BASE_SHIFT, TEMP1 + sll KK, 1 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + + ble,pn %icc, .LL175 + nop + +.LL172: + FADD c01, t1, c01 + add AO, 4 * SIZE, AO + FMUL a1, b1, t1 + LDF [BO + 4 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a1, b2, t2 + LDF [BO + 5 * SIZE], b2 + + add L, -1, L + LDF [AO + 0 * SIZE], a1 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a2, b3, t3 + LDF [BO + 6 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a2, b4, t4 + LDF [BO + 7 * SIZE], b4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b1, t1 + LDF [BO + 8 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a3, b2, t2 + LDF [BO + 9 * SIZE], b2 + LDF [AO + 2 * SIZE], a3 + + FADD c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 10 * SIZE], b3 + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [BO + 11 * SIZE], b4 + add BO, 8 * SIZE, BO + + bg,pt %icc, .LL172 + LDF [AO + 3 * SIZE], a4 + +.LL175: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL179 + nop + +.LL176: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + add AO, 1 * SIZE, AO + LDF [BO + 2 * SIZE], b1 + FADD c02, t2, c02 + cmp L, 0 + FMUL a1, b2, t2 + LDF [BO + 3 * SIZE], b2 + + add BO, 2 * SIZE, BO + bg,pt %icc, .LL176 + LDF [AO + 0 * SIZE], a1 + +.LL179: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FADD c01, c03, c01 + FADD c02, c04, c02 + + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a2, c01, t1 + FSUB c02, t1, c02 + FMUL a3, c02, c02 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + LDF [BO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c01, t1, c01 + FMUL a3, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C2 + 0 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 1 * SIZE, C1 + add C2, 1 * SIZE, C2 +#endif + +#ifdef RT + sll K, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + +.LL150: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL170 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + BASE_SHIFT, TEMP1 + sll KK, 1 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + cmp L, 0 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + ble,pn %icc, .LL155 + nop + +.LL152: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD c02, t2, c02 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a2, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD c04, t4, c04 + nop + FMUL a2, b2, t4 + LDF [AO + 5 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b3, t1 + LDF [BO - 3 * SIZE], b2 + + FADD c02, t2, c02 + nop + FMUL a3, b4, t2 + LDF [AO + 6 * SIZE], a3 + + FADD c03, t3, c03 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD c04, t4, c04 + nop + FMUL a4, b4, t4 + LDF [AO + 7 * SIZE], a4 + + FADD c01, t1, c01 + nop + FMUL a1, b1, t1 + LDF [BO - 1 * SIZE], b4 + + FADD c02, t2, c02 + FMUL a1, b2, t2 + LDF [AO + 8 * SIZE], a1 + + FADD c03, t3, c03 + FMUL a2, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD c04, t4, c04 + FMUL a2, b2, t4 + LDF [AO + 9 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b3, t1 + LDF [BO + 1 * SIZE], b2 + + FADD c02, t2, c02 + FMUL a3, b4, t2 + LDF [AO + 10 * SIZE], a3 + + FADD c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + bg,pt %icc, .LL152 + LDF [BO + 3 * SIZE], b4 + +.LL155: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL159 + nop + +.LL156: + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FMUL a1, b1, t1 + FMUL a1, b2, t2 + FMUL a2, b1, t3 + FMUL a2, b2, t4 + + add AO, 2 * SIZE, AO + add BO, 2 * SIZE, BO + + add L, -1, L + cmp L, 0 + bg,pt %icc, .LL156 + nop + +.LL159: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c02, c02 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + FMUL a2, c03, t1 + FMUL a2, c04, t2 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FMUL a3, c01, c01 + FMUL a3, c02, c02 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FMUL a2, c01, t1 + FMUL a2, c02, t2 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + + FMUL a3, c03, c03 + FMUL a3, c04, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a2, c01, t1 + FMUL a2, c03, t2 + + FSUB c02, t1, c02 + FSUB c04, t2, c04 + FMUL a3, c02, c02 + FMUL a3, c04, c04 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + LDF [BO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + + FMUL a2, c02, t1 + FMUL a2, c04, t2 + FSUB c01, t1, c01 + FSUB c03, t2, c03 + + FMUL a3, c01, c01 + FMUL a3, c03, c03 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c03, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c03, [AO + 1 * SIZE] + STF c02, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c03, [C1 + 1 * SIZE] + STF c02, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 +#endif + +#ifdef RT + sll K, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + +.LL170: + sra M, 2, I + cmp I, 0 + ble,pn %icc, .LL199 + FMOV FZERO, c03 + +.LL121: +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 2 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 2 + BASE_SHIFT, TEMP1 + sll KK, 1 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t1 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c07 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t2 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c04 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t3 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, t4 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c01 + +#ifdef LN + prefetch [C1 - 3 * SIZE], 2 + FMOV FZERO, c05 + prefetch [C2 - 3 * SIZE], 2 + FMOV FZERO, c02 +#else + prefetch [C1 + 3 * SIZE], 2 + FMOV FZERO, c05 + prefetch [C2 + 3 * SIZE], 2 + FMOV FZERO, c02 +#endif + + ble,pn %icc, .LL125 + FMOV FZERO, c06 + +.LL122: + FADD c03, t1, c03 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD c07, t2, c07 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD c04, t3, c04 + add AO, 16 * SIZE, AO + FMUL a2, b1, t3 + cmp L, 0 + + FADD c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 11 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 10 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO - 3 * SIZE], b2 + + FADD c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 9 * SIZE], a4 + + FADD c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO - 8 * SIZE], a1 + + FADD c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO - 6 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD c06, t4, c06 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD c03, t1, c03 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD c07, t2, c07 + nop + FMUL a1, b2, t2 + LDF [AO - 4 * SIZE], a1 + + FADD c04, t3, c04 + nop + FMUL a2, b1, t3 + nop + + FADD c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 3 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 2 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + + FADD c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 1 * SIZE], a4 + + FADD c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO + 0 * SIZE], a1 + + FADD c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO + 2 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c06, t4, c06 + FMUL a4, b4, t4 + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL122 + LDF [BO + 3 * SIZE], b4 + +.LL125: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL129 + nop + +.LL126: + FADD c03, t1, c03 + add AO, 4 * SIZE, AO + FMUL a1, b1, t1 + add BO, 2 * SIZE, BO + + FADD c07, t2, c07 + add L, -1, L + FMUL a1, b2, t2 + LDF [AO + 0 * SIZE], a1 + + FADD c04, t3, c04 + cmp L, 0 + FMUL a2, b1, t3 + + FADD c08, t4, c08 + FMUL a2, b2, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b1, t1 + FADD c05, t2, c05 + FMUL a3, b2, t2 + LDF [AO + 2 * SIZE], a3 + + FADD c02, t3, c02 + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + FADD c06, t4, c06 + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + bg,pt %icc, .LL126 + LDF [AO + 3 * SIZE], a4 + +.LL129: + FADD c03, t1, c03 + FADD c07, t2, c07 + FADD c04, t3, c04 + FADD c08, t4, c08 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 4, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, 2 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c05, c05 + FSUB a3, c02, c02 + FSUB a4, c06, c06 + + FSUB b1, c03, c03 + FSUB b2, c07, c07 + FSUB b3, c04, c04 + FSUB b4, c08, c08 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 +#endif + +#ifdef LN + LDF [AO + 15 * SIZE], a1 + LDF [AO + 14 * SIZE], a2 + LDF [AO + 13 * SIZE], a3 + LDF [AO + 12 * SIZE], a4 + + FMUL a1, c04, c04 + FMUL a1, c08, c08 + FMUL a2, c04, t1 + FMUL a2, c08, t2 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FMUL a3, c04, t1 + FMUL a3, c08, t2 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FMUL a4, c04, t1 + FMUL a4, c08, t2 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 8 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c07, c07 + FMUL a2, c03, t1 + FMUL a2, c07, t2 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FMUL a3, c03, t1 + FMUL a3, c07, t2 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 4 * SIZE], a2 + + FMUL a1, c02, c02 + FMUL a1, c06, c06 + FMUL a2, c02, t1 + FMUL a2, c06, t2 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c05, c05 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c05, c05 + FMUL a2, c01, t1 + FMUL a2, c05, t2 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FMUL a3, c01, t1 + FMUL a3, c05, t2 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FMUL a4, c01, t1 + FMUL a4, c05, t2 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 6 * SIZE], a2 + LDF [AO + 7 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c06, c06 + FMUL a2, c02, t1 + FMUL a2, c06, t2 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FMUL a3, c02, t1 + FMUL a3, c06, t2 + FSUB c04, t1, c04 + FSUB c08, t2, c08 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 11 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a1, c07, c07 + FMUL a2, c03, t1 + FMUL a2, c07, t2 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + + LDF [AO + 15 * SIZE], a1 + + FMUL a1, c04, c04 + FMUL a1, c08, c08 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FMUL a2, c01, t1 + FMUL a2, c02, t2 + FMUL a2, c03, t3 + FMUL a2, c04, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FSUB c07, t3, c07 + FSUB c08, t4, c08 + + FMUL a3, c05, c05 + FMUL a3, c06, c06 + FMUL a3, c07, c07 + FMUL a3, c08, c08 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + LDF [BO + 0 * SIZE], a3 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + FMUL a2, c07, t3 + FMUL a2, c08, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + FMUL a3, c01, c01 + FMUL a3, c02, c02 + FMUL a3, c03, c03 + FMUL a3, c04, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 + add C2, -4 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c05, [BO + 1 * SIZE] + STF c02, [BO + 2 * SIZE] + STF c06, [BO + 3 * SIZE] + + STF c03, [BO + 4 * SIZE] + STF c07, [BO + 5 * SIZE] + STF c04, [BO + 6 * SIZE] + STF c08, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + STF c07, [C2 + 2 * SIZE] + STF c08, [C2 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 4 * SIZE, C1 + add C2, 4 * SIZE, C2 +#endif + +#ifdef RT + sll K, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 2 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 4, KK +#endif + +#ifdef LN + sub KK, 4, KK +#endif + + add I, -1, I + cmp I, 0 + + bg,pt %icc, .LL121 + FMOV FZERO, c03 + +.LL199: +#ifdef LN + sll K, 1 + BASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 2, KK +#endif + +#ifdef RT + sub KK, 2, KK +#endif + +.LL200: + and N, 1, J + + cmp J, 0 + ble,pn %icc, .LL999 + nop + +#ifdef RT + sll K, 0 + BASE_SHIFT, TEMP1 + sub B, TEMP1, B + + sub C, LDC, C +#endif + + mov C, C1 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + +#ifndef RT + add C, LDC, C +#endif + + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL250 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 0 + BASE_SHIFT, TEMP1 + + add AORIG, TEMP1, AO + add B, TEMP1, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t1 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c01 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t2 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c02 + + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t3 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t4 + LDF [BO + 2 * SIZE], b3 + + ble,pn %icc, .LL275 + LDF [BO + 3 * SIZE], b4 + +.LL272: + FADD c01, t1, c01 + add L, -1, L + add AO, 4 * SIZE, AO + + FMUL a1, b1, t1 + add BO, 4 * SIZE, BO + LDF [AO + 0 * SIZE], a1 + + FADD c02, t2, c02 + cmp L, 0 + LDF [BO + 0 * SIZE], b1 + FMUL a2, b2, t2 + + LDF [AO + 1 * SIZE], a2 + FADD c01, t3, c01 + LDF [BO + 1 * SIZE], b2 + FMUL a3, b3, t3 + + LDF [AO + 2 * SIZE], a3 + FADD c02, t4, c02 + LDF [BO + 2 * SIZE], b3 + FMUL a4, b4, t4 + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL272 + LDF [BO + 3 * SIZE], b4 + +.LL275: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL279 + nop + +.LL276: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [AO + 1 * SIZE], a1 + + LDF [BO + 1 * SIZE], b1 + add BO, 1 * SIZE, BO + cmp L, 0 + bg,pt %icc, .LL276 + add AO, 1 * SIZE, AO + +.LL279: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c01, t3, c01 + FADD c02, t4, c02 + + FADD c01, c02, c01 + +#if defined(LN) || defined(RT) + sub KK, 1, TEMP1 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + FSUB a1, c01, c01 +#else + LDF [AO + 0 * SIZE], a1 + FSUB a1, c01, c01 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + FMUL a1, c01, c01 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + FMUL a1, c01, c01 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + FMUL a1, c01, c01 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] +#else + STF c01, [AO + 0 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 1 * SIZE, C1 +#endif + +#ifdef RT + sll K, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + +.LL250: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL270 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + BASE_SHIFT, TEMP1 + sll KK, 0 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + + ble,pn %icc, .LL255 + nop + +.LL252: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [AO + 4 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b1, t2 + LDF [AO + 5 * SIZE], a2 + LDF [BO + 4 * SIZE], b1 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a3, b2, t3 + LDF [AO + 6 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b2, t4 + LDF [AO + 7 * SIZE], a4 + LDF [BO + 5 * SIZE], b2 + + FADD c01, t1, c01 + FMUL a1, b3, t1 + LDF [AO + 8 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b3, t2 + LDF [AO + 9 * SIZE], a2 + LDF [BO + 6 * SIZE], b3 + + FADD c03, t3, c03 + FMUL a3, b4, t3 + LDF [AO + 10 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 11 * SIZE], a4 + add AO, 8 * SIZE, AO + + LDF [BO + 7 * SIZE], b4 + bg,pt %icc, .LL252 + add BO, 4 * SIZE, BO + +.LL255: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + + cmp L, 0 + ble,a,pn %icc, .LL259 + nop + +.LL256: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [AO + 2 * SIZE], a1 + + FADD c02, t2, c02 + cmp L, 0 + FMUL a2, b1, t2 + LDF [AO + 3 * SIZE], a2 + + LDF [BO + 1 * SIZE], b1 + add AO, 2 * SIZE, AO + + bg,pt %icc, .LL256 + add BO, 1 * SIZE, BO + +.LL259: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FADD c01, c03, c01 + FADD c02, c04, c02 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c01, t1, c01 + FMUL a3, c01, c01 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a2, c01, t1 + FSUB c02, t1, c02 + FMUL a3, c02, c02 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 +#endif + +#ifdef RT + sll K, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + +.LL270: + sra M, 2, I + cmp I, 0 + ble,pn %icc, .LL299 + nop + +.LL221: +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 2 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 2 + BASE_SHIFT, TEMP1 + sll KK, 0 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + +#ifdef LN + prefetch [C1 - 3 * SIZE], 2 +#else + prefetch [C1 + 3 * SIZE], 2 +#endif + + ble,pn %icc, .LL225 + prefetch [C1 + 4 * SIZE], 2 + +.LL222: + FADD c01, t1, c01 + add BO, 4 * SIZE, BO + FMUL a1, b1, t1 + LDF [AO + 4 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b1, t2 + LDF [AO + 5 * SIZE], a2 + + FADD c03, t3, c03 + add L, -1, L + FMUL a3, b1, t3 + LDF [AO + 6 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b1, t4 + LDF [AO + 7 * SIZE], a4 + LDF [BO + 0 * SIZE], b1 + + FADD c01, t1, c01 + cmp L, 0 + FMUL a1, b2, t1 + LDF [AO + 8 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b2, t2 + LDF [AO + 9 * SIZE], a2 + + FADD c03, t3, c03 + FMUL a3, b2, t3 + LDF [AO + 10 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b2, t4 + LDF [AO + 11 * SIZE], a4 + LDF [BO + 1 * SIZE], b2 + + FADD c01, t1, c01 + FMUL a1, b3, t1 + LDF [AO + 12 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b3, t2 + LDF [AO + 13 * SIZE], a2 + + FADD c03, t3, c03 + FMUL a3, b3, t3 + LDF [AO + 14 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b3, t4 + LDF [AO + 15 * SIZE], a4 + LDF [BO + 2 * SIZE], b3 + + FADD c01, t1, c01 + FMUL a1, b4, t1 + LDF [AO + 16 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b4, t2 + LDF [AO + 17 * SIZE], a2 + + FADD c03, t3, c03 + FMUL a3, b4, t3 + LDF [AO + 18 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 19 * SIZE], a4 + add AO, 16 * SIZE, AO + + bg,pt %icc, .LL222 + LDF [BO + 3 * SIZE], b4 + +.LL225: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL229 + nop + +.LL226: + FADD c01, t1, c01 + add BO, 1 * SIZE, BO + FMUL a1, b1, t1 + LDF [AO + 4 * SIZE], a1 + + FADD c02, t2, c02 + add L, -1, L + FMUL a2, b1, t2 + LDF [AO + 5 * SIZE], a2 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a3, b1, t3 + LDF [AO + 6 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b1, t4 + LDF [AO + 7 * SIZE], a4 + add AO, 4 * SIZE, AO + + bg,pt %icc, .LL226 + LDF [BO + 0 * SIZE], b1 + +.LL229: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 4, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, 2 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 15 * SIZE], a1 + LDF [AO + 14 * SIZE], a2 + LDF [AO + 13 * SIZE], a3 + LDF [AO + 12 * SIZE], a4 + + FMUL a1, c04, c04 + FMUL a2, c04, t1 + + FSUB c03, t1, c03 + FMUL a3, c04, t1 + + FSUB c02, t1, c02 + FMUL a4, c04, t1 + + FSUB c01, t1, c01 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 8 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a2, c03, t1 + + FSUB c02, t1, c02 + FMUL a3, c03, t1 + FSUB c01, t1, c01 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 4 * SIZE], a2 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c01, t1, c01 + + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a2, c01, t1 + FSUB c02, t1, c02 + FMUL a3, c01, t1 + FSUB c03, t1, c03 + FMUL a4, c01, t1 + FSUB c04, t1, c04 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 6 * SIZE], a2 + LDF [AO + 7 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c03, t1, c03 + FMUL a3, c02, t1 + FSUB c04, t1, c04 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 11 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a2, c03, t1 + + FSUB c04, t1, c04 + + LDF [AO + 15 * SIZE], a1 + + FMUL a1, c04, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c03, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 4 * SIZE, C1 +#endif + +#ifdef RT + sll K, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 2 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 4, KK +#endif + +#ifdef LN + sub KK, 4, KK +#endif + + add I, -1, I + cmp I, 0 + + bg,pt %icc, .LL221 + nop + + + +.LL299: +#ifdef LN + sll K, 0 + BASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 1, KK +#endif + +#ifdef RT + sub KK, 1, KK +#endif + + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/trsm_kernel_LN_2x8.S b/kernel/sparc/trsm_kernel_LN_2x8.S new file mode 100644 index 0000000..a70f0e4 --- /dev/null +++ b/kernel/sparc/trsm_kernel_LN_2x8.S @@ -0,0 +1,3897 @@ +/*********************************************************************/ +/* Copyright 2005-2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define APREFETCHSIZE 24 +#define APREFETCH_CATEGORY 0 + +#define M %i0 +#define N %i1 +#define K %i2 + +#if defined(DOUBLE) && !defined(__64BIT__) +#define A %i5 +#define B %i4 +#else +#define A %i4 +#define B %i5 +#endif + +#define C %o4 +#define LDC %o5 + +#define AO %l0 +#define BO %l1 +#define I %l2 +#define J %l3 +#define L %l4 + +#define C1 %o0 +#define C2 %o1 +#define C3 %o2 +#define C4 %o3 + +#define C5 %l5 +#define C6 %l6 +#define C7 %l7 +#define C8 %i3 + +#define OFFSET %g1 +#define KK %g2 +#define TEMP1 %g3 +#define TEMP2 %g4 +#define AORIG %o7 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define a1 %f32 +#define a2 %f34 +#define a3 %f36 +#define a4 %f38 +#define a5 %f40 + +#define b1 %f42 +#define b2 %f44 +#define b3 %f46 +#define b4 %f48 +#define b5 %f50 +#define b6 %f52 +#define b7 %f54 +#define b8 %f56 +#define b9 %f58 + +#define cc01 0 +#define cc02 2 +#define cc03 4 +#define cc04 6 +#define cc05 8 +#define cc06 10 +#define cc07 12 +#define cc08 14 +#define cc09 16 +#define cc10 18 +#define cc11 20 +#define cc12 22 +#define cc13 24 +#define cc14 26 +#define cc15 28 +#define cc16 30 + +#define aa1 1 +#define aa2 3 +#define aa3 5 +#define aa4 7 +#define aa5 9 + +#define bb1 11 +#define bb2 13 +#define bb3 15 +#define bb4 17 +#define bb5 19 +#define bb6 21 +#define bb7 23 +#define bb8 25 +#define bb9 27 + +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define a1 %f16 +#define a2 %f17 +#define a3 %f18 +#define a4 %f19 +#define a5 %f20 + +#define b1 %f21 +#define b2 %f22 +#define b3 %f23 +#define b4 %f24 +#define b5 %f25 +#define b6 %f26 +#define b7 %f27 +#define b8 %f28 +#define b9 %f29 + +#define cc01 0 +#define cc02 1 +#define cc03 2 +#define cc04 3 +#define cc05 4 +#define cc06 5 +#define cc07 6 +#define cc08 7 +#define cc09 8 +#define cc10 9 +#define cc11 10 +#define cc12 11 +#define cc13 12 +#define cc14 13 +#define cc15 14 +#define cc16 15 + +#define aa1 16 +#define aa2 17 +#define aa3 18 +#define aa4 19 +#define aa5 20 + +#define bb1 21 +#define bb2 22 +#define bb3 23 +#define bb4 24 +#define bb5 25 +#define bb6 26 +#define bb7 27 +#define bb8 28 +#define bb9 29 + +#endif + + .register %g2, #scratch + .register %g3, #scratch + + PROLOGUE + SAVESP + nop + +#ifndef __64BIT__ + +#ifdef DOUBLE + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC + ld [%sp + STACK_START + 40], OFFSET +#else + ld [%sp + STACK_START + 28], C + ld [%sp + STACK_START + 32], LDC + ld [%sp + STACK_START + 36], OFFSET +#endif + + st %g1, [%sp + STACK_START + 8] + st %g2, [%sp + STACK_START + 12] + st %g3, [%sp + STACK_START + 16] + st %g4, [%sp + STACK_START + 20] +#else + + ldx [%sp+ STACK_START + 56], C + ldx [%sp+ STACK_START + 64], LDC + ldx [%sp+ STACK_START + 72], OFFSET + + stx %g1, [%sp + STACK_START + 32] + stx %g2, [%sp + STACK_START + 40] + stx %g3, [%sp + STACK_START + 48] + stx %g4, [%sp + STACK_START + 56] +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg OFFSET, KK +#endif + + sll LDC, BASE_SHIFT, LDC + +#ifdef LN + smul M, K, TEMP1 + sll TEMP1, BASE_SHIFT, TEMP1 + add A, TEMP1, A + + sll M, BASE_SHIFT, TEMP1 + add C, TEMP1, C +#endif + +#ifdef RN + neg OFFSET, KK +#endif + +#ifdef RT + smul N, K, TEMP1 + sll TEMP1, BASE_SHIFT, TEMP1 + add B, TEMP1, B + + smul N, LDC, TEMP1 + add C, TEMP1, C + + sub N, OFFSET, KK +#endif + + sra N, 3, J + cmp J, 0 + ble,pn %icc, .LL30 + nop + .align 4 + +.LL11: +#ifdef RT + sll K, BASE_SHIFT + 3, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C2 + add C2, LDC, C3 + add C3, LDC, C4 + add C4, LDC, C5 + add C5, LDC, C6 + add C6, LDC, C7 + add C7, LDC, C8 + add C8, LDC, C +#else + sub C, LDC, C8 + sub C8, LDC, C7 + sub C7, LDC, C6 + sub C6, LDC, C5 + sub C5, LDC, C4 + sub C4, LDC, C3 + sub C3, LDC, C2 + sub C2, LDC, C1 + sub C2, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL20 + nop + +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 3, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + FCLR (cc01) + LDF [BO + 1 * SIZE], b2 + FCLR (cc03) + LDF [BO + 2 * SIZE], b3 + FCLR (cc05) + LDF [BO + 3 * SIZE], b4 + FCLR (cc07) + LDF [BO + 4 * SIZE], b5 + FCLR (cc09) + LDF [BO + 5 * SIZE], b6 + FCLR (cc11) + LDF [BO + 6 * SIZE], b7 + FCLR (cc13) + LDF [BO + 7 * SIZE], b8 + FCLR (cc15) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL25 + LDF [BO + 8 * SIZE], b9 + .align 4 + +.LL23: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 16 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 10 * SIZE], b3 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [BO + 12 * SIZE], b5 + FMADD (aa1, bb6, cc11, cc11) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 14 * SIZE], b7 + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa2, bb9, cc01, cc01) + LDF [BO + 24 * SIZE], b9 + FMADD (aa2, bb2, cc03, cc03) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa2, bb3, cc05, cc05) + LDF [BO + 18 * SIZE], b3 + FMADD (aa2, bb4, cc07, cc07) + LDF [BO + 19 * SIZE], b4 + + FMADD (aa2, bb5, cc09, cc09) + LDF [BO + 20 * SIZE], b5 + FMADD (aa2, bb6, cc11, cc11) + LDF [BO + 21 * SIZE], b6 + + FMADD (aa2, bb7, cc13, cc13) + LDF [BO + 22 * SIZE], b7 + FMADD (aa2, bb8, cc15, cc15) + LDF [BO + 23 * SIZE], b8 + + LDF [AO + 4 * SIZE], a1 + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb1, cc01, cc01) + LDF [BO + 32 * SIZE], b1 + FMADD (aa3, bb2, cc03, cc03) + LDF [BO + 25 * SIZE], b2 + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 26 * SIZE], b3 + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 27 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [BO + 28 * SIZE], b5 + FMADD (aa3, bb6, cc11, cc11) + LDF [BO + 29 * SIZE], b6 + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 30 * SIZE], b7 + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 31 * SIZE], b8 + + FMADD (aa4, bb9, cc01, cc01) + LDF [BO + 40 * SIZE], b9 + FMADD (aa4, bb2, cc03, cc03) + LDF [BO + 33 * SIZE], b2 + + FMADD (aa4, bb3, cc05, cc05) + LDF [BO + 34 * SIZE], b3 + FMADD (aa4, bb4, cc07, cc07) + LDF [BO + 35 * SIZE], b4 + + FMADD (aa4, bb5, cc09, cc09) + LDF [BO + 36 * SIZE], b5 + FMADD (aa4, bb6, cc11, cc11) + LDF [BO + 37 * SIZE], b6 + + FMADD (aa4, bb7, cc13, cc13) + LDF [BO + 38 * SIZE], b7 + FMADD (aa4, bb8, cc15, cc15) + LDF [BO + 39 * SIZE], b8 + + LDF [AO + 6 * SIZE], a3 + LDF [AO + 7 * SIZE], a4 + + add AO, 4 * SIZE, AO + cmp L, 0 + bg,pt %icc, .LL23 + add BO, 32 * SIZE, BO + .align 4 + +.LL25: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL28 + nop + .align 4 + +.LL27: + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 8 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 10 * SIZE], b3 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [BO + 12 * SIZE], b5 + FMADD (aa1, bb6, cc11, cc11) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 14 * SIZE], b7 + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 15 * SIZE], b8 + + LDF [AO + 1 * SIZE], a1 + add AO, 1 * SIZE, AO + + add L, -1, L + cmp L, 0 + bg,pt %icc, .LL27 + add BO, 8 * SIZE, BO + .align 4 + +.LL28: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 8, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 3, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c09, c09 + FSUB b2, c11, c11 + FSUB b3, c13, c13 + FSUB b4, c15, c15 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c09, c09 + FSUB b2, c11, c11 + FSUB b3, c13, c13 + FSUB b4, c15, c15 +#endif + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 + FMUL a1, c09, c09 + FMUL a1, c11, c11 + FMUL a1, c13, c13 + FMUL a1, c15, c15 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FMUL a1, c01, c01 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa3, cc01, cc05, cc05) + FNMSUB (aa4, cc01, cc07, cc07) + FNMSUB (bb1, cc01, cc09, cc09) + FNMSUB (bb2, cc01, cc11, cc11) + FNMSUB (bb3, cc01, cc13, cc13) + FNMSUB (bb4, cc01, cc15, cc15) + + LDF [BO + 9 * SIZE], a1 + LDF [BO + 10 * SIZE], a2 + LDF [BO + 11 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + LDF [BO + 13 * SIZE], b1 + LDF [BO + 14 * SIZE], b2 + LDF [BO + 15 * SIZE], b3 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc05, cc05) + FNMSUB (aa3, cc03, cc07, cc07) + FNMSUB (aa4, cc03, cc09, cc09) + FNMSUB (bb1, cc03, cc11, cc11) + FNMSUB (bb2, cc03, cc13, cc13) + FNMSUB (bb3, cc03, cc15, cc15) + + LDF [BO + 18 * SIZE], a1 + LDF [BO + 19 * SIZE], a2 + LDF [BO + 20 * SIZE], a3 + LDF [BO + 21 * SIZE], a4 + LDF [BO + 22 * SIZE], b1 + LDF [BO + 23 * SIZE], b2 + + FMUL a1, c05, c05 + + FNMSUB (aa2, cc05, cc07, cc07) + FNMSUB (aa3, cc05, cc09, cc09) + FNMSUB (aa4, cc05, cc11, cc11) + FNMSUB (bb1, cc05, cc13, cc13) + FNMSUB (bb2, cc05, cc15, cc15) + + LDF [BO + 27 * SIZE], a1 + LDF [BO + 28 * SIZE], a2 + LDF [BO + 29 * SIZE], a3 + LDF [BO + 30 * SIZE], a4 + LDF [BO + 31 * SIZE], b1 + + FMUL a1, c07, c07 + + FNMSUB (aa2, cc07, cc09, cc09) + FNMSUB (aa3, cc07, cc11, cc11) + FNMSUB (aa4, cc07, cc13, cc13) + FNMSUB (bb1, cc07, cc15, cc15) + + LDF [BO + 36 * SIZE], a1 + LDF [BO + 37 * SIZE], a2 + LDF [BO + 38 * SIZE], a3 + LDF [BO + 39 * SIZE], a4 + + FMUL a1, c09, c09 + + FNMSUB (aa2, cc09, cc11, cc11) + FNMSUB (aa3, cc09, cc13, cc13) + FNMSUB (aa4, cc09, cc15, cc15) + + LDF [BO + 45 * SIZE], a1 + LDF [BO + 46 * SIZE], a2 + LDF [BO + 47 * SIZE], a3 + + FMUL a1, c11, c11 + + FNMSUB (aa2, cc11, cc13, cc13) + FNMSUB (aa3, cc11, cc15, cc15) + + LDF [BO + 54 * SIZE], a1 + LDF [BO + 55 * SIZE], a2 + + FMUL a1, c13, c13 + + FNMSUB (aa2, cc13, cc15, cc15) + + LDF [BO + 63 * SIZE], a1 + + FMUL a1, c15, c15 +#endif + +#ifdef RT + LDF [BO + 63 * SIZE], a1 + LDF [BO + 62 * SIZE], a2 + LDF [BO + 61 * SIZE], a3 + LDF [BO + 60 * SIZE], a4 + LDF [BO + 59 * SIZE], b1 + LDF [BO + 58 * SIZE], b2 + LDF [BO + 57 * SIZE], b3 + LDF [BO + 56 * SIZE], b4 + + FMUL a1, c15, c15 + + FNMSUB (aa2, cc15, cc13, cc13) + FNMSUB (aa3, cc15, cc11, cc11) + FNMSUB (aa4, cc15, cc09, cc09) + FNMSUB (bb1, cc15, cc07, cc07) + FNMSUB (bb2, cc15, cc05, cc05) + FNMSUB (bb3, cc15, cc03, cc03) + FNMSUB (bb4, cc15, cc01, cc01) + + LDF [BO + 54 * SIZE], a1 + LDF [BO + 53 * SIZE], a2 + LDF [BO + 52 * SIZE], a3 + LDF [BO + 51 * SIZE], a4 + LDF [BO + 50 * SIZE], b1 + LDF [BO + 49 * SIZE], b2 + LDF [BO + 48 * SIZE], b3 + + FMUL a1, c13, c13 + + FNMSUB (aa2, cc13, cc11, cc11) + FNMSUB (aa3, cc13, cc09, cc09) + FNMSUB (aa4, cc13, cc07, cc07) + FNMSUB (bb1, cc13, cc05, cc05) + FNMSUB (bb2, cc13, cc03, cc03) + FNMSUB (bb3, cc13, cc01, cc01) + + LDF [BO + 45 * SIZE], a1 + LDF [BO + 44 * SIZE], a2 + LDF [BO + 43 * SIZE], a3 + LDF [BO + 42 * SIZE], a4 + LDF [BO + 41 * SIZE], b1 + LDF [BO + 40 * SIZE], b2 + + FMUL a1, c11, c11 + + FNMSUB (aa2, cc11, cc09, cc09) + FNMSUB (aa3, cc11, cc07, cc07) + FNMSUB (aa4, cc11, cc05, cc05) + FNMSUB (bb1, cc11, cc03, cc03) + FNMSUB (bb2, cc11, cc01, cc01) + + LDF [BO + 36 * SIZE], a1 + LDF [BO + 35 * SIZE], a2 + LDF [BO + 34 * SIZE], a3 + LDF [BO + 33 * SIZE], a4 + LDF [BO + 32 * SIZE], b1 + + FMUL a1, c09, c09 + + FNMSUB (aa2, cc09, cc07, cc07) + FNMSUB (aa3, cc09, cc05, cc05) + FNMSUB (aa4, cc09, cc03, cc03) + FNMSUB (bb1, cc09, cc01, cc01) + + LDF [BO + 27 * SIZE], a1 + LDF [BO + 26 * SIZE], a2 + LDF [BO + 25 * SIZE], a3 + LDF [BO + 24 * SIZE], a4 + + FMUL a1, c07, c07 + + FNMSUB (aa2, cc07, cc05, cc05) + FNMSUB (aa3, cc07, cc03, cc03) + FNMSUB (aa4, cc07, cc01, cc01) + + LDF [BO + 18 * SIZE], a1 + LDF [BO + 17 * SIZE], a2 + LDF [BO + 16 * SIZE], a3 + + FMUL a1, c05, c05 + + FNMSUB (aa2, cc05, cc03, cc03) + FNMSUB (aa3, cc05, cc01, cc01) + + LDF [BO + 9 * SIZE], a1 + LDF [BO + 8 * SIZE], a2 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 + add C3, -1 * SIZE, C3 + add C4, -1 * SIZE, C4 + add C5, -1 * SIZE, C5 + add C6, -1 * SIZE, C6 + add C7, -1 * SIZE, C7 + add C8, -1 * SIZE, C8 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] + + STF c09, [BO + 4 * SIZE] + STF c11, [BO + 5 * SIZE] + STF c13, [BO + 6 * SIZE] + STF c15, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c03, [AO + 1 * SIZE] + STF c05, [AO + 2 * SIZE] + STF c07, [AO + 3 * SIZE] + + STF c09, [AO + 4 * SIZE] + STF c11, [AO + 5 * SIZE] + STF c13, [AO + 6 * SIZE] + STF c15, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c05, [C3 + 0 * SIZE] + STF c07, [C4 + 0 * SIZE] + + STF c09, [C5 + 0 * SIZE] + STF c11, [C6 + 0 * SIZE] + STF c13, [C7 + 0 * SIZE] + STF c15, [C8 + 0 * SIZE] + +#ifdef RT + sll K, BASE_SHIFT + 0, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 3, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + .align 4 + +.LL20: + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL29 + nop + .align 4 + +.LL12: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 3, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 8 * SIZE], a5 + + LDF [BO + 0 * SIZE], b1 + + LDF [BO + 1 * SIZE], b2 + FCLR (cc01) + LDF [BO + 2 * SIZE], b3 + FCLR (cc05) + LDF [BO + 3 * SIZE], b4 + FCLR (cc09) + LDF [BO + 4 * SIZE], b5 + FCLR (cc13) + + LDF [BO + 5 * SIZE], b6 + FCLR (cc02) + LDF [BO + 6 * SIZE], b7 + FCLR (cc06) + LDF [BO + 7 * SIZE], b8 + FCLR (cc10) + LDF [BO + 8 * SIZE], b9 + FCLR (cc14) + + prefetch [C1 + 1 * SIZE], 3 + FCLR (cc03) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc07) + prefetch [C3 + 1 * SIZE], 3 + FCLR (cc11) + prefetch [C4 + 2 * SIZE], 3 + FCLR (cc15) + + prefetch [C5 + 1 * SIZE], 3 + FCLR (cc04) + prefetch [C6 + 2 * SIZE], 3 + FCLR (cc08) + prefetch [C7 + 1 * SIZE], 3 + FCLR (cc12) + prefetch [C8 + 2 * SIZE], 3 + FCLR (cc16) + +#if defined(LT) || defined(RN) + sra KK, 3, L +#else + sub K, KK, L + sra L, 3, L +#endif + cmp L, 0 + ble,pn %icc, .LL15 + nop + .align 4 + +.LL13: + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 16 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 2 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 3 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 14 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 24 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 18 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 19 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 4 * SIZE], a1 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + add L, -1, L + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 20 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 21 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 22 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 23 * SIZE], b8 + + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 32 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 25 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 26 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 27 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 6 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 7 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + nop + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 28 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 29 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 30 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 31 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 40 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 33 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 34 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 35 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 16 * SIZE], a1 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 9 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + nop + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 36 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 37 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 38 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 39 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 48 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 41 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 42 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 43 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 10 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 11 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO + 44 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 45 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO + 46 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 47 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 56 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 49 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 50 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 51 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 12 * SIZE], a5 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 13 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + cmp L, 0 + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 52 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 53 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 54 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 55 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 64 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 57 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 58 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 59 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 14 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 15 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + add BO, 64 * SIZE, BO + FMADD (aa2, bb6, cc12, cc12) + add AO, 16 * SIZE, AO + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO - 4 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO - 3 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO - 2 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO - 1 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 8 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 1 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 8 * SIZE], a5 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 1 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + FMADD (aa4, bb6, cc12, cc12) + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 5 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + ble,pn %icc, .LL15 + LDF [BO + 7 * SIZE], b8 + + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 16 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 2 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 3 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 14 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 24 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 18 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 19 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 4 * SIZE], a1 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + add L, -1, L + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 20 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 21 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 22 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 23 * SIZE], b8 + + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 32 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 25 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 26 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 27 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 6 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 7 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + nop + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 28 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 29 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 30 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 31 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 40 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 33 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 34 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 35 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 16 * SIZE], a1 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 9 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + nop + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 36 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 37 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 38 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 39 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 48 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 41 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 42 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 43 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 10 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 11 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO + 44 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 45 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO + 46 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 47 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 56 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 49 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 50 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 51 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 12 * SIZE], a5 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 13 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + cmp L, 0 + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 52 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 53 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 54 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 55 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 64 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 57 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 58 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 59 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 14 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 15 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + add BO, 64 * SIZE, BO + FMADD (aa2, bb6, cc12, cc12) + add AO, 16 * SIZE, AO + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO - 4 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO - 3 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO - 2 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO - 1 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 8 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 1 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 8 * SIZE], a5 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 1 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + FMADD (aa4, bb6, cc12, cc12) + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 5 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + bg,pt %icc, .LL13 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL15: +#if defined(LT) || defined(RN) + and KK, 7, L +#else + sub K, KK, L + and L, 7, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL18 + nop + .align 4 + +.LL17: + FMADD (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD (aa2, bb1, cc02, cc02) + nop + + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 8 * SIZE], b1 + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + cmp L, 0 + FMADD (aa2, bb3, cc06, cc06) + nop + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + nop + FMADD (aa2, bb5, cc10, cc10) + nop + + FMADD (aa1, bb6, cc11, cc11) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb6, cc12, cc12) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb7, cc13, cc13) + add AO, 2 * SIZE, AO + FMADD (aa2, bb7, cc14, cc14) + add BO, 8 * SIZE, BO + + FMADD (aa1, bb8, cc15, cc15) + LDF [AO + 0 * SIZE], a1 + FMADD (aa2, bb8, cc16, cc16) + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 6 * SIZE], b7 + bg,pt %icc, .LL17 + LDF [BO + 7 * SIZE], b8 + nop + .align 4 + +.LL18: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 8, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 3, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c09, c09 + FSUB b2, c11, c11 + FSUB b3, c13, c13 + FSUB b4, c15, c15 + + LDF [BO + 8 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 10 * SIZE], a3 + LDF [BO + 11 * SIZE], a4 + + LDF [BO + 12 * SIZE], b1 + LDF [BO + 13 * SIZE], b2 + LDF [BO + 14 * SIZE], b3 + LDF [BO + 15 * SIZE], b4 + + FSUB a1, c02, c02 + FSUB a2, c04, c04 + FSUB a3, c06, c06 + FSUB a4, c08, c08 + + FSUB b1, c10, c10 + FSUB b2, c12, c12 + FSUB b3, c14, c14 + FSUB b4, c16, c16 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 + + LDF [AO + 8 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 10 * SIZE], a3 + LDF [AO + 11 * SIZE], a4 + + LDF [AO + 12 * SIZE], b1 + LDF [AO + 13 * SIZE], b2 + LDF [AO + 14 * SIZE], b3 + LDF [AO + 15 * SIZE], b4 + + FSUB a1, c09, c09 + FSUB a2, c10, c10 + FSUB a3, c11, c11 + FSUB a4, c12, c12 + + FSUB b1, c13, c13 + FSUB b2, c14, c14 + FSUB b3, c15, c15 + FSUB b4, c16, c16 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + FMUL a1, c06, c06 + FMUL a1, c08, c08 + FMUL a1, c10, c10 + FMUL a1, c12, c12 + FMUL a1, c14, c14 + FMUL a1, c16, c16 + + FNMSUB (aa2, cc02, cc01, cc01) + FNMSUB (aa2, cc04, cc03, cc03) + FNMSUB (aa2, cc06, cc05, cc05) + FNMSUB (aa2, cc08, cc07, cc07) + FNMSUB (aa2, cc10, cc09, cc09) + FNMSUB (aa2, cc12, cc11, cc11) + FNMSUB (aa2, cc14, cc13, cc13) + FNMSUB (aa2, cc16, cc15, cc15) + + FMUL a3, c01, c01 + FMUL a3, c03, c03 + FMUL a3, c05, c05 + FMUL a3, c07, c07 + FMUL a3, c09, c09 + FMUL a3, c11, c11 + FMUL a3, c13, c13 + FMUL a3, c15, c15 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 + FMUL a1, c09, c09 + FMUL a1, c11, c11 + FMUL a1, c13, c13 + FMUL a1, c15, c15 + + FNMSUB (aa2, cc01, cc02, cc02) + FNMSUB (aa2, cc03, cc04, cc04) + FNMSUB (aa2, cc05, cc06, cc06) + FNMSUB (aa2, cc07, cc08, cc08) + FNMSUB (aa2, cc09, cc10, cc10) + FNMSUB (aa2, cc11, cc12, cc12) + FNMSUB (aa2, cc13, cc14, cc14) + FNMSUB (aa2, cc15, cc16, cc16) + + FMUL a3, c02, c02 + FMUL a3, c04, c04 + FMUL a3, c06, c06 + FMUL a3, c08, c08 + FMUL a3, c10, c10 + FMUL a3, c12, c12 + FMUL a3, c14, c14 + FMUL a3, c16, c16 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa2, cc02, cc04, cc04) + FNMSUB (aa3, cc01, cc05, cc05) + FNMSUB (aa3, cc02, cc06, cc06) + FNMSUB (aa4, cc01, cc07, cc07) + FNMSUB (aa4, cc02, cc08, cc08) + FNMSUB (bb1, cc01, cc09, cc09) + FNMSUB (bb1, cc02, cc10, cc10) + FNMSUB (bb2, cc01, cc11, cc11) + FNMSUB (bb2, cc02, cc12, cc12) + FNMSUB (bb3, cc01, cc13, cc13) + FNMSUB (bb3, cc02, cc14, cc14) + FNMSUB (bb4, cc01, cc15, cc15) + FNMSUB (bb4, cc02, cc16, cc16) + + LDF [BO + 9 * SIZE], a1 + LDF [BO + 10 * SIZE], a2 + LDF [BO + 11 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + LDF [BO + 13 * SIZE], b1 + LDF [BO + 14 * SIZE], b2 + LDF [BO + 15 * SIZE], b3 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FNMSUB (aa2, cc03, cc05, cc05) + FNMSUB (aa2, cc04, cc06, cc06) + FNMSUB (aa3, cc03, cc07, cc07) + FNMSUB (aa3, cc04, cc08, cc08) + FNMSUB (aa4, cc03, cc09, cc09) + FNMSUB (aa4, cc04, cc10, cc10) + FNMSUB (bb1, cc03, cc11, cc11) + FNMSUB (bb1, cc04, cc12, cc12) + FNMSUB (bb2, cc03, cc13, cc13) + FNMSUB (bb2, cc04, cc14, cc14) + FNMSUB (bb3, cc03, cc15, cc15) + FNMSUB (bb3, cc04, cc16, cc16) + + LDF [BO + 18 * SIZE], a1 + LDF [BO + 19 * SIZE], a2 + LDF [BO + 20 * SIZE], a3 + LDF [BO + 21 * SIZE], a4 + LDF [BO + 22 * SIZE], b1 + LDF [BO + 23 * SIZE], b2 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + + FNMSUB (aa2, cc05, cc07, cc07) + FNMSUB (aa2, cc06, cc08, cc08) + FNMSUB (aa3, cc05, cc09, cc09) + FNMSUB (aa3, cc06, cc10, cc10) + FNMSUB (aa4, cc05, cc11, cc11) + FNMSUB (aa4, cc06, cc12, cc12) + FNMSUB (bb1, cc05, cc13, cc13) + FNMSUB (bb1, cc06, cc14, cc14) + FNMSUB (bb2, cc05, cc15, cc15) + FNMSUB (bb2, cc06, cc16, cc16) + + LDF [BO + 27 * SIZE], a1 + LDF [BO + 28 * SIZE], a2 + LDF [BO + 29 * SIZE], a3 + LDF [BO + 30 * SIZE], a4 + LDF [BO + 31 * SIZE], b1 + + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FNMSUB (aa2, cc07, cc09, cc09) + FNMSUB (aa2, cc08, cc10, cc10) + FNMSUB (aa3, cc07, cc11, cc11) + FNMSUB (aa3, cc08, cc12, cc12) + FNMSUB (aa4, cc07, cc13, cc13) + FNMSUB (aa4, cc08, cc14, cc14) + FNMSUB (bb1, cc07, cc15, cc15) + FNMSUB (bb1, cc08, cc16, cc16) + + LDF [BO + 36 * SIZE], a1 + LDF [BO + 37 * SIZE], a2 + LDF [BO + 38 * SIZE], a3 + LDF [BO + 39 * SIZE], a4 + + FMUL a1, c09, c09 + FMUL a1, c10, c10 + + FNMSUB (aa2, cc09, cc11, cc11) + FNMSUB (aa2, cc10, cc12, cc12) + FNMSUB (aa3, cc09, cc13, cc13) + FNMSUB (aa3, cc10, cc14, cc14) + FNMSUB (aa4, cc09, cc15, cc15) + FNMSUB (aa4, cc10, cc16, cc16) + + LDF [BO + 45 * SIZE], a1 + LDF [BO + 46 * SIZE], a2 + LDF [BO + 47 * SIZE], a3 + + FMUL a1, c11, c11 + FMUL a1, c12, c12 + + FNMSUB (aa2, cc11, cc13, cc13) + FNMSUB (aa2, cc12, cc14, cc14) + FNMSUB (aa3, cc11, cc15, cc15) + FNMSUB (aa3, cc12, cc16, cc16) + + LDF [BO + 54 * SIZE], a1 + LDF [BO + 55 * SIZE], a2 + + FMUL a1, c13, c13 + FMUL a1, c14, c14 + + FNMSUB (aa2, cc13, cc15, cc15) + FNMSUB (aa2, cc14, cc16, cc16) + + LDF [BO + 63 * SIZE], a1 + + FMUL a1, c15, c15 + FMUL a1, c16, c16 +#endif + +#ifdef RT + LDF [BO + 63 * SIZE], a1 + LDF [BO + 62 * SIZE], a2 + LDF [BO + 61 * SIZE], a3 + LDF [BO + 60 * SIZE], a4 + LDF [BO + 59 * SIZE], b1 + LDF [BO + 58 * SIZE], b2 + LDF [BO + 57 * SIZE], b3 + LDF [BO + 56 * SIZE], b4 + + FMUL a1, c16, c16 + FMUL a1, c15, c15 + + FNMSUB (aa2, cc16, cc14, cc14) + FNMSUB (aa2, cc15, cc13, cc13) + FNMSUB (aa3, cc16, cc12, cc12) + FNMSUB (aa3, cc15, cc11, cc11) + FNMSUB (aa4, cc16, cc10, cc10) + FNMSUB (aa4, cc15, cc09, cc09) + FNMSUB (bb1, cc16, cc08, cc08) + FNMSUB (bb1, cc15, cc07, cc07) + FNMSUB (bb2, cc16, cc06, cc06) + FNMSUB (bb2, cc15, cc05, cc05) + FNMSUB (bb3, cc16, cc04, cc04) + FNMSUB (bb3, cc15, cc03, cc03) + FNMSUB (bb4, cc16, cc02, cc02) + FNMSUB (bb4, cc15, cc01, cc01) + + LDF [BO + 54 * SIZE], a1 + LDF [BO + 53 * SIZE], a2 + LDF [BO + 52 * SIZE], a3 + LDF [BO + 51 * SIZE], a4 + LDF [BO + 50 * SIZE], b1 + LDF [BO + 49 * SIZE], b2 + LDF [BO + 48 * SIZE], b3 + + FMUL a1, c14, c14 + FMUL a1, c13, c13 + + FNMSUB (aa2, cc14, cc12, cc12) + FNMSUB (aa2, cc13, cc11, cc11) + FNMSUB (aa3, cc14, cc10, cc10) + FNMSUB (aa3, cc13, cc09, cc09) + FNMSUB (aa4, cc14, cc08, cc08) + FNMSUB (aa4, cc13, cc07, cc07) + FNMSUB (bb1, cc14, cc06, cc06) + FNMSUB (bb1, cc13, cc05, cc05) + FNMSUB (bb2, cc14, cc04, cc04) + FNMSUB (bb2, cc13, cc03, cc03) + FNMSUB (bb3, cc14, cc02, cc02) + FNMSUB (bb3, cc13, cc01, cc01) + + LDF [BO + 45 * SIZE], a1 + LDF [BO + 44 * SIZE], a2 + LDF [BO + 43 * SIZE], a3 + LDF [BO + 42 * SIZE], a4 + LDF [BO + 41 * SIZE], b1 + LDF [BO + 40 * SIZE], b2 + + FMUL a1, c12, c12 + FMUL a1, c11, c11 + + FNMSUB (aa2, cc12, cc10, cc10) + FNMSUB (aa2, cc11, cc09, cc09) + FNMSUB (aa3, cc12, cc08, cc08) + FNMSUB (aa3, cc11, cc07, cc07) + FNMSUB (aa4, cc12, cc06, cc06) + FNMSUB (aa4, cc11, cc05, cc05) + FNMSUB (bb1, cc12, cc04, cc04) + FNMSUB (bb1, cc11, cc03, cc03) + FNMSUB (bb2, cc12, cc02, cc02) + FNMSUB (bb2, cc11, cc01, cc01) + + LDF [BO + 36 * SIZE], a1 + LDF [BO + 35 * SIZE], a2 + LDF [BO + 34 * SIZE], a3 + LDF [BO + 33 * SIZE], a4 + LDF [BO + 32 * SIZE], b1 + + FMUL a1, c10, c10 + FMUL a1, c09, c09 + + FNMSUB (aa2, cc10, cc08, cc08) + FNMSUB (aa2, cc09, cc07, cc07) + FNMSUB (aa3, cc10, cc06, cc06) + FNMSUB (aa3, cc09, cc05, cc05) + FNMSUB (aa4, cc10, cc04, cc04) + FNMSUB (aa4, cc09, cc03, cc03) + FNMSUB (bb1, cc10, cc02, cc02) + FNMSUB (bb1, cc09, cc01, cc01) + + LDF [BO + 27 * SIZE], a1 + LDF [BO + 26 * SIZE], a2 + LDF [BO + 25 * SIZE], a3 + LDF [BO + 24 * SIZE], a4 + + FMUL a1, c08, c08 + FMUL a1, c07, c07 + + FNMSUB (aa2, cc08, cc06, cc06) + FNMSUB (aa2, cc07, cc05, cc05) + FNMSUB (aa3, cc08, cc04, cc04) + FNMSUB (aa3, cc07, cc03, cc03) + FNMSUB (aa4, cc08, cc02, cc02) + FNMSUB (aa4, cc07, cc01, cc01) + + LDF [BO + 18 * SIZE], a1 + LDF [BO + 17 * SIZE], a2 + LDF [BO + 16 * SIZE], a3 + + FMUL a1, c06, c06 + FMUL a1, c05, c05 + + FNMSUB (aa2, cc06, cc04, cc04) + FNMSUB (aa2, cc05, cc03, cc03) + FNMSUB (aa3, cc06, cc02, cc02) + FNMSUB (aa3, cc05, cc01, cc01) + + LDF [BO + 9 * SIZE], a1 + LDF [BO + 8 * SIZE], a2 + + FMUL a1, c04, c04 + FMUL a1, c03, c03 + + FNMSUB (aa2, cc04, cc02, cc02) + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c02, c02 + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 + add C3, -2 * SIZE, C3 + add C4, -2 * SIZE, C4 + add C5, -2 * SIZE, C5 + add C6, -2 * SIZE, C6 + add C7, -2 * SIZE, C7 + add C8, -2 * SIZE, C8 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] + + STF c09, [BO + 4 * SIZE] + STF c11, [BO + 5 * SIZE] + STF c13, [BO + 6 * SIZE] + STF c15, [BO + 7 * SIZE] + + STF c02, [BO + 8 * SIZE] + STF c04, [BO + 9 * SIZE] + STF c06, [BO + 10 * SIZE] + STF c08, [BO + 11 * SIZE] + + STF c10, [BO + 12 * SIZE] + STF c12, [BO + 13 * SIZE] + STF c14, [BO + 14 * SIZE] + STF c16, [BO + 15 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] + + STF c09, [AO + 8 * SIZE] + STF c10, [AO + 9 * SIZE] + STF c11, [AO + 10 * SIZE] + STF c12, [AO + 11 * SIZE] + + STF c13, [AO + 12 * SIZE] + STF c14, [AO + 13 * SIZE] + STF c15, [AO + 14 * SIZE] + STF c16, [AO + 15 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + + STF c05, [C3 + 0 * SIZE] + STF c06, [C3 + 1 * SIZE] + STF c07, [C4 + 0 * SIZE] + STF c08, [C4 + 1 * SIZE] + + STF c09, [C5 + 0 * SIZE] + STF c10, [C5 + 1 * SIZE] + STF c11, [C6 + 0 * SIZE] + STF c12, [C6 + 1 * SIZE] + + STF c13, [C7 + 0 * SIZE] + STF c14, [C7 + 1 * SIZE] + STF c15, [C8 + 0 * SIZE] + STF c16, [C8 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 + add C3, 2 * SIZE, C3 + add C4, 2 * SIZE, C4 + add C5, 2 * SIZE, C5 + add C6, 2 * SIZE, C6 + add C7, 2 * SIZE, C7 + add C8, 2 * SIZE, C8 +#endif + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 3, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL12 + nop + .align 4 + +.LL29: +#ifdef LN + sll K, BASE_SHIFT + 3, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 8, KK +#endif + +#ifdef RT + sub KK, 8, KK +#endif + + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + .align 4 + +.LL30: + and N, 4, J + cmp J, 0 + ble,pn %icc, .LL50 + nop + +#ifdef RT + sll K, BASE_SHIFT + 2, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C2 + add C2, LDC, C3 + add C3, LDC, C4 + add C4, LDC, C +#else + sub C, LDC, C4 + sub C4, LDC, C3 + sub C3, LDC, C2 + sub C2, LDC, C1 + sub C2, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL40 + nop + +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 2, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + LDF [BO + 5 * SIZE], b6 + FCLR (cc01) + LDF [BO + 6 * SIZE], b7 + FCLR (cc03) + LDF [BO + 7 * SIZE], b8 + FCLR (cc05) + LDF [BO + 8 * SIZE], b9 + FCLR (cc07) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL45 + nop + +.LL43: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 16 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 10 * SIZE], b3 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 11 * SIZE], b4 + + LDF [AO + 4 * SIZE], a1 + cmp L, 0 + + FMADD (aa2, bb5, cc01, cc01) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb6, cc03, cc03) + LDF [BO + 13 * SIZE], b6 + FMADD (aa2, bb7, cc05, cc05) + LDF [BO + 14 * SIZE], b7 + FMADD (aa2, bb8, cc07, cc07) + LDF [BO + 15 * SIZE], b8 + + LDF [AO + 5 * SIZE], a2 + add AO, 4 * SIZE, AO + + FMADD (aa3, bb9, cc01, cc01) + LDF [BO + 24 * SIZE], b9 + FMADD (aa3, bb2, cc03, cc03) + LDF [BO + 17 * SIZE], b2 + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 18 * SIZE], b3 + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 19 * SIZE], b4 + + LDF [AO + 2 * SIZE], a3 + add BO, 16 * SIZE, BO + + FMADD (aa4, bb5, cc01, cc01) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb6, cc03, cc03) + LDF [BO + 5 * SIZE], b6 + FMADD (aa4, bb7, cc05, cc05) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc07, cc07) + LDF [BO + 7 * SIZE], b8 + + bg,pt %icc, .LL43 + LDF [AO + 3 * SIZE], a4 + .align 4 + +.LL45: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL48 + nop + .align 4 + +.LL47: + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 4 * SIZE], b1 + add L, -1, L + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 5 * SIZE], b2 + add AO, 1 * SIZE, AO + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 6 * SIZE], b3 + cmp L, 0 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 7 * SIZE], b4 + add BO, 4 * SIZE, BO + + bg,pt %icc, .LL47 + LDF [AO + 0 * SIZE], a1 + .align 4 + +.LL48: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 4, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 2, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 +#endif + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa3, cc01, cc05, cc05) + FNMSUB (aa4, cc01, cc07, cc07) + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc05, cc05) + FNMSUB (aa3, cc03, cc07, cc07) + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c05, c05 + + FNMSUB (aa2, cc05, cc07, cc07) + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c07, c07 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c07, c07 + + FNMSUB (aa2, cc07, cc05, cc05) + FNMSUB (aa3, cc07, cc03, cc03) + FNMSUB (aa4, cc07, cc01, cc01) + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c05, c05 + + FNMSUB (aa2, cc05, cc03, cc03) + FNMSUB (aa3, cc05, cc01, cc01) + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 + add C3, -1 * SIZE, C3 + add C4, -1 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c03, [AO + 1 * SIZE] + STF c05, [AO + 2 * SIZE] + STF c07, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c05, [C3 + 0 * SIZE] + STF c07, [C4 + 0 * SIZE] + +#ifdef RT + sll K, BASE_SHIFT + 0, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 2, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + .align 4 + +.LL40: + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL49 + nop + .align 4 + +.LL32: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 2, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + + LDF [BO + 5 * SIZE], b6 + FCLR (cc01) + LDF [BO + 6 * SIZE], b7 + FCLR (cc02) + LDF [BO + 7 * SIZE], b8 + FCLR (cc03) + LDF [BO + 8 * SIZE], b9 + FCLR (cc04) + + prefetch [C1 + 2 * SIZE], 3 + FCLR (cc05) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc06) + prefetch [C3 + 2 * SIZE], 3 + FCLR (cc07) + prefetch [C4 + 2 * SIZE], 3 + FCLR (cc08) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL35 + nop + .align 4 + +.LL33: + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 2 * SIZE], a3 + FMADD (aa2, bb1, cc02, cc02) + LDF [AO + 3 * SIZE], a4 + + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 16 * SIZE], b1 + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb3, cc06, cc06) + add L, -1, L + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa3, bb5, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + FMADD (aa4, bb5, cc02, cc02) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb6, cc03, cc03) + LDF [BO + 12 * SIZE], b5 + FMADD (aa4, bb6, cc04, cc04) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa3, bb7, cc05, cc05) + cmp L, 0 + FMADD (aa4, bb7, cc06, cc06) + add AO, 8 * SIZE, AO + + FMADD (aa3, bb8, cc07, cc07) + LDF [BO + 14 * SIZE], b7 + FMADD (aa4, bb8, cc08, cc08) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa1, bb9, cc01, cc01) + LDF [AO - 2 * SIZE], a3 + FMADD (aa2, bb9, cc02, cc02) + LDF [AO - 1 * SIZE], a4 + + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 24 * SIZE], b9 + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + add BO, 16 * SIZE, BO + FMADD (aa2, bb3, cc06, cc06) + nop + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD (aa3, bb5, cc01, cc01) + LDF [AO + 0 * SIZE], a1 + FMADD (aa4, bb5, cc02, cc02) + LDF [AO + 1 * SIZE], a2 + FMADD (aa3, bb6, cc03, cc03) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb6, cc04, cc04) + LDF [BO + 5 * SIZE], b6 + + FMADD (aa3, bb7, cc05, cc05) + nop + FMADD (aa4, bb7, cc06, cc06) + LDF [BO + 6 * SIZE], b7 + + FMADD (aa3, bb8, cc07, cc07) + FMADD (aa4, bb8, cc08, cc08) + bg,pt %icc, .LL33 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL35: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL38 + nop + .align 4 + +.LL37: + FMADD (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD (aa2, bb1, cc02, cc02) + LDF [BO + 4 * SIZE], b1 + + FMADD (aa1, bb2, cc03, cc03) + add AO, 2 * SIZE, AO + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 5 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + cmp L, 0 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 6 * SIZE], b3 + + FMADD (aa1, bb4, cc07, cc07) + LDF [AO + 0 * SIZE], a1 + FMADD (aa2, bb4, cc08, cc08) + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 7 * SIZE], b4 + bg,pt %icc, .LL37 + add BO, 4 * SIZE, BO + .align 4 + +.LL38: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 4, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 2, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c02, c02 + FSUB b2, c04, c04 + FSUB b3, c06, c06 + FSUB b4, c08, c08 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 + +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + FMUL a1, c06, c06 + FMUL a1, c08, c08 + + FNMSUB (aa2, cc02, cc01, cc01) + FNMSUB (aa2, cc04, cc03, cc03) + FNMSUB (aa2, cc06, cc05, cc05) + FNMSUB (aa2, cc08, cc07, cc07) + + FMUL a3, c01, c01 + FMUL a3, c03, c03 + FMUL a3, c05, c05 + FMUL a3, c07, c07 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 + + FNMSUB (aa2, cc01, cc02, cc02) + FNMSUB (aa2, cc03, cc04, cc04) + FNMSUB (aa2, cc05, cc06, cc06) + FNMSUB (aa2, cc07, cc08, cc08) + + FMUL a3, c02, c02 + FMUL a3, c04, c04 + FMUL a3, c06, c06 + FMUL a3, c08, c08 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa2, cc02, cc04, cc04) + FNMSUB (aa3, cc01, cc05, cc05) + FNMSUB (aa3, cc02, cc06, cc06) + FNMSUB (aa4, cc01, cc07, cc07) + FNMSUB (aa4, cc02, cc08, cc08) + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FNMSUB (aa2, cc03, cc05, cc05) + FNMSUB (aa2, cc04, cc06, cc06) + FNMSUB (aa3, cc03, cc07, cc07) + FNMSUB (aa3, cc04, cc08, cc08) + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + + FNMSUB (aa2, cc05, cc07, cc07) + FNMSUB (aa2, cc06, cc08, cc08) + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c07, c07 + FMUL a1, c08, c08 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c08, c08 + FMUL a1, c07, c07 + + FNMSUB (aa2, cc08, cc06, cc06) + FNMSUB (aa2, cc07, cc05, cc05) + FNMSUB (aa3, cc08, cc04, cc04) + FNMSUB (aa3, cc07, cc03, cc03) + FNMSUB (aa4, cc08, cc02, cc02) + FNMSUB (aa4, cc07, cc01, cc01) + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c06, c06 + FMUL a1, c05, c05 + + FNMSUB (aa2, cc06, cc04, cc04) + FNMSUB (aa2, cc05, cc03, cc03) + FNMSUB (aa3, cc06, cc02, cc02) + FNMSUB (aa3, cc05, cc01, cc01) + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c04, c04 + FMUL a1, c03, c03 + + FNMSUB (aa2, cc04, cc02, cc02) + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c02, c02 + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 + add C3, -2 * SIZE, C3 + add C4, -2 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] + + STF c02, [BO + 4 * SIZE] + STF c04, [BO + 5 * SIZE] + STF c06, [BO + 6 * SIZE] + STF c08, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + + STF c05, [C3 + 0 * SIZE] + STF c06, [C3 + 1 * SIZE] + STF c07, [C4 + 0 * SIZE] + STF c08, [C4 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 + add C3, 2 * SIZE, C3 + add C4, 2 * SIZE, C4 +#endif + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 2, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL32 + nop + +.LL49: +#ifdef LN + sll K, BASE_SHIFT + 2, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 4, KK +#endif + +#ifdef RT + sub KK, 4, KK +#endif + .align 4 + +.LL50: + and N, 2, J + cmp J, 0 + ble,pn %icc, .LL70 + nop + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C2 + add C2, LDC, C +#else + sub C, LDC, C2 + sub C2, LDC, C1 + sub C2, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL60 + nop + +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 1, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + LDF [BO + 5 * SIZE], b6 + LDF [BO + 6 * SIZE], b7 + FCLR (cc01) + LDF [BO + 7 * SIZE], b8 + FCLR (cc03) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL65 + nop + .align 4 + +.LL63: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 8 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + + LDF [AO + 4 * SIZE], a1 + cmp L, 0 + + FMADD (aa2, bb3, cc01, cc01) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc03, cc03) + LDF [BO + 11 * SIZE], b4 + + LDF [AO + 5 * SIZE], a2 + add AO, 4 * SIZE, AO + + FMADD (aa3, bb5, cc01, cc01) + LDF [BO + 12 * SIZE], b5 + FMADD (aa3, bb6, cc03, cc03) + LDF [BO + 13 * SIZE], b6 + + LDF [AO + 2 * SIZE], a3 + add BO, 8 * SIZE, BO + + FMADD (aa4, bb7, cc01, cc01) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc03, cc03) + LDF [BO + 7 * SIZE], b8 + + bg,pt %icc, .LL63 + LDF [AO + 3 * SIZE], a4 + .align 4 + +.LL65: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL68 + nop + .align 4 + +.LL67: + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 2 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 3 * SIZE], b2 + + LDF [AO + 1 * SIZE], a1 + add L, -1, L + add AO, 1 * SIZE, AO + cmp L, 0 + + bg,pt %icc, .LL67 + add BO, 2 * SIZE, BO + .align 4 + +.LL68: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 1, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 +#endif + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, c01 + + FNMSUB (aa2, cc01, cc03, cc03) + + LDF [BO + 3 * SIZE], a1 + + FMUL a1, c03, c03 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c03, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c03, [C2 + 0 * SIZE] + +#ifdef RT + sll K, BASE_SHIFT + 0, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 1, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + .align 4 + +.LL60: + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL69 + nop + .align 4 + +.LL52: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 1, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + FCLR (cc01) + LDF [BO + 3 * SIZE], b4 + FCLR (cc02) + + LDF [BO + 4 * SIZE], b5 + FCLR (cc03) + LDF [BO + 5 * SIZE], b6 + FCLR (cc04) + LDF [BO + 6 * SIZE], b7 + FCLR (cc05) + LDF [BO + 7 * SIZE], b8 + FCLR (cc06) + + prefetch [C1 + 2 * SIZE], 3 + FCLR (cc07) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc08) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL55 + nop + .align 4 + +.LL53: + FMADD (aa1, bb1, cc01, cc01) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb1, cc02, cc02) + LDF [BO + 8 * SIZE], b1 + + FMADD (aa1, bb2, cc03, cc03) + LDF [AO + 4 * SIZE], a1 + FMADD (aa2, bb2, cc04, cc04) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb3, cc01, cc01) + LDF [BO + 9 * SIZE], b2 + FMADD (aa4, bb3, cc02, cc02) + LDF [BO + 10 * SIZE], b3 + + FMADD (aa3, bb4, cc03, cc03) + LDF [AO + 6 * SIZE], a3 + FMADD (aa4, bb4, cc04, cc04) + LDF [AO + 7 * SIZE], a4 + + FMADD (aa1, bb5, cc01, cc01) + LDF [BO + 11 * SIZE], b4 + FMADD (aa2, bb5, cc02, cc02) + LDF [BO + 12 * SIZE], b5 + + FMADD (aa1, bb6, cc03, cc03) + LDF [AO + 8 * SIZE], a1 + FMADD (aa2, bb6, cc04, cc04) + LDF [AO + 9 * SIZE], a2 + + FMADD (aa3, bb7, cc01, cc01) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa4, bb7, cc02, cc02) + LDF [BO + 14 * SIZE], b7 + + FMADD (aa3, bb8, cc03, cc03) + LDF [AO + 10 * SIZE], a3 + FMADD (aa4, bb8, cc04, cc04) + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + add L, -1, L + add BO, 8 * SIZE, BO + cmp L, 0 + + bg,pt %icc, .LL53 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL55: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL58 + nop + .align 4 + +.LL57: + FMADD (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD (aa2, bb1, cc02, cc02) + LDF [BO + 2 * SIZE], b1 + + FMADD (aa1, bb2, cc03, cc03) + LDF [AO + 2 * SIZE], a1 + FMADD (aa2, bb2, cc04, cc04) + LDF [AO + 3 * SIZE], a2 + + add AO, 2 * SIZE, AO + cmp L, 0 + add BO, 2 * SIZE, BO + bg,pt %icc, .LL57 + LDF [BO + 1 * SIZE], b2 + .align 4 + +.LL58: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 1, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c02, c02 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + + FNMSUB (aa2, cc02, cc01, cc01) + FNMSUB (aa2, cc04, cc03, cc03) + + FMUL a3, c01, c01 + FMUL a3, c03, c03 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + + FNMSUB (aa2, cc01, cc02, cc02) + FNMSUB (aa2, cc03, cc04, cc04) + + FMUL a3, c02, c02 + FMUL a3, c04, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa2, cc02, cc04, cc04) + + LDF [BO + 3 * SIZE], a1 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + + FMUL a1, c04, c04 + FMUL a1, c03, c03 + + FNMSUB (aa2, cc04, cc02, cc02) + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c02, c02 + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c02, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 +#endif + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 1, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL52 + nop + .align 4 + +.LL69: +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 2, KK +#endif + +#ifdef RT + sub KK, 2, KK +#endif + .align 4 + +.LL70: + and N, 1, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + +#ifdef RT + sll K, BASE_SHIFT, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C1, LDC, C +#else + sub C, LDC, C1 + sub C, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL80 + nop + +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 0, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [BO + 0 * SIZE], b1 + LDF [AO + 1 * SIZE], a2 + LDF [BO + 1 * SIZE], b2 + LDF [AO + 2 * SIZE], a3 + LDF [BO + 2 * SIZE], b3 + LDF [AO + 3 * SIZE], a4 + LDF [BO + 3 * SIZE], b4 + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL85 + FCLR (cc01) + .align 4 + +.LL83: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + LDF [BO + 4 * SIZE], b1 + + FMADD (aa2, bb2, cc01, cc01) + LDF [AO + 5 * SIZE], a2 + LDF [BO + 5 * SIZE], b2 + + FMADD (aa3, bb3, cc01, cc01) + LDF [AO + 6 * SIZE], a3 + LDF [BO + 6 * SIZE], b3 + + FMADD (aa4, bb4, cc01, cc01) + LDF [AO + 7 * SIZE], a4 + LDF [BO + 7 * SIZE], b4 + + add AO, 4 * SIZE, AO + cmp L, 0 + + bg,pt %icc, .LL83 + add BO, 4 * SIZE, BO + .align 4 + +.LL85: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL88 + nop + .align 4 + +.LL87: + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 1 * SIZE], a1 + LDF [BO + 1 * SIZE], b1 + + add AO, 1 * SIZE, AO + add L, -1, L + cmp L, 0 + bg,pt %icc, .LL87 + add BO, 1 * SIZE, BO + .align 4 + +.LL88: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 0, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + + FSUB a1, c01, c01 +#else + LDF [AO + 0 * SIZE], a1 + + FSUB a1, c01, c01 +#endif + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#if defined(RN) || defined(RT) + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] +#else + STF c01, [AO + 0 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + +#ifdef RT + sll K, BASE_SHIFT + 0, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 0, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + .align 4 + +.LL80: + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL89 + nop + .align 4 + +.LL72: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 0, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + FCLR (cc01) + LDF [BO + 3 * SIZE], b4 + FCLR (cc02) + + prefetch [C1 + 2 * SIZE], 3 + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL75 + nop + +.LL73: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + FMADD (aa2, bb1, cc02, cc02) + LDF [AO + 5 * SIZE], a2 + + LDF [BO + 4 * SIZE], b1 + cmp L, 0 + + FMADD (aa3, bb2, cc01, cc01) + LDF [AO + 6 * SIZE], a3 + FMADD (aa4, bb2, cc02, cc02) + LDF [AO + 7 * SIZE], a4 + + LDF [BO + 5 * SIZE], b2 + add BO, 4 * SIZE, BO + + FMADD (aa1, bb3, cc01, cc01) + LDF [AO + 8 * SIZE], a1 + FMADD (aa2, bb3, cc02, cc02) + LDF [AO + 9 * SIZE], a2 + + LDF [BO + 2 * SIZE], b3 + add AO, 8 * SIZE, AO + + FMADD (aa3, bb4, cc01, cc01) + LDF [AO + 2 * SIZE], a3 + FMADD (aa4, bb4, cc02, cc02) + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL73 + LDF [BO + 3 * SIZE], b4 + .align 4 + +.LL75: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL78 + nop + .align 4 + +.LL77: + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 2 * SIZE], a1 + FMADD (aa2, bb1, cc02, cc02) + LDF [AO + 3 * SIZE], a2 + + LDF [BO + 1 * SIZE], b1 + add L, -1, L + add AO, 2 * SIZE, AO + cmp L, 0 + bg,pt %icc, .LL77 + add BO, 1 * SIZE, BO + .align 4 + +.LL78: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 0, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + + FNMSUB (aa2, cc02, cc01, cc01) + + FMUL a3, c01, c01 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + + FNMSUB (aa2, cc01, cc02, cc02) + + FMUL a3, c02, c02 +#endif + +#if defined(RN) || defined(RT) + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 +#endif + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 0, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL72 + nop + .align 4 + +.LL89: +#ifdef LN + sll K, BASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 1, KK +#endif + +#ifdef RT + sub KK, 1, KK +#endif + .align 4 + +.LL999: +#ifdef TRMMKERNEL +#ifndef __64BIT__ + ld [%sp + STACK_START + 8], %g1 + ld [%sp + STACK_START + 12], %g2 + ld [%sp + STACK_START + 16], %g3 + ld [%sp + STACK_START + 20], %g4 +#else + ldx [%sp + STACK_START + 32], %g1 + ldx [%sp + STACK_START + 40], %g2 + ldx [%sp + STACK_START + 48], %g3 + ldx [%sp + STACK_START + 56], %g4 +#endif +#endif + + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/trsm_kernel_LT.S b/kernel/sparc/trsm_kernel_LT.S new file mode 100644 index 0000000..11df205 --- /dev/null +++ b/kernel/sparc/trsm_kernel_LT.S @@ -0,0 +1,4221 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 +#define K %i2 + +#if defined(DOUBLE) && !defined(__64BIT__) +#define A %i5 +#define B %i4 +#else +#define A %i4 +#define B %i5 +#endif + +#define C %o4 +#define LDC %o5 + +#define AO %l0 +#define BO %l1 +#define I %l2 +#define J %l3 +#define L %l4 + +#define C1 %o0 +#define C2 %o1 +#define C3 %o2 +#define C4 %o3 + +#define OFFSET %l5 +#define KK %l6 +#define TEMP1 %l7 +#define TEMP2 %i3 +#define AORIG %g1 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define t1 %f32 +#define t2 %f34 +#define t3 %f36 +#define t4 %f38 + +#define a1 %f40 +#define a2 %f42 +#define a3 %f44 +#define a4 %f46 +#define a5 %f58 + +#define b1 %f48 +#define b2 %f50 +#define b3 %f52 +#define b4 %f54 +#define b5 %f56 + +#define FZERO %f60 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define t1 %f16 +#define t2 %f17 +#define t3 %f18 +#define t4 %f19 + +#define a1 %f20 +#define a2 %f21 +#define a3 %f22 +#define a4 %f23 +#define a5 %f31 + +#define b1 %f24 +#define b2 %f25 +#define b3 %f26 +#define b4 %f27 +#define b5 %f28 + +#define FZERO %f29 +#endif + + PROLOGUE + SAVESP + nop + +#ifndef __64BIT__ +#ifdef DOUBLE + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC + ld [%sp + STACK_START + 40], OFFSET +#else + ld [%sp + STACK_START + 28], C + ld [%sp + STACK_START + 32], LDC + ld [%sp + STACK_START + 36], OFFSET +#endif +#else + ldx [%sp+ STACK_START + 56], C + ldx [%sp+ STACK_START + 64], LDC + ldx [%sp+ STACK_START + 72], OFFSET +#endif + + FCLR(29) + + sll LDC, BASE_SHIFT, LDC + +#ifdef LN + smul M, K, TEMP1 + sll TEMP1, BASE_SHIFT, TEMP1 + add A, TEMP1, A + + sll M, BASE_SHIFT, TEMP1 + add C, TEMP1, C +#endif + +#ifdef RN + neg OFFSET, KK +#endif + +#ifdef RT + smul N, K, TEMP1 + sll TEMP1, BASE_SHIFT, TEMP1 + add B, TEMP1, B + + smul N, LDC, TEMP1 + add C, TEMP1, C + + sub N, OFFSET, KK +#endif + + sra N, 2, J + cmp J, 0 + ble,pn %icc, .LL100 + nop + +.LL11: +#ifdef RT + sll K, 2 + BASE_SHIFT, TEMP1 + sub B, TEMP1, B + + sll LDC, 2, TEMP1 + sub C, TEMP1, C +#endif + + add C, LDC, C2 + FMOV FZERO, t1 + nop + mov C, C1 + + add C2, LDC, C3 + FMOV FZERO, t2 + sra M, 2, I + add C3, LDC, C4 + FMOV FZERO, t3 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + cmp I, 0 +#ifndef RT + add C4, LDC, C +#endif + FMOV FZERO, t4 + + ble,pn %icc, .LL50 + FMOV FZERO, c01 + +.LL21: + FMOV FZERO, c02 + FMOV FZERO, c03 + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 2 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 2 + BASE_SHIFT, TEMP1 + + add AORIG, TEMP1, AO + add B, TEMP1, BO + + sub K, KK, TEMP1 + + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c04 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c05 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c06 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c07 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c08 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c09 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c10 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c11 + LDF [BO + 4 * SIZE], b5 /* ***** */ + + LDF [AO + 4 * SIZE], a5 /* ***** */ + + prefetch [C1 + 3 * SIZE], 3 + FMOV FZERO, c12 + prefetch [C2 + 3 * SIZE], 3 + FMOV FZERO, c13 + prefetch [C3 + 3 * SIZE], 3 + FMOV FZERO, c14 + prefetch [C4 + 3 * SIZE], 3 + FMOV FZERO, c15 + + ble,pn %icc, .LL25 + FMOV FZERO, c16 + +#define APREFETCHSIZE 40 +#define BPREFETCHSIZE 40 + +#define APREFETCH_CATEGORY 0 +#define BPREFETCH_CATEGORY 0 + +.LL22: + FADD c04, t1, c04 + prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY + FMUL a1, b1, t1 + nop + + FADD c08, t2, c08 + prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY + FMUL a1, b2, t2 + add AO, 16 * SIZE, AO + + FADD c12, t3, c12 + LDF [AO - 13 * SIZE], a4 + FMUL a1, b3, t3 + add BO, 16 * SIZE, BO + + FADD c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 8 * SIZE], a1 + + FADD c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + add L, -1, L + FMUL a2, b4, t4 + LDF [AO - 11 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 10 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 10 * SIZE], b3 + + FADD c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 9 * SIZE], a4 + + FADD c08, t2, c08 + nop + FMUL a5, b2, t2 + nop + + FADD c12, t3, c12 + nop + FMUL a5, b3, t3 + nop + + FADD c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO - 4 * SIZE], a5 + + FADD c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 6 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b5, t1 + LDF [BO - 4 * SIZE], b5 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + + FADD c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD c04, t1, c04 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD c08, t2, c08 + nop + FMUL a1, b2, t2 + nop + + FADD c12, t3, c12 + nop + FMUL a1, b3, t3 + nop + + FADD c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 0 * SIZE], a1 + + FADD c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + +#ifdef DOUBLE + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY +#else + nop +#endif + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + nop + + FADD c02, t1, c02 + nop + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + + FADD c06, t2, c06 +#ifdef DOUBLE + prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY +#else + nop +#endif + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 0 * SIZE], b1 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 1 * SIZE], a4 + + FADD c08, t2, c08 + FMUL a5, b2, t2 + FADD c12, t3, c12 + FMUL a5, b3, t3 + + FADD c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO + 4 * SIZE], a5 + + FADD c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD c03, t1, c03 + cmp L, 0 + FMUL a4, b5, t1 + LDF [BO + 4 * SIZE], b5 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL22 + LDF [BO + 3 * SIZE], b4 + +.LL25: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL29 + nop + +.LL26: + FADD c04, t1, c04 + LDF [AO + 3 * SIZE], a4 + FMUL a1, b1, t1 + add AO, 4 * SIZE, AO + + FADD c08, t2, c08 + add BO, 4 * SIZE, BO + FMUL a1, b2, t2 + add L, -1, L + + FADD c12, t3, c12 + nop + FMUL a1, b3, t3 + cmp L, 0 + + FADD c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL26 + LDF [BO + 3 * SIZE], b4 + +.LL29: +#if defined(LN) || defined(RT) + sub KK, 4, TEMP1 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + + FADD c04, t1, c04 + FADD c08, t2, c08 + FADD c12, t3, c12 + FADD c16, t4, c16 + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c05, c05 + FSUB a3, c09, c09 + FSUB a4, c13, c13 + + FSUB b1, c02, c02 + FSUB b2, c06, c06 + FSUB b3, c10, c10 + FSUB b4, c14, c14 + + LDF [BO + 8 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 10 * SIZE], a3 + LDF [BO + 11 * SIZE], a4 + + LDF [BO + 12 * SIZE], b1 + LDF [BO + 13 * SIZE], b2 + LDF [BO + 14 * SIZE], b3 + LDF [BO + 15 * SIZE], b4 + + FSUB a1, c03, c03 + FSUB a2, c07, c07 + FSUB a3, c11, c11 + FSUB a4, c15, c15 + + FSUB b1, c04, c04 + FSUB b2, c08, c08 + FSUB b3, c12, c12 + FSUB b4, c16, c16 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 + + LDF [AO + 8 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 10 * SIZE], a3 + LDF [AO + 11 * SIZE], a4 + + LDF [AO + 12 * SIZE], b1 + LDF [AO + 13 * SIZE], b2 + LDF [AO + 14 * SIZE], b3 + LDF [AO + 15 * SIZE], b4 + + FSUB a1, c09, c09 + FSUB a2, c10, c10 + FSUB a3, c11, c11 + FSUB a4, c12, c12 + + FSUB b1, c13, c13 + FSUB b2, c14, c14 + FSUB b3, c15, c15 + FSUB b4, c16, c16 +#endif + +#ifdef LN + LDF [AO + 15 * SIZE], a1 + LDF [AO + 14 * SIZE], a2 + LDF [AO + 13 * SIZE], a3 + LDF [AO + 12 * SIZE], a4 + + FMUL a1, c04, c04 + FMUL a1, c08, c08 + FMUL a1, c12, c12 + FMUL a1, c16, c16 + + FMUL a2, c04, t1 + FMUL a2, c08, t2 + FMUL a2, c12, t3 + FMUL a2, c16, t4 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FSUB c11, t3, c11 + FSUB c15, t4, c15 + + FMUL a3, c04, t1 + FMUL a3, c08, t2 + FMUL a3, c12, t3 + FMUL a3, c16, t4 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FSUB c10, t3, c10 + FSUB c14, t4, c14 + + FMUL a4, c04, t1 + FMUL a4, c08, t2 + FMUL a4, c12, t3 + FMUL a4, c16, t4 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + FSUB c09, t3, c09 + FSUB c13, t4, c13 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 8 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c07, c07 + FMUL a1, c11, c11 + FMUL a1, c15, c15 + + FMUL a2, c03, t1 + FMUL a2, c07, t2 + FMUL a2, c11, t3 + FMUL a2, c15, t4 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FSUB c10, t3, c10 + FSUB c14, t4, c14 + + FMUL a3, c03, t1 + FMUL a3, c07, t2 + FMUL a3, c11, t3 + FMUL a3, c15, t4 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + FSUB c09, t3, c09 + FSUB c13, t4, c13 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 4 * SIZE], a2 + + FMUL a1, c02, c02 + FMUL a1, c06, c06 + FMUL a1, c10, c10 + FMUL a1, c14, c14 + + FMUL a2, c02, t1 + FMUL a2, c06, t2 + FMUL a2, c10, t3 + FMUL a2, c14, t4 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + FSUB c09, t3, c09 + FSUB c13, t4, c13 + + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c05, c05 + FMUL a1, c09, c09 + FMUL a1, c13, c13 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c05, c05 + FMUL a1, c09, c09 + FMUL a1, c13, c13 + + FMUL a2, c01, t1 + FMUL a2, c05, t2 + FMUL a2, c09, t3 + FMUL a2, c13, t4 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FSUB c10, t3, c10 + FSUB c14, t4, c14 + + FMUL a3, c01, t1 + FMUL a3, c05, t2 + FMUL a3, c09, t3 + FMUL a3, c13, t4 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FSUB c11, t3, c11 + FSUB c15, t4, c15 + + FMUL a4, c01, t1 + FMUL a4, c05, t2 + FMUL a4, c09, t3 + FMUL a4, c13, t4 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + FSUB c12, t3, c12 + FSUB c16, t4, c16 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 6 * SIZE], a2 + LDF [AO + 7 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c06, c06 + FMUL a1, c10, c10 + FMUL a1, c14, c14 + + FMUL a2, c02, t1 + FMUL a2, c06, t2 + FMUL a2, c10, t3 + FMUL a2, c14, t4 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FSUB c11, t3, c11 + FSUB c15, t4, c15 + + FMUL a3, c02, t1 + FMUL a3, c06, t2 + FMUL a3, c10, t3 + FMUL a3, c14, t4 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + FSUB c12, t3, c12 + FSUB c16, t4, c16 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 11 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a1, c07, c07 + FMUL a1, c11, c11 + FMUL a1, c15, c15 + + FMUL a2, c03, t1 + FMUL a2, c07, t2 + FMUL a2, c11, t3 + FMUL a2, c15, t4 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + FSUB c12, t3, c12 + FSUB c16, t4, c16 + + LDF [AO + 15 * SIZE], a1 + + FMUL a1, c04, c04 + FMUL a1, c08, c08 + FMUL a1, c12, c12 + FMUL a1, c16, c16 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FMUL a2, c01, t1 + FMUL a2, c02, t2 + FMUL a2, c03, t3 + FMUL a2, c04, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FSUB c07, t3, c07 + FSUB c08, t4, c08 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a3, c03, t3 + FMUL a3, c04, t4 + + FSUB c09, t1, c09 + FSUB c10, t2, c10 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FMUL a4, c01, t1 + FMUL a4, c02, t2 + FMUL a4, c03, t3 + FMUL a4, c04, t4 + + FSUB c13, t1, c13 + FSUB c14, t2, c14 + FSUB c15, t3, c15 + FSUB c16, t4, c16 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + FMUL a2, c07, t3 + FMUL a2, c08, t4 + + FSUB c09, t1, c09 + FSUB c10, t2, c10 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FMUL a3, c05, t1 + FMUL a3, c06, t2 + FMUL a3, c07, t3 + FMUL a3, c08, t4 + + FSUB c13, t1, c13 + FSUB c14, t2, c14 + FSUB c15, t3, c15 + FSUB c16, t4, c16 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c09, c09 + FMUL a1, c10, c10 + FMUL a1, c11, c11 + FMUL a1, c12, c12 + + FMUL a2, c09, t1 + FMUL a2, c10, t2 + FMUL a2, c11, t3 + FMUL a2, c12, t4 + + FSUB c13, t1, c13 + FSUB c14, t2, c14 + FSUB c15, t3, c15 + FSUB c16, t4, c16 + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c13, c13 + FMUL a1, c14, c14 + FMUL a1, c15, c15 + FMUL a1, c16, c16 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c13, c13 + FMUL a1, c14, c14 + FMUL a1, c15, c15 + FMUL a1, c16, c16 + + FMUL a2, c13, t1 + FMUL a2, c14, t2 + FMUL a2, c15, t3 + FMUL a2, c16, t4 + + FSUB c09, t1, c09 + FSUB c10, t2, c10 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FMUL a3, c13, t1 + FMUL a3, c14, t2 + FMUL a3, c15, t3 + FMUL a3, c16, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FSUB c07, t3, c07 + FSUB c08, t4, c08 + + FMUL a4, c13, t1 + FMUL a4, c14, t2 + FMUL a4, c15, t3 + FMUL a4, c16, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c09, c09 + FMUL a1, c10, c10 + FMUL a1, c11, c11 + FMUL a1, c12, c12 + + FMUL a2, c09, t1 + FMUL a2, c10, t2 + FMUL a2, c11, t3 + FMUL a2, c12, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FSUB c07, t3, c07 + FSUB c08, t4, c08 + + FMUL a3, c09, t1 + FMUL a3, c10, t2 + FMUL a3, c11, t3 + FMUL a3, c12, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + FMUL a2, c07, t3 + FMUL a2, c08, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 + add C2, -4 * SIZE, C2 + add C3, -4 * SIZE, C3 + add C4, -4 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c05, [BO + 1 * SIZE] + STF c09, [BO + 2 * SIZE] + STF c13, [BO + 3 * SIZE] + + STF c02, [BO + 4 * SIZE] + STF c06, [BO + 5 * SIZE] + STF c10, [BO + 6 * SIZE] + STF c14, [BO + 7 * SIZE] + + STF c03, [BO + 8 * SIZE] + STF c07, [BO + 9 * SIZE] + STF c11, [BO + 10 * SIZE] + STF c15, [BO + 11 * SIZE] + + STF c04, [BO + 12 * SIZE] + STF c08, [BO + 13 * SIZE] + STF c12, [BO + 14 * SIZE] + STF c16, [BO + 15 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] + + STF c09, [AO + 8 * SIZE] + STF c10, [AO + 9 * SIZE] + STF c11, [AO + 10 * SIZE] + STF c12, [AO + 11 * SIZE] + + STF c13, [AO + 12 * SIZE] + STF c14, [AO + 13 * SIZE] + STF c15, [AO + 14 * SIZE] + STF c16, [AO + 15 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + STF c07, [C2 + 2 * SIZE] + STF c08, [C2 + 3 * SIZE] + + STF c09, [C3 + 0 * SIZE] + STF c10, [C3 + 1 * SIZE] + STF c11, [C3 + 2 * SIZE] + STF c12, [C3 + 3 * SIZE] + + STF c13, [C4 + 0 * SIZE] + STF c14, [C4 + 1 * SIZE] + STF c15, [C4 + 2 * SIZE] + STF c16, [C4 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 4 * SIZE, C1 + add C2, 4 * SIZE, C2 + add C3, 4 * SIZE, C3 + add C4, 4 * SIZE, C4 +#endif + +#ifdef RT + sll K, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 4, KK +#endif + +#ifdef LN + sub KK, 4, KK +#endif + + add I, -1, I + cmp I, 0 + + bg,pt %icc, .LL21 + FMOV FZERO, c01 + +.LL50: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL70 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + BASE_SHIFT, TEMP1 + sll KK, 2 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + FMOV FZERO, c02 + FMOV FZERO, t1 + FMOV FZERO, c04 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t2 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c06 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t3 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c08 + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t4 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c01 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c03 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c05 + + ble,pn %icc, .LL55 + FMOV FZERO, c07 + +.LL52: + FADD c02, t1, c02 + add AO, 8 * SIZE, AO + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FMUL a1, b1, t1 + add BO, 16 * SIZE, BO + + FADD c04, t2, c04 + add L, -1, L + FMUL a1, b2, t2 + + FADD c06, t3, c06 + cmp L, 0 + FMUL a1, b3, t3 + + FADD c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO - 4 * SIZE], a1 + + FADD c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 12 * SIZE], b1 + FADD c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 10 * SIZE], b3 + FADD c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + FADD c04, t2, c04 + FMUL a3, b2, t2 + + FADD c06, t3, c06 + FMUL a3, b3, t3 + FADD c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + FADD c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + FADD c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD c02, t1, c02 + FMUL a1, b1, t1 + LDF [AO - 1 * SIZE], a4 + FADD c04, t2, c04 + FMUL a1, b2, t2 + + FADD c06, t3, c06 + FMUL a1, b3, t3 + FADD c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 4 * SIZE], b1 + + FADD c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 2 * SIZE], b3 + FADD c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO + 1 * SIZE], a2 + FADD c04, t2, c04 + FMUL a3, b2, t2 + + FADD c06, t3, c06 + FMUL a3, b3, t3 + FADD c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL52 + LDF [AO + 3 * SIZE], a4 + +.LL55: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + FADD c02, t1, c02 + add AO, 2 * SIZE, AO + FMUL a1, b1, t1 + add L, -1, L + + add BO, 4 * SIZE, BO + FADD c04, t2, c04 + cmp L, 0 + FMUL a1, b2, t2 + + FADD c06, t3, c06 + FMUL a1, b3, t3 + FADD c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL56 + LDF [AO + 1 * SIZE], a2 + +.LL59: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 4, TEMP1 +#endif + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + + FADD c02, t1, c02 + FADD c04, t2, c04 + FADD c06, t3, c06 + FADD c08, t4, c08 + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c02, c02 + FSUB b2, c04, c04 + FSUB b3, c06, c06 + FSUB b4, c08, c08 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + FMUL a1, c06, c06 + FMUL a1, c08, c08 + + FMUL a2, c02, t1 + FMUL a2, c04, t2 + FMUL a2, c06, t3 + FMUL a2, c08, t4 + + FSUB c01, t1, c01 + FSUB c03, t2, c03 + FSUB c05, t3, c05 + FSUB c07, t4, c07 + + FMUL a3, c01, c01 + FMUL a3, c03, c03 + FMUL a3, c05, c05 + FMUL a3, c07, c07 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 + + FMUL a2, c01, t1 + FMUL a2, c03, t2 + FMUL a2, c05, t3 + FMUL a2, c07, t4 + + FSUB c02, t1, c02 + FSUB c04, t2, c04 + FSUB c06, t3, c06 + FSUB c08, t4, c08 + + FMUL a3, c02, c02 + FMUL a3, c04, c04 + FMUL a3, c06, c06 + FMUL a3, c08, c08 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FMUL a2, c01, t1 + FMUL a2, c02, t2 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + + FMUL a4, c01, t1 + FMUL a4, c02, t2 + + FSUB c07, t1, c07 + FSUB c08, t2, c08 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FMUL a2, c03, t1 + FMUL a2, c04, t2 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + + FMUL a3, c03, t1 + FMUL a3, c04, t2 + + FSUB c07, t1, c07 + FSUB c08, t2, c08 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + + FSUB c07, t1, c07 + FSUB c08, t2, c08 + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c07, c07 + FMUL a1, c08, c08 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FMUL a2, c07, t1 + FMUL a2, c08, t2 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + + FMUL a3, c07, t1 + FMUL a3, c08, t2 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + + FMUL a4, c07, t1 + FMUL a4, c08, t2 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + + FMUL a3, c05, t1 + FMUL a3, c06, t2 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FMUL a2, c03, t1 + FMUL a2, c04, t2 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 + add C3, -2 * SIZE, C3 + add C4, -2 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] + + STF c02, [BO + 4 * SIZE] + STF c04, [BO + 5 * SIZE] + STF c06, [BO + 6 * SIZE] + STF c08, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + + STF c05, [C3 + 0 * SIZE] + STF c06, [C3 + 1 * SIZE] + STF c07, [C4 + 0 * SIZE] + STF c08, [C4 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 + add C3, 2 * SIZE, C3 + add C4, 2 * SIZE, C4 +#endif + +#ifdef RT + sll K, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + +.LL70: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL99 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 0 + BASE_SHIFT, TEMP1 + sll KK, 2 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + + ble,pn %icc, .LL75 + nop + +.LL72: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [BO + 4 * SIZE], b1 + + FADD c02, t2, c02 + cmp L, 0 + FMUL a1, b2, t2 + LDF [BO + 5 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a1, b3, t3 + LDF [BO + 6 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a1, b4, t4 + LDF [BO + 7 * SIZE], b4 + LDF [AO + 4 * SIZE], a1 + + FADD c01, t1, c01 + add AO, 4 * SIZE, AO + FMUL a2, b1, t1 + LDF [BO + 8 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a2, b2, t2 + LDF [BO + 9 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a2, b3, t3 + LDF [BO + 10 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a2, b4, t4 + LDF [BO + 11 * SIZE], b4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b1, t1 + LDF [BO + 12 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a3, b2, t2 + LDF [BO + 13 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a3, b3, t3 + LDF [BO + 14 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a3, b4, t4 + LDF [BO + 15 * SIZE], b4 + LDF [AO + 2 * SIZE], a3 + + FADD c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO + 16 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a4, b2, t2 + LDF [BO + 17 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 18 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [BO + 19 * SIZE], b4 + + add BO, 16 * SIZE, BO + bg,pt %icc, .LL72 + LDF [AO + 3 * SIZE], a4 + +.LL75: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL79 + nop + +.LL76: + FADD c01, t1, c01 + add AO, 1 * SIZE, AO + FMUL a1, b1, t1 + LDF [BO + 4 * SIZE], b1 + + FADD c02, t2, c02 + add L, -1, L + FMUL a1, b2, t2 + LDF [BO + 5 * SIZE], b2 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a1, b3, t3 + LDF [BO + 6 * SIZE], b3 + + FADD c04, t4, c04 + add BO, 4 * SIZE, BO + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + bg,pt %icc, .LL76 + LDF [BO + 3 * SIZE], b4 + + +.LL79: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 4, TEMP1 +#endif + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a2, c01, t1 + FSUB c02, t1, c02 + FMUL a3, c01, t1 + FSUB c03, t1, c03 + FMUL a4, c01, t1 + FSUB c04, t1, c04 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c03, t1, c03 + FMUL a3, c02, t1 + FSUB c04, t1, c04 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a2, c03, t1 + FSUB c04, t1, c04 + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c04, c04 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c04, c04 + FMUL a2, c04, t1 + FSUB c03, t1, c03 + FMUL a3, c04, t1 + FSUB c02, t1, c02 + FMUL a4, c04, t1 + FSUB c01, t1, c01 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a2, c03, t1 + FSUB c02, t1, c02 + FMUL a3, c03, t1 + FSUB c01, t1, c01 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c01, t1, c01 + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 + add C3, -1 * SIZE, C3 + add C4, -1 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c03, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C2 + 0 * SIZE] + STF c03, [C3 + 0 * SIZE] + STF c04, [C4 + 0 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 1 * SIZE, C1 + add C2, 1 * SIZE, C2 + add C3, 1 * SIZE, C3 + add C4, 1 * SIZE, C4 +#endif + +#ifdef RT + sll K, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + +.LL99: +#ifdef LN + sll K, 2 + BASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 4, KK +#endif + +#ifdef RT + sub KK, 4, KK +#endif + + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + +.LL100: /* n & 2 */ + and N, 2, J + cmp J, 0 + ble,pn %icc, .LL200 + nop + +#ifdef RT + sll K, 1 + BASE_SHIFT, TEMP1 + sub B, TEMP1, B + + sll LDC, 1, TEMP1 + sub C, TEMP1, C +#endif + + mov C, C1 + add C, LDC, C2 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + +#ifndef RT + add C2, LDC, C +#endif + + sra M, 2, I + cmp I, 0 + ble,pn %icc, .LL150 + FMOV FZERO, c03 + +.LL121: +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 2 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 2 + BASE_SHIFT, TEMP1 + sll KK, 1 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t1 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c07 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t2 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c04 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t3 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, t4 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c01 + + prefetch [C1 + 3 * SIZE], 2 + FMOV FZERO, c05 + prefetch [C2 + 3 * SIZE], 2 + FMOV FZERO, c02 + + ble,pn %icc, .LL125 + FMOV FZERO, c06 + +.LL122: + FADD c03, t1, c03 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD c07, t2, c07 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD c04, t3, c04 + add AO, 16 * SIZE, AO + FMUL a2, b1, t3 + cmp L, 0 + + FADD c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 11 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 10 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO - 3 * SIZE], b2 + + FADD c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 9 * SIZE], a4 + + FADD c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO - 8 * SIZE], a1 + + FADD c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO - 6 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD c06, t4, c06 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD c03, t1, c03 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD c07, t2, c07 + nop + FMUL a1, b2, t2 + LDF [AO - 4 * SIZE], a1 + + FADD c04, t3, c04 + nop + FMUL a2, b1, t3 + nop + + FADD c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 3 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 2 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + + FADD c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 1 * SIZE], a4 + + FADD c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO + 0 * SIZE], a1 + + FADD c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO + 2 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c06, t4, c06 + FMUL a4, b4, t4 + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL122 + LDF [BO + 3 * SIZE], b4 + +.LL125: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL129 + nop + +.LL126: + FADD c03, t1, c03 + add AO, 4 * SIZE, AO + FMUL a1, b1, t1 + add BO, 2 * SIZE, BO + + FADD c07, t2, c07 + add L, -1, L + FMUL a1, b2, t2 + LDF [AO + 0 * SIZE], a1 + + FADD c04, t3, c04 + cmp L, 0 + FMUL a2, b1, t3 + + FADD c08, t4, c08 + FMUL a2, b2, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b1, t1 + FADD c05, t2, c05 + FMUL a3, b2, t2 + LDF [AO + 2 * SIZE], a3 + + FADD c02, t3, c02 + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + FADD c06, t4, c06 + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + bg,pt %icc, .LL126 + LDF [AO + 3 * SIZE], a4 + +.LL129: + FADD c03, t1, c03 + FADD c07, t2, c07 + FADD c04, t3, c04 + FADD c08, t4, c08 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 4, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, 2 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c05, c05 + FSUB a3, c02, c02 + FSUB a4, c06, c06 + + FSUB b1, c03, c03 + FSUB b2, c07, c07 + FSUB b3, c04, c04 + FSUB b4, c08, c08 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 +#endif + +#ifdef LN + LDF [AO + 15 * SIZE], a1 + LDF [AO + 14 * SIZE], a2 + LDF [AO + 13 * SIZE], a3 + LDF [AO + 12 * SIZE], a4 + + FMUL a1, c04, c04 + FMUL a1, c08, c08 + FMUL a2, c04, t1 + FMUL a2, c08, t2 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FMUL a3, c04, t1 + FMUL a3, c08, t2 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FMUL a4, c04, t1 + FMUL a4, c08, t2 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 8 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c07, c07 + FMUL a2, c03, t1 + FMUL a2, c07, t2 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FMUL a3, c03, t1 + FMUL a3, c07, t2 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 4 * SIZE], a2 + + FMUL a1, c02, c02 + FMUL a1, c06, c06 + FMUL a2, c02, t1 + FMUL a2, c06, t2 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c05, c05 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c05, c05 + FMUL a2, c01, t1 + FMUL a2, c05, t2 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FMUL a3, c01, t1 + FMUL a3, c05, t2 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FMUL a4, c01, t1 + FMUL a4, c05, t2 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 6 * SIZE], a2 + LDF [AO + 7 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c06, c06 + FMUL a2, c02, t1 + FMUL a2, c06, t2 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FMUL a3, c02, t1 + FMUL a3, c06, t2 + FSUB c04, t1, c04 + FSUB c08, t2, c08 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 11 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a1, c07, c07 + FMUL a2, c03, t1 + FMUL a2, c07, t2 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + + LDF [AO + 15 * SIZE], a1 + + FMUL a1, c04, c04 + FMUL a1, c08, c08 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FMUL a2, c01, t1 + FMUL a2, c02, t2 + FMUL a2, c03, t3 + FMUL a2, c04, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FSUB c07, t3, c07 + FSUB c08, t4, c08 + + FMUL a3, c05, c05 + FMUL a3, c06, c06 + FMUL a3, c07, c07 + FMUL a3, c08, c08 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + LDF [BO + 0 * SIZE], a3 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + FMUL a2, c07, t3 + FMUL a2, c08, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + FMUL a3, c01, c01 + FMUL a3, c02, c02 + FMUL a3, c03, c03 + FMUL a3, c04, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 + add C2, -4 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c05, [BO + 1 * SIZE] + STF c02, [BO + 2 * SIZE] + STF c06, [BO + 3 * SIZE] + + STF c03, [BO + 4 * SIZE] + STF c07, [BO + 5 * SIZE] + STF c04, [BO + 6 * SIZE] + STF c08, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + STF c07, [C2 + 2 * SIZE] + STF c08, [C2 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 4 * SIZE, C1 + add C2, 4 * SIZE, C2 +#endif + +#ifdef RT + sll K, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 2 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 4, KK +#endif + +#ifdef LN + sub KK, 4, KK +#endif + + add I, -1, I + cmp I, 0 + + bg,pt %icc, .LL121 + FMOV FZERO, c03 + +.LL150: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL170 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + BASE_SHIFT, TEMP1 + sll KK, 1 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + cmp L, 0 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + ble,pn %icc, .LL155 + nop + +.LL152: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD c02, t2, c02 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a2, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD c04, t4, c04 + nop + FMUL a2, b2, t4 + LDF [AO + 5 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b3, t1 + LDF [BO - 3 * SIZE], b2 + + FADD c02, t2, c02 + nop + FMUL a3, b4, t2 + LDF [AO + 6 * SIZE], a3 + + FADD c03, t3, c03 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD c04, t4, c04 + nop + FMUL a4, b4, t4 + LDF [AO + 7 * SIZE], a4 + + FADD c01, t1, c01 + nop + FMUL a1, b1, t1 + LDF [BO - 1 * SIZE], b4 + + FADD c02, t2, c02 + FMUL a1, b2, t2 + LDF [AO + 8 * SIZE], a1 + + FADD c03, t3, c03 + FMUL a2, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD c04, t4, c04 + FMUL a2, b2, t4 + LDF [AO + 9 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b3, t1 + LDF [BO + 1 * SIZE], b2 + + FADD c02, t2, c02 + FMUL a3, b4, t2 + LDF [AO + 10 * SIZE], a3 + + FADD c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + bg,pt %icc, .LL152 + LDF [BO + 3 * SIZE], b4 + +.LL155: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL159 + nop + +.LL156: + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FMUL a1, b1, t1 + FMUL a1, b2, t2 + FMUL a2, b1, t3 + FMUL a2, b2, t4 + + add AO, 2 * SIZE, AO + add BO, 2 * SIZE, BO + + add L, -1, L + cmp L, 0 + bg,pt %icc, .LL156 + nop + +.LL159: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c02, c02 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + FMUL a2, c03, t1 + FMUL a2, c04, t2 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FMUL a3, c01, c01 + FMUL a3, c02, c02 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FMUL a2, c01, t1 + FMUL a2, c02, t2 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + + FMUL a3, c03, c03 + FMUL a3, c04, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a2, c01, t1 + FMUL a2, c03, t2 + + FSUB c02, t1, c02 + FSUB c04, t2, c04 + FMUL a3, c02, c02 + FMUL a3, c04, c04 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + LDF [BO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + + FMUL a2, c02, t1 + FMUL a2, c04, t2 + FSUB c01, t1, c01 + FSUB c03, t2, c03 + + FMUL a3, c01, c01 + FMUL a3, c03, c03 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c03, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c03, [AO + 1 * SIZE] + STF c02, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c03, [C1 + 1 * SIZE] + STF c02, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 +#endif + +#ifdef RT + sll K, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + +.LL170: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL199 + nop + + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 0 + BASE_SHIFT, TEMP1 + sll KK, 1 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + + ble,pn %icc, .LL175 + nop + +.LL172: + FADD c01, t1, c01 + add AO, 4 * SIZE, AO + FMUL a1, b1, t1 + LDF [BO + 4 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a1, b2, t2 + LDF [BO + 5 * SIZE], b2 + + add L, -1, L + LDF [AO + 0 * SIZE], a1 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a2, b3, t3 + LDF [BO + 6 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a2, b4, t4 + LDF [BO + 7 * SIZE], b4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b1, t1 + LDF [BO + 8 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a3, b2, t2 + LDF [BO + 9 * SIZE], b2 + LDF [AO + 2 * SIZE], a3 + + FADD c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 10 * SIZE], b3 + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [BO + 11 * SIZE], b4 + add BO, 8 * SIZE, BO + + bg,pt %icc, .LL172 + LDF [AO + 3 * SIZE], a4 + +.LL175: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL179 + nop + +.LL176: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + add AO, 1 * SIZE, AO + LDF [BO + 2 * SIZE], b1 + FADD c02, t2, c02 + cmp L, 0 + FMUL a1, b2, t2 + LDF [BO + 3 * SIZE], b2 + + add BO, 2 * SIZE, BO + bg,pt %icc, .LL176 + LDF [AO + 0 * SIZE], a1 + +.LL179: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FADD c01, c03, c01 + FADD c02, c04, c02 + + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a2, c01, t1 + FSUB c02, t1, c02 + FMUL a3, c02, c02 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + LDF [BO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c01, t1, c01 + FMUL a3, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C2 + 0 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 1 * SIZE, C1 + add C2, 1 * SIZE, C2 +#endif + +#ifdef RT + sll K, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + +.LL199: +#ifdef LN + sll K, 1 + BASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 2, KK +#endif + +#ifdef RT + sub KK, 2, KK +#endif + +.LL200: + and N, 1, J + + cmp J, 0 + ble,pn %icc, .LL999 + nop + +#ifdef RT + sll K, 0 + BASE_SHIFT, TEMP1 + sub B, TEMP1, B + + sub C, LDC, C +#endif + + mov C, C1 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + +#ifndef RT + add C, LDC, C +#endif + + sra M, 2, I + cmp I, 0 + ble,pn %icc, .LL250 + nop + +.LL221: +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 2 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 2 + BASE_SHIFT, TEMP1 + sll KK, 0 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + + ble,pn %icc, .LL225 + prefetch [C1 + 4 * SIZE], 2 + +.LL222: + FADD c01, t1, c01 + add BO, 4 * SIZE, BO + FMUL a1, b1, t1 + LDF [AO + 4 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b1, t2 + LDF [AO + 5 * SIZE], a2 + + FADD c03, t3, c03 + add L, -1, L + FMUL a3, b1, t3 + LDF [AO + 6 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b1, t4 + LDF [AO + 7 * SIZE], a4 + LDF [BO + 0 * SIZE], b1 + + FADD c01, t1, c01 + cmp L, 0 + FMUL a1, b2, t1 + LDF [AO + 8 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b2, t2 + LDF [AO + 9 * SIZE], a2 + + FADD c03, t3, c03 + FMUL a3, b2, t3 + LDF [AO + 10 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b2, t4 + LDF [AO + 11 * SIZE], a4 + LDF [BO + 1 * SIZE], b2 + + FADD c01, t1, c01 + FMUL a1, b3, t1 + LDF [AO + 12 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b3, t2 + LDF [AO + 13 * SIZE], a2 + + FADD c03, t3, c03 + FMUL a3, b3, t3 + LDF [AO + 14 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b3, t4 + LDF [AO + 15 * SIZE], a4 + LDF [BO + 2 * SIZE], b3 + + FADD c01, t1, c01 + FMUL a1, b4, t1 + LDF [AO + 16 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b4, t2 + LDF [AO + 17 * SIZE], a2 + + FADD c03, t3, c03 + FMUL a3, b4, t3 + LDF [AO + 18 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 19 * SIZE], a4 + add AO, 16 * SIZE, AO + + bg,pt %icc, .LL222 + LDF [BO + 3 * SIZE], b4 + +.LL225: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL229 + nop + +.LL226: + FADD c01, t1, c01 + add BO, 1 * SIZE, BO + FMUL a1, b1, t1 + LDF [AO + 4 * SIZE], a1 + + FADD c02, t2, c02 + add L, -1, L + FMUL a2, b1, t2 + LDF [AO + 5 * SIZE], a2 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a3, b1, t3 + LDF [AO + 6 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b1, t4 + LDF [AO + 7 * SIZE], a4 + add AO, 4 * SIZE, AO + + bg,pt %icc, .LL226 + LDF [BO + 0 * SIZE], b1 + +.LL229: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 4, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, 2 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 15 * SIZE], a1 + LDF [AO + 14 * SIZE], a2 + LDF [AO + 13 * SIZE], a3 + LDF [AO + 12 * SIZE], a4 + + FMUL a1, c04, c04 + FMUL a2, c04, t1 + + FSUB c03, t1, c03 + FMUL a3, c04, t1 + + FSUB c02, t1, c02 + FMUL a4, c04, t1 + + FSUB c01, t1, c01 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 8 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a2, c03, t1 + + FSUB c02, t1, c02 + FMUL a3, c03, t1 + FSUB c01, t1, c01 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 4 * SIZE], a2 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c01, t1, c01 + + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a2, c01, t1 + FSUB c02, t1, c02 + FMUL a3, c01, t1 + FSUB c03, t1, c03 + FMUL a4, c01, t1 + FSUB c04, t1, c04 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 6 * SIZE], a2 + LDF [AO + 7 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c03, t1, c03 + FMUL a3, c02, t1 + FSUB c04, t1, c04 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 11 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a2, c03, t1 + + FSUB c04, t1, c04 + + LDF [AO + 15 * SIZE], a1 + + FMUL a1, c04, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c03, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 4 * SIZE, C1 +#endif + +#ifdef RT + sll K, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 2 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 4, KK +#endif + +#ifdef LN + sub KK, 4, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL221 + nop + +.LL250: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL270 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + BASE_SHIFT, TEMP1 + sll KK, 0 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + + ble,pn %icc, .LL255 + nop + +.LL252: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [AO + 4 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b1, t2 + LDF [AO + 5 * SIZE], a2 + LDF [BO + 4 * SIZE], b1 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a3, b2, t3 + LDF [AO + 6 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b2, t4 + LDF [AO + 7 * SIZE], a4 + LDF [BO + 5 * SIZE], b2 + + FADD c01, t1, c01 + FMUL a1, b3, t1 + LDF [AO + 8 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b3, t2 + LDF [AO + 9 * SIZE], a2 + LDF [BO + 6 * SIZE], b3 + + FADD c03, t3, c03 + FMUL a3, b4, t3 + LDF [AO + 10 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 11 * SIZE], a4 + add AO, 8 * SIZE, AO + + LDF [BO + 7 * SIZE], b4 + bg,pt %icc, .LL252 + add BO, 4 * SIZE, BO + +.LL255: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + + cmp L, 0 + ble,a,pn %icc, .LL259 + nop + +.LL256: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [AO + 2 * SIZE], a1 + + FADD c02, t2, c02 + cmp L, 0 + FMUL a2, b1, t2 + LDF [AO + 3 * SIZE], a2 + + LDF [BO + 1 * SIZE], b1 + add AO, 2 * SIZE, AO + + bg,pt %icc, .LL256 + add BO, 1 * SIZE, BO + +.LL259: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FADD c01, c03, c01 + FADD c02, c04, c02 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c01, t1, c01 + FMUL a3, c01, c01 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a2, c01, t1 + FSUB c02, t1, c02 + FMUL a3, c02, c02 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 +#endif + +#ifdef RT + sll K, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + +.LL270: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL299 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 0 + BASE_SHIFT, TEMP1 + + add AORIG, TEMP1, AO + add B, TEMP1, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t1 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c01 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t2 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c02 + + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t3 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t4 + LDF [BO + 2 * SIZE], b3 + + ble,pn %icc, .LL275 + LDF [BO + 3 * SIZE], b4 + +.LL272: + FADD c01, t1, c01 + add L, -1, L + add AO, 4 * SIZE, AO + + FMUL a1, b1, t1 + add BO, 4 * SIZE, BO + LDF [AO + 0 * SIZE], a1 + + FADD c02, t2, c02 + cmp L, 0 + LDF [BO + 0 * SIZE], b1 + FMUL a2, b2, t2 + + LDF [AO + 1 * SIZE], a2 + FADD c01, t3, c01 + LDF [BO + 1 * SIZE], b2 + FMUL a3, b3, t3 + + LDF [AO + 2 * SIZE], a3 + FADD c02, t4, c02 + LDF [BO + 2 * SIZE], b3 + FMUL a4, b4, t4 + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL272 + LDF [BO + 3 * SIZE], b4 + +.LL275: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL279 + nop + +.LL276: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [AO + 1 * SIZE], a1 + + LDF [BO + 1 * SIZE], b1 + add BO, 1 * SIZE, BO + cmp L, 0 + bg,pt %icc, .LL276 + add AO, 1 * SIZE, AO + +.LL279: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c01, t3, c01 + FADD c02, t4, c02 + + FADD c01, c02, c01 + +#if defined(LN) || defined(RT) + sub KK, 1, TEMP1 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + FSUB a1, c01, c01 +#else + LDF [AO + 0 * SIZE], a1 + FSUB a1, c01, c01 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + FMUL a1, c01, c01 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + FMUL a1, c01, c01 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + FMUL a1, c01, c01 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] +#else + STF c01, [AO + 0 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 1 * SIZE, C1 +#endif + +#ifdef RT + sll K, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + + +.LL299: +#ifdef LN + sll K, 0 + BASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 1, KK +#endif + +#ifdef RT + sub KK, 1, KK +#endif + + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/trsm_kernel_LT_2x8.S b/kernel/sparc/trsm_kernel_LT_2x8.S new file mode 100644 index 0000000..39015d7 --- /dev/null +++ b/kernel/sparc/trsm_kernel_LT_2x8.S @@ -0,0 +1,3896 @@ +/*********************************************************************/ +/* Copyright 2005-2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define APREFETCHSIZE 24 +#define APREFETCH_CATEGORY 0 + +#define M %i0 +#define N %i1 +#define K %i2 + +#if defined(DOUBLE) && !defined(__64BIT__) +#define A %i5 +#define B %i4 +#else +#define A %i4 +#define B %i5 +#endif + +#define C %o4 +#define LDC %o5 + +#define AO %l0 +#define BO %l1 +#define I %l2 +#define J %l3 +#define L %l4 + +#define C1 %o0 +#define C2 %o1 +#define C3 %o2 +#define C4 %o3 + +#define C5 %l5 +#define C6 %l6 +#define C7 %l7 +#define C8 %i3 + +#define OFFSET %g1 +#define KK %g2 +#define TEMP1 %g3 +#define TEMP2 %g4 +#define AORIG %o7 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define a1 %f32 +#define a2 %f34 +#define a3 %f36 +#define a4 %f38 +#define a5 %f40 + +#define b1 %f42 +#define b2 %f44 +#define b3 %f46 +#define b4 %f48 +#define b5 %f50 +#define b6 %f52 +#define b7 %f54 +#define b8 %f56 +#define b9 %f58 + +#define cc01 0 +#define cc02 2 +#define cc03 4 +#define cc04 6 +#define cc05 8 +#define cc06 10 +#define cc07 12 +#define cc08 14 +#define cc09 16 +#define cc10 18 +#define cc11 20 +#define cc12 22 +#define cc13 24 +#define cc14 26 +#define cc15 28 +#define cc16 30 + +#define aa1 1 +#define aa2 3 +#define aa3 5 +#define aa4 7 +#define aa5 9 + +#define bb1 11 +#define bb2 13 +#define bb3 15 +#define bb4 17 +#define bb5 19 +#define bb6 21 +#define bb7 23 +#define bb8 25 +#define bb9 27 + +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define a1 %f16 +#define a2 %f17 +#define a3 %f18 +#define a4 %f19 +#define a5 %f20 + +#define b1 %f21 +#define b2 %f22 +#define b3 %f23 +#define b4 %f24 +#define b5 %f25 +#define b6 %f26 +#define b7 %f27 +#define b8 %f28 +#define b9 %f29 + +#define cc01 0 +#define cc02 1 +#define cc03 2 +#define cc04 3 +#define cc05 4 +#define cc06 5 +#define cc07 6 +#define cc08 7 +#define cc09 8 +#define cc10 9 +#define cc11 10 +#define cc12 11 +#define cc13 12 +#define cc14 13 +#define cc15 14 +#define cc16 15 + +#define aa1 16 +#define aa2 17 +#define aa3 18 +#define aa4 19 +#define aa5 20 + +#define bb1 21 +#define bb2 22 +#define bb3 23 +#define bb4 24 +#define bb5 25 +#define bb6 26 +#define bb7 27 +#define bb8 28 +#define bb9 29 + +#endif + + .register %g2, #scratch + .register %g3, #scratch + + PROLOGUE + SAVESP + nop + +#ifndef __64BIT__ + +#ifdef DOUBLE + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC + ld [%sp + STACK_START + 40], OFFSET +#else + ld [%sp + STACK_START + 28], C + ld [%sp + STACK_START + 32], LDC + ld [%sp + STACK_START + 36], OFFSET +#endif + st %g1, [%sp + STACK_START + 8] + st %g2, [%sp + STACK_START + 12] + st %g3, [%sp + STACK_START + 16] + st %g4, [%sp + STACK_START + 20] +#else + + ldx [%sp+ STACK_START + 56], C + ldx [%sp+ STACK_START + 64], LDC + ldx [%sp+ STACK_START + 72], OFFSET + + stx %g1, [%sp + STACK_START + 32] + stx %g2, [%sp + STACK_START + 40] + stx %g3, [%sp + STACK_START + 48] + stx %g4, [%sp + STACK_START + 56] +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg OFFSET, KK +#endif + + sll LDC, BASE_SHIFT, LDC + +#ifdef LN + smul M, K, TEMP1 + sll TEMP1, BASE_SHIFT, TEMP1 + add A, TEMP1, A + + sll M, BASE_SHIFT, TEMP1 + add C, TEMP1, C +#endif + +#ifdef RN + neg OFFSET, KK +#endif + +#ifdef RT + smul N, K, TEMP1 + sll TEMP1, BASE_SHIFT, TEMP1 + add B, TEMP1, B + + smul N, LDC, TEMP1 + add C, TEMP1, C + + sub N, OFFSET, KK +#endif + + sra N, 3, J + cmp J, 0 + ble,pn %icc, .LL30 + nop + .align 4 + +.LL11: +#ifdef RT + sll K, BASE_SHIFT + 3, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C2 + add C2, LDC, C3 + add C3, LDC, C4 + add C4, LDC, C5 + add C5, LDC, C6 + add C6, LDC, C7 + add C7, LDC, C8 + add C8, LDC, C +#else + sub C, LDC, C8 + sub C8, LDC, C7 + sub C7, LDC, C6 + sub C6, LDC, C5 + sub C5, LDC, C4 + sub C4, LDC, C3 + sub C3, LDC, C2 + sub C2, LDC, C1 + sub C2, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL20 + nop + .align 4 + +.LL12: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 3, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 8 * SIZE], a5 + + LDF [BO + 0 * SIZE], b1 + + LDF [BO + 1 * SIZE], b2 + FCLR (cc01) + LDF [BO + 2 * SIZE], b3 + FCLR (cc05) + LDF [BO + 3 * SIZE], b4 + FCLR (cc09) + LDF [BO + 4 * SIZE], b5 + FCLR (cc13) + + LDF [BO + 5 * SIZE], b6 + FCLR (cc02) + LDF [BO + 6 * SIZE], b7 + FCLR (cc06) + LDF [BO + 7 * SIZE], b8 + FCLR (cc10) + LDF [BO + 8 * SIZE], b9 + FCLR (cc14) + + prefetch [C1 + 1 * SIZE], 3 + FCLR (cc03) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc07) + prefetch [C3 + 1 * SIZE], 3 + FCLR (cc11) + prefetch [C4 + 2 * SIZE], 3 + FCLR (cc15) + + prefetch [C5 + 1 * SIZE], 3 + FCLR (cc04) + prefetch [C6 + 2 * SIZE], 3 + FCLR (cc08) + prefetch [C7 + 1 * SIZE], 3 + FCLR (cc12) + prefetch [C8 + 2 * SIZE], 3 + FCLR (cc16) + +#if defined(LT) || defined(RN) + sra KK, 3, L +#else + sub K, KK, L + sra L, 3, L +#endif + cmp L, 0 + ble,pn %icc, .LL15 + nop + .align 4 + +.LL13: + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 16 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 2 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 3 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 14 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 24 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 18 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 19 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 4 * SIZE], a1 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + add L, -1, L + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 20 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 21 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 22 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 23 * SIZE], b8 + + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 32 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 25 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 26 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 27 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 6 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 7 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + nop + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 28 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 29 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 30 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 31 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 40 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 33 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 34 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 35 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 16 * SIZE], a1 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 9 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + nop + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 36 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 37 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 38 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 39 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 48 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 41 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 42 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 43 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 10 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 11 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO + 44 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 45 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO + 46 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 47 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 56 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 49 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 50 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 51 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 12 * SIZE], a5 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 13 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + cmp L, 0 + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 52 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 53 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 54 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 55 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 64 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 57 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 58 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 59 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 14 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 15 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + add BO, 64 * SIZE, BO + FMADD (aa2, bb6, cc12, cc12) + add AO, 16 * SIZE, AO + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO - 4 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO - 3 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO - 2 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO - 1 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 8 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 1 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 8 * SIZE], a5 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 1 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + FMADD (aa4, bb6, cc12, cc12) + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 5 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + ble,pn %icc, .LL15 + LDF [BO + 7 * SIZE], b8 + + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 16 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 2 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 3 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 14 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 24 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 18 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 19 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 4 * SIZE], a1 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + add L, -1, L + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 20 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 21 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 22 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 23 * SIZE], b8 + + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 32 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 25 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 26 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 27 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 6 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 7 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + nop + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 28 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 29 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 30 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 31 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 40 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 33 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 34 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 35 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 16 * SIZE], a1 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 9 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + nop + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 36 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 37 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 38 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 39 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 48 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 41 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 42 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 43 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 10 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 11 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO + 44 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 45 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO + 46 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 47 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 56 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 49 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 50 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 51 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 12 * SIZE], a5 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 13 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + cmp L, 0 + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 52 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 53 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 54 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 55 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 64 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 57 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 58 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 59 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 14 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 15 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + add BO, 64 * SIZE, BO + FMADD (aa2, bb6, cc12, cc12) + add AO, 16 * SIZE, AO + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO - 4 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO - 3 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO - 2 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO - 1 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 8 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 1 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 8 * SIZE], a5 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 1 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + FMADD (aa4, bb6, cc12, cc12) + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 5 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + bg,pt %icc, .LL13 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL15: +#if defined(LT) || defined(RN) + and KK, 7, L +#else + sub K, KK, L + and L, 7, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL18 + nop + .align 4 + +.LL17: + FMADD (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD (aa2, bb1, cc02, cc02) + nop + + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 8 * SIZE], b1 + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + cmp L, 0 + FMADD (aa2, bb3, cc06, cc06) + nop + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + nop + FMADD (aa2, bb5, cc10, cc10) + nop + + FMADD (aa1, bb6, cc11, cc11) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb6, cc12, cc12) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb7, cc13, cc13) + add AO, 2 * SIZE, AO + FMADD (aa2, bb7, cc14, cc14) + add BO, 8 * SIZE, BO + + FMADD (aa1, bb8, cc15, cc15) + LDF [AO + 0 * SIZE], a1 + FMADD (aa2, bb8, cc16, cc16) + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 6 * SIZE], b7 + bg,pt %icc, .LL17 + LDF [BO + 7 * SIZE], b8 + nop + .align 4 + +.LL18: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 8, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 3, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c09, c09 + FSUB b2, c11, c11 + FSUB b3, c13, c13 + FSUB b4, c15, c15 + + LDF [BO + 8 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 10 * SIZE], a3 + LDF [BO + 11 * SIZE], a4 + + LDF [BO + 12 * SIZE], b1 + LDF [BO + 13 * SIZE], b2 + LDF [BO + 14 * SIZE], b3 + LDF [BO + 15 * SIZE], b4 + + FSUB a1, c02, c02 + FSUB a2, c04, c04 + FSUB a3, c06, c06 + FSUB a4, c08, c08 + + FSUB b1, c10, c10 + FSUB b2, c12, c12 + FSUB b3, c14, c14 + FSUB b4, c16, c16 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 + + LDF [AO + 8 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 10 * SIZE], a3 + LDF [AO + 11 * SIZE], a4 + + LDF [AO + 12 * SIZE], b1 + LDF [AO + 13 * SIZE], b2 + LDF [AO + 14 * SIZE], b3 + LDF [AO + 15 * SIZE], b4 + + FSUB a1, c09, c09 + FSUB a2, c10, c10 + FSUB a3, c11, c11 + FSUB a4, c12, c12 + + FSUB b1, c13, c13 + FSUB b2, c14, c14 + FSUB b3, c15, c15 + FSUB b4, c16, c16 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + FMUL a1, c06, c06 + FMUL a1, c08, c08 + FMUL a1, c10, c10 + FMUL a1, c12, c12 + FMUL a1, c14, c14 + FMUL a1, c16, c16 + + FNMSUB (aa2, cc02, cc01, cc01) + FNMSUB (aa2, cc04, cc03, cc03) + FNMSUB (aa2, cc06, cc05, cc05) + FNMSUB (aa2, cc08, cc07, cc07) + FNMSUB (aa2, cc10, cc09, cc09) + FNMSUB (aa2, cc12, cc11, cc11) + FNMSUB (aa2, cc14, cc13, cc13) + FNMSUB (aa2, cc16, cc15, cc15) + + FMUL a3, c01, c01 + FMUL a3, c03, c03 + FMUL a3, c05, c05 + FMUL a3, c07, c07 + FMUL a3, c09, c09 + FMUL a3, c11, c11 + FMUL a3, c13, c13 + FMUL a3, c15, c15 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 + FMUL a1, c09, c09 + FMUL a1, c11, c11 + FMUL a1, c13, c13 + FMUL a1, c15, c15 + + FNMSUB (aa2, cc01, cc02, cc02) + FNMSUB (aa2, cc03, cc04, cc04) + FNMSUB (aa2, cc05, cc06, cc06) + FNMSUB (aa2, cc07, cc08, cc08) + FNMSUB (aa2, cc09, cc10, cc10) + FNMSUB (aa2, cc11, cc12, cc12) + FNMSUB (aa2, cc13, cc14, cc14) + FNMSUB (aa2, cc15, cc16, cc16) + + FMUL a3, c02, c02 + FMUL a3, c04, c04 + FMUL a3, c06, c06 + FMUL a3, c08, c08 + FMUL a3, c10, c10 + FMUL a3, c12, c12 + FMUL a3, c14, c14 + FMUL a3, c16, c16 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa2, cc02, cc04, cc04) + FNMSUB (aa3, cc01, cc05, cc05) + FNMSUB (aa3, cc02, cc06, cc06) + FNMSUB (aa4, cc01, cc07, cc07) + FNMSUB (aa4, cc02, cc08, cc08) + FNMSUB (bb1, cc01, cc09, cc09) + FNMSUB (bb1, cc02, cc10, cc10) + FNMSUB (bb2, cc01, cc11, cc11) + FNMSUB (bb2, cc02, cc12, cc12) + FNMSUB (bb3, cc01, cc13, cc13) + FNMSUB (bb3, cc02, cc14, cc14) + FNMSUB (bb4, cc01, cc15, cc15) + FNMSUB (bb4, cc02, cc16, cc16) + + LDF [BO + 9 * SIZE], a1 + LDF [BO + 10 * SIZE], a2 + LDF [BO + 11 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + LDF [BO + 13 * SIZE], b1 + LDF [BO + 14 * SIZE], b2 + LDF [BO + 15 * SIZE], b3 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FNMSUB (aa2, cc03, cc05, cc05) + FNMSUB (aa2, cc04, cc06, cc06) + FNMSUB (aa3, cc03, cc07, cc07) + FNMSUB (aa3, cc04, cc08, cc08) + FNMSUB (aa4, cc03, cc09, cc09) + FNMSUB (aa4, cc04, cc10, cc10) + FNMSUB (bb1, cc03, cc11, cc11) + FNMSUB (bb1, cc04, cc12, cc12) + FNMSUB (bb2, cc03, cc13, cc13) + FNMSUB (bb2, cc04, cc14, cc14) + FNMSUB (bb3, cc03, cc15, cc15) + FNMSUB (bb3, cc04, cc16, cc16) + + LDF [BO + 18 * SIZE], a1 + LDF [BO + 19 * SIZE], a2 + LDF [BO + 20 * SIZE], a3 + LDF [BO + 21 * SIZE], a4 + LDF [BO + 22 * SIZE], b1 + LDF [BO + 23 * SIZE], b2 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + + FNMSUB (aa2, cc05, cc07, cc07) + FNMSUB (aa2, cc06, cc08, cc08) + FNMSUB (aa3, cc05, cc09, cc09) + FNMSUB (aa3, cc06, cc10, cc10) + FNMSUB (aa4, cc05, cc11, cc11) + FNMSUB (aa4, cc06, cc12, cc12) + FNMSUB (bb1, cc05, cc13, cc13) + FNMSUB (bb1, cc06, cc14, cc14) + FNMSUB (bb2, cc05, cc15, cc15) + FNMSUB (bb2, cc06, cc16, cc16) + + LDF [BO + 27 * SIZE], a1 + LDF [BO + 28 * SIZE], a2 + LDF [BO + 29 * SIZE], a3 + LDF [BO + 30 * SIZE], a4 + LDF [BO + 31 * SIZE], b1 + + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FNMSUB (aa2, cc07, cc09, cc09) + FNMSUB (aa2, cc08, cc10, cc10) + FNMSUB (aa3, cc07, cc11, cc11) + FNMSUB (aa3, cc08, cc12, cc12) + FNMSUB (aa4, cc07, cc13, cc13) + FNMSUB (aa4, cc08, cc14, cc14) + FNMSUB (bb1, cc07, cc15, cc15) + FNMSUB (bb1, cc08, cc16, cc16) + + LDF [BO + 36 * SIZE], a1 + LDF [BO + 37 * SIZE], a2 + LDF [BO + 38 * SIZE], a3 + LDF [BO + 39 * SIZE], a4 + + FMUL a1, c09, c09 + FMUL a1, c10, c10 + + FNMSUB (aa2, cc09, cc11, cc11) + FNMSUB (aa2, cc10, cc12, cc12) + FNMSUB (aa3, cc09, cc13, cc13) + FNMSUB (aa3, cc10, cc14, cc14) + FNMSUB (aa4, cc09, cc15, cc15) + FNMSUB (aa4, cc10, cc16, cc16) + + LDF [BO + 45 * SIZE], a1 + LDF [BO + 46 * SIZE], a2 + LDF [BO + 47 * SIZE], a3 + + FMUL a1, c11, c11 + FMUL a1, c12, c12 + + FNMSUB (aa2, cc11, cc13, cc13) + FNMSUB (aa2, cc12, cc14, cc14) + FNMSUB (aa3, cc11, cc15, cc15) + FNMSUB (aa3, cc12, cc16, cc16) + + LDF [BO + 54 * SIZE], a1 + LDF [BO + 55 * SIZE], a2 + + FMUL a1, c13, c13 + FMUL a1, c14, c14 + + FNMSUB (aa2, cc13, cc15, cc15) + FNMSUB (aa2, cc14, cc16, cc16) + + LDF [BO + 63 * SIZE], a1 + + FMUL a1, c15, c15 + FMUL a1, c16, c16 +#endif + +#ifdef RT + LDF [BO + 63 * SIZE], a1 + LDF [BO + 62 * SIZE], a2 + LDF [BO + 61 * SIZE], a3 + LDF [BO + 60 * SIZE], a4 + LDF [BO + 59 * SIZE], b1 + LDF [BO + 58 * SIZE], b2 + LDF [BO + 57 * SIZE], b3 + LDF [BO + 56 * SIZE], b4 + + FMUL a1, c16, c16 + FMUL a1, c15, c15 + + FNMSUB (aa2, cc16, cc14, cc14) + FNMSUB (aa2, cc15, cc13, cc13) + FNMSUB (aa3, cc16, cc12, cc12) + FNMSUB (aa3, cc15, cc11, cc11) + FNMSUB (aa4, cc16, cc10, cc10) + FNMSUB (aa4, cc15, cc09, cc09) + FNMSUB (bb1, cc16, cc08, cc08) + FNMSUB (bb1, cc15, cc07, cc07) + FNMSUB (bb2, cc16, cc06, cc06) + FNMSUB (bb2, cc15, cc05, cc05) + FNMSUB (bb3, cc16, cc04, cc04) + FNMSUB (bb3, cc15, cc03, cc03) + FNMSUB (bb4, cc16, cc02, cc02) + FNMSUB (bb4, cc15, cc01, cc01) + + LDF [BO + 54 * SIZE], a1 + LDF [BO + 53 * SIZE], a2 + LDF [BO + 52 * SIZE], a3 + LDF [BO + 51 * SIZE], a4 + LDF [BO + 50 * SIZE], b1 + LDF [BO + 49 * SIZE], b2 + LDF [BO + 48 * SIZE], b3 + + FMUL a1, c14, c14 + FMUL a1, c13, c13 + + FNMSUB (aa2, cc14, cc12, cc12) + FNMSUB (aa2, cc13, cc11, cc11) + FNMSUB (aa3, cc14, cc10, cc10) + FNMSUB (aa3, cc13, cc09, cc09) + FNMSUB (aa4, cc14, cc08, cc08) + FNMSUB (aa4, cc13, cc07, cc07) + FNMSUB (bb1, cc14, cc06, cc06) + FNMSUB (bb1, cc13, cc05, cc05) + FNMSUB (bb2, cc14, cc04, cc04) + FNMSUB (bb2, cc13, cc03, cc03) + FNMSUB (bb3, cc14, cc02, cc02) + FNMSUB (bb3, cc13, cc01, cc01) + + LDF [BO + 45 * SIZE], a1 + LDF [BO + 44 * SIZE], a2 + LDF [BO + 43 * SIZE], a3 + LDF [BO + 42 * SIZE], a4 + LDF [BO + 41 * SIZE], b1 + LDF [BO + 40 * SIZE], b2 + + FMUL a1, c12, c12 + FMUL a1, c11, c11 + + FNMSUB (aa2, cc12, cc10, cc10) + FNMSUB (aa2, cc11, cc09, cc09) + FNMSUB (aa3, cc12, cc08, cc08) + FNMSUB (aa3, cc11, cc07, cc07) + FNMSUB (aa4, cc12, cc06, cc06) + FNMSUB (aa4, cc11, cc05, cc05) + FNMSUB (bb1, cc12, cc04, cc04) + FNMSUB (bb1, cc11, cc03, cc03) + FNMSUB (bb2, cc12, cc02, cc02) + FNMSUB (bb2, cc11, cc01, cc01) + + LDF [BO + 36 * SIZE], a1 + LDF [BO + 35 * SIZE], a2 + LDF [BO + 34 * SIZE], a3 + LDF [BO + 33 * SIZE], a4 + LDF [BO + 32 * SIZE], b1 + + FMUL a1, c10, c10 + FMUL a1, c09, c09 + + FNMSUB (aa2, cc10, cc08, cc08) + FNMSUB (aa2, cc09, cc07, cc07) + FNMSUB (aa3, cc10, cc06, cc06) + FNMSUB (aa3, cc09, cc05, cc05) + FNMSUB (aa4, cc10, cc04, cc04) + FNMSUB (aa4, cc09, cc03, cc03) + FNMSUB (bb1, cc10, cc02, cc02) + FNMSUB (bb1, cc09, cc01, cc01) + + LDF [BO + 27 * SIZE], a1 + LDF [BO + 26 * SIZE], a2 + LDF [BO + 25 * SIZE], a3 + LDF [BO + 24 * SIZE], a4 + + FMUL a1, c08, c08 + FMUL a1, c07, c07 + + FNMSUB (aa2, cc08, cc06, cc06) + FNMSUB (aa2, cc07, cc05, cc05) + FNMSUB (aa3, cc08, cc04, cc04) + FNMSUB (aa3, cc07, cc03, cc03) + FNMSUB (aa4, cc08, cc02, cc02) + FNMSUB (aa4, cc07, cc01, cc01) + + LDF [BO + 18 * SIZE], a1 + LDF [BO + 17 * SIZE], a2 + LDF [BO + 16 * SIZE], a3 + + FMUL a1, c06, c06 + FMUL a1, c05, c05 + + FNMSUB (aa2, cc06, cc04, cc04) + FNMSUB (aa2, cc05, cc03, cc03) + FNMSUB (aa3, cc06, cc02, cc02) + FNMSUB (aa3, cc05, cc01, cc01) + + LDF [BO + 9 * SIZE], a1 + LDF [BO + 8 * SIZE], a2 + + FMUL a1, c04, c04 + FMUL a1, c03, c03 + + FNMSUB (aa2, cc04, cc02, cc02) + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c02, c02 + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 + add C3, -2 * SIZE, C3 + add C4, -2 * SIZE, C4 + add C5, -2 * SIZE, C5 + add C6, -2 * SIZE, C6 + add C7, -2 * SIZE, C7 + add C8, -2 * SIZE, C8 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] + + STF c09, [BO + 4 * SIZE] + STF c11, [BO + 5 * SIZE] + STF c13, [BO + 6 * SIZE] + STF c15, [BO + 7 * SIZE] + + STF c02, [BO + 8 * SIZE] + STF c04, [BO + 9 * SIZE] + STF c06, [BO + 10 * SIZE] + STF c08, [BO + 11 * SIZE] + + STF c10, [BO + 12 * SIZE] + STF c12, [BO + 13 * SIZE] + STF c14, [BO + 14 * SIZE] + STF c16, [BO + 15 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] + + STF c09, [AO + 8 * SIZE] + STF c10, [AO + 9 * SIZE] + STF c11, [AO + 10 * SIZE] + STF c12, [AO + 11 * SIZE] + + STF c13, [AO + 12 * SIZE] + STF c14, [AO + 13 * SIZE] + STF c15, [AO + 14 * SIZE] + STF c16, [AO + 15 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + + STF c05, [C3 + 0 * SIZE] + STF c06, [C3 + 1 * SIZE] + STF c07, [C4 + 0 * SIZE] + STF c08, [C4 + 1 * SIZE] + + STF c09, [C5 + 0 * SIZE] + STF c10, [C5 + 1 * SIZE] + STF c11, [C6 + 0 * SIZE] + STF c12, [C6 + 1 * SIZE] + + STF c13, [C7 + 0 * SIZE] + STF c14, [C7 + 1 * SIZE] + STF c15, [C8 + 0 * SIZE] + STF c16, [C8 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 + add C3, 2 * SIZE, C3 + add C4, 2 * SIZE, C4 + add C5, 2 * SIZE, C5 + add C6, 2 * SIZE, C6 + add C7, 2 * SIZE, C7 + add C8, 2 * SIZE, C8 +#endif + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 3, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL12 + nop + .align 4 + +.LL20: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL29 + nop + +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 3, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + FCLR (cc01) + LDF [BO + 1 * SIZE], b2 + FCLR (cc03) + LDF [BO + 2 * SIZE], b3 + FCLR (cc05) + LDF [BO + 3 * SIZE], b4 + FCLR (cc07) + LDF [BO + 4 * SIZE], b5 + FCLR (cc09) + LDF [BO + 5 * SIZE], b6 + FCLR (cc11) + LDF [BO + 6 * SIZE], b7 + FCLR (cc13) + LDF [BO + 7 * SIZE], b8 + FCLR (cc15) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL25 + LDF [BO + 8 * SIZE], b9 + .align 4 + +.LL23: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 16 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 10 * SIZE], b3 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [BO + 12 * SIZE], b5 + FMADD (aa1, bb6, cc11, cc11) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 14 * SIZE], b7 + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa2, bb9, cc01, cc01) + LDF [BO + 24 * SIZE], b9 + FMADD (aa2, bb2, cc03, cc03) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa2, bb3, cc05, cc05) + LDF [BO + 18 * SIZE], b3 + FMADD (aa2, bb4, cc07, cc07) + LDF [BO + 19 * SIZE], b4 + + FMADD (aa2, bb5, cc09, cc09) + LDF [BO + 20 * SIZE], b5 + FMADD (aa2, bb6, cc11, cc11) + LDF [BO + 21 * SIZE], b6 + + FMADD (aa2, bb7, cc13, cc13) + LDF [BO + 22 * SIZE], b7 + FMADD (aa2, bb8, cc15, cc15) + LDF [BO + 23 * SIZE], b8 + + LDF [AO + 4 * SIZE], a1 + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb1, cc01, cc01) + LDF [BO + 32 * SIZE], b1 + FMADD (aa3, bb2, cc03, cc03) + LDF [BO + 25 * SIZE], b2 + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 26 * SIZE], b3 + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 27 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [BO + 28 * SIZE], b5 + FMADD (aa3, bb6, cc11, cc11) + LDF [BO + 29 * SIZE], b6 + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 30 * SIZE], b7 + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 31 * SIZE], b8 + + FMADD (aa4, bb9, cc01, cc01) + LDF [BO + 40 * SIZE], b9 + FMADD (aa4, bb2, cc03, cc03) + LDF [BO + 33 * SIZE], b2 + + FMADD (aa4, bb3, cc05, cc05) + LDF [BO + 34 * SIZE], b3 + FMADD (aa4, bb4, cc07, cc07) + LDF [BO + 35 * SIZE], b4 + + FMADD (aa4, bb5, cc09, cc09) + LDF [BO + 36 * SIZE], b5 + FMADD (aa4, bb6, cc11, cc11) + LDF [BO + 37 * SIZE], b6 + + FMADD (aa4, bb7, cc13, cc13) + LDF [BO + 38 * SIZE], b7 + FMADD (aa4, bb8, cc15, cc15) + LDF [BO + 39 * SIZE], b8 + + LDF [AO + 6 * SIZE], a3 + LDF [AO + 7 * SIZE], a4 + + add AO, 4 * SIZE, AO + cmp L, 0 + bg,pt %icc, .LL23 + add BO, 32 * SIZE, BO + .align 4 + +.LL25: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL28 + nop + .align 4 + +.LL27: + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 8 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 10 * SIZE], b3 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [BO + 12 * SIZE], b5 + FMADD (aa1, bb6, cc11, cc11) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 14 * SIZE], b7 + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 15 * SIZE], b8 + + LDF [AO + 1 * SIZE], a1 + add AO, 1 * SIZE, AO + + add L, -1, L + cmp L, 0 + bg,pt %icc, .LL27 + add BO, 8 * SIZE, BO + .align 4 + +.LL28: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 8, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 3, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c09, c09 + FSUB b2, c11, c11 + FSUB b3, c13, c13 + FSUB b4, c15, c15 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c09, c09 + FSUB b2, c11, c11 + FSUB b3, c13, c13 + FSUB b4, c15, c15 +#endif + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 + FMUL a1, c09, c09 + FMUL a1, c11, c11 + FMUL a1, c13, c13 + FMUL a1, c15, c15 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FMUL a1, c01, c01 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa3, cc01, cc05, cc05) + FNMSUB (aa4, cc01, cc07, cc07) + FNMSUB (bb1, cc01, cc09, cc09) + FNMSUB (bb2, cc01, cc11, cc11) + FNMSUB (bb3, cc01, cc13, cc13) + FNMSUB (bb4, cc01, cc15, cc15) + + LDF [BO + 9 * SIZE], a1 + LDF [BO + 10 * SIZE], a2 + LDF [BO + 11 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + LDF [BO + 13 * SIZE], b1 + LDF [BO + 14 * SIZE], b2 + LDF [BO + 15 * SIZE], b3 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc05, cc05) + FNMSUB (aa3, cc03, cc07, cc07) + FNMSUB (aa4, cc03, cc09, cc09) + FNMSUB (bb1, cc03, cc11, cc11) + FNMSUB (bb2, cc03, cc13, cc13) + FNMSUB (bb3, cc03, cc15, cc15) + + LDF [BO + 18 * SIZE], a1 + LDF [BO + 19 * SIZE], a2 + LDF [BO + 20 * SIZE], a3 + LDF [BO + 21 * SIZE], a4 + LDF [BO + 22 * SIZE], b1 + LDF [BO + 23 * SIZE], b2 + + FMUL a1, c05, c05 + + FNMSUB (aa2, cc05, cc07, cc07) + FNMSUB (aa3, cc05, cc09, cc09) + FNMSUB (aa4, cc05, cc11, cc11) + FNMSUB (bb1, cc05, cc13, cc13) + FNMSUB (bb2, cc05, cc15, cc15) + + LDF [BO + 27 * SIZE], a1 + LDF [BO + 28 * SIZE], a2 + LDF [BO + 29 * SIZE], a3 + LDF [BO + 30 * SIZE], a4 + LDF [BO + 31 * SIZE], b1 + + FMUL a1, c07, c07 + + FNMSUB (aa2, cc07, cc09, cc09) + FNMSUB (aa3, cc07, cc11, cc11) + FNMSUB (aa4, cc07, cc13, cc13) + FNMSUB (bb1, cc07, cc15, cc15) + + LDF [BO + 36 * SIZE], a1 + LDF [BO + 37 * SIZE], a2 + LDF [BO + 38 * SIZE], a3 + LDF [BO + 39 * SIZE], a4 + + FMUL a1, c09, c09 + + FNMSUB (aa2, cc09, cc11, cc11) + FNMSUB (aa3, cc09, cc13, cc13) + FNMSUB (aa4, cc09, cc15, cc15) + + LDF [BO + 45 * SIZE], a1 + LDF [BO + 46 * SIZE], a2 + LDF [BO + 47 * SIZE], a3 + + FMUL a1, c11, c11 + + FNMSUB (aa2, cc11, cc13, cc13) + FNMSUB (aa3, cc11, cc15, cc15) + + LDF [BO + 54 * SIZE], a1 + LDF [BO + 55 * SIZE], a2 + + FMUL a1, c13, c13 + + FNMSUB (aa2, cc13, cc15, cc15) + + LDF [BO + 63 * SIZE], a1 + + FMUL a1, c15, c15 +#endif + +#ifdef RT + LDF [BO + 63 * SIZE], a1 + LDF [BO + 62 * SIZE], a2 + LDF [BO + 61 * SIZE], a3 + LDF [BO + 60 * SIZE], a4 + LDF [BO + 59 * SIZE], b1 + LDF [BO + 58 * SIZE], b2 + LDF [BO + 57 * SIZE], b3 + LDF [BO + 56 * SIZE], b4 + + FMUL a1, c15, c15 + + FNMSUB (aa2, cc15, cc13, cc13) + FNMSUB (aa3, cc15, cc11, cc11) + FNMSUB (aa4, cc15, cc09, cc09) + FNMSUB (bb1, cc15, cc07, cc07) + FNMSUB (bb2, cc15, cc05, cc05) + FNMSUB (bb3, cc15, cc03, cc03) + FNMSUB (bb4, cc15, cc01, cc01) + + LDF [BO + 54 * SIZE], a1 + LDF [BO + 53 * SIZE], a2 + LDF [BO + 52 * SIZE], a3 + LDF [BO + 51 * SIZE], a4 + LDF [BO + 50 * SIZE], b1 + LDF [BO + 49 * SIZE], b2 + LDF [BO + 48 * SIZE], b3 + + FMUL a1, c13, c13 + + FNMSUB (aa2, cc13, cc11, cc11) + FNMSUB (aa3, cc13, cc09, cc09) + FNMSUB (aa4, cc13, cc07, cc07) + FNMSUB (bb1, cc13, cc05, cc05) + FNMSUB (bb2, cc13, cc03, cc03) + FNMSUB (bb3, cc13, cc01, cc01) + + LDF [BO + 45 * SIZE], a1 + LDF [BO + 44 * SIZE], a2 + LDF [BO + 43 * SIZE], a3 + LDF [BO + 42 * SIZE], a4 + LDF [BO + 41 * SIZE], b1 + LDF [BO + 40 * SIZE], b2 + + FMUL a1, c11, c11 + + FNMSUB (aa2, cc11, cc09, cc09) + FNMSUB (aa3, cc11, cc07, cc07) + FNMSUB (aa4, cc11, cc05, cc05) + FNMSUB (bb1, cc11, cc03, cc03) + FNMSUB (bb2, cc11, cc01, cc01) + + LDF [BO + 36 * SIZE], a1 + LDF [BO + 35 * SIZE], a2 + LDF [BO + 34 * SIZE], a3 + LDF [BO + 33 * SIZE], a4 + LDF [BO + 32 * SIZE], b1 + + FMUL a1, c09, c09 + + FNMSUB (aa2, cc09, cc07, cc07) + FNMSUB (aa3, cc09, cc05, cc05) + FNMSUB (aa4, cc09, cc03, cc03) + FNMSUB (bb1, cc09, cc01, cc01) + + LDF [BO + 27 * SIZE], a1 + LDF [BO + 26 * SIZE], a2 + LDF [BO + 25 * SIZE], a3 + LDF [BO + 24 * SIZE], a4 + + FMUL a1, c07, c07 + + FNMSUB (aa2, cc07, cc05, cc05) + FNMSUB (aa3, cc07, cc03, cc03) + FNMSUB (aa4, cc07, cc01, cc01) + + LDF [BO + 18 * SIZE], a1 + LDF [BO + 17 * SIZE], a2 + LDF [BO + 16 * SIZE], a3 + + FMUL a1, c05, c05 + + FNMSUB (aa2, cc05, cc03, cc03) + FNMSUB (aa3, cc05, cc01, cc01) + + LDF [BO + 9 * SIZE], a1 + LDF [BO + 8 * SIZE], a2 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 + add C3, -1 * SIZE, C3 + add C4, -1 * SIZE, C4 + add C5, -1 * SIZE, C5 + add C6, -1 * SIZE, C6 + add C7, -1 * SIZE, C7 + add C8, -1 * SIZE, C8 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] + + STF c09, [BO + 4 * SIZE] + STF c11, [BO + 5 * SIZE] + STF c13, [BO + 6 * SIZE] + STF c15, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c03, [AO + 1 * SIZE] + STF c05, [AO + 2 * SIZE] + STF c07, [AO + 3 * SIZE] + + STF c09, [AO + 4 * SIZE] + STF c11, [AO + 5 * SIZE] + STF c13, [AO + 6 * SIZE] + STF c15, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c05, [C3 + 0 * SIZE] + STF c07, [C4 + 0 * SIZE] + + STF c09, [C5 + 0 * SIZE] + STF c11, [C6 + 0 * SIZE] + STF c13, [C7 + 0 * SIZE] + STF c15, [C8 + 0 * SIZE] + +#ifdef RT + sll K, BASE_SHIFT + 0, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 3, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + .align 4 + +.LL29: +#ifdef LN + sll K, BASE_SHIFT + 3, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 8, KK +#endif + +#ifdef RT + sub KK, 8, KK +#endif + + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + .align 4 + +.LL30: + and N, 4, J + cmp J, 0 + ble,pn %icc, .LL50 + nop + +#ifdef RT + sll K, BASE_SHIFT + 2, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C2 + add C2, LDC, C3 + add C3, LDC, C4 + add C4, LDC, C +#else + sub C, LDC, C4 + sub C4, LDC, C3 + sub C3, LDC, C2 + sub C2, LDC, C1 + sub C2, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL40 + nop + .align 4 + +.LL32: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 2, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + + LDF [BO + 5 * SIZE], b6 + FCLR (cc01) + LDF [BO + 6 * SIZE], b7 + FCLR (cc02) + LDF [BO + 7 * SIZE], b8 + FCLR (cc03) + LDF [BO + 8 * SIZE], b9 + FCLR (cc04) + + prefetch [C1 + 2 * SIZE], 3 + FCLR (cc05) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc06) + prefetch [C3 + 2 * SIZE], 3 + FCLR (cc07) + prefetch [C4 + 2 * SIZE], 3 + FCLR (cc08) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL35 + nop + .align 4 + +.LL33: + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 2 * SIZE], a3 + FMADD (aa2, bb1, cc02, cc02) + LDF [AO + 3 * SIZE], a4 + + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 16 * SIZE], b1 + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb3, cc06, cc06) + add L, -1, L + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa3, bb5, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + FMADD (aa4, bb5, cc02, cc02) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb6, cc03, cc03) + LDF [BO + 12 * SIZE], b5 + FMADD (aa4, bb6, cc04, cc04) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa3, bb7, cc05, cc05) + cmp L, 0 + FMADD (aa4, bb7, cc06, cc06) + add AO, 8 * SIZE, AO + + FMADD (aa3, bb8, cc07, cc07) + LDF [BO + 14 * SIZE], b7 + FMADD (aa4, bb8, cc08, cc08) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa1, bb9, cc01, cc01) + LDF [AO - 2 * SIZE], a3 + FMADD (aa2, bb9, cc02, cc02) + LDF [AO - 1 * SIZE], a4 + + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 24 * SIZE], b9 + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + add BO, 16 * SIZE, BO + FMADD (aa2, bb3, cc06, cc06) + nop + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD (aa3, bb5, cc01, cc01) + LDF [AO + 0 * SIZE], a1 + FMADD (aa4, bb5, cc02, cc02) + LDF [AO + 1 * SIZE], a2 + FMADD (aa3, bb6, cc03, cc03) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb6, cc04, cc04) + LDF [BO + 5 * SIZE], b6 + + FMADD (aa3, bb7, cc05, cc05) + nop + FMADD (aa4, bb7, cc06, cc06) + LDF [BO + 6 * SIZE], b7 + + FMADD (aa3, bb8, cc07, cc07) + FMADD (aa4, bb8, cc08, cc08) + bg,pt %icc, .LL33 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL35: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL38 + nop + .align 4 + +.LL37: + FMADD (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD (aa2, bb1, cc02, cc02) + LDF [BO + 4 * SIZE], b1 + + FMADD (aa1, bb2, cc03, cc03) + add AO, 2 * SIZE, AO + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 5 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + cmp L, 0 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 6 * SIZE], b3 + + FMADD (aa1, bb4, cc07, cc07) + LDF [AO + 0 * SIZE], a1 + FMADD (aa2, bb4, cc08, cc08) + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 7 * SIZE], b4 + bg,pt %icc, .LL37 + add BO, 4 * SIZE, BO + .align 4 + +.LL38: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 4, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 2, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c02, c02 + FSUB b2, c04, c04 + FSUB b3, c06, c06 + FSUB b4, c08, c08 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 + +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + FMUL a1, c06, c06 + FMUL a1, c08, c08 + + FNMSUB (aa2, cc02, cc01, cc01) + FNMSUB (aa2, cc04, cc03, cc03) + FNMSUB (aa2, cc06, cc05, cc05) + FNMSUB (aa2, cc08, cc07, cc07) + + FMUL a3, c01, c01 + FMUL a3, c03, c03 + FMUL a3, c05, c05 + FMUL a3, c07, c07 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 + + FNMSUB (aa2, cc01, cc02, cc02) + FNMSUB (aa2, cc03, cc04, cc04) + FNMSUB (aa2, cc05, cc06, cc06) + FNMSUB (aa2, cc07, cc08, cc08) + + FMUL a3, c02, c02 + FMUL a3, c04, c04 + FMUL a3, c06, c06 + FMUL a3, c08, c08 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa2, cc02, cc04, cc04) + FNMSUB (aa3, cc01, cc05, cc05) + FNMSUB (aa3, cc02, cc06, cc06) + FNMSUB (aa4, cc01, cc07, cc07) + FNMSUB (aa4, cc02, cc08, cc08) + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FNMSUB (aa2, cc03, cc05, cc05) + FNMSUB (aa2, cc04, cc06, cc06) + FNMSUB (aa3, cc03, cc07, cc07) + FNMSUB (aa3, cc04, cc08, cc08) + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + + FNMSUB (aa2, cc05, cc07, cc07) + FNMSUB (aa2, cc06, cc08, cc08) + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c07, c07 + FMUL a1, c08, c08 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c08, c08 + FMUL a1, c07, c07 + + FNMSUB (aa2, cc08, cc06, cc06) + FNMSUB (aa2, cc07, cc05, cc05) + FNMSUB (aa3, cc08, cc04, cc04) + FNMSUB (aa3, cc07, cc03, cc03) + FNMSUB (aa4, cc08, cc02, cc02) + FNMSUB (aa4, cc07, cc01, cc01) + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c06, c06 + FMUL a1, c05, c05 + + FNMSUB (aa2, cc06, cc04, cc04) + FNMSUB (aa2, cc05, cc03, cc03) + FNMSUB (aa3, cc06, cc02, cc02) + FNMSUB (aa3, cc05, cc01, cc01) + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c04, c04 + FMUL a1, c03, c03 + + FNMSUB (aa2, cc04, cc02, cc02) + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c02, c02 + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 + add C3, -2 * SIZE, C3 + add C4, -2 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] + + STF c02, [BO + 4 * SIZE] + STF c04, [BO + 5 * SIZE] + STF c06, [BO + 6 * SIZE] + STF c08, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + + STF c05, [C3 + 0 * SIZE] + STF c06, [C3 + 1 * SIZE] + STF c07, [C4 + 0 * SIZE] + STF c08, [C4 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 + add C3, 2 * SIZE, C3 + add C4, 2 * SIZE, C4 +#endif + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 2, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL32 + nop + +.LL40: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL49 + nop + +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 2, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + LDF [BO + 5 * SIZE], b6 + FCLR (cc01) + LDF [BO + 6 * SIZE], b7 + FCLR (cc03) + LDF [BO + 7 * SIZE], b8 + FCLR (cc05) + LDF [BO + 8 * SIZE], b9 + FCLR (cc07) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL45 + nop + +.LL43: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 16 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 10 * SIZE], b3 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 11 * SIZE], b4 + + LDF [AO + 4 * SIZE], a1 + cmp L, 0 + + FMADD (aa2, bb5, cc01, cc01) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb6, cc03, cc03) + LDF [BO + 13 * SIZE], b6 + FMADD (aa2, bb7, cc05, cc05) + LDF [BO + 14 * SIZE], b7 + FMADD (aa2, bb8, cc07, cc07) + LDF [BO + 15 * SIZE], b8 + + LDF [AO + 5 * SIZE], a2 + add AO, 4 * SIZE, AO + + FMADD (aa3, bb9, cc01, cc01) + LDF [BO + 24 * SIZE], b9 + FMADD (aa3, bb2, cc03, cc03) + LDF [BO + 17 * SIZE], b2 + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 18 * SIZE], b3 + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 19 * SIZE], b4 + + LDF [AO + 2 * SIZE], a3 + add BO, 16 * SIZE, BO + + FMADD (aa4, bb5, cc01, cc01) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb6, cc03, cc03) + LDF [BO + 5 * SIZE], b6 + FMADD (aa4, bb7, cc05, cc05) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc07, cc07) + LDF [BO + 7 * SIZE], b8 + + bg,pt %icc, .LL43 + LDF [AO + 3 * SIZE], a4 + .align 4 + +.LL45: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL48 + nop + .align 4 + +.LL47: + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 4 * SIZE], b1 + add L, -1, L + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 5 * SIZE], b2 + add AO, 1 * SIZE, AO + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 6 * SIZE], b3 + cmp L, 0 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 7 * SIZE], b4 + add BO, 4 * SIZE, BO + + bg,pt %icc, .LL47 + LDF [AO + 0 * SIZE], a1 + .align 4 + +.LL48: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 4, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 2, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 +#endif + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa3, cc01, cc05, cc05) + FNMSUB (aa4, cc01, cc07, cc07) + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc05, cc05) + FNMSUB (aa3, cc03, cc07, cc07) + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c05, c05 + + FNMSUB (aa2, cc05, cc07, cc07) + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c07, c07 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c07, c07 + + FNMSUB (aa2, cc07, cc05, cc05) + FNMSUB (aa3, cc07, cc03, cc03) + FNMSUB (aa4, cc07, cc01, cc01) + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c05, c05 + + FNMSUB (aa2, cc05, cc03, cc03) + FNMSUB (aa3, cc05, cc01, cc01) + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 + add C3, -1 * SIZE, C3 + add C4, -1 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c03, [AO + 1 * SIZE] + STF c05, [AO + 2 * SIZE] + STF c07, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c05, [C3 + 0 * SIZE] + STF c07, [C4 + 0 * SIZE] + +#ifdef RT + sll K, BASE_SHIFT + 0, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 2, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + .align 4 + +.LL49: +#ifdef LN + sll K, BASE_SHIFT + 2, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 4, KK +#endif + +#ifdef RT + sub KK, 4, KK +#endif + .align 4 + +.LL50: + and N, 2, J + cmp J, 0 + ble,pn %icc, .LL70 + nop + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C2 + add C2, LDC, C +#else + sub C, LDC, C2 + sub C2, LDC, C1 + sub C2, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL60 + nop + .align 4 + +.LL52: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 1, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + FCLR (cc01) + LDF [BO + 3 * SIZE], b4 + FCLR (cc02) + + LDF [BO + 4 * SIZE], b5 + FCLR (cc03) + LDF [BO + 5 * SIZE], b6 + FCLR (cc04) + LDF [BO + 6 * SIZE], b7 + FCLR (cc05) + LDF [BO + 7 * SIZE], b8 + FCLR (cc06) + + prefetch [C1 + 2 * SIZE], 3 + FCLR (cc07) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc08) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL55 + nop + .align 4 + +.LL53: + FMADD (aa1, bb1, cc01, cc01) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb1, cc02, cc02) + LDF [BO + 8 * SIZE], b1 + + FMADD (aa1, bb2, cc03, cc03) + LDF [AO + 4 * SIZE], a1 + FMADD (aa2, bb2, cc04, cc04) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb3, cc01, cc01) + LDF [BO + 9 * SIZE], b2 + FMADD (aa4, bb3, cc02, cc02) + LDF [BO + 10 * SIZE], b3 + + FMADD (aa3, bb4, cc03, cc03) + LDF [AO + 6 * SIZE], a3 + FMADD (aa4, bb4, cc04, cc04) + LDF [AO + 7 * SIZE], a4 + + FMADD (aa1, bb5, cc01, cc01) + LDF [BO + 11 * SIZE], b4 + FMADD (aa2, bb5, cc02, cc02) + LDF [BO + 12 * SIZE], b5 + + FMADD (aa1, bb6, cc03, cc03) + LDF [AO + 8 * SIZE], a1 + FMADD (aa2, bb6, cc04, cc04) + LDF [AO + 9 * SIZE], a2 + + FMADD (aa3, bb7, cc01, cc01) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa4, bb7, cc02, cc02) + LDF [BO + 14 * SIZE], b7 + + FMADD (aa3, bb8, cc03, cc03) + LDF [AO + 10 * SIZE], a3 + FMADD (aa4, bb8, cc04, cc04) + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + add L, -1, L + add BO, 8 * SIZE, BO + cmp L, 0 + + bg,pt %icc, .LL53 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL55: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL58 + nop + .align 4 + +.LL57: + FMADD (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD (aa2, bb1, cc02, cc02) + LDF [BO + 2 * SIZE], b1 + + FMADD (aa1, bb2, cc03, cc03) + LDF [AO + 2 * SIZE], a1 + FMADD (aa2, bb2, cc04, cc04) + LDF [AO + 3 * SIZE], a2 + + add AO, 2 * SIZE, AO + cmp L, 0 + add BO, 2 * SIZE, BO + bg,pt %icc, .LL57 + LDF [BO + 1 * SIZE], b2 + .align 4 + +.LL58: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 1, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c02, c02 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + + FNMSUB (aa2, cc02, cc01, cc01) + FNMSUB (aa2, cc04, cc03, cc03) + + FMUL a3, c01, c01 + FMUL a3, c03, c03 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + + FNMSUB (aa2, cc01, cc02, cc02) + FNMSUB (aa2, cc03, cc04, cc04) + + FMUL a3, c02, c02 + FMUL a3, c04, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa2, cc02, cc04, cc04) + + LDF [BO + 3 * SIZE], a1 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + + FMUL a1, c04, c04 + FMUL a1, c03, c03 + + FNMSUB (aa2, cc04, cc02, cc02) + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c02, c02 + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c02, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 +#endif + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 1, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL52 + nop + .align 4 + +.LL60: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL69 + nop + +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 1, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + LDF [BO + 5 * SIZE], b6 + LDF [BO + 6 * SIZE], b7 + FCLR (cc01) + LDF [BO + 7 * SIZE], b8 + FCLR (cc03) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL65 + nop + .align 4 + +.LL63: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 8 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + + LDF [AO + 4 * SIZE], a1 + cmp L, 0 + + FMADD (aa2, bb3, cc01, cc01) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc03, cc03) + LDF [BO + 11 * SIZE], b4 + + LDF [AO + 5 * SIZE], a2 + add AO, 4 * SIZE, AO + + FMADD (aa3, bb5, cc01, cc01) + LDF [BO + 12 * SIZE], b5 + FMADD (aa3, bb6, cc03, cc03) + LDF [BO + 13 * SIZE], b6 + + LDF [AO + 2 * SIZE], a3 + add BO, 8 * SIZE, BO + + FMADD (aa4, bb7, cc01, cc01) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc03, cc03) + LDF [BO + 7 * SIZE], b8 + + bg,pt %icc, .LL63 + LDF [AO + 3 * SIZE], a4 + .align 4 + +.LL65: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL68 + nop + .align 4 + +.LL67: + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 2 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 3 * SIZE], b2 + + LDF [AO + 1 * SIZE], a1 + add L, -1, L + add AO, 1 * SIZE, AO + cmp L, 0 + + bg,pt %icc, .LL67 + add BO, 2 * SIZE, BO + .align 4 + +.LL68: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 1, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 +#endif + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, c01 + + FNMSUB (aa2, cc01, cc03, cc03) + + LDF [BO + 3 * SIZE], a1 + + FMUL a1, c03, c03 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c03, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c03, [C2 + 0 * SIZE] + +#ifdef RT + sll K, BASE_SHIFT + 0, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 1, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + .align 4 + +.LL69: +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 2, KK +#endif + +#ifdef RT + sub KK, 2, KK +#endif + .align 4 + +.LL70: + and N, 1, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + +#ifdef RT + sll K, BASE_SHIFT, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C1, LDC, C +#else + sub C, LDC, C1 + sub C, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL80 + nop + .align 4 + +.LL72: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 0, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + FCLR (cc01) + LDF [BO + 3 * SIZE], b4 + FCLR (cc02) + + prefetch [C1 + 2 * SIZE], 3 + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL75 + nop + +.LL73: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + FMADD (aa2, bb1, cc02, cc02) + LDF [AO + 5 * SIZE], a2 + + LDF [BO + 4 * SIZE], b1 + cmp L, 0 + + FMADD (aa3, bb2, cc01, cc01) + LDF [AO + 6 * SIZE], a3 + FMADD (aa4, bb2, cc02, cc02) + LDF [AO + 7 * SIZE], a4 + + LDF [BO + 5 * SIZE], b2 + add BO, 4 * SIZE, BO + + FMADD (aa1, bb3, cc01, cc01) + LDF [AO + 8 * SIZE], a1 + FMADD (aa2, bb3, cc02, cc02) + LDF [AO + 9 * SIZE], a2 + + LDF [BO + 2 * SIZE], b3 + add AO, 8 * SIZE, AO + + FMADD (aa3, bb4, cc01, cc01) + LDF [AO + 2 * SIZE], a3 + FMADD (aa4, bb4, cc02, cc02) + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL73 + LDF [BO + 3 * SIZE], b4 + .align 4 + +.LL75: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL78 + nop + .align 4 + +.LL77: + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 2 * SIZE], a1 + FMADD (aa2, bb1, cc02, cc02) + LDF [AO + 3 * SIZE], a2 + + LDF [BO + 1 * SIZE], b1 + add L, -1, L + add AO, 2 * SIZE, AO + cmp L, 0 + bg,pt %icc, .LL77 + add BO, 1 * SIZE, BO + .align 4 + +.LL78: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 0, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + + FNMSUB (aa2, cc02, cc01, cc01) + + FMUL a3, c01, c01 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + + FNMSUB (aa2, cc01, cc02, cc02) + + FMUL a3, c02, c02 +#endif + +#if defined(RN) || defined(RT) + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 +#endif + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 0, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL72 + nop + .align 4 + +.LL80: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL89 + nop + +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 0, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [BO + 0 * SIZE], b1 + LDF [AO + 1 * SIZE], a2 + LDF [BO + 1 * SIZE], b2 + LDF [AO + 2 * SIZE], a3 + LDF [BO + 2 * SIZE], b3 + LDF [AO + 3 * SIZE], a4 + LDF [BO + 3 * SIZE], b4 + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL85 + FCLR (cc01) + .align 4 + +.LL83: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + LDF [BO + 4 * SIZE], b1 + + FMADD (aa2, bb2, cc01, cc01) + LDF [AO + 5 * SIZE], a2 + LDF [BO + 5 * SIZE], b2 + + FMADD (aa3, bb3, cc01, cc01) + LDF [AO + 6 * SIZE], a3 + LDF [BO + 6 * SIZE], b3 + + FMADD (aa4, bb4, cc01, cc01) + LDF [AO + 7 * SIZE], a4 + LDF [BO + 7 * SIZE], b4 + + add AO, 4 * SIZE, AO + cmp L, 0 + + bg,pt %icc, .LL83 + add BO, 4 * SIZE, BO + .align 4 + +.LL85: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL88 + nop + .align 4 + +.LL87: + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 1 * SIZE], a1 + LDF [BO + 1 * SIZE], b1 + + add AO, 1 * SIZE, AO + add L, -1, L + cmp L, 0 + bg,pt %icc, .LL87 + add BO, 1 * SIZE, BO + .align 4 + +.LL88: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 0, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + + FSUB a1, c01, c01 +#else + LDF [AO + 0 * SIZE], a1 + + FSUB a1, c01, c01 +#endif + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#if defined(RN) || defined(RT) + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] +#else + STF c01, [AO + 0 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + +#ifdef RT + sll K, BASE_SHIFT + 0, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 0, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + .align 4 + +.LL89: +#ifdef LN + sll K, BASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 1, KK +#endif + +#ifdef RT + sub KK, 1, KK +#endif + .align 4 + +.LL999: +#ifdef TRMMKERNEL +#ifndef __64BIT__ + ld [%sp + STACK_START + 8], %g1 + ld [%sp + STACK_START + 12], %g2 + ld [%sp + STACK_START + 16], %g3 + ld [%sp + STACK_START + 20], %g4 +#else + ldx [%sp + STACK_START + 32], %g1 + ldx [%sp + STACK_START + 40], %g2 + ldx [%sp + STACK_START + 48], %g3 + ldx [%sp + STACK_START + 56], %g4 +#endif +#endif + + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/trsm_kernel_RT.S b/kernel/sparc/trsm_kernel_RT.S new file mode 100644 index 0000000..3e1a2b9 --- /dev/null +++ b/kernel/sparc/trsm_kernel_RT.S @@ -0,0 +1,4227 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 +#define K %i2 + +#if defined(DOUBLE) && !defined(__64BIT__) +#define A %i5 +#define B %i4 +#else +#define A %i4 +#define B %i5 +#endif + +#define C %o4 +#define LDC %o5 + +#define AO %l0 +#define BO %l1 +#define I %l2 +#define J %l3 +#define L %l4 + +#define C1 %o0 +#define C2 %o1 +#define C3 %o2 +#define C4 %o3 + +#define OFFSET %l5 +#define KK %l6 +#define TEMP1 %l7 +#define TEMP2 %i3 +#define AORIG %g1 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define t1 %f32 +#define t2 %f34 +#define t3 %f36 +#define t4 %f38 + +#define a1 %f40 +#define a2 %f42 +#define a3 %f44 +#define a4 %f46 +#define a5 %f58 + +#define b1 %f48 +#define b2 %f50 +#define b3 %f52 +#define b4 %f54 +#define b5 %f56 + +#define FZERO %f60 +#define ALPHA %f62 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define t1 %f16 +#define t2 %f17 +#define t3 %f18 +#define t4 %f19 + +#define a1 %f20 +#define a2 %f21 +#define a3 %f22 +#define a4 %f23 +#define a5 %f31 + +#define b1 %f24 +#define b2 %f25 +#define b3 %f26 +#define b4 %f27 +#define b5 %f28 + +#define FZERO %f29 +#define ALPHA %f30 +#endif + +#define APREFETCHSIZE 40 +#define BPREFETCHSIZE 40 + +#define APREFETCH_CATEGORY 0 +#define BPREFETCH_CATEGORY 0 + + PROLOGUE + SAVESP + nop + +#ifndef __64BIT__ +#ifdef DOUBLE + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC + ld [%sp + STACK_START + 40], OFFSET +#else + ld [%sp + STACK_START + 28], C + ld [%sp + STACK_START + 32], LDC + ld [%sp + STACK_START + 36], OFFSET +#endif +#else + ldx [%sp+ STACK_START + 56], C + ldx [%sp+ STACK_START + 64], LDC + ldx [%sp+ STACK_START + 72], OFFSET +#endif + + FCLR(29) + + sll LDC, BASE_SHIFT, LDC + +#ifdef LN + smul M, K, TEMP1 + sll TEMP1, BASE_SHIFT, TEMP1 + add A, TEMP1, A + + sll M, BASE_SHIFT, TEMP1 + add C, TEMP1, C +#endif + +#ifdef RN + neg OFFSET, KK +#endif + +#ifdef RT + smul N, K, TEMP1 + sll TEMP1, BASE_SHIFT, TEMP1 + add B, TEMP1, B + + smul N, LDC, TEMP1 + add C, TEMP1, C + + sub N, OFFSET, KK +#endif + + and N, 1, J + cmp J, 0 + ble,pn %icc, .LL100 + nop + +#ifdef RT + sll K, 0 + BASE_SHIFT, TEMP1 + sub B, TEMP1, B + + sub C, LDC, C +#endif + + mov C, C1 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + +#ifndef RT + add C, LDC, C +#endif + + sra M, 2, I + cmp I, 0 + ble,pn %icc, .LL250 + nop + +.LL221: +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 2 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 2 + BASE_SHIFT, TEMP1 + sll KK, 0 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + + ble,pn %icc, .LL225 + prefetch [C1 + 4 * SIZE], 2 + +.LL222: + FADD c01, t1, c01 + add BO, 4 * SIZE, BO + FMUL a1, b1, t1 + LDF [AO + 4 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b1, t2 + LDF [AO + 5 * SIZE], a2 + + FADD c03, t3, c03 + add L, -1, L + FMUL a3, b1, t3 + LDF [AO + 6 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b1, t4 + LDF [AO + 7 * SIZE], a4 + LDF [BO + 0 * SIZE], b1 + + FADD c01, t1, c01 + cmp L, 0 + FMUL a1, b2, t1 + LDF [AO + 8 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b2, t2 + LDF [AO + 9 * SIZE], a2 + + FADD c03, t3, c03 + FMUL a3, b2, t3 + LDF [AO + 10 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b2, t4 + LDF [AO + 11 * SIZE], a4 + LDF [BO + 1 * SIZE], b2 + + FADD c01, t1, c01 + FMUL a1, b3, t1 + LDF [AO + 12 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b3, t2 + LDF [AO + 13 * SIZE], a2 + + FADD c03, t3, c03 + FMUL a3, b3, t3 + LDF [AO + 14 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b3, t4 + LDF [AO + 15 * SIZE], a4 + LDF [BO + 2 * SIZE], b3 + + FADD c01, t1, c01 + FMUL a1, b4, t1 + LDF [AO + 16 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b4, t2 + LDF [AO + 17 * SIZE], a2 + + FADD c03, t3, c03 + FMUL a3, b4, t3 + LDF [AO + 18 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 19 * SIZE], a4 + add AO, 16 * SIZE, AO + + bg,pt %icc, .LL222 + LDF [BO + 3 * SIZE], b4 + +.LL225: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL229 + nop + +.LL226: + FADD c01, t1, c01 + add BO, 1 * SIZE, BO + FMUL a1, b1, t1 + LDF [AO + 4 * SIZE], a1 + + FADD c02, t2, c02 + add L, -1, L + FMUL a2, b1, t2 + LDF [AO + 5 * SIZE], a2 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a3, b1, t3 + LDF [AO + 6 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b1, t4 + LDF [AO + 7 * SIZE], a4 + add AO, 4 * SIZE, AO + + bg,pt %icc, .LL226 + LDF [BO + 0 * SIZE], b1 + +.LL229: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 4, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, 2 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 15 * SIZE], a1 + LDF [AO + 14 * SIZE], a2 + LDF [AO + 13 * SIZE], a3 + LDF [AO + 12 * SIZE], a4 + + FMUL a1, c04, c04 + FMUL a2, c04, t1 + + FSUB c03, t1, c03 + FMUL a3, c04, t1 + + FSUB c02, t1, c02 + FMUL a4, c04, t1 + + FSUB c01, t1, c01 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 8 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a2, c03, t1 + + FSUB c02, t1, c02 + FMUL a3, c03, t1 + FSUB c01, t1, c01 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 4 * SIZE], a2 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c01, t1, c01 + + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a2, c01, t1 + FSUB c02, t1, c02 + FMUL a3, c01, t1 + FSUB c03, t1, c03 + FMUL a4, c01, t1 + FSUB c04, t1, c04 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 6 * SIZE], a2 + LDF [AO + 7 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c03, t1, c03 + FMUL a3, c02, t1 + FSUB c04, t1, c04 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 11 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a2, c03, t1 + + FSUB c04, t1, c04 + + LDF [AO + 15 * SIZE], a1 + + FMUL a1, c04, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c03, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 4 * SIZE, C1 +#endif + +#ifdef RT + sll K, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 2 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 4, KK +#endif + +#ifdef LN + sub KK, 4, KK +#endif + + add I, -1, I + cmp I, 0 + + bg,pt %icc, .LL221 + nop + +.LL250: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL270 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + BASE_SHIFT, TEMP1 + sll KK, 0 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + + ble,pn %icc, .LL255 + nop + +.LL252: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [AO + 4 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b1, t2 + LDF [AO + 5 * SIZE], a2 + LDF [BO + 4 * SIZE], b1 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a3, b2, t3 + LDF [AO + 6 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b2, t4 + LDF [AO + 7 * SIZE], a4 + LDF [BO + 5 * SIZE], b2 + + FADD c01, t1, c01 + FMUL a1, b3, t1 + LDF [AO + 8 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b3, t2 + LDF [AO + 9 * SIZE], a2 + LDF [BO + 6 * SIZE], b3 + + FADD c03, t3, c03 + FMUL a3, b4, t3 + LDF [AO + 10 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 11 * SIZE], a4 + add AO, 8 * SIZE, AO + + LDF [BO + 7 * SIZE], b4 + bg,pt %icc, .LL252 + add BO, 4 * SIZE, BO + +.LL255: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + + cmp L, 0 + ble,a,pn %icc, .LL259 + nop + +.LL256: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [AO + 2 * SIZE], a1 + + FADD c02, t2, c02 + cmp L, 0 + FMUL a2, b1, t2 + LDF [AO + 3 * SIZE], a2 + + LDF [BO + 1 * SIZE], b1 + add AO, 2 * SIZE, AO + + bg,pt %icc, .LL256 + add BO, 1 * SIZE, BO + +.LL259: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FADD c01, c03, c01 + FADD c02, c04, c02 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c01, t1, c01 + FMUL a3, c01, c01 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a2, c01, t1 + FSUB c02, t1, c02 + FMUL a3, c02, c02 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 +#endif + +#ifdef RT + sll K, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + +.LL270: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL299 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 0 + BASE_SHIFT, TEMP1 + + add AORIG, TEMP1, AO + add B, TEMP1, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t1 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c01 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t2 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c02 + + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t3 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t4 + LDF [BO + 2 * SIZE], b3 + + ble,pn %icc, .LL275 + LDF [BO + 3 * SIZE], b4 + +.LL272: + FADD c01, t1, c01 + add L, -1, L + add AO, 4 * SIZE, AO + + FMUL a1, b1, t1 + add BO, 4 * SIZE, BO + LDF [AO + 0 * SIZE], a1 + + FADD c02, t2, c02 + cmp L, 0 + LDF [BO + 0 * SIZE], b1 + FMUL a2, b2, t2 + + LDF [AO + 1 * SIZE], a2 + FADD c01, t3, c01 + LDF [BO + 1 * SIZE], b2 + FMUL a3, b3, t3 + + LDF [AO + 2 * SIZE], a3 + FADD c02, t4, c02 + LDF [BO + 2 * SIZE], b3 + FMUL a4, b4, t4 + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL272 + LDF [BO + 3 * SIZE], b4 + +.LL275: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL279 + nop + +.LL276: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [AO + 1 * SIZE], a1 + + LDF [BO + 1 * SIZE], b1 + add BO, 1 * SIZE, BO + cmp L, 0 + bg,pt %icc, .LL276 + add AO, 1 * SIZE, AO + +.LL279: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c01, t3, c01 + FADD c02, t4, c02 + + FADD c01, c02, c01 + +#if defined(LN) || defined(RT) + sub KK, 1, TEMP1 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + FSUB a1, c01, c01 +#else + LDF [AO + 0 * SIZE], a1 + FSUB a1, c01, c01 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + FMUL a1, c01, c01 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + FMUL a1, c01, c01 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + FMUL a1, c01, c01 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] +#else + STF c01, [AO + 0 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 1 * SIZE, C1 +#endif + +#ifdef RT + sll K, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + + +.LL299: +#ifdef LN + sll K, 0 + BASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 1, KK +#endif + +#ifdef RT + sub KK, 1, KK +#endif + +.LL100: /* n & 2 */ + and N, 2, J + cmp J, 0 + ble,pn %icc, .LL200 + nop + +#ifdef RT + sll K, 1 + BASE_SHIFT, TEMP1 + sub B, TEMP1, B + + sll LDC, 1, TEMP1 + sub C, TEMP1, C +#endif + + mov C, C1 + add C, LDC, C2 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + +#ifndef RT + add C2, LDC, C +#endif + + sra M, 2, I + cmp I, 0 + ble,pn %icc, .LL150 + FMOV FZERO, c03 + +.LL121: +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 2 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 2 + BASE_SHIFT, TEMP1 + sll KK, 1 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t1 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c07 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t2 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c04 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t3 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, t4 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c01 + + prefetch [C1 + 3 * SIZE], 2 + FMOV FZERO, c05 + prefetch [C2 + 3 * SIZE], 2 + FMOV FZERO, c02 + + ble,pn %icc, .LL125 + FMOV FZERO, c06 + +.LL122: + FADD c03, t1, c03 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD c07, t2, c07 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD c04, t3, c04 + add AO, 16 * SIZE, AO + FMUL a2, b1, t3 + cmp L, 0 + + FADD c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 11 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 10 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO - 3 * SIZE], b2 + + FADD c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 9 * SIZE], a4 + + FADD c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO - 8 * SIZE], a1 + + FADD c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO - 6 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD c06, t4, c06 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD c03, t1, c03 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD c07, t2, c07 + nop + FMUL a1, b2, t2 + LDF [AO - 4 * SIZE], a1 + + FADD c04, t3, c04 + nop + FMUL a2, b1, t3 + nop + + FADD c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 3 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 2 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + + FADD c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 1 * SIZE], a4 + + FADD c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO + 0 * SIZE], a1 + + FADD c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO + 2 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c06, t4, c06 + FMUL a4, b4, t4 + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL122 + LDF [BO + 3 * SIZE], b4 + +.LL125: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL129 + nop + +.LL126: + FADD c03, t1, c03 + add AO, 4 * SIZE, AO + FMUL a1, b1, t1 + add BO, 2 * SIZE, BO + + FADD c07, t2, c07 + add L, -1, L + FMUL a1, b2, t2 + LDF [AO + 0 * SIZE], a1 + + FADD c04, t3, c04 + cmp L, 0 + FMUL a2, b1, t3 + + FADD c08, t4, c08 + FMUL a2, b2, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b1, t1 + FADD c05, t2, c05 + FMUL a3, b2, t2 + LDF [AO + 2 * SIZE], a3 + + FADD c02, t3, c02 + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + FADD c06, t4, c06 + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + bg,pt %icc, .LL126 + LDF [AO + 3 * SIZE], a4 + +.LL129: + FADD c03, t1, c03 + FADD c07, t2, c07 + FADD c04, t3, c04 + FADD c08, t4, c08 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 4, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, 2 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c05, c05 + FSUB a3, c02, c02 + FSUB a4, c06, c06 + + FSUB b1, c03, c03 + FSUB b2, c07, c07 + FSUB b3, c04, c04 + FSUB b4, c08, c08 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 +#endif + +#ifdef LN + LDF [AO + 15 * SIZE], a1 + LDF [AO + 14 * SIZE], a2 + LDF [AO + 13 * SIZE], a3 + LDF [AO + 12 * SIZE], a4 + + FMUL a1, c04, c04 + FMUL a1, c08, c08 + FMUL a2, c04, t1 + FMUL a2, c08, t2 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FMUL a3, c04, t1 + FMUL a3, c08, t2 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FMUL a4, c04, t1 + FMUL a4, c08, t2 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 8 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c07, c07 + FMUL a2, c03, t1 + FMUL a2, c07, t2 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FMUL a3, c03, t1 + FMUL a3, c07, t2 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 4 * SIZE], a2 + + FMUL a1, c02, c02 + FMUL a1, c06, c06 + FMUL a2, c02, t1 + FMUL a2, c06, t2 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c05, c05 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c05, c05 + FMUL a2, c01, t1 + FMUL a2, c05, t2 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FMUL a3, c01, t1 + FMUL a3, c05, t2 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FMUL a4, c01, t1 + FMUL a4, c05, t2 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 6 * SIZE], a2 + LDF [AO + 7 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c06, c06 + FMUL a2, c02, t1 + FMUL a2, c06, t2 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FMUL a3, c02, t1 + FMUL a3, c06, t2 + FSUB c04, t1, c04 + FSUB c08, t2, c08 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 11 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a1, c07, c07 + FMUL a2, c03, t1 + FMUL a2, c07, t2 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + + LDF [AO + 15 * SIZE], a1 + + FMUL a1, c04, c04 + FMUL a1, c08, c08 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FMUL a2, c01, t1 + FMUL a2, c02, t2 + FMUL a2, c03, t3 + FMUL a2, c04, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FSUB c07, t3, c07 + FSUB c08, t4, c08 + + FMUL a3, c05, c05 + FMUL a3, c06, c06 + FMUL a3, c07, c07 + FMUL a3, c08, c08 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + LDF [BO + 0 * SIZE], a3 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + FMUL a2, c07, t3 + FMUL a2, c08, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + FMUL a3, c01, c01 + FMUL a3, c02, c02 + FMUL a3, c03, c03 + FMUL a3, c04, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 + add C2, -4 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c05, [BO + 1 * SIZE] + STF c02, [BO + 2 * SIZE] + STF c06, [BO + 3 * SIZE] + + STF c03, [BO + 4 * SIZE] + STF c07, [BO + 5 * SIZE] + STF c04, [BO + 6 * SIZE] + STF c08, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + STF c07, [C2 + 2 * SIZE] + STF c08, [C2 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 4 * SIZE, C1 + add C2, 4 * SIZE, C2 +#endif + +#ifdef RT + sll K, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 2 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 4, KK +#endif + +#ifdef LN + sub KK, 4, KK +#endif + + add I, -1, I + cmp I, 0 + + bg,pt %icc, .LL121 + FMOV FZERO, c03 + +.LL150: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL170 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + BASE_SHIFT, TEMP1 + sll KK, 1 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + cmp L, 0 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + ble,pn %icc, .LL155 + nop + +.LL152: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD c02, t2, c02 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a2, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD c04, t4, c04 + nop + FMUL a2, b2, t4 + LDF [AO + 5 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b3, t1 + LDF [BO - 3 * SIZE], b2 + + FADD c02, t2, c02 + nop + FMUL a3, b4, t2 + LDF [AO + 6 * SIZE], a3 + + FADD c03, t3, c03 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD c04, t4, c04 + nop + FMUL a4, b4, t4 + LDF [AO + 7 * SIZE], a4 + + FADD c01, t1, c01 + nop + FMUL a1, b1, t1 + LDF [BO - 1 * SIZE], b4 + + FADD c02, t2, c02 + FMUL a1, b2, t2 + LDF [AO + 8 * SIZE], a1 + + FADD c03, t3, c03 + FMUL a2, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD c04, t4, c04 + FMUL a2, b2, t4 + LDF [AO + 9 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b3, t1 + LDF [BO + 1 * SIZE], b2 + + FADD c02, t2, c02 + FMUL a3, b4, t2 + LDF [AO + 10 * SIZE], a3 + + FADD c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + bg,pt %icc, .LL152 + LDF [BO + 3 * SIZE], b4 + +.LL155: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL159 + nop + +.LL156: + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FMUL a1, b1, t1 + FMUL a1, b2, t2 + FMUL a2, b1, t3 + FMUL a2, b2, t4 + + add AO, 2 * SIZE, AO + add BO, 2 * SIZE, BO + + add L, -1, L + cmp L, 0 + bg,pt %icc, .LL156 + nop + +.LL159: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c02, c02 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + FMUL a2, c03, t1 + FMUL a2, c04, t2 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FMUL a3, c01, c01 + FMUL a3, c02, c02 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FMUL a2, c01, t1 + FMUL a2, c02, t2 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + + FMUL a3, c03, c03 + FMUL a3, c04, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a2, c01, t1 + FMUL a2, c03, t2 + + FSUB c02, t1, c02 + FSUB c04, t2, c04 + FMUL a3, c02, c02 + FMUL a3, c04, c04 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + LDF [BO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + + FMUL a2, c02, t1 + FMUL a2, c04, t2 + FSUB c01, t1, c01 + FSUB c03, t2, c03 + + FMUL a3, c01, c01 + FMUL a3, c03, c03 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c03, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c03, [AO + 1 * SIZE] + STF c02, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c03, [C1 + 1 * SIZE] + STF c02, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 +#endif + +#ifdef RT + sll K, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + +.LL170: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL199 + nop + + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 0 + BASE_SHIFT, TEMP1 + sll KK, 1 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + + ble,pn %icc, .LL175 + nop + +.LL172: + FADD c01, t1, c01 + add AO, 4 * SIZE, AO + FMUL a1, b1, t1 + LDF [BO + 4 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a1, b2, t2 + LDF [BO + 5 * SIZE], b2 + + add L, -1, L + LDF [AO + 0 * SIZE], a1 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a2, b3, t3 + LDF [BO + 6 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a2, b4, t4 + LDF [BO + 7 * SIZE], b4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b1, t1 + LDF [BO + 8 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a3, b2, t2 + LDF [BO + 9 * SIZE], b2 + LDF [AO + 2 * SIZE], a3 + + FADD c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 10 * SIZE], b3 + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [BO + 11 * SIZE], b4 + add BO, 8 * SIZE, BO + + bg,pt %icc, .LL172 + LDF [AO + 3 * SIZE], a4 + +.LL175: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL179 + nop + +.LL176: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + add AO, 1 * SIZE, AO + LDF [BO + 2 * SIZE], b1 + FADD c02, t2, c02 + cmp L, 0 + FMUL a1, b2, t2 + LDF [BO + 3 * SIZE], b2 + + add BO, 2 * SIZE, BO + bg,pt %icc, .LL176 + LDF [AO + 0 * SIZE], a1 + +.LL179: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FADD c01, c03, c01 + FADD c02, c04, c02 + + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a2, c01, t1 + FSUB c02, t1, c02 + FMUL a3, c02, c02 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + LDF [BO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c01, t1, c01 + FMUL a3, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C2 + 0 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 1 * SIZE, C1 + add C2, 1 * SIZE, C2 +#endif + +#ifdef RT + sll K, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + +.LL199: +#ifdef LN + sll K, 1 + BASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 2, KK +#endif + +#ifdef RT + sub KK, 2, KK +#endif + +.LL200: + sra N, 2, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + +.LL11: +#ifdef RT + sll K, 2 + BASE_SHIFT, TEMP1 + sub B, TEMP1, B + + sll LDC, 2, TEMP1 + sub C, TEMP1, C +#endif + + add C, LDC, C2 + FMOV FZERO, t1 + nop + mov C, C1 + + add C2, LDC, C3 + FMOV FZERO, t2 + nop + mov A, AO + + sra M, 2, I + add C3, LDC, C4 + FMOV FZERO, t3 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + cmp I, 0 +#ifndef RT + add C4, LDC, C +#endif + FMOV FZERO, t4 + + ble,pn %icc, .LL50 + FMOV FZERO, c01 + +.LL21: + FMOV FZERO, c02 + FMOV FZERO, c03 + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 2 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 2 + BASE_SHIFT, TEMP1 + + add AORIG, TEMP1, AO + add B, TEMP1, BO + + sub K, KK, TEMP1 + + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c04 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c05 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c06 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c07 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c08 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c09 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c10 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c11 + LDF [BO + 4 * SIZE], b5 /* ***** */ + + LDF [AO + 4 * SIZE], a5 /* ***** */ + + prefetch [C1 + 3 * SIZE], 3 + FMOV FZERO, c12 + prefetch [C2 + 3 * SIZE], 3 + FMOV FZERO, c13 + prefetch [C3 + 3 * SIZE], 3 + FMOV FZERO, c14 + prefetch [C4 + 3 * SIZE], 3 + FMOV FZERO, c15 + + ble,pn %icc, .LL25 + FMOV FZERO, c16 + +.LL22: + FADD c04, t1, c04 + prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY + FMUL a1, b1, t1 + nop + + FADD c08, t2, c08 + prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY + FMUL a1, b2, t2 + add AO, 16 * SIZE, AO + + FADD c12, t3, c12 + LDF [AO - 13 * SIZE], a4 + FMUL a1, b3, t3 + add BO, 16 * SIZE, BO + + FADD c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 8 * SIZE], a1 + + FADD c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + add L, -1, L + FMUL a2, b4, t4 + LDF [AO - 11 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 10 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 10 * SIZE], b3 + + FADD c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 9 * SIZE], a4 + + FADD c08, t2, c08 + nop + FMUL a5, b2, t2 + nop + + FADD c12, t3, c12 + nop + FMUL a5, b3, t3 + nop + + FADD c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO - 4 * SIZE], a5 + + FADD c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 6 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b5, t1 + LDF [BO - 4 * SIZE], b5 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + + FADD c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD c04, t1, c04 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD c08, t2, c08 + nop + FMUL a1, b2, t2 + nop + + FADD c12, t3, c12 + nop + FMUL a1, b3, t3 + nop + + FADD c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 0 * SIZE], a1 + + FADD c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + +#ifdef DOUBLE + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY +#else + nop +#endif + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + nop + + FADD c02, t1, c02 + nop + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + + FADD c06, t2, c06 +#ifdef DOUBLE + prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY +#else + nop +#endif + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 0 * SIZE], b1 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 1 * SIZE], a4 + + FADD c08, t2, c08 + FMUL a5, b2, t2 + FADD c12, t3, c12 + FMUL a5, b3, t3 + + FADD c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO + 4 * SIZE], a5 + + FADD c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD c03, t1, c03 + cmp L, 0 + FMUL a4, b5, t1 + LDF [BO + 4 * SIZE], b5 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL22 + LDF [BO + 3 * SIZE], b4 + +.LL25: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL29 + nop + +.LL26: + FADD c04, t1, c04 + LDF [AO + 3 * SIZE], a4 + FMUL a1, b1, t1 + add AO, 4 * SIZE, AO + + FADD c08, t2, c08 + add BO, 4 * SIZE, BO + FMUL a1, b2, t2 + add L, -1, L + + FADD c12, t3, c12 + nop + FMUL a1, b3, t3 + cmp L, 0 + + FADD c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL26 + LDF [BO + 3 * SIZE], b4 + +.LL29: +#if defined(LN) || defined(RT) + sub KK, 4, TEMP1 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + + FADD c04, t1, c04 + FADD c08, t2, c08 + FADD c12, t3, c12 + FADD c16, t4, c16 + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c05, c05 + FSUB a3, c09, c09 + FSUB a4, c13, c13 + + FSUB b1, c02, c02 + FSUB b2, c06, c06 + FSUB b3, c10, c10 + FSUB b4, c14, c14 + + LDF [BO + 8 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 10 * SIZE], a3 + LDF [BO + 11 * SIZE], a4 + + LDF [BO + 12 * SIZE], b1 + LDF [BO + 13 * SIZE], b2 + LDF [BO + 14 * SIZE], b3 + LDF [BO + 15 * SIZE], b4 + + FSUB a1, c03, c03 + FSUB a2, c07, c07 + FSUB a3, c11, c11 + FSUB a4, c15, c15 + + FSUB b1, c04, c04 + FSUB b2, c08, c08 + FSUB b3, c12, c12 + FSUB b4, c16, c16 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 + + LDF [AO + 8 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 10 * SIZE], a3 + LDF [AO + 11 * SIZE], a4 + + LDF [AO + 12 * SIZE], b1 + LDF [AO + 13 * SIZE], b2 + LDF [AO + 14 * SIZE], b3 + LDF [AO + 15 * SIZE], b4 + + FSUB a1, c09, c09 + FSUB a2, c10, c10 + FSUB a3, c11, c11 + FSUB a4, c12, c12 + + FSUB b1, c13, c13 + FSUB b2, c14, c14 + FSUB b3, c15, c15 + FSUB b4, c16, c16 +#endif + +#ifdef LN + LDF [AO + 15 * SIZE], a1 + LDF [AO + 14 * SIZE], a2 + LDF [AO + 13 * SIZE], a3 + LDF [AO + 12 * SIZE], a4 + + FMUL a1, c04, c04 + FMUL a1, c08, c08 + FMUL a1, c12, c12 + FMUL a1, c16, c16 + + FMUL a2, c04, t1 + FMUL a2, c08, t2 + FMUL a2, c12, t3 + FMUL a2, c16, t4 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FSUB c11, t3, c11 + FSUB c15, t4, c15 + + FMUL a3, c04, t1 + FMUL a3, c08, t2 + FMUL a3, c12, t3 + FMUL a3, c16, t4 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FSUB c10, t3, c10 + FSUB c14, t4, c14 + + FMUL a4, c04, t1 + FMUL a4, c08, t2 + FMUL a4, c12, t3 + FMUL a4, c16, t4 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + FSUB c09, t3, c09 + FSUB c13, t4, c13 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 8 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c07, c07 + FMUL a1, c11, c11 + FMUL a1, c15, c15 + + FMUL a2, c03, t1 + FMUL a2, c07, t2 + FMUL a2, c11, t3 + FMUL a2, c15, t4 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FSUB c10, t3, c10 + FSUB c14, t4, c14 + + FMUL a3, c03, t1 + FMUL a3, c07, t2 + FMUL a3, c11, t3 + FMUL a3, c15, t4 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + FSUB c09, t3, c09 + FSUB c13, t4, c13 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 4 * SIZE], a2 + + FMUL a1, c02, c02 + FMUL a1, c06, c06 + FMUL a1, c10, c10 + FMUL a1, c14, c14 + + FMUL a2, c02, t1 + FMUL a2, c06, t2 + FMUL a2, c10, t3 + FMUL a2, c14, t4 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + FSUB c09, t3, c09 + FSUB c13, t4, c13 + + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c05, c05 + FMUL a1, c09, c09 + FMUL a1, c13, c13 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c05, c05 + FMUL a1, c09, c09 + FMUL a1, c13, c13 + + FMUL a2, c01, t1 + FMUL a2, c05, t2 + FMUL a2, c09, t3 + FMUL a2, c13, t4 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FSUB c10, t3, c10 + FSUB c14, t4, c14 + + FMUL a3, c01, t1 + FMUL a3, c05, t2 + FMUL a3, c09, t3 + FMUL a3, c13, t4 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FSUB c11, t3, c11 + FSUB c15, t4, c15 + + FMUL a4, c01, t1 + FMUL a4, c05, t2 + FMUL a4, c09, t3 + FMUL a4, c13, t4 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + FSUB c12, t3, c12 + FSUB c16, t4, c16 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 6 * SIZE], a2 + LDF [AO + 7 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c06, c06 + FMUL a1, c10, c10 + FMUL a1, c14, c14 + + FMUL a2, c02, t1 + FMUL a2, c06, t2 + FMUL a2, c10, t3 + FMUL a2, c14, t4 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FSUB c11, t3, c11 + FSUB c15, t4, c15 + + FMUL a3, c02, t1 + FMUL a3, c06, t2 + FMUL a3, c10, t3 + FMUL a3, c14, t4 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + FSUB c12, t3, c12 + FSUB c16, t4, c16 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 11 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a1, c07, c07 + FMUL a1, c11, c11 + FMUL a1, c15, c15 + + FMUL a2, c03, t1 + FMUL a2, c07, t2 + FMUL a2, c11, t3 + FMUL a2, c15, t4 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + FSUB c12, t3, c12 + FSUB c16, t4, c16 + + LDF [AO + 15 * SIZE], a1 + + FMUL a1, c04, c04 + FMUL a1, c08, c08 + FMUL a1, c12, c12 + FMUL a1, c16, c16 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FMUL a2, c01, t1 + FMUL a2, c02, t2 + FMUL a2, c03, t3 + FMUL a2, c04, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FSUB c07, t3, c07 + FSUB c08, t4, c08 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a3, c03, t3 + FMUL a3, c04, t4 + + FSUB c09, t1, c09 + FSUB c10, t2, c10 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FMUL a4, c01, t1 + FMUL a4, c02, t2 + FMUL a4, c03, t3 + FMUL a4, c04, t4 + + FSUB c13, t1, c13 + FSUB c14, t2, c14 + FSUB c15, t3, c15 + FSUB c16, t4, c16 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + FMUL a2, c07, t3 + FMUL a2, c08, t4 + + FSUB c09, t1, c09 + FSUB c10, t2, c10 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FMUL a3, c05, t1 + FMUL a3, c06, t2 + FMUL a3, c07, t3 + FMUL a3, c08, t4 + + FSUB c13, t1, c13 + FSUB c14, t2, c14 + FSUB c15, t3, c15 + FSUB c16, t4, c16 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c09, c09 + FMUL a1, c10, c10 + FMUL a1, c11, c11 + FMUL a1, c12, c12 + + FMUL a2, c09, t1 + FMUL a2, c10, t2 + FMUL a2, c11, t3 + FMUL a2, c12, t4 + + FSUB c13, t1, c13 + FSUB c14, t2, c14 + FSUB c15, t3, c15 + FSUB c16, t4, c16 + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c13, c13 + FMUL a1, c14, c14 + FMUL a1, c15, c15 + FMUL a1, c16, c16 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c13, c13 + FMUL a1, c14, c14 + FMUL a1, c15, c15 + FMUL a1, c16, c16 + + FMUL a2, c13, t1 + FMUL a2, c14, t2 + FMUL a2, c15, t3 + FMUL a2, c16, t4 + + FSUB c09, t1, c09 + FSUB c10, t2, c10 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FMUL a3, c13, t1 + FMUL a3, c14, t2 + FMUL a3, c15, t3 + FMUL a3, c16, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FSUB c07, t3, c07 + FSUB c08, t4, c08 + + FMUL a4, c13, t1 + FMUL a4, c14, t2 + FMUL a4, c15, t3 + FMUL a4, c16, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c09, c09 + FMUL a1, c10, c10 + FMUL a1, c11, c11 + FMUL a1, c12, c12 + + FMUL a2, c09, t1 + FMUL a2, c10, t2 + FMUL a2, c11, t3 + FMUL a2, c12, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FSUB c07, t3, c07 + FSUB c08, t4, c08 + + FMUL a3, c09, t1 + FMUL a3, c10, t2 + FMUL a3, c11, t3 + FMUL a3, c12, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + FMUL a2, c07, t3 + FMUL a2, c08, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 + add C2, -4 * SIZE, C2 + add C3, -4 * SIZE, C3 + add C4, -4 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c05, [BO + 1 * SIZE] + STF c09, [BO + 2 * SIZE] + STF c13, [BO + 3 * SIZE] + + STF c02, [BO + 4 * SIZE] + STF c06, [BO + 5 * SIZE] + STF c10, [BO + 6 * SIZE] + STF c14, [BO + 7 * SIZE] + + STF c03, [BO + 8 * SIZE] + STF c07, [BO + 9 * SIZE] + STF c11, [BO + 10 * SIZE] + STF c15, [BO + 11 * SIZE] + + STF c04, [BO + 12 * SIZE] + STF c08, [BO + 13 * SIZE] + STF c12, [BO + 14 * SIZE] + STF c16, [BO + 15 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] + + STF c09, [AO + 8 * SIZE] + STF c10, [AO + 9 * SIZE] + STF c11, [AO + 10 * SIZE] + STF c12, [AO + 11 * SIZE] + + STF c13, [AO + 12 * SIZE] + STF c14, [AO + 13 * SIZE] + STF c15, [AO + 14 * SIZE] + STF c16, [AO + 15 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + STF c07, [C2 + 2 * SIZE] + STF c08, [C2 + 3 * SIZE] + + STF c09, [C3 + 0 * SIZE] + STF c10, [C3 + 1 * SIZE] + STF c11, [C3 + 2 * SIZE] + STF c12, [C3 + 3 * SIZE] + + STF c13, [C4 + 0 * SIZE] + STF c14, [C4 + 1 * SIZE] + STF c15, [C4 + 2 * SIZE] + STF c16, [C4 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 4 * SIZE, C1 + add C2, 4 * SIZE, C2 + add C3, 4 * SIZE, C3 + add C4, 4 * SIZE, C4 +#endif + +#ifdef RT + sll K, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 4, KK +#endif + +#ifdef LN + sub KK, 4, KK +#endif + + add I, -1, I + cmp I, 0 + + sra K, 2, L + bg,pt %icc, .LL21 + FMOV FZERO, c01 + +.LL50: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL70 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + BASE_SHIFT, TEMP1 + sll KK, 2 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + FMOV FZERO, c02 + FMOV FZERO, t1 + FMOV FZERO, c04 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t2 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c06 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t3 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c08 + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t4 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c01 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c03 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c05 + + ble,pn %icc, .LL55 + FMOV FZERO, c07 + +.LL52: + FADD c02, t1, c02 + add AO, 8 * SIZE, AO + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FMUL a1, b1, t1 + add BO, 16 * SIZE, BO + + FADD c04, t2, c04 + add L, -1, L + FMUL a1, b2, t2 + + FADD c06, t3, c06 + cmp L, 0 + FMUL a1, b3, t3 + + FADD c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO - 4 * SIZE], a1 + + FADD c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 12 * SIZE], b1 + FADD c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 10 * SIZE], b3 + FADD c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + FADD c04, t2, c04 + FMUL a3, b2, t2 + + FADD c06, t3, c06 + FMUL a3, b3, t3 + FADD c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + FADD c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + FADD c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD c02, t1, c02 + FMUL a1, b1, t1 + LDF [AO - 1 * SIZE], a4 + FADD c04, t2, c04 + FMUL a1, b2, t2 + + FADD c06, t3, c06 + FMUL a1, b3, t3 + FADD c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 4 * SIZE], b1 + + FADD c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 2 * SIZE], b3 + FADD c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO + 1 * SIZE], a2 + FADD c04, t2, c04 + FMUL a3, b2, t2 + + FADD c06, t3, c06 + FMUL a3, b3, t3 + FADD c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL52 + LDF [AO + 3 * SIZE], a4 + +.LL55: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + FADD c02, t1, c02 + add AO, 2 * SIZE, AO + FMUL a1, b1, t1 + add L, -1, L + + add BO, 4 * SIZE, BO + FADD c04, t2, c04 + cmp L, 0 + FMUL a1, b2, t2 + + FADD c06, t3, c06 + FMUL a1, b3, t3 + FADD c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL56 + LDF [AO + 1 * SIZE], a2 + +.LL59: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 4, TEMP1 +#endif + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + + FADD c02, t1, c02 + FADD c04, t2, c04 + FADD c06, t3, c06 + FADD c08, t4, c08 + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c02, c02 + FSUB b2, c04, c04 + FSUB b3, c06, c06 + FSUB b4, c08, c08 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + FMUL a1, c06, c06 + FMUL a1, c08, c08 + + FMUL a2, c02, t1 + FMUL a2, c04, t2 + FMUL a2, c06, t3 + FMUL a2, c08, t4 + + FSUB c01, t1, c01 + FSUB c03, t2, c03 + FSUB c05, t3, c05 + FSUB c07, t4, c07 + + FMUL a3, c01, c01 + FMUL a3, c03, c03 + FMUL a3, c05, c05 + FMUL a3, c07, c07 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 + + FMUL a2, c01, t1 + FMUL a2, c03, t2 + FMUL a2, c05, t3 + FMUL a2, c07, t4 + + FSUB c02, t1, c02 + FSUB c04, t2, c04 + FSUB c06, t3, c06 + FSUB c08, t4, c08 + + FMUL a3, c02, c02 + FMUL a3, c04, c04 + FMUL a3, c06, c06 + FMUL a3, c08, c08 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FMUL a2, c01, t1 + FMUL a2, c02, t2 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + + FMUL a4, c01, t1 + FMUL a4, c02, t2 + + FSUB c07, t1, c07 + FSUB c08, t2, c08 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FMUL a2, c03, t1 + FMUL a2, c04, t2 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + + FMUL a3, c03, t1 + FMUL a3, c04, t2 + + FSUB c07, t1, c07 + FSUB c08, t2, c08 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + + FSUB c07, t1, c07 + FSUB c08, t2, c08 + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c07, c07 + FMUL a1, c08, c08 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FMUL a2, c07, t1 + FMUL a2, c08, t2 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + + FMUL a3, c07, t1 + FMUL a3, c08, t2 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + + FMUL a4, c07, t1 + FMUL a4, c08, t2 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + + FMUL a3, c05, t1 + FMUL a3, c06, t2 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FMUL a2, c03, t1 + FMUL a2, c04, t2 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 + add C3, -2 * SIZE, C3 + add C4, -2 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] + + STF c02, [BO + 4 * SIZE] + STF c04, [BO + 5 * SIZE] + STF c06, [BO + 6 * SIZE] + STF c08, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + + STF c05, [C3 + 0 * SIZE] + STF c06, [C3 + 1 * SIZE] + STF c07, [C4 + 0 * SIZE] + STF c08, [C4 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 + add C3, 2 * SIZE, C3 + add C4, 2 * SIZE, C4 +#endif + +#ifdef RT + sll K, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + +.LL70: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL99 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 0 + BASE_SHIFT, TEMP1 + sll KK, 2 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + + ble,pn %icc, .LL75 + nop + +.LL72: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [BO + 4 * SIZE], b1 + + FADD c02, t2, c02 + cmp L, 0 + FMUL a1, b2, t2 + LDF [BO + 5 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a1, b3, t3 + LDF [BO + 6 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a1, b4, t4 + LDF [BO + 7 * SIZE], b4 + LDF [AO + 4 * SIZE], a1 + + FADD c01, t1, c01 + add AO, 4 * SIZE, AO + FMUL a2, b1, t1 + LDF [BO + 8 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a2, b2, t2 + LDF [BO + 9 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a2, b3, t3 + LDF [BO + 10 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a2, b4, t4 + LDF [BO + 11 * SIZE], b4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b1, t1 + LDF [BO + 12 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a3, b2, t2 + LDF [BO + 13 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a3, b3, t3 + LDF [BO + 14 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a3, b4, t4 + LDF [BO + 15 * SIZE], b4 + LDF [AO + 2 * SIZE], a3 + + FADD c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO + 16 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a4, b2, t2 + LDF [BO + 17 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 18 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [BO + 19 * SIZE], b4 + + add BO, 16 * SIZE, BO + bg,pt %icc, .LL72 + LDF [AO + 3 * SIZE], a4 + +.LL75: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL79 + nop + +.LL76: + FADD c01, t1, c01 + add AO, 1 * SIZE, AO + FMUL a1, b1, t1 + LDF [BO + 4 * SIZE], b1 + + FADD c02, t2, c02 + add L, -1, L + FMUL a1, b2, t2 + LDF [BO + 5 * SIZE], b2 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a1, b3, t3 + LDF [BO + 6 * SIZE], b3 + + FADD c04, t4, c04 + add BO, 4 * SIZE, BO + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + bg,pt %icc, .LL76 + LDF [BO + 3 * SIZE], b4 + + +.LL79: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 4, TEMP1 +#endif + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a2, c01, t1 + FSUB c02, t1, c02 + FMUL a3, c01, t1 + FSUB c03, t1, c03 + FMUL a4, c01, t1 + FSUB c04, t1, c04 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c03, t1, c03 + FMUL a3, c02, t1 + FSUB c04, t1, c04 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a2, c03, t1 + FSUB c04, t1, c04 + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c04, c04 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c04, c04 + FMUL a2, c04, t1 + FSUB c03, t1, c03 + FMUL a3, c04, t1 + FSUB c02, t1, c02 + FMUL a4, c04, t1 + FSUB c01, t1, c01 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a2, c03, t1 + FSUB c02, t1, c02 + FMUL a3, c03, t1 + FSUB c01, t1, c01 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c01, t1, c01 + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 + add C3, -1 * SIZE, C3 + add C4, -1 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c03, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C2 + 0 * SIZE] + STF c03, [C3 + 0 * SIZE] + STF c04, [C4 + 0 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 1 * SIZE, C1 + add C2, 1 * SIZE, C2 + add C3, 1 * SIZE, C3 + add C4, 1 * SIZE, C4 +#endif + +#ifdef RT + sll K, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + +.LL99: +#ifdef LN + sll K, 2 + BASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 4, KK +#endif + +#ifdef RT + sub KK, 4, KK +#endif + + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/trsm_kernel_RT_2x8.S b/kernel/sparc/trsm_kernel_RT_2x8.S new file mode 100644 index 0000000..c9f68ab --- /dev/null +++ b/kernel/sparc/trsm_kernel_RT_2x8.S @@ -0,0 +1,3896 @@ +/*********************************************************************/ +/* Copyright 2005-2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define APREFETCHSIZE 24 +#define APREFETCH_CATEGORY 0 + +#define M %i0 +#define N %i1 +#define K %i2 + +#if defined(DOUBLE) && !defined(__64BIT__) +#define A %i5 +#define B %i4 +#else +#define A %i4 +#define B %i5 +#endif + +#define C %o4 +#define LDC %o5 + +#define AO %l0 +#define BO %l1 +#define I %l2 +#define J %l3 +#define L %l4 + +#define C1 %o0 +#define C2 %o1 +#define C3 %o2 +#define C4 %o3 + +#define C5 %l5 +#define C6 %l6 +#define C7 %l7 +#define C8 %i3 + +#define OFFSET %g1 +#define KK %g2 +#define TEMP1 %g3 +#define TEMP2 %g4 +#define AORIG %o7 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define a1 %f32 +#define a2 %f34 +#define a3 %f36 +#define a4 %f38 +#define a5 %f40 + +#define b1 %f42 +#define b2 %f44 +#define b3 %f46 +#define b4 %f48 +#define b5 %f50 +#define b6 %f52 +#define b7 %f54 +#define b8 %f56 +#define b9 %f58 + +#define cc01 0 +#define cc02 2 +#define cc03 4 +#define cc04 6 +#define cc05 8 +#define cc06 10 +#define cc07 12 +#define cc08 14 +#define cc09 16 +#define cc10 18 +#define cc11 20 +#define cc12 22 +#define cc13 24 +#define cc14 26 +#define cc15 28 +#define cc16 30 + +#define aa1 1 +#define aa2 3 +#define aa3 5 +#define aa4 7 +#define aa5 9 + +#define bb1 11 +#define bb2 13 +#define bb3 15 +#define bb4 17 +#define bb5 19 +#define bb6 21 +#define bb7 23 +#define bb8 25 +#define bb9 27 + +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define a1 %f16 +#define a2 %f17 +#define a3 %f18 +#define a4 %f19 +#define a5 %f20 + +#define b1 %f21 +#define b2 %f22 +#define b3 %f23 +#define b4 %f24 +#define b5 %f25 +#define b6 %f26 +#define b7 %f27 +#define b8 %f28 +#define b9 %f29 + +#define cc01 0 +#define cc02 1 +#define cc03 2 +#define cc04 3 +#define cc05 4 +#define cc06 5 +#define cc07 6 +#define cc08 7 +#define cc09 8 +#define cc10 9 +#define cc11 10 +#define cc12 11 +#define cc13 12 +#define cc14 13 +#define cc15 14 +#define cc16 15 + +#define aa1 16 +#define aa2 17 +#define aa3 18 +#define aa4 19 +#define aa5 20 + +#define bb1 21 +#define bb2 22 +#define bb3 23 +#define bb4 24 +#define bb5 25 +#define bb6 26 +#define bb7 27 +#define bb8 28 +#define bb9 29 + +#endif + + .register %g2, #scratch + .register %g3, #scratch + + PROLOGUE + SAVESP + nop + +#ifndef __64BIT__ + +#ifdef DOUBLE + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC + ld [%sp + STACK_START + 40], OFFSET +#else + ld [%sp + STACK_START + 28], C + ld [%sp + STACK_START + 32], LDC + ld [%sp + STACK_START + 36], OFFSET +#endif + st %g1, [%sp + STACK_START + 8] + st %g2, [%sp + STACK_START + 12] + st %g3, [%sp + STACK_START + 16] + st %g4, [%sp + STACK_START + 20] +#else + + ldx [%sp+ STACK_START + 56], C + ldx [%sp+ STACK_START + 64], LDC + ldx [%sp+ STACK_START + 72], OFFSET + + stx %g1, [%sp + STACK_START + 32] + stx %g2, [%sp + STACK_START + 40] + stx %g3, [%sp + STACK_START + 48] + stx %g4, [%sp + STACK_START + 56] +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg OFFSET, KK +#endif + + sll LDC, BASE_SHIFT, LDC + +#ifdef LN + smul M, K, TEMP1 + sll TEMP1, BASE_SHIFT, TEMP1 + add A, TEMP1, A + + sll M, BASE_SHIFT, TEMP1 + add C, TEMP1, C +#endif + +#ifdef RN + neg OFFSET, KK +#endif + +#ifdef RT + smul N, K, TEMP1 + sll TEMP1, BASE_SHIFT, TEMP1 + add B, TEMP1, B + + smul N, LDC, TEMP1 + add C, TEMP1, C + + sub N, OFFSET, KK +#endif + + and N, 1, J + cmp J, 0 + ble,pn %icc, .LL50 + nop + +#ifdef RT + sll K, BASE_SHIFT, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C1, LDC, C +#else + sub C, LDC, C1 + sub C, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL80 + nop + .align 4 + +.LL72: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 0, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + FCLR (cc01) + LDF [BO + 3 * SIZE], b4 + FCLR (cc02) + + prefetch [C1 + 2 * SIZE], 3 + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL75 + nop + +.LL73: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + FMADD (aa2, bb1, cc02, cc02) + LDF [AO + 5 * SIZE], a2 + + LDF [BO + 4 * SIZE], b1 + cmp L, 0 + + FMADD (aa3, bb2, cc01, cc01) + LDF [AO + 6 * SIZE], a3 + FMADD (aa4, bb2, cc02, cc02) + LDF [AO + 7 * SIZE], a4 + + LDF [BO + 5 * SIZE], b2 + add BO, 4 * SIZE, BO + + FMADD (aa1, bb3, cc01, cc01) + LDF [AO + 8 * SIZE], a1 + FMADD (aa2, bb3, cc02, cc02) + LDF [AO + 9 * SIZE], a2 + + LDF [BO + 2 * SIZE], b3 + add AO, 8 * SIZE, AO + + FMADD (aa3, bb4, cc01, cc01) + LDF [AO + 2 * SIZE], a3 + FMADD (aa4, bb4, cc02, cc02) + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL73 + LDF [BO + 3 * SIZE], b4 + .align 4 + +.LL75: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL78 + nop + .align 4 + +.LL77: + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 2 * SIZE], a1 + FMADD (aa2, bb1, cc02, cc02) + LDF [AO + 3 * SIZE], a2 + + LDF [BO + 1 * SIZE], b1 + add L, -1, L + add AO, 2 * SIZE, AO + cmp L, 0 + bg,pt %icc, .LL77 + add BO, 1 * SIZE, BO + .align 4 + +.LL78: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 0, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + + FNMSUB (aa2, cc02, cc01, cc01) + + FMUL a3, c01, c01 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + + FNMSUB (aa2, cc01, cc02, cc02) + + FMUL a3, c02, c02 +#endif + +#if defined(RN) || defined(RT) + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 +#endif + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 0, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL72 + nop + .align 4 + +.LL80: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL89 + nop + +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 0, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [BO + 0 * SIZE], b1 + LDF [AO + 1 * SIZE], a2 + LDF [BO + 1 * SIZE], b2 + LDF [AO + 2 * SIZE], a3 + LDF [BO + 2 * SIZE], b3 + LDF [AO + 3 * SIZE], a4 + LDF [BO + 3 * SIZE], b4 + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL85 + FCLR (cc01) + .align 4 + +.LL83: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + LDF [BO + 4 * SIZE], b1 + + FMADD (aa2, bb2, cc01, cc01) + LDF [AO + 5 * SIZE], a2 + LDF [BO + 5 * SIZE], b2 + + FMADD (aa3, bb3, cc01, cc01) + LDF [AO + 6 * SIZE], a3 + LDF [BO + 6 * SIZE], b3 + + FMADD (aa4, bb4, cc01, cc01) + LDF [AO + 7 * SIZE], a4 + LDF [BO + 7 * SIZE], b4 + + add AO, 4 * SIZE, AO + cmp L, 0 + + bg,pt %icc, .LL83 + add BO, 4 * SIZE, BO + .align 4 + +.LL85: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL88 + nop + .align 4 + +.LL87: + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 1 * SIZE], a1 + LDF [BO + 1 * SIZE], b1 + + add AO, 1 * SIZE, AO + add L, -1, L + cmp L, 0 + bg,pt %icc, .LL87 + add BO, 1 * SIZE, BO + .align 4 + +.LL88: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 0, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + + FSUB a1, c01, c01 +#else + LDF [AO + 0 * SIZE], a1 + + FSUB a1, c01, c01 +#endif + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#if defined(RN) || defined(RT) + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] +#else + STF c01, [AO + 0 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + +#ifdef RT + sll K, BASE_SHIFT + 0, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 0, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + .align 4 + +.LL89: +#ifdef LN + sll K, BASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 1, KK +#endif + +#ifdef RT + sub KK, 1, KK +#endif + .align 4 + +.LL50: + and N, 2, J + cmp J, 0 + ble,pn %icc, .LL30 + nop + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C2 + add C2, LDC, C +#else + sub C, LDC, C2 + sub C2, LDC, C1 + sub C2, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL60 + nop + .align 4 + +.LL52: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 1, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + FCLR (cc01) + LDF [BO + 3 * SIZE], b4 + FCLR (cc02) + + LDF [BO + 4 * SIZE], b5 + FCLR (cc03) + LDF [BO + 5 * SIZE], b6 + FCLR (cc04) + LDF [BO + 6 * SIZE], b7 + FCLR (cc05) + LDF [BO + 7 * SIZE], b8 + FCLR (cc06) + + prefetch [C1 + 2 * SIZE], 3 + FCLR (cc07) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc08) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL55 + nop + .align 4 + +.LL53: + FMADD (aa1, bb1, cc01, cc01) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb1, cc02, cc02) + LDF [BO + 8 * SIZE], b1 + + FMADD (aa1, bb2, cc03, cc03) + LDF [AO + 4 * SIZE], a1 + FMADD (aa2, bb2, cc04, cc04) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb3, cc01, cc01) + LDF [BO + 9 * SIZE], b2 + FMADD (aa4, bb3, cc02, cc02) + LDF [BO + 10 * SIZE], b3 + + FMADD (aa3, bb4, cc03, cc03) + LDF [AO + 6 * SIZE], a3 + FMADD (aa4, bb4, cc04, cc04) + LDF [AO + 7 * SIZE], a4 + + FMADD (aa1, bb5, cc01, cc01) + LDF [BO + 11 * SIZE], b4 + FMADD (aa2, bb5, cc02, cc02) + LDF [BO + 12 * SIZE], b5 + + FMADD (aa1, bb6, cc03, cc03) + LDF [AO + 8 * SIZE], a1 + FMADD (aa2, bb6, cc04, cc04) + LDF [AO + 9 * SIZE], a2 + + FMADD (aa3, bb7, cc01, cc01) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa4, bb7, cc02, cc02) + LDF [BO + 14 * SIZE], b7 + + FMADD (aa3, bb8, cc03, cc03) + LDF [AO + 10 * SIZE], a3 + FMADD (aa4, bb8, cc04, cc04) + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + add L, -1, L + add BO, 8 * SIZE, BO + cmp L, 0 + + bg,pt %icc, .LL53 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL55: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL58 + nop + .align 4 + +.LL57: + FMADD (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD (aa2, bb1, cc02, cc02) + LDF [BO + 2 * SIZE], b1 + + FMADD (aa1, bb2, cc03, cc03) + LDF [AO + 2 * SIZE], a1 + FMADD (aa2, bb2, cc04, cc04) + LDF [AO + 3 * SIZE], a2 + + add AO, 2 * SIZE, AO + cmp L, 0 + add BO, 2 * SIZE, BO + bg,pt %icc, .LL57 + LDF [BO + 1 * SIZE], b2 + .align 4 + +.LL58: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 1, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c02, c02 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + + FNMSUB (aa2, cc02, cc01, cc01) + FNMSUB (aa2, cc04, cc03, cc03) + + FMUL a3, c01, c01 + FMUL a3, c03, c03 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + + FNMSUB (aa2, cc01, cc02, cc02) + FNMSUB (aa2, cc03, cc04, cc04) + + FMUL a3, c02, c02 + FMUL a3, c04, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa2, cc02, cc04, cc04) + + LDF [BO + 3 * SIZE], a1 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + + FMUL a1, c04, c04 + FMUL a1, c03, c03 + + FNMSUB (aa2, cc04, cc02, cc02) + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c02, c02 + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c02, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 +#endif + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 1, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL52 + nop + .align 4 + +.LL60: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL69 + nop + +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 1, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + LDF [BO + 5 * SIZE], b6 + LDF [BO + 6 * SIZE], b7 + FCLR (cc01) + LDF [BO + 7 * SIZE], b8 + FCLR (cc03) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL65 + nop + .align 4 + +.LL63: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 8 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + + LDF [AO + 4 * SIZE], a1 + cmp L, 0 + + FMADD (aa2, bb3, cc01, cc01) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc03, cc03) + LDF [BO + 11 * SIZE], b4 + + LDF [AO + 5 * SIZE], a2 + add AO, 4 * SIZE, AO + + FMADD (aa3, bb5, cc01, cc01) + LDF [BO + 12 * SIZE], b5 + FMADD (aa3, bb6, cc03, cc03) + LDF [BO + 13 * SIZE], b6 + + LDF [AO + 2 * SIZE], a3 + add BO, 8 * SIZE, BO + + FMADD (aa4, bb7, cc01, cc01) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc03, cc03) + LDF [BO + 7 * SIZE], b8 + + bg,pt %icc, .LL63 + LDF [AO + 3 * SIZE], a4 + .align 4 + +.LL65: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL68 + nop + .align 4 + +.LL67: + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 2 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 3 * SIZE], b2 + + LDF [AO + 1 * SIZE], a1 + add L, -1, L + add AO, 1 * SIZE, AO + cmp L, 0 + + bg,pt %icc, .LL67 + add BO, 2 * SIZE, BO + .align 4 + +.LL68: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 1, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 +#endif + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, c01 + + FNMSUB (aa2, cc01, cc03, cc03) + + LDF [BO + 3 * SIZE], a1 + + FMUL a1, c03, c03 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c03, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c03, [C2 + 0 * SIZE] + +#ifdef RT + sll K, BASE_SHIFT + 0, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 1, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + .align 4 + +.LL69: +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 2, KK +#endif + +#ifdef RT + sub KK, 2, KK +#endif + .align 4 + +.LL30: + and N, 4, J + cmp J, 0 + ble,pn %icc, .LL10 + nop + +#ifdef RT + sll K, BASE_SHIFT + 2, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C2 + add C2, LDC, C3 + add C3, LDC, C4 + add C4, LDC, C +#else + sub C, LDC, C4 + sub C4, LDC, C3 + sub C3, LDC, C2 + sub C2, LDC, C1 + sub C2, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL40 + nop + .align 4 + +.LL32: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 2, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + + LDF [BO + 5 * SIZE], b6 + FCLR (cc01) + LDF [BO + 6 * SIZE], b7 + FCLR (cc02) + LDF [BO + 7 * SIZE], b8 + FCLR (cc03) + LDF [BO + 8 * SIZE], b9 + FCLR (cc04) + + prefetch [C1 + 2 * SIZE], 3 + FCLR (cc05) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc06) + prefetch [C3 + 2 * SIZE], 3 + FCLR (cc07) + prefetch [C4 + 2 * SIZE], 3 + FCLR (cc08) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL35 + nop + .align 4 + +.LL33: + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 2 * SIZE], a3 + FMADD (aa2, bb1, cc02, cc02) + LDF [AO + 3 * SIZE], a4 + + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 16 * SIZE], b1 + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb3, cc06, cc06) + add L, -1, L + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa3, bb5, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + FMADD (aa4, bb5, cc02, cc02) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb6, cc03, cc03) + LDF [BO + 12 * SIZE], b5 + FMADD (aa4, bb6, cc04, cc04) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa3, bb7, cc05, cc05) + cmp L, 0 + FMADD (aa4, bb7, cc06, cc06) + add AO, 8 * SIZE, AO + + FMADD (aa3, bb8, cc07, cc07) + LDF [BO + 14 * SIZE], b7 + FMADD (aa4, bb8, cc08, cc08) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa1, bb9, cc01, cc01) + LDF [AO - 2 * SIZE], a3 + FMADD (aa2, bb9, cc02, cc02) + LDF [AO - 1 * SIZE], a4 + + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 24 * SIZE], b9 + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + add BO, 16 * SIZE, BO + FMADD (aa2, bb3, cc06, cc06) + nop + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD (aa3, bb5, cc01, cc01) + LDF [AO + 0 * SIZE], a1 + FMADD (aa4, bb5, cc02, cc02) + LDF [AO + 1 * SIZE], a2 + FMADD (aa3, bb6, cc03, cc03) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb6, cc04, cc04) + LDF [BO + 5 * SIZE], b6 + + FMADD (aa3, bb7, cc05, cc05) + nop + FMADD (aa4, bb7, cc06, cc06) + LDF [BO + 6 * SIZE], b7 + + FMADD (aa3, bb8, cc07, cc07) + FMADD (aa4, bb8, cc08, cc08) + bg,pt %icc, .LL33 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL35: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL38 + nop + .align 4 + +.LL37: + FMADD (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD (aa2, bb1, cc02, cc02) + LDF [BO + 4 * SIZE], b1 + + FMADD (aa1, bb2, cc03, cc03) + add AO, 2 * SIZE, AO + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 5 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + cmp L, 0 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 6 * SIZE], b3 + + FMADD (aa1, bb4, cc07, cc07) + LDF [AO + 0 * SIZE], a1 + FMADD (aa2, bb4, cc08, cc08) + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 7 * SIZE], b4 + bg,pt %icc, .LL37 + add BO, 4 * SIZE, BO + .align 4 + +.LL38: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 4, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 2, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c02, c02 + FSUB b2, c04, c04 + FSUB b3, c06, c06 + FSUB b4, c08, c08 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 + +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + FMUL a1, c06, c06 + FMUL a1, c08, c08 + + FNMSUB (aa2, cc02, cc01, cc01) + FNMSUB (aa2, cc04, cc03, cc03) + FNMSUB (aa2, cc06, cc05, cc05) + FNMSUB (aa2, cc08, cc07, cc07) + + FMUL a3, c01, c01 + FMUL a3, c03, c03 + FMUL a3, c05, c05 + FMUL a3, c07, c07 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 + + FNMSUB (aa2, cc01, cc02, cc02) + FNMSUB (aa2, cc03, cc04, cc04) + FNMSUB (aa2, cc05, cc06, cc06) + FNMSUB (aa2, cc07, cc08, cc08) + + FMUL a3, c02, c02 + FMUL a3, c04, c04 + FMUL a3, c06, c06 + FMUL a3, c08, c08 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa2, cc02, cc04, cc04) + FNMSUB (aa3, cc01, cc05, cc05) + FNMSUB (aa3, cc02, cc06, cc06) + FNMSUB (aa4, cc01, cc07, cc07) + FNMSUB (aa4, cc02, cc08, cc08) + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FNMSUB (aa2, cc03, cc05, cc05) + FNMSUB (aa2, cc04, cc06, cc06) + FNMSUB (aa3, cc03, cc07, cc07) + FNMSUB (aa3, cc04, cc08, cc08) + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + + FNMSUB (aa2, cc05, cc07, cc07) + FNMSUB (aa2, cc06, cc08, cc08) + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c07, c07 + FMUL a1, c08, c08 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c08, c08 + FMUL a1, c07, c07 + + FNMSUB (aa2, cc08, cc06, cc06) + FNMSUB (aa2, cc07, cc05, cc05) + FNMSUB (aa3, cc08, cc04, cc04) + FNMSUB (aa3, cc07, cc03, cc03) + FNMSUB (aa4, cc08, cc02, cc02) + FNMSUB (aa4, cc07, cc01, cc01) + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c06, c06 + FMUL a1, c05, c05 + + FNMSUB (aa2, cc06, cc04, cc04) + FNMSUB (aa2, cc05, cc03, cc03) + FNMSUB (aa3, cc06, cc02, cc02) + FNMSUB (aa3, cc05, cc01, cc01) + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c04, c04 + FMUL a1, c03, c03 + + FNMSUB (aa2, cc04, cc02, cc02) + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c02, c02 + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 + add C3, -2 * SIZE, C3 + add C4, -2 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] + + STF c02, [BO + 4 * SIZE] + STF c04, [BO + 5 * SIZE] + STF c06, [BO + 6 * SIZE] + STF c08, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + + STF c05, [C3 + 0 * SIZE] + STF c06, [C3 + 1 * SIZE] + STF c07, [C4 + 0 * SIZE] + STF c08, [C4 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 + add C3, 2 * SIZE, C3 + add C4, 2 * SIZE, C4 +#endif + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 2, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL32 + nop + +.LL40: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL49 + nop + +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 2, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + LDF [BO + 5 * SIZE], b6 + FCLR (cc01) + LDF [BO + 6 * SIZE], b7 + FCLR (cc03) + LDF [BO + 7 * SIZE], b8 + FCLR (cc05) + LDF [BO + 8 * SIZE], b9 + FCLR (cc07) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL45 + nop + +.LL43: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 16 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 10 * SIZE], b3 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 11 * SIZE], b4 + + LDF [AO + 4 * SIZE], a1 + cmp L, 0 + + FMADD (aa2, bb5, cc01, cc01) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb6, cc03, cc03) + LDF [BO + 13 * SIZE], b6 + FMADD (aa2, bb7, cc05, cc05) + LDF [BO + 14 * SIZE], b7 + FMADD (aa2, bb8, cc07, cc07) + LDF [BO + 15 * SIZE], b8 + + LDF [AO + 5 * SIZE], a2 + add AO, 4 * SIZE, AO + + FMADD (aa3, bb9, cc01, cc01) + LDF [BO + 24 * SIZE], b9 + FMADD (aa3, bb2, cc03, cc03) + LDF [BO + 17 * SIZE], b2 + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 18 * SIZE], b3 + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 19 * SIZE], b4 + + LDF [AO + 2 * SIZE], a3 + add BO, 16 * SIZE, BO + + FMADD (aa4, bb5, cc01, cc01) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb6, cc03, cc03) + LDF [BO + 5 * SIZE], b6 + FMADD (aa4, bb7, cc05, cc05) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc07, cc07) + LDF [BO + 7 * SIZE], b8 + + bg,pt %icc, .LL43 + LDF [AO + 3 * SIZE], a4 + .align 4 + +.LL45: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL48 + nop + .align 4 + +.LL47: + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 4 * SIZE], b1 + add L, -1, L + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 5 * SIZE], b2 + add AO, 1 * SIZE, AO + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 6 * SIZE], b3 + cmp L, 0 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 7 * SIZE], b4 + add BO, 4 * SIZE, BO + + bg,pt %icc, .LL47 + LDF [AO + 0 * SIZE], a1 + .align 4 + +.LL48: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 4, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 2, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 +#endif + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa3, cc01, cc05, cc05) + FNMSUB (aa4, cc01, cc07, cc07) + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc05, cc05) + FNMSUB (aa3, cc03, cc07, cc07) + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c05, c05 + + FNMSUB (aa2, cc05, cc07, cc07) + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c07, c07 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c07, c07 + + FNMSUB (aa2, cc07, cc05, cc05) + FNMSUB (aa3, cc07, cc03, cc03) + FNMSUB (aa4, cc07, cc01, cc01) + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c05, c05 + + FNMSUB (aa2, cc05, cc03, cc03) + FNMSUB (aa3, cc05, cc01, cc01) + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 + add C3, -1 * SIZE, C3 + add C4, -1 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c03, [AO + 1 * SIZE] + STF c05, [AO + 2 * SIZE] + STF c07, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c05, [C3 + 0 * SIZE] + STF c07, [C4 + 0 * SIZE] + +#ifdef RT + sll K, BASE_SHIFT + 0, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 2, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + .align 4 + +.LL49: +#ifdef LN + sll K, BASE_SHIFT + 2, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 4, KK +#endif + +#ifdef RT + sub KK, 4, KK +#endif + .align 4 + +.LL10: + sra N, 3, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + .align 4 + +.LL11: +#ifdef RT + sll K, BASE_SHIFT + 3, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C2 + add C2, LDC, C3 + add C3, LDC, C4 + add C4, LDC, C5 + add C5, LDC, C6 + add C6, LDC, C7 + add C7, LDC, C8 + add C8, LDC, C +#else + sub C, LDC, C8 + sub C8, LDC, C7 + sub C7, LDC, C6 + sub C6, LDC, C5 + sub C5, LDC, C4 + sub C4, LDC, C3 + sub C3, LDC, C2 + sub C2, LDC, C1 + sub C2, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL20 + nop + .align 4 + +.LL12: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 3, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 8 * SIZE], a5 + + LDF [BO + 0 * SIZE], b1 + + LDF [BO + 1 * SIZE], b2 + FCLR (cc01) + LDF [BO + 2 * SIZE], b3 + FCLR (cc05) + LDF [BO + 3 * SIZE], b4 + FCLR (cc09) + LDF [BO + 4 * SIZE], b5 + FCLR (cc13) + + LDF [BO + 5 * SIZE], b6 + FCLR (cc02) + LDF [BO + 6 * SIZE], b7 + FCLR (cc06) + LDF [BO + 7 * SIZE], b8 + FCLR (cc10) + LDF [BO + 8 * SIZE], b9 + FCLR (cc14) + + prefetch [C1 + 1 * SIZE], 3 + FCLR (cc03) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc07) + prefetch [C3 + 1 * SIZE], 3 + FCLR (cc11) + prefetch [C4 + 2 * SIZE], 3 + FCLR (cc15) + + prefetch [C5 + 1 * SIZE], 3 + FCLR (cc04) + prefetch [C6 + 2 * SIZE], 3 + FCLR (cc08) + prefetch [C7 + 1 * SIZE], 3 + FCLR (cc12) + prefetch [C8 + 2 * SIZE], 3 + FCLR (cc16) + +#if defined(LT) || defined(RN) + sra KK, 3, L +#else + sub K, KK, L + sra L, 3, L +#endif + cmp L, 0 + ble,pn %icc, .LL15 + nop + .align 4 + +.LL13: + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 16 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 2 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 3 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 14 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 24 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 18 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 19 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 4 * SIZE], a1 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + add L, -1, L + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 20 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 21 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 22 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 23 * SIZE], b8 + + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 32 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 25 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 26 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 27 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 6 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 7 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + nop + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 28 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 29 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 30 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 31 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 40 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 33 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 34 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 35 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 16 * SIZE], a1 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 9 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + nop + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 36 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 37 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 38 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 39 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 48 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 41 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 42 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 43 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 10 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 11 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO + 44 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 45 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO + 46 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 47 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 56 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 49 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 50 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 51 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 12 * SIZE], a5 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 13 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + cmp L, 0 + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 52 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 53 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 54 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 55 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 64 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 57 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 58 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 59 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 14 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 15 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + add BO, 64 * SIZE, BO + FMADD (aa2, bb6, cc12, cc12) + add AO, 16 * SIZE, AO + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO - 4 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO - 3 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO - 2 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO - 1 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 8 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 1 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 8 * SIZE], a5 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 1 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + FMADD (aa4, bb6, cc12, cc12) + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 5 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + ble,pn %icc, .LL15 + LDF [BO + 7 * SIZE], b8 + + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 16 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 2 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 3 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 14 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 24 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 18 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 19 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 4 * SIZE], a1 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + add L, -1, L + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 20 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 21 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 22 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 23 * SIZE], b8 + + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 32 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 25 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 26 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 27 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 6 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 7 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + nop + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 28 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 29 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 30 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 31 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 40 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 33 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 34 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 35 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 16 * SIZE], a1 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 9 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + nop + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 36 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 37 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 38 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 39 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 48 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 41 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 42 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 43 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 10 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 11 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO + 44 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 45 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO + 46 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 47 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 56 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 49 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 50 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 51 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 12 * SIZE], a5 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 13 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + cmp L, 0 + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 52 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 53 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 54 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 55 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 64 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 57 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 58 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 59 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 14 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 15 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + add BO, 64 * SIZE, BO + FMADD (aa2, bb6, cc12, cc12) + add AO, 16 * SIZE, AO + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO - 4 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO - 3 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO - 2 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO - 1 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 8 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 1 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 8 * SIZE], a5 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 1 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + FMADD (aa4, bb6, cc12, cc12) + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 5 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + bg,pt %icc, .LL13 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL15: +#if defined(LT) || defined(RN) + and KK, 7, L +#else + sub K, KK, L + and L, 7, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL18 + nop + .align 4 + +.LL17: + FMADD (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD (aa2, bb1, cc02, cc02) + nop + + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 8 * SIZE], b1 + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + cmp L, 0 + FMADD (aa2, bb3, cc06, cc06) + nop + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + nop + FMADD (aa2, bb5, cc10, cc10) + nop + + FMADD (aa1, bb6, cc11, cc11) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb6, cc12, cc12) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb7, cc13, cc13) + add AO, 2 * SIZE, AO + FMADD (aa2, bb7, cc14, cc14) + add BO, 8 * SIZE, BO + + FMADD (aa1, bb8, cc15, cc15) + LDF [AO + 0 * SIZE], a1 + FMADD (aa2, bb8, cc16, cc16) + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 6 * SIZE], b7 + bg,pt %icc, .LL17 + LDF [BO + 7 * SIZE], b8 + nop + .align 4 + +.LL18: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 8, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 3, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c09, c09 + FSUB b2, c11, c11 + FSUB b3, c13, c13 + FSUB b4, c15, c15 + + LDF [BO + 8 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 10 * SIZE], a3 + LDF [BO + 11 * SIZE], a4 + + LDF [BO + 12 * SIZE], b1 + LDF [BO + 13 * SIZE], b2 + LDF [BO + 14 * SIZE], b3 + LDF [BO + 15 * SIZE], b4 + + FSUB a1, c02, c02 + FSUB a2, c04, c04 + FSUB a3, c06, c06 + FSUB a4, c08, c08 + + FSUB b1, c10, c10 + FSUB b2, c12, c12 + FSUB b3, c14, c14 + FSUB b4, c16, c16 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 + + LDF [AO + 8 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 10 * SIZE], a3 + LDF [AO + 11 * SIZE], a4 + + LDF [AO + 12 * SIZE], b1 + LDF [AO + 13 * SIZE], b2 + LDF [AO + 14 * SIZE], b3 + LDF [AO + 15 * SIZE], b4 + + FSUB a1, c09, c09 + FSUB a2, c10, c10 + FSUB a3, c11, c11 + FSUB a4, c12, c12 + + FSUB b1, c13, c13 + FSUB b2, c14, c14 + FSUB b3, c15, c15 + FSUB b4, c16, c16 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + FMUL a1, c06, c06 + FMUL a1, c08, c08 + FMUL a1, c10, c10 + FMUL a1, c12, c12 + FMUL a1, c14, c14 + FMUL a1, c16, c16 + + FNMSUB (aa2, cc02, cc01, cc01) + FNMSUB (aa2, cc04, cc03, cc03) + FNMSUB (aa2, cc06, cc05, cc05) + FNMSUB (aa2, cc08, cc07, cc07) + FNMSUB (aa2, cc10, cc09, cc09) + FNMSUB (aa2, cc12, cc11, cc11) + FNMSUB (aa2, cc14, cc13, cc13) + FNMSUB (aa2, cc16, cc15, cc15) + + FMUL a3, c01, c01 + FMUL a3, c03, c03 + FMUL a3, c05, c05 + FMUL a3, c07, c07 + FMUL a3, c09, c09 + FMUL a3, c11, c11 + FMUL a3, c13, c13 + FMUL a3, c15, c15 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 + FMUL a1, c09, c09 + FMUL a1, c11, c11 + FMUL a1, c13, c13 + FMUL a1, c15, c15 + + FNMSUB (aa2, cc01, cc02, cc02) + FNMSUB (aa2, cc03, cc04, cc04) + FNMSUB (aa2, cc05, cc06, cc06) + FNMSUB (aa2, cc07, cc08, cc08) + FNMSUB (aa2, cc09, cc10, cc10) + FNMSUB (aa2, cc11, cc12, cc12) + FNMSUB (aa2, cc13, cc14, cc14) + FNMSUB (aa2, cc15, cc16, cc16) + + FMUL a3, c02, c02 + FMUL a3, c04, c04 + FMUL a3, c06, c06 + FMUL a3, c08, c08 + FMUL a3, c10, c10 + FMUL a3, c12, c12 + FMUL a3, c14, c14 + FMUL a3, c16, c16 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa2, cc02, cc04, cc04) + FNMSUB (aa3, cc01, cc05, cc05) + FNMSUB (aa3, cc02, cc06, cc06) + FNMSUB (aa4, cc01, cc07, cc07) + FNMSUB (aa4, cc02, cc08, cc08) + FNMSUB (bb1, cc01, cc09, cc09) + FNMSUB (bb1, cc02, cc10, cc10) + FNMSUB (bb2, cc01, cc11, cc11) + FNMSUB (bb2, cc02, cc12, cc12) + FNMSUB (bb3, cc01, cc13, cc13) + FNMSUB (bb3, cc02, cc14, cc14) + FNMSUB (bb4, cc01, cc15, cc15) + FNMSUB (bb4, cc02, cc16, cc16) + + LDF [BO + 9 * SIZE], a1 + LDF [BO + 10 * SIZE], a2 + LDF [BO + 11 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + LDF [BO + 13 * SIZE], b1 + LDF [BO + 14 * SIZE], b2 + LDF [BO + 15 * SIZE], b3 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FNMSUB (aa2, cc03, cc05, cc05) + FNMSUB (aa2, cc04, cc06, cc06) + FNMSUB (aa3, cc03, cc07, cc07) + FNMSUB (aa3, cc04, cc08, cc08) + FNMSUB (aa4, cc03, cc09, cc09) + FNMSUB (aa4, cc04, cc10, cc10) + FNMSUB (bb1, cc03, cc11, cc11) + FNMSUB (bb1, cc04, cc12, cc12) + FNMSUB (bb2, cc03, cc13, cc13) + FNMSUB (bb2, cc04, cc14, cc14) + FNMSUB (bb3, cc03, cc15, cc15) + FNMSUB (bb3, cc04, cc16, cc16) + + LDF [BO + 18 * SIZE], a1 + LDF [BO + 19 * SIZE], a2 + LDF [BO + 20 * SIZE], a3 + LDF [BO + 21 * SIZE], a4 + LDF [BO + 22 * SIZE], b1 + LDF [BO + 23 * SIZE], b2 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + + FNMSUB (aa2, cc05, cc07, cc07) + FNMSUB (aa2, cc06, cc08, cc08) + FNMSUB (aa3, cc05, cc09, cc09) + FNMSUB (aa3, cc06, cc10, cc10) + FNMSUB (aa4, cc05, cc11, cc11) + FNMSUB (aa4, cc06, cc12, cc12) + FNMSUB (bb1, cc05, cc13, cc13) + FNMSUB (bb1, cc06, cc14, cc14) + FNMSUB (bb2, cc05, cc15, cc15) + FNMSUB (bb2, cc06, cc16, cc16) + + LDF [BO + 27 * SIZE], a1 + LDF [BO + 28 * SIZE], a2 + LDF [BO + 29 * SIZE], a3 + LDF [BO + 30 * SIZE], a4 + LDF [BO + 31 * SIZE], b1 + + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FNMSUB (aa2, cc07, cc09, cc09) + FNMSUB (aa2, cc08, cc10, cc10) + FNMSUB (aa3, cc07, cc11, cc11) + FNMSUB (aa3, cc08, cc12, cc12) + FNMSUB (aa4, cc07, cc13, cc13) + FNMSUB (aa4, cc08, cc14, cc14) + FNMSUB (bb1, cc07, cc15, cc15) + FNMSUB (bb1, cc08, cc16, cc16) + + LDF [BO + 36 * SIZE], a1 + LDF [BO + 37 * SIZE], a2 + LDF [BO + 38 * SIZE], a3 + LDF [BO + 39 * SIZE], a4 + + FMUL a1, c09, c09 + FMUL a1, c10, c10 + + FNMSUB (aa2, cc09, cc11, cc11) + FNMSUB (aa2, cc10, cc12, cc12) + FNMSUB (aa3, cc09, cc13, cc13) + FNMSUB (aa3, cc10, cc14, cc14) + FNMSUB (aa4, cc09, cc15, cc15) + FNMSUB (aa4, cc10, cc16, cc16) + + LDF [BO + 45 * SIZE], a1 + LDF [BO + 46 * SIZE], a2 + LDF [BO + 47 * SIZE], a3 + + FMUL a1, c11, c11 + FMUL a1, c12, c12 + + FNMSUB (aa2, cc11, cc13, cc13) + FNMSUB (aa2, cc12, cc14, cc14) + FNMSUB (aa3, cc11, cc15, cc15) + FNMSUB (aa3, cc12, cc16, cc16) + + LDF [BO + 54 * SIZE], a1 + LDF [BO + 55 * SIZE], a2 + + FMUL a1, c13, c13 + FMUL a1, c14, c14 + + FNMSUB (aa2, cc13, cc15, cc15) + FNMSUB (aa2, cc14, cc16, cc16) + + LDF [BO + 63 * SIZE], a1 + + FMUL a1, c15, c15 + FMUL a1, c16, c16 +#endif + +#ifdef RT + LDF [BO + 63 * SIZE], a1 + LDF [BO + 62 * SIZE], a2 + LDF [BO + 61 * SIZE], a3 + LDF [BO + 60 * SIZE], a4 + LDF [BO + 59 * SIZE], b1 + LDF [BO + 58 * SIZE], b2 + LDF [BO + 57 * SIZE], b3 + LDF [BO + 56 * SIZE], b4 + + FMUL a1, c16, c16 + FMUL a1, c15, c15 + + FNMSUB (aa2, cc16, cc14, cc14) + FNMSUB (aa2, cc15, cc13, cc13) + FNMSUB (aa3, cc16, cc12, cc12) + FNMSUB (aa3, cc15, cc11, cc11) + FNMSUB (aa4, cc16, cc10, cc10) + FNMSUB (aa4, cc15, cc09, cc09) + FNMSUB (bb1, cc16, cc08, cc08) + FNMSUB (bb1, cc15, cc07, cc07) + FNMSUB (bb2, cc16, cc06, cc06) + FNMSUB (bb2, cc15, cc05, cc05) + FNMSUB (bb3, cc16, cc04, cc04) + FNMSUB (bb3, cc15, cc03, cc03) + FNMSUB (bb4, cc16, cc02, cc02) + FNMSUB (bb4, cc15, cc01, cc01) + + LDF [BO + 54 * SIZE], a1 + LDF [BO + 53 * SIZE], a2 + LDF [BO + 52 * SIZE], a3 + LDF [BO + 51 * SIZE], a4 + LDF [BO + 50 * SIZE], b1 + LDF [BO + 49 * SIZE], b2 + LDF [BO + 48 * SIZE], b3 + + FMUL a1, c14, c14 + FMUL a1, c13, c13 + + FNMSUB (aa2, cc14, cc12, cc12) + FNMSUB (aa2, cc13, cc11, cc11) + FNMSUB (aa3, cc14, cc10, cc10) + FNMSUB (aa3, cc13, cc09, cc09) + FNMSUB (aa4, cc14, cc08, cc08) + FNMSUB (aa4, cc13, cc07, cc07) + FNMSUB (bb1, cc14, cc06, cc06) + FNMSUB (bb1, cc13, cc05, cc05) + FNMSUB (bb2, cc14, cc04, cc04) + FNMSUB (bb2, cc13, cc03, cc03) + FNMSUB (bb3, cc14, cc02, cc02) + FNMSUB (bb3, cc13, cc01, cc01) + + LDF [BO + 45 * SIZE], a1 + LDF [BO + 44 * SIZE], a2 + LDF [BO + 43 * SIZE], a3 + LDF [BO + 42 * SIZE], a4 + LDF [BO + 41 * SIZE], b1 + LDF [BO + 40 * SIZE], b2 + + FMUL a1, c12, c12 + FMUL a1, c11, c11 + + FNMSUB (aa2, cc12, cc10, cc10) + FNMSUB (aa2, cc11, cc09, cc09) + FNMSUB (aa3, cc12, cc08, cc08) + FNMSUB (aa3, cc11, cc07, cc07) + FNMSUB (aa4, cc12, cc06, cc06) + FNMSUB (aa4, cc11, cc05, cc05) + FNMSUB (bb1, cc12, cc04, cc04) + FNMSUB (bb1, cc11, cc03, cc03) + FNMSUB (bb2, cc12, cc02, cc02) + FNMSUB (bb2, cc11, cc01, cc01) + + LDF [BO + 36 * SIZE], a1 + LDF [BO + 35 * SIZE], a2 + LDF [BO + 34 * SIZE], a3 + LDF [BO + 33 * SIZE], a4 + LDF [BO + 32 * SIZE], b1 + + FMUL a1, c10, c10 + FMUL a1, c09, c09 + + FNMSUB (aa2, cc10, cc08, cc08) + FNMSUB (aa2, cc09, cc07, cc07) + FNMSUB (aa3, cc10, cc06, cc06) + FNMSUB (aa3, cc09, cc05, cc05) + FNMSUB (aa4, cc10, cc04, cc04) + FNMSUB (aa4, cc09, cc03, cc03) + FNMSUB (bb1, cc10, cc02, cc02) + FNMSUB (bb1, cc09, cc01, cc01) + + LDF [BO + 27 * SIZE], a1 + LDF [BO + 26 * SIZE], a2 + LDF [BO + 25 * SIZE], a3 + LDF [BO + 24 * SIZE], a4 + + FMUL a1, c08, c08 + FMUL a1, c07, c07 + + FNMSUB (aa2, cc08, cc06, cc06) + FNMSUB (aa2, cc07, cc05, cc05) + FNMSUB (aa3, cc08, cc04, cc04) + FNMSUB (aa3, cc07, cc03, cc03) + FNMSUB (aa4, cc08, cc02, cc02) + FNMSUB (aa4, cc07, cc01, cc01) + + LDF [BO + 18 * SIZE], a1 + LDF [BO + 17 * SIZE], a2 + LDF [BO + 16 * SIZE], a3 + + FMUL a1, c06, c06 + FMUL a1, c05, c05 + + FNMSUB (aa2, cc06, cc04, cc04) + FNMSUB (aa2, cc05, cc03, cc03) + FNMSUB (aa3, cc06, cc02, cc02) + FNMSUB (aa3, cc05, cc01, cc01) + + LDF [BO + 9 * SIZE], a1 + LDF [BO + 8 * SIZE], a2 + + FMUL a1, c04, c04 + FMUL a1, c03, c03 + + FNMSUB (aa2, cc04, cc02, cc02) + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c02, c02 + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 + add C3, -2 * SIZE, C3 + add C4, -2 * SIZE, C4 + add C5, -2 * SIZE, C5 + add C6, -2 * SIZE, C6 + add C7, -2 * SIZE, C7 + add C8, -2 * SIZE, C8 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] + + STF c09, [BO + 4 * SIZE] + STF c11, [BO + 5 * SIZE] + STF c13, [BO + 6 * SIZE] + STF c15, [BO + 7 * SIZE] + + STF c02, [BO + 8 * SIZE] + STF c04, [BO + 9 * SIZE] + STF c06, [BO + 10 * SIZE] + STF c08, [BO + 11 * SIZE] + + STF c10, [BO + 12 * SIZE] + STF c12, [BO + 13 * SIZE] + STF c14, [BO + 14 * SIZE] + STF c16, [BO + 15 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] + + STF c09, [AO + 8 * SIZE] + STF c10, [AO + 9 * SIZE] + STF c11, [AO + 10 * SIZE] + STF c12, [AO + 11 * SIZE] + + STF c13, [AO + 12 * SIZE] + STF c14, [AO + 13 * SIZE] + STF c15, [AO + 14 * SIZE] + STF c16, [AO + 15 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + + STF c05, [C3 + 0 * SIZE] + STF c06, [C3 + 1 * SIZE] + STF c07, [C4 + 0 * SIZE] + STF c08, [C4 + 1 * SIZE] + + STF c09, [C5 + 0 * SIZE] + STF c10, [C5 + 1 * SIZE] + STF c11, [C6 + 0 * SIZE] + STF c12, [C6 + 1 * SIZE] + + STF c13, [C7 + 0 * SIZE] + STF c14, [C7 + 1 * SIZE] + STF c15, [C8 + 0 * SIZE] + STF c16, [C8 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 + add C3, 2 * SIZE, C3 + add C4, 2 * SIZE, C4 + add C5, 2 * SIZE, C5 + add C6, 2 * SIZE, C6 + add C7, 2 * SIZE, C7 + add C8, 2 * SIZE, C8 +#endif + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 3, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL12 + nop + .align 4 + +.LL20: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL29 + nop + +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 3, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + FCLR (cc01) + LDF [BO + 1 * SIZE], b2 + FCLR (cc03) + LDF [BO + 2 * SIZE], b3 + FCLR (cc05) + LDF [BO + 3 * SIZE], b4 + FCLR (cc07) + LDF [BO + 4 * SIZE], b5 + FCLR (cc09) + LDF [BO + 5 * SIZE], b6 + FCLR (cc11) + LDF [BO + 6 * SIZE], b7 + FCLR (cc13) + LDF [BO + 7 * SIZE], b8 + FCLR (cc15) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL25 + LDF [BO + 8 * SIZE], b9 + .align 4 + +.LL23: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 16 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 10 * SIZE], b3 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [BO + 12 * SIZE], b5 + FMADD (aa1, bb6, cc11, cc11) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 14 * SIZE], b7 + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa2, bb9, cc01, cc01) + LDF [BO + 24 * SIZE], b9 + FMADD (aa2, bb2, cc03, cc03) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa2, bb3, cc05, cc05) + LDF [BO + 18 * SIZE], b3 + FMADD (aa2, bb4, cc07, cc07) + LDF [BO + 19 * SIZE], b4 + + FMADD (aa2, bb5, cc09, cc09) + LDF [BO + 20 * SIZE], b5 + FMADD (aa2, bb6, cc11, cc11) + LDF [BO + 21 * SIZE], b6 + + FMADD (aa2, bb7, cc13, cc13) + LDF [BO + 22 * SIZE], b7 + FMADD (aa2, bb8, cc15, cc15) + LDF [BO + 23 * SIZE], b8 + + LDF [AO + 4 * SIZE], a1 + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb1, cc01, cc01) + LDF [BO + 32 * SIZE], b1 + FMADD (aa3, bb2, cc03, cc03) + LDF [BO + 25 * SIZE], b2 + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 26 * SIZE], b3 + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 27 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [BO + 28 * SIZE], b5 + FMADD (aa3, bb6, cc11, cc11) + LDF [BO + 29 * SIZE], b6 + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 30 * SIZE], b7 + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 31 * SIZE], b8 + + FMADD (aa4, bb9, cc01, cc01) + LDF [BO + 40 * SIZE], b9 + FMADD (aa4, bb2, cc03, cc03) + LDF [BO + 33 * SIZE], b2 + + FMADD (aa4, bb3, cc05, cc05) + LDF [BO + 34 * SIZE], b3 + FMADD (aa4, bb4, cc07, cc07) + LDF [BO + 35 * SIZE], b4 + + FMADD (aa4, bb5, cc09, cc09) + LDF [BO + 36 * SIZE], b5 + FMADD (aa4, bb6, cc11, cc11) + LDF [BO + 37 * SIZE], b6 + + FMADD (aa4, bb7, cc13, cc13) + LDF [BO + 38 * SIZE], b7 + FMADD (aa4, bb8, cc15, cc15) + LDF [BO + 39 * SIZE], b8 + + LDF [AO + 6 * SIZE], a3 + LDF [AO + 7 * SIZE], a4 + + add AO, 4 * SIZE, AO + cmp L, 0 + bg,pt %icc, .LL23 + add BO, 32 * SIZE, BO + .align 4 + +.LL25: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL28 + nop + .align 4 + +.LL27: + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 8 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 10 * SIZE], b3 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [BO + 12 * SIZE], b5 + FMADD (aa1, bb6, cc11, cc11) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 14 * SIZE], b7 + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 15 * SIZE], b8 + + LDF [AO + 1 * SIZE], a1 + add AO, 1 * SIZE, AO + + add L, -1, L + cmp L, 0 + bg,pt %icc, .LL27 + add BO, 8 * SIZE, BO + .align 4 + +.LL28: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 8, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 3, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c09, c09 + FSUB b2, c11, c11 + FSUB b3, c13, c13 + FSUB b4, c15, c15 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c09, c09 + FSUB b2, c11, c11 + FSUB b3, c13, c13 + FSUB b4, c15, c15 +#endif + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 + FMUL a1, c09, c09 + FMUL a1, c11, c11 + FMUL a1, c13, c13 + FMUL a1, c15, c15 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FMUL a1, c01, c01 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa3, cc01, cc05, cc05) + FNMSUB (aa4, cc01, cc07, cc07) + FNMSUB (bb1, cc01, cc09, cc09) + FNMSUB (bb2, cc01, cc11, cc11) + FNMSUB (bb3, cc01, cc13, cc13) + FNMSUB (bb4, cc01, cc15, cc15) + + LDF [BO + 9 * SIZE], a1 + LDF [BO + 10 * SIZE], a2 + LDF [BO + 11 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + LDF [BO + 13 * SIZE], b1 + LDF [BO + 14 * SIZE], b2 + LDF [BO + 15 * SIZE], b3 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc05, cc05) + FNMSUB (aa3, cc03, cc07, cc07) + FNMSUB (aa4, cc03, cc09, cc09) + FNMSUB (bb1, cc03, cc11, cc11) + FNMSUB (bb2, cc03, cc13, cc13) + FNMSUB (bb3, cc03, cc15, cc15) + + LDF [BO + 18 * SIZE], a1 + LDF [BO + 19 * SIZE], a2 + LDF [BO + 20 * SIZE], a3 + LDF [BO + 21 * SIZE], a4 + LDF [BO + 22 * SIZE], b1 + LDF [BO + 23 * SIZE], b2 + + FMUL a1, c05, c05 + + FNMSUB (aa2, cc05, cc07, cc07) + FNMSUB (aa3, cc05, cc09, cc09) + FNMSUB (aa4, cc05, cc11, cc11) + FNMSUB (bb1, cc05, cc13, cc13) + FNMSUB (bb2, cc05, cc15, cc15) + + LDF [BO + 27 * SIZE], a1 + LDF [BO + 28 * SIZE], a2 + LDF [BO + 29 * SIZE], a3 + LDF [BO + 30 * SIZE], a4 + LDF [BO + 31 * SIZE], b1 + + FMUL a1, c07, c07 + + FNMSUB (aa2, cc07, cc09, cc09) + FNMSUB (aa3, cc07, cc11, cc11) + FNMSUB (aa4, cc07, cc13, cc13) + FNMSUB (bb1, cc07, cc15, cc15) + + LDF [BO + 36 * SIZE], a1 + LDF [BO + 37 * SIZE], a2 + LDF [BO + 38 * SIZE], a3 + LDF [BO + 39 * SIZE], a4 + + FMUL a1, c09, c09 + + FNMSUB (aa2, cc09, cc11, cc11) + FNMSUB (aa3, cc09, cc13, cc13) + FNMSUB (aa4, cc09, cc15, cc15) + + LDF [BO + 45 * SIZE], a1 + LDF [BO + 46 * SIZE], a2 + LDF [BO + 47 * SIZE], a3 + + FMUL a1, c11, c11 + + FNMSUB (aa2, cc11, cc13, cc13) + FNMSUB (aa3, cc11, cc15, cc15) + + LDF [BO + 54 * SIZE], a1 + LDF [BO + 55 * SIZE], a2 + + FMUL a1, c13, c13 + + FNMSUB (aa2, cc13, cc15, cc15) + + LDF [BO + 63 * SIZE], a1 + + FMUL a1, c15, c15 +#endif + +#ifdef RT + LDF [BO + 63 * SIZE], a1 + LDF [BO + 62 * SIZE], a2 + LDF [BO + 61 * SIZE], a3 + LDF [BO + 60 * SIZE], a4 + LDF [BO + 59 * SIZE], b1 + LDF [BO + 58 * SIZE], b2 + LDF [BO + 57 * SIZE], b3 + LDF [BO + 56 * SIZE], b4 + + FMUL a1, c15, c15 + + FNMSUB (aa2, cc15, cc13, cc13) + FNMSUB (aa3, cc15, cc11, cc11) + FNMSUB (aa4, cc15, cc09, cc09) + FNMSUB (bb1, cc15, cc07, cc07) + FNMSUB (bb2, cc15, cc05, cc05) + FNMSUB (bb3, cc15, cc03, cc03) + FNMSUB (bb4, cc15, cc01, cc01) + + LDF [BO + 54 * SIZE], a1 + LDF [BO + 53 * SIZE], a2 + LDF [BO + 52 * SIZE], a3 + LDF [BO + 51 * SIZE], a4 + LDF [BO + 50 * SIZE], b1 + LDF [BO + 49 * SIZE], b2 + LDF [BO + 48 * SIZE], b3 + + FMUL a1, c13, c13 + + FNMSUB (aa2, cc13, cc11, cc11) + FNMSUB (aa3, cc13, cc09, cc09) + FNMSUB (aa4, cc13, cc07, cc07) + FNMSUB (bb1, cc13, cc05, cc05) + FNMSUB (bb2, cc13, cc03, cc03) + FNMSUB (bb3, cc13, cc01, cc01) + + LDF [BO + 45 * SIZE], a1 + LDF [BO + 44 * SIZE], a2 + LDF [BO + 43 * SIZE], a3 + LDF [BO + 42 * SIZE], a4 + LDF [BO + 41 * SIZE], b1 + LDF [BO + 40 * SIZE], b2 + + FMUL a1, c11, c11 + + FNMSUB (aa2, cc11, cc09, cc09) + FNMSUB (aa3, cc11, cc07, cc07) + FNMSUB (aa4, cc11, cc05, cc05) + FNMSUB (bb1, cc11, cc03, cc03) + FNMSUB (bb2, cc11, cc01, cc01) + + LDF [BO + 36 * SIZE], a1 + LDF [BO + 35 * SIZE], a2 + LDF [BO + 34 * SIZE], a3 + LDF [BO + 33 * SIZE], a4 + LDF [BO + 32 * SIZE], b1 + + FMUL a1, c09, c09 + + FNMSUB (aa2, cc09, cc07, cc07) + FNMSUB (aa3, cc09, cc05, cc05) + FNMSUB (aa4, cc09, cc03, cc03) + FNMSUB (bb1, cc09, cc01, cc01) + + LDF [BO + 27 * SIZE], a1 + LDF [BO + 26 * SIZE], a2 + LDF [BO + 25 * SIZE], a3 + LDF [BO + 24 * SIZE], a4 + + FMUL a1, c07, c07 + + FNMSUB (aa2, cc07, cc05, cc05) + FNMSUB (aa3, cc07, cc03, cc03) + FNMSUB (aa4, cc07, cc01, cc01) + + LDF [BO + 18 * SIZE], a1 + LDF [BO + 17 * SIZE], a2 + LDF [BO + 16 * SIZE], a3 + + FMUL a1, c05, c05 + + FNMSUB (aa2, cc05, cc03, cc03) + FNMSUB (aa3, cc05, cc01, cc01) + + LDF [BO + 9 * SIZE], a1 + LDF [BO + 8 * SIZE], a2 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 + add C3, -1 * SIZE, C3 + add C4, -1 * SIZE, C4 + add C5, -1 * SIZE, C5 + add C6, -1 * SIZE, C6 + add C7, -1 * SIZE, C7 + add C8, -1 * SIZE, C8 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] + + STF c09, [BO + 4 * SIZE] + STF c11, [BO + 5 * SIZE] + STF c13, [BO + 6 * SIZE] + STF c15, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c03, [AO + 1 * SIZE] + STF c05, [AO + 2 * SIZE] + STF c07, [AO + 3 * SIZE] + + STF c09, [AO + 4 * SIZE] + STF c11, [AO + 5 * SIZE] + STF c13, [AO + 6 * SIZE] + STF c15, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c05, [C3 + 0 * SIZE] + STF c07, [C4 + 0 * SIZE] + + STF c09, [C5 + 0 * SIZE] + STF c11, [C6 + 0 * SIZE] + STF c13, [C7 + 0 * SIZE] + STF c15, [C8 + 0 * SIZE] + +#ifdef RT + sll K, BASE_SHIFT + 0, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 3, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + .align 4 + +.LL29: +#ifdef LN + sll K, BASE_SHIFT + 3, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 8, KK +#endif + +#ifdef RT + sub KK, 8, KK +#endif + + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + .align 4 + +.LL999: +#ifdef TRMMKERNEL +#ifndef __64BIT__ + ld [%sp + STACK_START + 8], %g1 + ld [%sp + STACK_START + 12], %g2 + ld [%sp + STACK_START + 16], %g3 + ld [%sp + STACK_START + 20], %g4 +#else + ldx [%sp + STACK_START + 32], %g1 + ldx [%sp + STACK_START + 40], %g2 + ldx [%sp + STACK_START + 48], %g3 + ldx [%sp + STACK_START + 56], %g4 +#endif +#endif + + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/zamax.S b/kernel/sparc/zamax.S new file mode 100644 index 0000000..b156c5a --- /dev/null +++ b/kernel/sparc/zamax.S @@ -0,0 +1,374 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define c3 %f4 +#define c4 %f6 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 +#define t5 %f16 +#define t6 %f18 +#define t7 %f20 +#define t8 %f22 + +#define a1 %f24 +#define a2 %f26 +#define a3 %f28 +#define a4 %f30 +#define a5 %f32 +#define a6 %f34 +#define a7 %f36 +#define a8 %f38 +#else +#define c1 %f0 +#define c2 %f1 +#define c3 %f2 +#define c4 %f3 +#define t1 %f4 +#define t2 %f5 +#define t3 %f6 +#define t4 %f7 +#define t5 %f8 +#define t6 %f9 +#define t7 %f10 +#define t8 %f11 + +#define a1 %f12 +#define a2 %f13 +#define a3 %f14 +#define a4 %f15 +#define a5 %f16 +#define a6 %f17 +#define a7 %f18 +#define a8 %f19 +#endif + +#ifndef USE_MIN +#define FCMOV FMOVG +#else +#define FCMOV FMOVL +#endif + + PROLOGUE + SAVESP + + FCLR(0) + + cmp N, 0 + ble .LL20 + nop + + cmp INCX, 0 + ble .LL20 + sll INCX, ZBASE_SHIFT, INCX + + LDF [X + 0 * SIZE], c1 + LDF [X + 1 * SIZE], c2 + add N, -1, N + FABS c1, c1 + add X, INCX, X + FABS c2, c2 + cmp N, 0 + ble .LL20 + FADD c1, c2, c1 + + FMOV c1, c2 + FMOV c1, c3 + FMOV c1, c4 + + cmp INCX, 2 * SIZE + bne .LL50 + nop + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + + LDF [X + 4 * SIZE], a5 + add I, -1, I + LDF [X + 5 * SIZE], a6 + cmp I, 0 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + ble,pt %icc, .LL12 + add X, 8 * SIZE, X + +#define PREFETCHSIZE 40 + +.LL11: + prefetch [X + PREFETCHSIZE * SIZE], 0 + + FABS a1, t1 + LDF [X + 0 * SIZE], a1 + FABS a2, t2 + LDF [X + 1 * SIZE], a2 + FABS a3, t3 + LDF [X + 2 * SIZE], a3 + FABS a4, t4 + LDF [X + 3 * SIZE], a4 + + FABS a5, t5 + LDF [X + 4 * SIZE], a5 + FABS a6, t6 + LDF [X + 5 * SIZE], a6 + FABS a7, t7 + LDF [X + 6 * SIZE], a7 + FABS a8, t8 + LDF [X + 7 * SIZE], a8 + + FADD t1, t2, t1 + FADD t3, t4, t3 + FADD t5, t6, t5 + FADD t7, t8, t7 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t3, c2 + FCMP %fcc2, t5, c3 + FCMP %fcc3, t7, c4 + + FCMOV %fcc0, t1, c1 + add I, -1, I + FCMOV %fcc1, t3, c2 + cmp I, 0 + FCMOV %fcc2, t5, c3 + FCMOV %fcc3, t7, c4 + + bg,pt %icc, .LL11 + add X, 8 * SIZE, X + +.LL12: + FABS a1, t1 + FABS a2, t2 + FABS a3, t3 + FABS a4, t4 + + FABS a5, t5 + FABS a6, t6 + FABS a7, t7 + FABS a8, t8 + + FADD t1, t2, t1 + FADD t3, t4, t3 + FADD t5, t6, t5 + FADD t7, t8, t7 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t3, c2 + FCMP %fcc2, t5, c3 + FCMP %fcc3, t7, c4 + + FCMOV %fcc0, t1, c1 + FCMOV %fcc1, t3, c2 + FCMOV %fcc2, t5, c3 + FCMOV %fcc3, t7, c4 + +.LL15: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + + FABS a1, t1 + FABS a2, t2 + FADD t1, t2, t1 + FCMP %fcc0, t1, c1 + FCMOV %fcc0, t1, c1 + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL16 + add X, 2 * SIZE, X + +.LL19: + FCMP %fcc0, c2, c1 + FCMP %fcc1, c4, c3 + FCMOV %fcc0, c2, c1 + FCMOV %fcc1, c4, c3 + FCMP %fcc0, c3, c1 + FCMOV %fcc0, c3, c1 + +.LL20: + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + add I, -1, I + LDF [X + 0 * SIZE], a7 + cmp I, 0 + LDF [X + 1 * SIZE], a8 + ble,pt %icc, .LL52 + add X, INCX, X + +.LL51: + FABS a1, t1 + LDF [X + 0 * SIZE], a1 + FABS a2, t2 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + FABS a3, t3 + LDF [X + 0 * SIZE], a3 + FABS a4, t4 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + + FABS a5, t5 + LDF [X + 0 * SIZE], a5 + FABS a6, t6 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + FABS a7, t7 + LDF [X + 0 * SIZE], a7 + FABS a8, t8 + LDF [X + 1 * SIZE], a8 + + FADD t1, t2, t1 + FADD t3, t4, t3 + FADD t5, t6, t5 + FADD t7, t8, t7 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t3, c2 + FCMP %fcc2, t5, c3 + FCMP %fcc3, t7, c4 + + FCMOV %fcc0, t1, c1 + add I, -1, I + FCMOV %fcc1, t3, c2 + cmp I, 0 + FCMOV %fcc2, t5, c3 + FCMOV %fcc3, t7, c4 + + + bg,pt %icc, .LL51 + add X, INCX, X + +.LL52: + FABS a1, t1 + FABS a2, t2 + FABS a3, t3 + FABS a4, t4 + + FABS a5, t5 + FABS a6, t6 + FABS a7, t7 + FABS a8, t8 + + FADD t1, t2, t1 + FADD t3, t4, t3 + FADD t5, t6, t5 + FADD t7, t8, t7 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t3, c2 + FCMP %fcc2, t5, c3 + FCMP %fcc3, t7, c4 + + FCMOV %fcc0, t1, c1 + FCMOV %fcc1, t3, c2 + FCMOV %fcc2, t5, c3 + FCMOV %fcc3, t7, c4 + +.LL55: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + + FABS a1, t1 + add I, -1, I + FABS a2, t2 + cmp I, 0 + FADD t1, t2, t1 + FCMP %fcc0, t1, c1 + FCMOV %fcc0, t1, c1 + + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + FCMP %fcc0, c2, c1 + FCMP %fcc1, c4, c3 + FCMOV %fcc0, c2, c1 + FCMOV %fcc1, c4, c3 + FCMP %fcc0, c3, c1 + FCMOV %fcc0, c3, c1 + + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/zasum.S b/kernel/sparc/zasum.S new file mode 100644 index 0000000..53bd3c0 --- /dev/null +++ b/kernel/sparc/zasum.S @@ -0,0 +1,327 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 +#else +#define c1 %f0 +#define c2 %f1 +#define t1 %f4 +#define t2 %f5 +#define t3 %f6 +#define t4 %f7 + +#define a1 %f8 +#define a2 %f9 +#define a3 %f10 +#define a4 %f11 +#define a5 %f12 +#define a6 %f13 +#define a7 %f14 +#define a8 %f15 +#endif + + PROLOGUE + SAVESP + + FCLR(0) + + sll INCX, ZBASE_SHIFT, INCX + + FMOV c1, c2 + FMOV c1, t1 + FMOV c1, t2 + FMOV c1, t3 + FMOV c1, t4 + + cmp INCX, 0 + ble .LL19 + nop + + cmp INCX, 2 * SIZE + bne .LL50 + nop + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + add I, -1, I + LDF [X + 1 * SIZE], a2 + cmp I, 0 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + LDF [X + 4 * SIZE], a5 + LDF [X + 5 * SIZE], a6 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + ble,pt %icc, .LL12 + add X, 8 * SIZE, X + +#define PREFETCHSIZE 32 + +.LL11: + FADD c1, t1, c1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + FABS a1, t1 + LDF [X + 0 * SIZE], a1 + + FADD c2, t2, c2 + add I, -1, I + FABS a2, t2 + LDF [X + 1 * SIZE], a2 + + FADD c1, t3, c1 + cmp I, 0 + FABS a3, t3 + LDF [X + 2 * SIZE], a3 + + FADD c2, t4, c2 + nop + FABS a4, t4 + LDF [X + 3 * SIZE], a4 + + FADD c1, t1, c1 + nop + FABS a5, t1 + LDF [X + 4 * SIZE], a5 + + FADD c2, t2, c2 + nop + FABS a6, t2 + LDF [X + 5 * SIZE], a6 + + FADD c1, t3, c1 + FABS a7, t3 + LDF [X + 6 * SIZE], a7 + add X, 8 * SIZE, X + + FADD c2, t4, c2 + FABS a8, t4 + bg,pt %icc, .LL11 + LDF [X - 1 * SIZE], a8 + +.LL12: + FADD c1, t1, c1 + FABS a1, t1 + FADD c2, t2, c2 + FABS a2, t2 + + FADD c1, t3, c1 + FABS a3, t3 + FADD c2, t4, c2 + FABS a4, t4 + + FADD c1, t1, c1 + FABS a5, t1 + FADD c2, t2, c2 + FABS a6, t2 + + FADD c1, t3, c1 + FABS a7, t3 + FADD c2, t4, c2 + FABS a8, t4 + +.LL15: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add I, -1, I + cmp I, 0 + FADD c1, t1, c1 + FADD c2, t2, c2 + FABS a1, t1 + FABS a2, t2 + bg,pt %icc, .LL16 + add X, 2 * SIZE, X + +.LL19: + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c1, t3, c1 + FADD c2, t4, c2 + + FADD c1, c2, c1 + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + add I, -1, I + LDF [X + 0 * SIZE], a7 + cmp I, 0 + LDF [X + 1 * SIZE], a8 + + ble,pt %icc, .LL52 + add X, INCX, X + +.LL51: + FADD c1, t1, c1 + add I, -1, I + FABS a1, t1 + LDF [X + 0 * SIZE], a1 + + FADD c2, t2, c2 + cmp I, 0 + FABS a2, t2 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + + FADD c1, t3, c1 + FABS a3, t3 + LDF [X + 0 * SIZE], a3 + + FADD c2, t4, c2 + FABS a4, t4 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + + FADD c1, t1, c1 + FABS a5, t1 + LDF [X + 0 * SIZE], a5 + + FADD c2, t2, c2 + FABS a6, t2 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + + FADD c1, t3, c1 + FABS a7, t3 + LDF [X + 0 * SIZE], a7 + + FADD c2, t4, c2 + FABS a8, t4 + LDF [X + 1 * SIZE], a8 + + bg,pt %icc, .LL51 + add X, INCX, X + +.LL52: + FADD c1, t1, c1 + FABS a1, t1 + FADD c2, t2, c2 + FABS a2, t2 + + FADD c1, t3, c1 + FABS a3, t3 + FADD c2, t4, c2 + FABS a4, t4 + + FADD c1, t1, c1 + FABS a5, t1 + FADD c2, t2, c2 + FABS a6, t2 + + FADD c1, t3, c1 + FABS a7, t3 + FADD c2, t4, c2 + FABS a8, t4 + +.LL55: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + FADD c1, t1, c1 + FADD c2, t2, c2 + add I, -1, I + FABS a1, t1 + FABS a2, t2 + cmp I, 0 + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c1, t3, c1 + FADD c2, t4, c2 + + FADD c1, c2, c1 + + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/zaxpy.S b/kernel/sparc/zaxpy.S new file mode 100644 index 0000000..5e2be75 --- /dev/null +++ b/kernel/sparc/zaxpy.S @@ -0,0 +1,594 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(DOUBLE) && !defined(__64BIT__) +#define N %i0 +#define X %i1 +#define INCX %i2 +#define Y %i3 +#define INCY %i4 +#define I %i5 +#else +#define N %i0 +#define X %i5 +#define INCX %i1 +#define Y %i2 +#define INCY %i3 +#define I %i4 +#endif + +#define YY %l1 + +#ifdef DOUBLE +#define a1 %f0 +#define a2 %f2 +#define a3 %f4 +#define a4 %f6 +#define a5 %f8 +#define a6 %f10 +#define a7 %f12 +#define a8 %f14 +#define b1 %f16 +#define b2 %f18 +#define b3 %f20 +#define b4 %f22 +#define b5 %f24 +#define b6 %f26 +#define b7 %f28 +#define b8 %f30 + +#define t1 %f32 +#define t2 %f34 +#define t3 %f36 +#define t4 %f38 +#define c1 %f40 +#define c2 %f42 +#define c3 %f44 +#define c4 %f46 + +#define c5 %f48 +#define c6 %f50 +#define c7 %f52 +#define c8 %f54 + +#define ALPHA_R %f60 +#define ALPHA_I %f62 +#else +#define a1 %f0 +#define a2 %f1 +#define a3 %f2 +#define a4 %f3 +#define a5 %f4 +#define a6 %f5 +#define a7 %f6 +#define a8 %f7 +#define b1 %f8 +#define b2 %f9 +#define b3 %f10 +#define b4 %f11 +#define b5 %f12 +#define b6 %f13 +#define b7 %f14 +#define b8 %f15 + +#define t1 %f16 +#define t2 %f17 +#define t3 %f18 +#define t4 %f19 +#define c1 %f20 +#define c2 %f21 +#define c3 %f22 +#define c4 %f23 + +#define c5 %f24 +#define c6 %f25 +#define c7 %f26 +#define c8 %f27 + +#define ALPHA_R %f30 +#define ALPHA_I %f31 +#endif + +#ifndef CONJ +#define ADD1 FSUB +#define ADD2 FADD +#else +#define ADD1 FADD +#define ADD2 FSUB +#endif + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE + st %i3, [%sp + STACK_START + 16] + st %i4, [%sp + STACK_START + 20] + st %i5, [%sp + STACK_START + 24] + + ld [%sp+ STACK_START + 32], X + ld [%sp+ STACK_START + 36], INCX + ld [%sp+ STACK_START + 40], Y + ld [%sp+ STACK_START + 44], INCY + + ldd [%sp + STACK_START + 16], ALPHA_R + ldd [%sp + STACK_START + 24], ALPHA_I +#else + st %i3, [%sp + STACK_START + 16] + st %i4, [%sp + STACK_START + 20] + + ld [%sp+ STACK_START + 28], INCX + ld [%sp+ STACK_START + 32], Y + ld [%sp+ STACK_START + 36], INCY + + ld [%sp + STACK_START + 16], ALPHA_R + ld [%sp + STACK_START + 20], ALPHA_I +#endif +#else + ldx [%sp + STACK_START + 56], INCX + ldx [%sp + STACK_START + 64], Y + ldx [%sp + STACK_START + 72], INCY +#ifdef DOUBLE + FMOV %f6, ALPHA_R + FMOV %f8, ALPHA_I +#else + FMOV %f7, ALPHA_R + FMOV %f9, ALPHA_I +#endif +#endif + sll INCX, ZBASE_SHIFT, INCX + sll INCY, ZBASE_SHIFT, INCY + + cmp INCX, 2 * SIZE + bne .LL50 + nop + cmp INCY, 2 * SIZE + bne .LL50 + nop + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + LDF [Y + 0 * SIZE], b1 + LDF [Y + 1 * SIZE], b2 + + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + LDF [Y + 2 * SIZE], b3 + LDF [Y + 3 * SIZE], b4 + + LDF [X + 4 * SIZE], a5 + LDF [X + 5 * SIZE], a6 + LDF [Y + 4 * SIZE], b5 + LDF [Y + 5 * SIZE], b6 + + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + LDF [Y + 6 * SIZE], b7 + LDF [Y + 7 * SIZE], b8 + + FMUL ALPHA_R, a1, t1 + FMUL ALPHA_R, a2, t2 + FMUL ALPHA_R, a3, t3 + FMUL ALPHA_R, a4, t4 + + FADD b1, t1, c1 + FMUL ALPHA_I, a2, t1 + ADD2 b2, t2, c2 + FMUL ALPHA_I, a1, t2 + + deccc I + ble,pt %icc, .LL12 + nop + +#ifdef DOUBLE +#define PREFETCHSIZE 54 +#else +#define PREFETCHSIZE 108 +#endif + +.LL11: + FADD b3, t3, c3 + prefetch [Y + PREFETCHSIZE * SIZE], 0 + FMUL ALPHA_I, a4, t3 + prefetch [X + PREFETCHSIZE * SIZE], 0 + + ADD2 b4, t4, c4 + LDF [Y + 8 * SIZE], b1 + FMUL ALPHA_I, a3, t4 + LDF [X + 9 * SIZE], a2 + + ADD1 c1, t1, c1 + LDF [Y + 9 * SIZE], b2 + FMUL ALPHA_R, a5, t1 + LDF [X + 8 * SIZE], a1 + + FADD c2, t2, c2 + LDF [Y + 10 * SIZE], b3 + FMUL ALPHA_R, a6, t2 + LDF [X + 11 * SIZE], a4 + + ADD1 c3, t3, c3 + STF c1, [Y + 0 * SIZE] + FMUL ALPHA_R, a7, t3 + LDF [Y + 11 * SIZE], b4 + + FADD c4, t4, c4 + STF c2, [Y + 1 * SIZE] + FMUL ALPHA_R, a8, t4 + LDF [X + 10 * SIZE], a3 + + FADD b5, t1, c5 + STF c3, [Y + 2 * SIZE] + FMUL ALPHA_I, a6, t1 + + ADD2 b6, t2, c6 + STF c4, [Y + 3 * SIZE] + FMUL ALPHA_I, a5, t2 + + FADD b7, t3, c7 + LDF [Y + 12 * SIZE], b5 + FMUL ALPHA_I, a8, t3 + LDF [X + 13 * SIZE], a6 + + ADD2 b8, t4, c8 + LDF [Y + 13 * SIZE], b6 + FMUL ALPHA_I, a7, t4 + LDF [X + 12 * SIZE], a5 + + ADD1 c5, t1, c5 + LDF [Y + 14 * SIZE], b7 + FMUL ALPHA_R, a1, t1 + LDF [X + 15 * SIZE], a8 + + FADD c6, t2, c6 + LDF [Y + 15 * SIZE], b8 + FMUL ALPHA_R, a2, t2 + LDF [X + 14 * SIZE], a7 + + ADD1 c7, t3, c7 + STF c5, [Y + 4 * SIZE] + FMUL ALPHA_R, a3, t3 + add X, 8 * SIZE, X + + FADD c8, t4, c8 + STF c6, [Y + 5 * SIZE] + FMUL ALPHA_R, a4, t4 + deccc I + + FADD b1, t1, c1 + STF c7, [Y + 6 * SIZE] + FMUL ALPHA_I, a2, t1 + + ADD2 b2, t2, c2 + STF c8, [Y + 7 * SIZE] + FMUL ALPHA_I, a1, t2 + + bg,pt %icc, .LL11 + add Y, 8 * SIZE, Y + + +.LL12: + FADD b3, t3, c3 + FMUL ALPHA_I, a4, t3 + ADD2 b4, t4, c4 + FMUL ALPHA_I, a3, t4 + + ADD1 c1, t1, c1 + FMUL ALPHA_R, a5, t1 + FADD c2, t2, c2 + FMUL ALPHA_R, a6, t2 + + ADD1 c3, t3, c3 + FMUL ALPHA_R, a7, t3 + FADD c4, t4, c4 + FMUL ALPHA_R, a8, t4 + + FADD b5, t1, c5 + FMUL ALPHA_I, a6, t1 + ADD2 b6, t2, c6 + FMUL ALPHA_I, a5, t2 + + FADD b7, t3, c7 + FMUL ALPHA_I, a8, t3 + ADD2 b8, t4, c8 + FMUL ALPHA_I, a7, t4 + + ADD1 c5, t1, c5 + FADD c6, t2, c6 + ADD1 c7, t3, c7 + FADD c8, t4, c8 + + STF c1, [Y + 0 * SIZE] + STF c2, [Y + 1 * SIZE] + STF c3, [Y + 2 * SIZE] + STF c4, [Y + 3 * SIZE] + + STF c5, [Y + 4 * SIZE] + STF c6, [Y + 5 * SIZE] + STF c7, [Y + 6 * SIZE] + STF c8, [Y + 7 * SIZE] + + add X, 8 * SIZE, X + add Y, 8 * SIZE, Y + + +.LL15: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + LDF [Y + 0 * SIZE], b1 + LDF [Y + 1 * SIZE], b2 + + FMUL ALPHA_R, a1, t1 + FMUL ALPHA_R, a2, t2 + FMUL ALPHA_I, a2, t3 + FMUL ALPHA_I, a1, t4 + + FADD b1, t1, b1 + add I, -1, I + ADD2 b2, t2, b2 + cmp I, 0 + ADD1 b1, t3, c1 + FADD b2, t4, c2 + + STF c1, [Y + 0 * SIZE] + STF c2, [Y + 1 * SIZE] + + add Y, 2 * SIZE, Y + bg,pt %icc, .LL16 + add X, 2 * SIZE, X + +.LL19: + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL55 + mov Y, YY + + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + LDF [Y + 1 * SIZE], b2 + add Y, INCY, Y + LDF [X + 0 * SIZE], a3 + LDF [Y + 0 * SIZE], b3 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + LDF [Y + 1 * SIZE], b4 + add Y, INCY, Y + LDF [X + 0 * SIZE], a5 + add I, -1, I + LDF [Y + 0 * SIZE], b5 + LDF [X + 1 * SIZE], a6 + cmp I, 0 + add X, INCX, X + LDF [Y + 1 * SIZE], b6 + add Y, INCY, Y + LDF [X + 0 * SIZE], a7 + FMUL ALPHA_R, a1, t1 + LDF [Y + 0 * SIZE], b7 + FMUL ALPHA_R, a2, t2 + LDF [X + 1 * SIZE], a8 + FMUL ALPHA_R, a3, t3 + add X, INCX, X + LDF [Y + 1 * SIZE], b8 + FMUL ALPHA_R, a4, t4 + + ble,pt %icc, .LL52 + add Y, INCY, Y + + +.LL51: + FADD b1, t1, c1 + LDF [Y + 0 * SIZE], b1 + FMUL ALPHA_I, a2, t1 + LDF [X + 1 * SIZE], a2 + ADD2 b2, t2, c2 + LDF [Y + 1 * SIZE], b2 + add Y, INCY, Y + FMUL ALPHA_I, a1, t2 + LDF [X + 0 * SIZE], a1 + add X, INCX, X + + FADD b3, t3, c3 + LDF [Y + 0 * SIZE], b3 + FMUL ALPHA_I, a4, t3 + LDF [X + 1 * SIZE], a4 + ADD2 b4, t4, c4 + LDF [Y + 1 * SIZE], b4 + add Y, INCY, Y + FMUL ALPHA_I, a3, t4 + LDF [X + 0 * SIZE], a3 + add X, INCX, X + + ADD1 c1, t1, c1 + FMUL ALPHA_R, a5, t1 + FADD c2, t2, c2 + FMUL ALPHA_R, a6, t2 + ADD1 c3, t3, c3 + FMUL ALPHA_R, a7, t3 + FADD c4, t4, c4 + FMUL ALPHA_R, a8, t4 + + STF c1, [YY + 0 * SIZE] + FADD b5, t1, c1 + FMUL ALPHA_I, a6, t1 + STF c2, [YY + 1 * SIZE] + ADD2 b6, t2, c2 + FMUL ALPHA_I, a5, t2 + add YY, INCY, YY + STF c3, [YY + 0 * SIZE] + FADD b7, t3, c3 + FMUL ALPHA_I, a8, t3 + STF c4, [YY + 1 * SIZE] + ADD2 b8, t4, c4 + FMUL ALPHA_I, a7, t4 + add YY, INCY, YY + + LDF [X + 0 * SIZE], a5 + ADD1 c1, t1, c1 + LDF [Y + 0 * SIZE], b5 + FMUL ALPHA_R, a1, t1 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + FADD c2, t2, c2 + LDF [Y + 1 * SIZE], b6 + add Y, INCY, Y + FMUL ALPHA_R, a2, t2 + LDF [X + 0 * SIZE], a7 + ADD1 c3, t3, c3 + LDF [Y + 0 * SIZE], b7 + FMUL ALPHA_R, a3, t3 + LDF [X + 1 * SIZE], a8 + add X, INCX, X + FADD c4, t4, c4 + LDF [Y + 1 * SIZE], b8 + add Y, INCY, Y + FMUL ALPHA_R, a4, t4 + + STF c1, [YY + 0 * SIZE] + add I, -1, I + STF c2, [YY + 1 * SIZE] + add YY, INCY, YY + STF c3, [YY + 0 * SIZE] + cmp I, 0 + STF c4, [YY + 1 * SIZE] + + bg,pt %icc, .LL51 + add YY, INCY, YY + +.LL52: + FADD b1, t1, c1 + FMUL ALPHA_I, a2, t1 + ADD2 b2, t2, c2 + FMUL ALPHA_I, a1, t2 + + FADD b3, t3, c3 + FMUL ALPHA_I, a4, t3 + ADD2 b4, t4, c4 + FMUL ALPHA_I, a3, t4 + + ADD1 c1, t1, c1 + FMUL ALPHA_R, a5, t1 + FADD c2, t2, c2 + FMUL ALPHA_R, a6, t2 + ADD1 c3, t3, c3 + FMUL ALPHA_R, a7, t3 + FADD c4, t4, c4 + FMUL ALPHA_R, a8, t4 + + STF c1, [YY + 0 * SIZE] + STF c2, [YY + 1 * SIZE] + add YY, INCY, YY + STF c3, [YY + 0 * SIZE] + STF c4, [YY + 1 * SIZE] + add YY, INCY, YY + + FADD b5, t1, c1 + FMUL ALPHA_I, a6, t1 + ADD2 b6, t2, c2 + FMUL ALPHA_I, a5, t2 + FADD b7, t3, c3 + FMUL ALPHA_I, a8, t3 + ADD2 b8, t4, c4 + FMUL ALPHA_I, a7, t4 + + ADD1 c1, t1, c1 + FADD c2, t2, c2 + ADD1 c3, t3, c3 + FADD c4, t4, c4 + + STF c1, [YY + 0 * SIZE] + STF c2, [YY + 1 * SIZE] + add YY, INCY, YY + STF c3, [YY + 0 * SIZE] + STF c4, [YY + 1 * SIZE] + add YY, INCY, YY + +.LL55: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + LDF [Y + 0 * SIZE], b1 + LDF [Y + 1 * SIZE], b2 + + FMUL ALPHA_R, a1, t1 + FMUL ALPHA_R, a2, t2 + FMUL ALPHA_I, a2, t3 + FMUL ALPHA_I, a1, t4 + FADD b1, t1, b1 + ADD2 b2, t2, b2 + ADD1 b1, t3, c1 + FADD b2, t4, c2 + + add I, -1, I + cmp I, 0 + STF c1, [Y + 0 * SIZE] + STF c2, [Y + 1 * SIZE] + + add Y, INCY, Y + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/zcopy.S b/kernel/sparc/zcopy.S new file mode 100644 index 0000000..039ed54 --- /dev/null +++ b/kernel/sparc/zcopy.S @@ -0,0 +1,196 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define Y %i3 +#define INCY %i4 +#define I %i5 + +#ifdef DOUBLE +#define a1 %f0 +#define a2 %f2 +#define a3 %f4 +#define a4 %f6 +#define a5 %f8 +#define a6 %f10 +#define a7 %f12 +#define a8 %f14 +#else +#define a1 %f0 +#define a2 %f1 +#define a3 %f2 +#define a4 %f3 +#define a5 %f4 +#define a6 %f5 +#define a7 %f6 +#define a8 %f7 +#endif + + PROLOGUE + SAVESP + + sll INCX, ZBASE_SHIFT, INCX + sll INCY, ZBASE_SHIFT, INCY + + cmp INCX, 2 * SIZE + bne .LL50 + nop + cmp INCY, 2 * SIZE + bne .LL50 + nop + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + +#define PREFETCHSIZE 32 + +.LL11: + prefetch [X + PREFETCHSIZE * SIZE], 0 + prefetch [Y + PREFETCHSIZE * SIZE], 0 + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + LDF [X + 4 * SIZE], a5 + LDF [X + 5 * SIZE], a6 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + STF a1, [Y + 0 * SIZE] + add I, -1, I + STF a2, [Y + 1 * SIZE] + cmp I, 0 + STF a3, [Y + 2 * SIZE] + add X, 8 * SIZE, X + STF a4, [Y + 3 * SIZE] + STF a5, [Y + 4 * SIZE] + STF a6, [Y + 5 * SIZE] + STF a7, [Y + 6 * SIZE] + STF a8, [Y + 7 * SIZE] + + bg,pt %icc, .LL11 + add Y, 8 * SIZE, Y + +.LL15: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add I, -1, I + cmp I, 0 + STF a1, [Y + 0 * SIZE] + add X, 2 * SIZE, X + STF a2, [Y + 1 * SIZE] + bg,pt %icc, .LL16 + add Y, 2 * SIZE, Y + +.LL19: + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + +.LL51: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + LDF [X + 0 * SIZE], a7 + LDF [X + 1 * SIZE], a8 + add X, INCX, X + + STF a1, [Y + 0 * SIZE] + add I, -1, I + STF a2, [Y + 1 * SIZE] + add Y, INCY, Y + cmp I, 0 + STF a3, [Y + 0 * SIZE] + STF a4, [Y + 1 * SIZE] + add Y, INCY, Y + STF a5, [Y + 0 * SIZE] + STF a6, [Y + 1 * SIZE] + add Y, INCY, Y + STF a7, [Y + 0 * SIZE] + STF a8, [Y + 1 * SIZE] + + bg,pt %icc, .LL51 + add Y, INCY, Y + +.LL55: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add I, -1, I + cmp I, 0 + add X, INCX, X + STF a1, [Y + 0 * SIZE] + STF a2, [Y + 1 * SIZE] + bg,pt %icc, .LL56 + add Y, INCY, Y + +.LL59: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/zdot.S b/kernel/sparc/zdot.S new file mode 100644 index 0000000..3072f0f --- /dev/null +++ b/kernel/sparc/zdot.S @@ -0,0 +1,545 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) +#define OUT %i0 +#define N %i1 +#define X %i2 +#define INCX %i3 +#define Y %i4 +#define INCY %i5 +#else +#define N %i0 +#define X %i1 +#define INCX %i2 +#define Y %i3 +#define INCY %i4 +#endif + +#define I %l0 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define c3 %f4 +#define c4 %f6 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 + +#define b1 %f32 +#define b2 %f34 +#define b3 %f36 +#define b4 %f38 +#define b5 %f40 +#define b6 %f42 +#define b7 %f44 +#define b8 %f46 +#else +#define c1 %f0 +#define c2 %f1 +#define c3 %f2 +#define c4 %f3 +#define t1 %f4 +#define t2 %f5 +#define t3 %f6 +#define t4 %f7 + +#define a1 %f8 +#define a2 %f9 +#define a3 %f10 +#define a4 %f11 +#define a5 %f12 +#define a6 %f13 +#define a7 %f14 +#define a8 %f15 + +#define b1 %f16 +#define b2 %f17 +#define b3 %f18 +#define b4 %f19 +#define b5 %f20 +#define b6 %f21 +#define b7 %f22 +#define b8 %f23 +#endif + + PROLOGUE + SAVESP + +#ifdef DOUBLE + FCLR(0) + FCLR(2) + FCLR(4) + FCLR(6) +#else + FCLR(0) + FCLR(1) + FCLR(2) + FCLR(3) +#endif + + FMOV c1, c4 + FMOV c1, t1 + sll INCX, ZBASE_SHIFT, INCX + FMOV c1, t2 + sll INCY, ZBASE_SHIFT, INCY + FMOV c1, t3 + FMOV c1, t4 + + cmp INCX, 2 * SIZE + bne .LL50 + nop + cmp INCY, 2 * SIZE + bne .LL50 + nop + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + add I, -1, I + LDF [Y + 0 * SIZE], b1 + cmp I, 0 + LDF [X + 1 * SIZE], a2 + LDF [Y + 1 * SIZE], b2 + LDF [X + 2 * SIZE], a3 + LDF [Y + 2 * SIZE], b3 + LDF [X + 3 * SIZE], a4 + LDF [Y + 3 * SIZE], b4 + LDF [X + 4 * SIZE], a5 + LDF [Y + 4 * SIZE], b5 + LDF [X + 5 * SIZE], a6 + LDF [Y + 5 * SIZE], b6 + LDF [X + 6 * SIZE], a7 + LDF [Y + 6 * SIZE], b7 + LDF [X + 7 * SIZE], a8 + add X, 8 * SIZE, X + LDF [Y + 7 * SIZE], b8 + ble,pt %icc, .LL12 + add Y, 8 * SIZE, Y + +#define PREFETCHSIZE 40 + +.LL11: + prefetch [X + PREFETCHSIZE * SIZE], 0 + FADD c1, t1, c1 + prefetch [Y + PREFETCHSIZE * SIZE], 0 + FMUL a1, b1, t1 + + FADD c2, t2, c2 + FMUL a2, b1, t2 + LDF [Y + 0 * SIZE], b1 + + FADD c3, t3, c3 + FMUL a1, b2, t3 + LDF [X + 0 * SIZE], a1 + + FADD c4, t4, c4 + FMUL a2, b2, t4 + LDF [X + 1 * SIZE], a2 + + FADD c1, t1, c1 + FMUL a3, b3, t1 + LDF [Y + 1 * SIZE], b2 + FADD c2, t2, c2 + FMUL a4, b3, t2 + LDF [Y + 2 * SIZE], b3 + + FADD c3, t3, c3 + FMUL a3, b4, t3 + LDF [X + 2 * SIZE], a3 + FADD c4, t4, c4 + FMUL a4, b4, t4 + LDF [X + 3 * SIZE], a4 + + FADD c1, t1, c1 + FMUL a5, b5, t1 + LDF [Y + 3 * SIZE], b4 + FADD c2, t2, c2 + FMUL a6, b5, t2 + LDF [Y + 4 * SIZE], b5 + + FADD c3, t3, c3 + FMUL a5, b6, t3 + LDF [X + 4 * SIZE], a5 + FADD c4, t4, c4 + FMUL a6, b6, t4 + LDF [X + 5 * SIZE], a6 + + FADD c1, t1, c1 + add I, -1, I + FMUL a7, b7, t1 + LDF [Y + 5 * SIZE], b6 + FADD c2, t2, c2 + cmp I, 0 + FMUL a8, b7, t2 + LDF [Y + 6 * SIZE], b7 + + FADD c3, t3, c3 + add Y, 8 * SIZE, Y + FMUL a7, b8, t3 + LDF [X + 6 * SIZE], a7 + FADD c4, t4, c4 + FMUL a8, b8, t4 + LDF [X + 7 * SIZE], a8 + + add X, 8 * SIZE, X + bg,pt %icc, .LL11 + LDF [Y - 1 * SIZE], b8 + +.LL12: + FADD c1, t1, c1 + FMUL a1, b1, t1 + FADD c2, t2, c2 + FMUL a2, b1, t2 + + FADD c3, t3, c3 + FMUL a1, b2, t3 + FADD c4, t4, c4 + FMUL a2, b2, t4 + + FADD c1, t1, c1 + FMUL a3, b3, t1 + FADD c2, t2, c2 + FMUL a4, b3, t2 + + FADD c3, t3, c3 + FMUL a3, b4, t3 + FADD c4, t4, c4 + FMUL a4, b4, t4 + + FADD c1, t1, c1 + FMUL a5, b5, t1 + FADD c2, t2, c2 + FMUL a6, b5, t2 + + FADD c3, t3, c3 + FMUL a5, b6, t3 + FADD c4, t4, c4 + FMUL a6, b6, t4 + + FADD c1, t1, c1 + FMUL a7, b7, t1 + FADD c2, t2, c2 + FMUL a8, b7, t2 + + FADD c3, t3, c3 + FMUL a7, b8, t3 + FADD c4, t4, c4 + FMUL a8, b8, t4 + +.LL15: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add X, 2 * SIZE, X + LDF [Y + 0 * SIZE], b1 + LDF [Y + 1 * SIZE], b2 + add Y, 2 * SIZE, Y + + FADD c1, t1, c1 + FMUL a1, b1, t1 + FADD c2, t2, c2 + FMUL a2, b1, t2 + FADD c3, t3, c3 + FMUL a1, b2, t3 + FADD c4, t4, c4 + FMUL a2, b2, t4 + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL16 + nop + +.LL19: + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c3, t3, c3 + FADD c4, t4, c4 + +#ifndef CONJ + FSUB c1, c4, c1 + FADD c2, c3, c2 +#else + FADD c1, c4, c1 + FSUB c3, c2, c2 +#endif + +#if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) + STF c1, [OUT + 0 * SIZE] + STF c2, [OUT + 1 * SIZE] +#endif + return %i7 + 8 + clr %g0 +.LL50: +#ifdef F_INTERFACE + cmp INCX, 0 + bge .LL41 + sub N, 1, I + + smul I, INCX, I + sub X, I, X + +.LL41: + cmp INCY, 0 + bge .LL42 + sub N, 1, I + + smul I, INCY, I + sub Y, I, Y + +.LL42: +#endif + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + LDF [X + 0 * SIZE], a7 + LDF [X + 1 * SIZE], a8 + add X, INCX, X + + LDF [Y + 0 * SIZE], b1 + LDF [Y + 1 * SIZE], b2 + add Y, INCY, Y + LDF [Y + 0 * SIZE], b3 + LDF [Y + 1 * SIZE], b4 + add Y, INCY, Y + LDF [Y + 0 * SIZE], b5 + LDF [Y + 1 * SIZE], b6 + add Y, INCY, Y + LDF [Y + 0 * SIZE], b7 + LDF [Y + 1 * SIZE], b8 + add Y, INCY, Y + + add I, -1, I + cmp I, 0 + ble,pt %icc, .LL52 + +.LL51: + FADD c1, t1, c1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + add I, -1, I + FMUL a1, b1, t1 + prefetch [Y + PREFETCHSIZE * SIZE], 0 + + FADD c2, t2, c2 + cmp I, 0 + FMUL a2, b1, t2 + LDF [Y + 0 * SIZE], b1 + + FADD c3, t3, c3 + FMUL a1, b2, t3 + LDF [X + 0 * SIZE], a1 + FADD c4, t4, c4 + FMUL a2, b2, t4 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + + FADD c1, t1, c1 + FMUL a3, b3, t1 + LDF [Y + 1 * SIZE], b2 + add Y, INCY, Y + FADD c2, t2, c2 + FMUL a4, b3, t2 + LDF [Y + 0 * SIZE], b3 + + FADD c3, t3, c3 + FMUL a3, b4, t3 + LDF [X + 0 * SIZE], a3 + FADD c4, t4, c4 + FMUL a4, b4, t4 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + + FADD c1, t1, c1 + FMUL a5, b5, t1 + LDF [Y + 1 * SIZE], b4 + add Y, INCY, Y + FADD c2, t2, c2 + FMUL a6, b5, t2 + LDF [Y + 0 * SIZE], b5 + + FADD c3, t3, c3 + FMUL a5, b6, t3 + LDF [X + 0 * SIZE], a5 + FADD c4, t4, c4 + FMUL a6, b6, t4 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + + FADD c1, t1, c1 + FMUL a7, b7, t1 + LDF [Y + 1 * SIZE], b6 + add Y, INCY, Y + FADD c2, t2, c2 + FMUL a8, b7, t2 + LDF [Y + 0 * SIZE], b7 + + FADD c3, t3, c3 + FMUL a7, b8, t3 + LDF [X + 0 * SIZE], a7 + FADD c4, t4, c4 + FMUL a8, b8, t4 + LDF [X + 1 * SIZE], a8 + add X, INCX, X + + LDF [Y + 1 * SIZE], b8 + bg,pt %icc, .LL51 + add Y, INCY, Y + +.LL52: + FADD c1, t1, c1 + FMUL a1, b1, t1 + FADD c2, t2, c2 + FMUL a2, b1, t2 + + FADD c3, t3, c3 + FMUL a1, b2, t3 + FADD c4, t4, c4 + FMUL a2, b2, t4 + + FADD c1, t1, c1 + FMUL a3, b3, t1 + FADD c2, t2, c2 + FMUL a4, b3, t2 + + FADD c3, t3, c3 + FMUL a3, b4, t3 + FADD c4, t4, c4 + FMUL a4, b4, t4 + + FADD c1, t1, c1 + FMUL a5, b5, t1 + FADD c2, t2, c2 + FMUL a6, b5, t2 + + FADD c3, t3, c3 + FMUL a5, b6, t3 + FADD c4, t4, c4 + FMUL a6, b6, t4 + + FADD c1, t1, c1 + FMUL a7, b7, t1 + FADD c2, t2, c2 + FMUL a8, b7, t2 + + FADD c3, t3, c3 + FMUL a7, b8, t3 + FADD c4, t4, c4 + FMUL a8, b8, t4 + +.LL55: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + LDF [Y + 0 * SIZE], b1 + LDF [Y + 1 * SIZE], b2 + add Y, INCY, Y + + FADD c1, t1, c1 + FMUL a1, b1, t1 + FADD c2, t2, c2 + FMUL a2, b1, t2 + FADD c3, t3, c3 + FMUL a1, b2, t3 + FADD c4, t4, c4 + FMUL a2, b2, t4 + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL56 + nop + +.LL59: + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c3, t3, c3 + FADD c4, t4, c4 + +#ifndef CONJ + FSUB c1, c4, c1 + FADD c2, c3, c2 +#else + FADD c1, c4, c1 + FSUB c3, c2, c2 +#endif + +#if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) + STF c1, [OUT + 0 * SIZE] + STF c2, [OUT + 1 * SIZE] +#endif + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/zgemm_kernel.S b/kernel/sparc/zgemm_kernel.S new file mode 100644 index 0000000..b02c942 --- /dev/null +++ b/kernel/sparc/zgemm_kernel.S @@ -0,0 +1,1917 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 +#define K %i2 +#define A %i5 +#define B %i3 +#define C %i4 + +#define LDC %o0 +#define AO %o1 +#define BO %o2 +#define I %o3 +#define J %o4 +#define L %o5 + +#define C1 %l0 +#define C2 %l1 + +#define OFFSET %l2 +#define KK %l3 +#define TEMP1 %l4 +#define TEMP2 %l5 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define t1 %f32 +#define t2 %f34 +#define t3 %f36 +#define t4 %f38 + +#define a1 %f40 +#define a2 %f42 +#define a3 %f44 +#define a4 %f46 +#define a5 %f62 + +#define b1 %f48 +#define b2 %f50 +#define b3 %f52 +#define b4 %f54 +#define b5 %f56 + +#define FZERO %f58 +#define ALPHA_R %f60 +#define ALPHA_I %f62 + +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define t1 %f16 +#define t2 %f17 +#define t3 %f18 +#define t4 %f19 + +#define a1 %f20 +#define a2 %f21 +#define a3 %f22 +#define a4 %f23 +#define a5 %f31 + +#define b1 %f24 +#define b2 %f25 +#define b3 %f26 +#define b4 %f27 +#define b5 %f28 + +#define FZERO %f29 +#define ALPHA_R %f30 +#define ALPHA_I %f31 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define FADD1 FADD +#define FADD2 FADD +#define FADD3 FADD +#define FADD4 FSUB +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define FADD1 FADD +#define FADD2 FADD +#define FADD3 FSUB +#define FADD4 FADD +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define FADD1 FADD +#define FADD2 FSUB +#define FADD3 FADD +#define FADD4 FADD +#else +#define FADD1 FADD +#define FADD2 FSUB +#define FADD3 FSUB +#define FADD4 FSUB +#endif + + +#define APREFETCHSIZE 40 +#define BPREFETCHSIZE 40 + +#define APREFETCH_CATEGORY 0 +#define BPREFETCH_CATEGORY 0 + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE +#define STACK_ALPHA [%sp + STACK_START + 24] +#else +#define STACK_ALPHA [%sp + STACK_START + 20] +#endif +#else +#define STACK_ALPHA [%sp + STACK_START + 40] +#endif + +#ifndef __64BIT__ +#ifdef DOUBLE + st %i3, [%sp + STACK_START + 16] + st %i4, [%sp + STACK_START + 20] + st %i5, [%sp + STACK_START + 24] + + ld [%sp + STACK_START + 32], A + ld [%sp + STACK_START + 36], B + ld [%sp + STACK_START + 40], C + ld [%sp + STACK_START + 44], LDC +#ifdef TRMMKERNEL + ld [%sp + STACK_START + 48], OFFSET +#endif + ldd [%sp + STACK_START + 16], ALPHA_R + ldd [%sp + STACK_START + 24], ALPHA_I +#else + st %i3, [%sp + STACK_START + 16] + st %i4, [%sp + STACK_START + 20] + + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC +#ifdef TRMMKERNEL + ld [%sp + STACK_START + 40], OFFSET +#endif + ld [%sp + STACK_START + 16], ALPHA_R + ld [%sp + STACK_START + 20], ALPHA_I +#endif +#else + +#ifdef DOUBLE + FMOV %f6, ALPHA_R + FMOV %f8, ALPHA_I + STF %f8, STACK_ALPHA +#else + FMOV %f7, ALPHA_R + FMOV %f9, ALPHA_I + STF %f9, STACK_ALPHA +#endif + + ldx [%sp+ STACK_START + 56], B + nop + ldx [%sp+ STACK_START + 64], C + nop + ldx [%sp+ STACK_START + 72], LDC +#ifdef TRMMKERNEL + ldx [%sp+ STACK_START + 80], OFFSET +#endif + + LDF [%sp + STACK_START + 32], FZERO +#endif + +#ifdef DOUBLE + FCLR(27) +#else + FCLR(29) +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg OFFSET, KK +#endif + + sra N, 1, J + cmp J, 0 + ble,pn %icc, .LL100 + sll LDC, ZBASE_SHIFT, LDC + +.LL11: + sra M, 1, I + FMOV FZERO, t1 + add C, LDC, C2 + FMOV FZERO, t2 + + mov C, C1 + FMOV FZERO, t3 + cmp I, 0 + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + mov A, AO + add C2, LDC, C + nop + ble,pn %icc, .LL50 + FMOV FZERO, t4 + + +.LL21: +#if !defined(TRMMKERNEL) + sra K, 2, L + FMOV FZERO, c01 + cmp L, 0 + FMOV FZERO, c02 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c03 + LDF [B + 0 * SIZE], b1 + FMOV FZERO, c04 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c05 + LDF [B + 1 * SIZE], b2 + FMOV FZERO, c06 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c07 + LDF [B + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c09 + LDF [B + 3 * SIZE], b4 + FMOV FZERO, c10 + + LDF [B + 4 * SIZE], b5 + FMOV FZERO, c11 + LDF [AO + 4 * SIZE], a5 + FMOV FZERO, c12 + + prefetch [C1 + 3 * SIZE], 3 + FMOV FZERO, c13 + prefetch [C2 + 3 * SIZE], 3 + FMOV FZERO, c14 + mov B, BO + +#else +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO +#else + sll KK, 1 + ZBASE_SHIFT, TEMP1 + + add AO, TEMP1, AO + add B, TEMP1, BO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 2, L +#endif + sra L, 2, L + cmp L, 0 + + FMOV FZERO, c01 + FMOV FZERO, c02 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c03 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c04 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c05 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c06 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c07 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c09 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c10 + + LDF [BO + 4 * SIZE], b5 + FMOV FZERO, c11 + LDF [AO + 4 * SIZE], a5 + FMOV FZERO, c12 + + prefetch [C1 + 3 * SIZE], 3 + FMOV FZERO, c13 + prefetch [C2 + 3 * SIZE], 3 + FMOV FZERO, c14 + +#endif + FMOV FZERO, c15 + ble,pn %icc, .LL25 + FMOV FZERO, c16 + +.LL22: + FADD2 c04, t1, c04 + prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY + FMUL a1, b1, t1 + nop + + FADD4 c08, t2, c08 + prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY + FMUL a1, b2, t2 + add AO, 16 * SIZE, AO + + FADD2 c12, t3, c12 + LDF [AO - 13 * SIZE], a4 + FMUL a1, b3, t3 + add BO, 16 * SIZE, BO + + FADD4 c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 8 * SIZE], a1 + + FADD1 c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + add L, -1, L + FMUL a2, b4, t4 + LDF [AO - 11 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 10 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 10 * SIZE], b3 + + FADD3 c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD2 c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 9 * SIZE], a4 + + FADD4 c08, t2, c08 + nop + FMUL a5, b2, t2 + nop + + FADD2 c12, t3, c12 + nop + FMUL a5, b3, t3 + nop + + FADD4 c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO - 4 * SIZE], a5 + + FADD1 c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 6 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b5, t1 + LDF [BO - 4 * SIZE], b5 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + + FADD3 c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD2 c04, t1, c04 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD4 c08, t2, c08 + nop + FMUL a1, b2, t2 + nop + + FADD2 c12, t3, c12 + nop + FMUL a1, b3, t3 + nop + + FADD4 c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 0 * SIZE], a1 + + FADD1 c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + +#ifdef DOUBLE + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY +#else + nop +#endif + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + nop + + FADD2 c02, t1, c02 + nop + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + + FADD4 c06, t2, c06 +#ifdef DOUBLE + prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY +#else + nop +#endif + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 0 * SIZE], b1 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD3 c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD2 c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 1 * SIZE], a4 + + FADD4 c08, t2, c08 + FMUL a5, b2, t2 + FADD2 c12, t3, c12 + FMUL a5, b3, t3 + + FADD4 c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO + 4 * SIZE], a5 + + FADD1 c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD1 c03, t1, c03 + cmp L, 0 + FMUL a4, b5, t1 + LDF [BO + 4 * SIZE], b5 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD3 c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL22 + LDF [BO + 3 * SIZE], b4 + +.LL25: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 2, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,pn %icc, .LL29 + LDF STACK_ALPHA, ALPHA_I + +.LL26: + FADD2 c04, t1, c04 + LDF [AO + 3 * SIZE], a4 + FMUL a1, b1, t1 + add AO, 4 * SIZE, AO + + FADD4 c08, t2, c08 + add BO, 4 * SIZE, BO + FMUL a1, b2, t2 + add L, -1, L + + FADD2 c12, t3, c12 + nop + FMUL a1, b3, t3 + cmp L, 0 + + FADD4 c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD1 c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD3 c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL26 + LDF [BO + 3 * SIZE], b4 + +.LL29: +#ifndef TRMMKERNEL + FADD2 c04, t1, c04 + LDF [C1 + 0 * SIZE], a1 + FADD4 c08, t2, c08 + LDF [C1 + 1 * SIZE], a2 + FADD2 c12, t3, c12 + LDF [C1 + 2 * SIZE], a3 + FADD4 c16, t4, c16 + LDF [C1 + 3 * SIZE], a4 + + FADD c01, c06, c01 + LDF [C2 + 0 * SIZE], b1 + FADD c02, c05, c02 + LDF [C2 + 1 * SIZE], b2 + FADD c03, c08, c03 + LDF [C2 + 2 * SIZE], b3 + FADD c04, c07, c04 + LDF [C2 + 3 * SIZE], b4 + + FADD c09, c14, c09 + FMUL ALPHA_R, c01, t1 + FADD c10, c13, c10 + FMUL ALPHA_R, c02, t2 + FADD c11, c16, c11 + FMUL ALPHA_R, c03, t3 + FADD c12, c15, c12 + FMUL ALPHA_R, c04, t4 + + FADD a1, t1, a1 + FMUL ALPHA_I, c02, t1 + FADD a2, t2, a2 + FMUL ALPHA_I, c01, t2 + FADD a3, t3, a3 + FMUL ALPHA_I, c04, t3 + FADD a4, t4, a4 + FMUL ALPHA_I, c03, t4 + + FSUB a1, t1, a1 + FMUL ALPHA_R, c09, t1 + FADD a2, t2, a2 + FMUL ALPHA_R, c10, t2 + FSUB a3, t3, a3 + FMUL ALPHA_R, c11, t3 + FADD a4, t4, a4 + FMUL ALPHA_R, c12, t4 + + FADD b1, t1, b1 + FMUL ALPHA_I, c10, t1 + FADD b2, t2, b2 + FMUL ALPHA_I, c09, t2 + FADD b3, t3, b3 + FMUL ALPHA_I, c12, t3 + FADD b4, t4, b4 + FMUL ALPHA_I, c11, t4 + + STF a1, [C1 + 0 * SIZE] + FSUB b1, t1, b1 + STF a2, [C1 + 1 * SIZE] + FADD b2, t2, b2 + STF a3, [C1 + 2 * SIZE] + FSUB b3, t3, b3 + STF a4, [C1 + 3 * SIZE] + FADD b4, t4, b4 + + STF b1, [C2 + 0 * SIZE] + FMOV FZERO, t1 + STF b2, [C2 + 1 * SIZE] + FMOV FZERO, t2 + STF b3, [C2 + 2 * SIZE] + FMOV FZERO, t3 + STF b4, [C2 + 3 * SIZE] + FMOV FZERO, t4 +#else + FADD2 c04, t1, c04 + FADD4 c08, t2, c08 + FADD2 c12, t3, c12 + FADD4 c16, t4, c16 + + FADD c01, c06, c01 + FADD c02, c05, c02 + FADD c03, c08, c03 + FADD c04, c07, c04 + + STF c01, [C1 + 0 * SIZE] + FADD c09, c14, c09 + STF c02, [C1 + 1 * SIZE] + FADD c10, c13, c10 + STF c03, [C1 + 2 * SIZE] + FADD c11, c16, c11 + STF c04, [C1 + 3 * SIZE] + FADD c12, c15, c12 + + STF c09, [C2 + 0 * SIZE] + FMOV FZERO, t1 + STF c10, [C2 + 1 * SIZE] + FMOV FZERO, t2 + STF c11, [C2 + 2 * SIZE] + FMOV FZERO, t3 + STF c12, [C2 + 3 * SIZE] + FMOV FZERO, t4 +#endif + + add C1, 4 * SIZE, C1 + add C2, 4 * SIZE, C2 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -2, TEMP1 +#else + add TEMP1, -2, TEMP1 +#endif + sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 + + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 2, KK +#endif +#endif + + add I, -1, I + cmp I, 0 + + bg,pt %icc, .LL21 + FMOV FZERO, c01 + +.LL50: + and M, 1, I + FMOV FZERO, c02 + cmp I, 0 + FMOV FZERO, t1 + ble,pn %icc, .LL99 + FMOV FZERO, c04 + + +#if !defined(TRMMKERNEL) + LDF [AO + 0 * SIZE], a1 + sra K, 2, L + FMOV FZERO, t2 + LDF [B + 0 * SIZE], b1 + mov B, BO + FMOV FZERO, c06 + LDF [AO + 1 * SIZE], a2 + cmp L, 0 + FMOV FZERO, t3 + LDF [B + 1 * SIZE], b2 + FMOV FZERO, c08 + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t4 + LDF [B + 2 * SIZE], b3 + FMOV FZERO, c01 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c03 + LDF [B + 3 * SIZE], b4 + FMOV FZERO, c05 +#else +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO +#else + sll KK, 0 + ZBASE_SHIFT, TEMP1 + sll KK, 1 + ZBASE_SHIFT, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 2, L +#endif + sra L, 2, L + cmp L, 0 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t2 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c06 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t3 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c08 + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t4 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c01 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c03 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c05 + +#endif + ble,pn %icc, .LL55 + FMOV FZERO, c07 + +.LL52: + FADD2 c02, t1, c02 + add AO, 8 * SIZE, AO + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FMUL a1, b1, t1 + add BO, 16 * SIZE, BO + + FADD4 c04, t2, c04 + add L, -1, L + FMUL a1, b2, t2 + + FADD2 c06, t3, c06 + cmp L, 0 + FMUL a1, b3, t3 + + FADD4 c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO - 4 * SIZE], a1 + + FADD1 c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 12 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 10 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD2 c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + FADD4 c04, t2, c04 + FMUL a3, b2, t2 + + FADD2 c06, t3, c06 + FMUL a3, b3, t3 + FADD4 c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD1 c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD2 c02, t1, c02 + FMUL a1, b1, t1 + LDF [AO - 1 * SIZE], a4 + FADD4 c04, t2, c04 + FMUL a1, b2, t2 + + FADD2 c06, t3, c06 + FMUL a1, b3, t3 + FADD4 c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD1 c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 4 * SIZE], b1 + + FADD3 c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 2 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD2 c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO + 1 * SIZE], a2 + FADD4 c04, t2, c04 + FMUL a3, b2, t2 + + FADD2 c06, t3, c06 + FMUL a3, b3, t3 + FADD4 c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD1 c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL52 + LDF [AO + 3 * SIZE], a4 + +.LL55: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 2, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + FADD2 c02, t1, c02 + add AO, 2 * SIZE, AO + FMUL a1, b1, t1 + add L, -1, L + + add BO, 4 * SIZE, BO + FADD4 c04, t2, c04 + cmp L, 0 + FMUL a1, b2, t2 + + FADD2 c06, t3, c06 + FMUL a1, b3, t3 + FADD4 c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD1 c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL56 + LDF [AO + 1 * SIZE], a2 + +.LL59: +#ifndef TRMMKERNEL + FADD2 c02, t1, c02 + LDF [C1 + 0 * SIZE], a1 + FADD4 c04, t2, c04 + LDF [C1 + 1 * SIZE], a2 + FADD2 c06, t3, c06 + LDF [C2 + 0 * SIZE], a3 + FADD4 c08, t4, c08 + LDF [C2 + 1 * SIZE], a4 + + FADD c01, c04, c01 + FMUL ALPHA_R, c01, t1 + FADD c02, c03, c02 + FMUL ALPHA_R, c02, t2 + FADD c05, c08, c05 + FMUL ALPHA_R, c05, t3 + FADD c06, c07, c06 + FMUL ALPHA_R, c06, t4 + + FADD a1, t1, a1 + FMUL ALPHA_I, c02, t1 + FADD a2, t2, a2 + FMUL ALPHA_I, c01, t2 + FADD a3, t3, a3 + FMUL ALPHA_I, c06, t3 + FADD a4, t4, a4 + FMUL ALPHA_I, c05, t4 + + FSUB a1, t1, a1 + FADD a2, t2, a2 + FSUB a3, t3, a3 + FADD a4, t4, a4 + + STF a1, [C1 + 0 * SIZE] + FMOV FZERO, t1 + STF a2, [C1 + 1 * SIZE] + FMOV FZERO, t2 + STF a3, [C2 + 0 * SIZE] + FMOV FZERO, t3 + STF a4, [C2 + 1 * SIZE] + FMOV FZERO, t4 +#else + FADD2 c02, t1, c02 + FADD4 c04, t2, c04 + FADD2 c06, t3, c06 + FADD4 c08, t4, c08 + + FADD c01, c04, c01 + FADD c02, c03, c02 + FADD c05, c08, c05 + FADD c06, c07, c06 + + STF c01, [C1 + 0 * SIZE] + FMOV FZERO, t1 + STF c02, [C1 + 1 * SIZE] + FMOV FZERO, t2 + STF c05, [C2 + 0 * SIZE] + FMOV FZERO, t3 + STF c06, [C2 + 1 * SIZE] + FMOV FZERO, t4 +#endif + + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -1, TEMP1 +#else + add TEMP1, -2, TEMP1 +#endif + sll TEMP1, 0 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 1, KK +#endif +#endif + +.LL99: + add J, -1, J + mov BO, B + cmp J, 0 + bg,pt %icc, .LL11 +#if defined(TRMMKERNEL) && !defined(LEFT) + add KK, 2, KK +#else + nop +#endif + +.LL100: + sra M, 1, I + and N, 1, J + + cmp J, 0 + ble,pn %icc, .LL999 + mov A, AO + + mov C, C1 + add C, LDC, C + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + cmp I, 0 + ble,pn %icc, .LL150 + FMOV FZERO, c03 + +.LL121: +#if !defined(TRMMKERNEL) + LDF [AO + 0 * SIZE], a1 + sra K, 2, L + FMOV FZERO, t1 + LDF [B + 0 * SIZE], b1 + mov B, BO + FMOV FZERO, c07 + + LDF [AO + 1 * SIZE], a2 + cmp L, 0 + FMOV FZERO, t2 + LDF [B + 1 * SIZE], b2 + FMOV FZERO, c04 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t3 + LDF [B + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, t4 + LDF [B + 3 * SIZE], b4 + FMOV FZERO, c01 + + prefetch [C1 + 3 * SIZE], 3 + FMOV FZERO, c05 + FMOV FZERO, c02 +#else +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO +#else + sll KK, 1 + ZBASE_SHIFT, TEMP1 + sll KK, 0 + ZBASE_SHIFT, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 1, L +#endif + sra L, 2, L + cmp L, 0 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t1 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c07 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t2 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c04 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t3 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, t4 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c01 + + prefetch [C1 + 3 * SIZE], 3 + FMOV FZERO, c05 + FMOV FZERO, c02 +#endif + + ble,pn %icc, .LL125 + FMOV FZERO, c06 + +.LL122: + FADD1 c03, t1, c03 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD3 c07, t2, c07 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD2 c04, t3, c04 + add AO, 16 * SIZE, AO + FMUL a2, b1, t3 + cmp L, 0 + + FADD4 c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 11 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 10 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD4 c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO - 3 * SIZE], b2 + + FADD1 c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 9 * SIZE], a4 + + FADD3 c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO - 8 * SIZE], a1 + + FADD2 c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD4 c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO - 6 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD4 c06, t4, c06 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD1 c03, t1, c03 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD3 c07, t2, c07 + nop + FMUL a1, b2, t2 + LDF [AO - 4 * SIZE], a1 + + FADD2 c04, t3, c04 + nop + FMUL a2, b1, t3 + nop + + FADD4 c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 3 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 2 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD4 c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + + FADD1 c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 1 * SIZE], a4 + + FADD3 c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO + 0 * SIZE], a1 + + FADD2 c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD4 c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO + 2 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD4 c06, t4, c06 + FMUL a4, b4, t4 + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL122 + LDF [BO + 3 * SIZE], b4 + +.LL125: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 1, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL129 + nop + +.LL126: + FADD1 c03, t1, c03 + add AO, 4 * SIZE, AO + FMUL a1, b1, t1 + add BO, 2 * SIZE, BO + + FADD3 c07, t2, c07 + add L, -1, L + FMUL a1, b2, t2 + LDF [AO + 0 * SIZE], a1 + + FADD2 c04, t3, c04 + cmp L, 0 + FMUL a2, b1, t3 + + FADD4 c08, t4, c08 + FMUL a2, b2, t4 + LDF [AO + 1 * SIZE], a2 + + FADD1 c01, t1, c01 + FMUL a3, b1, t1 + FADD3 c05, t2, c05 + FMUL a3, b2, t2 + LDF [AO + 2 * SIZE], a3 + + FADD2 c02, t3, c02 + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + FADD4 c06, t4, c06 + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + bg,pt %icc, .LL126 + LDF [AO + 3 * SIZE], a4 + +.LL129: +#ifndef TRMMKERNEL + FADD1 c03, t1, c03 + LDF [C1 + 0 * SIZE], a1 + FADD3 c07, t2, c07 + LDF [C1 + 1 * SIZE], a2 + FADD2 c04, t3, c04 + LDF [C1 + 2 * SIZE], a3 + FADD4 c08, t4, c08 + LDF [C1 + 3 * SIZE], a4 + + FADD c01, c06, c01 + FMUL ALPHA_R, c01, t1 + FADD c02, c05, c02 + FMUL ALPHA_R, c02, t2 + FADD c03, c08, c03 + FMUL ALPHA_R, c03, t3 + FADD c04, c07, c04 + FMUL ALPHA_R, c04, t4 + + FADD a1, t1, a1 + FMUL ALPHA_I, c02, t1 + FADD a2, t2, a2 + FMUL ALPHA_I, c01, t2 + FADD a3, t3, a3 + FMUL ALPHA_I, c04, t3 + FADD a4, t4, a4 + FMUL ALPHA_I, c03, t4 + + FSUB a1, t1, a1 + FADD a2, t2, a2 + FSUB a3, t3, a3 + FADD a4, t4, a4 + + STF a1, [C1 + 0 * SIZE] + FMOV FZERO, t1 + STF a2, [C1 + 1 * SIZE] + FMOV FZERO, t2 + STF a3, [C1 + 2 * SIZE] + FMOV FZERO, t3 + STF a4, [C1 + 3 * SIZE] + FMOV FZERO, t4 +#else + FADD1 c03, t1, c03 + FADD3 c07, t2, c07 + FADD2 c04, t3, c04 + FADD4 c08, t4, c08 + + FADD c01, c06, c01 + FADD c02, c05, c02 + FADD c03, c08, c03 + FADD c04, c07, c04 + + STF c01, [C1 + 0 * SIZE] + FMOV FZERO, t1 + STF c02, [C1 + 1 * SIZE] + FMOV FZERO, t2 + STF c03, [C1 + 2 * SIZE] + FMOV FZERO, t3 + STF c04, [C1 + 3 * SIZE] + FMOV FZERO, t4 +#endif + + add C1, 4 * SIZE, C1 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -2, TEMP1 +#else + add TEMP1, -1, TEMP1 +#endif + + sll TEMP1, 1 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 2, KK +#endif +#endif + + add I, -1, I + cmp I, 0 + + bg,pt %icc, .LL121 + FMOV FZERO, c03 + +.LL150: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL999 + nop + +#if !defined(TRMMKERNEL) + LDF [AO + 0 * SIZE], a1 + sra K, 2, L + FMOV FZERO, c01 + + LDF [B + 0 * SIZE], b1 + mov B, BO + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + cmp L, 0 + FMOV FZERO, c02 + LDF [B + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [B + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [B + 3 * SIZE], b4 + FMOV FZERO, t4 +#else +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO +#else + sll KK, 0 + ZBASE_SHIFT, TEMP1 + sll KK, 0 + ZBASE_SHIFT, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 1, L +#endif + sra L, 2, L + cmp L, 0 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 +#endif + + ble,pn %icc, .LL155 + nop + +.LL152: + FADD1 c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD3 c02, t2, c02 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD2 c03, t3, c03 + cmp L, 0 + FMUL a2, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD4 c04, t4, c04 + nop + FMUL a2, b2, t4 + LDF [AO + 5 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b3, t1 + LDF [BO - 3 * SIZE], b2 + + FADD3 c02, t2, c02 + nop + FMUL a3, b4, t2 + LDF [AO + 6 * SIZE], a3 + + FADD2 c03, t3, c03 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD4 c04, t4, c04 + nop + FMUL a4, b4, t4 + LDF [AO + 7 * SIZE], a4 + + FADD1 c01, t1, c01 + nop + FMUL a1, b1, t1 + LDF [BO - 1 * SIZE], b4 + + FADD3 c02, t2, c02 + FMUL a1, b2, t2 + LDF [AO + 8 * SIZE], a1 + + FADD2 c03, t3, c03 + FMUL a2, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD4 c04, t4, c04 + FMUL a2, b2, t4 + LDF [AO + 9 * SIZE], a2 + + FADD1 c01, t1, c01 + FMUL a3, b3, t1 + LDF [BO + 1 * SIZE], b2 + + FADD3 c02, t2, c02 + FMUL a3, b4, t2 + LDF [AO + 10 * SIZE], a3 + + FADD2 c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD4 c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + bg,pt %icc, .LL152 + LDF [BO + 3 * SIZE], b4 + +.LL155: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 1, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL159 + nop + +.LL156: + FADD1 c01, t1, c01 + add AO, 2 * SIZE, AO + FMUL a1, b1, t1 + add BO, 2 * SIZE, BO + FADD3 c02, t2, c02 + add L, -1, L + FMUL a1, b2, t2 + LDF [AO + 0 * SIZE], a1 + FADD2 c03, t3, c03 + FMUL a2, b1, t3 + LDF [BO + 0 * SIZE], b1 + cmp L, 0 + FADD4 c04, t4, c04 + FMUL a2, b2, t4 + LDF [BO + 1 * SIZE], b2 + + bg,pt %icc, .LL156 + LDF [AO + 1 * SIZE], a2 + +.LL159: +#ifndef TRMMKERNEL + FADD1 c01, t1, c01 + FADD3 c02, t2, c02 + FADD2 c03, t3, c03 + FADD4 c04, t4, c04 + + LDF [C1 + 0 * SIZE], a1 + LDF [C1 + 1 * SIZE], a2 + + FADD c01, c04, c01 + FADD c02, c03, c02 + + FMUL ALPHA_R, c01, t1 + FMUL ALPHA_R, c02, t2 + FMUL ALPHA_I, c02, t3 + FMUL ALPHA_I, c01, t4 + + FADD a1, t1, a1 + FADD a2, t2, a2 + FSUB a1, t3, a1 + FADD a2, t4, a2 + + STF a1, [C1 + 0 * SIZE] + STF a2, [C1 + 1 * SIZE] +#else + FADD1 c01, t1, c01 + FADD3 c02, t2, c02 + FADD2 c03, t3, c03 + FADD4 c04, t4, c04 + + FADD c01, c04, c01 + FADD c02, c03, c02 + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] +#endif + + add C1, 2 * SIZE, C1 + +#ifndef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -1, TEMP1 +#else + add TEMP1, -1, TEMP1 +#endif + sll TEMP1, 0 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 1, KK +#endif +#endif + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/zgemm_kernel_1x4.S b/kernel/sparc/zgemm_kernel_1x4.S new file mode 100644 index 0000000..03397fd --- /dev/null +++ b/kernel/sparc/zgemm_kernel_1x4.S @@ -0,0 +1,1599 @@ +/*********************************************************************/ +/* Copyright 2005-2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define APREFETCHSIZE 24 +#define APREFETCH_CATEGORY 0 + +#define M %i0 +#define N %i1 +#define K %i2 +#define A %i5 +#define B %i3 +#define C %i4 + +#define LDC %o0 +#define AO %o1 +#define BO %o2 +#define I %o3 +#define J %o4 +#define L %o5 + +#define BB %o7 + +#define C1 %l0 +#define C2 %l1 +#define C3 %l2 +#define C4 %l3 + +#define OFFSET %l4 +#define KK %l5 +#define TEMP1 %l6 +#define TEMP2 %l7 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define a1 %f32 +#define a2 %f34 +#define a3 %f36 +#define a4 %f38 +#define a5 %f40 + +#define b1 %f42 +#define b2 %f44 +#define b3 %f46 +#define b4 %f48 +#define b5 %f50 +#define b6 %f52 +#define b7 %f54 +#define b8 %f56 +#define b9 %f58 + +#define ALPHA_R %f60 +#define ALPHA_I %f62 + +#define cc01 0 +#define cc02 2 +#define cc03 4 +#define cc04 6 +#define cc05 8 +#define cc06 10 +#define cc07 12 +#define cc08 14 +#define cc09 16 +#define cc10 18 +#define cc11 20 +#define cc12 22 +#define cc13 24 +#define cc14 26 +#define cc15 28 +#define cc16 30 + +#define aa1 1 +#define aa2 3 +#define aa3 5 +#define aa4 7 +#define aa5 9 + +#define bb1 11 +#define bb2 13 +#define bb3 15 +#define bb4 17 +#define bb5 19 +#define bb6 21 +#define bb7 23 +#define bb8 25 +#define bb9 27 + +#define alpha_r 29 +#define alpha_i 31 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define a1 %f16 +#define a2 %f17 +#define a3 %f18 +#define a4 %f19 +#define a5 %f20 + +#define b1 %f21 +#define b2 %f22 +#define b3 %f23 +#define b4 %f24 +#define b5 %f25 +#define b6 %f26 +#define b7 %f27 +#define b8 %f28 +#define b9 %f29 + +#define ALPHA_R %f30 +#define ALPHA_I %f31 + +#define cc01 0 +#define cc02 1 +#define cc03 2 +#define cc04 3 +#define cc05 4 +#define cc06 5 +#define cc07 6 +#define cc08 7 +#define cc09 8 +#define cc10 9 +#define cc11 10 +#define cc12 11 +#define cc13 12 +#define cc14 13 +#define cc15 14 +#define cc16 15 + +#define aa1 16 +#define aa2 17 +#define aa3 18 +#define aa4 19 +#define aa5 20 + +#define bb1 21 +#define bb2 22 +#define bb3 23 +#define bb4 24 +#define bb5 25 +#define bb6 26 +#define bb7 27 +#define bb8 28 +#define bb9 29 + +#define alpha_r 30 +#define alpha_i 31 + +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define FMADD1 FMADD +#define FMADD2 FMADD +#define FMADD3 FMADD +#define FMADD4 FNMSUB +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define FMADD1 FMADD +#define FMADD2 FMADD +#define FMADD3 FNMSUB +#define FMADD4 FMADD +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define FMADD1 FMADD +#define FMADD2 FNMSUB +#define FMADD3 FMADD +#define FMADD4 FMADD +#else +#define FMADD1 FMADD +#define FMADD2 FNMSUB +#define FMADD3 FNMSUB +#define FMADD4 FNMSUB +#endif + + .register %g2, #scratch + .register %g3, #scratch + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE + st %i3, [%sp + STACK_START + 16] + st %i4, [%sp + STACK_START + 20] + st %i5, [%sp + STACK_START + 24] + + ld [%sp + STACK_START + 32], A + ld [%sp + STACK_START + 36], B + ld [%sp + STACK_START + 40], C + ld [%sp + STACK_START + 44], LDC +#ifdef TRMMKERNEL + ld [%sp + STACK_START + 48], OFFSET +#endif + + ldd [%sp + STACK_START + 16], ALPHA_R + ldd [%sp + STACK_START + 24], ALPHA_I +#else + st %i3, [%sp + STACK_START + 16] + st %i4, [%sp + STACK_START + 20] + + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC +#ifdef TRMMKERNEL + ld [%sp + STACK_START + 40], OFFSET +#endif + + ld [%sp + STACK_START + 16], ALPHA_R + ld [%sp + STACK_START + 20], ALPHA_I +#endif +#else + ldx [%sp + STACK_START + 56], B + ldx [%sp + STACK_START + 64], C + ldx [%sp + STACK_START + 72], LDC +#ifdef TRMMKERNEL + ldx [%sp + STACK_START + 80], OFFSET +#endif + +#ifdef DOUBLE + FMOV %f6, ALPHA_R + FMOV %f8, ALPHA_I +#else + FMOV %f7, ALPHA_R + FMOV %f9, ALPHA_I +#endif +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg OFFSET, KK +#endif + + cmp M, 0 + ble,pn %icc, .LL999 + nop + + sra N, 2, J + cmp J, 0 + ble,pn %icc, .LL20 + sll LDC, ZBASE_SHIFT, LDC + +.LL11: + mov C, C1 + add C, LDC, C2 + add C2, LDC, C3 + add C3, LDC, C4 + add C4, LDC, C + + sll K, ZBASE_SHIFT + 2, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + mov A, AO + + mov M, I + add B, BB, BB + .align 4 + +.LL12: + prefetch [BB + 0 * SIZE], 1 +#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) + mov B, BO +#else + sll KK, ZBASE_SHIFT + 0, TEMP1 + sll KK, ZBASE_SHIFT + 2, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + FCLR (cc01) + LDF [AO + 1 * SIZE], a2 + FCLR (cc05) + LDF [AO + 8 * SIZE], a5 + FCLR (cc09) + LDF [BO + 0 * SIZE], b1 + FCLR (cc13) + + LDF [BO + 1 * SIZE], b2 + FCLR (cc02) + LDF [BO + 2 * SIZE], b3 + FCLR (cc06) + LDF [BO + 3 * SIZE], b4 + FCLR (cc10) + LDF [BO + 4 * SIZE], b5 + FCLR (cc14) + + LDF [BO + 5 * SIZE], b6 + FCLR (cc03) + LDF [BO + 6 * SIZE], b7 + FCLR (cc07) + LDF [BO + 7 * SIZE], b8 + FCLR (cc11) + LDF [BO + 8 * SIZE], b9 + FCLR (cc15) + + prefetch [C1 + 1 * SIZE], 3 + FCLR (cc04) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc08) + prefetch [C3 + 1 * SIZE], 3 + FCLR (cc12) + prefetch [C4 + 2 * SIZE], 3 + FCLR (cc16) + +#ifndef TRMMKERNEL + sra K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 4, L +#endif + sra L, 3, L +#endif + cmp L, 0 + ble,pn %icc, .LL15 + add BB, 32 * SIZE, BB + .align 4 + +.LL13: + FMADD1 (aa1, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa1, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa1, bb3, cc05, cc05) + LDF [BO + 16 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 9 * SIZE], b2 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + LDF [AO + 2 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 3 * SIZE], a4 + + FMADD3 (aa1, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa1, bb7, cc13, cc13) + LDF [BO + 12 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 13 * SIZE], b6 + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [BO + 14 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 15 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 24 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 17 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 18 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 19 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 4 * SIZE], a1 + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 5 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + add L, -1, L + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 20 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 21 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 22 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 23 * SIZE], b8 + + FMADD1 (aa1, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa1, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa1, bb3, cc05, cc05) + LDF [BO + 32 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 25 * SIZE], b2 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 26 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 27 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + LDF [AO + 6 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 7 * SIZE], a4 + + FMADD3 (aa1, bb6, cc11, cc11) + nop + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa1, bb7, cc13, cc13) + LDF [BO + 28 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 29 * SIZE], b6 + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [BO + 30 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 31 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 40 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 33 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 34 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 35 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 16 * SIZE], a1 /****/ + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 9 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + nop + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 36 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 37 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 38 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 39 * SIZE], b8 + + FMADD1 (aa5, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa5, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa5, bb3, cc05, cc05) + LDF [BO + 48 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 41 * SIZE], b2 + + FMADD3 (aa5, bb4, cc07, cc07) + LDF [BO + 42 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 43 * SIZE], b4 + + FMADD1 (aa5, bb5, cc09, cc09) + LDF [AO + 10 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 11 * SIZE], a4 + + FMADD3 (aa5, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa5, bb7, cc13, cc13) + LDF [BO + 44 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 45 * SIZE], b6 + + FMADD3 (aa5, bb8, cc15, cc15) + LDF [BO + 46 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 47 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 56 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 49 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 50 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 51 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 12 * SIZE], a5 + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 13 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + cmp L, 0 + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 52 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 53 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 54 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 55 * SIZE], b8 + + FMADD1 (aa5, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa5, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa5, bb3, cc05, cc05) + LDF [BO + 64 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 57 * SIZE], b2 + + FMADD3 (aa5, bb4, cc07, cc07) + LDF [BO + 58 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 59 * SIZE], b4 + + FMADD1 (aa5, bb5, cc09, cc09) + LDF [AO + 14 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 15 * SIZE], a4 + + FMADD3 (aa5, bb6, cc11, cc11) + add BO, 64 * SIZE, BO + FMADD4 (aa2, bb6, cc12, cc12) + add AO, 16 * SIZE, AO + + FMADD1 (aa5, bb7, cc13, cc13) + LDF [BO - 4 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO - 3 * SIZE], b6 + + FMADD3 (aa5, bb8, cc15, cc15) + LDF [BO - 2 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO - 1 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 8 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 1 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 8 * SIZE], a5 /****/ + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 1 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + FMADD4 (aa4, bb6, cc12, cc12) + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 4 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 5 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 6 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + ble,pn %icc, .LL15 + LDF [BO + 7 * SIZE], b8 + + FMADD1 (aa1, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa1, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa1, bb3, cc05, cc05) + LDF [BO + 16 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 9 * SIZE], b2 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + LDF [AO + 2 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 3 * SIZE], a4 + + FMADD3 (aa1, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa1, bb7, cc13, cc13) + LDF [BO + 12 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 13 * SIZE], b6 + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [BO + 14 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 15 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 24 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 17 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 18 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 19 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 4 * SIZE], a1 + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 5 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + add L, -1, L + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 20 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 21 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 22 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 23 * SIZE], b8 + + FMADD1 (aa1, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa1, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa1, bb3, cc05, cc05) + LDF [BO + 32 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 25 * SIZE], b2 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 26 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 27 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + LDF [AO + 6 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 7 * SIZE], a4 + + FMADD3 (aa1, bb6, cc11, cc11) + nop + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa1, bb7, cc13, cc13) + LDF [BO + 28 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 29 * SIZE], b6 + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [BO + 30 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 31 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 40 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 33 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 34 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 35 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 16 * SIZE], a1 /****/ + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 9 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + nop + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 36 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 37 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 38 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 39 * SIZE], b8 + + FMADD1 (aa5, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa5, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa5, bb3, cc05, cc05) + LDF [BO + 48 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 41 * SIZE], b2 + + FMADD3 (aa5, bb4, cc07, cc07) + LDF [BO + 42 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 43 * SIZE], b4 + + FMADD1 (aa5, bb5, cc09, cc09) + LDF [AO + 10 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 11 * SIZE], a4 + + FMADD3 (aa5, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa5, bb7, cc13, cc13) + LDF [BO + 44 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 45 * SIZE], b6 + + FMADD3 (aa5, bb8, cc15, cc15) + LDF [BO + 46 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 47 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 56 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 49 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 50 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 51 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 12 * SIZE], a5 + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 13 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + cmp L, 0 + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 52 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 53 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 54 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 55 * SIZE], b8 + + FMADD1 (aa5, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa5, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa5, bb3, cc05, cc05) + LDF [BO + 64 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 57 * SIZE], b2 + + FMADD3 (aa5, bb4, cc07, cc07) + LDF [BO + 58 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 59 * SIZE], b4 + + FMADD1 (aa5, bb5, cc09, cc09) + LDF [AO + 14 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 15 * SIZE], a4 + + FMADD3 (aa5, bb6, cc11, cc11) + add BO, 64 * SIZE, BO + FMADD4 (aa2, bb6, cc12, cc12) + add AO, 16 * SIZE, AO + + FMADD1 (aa5, bb7, cc13, cc13) + LDF [BO - 4 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO - 3 * SIZE], b6 + + FMADD3 (aa5, bb8, cc15, cc15) + LDF [BO - 2 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO - 1 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 8 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 1 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 8 * SIZE], a5 /****/ + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 1 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + FMADD4 (aa4, bb6, cc12, cc12) + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 4 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 5 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 6 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + bg,pt %icc, .LL13 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL15: +#ifndef TRMMKERNEL + and K, 7, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 4, L +#endif + and L, 7, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL18 + nop + .align 4 + +.LL17: + FMADD1 (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD2 (aa2, bb1, cc02, cc02) + nop + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [BO + 8 * SIZE], b1 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [BO + 9 * SIZE], b2 + + FMADD1 (aa1, bb3, cc05, cc05) + cmp L, 0 + FMADD2 (aa2, bb3, cc06, cc06) + nop + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + nop + FMADD2 (aa2, bb5, cc10, cc10) + nop + + FMADD3 (aa1, bb6, cc11, cc11) + LDF [BO + 12 * SIZE], b5 + FMADD4 (aa2, bb6, cc12, cc12) + LDF [BO + 13 * SIZE], b6 + + FMADD1 (aa1, bb7, cc13, cc13) + add AO, 2 * SIZE, AO + FMADD2 (aa2, bb7, cc14, cc14) + add BO, 8 * SIZE, BO + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [AO + 0 * SIZE], a1 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 6 * SIZE], b7 + bg,pt %icc, .LL17 + LDF [BO + 7 * SIZE], b8 + nop + .align 4 + +.LL18: +#ifndef TRMMKERNEL + LDF [C1 + 0 * SIZE], a1 + FADD c01, c04, c01 + LDF [C1 + 1 * SIZE], a2 + FADD c02, c03, c02 + LDF [C2 + 0 * SIZE], a3 + FADD c05, c08, c05 + LDF [C2 + 1 * SIZE], a4 + FADD c06, c07, c06 + + LDF [C3 + 0 * SIZE], b1 + FADD c09, c12, c09 + LDF [C3 + 1 * SIZE], b2 + FADD c10, c11, c10 + LDF [C4 + 0 * SIZE], b3 + FADD c13, c16, c13 + LDF [C4 + 1 * SIZE], b4 + FADD c14, c15, c14 + + FMADD (alpha_r, cc01, aa1, aa1) + FMADD (alpha_r, cc02, aa2, aa2) + FMADD (alpha_r, cc05, aa3, aa3) + FMADD (alpha_r, cc06, aa4, aa4) + + FMADD (alpha_r, cc09, bb1, bb1) + FMADD (alpha_r, cc10, bb2, bb2) + FMADD (alpha_r, cc13, bb3, bb3) + FMADD (alpha_r, cc14, bb4, bb4) + +#else + FADD c01, c04, c01 + FADD c02, c03, c02 + FADD c05, c08, c05 + FADD c06, c07, c06 + + FADD c09, c12, c09 + FADD c10, c11, c10 + FADD c13, c16, c13 + FADD c14, c15, c14 + + FMUL ALPHA_R, c01, a1 + FMUL ALPHA_R, c02, a2 + FMUL ALPHA_R, c05, a3 + FMUL ALPHA_R, c06, a4 + + FMUL ALPHA_R, c09, b1 + FMUL ALPHA_R, c10, b2 + FMUL ALPHA_R, c13, b3 + FMUL ALPHA_R, c14, b4 +#endif + + FNMSUB (alpha_i, cc02, aa1, aa1) + FMADD (alpha_i, cc01, aa2, aa2) + FNMSUB (alpha_i, cc06, aa3, aa3) + FMADD (alpha_i, cc05, aa4, aa4) + + FNMSUB (alpha_i, cc10, bb1, bb1) + STF a1, [C1 + 0 * SIZE] + FMADD (alpha_i, cc09, bb2, bb2) + STF a2, [C1 + 1 * SIZE] + FNMSUB (alpha_i, cc14, bb3, bb3) + STF a3, [C2 + 0 * SIZE] + FMADD (alpha_i, cc13, bb4, bb4) + STF a4, [C2 + 1 * SIZE] + + STF b1, [C3 + 0 * SIZE] + add C1, 2 * SIZE, C1 + STF b2, [C3 + 1 * SIZE] + add C2, 2 * SIZE, C2 + STF b3, [C4 + 0 * SIZE] + add C3, 2 * SIZE, C3 + STF b4, [C4 + 1 * SIZE] + add C4, 2 * SIZE, C4 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -1, TEMP1 +#else + add TEMP1, -4, TEMP1 +#endif + sll TEMP1, ZBASE_SHIFT + 0, TEMP2 + sll TEMP1, ZBASE_SHIFT + 2, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 1, KK +#endif +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL12 + nop + +#if defined(TRMMKERNEL) && !defined(LEFT) + add KK, 4, KK +#endif + + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + mov BO, B + .align 4 + +.LL20: + and N, 2, J + cmp J, 0 + ble,pn %icc, .LL30 + mov C, C1 + + add C, LDC, C2 + add C2, LDC, C + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + mov M, I + mov A, AO + .align 4 + +.LL22: +#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) + mov B, BO +#else + sll KK, ZBASE_SHIFT + 0, TEMP1 + sll KK, ZBASE_SHIFT + 1, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + FCLR (cc01) + + LDF [BO + 5 * SIZE], b6 + FCLR (cc02) + LDF [BO + 6 * SIZE], b7 + FCLR (cc03) + LDF [BO + 7 * SIZE], b8 + FCLR (cc04) + LDF [BO + 8 * SIZE], b9 + FCLR (cc05) + + prefetch [C1 + 2 * SIZE], 3 + FCLR (cc06) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc07) + +#ifndef TRMMKERNEL + sra K, 2, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 2, L +#endif + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL25 + FCLR (cc08) + .align 4 + +.LL23: + FMADD1 (aa1, bb1, cc01, cc01) + LDF [AO + 2 * SIZE], a3 + FMADD2 (aa2, bb1, cc02, cc02) + LDF [AO + 3 * SIZE], a4 + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [BO + 16 * SIZE], b1 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [BO + 9 * SIZE], b2 + + FMADD1 (aa1, bb3, cc05, cc05) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD2 (aa2, bb3, cc06, cc06) + add L, -1, L + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD1 (aa3, bb5, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + FMADD2 (aa4, bb5, cc02, cc02) + LDF [AO + 5 * SIZE], a2 + + FMADD3 (aa3, bb6, cc03, cc03) + LDF [BO + 12 * SIZE], b5 + FMADD4 (aa4, bb6, cc04, cc04) + LDF [BO + 13 * SIZE], b6 + + FMADD1 (aa3, bb7, cc05, cc05) + cmp L, 0 + FMADD2 (aa4, bb7, cc06, cc06) + add AO, 8 * SIZE, AO + + FMADD3 (aa3, bb8, cc07, cc07) + LDF [BO + 14 * SIZE], b7 + FMADD4 (aa4, bb8, cc08, cc08) + LDF [BO + 15 * SIZE], b8 + + FMADD1 (aa1, bb9, cc01, cc01) + LDF [AO - 2 * SIZE], a3 + FMADD2 (aa2, bb9, cc02, cc02) + LDF [AO - 1 * SIZE], a4 + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [BO + 24 * SIZE], b9 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [BO + 17 * SIZE], b2 + + FMADD1 (aa1, bb3, cc05, cc05) + add BO, 16 * SIZE, BO + FMADD2 (aa2, bb3, cc06, cc06) + nop + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD1 (aa3, bb5, cc01, cc01) + LDF [AO + 0 * SIZE], a1 + FMADD2 (aa4, bb5, cc02, cc02) + LDF [AO + 1 * SIZE], a2 + FMADD3 (aa3, bb6, cc03, cc03) + LDF [BO + 4 * SIZE], b5 + FMADD4 (aa4, bb6, cc04, cc04) + LDF [BO + 5 * SIZE], b6 + + FMADD1 (aa3, bb7, cc05, cc05) + nop + FMADD2 (aa4, bb7, cc06, cc06) + LDF [BO + 6 * SIZE], b7 + + FMADD3 (aa3, bb8, cc07, cc07) + FMADD4 (aa4, bb8, cc08, cc08) + bg,pt %icc, .LL23 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL25: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 2, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL28 + nop + .align 4 + +.LL27: + FMADD1 (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD2 (aa2, bb1, cc02, cc02) + LDF [BO + 4 * SIZE], b1 + + FMADD3 (aa1, bb2, cc03, cc03) + add AO, 2 * SIZE, AO + FMADD4 (aa2, bb2, cc04, cc04) + LDF [BO + 5 * SIZE], b2 + + FMADD1 (aa1, bb3, cc05, cc05) + cmp L, 0 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 6 * SIZE], b3 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [AO + 0 * SIZE], a1 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 7 * SIZE], b4 + bg,pt %icc, .LL27 + add BO, 4 * SIZE, BO + .align 4 + +.LL28: +#ifndef TRMMKERNEL + LDF [C1 + 0 * SIZE], a1 + FADD c01, c04, c01 + LDF [C1 + 1 * SIZE], a2 + FADD c02, c03, c02 + LDF [C2 + 0 * SIZE], a3 + FADD c05, c08, c05 + LDF [C2 + 1 * SIZE], a4 + FADD c06, c07, c06 + + FMADD (alpha_r, cc01, aa1, aa1) + FMADD (alpha_r, cc02, aa2, aa2) + FMADD (alpha_r, cc05, aa3, aa3) + FMADD (alpha_r, cc06, aa4, aa4) +#else + FADD c01, c04, c01 + FADD c02, c03, c02 + FADD c05, c08, c05 + FADD c06, c07, c06 + + FMUL ALPHA_R, c01, a1 + FMUL ALPHA_R, c02, a2 + FMUL ALPHA_R, c05, a3 + FMUL ALPHA_R, c06, a4 +#endif + + FNMSUB (alpha_i, cc02, aa1, aa1) + FMADD (alpha_i, cc01, aa2, aa2) + FNMSUB (alpha_i, cc06, aa3, aa3) + FMADD (alpha_i, cc05, aa4, aa4) + + STF a1, [C1 + 0 * SIZE] + add I, -1, I + STF a2, [C1 + 1 * SIZE] + cmp I, 0 + STF a3, [C2 + 0 * SIZE] + add C1, 2 * SIZE, C1 + STF a4, [C2 + 1 * SIZE] + add C2, 2 * SIZE, C2 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -1, TEMP1 +#else + add TEMP1, -2, TEMP1 +#endif + sll TEMP1, ZBASE_SHIFT + 0, TEMP2 + sll TEMP1, ZBASE_SHIFT + 1, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 1, KK +#endif +#endif + + bg,pt %icc, .LL22 + nop + +#if defined(TRMMKERNEL) && !defined(LEFT) + add KK, 2, KK +#endif + + mov BO, B + .align 4 + +.LL30: + and N, 1, J + cmp J, 0 + ble,pn %icc, .LL999 + mov C, C1 + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + mov M, I + mov A, AO + .align 4 + +.LL32: +#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) + mov B, BO +#else + sll KK, ZBASE_SHIFT + 0, TEMP1 + sll KK, ZBASE_SHIFT + 0, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + FCLR (cc01) + LDF [BO + 3 * SIZE], b4 + FCLR (cc02) + + LDF [BO + 4 * SIZE], b5 + FCLR (cc03) + LDF [BO + 5 * SIZE], b6 + FCLR (cc04) + LDF [BO + 6 * SIZE], b7 + FCLR (cc05) + LDF [BO + 7 * SIZE], b8 + FCLR (cc06) + + prefetch [C1 + 2 * SIZE], 3 + FCLR (cc07) + +#ifndef TRMMKERNEL + sra K, 2, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 1, L +#endif + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL35 + FCLR (cc08) + .align 4 + +.LL33: + FMADD1 (aa1, bb1, cc01, cc01) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD2 (aa2, bb1, cc02, cc02) + LDF [BO + 8 * SIZE], b1 + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [AO + 4 * SIZE], a1 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [AO + 5 * SIZE], a2 + + FMADD1 (aa3, bb3, cc01, cc01) + LDF [BO + 9 * SIZE], b2 + FMADD2 (aa4, bb3, cc02, cc02) + LDF [BO + 10 * SIZE], b3 + + FMADD3 (aa3, bb4, cc03, cc03) + LDF [AO + 6 * SIZE], a3 + FMADD4 (aa4, bb4, cc04, cc04) + LDF [AO + 7 * SIZE], a4 + + FMADD1 (aa1, bb5, cc01, cc01) + LDF [BO + 11 * SIZE], b4 + FMADD2 (aa2, bb5, cc02, cc02) + LDF [BO + 12 * SIZE], b5 + + FMADD3 (aa1, bb6, cc03, cc03) + LDF [AO + 8 * SIZE], a1 + FMADD4 (aa2, bb6, cc04, cc04) + LDF [AO + 9 * SIZE], a2 + + FMADD1 (aa3, bb7, cc01, cc01) + LDF [BO + 13 * SIZE], b6 + + FMADD2 (aa4, bb7, cc02, cc02) + LDF [BO + 14 * SIZE], b7 + + FMADD3 (aa3, bb8, cc03, cc03) + LDF [AO + 10 * SIZE], a3 + FMADD4 (aa4, bb8, cc04, cc04) + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + add L, -1, L + add BO, 8 * SIZE, BO + cmp L, 0 + + bg,pt %icc, .LL33 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL35: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 1, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL38 + nop + .align 4 + +.LL37: + FMADD1 (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD2 (aa2, bb1, cc02, cc02) + LDF [BO + 2 * SIZE], b1 + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [AO + 2 * SIZE], a1 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [AO + 3 * SIZE], a2 + + add AO, 2 * SIZE, AO + cmp L, 0 + add BO, 2 * SIZE, BO + bg,pt %icc, .LL37 + LDF [BO + 1 * SIZE], b2 + .align 4 + +.LL38: +#ifndef TRMMKERNEL + LDF [C1 + 0 * SIZE], a1 + FADD c01, c04, c01 + LDF [C1 + 1 * SIZE], a2 + FADD c02, c03, c02 + + FMADD (alpha_r, cc01, aa1, aa1) + FMADD (alpha_r, cc02, aa2, aa2) +#else + FADD c01, c04, c01 + FADD c02, c03, c02 + + FMUL ALPHA_R, c01, a1 + FMUL ALPHA_R, c02, a2 +#endif + + FNMSUB (alpha_i, cc02, aa1, aa1) + FMADD (alpha_i, cc01, aa2, aa2) + + STF a1, [C1 + 0 * SIZE] + STF a2, [C1 + 1 * SIZE] + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -1, TEMP1 +#else + add TEMP1, -1, TEMP1 +#endif + sll TEMP1, ZBASE_SHIFT + 0, TEMP2 + sll TEMP1, ZBASE_SHIFT + 0, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 1, KK +#endif +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL32 + add C1, 2 * SIZE, C1 + .align 4 + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/zgemm_ncopy.S b/kernel/sparc/zgemm_ncopy.S new file mode 100644 index 0000000..2b0c398 --- /dev/null +++ b/kernel/sparc/zgemm_ncopy.S @@ -0,0 +1,250 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 +#define A %i2 +#define LDA %i3 +#define B %i4 + +#define A1 %l0 +#define A2 %l1 + +#define I %l4 +#define J %l5 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 +#endif + + PROLOGUE + SAVESP + + sra N, 1, J + cmp J, 0 + ble,pn %icc, .LL100 + sll LDA, ZBASE_SHIFT, LDA + +.LL11: + add A, LDA, A2 + mov A, A1 + sra M, 2, I + cmp I, 0 + + ble,pn %icc, .LL15 + add A2, LDA, A + +#define PREFETCHSIZE 36 +#define WPREFETCHSIZE 20 + +.LL12: + prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A2 + 0 * SIZE], c03 + LDF [A2 + 1 * SIZE], c04 + + LDF [A1 + 2 * SIZE], c05 + LDF [A1 + 3 * SIZE], c06 + LDF [A2 + 2 * SIZE], c07 + LDF [A2 + 3 * SIZE], c08 + + prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 4 * SIZE], c09 + LDF [A1 + 5 * SIZE], c10 + LDF [A2 + 4 * SIZE], c11 + LDF [A2 + 5 * SIZE], c12 + + LDF [A1 + 6 * SIZE], c13 + LDF [A1 + 7 * SIZE], c14 + LDF [A2 + 6 * SIZE], c15 + LDF [A2 + 7 * SIZE], c16 + + prefetch [B + (WPREFETCHSIZE + 0) * SIZE], 2 + + STF c01, [B + 0 * SIZE] + add A1, 8 * SIZE, A1 + STF c02, [B + 1 * SIZE] + add A2, 8 * SIZE, A2 + STF c03, [B + 2 * SIZE] + add I, -1, I + STF c04, [B + 3 * SIZE] + cmp I, 0 + STF c05, [B + 4 * SIZE] + STF c06, [B + 5 * SIZE] + STF c07, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] +#ifdef DOUBLE + prefetch [B + (WPREFETCHSIZE + 8) * SIZE], 2 +#endif + STF c09, [B + 8 * SIZE] + STF c10, [B + 9 * SIZE] + STF c11, [B + 10 * SIZE] + STF c12, [B + 11 * SIZE] + STF c13, [B + 12 * SIZE] + STF c14, [B + 13 * SIZE] + STF c15, [B + 14 * SIZE] + STF c16, [B + 15 * SIZE] + bg,pt %icc, .LL12 + add B, 16 * SIZE, B + +.LL15: + and M, 3, I + cmp I, 0 + ble,pn %icc, .LL99 + nop + +.LL16: + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + add A1, 2 * SIZE, A1 + LDF [A2 + 0 * SIZE], c03 + LDF [A2 + 1 * SIZE], c04 + add A2, 2 * SIZE, A2 + + STF c01, [B + 0 * SIZE] + add I, -1, I + STF c02, [B + 1 * SIZE] + cmp I, 0 + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + bg,pt %icc, .LL16 + add B, 4 * SIZE, B + +.LL99: + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + +.LL100: + and N, 1, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + +.LL111: + sra M, 2, I + cmp I, 0 + ble,pn %icc, .LL115 + mov A, A1 + + +.LL112: + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A1 + 2 * SIZE], c03 + LDF [A1 + 3 * SIZE], c04 + LDF [A1 + 4 * SIZE], c05 + LDF [A1 + 5 * SIZE], c06 + LDF [A1 + 6 * SIZE], c07 + LDF [A1 + 7 * SIZE], c08 + add A1, 8 * SIZE, A1 + + STF c01, [B + 0 * SIZE] + add I, -1, I + STF c02, [B + 1 * SIZE] + cmp I, 0 + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + STF c05, [B + 4 * SIZE] + STF c06, [B + 5 * SIZE] + STF c07, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] + + bg,pt %icc, .LL112 + add B, 8 * SIZE, B + +.LL115: + and M, 3, I + cmp I, 0 + ble,pn %icc, .LL999 + nop + +.LL116: + LDF [A1 + 0 * SIZE], c01 + add I, -1, I + LDF [A1 + 1 * SIZE], c02 + add A1, 2 * SIZE, A1 + cmp I, 0 + + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + bg,pt %icc, .LL116 + add B, 2 * SIZE, B + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/zgemm_tcopy.S b/kernel/sparc/zgemm_tcopy.S new file mode 100644 index 0000000..5553761 --- /dev/null +++ b/kernel/sparc/zgemm_tcopy.S @@ -0,0 +1,305 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 +#define A %i2 +#define LDA %i3 +#define B %i4 + +#define A1 %l0 +#define A2 %l1 + +#define I %l4 +#define J %l5 + +#define B1 %o0 +#define B2 %o1 +#define M4 %o4 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 +#endif + + PROLOGUE + SAVESP + + sll M, BASE_SHIFT + 2, M4 + + and N, -2, B2 + sll M, ZBASE_SHIFT, B1 + smul B1, B2, B2 + add B, B2, B2 + + sra M, 1, J + cmp J, 0 + ble,pn %icc, .LL100 + sll LDA, ZBASE_SHIFT, LDA + +.LL11: + add A, LDA, A2 + mov A, A1 + sra N, 2, I + cmp I, 0 + + mov B, B1 + add B, 8 * SIZE, B + + ble,pn %icc, .LL15 + add A2, LDA, A + +#define PREFETCHSIZE 16 + +.LL12: + prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A1 + 2 * SIZE], c03 + LDF [A1 + 3 * SIZE], c04 + + LDF [A1 + 4 * SIZE], c05 + LDF [A1 + 5 * SIZE], c06 + LDF [A1 + 6 * SIZE], c07 + LDF [A1 + 7 * SIZE], c08 + + prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A2 + 0 * SIZE], c09 + LDF [A2 + 1 * SIZE], c10 + LDF [A2 + 2 * SIZE], c11 + LDF [A2 + 3 * SIZE], c12 + + LDF [A2 + 4 * SIZE], c13 + LDF [A2 + 5 * SIZE], c14 + LDF [A2 + 6 * SIZE], c15 + LDF [A2 + 7 * SIZE], c16 + + prefetch [B1 + (PREFETCHSIZE + 0) * SIZE], 2 + + STF c01, [B1 + 0 * SIZE] + add A1, 8 * SIZE, A1 + STF c02, [B1 + 1 * SIZE] + add A2, 8 * SIZE, A2 + STF c03, [B1 + 2 * SIZE] + STF c04, [B1 + 3 * SIZE] + STF c09, [B1 + 4 * SIZE] + add I, -1, I + STF c10, [B1 + 5 * SIZE] + cmp I, 0 + STF c11, [B1 + 6 * SIZE] + STF c12, [B1 + 7 * SIZE] + add B1, M4, B1 + +#ifdef DOUBLE + prefetch [B1 + (PREFETCHSIZE + 8) * SIZE], 2 +#endif + STF c05, [B1 + 0 * SIZE] + STF c06, [B1 + 1 * SIZE] + STF c07, [B1 + 2 * SIZE] + STF c08, [B1 + 3 * SIZE] + STF c13, [B1 + 4 * SIZE] + STF c14, [B1 + 5 * SIZE] + STF c15, [B1 + 6 * SIZE] + STF c16, [B1 + 7 * SIZE] + bg,pt %icc, .LL12 + add B1, M4, B1 + +.LL15: + and N, 2, I + cmp I, 0 + ble,pn %icc, .LL17 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A1 + 2 * SIZE], c03 + LDF [A1 + 3 * SIZE], c04 + + LDF [A2 + 0 * SIZE], c05 + LDF [A2 + 1 * SIZE], c06 + LDF [A2 + 2 * SIZE], c07 + LDF [A2 + 3 * SIZE], c08 + + STF c01, [B1 + 0 * SIZE] + add A1, 4 * SIZE, A1 + STF c02, [B1 + 1 * SIZE] + add A2, 4 * SIZE, A2 + STF c03, [B1 + 2 * SIZE] + STF c04, [B1 + 3 * SIZE] + STF c05, [B1 + 4 * SIZE] + STF c06, [B1 + 5 * SIZE] + STF c07, [B1 + 6 * SIZE] + STF c08, [B1 + 7 * SIZE] + add B1, M4, B1 + +.LL17: + and N, 1, I + cmp I, 0 + ble,pn %icc, .LL99 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A2 + 0 * SIZE], c03 + LDF [A2 + 1 * SIZE], c04 + + STF c01, [B2 + 0 * SIZE] + STF c02, [B2 + 1 * SIZE] + STF c03, [B2 + 2 * SIZE] + STF c04, [B2 + 3 * SIZE] + add B2, 4 * SIZE, B2 + +.LL99: + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + +.LL100: + and M, 1, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + +.LL111: + sra N, 2, I + cmp I, 0 + mov A, A1 + + ble,pn %icc, .LL115 + mov B, B1 + +.LL112: + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A1 + 2 * SIZE], c03 + LDF [A1 + 3 * SIZE], c04 + + LDF [A1 + 4 * SIZE], c05 + LDF [A1 + 5 * SIZE], c06 + LDF [A1 + 6 * SIZE], c07 + LDF [A1 + 7 * SIZE], c08 + + STF c01, [B1 + 0 * SIZE] + add A1, 8 * SIZE, A1 + STF c02, [B1 + 1 * SIZE] + add I, -1, I + STF c03, [B1 + 2 * SIZE] + cmp I, 0 + STF c04, [B1 + 3 * SIZE] + add B1, M4, B1 + + STF c05, [B1 + 0 * SIZE] + STF c06, [B1 + 1 * SIZE] + STF c07, [B1 + 2 * SIZE] + STF c08, [B1 + 3 * SIZE] + + bg,pt %icc, .LL112 + add B1, M4, B1 + +.LL115: + and N, 2, I + cmp I, 0 + ble,pn %icc, .LL117 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A1 + 2 * SIZE], c03 + LDF [A1 + 3 * SIZE], c04 + + STF c01, [B1 + 0 * SIZE] + add A1, 4 * SIZE, A1 + STF c02, [B1 + 1 * SIZE] + add I, -1, I + STF c03, [B1 + 2 * SIZE] + cmp I, 0 + STF c04, [B1 + 3 * SIZE] + add B1, M4, B1 + +.LL117: + and N, 1, I + cmp I, 0 + ble,pn %icc, .LL999 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + + STF c01, [B2 + 0 * SIZE] + STF c02, [B2 + 1 * SIZE] + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/zgemv_n.S b/kernel/sparc/zgemv_n.S new file mode 100644 index 0000000..46ff438 --- /dev/null +++ b/kernel/sparc/zgemv_n.S @@ -0,0 +1,1176 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef DOUBLE +#define PREFETCHSIZE 44 +#else +#define PREFETCHSIZE 88 +#endif + +#define M %i0 +#define N %i1 +#define A %i5 +#define LDA %i2 +#define X %i3 +#define INCX %i4 + +#define Y %l0 +#define INCY %l1 +#define BUFFER %l2 + +#define I %l3 +#define J %l5 + +#define A1 %o0 +#define A2 %o1 +#define A3 %o2 +#define A4 %o3 + +#define Y1 %l4 +#define YY %l6 + +#ifdef DOUBLE +#define t1 %f0 +#define t2 %f2 +#define t3 %f4 +#define t4 %f6 + +#define y1 %f8 +#define y2 %f10 +#define y3 %f12 +#define y4 %f14 +#define y5 %f16 +#define y6 %f18 +#define y7 %f20 +#define y8 %f22 + +#define a1 %f24 +#define a2 %f26 +#define a3 %f28 +#define a4 %f30 +#define a5 %f32 +#define a6 %f34 +#define a7 %f36 +#define a8 %f38 + +#define a9 %f40 +#define a10 %f42 +#define a11 %f44 +#define a12 %f46 +#define a13 %f48 +#define a14 %f50 +#define a15 %f52 +#define a16 %f54 + +#define x1 %f56 +#define x2 %f58 +#define x3 %f60 +#define x4 %f62 + +#define FZERO %f50 +#define ALPHA_R %f52 +#define ALPHA_I %f54 +#else +#define t1 %f0 +#define t2 %f1 +#define t3 %f2 +#define t4 %f3 + +#define y1 %f4 +#define y2 %f5 +#define y3 %f6 +#define y4 %f7 +#define y5 %f8 +#define y6 %f9 +#define y7 %f10 +#define y8 %f11 + +#define a1 %f12 +#define a2 %f13 +#define a3 %f14 +#define a4 %f15 +#define a5 %f16 +#define a6 %f17 +#define a7 %f18 +#define a8 %f19 + +#define a9 %f20 +#define a10 %f21 +#define a11 %f22 +#define a12 %f23 +#define a13 %f24 +#define a14 %f25 +#define a15 %f26 +#define a16 %f27 + +#define x1 %f28 +#define x2 %f29 +#define x3 %f30 +#define x4 %f31 + +#define FZERO %f25 +#define ALPHA_R %f26 +#define ALPHA_I %f27 +#endif + +#ifndef __64BIT__ +#define STACK_ALPHA_R [%sp + STACK_START + 16] +#ifndef DOUBLE +#define STACK_ALPHA_I [%sp + STACK_START + 20] +#else +#define STACK_ALPHA_I [%sp + STACK_START + 24] +#endif +#else +#define STACK_ALPHA_R [%sp + STACK_START + 32] +#define STACK_ALPHA_I [%sp + STACK_START + 40] +#endif + +#ifndef CONJ +#define FSUBX FSUB +#define FADDX FADD +#else +#define FSUBX FADD +#define FADDX FSUB +#endif + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE + st %i3, [%sp + STACK_START + 16] /* ALPHA_R */ + st %i4, [%sp + STACK_START + 20] + st %i5, [%sp + STACK_START + 24] /* ALPHA_I */ + + ld [%sp + STACK_START + 32], A + ld [%sp + STACK_START + 36], LDA + ld [%sp + STACK_START + 40], X + ld [%sp + STACK_START + 44], INCX + ld [%sp + STACK_START + 48], Y + ld [%sp + STACK_START + 52], INCY + ld [%sp + STACK_START + 56], BUFFER +#else + st %i3, [%sp + STACK_START + 16] /* ALPHA_R */ + st %i4, [%sp + STACK_START + 20] /* ALPHA_I */ + + ld [%sp + STACK_START + 28], LDA + ld [%sp + STACK_START + 32], X + ld [%sp + STACK_START + 36], INCX + ld [%sp + STACK_START + 40], Y + ld [%sp + STACK_START + 44], INCY + ld [%sp + STACK_START + 48], BUFFER +#endif +#else + ldx [%sp + STACK_START + 56], LDA + ldx [%sp + STACK_START + 64], X + ldx [%sp + STACK_START + 72], INCX + ldx [%sp + STACK_START + 80], Y + ldx [%sp + STACK_START + 88], INCY + ldx [%sp + STACK_START + 96], BUFFER + +#ifdef DOUBLE + std %f6, STACK_ALPHA_R + std %f8, STACK_ALPHA_I +#else + st %f7, STACK_ALPHA_R + st %f9, STACK_ALPHA_I +#endif +#endif + + sll LDA, ZBASE_SHIFT, LDA + + cmp M, 0 + ble %icc, .LL999 + sll INCX, ZBASE_SHIFT, INCX + + cmp N, 0 + ble %icc, .LL999 + sll INCY, ZBASE_SHIFT, INCY + + cmp INCY, 2 * SIZE + be %icc, .LL20 + mov Y, YY + +#ifdef DOUBLE + FCLR(19) +#else + FCLR(25) +#endif + + add M, 3, J + sra J, 2, J + mov BUFFER, YY + mov BUFFER, Y1 + +.LL01: + STF FZERO, [Y1 + 0 * SIZE] + nop + STF FZERO, [Y1 + 1 * SIZE] + STF FZERO, [Y1 + 2 * SIZE] + STF FZERO, [Y1 + 3 * SIZE] + STF FZERO, [Y1 + 4 * SIZE] + nop + STF FZERO, [Y1 + 5 * SIZE] + deccc J + STF FZERO, [Y1 + 6 * SIZE] + nop + STF FZERO, [Y1 + 7 * SIZE] + bg,pn %icc, .LL01 + add Y1, 8 * SIZE, Y1 + +.LL20: + sra N, 1, J + cmp J, 0 + ble,pn %icc, .LL30 + nop + +.LL21: + mov YY, Y1 + mov A, A1 + LDF STACK_ALPHA_R, ALPHA_R + LDF STACK_ALPHA_I, ALPHA_I + + add A, LDA, A2 + add A2, LDA, A + + LDF [X + 0 * SIZE], x1 + LDF [X + 1 * SIZE], x2 + add X, INCX, X + LDF [X + 0 * SIZE], x3 + LDF [X + 1 * SIZE], x4 + add X, INCX, X + + FMUL ALPHA_R, x1, a1 + FMUL ALPHA_I, x2, a4 + FMUL ALPHA_I, x1, a2 + FMUL ALPHA_R, x2, a3 + + FMUL ALPHA_R, x3, a5 + FMUL ALPHA_I, x4, a8 + FMUL ALPHA_I, x3, a6 + FMUL ALPHA_R, x4, a7 + +#ifndef XCONJ + FSUB a1, a4, x1 + FADD a2, a3, x2 + FSUB a5, a8, x3 + FADD a6, a7, x4 +#else + FADD a1, a4, x1 + FSUB a2, a3, x2 + FADD a5, a8, x3 + FSUB a6, a7, x4 +#endif + + sra M, 2, I + cmp I, 0 + ble,pn %icc, .LL27 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [A1 + 2 * SIZE], a3 + LDF [A1 + 3 * SIZE], a4 + + LDF [A1 + 4 * SIZE], a9 + LDF [A1 + 5 * SIZE], a10 + LDF [A1 + 6 * SIZE], a11 + LDF [A1 + 7 * SIZE], a12 + + LDF [A2 + 0 * SIZE], a5 + LDF [A2 + 1 * SIZE], a6 + LDF [A2 + 2 * SIZE], a7 + LDF [A2 + 3 * SIZE], a8 + + LDF [A2 + 4 * SIZE], a13 + LDF [A2 + 5 * SIZE], a14 + LDF [A2 + 6 * SIZE], a15 + LDF [A2 + 7 * SIZE], a16 + + LDF [Y1 + 0 * SIZE], y1 + LDF [Y1 + 1 * SIZE], y2 + LDF [Y1 + 2 * SIZE], y3 + + + FMUL a1, x1, t1 + deccc I + FMUL a1, x2, t2 + LDF [A1 + 8 * SIZE], a1 + + FMUL a3, x1, t3 + FMUL a3, x2, t4 + ble,pn %icc, .LL26 + LDF [A1 + 10 * SIZE], a3 + + FADD y1, t1, y1 + LDF [Y1 + 3 * SIZE], y4 + FMUL a2, x2, t1 + + FADD y2, t2, y2 + FMUL a2, x1, t2 + LDF [A1 + 9 * SIZE], a2 + + FADD y3, t3, y3 + LDF [Y1 + 4 * SIZE], y5 + FMUL a4, x2, t3 + + FADD y4, t4, y4 + FMUL a4, x1, t4 + LDF [A1 + 11 * SIZE], a4 + + FSUBX y1, t1, y1 + LDF [Y1 + 5 * SIZE], y6 + FMUL a5, x3, t1 + + FADDX y2, t2, y2 + FMUL a5, x4, t2 + LDF [A2 + 8 * SIZE], a5 + + FSUBX y3, t3, y3 + LDF [Y1 + 6 * SIZE], y7 + FMUL a7, x3, t3 + + FADDX y4, t4, y4 + FMUL a7, x4, t4 + LDF [A2 + 10 * SIZE], a7 + + FADD y1, t1, y1 + LDF [Y1 + 7 * SIZE], y8 + FMUL a6, x4, t1 + + FADD y2, t2, y2 + FMUL a6, x3, t2 + LDF [A2 + 9 * SIZE], a6 + + FADD y3, t3, y3 + FMUL a8, x4, t3 + + FADD y4, t4, y4 + FMUL a8, x3, t4 + LDF [A2 + 11 * SIZE], a8 + + FSUBX y1, t1, y1 + FMUL a9, x1, t1 + + FADDX y2, t2, y2 + FMUL a9, x2, t2 + LDF [A1 + 12 * SIZE], a9 + + FSUBX y3, t3, y3 + deccc I + FMUL a11, x1, t3 + + FADDX y4, t4, y4 + FMUL a11, x2, t4 + ble,pn %icc, .LL23 + LDF [A1 + 14 * SIZE], a11 + +.LL22: + FADD y5, t1, y5 + prefetch [A1 + PREFETCHSIZE * SIZE], 1 + FMUL a10, x2, t1 + LDF [Y1 + 7 * SIZE], y8 + + FADD y6, t2, y6 + FMUL a10, x1, t2 + LDF [A1 + 13 * SIZE], a10 + + FADD y7, t3, y7 + FMUL a12, x2, t3 + STF y1, [Y1 + 0 * SIZE] + + FADD y8, t4, y8 + FMUL a12, x1, t4 + LDF [A1 + 15 * SIZE], a12 + + FSUBX y5, t1, y5 + FMUL a13, x3, t1 + STF y2, [Y1 + 1 * SIZE] + + FADDX y6, t2, y6 + FMUL a13, x4, t2 + LDF [A2 + 12 * SIZE], a13 + + FSUBX y7, t3, y7 + FMUL a15, x3, t3 + STF y3, [Y1 + 2 * SIZE] + + FADDX y8, t4, y8 + FMUL a15, x4, t4 + LDF [A2 + 14 * SIZE], a15 + + FADD y5, t1, y5 + FMUL a14, x4, t1 + STF y4, [Y1 + 3 * SIZE] + + FADD y6, t2, y6 + FMUL a14, x3, t2 + LDF [A2 + 13 * SIZE], a14 + + FADD y7, t3, y7 + FMUL a16, x4, t3 + LDF [Y1 + 8 * SIZE], y1 + + FADD y8, t4, y8 + FMUL a16, x3, t4 + LDF [A2 + 15 * SIZE], a16 + + FSUBX y5, t1, y5 + FMUL a1, x1, t1 + LDF [Y1 + 9 * SIZE], y2 + + FADDX y6, t2, y6 + FMUL a1, x2, t2 + LDF [A1 + 16 * SIZE], a1 + + FSUBX y7, t3, y7 + FMUL a3, x1, t3 + LDF [Y1 + 10 * SIZE], y3 + + FADDX y8, t4, y8 + FMUL a3, x2, t4 + LDF [A1 + 18 * SIZE], a3 + + FADD y1, t1, y1 + prefetch [A2 + PREFETCHSIZE * SIZE], 1 + FMUL a2, x2, t1 + LDF [Y1 + 11 * SIZE], y4 + + FADD y2, t2, y2 + FMUL a2, x1, t2 + LDF [A1 + 17 * SIZE], a2 + + FADD y3, t3, y3 + FMUL a4, x2, t3 + STF y5, [Y1 + 4 * SIZE] + + FADD y4, t4, y4 + FMUL a4, x1, t4 + LDF [A1 + 19 * SIZE], a4 + + FSUBX y1, t1, y1 + FMUL a5, x3, t1 + STF y6, [Y1 + 5 * SIZE] + + FADDX y2, t2, y2 + FMUL a5, x4, t2 + LDF [A2 + 16 * SIZE], a5 + + FSUBX y3, t3, y3 + FMUL a7, x3, t3 + STF y7, [Y1 + 6 * SIZE] + + FADDX y4, t4, y4 + deccc I + FMUL a7, x4, t4 + LDF [A2 + 18 * SIZE], a7 + + FADD y1, t1, y1 + FMUL a6, x4, t1 + STF y8, [Y1 + 7 * SIZE] + + FADD y2, t2, y2 + FMUL a6, x3, t2 + LDF [A2 + 17 * SIZE], a6 + + FADD y3, t3, y3 + add A1, 8 * SIZE, A1 + FMUL a8, x4, t3 + LDF [Y1 + 12 * SIZE], y5 + + FADD y4, t4, y4 + FMUL a8, x3, t4 + LDF [A2 + 19 * SIZE], a8 + + FSUBX y1, t1, y1 + add A2, 8 * SIZE, A2 + FMUL a9, x1, t1 + LDF [Y1 + 13 * SIZE], y6 + + FADDX y2, t2, y2 + add Y1, 8 * SIZE, Y1 + FMUL a9, x2, t2 + LDF [A1 + 12 * SIZE], a9 + + FSUBX y3, t3, y3 + FMUL a11, x1, t3 + LDF [Y1 + 6 * SIZE], y7 + + FADDX y4, t4, y4 + FMUL a11, x2, t4 + bg,pn %icc, .LL22 + LDF [A1 + 14 * SIZE], a11 + +.LL23: + FADD y5, t1, y5 + FMUL a10, x2, t1 + LDF [Y1 + 7 * SIZE], y8 + + FADD y6, t2, y6 + FMUL a10, x1, t2 + LDF [A1 + 13 * SIZE], a10 + + FADD y7, t3, y7 + FMUL a12, x2, t3 + STF y1, [Y1 + 0 * SIZE] + + FADD y8, t4, y8 + FMUL a12, x1, t4 + LDF [A1 + 15 * SIZE], a12 + + FSUBX y5, t1, y5 + FMUL a13, x3, t1 + STF y2, [Y1 + 1 * SIZE] + + FADDX y6, t2, y6 + FMUL a13, x4, t2 + LDF [A2 + 12 * SIZE], a13 + + FSUBX y7, t3, y7 + FMUL a15, x3, t3 + STF y3, [Y1 + 2 * SIZE] + FADDX y8, t4, y8 + FMUL a15, x4, t4 + LDF [A2 + 14 * SIZE], a15 + + FADD y5, t1, y5 + FMUL a14, x4, t1 + STF y4, [Y1 + 3 * SIZE] + FADD y6, t2, y6 + FMUL a14, x3, t2 + LDF [A2 + 13 * SIZE], a14 + + FADD y7, t3, y7 + FMUL a16, x4, t3 + LDF [Y1 + 8 * SIZE], y1 + FADD y8, t4, y8 + FMUL a16, x3, t4 + LDF [A2 + 15 * SIZE], a16 + + FSUBX y5, t1, y5 + add A1, 8 * SIZE, A1 + FMUL a1, x1, t1 + LDF [Y1 + 9 * SIZE], y2 + + FADDX y6, t2, y6 + add A2, 8 * SIZE, A2 + FMUL a1, x2, t2 + LDF [A1 + 8 * SIZE], a1 + + FSUBX y7, t3, y7 + FMUL a3, x1, t3 + LDF [Y1 + 10 * SIZE], y3 + + FADDX y8, t4, y8 + add Y1, 8 * SIZE, Y1 + FMUL a3, x2, t4 + LDF [A1 + 10 * SIZE], a3 + + STF y5, [Y1 - 4 * SIZE] + STF y6, [Y1 - 3 * SIZE] + STF y7, [Y1 - 2 * SIZE] + STF y8, [Y1 - 1 * SIZE] + +.LL26: + FADD y1, t1, y1 + LDF [Y1 + 3 * SIZE], y4 + FMUL a2, x2, t1 + FADD y2, t2, y2 + FMUL a2, x1, t2 + + FADD y3, t3, y3 + LDF [Y1 + 4 * SIZE], y5 + FMUL a4, x2, t3 + FADD y4, t4, y4 + FMUL a4, x1, t4 + + FSUBX y1, t1, y1 + LDF [Y1 + 5 * SIZE], y6 + FMUL a5, x3, t1 + FADDX y2, t2, y2 + FMUL a5, x4, t2 + + FSUBX y3, t3, y3 + LDF [Y1 + 6 * SIZE], y7 + FADDX y4, t4, y4 + FMUL a7, x4, t4 + + FADD y1, t1, y1 + LDF [Y1 + 7 * SIZE], y8 + FMUL a7, x3, t3 + FMUL a6, x4, t1 + FADD y2, t2, y2 + FMUL a6, x3, t2 + + FADD y3, t3, y3 + FMUL a8, x4, t3 + FADD y4, t4, y4 + FMUL a8, x3, t4 + + FSUBX y1, t1, y1 + FMUL a9, x1, t1 + FADDX y2, t2, y2 + FMUL a9, x2, t2 + + FSUBX y3, t3, y3 + FMUL a11, x1, t3 + FADDX y4, t4, y4 + FMUL a11, x2, t4 + + FADD y5, t1, y5 + FMUL a10, x2, t1 + FADD y6, t2, y6 + FMUL a10, x1, t2 + + FADD y7, t3, y7 + FMUL a12, x2, t3 + FADD y8, t4, y8 + FMUL a12, x1, t4 + + FSUBX y5, t1, y5 + FMUL a13, x3, t1 + FADDX y6, t2, y6 + FMUL a13, x4, t2 + + FSUBX y7, t3, y7 + FMUL a15, x3, t3 + FADDX y8, t4, y8 + FMUL a15, x4, t4 + + FADD y5, t1, y5 + FMUL a14, x4, t1 + FADD y6, t2, y6 + FMUL a14, x3, t2 + + FADD y7, t3, y7 + FMUL a16, x4, t3 + FADD y8, t4, y8 + FMUL a16, x3, t4 + + STF y1, [Y1 + 0 * SIZE] + FSUBX y5, t1, y5 + STF y2, [Y1 + 1 * SIZE] + FADDX y6, t2, y6 + STF y3, [Y1 + 2 * SIZE] + FSUBX y7, t3, y7 + STF y4, [Y1 + 3 * SIZE] + FADDX y8, t4, y8 + + STF y5, [Y1 + 4 * SIZE] + add A1, 8 * SIZE, A1 + STF y6, [Y1 + 5 * SIZE] + add A2, 8 * SIZE, A2 + STF y7, [Y1 + 6 * SIZE] + STF y8, [Y1 + 7 * SIZE] + add Y1, 8 * SIZE, Y1 + +.LL27: + andcc M, 2, I + ble,pn %icc, .LL28 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [A1 + 2 * SIZE], a3 + LDF [A1 + 3 * SIZE], a4 + + LDF [Y1 + 0 * SIZE], y1 + LDF [Y1 + 1 * SIZE], y2 + LDF [Y1 + 2 * SIZE], y3 + LDF [Y1 + 3 * SIZE], y4 + + FMUL a1, x1, t1 + LDF [A2 + 0 * SIZE], a5 + FMUL a1, x2, t2 + LDF [A2 + 1 * SIZE], a6 + FMUL a3, x1, t3 + LDF [A2 + 2 * SIZE], a7 + FMUL a3, x2, t4 + LDF [A2 + 3 * SIZE], a8 + + FADD y1, t1, y1 + FMUL a2, x2, t1 + FADD y2, t2, y2 + FMUL a2, x1, t2 + + FADD y3, t3, y3 + FMUL a4, x2, t3 + FADD y4, t4, y4 + FMUL a4, x1, t4 + + FSUBX y1, t1, y1 + FMUL a5, x3, t1 + FADDX y2, t2, y2 + FMUL a5, x4, t2 + + FSUBX y3, t3, y3 + FMUL a7, x3, t3 + FADDX y4, t4, y4 + FMUL a7, x4, t4 + + FADD y1, t1, y1 + FMUL a6, x4, t1 + FADD y2, t2, y2 + FMUL a6, x3, t2 + + FADD y3, t3, y3 + FMUL a8, x4, t3 + FADD y4, t4, y4 + FMUL a8, x3, t4 + + FSUBX y1, t1, y1 + FADDX y2, t2, y2 + FSUBX y3, t3, y3 + FADDX y4, t4, y4 + + STF y1, [Y1 + 0 * SIZE] + add A1, 4 * SIZE, A1 + STF y2, [Y1 + 1 * SIZE] + add A2, 4 * SIZE, A2 + STF y3, [Y1 + 2 * SIZE] + nop + STF y4, [Y1 + 3 * SIZE] + add Y1, 4 * SIZE, Y1 + +.LL28: + andcc M, 1, I + ble,pn %icc, .LL29 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [A2 + 0 * SIZE], a3 + LDF [A2 + 1 * SIZE], a4 + + LDF [Y1 + 0 * SIZE], y1 + LDF [Y1 + 1 * SIZE], y2 + + FMUL a1, x1, t1 + FMUL a1, x2, t2 + FMUL a2, x2, t3 + FMUL a2, x1, t4 + + FADD y1, t1, y1 + FMUL a3, x3, t1 + FADD y2, t2, y2 + FMUL a3, x4, t2 + + FSUBX y1, t3, y1 + FMUL a4, x4, t3 + FADDX y2, t4, y2 + FMUL a4, x3, t4 + + FADD y1, t1, y1 + FADD y2, t2, y2 + FSUBX y1, t3, y1 + FADDX y2, t4, y2 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + +.LL29: + deccc J + bg %icc, .LL21 + nop + + +.LL30: + andcc N, 1, J + ble,pn %icc, .LL990 + nop + +.LL31: + mov YY, Y1 + mov A, A1 + + LDF STACK_ALPHA_R, ALPHA_R + LDF STACK_ALPHA_I, ALPHA_I + + LDF [X + 0 * SIZE], x1 + LDF [X + 1 * SIZE], x2 + + FMUL ALPHA_R, x1, a1 /* AC */ + FMUL ALPHA_I, x1, a2 /* AD */ + FMUL ALPHA_R, x2, a3 /* BC */ + FMUL ALPHA_I, x2, a4 /* BD */ + +#ifndef XCONJ + FSUB a1, a4, x1 + FADD a2, a3, x2 +#else + FADD a1, a4, x1 + FSUB a2, a3, x2 +#endif + + sra M, 2, I + cmp I, 0 + ble,pn %icc, .LL37 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [A1 + 2 * SIZE], a3 + LDF [A1 + 3 * SIZE], a4 + + LDF [A1 + 4 * SIZE], a9 + LDF [A1 + 5 * SIZE], a10 + LDF [A1 + 6 * SIZE], a11 + LDF [A1 + 7 * SIZE], a12 + + LDF [Y1 + 0 * SIZE], y1 + LDF [Y1 + 1 * SIZE], y2 + LDF [Y1 + 2 * SIZE], y3 + LDF [Y1 + 3 * SIZE], y4 + + LDF [Y1 + 4 * SIZE], y5 + LDF [Y1 + 5 * SIZE], y6 + LDF [Y1 + 6 * SIZE], y7 + LDF [Y1 + 7 * SIZE], y8 + + FMUL a1, x1, t1 + deccc I + FMUL a1, x2, t2 + LDF [A1 + 8 * SIZE], a1 + FMUL a3, x1, t3 + FMUL a3, x2, t4 + ble,pn %icc, .LL33 + LDF [A1 + 10 * SIZE], a3 + +.LL32: + FADD y1, t1, y1 + prefetch [A1 + PREFETCHSIZE * SIZE], 1 + FMUL a2, x2, t1 + FADD y2, t2, y2 + FMUL a2, x1, t2 + LDF [A1 + 9 * SIZE], a2 + + FADD y3, t3, y3 + FMUL a4, x2, t3 + FADD y4, t4, y4 + FMUL a4, x1, t4 + LDF [A1 + 11 * SIZE], a4 + + FSUBX y1, t1, y1 + FMUL a9, x1, t1 + FADDX y2, t2, y2 + FMUL a9, x2, t2 + LDF [A1 + 12 * SIZE], a9 + + FSUBX y3, t3, y3 + FMUL a11, x1, t3 + FADDX y4, t4, y4 + FMUL a11, x2, t4 + LDF [A1 + 14 * SIZE], a11 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + STF y3, [Y1 + 2 * SIZE] + STF y4, [Y1 + 3 * SIZE] + + FADD y5, t1, y5 + FMUL a10, x2, t1 + LDF [Y1 + 8 * SIZE], y1 + FADD y6, t2, y6 + FMUL a10, x1, t2 + LDF [A1 + 13 * SIZE], a10 + + FADD y7, t3, y7 + deccc I + FMUL a12, x2, t3 + LDF [Y1 + 9 * SIZE], y2 + FADD y8, t4, y8 + FMUL a12, x1, t4 + LDF [A1 + 15 * SIZE], a12 + + FSUBX y5, t1, y5 + add A1, 8 * SIZE, A1 + FMUL a1, x1, t1 + LDF [Y1 + 10 * SIZE], y3 + FADDX y6, t2, y6 + FMUL a1, x2, t2 + LDF [A1 + 8 * SIZE], a1 + + FSUBX y7, t3, y7 + FMUL a3, x1, t3 + LDF [Y1 + 11 * SIZE], y4 + FADDX y8, t4, y8 + FMUL a3, x2, t4 + LDF [A1 + 10 * SIZE], a3 + + STF y5, [Y1 + 4 * SIZE] + STF y6, [Y1 + 5 * SIZE] + STF y7, [Y1 + 6 * SIZE] + STF y8, [Y1 + 7 * SIZE] + + LDF [Y1 + 12 * SIZE], y5 + LDF [Y1 + 13 * SIZE], y6 + LDF [Y1 + 14 * SIZE], y7 + add Y1, 8 * SIZE, Y1 + bg,pn %icc, .LL32 + LDF [Y1 + 7 * SIZE], y8 + +.LL33: + FADD y1, t1, y1 + FMUL a2, x2, t1 + FADD y2, t2, y2 + FMUL a2, x1, t2 + + FADD y3, t3, y3 + FMUL a4, x2, t3 + FADD y4, t4, y4 + FMUL a4, x1, t4 + + FSUBX y1, t1, y1 + FMUL a9, x1, t1 + FADDX y2, t2, y2 + FMUL a9, x2, t2 + + FSUBX y3, t3, y3 + FMUL a11, x1, t3 + FADDX y4, t4, y4 + FMUL a11, x2, t4 + + FADD y5, t1, y5 + FMUL a10, x2, t1 + FADD y6, t2, y6 + FMUL a10, x1, t2 + + FADD y7, t3, y7 + FMUL a12, x2, t3 + FADD y8, t4, y8 + FMUL a12, x1, t4 + + FSUBX y5, t1, y5 + FADDX y6, t2, y6 + FSUBX y7, t3, y7 + FADDX y8, t4, y8 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + STF y3, [Y1 + 2 * SIZE] + STF y4, [Y1 + 3 * SIZE] + + STF y5, [Y1 + 4 * SIZE] + STF y6, [Y1 + 5 * SIZE] + STF y7, [Y1 + 6 * SIZE] + STF y8, [Y1 + 7 * SIZE] + + add A1, 8 * SIZE, A1 + add Y1, 8 * SIZE, Y1 + + +.LL37: + andcc M, 2, I + ble,pn %icc, .LL38 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [A1 + 2 * SIZE], a3 + LDF [A1 + 3 * SIZE], a4 + + LDF [Y1 + 0 * SIZE], y1 + FMUL a1, x1, t1 + LDF [Y1 + 1 * SIZE], y2 + FMUL a1, x2, t2 + LDF [Y1 + 2 * SIZE], y3 + FMUL a3, x1, t3 + LDF [Y1 + 3 * SIZE], y4 + FMUL a3, x2, t4 + + FADD y1, t1, y1 + FMUL a2, x2, t1 + FADD y2, t2, y2 + FMUL a2, x1, t2 + FADD y3, t3, y3 + FMUL a4, x2, t3 + FADD y4, t4, y4 + FMUL a4, x1, t4 + + FSUBX y1, t1, y1 + FADDX y2, t2, y2 + FSUBX y3, t3, y3 + FADDX y4, t4, y4 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + STF y3, [Y1 + 2 * SIZE] + STF y4, [Y1 + 3 * SIZE] + + add A1, 4 * SIZE, A1 + add Y1, 4 * SIZE, Y1 + +.LL38: + andcc M, 1, I + ble,pn %icc, .LL990 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [Y1 + 0 * SIZE], y1 + LDF [Y1 + 1 * SIZE], y2 + + FMUL a1, x1, t1 + FMUL a1, x2, t2 + FMUL a2, x2, t3 + FMUL a2, x1, t4 + + FADD y1, t1, y1 + FADD y2, t2, y2 + FSUBX y1, t3, y1 + FADDX y2, t4, y2 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + +.LL990: + cmp INCY, 2 * SIZE + be %icc, .LL999 + mov Y, Y1 + + sra M, 2, I + cmp I, 0 + ble,pn %icc, .LL995 + nop + +.LL991: + LDF [BUFFER + 0 * SIZE], a1 + LDF [BUFFER + 1 * SIZE], a2 + LDF [Y + 0 * SIZE], y1 + LDF [Y + 1 * SIZE], y2 + add Y, INCY, Y + + LDF [BUFFER + 2 * SIZE], a3 + LDF [BUFFER + 3 * SIZE], a4 + LDF [Y + 0 * SIZE], y3 + LDF [Y + 1 * SIZE], y4 + add Y, INCY, Y + + LDF [BUFFER + 4 * SIZE], a5 + LDF [BUFFER + 5 * SIZE], a6 + LDF [Y + 0 * SIZE], y5 + LDF [Y + 1 * SIZE], y6 + add Y, INCY, Y + + LDF [BUFFER + 6 * SIZE], a7 + LDF [BUFFER + 7 * SIZE], a8 + LDF [Y + 0 * SIZE], y7 + LDF [Y + 1 * SIZE], y8 + add Y, INCY, Y + + FADD y1, a1, y1 + FADD y2, a2, y2 + FADD y3, a3, y3 + FADD y4, a4, y4 + FADD y5, a5, y5 + FADD y6, a6, y6 + FADD y7, a7, y7 + FADD y8, a8, y8 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + add Y1, INCY, Y1 + STF y3, [Y1 + 0 * SIZE] + STF y4, [Y1 + 1 * SIZE] + add Y1, INCY, Y1 + STF y5, [Y1 + 0 * SIZE] + STF y6, [Y1 + 1 * SIZE] + add Y1, INCY, Y1 + STF y7, [Y1 + 0 * SIZE] + STF y8, [Y1 + 1 * SIZE] + add Y1, INCY, Y1 + + deccc I + bg,pn %icc, .LL991 + add BUFFER, 8 * SIZE, BUFFER + +.LL995: + andcc M, 2, I + ble,pn %icc, .LL996 + nop + + LDF [BUFFER + 0 * SIZE], a1 + LDF [BUFFER + 1 * SIZE], a2 + LDF [Y + 0 * SIZE], y1 + LDF [Y + 1 * SIZE], y2 + add Y, INCY, Y + + LDF [BUFFER + 2 * SIZE], a3 + LDF [BUFFER + 3 * SIZE], a4 + LDF [Y + 0 * SIZE], y3 + LDF [Y + 1 * SIZE], y4 + add Y, INCY, Y + + FADD y1, a1, y1 + FADD y2, a2, y2 + FADD y3, a3, y3 + FADD y4, a4, y4 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + add Y1, INCY, Y1 + STF y3, [Y1 + 0 * SIZE] + STF y4, [Y1 + 1 * SIZE] + add Y1, INCY, Y1 + + add BUFFER, 4 * SIZE, BUFFER + +.LL996: + andcc M, 1, I + ble,pn %icc, .LL999 + nop + + LDF [BUFFER + 0 * SIZE], a1 + LDF [BUFFER + 1 * SIZE], a2 + LDF [Y + 0 * SIZE], y1 + LDF [Y + 1 * SIZE], y2 + + FADD y1, a1, y1 + FADD y2, a2, y2 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/zgemv_t.S b/kernel/sparc/zgemv_t.S new file mode 100644 index 0000000..2b4a64c --- /dev/null +++ b/kernel/sparc/zgemv_t.S @@ -0,0 +1,1737 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define P 4000 + +#define M %i0 +#define N %i1 +#define A %i5 +#define LDA %i2 +#define X %i3 +#define INCX %i4 + +#define Y %l0 +#define INCY %l1 +#define BUFFER %l2 + +#define I %l3 +#define IS %l4 +#define J %l5 +#define MIN_M %l6 +#define XP %l7 + +#define A1 %o0 +#define A2 %o1 +#define A3 %o2 +#define A4 %o3 + +#define X1 %o4 +#define Y1 %o5 +#define PNLDA %g1 +#define Y2 %o7 /* Danger? */ + +#ifdef DOUBLE +#define t1 %f0 +#define t2 %f2 +#define t3 %f4 +#define t4 %f6 + +#define c1 %f8 +#define c2 %f10 +#define c3 %f12 +#define c4 %f14 +#define c5 %f16 +#define c6 %f18 +#define c7 %f20 +#define c8 %f22 +#define c9 %f24 +#define c10 %f26 +#define c11 %f28 +#define c12 %f30 +#define c13 %f32 +#define c14 %f34 +#define c15 %f36 +#define c16 %f38 + +#define a1 %f40 +#define a2 %f42 +#define a3 %f44 +#define a4 %f46 +#define a5 %f48 +#define a6 %f50 +#define a7 %f52 +#define a8 %f54 + +#define b1 %f56 +#define b2 %f58 +#define b3 %f60 +#define b4 %f62 +#else +#define t1 %f0 +#define t2 %f1 +#define t3 %f2 +#define t4 %f3 + +#define c1 %f4 +#define c2 %f5 +#define c3 %f6 +#define c4 %f7 +#define c5 %f8 +#define c6 %f9 +#define c7 %f10 +#define c8 %f11 +#define c9 %f12 +#define c10 %f13 +#define c11 %f14 +#define c12 %f15 +#define c13 %f16 +#define c14 %f17 +#define c15 %f18 +#define c16 %f19 + +#define a1 %f20 +#define a2 %f21 +#define a3 %f22 +#define a4 %f23 +#define a5 %f24 +#define a6 %f25 +#define a7 %f26 +#define a8 %f27 + +#define b1 %f28 +#define b2 %f29 +#define b3 %f30 +#define b4 %f31 +#endif + +#ifndef __64BIT__ +#define ALPHA_R [%sp + STACK_START + 16] +#ifndef DOUBLE +#define ALPHA_I [%sp + STACK_START + 20] +#else +#define ALPHA_I [%sp + STACK_START + 24] +#endif +#else +#define ALPHA_R [%sp + STACK_START + 32] +#define ALPHA_I [%sp + STACK_START + 40] +#endif + +#ifdef DOUBLE +#define PREFETCHSIZE 18 +#else +#define PREFETCHSIZE 36 +#endif + + PROLOGUE + SAVESP + nop + +#ifndef __64BIT__ + +#ifdef DOUBLE + st %i3, [%sp + STACK_START + 16] /* ALPHA_R */ + st %i4, [%sp + STACK_START + 20] + st %i5, [%sp + STACK_START + 24] /* ALPHA_I */ + + ld [%sp + STACK_START + 32], A + ld [%sp + STACK_START + 36], LDA + ld [%sp + STACK_START + 40], X + ld [%sp + STACK_START + 44], INCX + ld [%sp + STACK_START + 48], Y + ld [%sp + STACK_START + 52], INCY + ld [%sp + STACK_START + 56], BUFFER +#else + st %i3, [%sp + STACK_START + 16] /* ALPHA_R */ + st %i4, [%sp + STACK_START + 20] /* ALPHA_I */ + + ld [%sp + STACK_START + 28], LDA + ld [%sp + STACK_START + 32], X + ld [%sp + STACK_START + 36], INCX + ld [%sp + STACK_START + 40], Y + ld [%sp + STACK_START + 44], INCY + ld [%sp + STACK_START + 48], BUFFER +#endif +#else + ldx [%sp + STACK_START + 56], LDA + ldx [%sp + STACK_START + 64], X + ldx [%sp + STACK_START + 72], INCX + ldx [%sp + STACK_START + 80], Y + ldx [%sp + STACK_START + 88], INCY + ldx [%sp + STACK_START + 96], BUFFER +#ifdef DOUBLE + std %f6, ALPHA_R + std %f8, ALPHA_I +#else + st %f7, ALPHA_R + st %f9, ALPHA_I +#endif +#endif + + clr IS + mov P, I + sll LDA, ZBASE_SHIFT, LDA + sll I, ZBASE_SHIFT, I + smul LDA, N, PNLDA + sll INCX, ZBASE_SHIFT, INCX + sll INCY, ZBASE_SHIFT, INCY + sub I, PNLDA, PNLDA + +.LL10: + sll IS, ZBASE_SHIFT, I + sub M, IS, MIN_M + mov P, J + + cmp MIN_M, J + nop + movg %icc, J, MIN_M + nop + cmp INCX, 2 * SIZE + beq .LL100 + add X, I, XP + + sra MIN_M, 2, I + mov BUFFER, XP + cmp I, 0 + ble,pn %icc, .LL15 + mov BUFFER, Y1 + +.LL11: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + LDF [X + 0 * SIZE], a7 + LDF [X + 1 * SIZE], a8 + add X, INCX, X + + STF a1, [Y1 + 0 * SIZE] + add I, -1, I + STF a2, [Y1 + 1 * SIZE] + cmp I, 0 + STF a3, [Y1 + 2 * SIZE] + STF a4, [Y1 + 3 * SIZE] + STF a5, [Y1 + 4 * SIZE] + STF a6, [Y1 + 5 * SIZE] + STF a7, [Y1 + 6 * SIZE] + STF a8, [Y1 + 7 * SIZE] + bg,pn %icc, .LL11 + add Y1, 8 * SIZE, Y1 + +.LL15: + and MIN_M, 3, I + cmp I, 0 + ble,pn %icc, .LL100 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + add I, -1, I + cmp I, 0 + nop + STF a1, [Y1 + 0 * SIZE] + STF a2, [Y1 + 1 * SIZE] + bg,pn %icc, .LL16 + add Y1, 2 * SIZE, Y1 + +.LL100: + sra N, 2, J + cmp J, 0 + ble %icc, .LL200 + mov Y, Y1 + +.LL110: + FCLR(0) + + FMOV t1, c1 + sra MIN_M, 2, I + FMOV t1, c2 + add A, LDA, A2 + FMOV t1, c3 + mov A, A1 + FMOV t1, c4 + add A2, LDA, A3 + + FMOV t1, c5 + FMOV t1, c6 + FMOV t1, c7 + FMOV t1, c8 + FMOV t1, c9 + FMOV t1, c10 + FMOV t1, c11 + FMOV t1, c12 + FMOV t1, c13 + FMOV t1, c14 + FMOV t1, c15 + FMOV t1, c16 + + add A3, LDA, A4 + FMOV t1, t2 + mov XP, X1 + FMOV t1, t3 + add A4, LDA, A + cmp I, 0 + ble %icc, .LL115 + FMOV t1, t4 + + LDF [A1 + 0 * SIZE], a1 + nop + LDF [A1 + 1 * SIZE], a2 + add A1, 2 * SIZE, A1 + LDF [A2 + 0 * SIZE], a3 + LDF [A2 + 1 * SIZE], a4 + add A2, 2 * SIZE, A2 + LDF [A3 + 0 * SIZE], a5 + LDF [A3 + 1 * SIZE], a6 + add A3, 2 * SIZE, A3 + LDF [A4 + 0 * SIZE], a7 + LDF [A4 + 1 * SIZE], a8 + add A4, 2 * SIZE, A4 + + LDF [X1 + 0 * SIZE], b1 + nop + LDF [X1 + 1 * SIZE], b2 + nop + LDF [X1 + 2 * SIZE], b3 + add X1, 4 * SIZE, X1 + + deccc I + ble .LL112 + prefetch [Y1 + 7 * SIZE], 2 + +#ifndef XCONJ +#define FADDX FADD +#else +#define FADDX FSUB +#endif + +.LL111: + FADD c13, t1, c13 + prefetch [A1 + PREFETCHSIZE * SIZE], 1 + FMUL a1, b1, t1 + nop + + FADDX c14, t2, c14 + nop + FMUL a1, b2, t2 + LDF [A1 + 0 * SIZE], a1 + + FADD c15, t3, c15 + nop + FMUL a2, b1, t3 + LDF [X1 - 1 * SIZE], b4 + + FADD c16, t4, c16 + nop + FMUL a2, b2, t4 + LDF [A1 + 1 * SIZE], a2 + + FADD c1, t1, c1 + nop + FMUL a3, b1, t1 + nop + + FADDX c2, t2, c2 + nop + FMUL a3, b2, t2 + LDF [A2 + 0 * SIZE], a3 + + FADD c3, t3, c3 + nop + FMUL a4, b1, t3 + nop + + FADD c4, t4, c4 + nop + FMUL a4, b2, t4 + LDF [A2 + 1 * SIZE], a4 + + FADD c5, t1, c5 + nop + FMUL a5, b1, t1 + nop + + FADDX c6, t2, c6 + nop + FMUL a5, b2, t2 + LDF [A3 + 0 * SIZE], a5 + + FADD c7, t3, c7 + nop + FMUL a6, b1, t3 + nop + + FADD c8, t4, c8 + nop + FMUL a6, b2, t4 + LDF [A3 + 1 * SIZE], a6 + + FADD c9, t1, c9 + nop + FMUL a7, b1, t1 + nop + + FADDX c10, t2, c10 + nop + FMUL a7, b2, t2 + LDF [A4 + 0 * SIZE], a7 + + FADD c11, t3, c11 + nop + FMUL a8, b1, t3 + LDF [X1 + 0 * SIZE], b1 + + FADD c12, t4, c12 + nop + FMUL a8, b2, t4 + LDF [A4 + 1 * SIZE], a8 + + FADD c13, t1, c13 + nop + FMUL a1, b3, t1 + prefetch [A2 + PREFETCHSIZE * SIZE], 1 + + FADDX c14, t2, c14 + nop + FMUL a1, b4, t2 + LDF [A1 + 2 * SIZE], a1 + + FADD c15, t3, c15 + nop + FMUL a2, b3, t3 + LDF [X1 + 1 * SIZE], b2 + + FADD c16, t4, c16 + nop + FMUL a2, b4, t4 + LDF [A1 + 3 * SIZE], a2 + + FADD c1, t1, c1 + nop + FMUL a3, b3, t1 + nop + + FADDX c2, t2, c2 + nop + FMUL a3, b4, t2 + LDF [A2 + 2 * SIZE], a3 + + FADD c3, t3, c3 + nop + FMUL a4, b3, t3 + nop + + FADD c4, t4, c4 + nop + FMUL a4, b4, t4 + LDF [A2 + 3 * SIZE], a4 + + FADD c5, t1, c5 + nop + FMUL a5, b3, t1 + nop + + FADDX c6, t2, c6 + nop + FMUL a5, b4, t2 + LDF [A3 + 2 * SIZE], a5 + + FADD c7, t3, c7 + nop + FMUL a6, b3, t3 + nop + + FADD c8, t4, c8 + nop + FMUL a6, b4, t4 + LDF [A3 + 3 * SIZE], a6 + + FADD c9, t1, c9 + nop + FMUL a7, b3, t1 + nop + + FADDX c10, t2, c10 + nop + FMUL a7, b4, t2 + LDF [A4 + 2 * SIZE], a7 + + FADD c11, t3, c11 + nop + FMUL a8, b3, t3 + LDF [X1 + 2 * SIZE], b3 + + FADD c12, t4, c12 + nop + FMUL a8, b4, t4 + LDF [A4 + 3 * SIZE], a8 + + FADD c13, t1, c13 + prefetch [A3 + PREFETCHSIZE * SIZE], 1 + FMUL a1, b1, t1 + nop + + FADDX c14, t2, c14 + nop + FMUL a1, b2, t2 + LDF [A1 + 4 * SIZE], a1 + + FADD c15, t3, c15 + nop + FMUL a2, b1, t3 + LDF [X1 + 3 * SIZE], b4 + + FADD c16, t4, c16 + nop + FMUL a2, b2, t4 + LDF [A1 + 5 * SIZE], a2 + + FADD c1, t1, c1 + nop + FMUL a3, b1, t1 + nop + + FADDX c2, t2, c2 + nop + FMUL a3, b2, t2 + LDF [A2 + 4 * SIZE], a3 + + FADD c3, t3, c3 + nop + FMUL a4, b1, t3 + nop + + FADD c4, t4, c4 + nop + FMUL a4, b2, t4 + LDF [A2 + 5 * SIZE], a4 + + FADD c5, t1, c5 + nop + FMUL a5, b1, t1 + nop + + FADDX c6, t2, c6 + nop + FMUL a5, b2, t2 + LDF [A3 + 4 * SIZE], a5 + + FADD c7, t3, c7 + deccc I + FMUL a6, b1, t3 + nop + + FADD c8, t4, c8 + nop + FMUL a6, b2, t4 + LDF [A3 + 5 * SIZE], a6 + + FADD c9, t1, c9 + nop + FMUL a7, b1, t1 + nop + + FADDX c10, t2, c10 + nop + FMUL a7, b2, t2 + LDF [A4 + 4 * SIZE], a7 + + FADD c11, t3, c11 + nop + FMUL a8, b1, t3 + LDF [X1 + 4 * SIZE], b1 + + FADD c12, t4, c12 + nop + FMUL a8, b2, t4 + LDF [A4 + 5 * SIZE], a8 + + FADD c13, t1, c13 + prefetch [A4 + PREFETCHSIZE * SIZE], 1 + FMUL a1, b3, t1 + nop + + FADDX c14, t2, c14 + nop + FMUL a1, b4, t2 + LDF [A1 + 6 * SIZE], a1 + + FADD c15, t3, c15 + nop + FMUL a2, b3, t3 + LDF [X1 + 5 * SIZE], b2 + + FADD c16, t4, c16 + nop + FMUL a2, b4, t4 + LDF [A1 + 7 * SIZE], a2 + + FADD c1, t1, c1 + add A1, 8 * SIZE, A1 + FMUL a3, b3, t1 + nop + + FADDX c2, t2, c2 + nop + FMUL a3, b4, t2 + LDF [A2 + 6 * SIZE], a3 + + FADD c3, t3, c3 + nop + FMUL a4, b3, t3 + nop + + FADD c4, t4, c4 + nop + FMUL a4, b4, t4 + LDF [A2 + 7 * SIZE], a4 + + FADD c5, t1, c5 + add A2, 8 * SIZE, A2 + FMUL a5, b3, t1 + nop + + FADDX c6, t2, c6 + nop + FMUL a5, b4, t2 + LDF [A3 + 6 * SIZE], a5 + + FADD c7, t3, c7 + add A4, 8 * SIZE, A4 + FMUL a6, b3, t3 + nop + + FADD c8, t4, c8 + nop + FMUL a6, b4, t4 + LDF [A3 + 7 * SIZE], a6 + + FADD c9, t1, c9 + add A3, 8 * SIZE, A3 + FMUL a7, b3, t1 + nop + + FADDX c10, t2, c10 + add X1, 8 * SIZE, X1 + FMUL a7, b4, t2 + LDF [A4 - 2 * SIZE], a7 + + FADD c11, t3, c11 + nop + FMUL a8, b3, t3 + LDF [X1 - 2 * SIZE], b3 + + FADD c12, t4, c12 + FMUL a8, b4, t4 + bg,pn %icc, .LL111 + LDF [A4 - 1 * SIZE], a8 + +.LL112: + FADD c13, t1, c13 + nop + FMUL a1, b1, t1 + LDF [X1 - 1 * SIZE], b4 + + FADDX c14, t2, c14 + nop + FMUL a1, b2, t2 + LDF [A1 + 0 * SIZE], a1 + + FADD c15, t3, c15 + nop + FMUL a2, b1, t3 + LDF [X1 - 1 * SIZE], b4 + + FADD c16, t4, c16 + nop + FMUL a2, b2, t4 + LDF [A1 + 1 * SIZE], a2 + + FADD c1, t1, c1 + nop + FMUL a3, b1, t1 + nop + + FADDX c2, t2, c2 + nop + FMUL a3, b2, t2 + LDF [A2 + 0 * SIZE], a3 + + FADD c3, t3, c3 + nop + FMUL a4, b1, t3 + nop + + FADD c4, t4, c4 + nop + FMUL a4, b2, t4 + LDF [A2 + 1 * SIZE], a4 + + FADD c5, t1, c5 + nop + FMUL a5, b1, t1 + nop + + FADDX c6, t2, c6 + nop + FMUL a5, b2, t2 + LDF [A3 + 0 * SIZE], a5 + + FADD c7, t3, c7 + nop + FMUL a6, b1, t3 + nop + + FADD c8, t4, c8 + nop + FMUL a6, b2, t4 + LDF [A3 + 1 * SIZE], a6 + + FADD c9, t1, c9 + nop + FMUL a7, b1, t1 + nop + + FADDX c10, t2, c10 + nop + FMUL a7, b2, t2 + LDF [A4 + 0 * SIZE], a7 + + FADD c11, t3, c11 + nop + FMUL a8, b1, t3 + LDF [X1 + 0 * SIZE], b1 + + FADD c12, t4, c12 + nop + FMUL a8, b2, t4 + LDF [A4 + 1 * SIZE], a8 + + FADD c13, t1, c13 + nop + FMUL a1, b3, t1 + LDF [X1 + 1 * SIZE], b2 + + FADDX c14, t2, c14 + nop + FMUL a1, b4, t2 + LDF [A1 + 2 * SIZE], a1 + + FADD c15, t3, c15 + nop + FMUL a2, b3, t3 + nop + + FADD c16, t4, c16 + nop + FMUL a2, b4, t4 + LDF [A1 + 3 * SIZE], a2 + + FADD c1, t1, c1 + nop + FMUL a3, b3, t1 + nop + + FADDX c2, t2, c2 + nop + FMUL a3, b4, t2 + LDF [A2 + 2 * SIZE], a3 + + FADD c3, t3, c3 + nop + FMUL a4, b3, t3 + nop + + FADD c4, t4, c4 + nop + FMUL a4, b4, t4 + LDF [A2 + 3 * SIZE], a4 + + FADD c5, t1, c5 + nop + FMUL a5, b3, t1 + nop + + FADDX c6, t2, c6 + nop + FMUL a5, b4, t2 + LDF [A3 + 2 * SIZE], a5 + + FADD c7, t3, c7 + nop + FMUL a6, b3, t3 + nop + + FADD c8, t4, c8 + nop + FMUL a6, b4, t4 + LDF [A3 + 3 * SIZE], a6 + + FADD c9, t1, c9 + nop + FMUL a7, b3, t1 + nop + + FADDX c10, t2, c10 + nop + FMUL a7, b4, t2 + LDF [A4 + 2 * SIZE], a7 + + FADD c11, t3, c11 + nop + FMUL a8, b3, t3 + LDF [X1 + 2 * SIZE], b3 + + FADD c12, t4, c12 + nop + FMUL a8, b4, t4 + LDF [A4 + 3 * SIZE], a8 + + FADD c13, t1, c13 + nop + FMUL a1, b1, t1 + LDF [X1 + 3 * SIZE], b4 + + FADDX c14, t2, c14 + add X1, 4 * SIZE, X1 + FMUL a1, b2, t2 + LDF [A1 + 4 * SIZE], a1 + + FADD c15, t3, c15 + nop + FMUL a2, b1, t3 + nop + + FADD c16, t4, c16 + nop + FMUL a2, b2, t4 + LDF [A1 + 5 * SIZE], a2 + + FADD c1, t1, c1 + add A1, 6 * SIZE, A1 + FMUL a3, b1, t1 + nop + + FADDX c2, t2, c2 + nop + FMUL a3, b2, t2 + LDF [A2 + 4 * SIZE], a3 + + FADD c3, t3, c3 + nop + FMUL a4, b1, t3 + nop + + FADD c4, t4, c4 + nop + FMUL a4, b2, t4 + LDF [A2 + 5 * SIZE], a4 + + FADD c5, t1, c5 + add A2, 6 * SIZE, A2 + FMUL a5, b1, t1 + nop + + FADDX c6, t2, c6 + nop + FMUL a5, b2, t2 + LDF [A3 + 4 * SIZE], a5 + + FADD c7, t3, c7 + nop + FMUL a6, b1, t3 + nop + + FADD c8, t4, c8 + nop + FMUL a6, b2, t4 + LDF [A3 + 5 * SIZE], a6 + + FADD c9, t1, c9 + add A3, 6 * SIZE, A3 + FMUL a7, b1, t1 + nop + + FADDX c10, t2, c10 + nop + FMUL a7, b2, t2 + LDF [A4 + 4 * SIZE], a7 + + FADD c11, t3, c11 + nop + FMUL a8, b1, t3 + nop + + FADD c12, t4, c12 + nop + FMUL a8, b2, t4 + LDF [A4 + 5 * SIZE], a8 + + FADD c13, t1, c13 + add A4, 6 * SIZE, A4 + FMUL a1, b3, t1 + nop + + FADDX c14, t2, c14 + nop + FMUL a1, b4, t2 + nop + + FADD c15, t3, c15 + FMUL a2, b3, t3 + FADD c16, t4, c16 + FMUL a2, b4, t4 + + FADD c1, t1, c1 + FMUL a3, b3, t1 + FADDX c2, t2, c2 + FMUL a3, b4, t2 + FADD c3, t3, c3 + FMUL a4, b3, t3 + FADD c4, t4, c4 + FMUL a4, b4, t4 + + FADD c5, t1, c5 + FMUL a5, b3, t1 + FADDX c6, t2, c6 + FMUL a5, b4, t2 + FADD c7, t3, c7 + FMUL a6, b3, t3 + FADD c8, t4, c8 + FMUL a6, b4, t4 + + FADD c9, t1, c9 + FMUL a7, b3, t1 + FADDX c10, t2, c10 + FMUL a7, b4, t2 + FADD c11, t3, c11 + FMUL a8, b3, t3 + FADD c12, t4, c12 + FMUL a8, b4, t4 + +.LL115: + andcc MIN_M, 3, I + LDF ALPHA_R, b3 + mov Y1, Y2 + ble,pn %icc, .LL119 + LDF ALPHA_I, b4 + +.L116: + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + add A1, 2 * SIZE, A1 + LDF [X1 + 0 * SIZE], b1 + LDF [X1 + 1 * SIZE], b2 + add X1, 2 * SIZE, X1 + LDF [A2 + 0 * SIZE], a3 + LDF [A2 + 1 * SIZE], a4 + add A2, 2 * SIZE, A2 + LDF [A3 + 0 * SIZE], a5 + LDF [A3 + 1 * SIZE], a6 + add A3, 2 * SIZE, A3 + LDF [A4 + 0 * SIZE], a7 + LDF [A4 + 1 * SIZE], a8 + add A4, 2 * SIZE, A4 + + FADD c13, t1, c13 + FMUL a1, b1, t1 + FADDX c14, t2, c14 + FMUL a1, b2, t2 + FADD c15, t3, c15 + FMUL a2, b1, t3 + FADD c16, t4, c16 + FMUL a2, b2, t4 + + FADD c1, t1, c1 + FMUL a3, b1, t1 + FADDX c2, t2, c2 + FMUL a3, b2, t2 + FADD c3, t3, c3 + FMUL a4, b1, t3 + FADD c4, t4, c4 + FMUL a4, b2, t4 + + FADD c5, t1, c5 + FMUL a5, b1, t1 + FADDX c6, t2, c6 + FMUL a5, b2, t2 + FADD c7, t3, c7 + FMUL a6, b1, t3 + FADD c8, t4, c8 + FMUL a6, b2, t4 + + FADD c9, t1, c9 + FMUL a7, b1, t1 + FADDX c10, t2, c10 + FMUL a7, b2, t2 + FADD c11, t3, c11 + FMUL a8, b1, t3 + FADD c12, t4, c12 + FMUL a8, b2, t4 + + deccc I + bg %icc, .L116 + nop + +.LL119: + FADD c13, t1, c13 + LDF [Y1 + 0 * SIZE], a1 + FADDX c14, t2, c14 + LDF [Y1 + 1 * SIZE] ,a2 + add Y1, INCY, Y1 + FADD c15, t3, c15 + LDF [Y1 + 0 * SIZE], a3 + FADD c16, t4, c16 + LDF [Y1 + 1 * SIZE] ,a4 + add Y1, INCY, Y1 + +#if (!defined(XCONJ) && !defined(CONJ)) || (defined(XCONJ) && defined(CONJ)) + FSUB c1, c4, c1 + LDF [Y1 + 0 * SIZE], a5 + FSUB c5, c8, c5 + LDF [Y1 + 1 * SIZE] ,a6 + add Y1, INCY, Y1 + FSUB c9, c12, c9 + LDF [Y1 + 0 * SIZE], a7 + FSUB c13, c16, c13 + LDF [Y1 + 1 * SIZE] ,a8 + add Y1, INCY, Y1 +#else + FADD c1, c4, c1 + LDF [Y1 + 0 * SIZE], a5 + FADD c5, c8, c5 + LDF [Y1 + 1 * SIZE] ,a6 + add Y1, INCY, Y1 + FADD c9, c12, c9 + LDF [Y1 + 0 * SIZE], a7 + FADD c13, c16, c13 + LDF [Y1 + 1 * SIZE] ,a8 + add Y1, INCY, Y1 +#endif + +#ifndef CONJ + FADD c2, c3, c2 + FCLR(0) + FADD c6, c7, c6 + FADD c10, c11, c10 + FADD c14, c15, c14 +#else + FSUB c2, c3, c2 + FCLR(0) + FSUB c6, c7, c6 + FSUB c10, c11, c10 + FSUB c14, c15, c14 +#endif + + FMUL b3, c1, c3 + FMOV t1, t2 + FMUL b4, c1, c4 + FMOV t1, t3 + FMUL b4, c2, c1 + FMOV t1, t4 + FMUL b3, c2, c2 + + FMUL b3, c5, c7 + FMUL b4, c5, c8 + FMUL b4, c6, c5 + FMUL b3, c6, c6 + + FMUL b3, c9, c11 + FMUL b4, c9, c12 + FMUL b4, c10, c9 + FMUL b3, c10, c10 + + FMUL b3, c13, c15 + FSUB c3, c1, c1 + FMUL b4, c13, c16 + FADD c2, c4, c2 + FMUL b4, c14, c13 + FSUB c7, c5, c5 + FMUL b3, c14, c14 + FADD c6, c8, c6 + + FSUB c11, c9, c9 + FADD c10, c12, c10 + FSUB c15, c13, c13 + FADD c14, c16, c14 + + FADD a1, c1, a1 + FADD a2, c2, a2 + FADD a3, c5, a3 + FADD a4, c6, a4 + + STF a1, [Y2 + 0 * SIZE] + FADD a5, c9, a5 + STF a2, [Y2 + 1 * SIZE] + FADD a6, c10, a6 + add Y2, INCY, Y2 + STF a3, [Y2 + 0 * SIZE] + FADD a7, c13, a7 + STF a4, [Y2 + 1 * SIZE] + FADD a8, c14, a8 + add Y2, INCY, Y2 + + STF a5, [Y2 + 0 * SIZE] + FMOV t1, c1 + add J, -1, J + STF a6, [Y2 + 1 * SIZE] + FMOV t1, c2 + cmp J, 0 + add Y2, INCY, Y2 + STF a7, [Y2 + 0 * SIZE] + FMOV t1, c3 + STF a8, [Y2 + 1 * SIZE] + FMOV t1, c4 + add Y2, INCY, Y2 + + FMOV t1, c5 + bg %icc, .LL110 + FMOV t1, c6 + +.LL200: + FCLR(0) + + and N, 2, J + cmp J, 0 + FMOV t1, c1 + ble %icc, .LL300 + + FMOV t1, c2 + sra MIN_M, 2, I + FMOV t1, t2 + add A, LDA, A2 + FMOV t1, c3 + mov A, A1 + FMOV t1, t3 + cmp I, 0 + FMOV t1, c4 + + FMOV t1, c5 + FMOV t1, c6 + FMOV t1, c7 + FMOV t1, c8 + + add A2, LDA, A + FMOV t1, t4 + ble %icc, .LL215 + mov XP, X1 + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [A1 + 2 * SIZE], a5 + LDF [A1 + 3 * SIZE], a6 + add A1, 4 * SIZE, A1 + + LDF [A2 + 0 * SIZE], a3 + LDF [A2 + 1 * SIZE], a4 + LDF [A2 + 2 * SIZE], a7 + LDF [A2 + 3 * SIZE], a8 + add A2, 4 * SIZE, A2 + + LDF [X1 + 0 * SIZE], b1 + add I, -1, I + LDF [X1 + 1 * SIZE], b2 + cmp I, 0 + LDF [X1 + 2 * SIZE], b3 + LDF [X1 + 3 * SIZE], b4 + ble %icc, .LL212 + add X1, 4 * SIZE, X1 + +.LL211: + prefetch [A1 + PREFETCHSIZE * SIZE], 1 + + FADD c5, t1, c5 + FMUL a1, b1, t1 + FADDX c6, t2, c6 + FMUL a1, b2, t2 + LDF [A1 + 0 * SIZE], a1 + FADD c7, t3, c7 + FMUL a2, b1, t3 + FADD c8, t4, c8 + FMUL a2, b2, t4 + LDF [A1 + 1 * SIZE], a2 + + FADD c1, t1, c1 + FMUL a3, b1, t1 + FADDX c2, t2, c2 + FMUL a3, b2, t2 + LDF [A2 + 0 * SIZE], a3 + FADD c3, t3, c3 + FMUL a4, b1, t3 + LDF [X1 + 0 * SIZE], b1 + FADD c4, t4, c4 + FMUL a4, b2, t4 + LDF [A2 + 1 * SIZE], a4 + + FADD c5, t1, c5 + LDF [X1 + 1 * SIZE], b2 + FMUL a5, b3, t1 + FADDX c6, t2, c6 + FMUL a5, b4, t2 + LDF [A1 + 2 * SIZE], a5 + FADD c7, t3, c7 + add I, -1, I + FMUL a6, b3, t3 + FADD c8, t4, c8 + cmp I, 0 + FMUL a6, b4, t4 + LDF [A1 + 3 * SIZE], a6 + + FADD c1, t1, c1 + FMUL a7, b3, t1 + FADDX c2, t2, c2 + FMUL a7, b4, t2 + LDF [A2 + 2 * SIZE], a7 + FADD c3, t3, c3 + FMUL a8, b3, t3 + LDF [X1 + 2 * SIZE], b3 + FADD c4, t4, c4 + FMUL a8, b4, t4 + LDF [A2 + 3 * SIZE], a8 + + prefetch [A2 + PREFETCHSIZE * SIZE], 1 + FADD c5, t1, c5 + LDF [X1 + 3 * SIZE], b4 + FMUL a1, b1, t1 + FADDX c6, t2, c6 + FMUL a1, b2, t2 + LDF [A1 + 4 * SIZE], a1 + FADD c7, t3, c7 + FMUL a2, b1, t3 + FADD c8, t4, c8 + FMUL a2, b2, t4 + LDF [A1 + 5 * SIZE], a2 + + FADD c1, t1, c1 + FMUL a3, b1, t1 + FADDX c2, t2, c2 + FMUL a3, b2, t2 + LDF [A2 + 4 * SIZE], a3 + FADD c3, t3, c3 + FMUL a4, b1, t3 + LDF [X1 + 4 * SIZE], b1 + FADD c4, t4, c4 + FMUL a4, b2, t4 + LDF [A2 + 5 * SIZE], a4 + + FADD c5, t1, c5 + LDF [X1 + 5 * SIZE], b2 + FMUL a5, b3, t1 + FADDX c6, t2, c6 + FMUL a5, b4, t2 + LDF [A1 + 6 * SIZE], a5 + FADD c7, t3, c7 + FMUL a6, b3, t3 + FADD c8, t4, c8 + FMUL a6, b4, t4 + LDF [A1 + 7 * SIZE], a6 + add A1, 8 * SIZE, A1 + + FADD c1, t1, c1 + FMUL a7, b3, t1 + FADDX c2, t2, c2 + FMUL a7, b4, t2 + LDF [A2 + 6 * SIZE], a7 + FADD c3, t3, c3 + FMUL a8, b3, t3 + LDF [X1 + 6 * SIZE], b3 + FADD c4, t4, c4 + add X1, 8 * SIZE, X1 + FMUL a8, b4, t4 + LDF [A2 + 7 * SIZE], a8 + add A2, 8 * SIZE, A2 + bg,pn %icc, .LL211 + LDF [X1 - 1 * SIZE], b4 + +.LL212: + FADD c5, t1, c5 + FMUL a1, b1, t1 + FADDX c6, t2, c6 + FMUL a1, b2, t2 + LDF [A1 + 0 * SIZE], a1 + FADD c7, t3, c7 + FMUL a2, b1, t3 + FADD c8, t4, c8 + FMUL a2, b2, t4 + LDF [A1 + 1 * SIZE], a2 + + FADD c1, t1, c1 + FMUL a3, b1, t1 + FADDX c2, t2, c2 + FMUL a3, b2, t2 + LDF [A2 + 0 * SIZE], a3 + FADD c3, t3, c3 + FMUL a4, b1, t3 + LDF [X1 + 0 * SIZE], b1 + FADD c4, t4, c4 + FMUL a4, b2, t4 + LDF [A2 + 1 * SIZE], a4 + + FADD c5, t1, c5 + LDF [X1 + 1 * SIZE], b2 + FMUL a5, b3, t1 + FADDX c6, t2, c6 + FMUL a5, b4, t2 + LDF [A1 + 2 * SIZE], a5 + FADD c7, t3, c7 + FMUL a6, b3, t3 + FADD c8, t4, c8 + FMUL a6, b4, t4 + LDF [A1 + 3 * SIZE], a6 + add A1, 4 * SIZE, A1 + + FADD c1, t1, c1 + FMUL a7, b3, t1 + FADDX c2, t2, c2 + FMUL a7, b4, t2 + LDF [A2 + 2 * SIZE], a7 + FADD c3, t3, c3 + FMUL a8, b3, t3 + LDF [X1 + 2 * SIZE], b3 + FADD c4, t4, c4 + FMUL a8, b4, t4 + LDF [A2 + 3 * SIZE], a8 + add A2, 4 * SIZE, A2 + + FADD c5, t1, c5 + LDF [X1 + 3 * SIZE], b4 + add X1, 4 * SIZE, X1 + FMUL a1, b1, t1 + FADDX c6, t2, c6 + FMUL a1, b2, t2 + FADD c7, t3, c7 + FMUL a2, b1, t3 + FADD c8, t4, c8 + FMUL a2, b2, t4 + + FADD c1, t1, c1 + FMUL a3, b1, t1 + FADDX c2, t2, c2 + FMUL a3, b2, t2 + FADD c3, t3, c3 + FMUL a4, b1, t3 + FADD c4, t4, c4 + FMUL a4, b2, t4 + + FADD c5, t1, c5 + FMUL a5, b3, t1 + FADDX c6, t2, c6 + FMUL a5, b4, t2 + FADD c7, t3, c7 + FMUL a6, b3, t3 + FADD c8, t4, c8 + FMUL a6, b4, t4 + + FADD c1, t1, c1 + FMUL a7, b3, t1 + FADDX c2, t2, c2 + FMUL a7, b4, t2 + FADD c3, t3, c3 + FMUL a8, b3, t3 + FADD c4, t4, c4 + FMUL a8, b4, t4 + +.LL215: + andcc MIN_M, 3, I + LDF ALPHA_R, b3 + mov Y1, Y2 + ble %icc, .LL219 + LDF ALPHA_I, b4 + + LDF [A1 + 0 * SIZE], a1 + add I, -1, I + LDF [A1 + 1 * SIZE], a2 + cmp I, 0 + add A1, 2 * SIZE, A1 + + LDF [A2 + 0 * SIZE], a3 + LDF [A2 + 1 * SIZE], a4 + add A2, 2 * SIZE, A2 + + LDF [X1 + 0 * SIZE], b1 + LDF [X1 + 1 * SIZE], b2 + ble %icc, .LL217 + add X1, 2 * SIZE, X1 + +.LL216: + FADD c5, t1, c5 + FMUL a1, b1, t1 + FADDX c6, t2, c6 + FMUL a1, b2, t2 + LDF [A1 + 0 * SIZE], a1 + FADD c7, t3, c7 + add I, -1, I + FMUL a2, b1, t3 + FADD c8, t4, c8 + cmp I, 0 + FMUL a2, b2, t4 + LDF [A1 + 1 * SIZE], a2 + add A1, 2 * SIZE, A1 + + FADD c1, t1, c1 + FMUL a3, b1, t1 + FADDX c2, t2, c2 + FMUL a3, b2, t2 + LDF [A2 + 0 * SIZE], a3 + FADD c3, t3, c3 + FMUL a4, b1, t3 + LDF [X1 + 0 * SIZE], b1 + FADD c4, t4, c4 + add X1, 2 * SIZE, X1 + FMUL a4, b2, t4 + LDF [A2 + 1 * SIZE], a4 + add A2, 2 * SIZE, A2 + bg,pn %icc, .LL216 + LDF [X1 - 1 * SIZE], b2 + +.LL217: + FADD c5, t1, c5 + FMUL a1, b1, t1 + FADDX c6, t2, c6 + FMUL a1, b2, t2 + FADD c7, t3, c7 + FMUL a2, b1, t3 + FADD c8, t4, c8 + FMUL a2, b2, t4 + + FADD c1, t1, c1 + FMUL a3, b1, t1 + FADDX c2, t2, c2 + FMUL a3, b2, t2 + FADD c3, t3, c3 + FMUL a4, b1, t3 + FADD c4, t4, c4 + FMUL a4, b2, t4 + +.LL219: + FADD c5, t1, c5 + LDF [Y1 + 0 * SIZE], a1 + FADDX c6, t2, c6 + LDF [Y1 + 1 * SIZE] ,a2 + add Y1, INCY, Y1 + FADD c7, t3, c7 + LDF [Y1 + 0 * SIZE], a3 + FADD c8, t4, c8 + LDF [Y1 + 1 * SIZE] ,a4 + add Y1, INCY, Y1 + +#if (!defined(XCONJ) && !defined(CONJ)) || (defined(XCONJ) && defined(CONJ)) + FSUB c1, c4, c1 + FSUB c5, c8, c5 +#else + FADD c1, c4, c1 + FADD c5, c8, c5 +#endif + +#ifndef CONJ + FADD c2, c3, c2 + FADD c6, c7, c6 +#else + FSUB c2, c3, c2 + FSUB c6, c7, c6 +#endif + + FMUL b3, c1, c3 + FMUL b4, c1, c4 + FMUL b4, c2, c1 + FMUL b3, c2, c2 + + FMUL b3, c5, c7 + FMUL b4, c5, c8 + FMUL b4, c6, c5 + FMUL b3, c6, c6 + + FSUB c3, c1, c1 + FADD c2, c4, c2 + FSUB c7, c5, c5 + FADD c6, c8, c6 + + FADD a1, c1, a1 + FADD a2, c2, a2 + FADD a3, c5, a3 + FADD a4, c6, a4 + + STF a1, [Y2 + 0 * SIZE] + STF a2, [Y2 + 1 * SIZE] + add Y2, INCY, Y2 + STF a3, [Y2 + 0 * SIZE] + STF a4, [Y2 + 1 * SIZE] + +.LL300: + andcc N, 1, J + FCLR(0) + ble %icc, .LL400 + FMOV t1, c1 + +.LL310: + sra MIN_M, 2, I + FMOV t1, c2 + FMOV t1, c3 + FMOV t1, c4 + mov A, A1 + FMOV t1, t2 + add A, LDA, A + FMOV t1, t3 + cmp I, 0 + FMOV t1, t4 + ble %icc, .LL315 + mov XP, X1 + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [A1 + 2 * SIZE], a3 + LDF [A1 + 3 * SIZE], a4 + LDF [A1 + 4 * SIZE], a5 + LDF [A1 + 5 * SIZE], a6 + LDF [A1 + 6 * SIZE], a7 + LDF [A1 + 7 * SIZE], a8 + add A1, 8 * SIZE, A1 + + LDF [X1 + 0 * SIZE], c9 + add I, -1, I + LDF [X1 + 1 * SIZE], c10 + cmp I, 0 + LDF [X1 + 2 * SIZE], c11 + LDF [X1 + 3 * SIZE], c12 + LDF [X1 + 4 * SIZE], c13 + LDF [X1 + 5 * SIZE], c14 + LDF [X1 + 6 * SIZE], c15 + LDF [X1 + 7 * SIZE], c16 + ble %icc, .LL312 + add X1, 8 * SIZE, X1 + +.LL311: + prefetch [A1 + PREFETCHSIZE * SIZE], 1 + + FADD c1, t1, c1 + FMUL a1, c9, t1 + FADDX c2, t2, c2 + FMUL a1, c10, t2 + LDF [A1 + 0 * SIZE], a1 + FADD c3, t3, c3 + FMUL a2, c9, t3 + LDF [X1 + 0 * SIZE], c9 + FADD c4, t4, c4 + FMUL a2, c10, t4 + LDF [A1 + 1 * SIZE], a2 + LDF [X1 + 1 * SIZE], c10 + + FADD c1, t1, c1 + FMUL a3, c11, t1 + FADDX c2, t2, c2 + FMUL a3, c12, t2 + LDF [A1 + 2 * SIZE], a3 + FADD c3, t3, c3 + add I, -1, I + FMUL a4, c11, t3 + LDF [X1 + 2 * SIZE], c11 + FADD c4, t4, c4 + cmp I, 0 + FMUL a4, c12, t4 + LDF [A1 + 3 * SIZE], a4 + LDF [X1 + 3 * SIZE], c12 + + FADD c1, t1, c1 + FMUL a5, c13, t1 + FADDX c2, t2, c2 + FMUL a5, c14, t2 + LDF [A1 + 4 * SIZE], a5 + FADD c3, t3, c3 + FMUL a6, c13, t3 + LDF [X1 + 4 * SIZE], c13 + FADD c4, t4, c4 + FMUL a6, c14, t4 + LDF [A1 + 5 * SIZE], a6 + LDF [X1 + 5 * SIZE], c14 + + FADD c1, t1, c1 + FMUL a7, c15, t1 + FADDX c2, t2, c2 + FMUL a7, c16, t2 + LDF [A1 + 6 * SIZE], a7 + + FADD c3, t3, c3 + FMUL a8, c15, t3 + LDF [X1 + 6 * SIZE], c15 + FADD c4, t4, c4 + add X1, 8 * SIZE, X1 + FMUL a8, c16, t4 + LDF [A1 + 7 * SIZE], a8 + add A1, 8 * SIZE, A1 + bg,pn %icc, .LL311 + LDF [X1 - 1 * SIZE], c16 + +.LL312: + FADD c1, t1, c1 + FMUL a1, c9, t1 + FADDX c2, t2, c2 + FMUL a1, c10, t2 + FADD c3, t3, c3 + FMUL a2, c9, t3 + FADD c4, t4, c4 + FMUL a2, c10, t4 + + FADD c1, t1, c1 + FMUL a3, c11, t1 + FADDX c2, t2, c2 + FMUL a3, c12, t2 + FADD c3, t3, c3 + FMUL a4, c11, t3 + FADD c4, t4, c4 + FMUL a4, c12, t4 + + FADD c1, t1, c1 + FMUL a5, c13, t1 + FADDX c2, t2, c2 + FMUL a5, c14, t2 + FADD c3, t3, c3 + FMUL a6, c13, t3 + FADD c4, t4, c4 + FMUL a6, c14, t4 + + FADD c1, t1, c1 + FMUL a7, c15, t1 + FADDX c2, t2, c2 + FMUL a7, c16, t2 + FADD c3, t3, c3 + FMUL a8, c15, t3 + FADD c4, t4, c4 + FMUL a8, c16, t4 + +.LL315: + andcc MIN_M, 3, I + LDF ALPHA_R, b3 + mov Y1, Y2 + ble %icc, .LL319 + LDF ALPHA_I, b4 + + LDF [A1 + 0 * SIZE], a1 + add I, -1, I + LDF [A1 + 1 * SIZE], a2 + add A1, 2 * SIZE, A1 + LDF [X1 + 0 * SIZE], b1 + cmp I, 0 + LDF [X1 + 1 * SIZE], b2 + ble %icc, .LL317 + add X1, 2 * SIZE, X1 + +.LL316: + FADD c1, t1, c1 + add I, -1, I + FMUL a1, b1, t1 + FADDX c2, t2, c2 + FMUL a1, b2, t2 + LDF [A1 + 0 * SIZE], a1 + FADD c3, t3, c3 + cmp I, 0 + FMUL a2, b1, t3 + LDF [X1 + 0 * SIZE], b1 + FADD c4, t4, c4 + add X1, 2 * SIZE, X1 + FMUL a2, b2, t4 + LDF [A1 + 1 * SIZE], a2 + add A1, 2 * SIZE, A1 + + bg,pn %icc, .LL316 + LDF [X1 - 1 * SIZE], b2 + +.LL317: + FADD c1, t1, c1 + FMUL a1, b1, t1 + FADDX c2, t2, c2 + FMUL a1, b2, t2 + FADD c3, t3, c3 + FMUL a2, b1, t3 + FADD c4, t4, c4 + FMUL a2, b2, t4 + +.LL319: + FADD c1, t1, c1 + LDF [Y1 + 0 * SIZE], a1 + FADDX c2, t2, c2 + LDF [Y1 + 1 * SIZE] ,a2 + add Y1, INCY, Y1 + FADD c3, t3, c3 + FADD c4, t4, c4 + +#if (!defined(XCONJ) && !defined(CONJ)) || (defined(XCONJ) && defined(CONJ)) + FSUB c1, c4, c1 +#else + FADD c1, c4, c1 +#endif + +#ifndef CONJ + FADD c2, c3, c2 +#else + FSUB c2, c3, c2 +#endif + + FMUL b3, c1, c3 + FMUL b4, c1, c4 + FMUL b4, c2, c1 + FMUL b3, c2, c2 + + FSUB c3, c1, c1 + FADD c2, c4, c2 + FADD a1, c1, a1 + FADD a2, c2, a2 + + STF a1, [Y2 + 0 * SIZE] + STF a2, [Y2 + 1 * SIZE] + +.LL400: + mov P, I + add IS, I, IS + cmp IS, M + bl %icc, .LL10 + add A, PNLDA, A + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/znrm2.S b/kernel/sparc/znrm2.S new file mode 100644 index 0000000..28e9e07 --- /dev/null +++ b/kernel/sparc/znrm2.S @@ -0,0 +1,665 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 +#define XX %i4 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define c3 %f4 +#define c4 %f6 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 +#define fmax %f32 +#define fzero %f34 +#define fone %f36 +#else +#define c1 %f0 +#define c2 %f1 +#define c3 %f2 +#define c4 %f3 +#define t1 %f4 +#define t2 %f5 +#define t3 %f6 +#define t4 %f7 + +#define a1 %f8 +#define a2 %f9 +#define a3 %f10 +#define a4 %f11 +#define a5 %f12 +#define a6 %f13 +#define a7 %f14 +#define a8 %f15 +#define fmax %f16 +#define fzero %f17 +#define fone %f18 +#endif + + PROLOGUE + SAVESP + +#ifdef DOUBLE + FCLR(3) +#else + FCLR(17) +#endif + + mov X, XX + mov 0x3ff, %g1 + sll %g1, 20, %g1 + + cmp N, 0 + ble .LL99 + FMOV fzero, c1 + + cmp INCX, 0 + ble .LL99 + sll INCX, ZBASE_SHIFT, INCX + + add %sp, -8, %sp + st %g1, [%sp + STACK_START + 0] + st %g0, [%sp + STACK_START + 4] + + LDF [%sp + STACK_START], fone + add %sp, 8, %sp + + FMOV fzero, c2 + FMOV fzero, c3 + FMOV fzero, c4 + + cmp INCX, 2 * SIZE + bne .LL100 + nop + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + + LDF [X + 4 * SIZE], a5 + add I, -1, I + LDF [X + 5 * SIZE], a6 + cmp I, 0 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + ble,pt %icc, .LL12 + add X, 8 * SIZE, X + +#define PREFETCHSIZE 40 + +.LL11: + FABS a1, t1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + FABS a2, t2 + LDF [X + 0 * SIZE], a1 + FABS a3, t3 + LDF [X + 1 * SIZE], a2 + FABS a4, t4 + LDF [X + 2 * SIZE], a3 + + FCMP %fcc0, t1, c1 + LDF [X + 3 * SIZE], a4 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + FMOVG %fcc1, t2, c2 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + + FABS a5, t1 + LDF [X + 4 * SIZE], a5 + FABS a6, t2 + LDF [X + 5 * SIZE], a6 + FABS a7, t3 + LDF [X + 6 * SIZE], a7 + FABS a8, t4 + LDF [X + 7 * SIZE], a8 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + add I, -1, I + FMOVG %fcc1, t2, c2 + cmp I, 0 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + + bg,pt %icc, .LL11 + add X, 8 * SIZE, X + +.LL12: + FABS a1, t1 + FABS a2, t2 + FABS a3, t3 + FABS a4, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + FMOVG %fcc1, t2, c2 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + + FABS a5, t1 + FABS a6, t2 + FABS a7, t3 + FABS a8, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + FMOVG %fcc1, t2, c2 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + +.LL15: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + FABS a1, t1 + FABS a2, t2 + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FMOVG %fcc0, t1, c1 + FMOVG %fcc1, t2, c2 + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL16 + add X, 2 * SIZE, X + +.LL19: + FCMP %fcc0, c2, c1 + FCMP %fcc1, c4, c3 + mov XX, X + FMOVG %fcc0, c2, c1 + FMOVG %fcc1, c4, c3 + FCMP %fcc0, c3, c1 + FMOVG %fcc0, c3, c1 + + FCMP c1, fzero + fbe .LL99 + nop + + FMOV c1, fmax + FDIV fone, c1, fone + + FMOV fzero, c1 + FMOV fzero, c2 + FMOV fzero, c3 + FMOV fzero, c4 + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL35 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + + LDF [X + 4 * SIZE], a5 + add I, -1, I + LDF [X + 5 * SIZE], a6 + cmp I, 0 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + ble,pt %icc, .LL32 + add X, 8 * SIZE, X + +.LL31: + FMUL fone, a1, t1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + FMUL fone, a2, t2 + LDF [X + 0 * SIZE], a1 + FMUL fone, a3, t3 + LDF [X + 1 * SIZE], a2 + FMUL fone, a4, t4 + LDF [X + 2 * SIZE], a3 + + FMUL t1, t1, t1 + LDF [X + 3 * SIZE], a4 + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + FMUL fone, a5, t1 + LDF [X + 4 * SIZE], a5 + FADD c2, t2, c2 + FMUL fone, a6, t2 + LDF [X + 5 * SIZE], a6 + FADD c3, t3, c3 + FMUL fone, a7, t3 + LDF [X + 6 * SIZE], a7 + FADD c4, t4, c4 + FMUL fone, a8, t4 + LDF [X + 7 * SIZE], a8 + + FMUL t1, t1, t1 + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + add I, -1, I + FADD c2, t2, c2 + cmp I, 0 + FADD c3, t3, c3 + FADD c4, t4, c4 + + bg,pt %icc, .LL31 + add X, 8 * SIZE, X + +.LL32: + FMUL fone, a1, t1 + FMUL fone, a2, t2 + FMUL fone, a3, t3 + FMUL fone, a4, t4 + + FMUL t1, t1, t1 + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + FMUL fone, a5, t1 + FADD c2, t2, c2 + FMUL fone, a6, t2 + FADD c3, t3, c3 + FMUL fone, a7, t3 + FADD c4, t4, c4 + FMUL fone, a8, t4 + + FMUL t1, t1, t1 + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c3, t3, c3 + FADD c4, t4, c4 + +.LL35: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL39 + nop + +.LL36: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + FMUL fone, a1, t1 + FMUL fone, a2, t2 + FMUL t1, t1, t1 + FMUL t2, t2, t2 + FADD c1, t1, c1 + FADD c2, t2, c2 + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL36 + add X, 2 * SIZE, X + +.LL39: + FADD c1, c2, c1 + FADD c3, c4, c3 + FADD c1, c3, c1 + + FSQRT c1, c1 + FMUL fmax, c1, c1 + +.LL99: + return %i7 + 8 + clr %g0 + +.LL100: + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL105 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + add I, -1, I + LDF [X + 0 * SIZE], a7 + cmp I, 0 + LDF [X + 1 * SIZE], a8 + ble,pt %icc, .LL102 + add X, INCX, X + +.LL101: + FABS a1, t1 + LDF [X + 0 * SIZE], a1 + FABS a2, t2 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + FABS a3, t3 + LDF [X + 0 * SIZE], a3 + FABS a4, t4 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + FMOVG %fcc1, t2, c2 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + + FABS a5, t1 + LDF [X + 0 * SIZE], a5 + FABS a6, t2 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + FABS a7, t3 + LDF [X + 0 * SIZE], a7 + FABS a8, t4 + LDF [X + 1 * SIZE], a8 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + add I, -1, I + FMOVG %fcc1, t2, c2 + cmp I, 0 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + + bg,pt %icc, .LL101 + add X, INCX, X + +.LL102: + FABS a1, t1 + FABS a2, t2 + FABS a3, t3 + FABS a4, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + FMOVG %fcc1, t2, c2 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + + FABS a5, t1 + FABS a6, t2 + FABS a7, t3 + FABS a8, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + FMOVG %fcc1, t2, c2 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + +.LL105: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL109 + nop + +.LL106: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + FABS a1, t1 + FABS a2, t2 + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FMOVG %fcc0, t1, c1 + FMOVG %fcc1, t2, c2 + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL106 + add X, INCX, X + +.LL109: + FCMP %fcc0, c2, c1 + FCMP %fcc1, c4, c3 + mov XX, X + FMOVG %fcc0, c2, c1 + FMOVG %fcc1, c4, c3 + FCMP %fcc0, c3, c1 + FMOVG %fcc0, c3, c1 + + FCMP c1, fzero + fbe .LL99 + nop + + FMOV c1, fmax + FDIV fone, c1, fone + + FMOV fzero, c1 + FMOV fzero, c2 + FMOV fzero, c3 + FMOV fzero, c4 + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL135 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + + LDF [X + 0 * SIZE], a5 + add I, -1, I + LDF [X + 1 * SIZE], a6 + add X, INCX, X + cmp I, 0 + LDF [X + 0 * SIZE], a7 + LDF [X + 1 * SIZE], a8 + + ble,pt %icc, .LL132 + add X, INCX, X + +.LL131: + FMUL fone, a1, t1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + FMUL fone, a2, t2 + LDF [X + 0 * SIZE], a1 + FMUL fone, a3, t3 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + FMUL fone, a4, t4 + LDF [X + 0 * SIZE], a3 + + FMUL t1, t1, t1 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + FMUL fone, a5, t1 + LDF [X + 0 * SIZE], a5 + FADD c2, t2, c2 + FMUL fone, a6, t2 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + FADD c3, t3, c3 + FMUL fone, a7, t3 + LDF [X + 0 * SIZE], a7 + FADD c4, t4, c4 + FMUL fone, a8, t4 + LDF [X + 1 * SIZE], a8 + + FMUL t1, t1, t1 + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + add I, -1, I + FADD c2, t2, c2 + cmp I, 0 + FADD c3, t3, c3 + FADD c4, t4, c4 + + bg,pt %icc, .LL131 + add X, INCX, X + +.LL132: + FMUL fone, a1, t1 + FMUL fone, a2, t2 + FMUL fone, a3, t3 + FMUL fone, a4, t4 + + FMUL t1, t1, t1 + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + FMUL fone, a5, t1 + FADD c2, t2, c2 + FMUL fone, a6, t2 + FADD c3, t3, c3 + FMUL fone, a7, t3 + FADD c4, t4, c4 + FMUL fone, a8, t4 + + FMUL t1, t1, t1 + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c3, t3, c3 + FADD c4, t4, c4 + +.LL135: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL139 + nop + +.LL136: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + FMUL fone, a1, t1 + FMUL fone, a2, t2 + FMUL t1, t1, t1 + FMUL t2, t2, t2 + FADD c1, t1, c1 + FADD c2, t2, c2 + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL136 + add X, INCX, X + +.LL139: + FADD c1, c2, c1 + FADD c3, c4, c3 + FADD c1, c3, c1 + + FSQRT c1, c1 + FMUL fmax, c1, c1 + + return %i7 + 8 + clr %g0 + + EPILOGUE diff --git a/kernel/sparc/zrot.S b/kernel/sparc/zrot.S new file mode 100644 index 0000000..ec274ca --- /dev/null +++ b/kernel/sparc/zrot.S @@ -0,0 +1,673 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define Y %i3 +#define INCY %i4 +#define I %i5 + +#define XX %l0 +#define YY %l1 + +#ifdef DOUBLE +#define a1 %f4 +#define a2 %f6 +#define a3 %f8 +#define a4 %f10 +#define a5 %f12 +#define a6 %f14 +#define a7 %f16 +#define a8 %f18 +#define b1 %f20 +#define b2 %f22 +#define b3 %f24 +#define b4 %f26 +#define b5 %f28 +#define b6 %f30 +#define b7 %f32 +#define b8 %f34 + +#define c1 %f36 +#define c2 %f38 +#define c3 %f40 +#define c4 %f42 +#define c5 %f44 +#define c6 %f46 +#define c7 %f48 +#define c8 %f50 + +#define t1 %f52 +#define t2 %f54 +#define t3 %f56 +#define t4 %f58 +#else +#define a1 %f2 +#define a2 %f3 +#define a3 %f4 +#define a4 %f5 +#define a5 %f6 +#define a6 %f7 +#define a7 %f8 +#define a8 %f9 +#define b1 %f10 +#define b2 %f11 +#define b3 %f12 +#define b4 %f13 +#define b5 %f14 +#define b6 %f15 +#define b7 %f16 +#define b8 %f17 + +#define c1 %f18 +#define c2 %f19 +#define c3 %f20 +#define c4 %f21 +#define c5 %f22 +#define c6 %f23 +#define c7 %f24 +#define c8 %f25 + +#define t1 %f26 +#define t2 %f27 +#define t3 %f28 +#define t4 %f29 +#endif + +#ifdef DOUBLE +#define C %f0 +#define S %f2 +#else +#define C %f0 +#define S %f1 +#endif + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE + st %i5, [%sp + STACK_START + 24] + + LDF [%sp + STACK_START + 24], C + LDF [%sp + STACK_START + 32], S +#else + st %i5, [%sp + STACK_START + 24] + + LDF [%sp + STACK_START + 24], C + LDF [%sp + STACK_START + 28], S +#endif +#else +#ifdef DOUBLE + FMOV %f10, C + FMOV %f12, S +#else + FMOV %f11, C + FMOV %f13, S +#endif +#endif + + cmp N, 0 + ble .LL19 + nop + + sll INCX, ZBASE_SHIFT, INCX + sll INCY, ZBASE_SHIFT, INCY + + cmp INCX, 2 * SIZE + bne .LL50 + nop + + cmp INCY, 2 * SIZE + bne .LL50 + nop + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + LDF [X + 1 * SIZE], a2 + LDF [Y + 1 * SIZE], b2 + LDF [X + 2 * SIZE], a3 + LDF [Y + 2 * SIZE], b3 + LDF [X + 3 * SIZE], a4 + LDF [Y + 3 * SIZE], b4 + + LDF [X + 4 * SIZE], a5 + LDF [Y + 4 * SIZE], b5 + LDF [X + 5 * SIZE], a6 + LDF [Y + 5 * SIZE], b6 + LDF [X + 6 * SIZE], a7 + LDF [Y + 6 * SIZE], b7 + LDF [X + 7 * SIZE], a8 + LDF [Y + 7 * SIZE], b8 + + FMUL C, a1, c1 + FMUL S, b1, c2 + FMUL C, b1, c3 + LDF [Y + 8 * SIZE], b1 + FMUL S, a1, c4 + LDF [X + 8 * SIZE], a1 + + FMUL C, a2, c5 + FMUL S, b2, c6 + FADD c1, c2, t1 + + FMUL C, b2, c7 + LDF [Y + 9 * SIZE], b2 + FMUL S, a2, c8 + LDF [X + 9 * SIZE], a2 + FSUB c3, c4, t2 + + addcc I, -1, I + ble,pt %icc, .LL12 + nop + +#define PREFETCHSIZE 64 + +.LL11: + FMUL C, a3, c1 + nop + prefetch [Y + PREFETCHSIZE * SIZE], 1 + nop + + FMUL S, b3, c2 + STF t1, [X + 0 * SIZE] + FADD c5, c6, t3 + nop + + FMUL C, b3, c3 + LDF [Y + 10 * SIZE], b3 + nop + nop + + FMUL S, a3, c4 + STF t2, [Y + 0 * SIZE] + FSUB c7, c8, t4 + nop + + FMUL C, a4, c5 + LDF [X + 10 * SIZE], a3 + nop + nop + + FMUL S, b4, c6 + STF t3, [X + 1 * SIZE] + FADD c1, c2, t1 + nop + + FMUL C, b4, c7 + LDF [Y + 11 * SIZE], b4 + nop + nop + + FMUL S, a4, c8 + STF t4, [Y + 1 * SIZE] + FSUB c3, c4, t2 + nop + + FMUL C, a5, c1 + LDF [X + 11 * SIZE], a4 + nop + nop + + FMUL S, b5, c2 + STF t1, [X + 2 * SIZE] + FADD c5, c6, t3 + nop + + FMUL C, b5, c3 + LDF [Y + 12 * SIZE], b5 + nop + nop + + FMUL S, a5, c4 + STF t2, [Y + 2 * SIZE] + FSUB c7, c8, t4 + nop + + FMUL C, a6, c5 + LDF [X + 12 * SIZE], a5 + nop + nop + + FMUL S, b6, c6 + STF t3, [X + 3 * SIZE] + FADD c1, c2, t1 + nop + + FMUL C, b6, c7 + LDF [Y + 13 * SIZE], b6 + nop + nop + + FMUL S, a6, c8 + STF t4, [Y + 3 * SIZE] + FSUB c3, c4, t2 + nop + + FMUL C, a7, c1 + LDF [X + 13 * SIZE], a6 + nop + nop + + FMUL S, b7, c2 + STF t1, [X + 4 * SIZE] + FADD c5, c6, t3 + nop + + FMUL C, b7, c3 + LDF [Y + 14 * SIZE], b7 + nop + nop + + FMUL S, a7, c4 + STF t2, [Y + 4 * SIZE] + FSUB c7, c8, t4 + nop + + FMUL C, a8, c5 + LDF [X + 14 * SIZE], a7 + nop + nop + + FMUL S, b8, c6 + STF t3, [X + 5 * SIZE] + FADD c1, c2, t1 + nop + + FMUL C, b8, c7 + LDF [Y + 15 * SIZE], b8 + nop + nop + + FMUL S, a8, c8 + STF t4, [Y + 5 * SIZE] + FSUB c3, c4, t2 + nop + + FMUL C, a1, c1 + LDF [X + 15 * SIZE], a8 + addcc I, -1, I + nop + + FMUL S, b1, c2 + STF t1, [X + 6 * SIZE] + FADD c5, c6, t3 + nop + + FMUL C, b1, c3 + LDF [Y + 16 * SIZE], b1 + nop + nop + + FMUL S, a1, c4 + STF t2, [Y + 6 * SIZE] + FSUB c7, c8, t4 + nop + + FMUL C, a2, c5 + LDF [X + 16 * SIZE], a1 + add Y, 8 * SIZE, Y + nop + + FMUL S, b2, c6 + STF t3, [X + 7 * SIZE] + FADD c1, c2, t1 + nop + + FMUL C, b2, c7 + LDF [Y + 9 * SIZE], b2 + add X, 8 * SIZE, X + nop + + FMUL S, a2, c8 + STF t4, [Y - 1 * SIZE] + FSUB c3, c4, t2 + nop + + bg,pt %icc, .LL11 + LDF [X + 9 * SIZE], a2 + + +.LL12: + FMUL C, a3, c1 + FMUL S, b3, c2 + STF t1, [X + 0 * SIZE] + FADD c5, c6, t3 + + FMUL C, b3, c3 + FMUL S, a3, c4 + STF t2, [Y + 0 * SIZE] + FSUB c7, c8, t4 + + + FMUL C, a4, c5 + FMUL S, b4, c6 + STF t3, [X + 1 * SIZE] + FADD c1, c2, t1 + + FMUL C, b4, c7 + FMUL S, a4, c8 + STF t4, [Y + 1 * SIZE] + FSUB c3, c4, t2 + + + FMUL C, a5, c1 + FMUL S, b5, c2 + STF t1, [X + 2 * SIZE] + FADD c5, c6, t3 + + FMUL C, b5, c3 + FMUL S, a5, c4 + STF t2, [Y + 2 * SIZE] + FSUB c7, c8, t4 + + FMUL C, a6, c5 + FMUL S, b6, c6 + STF t3, [X + 3 * SIZE] + FADD c1, c2, t1 + + FMUL C, b6, c7 + FMUL S, a6, c8 + STF t4, [Y + 3 * SIZE] + FSUB c3, c4, t2 + + FMUL C, a7, c1 + FMUL S, b7, c2 + STF t1, [X + 4 * SIZE] + FADD c5, c6, t3 + + FMUL C, b7, c3 + FMUL S, a7, c4 + STF t2, [Y + 4 * SIZE] + FSUB c7, c8, t4 + + FMUL C, a8, c5 + FMUL S, b8, c6 + STF t3, [X + 5 * SIZE] + FADD c1, c2, t1 + + FMUL C, b8, c7 + FMUL S, a8, c8 + STF t4, [Y + 5 * SIZE] + FSUB c3, c4, t2 + + FADD c5, c6, t3 + STF t1, [X + 6 * SIZE] + + FSUB c7, c8, t4 + STF t2, [Y + 6 * SIZE] + + STF t3, [X + 7 * SIZE] + STF t4, [Y + 7 * SIZE] + + add X, 8 * SIZE, X + add Y, 8 * SIZE, Y + + +.LL15: + andcc N, 3, I + nop + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + LDF [X + 1 * SIZE], a2 + LDF [Y + 1 * SIZE], b2 + + FMUL C, a1, c1 + add X, 2 * SIZE, X + FMUL S, b1, c2 + add Y, 2 * SIZE, Y + + FMUL C, b1, c3 + addcc I, -1, I + FMUL S, a1, c4 + nop + + FMUL C, a2, c5 + FMUL S, b2, c6 + FADD c1, c2, c2 + + FMUL C, b2, c7 + FMUL S, a2, c8 + FSUB c3, c4, c4 + + STF c2, [X - 2 * SIZE] + FADD c5, c6, c6 + STF c4, [Y - 2 * SIZE] + FSUB c7, c8, c8 + + STF c6, [X - 1 * SIZE] + bg,pt %icc, .LL16 + STF c8, [Y - 1 * SIZE] + +.LL19: + return %i7 + 8 + nop + +.LL50: + mov X, XX + mov Y, YY + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + +.LL51: + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + LDF [X + 1 * SIZE], a2 + LDF [Y + 1 * SIZE], b2 + + FMUL C, a1, c1 + FMUL S, b1, c2 + FMUL C, b1, c3 + FMUL S, a1, c4 + + FMUL C, a2, c5 + nop + FMUL S, b2, c6 + FADD c1, c2, c2 + + FMUL C, b2, c7 + nop + FMUL S, a2, c8 + FSUB c3, c4, c4 + + STF c2, [X + 0 * SIZE] + FADD c5, c6, c6 + STF c4, [Y + 0 * SIZE] + FSUB c7, c8, c8 + + STF c6, [X + 1 * SIZE] + add X, INCX, X + STF c8, [Y + 1 * SIZE] + add Y, INCY, Y + + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + LDF [X + 1 * SIZE], a2 + LDF [Y + 1 * SIZE], b2 + + FMUL C, a1, c1 + FMUL S, b1, c2 + FMUL C, b1, c3 + FMUL S, a1, c4 + + FMUL C, a2, c5 + nop + FMUL S, b2, c6 + FADD c1, c2, c2 + + FMUL C, b2, c7 + nop + FMUL S, a2, c8 + FSUB c3, c4, c4 + + STF c2, [X + 0 * SIZE] + FADD c5, c6, c6 + STF c4, [Y + 0 * SIZE] + FSUB c7, c8, c8 + + STF c6, [X + 1 * SIZE] + add X, INCX, X + STF c8, [Y + 1 * SIZE] + add Y, INCY, Y + + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + LDF [X + 1 * SIZE], a2 + LDF [Y + 1 * SIZE], b2 + + FMUL C, a1, c1 + FMUL S, b1, c2 + FMUL C, b1, c3 + FMUL S, a1, c4 + + FMUL C, a2, c5 + nop + FMUL S, b2, c6 + FADD c1, c2, c2 + + FMUL C, b2, c7 + nop + FMUL S, a2, c8 + FSUB c3, c4, c4 + + STF c2, [X + 0 * SIZE] + FADD c5, c6, c6 + STF c4, [Y + 0 * SIZE] + FSUB c7, c8, c8 + + STF c6, [X + 1 * SIZE] + add X, INCX, X + STF c8, [Y + 1 * SIZE] + add Y, INCY, Y + + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + LDF [X + 1 * SIZE], a2 + LDF [Y + 1 * SIZE], b2 + + FMUL C, a1, c1 + FMUL S, b1, c2 + FMUL C, b1, c3 + FMUL S, a1, c4 + + FMUL C, a2, c5 + nop + FMUL S, b2, c6 + FADD c1, c2, c2 + + FMUL C, b2, c7 + nop + FMUL S, a2, c8 + FSUB c3, c4, c4 + + STF c2, [X + 0 * SIZE] + FADD c5, c6, c6 + STF c4, [Y + 0 * SIZE] + FSUB c7, c8, c8 + + STF c6, [X + 1 * SIZE] + add X, INCX, X + STF c8, [Y + 1 * SIZE] + add Y, INCY, Y + + addcc I, -1, I + bg,pt %icc, .LL51 + nop + + +.LL55: + andcc N, 3, I + nop + ble %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + LDF [X + 1 * SIZE], a2 + LDF [Y + 1 * SIZE], b2 + + FMUL C, a1, c1 + FMUL S, b1, c2 + FMUL C, b1, c3 + FMUL S, a1, c4 + + FMUL C, a2, c5 + addcc I, -1, I + FMUL S, b2, c6 + FADD c1, c2, c2 + + FMUL C, b2, c7 + nop + FMUL S, a2, c8 + FSUB c3, c4, c4 + + STF c2, [X + 0 * SIZE] + FADD c5, c6, c6 + STF c4, [Y + 0 * SIZE] + FSUB c7, c8, c8 + + STF c6, [X + 1 * SIZE] + add X, INCX, X + STF c8, [Y + 1 * SIZE] + + bg %icc, .LL56 + add Y, INCY, Y + + +.LL59: + return %i7 + 8 + nop + + EPILOGUE diff --git a/kernel/sparc/zscal.S b/kernel/sparc/zscal.S new file mode 100644 index 0000000..5c6ade3 --- /dev/null +++ b/kernel/sparc/zscal.S @@ -0,0 +1,518 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 + +#if defined(DOUBLE) && !defined(__64BIT__) +#define X %i3 +#define INCX %i4 +#else +#define X %i5 +#define INCX %i3 +#endif + +#define I %i1 +#define XX %i2 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define c3 %f4 +#define c4 %f6 +#define c5 %f8 +#define c6 %f10 +#define c7 %f12 +#define c8 %f14 + +#define t1 %f16 +#define t2 %f18 +#define t3 %f20 +#define t4 %f22 +#define t5 %f24 +#define t6 %f26 +#define t7 %f28 +#define t8 %f30 + +#define c9 %f32 +#define c10 %f34 +#define c11 %f36 +#define c12 %f38 +#define c13 %f40 +#define c14 %f42 +#define c15 %f44 +#define c16 %f46 + +#define s1 %f32 +#define s2 %f34 +#define s3 %f36 +#define s4 %f38 +#define s5 %f40 +#define s6 %f42 +#define s7 %f44 +#define s8 %f46 + +#define FZERO %f48 +#define ALPHA_R %f50 +#define ALPHA_I %f52 +#else +#define c1 %f0 +#define c2 %f1 +#define c3 %f2 +#define c4 %f3 +#define c5 %f4 +#define c6 %f5 +#define c7 %f6 +#define c8 %f7 + +#define c9 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define s1 %f8 +#define s2 %f9 +#define s3 %f10 +#define s4 %f11 +#define s5 %f12 +#define s6 %f13 +#define s7 %f14 +#define s8 %f15 + +#define t1 %f16 +#define t2 %f17 +#define t3 %f18 +#define t4 %f19 +#define t5 %f20 +#define t6 %f21 +#define t7 %f22 +#define t8 %f23 + +#define FZERO %f24 +#define ALPHA_R %f25 +#define ALPHA_I %f26 +#endif + +#define PREFETCHSIZE 128 + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE + st %i3, [%sp + STACK_START + 16] + st %i4, [%sp + STACK_START + 20] + st %i5, [%sp + STACK_START + 24] + + ld [%sp+ STACK_START + 32], X + ld [%sp+ STACK_START + 36], INCX +#else + st %i3, [%sp + STACK_START + 16] + st %i4, [%sp + STACK_START + 24] + ld [%sp+ STACK_START + 28], INCX +#endif + LDF [%sp + STACK_START + 16], ALPHA_R + LDF [%sp + STACK_START + 24], ALPHA_I +#else + ldx [%sp + STACK_START + 56], INCX +#ifdef DOUBLE + FMOV %f6, ALPHA_R + FMOV %f8, ALPHA_I +#else + FMOV %f7, ALPHA_R + FMOV %f9, ALPHA_I +#endif +#endif + +#ifdef DOUBLE + FCLR(17) +#else + FCLR(24) +#endif + + FCMP ALPHA_R, FZERO + fbne .LL100 + sll INCX, ZBASE_SHIFT, INCX + + FCMP ALPHA_I, FZERO + fbne .LL100 + nop + cmp INCX, 2 * SIZE + bne .LL50 + nop + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + +.LL11: + prefetch [X + PREFETCHSIZE * SIZE], 0 + + STF FZERO, [X + 0 * SIZE] + add I, -1, I + STF FZERO, [X + 1 * SIZE] + cmp I, 0 + STF FZERO, [X + 2 * SIZE] + STF FZERO, [X + 3 * SIZE] + STF FZERO, [X + 4 * SIZE] + STF FZERO, [X + 5 * SIZE] + add X, 8 * SIZE, X + STF FZERO, [X - 2 * SIZE] + bg,pt %icc, .LL11 + STF FZERO, [X - 1 * SIZE] + +.LL15: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + STF FZERO, [X + 0 * SIZE] + STF FZERO, [X + 1 * SIZE] + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL16 + add X, 2 * SIZE, X + +.LL19: + return %i7 + 8 + clr %o0 + +.LL50: + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + +.LL51: + STF FZERO, [X + 0 * SIZE] + add I, -1, I + STF FZERO, [X + 1 * SIZE] + add X, INCX, X + STF FZERO, [X + 0 * SIZE] + cmp I, 0 + STF FZERO, [X + 1 * SIZE] + add X, INCX, X + STF FZERO, [X + 0 * SIZE] + STF FZERO, [X + 1 * SIZE] + add X, INCX, X + STF FZERO, [X + 0 * SIZE] + STF FZERO, [X + 1 * SIZE] + bg,pt %icc, .LL51 + add X, INCX, X + +.LL55: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + STF FZERO, [X + 0 * SIZE] + add I, -1, I + STF FZERO, [X + 1 * SIZE] + cmp I, 0 + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + return %i7 + 8 + clr %o0 + +.LL100: + cmp INCX, 2 * SIZE + bne .LL150 + sra N, 2, I + + cmp I, 0 + ble,pn %icc, .LL115 + nop + + LDF [X + 0 * SIZE], c1 + LDF [X + 1 * SIZE], c2 + LDF [X + 2 * SIZE], c3 + LDF [X + 3 * SIZE], c4 + LDF [X + 4 * SIZE], c5 + LDF [X + 5 * SIZE], c6 + LDF [X + 6 * SIZE], c7 + LDF [X + 7 * SIZE], c8 + + FMUL ALPHA_R, c1, t1 + FMUL ALPHA_I, c2, t3 + + FMUL ALPHA_I, c1, t2 + LDF [X + 8 * SIZE], c1 + FMUL ALPHA_R, c2, t4 + LDF [X + 9 * SIZE], c2 + + FMUL ALPHA_R, c3, t5 + deccc I + FMUL ALPHA_I, c4, t7 + FSUB t1, t3, s1 + + FMUL ALPHA_I, c3, t6 + LDF [X + 10 * SIZE], c3 + FMUL ALPHA_R, c4, t8 + LDF [X + 11 * SIZE], c4 + FADD t4, t2, s2 + + ble,pn %icc, .LL112 + nop + +.LL111: + prefetch [X + PREFETCHSIZE * SIZE], 0 + + FMUL ALPHA_R, c5, t1 + FMUL ALPHA_I, c6, t3 + FSUB t5, t7, s3 + STF s1, [X + 0 * SIZE] + + FMUL ALPHA_I, c5, t2 + LDF [X + 12 * SIZE], c5 + FMUL ALPHA_R, c6, t4 + LDF [X + 13 * SIZE], c6 + + FADD t8, t6, s4 + STF s2, [X + 1 * SIZE] + + FMUL ALPHA_R, c7, t5 + FMUL ALPHA_I, c8, t7 + FSUB t1, t3, s5 + STF s3, [X + 2 * SIZE] + + FMUL ALPHA_I, c7, t6 + LDF [X + 14 * SIZE], c7 + FMUL ALPHA_R, c8, t8 + LDF [X + 15 * SIZE], c8 + + FADD t4, t2, s6 + STF s4, [X + 3 * SIZE] + + FMUL ALPHA_R, c1, t1 + FMUL ALPHA_I, c2, t3 + FSUB t5, t7, s7 + STF s5, [X + 4 * SIZE] + + FMUL ALPHA_I, c1, t2 + LDF [X + 16 * SIZE], c1 + FMUL ALPHA_R, c2, t4 + LDF [X + 17 * SIZE], c2 + + FADD t8, t6, s8 + STF s6, [X + 5 * SIZE] + + FMUL ALPHA_R, c3, t5 + deccc I + FMUL ALPHA_I, c4, t7 + FSUB t1, t3, s1 + STF s7, [X + 6 * SIZE] + + FMUL ALPHA_I, c3, t6 + LDF [X + 18 * SIZE], c3 + FMUL ALPHA_R, c4, t8 + LDF [X + 19 * SIZE], c4 + + FADD t4, t2, s2 + STF s8, [X + 7 * SIZE] + + bg,pt %icc, .LL111 + add X, 8 * SIZE, X + + +.LL112: + FMUL ALPHA_R, c5, t1 + FMUL ALPHA_I, c6, t3 + FSUB t5, t7, s3 + STF s1, [X + 0 * SIZE] + + FMUL ALPHA_I, c5, t2 + FMUL ALPHA_R, c6, t4 + FADD t8, t6, s4 + STF s2, [X + 1 * SIZE] + + FMUL ALPHA_R, c7, t5 + FMUL ALPHA_I, c8, t7 + FSUB t1, t3, s5 + STF s3, [X + 2 * SIZE] + + FMUL ALPHA_I, c7, t6 + FMUL ALPHA_R, c8, t8 + FADD t4, t2, s6 + STF s4, [X + 3 * SIZE] + + FSUB t5, t7, s7 + FADD t8, t6, s8 + + STF s5, [X + 4 * SIZE] + STF s6, [X + 5 * SIZE] + STF s7, [X + 6 * SIZE] + STF s8, [X + 7 * SIZE] + add X, 8 * SIZE, X + +.LL115: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL119 + nop + +.LL116: + LDF [X + 0 * SIZE], c1 + LDF [X + 1 * SIZE], c2 + + FMUL ALPHA_R, c1, c3 + FMUL ALPHA_I, c1, c4 + FMUL ALPHA_I, c2, c1 + FMUL ALPHA_R, c2, c2 + + FSUB c3, c1, c1 + FADD c2, c4, c2 + + STF c1, [X + 0 * SIZE] + STF c2, [X + 1 * SIZE] + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL116 + add X, 2 * SIZE, X + +.LL119: + return %i7 + 8 + clr %o0 + +.LL150: + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL155 + mov X, XX + +.LL151: + LDF [X + 0 * SIZE], c1 + LDF [X + 1 * SIZE], c2 + add X, INCX, X + LDF [X + 0 * SIZE], c3 + FMUL ALPHA_R, c1, c9 + LDF [X + 1 * SIZE], c4 + FMUL ALPHA_I, c1, c10 + add X, INCX, X + LDF [X + 0 * SIZE], c5 + FMUL ALPHA_I, c2, c1 + LDF [X + 1 * SIZE], c6 + FMUL ALPHA_R, c2, c2 + add X, INCX, X + LDF [X + 0 * SIZE], c7 + FMUL ALPHA_R, c3, c11 + LDF [X + 1 * SIZE], c8 + FMUL ALPHA_I, c3, c12 + add X, INCX, X + + FMUL ALPHA_I, c4, c3 + FMUL ALPHA_R, c4, c4 + + FMUL ALPHA_R, c5, c13 + FMUL ALPHA_I, c5, c14 + FMUL ALPHA_I, c6, c5 + FMUL ALPHA_R, c6, c6 + + FMUL ALPHA_R, c7, c15 + FSUB c9, c1, c1 + FMUL ALPHA_I, c7, c16 + FADD c2, c10, c2 + FMUL ALPHA_I, c8, c7 + FSUB c11, c3, c3 + FMUL ALPHA_R, c8, c8 + FADD c4, c12, c4 + + STF c1, [XX + 0 * SIZE] + FSUB c13, c5, c5 + add I, -1, I + STF c2, [XX + 1 * SIZE] + FADD c6, c14, c6 + add XX, INCX, XX + STF c3, [XX + 0 * SIZE] + FSUB c15, c7, c7 + cmp I, 0 + STF c4, [XX + 1 * SIZE] + FADD c8, c16, c8 + add XX, INCX, XX + STF c5, [XX + 0 * SIZE] + STF c6, [XX + 1 * SIZE] + add XX, INCX, XX + STF c7, [XX + 0 * SIZE] + STF c8, [XX + 1 * SIZE] + bg,pt %icc, .LL151 + add XX, INCX, XX + +.LL155: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL159 + nop + +.LL156: + LDF [X + 0 * SIZE], c1 + LDF [X + 1 * SIZE], c2 + + FMUL ALPHA_R, c1, c3 + FMUL ALPHA_I, c1, c4 + FMUL ALPHA_I, c2, c1 + FMUL ALPHA_R, c2, c2 + + FSUB c3, c1, c1 + FADD c2, c4, c2 + + STF c1, [X + 0 * SIZE] + STF c2, [X + 1 * SIZE] + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL156 + add X, INCX, X + +.LL159: + return %i7 + 8 + clr %o0 + + + EPILOGUE diff --git a/kernel/sparc/zswap.S b/kernel/sparc/zswap.S new file mode 100644 index 0000000..88ed221 --- /dev/null +++ b/kernel/sparc/zswap.S @@ -0,0 +1,342 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(DOUBLE) && !defined(__64BIT__) +#define N %i0 +#define X %i1 +#define INCX %i2 +#define Y %i3 +#define INCY %i4 +#define I %i5 +#else +#define N %i0 +#define X %i5 +#define INCX %i1 +#define Y %i2 +#define INCY %i3 +#define I %i4 +#endif + +#define XX %l0 +#define YY %l1 + +#ifdef DOUBLE +#define a1 %f0 +#define a2 %f2 +#define a3 %f4 +#define a4 %f6 +#define a5 %f8 +#define a6 %f10 +#define a7 %f12 +#define a8 %f14 +#define b1 %f16 +#define b2 %f18 +#define b3 %f20 +#define b4 %f22 +#define b5 %f24 +#define b6 %f26 +#define b7 %f28 +#define b8 %f30 +#else +#define a1 %f0 +#define a2 %f1 +#define a3 %f2 +#define a4 %f3 +#define a5 %f4 +#define a6 %f5 +#define a7 %f6 +#define a8 %f7 +#define b1 %f8 +#define b2 %f9 +#define b3 %f10 +#define b4 %f11 +#define b5 %f12 +#define b6 %f13 +#define b7 %f14 +#define b8 %f15 +#endif + +#ifdef DOUBLE +#define PREFETCHSIZE 128 +#else +#define PREFETCHSIZE 256 +#endif + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE + ld [%sp + STACK_START + 32], X + ld [%sp + STACK_START + 36], INCX + ld [%sp + STACK_START + 40], Y + ld [%sp + STACK_START + 44], INCY +#else + ld [%sp + STACK_START + 28], INCX + ld [%sp + STACK_START + 32], Y + ld [%sp + STACK_START + 36], INCY +#endif +#else + ldx [%sp + STACK_START + 56], INCX + ldx [%sp + STACK_START + 64], Y + ldx [%sp + STACK_START + 72], INCY +#endif + + sll INCX, ZBASE_SHIFT, INCX + sll INCY, ZBASE_SHIFT, INCY + + cmp INCX, 2 * SIZE + bne .LL50 + nop + + cmp INCY, 2 * SIZE + bne .LL50 + nop + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + LDF [X + 1 * SIZE], a2 + LDF [Y + 1 * SIZE], b2 + LDF [X + 2 * SIZE], a3 + LDF [Y + 2 * SIZE], b3 + LDF [X + 3 * SIZE], a4 + LDF [Y + 3 * SIZE], b4 + LDF [X + 4 * SIZE], a5 + LDF [Y + 4 * SIZE], b5 + LDF [X + 5 * SIZE], a6 + LDF [Y + 5 * SIZE], b6 + LDF [X + 6 * SIZE], a7 + LDF [Y + 6 * SIZE], b7 + LDF [X + 7 * SIZE], a8 + LDF [Y + 7 * SIZE], b8 + + deccc I + ble,pn %icc, .LL12 + nop + +.LL11: + prefetch [X + PREFETCHSIZE * SIZE], 0 + deccc I + + STF a1, [Y + 0 * SIZE] + LDF [X + 8 * SIZE], a1 + STF b1, [X + 0 * SIZE] + LDF [Y + 8 * SIZE], b1 + + STF a2, [Y + 1 * SIZE] + LDF [X + 9 * SIZE], a2 + STF b2, [X + 1 * SIZE] + LDF [Y + 9 * SIZE], b2 + + STF a3, [Y + 2 * SIZE] + LDF [X + 10 * SIZE], a3 + STF b3, [X + 2 * SIZE] + LDF [Y + 10 * SIZE], b3 + + STF a4, [Y + 3 * SIZE] + LDF [X + 11 * SIZE], a4 + STF b4, [X + 3 * SIZE] + LDF [Y + 11 * SIZE], b4 + + prefetch [Y + PREFETCHSIZE * SIZE], 0 + add X, 8 * SIZE, X + + STF a5, [Y + 4 * SIZE] + LDF [X + 4 * SIZE], a5 + STF b5, [X - 4 * SIZE] + LDF [Y + 12 * SIZE], b5 + + STF a6, [Y + 5 * SIZE] + LDF [X + 5 * SIZE], a6 + STF b6, [X - 3 * SIZE] + LDF [Y + 13 * SIZE], b6 + + STF a7, [Y + 6 * SIZE] + LDF [X + 6 * SIZE], a7 + STF b7, [X - 2 * SIZE] + LDF [Y + 14 * SIZE], b7 + + STF a8, [Y + 7 * SIZE] + LDF [X + 7 * SIZE], a8 + STF b8, [X - 1 * SIZE] + LDF [Y + 15 * SIZE], b8 + + bg,pt %icc, .LL11 + add Y, 8 * SIZE, Y + +.LL12: + STF a1, [Y + 0 * SIZE] + STF b1, [X + 0 * SIZE] + STF a2, [Y + 1 * SIZE] + STF b2, [X + 1 * SIZE] + STF a3, [Y + 2 * SIZE] + STF b3, [X + 2 * SIZE] + STF a4, [Y + 3 * SIZE] + STF b4, [X + 3 * SIZE] + STF a5, [Y + 4 * SIZE] + STF b5, [X + 4 * SIZE] + STF a6, [Y + 5 * SIZE] + STF b6, [X + 5 * SIZE] + STF a7, [Y + 6 * SIZE] + STF b7, [X + 6 * SIZE] + STF a8, [Y + 7 * SIZE] + STF b8, [X + 7 * SIZE] + add X, 8 * SIZE, X + add Y, 8 * SIZE, Y + +.LL15: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + add I, -1, I + LDF [X + 1 * SIZE], a2 + LDF [Y + 0 * SIZE], b1 + LDF [Y + 1 * SIZE], b2 + cmp I, 0 + STF a1, [Y + 0 * SIZE] + STF a2, [Y + 1 * SIZE] + add Y, 2 * SIZE, Y + STF b1, [X + 0 * SIZE] + STF b2, [X + 1 * SIZE] + bg,pt %icc, .LL16 + add X, 2 * SIZE, X + +.LL19: + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 2, I + mov X, XX + cmp I, 0 + ble,pn %icc, .LL55 + mov Y, YY + +.LL51: + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + LDF [Y + 1 * SIZE], b2 + add Y, INCY, Y + LDF [X + 0 * SIZE], a3 + LDF [Y + 0 * SIZE], b3 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + LDF [Y + 1 * SIZE], b4 + add Y, INCY, Y + LDF [X + 0 * SIZE], a5 + LDF [Y + 0 * SIZE], b5 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + LDF [Y + 1 * SIZE], b6 + add Y, INCY, Y + LDF [X + 0 * SIZE], a7 + LDF [Y + 0 * SIZE], b7 + LDF [X + 1 * SIZE], a8 + add X, INCX, X + LDF [Y + 1 * SIZE], b8 + add Y, INCY, Y + + STF a1, [YY + 0 * SIZE] + add I, -1, I + STF b1, [XX + 0 * SIZE] + cmp I, 0 + STF a2, [YY + 1 * SIZE] + add YY, INCY, YY + STF b2, [XX + 1 * SIZE] + add XX, INCX, XX + STF a3, [YY + 0 * SIZE] + STF b3, [XX + 0 * SIZE] + STF a4, [YY + 1 * SIZE] + add YY, INCY, YY + STF b4, [XX + 1 * SIZE] + add XX, INCX, XX + STF a5, [YY + 0 * SIZE] + STF b5, [XX + 0 * SIZE] + STF a6, [YY + 1 * SIZE] + add YY, INCY, YY + STF b6, [XX + 1 * SIZE] + add XX, INCX, XX + STF a7, [YY + 0 * SIZE] + STF b7, [XX + 0 * SIZE] + STF a8, [YY + 1 * SIZE] + add YY, INCY, YY + STF b8, [XX + 1 * SIZE] + + bg,pt %icc, .LL51 + add XX, INCX, XX + +.LL55: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + add I, -1, I + LDF [X + 1 * SIZE], a2 + LDF [Y + 0 * SIZE], b1 + cmp I, 0 + LDF [Y + 1 * SIZE], b2 + STF b1, [X + 0 * SIZE] + STF b2, [X + 1 * SIZE] + add X, INCX, X + STF a1, [Y + 0 * SIZE] + STF a2, [Y + 1 * SIZE] + bg,pt %icc, .LL56 + add Y, INCY, Y + +.LL59: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/ztrsm_kernel_LN.S b/kernel/sparc/ztrsm_kernel_LN.S new file mode 100644 index 0000000..131284e --- /dev/null +++ b/kernel/sparc/ztrsm_kernel_LN.S @@ -0,0 +1,2395 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 +#define K %i2 +#define A %i5 +#define B %i3 +#define C %i4 + +#define LDC %o0 +#define AO %o1 +#define BO %o2 +#define I %o3 +#define J %o4 +#define L %o5 + +#define C1 %l0 +#define C2 %l1 + +#define OFFSET %l2 +#define KK %l3 +#define TEMP1 %l4 +#define TEMP2 %l5 +#define AORIG %l6 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define t1 %f32 +#define t2 %f34 +#define t3 %f36 +#define t4 %f38 + +#define a1 %f40 +#define a2 %f42 +#define a3 %f44 +#define a4 %f46 +#define a5 %f62 + +#define b1 %f48 +#define b2 %f50 +#define b3 %f52 +#define b4 %f54 +#define b5 %f56 + +#define FZERO %f58 + +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define t1 %f16 +#define t2 %f17 +#define t3 %f18 +#define t4 %f19 + +#define a1 %f20 +#define a2 %f21 +#define a3 %f22 +#define a4 %f23 +#define a5 %f31 + +#define b1 %f24 +#define b2 %f25 +#define b3 %f26 +#define b4 %f27 +#define b5 %f28 + +#define FZERO %f29 +#endif + +#define t5 c13 +#define t6 c14 +#define t7 c15 +#define t8 c16 + +#ifndef CONJ +#define FADD1 FADD +#define FADD2 FADD +#define FADD3 FADD +#define FADD4 FSUB +#else + +#if defined(LN) || defined(LT) +#define FADD1 FADD +#define FADD2 FSUB +#define FADD3 FADD +#define FADD4 FADD +#endif + +#if defined(RN) || defined(RT) +#define FADD1 FADD +#define FADD2 FADD +#define FADD3 FSUB +#define FADD4 FADD +#endif +#endif + +#define APREFETCHSIZE 40 +#define BPREFETCHSIZE 40 + +#define APREFETCH_CATEGORY 0 +#define BPREFETCH_CATEGORY 0 + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE + ld [%sp + STACK_START + 32], A + ld [%sp + STACK_START + 36], B + ld [%sp + STACK_START + 40], C + ld [%sp + STACK_START + 44], LDC + ld [%sp + STACK_START + 48], OFFSET +#else + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC + ld [%sp + STACK_START + 40], OFFSET +#endif +#else + ldx [%sp+ STACK_START + 56], B + ldx [%sp+ STACK_START + 64], C + ldx [%sp+ STACK_START + 72], LDC + ldx [%sp+ STACK_START + 80], OFFSET +#endif + +#ifdef DOUBLE + FCLR(27) +#else + FCLR(29) +#endif + + sll LDC, ZBASE_SHIFT, LDC + +#ifdef LN + smul M, K, TEMP1 + sll TEMP1, ZBASE_SHIFT, TEMP1 + add A, TEMP1, A + + sll M, ZBASE_SHIFT, TEMP1 + add C, TEMP1, C +#endif + +#ifdef RN + neg OFFSET, KK +#endif + +#ifdef RT + smul N, K, TEMP1 + sll TEMP1, ZBASE_SHIFT, TEMP1 + add B, TEMP1, B + + smul N, LDC, TEMP1 + add C, TEMP1, C + + sub N, OFFSET, KK +#endif + + sra N, 1, J + cmp J, 0 + ble,pn %icc, .LL100 + nop + +.LL11: +#ifdef RT + sll K, 1 + ZBASE_SHIFT, TEMP1 + sub B, TEMP1, B + + add LDC, LDC, TEMP1 + sub C, TEMP1, C +#endif + + mov C, C1 + add C, LDC, C2 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + +#ifndef RT + add C2, LDC, C +#endif + + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL50 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 0 + ZBASE_SHIFT, TEMP1 + sll KK, 1 + ZBASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + + sra TEMP1, 2, L + cmp L, 0 +#endif + + FMOV FZERO, c02 + FMOV FZERO, t1 + FMOV FZERO, c04 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t2 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c06 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t3 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c08 + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t4 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c01 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c03 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c05 + + ble,pn %icc, .LL55 + FMOV FZERO, c07 + +.LL52: + FADD2 c02, t1, c02 + add AO, 8 * SIZE, AO + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FMUL a1, b1, t1 + add BO, 16 * SIZE, BO + + FADD4 c04, t2, c04 + add L, -1, L + FMUL a1, b2, t2 + + FADD2 c06, t3, c06 + cmp L, 0 + FMUL a1, b3, t3 + + FADD4 c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO - 4 * SIZE], a1 + + FADD1 c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 12 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 10 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD2 c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + FADD4 c04, t2, c04 + FMUL a3, b2, t2 + + FADD2 c06, t3, c06 + FMUL a3, b3, t3 + FADD4 c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD1 c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD2 c02, t1, c02 + FMUL a1, b1, t1 + LDF [AO - 1 * SIZE], a4 + FADD4 c04, t2, c04 + FMUL a1, b2, t2 + + FADD2 c06, t3, c06 + FMUL a1, b3, t3 + FADD4 c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD1 c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 4 * SIZE], b1 + + FADD3 c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 2 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD2 c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO + 1 * SIZE], a2 + FADD4 c04, t2, c04 + FMUL a3, b2, t2 + + FADD2 c06, t3, c06 + FMUL a3, b3, t3 + FADD4 c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD1 c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL52 + LDF [AO + 3 * SIZE], a4 + +.LL55: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + FADD2 c02, t1, c02 + add AO, 2 * SIZE, AO + FMUL a1, b1, t1 + add L, -1, L + + add BO, 4 * SIZE, BO + FADD4 c04, t2, c04 + cmp L, 0 + FMUL a1, b2, t2 + + FADD2 c06, t3, c06 + FMUL a1, b3, t3 + FADD4 c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD1 c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL56 + LDF [AO + 1 * SIZE], a2 + +.LL59: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, 0 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + + FADD2 c02, t1, c02 + FADD4 c04, t2, c04 + FADD2 c06, t3, c06 + FADD4 c08, t4, c08 + + FADD c01, c04, c01 + FADD c02, c03, c02 + FADD c05, c08, c05 + FADD c06, c07, c06 + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c05, c05 + FSUB a4, c06, c06 + +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c05, c05 + FSUB a4, c06, c06 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c05, t5 + FMUL a2, c06, t6 + FMUL a1, c06, t7 + FMUL a2, c05, t8 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + FADD4 t5, t6, c05 + FADD2 t7, t8, c06 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c05, t5 + FMUL a2, c06, t6 + FMUL a1, c06, t7 + FMUL a2, c05, t8 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + FADD4 t5, t6, c05 + FADD2 t7, t8, c06 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + LDF [BO + 6 * SIZE], b1 + LDF [BO + 7 * SIZE], b2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a4, c02, t3 + FMUL a4, c01, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FADD3 c05, t3, c05 + FADD4 c06, t4, c06 + + FMUL b1, c05, t1 + FMUL b2, c06, t2 + FMUL b1, c06, t3 + FMUL b2, c05, t4 + + FADD4 t1, t2, c05 + FADD3 t3, t4, c06 +#endif + +#ifdef RT + LDF [BO + 6 * SIZE], a1 + LDF [BO + 7 * SIZE], a2 + LDF [BO + 4 * SIZE], a3 + LDF [BO + 5 * SIZE], a4 + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + + FMUL a1, c05, t1 + FMUL a2, c06, t2 + FMUL a1, c06, t3 + FMUL a2, c05, t4 + + FADD4 t1, t2, c05 + FADD3 t3, t4, c06 + + FMUL a3, c05, t1 + FMUL a3, c06, t2 + FMUL a4, c06, t3 + FMUL a4, c05, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FADD3 c01, t3, c01 + FADD4 c02, t4, c02 + + FMUL b1, c01, t1 + FMUL b2, c02, t2 + FMUL b1, c02, t3 + FMUL b2, c01, t4 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c06, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c05, [AO + 2 * SIZE] + STF c06, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 +#endif + +#ifdef RT + sll K, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + +.LL50: + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL99 + nop + +.LL21: +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + ZBASE_SHIFT, TEMP1 + + add AORIG, TEMP1, AO + add B, TEMP1, BO + + sub K, KK, TEMP1 + + sra TEMP1, 2, L + cmp L, 0 +#endif + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + + FMOV FZERO, c01 + FMOV FZERO, c02 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c03 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c04 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c05 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c06 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c07 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c09 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c10 + + LDF [BO + 4 * SIZE], b5 + FMOV FZERO, c11 + LDF [AO + 4 * SIZE], a5 + FMOV FZERO, c12 + +#ifdef LN + prefetch [C1 - 3 * SIZE], 3 + FMOV FZERO, c13 + prefetch [C2 - 3 * SIZE], 3 + FMOV FZERO, c14 +#else + prefetch [C1 + 3 * SIZE], 3 + FMOV FZERO, c13 + prefetch [C2 + 3 * SIZE], 3 + FMOV FZERO, c14 +#endif + + FMOV FZERO, c15 + ble,pn %icc, .LL25 + FMOV FZERO, c16 + +.LL22: + FADD2 c04, t1, c04 + prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY + FMUL a1, b1, t1 + nop + + FADD4 c08, t2, c08 + prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY + FMUL a1, b2, t2 + add AO, 16 * SIZE, AO + + FADD2 c12, t3, c12 + LDF [AO - 13 * SIZE], a4 + FMUL a1, b3, t3 + add BO, 16 * SIZE, BO + + FADD4 c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 8 * SIZE], a1 + + FADD1 c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + add L, -1, L + FMUL a2, b4, t4 + LDF [AO - 11 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 10 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 10 * SIZE], b3 + + FADD3 c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD2 c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 9 * SIZE], a4 + + FADD4 c08, t2, c08 + nop + FMUL a5, b2, t2 + nop + + FADD2 c12, t3, c12 + nop + FMUL a5, b3, t3 + nop + + FADD4 c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO - 4 * SIZE], a5 + + FADD1 c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 6 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b5, t1 + LDF [BO - 4 * SIZE], b5 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + + FADD3 c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD2 c04, t1, c04 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD4 c08, t2, c08 + nop + FMUL a1, b2, t2 + nop + + FADD2 c12, t3, c12 + nop + FMUL a1, b3, t3 + nop + + FADD4 c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 0 * SIZE], a1 + + FADD1 c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + +#ifdef DOUBLE + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY +#else + nop +#endif + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + nop + + FADD2 c02, t1, c02 + nop + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + + FADD4 c06, t2, c06 +#ifdef DOUBLE + prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY +#else + nop +#endif + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 0 * SIZE], b1 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD3 c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD2 c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 1 * SIZE], a4 + + FADD4 c08, t2, c08 + FMUL a5, b2, t2 + FADD2 c12, t3, c12 + FMUL a5, b3, t3 + + FADD4 c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO + 4 * SIZE], a5 + + FADD1 c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD1 c03, t1, c03 + cmp L, 0 + FMUL a4, b5, t1 + LDF [BO + 4 * SIZE], b5 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD3 c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL22 + LDF [BO + 3 * SIZE], b4 + +.LL25: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,pn %icc, .LL29 + nop + +.LL26: + FADD2 c04, t1, c04 + LDF [AO + 3 * SIZE], a4 + FMUL a1, b1, t1 + add AO, 4 * SIZE, AO + + FADD4 c08, t2, c08 + add BO, 4 * SIZE, BO + FMUL a1, b2, t2 + add L, -1, L + + FADD2 c12, t3, c12 + nop + FMUL a1, b3, t3 + cmp L, 0 + + FADD4 c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD1 c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD3 c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL26 + LDF [BO + 3 * SIZE], b4 + +.LL29: +#if defined(LN) || defined(RT) + sub KK, 2, TEMP1 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + + FADD2 c04, t1, c04 + FADD4 c08, t2, c08 + FADD2 c12, t3, c12 + FADD4 c16, t4, c16 + + FADD c01, c06, c01 + FADD c02, c05, c02 + FADD c03, c08, c03 + FADD c04, c07, c04 + + FADD c09, c14, c09 + FADD c10, c13, c10 + FADD c11, c16, c11 + FADD c12, c15, c12 + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c09, c09 + FSUB a4, c10, c10 + + FSUB b1, c03, c03 + FSUB b2, c04, c04 + FSUB b3, c11, c11 + FSUB b4, c12, c12 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c09, c09 + FSUB b2, c10, c10 + FSUB b3, c11, c11 + FSUB b4, c12, c12 +#endif + +#ifdef LN + LDF [AO + 6 * SIZE], a1 + LDF [AO + 7 * SIZE], a2 + LDF [AO + 4 * SIZE], a3 + LDF [AO + 5 * SIZE], a4 + LDF [AO + 0 * SIZE], b1 + LDF [AO + 1 * SIZE], b2 + + FMUL a1, c03, t1 + FMUL a2, c04, t2 + FMUL a1, c04, t3 + FMUL a2, c03, t4 + + FMUL a1, c11, t5 + FMUL a2, c12, t6 + FMUL a1, c12, t7 + FMUL a2, c11, t8 + + FADD4 t1, t2, c03 + FADD2 t3, t4, c04 + FADD4 t5, t6, c11 + FADD2 t7, t8, c12 + + FMUL a3, c03, t1 + FMUL a3, c04, t2 + FMUL a3, c11, t3 + FMUL a3, c12, t4 + + FMUL a4, c04, t5 + FMUL a4, c03, t6 + FMUL a4, c12, t7 + FMUL a4, c11, t8 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c09, t3, c09 + FSUB c10, t4, c10 + + FADD2 c01, t5, c01 + FADD4 c02, t6, c02 + FADD2 c09, t7, c09 + FADD4 c10, t8, c10 + + FMUL b1, c01, t1 + FMUL b2, c02, t2 + FMUL b1, c02, t3 + FMUL b2, c01, t4 + + FMUL b1, c09, t5 + FMUL b2, c10, t6 + FMUL b1, c10, t7 + FMUL b2, c09, t8 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + FADD4 t5, t6, c09 + FADD2 t7, t8, c10 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + LDF [AO + 6 * SIZE], b1 + LDF [AO + 7 * SIZE], b2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c09, t5 + FMUL a2, c10, t6 + FMUL a1, c10, t7 + FMUL a2, c09, t8 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + FADD4 t5, t6, c09 + FADD2 t7, t8, c10 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a3, c09, t3 + FMUL a3, c10, t4 + + FMUL a4, c02, t5 + FMUL a4, c01, t6 + FMUL a4, c10, t7 + FMUL a4, c09, t8 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FADD2 c03, t5, c03 + FADD4 c04, t6, c04 + FADD2 c11, t7, c11 + FADD4 c12, t8, c12 + + FMUL b1, c03, t1 + FMUL b2, c04, t2 + FMUL b1, c04, t3 + FMUL b2, c03, t4 + + FMUL b1, c11, t5 + FMUL b2, c12, t6 + FMUL b1, c12, t7 + FMUL b2, c11, t8 + + FADD4 t1, t2, c03 + FADD2 t3, t4, c04 + FADD4 t5, t6, c11 + FADD2 t7, t8, c12 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + LDF [BO + 6 * SIZE], b1 + LDF [BO + 7 * SIZE], b2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c03, t5 + FMUL a2, c04, t6 + FMUL a1, c04, t7 + FMUL a2, c03, t8 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + FADD4 t5, t6, c03 + FADD3 t7, t8, c04 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a3, c03, t3 + FMUL a3, c04, t4 + + FMUL a4, c02, t5 + FMUL a4, c01, t6 + FMUL a4, c04, t7 + FMUL a4, c03, t8 + + FSUB c09, t1, c09 + FSUB c10, t2, c10 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FADD3 c09, t5, c09 + FADD4 c10, t6, c10 + FADD3 c11, t7, c11 + FADD4 c12, t8, c12 + + FMUL b1, c09, t1 + FMUL b2, c10, t2 + FMUL b1, c10, t3 + FMUL b2, c09, t4 + + FMUL b1, c11, t5 + FMUL b2, c12, t6 + FMUL b1, c12, t7 + FMUL b2, c11, t8 + + FADD4 t1, t2, c09 + FADD3 t3, t4, c10 + FADD4 t5, t6, c11 + FADD3 t7, t8, c12 +#endif + +#ifdef RT + LDF [BO + 6 * SIZE], a1 + LDF [BO + 7 * SIZE], a2 + LDF [BO + 4 * SIZE], a3 + LDF [BO + 5 * SIZE], a4 + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + + FMUL a1, c09, t1 + FMUL a2, c10, t2 + FMUL a1, c10, t3 + FMUL a2, c09, t4 + + FMUL a1, c11, t5 + FMUL a2, c12, t6 + FMUL a1, c12, t7 + FMUL a2, c11, t8 + + FADD4 t1, t2, c09 + FADD3 t3, t4, c10 + FADD4 t5, t6, c11 + FADD3 t7, t8, c12 + + FMUL a3, c09, t1 + FMUL a3, c10, t2 + FMUL a3, c11, t3 + FMUL a3, c12, t4 + + FMUL a4, c10, t5 + FMUL a4, c09, t6 + FMUL a4, c12, t7 + FMUL a4, c11, t8 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + FADD3 c01, t5, c01 + FADD4 c02, t6, c02 + FADD3 c03, t7, c03 + FADD4 c04, t8, c04 + + FMUL b1, c01, t1 + FMUL b2, c02, t2 + FMUL b1, c02, t3 + FMUL b2, c01, t4 + + FMUL b1, c03, t5 + FMUL b2, c04, t6 + FMUL b1, c04, t7 + FMUL b2, c03, t8 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + FADD4 t5, t6, c03 + FADD3 t7, t8, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 + add C2, -4 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c09, [BO + 2 * SIZE] + STF c10, [BO + 3 * SIZE] + + STF c03, [BO + 4 * SIZE] + STF c04, [BO + 5 * SIZE] + STF c11, [BO + 6 * SIZE] + STF c12, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c09, [AO + 4 * SIZE] + STF c10, [AO + 5 * SIZE] + STF c11, [AO + 6 * SIZE] + STF c12, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + STF c09, [C2 + 0 * SIZE] + STF c10, [C2 + 1 * SIZE] + STF c11, [C2 + 2 * SIZE] + STF c12, [C2 + 3 * SIZE] + +#ifndef LN + add C1, 4 * SIZE, C1 + add C2, 4 * SIZE, C2 +#endif + +#ifdef RT + sll K, 1 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL21 + nop + +.LL99: +#ifdef LN + sll K, 1 + ZBASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 2, KK +#endif + +#ifdef RT + sub KK, 2, KK +#endif + + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + +.LL100: + and N, 1, J + + cmp J, 0 + ble,pn %icc, .LL999 + nop + +#ifdef RT + sll K, 0 + ZBASE_SHIFT, TEMP1 + sub B, TEMP1, B + + sub C, LDC, C +#endif + + mov C, C1 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + +#ifndef RT + add C, LDC, C +#endif + + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL150 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO + + sub K, KK, TEMP1 + + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + + ble,pn %icc, .LL155 + nop + +.LL152: + FADD1 c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD3 c02, t2, c02 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD2 c03, t3, c03 + cmp L, 0 + FMUL a2, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD4 c04, t4, c04 + nop + FMUL a2, b2, t4 + LDF [AO + 5 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b3, t1 + LDF [BO - 3 * SIZE], b2 + + FADD3 c02, t2, c02 + nop + FMUL a3, b4, t2 + LDF [AO + 6 * SIZE], a3 + + FADD2 c03, t3, c03 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD4 c04, t4, c04 + nop + FMUL a4, b4, t4 + LDF [AO + 7 * SIZE], a4 + + FADD1 c01, t1, c01 + nop + FMUL a1, b1, t1 + LDF [BO - 1 * SIZE], b4 + + FADD3 c02, t2, c02 + FMUL a1, b2, t2 + LDF [AO + 8 * SIZE], a1 + + FADD2 c03, t3, c03 + FMUL a2, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD4 c04, t4, c04 + FMUL a2, b2, t4 + LDF [AO + 9 * SIZE], a2 + + FADD1 c01, t1, c01 + FMUL a3, b3, t1 + LDF [BO + 1 * SIZE], b2 + + FADD3 c02, t2, c02 + FMUL a3, b4, t2 + LDF [AO + 10 * SIZE], a3 + + FADD2 c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD4 c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + bg,pt %icc, .LL152 + LDF [BO + 3 * SIZE], b4 + +.LL155: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL159 + nop + +.LL156: + FADD1 c01, t1, c01 + add AO, 2 * SIZE, AO + FMUL a1, b1, t1 + add BO, 2 * SIZE, BO + FADD3 c02, t2, c02 + add L, -1, L + FMUL a1, b2, t2 + LDF [AO + 0 * SIZE], a1 + FADD2 c03, t3, c03 + FMUL a2, b1, t3 + LDF [BO + 0 * SIZE], b1 + cmp L, 0 + FADD4 c04, t4, c04 + FMUL a2, b2, t4 + LDF [BO + 1 * SIZE], b2 + + bg,pt %icc, .LL156 + LDF [AO + 1 * SIZE], a2 + +.LL159: + FADD1 c01, t1, c01 + FADD3 c02, t2, c02 + FADD2 c03, t3, c03 + FADD4 c04, t4, c04 + + FADD c01, c04, c01 + FADD c02, c03, c02 + +#if defined(LN) || defined(RT) + sub KK, 1, TEMP1 + + sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 +#endif + +#ifdef RT + sll K, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + +.LL150: + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL199 + nop + + +.LL121: +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + ZBASE_SHIFT, TEMP1 + sll KK, 0 + ZBASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + FMOV FZERO, c03 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t1 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c07 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t2 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c04 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t3 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, t4 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c01 + +#ifdef LN + prefetch [C1 - 3 * SIZE], 3 +#else + prefetch [C1 + 3 * SIZE], 3 +#endif + FMOV FZERO, c05 + FMOV FZERO, c02 + + ble,pn %icc, .LL125 + FMOV FZERO, c06 + +.LL122: + FADD1 c03, t1, c03 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD3 c07, t2, c07 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD2 c04, t3, c04 + add AO, 16 * SIZE, AO + FMUL a2, b1, t3 + cmp L, 0 + + FADD4 c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 11 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 10 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD4 c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO - 3 * SIZE], b2 + + FADD1 c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 9 * SIZE], a4 + + FADD3 c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO - 8 * SIZE], a1 + + FADD2 c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD4 c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO - 6 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD4 c06, t4, c06 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD1 c03, t1, c03 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD3 c07, t2, c07 + nop + FMUL a1, b2, t2 + LDF [AO - 4 * SIZE], a1 + + FADD2 c04, t3, c04 + nop + FMUL a2, b1, t3 + nop + + FADD4 c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 3 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 2 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD4 c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + + FADD1 c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 1 * SIZE], a4 + + FADD3 c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO + 0 * SIZE], a1 + + FADD2 c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD4 c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO + 2 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD4 c06, t4, c06 + FMUL a4, b4, t4 + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL122 + LDF [BO + 3 * SIZE], b4 + +.LL125: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL129 + nop + +.LL126: + FADD1 c03, t1, c03 + add AO, 4 * SIZE, AO + FMUL a1, b1, t1 + add BO, 2 * SIZE, BO + + FADD3 c07, t2, c07 + add L, -1, L + FMUL a1, b2, t2 + LDF [AO + 0 * SIZE], a1 + + FADD2 c04, t3, c04 + cmp L, 0 + FMUL a2, b1, t3 + + FADD4 c08, t4, c08 + FMUL a2, b2, t4 + LDF [AO + 1 * SIZE], a2 + + FADD1 c01, t1, c01 + FMUL a3, b1, t1 + FADD3 c05, t2, c05 + FMUL a3, b2, t2 + LDF [AO + 2 * SIZE], a3 + + FADD2 c02, t3, c02 + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + FADD4 c06, t4, c06 + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + bg,pt %icc, .LL126 + LDF [AO + 3 * SIZE], a4 + +.LL129: + FADD1 c03, t1, c03 + FADD3 c07, t2, c07 + FADD2 c04, t3, c04 + FADD4 c08, t4, c08 + + FADD c01, c06, c01 + FADD c02, c05, c02 + FADD c03, c08, c03 + FADD c04, c07, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, 1 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 6 * SIZE], a1 + LDF [AO + 7 * SIZE], a2 + LDF [AO + 4 * SIZE], a3 + LDF [AO + 5 * SIZE], a4 + LDF [AO + 0 * SIZE], b1 + LDF [AO + 1 * SIZE], b2 + + FMUL a1, c03, t1 + FMUL a2, c04, t2 + FMUL a1, c04, t3 + FMUL a2, c03, t4 + + FADD4 t1, t2, c03 + FADD2 t3, t4, c04 + + FMUL a3, c03, t1 + FMUL a3, c04, t2 + + FMUL a4, c04, t5 + FMUL a4, c03, t6 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + + FADD2 c01, t5, c01 + FADD4 c02, t6, c02 + + FMUL b1, c01, t1 + FMUL b2, c02, t2 + FMUL b1, c02, t3 + FMUL b2, c01, t4 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + LDF [AO + 6 * SIZE], b1 + LDF [AO + 7 * SIZE], b2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a4, c02, t5 + FMUL a4, c01, t6 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + FADD2 c03, t5, c03 + FADD4 c04, t6, c04 + + FMUL b1, c03, t1 + FMUL b2, c04, t2 + FMUL b1, c04, t3 + FMUL b2, c03, t4 + + FADD4 t1, t2, c03 + FADD2 t3, t4, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c03, t5 + FMUL a2, c04, t6 + FMUL a1, c04, t7 + FMUL a2, c03, t8 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + FADD4 t5, t6, c03 + FADD3 t7, t8, c04 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c03, t5 + FMUL a2, c04, t6 + FMUL a1, c04, t7 + FMUL a2, c03, t8 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + FADD4 t5, t6, c03 + FADD3 t7, t8, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c03, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 4 * SIZE, C1 +#endif + +#ifdef RT + sll K, 1 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL121 + FMOV FZERO, c03 + +.LL199: +#ifdef LN + sll K, 0 + ZBASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 1, KK +#endif + +#ifdef RT + sub KK, 1, KK +#endif + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/ztrsm_kernel_LT.S b/kernel/sparc/ztrsm_kernel_LT.S new file mode 100644 index 0000000..2a85698 --- /dev/null +++ b/kernel/sparc/ztrsm_kernel_LT.S @@ -0,0 +1,2389 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 +#define K %i2 +#define A %i5 +#define B %i3 +#define C %i4 + +#define LDC %o0 +#define AO %o1 +#define BO %o2 +#define I %o3 +#define J %o4 +#define L %o5 + +#define C1 %l0 +#define C2 %l1 + +#define OFFSET %l2 +#define KK %l3 +#define TEMP1 %l4 +#define TEMP2 %l5 +#define AORIG %l6 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define t1 %f32 +#define t2 %f34 +#define t3 %f36 +#define t4 %f38 + +#define a1 %f40 +#define a2 %f42 +#define a3 %f44 +#define a4 %f46 +#define a5 %f62 + +#define b1 %f48 +#define b2 %f50 +#define b3 %f52 +#define b4 %f54 +#define b5 %f56 + +#define FZERO %f58 + +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define t1 %f16 +#define t2 %f17 +#define t3 %f18 +#define t4 %f19 + +#define a1 %f20 +#define a2 %f21 +#define a3 %f22 +#define a4 %f23 +#define a5 %f31 + +#define b1 %f24 +#define b2 %f25 +#define b3 %f26 +#define b4 %f27 +#define b5 %f28 + +#define FZERO %f29 +#endif + +#define t5 c13 +#define t6 c14 +#define t7 c15 +#define t8 c16 + +#ifndef CONJ +#define FADD1 FADD +#define FADD2 FADD +#define FADD3 FADD +#define FADD4 FSUB +#else + +#if defined(LN) || defined(LT) +#define FADD1 FADD +#define FADD2 FSUB +#define FADD3 FADD +#define FADD4 FADD +#endif + +#if defined(RN) || defined(RT) +#define FADD1 FADD +#define FADD2 FADD +#define FADD3 FSUB +#define FADD4 FADD +#endif +#endif + +#define APREFETCHSIZE 40 +#define BPREFETCHSIZE 40 + +#define APREFETCH_CATEGORY 0 +#define BPREFETCH_CATEGORY 0 + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE + ld [%sp + STACK_START + 32], A + ld [%sp + STACK_START + 36], B + ld [%sp + STACK_START + 40], C + ld [%sp + STACK_START + 44], LDC + ld [%sp + STACK_START + 48], OFFSET +#else + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC + ld [%sp + STACK_START + 40], OFFSET +#endif +#else + ldx [%sp+ STACK_START + 56], B + ldx [%sp+ STACK_START + 64], C + ldx [%sp+ STACK_START + 72], LDC + ldx [%sp+ STACK_START + 80], OFFSET +#endif + +#ifdef DOUBLE + FCLR(27) +#else + FCLR(29) +#endif + + sll LDC, ZBASE_SHIFT, LDC + +#ifdef LN + smul M, K, TEMP1 + sll TEMP1, ZBASE_SHIFT, TEMP1 + add A, TEMP1, A + + sll M, ZBASE_SHIFT, TEMP1 + add C, TEMP1, C +#endif + +#ifdef RN + neg OFFSET, KK +#endif + +#ifdef RT + smul N, K, TEMP1 + sll TEMP1, ZBASE_SHIFT, TEMP1 + add B, TEMP1, B + + smul N, LDC, TEMP1 + add C, TEMP1, C + + sub N, OFFSET, KK +#endif + + sra N, 1, J + cmp J, 0 + ble,pn %icc, .LL100 + nop + +.LL11: +#ifdef RT + sll K, 1 + ZBASE_SHIFT, TEMP1 + sub B, TEMP1, B + + add LDC, LDC, TEMP1 + sub C, TEMP1, C +#endif + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + + sra M, 1, I + mov C, C1 + add C, LDC, C2 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + cmp I, 0 +#ifndef RT + add C2, LDC, C +#endif + ble,pn %icc, .LL50 + FMOV FZERO, t4 + + +.LL21: +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + ZBASE_SHIFT, TEMP1 + + add AORIG, TEMP1, AO + add B, TEMP1, BO + + sub K, KK, TEMP1 + + sra TEMP1, 2, L + cmp L, 0 +#endif + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + + FMOV FZERO, c01 + FMOV FZERO, c02 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c03 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c04 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c05 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c06 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c07 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c09 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c10 + + LDF [BO + 4 * SIZE], b5 + FMOV FZERO, c11 + LDF [AO + 4 * SIZE], a5 + FMOV FZERO, c12 + + prefetch [C1 + 3 * SIZE], 3 + FMOV FZERO, c13 + prefetch [C2 + 3 * SIZE], 3 + FMOV FZERO, c14 + + FMOV FZERO, c15 + ble,pn %icc, .LL25 + FMOV FZERO, c16 + +.LL22: + FADD2 c04, t1, c04 + prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY + FMUL a1, b1, t1 + nop + + FADD4 c08, t2, c08 + prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY + FMUL a1, b2, t2 + add AO, 16 * SIZE, AO + + FADD2 c12, t3, c12 + LDF [AO - 13 * SIZE], a4 + FMUL a1, b3, t3 + add BO, 16 * SIZE, BO + + FADD4 c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 8 * SIZE], a1 + + FADD1 c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + add L, -1, L + FMUL a2, b4, t4 + LDF [AO - 11 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 10 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 10 * SIZE], b3 + + FADD3 c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD2 c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 9 * SIZE], a4 + + FADD4 c08, t2, c08 + nop + FMUL a5, b2, t2 + nop + + FADD2 c12, t3, c12 + nop + FMUL a5, b3, t3 + nop + + FADD4 c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO - 4 * SIZE], a5 + + FADD1 c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 6 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b5, t1 + LDF [BO - 4 * SIZE], b5 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + + FADD3 c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD2 c04, t1, c04 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD4 c08, t2, c08 + nop + FMUL a1, b2, t2 + nop + + FADD2 c12, t3, c12 + nop + FMUL a1, b3, t3 + nop + + FADD4 c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 0 * SIZE], a1 + + FADD1 c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + +#ifdef DOUBLE + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY +#else + nop +#endif + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + nop + + FADD2 c02, t1, c02 + nop + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + + FADD4 c06, t2, c06 +#ifdef DOUBLE + prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY +#else + nop +#endif + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 0 * SIZE], b1 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD3 c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD2 c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 1 * SIZE], a4 + + FADD4 c08, t2, c08 + FMUL a5, b2, t2 + FADD2 c12, t3, c12 + FMUL a5, b3, t3 + + FADD4 c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO + 4 * SIZE], a5 + + FADD1 c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD1 c03, t1, c03 + cmp L, 0 + FMUL a4, b5, t1 + LDF [BO + 4 * SIZE], b5 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD3 c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL22 + LDF [BO + 3 * SIZE], b4 + +.LL25: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,pn %icc, .LL29 + nop + +.LL26: + FADD2 c04, t1, c04 + LDF [AO + 3 * SIZE], a4 + FMUL a1, b1, t1 + add AO, 4 * SIZE, AO + + FADD4 c08, t2, c08 + add BO, 4 * SIZE, BO + FMUL a1, b2, t2 + add L, -1, L + + FADD2 c12, t3, c12 + nop + FMUL a1, b3, t3 + cmp L, 0 + + FADD4 c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD1 c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD3 c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL26 + LDF [BO + 3 * SIZE], b4 + +.LL29: +#if defined(LN) || defined(RT) + sub KK, 2, TEMP1 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + + FADD2 c04, t1, c04 + FADD4 c08, t2, c08 + FADD2 c12, t3, c12 + FADD4 c16, t4, c16 + + FADD c01, c06, c01 + FADD c02, c05, c02 + FADD c03, c08, c03 + FADD c04, c07, c04 + + FADD c09, c14, c09 + FADD c10, c13, c10 + FADD c11, c16, c11 + FADD c12, c15, c12 + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c09, c09 + FSUB a4, c10, c10 + + FSUB b1, c03, c03 + FSUB b2, c04, c04 + FSUB b3, c11, c11 + FSUB b4, c12, c12 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c09, c09 + FSUB b2, c10, c10 + FSUB b3, c11, c11 + FSUB b4, c12, c12 +#endif + +#ifdef LN + LDF [AO + 6 * SIZE], a1 + LDF [AO + 7 * SIZE], a2 + LDF [AO + 4 * SIZE], a3 + LDF [AO + 5 * SIZE], a4 + LDF [AO + 0 * SIZE], b1 + LDF [AO + 1 * SIZE], b2 + + FMUL a1, c03, t1 + FMUL a2, c04, t2 + FMUL a1, c04, t3 + FMUL a2, c03, t4 + + FMUL a1, c11, t5 + FMUL a2, c12, t6 + FMUL a1, c12, t7 + FMUL a2, c11, t8 + + FADD4 t1, t2, c03 + FADD2 t3, t4, c04 + FADD4 t5, t6, c11 + FADD2 t7, t8, c12 + + FMUL a3, c03, t1 + FMUL a3, c04, t2 + FMUL a3, c11, t3 + FMUL a3, c12, t4 + + FMUL a4, c04, t5 + FMUL a4, c03, t6 + FMUL a4, c12, t7 + FMUL a4, c11, t8 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c09, t3, c09 + FSUB c10, t4, c10 + + FADD2 c01, t5, c01 + FADD4 c02, t6, c02 + FADD2 c09, t7, c09 + FADD4 c10, t8, c10 + + FMUL b1, c01, t1 + FMUL b2, c02, t2 + FMUL b1, c02, t3 + FMUL b2, c01, t4 + + FMUL b1, c09, t5 + FMUL b2, c10, t6 + FMUL b1, c10, t7 + FMUL b2, c09, t8 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + FADD4 t5, t6, c09 + FADD2 t7, t8, c10 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + LDF [AO + 6 * SIZE], b1 + LDF [AO + 7 * SIZE], b2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c09, t5 + FMUL a2, c10, t6 + FMUL a1, c10, t7 + FMUL a2, c09, t8 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + FADD4 t5, t6, c09 + FADD2 t7, t8, c10 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a3, c09, t3 + FMUL a3, c10, t4 + + FMUL a4, c02, t5 + FMUL a4, c01, t6 + FMUL a4, c10, t7 + FMUL a4, c09, t8 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FADD2 c03, t5, c03 + FADD4 c04, t6, c04 + FADD2 c11, t7, c11 + FADD4 c12, t8, c12 + + FMUL b1, c03, t1 + FMUL b2, c04, t2 + FMUL b1, c04, t3 + FMUL b2, c03, t4 + + FMUL b1, c11, t5 + FMUL b2, c12, t6 + FMUL b1, c12, t7 + FMUL b2, c11, t8 + + FADD4 t1, t2, c03 + FADD2 t3, t4, c04 + FADD4 t5, t6, c11 + FADD2 t7, t8, c12 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + LDF [BO + 6 * SIZE], b1 + LDF [BO + 7 * SIZE], b2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c03, t5 + FMUL a2, c04, t6 + FMUL a1, c04, t7 + FMUL a2, c03, t8 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + FADD4 t5, t6, c03 + FADD3 t7, t8, c04 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a3, c03, t3 + FMUL a3, c04, t4 + + FMUL a4, c02, t5 + FMUL a4, c01, t6 + FMUL a4, c04, t7 + FMUL a4, c03, t8 + + FSUB c09, t1, c09 + FSUB c10, t2, c10 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FADD3 c09, t5, c09 + FADD4 c10, t6, c10 + FADD3 c11, t7, c11 + FADD4 c12, t8, c12 + + FMUL b1, c09, t1 + FMUL b2, c10, t2 + FMUL b1, c10, t3 + FMUL b2, c09, t4 + + FMUL b1, c11, t5 + FMUL b2, c12, t6 + FMUL b1, c12, t7 + FMUL b2, c11, t8 + + FADD4 t1, t2, c09 + FADD3 t3, t4, c10 + FADD4 t5, t6, c11 + FADD3 t7, t8, c12 +#endif + +#ifdef RT + LDF [BO + 6 * SIZE], a1 + LDF [BO + 7 * SIZE], a2 + LDF [BO + 4 * SIZE], a3 + LDF [BO + 5 * SIZE], a4 + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + + FMUL a1, c09, t1 + FMUL a2, c10, t2 + FMUL a1, c10, t3 + FMUL a2, c09, t4 + + FMUL a1, c11, t5 + FMUL a2, c12, t6 + FMUL a1, c12, t7 + FMUL a2, c11, t8 + + FADD4 t1, t2, c09 + FADD3 t3, t4, c10 + FADD4 t5, t6, c11 + FADD3 t7, t8, c12 + + FMUL a3, c09, t1 + FMUL a3, c10, t2 + FMUL a3, c11, t3 + FMUL a3, c12, t4 + + FMUL a4, c10, t5 + FMUL a4, c09, t6 + FMUL a4, c12, t7 + FMUL a4, c11, t8 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + FADD3 c01, t5, c01 + FADD4 c02, t6, c02 + FADD3 c03, t7, c03 + FADD4 c04, t8, c04 + + FMUL b1, c01, t1 + FMUL b2, c02, t2 + FMUL b1, c02, t3 + FMUL b2, c01, t4 + + FMUL b1, c03, t5 + FMUL b2, c04, t6 + FMUL b1, c04, t7 + FMUL b2, c03, t8 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + FADD4 t5, t6, c03 + FADD3 t7, t8, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 + add C2, -4 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c09, [BO + 2 * SIZE] + STF c10, [BO + 3 * SIZE] + + STF c03, [BO + 4 * SIZE] + STF c04, [BO + 5 * SIZE] + STF c11, [BO + 6 * SIZE] + STF c12, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c09, [AO + 4 * SIZE] + STF c10, [AO + 5 * SIZE] + STF c11, [AO + 6 * SIZE] + STF c12, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + STF c09, [C2 + 0 * SIZE] + STF c10, [C2 + 1 * SIZE] + STF c11, [C2 + 2 * SIZE] + STF c12, [C2 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 4 * SIZE, C1 + add C2, 4 * SIZE, C2 +#endif + +#ifdef RT + sll K, 1 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL21 + FMOV FZERO, c01 + +.LL50: + and M, 1, I + FMOV FZERO, c02 + cmp I, 0 + FMOV FZERO, t1 + ble,pn %icc, .LL99 + FMOV FZERO, c04 + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 0 + ZBASE_SHIFT, TEMP1 + sll KK, 1 + ZBASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t2 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c06 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t3 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c08 + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t4 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c01 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c03 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c05 + + ble,pn %icc, .LL55 + FMOV FZERO, c07 + +.LL52: + FADD2 c02, t1, c02 + add AO, 8 * SIZE, AO + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FMUL a1, b1, t1 + add BO, 16 * SIZE, BO + + FADD4 c04, t2, c04 + add L, -1, L + FMUL a1, b2, t2 + + FADD2 c06, t3, c06 + cmp L, 0 + FMUL a1, b3, t3 + + FADD4 c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO - 4 * SIZE], a1 + + FADD1 c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 12 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 10 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD2 c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + FADD4 c04, t2, c04 + FMUL a3, b2, t2 + + FADD2 c06, t3, c06 + FMUL a3, b3, t3 + FADD4 c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD1 c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD2 c02, t1, c02 + FMUL a1, b1, t1 + LDF [AO - 1 * SIZE], a4 + FADD4 c04, t2, c04 + FMUL a1, b2, t2 + + FADD2 c06, t3, c06 + FMUL a1, b3, t3 + FADD4 c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD1 c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 4 * SIZE], b1 + + FADD3 c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 2 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD2 c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO + 1 * SIZE], a2 + FADD4 c04, t2, c04 + FMUL a3, b2, t2 + + FADD2 c06, t3, c06 + FMUL a3, b3, t3 + FADD4 c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD1 c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL52 + LDF [AO + 3 * SIZE], a4 + +.LL55: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + FADD2 c02, t1, c02 + add AO, 2 * SIZE, AO + FMUL a1, b1, t1 + add L, -1, L + + add BO, 4 * SIZE, BO + FADD4 c04, t2, c04 + cmp L, 0 + FMUL a1, b2, t2 + + FADD2 c06, t3, c06 + FMUL a1, b3, t3 + FADD4 c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD1 c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL56 + LDF [AO + 1 * SIZE], a2 + +.LL59: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, 0 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + + FADD2 c02, t1, c02 + FADD4 c04, t2, c04 + FADD2 c06, t3, c06 + FADD4 c08, t4, c08 + + FADD c01, c04, c01 + FADD c02, c03, c02 + FADD c05, c08, c05 + FADD c06, c07, c06 + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c05, c05 + FSUB a4, c06, c06 + +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c05, c05 + FSUB a4, c06, c06 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c05, t5 + FMUL a2, c06, t6 + FMUL a1, c06, t7 + FMUL a2, c05, t8 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + FADD4 t5, t6, c05 + FADD2 t7, t8, c06 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c05, t5 + FMUL a2, c06, t6 + FMUL a1, c06, t7 + FMUL a2, c05, t8 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + FADD4 t5, t6, c05 + FADD2 t7, t8, c06 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + LDF [BO + 6 * SIZE], b1 + LDF [BO + 7 * SIZE], b2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a4, c02, t3 + FMUL a4, c01, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FADD3 c05, t3, c05 + FADD4 c06, t4, c06 + + FMUL b1, c05, t1 + FMUL b2, c06, t2 + FMUL b1, c06, t3 + FMUL b2, c05, t4 + + FADD4 t1, t2, c05 + FADD3 t3, t4, c06 +#endif + +#ifdef RT + LDF [BO + 6 * SIZE], a1 + LDF [BO + 7 * SIZE], a2 + LDF [BO + 4 * SIZE], a3 + LDF [BO + 5 * SIZE], a4 + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + + FMUL a1, c05, t1 + FMUL a2, c06, t2 + FMUL a1, c06, t3 + FMUL a2, c05, t4 + + FADD4 t1, t2, c05 + FADD3 t3, t4, c06 + + FMUL a3, c05, t1 + FMUL a3, c06, t2 + FMUL a4, c06, t3 + FMUL a4, c05, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FADD3 c01, t3, c01 + FADD4 c02, t4, c02 + + FMUL b1, c01, t1 + FMUL b2, c02, t2 + FMUL b1, c02, t3 + FMUL b2, c01, t4 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c06, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c05, [AO + 2 * SIZE] + STF c06, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 +#endif + +#ifdef RT + sll K, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + +.LL99: +#ifdef LN + sll K, 1 + ZBASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 2, KK +#endif + +#ifdef RT + sub KK, 2, KK +#endif + + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + +.LL100: + and N, 1, J + + cmp J, 0 + ble,pn %icc, .LL999 + nop + +#ifdef RT + sll K, 0 + ZBASE_SHIFT, TEMP1 + sub B, TEMP1, B + + sub C, LDC, C +#endif + + mov C, C1 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + +#ifndef RT + add C, LDC, C +#endif + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL150 + FMOV FZERO, c03 + +.LL121: +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + ZBASE_SHIFT, TEMP1 + sll KK, 0 + ZBASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + FMOV FZERO, c03 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t1 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c07 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t2 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c04 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t3 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, t4 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c01 + + prefetch [C1 + 3 * SIZE], 3 + FMOV FZERO, c05 + FMOV FZERO, c02 + + ble,pn %icc, .LL125 + FMOV FZERO, c06 + +.LL122: + FADD1 c03, t1, c03 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD3 c07, t2, c07 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD2 c04, t3, c04 + add AO, 16 * SIZE, AO + FMUL a2, b1, t3 + cmp L, 0 + + FADD4 c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 11 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 10 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD4 c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO - 3 * SIZE], b2 + + FADD1 c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 9 * SIZE], a4 + + FADD3 c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO - 8 * SIZE], a1 + + FADD2 c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD4 c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO - 6 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD4 c06, t4, c06 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD1 c03, t1, c03 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD3 c07, t2, c07 + nop + FMUL a1, b2, t2 + LDF [AO - 4 * SIZE], a1 + + FADD2 c04, t3, c04 + nop + FMUL a2, b1, t3 + nop + + FADD4 c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 3 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 2 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD4 c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + + FADD1 c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 1 * SIZE], a4 + + FADD3 c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO + 0 * SIZE], a1 + + FADD2 c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD4 c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO + 2 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD4 c06, t4, c06 + FMUL a4, b4, t4 + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL122 + LDF [BO + 3 * SIZE], b4 + +.LL125: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL129 + nop + +.LL126: + FADD1 c03, t1, c03 + add AO, 4 * SIZE, AO + FMUL a1, b1, t1 + add BO, 2 * SIZE, BO + + FADD3 c07, t2, c07 + add L, -1, L + FMUL a1, b2, t2 + LDF [AO + 0 * SIZE], a1 + + FADD2 c04, t3, c04 + cmp L, 0 + FMUL a2, b1, t3 + + FADD4 c08, t4, c08 + FMUL a2, b2, t4 + LDF [AO + 1 * SIZE], a2 + + FADD1 c01, t1, c01 + FMUL a3, b1, t1 + FADD3 c05, t2, c05 + FMUL a3, b2, t2 + LDF [AO + 2 * SIZE], a3 + + FADD2 c02, t3, c02 + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + FADD4 c06, t4, c06 + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + bg,pt %icc, .LL126 + LDF [AO + 3 * SIZE], a4 + +.LL129: + FADD1 c03, t1, c03 + FADD3 c07, t2, c07 + FADD2 c04, t3, c04 + FADD4 c08, t4, c08 + + FADD c01, c06, c01 + FADD c02, c05, c02 + FADD c03, c08, c03 + FADD c04, c07, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, 1 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 6 * SIZE], a1 + LDF [AO + 7 * SIZE], a2 + LDF [AO + 4 * SIZE], a3 + LDF [AO + 5 * SIZE], a4 + LDF [AO + 0 * SIZE], b1 + LDF [AO + 1 * SIZE], b2 + + FMUL a1, c03, t1 + FMUL a2, c04, t2 + FMUL a1, c04, t3 + FMUL a2, c03, t4 + + FADD4 t1, t2, c03 + FADD2 t3, t4, c04 + + FMUL a3, c03, t1 + FMUL a3, c04, t2 + + FMUL a4, c04, t5 + FMUL a4, c03, t6 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + + FADD2 c01, t5, c01 + FADD4 c02, t6, c02 + + FMUL b1, c01, t1 + FMUL b2, c02, t2 + FMUL b1, c02, t3 + FMUL b2, c01, t4 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + LDF [AO + 6 * SIZE], b1 + LDF [AO + 7 * SIZE], b2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a4, c02, t5 + FMUL a4, c01, t6 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + FADD2 c03, t5, c03 + FADD4 c04, t6, c04 + + FMUL b1, c03, t1 + FMUL b2, c04, t2 + FMUL b1, c04, t3 + FMUL b2, c03, t4 + + FADD4 t1, t2, c03 + FADD2 t3, t4, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c03, t5 + FMUL a2, c04, t6 + FMUL a1, c04, t7 + FMUL a2, c03, t8 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + FADD4 t5, t6, c03 + FADD3 t7, t8, c04 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c03, t5 + FMUL a2, c04, t6 + FMUL a1, c04, t7 + FMUL a2, c03, t8 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + FADD4 t5, t6, c03 + FADD3 t7, t8, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c03, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 4 * SIZE, C1 +#endif + +#ifdef RT + sll K, 1 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL121 + FMOV FZERO, c03 + +.LL150: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL199 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO + + sub K, KK, TEMP1 + + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + + ble,pn %icc, .LL155 + nop + +.LL152: + FADD1 c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD3 c02, t2, c02 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD2 c03, t3, c03 + cmp L, 0 + FMUL a2, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD4 c04, t4, c04 + nop + FMUL a2, b2, t4 + LDF [AO + 5 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b3, t1 + LDF [BO - 3 * SIZE], b2 + + FADD3 c02, t2, c02 + nop + FMUL a3, b4, t2 + LDF [AO + 6 * SIZE], a3 + + FADD2 c03, t3, c03 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD4 c04, t4, c04 + nop + FMUL a4, b4, t4 + LDF [AO + 7 * SIZE], a4 + + FADD1 c01, t1, c01 + nop + FMUL a1, b1, t1 + LDF [BO - 1 * SIZE], b4 + + FADD3 c02, t2, c02 + FMUL a1, b2, t2 + LDF [AO + 8 * SIZE], a1 + + FADD2 c03, t3, c03 + FMUL a2, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD4 c04, t4, c04 + FMUL a2, b2, t4 + LDF [AO + 9 * SIZE], a2 + + FADD1 c01, t1, c01 + FMUL a3, b3, t1 + LDF [BO + 1 * SIZE], b2 + + FADD3 c02, t2, c02 + FMUL a3, b4, t2 + LDF [AO + 10 * SIZE], a3 + + FADD2 c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD4 c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + bg,pt %icc, .LL152 + LDF [BO + 3 * SIZE], b4 + +.LL155: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL159 + nop + +.LL156: + FADD1 c01, t1, c01 + add AO, 2 * SIZE, AO + FMUL a1, b1, t1 + add BO, 2 * SIZE, BO + FADD3 c02, t2, c02 + add L, -1, L + FMUL a1, b2, t2 + LDF [AO + 0 * SIZE], a1 + FADD2 c03, t3, c03 + FMUL a2, b1, t3 + LDF [BO + 0 * SIZE], b1 + cmp L, 0 + FADD4 c04, t4, c04 + FMUL a2, b2, t4 + LDF [BO + 1 * SIZE], b2 + + bg,pt %icc, .LL156 + LDF [AO + 1 * SIZE], a2 + +.LL159: + FADD1 c01, t1, c01 + FADD3 c02, t2, c02 + FADD2 c03, t3, c03 + FADD4 c04, t4, c04 + + FADD c01, c04, c01 + FADD c02, c03, c02 + +#if defined(LN) || defined(RT) + sub KK, 1, TEMP1 + + sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 +#endif + +#ifdef RT + sll K, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + +.LL199: +#ifdef LN + sll K, 0 + ZBASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 1, KK +#endif + +#ifdef RT + sub KK, 1, KK +#endif + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/ztrsm_kernel_LT_1x4.S b/kernel/sparc/ztrsm_kernel_LT_1x4.S new file mode 100644 index 0000000..f7d9e38 --- /dev/null +++ b/kernel/sparc/ztrsm_kernel_LT_1x4.S @@ -0,0 +1,2131 @@ +/*********************************************************************/ +/* Copyright 2005-2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define APREFETCHSIZE 24 +#define APREFETCH_CATEGORY 0 + +#define M %i0 +#define N %i1 +#define K %i2 +#define A %i5 +#define B %i3 +#define C %i4 + +#define LDC %o0 +#define AO %o1 +#define BO %o2 +#define I %o3 +#define J %o4 +#define L %o5 + +#define C1 %l0 +#define C2 %l1 +#define C3 %l2 +#define C4 %l3 + +#define OFFSET %l4 +#define KK %l5 +#define TEMP1 %l6 +#define TEMP2 %l7 +#define AORIG %o7 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define a1 %f32 +#define a2 %f34 +#define a3 %f36 +#define a4 %f38 +#define a5 %f40 + +#define b1 %f42 +#define b2 %f44 +#define b3 %f46 +#define b4 %f48 +#define b5 %f50 +#define b6 %f52 +#define b7 %f54 +#define b8 %f56 +#define b9 %f58 + +#define cc01 0 +#define cc02 2 +#define cc03 4 +#define cc04 6 +#define cc05 8 +#define cc06 10 +#define cc07 12 +#define cc08 14 +#define cc09 16 +#define cc10 18 +#define cc11 20 +#define cc12 22 +#define cc13 24 +#define cc14 26 +#define cc15 28 +#define cc16 30 + +#define aa1 1 +#define aa2 3 +#define aa3 5 +#define aa4 7 +#define aa5 9 + +#define bb1 11 +#define bb2 13 +#define bb3 15 +#define bb4 17 +#define bb5 19 +#define bb6 21 +#define bb7 23 +#define bb8 25 +#define bb9 27 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define a1 %f16 +#define a2 %f17 +#define a3 %f18 +#define a4 %f19 +#define a5 %f20 + +#define b1 %f21 +#define b2 %f22 +#define b3 %f23 +#define b4 %f24 +#define b5 %f25 +#define b6 %f26 +#define b7 %f27 +#define b8 %f28 +#define b9 %f29 + +#define cc01 0 +#define cc02 1 +#define cc03 2 +#define cc04 3 +#define cc05 4 +#define cc06 5 +#define cc07 6 +#define cc08 7 +#define cc09 8 +#define cc10 9 +#define cc11 10 +#define cc12 11 +#define cc13 12 +#define cc14 13 +#define cc15 14 +#define cc16 15 + +#define aa1 16 +#define aa2 17 +#define aa3 18 +#define aa4 19 +#define aa5 20 + +#define bb1 21 +#define bb2 22 +#define bb3 23 +#define bb4 24 +#define bb5 25 +#define bb6 26 +#define bb7 27 +#define bb8 28 +#define bb9 29 +#endif + +#ifndef CONJ +#define FMADD1 FMADD +#define FMADD2 FMADD +#define FMADD3 FMADD +#define FMADD4 FNMSUB +#else +#if defined(LN) || defined(LT) +#define FMADD1 FMADD +#define FMADD2 FNMSUB +#define FMADD3 FMADD +#define FMADD4 FMADD +#endif +#if defined(RN) || defined(RT) +#define FMADD1 FMADD +#define FMADD2 FMADD +#define FMADD3 FNMSUB +#define FMADD4 FMADD +#endif +#endif + + .register %g2, #scratch + .register %g3, #scratch + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE + ld [%sp + STACK_START + 32], A + ld [%sp + STACK_START + 36], B + ld [%sp + STACK_START + 40], C + ld [%sp + STACK_START + 44], LDC + ld [%sp + STACK_START + 48], OFFSET +#else + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC + ld [%sp + STACK_START + 40], OFFSET +#endif +#else + ldx [%sp + STACK_START + 56], B + ldx [%sp + STACK_START + 64], C + ldx [%sp + STACK_START + 72], LDC + ldx [%sp + STACK_START + 80], OFFSET +#endif + + cmp M, 0 + ble,pn %icc, .LL999 + nop + + sll LDC, ZBASE_SHIFT, LDC + +#ifdef LN + smul M, K, TEMP1 + sll TEMP1, ZBASE_SHIFT, TEMP1 + add A, TEMP1, A + + sll M, ZBASE_SHIFT, TEMP1 + add C, TEMP1, C +#endif + +#ifdef RN + neg OFFSET, KK +#endif + +#ifdef RT + smul N, K, TEMP1 + sll TEMP1, ZBASE_SHIFT, TEMP1 + add B, TEMP1, B + + smul N, LDC, TEMP1 + add C, TEMP1, C + + sub N, OFFSET, KK +#endif + + sra N, 2, J + cmp J, 0 + ble,pn %icc, .LL20 + nop + .align 4 + +.LL11: +#ifdef RT + sll K, ZBASE_SHIFT + 2, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C2 + add C2, LDC, C3 + add C3, LDC, C4 + add C4, LDC, C +#else + sub C, LDC, C4 + sub C4, LDC, C3 + sub C3, LDC, C2 + sub C2, LDC, C1 + sub C2, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + mov M, I + .align 4 + +.LL12: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 0, TEMP1 + sll KK, ZBASE_SHIFT + 2, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + FCLR (cc01) + LDF [AO + 1 * SIZE], a2 + FCLR (cc05) + LDF [AO + 8 * SIZE], a5 + FCLR (cc09) + LDF [BO + 0 * SIZE], b1 + FCLR (cc13) + + LDF [BO + 1 * SIZE], b2 + FCLR (cc02) + LDF [BO + 2 * SIZE], b3 + FCLR (cc06) + LDF [BO + 3 * SIZE], b4 + FCLR (cc10) + LDF [BO + 4 * SIZE], b5 + FCLR (cc14) + + LDF [BO + 5 * SIZE], b6 + FCLR (cc03) + LDF [BO + 6 * SIZE], b7 + FCLR (cc07) + LDF [BO + 7 * SIZE], b8 + FCLR (cc11) + LDF [BO + 8 * SIZE], b9 + FCLR (cc15) + + prefetch [C1 + 1 * SIZE], 3 + FCLR (cc04) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc08) + prefetch [C3 + 1 * SIZE], 3 + FCLR (cc12) + prefetch [C4 + 2 * SIZE], 3 + FCLR (cc16) + +#if defined(LT) || defined(RN) + sra KK, 3, L +#else + sub K, KK, L + sra L, 3, L +#endif + cmp L, 0 + ble,pn %icc, .LL15 + nop + .align 4 + +.LL13: + FMADD1 (aa1, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa1, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa1, bb3, cc05, cc05) + LDF [BO + 16 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 9 * SIZE], b2 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + LDF [AO + 2 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 3 * SIZE], a4 + + FMADD3 (aa1, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa1, bb7, cc13, cc13) + LDF [BO + 12 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 13 * SIZE], b6 + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [BO + 14 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 15 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 24 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 17 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 18 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 19 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 4 * SIZE], a1 + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 5 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + add L, -1, L + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 20 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 21 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 22 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 23 * SIZE], b8 + + FMADD1 (aa1, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa1, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa1, bb3, cc05, cc05) + LDF [BO + 32 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 25 * SIZE], b2 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 26 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 27 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + LDF [AO + 6 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 7 * SIZE], a4 + + FMADD3 (aa1, bb6, cc11, cc11) + nop + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa1, bb7, cc13, cc13) + LDF [BO + 28 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 29 * SIZE], b6 + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [BO + 30 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 31 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 40 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 33 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 34 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 35 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 16 * SIZE], a1 /****/ + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 9 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + nop + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 36 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 37 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 38 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 39 * SIZE], b8 + + FMADD1 (aa5, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa5, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa5, bb3, cc05, cc05) + LDF [BO + 48 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 41 * SIZE], b2 + + FMADD3 (aa5, bb4, cc07, cc07) + LDF [BO + 42 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 43 * SIZE], b4 + + FMADD1 (aa5, bb5, cc09, cc09) + LDF [AO + 10 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 11 * SIZE], a4 + + FMADD3 (aa5, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa5, bb7, cc13, cc13) + LDF [BO + 44 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 45 * SIZE], b6 + + FMADD3 (aa5, bb8, cc15, cc15) + LDF [BO + 46 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 47 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 56 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 49 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 50 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 51 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 12 * SIZE], a5 + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 13 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + cmp L, 0 + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 52 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 53 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 54 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 55 * SIZE], b8 + + FMADD1 (aa5, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa5, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa5, bb3, cc05, cc05) + LDF [BO + 64 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 57 * SIZE], b2 + + FMADD3 (aa5, bb4, cc07, cc07) + LDF [BO + 58 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 59 * SIZE], b4 + + FMADD1 (aa5, bb5, cc09, cc09) + LDF [AO + 14 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 15 * SIZE], a4 + + FMADD3 (aa5, bb6, cc11, cc11) + add BO, 64 * SIZE, BO + FMADD4 (aa2, bb6, cc12, cc12) + add AO, 16 * SIZE, AO + + FMADD1 (aa5, bb7, cc13, cc13) + LDF [BO - 4 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO - 3 * SIZE], b6 + + FMADD3 (aa5, bb8, cc15, cc15) + LDF [BO - 2 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO - 1 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 8 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 1 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 8 * SIZE], a5 /****/ + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 1 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + FMADD4 (aa4, bb6, cc12, cc12) + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 4 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 5 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 6 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + ble,pn %icc, .LL15 + LDF [BO + 7 * SIZE], b8 + + FMADD1 (aa1, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa1, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa1, bb3, cc05, cc05) + LDF [BO + 16 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 9 * SIZE], b2 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + LDF [AO + 2 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 3 * SIZE], a4 + + FMADD3 (aa1, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa1, bb7, cc13, cc13) + LDF [BO + 12 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 13 * SIZE], b6 + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [BO + 14 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 15 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 24 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 17 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 18 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 19 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 4 * SIZE], a1 + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 5 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + add L, -1, L + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 20 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 21 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 22 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 23 * SIZE], b8 + + FMADD1 (aa1, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa1, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa1, bb3, cc05, cc05) + LDF [BO + 32 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 25 * SIZE], b2 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 26 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 27 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + LDF [AO + 6 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 7 * SIZE], a4 + + FMADD3 (aa1, bb6, cc11, cc11) + nop + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa1, bb7, cc13, cc13) + LDF [BO + 28 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 29 * SIZE], b6 + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [BO + 30 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 31 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 40 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 33 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 34 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 35 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 16 * SIZE], a1 /****/ + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 9 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + nop + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 36 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 37 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 38 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 39 * SIZE], b8 + + FMADD1 (aa5, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa5, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa5, bb3, cc05, cc05) + LDF [BO + 48 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 41 * SIZE], b2 + + FMADD3 (aa5, bb4, cc07, cc07) + LDF [BO + 42 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 43 * SIZE], b4 + + FMADD1 (aa5, bb5, cc09, cc09) + LDF [AO + 10 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 11 * SIZE], a4 + + FMADD3 (aa5, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa5, bb7, cc13, cc13) + LDF [BO + 44 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 45 * SIZE], b6 + + FMADD3 (aa5, bb8, cc15, cc15) + LDF [BO + 46 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 47 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 56 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 49 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 50 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 51 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 12 * SIZE], a5 + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 13 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + cmp L, 0 + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 52 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 53 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 54 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 55 * SIZE], b8 + + FMADD1 (aa5, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa5, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa5, bb3, cc05, cc05) + LDF [BO + 64 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 57 * SIZE], b2 + + FMADD3 (aa5, bb4, cc07, cc07) + LDF [BO + 58 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 59 * SIZE], b4 + + FMADD1 (aa5, bb5, cc09, cc09) + LDF [AO + 14 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 15 * SIZE], a4 + + FMADD3 (aa5, bb6, cc11, cc11) + add BO, 64 * SIZE, BO + FMADD4 (aa2, bb6, cc12, cc12) + add AO, 16 * SIZE, AO + + FMADD1 (aa5, bb7, cc13, cc13) + LDF [BO - 4 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO - 3 * SIZE], b6 + + FMADD3 (aa5, bb8, cc15, cc15) + LDF [BO - 2 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO - 1 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 8 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 1 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 8 * SIZE], a5 /****/ + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 1 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + FMADD4 (aa4, bb6, cc12, cc12) + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 4 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 5 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 6 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + bg,pt %icc, .LL13 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL15: +#if defined(LT) || defined(RN) + and KK, 7, L +#else + sub K, KK, L + and L, 7, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL18 + nop + .align 4 + +.LL17: + FMADD1 (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD2 (aa2, bb1, cc02, cc02) + nop + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [BO + 8 * SIZE], b1 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [BO + 9 * SIZE], b2 + + FMADD1 (aa1, bb3, cc05, cc05) + cmp L, 0 + FMADD2 (aa2, bb3, cc06, cc06) + nop + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + nop + FMADD2 (aa2, bb5, cc10, cc10) + nop + + FMADD3 (aa1, bb6, cc11, cc11) + LDF [BO + 12 * SIZE], b5 + FMADD4 (aa2, bb6, cc12, cc12) + LDF [BO + 13 * SIZE], b6 + + FMADD1 (aa1, bb7, cc13, cc13) + add AO, 2 * SIZE, AO + FMADD2 (aa2, bb7, cc14, cc14) + add BO, 8 * SIZE, BO + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [AO + 0 * SIZE], a1 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 6 * SIZE], b7 + bg,pt %icc, .LL17 + LDF [BO + 7 * SIZE], b8 + nop + .align 4 + +.LL18: + FADD c01, c04, c01 + FADD c02, c03, c02 + FADD c05, c08, c05 + FADD c06, c07, c06 + + FADD c09, c12, c09 + FADD c10, c11, c10 + FADD c13, c16, c13 + FADD c14, c15, c14 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 4, TEMP1 +#endif + sll TEMP1, ZBASE_SHIFT + 0, TEMP2 + sll TEMP1, ZBASE_SHIFT + 2, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 +#endif + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c05, c05 + FSUB a4, c06, c06 + + FSUB b1, c09, c09 + FSUB b2, c10, c10 + FSUB b3, c13, c13 + FSUB b4, c14, c14 + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, b1 + FMUL a2, c01, b2 + FMUL a1, c05, b3 + FMUL a2, c05, b4 + FMUL a1, c09, b5 + FMUL a2, c09, b6 + FMUL a1, c13, b7 + FMUL a2, c13, b8 + +#ifndef CONJ + FNMSUB (aa2, cc02, bb1, cc01) + FMADD (aa1, cc02, bb2, cc02) + FNMSUB (aa2, cc06, bb3, cc05) + FMADD (aa1, cc06, bb4, cc06) + FNMSUB (aa2, cc10, bb5, cc09) + FMADD (aa1, cc10, bb6, cc10) + FNMSUB (aa2, cc14, bb7, cc13) + FMADD (aa1, cc14, bb8, cc14) +#else + FMADD (aa2, cc02, bb1, cc01) + FMSUB (aa1, cc02, bb2, cc02) + FMADD (aa2, cc06, bb3, cc05) + FMSUB (aa1, cc06, bb4, cc06) + FMADD (aa2, cc10, bb5, cc09) + FMSUB (aa1, cc10, bb6, cc10) + FMADD (aa2, cc14, bb7, cc13) + FMSUB (aa1, cc14, bb8, cc14) +#endif +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + LDF [BO + 5 * SIZE], b6 + LDF [BO + 6 * SIZE], b7 + LDF [BO + 7 * SIZE], b8 + + FMUL b1, c01, a1 + FMUL b2, c01, a2 + +#ifndef CONJ + FNMSUB (bb2, cc02, aa1, cc01) + FMADD (bb1, cc02, aa2, cc02) +#else + FMADD (bb2, cc02, aa1, cc01) + FMSUB (bb1, cc02, aa2, cc02) +#endif + + FNMSUB (bb3, cc01, cc05, cc05) + FNMSUB (bb3, cc02, cc06, cc06) + FNMSUB (bb5, cc01, cc09, cc09) + FNMSUB (bb5, cc02, cc10, cc10) + FNMSUB (bb7, cc01, cc13, cc13) + FNMSUB (bb7, cc02, cc14, cc14) + +#ifndef CONJ + FMADD (bb4, cc02, cc05, cc05) + FNMSUB (bb4, cc01, cc06, cc06) + FMADD (bb6, cc02, cc09, cc09) + FNMSUB (bb6, cc01, cc10, cc10) + FMADD (bb8, cc02, cc13, cc13) + FNMSUB (bb8, cc01, cc14, cc14) +#else + FNMSUB (bb4, cc02, cc05, cc05) + FMADD (bb4, cc01, cc06, cc06) + FNMSUB (bb6, cc02, cc09, cc09) + FMADD (bb6, cc01, cc10, cc10) + FNMSUB (bb8, cc02, cc13, cc13) + FMADD (bb8, cc01, cc14, cc14) +#endif + + LDF [BO + 10 * SIZE], b1 + LDF [BO + 11 * SIZE], b2 + LDF [BO + 12 * SIZE], b3 + LDF [BO + 13 * SIZE], b4 + LDF [BO + 14 * SIZE], b5 + LDF [BO + 15 * SIZE], b6 + + FMUL b1, c05, a1 + FMUL b2, c05, a2 + +#ifndef CONJ + FNMSUB (bb2, cc06, aa1, cc05) + FMADD (bb1, cc06, aa2, cc06) +#else + FMADD (bb2, cc06, aa1, cc05) + FMSUB (bb1, cc06, aa2, cc06) +#endif + + FNMSUB (bb3, cc05, cc09, cc09) + FNMSUB (bb3, cc06, cc10, cc10) + FNMSUB (bb5, cc05, cc13, cc13) + FNMSUB (bb5, cc06, cc14, cc14) + +#ifndef CONJ + FMADD (bb4, cc06, cc09, cc09) + FNMSUB (bb4, cc05, cc10, cc10) + FMADD (bb6, cc06, cc13, cc13) + FNMSUB (bb6, cc05, cc14, cc14) +#else + FNMSUB (bb4, cc06, cc09, cc09) + FMADD (bb4, cc05, cc10, cc10) + FNMSUB (bb6, cc06, cc13, cc13) + FMADD (bb6, cc05, cc14, cc14) +#endif + + LDF [BO + 20 * SIZE], b1 + LDF [BO + 21 * SIZE], b2 + LDF [BO + 22 * SIZE], b3 + LDF [BO + 23 * SIZE], b4 + + FMUL b1, c09, a1 + FMUL b2, c09, a2 + +#ifndef CONJ + FNMSUB (bb2, cc10, aa1, cc09) + FMADD (bb1, cc10, aa2, cc10) +#else + FMADD (bb2, cc10, aa1, cc09) + FMSUB (bb1, cc10, aa2, cc10) +#endif + + FNMSUB (bb3, cc09, cc13, cc13) + FNMSUB (bb3, cc10, cc14, cc14) + +#ifndef CONJ + FMADD (bb4, cc10, cc13, cc13) + FNMSUB (bb4, cc09, cc14, cc14) +#else + FNMSUB (bb4, cc10, cc13, cc13) + FMADD (bb4, cc09, cc14, cc14) +#endif + + LDF [BO + 30 * SIZE], b1 + LDF [BO + 31 * SIZE], b2 + + FMUL b1, c13, a1 + FMUL b2, c13, a2 + +#ifndef CONJ + FNMSUB (bb2, cc14, aa1, cc13) + FMADD (bb1, cc14, aa2, cc14) +#else + FMADD (bb2, cc14, aa1, cc13) + FMSUB (bb1, cc14, aa2, cc14) +#endif +#endif + +#ifdef RT + LDF [BO + 30 * SIZE], b1 + LDF [BO + 31 * SIZE], b2 + LDF [BO + 28 * SIZE], b3 + LDF [BO + 29 * SIZE], b4 + LDF [BO + 26 * SIZE], b5 + LDF [BO + 27 * SIZE], b6 + LDF [BO + 24 * SIZE], b7 + LDF [BO + 25 * SIZE], b8 + + FMUL b1, c13, a1 + FMUL b2, c13, a2 + +#ifndef CONJ + FNMSUB (bb2, cc14, aa1, cc13) + FMADD (bb1, cc14, aa2, cc14) +#else + FMADD (bb2, cc14, aa1, cc13) + FMSUB (bb1, cc14, aa2, cc14) +#endif + + FNMSUB (bb3, cc13, cc09, cc09) + FNMSUB (bb3, cc14, cc10, cc10) + FNMSUB (bb5, cc13, cc05, cc05) + FNMSUB (bb5, cc14, cc06, cc06) + FNMSUB (bb7, cc13, cc01, cc01) + FNMSUB (bb7, cc14, cc02, cc02) + +#ifndef CONJ + FMADD (bb4, cc14, cc09, cc09) + FNMSUB (bb4, cc13, cc10, cc10) + FMADD (bb6, cc14, cc05, cc05) + FNMSUB (bb6, cc13, cc06, cc06) + FMADD (bb8, cc14, cc01, cc01) + FNMSUB (bb8, cc13, cc02, cc02) +#else + FNMSUB (bb4, cc14, cc09, cc09) + FMADD (bb4, cc13, cc10, cc10) + FNMSUB (bb6, cc14, cc05, cc05) + FMADD (bb6, cc13, cc06, cc06) + FNMSUB (bb8, cc14, cc01, cc01) + FMADD (bb8, cc13, cc02, cc02) +#endif + + LDF [BO + 20 * SIZE], b1 + LDF [BO + 21 * SIZE], b2 + LDF [BO + 18 * SIZE], b3 + LDF [BO + 19 * SIZE], b4 + LDF [BO + 16 * SIZE], b5 + LDF [BO + 17 * SIZE], b6 + + FMUL b1, c09, a1 + FMUL b2, c09, a2 + +#ifndef CONJ + FNMSUB (bb2, cc10, aa1, cc09) + FMADD (bb1, cc10, aa2, cc10) +#else + FMADD (bb2, cc10, aa1, cc09) + FMSUB (bb1, cc10, aa2, cc10) +#endif + + FNMSUB (bb3, cc09, cc05, cc05) + FNMSUB (bb3, cc10, cc06, cc06) + FNMSUB (bb5, cc09, cc01, cc01) + FNMSUB (bb5, cc10, cc02, cc02) + +#ifndef CONJ + FMADD (bb4, cc10, cc05, cc05) + FNMSUB (bb4, cc09, cc06, cc06) + FMADD (bb6, cc10, cc01, cc01) + FNMSUB (bb6, cc09, cc02, cc02) +#else + FNMSUB (bb4, cc10, cc05, cc05) + FMADD (bb4, cc09, cc06, cc06) + FNMSUB (bb6, cc10, cc01, cc01) + FMADD (bb6, cc09, cc02, cc02) +#endif + + LDF [BO + 10 * SIZE], b1 + LDF [BO + 11 * SIZE], b2 + LDF [BO + 8 * SIZE], b3 + LDF [BO + 9 * SIZE], b4 + + FMUL b1, c05, a1 + FMUL b2, c05, a2 + +#ifndef CONJ + FNMSUB (bb2, cc06, aa1, cc05) + FMADD (bb1, cc06, aa2, cc06) +#else + FMADD (bb2, cc06, aa1, cc05) + FMSUB (bb1, cc06, aa2, cc06) +#endif + + FNMSUB (bb3, cc05, cc01, cc01) + FNMSUB (bb3, cc06, cc02, cc02) + +#ifndef CONJ + FMADD (bb4, cc06, cc01, cc01) + FNMSUB (bb4, cc05, cc02, cc02) +#else + FNMSUB (bb4, cc06, cc01, cc01) + FMADD (bb4, cc05, cc02, cc02) +#endif + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + + FMUL b1, c01, a1 + FMUL b2, c01, a2 + +#ifndef CONJ + FNMSUB (bb2, cc02, aa1, cc01) + FMADD (bb1, cc02, aa2, cc02) +#else + FMADD (bb2, cc02, aa1, cc01) + FMSUB (bb1, cc02, aa2, cc02) +#endif +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 + add C3, -2 * SIZE, C3 + add C4, -2 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c06, [BO + 3 * SIZE] + + STF c09, [BO + 4 * SIZE] + STF c10, [BO + 5 * SIZE] + STF c13, [BO + 6 * SIZE] + STF c14, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c05, [AO + 2 * SIZE] + STF c06, [AO + 3 * SIZE] + + STF c09, [AO + 4 * SIZE] + STF c10, [AO + 5 * SIZE] + STF c13, [AO + 6 * SIZE] + STF c14, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + + STF c09, [C3 + 0 * SIZE] + STF c10, [C3 + 1 * SIZE] + STF c13, [C4 + 0 * SIZE] + STF c14, [C4 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 + add C3, 2 * SIZE, C3 + add C4, 2 * SIZE, C4 +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, ZBASE_SHIFT + 0, TEMP2 + sll TEMP1, ZBASE_SHIFT + 2, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL12 + nop + +#ifdef LN + sll K, ZBASE_SHIFT + 2, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 4, KK +#endif + +#ifdef RT + sub KK, 4, KK +#endif + + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + .align 4 + +.LL20: + and N, 2, J + cmp J, 0 + ble,pn %icc, .LL30 + nop + +#ifdef RT + sll K, ZBASE_SHIFT + 1, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C2 + add C2, LDC, C +#else + sub C, LDC, C2 + sub C2, LDC, C1 + sub C2, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + mov M, I + .align 4 + +.LL22: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 0, TEMP1 + sll KK, ZBASE_SHIFT + 1, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + FCLR (cc01) + + LDF [BO + 5 * SIZE], b6 + FCLR (cc02) + LDF [BO + 6 * SIZE], b7 + FCLR (cc03) + LDF [BO + 7 * SIZE], b8 + FCLR (cc04) + LDF [BO + 8 * SIZE], b9 + FCLR (cc05) + + prefetch [C1 + 2 * SIZE], 3 + FCLR (cc06) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc07) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL25 + FCLR (cc08) + .align 4 + +.LL23: + FMADD1 (aa1, bb1, cc01, cc01) + LDF [AO + 2 * SIZE], a3 + FMADD2 (aa2, bb1, cc02, cc02) + LDF [AO + 3 * SIZE], a4 + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [BO + 16 * SIZE], b1 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [BO + 9 * SIZE], b2 + + FMADD1 (aa1, bb3, cc05, cc05) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD2 (aa2, bb3, cc06, cc06) + add L, -1, L + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD1 (aa3, bb5, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + FMADD2 (aa4, bb5, cc02, cc02) + LDF [AO + 5 * SIZE], a2 + + FMADD3 (aa3, bb6, cc03, cc03) + LDF [BO + 12 * SIZE], b5 + FMADD4 (aa4, bb6, cc04, cc04) + LDF [BO + 13 * SIZE], b6 + + FMADD1 (aa3, bb7, cc05, cc05) + cmp L, 0 + FMADD2 (aa4, bb7, cc06, cc06) + add AO, 8 * SIZE, AO + + FMADD3 (aa3, bb8, cc07, cc07) + LDF [BO + 14 * SIZE], b7 + FMADD4 (aa4, bb8, cc08, cc08) + LDF [BO + 15 * SIZE], b8 + + FMADD1 (aa1, bb9, cc01, cc01) + LDF [AO - 2 * SIZE], a3 + FMADD2 (aa2, bb9, cc02, cc02) + LDF [AO - 1 * SIZE], a4 + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [BO + 24 * SIZE], b9 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [BO + 17 * SIZE], b2 + + FMADD1 (aa1, bb3, cc05, cc05) + add BO, 16 * SIZE, BO + FMADD2 (aa2, bb3, cc06, cc06) + nop + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD1 (aa3, bb5, cc01, cc01) + LDF [AO + 0 * SIZE], a1 + FMADD2 (aa4, bb5, cc02, cc02) + LDF [AO + 1 * SIZE], a2 + FMADD3 (aa3, bb6, cc03, cc03) + LDF [BO + 4 * SIZE], b5 + FMADD4 (aa4, bb6, cc04, cc04) + LDF [BO + 5 * SIZE], b6 + + FMADD1 (aa3, bb7, cc05, cc05) + nop + FMADD2 (aa4, bb7, cc06, cc06) + LDF [BO + 6 * SIZE], b7 + + FMADD3 (aa3, bb8, cc07, cc07) + FMADD4 (aa4, bb8, cc08, cc08) + bg,pt %icc, .LL23 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL25: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL28 + nop + .align 4 + +.LL27: + FMADD1 (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD2 (aa2, bb1, cc02, cc02) + LDF [BO + 4 * SIZE], b1 + + FMADD3 (aa1, bb2, cc03, cc03) + add AO, 2 * SIZE, AO + FMADD4 (aa2, bb2, cc04, cc04) + LDF [BO + 5 * SIZE], b2 + + FMADD1 (aa1, bb3, cc05, cc05) + cmp L, 0 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 6 * SIZE], b3 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [AO + 0 * SIZE], a1 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 7 * SIZE], b4 + bg,pt %icc, .LL27 + add BO, 4 * SIZE, BO + .align 4 + +.LL28: + FADD c01, c04, c01 + FADD c02, c03, c02 + FADD c05, c08, c05 + FADD c06, c07, c06 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, ZBASE_SHIFT + 0, TEMP2 + sll TEMP1, ZBASE_SHIFT + 1, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 +#endif + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c05, c05 + FSUB a4, c06, c06 + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, b1 + FMUL a2, c01, b2 + FMUL a1, c05, b3 + FMUL a2, c05, b4 + +#ifndef CONJ + FNMSUB (aa2, cc02, bb1, cc01) + FMADD (aa1, cc02, bb2, cc02) + FNMSUB (aa2, cc06, bb3, cc05) + FMADD (aa1, cc06, bb4, cc06) +#else + FMADD (aa2, cc02, bb1, cc01) + FMSUB (aa1, cc02, bb2, cc02) + FMADD (aa2, cc06, bb3, cc05) + FMSUB (aa1, cc06, bb4, cc06) +#endif +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + + FMUL b1, c01, a1 + FMUL b2, c01, a2 + +#ifndef CONJ + FNMSUB (bb2, cc02, aa1, cc01) + FMADD (bb1, cc02, aa2, cc02) +#else + FMADD (bb2, cc02, aa1, cc01) + FMSUB (bb1, cc02, aa2, cc02) +#endif + + FNMSUB (bb3, cc01, cc05, cc05) + FNMSUB (bb3, cc02, cc06, cc06) + +#ifndef CONJ + FMADD (bb4, cc02, cc05, cc05) + FNMSUB (bb4, cc01, cc06, cc06) +#else + FNMSUB (bb4, cc02, cc05, cc05) + FMADD (bb4, cc01, cc06, cc06) +#endif + + LDF [BO + 6 * SIZE], b1 + LDF [BO + 7 * SIZE], b2 + + FMUL b1, c05, a1 + FMUL b2, c05, a2 + +#ifndef CONJ + FNMSUB (bb2, cc06, aa1, cc05) + FMADD (bb1, cc06, aa2, cc06) +#else + FMADD (bb2, cc06, aa1, cc05) + FMSUB (bb1, cc06, aa2, cc06) +#endif +#endif + +#ifdef RT + LDF [BO + 6 * SIZE], b1 + LDF [BO + 7 * SIZE], b2 + LDF [BO + 4 * SIZE], b3 + LDF [BO + 5 * SIZE], b4 + + FMUL b1, c05, a1 + FMUL b2, c05, a2 + +#ifndef CONJ + FNMSUB (bb2, cc06, aa1, cc05) + FMADD (bb1, cc06, aa2, cc06) +#else + FMADD (bb2, cc06, aa1, cc05) + FMSUB (bb1, cc06, aa2, cc06) +#endif + + FNMSUB (bb3, cc05, cc01, cc01) + FNMSUB (bb3, cc06, cc02, cc02) + +#ifndef CONJ + FMADD (bb4, cc06, cc01, cc01) + FNMSUB (bb4, cc05, cc02, cc02) +#else + FNMSUB (bb4, cc06, cc01, cc01) + FMADD (bb4, cc05, cc02, cc02) +#endif + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + + FMUL b1, c01, a1 + FMUL b2, c01, a2 + +#ifndef CONJ + FNMSUB (bb2, cc02, aa1, cc01) + FMADD (bb1, cc02, aa2, cc02) +#else + FMADD (bb2, cc02, aa1, cc01) + FMSUB (bb1, cc02, aa2, cc02) +#endif +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c06, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c05, [AO + 2 * SIZE] + STF c06, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, ZBASE_SHIFT + 0, TEMP2 + sll TEMP1, ZBASE_SHIFT + 1, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL22 + nop + +#ifdef LN + sll K, ZBASE_SHIFT + 1, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 2, KK +#endif + +#ifdef RT + sub KK, 2, KK +#endif + .align 4 + +.LL30: + and N, 1, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + +#ifdef RT + sll K, ZBASE_SHIFT, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C +#else + sub C, LDC, C1 + sub C, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + mov M, I + .align 4 + +.LL32: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 0, TEMP1 + + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + FCLR (cc01) + LDF [BO + 3 * SIZE], b4 + FCLR (cc02) + + LDF [BO + 4 * SIZE], b5 + FCLR (cc03) + LDF [BO + 5 * SIZE], b6 + FCLR (cc04) + LDF [BO + 6 * SIZE], b7 + FCLR (cc05) + LDF [BO + 7 * SIZE], b8 + FCLR (cc06) + + prefetch [C1 + 2 * SIZE], 3 + FCLR (cc07) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL35 + FCLR (cc08) + .align 4 + +.LL33: + FMADD1 (aa1, bb1, cc01, cc01) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD2 (aa2, bb1, cc02, cc02) + LDF [BO + 8 * SIZE], b1 + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [AO + 4 * SIZE], a1 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [AO + 5 * SIZE], a2 + + FMADD1 (aa3, bb3, cc01, cc01) + LDF [BO + 9 * SIZE], b2 + FMADD2 (aa4, bb3, cc02, cc02) + LDF [BO + 10 * SIZE], b3 + + FMADD3 (aa3, bb4, cc03, cc03) + LDF [AO + 6 * SIZE], a3 + FMADD4 (aa4, bb4, cc04, cc04) + LDF [AO + 7 * SIZE], a4 + + FMADD1 (aa1, bb5, cc01, cc01) + LDF [BO + 11 * SIZE], b4 + FMADD2 (aa2, bb5, cc02, cc02) + LDF [BO + 12 * SIZE], b5 + + FMADD3 (aa1, bb6, cc03, cc03) + LDF [AO + 8 * SIZE], a1 + FMADD4 (aa2, bb6, cc04, cc04) + LDF [AO + 9 * SIZE], a2 + + FMADD1 (aa3, bb7, cc01, cc01) + LDF [BO + 13 * SIZE], b6 + + FMADD2 (aa4, bb7, cc02, cc02) + LDF [BO + 14 * SIZE], b7 + + FMADD3 (aa3, bb8, cc03, cc03) + LDF [AO + 10 * SIZE], a3 + FMADD4 (aa4, bb8, cc04, cc04) + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + add L, -1, L + add BO, 8 * SIZE, BO + cmp L, 0 + + bg,pt %icc, .LL33 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL35: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL38 + nop + .align 4 + +.LL37: + FMADD1 (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD2 (aa2, bb1, cc02, cc02) + LDF [BO + 2 * SIZE], b1 + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [AO + 2 * SIZE], a1 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [AO + 3 * SIZE], a2 + + add AO, 2 * SIZE, AO + cmp L, 0 + add BO, 2 * SIZE, BO + bg,pt %icc, .LL37 + LDF [BO + 1 * SIZE], b2 + .align 4 + +.LL38: + FADD c01, c04, c01 + FADD c02, c03, c02 + +#if defined(LN) || defined(RT) + sub KK, 1, TEMP1 + + sll TEMP1, ZBASE_SHIFT, TEMP1 + + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 +#endif + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 +#else + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 +#endif + + FMUL a1, c01, b1 + FMUL a2, c01, b2 + +#ifndef CONJ + FNMSUB (aa2, cc02, bb1, cc01) + FMADD (aa1, cc02, bb2, cc02) +#else + FMADD (aa2, cc02, bb1, cc01) + FMSUB (aa1, cc02, bb2, cc02) +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, ZBASE_SHIFT, TEMP1 + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL32 + nop + +#ifdef LN + sll K, ZBASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 1, KK +#endif + +#ifdef RT + sub KK, 1, KK +#endif + .align 4 + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/ztrsm_kernel_RT.S b/kernel/sparc/ztrsm_kernel_RT.S new file mode 100644 index 0000000..2949e48 --- /dev/null +++ b/kernel/sparc/ztrsm_kernel_RT.S @@ -0,0 +1,2389 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 +#define K %i2 +#define A %i5 +#define B %i3 +#define C %i4 + +#define LDC %o0 +#define AO %o1 +#define BO %o2 +#define I %o3 +#define J %o4 +#define L %o5 + +#define C1 %l0 +#define C2 %l1 + +#define OFFSET %l2 +#define KK %l3 +#define TEMP1 %l4 +#define TEMP2 %l5 +#define AORIG %l6 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define t1 %f32 +#define t2 %f34 +#define t3 %f36 +#define t4 %f38 + +#define a1 %f40 +#define a2 %f42 +#define a3 %f44 +#define a4 %f46 +#define a5 %f62 + +#define b1 %f48 +#define b2 %f50 +#define b3 %f52 +#define b4 %f54 +#define b5 %f56 + +#define FZERO %f58 + +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define t1 %f16 +#define t2 %f17 +#define t3 %f18 +#define t4 %f19 + +#define a1 %f20 +#define a2 %f21 +#define a3 %f22 +#define a4 %f23 +#define a5 %f31 + +#define b1 %f24 +#define b2 %f25 +#define b3 %f26 +#define b4 %f27 +#define b5 %f28 + +#define FZERO %f29 +#endif + +#define t5 c13 +#define t6 c14 +#define t7 c15 +#define t8 c16 + +#ifndef CONJ +#define FADD1 FADD +#define FADD2 FADD +#define FADD3 FADD +#define FADD4 FSUB +#else + +#if defined(LN) || defined(LT) +#define FADD1 FADD +#define FADD2 FSUB +#define FADD3 FADD +#define FADD4 FADD +#endif + +#if defined(RN) || defined(RT) +#define FADD1 FADD +#define FADD2 FADD +#define FADD3 FSUB +#define FADD4 FADD +#endif +#endif + +#define APREFETCHSIZE 40 +#define BPREFETCHSIZE 40 + +#define APREFETCH_CATEGORY 0 +#define BPREFETCH_CATEGORY 0 + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE + ld [%sp + STACK_START + 32], A + ld [%sp + STACK_START + 36], B + ld [%sp + STACK_START + 40], C + ld [%sp + STACK_START + 44], LDC + ld [%sp + STACK_START + 48], OFFSET +#else + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC + ld [%sp + STACK_START + 40], OFFSET +#endif +#else + ldx [%sp+ STACK_START + 56], B + ldx [%sp+ STACK_START + 64], C + ldx [%sp+ STACK_START + 72], LDC + ldx [%sp+ STACK_START + 80], OFFSET +#endif + +#ifdef DOUBLE + FCLR(27) +#else + FCLR(29) +#endif + + sll LDC, ZBASE_SHIFT, LDC + +#ifdef LN + smul M, K, TEMP1 + sll TEMP1, ZBASE_SHIFT, TEMP1 + add A, TEMP1, A + + sll M, ZBASE_SHIFT, TEMP1 + add C, TEMP1, C +#endif + +#ifdef RN + neg OFFSET, KK +#endif + +#ifdef RT + smul N, K, TEMP1 + sll TEMP1, ZBASE_SHIFT, TEMP1 + add B, TEMP1, B + + smul N, LDC, TEMP1 + add C, TEMP1, C + + sub N, OFFSET, KK +#endif + + and N, 1, J + + cmp J, 0 + ble,pn %icc, .LL100 + nop + +#ifdef RT + sll K, 0 + ZBASE_SHIFT, TEMP1 + sub B, TEMP1, B + + sub C, LDC, C +#endif + + mov C, C1 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + +#ifndef RT + add C, LDC, C +#endif + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL150 + FMOV FZERO, c03 + +.LL121: +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + ZBASE_SHIFT, TEMP1 + sll KK, 0 + ZBASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + FMOV FZERO, c03 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t1 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c07 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t2 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c04 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t3 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, t4 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c01 + + prefetch [C1 + 3 * SIZE], 3 + FMOV FZERO, c05 + FMOV FZERO, c02 + + ble,pn %icc, .LL125 + FMOV FZERO, c06 + +.LL122: + FADD1 c03, t1, c03 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD3 c07, t2, c07 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD2 c04, t3, c04 + add AO, 16 * SIZE, AO + FMUL a2, b1, t3 + cmp L, 0 + + FADD4 c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 11 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 10 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD4 c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO - 3 * SIZE], b2 + + FADD1 c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 9 * SIZE], a4 + + FADD3 c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO - 8 * SIZE], a1 + + FADD2 c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD4 c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO - 6 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD4 c06, t4, c06 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD1 c03, t1, c03 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD3 c07, t2, c07 + nop + FMUL a1, b2, t2 + LDF [AO - 4 * SIZE], a1 + + FADD2 c04, t3, c04 + nop + FMUL a2, b1, t3 + nop + + FADD4 c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 3 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 2 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD4 c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + + FADD1 c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 1 * SIZE], a4 + + FADD3 c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO + 0 * SIZE], a1 + + FADD2 c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD4 c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO + 2 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD4 c06, t4, c06 + FMUL a4, b4, t4 + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL122 + LDF [BO + 3 * SIZE], b4 + +.LL125: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL129 + nop + +.LL126: + FADD1 c03, t1, c03 + add AO, 4 * SIZE, AO + FMUL a1, b1, t1 + add BO, 2 * SIZE, BO + + FADD3 c07, t2, c07 + add L, -1, L + FMUL a1, b2, t2 + LDF [AO + 0 * SIZE], a1 + + FADD2 c04, t3, c04 + cmp L, 0 + FMUL a2, b1, t3 + + FADD4 c08, t4, c08 + FMUL a2, b2, t4 + LDF [AO + 1 * SIZE], a2 + + FADD1 c01, t1, c01 + FMUL a3, b1, t1 + FADD3 c05, t2, c05 + FMUL a3, b2, t2 + LDF [AO + 2 * SIZE], a3 + + FADD2 c02, t3, c02 + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + FADD4 c06, t4, c06 + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + bg,pt %icc, .LL126 + LDF [AO + 3 * SIZE], a4 + +.LL129: + FADD1 c03, t1, c03 + FADD3 c07, t2, c07 + FADD2 c04, t3, c04 + FADD4 c08, t4, c08 + + FADD c01, c06, c01 + FADD c02, c05, c02 + FADD c03, c08, c03 + FADD c04, c07, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, 1 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 6 * SIZE], a1 + LDF [AO + 7 * SIZE], a2 + LDF [AO + 4 * SIZE], a3 + LDF [AO + 5 * SIZE], a4 + LDF [AO + 0 * SIZE], b1 + LDF [AO + 1 * SIZE], b2 + + FMUL a1, c03, t1 + FMUL a2, c04, t2 + FMUL a1, c04, t3 + FMUL a2, c03, t4 + + FADD4 t1, t2, c03 + FADD2 t3, t4, c04 + + FMUL a3, c03, t1 + FMUL a3, c04, t2 + + FMUL a4, c04, t5 + FMUL a4, c03, t6 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + + FADD2 c01, t5, c01 + FADD4 c02, t6, c02 + + FMUL b1, c01, t1 + FMUL b2, c02, t2 + FMUL b1, c02, t3 + FMUL b2, c01, t4 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + LDF [AO + 6 * SIZE], b1 + LDF [AO + 7 * SIZE], b2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a4, c02, t5 + FMUL a4, c01, t6 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + FADD2 c03, t5, c03 + FADD4 c04, t6, c04 + + FMUL b1, c03, t1 + FMUL b2, c04, t2 + FMUL b1, c04, t3 + FMUL b2, c03, t4 + + FADD4 t1, t2, c03 + FADD2 t3, t4, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c03, t5 + FMUL a2, c04, t6 + FMUL a1, c04, t7 + FMUL a2, c03, t8 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + FADD4 t5, t6, c03 + FADD3 t7, t8, c04 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c03, t5 + FMUL a2, c04, t6 + FMUL a1, c04, t7 + FMUL a2, c03, t8 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + FADD4 t5, t6, c03 + FADD3 t7, t8, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c03, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 4 * SIZE, C1 +#endif + +#ifdef RT + sll K, 1 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL121 + FMOV FZERO, c03 + +.LL150: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL199 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO + + sub K, KK, TEMP1 + + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + + ble,pn %icc, .LL155 + nop + +.LL152: + FADD1 c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD3 c02, t2, c02 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD2 c03, t3, c03 + cmp L, 0 + FMUL a2, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD4 c04, t4, c04 + nop + FMUL a2, b2, t4 + LDF [AO + 5 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b3, t1 + LDF [BO - 3 * SIZE], b2 + + FADD3 c02, t2, c02 + nop + FMUL a3, b4, t2 + LDF [AO + 6 * SIZE], a3 + + FADD2 c03, t3, c03 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD4 c04, t4, c04 + nop + FMUL a4, b4, t4 + LDF [AO + 7 * SIZE], a4 + + FADD1 c01, t1, c01 + nop + FMUL a1, b1, t1 + LDF [BO - 1 * SIZE], b4 + + FADD3 c02, t2, c02 + FMUL a1, b2, t2 + LDF [AO + 8 * SIZE], a1 + + FADD2 c03, t3, c03 + FMUL a2, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD4 c04, t4, c04 + FMUL a2, b2, t4 + LDF [AO + 9 * SIZE], a2 + + FADD1 c01, t1, c01 + FMUL a3, b3, t1 + LDF [BO + 1 * SIZE], b2 + + FADD3 c02, t2, c02 + FMUL a3, b4, t2 + LDF [AO + 10 * SIZE], a3 + + FADD2 c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD4 c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + bg,pt %icc, .LL152 + LDF [BO + 3 * SIZE], b4 + +.LL155: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL159 + nop + +.LL156: + FADD1 c01, t1, c01 + add AO, 2 * SIZE, AO + FMUL a1, b1, t1 + add BO, 2 * SIZE, BO + FADD3 c02, t2, c02 + add L, -1, L + FMUL a1, b2, t2 + LDF [AO + 0 * SIZE], a1 + FADD2 c03, t3, c03 + FMUL a2, b1, t3 + LDF [BO + 0 * SIZE], b1 + cmp L, 0 + FADD4 c04, t4, c04 + FMUL a2, b2, t4 + LDF [BO + 1 * SIZE], b2 + + bg,pt %icc, .LL156 + LDF [AO + 1 * SIZE], a2 + +.LL159: + FADD1 c01, t1, c01 + FADD3 c02, t2, c02 + FADD2 c03, t3, c03 + FADD4 c04, t4, c04 + + FADD c01, c04, c01 + FADD c02, c03, c02 + +#if defined(LN) || defined(RT) + sub KK, 1, TEMP1 + + sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 +#endif + +#ifdef RT + sll K, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + +.LL199: +#ifdef LN + sll K, 0 + ZBASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 1, KK +#endif + +#ifdef RT + sub KK, 1, KK +#endif + +.LL100: + sra N, 1, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + +.LL11: +#ifdef RT + sll K, 1 + ZBASE_SHIFT, TEMP1 + sub B, TEMP1, B + + add LDC, LDC, TEMP1 + sub C, TEMP1, C +#endif + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + + sra M, 1, I + mov C, C1 + add C, LDC, C2 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + cmp I, 0 +#ifndef RT + add C2, LDC, C +#endif + ble,pn %icc, .LL50 + FMOV FZERO, t4 + + +.LL21: +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + ZBASE_SHIFT, TEMP1 + + add AORIG, TEMP1, AO + add B, TEMP1, BO + + sub K, KK, TEMP1 + + sra TEMP1, 2, L + cmp L, 0 +#endif + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + + FMOV FZERO, c01 + FMOV FZERO, c02 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c03 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c04 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c05 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c06 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c07 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c09 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c10 + + LDF [BO + 4 * SIZE], b5 + FMOV FZERO, c11 + LDF [AO + 4 * SIZE], a5 + FMOV FZERO, c12 + + prefetch [C1 + 3 * SIZE], 3 + FMOV FZERO, c13 + prefetch [C2 + 3 * SIZE], 3 + FMOV FZERO, c14 + + FMOV FZERO, c15 + ble,pn %icc, .LL25 + FMOV FZERO, c16 + +.LL22: + FADD2 c04, t1, c04 + prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY + FMUL a1, b1, t1 + nop + + FADD4 c08, t2, c08 + prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY + FMUL a1, b2, t2 + add AO, 16 * SIZE, AO + + FADD2 c12, t3, c12 + LDF [AO - 13 * SIZE], a4 + FMUL a1, b3, t3 + add BO, 16 * SIZE, BO + + FADD4 c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 8 * SIZE], a1 + + FADD1 c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + add L, -1, L + FMUL a2, b4, t4 + LDF [AO - 11 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 10 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 10 * SIZE], b3 + + FADD3 c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD2 c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 9 * SIZE], a4 + + FADD4 c08, t2, c08 + nop + FMUL a5, b2, t2 + nop + + FADD2 c12, t3, c12 + nop + FMUL a5, b3, t3 + nop + + FADD4 c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO - 4 * SIZE], a5 + + FADD1 c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 6 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b5, t1 + LDF [BO - 4 * SIZE], b5 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + + FADD3 c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD2 c04, t1, c04 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD4 c08, t2, c08 + nop + FMUL a1, b2, t2 + nop + + FADD2 c12, t3, c12 + nop + FMUL a1, b3, t3 + nop + + FADD4 c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 0 * SIZE], a1 + + FADD1 c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + +#ifdef DOUBLE + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY +#else + nop +#endif + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + nop + + FADD2 c02, t1, c02 + nop + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + + FADD4 c06, t2, c06 +#ifdef DOUBLE + prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY +#else + nop +#endif + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 0 * SIZE], b1 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD3 c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD2 c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 1 * SIZE], a4 + + FADD4 c08, t2, c08 + FMUL a5, b2, t2 + FADD2 c12, t3, c12 + FMUL a5, b3, t3 + + FADD4 c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO + 4 * SIZE], a5 + + FADD1 c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD1 c03, t1, c03 + cmp L, 0 + FMUL a4, b5, t1 + LDF [BO + 4 * SIZE], b5 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD3 c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL22 + LDF [BO + 3 * SIZE], b4 + +.LL25: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,pn %icc, .LL29 + nop + +.LL26: + FADD2 c04, t1, c04 + LDF [AO + 3 * SIZE], a4 + FMUL a1, b1, t1 + add AO, 4 * SIZE, AO + + FADD4 c08, t2, c08 + add BO, 4 * SIZE, BO + FMUL a1, b2, t2 + add L, -1, L + + FADD2 c12, t3, c12 + nop + FMUL a1, b3, t3 + cmp L, 0 + + FADD4 c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD1 c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD3 c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL26 + LDF [BO + 3 * SIZE], b4 + +.LL29: +#if defined(LN) || defined(RT) + sub KK, 2, TEMP1 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + + FADD2 c04, t1, c04 + FADD4 c08, t2, c08 + FADD2 c12, t3, c12 + FADD4 c16, t4, c16 + + FADD c01, c06, c01 + FADD c02, c05, c02 + FADD c03, c08, c03 + FADD c04, c07, c04 + + FADD c09, c14, c09 + FADD c10, c13, c10 + FADD c11, c16, c11 + FADD c12, c15, c12 + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c09, c09 + FSUB a4, c10, c10 + + FSUB b1, c03, c03 + FSUB b2, c04, c04 + FSUB b3, c11, c11 + FSUB b4, c12, c12 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c09, c09 + FSUB b2, c10, c10 + FSUB b3, c11, c11 + FSUB b4, c12, c12 +#endif + +#ifdef LN + LDF [AO + 6 * SIZE], a1 + LDF [AO + 7 * SIZE], a2 + LDF [AO + 4 * SIZE], a3 + LDF [AO + 5 * SIZE], a4 + LDF [AO + 0 * SIZE], b1 + LDF [AO + 1 * SIZE], b2 + + FMUL a1, c03, t1 + FMUL a2, c04, t2 + FMUL a1, c04, t3 + FMUL a2, c03, t4 + + FMUL a1, c11, t5 + FMUL a2, c12, t6 + FMUL a1, c12, t7 + FMUL a2, c11, t8 + + FADD4 t1, t2, c03 + FADD2 t3, t4, c04 + FADD4 t5, t6, c11 + FADD2 t7, t8, c12 + + FMUL a3, c03, t1 + FMUL a3, c04, t2 + FMUL a3, c11, t3 + FMUL a3, c12, t4 + + FMUL a4, c04, t5 + FMUL a4, c03, t6 + FMUL a4, c12, t7 + FMUL a4, c11, t8 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c09, t3, c09 + FSUB c10, t4, c10 + + FADD2 c01, t5, c01 + FADD4 c02, t6, c02 + FADD2 c09, t7, c09 + FADD4 c10, t8, c10 + + FMUL b1, c01, t1 + FMUL b2, c02, t2 + FMUL b1, c02, t3 + FMUL b2, c01, t4 + + FMUL b1, c09, t5 + FMUL b2, c10, t6 + FMUL b1, c10, t7 + FMUL b2, c09, t8 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + FADD4 t5, t6, c09 + FADD2 t7, t8, c10 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + LDF [AO + 6 * SIZE], b1 + LDF [AO + 7 * SIZE], b2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c09, t5 + FMUL a2, c10, t6 + FMUL a1, c10, t7 + FMUL a2, c09, t8 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + FADD4 t5, t6, c09 + FADD2 t7, t8, c10 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a3, c09, t3 + FMUL a3, c10, t4 + + FMUL a4, c02, t5 + FMUL a4, c01, t6 + FMUL a4, c10, t7 + FMUL a4, c09, t8 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FADD2 c03, t5, c03 + FADD4 c04, t6, c04 + FADD2 c11, t7, c11 + FADD4 c12, t8, c12 + + FMUL b1, c03, t1 + FMUL b2, c04, t2 + FMUL b1, c04, t3 + FMUL b2, c03, t4 + + FMUL b1, c11, t5 + FMUL b2, c12, t6 + FMUL b1, c12, t7 + FMUL b2, c11, t8 + + FADD4 t1, t2, c03 + FADD2 t3, t4, c04 + FADD4 t5, t6, c11 + FADD2 t7, t8, c12 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + LDF [BO + 6 * SIZE], b1 + LDF [BO + 7 * SIZE], b2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c03, t5 + FMUL a2, c04, t6 + FMUL a1, c04, t7 + FMUL a2, c03, t8 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + FADD4 t5, t6, c03 + FADD3 t7, t8, c04 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a3, c03, t3 + FMUL a3, c04, t4 + + FMUL a4, c02, t5 + FMUL a4, c01, t6 + FMUL a4, c04, t7 + FMUL a4, c03, t8 + + FSUB c09, t1, c09 + FSUB c10, t2, c10 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FADD3 c09, t5, c09 + FADD4 c10, t6, c10 + FADD3 c11, t7, c11 + FADD4 c12, t8, c12 + + FMUL b1, c09, t1 + FMUL b2, c10, t2 + FMUL b1, c10, t3 + FMUL b2, c09, t4 + + FMUL b1, c11, t5 + FMUL b2, c12, t6 + FMUL b1, c12, t7 + FMUL b2, c11, t8 + + FADD4 t1, t2, c09 + FADD3 t3, t4, c10 + FADD4 t5, t6, c11 + FADD3 t7, t8, c12 +#endif + +#ifdef RT + LDF [BO + 6 * SIZE], a1 + LDF [BO + 7 * SIZE], a2 + LDF [BO + 4 * SIZE], a3 + LDF [BO + 5 * SIZE], a4 + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + + FMUL a1, c09, t1 + FMUL a2, c10, t2 + FMUL a1, c10, t3 + FMUL a2, c09, t4 + + FMUL a1, c11, t5 + FMUL a2, c12, t6 + FMUL a1, c12, t7 + FMUL a2, c11, t8 + + FADD4 t1, t2, c09 + FADD3 t3, t4, c10 + FADD4 t5, t6, c11 + FADD3 t7, t8, c12 + + FMUL a3, c09, t1 + FMUL a3, c10, t2 + FMUL a3, c11, t3 + FMUL a3, c12, t4 + + FMUL a4, c10, t5 + FMUL a4, c09, t6 + FMUL a4, c12, t7 + FMUL a4, c11, t8 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + FADD3 c01, t5, c01 + FADD4 c02, t6, c02 + FADD3 c03, t7, c03 + FADD4 c04, t8, c04 + + FMUL b1, c01, t1 + FMUL b2, c02, t2 + FMUL b1, c02, t3 + FMUL b2, c01, t4 + + FMUL b1, c03, t5 + FMUL b2, c04, t6 + FMUL b1, c04, t7 + FMUL b2, c03, t8 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + FADD4 t5, t6, c03 + FADD3 t7, t8, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 + add C2, -4 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c09, [BO + 2 * SIZE] + STF c10, [BO + 3 * SIZE] + + STF c03, [BO + 4 * SIZE] + STF c04, [BO + 5 * SIZE] + STF c11, [BO + 6 * SIZE] + STF c12, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c09, [AO + 4 * SIZE] + STF c10, [AO + 5 * SIZE] + STF c11, [AO + 6 * SIZE] + STF c12, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + STF c09, [C2 + 0 * SIZE] + STF c10, [C2 + 1 * SIZE] + STF c11, [C2 + 2 * SIZE] + STF c12, [C2 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 4 * SIZE, C1 + add C2, 4 * SIZE, C2 +#endif + +#ifdef RT + sll K, 1 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL21 + FMOV FZERO, c01 + +.LL50: + and M, 1, I + FMOV FZERO, c02 + cmp I, 0 + FMOV FZERO, t1 + ble,pn %icc, .LL99 + FMOV FZERO, c04 + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 0 + ZBASE_SHIFT, TEMP1 + sll KK, 1 + ZBASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t2 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c06 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t3 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c08 + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t4 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c01 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c03 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c05 + + ble,pn %icc, .LL55 + FMOV FZERO, c07 + +.LL52: + FADD2 c02, t1, c02 + add AO, 8 * SIZE, AO + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FMUL a1, b1, t1 + add BO, 16 * SIZE, BO + + FADD4 c04, t2, c04 + add L, -1, L + FMUL a1, b2, t2 + + FADD2 c06, t3, c06 + cmp L, 0 + FMUL a1, b3, t3 + + FADD4 c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO - 4 * SIZE], a1 + + FADD1 c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 12 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 10 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD2 c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + FADD4 c04, t2, c04 + FMUL a3, b2, t2 + + FADD2 c06, t3, c06 + FMUL a3, b3, t3 + FADD4 c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD1 c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD2 c02, t1, c02 + FMUL a1, b1, t1 + LDF [AO - 1 * SIZE], a4 + FADD4 c04, t2, c04 + FMUL a1, b2, t2 + + FADD2 c06, t3, c06 + FMUL a1, b3, t3 + FADD4 c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD1 c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 4 * SIZE], b1 + + FADD3 c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 2 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD2 c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO + 1 * SIZE], a2 + FADD4 c04, t2, c04 + FMUL a3, b2, t2 + + FADD2 c06, t3, c06 + FMUL a3, b3, t3 + FADD4 c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD1 c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL52 + LDF [AO + 3 * SIZE], a4 + +.LL55: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + FADD2 c02, t1, c02 + add AO, 2 * SIZE, AO + FMUL a1, b1, t1 + add L, -1, L + + add BO, 4 * SIZE, BO + FADD4 c04, t2, c04 + cmp L, 0 + FMUL a1, b2, t2 + + FADD2 c06, t3, c06 + FMUL a1, b3, t3 + FADD4 c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD1 c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL56 + LDF [AO + 1 * SIZE], a2 + +.LL59: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, 0 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + + FADD2 c02, t1, c02 + FADD4 c04, t2, c04 + FADD2 c06, t3, c06 + FADD4 c08, t4, c08 + + FADD c01, c04, c01 + FADD c02, c03, c02 + FADD c05, c08, c05 + FADD c06, c07, c06 + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c05, c05 + FSUB a4, c06, c06 + +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c05, c05 + FSUB a4, c06, c06 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c05, t5 + FMUL a2, c06, t6 + FMUL a1, c06, t7 + FMUL a2, c05, t8 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + FADD4 t5, t6, c05 + FADD2 t7, t8, c06 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c05, t5 + FMUL a2, c06, t6 + FMUL a1, c06, t7 + FMUL a2, c05, t8 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + FADD4 t5, t6, c05 + FADD2 t7, t8, c06 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + LDF [BO + 6 * SIZE], b1 + LDF [BO + 7 * SIZE], b2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a4, c02, t3 + FMUL a4, c01, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FADD3 c05, t3, c05 + FADD4 c06, t4, c06 + + FMUL b1, c05, t1 + FMUL b2, c06, t2 + FMUL b1, c06, t3 + FMUL b2, c05, t4 + + FADD4 t1, t2, c05 + FADD3 t3, t4, c06 +#endif + +#ifdef RT + LDF [BO + 6 * SIZE], a1 + LDF [BO + 7 * SIZE], a2 + LDF [BO + 4 * SIZE], a3 + LDF [BO + 5 * SIZE], a4 + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + + FMUL a1, c05, t1 + FMUL a2, c06, t2 + FMUL a1, c06, t3 + FMUL a2, c05, t4 + + FADD4 t1, t2, c05 + FADD3 t3, t4, c06 + + FMUL a3, c05, t1 + FMUL a3, c06, t2 + FMUL a4, c06, t3 + FMUL a4, c05, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FADD3 c01, t3, c01 + FADD4 c02, t4, c02 + + FMUL b1, c01, t1 + FMUL b2, c02, t2 + FMUL b1, c02, t3 + FMUL b2, c01, t4 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c06, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c05, [AO + 2 * SIZE] + STF c06, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 +#endif + +#ifdef RT + sll K, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + +.LL99: +#ifdef LN + sll K, 1 + ZBASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 2, KK +#endif + +#ifdef RT + sub KK, 2, KK +#endif + + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/ztrsm_kernel_RT_1x4.S b/kernel/sparc/ztrsm_kernel_RT_1x4.S new file mode 100644 index 0000000..49d449a --- /dev/null +++ b/kernel/sparc/ztrsm_kernel_RT_1x4.S @@ -0,0 +1,2132 @@ +/*********************************************************************/ +/* Copyright 2005-2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define APREFETCHSIZE 24 +#define APREFETCH_CATEGORY 0 + +#define M %i0 +#define N %i1 +#define K %i2 +#define A %i5 +#define B %i3 +#define C %i4 + +#define LDC %o0 +#define AO %o1 +#define BO %o2 +#define I %o3 +#define J %o4 +#define L %o5 + +#define C1 %l0 +#define C2 %l1 +#define C3 %l2 +#define C4 %l3 + +#define OFFSET %l4 +#define KK %l5 +#define TEMP1 %l6 +#define TEMP2 %l7 +#define AORIG %o7 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define a1 %f32 +#define a2 %f34 +#define a3 %f36 +#define a4 %f38 +#define a5 %f40 + +#define b1 %f42 +#define b2 %f44 +#define b3 %f46 +#define b4 %f48 +#define b5 %f50 +#define b6 %f52 +#define b7 %f54 +#define b8 %f56 +#define b9 %f58 + +#define cc01 0 +#define cc02 2 +#define cc03 4 +#define cc04 6 +#define cc05 8 +#define cc06 10 +#define cc07 12 +#define cc08 14 +#define cc09 16 +#define cc10 18 +#define cc11 20 +#define cc12 22 +#define cc13 24 +#define cc14 26 +#define cc15 28 +#define cc16 30 + +#define aa1 1 +#define aa2 3 +#define aa3 5 +#define aa4 7 +#define aa5 9 + +#define bb1 11 +#define bb2 13 +#define bb3 15 +#define bb4 17 +#define bb5 19 +#define bb6 21 +#define bb7 23 +#define bb8 25 +#define bb9 27 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define a1 %f16 +#define a2 %f17 +#define a3 %f18 +#define a4 %f19 +#define a5 %f20 + +#define b1 %f21 +#define b2 %f22 +#define b3 %f23 +#define b4 %f24 +#define b5 %f25 +#define b6 %f26 +#define b7 %f27 +#define b8 %f28 +#define b9 %f29 + +#define cc01 0 +#define cc02 1 +#define cc03 2 +#define cc04 3 +#define cc05 4 +#define cc06 5 +#define cc07 6 +#define cc08 7 +#define cc09 8 +#define cc10 9 +#define cc11 10 +#define cc12 11 +#define cc13 12 +#define cc14 13 +#define cc15 14 +#define cc16 15 + +#define aa1 16 +#define aa2 17 +#define aa3 18 +#define aa4 19 +#define aa5 20 + +#define bb1 21 +#define bb2 22 +#define bb3 23 +#define bb4 24 +#define bb5 25 +#define bb6 26 +#define bb7 27 +#define bb8 28 +#define bb9 29 + +#endif + +#ifndef CONJ +#define FMADD1 FMADD +#define FMADD2 FMADD +#define FMADD3 FMADD +#define FMADD4 FNMSUB +#else +#if defined(LN) || defined(LT) +#define FMADD1 FMADD +#define FMADD2 FNMSUB +#define FMADD3 FMADD +#define FMADD4 FMADD +#endif +#if defined(RN) || defined(RT) +#define FMADD1 FMADD +#define FMADD2 FMADD +#define FMADD3 FNMSUB +#define FMADD4 FMADD +#endif +#endif + + .register %g2, #scratch + .register %g3, #scratch + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE + ld [%sp + STACK_START + 32], A + ld [%sp + STACK_START + 36], B + ld [%sp + STACK_START + 40], C + ld [%sp + STACK_START + 44], LDC + ld [%sp + STACK_START + 48], OFFSET +#else + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC + ld [%sp + STACK_START + 40], OFFSET +#endif +#else + ldx [%sp + STACK_START + 56], B + ldx [%sp + STACK_START + 64], C + ldx [%sp + STACK_START + 72], LDC + ldx [%sp + STACK_START + 80], OFFSET +#endif + + cmp M, 0 + ble,pn %icc, .LL999 + nop + + sll LDC, ZBASE_SHIFT, LDC + +#ifdef LN + smul M, K, TEMP1 + sll TEMP1, ZBASE_SHIFT, TEMP1 + add A, TEMP1, A + + sll M, ZBASE_SHIFT, TEMP1 + add C, TEMP1, C +#endif + +#ifdef RN + neg OFFSET, KK +#endif + +#ifdef RT + smul N, K, TEMP1 + sll TEMP1, ZBASE_SHIFT, TEMP1 + add B, TEMP1, B + + smul N, LDC, TEMP1 + add C, TEMP1, C + + sub N, OFFSET, KK +#endif + + and N, 1, J + cmp J, 0 + ble,pn %icc, .LL20 + nop + +#ifdef RT + sll K, ZBASE_SHIFT, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C +#else + sub C, LDC, C1 + sub C, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + mov M, I + .align 4 + +.LL32: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 0, TEMP1 + + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + FCLR (cc01) + LDF [BO + 3 * SIZE], b4 + FCLR (cc02) + + LDF [BO + 4 * SIZE], b5 + FCLR (cc03) + LDF [BO + 5 * SIZE], b6 + FCLR (cc04) + LDF [BO + 6 * SIZE], b7 + FCLR (cc05) + LDF [BO + 7 * SIZE], b8 + FCLR (cc06) + + prefetch [C1 + 2 * SIZE], 3 + FCLR (cc07) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL35 + FCLR (cc08) + .align 4 + +.LL33: + FMADD1 (aa1, bb1, cc01, cc01) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD2 (aa2, bb1, cc02, cc02) + LDF [BO + 8 * SIZE], b1 + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [AO + 4 * SIZE], a1 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [AO + 5 * SIZE], a2 + + FMADD1 (aa3, bb3, cc01, cc01) + LDF [BO + 9 * SIZE], b2 + FMADD2 (aa4, bb3, cc02, cc02) + LDF [BO + 10 * SIZE], b3 + + FMADD3 (aa3, bb4, cc03, cc03) + LDF [AO + 6 * SIZE], a3 + FMADD4 (aa4, bb4, cc04, cc04) + LDF [AO + 7 * SIZE], a4 + + FMADD1 (aa1, bb5, cc01, cc01) + LDF [BO + 11 * SIZE], b4 + FMADD2 (aa2, bb5, cc02, cc02) + LDF [BO + 12 * SIZE], b5 + + FMADD3 (aa1, bb6, cc03, cc03) + LDF [AO + 8 * SIZE], a1 + FMADD4 (aa2, bb6, cc04, cc04) + LDF [AO + 9 * SIZE], a2 + + FMADD1 (aa3, bb7, cc01, cc01) + LDF [BO + 13 * SIZE], b6 + + FMADD2 (aa4, bb7, cc02, cc02) + LDF [BO + 14 * SIZE], b7 + + FMADD3 (aa3, bb8, cc03, cc03) + LDF [AO + 10 * SIZE], a3 + FMADD4 (aa4, bb8, cc04, cc04) + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + add L, -1, L + add BO, 8 * SIZE, BO + cmp L, 0 + + bg,pt %icc, .LL33 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL35: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL38 + nop + .align 4 + +.LL37: + FMADD1 (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD2 (aa2, bb1, cc02, cc02) + LDF [BO + 2 * SIZE], b1 + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [AO + 2 * SIZE], a1 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [AO + 3 * SIZE], a2 + + add AO, 2 * SIZE, AO + cmp L, 0 + add BO, 2 * SIZE, BO + bg,pt %icc, .LL37 + LDF [BO + 1 * SIZE], b2 + .align 4 + +.LL38: + FADD c01, c04, c01 + FADD c02, c03, c02 + +#if defined(LN) || defined(RT) + sub KK, 1, TEMP1 + + sll TEMP1, ZBASE_SHIFT, TEMP1 + + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 +#endif + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 +#else + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 +#endif + + FMUL a1, c01, b1 + FMUL a2, c01, b2 + +#ifndef CONJ + FNMSUB (aa2, cc02, bb1, cc01) + FMADD (aa1, cc02, bb2, cc02) +#else + FMADD (aa2, cc02, bb1, cc01) + FMSUB (aa1, cc02, bb2, cc02) +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, ZBASE_SHIFT, TEMP1 + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL32 + nop + +#ifdef LN + sll K, ZBASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 1, KK +#endif + +#ifdef RT + sub KK, 1, KK +#endif + .align 4 + +.LL20: + and N, 2, J + cmp J, 0 + ble,pn %icc, .LL30 + nop + +#ifdef RT + sll K, ZBASE_SHIFT + 1, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C2 + add C2, LDC, C +#else + sub C, LDC, C2 + sub C2, LDC, C1 + sub C2, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + mov M, I + .align 4 + +.LL22: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 0, TEMP1 + sll KK, ZBASE_SHIFT + 1, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + FCLR (cc01) + + LDF [BO + 5 * SIZE], b6 + FCLR (cc02) + LDF [BO + 6 * SIZE], b7 + FCLR (cc03) + LDF [BO + 7 * SIZE], b8 + FCLR (cc04) + LDF [BO + 8 * SIZE], b9 + FCLR (cc05) + + prefetch [C1 + 2 * SIZE], 3 + FCLR (cc06) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc07) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL25 + FCLR (cc08) + .align 4 + +.LL23: + FMADD1 (aa1, bb1, cc01, cc01) + LDF [AO + 2 * SIZE], a3 + FMADD2 (aa2, bb1, cc02, cc02) + LDF [AO + 3 * SIZE], a4 + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [BO + 16 * SIZE], b1 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [BO + 9 * SIZE], b2 + + FMADD1 (aa1, bb3, cc05, cc05) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD2 (aa2, bb3, cc06, cc06) + add L, -1, L + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD1 (aa3, bb5, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + FMADD2 (aa4, bb5, cc02, cc02) + LDF [AO + 5 * SIZE], a2 + + FMADD3 (aa3, bb6, cc03, cc03) + LDF [BO + 12 * SIZE], b5 + FMADD4 (aa4, bb6, cc04, cc04) + LDF [BO + 13 * SIZE], b6 + + FMADD1 (aa3, bb7, cc05, cc05) + cmp L, 0 + FMADD2 (aa4, bb7, cc06, cc06) + add AO, 8 * SIZE, AO + + FMADD3 (aa3, bb8, cc07, cc07) + LDF [BO + 14 * SIZE], b7 + FMADD4 (aa4, bb8, cc08, cc08) + LDF [BO + 15 * SIZE], b8 + + FMADD1 (aa1, bb9, cc01, cc01) + LDF [AO - 2 * SIZE], a3 + FMADD2 (aa2, bb9, cc02, cc02) + LDF [AO - 1 * SIZE], a4 + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [BO + 24 * SIZE], b9 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [BO + 17 * SIZE], b2 + + FMADD1 (aa1, bb3, cc05, cc05) + add BO, 16 * SIZE, BO + FMADD2 (aa2, bb3, cc06, cc06) + nop + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD1 (aa3, bb5, cc01, cc01) + LDF [AO + 0 * SIZE], a1 + FMADD2 (aa4, bb5, cc02, cc02) + LDF [AO + 1 * SIZE], a2 + FMADD3 (aa3, bb6, cc03, cc03) + LDF [BO + 4 * SIZE], b5 + FMADD4 (aa4, bb6, cc04, cc04) + LDF [BO + 5 * SIZE], b6 + + FMADD1 (aa3, bb7, cc05, cc05) + nop + FMADD2 (aa4, bb7, cc06, cc06) + LDF [BO + 6 * SIZE], b7 + + FMADD3 (aa3, bb8, cc07, cc07) + FMADD4 (aa4, bb8, cc08, cc08) + bg,pt %icc, .LL23 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL25: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL28 + nop + .align 4 + +.LL27: + FMADD1 (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD2 (aa2, bb1, cc02, cc02) + LDF [BO + 4 * SIZE], b1 + + FMADD3 (aa1, bb2, cc03, cc03) + add AO, 2 * SIZE, AO + FMADD4 (aa2, bb2, cc04, cc04) + LDF [BO + 5 * SIZE], b2 + + FMADD1 (aa1, bb3, cc05, cc05) + cmp L, 0 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 6 * SIZE], b3 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [AO + 0 * SIZE], a1 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 7 * SIZE], b4 + bg,pt %icc, .LL27 + add BO, 4 * SIZE, BO + .align 4 + +.LL28: + FADD c01, c04, c01 + FADD c02, c03, c02 + FADD c05, c08, c05 + FADD c06, c07, c06 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, ZBASE_SHIFT + 0, TEMP2 + sll TEMP1, ZBASE_SHIFT + 1, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 +#endif + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c05, c05 + FSUB a4, c06, c06 + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, b1 + FMUL a2, c01, b2 + FMUL a1, c05, b3 + FMUL a2, c05, b4 + +#ifndef CONJ + FNMSUB (aa2, cc02, bb1, cc01) + FMADD (aa1, cc02, bb2, cc02) + FNMSUB (aa2, cc06, bb3, cc05) + FMADD (aa1, cc06, bb4, cc06) +#else + FMADD (aa2, cc02, bb1, cc01) + FMSUB (aa1, cc02, bb2, cc02) + FMADD (aa2, cc06, bb3, cc05) + FMSUB (aa1, cc06, bb4, cc06) +#endif +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + + FMUL b1, c01, a1 + FMUL b2, c01, a2 + +#ifndef CONJ + FNMSUB (bb2, cc02, aa1, cc01) + FMADD (bb1, cc02, aa2, cc02) +#else + FMADD (bb2, cc02, aa1, cc01) + FMSUB (bb1, cc02, aa2, cc02) +#endif + + FNMSUB (bb3, cc01, cc05, cc05) + FNMSUB (bb3, cc02, cc06, cc06) + +#ifndef CONJ + FMADD (bb4, cc02, cc05, cc05) + FNMSUB (bb4, cc01, cc06, cc06) +#else + FNMSUB (bb4, cc02, cc05, cc05) + FMADD (bb4, cc01, cc06, cc06) +#endif + + LDF [BO + 6 * SIZE], b1 + LDF [BO + 7 * SIZE], b2 + + FMUL b1, c05, a1 + FMUL b2, c05, a2 + +#ifndef CONJ + FNMSUB (bb2, cc06, aa1, cc05) + FMADD (bb1, cc06, aa2, cc06) +#else + FMADD (bb2, cc06, aa1, cc05) + FMSUB (bb1, cc06, aa2, cc06) +#endif +#endif + +#ifdef RT + LDF [BO + 6 * SIZE], b1 + LDF [BO + 7 * SIZE], b2 + LDF [BO + 4 * SIZE], b3 + LDF [BO + 5 * SIZE], b4 + + FMUL b1, c05, a1 + FMUL b2, c05, a2 + +#ifndef CONJ + FNMSUB (bb2, cc06, aa1, cc05) + FMADD (bb1, cc06, aa2, cc06) +#else + FMADD (bb2, cc06, aa1, cc05) + FMSUB (bb1, cc06, aa2, cc06) +#endif + + FNMSUB (bb3, cc05, cc01, cc01) + FNMSUB (bb3, cc06, cc02, cc02) + +#ifndef CONJ + FMADD (bb4, cc06, cc01, cc01) + FNMSUB (bb4, cc05, cc02, cc02) +#else + FNMSUB (bb4, cc06, cc01, cc01) + FMADD (bb4, cc05, cc02, cc02) +#endif + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + + FMUL b1, c01, a1 + FMUL b2, c01, a2 + +#ifndef CONJ + FNMSUB (bb2, cc02, aa1, cc01) + FMADD (bb1, cc02, aa2, cc02) +#else + FMADD (bb2, cc02, aa1, cc01) + FMSUB (bb1, cc02, aa2, cc02) +#endif +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c06, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c05, [AO + 2 * SIZE] + STF c06, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, ZBASE_SHIFT + 0, TEMP2 + sll TEMP1, ZBASE_SHIFT + 1, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL22 + nop + +#ifdef LN + sll K, ZBASE_SHIFT + 1, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 2, KK +#endif + +#ifdef RT + sub KK, 2, KK +#endif + .align 4 + +.LL30: + sra N, 2, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + .align 4 + +.LL11: +#ifdef RT + sll K, ZBASE_SHIFT + 2, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C2 + add C2, LDC, C3 + add C3, LDC, C4 + add C4, LDC, C +#else + sub C, LDC, C4 + sub C4, LDC, C3 + sub C3, LDC, C2 + sub C2, LDC, C1 + sub C2, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + mov M, I + .align 4 + +.LL12: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 0, TEMP1 + sll KK, ZBASE_SHIFT + 2, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + FCLR (cc01) + LDF [AO + 1 * SIZE], a2 + FCLR (cc05) + LDF [AO + 8 * SIZE], a5 + FCLR (cc09) + LDF [BO + 0 * SIZE], b1 + FCLR (cc13) + + LDF [BO + 1 * SIZE], b2 + FCLR (cc02) + LDF [BO + 2 * SIZE], b3 + FCLR (cc06) + LDF [BO + 3 * SIZE], b4 + FCLR (cc10) + LDF [BO + 4 * SIZE], b5 + FCLR (cc14) + + LDF [BO + 5 * SIZE], b6 + FCLR (cc03) + LDF [BO + 6 * SIZE], b7 + FCLR (cc07) + LDF [BO + 7 * SIZE], b8 + FCLR (cc11) + LDF [BO + 8 * SIZE], b9 + FCLR (cc15) + + prefetch [C1 + 1 * SIZE], 3 + FCLR (cc04) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc08) + prefetch [C3 + 1 * SIZE], 3 + FCLR (cc12) + prefetch [C4 + 2 * SIZE], 3 + FCLR (cc16) + +#if defined(LT) || defined(RN) + sra KK, 3, L +#else + sub K, KK, L + sra L, 3, L +#endif + cmp L, 0 + ble,pn %icc, .LL15 + nop + .align 4 + +.LL13: + FMADD1 (aa1, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa1, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa1, bb3, cc05, cc05) + LDF [BO + 16 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 9 * SIZE], b2 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + LDF [AO + 2 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 3 * SIZE], a4 + + FMADD3 (aa1, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa1, bb7, cc13, cc13) + LDF [BO + 12 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 13 * SIZE], b6 + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [BO + 14 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 15 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 24 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 17 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 18 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 19 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 4 * SIZE], a1 + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 5 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + add L, -1, L + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 20 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 21 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 22 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 23 * SIZE], b8 + + FMADD1 (aa1, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa1, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa1, bb3, cc05, cc05) + LDF [BO + 32 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 25 * SIZE], b2 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 26 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 27 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + LDF [AO + 6 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 7 * SIZE], a4 + + FMADD3 (aa1, bb6, cc11, cc11) + nop + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa1, bb7, cc13, cc13) + LDF [BO + 28 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 29 * SIZE], b6 + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [BO + 30 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 31 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 40 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 33 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 34 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 35 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 16 * SIZE], a1 /****/ + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 9 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + nop + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 36 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 37 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 38 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 39 * SIZE], b8 + + FMADD1 (aa5, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa5, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa5, bb3, cc05, cc05) + LDF [BO + 48 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 41 * SIZE], b2 + + FMADD3 (aa5, bb4, cc07, cc07) + LDF [BO + 42 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 43 * SIZE], b4 + + FMADD1 (aa5, bb5, cc09, cc09) + LDF [AO + 10 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 11 * SIZE], a4 + + FMADD3 (aa5, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa5, bb7, cc13, cc13) + LDF [BO + 44 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 45 * SIZE], b6 + + FMADD3 (aa5, bb8, cc15, cc15) + LDF [BO + 46 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 47 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 56 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 49 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 50 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 51 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 12 * SIZE], a5 + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 13 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + cmp L, 0 + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 52 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 53 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 54 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 55 * SIZE], b8 + + FMADD1 (aa5, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa5, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa5, bb3, cc05, cc05) + LDF [BO + 64 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 57 * SIZE], b2 + + FMADD3 (aa5, bb4, cc07, cc07) + LDF [BO + 58 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 59 * SIZE], b4 + + FMADD1 (aa5, bb5, cc09, cc09) + LDF [AO + 14 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 15 * SIZE], a4 + + FMADD3 (aa5, bb6, cc11, cc11) + add BO, 64 * SIZE, BO + FMADD4 (aa2, bb6, cc12, cc12) + add AO, 16 * SIZE, AO + + FMADD1 (aa5, bb7, cc13, cc13) + LDF [BO - 4 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO - 3 * SIZE], b6 + + FMADD3 (aa5, bb8, cc15, cc15) + LDF [BO - 2 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO - 1 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 8 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 1 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 8 * SIZE], a5 /****/ + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 1 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + FMADD4 (aa4, bb6, cc12, cc12) + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 4 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 5 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 6 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + ble,pn %icc, .LL15 + LDF [BO + 7 * SIZE], b8 + + FMADD1 (aa1, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa1, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa1, bb3, cc05, cc05) + LDF [BO + 16 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 9 * SIZE], b2 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + LDF [AO + 2 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 3 * SIZE], a4 + + FMADD3 (aa1, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa1, bb7, cc13, cc13) + LDF [BO + 12 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 13 * SIZE], b6 + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [BO + 14 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 15 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 24 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 17 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 18 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 19 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 4 * SIZE], a1 + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 5 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + add L, -1, L + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 20 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 21 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 22 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 23 * SIZE], b8 + + FMADD1 (aa1, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa1, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa1, bb3, cc05, cc05) + LDF [BO + 32 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 25 * SIZE], b2 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 26 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 27 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + LDF [AO + 6 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 7 * SIZE], a4 + + FMADD3 (aa1, bb6, cc11, cc11) + nop + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa1, bb7, cc13, cc13) + LDF [BO + 28 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 29 * SIZE], b6 + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [BO + 30 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 31 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 40 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 33 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 34 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 35 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 16 * SIZE], a1 /****/ + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 9 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + nop + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 36 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 37 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 38 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 39 * SIZE], b8 + + FMADD1 (aa5, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa5, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa5, bb3, cc05, cc05) + LDF [BO + 48 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 41 * SIZE], b2 + + FMADD3 (aa5, bb4, cc07, cc07) + LDF [BO + 42 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 43 * SIZE], b4 + + FMADD1 (aa5, bb5, cc09, cc09) + LDF [AO + 10 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 11 * SIZE], a4 + + FMADD3 (aa5, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa5, bb7, cc13, cc13) + LDF [BO + 44 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 45 * SIZE], b6 + + FMADD3 (aa5, bb8, cc15, cc15) + LDF [BO + 46 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 47 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 56 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 49 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 50 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 51 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 12 * SIZE], a5 + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 13 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + cmp L, 0 + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 52 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 53 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 54 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 55 * SIZE], b8 + + FMADD1 (aa5, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa5, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa5, bb3, cc05, cc05) + LDF [BO + 64 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 57 * SIZE], b2 + + FMADD3 (aa5, bb4, cc07, cc07) + LDF [BO + 58 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 59 * SIZE], b4 + + FMADD1 (aa5, bb5, cc09, cc09) + LDF [AO + 14 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 15 * SIZE], a4 + + FMADD3 (aa5, bb6, cc11, cc11) + add BO, 64 * SIZE, BO + FMADD4 (aa2, bb6, cc12, cc12) + add AO, 16 * SIZE, AO + + FMADD1 (aa5, bb7, cc13, cc13) + LDF [BO - 4 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO - 3 * SIZE], b6 + + FMADD3 (aa5, bb8, cc15, cc15) + LDF [BO - 2 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO - 1 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 8 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 1 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 8 * SIZE], a5 /****/ + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 1 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + FMADD4 (aa4, bb6, cc12, cc12) + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 4 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 5 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 6 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + bg,pt %icc, .LL13 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL15: +#if defined(LT) || defined(RN) + and KK, 7, L +#else + sub K, KK, L + and L, 7, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL18 + nop + .align 4 + +.LL17: + FMADD1 (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD2 (aa2, bb1, cc02, cc02) + nop + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [BO + 8 * SIZE], b1 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [BO + 9 * SIZE], b2 + + FMADD1 (aa1, bb3, cc05, cc05) + cmp L, 0 + FMADD2 (aa2, bb3, cc06, cc06) + nop + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + nop + FMADD2 (aa2, bb5, cc10, cc10) + nop + + FMADD3 (aa1, bb6, cc11, cc11) + LDF [BO + 12 * SIZE], b5 + FMADD4 (aa2, bb6, cc12, cc12) + LDF [BO + 13 * SIZE], b6 + + FMADD1 (aa1, bb7, cc13, cc13) + add AO, 2 * SIZE, AO + FMADD2 (aa2, bb7, cc14, cc14) + add BO, 8 * SIZE, BO + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [AO + 0 * SIZE], a1 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 6 * SIZE], b7 + bg,pt %icc, .LL17 + LDF [BO + 7 * SIZE], b8 + nop + .align 4 + +.LL18: + FADD c01, c04, c01 + FADD c02, c03, c02 + FADD c05, c08, c05 + FADD c06, c07, c06 + + FADD c09, c12, c09 + FADD c10, c11, c10 + FADD c13, c16, c13 + FADD c14, c15, c14 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 4, TEMP1 +#endif + sll TEMP1, ZBASE_SHIFT + 0, TEMP2 + sll TEMP1, ZBASE_SHIFT + 2, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 +#endif + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c05, c05 + FSUB a4, c06, c06 + + FSUB b1, c09, c09 + FSUB b2, c10, c10 + FSUB b3, c13, c13 + FSUB b4, c14, c14 + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, b1 + FMUL a2, c01, b2 + FMUL a1, c05, b3 + FMUL a2, c05, b4 + FMUL a1, c09, b5 + FMUL a2, c09, b6 + FMUL a1, c13, b7 + FMUL a2, c13, b8 + +#ifndef CONJ + FNMSUB (aa2, cc02, bb1, cc01) + FMADD (aa1, cc02, bb2, cc02) + FNMSUB (aa2, cc06, bb3, cc05) + FMADD (aa1, cc06, bb4, cc06) + FNMSUB (aa2, cc10, bb5, cc09) + FMADD (aa1, cc10, bb6, cc10) + FNMSUB (aa2, cc14, bb7, cc13) + FMADD (aa1, cc14, bb8, cc14) +#else + FMADD (aa2, cc02, bb1, cc01) + FMSUB (aa1, cc02, bb2, cc02) + FMADD (aa2, cc06, bb3, cc05) + FMSUB (aa1, cc06, bb4, cc06) + FMADD (aa2, cc10, bb5, cc09) + FMSUB (aa1, cc10, bb6, cc10) + FMADD (aa2, cc14, bb7, cc13) + FMSUB (aa1, cc14, bb8, cc14) +#endif +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + LDF [BO + 5 * SIZE], b6 + LDF [BO + 6 * SIZE], b7 + LDF [BO + 7 * SIZE], b8 + + FMUL b1, c01, a1 + FMUL b2, c01, a2 + +#ifndef CONJ + FNMSUB (bb2, cc02, aa1, cc01) + FMADD (bb1, cc02, aa2, cc02) +#else + FMADD (bb2, cc02, aa1, cc01) + FMSUB (bb1, cc02, aa2, cc02) +#endif + + FNMSUB (bb3, cc01, cc05, cc05) + FNMSUB (bb3, cc02, cc06, cc06) + FNMSUB (bb5, cc01, cc09, cc09) + FNMSUB (bb5, cc02, cc10, cc10) + FNMSUB (bb7, cc01, cc13, cc13) + FNMSUB (bb7, cc02, cc14, cc14) + +#ifndef CONJ + FMADD (bb4, cc02, cc05, cc05) + FNMSUB (bb4, cc01, cc06, cc06) + FMADD (bb6, cc02, cc09, cc09) + FNMSUB (bb6, cc01, cc10, cc10) + FMADD (bb8, cc02, cc13, cc13) + FNMSUB (bb8, cc01, cc14, cc14) +#else + FNMSUB (bb4, cc02, cc05, cc05) + FMADD (bb4, cc01, cc06, cc06) + FNMSUB (bb6, cc02, cc09, cc09) + FMADD (bb6, cc01, cc10, cc10) + FNMSUB (bb8, cc02, cc13, cc13) + FMADD (bb8, cc01, cc14, cc14) +#endif + + LDF [BO + 10 * SIZE], b1 + LDF [BO + 11 * SIZE], b2 + LDF [BO + 12 * SIZE], b3 + LDF [BO + 13 * SIZE], b4 + LDF [BO + 14 * SIZE], b5 + LDF [BO + 15 * SIZE], b6 + + FMUL b1, c05, a1 + FMUL b2, c05, a2 + +#ifndef CONJ + FNMSUB (bb2, cc06, aa1, cc05) + FMADD (bb1, cc06, aa2, cc06) +#else + FMADD (bb2, cc06, aa1, cc05) + FMSUB (bb1, cc06, aa2, cc06) +#endif + + FNMSUB (bb3, cc05, cc09, cc09) + FNMSUB (bb3, cc06, cc10, cc10) + FNMSUB (bb5, cc05, cc13, cc13) + FNMSUB (bb5, cc06, cc14, cc14) + +#ifndef CONJ + FMADD (bb4, cc06, cc09, cc09) + FNMSUB (bb4, cc05, cc10, cc10) + FMADD (bb6, cc06, cc13, cc13) + FNMSUB (bb6, cc05, cc14, cc14) +#else + FNMSUB (bb4, cc06, cc09, cc09) + FMADD (bb4, cc05, cc10, cc10) + FNMSUB (bb6, cc06, cc13, cc13) + FMADD (bb6, cc05, cc14, cc14) +#endif + + LDF [BO + 20 * SIZE], b1 + LDF [BO + 21 * SIZE], b2 + LDF [BO + 22 * SIZE], b3 + LDF [BO + 23 * SIZE], b4 + + FMUL b1, c09, a1 + FMUL b2, c09, a2 + +#ifndef CONJ + FNMSUB (bb2, cc10, aa1, cc09) + FMADD (bb1, cc10, aa2, cc10) +#else + FMADD (bb2, cc10, aa1, cc09) + FMSUB (bb1, cc10, aa2, cc10) +#endif + + FNMSUB (bb3, cc09, cc13, cc13) + FNMSUB (bb3, cc10, cc14, cc14) + +#ifndef CONJ + FMADD (bb4, cc10, cc13, cc13) + FNMSUB (bb4, cc09, cc14, cc14) +#else + FNMSUB (bb4, cc10, cc13, cc13) + FMADD (bb4, cc09, cc14, cc14) +#endif + + LDF [BO + 30 * SIZE], b1 + LDF [BO + 31 * SIZE], b2 + + FMUL b1, c13, a1 + FMUL b2, c13, a2 + +#ifndef CONJ + FNMSUB (bb2, cc14, aa1, cc13) + FMADD (bb1, cc14, aa2, cc14) +#else + FMADD (bb2, cc14, aa1, cc13) + FMSUB (bb1, cc14, aa2, cc14) +#endif +#endif + +#ifdef RT + LDF [BO + 30 * SIZE], b1 + LDF [BO + 31 * SIZE], b2 + LDF [BO + 28 * SIZE], b3 + LDF [BO + 29 * SIZE], b4 + LDF [BO + 26 * SIZE], b5 + LDF [BO + 27 * SIZE], b6 + LDF [BO + 24 * SIZE], b7 + LDF [BO + 25 * SIZE], b8 + + FMUL b1, c13, a1 + FMUL b2, c13, a2 + +#ifndef CONJ + FNMSUB (bb2, cc14, aa1, cc13) + FMADD (bb1, cc14, aa2, cc14) +#else + FMADD (bb2, cc14, aa1, cc13) + FMSUB (bb1, cc14, aa2, cc14) +#endif + + FNMSUB (bb3, cc13, cc09, cc09) + FNMSUB (bb3, cc14, cc10, cc10) + FNMSUB (bb5, cc13, cc05, cc05) + FNMSUB (bb5, cc14, cc06, cc06) + FNMSUB (bb7, cc13, cc01, cc01) + FNMSUB (bb7, cc14, cc02, cc02) + +#ifndef CONJ + FMADD (bb4, cc14, cc09, cc09) + FNMSUB (bb4, cc13, cc10, cc10) + FMADD (bb6, cc14, cc05, cc05) + FNMSUB (bb6, cc13, cc06, cc06) + FMADD (bb8, cc14, cc01, cc01) + FNMSUB (bb8, cc13, cc02, cc02) +#else + FNMSUB (bb4, cc14, cc09, cc09) + FMADD (bb4, cc13, cc10, cc10) + FNMSUB (bb6, cc14, cc05, cc05) + FMADD (bb6, cc13, cc06, cc06) + FNMSUB (bb8, cc14, cc01, cc01) + FMADD (bb8, cc13, cc02, cc02) +#endif + + LDF [BO + 20 * SIZE], b1 + LDF [BO + 21 * SIZE], b2 + LDF [BO + 18 * SIZE], b3 + LDF [BO + 19 * SIZE], b4 + LDF [BO + 16 * SIZE], b5 + LDF [BO + 17 * SIZE], b6 + + FMUL b1, c09, a1 + FMUL b2, c09, a2 + +#ifndef CONJ + FNMSUB (bb2, cc10, aa1, cc09) + FMADD (bb1, cc10, aa2, cc10) +#else + FMADD (bb2, cc10, aa1, cc09) + FMSUB (bb1, cc10, aa2, cc10) +#endif + + FNMSUB (bb3, cc09, cc05, cc05) + FNMSUB (bb3, cc10, cc06, cc06) + FNMSUB (bb5, cc09, cc01, cc01) + FNMSUB (bb5, cc10, cc02, cc02) + +#ifndef CONJ + FMADD (bb4, cc10, cc05, cc05) + FNMSUB (bb4, cc09, cc06, cc06) + FMADD (bb6, cc10, cc01, cc01) + FNMSUB (bb6, cc09, cc02, cc02) +#else + FNMSUB (bb4, cc10, cc05, cc05) + FMADD (bb4, cc09, cc06, cc06) + FNMSUB (bb6, cc10, cc01, cc01) + FMADD (bb6, cc09, cc02, cc02) +#endif + + LDF [BO + 10 * SIZE], b1 + LDF [BO + 11 * SIZE], b2 + LDF [BO + 8 * SIZE], b3 + LDF [BO + 9 * SIZE], b4 + + FMUL b1, c05, a1 + FMUL b2, c05, a2 + +#ifndef CONJ + FNMSUB (bb2, cc06, aa1, cc05) + FMADD (bb1, cc06, aa2, cc06) +#else + FMADD (bb2, cc06, aa1, cc05) + FMSUB (bb1, cc06, aa2, cc06) +#endif + + FNMSUB (bb3, cc05, cc01, cc01) + FNMSUB (bb3, cc06, cc02, cc02) + +#ifndef CONJ + FMADD (bb4, cc06, cc01, cc01) + FNMSUB (bb4, cc05, cc02, cc02) +#else + FNMSUB (bb4, cc06, cc01, cc01) + FMADD (bb4, cc05, cc02, cc02) +#endif + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + + FMUL b1, c01, a1 + FMUL b2, c01, a2 + +#ifndef CONJ + FNMSUB (bb2, cc02, aa1, cc01) + FMADD (bb1, cc02, aa2, cc02) +#else + FMADD (bb2, cc02, aa1, cc01) + FMSUB (bb1, cc02, aa2, cc02) +#endif +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 + add C3, -2 * SIZE, C3 + add C4, -2 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c06, [BO + 3 * SIZE] + + STF c09, [BO + 4 * SIZE] + STF c10, [BO + 5 * SIZE] + STF c13, [BO + 6 * SIZE] + STF c14, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c05, [AO + 2 * SIZE] + STF c06, [AO + 3 * SIZE] + + STF c09, [AO + 4 * SIZE] + STF c10, [AO + 5 * SIZE] + STF c13, [AO + 6 * SIZE] + STF c14, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + + STF c09, [C3 + 0 * SIZE] + STF c10, [C3 + 1 * SIZE] + STF c13, [C4 + 0 * SIZE] + STF c14, [C4 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 + add C3, 2 * SIZE, C3 + add C4, 2 * SIZE, C4 +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, ZBASE_SHIFT + 0, TEMP2 + sll TEMP1, ZBASE_SHIFT + 2, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL12 + nop + +#ifdef LN + sll K, ZBASE_SHIFT + 2, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 4, KK +#endif + +#ifdef RT + sub KK, 4, KK +#endif + + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + .align 4 + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/x86/._KERNEL b/kernel/x86/._KERNEL new file mode 100644 index 0000000..aa6bb35 Binary files /dev/null and b/kernel/x86/._KERNEL differ diff --git a/kernel/x86/._KERNEL.ATHLON b/kernel/x86/._KERNEL.ATHLON new file mode 100644 index 0000000..e9c3894 Binary files /dev/null and b/kernel/x86/._KERNEL.ATHLON differ diff --git a/kernel/x86/._KERNEL.ATOM b/kernel/x86/._KERNEL.ATOM new file mode 100644 index 0000000..dd140e6 Binary files /dev/null and b/kernel/x86/._KERNEL.ATOM differ diff --git a/kernel/x86/._KERNEL.BANIAS b/kernel/x86/._KERNEL.BANIAS new file mode 100644 index 0000000..7b00506 Binary files /dev/null and b/kernel/x86/._KERNEL.BANIAS differ diff --git a/kernel/x86/._KERNEL.BARCELONA b/kernel/x86/._KERNEL.BARCELONA new file mode 100644 index 0000000..22862ed Binary files /dev/null and b/kernel/x86/._KERNEL.BARCELONA differ diff --git a/kernel/x86/._KERNEL.COPPERMINE b/kernel/x86/._KERNEL.COPPERMINE new file mode 100644 index 0000000..dfe16fe Binary files /dev/null and b/kernel/x86/._KERNEL.COPPERMINE differ diff --git a/kernel/x86/._KERNEL.CORE2 b/kernel/x86/._KERNEL.CORE2 new file mode 100644 index 0000000..afc03e3 Binary files /dev/null and b/kernel/x86/._KERNEL.CORE2 differ diff --git a/kernel/x86/._KERNEL.DUNNINGTON b/kernel/x86/._KERNEL.DUNNINGTON new file mode 100644 index 0000000..e03d05c Binary files /dev/null and b/kernel/x86/._KERNEL.DUNNINGTON differ diff --git a/kernel/x86/._KERNEL.KATMAI b/kernel/x86/._KERNEL.KATMAI new file mode 100644 index 0000000..1e33c27 Binary files /dev/null and b/kernel/x86/._KERNEL.KATMAI differ diff --git a/kernel/x86/._KERNEL.NANO b/kernel/x86/._KERNEL.NANO new file mode 100644 index 0000000..9924fc0 Binary files /dev/null and b/kernel/x86/._KERNEL.NANO differ diff --git a/kernel/x86/._KERNEL.NEHALEM b/kernel/x86/._KERNEL.NEHALEM new file mode 100644 index 0000000..2a99446 Binary files /dev/null and b/kernel/x86/._KERNEL.NEHALEM differ diff --git a/kernel/x86/._KERNEL.NORTHWOOD b/kernel/x86/._KERNEL.NORTHWOOD new file mode 100644 index 0000000..c88ac91 Binary files /dev/null and b/kernel/x86/._KERNEL.NORTHWOOD differ diff --git a/kernel/x86/._KERNEL.OPTERON b/kernel/x86/._KERNEL.OPTERON new file mode 100644 index 0000000..0a038be Binary files /dev/null and b/kernel/x86/._KERNEL.OPTERON differ diff --git a/kernel/x86/._KERNEL.OPTERON_SSE3 b/kernel/x86/._KERNEL.OPTERON_SSE3 new file mode 100644 index 0000000..fe42578 Binary files /dev/null and b/kernel/x86/._KERNEL.OPTERON_SSE3 differ diff --git a/kernel/x86/._KERNEL.P5 b/kernel/x86/._KERNEL.P5 new file mode 100644 index 0000000..03fef9e Binary files /dev/null and b/kernel/x86/._KERNEL.P5 differ diff --git a/kernel/x86/._KERNEL.P6 b/kernel/x86/._KERNEL.P6 new file mode 100644 index 0000000..ce4d4a7 Binary files /dev/null and b/kernel/x86/._KERNEL.P6 differ diff --git a/kernel/x86/._KERNEL.PENRYN b/kernel/x86/._KERNEL.PENRYN new file mode 100644 index 0000000..e48be25 Binary files /dev/null and b/kernel/x86/._KERNEL.PENRYN differ diff --git a/kernel/x86/._KERNEL.PRESCOTT b/kernel/x86/._KERNEL.PRESCOTT new file mode 100644 index 0000000..639be93 Binary files /dev/null and b/kernel/x86/._KERNEL.PRESCOTT differ diff --git a/kernel/x86/._KERNEL.VIAC3 b/kernel/x86/._KERNEL.VIAC3 new file mode 100644 index 0000000..5aaa104 Binary files /dev/null and b/kernel/x86/._KERNEL.VIAC3 differ diff --git a/kernel/x86/._KERNEL.YONAH b/kernel/x86/._KERNEL.YONAH new file mode 100644 index 0000000..bbab52d Binary files /dev/null and b/kernel/x86/._KERNEL.YONAH differ diff --git a/kernel/x86/._Makefile b/kernel/x86/._Makefile new file mode 100644 index 0000000..1e4640c Binary files /dev/null and b/kernel/x86/._Makefile differ diff --git a/kernel/x86/._amax.S b/kernel/x86/._amax.S new file mode 100644 index 0000000..353c3be Binary files /dev/null and b/kernel/x86/._amax.S differ diff --git a/kernel/x86/._amax_sse.S b/kernel/x86/._amax_sse.S new file mode 100644 index 0000000..3595da2 Binary files /dev/null and b/kernel/x86/._amax_sse.S differ diff --git a/kernel/x86/._amax_sse2.S b/kernel/x86/._amax_sse2.S new file mode 100644 index 0000000..ce05367 Binary files /dev/null and b/kernel/x86/._amax_sse2.S differ diff --git a/kernel/x86/._asum.S b/kernel/x86/._asum.S new file mode 100644 index 0000000..3aae673 Binary files /dev/null and b/kernel/x86/._asum.S differ diff --git a/kernel/x86/._asum_sse.S b/kernel/x86/._asum_sse.S new file mode 100644 index 0000000..f7a906d Binary files /dev/null and b/kernel/x86/._asum_sse.S differ diff --git a/kernel/x86/._asum_sse2.S b/kernel/x86/._asum_sse2.S new file mode 100644 index 0000000..1f98607 Binary files /dev/null and b/kernel/x86/._asum_sse2.S differ diff --git a/kernel/x86/._axpy.S b/kernel/x86/._axpy.S new file mode 100644 index 0000000..dd969df Binary files /dev/null and b/kernel/x86/._axpy.S differ diff --git a/kernel/x86/._axpy_sse.S b/kernel/x86/._axpy_sse.S new file mode 100644 index 0000000..a97c50c Binary files /dev/null and b/kernel/x86/._axpy_sse.S differ diff --git a/kernel/x86/._axpy_sse2.S b/kernel/x86/._axpy_sse2.S new file mode 100644 index 0000000..1bd26cc Binary files /dev/null and b/kernel/x86/._axpy_sse2.S differ diff --git a/kernel/x86/._axpy_sse2_opteron.S b/kernel/x86/._axpy_sse2_opteron.S new file mode 100644 index 0000000..5df6246 Binary files /dev/null and b/kernel/x86/._axpy_sse2_opteron.S differ diff --git a/kernel/x86/._cabs.S b/kernel/x86/._cabs.S new file mode 100644 index 0000000..19a4e68 Binary files /dev/null and b/kernel/x86/._cabs.S differ diff --git a/kernel/x86/._copy.S b/kernel/x86/._copy.S new file mode 100644 index 0000000..21355a4 Binary files /dev/null and b/kernel/x86/._copy.S differ diff --git a/kernel/x86/._copy_sse.S b/kernel/x86/._copy_sse.S new file mode 100644 index 0000000..aeb10f0 Binary files /dev/null and b/kernel/x86/._copy_sse.S differ diff --git a/kernel/x86/._copy_sse2.S b/kernel/x86/._copy_sse2.S new file mode 100644 index 0000000..7943ab5 Binary files /dev/null and b/kernel/x86/._copy_sse2.S differ diff --git a/kernel/x86/._cpuid.S b/kernel/x86/._cpuid.S new file mode 100644 index 0000000..17ca8cc Binary files /dev/null and b/kernel/x86/._cpuid.S differ diff --git a/kernel/x86/._dot.S b/kernel/x86/._dot.S new file mode 100644 index 0000000..d1f5249 Binary files /dev/null and b/kernel/x86/._dot.S differ diff --git a/kernel/x86/._dot_amd.S b/kernel/x86/._dot_amd.S new file mode 100644 index 0000000..4201f77 Binary files /dev/null and b/kernel/x86/._dot_amd.S differ diff --git a/kernel/x86/._dot_sse.S b/kernel/x86/._dot_sse.S new file mode 100644 index 0000000..f03e3f6 Binary files /dev/null and b/kernel/x86/._dot_sse.S differ diff --git a/kernel/x86/._dot_sse2.S b/kernel/x86/._dot_sse2.S new file mode 100644 index 0000000..e7bb966 Binary files /dev/null and b/kernel/x86/._dot_sse2.S differ diff --git a/kernel/x86/._dot_sse2_opteron.S b/kernel/x86/._dot_sse2_opteron.S new file mode 100644 index 0000000..c7cb52f Binary files /dev/null and b/kernel/x86/._dot_sse2_opteron.S differ diff --git a/kernel/x86/._dot_sse_opteron.S b/kernel/x86/._dot_sse_opteron.S new file mode 100644 index 0000000..4fc749f Binary files /dev/null and b/kernel/x86/._dot_sse_opteron.S differ diff --git a/kernel/x86/._gemm_beta.S b/kernel/x86/._gemm_beta.S new file mode 100644 index 0000000..1d543cf Binary files /dev/null and b/kernel/x86/._gemm_beta.S differ diff --git a/kernel/x86/._gemm_kernel_1x4.S b/kernel/x86/._gemm_kernel_1x4.S new file mode 100644 index 0000000..0fa19d0 Binary files /dev/null and b/kernel/x86/._gemm_kernel_1x4.S differ diff --git a/kernel/x86/._gemm_kernel_2x2.S b/kernel/x86/._gemm_kernel_2x2.S new file mode 100644 index 0000000..bde97f6 Binary files /dev/null and b/kernel/x86/._gemm_kernel_2x2.S differ diff --git a/kernel/x86/._gemm_kernel_2x2_atom.S b/kernel/x86/._gemm_kernel_2x2_atom.S new file mode 100644 index 0000000..8a8098c Binary files /dev/null and b/kernel/x86/._gemm_kernel_2x2_atom.S differ diff --git a/kernel/x86/._gemm_kernel_2x4_3dnow.S b/kernel/x86/._gemm_kernel_2x4_3dnow.S new file mode 100644 index 0000000..8df83f3 Binary files /dev/null and b/kernel/x86/._gemm_kernel_2x4_3dnow.S differ diff --git a/kernel/x86/._gemm_kernel_2x4_barcelona.S b/kernel/x86/._gemm_kernel_2x4_barcelona.S new file mode 100644 index 0000000..25a0f31 Binary files /dev/null and b/kernel/x86/._gemm_kernel_2x4_barcelona.S differ diff --git a/kernel/x86/._gemm_kernel_2x4_core2.S b/kernel/x86/._gemm_kernel_2x4_core2.S new file mode 100644 index 0000000..532d452 Binary files /dev/null and b/kernel/x86/._gemm_kernel_2x4_core2.S differ diff --git a/kernel/x86/._gemm_kernel_2x4_penryn.S b/kernel/x86/._gemm_kernel_2x4_penryn.S new file mode 100644 index 0000000..832b7a5 Binary files /dev/null and b/kernel/x86/._gemm_kernel_2x4_penryn.S differ diff --git a/kernel/x86/._gemm_kernel_2x4_sse2.S b/kernel/x86/._gemm_kernel_2x4_sse2.S new file mode 100644 index 0000000..0c57b69 Binary files /dev/null and b/kernel/x86/._gemm_kernel_2x4_sse2.S differ diff --git a/kernel/x86/._gemm_kernel_2x4_sse3.S b/kernel/x86/._gemm_kernel_2x4_sse3.S new file mode 100644 index 0000000..756835d Binary files /dev/null and b/kernel/x86/._gemm_kernel_2x4_sse3.S differ diff --git a/kernel/x86/._gemm_kernel_4x2_core2.S b/kernel/x86/._gemm_kernel_4x2_core2.S new file mode 100644 index 0000000..b38d390 Binary files /dev/null and b/kernel/x86/._gemm_kernel_4x2_core2.S differ diff --git a/kernel/x86/._gemm_kernel_4x2_sse2.S b/kernel/x86/._gemm_kernel_4x2_sse2.S new file mode 100644 index 0000000..51b5e0b Binary files /dev/null and b/kernel/x86/._gemm_kernel_4x2_sse2.S differ diff --git a/kernel/x86/._gemm_kernel_4x4_barcelona.S b/kernel/x86/._gemm_kernel_4x4_barcelona.S new file mode 100644 index 0000000..286bb5d Binary files /dev/null and b/kernel/x86/._gemm_kernel_4x4_barcelona.S differ diff --git a/kernel/x86/._gemm_kernel_4x4_penryn.S b/kernel/x86/._gemm_kernel_4x4_penryn.S new file mode 100644 index 0000000..a38035d Binary files /dev/null and b/kernel/x86/._gemm_kernel_4x4_penryn.S differ diff --git a/kernel/x86/._gemm_kernel_4x4_sse.S b/kernel/x86/._gemm_kernel_4x4_sse.S new file mode 100644 index 0000000..e47c29e Binary files /dev/null and b/kernel/x86/._gemm_kernel_4x4_sse.S differ diff --git a/kernel/x86/._gemm_kernel_4x4_sse3.S b/kernel/x86/._gemm_kernel_4x4_sse3.S new file mode 100644 index 0000000..e9ac027 Binary files /dev/null and b/kernel/x86/._gemm_kernel_4x4_sse3.S differ diff --git a/kernel/x86/._gemm_kernel_8x1_sse2.S b/kernel/x86/._gemm_kernel_8x1_sse2.S new file mode 100644 index 0000000..92c4f0d Binary files /dev/null and b/kernel/x86/._gemm_kernel_8x1_sse2.S differ diff --git a/kernel/x86/._gemm_kernel_8x2_core2.S b/kernel/x86/._gemm_kernel_8x2_core2.S new file mode 100644 index 0000000..fbf789d Binary files /dev/null and b/kernel/x86/._gemm_kernel_8x2_core2.S differ diff --git a/kernel/x86/._gemm_kernel_8x2_sse.S b/kernel/x86/._gemm_kernel_8x2_sse.S new file mode 100644 index 0000000..bbca3c9 Binary files /dev/null and b/kernel/x86/._gemm_kernel_8x2_sse.S differ diff --git a/kernel/x86/._gemm_ncopy_2.S b/kernel/x86/._gemm_ncopy_2.S new file mode 100644 index 0000000..3d8c565 Binary files /dev/null and b/kernel/x86/._gemm_ncopy_2.S differ diff --git a/kernel/x86/._gemm_ncopy_2_sse.S b/kernel/x86/._gemm_ncopy_2_sse.S new file mode 100644 index 0000000..9ace746 Binary files /dev/null and b/kernel/x86/._gemm_ncopy_2_sse.S differ diff --git a/kernel/x86/._gemm_ncopy_4_sse.S b/kernel/x86/._gemm_ncopy_4_sse.S new file mode 100644 index 0000000..fea6579 Binary files /dev/null and b/kernel/x86/._gemm_ncopy_4_sse.S differ diff --git a/kernel/x86/._gemm_tcopy_2.S b/kernel/x86/._gemm_tcopy_2.S new file mode 100644 index 0000000..12522f2 Binary files /dev/null and b/kernel/x86/._gemm_tcopy_2.S differ diff --git a/kernel/x86/._gemm_tcopy_2_sse.S b/kernel/x86/._gemm_tcopy_2_sse.S new file mode 100644 index 0000000..7b35596 Binary files /dev/null and b/kernel/x86/._gemm_tcopy_2_sse.S differ diff --git a/kernel/x86/._gemm_tcopy_4_sse.S b/kernel/x86/._gemm_tcopy_4_sse.S new file mode 100644 index 0000000..dea3762 Binary files /dev/null and b/kernel/x86/._gemm_tcopy_4_sse.S differ diff --git a/kernel/x86/._gemv_n.S b/kernel/x86/._gemv_n.S new file mode 100644 index 0000000..0554838 Binary files /dev/null and b/kernel/x86/._gemv_n.S differ diff --git a/kernel/x86/._gemv_n_atom.S b/kernel/x86/._gemv_n_atom.S new file mode 100644 index 0000000..1d5c7d4 Binary files /dev/null and b/kernel/x86/._gemv_n_atom.S differ diff --git a/kernel/x86/._gemv_n_sse.S b/kernel/x86/._gemv_n_sse.S new file mode 100644 index 0000000..5bfb70f Binary files /dev/null and b/kernel/x86/._gemv_n_sse.S differ diff --git a/kernel/x86/._gemv_n_sse2.S b/kernel/x86/._gemv_n_sse2.S new file mode 100644 index 0000000..5a7c53d Binary files /dev/null and b/kernel/x86/._gemv_n_sse2.S differ diff --git a/kernel/x86/._gemv_t.S b/kernel/x86/._gemv_t.S new file mode 100644 index 0000000..7d182a3 Binary files /dev/null and b/kernel/x86/._gemv_t.S differ diff --git a/kernel/x86/._gemv_t_atom.S b/kernel/x86/._gemv_t_atom.S new file mode 100644 index 0000000..29dba5b Binary files /dev/null and b/kernel/x86/._gemv_t_atom.S differ diff --git a/kernel/x86/._gemv_t_sse.S b/kernel/x86/._gemv_t_sse.S new file mode 100644 index 0000000..15adb29 Binary files /dev/null and b/kernel/x86/._gemv_t_sse.S differ diff --git a/kernel/x86/._gemv_t_sse2.S b/kernel/x86/._gemv_t_sse2.S new file mode 100644 index 0000000..e44d545 Binary files /dev/null and b/kernel/x86/._gemv_t_sse2.S differ diff --git a/kernel/x86/._iamax.S b/kernel/x86/._iamax.S new file mode 100644 index 0000000..181f9ec Binary files /dev/null and b/kernel/x86/._iamax.S differ diff --git a/kernel/x86/._iamax_sse.S b/kernel/x86/._iamax_sse.S new file mode 100644 index 0000000..d886869 Binary files /dev/null and b/kernel/x86/._iamax_sse.S differ diff --git a/kernel/x86/._iamax_sse2.S b/kernel/x86/._iamax_sse2.S new file mode 100644 index 0000000..cd85397 Binary files /dev/null and b/kernel/x86/._iamax_sse2.S differ diff --git a/kernel/x86/._izamax.S b/kernel/x86/._izamax.S new file mode 100644 index 0000000..65511b0 Binary files /dev/null and b/kernel/x86/._izamax.S differ diff --git a/kernel/x86/._izamax_sse.S b/kernel/x86/._izamax_sse.S new file mode 100644 index 0000000..827f721 Binary files /dev/null and b/kernel/x86/._izamax_sse.S differ diff --git a/kernel/x86/._izamax_sse2.S b/kernel/x86/._izamax_sse2.S new file mode 100644 index 0000000..19bdca5 Binary files /dev/null and b/kernel/x86/._izamax_sse2.S differ diff --git a/kernel/x86/._lsame.S b/kernel/x86/._lsame.S new file mode 100644 index 0000000..a1a8e04 Binary files /dev/null and b/kernel/x86/._lsame.S differ diff --git a/kernel/x86/._nrm2.S b/kernel/x86/._nrm2.S new file mode 100644 index 0000000..646f5fc Binary files /dev/null and b/kernel/x86/._nrm2.S differ diff --git a/kernel/x86/._nrm2_sse.S b/kernel/x86/._nrm2_sse.S new file mode 100644 index 0000000..1853bcf Binary files /dev/null and b/kernel/x86/._nrm2_sse.S differ diff --git a/kernel/x86/._qaxpy.S b/kernel/x86/._qaxpy.S new file mode 100644 index 0000000..2a1c304 Binary files /dev/null and b/kernel/x86/._qaxpy.S differ diff --git a/kernel/x86/._qconjg.S b/kernel/x86/._qconjg.S new file mode 100644 index 0000000..3defcb3 Binary files /dev/null and b/kernel/x86/._qconjg.S differ diff --git a/kernel/x86/._qdot.S b/kernel/x86/._qdot.S new file mode 100644 index 0000000..ac2da0b Binary files /dev/null and b/kernel/x86/._qdot.S differ diff --git a/kernel/x86/._qgemm_kernel_2x2.S b/kernel/x86/._qgemm_kernel_2x2.S new file mode 100644 index 0000000..b5cbbe5 Binary files /dev/null and b/kernel/x86/._qgemm_kernel_2x2.S differ diff --git a/kernel/x86/._qgemv_n.S b/kernel/x86/._qgemv_n.S new file mode 100644 index 0000000..b32b483 Binary files /dev/null and b/kernel/x86/._qgemv_n.S differ diff --git a/kernel/x86/._qgemv_t.S b/kernel/x86/._qgemv_t.S new file mode 100644 index 0000000..a4d98eb Binary files /dev/null and b/kernel/x86/._qgemv_t.S differ diff --git a/kernel/x86/._qtrsm_kernel_LN_2x2.S b/kernel/x86/._qtrsm_kernel_LN_2x2.S new file mode 100644 index 0000000..5b9c537 Binary files /dev/null and b/kernel/x86/._qtrsm_kernel_LN_2x2.S differ diff --git a/kernel/x86/._qtrsm_kernel_LT_2x2.S b/kernel/x86/._qtrsm_kernel_LT_2x2.S new file mode 100644 index 0000000..99b3e63 Binary files /dev/null and b/kernel/x86/._qtrsm_kernel_LT_2x2.S differ diff --git a/kernel/x86/._qtrsm_kernel_RT_2x2.S b/kernel/x86/._qtrsm_kernel_RT_2x2.S new file mode 100644 index 0000000..2c26d10 Binary files /dev/null and b/kernel/x86/._qtrsm_kernel_RT_2x2.S differ diff --git a/kernel/x86/._rot.S b/kernel/x86/._rot.S new file mode 100644 index 0000000..f37f7ef Binary files /dev/null and b/kernel/x86/._rot.S differ diff --git a/kernel/x86/._rot_sse.S b/kernel/x86/._rot_sse.S new file mode 100644 index 0000000..6572ad9 Binary files /dev/null and b/kernel/x86/._rot_sse.S differ diff --git a/kernel/x86/._rot_sse2.S b/kernel/x86/._rot_sse2.S new file mode 100644 index 0000000..a23f33a Binary files /dev/null and b/kernel/x86/._rot_sse2.S differ diff --git a/kernel/x86/._scal.S b/kernel/x86/._scal.S new file mode 100644 index 0000000..8a6a9ed Binary files /dev/null and b/kernel/x86/._scal.S differ diff --git a/kernel/x86/._scal_sse.S b/kernel/x86/._scal_sse.S new file mode 100644 index 0000000..670d1b3 Binary files /dev/null and b/kernel/x86/._scal_sse.S differ diff --git a/kernel/x86/._scal_sse2.S b/kernel/x86/._scal_sse2.S new file mode 100644 index 0000000..bddd63e Binary files /dev/null and b/kernel/x86/._scal_sse2.S differ diff --git a/kernel/x86/._staticbuffer.S b/kernel/x86/._staticbuffer.S new file mode 100644 index 0000000..9b19ab0 Binary files /dev/null and b/kernel/x86/._staticbuffer.S differ diff --git a/kernel/x86/._swap.S b/kernel/x86/._swap.S new file mode 100644 index 0000000..218c68a Binary files /dev/null and b/kernel/x86/._swap.S differ diff --git a/kernel/x86/._swap_sse.S b/kernel/x86/._swap_sse.S new file mode 100644 index 0000000..1c45f5d Binary files /dev/null and b/kernel/x86/._swap_sse.S differ diff --git a/kernel/x86/._swap_sse2.S b/kernel/x86/._swap_sse2.S new file mode 100644 index 0000000..e48e9bb Binary files /dev/null and b/kernel/x86/._swap_sse2.S differ diff --git a/kernel/x86/._trsm_kernel_LN_2x2.S b/kernel/x86/._trsm_kernel_LN_2x2.S new file mode 100644 index 0000000..dd8cd8e Binary files /dev/null and b/kernel/x86/._trsm_kernel_LN_2x2.S differ diff --git a/kernel/x86/._trsm_kernel_LN_2x2_atom.S b/kernel/x86/._trsm_kernel_LN_2x2_atom.S new file mode 100644 index 0000000..460df2b Binary files /dev/null and b/kernel/x86/._trsm_kernel_LN_2x2_atom.S differ diff --git a/kernel/x86/._trsm_kernel_LN_2x4_penryn.S b/kernel/x86/._trsm_kernel_LN_2x4_penryn.S new file mode 100644 index 0000000..a33ae21 Binary files /dev/null and b/kernel/x86/._trsm_kernel_LN_2x4_penryn.S differ diff --git a/kernel/x86/._trsm_kernel_LN_2x4_sse2.S b/kernel/x86/._trsm_kernel_LN_2x4_sse2.S new file mode 100644 index 0000000..032da62 Binary files /dev/null and b/kernel/x86/._trsm_kernel_LN_2x4_sse2.S differ diff --git a/kernel/x86/._trsm_kernel_LN_2x4_sse3.S b/kernel/x86/._trsm_kernel_LN_2x4_sse3.S new file mode 100644 index 0000000..8b06766 Binary files /dev/null and b/kernel/x86/._trsm_kernel_LN_2x4_sse3.S differ diff --git a/kernel/x86/._trsm_kernel_LN_4x2_core2.S b/kernel/x86/._trsm_kernel_LN_4x2_core2.S new file mode 100644 index 0000000..6055754 Binary files /dev/null and b/kernel/x86/._trsm_kernel_LN_4x2_core2.S differ diff --git a/kernel/x86/._trsm_kernel_LN_4x2_sse2.S b/kernel/x86/._trsm_kernel_LN_4x2_sse2.S new file mode 100644 index 0000000..48a8c4e Binary files /dev/null and b/kernel/x86/._trsm_kernel_LN_4x2_sse2.S differ diff --git a/kernel/x86/._trsm_kernel_LN_4x4_penryn.S b/kernel/x86/._trsm_kernel_LN_4x4_penryn.S new file mode 100644 index 0000000..9b13c86 Binary files /dev/null and b/kernel/x86/._trsm_kernel_LN_4x4_penryn.S differ diff --git a/kernel/x86/._trsm_kernel_LN_4x4_sse.S b/kernel/x86/._trsm_kernel_LN_4x4_sse.S new file mode 100644 index 0000000..3b83634 Binary files /dev/null and b/kernel/x86/._trsm_kernel_LN_4x4_sse.S differ diff --git a/kernel/x86/._trsm_kernel_LN_8x2_sse.S b/kernel/x86/._trsm_kernel_LN_8x2_sse.S new file mode 100644 index 0000000..8aa5b02 Binary files /dev/null and b/kernel/x86/._trsm_kernel_LN_8x2_sse.S differ diff --git a/kernel/x86/._trsm_kernel_LT_1x4.S b/kernel/x86/._trsm_kernel_LT_1x4.S new file mode 100644 index 0000000..f055e9b Binary files /dev/null and b/kernel/x86/._trsm_kernel_LT_1x4.S differ diff --git a/kernel/x86/._trsm_kernel_LT_2x2.S b/kernel/x86/._trsm_kernel_LT_2x2.S new file mode 100644 index 0000000..8060511 Binary files /dev/null and b/kernel/x86/._trsm_kernel_LT_2x2.S differ diff --git a/kernel/x86/._trsm_kernel_LT_2x2_atom.S b/kernel/x86/._trsm_kernel_LT_2x2_atom.S new file mode 100644 index 0000000..b6102ff Binary files /dev/null and b/kernel/x86/._trsm_kernel_LT_2x2_atom.S differ diff --git a/kernel/x86/._trsm_kernel_LT_2x4_penryn.S b/kernel/x86/._trsm_kernel_LT_2x4_penryn.S new file mode 100644 index 0000000..b0847e6 Binary files /dev/null and b/kernel/x86/._trsm_kernel_LT_2x4_penryn.S differ diff --git a/kernel/x86/._trsm_kernel_LT_2x4_sse2.S b/kernel/x86/._trsm_kernel_LT_2x4_sse2.S new file mode 100644 index 0000000..cc8c556 Binary files /dev/null and b/kernel/x86/._trsm_kernel_LT_2x4_sse2.S differ diff --git a/kernel/x86/._trsm_kernel_LT_2x4_sse3.S b/kernel/x86/._trsm_kernel_LT_2x4_sse3.S new file mode 100644 index 0000000..ece3c95 Binary files /dev/null and b/kernel/x86/._trsm_kernel_LT_2x4_sse3.S differ diff --git a/kernel/x86/._trsm_kernel_LT_4x2_core2.S b/kernel/x86/._trsm_kernel_LT_4x2_core2.S new file mode 100644 index 0000000..604a5f5 Binary files /dev/null and b/kernel/x86/._trsm_kernel_LT_4x2_core2.S differ diff --git a/kernel/x86/._trsm_kernel_LT_4x2_sse2.S b/kernel/x86/._trsm_kernel_LT_4x2_sse2.S new file mode 100644 index 0000000..e2268ba Binary files /dev/null and b/kernel/x86/._trsm_kernel_LT_4x2_sse2.S differ diff --git a/kernel/x86/._trsm_kernel_LT_4x4_penryn.S b/kernel/x86/._trsm_kernel_LT_4x4_penryn.S new file mode 100644 index 0000000..5330b6d Binary files /dev/null and b/kernel/x86/._trsm_kernel_LT_4x4_penryn.S differ diff --git a/kernel/x86/._trsm_kernel_LT_4x4_sse.S b/kernel/x86/._trsm_kernel_LT_4x4_sse.S new file mode 100644 index 0000000..c785b61 Binary files /dev/null and b/kernel/x86/._trsm_kernel_LT_4x4_sse.S differ diff --git a/kernel/x86/._trsm_kernel_LT_8x2_sse.S b/kernel/x86/._trsm_kernel_LT_8x2_sse.S new file mode 100644 index 0000000..d919b2c Binary files /dev/null and b/kernel/x86/._trsm_kernel_LT_8x2_sse.S differ diff --git a/kernel/x86/._trsm_kernel_RT_1x4.S b/kernel/x86/._trsm_kernel_RT_1x4.S new file mode 100644 index 0000000..ce478f7 Binary files /dev/null and b/kernel/x86/._trsm_kernel_RT_1x4.S differ diff --git a/kernel/x86/._trsm_kernel_RT_2x2.S b/kernel/x86/._trsm_kernel_RT_2x2.S new file mode 100644 index 0000000..a60d4c7 Binary files /dev/null and b/kernel/x86/._trsm_kernel_RT_2x2.S differ diff --git a/kernel/x86/._trsm_kernel_RT_2x2_atom.S b/kernel/x86/._trsm_kernel_RT_2x2_atom.S new file mode 100644 index 0000000..77b93c1 Binary files /dev/null and b/kernel/x86/._trsm_kernel_RT_2x2_atom.S differ diff --git a/kernel/x86/._trsm_kernel_RT_2x4_penryn.S b/kernel/x86/._trsm_kernel_RT_2x4_penryn.S new file mode 100644 index 0000000..00b097f Binary files /dev/null and b/kernel/x86/._trsm_kernel_RT_2x4_penryn.S differ diff --git a/kernel/x86/._trsm_kernel_RT_2x4_sse2.S b/kernel/x86/._trsm_kernel_RT_2x4_sse2.S new file mode 100644 index 0000000..cb680e6 Binary files /dev/null and b/kernel/x86/._trsm_kernel_RT_2x4_sse2.S differ diff --git a/kernel/x86/._trsm_kernel_RT_2x4_sse3.S b/kernel/x86/._trsm_kernel_RT_2x4_sse3.S new file mode 100644 index 0000000..2b50941 Binary files /dev/null and b/kernel/x86/._trsm_kernel_RT_2x4_sse3.S differ diff --git a/kernel/x86/._trsm_kernel_RT_4x2_core2.S b/kernel/x86/._trsm_kernel_RT_4x2_core2.S new file mode 100644 index 0000000..788187a Binary files /dev/null and b/kernel/x86/._trsm_kernel_RT_4x2_core2.S differ diff --git a/kernel/x86/._trsm_kernel_RT_4x2_sse2.S b/kernel/x86/._trsm_kernel_RT_4x2_sse2.S new file mode 100644 index 0000000..948de0d Binary files /dev/null and b/kernel/x86/._trsm_kernel_RT_4x2_sse2.S differ diff --git a/kernel/x86/._trsm_kernel_RT_4x4_penryn.S b/kernel/x86/._trsm_kernel_RT_4x4_penryn.S new file mode 100644 index 0000000..d12dd25 Binary files /dev/null and b/kernel/x86/._trsm_kernel_RT_4x4_penryn.S differ diff --git a/kernel/x86/._trsm_kernel_RT_4x4_sse.S b/kernel/x86/._trsm_kernel_RT_4x4_sse.S new file mode 100644 index 0000000..9349b4d Binary files /dev/null and b/kernel/x86/._trsm_kernel_RT_4x4_sse.S differ diff --git a/kernel/x86/._trsm_kernel_RT_8x2_sse.S b/kernel/x86/._trsm_kernel_RT_8x2_sse.S new file mode 100644 index 0000000..87373d9 Binary files /dev/null and b/kernel/x86/._trsm_kernel_RT_8x2_sse.S differ diff --git a/kernel/x86/._xaxpy.S b/kernel/x86/._xaxpy.S new file mode 100644 index 0000000..4ad7c74 Binary files /dev/null and b/kernel/x86/._xaxpy.S differ diff --git a/kernel/x86/._xdot.S b/kernel/x86/._xdot.S new file mode 100644 index 0000000..2d6a1dd Binary files /dev/null and b/kernel/x86/._xdot.S differ diff --git a/kernel/x86/._xgemm3m_kernel_2x2.S b/kernel/x86/._xgemm3m_kernel_2x2.S new file mode 100644 index 0000000..8c37efd Binary files /dev/null and b/kernel/x86/._xgemm3m_kernel_2x2.S differ diff --git a/kernel/x86/._xgemm_kernel_1x1.S b/kernel/x86/._xgemm_kernel_1x1.S new file mode 100644 index 0000000..13a26f6 Binary files /dev/null and b/kernel/x86/._xgemm_kernel_1x1.S differ diff --git a/kernel/x86/._xgemv_n.S b/kernel/x86/._xgemv_n.S new file mode 100644 index 0000000..b706dbc Binary files /dev/null and b/kernel/x86/._xgemv_n.S differ diff --git a/kernel/x86/._xgemv_t.S b/kernel/x86/._xgemv_t.S new file mode 100644 index 0000000..2d023b6 Binary files /dev/null and b/kernel/x86/._xgemv_t.S differ diff --git a/kernel/x86/._xtrsm_kernel_LT_1x1.S b/kernel/x86/._xtrsm_kernel_LT_1x1.S new file mode 100644 index 0000000..61e28ef Binary files /dev/null and b/kernel/x86/._xtrsm_kernel_LT_1x1.S differ diff --git a/kernel/x86/._zamax.S b/kernel/x86/._zamax.S new file mode 100644 index 0000000..5a1c49d Binary files /dev/null and b/kernel/x86/._zamax.S differ diff --git a/kernel/x86/._zamax_sse.S b/kernel/x86/._zamax_sse.S new file mode 100644 index 0000000..549d60f Binary files /dev/null and b/kernel/x86/._zamax_sse.S differ diff --git a/kernel/x86/._zamax_sse2.S b/kernel/x86/._zamax_sse2.S new file mode 100644 index 0000000..df0e9f4 Binary files /dev/null and b/kernel/x86/._zamax_sse2.S differ diff --git a/kernel/x86/._zasum.S b/kernel/x86/._zasum.S new file mode 100644 index 0000000..bb8bfd0 Binary files /dev/null and b/kernel/x86/._zasum.S differ diff --git a/kernel/x86/._zasum_sse.S b/kernel/x86/._zasum_sse.S new file mode 100644 index 0000000..cee454c Binary files /dev/null and b/kernel/x86/._zasum_sse.S differ diff --git a/kernel/x86/._zasum_sse2.S b/kernel/x86/._zasum_sse2.S new file mode 100644 index 0000000..649b276 Binary files /dev/null and b/kernel/x86/._zasum_sse2.S differ diff --git a/kernel/x86/._zaxpy.S b/kernel/x86/._zaxpy.S new file mode 100644 index 0000000..6fbb2ba Binary files /dev/null and b/kernel/x86/._zaxpy.S differ diff --git a/kernel/x86/._zaxpy_sse.S b/kernel/x86/._zaxpy_sse.S new file mode 100644 index 0000000..2aec34f Binary files /dev/null and b/kernel/x86/._zaxpy_sse.S differ diff --git a/kernel/x86/._zaxpy_sse2.S b/kernel/x86/._zaxpy_sse2.S new file mode 100644 index 0000000..df4a98f Binary files /dev/null and b/kernel/x86/._zaxpy_sse2.S differ diff --git a/kernel/x86/._zcopy.S b/kernel/x86/._zcopy.S new file mode 100644 index 0000000..a2d7989 Binary files /dev/null and b/kernel/x86/._zcopy.S differ diff --git a/kernel/x86/._zcopy_sse.S b/kernel/x86/._zcopy_sse.S new file mode 100644 index 0000000..bcea785 Binary files /dev/null and b/kernel/x86/._zcopy_sse.S differ diff --git a/kernel/x86/._zcopy_sse2.S b/kernel/x86/._zcopy_sse2.S new file mode 100644 index 0000000..5d1a1d0 Binary files /dev/null and b/kernel/x86/._zcopy_sse2.S differ diff --git a/kernel/x86/._zdot.S b/kernel/x86/._zdot.S new file mode 100644 index 0000000..43f1daa Binary files /dev/null and b/kernel/x86/._zdot.S differ diff --git a/kernel/x86/._zdot_amd.S b/kernel/x86/._zdot_amd.S new file mode 100644 index 0000000..3c3018f Binary files /dev/null and b/kernel/x86/._zdot_amd.S differ diff --git a/kernel/x86/._zdot_sse.S b/kernel/x86/._zdot_sse.S new file mode 100644 index 0000000..22a1f07 Binary files /dev/null and b/kernel/x86/._zdot_sse.S differ diff --git a/kernel/x86/._zdot_sse2.S b/kernel/x86/._zdot_sse2.S new file mode 100644 index 0000000..c3b9f10 Binary files /dev/null and b/kernel/x86/._zdot_sse2.S differ diff --git a/kernel/x86/._zgemm3m_kernel_1x4_athlon.S b/kernel/x86/._zgemm3m_kernel_1x4_athlon.S new file mode 100644 index 0000000..0a498e6 Binary files /dev/null and b/kernel/x86/._zgemm3m_kernel_1x4_athlon.S differ diff --git a/kernel/x86/._zgemm3m_kernel_2x2_atom.S b/kernel/x86/._zgemm3m_kernel_2x2_atom.S new file mode 100644 index 0000000..261bd71 Binary files /dev/null and b/kernel/x86/._zgemm3m_kernel_2x2_atom.S differ diff --git a/kernel/x86/._zgemm3m_kernel_2x2_coppermine.S b/kernel/x86/._zgemm3m_kernel_2x2_coppermine.S new file mode 100644 index 0000000..9109d7c Binary files /dev/null and b/kernel/x86/._zgemm3m_kernel_2x2_coppermine.S differ diff --git a/kernel/x86/._zgemm3m_kernel_2x4_barcelona.S b/kernel/x86/._zgemm3m_kernel_2x4_barcelona.S new file mode 100644 index 0000000..bc32e85 Binary files /dev/null and b/kernel/x86/._zgemm3m_kernel_2x4_barcelona.S differ diff --git a/kernel/x86/._zgemm3m_kernel_2x4_opteron.S b/kernel/x86/._zgemm3m_kernel_2x4_opteron.S new file mode 100644 index 0000000..fde29fc Binary files /dev/null and b/kernel/x86/._zgemm3m_kernel_2x4_opteron.S differ diff --git a/kernel/x86/._zgemm3m_kernel_2x4_penryn.S b/kernel/x86/._zgemm3m_kernel_2x4_penryn.S new file mode 100644 index 0000000..6d61ee5 Binary files /dev/null and b/kernel/x86/._zgemm3m_kernel_2x4_penryn.S differ diff --git a/kernel/x86/._zgemm3m_kernel_2x4_prescott.S b/kernel/x86/._zgemm3m_kernel_2x4_prescott.S new file mode 100644 index 0000000..71ae3c0 Binary files /dev/null and b/kernel/x86/._zgemm3m_kernel_2x4_prescott.S differ diff --git a/kernel/x86/._zgemm3m_kernel_4x2_core2.S b/kernel/x86/._zgemm3m_kernel_4x2_core2.S new file mode 100644 index 0000000..044653b Binary files /dev/null and b/kernel/x86/._zgemm3m_kernel_4x2_core2.S differ diff --git a/kernel/x86/._zgemm3m_kernel_4x2_northwood.S b/kernel/x86/._zgemm3m_kernel_4x2_northwood.S new file mode 100644 index 0000000..bfa5348 Binary files /dev/null and b/kernel/x86/._zgemm3m_kernel_4x2_northwood.S differ diff --git a/kernel/x86/._zgemm3m_kernel_4x4_barcelona.S b/kernel/x86/._zgemm3m_kernel_4x4_barcelona.S new file mode 100644 index 0000000..d340752 Binary files /dev/null and b/kernel/x86/._zgemm3m_kernel_4x4_barcelona.S differ diff --git a/kernel/x86/._zgemm3m_kernel_4x4_opteron.S b/kernel/x86/._zgemm3m_kernel_4x4_opteron.S new file mode 100644 index 0000000..f620846 Binary files /dev/null and b/kernel/x86/._zgemm3m_kernel_4x4_opteron.S differ diff --git a/kernel/x86/._zgemm3m_kernel_4x4_penryn.S b/kernel/x86/._zgemm3m_kernel_4x4_penryn.S new file mode 100644 index 0000000..64cab4c Binary files /dev/null and b/kernel/x86/._zgemm3m_kernel_4x4_penryn.S differ diff --git a/kernel/x86/._zgemm3m_kernel_4x4_prescott.S b/kernel/x86/._zgemm3m_kernel_4x4_prescott.S new file mode 100644 index 0000000..1811071 Binary files /dev/null and b/kernel/x86/._zgemm3m_kernel_4x4_prescott.S differ diff --git a/kernel/x86/._zgemm3m_kernel_8x2_core2.S b/kernel/x86/._zgemm3m_kernel_8x2_core2.S new file mode 100644 index 0000000..eadca2a Binary files /dev/null and b/kernel/x86/._zgemm3m_kernel_8x2_core2.S differ diff --git a/kernel/x86/._zgemm3m_kernel_8x2_sse.S b/kernel/x86/._zgemm3m_kernel_8x2_sse.S new file mode 100644 index 0000000..0022ffd Binary files /dev/null and b/kernel/x86/._zgemm3m_kernel_8x2_sse.S differ diff --git a/kernel/x86/._zgemm_beta.S b/kernel/x86/._zgemm_beta.S new file mode 100644 index 0000000..befe072 Binary files /dev/null and b/kernel/x86/._zgemm_beta.S differ diff --git a/kernel/x86/._zgemm_kernel_1x1.S b/kernel/x86/._zgemm_kernel_1x1.S new file mode 100644 index 0000000..2bcea83 Binary files /dev/null and b/kernel/x86/._zgemm_kernel_1x1.S differ diff --git a/kernel/x86/._zgemm_kernel_1x1_atom.S b/kernel/x86/._zgemm_kernel_1x1_atom.S new file mode 100644 index 0000000..b5af4ec Binary files /dev/null and b/kernel/x86/._zgemm_kernel_1x1_atom.S differ diff --git a/kernel/x86/._zgemm_kernel_1x2.S b/kernel/x86/._zgemm_kernel_1x2.S new file mode 100644 index 0000000..a18db04 Binary files /dev/null and b/kernel/x86/._zgemm_kernel_1x2.S differ diff --git a/kernel/x86/._zgemm_kernel_1x2_3dnow.S b/kernel/x86/._zgemm_kernel_1x2_3dnow.S new file mode 100644 index 0000000..e3d0cb6 Binary files /dev/null and b/kernel/x86/._zgemm_kernel_1x2_3dnow.S differ diff --git a/kernel/x86/._zgemm_kernel_1x2_barcelona.S b/kernel/x86/._zgemm_kernel_1x2_barcelona.S new file mode 100644 index 0000000..45ed96c Binary files /dev/null and b/kernel/x86/._zgemm_kernel_1x2_barcelona.S differ diff --git a/kernel/x86/._zgemm_kernel_1x2_penryn.S b/kernel/x86/._zgemm_kernel_1x2_penryn.S new file mode 100644 index 0000000..486d96f Binary files /dev/null and b/kernel/x86/._zgemm_kernel_1x2_penryn.S differ diff --git a/kernel/x86/._zgemm_kernel_1x2_sse2.S b/kernel/x86/._zgemm_kernel_1x2_sse2.S new file mode 100644 index 0000000..6844e0e Binary files /dev/null and b/kernel/x86/._zgemm_kernel_1x2_sse2.S differ diff --git a/kernel/x86/._zgemm_kernel_1x2_sse3.S b/kernel/x86/._zgemm_kernel_1x2_sse3.S new file mode 100644 index 0000000..6b624e3 Binary files /dev/null and b/kernel/x86/._zgemm_kernel_1x2_sse3.S differ diff --git a/kernel/x86/._zgemm_kernel_2x1_core2.S b/kernel/x86/._zgemm_kernel_2x1_core2.S new file mode 100644 index 0000000..ac521cc Binary files /dev/null and b/kernel/x86/._zgemm_kernel_2x1_core2.S differ diff --git a/kernel/x86/._zgemm_kernel_2x1_sse2.S b/kernel/x86/._zgemm_kernel_2x1_sse2.S new file mode 100644 index 0000000..8430d83 Binary files /dev/null and b/kernel/x86/._zgemm_kernel_2x1_sse2.S differ diff --git a/kernel/x86/._zgemm_kernel_2x2_barcelona.S b/kernel/x86/._zgemm_kernel_2x2_barcelona.S new file mode 100644 index 0000000..d5c968d Binary files /dev/null and b/kernel/x86/._zgemm_kernel_2x2_barcelona.S differ diff --git a/kernel/x86/._zgemm_kernel_2x2_penryn.S b/kernel/x86/._zgemm_kernel_2x2_penryn.S new file mode 100644 index 0000000..43eb8f4 Binary files /dev/null and b/kernel/x86/._zgemm_kernel_2x2_penryn.S differ diff --git a/kernel/x86/._zgemm_kernel_2x2_sse.S b/kernel/x86/._zgemm_kernel_2x2_sse.S new file mode 100644 index 0000000..c8c95cb Binary files /dev/null and b/kernel/x86/._zgemm_kernel_2x2_sse.S differ diff --git a/kernel/x86/._zgemm_kernel_2x2_sse3.S b/kernel/x86/._zgemm_kernel_2x2_sse3.S new file mode 100644 index 0000000..8dd44f4 Binary files /dev/null and b/kernel/x86/._zgemm_kernel_2x2_sse3.S differ diff --git a/kernel/x86/._zgemm_kernel_4x1_core2.S b/kernel/x86/._zgemm_kernel_4x1_core2.S new file mode 100644 index 0000000..a0cb86b Binary files /dev/null and b/kernel/x86/._zgemm_kernel_4x1_core2.S differ diff --git a/kernel/x86/._zgemm_kernel_4x1_sse.S b/kernel/x86/._zgemm_kernel_4x1_sse.S new file mode 100644 index 0000000..93f8648 Binary files /dev/null and b/kernel/x86/._zgemm_kernel_4x1_sse.S differ diff --git a/kernel/x86/._zgemm_ncopy_2.S b/kernel/x86/._zgemm_ncopy_2.S new file mode 100644 index 0000000..7e21d84 Binary files /dev/null and b/kernel/x86/._zgemm_ncopy_2.S differ diff --git a/kernel/x86/._zgemm_tcopy_2.S b/kernel/x86/._zgemm_tcopy_2.S new file mode 100644 index 0000000..d2b6aee Binary files /dev/null and b/kernel/x86/._zgemm_tcopy_2.S differ diff --git a/kernel/x86/._zgemv_n.S b/kernel/x86/._zgemv_n.S new file mode 100644 index 0000000..b64f8fb Binary files /dev/null and b/kernel/x86/._zgemv_n.S differ diff --git a/kernel/x86/._zgemv_n_atom.S b/kernel/x86/._zgemv_n_atom.S new file mode 100644 index 0000000..9badfec Binary files /dev/null and b/kernel/x86/._zgemv_n_atom.S differ diff --git a/kernel/x86/._zgemv_n_sse.S b/kernel/x86/._zgemv_n_sse.S new file mode 100644 index 0000000..650d9cc Binary files /dev/null and b/kernel/x86/._zgemv_n_sse.S differ diff --git a/kernel/x86/._zgemv_n_sse2.S b/kernel/x86/._zgemv_n_sse2.S new file mode 100644 index 0000000..3744e9b Binary files /dev/null and b/kernel/x86/._zgemv_n_sse2.S differ diff --git a/kernel/x86/._zgemv_t.S b/kernel/x86/._zgemv_t.S new file mode 100644 index 0000000..6d1ae58 Binary files /dev/null and b/kernel/x86/._zgemv_t.S differ diff --git a/kernel/x86/._zgemv_t_atom.S b/kernel/x86/._zgemv_t_atom.S new file mode 100644 index 0000000..df5adbd Binary files /dev/null and b/kernel/x86/._zgemv_t_atom.S differ diff --git a/kernel/x86/._zgemv_t_sse.S b/kernel/x86/._zgemv_t_sse.S new file mode 100644 index 0000000..5e4c766 Binary files /dev/null and b/kernel/x86/._zgemv_t_sse.S differ diff --git a/kernel/x86/._zgemv_t_sse2.S b/kernel/x86/._zgemv_t_sse2.S new file mode 100644 index 0000000..f4b370e Binary files /dev/null and b/kernel/x86/._zgemv_t_sse2.S differ diff --git a/kernel/x86/._znrm2.S b/kernel/x86/._znrm2.S new file mode 100644 index 0000000..9a3f2c6 Binary files /dev/null and b/kernel/x86/._znrm2.S differ diff --git a/kernel/x86/._znrm2_sse.S b/kernel/x86/._znrm2_sse.S new file mode 100644 index 0000000..559479e Binary files /dev/null and b/kernel/x86/._znrm2_sse.S differ diff --git a/kernel/x86/._zrot.S b/kernel/x86/._zrot.S new file mode 100644 index 0000000..ae626a3 Binary files /dev/null and b/kernel/x86/._zrot.S differ diff --git a/kernel/x86/._zrot_sse.S b/kernel/x86/._zrot_sse.S new file mode 100644 index 0000000..2c1f53d Binary files /dev/null and b/kernel/x86/._zrot_sse.S differ diff --git a/kernel/x86/._zrot_sse2.S b/kernel/x86/._zrot_sse2.S new file mode 100644 index 0000000..51c16c5 Binary files /dev/null and b/kernel/x86/._zrot_sse2.S differ diff --git a/kernel/x86/._zscal.S b/kernel/x86/._zscal.S new file mode 100644 index 0000000..d97e54e Binary files /dev/null and b/kernel/x86/._zscal.S differ diff --git a/kernel/x86/._zscal_sse.S b/kernel/x86/._zscal_sse.S new file mode 100644 index 0000000..02ca6c1 Binary files /dev/null and b/kernel/x86/._zscal_sse.S differ diff --git a/kernel/x86/._zscal_sse2.S b/kernel/x86/._zscal_sse2.S new file mode 100644 index 0000000..5b0bd7e Binary files /dev/null and b/kernel/x86/._zscal_sse2.S differ diff --git a/kernel/x86/._zswap.S b/kernel/x86/._zswap.S new file mode 100644 index 0000000..2e9080e Binary files /dev/null and b/kernel/x86/._zswap.S differ diff --git a/kernel/x86/._zswap_sse.S b/kernel/x86/._zswap_sse.S new file mode 100644 index 0000000..8a9619f Binary files /dev/null and b/kernel/x86/._zswap_sse.S differ diff --git a/kernel/x86/._zswap_sse2.S b/kernel/x86/._zswap_sse2.S new file mode 100644 index 0000000..0c8d262 Binary files /dev/null and b/kernel/x86/._zswap_sse2.S differ diff --git a/kernel/x86/._ztrsm_kernel_LN_2x1_core2.S b/kernel/x86/._ztrsm_kernel_LN_2x1_core2.S new file mode 100644 index 0000000..0b0fc7e Binary files /dev/null and b/kernel/x86/._ztrsm_kernel_LN_2x1_core2.S differ diff --git a/kernel/x86/._ztrsm_kernel_LN_2x1_sse2.S b/kernel/x86/._ztrsm_kernel_LN_2x1_sse2.S new file mode 100644 index 0000000..73701ef Binary files /dev/null and b/kernel/x86/._ztrsm_kernel_LN_2x1_sse2.S differ diff --git a/kernel/x86/._ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86/._ztrsm_kernel_LN_2x2_penryn.S new file mode 100644 index 0000000..27e9b5a Binary files /dev/null and b/kernel/x86/._ztrsm_kernel_LN_2x2_penryn.S differ diff --git a/kernel/x86/._ztrsm_kernel_LN_2x2_sse.S b/kernel/x86/._ztrsm_kernel_LN_2x2_sse.S new file mode 100644 index 0000000..d23d44a Binary files /dev/null and b/kernel/x86/._ztrsm_kernel_LN_2x2_sse.S differ diff --git a/kernel/x86/._ztrsm_kernel_LN_4x1_sse.S b/kernel/x86/._ztrsm_kernel_LN_4x1_sse.S new file mode 100644 index 0000000..050888c Binary files /dev/null and b/kernel/x86/._ztrsm_kernel_LN_4x1_sse.S differ diff --git a/kernel/x86/._ztrsm_kernel_LT_1x1.S b/kernel/x86/._ztrsm_kernel_LT_1x1.S new file mode 100644 index 0000000..765a854 Binary files /dev/null and b/kernel/x86/._ztrsm_kernel_LT_1x1.S differ diff --git a/kernel/x86/._ztrsm_kernel_LT_1x1_atom.S b/kernel/x86/._ztrsm_kernel_LT_1x1_atom.S new file mode 100644 index 0000000..5d1be80 Binary files /dev/null and b/kernel/x86/._ztrsm_kernel_LT_1x1_atom.S differ diff --git a/kernel/x86/._ztrsm_kernel_LT_1x2_penryn.S b/kernel/x86/._ztrsm_kernel_LT_1x2_penryn.S new file mode 100644 index 0000000..cb4f5a6 Binary files /dev/null and b/kernel/x86/._ztrsm_kernel_LT_1x2_penryn.S differ diff --git a/kernel/x86/._ztrsm_kernel_LT_1x2_sse2.S b/kernel/x86/._ztrsm_kernel_LT_1x2_sse2.S new file mode 100644 index 0000000..7224ced Binary files /dev/null and b/kernel/x86/._ztrsm_kernel_LT_1x2_sse2.S differ diff --git a/kernel/x86/._ztrsm_kernel_LT_1x2_sse3.S b/kernel/x86/._ztrsm_kernel_LT_1x2_sse3.S new file mode 100644 index 0000000..c756af8 Binary files /dev/null and b/kernel/x86/._ztrsm_kernel_LT_1x2_sse3.S differ diff --git a/kernel/x86/._ztrsm_kernel_LT_2x1_core2.S b/kernel/x86/._ztrsm_kernel_LT_2x1_core2.S new file mode 100644 index 0000000..d9d934c Binary files /dev/null and b/kernel/x86/._ztrsm_kernel_LT_2x1_core2.S differ diff --git a/kernel/x86/._ztrsm_kernel_LT_2x1_sse2.S b/kernel/x86/._ztrsm_kernel_LT_2x1_sse2.S new file mode 100644 index 0000000..a86f10e Binary files /dev/null and b/kernel/x86/._ztrsm_kernel_LT_2x1_sse2.S differ diff --git a/kernel/x86/._ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86/._ztrsm_kernel_LT_2x2_penryn.S new file mode 100644 index 0000000..cd2c8d1 Binary files /dev/null and b/kernel/x86/._ztrsm_kernel_LT_2x2_penryn.S differ diff --git a/kernel/x86/._ztrsm_kernel_LT_2x2_sse.S b/kernel/x86/._ztrsm_kernel_LT_2x2_sse.S new file mode 100644 index 0000000..141ce63 Binary files /dev/null and b/kernel/x86/._ztrsm_kernel_LT_2x2_sse.S differ diff --git a/kernel/x86/._ztrsm_kernel_LT_4x1_sse.S b/kernel/x86/._ztrsm_kernel_LT_4x1_sse.S new file mode 100644 index 0000000..7cb7f64 Binary files /dev/null and b/kernel/x86/._ztrsm_kernel_LT_4x1_sse.S differ diff --git a/kernel/x86/._ztrsm_kernel_RT_1x2_penryn.S b/kernel/x86/._ztrsm_kernel_RT_1x2_penryn.S new file mode 100644 index 0000000..24c15d1 Binary files /dev/null and b/kernel/x86/._ztrsm_kernel_RT_1x2_penryn.S differ diff --git a/kernel/x86/._ztrsm_kernel_RT_1x2_sse2.S b/kernel/x86/._ztrsm_kernel_RT_1x2_sse2.S new file mode 100644 index 0000000..5026d97 Binary files /dev/null and b/kernel/x86/._ztrsm_kernel_RT_1x2_sse2.S differ diff --git a/kernel/x86/._ztrsm_kernel_RT_1x2_sse3.S b/kernel/x86/._ztrsm_kernel_RT_1x2_sse3.S new file mode 100644 index 0000000..be6b73f Binary files /dev/null and b/kernel/x86/._ztrsm_kernel_RT_1x2_sse3.S differ diff --git a/kernel/x86/._ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86/._ztrsm_kernel_RT_2x2_penryn.S new file mode 100644 index 0000000..57ae3e0 Binary files /dev/null and b/kernel/x86/._ztrsm_kernel_RT_2x2_penryn.S differ diff --git a/kernel/x86/._ztrsm_kernel_RT_2x2_sse.S b/kernel/x86/._ztrsm_kernel_RT_2x2_sse.S new file mode 100644 index 0000000..b48f5ba Binary files /dev/null and b/kernel/x86/._ztrsm_kernel_RT_2x2_sse.S differ diff --git a/kernel/x86/KERNEL b/kernel/x86/KERNEL new file mode 100644 index 0000000..69becf6 --- /dev/null +++ b/kernel/x86/KERNEL @@ -0,0 +1,398 @@ +GEMVDEP = ../l2param.h + +ifdef HAVE_SSE + +ifndef SAMAXKERNEL +SAMAXKERNEL = amax_sse.S +endif + +ifndef CAMAXKERNEL +CAMAXKERNEL = zamax_sse.S +endif + +ifndef SAMINKERNEL +SAMINKERNEL = amax_sse.S +endif + +ifndef CAMINKERNEL +CAMINKERNEL = zamax_sse.S +endif + +ifndef ISAMAXKERNEL +ISAMAXKERNEL = iamax_sse.S +endif + +ifndef ICAMAXKERNEL +ICAMAXKERNEL = izamax_sse.S +endif + +ifndef ISAMINKERNEL +ISAMINKERNEL = iamax_sse.S +endif + +ifndef ICAMINKERNEL +ICAMINKERNEL = izamax_sse.S +endif + +ifndef ISMAXKERNEL +ISMAXKERNEL = iamax_sse.S +endif + +ifndef ISMINKERNEL +ISMINKERNEL = iamax_sse.S +endif + +ifndef SMAXKERNEL +SMAXKERNEL = amax_sse.S +endif + +ifndef SMINKERNEL +SMINKERNEL = amax_sse.S +endif + +ifndef SASUMKERNEL +SASUMKERNEL = asum_sse.S +endif + +ifndef CASUMKERNEL +CASUMKERNEL = zasum_sse.S +endif + +ifndef SDOTKERNEL +SDOTKERNEL = dot_sse.S +endif + +ifndef CDOTKERNEL +CDOTKERNEL = zdot_sse.S +endif + +ifndef SCOPYKERNEL +SCOPYKERNEL = copy_sse.S +endif + +ifndef CCOPYKERNEL +CCOPYKERNEL = zcopy_sse.S +endif + +ifndef SSACALKERNEL +SSCALKERNEL = scal_sse.S +endif + +ifndef CSACALKERNEL +CSCALKERNEL = zscal_sse.S +endif + +ifndef SAXPYKERNEL +SAXPYKERNEL = axpy_sse.S +endif + +ifndef CAXPYKERNEL +CAXPYKERNEL = zaxpy_sse.S +endif + +ifndef SROTKERNEL +SROTKERNEL = rot_sse.S +endif + +ifndef CROTKERNEL +CROTKERNEL = zrot_sse.S +endif + +ifndef SSWAPKERNEL +SSWAPKERNEL = swap_sse.S +endif + +ifndef CSWAPKERNEL +CSWAPKERNEL = zswap_sse.S +endif + +ifndef SGEMVNKERNEL +SGEMVNKERNEL = gemv_n_sse.S +endif + +ifndef SGEMVTKERNEL +SGEMVTKERNEL = gemv_t_sse.S +endif + +ifndef CGEMVNKERNEL +CGEMVNKERNEL = zgemv_n_sse.S +endif + +ifndef CGEMVTKERNEL +CGEMVTKERNEL = zgemv_t_sse.S +endif + +endif + + +ifdef HAVE_SSE2 + +ifndef DAMAXKERNEL +DAMAXKERNEL = amax_sse2.S +endif + +ifndef ZAMAXKERNEL +ZAMAXKERNEL = zamax_sse2.S +endif + +ifndef DAMINKERNEL +DAMINKERNEL = amax_sse2.S +endif + +ifndef ZAMINKERNEL +ZAMINKERNEL = zamax_sse2.S +endif + +ifndef IDAMAXKERNEL +IDAMAXKERNEL = iamax_sse2.S +endif + +ifndef IZAMAXKERNEL +IZAMAXKERNEL = izamax_sse2.S +endif + +ifndef IDAMINKERNEL +IDAMINKERNEL = iamax_sse2.S +endif + +ifndef IZAMINKERNEL +IZAMINKERNEL = izamax_sse2.S +endif + +ifndef IDMAXKERNEL +IDMAXKERNEL = iamax_sse2.S +endif + +ifndef IDMINKERNEL +IDMINKERNEL = iamax_sse2.S +endif + +ifndef DMAXKERNEL +DMAXKERNEL = amax_sse2.S +endif + +ifndef DMINKERNEL +DMINKERNEL = amax_sse2.S +endif + +ifndef DDOTKERNEL +DDOTKERNEL = dot_sse2.S +endif + +ifndef ZDOTKERNEL +ZDOTKERNEL = zdot_sse2.S +endif + +ifndef DCOPYKERNEL +# DCOPYKERNEL = copy_sse2.S +endif + +ifndef ZCOPYKERNEL +ZCOPYKERNEL = zcopy_sse2.S +endif + +ifndef DSACALKERNEL +DSCALKERNEL = scal_sse2.S +endif + +ifndef ZSACALKERNEL +ZSCALKERNEL = zscal_sse2.S +endif + +ifndef DASUMKERNEL +DASUMKERNEL = asum_sse2.S +endif + +ifndef ZASUMKERNEL +ZASUMKERNEL = zasum_sse2.S +endif + +ifndef DAXPYKERNEL +DAXPYKERNEL = axpy_sse2.S +endif + +ifndef ZAXPYKERNEL +ZAXPYKERNEL = zaxpy_sse2.S +endif + +ifndef SNRM2KERNEL +SNRM2KERNEL = nrm2_sse.S +endif + +ifndef CNRM2KERNEL +CNRM2KERNEL = znrm2_sse.S +endif + +ifndef DROTKERNEL +DROTKERNEL = rot_sse2.S +endif + +ifndef ZROTKERNEL +ZROTKERNEL = zrot_sse2.S +endif + +ifndef DSWAPKERNEL +DSWAPKERNEL = swap_sse2.S +endif + +ifndef ZSWAPKERNEL +ZSWAPKERNEL = zswap_sse2.S +endif + +endif + + +ifndef SAMINKERNEL +SAMINKERNEL = amax.S +endif + +ifndef DAMINKERNEL +DAMINKERNEL = amax.S +endif + +ifndef QAMINKERNEL +QAMINKERNEL = amax.S +endif + +ifndef CAMINKERNEL +CAMINKERNEL = zamax.S +endif + +ifndef ZAMINKERNEL +ZAMINKERNEL = zamax.S +endif + +ifndef XAMINKERNEL +XAMINKERNEL = zamax.S +endif + +ifndef SMAXKERNEL +SMAXKERNEL = amax.S +endif + +ifndef DMAXKERNEL +DMAXKERNEL = amax.S +endif + +ifndef QMAXKERNEL +QMAXKERNEL = amax.S +endif + +ifndef SMINKERNEL +SMINKERNEL = amax.S +endif + +ifndef DMINKERNEL +DMINKERNEL = amax.S +endif + +ifndef QMINKERNEL +QMINKERNEL = amax.S +endif + +ifndef ISAMINKERNEL +ISAMINKERNEL = iamax.S +endif + +ifndef IDAMINKERNEL +IDAMINKERNEL = iamax.S +endif + +ifndef IQAMINKERNEL +IQAMINKERNEL = iamax.S +endif + +ifndef ICAMINKERNEL +ICAMINKERNEL = izamax.S +endif + +ifndef IZAMINKERNEL +IZAMINKERNEL = izamax.S +endif + +ifndef IXAMINKERNEL +IXAMINKERNEL = izamax.S +endif + +ifndef ISMINKERNEL +ISMINKERNEL = iamax.S +endif + +ifndef IDMINKERNEL +IDMINKERNEL = iamax.S +endif + +ifndef IQMINKERNEL +IQMINKERNEL = iamax.S +endif + +ifndef QDOTKERNEL +QDOTKERNEL = qdot.S +endif + +ifndef XDOTKERNEL +XDOTKERNEL = xdot.S +endif + +ifndef QAXPYKERNEL +QAXPYKERNEL = qaxpy.S +endif + +ifndef XAXPYKERNEL +XAXPYKERNEL = xaxpy.S +endif + +ifndef QGEMVNKERNEL +QGEMVNKERNEL = qgemv_n.S +endif + +ifndef QGEMVTKERNEL +QGEMVTKERNEL = qgemv_t.S +endif + +ifndef XGEMVNKERNEL +XGEMVNKERNEL = xgemv_n.S +endif + +ifndef XGEMVTKERNEL +XGEMVTKERNEL = xgemv_t.S +endif + +QGEMMKERNEL = qgemm_kernel_2x2.S +QGEMMINCOPY = +QGEMMITCOPY = +QGEMMONCOPY = ../generic/gemm_ncopy_2.c +QGEMMOTCOPY = ../generic/gemm_tcopy_2.c +QGEMMINCOPYOBJ = +QGEMMITCOPYOBJ = +QGEMMONCOPYOBJ = qgemm_oncopy$(TSUFFIX).$(SUFFIX) +QGEMMOTCOPYOBJ = qgemm_otcopy$(TSUFFIX).$(SUFFIX) + +XGEMMKERNEL = xgemm_kernel_1x1.S +XGEMMINCOPY = +XGEMMITCOPY = +XGEMMONCOPY = ../generic/zgemm_ncopy_1.c +XGEMMOTCOPY = ../generic/zgemm_tcopy_1.c +XGEMMINCOPYOBJ = +XGEMMITCOPYOBJ = +XGEMMONCOPYOBJ = xgemm_oncopy$(TSUFFIX).$(SUFFIX) +XGEMMOTCOPYOBJ = xgemm_otcopy$(TSUFFIX).$(SUFFIX) + +SGEMM_BETA = gemm_beta.S +DGEMM_BETA = gemm_beta.S +QGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = zgemm_beta.S +ZGEMM_BETA = zgemm_beta.S +XGEMM_BETA = ../generic/zgemm_beta.c + +QTRSMKERNEL_LN = qtrsm_kernel_LN_2x2.S +QTRSMKERNEL_LT = qtrsm_kernel_LT_2x2.S +QTRSMKERNEL_RN = qtrsm_kernel_LT_2x2.S +QTRSMKERNEL_RT = qtrsm_kernel_RT_2x2.S + +XTRSMKERNEL_LN = xtrsm_kernel_LT_1x1.S +XTRSMKERNEL_LT = xtrsm_kernel_LT_1x1.S +XTRSMKERNEL_RN = xtrsm_kernel_LT_1x1.S +XTRSMKERNEL_RT = xtrsm_kernel_LT_1x1.S + +XGEMM3MKERNEL = xgemm3m_kernel_2x2.S diff --git a/kernel/x86/KERNEL.ATHLON b/kernel/x86/KERNEL.ATHLON new file mode 100644 index 0000000..30f1e32 --- /dev/null +++ b/kernel/x86/KERNEL.ATHLON @@ -0,0 +1,63 @@ +SGEMMKERNEL = gemm_kernel_2x4_3dnow.S +SGEMMINCOPY = ../generic/gemm_ncopy_2.c +SGEMMITCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_1x4.S +DGEMMINCOPY = ../generic/gemm_ncopy_1.c +DGEMMITCOPY = ../generic/gemm_tcopy_1.c +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_1x2_3dnow.S +CGEMMINCOPY = ../generic/zgemm_ncopy_1.c +CGEMMITCOPY = ../generic/zgemm_tcopy_1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x2.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = trsm_kernel_LT_1x4.S +DTRSMKERNEL_LT = trsm_kernel_LT_1x4.S +DTRSMKERNEL_RN = trsm_kernel_LT_1x4.S +DTRSMKERNEL_RT = trsm_kernel_RT_1x4.S + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ifdef HAVE_SSE +CGEMM3MKERNEL = zgemm3m_kernel_8x2_sse.S +CGEMM3MKERNEL = zgemm3m_kernel_1x4_athlon.S +endif + +ZGEMM3MKERNEL = zgemm3m_kernel_1x4_athlon.S diff --git a/kernel/x86/KERNEL.ATOM b/kernel/x86/KERNEL.ATOM new file mode 100644 index 0000000..b0f6733 --- /dev/null +++ b/kernel/x86/KERNEL.ATOM @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_4x4_penryn.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x2_atom.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_2.S +DGEMMOTCOPY = gemm_tcopy_2.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_2x2_penryn.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x1_atom.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_4x4_penryn.S +STRSMKERNEL_LT = trsm_kernel_LT_4x4_penryn.S +STRSMKERNEL_RN = trsm_kernel_LT_4x4_penryn.S +STRSMKERNEL_RT = trsm_kernel_RT_4x4_penryn.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x2_atom.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x2_atom.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x2_atom.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x2_atom.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_penryn.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_penryn.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_penryn.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_penryn.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x1_atom.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x1_atom.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x1_atom.S +ZTRSMKERNEL_RT = ztrsm_kernel_LT_1x1_atom.S + +CGEMM3MKERNEL = zgemm3m_kernel_4x4_penryn.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x2_atom.S diff --git a/kernel/x86/KERNEL.BANIAS b/kernel/x86/KERNEL.BANIAS new file mode 100644 index 0000000..22c02f0 --- /dev/null +++ b/kernel/x86/KERNEL.BANIAS @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_8x2_sse.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x2.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x1_sse.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = ../generic/zgemm_ncopy_1.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_1.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x1.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x2_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x2_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x2_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x2_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x2.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x2.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x2.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x2.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x1_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x1_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x1_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_LT_4x1_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x1.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x1.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x1.S +ZTRSMKERNEL_RT = ztrsm_kernel_LT_1x1.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x2_sse.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x2_coppermine.S diff --git a/kernel/x86/KERNEL.BARCELONA b/kernel/x86/KERNEL.BARCELONA new file mode 100644 index 0000000..231350a --- /dev/null +++ b/kernel/x86/KERNEL.BARCELONA @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_4x4_barcelona.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x4_barcelona.S +DGEMMINCOPY = ../generic/gemm_ncopy_2.c +DGEMMITCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S + +CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S diff --git a/kernel/x86/KERNEL.COPPERMINE b/kernel/x86/KERNEL.COPPERMINE new file mode 100644 index 0000000..22c02f0 --- /dev/null +++ b/kernel/x86/KERNEL.COPPERMINE @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_8x2_sse.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x2.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x1_sse.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = ../generic/zgemm_ncopy_1.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_1.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x1.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x2_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x2_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x2_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x2_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x2.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x2.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x2.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x2.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x1_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x1_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x1_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_LT_4x1_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x1.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x1.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x1.S +ZTRSMKERNEL_RT = ztrsm_kernel_LT_1x1.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x2_sse.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x2_coppermine.S diff --git a/kernel/x86/KERNEL.CORE2 b/kernel/x86/KERNEL.CORE2 new file mode 100644 index 0000000..0c0659e --- /dev/null +++ b/kernel/x86/KERNEL.CORE2 @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_8x2_core2.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = gemm_ncopy_2.S +SGEMMOTCOPY = gemm_tcopy_2.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_4x2_core2.S +DGEMMINCOPY = gemm_ncopy_4_sse.S +DGEMMITCOPY = gemm_tcopy_4_sse.S +DGEMMONCOPY = gemm_ncopy_2_sse.S +DGEMMOTCOPY = gemm_tcopy_2_sse.S +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x1_core2.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = ../generic/zgemm_ncopy_1.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_1.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x1_core2.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x2_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x2_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x2_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x2_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x2_core2.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x2_core2.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x2_core2.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x2_core2.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x1_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x1_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x1_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_LT_4x1_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x1_core2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x1_core2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x1_core2.S +ZTRSMKERNEL_RT = ztrsm_kernel_LT_2x1_core2.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x2_core2.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x2_core2.S diff --git a/kernel/x86/KERNEL.DUNNINGTON b/kernel/x86/KERNEL.DUNNINGTON new file mode 100644 index 0000000..08e3543 --- /dev/null +++ b/kernel/x86/KERNEL.DUNNINGTON @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_4x4_penryn.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x4_penryn.S +DGEMMINCOPY = gemm_ncopy_2.S +DGEMMITCOPY = gemm_tcopy_2.S +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_2x2_penryn.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x2_penryn.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_4x4_penryn.S +STRSMKERNEL_LT = trsm_kernel_LT_4x4_penryn.S +STRSMKERNEL_RN = trsm_kernel_LT_4x4_penryn.S +STRSMKERNEL_RT = trsm_kernel_RT_4x4_penryn.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x4_penryn.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x4_penryn.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x4_penryn.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x4_penryn.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_penryn.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_penryn.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_penryn.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_penryn.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_penryn.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_penryn.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_penryn.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_penryn.S + +CGEMM3MKERNEL = zgemm3m_kernel_4x4_penryn.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x4_penryn.S diff --git a/kernel/x86/KERNEL.KATMAI b/kernel/x86/KERNEL.KATMAI new file mode 100644 index 0000000..93623e5 --- /dev/null +++ b/kernel/x86/KERNEL.KATMAI @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.COPPERMINE diff --git a/kernel/x86/KERNEL.NANO b/kernel/x86/KERNEL.NANO new file mode 100644 index 0000000..65b03ae --- /dev/null +++ b/kernel/x86/KERNEL.NANO @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.PENRYN diff --git a/kernel/x86/KERNEL.NEHALEM b/kernel/x86/KERNEL.NEHALEM new file mode 100644 index 0000000..65b03ae --- /dev/null +++ b/kernel/x86/KERNEL.NEHALEM @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.PENRYN diff --git a/kernel/x86/KERNEL.NORTHWOOD b/kernel/x86/KERNEL.NORTHWOOD new file mode 100644 index 0000000..ddf80e9 --- /dev/null +++ b/kernel/x86/KERNEL.NORTHWOOD @@ -0,0 +1,60 @@ +SGEMMKERNEL = gemm_kernel_8x2_sse.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = gemm_ncopy_2.S +SGEMMOTCOPY = gemm_tcopy_2.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_4x2_sse2.S +DGEMMINCOPY = gemm_ncopy_4_sse.S +DGEMMITCOPY = gemm_tcopy_4_sse.S +DGEMMONCOPY = gemm_ncopy_2.S +DGEMMOTCOPY = gemm_tcopy_2.S +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x1_sse.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = ../generic/zgemm_ncopy_1.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_1.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x1_sse2.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x2_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x2_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x2_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x2_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x2_sse2.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x2_sse2.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x2_sse2.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x2_sse2.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x1_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x1_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x1_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_LT_4x1_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x1_sse2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x1_sse2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x1_sse2.S +ZTRSMKERNEL_RT = ztrsm_kernel_LT_2x1_sse2.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x2_sse.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x2_northwood.S + diff --git a/kernel/x86/KERNEL.OPTERON b/kernel/x86/KERNEL.OPTERON new file mode 100644 index 0000000..7b8b137 --- /dev/null +++ b/kernel/x86/KERNEL.OPTERON @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_4x4_sse.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x4_sse2.S +DGEMMINCOPY = ../generic/gemm_ncopy_2.c +DGEMMITCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_2x2_sse.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x2_sse2.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S + +CGEMM3MKERNEL = zgemm3m_kernel_4x4_opteron.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x4_opteron.S diff --git a/kernel/x86/KERNEL.OPTERON_SSE3 b/kernel/x86/KERNEL.OPTERON_SSE3 new file mode 100644 index 0000000..05e7b25 --- /dev/null +++ b/kernel/x86/KERNEL.OPTERON_SSE3 @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.OPTERON diff --git a/kernel/x86/KERNEL.P5 b/kernel/x86/KERNEL.P5 new file mode 100644 index 0000000..12de178 --- /dev/null +++ b/kernel/x86/KERNEL.P5 @@ -0,0 +1,2 @@ +include $(KERNELDIR)/KERNEL.P6 + diff --git a/kernel/x86/KERNEL.P6 b/kernel/x86/KERNEL.P6 new file mode 100644 index 0000000..8a7500c --- /dev/null +++ b/kernel/x86/KERNEL.P6 @@ -0,0 +1,60 @@ +SGEMMKERNEL = gemm_kernel_2x2.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x2.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_1x1.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = ../generic/zgemm_ncopy_1.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_1.c +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x1.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_2x2.S +STRSMKERNEL_LT = trsm_kernel_LT_2x2.S +STRSMKERNEL_RN = trsm_kernel_LT_2x2.S +STRSMKERNEL_RT = trsm_kernel_RT_2x2.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x2.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x2.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x2.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x2.S + +CTRSMKERNEL_LN = ztrsm_kernel_LT_1x1.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_1x1.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_1x1.S +CTRSMKERNEL_RT = ztrsm_kernel_LT_1x1.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x1.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x1.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x1.S +ZTRSMKERNEL_RT = ztrsm_kernel_LT_1x1.S + +CGEMM3MKERNEL = zgemm3m_kernel_2x2_coppermine.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x2_coppermine.S + diff --git a/kernel/x86/KERNEL.PENRYN b/kernel/x86/KERNEL.PENRYN new file mode 100644 index 0000000..08e3543 --- /dev/null +++ b/kernel/x86/KERNEL.PENRYN @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_4x4_penryn.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x4_penryn.S +DGEMMINCOPY = gemm_ncopy_2.S +DGEMMITCOPY = gemm_tcopy_2.S +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_2x2_penryn.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x2_penryn.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_4x4_penryn.S +STRSMKERNEL_LT = trsm_kernel_LT_4x4_penryn.S +STRSMKERNEL_RN = trsm_kernel_LT_4x4_penryn.S +STRSMKERNEL_RT = trsm_kernel_RT_4x4_penryn.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x4_penryn.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x4_penryn.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x4_penryn.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x4_penryn.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_penryn.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_penryn.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_penryn.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_penryn.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_penryn.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_penryn.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_penryn.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_penryn.S + +CGEMM3MKERNEL = zgemm3m_kernel_4x4_penryn.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x4_penryn.S diff --git a/kernel/x86/KERNEL.PRESCOTT b/kernel/x86/KERNEL.PRESCOTT new file mode 100644 index 0000000..355e00f --- /dev/null +++ b/kernel/x86/KERNEL.PRESCOTT @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_4x4_sse3.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x4_sse3.S +DGEMMINCOPY = gemm_ncopy_2.S +DGEMMITCOPY = gemm_tcopy_2.S +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_2x2_sse3.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x2_sse3.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse3.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse3.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse3.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse3.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse3.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse3.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse3.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse3.S + +CGEMM3MKERNEL = zgemm3m_kernel_4x4_prescott.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x4_prescott.S diff --git a/kernel/x86/KERNEL.VIAC3 b/kernel/x86/KERNEL.VIAC3 new file mode 100644 index 0000000..94ade28 --- /dev/null +++ b/kernel/x86/KERNEL.VIAC3 @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.ATHLON diff --git a/kernel/x86/KERNEL.YONAH b/kernel/x86/KERNEL.YONAH new file mode 100644 index 0000000..5b3ecae --- /dev/null +++ b/kernel/x86/KERNEL.YONAH @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_4x4_sse3.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x4_sse3.S +DGEMMINCOPY = ../generic/gemm_ncopy_2.c +DGEMMITCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_2x2_sse3.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x2_sse3.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse3.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse3.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse3.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse3.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse3.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse3.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse3.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse3.S + +CGEMM3MKERNEL = zgemm3m_kernel_4x4_prescott.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x4_prescott.S diff --git a/kernel/x86/Makefile b/kernel/x86/Makefile new file mode 100644 index 0000000..efae70d --- /dev/null +++ b/kernel/x86/Makefile @@ -0,0 +1,2 @@ +clean :: + diff --git a/kernel/x86/amax.S b/kernel/x86/amax.S new file mode 100644 index 0000000..01c2bd6 --- /dev/null +++ b/kernel/x86/amax.S @@ -0,0 +1,315 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + + PROLOGUE + +#define M %ebx +#define INCX %esi +#define X %ecx +#define I %edx + +#ifndef USE_MIN +#define FMOV fcmovbe +#else +#define FMOV fcmovnbe +#endif + +#include "l1param.h" + + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_M, M + movl STACK_INCX, INCX + movl STACK_X, X + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + sall $BASE_SHIFT, INCX + + fldz + + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + fstp %st(0) + + FLD (X) +#ifdef USE_ABS + fabs +#endif + addl INCX, X + decl M + jle .L999 + + cmpl $SIZE, INCX + jne .L40 + + movl M, I + sarl $3, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 1 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 2 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 3 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 4 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 5 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 6 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 7 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + addl $8 * SIZE, X + + decl I + jg .L10 + ALIGN_4 + +.L20: + movl M, I + andl $7, I + jle .L999 + ALIGN_4 + + +.L21: + FLD 0 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + addl $1 * SIZE, X + decl I + jg .L21 + jmp .L999 + ALIGN_4 + +.L40: + movl M, I + sarl $3, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + decl I + jg .L50 + ALIGN_4 + +.L60: + movl M, I + andl $7, I + jle .L999 + ALIGN_4 + + +.L61: + FLD 0 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + addl INCX, X + decl I + jg .L61 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + ret + + EPILOGUE diff --git a/kernel/x86/amax_sse.S b/kernel/x86/amax_sse.S new file mode 100644 index 0000000..65792cf --- /dev/null +++ b/kernel/x86/amax_sse.S @@ -0,0 +1,510 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define M %ebx +#define X %ecx +#define INCX %edx +#define I %eax + +#ifdef USE_MIN +#define maxps minps +#define maxss minss +#endif + +#include "l1param.h" + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + + xorps %xmm0, %xmm0 + leal (, INCX, SIZE), INCX + + testl M, M + jle .L999 + +#ifdef USE_ABS +#ifndef HAVE_SSE2 + subl $8, %esp + movl $0x7fffffff, (%esp) + movss (%esp), %xmm3 + shufps $0, %xmm3, %xmm3 + addl $8, %esp +#else + pcmpeqb %xmm3, %xmm3 + psrld $1, %xmm3 +#endif +#endif + + movss (X), %xmm0 + shufps $0, %xmm0, %xmm0 +#ifdef USE_ABS + andps %xmm3, %xmm0 +#endif + movaps %xmm0, %xmm1 + addl INCX, X + decl M + jle .L999 + + cmpl $SIZE, INCX + jne .L40 + + subl $-32 * SIZE, X + + cmpl $3, M + jle .L17 + + testl $SIZE, X + je .L05 + + movss -32 * SIZE(X), %xmm4 + addl $SIZE, X + shufps $0, %xmm4, %xmm4 +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxps %xmm4, %xmm0 + + decl M + ALIGN_3 + +.L05: + testl $2 * SIZE, X + je .L06 + + movsd -32 * SIZE(X), %xmm4 + addl $2 * SIZE, X + unpcklps %xmm4, %xmm4 +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxps %xmm4, %xmm1 + + subl $2, M + ALIGN_3 + +.L06: + movl M, I + sarl $5, I + jle .L15 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -24 * SIZE(X), %xmm6 + movaps -20 * SIZE(X), %xmm7 + + decl I + jle .L12 + ALIGN_4 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxps %xmm5, %xmm1 + movaps -12 * SIZE(X), %xmm5 + +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxps %xmm6, %xmm0 + movaps -8 * SIZE(X), %xmm6 + +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxps %xmm7, %xmm1 + movaps -4 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxps %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxps %xmm5, %xmm1 + movaps 4 * SIZE(X), %xmm5 + +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxps %xmm6, %xmm0 + movaps 8 * SIZE(X), %xmm6 + +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxps %xmm7, %xmm1 + movaps 12 * SIZE(X), %xmm7 + + subl $-32 * SIZE, X + decl I + jg .L11 + ALIGN_4 + +.L12: +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxps %xmm5, %xmm1 + movaps -12 * SIZE(X), %xmm5 + +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxps %xmm6, %xmm0 + movaps -8 * SIZE(X), %xmm6 + +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxps %xmm7, %xmm1 + movaps -4 * SIZE(X), %xmm7 + +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxps %xmm4, %xmm0 + +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxps %xmm5, %xmm1 + +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxps %xmm6, %xmm0 + +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxps %xmm7, %xmm1 + + subl $-32 * SIZE, X + ALIGN_3 + + +.L15: + testl $16, M + je .L16 + + movaps -32 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxps %xmm4, %xmm0 + + movaps -28 * SIZE(X), %xmm5 +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxps %xmm5, %xmm1 + + movaps -24 * SIZE(X), %xmm6 +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxps %xmm6, %xmm0 + + movaps -20 * SIZE(X), %xmm7 +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxps %xmm7, %xmm1 + + addl $16 * SIZE, X + ALIGN_3 + +.L16: + testl $8, M + je .L17 + + movaps -32 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxps %xmm4, %xmm0 + + movaps -28 * SIZE(X), %xmm5 +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxps %xmm5, %xmm1 + addl $8 * SIZE, X + ALIGN_3 + +.L17: + testl $4, M + je .L18 + + movaps -32 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxps %xmm4, %xmm0 + addl $4 * SIZE, X + ALIGN_3 + +.L18: + testl $2, M + je .L19 + + movsd -32 * SIZE(X), %xmm4 + unpcklps %xmm4, %xmm4 +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxps %xmm4, %xmm1 + addl $2 * SIZE, X + ALIGN_3 + +.L19: + testl $1, M + je .L998 + + movss -32 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxss %xmm4, %xmm0 + jmp .L998 + ALIGN_3 + +.L40: + movl M, I + sarl $3, I + jle .L45 + ALIGN_4 + +.L41: + movss (X), %xmm4 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss (X), %xmm5 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxss %xmm5, %xmm1 + + movss (X), %xmm6 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxss %xmm6, %xmm0 + + movss (X), %xmm7 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxss %xmm7, %xmm1 + + movss (X), %xmm4 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss (X), %xmm5 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxss %xmm5, %xmm1 + + movss (X), %xmm6 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxss %xmm6, %xmm0 + + movss (X), %xmm7 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxss %xmm7, %xmm1 + + decl I + jg .L41 + ALIGN_4 + +.L45: + testl $4, M + je .L46 + + movss (X), %xmm4 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss (X), %xmm5 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxss %xmm5, %xmm1 + + movss (X), %xmm6 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxss %xmm6, %xmm0 + + movss (X), %xmm7 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxss %xmm7, %xmm1 + ALIGN_3 + +.L46: + testl $2, M + je .L47 + + movss (X), %xmm4 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss (X), %xmm5 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxss %xmm5, %xmm1 + ALIGN_3 + +.L47: + testl $1, M + je .L998 + + movss (X), %xmm4 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxss %xmm4, %xmm0 + ALIGN_4 + +.L998: + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + movhlps %xmm0, %xmm0 + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + maxss %xmm1, %xmm0 + ALIGN_4 + +.L999: + RESTOREREGISTERS + + subl $8, %esp + movss %xmm0, (%esp) + flds (%esp) + addl $8, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/amax_sse2.S b/kernel/x86/amax_sse2.S new file mode 100644 index 0000000..ad56244 --- /dev/null +++ b/kernel/x86/amax_sse2.S @@ -0,0 +1,518 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define M %ebx +#define X %ecx +#define INCX %edx +#define I %eax + +#ifdef USE_MIN +#define maxpd minpd +#define maxsd minsd +#endif + +#include "l1param.h" + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + + xorps %xmm0, %xmm0 + leal (, INCX, SIZE), INCX + + testl M, M + jle .L999 + +#ifdef USE_ABS + pcmpeqb %xmm3, %xmm3 + psrlq $1, %xmm3 +#endif + + movsd (X), %xmm0 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm0 +#endif + unpcklpd %xmm0, %xmm0 + movaps %xmm0, %xmm1 + decl M + jle .L999 + + cmpl $SIZE, INCX + jne .L40 + + subl $-16 * SIZE, X + + testl $SIZE, X + je .L05 + + movsd -16 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + unpcklpd %xmm4, %xmm4 + maxpd %xmm4, %xmm0 + addl $SIZE, X + decl M + jle .L998 + ALIGN_3 + +.L05: + movl M, I + sarl $4, I + jle .L15 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + decl I + jle .L12 + ALIGN_4 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxpd %xmm4, %xmm0 + movaps -8 * SIZE(X), %xmm4 + +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxpd %xmm5, %xmm1 + movaps -6 * SIZE(X), %xmm5 + +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxpd %xmm6, %xmm0 + movaps -4 * SIZE(X), %xmm6 + +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxpd %xmm7, %xmm1 + movaps -2 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxpd %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxpd %xmm5, %xmm1 + movaps 2 * SIZE(X), %xmm5 + +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxpd %xmm6, %xmm0 + movaps 4 * SIZE(X), %xmm6 + +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxpd %xmm7, %xmm1 + movaps 6 * SIZE(X), %xmm7 + + subl $-16 * SIZE, X + decl I + jg .L11 + ALIGN_4 + +.L12: +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxpd %xmm4, %xmm0 + movaps -8 * SIZE(X), %xmm4 + +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxpd %xmm5, %xmm1 + movaps -6 * SIZE(X), %xmm5 + +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxpd %xmm6, %xmm0 + movaps -4 * SIZE(X), %xmm6 + +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxpd %xmm7, %xmm1 + movaps -2 * SIZE(X), %xmm7 + +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxpd %xmm4, %xmm0 + +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxpd %xmm5, %xmm1 + +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxpd %xmm6, %xmm0 + +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxpd %xmm7, %xmm1 + + subl $-16 * SIZE, X + ALIGN_4 + +.L15: + testl $8, M + jle .L16 + + movaps -16 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movaps -14 * SIZE(X), %xmm5 +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movaps -12 * SIZE(X), %xmm6 +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxpd %xmm6, %xmm0 + + movaps -10 * SIZE(X), %xmm7 +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxpd %xmm7, %xmm1 + addl $8 * SIZE, X + ALIGN_3 + +.L16: + testl $4, M + jle .L17 + + movaps -16 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movaps -14 * SIZE(X), %xmm5 +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + addl $4 * SIZE, X + ALIGN_3 + +.L17: + testl $2, M + jle .L18 + + movaps -16 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxpd %xmm4, %xmm0 + addl $2 * SIZE, X + ALIGN_3 + +.L18: + testl $1, M + jle .L998 + + movsd -16 * SIZE(X), %xmm4 + unpcklpd %xmm4, %xmm4 +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxpd %xmm4, %xmm1 + jmp .L998 + ALIGN_3 + +.L40: + movl M, I + sarl $4, I + jle .L45 + ALIGN_4 + +.L41: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movsd (X), %xmm6 + addl INCX, X + movhps (X), %xmm6 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxpd %xmm6, %xmm0 + + movsd (X), %xmm7 + addl INCX, X + movhps (X), %xmm7 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxpd %xmm7, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movsd (X), %xmm6 + addl INCX, X + movhps (X), %xmm6 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxpd %xmm6, %xmm0 + + movsd (X), %xmm7 + addl INCX, X + movhps (X), %xmm7 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxpd %xmm7, %xmm1 + + decl I + jg .L41 + ALIGN_4 + +.L45: + andl $15, M + jle .L998 + + testl $8, M + je .L46 + + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movsd (X), %xmm6 + addl INCX, X + movhps (X), %xmm6 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxpd %xmm6, %xmm0 + + movsd (X), %xmm7 + addl INCX, X + movhps (X), %xmm7 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxpd %xmm7, %xmm1 + ALIGN_3 + +.L46: + testl $4, M + je .L47 + + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxpd %xmm5, %xmm1 + ALIGN_3 + +.L47: + testl $2, M + je .L48 + + movsd (X), %xmm6 + addl INCX, X + movhps (X), %xmm6 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxpd %xmm6, %xmm0 + ALIGN_3 + +.L48: + testl $1, M + je .L998 + + movsd (X), %xmm7 + unpcklpd %xmm7, %xmm7 +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxpd %xmm7, %xmm1 + ALIGN_4 + +.L998: + maxpd %xmm1, %xmm0 + movaps %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + ALIGN_4 + +.L999: + subl $8, %esp + movsd %xmm0, (%esp) + fldl (%esp) + addl $8, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/asum.S b/kernel/x86/asum.S new file mode 100644 index 0000000..e1b0a6e --- /dev/null +++ b/kernel/x86/asum.S @@ -0,0 +1,225 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define M %edx +#define X %ecx +#define INCX %esi + +#define I %eax + +#include "l1param.h" + + PROLOGUE + + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + + fldz + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + sall $BASE_SHIFT, INCX + fldz + fldz + fldz + cmpl $SIZE, INCX + jne .L40 + + movl M, I + sarl $3, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + FLD 2 * SIZE(X) + fabs + FLD 3 * SIZE(X) + fabs + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 4 * SIZE(X) + fabs + FLD 5 * SIZE(X) + fabs + FLD 6 * SIZE(X) + fabs + FLD 7 * SIZE(X) + fabs + + addl $8 * SIZE, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decl I + jg .L10 + ALIGN_4 + +.L20: + movl M, I + andl $7, I + jle .L998 + ALIGN_4 + + +.L21: + FLD (X) + fabs + faddp %st,%st(1) + addl $1 * SIZE, X + decl I + jg .L21 + jmp .L998 + ALIGN_4 + +.L40: + movl M, I + sarl $3, I + jle .L60 + ALIGN_4 + +.L50: + FLD (X) + addl INCX, X + fabs + FLD (X) + addl INCX, X + fabs + FLD (X) + addl INCX, X + fabs + FLD (X) + addl INCX, X + fabs + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD (X) + addl INCX, X + fabs + FLD (X) + addl INCX, X + fabs + FLD (X) + addl INCX, X + fabs + FLD (X) + addl INCX, X + fabs + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decl I + jg .L50 + ALIGN_4 + +.L60: + movl M, I + andl $7, I + jle .L998 + ALIGN_4 + + +.L61: + FLD (X) + addl INCX, X + fabs + faddp %st,%st(1) + decl I + jg .L61 + ALIGN_4 + +.L998: + faddp %st,%st(2) + faddp %st,%st(1) + faddp %st,%st(1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + ret + + EPILOGUE diff --git a/kernel/x86/asum_sse.S b/kernel/x86/asum_sse.S new file mode 100644 index 0000000..4506f29 --- /dev/null +++ b/kernel/x86/asum_sse.S @@ -0,0 +1,366 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define I %eax +#define M %ecx +#define X %esi +#define INCX %ebx + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + + xorps %xmm0, %xmm0 + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + xorps %xmm1, %xmm1 + +#ifdef HAVE_SSE2 + pcmpeqb %xmm3, %xmm3 + psrld $1, %xmm3 +#else + movl $0x7fffffff, STACK_M + movss STACK_M, %xmm3 + shufps $0, %xmm3, %xmm3 +#endif + + leal (, INCX, SIZE), INCX + + cmpl $SIZE, INCX + jne .L100 + + subl $-32 * SIZE, X + + cmpl $3, M + jle .L18 + + testl $4, X + je .L05 + movss -32 * SIZE(X), %xmm0 + andps %xmm3, %xmm0 + addl $SIZE, X + decl M + jle .L998 + ALIGN_3 + +.L05: + testl $8, X + je .L10 + + movsd -32 * SIZE(X), %xmm1 + andps %xmm3, %xmm1 + addl $2 * SIZE, X + subl $2, M + jle .L998 + ALIGN_3 + +.L10: + movl M, I + sarl $5, I + jle .L14 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -24 * SIZE(X), %xmm6 + movaps -20 * SIZE(X), %xmm7 + + decl I + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + + andps %xmm3, %xmm5 + addps %xmm5, %xmm1 + movaps -12 * SIZE(X), %xmm5 + + andps %xmm3, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(X), %xmm6 + + andps %xmm3, %xmm7 + addps %xmm7, %xmm1 + movaps -4 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + andps %xmm3, %xmm5 + addps %xmm5, %xmm1 + movaps 4 * SIZE(X), %xmm5 + + andps %xmm3, %xmm6 + addps %xmm6, %xmm0 + movaps 8 * SIZE(X), %xmm6 + + andps %xmm3, %xmm7 + addps %xmm7, %xmm1 + movaps 12 * SIZE(X), %xmm7 + + subl $-32 * SIZE, X + decl I + jg .L11 + ALIGN_3 + +.L12: + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + + andps %xmm3, %xmm5 + addps %xmm5, %xmm1 + movaps -12 * SIZE(X), %xmm5 + + andps %xmm3, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(X), %xmm6 + + andps %xmm3, %xmm7 + addps %xmm7, %xmm1 + movaps -4 * SIZE(X), %xmm7 + + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + + andps %xmm3, %xmm5 + addps %xmm5, %xmm1 + + andps %xmm3, %xmm6 + addps %xmm6, %xmm0 + + andps %xmm3, %xmm7 + addps %xmm7, %xmm1 + + subl $-32 * SIZE, X + ALIGN_3 + +.L14: + testl $16, M + je .L16 + + movaps -32 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + + movaps -28 * SIZE(X), %xmm5 + andps %xmm3, %xmm5 + addps %xmm5, %xmm1 + + movaps -24 * SIZE(X), %xmm6 + andps %xmm3, %xmm6 + addps %xmm6, %xmm0 + + movaps -20 * SIZE(X), %xmm7 + andps %xmm3, %xmm7 + addps %xmm7, %xmm1 + + addl $16 * SIZE, X + ALIGN_3 + +.L16: + testl $8, M + je .L17 + + movaps -32 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + + movaps -28 * SIZE(X), %xmm5 + andps %xmm3, %xmm5 + addps %xmm5, %xmm1 + + addl $8 * SIZE, X + ALIGN_3 + +.L17: + testl $4, M + je .L18 + + movaps -32 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + addl $4 * SIZE, X + ALIGN_3 + +.L18: + testl $2, M + je .L19 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addps %xmm4, %xmm1 + addl $2 * SIZE, X + ALIGN_3 + +.L19: + testl $1, M + je .L998 + + movss -32 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + jmp .L998 + ALIGN_4 + +.L100: + movl M, I + sarl $3, I + jle .L105 + ALIGN_4 + +.L101: + movss (X), %xmm4 + addl INCX, X + andps %xmm3, %xmm4 + addss %xmm4, %xmm0 + + movss (X), %xmm5 + addl INCX, X + andps %xmm3, %xmm5 + addss %xmm5, %xmm1 + + movss (X), %xmm6 + addl INCX, X + andps %xmm3, %xmm6 + addss %xmm6, %xmm0 + + movss (X), %xmm7 + addl INCX, X + andps %xmm3, %xmm7 + addss %xmm7, %xmm1 + + movss (X), %xmm4 + addl INCX, X + andps %xmm3, %xmm4 + addss %xmm4, %xmm0 + + movss (X), %xmm5 + addl INCX, X + andps %xmm3, %xmm5 + addss %xmm5, %xmm1 + + movss (X), %xmm6 + addl INCX, X + andps %xmm3, %xmm6 + addss %xmm6, %xmm0 + + movss (X), %xmm7 + addl INCX, X + andps %xmm3, %xmm7 + addss %xmm7, %xmm1 + + decl I + jg .L101 + ALIGN_4 + +.L105: + andl $7, M + jle .L998 + ALIGN_4 + +.L106: + movss (X), %xmm4 + andps %xmm3, %xmm4 + addss %xmm4, %xmm0 + addl INCX, X + decl M + jg .L106 + ALIGN_4 + +.L998: + addps %xmm1, %xmm0 + +#ifndef HAVE_SSE3 + movhlps %xmm0, %xmm1 + addps %xmm1, %xmm0 + + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + addss %xmm1, %xmm0 +#else + haddps %xmm0, %xmm0 + haddps %xmm0, %xmm0 +#endif + ALIGN_4 + +.L999: + movss %xmm0, STACK_M + flds STACK_M + + popl %ebx + popl %esi + + ret + + EPILOGUE diff --git a/kernel/x86/asum_sse2.S b/kernel/x86/asum_sse2.S new file mode 100644 index 0000000..cea3503 --- /dev/null +++ b/kernel/x86/asum_sse2.S @@ -0,0 +1,318 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define I %eax +#define M %ecx +#define X %esi +#define INCX %ebx + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + pcmpeqb %xmm3, %xmm3 + psrlq $1, %xmm3 + + sall $BASE_SHIFT, INCX + + subl $-16 * SIZE, X + + cmpl $SIZE, INCX + jne .L40 + + testl $SIZE, X + je .L05 + + movsd -16 * SIZE(X), %xmm0 + addl $SIZE, X + + andps %xmm3, %xmm0 + subl $1, M + jle .L999 + ALIGN_3 + +.L05: + movl M, I + sarl $4, I + jle .L20 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + decl I + jle .L11 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + movaps -8 * SIZE(X), %xmm4 + + andps %xmm3, %xmm5 + addpd %xmm5, %xmm1 + movaps -6 * SIZE(X), %xmm5 + + andps %xmm3, %xmm6 + addpd %xmm6, %xmm0 + movaps -4 * SIZE(X), %xmm6 + + andps %xmm3, %xmm7 + addpd %xmm7, %xmm1 + movaps -2 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + andps %xmm3, %xmm5 + addpd %xmm5, %xmm1 + movaps 2 * SIZE(X), %xmm5 + + andps %xmm3, %xmm6 + addpd %xmm6, %xmm0 + movaps 4 * SIZE(X), %xmm6 + + andps %xmm3, %xmm7 + addpd %xmm7, %xmm1 + movaps 6 * SIZE(X), %xmm7 + + subl $-16 * SIZE, X + decl I + jg .L10 + ALIGN_4 + +.L11: + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + movaps -8 * SIZE(X), %xmm4 + + andps %xmm3, %xmm5 + addpd %xmm5, %xmm1 + movaps -6 * SIZE(X), %xmm5 + + andps %xmm3, %xmm6 + addpd %xmm6, %xmm0 + movaps -4 * SIZE(X), %xmm6 + + andps %xmm3, %xmm7 + addpd %xmm7, %xmm1 + movaps -2 * SIZE(X), %xmm7 + + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + andps %xmm3, %xmm5 + addpd %xmm5, %xmm1 + andps %xmm3, %xmm6 + addpd %xmm6, %xmm0 + andps %xmm3, %xmm7 + addpd %xmm7, %xmm1 + + subl $-16 * SIZE, X + ALIGN_3 + +.L20: + andl $15, M + jle .L999 + + testl $8, M + je .L21 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + andps %xmm3, %xmm5 + addpd %xmm5, %xmm1 + andps %xmm3, %xmm6 + addpd %xmm6, %xmm0 + andps %xmm3, %xmm7 + addpd %xmm7, %xmm1 + addl $8 * SIZE, X + ALIGN_3 + +.L21: + testl $4, M + je .L22 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + andps %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + addl $4 * SIZE, X + ALIGN_3 + +.L22: + testl $2, M + je .L23 + + movaps -16 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + addl $2 * SIZE, X + +.L23: + testl $1, M + je .L999 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -16 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addsd %xmm4, %xmm1 + jmp .L999 + ALIGN_3 + +.L40: + movl M, I + sarl $3, I + jle .L60 + ALIGN_4 + +.L50: + movsd -16 * SIZE(X), %xmm4 + addl INCX, X + movhps -16 * SIZE(X), %xmm4 + addl INCX, X + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + + movsd -16 * SIZE(X), %xmm5 + addl INCX, X + movhps -16 * SIZE(X), %xmm5 + addl INCX, X + andps %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movsd -16 * SIZE(X), %xmm6 + addl INCX, X + movhps -16 * SIZE(X), %xmm6 + addl INCX, X + andps %xmm3, %xmm6 + addpd %xmm6, %xmm0 + + movsd -16 * SIZE(X), %xmm7 + addl INCX, X + movhps -16 * SIZE(X), %xmm7 + addl INCX, X + andps %xmm3, %xmm7 + addpd %xmm7, %xmm1 + + decl I + jg .L50 + ALIGN_4 + +.L60: +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + andl $7, M + jle .L999 + ALIGN_4 + +.L61: + movsd -16 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addsd %xmm4, %xmm0 + addl INCX, X + decl M + jg .L61 + ALIGN_4 + +.L999: + addpd %xmm1, %xmm0 + +#ifndef HAVE_SSE3 + movaps %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + addsd %xmm1, %xmm0 +#else + haddpd %xmm0, %xmm0 +#endif + + movsd %xmm0, STACK_M + fldl STACK_M + popl %ebx + popl %esi + ret + + EPILOGUE + diff --git a/kernel/x86/axpy.S b/kernel/x86/axpy.S new file mode 100644 index 0000000..7f3d99e --- /dev/null +++ b/kernel/x86/axpy.S @@ -0,0 +1,247 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_ALPHA 16 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define STACK_X 24 + STACK + ARGS(%esp) +#define STACK_INCX 28 + STACK + ARGS(%esp) +#define STACK_Y 32 + STACK + ARGS(%esp) +#define STACK_INCY 36 + STACK + ARGS(%esp) +#else +#define STACK_X 20 + STACK + ARGS(%esp) +#define STACK_INCX 24 + STACK + ARGS(%esp) +#define STACK_Y 28 + STACK + ARGS(%esp) +#define STACK_INCY 32 + STACK + ARGS(%esp) +#endif + +#define M %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + FLD STACK_ALPHA + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + testl M, M + jle .L40 + + cmpl $SIZE, INCX + jne .L14 + cmpl $SIZE, INCY + jne .L14 + + movl M, %eax + sarl $3, %eax + jle .L15 + ALIGN_3 + +#define PRESIZE 33 + +.L16: +#ifdef HAS_PREFETCH + prefetcht0 PRESIZE * SIZE(X) +#endif + + FLD 0 * SIZE(X) + fmul %st(1),%st + FADD 0 * SIZE(Y) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + fmul %st(1),%st + FADD 1 * SIZE(Y) + FST 1 * SIZE(Y) + + FLD 2 * SIZE(X) + fmul %st(1),%st + FADD 2 * SIZE(Y) + FST 2 * SIZE(Y) + + FLD 3 * SIZE(X) + fmul %st(1),%st + FADD 3 * SIZE(Y) + FST 3 * SIZE(Y) + +#ifdef HAS_PREFETCH + prefetcht0 (4 + PRESIZE) * SIZE(X) +#endif + + FLD 4 * SIZE(X) + fmul %st(1),%st + FADD 4 * SIZE(Y) + FST 4 * SIZE(Y) + + FLD 5 * SIZE(X) + fmul %st(1),%st + FADD 5 * SIZE(Y) + FST 5 * SIZE(Y) + + FLD 6 * SIZE(X) + fmul %st(1),%st + FADD 6 * SIZE(Y) + FST 6 * SIZE(Y) + + FLD 7 * SIZE(X) + fmul %st(1),%st + FADD 7 * SIZE(Y) + FST 7 * SIZE(Y) + +#ifdef HAVE_3DNOW + prefetchw 24 * SIZE(Y) +#endif + + addl $8 * SIZE, X + addl $8 * SIZE, Y + decl %eax + jg .L16 + ALIGN_3 + +.L15: + movl M, %eax + andl $7, %eax + jle .L40 + ALIGN_3 + +.L22: + FLD 0 * SIZE(X) + fmul %st(1),%st + FADD 0 * SIZE(Y) + FST 0 * SIZE(Y) + addl $SIZE, X + addl $SIZE, Y + decl %eax + jg .L22 + jmp .L40 + ALIGN_3 + +.L14: + movl M, %eax + sarl $2, %eax + jle .L28 + ALIGN_3 + +.L29: + FLD (X) + fmul %st(1),%st + FADD (Y) + FST (Y) + addl INCX, X + addl INCY, Y + + FLD (X) + fmul %st(1),%st + FADD (Y) + FST (Y) + addl INCX, X + addl INCY, Y + + FLD (X) + fmul %st(1),%st + FADD (Y) + FST (Y) + addl INCX, X + addl INCY, Y + + FLD (X) + fmul %st(1),%st + FADD (Y) + FST (Y) + addl INCX, X + addl INCY, Y + + decl %eax + jg .L29 + ALIGN_3 + +.L28: + movl M, %eax + andl $3, %eax + jle .L40 + ALIGN_3 + +.L35: + FLD (X) + fmul %st(1),%st + FADD (Y) + FST (Y) + addl INCX, X + addl INCY, Y + + decl %eax + jg .L35 + ALIGN_3 + +.L40: + ffreep %st(0) + xorl %eax,%eax + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/axpy_sse.S b/kernel/x86/axpy_sse.S new file mode 100644 index 0000000..291a219 --- /dev/null +++ b/kernel/x86/axpy_sse.S @@ -0,0 +1,1551 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_ALPHA 16 + STACK + ARGS(%esp) +#define STACK_X 20 + STACK + ARGS(%esp) +#define STACK_INCX 24 + STACK + ARGS(%esp) +#define STACK_Y 28 + STACK + ARGS(%esp) +#define STACK_INCY 32 + STACK + ARGS(%esp) + +#define M %ebx +#define X %esi +#define Y %edi +#define INCX %ecx +#define INCY %edx +#define YY %ebp + +#define ALPHA %xmm7 + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl STACK_M, M + movss STACK_ALPHA, ALPHA + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + shufps $0, ALPHA, ALPHA + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + testl M, M + jle .L19 + + cmpl $SIZE, INCX + jne .L50 + cmpl $SIZE, INCY + jne .L50 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + cmpl $3, M + jle .L16 + + testl $SIZE, Y + je .L00 + + movss -32 * SIZE(X), %xmm0 + mulss ALPHA, %xmm0 + addss -32 * SIZE(Y), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl M + jle .L19 + ALIGN_3 + +.L00: + testl $SIZE * 2, Y + je .L10 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm4 + mulps ALPHA, %xmm0 + addps %xmm4, %xmm0 + movsd %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + subl $2, M + jle .L19 + ALIGN_3 + +.L10: + testl $SIZE * 3, X + jne .L20 + + movl M, %eax + sarl $5, %eax + jle .L13 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + decl %eax + jle .L12 + ALIGN_4 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -16 * SIZE(X), %xmm0 + + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -12 * SIZE(X), %xmm1 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -8 * SIZE(X), %xmm2 + + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -4 * SIZE(X), %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulps ALPHA, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps 0 * SIZE(X), %xmm0 + + mulps ALPHA, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + movaps 4 * SIZE(X), %xmm1 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulps ALPHA, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + movaps 8 * SIZE(X), %xmm2 + + mulps ALPHA, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + movaps 12 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L11 + ALIGN_3 + +.L12: + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -16 * SIZE(X), %xmm0 + + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -12 * SIZE(X), %xmm1 + + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -8 * SIZE(X), %xmm2 + + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -4 * SIZE(X), %xmm3 + + mulps ALPHA, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + mulps ALPHA, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + + mulps ALPHA, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + + mulps ALPHA, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L13: + movl M, %eax + andl $16, %eax + jle .L14 + ALIGN_3 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L14: + movl M, %eax + andl $8, %eax + jle .L15 + ALIGN_3 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L15: + movl M, %eax + andl $4, %eax + jle .L16 + ALIGN_3 + + movaps -32 * SIZE(X), %xmm0 + + mulps ALPHA, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L16: + movl M, %eax + andl $2, %eax + jle .L17 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm4 + + mulps ALPHA, %xmm0 + addps %xmm4, %xmm0 + + movsd %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L17: + movl M, %eax + andl $1, %eax + jle .L19 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + mulss ALPHA, %xmm0 + addss -32 * SIZE(Y), %xmm0 + + movss %xmm0, -32 * SIZE(Y) + ALIGN_3 + +.L19: + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L20: + +#ifdef ALIGNED_ACCESS + + testl $SIZE, X + jne .L30 + + movhps -32 * SIZE(X), %xmm0 + + movl M, %eax + sarl $5, %eax + jle .L23 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + movaps -22 * SIZE(X), %xmm3 + + decl %eax + jle .L22 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -18 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -14 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm3, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -10 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm0, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -6 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -2 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + movaps 2 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm3, %xmm2 + mulps ALPHA, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + movaps 6 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm0, %xmm3 + mulps ALPHA, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + movaps 10 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L21 + ALIGN_3 + +.L22: + SHUFPD_1 %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -18 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -14 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm3, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -10 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm0, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -6 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -2 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + + SHUFPD_1 %xmm3, %xmm2 + mulps ALPHA, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + + SHUFPD_1 %xmm0, %xmm3 + mulps ALPHA, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L23: + movl M, %eax + andl $16, %eax + jle .L24 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + movaps -22 * SIZE(X), %xmm3 + movaps -18 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm1, %xmm0 + SHUFPD_1 %xmm2, %xmm1 + SHUFPD_1 %xmm3, %xmm2 + SHUFPD_1 %xmm4, %xmm3 + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L24: + movl M, %eax + andl $8, %eax + jle .L25 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + SHUFPD_1 %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L25: + movl M, %eax + andl $4, %eax + jle .L26 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm1, %xmm0 + mulps ALPHA, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L26: + movl M, %eax + andl $2, %eax + jle .L27 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm4 + + mulps ALPHA, %xmm0 + addps %xmm4, %xmm0 + + movsd %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L27: + movl M, %eax + andl $1, %eax + jle .L29 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + mulss ALPHA, %xmm0 + addss -32 * SIZE(Y), %xmm0 + + movss %xmm0, -32 * SIZE(Y) + addl $SIZE, Y + ALIGN_3 + +.L29: + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L30: + testl $2 * SIZE, X + jne .L40 + + movaps -33 * SIZE(X), %xmm0 + + movl M, %eax + sarl $5, %eax + jle .L33 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + movaps -21 * SIZE(X), %xmm3 + + decl %eax + jle .L32 + ALIGN_4 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -17 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -13 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -9 * SIZE(X), %xmm2 + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -5 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + mulps ALPHA, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -1 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + mulps ALPHA, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + movaps 3 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + mulps ALPHA, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + movaps 7 * SIZE(X), %xmm2 + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + mulps ALPHA, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + movaps 11 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L31 + ALIGN_3 + +.L32: + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -17 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -13 * SIZE(X), %xmm1 + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -9 * SIZE(X), %xmm2 + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -5 * SIZE(X), %xmm3 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + mulps ALPHA, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -1 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + mulps ALPHA, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + mulps ALPHA, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + mulps ALPHA, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L33: + movl M, %eax + andl $16, %eax + jle .L34 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + movaps -21 * SIZE(X), %xmm3 + movaps -17 * SIZE(X), %xmm4 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + + movss %xmm4, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L34: + movl M, %eax + andl $8, %eax + jle .L35 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L35: + movl M, %eax + andl $4, %eax + jle .L36 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + mulps ALPHA, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L36: + movl M, %eax + andl $2, %eax + jle .L37 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm4 + + mulps ALPHA, %xmm0 + addps %xmm4, %xmm0 + + movsd %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L37: + movl M, %eax + andl $1, %eax + jle .L39 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + mulss ALPHA, %xmm0 + addss -32 * SIZE(Y), %xmm0 + + movss %xmm0, -32 * SIZE(Y) + addl $SIZE, Y + ALIGN_3 + +.L39: + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L40: + movaps -35 * SIZE(X), %xmm0 + + movl M, %eax + sarl $5, %eax + jle .L43 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + movaps -23 * SIZE(X), %xmm3 + + decl %eax + jle .L42 + ALIGN_4 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -19 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -15 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -11 * SIZE(X), %xmm2 + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -7 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -3 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + movaps 1 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + mulps ALPHA, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + movaps 5 * SIZE(X), %xmm2 + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + mulps ALPHA, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + movaps 9 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L41 + ALIGN_3 + +.L42: + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -19 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -15 * SIZE(X), %xmm1 + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -11 * SIZE(X), %xmm2 + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -7 * SIZE(X), %xmm3 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -3 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + mulps ALPHA, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + mulps ALPHA, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L43: + movl M, %eax + andl $16, %eax + jle .L44 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + movaps -23 * SIZE(X), %xmm3 + movaps -19 * SIZE(X), %xmm4 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L44: + movl M, %eax + andl $8, %eax + jle .L45 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L45: + movl M, %eax + andl $4, %eax + jle .L46 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + mulps ALPHA, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L46: + movl M, %eax + andl $2, %eax + jle .L47 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm4 + + mulps ALPHA, %xmm0 + addps %xmm4, %xmm0 + + movsd %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L47: + movl M, %eax + andl $1, %eax + jle .L49 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + mulss ALPHA, %xmm0 + addss -32 * SIZE(Y), %xmm0 + + movss %xmm0, -32 * SIZE(Y) + addl $SIZE, Y + ALIGN_3 + +.L49: + popl %ebp + popl %ebx + popl %esi + popl %edi + ret +#else + + movl M, %eax + sarl $5, %eax + jle .L23 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + movsd -24 * SIZE(X), %xmm2 + movhps -22 * SIZE(X), %xmm2 + movsd -20 * SIZE(X), %xmm3 + movhps -18 * SIZE(X), %xmm3 + + decl %eax + jle .L22 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movsd -16 * SIZE(X), %xmm0 + movhps -14 * SIZE(X), %xmm0 + + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movsd -12 * SIZE(X), %xmm1 + movhps -10 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movsd -8 * SIZE(X), %xmm2 + movhps -6 * SIZE(X), %xmm2 + + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movsd -4 * SIZE(X), %xmm3 + movhps -2 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulps ALPHA, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + movsd 0 * SIZE(X), %xmm0 + movhps 2 * SIZE(X), %xmm0 + + mulps ALPHA, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + + movsd 4 * SIZE(X), %xmm1 + movhps 6 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulps ALPHA, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + + movsd 8 * SIZE(X), %xmm2 + movhps 10 * SIZE(X), %xmm2 + + mulps ALPHA, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + + movsd 12 * SIZE(X), %xmm3 + movhps 14 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L21 + ALIGN_3 + +.L22: + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movsd -16 * SIZE(X), %xmm0 + movhps -14 * SIZE(X), %xmm0 + + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movsd -12 * SIZE(X), %xmm1 + movhps -10 * SIZE(X), %xmm1 + + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movsd -8 * SIZE(X), %xmm2 + movhps -6 * SIZE(X), %xmm2 + + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movsd -4 * SIZE(X), %xmm3 + movhps -2 * SIZE(X), %xmm3 + + mulps ALPHA, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + mulps ALPHA, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + + mulps ALPHA, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + + mulps ALPHA, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L23: + movl M, %eax + andl $16, %eax + jle .L24 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movsd -24 * SIZE(X), %xmm2 + movhps -22 * SIZE(X), %xmm2 + movsd -20 * SIZE(X), %xmm3 + movhps -18 * SIZE(X), %xmm3 + + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L24: + movl M, %eax + andl $8, %eax + jle .L25 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L25: + movl M, %eax + andl $4, %eax + jle .L26 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + + mulps ALPHA, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L26: + movl M, %eax + andl $2, %eax + jle .L27 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm4 + + mulps ALPHA, %xmm0 + addps %xmm4, %xmm0 + + movsd %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L27: + movl M, %eax + andl $1, %eax + jle .L29 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + mulss ALPHA, %xmm0 + addss -32 * SIZE(Y), %xmm0 + + movss %xmm0, -32 * SIZE(Y) + addl $SIZE, Y + ALIGN_3 + +.L29: + popl %ebp + popl %ebx + popl %esi + popl %edi + ret +#endif + ALIGN_3 + + +.L50: + movl M, %eax + movl Y, YY + sarl $3, %eax + jle .L55 + ALIGN_3 + +.L51: + movss (X), %xmm0 + addl INCX, X + mulss ALPHA, %xmm0 + movss (YY), %xmm6 + addl INCY, YY + addss %xmm6, %xmm0 + + movss (X), %xmm1 + addl INCX, X + mulss ALPHA, %xmm1 + movss (YY), %xmm6 + addl INCY, YY + addss %xmm6, %xmm1 + + movss (X), %xmm2 + addl INCX, X + mulss ALPHA, %xmm2 + movss (YY), %xmm6 + addl INCY, YY + addss %xmm6, %xmm2 + + movss (X), %xmm3 + addl INCX, X + mulss ALPHA, %xmm3 + movss (YY), %xmm6 + addl INCY, YY + addss %xmm6, %xmm3 + + movss %xmm0, (Y) + addl INCY, Y + movss %xmm1, (Y) + addl INCY, Y + movss %xmm2, (Y) + addl INCY, Y + movss %xmm3, (Y) + addl INCY, Y + + movss (X), %xmm0 + addl INCX, X + mulss ALPHA, %xmm0 + movss (YY), %xmm6 + addl INCY, YY + addss %xmm6, %xmm0 + + movss (X), %xmm1 + addl INCX, X + mulss ALPHA, %xmm1 + movss (YY), %xmm6 + addl INCY, YY + addss %xmm6, %xmm1 + + movss (X), %xmm2 + addl INCX, X + mulss ALPHA, %xmm2 + movss (YY), %xmm6 + addl INCY, YY + addss %xmm6, %xmm2 + + movss (X), %xmm3 + addl INCX, X + mulss ALPHA, %xmm3 + movss (YY), %xmm6 + addl INCY, YY + addss %xmm6, %xmm3 + + movss %xmm0, (Y) + addl INCY, Y + movss %xmm1, (Y) + addl INCY, Y + movss %xmm2, (Y) + addl INCY, Y + movss %xmm3, (Y) + addl INCY, Y + + decl %eax + jg .L51 + ALIGN_3 + +.L55: + movl M, %eax + andl $7, %eax + jle .L59 + ALIGN_3 + +.L56: + movss (X), %xmm0 + addl INCX, X + mulss ALPHA, %xmm0 + movss (Y), %xmm6 + addss %xmm6, %xmm0 + movss %xmm0, (Y) + addl INCY, Y + decl %eax + jg .L56 + ALIGN_3 + +.L59: + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/axpy_sse2.S b/kernel/x86/axpy_sse2.S new file mode 100644 index 0000000..5e31d3d --- /dev/null +++ b/kernel/x86/axpy_sse2.S @@ -0,0 +1,799 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_ALPHA 16 + STACK + ARGS(%esp) +#define STACK_X 24 + STACK + ARGS(%esp) +#define STACK_INCX 28 + STACK + ARGS(%esp) +#define STACK_Y 32 + STACK + ARGS(%esp) +#define STACK_INCY 36 + STACK + ARGS(%esp) + +#define M %ebx +#define X %esi +#define Y %edi +#define INCX %ecx +#define INCY %edx +#define YY %ebp + +#define ALPHA %xmm7 + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl STACK_M, M + movsd STACK_ALPHA, ALPHA + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + unpcklpd ALPHA, ALPHA + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + testl M, M + jle .L47 + + cmpl $SIZE, INCX + jne .L40 + cmpl $SIZE, INCY + jne .L40 + + testl $SIZE, Y + je .L10 + + movsd (X), %xmm0 + mulsd ALPHA, %xmm0 + addsd (Y), %xmm0 + movsd %xmm0, (Y) + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl M + jle .L19 + ALIGN_4 + +.L10: + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + testl $SIZE, X + jne .L20 + + movl M, %eax + sarl $4, %eax + jle .L13 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + decl %eax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -8 * SIZE(X), %xmm0 + + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movaps -6 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movaps -4 * SIZE(X), %xmm2 + + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movaps -2 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulpd ALPHA, %xmm0 + addpd -8 * SIZE(Y), %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movaps 0 * SIZE(X), %xmm0 + + mulpd ALPHA, %xmm1 + addpd -6 * SIZE(Y), %xmm1 + movaps %xmm1, -6 * SIZE(Y) + movaps 2 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulpd ALPHA, %xmm2 + addpd -4 * SIZE(Y), %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movaps 4 * SIZE(X), %xmm2 + + mulpd ALPHA, %xmm3 + addpd -2 * SIZE(Y), %xmm3 + movaps %xmm3, -2 * SIZE(Y) + movaps 6 * SIZE(X), %xmm3 + + subl $-16 * SIZE, Y + subl $-16 * SIZE, X + decl %eax + jg .L11 + ALIGN_3 + +.L12: + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -8 * SIZE(X), %xmm0 + + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movaps -6 * SIZE(X), %xmm1 + + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movaps -4 * SIZE(X), %xmm2 + + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movaps -2 * SIZE(X), %xmm3 + + mulpd ALPHA, %xmm0 + addpd -8 * SIZE(Y), %xmm0 + movaps %xmm0, -8 * SIZE(Y) + + mulpd ALPHA, %xmm1 + addpd -6 * SIZE(Y), %xmm1 + movaps %xmm1, -6 * SIZE(Y) + + mulpd ALPHA, %xmm2 + addpd -4 * SIZE(Y), %xmm2 + movaps %xmm2, -4 * SIZE(Y) + + mulpd ALPHA, %xmm3 + addpd -2 * SIZE(Y), %xmm3 + movaps %xmm3, -2 * SIZE(Y) + + subl $-16 * SIZE, Y + subl $-16 * SIZE, X + ALIGN_3 + +.L13: + movl M, %eax + andl $8, %eax + jle .L14 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, -12 * SIZE(Y) + movaps %xmm3, -10 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L14: + movl M, %eax + andl $4, %eax + jle .L15 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + + mulpd ALPHA, %xmm0 + mulpd ALPHA, %xmm1 + + addpd -16 * SIZE(Y), %xmm0 + addpd -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L15: + movl M, %eax + andl $2, %eax + jle .L16 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L16: + movl M, %eax + andl $1, %eax + jle .L19 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + mulsd ALPHA, %xmm0 + addsd -16 * SIZE(Y), %xmm0 + + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L19: + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L20: +#ifdef ALIGNED_ACCESS + + movhps -16 * SIZE(X), %xmm0 + + movl M, %eax + sarl $4, %eax + jle .L23 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + movaps -11 * SIZE(X), %xmm3 + + decl %eax + jle .L22 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm1, %xmm0 + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -9 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm2, %xmm1 + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movaps -7 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm3, %xmm2 + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movaps -5 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm0, %xmm3 + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movaps -3 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm1, %xmm0 + mulpd ALPHA, %xmm0 + addpd -8 * SIZE(Y), %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movaps -1 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm2, %xmm1 + mulpd ALPHA, %xmm1 + addpd -6 * SIZE(Y), %xmm1 + movaps %xmm1, -6 * SIZE(Y) + movaps 1 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm3, %xmm2 + mulpd ALPHA, %xmm2 + addpd -4 * SIZE(Y), %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movaps 3 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm0, %xmm3 + mulpd ALPHA, %xmm3 + addpd -2 * SIZE(Y), %xmm3 + movaps %xmm3, -2 * SIZE(Y) + movaps 5 * SIZE(X), %xmm3 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + decl %eax + jg .L21 + ALIGN_3 + +.L22: + SHUFPD_1 %xmm1, %xmm0 + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -9 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm2, %xmm1 + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movaps -7 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm3, %xmm2 + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movaps -5 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm0, %xmm3 + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movaps -3 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm1, %xmm0 + mulpd ALPHA, %xmm0 + addpd -8 * SIZE(Y), %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movaps -1 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm2, %xmm1 + mulpd ALPHA, %xmm1 + addpd -6 * SIZE(Y), %xmm1 + movaps %xmm1, -6 * SIZE(Y) + + SHUFPD_1 %xmm3, %xmm2 + mulpd ALPHA, %xmm2 + addpd -4 * SIZE(Y), %xmm2 + movaps %xmm2, -4 * SIZE(Y) + + SHUFPD_1 %xmm0, %xmm3 + mulpd ALPHA, %xmm3 + addpd -2 * SIZE(Y), %xmm3 + movaps %xmm3, -2 * SIZE(Y) + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + ALIGN_3 + +.L23: + movl M, %eax + andl $8, %eax + jle .L24 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + movaps -11 * SIZE(X), %xmm3 + movaps -9 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm1, %xmm0 + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + SHUFPD_1 %xmm2, %xmm1 + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + SHUFPD_1 %xmm3, %xmm2 + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + SHUFPD_1 %xmm4, %xmm3 + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L24: + movl M, %eax + andl $4, %eax + jle .L25 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm1, %xmm0 + SHUFPD_1 %xmm2, %xmm1 + + mulpd ALPHA, %xmm0 + mulpd ALPHA, %xmm1 + + addpd -16 * SIZE(Y), %xmm0 + addpd -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, %xmm0 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L25: + movl M, %eax + andl $2, %eax + jle .L26 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + SHUFPD_1 %xmm1, %xmm0 + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + + movaps %xmm0, -16 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L26: + movl M, %eax + andl $1, %eax + jle .L29 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + mulsd ALPHA, %xmm0 + addsd -16 * SIZE(Y), %xmm0 + + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L29: + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +#else + movl M, %eax + sarl $3, %eax + jle .L23 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + movsd -12 * SIZE(X), %xmm2 + movhps -11 * SIZE(X), %xmm2 + movsd -10 * SIZE(X), %xmm3 + movhps -9 * SIZE(X), %xmm3 + + decl %eax + jle .L22 + ALIGN_3 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + movsd -8 * SIZE(X), %xmm0 + movhps -7 * SIZE(X), %xmm0 + + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movsd -6 * SIZE(X), %xmm1 + movhps -5 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + movsd -4 * SIZE(X), %xmm2 + movhps -3 * SIZE(X), %xmm2 + + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + movsd -2 * SIZE(X), %xmm3 + movhps -1 * SIZE(X), %xmm3 + + subl $-8 * SIZE, Y + subl $-8 * SIZE, X + decl %eax + jg .L21 + ALIGN_3 + +.L22: + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + subl $-8 * SIZE, Y + subl $-8 * SIZE, X + ALIGN_3 + +.L23: + movl M, %eax + andl $4, %eax + jle .L25 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + + mulpd ALPHA, %xmm0 + mulpd ALPHA, %xmm1 + + addpd -16 * SIZE(Y), %xmm0 + addpd -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L25: + movl M, %eax + andl $2, %eax + jle .L26 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L26: + movl M, %eax + andl $1, %eax + jle .L29 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + mulsd ALPHA, %xmm0 + addsd -16 * SIZE(Y), %xmm0 + + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L29: + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 +#endif + +.L40: + movl Y, YY + movl M, %eax + sarl $3, %eax + jle .L45 + ALIGN_3 + +.L41: + movsd 0 * SIZE(X), %xmm0 + addl INCX, X + movhpd 0 * SIZE(X), %xmm0 + addl INCX, X + mulpd ALPHA, %xmm0 + + movsd 0 * SIZE(YY), %xmm6 + addl INCY, YY + movhpd 0 * SIZE(YY), %xmm6 + addl INCY, YY + addpd %xmm6, %xmm0 + + movsd 0 * SIZE(X), %xmm1 + addl INCX, X + movhpd 0 * SIZE(X), %xmm1 + addl INCX, X + mulpd ALPHA, %xmm1 + + movsd 0 * SIZE(YY), %xmm6 + addl INCY, YY + movhpd 0 * SIZE(YY), %xmm6 + addl INCY, YY + addpd %xmm6, %xmm1 + + movsd 0 * SIZE(X), %xmm2 + addl INCX, X + movhpd 0 * SIZE(X), %xmm2 + addl INCX, X + mulpd ALPHA, %xmm2 + + movsd 0 * SIZE(YY), %xmm6 + addl INCY, YY + movhpd 0 * SIZE(YY), %xmm6 + addl INCY, YY + addpd %xmm6, %xmm2 + + movsd 0 * SIZE(X), %xmm3 + addl INCX, X + movhpd 0 * SIZE(X), %xmm3 + addl INCX, X + mulpd ALPHA, %xmm3 + + movsd 0 * SIZE(YY), %xmm6 + addl INCY, YY + movhpd 0 * SIZE(YY), %xmm6 + addl INCY, YY + addpd %xmm6, %xmm3 + + movsd %xmm0, 0 * SIZE(Y) + addl INCY, Y + movhpd %xmm0, 0 * SIZE(Y) + addl INCY, Y + movsd %xmm1, 0 * SIZE(Y) + addl INCY, Y + movhpd %xmm1, 0 * SIZE(Y) + addl INCY, Y + movsd %xmm2, 0 * SIZE(Y) + addl INCY, Y + movhpd %xmm2, 0 * SIZE(Y) + addl INCY, Y + movsd %xmm3, 0 * SIZE(Y) + addl INCY, Y + movhpd %xmm3, 0 * SIZE(Y) + addl INCY, Y + + decl %eax + jg .L41 + ALIGN_3 + +.L45: + movl M, %eax + andl $7, %eax + jle .L47 + ALIGN_3 + +.L46: + movsd (X), %xmm0 + addl INCX, X + mulsd ALPHA, %xmm0 + addsd (Y), %xmm0 + movsd %xmm0, (Y) + addl INCY, Y + decl %eax + jg .L46 + ALIGN_3 + +.L47: + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/axpy_sse2_opteron.S b/kernel/x86/axpy_sse2_opteron.S new file mode 100644 index 0000000..fb22415 --- /dev/null +++ b/kernel/x86/axpy_sse2_opteron.S @@ -0,0 +1,496 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define STACK_X 24 + STACK + ARGS(%esp) +#define STACK_INCX 28 + STACK + ARGS(%esp) +#define STACK_Y 32 + STACK + ARGS(%esp) +#define STACK_INCY 36 + STACK + ARGS(%esp) + +#define M %ebx +#define X %esi +#define Y %edi +#define INCX %ecx +#define INCY %edx + +#define PREFETCHSIZE 64 + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + PROFCODE + + movlpd ALPHA, %xmm7 + unpcklpd %xmm7, %xmm7 + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + testl M, M + jle .L999 + + cmpl $SIZE, INCX + jne .L100 + cmpl $SIZE, INCY + jne .L100 + + testl $SIZE, Y + je .L00 + + movlpd 0 * SIZE(X), %xmm0 + mulsd %xmm7, %xmm0 + addsd 0 * SIZE(Y), %xmm0 + movlpd %xmm0, 0 * SIZE(Y) + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl M + jle .L999 + ALIGN_3 + +.L00: + testl $SIZE, X + jne .L20 + + movl M, %eax + sarl $4, %eax + jle .L15 + ALIGN_3 + +.L11: + prefetch (PREFETCHSIZE + 0) * SIZE(X) + + movapd 0 * SIZE(X), %xmm0 + mulpd %xmm7, %xmm0 + addpd 0 * SIZE(Y), %xmm0 + movapd %xmm0, 0 * SIZE(Y) + + movapd 2 * SIZE(X), %xmm1 + mulpd %xmm7, %xmm1 + addpd 2 * SIZE(Y), %xmm1 + movapd %xmm1, 2 * SIZE(Y) + + prefetchw (PREFETCHSIZE + 0) * SIZE(Y) + + movapd 4 * SIZE(X), %xmm2 + mulpd %xmm7, %xmm2 + addpd 4 * SIZE(Y), %xmm2 + movapd %xmm2, 4 * SIZE(Y) + + movapd 6 * SIZE(X), %xmm3 + mulpd %xmm7, %xmm3 + addpd 6 * SIZE(Y), %xmm3 + movapd %xmm3, 6 * SIZE(Y) + + prefetch (PREFETCHSIZE + 8) * SIZE(X) + + movapd 8 * SIZE(X), %xmm0 + mulpd %xmm7, %xmm0 + addpd 8 * SIZE(Y), %xmm0 + movapd %xmm0, 8 * SIZE(Y) + + movapd 10 * SIZE(X), %xmm1 + mulpd %xmm7, %xmm1 + addpd 10 * SIZE(Y), %xmm1 + movapd %xmm1, 10 * SIZE(Y) + + prefetchw (PREFETCHSIZE + 8) * SIZE(Y) + + movapd 12 * SIZE(X), %xmm2 + mulpd %xmm7, %xmm2 + addpd 12 * SIZE(Y), %xmm2 + movapd %xmm2, 12 * SIZE(Y) + + movapd 14 * SIZE(X), %xmm3 + mulpd %xmm7, %xmm3 + addpd 14 * SIZE(Y), %xmm3 + movapd %xmm3, 14 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + decl %eax + jg .L11 + ALIGN_3 + +.L15: + movl M, %eax + testl $8, %eax + jle .L16 + + movapd 0 * SIZE(X), %xmm0 + mulpd %xmm7, %xmm0 + addpd 0 * SIZE(Y), %xmm0 + movapd %xmm0, 0 * SIZE(Y) + + movapd 2 * SIZE(X), %xmm1 + mulpd %xmm7, %xmm1 + addpd 2 * SIZE(Y), %xmm1 + movapd %xmm1, 2 * SIZE(Y) + + movapd 4 * SIZE(X), %xmm2 + mulpd %xmm7, %xmm2 + addpd 4 * SIZE(Y), %xmm2 + movapd %xmm2, 4 * SIZE(Y) + + movapd 6 * SIZE(X), %xmm3 + mulpd %xmm7, %xmm3 + addpd 6 * SIZE(Y), %xmm3 + movapd %xmm3, 6 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L16: + testl $4, %eax + jle .L17 + + movapd 0 * SIZE(X), %xmm0 + mulpd %xmm7, %xmm0 + addpd 0 * SIZE(Y), %xmm0 + movapd %xmm0, 0 * SIZE(Y) + + movapd 2 * SIZE(X), %xmm1 + mulpd %xmm7, %xmm1 + addpd 2 * SIZE(Y), %xmm1 + movapd %xmm1, 2 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L17: + testl $2, %eax + jle .L18 + + movapd 0 * SIZE(X), %xmm0 + mulpd %xmm7, %xmm0 + addpd 0 * SIZE(Y), %xmm0 + movapd %xmm0, 0 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L18: + testl $1, %eax + jle .L99 + + movlpd 0 * SIZE(X), %xmm0 + mulsd %xmm7, %xmm0 + addsd 0 * SIZE(Y), %xmm0 + movlpd %xmm0, 0 * SIZE(Y) + jmp .L99 + ALIGN_3 + +.L20: + movl M, %eax + sarl $4, %eax + jle .L25 + ALIGN_4 + +.L21: +#ifdef OPTERON + prefetcht0 (PREFETCHSIZE + 0) * SIZE(X) + prefetchw (PREFETCHSIZE + 0) * SIZE(Y) +#endif + + movlpd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + mulpd %xmm7, %xmm0 + addpd 0 * SIZE(Y), %xmm0 + movapd %xmm0, 0 * SIZE(Y) + + movlpd 2 * SIZE(X), %xmm1 + movhpd 3 * SIZE(X), %xmm1 + mulpd %xmm7, %xmm1 + addpd 2 * SIZE(Y), %xmm1 + movapd %xmm1, 2 * SIZE(Y) + + movlpd 4 * SIZE(X), %xmm2 + movhpd 5 * SIZE(X), %xmm2 + mulpd %xmm7, %xmm2 + addpd 4 * SIZE(Y), %xmm2 + movapd %xmm2, 4 * SIZE(Y) + + movlpd 6 * SIZE(X), %xmm3 + movhpd 7 * SIZE(X), %xmm3 + mulpd %xmm7, %xmm3 + addpd 6 * SIZE(Y), %xmm3 + movapd %xmm3, 6 * SIZE(Y) + +#ifdef OPTERON + prefetcht0 (PREFETCHSIZE + 8) * SIZE(X) + prefetchw (PREFETCHSIZE + 8) * SIZE(Y) +#endif + + movlpd 8 * SIZE(X), %xmm0 + movhpd 9 * SIZE(X), %xmm0 + mulpd %xmm7, %xmm0 + addpd 8 * SIZE(Y), %xmm0 + movapd %xmm0, 8 * SIZE(Y) + + movlpd 10 * SIZE(X), %xmm1 + movhpd 11 * SIZE(X), %xmm1 + mulpd %xmm7, %xmm1 + addpd 10 * SIZE(Y), %xmm1 + movapd %xmm1, 10 * SIZE(Y) + + movlpd 12 * SIZE(X), %xmm2 + movhpd 13 * SIZE(X), %xmm2 + mulpd %xmm7, %xmm2 + addpd 12 * SIZE(Y), %xmm2 + movapd %xmm2, 12 * SIZE(Y) + + movlpd 14 * SIZE(X), %xmm3 + movhpd 15 * SIZE(X), %xmm3 + mulpd %xmm7, %xmm3 + addpd 14 * SIZE(Y), %xmm3 + movapd %xmm3, 14 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + decl %eax + jg .L21 + ALIGN_3 + +.L25: + movl M, %eax + testl $8, %eax + jle .L26 + + movlpd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + mulpd %xmm7, %xmm0 + addpd 0 * SIZE(Y), %xmm0 + movapd %xmm0, 0 * SIZE(Y) + + movlpd 2 * SIZE(X), %xmm1 + movhpd 3 * SIZE(X), %xmm1 + mulpd %xmm7, %xmm1 + addpd 2 * SIZE(Y), %xmm1 + movapd %xmm1, 2 * SIZE(Y) + + movlpd 4 * SIZE(X), %xmm2 + movhpd 5 * SIZE(X), %xmm2 + mulpd %xmm7, %xmm2 + addpd 4 * SIZE(Y), %xmm2 + movapd %xmm2, 4 * SIZE(Y) + + movlpd 6 * SIZE(X), %xmm3 + movhpd 7 * SIZE(X), %xmm3 + mulpd %xmm7, %xmm3 + addpd 6 * SIZE(Y), %xmm3 + movapd %xmm3, 6 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L26: + testl $4, %eax + jle .L27 + + movlpd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + mulpd %xmm7, %xmm0 + addpd 0 * SIZE(Y), %xmm0 + movapd %xmm0, 0 * SIZE(Y) + + movlpd 2 * SIZE(X), %xmm1 + movhpd 3 * SIZE(X), %xmm1 + mulpd %xmm7, %xmm1 + addpd 2 * SIZE(Y), %xmm1 + movapd %xmm1, 2 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L27: + testl $2, %eax + jle .L28 + + movlpd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + mulpd %xmm7, %xmm0 + addpd 0 * SIZE(Y), %xmm0 + movapd %xmm0, 0 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L28: + testl $1, %eax + jle .L99 + + movlpd 0 * SIZE(X), %xmm0 + mulsd %xmm7, %xmm0 + addsd 0 * SIZE(Y), %xmm0 + movlpd %xmm0, 0 * SIZE(Y) + ALIGN_3 + +.L99: + xorl %eax,%eax + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L100: + movl M, %eax + movl Y, %ebp + sarl $3, %eax + jle .L114 + ALIGN_3 + +.L110: + movlpd 0 * SIZE(X), %xmm0 + addl INCX, X + movhpd 0 * SIZE(X), %xmm0 + addl INCX, X + mulpd %xmm7, %xmm0 + + movlpd 0 * SIZE(%ebp), %xmm6 + addl INCY, %ebp + movhpd 0 * SIZE(%ebp), %xmm6 + addl INCY, %ebp + addpd %xmm6, %xmm0 + + movlpd 0 * SIZE(X), %xmm1 + addl INCX, X + movhpd 0 * SIZE(X), %xmm1 + addl INCX, X + mulpd %xmm7, %xmm1 + + movlpd 0 * SIZE(%ebp), %xmm6 + addl INCY, %ebp + movhpd 0 * SIZE(%ebp), %xmm6 + addl INCY, %ebp + addpd %xmm6, %xmm1 + + movlpd 0 * SIZE(X), %xmm2 + addl INCX, X + movhpd 0 * SIZE(X), %xmm2 + addl INCX, X + mulpd %xmm7, %xmm2 + + movlpd 0 * SIZE(%ebp), %xmm6 + addl INCY, %ebp + movhpd 0 * SIZE(%ebp), %xmm6 + addl INCY, %ebp + addpd %xmm6, %xmm2 + + movlpd 0 * SIZE(X), %xmm3 + addl INCX, X + movhpd 0 * SIZE(X), %xmm3 + addl INCX, X + mulpd %xmm7, %xmm3 + + movlpd 0 * SIZE(%ebp), %xmm6 + addl INCY, %ebp + movhpd 0 * SIZE(%ebp), %xmm6 + addl INCY, %ebp + addpd %xmm6, %xmm3 + + movlpd %xmm0, 0 * SIZE(Y) + addl INCY, Y + movhpd %xmm0, 0 * SIZE(Y) + addl INCY, Y + movlpd %xmm1, 0 * SIZE(Y) + addl INCY, Y + movhpd %xmm1, 0 * SIZE(Y) + addl INCY, Y + movlpd %xmm2, 0 * SIZE(Y) + addl INCY, Y + movhpd %xmm2, 0 * SIZE(Y) + addl INCY, Y + movlpd %xmm3, 0 * SIZE(Y) + addl INCY, Y + movhpd %xmm3, 0 * SIZE(Y) + addl INCY, Y + + decl %eax + jg .L110 + ALIGN_3 + +.L114: + movl M, %eax + andl $7, %eax + jle .L999 + ALIGN_3 + +.L115: + movlpd (X), %xmm0 + addl INCX, X + mulsd %xmm7, %xmm0 + addsd (Y), %xmm0 + movlpd %xmm0, (Y) + addl INCY, Y + decl %eax + jg .L115 + ALIGN_3 + +.L999: + xorl %eax,%eax + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/cabs.S b/kernel/x86/cabs.S new file mode 100644 index 0000000..ba80420 --- /dev/null +++ b/kernel/x86/cabs.S @@ -0,0 +1,57 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl 4(%esp), %eax + FLD 0 * SIZE(%eax) + fabs + FLD 1 * SIZE(%eax) + fabs + faddp %st, %st(1) + ret + + EPILOGUE diff --git a/kernel/x86/copy.S b/kernel/x86/copy.S new file mode 100644 index 0000000..721d5c5 --- /dev/null +++ b/kernel/x86/copy.S @@ -0,0 +1,213 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define M 4 + STACK + ARGS(%esp) +#define X 8 + STACK + ARGS(%esp) +#define INCX 12 + STACK + ARGS(%esp) +#define Y 16 + STACK + ARGS(%esp) +#define INCY 20 + STACK + ARGS(%esp) + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl M, %ebx + movl X, %ecx + movl INCX, %esi + movl Y, %edx + movl INCY, %edi + + testl %ebx, %ebx # if m == 0 goto End + jle .L999 + +#if SIZE > 8 + sall $BASE_SHIFT, %esi + sall $BASE_SHIFT, %edi +#else + leal (, %esi, SIZE), %esi + leal (, %edi, SIZE), %edi +#endif + + cmpl $SIZE, %esi # if incx != 1 + jne .L100 + cmpl $SIZE, %edi # if incy != 1 + jne .L100 + + movl %ebx, %eax # i = m + sarl $3, %eax + jle .L20 + ALIGN_2 + +.L11: + FLD 7 * SIZE(%ecx) + FLD 6 * SIZE(%ecx) + FLD 5 * SIZE(%ecx) + FLD 4 * SIZE(%ecx) + FLD 3 * SIZE(%ecx) + FLD 2 * SIZE(%ecx) + FLD 1 * SIZE(%ecx) + FLD 0 * SIZE(%ecx) + + FST 0 * SIZE(%edx) + FST 1 * SIZE(%edx) + FST 2 * SIZE(%edx) + FST 3 * SIZE(%edx) + FST 4 * SIZE(%edx) + FST 5 * SIZE(%edx) + FST 6 * SIZE(%edx) + FST 7 * SIZE(%edx) + + addl $8 * SIZE, %ecx + addl $8 * SIZE, %edx + decl %eax + jg .L11 + ALIGN_2 + +.L20: + movl %ebx, %eax # i = m + andl $7, %eax + jle .L99 + ALIGN_2 + +.L21: + FLD (%ecx) + FST (%edx) + addl $SIZE, %ecx + addl $SIZE, %edx + decl %eax + jg .L21 + +.L99: + xorl %eax,%eax + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L100: + movl %ebx, %eax + sarl $3, %eax + jle .L120 + ALIGN_2 + +.L111: + FLD (%ecx) + addl %esi, %ecx + FLD (%ecx) + addl %esi, %ecx + FLD (%ecx) + addl %esi, %ecx + FLD (%ecx) + addl %esi, %ecx + FLD (%ecx) + addl %esi, %ecx + FLD (%ecx) + addl %esi, %ecx + FLD (%ecx) + addl %esi, %ecx + FLD (%ecx) + addl %esi, %ecx + + fxch %st(7) + FST (%edx) + addl %edi, %edx + + fxch %st(5) + FST (%edx) + addl %edi, %edx + + fxch %st(3) + FST (%edx) + addl %edi, %edx + + fxch %st(1) + FST (%edx) + addl %edi, %edx + + FST (%edx) + addl %edi, %edx + + FST (%edx) + addl %edi, %edx + + FST (%edx) + addl %edi, %edx + + FST (%edx) + addl %edi, %edx + + decl %eax + jg .L111 + +.L120: + movl %ebx, %eax + andl $7, %eax + jle .L999 + ALIGN_2 + +.L121: + FLD (%ecx) + FST (%edx) + addl %esi, %ecx + addl %edi, %edx + decl %eax + jg .L121 + +.L999: + xorl %eax,%eax + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/copy_sse.S b/kernel/x86/copy_sse.S new file mode 100644 index 0000000..34902dc --- /dev/null +++ b/kernel/x86/copy_sse.S @@ -0,0 +1,962 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) + +#define M %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#include "l1param.h" + +#ifdef OPTERON +#define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addpd OFFSET(ADDR), REG +#else +#define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG +#endif + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + cmpl $SIZE, INCX + jne .L50 + cmpl $SIZE, INCY + jne .L50 + + cmpl $3, M + jle .L55 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + testl $SIZE, Y + je .L05 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl M + ALIGN_4 + +.L05: + testl $2 * SIZE, Y + je .L10 + + movsd -32 * SIZE(X), %xmm0 + movlps %xmm0, -32 * SIZE(Y) + addl $2 * SIZE, X + addl $2 * SIZE, Y + subl $2, M + jle .L19 + ALIGN_4 + +.L10: + testl $3 * SIZE, X + jne .L20 + + movl M, %eax + sarl $5, %eax + jle .L13 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + movaps -16 * SIZE(X), %xmm4 + movaps -12 * SIZE(X), %xmm5 + movaps -8 * SIZE(X), %xmm6 + movaps -4 * SIZE(X), %xmm7 + + decl %eax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps %xmm0, -32 * SIZE(Y) + LOAD( 0 * SIZE, X, %xmm0) + movaps %xmm1, -28 * SIZE(Y) + LOAD( 4 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm2, -24 * SIZE(Y) + LOAD( 8 * SIZE, X, %xmm2) + movaps %xmm3, -20 * SIZE(Y) + LOAD(12 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps %xmm4,-16 * SIZE(Y) + LOAD(16 * SIZE, X, %xmm4) + movaps %xmm5,-12 * SIZE(Y) + LOAD(20 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm6, -8 * SIZE(Y) + LOAD(24 * SIZE, X, %xmm6) + movaps %xmm7, -4 * SIZE(Y) + LOAD(28 * SIZE, X, %xmm7) + + subl $-32 * SIZE, Y + subl $-32 * SIZE, X + decl %eax + jg .L11 + ALIGN_3 + +.L12: + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + movaps %xmm4, -16 * SIZE(Y) + movaps %xmm5, -12 * SIZE(Y) + movaps %xmm6, -8 * SIZE(Y) + movaps %xmm7, -4 * SIZE(Y) + + subl $-32 * SIZE, Y + subl $-32 * SIZE, X + ALIGN_3 + +.L13: + testl $16, M + jle .L14 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L14: + testl $8, M + jle .L15 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L15: + testl $4, M + jle .L16 + + movaps -32 * SIZE(X), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L16: + testl $2, M + jle .L17 + + movsd -32 * SIZE(X), %xmm0 + movlps %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L17: + testl $1, M + jle .L19 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + ALIGN_3 + +.L19: + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L20: + testl $SIZE, X + jne .L30 + + movhps -32 * SIZE(X), %xmm0 + + movl M, %eax + sarl $5, %eax + jle .L23 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + movaps -22 * SIZE(X), %xmm3 + movaps -18 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -10 * SIZE(X), %xmm6 + movaps -6 * SIZE(X), %xmm7 + + decl %eax + jle .L22 + ALIGN_4 + +.L21: + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -2 * SIZE(X), %xmm0 + + shufps $0x4e, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps 2 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + shufps $0x4e, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps 6 * SIZE(X), %xmm2 + + shufps $0x4e, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps 10 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + shufps $0x4e, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + movaps 14 * SIZE(X), %xmm4 + + shufps $0x4e, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + movaps 18 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + shufps $0x4e, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + movaps 22 * SIZE(X), %xmm6 + + shufps $0x4e, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + movaps 26 * SIZE(X), %xmm7 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L21 + ALIGN_3 + +.L22: + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -2 * SIZE(X), %xmm0 + + shufps $0x4e, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + shufps $0x4e, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + shufps $0x4e, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + shufps $0x4e, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + shufps $0x4e, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + shufps $0x4e, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + shufps $0x4e, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L23: + testl $16, M + jle .L24 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + movaps -22 * SIZE(X), %xmm3 + movaps -18 * SIZE(X), %xmm4 + + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + shufps $0x4e, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + shufps $0x4e, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + shufps $0x4e, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L24: + testl $8, M + jle .L25 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + + shufps $0x4e, %xmm1, %xmm0 + shufps $0x4e, %xmm2, %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L25: + testl $4, M + jle .L26 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L26: + testl $2, M + jle .L27 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + + movsd %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L27: + testl $1, M + jle .L29 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addl $SIZE, Y + ALIGN_3 + +.L29: + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L30: + testl $2 * SIZE, X + jne .L40 + + movaps -33 * SIZE(X), %xmm0 + + movl M, %eax + sarl $5, %eax + jle .L33 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + movaps -21 * SIZE(X), %xmm3 + movaps -17 * SIZE(X), %xmm4 + movaps -13 * SIZE(X), %xmm5 + movaps -9 * SIZE(X), %xmm6 + movaps -5 * SIZE(X), %xmm7 + + decl %eax + jle .L32 + ALIGN_4 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -1 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps 3 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps 7 * SIZE(X), %xmm2 + + movss %xmm4, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps 11 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + movaps 15 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + movaps 19 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + shufps $0x39, %xmm6, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + movaps 23 * SIZE(X), %xmm6 + + movss %xmm0, %xmm7 + shufps $0x39, %xmm7, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + movaps 27 * SIZE(X), %xmm7 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L31 + ALIGN_3 + +.L32: + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -1 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movss %xmm7, %xmm6 + shufps $0x39, %xmm6, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + shufps $0x39, %xmm7, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L33: + testl $16, M + jle .L34 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + movaps -21 * SIZE(X), %xmm3 + movaps -17 * SIZE(X), %xmm4 + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L34: + testl $8, M + jle .L35 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L35: + testl $4, M + jle .L36 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L36: + testl $2, M + jle .L37 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L37: + testl $1, M + jle .L39 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addl $SIZE, Y + ALIGN_3 + +.L39: + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L40: + movaps -35 * SIZE(X), %xmm0 + + movl M, %eax + sarl $5, %eax + jle .L43 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + movaps -23 * SIZE(X), %xmm3 + movaps -19 * SIZE(X), %xmm4 + movaps -15 * SIZE(X), %xmm5 + movaps -11 * SIZE(X), %xmm6 + movaps -7 * SIZE(X), %xmm7 + + decl %eax + jle .L42 + ALIGN_4 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -3 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps 1 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps 5 * SIZE(X), %xmm2 + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps 9 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + movaps 13 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + movaps 17 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + movaps 21 * SIZE(X), %xmm6 + + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + movaps 25 * SIZE(X), %xmm7 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L41 + ALIGN_3 + +.L42: + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -3 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L43: + testl $16, M + jle .L44 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + movaps -23 * SIZE(X), %xmm3 + movaps -19 * SIZE(X), %xmm4 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L44: + testl $8, M + jle .L45 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L45: + testl $4, M + jle .L46 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L46: + testl $2, M + jle .L47 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L47: + testl $1, M + jle .L49 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addl $SIZE, Y + ALIGN_3 + +.L49: + popl %ebx + popl %esi + popl %edi + ret + ALIGN_4 + +.L50: + movl M, %eax + sarl $3, %eax + jle .L55 + ALIGN_3 + +.L51: + movss (X), %xmm0 + addl INCX, X + movss (X), %xmm1 + addl INCX, X + movss (X), %xmm2 + addl INCX, X + movss (X), %xmm3 + addl INCX, X + movss (X), %xmm4 + addl INCX, X + movss (X), %xmm5 + addl INCX, X + movss (X), %xmm6 + addl INCX, X + movss (X), %xmm7 + addl INCX, X + + movss %xmm0, (Y) + addl INCY, Y + movss %xmm1, (Y) + addl INCY, Y + movss %xmm2, (Y) + addl INCY, Y + movss %xmm3, (Y) + addl INCY, Y + movss %xmm4, (Y) + addl INCY, Y + movss %xmm5, (Y) + addl INCY, Y + movss %xmm6, (Y) + addl INCY, Y + movss %xmm7, (Y) + addl INCY, Y + + decl %eax + jg .L51 + ALIGN_3 + +.L55: + movl M, %eax + andl $7, %eax + jle .L57 + ALIGN_3 + +.L56: + movss (X), %xmm0 + addl INCX, X + movss %xmm0, (Y) + addl INCY, Y + decl %eax + jg .L56 + ALIGN_3 + +.L57: + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/copy_sse2.S b/kernel/x86/copy_sse2.S new file mode 100644 index 0000000..11524aa --- /dev/null +++ b/kernel/x86/copy_sse2.S @@ -0,0 +1,655 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) + +#define M %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#include "l1param.h" + +#ifdef OPTERON +#define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addpd OFFSET(ADDR), REG +#else +#define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG +#endif + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + cmpl $SIZE, INCX + jne .L40 + cmpl $SIZE, INCY + jne .L40 + +#ifdef ALIGNED_ACCESS + testl $SIZE, Y +#else + testl $SIZE, X +#endif + je .L10 + + movsd (X), %xmm0 + movsd %xmm0, (Y) + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl M + jle .L19 + ALIGN_4 + +.L10: + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + +#ifdef ALIGNED_ACCESS + testl $SIZE, X +#else + testl $SIZE, Y +#endif + jne .L20 + + movl M, %eax + sarl $4, %eax + jle .L13 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + movaps -8 * SIZE(X), %xmm4 + movaps -6 * SIZE(X), %xmm5 + movaps -4 * SIZE(X), %xmm6 + movaps -2 * SIZE(X), %xmm7 + + decl %eax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps %xmm0, -16 * SIZE(Y) + LOAD( 0 * SIZE, X, %xmm0) + movaps %xmm1, -14 * SIZE(Y) + LOAD( 2 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm2, -12 * SIZE(Y) + LOAD( 4 * SIZE, X, %xmm2) + movaps %xmm3, -10 * SIZE(Y) + LOAD( 6 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps %xmm4, -8 * SIZE(Y) + LOAD( 8 * SIZE, X, %xmm4) + movaps %xmm5, -6 * SIZE(Y) + LOAD(10 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm6, -4 * SIZE(Y) + LOAD(12 * SIZE, X, %xmm6) + movaps %xmm7, -2 * SIZE(Y) + LOAD(14 * SIZE, X, %xmm7) + + subl $-16 * SIZE, Y + subl $-16 * SIZE, X + decl %eax + jg .L11 + ALIGN_3 + +.L12: + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, -12 * SIZE(Y) + movaps %xmm3, -10 * SIZE(Y) + movaps %xmm4, -8 * SIZE(Y) + movaps %xmm5, -6 * SIZE(Y) + movaps %xmm6, -4 * SIZE(Y) + movaps %xmm7, -2 * SIZE(Y) + + subl $-16 * SIZE, Y + subl $-16 * SIZE, X + ALIGN_3 + +.L13: + testl $8, M + jle .L14 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, -12 * SIZE(Y) + movaps %xmm3, -10 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L14: + testl $4, M + jle .L15 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L15: + testl $2, M + jle .L16 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L16: + testl $1, M + jle .L19 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L19: + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L20: +#ifdef ALIGNED_ACCESS + + movhps -16 * SIZE(X), %xmm0 + + movl M, %eax + sarl $4, %eax + jle .L23 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + movaps -11 * SIZE(X), %xmm3 + movaps -9 * SIZE(X), %xmm4 + movaps -7 * SIZE(X), %xmm5 + movaps -5 * SIZE(X), %xmm6 + movaps -3 * SIZE(X), %xmm7 + + decl %eax + jle .L22 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm1, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + LOAD(-1 * SIZE, X, %xmm0) + + SHUFPD_1 %xmm2, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + LOAD( 1 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm3, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + LOAD( 3 * SIZE, X, %xmm2) + + SHUFPD_1 %xmm4, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + LOAD( 5 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm5, %xmm4 + movaps %xmm4, -8 * SIZE(Y) + LOAD( 7 * SIZE, X, %xmm4) + + SHUFPD_1 %xmm6, %xmm5 + movaps %xmm5, -6 * SIZE(Y) + LOAD( 9 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm7, %xmm6 + movaps %xmm6, -4 * SIZE(Y) + LOAD(11 * SIZE, X, %xmm6) + + SHUFPD_1 %xmm0, %xmm7 + movaps %xmm7, -2 * SIZE(Y) + LOAD(13 * SIZE, X, %xmm7) + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + decl %eax + jg .L21 + ALIGN_3 + +.L22: + SHUFPD_1 %xmm1, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + LOAD(-1 * SIZE, X, %xmm0) + + SHUFPD_1 %xmm2, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + SHUFPD_1 %xmm3, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + SHUFPD_1 %xmm4, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + SHUFPD_1 %xmm5, %xmm4 + movaps %xmm4, -8 * SIZE(Y) + SHUFPD_1 %xmm6, %xmm5 + movaps %xmm5, -6 * SIZE(Y) + + SHUFPD_1 %xmm7, %xmm6 + movaps %xmm6, -4 * SIZE(Y) + SHUFPD_1 %xmm0, %xmm7 + movaps %xmm7, -2 * SIZE(Y) + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + ALIGN_3 + +.L23: + testl $8, M + jle .L24 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + movaps -11 * SIZE(X), %xmm3 + movaps -9 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm1, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + SHUFPD_1 %xmm2, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + SHUFPD_1 %xmm3, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + SHUFPD_1 %xmm4, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L24: + testl $4, M + jle .L25 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm1, %xmm0 + SHUFPD_1 %xmm2, %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, %xmm0 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L25: + testl $2, M + jle .L26 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L26: + testl $1, M + jle .L29 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L29: + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +#else + + movl M, %eax + sarl $4, %eax + jle .L23 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + movaps -8 * SIZE(X), %xmm4 + movaps -6 * SIZE(X), %xmm5 + movaps -4 * SIZE(X), %xmm6 + movaps -2 * SIZE(X), %xmm7 + + decl %eax + jle .L22 + ALIGN_3 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + LOAD( 0 * SIZE, X, %xmm0) + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + LOAD( 2 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movlps %xmm2, -12 * SIZE(Y) + movhps %xmm2, -11 * SIZE(Y) + LOAD( 4 * SIZE, X, %xmm2) + movlps %xmm3, -10 * SIZE(Y) + movhps %xmm3, -9 * SIZE(Y) + LOAD( 6 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movlps %xmm4, -8 * SIZE(Y) + movhps %xmm4, -7 * SIZE(Y) + LOAD( 8 * SIZE, X, %xmm4) + movlps %xmm5, -6 * SIZE(Y) + movhps %xmm5, -5 * SIZE(Y) + LOAD(10 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movlps %xmm6, -4 * SIZE(Y) + movhps %xmm6, -3 * SIZE(Y) + LOAD(12 * SIZE, X, %xmm6) + movlps %xmm7, -2 * SIZE(Y) + movhps %xmm7, -1 * SIZE(Y) + LOAD(14 * SIZE, X, %xmm7) + + subl $-16 * SIZE, Y + subl $-16 * SIZE, X + decl %eax + jg .L21 + ALIGN_3 + +.L22: + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + movlps %xmm2, -12 * SIZE(Y) + movhps %xmm2, -11 * SIZE(Y) + movlps %xmm3, -10 * SIZE(Y) + movhps %xmm3, -9 * SIZE(Y) + movlps %xmm4, -8 * SIZE(Y) + movhps %xmm4, -7 * SIZE(Y) + movlps %xmm5, -6 * SIZE(Y) + movhps %xmm5, -5 * SIZE(Y) + movlps %xmm6, -4 * SIZE(Y) + movhps %xmm6, -3 * SIZE(Y) + movlps %xmm7, -2 * SIZE(Y) + movhps %xmm7, -1 * SIZE(Y) + + subl $-16 * SIZE, Y + subl $-16 * SIZE, X + ALIGN_3 + +.L23: + testl $8, M + jle .L24 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + movaps -14 * SIZE(X), %xmm1 + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + movaps -12 * SIZE(X), %xmm2 + movlps %xmm2, -12 * SIZE(Y) + movhps %xmm2, -11 * SIZE(Y) + movaps -10 * SIZE(X), %xmm3 + movlps %xmm3, -10 * SIZE(Y) + movhps %xmm3, -9 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L24: + testl $4, M + jle .L25 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + movaps -14 * SIZE(X), %xmm1 + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L25: + testl $2, M + jle .L26 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L26: + testl $1, M + jle .L29 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L29: + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +#endif + +.L40: + movl M, %eax + sarl $3, %eax + jle .L45 + ALIGN_3 + +.L41: + movsd (X), %xmm0 + addl INCX, X + movhps (X), %xmm0 + addl INCX, X + movsd (X), %xmm1 + addl INCX, X + movhps (X), %xmm1 + addl INCX, X + movsd (X), %xmm2 + addl INCX, X + movhps (X), %xmm2 + addl INCX, X + movsd (X), %xmm3 + addl INCX, X + movhps (X), %xmm3 + addl INCX, X + + movlps %xmm0, (Y) + addl INCY, Y + movhps %xmm0, (Y) + addl INCY, Y + movlps %xmm1, (Y) + addl INCY, Y + movhps %xmm1, (Y) + addl INCY, Y + movlps %xmm2, (Y) + addl INCY, Y + movhps %xmm2, (Y) + addl INCY, Y + movlps %xmm3, (Y) + addl INCY, Y + movhps %xmm3, (Y) + addl INCY, Y + + decl %eax + jg .L41 + ALIGN_3 + +.L45: + movl M, %eax + andl $7, %eax + jle .L47 + ALIGN_3 + +.L46: + movsd (X), %xmm0 + addl INCX, X + movlps %xmm0, (Y) + addl INCY, Y + decl %eax + jg .L46 + ALIGN_3 + +.L47: + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/cpuid.S b/kernel/x86/cpuid.S new file mode 100644 index 0000000..773b67d --- /dev/null +++ b/kernel/x86/cpuid.S @@ -0,0 +1,64 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + PROFCODE + + pushl %esi + pushl %ebx + + movl 12(%esp), %eax + cpuid + + movl 16(%esp), %esi + movl %eax, (%esi) + movl 20(%esp), %esi + movl %ebx, (%esi) + movl 24(%esp), %esi + movl %ecx, (%esi) + movl 28(%esp), %esi + movl %edx, (%esi) + + popl %ebx + popl %esi + ret + + EPILOGUE diff --git a/kernel/x86/dot.S b/kernel/x86/dot.S new file mode 100644 index 0000000..5bd5d28 --- /dev/null +++ b/kernel/x86/dot.S @@ -0,0 +1,219 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + +#ifdef F_INTERFACE + movl (N),N + movl (INCX),INCX + movl (INCY),INCY +#endif + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + fldz + fldz + fldz + fldz + + cmpl $SIZE, INCX + jne .L14 + cmpl $SIZE, INCY + jne .L14 + + movl N, %eax + sarl $2, %eax + jle .L15 + ALIGN_3 + +.L16: + FLD 0 * SIZE(X) + FMUL 0 * SIZE(Y) + faddp %st,%st(1) + FLD 1 * SIZE(X) + FMUL 1 * SIZE(Y) + faddp %st,%st(2) + FLD 2 * SIZE(X) + FMUL 2 * SIZE(Y) + faddp %st,%st(3) + FLD 3 * SIZE(X) + FMUL 3 * SIZE(Y) + faddp %st,%st(4) + addl $4 * SIZE, X + addl $4 * SIZE, Y + decl %eax + jg .L16 + ALIGN_3 + +.L15: + movl N, %eax + andl $3, %eax + jle .L27 + ALIGN_3 + +.L22: + FLD (X) + addl $SIZE, X + FMUL (Y) + addl $SIZE, Y + faddp %st,%st(1) + decl %eax + jg .L22 + + jmp .L27 + ALIGN_3 + +.L14: +#ifdef F_INTERFACE + testl INCX, INCX + jge .L28 + + movl N, %eax + decl %eax + imull INCX, %eax + subl %eax, X + ALIGN_3 + +.L28: + testl INCY, INCY + jge .L29 + + movl N, %eax + decl %eax + imull INCY, %eax + subl %eax, Y + ALIGN_3 +.L29: +#endif + movl N, %eax + sarl $2, %eax + jle .L30 + ALIGN_3 + +.L31: + FLD (X) + addl INCX, X + FMUL (Y) + addl INCY, Y + faddp %st,%st(1) + + FLD (X) + addl INCX, X + FMUL (Y) + addl INCY, Y + faddp %st,%st(2) + + FLD (X) + addl INCX, X + FMUL (Y) + addl INCY, Y + faddp %st,%st(3) + + FLD (X) + addl INCX, X + FMUL (Y) + addl INCY, Y + faddp %st,%st(4) + + decl %eax + jg .L31 + ALIGN_3 + +.L30: + movl N, %eax + andl $3, %eax + jle .L27 + ALIGN_3 + +.L37: + FLD (X) + addl INCX, X + FMUL (Y) + addl INCY, Y + faddp %st, %st(1) + decl %eax + jg .L37 + ALIGN_3 + +.L27: + faddp %st,%st(2) + faddp %st,%st(2) + faddp %st,%st(1) + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/dot_amd.S b/kernel/x86/dot_amd.S new file mode 100644 index 0000000..75ad36e --- /dev/null +++ b/kernel/x86/dot_amd.S @@ -0,0 +1,236 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + +#ifdef F_INTERFACE + movl (N),N + movl (INCX),INCX + movl (INCY),INCY +#endif + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + fldz + fldz + fldz + fldz + + cmpl $SIZE, INCX + jne .L14 + cmpl $SIZE, INCY + jne .L14 + + movl N, %eax + sarl $3, %eax + jle .L15 + FLD 0 * SIZE(X) + ALIGN_4 + +.L16: + FLD 1 * SIZE(X) + FMUL 1 * SIZE(Y) + faddp %st,%st(2) + FMUL 0 * SIZE(Y) + faddp %st,%st(2) + FLD 2 * SIZE(X) + FLD 3 * SIZE(X) + FMUL 3 * SIZE(Y) + faddp %st,%st(4) + FMUL 2 * SIZE(Y) + faddp %st,%st(4) + FLD 4 * SIZE(X) + + FLD 5 * SIZE(X) + FMUL 5 * SIZE(Y) + faddp %st,%st(2) + FMUL 4 * SIZE(Y) + faddp %st,%st(2) + FLD 6 * SIZE(X) + FLD 7 * SIZE(X) + FMUL 7 * SIZE(Y) + faddp %st,%st(4) + FMUL 6 * SIZE(Y) + faddp %st,%st(4) + FLD 8 * SIZE(X) + + prefetch 16 * SIZE(X) + addl $8 * SIZE, X + addl $8 * SIZE, Y + decl %eax + jg .L16 + + ffreep %st(0) + ALIGN_3 + +.L15: + movl N, %eax + andl $7, %eax + jle .L27 + ALIGN_3 + +.L22: + FLD (X) + addl $SIZE, X + FMUL (Y) + addl $SIZE, Y + faddp %st,%st(1) + decl %eax + jg .L22 + + jmp .L27 + ALIGN_3 + +.L14: +#ifdef F_INTERFACE + testl INCX, INCX + jge .L28 + + movl N, %eax + decl %eax + imull INCX, %eax + subl %eax, X + ALIGN_3 + +.L28: + testl INCY, INCY + jge .L29 + + movl N, %eax + decl %eax + imull INCY, %eax + subl %eax, Y + ALIGN_3 +.L29: +#endif + movl N, %eax + sarl $2, %eax + jle .L30 + ALIGN_3 + +.L31: + FLD (X) + addl INCX, X + FMUL (Y) + addl INCY, Y + faddp %st,%st(1) + + FLD (X) + addl INCX, X + FMUL (Y) + addl INCY, Y + faddp %st,%st(2) + + FLD (X) + addl INCX, X + FMUL (Y) + addl INCY, Y + faddp %st,%st(3) + + FLD (X) + addl INCX, X + FMUL (Y) + addl INCY, Y + faddp %st,%st(4) + + decl %eax + jg .L31 + ALIGN_3 + +.L30: + movl N, %eax + andl $3, %eax + jle .L27 + ALIGN_3 + +.L37: + FLD (X) + addl INCX, X + FMUL (Y) + addl INCY, Y + faddp %st, %st(1) + decl %eax + jg .L37 + ALIGN_3 + +.L27: + faddp %st,%st(2) + faddp %st,%st(2) + faddp %st,%st(1) + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/dot_sse.S b/kernel/x86/dot_sse.S new file mode 100644 index 0000000..1811921 --- /dev/null +++ b/kernel/x86/dot_sse.S @@ -0,0 +1,1320 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) + +#define N %ecx +#define X %esi +#define INCX %ebx +#define Y %edi +#define INCY %edx + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + +#ifdef F_INTERFACE + movl (N), N # N + movl (INCX),INCX # INCX + movl (INCY),INCY # INCY +#endif + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + cmpl $0, N + jle .L999 + + cmpl $SIZE, INCX + jne .L50 + cmpl $SIZE, INCY + jne .L50 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + cmpl $3, N + jle .L17 + + testl $SIZE, Y + je .L05 + + movss -32 * SIZE(X), %xmm0 + mulss -32 * SIZE(Y), %xmm0 + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl N + ALIGN_2 + +.L05: + testl $2 * SIZE, Y + je .L10 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(Y), %xmm1 + mulps %xmm4, %xmm1 + addl $2 * SIZE, X + addl $2 * SIZE, Y + subl $2, N + jle .L999 + ALIGN_2 + +.L10: +#ifdef ALIGNED_ACCESS + testl $2 * SIZE, X + jne .L30 + + testl $SIZE, X + jne .L20 +#else + testl $3 * SIZE, X + jne .L20 +#endif + + movl N, %eax + sarl $5, %eax + jle .L14 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -24 * SIZE(X), %xmm6 + movaps -20 * SIZE(X), %xmm7 + + decl %eax + jle .L12 + + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps -12 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps -8 * SIZE(X), %xmm6 + + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps -4 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulps -16 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + mulps -12 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps 4 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulps -8 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps 8 * SIZE(X), %xmm6 + + mulps -4 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps 12 * SIZE(X), %xmm7 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L11 + ALIGN_3 + +.L12: + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps -12 * SIZE(X), %xmm5 + + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps -8 * SIZE(X), %xmm6 + + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps -4 * SIZE(X), %xmm7 + + mulps -16 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + mulps -12 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + mulps -8 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + mulps -4 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L14: + testl $31, N + jle .L999 + + testl $16, N + jle .L15 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + movaps -24 * SIZE(X), %xmm6 + movaps -20 * SIZE(X), %xmm7 + + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L15: + testl $8, N + jle .L16 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L16: + testl $4, N + jle .L17 + + movaps -32 * SIZE(X), %xmm4 + mulps -32 * SIZE(Y), %xmm4 + + addps %xmm4, %xmm2 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L17: + testl $2, N + jle .L18 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 +#ifdef movsd + xorps %xmm6, %xmm6 +#endif + movsd -32 * SIZE(Y), %xmm6 + + mulps %xmm6, %xmm4 + addps %xmm4, %xmm3 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L18: + testl $1, N + jle .L999 + + movss -32 * SIZE(X), %xmm4 + mulss -32 * SIZE(Y), %xmm4 + addss %xmm4, %xmm0 + jmp .L999 + ALIGN_3 + +#ifdef ALIGNED_ACCESS +.L20: + + movaps -33 * SIZE(X), %xmm4 + addl $3 * SIZE, X + + movl N, %eax + sarl $5, %eax + jle .L24 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + movaps -24 * SIZE(X), %xmm7 + + decl %eax + jle .L22 + ALIGN_3 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm5, %xmm4 + PSHUFD1($0x39, %xmm4) + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + PSHUFD1($0x39, %xmm5) + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps -16 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + PSHUFD1($0x39, %xmm6) + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps -12 * SIZE(X), %xmm6 + + movss %xmm4, %xmm7 + PSHUFD1($0x39, %xmm7) + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps -8 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm5, %xmm4 + PSHUFD1($0x39, %xmm4) + mulps -16 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -4 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + PSHUFD1($0x39, %xmm5) + mulps -12 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps 0 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + PSHUFD1($0x39, %xmm6) + mulps -8 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps 4 * SIZE(X), %xmm6 + + movss %xmm4, %xmm7 + PSHUFD1($0x39, %xmm7) + mulps -4 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps 8 * SIZE(X), %xmm7 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L21 + ALIGN_3 + +.L22: + movss %xmm5, %xmm4 + PSHUFD1($0x39, %xmm4) + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + PSHUFD1($0x39, %xmm5) + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps -16 * SIZE(X), %xmm5 + + movss %xmm7, %xmm6 + PSHUFD1($0x39, %xmm6) + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps -12 * SIZE(X), %xmm6 + + movss %xmm4, %xmm7 + PSHUFD1($0x39, %xmm7) + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps -8 * SIZE(X), %xmm7 + + movss %xmm5, %xmm4 + PSHUFD1($0x39, %xmm4) + mulps -16 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -4 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + PSHUFD1($0x39, %xmm5) + mulps -12 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD1($0x39, %xmm6) + mulps -8 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + + movss %xmm4, %xmm7 + PSHUFD1($0x39, %xmm7) + mulps -4 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L24: + testl $31, N + jle .L999 + + testl $16, N + jle .L25 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + movaps -24 * SIZE(X), %xmm7 + + movss %xmm5, %xmm4 + PSHUFD1($0x39, %xmm4) + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + PSHUFD1($0x39, %xmm5) + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD1($0x39, %xmm6) + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + + movss %xmm4, %xmm7 + PSHUFD1($0x39, %xmm7) + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L25: + testl $8, N + jle .L26 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + + movss %xmm5, %xmm4 + PSHUFD1($0x39, %xmm4) + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + + movss %xmm6, %xmm5 + PSHUFD1($0x39, %xmm5) + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm6, %xmm4 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L26: + testl $4, N + jle .L27 + + movaps -32 * SIZE(X), %xmm5 + movss %xmm5, %xmm4 + PSHUFD1($0x39, %xmm4) + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm2 + movaps %xmm5, %xmm4 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L27: + testl $2, N + jle .L28 + +#ifdef movsd + xorps %xmm6, %xmm6 +#endif + movsd -32 * SIZE(Y), %xmm6 + + PSHUFD2($0x39, %xmm4, %xmm5) + + mulps %xmm6, %xmm5 + addps %xmm5, %xmm3 + movhlps %xmm4, %xmm4 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L28: + testl $1, N + jle .L999 + + PSHUFD1($0x39, %xmm4) + mulss -32 * SIZE(Y), %xmm4 + addss %xmm4, %xmm0 + jmp .L999 + ALIGN_3 + +.L30: + testl $SIZE, X + jne .L40 + + movhps -32 * SIZE(X), %xmm4 + addl $2 * SIZE, X + + movl N, %eax + sarl $5, %eax + jle .L34 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + movaps -24 * SIZE(X), %xmm7 + + decl %eax + jle .L32 + + ALIGN_3 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps -16 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm7, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps -12 * SIZE(X), %xmm6 + + SHUFPD_1 %xmm4, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps -8 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm5, %xmm4 + mulps -16 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -4 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulps -12 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps 0 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm7, %xmm6 + mulps -8 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps 4 * SIZE(X), %xmm6 + + SHUFPD_1 %xmm4, %xmm7 + mulps -4 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps 8 * SIZE(X), %xmm7 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L31 + ALIGN_3 + +.L32: + SHUFPD_1 %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps -16 * SIZE(X), %xmm5 + + SHUFPD_1 %xmm7, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps -12 * SIZE(X), %xmm6 + + SHUFPD_1 %xmm4, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps -8 * SIZE(X), %xmm7 + + SHUFPD_1 %xmm5, %xmm4 + mulps -16 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -4 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulps -12 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + SHUFPD_1 %xmm7, %xmm6 + mulps -8 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + + SHUFPD_1 %xmm4, %xmm7 + mulps -4 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L34: + testl $31, N + jle .L999 + + testl $16, N + jle .L35 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + movaps -24 * SIZE(X), %xmm7 + + SHUFPD_1 %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + SHUFPD_1 %xmm7, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + + SHUFPD_1 %xmm4, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L35: + testl $8, N + jle .L36 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + + SHUFPD_1 %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + SHUFPD_1 %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps %xmm6, %xmm4 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L36: + testl $4, N + jle .L37 + + movaps -32 * SIZE(X), %xmm5 + + SHUFPD_1 %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps %xmm5, %xmm4 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L37: + testl $2, N + jle .L38 + + xorps %xmm5, %xmm5 + movhlps %xmm4, %xmm5 + + mulps -32 * SIZE(Y), %xmm5 + addps %xmm5, %xmm0 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L38: + testl $1, N + jle .L999 + + movss -34 * SIZE(X), %xmm4 + mulss -32 * SIZE(Y), %xmm4 + addss %xmm4, %xmm0 + jmp .L999 + ALIGN_3 + +.L40: + movaps -35 * SIZE(X), %xmm4 + addl $SIZE, X + + movl N, %eax + sarl $5, %eax + jle .L44 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + movaps -24 * SIZE(X), %xmm7 + + decl %eax + jle .L42 + + ALIGN_3 + +.L41: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps -16 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps -12 * SIZE(X), %xmm6 + + movss %xmm4, %xmm7 + shufps $0x93, %xmm4, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps -8 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps -16 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -4 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps -12 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps 0 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + mulps -8 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps 4 * SIZE(X), %xmm6 + + movss %xmm4, %xmm7 + shufps $0x93, %xmm4, %xmm7 + mulps -4 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps 8 * SIZE(X), %xmm7 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L41 + ALIGN_3 + +.L42: + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps -16 * SIZE(X), %xmm5 + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps -12 * SIZE(X), %xmm6 + + movss %xmm4, %xmm7 + shufps $0x93, %xmm4, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps -8 * SIZE(X), %xmm7 + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps -16 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -4 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps -12 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + mulps -8 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + + movss %xmm4, %xmm7 + shufps $0x93, %xmm4, %xmm7 + mulps -4 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L44: + testl $31, N + jle .L999 + + testl $16, N + jle .L45 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + movaps -24 * SIZE(X), %xmm7 + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + + movss %xmm4, %xmm7 + shufps $0x93, %xmm4, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L45: + testl $8, N + jle .L46 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm6, %xmm4 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L46: + testl $4, N + jle .L47 + + movaps -32 * SIZE(X), %xmm5 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm2 + movaps %xmm5, %xmm4 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L47: + testl $2, N + jle .L48 + + movaps -32 * SIZE(X), %xmm5 +#ifdef movsd + xorps %xmm7, %xmm7 +#endif + movsd -32 * SIZE(Y), %xmm7 + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + + mulps %xmm7, %xmm4 + addps %xmm4, %xmm3 + movlhps %xmm5, %xmm4 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L48: + testl $1, N + jle .L999 + + PSHUFD1($0x93, %xmm4) + mulss -32 * SIZE(Y), %xmm4 + addss %xmm4, %xmm0 + jmp .L999 + ALIGN_4 + +#else +.L20: + movl N, %eax + sarl $5, %eax + jle .L24 + + movlps -32 * SIZE(X), %xmm4 + movhps -30 * SIZE(X), %xmm4 + movlps -28 * SIZE(X), %xmm5 + movhps -26 * SIZE(X), %xmm5 + movlps -24 * SIZE(X), %xmm6 + movhps -22 * SIZE(X), %xmm6 + movlps -20 * SIZE(X), %xmm7 + movhps -18 * SIZE(X), %xmm7 + + decl %eax + jle .L22 + + ALIGN_3 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movlps -16 * SIZE(X), %xmm4 + movhps -14 * SIZE(X), %xmm4 + + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movlps -12 * SIZE(X), %xmm5 + movhps -10 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movlps -8 * SIZE(X), %xmm6 + movhps -6 * SIZE(X), %xmm6 + + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movlps -4 * SIZE(X), %xmm7 + movhps -2 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulps -16 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movlps 0 * SIZE(X), %xmm4 + movhps 2 * SIZE(X), %xmm4 + + mulps -12 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movlps 4 * SIZE(X), %xmm5 + movhps 6 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulps -8 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movlps 8 * SIZE(X), %xmm6 + movhps 10 * SIZE(X), %xmm6 + + mulps -4 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movlps 12 * SIZE(X), %xmm7 + movhps 14 * SIZE(X), %xmm7 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L21 + ALIGN_3 + +.L22: + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movlps -16 * SIZE(X), %xmm4 + movhps -14 * SIZE(X), %xmm4 + + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movlps -12 * SIZE(X), %xmm5 + movhps -10 * SIZE(X), %xmm5 + + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movlps -8 * SIZE(X), %xmm6 + movhps -6 * SIZE(X), %xmm6 + + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movlps -4 * SIZE(X), %xmm7 + movhps -2 * SIZE(X), %xmm7 + + mulps -16 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + mulps -12 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + mulps -8 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + mulps -4 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L24: + testl $31, N + jle .L999 + + testl $16, N + jle .L25 + + movlps -32 * SIZE(X), %xmm4 + movhps -30 * SIZE(X), %xmm4 + movlps -28 * SIZE(X), %xmm5 + movhps -26 * SIZE(X), %xmm5 + movlps -24 * SIZE(X), %xmm6 + movhps -22 * SIZE(X), %xmm6 + movlps -20 * SIZE(X), %xmm7 + movhps -18 * SIZE(X), %xmm7 + + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L25: + testl $8, N + jle .L26 + + movlps -32 * SIZE(X), %xmm4 + movhps -30 * SIZE(X), %xmm4 + movlps -28 * SIZE(X), %xmm5 + movhps -26 * SIZE(X), %xmm5 + + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L26: + testl $4, N + jle .L27 + + movlps -32 * SIZE(X), %xmm4 + movhps -30 * SIZE(X), %xmm4 + mulps -32 * SIZE(Y), %xmm4 + + addps %xmm4, %xmm2 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L27: + testl $2, N + jle .L28 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 +#ifdef movsd + xorps %xmm6, %xmm6 +#endif + movsd -32 * SIZE(Y), %xmm6 + + mulps %xmm6, %xmm4 + addps %xmm4, %xmm3 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L28: + testl $1, N + jle .L999 + + movss -32 * SIZE(X), %xmm4 + mulss -32 * SIZE(Y), %xmm4 + addss %xmm4, %xmm0 + jmp .L999 + ALIGN_3 +#endif + +.L50: + movl N, %eax + sarl $2, %eax + jle .L55 + ALIGN_3 + +.L53: + movss 0 * SIZE(X), %xmm4 + addl INCX, X + mulss 0 * SIZE(Y), %xmm4 + addl INCY, Y + movss 0 * SIZE(X), %xmm5 + addl INCX, X + mulss 0 * SIZE(Y), %xmm5 + addl INCY, Y + movss 0 * SIZE(X), %xmm6 + addl INCX, X + mulss 0 * SIZE(Y), %xmm6 + addl INCY, Y + movss 0 * SIZE(X), %xmm7 + addl INCX, X + mulss 0 * SIZE(Y), %xmm7 + addl INCY, Y + + addss %xmm4, %xmm0 + addss %xmm5, %xmm1 + addss %xmm6, %xmm2 + addss %xmm7, %xmm3 + + decl %eax + jg .L53 + ALIGN_3 + +.L55: + movl N, %eax + andl $3, %eax + jle .L999 + ALIGN_3 + +.L56: + movss 0 * SIZE(X), %xmm4 + addl INCX, X + mulss 0 * SIZE(Y), %xmm4 + addl INCY, Y + addss %xmm4, %xmm0 + decl %eax + jg .L56 + ALIGN_3 + +.L999: + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + +#if defined(HAVE_SSE3) && !defined(__INTERIX) + haddps %xmm0, %xmm0 + haddps %xmm0, %xmm0 +#elif defined(HAVE_SSE2) + movhlps %xmm0, %xmm1 + addps %xmm1, %xmm0 + + PSHUFD2($1, %xmm0, %xmm1) + addss %xmm1, %xmm0 +#else + movhlps %xmm0, %xmm1 + addps %xmm1, %xmm0 + + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + addss %xmm1, %xmm0 +#endif + + movss %xmm0, STACK_N + flds STACK_N + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/dot_sse2.S b/kernel/x86/dot_sse2.S new file mode 100644 index 0000000..f2053d2 --- /dev/null +++ b/kernel/x86/dot_sse2.S @@ -0,0 +1,728 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) + +#define N %ecx +#define X %esi +#define INCX %ebx +#define Y %edi +#define INCY %edx + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + cmpl $0, N + jle .L999 + + cmpl $SIZE, INCX + jne .L50 + cmpl $SIZE, INCY + jne .L50 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + testl $SIZE, Y + je .L10 + + movsd -16 * SIZE(X), %xmm0 + mulsd -16 * SIZE(Y), %xmm0 + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl N + ALIGN_2 + +.L10: + testl $SIZE, X + jne .L20 + + movl N, %eax + sarl $4, %eax + jle .L14 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + decl %eax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movaps -8 * SIZE(X), %xmm4 + + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + movaps -6 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + movaps -4 * SIZE(X), %xmm6 + + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + movaps -2 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulpd -8 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + mulpd -6 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + movaps 2 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulpd -4 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + movaps 4 * SIZE(X), %xmm6 + + mulpd -2 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + movaps 6 * SIZE(X), %xmm7 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + decl %eax + jg .L11 + ALIGN_3 + +.L12: + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movaps -8 * SIZE(X), %xmm4 + + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + movaps -6 * SIZE(X), %xmm5 + + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + movaps -4 * SIZE(X), %xmm6 + + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + movaps -2 * SIZE(X), %xmm7 + + mulpd -8 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + mulpd -6 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + mulpd -4 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + mulpd -2 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + ALIGN_3 + +.L14: + testl $15, N + jle .L999 + + testl $8, N + jle .L15 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L15: + testl $4, N + jle .L16 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L16: + testl $2, N + jle .L17 + + movaps -16 * SIZE(X), %xmm4 + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L17: + testl $1, N + jle .L999 + + movsd -16 * SIZE(X), %xmm4 + mulsd -16 * SIZE(Y), %xmm4 + addsd %xmm4, %xmm0 + jmp .L999 + ALIGN_3 + +.L20: + +#ifdef ALIGNED_ACCESS + + movhps -16 * SIZE(X), %xmm4 + addl $SIZE, X + + movl N, %eax + sarl $4, %eax + jle .L24 + + movaps -16 * SIZE(X), %xmm5 + movaps -14 * SIZE(X), %xmm6 + movaps -12 * SIZE(X), %xmm7 + + decl %eax + jle .L22 + + ALIGN_3 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm5, %xmm4 + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movaps -10 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + movaps -8 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm7, %xmm6 + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + movaps -6 * SIZE(X), %xmm6 + + SHUFPD_1 %xmm4, %xmm7 + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + movaps -4 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm5, %xmm4 + mulpd -8 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movaps -2 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulpd -6 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + movaps 0 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm7, %xmm6 + mulpd -4 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + movaps 2 * SIZE(X), %xmm6 + + SHUFPD_1 %xmm4, %xmm7 + mulpd -2 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + movaps 4 * SIZE(X), %xmm7 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + decl %eax + jg .L21 + ALIGN_3 + +.L22: + SHUFPD_1 %xmm5, %xmm4 + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movaps -10 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + movaps -8 * SIZE(X), %xmm5 + + SHUFPD_1 %xmm7, %xmm6 + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + movaps -6 * SIZE(X), %xmm6 + + SHUFPD_1 %xmm4, %xmm7 + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + movaps -4 * SIZE(X), %xmm7 + + SHUFPD_1 %xmm5, %xmm4 + mulpd -8 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movaps -2 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulpd -6 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + + SHUFPD_1 %xmm7, %xmm6 + mulpd -4 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + + SHUFPD_1 %xmm4, %xmm7 + mulpd -2 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + ALIGN_3 + +.L24: + testl $15, N + jle .L999 + + testl $8, N + jle .L25 + + movaps -16 * SIZE(X), %xmm5 + movaps -14 * SIZE(X), %xmm6 + movaps -12 * SIZE(X), %xmm7 + + SHUFPD_1 %xmm5, %xmm4 + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movaps -10 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + + SHUFPD_1 %xmm7, %xmm6 + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + + SHUFPD_1 %xmm4, %xmm7 + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L25: + testl $4, N + jle .L26 + + movaps -16 * SIZE(X), %xmm5 + movaps -14 * SIZE(X), %xmm6 + + SHUFPD_1 %xmm5, %xmm4 + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + SHUFPD_1 %xmm6, %xmm5 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + movapd %xmm6, %xmm4 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L26: + testl $2, N + jle .L27 + + movaps -16 * SIZE(X), %xmm5 + + SHUFPD_1 %xmm5, %xmm4 + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movapd %xmm5, %xmm4 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L27: + testl $1, N + jle .L999 + + SHUFPD_1 %xmm4, %xmm4 + mulsd -16 * SIZE(Y), %xmm4 + addsd %xmm4, %xmm0 + jmp .L999 + ALIGN_3 + +#else + + movl N, %eax + sarl $4, %eax + jle .L24 + + movlps -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + movlps -14 * SIZE(X), %xmm5 + movhps -13 * SIZE(X), %xmm5 + movlps -12 * SIZE(X), %xmm6 + movhps -11 * SIZE(X), %xmm6 + movlps -10 * SIZE(X), %xmm7 + movhps -9 * SIZE(X), %xmm7 + + decl %eax + jle .L22 + + ALIGN_3 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movlps -8 * SIZE(X), %xmm4 + movhps -7 * SIZE(X), %xmm4 + + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + movlps -6 * SIZE(X), %xmm5 + movhps -5 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + movlps -4 * SIZE(X), %xmm6 + movhps -3 * SIZE(X), %xmm6 + + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + movlps -2 * SIZE(X), %xmm7 + movhps -1 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulpd -8 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + + mulpd -6 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + movlps 2 * SIZE(X), %xmm5 + movhps 3 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulpd -4 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + movlps 4 * SIZE(X), %xmm6 + movhps 5 * SIZE(X), %xmm6 + + mulpd -2 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + movlps 6 * SIZE(X), %xmm7 + movhps 7 * SIZE(X), %xmm7 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + decl %eax + jg .L21 + ALIGN_3 + +.L22: + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movlps -8 * SIZE(X), %xmm4 + movhps -7 * SIZE(X), %xmm4 + + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + movlps -6 * SIZE(X), %xmm5 + movhps -5 * SIZE(X), %xmm5 + + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + movlps -4 * SIZE(X), %xmm6 + movhps -3 * SIZE(X), %xmm6 + + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + movlps -2 * SIZE(X), %xmm7 + movhps -1 * SIZE(X), %xmm7 + + mulpd -8 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + mulpd -6 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + mulpd -4 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + mulpd -2 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + ALIGN_3 + +.L24: + testl $15, N + jle .L999 + + testl $8, N + jle .L25 + + movlps -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + movlps -14 * SIZE(X), %xmm5 + movhps -13 * SIZE(X), %xmm5 + movlps -12 * SIZE(X), %xmm6 + movhps -11 * SIZE(X), %xmm6 + movlps -10 * SIZE(X), %xmm7 + movhps -9 * SIZE(X), %xmm7 + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L25: + testl $4, N + jle .L26 + + movlps -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + movlps -14 * SIZE(X), %xmm5 + movhps -13 * SIZE(X), %xmm5 + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L26: + testl $2, N + jle .L27 + + movlps -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L27: + testl $1, N + jle .L999 + + movsd -16 * SIZE(X), %xmm4 + mulsd -16 * SIZE(Y), %xmm4 + addsd %xmm4, %xmm0 + jmp .L999 + ALIGN_3 +#endif + +.L50: + movl N, %eax + sarl $2, %eax + jle .L55 + ALIGN_3 + +.L53: + movsd (X), %xmm4 + addl INCX, X + mulsd (Y), %xmm4 + addl INCY, Y + movsd (X), %xmm5 + addl INCX, X + mulsd (Y), %xmm5 + addl INCY, Y + movsd (X), %xmm6 + addl INCX, X + mulsd (Y), %xmm6 + addl INCY, Y + movsd (X), %xmm7 + addl INCX, X + mulsd (Y), %xmm7 + addl INCY, Y + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + addsd %xmm6, %xmm2 + addsd %xmm7, %xmm3 + + decl %eax + jg .L53 + ALIGN_3 + +.L55: + movl N, %eax + andl $3, %eax + jle .L999 + ALIGN_3 + +.L56: + movsd (X), %xmm4 + addl INCX, X + mulsd (Y), %xmm4 + addl INCY, Y + addsd %xmm4, %xmm0 + decl %eax + jg .L56 + ALIGN_3 + +.L999: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#ifndef HAVE_SSE3 + pshufd $0xe, %xmm0, %xmm1 + addsd %xmm1, %xmm0 +#else + haddpd %xmm0, %xmm0 +#endif + movlps %xmm0, STACK_N + fldl STACK_N + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/dot_sse2_opteron.S b/kernel/x86/dot_sse2_opteron.S new file mode 100644 index 0000000..7ac059f --- /dev/null +++ b/kernel/x86/dot_sse2_opteron.S @@ -0,0 +1,368 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) + +#define N %ecx +#define X %esi +#define INCX %ebx +#define Y %edi +#define INCY %edx + +#define PREFETCHSIZE 84 + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + +#ifdef F_INTERFACE + movl (N), N # N + movl (INCX),INCX # INCX + movl (INCY),INCY # INCY +#endif + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + + cmpl $0, N + jle .L999 + + cmpl $SIZE, INCX + jne .L50 + cmpl $SIZE, INCY + jne .L50 + + testl $SIZE, Y + je .L10 + + movsd 0 * SIZE(X), %xmm0 + mulsd 0 * SIZE(Y), %xmm0 + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl N + ALIGN_2 + +.L10: + movl N, %eax + sarl $4, %eax + jle .L24 + + movlpd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm4 + movlpd 2 * SIZE(X), %xmm5 + movhpd 3 * SIZE(X), %xmm5 + movlpd 4 * SIZE(X), %xmm6 + movhpd 5 * SIZE(X), %xmm6 + movlpd 6 * SIZE(X), %xmm7 + movhpd 7 * SIZE(X), %xmm7 + + mulpd 0 * SIZE(Y), %xmm4 + mulpd 2 * SIZE(Y), %xmm5 + mulpd 4 * SIZE(Y), %xmm6 + mulpd 6 * SIZE(Y), %xmm7 + decl %eax + jle .L22 + + ALIGN_3 + +.L21: + prefetch (PREFETCHSIZE + 0) * SIZE(Y) + + addpd %xmm4, %xmm0 + movlpd 8 * SIZE(X), %xmm4 + movhpd 9 * SIZE(X), %xmm4 + addpd %xmm5, %xmm1 + movlpd 10 * SIZE(X), %xmm5 + movhpd 11 * SIZE(X), %xmm5 + addpd %xmm6, %xmm2 + movlpd 12 * SIZE(X), %xmm6 + movhpd 13 * SIZE(X), %xmm6 + addpd %xmm7, %xmm3 + movlpd 14 * SIZE(X), %xmm7 + movhpd 15 * SIZE(X), %xmm7 + + mulpd 8 * SIZE(Y), %xmm4 + mulpd 10 * SIZE(Y), %xmm5 + mulpd 12 * SIZE(Y), %xmm6 + mulpd 14 * SIZE(Y), %xmm7 + + prefetch (PREFETCHSIZE + 8) * SIZE(Y) + + addpd %xmm4, %xmm0 + movlpd 16 * SIZE(X), %xmm4 + movhpd 17 * SIZE(X), %xmm4 + addpd %xmm5, %xmm1 + movlpd 18 * SIZE(X), %xmm5 + movhpd 19 * SIZE(X), %xmm5 + addpd %xmm6, %xmm2 + movlpd 20 * SIZE(X), %xmm6 + movhpd 21 * SIZE(X), %xmm6 + addpd %xmm7, %xmm3 + movlpd 22 * SIZE(X), %xmm7 + movhpd 23 * SIZE(X), %xmm7 + + mulpd 16 * SIZE(Y), %xmm4 + mulpd 18 * SIZE(Y), %xmm5 + mulpd 20 * SIZE(Y), %xmm6 + mulpd 22 * SIZE(Y), %xmm7 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + + decl %eax + jg .L21 + ALIGN_3 + +.L22: + addpd %xmm4, %xmm0 + movlpd 8 * SIZE(X), %xmm4 + movhpd 9 * SIZE(X), %xmm4 + addpd %xmm5, %xmm1 + movlpd 10 * SIZE(X), %xmm5 + movhpd 11 * SIZE(X), %xmm5 + addpd %xmm6, %xmm2 + movlpd 12 * SIZE(X), %xmm6 + movhpd 13 * SIZE(X), %xmm6 + addpd %xmm7, %xmm3 + movlpd 14 * SIZE(X), %xmm7 + movhpd 15 * SIZE(X), %xmm7 + + mulpd 8 * SIZE(Y), %xmm4 + mulpd 10 * SIZE(Y), %xmm5 + mulpd 12 * SIZE(Y), %xmm6 + mulpd 14 * SIZE(Y), %xmm7 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + addpd %xmm6, %xmm2 + addpd %xmm7, %xmm3 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L24: + testl $15, N + jle .L999 + + testl $8, N + jle .L25 + + movlpd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm4 + movlpd 2 * SIZE(X), %xmm5 + movhpd 3 * SIZE(X), %xmm5 + movlpd 4 * SIZE(X), %xmm6 + movhpd 5 * SIZE(X), %xmm6 + movlpd 6 * SIZE(X), %xmm7 + movhpd 7 * SIZE(X), %xmm7 + + mulpd 0 * SIZE(Y), %xmm4 + mulpd 2 * SIZE(Y), %xmm5 + mulpd 4 * SIZE(Y), %xmm6 + mulpd 6 * SIZE(Y), %xmm7 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + addpd %xmm6, %xmm2 + addpd %xmm7, %xmm3 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L25: + testl $4, N + jle .L26 + + movlpd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm4 + movlpd 2 * SIZE(X), %xmm5 + movhpd 3 * SIZE(X), %xmm5 + mulpd 0 * SIZE(Y), %xmm4 + mulpd 2 * SIZE(Y), %xmm5 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L26: + testl $2, N + jle .L27 + + movlpd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm4 + mulpd 0 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L27: + testl $1, N + jle .L999 + + movsd 0 * SIZE(X), %xmm4 + mulsd 0 * SIZE(Y), %xmm4 + addsd %xmm4, %xmm0 + jmp .L999 + ALIGN_3 + + +.L50: +#ifdef F_INTERFACE + testl INCX, INCX + jge .L51 + + movl N, %eax + decl %eax + imull INCX, %eax + subl %eax, X + ALIGN_3 + +.L51: + testl INCY, INCY + jge .L52 + + movl N, %eax + decl %eax + imull INCY, %eax + subl %eax, Y + ALIGN_3 +.L52: +#endif + + movl N, %eax + sarl $2, %eax + jle .L55 + ALIGN_3 + +.L53: + movsd 0 * SIZE(X), %xmm4 + addl INCX, X + mulsd 0 * SIZE(Y), %xmm4 + addl INCY, Y + movsd 0 * SIZE(X), %xmm5 + addl INCX, X + mulsd 0 * SIZE(Y), %xmm5 + addl INCY, Y + movsd 0 * SIZE(X), %xmm6 + addl INCX, X + mulsd 0 * SIZE(Y), %xmm6 + addl INCY, Y + movsd 0 * SIZE(X), %xmm7 + addl INCX, X + mulsd 0 * SIZE(Y), %xmm7 + addl INCY, Y + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + addsd %xmm6, %xmm2 + addsd %xmm7, %xmm3 + + decl %eax + jg .L53 + ALIGN_3 + +.L55: + movl N, %eax + andl $3, %eax + jle .L999 + ALIGN_3 + +.L56: + movsd 0 * SIZE(X), %xmm4 + addl INCX, X + mulsd 0 * SIZE(Y), %xmm4 + addl INCY, Y + addsd %xmm4, %xmm0 + decl %eax + jg .L56 + ALIGN_3 + +.L999: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#if !defined(HAVE_SSE3) || defined(__INTERIX) + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + addsd %xmm1, %xmm0 +#else + haddpd %xmm0, %xmm0 +#endif + + movsd %xmm0, STACK_N + fldl STACK_N + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/dot_sse_opteron.S b/kernel/x86/dot_sse_opteron.S new file mode 100644 index 0000000..fc63219 --- /dev/null +++ b/kernel/x86/dot_sse_opteron.S @@ -0,0 +1,411 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) + +#define N %ecx +#define X %esi +#define INCX %ebx +#define Y %edi +#define INCY %edx + +#define PREFETCHSIZE 84 + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + +#ifdef F_INTERFACE + movl (N), N # N + movl (INCX),INCX # INCX + movl (INCY),INCY # INCY +#endif + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + + cmpl $0, N + jle .L999 + + cmpl $SIZE, INCX + jne .L50 + cmpl $SIZE, INCY + jne .L50 + + cmpl $3, N + jle .L27 + + testl $SIZE, Y + je .L05 + + movss 0 * SIZE(X), %xmm0 + mulss 0 * SIZE(Y), %xmm0 + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl N + ALIGN_2 + +.L05: + testl $2 * SIZE, Y + je .L10 + + movss 0 * SIZE(X), %xmm4 + movss 1 * SIZE(X), %xmm5 + + mulss 0 * SIZE(Y), %xmm4 + mulss 1 * SIZE(Y), %xmm5 + + addss %xmm4, %xmm1 + addss %xmm5, %xmm2 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + subl $2, N + jle .L999 + ALIGN_2 + +.L10: + movl N, %eax + sarl $5, %eax + jle .L24 + + movlps 0 * SIZE(X), %xmm4 + movhps 2 * SIZE(X), %xmm4 + movlps 4 * SIZE(X), %xmm5 + movhps 6 * SIZE(X), %xmm5 + movlps 8 * SIZE(X), %xmm6 + movhps 10 * SIZE(X), %xmm6 + movlps 12 * SIZE(X), %xmm7 + movhps 14 * SIZE(X), %xmm7 + + mulps 0 * SIZE(Y), %xmm4 + mulps 4 * SIZE(Y), %xmm5 + mulps 8 * SIZE(Y), %xmm6 + mulps 12 * SIZE(Y), %xmm7 + decl %eax + jle .L22 + + ALIGN_3 + +.L21: + prefetch (PREFETCHSIZE + 0) * SIZE(Y) + + addps %xmm4, %xmm0 + movlps 16 * SIZE(X), %xmm4 + movhps 18 * SIZE(X), %xmm4 + addps %xmm5, %xmm1 + movlps 20 * SIZE(X), %xmm5 + movhps 22 * SIZE(X), %xmm5 + addps %xmm6, %xmm2 + movlps 24 * SIZE(X), %xmm6 + movhps 26 * SIZE(X), %xmm6 + addps %xmm7, %xmm3 + movlps 28 * SIZE(X), %xmm7 + movhps 30 * SIZE(X), %xmm7 + + mulps 16 * SIZE(Y), %xmm4 + mulps 20 * SIZE(Y), %xmm5 + mulps 24 * SIZE(Y), %xmm6 + mulps 28 * SIZE(Y), %xmm7 + + prefetch (PREFETCHSIZE + 16) * SIZE(Y) + + addps %xmm4, %xmm0 + movlps 32 * SIZE(X), %xmm4 + movhps 34 * SIZE(X), %xmm4 + addps %xmm5, %xmm1 + movlps 36 * SIZE(X), %xmm5 + movhps 38 * SIZE(X), %xmm5 + addps %xmm6, %xmm2 + movlps 40 * SIZE(X), %xmm6 + movhps 42 * SIZE(X), %xmm6 + addps %xmm7, %xmm3 + movlps 44 * SIZE(X), %xmm7 + movhps 46 * SIZE(X), %xmm7 + + mulps 32 * SIZE(Y), %xmm4 + mulps 36 * SIZE(Y), %xmm5 + mulps 40 * SIZE(Y), %xmm6 + mulps 44 * SIZE(Y), %xmm7 + + addl $32 * SIZE, X + addl $32 * SIZE, Y + + decl %eax + jg .L21 + ALIGN_3 + +.L22: + addps %xmm4, %xmm0 + movlps 16 * SIZE(X), %xmm4 + movhps 18 * SIZE(X), %xmm4 + addps %xmm5, %xmm1 + movlps 20 * SIZE(X), %xmm5 + movhps 22 * SIZE(X), %xmm5 + addps %xmm6, %xmm2 + movlps 24 * SIZE(X), %xmm6 + movhps 26 * SIZE(X), %xmm6 + addps %xmm7, %xmm3 + movlps 28 * SIZE(X), %xmm7 + movhps 30 * SIZE(X), %xmm7 + + mulps 16 * SIZE(Y), %xmm4 + mulps 20 * SIZE(Y), %xmm5 + mulps 24 * SIZE(Y), %xmm6 + mulps 28 * SIZE(Y), %xmm7 + + addps %xmm4, %xmm0 + addps %xmm5, %xmm1 + addps %xmm6, %xmm2 + addps %xmm7, %xmm3 + + addl $32 * SIZE, X + addl $32 * SIZE, Y + ALIGN_3 + +.L24: + testl $31, N + jle .L999 + + testl $16, N + jle .L25 + + movlps 0 * SIZE(X), %xmm4 + movhps 2 * SIZE(X), %xmm4 + movlps 4 * SIZE(X), %xmm5 + movhps 6 * SIZE(X), %xmm5 + movlps 8 * SIZE(X), %xmm6 + movhps 10 * SIZE(X), %xmm6 + movlps 12 * SIZE(X), %xmm7 + movhps 14 * SIZE(X), %xmm7 + + mulps 0 * SIZE(Y), %xmm4 + mulps 4 * SIZE(Y), %xmm5 + mulps 8 * SIZE(Y), %xmm6 + mulps 12 * SIZE(Y), %xmm7 + + addps %xmm4, %xmm0 + addps %xmm5, %xmm1 + addps %xmm6, %xmm2 + addps %xmm7, %xmm3 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L25: + testl $8, N + jle .L26 + + movlps 0 * SIZE(X), %xmm4 + movhps 2 * SIZE(X), %xmm4 + movlps 4 * SIZE(X), %xmm5 + movhps 6 * SIZE(X), %xmm5 + mulps 0 * SIZE(Y), %xmm4 + mulps 4 * SIZE(Y), %xmm5 + + addps %xmm4, %xmm0 + addps %xmm5, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L26: + testl $4, N + jle .L27 + + movlps 0 * SIZE(X), %xmm4 + movhps 2 * SIZE(X), %xmm4 + mulps 0 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L27: + testl $2, N + jle .L28 + + movss 0 * SIZE(X), %xmm4 + movss 1 * SIZE(X), %xmm5 + + mulss 0 * SIZE(Y), %xmm4 + mulss 1 * SIZE(Y), %xmm5 + + addss %xmm4, %xmm0 + addss %xmm5, %xmm1 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + + +.L28: + testl $1, N + jle .L999 + + movss 0 * SIZE(X), %xmm4 + mulss 0 * SIZE(Y), %xmm4 + addss %xmm4, %xmm0 + jmp .L999 + ALIGN_3 + + +.L50: +#ifdef F_INTERFACE + testl INCX, INCX + jge .L51 + + movl N, %eax + decl %eax + imull INCX, %eax + subl %eax, X + ALIGN_3 + +.L51: + testl INCY, INCY + jge .L52 + + movl N, %eax + decl %eax + imull INCY, %eax + subl %eax, Y + ALIGN_3 +.L52: +#endif + + movl N, %eax + sarl $2, %eax + jle .L55 + ALIGN_3 + +.L53: + movss 0 * SIZE(X), %xmm4 + addl INCX, X + mulss 0 * SIZE(Y), %xmm4 + addl INCY, Y + movss 0 * SIZE(X), %xmm5 + addl INCX, X + mulss 0 * SIZE(Y), %xmm5 + addl INCY, Y + movss 0 * SIZE(X), %xmm6 + addl INCX, X + mulss 0 * SIZE(Y), %xmm6 + addl INCY, Y + movss 0 * SIZE(X), %xmm7 + addl INCX, X + mulss 0 * SIZE(Y), %xmm7 + addl INCY, Y + + addss %xmm4, %xmm0 + addss %xmm5, %xmm1 + addss %xmm6, %xmm2 + addss %xmm7, %xmm3 + + decl %eax + jg .L53 + ALIGN_3 + +.L55: + movl N, %eax + andl $3, %eax + jle .L999 + ALIGN_3 + +.L56: + movss 0 * SIZE(X), %xmm4 + addl INCX, X + mulss 0 * SIZE(Y), %xmm4 + addl INCY, Y + addss %xmm4, %xmm0 + decl %eax + jg .L56 + ALIGN_3 + +.L999: + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + +#if !defined(HAVE_SSE3) || defined(__INTERIX) + movhlps %xmm0, %xmm1 + addps %xmm1, %xmm0 + + pshufd $1, %xmm0, %xmm1 + addss %xmm1, %xmm0 +#else + haddps %xmm0, %xmm0 + haddps %xmm0, %xmm0 +#endif + + movss %xmm0, STACK_N + flds STACK_N + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/gemm_beta.S b/kernel/x86/gemm_beta.S new file mode 100644 index 0000000..b68dcf3 --- /dev/null +++ b/kernel/x86/gemm_beta.S @@ -0,0 +1,224 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define BETA 16 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define LDC 44 + STACK + ARGS(%esp) +#else +#define BETA 16 + STACK + ARGS(%esp) +#define C 36 + STACK + ARGS(%esp) +#define LDC 40 + STACK + ARGS(%esp) +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl M, %esi # m + movl N, %ecx # n + FLD BETA # beta + + movl C, %edi # C + movl LDC, %ebp # ldc + + testl %esi, %esi # if n <= 0 goto End + jle .L999 + testl %ecx, %ecx # if m <= 0 goto End + jle .L999 + + ftst + fnstsw %ax + andb $68, %ah + je .L201 + ALIGN_4 + +.L101: + movl %edi, %eax # c_offset = c + leal (%edi, %ebp, SIZE), %edi # c += ldc + movl %esi, %edx + sarl $3, %edx + jle .L103 + ALIGN_4 + +.L102: +#ifdef HAS_PREFETCH +#ifndef ATHLON + prefetchnta 12 * SIZE(%eax) + prefetchnta 16 * SIZE(%eax) +#else + prefetchw 32 * SIZE(%eax) +#endif +#endif + + FSTU 0 * SIZE(%eax) + FSTU 1 * SIZE(%eax) + FSTU 2 * SIZE(%eax) + FSTU 3 * SIZE(%eax) + FSTU 4 * SIZE(%eax) + FSTU 5 * SIZE(%eax) + FSTU 6 * SIZE(%eax) + FSTU 7 * SIZE(%eax) + addl $8 * SIZE, %eax + decl %edx + jg .L102 + ALIGN_4 + +.L103: + movl %esi, %edx + andl $7, %edx + jle .L105 + ALIGN_4 + +.L104: + FSTU 0 * SIZE(%eax) + addl $SIZE, %eax + decl %edx + jg .L104 + ALIGN_4 + +.L105: + decl %ecx + jg .L101 + jmp .L999 + ALIGN_3 + + +.L201: + movl %edi, %eax # c_offset = c + leal (%edi, %ebp, SIZE), %edi # c += ldc + movl %esi, %edx + sarl $3, %edx + jle .L203 + ALIGN_4 + +.L202: +#ifdef HAS_PREFETCH +#ifndef ATHLON + prefetchnta 16 * SIZE(%eax) + prefetchnta 20 * SIZE(%eax) +#else + prefetchw 32 * SIZE(%eax) +#endif +#endif + + FLD 0 * SIZE(%eax) + fmul %st(1),%st + FST 0 * SIZE(%eax) + + FLD 1 * SIZE(%eax) + fmul %st(1),%st + FST 1 * SIZE(%eax) + + FLD 2 * SIZE(%eax) + fmul %st(1),%st + FST 2 * SIZE(%eax) + + FLD 3 * SIZE(%eax) + fmul %st(1),%st + FST 3 * SIZE(%eax) + + FLD 4 * SIZE(%eax) + fmul %st(1),%st + FST 4 * SIZE(%eax) + + FLD 5 * SIZE(%eax) + fmul %st(1),%st + FST 5 * SIZE(%eax) + + FLD 6 * SIZE(%eax) + fmul %st(1),%st + FST 6 * SIZE(%eax) + + FLD 7 * SIZE(%eax) + fmul %st(1),%st + FST 7 * SIZE(%eax) + + addl $8 * SIZE, %eax + decl %edx + jg .L202 + ALIGN_4 + +.L203: + movl %esi, %edx + andl $7, %edx + jle .L205 + ALIGN_4 + +.L204: + FLD 0 * SIZE(%eax) + fmul %st(1), %st + FST 0 * SIZE(%eax) + addl $SIZE, %eax + decl %edx + jg .L204 + ALIGN_4 + +.L205: + decl %ecx + jg .L201 + ALIGN_3 + +.L999: +#ifndef C_SUN + ffreep %st(0) +#else + .byte 0xdf + .byte 0xc0 +#endif + xorl %eax,%eax + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_1x4.S b/kernel/x86/gemm_kernel_1x4.S new file mode 100644 index 0000000..e1ff4e8 --- /dev/null +++ b/kernel/x86/gemm_kernel_1x4.S @@ -0,0 +1,907 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define I 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define STACK_A 24 + STACK + ARGS(%esp) +#define STACK_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define STACK_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) +#else +#define STACK_A 20 + STACK + ARGS(%esp) +#define STACK_B 24 + STACK + ARGS(%esp) +#define C 28 + STACK + ARGS(%esp) +#define STACK_LDC 32 + STACK + ARGS(%esp) +#define OFFSET 36 + STACK + ARGS(%esp) +#endif + +#define A %edx +#define B %ecx +#define BB %ebx +#define LDC %ebp +#define BX %esi + +#define PREFETCHSIZE (8 * 5 + 4) + +#define AOFFSET 1 +#define BOFFSET -7 + +#ifdef HAVE_3DNOW +#define PREFETCH prefetch +#else +#define PREFETCH prefetcht0 +#endif + +#define KERNEL \ + PREFETCH PREFETCHSIZE * SIZE + AOFFSET(A, %eax, 1);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD -15 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD -14 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL -13 * SIZE + BOFFSET(B, %eax, 4);\ + faddp %st, %st(6);\ + FLD -15 * SIZE + AOFFSET(A, %eax, 1);\ + FLD -12 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD -11 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD -10 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL -9 * SIZE + BOFFSET(B, %eax, 4);\ + faddp %st, %st(6);\ + FLD -14 * SIZE + AOFFSET(A, %eax, 1);\ + FLD -8 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD -7 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD -6 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL -5 * SIZE + BOFFSET(B, %eax, 4);\ + faddp %st, %st(6);\ + FLD -13 * SIZE + AOFFSET(A, %eax, 1);\ + FLD -4 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD -3 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD -2 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL -1 * SIZE + BOFFSET(B, %eax, 4);\ + faddp %st, %st(6);\ + FLD -12 * SIZE + AOFFSET(A, %eax, 1);\ + FLD 0 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD 1 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD 2 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL 3 * SIZE + BOFFSET(B, %eax, 4);\ + faddp %st, %st(6);\ + FLD -11 * SIZE + AOFFSET(A, %eax, 1);\ + FLD 4 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD 5 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD 6 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL 7 * SIZE + BOFFSET(B, %eax, 4);\ + faddp %st, %st(6);\ + FLD -10 * SIZE + AOFFSET(A, %eax, 1);\ + FLD 8 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD 9 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD 10 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL 11 * SIZE + BOFFSET(B, %eax, 4);\ + faddp %st, %st(6);\ + FLD -9 * SIZE + AOFFSET(A, %eax, 1);\ + FLD 12 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD 13 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD 14 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL 15 * SIZE + BOFFSET(B, %eax, 4);\ + faddp %st, %st(6);\ + FLD 8 * SIZE + AOFFSET(A, %eax, 1);\ + fxch %st(1);\ + FLD 16 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD -15 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + AOFFSET(A, %eax, 1);\ + faddp %st, %st(5);\ + FLD -14 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL -13 * SIZE + BOFFSET(BB, %eax, 4);\ + faddp %st, %st(6);\ + FLD -7 * SIZE + AOFFSET(A, %eax, 1);\ + FLD -12 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD -11 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD -10 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL -9 * SIZE + BOFFSET(BB, %eax, 4);\ + faddp %st, %st(6);\ + FLD -6 * SIZE + AOFFSET(A, %eax, 1);\ + FLD -8 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD -7 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD -6 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL -5 * SIZE + BOFFSET(BB, %eax, 4);\ + faddp %st, %st(6);\ + FLD -5 * SIZE + AOFFSET(A, %eax, 1);\ + FLD -4 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD -3 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD -2 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL -1 * SIZE + BOFFSET(BB, %eax, 4);\ + faddp %st, %st(6);\ + FLD -4 * SIZE + AOFFSET(A, %eax, 1);\ + FLD 0 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD 1 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD 2 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL 3 * SIZE + BOFFSET(BB, %eax, 4);\ + faddp %st, %st(6);\ + FLD -3 * SIZE + AOFFSET(A, %eax, 1);\ + FLD 4 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD 5 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD 6 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL 7 * SIZE + BOFFSET(BB, %eax, 4);\ + faddp %st, %st(6);\ + FLD -2 * SIZE + AOFFSET(A, %eax, 1);\ + FLD 8 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD 9 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD 10 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL 11 * SIZE + BOFFSET(BB, %eax, 4);\ + faddp %st, %st(6);\ + FLD -1 * SIZE + AOFFSET(A, %eax, 1);\ + FLD 12 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD 13 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD 14 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL 15 * SIZE + BOFFSET(BB, %eax, 4);\ + faddp %st, %st(6);\ + FLD 16 * SIZE + AOFFSET(A, %eax, 1);\ + fxch %st(2);\ + FLD 16 * SIZE + BOFFSET(BB, %eax, 4);\ + subl $-16 * SIZE, %eax + +/* + + A hint of scheduling is received from following URL + + http://www.netlib.org/atlas/atlas-comm/msg00260.html + +*/ + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(TRMMKERNEL) && !defined(LEFT) + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + + movl STACK_LDC, LDC + leal (, LDC, SIZE), LDC + + subl $(AOFFSET - 16 * SIZE), STACK_A + subl $(BOFFSET - 16 * SIZE), STACK_B + + movl M, %eax + testl %eax, %eax + jle .L999 + + movl N, %eax + testl %eax, %eax + jle .L999 + + movl K, %eax + testl %eax, %eax + jle .L999 + + movl N, %eax + sarl $2, %eax + movl %eax, J + je .L20 + ALIGN_3 + +.L11: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl STACK_A, A + movl STACK_B, B + movl C, %edi + + movl K, BX + sall $BASE_SHIFT + 2, BX + addl B, BX + + movl M, %eax + movl %eax, I + ALIGN_3 + +.L14: + prefetchnta -16 * SIZE + BOFFSET(BX) + subl $-8 * SIZE, BX + + movl STACK_B, B + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (A, %eax, 1), A + leal (B, %eax, 4), B +#endif + + leal (%edi, LDC, 2), %eax + + fldz + fldz + fldz + fldz + + FLD 0 * SIZE + AOFFSET(A) + FLD -8 * SIZE + AOFFSET(A) + FLD -16 * SIZE + AOFFSET(A) + FLD -16 * SIZE + BOFFSET(B) + +#ifdef HAVE_3DNOW + prefetchw 1 * SIZE(%edi) + prefetchw 2 * SIZE(%edi, LDC) + prefetchw 1 * SIZE(%eax) + prefetchw 2 * SIZE(%eax, LDC) +#elif defined(HAVE_SSE) + prefetcht0 1 * SIZE(%edi) + prefetcht0 2 * SIZE(%edi, LDC) + prefetcht0 1 * SIZE(%eax) + prefetcht0 2 * SIZE(%eax, LDC) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + + andl $-16, %eax + + leal (, %eax, SIZE), %eax + leal (A, %eax, 1), A + leal 32 * SIZE(B, %eax, 4), BB + leal (B, %eax, 4), B + negl %eax + NOBRANCH + je .L16 + ALIGN_4 + +.L15: + KERNEL + jge .L16 + KERNEL + jge .L16 + KERNEL + jge .L16 + KERNEL + jl .L15 + ALIGN_4 + +.L16: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $15, %eax + je .L19 + ALIGN_4 + +.L17: + fmul %st(1), %st + faddp %st, %st(4) + + FLD -15 * SIZE + BOFFSET(B) + fmul %st(1), %st + faddp %st, %st(5) + + FLD -14 * SIZE + BOFFSET(B) + fmul %st(1), %st + faddp %st, %st(6) + + FMUL -13 * SIZE + BOFFSET(B) + faddp %st, %st(6) + FLD -15 * SIZE + AOFFSET(A) + FLD -12 * SIZE + BOFFSET(B) + + addl $1 * SIZE,A + addl $4 * SIZE,B + + decl %eax + jne .L17 + ALIGN_4 + +.L19: + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + + FLD ALPHA + + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fmulp %st, %st(4) + + leal (%edi, LDC, 2), %eax + +#ifndef TRMMKERNEL + FADD (%edi) + FST (%edi) + FADD (%edi,LDC) + FST (%edi,LDC) + FADD (%eax) + FST (%eax) + FADD (%eax,LDC) + FST (%eax,LDC) +#else + FST (%edi) + FST (%edi,LDC) + FST (%eax) + FST (%eax,LDC) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (A, %eax, 1), A + leal (B, %eax, 4), B +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $1 * SIZE, %edi + decl I + jne .L14 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leal (, LDC, 4), %eax + addl %eax, C + movl B, STACK_B + decl J + jne .L11 + ALIGN_4 + +.L20: + movl N, %eax + andl $2, %eax + je .L30 + ALIGN_3 + +.L21: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl STACK_A, A + movl STACK_B, B + movl C, %edi + + movl M, %eax + movl %eax, I + ALIGN_3 + +.L24: + movl STACK_B, B + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (A, %eax, 1), A + leal (B, %eax, 2), B +#endif + + fldz + fldz + fldz + fldz + + FLD -16 * SIZE + AOFFSET(A) + FLD -16 * SIZE + BOFFSET(B) + + prefetchw 1 * SIZE(%edi) + prefetchw 1 * SIZE(%edi, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L26 + ALIGN_3 + +.L25: + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -15 * SIZE + BOFFSET(B) + faddp %st, %st(2) + + FLD -15 * SIZE + AOFFSET(A) + FLD -14 * SIZE + BOFFSET(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -13 * SIZE + BOFFSET(B) + faddp %st, %st(4) + + FLD -14 * SIZE + AOFFSET(A) + FLD -12 * SIZE + BOFFSET(B) + + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -11 * SIZE + BOFFSET(B) + faddp %st, %st(2) + + FLD -13 * SIZE + AOFFSET(A) + FLD -10 * SIZE + BOFFSET(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -9 * SIZE + BOFFSET(B) + faddp %st, %st(4) + + FLD -12 * SIZE + AOFFSET(A) + FLD -8 * SIZE + BOFFSET(B) + + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -7 * SIZE + BOFFSET(B) + faddp %st, %st(2) + + FLD -11 * SIZE + AOFFSET(A) + FLD -6 * SIZE + BOFFSET(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -5 * SIZE + BOFFSET(B) + faddp %st, %st(4) + + FLD -10 * SIZE + AOFFSET(A) + FLD -4 * SIZE + BOFFSET(B) + + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -3 * SIZE + BOFFSET(B) + faddp %st, %st(2) + + FLD -9 * SIZE + AOFFSET(A) + FLD -2 * SIZE + BOFFSET(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -1 * SIZE + BOFFSET(B) + faddp %st, %st(4) + + FLD -8 * SIZE + AOFFSET(A) + FLD 0 * SIZE + BOFFSET(B) + + addl $ 8 * SIZE, A + subl $-16 * SIZE, B + + decl %eax + jne .L25 + ALIGN_4 + +.L26: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $7, %eax + je .L29 + ALIGN_4 + +.L27: + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -15 * SIZE + BOFFSET(B) + faddp %st, %st(2) + + FLD -15 * SIZE + AOFFSET(A) + FLD -14 * SIZE + BOFFSET(B) + + addl $1 * SIZE,A + addl $2 * SIZE,B + + decl %eax + jne .L27 + ALIGN_4 + +.L29: + ffreep %st(0) + ffreep %st(0) + + faddp %st, %st(2) + faddp %st, %st(2) + + FLD ALPHA + + fmul %st, %st(1) + fmulp %st, %st(2) + +#ifndef TRMMKERNEL + FADD (%edi) + FST (%edi) + FADD (%edi,LDC) + FST (%edi,LDC) +#else + FST (%edi) + FST (%edi,LDC) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (A, %eax, 1), A + leal (B, %eax, 2), B +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $1 * SIZE, %edi + decl I + jne .L24 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + addl %eax, C + movl B, STACK_B + ALIGN_4 + +.L30: + movl N, %eax + andl $1, %eax + je .L999 + ALIGN_3 + +.L31: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl STACK_A, A + movl STACK_B, B + movl C, %edi + + movl M, %eax + movl %eax, I + ALIGN_3 + +.L34: + movl STACK_B, B + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (A, %eax, 1), A + leal (B, %eax, 1), B +#endif + + fldz + fldz + fldz + fldz + + prefetchw 1 * SIZE(%edi) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L36 + ALIGN_3 + +.L35: + FLD -16 * SIZE + AOFFSET(A) + FMUL -16 * SIZE + BOFFSET(B) + faddp %st, %st(1) + + FLD -15 * SIZE + AOFFSET(A) + FMUL -15 * SIZE + BOFFSET(B) + faddp %st, %st(2) + + FLD -14 * SIZE + AOFFSET(A) + FMUL -14 * SIZE + BOFFSET(B) + faddp %st, %st(3) + + FLD -13 * SIZE + AOFFSET(A) + FMUL -13 * SIZE + BOFFSET(B) + faddp %st, %st(4) + + FLD -12 * SIZE + AOFFSET(A) + FMUL -12 * SIZE + BOFFSET(B) + faddp %st, %st(1) + + FLD -11 * SIZE + AOFFSET(A) + FMUL -11 * SIZE + BOFFSET(B) + faddp %st, %st(2) + + FLD -10 * SIZE + AOFFSET(A) + FMUL -10 * SIZE + BOFFSET(B) + faddp %st, %st(3) + + FLD -9 * SIZE + AOFFSET(A) + FMUL -9 * SIZE + BOFFSET(B) + faddp %st, %st(4) + + addl $8 * SIZE, A + addl $8 * SIZE, B + + decl %eax + jne .L35 + ALIGN_4 + +.L36: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $7, %eax + je .L39 + ALIGN_4 + +.L37: + FLD -16 * SIZE + AOFFSET(A) + FMUL -16 * SIZE + BOFFSET(B) + faddp %st, %st(1) + + addl $1 * SIZE,A + addl $1 * SIZE,B + decl %eax + jne .L37 + ALIGN_4 + +.L39: + faddp %st, %st(2) + faddp %st, %st(2) + faddp %st, %st(1) + + FMUL ALPHA + +#ifndef TRMMKERNEL + FADD (%edi) + FST (%edi) +#else + FST (%edi) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (A, %eax, 1), A + leal (B, %eax, 1), B +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $1 * SIZE, %edi + decl I + jne .L34 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $1, KK +#endif + + addl LDC, C + movl B, STACK_B + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_2x2.S b/kernel/x86/gemm_kernel_2x2.S new file mode 100644 index 0000000..1483bc4 --- /dev/null +++ b/kernel/x86/gemm_kernel_2x2.S @@ -0,0 +1,697 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define A 24 + STACK + ARGS(%esp) +#define B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) +#else +#define A 20 + STACK + ARGS(%esp) +#define B 24 + STACK + ARGS(%esp) +#define C 28 + STACK + ARGS(%esp) +#define LDC 32 + STACK + ARGS(%esp) +#define OFFSET 36 + STACK + ARGS(%esp) +#endif + +#define PREFETCH_OFFSET 48 + +#if defined(PENTIUM3) || defined(PENTIUMM) +#define REP rep +#else +#define REP rep +#endif + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(TRMMKERNEL) && !defined(LEFT) + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + + movl N, %eax # j = (n >> 1) # MEMORY + movl LDC, %ebp # ldc # MEMORY + movl B, %ebx + + sarl $1, %eax + leal (, %ebp, SIZE), %ebp + leal 0(%ecx) , %ecx # NOP + movl %eax, J # j = (n >> 1) # MEMORY + test %eax, %eax + je .L8 # if !(n >> 1) goto .L8 + ALIGN_4 + +.L34: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl %ebx, BX + + movl M, %esi # m # MEMORY + movl A, %edx # a # MEMORY + movl C, %edi # C # MEMORY + sarl $1, %esi # i = (m >> 1) + je .L12 + ALIGN_4 + +.MainHead: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl %ebx, %ecx +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (%edx, %eax, 2), %edx + leal (%ebx, %eax, 2), %ecx +#endif + +#ifdef HAVE_SSE + movl BX, %eax + + prefetcht2 0 * SIZE(%eax) + prefetcht2 4 * SIZE(%eax) + +#if L2_SIZE > 262144 + + subl $-8 * SIZE, BX + +#elif L2_SIZE > 131072 + + prefetcht2 8 * SIZE(%eax) + prefetcht2 12 * SIZE(%eax) + + + subl $-16 * SIZE, BX +#else + prefetcht2 16 * SIZE(%eax) + prefetcht2 20 * SIZE(%eax) + prefetcht2 24 * SIZE(%eax) + prefetcht2 28 * SIZE(%eax) + + subl $-32 * SIZE, BX +#endif +#endif + + fldz + fldz + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + fldz + fldz + + FLD 4 * SIZE(%ecx) # b5 + FLD 4 * SIZE(%edx) # a5 + FLD 0 * SIZE(%ecx) # b1 + FLD 0 * SIZE(%edx) # a1 + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(%edi) + prefetchw 2 * SIZE(%edi, %ebp, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(%edi) + prefetchnta 2 * SIZE(%edi, %ebp, 1) +#endif + sarl $2, %eax + je .L16 + ALIGN_4 + +.MainLoop: +#if defined(HAVE_3DNOW) + prefetch (PREFETCH_OFFSET) * SIZE(%ecx) + nop +#elif defined(HAVE_SSE) + prefetchnta (PREFETCH_OFFSET) * SIZE(%ecx) +#ifdef CORE_KATMAI + prefetcht0 (PREFETCH_OFFSET) * SIZE(%edx) +#endif +#endif + + fmul %st, %st(1) + FMUL 1 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(4) + FLD 0 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(5) + FLD 1 * SIZE(%edx) + fmul %st, %st(1) + FMUL 1 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(6) + FLD 2 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(7) + FLD 2 * SIZE(%edx) + + fmul %st, %st(1) + FMUL 3 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(4) + FLD 2 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(5) + FLD 3 * SIZE(%edx) + fmul %st, %st(1) + FMUL 3 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(6) + FLD 8 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(7) + FLD 8 * SIZE(%edx) + fxch %st(2) + +#if !defined(HAVE_3DNOW) && defined(HAVE_SSE) && defined(DOUBLE) + prefetchnta (PREFETCH_OFFSET + 4) * SIZE(%ecx) +#ifdef CORE_KATMAI + prefetcht0 (PREFETCH_OFFSET + 4) * SIZE(%edx) +#endif +#endif + + fmul %st, %st(3) + FMUL 5 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(4) + FLD 4 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(5) + FLD 5 * SIZE(%edx) + fmul %st, %st(3) + FMUL 5 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(6) + FLD 6 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(7) + FLD 6 * SIZE(%edx) + + fmul %st, %st(3) + FMUL 7 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(4) + FLD 6 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(5) + FLD 7 * SIZE(%edx) + fmul %st, %st(3) + FMUL 7 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(6) + FLD 12 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(7) + FLD 12 * SIZE(%edx) + fxch %st(2) + + subl $-8 * SIZE, %ecx + subl $-8 * SIZE, %edx + decl %eax # l -- + jne .MainLoop + ALIGN_4 + +.L16: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $3, %eax + je .L21 + ALIGN_4 + +.SubLoop: + fmul %st, %st(1) + FMUL 1 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(4) + FLD 0 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(5) + FLD 1 * SIZE(%edx) + fmul %st, %st(1) + FMUL 1 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(6) + FLD 2 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(7) + FLD 2 * SIZE(%edx) + + addl $2 * SIZE,%ecx + addl $2 * SIZE,%edx + decl %eax + jne .SubLoop + ALIGN_4 + +.L21: + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + + FLD ALPHA + fmul %st, %st(4) + fmul %st, %st(1) + fmul %st, %st(2) + fmulp %st, %st(3) + +#ifndef TRMMKERNEL + FADD 0 * SIZE(%edi) + FST 0 * SIZE(%edi) + FADD 0 * SIZE(%edi,%ebp) + FST 0 * SIZE(%edi,%ebp) + FADD 1 * SIZE(%edi) + FST 1 * SIZE(%edi) + FADD 1 * SIZE(%edi,%ebp) + FST 1 * SIZE(%edi,%ebp) +#else + FST 0 * SIZE(%edi) + FST 0 * SIZE(%edi,%ebp) + FST 1 * SIZE(%edi) + FST 1 * SIZE(%edi,%ebp) +#endif + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (%edx, %eax, 2), %edx + leal (%ecx, %eax, 2), %ecx +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %edi + rep + decl %esi # i -- + rep + jne .MainHead + ALIGN_4 + +.L12: + movl M, %eax # m # MEMORY + andl $1, %eax + je .L27 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl %ebx, %ecx +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (%edx, %eax, 1), %edx + leal (%ebx, %eax, 2), %ecx +#endif + fldz + fldz + + FLD 0 * SIZE(%edx) # temp1 = *(aoffset + 0) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $1,%eax # k >> 1 # MEMORY + je .L54 + ALIGN_4 + +.L55: + FLD 0 * SIZE(%ecx) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(%ecx) # temp2 = *(boffset + 0) + faddp %st, %st(2) + FLD 1 * SIZE(%edx) # temp1 = *(aoffset + 0) + + FLD 2 * SIZE(%ecx) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 3 * SIZE(%ecx) # temp2 = *(boffset + 0) + faddp %st, %st(2) + FLD 2 * SIZE(%edx) # temp1 = *(aoffset + 0) + + addl $2 * SIZE, %edx + addl $4 * SIZE, %ecx + decl %eax + jne .L55 + ALIGN_4 + +.L54: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $1,%eax # k & 1 + je .L33 + ALIGN_4 + + FLD 0 * SIZE(%ecx) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(%ecx) # temp2 = *(boffset + 0) + faddp %st, %st(2) + FLD 1 * SIZE(%edx) # temp1 = *(aoffset + 0) + + addl $1 * SIZE, %edx + addl $2 * SIZE, %ecx + ALIGN_4 + +.L33: + ffreep %st(0) + FLD ALPHA + + fmul %st, %st(2) + fmulp %st, %st(1) + +#ifndef TRMMKERNEL + FADD (%edi) + FST (%edi) + FADD (%edi,%ebp) + FST (%edi,%ebp) +#else + FST (%edi) + FST (%edi,%ebp) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (%edx, %eax, 1), %edx + leal (%ecx, %eax, 2), %ecx +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L27: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + lea (, %ebp, 2), %eax + addl %eax, C # C + 2 * ldc # MEMORY + movl %ecx, %ebx # b # MEMORY + decl J # j-- # MEMORY + jne .L34 + ALIGN_4 + +.L8: + movl N, %eax # n # MEMORY + andl $1, %eax + je .End + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, %edi # c # MEMORY + movl A, %edx # a # MEMORY + + movl M, %esi # m # MEMORY + sarl $1, %esi # m >> 1 + je .L36 + ALIGN_4 + +.L46: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl %ebx, %ecx +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (%edx, %eax, 2), %edx + leal (%ebx, %eax, 1), %ecx +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + fldz + sarl $1, %eax + fldz + FLD 0 * SIZE(%ecx) # temp1 = *(boffset + 0) + + je .L56 + ALIGN_4 + +.L57: + FLD 0 * SIZE(%edx) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(%edx) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 1 * SIZE(%ecx) # temp1 = *(boffset + 0) + + FLD 2 * SIZE(%edx) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 3 * SIZE(%edx) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 2 * SIZE(%ecx) # temp1 = *(boffset + 0) + + addl $4 * SIZE,%edx + addl $2 * SIZE,%ecx + dec %eax + jne .L57 + ALIGN_4 + +.L56: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $1, %eax + je .L45 + ALIGN_4 + + FLD 0 * SIZE(%edx) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(%edx) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 3 * SIZE(%ecx) # temp1 = *(boffset + 0) + + addl $2 * SIZE,%edx + addl $1 * SIZE,%ecx + ALIGN_4 + +.L45: + ffreep %st(0) + FLD ALPHA + + fmul %st, %st(1) + fmulp %st, %st(2) + +#ifndef TRMMKERNEL + FADD 0 * SIZE(%edi) + FST 0 * SIZE(%edi) + FADD 1 * SIZE(%edi) + FST 1 * SIZE(%edi) +#else + FST 0 * SIZE(%edi) + FST 1 * SIZE(%edi) +#endif + + addl $2 * SIZE, %edi + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (%edx, %eax, 2), %edx + leal (%ecx, %eax, 1), %ecx +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + decl %esi # i -- + jne .L46 + ALIGN_4 + +.L36: + movl M, %eax # m # MEMORY + andl $1, %eax # m & 1 + je .End + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl %ebx, %ecx +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (%edx, %eax, 1), %edx + leal (%ebx, %eax, 1), %ecx +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + fldz + ALIGN_3 + +.L51: + FLD (%edx) + FMUL (%ecx) + addl $1 * SIZE,%edx + addl $1 * SIZE,%ecx + faddp %st,%st(1) + decl %eax + jne .L51 + + FMUL ALPHA +#ifndef TRMMKERNEL + FADD (%edi) + FST (%edi) +#else + FST (%edi) +#endif + ALIGN_4 + +.End: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_2x2_atom.S b/kernel/x86/gemm_kernel_2x2_atom.S new file mode 100644 index 0000000..f895412 --- /dev/null +++ b/kernel/x86/gemm_kernel_2x2_atom.S @@ -0,0 +1,736 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 84 + +#define AA %edx +#define BB %ecx +#define CO1 %esi +#define LDC %ebp +#define B %edi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax +#ifndef LEFT + negl %eax +#endif + movl %eax, KK +#endif + + leal (, LDC, SIZE), LDC + + movl N, %eax + sarl $1, %eax + movl %eax, J + jle .L30 + ALIGN_2 + +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sall $BASE_SHIFT + 1, %eax + leal (B, %eax), %eax + movl %eax, BX + + movl C, CO1 # coffset = c + leal (, LDC, 2), %eax + addl %eax, C + + movl A, AA # aoffset = a + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + + movl BX, %eax + prefetcht0 0 * SIZE(%eax) + subl $-8 * SIZE, BX + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + xorps %xmm4, %xmm4 + prefetcht0 3 * SIZE(CO1) + xorps %xmm5, %xmm5 + prefetcht0 3 * SIZE(CO1, LDC) + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 0 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 1 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 3 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 3 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 4 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 2 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 3 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 5 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 5 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 6 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 4 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 5 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 7 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 7 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 8 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 6 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 7 * SIZE(BB), %xmm3 + + addl $8 * SIZE, BB + addl $8 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 0 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 1 * SIZE(BB), %xmm3 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + movsd ALPHA, %xmm0 + + addsd %xmm2, %xmm6 + addsd %xmm3, %xmm7 + + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + mulsd %xmm0, %xmm6 + mulsd %xmm0, %xmm7 + +#ifndef TRMMKERNEL + addsd 0 * SIZE(CO1), %xmm4 + addsd 1 * SIZE(CO1), %xmm6 + + addsd 0 * SIZE(CO1, LDC), %xmm5 + addsd 1 * SIZE(CO1, LDC), %xmm7 +#endif + + movsd %xmm4, 0 * SIZE(CO1) + movsd %xmm6, 1 * SIZE(CO1) + movsd %xmm5, 0 * SIZE(CO1, LDC) + movsd %xmm7, 1 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, CO1 + decl %ebx + jg .L11 + ALIGN_4 + +.L20: + movl M, %ebx + testl $1, %ebx + jle .L29 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L25 + ALIGN_4 + +.L22: + addsd %xmm2, %xmm4 + movsd 0 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 1 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 2 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 3 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 2 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 5 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 3 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 6 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 7 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + addsd %xmm2, %xmm4 + movsd 0 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 1 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + movsd ALPHA, %xmm0 + + addsd %xmm2, %xmm4 + addsd %xmm3, %xmm5 + + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + +#ifndef TRMMKERNEL + addsd 0 * SIZE(CO1), %xmm4 + addsd 0 * SIZE(CO1, LDC), %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(CO1) + movsd %xmm5, 0 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $1 * SIZE, CO1 + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movl BB, B + decl J + jg .L10 + ALIGN_4 + +.L30: + testl $1, N + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, CO1 + addl LDC, C + + movl A, AA + + movl M, %ebx + sarl $1, %ebx + jle .L40 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + + movsd 0 * SIZE(BB), %xmm1 + xorps %xmm0, %xmm0 + prefetcht0 3 * SIZE(CO1) + xorps %xmm2, %xmm2 + xorps %xmm4, %xmm4 + xorps %xmm6, %xmm6 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addsd %xmm0, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 3 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 2 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 4 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 5 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 3 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 6 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 7 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 4 * SIZE(BB), %xmm1 + + addl $8 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + addsd %xmm0, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 1 * SIZE(BB), %xmm1 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + movsd ALPHA, %xmm3 + + addsd %xmm0, %xmm4 + addsd %xmm2, %xmm6 + + mulsd %xmm3, %xmm4 + mulsd %xmm3, %xmm6 + +#ifndef TRMMKERNEL + addsd 0 * SIZE(CO1), %xmm4 + addsd 1 * SIZE(CO1), %xmm6 +#endif + + movsd %xmm4, 0 * SIZE(CO1) + movsd %xmm6, 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 1), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, CO1 + decl %ebx + jg .L31 + ALIGN_4 + +.L40: + movl M, %ebx + testl $1, %ebx + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movsd 0 * SIZE(BB), %xmm2 + xorps %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L45 + ALIGN_4 + +.L42: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm5 + movsd 2 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 3 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 3 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm5 + movsd 4 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + movsd ALPHA, %xmm0 + + addsd %xmm5, %xmm4 + mulsd %xmm0, %xmm4 + +#ifndef TRMMKERNEL + addsd 0 * SIZE(CO1), %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_2x4_3dnow.S b/kernel/x86/gemm_kernel_2x4_3dnow.S new file mode 100644 index 0000000..a86efda --- /dev/null +++ b/kernel/x86/gemm_kernel_2x4_3dnow.S @@ -0,0 +1,1917 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define OLD_M 4 + STACK + ARGS(%esi) +#define OLD_N 8 + STACK + ARGS(%esi) +#define OLD_K 12 + STACK + ARGS(%esi) +#define OLD_ALPHA 16 + STACK + ARGS(%esi) +#define OLD_A 20 + STACK + ARGS(%esi) +#define OLD_B 24 + STACK + ARGS(%esi) +#define OLD_C 28 + STACK + ARGS(%esi) +#define OLD_LDC 32 + STACK + ARGS(%esi) +#define OLD_OFFSET 36 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 8(%esp) +#define N 12(%esp) +#define M 16(%esp) +#define A 20(%esp) +#define C 24(%esp) +#define J 28(%esp) +#define OLD_STACK 32(%esp) +#define OFFSET 36(%esp) +#define KK 40(%esp) +#define KKK 44(%esp) +#define BUFFER 64(%esp) + +#define AA %edx +#define BB %ecx + +#define PREFETCHSIZE (16 * 2 + 6) + +#define AOFFSET -32 +#define BOFFSET 128 + +/* + + A hint of scheduling is received from following URL + +https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=flat&viewmonth=200309&viewday=11 + +*/ + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + subl $128 + LOCAL_BUFFER_SIZE, %esp + movl OLD_M, %ebx + andl $-1024, %esp # align stack + + STACK_TOUCHING + + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + movd OLD_ALPHA, %mm3 + + movl %ebx, M + movl %eax, N + movl %ecx, K + subl $AOFFSET * SIZE, %edx + movl %edx, A + movl %esi, OLD_STACK + + movl OLD_B, %edi + movl OLD_C, %ebx + punpckldq %mm3, %mm3 + + movq %mm3, ALPHA + + movl %ebx, C + movl OLD_LDC, %ebp + leal (, %ebp, SIZE), %ebp + +#ifdef TRMMKERNEL + movl OLD_OFFSET, %eax + movl %eax, OFFSET +#ifndef LEFT + negl %eax + movl %eax, KK +#endif +#endif + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_3 + +.L01: +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sarl $2, %eax + jle .L03 + ALIGN_3 + +.L02: + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + movd 4 * SIZE(%edi), %mm4 + movd 5 * SIZE(%edi), %mm5 + movd 6 * SIZE(%edi), %mm6 + movd 7 * SIZE(%edi), %mm7 + + prefetchnta 72 * SIZE(%edi) + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + punpckldq %mm4, %mm4 + punpckldq %mm5, %mm5 + punpckldq %mm6, %mm6 + punpckldq %mm7, %mm7 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + movq %mm2, 4 * SIZE(%ecx) + movq %mm3, 6 * SIZE(%ecx) + movq %mm4, 8 * SIZE(%ecx) + movq %mm5, 10 * SIZE(%ecx) + movq %mm6, 12 * SIZE(%ecx) + movq %mm7, 14 * SIZE(%ecx) + + movd 8 * SIZE(%edi), %mm0 + movd 9 * SIZE(%edi), %mm1 + movd 10 * SIZE(%edi), %mm2 + movd 11 * SIZE(%edi), %mm3 + movd 12 * SIZE(%edi), %mm4 + movd 13 * SIZE(%edi), %mm5 + movd 14 * SIZE(%edi), %mm6 + movd 15 * SIZE(%edi), %mm7 + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + punpckldq %mm4, %mm4 + punpckldq %mm5, %mm5 + punpckldq %mm6, %mm6 + punpckldq %mm7, %mm7 + + movq %mm0, 16 * SIZE(%ecx) + movq %mm1, 18 * SIZE(%ecx) + movq %mm2, 20 * SIZE(%ecx) + movq %mm3, 22 * SIZE(%ecx) + movq %mm4, 24 * SIZE(%ecx) + movq %mm5, 26 * SIZE(%ecx) + movq %mm6, 28 * SIZE(%ecx) + movq %mm7, 30 * SIZE(%ecx) + + addl $16 * SIZE, %edi + addl $32 * SIZE, %ecx + decl %eax + jne .L02 + +.L03: + movl K, %eax + andl $3, %eax + BRANCH + jle .L10 + ALIGN_2 + +.L04: + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + movq %mm2, 4 * SIZE(%ecx) + movq %mm3, 6 * SIZE(%ecx) + addl $4 * SIZE, %edi + addl $8 * SIZE, %ecx + decl %eax + jne .L04 + ALIGN_4 + +.L10: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: + leal - BOFFSET * SIZE + BUFFER, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + + movq ( 0 + AOFFSET) * SIZE(AA), %mm0 + pxor %mm4, %mm4 + movq ( 16 + AOFFSET) * SIZE(AA), %mm1 + pxor %mm5, %mm5 + PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2 + pxor %mm6, %mm6 + PADDING movq ( 16 + BOFFSET) * SIZE(BB), %mm3 + pxor %mm7, %mm7 + + leal (%ebp, %ebp, 2), %eax + + prefetchw 2 * SIZE(%esi) + prefetchw 2 * SIZE(%esi, %ebp) + prefetchw 2 * SIZE(%esi, %ebp, 2) + prefetchw 2 * SIZE(%esi, %eax) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + je .L15 + ALIGN_4 + +.L12: + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 2 + BOFFSET) * SIZE(BB), %mm2 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA) + + PADDING movq ( 8 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 2 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 10 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movq ( 12 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 32 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 4 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 18 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm5 + PADDING movq ( 20 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 24 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 22 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 6 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 26 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm5 + PADDING movq ( 28 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 48 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 30 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 8 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 34 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movq ( 36 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 40 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 38 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 10 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 42 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movq ( 44 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 64 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 46 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 12 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 50 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm5 + PADDING movq ( 52 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 56 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 54 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 14 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 58 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm5 + PADDING movq ( 60 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 80 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 62 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 32 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 66 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm5 + PADDING movq ( 68 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 72 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 70 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 18 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 74 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm5 + PADDING movq ( 76 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 96 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 78 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 20 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 82 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm5 + PADDING movq ( 84 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 88 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 86 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 22 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 90 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm5 + PADDING movq ( 92 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movq (112 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 94 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 24 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 98 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm5 + PADDING movq (100 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movq (104 + BOFFSET) * SIZE(BB), %mm2 + pfmul (102 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 26 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movq (106 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm5 + PADDING movq (108 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movq (128 + BOFFSET) * SIZE(BB), %mm2 + pfmul (110 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 28 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movq (114 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm5 + PADDING movq (116 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movq (120 + BOFFSET) * SIZE(BB), %mm3 + pfmul (118 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 30 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movq (122 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm5 + PADDING movq (124 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movq (144 + BOFFSET) * SIZE(BB), %mm3 + pfmul (126 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 48 + AOFFSET) * SIZE(AA), %mm1 + + subl $-32 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L12 + ALIGN_3 + +.L15: + movq ALPHA, %mm3 +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $15, %eax + BRANCH + je .L18 + ALIGN_3 + +.L16: + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 2 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 8 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 2 + AOFFSET) * SIZE(AA), %mm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L16 + ALIGN_3 + +.L18: + leal (%ebp, %ebp, 2), %eax + +#ifndef TRMMKERNEL + pfmul %mm3, %mm4 + pfadd 0 * SIZE(%esi), %mm4 + pfmul %mm3, %mm5 + pfadd 0 * SIZE(%esi, %ebp, 1), %mm5 + pfmul %mm3, %mm6 + pfadd 0 * SIZE(%esi, %ebp, 2), %mm6 + pfmul %mm3, %mm7 + pfadd 0 * SIZE(%esi, %eax, 1), %mm7 +#else + pfmul %mm3, %mm4 + pfmul %mm3, %mm5 + pfmul %mm3, %mm6 + pfmul %mm3, %mm7 +#endif + + movq %mm4, 0 * SIZE(%esi) + movq %mm5, 0 * SIZE(%esi, %ebp, 1) + movq %mm6, 0 * SIZE(%esi, %ebp, 2) + movq %mm7, 0 * SIZE(%esi, %eax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L29 + ALIGN_4 + +.L21: + leal - BOFFSET * SIZE + BUFFER, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + + movq ( 0 + AOFFSET) * SIZE(AA), %mm0 + pxor %mm4, %mm4 + movq ( 8 + AOFFSET) * SIZE(AA), %mm1 + pxor %mm5, %mm5 + PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2 + pxor %mm6, %mm6 + PADDING movq ( 16 + BOFFSET) * SIZE(BB), %mm3 + pxor %mm7, %mm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + je .L25 + ALIGN_4 + +.L22: + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movd ( 2 + BOFFSET) * SIZE(BB), %mm2 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movd ( 4 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA) + + PADDING movd ( 8 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 1 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movd ( 10 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movd ( 12 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movd ( 32 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 2 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movd ( 18 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm5 + PADDING movd ( 20 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movd ( 24 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 22 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 3 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movd ( 26 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm5 + PADDING movd ( 28 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movd ( 48 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 30 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 4 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movd ( 34 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movd ( 36 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movd ( 40 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 38 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 5 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movd ( 42 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movd ( 44 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movd ( 64 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 46 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 6 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movd ( 50 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm5 + PADDING movd ( 52 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movd ( 56 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 54 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 7 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movd ( 58 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm5 + PADDING movd ( 60 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movd ( 80 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 62 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 16 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movd ( 66 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm5 + PADDING movd ( 68 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movd ( 72 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 70 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movd ( 9 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movd ( 74 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm5 + PADDING movd ( 76 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movd ( 96 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 78 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movd ( 10 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movd ( 82 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm5 + PADDING movd ( 84 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movd ( 88 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 86 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movd ( 11 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movd ( 90 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm5 + PADDING movd ( 92 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movd (112 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 94 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movd ( 12 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movd ( 98 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm5 + PADDING movd (100 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movd (104 + BOFFSET) * SIZE(BB), %mm2 + pfmul (102 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movd ( 13 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movd (106 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm5 + PADDING movd (108 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movd (128 + BOFFSET) * SIZE(BB), %mm2 + pfmul (110 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movd ( 14 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movd (114 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm5 + PADDING movd (116 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movd (120 + BOFFSET) * SIZE(BB), %mm3 + pfmul (118 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movd ( 15 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movd (122 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm5 + PADDING movd (124 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movd (144 + BOFFSET) * SIZE(BB), %mm3 + pfmul (126 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movd ( 24 + AOFFSET) * SIZE(AA), %mm1 + + subl $-16 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L22 + ALIGN_3 + +.L25: + movd ALPHA, %mm3 +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $15, %eax + BRANCH + je .L28 + ALIGN_3 + +.L26: + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movd ( 2 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movd ( 4 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movd ( 8 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 1 + AOFFSET) * SIZE(AA), %mm0 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L26 + ALIGN_3 + +.L28: + leal (%ebp, %ebp, 2), %eax + + pfmul %mm3, %mm4 + pfmul %mm3, %mm5 + pfmul %mm3, %mm6 + pfmul %mm3, %mm7 + +#ifndef TRMMKERNEL + movd 0 * SIZE(%esi) , %mm0 + movd 0 * SIZE(%esi, %ebp, 1), %mm1 + movd 0 * SIZE(%esi, %ebp, 2), %mm2 + movd 0 * SIZE(%esi, %eax, 1), %mm3 + + pfadd %mm0, %mm4 + pfadd %mm1, %mm5 + pfadd %mm2, %mm6 + pfadd %mm3, %mm7 +#endif + + movd %mm4, 0 * SIZE(%esi) + movd %mm5, 0 * SIZE(%esi, %ebp, 1) + movd %mm6, 0 * SIZE(%esi, %ebp, 2) + movd %mm7, 0 * SIZE(%esi, %eax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leal (, %ebp, 4), %eax + addl %eax, C # c += 4 * ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L30: + movl N, %eax + testl $2, %eax + jle .L60 + ALIGN_3 + +.L31: +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sarl $2, %eax + jle .L33 + ALIGN_3 + +.L32: + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + movd 4 * SIZE(%edi), %mm4 + movd 5 * SIZE(%edi), %mm5 + movd 6 * SIZE(%edi), %mm6 + movd 7 * SIZE(%edi), %mm7 + + prefetchnta 72 * SIZE(%edi) + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + punpckldq %mm4, %mm4 + punpckldq %mm5, %mm5 + punpckldq %mm6, %mm6 + punpckldq %mm7, %mm7 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + movq %mm2, 4 * SIZE(%ecx) + movq %mm3, 6 * SIZE(%ecx) + movq %mm4, 8 * SIZE(%ecx) + movq %mm5, 10 * SIZE(%ecx) + movq %mm6, 12 * SIZE(%ecx) + movq %mm7, 14 * SIZE(%ecx) + + addl $ 8 * SIZE, %edi + addl $16 * SIZE, %ecx + decl %eax + jne .L32 + +.L33: + movl K, %eax + andl $3, %eax + BRANCH + jle .L40 + ALIGN_2 + +.L34: + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + + addl $2 * SIZE, %edi + addl $4 * SIZE, %ecx + decl %eax + jne .L34 + ALIGN_4 + +.L40: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L41: + leal - BOFFSET * SIZE + BUFFER, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + movq ( 0 + AOFFSET) * SIZE(AA), %mm0 + pxor %mm4, %mm4 + movq ( 16 + AOFFSET) * SIZE(AA), %mm1 + pxor %mm5, %mm5 + PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2 + pxor %mm6, %mm6 + PADDING movq ( 16 + BOFFSET) * SIZE(BB), %mm3 + pxor %mm7, %mm7 + + prefetchw 2 * SIZE(%esi) + prefetchw 2 * SIZE(%esi, %ebp) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + je .L45 + ALIGN_4 + +.L42: + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movq ( 2 + AOFFSET) * SIZE(AA), %mm0 + + PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA) + + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 8 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 4 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 12 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 10 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movq ( 6 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 32 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 8 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 20 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 18 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movq ( 10 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 24 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 22 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 12 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 28 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 26 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movq ( 14 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 48 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 30 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 32 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 36 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 34 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movq ( 18 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 40 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 38 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 20 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 44 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 42 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movq ( 22 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 64 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 46 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 24 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 52 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 50 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movq ( 26 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 56 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 54 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 28 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 60 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 58 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movq ( 30 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 80 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 62 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 48 + AOFFSET) * SIZE(AA), %mm1 + + subl $-32 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L42 + ALIGN_3 + +.L45: + movq ALPHA, %mm3 +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $15, %eax + BRANCH + je .L48 + ALIGN_3 + +.L46: + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2 + + pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movq ( 2 + AOFFSET) * SIZE(AA), %mm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L46 + ALIGN_3 + +.L48: + pfadd %mm6, %mm4 + pfadd %mm7, %mm5 + + pfmul %mm3, %mm4 + pfmul %mm3, %mm5 + +#ifndef TRMMKERNEL + pfadd 0 * SIZE(%esi), %mm4 + pfadd 0 * SIZE(%esi, %ebp, 1), %mm5 +#endif + + movq %mm4, 0 * SIZE(%esi) + movq %mm5, 0 * SIZE(%esi, %ebp, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L41 + ALIGN_4 + +.L50: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L59 + ALIGN_4 + +.L51: + leal - BOFFSET * SIZE + BUFFER, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + movq ( 0 + AOFFSET) * SIZE(AA), %mm0 + pxor %mm4, %mm4 + movq ( 8 + AOFFSET) * SIZE(AA), %mm1 + pxor %mm5, %mm5 + PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2 + pxor %mm6, %mm6 + PADDING movq ( 16 + BOFFSET) * SIZE(BB), %mm3 + pxor %mm7, %mm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + je .L55 + ALIGN_4 + +.L52: + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movd ( 4 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movd ( 1 + AOFFSET) * SIZE(AA), %mm0 + + PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA) + + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movd ( 8 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 2 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movd ( 12 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 10 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movd ( 3 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movd ( 32 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 4 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movd ( 20 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 18 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movd ( 5 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movd ( 24 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 22 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 6 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movd ( 28 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 26 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movd ( 7 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movd ( 48 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 30 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 16 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movd ( 36 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 34 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movd ( 9 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movd ( 40 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 38 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movd ( 10 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movd ( 44 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 42 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movd ( 11 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movd ( 64 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 46 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movd ( 12 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movd ( 52 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 50 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movd ( 13 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movd ( 56 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 54 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movd ( 14 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movd ( 60 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 58 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movd ( 15 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movd ( 80 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 62 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movd ( 24 + AOFFSET) * SIZE(AA), %mm1 + + subl $-16 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L52 + ALIGN_3 + +.L55: + movd ALPHA, %mm3 +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $15, %eax + BRANCH + je .L58 + ALIGN_3 + +.L56: + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movd ( 4 + BOFFSET) * SIZE(BB), %mm2 + + pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movd ( 1 + AOFFSET) * SIZE(AA), %mm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L56 + ALIGN_3 + +.L58: + pfadd %mm6, %mm4 + pfadd %mm7, %mm5 + + pfmul %mm3, %mm4 + pfmul %mm3, %mm5 + +#ifndef TRMMKERNEL + movd 0 * SIZE(%esi) , %mm0 + movd 0 * SIZE(%esi, %ebp, 1), %mm1 + + pfadd %mm0, %mm4 + pfadd %mm1, %mm5 +#endif + + movd %mm4, 0 * SIZE(%esi) + movd %mm5, 0 * SIZE(%esi, %ebp, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L59: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, %ebp, 2), %eax + addl %eax, C # c += 4 * ldc + ALIGN_4 + +.L60: + movl N, %eax + testl $1, %eax + jle .L999 + ALIGN_3 + +.L61: +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sarl $3, %eax + jle .L63 + ALIGN_3 + +.L62: + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + movd 4 * SIZE(%edi), %mm4 + movd 5 * SIZE(%edi), %mm5 + movd 6 * SIZE(%edi), %mm6 + movd 7 * SIZE(%edi), %mm7 + + prefetchnta 72 * SIZE(%edi) + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + punpckldq %mm4, %mm4 + punpckldq %mm5, %mm5 + punpckldq %mm6, %mm6 + punpckldq %mm7, %mm7 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + movq %mm2, 4 * SIZE(%ecx) + movq %mm3, 6 * SIZE(%ecx) + movq %mm4, 8 * SIZE(%ecx) + movq %mm5, 10 * SIZE(%ecx) + movq %mm6, 12 * SIZE(%ecx) + movq %mm7, 14 * SIZE(%ecx) + + addl $ 8 * SIZE, %edi + addl $16 * SIZE, %ecx + decl %eax + jne .L62 + +.L63: + movl K, %eax + andl $7, %eax + BRANCH + jle .L70 + ALIGN_2 + +.L64: + movd 0 * SIZE(%edi), %mm0 + punpckldq %mm0, %mm0 + movq %mm0, 0 * SIZE(%ecx) + + addl $1 * SIZE, %edi + addl $2 * SIZE, %ecx + decl %eax + jne .L64 + ALIGN_4 + +.L70: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L90 + ALIGN_4 + +.L71: + leal - BOFFSET * SIZE + BUFFER, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + movq ( 0 + AOFFSET) * SIZE(AA), %mm0 + pxor %mm4, %mm4 + movq ( 16 + AOFFSET) * SIZE(AA), %mm1 + pxor %mm5, %mm5 + PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2 + pxor %mm6, %mm6 + pxor %mm7, %mm7 + + prefetchw 2 * SIZE(%esi) + prefetchw 2 * SIZE(%esi, %ebp) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + je .L75 + ALIGN_4 + +.L72: + pfmul ( 0 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm4 + movq ( 2 + AOFFSET) * SIZE(AA), %mm0 + + PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA) + + pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movq ( 4 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 4 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm6 + movq ( 6 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 8 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 8 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm4 + movq ( 10 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 10 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movq ( 12 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 12 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm6 + movq ( 14 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 32 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 16 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm4 + movq ( 18 + AOFFSET) * SIZE(AA), %mm1 + + pfmul ( 18 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movq ( 20 + AOFFSET) * SIZE(AA), %mm1 + + pfmul ( 20 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm6 + movq ( 22 + AOFFSET) * SIZE(AA), %mm1 + + pfmul ( 22 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 24 + AOFFSET) * SIZE(AA), %mm1 + + pfmul ( 24 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm4 + movq ( 26 + AOFFSET) * SIZE(AA), %mm1 + + pfmul ( 26 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movq ( 28 + AOFFSET) * SIZE(AA), %mm1 + + pfmul ( 28 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm6 + movq ( 30 + AOFFSET) * SIZE(AA), %mm1 + + pfmul ( 30 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 48 + AOFFSET) * SIZE(AA), %mm1 + + subl $-32 * SIZE, AA + addl $ 32 * SIZE, BB + decl %eax + jne .L72 + ALIGN_3 + +.L75: + movq ALPHA, %mm3 +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $15, %eax + BRANCH + je .L78 + ALIGN_3 + +.L76: + pfmul ( 0 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm4 + movq ( 2 + AOFFSET) * SIZE(AA), %mm0 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L76 + ALIGN_3 + +.L78: + pfadd %mm5, %mm4 + pfadd %mm7, %mm6 + pfadd %mm6, %mm4 + + pfmul %mm3, %mm4 +#ifndef TRMMKERNEL + pfadd 0 * SIZE(%esi), %mm4 +#endif + movq %mm4, 0 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L71 + ALIGN_4 + +.L90: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L999 + ALIGN_4 + +.L91: + leal - BOFFSET * SIZE + BUFFER, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + + movq ( 0 + AOFFSET) * SIZE(AA), %mm0 + pxor %mm4, %mm4 + movq ( 8 + AOFFSET) * SIZE(AA), %mm1 + pxor %mm5, %mm5 + PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2 + pxor %mm6, %mm6 + pxor %mm7, %mm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + je .L95 + ALIGN_4 + +.L92: + PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA) + + pfmul ( 0 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm4 + movd ( 1 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movd ( 2 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 4 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm6 + movd ( 3 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 4 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 8 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm4 + movd ( 5 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 10 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movd ( 6 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 12 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm6 + movd ( 7 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 16 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 16 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm4 + movd ( 9 + AOFFSET) * SIZE(AA), %mm1 + + pfmul ( 18 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movd ( 10 + AOFFSET) * SIZE(AA), %mm1 + + pfmul ( 20 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm6 + movd ( 11 + AOFFSET) * SIZE(AA), %mm1 + + pfmul ( 22 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movd ( 12 + AOFFSET) * SIZE(AA), %mm1 + + pfmul ( 24 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm4 + movd ( 13 + AOFFSET) * SIZE(AA), %mm1 + + pfmul ( 26 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movd ( 14 + AOFFSET) * SIZE(AA), %mm1 + + pfmul ( 28 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm6 + movd ( 15 + AOFFSET) * SIZE(AA), %mm1 + + pfmul ( 30 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movd ( 24 + AOFFSET) * SIZE(AA), %mm1 + + subl $-16 * SIZE, AA + addl $ 32 * SIZE, BB + decl %eax + jne .L92 + ALIGN_3 + +.L95: + movd ALPHA, %mm3 +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $15, %eax + BRANCH + je .L98 + ALIGN_3 + +.L96: + pfmul ( 0 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm4 + movd ( 1 + AOFFSET) * SIZE(AA), %mm0 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L96 + ALIGN_3 + +.L98: +#ifndef TRMMKERNEL + movd 0 * SIZE(%esi), %mm0 +#endif + + pfadd %mm5, %mm4 + pfadd %mm7, %mm6 + pfadd %mm6, %mm4 + + pfmul %mm3, %mm4 + pfmul %mm3, %mm5 + +#ifndef TRMMKERNEL + pfadd %mm0, %mm4 +#endif + movd %mm4, 0 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L999: + EMMS + + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_2x4_barcelona.S b/kernel/x86/gemm_kernel_2x4_barcelona.S new file mode 100644 index 0000000..1acdc16 --- /dev/null +++ b/kernel/x86/gemm_kernel_2x4_barcelona.S @@ -0,0 +1,1268 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define OLD_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define OLD_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define B %edi +#define LDC %ebp +#define AO %edx +#define BO %ecx +#define CO %esi +#define I %ebx + +#define movsd movlps +#define movapd movups +#define movlpd movlps +#define movhpd movhps + +#define PREFETCH prefetch +#define PREFETCHSIZE (8 * 7 + 0) + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm1; \ + mulpd -14 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm1, %xmm4; \ + movapd -12 * SIZE(BO, %eax, 4), %xmm1; \ + addpd %xmm0, %xmm5; \ + movddup -15 * SIZE(AO, %eax, 2), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd -14 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm0, %xmm7; \ + movddup -14 * SIZE(AO, %eax, 2), %xmm0 + +#define KERNEL2(address) \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm0, %xmm1; \ + mulpd -10 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm1, %xmm4; \ + movapd -8 * SIZE(BO, %eax, 4), %xmm1; \ + addpd %xmm0, %xmm5; \ + movddup -13 * SIZE(AO, %eax, 2), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd -10 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm0, %xmm7; \ + movddup -12 * SIZE(AO, %eax, 2), %xmm0 + +#define KERNEL3(address) \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm0, %xmm1; \ + mulpd -6 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm1, %xmm4; \ + movapd -4 * SIZE(BO, %eax, 4), %xmm1; \ + addpd %xmm0, %xmm5; \ + movddup -11 * SIZE(AO, %eax, 2), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd -6 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm0, %xmm7; \ + movddup -10 * SIZE(AO, %eax, 2), %xmm0 + +#define KERNEL4(address) \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm0, %xmm1; \ + mulpd -2 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm1, %xmm4; \ + movapd (BO, %eax, 4), %xmm1; \ + addpd %xmm0, %xmm5; \ + movddup -9 * SIZE(AO, %eax, 2), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd -2 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm0, %xmm7; \ + movddup (AO, %eax, 2), %xmm0 + +#define KERNEL5(address) \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm3, %xmm1; \ + mulpd 2 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm1, %xmm4; \ + movapd 4 * SIZE(BO, %eax, 4), %xmm1; \ + addpd %xmm3, %xmm5; \ + movddup -7 * SIZE(AO, %eax, 2), %xmm3; \ + mulpd %xmm3, %xmm2; \ + mulpd 2 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm3, %xmm7; \ + movddup -6 * SIZE(AO, %eax, 2), %xmm3 + +#define KERNEL6(address) \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm3, %xmm1; \ + mulpd 6 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm1, %xmm4; \ + movapd 8 * SIZE(BO, %eax, 4), %xmm1; \ + addpd %xmm3, %xmm5; \ + movddup -5 * SIZE(AO, %eax, 2), %xmm3; \ + mulpd %xmm3, %xmm2; \ + mulpd 6 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm3, %xmm7; \ + movddup -4 * SIZE(AO, %eax, 2), %xmm3 + +#define KERNEL7(address) \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm3, %xmm1; \ + mulpd 10 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm1, %xmm4; \ + movapd 12 * SIZE(BO, %eax, 4), %xmm1; \ + addpd %xmm3, %xmm5; \ + movddup -3 * SIZE(AO, %eax, 2), %xmm3; \ + mulpd %xmm3, %xmm2; \ + mulpd 10 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm3, %xmm7; \ + movddup -2 * SIZE(AO, %eax, 2), %xmm3 + +#define KERNEL8(address) \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm3, %xmm1; \ + mulpd 14 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm1, %xmm4; \ + movapd 16 * SIZE(BO, %eax, 4), %xmm1; \ + addpd %xmm3, %xmm5; \ + movddup -1 * SIZE(AO, %eax, 2), %xmm3; \ + mulpd %xmm3, %xmm2; \ + mulpd 14 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm3, %xmm7; \ + movddup 8 * SIZE(AO, %eax, 2), %xmm3; \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2 + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl OLD_B, B + movl OLD_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax + +#ifndef LEFT + negl %eax +#endif + + movl %eax, KK +#endif + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + + leal (, LDC, SIZE), LDC + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_2 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + leal GEMM_DEFAULT_Q * GEMM_DEFAULT_UNROLL_N * SIZE(B), %eax + movl %eax, BX + + movl C, CO # coffset = c + movl A, AO # aoffset = a + + movl M, I + sarl $1, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BO +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (B, %eax, 4), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm1 + pxor %xmm4, %xmm4 + movddup -8 * SIZE(AO), %xmm3 + + leal (LDC, LDC, 2), %eax + + prefetchw 1 * SIZE(CO) + pxor %xmm5, %xmm5 + prefetchw 3 * SIZE(CO, LDC) + pxor %xmm6, %xmm6 + prefetchw 1 * SIZE(CO, LDC, 2) + pxor %xmm7, %xmm7 + prefetchw 3 * SIZE(CO, %eax) + movapd %xmm1, %xmm2 + + movl BX, %eax + prefetch -16 * SIZE(%eax) + addl $8 * SIZE, %eax + movl %eax, BX + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + + andl $-8, %eax + + leal (, %eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 4), BO + negl %eax + NOBRANCH + je .L15 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + BRANCH + jl .L12 + ALIGN_3 + +.L15: + movddup ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + je .L18 + + leal (, %eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 4), BO + negl %eax + ALIGN_3 + +.L17: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO, %eax, 4), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BO, %eax, 4), %xmm1 + addpd %xmm0, %xmm5 + movddup -15 * SIZE(AO, %eax, 2), %xmm0 + mulpd %xmm0, %xmm2 + mulpd -14 * SIZE(BO, %eax, 4), %xmm0 + addpd %xmm0, %xmm7 + movddup -14 * SIZE(AO, %eax, 2), %xmm0 + addpd %xmm2, %xmm6 + movapd %xmm1, %xmm2 + + addl $SIZE, %eax + jl .L17 + ALIGN_4 + +.L18: + leal (CO, LDC, 2), %eax + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + mulpd %xmm3, %xmm6 + mulpd %xmm3, %xmm7 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO ), %xmm0 + movhpd 0 * SIZE(CO, LDC), %xmm0 + movsd 0 * SIZE(%eax ), %xmm1 + movhpd 0 * SIZE(%eax, LDC), %xmm1 + + movsd 1 * SIZE(CO ), %xmm2 + movhpd 1 * SIZE(CO, LDC), %xmm2 + movsd 1 * SIZE(%eax ), %xmm3 + movhpd 1 * SIZE(%eax, LDC), %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 + addpd %xmm2, %xmm6 + addpd %xmm3, %xmm7 +#endif + + movsd %xmm4, 0 * SIZE(CO) + movsd %xmm6, 1 * SIZE(CO) + + movhpd %xmm4, 0 * SIZE(CO, LDC) + movhpd %xmm6, 1 * SIZE(CO, LDC) + + movsd %xmm5, 0 * SIZE(%eax) + movsd %xmm7, 1 * SIZE(%eax) + + movhpd %xmm5, 0 * SIZE(%eax, LDC) + movhpd %xmm7, 1 * SIZE(%eax, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, CO # coffset += 2 + decl I # i -- + jg .L11 + ALIGN_4 + +.L20: + movl M, I + testl $1, I # i = (m >> 2) + jle .L29 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BO +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AO, %eax, 1), AO + leal (B, %eax, 4), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm1 + movddup -8 * SIZE(AO), %xmm3 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulpd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm5 + movddup -15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm6 + movapd -8 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm7 + movddup -14 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm4 + movapd -4 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm5 + movddup -13 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -2 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm6 + movapd (BO), %xmm1 + addpd %xmm0, %xmm7 + movddup -12 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm4 + movapd 4 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm5 + movddup -11 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm6 + movapd 8 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm7 + movddup -10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd 10 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm4 + movapd 12 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm5 + movddup -9 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd 14 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm6 + movapd 16 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm7 + movddup -8 * SIZE(AO), %xmm0 + + subl $ -8 * SIZE, AO + subl $-32 * SIZE, BO + decl %eax + jne .L22 + ALIGN_4 + +.L25: + movddup ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + +.L26: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm5 + movddup -15 * SIZE(AO), %xmm0 + + addl $1 * SIZE, AO + addl $4 * SIZE, BO + decl %eax + jg .L26 + ALIGN_4 + +.L28: + leal (CO, LDC, 2), %eax + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO ), %xmm0 + movhpd 0 * SIZE(CO, LDC), %xmm0 + movsd 0 * SIZE(%eax ), %xmm1 + movhpd 0 * SIZE(%eax, LDC), %xmm1 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(CO ) + movhpd %xmm4, 0 * SIZE(CO, LDC) + movsd %xmm5, 0 * SIZE(%eax ) + movhpd %xmm5, 0 * SIZE(%eax, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AO, %eax, 1), AO + leal (BO, %eax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + movl BO, B + + leal (, LDC, 4), %eax + addl %eax, C # c += 4 * ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L30: + testl $2, N + je .L60 + ALIGN_2 + +.L31: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, CO # coffset = c + movl A, AO # aoffset = a + + movl M, I + sarl $1, I # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BO +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (B, %eax, 2), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + pxor %xmm4, %xmm4 + prefetchw 1 * SIZE(CO) + pxor %xmm5, %xmm5 + prefetchw 1 * SIZE(CO, LDC) + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AO) + mulpd -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm4 + mulpd -16 * SIZE(BO), %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm5 + + mulpd -14 * SIZE(BO), %xmm0 + movddup -13 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm6 + mulpd -14 * SIZE(BO), %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm7 + + mulpd -12 * SIZE(BO), %xmm0 + movddup -11 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm4 + mulpd -12 * SIZE(BO), %xmm1 + movddup -10 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm5 + + mulpd -10 * SIZE(BO), %xmm0 + movddup -9 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm6 + mulpd -10 * SIZE(BO), %xmm1 + movddup -8 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm7 + + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AO) + + mulpd -8 * SIZE(BO), %xmm0 + movddup -7 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm4 + mulpd -8 * SIZE(BO), %xmm1 + movddup -6 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm5 + + mulpd -6 * SIZE(BO), %xmm0 + movddup -5 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm6 + mulpd -6 * SIZE(BO), %xmm1 + movddup -4 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm7 + + mulpd -4 * SIZE(BO), %xmm0 + movddup -3 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm4 + mulpd -4 * SIZE(BO), %xmm1 + movddup -2 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm5 + + mulpd -2 * SIZE(BO), %xmm0 + movddup -1 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm6 + mulpd -2 * SIZE(BO), %xmm1 + movddup 0 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm7 + + subl $-16 * SIZE, AO + subl $-16 * SIZE, BO + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movddup ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm4 + mulpd -16 * SIZE(BO), %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm5 + + addl $2 * SIZE, AO + addl $2 * SIZE, BO + decl %eax + jg .L46 + ALIGN_4 + +.L48: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO), %xmm0 + movhpd 0 * SIZE(CO, LDC), %xmm0 + + movsd 1 * SIZE(CO), %xmm1 + movhpd 1 * SIZE(CO, LDC), %xmm1 +#endif + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 +#endif + + movlpd %xmm4, 0 * SIZE(CO) + movlpd %xmm5, 1 * SIZE(CO) + + movhpd %xmm4, 0 * SIZE(CO, LDC) + movhpd %xmm5, 1 * SIZE(CO, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, CO # coffset += 2 + decl I # i -- + jg .L41 + ALIGN_4 + +.L50: + movl M, I + testl $1, I # i = (m >> 2) + jle .L59 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BO +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AO, %eax, 1), AO + leal (B, %eax, 2), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulpd -16 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -15 * SIZE(AO), %xmm0 + + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -14 * SIZE(AO), %xmm0 + + mulpd -12 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -13 * SIZE(AO), %xmm0 + + mulpd -10 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -12 * SIZE(AO), %xmm0 + + mulpd -8 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -11 * SIZE(AO), %xmm0 + + mulpd -6 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -10 * SIZE(AO), %xmm0 + + mulpd -4 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -9 * SIZE(AO), %xmm0 + + mulpd -2 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -8 * SIZE(AO), %xmm0 + + subl $ -8 * SIZE, AO + subl $-16 * SIZE, BO + + decl %eax + jne .L52 + ALIGN_4 + +.L55: + movddup ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + +.L56: + mulpd -16 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -15 * SIZE(AO), %xmm0 + + subl $-1 * SIZE, AO + subl $-2 * SIZE, BO + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + addpd %xmm5, %xmm4 + + mulpd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO), %xmm0 + movhpd 0 * SIZE(CO, LDC), %xmm0 + + addpd %xmm0, %xmm4 +#endif + + movlpd %xmm4, 0 * SIZE(CO) + movhpd %xmm4, 0 * SIZE(CO, LDC, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AO, %eax, 1), AO + leal (BO, %eax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L59: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movl BO, B + + leal (, LDC, 2), %eax + addl %eax, C # c += 4 * ldc + ALIGN_4 + +.L60: + testl $1, N + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, CO # coffset = c + movl A, AO # aoffset = a + + movl M, I + sarl $1, I # i = (m >> 2) + jle .L80 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BO +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (B, %eax, 1), BO +#endif + + movddup -16 * SIZE(BO), %xmm0 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + prefetchw 1 * SIZE(CO) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + mulpd -16 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -15 * SIZE(BO), %xmm0 + + mulpd -14 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -14 * SIZE(BO), %xmm0 + + mulpd -12 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -13 * SIZE(BO), %xmm0 + + mulpd -10 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -12 * SIZE(BO), %xmm0 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + mulpd -8 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -11 * SIZE(BO), %xmm0 + + mulpd -6 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -10 * SIZE(BO), %xmm0 + + mulpd -4 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -9 * SIZE(BO), %xmm0 + + mulpd -2 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -8 * SIZE(BO), %xmm0 + + subl $-16 * SIZE, AO + subl $ -8 * SIZE, BO + decl %eax + jne .L72 + ALIGN_4 + +.L75: + movddup ALPHA, %xmm3 +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + mulpd -16 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -15 * SIZE(BO), %xmm0 + + addl $2 * SIZE, AO + addl $1 * SIZE, BO + decl %eax + jg .L76 + ALIGN_4 + +.L78: + mulpd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO), %xmm0 + movhpd 1 * SIZE(CO), %xmm0 + + addpd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(CO) + movhpd %xmm4, 1 * SIZE(CO) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, CO # coffset += 2 + decl I # i -- + jg .L71 + ALIGN_4 + +.L80: + movl M, I + testl $1, I # i = (m >> 2) + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BO +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AO, %eax, 1), AO + leal (B, %eax, 1), BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L85 + ALIGN_4 + +.L82: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + mulpd -16 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movapd -14 * SIZE(AO), %xmm0 + + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm5 + movapd -12 * SIZE(AO), %xmm0 + + mulpd -12 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm6 + movapd -10 * SIZE(AO), %xmm0 + + mulpd -10 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm7 + movapd -8 * SIZE(AO), %xmm0 + + subl $-8 * SIZE, AO + subl $-8 * SIZE, BO + decl %eax + jne .L82 + ALIGN_4 + +.L85: + movddup ALPHA, %xmm3 +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L88 + +.L86: + mulsd -16 * SIZE(BO), %xmm0 + addsd %xmm0, %xmm4 + movsd -15 * SIZE(AO), %xmm0 + + addl $1 * SIZE, AO + addl $1 * SIZE, BO + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + + haddpd %xmm4, %xmm4 + mulsd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO), %xmm0 + addsd %xmm0, %xmm4 +#endif + movsd %xmm4, 0 * SIZE(CO) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_2x4_core2.S b/kernel/x86/gemm_kernel_2x4_core2.S new file mode 100644 index 0000000..9907131 --- /dev/null +++ b/kernel/x86/gemm_kernel_2x4_core2.S @@ -0,0 +1,1318 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define PREFETCH_R (8 * 4) + +#define PREFETCHSIZE (8 * 21 + 4) +#define PREFETCH prefetcht0 + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define C1 %esi +#define I %ebx + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax +#ifndef LEFT + negl %eax +#endif + movl %eax, KK +#endif + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + + leal (, LDC, SIZE), LDC + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl B, BX + + movl C, C1 + movl A, AA + + movl M, I + sarl $1, I + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + movl BX, %eax + + prefetcht2 (PREFETCH_R + 0) * SIZE(%eax) + prefetcht2 (PREFETCH_R + 8) * SIZE(%eax) + + subl $-8 * SIZE, BX + + leal (C1, LDC, 2), %eax + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + prefetcht0 1 * SIZE(C1) + pxor %xmm5, %xmm5 + prefetcht0 1 * SIZE(C1, LDC) + pxor %xmm6, %xmm6 + prefetcht0 1 * SIZE(%eax) + pxor %xmm7, %xmm7 + prefetcht0 1 * SIZE(%eax, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addpd %xmm2, %xmm6 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movaps -14 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm7 + movapd %xmm1, %xmm3 + mulpd %xmm0, %xmm1 +// SHUFPD_1 %xmm0, %xmm0 + pshufd $0x4e, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movaps -12 * SIZE(BB), %xmm1 + + addpd %xmm2, %xmm6 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movaps -10 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm7 + movapd %xmm1, %xmm3 + mulpd %xmm0, %xmm1 +// SHUFPD_1 %xmm0, %xmm0 + pshufd $0x4e, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movaps -8 * SIZE(BB), %xmm1 + + addpd %xmm2, %xmm6 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movaps -6 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm7 + movapd %xmm1, %xmm3 + mulpd %xmm0, %xmm1 +// SHUFPD_1 %xmm0, %xmm0 + pshufd $0x4e, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movaps -10 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movaps -4 * SIZE(BB), %xmm1 + + addpd %xmm2, %xmm6 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movaps -2 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm7 + movapd %xmm1, %xmm3 + mulpd %xmm0, %xmm1 +// SHUFPD_1 %xmm0, %xmm0 + pshufd $0x4e, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movaps -8 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movaps 0 * SIZE(BB), %xmm1 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + addpd %xmm2, %xmm6 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movaps 2 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm7 + movapd %xmm1, %xmm3 + mulpd %xmm0, %xmm1 +// SHUFPD_1 %xmm0, %xmm0 + pshufd $0x4e, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movaps -6 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movaps 4 * SIZE(BB), %xmm1 + + addpd %xmm2, %xmm6 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movaps 6 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm7 + movapd %xmm1, %xmm3 + mulpd %xmm0, %xmm1 +// SHUFPD_1 %xmm0, %xmm0 + pshufd $0x4e, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movaps -4 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movaps 8 * SIZE(BB), %xmm1 + + addpd %xmm2, %xmm6 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movaps 10 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm7 + movapd %xmm1, %xmm3 + mulpd %xmm0, %xmm1 +// SHUFPD_1 %xmm0, %xmm0 + pshufd $0x4e, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movaps -2 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movaps 12 * SIZE(BB), %xmm1 + + addpd %xmm2, %xmm6 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movaps 14 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm7 + movapd %xmm1, %xmm3 + mulpd %xmm0, %xmm1 +// SHUFPD_1 %xmm0, %xmm0 + pshufd $0x4e, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movaps 0 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movaps 16 * SIZE(BB), %xmm1 + + subl $-32 * SIZE, BB + subl $-16 * SIZE, AA + + subl $1, %eax + BRANCH + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + addpd %xmm2, %xmm6 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movaps -14 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm7 + movapd %xmm1, %xmm3 + mulpd %xmm0, %xmm1 + SHUFPD_1 %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movaps -12 * SIZE(BB), %xmm1 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + addpd %xmm2, %xmm6 + addpd %xmm3, %xmm7 + + + movddup ALPHA, %xmm3 + + movaps %xmm4, %xmm0 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm0, %xmm6 + + movaps %xmm5, %xmm1 + unpcklpd %xmm7, %xmm5 + unpckhpd %xmm1, %xmm7 + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + mulpd %xmm3, %xmm6 + mulpd %xmm3, %xmm7 + + leal (C1, LDC, 2), %eax + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 1 * SIZE(C1), %xmm0 + movsd 0 * SIZE(C1, LDC), %xmm1 + movhpd 1 * SIZE(C1, LDC), %xmm1 + + movsd 0 * SIZE(%eax), %xmm2 + movhpd 1 * SIZE(%eax), %xmm2 + movsd 0 * SIZE(%eax, LDC), %xmm3 + movhpd 1 * SIZE(%eax, LDC), %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm6 + addpd %xmm2, %xmm5 + addpd %xmm3, %xmm7 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 1 * SIZE(C1) + movsd %xmm6, 0 * SIZE(C1, LDC) + movhpd %xmm6, 1 * SIZE(C1, LDC) + + movsd %xmm5, 0 * SIZE(%eax) + movhpd %xmm5, 1 * SIZE(%eax) + movsd %xmm7, 0 * SIZE(%eax, LDC) + movhpd %xmm7, 1 * SIZE(%eax, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, C1 + decl I + jg .L11 + ALIGN_4 + +.L20: + movl M, I + testl $1, I + jle .L29 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 4), BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + movaps -14 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -10 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -14 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps -8 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps -6 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -4 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -2 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -12 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 0 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 2 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps 6 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -10 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 8 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 10 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps 12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps 14 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -8 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 18 * SIZE(BB), %xmm3 + + subl $ -8 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L28 + ALIGN_4 + +.L26: + pshufd $0x44, %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -10 * SIZE(BB), %xmm3 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + movddup ALPHA, %xmm3 + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + leal (C1, LDC, 2), %eax + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 0 * SIZE(C1, LDC), %xmm0 + + movsd 0 * SIZE(%eax), %xmm1 + movhpd 0 * SIZE(%eax, LDC), %xmm1 +#endif + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 0 * SIZE(C1, LDC) + + movsd %xmm5, 0 * SIZE(%eax) + movhpd %xmm5, 0 * SIZE(%eax, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + movl BB, B + + leal (, LDC, 4), %eax + addl %eax, C + decl J + jg .L01 + ALIGN_4 + +.L30: + movl N, %eax + testl $2, %eax + jle .L50 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, C1 + movl A, AA + + movl M, I + sarl $1, I + jle .L40 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + prefetcht0 1 * SIZE(C1) + pxor %xmm6, %xmm6 + prefetcht0 1 * SIZE(C1, LDC) + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -14 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -10 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -6 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -2 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps 0 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L38 + ALIGN_4 + +.L36: + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -14 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + movddup ALPHA, %xmm3 + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + movaps %xmm4, %xmm0 + movsd %xmm5, %xmm4 + mulpd %xmm3, %xmm4 + movsd %xmm0, %xmm5 + mulpd %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 1 * SIZE(C1), %xmm0 + movsd 0 * SIZE(C1, LDC), %xmm1 + movhpd 1 * SIZE(C1, LDC), %xmm1 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 1 * SIZE(C1) + movsd %xmm5, 0 * SIZE(C1, LDC) + movhpd %xmm5, 1 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, C1 + decl I + jg .L31 + ALIGN_4 + +.L40: + movl M, I + testl $1, I + jle .L49 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 2), BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -14 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -12 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -10 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -12 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -8 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -6 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -10 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -4 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -2 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -8 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L48 + ALIGN_4 + +.L46: + pshufd $0x44, %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + movddup ALPHA, %xmm3 + + addpd %xmm5, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 0 * SIZE(C1, LDC), %xmm0 +#endif + + mulpd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 0 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movl BB, B + + leal (, LDC, 2), %eax + addl %eax, C + ALIGN_4 + +.L50: + movl N, %eax + testl $1, %eax + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, C1 + movl A, AA + + movl M, I + sarl $1, I + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + prefetcht0 1 * SIZE(C1) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -14 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -12 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -10 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -8 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + subl $-16 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L58 + ALIGN_4 + +.L56: + pshufd $0x44, %xmm1, %xmm2 + movsd -15 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + movddup ALPHA, %xmm3 + + addpd %xmm5, %xmm4 + mulpd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 1 * SIZE(C1), %xmm0 + + addpd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 1 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, C1 + decl I + jg .L51 + ALIGN_4 + +.L60: + movl M, I + testl $1, I + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movaps -12 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movaps -10 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movaps -8 * SIZE(BB), %xmm2 + + subl $-8 * SIZE, AA + subl $-8 * SIZE, BB + + subl $1, %eax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulsd %xmm0, %xmm2 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd -15 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + movddup ALPHA, %xmm3 + + addpd %xmm5, %xmm4 + + haddpd %xmm4, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 +#endif + + mulsd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + addsd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_2x4_penryn.S b/kernel/x86/gemm_kernel_2x4_penryn.S new file mode 100644 index 0000000..263aea0 --- /dev/null +++ b/kernel/x86/gemm_kernel_2x4_penryn.S @@ -0,0 +1,1367 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define C1 %esi +#define I %ebx + +#ifdef NANO +#define PREFETCHSIZE (8 * 3 + 4) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + +#ifdef NEHALEM +#define PREFETCHSIZE (8 * 1 - 4) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + +#ifndef PREFETCH +#define PREFETCH prefetcht0 +#endif + +#ifndef PREFETCHW +#define PREFETCHW prefetcht0 +#endif + +#ifndef PREFETCHB +#define PREFETCHB prefetcht0 +#endif + +#ifndef PREFETCHSIZE +#define PREFETCHSIZE (8 * 13 + 4) +#endif + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax +#ifndef LEFT + negl %eax +#endif + movl %eax, KK +#endif + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + + leal (, LDC, SIZE), LDC + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sall $BASE_SHIFT + 2, %eax + leal (B, %eax), %eax + movl %eax, BX + + movl C, C1 + movl A, AA + + movl M, I + sarl $1, I + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + movl BX, %eax + PREFETCHB -16 * SIZE(%eax) + subl $-8 * SIZE, %eax + movl %eax, BX + + leal (C1, LDC, 2), %eax + + movaps -16 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + movaps -16 * SIZE(BB), %xmm1 + xorps %xmm3, %xmm3 + + xorps %xmm4, %xmm4 + PREFETCHW 1 * SIZE(C1) + xorps %xmm5, %xmm5 + PREFETCHW 3 * SIZE(C1, LDC) + xorps %xmm6, %xmm6 + PREFETCHW 1 * SIZE(%eax) + xorps %xmm7, %xmm7 + PREFETCHW 3 * SIZE(%eax, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addpd %xmm3, %xmm7 + movaps -14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -10 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -6 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -2 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 0 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + movaps 2 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 6 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 10 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 16 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + subl $-32 * SIZE, BB + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + + subl $1, %eax + BRANCH + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + addpd %xmm3, %xmm7 + movaps -14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + + movaps -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + addpd %xmm2, %xmm6 + addpd %xmm3, %xmm7 + + movddup ALPHA, %xmm3 + + movaps %xmm4, %xmm0 + movsd %xmm5, %xmm4 + mulpd %xmm3, %xmm4 + movsd %xmm0, %xmm5 + mulpd %xmm3, %xmm5 + + movaps %xmm6, %xmm0 + movsd %xmm7, %xmm6 + mulpd %xmm3, %xmm6 + movsd %xmm0, %xmm7 + mulpd %xmm3, %xmm7 + + movl C1, %eax + orl LDC, %eax + testl $15, %eax + NOBRANCH + jne .L18x + + leal (C1, LDC, 2), %eax + +#ifndef TRMMKERNEL + movaps (C1), %xmm0 + movaps (C1, LDC), %xmm1 + movaps (%eax), %xmm2 + movaps (%eax, LDC), %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 + addpd %xmm2, %xmm6 + addpd %xmm3, %xmm7 +#endif + + movaps %xmm4, (C1) + movaps %xmm5, (C1, LDC) + movaps %xmm6, (%eax) + movaps %xmm7, (%eax, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, C1 + decl I + jg .L11 + jmp .L20 + ALIGN_4 + +.L18x: + leal (C1, LDC, 2), %eax + +#ifndef TRMMKERNEL + movups (C1), %xmm0 + movups (C1, LDC), %xmm1 + movups (%eax), %xmm2 + movups (%eax, LDC), %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 + addpd %xmm2, %xmm6 + addpd %xmm3, %xmm7 +#endif + + movups %xmm4, (C1) + movups %xmm5, (C1, LDC) + movups %xmm6, (%eax) + movups %xmm7, (%eax, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, C1 + decl I + jg .L11 + ALIGN_4 + +.L20: + movl M, I + testl $1, I + jle .L29 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 4), BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + xorps %xmm5, %xmm5 + movaps -14 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -10 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -14 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps -8 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps -6 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -4 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -2 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -12 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 0 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 2 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps 6 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -10 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 8 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 10 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps 12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps 14 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -8 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 18 * SIZE(BB), %xmm3 + + subl $ -8 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L28 + ALIGN_4 + +.L26: + pshufd $0x44, %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -10 * SIZE(BB), %xmm3 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + movddup ALPHA, %xmm3 + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + leal (C1, LDC, 2), %eax + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 0 * SIZE(C1, LDC), %xmm0 + + movsd 0 * SIZE(%eax), %xmm1 + movhpd 0 * SIZE(%eax, LDC), %xmm1 +#endif + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 0 * SIZE(C1, LDC) + + movsd %xmm5, 0 * SIZE(%eax) + movhpd %xmm5, 0 * SIZE(%eax, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + movl BB, B + + leal (, LDC, 4), %eax + addl %eax, C + decl J + jg .L01 + ALIGN_4 + +.L30: + movl N, %eax + testl $2, %eax + jle .L50 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, C1 + movl A, AA + + movl M, I + sarl $1, I + jle .L40 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + xorps %xmm5, %xmm5 + PREFETCHW 1 * SIZE(C1) + xorps %xmm6, %xmm6 + PREFETCHW 1 * SIZE(C1, LDC) + xorps %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -14 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -10 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -6 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -2 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps 0 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L38 + ALIGN_4 + +.L36: + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -14 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + movddup ALPHA, %xmm3 + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + movaps %xmm4, %xmm0 + movsd %xmm5, %xmm4 + mulpd %xmm3, %xmm4 + movsd %xmm0, %xmm5 + mulpd %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 1 * SIZE(C1), %xmm0 + movsd 0 * SIZE(C1, LDC), %xmm1 + movhpd 1 * SIZE(C1, LDC), %xmm1 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 1 * SIZE(C1) + movsd %xmm5, 0 * SIZE(C1, LDC) + movhpd %xmm5, 1 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, C1 + decl I + jg .L31 + ALIGN_4 + +.L40: + movl M, I + testl $1, I + jle .L49 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 2), BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + xorps %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -14 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -12 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -10 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -12 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -8 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -6 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -10 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -4 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -2 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -8 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L48 + ALIGN_4 + +.L46: + pshufd $0x44, %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + movddup ALPHA, %xmm3 + + addpd %xmm5, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 0 * SIZE(C1, LDC), %xmm0 +#endif + + mulpd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 0 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movl BB, B + + leal (, LDC, 2), %eax + addl %eax, C + ALIGN_4 + +.L50: + movl N, %eax + testl $1, %eax + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, C1 + movl A, AA + + movl M, I + sarl $1, I + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + xorps %xmm5, %xmm5 + PREFETCHW 1 * SIZE(C1) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -14 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -12 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -10 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -8 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + subl $-16 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L58 + ALIGN_4 + +.L56: + pshufd $0x44, %xmm1, %xmm2 + movsd -15 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + movddup ALPHA, %xmm3 + + addpd %xmm5, %xmm4 + mulpd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 1 * SIZE(C1), %xmm0 + + addpd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 1 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, C1 + decl I + jg .L51 + ALIGN_4 + +.L60: + movl M, I + testl $1, I + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + xorps %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movaps -12 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movaps -10 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movaps -8 * SIZE(BB), %xmm2 + + subl $-8 * SIZE, AA + subl $-8 * SIZE, BB + + subl $1, %eax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulsd %xmm0, %xmm2 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd -15 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + movddup ALPHA, %xmm3 + + addpd %xmm5, %xmm4 + + haddpd %xmm4, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 +#endif + + mulsd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + addsd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_2x4_sse2.S b/kernel/x86/gemm_kernel_2x4_sse2.S new file mode 100644 index 0000000..be58235 --- /dev/null +++ b/kernel/x86/gemm_kernel_2x4_sse2.S @@ -0,0 +1,1790 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define OLD_M 4 + STACK + ARGS(%esi) +#define OLD_N 8 + STACK + ARGS(%esi) +#define OLD_K 12 + STACK + ARGS(%esi) +#define OLD_ALPHA 16 + STACK + ARGS(%esi) +#define OLD_A 24 + STACK + ARGS(%esi) +#define OLD_B 28 + STACK + ARGS(%esi) +#define OLD_C 32 + STACK + ARGS(%esi) +#define OLD_LDC 36 + STACK + ARGS(%esi) +#define OLD_OFFT 40 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define BX 40(%esp) +#define OLD_STACK 44(%esp) +#define OFFSET 48(%esp) +#define KK 52(%esp) +#define KKK 56(%esp) +#define BUFFER 128(%esp) + +#if defined(OPTERON) || defined(BARCELONA) +#define movsd movlpd +#endif + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHSIZE (8 * 10 + 4) +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE, %esp + andl $-1024, %esp # align stack + + STACK_TOUCHING + + movl OLD_M, %ebx + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + movsd OLD_ALPHA, %xmm3 + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK +#ifdef TRMMKERNEL + movss OLD_OFFT, %xmm4 +#endif + + unpcklpd %xmm3, %xmm3 + movl OLD_B, %edi + movl OLD_C, %ebx + movapd %xmm3, ALPHA + + movl %ebx, C + movl OLD_LDC, LDC +#ifdef TRMMKERNEL + movss %xmm4, OFFSET + movss %xmm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + leal (, LDC, SIZE), LDC + + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_2 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + movl K, %eax + leal BUFFER, %ecx + sarl $1, %eax + jle .L05 + ALIGN_4 + +.L02: +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(%edi) + + movq 0 * SIZE(%edi), %mm0 + movq 1 * SIZE(%edi), %mm1 + movq 2 * SIZE(%edi), %mm2 + movq 3 * SIZE(%edi), %mm3 + movq 4 * SIZE(%edi), %mm4 + movq 5 * SIZE(%edi), %mm5 + movq 6 * SIZE(%edi), %mm6 + movq 7 * SIZE(%edi), %mm7 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm0, 1 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + movq %mm1, 3 * SIZE(%ecx) + movq %mm2, 4 * SIZE(%ecx) + movq %mm2, 5 * SIZE(%ecx) + movq %mm3, 6 * SIZE(%ecx) + movq %mm3, 7 * SIZE(%ecx) + + movq %mm4, 8 * SIZE(%ecx) + movq %mm4, 9 * SIZE(%ecx) + movq %mm5, 10 * SIZE(%ecx) + movq %mm5, 11 * SIZE(%ecx) + movq %mm6, 12 * SIZE(%ecx) + movq %mm6, 13 * SIZE(%ecx) + movq %mm7, 14 * SIZE(%ecx) + movq %mm7, 15 * SIZE(%ecx) + + addl $ 8 * SIZE, %edi + addl $16 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L05: + movl K, %eax + andl $1, %eax + BRANCH + jle .L10 + + movq 0 * SIZE(%edi), %mm0 + movq 1 * SIZE(%edi), %mm1 + movq 2 * SIZE(%edi), %mm2 + movq 3 * SIZE(%edi), %mm3 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm0, 1 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + movq %mm1, 3 * SIZE(%ecx) + movq %mm2, 4 * SIZE(%ecx) + movq %mm2, 5 * SIZE(%ecx) + movq %mm3, 6 * SIZE(%ecx) + movq %mm3, 7 * SIZE(%ecx) + + addl $4 * SIZE, %edi + ALIGN_4 + +.L10: + movl %edi, BX + + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + + movl BX, %eax + + prefetchnta 0 * SIZE(%eax) + prefetchnta 8 * SIZE(%eax) + + subl $-8 * SIZE, BX + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + + prefetchw 1 * SIZE(%esi) + prefetchw 1 * SIZE(%esi, LDC) + prefetchw 1 * SIZE(%esi, LDC, 2) + prefetchw 1 * SIZE(%esi, %eax) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + +#if 1 + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) + + addl $128 * 4 * SIZE, BB + addl $128 * 1 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB + ALIGN_4 +#else + + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addl $64 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 +#endif + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movapd ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 8 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + mulpd %xmm3, %xmm4 + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + mulpd %xmm3, %xmm5 + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + movhpd 1 * SIZE(%esi, LDC, 1), %xmm1 + mulpd %xmm3, %xmm6 + movsd 0 * SIZE(%esi, LDC, 2), %xmm2 + movhpd 1 * SIZE(%esi, LDC, 2), %xmm2 + mulpd %xmm3, %xmm7 + movsd 0 * SIZE(%esi, %eax, 1), %xmm3 + movhpd 1 * SIZE(%esi, %eax, 1), %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 + addpd %xmm2, %xmm6 + addpd %xmm3, %xmm7 +#else + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + mulpd %xmm3, %xmm6 + mulpd %xmm3, %xmm7 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + movsd %xmm5, 0 * SIZE(%esi, LDC, 1) + movhpd %xmm5, 1 * SIZE(%esi, LDC, 1) + movsd %xmm6, 0 * SIZE(%esi, LDC, 2) + movhpd %xmm6, 1 * SIZE(%esi, LDC, 2) + movsd %xmm7, 0 * SIZE(%esi, %eax, 1) + movhpd %xmm7, 1 * SIZE(%esi, %eax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L29 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(AA), %xmm0 + movsd 4 * SIZE(AA), %xmm1 + movsd 0 * SIZE(BB), %xmm2 + movsd 8 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movsd 2 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movsd 4 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movsd 1 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm4 + movsd 10 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm5 + movsd 12 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movsd 2 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movsd 18 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movsd 20 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 22 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movsd 32 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movsd 3 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm4 + movsd 26 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm5 + movsd 28 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + mulsd 30 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movsd 40 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movsd 8 * SIZE(AA), %xmm0 +#if defined(OPTERON) || defined(BARCELONA) + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) +#endif + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm4 + movsd 34 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm5 + movsd 36 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + mulsd 38 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movsd 48 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movsd 5 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm4 + movsd 42 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm5 + movsd 44 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + mulsd 46 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movsd 56 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm7 + movsd 6 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm4 + movsd 50 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm5 + movsd 52 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + mulsd 54 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movsd 64 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movsd 7 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm4 + movsd 58 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm5 + movsd 60 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + mulsd 62 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movsd 72 * SIZE(BB), %xmm3 + addl $64 * SIZE, BB + addsd %xmm1, %xmm7 + movsd 12 * SIZE(AA), %xmm1 + addl $8 * SIZE, AA + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movsd ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + +.L26: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movsd 2 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movsd 4 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movsd 8 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + mulsd %xmm3, %xmm4 + movsd 0 * SIZE(%esi), %xmm0 + mulsd %xmm3, %xmm5 + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + mulsd %xmm3, %xmm6 + movsd 0 * SIZE(%esi, LDC, 2), %xmm2 + mulsd %xmm3, %xmm7 + movsd 0 * SIZE(%esi, %eax, 1), %xmm3 + + addsd %xmm0, %xmm4 + addsd %xmm1, %xmm5 + addsd %xmm2, %xmm6 + addsd %xmm3, %xmm7 +#else + mulsd %xmm3, %xmm4 + mulsd %xmm3, %xmm5 + mulsd %xmm3, %xmm6 + mulsd %xmm3, %xmm7 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movsd %xmm5, 0 * SIZE(%esi, LDC, 1) + movsd %xmm6, 0 * SIZE(%esi, LDC, 2) + movsd %xmm7, 0 * SIZE(%esi, %eax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leal (, LDC, 4), %eax + addl %eax, C # c += 4 * ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L30: + testl $2, N + je .L60 + ALIGN_2 + +.L31: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + movl K, %eax + leal BUFFER, %ecx + sarl $2, %eax + jle .L35 + ALIGN_4 + +.L32: +#ifdef PENTIUM4 +#ifdef HAVE_SSE3 + movddup 0 * SIZE(%edi), %xmm0 + movddup 1 * SIZE(%edi), %xmm1 + movddup 2 * SIZE(%edi), %xmm2 + movddup 3 * SIZE(%edi), %xmm3 + movddup 4 * SIZE(%edi), %xmm4 + movddup 5 * SIZE(%edi), %xmm5 + movddup 6 * SIZE(%edi), %xmm6 + movddup 7 * SIZE(%edi), %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) +#else + movsd 0 * SIZE(%edi), %xmm0 + movsd 1 * SIZE(%edi), %xmm1 + movsd 2 * SIZE(%edi), %xmm2 + movsd 3 * SIZE(%edi), %xmm3 + movsd 4 * SIZE(%edi), %xmm4 + movsd 5 * SIZE(%edi), %xmm5 + movsd 6 * SIZE(%edi), %xmm6 + movsd 7 * SIZE(%edi), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpckhpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpckhpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpckhpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpckhpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) +#endif + prefetcht0 80 * SIZE(%edi) + prefetcht1 112 * SIZE(%ecx) +#endif + +#if defined(OPTERON) || defined(BARCELONA) +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(%edi) + + movq 0 * SIZE(%edi), %mm0 + movq 1 * SIZE(%edi), %mm1 + movq 2 * SIZE(%edi), %mm2 + movq 3 * SIZE(%edi), %mm3 + movq 4 * SIZE(%edi), %mm4 + movq 5 * SIZE(%edi), %mm5 + movq 6 * SIZE(%edi), %mm6 + movq 7 * SIZE(%edi), %mm7 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm0, 1 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + movq %mm1, 3 * SIZE(%ecx) + movq %mm2, 4 * SIZE(%ecx) + movq %mm2, 5 * SIZE(%ecx) + movq %mm3, 6 * SIZE(%ecx) + movq %mm3, 7 * SIZE(%ecx) + + movq %mm4, 8 * SIZE(%ecx) + movq %mm4, 9 * SIZE(%ecx) + movq %mm5, 10 * SIZE(%ecx) + movq %mm5, 11 * SIZE(%ecx) + movq %mm6, 12 * SIZE(%ecx) + movq %mm6, 13 * SIZE(%ecx) + movq %mm7, 14 * SIZE(%ecx) + movq %mm7, 15 * SIZE(%ecx) +#endif + addl $ 8 * SIZE, %edi + addl $16 * SIZE, %ecx + decl %eax + jne .L32 + ALIGN_2 + +.L35: + movl K, %eax + andl $3, %eax + BRANCH + jle .L40 + ALIGN_2 + +.L36: +#ifdef PENTIUM4 +#ifdef HAVE_SSE3 + movddup 0 * SIZE(%edi), %xmm0 + movddup 1 * SIZE(%edi), %xmm1 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) +#else + movsd 0 * SIZE(%edi), %xmm0 + movsd 1 * SIZE(%edi), %xmm1 + + unpcklpd %xmm0, %xmm0 + unpckhpd %xmm1, %xmm1 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) +#endif +#endif + +#if defined(OPTERON) || defined(BARCELONA) + movq 0 * SIZE(%edi), %mm0 + movq 1 * SIZE(%edi), %mm1 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm0, 1 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + movq %mm1, 3 * SIZE(%ecx) +#endif + addl $2 * SIZE, %edi + addl $4 * SIZE, %ecx + decl %eax + jne .L36 + ALIGN_4 + +.L40: + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + +#ifdef HAVE_3DNOW + prefetchw 2 * SIZE(%esi) + prefetchw 2 * SIZE(%esi, LDC) +#endif + +#ifdef PENTIUM4 + prefetchnta 4 * SIZE(%esi) + prefetchnta 4 * SIZE(%esi, LDC) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + mulpd %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) +#endif + mulpd %xmm1, %xmm2 + mulpd 18 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movapd 10 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm2 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movapd 12 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm3 + mulpd 26 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 14 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm3 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movapd ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + movhpd 1 * SIZE(%esi, LDC, 1), %xmm1 +#endif + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + movsd %xmm5, 0 * SIZE(%esi, LDC, 1) + movhpd %xmm5, 1 * SIZE(%esi, LDC, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L41 + ALIGN_4 + +.L50: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L59 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(AA), %xmm0 + movsd 4 * SIZE(AA), %xmm1 + movsd 0 * SIZE(BB), %xmm2 + movsd 8 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulsd %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movsd 2 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm3 + mulsd 10 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd 12 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd 3 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movsd 8 * SIZE(AA), %xmm0 + + mulsd %xmm1, %xmm2 + mulsd 18 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movsd 20 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movsd 5 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm2 + mulsd 22 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movsd 32 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movsd 6 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm3 + mulsd 26 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movsd 28 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movsd 7 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm3 + mulsd 30 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movsd 40 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm7 + movsd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movsd ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + +.L56: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + + addsd %xmm0, %xmm4 + addsd %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movsd %xmm5, 0 * SIZE(%esi, LDC, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L59: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + addl %eax, C # c += 4 * ldc + ALIGN_4 + +.L60: + testl $1, N + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + leal BUFFER, %ecx + sarl $3, %eax + jle .L65 + ALIGN_4 + +.L62: +#ifdef PENTIUM4 +#ifdef HAVE_SSE3 + movddup 0 * SIZE(%edi), %xmm0 + movddup 1 * SIZE(%edi), %xmm1 + movddup 2 * SIZE(%edi), %xmm2 + movddup 3 * SIZE(%edi), %xmm3 + movddup 4 * SIZE(%edi), %xmm4 + movddup 5 * SIZE(%edi), %xmm5 + movddup 6 * SIZE(%edi), %xmm6 + movddup 7 * SIZE(%edi), %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) +#else + movsd 0 * SIZE(%edi), %xmm0 + movsd 1 * SIZE(%edi), %xmm1 + movsd 2 * SIZE(%edi), %xmm2 + movsd 3 * SIZE(%edi), %xmm3 + movsd 4 * SIZE(%edi), %xmm4 + movsd 5 * SIZE(%edi), %xmm5 + movsd 6 * SIZE(%edi), %xmm6 + movsd 7 * SIZE(%edi), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpckhpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpckhpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpckhpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpckhpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) +#endif + prefetcht1 80 * SIZE(%edi) + prefetcht0 112 * SIZE(%ecx) +#endif + +#if defined(OPTERON) || defined(BARCELONA) +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(%edi) + + movq 0 * SIZE(%edi), %mm0 + movq 1 * SIZE(%edi), %mm1 + movq 2 * SIZE(%edi), %mm2 + movq 3 * SIZE(%edi), %mm3 + movq 4 * SIZE(%edi), %mm4 + movq 5 * SIZE(%edi), %mm5 + movq 6 * SIZE(%edi), %mm6 + movq 7 * SIZE(%edi), %mm7 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm0, 1 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + movq %mm1, 3 * SIZE(%ecx) + movq %mm2, 4 * SIZE(%ecx) + movq %mm2, 5 * SIZE(%ecx) + movq %mm3, 6 * SIZE(%ecx) + movq %mm3, 7 * SIZE(%ecx) + + movq %mm4, 8 * SIZE(%ecx) + movq %mm4, 9 * SIZE(%ecx) + movq %mm5, 10 * SIZE(%ecx) + movq %mm5, 11 * SIZE(%ecx) + movq %mm6, 12 * SIZE(%ecx) + movq %mm6, 13 * SIZE(%ecx) + movq %mm7, 14 * SIZE(%ecx) + movq %mm7, 15 * SIZE(%ecx) +#endif + addl $ 8 * SIZE, %edi + addl $16 * SIZE, %ecx + decl %eax + jne .L62 + ALIGN_2 + +.L65: + movl K, %eax + andl $7, %eax + BRANCH + jle .L70 + ALIGN_2 + +.L66: +#ifdef PENTIUM4 +#ifdef HAVE_SSE3 + movddup 0 * SIZE(%edi), %xmm0 + movapd %xmm0, 0 * SIZE(%ecx) +#else + movsd 0 * SIZE(%edi), %xmm0 + unpcklpd %xmm0, %xmm0 + movapd %xmm0, 0 * SIZE(%ecx) +#endif +#endif + +#if defined(OPTERON) || defined(BARCELONA) + movq 0 * SIZE(%edi), %mm0 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm0, 1 * SIZE(%ecx) +#endif + addl $1 * SIZE, %edi + addl $2 * SIZE, %ecx + decl %eax + jne .L66 + ALIGN_4 + +.L70: + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L80 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + +#ifdef HAVE_3DNOW + prefetchw 2 * SIZE(%esi) +#endif + +#ifdef PENTIUM4 + prefetchnta 2 * SIZE(%esi) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movapd 16 * SIZE(BB), %xmm2 + + movapd 2 * SIZE(AA), %xmm0 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + + movapd 16 * SIZE(AA), %xmm0 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) +#endif + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movapd 24 * SIZE(BB), %xmm3 + + movapd 10 * SIZE(AA), %xmm1 + mulpd 10 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movapd ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(AA), %xmm0 + movapd 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + mulpd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + + addpd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L71 + ALIGN_4 + +.L80: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(AA), %xmm0 + movsd 4 * SIZE(AA), %xmm1 + movsd 0 * SIZE(BB), %xmm2 + movsd 8 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L85 + ALIGN_4 + +.L82: + mulsd %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movsd 1 * SIZE(AA), %xmm0 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 2 * SIZE(AA), %xmm0 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm6 + movsd 3 * SIZE(AA), %xmm0 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm7 + movsd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm3 + movsd 5 * SIZE(AA), %xmm1 + mulsd 10 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movsd 6 * SIZE(AA), %xmm1 + mulsd 12 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm6 + movsd 7 * SIZE(AA), %xmm1 + mulsd 14 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm7 + movsd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movsd ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L88 + +.L86: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movsd 2 * SIZE(BB), %xmm2 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addsd %xmm5, %xmm4 + addsd %xmm7, %xmm6 + addsd %xmm6, %xmm4 + + mulsd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + addsd %xmm0, %xmm4 +#endif + movsd %xmm4, 0 * SIZE(%esi) + ALIGN_4 + +.L999: + movl OLD_STACK, %esp + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_2x4_sse3.S b/kernel/x86/gemm_kernel_2x4_sse3.S new file mode 100644 index 0000000..e2732da --- /dev/null +++ b/kernel/x86/gemm_kernel_2x4_sse3.S @@ -0,0 +1,1635 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#ifdef PENTIUM4 +#define PREFETCH_R (8 * 4) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef PENTIUMM +#define PREFETCH_R (8 * 4) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addpd %xmm2, %xmm4; \ + movddup 1 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movddup 2 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 3 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + addpd %xmm2, %xmm7; \ + movddup 4 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 5 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movddup 6 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 7 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + addpd %xmm2, %xmm7; \ + movddup 16 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL3(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 9 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movddup 10 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 11 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + addpd %xmm3, %xmm7; \ + movddup 12 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL4(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 13 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movddup 14 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 15 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + addpd %xmm3, %xmm7; \ + movddup 24 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL5(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 17 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movddup 18 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 19 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + addpd %xmm2, %xmm7 + +#define KERNEL6(address) \ + movddup 20 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 21 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movddup 22 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 23 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + addpd %xmm2, %xmm7; \ + movddup 32 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL7(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 25 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movddup 26 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 27 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + addpd %xmm3, %xmm7; \ + movddup 28 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 29 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movddup 30 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 31 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + addpd %xmm3, %xmm7; \ + movddup 40 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax +#ifndef LEFT + negl %eax +#endif + movl %eax, KK +#endif + + leal (, LDC, SIZE), LDC + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_2 + +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sall $BASE_SHIFT + 2, %eax + leal (B, %eax), %eax + movl %eax, BX + + movl C, %esi # coffset = c + movl A, AA # aoffset = a + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), BB +#endif + + movl BX, %eax + prefetcht2 0 * SIZE(%eax) + subl $-4 * SIZE, BX + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + +#ifdef PENTIUM4 + prefetchnta 3 * SIZE(%esi) + prefetchnta 3 * SIZE(%esi, LDC, 1) + prefetchnta 3 * SIZE(%esi, LDC, 2) + prefetchnta 3 * SIZE(%esi, %eax, 1) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + +#ifdef CORE_PRESCOTT + andl $-8, %eax + sall $4, %eax + je .L15 + +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) +#if 1 + cmpl $128 * 8, %eax + jle .L12 + KERNEL1(16 * 8) + KERNEL2(16 * 8) + KERNEL3(16 * 8) + KERNEL4(16 * 8) + KERNEL5(16 * 8) + KERNEL6(16 * 8) + KERNEL7(16 * 8) + KERNEL8(16 * 8) + cmpl $128 * 9, %eax + jle .L12 + KERNEL1(16 * 9) + KERNEL2(16 * 9) + KERNEL3(16 * 9) + KERNEL4(16 * 9) + KERNEL5(16 * 9) + KERNEL6(16 * 9) + KERNEL7(16 * 9) + KERNEL8(16 * 9) + cmpl $128 * 10, %eax + jle .L12 + KERNEL1(16 * 10) + KERNEL2(16 * 10) + KERNEL3(16 * 10) + KERNEL4(16 * 10) + KERNEL5(16 * 10) + KERNEL6(16 * 10) + KERNEL7(16 * 10) + KERNEL8(16 * 10) + cmpl $128 * 11, %eax + jle .L12 + KERNEL1(16 * 11) + KERNEL2(16 * 11) + KERNEL3(16 * 11) + KERNEL4(16 * 11) + KERNEL5(16 * 11) + KERNEL6(16 * 11) + KERNEL7(16 * 11) + KERNEL8(16 * 11) + cmpl $128 * 12, %eax + jle .L12 + KERNEL1(16 * 12) + KERNEL2(16 * 12) + KERNEL3(16 * 12) + KERNEL4(16 * 12) + KERNEL5(16 * 12) + KERNEL6(16 * 12) + KERNEL7(16 * 12) + KERNEL8(16 * 12) + cmpl $128 * 13, %eax + jle .L12 + KERNEL1(16 * 13) + KERNEL2(16 * 13) + KERNEL3(16 * 13) + KERNEL4(16 * 13) + KERNEL5(16 * 13) + KERNEL6(16 * 13) + KERNEL7(16 * 13) + KERNEL8(16 * 13) + cmpl $128 * 14, %eax + jle .L12 + KERNEL1(16 * 14) + KERNEL2(16 * 14) + KERNEL3(16 * 14) + KERNEL4(16 * 14) + KERNEL5(16 * 14) + KERNEL6(16 * 14) + KERNEL7(16 * 14) + KERNEL8(16 * 14) + cmpl $128 * 15, %eax + jle .L12 + KERNEL1(16 * 15) + KERNEL2(16 * 15) + KERNEL3(16 * 15) + KERNEL4(16 * 15) + KERNEL5(16 * 15) + KERNEL6(16 * 15) + KERNEL7(16 * 15) + KERNEL8(16 * 15) +#else + addl $32 * 4 * SIZE, AA + addl $32 * 8 * SIZE, BB + subl $128 * 8, %eax + jg .L1X +#endif + +.L12: + leal (AA, %eax, 1), AA # * 16 + leal (BB, %eax, 2), BB # * 64 + +#else + + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 5 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 6 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 7 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 16 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm4 + movddup 9 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm5 + movddup 10 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm6 + movddup 11 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + movapd 6 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm4 + movddup 13 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm5 + movddup 14 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm6 + movddup 15 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movddup 24 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm4 + movddup 17 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm5 + movddup 18 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm6 + movddup 19 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movddup 20 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm4 + movddup 21 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm5 + movddup 22 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm6 + movddup 23 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movddup 32 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 25 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movddup 26 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 27 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 14 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 28 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 29 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movddup 30 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 31 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 40 * SIZE(BB), %xmm3 + + addl $32 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 +#endif + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movddup ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + SHUFPD_2 %xmm0, %xmm0 + SHUFPD_2 %xmm1, %xmm1 + SHUFPD_2 %xmm2, %xmm2 + SHUFPD_2 %xmm3, %xmm3 + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + mulpd %xmm3, %xmm6 + mulpd %xmm3, %xmm7 + + movl %esi, %eax + orl LDC, %eax + testl $15, %eax + NOBRANCH + jne .L18x + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movapd 0 * SIZE(%esi), %xmm0 + movapd 0 * SIZE(%esi, LDC, 1), %xmm1 + movapd 0 * SIZE(%esi, LDC, 2), %xmm2 + movapd 0 * SIZE(%esi, %eax, 1), %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 + addpd %xmm2, %xmm6 + addpd %xmm3, %xmm7 +#endif + + movapd %xmm4, 0 * SIZE(%esi) + movapd %xmm5, 0 * SIZE(%esi, LDC, 1) + movapd %xmm6, 0 * SIZE(%esi, LDC, 2) + movapd %xmm7, 0 * SIZE(%esi, %eax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L11 + jmp .L20 + ALIGN_4 + +.L18x: + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + movhpd 1 * SIZE(%esi, LDC, 1), %xmm1 + movsd 0 * SIZE(%esi, LDC, 2), %xmm2 + movhpd 1 * SIZE(%esi, LDC, 2), %xmm2 + movsd 0 * SIZE(%esi, %eax, 1), %xmm3 + movhpd 1 * SIZE(%esi, %eax, 1), %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 + addpd %xmm2, %xmm6 + addpd %xmm3, %xmm7 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + movsd %xmm5, 0 * SIZE(%esi, LDC, 1) + movhpd %xmm5, 1 * SIZE(%esi, LDC, 1) + movsd %xmm6, 0 * SIZE(%esi, LDC, 2) + movhpd %xmm6, 1 * SIZE(%esi, LDC, 2) + movsd %xmm7, 0 * SIZE(%esi, %eax, 1) + movhpd %xmm7, 1 * SIZE(%esi, %eax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L11 + ALIGN_3 + +.L20: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L29 + + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 4), BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 1 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movddup 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 3 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movddup 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 18 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 5 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 22 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movddup 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 26 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 7 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 30 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movddup 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd 34 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 36 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 9 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 38 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 48 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movddup 10 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 42 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 44 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 11 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 46 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 56 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movddup 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 50 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 52 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 13 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 54 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 64 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movddup 14 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 58 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 60 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 15 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 62 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 72 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movddup 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movddup ALPHA, %xmm3 + andl $15, %eax # if (k & 1) + BRANCH + je .L28 + +.L26: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + + decl %eax + jg .L26 + ALIGN_4 + +.L28: + leal (%esi, LDC, 1), %eax + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + +#ifndef TRMMKERNEL + +#ifdef PENTIUM4 + SHUFPD_2 %xmm0, %xmm0 + SHUFPD_2 %xmm1, %xmm1 +#endif + + movsd 0 * SIZE(%esi), %xmm0 + movhpd 0 * SIZE(%eax), %xmm0 + movsd 0 * SIZE(%esi, LDC, 2), %xmm1 + movhpd 0 * SIZE(%eax, LDC, 2), %xmm1 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 0 * SIZE(%eax) + movsd %xmm5, 0 * SIZE(%esi, LDC, 2) + movhpd %xmm5, 0 * SIZE(%eax, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leal (, LDC, 4), %eax + movl BB, B + addl %eax, C # c += 4 * ldc + decl J # j -- + jg .L10 + ALIGN_4 + +.L30: + testl $2, N + je .L60 + + movl C, %esi # coffset = c + movl A, AA # aoffset = a + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef HAVE_3DNOW + prefetchw 2 * SIZE(%esi) + prefetchw 2 * SIZE(%esi, LDC) +#endif + +#ifdef PENTIUM4 + prefetchnta 3 * SIZE(%esi) + prefetchnta 3 * SIZE(%esi, LDC) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 5 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 6 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 6 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 7 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 16 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 9 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm5 + movddup 10 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 11 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 13 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 14 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm5 + movddup 14 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 15 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 24 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movddup ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + +#ifndef TRMMKERNEL +#ifdef PENTIUM4 + SHUFPD_2 %xmm0, %xmm0 + SHUFPD_2 %xmm1, %xmm1 +#endif + + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + movhpd 1 * SIZE(%esi, LDC, 1), %xmm1 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + movsd %xmm5, 0 * SIZE(%esi, LDC, 1) + movhpd %xmm5, 1 * SIZE(%esi, LDC, 1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L41 + ALIGN_4 + +.L50: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L59 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + movddup 1 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 2 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movddup 3 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movddup 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movddup 5 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm0 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 6 * SIZE(AA), %xmm0 + mulpd 12 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movddup 7 * SIZE(AA), %xmm0 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movddup 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + movddup 9 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm4 + mulpd 18 * SIZE(BB), %xmm1 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 10 * SIZE(AA), %xmm1 + mulpd 20 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movddup 11 * SIZE(AA), %xmm1 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movddup 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + movddup 13 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 26 * SIZE(BB), %xmm1 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 14 * SIZE(AA), %xmm1 + mulpd 28 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movddup 15 * SIZE(AA), %xmm1 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movddup 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movddup ALPHA, %xmm3 + andl $15, %eax # if (k & 1) + BRANCH + je .L58 + +.L56: + mulpd %xmm0, %xmm2 + movddup 1 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + + mulpd %xmm3, %xmm4 + +#ifndef TRMMKERNEL +#ifdef PENTIUM4 + SHUFPD_2 %xmm0, %xmm0 +#endif + + movsd 0 * SIZE(%esi), %xmm0 + movhpd 0 * SIZE(%esi, LDC, 1), %xmm0 + + addpd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 0 * SIZE(%esi, LDC, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L59: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + movl BB, B + addl %eax, C # c += 4 * ldc + ALIGN_4 + +.L60: + testl $1, N + je .L999 + + movl C, %esi # coffset = c + movl A, AA # aoffset = a + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L80 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 4 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef PENTIUM4 + prefetchnta 3 * SIZE(%esi) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm2, %xmm0 + movddup 1 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm4 + movapd 16 * SIZE(AA), %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd 4 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd 6 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm7 + movddup 8 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm1 + movddup 5 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm4 + movapd 24 * SIZE(AA), %xmm1 + mulpd 10 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm5 + movddup 6 * SIZE(BB), %xmm3 + mulpd 12 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm6 + movddup 7 * SIZE(BB), %xmm3 + mulpd 14 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movddup ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + mulpd %xmm2, %xmm0 + movddup 1 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm4 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + + mulpd %xmm3, %xmm4 + +#ifndef TRMMKERNEL +#ifdef PENTIUM4 + SHUFPD_2 %xmm0, %xmm0 +#endif + + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + + addpd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 1), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L71 + ALIGN_4 + +.L80: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + je .L85 + ALIGN_4 + +.L82: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm1 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movddup ALPHA, %xmm3 + andl $15, %eax # if (k & 1) + BRANCH + je .L88 + +.L86: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + + haddpd %xmm4, %xmm4 + + mulsd %xmm3, %xmm4 + +#ifndef TRMMKERNEL +#ifdef PENTIUM4 + SHUFPD_2 %xmm0, %xmm0 +#endif + + movsd 0 * SIZE(%esi), %xmm0 + + addsd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_4x2_core2.S b/kernel/x86/gemm_kernel_4x2_core2.S new file mode 100644 index 0000000..641b5fc --- /dev/null +++ b/kernel/x86/gemm_kernel_4x2_core2.S @@ -0,0 +1,1304 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define OLD_M 4 + STACK + ARGS(%esi) +#define OLD_N 8 + STACK + ARGS(%esi) +#define OLD_K 12 + STACK + ARGS(%esi) +#define OLD_ALPHA 16 + STACK + ARGS(%esi) +#define OLD_A 24 + STACK + ARGS(%esi) +#define OLD_B 28 + STACK + ARGS(%esi) +#define OLD_C 32 + STACK + ARGS(%esi) +#define OLD_LDC 36 + STACK + ARGS(%esi) +#define OLD_OFFT 40 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define BX 40(%esp) +#define OLD_STACK 44(%esp) +#define OFFSET 48(%esp) +#define KK 52(%esp) +#define KKK 56(%esp) +#define BUFFER 256(%esp) + +#define PREFETCH_R (8 * 16 + 0) +#define PREFETCH_W (PREFETCH_R * 2) + +#define PREFETCHSIZE (8 * 7 + 4) +#define PREFETCH prefetcht0 + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define C1 %esi +#define I %ebx + + PROLOGUE + PROFCODE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + movl %esp, %esi # save old stack + + subl $512 + LOCAL_BUFFER_SIZE, %esp + andl $-4096, %esp # align stack + + STACK_TOUCHING + + movl OLD_M, %ebx + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + movsd OLD_ALPHA, %xmm3 +#ifdef TRMMKERNEL + movd OLD_OFFT, %mm4 +#endif + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + unpcklpd %xmm3, %xmm3 + movl OLD_B, B + movl OLD_C, %ebx + + movapd %xmm3, ALPHA + movl %ebx, C + movl OLD_LDC, LDC +#ifdef TRMMKERNEL + movd %mm4, OFFSET + movd %mm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + + leal (, LDC, SIZE), LDC + + sarl $1, %eax + movl %eax, J + jle .L40 + ALIGN_4 + +.L01: + leal 16 * SIZE + BUFFER, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sarl $2, %eax + jle .L05 + ALIGN_4 + +.L02: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + movapd %xmm2, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L02 + ALIGN_4 + +.L05: + movl K, %eax + andl $3, %eax + BRANCH + jle .L10 + ALIGN_4 + +.L06: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + addl $2 * SIZE, B + addl $4 * SIZE, BB + decl %eax + jne .L06 + ALIGN_4 + +.L10: + movl B, BX + + movl C, C1 + movl A, AA + movl M, I + sarl $2, I + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal 16 * SIZE + BUFFER, BB +#else + leal 16 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm3 + pxor %xmm6, %xmm6 + prefetcht0 3 * SIZE(C1) + pxor %xmm7, %xmm7 + prefetcht0 7 * SIZE(C1, LDC) + movapd %xmm1, %xmm2 + + movl BX, %eax + prefetcht0 (%eax) + subl $-8 * SIZE, %eax + movl %eax, BX + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movapd -12 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm7 + PADDING; + movapd %xmm2, %xmm1 + + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd -10 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm0 + addpd %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm6 + movapd -8 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + PADDING; + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + PADDING; + movapd %xmm1, %xmm2 + + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd -6 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd -6 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm6 + movapd -4 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm1 + movapd -4 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm7 + PADDING; + movapd %xmm2, %xmm1 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm4 + movapd -2 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm5 + movapd -2 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm6 + PADDING; + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm3, %xmm2 + movapd 8 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm7 + PADDING; + movapd %xmm1, %xmm2 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm1 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm7 + PADDING; + movapd %xmm2, %xmm1 + + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 6 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm0 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm6 + movapd 8 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + PADDING; + movapd %xmm1, %xmm2 + + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd 10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd 10 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm6 + movapd 12 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm1 + movapd 12 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm7 + PADDING; + movapd %xmm2, %xmm1 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm4 + movapd 14 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm3 + subl $-32 * SIZE, BB + addpd %xmm3, %xmm5 + movapd 14 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm6 + movapd -16 * SIZE(BB), %xmm1 + mulpd %xmm3, %xmm2 + movapd 24 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm7 + PADDING; + movapd %xmm1, %xmm2 + + subl $-32 * SIZE, AA + decl %eax + BRANCH + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movapd ALPHA, %xmm3 + + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + movapd -12 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + mulpd %xmm3, %xmm6 + mulpd %xmm3, %xmm7 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 1 * SIZE(C1), %xmm0 + movsd 2 * SIZE(C1), %xmm2 + movhpd 3 * SIZE(C1), %xmm2 + + movsd 0 * SIZE(C1, LDC), %xmm1 + movhpd 1 * SIZE(C1, LDC), %xmm1 + movsd 2 * SIZE(C1, LDC), %xmm3 + movhpd 3 * SIZE(C1, LDC), %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 + addpd %xmm2, %xmm6 + addpd %xmm3, %xmm7 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 1 * SIZE(C1) + movsd %xmm6, 2 * SIZE(C1) + movhpd %xmm6, 3 * SIZE(C1) + + movsd %xmm5, 0 * SIZE(C1, LDC) + movhpd %xmm5, 1 * SIZE(C1, LDC) + movsd %xmm7, 2 * SIZE(C1, LDC) + movhpd %xmm7, 3 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, C1 + decl I + jg .L11 + ALIGN_4 + +.L20: + movl M, I + testl $2, I + jle .L30 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 16 * SIZE + BUFFER, BB +#else + leal 16 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movapd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax + addl $2, %eax + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm6 + movapd 0 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm7 + movapd -12 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -6 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd -4 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -2 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 0 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm1 + mulpd 2 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm4 + movapd 4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm5 + movapd -6 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 6 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm6 + movapd 16 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm7 + movapd -4 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 10 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm5 + movapd -2 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 14 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm7 + movapd 8 * SIZE(AA), %xmm2 + + subl $-16 * SIZE, AA + addl $ 32 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: + movapd ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 1 * SIZE(C1), %xmm0 + + movsd 0 * SIZE(C1, LDC), %xmm1 + movhpd 1 * SIZE(C1, LDC), %xmm1 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 1 * SIZE(C1) + movsd %xmm5, 0 * SIZE(C1, LDC) + movhpd %xmm5, 1 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, C1 + ALIGN_4 + +.L30: + movl M, I + testl $1, I + jle .L39 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 16 * SIZE + BUFFER, BB +#else + leal 16 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movsd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -12 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movsd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + mulsd %xmm0, %xmm1 + mulsd -14 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm4 + movsd -12 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm5 + movsd -15 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm1 + mulsd -10 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm6 + movsd 0 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm7 + movsd -14 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd -6 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd -4 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd -13 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd -2 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movsd 8 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movsd -8 * SIZE(AA), %xmm0 + mulsd %xmm2, %xmm1 + mulsd 2 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm4 + movsd 4 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm5 + movsd -11 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm1 + mulsd 6 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm6 + movsd 16 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm7 + movsd -10 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm3 + mulsd 10 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm4 + movsd 12 * SIZE(BB), %xmm3 + addsd %xmm2, %xmm5 + movsd -9 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm3 + mulsd 14 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm6 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm2, %xmm7 + movsd -4 * SIZE(AA), %xmm2 + + subl $-8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: + movsd ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulsd %xmm0, %xmm1 + mulsd -14 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm4 + movsd -12 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm5 + movsd -15 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + + mulsd %xmm3, %xmm4 + mulsd %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movsd 0 * SIZE(C1, LDC), %xmm1 + + addsd %xmm0, %xmm4 + addsd %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movsd %xmm5, 0 * SIZE(C1, LDC) + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + addl %eax, C + decl J + jg .L01 + ALIGN_4 + +.L40: + movl N, %eax + testl $1, %eax + jle .L999 + ALIGN_4 + +.L41: + leal 16 * SIZE + BUFFER, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sarl $3, %eax + jle .L45 + ALIGN_4 + +.L42: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + movapd %xmm2, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: + movl K, %eax + andl $7, %eax + BRANCH + jle .L50 + ALIGN_4 + +.L46: + movddup -16 * SIZE(B), %xmm0 + + movapd %xmm0, -16 * SIZE(BB) + addl $1 * SIZE, B + addl $2 * SIZE, BB + decl %eax + jne .L46 + ALIGN_4 + +.L50: + movl C, C1 + movl A, AA + movl M, I + sarl $2, I + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 16 * SIZE + BUFFER, BB +#else + leal 16 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movapd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + prefetcht0 3 * SIZE(C1) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm4 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm6 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + mulpd -10 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm5 + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm7 + movapd -12 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm2 + mulpd -6 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm4 + movapd -4 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm6 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm2 + mulpd -2 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm5 + movapd 8 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm7 + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm3, %xmm0 + mulpd 2 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm6 + movapd -6 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm0 + mulpd 6 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm5 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movapd -4 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm2 + mulpd 10 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm4 + movapd 12 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm6 + movapd -2 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm2 + mulpd 14 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm5 + movapd 24 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm7 + movapd 8 * SIZE(BB), %xmm3 + + addl $ 32 * SIZE, AA + subl $-16 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: + movapd ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm4 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm6 + movapd -14 * SIZE(BB), %xmm1 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm6 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 1 * SIZE(C1), %xmm0 + movsd 2 * SIZE(C1), %xmm2 + movhpd 3 * SIZE(C1), %xmm2 + + addpd %xmm0, %xmm4 + addpd %xmm2, %xmm6 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 1 * SIZE(C1) + movsd %xmm6, 2 * SIZE(C1) + movhpd %xmm6, 3 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, C1 + decl I + jg .L51 + ALIGN_4 + +.L60: + movl M, I + testl $2, I + jle .L70 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 16 * SIZE + BUFFER, BB +#else + leal 16 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + movapd -8 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: + mulpd %xmm0, %xmm1 + movapd -14 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movapd -12 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd -10 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm2, %xmm3 + movapd -6 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm4 + movapd -6 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd -4 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm5 + movapd -4 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd -2 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm4 + movapd -2 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd 8 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + decl %eax + jne .L62 + ALIGN_4 + +.L65: + movapd ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulpd %xmm0, %xmm1 + movapd -14 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + addpd %xmm5, %xmm4 + mulpd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 1 * SIZE(C1), %xmm0 + + addpd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 1 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, C1 + ALIGN_4 + +.L70: + movl M, I + testl $1, I + jle .L79 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 16 * SIZE + BUFFER, BB +#else + leal 16 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + + movsd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -8 * SIZE(BB), %xmm3 + movsd -12 * SIZE(AA), %xmm2 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax + addl $1, %eax + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -14 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -14 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm5 + movsd -12 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -13 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -10 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -8 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm5 + movsd -0 * SIZE(BB), %xmm1 + mulsd %xmm2, %xmm3 + movsd -11 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm4 + movsd -6 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -10 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm5 + movsd -4 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -9 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm4 + movsd -2 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -4 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm5 + movsd 8 * SIZE(BB), %xmm3 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: + movsd ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -14 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addsd %xmm5, %xmm4 + mulsd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + addsd %xmm0, %xmm4 +#endif + movsd %xmm4, 0 * SIZE(C1) + ALIGN_4 + +.L79: + addl LDC, C + ALIGN_4 + + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_4x2_sse2.S b/kernel/x86/gemm_kernel_4x2_sse2.S new file mode 100644 index 0000000..2e67afa --- /dev/null +++ b/kernel/x86/gemm_kernel_4x2_sse2.S @@ -0,0 +1,1539 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE (8 * 4) + +#if !defined(HAVE_SSE2) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA 16 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define BX 40(%esp) +#define OLD_STACK 44(%esp) +#define OFFSET 48(%esp) +#define KK 52(%esp) +#define KKK 56(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define AA %edx +#define BB %ecx + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \ + movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \ + addpd %xmm0, %xmm5; \ + movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL4(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL5(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \ + addpd %xmm0, %xmm5; \ + movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 20 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL6(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 32 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL7(address) \ + movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 28 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 40 * SIZE + (address) * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movq STACK_ALPHA, %mm7 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC +#ifdef TRMMKERNEL + movd STACK_OFFT, %mm4 +#endif + + movq %mm7, 0 * SIZE + ALPHA + movq %mm7, 1 * SIZE + ALPHA + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK +#ifdef TRMMKERNEL + movd %mm4, OFFSET + movd %mm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + sall $BASE_SHIFT, LDC + sarl $1, %eax # j = (n >> 1) + movl %eax, J + jle .L100 + ALIGN_2 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + movl K, %eax + sarl $2, %eax + jle .L03 + ALIGN_2 + +.L02: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + subl $-16 * SIZE, %ecx + decl %eax + BRANCH + jne .L02 + ALIGN_2 + +.L03: + movl K, %eax + andl $3, %eax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + + addl $2 * SIZE, B + addl $4 * SIZE, %ecx + decl %eax + BRANCH + jne .L04 + ALIGN_4 + +.L05: + movl B, BX + + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + NOBRANCH + jle .L30 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movapd 0 * SIZE + BUFFER, %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE + BUFFER, %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#endif + + prefetchnta 3 * SIZE(%esi) + prefetchnta 3 * SIZE(%esi, LDC) + + movl BX, %eax + prefetcht2 0 * SIZE(%eax) + subl $-8 * SIZE, %eax + movl %eax, BX + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + + +#ifdef PENTIUM4 + andl $-8, %eax + NOBRANCH + je .L12 + sall $3, %eax + .align 8 + +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $64 * 1, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $64 * 2, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $64 * 3, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $64 * 4, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $64 * 5, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $64 * 6, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $64 * 7, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $64 * 4 * SIZE, AA + addl $64 * 4 * SIZE, BB + subl $64 * 8, %eax + BRANCH + jg .L1X + +.L11: + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB + +#else + sarl $3, %eax + je .L12 + +.L11: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + + addl $32 * SIZE, %ecx + addl $32 * SIZE, %edx + decl %eax + jne .L11 +#endif + +.L12: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movapd ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 0 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L13 + ALIGN_4 + +.L14: + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + mulpd %xmm3, %xmm6 + mulpd %xmm3, %xmm7 + + movl %esi, %eax + orl LDC, %eax + testl $15, %eax + NOBRANCH + jne .L18x + +#ifndef TRMMKERNEL + movapd 0 * SIZE(%esi), %xmm0 + movapd 2 * SIZE(%esi), %xmm1 + movapd 0 * SIZE(%esi, LDC), %xmm2 + movapd 2 * SIZE(%esi, LDC), %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm6 + addpd %xmm2, %xmm5 + addpd %xmm3, %xmm7 +#endif + + movapd %xmm4, 0 * SIZE(%esi) + movapd %xmm6, 2 * SIZE(%esi) + movapd %xmm5, 0 * SIZE(%esi, LDC) + movapd %xmm7, 2 * SIZE(%esi, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + BRANCH + jg .L10 + jmp .L30 + ALIGN_2 + +.L18x: +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhpd 3 * SIZE(%esi), %xmm1 + + movsd 0 * SIZE(%esi, LDC), %xmm2 + movhpd 1 * SIZE(%esi, LDC), %xmm2 + movsd 2 * SIZE(%esi, LDC), %xmm3 + movhpd 3 * SIZE(%esi, LDC), %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm6 + addpd %xmm2, %xmm5 + addpd %xmm3, %xmm7 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + movsd %xmm6, 2 * SIZE(%esi) + movhpd %xmm6, 3 * SIZE(%esi) + + movsd %xmm5, 0 * SIZE(%esi, LDC) + movhpd %xmm5, 1 * SIZE(%esi, LDC) + movsd %xmm7, 2 * SIZE(%esi, LDC) + movhpd %xmm7, 3 * SIZE(%esi, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + BRANCH + jg .L10 + ALIGN_2 + +.L30: + movl M, %ebx + testl $2, %ebx + jle .L50 + + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, %ecx + + movapd 0 * SIZE + BUFFER, %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE + BUFFER, %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + + pxor %xmm7, %xmm7 +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax + addl $2, %eax + movl %eax, KKK +#endif + sarl $3, %eax + je .L32 + +.L31: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd 18 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movapd 10 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movapd 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 26 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 14 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + BRANCH + decl %eax + jne .L31 + +.L32: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movapd ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L34 + +.L33: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L33 + ALIGN_4 + +.L34: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + +#ifndef TRMMKERNEL + SHUFPD_1 %xmm0, %xmm0 + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + + SHUFPD_1 %xmm2, %xmm2 + movsd 0 * SIZE(%esi, LDC), %xmm2 + movhpd 1 * SIZE(%esi, LDC), %xmm2 + + addpd %xmm0, %xmm4 + addpd %xmm2, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + + movsd %xmm5, 0 * SIZE(%esi, LDC) + movhpd %xmm5, 1 * SIZE(%esi, LDC) + + addl $2 * SIZE, %esi # coffset += 4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + ALIGN_2 + +.L50: + movl M, %ebx + testl $1, %ebx + jle .L99 + + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, %ecx + + movapd 0 * SIZE + BUFFER, %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE + BUFFER, %xmm3 + pxor %xmm6, %xmm6 + movsd 4 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movsd 4 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L52 + +.L51: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 2 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd 10 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd 12 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd 3 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm2 + mulsd 18 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movsd 20 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movsd 5 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm2 + mulsd 22 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movsd 32 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movsd 6 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + mulsd 26 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movsd 28 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movsd 7 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + mulsd 30 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movsd 40 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movsd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + BRANCH + decl %eax + jne .L51 + +.L52: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movsd ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L53 + ALIGN_4 + +.L54: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + + mulsd %xmm3, %xmm4 + mulsd %xmm3, %xmm5 + +#ifndef TRMMKERNEL + addsd 0 * SIZE(%esi), %xmm4 + addsd 0 * SIZE(%esi, LDC), %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movsd %xmm5, 0 * SIZE(%esi, LDC) + addl $1 * SIZE, %esi + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_2 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + addl %eax, C # c += 2 * ldc + BRANCH + decl J # j -- + jg .L01 + ALIGN_2 + +.L100: + movl N, %eax + testl $1, %eax + jle .L999 + ALIGN_2 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + + movl K, %eax + sarl $3, %eax + jle .L103 + ALIGN_4 + +.L102: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + BRANCH + jne .L102 + ALIGN_2 + +.L103: + movl K, %eax + andl $7, %eax + BRANCH + jle .L105 + ALIGN_2 + +.L104: + movsd 0 * SIZE(B), %xmm0 + + unpcklpd %xmm0, %xmm0 + + movapd %xmm0, 0 * SIZE(%ecx) + + addl $1 * SIZE, B + addl $2 * SIZE, %ecx + decl %eax + jne .L104 + ALIGN_4 + +.L105: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movapd 0 * SIZE + BUFFER, %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE + BUFFER, %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 +#endif + + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L112 + +.L111: + mulpd %xmm2, %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + movapd 2 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm0 + mulpd 6 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm5 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 10 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm4 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm6 + movapd 6 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 14 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm5 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movapd 16 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm0 + mulpd 18 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm4 + movapd 20 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm6 + movapd 10 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm0 + mulpd 22 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm5 + movapd 32 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movapd 12 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm1 + mulpd 26 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm4 + movapd 28 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm6 + movapd 14 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm1 + mulpd 30 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm5 + movapd 40 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movapd 24 * SIZE(BB), %xmm3 + + addl $32 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L111 + +.L112: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movapd ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + +.L113: + mulpd %xmm2, %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + movapd 2 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L113 + ALIGN_4 + +.L114: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm6 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + addpd %xmm0, %xmm4 + + movsd 2 * SIZE(%esi), %xmm1 + movhpd 3 * SIZE(%esi), %xmm1 + addpd %xmm1, %xmm6 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + unpckhpd %xmm4, %xmm4 + movsd %xmm4, 1 * SIZE(%esi) + + movsd %xmm6, 2 * SIZE(%esi) + unpckhpd %xmm6, %xmm6 + movsd %xmm6, 3 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, %esi # coffset += 4 + BRANCH + decl %ebx # i -- + jg .L110 + ALIGN_2 + +.L130: + movl M, %ebx + testl $2, %ebx + jle .L150 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movapd 0 * SIZE + BUFFER, %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE + BUFFER, %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 +#endif + + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L132 + +.L131: + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm1 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + BRANCH + decl %eax + jne .L131 + +.L132: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movapd ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L134 + +.L133: + movapd 0 * SIZE(AA), %xmm0 + mulpd 0 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + + addl $2 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L133 + ALIGN_4 + +.L134: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + + mulpd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + addpd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + unpckhpd %xmm4, %xmm4 + movsd %xmm4, 1 * SIZE(%esi) + + addl $2 * SIZE, %esi # coffset += 4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + ALIGN_2 + +.L150: + movl M, %ebx + testl $1, %ebx + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movapd 0 * SIZE + BUFFER, %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE + BUFFER, %xmm3 + pxor %xmm6, %xmm6 + movapd 4 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 4 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax + addl $1, %eax + movl %eax, KKK +#endif + sarl $3, %eax + je .L152 + +.L151: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + mulsd 2 * SIZE(BB), %xmm0 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + movsd 3 * SIZE(AA), %xmm0 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + movsd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm3 + movsd 5 * SIZE(AA), %xmm1 + addsd %xmm3, %xmm4 + mulsd 10 * SIZE(BB), %xmm1 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm4 + movsd 6 * SIZE(AA), %xmm1 + mulsd 12 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm4 + movsd 7 * SIZE(AA), %xmm1 + mulsd 14 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm4 + movsd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $16 * SIZE, BB + BRANCH + decl %eax + jne .L151 + +.L152: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movsd ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L154 + +.L153: + movsd 0 * SIZE(AA), %xmm0 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + + addl $1 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L153 + ALIGN_4 + +.L154: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + + mulsd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + addsd 0 * SIZE(%esi), %xmm4 +#endif + movsd %xmm4, 0 * SIZE(%esi) + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_2 + + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_4x4_barcelona.S b/kernel/x86/gemm_kernel_4x4_barcelona.S new file mode 100644 index 0000000..18b9a43 --- /dev/null +++ b/kernel/x86/gemm_kernel_4x4_barcelona.S @@ -0,0 +1,2151 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 + +#define OLD_M 4 + STACK(%esi) +#define OLD_N 8 + STACK(%esi) +#define OLD_K 12 + STACK(%esi) +#define OLD_ALPHA 16 + STACK(%esi) +#define OLD_A 20 + STACK(%esi) +#define OLD_B 24 + STACK(%esi) +#define OLD_C 28 + STACK(%esi) +#define OLD_LDC 32 + STACK(%esi) +#define STACK_OFFT 36 + STACK(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define BUFFER 128(%esp) + +#define PREFETCH prefetch +#define PREFETCHSIZE (16 * 17 + 0) + +#define RPREFETCHSIZE (16 * 9 + 0) +#define WPREFETCHSIZE (16 * 9 + 0) + +#define AA %edx +#define BB %ecx +#define LDC %ebp + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addps %xmm2, %xmm4; \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 1 * SIZE(AA); \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + subl $128 + LOCAL_BUFFER_SIZE, %esp + movl OLD_M, %ebx + andl $-1024, %esp # align stack + + STACK_TOUCHING + + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + movss OLD_ALPHA, %xmm3 +#ifdef TRMMKERNEL + movss STACK_OFFT, %xmm4 +#endif + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + shufps $0, %xmm3, %xmm3 + movl OLD_B, %edi + movl OLD_C, %ebx + movaps %xmm3, ALPHA + + movl %ebx, C + movl OLD_LDC, LDC +#ifdef TRMMKERNEL + movss %xmm4, OFFSET + movss %xmm4, KK +#ifndef LEFT + negl KK +#endif +#endif + leal (, LDC, SIZE), LDC + + sarl $2, %eax + movl %eax, J + jle .L40 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + + movl K, %eax + sarl $1, %eax + jle .L05 + ALIGN_4 + +.L02: + prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) + + movaps 0 * SIZE(%edi), %xmm3 + movaps 4 * SIZE(%edi), %xmm7 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + prefetchw (WPREFETCHSIZE + 16) * SIZE(%ecx) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) + + addl $ 8 * SIZE, %edi + subl $-32 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L05: + movl K, %eax + andl $1, %eax + BRANCH + jle .L10 + + movaps 0 * SIZE(%edi), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + + addl $4 * SIZE, %edi + ALIGN_4 + +.L10: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + + leal (%esi, LDC, 2), %eax + + prefetchw 3 * SIZE(%esi) + prefetchw 3 * SIZE(%esi, LDC) + prefetchw 3 * SIZE(%eax) + prefetchw 3 * SIZE(%eax, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $128 * 8 * SIZE, BB + addl $128 * 2 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + shufps $0xe4, %xmm0, %xmm0 + shufps $0xe4, %xmm1, %xmm1 + shufps $0xe4, %xmm2, %xmm2 + shufps $0xe4, %xmm3, %xmm3 + + mulps %xmm3, %xmm4 + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + mulps %xmm3, %xmm5 + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + movhps 2 * SIZE(%esi, LDC, 1), %xmm1 + mulps %xmm3, %xmm6 + movsd 0 * SIZE(%esi, LDC, 2), %xmm2 + movhps 2 * SIZE(%esi, LDC, 2), %xmm2 + mulps %xmm3, %xmm7 + movsd 0 * SIZE(%esi, %eax, 1), %xmm3 + movhps 2 * SIZE(%esi, %eax, 1), %xmm3 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 + addps %xmm2, %xmm6 + addps %xmm3, %xmm7 +#else + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + mulps %xmm3, %xmm6 + mulps %xmm3, %xmm7 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + movsd %xmm5, 0 * SIZE(%esi, LDC, 1) + movhps %xmm5, 2 * SIZE(%esi, LDC, 1) + movsd %xmm6, 0 * SIZE(%esi, LDC, 2) + movhps %xmm6, 2 * SIZE(%esi, LDC, 2) + movsd %xmm7, 0 * SIZE(%esi, %eax, 1) + movhps %xmm7, 2 * SIZE(%esi, %eax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + testl $2, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movsd 8 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movsd 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movsd 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 32 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movsd 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movsd 48 * SIZE(BB), %xmm3 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 36 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movsd 40 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movsd 44 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 64 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 52 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movsd 56 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movsd 60 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movsd 80 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 68 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movsd 72 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movsd 76 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movsd 96 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 84 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movsd 88 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movsd 92 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movsd 112 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 100 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movsd 104 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movsd 108 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movsd 128 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 116 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movsd 120 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movsd 124 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movsd 144 * SIZE(BB), %xmm3 + + addl $ 16 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movsd 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 16 * SIZE(BB), %xmm2 + + addl $ 2 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + mulps %xmm3, %xmm4 + movsd 0 * SIZE(%esi), %xmm0 + mulps %xmm3, %xmm5 + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + mulps %xmm3, %xmm6 + movsd 0 * SIZE(%esi, LDC, 2), %xmm2 + mulps %xmm3, %xmm7 + movsd 0 * SIZE(%esi, %eax, 1), %xmm3 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 + addps %xmm2, %xmm6 + addps %xmm3, %xmm7 +#else + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + mulps %xmm3, %xmm6 + mulps %xmm3, %xmm7 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movsd %xmm5, 0 * SIZE(%esi, LDC, 1) + movsd %xmm6, 0 * SIZE(%esi, LDC, 2) + movsd %xmm7, 0 * SIZE(%esi, %eax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + ALIGN_4 + +.L30: + testl $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB + leal (BB, %eax, 8), BB +#endif + + movss 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movss 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movss 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 8 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 1 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm3 + addss %xmm3, %xmm4 + movss 20 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm5 + movss 24 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + movss 36 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 40 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 44 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 3 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm3 + addss %xmm3, %xmm4 + movss 52 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm5 + movss 56 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + mulss 60 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + + mulss %xmm1, %xmm2 + addss %xmm2, %xmm4 + movss 68 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + addss %xmm2, %xmm5 + movss 72 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + mulss 76 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 96 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 5 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm3 + addss %xmm3, %xmm4 + movss 84 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + addss %xmm3, %xmm5 + movss 88 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + mulss 92 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 112 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm2 + addss %xmm2, %xmm4 + movss 100 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + addss %xmm2, %xmm5 + movss 104 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + mulss 108 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 128 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 7 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm3 + addss %xmm3, %xmm4 + movss 116 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + addss %xmm3, %xmm5 + movss 120 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + mulss 124 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 144 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 8 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 16 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 1 * SIZE(AA), %xmm0 + + addl $ 1 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + mulss %xmm3, %xmm4 + movss 0 * SIZE(%esi), %xmm0 + mulss %xmm3, %xmm5 + movss 0 * SIZE(%esi, LDC, 1), %xmm1 + mulss %xmm3, %xmm6 + movss 0 * SIZE(%esi, LDC, 2), %xmm2 + mulss %xmm3, %xmm7 + movss 0 * SIZE(%esi, %eax, 1), %xmm3 + + addss %xmm0, %xmm4 + addss %xmm1, %xmm5 + addss %xmm2, %xmm6 + addss %xmm3, %xmm7 +#else + mulss %xmm3, %xmm4 + mulss %xmm3, %xmm5 + mulss %xmm3, %xmm6 + mulss %xmm3, %xmm7 +#endif + + movss %xmm4, 0 * SIZE(%esi) + movss %xmm5, 0 * SIZE(%esi, LDC, 1) + movss %xmm6, 0 * SIZE(%esi, LDC, 2) + movss %xmm7, 0 * SIZE(%esi, %eax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leal (, LDC, 4), %eax + addl %eax, C # c += 4 * ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L40: + testl $2, N + je .L80 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + leal BUFFER, %ecx + sarl $2, %eax + jle .L45 + ALIGN_4 + +.L42: + prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) + + movaps 0 * SIZE(%edi), %xmm3 + movaps 4 * SIZE(%edi), %xmm7 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + prefetchw (WPREFETCHSIZE + 16) * SIZE(%ecx) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) + + addl $ 8 * SIZE, %edi + subl $-32 * SIZE, %ecx + decl %eax + jne .L42 + ALIGN_4 + +.L45: + movl K, %eax + andl $3, %eax + BRANCH + jle .L50 + ALIGN_4 + +.L46: + movsd 0 * SIZE(%edi), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + + addl $2 * SIZE, %edi + addl $8 * SIZE, %ecx + decl %eax + jne .L46 + ALIGN_4 + +.L50: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + prefetchw 3 * SIZE(%esi) + prefetchw 3 * SIZE(%esi, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm3 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 32 * SIZE(AA), %xmm0 + +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + mulps %xmm1, %xmm2 + mulps 36 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 40 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 20 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm2 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm3 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + movhps 2 * SIZE(%esi, LDC, 1), %xmm1 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + movsd %xmm5, 0 * SIZE(%esi, LDC, 1) + movhps %xmm5, 2 * SIZE(%esi, LDC, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L51 + ALIGN_4 + +.L60: + testl $2, M + je .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(AA), %xmm0 + movsd 8 * SIZE(AA), %xmm1 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movsd 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 32 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movsd 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movsd 48 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movsd 40 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movsd 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movsd 64 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 56 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movsd 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movsd 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movsd %xmm5, 0 * SIZE(%esi, LDC, 1) + addl $2 * SIZE, %esi # coffset += 2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + ALIGN_4 + +.L70: + testl $1, M + je .L79 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movss 0 * SIZE(AA), %xmm0 + movss 4 * SIZE(AA), %xmm1 + movss 0 * SIZE(BB), %xmm2 + movss 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulss %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 20 * SIZE(BB), %xmm0 + addss %xmm3, %xmm4 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm5 + movss 3 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + mulss 36 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 40 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 5 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm2 + mulss 44 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 52 * SIZE(BB), %xmm1 + addss %xmm3, %xmm4 + movss 56 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 7 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 60 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + + addl $ 1 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addss %xmm6, %xmm4 + addss %xmm7, %xmm5 + + mulss %xmm3, %xmm4 + mulss %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movss 0 * SIZE(%esi), %xmm0 + movss 0 * SIZE(%esi, LDC, 1), %xmm1 + + addss %xmm0, %xmm4 + addss %xmm1, %xmm5 +#endif + + movss %xmm4, 0 * SIZE(%esi) + movss %xmm5, 0 * SIZE(%esi, LDC, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leal (, LDC, 2), %eax + addl %eax, C + ALIGN_4 + +.L80: + testl $1, N + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + + movl K, %eax + sarl $3, %eax + jle .L85 + ALIGN_4 + +.L82: + prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) + + movups 0 * SIZE(%edi), %xmm3 + movups 4 * SIZE(%edi), %xmm7 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + prefetchw (WPREFETCHSIZE + 16) * SIZE(%ecx) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) + + addl $ 8 * SIZE, %edi + subl $-32 * SIZE, %ecx + decl %eax + jne .L82 + ALIGN_4 + +.L85: + movl K, %eax + andl $7, %eax + BRANCH + jle .L90 + ALIGN_4 + +.L86: + movss 0 * SIZE(%edi), %xmm3 + pshufd $0x00, %xmm3, %xmm0 + movaps %xmm0, 0 * SIZE(%ecx) + + addl $1 * SIZE, %edi + addl $4 * SIZE, %ecx + decl %eax + jne .L86 + ALIGN_4 + +.L90: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + prefetchw 3 * SIZE(%esi) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + mulps 8 * SIZE(BB), %xmm0 + addps %xmm0, %xmm6 + movaps 12 * SIZE(AA), %xmm0 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 48 * SIZE(BB), %xmm3 + mulps 20 * SIZE(BB), %xmm1 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + mulps 24 * SIZE(BB), %xmm1 + addps %xmm1, %xmm6 + movaps 28 * SIZE(AA), %xmm1 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(AA), %xmm0 + movaps 4 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + mulps %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L91 + ALIGN_4 + +.L100: + testl $2, M + je .L110 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(AA), %xmm0 + movsd 8 * SIZE(AA), %xmm1 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L105 + ALIGN_4 + +.L102: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + movsd 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + movsd 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movsd 48 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L105: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movsd 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L106 + ALIGN_4 + +.L108: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + + mulps %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + + addps %xmm0, %xmm4 +#endif + movsd %xmm4, 0 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + ALIGN_4 + +.L110: + testl $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movss 0 * SIZE(AA), %xmm0 + movss 4 * SIZE(AA), %xmm1 + movss 0 * SIZE(BB), %xmm2 + movss 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L115 + ALIGN_4 + +.L112: + mulss %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 32 * SIZE(BB), %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm0, %xmm5 + movss 2 * SIZE(AA), %xmm0 + mulss 8 * SIZE(BB), %xmm0 + addss %xmm0, %xmm6 + movss 3 * SIZE(AA), %xmm0 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + movss 48 * SIZE(BB), %xmm3 + mulss 20 * SIZE(BB), %xmm1 + addss %xmm1, %xmm5 + movss 6 * SIZE(AA), %xmm1 + mulss 24 * SIZE(BB), %xmm1 + addss %xmm1, %xmm6 + movss 7 * SIZE(AA), %xmm1 + mulss 28 * SIZE(BB), %xmm1 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 4 * SIZE(BB), %xmm2 + + addl $ 1 * SIZE, AA + addl $ 4 * SIZE, BB + decl %eax + jg .L116 + ALIGN_4 + +.L118: + addss %xmm5, %xmm4 + addss %xmm7, %xmm6 + addss %xmm6, %xmm4 + mulss %xmm3, %xmm4 +#ifndef TRMMKERNEL + movss 0 * SIZE(%esi), %xmm0 + addss %xmm0, %xmm4 +#endif + movss %xmm4, 0 * SIZE(%esi) + ALIGN_4 + +.L999: + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_4x4_penryn.S b/kernel/x86/gemm_kernel_4x4_penryn.S new file mode 100644 index 0000000..6775d1d --- /dev/null +++ b/kernel/x86/gemm_kernel_4x4_penryn.S @@ -0,0 +1,1831 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 20 + STACK + ARGS(%esp) +#define ARG_B 24 + STACK + ARGS(%esp) +#define C 28 + STACK + ARGS(%esp) +#define ARG_LDC 32 + STACK + ARGS(%esp) +#define OFFSET 36 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#ifdef NANO +#define PREFETCHSIZE (16 * 3 + 8) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + +#ifdef NEHALEM +#define PREFETCHSIZE (16 * 1 - 8) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + +#ifndef PREFETCH +#define PREFETCH prefetcht0 +#endif + +#ifndef PREFETCHW +#define PREFETCHW prefetcht0 +#endif + +#ifndef PREFETCHB +#define PREFETCHB prefetcht0 +#endif + +#ifndef PREFETCHSIZE +#define PREFETCHSIZE (16 * 13 + 8) +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define C1 %esi +#define I %ebx + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax +#ifndef LEFT + negl %eax +#endif + movl %eax, KK +#endif + + subl $-32 * SIZE, A + subl $-32 * SIZE, B + + leal (, LDC, SIZE), LDC + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L40 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sall $BASE_SHIFT + 2, %eax + leal (B, %eax), %eax + movl %eax, BX + + movl C, C1 + movl A, AA + + movl M, I + sarl $2, I + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + + movl BX, %eax + PREFETCHB -32 * SIZE(%eax) + subl $-16 * SIZE, %eax + movl %eax, BX + + leal (C1, LDC, 2), %eax + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + xorps %xmm4, %xmm4 + PREFETCHW 3 * SIZE(C1) + xorps %xmm5, %xmm5 + PREFETCHW 7 * SIZE(C1, LDC) + xorps %xmm6, %xmm6 + PREFETCHW 3 * SIZE(%eax) + xorps %xmm7, %xmm7 + PREFETCHW 7 * SIZE(%eax, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 +#ifndef NEHALEM + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) +#endif + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + subl $-32 * SIZE, BB + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + subl $-32 * SIZE, AA + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -32 * SIZE(AA), %xmm0 + + subl $1, %eax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + addps %xmm3, %xmm6 + addps %xmm2, %xmm7 + + movss ALPHA, %xmm3 + + pshufd $0x39, %xmm5, %xmm2 + pshufd $0x4e, %xmm6, %xmm0 + pshufd $0x93, %xmm7, %xmm7 + + movaps %xmm4, %xmm6 + unpcklps %xmm0, %xmm4 + unpckhps %xmm0, %xmm6 + + movaps %xmm2, %xmm1 + unpcklps %xmm7, %xmm2 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm5 + unpcklps %xmm2, %xmm4 + unpckhps %xmm2, %xmm5 + + movaps %xmm6, %xmm7 + unpcklps %xmm1, %xmm6 + unpckhps %xmm1, %xmm7 + + pshufd $0x93, %xmm5, %xmm5 + pshufd $0x4e, %xmm6, %xmm6 + pshufd $0x39, %xmm7, %xmm7 + + shufps $0, %xmm3, %xmm3 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + mulps %xmm3, %xmm6 + mulps %xmm3, %xmm7 + + leal (C1, LDC, 2), %eax + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhps 2 * SIZE(C1), %xmm0 + movsd 0 * SIZE(C1, LDC), %xmm1 + movhps 2 * SIZE(C1, LDC), %xmm1 + + movsd 0 * SIZE(%eax), %xmm2 + movhps 2 * SIZE(%eax), %xmm2 + movsd 0 * SIZE(%eax, LDC), %xmm3 + movhps 2 * SIZE(%eax, LDC), %xmm3 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 + addps %xmm2, %xmm6 + addps %xmm3, %xmm7 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhps %xmm4, 2 * SIZE(C1) + movsd %xmm5, 0 * SIZE(C1, LDC) + movhps %xmm5, 2 * SIZE(C1, LDC) + + movsd %xmm6, 0 * SIZE(%eax) + movhps %xmm6, 2 * SIZE(%eax) + movsd %xmm7, 0 * SIZE(%eax, LDC) + movhps %xmm7, 2 * SIZE(%eax, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, C1 + decl I + jg .L11 + ALIGN_4 + +.L20: + movl M, I + testl $2, I + jle .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + + pshufd $0xee, %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + pshufd $0xfa, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + + pshufd $0xee, %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + pshufd $0xfa, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + + pshufd $0xee, %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + pshufd $0xfa, %xmm1, %xmm3 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + + pshufd $0xee, %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + pshufd $0xfa, %xmm1, %xmm3 + movaps 0 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + subl $-16 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L28 + ALIGN_4 + +.L26: + pshufd $0x44, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + movss ALPHA, %xmm1 + + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + shufps $0, %xmm1, %xmm1 + mulps %xmm1, %xmm4 + mulps %xmm1, %xmm5 + + leal (C1, LDC, 2), %eax + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhps 0 * SIZE(C1, LDC), %xmm0 + + movsd 0 * SIZE(%eax), %xmm1 + movhps 0 * SIZE(%eax, LDC), %xmm1 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhps %xmm4, 0 * SIZE(C1, LDC) + + movsd %xmm5, 0 * SIZE(%eax) + movhps %xmm5, 0 * SIZE(%eax, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, C1 + ALIGN_4 + +.L30: + movl M, I + testl $1, I + jle .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps 0 * SIZE(BB), %xmm1 + + subl $ -8 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L38 + ALIGN_4 + +.L36: + pshufd $0x00, %xmm0, %xmm2 + movss -31 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + movss ALPHA, %xmm1 + + shufps $0, %xmm1, %xmm1 + mulps %xmm1, %xmm4 + + pshufd $0xff, %xmm4, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + pshufd $0x55, %xmm4, %xmm5 + pshufd $0x00, %xmm4, %xmm4 + + leal (C1, LDC, 2), %eax + +#ifndef TRMMKERNEL + movss 0 * SIZE(C1), %xmm0 + movss 0 * SIZE(C1, LDC), %xmm1 + + movss 0 * SIZE(%eax), %xmm2 + movss 0 * SIZE(%eax, LDC), %xmm3 + + addss %xmm0, %xmm4 + addss %xmm1, %xmm5 + addss %xmm2, %xmm6 + addss %xmm3, %xmm7 +#endif + + movss %xmm4, 0 * SIZE(C1) + movss %xmm5, 0 * SIZE(C1, LDC) + movss %xmm6, 0 * SIZE(%eax) + movss %xmm7, 0 * SIZE(%eax, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + movl BB, B + + leal (, LDC, 4), %eax + addl %eax, C + decl J + jg .L01 + ALIGN_4 + +.L40: + movl N, %eax + testl $2, %eax + jle .L70 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, C1 + movl A, AA + + movl M, I + sarl $2, I + jle .L50 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(C1) + pxor %xmm5, %xmm5 + prefetcht0 3 * SIZE(C1, LDC) + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps 0 * SIZE(AA), %xmm0 + + subl $-32 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L48 + ALIGN_4 + +.L46: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + movss ALPHA, %xmm1 + + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + addps %xmm2, %xmm4 + addps %xmm3, %xmm5 + + shufps $0, %xmm1, %xmm1 + mulps %xmm1, %xmm4 + mulps %xmm1, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhps 2 * SIZE(C1), %xmm0 + movsd 0 * SIZE(C1, LDC), %xmm1 + movhps 2 * SIZE(C1, LDC), %xmm1 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhps %xmm4, 2 * SIZE(C1) + movsd %xmm5, 0 * SIZE(C1, LDC) + movhps %xmm5, 2 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, C1 + decl I + jg .L41 + ALIGN_4 + +.L50: + movl M, I + testl $2, I + jle .L60 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm3, %xmm3 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L58 + ALIGN_4 + +.L56: + pshufd $0x44, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + movss ALPHA, %xmm1 + + addps %xmm3, %xmm4 + addps %xmm5, %xmm4 + + shufps $0, %xmm1, %xmm1 + mulps %xmm1, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhps 0 * SIZE(C1, LDC), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhps %xmm4, 0 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, C1 + ALIGN_4 + +.L60: + movl M, I + testl $1, I + jle .L69 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 2), BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd -32 * SIZE(BB), %xmm1 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -28 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -26 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -24 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -22 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -20 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -18 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -16 * SIZE(BB), %xmm1 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L68 + ALIGN_4 + +.L66: + pshufd $0x00, %xmm0, %xmm2 + movss -31 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + movss ALPHA, %xmm1 + + addps %xmm5, %xmm4 + shufps $0, %xmm1, %xmm1 + mulps %xmm1, %xmm4 + + pshufd $0x55, %xmm4, %xmm5 + pshufd $0x00, %xmm4, %xmm4 + +#ifndef TRMMKERNEL + movss 0 * SIZE(C1), %xmm0 + movss 0 * SIZE(C1, LDC), %xmm1 + + addss %xmm0, %xmm4 + addss %xmm1, %xmm5 +#endif + + movss %xmm4, 0 * SIZE(C1) + movss %xmm5, 0 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L69: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movl BB, B + + leal (, LDC, 2), %eax + addl %eax, C + ALIGN_4 + +.L70: + movl N, %eax + testl $1, %eax + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, C1 + movl A, AA + + movl M, I + sarl $2, I + jle .L80 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movsd -32 * SIZE(BB), %xmm1 + + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(C1) + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -26 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + subl $-32 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L78 + ALIGN_4 + +.L76: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + movss -31 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + movss ALPHA, %xmm1 + + addps %xmm2, %xmm4 + addps %xmm5, %xmm4 + shufps $0, %xmm1, %xmm1 + mulps %xmm1, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhps 2 * SIZE(C1), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhps %xmm4, 2 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + addl %eax, BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, C1 + decl I + jg .L71 + ALIGN_4 + +.L80: + movl M, I + testl $2, I + jle .L90 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm3, %xmm3 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L85 + ALIGN_4 + +.L82: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -22 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -26 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -20 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -18 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + subl $-16 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L82 + ALIGN_4 + +.L85: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L88 + ALIGN_4 + +.L86: + pshufd $0x00, %xmm1, %xmm2 + movss -31 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + movss ALPHA, %xmm1 + + addps %xmm5, %xmm4 + shufps $0, %xmm1, %xmm1 + mulps %xmm1, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + addps %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, C1 + ALIGN_4 + +.L90: + movl M, I + testl $1, I + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd -32 * SIZE(BB), %xmm1 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -28 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -28 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -26 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -26 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -24 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -24 * SIZE(BB), %xmm1 + + subl $-8 * SIZE, AA + subl $-8 * SIZE, BB + + subl $1, %eax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulss %xmm0, %xmm1 + movss -31 * SIZE(AA), %xmm0 + addss %xmm1, %xmm4 + movss -31 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: + movss ALPHA, %xmm1 + + haddps %xmm4, %xmm4 + mulss %xmm1, %xmm4 + +#ifndef TRMMKERNEL + movss 0 * SIZE(C1), %xmm0 + + addss %xmm0, %xmm4 +#endif + + movss %xmm4, 0 * SIZE(C1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_4x4_sse.S b/kernel/x86/gemm_kernel_4x4_sse.S new file mode 100644 index 0000000..b360a58 --- /dev/null +++ b/kernel/x86/gemm_kernel_4x4_sse.S @@ -0,0 +1,2589 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 + +#define OLD_M 4 + STACK(%esi) +#define OLD_N 8 + STACK(%esi) +#define OLD_K 12 + STACK(%esi) +#define OLD_ALPHA 16 + STACK(%esi) +#define OLD_A 20 + STACK(%esi) +#define OLD_B 24 + STACK(%esi) +#define OLD_C 28 + STACK(%esi) +#define OLD_LDC 32 + STACK(%esi) +#define STACK_OFFT 36 + STACK(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define BX 40(%esp) +#define OLD_STACK 44(%esp) +#define OFFSET 48(%esp) +#define KK 52(%esp) +#define KKK 56(%esp) +#define BUFFER 128(%esp) + +#ifdef ATHLON +#define PREFETCH prefetch +#define PREFETCHSIZE 64 +#endif + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHSIZE (16 * 10 + 8) +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp + +#if defined(OPTERON) || !defined(HAVE_SSE2) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + +#if defined(OPTERON) || defined(BARCELONA) +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 1 * SIZE(AA); \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; +#endif + +#ifdef PENTIUM4 +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1 +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + subl $128 + LOCAL_BUFFER_SIZE, %esp + movl OLD_M, %ebx + andl $-1024, %esp # align stack + + STACK_TOUCHING + + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + movss OLD_ALPHA, %xmm3 +#ifdef TRMMKERNEL + movss STACK_OFFT, %xmm4 +#endif + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + shufps $0, %xmm3, %xmm3 + movl OLD_B, %edi + movl OLD_C, %ebx + movaps %xmm3, ALPHA + + movl %ebx, C + movl OLD_LDC, LDC +#ifdef TRMMKERNEL + movss %xmm4, OFFSET + movss %xmm4, KK +#ifndef LEFT + negl KK +#endif +#endif + leal (, LDC, SIZE), LDC + + sarl $2, %eax + movl %eax, J + jle .L40 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + + movl K, %eax + sarl $1, %eax + jle .L05 + ALIGN_4 + +.L02: +#ifdef HAVE_SSE2 + movss 0 * SIZE(%edi), %xmm0 + movss 1 * SIZE(%edi), %xmm1 + movss 2 * SIZE(%edi), %xmm2 + movss 3 * SIZE(%edi), %xmm3 + movss 4 * SIZE(%edi), %xmm4 + movss 5 * SIZE(%edi), %xmm5 + movss 6 * SIZE(%edi), %xmm6 + movss 7 * SIZE(%edi), %xmm7 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) +#else + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + movd 4 * SIZE(%edi), %mm4 + movd 5 * SIZE(%edi), %mm5 + movd 6 * SIZE(%edi), %mm6 + movd 7 * SIZE(%edi), %mm7 + + movd %mm0, 0 * SIZE(%ecx) + movd %mm0, 1 * SIZE(%ecx) + movd %mm0, 2 * SIZE(%ecx) + movd %mm0, 3 * SIZE(%ecx) + movd %mm1, 4 * SIZE(%ecx) + movd %mm1, 5 * SIZE(%ecx) + movd %mm1, 6 * SIZE(%ecx) + movd %mm1, 7 * SIZE(%ecx) + movd %mm2, 8 * SIZE(%ecx) + movd %mm2, 9 * SIZE(%ecx) + movd %mm2, 10 * SIZE(%ecx) + movd %mm2, 11 * SIZE(%ecx) + movd %mm3, 12 * SIZE(%ecx) + movd %mm3, 13 * SIZE(%ecx) + movd %mm3, 14 * SIZE(%ecx) + movd %mm3, 15 * SIZE(%ecx) + movd %mm4, 16 * SIZE(%ecx) + movd %mm4, 17 * SIZE(%ecx) + movd %mm4, 18 * SIZE(%ecx) + movd %mm4, 19 * SIZE(%ecx) + movd %mm5, 20 * SIZE(%ecx) + movd %mm5, 21 * SIZE(%ecx) + movd %mm5, 22 * SIZE(%ecx) + movd %mm5, 23 * SIZE(%ecx) + movd %mm6, 24 * SIZE(%ecx) + movd %mm6, 25 * SIZE(%ecx) + movd %mm6, 26 * SIZE(%ecx) + movd %mm6, 27 * SIZE(%ecx) + movd %mm7, 28 * SIZE(%ecx) + movd %mm7, 29 * SIZE(%ecx) + movd %mm7, 30 * SIZE(%ecx) + movd %mm7, 31 * SIZE(%ecx) +#endif + +#ifdef PENTIUM4 + prefetcht2 112 * SIZE(%ecx) +#endif + +#if defined(OPTERON) || defined(BARCELONA) + prefetchnta 80 * SIZE(%edi) + prefetchw 112 * SIZE(%ecx) + prefetchw 120 * SIZE(%ecx) +#endif + + addl $ 8 * SIZE, %edi + addl $32 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L05: + movl K, %eax + andl $1, %eax + BRANCH + jle .L10 + + +#ifdef HAVE_SSE2 + movss 0 * SIZE(%edi), %xmm0 + movss 1 * SIZE(%edi), %xmm1 + movss 2 * SIZE(%edi), %xmm2 + movss 3 * SIZE(%edi), %xmm3 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) +#else + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + + movd %mm0, 0 * SIZE(%ecx) + movd %mm0, 1 * SIZE(%ecx) + movd %mm0, 2 * SIZE(%ecx) + movd %mm0, 3 * SIZE(%ecx) + movd %mm1, 4 * SIZE(%ecx) + movd %mm1, 5 * SIZE(%ecx) + movd %mm1, 6 * SIZE(%ecx) + movd %mm1, 7 * SIZE(%ecx) + movd %mm2, 8 * SIZE(%ecx) + movd %mm2, 9 * SIZE(%ecx) + movd %mm2, 10 * SIZE(%ecx) + movd %mm2, 11 * SIZE(%ecx) + movd %mm3, 12 * SIZE(%ecx) + movd %mm3, 13 * SIZE(%ecx) + movd %mm3, 14 * SIZE(%ecx) + movd %mm3, 15 * SIZE(%ecx) +#endif + addl $4 * SIZE, %edi + ALIGN_4 + +.L10: + movl %edi, BX + + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + + movl BX, %eax + +#ifdef HAVE_SSE + + prefetcht2 0 * SIZE(%eax) + prefetcht2 4 * SIZE(%eax) + +#if L2_SIZE > 262144 + + subl $-8 * SIZE, BX + +#elif L2_SIZE > 131072 + + prefetcht2 8 * SIZE(%eax) + prefetcht2 12 * SIZE(%eax) + + + subl $-16 * SIZE, BX +#else + prefetcht2 16 * SIZE(%eax) + prefetcht2 20 * SIZE(%eax) + prefetcht2 24 * SIZE(%eax) + prefetcht2 28 * SIZE(%eax) + + subl $-32 * SIZE, BX +#endif +#endif + + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + +#if defined(OPTERON) || defined(BARCELONA) + prefetchw 4 * SIZE(%esi) + prefetchw 4 * SIZE(%esi, LDC) + prefetchw 4 * SIZE(%esi, LDC, 2) + prefetchw 4 * SIZE(%esi, %eax) +#endif + +#ifdef PENTIUM4 + prefetchnta 4 * SIZE(%esi) + prefetchnta 4 * SIZE(%esi, LDC) + prefetchnta 4 * SIZE(%esi, LDC, 2) + prefetchnta 4 * SIZE(%esi, %eax) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + +#if 1 + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $128 * 8 * SIZE, BB + addl $128 * 2 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB + ALIGN_4 +#else + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + + addl $128 * SIZE, BB + addl $32 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 +#endif + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + shufps $0xe4, %xmm0, %xmm0 + shufps $0xe4, %xmm1, %xmm1 + shufps $0xe4, %xmm2, %xmm2 + shufps $0xe4, %xmm3, %xmm3 + + mulps %xmm3, %xmm4 + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + mulps %xmm3, %xmm5 + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + movhps 2 * SIZE(%esi, LDC, 1), %xmm1 + mulps %xmm3, %xmm6 + movsd 0 * SIZE(%esi, LDC, 2), %xmm2 + movhps 2 * SIZE(%esi, LDC, 2), %xmm2 + mulps %xmm3, %xmm7 + movsd 0 * SIZE(%esi, %eax, 1), %xmm3 + movhps 2 * SIZE(%esi, %eax, 1), %xmm3 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 + addps %xmm2, %xmm6 + addps %xmm3, %xmm7 +#else + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + mulps %xmm3, %xmm6 + mulps %xmm3, %xmm7 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + movlps %xmm5, 0 * SIZE(%esi, LDC, 1) + movhps %xmm5, 2 * SIZE(%esi, LDC, 1) + movlps %xmm6, 0 * SIZE(%esi, LDC, 2) + movhps %xmm6, 2 * SIZE(%esi, LDC, 2) + movlps %xmm7, 0 * SIZE(%esi, %eax, 1) + movhps %xmm7, 2 * SIZE(%esi, %eax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + testl $2, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 68 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 72 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 76 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 96 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 84 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 88 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 92 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 112 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 100 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 104 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 108 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 128 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 116 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 120 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 124 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 144 * SIZE(BB), %xmm3 + + addl $ 16 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 16 * SIZE(BB), %xmm2 + + addl $ 2 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + mulps %xmm3, %xmm4 +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(%esi), %xmm0 + mulps %xmm3, %xmm5 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + mulps %xmm3, %xmm6 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(%esi, LDC, 2), %xmm2 + mulps %xmm3, %xmm7 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 0 * SIZE(%esi, %eax, 1), %xmm3 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 + addps %xmm2, %xmm6 + addps %xmm3, %xmm7 +#else + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + mulps %xmm3, %xmm6 + mulps %xmm3, %xmm7 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + movlps %xmm5, 0 * SIZE(%esi, LDC, 1) + movlps %xmm6, 0 * SIZE(%esi, LDC, 2) + movlps %xmm7, 0 * SIZE(%esi, %eax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + ALIGN_4 + +.L30: + testl $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB + leal (BB, %eax, 8), BB +#endif + + movss 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movss 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movss 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 8 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 1 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm3 + addss %xmm3, %xmm4 + movss 20 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm5 + movss 24 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + movss 36 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 40 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 44 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 3 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm3 + addss %xmm3, %xmm4 + movss 52 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm5 + movss 56 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + mulss 60 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + + mulss %xmm1, %xmm2 + addss %xmm2, %xmm4 + movss 68 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + addss %xmm2, %xmm5 + movss 72 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + mulss 76 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 96 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 5 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm3 + addss %xmm3, %xmm4 + movss 84 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + addss %xmm3, %xmm5 + movss 88 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + mulss 92 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 112 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm2 + addss %xmm2, %xmm4 + movss 100 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + addss %xmm2, %xmm5 + movss 104 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + mulss 108 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 128 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 7 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm3 + addss %xmm3, %xmm4 + movss 116 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + addss %xmm3, %xmm5 + movss 120 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + mulss 124 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 144 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 8 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 16 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 1 * SIZE(AA), %xmm0 + + addl $ 1 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + mulss %xmm3, %xmm4 + movss 0 * SIZE(%esi), %xmm0 + mulss %xmm3, %xmm5 + movss 0 * SIZE(%esi, LDC, 1), %xmm1 + mulss %xmm3, %xmm6 + movss 0 * SIZE(%esi, LDC, 2), %xmm2 + mulss %xmm3, %xmm7 + movss 0 * SIZE(%esi, %eax, 1), %xmm3 + + addss %xmm0, %xmm4 + addss %xmm1, %xmm5 + addss %xmm2, %xmm6 + addss %xmm3, %xmm7 +#else + mulss %xmm3, %xmm4 + mulss %xmm3, %xmm5 + mulss %xmm3, %xmm6 + mulss %xmm3, %xmm7 +#endif + + movss %xmm4, 0 * SIZE(%esi) + movss %xmm5, 0 * SIZE(%esi, LDC, 1) + movss %xmm6, 0 * SIZE(%esi, LDC, 2) + movss %xmm7, 0 * SIZE(%esi, %eax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leal (, LDC, 4), %eax + addl %eax, C # c += 4 * ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L40: + testl $2, N + je .L80 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + leal BUFFER, %ecx + sarl $2, %eax + jle .L45 + ALIGN_4 + +.L42: + prefetchnta 80 * SIZE(%edi) + +#if defined(OPTERON) || defined(BARCELONA) + prefetchw 112 * SIZE(%ecx) + prefetchw 120 * SIZE(%ecx) +#endif + +#ifdef PENTIUM4 + prefetcht1 112 * SIZE(%ecx) +#endif + +#ifdef HAVE_SSE2 + movss 0 * SIZE(%edi), %xmm0 + movss 1 * SIZE(%edi), %xmm1 + movss 2 * SIZE(%edi), %xmm2 + movss 3 * SIZE(%edi), %xmm3 + movss 4 * SIZE(%edi), %xmm4 + movss 5 * SIZE(%edi), %xmm5 + movss 6 * SIZE(%edi), %xmm6 + movss 7 * SIZE(%edi), %xmm7 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) +#else + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + movd 4 * SIZE(%edi), %mm4 + movd 5 * SIZE(%edi), %mm5 + movd 6 * SIZE(%edi), %mm6 + movd 7 * SIZE(%edi), %mm7 + + movd %mm0, 0 * SIZE(%ecx) + movd %mm0, 1 * SIZE(%ecx) + movd %mm0, 2 * SIZE(%ecx) + movd %mm0, 3 * SIZE(%ecx) + movd %mm1, 4 * SIZE(%ecx) + movd %mm1, 5 * SIZE(%ecx) + movd %mm1, 6 * SIZE(%ecx) + movd %mm1, 7 * SIZE(%ecx) + movd %mm2, 8 * SIZE(%ecx) + movd %mm2, 9 * SIZE(%ecx) + movd %mm2, 10 * SIZE(%ecx) + movd %mm2, 11 * SIZE(%ecx) + movd %mm3, 12 * SIZE(%ecx) + movd %mm3, 13 * SIZE(%ecx) + movd %mm3, 14 * SIZE(%ecx) + movd %mm3, 15 * SIZE(%ecx) + movd %mm4, 16 * SIZE(%ecx) + movd %mm4, 17 * SIZE(%ecx) + movd %mm4, 18 * SIZE(%ecx) + movd %mm4, 19 * SIZE(%ecx) + movd %mm5, 20 * SIZE(%ecx) + movd %mm5, 21 * SIZE(%ecx) + movd %mm5, 22 * SIZE(%ecx) + movd %mm5, 23 * SIZE(%ecx) + movd %mm6, 24 * SIZE(%ecx) + movd %mm6, 25 * SIZE(%ecx) + movd %mm6, 26 * SIZE(%ecx) + movd %mm6, 27 * SIZE(%ecx) + movd %mm7, 28 * SIZE(%ecx) + movd %mm7, 29 * SIZE(%ecx) + movd %mm7, 30 * SIZE(%ecx) + movd %mm7, 31 * SIZE(%ecx) +#endif + addl $ 8 * SIZE, %edi + addl $32 * SIZE, %ecx + decl %eax + jne .L42 + ALIGN_4 + +.L45: + movl K, %eax + andl $3, %eax + BRANCH + jle .L50 + ALIGN_4 + +.L46: + +#ifdef HAVE_SSE2 + movss 0 * SIZE(%edi), %xmm0 + movss 1 * SIZE(%edi), %xmm1 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) +#else + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + + movd %mm0, 0 * SIZE(%ecx) + movd %mm0, 1 * SIZE(%ecx) + movd %mm0, 2 * SIZE(%ecx) + movd %mm0, 3 * SIZE(%ecx) + movd %mm1, 4 * SIZE(%ecx) + movd %mm1, 5 * SIZE(%ecx) + movd %mm1, 6 * SIZE(%ecx) + movd %mm1, 7 * SIZE(%ecx) +#endif + addl $2 * SIZE, %edi + addl $8 * SIZE, %ecx + decl %eax + jne .L46 + ALIGN_4 + +.L50: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#ifdef HAVE_3DNOW + prefetchw 4 * SIZE(%esi) + prefetchw 4 * SIZE(%esi, LDC) +#elif defined(HAVE_SSE) || defined(HAVE_SSE2) + prefetcht2 4 * SIZE(%esi) + prefetcht2 4 * SIZE(%esi, LDC) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm3 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 32 * SIZE(AA), %xmm0 + +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + mulps %xmm1, %xmm2 + mulps 36 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 40 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 20 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm2 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm3 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + movhps 2 * SIZE(%esi, LDC, 1), %xmm1 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + movlps %xmm5, 0 * SIZE(%esi, LDC, 1) + movhps %xmm5, 2 * SIZE(%esi, LDC, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L51 + ALIGN_4 + +.L60: + testl $2, M + je .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 8 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + +#ifndef TRMMKERNEL +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(%esi), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + movlps %xmm5, 0 * SIZE(%esi, LDC, 1) + addl $2 * SIZE, %esi # coffset += 2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + ALIGN_4 + +.L70: + testl $1, M + je .L79 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movss 0 * SIZE(AA), %xmm0 + movss 4 * SIZE(AA), %xmm1 + movss 0 * SIZE(BB), %xmm2 + movss 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulss %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 20 * SIZE(BB), %xmm0 + addss %xmm3, %xmm4 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm5 + movss 3 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + mulss 36 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 40 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 5 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm2 + mulss 44 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 52 * SIZE(BB), %xmm1 + addss %xmm3, %xmm4 + movss 56 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 7 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 60 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + + addl $ 1 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addss %xmm6, %xmm4 + addss %xmm7, %xmm5 + + mulss %xmm3, %xmm4 + mulss %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movss 0 * SIZE(%esi), %xmm0 + movss 0 * SIZE(%esi, LDC, 1), %xmm1 + + addss %xmm0, %xmm4 + addss %xmm1, %xmm5 +#endif + + movss %xmm4, 0 * SIZE(%esi) + movss %xmm5, 0 * SIZE(%esi, LDC, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leal (, LDC, 2), %eax + addl %eax, C + ALIGN_4 + +.L80: + testl $1, N + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + leal BUFFER, %ecx + sarl $3, %eax + jle .L85 + ALIGN_4 + +.L82: + prefetchnta 80 * SIZE(%edi) + +#if defined(OPTERON) || defined(BARCELONA) + prefetchw 112 * SIZE(%ecx) + prefetchw 120 * SIZE(%ecx) +#endif + +#ifdef PENTIUM4 + prefetcht1 112 * SIZE(%ecx) +#endif + +#ifdef HAVE_SSE2 + movss 0 * SIZE(%edi), %xmm0 + movss 1 * SIZE(%edi), %xmm1 + movss 2 * SIZE(%edi), %xmm2 + movss 3 * SIZE(%edi), %xmm3 + movss 4 * SIZE(%edi), %xmm4 + movss 5 * SIZE(%edi), %xmm5 + movss 6 * SIZE(%edi), %xmm6 + movss 7 * SIZE(%edi), %xmm7 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) +#else + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + movd 4 * SIZE(%edi), %mm4 + movd 5 * SIZE(%edi), %mm5 + movd 6 * SIZE(%edi), %mm6 + movd 7 * SIZE(%edi), %mm7 + + movd %mm0, 0 * SIZE(%ecx) + movd %mm0, 1 * SIZE(%ecx) + movd %mm0, 2 * SIZE(%ecx) + movd %mm0, 3 * SIZE(%ecx) + movd %mm1, 4 * SIZE(%ecx) + movd %mm1, 5 * SIZE(%ecx) + movd %mm1, 6 * SIZE(%ecx) + movd %mm1, 7 * SIZE(%ecx) + movd %mm2, 8 * SIZE(%ecx) + movd %mm2, 9 * SIZE(%ecx) + movd %mm2, 10 * SIZE(%ecx) + movd %mm2, 11 * SIZE(%ecx) + movd %mm3, 12 * SIZE(%ecx) + movd %mm3, 13 * SIZE(%ecx) + movd %mm3, 14 * SIZE(%ecx) + movd %mm3, 15 * SIZE(%ecx) + movd %mm4, 16 * SIZE(%ecx) + movd %mm4, 17 * SIZE(%ecx) + movd %mm4, 18 * SIZE(%ecx) + movd %mm4, 19 * SIZE(%ecx) + movd %mm5, 20 * SIZE(%ecx) + movd %mm5, 21 * SIZE(%ecx) + movd %mm5, 22 * SIZE(%ecx) + movd %mm5, 23 * SIZE(%ecx) + movd %mm6, 24 * SIZE(%ecx) + movd %mm6, 25 * SIZE(%ecx) + movd %mm6, 26 * SIZE(%ecx) + movd %mm6, 27 * SIZE(%ecx) + movd %mm7, 28 * SIZE(%ecx) + movd %mm7, 29 * SIZE(%ecx) + movd %mm7, 30 * SIZE(%ecx) + movd %mm7, 31 * SIZE(%ecx) +#endif + addl $ 8 * SIZE, %edi + addl $32 * SIZE, %ecx + decl %eax + jne .L82 + ALIGN_4 + +.L85: + movl K, %eax + andl $7, %eax + BRANCH + jle .L90 + ALIGN_4 + +.L86: + +#ifdef HAVE_SSE2 + movss 0 * SIZE(%edi), %xmm0 + shufps $0, %xmm0, %xmm0 + movaps %xmm0, 0 * SIZE(%ecx) +#else + movd 0 * SIZE(%edi), %mm0 + movd %mm0, 0 * SIZE(%ecx) + movd %mm0, 1 * SIZE(%ecx) + movd %mm0, 2 * SIZE(%ecx) + movd %mm0, 3 * SIZE(%ecx) +#endif + addl $1 * SIZE, %edi + addl $4 * SIZE, %ecx + decl %eax + jne .L86 + ALIGN_4 + +.L90: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#ifdef HAVE_3DNOW + prefetchw 4 * SIZE(%esi) +#elif defined(HAVE_SSE) || defined(HAVE_SSE2) + prefetcht2 4 * SIZE(%esi) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + mulps 8 * SIZE(BB), %xmm0 + addps %xmm0, %xmm6 + movaps 12 * SIZE(AA), %xmm0 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 48 * SIZE(BB), %xmm3 + mulps 20 * SIZE(BB), %xmm1 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + mulps 24 * SIZE(BB), %xmm1 + addps %xmm1, %xmm6 + movaps 28 * SIZE(AA), %xmm1 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(AA), %xmm0 + movaps 4 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + mulps %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L91 + ALIGN_4 + +.L100: + testl $2, M + je .L110 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 8 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L105 + ALIGN_4 + +.L102: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L105: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L106 + ALIGN_4 + +.L108: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + + mulps %xmm3, %xmm4 + +#ifndef TRMMKERNEL +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(%esi), %xmm0 + + addps %xmm0, %xmm4 +#endif + movlps %xmm4, 0 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + ALIGN_4 + +.L110: + testl $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movss 0 * SIZE(AA), %xmm0 + movss 4 * SIZE(AA), %xmm1 + movss 0 * SIZE(BB), %xmm2 + movss 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L115 + ALIGN_4 + +.L112: + mulss %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 32 * SIZE(BB), %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm0, %xmm5 + movss 2 * SIZE(AA), %xmm0 + mulss 8 * SIZE(BB), %xmm0 + addss %xmm0, %xmm6 + movss 3 * SIZE(AA), %xmm0 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + movss 48 * SIZE(BB), %xmm3 + mulss 20 * SIZE(BB), %xmm1 + addss %xmm1, %xmm5 + movss 6 * SIZE(AA), %xmm1 + mulss 24 * SIZE(BB), %xmm1 + addss %xmm1, %xmm6 + movss 7 * SIZE(AA), %xmm1 + mulss 28 * SIZE(BB), %xmm1 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 4 * SIZE(BB), %xmm2 + + addl $ 1 * SIZE, AA + addl $ 4 * SIZE, BB + decl %eax + jg .L116 + ALIGN_4 + +.L118: + addss %xmm5, %xmm4 + addss %xmm7, %xmm6 + addss %xmm6, %xmm4 + mulss %xmm3, %xmm4 +#ifndef TRMMKERNEL + movss 0 * SIZE(%esi), %xmm0 + addss %xmm0, %xmm4 +#endif + movss %xmm4, 0 * SIZE(%esi) + ALIGN_4 + +.L999: + EMMS + + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_4x4_sse3.S b/kernel/x86/gemm_kernel_4x4_sse3.S new file mode 100644 index 0000000..78efab6 --- /dev/null +++ b/kernel/x86/gemm_kernel_4x4_sse3.S @@ -0,0 +1,2090 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 + +#define OLD_M 4 + STACK(%esi) +#define OLD_N 8 + STACK(%esi) +#define OLD_K 12 + STACK(%esi) +#define OLD_ALPHA 16 + STACK(%esi) +#define OLD_A 20 + STACK(%esi) +#define OLD_B 24 + STACK(%esi) +#define OLD_C 28 + STACK(%esi) +#define OLD_LDC 32 + STACK(%esi) +#define STACK_OFFT 36 + STACK(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define BUFFER 128(%esp) + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#ifdef PENTIUMM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA); \ + addps %xmm2, %xmm4; \ + movshdup 0 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movsldup 4 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm6; \ + movshdup 4 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + movaps 4 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ + addps %xmm2, %xmm7; \ + movsldup 8 * SIZE + 2 * (address) * SIZE(BB), %xmm2 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movshdup 8 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movsldup 12 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm6; \ + movshdup 12 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + movaps 8 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ + addps %xmm2, %xmm7; \ + movsldup 32 * SIZE + 2 * (address) * SIZE(BB), %xmm2 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movshdup 16 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movsldup 20 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm6; \ + movshdup 20 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + movaps 12 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ + addps %xmm3, %xmm7; \ + movsldup 24 * SIZE + 2 * (address) * SIZE(BB), %xmm3 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movshdup 24 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movsldup 28 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm6; \ + movshdup 28 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + movaps 32 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ + addps %xmm3, %xmm7; \ + movsldup 48 * SIZE + 2 * (address) * SIZE(BB), %xmm3 + +#define KERNEL5(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movshdup 32 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movsldup 36 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm6; \ + movshdup 36 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + movaps 20 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ + addps %xmm2, %xmm7 + +#define KERNEL6(address) \ + movsldup 40 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movshdup 40 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movsldup 44 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm6; \ + movshdup 44 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + movaps 24 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ + addps %xmm2, %xmm7; \ + movsldup 64 * SIZE + 2 * (address) * SIZE(BB), %xmm2 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movshdup 48 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movsldup 52 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm6; \ + movshdup 52 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + movaps 28 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ + addps %xmm3, %xmm7; \ + movsldup 56 * SIZE + 2 * (address) * SIZE(BB), %xmm3 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movshdup 56 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movsldup 60 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm6; \ + movshdup 60 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + movaps 48 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ + addps %xmm3, %xmm7; \ + movsldup 80 * SIZE + 2 * (address) * SIZE(BB), %xmm3 + + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + subl $128 + LOCAL_BUFFER_SIZE, %esp + movl OLD_M, %ebx + andl $-1024, %esp # align stack + + STACK_TOUCHING + + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + movss OLD_ALPHA, %xmm3 +#ifdef TRMMKERNEL + movss STACK_OFFT, %xmm4 +#endif + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + shufps $0, %xmm3, %xmm3 + movl OLD_B, %edi + movl OLD_C, %ebx + movaps %xmm3, ALPHA + + movl %ebx, C + movl OLD_LDC, LDC +#ifdef TRMMKERNEL + movss %xmm4, OFFSET + movss %xmm4, KK +#ifndef LEFT + negl KK +#endif +#endif + leal (, LDC, SIZE), LDC + + sarl $2, %eax + movl %eax, J + jle .L40 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + + movl K, %eax + sarl $2, %eax + jle .L05 + ALIGN_4 + +.L02: + movddup 0 * SIZE(%edi), %xmm0 + movddup 2 * SIZE(%edi), %xmm1 + movddup 4 * SIZE(%edi), %xmm2 + movddup 6 * SIZE(%edi), %xmm3 + movddup 8 * SIZE(%edi), %xmm4 + movddup 10 * SIZE(%edi), %xmm5 + movddup 12 * SIZE(%edi), %xmm6 + movddup 14 * SIZE(%edi), %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) + +# prefetcht1 128 * SIZE(%ecx) + prefetcht0 112 * SIZE(%edi) + + addl $16 * SIZE, %edi + addl $32 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L05: + movl K, %eax + andl $3, %eax + BRANCH + jle .L10 + ALIGN_2 + +.L06: + movddup 0 * SIZE(%edi), %xmm0 + movddup 2 * SIZE(%edi), %xmm1 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + + addl $4 * SIZE, %edi + addl $8 * SIZE, %ecx + decl %eax + jne .L06 + ALIGN_4 + +.L10: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsldup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movsldup 16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + + prefetchnta 4 * SIZE(%esi) + prefetchnta 4 * SIZE(%esi, LDC) + prefetchnta 4 * SIZE(%esi, LDC, 2) + prefetchnta 4 * SIZE(%esi, %eax) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + +#if 1 + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) +#if 1 + cmpl $128 * 8, %eax + jle .L12 + KERNEL1(32 * 8) + KERNEL2(32 * 8) + KERNEL3(32 * 8) + KERNEL4(32 * 8) + KERNEL5(32 * 8) + KERNEL6(32 * 8) + KERNEL7(32 * 8) + KERNEL8(32 * 8) + cmpl $128 * 9, %eax + jle .L12 + KERNEL1(32 * 9) + KERNEL2(32 * 9) + KERNEL3(32 * 9) + KERNEL4(32 * 9) + KERNEL5(32 * 9) + KERNEL6(32 * 9) + KERNEL7(32 * 9) + KERNEL8(32 * 9) + cmpl $128 * 10, %eax + jle .L12 + KERNEL1(32 * 10) + KERNEL2(32 * 10) + KERNEL3(32 * 10) + KERNEL4(32 * 10) + KERNEL5(32 * 10) + KERNEL6(32 * 10) + KERNEL7(32 * 10) + KERNEL8(32 * 10) + cmpl $128 * 11, %eax + jle .L12 + KERNEL1(32 * 11) + KERNEL2(32 * 11) + KERNEL3(32 * 11) + KERNEL4(32 * 11) + KERNEL5(32 * 11) + KERNEL6(32 * 11) + KERNEL7(32 * 11) + KERNEL8(32 * 11) + cmpl $128 * 12, %eax + jle .L12 + KERNEL1(32 * 12) + KERNEL2(32 * 12) + KERNEL3(32 * 12) + KERNEL4(32 * 12) + KERNEL5(32 * 12) + KERNEL6(32 * 12) + KERNEL7(32 * 12) + KERNEL8(32 * 12) + cmpl $128 * 13, %eax + jle .L12 + KERNEL1(32 * 13) + KERNEL2(32 * 13) + KERNEL3(32 * 13) + KERNEL4(32 * 13) + KERNEL5(32 * 13) + KERNEL6(32 * 13) + KERNEL7(32 * 13) + KERNEL8(32 * 13) + cmpl $128 * 14, %eax + jle .L12 + KERNEL1(32 * 14) + KERNEL2(32 * 14) + KERNEL3(32 * 14) + KERNEL4(32 * 14) + KERNEL5(32 * 14) + KERNEL6(32 * 14) + KERNEL7(32 * 14) + KERNEL8(32 * 14) + cmpl $128 * 15, %eax + jle .L12 + KERNEL1(32 * 15) + KERNEL2(32 * 15) + KERNEL3(32 * 15) + KERNEL4(32 * 15) + KERNEL5(32 * 15) + KERNEL6(32 * 15) + KERNEL7(32 * 15) + KERNEL8(32 * 15) +#else + addl $128 * 4 * SIZE, BB + addl $128 * 2 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 +#endif + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB + ALIGN_4 +#else + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L12 + ALIGN_4 +#endif + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 0 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movsldup 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movshdup 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsldup 8 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + shufps $0xe4, %xmm0, %xmm0 + shufps $0xe4, %xmm1, %xmm1 + shufps $0xe4, %xmm2, %xmm2 + shufps $0xe4, %xmm3, %xmm3 + + mulps %xmm3, %xmm4 + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + mulps %xmm3, %xmm5 + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + movhps 2 * SIZE(%esi, LDC, 1), %xmm1 + mulps %xmm3, %xmm6 + movsd 0 * SIZE(%esi, LDC, 2), %xmm2 + movhps 2 * SIZE(%esi, LDC, 2), %xmm2 + mulps %xmm3, %xmm7 + movsd 0 * SIZE(%esi, %eax, 1), %xmm3 + movhps 2 * SIZE(%esi, %eax, 1), %xmm3 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 + addps %xmm2, %xmm6 + addps %xmm3, %xmm7 +#else + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + mulps %xmm3, %xmm6 + mulps %xmm3, %xmm7 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + movsd %xmm5, 0 * SIZE(%esi, LDC, 1) + movhps %xmm5, 2 * SIZE(%esi, LDC, 1) + movsd %xmm6, 0 * SIZE(%esi, LDC, 2) + movhps %xmm6, 2 * SIZE(%esi, LDC, 2) + movsd %xmm7, 0 * SIZE(%esi, %eax, 1) + movhps %xmm7, 2 * SIZE(%esi, %eax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + testl $2, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 12 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 32 * SIZE(BB), %xmm2 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm0, %xmm3 + movddup 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 28 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm0, %xmm3 + movddup 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movsd 48 * SIZE(BB), %xmm3 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 36 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm1, %xmm2 + movddup 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movsd 40 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 44 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm1, %xmm2 + movddup 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movsd 64 * SIZE(BB), %xmm2 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 52 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 56 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 60 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + leal (LDC, LDC, 2), %eax + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 0 * SIZE(%esi, LDC, 1), %xmm0 + movsd 0 * SIZE(%esi, LDC, 2), %xmm1 + movhps 0 * SIZE(%esi, %eax, 1), %xmm1 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 0 * SIZE(%esi, LDC, 1) + movsd %xmm5, 0 * SIZE(%esi, LDC, 2) + movhps %xmm5, 0 * SIZE(%esi, %eax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + ALIGN_4 + +.L30: + testl $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + + movss 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movss 4 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + shufps $0, %xmm0, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + movhps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 8 * SIZE(BB), %xmm2 + shufps $0, %xmm0, %xmm0 + movhps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movss 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movhps 20 * SIZE(BB), %xmm3 + shufps $0, %xmm0, %xmm0 + movsd 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + movss 3 * SIZE(AA), %xmm0 + addps %xmm3, %xmm4 + movsd 24 * SIZE(BB), %xmm3 + shufps $0, %xmm0, %xmm0 + movhps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movss 8 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movsd 48 * SIZE(BB), %xmm3 + shufps $0, %xmm1, %xmm1 + movhps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movss 5 * SIZE(AA), %xmm1 + addps %xmm2, %xmm4 + movsd 40 * SIZE(BB), %xmm2 + shufps $0, %xmm1, %xmm1 + movhps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movss 6 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movsd 64 * SIZE(BB), %xmm2 + shufps $0, %xmm1, %xmm1 + movhps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movss 7 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 56 * SIZE(BB), %xmm3 + shufps $0, %xmm1, %xmm1 + movhps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movss 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 80 * SIZE(BB), %xmm3 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm7 + andl $7, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + shufps $0, %xmm0, %xmm0 + movhps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 8 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + leal (LDC, LDC, 2), %eax + + addps %xmm5, %xmm4 + mulps %xmm7, %xmm4 + + movhlps %xmm4, %xmm5 + +#ifndef TRMMKERNEL + movss 0 * SIZE(%esi), %xmm0 + movss 0 * SIZE(%esi, LDC, 1), %xmm1 + movss 0 * SIZE(%esi, LDC, 2), %xmm2 + movss 0 * SIZE(%esi, %eax, 1), %xmm3 + + addss %xmm4, %xmm0 + psrlq $32, %xmm4 + addss %xmm4, %xmm1 + addss %xmm5, %xmm2 + psrlq $32, %xmm5 + addss %xmm5, %xmm3 + + movss %xmm0, 0 * SIZE(%esi) + movss %xmm1, 0 * SIZE(%esi, LDC, 1) + movss %xmm2, 0 * SIZE(%esi, LDC, 2) + movss %xmm3 , 0 * SIZE(%esi, %eax, 1) +#else + movss %xmm4, 0 * SIZE(%esi) + psrlq $32, %xmm4 + movss %xmm4, 0 * SIZE(%esi, LDC, 1) + + movss %xmm5, 0 * SIZE(%esi, LDC, 2) + psrlq $32, %xmm5 + movss %xmm5 , 0 * SIZE(%esi, %eax, 1) +#endif + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leal (, LDC, 4), %eax + addl %eax, C # c += 4 * ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L40: + testl $2, N + je .L80 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + leal BUFFER, %ecx + sarl $3, %eax + jle .L45 + ALIGN_4 + +.L42: + movddup 0 * SIZE(%edi), %xmm0 + movddup 2 * SIZE(%edi), %xmm1 + movddup 4 * SIZE(%edi), %xmm2 + movddup 6 * SIZE(%edi), %xmm3 + movddup 8 * SIZE(%edi), %xmm4 + movddup 10 * SIZE(%edi), %xmm5 + movddup 12 * SIZE(%edi), %xmm6 + movddup 14 * SIZE(%edi), %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) + +# prefetcht1 128 * SIZE(%ecx) + prefetcht0 112 * SIZE(%edi) + + addl $16 * SIZE, %edi + addl $32 * SIZE, %ecx + decl %eax + jne .L42 + ALIGN_4 + +.L45: + movl K, %eax + andl $7, %eax + BRANCH + jle .L50 + ALIGN_4 + +.L46: + movddup 0 * SIZE(%edi), %xmm0 + movaps %xmm0, 0 * SIZE(%ecx) + + addl $2 * SIZE, %edi + addl $4 * SIZE, %ecx + decl %eax + jne .L46 + ALIGN_4 + +.L50: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsldup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movsldup 16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + prefetcht2 4 * SIZE(%esi) + prefetcht2 4 * SIZE(%esi, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + movshdup 0 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsldup 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsldup 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 12 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsldup 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsldup 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movshdup 16 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsldup 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movshdup 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsldup 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movshdup 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 28 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsldup 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movshdup 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 48 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsldup 48 * SIZE(BB), %xmm3 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 0 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsldup 4 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + movhps 2 * SIZE(%esi, LDC, 1), %xmm1 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 +#else + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + movsd %xmm5, 0 * SIZE(%esi, LDC, 1) + movhps %xmm5, 2 * SIZE(%esi, LDC, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L51 + ALIGN_4 + +.L60: + testl $2, M + je .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + movddup 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 12 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 32 * SIZE(BB), %xmm2 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 28 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 48 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + addps %xmm5, %xmm4 + mulps %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 0 * SIZE(%esi, LDC, 1), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 0 * SIZE(%esi, LDC, 1) + addl $2 * SIZE, %esi # coffset += 2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + ALIGN_4 + +.L70: + testl $1, M + je .L79 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + movss 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movss 4 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + movss 1 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + shufps $0, %xmm0, %xmm0 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movss 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + shufps $0, %xmm0, %xmm0 + movsd 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movss 3 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + shufps $0, %xmm0, %xmm0 + movsd 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movss 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 32 * SIZE(BB), %xmm2 + shufps $0, %xmm1, %xmm1 + mulps %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + shufps $0, %xmm1, %xmm1 + movsd 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movss 6 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + shufps $0, %xmm1, %xmm1 + movsd 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movss 7 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + shufps $0, %xmm1, %xmm1 + movsd 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movss 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 48 * SIZE(BB), %xmm3 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + + addl $ 1 * SIZE, AA + addl $ 4 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm5, %xmm4 + mulps %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movss 0 * SIZE(%esi), %xmm0 + movss 0 * SIZE(%esi, LDC, 1), %xmm1 + + addss %xmm4, %xmm0 + psrlq $32, %xmm4 + addss %xmm4, %xmm1 + + movss %xmm0, 0 * SIZE(%esi) + movss %xmm1, 0 * SIZE(%esi, LDC, 1) +#else + movss %xmm4, 0 * SIZE(%esi) + psrlq $32, %xmm4 + movss %xmm4, 0 * SIZE(%esi, LDC, 1) +#endif + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leal (, LDC, 2), %eax + addl %eax, C + ALIGN_4 + +.L80: + testl $1, N + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + leal BUFFER, %ecx + sarl $3, %eax + jle .L85 + ALIGN_4 + +.L82: + movss 0 * SIZE(%edi), %xmm0 + movss 1 * SIZE(%edi), %xmm1 + movss 2 * SIZE(%edi), %xmm2 + movss 3 * SIZE(%edi), %xmm3 + movss 4 * SIZE(%edi), %xmm4 + movss 5 * SIZE(%edi), %xmm5 + movss 6 * SIZE(%edi), %xmm6 + movss 7 * SIZE(%edi), %xmm7 + + movss %xmm0, 0 * SIZE(%ecx) + movss %xmm0, 1 * SIZE(%ecx) + movss %xmm1, 2 * SIZE(%ecx) + movss %xmm1, 3 * SIZE(%ecx) + movss %xmm2, 4 * SIZE(%ecx) + movss %xmm2, 5 * SIZE(%ecx) + movss %xmm3, 6 * SIZE(%ecx) + movss %xmm3, 7 * SIZE(%ecx) + movss %xmm4, 8 * SIZE(%ecx) + movss %xmm4, 9 * SIZE(%ecx) + movss %xmm5, 10 * SIZE(%ecx) + movss %xmm5, 11 * SIZE(%ecx) + movss %xmm6, 12 * SIZE(%ecx) + movss %xmm6, 13 * SIZE(%ecx) + movss %xmm7, 14 * SIZE(%ecx) + movss %xmm7, 15 * SIZE(%ecx) + +# prefetcht1 128 * SIZE(%ecx) + prefetcht0 112 * SIZE(%edi) + + addl $ 8 * SIZE, %edi + addl $16 * SIZE, %ecx + decl %eax + jne .L82 + ALIGN_4 + +.L85: + movl K, %eax + andl $7, %eax + BRANCH + jle .L90 + ALIGN_4 + +.L86: + movss 0 * SIZE(%edi), %xmm0 + movss %xmm0, 0 * SIZE(%ecx) + movss %xmm0, 1 * SIZE(%ecx) + + addl $1 * SIZE, %edi + addl $2 * SIZE, %ecx + decl %eax + jne .L86 + ALIGN_4 + +.L90: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 1), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + movaps 16 * SIZE(AA), %xmm1 + movddup 8 * SIZE(BB), %xmm3 + +#ifdef HAVE_3DNOW + prefetchw 4 * SIZE(%esi) +#elif defined(HAVE_SSE) || defined(HAVE_SSE2) + prefetcht2 4 * SIZE(%esi) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm2, %xmm4 + movddup 2 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movddup 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 12 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movddup 6 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movddup 16 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movddup 10 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movddup 12 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 28 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movddup 14 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 48 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movddup 24 * SIZE(BB), %xmm3 + + addl $32 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movddup 2 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: + addps %xmm5, %xmm4 + mulps %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 1), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L91 + ALIGN_4 + +.L100: + testl $2, M + je .L110 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 1), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movsd 0 * SIZE(AA), %xmm0 + movsd 0 * SIZE(BB), %xmm2 + movsd 8 * SIZE(AA), %xmm1 + movsd 8 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L105 + ALIGN_4 + +.L102: + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm2, %xmm4 + movsd 2 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 6 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 16 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 10 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 12 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 14 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L105: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L106 + ALIGN_4 + +.L108: + addps %xmm5, %xmm4 + movhlps %xmm4, %xmm5 + addps %xmm5, %xmm4 + + mulps %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + + addps %xmm0, %xmm4 +#endif + movsd %xmm4, 0 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 1), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + ALIGN_4 + +.L110: + testl $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + + movss 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movss 0 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + movss 4 * SIZE(AA), %xmm1 + movss 8 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L115 + ALIGN_4 + +.L112: + mulss %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 2 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + movss 2 * SIZE(AA), %xmm0 + addss %xmm2, %xmm5 + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + movss 3 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 6 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + movss 8 * SIZE(AA), %xmm0 + addss %xmm2, %xmm5 + movss 16 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + movss 10 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + movss 6 * SIZE(AA), %xmm1 + addss %xmm3, %xmm5 + movss 12 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + movss 7 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + movss 14 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + movss 12 * SIZE(AA), %xmm1 + addss %xmm3, %xmm5 + movss 24 * SIZE(BB), %xmm3 + + addl $ 8 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 2 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L116 + ALIGN_4 + +.L118: + addss %xmm5, %xmm4 + mulss %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movss 0 * SIZE(%esi), %xmm0 + addss %xmm0, %xmm4 +#else + mulss %xmm3, %xmm4 +#endif + movss %xmm4, 0 * SIZE(%esi) + ALIGN_4 + +.L999: + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_8x1_sse2.S b/kernel/x86/gemm_kernel_8x1_sse2.S new file mode 100644 index 0000000..52a9ebc --- /dev/null +++ b/kernel/x86/gemm_kernel_8x1_sse2.S @@ -0,0 +1,878 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE2) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA 16 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 36(%esp) +#define J 44(%esp) +#define OLD_STACK 48(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define AA %edx +#define BB %ecx + +#define KERNELMACRO(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 20 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 32 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 28 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 40 * SIZE + (address) * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movq STACK_ALPHA, %mm7 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC + + movq %mm7, 0 * SIZE + ALPHA + movq %mm7, 1 * SIZE + ALPHA + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK + + leal (, LDC, SIZE), LDC + + test %eax, %eax + movl %eax, J + jle .L999 + ALIGN_2 + +.L01: +/* Copying to Sub Buffer */ + movl K, %eax + leal BUFFER, %ecx + sarl $3, %eax + jle .L03 + ALIGN_4 + +.L02: + prefetchnta 96 * SIZE(B) + + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + BRANCH + jne .L02 + ALIGN_2 + +.L03: + movl K, %eax + andl $7, %eax + BRANCH + jle .L05 + ALIGN_2 + +.L04: + movsd 0 * SIZE(B), %xmm0 + unpcklpd %xmm0, %xmm0 + movapd %xmm0, 0 * SIZE(%ecx) + + addl $1 * SIZE, B + addl $2 * SIZE, %ecx + decl %eax + jne .L04 + ALIGN_4 + +.L05: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $3, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L10: + leal BUFFER, %ecx # boffset1 = boffset // different point + movl K, %eax + + movapd 0 * SIZE + BUFFER, %xmm2 + movapd 0 * SIZE(%edx), %xmm0 + movapd 8 * SIZE + BUFFER, %xmm3 + movapd 8 * SIZE(%edx), %xmm1 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if 0 + andl $-8, %eax + leal (, %eax, 8), %eax + je .L12 + + KERNELMACRO(32 * 0) # 0 + cmpl $64 * 1, %eax + jle .L11 + KERNELMACRO(32 * 1) # 1 + cmpl $64 * 2, %eax + jle .L11 + KERNELMACRO(32 * 2) # 2 + cmpl $64 * 3, %eax + jle .L11 + KERNELMACRO(32 * 3) # 3 + cmpl $64 * 4, %eax + jle .L11 + KERNELMACRO(32 * 4) # 4 + cmpl $64 * 5, %eax + jle .L11 + KERNELMACRO(32 * 5) # 5 + cmpl $64 * 6, %eax + jle .L11 + KERNELMACRO(32 * 6) # 6 + cmpl $64 * 7, %eax + jle .L11 + KERNELMACRO(32 * 7) # 7 + cmpl $64 * 8, %eax + jle .L11 + KERNELMACRO(32 * 8) # 8 + cmpl $64 * 9, %eax + jle .L11 + KERNELMACRO(32 * 9) # 9 + cmpl $64 * 10, %eax + jle .L11 + KERNELMACRO(32 * 10) # 10 + cmpl $64 * 11, %eax + jle .L11 + KERNELMACRO(32 * 11) # 11 + cmpl $64 * 12, %eax + jle .L11 + KERNELMACRO(32 * 12) # 12 + cmpl $64 * 13, %eax + jle .L11 + KERNELMACRO(32 * 13) # 13 + cmpl $64 * 14, %eax + jle .L11 + KERNELMACRO(32 * 14) # 14 + cmpl $64 * 15, %eax + jle .L11 + movq 1 * SIZE(%esi), %mm0 + movq 1 * SIZE(%esi, LDC), %mm1 + KERNELMACRO(32 * 15) # 15 +.L11: + leal (%edx, %eax, 4), %edx + leal (%ecx, %eax, 4), %ecx + +#else + movapd 0 * SIZE(BB), %xmm0 + movapd 8 * SIZE(BB), %xmm2 + movapd 0 * SIZE(AA), %xmm1 + movapd 8 * SIZE(AA), %xmm3 + + prefetchnta 8 * SIZE(%esi) + + sarl $3, %eax + je .L12 + +#define PRE 40 + +.L11: + mulpd %xmm0, %xmm1 + movd (PRE + 0) * SIZE(AA), %mm0 + addpd %xmm1, %xmm4 + movapd 2 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm5 + movapd 4 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm6 + movapd 16 * SIZE(AA), %xmm1 + movd (PRE + 8) * SIZE(AA), %mm0 + addpd %xmm0, %xmm7 + movapd 2 * SIZE(BB), %xmm0 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm4 + movapd 10 * SIZE(AA), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm5 + movapd 12 * SIZE(AA), %xmm3 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(AA), %xmm3 + movd (PRE + 16) * SIZE(AA), %mm0 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(BB), %xmm0 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd 18 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm5 + movapd 20 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + mulpd 22 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm6 + movapd 32 * SIZE(AA), %xmm1 + movd (PRE + 24) * SIZE(AA), %mm0 + addpd %xmm0, %xmm7 + movapd 6 * SIZE(BB), %xmm0 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm4 + movapd 26 * SIZE(AA), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm5 + movapd 28 * SIZE(AA), %xmm3 + mulpd %xmm0, %xmm3 + mulpd 30 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(AA), %xmm3 + movd (PRE + 32) * SIZE(AA), %mm0 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(BB), %xmm0 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm4 + movapd 34 * SIZE(AA), %xmm1 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm5 + movapd 36 * SIZE(AA), %xmm1 + mulpd %xmm2, %xmm1 + mulpd 38 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm6 + movapd 48 * SIZE(AA), %xmm1 + movd (PRE + 40) * SIZE(AA), %mm0 + addpd %xmm2, %xmm7 + movapd 10 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm4 + movapd 42 * SIZE(AA), %xmm3 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm5 + movapd 44 * SIZE(AA), %xmm3 + mulpd %xmm2, %xmm3 + mulpd 46 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm6 + movapd 56 * SIZE(AA), %xmm3 + movd (PRE + 48) * SIZE(AA), %mm0 + addpd %xmm2, %xmm7 + movapd 12 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm4 + movapd 50 * SIZE(AA), %xmm1 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm5 + movapd 52 * SIZE(AA), %xmm1 + mulpd %xmm2, %xmm1 + mulpd 54 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm6 + movapd 64 * SIZE(AA), %xmm1 + movd (PRE + 56) * SIZE(AA), %mm0 + addpd %xmm2, %xmm7 + movapd 14 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm4 + movapd 58 * SIZE(AA), %xmm3 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm5 + movapd 60 * SIZE(AA), %xmm3 + mulpd %xmm2, %xmm3 + mulpd 62 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm6 + movapd 72 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm7 + movapd 24 * SIZE(BB), %xmm2 + + addl $64 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L11 +#endif + +.L12: + movapd ALPHA, %xmm3 + movl K, %eax + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + movapd 0 * SIZE(BB), %xmm0 + movapd 0 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd 2 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm5 + movapd 4 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm6 + mulpd 6 * SIZE(AA), %xmm0 + addpd %xmm0, %xmm7 + + addl $8 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L13 + ALIGN_4 + +.L14: + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + mulpd %xmm3, %xmm6 + mulpd %xmm3, %xmm7 + + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhpd 3 * SIZE(%esi), %xmm1 + movsd 4 * SIZE(%esi), %xmm2 + movhpd 5 * SIZE(%esi), %xmm2 + movsd 6 * SIZE(%esi), %xmm3 + movhpd 7 * SIZE(%esi), %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 + addpd %xmm2, %xmm6 + addpd %xmm3, %xmm7 + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + movsd %xmm5, 2 * SIZE(%esi) + movhpd %xmm5, 3 * SIZE(%esi) + movsd %xmm6, 4 * SIZE(%esi) + movhpd %xmm6, 5 * SIZE(%esi) + movsd %xmm7, 6 * SIZE(%esi) + movhpd %xmm7, 7 * SIZE(%esi) + + addl $8 * SIZE, %esi # coffset += 4 + BRANCH + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L20: + movl M, %ebx + testl $4, %ebx + jle .L30 + + leal BUFFER, %ecx + movl K, %eax + + movapd 0 * SIZE + BUFFER, %xmm2 + movapd 0 * SIZE(%edx), %xmm0 + movapd 8 * SIZE + BUFFER, %xmm3 + movapd 8 * SIZE(%edx), %xmm1 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + sarl $3, %eax + je .L22 + +.L21: + movapd 0 * SIZE(BB), %xmm0 + movapd 0 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + mulpd 2 * SIZE(AA), %xmm0 + addpd %xmm0, %xmm5 + + movapd 2 * SIZE(BB), %xmm0 + movapd 4 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + mulpd 6 * SIZE(AA), %xmm0 + addpd %xmm0, %xmm5 + + movapd 4 * SIZE(BB), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + mulpd 10 * SIZE(AA), %xmm0 + addpd %xmm0, %xmm5 + + movapd 6 * SIZE(BB), %xmm0 + movapd 12 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + mulpd 14 * SIZE(AA), %xmm0 + addpd %xmm0, %xmm5 + + movapd 8 * SIZE(BB), %xmm0 + movapd 16 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + mulpd 18 * SIZE(AA), %xmm0 + addpd %xmm0, %xmm5 + + movapd 10 * SIZE(BB), %xmm0 + movapd 20 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + mulpd 22 * SIZE(AA), %xmm0 + addpd %xmm0, %xmm5 + + movapd 12 * SIZE(BB), %xmm0 + movapd 24 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + mulpd 26 * SIZE(AA), %xmm0 + addpd %xmm0, %xmm5 + + movapd 14 * SIZE(BB), %xmm0 + movapd 28 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + mulpd 30 * SIZE(AA), %xmm0 + addpd %xmm0, %xmm5 + + addl $32 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L21 + +.L22: + movapd ALPHA, %xmm3 + movl K, %eax + andl $7, %eax + BRANCH + je .L24 + +.L23: + movapd 0 * SIZE(BB), %xmm0 + movapd 0 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + mulpd 2 * SIZE(AA), %xmm0 + addpd %xmm0, %xmm5 + + addl $4 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L23 + ALIGN_4 + +.L24: + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhpd 3 * SIZE(%esi), %xmm1 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + movsd %xmm5, 2 * SIZE(%esi) + movhpd %xmm5, 3 * SIZE(%esi) + addl $4 * SIZE, %esi # coffset += 4 + ALIGN_4 + +.L30: + movl M, %ebx + testl $2, %ebx + jle .L50 + + leal BUFFER, %ecx + movl K, %eax + + movapd 0 * SIZE + BUFFER, %xmm2 + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE + BUFFER, %xmm3 + movapd 8 * SIZE(AA), %xmm1 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + sarl $3, %eax + je .L32 + +.L31: + movapd 0 * SIZE(BB), %xmm0 + movapd 0 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + + movapd 2 * SIZE(BB), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + + movapd 4 * SIZE(BB), %xmm0 + movapd 4 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + + movapd 6 * SIZE(BB), %xmm0 + movapd 6 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + + movapd 8 * SIZE(BB), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + + movapd 10 * SIZE(BB), %xmm0 + movapd 10 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + + movapd 12 * SIZE(BB), %xmm0 + movapd 12 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + + movapd 14 * SIZE(BB), %xmm0 + movapd 14 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + BRANCH + decl %eax + jne .L31 + +.L32: + movapd ALPHA, %xmm3 + movl K, %eax + andl $7, %eax # if (k & 1) + BRANCH + je .L34 + +.L33: + movapd 0 * SIZE(BB), %xmm0 + movapd 0 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + + addl $2 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L33 + ALIGN_4 + +.L34: + mulpd %xmm3, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + + addpd %xmm0, %xmm4 + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + addl $2 * SIZE, %esi + ALIGN_2 + +.L50: + movl M, %ebx + testl $1, %ebx + jle .L99 + + leal BUFFER, %ecx + movl K, %eax + + movsd 0 * SIZE + BUFFER, %xmm2 + movsd 0 * SIZE(AA), %xmm0 + movsd 8 * SIZE + BUFFER, %xmm3 + movsd 4 * SIZE(AA), %xmm1 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + sarl $3, %eax + je .L52 + +.L51: + movsd 0 * SIZE(AA), %xmm0 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + + movsd 1 * SIZE(AA), %xmm0 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + + movsd 2 * SIZE(AA), %xmm0 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + + movsd 3 * SIZE(AA), %xmm0 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + + movsd 4 * SIZE(AA), %xmm0 + mulsd 8 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + + movsd 5 * SIZE(AA), %xmm0 + mulsd 10 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + + movsd 6 * SIZE(AA), %xmm0 + mulsd 12 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + + movsd 7 * SIZE(AA), %xmm0 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + + addl $ 8 * SIZE, AA + addl $16 * SIZE, BB + BRANCH + decl %eax + jne .L51 + +.L52: + movsd ALPHA, %xmm3 + movl K, %eax + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + movsd 0 * SIZE(AA), %xmm0 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + + addl $1 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L53 + ALIGN_4 + +.L54: + movsd 0 * SIZE(%esi), %xmm0 + mulsd %xmm3, %xmm4 + addsd %xmm0, %xmm4 + movsd %xmm4, 0 * SIZE(%esi) + ALIGN_2 + +.L99: + addl LDC, C + decl J # j -- + jg .L01 + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_2 + + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_8x2_core2.S b/kernel/x86/gemm_kernel_8x2_core2.S new file mode 100644 index 0000000..3fd8c56 --- /dev/null +++ b/kernel/x86/gemm_kernel_8x2_core2.S @@ -0,0 +1,1622 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA 16 + STACK + ARGS(%esi) +#define STACK_A 20 + STACK + ARGS(%esi) +#define STACK_B 24 + STACK + ARGS(%esi) +#define STACK_C 28 + STACK + ARGS(%esi) +#define STACK_LDC 32 + STACK + ARGS(%esi) +#define STACK_OFFT 36 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define BUFFER 512(%esp) + +#define PREFETCH_R (8 * 16 + 0) +#define PREFETCH_W (PREFETCH_R * 2) + +#define PREFETCHSIZE (8 * 16 + 4) +#define PREFETCH prefetcht0 + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define C1 %esi +#define I %ebx + + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $512 + LOCAL_BUFFER_SIZE, %esp + andl $-4096, %esp # align stack + + STACK_TOUCHING + + movl STACK_M, %ebx + movl STACK_N, %eax + movl STACK_K, %ecx + movl STACK_A, %edx + movss STACK_ALPHA, %xmm3 +#ifdef TRMMKERNEL + movd STACK_OFFT, %mm4 +#endif + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK +#ifdef TRMMKERNEL + movd %mm4, OFFSET + movd %mm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + shufps $0, %xmm3, %xmm3 + + movl STACK_B, B + movl STACK_C, %ebx + + movaps %xmm3, ALPHA + movl %ebx, C + movl STACK_LDC, LDC + + subl $-32 * SIZE, A + subl $-32 * SIZE, B + + leal (, LDC, SIZE), LDC + + sarl $1, %eax + movl %eax, J + jle .L50 + ALIGN_4 + +.L01: + leal 32 * SIZE + BUFFER, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sarl $2, %eax + jle .L05 + ALIGN_4 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + movss -32 * SIZE(B), %xmm0 + movss -31 * SIZE(B), %xmm1 + movss -30 * SIZE(B), %xmm2 + movss -29 * SIZE(B), %xmm3 + movss -28 * SIZE(B), %xmm4 + movss -27 * SIZE(B), %xmm5 + movss -26 * SIZE(B), %xmm6 + movss -25 * SIZE(B), %xmm7 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BB) + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + prefetcht0 (PREFETCH_W + 16) * SIZE(BB) + movaps %xmm0, -32 * SIZE(BB) + movaps %xmm1, -28 * SIZE(BB) + movaps %xmm2, -24 * SIZE(BB) + movaps %xmm3, -20 * SIZE(BB) + movaps %xmm4, -16 * SIZE(BB) + movaps %xmm5, -12 * SIZE(BB) + movaps %xmm6, -8 * SIZE(BB) + movaps %xmm7, -4 * SIZE(BB) + + addl $ 8 * SIZE, B + subl $-32 * SIZE, BB + decl %eax + jne .L02 + ALIGN_4 + +.L05: + movl K, %eax + andl $3, %eax + BRANCH + jle .L10 + ALIGN_4 + +.L06: + movss -32 * SIZE(B), %xmm0 + movss -31 * SIZE(B), %xmm1 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + + movaps %xmm0, -32 * SIZE(BB) + movaps %xmm1, -28 * SIZE(BB) + addl $2 * SIZE, B + addl $8 * SIZE, BB + decl %eax + jne .L06 + ALIGN_4 + +.L10: + movl C, C1 + movl A, AA + movl M, I + sarl $3, I + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -16 * SIZE(AA), %xmm3 + pxor %xmm6, %xmm6 + prefetcht0 7 * SIZE(C1) + pxor %xmm7, %xmm7 + prefetcht0 7 * SIZE(C1, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $8, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + addps %xmm0, %xmm5 + movaps -28 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + + movaps -24 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + addps %xmm0, %xmm5 + movaps -20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps 0 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + + movaps -16 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps -12 * SIZE(AA), %xmm3 + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm1 + movaps -8 * SIZE(AA), %xmm3 + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + + movaps -8 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps -4 * SIZE(AA), %xmm3 + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm1 + movaps 16 * SIZE(AA), %xmm3 + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + movaps 0 * SIZE(BB), %xmm1 + + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps 4 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + + movaps 8 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps 12 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + + movaps 16 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm4 + movaps 20 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 20 * SIZE(AA), %xmm3 + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm2, %xmm6 + movaps 24 * SIZE(AA), %xmm3 + addps %xmm1, %xmm7 + + movaps 24 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm4 + movaps 28 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 28 * SIZE(AA), %xmm3 + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm1 + subl $-64 * SIZE, BB + movaps 48 * SIZE(AA), %xmm3 + subl $-64 * SIZE, AA + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + movaps -32 * SIZE(BB), %xmm1 + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + addps %xmm0, %xmm5 + movaps -28 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + movaps -24 * SIZE(BB), %xmm1 + + addl $8 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + movaps ALPHA, %xmm3 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + mulps %xmm3, %xmm6 + mulps %xmm3, %xmm7 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhps 2 * SIZE(C1), %xmm0 + movsd 4 * SIZE(C1), %xmm2 + movhps 6 * SIZE(C1), %xmm2 + + movsd 0 * SIZE(C1, LDC), %xmm1 + movhps 2 * SIZE(C1, LDC), %xmm1 + movsd 4 * SIZE(C1, LDC), %xmm3 + movhps 6 * SIZE(C1, LDC), %xmm3 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 + addps %xmm2, %xmm6 + addps %xmm3, %xmm7 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhps %xmm4, 2 * SIZE(C1) + movsd %xmm6, 4 * SIZE(C1) + movhps %xmm6, 6 * SIZE(C1) + + movsd %xmm5, 0 * SIZE(C1, LDC) + movhps %xmm5, 2 * SIZE(C1, LDC) + movsd %xmm7, 4 * SIZE(C1, LDC) + movhps %xmm7, 6 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $8, KK +#endif + + addl $8 * SIZE, C1 + decl I + jg .L11 + ALIGN_4 + +.L20: + movl M, I + testl $4, I + jle .L30 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movaps -16 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movaps -16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BB), %xmm0 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + addps %xmm0, %xmm5 + movaps -28 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BB), %xmm0 + addps %xmm1, %xmm6 + movaps 0 * SIZE(BB), %xmm1 + addps %xmm0, %xmm7 + movaps -24 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps -12 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps -8 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps -20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps -4 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 0 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + mulps 4 * SIZE(BB), %xmm2 + addps %xmm1, %xmm4 + movaps 8 * SIZE(BB), %xmm1 + addps %xmm2, %xmm5 + movaps -12 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm1 + mulps 12 * SIZE(BB), %xmm2 + addps %xmm1, %xmm6 + movaps 32 * SIZE(BB), %xmm1 + addps %xmm2, %xmm7 + movaps -8 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm3 + mulps 20 * SIZE(BB), %xmm2 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm2, %xmm5 + movaps -4 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm3 + mulps 28 * SIZE(BB), %xmm2 + addps %xmm3, %xmm6 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm2, %xmm7 + movaps 16 * SIZE(AA), %xmm2 + + subl $-32 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BB), %xmm0 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + addps %xmm0, %xmm5 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhps 2 * SIZE(C1), %xmm0 + + movsd 0 * SIZE(C1, LDC), %xmm1 + movhps 2 * SIZE(C1, LDC), %xmm1 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhps %xmm4, 2 * SIZE(C1) + movsd %xmm5, 0 * SIZE(C1, LDC) + movhps %xmm5, 2 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, C1 + ALIGN_4 + +.L30: + movl M, I + testl $2, I + jle .L40 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -24 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movsd -16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BB), %xmm0 + addps %xmm1, %xmm4 + movsd -24 * SIZE(BB), %xmm1 + addps %xmm0, %xmm5 + movsd -30 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BB), %xmm0 + addps %xmm1, %xmm6 + movsd 0 * SIZE(BB), %xmm1 + addps %xmm0, %xmm7 + movsd -28 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps -12 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movsd -8 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movsd -26 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps -4 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movsd 16 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movsd -16 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + mulps 4 * SIZE(BB), %xmm2 + addps %xmm1, %xmm4 + movsd 8 * SIZE(BB), %xmm1 + addps %xmm2, %xmm5 + movsd -22 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm1 + mulps 12 * SIZE(BB), %xmm2 + addps %xmm1, %xmm6 + movsd 32 * SIZE(BB), %xmm1 + addps %xmm2, %xmm7 + movsd -20 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm3 + mulps 20 * SIZE(BB), %xmm2 + addps %xmm3, %xmm4 + movsd 24 * SIZE(BB), %xmm3 + addps %xmm2, %xmm5 + movsd -18 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm3 + mulps 28 * SIZE(BB), %xmm2 + addps %xmm3, %xmm6 + movsd 48 * SIZE(BB), %xmm3 + addps %xmm2, %xmm7 + movsd -8 * SIZE(AA), %xmm2 + + subl $-16 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: + movsd ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BB), %xmm0 + addps %xmm1, %xmm4 + movsd -24 * SIZE(BB), %xmm1 + addps %xmm0, %xmm5 + movsd -30 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movsd 0 * SIZE(C1, LDC), %xmm1 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movsd %xmm5, 0 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, C1 + ALIGN_4 + +.L40: + movl M, I + testl $1, I + jle .L49 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB /* because it's doubled */ +#endif + + movss -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movss -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movss -28 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movss -16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + mulss %xmm0, %xmm1 + mulss -28 * SIZE(BB), %xmm0 + addss %xmm1, %xmm4 + movss -24 * SIZE(BB), %xmm1 + addss %xmm0, %xmm5 + movss -31 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm1 + mulss -20 * SIZE(BB), %xmm0 + addss %xmm1, %xmm6 + movss 0 * SIZE(BB), %xmm1 + addss %xmm0, %xmm7 + movss -30 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss -12 * SIZE(BB), %xmm0 + addss %xmm3, %xmm4 + movss -8 * SIZE(BB), %xmm3 + addss %xmm0, %xmm5 + movss -29 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss -4 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 16 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss -24 * SIZE(AA), %xmm0 + mulss %xmm2, %xmm1 + mulss 4 * SIZE(BB), %xmm2 + addss %xmm1, %xmm4 + movss 8 * SIZE(BB), %xmm1 + addss %xmm2, %xmm5 + movss -27 * SIZE(AA), %xmm2 + mulss %xmm2, %xmm1 + mulss 12 * SIZE(BB), %xmm2 + addss %xmm1, %xmm6 + movss 32 * SIZE(BB), %xmm1 + addss %xmm2, %xmm7 + movss -26 * SIZE(AA), %xmm2 + mulss %xmm2, %xmm3 + mulss 20 * SIZE(BB), %xmm2 + addss %xmm3, %xmm4 + movss 24 * SIZE(BB), %xmm3 + addss %xmm2, %xmm5 + movss -25 * SIZE(AA), %xmm2 + mulss %xmm2, %xmm3 + mulss 28 * SIZE(BB), %xmm2 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm2, %xmm7 + movss -20 * SIZE(AA), %xmm2 + + subl $-8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: + movss ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L48 + ALIGN_4 + +.L46: + mulss %xmm0, %xmm1 + mulss -28 * SIZE(BB), %xmm0 + addss %xmm1, %xmm4 + movss -24 * SIZE(BB), %xmm1 + addss %xmm0, %xmm5 + movss -31 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addss %xmm6, %xmm4 + addss %xmm7, %xmm5 + + mulss %xmm3, %xmm4 + mulss %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movss 0 * SIZE(C1), %xmm0 + movss 0 * SIZE(C1, LDC), %xmm1 + + addss %xmm0, %xmm4 + addss %xmm1, %xmm5 +#endif + + movss %xmm4, 0 * SIZE(C1) + movss %xmm5, 0 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + addl %eax, C + decl J + jg .L01 + ALIGN_4 + +.L50: + movl N, %eax + testl $1, %eax + jle .L999 + ALIGN_4 + +.L51: + leal 32 * SIZE + BUFFER, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sarl $3, %eax + jle .L55 + ALIGN_4 + +.L52: + movss -32 * SIZE(B), %xmm0 + movss -31 * SIZE(B), %xmm1 + movss -30 * SIZE(B), %xmm2 + movss -29 * SIZE(B), %xmm3 + movss -28 * SIZE(B), %xmm4 + movss -27 * SIZE(B), %xmm5 + movss -26 * SIZE(B), %xmm6 + movss -25 * SIZE(B), %xmm7 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, -32 * SIZE(BB) + movaps %xmm1, -28 * SIZE(BB) + movaps %xmm2, -24 * SIZE(BB) + movaps %xmm3, -20 * SIZE(BB) + movaps %xmm4, -16 * SIZE(BB) + movaps %xmm5, -12 * SIZE(BB) + movaps %xmm6, -8 * SIZE(BB) + movaps %xmm7, -4 * SIZE(BB) + + addl $ 8 * SIZE, B + subl $-32 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: + movl K, %eax + andl $7, %eax + BRANCH + jle .L60 + ALIGN_4 + +.L56: + movss -32 * SIZE(B), %xmm0 + shufps $0, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(BB) + + addl $1 * SIZE, B + addl $4 * SIZE, BB + decl %eax + jne .L56 + ALIGN_4 + +.L60: + movl C, C1 + movl A, AA + movl M, I + sarl $3, I + jle .L70 + ALIGN_4 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB /* because it's doubled */ +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movaps -16 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movaps -16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + prefetcht0 3 * SIZE(C1) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $8, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm1, %xmm0 + mulps -28 * SIZE(AA), %xmm1 + addps %xmm0, %xmm4 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm1, %xmm6 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + mulps -20 * SIZE(AA), %xmm1 + addps %xmm0, %xmm5 + movaps 0 * SIZE(AA), %xmm0 + addps %xmm1, %xmm7 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm2 + mulps -12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm4 + movaps -8 * SIZE(AA), %xmm2 + addps %xmm1, %xmm6 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm2 + mulps -4 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 16 * SIZE(AA), %xmm2 + addps %xmm1, %xmm7 + movaps 0 * SIZE(BB), %xmm1 + mulps %xmm3, %xmm0 + mulps 4 * SIZE(AA), %xmm3 + addps %xmm0, %xmm4 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm3, %xmm6 + movaps -12 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm0 + mulps 12 * SIZE(AA), %xmm3 + addps %xmm0, %xmm5 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps -8 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm2 + mulps 20 * SIZE(AA), %xmm3 + addps %xmm2, %xmm4 + movaps 24 * SIZE(AA), %xmm2 + addps %xmm3, %xmm6 + movaps -4 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm2 + mulps 28 * SIZE(AA), %xmm3 + addps %xmm2, %xmm5 + movaps 48 * SIZE(AA), %xmm2 + addps %xmm3, %xmm7 + movaps 16 * SIZE(BB), %xmm3 + + addl $ 64 * SIZE, AA + subl $-32 * SIZE, BB + decl %eax + jne .L62 + ALIGN_4 + +.L65: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm1, %xmm0 + mulps -28 * SIZE(AA), %xmm1 + addps %xmm0, %xmm4 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm1, %xmm6 + movaps -28 * SIZE(BB), %xmm1 + + addl $8 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm6 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhps 2 * SIZE(C1), %xmm0 + movsd 4 * SIZE(C1), %xmm2 + movhps 6 * SIZE(C1), %xmm2 + + addps %xmm0, %xmm4 + addps %xmm2, %xmm6 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhps %xmm4, 2 * SIZE(C1) + movsd %xmm6, 4 * SIZE(C1) + movhps %xmm6, 6 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $8, KK +#endif + + addl $8 * SIZE, C1 + decl I + jg .L61 + ALIGN_4 + +.L70: + movl M, I + testl $4, I + jle .L80 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB /* because it's doubled */ +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movaps -16 * SIZE(AA), %xmm2 + movaps -16 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm1 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm1, %xmm5 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm1 + movaps -20 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm1 + movaps 0 * SIZE(AA), %xmm0 + addps %xmm1, %xmm5 + movaps 0 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + movaps -12 * SIZE(AA), %xmm2 + addps %xmm3, %xmm4 + movaps -12 * SIZE(BB), %xmm3 + mulps %xmm2, %xmm3 + movaps -8 * SIZE(AA), %xmm2 + addps %xmm3, %xmm5 + movaps -8 * SIZE(BB), %xmm3 + mulps %xmm2, %xmm3 + movaps -4 * SIZE(AA), %xmm2 + addps %xmm3, %xmm4 + movaps -4 * SIZE(BB), %xmm3 + mulps %xmm2, %xmm3 + movaps 16 * SIZE(AA), %xmm2 + addps %xmm3, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + + subl $-32 * SIZE, AA + subl $-32 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm5, %xmm4 + mulps %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhps 2 * SIZE(C1), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhps %xmm4, 2 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, C1 + ALIGN_4 + +.L80: + movl M, I + testl $2, I + jle .L90 + +.L81: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB /* because it's doubled */ +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -16 * SIZE(BB), %xmm3 + movsd -24 * SIZE(AA), %xmm2 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L85 + ALIGN_4 + +.L82: + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm1 + movsd -28 * SIZE(AA), %xmm0 + addps %xmm1, %xmm5 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm1 + movsd -26 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm1 + movsd -16 * SIZE(AA), %xmm0 + addps %xmm1, %xmm5 + movsd -0 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + movsd -22 * SIZE(AA), %xmm2 + addps %xmm3, %xmm4 + movsd -12 * SIZE(BB), %xmm3 + mulps %xmm2, %xmm3 + movsd -20 * SIZE(AA), %xmm2 + addps %xmm3, %xmm5 + movsd -8 * SIZE(BB), %xmm3 + mulps %xmm2, %xmm3 + movsd -18 * SIZE(AA), %xmm2 + addps %xmm3, %xmm4 + movsd -4 * SIZE(BB), %xmm3 + mulps %xmm2, %xmm3 + movsd -8 * SIZE(AA), %xmm2 + addps %xmm3, %xmm5 + movsd 16 * SIZE(BB), %xmm3 + + subl $-16 * SIZE, AA + subl $-32 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: + movsd ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L88 + ALIGN_4 + +.L86: + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -28 * SIZE(BB), %xmm1 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addps %xmm5, %xmm4 + mulps %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + addps %xmm0, %xmm4 +#endif + movsd %xmm4, 0 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + addl $2 * SIZE, C1 + ALIGN_4 + +.L90: + movl M, I + testl $1, I + jle .L99 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movss -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movss -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movss -16 * SIZE(BB), %xmm3 + movss -28 * SIZE(AA), %xmm2 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + mulss %xmm0, %xmm1 + movss -31 * SIZE(AA), %xmm0 + addss %xmm1, %xmm4 + movss -28 * SIZE(BB), %xmm1 + mulss %xmm0, %xmm1 + movss -30 * SIZE(AA), %xmm0 + addss %xmm1, %xmm5 + movss -24 * SIZE(BB), %xmm1 + mulss %xmm0, %xmm1 + movss -29 * SIZE(AA), %xmm0 + addss %xmm1, %xmm4 + movss -20 * SIZE(BB), %xmm1 + mulss %xmm0, %xmm1 + movss -24 * SIZE(AA), %xmm0 + addss %xmm1, %xmm5 + movss -0 * SIZE(BB), %xmm1 + mulss %xmm2, %xmm3 + movss -27 * SIZE(AA), %xmm2 + addss %xmm3, %xmm4 + movss -12 * SIZE(BB), %xmm3 + mulss %xmm2, %xmm3 + movss -26 * SIZE(AA), %xmm2 + addss %xmm3, %xmm5 + movss -8 * SIZE(BB), %xmm3 + mulss %xmm2, %xmm3 + movss -25 * SIZE(AA), %xmm2 + addss %xmm3, %xmm4 + movss -4 * SIZE(BB), %xmm3 + mulss %xmm2, %xmm3 + movss -20 * SIZE(AA), %xmm2 + addss %xmm3, %xmm5 + movss 16 * SIZE(BB), %xmm3 + + subl $ -8 * SIZE, AA + subl $-32 * SIZE, BB + decl %eax + jne .L92 + ALIGN_4 + +.L95: + movss ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulss %xmm0, %xmm1 + movss -31 * SIZE(AA), %xmm0 + addss %xmm1, %xmm4 + movss -28 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: + addss %xmm5, %xmm4 + mulss %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movss 0 * SIZE(C1), %xmm0 + addss %xmm0, %xmm4 +#endif + movss %xmm4, 0 * SIZE(C1) + ALIGN_4 + +.L99: + addl LDC, C + ALIGN_4 + + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_8x2_sse.S b/kernel/x86/gemm_kernel_8x2_sse.S new file mode 100644 index 0000000..c389764 --- /dev/null +++ b/kernel/x86/gemm_kernel_8x2_sse.S @@ -0,0 +1,2746 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA 16 + STACK + ARGS(%esi) +#define STACK_A 20 + STACK + ARGS(%esi) +#define STACK_B 24 + STACK + ARGS(%esi) +#define STACK_C 28 + STACK + ARGS(%esi) +#define STACK_LDC 32 + STACK + ARGS(%esi) +#define STACK_OFFT 36 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define PREFETCHSIZE 48 /* for PIII */ + +#define AA %edx +#define BB %ecx + +#if !defined(HAVE_SSE2) || defined(OPTERON) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + mulps 4 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm4; \ + movaps 0 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm5; \ + movaps 4 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ + mulps %xmm0, %xmm2; \ + mulps 4 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 8 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * SIZE * 2(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm4; \ + movaps 8 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm5; \ + movaps 12 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * SIZE * 2(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm1, %xmm3; \ + mulps 20 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm4; \ + movaps 16 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm5; \ + movaps 20 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ + mulps %xmm1, %xmm3; \ + mulps 20 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 24 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * SIZE * 2(AA), %xmm1 + +#define KERNEL4(address) \ + mulps %xmm1, %xmm3; \ + mulps 28 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm4; \ + movaps 24 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm5; \ + movaps 28 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ + mulps %xmm1, %xmm3; \ + mulps 28 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * SIZE * 2(AA), %xmm1 + +#define KERNEL5(address) \ + mulps %xmm0, %xmm2; \ + mulps 36 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm4; \ + movaps 32 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm5; \ + movaps 36 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ + mulps %xmm0, %xmm2; \ + mulps 36 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 40 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 40 * SIZE + (address) * SIZE * 2(AA), %xmm0 + +#define KERNEL6(address) \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm4; \ + movaps 40 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm5; \ + movaps 44 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 64 * SIZE + (address) * SIZE * 2(AA), %xmm0 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm3; \ + mulps 52 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm4; \ + movaps 48 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm5; \ + movaps 52 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ + mulps %xmm1, %xmm3; \ + mulps 52 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 56 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 56 * SIZE + (address) * SIZE * 2(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + mulps 60 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm4; \ + movaps 56 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm5; \ + movaps 60 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ + mulps %xmm1, %xmm3; \ + mulps 60 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 80 * SIZE + (address) * SIZE * 2(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movd STACK_ALPHA, %mm7 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC +#ifdef TRMMKERNEL + movd STACK_OFFT, %mm4 +#endif + + movd %mm7, 0 * SIZE + ALPHA + movd %mm7, 1 * SIZE + ALPHA + movd %mm7, 2 * SIZE + ALPHA + movd %mm7, 3 * SIZE + ALPHA + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK +#ifdef TRMMKERNEL + movd %mm4, OFFSET + movd %mm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + leal (, LDC, SIZE), LDC + + sarl $1, %eax # j = (n >> 1) + movl %eax, J + jle .L100 + ALIGN_2 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + movl K, %eax + leal BUFFER, %ecx + sarl $2, %eax + jle .L03 + ALIGN_4 + +.L02: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + movss 4 * SIZE(B), %xmm4 + movss 5 * SIZE(B), %xmm5 + movss 6 * SIZE(B), %xmm6 + movss 7 * SIZE(B), %xmm7 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $32 * SIZE, %ecx + decl %eax + BRANCH + jne .L02 + ALIGN_2 + +.L03: + movl K, %eax + andl $3, %eax + BRANCH + jle .L05 + ALIGN_2 + +.L04: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + addl $2 * SIZE, B + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + addl $8 * SIZE, %ecx + decl %eax + jne .L04 + ALIGN_4 + +.L05: + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + sarl $3, %ebx # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: +#ifdef PENTIUM4 +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + + prefetchnta 7 * SIZE(%esi) + prefetchnta 7 * SIZE(%esi, %ebp) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $8, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + andl $-8, %eax + NOBRANCH + je .L12 + sall $3, %eax + +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $64 * 1, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $64 * 2, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $64 * 3, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $64 * 4, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $64 * 5, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $64 * 6, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $64 * 7, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $64 * 8 * SIZE, AA + addl $64 * 8 * SIZE, BB + subl $64 * 8, %eax + BRANCH + jg .L1X + +.L11: + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + + prefetchnta 8 * SIZE(%esi) + prefetchnta 8 * SIZE(%esi, %ebp) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $8, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L12 + ALIGN_2 + +.L11: +#ifdef CORE_KATMAI + prefetcht0 PREFETCHSIZE * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 16 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 8 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 16 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 24) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 32) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 36 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 48 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 48 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 40) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 40 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 44 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 56 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 48) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 48 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 52 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 64 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 56) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 60 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 72 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 72 * SIZE(AA), %xmm1 + + addl $64 * SIZE, BB + addl $64 * SIZE, AA + decl %eax + jne .L11 + ALIGN_2 +#endif + +.L12: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + movaps 4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm1 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm1, %xmm5 + movaps 4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm1 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm1, %xmm7 + + addl $8 * SIZE, AA + addl $8 * SIZE, BB + subl $1, %eax + jg .L13 + ALIGN_4 + +.L14: + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + mulps %xmm3, %xmm6 + mulps %xmm3, %xmm7 + +#ifndef TRMMKERNEL + shufps $0xe4, %xmm4, %xmm4 + shufps $0xe4, %xmm5, %xmm5 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + shufps $0xe4, %xmm6, %xmm6 + shufps $0xe4, %xmm7, %xmm7 + + movsd 0 * SIZE(%esi, LDC), %xmm2 + movhps 2 * SIZE(%esi, LDC), %xmm2 + movsd 4 * SIZE(%esi, LDC), %xmm3 + movhps 6 * SIZE(%esi, LDC), %xmm3 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm6 + addps %xmm2, %xmm5 + addps %xmm3, %xmm7 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + movsd %xmm6, 4 * SIZE(%esi) + movhps %xmm6, 6 * SIZE(%esi) + + movsd %xmm5, 0 * SIZE(%esi, LDC) + movhps %xmm5, 2 * SIZE(%esi, LDC) + movsd %xmm7, 4 * SIZE(%esi, LDC) + movhps %xmm7, 6 * SIZE(%esi, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $8, KK +#endif + + addl $8 * SIZE, %esi + BRANCH + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L30: + movl M, %ebx + andl $7, %ebx + jle .L99 + + testl $4, %ebx + jle .L50 + +#if (L1_DATA_LINESIZE == 64) +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L32 + ALIGN_2 + +.L31: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 8 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 36 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 40 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 20 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm2 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L31 + ALIGN_2 + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L32 + ALIGN_2 + +.L31: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 16 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 20 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 48 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 44 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 72 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L31 + ALIGN_2 +#endif + +.L32: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L34 + +.L33: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L33 + ALIGN_4 + +.L34: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + addps %xmm0, %xmm4 + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 2 * SIZE(%esi, LDC), %xmm0 + addps %xmm0, %xmm5 +#endif + +#ifdef HAVE_SSE2 + movsd %xmm4, 0 * SIZE(%esi) + unpckhpd %xmm4, %xmm4 + movsd %xmm4, 2 * SIZE(%esi) + + movsd %xmm5, 0 * SIZE(%esi, LDC) + unpckhpd %xmm5, %xmm5 + movsd %xmm5, 2 * SIZE(%esi, LDC) +#else + movlps %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + + movlps %xmm5, 0 * SIZE(%esi, LDC) + movhps %xmm5, 2 * SIZE(%esi, LDC) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, %esi + ALIGN_2 + +.L50: + testl $2, %ebx + jle .L70 + + +#if (L1_DATA_LINESIZE == 64) +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L52 + ALIGN_2 + +.L51: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L51 + ALIGN_2 + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L52 + ALIGN_2 + +.L51: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 16 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 12 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 20 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 40 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 48 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 44 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 52 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 72 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L51 + ALIGN_2 +#endif + +.L52: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L53 + ALIGN_4 + +.L54: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + +#ifndef TRMMKERNEL +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(%esi), %xmm0 + addps %xmm0, %xmm4 +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(%esi, LDC), %xmm0 + addps %xmm0, %xmm5 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + movlps %xmm5, 0 * SIZE(%esi, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi + ALIGN_2 + +.L70: + testl $1, %ebx + jle .L99 + +#if (L1_DATA_LINESIZE == 64) +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movss 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movss 16 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB /* because it's doubled */ + + movss 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movss 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L72 + ALIGN_2 + +.L71: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 20 * SIZE(BB), %xmm0 + addss %xmm3, %xmm4 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm5 + movss 3 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + mulss 36 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 40 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 5 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm2 + mulss 44 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 52 * SIZE(BB), %xmm1 + addss %xmm3, %xmm4 + movss 56 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 7 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 60 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L71 + ALIGN_2 + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movss 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movss 8 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB /* because it's doubled */ + + movss 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movss 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L72 + ALIGN_2 + +.L71: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 16 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm2 + mulss 20 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 3 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 40 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + mulss 36 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 48 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 5 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 44 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 56 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm2 + mulss 52 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 64 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 7 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 60 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 72 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L71 + ALIGN_2 +#endif + +.L72: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movss ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L74 + +.L73: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L73 + ALIGN_4 + +.L74: + addss %xmm6, %xmm4 + addss %xmm7, %xmm5 + + mulss %xmm3, %xmm4 + mulss %xmm3, %xmm5 + +#ifndef TRMMKERNEL + addss 0 * SIZE(%esi), %xmm4 + addss 0 * SIZE(%esi, LDC), %xmm5 +#endif + + movss %xmm4, 0 * SIZE(%esi) + movss %xmm5, 0 * SIZE(%esi, LDC) + + addl $1 * SIZE, %esi + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_2 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + addl %eax, C # c += 2 * ldc + BRANCH + decl J # j -- + jg .L01 + ALIGN_2 + +.L100: + movl N, %eax + testl $1, %eax + jle .L999 + ALIGN_2 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + movl K, %eax + leal BUFFER, %ecx + sarl $3, %eax + jle .L103 + ALIGN_4 + +.L102: + prefetchnta 96 * SIZE(B) + + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + movss 4 * SIZE(B), %xmm4 + movss 5 * SIZE(B), %xmm5 + movss 6 * SIZE(B), %xmm6 + movss 7 * SIZE(B), %xmm7 + addl $ 8 * SIZE, B + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) + addl $32 * SIZE, %ecx + + decl %eax + BRANCH + jne .L102 + ALIGN_2 + +.L103: + movl K, %eax + andl $7, %eax + BRANCH + jle .L105 + ALIGN_2 + +.L104: + movss 0 * SIZE(B), %xmm0 + addl $1 * SIZE, B + + shufps $0, %xmm0, %xmm0 + + movaps %xmm0, 0 * SIZE(%ecx) + addl $4 * SIZE, %ecx + decl %eax + jne .L104 + ALIGN_4 + +.L105: + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + sarl $3, %ebx # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#if (L1_DATA_LINESIZE == 64) +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $8, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L112 + ALIGN_2 + +.L111: + mulps %xmm2, %xmm0 + mulps 4 * SIZE(AA), %xmm2 + addps %xmm0, %xmm4 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm0 + mulps 12 * SIZE(AA), %xmm2 + addps %xmm0, %xmm6 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm1 + mulps 20 * SIZE(AA), %xmm2 + addps %xmm1, %xmm4 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm1 + mulps 28 * SIZE(AA), %xmm2 + addps %xmm1, %xmm6 + movaps 48 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm3, %xmm0 + mulps 36 * SIZE(AA), %xmm3 + addps %xmm0, %xmm4 + movaps 40 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm0 + mulps 44 * SIZE(AA), %xmm3 + addps %xmm0, %xmm6 + movaps 64 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm1 + mulps 52 * SIZE(AA), %xmm3 + addps %xmm1, %xmm4 + movaps 56 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm1 + mulps 60 * SIZE(AA), %xmm3 + addps %xmm1, %xmm6 + movaps 80 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + addl $64 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L111 + ALIGN_2 + +#else + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $8, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L112 + ALIGN_2 + +.L111: + mulps %xmm2, %xmm0 + mulps 4 * SIZE(AA), %xmm2 + addps %xmm0, %xmm4 + movaps 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm1 + mulps 12 * SIZE(AA), %xmm2 + addps %xmm1, %xmm6 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 16 * SIZE(BB), %xmm2 + mulps %xmm3, %xmm0 + mulps 20 * SIZE(AA), %xmm3 + addps %xmm0, %xmm4 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 12 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm1 + mulps 28 * SIZE(AA), %xmm3 + addps %xmm1, %xmm6 + movaps 40 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm2, %xmm0 + mulps 36 * SIZE(AA), %xmm2 + addps %xmm0, %xmm4 + movaps 48 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 20 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm1 + mulps 44 * SIZE(AA), %xmm2 + addps %xmm1, %xmm6 + movaps 56 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm3, %xmm0 + mulps 52 * SIZE(AA), %xmm3 + addps %xmm0, %xmm4 + movaps 64 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm1 + mulps 60 * SIZE(AA), %xmm3 + addps %xmm1, %xmm6 + movaps 72 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 40 * SIZE(BB), %xmm3 + + addl $64 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L111 + ALIGN_2 +#endif + +.L112: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + +.L113: + movaps 0 * SIZE(BB), %xmm2 + movaps 0 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm0 + addps %xmm0, %xmm4 + mulps 4 * SIZE(AA), %xmm2 + addps %xmm2, %xmm5 + + addl $8 * SIZE, AA + addl $4 * SIZE, BB + subl $1, %eax + jg .L113 + ALIGN_4 + +.L114: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + addps %xmm0, %xmm4 + + movsd 4 * SIZE(%esi), %xmm0 + movhps 6 * SIZE(%esi), %xmm0 + addps %xmm0, %xmm5 +#endif + +#ifdef HAVE_SSE2 + movsd %xmm4, 0 * SIZE(%esi) + unpckhpd %xmm4, %xmm4 + movsd %xmm4, 2 * SIZE(%esi) + + movsd %xmm5, 4 * SIZE(%esi) + unpckhpd %xmm5, %xmm5 + movsd %xmm5, 6 * SIZE(%esi) +#else + movlps %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + + movlps %xmm5, 4 * SIZE(%esi) + movhps %xmm5, 6 * SIZE(%esi) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $8, KK +#endif + + addl $8 * SIZE, %esi + BRANCH + decl %ebx # i -- + jg .L110 + ALIGN_2 + +.L130: + movl M, %ebx + andl $7, %ebx + jle .L999 + + testl $4, %ebx + jle .L150 + +#if (L1_DATA_LINESIZE == 64) +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L132 + ALIGN_2 + +.L131: + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + mulps 4 * SIZE(BB), %xmm0 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + mulps 8 * SIZE(BB), %xmm0 + addps %xmm0, %xmm6 + movaps 12 * SIZE(AA), %xmm0 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + mulps 20 * SIZE(BB), %xmm1 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + mulps 24 * SIZE(BB), %xmm1 + addps %xmm1, %xmm6 + movaps 28 * SIZE(AA), %xmm1 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L131 + ALIGN_2 + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L132 + ALIGN_2 + +.L131: + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + mulps 4 * SIZE(BB), %xmm0 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 16 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm3 + movaps 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + mulps 12 * SIZE(BB), %xmm1 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + mulps %xmm0, %xmm2 + movaps 20 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + mulps 20 * SIZE(BB), %xmm0 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm3 + movaps 28 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + mulps 28 * SIZE(BB), %xmm1 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L131 + ALIGN_2 +#endif + +.L132: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L134 + +.L133: + movaps 0 * SIZE(BB), %xmm2 + movaps 0 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L133 + ALIGN_4 + +.L134: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + + mulps %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + addps %xmm0, %xmm4 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + addl $4 * SIZE, %esi + ALIGN_2 + +.L150: + testl $2, %ebx + jle .L170 + +#if (L1_DATA_LINESIZE == 64) +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L152 + ALIGN_2 + +.L151: + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L151 + ALIGN_2 + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L152 + ALIGN_2 + +.L151: + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 16 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm6 + movaps 12 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm4 + movaps 20 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 40 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L151 + ALIGN_2 +#endif + +.L152: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L154 + +.L153: + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L153 + ALIGN_4 + +.L154: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + + mulps %xmm3, %xmm4 + +#ifndef TRMMKERNEL +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(%esi), %xmm0 + addps %xmm0, %xmm4 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + addl $2 * SIZE, %esi + ALIGN_2 + +.L170: + testl $1, %ebx + jle .L999 + +#if (L1_DATA_LINESIZE == 64) +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movss 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movss 16 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movss 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movss 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L172 + ALIGN_2 + +.L171: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + mulss 4 * SIZE(BB), %xmm0 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 2 * SIZE(AA), %xmm0 + mulss 8 * SIZE(BB), %xmm0 + addss %xmm0, %xmm6 + movss 3 * SIZE(AA), %xmm0 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + mulss 20 * SIZE(BB), %xmm1 + movss 48 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 6 * SIZE(AA), %xmm1 + mulss 24 * SIZE(BB), %xmm1 + addss %xmm1, %xmm6 + movss 7 * SIZE(AA), %xmm1 + mulss 28 * SIZE(BB), %xmm1 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L171 + ALIGN_2 + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movss 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movss 8 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movss 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movss 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L172 + ALIGN_2 + +.L171: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + mulss 4 * SIZE(BB), %xmm0 + movss 16 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + movss 3 * SIZE(AA), %xmm0 + addss %xmm3, %xmm6 + mulss 12 * SIZE(BB), %xmm0 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + movss 5 * SIZE(AA), %xmm1 + addss %xmm2, %xmm4 + mulss 20 * SIZE(BB), %xmm1 + movss 32 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + movss 7 * SIZE(AA), %xmm1 + addss %xmm3, %xmm6 + mulss 28 * SIZE(BB), %xmm1 + movss 40 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L171 + ALIGN_2 +#endif + +.L172: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movss ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L174 + +.L173: + movss 0 * SIZE(AA), %xmm0 + movss 0 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L173 + ALIGN_4 + +.L174: + addss %xmm5, %xmm4 + addss %xmm7, %xmm6 + addss %xmm6, %xmm4 + mulss %xmm3, %xmm4 + +#ifndef TRMMKERNEL + addss 0 * SIZE(%esi), %xmm4 +#endif + movss %xmm4, 0 * SIZE(%esi) + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_2 + + + EPILOGUE diff --git a/kernel/x86/gemm_ncopy_2.S b/kernel/x86/gemm_ncopy_2.S new file mode 100644 index 0000000..a2674c7 --- /dev/null +++ b/kernel/x86/gemm_ncopy_2.S @@ -0,0 +1,274 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 8 + +#define J 0 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define A 12 + STACK + ARGS(%esp) +#define LDA 16 + STACK + ARGS(%esp) +#define B 20 + STACK + ARGS(%esp) + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl B, %esi # ESI : offsetB + movl M, %edi + + movl A, %ebx # EBX : offsetA + movl LDA, %edx + leal (%ebx, %edx, SIZE), %ebp + + addl %edx, %edx + subl %edi, %edx # edx = 2 * lda - m + + movl N, %eax + sarl $1, %eax + movl %eax, J + je .L20 + ALIGN_3 + +.L21: +#if 0 + movl %edi, %ecx # ECX : I(Counter of M) + andl $-8, %ecx + leal (%ebx, %ecx, SIZE), %ebx + leal (%ebp, %ecx, SIZE), %ebp + negl %ecx + ALIGN_3 + +.Blocking1: + MMXLOAD (%ebx, %ecx, SIZE), %mm0 + MMXLOAD (%ebp, %ecx, SIZE), %mm1 + addl $8, %ecx + jl .Blocking1 + + movl %edi, %ecx # ECX : I(Counter of M) + andl $-8, %ecx + negl %ecx + leal (%ebx, %ecx, SIZE), %ebx + leal (%ebp, %ecx, SIZE), %ebp +#endif + + movl %edi, %ecx # ECX : I(Counter of M) + sarl $2, %ecx + je .L24 + ALIGN_3 + +.L25: +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(%ebx), %mm0 + MMXLOAD 0 * SIZE(%ebp), %mm1 + MMXLOAD 1 * SIZE(%ebx), %mm2 + MMXLOAD 1 * SIZE(%ebp), %mm3 + + MMXLOAD 2 * SIZE(%ebx), %mm4 + MMXLOAD 2 * SIZE(%ebp), %mm5 + MMXLOAD 3 * SIZE(%ebx), %mm6 + MMXLOAD 3 * SIZE(%ebp), %mm7 + + MMXSTORE %mm0, 0 * SIZE(%esi) + MMXSTORE %mm1, 1 * SIZE(%esi) + MMXSTORE %mm2, 2 * SIZE(%esi) + MMXSTORE %mm3, 3 * SIZE(%esi) + + MMXSTORE %mm4, 4 * SIZE(%esi) + MMXSTORE %mm5, 5 * SIZE(%esi) + MMXSTORE %mm6, 6 * SIZE(%esi) + MMXSTORE %mm7, 7 * SIZE(%esi) +#else + FLD 3 * SIZE(%ebp) + FLD 3 * SIZE(%ebx) + FLD 2 * SIZE(%ebp) + FLD 2 * SIZE(%ebx) + FLD 1 * SIZE(%ebp) + FLD 1 * SIZE(%ebx) + FLD 0 * SIZE(%ebp) + FLD 0 * SIZE(%ebx) + + FST 0 * SIZE(%esi) + FST 1 * SIZE(%esi) + FST 2 * SIZE(%esi) + FST 3 * SIZE(%esi) + FST 4 * SIZE(%esi) + FST 5 * SIZE(%esi) + FST 6 * SIZE(%esi) + FST 7 * SIZE(%esi) +#endif + addl $4 * SIZE, %ebx + addl $4 * SIZE, %ebp + addl $8 * SIZE, %esi + decl %ecx + jne .L25 + ALIGN_3 + +.L24: + movl %edi, %ecx + andl $3, %ecx + jle .L30 + ALIGN_3 + +.L31: +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(%ebx), %mm0 + MMXLOAD 0 * SIZE(%ebp), %mm1 + MMXSTORE %mm0, 0 * SIZE(%esi) + MMXSTORE %mm1, 1 * SIZE(%esi) +#else + FLD 0 * SIZE(%ebp) + FLD 0 * SIZE(%ebx) + FST 0 * SIZE(%esi) + FST 1 * SIZE(%esi) +#endif + addl $1 * SIZE, %ebx + addl $1 * SIZE, %ebp + addl $2 * SIZE, %esi + decl %ecx + jne .L31 + ALIGN_3 + +.L30: + leal (%ebx, %edx, SIZE), %ebx + leal (%ebp, %edx, SIZE), %ebp + decl J + jne .L21 + ALIGN_3 + +.L20: + movl N, %eax + andl $1,%eax + jle .L38 + ALIGN_3 + +.L39: + movl %edi, %ecx + sarl $3, %ecx + je .L42 + ALIGN_3 + +.L43: +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(%ebx), %mm0 + MMXLOAD 1 * SIZE(%ebx), %mm1 + MMXLOAD 2 * SIZE(%ebx), %mm2 + MMXLOAD 3 * SIZE(%ebx), %mm3 + MMXLOAD 4 * SIZE(%ebx), %mm4 + MMXLOAD 5 * SIZE(%ebx), %mm5 + MMXLOAD 6 * SIZE(%ebx), %mm6 + MMXLOAD 7 * SIZE(%ebx), %mm7 + + MMXSTORE %mm0, 0 * SIZE(%esi) + MMXSTORE %mm1, 1 * SIZE(%esi) + MMXSTORE %mm2, 2 * SIZE(%esi) + MMXSTORE %mm3, 3 * SIZE(%esi) + MMXSTORE %mm4, 4 * SIZE(%esi) + MMXSTORE %mm5, 5 * SIZE(%esi) + MMXSTORE %mm6, 6 * SIZE(%esi) + MMXSTORE %mm7, 7 * SIZE(%esi) +#else + FLD 7 * SIZE(%ebx) + FLD 6 * SIZE(%ebx) + FLD 5 * SIZE(%ebx) + FLD 4 * SIZE(%ebx) + FLD 3 * SIZE(%ebx) + FLD 2 * SIZE(%ebx) + FLD 1 * SIZE(%ebx) + FLD 0 * SIZE(%ebx) + + FST 0 * SIZE(%esi) + FST 1 * SIZE(%esi) + FST 2 * SIZE(%esi) + FST 3 * SIZE(%esi) + FST 4 * SIZE(%esi) + FST 5 * SIZE(%esi) + FST 6 * SIZE(%esi) + FST 7 * SIZE(%esi) +#endif + + addl $8 * SIZE, %ebx + addl $8 * SIZE, %esi + decl %ecx + jne .L43 + ALIGN_3 + +.L42: + movl %edi, %ecx + andl $7, %ecx + jle .L38 + ALIGN_3 + +.L49: +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(%ebx), %mm0 + MMXSTORE %mm0, 0 * SIZE(%esi) +#else + FLD 0 * SIZE(%ebx) + FST 0 * SIZE(%esi) +#endif + addl $1 * SIZE, %ebx + addl $1 * SIZE, %esi + decl %ecx + jne .L49 + ALIGN_3 + +.L38: + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_ncopy_2_sse.S b/kernel/x86/gemm_ncopy_2_sse.S new file mode 100644 index 0000000..1a8262c --- /dev/null +++ b/kernel/x86/gemm_ncopy_2_sse.S @@ -0,0 +1,215 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RPREFETCHSIZE 12 +#define WPREFETCHSIZE (RPREFETCHSIZE * 2) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht2 + +#define STACK 16 +#define ARGS 0 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define ARG_A 12 + STACK + ARGS(%esp) +#define ARG_LDA 16 + STACK + ARGS(%esp) +#define ARG_B 20 + STACK + ARGS(%esp) + +#define A %eax +#define B %ebx +#define LDA %ebp +#define A1 %ecx +#define A2 %edx +#define I %esi +#define J %edi + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_A, A + movl ARG_B, B + movl ARG_LDA, LDA + + sall $BASE_SHIFT, LDA + + movl N, J + sarl $1, J + je .L20 + ALIGN_3 + +.L10: + movl A, A1 + leal (A, LDA, 1), A2 + leal (A, LDA, 2), A + + movl M, I + sarl $2, I + je .L15 + ALIGN_3 + +.L12: + PREFETCH RPREFETCHSIZE * SIZE(A1) + + movsd 0 * SIZE(A1) , %xmm0 + movhps 0 * SIZE(A2) , %xmm0 + movsd 1 * SIZE(A1) , %xmm1 + movhps 1 * SIZE(A2) , %xmm1 + + PREFETCH RPREFETCHSIZE * SIZE(A2) + + movsd 2 * SIZE(A1) , %xmm2 + movhps 2 * SIZE(A2) , %xmm2 + movsd 3 * SIZE(A1) , %xmm3 + movhps 3 * SIZE(A2) , %xmm3 + + PREFETCHW (RPREFETCHSIZE + 0) * SIZE(B) + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + movaps %xmm2, 4 * SIZE(B) + movaps %xmm3, 6 * SIZE(B) + + addl $ 4 * SIZE, A1 + addl $ 4 * SIZE, A2 + subl $-8 * SIZE, B + decl I + jne .L12 + ALIGN_3 + +.L15: + testl $2, M + jle .L16 + + movsd 0 * SIZE(A1) , %xmm0 + movhps 0 * SIZE(A2) , %xmm0 + movsd 1 * SIZE(A1) , %xmm1 + movhps 1 * SIZE(A2) , %xmm1 + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + + addl $ 2 * SIZE, A1 + addl $ 2 * SIZE, A2 + subl $-4 * SIZE, B + ALIGN_4 + +.L16: + testl $1, M + jle .L19 + + movsd 0 * SIZE(A1) , %xmm0 + movhps 0 * SIZE(A2) , %xmm0 + + movaps %xmm0, 0 * SIZE(B) + + subl $-2 * SIZE, B + ALIGN_4 + +.L19: + decl J + jne .L10 + ALIGN_3 + +.L20: + testl $1, N + jle .L999 + + movl A, A1 + + movl M, I + sarl $2, I + je .L25 + ALIGN_3 + +.L22: + PREFETCH RPREFETCHSIZE * SIZE(A1) + + movsd 0 * SIZE(A1), %xmm0 + movhps 1 * SIZE(A1), %xmm0 + movsd 2 * SIZE(A1), %xmm1 + movhps 3 * SIZE(A1), %xmm1 + + PREFETCHW (RPREFETCHSIZE + 0) * SIZE(B) + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + + addl $ 4 * SIZE, A1 + subl $-4 * SIZE, B + decl I + jne .L22 + ALIGN_3 + +.L25: + testl $2, M + jle .L26 + + movsd 0 * SIZE(A1), %xmm0 + movhps 1 * SIZE(A1), %xmm0 + + movaps %xmm0, 0 * SIZE(B) + + addl $ 2 * SIZE, A1 + subl $-2 * SIZE, B + ALIGN_4 + +.L26: + testl $1, M + jle .L999 + + movsd 0 * SIZE(A1), %xmm0 + movsd %xmm0, 0 * SIZE(B) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_ncopy_4_sse.S b/kernel/x86/gemm_ncopy_4_sse.S new file mode 100644 index 0000000..3e919b2 --- /dev/null +++ b/kernel/x86/gemm_ncopy_4_sse.S @@ -0,0 +1,315 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RPREFETCHSIZE 12 +#define WPREFETCHSIZE (RPREFETCHSIZE * 4) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht2 + +#define STACK 16 +#define ARGS 0 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define ARG_A 12 + STACK + ARGS(%esp) +#define ARG_LDA 16 + STACK + ARGS(%esp) +#define ARG_B 20 + STACK + ARGS(%esp) + +#define A %eax +#define B %ebx +#define LDA %ebp +#define A1 %ecx +#define A2 %edx +#define I %esi +#define J %edi + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_A, A + movl ARG_B, B + movl ARG_LDA, LDA + + sall $BASE_SHIFT, LDA + + movl N, J + sarl $2, J + je .L20 + ALIGN_3 + +.L10: + movl A, A1 + leal (A, LDA, 2), A2 + leal (A, LDA, 4), A + + movl M, I + sarl $2, I + je .L15 + ALIGN_3 + +.L12: + PREFETCH RPREFETCHSIZE * SIZE(A1) + + movsd 0 * SIZE(A1) , %xmm0 + movhps 0 * SIZE(A1, LDA), %xmm0 + movsd 0 * SIZE(A2) , %xmm1 + movhps 0 * SIZE(A2, LDA), %xmm1 + + PREFETCH RPREFETCHSIZE * SIZE(A1, LDA) + + movsd 1 * SIZE(A1) , %xmm2 + movhps 1 * SIZE(A1, LDA), %xmm2 + movsd 1 * SIZE(A2) , %xmm3 + movhps 1 * SIZE(A2, LDA), %xmm3 + + PREFETCH RPREFETCHSIZE * SIZE(A2) + + movsd 2 * SIZE(A1) , %xmm4 + movhps 2 * SIZE(A1, LDA), %xmm4 + movsd 2 * SIZE(A2) , %xmm5 + movhps 2 * SIZE(A2, LDA), %xmm5 + + PREFETCH RPREFETCHSIZE * SIZE(A2, LDA) + + movsd 3 * SIZE(A1) , %xmm6 + movhps 3 * SIZE(A1, LDA), %xmm6 + movsd 3 * SIZE(A2) , %xmm7 + movhps 3 * SIZE(A2, LDA), %xmm7 + + PREFETCHW (RPREFETCHSIZE + 0) * SIZE(B) + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + movaps %xmm2, 4 * SIZE(B) + movaps %xmm3, 6 * SIZE(B) + + PREFETCHW (RPREFETCHSIZE + 8) * SIZE(B) + + movaps %xmm4, 8 * SIZE(B) + movaps %xmm5, 10 * SIZE(B) + movaps %xmm6, 12 * SIZE(B) + movaps %xmm7, 14 * SIZE(B) + + addl $ 4 * SIZE, A1 + addl $ 4 * SIZE, A2 + subl $-16 * SIZE, B + decl I + jne .L12 + ALIGN_3 + +.L15: + testl $2, M + jle .L16 + + movsd 0 * SIZE(A1) , %xmm0 + movhps 0 * SIZE(A1, LDA), %xmm0 + movsd 0 * SIZE(A2) , %xmm1 + movhps 0 * SIZE(A2, LDA), %xmm1 + + movsd 1 * SIZE(A1) , %xmm2 + movhps 1 * SIZE(A1, LDA), %xmm2 + movsd 1 * SIZE(A2) , %xmm3 + movhps 1 * SIZE(A2, LDA), %xmm3 + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + movaps %xmm2, 4 * SIZE(B) + movaps %xmm3, 6 * SIZE(B) + + addl $ 2 * SIZE, A1 + addl $ 2 * SIZE, A2 + subl $-8 * SIZE, B + ALIGN_4 + +.L16: + testl $1, M + jle .L19 + + movsd 0 * SIZE(A1) , %xmm0 + movhps 0 * SIZE(A1, LDA), %xmm0 + movsd 0 * SIZE(A2) , %xmm1 + movhps 0 * SIZE(A2, LDA), %xmm1 + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + + subl $-4 * SIZE, B + ALIGN_4 + +.L19: + decl J + jne .L10 + ALIGN_3 + +.L20: + testl $2, N + jle .L30 + + movl A, A1 + leal (A, LDA, 2), A + + movl M, I + sarl $2, I + je .L25 + ALIGN_3 + +.L22: + PREFETCH RPREFETCHSIZE * SIZE(A1) + + movsd 0 * SIZE(A1) , %xmm0 + movhps 0 * SIZE(A1, LDA), %xmm0 + movsd 1 * SIZE(A1) , %xmm1 + movhps 1 * SIZE(A1, LDA), %xmm1 + + PREFETCH RPREFETCHSIZE * SIZE(A1, LDA) + + movsd 2 * SIZE(A1) , %xmm2 + movhps 2 * SIZE(A1, LDA), %xmm2 + movsd 3 * SIZE(A1) , %xmm3 + movhps 3 * SIZE(A1, LDA), %xmm3 + + PREFETCHW (RPREFETCHSIZE + 0) * SIZE(B) + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + movaps %xmm2, 4 * SIZE(B) + movaps %xmm3, 6 * SIZE(B) + + addl $ 4 * SIZE, A1 + subl $-8 * SIZE, B + decl I + jne .L22 + ALIGN_3 + +.L25: + testl $2, M + jle .L26 + + movsd 0 * SIZE(A1) , %xmm0 + movhps 0 * SIZE(A1, LDA), %xmm0 + movsd 1 * SIZE(A1) , %xmm1 + movhps 1 * SIZE(A1, LDA), %xmm1 + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + + addl $ 2 * SIZE, A1 + addl $ 2 * SIZE, A2 + subl $-4 * SIZE, B + ALIGN_4 + +.L26: + testl $1, M + jle .L30 + + movsd 0 * SIZE(A1) , %xmm0 + movhps 0 * SIZE(A1, LDA), %xmm0 + + movaps %xmm0, 0 * SIZE(B) + + subl $-2 * SIZE, B + ALIGN_4 + +.L30: + testl $1, N + jle .L999 + + movl A, A1 + + movl M, I + sarl $2, I + je .L35 + ALIGN_3 + +.L32: + PREFETCH RPREFETCHSIZE * SIZE(A1) + + movsd 0 * SIZE(A1), %xmm0 + movhps 1 * SIZE(A1), %xmm0 + movsd 2 * SIZE(A1), %xmm1 + movhps 3 * SIZE(A1), %xmm1 + + PREFETCHW (RPREFETCHSIZE + 0) * SIZE(B) + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + + addl $ 4 * SIZE, A1 + subl $-4 * SIZE, B + decl I + jne .L32 + ALIGN_3 + +.L35: + testl $2, M + jle .L36 + + movsd 0 * SIZE(A1), %xmm0 + movhps 1 * SIZE(A1), %xmm0 + + movaps %xmm0, 0 * SIZE(B) + + addl $ 2 * SIZE, A1 + subl $-2 * SIZE, B + ALIGN_4 + +.L36: + testl $1, M + jle .L999 + + movsd 0 * SIZE(A1), %xmm0 + movsd %xmm0, 0 * SIZE(B) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_tcopy_2.S b/kernel/x86/gemm_tcopy_2.S new file mode 100644 index 0000000..61b7754 --- /dev/null +++ b/kernel/x86/gemm_tcopy_2.S @@ -0,0 +1,305 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 8 + +#define J 0 + STACK(%esp) +#define BOFFSET2 4 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define A 12 + STACK + ARGS(%esp) +#define LDA 16 + STACK + ARGS(%esp) +#define B 20 + STACK + ARGS(%esp) + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl A, %ebp + movl B, %edi + + movl M, %ebx + movl N, %eax + andl $-2, %eax + + imull %ebx, %eax # m * ( n & ~1) + + leal (%edi,%eax,SIZE), %eax # boffset2 = b + m * (n & ~1) + movl %eax, BOFFSET2 + + movl M, %esi +#ifdef DOUBLE + sall $4,%esi +#else + sall $3,%esi +#endif + + sarl $1, %ebx # if !(m & 1) goto L28 + movl %ebx, J + jle .L28 + ALIGN_4 + +.L39: + movl %ebp, %edx # aoffset1 = a + movl LDA, %eax + movl N, %ebx + + leal (%ebp, %eax,SIZE), %ecx # aoffset2 = a + lda + leal (%ecx, %eax,SIZE), %ebp # aoffset += 2 * lda + movl %edi, %eax # boffset1 = b_offset + addl $4 * SIZE, %edi # boffset += 4 + + sarl $2, %ebx + jle .L32 + ALIGN_4 + +.L36: +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(%edx), %mm0 + MMXLOAD 1 * SIZE(%edx), %mm1 + MMXLOAD 0 * SIZE(%ecx), %mm2 + MMXLOAD 1 * SIZE(%ecx), %mm3 + + MMXLOAD 2 * SIZE(%edx), %mm4 + MMXLOAD 3 * SIZE(%edx), %mm5 + MMXLOAD 2 * SIZE(%ecx), %mm6 + MMXLOAD 3 * SIZE(%ecx), %mm7 + + MMXSTORE %mm0, 0 * SIZE(%eax) + MMXSTORE %mm1, 1 * SIZE(%eax) + MMXSTORE %mm2, 2 * SIZE(%eax) + MMXSTORE %mm3, 3 * SIZE(%eax) + + addl %esi, %eax + + MMXSTORE %mm4, 0 * SIZE(%eax) + MMXSTORE %mm5, 1 * SIZE(%eax) + MMXSTORE %mm6, 2 * SIZE(%eax) + MMXSTORE %mm7, 3 * SIZE(%eax) +#else + FLD 1 * SIZE(%ecx) + FLD 0 * SIZE(%ecx) + FLD 1 * SIZE(%edx) + FLD 0 * SIZE(%edx) + + FST 0 * SIZE(%eax) + FST 1 * SIZE(%eax) + FST 2 * SIZE(%eax) + FST 3 * SIZE(%eax) + + addl %esi, %eax + + FLD 3 * SIZE(%ecx) + FLD 2 * SIZE(%ecx) + FLD 3 * SIZE(%edx) + FLD 2 * SIZE(%edx) + + FST 0 * SIZE(%eax) + FST 1 * SIZE(%eax) + FST 2 * SIZE(%eax) + FST 3 * SIZE(%eax) +#endif + + addl $4 * SIZE, %ecx + addl $4 * SIZE, %edx + addl %esi, %eax + decl %ebx + jne .L36 + ALIGN_4 + +.L32: + movl N, %ebx + test $2, %ebx + je .L37 + +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(%edx), %mm0 + MMXLOAD 1 * SIZE(%edx), %mm1 + MMXLOAD 0 * SIZE(%ecx), %mm2 + MMXLOAD 1 * SIZE(%ecx), %mm3 + + MMXSTORE %mm0, 0 * SIZE(%eax) + MMXSTORE %mm1, 1 * SIZE(%eax) + MMXSTORE %mm2, 2 * SIZE(%eax) + MMXSTORE %mm3, 3 * SIZE(%eax) +#else + FLD 1 * SIZE(%ecx) + FLD 0 * SIZE(%ecx) + FLD 1 * SIZE(%edx) + FLD 0 * SIZE(%edx) + + FST 0 * SIZE(%eax) + FST 1 * SIZE(%eax) + FST 2 * SIZE(%eax) + FST 3 * SIZE(%eax) +#endif + + addl $2 * SIZE, %ecx + addl $2 * SIZE, %edx + ALIGN_4 + +.L37: + movl N, %ebx + test $1, %ebx + je .L38 + + movl BOFFSET2, %eax + +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(%edx), %mm0 + MMXLOAD 0 * SIZE(%ecx), %mm1 + MMXSTORE %mm0, 0 * SIZE(%eax) + MMXSTORE %mm1, 1 * SIZE(%eax) +#else + FLD 0 * SIZE(%edx) + FST 0 * SIZE(%eax) + FLD 0 * SIZE(%ecx) + FST 1 * SIZE(%eax) +#endif + addl $2 * SIZE, %eax + movl %eax, BOFFSET2 + ALIGN_4 + +.L38: + decl J + jg .L39 + ALIGN_4 + +.L28: + movl M, %eax + movl N, %ebx + + testb $1, %al + je .L40 + + sarl $2, %ebx + jle .L41 + ALIGN_4 + +.L45: +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(%ebp), %mm0 + MMXLOAD 1 * SIZE(%ebp), %mm1 + MMXLOAD 2 * SIZE(%ebp), %mm2 + MMXLOAD 3 * SIZE(%ebp), %mm3 + + MMXSTORE %mm0, 0 * SIZE(%edi) + MMXSTORE %mm1, 1 * SIZE(%edi) + + addl %esi, %edi + + MMXSTORE %mm2, 0 * SIZE(%edi) + MMXSTORE %mm3, 1 * SIZE(%edi) +#else + FLD 0 * SIZE(%ebp) + FST 0 * SIZE(%edi) + FLD 1 * SIZE(%ebp) + FST 1 * SIZE(%edi) + addl %esi, %edi + + FLD 2 * SIZE(%ebp) + FST 0 * SIZE(%edi) + FLD 3 * SIZE(%ebp) + FST 1 * SIZE(%edi) +#endif + addl %esi,%edi + addl $4 * SIZE, %ebp + decl %ebx + jg .L45 + ALIGN_4 + +.L41: + movl N, %ebx + test $2, %ebx + je .L46 + +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(%ebp), %mm0 + MMXSTORE %mm0, 0 * SIZE(%edi) + MMXLOAD 1 * SIZE(%ebp), %mm1 + MMXSTORE %mm1, 1 * SIZE(%edi) +#else + FLD 1 * SIZE(%ebp) + FLD 0 * SIZE(%ebp) + FST 0 * SIZE(%edi) + FST 1 * SIZE(%edi) +#endif + + addl $2 * SIZE, %ebp + ALIGN_4 + +.L46: + movl N, %ebx + test $1, %ebx + je .L40 + + movl BOFFSET2, %eax + +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(%ebp), %mm0 + MMXSTORE %mm0, 0 * SIZE(%eax) +#else + FLD (%ebp) + FST (%eax) +#endif + ALIGN_4 + +.L40: + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS,%esp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_tcopy_2_sse.S b/kernel/x86/gemm_tcopy_2_sse.S new file mode 100644 index 0000000..de5f4ff --- /dev/null +++ b/kernel/x86/gemm_tcopy_2_sse.S @@ -0,0 +1,236 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RPREFETCHSIZE 12 +#define WPREFETCHSIZE (RPREFETCHSIZE * 2) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht2 + +#define STACK 16 +#define ARGS 8 + +#define J 0 + STACK(%esp) +#define BOFFSET2 4 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define A 12 + STACK + ARGS(%esp) +#define LDA 16 + STACK + ARGS(%esp) +#define B 20 + STACK + ARGS(%esp) + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl A, %ebp + movl B, %edi + + movl M, %ebx + movl N, %eax + andl $-2, %eax + + imull %ebx, %eax # m * ( n & ~1) + + leal (%edi,%eax,SIZE), %eax # boffset2 = b + m * (n & ~1) + movl %eax, BOFFSET2 + + movl M, %esi +#ifdef DOUBLE + sall $4,%esi +#else + sall $3,%esi +#endif + + sarl $1, %ebx # if !(m & 1) goto L28 + movl %ebx, J + jle .L28 + ALIGN_4 + +.L39: + movl %ebp, %edx # aoffset1 = a + movl LDA, %eax + movl N, %ebx + + leal (%ebp, %eax,SIZE), %ecx # aoffset2 = a + lda + leal (%ecx, %eax,SIZE), %ebp # aoffset += 2 * lda + movl %edi, %eax # boffset1 = b_offset + addl $4 * SIZE, %edi # boffset += 4 + + sarl $2, %ebx + jle .L32 + ALIGN_4 + +.L36: + PREFETCH RPREFETCHSIZE * SIZE(%edx) + + movsd 0 * SIZE(%edx), %xmm0 + movhps 1 * SIZE(%edx), %xmm0 + movsd 0 * SIZE(%ecx), %xmm2 + movhps 1 * SIZE(%ecx), %xmm2 + + PREFETCH RPREFETCHSIZE * SIZE(%ecx) + + movsd 2 * SIZE(%edx), %xmm4 + movhps 3 * SIZE(%edx), %xmm4 + movsd 2 * SIZE(%ecx), %xmm6 + movhps 3 * SIZE(%ecx), %xmm6 + + movaps %xmm0, 0 * SIZE(%eax) + movaps %xmm2, 2 * SIZE(%eax) + + addl %esi, %eax + + movaps %xmm4, 0 * SIZE(%eax) + movaps %xmm6, 2 * SIZE(%eax) + + addl $4 * SIZE, %ecx + addl $4 * SIZE, %edx + addl %esi, %eax + decl %ebx + jne .L36 + ALIGN_4 + +.L32: + movl N, %ebx + test $2, %ebx + je .L37 + + PREFETCH RPREFETCHSIZE * SIZE(%edx) + movsd 0 * SIZE(%edx), %xmm0 + movhps 1 * SIZE(%edx), %xmm0 + + PREFETCH RPREFETCHSIZE * SIZE(%ecx) + movsd 0 * SIZE(%ecx), %xmm2 + movhps 1 * SIZE(%ecx), %xmm2 + + movaps %xmm0, 0 * SIZE(%eax) + movaps %xmm2, 2 * SIZE(%eax) + + addl $2 * SIZE, %ecx + addl $2 * SIZE, %edx + ALIGN_4 + +.L37: + movl N, %ebx + test $1, %ebx + je .L38 + + movl BOFFSET2, %eax + + movsd 0 * SIZE(%edx), %xmm0 + movhps 0 * SIZE(%ecx), %xmm0 + movaps %xmm0, 0 * SIZE(%eax) + + addl $2 * SIZE, %eax + movl %eax, BOFFSET2 + ALIGN_4 + +.L38: + decl J + jg .L39 + ALIGN_4 + +.L28: + movl M, %eax + movl N, %ebx + + testb $1, %al + je .L40 + + sarl $2, %ebx + jle .L41 + ALIGN_4 + +.L45: + movsd 0 * SIZE(%ebp), %xmm0 + movhps 1 * SIZE(%ebp), %xmm0 + movsd 2 * SIZE(%ebp), %xmm2 + movhps 3 * SIZE(%ebp), %xmm2 + + movaps %xmm0, 0 * SIZE(%edi) + + addl %esi, %edi + + movaps %xmm2, 0 * SIZE(%edi) + + addl %esi,%edi + addl $4 * SIZE, %ebp + decl %ebx + jg .L45 + ALIGN_4 + +.L41: + movl N, %ebx + test $2, %ebx + je .L46 + + movsd 0 * SIZE(%ebp), %xmm0 + movhps 1 * SIZE(%ebp), %xmm0 + movaps %xmm0, 0 * SIZE(%edi) + addl $2 * SIZE, %ebp + ALIGN_4 + +.L46: + movl N, %ebx + test $1, %ebx + je .L40 + + movl BOFFSET2, %eax + + movsd 0 * SIZE(%ebp), %xmm0 + movsd %xmm0, 0 * SIZE(%eax) + ALIGN_4 + +.L40: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS,%esp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_tcopy_4_sse.S b/kernel/x86/gemm_tcopy_4_sse.S new file mode 100644 index 0000000..4e1e2e6 --- /dev/null +++ b/kernel/x86/gemm_tcopy_4_sse.S @@ -0,0 +1,305 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RPREFETCHSIZE 8 +#define WPREFETCHSIZE (RPREFETCHSIZE * 4) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht2 + +#define STACK 16 +#define ARGS 0 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define ARG_A 12 + STACK + ARGS(%esp) +#define ARG_LDA 16 + STACK + ARGS(%esp) +#define ARG_B 20 + STACK + ARGS(%esp) + +#define A %eax +#define B %ebx +#define LDA %ebp +#define A1 %ecx +#define A2 %edx +#define I %esi +#define J %edi + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_A, A + movl ARG_B, B + movl ARG_LDA, LDA + + sall $BASE_SHIFT, LDA + + movl N, J + sarl $2, J + je .L20 + ALIGN_3 + +.L10: + movl A, A1 + leal (A, LDA, 2), A2 + addl $4 * SIZE, A + + movl M, I + sarl $2, I + je .L15 + ALIGN_3 + +.L12: + PREFETCH RPREFETCHSIZE * SIZE(A1) + + movsd 0 * SIZE(A1) , %xmm0 + movhps 1 * SIZE(A1) , %xmm0 + movsd 2 * SIZE(A1) , %xmm1 + movhps 3 * SIZE(A1) , %xmm1 + + PREFETCH RPREFETCHSIZE * SIZE(A1, LDA) + + movsd 0 * SIZE(A1, LDA), %xmm2 + movhps 1 * SIZE(A1, LDA), %xmm2 + movsd 2 * SIZE(A1, LDA), %xmm3 + movhps 3 * SIZE(A1, LDA), %xmm3 + + PREFETCH RPREFETCHSIZE * SIZE(A2) + + movsd 0 * SIZE(A2) , %xmm4 + movhps 1 * SIZE(A2) , %xmm4 + movsd 2 * SIZE(A2) , %xmm5 + movhps 3 * SIZE(A2) , %xmm5 + + PREFETCH RPREFETCHSIZE * SIZE(A2, LDA) + + movsd 0 * SIZE(A2, LDA), %xmm6 + movhps 1 * SIZE(A2, LDA), %xmm6 + movsd 2 * SIZE(A2, LDA), %xmm7 + movhps 3 * SIZE(A2, LDA), %xmm7 + + PREFETCHW (RPREFETCHSIZE + 0) * SIZE(B) + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + movaps %xmm2, 4 * SIZE(B) + movaps %xmm3, 6 * SIZE(B) + + PREFETCHW (RPREFETCHSIZE + 8) * SIZE(B) + + movaps %xmm4, 8 * SIZE(B) + movaps %xmm5, 10 * SIZE(B) + movaps %xmm6, 12 * SIZE(B) + movaps %xmm7, 14 * SIZE(B) + + leal (A1, LDA, 4), A1 + leal (A2, LDA, 4), A2 + subl $-16 * SIZE, B + decl I + jne .L12 + ALIGN_3 + +.L15: + testl $2, M + jle .L16 + + movsd 0 * SIZE(A1) , %xmm0 + movhps 1 * SIZE(A1) , %xmm0 + movsd 2 * SIZE(A1) , %xmm1 + movhps 3 * SIZE(A1) , %xmm1 + + movsd 0 * SIZE(A1, LDA), %xmm2 + movhps 1 * SIZE(A1, LDA), %xmm2 + movsd 2 * SIZE(A1, LDA), %xmm3 + movhps 3 * SIZE(A1, LDA), %xmm3 + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + movaps %xmm2, 4 * SIZE(B) + movaps %xmm3, 6 * SIZE(B) + + leal (A1, LDA, 2), A1 + subl $-8 * SIZE, B + ALIGN_4 + +.L16: + testl $1, M + jle .L19 + + movsd 0 * SIZE(A1) , %xmm0 + movhps 1 * SIZE(A1) , %xmm0 + movsd 2 * SIZE(A1) , %xmm1 + movhps 3 * SIZE(A1) , %xmm1 + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + subl $-4 * SIZE, B + ALIGN_4 + +.L19: + decl J + jne .L10 + ALIGN_3 + +.L20: + testl $2, N + jle .L30 + + movl A, A1 + leal (A, LDA, 2), A2 + addl $2 * SIZE, A + + movl M, I + sarl $2, I + je .L25 + ALIGN_3 + +.L22: + movsd 0 * SIZE(A1) , %xmm0 + movhps 1 * SIZE(A1) , %xmm0 + movsd 0 * SIZE(A1, LDA), %xmm1 + movhps 1 * SIZE(A1, LDA), %xmm1 + + movsd 0 * SIZE(A2) , %xmm2 + movhps 1 * SIZE(A2) , %xmm2 + movsd 0 * SIZE(A2, LDA), %xmm3 + movhps 1 * SIZE(A2, LDA), %xmm3 + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + movaps %xmm2, 4 * SIZE(B) + movaps %xmm3, 6 * SIZE(B) + + leal (A1, LDA, 4), A1 + leal (A2, LDA, 4), A2 + subl $-8 * SIZE, B + decl I + jne .L22 + ALIGN_3 + +.L25: + testl $2, M + jle .L26 + + movsd 0 * SIZE(A1) , %xmm0 + movhps 1 * SIZE(A1) , %xmm0 + movsd 0 * SIZE(A1, LDA), %xmm1 + movhps 1 * SIZE(A1, LDA), %xmm1 + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + + leal (A1, LDA, 2), A1 + subl $-4 * SIZE, B + ALIGN_4 + +.L26: + testl $1, M + jle .L30 + + movsd 0 * SIZE(A1) , %xmm0 + movhps 1 * SIZE(A1) , %xmm0 + + movaps %xmm0, 0 * SIZE(B) + subl $-2 * SIZE, B + ALIGN_4 + +.L30: + testl $1, N + jle .L999 + + movl A, A1 + leal (A, LDA, 2), A2 + + movl M, I + sarl $2, I + je .L35 + ALIGN_3 + +.L32: + movsd 0 * SIZE(A1) , %xmm0 + movhps 0 * SIZE(A1, LDA), %xmm0 + movsd 0 * SIZE(A2) , %xmm1 + movhps 0 * SIZE(A2, LDA), %xmm1 + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + + leal (A1, LDA, 4), A1 + leal (A2, LDA, 4), A2 + subl $-4 * SIZE, B + decl I + jne .L32 + ALIGN_3 + +.L35: + testl $2, M + jle .L36 + + movsd 0 * SIZE(A1) , %xmm0 + movhps 0 * SIZE(A1, LDA), %xmm0 + + movaps %xmm0, 0 * SIZE(B) + + leal (A1, LDA, 2), A1 + subl $-2 * SIZE, B + ALIGN_4 + +.L36: + testl $1, M + jle .L999 + + movsd 0 * SIZE(A1) , %xmm0 + movsd %xmm0, 0 * SIZE(B) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemv_n.S b/kernel/x86/gemv_n.S new file mode 100644 index 0000000..13fd1ed --- /dev/null +++ b/kernel/x86/gemv_n.S @@ -0,0 +1,477 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef PENTIUM +#define P 32 +#endif + +#if defined(ATHLON) || defined(OPTERON) || defined(OPTERON) +#define P 32 +#endif + +#ifndef P +#define P DTB_ENTRIES +#endif + +#define STACK 16 +#define ARGS 16 + +#define PLDA_M 0 + STACK(%esp) +#define XP 4 + STACK(%esp) +#define MIN_N 8 + STACK(%esp) +#define IS 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define A 24 + STACK + ARGS(%esp) +#define LDA 28 + STACK + ARGS(%esp) +#define X 32 + STACK + ARGS(%esp) +#define INCX 36 + STACK + ARGS(%esp) +#define Y 40 + STACK + ARGS(%esp) +#define INCY 44 + STACK + ARGS(%esp) +#define BUFFER 48 + STACK + ARGS(%esp) +#else +#define A 20 + STACK + ARGS(%esp) +#define LDA 24 + STACK + ARGS(%esp) +#define X 28 + STACK + ARGS(%esp) +#define INCX 32 + STACK + ARGS(%esp) +#define Y 36 + STACK + ARGS(%esp) +#define INCY 40 + STACK + ARGS(%esp) +#define BUFFER 44 + STACK + ARGS(%esp) +#endif + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + FLD ALPHA + movl X, %edi + + movl LDA, %ebx + leal 0(,%ebx,SIZE),%ebx # EBX : lda + + movl $0, IS + movl M, %edx + movl N, %esi + + test %esi, %esi + jle .L79 # goto END + test %edx, %edx + jle .L79 # goto END + + movl INCY, %eax + leal (,%eax,SIZE),%eax + movl %eax, INCY + + movl LDA, %eax + imull $P, %eax # P * lda + subl M ,%eax # P * lda - m + leal (, %eax, SIZE), %eax + movl %eax, PLDA_M + ALIGN_2 + +.L32: + movl IS, %esi + movl $P, %edx + movl N, %eax + subl %esi,%eax # n - is + cmpl %edx, %eax +#ifdef PENTIUM + jle .L33 + movl %edx, %eax +.L33: +#else + cmovg %edx, %eax +#endif + + movl %eax, MIN_N + movl INCX, %edx + + leal (%edi, %esi, SIZE), %esi # xp = x + is + movl %esi, XP + cmpl $1, %edx + je .L34 # if incx == 1 goto L34 + + movl BUFFER, %esi + leal (, %edx, SIZE), %edx + movl %esi, XP # xp = buffer + sarl $2,%eax + jle .L35 + ALIGN_2 + +.L36: + FLD (%edi) + addl %edx,%edi # x += incx + FLD (%edi) + addl %edx,%edi # x += incx + FLD (%edi) + addl %edx,%edi # x += incx + FLD (%edi) + addl %edx,%edi # x += incx + + FST 3 * SIZE(%esi) + FST 2 * SIZE(%esi) + FST 1 * SIZE(%esi) + FST 0 * SIZE(%esi) + + addl $4 * SIZE, %esi # xp += 4 + decl %eax + jg .L36 + ALIGN_3 + +.L35: + movl MIN_N, %eax + andl $3, %eax + jle .L34 + ALIGN_2 + +.L42: + FLD (%edi) + addl %edx, %edi + FST (%esi) + addl $SIZE, %esi + decl %eax + jg .L42 + ALIGN_3 + +/* Main Routine */ +.L34: + movl Y, %ecx # c_offset + movl M, %ebp + sarl $2, %ebp # j = (m >> 2) + jle .L47 + ALIGN_2 + +.L48: + movl A, %edx # a_offset = a + fldz + addl $4 * SIZE, A # a += 4 + fldz + movl XP, %esi # b_offset = xp + fldz + movl MIN_N, %eax # i = min_n + fldz + FLD (%esi) # bt1 = b_offset + sarl $1, %eax + jle .L51 + ALIGN_2 + +#ifdef PENTIUM3 +#define PRESIZE 8 +#else +#define PRESIZE 24 +#endif + +.L80: +#ifdef PENTIUM3 + prefetcht1 PRESIZE * SIZE(%edx, %ebx, 1) + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + + prefetcht1 PRESIZE * SIZE(%esi) + faddp %st, %st(2) # ct1 += at1 + FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(3) # ct2 += at1 + FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + FLD 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) + + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += at1 + FLD 1 * SIZE(%esi) # bt1 = b_offset + + prefetcht1 PRESIZE * SIZE(%edx, %ebx, 2) + addl %ebx, %edx # a_offset += lda + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(3) # ct2 += at1 + FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + FLD 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) + + fmulp %st, %st(1) + addl %ebx, %edx + faddp %st, %st(4) # ct4 += at1 + + FLD 2 * SIZE(%esi) # bt1 = b_offset + addl $2 * SIZE, %esi # b_offset += 2 + +#else +#ifdef PENTIUM4 + prefetchnta 8 * SIZE(%esi) +#endif + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(3) # ct2 += at1 + + FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FMUL 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) + faddp %st, %st(4) # ct4 += at1 + FLD 1 * SIZE(%esi) # bt1 = b_offset + + addl %ebx, %edx # a_offset += lda + + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(3) # ct2 += at1 + + FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FMUL 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) + faddp %st, %st(4) # ct4 += at1 + FLD 2 * SIZE(%esi) # bt1 = b_offset + + addl %ebx, %edx + addl $2 * SIZE, %esi # b_offset += 2 +#endif + decl %eax + jg .L80 + +.L51: + movl MIN_N,%eax + andl $1, %eax + je .L57 + + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(3) # ct2 += at1 + + FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FMUL 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) + faddp %st, %st(4) # ct4 += at1 + fldz + ALIGN_2 + +.L57: +#ifndef C_SUN + ffreep %st(0) +#else + .byte 0xdf + .byte 0xc0 +#endif + + fxch %st(4) + fmul %st, %st(4) + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fxch %st(4) + + movl INCY, %eax + + FADD (%ecx) + FST (%ecx) + addl %eax, %ecx + + FADD (%ecx) + FST (%ecx) + addl %eax, %ecx + + FADD (%ecx) + FST (%ecx) + addl %eax, %ecx + + FADD (%ecx) + FST (%ecx) + addl %eax, %ecx + + decl %ebp # j -- + jg .L48 + ALIGN_3 + +.L47: + movl M, %ebp + andl $3, %ebp # j = (m & 3) + jle .L60 + ALIGN_2 + +.L61: + + movl A, %edx # a_offset = a + fldz + addl $SIZE, A # a++ + fldz + movl XP,%esi + fldz + movl MIN_N,%eax + fldz + sarl $3,%eax + jle .L64 + ALIGN_2 + +.L65: + FLD 0 * SIZE(%esi) + FMUL (%edx) + faddp %st, %st(1) + addl %ebx, %edx + + FLD 1 * SIZE(%esi) + FMUL (%edx) + faddp %st, %st(2) + addl %ebx ,%edx + + FLD 2 * SIZE(%esi) + FMUL (%edx) + faddp %st, %st(3) + addl %ebx, %edx + + FLD 3 * SIZE(%esi) + FMUL (%edx) + faddp %st, %st(4) + addl %ebx, %edx + + FLD 4 * SIZE(%esi) + FMUL (%edx) + faddp %st,%st(1) + addl %ebx, %edx + + FLD 5 * SIZE(%esi) + FMUL (%edx) + faddp %st, %st(2) + addl %ebx, %edx + + FLD 6 * SIZE(%esi) + FMUL (%edx) + faddp %st,%st(3) + addl %ebx, %edx + + FLD 7 * SIZE(%esi) + FMUL (%edx) + faddp %st,%st(4) + addl %ebx, %edx + + addl $8 * SIZE, %esi + decl %eax + jg .L65 + +.L64: + movl MIN_N,%eax + andl $7, %eax + jle .L70 + ALIGN_2 + +.L71: + FLD (%esi) + addl $SIZE, %esi # b_offset ++ + FMUL (%edx) + addl %ebx, %edx # a_offset += lda + faddp %st, %st(1) + decl %eax + jg .L71 + ALIGN_2 + +.L70: + faddp %st, %st(1) + faddp %st, %st(1) + faddp %st, %st(1) + + fmul %st(1), %st + movl INCY, %eax + FADD (%ecx) + FST (%ecx) + addl %eax, %ecx + decl %ebp + jg .L61 + +.L60: + movl PLDA_M, %esi + addl %esi, A # a += P * lda - m + addl $P, IS + movl N, %esi + cmpl %esi,IS + jl .L32 + +.L79: +#ifndef C_SUN + ffreep %st(0) +#else + .byte 0xdf + .byte 0xc0 +#endif + + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/gemv_n_atom.S b/kernel/x86/gemv_n_atom.S new file mode 100644 index 0000000..e88409c --- /dev/null +++ b/kernel/x86/gemv_n_atom.S @@ -0,0 +1,774 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ATOM +#define PREFETCH prefetchnta +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 6) +#endif + +#define STACKSIZE 16 + +#define M 4 + STACKSIZE(%esp) +#define N 8 + STACKSIZE(%esp) +#define ALPHA 16 + STACKSIZE(%esp) +#define A 24 + STACKSIZE(%esp) +#define STACK_LDA 28 + STACKSIZE(%esp) +#define STACK_X 32 + STACKSIZE(%esp) +#define STACK_INCX 36 + STACKSIZE(%esp) +#define Y 40 + STACKSIZE(%esp) +#define STACK_INCY 44 + STACKSIZE(%esp) +#define BUFFER 48 + STACKSIZE(%esp) + +#define I %eax +#define J %ebx + +#define INCX %ecx +#define INCY J + +#define A1 %esi +#define X %edx +#define Y1 %edi +#define LDA %ebp + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_LDA, LDA + movl STACK_X, X + movl STACK_INCX, INCX + + leal (,INCX, SIZE), INCX + leal (,LDA, SIZE), LDA + + subl $-16 * SIZE, A + + cmpl $0, N + jle .L999 + cmpl $0, M + jle .L999 + + movl BUFFER, Y1 + + pxor %xmm7, %xmm7 + + movl M, %eax + addl $16, %eax + sarl $4, %eax + ALIGN_3 + +.L01: + movapd %xmm7, 0 * SIZE(Y1) + movapd %xmm7, 2 * SIZE(Y1) + movapd %xmm7, 4 * SIZE(Y1) + movapd %xmm7, 6 * SIZE(Y1) + movapd %xmm7, 8 * SIZE(Y1) + movapd %xmm7, 10 * SIZE(Y1) + movapd %xmm7, 12 * SIZE(Y1) + movapd %xmm7, 14 * SIZE(Y1) + subl $-16 * SIZE, Y1 + decl %eax + jg .L01 + ALIGN_3 + +.L10: + movl N, J + sarl $1, J + jle .L20 + ALIGN_3 + +.L11: + movl BUFFER, Y1 + addl $16 * SIZE, Y1 + + movl A, A1 + leal (A1, LDA, 2), %eax + movl %eax, A + + movsd (X), %xmm6 + addl INCX, X + movsd (X), %xmm7 + addl INCX, X + + movsd ALPHA, %xmm0 + + mulsd %xmm0, %xmm6 + mulsd %xmm0, %xmm7 + + movsd -16 * SIZE(Y1), %xmm0 + movsd -15 * SIZE(Y1), %xmm1 + + movl M, I + sarl $3, I + jle .L15 + + movsd -16 * SIZE(A1), %xmm2 + movsd -15 * SIZE(A1), %xmm3 + movsd -16 * SIZE(A1, LDA), %xmm4 + movsd -15 * SIZE(A1, LDA), %xmm5 + + mulsd %xmm6, %xmm2 + mulsd %xmm6, %xmm3 + + decl I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) +#endif + + mulsd %xmm7, %xmm4 + addsd %xmm2, %xmm0 + movsd -14 * SIZE(A1), %xmm2 + mulsd %xmm7, %xmm5 + addsd %xmm3, %xmm1 + movsd -13 * SIZE(A1), %xmm3 + + addsd %xmm4, %xmm0 + movsd -14 * SIZE(A1, LDA), %xmm4 + mulsd %xmm6, %xmm2 + addsd %xmm5, %xmm1 + movsd -13 * SIZE(A1, LDA), %xmm5 + mulsd %xmm6, %xmm3 + + movlpd %xmm0, -16 * SIZE(Y1) + movsd -14 * SIZE(Y1), %xmm0 + movlpd %xmm1, -15 * SIZE(Y1) + movsd -13 * SIZE(Y1), %xmm1 + + mulsd %xmm7, %xmm4 + addsd %xmm2, %xmm0 + movsd -12 * SIZE(A1), %xmm2 + mulsd %xmm7, %xmm5 + addsd %xmm3, %xmm1 + movsd -11 * SIZE(A1), %xmm3 + + addsd %xmm4, %xmm0 + movsd -12 * SIZE(A1, LDA), %xmm4 + mulsd %xmm6, %xmm2 + addsd %xmm5, %xmm1 + movsd -11 * SIZE(A1, LDA), %xmm5 + mulsd %xmm6, %xmm3 + + movlpd %xmm0, -14 * SIZE(Y1) + movsd -12 * SIZE(Y1), %xmm0 + movlpd %xmm1, -13 * SIZE(Y1) + movsd -11 * SIZE(Y1), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1, LDA) +#endif + + mulsd %xmm7, %xmm4 + addsd %xmm2, %xmm0 + movsd -10 * SIZE(A1), %xmm2 + mulsd %xmm7, %xmm5 + addsd %xmm3, %xmm1 + movsd -9 * SIZE(A1), %xmm3 + + addsd %xmm4, %xmm0 + movsd -10 * SIZE(A1, LDA), %xmm4 + mulsd %xmm6, %xmm2 + addsd %xmm5, %xmm1 + movsd -9 * SIZE(A1, LDA), %xmm5 + mulsd %xmm6, %xmm3 + + movlpd %xmm0, -12 * SIZE(Y1) + movsd -10 * SIZE(Y1), %xmm0 + movlpd %xmm1, -11 * SIZE(Y1) + movsd -9 * SIZE(Y1), %xmm1 + + mulsd %xmm7, %xmm4 + addsd %xmm2, %xmm0 + movsd -8 * SIZE(A1), %xmm2 + mulsd %xmm7, %xmm5 + addsd %xmm3, %xmm1 + movsd -7 * SIZE(A1), %xmm3 + + addsd %xmm4, %xmm0 + movsd -8 * SIZE(A1, LDA), %xmm4 + mulsd %xmm6, %xmm2 + addsd %xmm5, %xmm1 + movsd -7 * SIZE(A1, LDA), %xmm5 + mulsd %xmm6, %xmm3 + + movlpd %xmm0, -10 * SIZE(Y1) + movsd -8 * SIZE(Y1), %xmm0 + movlpd %xmm1, -9 * SIZE(Y1) + movsd -7 * SIZE(Y1), %xmm1 + + subl $-8 * SIZE, A1 + subl $-8 * SIZE, Y1 + + subl $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + mulsd %xmm7, %xmm4 + addsd %xmm2, %xmm0 + movsd -14 * SIZE(A1), %xmm2 + mulsd %xmm7, %xmm5 + addsd %xmm3, %xmm1 + movsd -13 * SIZE(A1), %xmm3 + + addsd %xmm4, %xmm0 + movsd -14 * SIZE(A1, LDA), %xmm4 + mulsd %xmm6, %xmm2 + addsd %xmm5, %xmm1 + movsd -13 * SIZE(A1, LDA), %xmm5 + mulsd %xmm6, %xmm3 + + movlpd %xmm0, -16 * SIZE(Y1) + movsd -14 * SIZE(Y1), %xmm0 + movlpd %xmm1, -15 * SIZE(Y1) + movsd -13 * SIZE(Y1), %xmm1 + + mulsd %xmm7, %xmm4 + addsd %xmm2, %xmm0 + movsd -12 * SIZE(A1), %xmm2 + mulsd %xmm7, %xmm5 + addsd %xmm3, %xmm1 + movsd -11 * SIZE(A1), %xmm3 + + addsd %xmm4, %xmm0 + movsd -12 * SIZE(A1, LDA), %xmm4 + mulsd %xmm6, %xmm2 + addsd %xmm5, %xmm1 + movsd -11 * SIZE(A1, LDA), %xmm5 + mulsd %xmm6, %xmm3 + + movlpd %xmm0, -14 * SIZE(Y1) + movsd -12 * SIZE(Y1), %xmm0 + movlpd %xmm1, -13 * SIZE(Y1) + movsd -11 * SIZE(Y1), %xmm1 + + mulsd %xmm7, %xmm4 + addsd %xmm2, %xmm0 + movsd -10 * SIZE(A1), %xmm2 + mulsd %xmm7, %xmm5 + addsd %xmm3, %xmm1 + movsd -9 * SIZE(A1), %xmm3 + + addsd %xmm4, %xmm0 + movsd -10 * SIZE(A1, LDA), %xmm4 + mulsd %xmm6, %xmm2 + addsd %xmm5, %xmm1 + movsd -9 * SIZE(A1, LDA), %xmm5 + mulsd %xmm6, %xmm3 + + movlpd %xmm0, -12 * SIZE(Y1) + movsd -10 * SIZE(Y1), %xmm0 + movlpd %xmm1, -11 * SIZE(Y1) + movsd -9 * SIZE(Y1), %xmm1 + + mulsd %xmm7, %xmm4 + addsd %xmm2, %xmm0 + mulsd %xmm7, %xmm5 + addsd %xmm3, %xmm1 + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + + movlpd %xmm0, -10 * SIZE(Y1) + movsd -8 * SIZE(Y1), %xmm0 + movlpd %xmm1, -9 * SIZE(Y1) + movsd -7 * SIZE(Y1), %xmm1 + + subl $-8 * SIZE, A1 + subl $-8 * SIZE, Y1 + ALIGN_3 + +.L15: + testl $4, M + je .L16 + + movsd -16 * SIZE(A1), %xmm2 + movsd -15 * SIZE(A1), %xmm3 + movsd -16 * SIZE(A1, LDA), %xmm4 + movsd -15 * SIZE(A1, LDA), %xmm5 + + mulsd %xmm6, %xmm2 + mulsd %xmm6, %xmm3 + + mulsd %xmm7, %xmm4 + addsd %xmm2, %xmm0 + movsd -14 * SIZE(A1), %xmm2 + mulsd %xmm7, %xmm5 + addsd %xmm3, %xmm1 + movsd -13 * SIZE(A1), %xmm3 + + addsd %xmm4, %xmm0 + movsd -14 * SIZE(A1, LDA), %xmm4 + mulsd %xmm6, %xmm2 + addsd %xmm5, %xmm1 + movsd -13 * SIZE(A1, LDA), %xmm5 + mulsd %xmm6, %xmm3 + + movlpd %xmm0, -16 * SIZE(Y1) + movsd -14 * SIZE(Y1), %xmm0 + movlpd %xmm1, -15 * SIZE(Y1) + movsd -13 * SIZE(Y1), %xmm1 + + mulsd %xmm7, %xmm4 + addsd %xmm2, %xmm0 + mulsd %xmm7, %xmm5 + addsd %xmm3, %xmm1 + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + + movlpd %xmm0, -14 * SIZE(Y1) + movsd -12 * SIZE(Y1), %xmm0 + movlpd %xmm1, -13 * SIZE(Y1) + movsd -11 * SIZE(Y1), %xmm1 + + addl $4 * SIZE, A1 + addl $4 * SIZE, Y1 + ALIGN_3 + +.L16: + testl $2, M + je .L17 + + movsd -16 * SIZE(A1), %xmm2 + movsd -15 * SIZE(A1), %xmm3 + movsd -16 * SIZE(A1, LDA), %xmm4 + movsd -15 * SIZE(A1, LDA), %xmm5 + + mulsd %xmm6, %xmm2 + mulsd %xmm6, %xmm3 + + mulsd %xmm7, %xmm4 + addsd %xmm2, %xmm0 + mulsd %xmm7, %xmm5 + addsd %xmm3, %xmm1 + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + + movlpd %xmm0, -16 * SIZE(Y1) + movsd -14 * SIZE(Y1), %xmm0 + movlpd %xmm1, -15 * SIZE(Y1) + + addl $2 * SIZE, A1 + addl $2 * SIZE, Y1 + ALIGN_3 + +.L17: + testl $1, M + je .L19 + + movsd -16 * SIZE(A1), %xmm2 + movsd -16 * SIZE(A1, LDA), %xmm3 + + movsd -16 * SIZE(Y1), %xmm0 + + mulsd %xmm6, %xmm2 + addsd %xmm2, %xmm0 + mulsd %xmm7, %xmm3 + addsd %xmm3, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +.L19: + decl J + jg .L11 + ALIGN_4 + +.L20: + testl $1, N + jle .L990 + + movl BUFFER, Y1 + addl $16 * SIZE, Y1 + + movl A, A1 + leal (A1, LDA, 2), %eax + movl %eax, A + + movsd (X), %xmm6 + addl INCX, X + movsd (X), %xmm7 + addl INCX, X + + movsd ALPHA, %xmm0 + + mulsd %xmm0, %xmm6 + mulsd %xmm0, %xmm7 + + movsd -16 * SIZE(Y1), %xmm0 + movsd -15 * SIZE(Y1), %xmm1 + movsd -14 * SIZE(Y1), %xmm4 + movsd -13 * SIZE(Y1), %xmm5 + + movl M, I + sarl $3, I + jle .L25 + + movsd -16 * SIZE(A1), %xmm2 + movsd -15 * SIZE(A1), %xmm3 + + mulsd %xmm6, %xmm2 + mulsd %xmm6, %xmm3 + + decl I + jle .L24 + ALIGN_3 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) +#endif + + addsd %xmm2, %xmm0 + movsd -14 * SIZE(A1), %xmm2 + addsd %xmm3, %xmm1 + movsd -13 * SIZE(A1), %xmm3 + + mulsd %xmm6, %xmm2 + movlpd %xmm0, -16 * SIZE(Y1) + movsd -12 * SIZE(Y1), %xmm0 + mulsd %xmm6, %xmm3 + movlpd %xmm1, -15 * SIZE(Y1) + movsd -11 * SIZE(Y1), %xmm1 + + addsd %xmm2, %xmm4 + movsd -12 * SIZE(A1), %xmm2 + addsd %xmm3, %xmm5 + movsd -11 * SIZE(A1), %xmm3 + + mulsd %xmm6, %xmm2 + movlpd %xmm4, -14 * SIZE(Y1) + movsd -10 * SIZE(Y1), %xmm4 + mulsd %xmm6, %xmm3 + movlpd %xmm5, -13 * SIZE(Y1) + movsd -9 * SIZE(Y1), %xmm5 + + addsd %xmm2, %xmm0 + movsd -10 * SIZE(A1), %xmm2 + addsd %xmm3, %xmm1 + movsd -9 * SIZE(A1), %xmm3 + + mulsd %xmm6, %xmm2 + movlpd %xmm0, -12 * SIZE(Y1) + movsd -8 * SIZE(Y1), %xmm0 + mulsd %xmm6, %xmm3 + movlpd %xmm1, -11 * SIZE(Y1) + movsd -7 * SIZE(Y1), %xmm1 + + addsd %xmm2, %xmm4 + movsd -8 * SIZE(A1), %xmm2 + addsd %xmm3, %xmm5 + movsd -7 * SIZE(A1), %xmm3 + + mulsd %xmm6, %xmm2 + movlpd %xmm4, -10 * SIZE(Y1) + movsd -6 * SIZE(Y1), %xmm4 + mulsd %xmm6, %xmm3 + movlpd %xmm5, -9 * SIZE(Y1) + movsd -5 * SIZE(Y1), %xmm5 + + subl $-8 * SIZE, A1 + subl $-8 * SIZE, Y1 + + subl $1, I + BRANCH + jg .L23 + ALIGN_3 + +.L24: + addsd %xmm2, %xmm0 + movsd -14 * SIZE(A1), %xmm2 + addsd %xmm3, %xmm1 + movsd -13 * SIZE(A1), %xmm3 + + mulsd %xmm6, %xmm2 + movlpd %xmm0, -16 * SIZE(Y1) + movsd -12 * SIZE(Y1), %xmm0 + mulsd %xmm6, %xmm3 + movlpd %xmm1, -15 * SIZE(Y1) + movsd -11 * SIZE(Y1), %xmm1 + + addsd %xmm2, %xmm4 + movsd -12 * SIZE(A1), %xmm2 + addsd %xmm3, %xmm5 + movsd -11 * SIZE(A1), %xmm3 + + mulsd %xmm6, %xmm2 + movlpd %xmm4, -14 * SIZE(Y1) + movsd -10 * SIZE(Y1), %xmm4 + mulsd %xmm6, %xmm3 + movlpd %xmm5, -13 * SIZE(Y1) + movsd -9 * SIZE(Y1), %xmm5 + + addsd %xmm2, %xmm0 + movsd -10 * SIZE(A1), %xmm2 + addsd %xmm3, %xmm1 + movsd -9 * SIZE(A1), %xmm3 + + mulsd %xmm6, %xmm2 + movlpd %xmm0, -12 * SIZE(Y1) + mulsd %xmm6, %xmm3 + movlpd %xmm1, -11 * SIZE(Y1) + + addsd %xmm2, %xmm4 + movsd -8 * SIZE(Y1), %xmm0 + addsd %xmm3, %xmm5 + movsd -7 * SIZE(Y1), %xmm1 + + movlpd %xmm4, -10 * SIZE(Y1) + movsd -6 * SIZE(Y1), %xmm4 + movlpd %xmm5, -9 * SIZE(Y1) + movsd -5 * SIZE(Y1), %xmm5 + + subl $-8 * SIZE, A1 + subl $-8 * SIZE, Y1 + ALIGN_3 + +.L25: + testl $4, M + je .L26 + + movsd -16 * SIZE(A1), %xmm2 + movsd -15 * SIZE(A1), %xmm3 + mulsd %xmm6, %xmm2 + mulsd %xmm6, %xmm3 + + addsd %xmm2, %xmm0 + movsd -14 * SIZE(A1), %xmm2 + addsd %xmm3, %xmm1 + movsd -13 * SIZE(A1), %xmm3 + + mulsd %xmm6, %xmm2 + movlpd %xmm0, -16 * SIZE(Y1) + movsd -12 * SIZE(Y1), %xmm0 + mulsd %xmm6, %xmm3 + movlpd %xmm1, -15 * SIZE(Y1) + movsd -11 * SIZE(Y1), %xmm1 + + addsd %xmm2, %xmm4 + addsd %xmm3, %xmm5 + + movlpd %xmm4, -14 * SIZE(Y1) + movlpd %xmm5, -13 * SIZE(Y1) + + addl $4 * SIZE, A1 + addl $4 * SIZE, Y1 + ALIGN_3 + +.L26: + testl $2, M + je .L27 + + movsd -16 * SIZE(A1), %xmm2 + movsd -15 * SIZE(A1), %xmm3 + + mulsd %xmm6, %xmm2 + mulsd %xmm6, %xmm3 + addsd %xmm2, %xmm0 + addsd %xmm3, %xmm1 + + movlpd %xmm0, -16 * SIZE(Y1) + movsd -14 * SIZE(Y1), %xmm0 + movlpd %xmm1, -15 * SIZE(Y1) + + addl $2 * SIZE, A1 + addl $2 * SIZE, Y1 + ALIGN_3 + +.L27: + testl $1, M + je .L990 + + movsd -16 * SIZE(A1), %xmm2 + movsd -16 * SIZE(Y1), %xmm0 + + mulsd %xmm6, %xmm2 + addsd %xmm2, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +.L990: + movl Y, Y1 + movl BUFFER, X + movl Y1, A1 + + movl STACK_INCY, INCY + sall $BASE_SHIFT, INCY + + movl M, %eax + sarl $3, %eax + jle .L994 + ALIGN_3 + +.L992: + movsd (Y1), %xmm0 + addl INCY, Y1 + movsd (Y1), %xmm1 + addl INCY, Y1 + movsd (Y1), %xmm2 + addl INCY, Y1 + movsd (Y1), %xmm3 + addl INCY, Y1 + movsd (Y1), %xmm4 + addl INCY, Y1 + movsd (Y1), %xmm5 + addl INCY, Y1 + movsd (Y1), %xmm6 + addl INCY, Y1 + movsd (Y1), %xmm7 + addl INCY, Y1 + + addsd 0 * SIZE(X), %xmm0 + addsd 1 * SIZE(X), %xmm1 + addsd 2 * SIZE(X), %xmm2 + addsd 3 * SIZE(X), %xmm3 + addsd 4 * SIZE(X), %xmm4 + addsd 5 * SIZE(X), %xmm5 + addsd 6 * SIZE(X), %xmm6 + addsd 7 * SIZE(X), %xmm7 + + movlpd %xmm0, (A1) + addl INCY, A1 + movlpd %xmm1, (A1) + addl INCY, A1 + movlpd %xmm2, (A1) + addl INCY, A1 + movlpd %xmm3, (A1) + addl INCY, A1 + movlpd %xmm4, (A1) + addl INCY, A1 + movlpd %xmm5, (A1) + addl INCY, A1 + movlpd %xmm6, (A1) + addl INCY, A1 + movlpd %xmm7, (A1) + addl INCY, A1 + + addl $8 * SIZE, X + decl %eax + jg .L992 + ALIGN_3 + +.L994: + testl $7, M + jle .L999 + + testl $4, M + jle .L995 + + movsd (Y1), %xmm0 + addl INCY, Y1 + movsd (Y1), %xmm1 + addl INCY, Y1 + movsd (Y1), %xmm2 + addl INCY, Y1 + movsd (Y1), %xmm3 + addl INCY, Y1 + + addsd 0 * SIZE(X), %xmm0 + addsd 1 * SIZE(X), %xmm1 + addsd 2 * SIZE(X), %xmm2 + addsd 3 * SIZE(X), %xmm3 + + movlpd %xmm0, (A1) + addl INCY, A1 + movlpd %xmm1, (A1) + addl INCY, A1 + movlpd %xmm2, (A1) + addl INCY, A1 + movlpd %xmm3, (A1) + addl INCY, A1 + + addl $4 * SIZE, X + ALIGN_3 + +.L995: + testl $2, M + jle .L996 + + movsd (Y1), %xmm0 + addl INCY, Y1 + movsd (Y1), %xmm1 + addl INCY, Y1 + + addsd 0 * SIZE(X), %xmm0 + addsd 1 * SIZE(X), %xmm1 + + movlpd %xmm0, (A1) + addl INCY, A1 + movlpd %xmm1, (A1) + addl INCY, A1 + + addl $2 * SIZE, X + ALIGN_3 + +.L996: + testl $1, M + jle .L999 + + movsd (Y1), %xmm0 + + addsd 0 * SIZE(X), %xmm0 + + movlpd %xmm0, (A1) + ALIGN_3 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemv_n_sse.S b/kernel/x86/gemv_n_sse.S new file mode 100644 index 0000000..aae49a2 --- /dev/null +++ b/kernel/x86/gemv_n_sse.S @@ -0,0 +1,662 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef movsd +#undef movsd +#endif + +#ifdef PENTIUM3 +#ifdef HAVE_SSE +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 2) +#endif +#define movsd movlps +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 4) +#endif + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 7) +#endif + +#ifdef OPTERON +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 3) +#define movsd movlps +#endif + +#ifdef BARCELONA +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 5) +#endif + +#ifdef ATOM +#define PREFETCH prefetchnta +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 6) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (16 * 4) +#endif + +#define STACKSIZE 16 + +#define M 4 + STACKSIZE(%esp) +#define N 8 + STACKSIZE(%esp) +#define ALPHA 16 + STACKSIZE(%esp) +#define A 20 + STACKSIZE(%esp) +#define STACK_LDA 24 + STACKSIZE(%esp) +#define STACK_X 28 + STACKSIZE(%esp) +#define STACK_INCX 32 + STACKSIZE(%esp) +#define Y 36 + STACKSIZE(%esp) +#define STACK_INCY 40 + STACKSIZE(%esp) +#define BUFFER 44 + STACKSIZE(%esp) + +#define I %eax +#define J %ebx + +#define INCX %ecx +#define INCY J + +#define A1 %esi +#define X %edx +#define Y1 %edi +#define LDA %ebp + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_LDA, LDA + movl STACK_X, X + movl STACK_INCX, INCX + + leal (,INCX, SIZE), INCX + leal (,LDA, SIZE), LDA + + subl $-32 * SIZE, A + + cmpl $0, N + jle .L999 + cmpl $0, M + jle .L999 + + movl BUFFER, Y1 + + xorps %xmm7, %xmm7 + + movl M, %eax + addl $16, %eax + sarl $4, %eax + ALIGN_3 + +.L01: + movaps %xmm7, 0 * SIZE(Y1) + movaps %xmm7, 4 * SIZE(Y1) + movaps %xmm7, 8 * SIZE(Y1) + movaps %xmm7, 12 * SIZE(Y1) + subl $-16 * SIZE, Y1 + decl %eax + jg .L01 + ALIGN_3 + +.L10: + movl N, J + sarl $1, J + jle .L20 + ALIGN_3 + +.L11: + movl BUFFER, Y1 + addl $32 * SIZE, Y1 + + movl A, A1 + leal (A1, LDA, 2), %eax + movl %eax, A + + movss (X), %xmm6 + addl INCX, X + movss (X), %xmm7 + addl INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm6 + mulss %xmm0, %xmm7 + + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + ALIGN_3 + + movl M, I + sarl $4, I + jle .L15 + + movsd -32 * SIZE(A1), %xmm2 + movhps -30 * SIZE(A1), %xmm2 + movsd -28 * SIZE(A1), %xmm3 + movhps -26 * SIZE(A1), %xmm3 + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + + movsd -32 * SIZE(A1, LDA), %xmm4 + movhps -30 * SIZE(A1, LDA), %xmm4 + movsd -28 * SIZE(A1, LDA), %xmm5 + movhps -26 * SIZE(A1, LDA), %xmm5 + + decl I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) +#endif + + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + movsd -24 * SIZE(A1), %xmm2 + movhps -22 * SIZE(A1), %xmm2 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + movsd -20 * SIZE(A1), %xmm3 + movhps -18 * SIZE(A1), %xmm3 + + mulps %xmm7, %xmm4 + addps %xmm4, %xmm0 + movsd -24 * SIZE(A1, LDA), %xmm4 + movhps -22 * SIZE(A1, LDA), %xmm4 + + movaps %xmm0, -32 * SIZE(Y1) + movaps -24 * SIZE(Y1), %xmm0 + + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movsd -20 * SIZE(A1, LDA), %xmm5 + movhps -18 * SIZE(A1, LDA), %xmm5 + + movaps %xmm1, -28 * SIZE(Y1) + movaps -20 * SIZE(Y1), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1, LDA) +#endif + + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + movsd -16 * SIZE(A1), %xmm2 + movhps -14 * SIZE(A1), %xmm2 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + movsd -12 * SIZE(A1), %xmm3 + movhps -10 * SIZE(A1), %xmm3 + + mulps %xmm7, %xmm4 + addps %xmm4, %xmm0 + movsd -16 * SIZE(A1, LDA), %xmm4 + movhps -14 * SIZE(A1, LDA), %xmm4 + + movaps %xmm0, -24 * SIZE(Y1) + movaps -16 * SIZE(Y1), %xmm0 + + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movsd -12 * SIZE(A1, LDA), %xmm5 + movhps -10 * SIZE(A1, LDA), %xmm5 + + movaps %xmm1, -20 * SIZE(Y1) + movaps -12 * SIZE(Y1), %xmm1 + + subl $-16 * SIZE, A1 + subl $-16 * SIZE, Y1 + + subl $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + movsd -24 * SIZE(A1), %xmm2 + movhps -22 * SIZE(A1), %xmm2 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + movsd -20 * SIZE(A1), %xmm3 + movhps -18 * SIZE(A1), %xmm3 + + mulps %xmm7, %xmm4 + addps %xmm4, %xmm0 + movsd -24 * SIZE(A1, LDA), %xmm4 + movhps -22 * SIZE(A1, LDA), %xmm4 + + movaps %xmm0, -32 * SIZE(Y1) + movaps -24 * SIZE(Y1), %xmm0 + + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movsd -20 * SIZE(A1, LDA), %xmm5 + movhps -18 * SIZE(A1, LDA), %xmm5 + + movaps %xmm1, -28 * SIZE(Y1) + movaps -20 * SIZE(Y1), %xmm1 + + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + + mulps %xmm7, %xmm4 + addps %xmm4, %xmm0 + movaps %xmm0, -24 * SIZE(Y1) + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movaps %xmm1, -20 * SIZE(Y1) + + subl $-16 * SIZE, A1 + subl $-16 * SIZE, Y1 + ALIGN_3 + +.L15: + testl $8, M + je .L16 + + movsd -32 * SIZE(A1), %xmm2 + movhps -30 * SIZE(A1), %xmm2 + movsd -28 * SIZE(A1), %xmm3 + movhps -26 * SIZE(A1), %xmm3 + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + + movsd -32 * SIZE(A1, LDA), %xmm4 + movhps -30 * SIZE(A1, LDA), %xmm4 + movsd -28 * SIZE(A1, LDA), %xmm5 + movhps -26 * SIZE(A1, LDA), %xmm5 + + mulps %xmm7, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + + addl $8 * SIZE, A1 + addl $8 * SIZE, Y1 + ALIGN_3 + +.L16: + testl $4, M + je .L17 + + movsd -32 * SIZE(A1), %xmm2 + movhps -30 * SIZE(A1), %xmm2 + movsd -32 * SIZE(A1, LDA), %xmm3 + movhps -30 * SIZE(A1, LDA), %xmm3 + + movaps -32 * SIZE(Y1), %xmm0 + + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm0 + + movaps %xmm0, -32 * SIZE(Y1) + + addl $4 * SIZE, A1 + addl $4 * SIZE, Y1 + ALIGN_3 + +.L17: + testl $2, M + je .L18 + + movsd -32 * SIZE(A1), %xmm2 + movsd -32 * SIZE(A1, LDA), %xmm3 + + movsd -32 * SIZE(Y1), %xmm0 + + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addl $2 * SIZE, A1 + addl $2 * SIZE, Y1 + ALIGN_3 + +.L18: + testl $1, M + je .L19 + + movss -32 * SIZE(A1), %xmm2 + movss -32 * SIZE(A1, LDA), %xmm3 + + movss -32 * SIZE(Y1), %xmm0 + + mulss %xmm6, %xmm2 + addss %xmm2, %xmm0 + mulss %xmm7, %xmm3 + addss %xmm3, %xmm0 + + movss %xmm0, -32 * SIZE(Y1) + ALIGN_3 + +.L19: + decl J + jg .L11 + ALIGN_4 + +.L20: + testl $1, N + jle .L990 + + movl BUFFER, Y1 + addl $32 * SIZE, Y1 + + movl A, A1 + + movss (X), %xmm6 + addl INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm6 + + shufps $0, %xmm6, %xmm6 + ALIGN_3 + + movl M, I + sarl $4, I + jle .L25 + + movsd -32 * SIZE(A1), %xmm2 + movhps -30 * SIZE(A1), %xmm2 + movsd -28 * SIZE(A1), %xmm3 + movhps -26 * SIZE(A1), %xmm3 + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + + decl I + jle .L24 + ALIGN_3 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) +#endif + + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + movsd -24 * SIZE(A1), %xmm2 + movhps -22 * SIZE(A1), %xmm2 + + movaps %xmm0, -32 * SIZE(Y1) + movaps -24 * SIZE(Y1), %xmm0 + + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + movsd -20 * SIZE(A1), %xmm3 + movhps -18 * SIZE(A1), %xmm3 + + movaps %xmm1, -28 * SIZE(Y1) + movaps -20 * SIZE(Y1), %xmm1 + + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + movsd -16 * SIZE(A1), %xmm2 + movhps -14 * SIZE(A1), %xmm2 + + movaps %xmm0, -24 * SIZE(Y1) + movaps -16 * SIZE(Y1), %xmm0 + + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + movsd -12 * SIZE(A1), %xmm3 + movhps -10 * SIZE(A1), %xmm3 + + movaps %xmm1, -20 * SIZE(Y1) + movaps -12 * SIZE(Y1), %xmm1 + + subl $-16 * SIZE, A1 + subl $-16 * SIZE, Y1 + + subl $1, I + BRANCH + jg .L23 + ALIGN_3 + +.L24: + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + movsd -24 * SIZE(A1), %xmm2 + movhps -22 * SIZE(A1), %xmm2 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + movsd -20 * SIZE(A1), %xmm3 + movhps -18 * SIZE(A1), %xmm3 + + movaps %xmm0, -32 * SIZE(Y1) + movaps -24 * SIZE(Y1), %xmm0 + + movaps %xmm1, -28 * SIZE(Y1) + movaps -20 * SIZE(Y1), %xmm1 + + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y1) + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + movaps %xmm1, -20 * SIZE(Y1) + + subl $-16 * SIZE, A1 + subl $-16 * SIZE, Y1 + ALIGN_3 + +.L25: + testl $8, M + je .L26 + + movsd -32 * SIZE(A1), %xmm2 + movhps -30 * SIZE(A1), %xmm2 + movsd -28 * SIZE(A1), %xmm3 + movhps -26 * SIZE(A1), %xmm3 + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + + addl $8 * SIZE, A1 + addl $8 * SIZE, Y1 + ALIGN_3 + +.L26: + testl $4, M + je .L27 + + movsd -32 * SIZE(A1), %xmm2 + movhps -30 * SIZE(A1), %xmm2 + + movaps -32 * SIZE(Y1), %xmm0 + + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + + movaps %xmm0, -32 * SIZE(Y1) + + addl $4 * SIZE, A1 + addl $4 * SIZE, Y1 + ALIGN_3 + +.L27: + testl $2, M + je .L28 + + movsd -32 * SIZE(A1), %xmm2 + movsd -32 * SIZE(Y1), %xmm0 + + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addl $2 * SIZE, A1 + addl $2 * SIZE, Y1 + ALIGN_3 + +.L28: + testl $1, M + je .L990 + + movss -32 * SIZE(A1), %xmm2 + movss -32 * SIZE(Y1), %xmm0 + + mulss %xmm6, %xmm2 + addss %xmm2, %xmm0 + + movss %xmm0, -32 * SIZE(Y1) + ALIGN_3 + +.L990: + movl Y, Y1 + movl BUFFER, X + + movl STACK_INCY, INCY + sall $BASE_SHIFT, INCY + + movl M, %eax + sarl $2, %eax + jle .L994 + ALIGN_3 + +.L992: + movss (Y1), %xmm0 + addss 0 * SIZE(X), %xmm0 + movss %xmm0, (Y1) + addl INCY, Y1 + + movss (Y1), %xmm0 + addss 1 * SIZE(X), %xmm0 + movss %xmm0, (Y1) + addl INCY, Y1 + + movss (Y1), %xmm0 + addss 2 * SIZE(X), %xmm0 + movss %xmm0, (Y1) + addl INCY, Y1 + + movss (Y1), %xmm0 + addss 3 * SIZE(X), %xmm0 + movss %xmm0, (Y1) + addl INCY, Y1 + + addl $4 * SIZE, X + decl %eax + jg .L992 + ALIGN_3 + +.L994: + testl $2, M + jle .L996 + + movss (Y1), %xmm0 + addss 0 * SIZE(X), %xmm0 + movss %xmm0, (Y1) + addl INCY, Y1 + + movss (Y1), %xmm0 + addss 1 * SIZE(X), %xmm0 + movss %xmm0, (Y1) + addl INCY, Y1 + + addl $2 * SIZE, X + ALIGN_3 + +.L996: + testl $1, M + jle .L999 + + movss (Y1), %xmm0 + addss 0 * SIZE(X), %xmm0 + movss %xmm0, (Y1) + ALIGN_3 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemv_n_sse2.S b/kernel/x86/gemv_n_sse2.S new file mode 100644 index 0000000..669c5ac --- /dev/null +++ b/kernel/x86/gemv_n_sse2.S @@ -0,0 +1,686 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 2) +#endif + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 7) +#endif + +#ifdef OPTERON +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (8 * 3) +#define movsd movlps +#endif + +#ifdef BARCELONA +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (8 * 5) +#endif + +#ifdef ATOM +#define PREFETCH prefetch +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 6) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 4) +#endif + +#define STACKSIZE 16 + +#define M 4 + STACKSIZE(%esp) +#define N 8 + STACKSIZE(%esp) +#define ALPHA 16 + STACKSIZE(%esp) +#define A 24 + STACKSIZE(%esp) +#define STACK_LDA 28 + STACKSIZE(%esp) +#define STACK_X 32 + STACKSIZE(%esp) +#define STACK_INCX 36 + STACKSIZE(%esp) +#define Y 40 + STACKSIZE(%esp) +#define STACK_INCY 44 + STACKSIZE(%esp) +#define BUFFER 48 + STACKSIZE(%esp) + +#define I %eax +#define J %ebx + +#define INCX %ecx +#define INCY J + +#define A1 %esi +#define X %edx +#define Y1 %edi +#define LDA %ebp + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_LDA, LDA + movl STACK_X, X + movl STACK_INCX, INCX + + leal (,INCX, SIZE), INCX + leal (,LDA, SIZE), LDA + + subl $-16 * SIZE, A + + cmpl $0, N + jle .L999 + cmpl $0, M + jle .L999 + + movl BUFFER, Y1 + + pxor %xmm7, %xmm7 + + movl M, %eax + addl $16, %eax + sarl $4, %eax + ALIGN_3 + +.L01: + movapd %xmm7, 0 * SIZE(Y1) + movapd %xmm7, 2 * SIZE(Y1) + movapd %xmm7, 4 * SIZE(Y1) + movapd %xmm7, 6 * SIZE(Y1) + movapd %xmm7, 8 * SIZE(Y1) + movapd %xmm7, 10 * SIZE(Y1) + movapd %xmm7, 12 * SIZE(Y1) + movapd %xmm7, 14 * SIZE(Y1) + subl $-16 * SIZE, Y1 + decl %eax + jg .L01 + ALIGN_3 + +.L10: + movl N, J + sarl $1, J + jle .L20 + ALIGN_3 + +.L11: + + movl BUFFER, Y1 + addl $16 * SIZE, Y1 + + movl A, A1 + leal (A1, LDA, 2), %eax + movl %eax, A + +#ifdef HAVE_SSE3 + movddup (X), %xmm6 + addl INCX, X + movddup (X), %xmm7 + addl INCX, X + + movddup ALPHA, %xmm0 + + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm7 +#else + movsd (X), %xmm6 + addl INCX, X + movsd (X), %xmm7 + addl INCX, X + + movsd ALPHA, %xmm0 + + mulsd %xmm0, %xmm6 + mulsd %xmm0, %xmm7 + + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 +#endif + + ALIGN_3 + + movl M, I + sarl $3, I + jle .L15 + + movsd -16 * SIZE(A1), %xmm2 + movhpd -15 * SIZE(A1), %xmm2 + movsd -14 * SIZE(A1), %xmm3 + movhpd -13 * SIZE(A1), %xmm3 + + movapd -16 * SIZE(Y1), %xmm0 + movapd -14 * SIZE(Y1), %xmm1 + + movsd -16 * SIZE(A1, LDA), %xmm4 + movhpd -15 * SIZE(A1, LDA), %xmm4 + movsd -14 * SIZE(A1, LDA), %xmm5 + movhpd -13 * SIZE(A1, LDA), %xmm5 + + decl I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) +#endif + + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + movsd -12 * SIZE(A1), %xmm2 + movhpd -11 * SIZE(A1), %xmm2 + mulpd %xmm6, %xmm3 + addpd %xmm3, %xmm1 + movsd -10 * SIZE(A1), %xmm3 + movhpd -9 * SIZE(A1), %xmm3 + + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A1, LDA), %xmm4 + movhpd -11 * SIZE(A1, LDA), %xmm4 + + movapd %xmm0, -16 * SIZE(Y1) + movapd -12 * SIZE(Y1), %xmm0 + + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movsd -10 * SIZE(A1, LDA), %xmm5 + movhpd -9 * SIZE(A1, LDA), %xmm5 + + movapd %xmm1, -14 * SIZE(Y1) + movapd -10 * SIZE(Y1), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1, LDA) +#endif + + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + movsd -8 * SIZE(A1), %xmm2 + movhpd -7 * SIZE(A1), %xmm2 + mulpd %xmm6, %xmm3 + addpd %xmm3, %xmm1 + movsd -6 * SIZE(A1), %xmm3 + movhpd -5 * SIZE(A1), %xmm3 + + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + movsd -8 * SIZE(A1, LDA), %xmm4 + movhpd -7 * SIZE(A1, LDA), %xmm4 + + movapd %xmm0, -12 * SIZE(Y1) + movapd -8 * SIZE(Y1), %xmm0 + + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movsd -6 * SIZE(A1, LDA), %xmm5 + movhpd -5 * SIZE(A1, LDA), %xmm5 + + movapd %xmm1, -10 * SIZE(Y1) + movapd -6 * SIZE(Y1), %xmm1 + + subl $-8 * SIZE, A1 + subl $-8 * SIZE, Y1 + + subl $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + movsd -12 * SIZE(A1), %xmm2 + movhpd -11 * SIZE(A1), %xmm2 + mulpd %xmm6, %xmm3 + addpd %xmm3, %xmm1 + movsd -10 * SIZE(A1), %xmm3 + movhpd -9 * SIZE(A1), %xmm3 + + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A1, LDA), %xmm4 + movhpd -11 * SIZE(A1, LDA), %xmm4 + + movapd %xmm0, -16 * SIZE(Y1) + movapd -12 * SIZE(Y1), %xmm0 + + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movsd -10 * SIZE(A1, LDA), %xmm5 + movhpd -9 * SIZE(A1, LDA), %xmm5 + + movapd %xmm1, -14 * SIZE(Y1) + movapd -10 * SIZE(Y1), %xmm1 + + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm6, %xmm3 + addpd %xmm3, %xmm1 + + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + movapd %xmm0, -12 * SIZE(Y1) + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movapd %xmm1, -10 * SIZE(Y1) + + subl $-8 * SIZE, A1 + subl $-8 * SIZE, Y1 + ALIGN_3 + +.L15: + testl $4, M + je .L16 + + movsd -16 * SIZE(A1), %xmm2 + movhpd -15 * SIZE(A1), %xmm2 + movsd -14 * SIZE(A1), %xmm3 + movhpd -13 * SIZE(A1), %xmm3 + + movapd -16 * SIZE(Y1), %xmm0 + movapd -14 * SIZE(Y1), %xmm1 + + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm6, %xmm3 + addpd %xmm3, %xmm1 + + movsd -16 * SIZE(A1, LDA), %xmm4 + movhpd -15 * SIZE(A1, LDA), %xmm4 + movsd -14 * SIZE(A1, LDA), %xmm5 + movhpd -13 * SIZE(A1, LDA), %xmm5 + + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + + movapd %xmm0, -16 * SIZE(Y1) + movapd %xmm1, -14 * SIZE(Y1) + + addl $4 * SIZE, A1 + addl $4 * SIZE, Y1 + ALIGN_3 + +.L16: + testl $2, M + je .L17 + + movsd -16 * SIZE(A1), %xmm2 + movhpd -15 * SIZE(A1), %xmm2 + movsd -16 * SIZE(A1, LDA), %xmm3 + movhpd -15 * SIZE(A1, LDA), %xmm3 + + movapd -16 * SIZE(Y1), %xmm0 + + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm7, %xmm3 + addpd %xmm3, %xmm0 + + movapd %xmm0, -16 * SIZE(Y1) + + addl $2 * SIZE, A1 + addl $2 * SIZE, Y1 + ALIGN_3 + +.L17: + testl $1, M + je .L19 + + movsd -16 * SIZE(A1), %xmm2 + movsd -16 * SIZE(A1, LDA), %xmm3 + + movsd -16 * SIZE(Y1), %xmm0 + + mulsd %xmm6, %xmm2 + addsd %xmm2, %xmm0 + mulsd %xmm7, %xmm3 + addsd %xmm3, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +.L19: + decl J + jg .L11 + ALIGN_4 + +.L20: + testl $1, N + jle .L990 + + movl BUFFER, Y1 + addl $16 * SIZE, Y1 + + movl A, A1 + +#ifdef HAVE_SSE3 + movddup (X), %xmm6 + addl INCX, X + + movddup ALPHA, %xmm0 + + mulpd %xmm0, %xmm6 +#else + movsd (X), %xmm6 + addl INCX, X + + movsd ALPHA, %xmm0 + + mulsd %xmm0, %xmm6 + unpcklpd %xmm6, %xmm6 +#endif + + ALIGN_3 + + movl M, I + sarl $3, I + jle .L25 + + movsd -16 * SIZE(A1), %xmm2 + movhpd -15 * SIZE(A1), %xmm2 + movsd -14 * SIZE(A1), %xmm3 + movhpd -13 * SIZE(A1), %xmm3 + + movapd -16 * SIZE(Y1), %xmm0 + movapd -14 * SIZE(Y1), %xmm1 + decl I + jle .L24 + ALIGN_3 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) +#endif + + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + movsd -12 * SIZE(A1), %xmm2 + movhpd -11 * SIZE(A1), %xmm2 + + movapd %xmm0, -16 * SIZE(Y1) + movapd -12 * SIZE(Y1), %xmm0 + + mulpd %xmm6, %xmm3 + addpd %xmm3, %xmm1 + movsd -10 * SIZE(A1), %xmm3 + movhpd -9 * SIZE(A1), %xmm3 + + movapd %xmm1, -14 * SIZE(Y1) + movapd -10 * SIZE(Y1), %xmm1 + + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + movsd -8 * SIZE(A1), %xmm2 + movhpd -7 * SIZE(A1), %xmm2 + + movapd %xmm0, -12 * SIZE(Y1) + movapd -8 * SIZE(Y1), %xmm0 + + mulpd %xmm6, %xmm3 + addpd %xmm3, %xmm1 + movsd -6 * SIZE(A1), %xmm3 + movhpd -5 * SIZE(A1), %xmm3 + + movapd %xmm1, -10 * SIZE(Y1) + movapd -6 * SIZE(Y1), %xmm1 + + subl $-8 * SIZE, A1 + subl $-8 * SIZE, Y1 + + subl $1, I + BRANCH + jg .L23 + ALIGN_3 + +.L24: + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + movsd -12 * SIZE(A1), %xmm2 + movhpd -11 * SIZE(A1), %xmm2 + mulpd %xmm6, %xmm3 + addpd %xmm3, %xmm1 + movsd -10 * SIZE(A1), %xmm3 + movhpd -9 * SIZE(A1), %xmm3 + + movapd %xmm0, -16 * SIZE(Y1) + movapd -12 * SIZE(Y1), %xmm0 + + movapd %xmm1, -14 * SIZE(Y1) + movapd -10 * SIZE(Y1), %xmm1 + + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + movapd %xmm0, -12 * SIZE(Y1) + mulpd %xmm6, %xmm3 + addpd %xmm3, %xmm1 + movapd %xmm1, -10 * SIZE(Y1) + + subl $-8 * SIZE, A1 + subl $-8 * SIZE, Y1 + ALIGN_3 + +.L25: + testl $4, M + je .L26 + + movsd -16 * SIZE(A1), %xmm2 + movhpd -15 * SIZE(A1), %xmm2 + movsd -14 * SIZE(A1), %xmm3 + movhpd -13 * SIZE(A1), %xmm3 + + movapd -16 * SIZE(Y1), %xmm0 + movapd -14 * SIZE(Y1), %xmm1 + + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm6, %xmm3 + addpd %xmm3, %xmm1 + + movapd %xmm0, -16 * SIZE(Y1) + movapd %xmm1, -14 * SIZE(Y1) + + addl $4 * SIZE, A1 + addl $4 * SIZE, Y1 + ALIGN_3 + +.L26: + testl $2, M + je .L27 + + movsd -16 * SIZE(A1), %xmm2 + movhpd -15 * SIZE(A1), %xmm2 + + movapd -16 * SIZE(Y1), %xmm0 + + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + + movapd %xmm0, -16 * SIZE(Y1) + + addl $2 * SIZE, A1 + addl $2 * SIZE, Y1 + ALIGN_3 + +.L27: + testl $1, M + je .L990 + + movsd -16 * SIZE(A1), %xmm2 + movsd -16 * SIZE(Y1), %xmm0 + + mulsd %xmm6, %xmm2 + addsd %xmm2, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +.L990: + movl Y, Y1 + movl BUFFER, X + + movl STACK_INCY, INCY + sall $BASE_SHIFT, INCY + + movl M, %eax + sarl $3, %eax + jle .L994 + ALIGN_3 + +.L992: + movsd (Y1), %xmm0 + movhpd (Y1, INCY), %xmm0 + + addpd 0 * SIZE(X), %xmm0 + + movlpd %xmm0, (Y1) + movhpd %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + movsd (Y1), %xmm0 + movhpd (Y1, INCY), %xmm0 + + addpd 2 * SIZE(X), %xmm0 + + movlpd %xmm0, (Y1) + movhpd %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + movsd (Y1), %xmm0 + movhpd (Y1, INCY), %xmm0 + + addpd 4 * SIZE(X), %xmm0 + + movlpd %xmm0, (Y1) + movhpd %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + movsd (Y1), %xmm0 + movhpd (Y1, INCY), %xmm0 + + addpd 6 * SIZE(X), %xmm0 + + movlpd %xmm0, (Y1) + movhpd %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + addl $8 * SIZE, X + decl %eax + jg .L992 + ALIGN_3 + +.L994: + testl $7, M + jle .L999 + + testl $4, M + jle .L995 + + movsd (Y1), %xmm0 + movhpd (Y1, INCY), %xmm0 + + addpd 0 * SIZE(X), %xmm0 + + movlpd %xmm0, (Y1) + movhpd %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + movsd (Y1), %xmm0 + movhpd (Y1, INCY), %xmm0 + + addpd 2 * SIZE(X), %xmm0 + + movlpd %xmm0, (Y1) + movhpd %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + addl $4 * SIZE, X + ALIGN_3 + +.L995: + testl $2, M + jle .L996 + + movsd (Y1), %xmm0 + movhpd (Y1, INCY), %xmm0 + + addpd 0 * SIZE(X), %xmm0 + + movlpd %xmm0, (Y1) + movhpd %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + addl $2 * SIZE, X + ALIGN_3 + +.L996: + testl $1, M + jle .L999 + + movsd (Y1), %xmm0 + + movsd 0 * SIZE(X), %xmm4 + + addsd %xmm4, %xmm0 + + movlpd %xmm0, (Y1) + ALIGN_3 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemv_t.S b/kernel/x86/gemv_t.S new file mode 100644 index 0000000..2eecd3f --- /dev/null +++ b/kernel/x86/gemv_t.S @@ -0,0 +1,583 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef PENTIUM +#define P 88 +#endif + +#ifndef P +#define P 1000 +#endif + +#define STACK 16 +#define ARGS 24 + +#define NLDA 0 + STACK(%esp) +#define XP 4 + STACK(%esp) +#define MIN_M 8 + STACK(%esp) +#define J 12 + STACK(%esp) +#define IS 16 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define A 24 + STACK + ARGS(%esp) +#define LDA 28 + STACK + ARGS(%esp) +#define X 32 + STACK + ARGS(%esp) +#define INCX 36 + STACK + ARGS(%esp) +#define Y 40 + STACK + ARGS(%esp) +#define INCY 44 + STACK + ARGS(%esp) +#define BUFFER 48 + STACK + ARGS(%esp) +#else +#define A 20 + STACK + ARGS(%esp) +#define LDA 24 + STACK + ARGS(%esp) +#define X 28 + STACK + ARGS(%esp) +#define INCX 32 + STACK + ARGS(%esp) +#define Y 36 + STACK + ARGS(%esp) +#define INCY 40 + STACK + ARGS(%esp) +#define BUFFER 44 + STACK + ARGS(%esp) +#endif + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + FLD ALPHA + + movl X, %edi # X + + movl $0, IS + + movl M, %ebx + movl N, %eax + + testl %ebx, %ebx + jle .L79 + testl %eax, %eax + jle .L79 + + movl INCX, %esi + leal (,%esi,SIZE), %esi + movl %esi, INCX + + movl INCY, %esi + leal (, %esi, SIZE), %esi + movl %esi, INCY + + movl LDA, %ebx + + imull %ebx, %eax + movl $P, %esi + subl %eax, %esi + leal (, %esi, SIZE), %esi + movl %esi, NLDA + + leal (,%ebx,SIZE), %esi + movl %esi, LDA + ALIGN_2 + +.L32: + movl IS, %esi + + movl $P, %edx + movl M, %eax + subl %esi, %eax + cmpl %edx, %eax +#ifdef PENTIUM + jle .L33 + movl %edx, %eax +.L33: +#else + cmovg %edx, %eax +#endif + movl %eax, MIN_M + + movl IS, %ecx + leal (%edi,%ecx,SIZE), %ecx # xp = x + is + movl INCX, %ebx + movl %ecx, XP + cmpl $SIZE, %ebx + je .L34 + + movl BUFFER, %esi + movl MIN_M, %ecx + movl %esi, XP + sarl $2, %ecx + jle .L35 + + ALIGN_3 + +.L36: + FLD (%edi) + addl %ebx, %edi + FST 0 * SIZE(%esi) + + FLD (%edi) + addl %ebx, %edi + FST 1 * SIZE(%esi) + + FLD (%edi) + addl %ebx, %edi + FST 2 * SIZE(%esi) + + FLD (%edi) + addl %ebx, %edi + FST 3 * SIZE(%esi) + + addl $4 * SIZE, %esi + decl %ecx + jg .L36 + ALIGN_3 + +.L35: + movl MIN_M, %ecx + andl $3,%ecx + jle .L34 + ALIGN_2 + +.L42: + FLD (%edi) + addl %ebx, %edi + FST (%esi) + addl $SIZE, %esi + decl %ecx + jg .L42 + ALIGN_3 + +/* Main Routine */ + +.L34: + movl Y, %ebp # coffset = y + + movl N, %esi + sarl $2, %esi + movl %esi, J + jle .L47 + ALIGN_3 + +.L48: + movl A, %ebx # a_offset = a + fldz + movl LDA, %edx + fldz + + leal (%ebx, %edx), %ecx # a_offset2 = a + lda + fldz + leal (%ebx, %edx, 4), %eax + fldz + + movl %eax, A + movl XP, %esi + FLD (%esi) + + movl MIN_M, %eax + sarl $2,%eax + jle .L51 + ALIGN_3 + +#define PRESIZE 8 + +.L80: +#ifdef PENTIUM3 + prefetcht0 PRESIZE * SIZE(%ebx, %edx, 2) + FLD 0 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + + prefetcht0 PRESIZE * SIZE(%ecx) + faddp %st,%st(2) # ct1 += at1 + FLD 0 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + + prefetcht0 PRESIZE * SIZE(%ecx, %edx, 2) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + prefetcht0 PRESIZE * SIZE(%ebx) + FLD 0 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + + faddp %st,%st(4) + FLD 0 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + + faddp %st,%st(4) + FLD 1 * SIZE(%esi) + FLD 1 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + FLD 1 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + FLD 1 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + + fmul %st(1),%st + faddp %st,%st(4) + FLD 1 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + + fmulp %st, %st(1) + faddp %st,%st(4) + FLD 2 * SIZE(%esi) + + FLD 2 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + + FLD 2 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD 2 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + faddp %st,%st(4) + + FLD 2 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + faddp %st,%st(4) + + FLD 3 * SIZE(%esi) + FLD 3 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + + faddp %st,%st(2) # ct1 += at1 + FLD 3 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + + faddp %st,%st(3) # ct2 += at1 + FLD 3 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + + faddp %st,%st(4) + FLD 3 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + + addl $4 * SIZE, %ebx + faddp %st,%st(4) + addl $4 * SIZE, %ecx + + FLD 4 * SIZE(%esi) + addl $4 * SIZE, %esi + +#else + +#if defined(HAS_PREFETCH) + prefetcht0 PRESIZE * SIZE(%ebx) + prefetcht0 PRESIZE * SIZE(%ebx, %edx, 2) + prefetcht0 PRESIZE * SIZE(%ecx) + prefetcht0 PRESIZE * SIZE(%ecx, %edx, 2) +#endif + + FLD 0 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + + FLD 0 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD 0 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + faddp %st,%st(4) + + FMUL 0 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + faddp %st,%st(4) + FLD 1 * SIZE(%esi) + + FLD 1 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + + FLD 1 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD 1 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + faddp %st,%st(4) + + FMUL 1 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + faddp %st,%st(4) + FLD 2 * SIZE(%esi) + + FLD 2 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + + FLD 2 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD 2 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + faddp %st,%st(4) + + FMUL 2 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + faddp %st,%st(4) + FLD 3 * SIZE(%esi) + + FLD 3 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + + FLD 3 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD 3 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + faddp %st,%st(4) + + FMUL 3 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + faddp %st,%st(4) + FLD 4 * SIZE(%esi) + + addl $4 * SIZE, %ebx + addl $4 * SIZE, %ecx + addl $4 * SIZE, %esi +#endif + + decl %eax + jg .L80 + ALIGN_3 + +.L51: + movl MIN_M, %eax + andl $3, %eax + je .L81 + ALIGN_3 + +.L52: + + FLD (%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + + FLD (%ecx) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD (%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + faddp %st,%st(4) + + FMUL (%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + faddp %st,%st(4) + FLD 1 * SIZE(%esi) + + addl $SIZE, %ebx + addl $SIZE, %ecx + addl $SIZE, %esi + decl %eax + jg .L52 + ALIGN_3 + +.L81: +#ifndef C_SUN + ffreep %st(0) +#else + .byte 0xdf + .byte 0xc0 +#endif + + fxch %st(4) + fmul %st, %st(4) + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fxch %st(4) + + movl INCY, %eax + + FADD (%ebp) + FST (%ebp) + addl %eax, %ebp + + FADD (%ebp) + FST (%ebp) + addl %eax, %ebp + + FADD (%ebp) + FST (%ebp) + addl %eax, %ebp + + FADD (%ebp) + FST (%ebp) + addl %eax, %ebp + + decl J + jg .L48 + ALIGN_3 + +.L47: + movl N, %esi + andl $3,%esi + movl %esi, J + jle .L60 + ALIGN_2 + +.L61: + movl A, %ebx # a_offset = a + fldz # ct1 = ZERO + movl LDA, %edx + fldz # ct1 = ZERO + + addl %ebx, %edx + fldz # ct1 = ZERO + movl %edx, A + fldz # ct1 = ZERO + + movl XP, %esi + + movl MIN_M, %eax + sarl $3,%eax + jle .L64 + ALIGN_3 + +.L65: +#ifdef HAS_PREFETCH + prefetcht0 PRESIZE * 2 * SIZE(%ebx) + prefetcht0 PRESIZE * 2 * SIZE(%ebx) +#endif + + FLD 0 * SIZE(%esi) + FMUL 0 * SIZE(%ebx) + faddp %st,%st(1) + + FLD 1 * SIZE(%esi) + FMUL 1 * SIZE(%ebx) + faddp %st,%st(2) + + FLD 2 * SIZE(%esi) + FMUL 2 * SIZE(%ebx) + faddp %st,%st(3) + + FLD 3 * SIZE(%esi) + FMUL 3 * SIZE(%ebx) + faddp %st,%st(4) + + FLD 4 * SIZE(%esi) + FMUL 4 * SIZE(%ebx) + faddp %st,%st(1) + + FLD 5 * SIZE(%esi) + FMUL 5 * SIZE(%ebx) + faddp %st,%st(2) + + FLD 6 * SIZE(%esi) + FMUL 6 * SIZE(%ebx) + faddp %st,%st(3) + + FLD 7 * SIZE(%esi) + FMUL 7 * SIZE(%ebx) + faddp %st,%st(4) + + addl $8 * SIZE, %esi + addl $8 * SIZE, %ebx + + decl %eax + jg .L65 + ALIGN_3 + +.L64: + movl MIN_M, %eax + andl $7, %eax + jle .L70 + ALIGN_3 + +.L71: + FLD (%esi) + FMUL (%ebx) + faddp %st,%st(1) + + addl $SIZE, %esi + addl $SIZE, %ebx + decl %eax + jg .L71 + ALIGN_3 + +.L70: + faddp %st, %st(1) + faddp %st, %st(1) + faddp %st, %st(1) + + fmul %st(1),%st + FADD (%ebp) + FST (%ebp) + addl INCY, %ebp + decl J + jg .L61 + ALIGN_3 + +.L60: + movl A, %ebx + addl NLDA, %ebx + movl %ebx, A + + addl $P, IS + movl M, %esi + cmpl %esi, IS + jl .L32 + ALIGN_3 + +.L79: +#ifndef C_SUN + ffreep %st(0) +#else + .byte 0xdf + .byte 0xc0 +#endif + + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/gemv_t_atom.S b/kernel/x86/gemv_t_atom.S new file mode 100644 index 0000000..a21416d --- /dev/null +++ b/kernel/x86/gemv_t_atom.S @@ -0,0 +1,616 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ATOM +#define PREFETCH prefetchnta +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 6) +#endif + +#define STACKSIZE 16 + +#define M 4 + STACKSIZE(%esp) +#define N 8 + STACKSIZE(%esp) +#define ALPHA 16 + STACKSIZE(%esp) +#define A 24 + STACKSIZE(%esp) +#define STACK_LDA 28 + STACKSIZE(%esp) +#define STACK_X 32 + STACKSIZE(%esp) +#define STACK_INCX 36 + STACKSIZE(%esp) +#define Y 40 + STACKSIZE(%esp) +#define STACK_INCY 44 + STACKSIZE(%esp) +#define BUFFER 48 + STACKSIZE(%esp) + +#define I %eax +#define J %ebx + +#define INCX J +#define INCY %ecx + +#define A1 %esi +#define X %edx +#define Y1 %edi +#define LDA %ebp + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_LDA, LDA + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_INCY, INCY + + leal (,INCX, SIZE), INCX + leal (,INCY, SIZE), INCY + leal (,LDA, SIZE), LDA + + subl $-16 * SIZE, A + + cmpl $0, N + jle .L999 + cmpl $0, M + jle .L999 + + movl BUFFER, Y1 + + movl M, I + sarl $3, I + jle .L05 + ALIGN_4 + +.L02: + movsd (X), %xmm0 + addl INCX, X + movhpd (X), %xmm0 + addl INCX, X + + movsd (X), %xmm1 + addl INCX, X + movhpd (X), %xmm1 + addl INCX, X + + movsd (X), %xmm2 + addl INCX, X + movhpd (X), %xmm2 + addl INCX, X + + movsd (X), %xmm3 + addl INCX, X + movhpd (X), %xmm3 + addl INCX, X + + movapd %xmm0, 0 * SIZE(Y1) + movapd %xmm1, 2 * SIZE(Y1) + movapd %xmm2, 4 * SIZE(Y1) + movapd %xmm3, 6 * SIZE(Y1) + + addl $8 * SIZE, Y1 + decl I + jg .L02 + ALIGN_4 + +.L05: + movl M, I + andl $7, I + jle .L10 + ALIGN_2 + +.L06: + movsd (X), %xmm0 + addl INCX, X + movsd %xmm0, 0 * SIZE(Y1) + addl $SIZE, Y1 + decl I + jg .L06 + ALIGN_4 + +.L10: + movl Y, Y1 + + movl N, J + sarl $1, J + jle .L20 + ALIGN_3 + +.L11: + movl BUFFER, X + addl $16 * SIZE, X + + movl A, A1 + leal (A1, LDA, 2), %eax + movl %eax, A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + movsd -16 * SIZE(X), %xmm2 + movsd -15 * SIZE(X), %xmm3 + + movl M, I + sarl $3, I + jle .L15 + + movsd -16 * SIZE(A1), %xmm4 + movsd -16 * SIZE(A1, LDA), %xmm5 + movsd -15 * SIZE(A1), %xmm6 + movsd -15 * SIZE(A1, LDA), %xmm7 + + mulsd %xmm2, %xmm4 + mulsd %xmm2, %xmm5 + movsd -14 * SIZE(X), %xmm2 + + decl I + jle .L13 + ALIGN_4 + +.L12: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(A1) +#endif + + mulsd %xmm3, %xmm6 + addsd %xmm4, %xmm0 + movsd -14 * SIZE(A1), %xmm4 + mulsd %xmm3, %xmm7 + movsd -13 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + movsd -14 * SIZE(A1, LDA), %xmm5 + + mulsd %xmm2, %xmm4 + addsd %xmm6, %xmm0 + movsd -13 * SIZE(A1), %xmm6 + mulsd %xmm2, %xmm5 + movsd -12 * SIZE(X), %xmm2 + addsd %xmm7, %xmm1 + movsd -13 * SIZE(A1, LDA), %xmm7 + + mulsd %xmm3, %xmm6 + addsd %xmm4, %xmm0 + movsd -12 * SIZE(A1), %xmm4 + mulsd %xmm3, %xmm7 + movsd -11 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + movsd -12 * SIZE(A1, LDA), %xmm5 + + mulsd %xmm2, %xmm4 + addsd %xmm6, %xmm0 + movsd -11 * SIZE(A1), %xmm6 + mulsd %xmm2, %xmm5 + movsd -10 * SIZE(X), %xmm2 + addsd %xmm7, %xmm1 + movsd -11 * SIZE(A1, LDA), %xmm7 + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(A1, LDA) +#endif + + mulsd %xmm3, %xmm6 + addsd %xmm4, %xmm0 + movsd -10 * SIZE(A1), %xmm4 + mulsd %xmm3, %xmm7 + movsd -9 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + movsd -10 * SIZE(A1, LDA), %xmm5 + + mulsd %xmm2, %xmm4 + addsd %xmm6, %xmm0 + movsd -9 * SIZE(A1), %xmm6 + mulsd %xmm2, %xmm5 + movsd -8 * SIZE(X), %xmm2 + addsd %xmm7, %xmm1 + movsd -9 * SIZE(A1, LDA), %xmm7 + + mulsd %xmm3, %xmm6 + addsd %xmm4, %xmm0 + movsd -8 * SIZE(A1), %xmm4 + mulsd %xmm3, %xmm7 + movsd -7 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + movsd -8 * SIZE(A1, LDA), %xmm5 + + mulsd %xmm2, %xmm4 + addsd %xmm6, %xmm0 + movsd -7 * SIZE(A1), %xmm6 + mulsd %xmm2, %xmm5 + movsd -6 * SIZE(X), %xmm2 + addsd %xmm7, %xmm1 + movsd -7 * SIZE(A1, LDA), %xmm7 + + addl $8 * SIZE, A1 + addl $8 * SIZE, X + + decl I + jg .L12 + ALIGN_4 + +.L13: + mulsd %xmm3, %xmm6 + addsd %xmm4, %xmm0 + movsd -14 * SIZE(A1), %xmm4 + mulsd %xmm3, %xmm7 + movsd -13 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + movsd -14 * SIZE(A1, LDA), %xmm5 + + mulsd %xmm2, %xmm4 + addsd %xmm6, %xmm0 + movsd -13 * SIZE(A1), %xmm6 + mulsd %xmm2, %xmm5 + movsd -12 * SIZE(X), %xmm2 + addsd %xmm7, %xmm1 + movsd -13 * SIZE(A1, LDA), %xmm7 + + mulsd %xmm3, %xmm6 + addsd %xmm4, %xmm0 + movsd -12 * SIZE(A1), %xmm4 + mulsd %xmm3, %xmm7 + movsd -11 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + movsd -12 * SIZE(A1, LDA), %xmm5 + + mulsd %xmm2, %xmm4 + addsd %xmm6, %xmm0 + movsd -11 * SIZE(A1), %xmm6 + mulsd %xmm2, %xmm5 + movsd -10 * SIZE(X), %xmm2 + addsd %xmm7, %xmm1 + movsd -11 * SIZE(A1, LDA), %xmm7 + + mulsd %xmm3, %xmm6 + addsd %xmm4, %xmm0 + movsd -10 * SIZE(A1), %xmm4 + mulsd %xmm3, %xmm7 + movsd -9 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + movsd -10 * SIZE(A1, LDA), %xmm5 + + mulsd %xmm2, %xmm4 + addsd %xmm6, %xmm0 + movsd -9 * SIZE(A1), %xmm6 + mulsd %xmm2, %xmm5 + movsd -8 * SIZE(X), %xmm2 + addsd %xmm7, %xmm1 + movsd -9 * SIZE(A1, LDA), %xmm7 + + mulsd %xmm3, %xmm6 + addsd %xmm4, %xmm0 + mulsd %xmm3, %xmm7 + movsd -7 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + + addsd %xmm6, %xmm0 + addl $8 * SIZE, A1 + addsd %xmm7, %xmm1 + addl $8 * SIZE, X + ALIGN_4 + +.L15: + testl $4, M + jle .L16 + + movsd -16 * SIZE(A1), %xmm4 + movsd -16 * SIZE(A1, LDA), %xmm5 + + movsd -15 * SIZE(A1), %xmm6 + movsd -15 * SIZE(A1, LDA), %xmm7 + + mulsd %xmm2, %xmm4 + mulsd %xmm2, %xmm5 + movsd -14 * SIZE(X), %xmm2 + + mulsd %xmm3, %xmm6 + addsd %xmm4, %xmm0 + movsd -14 * SIZE(A1), %xmm4 + mulsd %xmm3, %xmm7 + movsd -13 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + movsd -14 * SIZE(A1, LDA), %xmm5 + + mulsd %xmm2, %xmm4 + addsd %xmm6, %xmm0 + movsd -13 * SIZE(A1), %xmm6 + mulsd %xmm2, %xmm5 + movsd -12 * SIZE(X), %xmm2 + addsd %xmm7, %xmm1 + movsd -13 * SIZE(A1, LDA), %xmm7 + + mulsd %xmm3, %xmm6 + addsd %xmm4, %xmm0 + mulsd %xmm3, %xmm7 + movsd -11 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + + addsd %xmm6, %xmm0 + addsd %xmm7, %xmm1 + + addl $4 * SIZE, A1 + addl $4 * SIZE, X + ALIGN_4 + +.L16: + testl $2, M + jle .L17 + + movsd -16 * SIZE(A1), %xmm4 + movsd -16 * SIZE(A1, LDA), %xmm5 + + movsd -15 * SIZE(A1), %xmm6 + movsd -15 * SIZE(A1, LDA), %xmm7 + + mulsd %xmm2, %xmm4 + mulsd %xmm2, %xmm5 + movsd -14 * SIZE(X), %xmm2 + + mulsd %xmm3, %xmm6 + addsd %xmm4, %xmm0 + mulsd %xmm3, %xmm7 + addsd %xmm5, %xmm1 + + addsd %xmm6, %xmm0 + addsd %xmm7, %xmm1 + + addl $2 * SIZE, A1 + ALIGN_4 + +.L17: + testl $1, M + jle .L18 + + movsd -16 * SIZE(A1), %xmm4 + movsd -16 * SIZE(A1, LDA), %xmm5 + + mulsd %xmm2, %xmm4 + mulsd %xmm2, %xmm5 + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + ALIGN_4 + +.L18: + movsd ALPHA, %xmm7 + + mulpd %xmm7, %xmm0 + mulpd %xmm7, %xmm1 + + addsd (Y1), %xmm0 + addsd (Y1, INCY), %xmm1 + + movsd %xmm0, (Y1) + movsd %xmm1, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + decl J + jg .L11 + ALIGN_4 + +.L20: + testl $1, N + jle .L999 + + movl BUFFER, X + addl $16 * SIZE, X + + movl A, A1 + leal (A1, LDA, 2), %eax + movl %eax, A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + movsd -16 * SIZE(X), %xmm2 + movsd -15 * SIZE(X), %xmm3 + + movl M, I + sarl $3, I + jle .L25 + + movsd -16 * SIZE(A1), %xmm4 + movsd -15 * SIZE(A1), %xmm5 + movsd -14 * SIZE(A1), %xmm6 + movsd -13 * SIZE(A1), %xmm7 + + mulsd %xmm2, %xmm4 + movsd -14 * SIZE(X), %xmm2 + mulsd %xmm3, %xmm5 + movsd -13 * SIZE(X), %xmm3 + + decl I + jle .L23 + ALIGN_4 + +.L22: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(A1) +#endif + + mulsd %xmm2, %xmm6 + movsd -12 * SIZE(X), %xmm2 + addsd %xmm4, %xmm0 + movsd -12 * SIZE(A1), %xmm4 + mulsd %xmm3, %xmm7 + movsd -11 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + movsd -11 * SIZE(A1), %xmm5 + + addsd %xmm6, %xmm0 + movsd -10 * SIZE(A1), %xmm6 + mulsd %xmm2, %xmm4 + movsd -10 * SIZE(X), %xmm2 + addsd %xmm7, %xmm1 + movsd -9 * SIZE(A1), %xmm7 + mulsd %xmm3, %xmm5 + movsd -9 * SIZE(X), %xmm3 + + mulsd %xmm2, %xmm6 + movsd -8 * SIZE(X), %xmm2 + addsd %xmm4, %xmm0 + movsd -8 * SIZE(A1), %xmm4 + mulsd %xmm3, %xmm7 + movsd -7 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + movsd -7 * SIZE(A1), %xmm5 + + addsd %xmm6, %xmm0 + movsd -6 * SIZE(A1), %xmm6 + mulsd %xmm2, %xmm4 + movsd -6 * SIZE(X), %xmm2 + addsd %xmm7, %xmm1 + movsd -5 * SIZE(A1), %xmm7 + mulsd %xmm3, %xmm5 + movsd -5 * SIZE(X), %xmm3 + + addl $8 * SIZE, A1 + addl $8 * SIZE, X + + decl I + jg .L22 + ALIGN_4 + +.L23: + mulsd %xmm2, %xmm6 + movsd -12 * SIZE(X), %xmm2 + addsd %xmm4, %xmm0 + movsd -12 * SIZE(A1), %xmm4 + mulsd %xmm3, %xmm7 + movsd -11 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + movsd -11 * SIZE(A1), %xmm5 + + addsd %xmm6, %xmm0 + movsd -10 * SIZE(A1), %xmm6 + mulsd %xmm2, %xmm4 + movsd -10 * SIZE(X), %xmm2 + addsd %xmm7, %xmm1 + movsd -9 * SIZE(A1), %xmm7 + mulsd %xmm3, %xmm5 + movsd -9 * SIZE(X), %xmm3 + + mulsd %xmm2, %xmm6 + movsd -8 * SIZE(X), %xmm2 + addsd %xmm4, %xmm0 + mulsd %xmm3, %xmm7 + movsd -7 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + + addsd %xmm6, %xmm0 + addsd %xmm7, %xmm1 + + addl $8 * SIZE, A1 + addl $8 * SIZE, X + ALIGN_4 + +.L25: + testl $4, M + jle .L26 + + movsd -16 * SIZE(A1), %xmm4 + movsd -15 * SIZE(A1), %xmm5 + movsd -14 * SIZE(A1), %xmm6 + movsd -13 * SIZE(A1), %xmm7 + + mulsd %xmm2, %xmm4 + movsd -14 * SIZE(X), %xmm2 + mulsd %xmm3, %xmm5 + movsd -13 * SIZE(X), %xmm3 + + mulsd %xmm2, %xmm6 + movsd -12 * SIZE(X), %xmm2 + addsd %xmm4, %xmm0 + mulsd %xmm3, %xmm7 + movsd -11 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + + addsd %xmm6, %xmm0 + addsd %xmm7, %xmm1 + + addl $4 * SIZE, A1 + addl $4 * SIZE, X + ALIGN_4 + +.L26: + testl $2, M + jle .L27 + + movsd -16 * SIZE(A1), %xmm4 + movsd -15 * SIZE(A1), %xmm5 + + mulsd %xmm2, %xmm4 + movsd -14 * SIZE(X), %xmm2 + mulsd %xmm3, %xmm5 + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + + addl $2 * SIZE, A1 + ALIGN_4 + +.L27: + testl $1, M + jle .L28 + + movsd -16 * SIZE(A1), %xmm4 + mulsd %xmm2, %xmm4 + addsd %xmm4, %xmm0 + ALIGN_4 + +.L28: + movsd ALPHA, %xmm7 + addsd %xmm1, %xmm0 + + mulpd %xmm7, %xmm0 + + addsd (Y1), %xmm0 + + movsd %xmm0, (Y1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemv_t_sse.S b/kernel/x86/gemv_t_sse.S new file mode 100644 index 0000000..a499011 --- /dev/null +++ b/kernel/x86/gemv_t_sse.S @@ -0,0 +1,637 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef movsd +#undef movsd +#endif + +#ifdef PENTIUM3 +#ifdef HAVE_SSE +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 2) +#endif +#define movsd movlps +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 4) +#endif + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 7) +#endif + +#ifdef OPTERON +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 3) +#define movsd movlps +#endif + +#ifdef BARCELONA +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 5) +#endif + +#ifdef ATOM +#define PREFETCH prefetchnta +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 6) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (16 * 4) +#endif + +#define STACKSIZE 16 + +#define M 4 + STACKSIZE(%esp) +#define N 8 + STACKSIZE(%esp) +#define ALPHA 16 + STACKSIZE(%esp) +#define A 20 + STACKSIZE(%esp) +#define STACK_LDA 24 + STACKSIZE(%esp) +#define STACK_X 28 + STACKSIZE(%esp) +#define STACK_INCX 32 + STACKSIZE(%esp) +#define Y 36 + STACKSIZE(%esp) +#define STACK_INCY 40 + STACKSIZE(%esp) +#define BUFFER 44 + STACKSIZE(%esp) + +#define I %eax +#define J %ebx + +#define INCX J +#define INCY %ecx + +#define A1 %esi +#define X %edx +#define Y1 %edi +#define LDA %ebp + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_LDA, LDA + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_INCY, INCY + + leal (,INCX, SIZE), INCX + leal (,INCY, SIZE), INCY + leal (,LDA, SIZE), LDA + + subl $-32 * SIZE, A + + cmpl $0, N + jle .L999 + cmpl $0, M + jle .L999 + + movl BUFFER, Y1 + + movl M, I + sarl $3, I + jle .L05 + ALIGN_4 + +.L02: + movss (X), %xmm0 + addl INCX, X + movss (X), %xmm1 + addl INCX, X + + unpcklps %xmm1, %xmm0 + + movss (X), %xmm2 + addl INCX, X + movss (X), %xmm3 + addl INCX, X + + unpcklps %xmm3, %xmm2 + + movss (X), %xmm4 + addl INCX, X + movss (X), %xmm5 + addl INCX, X + + unpcklps %xmm5, %xmm4 + + movss (X), %xmm6 + addl INCX, X + movss (X), %xmm7 + addl INCX, X + + unpcklps %xmm7, %xmm6 + + movlps %xmm0, 0 * SIZE(Y1) + movlps %xmm2, 2 * SIZE(Y1) + movlps %xmm4, 4 * SIZE(Y1) + movlps %xmm6, 6 * SIZE(Y1) + + addl $8 * SIZE, Y1 + decl I + jg .L02 + ALIGN_4 + +.L05: + movl M, I + andl $7, I + jle .L10 + ALIGN_2 + +.L06: + movss (X), %xmm0 + addl INCX, X + movss %xmm0, 0 * SIZE(Y1) + addl $SIZE, Y1 + decl I + jg .L06 + ALIGN_4 + +.L10: + movl Y, Y1 + + movl N, J + sarl $1, J + jle .L20 + ALIGN_3 + +.L11: + movl BUFFER, X + addl $32 * SIZE, X + + movl A, A1 + leal (A1, LDA, 2), %eax + movl %eax, A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + movaps -32 * SIZE(X), %xmm2 + movaps -28 * SIZE(X), %xmm3 + + movl M, I + sarl $4, I + jle .L15 + + movsd -32 * SIZE(A1), %xmm4 + movhps -30 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A1, LDA), %xmm5 + movhps -30 * SIZE(A1, LDA), %xmm5 + + movsd -28 * SIZE(A1), %xmm6 + movhps -26 * SIZE(A1), %xmm6 + movsd -28 * SIZE(A1, LDA), %xmm7 + movhps -26 * SIZE(A1, LDA), %xmm7 + + decl I + jle .L13 + ALIGN_4 + +.L12: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(A1) +#endif + + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + movsd -24 * SIZE(A1), %xmm4 + movhps -22 * SIZE(A1), %xmm4 + mulps %xmm2, %xmm5 + movaps -24 * SIZE(X), %xmm2 + addps %xmm5, %xmm1 + movsd -24 * SIZE(A1, LDA), %xmm5 + movhps -22 * SIZE(A1, LDA), %xmm5 + + mulps %xmm3, %xmm6 + addps %xmm6, %xmm0 + movsd -20 * SIZE(A1), %xmm6 + movhps -18 * SIZE(A1), %xmm6 + mulps %xmm3, %xmm7 + movaps -20 * SIZE(X), %xmm3 + addps %xmm7, %xmm1 + movsd -20 * SIZE(A1, LDA), %xmm7 + movhps -18 * SIZE(A1, LDA), %xmm7 + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(A1, LDA) +#endif + + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + movsd -16 * SIZE(A1), %xmm4 + movhps -14 * SIZE(A1), %xmm4 + mulps %xmm2, %xmm5 + movaps -16 * SIZE(X), %xmm2 + addps %xmm5, %xmm1 + movsd -16 * SIZE(A1, LDA), %xmm5 + movhps -14 * SIZE(A1, LDA), %xmm5 + + mulps %xmm3, %xmm6 + addps %xmm6, %xmm0 + movsd -12 * SIZE(A1), %xmm6 + movhps -10 * SIZE(A1), %xmm6 + mulps %xmm3, %xmm7 + movaps -12 * SIZE(X), %xmm3 + addps %xmm7, %xmm1 + movsd -12 * SIZE(A1, LDA), %xmm7 + movhps -10 * SIZE(A1, LDA), %xmm7 + + addl $16 * SIZE, A1 + addl $16 * SIZE, X + + decl I + jg .L12 + ALIGN_4 + +.L13: + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + movsd -24 * SIZE(A1), %xmm4 + movhps -22 * SIZE(A1), %xmm4 + mulps %xmm2, %xmm5 + movaps -24 * SIZE(X), %xmm2 + addps %xmm5, %xmm1 + movsd -24 * SIZE(A1, LDA), %xmm5 + movhps -22 * SIZE(A1, LDA), %xmm5 + + mulps %xmm3, %xmm6 + addps %xmm6, %xmm0 + movsd -20 * SIZE(A1), %xmm6 + movhps -18 * SIZE(A1), %xmm6 + mulps %xmm3, %xmm7 + movaps -20 * SIZE(X), %xmm3 + addps %xmm7, %xmm1 + movsd -20 * SIZE(A1, LDA), %xmm7 + movhps -18 * SIZE(A1, LDA), %xmm7 + + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm2, %xmm5 + movaps -16 * SIZE(X), %xmm2 + addps %xmm5, %xmm1 + + mulps %xmm3, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm3, %xmm7 + movaps -12 * SIZE(X), %xmm3 + addps %xmm7, %xmm1 + + addl $16 * SIZE, A1 + addl $16 * SIZE, X + ALIGN_4 + +.L15: + testl $8, M + jle .L16 + + movsd -32 * SIZE(A1), %xmm4 + movhps -30 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A1, LDA), %xmm5 + movhps -30 * SIZE(A1, LDA), %xmm5 + + movsd -28 * SIZE(A1), %xmm6 + movhps -26 * SIZE(A1), %xmm6 + movsd -28 * SIZE(A1, LDA), %xmm7 + movhps -26 * SIZE(A1, LDA), %xmm7 + + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm2, %xmm5 + movaps -24 * SIZE(X), %xmm2 + addps %xmm5, %xmm1 + + mulps %xmm3, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm3, %xmm7 + movaps -20 * SIZE(X), %xmm3 + addps %xmm7, %xmm1 + + addl $8 * SIZE, A1 + addl $8 * SIZE, X + ALIGN_4 + +.L16: + testl $4, M + jle .L17 + + movsd -32 * SIZE(A1), %xmm4 + movhps -30 * SIZE(A1), %xmm4 + + movsd -32 * SIZE(A1, LDA), %xmm5 + movhps -30 * SIZE(A1, LDA), %xmm5 + + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm2, %xmm5 + addps %xmm5, %xmm1 + movaps %xmm3, %xmm2 + + addl $4 * SIZE, A1 + ALIGN_4 + +.L17: + testl $2, M + jle .L18 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm4 + +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd -32 * SIZE(A1, LDA), %xmm5 + + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm2, %xmm5 + addps %xmm5, %xmm1 + movhlps %xmm2, %xmm2 + + addl $2 * SIZE, A1 + ALIGN_4 + +.L18: + testl $1, M + jle .L19 + + movss -32 * SIZE(A1), %xmm4 + mulss %xmm2, %xmm4 + addss %xmm4, %xmm0 + movss -32 * SIZE(A1, LDA), %xmm5 + mulss %xmm2, %xmm5 + addss %xmm5, %xmm1 + ALIGN_4 + +.L19: +#ifdef HAVE_SSE3 + haddps %xmm0, %xmm0 + haddps %xmm1, %xmm1 + + haddps %xmm0, %xmm0 + haddps %xmm1, %xmm1 +#else + movhlps %xmm0, %xmm2 + movhlps %xmm1, %xmm3 + + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + movaps %xmm0, %xmm2 + shufps $1, %xmm0, %xmm0 + movaps %xmm1, %xmm3 + shufps $1, %xmm1, %xmm1 + + addss %xmm2, %xmm0 + addss %xmm3, %xmm1 +#endif + + movss ALPHA, %xmm7 + + mulss %xmm7, %xmm0 + mulss %xmm7, %xmm1 + + addss (Y1), %xmm0 + addss (Y1, INCY), %xmm1 + + movss %xmm0, (Y1) + movss %xmm1, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + decl J + jg .L11 + ALIGN_4 + +.L20: + testl $1, N + jle .L999 + + movl BUFFER, X + addl $32 * SIZE, X + + movl A, A1 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + movaps -32 * SIZE(X), %xmm2 + movaps -28 * SIZE(X), %xmm3 + + movl M, I + sarl $4, I + jle .L25 + + movsd -32 * SIZE(A1), %xmm4 + movhps -30 * SIZE(A1), %xmm4 + movsd -28 * SIZE(A1), %xmm6 + movhps -26 * SIZE(A1), %xmm6 + + decl I + jle .L23 + ALIGN_4 + +.L22: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(A1) +#endif + + mulps %xmm2, %xmm4 + movaps -24 * SIZE(X), %xmm2 + addps %xmm4, %xmm0 + movsd -24 * SIZE(A1), %xmm4 + movhps -22 * SIZE(A1), %xmm4 + + mulps %xmm3, %xmm6 + movaps -20 * SIZE(X), %xmm3 + addps %xmm6, %xmm0 + movsd -20 * SIZE(A1), %xmm6 + movhps -18 * SIZE(A1), %xmm6 + + mulps %xmm2, %xmm4 + movaps -16 * SIZE(X), %xmm2 + addps %xmm4, %xmm0 + movsd -16 * SIZE(A1), %xmm4 + movhps -14 * SIZE(A1), %xmm4 + + mulps %xmm3, %xmm6 + movaps -12 * SIZE(X), %xmm3 + addps %xmm6, %xmm0 + movsd -12 * SIZE(A1), %xmm6 + movhps -10 * SIZE(A1), %xmm6 + + addl $16 * SIZE, A1 + addl $16 * SIZE, X + + decl I + jg .L22 + ALIGN_4 + +.L23: + mulps %xmm2, %xmm4 + movaps -24 * SIZE(X), %xmm2 + addps %xmm4, %xmm0 + movsd -24 * SIZE(A1), %xmm4 + movhps -22 * SIZE(A1), %xmm4 + + mulps %xmm3, %xmm6 + movaps -20 * SIZE(X), %xmm3 + addps %xmm6, %xmm0 + movsd -20 * SIZE(A1), %xmm6 + movhps -18 * SIZE(A1), %xmm6 + + mulps %xmm2, %xmm4 + movaps -16 * SIZE(X), %xmm2 + addps %xmm4, %xmm0 + + mulps %xmm3, %xmm6 + movaps -12 * SIZE(X), %xmm3 + addps %xmm6, %xmm0 + + addl $16 * SIZE, A1 + addl $16 * SIZE, X + ALIGN_4 + +.L25: + testl $8, M + jle .L26 + + movsd -32 * SIZE(A1), %xmm4 + movhps -30 * SIZE(A1), %xmm4 + movsd -28 * SIZE(A1), %xmm6 + movhps -26 * SIZE(A1), %xmm6 + + mulps %xmm2, %xmm4 + movaps -24 * SIZE(X), %xmm2 + addps %xmm4, %xmm0 + + mulps %xmm3, %xmm6 + movaps -20 * SIZE(X), %xmm3 + addps %xmm6, %xmm0 + + addl $8 * SIZE, A1 + addl $8 * SIZE, X + ALIGN_4 + +.L26: + testl $4, M + jle .L27 + + movsd -32 * SIZE(A1), %xmm4 + movhps -30 * SIZE(A1), %xmm4 + + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + movaps %xmm3, %xmm2 + + addl $4 * SIZE, A1 + ALIGN_4 + +.L27: + testl $2, M + jle .L28 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm4 + + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + movhlps %xmm2, %xmm2 + + addl $2 * SIZE, A1 + ALIGN_4 + +.L28: + testl $1, M + jle .L29 + + movss -32 * SIZE(A1), %xmm4 + mulss %xmm2, %xmm4 + addss %xmm4, %xmm0 + ALIGN_4 + +.L29: +#ifdef HAVE_SSE3 + haddps %xmm0, %xmm0 + haddps %xmm0, %xmm0 +#else + movhlps %xmm0, %xmm2 + + addps %xmm2, %xmm0 + + movaps %xmm0, %xmm2 + shufps $1, %xmm0, %xmm0 + + addss %xmm2, %xmm0 +#endif + + movss ALPHA, %xmm7 + + mulss %xmm7, %xmm0 + + addss (Y1), %xmm0 + + movss %xmm0, (Y1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemv_t_sse2.S b/kernel/x86/gemv_t_sse2.S new file mode 100644 index 0000000..9960b5c --- /dev/null +++ b/kernel/x86/gemv_t_sse2.S @@ -0,0 +1,569 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 2) +#endif + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 7) +#endif + +#ifdef OPTERON +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (8 * 3) +#define movsd movlps +#endif + +#ifdef BARCELONA +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (8 * 5) +#endif + +#ifdef ATOM +#define PREFETCH prefetch +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 6) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 4) +#endif + +#define STACKSIZE 16 + +#define M 4 + STACKSIZE(%esp) +#define N 8 + STACKSIZE(%esp) +#define ALPHA 16 + STACKSIZE(%esp) +#define A 24 + STACKSIZE(%esp) +#define STACK_LDA 28 + STACKSIZE(%esp) +#define STACK_X 32 + STACKSIZE(%esp) +#define STACK_INCX 36 + STACKSIZE(%esp) +#define Y 40 + STACKSIZE(%esp) +#define STACK_INCY 44 + STACKSIZE(%esp) +#define BUFFER 48 + STACKSIZE(%esp) + +#define I %eax +#define J %ebx + +#define INCX J +#define INCY %ecx + +#define A1 %esi +#define X %edx +#define Y1 %edi +#define LDA %ebp + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_LDA, LDA + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_INCY, INCY + + leal (,INCX, SIZE), INCX + leal (,INCY, SIZE), INCY + leal (,LDA, SIZE), LDA + + subl $-16 * SIZE, A + + cmpl $0, N + jle .L999 + cmpl $0, M + jle .L999 + + movl BUFFER, Y1 + + movl M, I + sarl $3, I + jle .L05 + ALIGN_4 + +.L02: + movsd (X), %xmm0 + addl INCX, X + movhpd (X), %xmm0 + addl INCX, X + + movsd (X), %xmm1 + addl INCX, X + movhpd (X), %xmm1 + addl INCX, X + + movsd (X), %xmm2 + addl INCX, X + movhpd (X), %xmm2 + addl INCX, X + + movsd (X), %xmm3 + addl INCX, X + movhpd (X), %xmm3 + addl INCX, X + + movapd %xmm0, 0 * SIZE(Y1) + movapd %xmm1, 2 * SIZE(Y1) + movapd %xmm2, 4 * SIZE(Y1) + movapd %xmm3, 6 * SIZE(Y1) + + addl $8 * SIZE, Y1 + decl I + jg .L02 + ALIGN_4 + +.L05: + movl M, I + andl $7, I + jle .L10 + ALIGN_2 + +.L06: + movsd (X), %xmm0 + addl INCX, X + movsd %xmm0, 0 * SIZE(Y1) + addl $SIZE, Y1 + decl I + jg .L06 + ALIGN_4 + +.L10: + movl Y, Y1 + + movl N, J + sarl $1, J + jle .L20 + ALIGN_3 + +.L11: + movl BUFFER, X + addl $16 * SIZE, X + + movl A, A1 + leal (A1, LDA, 2), %eax + movl %eax, A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + movapd -16 * SIZE(X), %xmm2 + movapd -14 * SIZE(X), %xmm3 + + movl M, I + sarl $3, I + jle .L15 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + movsd -16 * SIZE(A1, LDA), %xmm5 + movhpd -15 * SIZE(A1, LDA), %xmm5 + + movsd -14 * SIZE(A1), %xmm6 + movhpd -13 * SIZE(A1), %xmm6 + movsd -14 * SIZE(A1, LDA), %xmm7 + movhpd -13 * SIZE(A1, LDA), %xmm7 + + decl I + jle .L13 + ALIGN_4 + +.L12: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(A1) +#endif + + mulpd %xmm2, %xmm4 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A1), %xmm4 + movhpd -11 * SIZE(A1), %xmm4 + mulpd %xmm2, %xmm5 + movapd -12 * SIZE(X), %xmm2 + addpd %xmm5, %xmm1 + movsd -12 * SIZE(A1, LDA), %xmm5 + movhpd -11 * SIZE(A1, LDA), %xmm5 + + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm0 + movsd -10 * SIZE(A1), %xmm6 + movhpd -9 * SIZE(A1), %xmm6 + mulpd %xmm3, %xmm7 + movapd -10 * SIZE(X), %xmm3 + addpd %xmm7, %xmm1 + movsd -10 * SIZE(A1, LDA), %xmm7 + movhpd -9 * SIZE(A1, LDA), %xmm7 + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(A1, LDA) +#endif + + mulpd %xmm2, %xmm4 + addpd %xmm4, %xmm0 + movsd -8 * SIZE(A1), %xmm4 + movhpd -7 * SIZE(A1), %xmm4 + mulpd %xmm2, %xmm5 + movapd -8 * SIZE(X), %xmm2 + addpd %xmm5, %xmm1 + movsd -8 * SIZE(A1, LDA), %xmm5 + movhpd -7 * SIZE(A1, LDA), %xmm5 + + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm0 + movsd -6 * SIZE(A1), %xmm6 + movhpd -5 * SIZE(A1), %xmm6 + mulpd %xmm3, %xmm7 + movapd -6 * SIZE(X), %xmm3 + addpd %xmm7, %xmm1 + movsd -6 * SIZE(A1, LDA), %xmm7 + movhpd -5 * SIZE(A1, LDA), %xmm7 + + addl $8 * SIZE, A1 + addl $8 * SIZE, X + + decl I + jg .L12 + ALIGN_4 + +.L13: + mulpd %xmm2, %xmm4 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A1), %xmm4 + movhpd -11 * SIZE(A1), %xmm4 + mulpd %xmm2, %xmm5 + movapd -12 * SIZE(X), %xmm2 + addpd %xmm5, %xmm1 + movsd -12 * SIZE(A1, LDA), %xmm5 + movhpd -11 * SIZE(A1, LDA), %xmm5 + + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm0 + movsd -10 * SIZE(A1), %xmm6 + movhpd -9 * SIZE(A1), %xmm6 + mulpd %xmm3, %xmm7 + movapd -10 * SIZE(X), %xmm3 + addpd %xmm7, %xmm1 + movsd -10 * SIZE(A1, LDA), %xmm7 + movhpd -9 * SIZE(A1, LDA), %xmm7 + + mulpd %xmm2, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm2, %xmm5 + movapd -8 * SIZE(X), %xmm2 + addpd %xmm5, %xmm1 + + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm3, %xmm7 + movapd -6 * SIZE(X), %xmm3 + addpd %xmm7, %xmm1 + + addl $8 * SIZE, A1 + addl $8 * SIZE, X + ALIGN_4 + +.L15: + testl $4, M + jle .L16 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + movsd -16 * SIZE(A1, LDA), %xmm5 + movhpd -15 * SIZE(A1, LDA), %xmm5 + + movsd -14 * SIZE(A1), %xmm6 + movhpd -13 * SIZE(A1), %xmm6 + movsd -14 * SIZE(A1, LDA), %xmm7 + movhpd -13 * SIZE(A1, LDA), %xmm7 + + mulpd %xmm2, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm2, %xmm5 + movapd -12 * SIZE(X), %xmm2 + addpd %xmm5, %xmm1 + + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm3, %xmm7 + movapd -10 * SIZE(X), %xmm3 + addpd %xmm7, %xmm1 + + addl $4 * SIZE, A1 + addl $4 * SIZE, X + ALIGN_4 + +.L16: + testl $2, M + jle .L17 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + + movsd -16 * SIZE(A1, LDA), %xmm5 + movhpd -15 * SIZE(A1, LDA), %xmm5 + + mulpd %xmm2, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm1 + movapd %xmm3, %xmm2 + + addl $2 * SIZE, A1 + ALIGN_4 + +.L17: + testl $1, M + jle .L18 + + movsd -16 * SIZE(A1), %xmm4 + mulsd %xmm2, %xmm4 + addsd %xmm4, %xmm0 + movsd -16 * SIZE(A1, LDA), %xmm5 + mulsd %xmm2, %xmm5 + addsd %xmm5, %xmm1 + ALIGN_4 + +.L18: +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 +#else + movapd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm2 + + addpd %xmm2, %xmm0 +#endif + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm7 +#else + movsd ALPHA, %xmm7 + unpcklpd %xmm7, %xmm7 +#endif + + mulpd %xmm7, %xmm0 + + movsd (Y1), %xmm4 + movhpd (Y1, INCY), %xmm4 + + addpd %xmm4, %xmm0 + + movlpd %xmm0, (Y1) + movhpd %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + decl J + jg .L11 + ALIGN_4 + +.L20: + testl $1, N + jle .L999 + + movl BUFFER, X + addl $16 * SIZE, X + + movl A, A1 + leal (A1, LDA, 2), %eax + movl %eax, A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + movapd -16 * SIZE(X), %xmm2 + movapd -14 * SIZE(X), %xmm3 + + movl M, I + sarl $3, I + jle .L25 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + movsd -14 * SIZE(A1), %xmm6 + movhpd -13 * SIZE(A1), %xmm6 + + decl I + jle .L23 + ALIGN_4 + +.L22: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(A1) +#endif + + mulpd %xmm2, %xmm4 + movapd -12 * SIZE(X), %xmm2 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A1), %xmm4 + movhpd -11 * SIZE(A1), %xmm4 + + mulpd %xmm3, %xmm6 + movapd -10 * SIZE(X), %xmm3 + addpd %xmm6, %xmm0 + movsd -10 * SIZE(A1), %xmm6 + movhpd -9 * SIZE(A1), %xmm6 + + mulpd %xmm2, %xmm4 + movapd -8 * SIZE(X), %xmm2 + addpd %xmm4, %xmm0 + movsd -8 * SIZE(A1), %xmm4 + movhpd -7 * SIZE(A1), %xmm4 + + mulpd %xmm3, %xmm6 + movapd -6 * SIZE(X), %xmm3 + addpd %xmm6, %xmm0 + movsd -6 * SIZE(A1), %xmm6 + movhpd -5 * SIZE(A1), %xmm6 + + addl $8 * SIZE, A1 + addl $8 * SIZE, X + + decl I + jg .L22 + ALIGN_4 + +.L23: + mulpd %xmm2, %xmm4 + movapd -12 * SIZE(X), %xmm2 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A1), %xmm4 + movhpd -11 * SIZE(A1), %xmm4 + + mulpd %xmm3, %xmm6 + movapd -10 * SIZE(X), %xmm3 + addpd %xmm6, %xmm0 + movsd -10 * SIZE(A1), %xmm6 + movhpd -9 * SIZE(A1), %xmm6 + + mulpd %xmm2, %xmm4 + movapd -8 * SIZE(X), %xmm2 + addpd %xmm4, %xmm0 + + mulpd %xmm3, %xmm6 + movapd -6 * SIZE(X), %xmm3 + addpd %xmm6, %xmm0 + + addl $8 * SIZE, A1 + addl $8 * SIZE, X + ALIGN_4 + +.L25: + testl $4, M + jle .L26 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + movsd -14 * SIZE(A1), %xmm6 + movhpd -13 * SIZE(A1), %xmm6 + + mulpd %xmm2, %xmm4 + movapd -12 * SIZE(X), %xmm2 + addpd %xmm4, %xmm0 + + mulpd %xmm3, %xmm6 + movapd -10 * SIZE(X), %xmm3 + addpd %xmm6, %xmm0 + + addl $4 * SIZE, A1 + addl $4 * SIZE, X + ALIGN_4 + +.L26: + testl $2, M + jle .L27 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + + mulpd %xmm2, %xmm4 + addpd %xmm4, %xmm0 + movapd %xmm3, %xmm2 + + addl $2 * SIZE, A1 + ALIGN_4 + +.L27: + testl $1, M + jle .L28 + + movsd -16 * SIZE(A1), %xmm4 + mulsd %xmm2, %xmm4 + addsd %xmm4, %xmm0 + ALIGN_4 + +.L28: +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 +#else + movapd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm2 + + addsd %xmm2, %xmm0 +#endif + + movsd ALPHA, %xmm7 + + mulpd %xmm7, %xmm0 + + addsd (Y1), %xmm0 + + movlpd %xmm0, (Y1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/iamax.S b/kernel/x86/iamax.S new file mode 100644 index 0000000..33204c0 --- /dev/null +++ b/kernel/x86/iamax.S @@ -0,0 +1,364 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define M %ebx +#define INCX %esi +#define X %ecx +#define I %edx +#define NUM %edi +#define RET %eax + +#ifndef USE_MIN +#define FMOV fcmovbe +#define IMOV cmovnbe +#else +#define FMOV fcmovnbe +#define IMOV cmovb +#endif + +#include "l1param.h" + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_M, M + movl STACK_INCX, INCX + movl STACK_X, X + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + sall $BASE_SHIFT, INCX + + fldz + xorl RET, RET + + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + fstp %st(0) + movl $2, NUM + movl $1, RET + + FLD (X) +#ifdef USE_ABS + fabs +#endif + addl INCX, X + decl M + jle .L999 + + cmpl $SIZE, INCX + jne .L40 + + movl M, I + sarl $3, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 1 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 2 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 3 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 4 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 5 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 6 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 7 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + addl $8 * SIZE, X + + decl I + jg .L10 + ALIGN_4 + +.L20: + movl M, I + andl $7, I + jle .L999 + ALIGN_4 + + +.L21: + FLD 0 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + + addl $1 * SIZE, X + incl NUM + decl I + jg .L21 + jmp .L999 + ALIGN_4 + +.L40: + movl M, I + sarl $3, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + decl I + jg .L50 + ALIGN_4 + +.L60: + movl M, I + andl $7, I + jle .L999 + ALIGN_4 + + +.L61: + FLD 0 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + addl INCX, X + decl I + jg .L61 + ALIGN_4 + +.L999: + fstp %st(0) + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/iamax_sse.S b/kernel/x86/iamax_sse.S new file mode 100644 index 0000000..3b64ebd --- /dev/null +++ b/kernel/x86/iamax_sse.S @@ -0,0 +1,968 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define RET %eax +#define M %ebx +#define X %ecx +#define INCX %edx +#define I %esi +#define MM %ebp +#define XX %edi +#define TEMP %ebx + +#ifdef USE_MIN +#define maxps minps +#define maxss minss +#endif + +#ifndef HAVE_SSE2 +#define pxor xorps +#define movsd movlps +#endif + +#include "l1param.h" + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + + pxor %xmm0, %xmm0 /* Return Value(Float) */ +#ifdef USE_ABS + pxor %xmm7, %xmm7 /* Generate USE_ABS */ +#endif + xor RET, RET /* Return Value(Int) */ + testl M, M + jle .L999 + leal (, INCX, SIZE), INCX + testl INCX, INCX + jle .L999 + + movl M, MM + movl X, XX + +#ifdef USE_ABS +#ifndef HAVE_SSE2 + subl $8, %esp + movl $0x7fffffff, (%esp) + movss (%esp), %xmm7 + shufps $0, %xmm7, %xmm7 + addl $8, %esp +#else + cmpeqps %xmm7, %xmm7 + psrld $1, %xmm7 /* Generate USE_ABS */ +#endif +#endif + + movss (XX), %xmm0 + addl INCX, XX + decl MM + shufps $0, %xmm0, %xmm0 +#ifdef USE_ABS + andps %xmm7, %xmm0 +#endif + movaps %xmm0, %xmm1 + movaps %xmm0, %xmm2 + movaps %xmm0, %xmm3 /* Generating "seed value" */ + cmpl $SIZE, INCX + jne .L80 /* Incx != 1 goto L80 */ + +/* Analigned Check */ + testl $3, XX /* 00000011 */ + jne .L30 /* Purely Unaligned Mode */ + + cmpl $8, MM + jle .L30 /* if M <= 8 goto Unaligned mode */ + + testl $4, XX /* bit test 000100 */ + je .L05 + + movss 0 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm0 + decl MM + addl $SIZE, XX + ALIGN_3 + +.L05: + testl $8, XX + je .L06 + + movsd 0 * SIZE(XX), %xmm4 + unpcklps %xmm4, %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm1 + subl $2, MM + addl $2 * SIZE, XX + ALIGN_3 + +.L06: + movl MM, I + sarl $4, I + jle .L15 + ALIGN_4 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movaps 0 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm0 + + movaps 4 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm1 + + movaps 8 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm2 + + movaps 12 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm3 + + addl $16 * SIZE, XX + decl I + jg .L11 + ALIGN_4 + +.L15: + andl $15, MM + jle .L20 + + testl $8, MM + je .L16 + + movaps 0 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm0 + + movaps 4 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm1 + addl $8 * SIZE, XX + ALIGN_3 + +.L16: + testl $4, MM + je .L17 + + movaps 0 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm2 + addl $4 * SIZE, XX + ALIGN_3 + +.L17: + testl $2, MM + je .L18 + + movsd 0 * SIZE(XX), %xmm4 + unpcklps %xmm4, %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm3 + addl $2 * SIZE, XX + +.L18: + testl $1, MM + je .L20 + + movss 0 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm0 + ALIGN_3 + +.L20: + movl X, XX + movl M, MM + + maxps %xmm1, %xmm0 + maxps %xmm3, %xmm2 + maxps %xmm2, %xmm0 + movaps %xmm0, %xmm1 + movhlps %xmm0, %xmm0 + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + maxss %xmm1, %xmm0 + shufps $0, %xmm0, %xmm0 + + testl $4, XX + je .L21 + + movss 0 * SIZE(XX), %xmm1 + + decl MM + addl $SIZE, XX + +#ifdef USE_ABS + andps %xmm7, %xmm1 +#endif + incl RET + comiss %xmm0, %xmm1 + je .L999 + ALIGN_3 + +.L21: + testl $8, XX + je .L22 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + + subl $2, MM + addl $2 * SIZE, XX + +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 +#endif + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm2 + je .L999 + ALIGN_3 + +.L22: + movl MM, I + sarl $3, I + jle .L25 + ALIGN_4 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movaps 0 * SIZE(XX), %xmm1 +#ifdef USE_ABS + andps %xmm7, %xmm1 +#endif + cmpeqps %xmm0, %xmm1 + + movaps 4 * SIZE(XX), %xmm2 +#ifdef USE_ABS + andps %xmm7, %xmm2 +#endif + cmpeqps %xmm0, %xmm2 + + orps %xmm2, %xmm1 + movmskps %xmm1, TEMP + testl $15, TEMP + jne .L24 + + addl $8 * SIZE, XX + addl $8, RET + decl I + jg .L23 + jmp .L25 + ALIGN_3 + +.L24: + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + movss 2 * SIZE(XX), %xmm3 + movss 3 * SIZE(XX), %xmm4 + +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 +#endif + + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm2 + je .L999 + incl RET + comiss %xmm0, %xmm3 + je .L999 + incl RET + comiss %xmm0, %xmm4 + je .L999 + + movss 4 * SIZE(XX), %xmm1 + movss 5 * SIZE(XX), %xmm2 + movss 6 * SIZE(XX), %xmm3 + +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 +#endif + + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm2 + je .L999 + incl RET + comiss %xmm0, %xmm3 + je .L999 + incl RET + jmp .L999 + ALIGN_4 + +.L25: + testl $4, MM + je .L26 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + movss 2 * SIZE(XX), %xmm3 + movss 3 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 +#endif + addl $4 * SIZE, XX + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm2 + je .L999 + incl RET + comiss %xmm0, %xmm3 + je .L999 + incl RET + comiss %xmm0, %xmm4 + je .L999 + ALIGN_3 + +.L26: + testl $2, MM + je .L27 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 +#endif + addl $2 * SIZE, XX + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm2 + je .L999 + ALIGN_3 + +.L27: + incl RET + jmp .L999 + ALIGN_3 + +/* Unaligned Mode */ +.L30: + movl MM, I + sarl $4, I + jle .L35 + ALIGN_4 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm4 + movhps 2 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm0 + + movsd 4 * SIZE(XX), %xmm4 + movhps 6 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm1 + + movsd 8 * SIZE(XX), %xmm4 + movhps 10 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm2 + + movsd 12 * SIZE(XX), %xmm4 + movhps 14 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm3 + + addl $16 * SIZE, XX + decl I + jg .L31 + ALIGN_4 + +.L35: + andl $15, MM + jle .L40 + + testl $8, MM + je .L36 + + movsd 0 * SIZE(XX), %xmm4 + movhps 2 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm0 + + movsd 4 * SIZE(XX), %xmm4 + movhps 6 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm1 + + addl $8 * SIZE, XX + ALIGN_3 + +.L36: + testl $4, MM + je .L37 + + movsd 0 * SIZE(XX), %xmm4 + movhps 2 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm2 + addl $4 * SIZE, XX + ALIGN_3 + +.L37: + testl $2, MM + je .L38 + + movsd 0 * SIZE(XX), %xmm4 + unpcklps %xmm4, %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm3 + addl $2 * SIZE, XX + +.L38: + testl $1, MM + je .L40 + + movss 0 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm0 + jmp .L40 + ALIGN_4 + +.L40: + movl X, XX + movl M, MM + + maxps %xmm1, %xmm0 + maxps %xmm3, %xmm2 + maxps %xmm2, %xmm0 + movaps %xmm0, %xmm1 + movhlps %xmm0, %xmm0 + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + maxss %xmm1, %xmm0 + shufps $0, %xmm0, %xmm0 + + movl MM, I + sarl $3, I + jle .L45 + ALIGN_4 + +.L43: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm1 + movhps 2 * SIZE(XX), %xmm1 +#ifdef USE_ABS + andps %xmm7, %xmm1 +#endif + cmpeqps %xmm0, %xmm1 + + movsd 4 * SIZE(XX), %xmm2 + movhps 6 * SIZE(XX), %xmm2 +#ifdef USE_ABS + andps %xmm7, %xmm2 +#endif + cmpeqps %xmm0, %xmm2 + + orps %xmm2, %xmm1 + movmskps %xmm1, TEMP + testl $15, TEMP + jne .L44 + + addl $8 * SIZE, XX + addl $8, RET + decl I + jg .L43 + jmp .L45 + ALIGN_3 + +.L44: + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + movss 2 * SIZE(XX), %xmm3 + movss 3 * SIZE(XX), %xmm4 + +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 +#endif + + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm2 + je .L999 + incl RET + comiss %xmm0, %xmm3 + je .L999 + incl RET + comiss %xmm0, %xmm4 + je .L999 + + movss 4 * SIZE(XX), %xmm1 + movss 5 * SIZE(XX), %xmm2 + movss 6 * SIZE(XX), %xmm3 + +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 +#endif + + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm2 + je .L999 + incl RET + comiss %xmm0, %xmm3 + je .L999 + incl RET + jmp .L999 + ALIGN_4 + +.L45: + testl $4, MM + je .L46 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + movss 2 * SIZE(XX), %xmm3 + movss 3 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 +#endif + addl $4 * SIZE, XX + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm2 + je .L999 + incl RET + comiss %xmm0, %xmm3 + je .L999 + incl RET + comiss %xmm0, %xmm4 + je .L999 + ALIGN_3 + +.L46: + testl $2, MM + je .L47 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 +#endif + addl $2 * SIZE, XX + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm2 + je .L999 + ALIGN_3 + +.L47: + incl RET + jmp .L999 + ALIGN_3 + +.L80: + movl MM, I + sarl $3, I + jle .L85 + ALIGN_4 + +.L81: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm1 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm2 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm3 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm1 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm2 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm3 + + decl I + jg .L81 + ALIGN_4 + +.L85: + andl $7, MM + jle .L90 + + testl $4, MM + je .L86 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm1 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm2 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm3 + ALIGN_3 + +.L86: + testl $2, MM + je .L87 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm1 + ALIGN_3 + +.L87: + testl $1, MM + je .L90 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm2 + ALIGN_4 + +.L90: + movl X, XX + movl M, MM + + maxss %xmm1, %xmm0 + maxss %xmm3, %xmm2 + maxss %xmm2, %xmm0 + shufps $0, %xmm0, %xmm0 + + movl MM, I + sarl $2, I + jle .L96 + ALIGN_4 + +.L92: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movss 0 * SIZE(XX), %xmm1 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm1 +#endif + cmpeqss %xmm0, %xmm1 + + movss 0 * SIZE(XX), %xmm2 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm2 +#endif + cmpeqss %xmm0, %xmm2 + + movss 0 * SIZE(XX), %xmm3 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm3 +#endif + cmpeqss %xmm0, %xmm3 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + cmpeqss %xmm0, %xmm4 + + orps %xmm2, %xmm1 + orps %xmm4, %xmm3 + orps %xmm3, %xmm1 + movmskps %xmm1, TEMP + testl $15, TEMP + jne .L93 + + addl $4, RET + decl I + jg .L92 + jmp .L96 + ALIGN_3 + +.L93: + leal (, INCX, 4), TEMP + subl TEMP, XX + + movss 0 * SIZE(XX), %xmm1 + addl INCX, XX + movss 0 * SIZE(XX), %xmm2 + addl INCX, XX + movss 0 * SIZE(XX), %xmm3 + addl INCX, XX + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX + +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 +#endif + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm2 + je .L999 + incl RET + comiss %xmm0, %xmm3 + je .L999 + incl RET + comiss %xmm0, %xmm4 + je .L999 + ALIGN_3 + +.L96: + testl $2, MM + je .L97 + + movss 0 * SIZE(XX), %xmm1 + addl INCX, XX + movss 0 * SIZE(XX), %xmm2 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 +#endif + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm2 + je .L999 + ALIGN_3 + +.L97: + incl RET + ALIGN_3 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/iamax_sse2.S b/kernel/x86/iamax_sse2.S new file mode 100644 index 0000000..a0ddb26 --- /dev/null +++ b/kernel/x86/iamax_sse2.S @@ -0,0 +1,1152 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define RET %eax +#define M %ebx +#define X %ecx +#define INCX %edx +#define I %esi +#define MM %ebp +#define XX %edi +#define TEMP %ebx + +#ifdef USE_MIN +#define maxpd minpd +#define maxsd minsd +#endif + +#include "l1param.h" + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + + pxor %xmm0, %xmm0 +#ifdef USE_ABS + pxor %xmm7, %xmm7 +#endif + xor RET, RET + testl M, M + jle .L999 + leal (, INCX, SIZE), INCX + testl INCX, INCX + jle .L999 + + movl M, MM + movl X, XX + +#ifdef USE_ABS + cmpeqpd %xmm7, %xmm7 + psrlq $1, %xmm7 +#endif + + movsd (XX), %xmm0 + addl INCX, XX + decl MM +#ifdef USE_ABS + andpd %xmm7, %xmm0 +#endif + unpcklpd %xmm0, %xmm0 + movapd %xmm0, %xmm1 + movapd %xmm0, %xmm2 + movapd %xmm0, %xmm3 + cmpl $SIZE, INCX + jne .L80 + +/* Analigned Check */ + cmpl $7, MM + jle .L50 + + testl $7, XX + jne .L50 # Purely Unaligned Mode + + testl $15, XX # Checking for 128bit align + je .L05 + + movsd 0 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + unpcklpd %xmm4, %xmm4 + maxpd %xmm4, %xmm3 + decl MM + addl $SIZE, XX + ALIGN_3 + +.L05: + movl MM, I + sarl $4, I + jle .L15 + ALIGN_4 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movapd 0 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movapd 2 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm1 + + movapd 4 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm2 + + movapd 6 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(XX) +#endif + + movapd 8 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movapd 10 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm1 + + movapd 12 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm2 + + movapd 14 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm3 + + addl $16 * SIZE, XX + decl I + jg .L11 + ALIGN_4 + +.L15: + andl $15, MM + jle .L20 + + testl $8, MM + je .L16 + + movapd 0 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movapd 2 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm1 + + movapd 4 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm2 + + movapd 6 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm3 + addl $8 * SIZE, XX + ALIGN_3 + +.L16: + testl $4, MM + je .L17 + + movapd 0 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movapd 2 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm1 + addl $4 * SIZE, XX + ALIGN_3 + +.L17: + testl $2, MM + je .L18 + + movapd 0 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm2 + addl $2 * SIZE, XX + +.L18: + testl $1, MM + je .L20 + + movsd 0 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + unpcklpd %xmm4, %xmm4 + maxpd %xmm4, %xmm3 + ALIGN_3 + +/* Finding Index */ +.L20: + movl X, XX + movl M, MM + + maxpd %xmm1, %xmm0 + maxpd %xmm3, %xmm2 + maxpd %xmm2, %xmm0 + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + unpcklpd %xmm0, %xmm0 + + testl $15, XX # Checking for 128bit align + je .L21 + + movsd 0 * SIZE(XX), %xmm1 +#ifdef USE_ABS + andpd %xmm7, %xmm1 +#endif + incl RET + comisd %xmm0, %xmm1 + je .L999 + addl $SIZE, XX + decl MM + ALIGN_3 + +.L21: + movl MM, I + sarl $3, I + jle .L25 + ALIGN_4 + +.L22: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movapd 0 * SIZE(XX), %xmm1 +#ifdef USE_ABS + andpd %xmm7, %xmm1 +#endif + cmpeqpd %xmm0, %xmm1 + + movapd 2 * SIZE(XX), %xmm2 +#ifdef USE_ABS + andpd %xmm7, %xmm2 +#endif + cmpeqpd %xmm0, %xmm2 + + movapd 4 * SIZE(XX), %xmm3 +#ifdef USE_ABS + andpd %xmm7, %xmm3 +#endif + cmpeqpd %xmm0, %xmm3 + + movapd 6 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + cmpeqpd %xmm0, %xmm4 + + orpd %xmm2, %xmm1 + orpd %xmm4, %xmm3 + orpd %xmm3, %xmm1 + movmskpd %xmm1, TEMP + testl $3, TEMP + jne .L23 + + addl $8 * SIZE, XX + addl $8, RET + decl I + jg .L22 + jmp .L25 + ALIGN_4 + +.L23: + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + movsd 2 * SIZE(XX), %xmm3 + movsd 3 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 +#endif + + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm2 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + incl RET + comisd %xmm0, %xmm4 + je .L999 + + movsd 4 * SIZE(XX), %xmm1 + movsd 5 * SIZE(XX), %xmm2 + movsd 6 * SIZE(XX), %xmm3 + +#ifdef USE_ABS + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 +#endif + + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm2 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + incl RET + jmp .L999 + ALIGN_3 + +.L25: + testl $4, MM + je .L27 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + movsd 2 * SIZE(XX), %xmm3 + movsd 3 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 +#endif + addl $4 * SIZE, XX + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm2 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + incl RET + comisd %xmm0, %xmm4 + je .L999 + ALIGN_3 + +.L27: + testl $2, MM + je .L28 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 +#ifdef USE_ABS + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 +#endif + addl $2 * SIZE, XX + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm2 + je .L999 + ALIGN_3 + +.L28: + incl RET + jmp .L999 + ALIGN_3 + +.L50: +/* Unaligned Mode */ + movl MM, I + sarl $4, I + jle .L55 + ALIGN_4 + +.L51: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm4 + movhpd 1 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 2 * SIZE(XX), %xmm4 + movhpd 3 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm1 + + movsd 4 * SIZE(XX), %xmm4 + movhpd 5 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm2 + + movsd 6 * SIZE(XX), %xmm4 + movhpd 7 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(XX) +#endif + + movsd 8 * SIZE(XX), %xmm4 + movhpd 9 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 10 * SIZE(XX), %xmm4 + movhpd 11 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm1 + + movsd 12 * SIZE(XX), %xmm4 + movhpd 13 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm2 + + movsd 14 * SIZE(XX), %xmm4 + movhpd 15 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm3 + + addl $16 * SIZE, XX + decl I + jg .L51 + ALIGN_4 + +.L55: + andl $15, MM + jle .L60 + + testl $8, MM + je .L56 + + movsd 0 * SIZE(XX), %xmm4 + movhpd 1 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 2 * SIZE(XX), %xmm4 + movhpd 3 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm1 + + movsd 4 * SIZE(XX), %xmm4 + movhpd 5 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm2 + + movsd 6 * SIZE(XX), %xmm4 + movhpd 7 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm3 + + addl $8 * SIZE, XX + ALIGN_3 + +.L56: + testl $4, MM + je .L57 + + movsd 0 * SIZE(XX), %xmm4 + movhpd 1 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 2 * SIZE(XX), %xmm4 + movhpd 3 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm1 + addl $4 * SIZE, XX + ALIGN_3 + +.L57: + testl $2, MM + je .L58 + + movsd 0 * SIZE(XX), %xmm4 + movhpd 1 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm2 + addl $2 * SIZE, XX + +.L58: + testl $1, MM + je .L60 + + movsd 0 * SIZE(XX), %xmm4 + unpcklpd %xmm4, %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm3 + ALIGN_3 + +.L60: + movl X, XX + movl M, MM + + maxpd %xmm1, %xmm0 + maxpd %xmm3, %xmm2 + maxpd %xmm2, %xmm0 + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + unpcklpd %xmm0, %xmm0 + + movl MM, I + sarl $3, I + jle .L65 + ALIGN_4 + +.L62: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm1 + movhpd 1 * SIZE(XX), %xmm1 +#ifdef USE_ABS + andpd %xmm7, %xmm1 +#endif + cmpeqpd %xmm0, %xmm1 + + movsd 2 * SIZE(XX), %xmm2 + movhpd 3 * SIZE(XX), %xmm2 +#ifdef USE_ABS + andpd %xmm7, %xmm2 +#endif + cmpeqpd %xmm0, %xmm2 + + movsd 4 * SIZE(XX), %xmm3 + movhpd 5 * SIZE(XX), %xmm3 +#ifdef USE_ABS + andpd %xmm7, %xmm3 +#endif + cmpeqpd %xmm0, %xmm3 + + movsd 6 * SIZE(XX), %xmm4 + movhpd 7 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + cmpeqpd %xmm0, %xmm4 + + orpd %xmm2, %xmm1 + orpd %xmm4, %xmm3 + orpd %xmm3, %xmm1 + movmskpd %xmm1, TEMP + testl $3, TEMP + jne .L63 + + addl $8 * SIZE, XX + addl $8, RET + decl I + jg .L62 + jmp .L65 + ALIGN_4 + +.L63: + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + movsd 2 * SIZE(XX), %xmm3 + movsd 3 * SIZE(XX), %xmm4 + +#ifdef USE_ABS + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 +#endif + + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm2 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + incl RET + comisd %xmm0, %xmm4 + je .L999 + incl RET + + movsd 4 * SIZE(XX), %xmm1 + movsd 5 * SIZE(XX), %xmm2 + movsd 6 * SIZE(XX), %xmm3 + +#ifdef USE_ABS + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 +#endif + + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm2 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + incl RET + jmp .L999 + ALIGN_3 + +.L65: + testl $4, MM + je .L67 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + movsd 2 * SIZE(XX), %xmm3 + movsd 3 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 +#endif + addl $4 * SIZE, XX + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm2 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + incl RET + comisd %xmm0, %xmm4 + je .L999 + ALIGN_3 + +.L67: + testl $2, MM + je .L68 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 +#ifdef USE_ABS + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 +#endif + addl $2 * SIZE, XX + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm2 + je .L999 + ALIGN_3 + +.L68: + incl RET + jmp .L999 + ALIGN_4 + +.L80: + movl MM, I + sarl $4, I + jle .L85 + ALIGN_4 + +.L81: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm1 + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm2 + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm1 + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm2 + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm3 + + decl I + jg .L81 + ALIGN_4 + +.L85: + andl $15, MM + jle .L90 + + testl $8, MM + je .L86 + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm1 + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm2 + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm3 + ALIGN_3 + +.L86: + testl $4, MM + je .L87 + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm1 + ALIGN_3 + +.L87: + testl $2, MM + je .L88 + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm2 + ALIGN_3 + +.L88: + testl $1, MM + je .L90 + + movsd 0 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + unpcklpd %xmm4, %xmm4 + maxpd %xmm4, %xmm3 + ALIGN_4 + +.L90: + movl X, XX + movl M, MM + + maxpd %xmm1, %xmm0 + maxpd %xmm3, %xmm2 + maxpd %xmm2, %xmm0 + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + unpcklpd %xmm0, %xmm0 + + movl MM, I + sarl $3, I + jle .L95 + ALIGN_4 + +.L92: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm1 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm1 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm1 +#endif + cmpeqpd %xmm0, %xmm1 + + movsd 0 * SIZE(XX), %xmm2 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm2 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm2 +#endif + cmpeqpd %xmm0, %xmm2 + + movsd 0 * SIZE(XX), %xmm3 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm3 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm3 +#endif + cmpeqpd %xmm0, %xmm3 + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + cmpeqpd %xmm0, %xmm4 + + orpd %xmm2, %xmm1 + orpd %xmm4, %xmm3 + orpd %xmm3, %xmm1 + movmskpd %xmm1, TEMP + testl $3, TEMP + jne .L93 + + addl $8, RET + decl I + jg .L92 + jmp .L95 + ALIGN_4 + +.L93: + leal (, INCX, 8), TEMP + subl TEMP, XX + + movsd 0 * SIZE(XX), %xmm1 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm2 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm3 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 +#endif + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm2 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + incl RET + comisd %xmm0, %xmm4 + je .L999 + + movsd 0 * SIZE(XX), %xmm1 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm2 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm3 +#ifdef USE_ABS + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 +#endif + + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm2 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + incl RET + jmp .L999 + ALIGN_3 + +.L95: + testl $4, MM + je .L97 + + movsd 0 * SIZE(XX), %xmm1 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm2 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm3 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 +#endif + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm2 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + incl RET + comisd %xmm0, %xmm4 + je .L999 + ALIGN_3 + +.L97: + testl $2, MM + je .L98 + + movsd 0 * SIZE(XX), %xmm1 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm2 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 +#endif + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm2 + je .L999 + ALIGN_3 + +.L98: + incl RET + ALIGN_3 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/izamax.S b/kernel/x86/izamax.S new file mode 100644 index 0000000..63bcaef --- /dev/null +++ b/kernel/x86/izamax.S @@ -0,0 +1,289 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + + PROLOGUE + +#define M %ebx +#define INCX %esi +#define X %ecx +#define I %edx +#define NUM %edi +#define RET %eax + +#ifndef USE_MIN +#define FMOV fcmovbe +#define IMOV cmovnbe +#else +#define FMOV fcmovnb +#define IMOV cmovb +#endif + +#include "l1param.h" + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_M, M + movl STACK_INCX, INCX + movl STACK_X, X + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + sall $ZBASE_SHIFT, INCX + + fldz + xorl RET, RET + + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + fstp %st(0) + movl $2, NUM + movl $1, RET + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + addl INCX, X + decl M + jle .L999 + + cmpl $2 * SIZE, INCX + jne .L40 + + movl M, I + sarl $2, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 2 * SIZE(X) + fabs + FLD 3 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 4 * SIZE(X) + fabs + FLD 5 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 6 * SIZE(X) + fabs + FLD 7 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + addl $8 * SIZE, X + + decl I + jg .L10 + ALIGN_4 + +.L20: + movl M, I + andl $3, I + jle .L999 + ALIGN_4 + +.L21: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + addl $2 * SIZE, X + decl I + jg .L21 + jmp .L999 + ALIGN_4 + +.L40: + movl M, I + sarl $2, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addl INCX, X + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addl INCX, X + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addl INCX, X + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addl INCX, X + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + decl I + jg .L50 + ALIGN_4 + +.L60: + movl M, I + andl $3, I + jle .L999 + ALIGN_4 + + +.L61: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + addl INCX, X + decl I + jg .L61 + ALIGN_4 + +.L999: + fstp %st(0) + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/izamax_sse.S b/kernel/x86/izamax_sse.S new file mode 100644 index 0000000..95223fe --- /dev/null +++ b/kernel/x86/izamax_sse.S @@ -0,0 +1,596 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define RET %eax +#define M %ebx +#define X %ecx +#define INCX %edx +#define I %esi +#define MM %ebp +#define XX %edi +#define TEMP %ebx + +#ifdef USE_MIN +#define maxps minps +#define maxss minss +#endif + +#ifndef HAVE_SSE2 +#define pxor xorps +#define movsd movlps +#endif + +#include "l1param.h" + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + + pxor %xmm0, %xmm0 + pxor %xmm7, %xmm7 + xor RET, RET + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + sall $ZBASE_SHIFT, INCX + movl M, MM + movl X, XX + +#ifdef USE_ABS +#ifndef HAVE_SSE2 + subl $8, %esp + movl $0x7fffffff, (%esp) + movss (%esp), %xmm7 + shufps $0, %xmm7, %xmm7 + addl $8, %esp +#else + cmpeqps %xmm7, %xmm7 + psrld $1, %xmm7 +#endif +#endif + + movss 0 * SIZE(XX), %xmm0 + movss 1 * SIZE(XX), %xmm1 + addl INCX, XX + decl MM + andps %xmm7, %xmm0 + andps %xmm7, %xmm1 + addps %xmm1, %xmm0 + shufps $0, %xmm0, %xmm0 + cmpl $2 * SIZE, INCX + jne .L70 + +.L30: + movl MM, I + sarl $3, I + jle .L35 + ALIGN_4 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm1 + movhps 2 * SIZE(XX), %xmm1 + movsd 4 * SIZE(XX), %xmm2 + movhps 6 * SIZE(XX), %xmm2 + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + + andps %xmm7, %xmm1 + andps %xmm7, %xmm3 + addps %xmm3, %xmm1 + maxps %xmm1, %xmm0 + + movsd 8 * SIZE(XX), %xmm1 + movhps 10 * SIZE(XX), %xmm1 + movsd 12 * SIZE(XX), %xmm2 + movhps 14 * SIZE(XX), %xmm2 + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + + andps %xmm7, %xmm1 + andps %xmm7, %xmm3 + addps %xmm3, %xmm1 + maxps %xmm1, %xmm0 + + addl $16 * SIZE, XX + decl I + jg .L31 + ALIGN_4 + +.L35: + andl $7, MM + jle .L40 + + testl $4, MM + je .L36 + + movsd 0 * SIZE(XX), %xmm1 + movhps 2 * SIZE(XX), %xmm1 + movsd 4 * SIZE(XX), %xmm2 + movhps 6 * SIZE(XX), %xmm2 + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + + andps %xmm7, %xmm1 + andps %xmm7, %xmm3 + addps %xmm3, %xmm1 + maxps %xmm1, %xmm0 + + addl $8 * SIZE, XX + ALIGN_3 + +.L36: + testl $2, MM + je .L37 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + movss 2 * SIZE(XX), %xmm3 + movss 3 * SIZE(XX), %xmm4 + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 + addps %xmm2, %xmm1 + addps %xmm4, %xmm3 + maxss %xmm1, %xmm0 + maxss %xmm3, %xmm0 + addl $4 * SIZE, XX + ALIGN_3 + +.L37: + testl $1, MM + je .L40 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + addps %xmm2, %xmm1 + maxss %xmm1, %xmm0 + ALIGN_4 + +.L40: + movl X, XX + movl M, MM + + movaps %xmm0, %xmm1 + movhlps %xmm0, %xmm0 + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + maxss %xmm1, %xmm0 + shufps $0, %xmm0, %xmm0 + + movl MM, I + sarl $2, I + jle .L45 + ALIGN_4 + +.L41: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm1 + movhps 2 * SIZE(XX), %xmm1 + movsd 4 * SIZE(XX), %xmm2 + movhps 6 * SIZE(XX), %xmm2 + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + + andps %xmm7, %xmm1 + andps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + cmpeqps %xmm0, %xmm1 + movmskps %xmm1, TEMP + testl $15, TEMP + jne .L43 + + addl $8 * SIZE, XX + addl $4, RET + decl I + jg .L41 + jmp .L45 + ALIGN_4 + +.L43: + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + movss 2 * SIZE(XX), %xmm3 + movss 3 * SIZE(XX), %xmm4 + + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 + + addps %xmm2, %xmm1 + addps %xmm4, %xmm3 + + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm3 + je .L999 + + movss 4 * SIZE(XX), %xmm1 + movss 5 * SIZE(XX), %xmm2 + movss 6 * SIZE(XX), %xmm3 + movss 7 * SIZE(XX), %xmm4 + + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 + + addps %xmm2, %xmm1 + addps %xmm4, %xmm3 + + addl $8 * SIZE, XX + + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm3 + je .L999 + ALIGN_3 + +.L45: + testl $2, MM + je .L47 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + movss 2 * SIZE(XX), %xmm3 + movss 3 * SIZE(XX), %xmm4 + addl $4 * SIZE, XX + + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 + addps %xmm2, %xmm1 + addps %xmm4, %xmm3 + + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm3 + je .L999 + ALIGN_3 + +.L47: + incl RET + jmp .L999 + ALIGN_3 + +.L70: + movl MM, I + sarl $3, I + jle .L75 + ALIGN_4 + +.L71: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm1 + addl INCX, XX + movhps 0 * SIZE(XX), %xmm1 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm2 + addl INCX, XX + movhps 0 * SIZE(XX), %xmm2 + addl INCX, XX + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + + andps %xmm7, %xmm1 + andps %xmm7, %xmm3 + addps %xmm3, %xmm1 + maxps %xmm1, %xmm0 + + movsd 0 * SIZE(XX), %xmm1 + addl INCX, XX + movhps 0 * SIZE(XX), %xmm1 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm2 + addl INCX, XX + movhps 0 * SIZE(XX), %xmm2 + addl INCX, XX + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + + andps %xmm7, %xmm1 + andps %xmm7, %xmm3 + addps %xmm3, %xmm1 + maxps %xmm1, %xmm0 + decl I + jg .L71 + ALIGN_4 + +.L75: + andl $7, MM + jle .L80 + + testl $4, MM + je .L76 + + movsd 0 * SIZE(XX), %xmm1 + addl INCX, XX + movhps 0 * SIZE(XX), %xmm1 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm2 + addl INCX, XX + movhps 0 * SIZE(XX), %xmm2 + addl INCX, XX + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + + andps %xmm7, %xmm1 + andps %xmm7, %xmm3 + addps %xmm3, %xmm1 + maxps %xmm1, %xmm0 + ALIGN_3 + +.L76: + testl $2, MM + je .L77 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + addl INCX, XX + movss 0 * SIZE(XX), %xmm3 + movss 1 * SIZE(XX), %xmm4 + addl INCX, XX + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 + addps %xmm2, %xmm1 + addps %xmm4, %xmm3 + maxss %xmm1, %xmm0 + maxss %xmm3, %xmm0 + ALIGN_3 + +.L77: + testl $1, MM + je .L80 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + addps %xmm2, %xmm1 + maxss %xmm1, %xmm0 + ALIGN_4 + +.L80: + movl X, XX + movl M, MM + + movaps %xmm0, %xmm1 + movhlps %xmm0, %xmm0 + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + maxss %xmm1, %xmm0 + shufps $0, %xmm0, %xmm0 + + movl MM, I + sarl $2, I + jle .L85 + ALIGN_4 + +.L81: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm1 + addl INCX, XX + movhps 0 * SIZE(XX), %xmm1 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm2 + addl INCX, XX + movhps 0 * SIZE(XX), %xmm2 + addl INCX, XX + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + + andps %xmm7, %xmm1 + andps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + cmpeqps %xmm0, %xmm1 + movmskps %xmm1, TEMP + testl $15, TEMP + jne .L83 + + addl $4, RET + decl I + jg .L81 + jmp .L85 + ALIGN_4 + +.L83: + leal (, INCX, 4), TEMP + subl TEMP, XX + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + addl INCX, XX + movss 0 * SIZE(XX), %xmm3 + movss 1 * SIZE(XX), %xmm4 + addl INCX, XX + + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 + + addps %xmm2, %xmm1 + addps %xmm4, %xmm3 + + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm3 + je .L999 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + addl INCX, XX + movss 0 * SIZE(XX), %xmm3 + movss 1 * SIZE(XX), %xmm4 + addl INCX, XX + + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 + + addps %xmm2, %xmm1 + addps %xmm4, %xmm3 + + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm3 + je .L999 + ALIGN_3 + +.L85: + testl $2, MM + je .L87 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + addl INCX, XX + movss 0 * SIZE(XX), %xmm3 + movss 1 * SIZE(XX), %xmm4 + addl INCX, XX + + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 + addps %xmm2, %xmm1 + addps %xmm4, %xmm3 + + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm3 + je .L999 + ALIGN_3 + +.L87: + incl RET + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/izamax_sse2.S b/kernel/x86/izamax_sse2.S new file mode 100644 index 0000000..0392e1d --- /dev/null +++ b/kernel/x86/izamax_sse2.S @@ -0,0 +1,619 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define RET %eax +#define M %ebx +#define X %ecx +#define INCX %edx +#define I %esi +#define MM %ebp +#define XX %edi +#define TEMP %ebx + +#ifdef USE_MIN +#define maxpd minpd +#define maxsd minsd +#endif + +#include "l1param.h" + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + + pxor %xmm0, %xmm0 + pxor %xmm7, %xmm7 + xor RET, RET + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + sall $ZBASE_SHIFT, INCX + movl M, MM + movl X, XX + + cmpeqpd %xmm7, %xmm7 + psrlq $1, %xmm7 + + movsd 0 * SIZE(XX), %xmm0 + movsd 1 * SIZE(XX), %xmm1 + addl INCX, XX + decl MM + andpd %xmm7, %xmm0 + andpd %xmm7, %xmm1 + addpd %xmm1, %xmm0 + unpcklpd %xmm0, %xmm0 + cmpl $2 * SIZE, INCX + jne .L60 + + movl MM, I + sarl $3, I + jle .L25 + ALIGN_4 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + movhpd 2 * SIZE(XX), %xmm1 + movhpd 3 * SIZE(XX), %xmm2 + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + + movsd 4 * SIZE(XX), %xmm3 + movsd 5 * SIZE(XX), %xmm4 + movhpd 6 * SIZE(XX), %xmm3 + movhpd 7 * SIZE(XX), %xmm4 + + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + addpd %xmm4, %xmm3 + maxpd %xmm3, %xmm0 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(XX) +#endif + + movsd 8 * SIZE(XX), %xmm1 + movsd 9 * SIZE(XX), %xmm2 + movhpd 10 * SIZE(XX), %xmm1 + movhpd 11 * SIZE(XX), %xmm2 + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + + movsd 12 * SIZE(XX), %xmm3 + movsd 13 * SIZE(XX), %xmm4 + movhpd 14 * SIZE(XX), %xmm3 + movhpd 15 * SIZE(XX), %xmm4 + + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + addpd %xmm4, %xmm3 + maxpd %xmm3, %xmm0 + + addl $16 * SIZE, XX + decl I + jg .L21 + ALIGN_4 + +.L25: + andl $7, MM + jle .L30 + + testl $4, MM + je .L26 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + movhpd 2 * SIZE(XX), %xmm1 + movhpd 3 * SIZE(XX), %xmm2 + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + + movsd 4 * SIZE(XX), %xmm3 + movsd 5 * SIZE(XX), %xmm4 + movhpd 6 * SIZE(XX), %xmm3 + movhpd 7 * SIZE(XX), %xmm4 + + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + addpd %xmm4, %xmm3 + maxpd %xmm3, %xmm0 + addl $8 * SIZE, XX + ALIGN_3 + +.L26: + testl $2, MM + je .L27 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + movhpd 2 * SIZE(XX), %xmm1 + movhpd 3 * SIZE(XX), %xmm2 + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + + addl $4 * SIZE, XX + ALIGN_3 + +.L27: + testl $1, MM + je .L30 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxsd %xmm1, %xmm0 + ALIGN_4 + +.L30: + movl X, XX + movl M, MM + + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + unpcklpd %xmm0, %xmm0 + + movl MM, I + sarl $2, I + jle .L35 + ALIGN_4 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + movhpd 2 * SIZE(XX), %xmm1 + movhpd 3 * SIZE(XX), %xmm2 + movsd 4 * SIZE(XX), %xmm3 + movsd 5 * SIZE(XX), %xmm4 + movhpd 6 * SIZE(XX), %xmm3 + movhpd 7 * SIZE(XX), %xmm4 + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + + addpd %xmm2, %xmm1 + addpd %xmm4, %xmm3 + + cmpeqpd %xmm0, %xmm1 + cmpeqpd %xmm0, %xmm3 + + orpd %xmm3, %xmm1 + movmskpd %xmm1, TEMP + testl $3, TEMP + jne .L33 + + addl $8 * SIZE, XX + addl $4, RET + decl I + jg .L31 + jmp .L35 + ALIGN_4 + +.L33: + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + movsd 2 * SIZE(XX), %xmm3 + movsd 3 * SIZE(XX), %xmm4 + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + + addpd %xmm2, %xmm1 + addpd %xmm4, %xmm3 + + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + + movsd 4 * SIZE(XX), %xmm1 + movsd 5 * SIZE(XX), %xmm2 + movsd 6 * SIZE(XX), %xmm3 + movsd 7 * SIZE(XX), %xmm4 + addl $8 * SIZE, XX + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + + addpd %xmm2, %xmm1 + addpd %xmm4, %xmm3 + + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + ALIGN_3 + +.L35: + testl $2, MM + je .L36 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + movsd 2 * SIZE(XX), %xmm3 + movsd 3 * SIZE(XX), %xmm4 + addl $4 * SIZE, XX + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + + addpd %xmm2, %xmm1 + addpd %xmm4, %xmm3 + + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + ALIGN_3 + +.L36: + incl RET + jmp .L999 + ALIGN_3 + +.L60: + movl MM, I + sarl $3, I + jle .L65 + ALIGN_4 + +.L61: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm1 + movhpd 1 * SIZE(XX), %xmm2 + addl INCX, XX + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + + movsd 0 * SIZE(XX), %xmm3 + movsd 1 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm3 + movhpd 1 * SIZE(XX), %xmm4 + addl INCX, XX + + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + addpd %xmm4, %xmm3 + maxpd %xmm3, %xmm0 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm1 + movhpd 1 * SIZE(XX), %xmm2 + addl INCX, XX + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + + movsd 0 * SIZE(XX), %xmm3 + movsd 1 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm3 + movhpd 1 * SIZE(XX), %xmm4 + addl INCX, XX + + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + addpd %xmm4, %xmm3 + maxpd %xmm3, %xmm0 + + decl I + jg .L61 + ALIGN_4 + +.L65: + andl $7, MM + jle .L70 + + testl $4, MM + je .L66 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm1 + movhpd 1 * SIZE(XX), %xmm2 + addl INCX, XX + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + + movsd 0 * SIZE(XX), %xmm3 + movsd 1 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm3 + movhpd 1 * SIZE(XX), %xmm4 + addl INCX, XX + + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + addpd %xmm4, %xmm3 + maxpd %xmm3, %xmm0 + ALIGN_3 + +.L66: + testl $2, MM + je .L67 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm1 + movhpd 1 * SIZE(XX), %xmm2 + addl INCX, XX + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + ALIGN_3 + +.L67: + testl $1, MM + je .L70 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxsd %xmm1, %xmm0 + ALIGN_3 + +.L70: + movl X, XX + movl M, MM + + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + unpcklpd %xmm0, %xmm0 + + movl MM, I + sarl $2, I + jle .L75 + ALIGN_4 + +.L71: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm1 + movhpd 1 * SIZE(XX), %xmm2 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm3 + movsd 1 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm3 + movhpd 1 * SIZE(XX), %xmm4 + addl INCX, XX + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + + addpd %xmm2, %xmm1 + addpd %xmm4, %xmm3 + + cmpeqpd %xmm0, %xmm1 + cmpeqpd %xmm0, %xmm3 + + orpd %xmm3, %xmm1 + movmskpd %xmm1, TEMP + testl $3, TEMP + jne .L73 + + addl $4, RET + decl I + jg .L71 + jmp .L75 + ALIGN_4 + +.L73: + leal (, INCX, 4), TEMP + subl TEMP, XX + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm3 + movsd 1 * SIZE(XX), %xmm4 + addl INCX, XX + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + + addpd %xmm2, %xmm1 + addpd %xmm4, %xmm3 + + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm3 + movsd 1 * SIZE(XX), %xmm4 + addl INCX, XX + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + + addpd %xmm2, %xmm1 + addpd %xmm4, %xmm3 + + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + ALIGN_3 + +.L75: + testl $2, MM + je .L76 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm3 + movsd 1 * SIZE(XX), %xmm4 + addl INCX, XX + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + + addpd %xmm2, %xmm1 + addpd %xmm4, %xmm3 + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + ALIGN_3 + +.L76: + incl RET + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/lsame.S b/kernel/x86/lsame.S new file mode 100644 index 0000000..d7e48ad --- /dev/null +++ b/kernel/x86/lsame.S @@ -0,0 +1,90 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + PROFCODE + + movl 4(%esp), %eax + movl 8(%esp), %edx + movb (%eax), %al # a = *A + movb (%edx), %dl # b = *B + + andl $255, %eax + andl $255, %edx + + subl $65, %eax + subl $65, %edx + +#ifndef HAVE_CMOV + movl %eax, %ecx + subl $32, %ecx + jle .L1 + movl %ecx, %eax +.L1: + + movl %edx, %ecx + subl $32, %ecx + jle .L2 + movl %ecx, %edx +.L2: + subl %eax, %edx + movl $0, %eax + movl $1, %edx + jne .L3 + movl %edx, %eax +.L3: +#else + movl %eax, %ecx + subl $32, %ecx + cmovg %ecx, %eax + + movl %edx, %ecx + subl $32, %ecx + cmovg %ecx, %edx + + subl %eax, %edx + movl $0, %eax + movl $1, %edx + cmove %edx, %eax +#endif + ret + + EPILOGUE diff --git a/kernel/x86/nrm2.S b/kernel/x86/nrm2.S new file mode 100644 index 0000000..c098249 --- /dev/null +++ b/kernel/x86/nrm2.S @@ -0,0 +1,226 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define M %edx +#define X %ecx +#define INCX %esi + +#define I %eax + +#include "l1param.h" + + PROLOGUE + + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + + fldz + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + sall $BASE_SHIFT, INCX + fldz + fldz + fldz + cmpl $SIZE, INCX + jne .L40 + + movl M, I + sarl $3, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + fmul %st(0), %st + FLD 2 * SIZE(X) + fmul %st(0), %st + FLD 3 * SIZE(X) + fmul %st(0), %st + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 4 * SIZE(X) + fmul %st(0), %st + FLD 5 * SIZE(X) + fmul %st(0), %st + FLD 6 * SIZE(X) + fmul %st(0), %st + FLD 7 * SIZE(X) + fmul %st(0), %st + + addl $8 * SIZE, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decl I + jg .L10 + ALIGN_4 + +.L20: + movl M, I + andl $7, I + jle .L998 + ALIGN_4 + + +.L21: + FLD (X) + fmul %st(0), %st + faddp %st,%st(1) + addl $1 * SIZE, X + decl I + jg .L21 + jmp .L998 + ALIGN_4 + +.L40: + movl M, I + sarl $3, I + jle .L60 + ALIGN_4 + +.L50: + FLD (X) + addl INCX, X + fmul %st(0), %st + FLD (X) + addl INCX, X + fmul %st(0), %st + FLD (X) + addl INCX, X + fmul %st(0), %st + FLD (X) + addl INCX, X + fmul %st(0), %st + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD (X) + addl INCX, X + fmul %st(0), %st + FLD (X) + addl INCX, X + fmul %st(0), %st + FLD (X) + addl INCX, X + fmul %st(0), %st + FLD (X) + addl INCX, X + fmul %st(0), %st + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decl I + jg .L50 + ALIGN_4 + +.L60: + movl M, I + andl $7, I + jle .L998 + ALIGN_4 + + +.L61: + FLD (X) + addl INCX, X + fmul %st(0), %st + faddp %st,%st(1) + decl I + jg .L61 + ALIGN_4 + +.L998: + faddp %st,%st(2) + faddp %st,%st(1) + faddp %st,%st(1) + ALIGN_4 + +.L999: + fsqrt + popl %ebx + popl %esi + ret + + EPILOGUE diff --git a/kernel/x86/nrm2_sse.S b/kernel/x86/nrm2_sse.S new file mode 100644 index 0000000..e704609 --- /dev/null +++ b/kernel/x86/nrm2_sse.S @@ -0,0 +1,418 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define M %edx +#define X %ecx +#define INCX %esi + +#define I %eax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + + pxor %xmm0, %xmm0 + testl M, M + jle .L999 + pxor %xmm1, %xmm1 + testl INCX, INCX + jle .L999 + + leal (, INCX, SIZE), INCX + cmpl $SIZE, INCX + jne .L40 + + subl $-32 * SIZE, X + + testl $SIZE, X + je .L05 + + movss -32 * SIZE(X), %xmm0 + cvtss2sd %xmm0, %xmm0 + mulsd %xmm0, %xmm0 + + addl INCX, X + decl M + jle .L998 + ALIGN_3 + +.L05: + movl M, I + sarl $4, I + jle .L13 + + movsd -32 * SIZE(X), %xmm4 + movsd -30 * SIZE(X), %xmm5 + movsd -28 * SIZE(X), %xmm6 + movsd -26 * SIZE(X), %xmm7 + + decl I + jle .L12 + ALIGN_3 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + cvtps2pd %xmm4, %xmm2 + movsd -24 * SIZE(X), %xmm4 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm5, %xmm3 + movsd -22 * SIZE(X), %xmm5 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm6, %xmm2 + movsd -20 * SIZE(X), %xmm6 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm7, %xmm3 + movsd -18 * SIZE(X), %xmm7 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm4, %xmm2 + movsd -16 * SIZE(X), %xmm4 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm5, %xmm3 + movsd -14 * SIZE(X), %xmm5 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm6, %xmm2 + movsd -12 * SIZE(X), %xmm6 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm7, %xmm3 + movsd -10 * SIZE(X), %xmm7 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + subl $-16 * SIZE, X + decl I + jg .L10 + ALIGN_3 + +.L12: + cvtps2pd %xmm4, %xmm2 + movsd -24 * SIZE(X), %xmm4 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm5, %xmm3 + movsd -22 * SIZE(X), %xmm5 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm6, %xmm2 + movsd -20 * SIZE(X), %xmm6 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm7, %xmm3 + movsd -18 * SIZE(X), %xmm7 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm4, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm5, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm6, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm7, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + subl $-16 * SIZE, X + ALIGN_4 + +.L13: + testl $8, M + je .L14 + + movsd -32 * SIZE(X), %xmm4 + + cvtps2pd %xmm4, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + movsd -30 * SIZE(X), %xmm5 + + cvtps2pd %xmm5, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + movsd -28 * SIZE(X), %xmm6 + + cvtps2pd %xmm6, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + movsd -26 * SIZE(X), %xmm7 + + cvtps2pd %xmm7, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + addl $8 * SIZE, X + ALIGN_3 + +.L14: + testl $4, M + je .L15 + + movsd -32 * SIZE(X), %xmm4 + cvtps2pd %xmm4, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + movsd -30 * SIZE(X), %xmm5 + cvtps2pd %xmm5, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + addl $4 * SIZE, X + ALIGN_3 + +.L15: + testl $2, M + je .L16 + + movsd -32 * SIZE(X), %xmm4 + cvtps2pd %xmm4, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + addl $2 * SIZE, X + ALIGN_3 + +.L16: + testl $1, M + je .L998 + + movss -32 * SIZE(X), %xmm4 + cvtss2sd %xmm4, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm1 + jmp .L998 + ALIGN_4 + +.L40: + movl M, I + sarl $3, I + jle .L44 + ALIGN_4 + +.L41: + movss (X), %xmm4 + addl INCX, X + + cvtss2sd %xmm4, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm0 + + movss (X), %xmm5 + addl INCX, X + + cvtss2sd %xmm5, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm1 + + movss (X), %xmm6 + addl INCX, X + + cvtss2sd %xmm6, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm0 + + movss (X), %xmm7 + addl INCX, X + + cvtss2sd %xmm7, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm1 + + movss (X), %xmm4 + addl INCX, X + + cvtss2sd %xmm4, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm0 + + movss (X), %xmm5 + addl INCX, X + + cvtss2sd %xmm5, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm1 + + movss (X), %xmm6 + addl INCX, X + + cvtss2sd %xmm6, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm0 + + movss (X), %xmm7 + addl INCX, X + + cvtss2sd %xmm7, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm1 + + decl I + jg .L41 + ALIGN_3 + +.L44: + testl $4, M + je .L45 + + movss (X), %xmm4 + addl INCX, X + + cvtss2sd %xmm4, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm0 + + movss (X), %xmm5 + addl INCX, X + + cvtss2sd %xmm5, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm1 + + movss (X), %xmm6 + addl INCX, X + + cvtss2sd %xmm6, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm0 + + movss (X), %xmm7 + addl INCX, X + + cvtss2sd %xmm7, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm1 + ALIGN_3 + +.L45: + testl $2, M + je .L46 + + movss (X), %xmm4 + addl INCX, X + + cvtss2sd %xmm4, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm0 + + movss (X), %xmm5 + addl INCX, X + + cvtss2sd %xmm5, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm1 + ALIGN_3 + +.L46: + testl $1, M + je .L998 + + movss (X), %xmm4 + cvtss2sd %xmm4, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm0 + ALIGN_4 + +.L998: + addpd %xmm1, %xmm0 + +#ifndef HAVE_SSE3 + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + addsd %xmm1, %xmm0 +#else + haddpd %xmm0, %xmm0 +#endif + ALIGN_4 + +.L999: + sqrtsd %xmm0, %xmm0 + + cvtsd2ss %xmm0, %xmm0 + + movss %xmm0, STACK_M + flds STACK_M + + popl %ebx + popl %esi + ret + + EPILOGUE diff --git a/kernel/x86/qaxpy.S b/kernel/x86/qaxpy.S new file mode 100644 index 0000000..0497ea3 --- /dev/null +++ b/kernel/x86/qaxpy.S @@ -0,0 +1,254 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_ALPHA 16 + STACK + ARGS(%esp) +#define STACK_X 32 + STACK + ARGS(%esp) +#define STACK_INCX 36 + STACK + ARGS(%esp) +#define STACK_Y 40 + STACK + ARGS(%esp) +#define STACK_INCY 44 + STACK + ARGS(%esp) + +#define M %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + FLD STACK_ALPHA + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + sall $BASE_SHIFT, INCX + sall $BASE_SHIFT, INCY + + testl M, M + jle .L40 + + cmpl $SIZE, INCX + jne .L14 + cmpl $SIZE, INCY + jne .L14 + + movl M, %eax + sarl $3, %eax + jle .L15 + ALIGN_3 + +#define PRESIZE 33 + +.L16: +#ifdef HAS_PREFETCH + prefetcht0 PRESIZE * SIZE(X) +#endif + + FLD 0 * SIZE(X) + fmul %st(1),%st + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + fmul %st(1),%st + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + FLD 2 * SIZE(X) + fmul %st(1),%st + FLD 2 * SIZE(Y) + faddp %st, %st(1) + FST 2 * SIZE(Y) + + FLD 3 * SIZE(X) + fmul %st(1),%st + FLD 3 * SIZE(Y) + faddp %st, %st(1) + FST 3 * SIZE(Y) + +#ifdef HAS_PREFETCH + prefetcht0 (4 + PRESIZE) * SIZE(X) +#endif + + FLD 4 * SIZE(X) + fmul %st(1),%st + FLD 4 * SIZE(Y) + faddp %st, %st(1) + FST 4 * SIZE(Y) + + FLD 5 * SIZE(X) + fmul %st(1),%st + FLD 5 * SIZE(Y) + faddp %st, %st(1) + FST 5 * SIZE(Y) + + FLD 6 * SIZE(X) + fmul %st(1),%st + FLD 6 * SIZE(Y) + faddp %st, %st(1) + FST 6 * SIZE(Y) + + FLD 7 * SIZE(X) + fmul %st(1),%st + FLD 7 * SIZE(Y) + faddp %st, %st(1) + FST 7 * SIZE(Y) + +#ifdef HAVE_3DNOW + prefetchw 24 * SIZE(Y) +#endif + + addl $8 * SIZE, X + addl $8 * SIZE, Y + decl %eax + jg .L16 + ALIGN_3 + +.L15: + movl M, %eax + andl $7, %eax + jle .L40 + ALIGN_3 + +.L22: + FLD 0 * SIZE(X) + fmul %st(1),%st + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + addl $SIZE, X + addl $SIZE, Y + decl %eax + jg .L22 + jmp .L40 + ALIGN_3 + +.L14: + movl M, %eax + sarl $2, %eax + jle .L28 + ALIGN_3 + +.L29: + FLD (X) + fmul %st(1),%st + FLD (Y) + faddp %st, %st(1) + FST (Y) + addl INCX, X + addl INCY, Y + + FLD (X) + fmul %st(1),%st + FLD (Y) + faddp %st, %st(1) + FST (Y) + addl INCX, X + addl INCY, Y + + FLD (X) + fmul %st(1),%st + FLD (Y) + faddp %st, %st(1) + FST (Y) + addl INCX, X + addl INCY, Y + + FLD (X) + fmul %st(1),%st + FLD (Y) + faddp %st, %st(1) + FST (Y) + addl INCX, X + addl INCY, Y + + decl %eax + jg .L29 + ALIGN_3 + +.L28: + movl M, %eax + andl $3, %eax + jle .L40 + ALIGN_3 + +.L35: + FLD (X) + fmul %st(1),%st + FLD (Y) + faddp %st, %st(1) + FST (Y) + addl INCX, X + addl INCY, Y + + decl %eax + jg .L35 + ALIGN_3 + +.L40: + ffreep %st(0) + xorl %eax,%eax + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/qconjg.S b/kernel/x86/qconjg.S new file mode 100644 index 0000000..3b40e0c --- /dev/null +++ b/kernel/x86/qconjg.S @@ -0,0 +1,60 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl 4(%esp), %eax + movl 8(%esp), %ecx + fldz + FLD 1 * SIZE(%ecx) + fsubrp %st, %st(1) + FLD 0 * SIZE(%ecx) + + FST 0 * SIZE(%eax) + FST 1 * SIZE(%eax) + ret + + EPILOGUE diff --git a/kernel/x86/qdot.S b/kernel/x86/qdot.S new file mode 100644 index 0000000..ce5ff29 --- /dev/null +++ b/kernel/x86/qdot.S @@ -0,0 +1,229 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + +#ifdef F_INTERFACE + movl (N),N + movl (INCX),INCX + movl (INCY),INCY +#endif + + sall $BASE_SHIFT, INCX + sall $BASE_SHIFT, INCY + + fldz + fldz + fldz + fldz + + cmpl $SIZE, INCX + jne .L14 + cmpl $SIZE, INCY + jne .L14 + + movl N, %eax + sarl $2, %eax + jle .L15 + ALIGN_3 + +.L16: + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + fmulp %st, %st(1) + faddp %st,%st(1) + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st,%st(2) + FLD 2 * SIZE(X) + FLD 2 * SIZE(Y) + fmulp %st, %st(1) + faddp %st,%st(3) + FLD 3 * SIZE(X) + FLD 3 * SIZE(Y) + fmulp %st, %st(1) + faddp %st,%st(4) + addl $4 * SIZE, X + addl $4 * SIZE, Y + decl %eax + jg .L16 + ALIGN_3 + +.L15: + movl N, %eax + andl $3, %eax + jle .L27 + ALIGN_3 + +.L22: + FLD (X) + addl $SIZE, X + FLD (Y) + fmulp %st, %st(1) + addl $SIZE, Y + faddp %st,%st(1) + decl %eax + jg .L22 + + jmp .L27 + ALIGN_3 + +.L14: +#ifdef F_INTERFACE + testl INCX, INCX + jge .L28 + + movl N, %eax + decl %eax + imull INCX, %eax + subl %eax, X + ALIGN_3 + +.L28: + testl INCY, INCY + jge .L29 + + movl N, %eax + decl %eax + imull INCY, %eax + subl %eax, Y + ALIGN_3 +.L29: +#endif + movl N, %eax + sarl $2, %eax + jle .L30 + ALIGN_3 + +.L31: + FLD (X) + addl INCX, X + FLD (Y) + fmulp %st, %st(1) + addl INCY, Y + faddp %st,%st(1) + + FLD (X) + addl INCX, X + FLD (Y) + fmulp %st, %st(1) + addl INCY, Y + faddp %st,%st(2) + + FLD (X) + addl INCX, X + FLD (Y) + fmulp %st, %st(1) + addl INCY, Y + faddp %st,%st(3) + + FLD (X) + addl INCX, X + FLD (Y) + fmulp %st, %st(1) + addl INCY, Y + faddp %st,%st(4) + + decl %eax + jg .L31 + ALIGN_3 + +.L30: + movl N, %eax + andl $3, %eax + jle .L27 + ALIGN_3 + +.L37: + FLD (X) + addl INCX, X + FLD (Y) + fmulp %st, %st(1) + addl INCY, Y + faddp %st, %st(1) + decl %eax + jg .L37 + ALIGN_3 + +.L27: + faddp %st,%st(2) + faddp %st,%st(2) + faddp %st,%st(1) + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/qgemm_kernel_2x2.S b/kernel/x86/qgemm_kernel_2x2.S new file mode 100644 index 0000000..a2852f2 --- /dev/null +++ b/kernel/x86/qgemm_kernel_2x2.S @@ -0,0 +1,810 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#define PREFETCHSIZE (5 + 4 * 10) +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) + +#define I %esi +#define B %ebx +#define CO %edi +#define AO %edx +#define BO %ecx +#define LDC %ebp + +#define PREFETCH_OFFSET 48 + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(TRMMKERNEL) && !defined(LEFT) + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + + movl ARG_LDC, LDC + movl ARG_B, B + + addl $8 * SIZE, A + addl $8 * SIZE, B + + sall $BASE_SHIFT, LDC + + movl N, %eax + sarl $1, %eax + movl %eax, J + je .L30 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl A, AO + + movl C, CO + lea (, LDC, 2), %eax + addl %eax, C + + movl M, I + sarl $1, I + je .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B, BO +#else + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 2), AO + leal (B, %eax, 2), BO +#endif + + fldz + fldz + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) + prefetchw 2 * SIZE(CO, LDC, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) + prefetchnta 2 * SIZE(CO, LDC, 1) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + FLD -6 * SIZE(AO) + + FLD -6 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + + FLD -5 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) + + FLD -4 * SIZE(AO) + + FLD -4 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -3 * SIZE(BO) + fmul %st, %st(2) + + FLD -3 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + FLD -2 * SIZE(AO) + + FLD -2 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -1 * SIZE(BO) + fmul %st, %st(2) + + FLD -1 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + addl $8 * SIZE,AO + addl $8 * SIZE,BO + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $3, %eax + je .L18 + ALIGN_4 + +.L16: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + addl $2 * SIZE,AO + addl $2 * SIZE,BO + + decl %eax + jne .L16 + ALIGN_4 + +.L18: +#ifndef TRMMKERNEL + FLD ALPHA + + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + FLD 1 * SIZE(CO) + faddp %st, %st(1) + FST 1 * SIZE(CO) + + FLD 0 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 0 * SIZE(CO, LDC) + + FLD 1 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 1 * SIZE(CO, LDC) +#else + FST 0 * SIZE(CO) + FST 1 * SIZE(CO) + FST 0 * SIZE(CO, LDC) + FST 1 * SIZE(CO, LDC) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, CO + decl I + jne .L11 + ALIGN_4 + +.L20: + movl M, %eax + andl $1, %eax + je .L29 + ALIGN_4 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B, BO +#else + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal ( B, %eax, 2), BO +#endif + + fldz + fldz + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + + FLD -6 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -4 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + + FLD -2 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $4 * SIZE,AO + addl $8 * SIZE,BO + + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $3, %eax + je .L28 + ALIGN_4 + +.L26: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $1 * SIZE,AO + addl $2 * SIZE,BO + + decl %eax + jne .L26 + ALIGN_4 + +.L28: +#ifndef TRMMKERNEL + FLD ALPHA + + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + FLD 0 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 0 * SIZE(CO, LDC) +#else + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal (BO, %eax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $1 * SIZE, CO + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movl BO, B + decl J + jne .L01 + ALIGN_4 + +.L30: + movl N, %eax + testl $1, %eax + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl A, AO + + movl C, CO + addl LDC, C + + movl M, I + sarl $1, I + je .L40 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B, BO +#else + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 2), AO + leal ( B, %eax, 1), BO +#endif + + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(BO) + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + FLD -6 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(BO) + FLD -4 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + FLD -2 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $8 * SIZE,AO + addl $4 * SIZE,BO + + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $3, %eax + je .L38 + ALIGN_4 + +.L36: + FLD -8 * SIZE(BO) + + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $2 * SIZE,AO + addl $1 * SIZE,BO + + decl %eax + jne .L36 + ALIGN_4 + +.L38: +#ifndef TRMMKERNEL + FLD ALPHA + + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + FLD 1 * SIZE(CO) + faddp %st, %st(1) + FST 1 * SIZE(CO) +#else + FST 0 * SIZE(CO) + FST 1 * SIZE(CO) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, CO + decl I + jne .L31 + ALIGN_4 + +.L40: + movl M, %eax + andl $1, %eax + je .L49 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B, BO +#else + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal ( B, %eax, 1), BO +#endif + + fldz + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -7 * SIZE(AO) + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -6 * SIZE(AO) + FLD -6 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -5 * SIZE(AO) + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addl $4 * SIZE,AO + addl $4 * SIZE,BO + + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $3, %eax + je .L48 + ALIGN_4 + +.L46: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addl $1 * SIZE,AO + addl $1 * SIZE,BO + + decl %eax + jne .L46 + ALIGN_4 + +.L48: +#ifndef TRMMKERNEL + FLD ALPHA + + fmulp %st, %st(1) + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) +#else + FST 0 * SIZE(CO) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal (BO, %eax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $1 * SIZE, CO + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $1, KK +#endif + + movl BO, B + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/qgemv_n.S b/kernel/x86/qgemv_n.S new file mode 100644 index 0000000..8424232 --- /dev/null +++ b/kernel/x86/qgemv_n.S @@ -0,0 +1,477 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef PENTIUM +#define P 32 +#endif + +#if defined(ATHLON) || defined(OPTERON) +#define P 32 +#endif + +#ifndef P +#define P DTB_ENTRIES +#endif + +#define STACK 16 +#define ARGS 16 + +#define PLDA_M 0 + STACK(%esp) +#define XP 4 + STACK(%esp) +#define MIN_N 8 + STACK(%esp) +#define IS 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) + +#define A 32 + STACK + ARGS(%esp) +#define LDA 36 + STACK + ARGS(%esp) +#define X 40 + STACK + ARGS(%esp) +#define INCX 44 + STACK + ARGS(%esp) +#define Y 48 + STACK + ARGS(%esp) +#define INCY 52 + STACK + ARGS(%esp) +#define BUFFER 56 + STACK + ARGS(%esp) + + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + FLD ALPHA + movl X, %edi + + movl LDA, %ebx + sall $BASE_SHIFT, %ebx + + movl $0, IS + movl M, %edx + movl N, %esi + + test %esi, %esi + jle .L79 # goto END + test %edx, %edx + jle .L79 # goto END + + movl INCY, %eax + sall $BASE_SHIFT, %eax + movl %eax, INCY + + movl LDA, %eax + imull $P, %eax # P * lda + subl M ,%eax # P * lda - m + sall $BASE_SHIFT, %eax + movl %eax, PLDA_M + ALIGN_2 + +.L32: + movl IS, %esi + movl $P, %edx + movl N, %eax + subl %esi,%eax # n - is + cmpl %edx, %eax +#ifdef PENTIUM + jle .L33 + movl %edx, %eax +.L33: +#else + cmovg %edx, %eax +#endif + + movl %eax, MIN_N + movl INCX, %edx + + sall $BASE_SHIFT, %esi + leal (%edi, %esi, 1), %esi + + movl %esi, XP + cmpl $1, %edx + je .L34 # if incx == 1 goto L34 + + movl BUFFER, %esi + sall $BASE_SHIFT, %edx + movl %esi, XP # xp = buffer + sarl $2,%eax + jle .L35 + ALIGN_2 + +.L36: + FLD (%edi) + addl %edx,%edi # x += incx + FLD (%edi) + addl %edx,%edi # x += incx + FLD (%edi) + addl %edx,%edi # x += incx + FLD (%edi) + addl %edx,%edi # x += incx + + FST 3 * SIZE(%esi) + FST 2 * SIZE(%esi) + FST 1 * SIZE(%esi) + FST 0 * SIZE(%esi) + + addl $4 * SIZE, %esi # xp += 4 + decl %eax + jg .L36 + ALIGN_3 + +.L35: + movl MIN_N, %eax + andl $3, %eax + jle .L34 + ALIGN_2 + +.L42: + FLD (%edi) + addl %edx, %edi + FST (%esi) + addl $SIZE, %esi + decl %eax + jg .L42 + ALIGN_3 + +/* Main Routine */ +.L34: + movl Y, %ecx # c_offset + movl M, %ebp + sarl $2, %ebp # j = (m >> 2) + jle .L47 + ALIGN_2 + +.L48: + movl A, %edx # a_offset = a + fldz + addl $4 * SIZE, A # a += 4 + fldz + movl XP, %esi # b_offset = xp + fldz + movl MIN_N, %eax # i = min_n + fldz + FLD (%esi) # bt1 = b_offset + sarl $1, %eax + jle .L51 + ALIGN_2 + +#ifdef PENTIUM3 +#define PRESIZE 8 +#else +#define PRESIZE 24 +#endif + +.L80: +#ifdef PENTIUM3 + prefetcht1 PRESIZE * SIZE(%edx, %ebx, 1) + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + + prefetcht1 PRESIZE * SIZE(%esi) + faddp %st, %st(2) # ct1 += at1 + FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(3) # ct2 += at1 + FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + FLD 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) + + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += at1 + FLD 1 * SIZE(%esi) # bt1 = b_offset + + prefetcht1 PRESIZE * SIZE(%edx, %ebx, 2) + addl %ebx, %edx # a_offset += lda + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(3) # ct2 += at1 + FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + FLD 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) + + fmulp %st, %st(1) + addl %ebx, %edx + faddp %st, %st(4) # ct4 += at1 + + FLD 2 * SIZE(%esi) # bt1 = b_offset + addl $2 * SIZE, %esi # b_offset += 2 + +#else +#ifdef PENTIUM4 + prefetchnta 8 * SIZE(%esi) +#endif + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(3) # ct2 += at1 + + FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += at1 + FLD 1 * SIZE(%esi) # bt1 = b_offset + + addl %ebx, %edx # a_offset += lda + + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(3) # ct2 += at1 + + FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += at1 + FLD 2 * SIZE(%esi) # bt1 = b_offset + + addl %ebx, %edx + addl $2 * SIZE, %esi # b_offset += 2 +#endif + decl %eax + jg .L80 + +.L51: + movl MIN_N,%eax + andl $1, %eax + je .L57 + + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(3) # ct2 += at1 + + FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += at1 + fldz + ALIGN_2 + +.L57: + ffreep %st(0) + + fxch %st(4) + fmul %st, %st(4) + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fxch %st(4) + + movl INCY, %eax + + FLD (%ecx) + faddp %st, %st(1) + FST (%ecx) + addl %eax, %ecx + + FLD (%ecx) + faddp %st, %st(1) + FST (%ecx) + addl %eax, %ecx + + FLD (%ecx) + faddp %st, %st(1) + FST (%ecx) + addl %eax, %ecx + + FLD (%ecx) + faddp %st, %st(1) + FST (%ecx) + addl %eax, %ecx + + decl %ebp # j -- + jg .L48 + ALIGN_3 + +.L47: + movl M, %ebp + andl $3, %ebp # j = (m & 3) + jle .L60 + ALIGN_2 + +.L61: + + movl A, %edx # a_offset = a + fldz + addl $SIZE, A # a++ + fldz + movl XP,%esi + fldz + movl MIN_N,%eax + fldz + sarl $3,%eax + jle .L64 + ALIGN_2 + +.L65: + FLD 0 * SIZE(%esi) + FLD (%edx) + fmulp %st, %st(1) + faddp %st, %st(1) + addl %ebx, %edx + + FLD 1 * SIZE(%esi) + FLD (%edx) + fmulp %st, %st(1) + faddp %st, %st(2) + addl %ebx ,%edx + + FLD 2 * SIZE(%esi) + FLD (%edx) + fmulp %st, %st(1) + faddp %st, %st(3) + addl %ebx, %edx + + FLD 3 * SIZE(%esi) + FLD (%edx) + fmulp %st, %st(1) + faddp %st, %st(4) + addl %ebx, %edx + + FLD 4 * SIZE(%esi) + FLD (%edx) + fmulp %st, %st(1) + faddp %st,%st(1) + addl %ebx, %edx + + FLD 5 * SIZE(%esi) + FLD (%edx) + fmulp %st, %st(1) + faddp %st, %st(2) + addl %ebx, %edx + + FLD 6 * SIZE(%esi) + FLD (%edx) + fmulp %st, %st(1) + faddp %st,%st(3) + addl %ebx, %edx + + FLD 7 * SIZE(%esi) + FLD (%edx) + fmulp %st, %st(1) + faddp %st,%st(4) + addl %ebx, %edx + + addl $8 * SIZE, %esi + decl %eax + jg .L65 + +.L64: + movl MIN_N,%eax + andl $7, %eax + jle .L70 + ALIGN_2 + +.L71: + FLD (%esi) + addl $SIZE, %esi # b_offset ++ + FLD (%edx) + fmulp %st, %st(1) + addl %ebx, %edx # a_offset += lda + faddp %st, %st(1) + decl %eax + jg .L71 + ALIGN_2 + +.L70: + faddp %st, %st(1) + faddp %st, %st(1) + faddp %st, %st(1) + + fmul %st(1), %st + movl INCY, %eax + FLD (%ecx) + faddp %st, %st(1) + FST (%ecx) + addl %eax, %ecx + decl %ebp + jg .L61 + +.L60: + movl PLDA_M, %esi + addl %esi, A # a += P * lda - m + addl $P, IS + movl N, %esi + cmpl %esi,IS + jl .L32 + +.L79: + ffreep %st(0) + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/qgemv_t.S b/kernel/x86/qgemv_t.S new file mode 100644 index 0000000..ff2ba80 --- /dev/null +++ b/kernel/x86/qgemv_t.S @@ -0,0 +1,585 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef PENTIUM +#define P 88 +#endif + +#ifndef P +#define P 1000 +#endif + +#define STACK 16 +#define ARGS 24 + +#define NLDA 0 + STACK(%esp) +#define XP 4 + STACK(%esp) +#define MIN_M 8 + STACK(%esp) +#define J 12 + STACK(%esp) +#define IS 16 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) + +#define A 32 + STACK + ARGS(%esp) +#define LDA 36 + STACK + ARGS(%esp) +#define X 40 + STACK + ARGS(%esp) +#define INCX 44 + STACK + ARGS(%esp) +#define Y 48 + STACK + ARGS(%esp) +#define INCY 52 + STACK + ARGS(%esp) +#define BUFFER 56 + STACK + ARGS(%esp) + + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + FLD ALPHA + + movl X, %edi # X + + movl $0, IS + + movl M, %ebx + movl N, %eax + + testl %ebx, %ebx + jle .L79 + testl %eax, %eax + jle .L79 + + movl INCX, %esi + sall $BASE_SHIFT, %esi + movl %esi, INCX + + movl INCY, %esi + sall $BASE_SHIFT, %esi + movl %esi, INCY + + movl LDA, %ebx + + imull %ebx, %eax + movl $P, %esi + subl %eax, %esi + sall $BASE_SHIFT, %esi + movl %esi, NLDA + + movl %ebx, %esi + sall $BASE_SHIFT, %esi + movl %esi, LDA + ALIGN_2 + +.L32: + movl IS, %esi + + movl $P, %edx + movl M, %eax + subl %esi, %eax + cmpl %edx, %eax +#ifdef PENTIUM + jle .L33 + movl %edx, %eax +.L33: +#else + cmovg %edx, %eax +#endif + movl %eax, MIN_M + + movl IS, %ecx + sall $BASE_SHIFT, %ecx + leal (%edi,%ecx, 1), %ecx + movl INCX, %ebx + movl %ecx, XP + cmpl $SIZE, %ebx + je .L34 + + movl BUFFER, %esi + movl MIN_M, %ecx + movl %esi, XP + sarl $2, %ecx + jle .L35 + + ALIGN_3 + +.L36: + FLD (%edi) + addl %ebx, %edi + FST 0 * SIZE(%esi) + + FLD (%edi) + addl %ebx, %edi + FST 1 * SIZE(%esi) + + FLD (%edi) + addl %ebx, %edi + FST 2 * SIZE(%esi) + + FLD (%edi) + addl %ebx, %edi + FST 3 * SIZE(%esi) + + addl $4 * SIZE, %esi + decl %ecx + jg .L36 + ALIGN_3 + +.L35: + movl MIN_M, %ecx + andl $3,%ecx + jle .L34 + ALIGN_2 + +.L42: + FLD (%edi) + addl %ebx, %edi + FST (%esi) + addl $SIZE, %esi + decl %ecx + jg .L42 + ALIGN_3 + +/* Main Routine */ + +.L34: + movl Y, %ebp # coffset = y + + movl N, %esi + sarl $2, %esi + movl %esi, J + jle .L47 + ALIGN_3 + +.L48: + movl A, %ebx # a_offset = a + fldz + movl LDA, %edx + fldz + + leal (%ebx, %edx), %ecx # a_offset2 = a + lda + fldz + leal (%ebx, %edx, 4), %eax + fldz + + movl %eax, A + movl XP, %esi + FLD (%esi) + + movl MIN_M, %eax + sarl $2,%eax + jle .L51 + ALIGN_3 + +#define PRESIZE 8 + +.L80: +#ifdef PENTIUM3 + prefetcht0 PRESIZE * SIZE(%ebx, %edx, 2) + FLD 0 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + + prefetcht0 PRESIZE * SIZE(%ecx) + faddp %st,%st(2) # ct1 += at1 + FLD 0 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + + prefetcht0 PRESIZE * SIZE(%ecx, %edx, 2) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + prefetcht0 PRESIZE * SIZE(%ebx) + FLD 0 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + + faddp %st,%st(4) + FLD 0 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + + faddp %st,%st(4) + FLD 1 * SIZE(%esi) + FLD 1 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + FLD 1 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + FLD 1 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + + fmul %st(1),%st + faddp %st,%st(4) + FLD 1 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + + fmulp %st, %st(1) + faddp %st,%st(4) + FLD 2 * SIZE(%esi) + + FLD 2 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + + FLD 2 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD 2 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + faddp %st,%st(4) + + FLD 2 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + faddp %st,%st(4) + + FLD 3 * SIZE(%esi) + FLD 3 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + + faddp %st,%st(2) # ct1 += at1 + FLD 3 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + + faddp %st,%st(3) # ct2 += at1 + FLD 3 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + + faddp %st,%st(4) + FLD 3 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + + addl $4 * SIZE, %ebx + faddp %st,%st(4) + addl $4 * SIZE, %ecx + + FLD 4 * SIZE(%esi) + addl $4 * SIZE, %esi + +#else + +#if defined(HAS_PREFETCH) + prefetcht0 PRESIZE * SIZE(%ebx) + prefetcht0 PRESIZE * SIZE(%ebx, %edx, 2) + prefetcht0 PRESIZE * SIZE(%ecx) + prefetcht0 PRESIZE * SIZE(%ecx, %edx, 2) +#endif + + FLD 0 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + + FLD 0 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD 0 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + faddp %st,%st(4) + + FLD 0 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + faddp %st,%st(4) + FLD 1 * SIZE(%esi) + + FLD 1 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + + FLD 1 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD 1 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + faddp %st,%st(4) + + FLD 1 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + faddp %st,%st(4) + FLD 2 * SIZE(%esi) + + FLD 2 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + + FLD 2 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD 2 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + faddp %st,%st(4) + + FLD 2 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + faddp %st,%st(4) + FLD 3 * SIZE(%esi) + + FLD 3 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + + FLD 3 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD 3 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + faddp %st,%st(4) + + FLD 3 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + faddp %st,%st(4) + FLD 4 * SIZE(%esi) + + addl $4 * SIZE, %ebx + addl $4 * SIZE, %ecx + addl $4 * SIZE, %esi +#endif + + decl %eax + jg .L80 + ALIGN_3 + +.L51: + movl MIN_M, %eax + andl $3, %eax + je .L81 + ALIGN_3 + +.L52: + + FLD (%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + + FLD (%ecx) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD (%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + faddp %st,%st(4) + + FLD (%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + faddp %st,%st(4) + FLD 1 * SIZE(%esi) + + addl $SIZE, %ebx + addl $SIZE, %ecx + addl $SIZE, %esi + decl %eax + jg .L52 + ALIGN_3 + +.L81: + ffreep %st(0) + + fxch %st(4) + fmul %st, %st(4) + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fxch %st(4) + + movl INCY, %eax + + FLD (%ebp) + faddp %st, %st(1) + FST (%ebp) + addl %eax, %ebp + + FLD (%ebp) + faddp %st, %st(1) + FST (%ebp) + addl %eax, %ebp + + FLD (%ebp) + faddp %st, %st(1) + FST (%ebp) + addl %eax, %ebp + + FLD (%ebp) + faddp %st, %st(1) + FST (%ebp) + addl %eax, %ebp + + decl J + jg .L48 + ALIGN_3 + +.L47: + movl N, %esi + andl $3,%esi + movl %esi, J + jle .L60 + ALIGN_2 + +.L61: + movl A, %ebx # a_offset = a + fldz # ct1 = ZERO + movl LDA, %edx + fldz # ct1 = ZERO + + addl %ebx, %edx + fldz # ct1 = ZERO + movl %edx, A + fldz # ct1 = ZERO + + movl XP, %esi + + movl MIN_M, %eax + sarl $3,%eax + jle .L64 + ALIGN_3 + +.L65: +#ifdef HAS_PREFETCH + prefetcht0 PRESIZE * 2 * SIZE(%ebx) + prefetcht0 PRESIZE * 2 * SIZE(%ebx) +#endif + + FLD 0 * SIZE(%esi) + FLD 0 * SIZE(%ebx) + fmulp %st, %st(1) + faddp %st,%st(1) + + FLD 1 * SIZE(%esi) + FLD 1 * SIZE(%ebx) + fmulp %st, %st(1) + faddp %st,%st(2) + + FLD 2 * SIZE(%esi) + FLD 2 * SIZE(%ebx) + fmulp %st, %st(1) + faddp %st,%st(3) + + FLD 3 * SIZE(%esi) + FLD 3 * SIZE(%ebx) + fmulp %st, %st(1) + faddp %st,%st(4) + + FLD 4 * SIZE(%esi) + FLD 4 * SIZE(%ebx) + fmulp %st, %st(1) + faddp %st,%st(1) + + FLD 5 * SIZE(%esi) + FLD 5 * SIZE(%ebx) + fmulp %st, %st(1) + faddp %st,%st(2) + + FLD 6 * SIZE(%esi) + FLD 6 * SIZE(%ebx) + fmulp %st, %st(1) + faddp %st,%st(3) + + FLD 7 * SIZE(%esi) + FLD 7 * SIZE(%ebx) + fmulp %st, %st(1) + faddp %st,%st(4) + + addl $8 * SIZE, %esi + addl $8 * SIZE, %ebx + + decl %eax + jg .L65 + ALIGN_3 + +.L64: + movl MIN_M, %eax + andl $7, %eax + jle .L70 + ALIGN_3 + +.L71: + FLD (%esi) + FLD (%ebx) + fmulp %st, %st(1) + faddp %st,%st(1) + + addl $SIZE, %esi + addl $SIZE, %ebx + decl %eax + jg .L71 + ALIGN_3 + +.L70: + faddp %st, %st(1) + faddp %st, %st(1) + faddp %st, %st(1) + + fmul %st(1),%st + FLD (%ebp) + faddp %st, %st(1) + FST (%ebp) + addl INCY, %ebp + decl J + jg .L61 + ALIGN_3 + +.L60: + movl A, %ebx + addl NLDA, %ebx + movl %ebx, A + + addl $P, IS + movl M, %esi + cmpl %esi, IS + jl .L32 + ALIGN_3 + +.L79: + ffreep %st(0) + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/qtrsm_kernel_LN_2x2.S b/kernel/x86/qtrsm_kernel_LN_2x2.S new file mode 100644 index 0000000..37c268b --- /dev/null +++ b/kernel/x86/qtrsm_kernel_LN_2x2.S @@ -0,0 +1,1231 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#define PREFETCHSIZE (5 + 4 * 10) +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) + +#define I %esi +#define B %ebx +#define CO %edi +#define AO %edx +#define BO %ecx +#define LDC %ebp + +#define PREFETCH_OFFSET 48 + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_LDC, LDC + movl ARG_B, B + sall $BASE_SHIFT, LDC + + addl $8 * SIZE, A + addl $8 * SIZE, B + + +#ifdef LN + movl M, %eax + sall $BASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $BASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull %ebp, %eax + addl %eax, C +#endif + +#ifdef RN + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $1, %eax + movl %eax, J + je .L30 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movl A, AO +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + lea (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %eax + andl $1, %eax + je .L20 + ALIGN_4 + +.L21: +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + movl AORIG, AO + leal (AO, %eax, 1), AO + leal (B, %eax, 2), BO +#else + movl B, BO +#endif + + fldz + fldz + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + + FLD -6 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -4 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + + FLD -2 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $4 * SIZE,AO + addl $8 * SIZE,BO + + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L28 + ALIGN_4 + +.L26: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $1 * SIZE,AO + addl $2 * SIZE,BO + + decl %eax + jne .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + sall $BASE_SHIFT, %eax + + movl AORIG, AO + leal (AO, %eax, 1), AO + leal (B, %eax, 2), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(3) +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + + FLD -7 * SIZE(BO) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD -5 * SIZE(BO) + fmulp %st, %st(2) + + FLD -6 * SIZE(BO) + fmul %st(2), %st + + fsubrp %st, %st(1) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef LN + subl $1 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(1) + fld %st + FST -7 * SIZE(AO) +#endif + + FST 0 * SIZE(CO, LDC) + FST 0 * SIZE(CO) + +#ifndef LN + addl $1 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal (BO, %eax, 2), BO +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L20: + movl M, I + sarl $1, I + je .L29 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + movl AORIG, AO + leal (AO, %eax, 2), AO + leal (B, %eax, 2), BO +#else + movl B, BO +#endif + + fldz + fldz + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) + prefetchw 2 * SIZE(CO, LDC, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) + prefetchnta 2 * SIZE(CO, LDC, 1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -6 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + + FLD -5 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) + + FLD -4 * SIZE(AO) + + FLD -4 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -3 * SIZE(BO) + fmul %st, %st(2) + + FLD -3 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + FLD -2 * SIZE(AO) + + FLD -2 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -1 * SIZE(BO) + fmul %st, %st(2) + + FLD -1 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + addl $8 * SIZE,AO + addl $8 * SIZE,BO + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L18 + ALIGN_4 + +.L16: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + addl $2 * SIZE,AO + addl $2 * SIZE,BO + + decl %eax + jne .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + sall $BASE_SHIFT, %eax + + movl AORIG, AO + leal (AO, %eax, 2), AO + leal (B, %eax, 2), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) + FLD -6 * SIZE(BO) + fsubp %st, %st(3) + FLD -5 * SIZE(BO) + fsubp %st, %st(4) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(3) + FLD -6 * SIZE(AO) + fsubp %st, %st(2) + FLD -5 * SIZE(AO) + fsubp %st, %st(4) +#endif + +#ifdef LN + FLD -5 * SIZE(AO) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD -6 * SIZE(AO) + fmul %st(3), %st + FLD -6 * SIZE(AO) + fmul %st(5), %st + + fsubrp %st, %st(3) + fsubrp %st, %st(1) + + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st(1), %st + FLD -7 * SIZE(AO) + fmul %st(3), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(3) + + FLD -5 * SIZE(AO) + fmul %st, %st(3) + fmulp %st, %st(4) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st(1), %st + FLD -7 * SIZE(BO) + fmul %st(4), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(2) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + fmulp %st, %st(4) +#endif + +#ifdef RT + FLD -5 * SIZE(BO) + fmul %st, %st(2) + fmulp %st, %st(4) + + FLD -6 * SIZE(BO) + fmul %st(2), %st + FLD -6 * SIZE(BO) + fmul %st(5), %st + + fsubrp %st, %st(4) + fsubrp %st, %st(1) + + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(3) +#endif + +#ifdef LN + subl $2 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) + fxch %st(2) + fld %st + FST -6 * SIZE(BO) + fxch %st(3) + fld %st + FST -5 * SIZE(BO) + + FST 1 * SIZE(CO, LDC) + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) + FST 1 * SIZE(CO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(2) + fld %st + FST -7 * SIZE(AO) + fxch %st(1) + fld %st + FST -6 * SIZE(AO) + fxch %st(3) + fld %st + FST -5 * SIZE(AO) + + FST 1 * SIZE(CO, LDC) + FST 1 * SIZE(CO) + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 2), BO +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl I + jne .L11 + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BO, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J + jne .L01 + ALIGN_4 + +.L30: + movl N, %eax + testl $1, %eax + je .L999 + +#if defined(LT) || defined(RN) + movl A, AO +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %eax + andl $1, %eax + je .L40 + ALIGN_4 + +.L41: +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + movl AORIG, AO + leal (AO, %eax, 1), AO + leal (B, %eax, 1), BO +#else + movl B, BO +#endif + + fldz + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -7 * SIZE(AO) + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -6 * SIZE(AO) + FLD -6 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -5 * SIZE(AO) + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addl $4 * SIZE,AO + addl $4 * SIZE,BO + + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L48 + ALIGN_4 + +.L46: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addl $1 * SIZE,AO + addl $1 * SIZE,BO + + decl %eax + jne .L46 + ALIGN_4 + +.L48: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + sall $BASE_SHIFT, %eax + + movl AORIG, AO + leal (AO, %eax, 1), AO + leal (B, %eax, 1), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) +#endif + +#ifdef LN + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef RT + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef LN + subl $1 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) +#endif + + FST 0 * SIZE(CO) + +#ifndef LN + addl $1 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal (BO, %eax, 1), BO +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L40: + movl M, I + sarl $1, I + je .L49 + ALIGN_4 + +.L31: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + movl AORIG, AO + leal (AO, %eax, 2), AO + leal (B, %eax, 1), BO +#else + movl B, BO +#endif + + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(BO) + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + FLD -6 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(BO) + FLD -4 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + FLD -2 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $8 * SIZE,AO + addl $4 * SIZE,BO + + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L38 + ALIGN_4 + +.L36: + FLD -8 * SIZE(BO) + + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $2 * SIZE,AO + addl $1 * SIZE,BO + + decl %eax + jne .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + sall $BASE_SHIFT, %eax + + movl AORIG, AO + leal (AO, %eax, 2), AO + leal (B, %eax, 1), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(3) +#endif + +#ifdef LN + FLD -5 * SIZE(AO) + fmulp %st, %st(2) + + FLD -6 * SIZE(AO) + fmul %st(2), %st + + fsubrp %st, %st(1) + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmulp %st, %st(1) + + FLD -7 * SIZE(AO) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LN + subl $2 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(1) + fld %st + FST -7 * SIZE(AO) +#endif + + FST 1 * SIZE(CO) + FST 0 * SIZE(CO) + +#ifndef LN + addl $2 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 1), BO +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl I + jne .L31 + ALIGN_4 + +.L49: +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + leal (B, %eax, 1), B +#endif + +#if defined(LT) || defined(RN) + movl BO, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/qtrsm_kernel_LT_2x2.S b/kernel/x86/qtrsm_kernel_LT_2x2.S new file mode 100644 index 0000000..157e12d --- /dev/null +++ b/kernel/x86/qtrsm_kernel_LT_2x2.S @@ -0,0 +1,1229 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#define PREFETCHSIZE (5 + 4 * 10) +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define AORIG 8 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) + +#define I %esi +#define B %ebx +#define CO %edi +#define AO %edx +#define BO %ecx +#define LDC %ebp + +#define PREFETCH_OFFSET 48 + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_LDC, LDC + movl ARG_B, B + sall $BASE_SHIFT, LDC + + addl $8 * SIZE, A + addl $8 * SIZE, B + +#ifdef LN + movl M, %eax + sall $BASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $BASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull %ebp, %eax + addl %eax, C +#endif + +#ifdef RN + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $1, %eax + movl %eax, J + je .L30 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movl A, AO +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + lea (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, I + sarl $1, I + je .L20 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + movl AORIG, AO + leal (AO, %eax, 2), AO + leal (B, %eax, 2), BO +#else + movl B, BO +#endif + + fldz + fldz + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) + prefetchw 2 * SIZE(CO, LDC, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) + prefetchnta 2 * SIZE(CO, LDC, 1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -6 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + + FLD -5 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) + + FLD -4 * SIZE(AO) + + FLD -4 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -3 * SIZE(BO) + fmul %st, %st(2) + + FLD -3 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + FLD -2 * SIZE(AO) + + FLD -2 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -1 * SIZE(BO) + fmul %st, %st(2) + + FLD -1 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + addl $8 * SIZE,AO + addl $8 * SIZE,BO + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L18 + ALIGN_4 + +.L16: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + addl $2 * SIZE,AO + addl $2 * SIZE,BO + + decl %eax + jne .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + sall $BASE_SHIFT, %eax + + movl AORIG, AO + leal (AO, %eax, 2), AO + leal (B, %eax, 2), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) + FLD -6 * SIZE(BO) + fsubp %st, %st(3) + FLD -5 * SIZE(BO) + fsubp %st, %st(4) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(3) + FLD -6 * SIZE(AO) + fsubp %st, %st(2) + FLD -5 * SIZE(AO) + fsubp %st, %st(4) +#endif + +#ifdef LN + FLD -5 * SIZE(AO) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD -6 * SIZE(AO) + fmul %st(3), %st + FLD -6 * SIZE(AO) + fmul %st(5), %st + + fsubrp %st, %st(3) + fsubrp %st, %st(1) + + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st(1), %st + FLD -7 * SIZE(AO) + fmul %st(3), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(3) + + FLD -5 * SIZE(AO) + fmul %st, %st(3) + fmulp %st, %st(4) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st(1), %st + FLD -7 * SIZE(BO) + fmul %st(4), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(2) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + fmulp %st, %st(4) +#endif + +#ifdef RT + FLD -5 * SIZE(BO) + fmul %st, %st(2) + fmulp %st, %st(4) + + FLD -6 * SIZE(BO) + fmul %st(2), %st + FLD -6 * SIZE(BO) + fmul %st(5), %st + + fsubrp %st, %st(4) + fsubrp %st, %st(1) + + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(3) +#endif + +#ifdef LN + subl $2 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) + fxch %st(2) + fld %st + FST -6 * SIZE(BO) + fxch %st(3) + fld %st + FST -5 * SIZE(BO) + + FST 1 * SIZE(CO, LDC) + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) + FST 1 * SIZE(CO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(2) + fld %st + FST -7 * SIZE(AO) + fxch %st(1) + fld %st + FST -6 * SIZE(AO) + fxch %st(3) + fld %st + FST -5 * SIZE(AO) + + FST 1 * SIZE(CO, LDC) + FST 1 * SIZE(CO) + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 2), BO +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl I + jne .L11 + ALIGN_4 + +.L20: + movl M, %eax + andl $1, %eax + je .L29 + ALIGN_4 + +.L21: +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + movl AORIG, AO + leal (AO, %eax, 1), AO + leal (B, %eax, 2), BO +#else + movl B, BO +#endif + + fldz + fldz + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + + FLD -6 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -4 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + + FLD -2 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $4 * SIZE,AO + addl $8 * SIZE,BO + + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L28 + ALIGN_4 + +.L26: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $1 * SIZE,AO + addl $2 * SIZE,BO + + decl %eax + jne .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + sall $BASE_SHIFT, %eax + + movl AORIG, AO + leal (AO, %eax, 1), AO + leal (B, %eax, 2), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(2) +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + + FLD -7 * SIZE(BO) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD -5 * SIZE(BO) + fmulp %st, %st(2) + + FLD -6 * SIZE(BO) + fmul %st(2), %st + + fsubrp %st, %st(1) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef LN + subl $1 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(1) + fld %st + FST -7 * SIZE(AO) +#endif + + FST 0 * SIZE(CO, LDC) + FST 0 * SIZE(CO) + +#ifndef LN + addl $1 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal (BO, %eax, 2), BO +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BO, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J + jne .L01 + ALIGN_4 + +.L30: + movl N, %eax + testl $1, %eax + je .L999 + +#if defined(LT) || defined(RN) + movl A, AO +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, I + sarl $1, I + je .L40 + ALIGN_4 + +.L31: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + movl AORIG, AO + leal (AO, %eax, 2), AO + leal (B, %eax, 1), BO +#else + movl B, BO +#endif + + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(BO) + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + FLD -6 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(BO) + FLD -4 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + FLD -2 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $8 * SIZE,AO + addl $4 * SIZE,BO + + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L38 + ALIGN_4 + +.L36: + FLD -8 * SIZE(BO) + + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $2 * SIZE,AO + addl $1 * SIZE,BO + + decl %eax + jne .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + sall $BASE_SHIFT, %eax + + movl AORIG, AO + leal (AO, %eax, 2), AO + leal (B, %eax, 1), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(2) +#endif + +#ifdef LN + FLD -5 * SIZE(AO) + fmulp %st, %st(2) + + FLD -6 * SIZE(AO) + fmul %st(2), %st + + fsubrp %st, %st(1) + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmulp %st, %st(1) + + FLD -7 * SIZE(AO) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LN + subl $2 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(1) + fld %st + FST -7 * SIZE(AO) +#endif + + FST 1 * SIZE(CO) + FST 0 * SIZE(CO) + +#ifndef LN + addl $2 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 1), BO +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl I + jne .L31 + ALIGN_4 + +.L40: + movl M, %eax + andl $1, %eax + je .L49 + ALIGN_4 + +.L41: +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + movl AORIG, AO + leal (AO, %eax, 1), AO + leal (B, %eax, 1), BO +#else + movl B, BO +#endif + + fldz + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -7 * SIZE(AO) + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -6 * SIZE(AO) + FLD -6 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -5 * SIZE(AO) + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addl $4 * SIZE,AO + addl $4 * SIZE,BO + + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L48 + ALIGN_4 + +.L46: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addl $1 * SIZE,AO + addl $1 * SIZE,BO + + decl %eax + jne .L46 + ALIGN_4 + +.L48: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + sall $BASE_SHIFT, %eax + + movl AORIG, AO + leal (AO, %eax, 1), AO + leal (B, %eax, 1), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) +#endif + +#ifdef LN + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef RT + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef LN + subl $1 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) +#endif + + FST 0 * SIZE(CO) + +#ifndef LN + addl $1 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal (BO, %eax, 1), BO +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L49: +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + leal (B, %eax, 1), B +#endif + +#if defined(LT) || defined(RN) + movl BO, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/qtrsm_kernel_RT_2x2.S b/kernel/x86/qtrsm_kernel_RT_2x2.S new file mode 100644 index 0000000..a0a4daf --- /dev/null +++ b/kernel/x86/qtrsm_kernel_RT_2x2.S @@ -0,0 +1,1231 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#define PREFETCHSIZE (5 + 4 * 10) +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) + +#define I %esi +#define B %ebx +#define CO %edi +#define AO %edx +#define BO %ecx +#define LDC %ebp + +#define PREFETCH_OFFSET 48 + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_LDC, LDC + movl ARG_B, B + sall $BASE_SHIFT, LDC + + addl $8 * SIZE, A + addl $8 * SIZE, B + + +#ifdef LN + movl M, %eax + sall $BASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $BASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull %ebp, %eax + addl %eax, C +#endif + +#ifdef RN + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + testl $1, %eax + je .L30 + +#if defined(LT) || defined(RN) + movl A, AO +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, I + sarl $1, I + je .L40 + ALIGN_4 + +.L31: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + movl AORIG, AO + leal (AO, %eax, 2), AO + leal (B, %eax, 1), BO +#else + movl B, BO +#endif + + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(BO) + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + FLD -6 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(BO) + FLD -4 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + FLD -2 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $8 * SIZE,AO + addl $4 * SIZE,BO + + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L38 + ALIGN_4 + +.L36: + FLD -8 * SIZE(BO) + + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $2 * SIZE,AO + addl $1 * SIZE,BO + + decl %eax + jne .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + sall $BASE_SHIFT, %eax + + movl AORIG, AO + leal (AO, %eax, 2), AO + leal (B, %eax, 1), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(2) +#endif + +#ifdef LN + FLD -5 * SIZE(AO) + fmulp %st, %st(2) + + FLD -6 * SIZE(AO) + fmul %st(2), %st + + fsubrp %st, %st(1) + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmulp %st, %st(1) + + FLD -7 * SIZE(AO) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LN + subl $2 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(1) + fld %st + FST -7 * SIZE(AO) +#endif + + FST 1 * SIZE(CO) + FST 0 * SIZE(CO) + +#ifndef LN + addl $2 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 1), BO +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl I + jne .L31 + ALIGN_4 + +.L40: + movl M, %eax + andl $1, %eax + je .L49 + ALIGN_4 + +.L41: +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + movl AORIG, AO + leal (AO, %eax, 1), AO + leal (B, %eax, 1), BO +#else + movl B, BO +#endif + + fldz + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -7 * SIZE(AO) + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -6 * SIZE(AO) + FLD -6 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -5 * SIZE(AO) + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addl $4 * SIZE,AO + addl $4 * SIZE,BO + + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L48 + ALIGN_4 + +.L46: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addl $1 * SIZE,AO + addl $1 * SIZE,BO + + decl %eax + jne .L46 + ALIGN_4 + +.L48: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + sall $BASE_SHIFT, %eax + + movl AORIG, AO + leal (AO, %eax, 1), AO + leal (B, %eax, 1), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) +#endif + +#ifdef LN + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef RT + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef LN + subl $1 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) +#endif + + FST 0 * SIZE(CO) + +#ifndef LN + addl $1 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal (BO, %eax, 1), BO +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L49: +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + leal (B, %eax, 1), B +#endif + +#if defined(LT) || defined(RN) + movl BO, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L30: + movl N, %eax + sarl $1, %eax + movl %eax, J + je .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movl A, AO +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + lea (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, I + sarl $1, I + je .L20 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + movl AORIG, AO + leal (AO, %eax, 2), AO + leal (B, %eax, 2), BO +#else + movl B, BO +#endif + + fldz + fldz + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) + prefetchw 2 * SIZE(CO, LDC, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) + prefetchnta 2 * SIZE(CO, LDC, 1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -6 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + + FLD -5 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) + + FLD -4 * SIZE(AO) + + FLD -4 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -3 * SIZE(BO) + fmul %st, %st(2) + + FLD -3 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + FLD -2 * SIZE(AO) + + FLD -2 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -1 * SIZE(BO) + fmul %st, %st(2) + + FLD -1 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + addl $8 * SIZE,AO + addl $8 * SIZE,BO + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L18 + ALIGN_4 + +.L16: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + addl $2 * SIZE,AO + addl $2 * SIZE,BO + + decl %eax + jne .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + sall $BASE_SHIFT, %eax + + movl AORIG, AO + leal (AO, %eax, 2), AO + leal (B, %eax, 2), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) + FLD -6 * SIZE(BO) + fsubp %st, %st(3) + FLD -5 * SIZE(BO) + fsubp %st, %st(4) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(3) + FLD -6 * SIZE(AO) + fsubp %st, %st(2) + FLD -5 * SIZE(AO) + fsubp %st, %st(4) +#endif + +#ifdef LN + FLD -5 * SIZE(AO) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD -6 * SIZE(AO) + fmul %st(3), %st + FLD -6 * SIZE(AO) + fmul %st(5), %st + + fsubrp %st, %st(3) + fsubrp %st, %st(1) + + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st(1), %st + FLD -7 * SIZE(AO) + fmul %st(3), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(3) + + FLD -5 * SIZE(AO) + fmul %st, %st(3) + fmulp %st, %st(4) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st(1), %st + FLD -7 * SIZE(BO) + fmul %st(4), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(2) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + fmulp %st, %st(4) +#endif + +#ifdef RT + FLD -5 * SIZE(BO) + fmul %st, %st(2) + fmulp %st, %st(4) + + FLD -6 * SIZE(BO) + fmul %st(2), %st + FLD -6 * SIZE(BO) + fmul %st(5), %st + + fsubrp %st, %st(4) + fsubrp %st, %st(1) + + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(3) +#endif + +#ifdef LN + subl $2 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) + fxch %st(2) + fld %st + FST -6 * SIZE(BO) + fxch %st(3) + fld %st + FST -5 * SIZE(BO) + + FST 1 * SIZE(CO, LDC) + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) + FST 1 * SIZE(CO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(2) + fld %st + FST -7 * SIZE(AO) + fxch %st(1) + fld %st + FST -6 * SIZE(AO) + fxch %st(3) + fld %st + FST -5 * SIZE(AO) + + FST 1 * SIZE(CO, LDC) + FST 1 * SIZE(CO) + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 2), BO +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl I + jne .L11 + ALIGN_4 + +.L20: + movl M, %eax + andl $1, %eax + je .L29 + ALIGN_4 + +.L21: +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + movl AORIG, AO + leal (AO, %eax, 1), AO + leal (B, %eax, 2), BO +#else + movl B, BO +#endif + + fldz + fldz + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + + FLD -6 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -4 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + + FLD -2 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $4 * SIZE,AO + addl $8 * SIZE,BO + + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L28 + ALIGN_4 + +.L26: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $1 * SIZE,AO + addl $2 * SIZE,BO + + decl %eax + jne .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + sall $BASE_SHIFT, %eax + + movl AORIG, AO + leal (AO, %eax, 1), AO + leal (B, %eax, 2), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(2) +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + + FLD -7 * SIZE(BO) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD -5 * SIZE(BO) + fmulp %st, %st(2) + + FLD -6 * SIZE(BO) + fmul %st(2), %st + + fsubrp %st, %st(1) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef LN + subl $1 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(1) + fld %st + FST -7 * SIZE(AO) +#endif + + FST 0 * SIZE(CO, LDC) + FST 0 * SIZE(CO) + +#ifndef LN + addl $1 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal (BO, %eax, 2), BO +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BO, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J + jne .L01 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/rot.S b/kernel/x86/rot.S new file mode 100644 index 0000000..111266a --- /dev/null +++ b/kernel/x86/rot.S @@ -0,0 +1,388 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) +#define STACK_C 24 + STACK + ARGS(%esp) +#ifdef XDOUBLE +#define STACK_S 40 + STACK + ARGS(%esp) +#elif defined DOUBLE +#define STACK_S 32 + STACK + ARGS(%esp) +#else +#define STACK_S 28 + STACK + ARGS(%esp) +#endif + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#define I %eax + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCH_SIZE 144 +#endif + +#ifdef OPTERON +#define PREFETCH prefetchw +#define PREFETCH_SIZE 144 +#endif + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + FLD STACK_S + FLD STACK_C + + sall $BASE_SHIFT, INCX + sall $BASE_SHIFT, INCY + + testl N, N + jle .L999 + + cmpl $SIZE, INCX + jne .L50 + cmpl $SIZE, INCY + jne .L50 + + movl N, I + sarl $2, I + jle .L15 + ALIGN_4 + +.L10: +#ifdef PENTIUM4 + PREFETCH (PREFETCH_SIZE + 0) * SIZE(X) +#endif +#ifdef OPTERON + PREFETCH (PREFETCH_SIZE + 0) * SIZE(X) +#endif + + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 1 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 1 * SIZE(Y) + +#ifdef PENTIUM4 + PREFETCH (PREFETCH_SIZE + 0) * SIZE(Y) +#endif +#ifdef OPTERON + PREFETCH (PREFETCH_SIZE + 0) * SIZE(Y) +#endif + + FLD 2 * SIZE(X) + FLD 2 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 2 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 2 * SIZE(Y) + + FLD 3 * SIZE(X) + FLD 3 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 3 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 3 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + + decl I + jg .L10 + ALIGN_4 + +.L15: + movl N, I + andl $3, I + jle .L999 + ALIGN_4 + +.L16: + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + addl $SIZE, X + addl $SIZE, Y + + decl I + jg .L16 + jmp .L999 + ALIGN_4 + +.L50: + movl N, I + sarl $2, I + jle .L55 + ALIGN_4 + +.L51: + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + decl I + jg .L51 + ALIGN_4 + +.L55: + movl N, I + andl $3, I + jle .L999 + ALIGN_4 + +.L56: + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + decl I + jg .L56 + ALIGN_4 + + +.L999: + ffreep %st(0) + ffreep %st(0) + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/rot_sse.S b/kernel/x86/rot_sse.S new file mode 100644 index 0000000..af9f12f --- /dev/null +++ b/kernel/x86/rot_sse.S @@ -0,0 +1,1119 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) +#define STACK_C 24 + STACK + ARGS(%esp) +#define STACK_S 28 + STACK + ARGS(%esp) + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#define I %eax + +#define C %xmm6 +#define S %xmm7 + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + movss STACK_C, C + movss STACK_S, S + + shufps $0x0, C, C + shufps $0x0, S, S + + cmpl $0, N + jle .L999 + + cmpl $SIZE, INCX + jne .L50 + cmpl $SIZE, INCY + jne .L50 + + testl $SIZE, X + je .L05 + + movss 0 * SIZE(Y), %xmm1 + movss 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, 0 * SIZE(X) + movss %xmm2, 0 * SIZE(Y) + + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl N + jle .L999 + +.L05: + testl $2 * SIZE, X + je .L10 + + cmpl $1, N + je .L17 + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 +#endif + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + subl $2, N + jle .L999 + ALIGN_2 + +.L10: + testl $3 * SIZE, Y + jne .L20 + + movl N, I + sarl $5, I + jle .L14 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + movsd 4 * SIZE(Y), %xmm1 + movhps 6 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps %xmm0, 4 * SIZE(X) + movlps %xmm2, 4 * SIZE(Y) + movhps %xmm2, 6 * SIZE(Y) + + movsd 8 * SIZE(Y), %xmm1 + movhps 10 * SIZE(Y), %xmm1 + movaps 8 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 8 * SIZE(X) + movlps %xmm2, 8 * SIZE(Y) + movhps %xmm2, 10 * SIZE(Y) + + movsd 12 * SIZE(Y), %xmm1 + movhps 14 * SIZE(Y), %xmm1 + movaps 12 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 12 * SIZE(X) + movlps %xmm2, 12 * SIZE(Y) + movhps %xmm2, 14 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd 16 * SIZE(Y), %xmm1 + movhps 18 * SIZE(Y), %xmm1 + movaps 16 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 16 * SIZE(X) + movlps %xmm2, 16 * SIZE(Y) + movhps %xmm2, 18 * SIZE(Y) + + movsd 20 * SIZE(Y), %xmm1 + movhps 22 * SIZE(Y), %xmm1 + movaps 20 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 20 * SIZE(X) + movlps %xmm2, 20 * SIZE(Y) + movhps %xmm2, 22 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movsd 24 * SIZE(Y), %xmm1 + movhps 26 * SIZE(Y), %xmm1 + movaps 24 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 24 * SIZE(X) + movlps %xmm2, 24 * SIZE(Y) + movhps %xmm2, 26 * SIZE(Y) + + movsd 28 * SIZE(Y), %xmm1 + movhps 30 * SIZE(Y), %xmm1 + movaps 28 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 28 * SIZE(X) + movlps %xmm2, 28 * SIZE(Y) + movhps %xmm2, 30 * SIZE(Y) + + addl $32 * SIZE, X + addl $32 * SIZE, Y + + decl I + jg .L11 + ALIGN_3 + +.L14: + testl $31, N + jle .L999 + + testl $16, N + jle .L15 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + movsd 4 * SIZE(Y), %xmm1 + movhps 6 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 4 * SIZE(X) + movlps %xmm2, 4 * SIZE(Y) + movhps %xmm2, 6 * SIZE(Y) + + movsd 8 * SIZE(Y), %xmm1 + movhps 10 * SIZE(Y), %xmm1 + movaps 8 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 8 * SIZE(X) + movlps %xmm2, 8 * SIZE(Y) + movhps %xmm2, 10 * SIZE(Y) + + movsd 12 * SIZE(Y), %xmm1 + movhps 14 * SIZE(Y), %xmm1 + movaps 12 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 12 * SIZE(X) + movlps %xmm2, 12 * SIZE(Y) + movhps %xmm2, 14 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L15: + testl $8, N + jle .L16 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + movsd 4 * SIZE(Y), %xmm1 + movhps 6 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 4 * SIZE(X) + movlps %xmm2, 4 * SIZE(Y) + movhps %xmm2, 6 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L16: + testl $4, N + jle .L17 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L17: + testl $2, N + jle .L18 + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 +#endif + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L18: + testl $1, N + jle .L999 + + movss 0 * SIZE(Y), %xmm1 + movss 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, 0 * SIZE(X) + movss %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L20: + movl N, I + sarl $5, I + jle .L24 + ALIGN_3 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + movsd 4 * SIZE(Y), %xmm1 + movhps 6 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 4 * SIZE(X) + movlps %xmm2, 4 * SIZE(Y) + movhps %xmm2, 6 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movsd 8 * SIZE(Y), %xmm1 + movhps 10 * SIZE(Y), %xmm1 + movaps 8 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 8 * SIZE(X) + movlps %xmm2, 8 * SIZE(Y) + movhps %xmm2, 10 * SIZE(Y) + + movsd 12 * SIZE(Y), %xmm1 + movhps 14 * SIZE(Y), %xmm1 + movaps 12 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 12 * SIZE(X) + movlps %xmm2, 12 * SIZE(Y) + movhps %xmm2, 14 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd 16 * SIZE(Y), %xmm1 + movhps 18 * SIZE(Y), %xmm1 + movaps 16 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 16 * SIZE(X) + movlps %xmm2, 16 * SIZE(Y) + movhps %xmm2, 18 * SIZE(Y) + + movsd 20 * SIZE(Y), %xmm1 + movhps 22 * SIZE(Y), %xmm1 + movaps 20 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 20 * SIZE(X) + movlps %xmm2, 20 * SIZE(Y) + movhps %xmm2, 22 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movsd 24 * SIZE(Y), %xmm1 + movhps 26 * SIZE(Y), %xmm1 + movaps 24 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 24 * SIZE(X) + movlps %xmm2, 24 * SIZE(Y) + movhps %xmm2, 26 * SIZE(Y) + + movsd 28 * SIZE(Y), %xmm1 + movhps 30 * SIZE(Y), %xmm1 + movaps 28 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 28 * SIZE(X) + movlps %xmm2, 28 * SIZE(Y) + movhps %xmm2, 30 * SIZE(Y) + + addl $32 * SIZE, X + addl $32 * SIZE, Y + decl I + jg .L21 + ALIGN_3 + +.L24: + testl $31, N + jle .L999 + + testl $16, N + jle .L25 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + movsd 4 * SIZE(Y), %xmm1 + movhps 6 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 4 * SIZE(X) + movlps %xmm2, 4 * SIZE(Y) + movhps %xmm2, 6 * SIZE(Y) + + movsd 8 * SIZE(Y), %xmm1 + movhps 10 * SIZE(Y), %xmm1 + movaps 8 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 8 * SIZE(X) + movlps %xmm2, 8 * SIZE(Y) + movhps %xmm2, 10 * SIZE(Y) + + movsd 12 * SIZE(Y), %xmm1 + movhps 14 * SIZE(Y), %xmm1 + movaps 12 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 12 * SIZE(X) + movlps %xmm2, 12 * SIZE(Y) + movhps %xmm2, 14 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L25: + testl $8, N + jle .L26 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + movsd 4 * SIZE(Y), %xmm1 + movhps 6 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 4 * SIZE(X) + movlps %xmm2, 4 * SIZE(Y) + movhps %xmm2, 6 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + + +.L26: + testl $4, N + jle .L27 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L27: + testl $2, N + jle .L28 + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 +#endif + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L28: + testl $1, N + jle .L999 + + movss 0 * SIZE(Y), %xmm1 + movss 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, 0 * SIZE(X) + movss %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L50: + movl N, I + sarl $2, I + jle .L55 + ALIGN_3 + +.L53: + movss (Y), %xmm1 + movss (X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, (X) + movss %xmm2, (Y) + + addl INCX, X + addl INCY, Y + + movss (Y), %xmm1 + movss (X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, (X) + movss %xmm2, (Y) + + addl INCX, X + addl INCY, Y + + movss (Y), %xmm1 + movss (X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, (X) + movss %xmm2, (Y) + + addl INCX, X + addl INCY, Y + + movss (Y), %xmm1 + movss (X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, (X) + movss %xmm2, (Y) + + addl INCX, X + addl INCY, Y + + decl I + jg .L53 + ALIGN_3 + +.L55: + movl N, I + andl $3, I + jle .L999 + ALIGN_3 + +.L56: + movss (Y), %xmm1 + movss (X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, (X) + movss %xmm2, (Y) + + addl INCX, X + addl INCY, Y + + decl I + jg .L56 + ALIGN_3 + +.L999: + popl %ebx + popl %esi + popl %edi + + ret + + EPILOGUE diff --git a/kernel/x86/rot_sse2.S b/kernel/x86/rot_sse2.S new file mode 100644 index 0000000..8ec1d44 --- /dev/null +++ b/kernel/x86/rot_sse2.S @@ -0,0 +1,960 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) +#define STACK_C 24 + STACK + ARGS(%esp) +#define STACK_S 32 + STACK + ARGS(%esp) + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#define I %eax + +#include "l1param.h" + +#define C %xmm6 +#define S %xmm7 + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + movsd STACK_C, C + movsd STACK_S, S + + pshufd $0x44, C, C + pshufd $0x44, S, S + + cmpl $0, N + jle .L999 + + cmpl $SIZE, INCX + jne .L50 + cmpl $SIZE, INCY + jne .L50 + + testl $SIZE, X + je .L10 + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulsd C, %xmm0 + mulsd S, %xmm1 + + mulsd C, %xmm2 + mulsd S, %xmm3 + + addsd %xmm1, %xmm0 + subsd %xmm3, %xmm2 + + movsd %xmm0, 0 * SIZE(X) + movsd %xmm2, 0 * SIZE(Y) + + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl N + jle .L999 + ALIGN_2 + +.L10: + testl $SIZE, Y + jne .L20 + + movl N, I + sarl $4, I + jle .L14 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + + movapd 2 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movapd %xmm2, 2 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movapd 4 * SIZE(Y), %xmm1 + movapd 4 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 4 * SIZE(X) + movapd %xmm2, 4 * SIZE(Y) + + movapd 6 * SIZE(Y), %xmm1 + movapd 6 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 6 * SIZE(X) + movapd %xmm2, 6 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movapd 8 * SIZE(Y), %xmm1 + movapd 8 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 8 * SIZE(X) + movapd %xmm2, 8 * SIZE(Y) + + movapd 10 * SIZE(Y), %xmm1 + movapd 10 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 10 * SIZE(X) + movapd %xmm2, 10 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movapd 12 * SIZE(Y), %xmm1 + movapd 12 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 12 * SIZE(X) + movapd %xmm2, 12 * SIZE(Y) + + movapd 14 * SIZE(Y), %xmm1 + movapd 14 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 14 * SIZE(X) + movapd %xmm2, 14 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + + decl I + jg .L11 + ALIGN_3 + +.L14: + testl $15, N + jle .L999 + + testl $8, N + jle .L15 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + + movapd 2 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movapd %xmm2, 2 * SIZE(Y) + + movapd 4 * SIZE(Y), %xmm1 + movapd 4 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 4 * SIZE(X) + movapd %xmm2, 4 * SIZE(Y) + + movapd 6 * SIZE(Y), %xmm1 + movapd 6 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 6 * SIZE(X) + movapd %xmm2, 6 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L15: + testl $4, N + jle .L16 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + + movapd 2 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movapd %xmm2, 2 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L16: + testl $2, N + jle .L17 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L17: + testl $1, N + jle .L999 + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulsd C, %xmm0 + mulsd S, %xmm1 + + mulsd C, %xmm2 + mulsd S, %xmm3 + + addsd %xmm1, %xmm0 + subsd %xmm3, %xmm2 + + movsd %xmm0, 0 * SIZE(X) + movsd %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L20: + movapd -1 * SIZE(Y), %xmm1 + + movl N, I + sarl $4, I + jle .L24 + ALIGN_3 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movapd 1 * SIZE(Y), %xmm4 + movapd 0 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movlpd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + + movapd 3 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm1, %xmm4 + movapd %xmm4, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm4 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm4, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movlpd %xmm2, 2 * SIZE(Y) + movhpd %xmm2, 3 * SIZE(Y) + + movapd 5 * SIZE(Y), %xmm4 + movapd 4 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 4 * SIZE(X) + movlpd %xmm2, 4 * SIZE(Y) + movhpd %xmm2, 5 * SIZE(Y) + + movapd 7 * SIZE(Y), %xmm1 + movapd 6 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm1, %xmm4 + movapd %xmm4, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm4 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm4, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 6 * SIZE(X) + movlpd %xmm2, 6 * SIZE(Y) + movhpd %xmm2, 7 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movapd 9 * SIZE(Y), %xmm4 + movapd 8 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 8 * SIZE(X) + movlpd %xmm2, 8 * SIZE(Y) + movhpd %xmm2, 9 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movapd 11 * SIZE(Y), %xmm1 + movapd 10 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm1, %xmm4 + movapd %xmm4, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm4 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm4, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 10 * SIZE(X) + movlpd %xmm2, 10 * SIZE(Y) + movhpd %xmm2, 11 * SIZE(Y) + + movapd 13 * SIZE(Y), %xmm4 + movapd 12 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 12 * SIZE(X) + movlpd %xmm2, 12 * SIZE(Y) + movhpd %xmm2, 13 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movapd 15 * SIZE(Y), %xmm1 + movapd 14 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm1, %xmm4 + movapd %xmm4, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm4 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm4, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 14 * SIZE(X) + movlpd %xmm2, 14 * SIZE(Y) + movhpd %xmm2, 15 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + decl I + jg .L21 + ALIGN_3 + +.L24: + testl $15, N + jle .L999 + + testl $8, N + jle .L25 + + movapd 1 * SIZE(Y), %xmm4 + movapd 0 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movlpd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + + movapd 3 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm1, %xmm4 + movapd %xmm4, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm4 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm4, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movlpd %xmm2, 2 * SIZE(Y) + movhpd %xmm2, 3 * SIZE(Y) + + movapd 5 * SIZE(Y), %xmm4 + movapd 4 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 4 * SIZE(X) + movlpd %xmm2, 4 * SIZE(Y) + movhpd %xmm2, 5 * SIZE(Y) + + movapd 7 * SIZE(Y), %xmm1 + movapd 6 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm1, %xmm4 + movapd %xmm4, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm4 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm4, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 6 * SIZE(X) + movlpd %xmm2, 6 * SIZE(Y) + movhpd %xmm2, 7 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L25: + testl $4, N + jle .L26 + + movapd 1 * SIZE(Y), %xmm4 + movapd 0 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movlpd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + + movapd 3 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm1, %xmm4 + movapd %xmm4, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm4 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm4, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movlpd %xmm2, 2 * SIZE(Y) + movhpd %xmm2, 3 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L26: + testl $2, N + jle .L27 + + movapd 1 * SIZE(Y), %xmm4 + movapd 0 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movlpd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + movapd %xmm4, %xmm1 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L27: + testl $1, N + jle .L999 + + unpckhpd %xmm1, %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulsd C, %xmm0 + mulsd S, %xmm1 + + mulsd C, %xmm2 + mulsd S, %xmm3 + + addsd %xmm1, %xmm0 + subsd %xmm3, %xmm2 + + movsd %xmm0, 0 * SIZE(X) + movsd %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L50: + movl N, I + sarl $2, I + jle .L55 + ALIGN_3 + +.L53: + movsd (Y), %xmm1 + movhpd (Y, INCY), %xmm1 + movsd (X), %xmm0 + movhpd (X, INCX), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, (X) + movhpd %xmm0, (X, INCX) + movlpd %xmm2, (Y) + movhpd %xmm2, (Y, INCY) + + leal (X, INCX, 2), X + leal (Y, INCY, 2), Y + + movsd (Y), %xmm1 + movhpd (Y, INCY), %xmm1 + movsd (X), %xmm0 + movhpd (X, INCX), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, (X) + movhpd %xmm0, (X, INCX) + movlpd %xmm2, (Y) + movhpd %xmm2, (Y, INCY) + + leal (X, INCX, 2), X + leal (Y, INCY, 2), Y + + decl I + jg .L53 + ALIGN_3 + +.L55: + movl N, I + andl $3, I + jle .L999 + ALIGN_3 + +.L56: + movsd (Y), %xmm1 + movsd (X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulsd C, %xmm0 + mulsd S, %xmm1 + + mulsd C, %xmm2 + mulsd S, %xmm3 + + addsd %xmm1, %xmm0 + subsd %xmm3, %xmm2 + + movsd %xmm0, (X) + movsd %xmm2, (Y) + + addl INCX, X + addl INCY, Y + + decl I + jg .L56 + ALIGN_3 + +.L999: + popl %ebx + popl %esi + popl %edi + + ret + + EPILOGUE diff --git a/kernel/x86/scal.S b/kernel/x86/scal.S new file mode 100644 index 0000000..377d4ef --- /dev/null +++ b/kernel/x86/scal.S @@ -0,0 +1,352 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl 16(%esp),%edx + FLD 28(%esp) + +#ifdef XDOUBLE + movl 44(%esp),%edi + movl 48(%esp),%esi +#elif defined(DOUBLE) + movl 36(%esp),%edi + movl 40(%esp),%esi +#else + movl 32(%esp),%edi + movl 36(%esp),%esi +#endif + + ftst + fnstsw %ax + andb $68, %ah + je .L300 # Alpha != ZERO + +/* Alpha == ZERO */ + cmpl $1,%esi + jne .L104 + + movl %edx, %ecx # ecx = n + sarl $3, %ecx # (n >> 3) + jle .L102 + ALIGN_4 + +.L101: +#ifndef XDOUBLE + FSTU 0 * SIZE(%edi) + FSTU 1 * SIZE(%edi) + FSTU 2 * SIZE(%edi) + FSTU 3 * SIZE(%edi) + FSTU 4 * SIZE(%edi) + FSTU 5 * SIZE(%edi) + FSTU 6 * SIZE(%edi) + FSTU 7 * SIZE(%edi) +#else + fld %st + FST 0 * SIZE(%edi) + fld %st + FST 1 * SIZE(%edi) + fld %st + FST 2 * SIZE(%edi) + fld %st + FST 3 * SIZE(%edi) + fld %st + FST 4 * SIZE(%edi) + fld %st + FST 5 * SIZE(%edi) + fld %st + FST 6 * SIZE(%edi) + fld %st + FST 7 * SIZE(%edi) +#endif + + addl $8 * SIZE, %edi + decl %ecx + jg .L101 + ALIGN_4 + +.L102: + movl %edx, %ecx + andl $7, %ecx + jle .L999 + ALIGN_4 + +.L103: +#ifndef XDOUBLE + FSTU 0 * SIZE(%edi) +#else + fld %st + FST 0 * SIZE(%edi) +#endif + addl $SIZE, %edi + decl %ecx + jg .L103 + jmp .L999 + ALIGN_4 + +.L104: + sall $BASE_SHIFT, %esi + + movl %edx, %ecx # ecx = n + sarl $3, %ecx # (n >> 3) + jle .L106 + ALIGN_4 + +.L105: +#ifndef XDOUBLE + FSTU 0 * SIZE(%edi) + addl %esi, %edi + FSTU 0 * SIZE(%edi) + addl %esi, %edi + FSTU 0 * SIZE(%edi) + addl %esi, %edi + FSTU 0 * SIZE(%edi) + addl %esi, %edi + FSTU 0 * SIZE(%edi) + addl %esi, %edi + FSTU 0 * SIZE(%edi) + addl %esi, %edi + FSTU 0 * SIZE(%edi) + addl %esi, %edi + FSTU 0 * SIZE(%edi) + addl %esi, %edi +#else + fld %st + FST 0 * SIZE(%edi) + addl %esi, %edi + fld %st + FST 0 * SIZE(%edi) + addl %esi, %edi + fld %st + FST 0 * SIZE(%edi) + addl %esi, %edi + fld %st + FST 0 * SIZE(%edi) + addl %esi, %edi + fld %st + FST 0 * SIZE(%edi) + addl %esi, %edi + fld %st + FST 0 * SIZE(%edi) + addl %esi, %edi + fld %st + FST 0 * SIZE(%edi) + addl %esi, %edi + fld %st + FST 0 * SIZE(%edi) + addl %esi, %edi +#endif + + decl %ecx + jg .L105 + ALIGN_4 + +.L106: + movl %edx, %ecx + andl $7, %ecx + jle .L999 + ALIGN_4 + +.L107: +#ifndef XDOUBLE + FSTU 0 * SIZE(%edi) +#else + fld %st + FST 0 * SIZE(%edi) +#endif + addl %esi, %edi + decl %ecx + jg .L107 + jmp .L999 + ALIGN_4 + +/* Alpha != ZERO */ + +.L300: + cmpl $1,%esi + jne .L304 + + movl %edx, %ecx # ecx = n + sarl $3, %ecx # (n >> 3) + jle .L302 + ALIGN_4 + +.L301: + FLD 0 * SIZE(%edi) + fmul %st(1), %st + FST 0 * SIZE(%edi) + + FLD 1 * SIZE(%edi) + fmul %st(1), %st + FST 1 * SIZE(%edi) + + FLD 2 * SIZE(%edi) + fmul %st(1), %st + FST 2 * SIZE(%edi) + + FLD 3 * SIZE(%edi) + fmul %st(1), %st + FST 3 * SIZE(%edi) + + FLD 4 * SIZE(%edi) + fmul %st(1), %st + FST 4 * SIZE(%edi) + + FLD 5 * SIZE(%edi) + fmul %st(1), %st + FST 5 * SIZE(%edi) + + FLD 6 * SIZE(%edi) + fmul %st(1), %st + FST 6 * SIZE(%edi) + + FLD 7 * SIZE(%edi) + fmul %st(1), %st + FST 7 * SIZE(%edi) + + addl $8 * SIZE, %edi + decl %ecx + jg .L301 + ALIGN_4 + +.L302: + movl %edx, %ecx + andl $7, %ecx + jle .L999 + ALIGN_4 + +.L303: + FLD 0 * SIZE(%edi) + fmul %st(1), %st + FST 0 * SIZE(%edi) + addl $SIZE, %edi + decl %ecx + jg .L303 + jmp .L999 + ALIGN_4 + +.L304: + sall $BASE_SHIFT, %esi + + movl %edx, %ecx # ecx = n + sarl $3, %ecx # (n >> 3) + jle .L306 + ALIGN_4 + +.L305: + FLD 0 * SIZE(%edi) + fmul %st(1), %st + FST 0 * SIZE(%edi) + addl %esi, %edi + + FLD 0 * SIZE(%edi) + fmul %st(1), %st + FST 0 * SIZE(%edi) + addl %esi, %edi + + FLD 0 * SIZE(%edi) + fmul %st(1), %st + FST 0 * SIZE(%edi) + addl %esi, %edi + + FLD 0 * SIZE(%edi) + fmul %st(1), %st + FST 0 * SIZE(%edi) + addl %esi, %edi + + FLD 0 * SIZE(%edi) + fmul %st(1), %st + FST 0 * SIZE(%edi) + addl %esi, %edi + + FLD 0 * SIZE(%edi) + fmul %st(1), %st + FST 0 * SIZE(%edi) + addl %esi, %edi + + FLD 0 * SIZE(%edi) + fmul %st(1), %st + FST 0 * SIZE(%edi) + addl %esi, %edi + + FLD 0 * SIZE(%edi) + fmul %st(1), %st + FST 0 * SIZE(%edi) + addl %esi, %edi + + decl %ecx + jg .L305 + ALIGN_4 + +.L306: + movl %edx, %ecx + andl $7, %ecx + jle .L999 + ALIGN_4 + +.L307: + FLD 0 * SIZE(%edi) + fmul %st(1), %st + FST 0 * SIZE(%edi) + addl %esi, %edi + decl %ecx + jg .L307 + ALIGN_4 + +.L999: + ffreep %st(0) + xorl %eax,%eax + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/scal_sse.S b/kernel/x86/scal_sse.S new file mode 100644 index 0000000..aa5ab76 --- /dev/null +++ b/kernel/x86/scal_sse.S @@ -0,0 +1,637 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_ALPHA 16 + STACK + ARGS(%esp) +#define STACK_X 20 + STACK + ARGS(%esp) +#define STACK_INCX 24 + STACK + ARGS(%esp) + +#define M %ebx +#define X %ecx +#define INCX %edx +#define I %esi +#define XX %edi + +#include "l1param.h" + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + + lea (, INCX, SIZE), INCX + + movss STACK_ALPHA, %xmm0 + + testl M, M + jle .L999 + + xorps %xmm1, %xmm1 + comiss %xmm0, %xmm1 + shufps $0, %xmm0, %xmm0 + + jne .L100 + +/* Alpha == ZERO */ + cmpl $SIZE, INCX + jne .L50 + +/* INCX == 1 */ + cmpl $3, M + jle .L14 + + testl $4, X # aligned for double word? + je .L05 + + movss %xmm1, 0 * SIZE(X) + addl $SIZE, X + decl M + jle .L999 + ALIGN_3 + +.L05: + testl $8, X # aligned for quad word? + je .L06 + + movsd %xmm1, 0 * SIZE(X) + addl $2 * SIZE, X + subl $2, M + jle .L999 + ALIGN_3 + +.L06: + movl M, I + sarl $4, I + jle .L12 + ALIGN_4 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, 0 * SIZE(X) + movaps %xmm1, 4 * SIZE(X) + movaps %xmm1, 8 * SIZE(X) + movaps %xmm1, 12 * SIZE(X) + addl $16 * SIZE, X + decl I + jg .L11 + ALIGN_4 + +.L12: + testl $15, M + je .L999 + testl $8, M + je .L13 + + movaps %xmm1, 0 * SIZE(X) + movaps %xmm1, 4 * SIZE(X) + addl $8 * SIZE, X + ALIGN_3 + +.L13: + testl $4, M + je .L14 + + movaps %xmm1, 0 * SIZE(X) + addl $4 * SIZE, X + ALIGN_3 + +.L14: + testl $2, M + je .L15 + + movsd %xmm1, 0 * SIZE(X) + addl $2 * SIZE, X + ALIGN_3 + +.L15: + testl $1, M + je .L999 + + movss %xmm1, 0 * SIZE(X) + jmp .L999 + ALIGN_4 + +/* incx != 1 */ +.L50: + movl M, I # rcx = n + sarl $3, I # (n >> 3) + jle .L52 + ALIGN_4 + +.L51: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm1, (X) + addl INCX, X + movss %xmm1, (X) + addl INCX, X + movss %xmm1, (X) + addl INCX, X + movss %xmm1, (X) + addl INCX, X + movss %xmm1, (X) + addl INCX, X + movss %xmm1, (X) + addl INCX, X + movss %xmm1, (X) + addl INCX, X + movss %xmm1, (X) + addl INCX, X + + decl I + jg .L51 + ALIGN_4 + +.L52: + testl $7, M + je .L999 + + testl $4, M + je .L53 + + movss %xmm1, (X) + addl INCX, X + movss %xmm1, (X) + addl INCX, X + movss %xmm1, (X) + addl INCX, X + movss %xmm1, (X) + addl INCX, X + ALIGN_3 + +.L53: + testl $2, M + je .L54 + + movss %xmm1, (X) + addl INCX, X + movss %xmm1, (X) + addl INCX, X + ALIGN_3 + +.L54: + testl $1, M + je .L999 + + movss %xmm1, (X) + jmp .L999 + ALIGN_4 + +/* Alpha != ZERO */ + +.L100: + cmpl $SIZE, INCX + jne .L150 + + subl $-32 * SIZE, X + + cmpl $3, M + jle .L116 + + testl $SIZE, X + je .L105 + + movss -32 * SIZE(X), %xmm1 + mulss %xmm0, %xmm1 + movss %xmm1, -32 * SIZE(X) + addl $SIZE, X + decl M + jle .L999 + ALIGN_3 + +.L105: + testl $2 * SIZE, X + je .L110 + + movsd -32 * SIZE(X), %xmm1 + mulps %xmm0, %xmm1 + movsd %xmm1, -32 * SIZE(X) + addl $2 * SIZE, X + subl $2, M + jle .L999 + ALIGN_3 + +.L110: + movl M, I + sarl $5, I + jle .L113 + +#if defined(BARCELONA) + + movaps %xmm0, %xmm1 + mulps -32 * SIZE(X), %xmm1 + movaps %xmm0, %xmm2 + mulps -28 * SIZE(X), %xmm2 + movaps %xmm0, %xmm3 + mulps -24 * SIZE(X), %xmm3 + movaps %xmm0, %xmm4 + mulps -20 * SIZE(X), %xmm4 + + decl I + jle .L112 + ALIGN_4 + +.L111: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, -32 * SIZE(X) + movaps %xmm0, %xmm1 + mulps -16 * SIZE(X), %xmm1 + + movaps %xmm2, -28 * SIZE(X) + movaps %xmm0, %xmm2 + mulps -12 * SIZE(X), %xmm2 + + movaps %xmm3, -24 * SIZE(X) + movaps %xmm0, %xmm3 + mulps -8 * SIZE(X), %xmm3 + + movaps %xmm4, -20 * SIZE(X) + movaps %xmm0, %xmm4 + mulps -4 * SIZE(X), %xmm4 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm1, -16 * SIZE(X) + movaps %xmm0, %xmm1 + mulps 0 * SIZE(X), %xmm1 + + movaps %xmm2, -12 * SIZE(X) + movaps %xmm0, %xmm2 + mulps 4 * SIZE(X), %xmm2 + + movaps %xmm3, -8 * SIZE(X) + movaps %xmm0, %xmm3 + mulps 8 * SIZE(X), %xmm3 + + movaps %xmm4, -4 * SIZE(X) + movaps %xmm0, %xmm4 + mulps 12 * SIZE(X), %xmm4 + + subl $-32 * SIZE, X + decl I + jg .L111 + ALIGN_4 + +.L112: + movaps %xmm1, -32 * SIZE(X) + movaps %xmm0, %xmm1 + mulps -16 * SIZE(X), %xmm1 + + movaps %xmm2, -28 * SIZE(X) + movaps %xmm0, %xmm2 + mulps -12 * SIZE(X), %xmm2 + + movaps %xmm3, -24 * SIZE(X) + movaps %xmm0, %xmm3 + mulps -8 * SIZE(X), %xmm3 + + movaps %xmm4, -20 * SIZE(X) + movaps %xmm0, %xmm4 + mulps -4 * SIZE(X), %xmm4 + + movaps %xmm1, -16 * SIZE(X) + movaps %xmm2, -12 * SIZE(X) + movaps %xmm3, -8 * SIZE(X) + movaps %xmm4, -4 * SIZE(X) + +#else + + movaps -32 * SIZE(X), %xmm1 + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + movaps -20 * SIZE(X), %xmm4 + + decl I + jle .L112 + ALIGN_4 + +.L111: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulps %xmm0, %xmm1 + movaps %xmm1, -32 * SIZE(X) + movaps -16 * SIZE(X), %xmm1 + + mulps %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(X) + movaps -12 * SIZE(X), %xmm2 + + mulps %xmm0, %xmm3 + movaps %xmm3, -24 * SIZE(X) + movaps -8 * SIZE(X), %xmm3 + + mulps %xmm0, %xmm4 + movaps %xmm4, -20 * SIZE(X) + movaps -4 * SIZE(X), %xmm4 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulps %xmm0, %xmm1 + movaps %xmm1, -16 * SIZE(X) + movaps 0 * SIZE(X), %xmm1 + + mulps %xmm0, %xmm2 + movaps %xmm2, -12 * SIZE(X) + movaps 4 * SIZE(X), %xmm2 + + mulps %xmm0, %xmm3 + movaps %xmm3, -8 * SIZE(X) + movaps 8 * SIZE(X), %xmm3 + + mulps %xmm0, %xmm4 + movaps %xmm4, -4 * SIZE(X) + movaps 12 * SIZE(X), %xmm4 + + subl $-32 * SIZE, X + decl I + jg .L111 + ALIGN_4 + +.L112: + mulps %xmm0, %xmm1 + movaps %xmm1, -32 * SIZE(X) + movaps -16 * SIZE(X), %xmm1 + + mulps %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(X) + movaps -12 * SIZE(X), %xmm2 + + mulps %xmm0, %xmm3 + movaps %xmm3, -24 * SIZE(X) + movaps -8 * SIZE(X), %xmm3 + + mulps %xmm0, %xmm4 + movaps %xmm4, -20 * SIZE(X) + movaps -4 * SIZE(X), %xmm4 + + mulps %xmm0, %xmm1 + movaps %xmm1, -16 * SIZE(X) + mulps %xmm0, %xmm2 + movaps %xmm2, -12 * SIZE(X) + mulps %xmm0, %xmm3 + movaps %xmm3, -8 * SIZE(X) + mulps %xmm0, %xmm4 + movaps %xmm4, -4 * SIZE(X) + +#endif + + subl $-32 * SIZE, X + ALIGN_3 + +.L113: + testl $31, M + je .L999 + + testl $16, M + je .L114 + + movaps -32 * SIZE(X), %xmm1 + movaps -28 * SIZE(X), %xmm3 + movaps -24 * SIZE(X), %xmm5 + movaps -20 * SIZE(X), %xmm7 + + mulps %xmm0, %xmm1 + movaps %xmm1, -32 * SIZE(X) + mulps %xmm0, %xmm3 + movaps %xmm3, -28 * SIZE(X) + mulps %xmm0, %xmm5 + movaps %xmm5, -24 * SIZE(X) + mulps %xmm0, %xmm7 + movaps %xmm7, -20 * SIZE(X) + + addl $16 * SIZE, X + ALIGN_3 + +.L114: + testl $8, M + je .L115 + + movaps -32 * SIZE(X), %xmm1 + movaps -28 * SIZE(X), %xmm3 + + mulps %xmm0, %xmm1 + movaps %xmm1, -32 * SIZE(X) + mulps %xmm0, %xmm3 + movaps %xmm3, -28 * SIZE(X) + addl $8 * SIZE, X + ALIGN_3 + +.L115: + testl $4, M + je .L116 + + movaps -32 * SIZE(X), %xmm1 + mulps %xmm0, %xmm1 + movaps %xmm1, -32 * SIZE(X) + addl $4 * SIZE, X + ALIGN_3 + +.L116: + testl $2, M + je .L117 + + movsd -32 * SIZE(X), %xmm1 + mulps %xmm0, %xmm1 + movsd %xmm1, -32 * SIZE(X) + addl $2 * SIZE, X + ALIGN_3 + +.L117: + testl $1, M + je .L999 + + movss -32 * SIZE(X), %xmm1 + mulss %xmm0, %xmm1 + movss %xmm1, -32 * SIZE(X) + jmp .L999 + ALIGN_3 + +/* incx != 1 */ + +.L150: + movl X, XX + movl M, I # rcx = n + sarl $3, I # (n >> 3) + jle .L152 + ALIGN_4 + +.L151: + movss (X), %xmm1 + addl INCX, X + movss (X), %xmm2 + addl INCX, X + movss (X), %xmm3 + addl INCX, X + movss (X), %xmm4 + addl INCX, X + + mulss %xmm0, %xmm1 + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + + movss %xmm1, (XX) + addl INCX, XX + movss %xmm2, (XX) + addl INCX, XX + movss %xmm3, (XX) + addl INCX, XX + movss %xmm4, (XX) + addl INCX, XX + + movss (X), %xmm1 + addl INCX, X + movss (X), %xmm2 + addl INCX, X + movss (X), %xmm3 + addl INCX, X + movss (X), %xmm4 + addl INCX, X + + mulss %xmm0, %xmm1 + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + + movss %xmm1, (XX) + addl INCX, XX + movss %xmm2, (XX) + addl INCX, XX + movss %xmm3, (XX) + addl INCX, XX + movss %xmm4, (XX) + addl INCX, XX + + decl I + jg .L151 + ALIGN_4 + +.L152: + testl $7, M + je .L999 + + testl $4, M + je .L153 + + movss (X), %xmm1 + addl INCX, X + movss (X), %xmm2 + addl INCX, X + movss (X), %xmm3 + addl INCX, X + movss (X), %xmm4 + addl INCX, X + + mulss %xmm0, %xmm1 + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + + movss %xmm1, (XX) + addl INCX, XX + movss %xmm2, (XX) + addl INCX, XX + movss %xmm3, (XX) + addl INCX, XX + movss %xmm4, (XX) + addl INCX, XX + ALIGN_3 + +.L153: + testl $2, M + je .L154 + + movss (X), %xmm1 + addl INCX, X + movss (X), %xmm2 + addl INCX, X + + mulss %xmm0, %xmm1 + mulss %xmm0, %xmm2 + + movss %xmm1, (XX) + addl INCX, XX + movss %xmm2, (XX) + addl INCX, XX + ALIGN_3 + +.L154: + testl $1, M + je .L999 + + movss (X), %xmm1 + mulss %xmm0, %xmm1 + movss %xmm1, (X) + ALIGN_4 + +.L999: + xorl %eax, %eax + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/scal_sse2.S b/kernel/x86/scal_sse2.S new file mode 100644 index 0000000..dab5434 --- /dev/null +++ b/kernel/x86/scal_sse2.S @@ -0,0 +1,556 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_ALPHA 16 + STACK + ARGS(%esp) +#define STACK_X 24 + STACK + ARGS(%esp) +#define STACK_INCX 28 + STACK + ARGS(%esp) + +#define M %ebx +#define X %ecx +#define INCX %edx +#define I %esi +#define XX %edi + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + + movsd STACK_ALPHA, %xmm0 + + testl M, M + jle .L999 + + leal (, INCX, SIZE), INCX + + xorps %xmm1, %xmm1 + comisd %xmm0, %xmm1 + jne .L100 # Alpha != ZERO + +/* Alpha == ZERO */ + cmpl $SIZE, INCX + jne .L50 + +/* INCX == 1 */ + testl $15, X # aligned for quad word? + je .L05 + + movsd %xmm1, 0 * SIZE(X) + addl $SIZE, X + decl M + jle .L999 + ALIGN_3 +.L05: + +/* Aligned Mode */ + movl M, I # rcx = n + sarl $4, I + jle .L12 + ALIGN_4 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, 0 * SIZE(X) + movaps %xmm1, 2 * SIZE(X) + movaps %xmm1, 4 * SIZE(X) + movaps %xmm1, 6 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm1, 8 * SIZE(X) + movaps %xmm1, 10 * SIZE(X) + movaps %xmm1, 12 * SIZE(X) + movaps %xmm1, 14 * SIZE(X) + + addl $16 * SIZE, X + decl I + jg .L11 + ALIGN_4 + +.L12: + testl $15, M + je .L999 + testl $8, M + je .L13 + + movaps %xmm1, 0 * SIZE(X) + movaps %xmm1, 2 * SIZE(X) + movaps %xmm1, 4 * SIZE(X) + movaps %xmm1, 6 * SIZE(X) + addl $8 * SIZE, X + ALIGN_3 + +.L13: + testl $4, M + je .L14 + + movaps %xmm1, 0 * SIZE(X) + movaps %xmm1, 2 * SIZE(X) + addl $4 * SIZE, X + ALIGN_3 + +.L14: + testl $2, M + je .L15 + + movaps %xmm1, 0 * SIZE(X) + addl $2 * SIZE, X + ALIGN_3 + +.L15: + testl $1, M + je .L999 + + movsd %xmm1, 0 * SIZE(X) + jmp .L999 + ALIGN_4 + +.L50: + movl M, I + sarl $3, I + jle .L52 + ALIGN_4 + +.L51: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd %xmm1, (X) + addl INCX, X + movsd %xmm1, (X) + addl INCX, X + movsd %xmm1, (X) + addl INCX, X + movsd %xmm1, (X) + addl INCX, X + movsd %xmm1, (X) + addl INCX, X + movsd %xmm1, (X) + addl INCX, X + movsd %xmm1, (X) + addl INCX, X + movsd %xmm1, (X) + addl INCX, X + + decl I + jg .L51 + ALIGN_4 + +.L52: + testl $7, M + je .L999 + + testl $4, M + je .L53 + + movsd %xmm1, (X) + addl INCX, X + movsd %xmm1, (X) + addl INCX, X + movsd %xmm1, (X) + addl INCX, X + movsd %xmm1, (X) + addl INCX, X + ALIGN_3 + +.L53: + testl $2, M + je .L54 + + movsd %xmm1, (X) + addl INCX, X + movsd %xmm1, (X) + addl INCX, X + ALIGN_3 + +.L54: + testl $1, M + je .L999 + + movsd %xmm1, (X) + jmp .L999 + ALIGN_4 + +/* Alpha != ZERO */ + +.L100: + unpcklpd %xmm0, %xmm0 + + cmpl $SIZE, INCX + jne .L150 + + testl $SIZE, X + je .L105 + + movsd 0 * SIZE(X), %xmm1 + mulsd %xmm0, %xmm1 + movsd %xmm1, 0 * SIZE(X) + addl $SIZE, X + decl M + jle .L999 + ALIGN_3 +.L105: + subl $-16 * SIZE, X + + movl M, I # rcx = n + sarl $4, I + jle .L113 + +#if defined(BARCELONA) + + movaps %xmm0, %xmm1 + mulpd -16 * SIZE(X), %xmm1 + movaps %xmm0, %xmm2 + mulpd -14 * SIZE(X), %xmm2 + movaps %xmm0, %xmm3 + mulpd -12 * SIZE(X), %xmm3 + movaps %xmm0, %xmm4 + mulpd -10 * SIZE(X), %xmm4 + + decl I + jle .L112 + ALIGN_4 + +.L111: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, -16 * SIZE(X) + movaps %xmm0, %xmm1 + mulpd -8 * SIZE(X), %xmm1 + + movaps %xmm2, -14 * SIZE(X) + movaps %xmm0, %xmm2 + mulpd -6 * SIZE(X), %xmm2 + + movaps %xmm3, -12 * SIZE(X) + movaps %xmm0, %xmm3 + mulpd -4 * SIZE(X), %xmm3 + + movaps %xmm4, -10 * SIZE(X) + movaps %xmm0, %xmm4 + mulpd -2 * SIZE(X), %xmm4 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm1, -8 * SIZE(X) + movaps %xmm0, %xmm1 + mulpd 0 * SIZE(X), %xmm1 + + movaps %xmm2, -6 * SIZE(X) + movaps %xmm0, %xmm2 + mulpd 2 * SIZE(X), %xmm2 + + movaps %xmm3, -4 * SIZE(X) + movaps %xmm0, %xmm3 + mulpd 4 * SIZE(X), %xmm3 + + movaps %xmm4, -2 * SIZE(X) + movaps %xmm0, %xmm4 + mulpd 6 * SIZE(X), %xmm4 + + subl $-16 * SIZE, X + decl I + jg .L111 + ALIGN_4 + +.L112: + movaps %xmm1, -16 * SIZE(X) + movaps %xmm0, %xmm1 + mulpd -8 * SIZE(X), %xmm1 + + movaps %xmm2, -14 * SIZE(X) + movaps %xmm0, %xmm2 + mulpd -6 * SIZE(X), %xmm2 + + movaps %xmm3, -12 * SIZE(X) + movaps %xmm0, %xmm3 + mulpd -4 * SIZE(X), %xmm3 + + movaps %xmm4, -10 * SIZE(X) + movaps %xmm0, %xmm4 + mulpd -2 * SIZE(X), %xmm4 + + movaps %xmm1, -8 * SIZE(X) + movaps %xmm2, -6 * SIZE(X) + movaps %xmm3, -4 * SIZE(X) + movaps %xmm4, -2 * SIZE(X) + +#else + movaps -16 * SIZE(X), %xmm1 + movaps -14 * SIZE(X), %xmm2 + movaps -12 * SIZE(X), %xmm3 + movaps -10 * SIZE(X), %xmm4 + + decl I + jle .L112 + ALIGN_4 + +.L111: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulpd %xmm0, %xmm1 + movaps %xmm1, -16 * SIZE(X) + movaps -8 * SIZE(X), %xmm1 + + mulpd %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(X) + movaps -6 * SIZE(X), %xmm2 + + mulpd %xmm0, %xmm3 + movaps %xmm3, -12 * SIZE(X) + movaps -4 * SIZE(X), %xmm3 + + mulpd %xmm0, %xmm4 + movaps %xmm4, -10 * SIZE(X) + movaps -2 * SIZE(X), %xmm4 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulpd %xmm0, %xmm1 + movaps %xmm1, -8 * SIZE(X) + movaps 0 * SIZE(X), %xmm1 + + mulpd %xmm0, %xmm2 + movaps %xmm2, -6 * SIZE(X) + movaps 2 * SIZE(X), %xmm2 + + mulpd %xmm0, %xmm3 + movaps %xmm3, -4 * SIZE(X) + movaps 4 * SIZE(X), %xmm3 + + mulpd %xmm0, %xmm4 + movaps %xmm4, -2 * SIZE(X) + movaps 6 * SIZE(X), %xmm4 + + subl $-16 * SIZE, X + decl I + jg .L111 + ALIGN_4 + +.L112: + mulpd %xmm0, %xmm1 + movaps %xmm1, -16 * SIZE(X) + movaps -8 * SIZE(X), %xmm1 + + mulpd %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(X) + movaps -6 * SIZE(X), %xmm2 + + mulpd %xmm0, %xmm3 + movaps %xmm3, -12 * SIZE(X) + movaps -4 * SIZE(X), %xmm3 + + mulpd %xmm0, %xmm4 + movaps %xmm4, -10 * SIZE(X) + movaps -2 * SIZE(X), %xmm4 + + mulpd %xmm0, %xmm1 + movaps %xmm1, -8 * SIZE(X) + mulpd %xmm0, %xmm2 + movaps %xmm2, -6 * SIZE(X) + mulpd %xmm0, %xmm3 + movaps %xmm3, -4 * SIZE(X) + mulpd %xmm0, %xmm4 + movaps %xmm4, -2 * SIZE(X) +#endif + + subl $-16 * SIZE, X + ALIGN_3 + +.L113: + testl $15, M + je .L999 + + testl $8, M + je .L114 + + movaps -16 * SIZE(X), %xmm1 + movaps -14 * SIZE(X), %xmm2 + movaps -12 * SIZE(X), %xmm3 + movaps -10 * SIZE(X), %xmm4 + + mulpd %xmm0, %xmm1 + movaps %xmm1, -16 * SIZE(X) + mulpd %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(X) + mulpd %xmm0, %xmm3 + movaps %xmm3, -12 * SIZE(X) + mulpd %xmm0, %xmm4 + movaps %xmm4, -10 * SIZE(X) + addl $8 * SIZE, X + ALIGN_3 + +.L114: + testl $4, M + je .L115 + + movaps -16 * SIZE(X), %xmm1 + movaps -14 * SIZE(X), %xmm2 + + mulpd %xmm0, %xmm1 + movaps %xmm1, -16 * SIZE(X) + mulpd %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(X) + addl $4 * SIZE, X + ALIGN_3 + +.L115: + testl $2, M + je .L116 + + movaps -16 * SIZE(X), %xmm1 + mulpd %xmm0, %xmm1 + movaps %xmm1, -16 * SIZE(X) + addl $2 * SIZE, X + ALIGN_3 + +.L116: + testl $1, M + je .L999 + + movsd -16 * SIZE(X), %xmm1 + mulsd %xmm0, %xmm1 + movsd %xmm1, -16 * SIZE(X) + jmp .L999 + ALIGN_3 + +/* incx != 1 */ + +.L150: + movl X, XX + movl M, I + sarl $2, I + jle .L152 + ALIGN_4 + +.L151: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + movsd (X), %xmm1 + addl INCX, X + movsd (X), %xmm2 + addl INCX, X + movsd (X), %xmm3 + addl INCX, X + movsd (X), %xmm4 + addl INCX, X + + mulsd %xmm0, %xmm1 + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + + movsd %xmm1, (XX) + addl INCX, XX + movsd %xmm2, (XX) + addl INCX, XX + movsd %xmm3, (XX) + addl INCX, XX + movsd %xmm4, (XX) + addl INCX, XX + + decl I + jg .L151 + ALIGN_4 + +.L152: + testl $2, M + je .L154 + + movsd (X), %xmm1 + addl INCX, X + movsd (X), %xmm2 + addl INCX, X + + mulsd %xmm0, %xmm1 + mulsd %xmm0, %xmm2 + + movsd %xmm1, (XX) + addl INCX, XX + movsd %xmm2, (XX) + addl INCX, XX + ALIGN_3 + +.L154: + testl $1, M + je .L999 + + movsd (X), %xmm1 + mulsd %xmm0, %xmm1 + movsd %xmm1, (X) + ALIGN_4 + +.L999: + xorl %eax, %eax + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/staticbuffer.S b/kernel/x86/staticbuffer.S new file mode 100644 index 0000000..b041c62 --- /dev/null +++ b/kernel/x86/staticbuffer.S @@ -0,0 +1,49 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ALLOC_STATIC + ALIGN_6 +#ifdef __CYGWIN__ + .comm _alloc_area, (NUM_BUFFERS * BUFFER_SIZE) +#else + .comm alloc_area, (NUM_BUFFERS * BUFFER_SIZE), 4096 +#endif +#endif diff --git a/kernel/x86/swap.S b/kernel/x86/swap.S new file mode 100644 index 0000000..d32c1a3 --- /dev/null +++ b/kernel/x86/swap.S @@ -0,0 +1,210 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define N 4 + STACK + ARGS(%esp) +#ifdef XDOUBLE +#define X 32 + STACK + ARGS(%esp) +#define INCX 36 + STACK + ARGS(%esp) +#define Y 40 + STACK + ARGS(%esp) +#define INCY 44 + STACK + ARGS(%esp) +#elif defined(DOUBLE) +#define X 24 + STACK + ARGS(%esp) +#define INCX 28 + STACK + ARGS(%esp) +#define Y 32 + STACK + ARGS(%esp) +#define INCY 36 + STACK + ARGS(%esp) +#else +#define X 20 + STACK + ARGS(%esp) +#define INCX 24 + STACK + ARGS(%esp) +#define Y 28 + STACK + ARGS(%esp) +#define INCY 32 + STACK + ARGS(%esp) +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl N, %edx + movl X, %esi + movl Y, %edi + movl INCX, %ebx + movl INCY, %ecx + + sall $BASE_SHIFT, %ebx + sall $BASE_SHIFT, %ecx + + cmpl $SIZE, %ebx + jne .L14 + cmpl $SIZE, %ecx + jne .L14 + + movl %edx, %eax + sarl $2, %eax + jle .L15 + ALIGN_3 + +.L16: + FLD 3 * SIZE(%esi) + FLD 2 * SIZE(%esi) + FLD 1 * SIZE(%esi) + FLD 0 * SIZE(%esi) + FLD 3 * SIZE(%edi) + FLD 2 * SIZE(%edi) + FLD 1 * SIZE(%edi) + FLD 0 * SIZE(%edi) + + FST 0 * SIZE(%esi) + FST 1 * SIZE(%esi) + FST 2 * SIZE(%esi) + FST 3 * SIZE(%esi) + FST 0 * SIZE(%edi) + FST 1 * SIZE(%edi) + FST 2 * SIZE(%edi) + FST 3 * SIZE(%edi) + + addl $4 * SIZE, %esi + addl $4 * SIZE, %edi + decl %eax + jg .L16 + ALIGN_3 + +.L15: + movl %edx, %eax + andl $3, %eax + jle .L27 + ALIGN_3 + +.L22: + FLD (%esi) + FLD (%edi) + FST (%esi) + FST (%edi) + addl $SIZE, %esi + addl $SIZE, %edi + decl %eax + jg .L22 + jmp .L27 + ALIGN_3 + +/* INCX != 1 or INCY != 1 */ + +.L14: + movl %edx, %eax + sarl $2, %eax + jle .L28 + ALIGN_2 + +.L29: + FLD (%esi) + addl %ebx, %esi + FLD (%esi) + addl %ebx, %esi + FLD (%esi) + addl %ebx, %esi + FLD (%esi) + + FLD (%edi) + addl %ecx, %edi + FLD (%edi) + addl %ecx, %edi + FLD (%edi) + addl %ecx, %edi + FLD (%edi) + + FST (%esi) + subl %ebx, %esi + FST (%esi) + subl %ebx, %esi + FST (%esi) + subl %ebx, %esi + FST (%esi) + leal (%esi, %ebx, 4), %esi + + FST (%edi) + subl %ecx, %edi + FST (%edi) + subl %ecx, %edi + FST (%edi) + subl %ecx, %edi + FST (%edi) + leal (%edi, %ecx, 4), %edi + + decl %eax + jg .L29 + ALIGN_3 + +.L28: + movl %edx, %eax + andl $3, %eax + jle .L27 + ALIGN_3 + +.L35: + FLD (%esi) + FLD (%edi) + FST (%esi) + addl %ebx, %esi + FST (%edi) + addl %ecx, %edi + decl %eax + jg .L35 + ALIGN_3 + +.L27: + xorl %eax,%eax + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/swap_sse.S b/kernel/x86/swap_sse.S new file mode 100644 index 0000000..39c0d2f --- /dev/null +++ b/kernel/x86/swap_sse.S @@ -0,0 +1,1139 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 20 + STACK + ARGS(%esp) +#define STACK_INCX 24 + STACK + ARGS(%esp) +#define STACK_Y 28 + STACK + ARGS(%esp) +#define STACK_INCY 32 + STACK + ARGS(%esp) + +#define M %edx +#define X %esi +#define Y %edi +#define INCX %ebx +#define INCY %ecx + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_Y, Y + movl STACK_INCX, INCX + movl STACK_INCY, INCY + + sall $BASE_SHIFT, %ebx + sall $BASE_SHIFT, %ecx + + cmpl $SIZE, INCX + jne .L50 + cmpl $SIZE, INCY + jne .L50 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + cmpl $3, M + jle .L16 + + testl $SIZE, Y + je .L05 + + movss -32 * SIZE(X), %xmm0 + movss -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + movss %xmm0, -32 * SIZE(Y) + + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl M + ALIGN_3 + +.L05: + testl $2 * SIZE, Y + je .L10 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm1 + + movlps %xmm1, -32 * SIZE(X) + movlps %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + subl $2, M + jle .L19 + ALIGN_3 + +.L10: + cmpl $3, M + jle .L16 + + testl $2 * SIZE, X + jne .L30 + + testl $1 * SIZE, X + jne .L20 + + movl M, %eax + sarl $5, %eax + jle .L13 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + movaps -28 * SIZE(X), %xmm0 + movaps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -28 * SIZE(Y) + movaps %xmm1, -28 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -24 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movaps %xmm0, -24 * SIZE(Y) + movaps %xmm1, -24 * SIZE(X) + + movaps -20 * SIZE(X), %xmm0 + movaps -20 * SIZE(Y), %xmm1 + + movaps %xmm0, -20 * SIZE(Y) + movaps %xmm1, -20 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -8 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movaps %xmm0, -8 * SIZE(Y) + movaps %xmm1, -8 * SIZE(X) + + movaps -4 * SIZE(X), %xmm0 + movaps -4 * SIZE(Y), %xmm1 + + movaps %xmm0, -4 * SIZE(Y) + movaps %xmm1, -4 * SIZE(X) + + subl $-32 * SIZE, Y + subl $-32 * SIZE, X + + decl %eax + jg .L11 + ALIGN_3 + +.L13: + testl $16, M + jle .L14 + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + movaps -28 * SIZE(X), %xmm0 + movaps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -28 * SIZE(Y) + movaps %xmm1, -28 * SIZE(X) + + movaps -24 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movaps %xmm0, -24 * SIZE(Y) + movaps %xmm1, -24 * SIZE(X) + + movaps -20 * SIZE(X), %xmm0 + movaps -20 * SIZE(Y), %xmm1 + + movaps %xmm0, -20 * SIZE(Y) + movaps %xmm1, -20 * SIZE(X) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L14: + testl $8, M + jle .L15 + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + movaps -28 * SIZE(X), %xmm0 + movaps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -28 * SIZE(Y) + movaps %xmm1, -28 * SIZE(X) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L15: + testl $4, M + jle .L16 + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L16: + testl $2, M + jle .L17 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm1 + + movlps %xmm1, -32 * SIZE(X) + addl $2 * SIZE, X + movlps %xmm0, -32 * SIZE(Y) + addl $2 * SIZE, Y + ALIGN_3 + +.L17: + testl $1, M + jle .L19 + + movss -32 * SIZE(X), %xmm0 + movss -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + movss %xmm0, -32 * SIZE(Y) + ALIGN_3 + +.L19: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L20: + movaps -33 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + PSHUFD2($0x39, %xmm1, %xmm3) + movlps %xmm3, -31 * SIZE(X) + + subl $3, M + + movl M, %eax + sarl $5, %eax + jle .L23 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps -25 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -25 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -21 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -21 * SIZE(X) + + movaps -17 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -17 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -13 * SIZE(X), %xmm2 + movaps -12 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -13 * SIZE(X) + + movaps -9 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -5 * SIZE(X), %xmm2 + movaps -4 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -5 * SIZE(X) + + movaps -1 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -1 * SIZE(X) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L21 + ALIGN_3 + +.L23: + testl $16, M + jle .L24 + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps -25 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -25 * SIZE(X) + + movaps -21 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -21 * SIZE(X) + + movaps -17 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -17 * SIZE(X) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L24: + testl $8, M + jle .L25 + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps -25 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -25 * SIZE(X) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L25: + testl $4, M + jle .L26 + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L26: + PSHUFD2($0x39, %xmm0, %xmm2) + PSHUFD1($0xff, %xmm0) + + movlps %xmm2, -32 * SIZE(Y) + movss %xmm0, -30 * SIZE(Y) + + testl $2, M + jle .L27 + + movsd -29 * SIZE(X), %xmm0 + movsd -29 * SIZE(Y), %xmm1 + + movlps %xmm0, -29 * SIZE(Y) + movlps %xmm1, -29 * SIZE(X) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L27: + testl $1, M + jle .L29 + + movss -29 * SIZE(X), %xmm0 + movss -29 * SIZE(Y), %xmm1 + + movss %xmm0, -29 * SIZE(Y) + movss %xmm1, -29 * SIZE(X) + ALIGN_3 + +.L29: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L30: + testl $1 * SIZE, X + jne .L40 + + movhps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movlps %xmm1, -32 * SIZE(X) + subl $2, M + + movl M, %eax + sarl $5, %eax + jle .L33 + ALIGN_4 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps -26 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -26 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -22 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -22 * SIZE(X) + + movaps -18 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -18 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -14 * SIZE(X), %xmm2 + movaps -12 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -14 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -10 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -6 * SIZE(X), %xmm2 + movaps -4 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -6 * SIZE(X) + + movaps -2 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -2 * SIZE(X) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L31 + ALIGN_3 + +.L33: + testl $16, M + jle .L34 + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps -26 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -26 * SIZE(X) + + movaps -22 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -22 * SIZE(X) + + movaps -18 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -18 * SIZE(X) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L34: + testl $8, M + jle .L35 + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps -26 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -26 * SIZE(X) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L35: + testl $4, M + jle .L36 + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L36: + movhps %xmm0, -32 * SIZE(Y) + + testl $2, M + jle .L37 + + movsd -30 * SIZE(X), %xmm0 + movsd -30 * SIZE(Y), %xmm1 + + movlps %xmm0, -30 * SIZE(Y) + movlps %xmm1, -30 * SIZE(X) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L37: + testl $1, M + jle .L39 + + movss -30 * SIZE(X), %xmm0 + movss -30 * SIZE(Y), %xmm1 + + movss %xmm0, -30 * SIZE(Y) + movss %xmm1, -30 * SIZE(X) + ALIGN_3 + +.L39: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L40: + movaps -35 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + + subl $3, M + + movl M, %eax + sarl $5, %eax + jle .L43 + ALIGN_4 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps -27 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -27 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -23 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -23 * SIZE(X) + + movaps -19 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -19 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -15 * SIZE(X), %xmm2 + movaps -12 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -11 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -11 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -7 * SIZE(X), %xmm2 + movaps -4 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -7 * SIZE(X) + + movaps -3 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -3 * SIZE(X) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L41 + ALIGN_3 + +.L43: + testl $16, M + jle .L44 + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps -27 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -27 * SIZE(X) + + movaps -23 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -23 * SIZE(X) + + movaps -19 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -19 * SIZE(X) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L44: + testl $8, M + jle .L45 + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps -27 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -27 * SIZE(X) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L45: + testl $4, M + jle .L46 + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L46: + movsd -31 * SIZE(X), %xmm2 + + PSHUFD2($0x39, %xmm1, %xmm1) + movlps %xmm1, -31 * SIZE(X) + + PSHUFD1($0xff, %xmm0) + + movss %xmm0, -32 * SIZE(Y) + movlps %xmm2, -31 * SIZE(Y) + + addl $3 * SIZE, X + addl $3 * SIZE, Y + + testl $2, M + jle .L47 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm1 + + movlps %xmm0, -32 * SIZE(Y) + movlps %xmm1, -32 * SIZE(X) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L47: + testl $1, M + jle .L49 + + movss -32 * SIZE(X), %xmm0 + movss -32 * SIZE(Y), %xmm1 + + movss %xmm0, -32 * SIZE(Y) + movss %xmm1, -32 * SIZE(X) + ALIGN_3 + +.L49: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L50: + movl M, %eax + sarl $3, %eax + jle .L55 + ALIGN_3 + +.L51: + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addl INCX, X + movss %xmm0, (Y) + addl INCY, Y + + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addl INCX, X + movss %xmm0, (Y) + addl INCY, Y + + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addl INCX, X + movss %xmm0, (Y) + addl INCY, Y + + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addl INCX, X + movss %xmm0, (Y) + addl INCY, Y + + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addl INCX, X + movss %xmm0, (Y) + addl INCY, Y + + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addl INCX, X + movss %xmm0, (Y) + addl INCY, Y + + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addl INCX, X + movss %xmm0, (Y) + addl INCY, Y + + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addl INCX, X + movss %xmm0, (Y) + addl INCY, Y + + decl %eax + jg .L51 + ALIGN_3 + +.L55: + movl M, %eax + andl $7, %eax + jle .L57 + ALIGN_3 + +.L56: + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + movss %xmm0, (Y) + + addl INCX, X + addl INCY, Y + decl %eax + jg .L56 + ALIGN_3 + +.L57: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/swap_sse2.S b/kernel/x86/swap_sse2.S new file mode 100644 index 0000000..b880812 --- /dev/null +++ b/kernel/x86/swap_sse2.S @@ -0,0 +1,572 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 24 + STACK + ARGS(%esp) +#define STACK_INCX 28 + STACK + ARGS(%esp) +#define STACK_Y 32 + STACK + ARGS(%esp) +#define STACK_INCY 36 + STACK + ARGS(%esp) + +#define M %edx +#define X %esi +#define Y %edi +#define INCX %ebx +#define INCY %ecx + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_Y, Y + movl STACK_INCX, INCX + movl STACK_INCY, INCY + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + cmpl $SIZE, INCX + jne .L40 + cmpl $SIZE, INCY + jne .L40 + + testl $SIZE, Y + je .L10 + + movsd 0 * SIZE(X), %xmm0 + movsd 0 * SIZE(Y), %xmm1 + + movsd %xmm1, 0 * SIZE(X) + movsd %xmm0, 0 * SIZE(Y) + + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl M + jle .L19 + ALIGN_4 + +.L10: + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + testl $SIZE, X + jne .L20 + + movl M, %eax + sarl $4, %eax + jle .L13 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -10 * SIZE(Y), %xmm1 + + movaps %xmm0, -10 * SIZE(Y) + movaps %xmm1, -10 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -8 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movaps %xmm0, -8 * SIZE(Y) + movaps %xmm1, -8 * SIZE(X) + + movaps -6 * SIZE(X), %xmm0 + movaps -6 * SIZE(Y), %xmm1 + + movaps %xmm0, -6 * SIZE(Y) + movaps %xmm1, -6 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -4 * SIZE(X), %xmm0 + movaps -4 * SIZE(Y), %xmm1 + + movaps %xmm0, -4 * SIZE(Y) + movaps %xmm1, -4 * SIZE(X) + + movaps -2 * SIZE(X), %xmm0 + movaps -2 * SIZE(Y), %xmm1 + + movaps %xmm0, -2 * SIZE(Y) + movaps %xmm1, -2 * SIZE(X) + + subl $-16 * SIZE, Y + subl $-16 * SIZE, X + + decl %eax + jg .L11 + ALIGN_3 + +.L13: + testl $8, M + jle .L14 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -10 * SIZE(Y), %xmm1 + + movaps %xmm0, -10 * SIZE(Y) + movaps %xmm1, -10 * SIZE(X) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L14: + testl $4, M + jle .L15 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L15: + testl $2, M + jle .L16 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L16: + testl $1, M + jle .L19 + + movsd -16 * SIZE(X), %xmm0 + movsd -16 * SIZE(Y), %xmm1 + + movlps %xmm1, -16 * SIZE(X) + movlps %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L19: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L20: + movhps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movlps %xmm1, -16 * SIZE(X) + decl M + jle .L29 + + movl M, %eax + sarl $4, %eax + jle .L23 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -13 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -11 * SIZE(X), %xmm2 + movaps -10 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -12 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -11 * SIZE(X) + + movaps -9 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -10 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -7 * SIZE(X), %xmm2 + movaps -6 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -7 * SIZE(X) + + movaps -5 * SIZE(X), %xmm0 + movaps -4 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -6 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -5 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -3 * SIZE(X), %xmm2 + movaps -2 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -4 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -3 * SIZE(X) + + movaps -1 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -2 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -1 * SIZE(X) + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + decl %eax + jg .L21 + ALIGN_3 + +.L23: + testl $8, M + jle .L24 + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -13 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(X) + + movaps -11 * SIZE(X), %xmm2 + movaps -10 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -12 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -11 * SIZE(X) + + movaps -9 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -10 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(X) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L24: + testl $4, M + jle .L25 + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -13 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(X) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L25: + testl $2, M + jle .L26 + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L26: + testl $1, M + jle .L29 + + movhps %xmm0, -16 * SIZE(Y) + movhps -15 * SIZE(X), %xmm0 + movhps %xmm1, -15 * SIZE(X) + + addl $SIZE, X + addl $SIZE, Y + ALIGN_3 + +.L29: + movhps %xmm0, -16 * SIZE(Y) + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L40: + movl M, %eax + sarl $3, %eax + jle .L45 + ALIGN_3 + +.L41: + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addl INCX, X + movsd %xmm0, (Y) + addl INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addl INCX, X + movsd %xmm0, (Y) + addl INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addl INCX, X + movsd %xmm0, (Y) + addl INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addl INCX, X + movsd %xmm0, (Y) + addl INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addl INCX, X + movsd %xmm0, (Y) + addl INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addl INCX, X + movsd %xmm0, (Y) + addl INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addl INCX, X + movsd %xmm0, (Y) + addl INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addl INCX, X + movsd %xmm0, (Y) + addl INCY, Y + + decl %eax + jg .L41 + ALIGN_3 + +.L45: + movl M, %eax + andl $7, %eax + jle .L47 + ALIGN_3 + +.L46: + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + movsd %xmm0, (Y) + + addl INCX, X + addl INCY, Y + decl %eax + jg .L46 + ALIGN_3 + +.L47: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LN_2x2.S b/kernel/x86/trsm_kernel_LN_2x2.S new file mode 100644 index 0000000..d1c741b --- /dev/null +++ b/kernel/x86/trsm_kernel_LN_2x2.S @@ -0,0 +1,1127 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define A 24 + STACK + ARGS(%esp) +#define B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) +#else +#define A 20 + STACK + ARGS(%esp) +#define B 24 + STACK + ARGS(%esp) +#define C 28 + STACK + ARGS(%esp) +#define LDC 32 + STACK + ARGS(%esp) +#define OFFSET 36 + STACK + ARGS(%esp) +#endif + +#define PREFETCH_OFFSET 48 + +#if defined(PENTIUM3) || defined(PENTIUMM) +#define REP rep +#else +#define REP rep +#endif + +#define AA %edx +#define BB %ecx + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl LDC, %ebp # ldc # MEMORY + movl B, %ebx + leal (, %ebp, SIZE), %ebp + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, %ebx + + movl N, %eax + imull %ebp, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax # j = (n >> 1) # MEMORY + sarl $1, %eax + movl %eax, J # j = (n >> 1) # MEMORY + je .L8 + ALIGN_4 + +.L34: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, %ebx +#endif + lea (, %ebp, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, %edi +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %eax # m # MEMORY + andl $1, %eax + je .L12 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, AA + leal (AA, %eax, 1), AA + leal (%ebx, %eax, 2), BB +#else + movl %ebx, BB +#endif + + fldz + fldz + + FLD 0 * SIZE(AA) # temp1 = *(aoffset + 0) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1,%eax # k >> 1 # MEMORY + je .L54 + ALIGN_4 + +.L55: + FLD 0 * SIZE(BB) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(BB) # temp2 = *(boffset + 0) + faddp %st, %st(2) + FLD 1 * SIZE(AA) # temp1 = *(aoffset + 0) + + FLD 2 * SIZE(BB) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 3 * SIZE(BB) # temp2 = *(boffset + 0) + faddp %st, %st(2) + FLD 2 * SIZE(AA) # temp1 = *(aoffset + 0) + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jne .L55 + ALIGN_4 + +.L54: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1,%eax # k & 1 + je .L33 + ALIGN_4 + + FLD 0 * SIZE(BB) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(BB) # temp2 = *(boffset + 0) + faddp %st, %st(2) + FLD 1 * SIZE(AA) # temp1 = *(aoffset + 0) + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + ALIGN_4 + +.L33: + ffreep %st(0) + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, AA + leal (AA, %eax, 1), AA + leal (%ebx, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(BB) + fsubp %st, %st(1) + FLD 1 * SIZE(BB) + fsubp %st, %st(2) +#else + FLD 0 * SIZE(AA) + fsubp %st, %st(1) + FLD 1 * SIZE(AA) + fsubp %st, %st(2) +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(AA) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD 0 * SIZE(BB) + fmulp %st, %st(1) + + FLD 1 * SIZE(BB) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD 3 * SIZE(BB) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD 3 * SIZE(BB) + fmulp %st, %st(2) + + FLD 2 * SIZE(BB) + fmul %st(2), %st + + fsubrp %st, %st(1) + + FLD 0 * SIZE(BB) + fmulp %st, %st(1) +#endif + +#ifdef LN + subl $1 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE(BB) + fxch %st(1) + FSTU 1 * SIZE(BB) +#else + FSTU 0 * SIZE(AA) + fxch %st(1) + FSTU 1 * SIZE(AA) +#endif + + FST 0 * SIZE(%edi,%ebp) + FST 0 * SIZE(%edi) + +#ifndef LN + addl $1 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L12: + movl M, %esi + sarl $1, %esi + je .L27 + ALIGN_4 + +.MainHead: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, AA + leal (AA, %eax, 2), AA + leal (%ebx, %eax, 2), BB +#else + movl %ebx, BB +#endif + + fldz + fldz + fldz + fldz + + FLD 4 * SIZE(BB) # b5 + FLD 4 * SIZE(AA) # a5 + FLD 0 * SIZE(BB) # b1 + FLD 0 * SIZE(AA) # a1 + +#ifdef LN +#if defined(HAVE_3DNOW) + prefetchw -2 * SIZE(%edi) + prefetchw -2 * SIZE(%edi, %ebp, 1) +#elif defined(HAVE_SSE) + prefetchnta -2 * SIZE(%edi) + prefetchnta -2 * SIZE(%edi, %ebp, 1) +#endif +#else +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(%edi) + prefetchw 2 * SIZE(%edi, %ebp, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(%edi) + prefetchnta 2 * SIZE(%edi, %ebp, 1) +#endif +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L16 + ALIGN_4 + +.MainLoop: +#if defined(HAVE_3DNOW) + prefetch (PREFETCH_OFFSET) * SIZE(BB) + nop +#elif defined(HAVE_SSE) + prefetchnta (PREFETCH_OFFSET) * SIZE(BB) +#if (L2_SIZE == 524288) + prefetcht0 (PREFETCH_OFFSET) * SIZE(AA) +#endif +#endif + + fmul %st, %st(1) + FMUL 1 * SIZE(BB) + fxch %st(1) + faddp %st, %st(4) + FLD 0 * SIZE(BB) + fxch %st(1) + faddp %st, %st(5) + FLD 1 * SIZE(AA) + fmul %st, %st(1) + FMUL 1 * SIZE(BB) + fxch %st(1) + faddp %st, %st(6) + FLD 2 * SIZE(BB) + fxch %st(1) + faddp %st, %st(7) + FLD 2 * SIZE(AA) + + fmul %st, %st(1) + FMUL 3 * SIZE(BB) + fxch %st(1) + faddp %st, %st(4) + FLD 2 * SIZE(BB) + fxch %st(1) + faddp %st, %st(5) + FLD 3 * SIZE(AA) + fmul %st, %st(1) + FMUL 3 * SIZE(BB) + fxch %st(1) + faddp %st, %st(6) + FLD 8 * SIZE(BB) + fxch %st(1) + faddp %st, %st(7) + FLD 8 * SIZE(AA) + fxch %st(2) + +#if !defined(HAVE_3DNOW) && defined(HAVE_SSE) && defined(DOUBLE) + prefetchnta (PREFETCH_OFFSET + 4) * SIZE(BB) +#if (L2_SIZE == 524288) + prefetcht0 (PREFETCH_OFFSET + 4) * SIZE(AA) +#endif +#endif + + fmul %st, %st(3) + FMUL 5 * SIZE(BB) + fxch %st(3) + faddp %st, %st(4) + FLD 4 * SIZE(BB) + fxch %st(3) + faddp %st, %st(5) + FLD 5 * SIZE(AA) + fmul %st, %st(3) + FMUL 5 * SIZE(BB) + fxch %st(3) + faddp %st, %st(6) + FLD 6 * SIZE(BB) + fxch %st(3) + faddp %st, %st(7) + FLD 6 * SIZE(AA) + + fmul %st, %st(3) + FMUL 7 * SIZE(BB) + fxch %st(3) + faddp %st, %st(4) + FLD 6 * SIZE(BB) + fxch %st(3) + faddp %st, %st(5) + FLD 7 * SIZE(AA) + fmul %st, %st(3) + FMUL 7 * SIZE(BB) + fxch %st(3) + faddp %st, %st(6) + FLD 12 * SIZE(BB) + fxch %st(3) + faddp %st, %st(7) + FLD 12 * SIZE(AA) + fxch %st(2) + + subl $-8 * SIZE, BB + subl $-8 * SIZE, AA + decl %eax # l -- + jne .MainLoop + ALIGN_4 + +.L16: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L21 + ALIGN_4 + +.SubLoop: + fmul %st, %st(1) + FMUL 1 * SIZE(BB) + fxch %st(1) + faddp %st, %st(4) + FLD 0 * SIZE(BB) + fxch %st(1) + faddp %st, %st(5) + FLD 1 * SIZE(AA) + fmul %st, %st(1) + FMUL 1 * SIZE(BB) + fxch %st(1) + faddp %st, %st(6) + FLD 2 * SIZE(BB) + fxch %st(1) + faddp %st, %st(7) + FLD 2 * SIZE(AA) + + addl $2 * SIZE,BB + addl $2 * SIZE,AA + decl %eax + jne .SubLoop + ALIGN_4 + +.L21: + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, AA + leal (AA, %eax, 2), AA + leal (%ebx, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(BB) + fsubp %st, %st(1) + FLD 1 * SIZE(BB) + fsubp %st, %st(2) + FLD 2 * SIZE(BB) + fsubp %st, %st(3) + FLD 3 * SIZE(BB) + fsubp %st, %st(4) +#else + FLD 0 * SIZE(AA) + fsubp %st, %st(1) + FLD 1 * SIZE(AA) + fsubp %st, %st(3) + FLD 2 * SIZE(AA) + fsubp %st, %st(2) + FLD 3 * SIZE(AA) + fsubp %st, %st(4) +#endif + +#ifdef LN + FLD 3 * SIZE(AA) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD 2 * SIZE(AA) + fmul %st(3), %st + FLD 2 * SIZE(AA) + fmul %st(5), %st + + fsubrp %st, %st(3) + fsubrp %st, %st(1) + + FLD 0 * SIZE(AA) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LT + FLD 0 * SIZE(AA) + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD 1 * SIZE(AA) + fmul %st(1), %st + FLD 1 * SIZE(AA) + fmul %st(3), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(3) + + FLD 3 * SIZE(AA) + fmul %st, %st(3) + fmulp %st, %st(4) +#endif + +#ifdef RN + FLD 0 * SIZE(BB) + fmul %st, %st(1) + fmulp %st, %st(3) + + FLD 1 * SIZE(BB) + fmul %st(1), %st + FLD 1 * SIZE(BB) + fmul %st(4), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(2) + + FLD 3 * SIZE(BB) + fmul %st, %st(2) + fmulp %st, %st(4) +#endif + +#ifdef RT + FLD 3 * SIZE(BB) + fmul %st, %st(2) + fmulp %st, %st(4) + + FLD 2 * SIZE(BB) + fmul %st(2), %st + FLD 2 * SIZE(BB) + fmul %st(5), %st + + fsubrp %st, %st(4) + fsubrp %st, %st(1) + + FLD 0 * SIZE(BB) + fmul %st, %st(1) + fmulp %st, %st(3) +#endif + +#ifdef LN + subl $2 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE(BB) + fxch %st(1) + FSTU 1 * SIZE(BB) + fxch %st(2) + FSTU 2 * SIZE(BB) + fxch %st(3) + FSTU 3 * SIZE(BB) + + FST 1 * SIZE(%edi,%ebp) + FST 0 * SIZE(%edi) + FST 0 * SIZE(%edi,%ebp) + FST 1 * SIZE(%edi) +#else + FSTU 0 * SIZE(AA) + fxch %st(2) + FSTU 1 * SIZE(AA) + fxch %st(1) + FSTU 2 * SIZE(AA) + fxch %st(3) + FSTU 3 * SIZE(AA) + + FST 1 * SIZE(%edi,%ebp) + FST 1 * SIZE(%edi) + FST 0 * SIZE(%edi) + FST 0 * SIZE(%edi,%ebp) +#endif + +#ifndef LN + addl $2 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %esi # i -- + jne .MainHead + ALIGN_4 + +.L27: +#ifdef LN + movl K, %eax + leal ( , %eax, SIZE), %eax + leal (%ebx, %eax, 2), %ebx +#endif +#if defined(LT) || defined(RN) + movl BB, %ebx +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j-- # MEMORY + jne .L34 + ALIGN_4 + +.L8: + movl N, %eax # n # MEMORY + andl $1, %eax + je .End + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, %ebx +#endif + +#ifdef RT + subl %ebp, C +#endif + movl C, %edi # c # MEMORY +#ifndef RT + addl %ebp, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %eax # m # MEMORY + andl $1, %eax # m & 1 + je .L36 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, AA + leal (AA, %eax, 1), AA + leal (%ebx, %eax, 1), BB +#else + movl %ebx, BB +#endif + + fldz + +#ifdef LN +#if defined(HAVE_3DNOW) + prefetchw -2 * SIZE(%edi) +#elif defined(HAVE_SSE) + prefetchnta -2 * SIZE(%edi) +#endif +#else +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(%edi) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(%edi) +#endif +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + test %eax, %eax + jle .L52 + ALIGN_3 + +.L51: + FLD (AA) + FMUL (BB) + addl $1 * SIZE,AA + addl $1 * SIZE,BB + faddp %st,%st(1) + decl %eax + jne .L51 + ALIGN_4 + +.L52: + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, AA + leal (AA, %eax, 1), AA + leal (%ebx, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(BB) + fsubp %st, %st(1) +#else + FLD 0 * SIZE(AA) + fsubp %st, %st(1) +#endif + +#if defined(LN) || defined(LT) + FMUL 0 * SIZE(AA) +#else + FMUL 0 * SIZE(BB) +#endif + +#ifdef LN + subl $1 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE(BB) +#else + FSTU 0 * SIZE(AA) +#endif + + FST 0 * SIZE(%edi) + +#ifndef LN + addl $1 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L36: + movl M, %esi # m # MEMORY + sarl $1, %esi # m >> 1 + je .L99 + ALIGN_4 + +.L46: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, AA + leal (AA, %eax, 2), AA + leal (%ebx, %eax, 1), BB +#else + movl %ebx, BB +#endif + + fldz + fldz + FLD 0 * SIZE(BB) # temp1 = *(boffset + 0) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1, %eax + je .L56 + ALIGN_4 + +.L57: + FLD 0 * SIZE(AA) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(AA) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 1 * SIZE(BB) # temp1 = *(boffset + 0) + + FLD 2 * SIZE(AA) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 3 * SIZE(AA) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 2 * SIZE(BB) # temp1 = *(boffset + 0) + + addl $4 * SIZE,AA + addl $2 * SIZE,BB + dec %eax + jne .L57 + ALIGN_4 + +.L56: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1, %eax + je .L45 + ALIGN_4 + + FLD 0 * SIZE(AA) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(AA) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 3 * SIZE(BB) # temp1 = *(boffset + 0) + + addl $2 * SIZE,AA + addl $1 * SIZE,BB + ALIGN_4 + +.L45: + ffreep %st(0) + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, AA + leal (AA, %eax, 2), AA + leal (%ebx, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(BB) + fsubp %st, %st(1) + FLD 1 * SIZE(BB) + fsubp %st, %st(2) +#else + FLD 0 * SIZE(AA) + fsubp %st, %st(1) + FLD 1 * SIZE(AA) + fsubp %st, %st(2) +#endif + +#ifdef LN + FLD 3 * SIZE(AA) + fmulp %st, %st(2) + + FLD 2 * SIZE(AA) + fmul %st(2), %st + + fsubrp %st, %st(1) + FLD 0 * SIZE(AA) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD 0 * SIZE(AA) + fmulp %st, %st(1) + + FLD 1 * SIZE(AA) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD 3 * SIZE(AA) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD 0 * SIZE(BB) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD 0 * SIZE(BB) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LN + subl $2 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE(BB) + fxch %st(1) + FSTU 1 * SIZE(BB) +#else + FSTU 0 * SIZE(AA) + fxch %st(1) + FSTU 1 * SIZE(AA) +#endif + + FST 1 * SIZE(%edi) + FST 0 * SIZE(%edi) + +#ifndef LN + addl $2 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %esi # i -- + jne .L46 + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + leal (%ebx, %eax, SIZE), %ebx +#endif +#if defined(LT) || defined(RN) + movl BB, %ebx +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + + +.End: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LN_2x2_atom.S b/kernel/x86/trsm_kernel_LN_2x2_atom.S new file mode 100644 index 0000000..846a848 --- /dev/null +++ b/kernel/x86/trsm_kernel_LN_2x2_atom.S @@ -0,0 +1,1145 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 84 + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $1, %eax + movl %eax, J + jle .L30 + ALIGN_2 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L20 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L25 + ALIGN_4 + +.L22: + addsd %xmm2, %xmm4 + movsd 0 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 1 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 2 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 3 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 2 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 5 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 3 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 6 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 7 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + addsd %xmm2, %xmm4 + movsd 0 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 1 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addsd %xmm2, %xmm4 + addsd %xmm3, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + movsd 1 * SIZE(BB), %xmm1 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 +#else + movsd 0 * SIZE(AA), %xmm0 + movsd 1 * SIZE(AA), %xmm1 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AA), %xmm7 + + mulsd %xmm7, %xmm0 + mulsd %xmm7, %xmm1 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(BB), %xmm5 + movaps %xmm5, %xmm6 + movsd 3 * SIZE(BB), %xmm7 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm1 + mulsd %xmm7, %xmm1 +#endif + +#ifdef RT + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd 2 * SIZE(BB), %xmm5 + movaps %xmm5, %xmm6 + movsd 0 * SIZE(BB), %xmm7 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm0 + mulsd %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm1, 1 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC) + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L20: + movl M, %ebx + sarl $1, %ebx + jle .L29 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + xorps %xmm4, %xmm4 + prefetcht0 3 * SIZE(CO1) + xorps %xmm5, %xmm5 + prefetcht0 3 * SIZE(CO1, LDC) + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 0 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 1 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 3 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 3 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 4 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 2 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 3 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 5 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 5 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 6 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 4 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 5 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 7 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 7 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 8 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 6 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 7 * SIZE(BB), %xmm3 + + addl $8 * SIZE, BB + addl $8 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 0 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 1 * SIZE(BB), %xmm3 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + addsd %xmm2, %xmm6 + addsd %xmm3, %xmm7 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + movsd 1 * SIZE(BB), %xmm1 + movsd 2 * SIZE(BB), %xmm2 + movsd 3 * SIZE(BB), %xmm3 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 + subsd %xmm6, %xmm2 + subsd %xmm7, %xmm3 +#else + movsd 0 * SIZE(AA), %xmm0 + movsd 1 * SIZE(AA), %xmm2 + movsd 2 * SIZE(AA), %xmm1 + movsd 3 * SIZE(AA), %xmm3 + + subsd %xmm4, %xmm0 + subsd %xmm6, %xmm2 + subsd %xmm5, %xmm1 + subsd %xmm7, %xmm3 +#endif + +#ifdef LN + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + movsd 2 * SIZE(AA), %xmm5 + mulsd %xmm4, %xmm3 + movsd 0 * SIZE(AA), %xmm7 + + movaps %xmm5, %xmm6 + mulsd %xmm2, %xmm5 + mulsd %xmm3, %xmm6 + subsd %xmm5, %xmm0 + subsd %xmm6, %xmm1 + mulsd %xmm7, %xmm0 + mulsd %xmm7, %xmm1 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(AA), %xmm5 + mulsd %xmm4, %xmm1 + movsd 3 * SIZE(AA), %xmm7 + + movaps %xmm5, %xmm6 + mulsd %xmm0, %xmm5 + mulsd %xmm1, %xmm6 + subsd %xmm5, %xmm2 + subsd %xmm6, %xmm3 + mulsd %xmm7, %xmm2 + mulsd %xmm7, %xmm3 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(BB), %xmm5 + mulsd %xmm4, %xmm2 + movsd 3 * SIZE(BB), %xmm7 + + movaps %xmm5, %xmm6 + mulsd %xmm0, %xmm5 + mulsd %xmm2, %xmm6 + subsd %xmm5, %xmm1 + subsd %xmm6, %xmm3 + mulsd %xmm7, %xmm1 + mulsd %xmm7, %xmm3 +#endif + +#ifdef RT + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd 2 * SIZE(BB), %xmm5 + mulsd %xmm4, %xmm3 + movsd 0 * SIZE(BB), %xmm7 + + movaps %xmm5, %xmm6 + mulsd %xmm1, %xmm5 + mulsd %xmm3, %xmm6 + subsd %xmm5, %xmm0 + subsd %xmm6, %xmm2 + mulsd %xmm7, %xmm0 + mulsd %xmm7, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm1, 1 * SIZE(BB) + movsd %xmm2, 2 * SIZE(BB) + movsd %xmm3, 3 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm2, 1 * SIZE(AA) + movsd %xmm1, 2 * SIZE(AA) + movsd %xmm3, 3 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC) + movsd %xmm3, 1 * SIZE(CO1, LDC) + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L10 + ALIGN_4 + +.L30: + testl $1, N + je .L999 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L40 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + movsd 0 * SIZE(BB), %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L45 + ALIGN_4 + +.L42: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm5 + movsd 2 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 3 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 3 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm5 + movsd 4 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addsd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + subsd %xmm4, %xmm0 +#else + movsd 0 * SIZE(AA), %xmm0 + subsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + mulsd 0 * SIZE(AA), %xmm0 +#endif + +#if defined(RN) || defined(RT) + mulsd 0 * SIZE(BB), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + addl %eax, BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L40: + movl M, %ebx + sarl $1, %ebx + jle .L49 + ALIGN_4 + +.L31: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(BB), %xmm1 + xorps %xmm0, %xmm0 + prefetcht0 3 * SIZE(CO1) + xorps %xmm2, %xmm2 + xorps %xmm4, %xmm4 + xorps %xmm6, %xmm6 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addsd %xmm0, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 3 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 2 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 4 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 5 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 3 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 6 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 7 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 4 * SIZE(BB), %xmm1 + + addl $8 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + addsd %xmm0, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 1 * SIZE(BB), %xmm1 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + addsd %xmm0, %xmm4 + addsd %xmm2, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + movsd 1 * SIZE(BB), %xmm2 + + subsd %xmm4, %xmm0 + subsd %xmm6, %xmm2 +#else + movsd 0 * SIZE(AA), %xmm0 + movsd 1 * SIZE(AA), %xmm2 + + subsd %xmm4, %xmm0 + subsd %xmm6, %xmm2 +#endif + +#ifdef LN + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + movsd 2 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + movsd 0 * SIZE(AA), %xmm7 + subsd %xmm5, %xmm0 + mulsd %xmm7, %xmm0 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + movsd 3 * SIZE(AA), %xmm7 + subsd %xmm5, %xmm2 + mulsd %xmm7, %xmm2 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + mulsd %xmm4, %xmm2 +#endif + + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm2, 1 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm2, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L31 + ALIGN_4 + +.L49: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LN_2x4_penryn.S b/kernel/x86/trsm_kernel_LN_2x4_penryn.S new file mode 100644 index 0000000..6645b79 --- /dev/null +++ b/kernel/x86/trsm_kernel_LN_2x4_penryn.S @@ -0,0 +1,2076 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 21 + 4) +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 21 + 4) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 2) +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + leal (, LDC, SIZE), LDC + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + testl $1, %ebx + jle .L20 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd -16 * SIZE(AA), %xmm0 + movhps -15 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + movaps -14 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -10 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -14 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps -8 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps -6 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -4 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -2 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -12 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 0 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 2 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps 6 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -10 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 8 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 10 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps 12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps 14 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -8 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 18 * SIZE(BB), %xmm3 + + subl $ -8 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + je .L28 + +.L26: + pshufd $0x44, %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -10 * SIZE(BB), %xmm3 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BB), %xmm0 + movapd -14 * SIZE(BB), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#else + movapd -16 * SIZE(AA), %xmm1 + movapd -14 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm1 + subpd %xmm5, %xmm3 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 + movapd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm3 +#endif + +#ifdef LN + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RN + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + movsd -15 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + movsd -14 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm2 + movsd -13 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm3 + + movsd -11 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd -10 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm2 + movsd -9 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm3 + + movsd -6 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm2 + movsd -5 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm3 + + movsd -1 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm3 +#endif + +#ifdef RT + movsd -1 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm3 + movsd -2 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm2 + movsd -3 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm1 + movsd -4 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm0 + + movsd -6 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm2 + movsd -7 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm1 + movsd -8 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm0 + + movsd -11 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd -12 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) + movsd %xmm1, -15 * SIZE(AA) + movsd %xmm2, -14 * SIZE(AA) + movsd %xmm3, -13 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm1, 0 * SIZE(CO1, %eax, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L20: + movl M, %ebx + sarl $1, %ebx + jle .L29 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + leal (CO1, LDC, 2), %eax + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + +#ifdef LN + pxor %xmm4, %xmm4 + prefetcht0 -2 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 -2 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + prefetcht0 -2 * SIZE(%eax) + pxor %xmm7, %xmm7 + prefetcht0 -2 * SIZE(%eax, LDC) +#else + pxor %xmm4, %xmm4 + prefetcht0 1 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 1 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + prefetcht0 1 * SIZE(%eax) + pxor %xmm7, %xmm7 + prefetcht0 1 * SIZE(%eax, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addpd %xmm3, %xmm7 + movaps -14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -10 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -6 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -2 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 0 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + addpd %xmm3, %xmm7 + movaps 2 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 6 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 10 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 16 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + subl $-32 * SIZE, BB + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + + subl $1, %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addpd %xmm3, %xmm7 + movaps -14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + + movaps -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), BB +#endif + + addpd %xmm2, %xmm6 + addpd %xmm3, %xmm7 + + movaps %xmm4, %xmm0 + movsd %xmm5, %xmm4 + movsd %xmm0, %xmm5 + + movaps %xmm6, %xmm0 + movsd %xmm7, %xmm6 + movsd %xmm0, %xmm7 + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd -16 * SIZE(BB), %xmm2 + movapd -14 * SIZE(BB), %xmm5 + movapd -12 * SIZE(BB), %xmm3 + movapd -10 * SIZE(BB), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm6, %xmm5 + subpd %xmm0, %xmm3 + subpd %xmm1, %xmm7 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 + movapd -12 * SIZE(AA), %xmm2 + movapd -10 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 + subpd %xmm6, %xmm2 + subpd %xmm7, %xmm3 +#endif + +#ifdef LN + movddup -13 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 + + movddup -14 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + mulpd %xmm7, %xmm6 + subpd %xmm6, %xmm5 + + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + + movddup -15 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + mulpd %xmm5, %xmm6 + subpd %xmm6, %xmm7 + + movddup -13 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 + movddup -15 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + movddup -14 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm2 + movddup -13 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm3 + + movddup -11 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + movddup -10 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm2 + movddup -9 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm3 + + movddup -6 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm2 + movddup -5 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movddup -1 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movddup -1 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm3 + movddup -2 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + movddup -3 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm1 + movddup -4 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm0 + + movddup -6 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm2 + movddup -7 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm1 + movddup -8 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + + movddup -11 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + movddup -12 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, -16 * SIZE(BB) + movapd %xmm5, -14 * SIZE(BB) + movapd %xmm3, -12 * SIZE(BB) + movapd %xmm7, -10 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) + movapd %xmm2, -12 * SIZE(AA) + movapd %xmm3, -10 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) + movsd %xmm5, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm7, 1 * SIZE(CO1, %eax, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm3, 1 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L10 + ALIGN_4 + +.L30: + testl $2, N + je .L60 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + testl $1, %ebx + jle .L50 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd -16 * SIZE(AA), %xmm0 + movhps -15 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -16 * SIZE(BB), %xmm2 + movhps -15 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -14 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -12 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -10 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -12 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -8 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -6 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -10 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -4 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -2 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -8 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + je .L58 + +.L56: + pshufd $0x44, %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BB), %xmm0 + + subpd %xmm4, %xmm0 +#else + movapd -16 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm1 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 +#endif + +#ifdef LN + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RN + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -15 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movsd -13 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RT + movsd -13 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + + movsd -14 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) + movsd %xmm1, -15 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L50: + movl M, %ebx + sarl $1, %ebx + jle .L59 + ALIGN_4 + +.L41: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 +#ifdef LN + prefetcht0 -2 * SIZE(CO1) + pxor %xmm6, %xmm6 + prefetcht0 -2 * SIZE(CO1, LDC) + pxor %xmm7, %xmm7 +#else + prefetcht0 1 * SIZE(CO1) + pxor %xmm6, %xmm6 + prefetcht0 1 * SIZE(CO1, LDC) + pxor %xmm7, %xmm7 +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -14 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -10 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -6 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -2 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps 0 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -14 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + movaps %xmm4, %xmm0 + movsd %xmm5, %xmm4 + movsd %xmm0, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd -16 * SIZE(BB), %xmm2 + movapd -14 * SIZE(BB), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movddup -13 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + + movddup -14 * SIZE(AA), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + + movddup -15 * SIZE(AA), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movddup -13 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RN + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 + + movddup -15 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + + movddup -13 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movddup -13 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + + movddup -14 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, -16 * SIZE(BB) + movapd %xmm3, -14 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L41 + ALIGN_4 + +.L59: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L60: + testl $1, N + je .L999 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + testl $1, %ebx + jle .L80 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd -16 * SIZE(AA), %xmm0 + movhps -15 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -16 * SIZE(BB), %xmm2 + movhps -16 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L85 + ALIGN_4 + +.L82: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movaps -12 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movaps -10 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movaps -8 * SIZE(BB), %xmm2 + + subl $-8 * SIZE, AA + subl $-8 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + je .L88 + +.L86: + mulsd %xmm0, %xmm2 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd -15 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addpd %xmm5, %xmm4 + haddpd %xmm4, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BB), %xmm0 + subsd %xmm4, %xmm0 +#else + movsd -16 * SIZE(AA), %xmm0 + subsd %xmm4, %xmm0 +#endif + +#ifdef LN + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef LT + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RN + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RT + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) +#else + movsd %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + addl %eax, BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L80: + movl M, %ebx + sarl $1, %ebx + jle .L89 + ALIGN_4 + +.L71: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 +#ifdef LN + prefetcht0 -2 * SIZE(CO1) +#else + prefetcht0 1 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -14 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -12 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -10 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -8 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + subl $-16 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + pshufd $0x44, %xmm1, %xmm2 + movsd -15 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addpd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BB), %xmm1 + + subpd %xmm4, %xmm1 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 +#else + movapd -16 * SIZE(AA), %xmm0 + + subpd %xmm4, %xmm0 +#endif + +#ifdef LN + movsd -13 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd -14 * SIZE(AA), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + +#endif + +#ifdef LT + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -15 * SIZE(AA), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movsd -13 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RN + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RT + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(BB) + movsd %xmm1, -15 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L71 + ALIGN_4 + +.L89: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LN_2x4_sse2.S b/kernel/x86/trsm_kernel_LN_2x4_sse2.S new file mode 100644 index 0000000..9a7a466 --- /dev/null +++ b/kernel/x86/trsm_kernel_LN_2x4_sse2.S @@ -0,0 +1,2584 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define OLD_M 4 + STACK + ARGS(%esi) +#define OLD_N 8 + STACK + ARGS(%esi) +#define OLD_K 12 + STACK + ARGS(%esi) +#define OLD_ALPHA 16 + STACK + ARGS(%esi) +#define OLD_A 24 + STACK + ARGS(%esi) +#define OLD_B 28 + STACK + ARGS(%esi) +#define OLD_C 32 + STACK + ARGS(%esi) +#define OLD_LDC 36 + STACK + ARGS(%esi) +#define OLD_OFFT 40 + STACK + ARGS(%esi) + +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHSIZE (8 * 10 + 4) +#endif + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movl OLD_M, %ebx + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + movd OLD_OFFT, %mm4 + + movl OLD_B, B + movl OLD_C, %ebx + + movl %ebx, C + movl OLD_LDC, LDC + + movd %mm4, OFFSET + movd %mm4, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_2 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B + leal (BB, %eax, 8), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1, %eax + jle .L05 + ALIGN_4 + +.L02: +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + movq 4 * SIZE(B), %mm4 + movq 5 * SIZE(B), %mm5 + movq 6 * SIZE(B), %mm6 + movq 7 * SIZE(B), %mm7 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm2, 5 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + movq %mm3, 7 * SIZE(BB) + + movq %mm4, 8 * SIZE(BB) + movq %mm4, 9 * SIZE(BB) + movq %mm5, 10 * SIZE(BB) + movq %mm5, 11 * SIZE(BB) + movq %mm6, 12 * SIZE(BB) + movq %mm6, 13 * SIZE(BB) + movq %mm7, 14 * SIZE(BB) + movq %mm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L02 + ALIGN_2 + +.L05: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1, %eax + BRANCH + jle .L10 + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm2, 5 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + movq %mm3, 7 * SIZE(BB) + + addl $4 * SIZE, B + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L20 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $3 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movlpd 0 * SIZE(AA), %xmm0 + movlpd 4 * SIZE(AA), %xmm1 + movlpd 0 * SIZE(BB), %xmm2 + movlpd 8 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movlpd 2 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movlpd 4 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movlpd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movlpd 1 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm4 + movlpd 10 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm5 + movlpd 12 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movlpd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movlpd 2 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movlpd 18 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movlpd 20 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 22 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movlpd 32 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movlpd 3 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm4 + movlpd 26 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm5 + movlpd 28 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + mulsd 30 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movlpd 40 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movlpd 8 * SIZE(AA), %xmm0 +#if defined(OPTERON) || defined(BARCELONA) + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) +#endif + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm4 + movlpd 34 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm5 + movlpd 36 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + mulsd 38 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movlpd 48 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movlpd 5 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm4 + movlpd 42 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm5 + movlpd 44 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + mulsd 46 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movlpd 56 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm7 + movlpd 6 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm4 + movlpd 50 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm5 + movlpd 52 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + mulsd 54 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movlpd 64 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movlpd 7 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm4 + movlpd 58 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm5 + movlpd 60 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + mulsd 62 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movlpd 72 * SIZE(BB), %xmm3 + addl $64 * SIZE, BB + addsd %xmm1, %xmm7 + movlpd 12 * SIZE(AA), %xmm1 + addl $8 * SIZE, AA + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + +.L26: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movlpd 2 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movlpd 4 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movlpd 8 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movlpd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 4), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm5, %xmm4 + unpcklpd %xmm7, %xmm6 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm5 + + subpd %xmm4, %xmm2 + subpd %xmm6, %xmm5 +#else + movlpd 0 * SIZE(AA), %xmm0 + movlpd 1 * SIZE(AA), %xmm1 + movlpd 2 * SIZE(AA), %xmm2 + movlpd 3 * SIZE(AA), %xmm3 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 + subsd %xmm6, %xmm2 + subsd %xmm7, %xmm3 +#endif + +#ifdef LN + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 + movlpd 1 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + movlpd 2 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm2 + movlpd 3 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm3 + + movlpd 5 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm1 + movlpd 6 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm2 + movlpd 7 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm3 + + movlpd 10 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm2 + movlpd 11 * SIZE(B), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm3 + + movlpd 15 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm3 +#endif + +#ifdef RT + movlpd 15 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm3 + movlpd 14 * SIZE(B), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm2 + movlpd 13 * SIZE(B), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm1 + movlpd 12 * SIZE(B), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm0 + + movlpd 10 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm2 + movlpd 9 * SIZE(B), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm1 + movlpd 8 * SIZE(B), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm0 + + movlpd 5 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm1 + movlpd 4 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movlpd %xmm5, 4 * SIZE(BB) + movlpd %xmm5, 5 * SIZE(BB) + movhpd %xmm5, 6 * SIZE(BB) + movhpd %xmm5, 7 * SIZE(BB) +#else + movlpd %xmm0, 0 * SIZE(AA) + movlpd %xmm1, 1 * SIZE(AA) + movlpd %xmm2, 2 * SIZE(AA) + movlpd %xmm3, 3 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movlpd %xmm5, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) + movlpd %xmm2, 0 * SIZE(CO1, LDC, 2) + movlpd %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA,%eax, SIZE), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L20: + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L29 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $3 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifdef LN + prefetchw -2 * SIZE(CO1) + prefetchw -2 * SIZE(CO1, LDC) + prefetchw -2 * SIZE(CO1, LDC, 2) + prefetchw -2 * SIZE(CO1, %eax) +#else + prefetchw 1 * SIZE(CO1) + prefetchw 1 * SIZE(CO1, LDC) + prefetchw 1 * SIZE(CO1, LDC, 2) + prefetchw 1 * SIZE(CO1, %eax) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + +#if 1 + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) + + addl $128 * 4 * SIZE, BB + addl $128 * 1 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB + ALIGN_4 +#else + + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addl $64 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 +#endif + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 8 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm5 + movapd 4 * SIZE(B), %xmm3 + movapd 6 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm6, %xmm5 + subpd %xmm0, %xmm3 + subpd %xmm1, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + movapd 4 * SIZE(AA), %xmm2 + movapd 6 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 + subpd %xmm6, %xmm2 + subpd %xmm7, %xmm3 +#endif + +#ifdef LN + movlpd 3 * SIZE(AA), %xmm4 + movhpd 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 + + movlpd 2 * SIZE(AA), %xmm4 + movhpd 2 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + mulpd %xmm7, %xmm6 + subpd %xmm6, %xmm5 + + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + + movlpd 1 * SIZE(AA), %xmm4 + movhpd 1 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + mulpd %xmm5, %xmm6 + subpd %xmm6, %xmm7 + + movlpd 3 * SIZE(AA), %xmm4 + movhpd 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + movlpd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + movlpd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm2 + movlpd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 5 * SIZE(B), %xmm4 + movhpd 5 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + movlpd 6 * SIZE(B), %xmm4 + movhpd 6 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm2 + movlpd 7 * SIZE(B), %xmm4 + movhpd 7 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 10 * SIZE(B), %xmm4 + movhpd 10 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + movlpd 11 * SIZE(B), %xmm4 + movhpd 11 * SIZE(B), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 15 * SIZE(B), %xmm4 + movhpd 15 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movlpd 15 * SIZE(B), %xmm4 + movhpd 15 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm3 + movlpd 14 * SIZE(B), %xmm4 + movhpd 14 * SIZE(B), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + movlpd 13 * SIZE(B), %xmm4 + movhpd 13 * SIZE(B), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm1 + movlpd 12 * SIZE(B), %xmm4 + movhpd 12 * SIZE(B), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm0 + + movlpd 10 * SIZE(B), %xmm4 + movhpd 10 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + movlpd 9 * SIZE(B), %xmm4 + movhpd 9 * SIZE(B), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm1 + movlpd 8 * SIZE(B), %xmm4 + movhpd 8 * SIZE(B), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + + movlpd 5 * SIZE(B), %xmm4 + movhpd 5 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + movlpd 4 * SIZE(B), %xmm4 + movhpd 4 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + movapd %xmm3, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movlpd %xmm5, 4 * SIZE(BB) + movlpd %xmm5, 5 * SIZE(BB) + movhpd %xmm5, 6 * SIZE(BB) + movhpd %xmm5, 7 * SIZE(BB) + movlpd %xmm3, 8 * SIZE(BB) + movlpd %xmm3, 9 * SIZE(BB) + movhpd %xmm3, 10 * SIZE(BB) + movhpd %xmm3, 11 * SIZE(BB) + movlpd %xmm7, 12 * SIZE(BB) + movlpd %xmm7, 13 * SIZE(BB) + movhpd %xmm7, 14 * SIZE(BB) + movhpd %xmm7, 15 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) + movapd %xmm2, 4 * SIZE(AA) + movapd %xmm3, 6 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movlpd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) + movlpd %xmm5, 0 * SIZE(CO1, LDC, 2) + movlpd %xmm7, 1 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm7, 1 * SIZE(CO1, %eax, 1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) + movlpd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) + movlpd %xmm3, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm3, 1 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L30: + testl $2, N + je .L60 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L35 + ALIGN_4 + +.L32: +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + movq 4 * SIZE(B), %mm4 + movq 5 * SIZE(B), %mm5 + movq 6 * SIZE(B), %mm6 + movq 7 * SIZE(B), %mm7 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm2, 5 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + movq %mm3, 7 * SIZE(BB) + + movq %mm4, 8 * SIZE(BB) + movq %mm4, 9 * SIZE(BB) + movq %mm5, 10 * SIZE(BB) + movq %mm5, 11 * SIZE(BB) + movq %mm6, 12 * SIZE(BB) + movq %mm6, 13 * SIZE(BB) + movq %mm7, 14 * SIZE(BB) + movq %mm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L32 + ALIGN_2 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L40 + ALIGN_2 + +.L36: + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + + addl $2 * SIZE, B + addl $4 * SIZE, BB + decl %eax + jne .L36 + ALIGN_4 + +.L40: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L50 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movlpd 0 * SIZE(AA), %xmm0 + movlpd 4 * SIZE(AA), %xmm1 + movlpd 0 * SIZE(BB), %xmm2 + movlpd 8 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulsd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movlpd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movlpd 1 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movlpd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movlpd 2 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm3 + mulsd 10 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movlpd 12 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movlpd 3 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movlpd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movlpd 8 * SIZE(AA), %xmm0 + + mulsd %xmm1, %xmm2 + mulsd 18 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movlpd 20 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movlpd 5 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm2 + mulsd 22 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movlpd 32 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movlpd 6 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm3 + mulsd 26 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movlpd 28 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movlpd 7 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm3 + mulsd 30 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movlpd 40 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm7 + movlpd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + +.L56: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movlpd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movlpd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm5, %xmm4 + + movapd 0 * SIZE(B), %xmm2 + + subpd %xmm4, %xmm2 +#else + movlpd 0 * SIZE(AA), %xmm0 + movlpd 1 * SIZE(AA), %xmm1 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 +#endif + +#ifdef LN + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 + movlpd 1 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movlpd 3 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RT + movlpd 3 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm1 + movlpd 2 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) +#else + movlpd %xmm0, 0 * SIZE(AA) + movlpd %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA,%eax, SIZE), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L50: + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L59 + ALIGN_4 + +.L41: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + +#ifdef LN + prefetchw -2 * SIZE(CO1) + prefetchw -2 * SIZE(CO1, LDC) +#else + prefetchw 1 * SIZE(CO1) + prefetchw 1 * SIZE(CO1, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + mulpd %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) +#endif + mulpd %xmm1, %xmm2 + mulpd 18 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movapd 10 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm2 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movapd 12 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm3 + mulpd 26 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 14 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm3 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movlpd 3 * SIZE(AA), %xmm4 + movhpd 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + + movlpd 2 * SIZE(AA), %xmm4 + movhpd 2 * SIZE(AA), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + + movlpd 1 * SIZE(AA), %xmm4 + movhpd 1 * SIZE(AA), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 3 * SIZE(AA), %xmm4 + movhpd 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + movlpd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + + movlpd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movlpd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + movlpd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movlpd %xmm3, 4 * SIZE(BB) + movlpd %xmm3, 5 * SIZE(BB) + movhpd %xmm3, 6 * SIZE(BB) + movhpd %xmm3, 7 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movlpd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L41 + ALIGN_4 + +.L59: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L60: + testl $1, N + je .L999 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L65 + ALIGN_4 + +.L62: +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + movq 4 * SIZE(B), %mm4 + movq 5 * SIZE(B), %mm5 + movq 6 * SIZE(B), %mm6 + movq 7 * SIZE(B), %mm7 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm2, 5 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + movq %mm3, 7 * SIZE(BB) + + movq %mm4, 8 * SIZE(BB) + movq %mm4, 9 * SIZE(BB) + movq %mm5, 10 * SIZE(BB) + movq %mm5, 11 * SIZE(BB) + movq %mm6, 12 * SIZE(BB) + movq %mm6, 13 * SIZE(BB) + movq %mm7, 14 * SIZE(BB) + movq %mm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L62 + ALIGN_2 + +.L65: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L70 + ALIGN_2 + +.L66: + movq 0 * SIZE(B), %mm0 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + + addl $1 * SIZE, B + addl $2 * SIZE, BB + decl %eax + jne .L66 + ALIGN_4 + +.L70: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L80 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movlpd 0 * SIZE(AA), %xmm0 + movlpd 4 * SIZE(AA), %xmm1 + movlpd 0 * SIZE(BB), %xmm2 + movlpd 8 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L85 + ALIGN_4 + +.L82: + mulsd %xmm0, %xmm2 + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) + movlpd 1 * SIZE(AA), %xmm0 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movlpd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movlpd 2 * SIZE(AA), %xmm0 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm6 + movlpd 3 * SIZE(AA), %xmm0 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm7 + movlpd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm3 + movlpd 5 * SIZE(AA), %xmm1 + mulsd 10 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movlpd 24 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movlpd 6 * SIZE(AA), %xmm1 + mulsd 12 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm6 + movlpd 7 * SIZE(AA), %xmm1 + mulsd 14 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm7 + movlpd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L88 + +.L86: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movlpd 2 * SIZE(BB), %xmm2 + movlpd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addsd %xmm5, %xmm4 + addsd %xmm7, %xmm6 + addsd %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + addl %eax, AA + addl %eax, B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movlpd 0 * SIZE(B), %xmm2 + subsd %xmm4, %xmm2 +#else + movlpd 0 * SIZE(AA), %xmm0 + subsd %xmm4, %xmm0 +#endif + +#ifdef LN + movlpd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) +#else + movlpd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) +#else + movlpd %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA,%eax, SIZE), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L80: + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L99 + ALIGN_4 + +.L71: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + +#ifdef LN + prefetchw -2 * SIZE(CO1) +#else + prefetchw 1 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) + movapd 16 * SIZE(BB), %xmm2 + + movapd 2 * SIZE(AA), %xmm0 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + + movapd 16 * SIZE(AA), %xmm0 + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movapd 24 * SIZE(BB), %xmm3 + + movapd 10 * SIZE(AA), %xmm1 + mulpd 10 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(AA), %xmm0 + movapd 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm2 + + subpd %xmm4, %xmm2 +#else + movapd 0 * SIZE(AA), %xmm0 + + subpd %xmm4, %xmm0 +#endif + +#ifdef LN + movapd %xmm2, %xmm3 + unpckhpd %xmm3, %xmm3 + + movlpd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + movlpd 2 * SIZE(AA), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm2 + + movlpd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm3, %xmm2 +#endif + +#ifdef LT + movapd %xmm2, %xmm3 + unpckhpd %xmm3, %xmm3 + + movlpd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movlpd 1 * SIZE(AA), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm3 + + movlpd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + unpcklpd %xmm3, %xmm2 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L71 + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (B,%eax, SIZE), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LN_2x4_sse3.S b/kernel/x86/trsm_kernel_LN_2x4_sse3.S new file mode 100644 index 0000000..5ab4ab3 --- /dev/null +++ b/kernel/x86/trsm_kernel_LN_2x4_sse3.S @@ -0,0 +1,2031 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#ifdef PENTIUM4 +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef PENTIUMM +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_2 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L20 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $4, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 1 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movddup 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 3 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movddup 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 18 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 5 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 22 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movddup 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 26 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 7 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 30 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movddup 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd 34 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 36 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 9 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 38 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 48 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movddup 10 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 42 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 44 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 11 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 46 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 56 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movddup 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 50 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 52 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 13 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 54 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 64 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movddup 14 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 58 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 60 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 15 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 62 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 72 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movddup 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $15, %eax # if (k & 1) + BRANCH + je .L28 + +.L26: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BB), %xmm0 + movapd 2 * SIZE(BB), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#else + movapd 0 * SIZE(AA), %xmm1 + movapd 2 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm1 + subpd %xmm5, %xmm3 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 + movapd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm3 +#endif + +#ifdef LN + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef LT + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + movsd 2 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm2 + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm3 + + movsd 5 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd 6 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm2 + movsd 7 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm3 + + movsd 10 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm2 + movsd 11 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm3 + + movsd 15 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm3 +#endif + +#ifdef RT + movsd 15 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm3 + movsd 14 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm2 + movsd 13 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm1 + movsd 12 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm0 + + movsd 10 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm2 + movsd 9 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm1 + movsd 8 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm0 + + movsd 5 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd 4 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, 0 * SIZE(BB) + movapd %xmm1, 2 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm1, 1 * SIZE(AA) + movsd %xmm2, 2 * SIZE(AA) + movsd %xmm3, 3 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm1, 0 * SIZE(CO1, %eax, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L20: + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L29 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + +#ifdef LN + prefetchnta -2 * SIZE(CO1) + prefetchnta -2 * SIZE(CO1, LDC, 1) + prefetchnta -2 * SIZE(CO1, LDC, 2) + prefetchnta -2 * SIZE(CO1, %eax, 1) +#else + prefetchnta 2 * SIZE(CO1) + prefetchnta 2 * SIZE(CO1, LDC, 1) + prefetchnta 2 * SIZE(CO1, LDC, 2) + prefetchnta 2 * SIZE(CO1, %eax, 1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 5 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 6 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 7 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 16 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm4 + movddup 9 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm5 + movddup 10 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm6 + movddup 11 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + movapd 6 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm4 + movddup 13 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm5 + movddup 14 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm6 + movddup 15 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movddup 24 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm4 + movddup 17 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm5 + movddup 18 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm6 + movddup 19 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movddup 20 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm4 + movddup 21 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm5 + movddup 22 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm6 + movddup 23 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movddup 32 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 25 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movddup 26 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 27 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 14 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 28 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 29 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movddup 30 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 31 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 40 * SIZE(BB), %xmm3 + + addl $32 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd 0 * SIZE(BB), %xmm2 + movapd 2 * SIZE(BB), %xmm5 + movapd 4 * SIZE(BB), %xmm3 + movapd 6 * SIZE(BB), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm6, %xmm5 + subpd %xmm0, %xmm3 + subpd %xmm1, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + movapd 4 * SIZE(AA), %xmm2 + movapd 6 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 + subpd %xmm6, %xmm2 + subpd %xmm7, %xmm3 +#endif + +#ifdef LN + movddup 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 + + movddup 2 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + mulpd %xmm7, %xmm6 + subpd %xmm6, %xmm5 + + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + +#endif + +#ifdef LT + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + + movddup 1 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + mulpd %xmm5, %xmm6 + subpd %xmm6, %xmm7 + + movddup 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 +#endif + +#ifdef RN + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 + movddup 1 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + movddup 2 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm2 + movddup 3 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm3 + + movddup 5 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + movddup 6 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm2 + movddup 7 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm3 + + movddup 10 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm2 + movddup 11 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movddup 15 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movddup 15 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm3 + movddup 14 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + movddup 13 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm1 + movddup 12 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm0 + + movddup 10 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm2 + movddup 9 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm1 + movddup 8 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + + movddup 5 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + movddup 4 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BB) + movapd %xmm5, 2 * SIZE(BB) + movapd %xmm3, 4 * SIZE(BB) + movapd %xmm7, 6 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) + movapd %xmm2, 4 * SIZE(AA) + movapd %xmm3, 6 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) + movsd %xmm5, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm7, 1 * SIZE(CO1, %eax, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm3, 1 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L10 + ALIGN_4 + +.L30: + testl $2, N + je .L60 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L50 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $4, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + movddup 1 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 2 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movddup 3 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movddup 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movddup 5 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm0 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 6 * SIZE(AA), %xmm0 + mulpd 12 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movddup 7 * SIZE(AA), %xmm0 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movddup 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + movddup 9 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm4 + mulpd 18 * SIZE(BB), %xmm1 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 10 * SIZE(AA), %xmm1 + mulpd 20 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movddup 11 * SIZE(AA), %xmm1 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movddup 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + movddup 13 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 26 * SIZE(BB), %xmm1 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 14 * SIZE(AA), %xmm1 + mulpd 28 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movddup 15 * SIZE(AA), %xmm1 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movddup 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $15, %eax # if (k & 1) + BRANCH + je .L58 + +.L56: + mulpd %xmm0, %xmm2 + movddup 1 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BB), %xmm0 + + subpd %xmm4, %xmm0 +#else + movapd 0 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm1 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 +#endif + +#ifdef LN + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef LT + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 1 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RT + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 2 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, 0 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L50: + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L59 + ALIGN_4 + +.L41: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchnta -2 * SIZE(CO1) + prefetchnta -2 * SIZE(CO1, LDC, 1) +#else + prefetchnta 2 * SIZE(CO1) + prefetchnta 2 * SIZE(CO1, LDC, 1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 5 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 6 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 6 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 7 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 16 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 9 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm5 + movddup 10 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 11 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 13 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 14 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm5 + movddup 14 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 15 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 24 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd 0 * SIZE(BB), %xmm2 + movapd 2 * SIZE(BB), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movddup 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + + movddup 2 * SIZE(AA), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + +#endif + +#ifdef LT + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + + movddup 1 * SIZE(AA), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movddup 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RN + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 + + movddup 1 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + + movddup 3 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movddup 3 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + + movddup 2 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BB) + movapd %xmm3, 2 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L41 + ALIGN_4 + +.L59: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L60: + testl $1, N + je .L999 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L80 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + movhpd 1 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd 8 * SIZE(AA), %xmm1 + movhpd 9 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + movhpd 1 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movsd 8 * SIZE(BB), %xmm3 + movhpd 9 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $4, %eax + je .L85 + ALIGN_4 + +.L82: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm1 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $15, %eax # if (k & 1) + BRANCH + je .L88 + +.L86: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + + haddpd %xmm4, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + subsd %xmm4, %xmm0 +#else + movsd 0 * SIZE(AA), %xmm0 + subsd %xmm4, %xmm0 +#endif + +#ifdef LN + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RT + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) +#else + movsd %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + addl %eax, BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + + +.L80: + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L89 + ALIGN_4 + +.L71: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 4 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchnta -2 * SIZE(CO1) +#else + prefetchnta 2 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm2, %xmm0 + movddup 1 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm4 + movapd 16 * SIZE(AA), %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd 4 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd 6 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm7 + movddup 8 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm1 + movddup 5 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm4 + movapd 24 * SIZE(AA), %xmm1 + mulpd 10 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm5 + movddup 6 * SIZE(BB), %xmm3 + mulpd 12 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm6 + movddup 7 * SIZE(BB), %xmm3 + mulpd 14 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + mulpd %xmm2, %xmm0 + movddup 1 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm4 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BB), %xmm1 + + subpd %xmm4, %xmm1 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 +#else + movapd 0 * SIZE(AA), %xmm0 + + subpd %xmm4, %xmm0 +#endif + +#ifdef LN + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 2 * SIZE(AA), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 1 * SIZE(AA), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RN + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RT + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm1, 1 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L71 + ALIGN_4 + +.L89: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LN_4x2_core2.S b/kernel/x86/trsm_kernel_LN_4x2_core2.S new file mode 100644 index 0000000..d974fa6 --- /dev/null +++ b/kernel/x86/trsm_kernel_LN_4x2_core2.S @@ -0,0 +1,2100 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE2) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA 16 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC + movd STACK_OFFT, %mm4 + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK + movd %mm4, OFFSET + movd %mm4, KK + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + + sall $BASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $1, %eax + movl %eax, J + jle .L100 + ALIGN_2 + +.L01: +/* Copying to Sub Buffer */ +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal 16 * SIZE + BUFFER, BB + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + ALIGN_2 + +.L02: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + movapd %xmm2, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + + addl $2 * SIZE, B + addl $4 * SIZE, %ecx + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 # coffset = c +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + testl $1, %ebx + jle .L30 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movsd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -12 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movsd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L52 + +.L51: + mulsd %xmm0, %xmm1 + mulsd -14 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm4 + movsd -12 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm5 + movsd -15 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm1 + mulsd -10 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm6 + movsd 0 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm7 + movsd -14 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd -6 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd -4 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd -13 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd -2 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movsd 8 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movsd -8 * SIZE(AA), %xmm0 + mulsd %xmm2, %xmm1 + mulsd 2 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm4 + movsd 4 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm5 + movsd -11 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm1 + mulsd 6 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm6 + movsd 16 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm7 + movsd -10 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm3 + mulsd 10 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm4 + movsd 12 * SIZE(BB), %xmm3 + addsd %xmm2, %xmm5 + movsd -9 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm3 + mulsd 14 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm6 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm2, %xmm7 + movsd -4 * SIZE(AA), %xmm2 + + subl $-8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L51 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulsd %xmm0, %xmm1 + mulsd -14 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm4 + movsd -12 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm5 + movsd -15 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + BRANCH + jg .L53 + ALIGN_4 + +.L54: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm0 + movsd -15 * SIZE(B), %xmm1 +#else + movsd -16 * SIZE(AA), %xmm0 + movsd -15 * SIZE(AA), %xmm1 +#endif + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm0 + mulsd %xmm2, %xmm1 +#endif + +#ifdef RN + mulsd -16 * SIZE(B), %xmm0 + movsd -15 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + mulsd -13 * SIZE(B), %xmm1 +#endif + +#ifdef RT + mulsd -13 * SIZE(B), %xmm1 + movsd -14 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + mulsd -16 * SIZE(B), %xmm0 +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC) + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(B) + movsd %xmm1, -15 * SIZE(B) + + movsd %xmm0, -16 * SIZE(BB) + movsd %xmm0, -15 * SIZE(BB) + movsd %xmm1, -14 * SIZE(BB) + movsd %xmm1, -13 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) + movsd %xmm1, -15 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L30: + movl M, %ebx + testl $2, %ebx + jle .L50 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movapd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L32 + +.L31: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm6 + movapd 0 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm7 + movapd -12 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -6 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd -4 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -2 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 0 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm1 + mulpd 2 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm4 + movapd 4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm5 + movapd -6 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 6 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm6 + movapd 16 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm7 + movapd -4 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 10 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm5 + movapd -2 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 14 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm7 + movapd 8 * SIZE(AA), %xmm2 + + subl $-16 * SIZE, AA + addl $ 32 * SIZE, BB + decl %eax + jne .L31 + +.L32: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L34 + +.L33: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L33 + ALIGN_4 + +.L34: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd -16 * SIZE(B), %xmm2 + movapd -14 * SIZE(B), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movddup -13 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movddup -14 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm2 + movddup -16 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + movddup -15 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm3 + movddup -13 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + movddup -15 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + movddup -13 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movddup -13 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + movddup -14 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC) + movhpd %xmm3, 1 * SIZE(CO1, LDC) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC) + movhpd %xmm1, 1 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, -16 * SIZE(B) + movapd %xmm3, -14 * SIZE(B) + + movddup %xmm2, %xmm0 + movddup %xmm3, %xmm1 + + unpckhpd %xmm2, %xmm2 + unpckhpd %xmm3, %xmm3 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm2, -14 * SIZE(BB) + movapd %xmm1, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L50: + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L99 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm3 + pxor %xmm6, %xmm6 +#ifdef LN + prefetcht2 -3 * SIZE(CO1) + pxor %xmm7, %xmm7 + prefetcht2 -3 * SIZE(CO1, LDC) +#else + prefetcht2 3 * SIZE(CO1) + pxor %xmm7, %xmm7 + prefetcht2 3 * SIZE(CO1, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd -12 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd -8 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd -6 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd -6 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + movapd -4 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd -4 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd -2 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd -2 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + movapd 8 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + movapd 0 * SIZE(BB), %xmm1 + + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd 4 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd 6 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd 8 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd 10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd 10 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm2, %xmm6 + movapd 12 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm7 + + movapd 12 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd 14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd 14 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + subl $-32 * SIZE, BB + movapd 24 * SIZE(AA), %xmm3 + subl $-32 * SIZE, AA + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + movapd -16 * SIZE(BB), %xmm1 + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + movapd -12 * SIZE(BB), %xmm1 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd -16 * SIZE(B), %xmm2 + movapd -14 * SIZE(B), %xmm3 + movapd -12 * SIZE(B), %xmm5 + movapd -10 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 + subpd %xmm6, %xmm5 + subpd %xmm1, %xmm7 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 + movapd -12 * SIZE(AA), %xmm2 + movapd -10 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm6, %xmm1 + subpd %xmm5, %xmm2 + subpd %xmm7, %xmm3 +#endif + +#ifdef LN + movddup -1 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm7 + movddup -2 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm5 + movddup -3 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm3 + movddup -4 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm2 + + movddup -6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm5 + movddup -7 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm3 + movddup -8 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm2 + + movddup -11 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movddup -12 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm2 + + movddup -16 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + movddup -15 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm3 + movddup -14 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm5 + movddup -13 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm7 + + movddup -11 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movddup -10 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm5 + movddup -9 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm7 + + movddup -6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm5 + movddup -5 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm7 + + movddup -1 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 + + movddup -15 * SIZE(B), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm2 + mulpd %xmm1, %xmm5 + subpd %xmm5, %xmm3 + + movddup -13 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movddup -13 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm3 + + movddup -14 * SIZE(B), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + mulpd %xmm3, %xmm5 + subpd %xmm5, %xmm1 + + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movsd %xmm5, 2 * SIZE(CO1) + movsd %xmm7, 3 * SIZE(CO1) + + movhpd %xmm2, 0 * SIZE(CO1, LDC) + movhpd %xmm3, 1 * SIZE(CO1, LDC) + movhpd %xmm5, 2 * SIZE(CO1, LDC) + movhpd %xmm7, 3 * SIZE(CO1, LDC) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO1, LDC) + movhpd %xmm2, 1 * SIZE(CO1, LDC) + movsd %xmm3, 2 * SIZE(CO1, LDC) + movhpd %xmm3, 3 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, -16 * SIZE(B) + movapd %xmm3, -14 * SIZE(B) + movapd %xmm5, -12 * SIZE(B) + movapd %xmm7, -10 * SIZE(B) + + movddup %xmm2, %xmm0 + movddup %xmm3, %xmm1 + movddup %xmm5, %xmm4 + movddup %xmm7, %xmm6 + + unpckhpd %xmm2, %xmm2 + unpckhpd %xmm3, %xmm3 + unpckhpd %xmm5, %xmm5 + unpckhpd %xmm7, %xmm7 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm2, -14 * SIZE(BB) + movapd %xmm1, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) + movapd %xmm2, -12 * SIZE(AA) + movapd %xmm3, -10 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L99: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_2 + +.L100: + movl N, %eax + testl $1, %eax + jle .L999 + ALIGN_2 + +.L101: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal 16 * SIZE + BUFFER, BB + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L103 + ALIGN_4 + +.L102: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + movapd %xmm2, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + BRANCH + jne .L102 + ALIGN_2 + +.L103: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L105 + ALIGN_2 + +.L104: + movddup -16 * SIZE(B), %xmm0 + + movapd %xmm0, -16 * SIZE(BB) + addl $1 * SIZE, B + addl $2 * SIZE, BB + decl %eax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 # coffset = c +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + testl $1, %ebx + jle .L130 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movsd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -8 * SIZE(BB), %xmm3 + movsd -12 * SIZE(AA), %xmm2 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L152 + +.L151: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -14 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -14 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm5 + movsd -12 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -13 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -10 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -8 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm5 + movsd -0 * SIZE(BB), %xmm1 + mulsd %xmm2, %xmm3 + movsd -11 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm4 + movsd -6 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -10 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm5 + movsd -4 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -9 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm4 + movsd -2 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -4 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm5 + movsd 8 * SIZE(BB), %xmm3 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + BRANCH + decl %eax + jne .L151 + +.L152: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L154 + +.L153: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -14 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L153 + ALIGN_4 + +.L154: + addsd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm0 +#else + movsd -16 * SIZE(AA), %xmm0 +#endif + + subsd %xmm4, %xmm0 + +#if defined(LN) || defined(LT) + mulsd -16 * SIZE(AA), %xmm0 +#endif + +#if defined(RN) || defined(RT) + mulsd -16 * SIZE(B), %xmm0 +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(B) + + movsd %xmm0, -16 * SIZE(BB) + movsd %xmm0, -15 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L130: + movl M, %ebx + testl $2, %ebx + jle .L150 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + movapd -8 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L132 + +.L131: + mulpd %xmm0, %xmm1 + movapd -14 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movapd -12 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd -10 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm2, %xmm3 + movapd -6 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm4 + movapd -6 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd -4 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm5 + movapd -4 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd -2 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm4 + movapd -2 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd 8 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + BRANCH + decl %eax + jne .L131 + +.L132: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L134 + +.L133: + mulpd %xmm0, %xmm1 + movapd -14 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L133 + ALIGN_4 + +.L134: + addpd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm0 +#else + movapd -16 * SIZE(AA), %xmm0 +#endif + + subpd %xmm4, %xmm0 + +#ifdef LN + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movsd -13 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd -14 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm2, %xmm0 +#endif + +#ifdef LT + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -15 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + movsd -13 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm2, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(B) + + movddup %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + + movapd %xmm1, -16 * SIZE(BB) + movapd %xmm0, -14 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L150: + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L159 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movapd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + +.L111: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm4 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm6 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + mulpd -10 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm5 + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm7 + movapd -12 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm2 + mulpd -6 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm4 + movapd -4 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm6 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm2 + mulpd -2 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm5 + movapd 8 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm7 + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm3, %xmm0 + mulpd 2 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm6 + movapd -6 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm0 + mulpd 6 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm5 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movapd -4 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm2 + mulpd 10 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm4 + movapd 12 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm6 + movapd -2 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm2 + mulpd 14 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm5 + movapd 24 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm7 + movapd 8 * SIZE(BB), %xmm3 + + addl $ 32 * SIZE, AA + subl $-16 * SIZE, BB + decl %eax + jne .L111 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + +.L113: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm4 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm6 + movapd -14 * SIZE(BB), %xmm1 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + subl $1, %eax + jg .L113 + ALIGN_4 + +.L114: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm0 + movapd -14 * SIZE(B), %xmm1 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 +#endif + + subpd %xmm4, %xmm0 + subpd %xmm6, %xmm1 + +#ifdef LN + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movapd %xmm1, %xmm3 + unpckhpd %xmm3, %xmm3 + + movsd -1 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + movsd -2 * SIZE(AA), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + movsd -3 * SIZE(AA), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + movsd -4 * SIZE(AA), %xmm7 + mulsd %xmm3, %xmm7 + subsd %xmm7, %xmm0 + + movsd -6 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd -7 * SIZE(AA), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm2 + movsd -8 * SIZE(AA), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + + movsd -11 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd -12 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm2, %xmm0 + unpcklpd %xmm3, %xmm1 +#endif + +#ifdef LT + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movapd %xmm1, %xmm3 + unpckhpd %xmm3, %xmm3 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -15 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + movsd -14 * SIZE(AA), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + movsd -13 * SIZE(AA), %xmm7 + mulsd %xmm0, %xmm7 + subsd %xmm7, %xmm3 + + movsd -11 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + movsd -10 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm1 + movsd -9 * SIZE(AA), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + + movsd -6 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + movsd -5 * SIZE(AA), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + + movsd -1 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + unpcklpd %xmm2, %xmm0 + unpcklpd %xmm3, %xmm1 +#endif + +#if defined(RN) || defined(RT) + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(B) + movapd %xmm1, -14 * SIZE(B) + + movddup %xmm0, %xmm2 + movddup %xmm1, %xmm3 + + unpckhpd %xmm0, %xmm0 + unpckhpd %xmm1, %xmm1 + + movapd %xmm2, -16 * SIZE(BB) + movapd %xmm0, -14 * SIZE(BB) + movapd %xmm3, -12 * SIZE(BB) + movapd %xmm1, -10 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + BRANCH + decl %ebx # i -- + jg .L110 + ALIGN_2 + +.L159: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 1), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_2 + + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LN_4x2_sse2.S b/kernel/x86/trsm_kernel_LN_4x2_sse2.S new file mode 100644 index 0000000..a1fb8a1 --- /dev/null +++ b/kernel/x86/trsm_kernel_LN_4x2_sse2.S @@ -0,0 +1,2293 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE2) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA 16 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define AA %edx +#define BB %ecx + +#define PREFETCHSIZE (8 * 4) + +#define KERNEL1(address) \ + movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL4(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL5(address) \ + movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 20 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL6(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 32 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL7(address) \ + movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 28 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 40 * SIZE + (address) * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC + movd STACK_OFFT, %mm4 + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK + movd %mm4, OFFSET + movd %mm4, KK + + sall $BASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $1, %eax # j = (n >> 1) + movl %eax, J + jle .L100 + ALIGN_2 + +.L01: +/* Copying to Sub Buffer */ +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + ALIGN_2 + +.L02: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + + addl $2 * SIZE, B + addl $4 * SIZE, %ecx + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, %esi # coffset = c +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + testl $1, %ebx + jle .L30 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA +#endif + + leal BUFFER, %ecx + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movsd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movsd 4 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L52 + +.L51: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 2 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd 10 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd 12 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd 3 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm2 + mulsd 18 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movsd 20 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movsd 5 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm2 + mulsd 22 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movsd 32 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movsd 6 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + mulsd 26 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movsd 28 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movsd 7 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + mulsd 30 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movsd 40 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movsd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + BRANCH + decl %eax + jne .L51 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L53 + ALIGN_4 + +.L54: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 +#else + movsd 0 * SIZE(AA), %xmm0 + movsd 1 * SIZE(AA), %xmm1 +#endif + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm0 + mulsd %xmm2, %xmm1 +#endif + +#ifdef RN + mulsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + mulsd 3 * SIZE(B), %xmm1 +#endif + +#ifdef RT + mulsd 3 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + mulsd 0 * SIZE(B), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(B) + movsd %xmm1, 1 * SIZE(B) + + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm0, 1 * SIZE(BB) + movsd %xmm1, 2 * SIZE(BB) + movsd %xmm1, 3 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, %esi +#endif + + movsd %xmm0, 0 * SIZE(%esi) + movsd %xmm1, 0 * SIZE(%esi, LDC) + +#ifndef LN + addl $1 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L30: + movl M, %ebx + testl $2, %ebx + jle .L50 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L32 + +.L31: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd 18 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movapd 10 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movapd 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 26 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 14 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + BRANCH + decl %eax + jne .L31 + +.L32: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L34 + +.L33: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L33 + ALIGN_4 + +.L34: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movsd 3 * SIZE(AA), %xmm0 + movhpd 3 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + + movsd 2 * SIZE(AA), %xmm0 + movhpd 2 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm2 + + movsd 0 * SIZE(AA), %xmm0 + movhpd 0 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm0 + movhpd 0 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + movhpd 1 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm3 + movsd 3 * SIZE(AA), %xmm0 + movhpd 3 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RN + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + + movsd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + + movsd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movsd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + + movsd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + + movsd %xmm2, 0 * SIZE(BB) + movsd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movsd %xmm3, 4 * SIZE(BB) + movsd %xmm3, 5 * SIZE(BB) + movhpd %xmm3, 6 * SIZE(BB) + movhpd %xmm3, 7 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, %esi +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(%esi) + movsd %xmm3, 1 * SIZE(%esi) + movhpd %xmm2, 0 * SIZE(%esi, LDC) + movhpd %xmm3, 1 * SIZE(%esi, LDC) +#else + movsd %xmm0, 0 * SIZE(%esi) + movhpd %xmm0, 1 * SIZE(%esi) + movsd %xmm1, 0 * SIZE(%esi, LDC) + movhpd %xmm1, 1 * SIZE(%esi, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L50: + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L99 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetcht2 -4 * SIZE(%esi) + prefetcht2 -4 * SIZE(%esi, LDC) +#else + prefetcht2 4 * SIZE(%esi) + prefetcht2 4 * SIZE(%esi, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + +#ifdef PENTIUM4 + andl $-8, %eax + NOBRANCH + je .L12 + sall $3, %eax + +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $64 * 1, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $64 * 2, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $64 * 3, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $64 * 4, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $64 * 5, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $64 * 6, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $64 * 7, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $64 * 4 * SIZE, AA + addl $64 * 4 * SIZE, BB + subl $64 * 8, %eax + BRANCH + jg .L1X + +.L11: + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB + +#else + sarl $3, %eax + je .L12 + +.L11: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + + addl $32 * SIZE, %ecx + addl $32 * SIZE, %edx + decl %eax + jne .L11 +#endif + +.L12: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + + je .L14 + +.L13: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 0 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L13 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm3 + movapd 4 * SIZE(B), %xmm5 + movapd 6 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 + subpd %xmm6, %xmm5 + subpd %xmm1, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + movapd 4 * SIZE(AA), %xmm2 + movapd 6 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm6, %xmm1 + subpd %xmm5, %xmm2 + subpd %xmm7, %xmm3 +#endif + +#ifdef LN + movsd 15 * SIZE(AA), %xmm0 + movhpd 15 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm7 + movsd 14 * SIZE(AA), %xmm0 + movhpd 14 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm5 + movsd 13 * SIZE(AA), %xmm0 + movhpd 13 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm3 + movsd 12 * SIZE(AA), %xmm0 + movhpd 12 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm2 + + movsd 10 * SIZE(AA), %xmm0 + movhpd 10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm5 + movsd 9 * SIZE(AA), %xmm0 + movhpd 9 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm3 + movsd 8 * SIZE(AA), %xmm0 + movhpd 8 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm2 + + movsd 5 * SIZE(AA), %xmm0 + movhpd 5 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + movhpd 4 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm2 + + movsd 0 * SIZE(AA), %xmm0 + movhpd 0 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm0 + movhpd 0 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + movhpd 1 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm3 + movsd 2 * SIZE(AA), %xmm0 + movhpd 2 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm5 + movsd 3 * SIZE(AA), %xmm0 + movhpd 3 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm7 + + movsd 5 * SIZE(AA), %xmm0 + movhpd 5 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + + movsd 6 * SIZE(AA), %xmm0 + movhpd 6 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm5 + movsd 7 * SIZE(AA), %xmm0 + movhpd 7 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm7 + + movsd 10 * SIZE(AA), %xmm0 + movhpd 10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm5 + movsd 11 * SIZE(AA), %xmm0 + movhpd 11 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm7 + + movsd 15 * SIZE(AA), %xmm0 + movhpd 15 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm7 +#endif + +#ifdef RN + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 + + movsd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm2 + movsd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm3 + + movsd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movsd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm3 + + movsd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + movsd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm1 + + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + movapd %xmm5, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + + movsd %xmm2, 0 * SIZE(BB) + movsd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movsd %xmm3, 4 * SIZE(BB) + movsd %xmm3, 5 * SIZE(BB) + movhpd %xmm3, 6 * SIZE(BB) + movhpd %xmm3, 7 * SIZE(BB) + movsd %xmm5, 8 * SIZE(BB) + movsd %xmm5, 9 * SIZE(BB) + movhpd %xmm5, 10 * SIZE(BB) + movhpd %xmm5, 11 * SIZE(BB) + movsd %xmm7, 12 * SIZE(BB) + movsd %xmm7, 13 * SIZE(BB) + movhpd %xmm7, 14 * SIZE(BB) + movhpd %xmm7, 15 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) + movapd %xmm2, 4 * SIZE(AA) + movapd %xmm3, 6 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, %esi +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(%esi) + movsd %xmm3, 1 * SIZE(%esi) + movsd %xmm5, 2 * SIZE(%esi) + movsd %xmm7, 3 * SIZE(%esi) + + movhpd %xmm2, 0 * SIZE(%esi, LDC) + movhpd %xmm3, 1 * SIZE(%esi, LDC) + movhpd %xmm5, 2 * SIZE(%esi, LDC) + movhpd %xmm7, 3 * SIZE(%esi, LDC) +#else + movsd %xmm0, 0 * SIZE(%esi) + movhpd %xmm0, 1 * SIZE(%esi) + movsd %xmm1, 2 * SIZE(%esi) + movhpd %xmm1, 3 * SIZE(%esi) + + movsd %xmm2, 0 * SIZE(%esi, LDC) + movhpd %xmm2, 1 * SIZE(%esi, LDC) + movsd %xmm3, 2 * SIZE(%esi, LDC) + movhpd %xmm3, 3 * SIZE(%esi, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L99: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_2 + +.L100: + movl N, %eax + testl $1, %eax + jle .L999 + ALIGN_2 + +.L101: +/* Copying to Sub Buffer */ +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L103 + ALIGN_4 + +.L102: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + BRANCH + jne .L102 + ALIGN_2 + +.L103: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L105 + ALIGN_2 + +.L104: + movsd 0 * SIZE(B), %xmm0 + + unpcklpd %xmm0, %xmm0 + + movapd %xmm0, 0 * SIZE(%ecx) + + addl $1 * SIZE, B + addl $2 * SIZE, %ecx + decl %eax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, %esi # coffset = c +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + testl $1, %ebx + jle .L130 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA +#endif + + leal BUFFER, BB + + movsd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movsd 4 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetcht2 -4 * SIZE(%esi) +#else + prefetcht2 4 * SIZE(%esi) +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L152 + +.L151: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + mulsd 2 * SIZE(BB), %xmm0 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + movsd 3 * SIZE(AA), %xmm0 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + movsd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm3 + movsd 5 * SIZE(AA), %xmm1 + addsd %xmm3, %xmm4 + mulsd 10 * SIZE(BB), %xmm1 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm4 + movsd 6 * SIZE(AA), %xmm1 + mulsd 12 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm4 + movsd 7 * SIZE(AA), %xmm1 + mulsd 14 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm4 + movsd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $16 * SIZE, BB + BRANCH + decl %eax + jne .L151 + +.L152: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L154 + +.L153: + movsd 0 * SIZE(AA), %xmm0 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + + addl $1 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L153 + ALIGN_4 + +.L154: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm0 +#else + movsd 0 * SIZE(AA), %xmm0 +#endif + + subsd %xmm4, %xmm0 + +#if defined(LN) || defined(LT) + mulsd 0 * SIZE(AA), %xmm0 +#endif + +#if defined(RN) || defined(RT) + mulsd 0 * SIZE(B), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(B) + + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm0, 1 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, %esi +#endif + + movsd %xmm0, 0 * SIZE(%esi) + +#ifndef LN + addl $1 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L130: + movl M, %ebx + testl $2, %ebx + jle .L150 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L132 + +.L131: + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm1 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + BRANCH + decl %eax + jne .L131 + +.L132: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L134 + +.L133: + movapd 0 * SIZE(AA), %xmm0 + mulpd 0 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + + addl $2 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L133 + ALIGN_4 + +.L134: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm0 +#else + movapd 0 * SIZE(AA), %xmm0 +#endif + + subpd %xmm4, %xmm0 + +#ifdef LN + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 2 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm2, %xmm0 +#endif + +#ifdef LT + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 1 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm2, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, 0 * SIZE(B) + + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm0, 1 * SIZE(BB) + movhpd %xmm0, 2 * SIZE(BB) + movhpd %xmm0, 3 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, %esi +#endif + + movsd %xmm0, 0 * SIZE(%esi) + movhpd %xmm0, 1 * SIZE(%esi) + +#ifndef LN + addl $2 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L150: + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L159 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + +.L111: + mulpd %xmm2, %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + movapd 2 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm0 + mulpd 6 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm5 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 10 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm4 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm6 + movapd 6 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 14 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm5 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movapd 16 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm0 + mulpd 18 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm4 + movapd 20 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm6 + movapd 10 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm0 + mulpd 22 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm5 + movapd 32 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movapd 12 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm1 + mulpd 26 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm4 + movapd 28 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm6 + movapd 14 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm1 + mulpd 30 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm5 + movapd 40 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movapd 24 * SIZE(BB), %xmm3 + + addl $32 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L111 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + +.L113: + mulpd %xmm2, %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + movapd 2 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L113 + ALIGN_4 + +.L114: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm0 + movapd 2 * SIZE(B), %xmm1 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 +#endif + + subpd %xmm4, %xmm0 + subpd %xmm6, %xmm1 + +#ifdef LN + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movapd %xmm1, %xmm3 + unpckhpd %xmm3, %xmm3 + + movsd 15 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 14 * SIZE(AA), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + movsd 13 * SIZE(AA), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + movsd 12 * SIZE(AA), %xmm7 + mulsd %xmm3, %xmm7 + subsd %xmm7, %xmm0 + + movsd 10 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 9 * SIZE(AA), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm2 + movsd 8 * SIZE(AA), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + + movsd 5 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 4 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm2, %xmm0 + unpcklpd %xmm3, %xmm1 +#endif + +#ifdef LT + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movapd %xmm1, %xmm3 + unpckhpd %xmm3, %xmm3 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 1 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + movsd 2 * SIZE(AA), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + movsd 3 * SIZE(AA), %xmm7 + mulsd %xmm0, %xmm7 + subsd %xmm7, %xmm3 + + movsd 5 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 6 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm1 + movsd 7 * SIZE(AA), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + + movsd 10 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 11 * SIZE(AA), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + + movsd 15 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + unpcklpd %xmm2, %xmm0 + unpcklpd %xmm3, %xmm1 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, 0 * SIZE(B) + movapd %xmm1, 2 * SIZE(B) + + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm0, 1 * SIZE(BB) + movhpd %xmm0, 2 * SIZE(BB) + movhpd %xmm0, 3 * SIZE(BB) + movsd %xmm1, 4 * SIZE(BB) + movsd %xmm1, 5 * SIZE(BB) + movhpd %xmm1, 6 * SIZE(BB) + movhpd %xmm1, 7 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, %esi +#endif + + movsd %xmm0, 0 * SIZE(%esi) + movhpd %xmm0, 1 * SIZE(%esi) + movsd %xmm1, 2 * SIZE(%esi) + movhpd %xmm1, 3 * SIZE(%esi) + +#ifndef LN + addl $4 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + BRANCH + decl %ebx # i -- + jg .L110 + ALIGN_2 + +.L159: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 1), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_2 + + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LN_4x4_penryn.S b/kernel/x86/trsm_kernel_LN_4x4_penryn.S new file mode 100644 index 0000000..bb33918 --- /dev/null +++ b/kernel/x86/trsm_kernel_LN_4x4_penryn.S @@ -0,0 +1,3129 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 20 + STACK + ARGS(%esp) +#define ARG_B 24 + STACK + ARGS(%esp) +#define C 28 + STACK + ARGS(%esp) +#define ARG_LDC 32 + STACK + ARGS(%esp) +#define OFFSET 36 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 21 + 4) +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 21 + 4) +#endif + +#ifdef ATOM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 8 + 4) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (16 * 2) +#endif + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + leal (, LDC, SIZE), LDC + + subl $-32 * SIZE, A + subl $-32 * SIZE, B + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L40 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + testl $1, M + je .L20 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps 0 * SIZE(BB), %xmm1 + + subl $ -8 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + pshufd $0x00, %xmm0, %xmm2 + movss -31 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BB), %xmm1 + + subps %xmm4, %xmm1 +#else + movsd -32 * SIZE(AA), %xmm0 + movhps -30 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm3 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm3 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm3 + + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm0 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm0 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm0 + + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, -32 * SIZE(BB) +#else + movss %xmm0, -32 * SIZE(AA) + movss %xmm1, -31 * SIZE(AA) + movss %xmm2, -30 * SIZE(AA) + movss %xmm3, -29 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm2, 0 * SIZE(CO1, LDC, 1) + movss %xmm0, 0 * SIZE(CO1, LDC, 2) + movss %xmm6, 0 * SIZE(CO1, %eax, 1) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO1, LDC, 1) + movss %xmm2, 0 * SIZE(CO1, LDC, 2) + movss %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L20: + testl $2, M + je .L30 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + pshufd $0xee, %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + pshufd $0xee, %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + pshufd $0xee, %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + pshufd $0xee, %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps 0 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + subl $-16 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + pshufd $0x44, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), BB +#endif + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + + movhlps %xmm4, %xmm5 + movhlps %xmm6, %xmm7 + + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps -32 * SIZE(BB), %xmm1 + movaps -28 * SIZE(BB), %xmm3 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 +#else + movsd -32 * SIZE(AA), %xmm0 + movsd -30 * SIZE(AA), %xmm1 + movsd -28 * SIZE(AA), %xmm2 + movsd -26 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 + subps %xmm6, %xmm2 + subps %xmm7, %xmm3 +#endif + +#ifdef LN + movaps -32 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm3 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm3 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm3 + + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm0 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm0 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, -32 * SIZE(BB) + movaps %xmm3, -28 * SIZE(BB) +#else + movlps %xmm0, -32 * SIZE(AA) + movlps %xmm1, -30 * SIZE(AA) + movlps %xmm2, -28 * SIZE(AA) + movlps %xmm3, -26 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movlps %xmm6, 0 * SIZE(CO1, %eax, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L30: + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L39 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + leal (CO1, LDC, 2), %eax + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + +#ifdef LN + pxor %xmm4, %xmm4 + prefetcht0 -4 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 -4 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + prefetcht0 -4 * SIZE(%eax) + pxor %xmm7, %xmm7 + prefetcht0 -4 * SIZE(%eax, LDC) +#else + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 3 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + prefetcht0 3 * SIZE(%eax) + pxor %xmm7, %xmm7 + prefetcht0 3 * SIZE(%eax, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + subl $-32 * SIZE, BB + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + subl $-32 * SIZE, AA + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -32 * SIZE(AA), %xmm0 + + subl $1, %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 4), BB +#endif + + addps %xmm3, %xmm6 + addps %xmm2, %xmm7 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm4 + + movaps %xmm6, %xmm2 + unpcklps %xmm5, %xmm2 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm1 + movlhps %xmm2, %xmm0 + movhlps %xmm2, %xmm1 + + movaps %xmm6, %xmm7 + movlhps %xmm4, %xmm6 + movhlps %xmm4, %xmm7 + + pshufd $0x39, %xmm1, %xmm2 + pshufd $0x39, %xmm7, %xmm4 + + movaps -32 * SIZE(BB), %xmm1 + movaps -28 * SIZE(BB), %xmm3 + movaps -24 * SIZE(BB), %xmm5 + movaps -20 * SIZE(BB), %xmm7 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm3 + subps %xmm6, %xmm5 + subps %xmm4, %xmm7 +#else + pshufd $0x39, %xmm5, %xmm2 + pshufd $0x4e, %xmm6, %xmm0 + pshufd $0x93, %xmm7, %xmm7 + + movaps %xmm4, %xmm6 + unpcklps %xmm0, %xmm4 + unpckhps %xmm0, %xmm6 + + movaps %xmm2, %xmm1 + unpcklps %xmm7, %xmm2 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm5 + unpcklps %xmm2, %xmm4 + unpckhps %xmm2, %xmm5 + + movaps %xmm6, %xmm7 + unpcklps %xmm1, %xmm6 + unpckhps %xmm1, %xmm7 + + pshufd $0x93, %xmm5, %xmm5 + pshufd $0x4e, %xmm6, %xmm6 + pshufd $0x39, %xmm7, %xmm7 + + movaps -32 * SIZE(AA), %xmm0 + movaps -28 * SIZE(AA), %xmm1 + movaps -24 * SIZE(AA), %xmm2 + movaps -20 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 + subps %xmm6, %xmm2 + subps %xmm7, %xmm3 +#endif + +#ifdef LN + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm1 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm1 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm7 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm7 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm7 + + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm3 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm3 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm3 + + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm0 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm0 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, -32 * SIZE(BB) + movaps %xmm3, -28 * SIZE(BB) + movaps %xmm5, -24 * SIZE(BB) + movaps %xmm7, -20 * SIZE(BB) +#else + movaps %xmm0, -32 * SIZE(AA) + movaps %xmm1, -28 * SIZE(AA) + movaps %xmm2, -24 * SIZE(AA) + movaps %xmm3, -20 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movhps %xmm2, 2 * SIZE(CO1, LDC, 1) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 2 * SIZE(CO1, LDC, 2) + movlps %xmm6, 0 * SIZE(CO1, %eax, 1) + movhps %xmm6, 2 * SIZE(CO1, %eax, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 2 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO1, %eax, 1) + movhps %xmm3, 2 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $4, KK +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L39: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L10 + ALIGN_4 + +.L40: + testl $2, N + je .L80 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + testl $1, M + je .L60 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd -32 * SIZE(BB), %xmm1 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -28 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -26 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -24 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -22 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -20 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -18 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -16 * SIZE(BB), %xmm1 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + pshufd $0x00, %xmm0, %xmm2 + movss -31 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + + addps %xmm5, %xmm4 + + pshufd $0x55, %xmm4, %xmm5 + pshufd $0x00, %xmm4, %xmm4 + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm4 + + movsd -32 * SIZE(BB), %xmm1 + + subps %xmm4, %xmm1 +#else + movss -32 * SIZE(AA), %xmm0 + movss -31 * SIZE(AA), %xmm1 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movss -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, -32 * SIZE(BB) +#else + movss %xmm0, -32 * SIZE(AA) + movss %xmm1, -31 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + pshufd $1, %xmm1, %xmm3 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm3, 0 * SIZE(CO1, LDC) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L60: + testl $2, M + je .L70 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm3, %xmm3 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + pshufd $0x44, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + + addps %xmm3, %xmm4 + addps %xmm5, %xmm4 + + movhlps %xmm4, %xmm5 + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movsd -32 * SIZE(BB), %xmm1 + movsd -30 * SIZE(BB), %xmm3 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 +#else + movsd -32 * SIZE(AA), %xmm0 + movsd -30 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 +#endif + +#ifdef LN + movaps -32 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, -32 * SIZE(BB) + movlps %xmm3, -30 * SIZE(BB) +#else + movlps %xmm0, -32 * SIZE(AA) + movlps %xmm1, -30 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm3, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 0 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L70: + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L79 + ALIGN_4 + +.L51: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + +#ifdef LN + pxor %xmm4, %xmm4 + prefetcht0 -4 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 -4 * SIZE(CO1, LDC) +#else + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 3 * SIZE(CO1, LDC) +#endif + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps 0 * SIZE(AA), %xmm0 + + subl $-32 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 2), BB +#endif + + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + addps %xmm2, %xmm4 + addps %xmm3, %xmm5 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movsd -32 * SIZE(BB), %xmm1 + movsd -30 * SIZE(BB), %xmm3 + movsd -28 * SIZE(BB), %xmm5 + movsd -26 * SIZE(BB), %xmm7 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 + subps %xmm0, %xmm5 + subps %xmm2, %xmm7 +#else + movaps -32 * SIZE(AA), %xmm0 + movaps -28 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 +#endif + +#ifdef LN + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm1 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm1 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm7 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm7 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm7 + + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, -32 * SIZE(BB) + movlps %xmm3, -30 * SIZE(BB) + movlps %xmm5, -28 * SIZE(BB) + movlps %xmm7, -26 * SIZE(BB) +#else + movaps %xmm0, -32 * SIZE(AA) + movaps %xmm1, -28 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm1 + unpcklps %xmm7, %xmm3 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movhps %xmm2, 2 * SIZE(CO1, LDC, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $4, KK +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L51 + ALIGN_4 + +.L79: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L80: + testl $1, N + je .L999 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + testl $1, M + je .L100 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd -32 * SIZE(BB), %xmm1 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L115 + ALIGN_4 + +.L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -28 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -28 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -26 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -26 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -24 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -24 * SIZE(BB), %xmm1 + + subl $-8 * SIZE, AA + subl $-8 * SIZE, BB + + subl $1, %eax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulss %xmm0, %xmm1 + movss -31 * SIZE(AA), %xmm0 + addss %xmm1, %xmm4 + movss -31 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L116 + ALIGN_4 + +.L118: +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + + leal (AA, %eax, SIZE), AA + leal (B, %eax, SIZE), BB +#endif + + haddps %xmm4, %xmm4 + +#if defined(LN) || defined(LT) + movss -32 * SIZE(BB), %xmm1 + subss %xmm4, %xmm1 +#else + movss -32 * SIZE(AA), %xmm0 + subss %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + mulss -32 * SIZE(AA), %xmm1 +#endif + +#if defined(RN) || defined(RT) + mulss -32 * SIZE(BB), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, -32 * SIZE(BB) +#else + movss %xmm0, -32 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) +#else + movss %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA + leal (BB, %eax, SIZE), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L100: + testl $2, M + je .L110 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm3, %xmm3 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L105 + ALIGN_4 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -22 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -26 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -20 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -18 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + subl $-16 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + pshufd $0x00, %xmm1, %xmm2 + movss -31 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L106 + ALIGN_4 + +.L108: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(LT) + pshufd $1, %xmm4, %xmm6 + + movss -32 * SIZE(BB), %xmm1 + movss -31 * SIZE(BB), %xmm3 + + subss %xmm4, %xmm1 + subss %xmm6, %xmm3 +#else + movsd -32 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movsd -32 * SIZE(AA), %xmm4 + movhps -30 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movss -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, -32 * SIZE(BB) + movss %xmm3, -31 * SIZE(BB) +#else + movlps %xmm0, -32 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) + movss %xmm3, 1 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L110: + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L119 + ALIGN_4 + +.L91: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movsd -32 * SIZE(BB), %xmm1 + + pxor %xmm4, %xmm4 +#ifdef LN + prefetcht0 -4 * SIZE(CO1) +#else + prefetcht0 3 * SIZE(CO1) +#endif + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -26 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + subl $-32 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + movss -31 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), BB +#endif + + addps %xmm2, %xmm4 + addps %xmm5, %xmm4 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movss -32 * SIZE(BB), %xmm1 + movss -31 * SIZE(BB), %xmm3 + movss -30 * SIZE(BB), %xmm5 + movss -29 * SIZE(BB), %xmm7 + + subss %xmm4, %xmm1 + subss %xmm6, %xmm3 + subss %xmm0, %xmm5 + subss %xmm2, %xmm7 +#else + movaps -32 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm1 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm1 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm1 + + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm7 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm7 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm7 + + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm7 +#endif + +#if defined(RN) || defined(RT) + movss -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, -32 * SIZE(BB) + movss %xmm3, -31 * SIZE(BB) + movss %xmm5, -30 * SIZE(BB) + movss %xmm7, -29 * SIZE(BB) +#else + movaps %xmm0, -32 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm1 + unpcklps %xmm7, %xmm3 + + unpcklps %xmm3, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $4, KK +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L91 + ALIGN_4 + +.L119: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LN_4x4_sse.S b/kernel/x86/trsm_kernel_LN_4x4_sse.S new file mode 100644 index 0000000..147ed19 --- /dev/null +++ b/kernel/x86/trsm_kernel_LN_4x4_sse.S @@ -0,0 +1,3691 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 + +#define OLD_M 4 + STACK(%esi) +#define OLD_N 8 + STACK(%esi) +#define OLD_K 12 + STACK(%esi) +#define OLD_A 20 + STACK(%esi) +#define OLD_B 24 + STACK(%esi) +#define OLD_C 28 + STACK(%esi) +#define OLD_LDC 32 + STACK(%esi) +#define STACK_OFFT 36 + STACK(%esi) + +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 10 + 8) +#endif + +#if defined(PENTIUM4) || defined(PENTIUMM) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + +#if defined(OPTERON) || !defined(HAVE_SSE2) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi + + subl $128 + LOCAL_BUFFER_SIZE, %esp + andl $-1024, %esp + + STACK_TOUCHING + + movl OLD_M, %ebx + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + movss STACK_OFFT, %xmm4 + + movl OLD_B, B + movl OLD_C, %ebx + + movl %ebx, C + movl OLD_LDC, LDC + + movss %xmm4, OFFSET + movss %xmm4, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L40 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $2 + BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1, %eax + jle .L05 + ALIGN_4 + +.L02: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L05: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1, %eax + BRANCH + jle .L10 + + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + addl $4 * SIZE, B + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + testl $1, M + je .L20 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movss 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movss 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movss 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 8 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 1 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm3 + addss %xmm3, %xmm4 + movss 20 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm5 + movss 24 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + movss 36 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 40 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 44 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 3 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm3 + addss %xmm3, %xmm4 + movss 52 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm5 + movss 56 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + mulss 60 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + + mulss %xmm1, %xmm2 + addss %xmm2, %xmm4 + movss 68 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + addss %xmm2, %xmm5 + movss 72 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + mulss 76 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 96 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 5 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm3 + addss %xmm3, %xmm4 + movss 84 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + addss %xmm3, %xmm5 + movss 88 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + mulss 92 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 112 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm2 + addss %xmm2, %xmm4 + movss 100 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + addss %xmm2, %xmm5 + movss 104 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + mulss 108 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 128 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 7 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm3 + addss %xmm3, %xmm4 + movss 116 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + addss %xmm3, %xmm5 + movss 120 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + mulss 124 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 144 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 8 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 16 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 1 * SIZE(AA), %xmm0 + + addl $ 1 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (AA, %eax, SIZE), AA + + sall $2 + BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + unpcklps %xmm5, %xmm4 + + movaps 0 * SIZE(B), %xmm1 + + subps %xmm4, %xmm1 +#else + movss 0 * SIZE(AA), %xmm0 + movss 1 * SIZE(AA), %xmm1 + movss 2 * SIZE(AA), %xmm2 + movss 3 * SIZE(AA), %xmm3 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm1 + subss %xmm6, %xmm2 + subss %xmm7, %xmm3 +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm3 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm3 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm3 + + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm3 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm0 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm0 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm0 + + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) + movss %xmm1, 1 * SIZE(AA) + movss %xmm2, 2 * SIZE(AA) + movss %xmm3, 3 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm2, 0 * SIZE(CO1, LDC, 1) + movss %xmm0, 0 * SIZE(CO1, LDC, 2) + movss %xmm6, 0 * SIZE(CO1, %eax, 1) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO1, LDC, 1) + movss %xmm2, 0 * SIZE(CO1, LDC, 2) + movss %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L20: + testl $2, M + je .L30 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 68 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 72 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 76 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 96 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 84 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 88 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 92 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 112 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 100 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 104 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 108 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 128 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 116 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 120 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 124 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 144 * SIZE(BB), %xmm3 + + addl $ 16 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 16 * SIZE(BB), %xmm2 + + addl $ 2 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $1 + BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm3 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 2 * SIZE(AA), %xmm1 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 4 * SIZE(AA), %xmm2 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 6 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 + subps %xmm6, %xmm2 + subps %xmm7, %xmm3 +#endif + +#ifdef LN + movaps 0 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm3 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm3 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm3 + + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm0 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm0 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm2 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm6 + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm2, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) + movlps %xmm1, 2 * SIZE(AA) + movlps %xmm2, 4 * SIZE(AA) + movlps %xmm3, 6 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movlps %xmm6, 0 * SIZE(CO1, %eax, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L30: + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L39 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + + PREFETCHW -4 * SIZE(CO1) + PREFETCHW -4 * SIZE(CO1, LDC) + PREFETCHW -4 * SIZE(CO1, LDC, 2) + PREFETCHW -4 * SIZE(CO1, %eax) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(0 * 16) + KERNEL2(0 * 16) + KERNEL3(0 * 16) + KERNEL4(0 * 16) + KERNEL5(0 * 16) + KERNEL6(0 * 16) + KERNEL7(0 * 16) + KERNEL8(0 * 16) + + addl $128 * SIZE, BB + addl $32 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $2 + BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm3 + movaps 8 * SIZE(B), %xmm5 + movaps 12 * SIZE(B), %xmm7 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 + subps %xmm0, %xmm5 + subps %xmm2, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm1 + movaps 8 * SIZE(AA), %xmm2 + movaps 12 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 + subps %xmm6, %xmm2 + subps %xmm7, %xmm3 +#endif + +#ifdef LN + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm1 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm1 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm7 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm7 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm7 + + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm3 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm3 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm3 + + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm0 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm0 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + movaps %xmm5, 8 * SIZE(B) + movaps %xmm7, 12 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm2 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm6 + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm2, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm2 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 + movaps %xmm0, 32 * SIZE(BB) + movaps %xmm2, 36 * SIZE(BB) + movaps %xmm4, 40 * SIZE(BB) + movaps %xmm6, 44 * SIZE(BB) + + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm2 + pshufd $0xaa, %xmm7, %xmm4 + pshufd $0xff, %xmm7, %xmm6 + movaps %xmm0, 48 * SIZE(BB) + movaps %xmm2, 52 * SIZE(BB) + movaps %xmm4, 56 * SIZE(BB) + movaps %xmm6, 60 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm1, 4 * SIZE(AA) + movaps %xmm2, 8 * SIZE(AA) + movaps %xmm3, 12 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movhps %xmm2, 2 * SIZE(CO1, LDC, 1) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 2 * SIZE(CO1, LDC, 2) + movlps %xmm6, 0 * SIZE(CO1, %eax, 1) + movhps %xmm6, 2 * SIZE(CO1, %eax, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 2 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO1, %eax, 1) + movhps %xmm3, 2 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $16 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L39: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L40: + testl $2, N + je .L80 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $1 + BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L45 + ALIGN_4 + +.L42: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, %ecx + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L50 + ALIGN_4 + +.L46: +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + addl $2 * SIZE, B + addl $8 * SIZE, %ecx + decl %eax + jne .L46 + ALIGN_4 + +.L50: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + testl $1, M + je .L60 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movss 0 * SIZE(AA), %xmm0 + movss 4 * SIZE(AA), %xmm1 + movss 0 * SIZE(BB), %xmm2 + movss 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulss %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 20 * SIZE(BB), %xmm0 + addss %xmm3, %xmm4 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm5 + movss 3 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + mulss 36 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 40 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 5 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm2 + mulss 44 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 52 * SIZE(BB), %xmm1 + addss %xmm3, %xmm4 + movss 56 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 7 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 60 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + + addl $ 1 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addss %xmm6, %xmm4 + addss %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm4 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 + + subps %xmm4, %xmm1 +#else + movss 0 * SIZE(AA), %xmm0 + movss 1 * SIZE(AA), %xmm1 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm1 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) + movss %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + pshufd $1, %xmm1, %xmm3 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm3, 0 * SIZE(CO1, LDC) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + + +.L60: + testl $2, M + je .L70 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 2 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 +#endif + +#ifdef LN + movaps 0 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm2 + movaps %xmm0, 8 * SIZE(BB) + movaps %xmm2, 12 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) + movlps %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm3, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 0 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L70: + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L79 + ALIGN_4 + +.L51: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + PREFETCHW -4 * SIZE(CO1) + PREFETCHW -4 * SIZE(CO1, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm3 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 32 * SIZE(AA), %xmm0 + +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + mulps %xmm1, %xmm2 + mulps 36 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 40 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 20 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm2 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm3 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $1 + BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 4 * SIZE(B), %xmm5 +#ifdef movsd + xorps %xmm7, %xmm7 +#endif + movsd 6 * SIZE(B), %xmm7 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 + subps %xmm0, %xmm5 + subps %xmm2, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 +#endif + +#ifdef LN + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm1 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm1 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm7 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm7 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm7 + + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + movlps %xmm5, 4 * SIZE(B) + movlps %xmm7, 6 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm2 + movaps %xmm0, 8 * SIZE(BB) + movaps %xmm2, 12 * SIZE(BB) + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm2 + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm2, 20 * SIZE(BB) + + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm2 + movaps %xmm0, 24 * SIZE(BB) + movaps %xmm2, 28 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm1, 4 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm1 + unpcklps %xmm7, %xmm3 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movhps %xmm2, 2 * SIZE(CO1, LDC, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L51 + ALIGN_4 + +.L79: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L80: + testl $1, N + je .L999 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L85 + ALIGN_4 + +.L82: + movsd 0 * SIZE(B), %xmm3 + movhps 2 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm7 + movhps 6 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L90 + ALIGN_4 + +.L86: + movss 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + + movaps %xmm0, 0 * SIZE(BB) + + addl $1 * SIZE, B + addl $4 * SIZE, BB + decl %eax + jne .L86 + ALIGN_4 + +.L90: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + + testl $1, M + je .L100 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movss 0 * SIZE(AA), %xmm0 + movss 4 * SIZE(AA), %xmm1 + movss 0 * SIZE(BB), %xmm2 + movss 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L115 + ALIGN_4 + +.L112: + mulss %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 32 * SIZE(BB), %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm0, %xmm5 + movss 2 * SIZE(AA), %xmm0 + mulss 8 * SIZE(BB), %xmm0 + addss %xmm0, %xmm6 + movss 3 * SIZE(AA), %xmm0 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + movss 48 * SIZE(BB), %xmm3 + mulss 20 * SIZE(BB), %xmm1 + addss %xmm1, %xmm5 + movss 6 * SIZE(AA), %xmm1 + mulss 24 * SIZE(BB), %xmm1 + addss %xmm1, %xmm6 + movss 7 * SIZE(AA), %xmm1 + mulss 28 * SIZE(BB), %xmm1 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 4 * SIZE(BB), %xmm2 + + addl $ 1 * SIZE, AA + addl $ 4 * SIZE, BB + decl %eax + jg .L116 + ALIGN_4 + +.L118: + addss %xmm5, %xmm4 + addss %xmm7, %xmm6 + addss %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(B), %xmm1 + subss %xmm4, %xmm1 +#else + movss 0 * SIZE(AA), %xmm0 + subss %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + mulss 0 * SIZE(AA), %xmm1 +#endif + +#if defined(RN) || defined(RT) + mulss 0 * SIZE(B), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + movaps %xmm0, 0 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) +#else + movss %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L100: + testl $2, M + je .L110 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L105 + ALIGN_4 + +.L102: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L106 + ALIGN_4 + +.L108: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + pshufd $1, %xmm4, %xmm6 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm3 + + subss %xmm4, %xmm1 + subss %xmm6, %xmm3 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movaps 0 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm3, 1 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + movaps %xmm0, 0 * SIZE(BB) + pshufd $0x00, %xmm3, %xmm0 + movaps %xmm0, 4 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) + movss %xmm3, 1 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L110: + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L119 + ALIGN_4 + +.L91: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + PREFETCHW -4 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + mulps 8 * SIZE(BB), %xmm0 + addps %xmm0, %xmm6 + movaps 12 * SIZE(AA), %xmm0 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 48 * SIZE(BB), %xmm3 + mulps 20 * SIZE(BB), %xmm1 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + mulps 24 * SIZE(BB), %xmm1 + addps %xmm1, %xmm6 + movaps 28 * SIZE(AA), %xmm1 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(AA), %xmm0 + movaps 4 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ BASE_SHIFT, %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm3 + movss 2 * SIZE(B), %xmm5 + movss 3 * SIZE(B), %xmm7 + + subss %xmm4, %xmm1 + subss %xmm6, %xmm3 + subss %xmm0, %xmm5 + subss %xmm2, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm1 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm1 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm1 + + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm7 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm7 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm7 + + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm7 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm3, 1 * SIZE(B) + movss %xmm5, 2 * SIZE(B) + movss %xmm7, 3 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + movaps %xmm0, 0 * SIZE(BB) + pshufd $0x00, %xmm3, %xmm0 + movaps %xmm0, 4 * SIZE(BB) + + pshufd $0x00, %xmm5, %xmm0 + movaps %xmm0, 8 * SIZE(BB) + pshufd $0x00, %xmm7, %xmm0 + movaps %xmm0, 12 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) + movss %xmm1, 1 * SIZE(AA) + movss %xmm2, 2 * SIZE(AA) + movss %xmm3, 3 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm1 + unpcklps %xmm7, %xmm3 + + unpcklps %xmm3, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L91 + ALIGN_4 + +.L119: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (B, %eax, SIZE), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + + +.L999: + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LN_8x2_sse.S b/kernel/x86/trsm_kernel_LN_8x2_sse.S new file mode 100644 index 0000000..16a2c2f --- /dev/null +++ b/kernel/x86/trsm_kernel_LN_8x2_sse.S @@ -0,0 +1,3605 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_A 20 + STACK + ARGS(%esi) +#define STACK_B 24 + STACK + ARGS(%esi) +#define STACK_C 28 + STACK + ARGS(%esi) +#define STACK_LDC 32 + STACK + ARGS(%esi) +#define STACK_OFFT 36 + STACK + ARGS(%esi) + +#define TRMASK 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#ifdef HAVE_3DNOW +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 10 + 8) +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#if !defined(HAVE_SSE2) || defined(OPTERON) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE, %esp + andl $-STACK_ALIGN, %esp + + STACK_TOUCHING + + movss STACK_M, %xmm0 + movl STACK_N, %eax + movss STACK_K, %xmm1 + movss STACK_A, %xmm2 + movl STACK_B, B + movss STACK_C, %xmm3 + movl STACK_LDC, LDC + movss STACK_OFFT, %xmm4 + + movss %xmm1, K + movl %eax, N + movss %xmm0, M + movss %xmm2, A + movss %xmm3, C + movl %esi, OLD_STACK + movss %xmm4, OFFSET + movss %xmm4, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LN) || defined(LT) + movl $0x3f800000, 0 + TRMASK # 1.0 + movl $0x00000000, 4 + TRMASK # 0.0 + movl $0x3f800000, 8 + TRMASK # 1.0 + movl $0x00000000, 12 + TRMASK # 0.0 +#endif + + movl N, %eax + sarl $1, %eax # j = (n >> 1) + movl %eax, J + jle .L100 + ALIGN_2 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $1 + BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + ALIGN_4 + +.L02: + movsd 0 * SIZE(B), %xmm3 + movhps 2 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm7 + movhps 6 * SIZE(B), %xmm7 + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm3, %xmm2 + shufps $0xaa, %xmm2, %xmm2 + shufps $0xff, %xmm3, %xmm3 + + movaps %xmm7, %xmm4 + shufps $0x00, %xmm4, %xmm4 + movaps %xmm7, %xmm5 + shufps $0x55, %xmm5, %xmm5 + movaps %xmm7, %xmm6 + shufps $0xaa, %xmm6, %xmm6 + shufps $0xff, %xmm7, %xmm7 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + decl %eax + BRANCH + jne .L02 + ALIGN_2 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + ALIGN_2 + +.L04: + movsd 0 * SIZE(B), %xmm3 + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + addl $2 * SIZE, B + addl $8 * SIZE, BB + + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + testl $1, M + jle .L30 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movss 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movss 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L72 + ALIGN_2 + +.L71: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 20 * SIZE(BB), %xmm0 + addss %xmm3, %xmm4 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm5 + movss 3 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + mulss 36 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 40 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 5 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm2 + mulss 44 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 52 * SIZE(BB), %xmm1 + addss %xmm3, %xmm4 + movss 56 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 7 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 60 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L71 + ALIGN_2 + +.L72: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L74 + +.L73: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L73 + ALIGN_4 + +.L74: + addss %xmm6, %xmm4 + addss %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm4 + +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 +#else + movss 0 * SIZE(AA), %xmm0 + movss 1 * SIZE(AA), %xmm2 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + mulss %xmm6, %xmm0 + + movss 1 * SIZE(B), %xmm6 + movaps %xmm6, %xmm5 + + mulss %xmm0, %xmm5 + subss %xmm5, %xmm2 + + movss 3 * SIZE(B), %xmm6 + mulss %xmm6, %xmm2 +#endif + +#ifdef RT + movss 3 * SIZE(B), %xmm6 + mulss %xmm6, %xmm2 + + movss 2 * SIZE(B), %xmm6 + movaps %xmm6, %xmm5 + + mulss %xmm2, %xmm5 + subss %xmm5, %xmm0 + + movss 0 * SIZE(B), %xmm6 + mulss %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) + movss %xmm2, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, %xmm0 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm3, %xmm0 + + movss %xmm2, 0 * SIZE(CO1) + movss %xmm0, 0 * SIZE(CO1, LDC) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm2, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L30: + testl $2, M + jle .L50 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $1 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L52 + ALIGN_2 + +.L51: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L51 + ALIGN_2 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L53 + ALIGN_4 + +.L54: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm4 + + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 2 * SIZE(AA), %xmm2 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 3 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 3 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + + movss 1 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm0, %xmm5 + subps %xmm5, %xmm2 + + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 +#endif + +#ifdef RT + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 + + movss 2 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm2, %xmm5 + + subps %xmm5, %xmm0 + + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) + movlps %xmm2, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, %xmm0 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm3, %xmm0 + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm0, 0 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L50: + testl $4, M + jle .L70 + +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $2 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L32 + ALIGN_2 + +.L31: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 8 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 36 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 40 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 20 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm2 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L31 + ALIGN_2 + +.L32: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L34 + +.L33: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L33 + ALIGN_4 + +.L34: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm0 + + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + movsd 4 * SIZE(B), %xmm3 + movhps 6 * SIZE(B), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm0, %xmm3 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm2 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 15 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 14 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 12 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 10 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 8 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 5 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 4 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movsd 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 5 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 6 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 10 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 11 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 15 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + + movss 1 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm0, %xmm5 + subps %xmm5, %xmm2 + + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 +#endif + +#ifdef RT + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 + + movss 2 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm2, %xmm5 + + subps %xmm5, %xmm0 + + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + movlps %xmm3, 4 * SIZE(B) + movhps %xmm3, 6 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm6 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm3, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm3, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm2, 4 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, %xmm0 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm3, %xmm0 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm0, 0 * SIZE(CO1, LDC) + movhps %xmm0, 2 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm2, 2 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L70: + movl M, %ebx + sarl $3, %ebx + jle .L99 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $3 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $3 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + + PREFETCHW 7 * SIZE(CO1) + PREFETCHW 7 * SIZE(CO1, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L12 + ALIGN_2 + +.L11: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 16 * SIZE(AA), %xmm0 + + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 8 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 16 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 36 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 48 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 48 * SIZE(AA), %xmm0 + + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 40 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 44 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 56 * SIZE(AA), %xmm1 + + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 48 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 52 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 64 * SIZE(AA), %xmm0 + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 60 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 72 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 72 * SIZE(AA), %xmm1 + + addl $64 * SIZE, BB + addl $64 * SIZE, AA + decl %eax + jne .L11 + ALIGN_2 + +.L12: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + movaps 4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm1 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm1, %xmm5 + movaps 4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm1 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm1, %xmm7 + + addl $8 * SIZE, AA + addl $8 * SIZE, BB + subl $1, %eax + jg .L13 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $8, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 8), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm0 + + movaps %xmm6, %xmm1 + unpcklps %xmm7, %xmm6 + unpckhps %xmm7, %xmm1 + + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + movsd 4 * SIZE(B), %xmm3 + movhps 6 * SIZE(B), %xmm3 + movsd 8 * SIZE(B), %xmm5 + movhps 10 * SIZE(B), %xmm5 + movsd 12 * SIZE(B), %xmm7 + movhps 14 * SIZE(B), %xmm7 + + subps %xmm4, %xmm2 + subps %xmm0, %xmm3 + subps %xmm6, %xmm5 + subps %xmm1, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm1 + movaps 8 * SIZE(AA), %xmm2 + movaps 12 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm6, %xmm1 + subps %xmm5, %xmm2 + subps %xmm7, %xmm3 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 63 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 62 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movsd 60 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 58 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 56 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 54 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 52 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 50 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 48 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + + movss 45 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 44 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 42 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 40 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 36 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 34 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 32 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 27 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 26 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 24 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 18 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 16 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 9 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 8 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movsd 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 4 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 6 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 9 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 10 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 12 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 14 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 18 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 19 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 20 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 22 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 27 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 28 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 30 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 36 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 37 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 38 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 45 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 46 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 54 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 55 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 63 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm7 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + mulps %xmm6, %xmm1 + + movss 1 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm0, %xmm5 + mulps %xmm1, %xmm6 + + subps %xmm5, %xmm2 + subps %xmm6, %xmm3 + + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 + mulps %xmm6, %xmm3 +#endif + +#ifdef RT + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 + mulps %xmm6, %xmm3 + + movss 2 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm2, %xmm5 + mulps %xmm3, %xmm6 + + subps %xmm5, %xmm0 + subps %xmm6, %xmm1 + + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + mulps %xmm6, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + movlps %xmm3, 4 * SIZE(B) + movhps %xmm3, 6 * SIZE(B) + movlps %xmm5, 8 * SIZE(B) + movhps %xmm5, 10 * SIZE(B) + movlps %xmm7, 12 * SIZE(B) + movhps %xmm7, 14 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm6 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm3, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm3, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 +#else + movaps %xmm5, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm5, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm5, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm5, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + movaps %xmm0, 32 * SIZE(BB) + movaps %xmm1, 36 * SIZE(BB) + movaps %xmm4, 40 * SIZE(BB) + movaps %xmm6, 44 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm1 + pshufd $0xaa, %xmm7, %xmm4 + pshufd $0xff, %xmm7, %xmm6 +#else + movaps %xmm7, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm7, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm7, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm7, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + movaps %xmm0, 48 * SIZE(BB) + movaps %xmm1, 52 * SIZE(BB) + movaps %xmm4, 56 * SIZE(BB) + movaps %xmm6, 60 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm1, 4 * SIZE(AA) + movaps %xmm2, 8 * SIZE(AA) + movaps %xmm3, 12 * SIZE(AA) +#endif + +#ifdef LN + subl $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, %xmm0 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm3, %xmm0 + + movaps %xmm5, %xmm4 + shufps $0x88, %xmm7, %xmm5 + shufps $0xdd, %xmm7, %xmm4 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm5, 4 * SIZE(CO1) + movhps %xmm5, 6 * SIZE(CO1) + movlps %xmm0, 0 * SIZE(CO1, LDC) + movhps %xmm0, 2 * SIZE(CO1, LDC) + movlps %xmm4, 4 * SIZE(CO1, LDC) + movhps %xmm4, 6 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 4 * SIZE(CO1) + movhps %xmm1, 6 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm2, 2 * SIZE(CO1, LDC) + movlps %xmm3, 4 * SIZE(CO1, LDC) + movhps %xmm3, 6 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 8), AA +#ifdef LT + addl $16 * SIZE, B +#endif +#endif + +#ifdef LN + subl $8, KK + movl BORIG, B +#endif + +#ifdef LT + addl $8, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $3 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L99: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_2 + +.L100: + testl $1, N + jle .L999 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L103 + ALIGN_4 + +.L102: + movsd 0 * SIZE(B), %xmm3 + movhps 2 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm7 + movhps 6 * SIZE(B), %xmm7 + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm3, %xmm2 + shufps $0xaa, %xmm2, %xmm2 + shufps $0xff, %xmm3, %xmm3 + + movaps %xmm7, %xmm4 + shufps $0x00, %xmm4, %xmm4 + movaps %xmm7, %xmm5 + shufps $0x55, %xmm5, %xmm5 + movaps %xmm7, %xmm6 + shufps $0xaa, %xmm6, %xmm6 + shufps $0xff, %xmm7, %xmm7 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + + decl %eax + BRANCH + jne .L102 + ALIGN_2 + +.L103: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L105 + ALIGN_2 + +.L104: + movss 0 * SIZE(B), %xmm0 + + shufps $0x00, %xmm0, %xmm0 + + movaps %xmm0, 0 * SIZE(BB) + + addl $1 * SIZE, B + addl $4 * SIZE, BB + + decl %eax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + + testl $1, M + jle .L130 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movss 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movss 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L172 + ALIGN_2 + +.L171: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + mulss 4 * SIZE(BB), %xmm0 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 2 * SIZE(AA), %xmm0 + mulss 8 * SIZE(BB), %xmm0 + addss %xmm0, %xmm6 + movss 3 * SIZE(AA), %xmm0 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + mulss 20 * SIZE(BB), %xmm1 + movss 48 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 6 * SIZE(AA), %xmm1 + mulss 24 * SIZE(BB), %xmm1 + addss %xmm1, %xmm6 + movss 7 * SIZE(AA), %xmm1 + mulss 28 * SIZE(BB), %xmm1 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L171 + ALIGN_2 + +.L172: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L174 + +.L173: + movss 0 * SIZE(AA), %xmm0 + movss 0 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L173 + ALIGN_4 + +.L174: + addss %xmm5, %xmm4 + addss %xmm7, %xmm6 + addss %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(B), %xmm1 + subss %xmm4, %xmm1 +#else + movss 0 * SIZE(AA), %xmm0 + subss %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + mulss 0 * SIZE(AA), %xmm1 +#endif + +#if defined(RN) || defined(RT) + mulss 0 * SIZE(B), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + + shufps $0x00, %xmm1, %xmm1 + movaps %xmm1, 0 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) +#else + movss %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L130: + testl $2, M + jle .L150 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $1 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L152 + ALIGN_2 + +.L151: + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L151 + ALIGN_2 + +.L152: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L154 + +.L153: + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L153 + ALIGN_4 + +.L154: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + shufps $1, %xmm5, %xmm5 + + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm1 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movaps 0 * SIZE(AA), %xmm4 + + movaps %xmm4, %xmm6 + shufps $0xff, %xmm6, %xmm6 + mulss %xmm6, %xmm1 + + movaps %xmm4, %xmm6 + shufps $0xaa, %xmm6, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm0 + mulss %xmm4, %xmm0 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + mulss %xmm4, %xmm0 + movaps %xmm4, %xmm6 + shufps $0x55, %xmm6, %xmm6 + mulss %xmm0, %xmm6 + subss %xmm6, %xmm1 + movaps %xmm4, %xmm6 + shufps $0xff, %xmm6, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + mulps %xmm6, %xmm0 +#endif + +#ifdef RT + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + mulps %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm0, 0 * SIZE(B) + movss %xmm1, 1 * SIZE(B) + + shufps $0x00, %xmm0, %xmm0 + shufps $0x00, %xmm1, %xmm1 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 1 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L150: + testl $4, M + jle .L170 + +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $2 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + movhps 2 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movsd 16 * SIZE(AA), %xmm1 + movhps 18 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L132 + ALIGN_2 + +.L131: + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + mulps 4 * SIZE(BB), %xmm0 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + mulps 8 * SIZE(BB), %xmm0 + addps %xmm0, %xmm6 + movaps 12 * SIZE(AA), %xmm0 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + mulps 20 * SIZE(BB), %xmm1 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + mulps 24 * SIZE(BB), %xmm1 + addps %xmm1, %xmm6 + movaps 28 * SIZE(AA), %xmm1 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L131 + ALIGN_2 + +.L132: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L134 + +.L133: + movaps 0 * SIZE(BB), %xmm2 + movaps 0 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L133 + ALIGN_4 + +.L134: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 + + xorps %xmm5, %xmm5 + + movaps %xmm2, %xmm3 + unpcklps %xmm5, %xmm2 + unpckhps %xmm5, %xmm3 +#else + movaps 0 * SIZE(AA), %xmm0 + subps %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 15 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 14 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 12 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 10 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 8 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 5 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 4 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movsd 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 5 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 6 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 10 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 11 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 15 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + mulps %xmm6, %xmm0 +#endif + +#ifdef RT + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + mulps %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) + shufps $0x88, %xmm3, %xmm2 + + movlps %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L170: + movl M, %ebx + sarl $3, %ebx # i = (m >> 2) + jle .L179 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $3 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $3 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + + PREFETCHW 7 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + ALIGN_2 + +.L111: + mulps %xmm2, %xmm0 + mulps 4 * SIZE(AA), %xmm2 + addps %xmm0, %xmm4 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm0 + mulps 12 * SIZE(AA), %xmm2 + addps %xmm0, %xmm5 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm1 + mulps 20 * SIZE(AA), %xmm2 + addps %xmm1, %xmm4 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm1 + mulps 28 * SIZE(AA), %xmm2 + addps %xmm1, %xmm5 + movaps 48 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm3, %xmm0 + mulps 36 * SIZE(AA), %xmm3 + addps %xmm0, %xmm4 + movaps 40 * SIZE(AA), %xmm0 + addps %xmm3, %xmm6 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm0 + mulps 44 * SIZE(AA), %xmm3 + addps %xmm0, %xmm5 + movaps 64 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm1 + mulps 52 * SIZE(AA), %xmm3 + addps %xmm1, %xmm4 + movaps 56 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm1 + mulps 60 * SIZE(AA), %xmm3 + addps %xmm1, %xmm5 + movaps 80 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + addl $64 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L111 + ALIGN_2 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + +.L113: + movaps 0 * SIZE(BB), %xmm2 + movaps 0 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm0 + addps %xmm0, %xmm4 + mulps 4 * SIZE(AA), %xmm2 + addps %xmm2, %xmm6 + + addl $8 * SIZE, AA + addl $4 * SIZE, BB + subl $1, %eax + jg .L113 + ALIGN_4 + +.L114: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $8, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 8), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + movsd 4 * SIZE(B), %xmm5 + movhps 6 * SIZE(B), %xmm5 + + subps %xmm4, %xmm2 + subps %xmm6, %xmm5 + + xorps %xmm0, %xmm0 + + movaps %xmm2, %xmm3 + unpcklps %xmm0, %xmm2 + unpckhps %xmm0, %xmm3 + + movaps %xmm5, %xmm7 + unpcklps %xmm0, %xmm5 + unpckhps %xmm0, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm6, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 63 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 62 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movsd 60 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 58 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 56 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 54 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 52 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 50 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 48 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + + movss 45 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 44 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 42 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 40 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 36 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 34 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 32 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 27 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 26 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 24 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 18 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 16 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 9 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 8 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movsd 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 4 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 6 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 9 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 10 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 12 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 14 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 18 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 19 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 20 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 22 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 27 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 28 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 30 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 36 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 37 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 38 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 45 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 46 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 54 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 55 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 63 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm7 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + mulps %xmm6, %xmm1 +#endif + +#if defined(LN) || defined(LT) + shufps $0x88, %xmm3, %xmm2 + shufps $0x88, %xmm7, %xmm5 + + movlps %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + movlps %xmm5, 4 * SIZE(B) + movhps %xmm5, 6 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 +#else + movaps %xmm5, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm5, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm5, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm5, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm1, 4 * SIZE(AA) +#endif + +#ifdef LN + subl $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm5, 4 * SIZE(CO1) + movhps %xmm5, 6 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 4 * SIZE(CO1) + movhps %xmm1, 6 * SIZE(CO1) +#endif + +#ifndef LN + addl $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 8), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $8, KK + movl BORIG, B +#endif + +#ifdef LT + addl $8, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $3 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L110 + ALIGN_2 + +.L179: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (B, %eax, SIZE), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + movl OLD_STACK, %esp + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LT_1x4.S b/kernel/x86/trsm_kernel_LT_1x4.S new file mode 100644 index 0000000..5670746 --- /dev/null +++ b/kernel/x86/trsm_kernel_LT_1x4.S @@ -0,0 +1,1251 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 32 + +#define J 0 + STACK(%esp) +#define I 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) +#define AORIG 16 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define STACK_A 24 + STACK + ARGS(%esp) +#define STACK_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define STACK_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) +#else +#define STACK_A 20 + STACK + ARGS(%esp) +#define STACK_B 24 + STACK + ARGS(%esp) +#define C 28 + STACK + ARGS(%esp) +#define STACK_LDC 32 + STACK + ARGS(%esp) +#define OFFSET 36 + STACK + ARGS(%esp) +#endif + +#define A %edx +#define B %ecx +#define B_ORIG %ebx +#define LDC %ebp + +#define PREFETCHSIZE (5 + 8 * 10) + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_B, B_ORIG + movl STACK_LDC, LDC + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, STACK_A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B_ORIG + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + subl $-16 * SIZE, B_ORIG + subl $-16 * SIZE, STACK_A + + movl M, %eax + testl %eax, %eax + jle .L999 + + movl N, %eax + testl %eax, %eax + jle .L999 + + movl K, %eax + testl %eax, %eax + jle .L999 + + movl N, %eax + sarl $2, %eax + movl %eax, J + je .L20 + ALIGN_3 + +.L11: +#if defined(LT) || defined(RN) + movl STACK_A, A +#else + movl STACK_A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B_ORIG +#endif + + leal (, LDC, 4), %eax +#ifdef RT + subl %eax, C +#endif + movl C, %edi +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl B_ORIG, B + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $4, %eax + jle .L13 + ALIGN_4 + +.L12: + movl -16 * SIZE(B), %esi + movl -8 * SIZE(B), %esi + movl 0 * SIZE(B), %esi + movl 8 * SIZE(B), %esi + movl 16 * SIZE(B), %esi + movl 24 * SIZE(B), %esi + movl 32 * SIZE(B), %esi + movl 40 * SIZE(B), %esi + subl $-64 * SIZE, B + decl %eax + jne .L12 + ALIGN_3 + +.L13: + movl M, %esi + movl %esi, I + ALIGN_3 + +.L14: +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, A + leal (A , %eax, 1), A + leal (B_ORIG, %eax, 4), B +#else + movl B_ORIG, B +#endif + + leal (%edi, LDC, 2), %eax + + fldz + fldz + fldz + fldz + + FLD -8 * SIZE(A) + FLD -16 * SIZE(A) + FLD -16 * SIZE(B) + + movl $32 * SIZE, %esi + + prefetchw 1 * SIZE(%edi) + prefetchw 1 * SIZE(%edi, LDC) + prefetchw 1 * SIZE(%eax) + prefetchw 1 * SIZE(%eax, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L16 + ALIGN_3 + +.L15: + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD -15 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD -14 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL -13 * SIZE(B) + + faddp %st, %st(5) + FLD -15 * SIZE(A) + FLD -12 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD -11 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD -10 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL -9 * SIZE(B) + + faddp %st, %st(5) + FLD -14 * SIZE(A) + FLD -8 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD -7 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD -6 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL -5 * SIZE(B) + + faddp %st, %st(5) + FLD -13 * SIZE(A) + FLD -4 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD -3 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD -2 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL -1 * SIZE(B) + + faddp %st, %st(5) + FLD -12 * SIZE(A) + FLD 0 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD 1 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD 2 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL 3 * SIZE(B) + + faddp %st, %st(5) + FLD -11 * SIZE(A) + FLD 4 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD 5 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD 6 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL 7 * SIZE(B) + + faddp %st, %st(5) + FLD -10 * SIZE(A) + FLD 8 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD 9 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD 10 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL 11 * SIZE(B) + + faddp %st, %st(5) + FLD -9 * SIZE(A) + FLD 12 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD 13 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD 14 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL 15 * SIZE(B) + + faddp %st, %st(5) + FLD 0 * SIZE(A) + + PADDING prefetch PREFETCHSIZE * SIZE(A) + + addl $8 * SIZE, A + fxch %st(1) + addl $32 * SIZE, B + + FLD -16 * SIZE(B) + decl %eax + jne .L15 + ALIGN_4 + +.L16: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $7, %eax + je .L19 + ALIGN_4 + +.L17: + fmul %st(1), %st + faddp %st, %st(3) + + FLD -15 * SIZE(B) + fmul %st(1), %st + faddp %st, %st(4) + + FLD -14 * SIZE(B) + fmul %st(1), %st + faddp %st, %st(5) + + FMUL -13 * SIZE(B) + faddp %st, %st(5) + FLD -15 * SIZE(A) + FLD -12 * SIZE(B) + + addl $1 * SIZE,A + addl $4 * SIZE,B + + decl %eax + jne .L17 + ALIGN_4 + +.L19: + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, A + leal (A, %eax, 1), A + leal (B_ORIG, %eax, 4), B +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE - 16 * SIZE(B) + fsubp %st, %st(1) + FLD 1 * SIZE - 16 * SIZE(B) + fsubp %st, %st(2) + FLD 2 * SIZE - 16 * SIZE(B) + fsubp %st, %st(3) + FLD 3 * SIZE - 16 * SIZE(B) + fsubp %st, %st(4) +#else + FLD 0 * SIZE - 16 * SIZE(A) + fsubp %st, %st(1) + FLD 1 * SIZE - 16 * SIZE(A) + fsubp %st, %st(2) + FLD 2 * SIZE - 16 * SIZE(A) + fsubp %st, %st(3) + FLD 3 * SIZE - 16 * SIZE(A) + fsubp %st, %st(4) +#endif + +#ifdef LN + FLD 0 * SIZE - 16 * SIZE(A) + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fmulp %st, %st(4) +#endif + +#ifdef LT + FLD 0 * SIZE - 16 * SIZE(A) + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fmulp %st, %st(4) +#endif + +#ifdef RN + FMUL 0 * SIZE - 16 * SIZE(B) + + FLD 1 * SIZE - 16 * SIZE(B) + fmul %st(1), %st + fsubrp %st, %st(2) + FLD 2 * SIZE - 16 * SIZE(B) + fmul %st(1), %st + fsubrp %st, %st(3) + FLD 3 * SIZE - 16 * SIZE(B) + fmul %st(1), %st + fsubrp %st, %st(4) + + FLD 5 * SIZE - 16 * SIZE(B) + fmulp %st, %st(2) + FLD 6 * SIZE - 16 * SIZE(B) + fmul %st(2), %st + fsubrp %st, %st(3) + FLD 7 * SIZE - 16 * SIZE(B) + fmul %st(2), %st + fsubrp %st, %st(4) + + FLD 10 * SIZE - 16 * SIZE(B) + fmulp %st, %st(3) + FLD 11 * SIZE - 16 * SIZE(B) + fmul %st(3), %st + fsubrp %st, %st(4) + + FLD 15 * SIZE - 16 * SIZE(B) + fmulp %st, %st(4) +#endif + +#ifdef RT + FLD 15 * SIZE - 16 * SIZE(B) + fmulp %st, %st(4) + + FLD 14 * SIZE - 16 * SIZE(B) + fmul %st(4), %st + fsubrp %st, %st(3) + FLD 13 * SIZE - 16 * SIZE(B) + fmul %st(4), %st + fsubrp %st, %st(2) + FLD 12 * SIZE - 16 * SIZE(B) + fmul %st(4), %st + fsubrp %st, %st(1) + + FLD 10 * SIZE - 16 * SIZE(B) + fmulp %st, %st(3) + FLD 9 * SIZE - 16 * SIZE(B) + fmul %st(3), %st + fsubrp %st, %st(2) + FLD 8 * SIZE - 16 * SIZE(B) + fmul %st(3), %st + fsubrp %st, %st(1) + + FLD 5 * SIZE - 16 * SIZE(B) + fmulp %st, %st(2) + FLD 4 * SIZE - 16 * SIZE(B) + fmul %st(2), %st + fsubrp %st, %st(1) + + FLD 0 * SIZE - 16 * SIZE(B) + fmulp %st, %st(1) +#endif + +#ifdef LN + subl $1 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE - 16 * SIZE(B) + fxch %st(1) + FSTU 1 * SIZE - 16 * SIZE(B) + fxch %st(2) + FSTU 2 * SIZE - 16 * SIZE(B) + fxch %st(3) + FSTU 3 * SIZE - 16 * SIZE(B) +#else + FSTU 0 * SIZE - 16 * SIZE(A) + fxch %st(1) + FSTU 1 * SIZE - 16 * SIZE(A) + fxch %st(2) + FSTU 2 * SIZE - 16 * SIZE(A) + fxch %st(3) + FSTU 3 * SIZE - 16 * SIZE(A) +#endif + + leal (%edi, LDC, 2), %eax + + FST 0 * SIZE(%eax, LDC) + FST 0 * SIZE(%edi) + FST 0 * SIZE(%edi, LDC) + FST 0 * SIZE(%eax) + +#ifndef LN + addl $1 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (A, %eax, 1), A + leal (B, %eax, 4), B +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl I + jne .L14 + +#ifdef LN + movl K, %eax + leal ( , %eax, SIZE), %eax + leal (B_ORIG, %eax, 4), B_ORIG +#endif +#if defined(LT) || defined(RN) + movl B, B_ORIG +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J + jne .L11 + ALIGN_4 + +.L20: + movl N, %eax + andl $2, %eax + je .L30 + +#if defined(LT) || defined(RN) + movl STACK_A, A +#else + movl STACK_A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B_ORIG +#endif + + leal (, LDC, 2), %eax +#ifdef RT + subl %eax, C +#endif + movl C, %edi +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl B_ORIG, B + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $4, %eax + jle .L23 + ALIGN_4 + +.L22: + movl -16 * SIZE(B), %esi + movl -8 * SIZE(B), %esi + movl 0 * SIZE(B), %esi + movl 8 * SIZE(B), %esi + subl $-32 * SIZE, B + decl %eax + jne .L22 + ALIGN_3 + +.L23: + movl M, %esi + movl %esi, I + ALIGN_3 + +.L24: +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, A + leal (A , %eax, 1), A + leal (B_ORIG, %eax, 2), B +#else + movl B_ORIG, B +#endif + + fldz + fldz + fldz + fldz + + FLD -16 * SIZE(A) + FLD -16 * SIZE(B) + + prefetchw 1 * SIZE(%edi) + prefetchw 1 * SIZE(%edi, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L26 + ALIGN_3 + +.L25: + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -15 * SIZE(B) + faddp %st, %st(2) + + FLD -15 * SIZE(A) + FLD -14 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -13 * SIZE(B) + faddp %st, %st(4) + + FLD -14 * SIZE(A) + FLD -12 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -11 * SIZE(B) + faddp %st, %st(2) + + FLD -13 * SIZE(A) + FLD -10 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -9 * SIZE(B) + faddp %st, %st(4) + + FLD -12 * SIZE(A) + FLD -8 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -7 * SIZE(B) + faddp %st, %st(2) + + FLD -11 * SIZE(A) + FLD -6 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -5 * SIZE(B) + faddp %st, %st(4) + + FLD -10 * SIZE(A) + FLD -4 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -3 * SIZE(B) + faddp %st, %st(2) + + FLD -9 * SIZE(A) + FLD -2 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -1 * SIZE(B) + faddp %st, %st(4) + + FLD -8 * SIZE(A) + FLD 0 * SIZE(B) + + addl $ 8 * SIZE, A + subl $-16 * SIZE, B + + decl %eax + jne .L25 + ALIGN_4 + +.L26: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $7, %eax + je .L29 + ALIGN_4 + +.L27: + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -15 * SIZE(B) + faddp %st, %st(2) + + FLD -15 * SIZE(A) + FLD -14 * SIZE(B) + + addl $1 * SIZE,A + addl $2 * SIZE,B + + decl %eax + jne .L27 + ALIGN_4 + +.L29: + ffreep %st(0) + ffreep %st(0) + + faddp %st, %st(2) + faddp %st, %st(2) + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, A + leal (A, %eax, 1), A + leal (B_ORIG, %eax, 2), B +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE - 16 * SIZE(B) + fsubp %st, %st(1) + FLD 1 * SIZE - 16 * SIZE(B) + fsubp %st, %st(2) +#else + FLD 0 * SIZE - 16 * SIZE(A) + fsubp %st, %st(1) + FLD 1 * SIZE - 16 * SIZE(A) + fsubp %st, %st(2) +#endif + +#ifdef LN + FLD 0 * SIZE - 16 * SIZE(A) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LT + FLD 0 * SIZE - 16 * SIZE(A) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RN + FMUL 0 * SIZE - 16 * SIZE(B) + + FLD 1 * SIZE - 16 * SIZE(B) + fmul %st(1), %st + fsubrp %st, %st(2) + + FLD 3 * SIZE - 16 * SIZE(B) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD 3 * SIZE - 16 * SIZE(B) + fmulp %st, %st(2) + FLD 2 * SIZE - 16 * SIZE(B) + fmul %st(2), %st + fsubrp %st, %st(1) + + FLD 0 * SIZE - 16 * SIZE(B) + fmulp %st, %st(1) +#endif + +#ifdef LN + subl $1 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE - 16 * SIZE(B) + fxch %st(1) + FSTU 1 * SIZE - 16 * SIZE(B) +#else + FSTU 0 * SIZE - 16 * SIZE(A) + fxch %st(1) + FSTU 1 * SIZE - 16 * SIZE(A) +#endif + + FST 0 * SIZE(%edi, LDC) + FST 0 * SIZE(%edi) + +#ifndef LN + addl $1 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (A, %eax, 1), A + leal (B, %eax, 2), B +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + decl I + jne .L24 + +#ifdef LN + movl K, %eax + leal ( , %eax, SIZE), %eax + leal (B_ORIG, %eax, 2), B_ORIG +#endif +#if defined(LT) || defined(RN) + movl B, B_ORIG +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L30: + movl N, %eax + andl $1, %eax + je .L999 + ALIGN_3 + +.L31: +#if defined(LT) || defined(RN) + movl STACK_A, A +#else + movl STACK_A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, B_ORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, %edi +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl B_ORIG, B + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $5, %eax + jle .L33 + ALIGN_4 + +.L32: + movl -16 * SIZE(B), %esi + movl -8 * SIZE(B), %esi + movl 0 * SIZE(B), %esi + movl 8 * SIZE(B), %esi + subl $-32 * SIZE, B + decl %eax + jne .L32 + ALIGN_3 + +.L33: + movl M, %esi + movl %esi, I + ALIGN_3 + +.L34: +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, A + leal (A , %eax, 1), A + leal (B_ORIG, %eax, 1), B +#else + movl B_ORIG, B +#endif + + fldz + fldz + fldz + fldz + + prefetchw 1 * SIZE(%edi) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L36 + ALIGN_3 + +.L35: + FLD -16 * SIZE(A) + FMUL -16 * SIZE(B) + faddp %st, %st(1) + + FLD -15 * SIZE(A) + FMUL -15 * SIZE(B) + faddp %st, %st(2) + + FLD -14 * SIZE(A) + FMUL -14 * SIZE(B) + faddp %st, %st(3) + + FLD -13 * SIZE(A) + FMUL -13 * SIZE(B) + faddp %st, %st(4) + + FLD -12 * SIZE(A) + FMUL -12 * SIZE(B) + faddp %st, %st(1) + + FLD -11 * SIZE(A) + FMUL -11 * SIZE(B) + faddp %st, %st(2) + + FLD -10 * SIZE(A) + FMUL -10 * SIZE(B) + faddp %st, %st(3) + + FLD -9 * SIZE(A) + FMUL -9 * SIZE(B) + faddp %st, %st(4) + + addl $8 * SIZE, A + addl $8 * SIZE, B + + decl %eax + jne .L35 + ALIGN_4 + +.L36: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $7, %eax + je .L39 + ALIGN_4 + +.L37: + FLD -16 * SIZE(A) + FMUL -16 * SIZE(B) + faddp %st, %st(1) + + addl $1 * SIZE,A + addl $1 * SIZE,B + decl %eax + jne .L37 + ALIGN_4 + +.L39: + faddp %st, %st(2) + faddp %st, %st(2) + faddp %st, %st(1) + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, A + leal (A, %eax, SIZE), A + leal (B_ORIG, %eax, SIZE), B +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE - 16 * SIZE(B) + fsubp %st, %st(1) +#else + FLD 0 * SIZE - 16 * SIZE(A) + fsubp %st, %st(1) +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE - 16 * SIZE(A) + fmulp %st, %st(1) +#endif + +#if defined(RN) || defined(RT) + FMUL 0 * SIZE - 16 * SIZE(B) +#endif + +#ifdef LN + subl $1 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE - 16 * SIZE(B) +#else + FSTU 0 * SIZE - 16 * SIZE(A) +#endif + + FST 0 * SIZE(%edi) + +#ifndef LN + addl $1 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (A, %eax, SIZE), A + leal (B, %eax, SIZE), B +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + decl I + jne .L34 + +#ifdef LN + movl K, %eax + leal ( , %eax, SIZE), %eax + leal (B_ORIG, %eax, 1), B_ORIG +#endif +#if defined(LT) || defined(RN) + movl B, B_ORIG +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LT_2x2.S b/kernel/x86/trsm_kernel_LT_2x2.S new file mode 100644 index 0000000..d21909d --- /dev/null +++ b/kernel/x86/trsm_kernel_LT_2x2.S @@ -0,0 +1,1104 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define A 24 + STACK + ARGS(%esp) +#define B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) +#else +#define A 20 + STACK + ARGS(%esp) +#define B 24 + STACK + ARGS(%esp) +#define C 28 + STACK + ARGS(%esp) +#define LDC 32 + STACK + ARGS(%esp) +#define OFFSET 36 + STACK + ARGS(%esp) +#endif + +#define PREFETCH_OFFSET 48 + +#if defined(PENTIUM3) || defined(PENTIUMM) +#define REP rep +#else +#define REP rep +#endif + +#define AA %edx +#define BB %ecx + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl LDC, %ebp # ldc # MEMORY + movl B, %ebx + leal (, %ebp, SIZE), %ebp + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, %ebx + + movl N, %eax + imull %ebp, %eax + addl %eax, C +#endif + +#ifdef RN + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax # j = (n >> 1) # MEMORY + sarl $1, %eax + movl %eax, J # j = (n >> 1) # MEMORY + je .L8 + ALIGN_4 + +.L34: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, %ebx +#endif + lea (, %ebp, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, %edi +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %esi + sarl $1, %esi + je .L12 + ALIGN_4 + +.MainHead: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, AA + leal (AA, %eax, 2), AA + leal (%ebx, %eax, 2), BB +#else + movl %ebx, BB +#endif + + fldz + fldz + fldz + fldz + + FLD 4 * SIZE(BB) # b5 + FLD 4 * SIZE(AA) # a5 + FLD 0 * SIZE(BB) # b1 + FLD 0 * SIZE(AA) # a1 + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(%edi) + prefetchw 2 * SIZE(%edi, %ebp, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(%edi) + prefetchnta 2 * SIZE(%edi, %ebp, 1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L16 + ALIGN_4 + +.MainLoop: +#if defined(HAVE_3DNOW) + prefetch (PREFETCH_OFFSET) * SIZE(BB) + nop +#elif defined(HAVE_SSE) + prefetchnta (PREFETCH_OFFSET) * SIZE(BB) +#if (L2_SIZE == 524288) + prefetcht0 (PREFETCH_OFFSET) * SIZE(AA) +#endif +#endif + + fmul %st, %st(1) + FMUL 1 * SIZE(BB) + fxch %st(1) + faddp %st, %st(4) + FLD 0 * SIZE(BB) + fxch %st(1) + faddp %st, %st(5) + FLD 1 * SIZE(AA) + fmul %st, %st(1) + FMUL 1 * SIZE(BB) + fxch %st(1) + faddp %st, %st(6) + FLD 2 * SIZE(BB) + fxch %st(1) + faddp %st, %st(7) + FLD 2 * SIZE(AA) + + fmul %st, %st(1) + FMUL 3 * SIZE(BB) + fxch %st(1) + faddp %st, %st(4) + FLD 2 * SIZE(BB) + fxch %st(1) + faddp %st, %st(5) + FLD 3 * SIZE(AA) + fmul %st, %st(1) + FMUL 3 * SIZE(BB) + fxch %st(1) + faddp %st, %st(6) + FLD 8 * SIZE(BB) + fxch %st(1) + faddp %st, %st(7) + FLD 8 * SIZE(AA) + fxch %st(2) + +#if !defined(HAVE_3DNOW) && defined(HAVE_SSE) && defined(DOUBLE) + prefetchnta (PREFETCH_OFFSET + 4) * SIZE(BB) +#if (L2_SIZE == 524288) + prefetcht0 (PREFETCH_OFFSET + 4) * SIZE(AA) +#endif +#endif + + fmul %st, %st(3) + FMUL 5 * SIZE(BB) + fxch %st(3) + faddp %st, %st(4) + FLD 4 * SIZE(BB) + fxch %st(3) + faddp %st, %st(5) + FLD 5 * SIZE(AA) + fmul %st, %st(3) + FMUL 5 * SIZE(BB) + fxch %st(3) + faddp %st, %st(6) + FLD 6 * SIZE(BB) + fxch %st(3) + faddp %st, %st(7) + FLD 6 * SIZE(AA) + + fmul %st, %st(3) + FMUL 7 * SIZE(BB) + fxch %st(3) + faddp %st, %st(4) + FLD 6 * SIZE(BB) + fxch %st(3) + faddp %st, %st(5) + FLD 7 * SIZE(AA) + fmul %st, %st(3) + FMUL 7 * SIZE(BB) + fxch %st(3) + faddp %st, %st(6) + FLD 12 * SIZE(BB) + fxch %st(3) + faddp %st, %st(7) + FLD 12 * SIZE(AA) + fxch %st(2) + + subl $-8 * SIZE, BB + subl $-8 * SIZE, AA + decl %eax # l -- + jne .MainLoop + ALIGN_4 + +.L16: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L21 + ALIGN_4 + +.SubLoop: + fmul %st, %st(1) + FMUL 1 * SIZE(BB) + fxch %st(1) + faddp %st, %st(4) + FLD 0 * SIZE(BB) + fxch %st(1) + faddp %st, %st(5) + FLD 1 * SIZE(AA) + fmul %st, %st(1) + FMUL 1 * SIZE(BB) + fxch %st(1) + faddp %st, %st(6) + FLD 2 * SIZE(BB) + fxch %st(1) + faddp %st, %st(7) + FLD 2 * SIZE(AA) + + addl $2 * SIZE,BB + addl $2 * SIZE,AA + decl %eax + jne .SubLoop + ALIGN_4 + +.L21: + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, AA + leal (AA, %eax, 2), AA + leal (%ebx, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(BB) + fsubp %st, %st(1) + FLD 1 * SIZE(BB) + fsubp %st, %st(2) + FLD 2 * SIZE(BB) + fsubp %st, %st(3) + FLD 3 * SIZE(BB) + fsubp %st, %st(4) +#else + FLD 0 * SIZE(AA) + fsubp %st, %st(1) + FLD 1 * SIZE(AA) + fsubp %st, %st(3) + FLD 2 * SIZE(AA) + fsubp %st, %st(2) + FLD 3 * SIZE(AA) + fsubp %st, %st(4) +#endif + +#ifdef LN + FLD 3 * SIZE(AA) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD 2 * SIZE(AA) + fmul %st(3), %st + FLD 2 * SIZE(AA) + fmul %st(5), %st + + fsubrp %st, %st(3) + fsubrp %st, %st(1) + + FLD 0 * SIZE(AA) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LT + FLD 0 * SIZE(AA) + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD 1 * SIZE(AA) + fmul %st(1), %st + FLD 1 * SIZE(AA) + fmul %st(3), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(3) + + FLD 3 * SIZE(AA) + fmul %st, %st(3) + fmulp %st, %st(4) +#endif + +#ifdef RN + FLD 0 * SIZE(BB) + fmul %st, %st(1) + fmulp %st, %st(3) + + FLD 1 * SIZE(BB) + fmul %st(1), %st + FLD 1 * SIZE(BB) + fmul %st(4), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(2) + + FLD 3 * SIZE(BB) + fmul %st, %st(2) + fmulp %st, %st(4) +#endif + +#ifdef RT + FLD 3 * SIZE(BB) + fmul %st, %st(2) + fmulp %st, %st(4) + + FLD 2 * SIZE(BB) + fmul %st(2), %st + FLD 2 * SIZE(BB) + fmul %st(5), %st + + fsubrp %st, %st(4) + fsubrp %st, %st(1) + + FLD 0 * SIZE(BB) + fmul %st, %st(1) + fmulp %st, %st(3) +#endif + +#ifdef LN + subl $2 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE(BB) + fxch %st(1) + FSTU 1 * SIZE(BB) + fxch %st(2) + FSTU 2 * SIZE(BB) + fxch %st(3) + FSTU 3 * SIZE(BB) + + FST 1 * SIZE(%edi,%ebp) + FST 0 * SIZE(%edi) + FST 0 * SIZE(%edi,%ebp) + FST 1 * SIZE(%edi) +#else + FSTU 0 * SIZE(AA) + fxch %st(2) + FSTU 1 * SIZE(AA) + fxch %st(1) + FSTU 2 * SIZE(AA) + fxch %st(3) + FSTU 3 * SIZE(AA) + + FST 1 * SIZE(%edi,%ebp) + FST 1 * SIZE(%edi) + FST 0 * SIZE(%edi) + FST 0 * SIZE(%edi,%ebp) +#endif + +#ifndef LN + addl $2 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %esi # i -- + jne .MainHead + ALIGN_4 + +.L12: + movl M, %eax # m # MEMORY + andl $1, %eax + je .L27 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, AA + leal (AA, %eax, 1), AA + leal (%ebx, %eax, 2), BB +#else + movl %ebx, BB +#endif + + fldz + fldz + + FLD 0 * SIZE(AA) # temp1 = *(aoffset + 0) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1,%eax # k >> 1 # MEMORY + je .L54 + ALIGN_4 + +.L55: + FLD 0 * SIZE(BB) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(BB) # temp2 = *(boffset + 0) + faddp %st, %st(2) + FLD 1 * SIZE(AA) # temp1 = *(aoffset + 0) + + FLD 2 * SIZE(BB) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 3 * SIZE(BB) # temp2 = *(boffset + 0) + faddp %st, %st(2) + FLD 2 * SIZE(AA) # temp1 = *(aoffset + 0) + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jne .L55 + ALIGN_4 + +.L54: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1,%eax # k & 1 + je .L33 + ALIGN_4 + + FLD 0 * SIZE(BB) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(BB) # temp2 = *(boffset + 0) + faddp %st, %st(2) + FLD 1 * SIZE(AA) # temp1 = *(aoffset + 0) + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + ALIGN_4 + +.L33: + ffreep %st(0) + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, AA + leal (AA, %eax, 1), AA + leal (%ebx, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(BB) + fsubp %st, %st(1) + FLD 1 * SIZE(BB) + fsubp %st, %st(2) +#else + FLD 0 * SIZE(AA) + fsubp %st, %st(1) + FLD 1 * SIZE(AA) + fsubp %st, %st(2) +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(AA) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD 0 * SIZE(BB) + fmulp %st, %st(1) + + FLD 1 * SIZE(BB) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD 3 * SIZE(BB) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD 3 * SIZE(BB) + fmulp %st, %st(2) + + FLD 2 * SIZE(BB) + fmul %st(2), %st + + fsubrp %st, %st(1) + + FLD 0 * SIZE(BB) + fmulp %st, %st(1) +#endif + +#ifdef LN + subl $1 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE(BB) + fxch %st(1) + FSTU 1 * SIZE(BB) +#else + FSTU 0 * SIZE(AA) + fxch %st(1) + FSTU 1 * SIZE(AA) +#endif + + FST 0 * SIZE(%edi,%ebp) + FST 0 * SIZE(%edi) + +#ifndef LN + addl $1 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L27: +#ifdef LN + movl K, %eax + leal ( , %eax, SIZE), %eax + leal (%ebx, %eax, 2), %ebx +#endif +#if defined(LT) || defined(RN) + movl BB, %ebx +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j-- # MEMORY + jne .L34 + ALIGN_4 + +.L8: + movl N, %eax # n # MEMORY + andl $1, %eax + je .End + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, %ebx +#endif + +#ifdef RT + subl %ebp, C +#endif + movl C, %edi # c # MEMORY +#ifndef RT + addl %ebp, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %esi # m # MEMORY + sarl $1, %esi # m >> 1 + je .L36 + ALIGN_4 + +.L46: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, AA + leal (AA, %eax, 2), AA + leal (%ebx, %eax, 1), BB +#else + movl %ebx, BB +#endif + + fldz + fldz + FLD 0 * SIZE(BB) # temp1 = *(boffset + 0) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1, %eax + je .L56 + ALIGN_4 + +.L57: + FLD 0 * SIZE(AA) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(AA) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 1 * SIZE(BB) # temp1 = *(boffset + 0) + + FLD 2 * SIZE(AA) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 3 * SIZE(AA) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 2 * SIZE(BB) # temp1 = *(boffset + 0) + + addl $4 * SIZE,AA + addl $2 * SIZE,BB + dec %eax + jne .L57 + ALIGN_4 + +.L56: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1, %eax + je .L45 + ALIGN_4 + + FLD 0 * SIZE(AA) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(AA) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 3 * SIZE(BB) # temp1 = *(boffset + 0) + + addl $2 * SIZE,AA + addl $1 * SIZE,BB + ALIGN_4 + +.L45: + ffreep %st(0) + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, AA + leal (AA, %eax, 2), AA + leal (%ebx, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(BB) + fsubp %st, %st(1) + FLD 1 * SIZE(BB) + fsubp %st, %st(2) +#else + FLD 0 * SIZE(AA) + fsubp %st, %st(1) + FLD 1 * SIZE(AA) + fsubp %st, %st(2) +#endif + +#ifdef LN + FLD 3 * SIZE(AA) + fmulp %st, %st(2) + + FLD 2 * SIZE(AA) + fmul %st(2), %st + + fsubrp %st, %st(1) + FLD 0 * SIZE(AA) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD 0 * SIZE(AA) + fmulp %st, %st(1) + + FLD 1 * SIZE(AA) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD 3 * SIZE(AA) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD 0 * SIZE(BB) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD 0 * SIZE(BB) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LN + subl $2 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE(BB) + fxch %st(1) + FSTU 1 * SIZE(BB) +#else + FSTU 0 * SIZE(AA) + fxch %st(1) + FSTU 1 * SIZE(AA) +#endif + + FST 1 * SIZE(%edi) + FST 0 * SIZE(%edi) + +#ifndef LN + addl $2 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %esi # i -- + jne .L46 + ALIGN_4 + +.L36: + movl M, %eax # m # MEMORY + andl $1, %eax # m & 1 + je .L99 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, AA + leal (AA, %eax, 1), AA + leal (%ebx, %eax, 1), BB +#else + movl %ebx, BB +#endif + + fldz + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + test %eax, %eax + jle .L52 + ALIGN_3 + +.L51: + FLD (AA) + FMUL (BB) + addl $1 * SIZE,AA + addl $1 * SIZE,BB + faddp %st,%st(1) + decl %eax + jne .L51 + ALIGN_4 + +.L52: + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, AA + leal (AA, %eax, 1), AA + leal (%ebx, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(BB) + fsubp %st, %st(1) +#else + FLD 0 * SIZE(AA) + fsubp %st, %st(1) +#endif + +#if defined(LN) || defined(LT) + FMUL 0 * SIZE(AA) +#else + FMUL 0 * SIZE(BB) +#endif + +#ifdef LN + subl $1 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE(BB) +#else + FSTU 0 * SIZE(AA) +#endif + + FST 0 * SIZE(%edi) + +#ifndef LN + addl $1 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + leal (%ebx, %eax, SIZE), %ebx +#endif +#if defined(LT) || defined(RN) + movl BB, %ebx +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.End: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LT_2x2_atom.S b/kernel/x86/trsm_kernel_LT_2x2_atom.S new file mode 100644 index 0000000..3835005 --- /dev/null +++ b/kernel/x86/trsm_kernel_LT_2x2_atom.S @@ -0,0 +1,1145 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 84 + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $1, %eax + movl %eax, J + jle .L30 + ALIGN_2 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + xorps %xmm4, %xmm4 + prefetcht0 3 * SIZE(CO1) + xorps %xmm5, %xmm5 + prefetcht0 3 * SIZE(CO1, LDC) + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 0 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 1 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 3 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 3 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 4 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 2 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 3 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 5 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 5 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 6 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 4 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 5 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 7 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 7 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 8 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 6 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 7 * SIZE(BB), %xmm3 + + addl $8 * SIZE, BB + addl $8 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 0 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 1 * SIZE(BB), %xmm3 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + addsd %xmm2, %xmm6 + addsd %xmm3, %xmm7 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + movsd 1 * SIZE(BB), %xmm1 + movsd 2 * SIZE(BB), %xmm2 + movsd 3 * SIZE(BB), %xmm3 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 + subsd %xmm6, %xmm2 + subsd %xmm7, %xmm3 +#else + movsd 0 * SIZE(AA), %xmm0 + movsd 1 * SIZE(AA), %xmm2 + movsd 2 * SIZE(AA), %xmm1 + movsd 3 * SIZE(AA), %xmm3 + + subsd %xmm4, %xmm0 + subsd %xmm6, %xmm2 + subsd %xmm5, %xmm1 + subsd %xmm7, %xmm3 +#endif + +#ifdef LN + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + movsd 2 * SIZE(AA), %xmm5 + mulsd %xmm4, %xmm3 + movsd 0 * SIZE(AA), %xmm7 + + movaps %xmm5, %xmm6 + mulsd %xmm2, %xmm5 + mulsd %xmm3, %xmm6 + subsd %xmm5, %xmm0 + subsd %xmm6, %xmm1 + mulsd %xmm7, %xmm0 + mulsd %xmm7, %xmm1 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(AA), %xmm5 + mulsd %xmm4, %xmm1 + movsd 3 * SIZE(AA), %xmm7 + + movaps %xmm5, %xmm6 + mulsd %xmm0, %xmm5 + mulsd %xmm1, %xmm6 + subsd %xmm5, %xmm2 + subsd %xmm6, %xmm3 + mulsd %xmm7, %xmm2 + mulsd %xmm7, %xmm3 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(BB), %xmm5 + mulsd %xmm4, %xmm2 + movsd 3 * SIZE(BB), %xmm7 + + movaps %xmm5, %xmm6 + mulsd %xmm0, %xmm5 + mulsd %xmm2, %xmm6 + subsd %xmm5, %xmm1 + subsd %xmm6, %xmm3 + mulsd %xmm7, %xmm1 + mulsd %xmm7, %xmm3 +#endif + +#ifdef RT + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd 2 * SIZE(BB), %xmm5 + mulsd %xmm4, %xmm3 + movsd 0 * SIZE(BB), %xmm7 + + movaps %xmm5, %xmm6 + mulsd %xmm1, %xmm5 + mulsd %xmm3, %xmm6 + subsd %xmm5, %xmm0 + subsd %xmm6, %xmm2 + mulsd %xmm7, %xmm0 + mulsd %xmm7, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm1, 1 * SIZE(BB) + movsd %xmm2, 2 * SIZE(BB) + movsd %xmm3, 3 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm2, 1 * SIZE(AA) + movsd %xmm1, 2 * SIZE(AA) + movsd %xmm3, 3 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC) + movsd %xmm3, 1 * SIZE(CO1, LDC) + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L29 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L25 + ALIGN_4 + +.L22: + addsd %xmm2, %xmm4 + movsd 0 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 1 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 2 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 3 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 2 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 5 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 3 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 6 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 7 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + addsd %xmm2, %xmm4 + movsd 0 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 1 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addsd %xmm2, %xmm4 + addsd %xmm3, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + movsd 1 * SIZE(BB), %xmm1 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 +#else + movsd 0 * SIZE(AA), %xmm0 + movsd 1 * SIZE(AA), %xmm1 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AA), %xmm7 + + mulsd %xmm7, %xmm0 + mulsd %xmm7, %xmm1 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(BB), %xmm5 + movaps %xmm5, %xmm6 + movsd 3 * SIZE(BB), %xmm7 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm1 + mulsd %xmm7, %xmm1 +#endif + +#ifdef RT + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd 2 * SIZE(BB), %xmm5 + movaps %xmm5, %xmm6 + movsd 0 * SIZE(BB), %xmm7 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm0 + mulsd %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm1, 1 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC) + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L10 + ALIGN_4 + +.L30: + testl $1, N + je .L999 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx + jle .L40 + ALIGN_4 + +.L31: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(BB), %xmm1 + xorps %xmm0, %xmm0 + prefetcht0 3 * SIZE(CO1) + xorps %xmm2, %xmm2 + xorps %xmm4, %xmm4 + xorps %xmm6, %xmm6 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addsd %xmm0, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 3 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 2 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 4 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 5 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 3 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 6 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 7 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 4 * SIZE(BB), %xmm1 + + addl $8 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + addsd %xmm0, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 1 * SIZE(BB), %xmm1 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + addsd %xmm0, %xmm4 + addsd %xmm2, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + movsd 1 * SIZE(BB), %xmm2 + + subsd %xmm4, %xmm0 + subsd %xmm6, %xmm2 +#else + movsd 0 * SIZE(AA), %xmm0 + movsd 1 * SIZE(AA), %xmm2 + + subsd %xmm4, %xmm0 + subsd %xmm6, %xmm2 +#endif + +#ifdef LN + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + movsd 2 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + movsd 0 * SIZE(AA), %xmm7 + subsd %xmm5, %xmm0 + mulsd %xmm7, %xmm0 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + movsd 3 * SIZE(AA), %xmm7 + subsd %xmm5, %xmm2 + mulsd %xmm7, %xmm2 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + mulsd %xmm4, %xmm2 +#endif + + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm2, 1 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm2, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L31 + ALIGN_4 + +.L40: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L49 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + movsd 0 * SIZE(BB), %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L45 + ALIGN_4 + +.L42: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm5 + movsd 2 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 3 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 3 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm5 + movsd 4 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addsd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + subsd %xmm4, %xmm0 +#else + movsd 0 * SIZE(AA), %xmm0 + subsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + mulsd 0 * SIZE(AA), %xmm0 +#endif + +#if defined(RN) || defined(RT) + mulsd 0 * SIZE(BB), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + addl %eax, BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L49: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LT_2x4_penryn.S b/kernel/x86/trsm_kernel_LT_2x4_penryn.S new file mode 100644 index 0000000..55c69e4 --- /dev/null +++ b/kernel/x86/trsm_kernel_LT_2x4_penryn.S @@ -0,0 +1,2071 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 21 + 4) +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 21 + 4) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 2) +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + leal (, LDC, SIZE), LDC + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + leal (CO1, LDC, 2), %eax + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + +#ifdef LN + pxor %xmm4, %xmm4 + prefetcht0 -2 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 -2 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + prefetcht0 -2 * SIZE(%eax) + pxor %xmm7, %xmm7 + prefetcht0 -2 * SIZE(%eax, LDC) +#else + pxor %xmm4, %xmm4 + prefetcht0 1 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 1 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + prefetcht0 1 * SIZE(%eax) + pxor %xmm7, %xmm7 + prefetcht0 1 * SIZE(%eax, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addpd %xmm3, %xmm7 + movaps -14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -10 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -6 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -2 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 0 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + addpd %xmm3, %xmm7 + movaps 2 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 6 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 10 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 16 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + subl $-32 * SIZE, BB + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + + subl $1, %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addpd %xmm3, %xmm7 + movaps -14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + + movaps -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), BB +#endif + + addpd %xmm2, %xmm6 + addpd %xmm3, %xmm7 + + movaps %xmm4, %xmm0 + movsd %xmm5, %xmm4 + movsd %xmm0, %xmm5 + + movaps %xmm6, %xmm0 + movsd %xmm7, %xmm6 + movsd %xmm0, %xmm7 + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd -16 * SIZE(BB), %xmm2 + movapd -14 * SIZE(BB), %xmm5 + movapd -12 * SIZE(BB), %xmm3 + movapd -10 * SIZE(BB), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm6, %xmm5 + subpd %xmm0, %xmm3 + subpd %xmm1, %xmm7 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 + movapd -12 * SIZE(AA), %xmm2 + movapd -10 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 + subpd %xmm6, %xmm2 + subpd %xmm7, %xmm3 +#endif + +#ifdef LN + movddup -13 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 + + movddup -14 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + mulpd %xmm7, %xmm6 + subpd %xmm6, %xmm5 + + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + + movddup -15 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + mulpd %xmm5, %xmm6 + subpd %xmm6, %xmm7 + + movddup -13 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 + movddup -15 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + movddup -14 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm2 + movddup -13 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm3 + + movddup -11 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + movddup -10 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm2 + movddup -9 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm3 + + movddup -6 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm2 + movddup -5 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movddup -1 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movddup -1 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm3 + movddup -2 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + movddup -3 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm1 + movddup -4 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm0 + + movddup -6 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm2 + movddup -7 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm1 + movddup -8 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + + movddup -11 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + movddup -12 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, -16 * SIZE(BB) + movapd %xmm5, -14 * SIZE(BB) + movapd %xmm3, -12 * SIZE(BB) + movapd %xmm7, -10 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) + movapd %xmm2, -12 * SIZE(AA) + movapd %xmm3, -10 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) + movsd %xmm5, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm7, 1 * SIZE(CO1, %eax, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm3, 1 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L29 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + movaps -14 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -10 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -14 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps -8 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps -6 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -4 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -2 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -12 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 0 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 2 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps 6 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -10 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 8 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 10 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps 12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps 14 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -8 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 18 * SIZE(BB), %xmm3 + + subl $ -8 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + je .L28 + +.L26: + pshufd $0x44, %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -10 * SIZE(BB), %xmm3 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BB), %xmm0 + movapd -14 * SIZE(BB), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#else + movapd -16 * SIZE(AA), %xmm1 + movapd -14 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm1 + subpd %xmm5, %xmm3 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 + movapd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm3 +#endif + +#ifdef LN + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RN + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + movsd -15 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + movsd -14 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm2 + movsd -13 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm3 + + movsd -11 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd -10 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm2 + movsd -9 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm3 + + movsd -6 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm2 + movsd -5 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm3 + + movsd -1 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm3 +#endif + +#ifdef RT + movsd -1 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm3 + movsd -2 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm2 + movsd -3 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm1 + movsd -4 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm0 + + movsd -6 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm2 + movsd -7 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm1 + movsd -8 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm0 + + movsd -11 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd -12 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) + movsd %xmm1, -15 * SIZE(AA) + movsd %xmm2, -14 * SIZE(AA) + movsd %xmm3, -13 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm1, 0 * SIZE(CO1, %eax, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L10 + ALIGN_4 + +.L30: + testl $2, N + je .L60 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L41: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 +#ifdef LN + prefetcht0 -2 * SIZE(CO1) + pxor %xmm6, %xmm6 + prefetcht0 -2 * SIZE(CO1, LDC) + pxor %xmm7, %xmm7 +#else + prefetcht0 1 * SIZE(CO1) + pxor %xmm6, %xmm6 + prefetcht0 1 * SIZE(CO1, LDC) + pxor %xmm7, %xmm7 +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -14 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -10 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -6 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -2 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps 0 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -14 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + movaps %xmm4, %xmm0 + movsd %xmm5, %xmm4 + movsd %xmm0, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd -16 * SIZE(BB), %xmm2 + movapd -14 * SIZE(BB), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movddup -13 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + + movddup -14 * SIZE(AA), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + + movddup -15 * SIZE(AA), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movddup -13 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RN + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 + + movddup -15 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + + movddup -13 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movddup -13 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + + movddup -14 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, -16 * SIZE(BB) + movapd %xmm3, -14 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L41 + ALIGN_4 + +.L50: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L59 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -14 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -12 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -10 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -12 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -8 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -6 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -10 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -4 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -2 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -8 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + je .L58 + +.L56: + pshufd $0x44, %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BB), %xmm0 + + subpd %xmm4, %xmm0 +#else + movapd -16 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm1 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 +#endif + +#ifdef LN + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RN + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -15 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movsd -13 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RT + movsd -13 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + + movsd -14 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) + movsd %xmm1, -15 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L59: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L60: + testl $1, N + je .L999 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L80 + ALIGN_4 + +.L71: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 +#ifdef LN + prefetcht0 -2 * SIZE(CO1) +#else + prefetcht0 1 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -14 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -12 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -10 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -8 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + subl $-16 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + pshufd $0x44, %xmm1, %xmm2 + movsd -15 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addpd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BB), %xmm1 + + subpd %xmm4, %xmm1 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 +#else + movapd -16 * SIZE(AA), %xmm0 + + subpd %xmm4, %xmm0 +#endif + +#ifdef LN + movsd -13 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd -14 * SIZE(AA), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + +#endif + +#ifdef LT + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -15 * SIZE(AA), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movsd -13 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RN + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RT + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(BB) + movsd %xmm1, -15 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L71 + ALIGN_4 + +.L80: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L89 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L85 + ALIGN_4 + +.L82: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movaps -12 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movaps -10 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movaps -8 * SIZE(BB), %xmm2 + + subl $-8 * SIZE, AA + subl $-8 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + je .L88 + +.L86: + mulsd %xmm0, %xmm2 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd -15 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addpd %xmm5, %xmm4 + haddpd %xmm4, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BB), %xmm0 + subsd %xmm4, %xmm0 +#else + movsd -16 * SIZE(AA), %xmm0 + subsd %xmm4, %xmm0 +#endif + +#ifdef LN + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef LT + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RN + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RT + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) +#else + movsd %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + addl %eax, BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L89: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LT_2x4_sse2.S b/kernel/x86/trsm_kernel_LT_2x4_sse2.S new file mode 100644 index 0000000..e4f5981 --- /dev/null +++ b/kernel/x86/trsm_kernel_LT_2x4_sse2.S @@ -0,0 +1,2583 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define OLD_M 4 + STACK + ARGS(%esi) +#define OLD_N 8 + STACK + ARGS(%esi) +#define OLD_K 12 + STACK + ARGS(%esi) +#define OLD_ALPHA 16 + STACK + ARGS(%esi) +#define OLD_A 24 + STACK + ARGS(%esi) +#define OLD_B 28 + STACK + ARGS(%esi) +#define OLD_C 32 + STACK + ARGS(%esi) +#define OLD_LDC 36 + STACK + ARGS(%esi) +#define OLD_OFFT 40 + STACK + ARGS(%esi) + +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHSIZE (8 * 10 + 4) +#endif + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movl OLD_M, %ebx + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + movd OLD_OFFT, %mm4 + + movl OLD_B, B + movl OLD_C, %ebx + + movl %ebx, C + movl OLD_LDC, LDC + + movd %mm4, OFFSET + movd %mm4, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_2 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B + leal (BB, %eax, 8), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1, %eax + jle .L05 + ALIGN_4 + +.L02: +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + movq 4 * SIZE(B), %mm4 + movq 5 * SIZE(B), %mm5 + movq 6 * SIZE(B), %mm6 + movq 7 * SIZE(B), %mm7 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm2, 5 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + movq %mm3, 7 * SIZE(BB) + + movq %mm4, 8 * SIZE(BB) + movq %mm4, 9 * SIZE(BB) + movq %mm5, 10 * SIZE(BB) + movq %mm5, 11 * SIZE(BB) + movq %mm6, 12 * SIZE(BB) + movq %mm6, 13 * SIZE(BB) + movq %mm7, 14 * SIZE(BB) + movq %mm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L02 + ALIGN_2 + +.L05: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1, %eax + BRANCH + jle .L10 + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm2, 5 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + movq %mm3, 7 * SIZE(BB) + + addl $4 * SIZE, B + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $3 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifdef LN + prefetchw -2 * SIZE(CO1) + prefetchw -2 * SIZE(CO1, LDC) + prefetchw -2 * SIZE(CO1, LDC, 2) + prefetchw -2 * SIZE(CO1, %eax) +#else + prefetchw 1 * SIZE(CO1) + prefetchw 1 * SIZE(CO1, LDC) + prefetchw 1 * SIZE(CO1, LDC, 2) + prefetchw 1 * SIZE(CO1, %eax) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + +#if 1 + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) + + addl $128 * 4 * SIZE, BB + addl $128 * 1 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB + ALIGN_4 +#else + + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addl $64 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 +#endif + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 8 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm5 + movapd 4 * SIZE(B), %xmm3 + movapd 6 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm6, %xmm5 + subpd %xmm0, %xmm3 + subpd %xmm1, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + movapd 4 * SIZE(AA), %xmm2 + movapd 6 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 + subpd %xmm6, %xmm2 + subpd %xmm7, %xmm3 +#endif + +#ifdef LN + movlpd 3 * SIZE(AA), %xmm4 + movhpd 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 + + movlpd 2 * SIZE(AA), %xmm4 + movhpd 2 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + mulpd %xmm7, %xmm6 + subpd %xmm6, %xmm5 + + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + + movlpd 1 * SIZE(AA), %xmm4 + movhpd 1 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + mulpd %xmm5, %xmm6 + subpd %xmm6, %xmm7 + + movlpd 3 * SIZE(AA), %xmm4 + movhpd 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + movlpd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + movlpd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm2 + movlpd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 5 * SIZE(B), %xmm4 + movhpd 5 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + movlpd 6 * SIZE(B), %xmm4 + movhpd 6 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm2 + movlpd 7 * SIZE(B), %xmm4 + movhpd 7 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 10 * SIZE(B), %xmm4 + movhpd 10 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + movlpd 11 * SIZE(B), %xmm4 + movhpd 11 * SIZE(B), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 15 * SIZE(B), %xmm4 + movhpd 15 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movlpd 15 * SIZE(B), %xmm4 + movhpd 15 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm3 + movlpd 14 * SIZE(B), %xmm4 + movhpd 14 * SIZE(B), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + movlpd 13 * SIZE(B), %xmm4 + movhpd 13 * SIZE(B), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm1 + movlpd 12 * SIZE(B), %xmm4 + movhpd 12 * SIZE(B), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm0 + + movlpd 10 * SIZE(B), %xmm4 + movhpd 10 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + movlpd 9 * SIZE(B), %xmm4 + movhpd 9 * SIZE(B), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm1 + movlpd 8 * SIZE(B), %xmm4 + movhpd 8 * SIZE(B), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + + movlpd 5 * SIZE(B), %xmm4 + movhpd 5 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + movlpd 4 * SIZE(B), %xmm4 + movhpd 4 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + movapd %xmm3, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movlpd %xmm5, 4 * SIZE(BB) + movlpd %xmm5, 5 * SIZE(BB) + movhpd %xmm5, 6 * SIZE(BB) + movhpd %xmm5, 7 * SIZE(BB) + movlpd %xmm3, 8 * SIZE(BB) + movlpd %xmm3, 9 * SIZE(BB) + movhpd %xmm3, 10 * SIZE(BB) + movhpd %xmm3, 11 * SIZE(BB) + movlpd %xmm7, 12 * SIZE(BB) + movlpd %xmm7, 13 * SIZE(BB) + movhpd %xmm7, 14 * SIZE(BB) + movhpd %xmm7, 15 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) + movapd %xmm2, 4 * SIZE(AA) + movapd %xmm3, 6 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movlpd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) + movlpd %xmm5, 0 * SIZE(CO1, LDC, 2) + movlpd %xmm7, 1 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm7, 1 * SIZE(CO1, %eax, 1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) + movlpd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) + movlpd %xmm3, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm3, 1 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L29 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $3 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movlpd 0 * SIZE(AA), %xmm0 + movlpd 4 * SIZE(AA), %xmm1 + movlpd 0 * SIZE(BB), %xmm2 + movlpd 8 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movlpd 2 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movlpd 4 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movlpd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movlpd 1 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm4 + movlpd 10 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm5 + movlpd 12 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movlpd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movlpd 2 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movlpd 18 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movlpd 20 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 22 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movlpd 32 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movlpd 3 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm4 + movlpd 26 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm5 + movlpd 28 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + mulsd 30 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movlpd 40 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movlpd 8 * SIZE(AA), %xmm0 +#if defined(OPTERON) || defined(BARCELONA) + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) +#endif + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm4 + movlpd 34 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm5 + movlpd 36 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + mulsd 38 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movlpd 48 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movlpd 5 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm4 + movlpd 42 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm5 + movlpd 44 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + mulsd 46 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movlpd 56 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm7 + movlpd 6 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm4 + movlpd 50 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm5 + movlpd 52 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + mulsd 54 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movlpd 64 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movlpd 7 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm4 + movlpd 58 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm5 + movlpd 60 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + mulsd 62 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movlpd 72 * SIZE(BB), %xmm3 + addl $64 * SIZE, BB + addsd %xmm1, %xmm7 + movlpd 12 * SIZE(AA), %xmm1 + addl $8 * SIZE, AA + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + +.L26: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movlpd 2 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movlpd 4 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movlpd 8 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movlpd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 4), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm5, %xmm4 + unpcklpd %xmm7, %xmm6 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm5 + + subpd %xmm4, %xmm2 + subpd %xmm6, %xmm5 +#else + movlpd 0 * SIZE(AA), %xmm0 + movlpd 1 * SIZE(AA), %xmm1 + movlpd 2 * SIZE(AA), %xmm2 + movlpd 3 * SIZE(AA), %xmm3 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 + subsd %xmm6, %xmm2 + subsd %xmm7, %xmm3 +#endif + +#ifdef LN + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 + movlpd 1 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + movlpd 2 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm2 + movlpd 3 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm3 + + movlpd 5 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm1 + movlpd 6 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm2 + movlpd 7 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm3 + + movlpd 10 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm2 + movlpd 11 * SIZE(B), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm3 + + movlpd 15 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm3 +#endif + +#ifdef RT + movlpd 15 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm3 + movlpd 14 * SIZE(B), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm2 + movlpd 13 * SIZE(B), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm1 + movlpd 12 * SIZE(B), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm0 + + movlpd 10 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm2 + movlpd 9 * SIZE(B), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm1 + movlpd 8 * SIZE(B), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm0 + + movlpd 5 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm1 + movlpd 4 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movlpd %xmm5, 4 * SIZE(BB) + movlpd %xmm5, 5 * SIZE(BB) + movhpd %xmm5, 6 * SIZE(BB) + movhpd %xmm5, 7 * SIZE(BB) +#else + movlpd %xmm0, 0 * SIZE(AA) + movlpd %xmm1, 1 * SIZE(AA) + movlpd %xmm2, 2 * SIZE(AA) + movlpd %xmm3, 3 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movlpd %xmm5, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) + movlpd %xmm2, 0 * SIZE(CO1, LDC, 2) + movlpd %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA,%eax, SIZE), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L30: + testl $2, N + je .L60 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L35 + ALIGN_4 + +.L32: +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + movq 4 * SIZE(B), %mm4 + movq 5 * SIZE(B), %mm5 + movq 6 * SIZE(B), %mm6 + movq 7 * SIZE(B), %mm7 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm2, 5 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + movq %mm3, 7 * SIZE(BB) + + movq %mm4, 8 * SIZE(BB) + movq %mm4, 9 * SIZE(BB) + movq %mm5, 10 * SIZE(BB) + movq %mm5, 11 * SIZE(BB) + movq %mm6, 12 * SIZE(BB) + movq %mm6, 13 * SIZE(BB) + movq %mm7, 14 * SIZE(BB) + movq %mm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L32 + ALIGN_2 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L40 + ALIGN_2 + +.L36: + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + + addl $2 * SIZE, B + addl $4 * SIZE, BB + decl %eax + jne .L36 + ALIGN_4 + +.L40: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L41: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + +#ifdef LN + prefetchw -2 * SIZE(CO1) + prefetchw -2 * SIZE(CO1, LDC) +#else + prefetchw 1 * SIZE(CO1) + prefetchw 1 * SIZE(CO1, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + mulpd %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) +#endif + mulpd %xmm1, %xmm2 + mulpd 18 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movapd 10 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm2 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movapd 12 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm3 + mulpd 26 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 14 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm3 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movlpd 3 * SIZE(AA), %xmm4 + movhpd 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + + movlpd 2 * SIZE(AA), %xmm4 + movhpd 2 * SIZE(AA), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + + movlpd 1 * SIZE(AA), %xmm4 + movhpd 1 * SIZE(AA), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 3 * SIZE(AA), %xmm4 + movhpd 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + movlpd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + + movlpd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movlpd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + movlpd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movlpd %xmm3, 4 * SIZE(BB) + movlpd %xmm3, 5 * SIZE(BB) + movhpd %xmm3, 6 * SIZE(BB) + movhpd %xmm3, 7 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movlpd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L41 + ALIGN_4 + +.L50: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L59 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movlpd 0 * SIZE(AA), %xmm0 + movlpd 4 * SIZE(AA), %xmm1 + movlpd 0 * SIZE(BB), %xmm2 + movlpd 8 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulsd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movlpd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movlpd 1 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movlpd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movlpd 2 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm3 + mulsd 10 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movlpd 12 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movlpd 3 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movlpd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movlpd 8 * SIZE(AA), %xmm0 + + mulsd %xmm1, %xmm2 + mulsd 18 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movlpd 20 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movlpd 5 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm2 + mulsd 22 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movlpd 32 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movlpd 6 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm3 + mulsd 26 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movlpd 28 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movlpd 7 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm3 + mulsd 30 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movlpd 40 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm7 + movlpd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + +.L56: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movlpd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movlpd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm5, %xmm4 + + movapd 0 * SIZE(B), %xmm2 + + subpd %xmm4, %xmm2 +#else + movlpd 0 * SIZE(AA), %xmm0 + movlpd 1 * SIZE(AA), %xmm1 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 +#endif + +#ifdef LN + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 + movlpd 1 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movlpd 3 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RT + movlpd 3 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm1 + movlpd 2 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) +#else + movlpd %xmm0, 0 * SIZE(AA) + movlpd %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA,%eax, SIZE), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L59: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L60: + testl $1, N + je .L999 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L65 + ALIGN_4 + +.L62: +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + movq 4 * SIZE(B), %mm4 + movq 5 * SIZE(B), %mm5 + movq 6 * SIZE(B), %mm6 + movq 7 * SIZE(B), %mm7 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm2, 5 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + movq %mm3, 7 * SIZE(BB) + + movq %mm4, 8 * SIZE(BB) + movq %mm4, 9 * SIZE(BB) + movq %mm5, 10 * SIZE(BB) + movq %mm5, 11 * SIZE(BB) + movq %mm6, 12 * SIZE(BB) + movq %mm6, 13 * SIZE(BB) + movq %mm7, 14 * SIZE(BB) + movq %mm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L62 + ALIGN_2 + +.L65: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L70 + ALIGN_2 + +.L66: + movq 0 * SIZE(B), %mm0 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + + addl $1 * SIZE, B + addl $2 * SIZE, BB + decl %eax + jne .L66 + ALIGN_4 + +.L70: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L80 + ALIGN_4 + +.L71: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + +#ifdef LN + prefetchw -2 * SIZE(CO1) +#else + prefetchw 1 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) + movapd 16 * SIZE(BB), %xmm2 + + movapd 2 * SIZE(AA), %xmm0 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + + movapd 16 * SIZE(AA), %xmm0 + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movapd 24 * SIZE(BB), %xmm3 + + movapd 10 * SIZE(AA), %xmm1 + mulpd 10 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(AA), %xmm0 + movapd 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm2 + + subpd %xmm4, %xmm2 +#else + movapd 0 * SIZE(AA), %xmm0 + + subpd %xmm4, %xmm0 +#endif + +#ifdef LN + movapd %xmm2, %xmm3 + unpckhpd %xmm3, %xmm3 + + movlpd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + movlpd 2 * SIZE(AA), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm2 + + movlpd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm3, %xmm2 +#endif + +#ifdef LT + movapd %xmm2, %xmm3 + unpckhpd %xmm3, %xmm3 + + movlpd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movlpd 1 * SIZE(AA), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm3 + + movlpd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + unpcklpd %xmm3, %xmm2 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L71 + ALIGN_4 + +.L80: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L99 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movlpd 0 * SIZE(AA), %xmm0 + movlpd 4 * SIZE(AA), %xmm1 + movlpd 0 * SIZE(BB), %xmm2 + movlpd 8 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L85 + ALIGN_4 + +.L82: + mulsd %xmm0, %xmm2 + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) + movlpd 1 * SIZE(AA), %xmm0 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movlpd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movlpd 2 * SIZE(AA), %xmm0 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm6 + movlpd 3 * SIZE(AA), %xmm0 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm7 + movlpd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm3 + movlpd 5 * SIZE(AA), %xmm1 + mulsd 10 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movlpd 24 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movlpd 6 * SIZE(AA), %xmm1 + mulsd 12 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm6 + movlpd 7 * SIZE(AA), %xmm1 + mulsd 14 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm7 + movlpd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L88 + +.L86: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movlpd 2 * SIZE(BB), %xmm2 + movlpd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addsd %xmm5, %xmm4 + addsd %xmm7, %xmm6 + addsd %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + addl %eax, AA + addl %eax, B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movlpd 0 * SIZE(B), %xmm2 + subsd %xmm4, %xmm2 +#else + movlpd 0 * SIZE(AA), %xmm0 + subsd %xmm4, %xmm0 +#endif + +#ifdef LN + movlpd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) +#else + movlpd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) +#else + movlpd %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA,%eax, SIZE), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (B,%eax, SIZE), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + movl OLD_STACK, %esp + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LT_2x4_sse3.S b/kernel/x86/trsm_kernel_LT_2x4_sse3.S new file mode 100644 index 0000000..487f059 --- /dev/null +++ b/kernel/x86/trsm_kernel_LT_2x4_sse3.S @@ -0,0 +1,2030 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#ifdef PENTIUM4 +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef PENTIUMM +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_2 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + +#ifdef LN + prefetchnta -2 * SIZE(CO1) + prefetchnta -2 * SIZE(CO1, LDC, 1) + prefetchnta -2 * SIZE(CO1, LDC, 2) + prefetchnta -2 * SIZE(CO1, %eax, 1) +#else + prefetchnta 2 * SIZE(CO1) + prefetchnta 2 * SIZE(CO1, LDC, 1) + prefetchnta 2 * SIZE(CO1, LDC, 2) + prefetchnta 2 * SIZE(CO1, %eax, 1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 5 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 6 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 7 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 16 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm4 + movddup 9 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm5 + movddup 10 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm6 + movddup 11 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + movapd 6 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm4 + movddup 13 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm5 + movddup 14 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm6 + movddup 15 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movddup 24 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm4 + movddup 17 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm5 + movddup 18 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm6 + movddup 19 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movddup 20 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm4 + movddup 21 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm5 + movddup 22 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm6 + movddup 23 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movddup 32 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 25 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movddup 26 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 27 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 14 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 28 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 29 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movddup 30 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 31 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 40 * SIZE(BB), %xmm3 + + addl $32 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd 0 * SIZE(BB), %xmm2 + movapd 2 * SIZE(BB), %xmm5 + movapd 4 * SIZE(BB), %xmm3 + movapd 6 * SIZE(BB), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm6, %xmm5 + subpd %xmm0, %xmm3 + subpd %xmm1, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + movapd 4 * SIZE(AA), %xmm2 + movapd 6 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 + subpd %xmm6, %xmm2 + subpd %xmm7, %xmm3 +#endif + +#ifdef LN + movddup 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 + + movddup 2 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + mulpd %xmm7, %xmm6 + subpd %xmm6, %xmm5 + + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + +#endif + +#ifdef LT + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + + movddup 1 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + mulpd %xmm5, %xmm6 + subpd %xmm6, %xmm7 + + movddup 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 +#endif + +#ifdef RN + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 + movddup 1 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + movddup 2 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm2 + movddup 3 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm3 + + movddup 5 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + movddup 6 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm2 + movddup 7 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm3 + + movddup 10 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm2 + movddup 11 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movddup 15 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movddup 15 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm3 + movddup 14 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + movddup 13 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm1 + movddup 12 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm0 + + movddup 10 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm2 + movddup 9 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm1 + movddup 8 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + + movddup 5 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + movddup 4 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BB) + movapd %xmm5, 2 * SIZE(BB) + movapd %xmm3, 4 * SIZE(BB) + movapd %xmm7, 6 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) + movapd %xmm2, 4 * SIZE(AA) + movapd %xmm3, 6 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) + movsd %xmm5, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm7, 1 * SIZE(CO1, %eax, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm3, 1 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L29 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $4, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 1 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movddup 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 3 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movddup 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 18 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 5 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 22 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movddup 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 26 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 7 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 30 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movddup 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd 34 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 36 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 9 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 38 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 48 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movddup 10 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 42 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 44 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 11 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 46 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 56 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movddup 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 50 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 52 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 13 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 54 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 64 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movddup 14 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 58 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 60 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 15 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 62 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 72 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movddup 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $15, %eax # if (k & 1) + BRANCH + je .L28 + +.L26: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BB), %xmm0 + movapd 2 * SIZE(BB), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#else + movapd 0 * SIZE(AA), %xmm1 + movapd 2 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm1 + subpd %xmm5, %xmm3 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 + movapd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm3 +#endif + +#ifdef LN + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef LT + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + movsd 2 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm2 + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm3 + + movsd 5 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd 6 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm2 + movsd 7 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm3 + + movsd 10 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm2 + movsd 11 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm3 + + movsd 15 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm3 +#endif + +#ifdef RT + movsd 15 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm3 + movsd 14 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm2 + movsd 13 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm1 + movsd 12 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm0 + + movsd 10 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm2 + movsd 9 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm1 + movsd 8 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm0 + + movsd 5 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd 4 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, 0 * SIZE(BB) + movapd %xmm1, 2 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm1, 1 * SIZE(AA) + movsd %xmm2, 2 * SIZE(AA) + movsd %xmm3, 3 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm1, 0 * SIZE(CO1, %eax, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L10 + ALIGN_4 + +.L30: + testl $2, N + je .L60 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L41: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchnta -2 * SIZE(CO1) + prefetchnta -2 * SIZE(CO1, LDC, 1) +#else + prefetchnta 2 * SIZE(CO1) + prefetchnta 2 * SIZE(CO1, LDC, 1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 5 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 6 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 6 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 7 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 16 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 9 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm5 + movddup 10 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 11 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 13 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 14 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm5 + movddup 14 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 15 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 24 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd 0 * SIZE(BB), %xmm2 + movapd 2 * SIZE(BB), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movddup 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + + movddup 2 * SIZE(AA), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + +#endif + +#ifdef LT + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + + movddup 1 * SIZE(AA), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movddup 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RN + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 + + movddup 1 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + + movddup 3 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movddup 3 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + + movddup 2 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BB) + movapd %xmm3, 2 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L41 + ALIGN_4 + +.L50: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L59 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $4, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + movddup 1 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 2 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movddup 3 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movddup 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movddup 5 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm0 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 6 * SIZE(AA), %xmm0 + mulpd 12 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movddup 7 * SIZE(AA), %xmm0 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movddup 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + movddup 9 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm4 + mulpd 18 * SIZE(BB), %xmm1 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 10 * SIZE(AA), %xmm1 + mulpd 20 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movddup 11 * SIZE(AA), %xmm1 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movddup 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + movddup 13 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 26 * SIZE(BB), %xmm1 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 14 * SIZE(AA), %xmm1 + mulpd 28 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movddup 15 * SIZE(AA), %xmm1 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movddup 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $15, %eax # if (k & 1) + BRANCH + je .L58 + +.L56: + mulpd %xmm0, %xmm2 + movddup 1 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BB), %xmm0 + + subpd %xmm4, %xmm0 +#else + movapd 0 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm1 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 +#endif + +#ifdef LN + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef LT + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 1 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RT + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 2 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, 0 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L59: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L60: + testl $1, N + je .L999 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L80 + ALIGN_4 + +.L71: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 4 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchnta -2 * SIZE(CO1) +#else + prefetchnta 2 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm2, %xmm0 + movddup 1 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm4 + movapd 16 * SIZE(AA), %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd 4 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd 6 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm7 + movddup 8 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm1 + movddup 5 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm4 + movapd 24 * SIZE(AA), %xmm1 + mulpd 10 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm5 + movddup 6 * SIZE(BB), %xmm3 + mulpd 12 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm6 + movddup 7 * SIZE(BB), %xmm3 + mulpd 14 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + mulpd %xmm2, %xmm0 + movddup 1 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm4 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BB), %xmm1 + + subpd %xmm4, %xmm1 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 +#else + movapd 0 * SIZE(AA), %xmm0 + + subpd %xmm4, %xmm0 +#endif + +#ifdef LN + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 2 * SIZE(AA), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 1 * SIZE(AA), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RN + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RT + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm1, 1 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L71 + ALIGN_4 + +.L80: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L89 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + movhpd 1 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd 8 * SIZE(AA), %xmm1 + movhpd 9 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + movhpd 1 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movsd 8 * SIZE(BB), %xmm3 + movhpd 9 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $4, %eax + je .L85 + ALIGN_4 + +.L82: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm1 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $15, %eax # if (k & 1) + BRANCH + je .L88 + +.L86: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + + haddpd %xmm4, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + subsd %xmm4, %xmm0 +#else + movsd 0 * SIZE(AA), %xmm0 + subsd %xmm4, %xmm0 +#endif + +#ifdef LN + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RT + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) +#else + movsd %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + addl %eax, BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L89: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LT_4x2_core2.S b/kernel/x86/trsm_kernel_LT_4x2_core2.S new file mode 100644 index 0000000..dba627f --- /dev/null +++ b/kernel/x86/trsm_kernel_LT_4x2_core2.S @@ -0,0 +1,2100 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE2) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA 16 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC + movd STACK_OFFT, %mm4 + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK + movd %mm4, OFFSET + movd %mm4, KK + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + + sall $BASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $1, %eax + movl %eax, J + jle .L100 + ALIGN_2 + +.L01: +/* Copying to Sub Buffer */ +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal 16 * SIZE + BUFFER, BB + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + ALIGN_2 + +.L02: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + movapd %xmm2, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + + addl $2 * SIZE, B + addl $4 * SIZE, %ecx + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 # coffset = c +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm3 + pxor %xmm6, %xmm6 +#ifdef LN + prefetcht2 -3 * SIZE(CO1) + pxor %xmm7, %xmm7 + prefetcht2 -3 * SIZE(CO1, LDC) +#else + prefetcht2 3 * SIZE(CO1) + pxor %xmm7, %xmm7 + prefetcht2 3 * SIZE(CO1, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd -12 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd -8 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd -6 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd -6 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + movapd -4 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd -4 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd -2 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd -2 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + movapd 8 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + movapd 0 * SIZE(BB), %xmm1 + + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd 4 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd 6 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd 8 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd 10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd 10 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm2, %xmm6 + movapd 12 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm7 + + movapd 12 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd 14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd 14 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + subl $-32 * SIZE, BB + movapd 24 * SIZE(AA), %xmm3 + subl $-32 * SIZE, AA + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + movapd -16 * SIZE(BB), %xmm1 + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + movapd -12 * SIZE(BB), %xmm1 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd -16 * SIZE(B), %xmm2 + movapd -14 * SIZE(B), %xmm3 + movapd -12 * SIZE(B), %xmm5 + movapd -10 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 + subpd %xmm6, %xmm5 + subpd %xmm1, %xmm7 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 + movapd -12 * SIZE(AA), %xmm2 + movapd -10 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm6, %xmm1 + subpd %xmm5, %xmm2 + subpd %xmm7, %xmm3 +#endif + +#ifdef LN + movddup -1 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm7 + movddup -2 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm5 + movddup -3 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm3 + movddup -4 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm2 + + movddup -6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm5 + movddup -7 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm3 + movddup -8 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm2 + + movddup -11 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movddup -12 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm2 + + movddup -16 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + movddup -15 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm3 + movddup -14 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm5 + movddup -13 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm7 + + movddup -11 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movddup -10 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm5 + movddup -9 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm7 + + movddup -6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm5 + movddup -5 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm7 + + movddup -1 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 + + movddup -15 * SIZE(B), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm2 + mulpd %xmm1, %xmm5 + subpd %xmm5, %xmm3 + + movddup -13 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movddup -13 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm3 + + movddup -14 * SIZE(B), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + mulpd %xmm3, %xmm5 + subpd %xmm5, %xmm1 + + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movsd %xmm5, 2 * SIZE(CO1) + movsd %xmm7, 3 * SIZE(CO1) + + movhpd %xmm2, 0 * SIZE(CO1, LDC) + movhpd %xmm3, 1 * SIZE(CO1, LDC) + movhpd %xmm5, 2 * SIZE(CO1, LDC) + movhpd %xmm7, 3 * SIZE(CO1, LDC) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO1, LDC) + movhpd %xmm2, 1 * SIZE(CO1, LDC) + movsd %xmm3, 2 * SIZE(CO1, LDC) + movhpd %xmm3, 3 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, -16 * SIZE(B) + movapd %xmm3, -14 * SIZE(B) + movapd %xmm5, -12 * SIZE(B) + movapd %xmm7, -10 * SIZE(B) + + movddup %xmm2, %xmm0 + movddup %xmm3, %xmm1 + movddup %xmm5, %xmm4 + movddup %xmm7, %xmm6 + + unpckhpd %xmm2, %xmm2 + unpckhpd %xmm3, %xmm3 + unpckhpd %xmm5, %xmm5 + unpckhpd %xmm7, %xmm7 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm2, -14 * SIZE(BB) + movapd %xmm1, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) + movapd %xmm2, -12 * SIZE(AA) + movapd %xmm3, -10 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L30: + movl M, %ebx + testl $2, %ebx + jle .L50 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movapd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L32 + +.L31: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm6 + movapd 0 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm7 + movapd -12 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -6 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd -4 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -2 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 0 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm1 + mulpd 2 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm4 + movapd 4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm5 + movapd -6 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 6 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm6 + movapd 16 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm7 + movapd -4 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 10 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm5 + movapd -2 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 14 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm7 + movapd 8 * SIZE(AA), %xmm2 + + subl $-16 * SIZE, AA + addl $ 32 * SIZE, BB + decl %eax + jne .L31 + +.L32: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L34 + +.L33: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L33 + ALIGN_4 + +.L34: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd -16 * SIZE(B), %xmm2 + movapd -14 * SIZE(B), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movddup -13 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movddup -14 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm2 + movddup -16 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + movddup -15 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm3 + movddup -13 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + movddup -15 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + movddup -13 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movddup -13 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + movddup -14 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC) + movhpd %xmm3, 1 * SIZE(CO1, LDC) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC) + movhpd %xmm1, 1 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, -16 * SIZE(B) + movapd %xmm3, -14 * SIZE(B) + + movddup %xmm2, %xmm0 + movddup %xmm3, %xmm1 + + unpckhpd %xmm2, %xmm2 + unpckhpd %xmm3, %xmm3 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm2, -14 * SIZE(BB) + movapd %xmm1, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L50: + movl M, %ebx + testl $1, %ebx + jle .L99 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movsd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -12 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movsd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L52 + +.L51: + mulsd %xmm0, %xmm1 + mulsd -14 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm4 + movsd -12 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm5 + movsd -15 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm1 + mulsd -10 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm6 + movsd 0 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm7 + movsd -14 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd -6 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd -4 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd -13 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd -2 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movsd 8 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movsd -8 * SIZE(AA), %xmm0 + mulsd %xmm2, %xmm1 + mulsd 2 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm4 + movsd 4 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm5 + movsd -11 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm1 + mulsd 6 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm6 + movsd 16 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm7 + movsd -10 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm3 + mulsd 10 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm4 + movsd 12 * SIZE(BB), %xmm3 + addsd %xmm2, %xmm5 + movsd -9 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm3 + mulsd 14 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm6 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm2, %xmm7 + movsd -4 * SIZE(AA), %xmm2 + + subl $-8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L51 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulsd %xmm0, %xmm1 + mulsd -14 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm4 + movsd -12 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm5 + movsd -15 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + BRANCH + jg .L53 + ALIGN_4 + +.L54: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm0 + movsd -15 * SIZE(B), %xmm1 +#else + movsd -16 * SIZE(AA), %xmm0 + movsd -15 * SIZE(AA), %xmm1 +#endif + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm0 + mulsd %xmm2, %xmm1 +#endif + +#ifdef RN + mulsd -16 * SIZE(B), %xmm0 + movsd -15 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + mulsd -13 * SIZE(B), %xmm1 +#endif + +#ifdef RT + mulsd -13 * SIZE(B), %xmm1 + movsd -14 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + mulsd -16 * SIZE(B), %xmm0 +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC) + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(B) + movsd %xmm1, -15 * SIZE(B) + + movsd %xmm0, -16 * SIZE(BB) + movsd %xmm0, -15 * SIZE(BB) + movsd %xmm1, -14 * SIZE(BB) + movsd %xmm1, -13 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) + movsd %xmm1, -15 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L99: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_2 + +.L100: + movl N, %eax + testl $1, %eax + jle .L999 + ALIGN_2 + +.L101: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal 16 * SIZE + BUFFER, BB + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L103 + ALIGN_4 + +.L102: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + movapd %xmm2, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + BRANCH + jne .L102 + ALIGN_2 + +.L103: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L105 + ALIGN_2 + +.L104: + movddup -16 * SIZE(B), %xmm0 + + movapd %xmm0, -16 * SIZE(BB) + addl $1 * SIZE, B + addl $2 * SIZE, BB + decl %eax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 # coffset = c +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movapd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + +.L111: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm4 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm6 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + mulpd -10 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm5 + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm7 + movapd -12 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm2 + mulpd -6 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm4 + movapd -4 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm6 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm2 + mulpd -2 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm5 + movapd 8 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm7 + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm3, %xmm0 + mulpd 2 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm6 + movapd -6 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm0 + mulpd 6 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm5 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movapd -4 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm2 + mulpd 10 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm4 + movapd 12 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm6 + movapd -2 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm2 + mulpd 14 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm5 + movapd 24 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm7 + movapd 8 * SIZE(BB), %xmm3 + + addl $ 32 * SIZE, AA + subl $-16 * SIZE, BB + decl %eax + jne .L111 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + +.L113: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm4 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm6 + movapd -14 * SIZE(BB), %xmm1 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + subl $1, %eax + jg .L113 + ALIGN_4 + +.L114: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm0 + movapd -14 * SIZE(B), %xmm1 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 +#endif + + subpd %xmm4, %xmm0 + subpd %xmm6, %xmm1 + +#ifdef LN + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movapd %xmm1, %xmm3 + unpckhpd %xmm3, %xmm3 + + movsd -1 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + movsd -2 * SIZE(AA), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + movsd -3 * SIZE(AA), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + movsd -4 * SIZE(AA), %xmm7 + mulsd %xmm3, %xmm7 + subsd %xmm7, %xmm0 + + movsd -6 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd -7 * SIZE(AA), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm2 + movsd -8 * SIZE(AA), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + + movsd -11 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd -12 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm2, %xmm0 + unpcklpd %xmm3, %xmm1 +#endif + +#ifdef LT + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movapd %xmm1, %xmm3 + unpckhpd %xmm3, %xmm3 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -15 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + movsd -14 * SIZE(AA), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + movsd -13 * SIZE(AA), %xmm7 + mulsd %xmm0, %xmm7 + subsd %xmm7, %xmm3 + + movsd -11 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + movsd -10 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm1 + movsd -9 * SIZE(AA), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + + movsd -6 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + movsd -5 * SIZE(AA), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + + movsd -1 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + unpcklpd %xmm2, %xmm0 + unpcklpd %xmm3, %xmm1 +#endif + +#if defined(RN) || defined(RT) + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(B) + movapd %xmm1, -14 * SIZE(B) + + movddup %xmm0, %xmm2 + movddup %xmm1, %xmm3 + + unpckhpd %xmm0, %xmm0 + unpckhpd %xmm1, %xmm1 + + movapd %xmm2, -16 * SIZE(BB) + movapd %xmm0, -14 * SIZE(BB) + movapd %xmm3, -12 * SIZE(BB) + movapd %xmm1, -10 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + BRANCH + decl %ebx # i -- + jg .L110 + ALIGN_2 + +.L130: + movl M, %ebx + testl $2, %ebx + jle .L150 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + movapd -8 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L132 + +.L131: + mulpd %xmm0, %xmm1 + movapd -14 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movapd -12 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd -10 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm2, %xmm3 + movapd -6 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm4 + movapd -6 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd -4 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm5 + movapd -4 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd -2 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm4 + movapd -2 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd 8 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + BRANCH + decl %eax + jne .L131 + +.L132: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L134 + +.L133: + mulpd %xmm0, %xmm1 + movapd -14 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L133 + ALIGN_4 + +.L134: + addpd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm0 +#else + movapd -16 * SIZE(AA), %xmm0 +#endif + + subpd %xmm4, %xmm0 + +#ifdef LN + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movsd -13 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd -14 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm2, %xmm0 +#endif + +#ifdef LT + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -15 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + movsd -13 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm2, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(B) + + movddup %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + + movapd %xmm1, -16 * SIZE(BB) + movapd %xmm0, -14 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L150: + movl M, %ebx + testl $1, %ebx + jle .L159 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movsd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -8 * SIZE(BB), %xmm3 + movsd -12 * SIZE(AA), %xmm2 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L152 + +.L151: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -14 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -14 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm5 + movsd -12 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -13 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -10 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -8 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm5 + movsd -0 * SIZE(BB), %xmm1 + mulsd %xmm2, %xmm3 + movsd -11 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm4 + movsd -6 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -10 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm5 + movsd -4 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -9 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm4 + movsd -2 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -4 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm5 + movsd 8 * SIZE(BB), %xmm3 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + BRANCH + decl %eax + jne .L151 + +.L152: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L154 + +.L153: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -14 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L153 + ALIGN_4 + +.L154: + addsd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm0 +#else + movsd -16 * SIZE(AA), %xmm0 +#endif + + subsd %xmm4, %xmm0 + +#if defined(LN) || defined(LT) + mulsd -16 * SIZE(AA), %xmm0 +#endif + +#if defined(RN) || defined(RT) + mulsd -16 * SIZE(B), %xmm0 +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(B) + + movsd %xmm0, -16 * SIZE(BB) + movsd %xmm0, -15 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L159: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 1), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_2 + + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LT_4x2_sse2.S b/kernel/x86/trsm_kernel_LT_4x2_sse2.S new file mode 100644 index 0000000..626d75a --- /dev/null +++ b/kernel/x86/trsm_kernel_LT_4x2_sse2.S @@ -0,0 +1,2280 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE2) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA 16 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp + +#define PREFETCHSIZE (8 * 4) + +#define KERNEL1(address) \ + movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL4(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL5(address) \ + movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 20 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL6(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 32 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL7(address) \ + movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 28 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 40 * SIZE + (address) * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC + movd STACK_OFFT, %mm4 + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK + movd %mm4, OFFSET + movd %mm4, KK + + sall $BASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $1, %eax + movl %eax, J + jle .L100 + ALIGN_2 + +.L01: +/* Copying to Sub Buffer */ +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + ALIGN_2 + +.L02: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + + addl $2 * SIZE, B + addl $4 * SIZE, %ecx + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, %esi # coffset = c +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + + prefetcht2 4 * SIZE(%esi) + prefetcht2 4 * SIZE(%esi, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + +#ifdef PENTIUM4 + andl $-8, %eax + NOBRANCH + je .L12 + sall $3, %eax + +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $64 * 1, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $64 * 2, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $64 * 3, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $64 * 4, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $64 * 5, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $64 * 6, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $64 * 7, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $64 * 4 * SIZE, AA + addl $64 * 4 * SIZE, BB + subl $64 * 8, %eax + BRANCH + jg .L1X + +.L11: + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB + +#else + sarl $3, %eax + je .L12 + +.L11: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + + addl $32 * SIZE, %ecx + addl $32 * SIZE, %edx + decl %eax + jne .L11 +#endif + +.L12: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 0 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L13 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm3 + movapd 4 * SIZE(B), %xmm5 + movapd 6 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 + subpd %xmm6, %xmm5 + subpd %xmm1, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + movapd 4 * SIZE(AA), %xmm2 + movapd 6 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm6, %xmm1 + subpd %xmm5, %xmm2 + subpd %xmm7, %xmm3 +#endif + +#ifdef LN + movsd 15 * SIZE(AA), %xmm0 + movhpd 15 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm7 + movsd 14 * SIZE(AA), %xmm0 + movhpd 14 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm5 + movsd 13 * SIZE(AA), %xmm0 + movhpd 13 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm3 + movsd 12 * SIZE(AA), %xmm0 + movhpd 12 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm2 + + movsd 10 * SIZE(AA), %xmm0 + movhpd 10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm5 + movsd 9 * SIZE(AA), %xmm0 + movhpd 9 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm3 + movsd 8 * SIZE(AA), %xmm0 + movhpd 8 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm2 + + movsd 5 * SIZE(AA), %xmm0 + movhpd 5 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + movhpd 4 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm2 + + movsd 0 * SIZE(AA), %xmm0 + movhpd 0 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm0 + movhpd 0 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + movhpd 1 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm3 + movsd 2 * SIZE(AA), %xmm0 + movhpd 2 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm5 + movsd 3 * SIZE(AA), %xmm0 + movhpd 3 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm7 + + movsd 5 * SIZE(AA), %xmm0 + movhpd 5 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + + movsd 6 * SIZE(AA), %xmm0 + movhpd 6 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm5 + movsd 7 * SIZE(AA), %xmm0 + movhpd 7 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm7 + + movsd 10 * SIZE(AA), %xmm0 + movhpd 10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm5 + movsd 11 * SIZE(AA), %xmm0 + movhpd 11 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm7 + + movsd 15 * SIZE(AA), %xmm0 + movhpd 15 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm7 +#endif + +#ifdef RN + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 + + movsd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm2 + movsd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm3 + + movsd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movsd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm3 + + movsd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + movsd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm1 + + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + movapd %xmm5, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + + movsd %xmm2, 0 * SIZE(BB) + movsd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movsd %xmm3, 4 * SIZE(BB) + movsd %xmm3, 5 * SIZE(BB) + movhpd %xmm3, 6 * SIZE(BB) + movhpd %xmm3, 7 * SIZE(BB) + movsd %xmm5, 8 * SIZE(BB) + movsd %xmm5, 9 * SIZE(BB) + movhpd %xmm5, 10 * SIZE(BB) + movhpd %xmm5, 11 * SIZE(BB) + movsd %xmm7, 12 * SIZE(BB) + movsd %xmm7, 13 * SIZE(BB) + movhpd %xmm7, 14 * SIZE(BB) + movhpd %xmm7, 15 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) + movapd %xmm2, 4 * SIZE(AA) + movapd %xmm3, 6 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, %esi +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(%esi) + movsd %xmm3, 1 * SIZE(%esi) + movsd %xmm5, 2 * SIZE(%esi) + movsd %xmm7, 3 * SIZE(%esi) + + movhpd %xmm2, 0 * SIZE(%esi, LDC) + movhpd %xmm3, 1 * SIZE(%esi, LDC) + movhpd %xmm5, 2 * SIZE(%esi, LDC) + movhpd %xmm7, 3 * SIZE(%esi, LDC) +#else + movsd %xmm0, 0 * SIZE(%esi) + movhpd %xmm0, 1 * SIZE(%esi) + movsd %xmm1, 2 * SIZE(%esi) + movhpd %xmm1, 3 * SIZE(%esi) + + movsd %xmm2, 0 * SIZE(%esi, LDC) + movhpd %xmm2, 1 * SIZE(%esi, LDC) + movsd %xmm3, 2 * SIZE(%esi, LDC) + movhpd %xmm3, 3 * SIZE(%esi, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L30: + movl M, %ebx + testl $2, %ebx + jle .L50 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L32 + +.L31: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd 18 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movapd 10 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movapd 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 26 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 14 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + BRANCH + decl %eax + jne .L31 + +.L32: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L34 + +.L33: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L33 + ALIGN_4 + +.L34: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movsd 3 * SIZE(AA), %xmm0 + movhpd 3 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + + movsd 2 * SIZE(AA), %xmm0 + movhpd 2 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm2 + + movsd 0 * SIZE(AA), %xmm0 + movhpd 0 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm0 + movhpd 0 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + movhpd 1 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm3 + movsd 3 * SIZE(AA), %xmm0 + movhpd 3 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RN + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + + movsd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + + movsd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movsd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + + movsd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + + movsd %xmm2, 0 * SIZE(BB) + movsd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movsd %xmm3, 4 * SIZE(BB) + movsd %xmm3, 5 * SIZE(BB) + movhpd %xmm3, 6 * SIZE(BB) + movhpd %xmm3, 7 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, %esi +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(%esi) + movsd %xmm3, 1 * SIZE(%esi) + movhpd %xmm2, 0 * SIZE(%esi, LDC) + movhpd %xmm3, 1 * SIZE(%esi, LDC) +#else + movsd %xmm0, 0 * SIZE(%esi) + movhpd %xmm0, 1 * SIZE(%esi) + movsd %xmm1, 0 * SIZE(%esi, LDC) + movhpd %xmm1, 1 * SIZE(%esi, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L50: + movl M, %ebx + testl $1, %ebx + jle .L99 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA +#endif + + leal BUFFER, %ecx + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movsd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movsd 4 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L52 + +.L51: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 2 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd 10 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd 12 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd 3 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm2 + mulsd 18 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movsd 20 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movsd 5 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm2 + mulsd 22 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movsd 32 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movsd 6 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + mulsd 26 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movsd 28 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movsd 7 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + mulsd 30 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movsd 40 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movsd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + BRANCH + decl %eax + jne .L51 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L53 + ALIGN_4 + +.L54: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 +#else + movsd 0 * SIZE(AA), %xmm0 + movsd 1 * SIZE(AA), %xmm1 +#endif + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm0 + mulsd %xmm2, %xmm1 +#endif + +#ifdef RN + mulsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + mulsd 3 * SIZE(B), %xmm1 +#endif + +#ifdef RT + mulsd 3 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + mulsd 0 * SIZE(B), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(B) + movsd %xmm1, 1 * SIZE(B) + + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm0, 1 * SIZE(BB) + movsd %xmm1, 2 * SIZE(BB) + movsd %xmm1, 3 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, %esi +#endif + + movsd %xmm0, 0 * SIZE(%esi) + movsd %xmm1, 0 * SIZE(%esi, LDC) + +#ifndef LN + addl $1 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L99: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_2 + +.L100: + movl N, %eax + testl $1, %eax + jle .L999 + ALIGN_2 + +.L101: +/* Copying to Sub Buffer */ +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L103 + ALIGN_4 + +.L102: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + BRANCH + jne .L102 + ALIGN_2 + +.L103: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L105 + ALIGN_2 + +.L104: + movsd 0 * SIZE(B), %xmm0 + + unpcklpd %xmm0, %xmm0 + + movapd %xmm0, 0 * SIZE(%ecx) + + addl $1 * SIZE, B + addl $2 * SIZE, %ecx + decl %eax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, %esi # coffset = c +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + +.L111: + mulpd %xmm2, %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + movapd 2 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm0 + mulpd 6 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm5 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 10 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm4 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm6 + movapd 6 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 14 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm5 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movapd 16 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm0 + mulpd 18 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm4 + movapd 20 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm6 + movapd 10 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm0 + mulpd 22 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm5 + movapd 32 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movapd 12 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm1 + mulpd 26 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm4 + movapd 28 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm6 + movapd 14 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm1 + mulpd 30 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm5 + movapd 40 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movapd 24 * SIZE(BB), %xmm3 + + addl $32 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L111 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + +.L113: + mulpd %xmm2, %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + movapd 2 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L113 + ALIGN_4 + +.L114: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm0 + movapd 2 * SIZE(B), %xmm1 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 +#endif + + subpd %xmm4, %xmm0 + subpd %xmm6, %xmm1 + +#ifdef LN + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movapd %xmm1, %xmm3 + unpckhpd %xmm3, %xmm3 + + movsd 15 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 14 * SIZE(AA), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + movsd 13 * SIZE(AA), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + movsd 12 * SIZE(AA), %xmm7 + mulsd %xmm3, %xmm7 + subsd %xmm7, %xmm0 + + movsd 10 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 9 * SIZE(AA), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm2 + movsd 8 * SIZE(AA), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + + movsd 5 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 4 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm2, %xmm0 + unpcklpd %xmm3, %xmm1 +#endif + +#ifdef LT + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movapd %xmm1, %xmm3 + unpckhpd %xmm3, %xmm3 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 1 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + movsd 2 * SIZE(AA), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + movsd 3 * SIZE(AA), %xmm7 + mulsd %xmm0, %xmm7 + subsd %xmm7, %xmm3 + + movsd 5 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 6 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm1 + movsd 7 * SIZE(AA), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + + movsd 10 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 11 * SIZE(AA), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + + movsd 15 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + unpcklpd %xmm2, %xmm0 + unpcklpd %xmm3, %xmm1 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, 0 * SIZE(B) + movapd %xmm1, 2 * SIZE(B) + + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm0, 1 * SIZE(BB) + movhpd %xmm0, 2 * SIZE(BB) + movhpd %xmm0, 3 * SIZE(BB) + movsd %xmm1, 4 * SIZE(BB) + movsd %xmm1, 5 * SIZE(BB) + movhpd %xmm1, 6 * SIZE(BB) + movhpd %xmm1, 7 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, %esi +#endif + + movsd %xmm0, 0 * SIZE(%esi) + movhpd %xmm0, 1 * SIZE(%esi) + movsd %xmm1, 2 * SIZE(%esi) + movhpd %xmm1, 3 * SIZE(%esi) + +#ifndef LN + addl $4 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + BRANCH + decl %ebx # i -- + jg .L110 + ALIGN_2 + +.L130: + movl M, %ebx + testl $2, %ebx + jle .L150 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L132 + +.L131: + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm1 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + BRANCH + decl %eax + jne .L131 + +.L132: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L134 + +.L133: + movapd 0 * SIZE(AA), %xmm0 + mulpd 0 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + + addl $2 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L133 + ALIGN_4 + +.L134: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm0 +#else + movapd 0 * SIZE(AA), %xmm0 +#endif + + subpd %xmm4, %xmm0 + +#ifdef LN + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 2 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm2, %xmm0 +#endif + +#ifdef LT + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 1 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm2, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, 0 * SIZE(B) + + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm0, 1 * SIZE(BB) + movhpd %xmm0, 2 * SIZE(BB) + movhpd %xmm0, 3 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, %esi +#endif + + movsd %xmm0, 0 * SIZE(%esi) + movhpd %xmm0, 1 * SIZE(%esi) + +#ifndef LN + addl $2 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L150: + movl M, %ebx + testl $1, %ebx + jle .L159 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA +#endif + + leal BUFFER, BB + + movsd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movsd 4 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L152 + +.L151: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + mulsd 2 * SIZE(BB), %xmm0 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + movsd 3 * SIZE(AA), %xmm0 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + movsd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm3 + movsd 5 * SIZE(AA), %xmm1 + addsd %xmm3, %xmm4 + mulsd 10 * SIZE(BB), %xmm1 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm4 + movsd 6 * SIZE(AA), %xmm1 + mulsd 12 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm4 + movsd 7 * SIZE(AA), %xmm1 + mulsd 14 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm4 + movsd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $16 * SIZE, BB + BRANCH + decl %eax + jne .L151 + +.L152: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L154 + +.L153: + movsd 0 * SIZE(AA), %xmm0 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + + addl $1 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L153 + ALIGN_4 + +.L154: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm0 +#else + movsd 0 * SIZE(AA), %xmm0 +#endif + + subsd %xmm4, %xmm0 + +#if defined(LN) || defined(LT) + mulsd 0 * SIZE(AA), %xmm0 +#endif + +#if defined(RN) || defined(RT) + mulsd 0 * SIZE(B), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(B) + + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm0, 1 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, %esi +#endif + + movsd %xmm0, 0 * SIZE(%esi) + +#ifndef LN + addl $1 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L159: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 1), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_2 + + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S new file mode 100644 index 0000000..11cc104 --- /dev/null +++ b/kernel/x86/trsm_kernel_LT_4x4_penryn.S @@ -0,0 +1,3129 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 20 + STACK + ARGS(%esp) +#define ARG_B 24 + STACK + ARGS(%esp) +#define C 28 + STACK + ARGS(%esp) +#define ARG_LDC 32 + STACK + ARGS(%esp) +#define OFFSET 36 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 21 + 4) +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 21 + 4) +#endif + +#ifdef ATOM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 8 + 4) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (16 * 2) +#endif + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + leal (, LDC, SIZE), LDC + + subl $-32 * SIZE, A + subl $-32 * SIZE, B + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L40 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + leal (CO1, LDC, 2), %eax + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + +#ifdef LN + pxor %xmm4, %xmm4 + prefetcht0 -4 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 -4 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + prefetcht0 -4 * SIZE(%eax) + pxor %xmm7, %xmm7 + prefetcht0 -4 * SIZE(%eax, LDC) +#else + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 3 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + prefetcht0 3 * SIZE(%eax) + pxor %xmm7, %xmm7 + prefetcht0 3 * SIZE(%eax, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + subl $-32 * SIZE, BB + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + subl $-32 * SIZE, AA + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -32 * SIZE(AA), %xmm0 + + subl $1, %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 4), BB +#endif + + addps %xmm3, %xmm6 + addps %xmm2, %xmm7 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm4 + + movaps %xmm6, %xmm2 + unpcklps %xmm5, %xmm2 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm1 + movlhps %xmm2, %xmm0 + movhlps %xmm2, %xmm1 + + movaps %xmm6, %xmm7 + movlhps %xmm4, %xmm6 + movhlps %xmm4, %xmm7 + + pshufd $0x39, %xmm1, %xmm2 + pshufd $0x39, %xmm7, %xmm4 + + movaps -32 * SIZE(BB), %xmm1 + movaps -28 * SIZE(BB), %xmm3 + movaps -24 * SIZE(BB), %xmm5 + movaps -20 * SIZE(BB), %xmm7 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm3 + subps %xmm6, %xmm5 + subps %xmm4, %xmm7 +#else + pshufd $0x39, %xmm5, %xmm2 + pshufd $0x4e, %xmm6, %xmm0 + pshufd $0x93, %xmm7, %xmm7 + + movaps %xmm4, %xmm6 + unpcklps %xmm0, %xmm4 + unpckhps %xmm0, %xmm6 + + movaps %xmm2, %xmm1 + unpcklps %xmm7, %xmm2 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm5 + unpcklps %xmm2, %xmm4 + unpckhps %xmm2, %xmm5 + + movaps %xmm6, %xmm7 + unpcklps %xmm1, %xmm6 + unpckhps %xmm1, %xmm7 + + pshufd $0x93, %xmm5, %xmm5 + pshufd $0x4e, %xmm6, %xmm6 + pshufd $0x39, %xmm7, %xmm7 + + movaps -32 * SIZE(AA), %xmm0 + movaps -28 * SIZE(AA), %xmm1 + movaps -24 * SIZE(AA), %xmm2 + movaps -20 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 + subps %xmm6, %xmm2 + subps %xmm7, %xmm3 +#endif + +#ifdef LN + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm1 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm1 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm7 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm7 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm7 + + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm3 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm3 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm3 + + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm0 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm0 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, -32 * SIZE(BB) + movaps %xmm3, -28 * SIZE(BB) + movaps %xmm5, -24 * SIZE(BB) + movaps %xmm7, -20 * SIZE(BB) +#else + movaps %xmm0, -32 * SIZE(AA) + movaps %xmm1, -28 * SIZE(AA) + movaps %xmm2, -24 * SIZE(AA) + movaps %xmm3, -20 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movhps %xmm2, 2 * SIZE(CO1, LDC, 1) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 2 * SIZE(CO1, LDC, 2) + movlps %xmm6, 0 * SIZE(CO1, %eax, 1) + movhps %xmm6, 2 * SIZE(CO1, %eax, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 2 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO1, %eax, 1) + movhps %xmm3, 2 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $4, KK +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + testl $2, M + je .L30 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + pshufd $0xee, %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + pshufd $0xee, %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + pshufd $0xee, %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + pshufd $0xee, %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps 0 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + subl $-16 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + pshufd $0x44, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), BB +#endif + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + + movhlps %xmm4, %xmm5 + movhlps %xmm6, %xmm7 + + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps -32 * SIZE(BB), %xmm1 + movaps -28 * SIZE(BB), %xmm3 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 +#else + movsd -32 * SIZE(AA), %xmm0 + movsd -30 * SIZE(AA), %xmm1 + movsd -28 * SIZE(AA), %xmm2 + movsd -26 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 + subps %xmm6, %xmm2 + subps %xmm7, %xmm3 +#endif + +#ifdef LN + movaps -32 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm3 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm3 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm3 + + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm0 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm0 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, -32 * SIZE(BB) + movaps %xmm3, -28 * SIZE(BB) +#else + movlps %xmm0, -32 * SIZE(AA) + movlps %xmm1, -30 * SIZE(AA) + movlps %xmm2, -28 * SIZE(AA) + movlps %xmm3, -26 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movlps %xmm6, 0 * SIZE(CO1, %eax, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L30: + testl $1, M + je .L39 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps 0 * SIZE(BB), %xmm1 + + subl $ -8 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + pshufd $0x00, %xmm0, %xmm2 + movss -31 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BB), %xmm1 + + subps %xmm4, %xmm1 +#else + movsd -32 * SIZE(AA), %xmm0 + movhps -30 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm3 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm3 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm3 + + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm0 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm0 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm0 + + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, -32 * SIZE(BB) +#else + movss %xmm0, -32 * SIZE(AA) + movss %xmm1, -31 * SIZE(AA) + movss %xmm2, -30 * SIZE(AA) + movss %xmm3, -29 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm2, 0 * SIZE(CO1, LDC, 1) + movss %xmm0, 0 * SIZE(CO1, LDC, 2) + movss %xmm6, 0 * SIZE(CO1, %eax, 1) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO1, LDC, 1) + movss %xmm2, 0 * SIZE(CO1, LDC, 2) + movss %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L10 + ALIGN_4 + +.L40: + testl $2, N + je .L80 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + +#ifdef LN + pxor %xmm4, %xmm4 + prefetcht0 -4 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 -4 * SIZE(CO1, LDC) +#else + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 3 * SIZE(CO1, LDC) +#endif + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps 0 * SIZE(AA), %xmm0 + + subl $-32 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 2), BB +#endif + + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + addps %xmm2, %xmm4 + addps %xmm3, %xmm5 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movsd -32 * SIZE(BB), %xmm1 + movsd -30 * SIZE(BB), %xmm3 + movsd -28 * SIZE(BB), %xmm5 + movsd -26 * SIZE(BB), %xmm7 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 + subps %xmm0, %xmm5 + subps %xmm2, %xmm7 +#else + movaps -32 * SIZE(AA), %xmm0 + movaps -28 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 +#endif + +#ifdef LN + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm1 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm1 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm7 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm7 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm7 + + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, -32 * SIZE(BB) + movlps %xmm3, -30 * SIZE(BB) + movlps %xmm5, -28 * SIZE(BB) + movlps %xmm7, -26 * SIZE(BB) +#else + movaps %xmm0, -32 * SIZE(AA) + movaps %xmm1, -28 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm1 + unpcklps %xmm7, %xmm3 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movhps %xmm2, 2 * SIZE(CO1, LDC, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $4, KK +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L51 + ALIGN_4 + +.L60: + testl $2, M + je .L70 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm3, %xmm3 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + pshufd $0x44, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + + addps %xmm3, %xmm4 + addps %xmm5, %xmm4 + + movhlps %xmm4, %xmm5 + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movsd -32 * SIZE(BB), %xmm1 + movsd -30 * SIZE(BB), %xmm3 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 +#else + movsd -32 * SIZE(AA), %xmm0 + movsd -30 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 +#endif + +#ifdef LN + movaps -32 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, -32 * SIZE(BB) + movlps %xmm3, -30 * SIZE(BB) +#else + movlps %xmm0, -32 * SIZE(AA) + movlps %xmm1, -30 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm3, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 0 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L70: + testl $1, M + je .L79 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd -32 * SIZE(BB), %xmm1 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -28 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -26 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -24 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -22 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -20 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -18 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -16 * SIZE(BB), %xmm1 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + pshufd $0x00, %xmm0, %xmm2 + movss -31 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + + addps %xmm5, %xmm4 + + pshufd $0x55, %xmm4, %xmm5 + pshufd $0x00, %xmm4, %xmm4 + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm4 + + movsd -32 * SIZE(BB), %xmm1 + + subps %xmm4, %xmm1 +#else + movss -32 * SIZE(AA), %xmm0 + movss -31 * SIZE(AA), %xmm1 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movss -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, -32 * SIZE(BB) +#else + movss %xmm0, -32 * SIZE(AA) + movss %xmm1, -31 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + pshufd $1, %xmm1, %xmm3 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm3, 0 * SIZE(CO1, LDC) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L80: + testl $1, N + je .L999 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movsd -32 * SIZE(BB), %xmm1 + + pxor %xmm4, %xmm4 +#ifdef LN + prefetcht0 -4 * SIZE(CO1) +#else + prefetcht0 3 * SIZE(CO1) +#endif + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -26 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + subl $-32 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + movss -31 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), BB +#endif + + addps %xmm2, %xmm4 + addps %xmm5, %xmm4 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movss -32 * SIZE(BB), %xmm1 + movss -31 * SIZE(BB), %xmm3 + movss -30 * SIZE(BB), %xmm5 + movss -29 * SIZE(BB), %xmm7 + + subss %xmm4, %xmm1 + subss %xmm6, %xmm3 + subss %xmm0, %xmm5 + subss %xmm2, %xmm7 +#else + movaps -32 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm1 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm1 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm1 + + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm7 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm7 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm7 + + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm7 +#endif + +#if defined(RN) || defined(RT) + movss -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, -32 * SIZE(BB) + movss %xmm3, -31 * SIZE(BB) + movss %xmm5, -30 * SIZE(BB) + movss %xmm7, -29 * SIZE(BB) +#else + movaps %xmm0, -32 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm1 + unpcklps %xmm7, %xmm3 + + unpcklps %xmm3, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $4, KK +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L91 + ALIGN_4 + +.L100: + testl $2, M + je .L110 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm3, %xmm3 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L105 + ALIGN_4 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -22 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -26 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -20 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -18 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + subl $-16 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + pshufd $0x00, %xmm1, %xmm2 + movss -31 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L106 + ALIGN_4 + +.L108: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(LT) + pshufd $1, %xmm4, %xmm6 + + movss -32 * SIZE(BB), %xmm1 + movss -31 * SIZE(BB), %xmm3 + + subss %xmm4, %xmm1 + subss %xmm6, %xmm3 +#else + movsd -32 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movsd -32 * SIZE(AA), %xmm4 + movhps -30 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movss -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, -32 * SIZE(BB) + movss %xmm3, -31 * SIZE(BB) +#else + movlps %xmm0, -32 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) + movss %xmm3, 1 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L110: + testl $1, M + je .L119 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd -32 * SIZE(BB), %xmm1 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L115 + ALIGN_4 + +.L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -28 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -28 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -26 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -26 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -24 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -24 * SIZE(BB), %xmm1 + + subl $-8 * SIZE, AA + subl $-8 * SIZE, BB + + subl $1, %eax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulss %xmm0, %xmm1 + movss -31 * SIZE(AA), %xmm0 + addss %xmm1, %xmm4 + movss -31 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L116 + ALIGN_4 + +.L118: +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + + leal (AA, %eax, SIZE), AA + leal (B, %eax, SIZE), BB +#endif + + haddps %xmm4, %xmm4 + +#if defined(LN) || defined(LT) + movss -32 * SIZE(BB), %xmm1 + subss %xmm4, %xmm1 +#else + movss -32 * SIZE(AA), %xmm0 + subss %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + mulss -32 * SIZE(AA), %xmm1 +#endif + +#if defined(RN) || defined(RT) + mulss -32 * SIZE(BB), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, -32 * SIZE(BB) +#else + movss %xmm0, -32 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) +#else + movss %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA + leal (BB, %eax, SIZE), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L119: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LT_4x4_sse.S b/kernel/x86/trsm_kernel_LT_4x4_sse.S new file mode 100644 index 0000000..8d61898 --- /dev/null +++ b/kernel/x86/trsm_kernel_LT_4x4_sse.S @@ -0,0 +1,3690 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 + +#define OLD_M 4 + STACK(%esi) +#define OLD_N 8 + STACK(%esi) +#define OLD_K 12 + STACK(%esi) +#define OLD_A 20 + STACK(%esi) +#define OLD_B 24 + STACK(%esi) +#define OLD_C 28 + STACK(%esi) +#define OLD_LDC 32 + STACK(%esi) +#define STACK_OFFT 36 + STACK(%esi) + +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 10 + 8) +#endif + +#if defined(PENTIUM4) || defined(PENTIUMM) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + +#if defined(OPTERON) || !defined(HAVE_SSE2) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi + + subl $128 + LOCAL_BUFFER_SIZE, %esp + andl $-1024, %esp + + STACK_TOUCHING + + movl OLD_M, %ebx + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + movss STACK_OFFT, %xmm4 + + movl OLD_B, B + movl OLD_C, %ebx + + movl %ebx, C + movl OLD_LDC, LDC + + movss %xmm4, OFFSET + movss %xmm4, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L40 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $2 + BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1, %eax + jle .L05 + ALIGN_4 + +.L02: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L05: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1, %eax + BRANCH + jle .L10 + + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + addl $4 * SIZE, B + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + + PREFETCHW 3 * SIZE(CO1) + PREFETCHW 3 * SIZE(CO1, LDC) + PREFETCHW 3 * SIZE(CO1, LDC, 2) + PREFETCHW 3 * SIZE(CO1, %eax) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(0 * 16) + KERNEL2(0 * 16) + KERNEL3(0 * 16) + KERNEL4(0 * 16) + KERNEL5(0 * 16) + KERNEL6(0 * 16) + KERNEL7(0 * 16) + KERNEL8(0 * 16) + + addl $128 * SIZE, BB + addl $32 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $2 + BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm3 + movaps 8 * SIZE(B), %xmm5 + movaps 12 * SIZE(B), %xmm7 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 + subps %xmm0, %xmm5 + subps %xmm2, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm1 + movaps 8 * SIZE(AA), %xmm2 + movaps 12 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 + subps %xmm6, %xmm2 + subps %xmm7, %xmm3 +#endif + +#ifdef LN + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm1 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm1 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm7 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm7 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm7 + + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm3 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm3 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm3 + + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm0 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm0 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + movaps %xmm5, 8 * SIZE(B) + movaps %xmm7, 12 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm2 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm6 + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm2, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm2 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 + movaps %xmm0, 32 * SIZE(BB) + movaps %xmm2, 36 * SIZE(BB) + movaps %xmm4, 40 * SIZE(BB) + movaps %xmm6, 44 * SIZE(BB) + + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm2 + pshufd $0xaa, %xmm7, %xmm4 + pshufd $0xff, %xmm7, %xmm6 + movaps %xmm0, 48 * SIZE(BB) + movaps %xmm2, 52 * SIZE(BB) + movaps %xmm4, 56 * SIZE(BB) + movaps %xmm6, 60 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm1, 4 * SIZE(AA) + movaps %xmm2, 8 * SIZE(AA) + movaps %xmm3, 12 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movhps %xmm2, 2 * SIZE(CO1, LDC, 1) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 2 * SIZE(CO1, LDC, 2) + movlps %xmm6, 0 * SIZE(CO1, %eax, 1) + movhps %xmm6, 2 * SIZE(CO1, %eax, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 2 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO1, %eax, 1) + movhps %xmm3, 2 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $16 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + testl $2, M + je .L30 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 68 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 72 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 76 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 96 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 84 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 88 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 92 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 112 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 100 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 104 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 108 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 128 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 116 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 120 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 124 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 144 * SIZE(BB), %xmm3 + + addl $ 16 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 16 * SIZE(BB), %xmm2 + + addl $ 2 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $1 + BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm3 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 2 * SIZE(AA), %xmm1 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 4 * SIZE(AA), %xmm2 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 6 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 + subps %xmm6, %xmm2 + subps %xmm7, %xmm3 +#endif + +#ifdef LN + movaps 0 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm3 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm3 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm3 + + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm0 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm0 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm2 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm6 + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm2, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) + movlps %xmm1, 2 * SIZE(AA) + movlps %xmm2, 4 * SIZE(AA) + movlps %xmm3, 6 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movlps %xmm6, 0 * SIZE(CO1, %eax, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L30: + testl $1, M + je .L39 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movss 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movss 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movss 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 8 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 1 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm3 + addss %xmm3, %xmm4 + movss 20 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm5 + movss 24 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + movss 36 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 40 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 44 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 3 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm3 + addss %xmm3, %xmm4 + movss 52 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm5 + movss 56 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + mulss 60 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + + mulss %xmm1, %xmm2 + addss %xmm2, %xmm4 + movss 68 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + addss %xmm2, %xmm5 + movss 72 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + mulss 76 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 96 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 5 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm3 + addss %xmm3, %xmm4 + movss 84 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + addss %xmm3, %xmm5 + movss 88 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + mulss 92 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 112 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm2 + addss %xmm2, %xmm4 + movss 100 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + addss %xmm2, %xmm5 + movss 104 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + mulss 108 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 128 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 7 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm3 + addss %xmm3, %xmm4 + movss 116 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + addss %xmm3, %xmm5 + movss 120 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + mulss 124 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 144 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 8 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 16 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 1 * SIZE(AA), %xmm0 + + addl $ 1 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (AA, %eax, SIZE), AA + + sall $2 + BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + unpcklps %xmm5, %xmm4 + + movaps 0 * SIZE(B), %xmm1 + + subps %xmm4, %xmm1 +#else + movss 0 * SIZE(AA), %xmm0 + movss 1 * SIZE(AA), %xmm1 + movss 2 * SIZE(AA), %xmm2 + movss 3 * SIZE(AA), %xmm3 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm1 + subss %xmm6, %xmm2 + subss %xmm7, %xmm3 +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm3 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm3 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm3 + + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm3 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm0 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm0 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm0 + + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) + movss %xmm1, 1 * SIZE(AA) + movss %xmm2, 2 * SIZE(AA) + movss %xmm3, 3 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm2, 0 * SIZE(CO1, LDC, 1) + movss %xmm0, 0 * SIZE(CO1, LDC, 2) + movss %xmm6, 0 * SIZE(CO1, %eax, 1) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO1, LDC, 1) + movss %xmm2, 0 * SIZE(CO1, LDC, 2) + movss %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L40: + testl $2, N + je .L80 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $1 + BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L45 + ALIGN_4 + +.L42: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, %ecx + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L50 + ALIGN_4 + +.L46: +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + addl $2 * SIZE, B + addl $8 * SIZE, %ecx + decl %eax + jne .L46 + ALIGN_4 + +.L50: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + PREFETCHW 3 * SIZE(CO1) + PREFETCHW 3 * SIZE(CO1, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm3 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 32 * SIZE(AA), %xmm0 + +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + mulps %xmm1, %xmm2 + mulps 36 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 40 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 20 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm2 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm3 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $1 + BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 4 * SIZE(B), %xmm5 +#ifdef movsd + xorps %xmm7, %xmm7 +#endif + movsd 6 * SIZE(B), %xmm7 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 + subps %xmm0, %xmm5 + subps %xmm2, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 +#endif + +#ifdef LN + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm1 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm1 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm7 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm7 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm7 + + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + movlps %xmm5, 4 * SIZE(B) + movlps %xmm7, 6 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm2 + movaps %xmm0, 8 * SIZE(BB) + movaps %xmm2, 12 * SIZE(BB) + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm2 + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm2, 20 * SIZE(BB) + + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm2 + movaps %xmm0, 24 * SIZE(BB) + movaps %xmm2, 28 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm1, 4 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm1 + unpcklps %xmm7, %xmm3 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movhps %xmm2, 2 * SIZE(CO1, LDC, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L51 + ALIGN_4 + +.L60: + testl $2, M + je .L70 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 2 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 +#endif + +#ifdef LN + movaps 0 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm2 + movaps %xmm0, 8 * SIZE(BB) + movaps %xmm2, 12 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) + movlps %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm3, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 0 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L70: + testl $1, M + je .L79 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movss 0 * SIZE(AA), %xmm0 + movss 4 * SIZE(AA), %xmm1 + movss 0 * SIZE(BB), %xmm2 + movss 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulss %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 20 * SIZE(BB), %xmm0 + addss %xmm3, %xmm4 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm5 + movss 3 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + mulss 36 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 40 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 5 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm2 + mulss 44 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 52 * SIZE(BB), %xmm1 + addss %xmm3, %xmm4 + movss 56 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 7 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 60 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + + addl $ 1 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addss %xmm6, %xmm4 + addss %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm4 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 + + subps %xmm4, %xmm1 +#else + movss 0 * SIZE(AA), %xmm0 + movss 1 * SIZE(AA), %xmm1 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm1 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) + movss %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + pshufd $1, %xmm1, %xmm3 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm3, 0 * SIZE(CO1, LDC) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L80: + testl $1, N + je .L999 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L85 + ALIGN_4 + +.L82: + movsd 0 * SIZE(B), %xmm3 + movhps 2 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm7 + movhps 6 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L90 + ALIGN_4 + +.L86: + movss 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + + movaps %xmm0, 0 * SIZE(BB) + + addl $1 * SIZE, B + addl $4 * SIZE, BB + decl %eax + jne .L86 + ALIGN_4 + +.L90: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + PREFETCHW 3 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + mulps 8 * SIZE(BB), %xmm0 + addps %xmm0, %xmm6 + movaps 12 * SIZE(AA), %xmm0 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 48 * SIZE(BB), %xmm3 + mulps 20 * SIZE(BB), %xmm1 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + mulps 24 * SIZE(BB), %xmm1 + addps %xmm1, %xmm6 + movaps 28 * SIZE(AA), %xmm1 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(AA), %xmm0 + movaps 4 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ BASE_SHIFT, %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm3 + movss 2 * SIZE(B), %xmm5 + movss 3 * SIZE(B), %xmm7 + + subss %xmm4, %xmm1 + subss %xmm6, %xmm3 + subss %xmm0, %xmm5 + subss %xmm2, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm1 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm1 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm1 + + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm7 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm7 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm7 + + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm7 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm3, 1 * SIZE(B) + movss %xmm5, 2 * SIZE(B) + movss %xmm7, 3 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + movaps %xmm0, 0 * SIZE(BB) + pshufd $0x00, %xmm3, %xmm0 + movaps %xmm0, 4 * SIZE(BB) + + pshufd $0x00, %xmm5, %xmm0 + movaps %xmm0, 8 * SIZE(BB) + pshufd $0x00, %xmm7, %xmm0 + movaps %xmm0, 12 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) + movss %xmm1, 1 * SIZE(AA) + movss %xmm2, 2 * SIZE(AA) + movss %xmm3, 3 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm1 + unpcklps %xmm7, %xmm3 + + unpcklps %xmm3, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L91 + ALIGN_4 + +.L100: + testl $2, M + je .L110 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $1 + BASE_SHIFT, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L105 + ALIGN_4 + +.L102: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L106 + ALIGN_4 + +.L108: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + pshufd $1, %xmm4, %xmm6 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm3 + + subss %xmm4, %xmm1 + subss %xmm6, %xmm3 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movaps 0 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm3, 1 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + movaps %xmm0, 0 * SIZE(BB) + pshufd $0x00, %xmm3, %xmm0 + movaps %xmm0, 4 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) + movss %xmm3, 1 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L110: + testl $1, M + je .L119 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movss 0 * SIZE(AA), %xmm0 + movss 4 * SIZE(AA), %xmm1 + movss 0 * SIZE(BB), %xmm2 + movss 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L115 + ALIGN_4 + +.L112: + mulss %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 32 * SIZE(BB), %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm0, %xmm5 + movss 2 * SIZE(AA), %xmm0 + mulss 8 * SIZE(BB), %xmm0 + addss %xmm0, %xmm6 + movss 3 * SIZE(AA), %xmm0 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + movss 48 * SIZE(BB), %xmm3 + mulss 20 * SIZE(BB), %xmm1 + addss %xmm1, %xmm5 + movss 6 * SIZE(AA), %xmm1 + mulss 24 * SIZE(BB), %xmm1 + addss %xmm1, %xmm6 + movss 7 * SIZE(AA), %xmm1 + mulss 28 * SIZE(BB), %xmm1 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 4 * SIZE(BB), %xmm2 + + addl $ 1 * SIZE, AA + addl $ 4 * SIZE, BB + decl %eax + jg .L116 + ALIGN_4 + +.L118: + addss %xmm5, %xmm4 + addss %xmm7, %xmm6 + addss %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(B), %xmm1 + subss %xmm4, %xmm1 +#else + movss 0 * SIZE(AA), %xmm0 + subss %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + mulss 0 * SIZE(AA), %xmm1 +#endif + +#if defined(RN) || defined(RT) + mulss 0 * SIZE(B), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + movaps %xmm0, 0 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) +#else + movss %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L119: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (B, %eax, SIZE), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LT_8x2_sse.S b/kernel/x86/trsm_kernel_LT_8x2_sse.S new file mode 100644 index 0000000..5d59698 --- /dev/null +++ b/kernel/x86/trsm_kernel_LT_8x2_sse.S @@ -0,0 +1,3604 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_A 20 + STACK + ARGS(%esi) +#define STACK_B 24 + STACK + ARGS(%esi) +#define STACK_C 28 + STACK + ARGS(%esi) +#define STACK_LDC 32 + STACK + ARGS(%esi) +#define STACK_OFFT 36 + STACK + ARGS(%esi) + +#define TRMASK 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#ifdef HAVE_3DNOW +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 10 + 8) +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#if !defined(HAVE_SSE2) || defined(OPTERON) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE, %esp + andl $-STACK_ALIGN, %esp + + STACK_TOUCHING + + movss STACK_M, %xmm0 + movl STACK_N, %eax + movss STACK_K, %xmm1 + movss STACK_A, %xmm2 + movl STACK_B, B + movss STACK_C, %xmm3 + movl STACK_LDC, LDC + movss STACK_OFFT, %xmm4 + + movss %xmm1, K + movl %eax, N + movss %xmm0, M + movss %xmm2, A + movss %xmm3, C + movl %esi, OLD_STACK + movss %xmm4, OFFSET + movss %xmm4, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LN) || defined(LT) + movl $0x3f800000, 0 + TRMASK # 1.0 + movl $0x00000000, 4 + TRMASK # 0.0 + movl $0x3f800000, 8 + TRMASK # 1.0 + movl $0x00000000, 12 + TRMASK # 0.0 +#endif + + movl N, %eax + sarl $1, %eax # j = (n >> 1) + movl %eax, J + jle .L100 + ALIGN_2 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $1 + BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + ALIGN_4 + +.L02: + movsd 0 * SIZE(B), %xmm3 + movhps 2 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm7 + movhps 6 * SIZE(B), %xmm7 + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm3, %xmm2 + shufps $0xaa, %xmm2, %xmm2 + shufps $0xff, %xmm3, %xmm3 + + movaps %xmm7, %xmm4 + shufps $0x00, %xmm4, %xmm4 + movaps %xmm7, %xmm5 + shufps $0x55, %xmm5, %xmm5 + movaps %xmm7, %xmm6 + shufps $0xaa, %xmm6, %xmm6 + shufps $0xff, %xmm7, %xmm7 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + decl %eax + BRANCH + jne .L02 + ALIGN_2 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + ALIGN_2 + +.L04: + movsd 0 * SIZE(B), %xmm3 + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + addl $2 * SIZE, B + addl $8 * SIZE, BB + + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $3, %ebx + jle .L30 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $3 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $3 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + + PREFETCHW 7 * SIZE(CO1) + PREFETCHW 7 * SIZE(CO1, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L12 + ALIGN_2 + +.L11: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 16 * SIZE(AA), %xmm0 + + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 8 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 16 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 36 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 48 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 48 * SIZE(AA), %xmm0 + + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 40 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 44 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 56 * SIZE(AA), %xmm1 + + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 48 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 52 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 64 * SIZE(AA), %xmm0 + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 60 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 72 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 72 * SIZE(AA), %xmm1 + + addl $64 * SIZE, BB + addl $64 * SIZE, AA + decl %eax + jne .L11 + ALIGN_2 + +.L12: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + movaps 4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm1 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm1, %xmm5 + movaps 4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm1 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm1, %xmm7 + + addl $8 * SIZE, AA + addl $8 * SIZE, BB + subl $1, %eax + jg .L13 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $8, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 8), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm0 + + movaps %xmm6, %xmm1 + unpcklps %xmm7, %xmm6 + unpckhps %xmm7, %xmm1 + + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + movsd 4 * SIZE(B), %xmm3 + movhps 6 * SIZE(B), %xmm3 + movsd 8 * SIZE(B), %xmm5 + movhps 10 * SIZE(B), %xmm5 + movsd 12 * SIZE(B), %xmm7 + movhps 14 * SIZE(B), %xmm7 + + subps %xmm4, %xmm2 + subps %xmm0, %xmm3 + subps %xmm6, %xmm5 + subps %xmm1, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm1 + movaps 8 * SIZE(AA), %xmm2 + movaps 12 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm6, %xmm1 + subps %xmm5, %xmm2 + subps %xmm7, %xmm3 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 63 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 62 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movsd 60 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 58 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 56 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 54 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 52 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 50 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 48 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + + movss 45 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 44 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 42 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 40 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 36 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 34 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 32 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 27 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 26 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 24 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 18 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 16 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 9 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 8 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movsd 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 4 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 6 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 9 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 10 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 12 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 14 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 18 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 19 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 20 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 22 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 27 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 28 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 30 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 36 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 37 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 38 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 45 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 46 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 54 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 55 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 63 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm7 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + mulps %xmm6, %xmm1 + + movss 1 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm0, %xmm5 + mulps %xmm1, %xmm6 + + subps %xmm5, %xmm2 + subps %xmm6, %xmm3 + + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 + mulps %xmm6, %xmm3 +#endif + +#ifdef RT + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 + mulps %xmm6, %xmm3 + + movss 2 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm2, %xmm5 + mulps %xmm3, %xmm6 + + subps %xmm5, %xmm0 + subps %xmm6, %xmm1 + + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + mulps %xmm6, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + movlps %xmm3, 4 * SIZE(B) + movhps %xmm3, 6 * SIZE(B) + movlps %xmm5, 8 * SIZE(B) + movhps %xmm5, 10 * SIZE(B) + movlps %xmm7, 12 * SIZE(B) + movhps %xmm7, 14 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm6 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm3, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm3, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 +#else + movaps %xmm5, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm5, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm5, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm5, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + movaps %xmm0, 32 * SIZE(BB) + movaps %xmm1, 36 * SIZE(BB) + movaps %xmm4, 40 * SIZE(BB) + movaps %xmm6, 44 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm1 + pshufd $0xaa, %xmm7, %xmm4 + pshufd $0xff, %xmm7, %xmm6 +#else + movaps %xmm7, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm7, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm7, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm7, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + movaps %xmm0, 48 * SIZE(BB) + movaps %xmm1, 52 * SIZE(BB) + movaps %xmm4, 56 * SIZE(BB) + movaps %xmm6, 60 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm1, 4 * SIZE(AA) + movaps %xmm2, 8 * SIZE(AA) + movaps %xmm3, 12 * SIZE(AA) +#endif + +#ifdef LN + subl $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, %xmm0 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm3, %xmm0 + + movaps %xmm5, %xmm4 + shufps $0x88, %xmm7, %xmm5 + shufps $0xdd, %xmm7, %xmm4 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm5, 4 * SIZE(CO1) + movhps %xmm5, 6 * SIZE(CO1) + movlps %xmm0, 0 * SIZE(CO1, LDC) + movhps %xmm0, 2 * SIZE(CO1, LDC) + movlps %xmm4, 4 * SIZE(CO1, LDC) + movhps %xmm4, 6 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 4 * SIZE(CO1) + movhps %xmm1, 6 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm2, 2 * SIZE(CO1, LDC) + movlps %xmm3, 4 * SIZE(CO1, LDC) + movhps %xmm3, 6 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 8), AA +#ifdef LT + addl $16 * SIZE, B +#endif +#endif + +#ifdef LN + subl $8, KK + movl BORIG, B +#endif + +#ifdef LT + addl $8, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $3 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L30: + testl $4, M + jle .L50 + +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $2 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L32 + ALIGN_2 + +.L31: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 8 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 36 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 40 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 20 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm2 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L31 + ALIGN_2 + +.L32: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L34 + +.L33: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L33 + ALIGN_4 + +.L34: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm0 + + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + movsd 4 * SIZE(B), %xmm3 + movhps 6 * SIZE(B), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm0, %xmm3 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm2 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 15 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 14 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 12 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 10 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 8 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 5 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 4 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movsd 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 5 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 6 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 10 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 11 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 15 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + + movss 1 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm0, %xmm5 + subps %xmm5, %xmm2 + + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 +#endif + +#ifdef RT + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 + + movss 2 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm2, %xmm5 + + subps %xmm5, %xmm0 + + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + movlps %xmm3, 4 * SIZE(B) + movhps %xmm3, 6 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm6 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm3, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm3, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm2, 4 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, %xmm0 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm3, %xmm0 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm0, 0 * SIZE(CO1, LDC) + movhps %xmm0, 2 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm2, 2 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L50: + testl $2, M + jle .L70 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $1 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L52 + ALIGN_2 + +.L51: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L51 + ALIGN_2 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L53 + ALIGN_4 + +.L54: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm4 + + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 2 * SIZE(AA), %xmm2 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 3 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 3 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + + movss 1 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm0, %xmm5 + subps %xmm5, %xmm2 + + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 +#endif + +#ifdef RT + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 + + movss 2 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm2, %xmm5 + + subps %xmm5, %xmm0 + + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) + movlps %xmm2, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, %xmm0 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm3, %xmm0 + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm0, 0 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L70: + testl $1, M + jle .L99 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movss 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movss 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L72 + ALIGN_2 + +.L71: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 20 * SIZE(BB), %xmm0 + addss %xmm3, %xmm4 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm5 + movss 3 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + mulss 36 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 40 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 5 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm2 + mulss 44 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 52 * SIZE(BB), %xmm1 + addss %xmm3, %xmm4 + movss 56 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 7 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 60 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L71 + ALIGN_2 + +.L72: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L74 + +.L73: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L73 + ALIGN_4 + +.L74: + addss %xmm6, %xmm4 + addss %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm4 + +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 +#else + movss 0 * SIZE(AA), %xmm0 + movss 1 * SIZE(AA), %xmm2 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + mulss %xmm6, %xmm0 + + movss 1 * SIZE(B), %xmm6 + movaps %xmm6, %xmm5 + + mulss %xmm0, %xmm5 + subss %xmm5, %xmm2 + + movss 3 * SIZE(B), %xmm6 + mulss %xmm6, %xmm2 +#endif + +#ifdef RT + movss 3 * SIZE(B), %xmm6 + mulss %xmm6, %xmm2 + + movss 2 * SIZE(B), %xmm6 + movaps %xmm6, %xmm5 + + mulss %xmm2, %xmm5 + subss %xmm5, %xmm0 + + movss 0 * SIZE(B), %xmm6 + mulss %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) + movss %xmm2, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, %xmm0 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm3, %xmm0 + + movss %xmm2, 0 * SIZE(CO1) + movss %xmm0, 0 * SIZE(CO1, LDC) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm2, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L99: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_2 + +.L100: + testl $1, N + jle .L999 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L103 + ALIGN_4 + +.L102: + movsd 0 * SIZE(B), %xmm3 + movhps 2 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm7 + movhps 6 * SIZE(B), %xmm7 + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm3, %xmm2 + shufps $0xaa, %xmm2, %xmm2 + shufps $0xff, %xmm3, %xmm3 + + movaps %xmm7, %xmm4 + shufps $0x00, %xmm4, %xmm4 + movaps %xmm7, %xmm5 + shufps $0x55, %xmm5, %xmm5 + movaps %xmm7, %xmm6 + shufps $0xaa, %xmm6, %xmm6 + shufps $0xff, %xmm7, %xmm7 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + + decl %eax + BRANCH + jne .L102 + ALIGN_2 + +.L103: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L105 + ALIGN_2 + +.L104: + movss 0 * SIZE(B), %xmm0 + + shufps $0x00, %xmm0, %xmm0 + + movaps %xmm0, 0 * SIZE(BB) + + addl $1 * SIZE, B + addl $4 * SIZE, BB + + decl %eax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $3, %ebx # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $3 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $3 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + + PREFETCHW 7 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + ALIGN_2 + +.L111: + mulps %xmm2, %xmm0 + mulps 4 * SIZE(AA), %xmm2 + addps %xmm0, %xmm4 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm0 + mulps 12 * SIZE(AA), %xmm2 + addps %xmm0, %xmm5 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm1 + mulps 20 * SIZE(AA), %xmm2 + addps %xmm1, %xmm4 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm1 + mulps 28 * SIZE(AA), %xmm2 + addps %xmm1, %xmm5 + movaps 48 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm3, %xmm0 + mulps 36 * SIZE(AA), %xmm3 + addps %xmm0, %xmm4 + movaps 40 * SIZE(AA), %xmm0 + addps %xmm3, %xmm6 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm0 + mulps 44 * SIZE(AA), %xmm3 + addps %xmm0, %xmm5 + movaps 64 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm1 + mulps 52 * SIZE(AA), %xmm3 + addps %xmm1, %xmm4 + movaps 56 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm1 + mulps 60 * SIZE(AA), %xmm3 + addps %xmm1, %xmm5 + movaps 80 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + addl $64 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L111 + ALIGN_2 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + +.L113: + movaps 0 * SIZE(BB), %xmm2 + movaps 0 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm0 + addps %xmm0, %xmm4 + mulps 4 * SIZE(AA), %xmm2 + addps %xmm2, %xmm6 + + addl $8 * SIZE, AA + addl $4 * SIZE, BB + subl $1, %eax + jg .L113 + ALIGN_4 + +.L114: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $8, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 8), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + movsd 4 * SIZE(B), %xmm5 + movhps 6 * SIZE(B), %xmm5 + + subps %xmm4, %xmm2 + subps %xmm6, %xmm5 + + xorps %xmm0, %xmm0 + + movaps %xmm2, %xmm3 + unpcklps %xmm0, %xmm2 + unpckhps %xmm0, %xmm3 + + movaps %xmm5, %xmm7 + unpcklps %xmm0, %xmm5 + unpckhps %xmm0, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm6, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 63 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 62 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movsd 60 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 58 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 56 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 54 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 52 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 50 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 48 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + + movss 45 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 44 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 42 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 40 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 36 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 34 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 32 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 27 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 26 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 24 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 18 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 16 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 9 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 8 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movsd 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 4 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 6 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 9 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 10 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 12 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 14 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 18 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 19 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 20 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 22 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 27 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 28 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 30 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 36 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 37 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 38 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 45 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 46 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 54 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 55 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 63 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm7 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + mulps %xmm6, %xmm1 +#endif + +#if defined(LN) || defined(LT) + shufps $0x88, %xmm3, %xmm2 + shufps $0x88, %xmm7, %xmm5 + + movlps %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + movlps %xmm5, 4 * SIZE(B) + movhps %xmm5, 6 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 +#else + movaps %xmm5, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm5, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm5, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm5, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm1, 4 * SIZE(AA) +#endif + +#ifdef LN + subl $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm5, 4 * SIZE(CO1) + movhps %xmm5, 6 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 4 * SIZE(CO1) + movhps %xmm1, 6 * SIZE(CO1) +#endif + +#ifndef LN + addl $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 8), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $8, KK + movl BORIG, B +#endif + +#ifdef LT + addl $8, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $3 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L110 + ALIGN_2 + +.L130: + testl $4, M + jle .L150 + +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $2 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + movhps 2 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movsd 16 * SIZE(AA), %xmm1 + movhps 18 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L132 + ALIGN_2 + +.L131: + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + mulps 4 * SIZE(BB), %xmm0 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + mulps 8 * SIZE(BB), %xmm0 + addps %xmm0, %xmm6 + movaps 12 * SIZE(AA), %xmm0 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + mulps 20 * SIZE(BB), %xmm1 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + mulps 24 * SIZE(BB), %xmm1 + addps %xmm1, %xmm6 + movaps 28 * SIZE(AA), %xmm1 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L131 + ALIGN_2 + +.L132: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L134 + +.L133: + movaps 0 * SIZE(BB), %xmm2 + movaps 0 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L133 + ALIGN_4 + +.L134: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 + + xorps %xmm5, %xmm5 + + movaps %xmm2, %xmm3 + unpcklps %xmm5, %xmm2 + unpckhps %xmm5, %xmm3 +#else + movaps 0 * SIZE(AA), %xmm0 + subps %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 15 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 14 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 12 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 10 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 8 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 5 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 4 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movsd 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 5 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 6 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 10 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 11 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 15 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + mulps %xmm6, %xmm0 +#endif + +#ifdef RT + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + mulps %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) + shufps $0x88, %xmm3, %xmm2 + + movlps %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L150: + testl $2, M + jle .L170 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $1 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L152 + ALIGN_2 + +.L151: + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L151 + ALIGN_2 + +.L152: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L154 + +.L153: + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L153 + ALIGN_4 + +.L154: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + shufps $1, %xmm5, %xmm5 + + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm1 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movaps 0 * SIZE(AA), %xmm4 + + movaps %xmm4, %xmm6 + shufps $0xff, %xmm6, %xmm6 + mulss %xmm6, %xmm1 + + movaps %xmm4, %xmm6 + shufps $0xaa, %xmm6, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm0 + mulss %xmm4, %xmm0 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + mulss %xmm4, %xmm0 + movaps %xmm4, %xmm6 + shufps $0x55, %xmm6, %xmm6 + mulss %xmm0, %xmm6 + subss %xmm6, %xmm1 + movaps %xmm4, %xmm6 + shufps $0xff, %xmm6, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + mulps %xmm6, %xmm0 +#endif + +#ifdef RT + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + mulps %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm0, 0 * SIZE(B) + movss %xmm1, 1 * SIZE(B) + + shufps $0x00, %xmm0, %xmm0 + shufps $0x00, %xmm1, %xmm1 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 1 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L170: + testl $1, M + jle .L179 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movss 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movss 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L172 + ALIGN_2 + +.L171: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + mulss 4 * SIZE(BB), %xmm0 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 2 * SIZE(AA), %xmm0 + mulss 8 * SIZE(BB), %xmm0 + addss %xmm0, %xmm6 + movss 3 * SIZE(AA), %xmm0 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + mulss 20 * SIZE(BB), %xmm1 + movss 48 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 6 * SIZE(AA), %xmm1 + mulss 24 * SIZE(BB), %xmm1 + addss %xmm1, %xmm6 + movss 7 * SIZE(AA), %xmm1 + mulss 28 * SIZE(BB), %xmm1 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L171 + ALIGN_2 + +.L172: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L174 + +.L173: + movss 0 * SIZE(AA), %xmm0 + movss 0 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L173 + ALIGN_4 + +.L174: + addss %xmm5, %xmm4 + addss %xmm7, %xmm6 + addss %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(B), %xmm1 + subss %xmm4, %xmm1 +#else + movss 0 * SIZE(AA), %xmm0 + subss %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + mulss 0 * SIZE(AA), %xmm1 +#endif + +#if defined(RN) || defined(RT) + mulss 0 * SIZE(B), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + + shufps $0x00, %xmm1, %xmm1 + movaps %xmm1, 0 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) +#else + movss %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 +.L179: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (B, %eax, SIZE), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + movl OLD_STACK, %esp + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_RT_1x4.S b/kernel/x86/trsm_kernel_RT_1x4.S new file mode 100644 index 0000000..b7f17e2 --- /dev/null +++ b/kernel/x86/trsm_kernel_RT_1x4.S @@ -0,0 +1,1251 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 32 + +#define J 0 + STACK(%esp) +#define I 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) +#define AORIG 16 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define STACK_A 24 + STACK + ARGS(%esp) +#define STACK_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define STACK_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) +#else +#define STACK_A 20 + STACK + ARGS(%esp) +#define STACK_B 24 + STACK + ARGS(%esp) +#define C 28 + STACK + ARGS(%esp) +#define STACK_LDC 32 + STACK + ARGS(%esp) +#define OFFSET 36 + STACK + ARGS(%esp) +#endif + +#define A %edx +#define B %ecx +#define B_ORIG %ebx +#define LDC %ebp + +#define PREFETCHSIZE (5 + 8 * 10) + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_B, B_ORIG + movl STACK_LDC, LDC + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, STACK_A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B_ORIG + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + subl $-16 * SIZE, B_ORIG + subl $-16 * SIZE, STACK_A + + movl M, %eax + testl %eax, %eax + jle .L999 + + movl N, %eax + testl %eax, %eax + jle .L999 + + movl K, %eax + testl %eax, %eax + jle .L999 + + movl N, %eax + andl $1, %eax + je .L20 + ALIGN_3 + +.L31: +#if defined(LT) || defined(RN) + movl STACK_A, A +#else + movl STACK_A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, B_ORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, %edi +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl B_ORIG, B + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $5, %eax + jle .L33 + ALIGN_4 + +.L32: + movl -16 * SIZE(B), %esi + movl -8 * SIZE(B), %esi + movl 0 * SIZE(B), %esi + movl 8 * SIZE(B), %esi + subl $-32 * SIZE, B + decl %eax + jne .L32 + ALIGN_3 + +.L33: + movl M, %esi + movl %esi, I + ALIGN_3 + +.L34: +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, A + leal (A , %eax, 1), A + leal (B_ORIG, %eax, 1), B +#else + movl B_ORIG, B +#endif + + fldz + fldz + fldz + fldz + + prefetchw 1 * SIZE(%edi) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L36 + ALIGN_3 + +.L35: + FLD -16 * SIZE(A) + FMUL -16 * SIZE(B) + faddp %st, %st(1) + + FLD -15 * SIZE(A) + FMUL -15 * SIZE(B) + faddp %st, %st(2) + + FLD -14 * SIZE(A) + FMUL -14 * SIZE(B) + faddp %st, %st(3) + + FLD -13 * SIZE(A) + FMUL -13 * SIZE(B) + faddp %st, %st(4) + + FLD -12 * SIZE(A) + FMUL -12 * SIZE(B) + faddp %st, %st(1) + + FLD -11 * SIZE(A) + FMUL -11 * SIZE(B) + faddp %st, %st(2) + + FLD -10 * SIZE(A) + FMUL -10 * SIZE(B) + faddp %st, %st(3) + + FLD -9 * SIZE(A) + FMUL -9 * SIZE(B) + faddp %st, %st(4) + + addl $8 * SIZE, A + addl $8 * SIZE, B + + decl %eax + jne .L35 + ALIGN_4 + +.L36: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $7, %eax + je .L39 + ALIGN_4 + +.L37: + FLD -16 * SIZE(A) + FMUL -16 * SIZE(B) + faddp %st, %st(1) + + addl $1 * SIZE,A + addl $1 * SIZE,B + decl %eax + jne .L37 + ALIGN_4 + +.L39: + faddp %st, %st(2) + faddp %st, %st(2) + faddp %st, %st(1) + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, A + leal (A, %eax, SIZE), A + leal (B_ORIG, %eax, SIZE), B +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE - 16 * SIZE(B) + fsubp %st, %st(1) +#else + FLD 0 * SIZE - 16 * SIZE(A) + fsubp %st, %st(1) +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE - 16 * SIZE(A) + fmulp %st, %st(1) +#endif + +#if defined(RN) || defined(RT) + FMUL 0 * SIZE - 16 * SIZE(B) +#endif + +#ifdef LN + subl $1 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE - 16 * SIZE(B) +#else + FSTU 0 * SIZE - 16 * SIZE(A) +#endif + + FST 0 * SIZE(%edi) + +#ifndef LN + addl $1 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (A, %eax, SIZE), A + leal (B, %eax, SIZE), B +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + decl I + jne .L34 + +#ifdef LN + movl K, %eax + leal ( , %eax, SIZE), %eax + leal (B_ORIG, %eax, 1), B_ORIG +#endif +#if defined(LT) || defined(RN) + movl B, B_ORIG +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L20: + movl N, %eax + andl $2, %eax + je .L30 + +#if defined(LT) || defined(RN) + movl STACK_A, A +#else + movl STACK_A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B_ORIG +#endif + + leal (, LDC, 2), %eax +#ifdef RT + subl %eax, C +#endif + movl C, %edi +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl B_ORIG, B + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $4, %eax + jle .L23 + ALIGN_4 + +.L22: + movl -16 * SIZE(B), %esi + movl -8 * SIZE(B), %esi + movl 0 * SIZE(B), %esi + movl 8 * SIZE(B), %esi + subl $-32 * SIZE, B + decl %eax + jne .L22 + ALIGN_3 + +.L23: + movl M, %esi + movl %esi, I + ALIGN_3 + +.L24: +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, A + leal (A , %eax, 1), A + leal (B_ORIG, %eax, 2), B +#else + movl B_ORIG, B +#endif + + fldz + fldz + fldz + fldz + + FLD -16 * SIZE(A) + FLD -16 * SIZE(B) + + prefetchw 1 * SIZE(%edi) + prefetchw 1 * SIZE(%edi, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L26 + ALIGN_3 + +.L25: + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -15 * SIZE(B) + faddp %st, %st(2) + + FLD -15 * SIZE(A) + FLD -14 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -13 * SIZE(B) + faddp %st, %st(4) + + FLD -14 * SIZE(A) + FLD -12 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -11 * SIZE(B) + faddp %st, %st(2) + + FLD -13 * SIZE(A) + FLD -10 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -9 * SIZE(B) + faddp %st, %st(4) + + FLD -12 * SIZE(A) + FLD -8 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -7 * SIZE(B) + faddp %st, %st(2) + + FLD -11 * SIZE(A) + FLD -6 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -5 * SIZE(B) + faddp %st, %st(4) + + FLD -10 * SIZE(A) + FLD -4 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -3 * SIZE(B) + faddp %st, %st(2) + + FLD -9 * SIZE(A) + FLD -2 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -1 * SIZE(B) + faddp %st, %st(4) + + FLD -8 * SIZE(A) + FLD 0 * SIZE(B) + + addl $ 8 * SIZE, A + subl $-16 * SIZE, B + + decl %eax + jne .L25 + ALIGN_4 + +.L26: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $7, %eax + je .L29 + ALIGN_4 + +.L27: + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -15 * SIZE(B) + faddp %st, %st(2) + + FLD -15 * SIZE(A) + FLD -14 * SIZE(B) + + addl $1 * SIZE,A + addl $2 * SIZE,B + + decl %eax + jne .L27 + ALIGN_4 + +.L29: + ffreep %st(0) + ffreep %st(0) + + faddp %st, %st(2) + faddp %st, %st(2) + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, A + leal (A, %eax, 1), A + leal (B_ORIG, %eax, 2), B +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE - 16 * SIZE(B) + fsubp %st, %st(1) + FLD 1 * SIZE - 16 * SIZE(B) + fsubp %st, %st(2) +#else + FLD 0 * SIZE - 16 * SIZE(A) + fsubp %st, %st(1) + FLD 1 * SIZE - 16 * SIZE(A) + fsubp %st, %st(2) +#endif + +#ifdef LN + FLD 0 * SIZE - 16 * SIZE(A) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LT + FLD 0 * SIZE - 16 * SIZE(A) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RN + FMUL 0 * SIZE - 16 * SIZE(B) + + FLD 1 * SIZE - 16 * SIZE(B) + fmul %st(1), %st + fsubrp %st, %st(2) + + FLD 3 * SIZE - 16 * SIZE(B) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD 3 * SIZE - 16 * SIZE(B) + fmulp %st, %st(2) + FLD 2 * SIZE - 16 * SIZE(B) + fmul %st(2), %st + fsubrp %st, %st(1) + + FLD 0 * SIZE - 16 * SIZE(B) + fmulp %st, %st(1) +#endif + +#ifdef LN + subl $1 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE - 16 * SIZE(B) + fxch %st(1) + FSTU 1 * SIZE - 16 * SIZE(B) +#else + FSTU 0 * SIZE - 16 * SIZE(A) + fxch %st(1) + FSTU 1 * SIZE - 16 * SIZE(A) +#endif + + FST 0 * SIZE(%edi, LDC) + FST 0 * SIZE(%edi) + +#ifndef LN + addl $1 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (A, %eax, 1), A + leal (B, %eax, 2), B +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + decl I + jne .L24 + +#ifdef LN + movl K, %eax + leal ( , %eax, SIZE), %eax + leal (B_ORIG, %eax, 2), B_ORIG +#endif +#if defined(LT) || defined(RN) + movl B, B_ORIG +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L30: + movl N, %eax + sarl $2, %eax + movl %eax, J + je .L999 + ALIGN_3 + +.L11: +#if defined(LT) || defined(RN) + movl STACK_A, A +#else + movl STACK_A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B_ORIG +#endif + + leal (, LDC, 4), %eax +#ifdef RT + subl %eax, C +#endif + movl C, %edi +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl B_ORIG, B + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $4, %eax + jle .L13 + ALIGN_4 + +.L12: + movl -16 * SIZE(B), %esi + movl -8 * SIZE(B), %esi + movl 0 * SIZE(B), %esi + movl 8 * SIZE(B), %esi + movl 16 * SIZE(B), %esi + movl 24 * SIZE(B), %esi + movl 32 * SIZE(B), %esi + movl 40 * SIZE(B), %esi + subl $-64 * SIZE, B + decl %eax + jne .L12 + ALIGN_3 + +.L13: + movl M, %esi + movl %esi, I + ALIGN_3 + +.L14: +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, A + leal (A , %eax, 1), A + leal (B_ORIG, %eax, 4), B +#else + movl B_ORIG, B +#endif + + leal (%edi, LDC, 2), %eax + + fldz + fldz + fldz + fldz + + FLD -8 * SIZE(A) + FLD -16 * SIZE(A) + FLD -16 * SIZE(B) + + movl $32 * SIZE, %esi + + prefetchw 1 * SIZE(%edi) + prefetchw 1 * SIZE(%edi, LDC) + prefetchw 1 * SIZE(%eax) + prefetchw 1 * SIZE(%eax, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L16 + ALIGN_3 + +.L15: + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD -15 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD -14 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL -13 * SIZE(B) + + faddp %st, %st(5) + FLD -15 * SIZE(A) + FLD -12 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD -11 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD -10 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL -9 * SIZE(B) + + faddp %st, %st(5) + FLD -14 * SIZE(A) + FLD -8 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD -7 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD -6 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL -5 * SIZE(B) + + faddp %st, %st(5) + FLD -13 * SIZE(A) + FLD -4 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD -3 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD -2 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL -1 * SIZE(B) + + faddp %st, %st(5) + FLD -12 * SIZE(A) + FLD 0 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD 1 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD 2 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL 3 * SIZE(B) + + faddp %st, %st(5) + FLD -11 * SIZE(A) + FLD 4 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD 5 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD 6 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL 7 * SIZE(B) + + faddp %st, %st(5) + FLD -10 * SIZE(A) + FLD 8 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD 9 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD 10 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL 11 * SIZE(B) + + faddp %st, %st(5) + FLD -9 * SIZE(A) + FLD 12 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD 13 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD 14 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL 15 * SIZE(B) + + faddp %st, %st(5) + FLD 0 * SIZE(A) + + PADDING prefetch PREFETCHSIZE * SIZE(A) + + addl $8 * SIZE, A + fxch %st(1) + addl $32 * SIZE, B + + FLD -16 * SIZE(B) + decl %eax + jne .L15 + ALIGN_4 + +.L16: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $7, %eax + je .L19 + ALIGN_4 + +.L17: + fmul %st(1), %st + faddp %st, %st(3) + + FLD -15 * SIZE(B) + fmul %st(1), %st + faddp %st, %st(4) + + FLD -14 * SIZE(B) + fmul %st(1), %st + faddp %st, %st(5) + + FMUL -13 * SIZE(B) + faddp %st, %st(5) + FLD -15 * SIZE(A) + FLD -12 * SIZE(B) + + addl $1 * SIZE,A + addl $4 * SIZE,B + + decl %eax + jne .L17 + ALIGN_4 + +.L19: + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, A + leal (A, %eax, 1), A + leal (B_ORIG, %eax, 4), B +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE - 16 * SIZE(B) + fsubp %st, %st(1) + FLD 1 * SIZE - 16 * SIZE(B) + fsubp %st, %st(2) + FLD 2 * SIZE - 16 * SIZE(B) + fsubp %st, %st(3) + FLD 3 * SIZE - 16 * SIZE(B) + fsubp %st, %st(4) +#else + FLD 0 * SIZE - 16 * SIZE(A) + fsubp %st, %st(1) + FLD 1 * SIZE - 16 * SIZE(A) + fsubp %st, %st(2) + FLD 2 * SIZE - 16 * SIZE(A) + fsubp %st, %st(3) + FLD 3 * SIZE - 16 * SIZE(A) + fsubp %st, %st(4) +#endif + +#ifdef LN + FLD 0 * SIZE - 16 * SIZE(A) + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fmulp %st, %st(4) +#endif + +#ifdef LT + FLD 0 * SIZE - 16 * SIZE(A) + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fmulp %st, %st(4) +#endif + +#ifdef RN + FMUL 0 * SIZE - 16 * SIZE(B) + + FLD 1 * SIZE - 16 * SIZE(B) + fmul %st(1), %st + fsubrp %st, %st(2) + FLD 2 * SIZE - 16 * SIZE(B) + fmul %st(1), %st + fsubrp %st, %st(3) + FLD 3 * SIZE - 16 * SIZE(B) + fmul %st(1), %st + fsubrp %st, %st(4) + + FLD 5 * SIZE - 16 * SIZE(B) + fmulp %st, %st(2) + FLD 6 * SIZE - 16 * SIZE(B) + fmul %st(2), %st + fsubrp %st, %st(3) + FLD 7 * SIZE - 16 * SIZE(B) + fmul %st(2), %st + fsubrp %st, %st(4) + + FLD 10 * SIZE - 16 * SIZE(B) + fmulp %st, %st(3) + FLD 11 * SIZE - 16 * SIZE(B) + fmul %st(3), %st + fsubrp %st, %st(4) + + FLD 15 * SIZE - 16 * SIZE(B) + fmulp %st, %st(4) +#endif + +#ifdef RT + FLD 15 * SIZE - 16 * SIZE(B) + fmulp %st, %st(4) + + FLD 14 * SIZE - 16 * SIZE(B) + fmul %st(4), %st + fsubrp %st, %st(3) + FLD 13 * SIZE - 16 * SIZE(B) + fmul %st(4), %st + fsubrp %st, %st(2) + FLD 12 * SIZE - 16 * SIZE(B) + fmul %st(4), %st + fsubrp %st, %st(1) + + FLD 10 * SIZE - 16 * SIZE(B) + fmulp %st, %st(3) + FLD 9 * SIZE - 16 * SIZE(B) + fmul %st(3), %st + fsubrp %st, %st(2) + FLD 8 * SIZE - 16 * SIZE(B) + fmul %st(3), %st + fsubrp %st, %st(1) + + FLD 5 * SIZE - 16 * SIZE(B) + fmulp %st, %st(2) + FLD 4 * SIZE - 16 * SIZE(B) + fmul %st(2), %st + fsubrp %st, %st(1) + + FLD 0 * SIZE - 16 * SIZE(B) + fmulp %st, %st(1) +#endif + +#ifdef LN + subl $1 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE - 16 * SIZE(B) + fxch %st(1) + FSTU 1 * SIZE - 16 * SIZE(B) + fxch %st(2) + FSTU 2 * SIZE - 16 * SIZE(B) + fxch %st(3) + FSTU 3 * SIZE - 16 * SIZE(B) +#else + FSTU 0 * SIZE - 16 * SIZE(A) + fxch %st(1) + FSTU 1 * SIZE - 16 * SIZE(A) + fxch %st(2) + FSTU 2 * SIZE - 16 * SIZE(A) + fxch %st(3) + FSTU 3 * SIZE - 16 * SIZE(A) +#endif + + leal (%edi, LDC, 2), %eax + + FST 0 * SIZE(%eax, LDC) + FST 0 * SIZE(%edi) + FST 0 * SIZE(%edi, LDC) + FST 0 * SIZE(%eax) + +#ifndef LN + addl $1 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (A, %eax, 1), A + leal (B, %eax, 4), B +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl I + jne .L14 + +#ifdef LN + movl K, %eax + leal ( , %eax, SIZE), %eax + leal (B_ORIG, %eax, 4), B_ORIG +#endif +#if defined(LT) || defined(RN) + movl B, B_ORIG +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J + jne .L11 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_RT_2x2.S b/kernel/x86/trsm_kernel_RT_2x2.S new file mode 100644 index 0000000..8603446 --- /dev/null +++ b/kernel/x86/trsm_kernel_RT_2x2.S @@ -0,0 +1,1102 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define A 24 + STACK + ARGS(%esp) +#define B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) +#else +#define A 20 + STACK + ARGS(%esp) +#define B 24 + STACK + ARGS(%esp) +#define C 28 + STACK + ARGS(%esp) +#define LDC 32 + STACK + ARGS(%esp) +#define OFFSET 36 + STACK + ARGS(%esp) +#endif + +#define PREFETCH_OFFSET 48 + +#if defined(PENTIUM3) || defined(PENTIUMM) +#define REP rep +#else +#define REP rep +#endif + +#define AA %edx +#define BB %ecx + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl LDC, %ebp # ldc # MEMORY + movl B, %ebx + leal (, %ebp, SIZE), %ebp + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, %ebx + + movl N, %eax + imull %ebp, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax # n # MEMORY + andl $1, %eax + je .L8 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, %ebx +#endif + +#ifdef RT + subl %ebp, C +#endif + movl C, %edi # c # MEMORY +#ifndef RT + addl %ebp, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %esi # m # MEMORY + sarl $1, %esi # m >> 1 + je .L36 + ALIGN_4 + +.L46: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, AA + leal (AA, %eax, 2), AA + leal (%ebx, %eax, 1), BB +#else + movl %ebx, BB +#endif + + fldz + fldz + FLD 0 * SIZE(BB) # temp1 = *(boffset + 0) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1, %eax + je .L56 + ALIGN_4 + +.L57: + FLD 0 * SIZE(AA) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(AA) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 1 * SIZE(BB) # temp1 = *(boffset + 0) + + FLD 2 * SIZE(AA) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 3 * SIZE(AA) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 2 * SIZE(BB) # temp1 = *(boffset + 0) + + addl $4 * SIZE,AA + addl $2 * SIZE,BB + dec %eax + jne .L57 + ALIGN_4 + +.L56: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1, %eax + je .L45 + ALIGN_4 + + FLD 0 * SIZE(AA) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(AA) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 3 * SIZE(BB) # temp1 = *(boffset + 0) + + addl $2 * SIZE,AA + addl $1 * SIZE,BB + ALIGN_4 + +.L45: + ffreep %st(0) + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, AA + leal (AA, %eax, 2), AA + leal (%ebx, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(BB) + fsubp %st, %st(1) + FLD 1 * SIZE(BB) + fsubp %st, %st(2) +#else + FLD 0 * SIZE(AA) + fsubp %st, %st(1) + FLD 1 * SIZE(AA) + fsubp %st, %st(2) +#endif + +#ifdef LN + FLD 3 * SIZE(AA) + fmulp %st, %st(2) + + FLD 2 * SIZE(AA) + fmul %st(2), %st + + fsubrp %st, %st(1) + FLD 0 * SIZE(AA) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD 0 * SIZE(AA) + fmulp %st, %st(1) + + FLD 1 * SIZE(AA) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD 3 * SIZE(AA) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD 0 * SIZE(BB) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD 0 * SIZE(BB) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LN + subl $2 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE(BB) + fxch %st(1) + FSTU 1 * SIZE(BB) +#else + FSTU 0 * SIZE(AA) + fxch %st(1) + FSTU 1 * SIZE(AA) +#endif + + FST 1 * SIZE(%edi) + FST 0 * SIZE(%edi) + +#ifndef LN + addl $2 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %esi # i -- + jne .L46 + ALIGN_4 + +.L36: + movl M, %eax # m # MEMORY + andl $1, %eax # m & 1 + je .L99 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, AA + leal (AA, %eax, 1), AA + leal (%ebx, %eax, 1), BB +#else + movl %ebx, BB +#endif + + fldz + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + test %eax, %eax + jle .L52 + ALIGN_3 + +.L51: + FLD (AA) + FMUL (BB) + addl $1 * SIZE,AA + addl $1 * SIZE,BB + faddp %st,%st(1) + decl %eax + jne .L51 + ALIGN_4 + +.L52: + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, AA + leal (AA, %eax, 1), AA + leal (%ebx, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(BB) + fsubp %st, %st(1) +#else + FLD 0 * SIZE(AA) + fsubp %st, %st(1) +#endif + +#if defined(LN) || defined(LT) + FMUL 0 * SIZE(AA) +#else + FMUL 0 * SIZE(BB) +#endif + +#ifdef LN + subl $1 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE(BB) +#else + FSTU 0 * SIZE(AA) +#endif + + FST 0 * SIZE(%edi) + +#ifndef LN + addl $1 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + leal (%ebx, %eax, SIZE), %ebx +#endif +#if defined(LT) || defined(RN) + movl BB, %ebx +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L8: + movl N, %eax # j = (n >> 1) # MEMORY + sarl $1, %eax + movl %eax, J # j = (n >> 1) # MEMORY + je .End + ALIGN_4 + +.L34: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, %ebx +#endif + lea (, %ebp, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, %edi +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %esi + sarl $1, %esi + je .L12 + ALIGN_4 + +.MainHead: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, AA + leal (AA, %eax, 2), AA + leal (%ebx, %eax, 2), BB +#else + movl %ebx, BB +#endif + + fldz + fldz + fldz + fldz + + FLD 4 * SIZE(BB) # b5 + FLD 4 * SIZE(AA) # a5 + FLD 0 * SIZE(BB) # b1 + FLD 0 * SIZE(AA) # a1 + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(%edi) + prefetchw 2 * SIZE(%edi, %ebp, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(%edi) + prefetchnta 2 * SIZE(%edi, %ebp, 1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L16 + ALIGN_4 + +.MainLoop: +#if defined(HAVE_3DNOW) + prefetch (PREFETCH_OFFSET) * SIZE(BB) + nop +#elif defined(HAVE_SSE) + prefetchnta (PREFETCH_OFFSET) * SIZE(BB) +#if (L2_SIZE == 524288) + prefetcht0 (PREFETCH_OFFSET) * SIZE(AA) +#endif +#endif + + fmul %st, %st(1) + FMUL 1 * SIZE(BB) + fxch %st(1) + faddp %st, %st(4) + FLD 0 * SIZE(BB) + fxch %st(1) + faddp %st, %st(5) + FLD 1 * SIZE(AA) + fmul %st, %st(1) + FMUL 1 * SIZE(BB) + fxch %st(1) + faddp %st, %st(6) + FLD 2 * SIZE(BB) + fxch %st(1) + faddp %st, %st(7) + FLD 2 * SIZE(AA) + + fmul %st, %st(1) + FMUL 3 * SIZE(BB) + fxch %st(1) + faddp %st, %st(4) + FLD 2 * SIZE(BB) + fxch %st(1) + faddp %st, %st(5) + FLD 3 * SIZE(AA) + fmul %st, %st(1) + FMUL 3 * SIZE(BB) + fxch %st(1) + faddp %st, %st(6) + FLD 8 * SIZE(BB) + fxch %st(1) + faddp %st, %st(7) + FLD 8 * SIZE(AA) + fxch %st(2) + +#if !defined(HAVE_3DNOW) && defined(HAVE_SSE) && defined(DOUBLE) + prefetchnta (PREFETCH_OFFSET + 4) * SIZE(BB) +#if (L2_SIZE == 524288) + prefetcht0 (PREFETCH_OFFSET + 4) * SIZE(AA) +#endif +#endif + + fmul %st, %st(3) + FMUL 5 * SIZE(BB) + fxch %st(3) + faddp %st, %st(4) + FLD 4 * SIZE(BB) + fxch %st(3) + faddp %st, %st(5) + FLD 5 * SIZE(AA) + fmul %st, %st(3) + FMUL 5 * SIZE(BB) + fxch %st(3) + faddp %st, %st(6) + FLD 6 * SIZE(BB) + fxch %st(3) + faddp %st, %st(7) + FLD 6 * SIZE(AA) + + fmul %st, %st(3) + FMUL 7 * SIZE(BB) + fxch %st(3) + faddp %st, %st(4) + FLD 6 * SIZE(BB) + fxch %st(3) + faddp %st, %st(5) + FLD 7 * SIZE(AA) + fmul %st, %st(3) + FMUL 7 * SIZE(BB) + fxch %st(3) + faddp %st, %st(6) + FLD 12 * SIZE(BB) + fxch %st(3) + faddp %st, %st(7) + FLD 12 * SIZE(AA) + fxch %st(2) + + subl $-8 * SIZE, BB + subl $-8 * SIZE, AA + decl %eax # l -- + jne .MainLoop + ALIGN_4 + +.L16: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L21 + ALIGN_4 + +.SubLoop: + fmul %st, %st(1) + FMUL 1 * SIZE(BB) + fxch %st(1) + faddp %st, %st(4) + FLD 0 * SIZE(BB) + fxch %st(1) + faddp %st, %st(5) + FLD 1 * SIZE(AA) + fmul %st, %st(1) + FMUL 1 * SIZE(BB) + fxch %st(1) + faddp %st, %st(6) + FLD 2 * SIZE(BB) + fxch %st(1) + faddp %st, %st(7) + FLD 2 * SIZE(AA) + + addl $2 * SIZE,BB + addl $2 * SIZE,AA + decl %eax + jne .SubLoop + ALIGN_4 + +.L21: + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, AA + leal (AA, %eax, 2), AA + leal (%ebx, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(BB) + fsubp %st, %st(1) + FLD 1 * SIZE(BB) + fsubp %st, %st(2) + FLD 2 * SIZE(BB) + fsubp %st, %st(3) + FLD 3 * SIZE(BB) + fsubp %st, %st(4) +#else + FLD 0 * SIZE(AA) + fsubp %st, %st(1) + FLD 1 * SIZE(AA) + fsubp %st, %st(3) + FLD 2 * SIZE(AA) + fsubp %st, %st(2) + FLD 3 * SIZE(AA) + fsubp %st, %st(4) +#endif + +#ifdef LN + FLD 3 * SIZE(AA) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD 2 * SIZE(AA) + fmul %st(3), %st + FLD 2 * SIZE(AA) + fmul %st(5), %st + + fsubrp %st, %st(3) + fsubrp %st, %st(1) + + FLD 0 * SIZE(AA) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LT + FLD 0 * SIZE(AA) + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD 1 * SIZE(AA) + fmul %st(1), %st + FLD 1 * SIZE(AA) + fmul %st(3), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(3) + + FLD 3 * SIZE(AA) + fmul %st, %st(3) + fmulp %st, %st(4) +#endif + +#ifdef RN + FLD 0 * SIZE(BB) + fmul %st, %st(1) + fmulp %st, %st(3) + + FLD 1 * SIZE(BB) + fmul %st(1), %st + FLD 1 * SIZE(BB) + fmul %st(4), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(2) + + FLD 3 * SIZE(BB) + fmul %st, %st(2) + fmulp %st, %st(4) +#endif + +#ifdef RT + FLD 3 * SIZE(BB) + fmul %st, %st(2) + fmulp %st, %st(4) + + FLD 2 * SIZE(BB) + fmul %st(2), %st + FLD 2 * SIZE(BB) + fmul %st(5), %st + + fsubrp %st, %st(4) + fsubrp %st, %st(1) + + FLD 0 * SIZE(BB) + fmul %st, %st(1) + fmulp %st, %st(3) +#endif + +#ifdef LN + subl $2 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE(BB) + fxch %st(1) + FSTU 1 * SIZE(BB) + fxch %st(2) + FSTU 2 * SIZE(BB) + fxch %st(3) + FSTU 3 * SIZE(BB) + + FST 1 * SIZE(%edi,%ebp) + FST 0 * SIZE(%edi) + FST 0 * SIZE(%edi,%ebp) + FST 1 * SIZE(%edi) +#else + FSTU 0 * SIZE(AA) + fxch %st(2) + FSTU 1 * SIZE(AA) + fxch %st(1) + FSTU 2 * SIZE(AA) + fxch %st(3) + FSTU 3 * SIZE(AA) + + FST 1 * SIZE(%edi,%ebp) + FST 1 * SIZE(%edi) + FST 0 * SIZE(%edi) + FST 0 * SIZE(%edi,%ebp) +#endif + +#ifndef LN + addl $2 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %esi # i -- + jne .MainHead + ALIGN_4 + +.L12: + movl M, %eax # m # MEMORY + andl $1, %eax + je .L27 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, AA + leal (AA, %eax, 1), AA + leal (%ebx, %eax, 2), BB +#else + movl %ebx, BB +#endif + + fldz + fldz + + FLD 0 * SIZE(AA) # temp1 = *(aoffset + 0) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1,%eax # k >> 1 # MEMORY + je .L54 + ALIGN_4 + +.L55: + FLD 0 * SIZE(BB) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(BB) # temp2 = *(boffset + 0) + faddp %st, %st(2) + FLD 1 * SIZE(AA) # temp1 = *(aoffset + 0) + + FLD 2 * SIZE(BB) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 3 * SIZE(BB) # temp2 = *(boffset + 0) + faddp %st, %st(2) + FLD 2 * SIZE(AA) # temp1 = *(aoffset + 0) + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jne .L55 + ALIGN_4 + +.L54: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1,%eax # k & 1 + je .L33 + ALIGN_4 + + FLD 0 * SIZE(BB) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(BB) # temp2 = *(boffset + 0) + faddp %st, %st(2) + FLD 1 * SIZE(AA) # temp1 = *(aoffset + 0) + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + ALIGN_4 + +.L33: + ffreep %st(0) + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, AA + leal (AA, %eax, 1), AA + leal (%ebx, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(BB) + fsubp %st, %st(1) + FLD 1 * SIZE(BB) + fsubp %st, %st(2) +#else + FLD 0 * SIZE(AA) + fsubp %st, %st(1) + FLD 1 * SIZE(AA) + fsubp %st, %st(2) +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(AA) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD 0 * SIZE(BB) + fmulp %st, %st(1) + + FLD 1 * SIZE(BB) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD 3 * SIZE(BB) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD 3 * SIZE(BB) + fmulp %st, %st(2) + + FLD 2 * SIZE(BB) + fmul %st(2), %st + + fsubrp %st, %st(1) + + FLD 0 * SIZE(BB) + fmulp %st, %st(1) +#endif + +#ifdef LN + subl $1 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE(BB) + fxch %st(1) + FSTU 1 * SIZE(BB) +#else + FSTU 0 * SIZE(AA) + fxch %st(1) + FSTU 1 * SIZE(AA) +#endif + + FST 0 * SIZE(%edi,%ebp) + FST 0 * SIZE(%edi) + +#ifndef LN + addl $1 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L27: +#ifdef LN + movl K, %eax + leal ( , %eax, SIZE), %eax + leal (%ebx, %eax, 2), %ebx +#endif +#if defined(LT) || defined(RN) + movl BB, %ebx +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j-- # MEMORY + jne .L34 + ALIGN_4 + +.End: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_RT_2x2_atom.S b/kernel/x86/trsm_kernel_RT_2x2_atom.S new file mode 100644 index 0000000..97af198 --- /dev/null +++ b/kernel/x86/trsm_kernel_RT_2x2_atom.S @@ -0,0 +1,1145 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 84 + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + testl $1, N + je .L30 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx + jle .L40 + ALIGN_4 + +.L31: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(BB), %xmm1 + xorps %xmm0, %xmm0 + prefetcht0 3 * SIZE(CO1) + xorps %xmm2, %xmm2 + xorps %xmm4, %xmm4 + xorps %xmm6, %xmm6 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addsd %xmm0, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 3 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 2 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 4 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 5 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 3 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 6 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 7 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 4 * SIZE(BB), %xmm1 + + addl $8 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + addsd %xmm0, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 1 * SIZE(BB), %xmm1 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + addsd %xmm0, %xmm4 + addsd %xmm2, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + movsd 1 * SIZE(BB), %xmm2 + + subsd %xmm4, %xmm0 + subsd %xmm6, %xmm2 +#else + movsd 0 * SIZE(AA), %xmm0 + movsd 1 * SIZE(AA), %xmm2 + + subsd %xmm4, %xmm0 + subsd %xmm6, %xmm2 +#endif + +#ifdef LN + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + movsd 2 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + movsd 0 * SIZE(AA), %xmm7 + subsd %xmm5, %xmm0 + mulsd %xmm7, %xmm0 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + movsd 3 * SIZE(AA), %xmm7 + subsd %xmm5, %xmm2 + mulsd %xmm7, %xmm2 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + mulsd %xmm4, %xmm2 +#endif + + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm2, 1 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm2, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L31 + ALIGN_4 + +.L40: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L49 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + movsd 0 * SIZE(BB), %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L45 + ALIGN_4 + +.L42: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm5 + movsd 2 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 3 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 3 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm5 + movsd 4 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addsd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + subsd %xmm4, %xmm0 +#else + movsd 0 * SIZE(AA), %xmm0 + subsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + mulsd 0 * SIZE(AA), %xmm0 +#endif + +#if defined(RN) || defined(RT) + mulsd 0 * SIZE(BB), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + addl %eax, BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L49: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L30: + movl N, %eax + sarl $1, %eax + movl %eax, J + jle .L999 + ALIGN_2 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + xorps %xmm4, %xmm4 + prefetcht0 3 * SIZE(CO1) + xorps %xmm5, %xmm5 + prefetcht0 3 * SIZE(CO1, LDC) + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 0 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 1 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 3 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 3 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 4 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 2 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 3 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 5 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 5 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 6 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 4 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 5 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 7 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 7 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 8 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 6 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 7 * SIZE(BB), %xmm3 + + addl $8 * SIZE, BB + addl $8 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 0 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 1 * SIZE(BB), %xmm3 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + addsd %xmm2, %xmm6 + addsd %xmm3, %xmm7 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + movsd 1 * SIZE(BB), %xmm1 + movsd 2 * SIZE(BB), %xmm2 + movsd 3 * SIZE(BB), %xmm3 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 + subsd %xmm6, %xmm2 + subsd %xmm7, %xmm3 +#else + movsd 0 * SIZE(AA), %xmm0 + movsd 1 * SIZE(AA), %xmm2 + movsd 2 * SIZE(AA), %xmm1 + movsd 3 * SIZE(AA), %xmm3 + + subsd %xmm4, %xmm0 + subsd %xmm6, %xmm2 + subsd %xmm5, %xmm1 + subsd %xmm7, %xmm3 +#endif + +#ifdef LN + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + movsd 2 * SIZE(AA), %xmm5 + mulsd %xmm4, %xmm3 + movsd 0 * SIZE(AA), %xmm7 + + movaps %xmm5, %xmm6 + mulsd %xmm2, %xmm5 + mulsd %xmm3, %xmm6 + subsd %xmm5, %xmm0 + subsd %xmm6, %xmm1 + mulsd %xmm7, %xmm0 + mulsd %xmm7, %xmm1 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(AA), %xmm5 + mulsd %xmm4, %xmm1 + movsd 3 * SIZE(AA), %xmm7 + + movaps %xmm5, %xmm6 + mulsd %xmm0, %xmm5 + mulsd %xmm1, %xmm6 + subsd %xmm5, %xmm2 + subsd %xmm6, %xmm3 + mulsd %xmm7, %xmm2 + mulsd %xmm7, %xmm3 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(BB), %xmm5 + mulsd %xmm4, %xmm2 + movsd 3 * SIZE(BB), %xmm7 + + movaps %xmm5, %xmm6 + mulsd %xmm0, %xmm5 + mulsd %xmm2, %xmm6 + subsd %xmm5, %xmm1 + subsd %xmm6, %xmm3 + mulsd %xmm7, %xmm1 + mulsd %xmm7, %xmm3 +#endif + +#ifdef RT + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd 2 * SIZE(BB), %xmm5 + mulsd %xmm4, %xmm3 + movsd 0 * SIZE(BB), %xmm7 + + movaps %xmm5, %xmm6 + mulsd %xmm1, %xmm5 + mulsd %xmm3, %xmm6 + subsd %xmm5, %xmm0 + subsd %xmm6, %xmm2 + mulsd %xmm7, %xmm0 + mulsd %xmm7, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm1, 1 * SIZE(BB) + movsd %xmm2, 2 * SIZE(BB) + movsd %xmm3, 3 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm2, 1 * SIZE(AA) + movsd %xmm1, 2 * SIZE(AA) + movsd %xmm3, 3 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC) + movsd %xmm3, 1 * SIZE(CO1, LDC) + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L29 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L25 + ALIGN_4 + +.L22: + addsd %xmm2, %xmm4 + movsd 0 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 1 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 2 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 3 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 2 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 5 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 3 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 6 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 7 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + addsd %xmm2, %xmm4 + movsd 0 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 1 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addsd %xmm2, %xmm4 + addsd %xmm3, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + movsd 1 * SIZE(BB), %xmm1 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 +#else + movsd 0 * SIZE(AA), %xmm0 + movsd 1 * SIZE(AA), %xmm1 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AA), %xmm7 + + mulsd %xmm7, %xmm0 + mulsd %xmm7, %xmm1 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(BB), %xmm5 + movaps %xmm5, %xmm6 + movsd 3 * SIZE(BB), %xmm7 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm1 + mulsd %xmm7, %xmm1 +#endif + +#ifdef RT + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd 2 * SIZE(BB), %xmm5 + movaps %xmm5, %xmm6 + movsd 0 * SIZE(BB), %xmm7 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm0 + mulsd %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm1, 1 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC) + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L10 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_RT_2x4_penryn.S b/kernel/x86/trsm_kernel_RT_2x4_penryn.S new file mode 100644 index 0000000..01876a5 --- /dev/null +++ b/kernel/x86/trsm_kernel_RT_2x4_penryn.S @@ -0,0 +1,2075 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 21 + 4) +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 21 + 4) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 2) +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + leal (, LDC, SIZE), LDC + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + testl $1, N + je .L30 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L80 + ALIGN_4 + +.L71: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd -16 * SIZE(AA), %xmm0 + movhps -15 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -16 * SIZE(BB), %xmm1 + movhps -15 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 +#ifdef LN + prefetcht0 -2 * SIZE(CO1) +#else + prefetcht0 1 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -14 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -12 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -10 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -8 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + subl $-16 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + pshufd $0x44, %xmm1, %xmm2 + movsd -15 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addpd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BB), %xmm1 + + subpd %xmm4, %xmm1 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 +#else + movapd -16 * SIZE(AA), %xmm0 + + subpd %xmm4, %xmm0 +#endif + +#ifdef LN + movsd -13 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd -14 * SIZE(AA), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + +#endif + +#ifdef LT + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -15 * SIZE(AA), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movsd -13 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RN + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RT + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(BB) + movsd %xmm1, -15 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L71 + ALIGN_4 + +.L80: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L89 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd -16 * SIZE(AA), %xmm0 + movhps -15 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -16 * SIZE(BB), %xmm2 + movhps -15 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L85 + ALIGN_4 + +.L82: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movaps -12 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movaps -10 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movaps -8 * SIZE(BB), %xmm2 + + subl $-8 * SIZE, AA + subl $-8 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + je .L88 + +.L86: + mulsd %xmm0, %xmm2 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd -15 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addpd %xmm5, %xmm4 + haddpd %xmm4, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BB), %xmm0 + subsd %xmm4, %xmm0 +#else + movsd -16 * SIZE(AA), %xmm0 + subsd %xmm4, %xmm0 +#endif + +#ifdef LN + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef LT + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RN + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RT + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) +#else + movsd %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + addl %eax, BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L89: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L30: + testl $2, N + je .L60 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L41: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 +#ifdef LN + prefetcht0 -2 * SIZE(CO1) + pxor %xmm6, %xmm6 + prefetcht0 -2 * SIZE(CO1, LDC) + pxor %xmm7, %xmm7 +#else + prefetcht0 1 * SIZE(CO1) + pxor %xmm6, %xmm6 + prefetcht0 1 * SIZE(CO1, LDC) + pxor %xmm7, %xmm7 +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -14 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -10 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -6 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -2 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps 0 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -14 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + movaps %xmm4, %xmm0 + movsd %xmm5, %xmm4 + movsd %xmm0, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd -16 * SIZE(BB), %xmm2 + movapd -14 * SIZE(BB), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movddup -13 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + + movddup -14 * SIZE(AA), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + + movddup -15 * SIZE(AA), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movddup -13 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RN + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 + + movddup -15 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + + movddup -13 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movddup -13 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + + movddup -14 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, -16 * SIZE(BB) + movapd %xmm3, -14 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO1, LDC, 1) + movhps %xmm3, 1 * SIZE(CO1, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 1 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L41 + ALIGN_4 + +.L50: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L59 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -14 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -12 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -10 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -12 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -8 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -6 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -10 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -4 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -2 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -8 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + je .L58 + +.L56: + pshufd $0x44, %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BB), %xmm0 + + subpd %xmm4, %xmm0 +#else + movapd -16 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm1 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 +#endif + +#ifdef LN + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RN + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -15 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movsd -13 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RT + movsd -13 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + + movsd -14 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) + movsd %xmm1, -15 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO1, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L59: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L60: + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L999 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + leal (CO1, LDC, 2), %eax + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + +#ifdef LN + pxor %xmm4, %xmm4 + prefetcht0 -2 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 -2 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + prefetcht0 -2 * SIZE(%eax) + pxor %xmm7, %xmm7 + prefetcht0 -2 * SIZE(%eax, LDC) +#else + pxor %xmm4, %xmm4 + prefetcht0 1 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 1 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + prefetcht0 1 * SIZE(%eax) + pxor %xmm7, %xmm7 + prefetcht0 1 * SIZE(%eax, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addpd %xmm3, %xmm7 + movaps -14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -10 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -6 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -2 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 0 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + addpd %xmm3, %xmm7 + movaps 2 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 6 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 10 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 16 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + subl $-32 * SIZE, BB + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + + subl $1, %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addpd %xmm3, %xmm7 + movaps -14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + + movaps -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), BB +#endif + + addpd %xmm2, %xmm6 + addpd %xmm3, %xmm7 + + movaps %xmm4, %xmm0 + movsd %xmm5, %xmm4 + movsd %xmm0, %xmm5 + + movaps %xmm6, %xmm0 + movsd %xmm7, %xmm6 + movsd %xmm0, %xmm7 + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd -16 * SIZE(BB), %xmm2 + movapd -14 * SIZE(BB), %xmm5 + movapd -12 * SIZE(BB), %xmm3 + movapd -10 * SIZE(BB), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm6, %xmm5 + subpd %xmm0, %xmm3 + subpd %xmm1, %xmm7 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 + movapd -12 * SIZE(AA), %xmm2 + movapd -10 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 + subpd %xmm6, %xmm2 + subpd %xmm7, %xmm3 +#endif + +#ifdef LN + movddup -13 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 + + movddup -14 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + mulpd %xmm7, %xmm6 + subpd %xmm6, %xmm5 + + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + + movddup -15 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + mulpd %xmm5, %xmm6 + subpd %xmm6, %xmm7 + + movddup -13 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 + movddup -15 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + movddup -14 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm2 + movddup -13 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm3 + + movddup -11 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + movddup -10 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm2 + movddup -9 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm3 + + movddup -6 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm2 + movddup -5 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movddup -1 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movddup -1 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm3 + movddup -2 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + movddup -3 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm1 + movddup -4 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm0 + + movddup -6 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm2 + movddup -7 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm1 + movddup -8 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + + movddup -11 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + movddup -12 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, -16 * SIZE(BB) + movapd %xmm5, -14 * SIZE(BB) + movapd %xmm3, -12 * SIZE(BB) + movapd %xmm7, -10 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) + movapd %xmm2, -12 * SIZE(AA) + movapd %xmm3, -10 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO1, LDC, 1) + movhps %xmm3, 1 * SIZE(CO1, LDC, 1) + movsd %xmm5, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movhps %xmm5, 0 * SIZE(CO1, %eax, 1) + movhps %xmm7, 1 * SIZE(CO1, %eax, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 1 * SIZE(CO1, LDC, 1) + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 1 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %eax, 1) + movhps %xmm3, 1 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L29 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + movaps -14 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -10 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -14 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps -8 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps -6 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -4 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -2 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -12 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 0 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 2 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps 6 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -10 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 8 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 10 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps 12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps 14 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -8 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 18 * SIZE(BB), %xmm3 + + subl $ -8 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + je .L28 + +.L26: + pshufd $0x44, %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -10 * SIZE(BB), %xmm3 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BB), %xmm0 + movapd -14 * SIZE(BB), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#else + movapd -16 * SIZE(AA), %xmm1 + movapd -14 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm1 + subpd %xmm5, %xmm3 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 + movapd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm3 +#endif + +#ifdef LN + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RN + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + movsd -15 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + movsd -14 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm2 + movsd -13 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm3 + + movsd -11 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd -10 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm2 + movsd -9 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm3 + + movsd -6 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm2 + movsd -5 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm3 + + movsd -1 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm3 +#endif + +#ifdef RT + movsd -1 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm3 + movsd -2 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm2 + movsd -3 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm1 + movsd -4 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm0 + + movsd -6 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm2 + movsd -7 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm1 + movsd -8 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm0 + + movsd -11 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd -12 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) + movsd %xmm1, -15 * SIZE(AA) + movsd %xmm2, -14 * SIZE(AA) + movsd %xmm3, -13 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO1, LDC, 1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 2) + movhps %xmm1, 0 * SIZE(CO1, %eax, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L10 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_RT_2x4_sse2.S b/kernel/x86/trsm_kernel_RT_2x4_sse2.S new file mode 100644 index 0000000..6c2682a --- /dev/null +++ b/kernel/x86/trsm_kernel_RT_2x4_sse2.S @@ -0,0 +1,2586 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define OLD_M 4 + STACK + ARGS(%esi) +#define OLD_N 8 + STACK + ARGS(%esi) +#define OLD_K 12 + STACK + ARGS(%esi) +#define OLD_ALPHA 16 + STACK + ARGS(%esi) +#define OLD_A 24 + STACK + ARGS(%esi) +#define OLD_B 28 + STACK + ARGS(%esi) +#define OLD_C 32 + STACK + ARGS(%esi) +#define OLD_LDC 36 + STACK + ARGS(%esi) +#define OLD_OFFT 40 + STACK + ARGS(%esi) + +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHSIZE (8 * 10 + 4) +#endif + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movl OLD_M, %ebx + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + movd OLD_OFFT, %mm4 + + movl OLD_B, B + movl OLD_C, %ebx + + movl %ebx, C + movl OLD_LDC, LDC + + movd %mm4, OFFSET + movd %mm4, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + testl $1, N + je .L30 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L65 + ALIGN_4 + +.L62: +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + movq 4 * SIZE(B), %mm4 + movq 5 * SIZE(B), %mm5 + movq 6 * SIZE(B), %mm6 + movq 7 * SIZE(B), %mm7 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm2, 5 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + movq %mm3, 7 * SIZE(BB) + + movq %mm4, 8 * SIZE(BB) + movq %mm4, 9 * SIZE(BB) + movq %mm5, 10 * SIZE(BB) + movq %mm5, 11 * SIZE(BB) + movq %mm6, 12 * SIZE(BB) + movq %mm6, 13 * SIZE(BB) + movq %mm7, 14 * SIZE(BB) + movq %mm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L62 + ALIGN_2 + +.L65: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L70 + ALIGN_2 + +.L66: + movq 0 * SIZE(B), %mm0 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + + addl $1 * SIZE, B + addl $2 * SIZE, BB + decl %eax + jne .L66 + ALIGN_4 + +.L70: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L80 + ALIGN_4 + +.L71: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + +#ifdef LN + prefetchw -2 * SIZE(CO1) +#else + prefetchw 1 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) + movapd 16 * SIZE(BB), %xmm2 + + movapd 2 * SIZE(AA), %xmm0 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + + movapd 16 * SIZE(AA), %xmm0 + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movapd 24 * SIZE(BB), %xmm3 + + movapd 10 * SIZE(AA), %xmm1 + mulpd 10 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(AA), %xmm0 + movapd 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm2 + + subpd %xmm4, %xmm2 +#else + movapd 0 * SIZE(AA), %xmm0 + + subpd %xmm4, %xmm0 +#endif + +#ifdef LN + movapd %xmm2, %xmm3 + unpckhpd %xmm3, %xmm3 + + movlpd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + movlpd 2 * SIZE(AA), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm2 + + movlpd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm3, %xmm2 +#endif + +#ifdef LT + movapd %xmm2, %xmm3 + unpckhpd %xmm3, %xmm3 + + movlpd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movlpd 1 * SIZE(AA), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm3 + + movlpd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + unpcklpd %xmm3, %xmm2 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L71 + ALIGN_4 + +.L80: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L99 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movlpd 0 * SIZE(AA), %xmm0 + movlpd 4 * SIZE(AA), %xmm1 + movlpd 0 * SIZE(BB), %xmm2 + movlpd 8 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L85 + ALIGN_4 + +.L82: + mulsd %xmm0, %xmm2 + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) + movlpd 1 * SIZE(AA), %xmm0 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movlpd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movlpd 2 * SIZE(AA), %xmm0 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm6 + movlpd 3 * SIZE(AA), %xmm0 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm7 + movlpd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm3 + movlpd 5 * SIZE(AA), %xmm1 + mulsd 10 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movlpd 24 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movlpd 6 * SIZE(AA), %xmm1 + mulsd 12 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm6 + movlpd 7 * SIZE(AA), %xmm1 + mulsd 14 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm7 + movlpd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L88 + +.L86: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movlpd 2 * SIZE(BB), %xmm2 + movlpd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addsd %xmm5, %xmm4 + addsd %xmm7, %xmm6 + addsd %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + addl %eax, AA + addl %eax, B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movlpd 0 * SIZE(B), %xmm2 + subsd %xmm4, %xmm2 +#else + movlpd 0 * SIZE(AA), %xmm0 + subsd %xmm4, %xmm0 +#endif + +#ifdef LN + movlpd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) +#else + movlpd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) +#else + movlpd %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA,%eax, SIZE), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (B,%eax, SIZE), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + + +.L30: + testl $2, N + je .L60 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L35 + ALIGN_4 + +.L32: +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + movq 4 * SIZE(B), %mm4 + movq 5 * SIZE(B), %mm5 + movq 6 * SIZE(B), %mm6 + movq 7 * SIZE(B), %mm7 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm2, 5 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + movq %mm3, 7 * SIZE(BB) + + movq %mm4, 8 * SIZE(BB) + movq %mm4, 9 * SIZE(BB) + movq %mm5, 10 * SIZE(BB) + movq %mm5, 11 * SIZE(BB) + movq %mm6, 12 * SIZE(BB) + movq %mm6, 13 * SIZE(BB) + movq %mm7, 14 * SIZE(BB) + movq %mm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L32 + ALIGN_2 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L40 + ALIGN_2 + +.L36: + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + + addl $2 * SIZE, B + addl $4 * SIZE, BB + decl %eax + jne .L36 + ALIGN_4 + +.L40: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L41: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + +#ifdef LN + prefetchw -2 * SIZE(CO1) + prefetchw -2 * SIZE(CO1, LDC) +#else + prefetchw 1 * SIZE(CO1) + prefetchw 1 * SIZE(CO1, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + mulpd %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) +#endif + mulpd %xmm1, %xmm2 + mulpd 18 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movapd 10 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm2 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movapd 12 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm3 + mulpd 26 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 14 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm3 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movlpd 3 * SIZE(AA), %xmm4 + movhpd 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + + movlpd 2 * SIZE(AA), %xmm4 + movhpd 2 * SIZE(AA), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + + movlpd 1 * SIZE(AA), %xmm4 + movhpd 1 * SIZE(AA), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 3 * SIZE(AA), %xmm4 + movhpd 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + movlpd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + + movlpd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movlpd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + movlpd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movlpd %xmm3, 4 * SIZE(BB) + movlpd %xmm3, 5 * SIZE(BB) + movhpd %xmm3, 6 * SIZE(BB) + movhpd %xmm3, 7 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movlpd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L41 + ALIGN_4 + +.L50: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L59 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movlpd 0 * SIZE(AA), %xmm0 + movlpd 4 * SIZE(AA), %xmm1 + movlpd 0 * SIZE(BB), %xmm2 + movlpd 8 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulsd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movlpd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movlpd 1 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movlpd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movlpd 2 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm3 + mulsd 10 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movlpd 12 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movlpd 3 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movlpd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movlpd 8 * SIZE(AA), %xmm0 + + mulsd %xmm1, %xmm2 + mulsd 18 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movlpd 20 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movlpd 5 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm2 + mulsd 22 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movlpd 32 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movlpd 6 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm3 + mulsd 26 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movlpd 28 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movlpd 7 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm3 + mulsd 30 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movlpd 40 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm7 + movlpd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + +.L56: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movlpd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movlpd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm5, %xmm4 + + movapd 0 * SIZE(B), %xmm2 + + subpd %xmm4, %xmm2 +#else + movlpd 0 * SIZE(AA), %xmm0 + movlpd 1 * SIZE(AA), %xmm1 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 +#endif + +#ifdef LN + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 + movlpd 1 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movlpd 3 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RT + movlpd 3 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm1 + movlpd 2 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) +#else + movlpd %xmm0, 0 * SIZE(AA) + movlpd %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA,%eax, SIZE), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L59: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L60: + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L999 + ALIGN_2 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B + leal (BB, %eax, 8), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1, %eax + jle .L05 + ALIGN_4 + +.L02: +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + movq 4 * SIZE(B), %mm4 + movq 5 * SIZE(B), %mm5 + movq 6 * SIZE(B), %mm6 + movq 7 * SIZE(B), %mm7 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm2, 5 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + movq %mm3, 7 * SIZE(BB) + + movq %mm4, 8 * SIZE(BB) + movq %mm4, 9 * SIZE(BB) + movq %mm5, 10 * SIZE(BB) + movq %mm5, 11 * SIZE(BB) + movq %mm6, 12 * SIZE(BB) + movq %mm6, 13 * SIZE(BB) + movq %mm7, 14 * SIZE(BB) + movq %mm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L02 + ALIGN_2 + +.L05: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1, %eax + BRANCH + jle .L10 + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm2, 5 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + movq %mm3, 7 * SIZE(BB) + + addl $4 * SIZE, B + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $3 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifdef LN + prefetchw -2 * SIZE(CO1) + prefetchw -2 * SIZE(CO1, LDC) + prefetchw -2 * SIZE(CO1, LDC, 2) + prefetchw -2 * SIZE(CO1, %eax) +#else + prefetchw 1 * SIZE(CO1) + prefetchw 1 * SIZE(CO1, LDC) + prefetchw 1 * SIZE(CO1, LDC, 2) + prefetchw 1 * SIZE(CO1, %eax) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + +#if 1 + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) + + addl $128 * 4 * SIZE, BB + addl $128 * 1 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB + ALIGN_4 +#else + + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + + addl $64 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 +#endif + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 8 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm5 + movapd 4 * SIZE(B), %xmm3 + movapd 6 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm6, %xmm5 + subpd %xmm0, %xmm3 + subpd %xmm1, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + movapd 4 * SIZE(AA), %xmm2 + movapd 6 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 + subpd %xmm6, %xmm2 + subpd %xmm7, %xmm3 +#endif + +#ifdef LN + movlpd 3 * SIZE(AA), %xmm4 + movhpd 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 + + movlpd 2 * SIZE(AA), %xmm4 + movhpd 2 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + mulpd %xmm7, %xmm6 + subpd %xmm6, %xmm5 + + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + + movlpd 1 * SIZE(AA), %xmm4 + movhpd 1 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + mulpd %xmm5, %xmm6 + subpd %xmm6, %xmm7 + + movlpd 3 * SIZE(AA), %xmm4 + movhpd 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + movlpd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + movlpd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm2 + movlpd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 5 * SIZE(B), %xmm4 + movhpd 5 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + movlpd 6 * SIZE(B), %xmm4 + movhpd 6 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm2 + movlpd 7 * SIZE(B), %xmm4 + movhpd 7 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 10 * SIZE(B), %xmm4 + movhpd 10 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + movlpd 11 * SIZE(B), %xmm4 + movhpd 11 * SIZE(B), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 15 * SIZE(B), %xmm4 + movhpd 15 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movlpd 15 * SIZE(B), %xmm4 + movhpd 15 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm3 + movlpd 14 * SIZE(B), %xmm4 + movhpd 14 * SIZE(B), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + movlpd 13 * SIZE(B), %xmm4 + movhpd 13 * SIZE(B), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm1 + movlpd 12 * SIZE(B), %xmm4 + movhpd 12 * SIZE(B), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm0 + + movlpd 10 * SIZE(B), %xmm4 + movhpd 10 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + movlpd 9 * SIZE(B), %xmm4 + movhpd 9 * SIZE(B), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm1 + movlpd 8 * SIZE(B), %xmm4 + movhpd 8 * SIZE(B), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + + movlpd 5 * SIZE(B), %xmm4 + movhpd 5 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + movlpd 4 * SIZE(B), %xmm4 + movhpd 4 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + movapd %xmm3, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movlpd %xmm5, 4 * SIZE(BB) + movlpd %xmm5, 5 * SIZE(BB) + movhpd %xmm5, 6 * SIZE(BB) + movhpd %xmm5, 7 * SIZE(BB) + movlpd %xmm3, 8 * SIZE(BB) + movlpd %xmm3, 9 * SIZE(BB) + movhpd %xmm3, 10 * SIZE(BB) + movhpd %xmm3, 11 * SIZE(BB) + movlpd %xmm7, 12 * SIZE(BB) + movlpd %xmm7, 13 * SIZE(BB) + movhpd %xmm7, 14 * SIZE(BB) + movhpd %xmm7, 15 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) + movapd %xmm2, 4 * SIZE(AA) + movapd %xmm3, 6 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movlpd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) + movlpd %xmm5, 0 * SIZE(CO1, LDC, 2) + movlpd %xmm7, 1 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm7, 1 * SIZE(CO1, %eax, 1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) + movlpd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) + movlpd %xmm3, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm3, 1 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L29 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $3 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movlpd 0 * SIZE(AA), %xmm0 + movlpd 4 * SIZE(AA), %xmm1 + movlpd 0 * SIZE(BB), %xmm2 + movlpd 8 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movlpd 2 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movlpd 4 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movlpd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movlpd 1 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm4 + movlpd 10 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm5 + movlpd 12 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movlpd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movlpd 2 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movlpd 18 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movlpd 20 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 22 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movlpd 32 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movlpd 3 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm4 + movlpd 26 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm5 + movlpd 28 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + mulsd 30 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movlpd 40 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movlpd 8 * SIZE(AA), %xmm0 +#if defined(OPTERON) || defined(BARCELONA) + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) +#endif + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm4 + movlpd 34 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm5 + movlpd 36 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + mulsd 38 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movlpd 48 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movlpd 5 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm4 + movlpd 42 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm5 + movlpd 44 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + mulsd 46 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movlpd 56 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm7 + movlpd 6 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm4 + movlpd 50 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm5 + movlpd 52 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + mulsd 54 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movlpd 64 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movlpd 7 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm4 + movlpd 58 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm5 + movlpd 60 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + mulsd 62 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movlpd 72 * SIZE(BB), %xmm3 + addl $64 * SIZE, BB + addsd %xmm1, %xmm7 + movlpd 12 * SIZE(AA), %xmm1 + addl $8 * SIZE, AA + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + +.L26: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movlpd 2 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movlpd 4 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movlpd 8 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movlpd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 4), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm5, %xmm4 + unpcklpd %xmm7, %xmm6 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm5 + + subpd %xmm4, %xmm2 + subpd %xmm6, %xmm5 +#else + movlpd 0 * SIZE(AA), %xmm0 + movlpd 1 * SIZE(AA), %xmm1 + movlpd 2 * SIZE(AA), %xmm2 + movlpd 3 * SIZE(AA), %xmm3 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 + subsd %xmm6, %xmm2 + subsd %xmm7, %xmm3 +#endif + +#ifdef LN + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 + movlpd 1 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + movlpd 2 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm2 + movlpd 3 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm3 + + movlpd 5 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm1 + movlpd 6 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm2 + movlpd 7 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm3 + + movlpd 10 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm2 + movlpd 11 * SIZE(B), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm3 + + movlpd 15 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm3 +#endif + +#ifdef RT + movlpd 15 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm3 + movlpd 14 * SIZE(B), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm2 + movlpd 13 * SIZE(B), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm1 + movlpd 12 * SIZE(B), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm0 + + movlpd 10 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm2 + movlpd 9 * SIZE(B), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm1 + movlpd 8 * SIZE(B), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm0 + + movlpd 5 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm1 + movlpd 4 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movlpd %xmm5, 4 * SIZE(BB) + movlpd %xmm5, 5 * SIZE(BB) + movhpd %xmm5, 6 * SIZE(BB) + movhpd %xmm5, 7 * SIZE(BB) +#else + movlpd %xmm0, 0 * SIZE(AA) + movlpd %xmm1, 1 * SIZE(AA) + movlpd %xmm2, 2 * SIZE(AA) + movlpd %xmm3, 3 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movlpd %xmm5, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) + movlpd %xmm2, 0 * SIZE(CO1, LDC, 2) + movlpd %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA,%eax, SIZE), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + + +.L999: + movl OLD_STACK, %esp + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_RT_2x4_sse3.S b/kernel/x86/trsm_kernel_RT_2x4_sse3.S new file mode 100644 index 0000000..6be1d86 --- /dev/null +++ b/kernel/x86/trsm_kernel_RT_2x4_sse3.S @@ -0,0 +1,2030 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#ifdef PENTIUM4 +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef PENTIUMM +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + testl $1, N + je .L30 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L80 + ALIGN_4 + +.L71: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 4 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchnta -2 * SIZE(CO1) +#else + prefetchnta 2 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm2, %xmm0 + movddup 1 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm4 + movapd 16 * SIZE(AA), %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd 4 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd 6 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm7 + movddup 8 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm1 + movddup 5 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm4 + movapd 24 * SIZE(AA), %xmm1 + mulpd 10 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm5 + movddup 6 * SIZE(BB), %xmm3 + mulpd 12 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm6 + movddup 7 * SIZE(BB), %xmm3 + mulpd 14 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + mulpd %xmm2, %xmm0 + movddup 1 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm4 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BB), %xmm1 + + subpd %xmm4, %xmm1 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 +#else + movapd 0 * SIZE(AA), %xmm0 + + subpd %xmm4, %xmm0 +#endif + +#ifdef LN + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 2 * SIZE(AA), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 1 * SIZE(AA), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RN + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RT + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm1, 1 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L71 + ALIGN_4 + +.L80: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L89 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + movhpd 1 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd 8 * SIZE(AA), %xmm1 + movhpd 9 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + movhpd 1 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movsd 8 * SIZE(BB), %xmm3 + movhpd 9 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $4, %eax + je .L85 + ALIGN_4 + +.L82: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm1 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $15, %eax # if (k & 1) + BRANCH + je .L88 + +.L86: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + + haddpd %xmm4, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + subsd %xmm4, %xmm0 +#else + movsd 0 * SIZE(AA), %xmm0 + subsd %xmm4, %xmm0 +#endif + +#ifdef LN + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RT + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) +#else + movsd %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + addl %eax, BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L89: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L30: + testl $2, N + je .L60 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L41: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchnta -2 * SIZE(CO1) + prefetchnta -2 * SIZE(CO1, LDC, 1) +#else + prefetchnta 2 * SIZE(CO1) + prefetchnta 2 * SIZE(CO1, LDC, 1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 5 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 6 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 6 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 7 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 16 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 9 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm5 + movddup 10 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 11 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 13 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 14 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm5 + movddup 14 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 15 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 24 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd 0 * SIZE(BB), %xmm2 + movapd 2 * SIZE(BB), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movddup 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + + movddup 2 * SIZE(AA), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + +#endif + +#ifdef LT + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + + movddup 1 * SIZE(AA), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movddup 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RN + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 + + movddup 1 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + + movddup 3 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movddup 3 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + + movddup 2 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BB) + movapd %xmm3, 2 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L41 + ALIGN_4 + +.L50: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L59 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $4, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + movddup 1 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 2 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movddup 3 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movddup 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movddup 5 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm0 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 6 * SIZE(AA), %xmm0 + mulpd 12 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movddup 7 * SIZE(AA), %xmm0 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movddup 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + movddup 9 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm4 + mulpd 18 * SIZE(BB), %xmm1 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 10 * SIZE(AA), %xmm1 + mulpd 20 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movddup 11 * SIZE(AA), %xmm1 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movddup 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + movddup 13 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 26 * SIZE(BB), %xmm1 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 14 * SIZE(AA), %xmm1 + mulpd 28 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movddup 15 * SIZE(AA), %xmm1 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movddup 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $15, %eax # if (k & 1) + BRANCH + je .L58 + +.L56: + mulpd %xmm0, %xmm2 + movddup 1 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BB), %xmm0 + + subpd %xmm4, %xmm0 +#else + movapd 0 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm1 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 +#endif + +#ifdef LN + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef LT + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 1 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RT + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 2 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, 0 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L59: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L60: + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L999 + ALIGN_2 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + +#ifdef LN + prefetchnta -2 * SIZE(CO1) + prefetchnta -2 * SIZE(CO1, LDC, 1) + prefetchnta -2 * SIZE(CO1, LDC, 2) + prefetchnta -2 * SIZE(CO1, %eax, 1) +#else + prefetchnta 2 * SIZE(CO1) + prefetchnta 2 * SIZE(CO1, LDC, 1) + prefetchnta 2 * SIZE(CO1, LDC, 2) + prefetchnta 2 * SIZE(CO1, %eax, 1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 5 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 6 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 7 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 16 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm4 + movddup 9 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm5 + movddup 10 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm6 + movddup 11 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + movapd 6 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm4 + movddup 13 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm5 + movddup 14 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm6 + movddup 15 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movddup 24 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm4 + movddup 17 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm5 + movddup 18 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm6 + movddup 19 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movddup 20 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm4 + movddup 21 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm5 + movddup 22 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm6 + movddup 23 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movddup 32 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 25 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movddup 26 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 27 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 14 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 28 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 29 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movddup 30 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 31 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 40 * SIZE(BB), %xmm3 + + addl $32 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd 0 * SIZE(BB), %xmm2 + movapd 2 * SIZE(BB), %xmm5 + movapd 4 * SIZE(BB), %xmm3 + movapd 6 * SIZE(BB), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm6, %xmm5 + subpd %xmm0, %xmm3 + subpd %xmm1, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + movapd 4 * SIZE(AA), %xmm2 + movapd 6 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 + subpd %xmm6, %xmm2 + subpd %xmm7, %xmm3 +#endif + +#ifdef LN + movddup 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 + + movddup 2 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + mulpd %xmm7, %xmm6 + subpd %xmm6, %xmm5 + + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + +#endif + +#ifdef LT + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + + movddup 1 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + mulpd %xmm5, %xmm6 + subpd %xmm6, %xmm7 + + movddup 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 +#endif + +#ifdef RN + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 + movddup 1 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + movddup 2 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm2 + movddup 3 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm3 + + movddup 5 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + movddup 6 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm2 + movddup 7 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm3 + + movddup 10 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm2 + movddup 11 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movddup 15 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movddup 15 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm3 + movddup 14 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + movddup 13 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm1 + movddup 12 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm0 + + movddup 10 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm2 + movddup 9 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm1 + movddup 8 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + + movddup 5 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + movddup 4 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BB) + movapd %xmm5, 2 * SIZE(BB) + movapd %xmm3, 4 * SIZE(BB) + movapd %xmm7, 6 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) + movapd %xmm2, 4 * SIZE(AA) + movapd %xmm3, 6 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) + movsd %xmm5, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm7, 1 * SIZE(CO1, %eax, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm3, 1 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L29 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $4, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 1 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movddup 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 3 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movddup 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 18 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 5 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 22 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movddup 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 26 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 7 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 30 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movddup 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd 34 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 36 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 9 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 38 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 48 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movddup 10 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 42 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 44 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 11 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 46 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 56 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movddup 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 50 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 52 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 13 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 54 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 64 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movddup 14 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 58 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 60 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 15 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 62 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 72 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movddup 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $15, %eax # if (k & 1) + BRANCH + je .L28 + +.L26: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BB), %xmm0 + movapd 2 * SIZE(BB), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#else + movapd 0 * SIZE(AA), %xmm1 + movapd 2 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm1 + subpd %xmm5, %xmm3 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 + movapd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm3 +#endif + +#ifdef LN + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef LT + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + movsd 2 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm2 + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm3 + + movsd 5 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd 6 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm2 + movsd 7 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm3 + + movsd 10 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm2 + movsd 11 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm3 + + movsd 15 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm3 +#endif + +#ifdef RT + movsd 15 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm3 + movsd 14 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm2 + movsd 13 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm1 + movsd 12 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm0 + + movsd 10 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm2 + movsd 9 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm1 + movsd 8 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm0 + + movsd 5 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd 4 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, 0 * SIZE(BB) + movapd %xmm1, 2 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm1, 1 * SIZE(AA) + movsd %xmm2, 2 * SIZE(AA) + movsd %xmm3, 3 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm1, 0 * SIZE(CO1, %eax, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L10 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_RT_4x2_core2.S b/kernel/x86/trsm_kernel_RT_4x2_core2.S new file mode 100644 index 0000000..866eddf --- /dev/null +++ b/kernel/x86/trsm_kernel_RT_4x2_core2.S @@ -0,0 +1,2100 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE2) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA 16 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC + movd STACK_OFFT, %mm4 + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK + movd %mm4, OFFSET + movd %mm4, KK + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + + sall $BASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + testl $1, %eax + jle .L100 + ALIGN_2 + +.L101: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal 16 * SIZE + BUFFER, BB + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L103 + ALIGN_4 + +.L102: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + movapd %xmm2, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + BRANCH + jne .L102 + ALIGN_2 + +.L103: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L105 + ALIGN_2 + +.L104: + movddup -16 * SIZE(B), %xmm0 + + movapd %xmm0, -16 * SIZE(BB) + addl $1 * SIZE, B + addl $2 * SIZE, BB + decl %eax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 # coffset = c +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movapd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + +.L111: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm4 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm6 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + mulpd -10 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm5 + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm7 + movapd -12 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm2 + mulpd -6 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm4 + movapd -4 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm6 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm2 + mulpd -2 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm5 + movapd 8 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm7 + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm3, %xmm0 + mulpd 2 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm6 + movapd -6 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm0 + mulpd 6 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm5 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movapd -4 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm2 + mulpd 10 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm4 + movapd 12 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm6 + movapd -2 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm2 + mulpd 14 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm5 + movapd 24 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm7 + movapd 8 * SIZE(BB), %xmm3 + + addl $ 32 * SIZE, AA + subl $-16 * SIZE, BB + decl %eax + jne .L111 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + +.L113: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm4 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm6 + movapd -14 * SIZE(BB), %xmm1 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + subl $1, %eax + jg .L113 + ALIGN_4 + +.L114: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm0 + movapd -14 * SIZE(B), %xmm1 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 +#endif + + subpd %xmm4, %xmm0 + subpd %xmm6, %xmm1 + +#ifdef LN + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movapd %xmm1, %xmm3 + unpckhpd %xmm3, %xmm3 + + movsd -1 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + movsd -2 * SIZE(AA), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + movsd -3 * SIZE(AA), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + movsd -4 * SIZE(AA), %xmm7 + mulsd %xmm3, %xmm7 + subsd %xmm7, %xmm0 + + movsd -6 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd -7 * SIZE(AA), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm2 + movsd -8 * SIZE(AA), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + + movsd -11 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd -12 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm2, %xmm0 + unpcklpd %xmm3, %xmm1 +#endif + +#ifdef LT + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movapd %xmm1, %xmm3 + unpckhpd %xmm3, %xmm3 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -15 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + movsd -14 * SIZE(AA), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + movsd -13 * SIZE(AA), %xmm7 + mulsd %xmm0, %xmm7 + subsd %xmm7, %xmm3 + + movsd -11 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + movsd -10 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm1 + movsd -9 * SIZE(AA), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + + movsd -6 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + movsd -5 * SIZE(AA), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + + movsd -1 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + unpcklpd %xmm2, %xmm0 + unpcklpd %xmm3, %xmm1 +#endif + +#if defined(RN) || defined(RT) + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(B) + movapd %xmm1, -14 * SIZE(B) + + movddup %xmm0, %xmm2 + movddup %xmm1, %xmm3 + + unpckhpd %xmm0, %xmm0 + unpckhpd %xmm1, %xmm1 + + movapd %xmm2, -16 * SIZE(BB) + movapd %xmm0, -14 * SIZE(BB) + movapd %xmm3, -12 * SIZE(BB) + movapd %xmm1, -10 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + BRANCH + decl %ebx # i -- + jg .L110 + ALIGN_2 + +.L130: + movl M, %ebx + testl $2, %ebx + jle .L150 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + movapd -8 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L132 + +.L131: + mulpd %xmm0, %xmm1 + movapd -14 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movapd -12 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd -10 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm2, %xmm3 + movapd -6 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm4 + movapd -6 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd -4 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm5 + movapd -4 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd -2 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm4 + movapd -2 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd 8 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + BRANCH + decl %eax + jne .L131 + +.L132: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L134 + +.L133: + mulpd %xmm0, %xmm1 + movapd -14 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L133 + ALIGN_4 + +.L134: + addpd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm0 +#else + movapd -16 * SIZE(AA), %xmm0 +#endif + + subpd %xmm4, %xmm0 + +#ifdef LN + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movsd -13 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd -14 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm2, %xmm0 +#endif + +#ifdef LT + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -15 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + movsd -13 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm2, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(B) + + movddup %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + + movapd %xmm1, -16 * SIZE(BB) + movapd %xmm0, -14 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L150: + movl M, %ebx + testl $1, %ebx + jle .L159 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movsd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -8 * SIZE(BB), %xmm3 + movsd -12 * SIZE(AA), %xmm2 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L152 + +.L151: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -14 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -14 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm5 + movsd -12 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -13 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -10 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -8 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm5 + movsd -0 * SIZE(BB), %xmm1 + mulsd %xmm2, %xmm3 + movsd -11 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm4 + movsd -6 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -10 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm5 + movsd -4 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -9 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm4 + movsd -2 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -4 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm5 + movsd 8 * SIZE(BB), %xmm3 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + BRANCH + decl %eax + jne .L151 + +.L152: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L154 + +.L153: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -14 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L153 + ALIGN_4 + +.L154: + addsd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm0 +#else + movsd -16 * SIZE(AA), %xmm0 +#endif + + subsd %xmm4, %xmm0 + +#if defined(LN) || defined(LT) + mulsd -16 * SIZE(AA), %xmm0 +#endif + +#if defined(RN) || defined(RT) + mulsd -16 * SIZE(B), %xmm0 +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(B) + + movsd %xmm0, -16 * SIZE(BB) + movsd %xmm0, -15 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L159: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 1), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_2 + +.L100: + movl N, %eax + sarl $1, %eax + movl %eax, J + jle .L999 + ALIGN_2 + +.L01: +/* Copying to Sub Buffer */ +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal 16 * SIZE + BUFFER, BB + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + ALIGN_2 + +.L02: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + movapd %xmm2, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + + addl $2 * SIZE, B + addl $4 * SIZE, %ecx + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 # coffset = c +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm3 + pxor %xmm6, %xmm6 +#ifdef LN + prefetcht2 -3 * SIZE(CO1) + pxor %xmm7, %xmm7 + prefetcht2 -3 * SIZE(CO1, LDC) +#else + prefetcht2 3 * SIZE(CO1) + pxor %xmm7, %xmm7 + prefetcht2 3 * SIZE(CO1, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd -12 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd -8 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd -6 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd -6 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + movapd -4 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd -4 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd -2 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd -2 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + movapd 8 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + movapd 0 * SIZE(BB), %xmm1 + + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd 4 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd 6 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd 8 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd 10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd 10 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm2, %xmm6 + movapd 12 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm7 + + movapd 12 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd 14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd 14 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + subl $-32 * SIZE, BB + movapd 24 * SIZE(AA), %xmm3 + subl $-32 * SIZE, AA + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + movapd -16 * SIZE(BB), %xmm1 + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + movapd -12 * SIZE(BB), %xmm1 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd -16 * SIZE(B), %xmm2 + movapd -14 * SIZE(B), %xmm3 + movapd -12 * SIZE(B), %xmm5 + movapd -10 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 + subpd %xmm6, %xmm5 + subpd %xmm1, %xmm7 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 + movapd -12 * SIZE(AA), %xmm2 + movapd -10 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm6, %xmm1 + subpd %xmm5, %xmm2 + subpd %xmm7, %xmm3 +#endif + +#ifdef LN + movddup -1 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm7 + movddup -2 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm5 + movddup -3 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm3 + movddup -4 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm2 + + movddup -6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm5 + movddup -7 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm3 + movddup -8 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm2 + + movddup -11 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movddup -12 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm2 + + movddup -16 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + movddup -15 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm3 + movddup -14 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm5 + movddup -13 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm7 + + movddup -11 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movddup -10 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm5 + movddup -9 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm7 + + movddup -6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm5 + movddup -5 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm7 + + movddup -1 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 + + movddup -15 * SIZE(B), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm2 + mulpd %xmm1, %xmm5 + subpd %xmm5, %xmm3 + + movddup -13 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movddup -13 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm3 + + movddup -14 * SIZE(B), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + mulpd %xmm3, %xmm5 + subpd %xmm5, %xmm1 + + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movsd %xmm5, 2 * SIZE(CO1) + movsd %xmm7, 3 * SIZE(CO1) + + movhpd %xmm2, 0 * SIZE(CO1, LDC) + movhpd %xmm3, 1 * SIZE(CO1, LDC) + movhpd %xmm5, 2 * SIZE(CO1, LDC) + movhpd %xmm7, 3 * SIZE(CO1, LDC) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO1, LDC) + movhpd %xmm2, 1 * SIZE(CO1, LDC) + movsd %xmm3, 2 * SIZE(CO1, LDC) + movhpd %xmm3, 3 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, -16 * SIZE(B) + movapd %xmm3, -14 * SIZE(B) + movapd %xmm5, -12 * SIZE(B) + movapd %xmm7, -10 * SIZE(B) + + movddup %xmm2, %xmm0 + movddup %xmm3, %xmm1 + movddup %xmm5, %xmm4 + movddup %xmm7, %xmm6 + + unpckhpd %xmm2, %xmm2 + unpckhpd %xmm3, %xmm3 + unpckhpd %xmm5, %xmm5 + unpckhpd %xmm7, %xmm7 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm2, -14 * SIZE(BB) + movapd %xmm1, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) + movapd %xmm2, -12 * SIZE(AA) + movapd %xmm3, -10 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L30: + movl M, %ebx + testl $2, %ebx + jle .L50 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movapd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L32 + +.L31: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm6 + movapd 0 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm7 + movapd -12 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -6 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd -4 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -2 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 0 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm1 + mulpd 2 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm4 + movapd 4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm5 + movapd -6 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 6 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm6 + movapd 16 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm7 + movapd -4 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 10 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm5 + movapd -2 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 14 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm7 + movapd 8 * SIZE(AA), %xmm2 + + subl $-16 * SIZE, AA + addl $ 32 * SIZE, BB + decl %eax + jne .L31 + +.L32: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L34 + +.L33: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L33 + ALIGN_4 + +.L34: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd -16 * SIZE(B), %xmm2 + movapd -14 * SIZE(B), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movddup -13 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movddup -14 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm2 + movddup -16 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + movddup -15 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm3 + movddup -13 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + movddup -15 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + movddup -13 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movddup -13 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + movddup -14 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC) + movhpd %xmm3, 1 * SIZE(CO1, LDC) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC) + movhpd %xmm1, 1 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, -16 * SIZE(B) + movapd %xmm3, -14 * SIZE(B) + + movddup %xmm2, %xmm0 + movddup %xmm3, %xmm1 + + unpckhpd %xmm2, %xmm2 + unpckhpd %xmm3, %xmm3 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm2, -14 * SIZE(BB) + movapd %xmm1, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L50: + movl M, %ebx + testl $1, %ebx + jle .L99 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movsd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -12 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movsd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L52 + +.L51: + mulsd %xmm0, %xmm1 + mulsd -14 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm4 + movsd -12 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm5 + movsd -15 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm1 + mulsd -10 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm6 + movsd 0 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm7 + movsd -14 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd -6 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd -4 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd -13 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd -2 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movsd 8 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movsd -8 * SIZE(AA), %xmm0 + mulsd %xmm2, %xmm1 + mulsd 2 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm4 + movsd 4 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm5 + movsd -11 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm1 + mulsd 6 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm6 + movsd 16 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm7 + movsd -10 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm3 + mulsd 10 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm4 + movsd 12 * SIZE(BB), %xmm3 + addsd %xmm2, %xmm5 + movsd -9 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm3 + mulsd 14 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm6 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm2, %xmm7 + movsd -4 * SIZE(AA), %xmm2 + + subl $-8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L51 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulsd %xmm0, %xmm1 + mulsd -14 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm4 + movsd -12 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm5 + movsd -15 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + BRANCH + jg .L53 + ALIGN_4 + +.L54: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm0 + movsd -15 * SIZE(B), %xmm1 +#else + movsd -16 * SIZE(AA), %xmm0 + movsd -15 * SIZE(AA), %xmm1 +#endif + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm0 + mulsd %xmm2, %xmm1 +#endif + +#ifdef RN + mulsd -16 * SIZE(B), %xmm0 + movsd -15 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + mulsd -13 * SIZE(B), %xmm1 +#endif + +#ifdef RT + mulsd -13 * SIZE(B), %xmm1 + movsd -14 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + mulsd -16 * SIZE(B), %xmm0 +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC) + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(B) + movsd %xmm1, -15 * SIZE(B) + + movsd %xmm0, -16 * SIZE(BB) + movsd %xmm0, -15 * SIZE(BB) + movsd %xmm1, -14 * SIZE(BB) + movsd %xmm1, -13 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) + movsd %xmm1, -15 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L99: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_2 + + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_RT_4x2_sse2.S b/kernel/x86/trsm_kernel_RT_4x2_sse2.S new file mode 100644 index 0000000..68b52ba --- /dev/null +++ b/kernel/x86/trsm_kernel_RT_4x2_sse2.S @@ -0,0 +1,2282 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE2) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA 16 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define AA %edx +#define BB %ecx + +#define PREFETCHSIZE (8 * 4) + +#define KERNEL1(address) \ + movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL4(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL5(address) \ + movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 20 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL6(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 32 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL7(address) \ + movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 28 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 40 * SIZE + (address) * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC + movd STACK_OFFT, %mm4 + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK + movd %mm4, OFFSET + movd %mm4, KK + + sall $BASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + testl $1, %eax + jle .L100 + ALIGN_2 + +.L101: +/* Copying to Sub Buffer */ +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L103 + ALIGN_4 + +.L102: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + BRANCH + jne .L102 + ALIGN_2 + +.L103: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L105 + ALIGN_2 + +.L104: + movsd 0 * SIZE(B), %xmm0 + + unpcklpd %xmm0, %xmm0 + + movapd %xmm0, 0 * SIZE(%ecx) + + addl $1 * SIZE, B + addl $2 * SIZE, %ecx + decl %eax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, %esi # coffset = c +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + +.L111: + mulpd %xmm2, %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + movapd 2 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm0 + mulpd 6 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm5 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 10 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm4 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm6 + movapd 6 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 14 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm5 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movapd 16 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm0 + mulpd 18 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm4 + movapd 20 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm6 + movapd 10 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm0 + mulpd 22 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm5 + movapd 32 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movapd 12 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm1 + mulpd 26 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm4 + movapd 28 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm6 + movapd 14 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm1 + mulpd 30 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm5 + movapd 40 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movapd 24 * SIZE(BB), %xmm3 + + addl $32 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L111 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + +.L113: + mulpd %xmm2, %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + movapd 2 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L113 + ALIGN_4 + +.L114: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm0 + movapd 2 * SIZE(B), %xmm1 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 +#endif + + subpd %xmm4, %xmm0 + subpd %xmm6, %xmm1 + +#ifdef LN + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movapd %xmm1, %xmm3 + unpckhpd %xmm3, %xmm3 + + movsd 15 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 14 * SIZE(AA), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + movsd 13 * SIZE(AA), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + movsd 12 * SIZE(AA), %xmm7 + mulsd %xmm3, %xmm7 + subsd %xmm7, %xmm0 + + movsd 10 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 9 * SIZE(AA), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm2 + movsd 8 * SIZE(AA), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + + movsd 5 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 4 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm2, %xmm0 + unpcklpd %xmm3, %xmm1 +#endif + +#ifdef LT + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movapd %xmm1, %xmm3 + unpckhpd %xmm3, %xmm3 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 1 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + movsd 2 * SIZE(AA), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + movsd 3 * SIZE(AA), %xmm7 + mulsd %xmm0, %xmm7 + subsd %xmm7, %xmm3 + + movsd 5 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 6 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm1 + movsd 7 * SIZE(AA), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + + movsd 10 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 11 * SIZE(AA), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + + movsd 15 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + unpcklpd %xmm2, %xmm0 + unpcklpd %xmm3, %xmm1 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, 0 * SIZE(B) + movapd %xmm1, 2 * SIZE(B) + + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm0, 1 * SIZE(BB) + movhpd %xmm0, 2 * SIZE(BB) + movhpd %xmm0, 3 * SIZE(BB) + movsd %xmm1, 4 * SIZE(BB) + movsd %xmm1, 5 * SIZE(BB) + movhpd %xmm1, 6 * SIZE(BB) + movhpd %xmm1, 7 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, %esi +#endif + + movsd %xmm0, 0 * SIZE(%esi) + movhpd %xmm0, 1 * SIZE(%esi) + movsd %xmm1, 2 * SIZE(%esi) + movhpd %xmm1, 3 * SIZE(%esi) + +#ifndef LN + addl $4 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + BRANCH + decl %ebx # i -- + jg .L110 + ALIGN_2 + +.L130: + movl M, %ebx + testl $2, %ebx + jle .L150 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L132 + +.L131: + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm1 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + BRANCH + decl %eax + jne .L131 + +.L132: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L134 + +.L133: + movapd 0 * SIZE(AA), %xmm0 + mulpd 0 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + + addl $2 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L133 + ALIGN_4 + +.L134: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm0 +#else + movapd 0 * SIZE(AA), %xmm0 +#endif + + subpd %xmm4, %xmm0 + +#ifdef LN + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 2 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm2, %xmm0 +#endif + +#ifdef LT + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 1 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm2, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, 0 * SIZE(B) + + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm0, 1 * SIZE(BB) + movhpd %xmm0, 2 * SIZE(BB) + movhpd %xmm0, 3 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, %esi +#endif + + movsd %xmm0, 0 * SIZE(%esi) + movhpd %xmm0, 1 * SIZE(%esi) + +#ifndef LN + addl $2 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L150: + movl M, %ebx + testl $1, %ebx + jle .L159 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA +#endif + + leal BUFFER, BB + + movsd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movsd 4 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L152 + +.L151: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + mulsd 2 * SIZE(BB), %xmm0 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + movsd 3 * SIZE(AA), %xmm0 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + movsd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm3 + movsd 5 * SIZE(AA), %xmm1 + addsd %xmm3, %xmm4 + mulsd 10 * SIZE(BB), %xmm1 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm4 + movsd 6 * SIZE(AA), %xmm1 + mulsd 12 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm4 + movsd 7 * SIZE(AA), %xmm1 + mulsd 14 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm4 + movsd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $16 * SIZE, BB + BRANCH + decl %eax + jne .L151 + +.L152: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L154 + +.L153: + movsd 0 * SIZE(AA), %xmm0 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + + addl $1 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L153 + ALIGN_4 + +.L154: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm0 +#else + movsd 0 * SIZE(AA), %xmm0 +#endif + + subsd %xmm4, %xmm0 + +#if defined(LN) || defined(LT) + mulsd 0 * SIZE(AA), %xmm0 +#endif + +#if defined(RN) || defined(RT) + mulsd 0 * SIZE(B), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(B) + + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm0, 1 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, %esi +#endif + + movsd %xmm0, 0 * SIZE(%esi) + +#ifndef LN + addl $1 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L159: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 1), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_2 + +.L100: + movl N, %eax + sarl $1, %eax # j = (n >> 1) + movl %eax, J + jle .L999 + ALIGN_2 + +.L01: +/* Copying to Sub Buffer */ +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + ALIGN_2 + +.L02: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + + addl $2 * SIZE, B + addl $4 * SIZE, %ecx + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, %esi # coffset = c +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + + prefetcht2 4 * SIZE(%esi) + prefetcht2 4 * SIZE(%esi, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + +#ifdef PENTIUM4 + andl $-8, %eax + NOBRANCH + je .L12 + sall $3, %eax + +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $64 * 1, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $64 * 2, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $64 * 3, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $64 * 4, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $64 * 5, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $64 * 6, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $64 * 7, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $64 * 4 * SIZE, AA + addl $64 * 4 * SIZE, BB + subl $64 * 8, %eax + BRANCH + jg .L1X + +.L11: + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB + +#else + sarl $3, %eax + je .L12 + +.L11: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + + addl $32 * SIZE, %ecx + addl $32 * SIZE, %edx + decl %eax + jne .L11 +#endif + +.L12: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + + je .L14 + +.L13: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 0 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L13 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm3 + movapd 4 * SIZE(B), %xmm5 + movapd 6 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 + subpd %xmm6, %xmm5 + subpd %xmm1, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + movapd 4 * SIZE(AA), %xmm2 + movapd 6 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm6, %xmm1 + subpd %xmm5, %xmm2 + subpd %xmm7, %xmm3 +#endif + +#ifdef LN + movsd 15 * SIZE(AA), %xmm0 + movhpd 15 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm7 + movsd 14 * SIZE(AA), %xmm0 + movhpd 14 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm5 + movsd 13 * SIZE(AA), %xmm0 + movhpd 13 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm3 + movsd 12 * SIZE(AA), %xmm0 + movhpd 12 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm2 + + movsd 10 * SIZE(AA), %xmm0 + movhpd 10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm5 + movsd 9 * SIZE(AA), %xmm0 + movhpd 9 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm3 + movsd 8 * SIZE(AA), %xmm0 + movhpd 8 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm2 + + movsd 5 * SIZE(AA), %xmm0 + movhpd 5 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + movhpd 4 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm2 + + movsd 0 * SIZE(AA), %xmm0 + movhpd 0 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm0 + movhpd 0 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + movhpd 1 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm3 + movsd 2 * SIZE(AA), %xmm0 + movhpd 2 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm5 + movsd 3 * SIZE(AA), %xmm0 + movhpd 3 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm7 + + movsd 5 * SIZE(AA), %xmm0 + movhpd 5 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + + movsd 6 * SIZE(AA), %xmm0 + movhpd 6 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm5 + movsd 7 * SIZE(AA), %xmm0 + movhpd 7 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm7 + + movsd 10 * SIZE(AA), %xmm0 + movhpd 10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm5 + movsd 11 * SIZE(AA), %xmm0 + movhpd 11 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm7 + + movsd 15 * SIZE(AA), %xmm0 + movhpd 15 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm7 +#endif + +#ifdef RN + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 + + movsd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm2 + movsd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm3 + + movsd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movsd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm3 + + movsd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + movsd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm1 + + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + movapd %xmm5, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + + movsd %xmm2, 0 * SIZE(BB) + movsd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movsd %xmm3, 4 * SIZE(BB) + movsd %xmm3, 5 * SIZE(BB) + movhpd %xmm3, 6 * SIZE(BB) + movhpd %xmm3, 7 * SIZE(BB) + movsd %xmm5, 8 * SIZE(BB) + movsd %xmm5, 9 * SIZE(BB) + movhpd %xmm5, 10 * SIZE(BB) + movhpd %xmm5, 11 * SIZE(BB) + movsd %xmm7, 12 * SIZE(BB) + movsd %xmm7, 13 * SIZE(BB) + movhpd %xmm7, 14 * SIZE(BB) + movhpd %xmm7, 15 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) + movapd %xmm2, 4 * SIZE(AA) + movapd %xmm3, 6 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, %esi +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(%esi) + movsd %xmm3, 1 * SIZE(%esi) + movsd %xmm5, 2 * SIZE(%esi) + movsd %xmm7, 3 * SIZE(%esi) + + movhpd %xmm2, 0 * SIZE(%esi, LDC) + movhpd %xmm3, 1 * SIZE(%esi, LDC) + movhpd %xmm5, 2 * SIZE(%esi, LDC) + movhpd %xmm7, 3 * SIZE(%esi, LDC) +#else + movsd %xmm0, 0 * SIZE(%esi) + movhpd %xmm0, 1 * SIZE(%esi) + movsd %xmm1, 2 * SIZE(%esi) + movhpd %xmm1, 3 * SIZE(%esi) + + movsd %xmm2, 0 * SIZE(%esi, LDC) + movhpd %xmm2, 1 * SIZE(%esi, LDC) + movsd %xmm3, 2 * SIZE(%esi, LDC) + movhpd %xmm3, 3 * SIZE(%esi, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L30: + movl M, %ebx + testl $2, %ebx + jle .L50 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L32 + +.L31: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd 18 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movapd 10 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movapd 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 26 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 14 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + BRANCH + decl %eax + jne .L31 + +.L32: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L34 + +.L33: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L33 + ALIGN_4 + +.L34: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movsd 3 * SIZE(AA), %xmm0 + movhpd 3 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + + movsd 2 * SIZE(AA), %xmm0 + movhpd 2 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm2 + + movsd 0 * SIZE(AA), %xmm0 + movhpd 0 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm0 + movhpd 0 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + movhpd 1 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm3 + movsd 3 * SIZE(AA), %xmm0 + movhpd 3 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RN + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + + movsd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + + movsd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movsd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + + movsd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + + movsd %xmm2, 0 * SIZE(BB) + movsd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movsd %xmm3, 4 * SIZE(BB) + movsd %xmm3, 5 * SIZE(BB) + movhpd %xmm3, 6 * SIZE(BB) + movhpd %xmm3, 7 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, %esi +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(%esi) + movsd %xmm3, 1 * SIZE(%esi) + movhpd %xmm2, 0 * SIZE(%esi, LDC) + movhpd %xmm3, 1 * SIZE(%esi, LDC) +#else + movsd %xmm0, 0 * SIZE(%esi) + movhpd %xmm0, 1 * SIZE(%esi) + movsd %xmm1, 0 * SIZE(%esi, LDC) + movhpd %xmm1, 1 * SIZE(%esi, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L50: + movl M, %ebx + testl $1, %ebx + jle .L99 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA +#endif + + leal BUFFER, %ecx + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movsd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movsd 4 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L52 + +.L51: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 2 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd 10 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd 12 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd 3 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm2 + mulsd 18 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movsd 20 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movsd 5 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm2 + mulsd 22 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movsd 32 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movsd 6 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + mulsd 26 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movsd 28 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movsd 7 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + mulsd 30 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movsd 40 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movsd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + BRANCH + decl %eax + jne .L51 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L53 + ALIGN_4 + +.L54: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 +#else + movsd 0 * SIZE(AA), %xmm0 + movsd 1 * SIZE(AA), %xmm1 +#endif + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm0 + mulsd %xmm2, %xmm1 +#endif + +#ifdef RN + mulsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + mulsd 3 * SIZE(B), %xmm1 +#endif + +#ifdef RT + mulsd 3 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + mulsd 0 * SIZE(B), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(B) + movsd %xmm1, 1 * SIZE(B) + + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm0, 1 * SIZE(BB) + movsd %xmm1, 2 * SIZE(BB) + movsd %xmm1, 3 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, %esi +#endif + + movsd %xmm0, 0 * SIZE(%esi) + movsd %xmm1, 0 * SIZE(%esi, LDC) + +#ifndef LN + addl $1 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L99: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_2 + + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_RT_4x4_penryn.S b/kernel/x86/trsm_kernel_RT_4x4_penryn.S new file mode 100644 index 0000000..40a9604 --- /dev/null +++ b/kernel/x86/trsm_kernel_RT_4x4_penryn.S @@ -0,0 +1,3128 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 20 + STACK + ARGS(%esp) +#define ARG_B 24 + STACK + ARGS(%esp) +#define C 28 + STACK + ARGS(%esp) +#define ARG_LDC 32 + STACK + ARGS(%esp) +#define OFFSET 36 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 21 + 4) +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 21 + 4) +#endif + +#ifdef ATOM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 8 + 4) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (16 * 2) +#endif + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + leal (, LDC, SIZE), LDC + + subl $-32 * SIZE, A + subl $-32 * SIZE, B + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + testl $1, N + je .L40 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movsd -32 * SIZE(BB), %xmm1 + + pxor %xmm4, %xmm4 +#ifdef LN + prefetcht0 -4 * SIZE(CO1) +#else + prefetcht0 3 * SIZE(CO1) +#endif + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -26 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + subl $-32 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + movss -31 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), BB +#endif + + addps %xmm2, %xmm4 + addps %xmm5, %xmm4 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movss -32 * SIZE(BB), %xmm1 + movss -31 * SIZE(BB), %xmm3 + movss -30 * SIZE(BB), %xmm5 + movss -29 * SIZE(BB), %xmm7 + + subss %xmm4, %xmm1 + subss %xmm6, %xmm3 + subss %xmm0, %xmm5 + subss %xmm2, %xmm7 +#else + movaps -32 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm1 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm1 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm1 + + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm7 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm7 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm7 + + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm7 +#endif + +#if defined(RN) || defined(RT) + movss -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, -32 * SIZE(BB) + movss %xmm3, -31 * SIZE(BB) + movss %xmm5, -30 * SIZE(BB) + movss %xmm7, -29 * SIZE(BB) +#else + movaps %xmm0, -32 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm1 + unpcklps %xmm7, %xmm3 + + unpcklps %xmm3, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $4, KK +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L91 + ALIGN_4 + +.L100: + testl $2, M + je .L110 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm3, %xmm3 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L105 + ALIGN_4 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -22 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -26 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -20 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -18 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + subl $-16 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + pshufd $0x00, %xmm1, %xmm2 + movss -31 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L106 + ALIGN_4 + +.L108: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(LT) + pshufd $1, %xmm4, %xmm6 + + movss -32 * SIZE(BB), %xmm1 + movss -31 * SIZE(BB), %xmm3 + + subss %xmm4, %xmm1 + subss %xmm6, %xmm3 +#else + movsd -32 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movsd -32 * SIZE(AA), %xmm4 + movhps -30 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movss -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, -32 * SIZE(BB) + movss %xmm3, -31 * SIZE(BB) +#else + movlps %xmm0, -32 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) + movss %xmm3, 1 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L110: + testl $1, M + je .L119 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd -32 * SIZE(BB), %xmm1 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L115 + ALIGN_4 + +.L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -28 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -28 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -26 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -26 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -24 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -24 * SIZE(BB), %xmm1 + + subl $-8 * SIZE, AA + subl $-8 * SIZE, BB + + subl $1, %eax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulss %xmm0, %xmm1 + movss -31 * SIZE(AA), %xmm0 + addss %xmm1, %xmm4 + movss -31 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L116 + ALIGN_4 + +.L118: +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + + leal (AA, %eax, SIZE), AA + leal (B, %eax, SIZE), BB +#endif + + haddps %xmm4, %xmm4 + +#if defined(LN) || defined(LT) + movss -32 * SIZE(BB), %xmm1 + subss %xmm4, %xmm1 +#else + movss -32 * SIZE(AA), %xmm0 + subss %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + mulss -32 * SIZE(AA), %xmm1 +#endif + +#if defined(RN) || defined(RT) + mulss -32 * SIZE(BB), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, -32 * SIZE(BB) +#else + movss %xmm0, -32 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) +#else + movss %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA + leal (BB, %eax, SIZE), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L119: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L40: + testl $2, N + je .L80 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + +#ifdef LN + pxor %xmm4, %xmm4 + prefetcht0 -4 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 -4 * SIZE(CO1, LDC) +#else + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 3 * SIZE(CO1, LDC) +#endif + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps 0 * SIZE(AA), %xmm0 + + subl $-32 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 2), BB +#endif + + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + addps %xmm2, %xmm4 + addps %xmm3, %xmm5 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movsd -32 * SIZE(BB), %xmm1 + movsd -30 * SIZE(BB), %xmm3 + movsd -28 * SIZE(BB), %xmm5 + movsd -26 * SIZE(BB), %xmm7 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 + subps %xmm0, %xmm5 + subps %xmm2, %xmm7 +#else + movaps -32 * SIZE(AA), %xmm0 + movaps -28 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 +#endif + +#ifdef LN + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm1 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm1 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm7 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm7 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm7 + + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, -32 * SIZE(BB) + movlps %xmm3, -30 * SIZE(BB) + movlps %xmm5, -28 * SIZE(BB) + movlps %xmm7, -26 * SIZE(BB) +#else + movaps %xmm0, -32 * SIZE(AA) + movaps %xmm1, -28 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm1 + unpcklps %xmm7, %xmm3 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movhps %xmm2, 2 * SIZE(CO1, LDC, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $4, KK +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L51 + ALIGN_4 + +.L60: + testl $2, M + je .L70 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm3, %xmm3 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + pshufd $0x44, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + + addps %xmm3, %xmm4 + addps %xmm5, %xmm4 + + movhlps %xmm4, %xmm5 + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movsd -32 * SIZE(BB), %xmm1 + movsd -30 * SIZE(BB), %xmm3 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 +#else + movsd -32 * SIZE(AA), %xmm0 + movsd -30 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 +#endif + +#ifdef LN + movaps -32 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, -32 * SIZE(BB) + movlps %xmm3, -30 * SIZE(BB) +#else + movlps %xmm0, -32 * SIZE(AA) + movlps %xmm1, -30 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm3, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 0 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L70: + testl $1, M + je .L79 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd -32 * SIZE(BB), %xmm1 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -28 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -26 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -24 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -22 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -20 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -18 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -16 * SIZE(BB), %xmm1 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + pshufd $0x00, %xmm0, %xmm2 + movss -31 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + + addps %xmm5, %xmm4 + + pshufd $0x55, %xmm4, %xmm5 + pshufd $0x00, %xmm4, %xmm4 + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm4 + + movsd -32 * SIZE(BB), %xmm1 + + subps %xmm4, %xmm1 +#else + movss -32 * SIZE(AA), %xmm0 + movss -31 * SIZE(AA), %xmm1 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movss -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, -32 * SIZE(BB) +#else + movss %xmm0, -32 * SIZE(AA) + movss %xmm1, -31 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + pshufd $1, %xmm1, %xmm3 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm3, 0 * SIZE(CO1, LDC) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L80: + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L999 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + leal (CO1, LDC, 2), %eax + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + +#ifdef LN + pxor %xmm4, %xmm4 + prefetcht0 -4 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 -4 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + prefetcht0 -4 * SIZE(%eax) + pxor %xmm7, %xmm7 + prefetcht0 -4 * SIZE(%eax, LDC) +#else + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 3 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + prefetcht0 3 * SIZE(%eax) + pxor %xmm7, %xmm7 + prefetcht0 3 * SIZE(%eax, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + subl $-32 * SIZE, BB + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + subl $-32 * SIZE, AA + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -32 * SIZE(AA), %xmm0 + + subl $1, %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 4), BB +#endif + + addps %xmm3, %xmm6 + addps %xmm2, %xmm7 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm4 + + movaps %xmm6, %xmm2 + unpcklps %xmm5, %xmm2 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm1 + movlhps %xmm2, %xmm0 + movhlps %xmm2, %xmm1 + + movaps %xmm6, %xmm7 + movlhps %xmm4, %xmm6 + movhlps %xmm4, %xmm7 + + pshufd $0x39, %xmm1, %xmm2 + pshufd $0x39, %xmm7, %xmm4 + + movaps -32 * SIZE(BB), %xmm1 + movaps -28 * SIZE(BB), %xmm3 + movaps -24 * SIZE(BB), %xmm5 + movaps -20 * SIZE(BB), %xmm7 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm3 + subps %xmm6, %xmm5 + subps %xmm4, %xmm7 +#else + pshufd $0x39, %xmm5, %xmm2 + pshufd $0x4e, %xmm6, %xmm0 + pshufd $0x93, %xmm7, %xmm7 + + movaps %xmm4, %xmm6 + unpcklps %xmm0, %xmm4 + unpckhps %xmm0, %xmm6 + + movaps %xmm2, %xmm1 + unpcklps %xmm7, %xmm2 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm5 + unpcklps %xmm2, %xmm4 + unpckhps %xmm2, %xmm5 + + movaps %xmm6, %xmm7 + unpcklps %xmm1, %xmm6 + unpckhps %xmm1, %xmm7 + + pshufd $0x93, %xmm5, %xmm5 + pshufd $0x4e, %xmm6, %xmm6 + pshufd $0x39, %xmm7, %xmm7 + + movaps -32 * SIZE(AA), %xmm0 + movaps -28 * SIZE(AA), %xmm1 + movaps -24 * SIZE(AA), %xmm2 + movaps -20 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 + subps %xmm6, %xmm2 + subps %xmm7, %xmm3 +#endif + +#ifdef LN + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm1 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm1 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm7 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm7 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm7 + + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm3 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm3 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm3 + + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm0 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm0 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, -32 * SIZE(BB) + movaps %xmm3, -28 * SIZE(BB) + movaps %xmm5, -24 * SIZE(BB) + movaps %xmm7, -20 * SIZE(BB) +#else + movaps %xmm0, -32 * SIZE(AA) + movaps %xmm1, -28 * SIZE(AA) + movaps %xmm2, -24 * SIZE(AA) + movaps %xmm3, -20 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movhps %xmm2, 2 * SIZE(CO1, LDC, 1) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 2 * SIZE(CO1, LDC, 2) + movlps %xmm6, 0 * SIZE(CO1, %eax, 1) + movhps %xmm6, 2 * SIZE(CO1, %eax, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 2 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO1, %eax, 1) + movhps %xmm3, 2 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $4, KK +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + testl $2, M + je .L30 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + pshufd $0xee, %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + pshufd $0xee, %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + pshufd $0xee, %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + pshufd $0xee, %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps 0 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + subl $-16 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + pshufd $0x44, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), BB +#endif + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + + movhlps %xmm4, %xmm5 + movhlps %xmm6, %xmm7 + + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps -32 * SIZE(BB), %xmm1 + movaps -28 * SIZE(BB), %xmm3 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 +#else + movsd -32 * SIZE(AA), %xmm0 + movsd -30 * SIZE(AA), %xmm1 + movsd -28 * SIZE(AA), %xmm2 + movsd -26 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 + subps %xmm6, %xmm2 + subps %xmm7, %xmm3 +#endif + +#ifdef LN + movaps -32 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm3 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm3 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm3 + + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm0 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm0 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, -32 * SIZE(BB) + movaps %xmm3, -28 * SIZE(BB) +#else + movlps %xmm0, -32 * SIZE(AA) + movlps %xmm1, -30 * SIZE(AA) + movlps %xmm2, -28 * SIZE(AA) + movlps %xmm3, -26 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movlps %xmm6, 0 * SIZE(CO1, %eax, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L30: + testl $1, M + je .L39 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps 0 * SIZE(BB), %xmm1 + + subl $ -8 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + pshufd $0x00, %xmm0, %xmm2 + movss -31 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BB), %xmm1 + + subps %xmm4, %xmm1 +#else + movsd -32 * SIZE(AA), %xmm0 + movhps -30 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm3 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm3 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm3 + + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm0 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm0 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm0 + + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, -32 * SIZE(BB) +#else + movss %xmm0, -32 * SIZE(AA) + movss %xmm1, -31 * SIZE(AA) + movss %xmm2, -30 * SIZE(AA) + movss %xmm3, -29 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm2, 0 * SIZE(CO1, LDC, 1) + movss %xmm0, 0 * SIZE(CO1, LDC, 2) + movss %xmm6, 0 * SIZE(CO1, %eax, 1) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO1, LDC, 1) + movss %xmm2, 0 * SIZE(CO1, LDC, 2) + movss %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L10 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_RT_4x4_sse.S b/kernel/x86/trsm_kernel_RT_4x4_sse.S new file mode 100644 index 0000000..0d2fcb6 --- /dev/null +++ b/kernel/x86/trsm_kernel_RT_4x4_sse.S @@ -0,0 +1,3683 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 + +#define OLD_M 4 + STACK(%esi) +#define OLD_N 8 + STACK(%esi) +#define OLD_K 12 + STACK(%esi) +#define OLD_A 20 + STACK(%esi) +#define OLD_B 24 + STACK(%esi) +#define OLD_C 28 + STACK(%esi) +#define OLD_LDC 32 + STACK(%esi) +#define STACK_OFFT 36 + STACK(%esi) + +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 10 + 8) +#endif + +#if defined(PENTIUM4) || defined(PENTIUMM) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + +#if defined(OPTERON) || !defined(HAVE_SSE2) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi + + subl $128 + LOCAL_BUFFER_SIZE, %esp + andl $-1024, %esp + + STACK_TOUCHING + + movl OLD_M, %ebx + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + movss STACK_OFFT, %xmm4 + + movl OLD_B, B + movl OLD_C, %ebx + + movl %ebx, C + movl OLD_LDC, LDC + + movss %xmm4, OFFSET + movss %xmm4, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + testl $1, N + je .L40 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L85 + ALIGN_4 + +.L82: + movsd 0 * SIZE(B), %xmm3 + movhps 2 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm7 + movhps 6 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L90 + ALIGN_4 + +.L86: + movss 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + + movaps %xmm0, 0 * SIZE(BB) + + addl $1 * SIZE, B + addl $4 * SIZE, BB + decl %eax + jne .L86 + ALIGN_4 + +.L90: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + PREFETCHW 3 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + mulps 8 * SIZE(BB), %xmm0 + addps %xmm0, %xmm6 + movaps 12 * SIZE(AA), %xmm0 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 48 * SIZE(BB), %xmm3 + mulps 20 * SIZE(BB), %xmm1 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + mulps 24 * SIZE(BB), %xmm1 + addps %xmm1, %xmm6 + movaps 28 * SIZE(AA), %xmm1 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(AA), %xmm0 + movaps 4 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ BASE_SHIFT, %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm3 + movss 2 * SIZE(B), %xmm5 + movss 3 * SIZE(B), %xmm7 + + subss %xmm4, %xmm1 + subss %xmm6, %xmm3 + subss %xmm0, %xmm5 + subss %xmm2, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm1 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm1 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm1 + + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm7 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm7 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm7 + + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm7 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm3, 1 * SIZE(B) + movss %xmm5, 2 * SIZE(B) + movss %xmm7, 3 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + movaps %xmm0, 0 * SIZE(BB) + pshufd $0x00, %xmm3, %xmm0 + movaps %xmm0, 4 * SIZE(BB) + + pshufd $0x00, %xmm5, %xmm0 + movaps %xmm0, 8 * SIZE(BB) + pshufd $0x00, %xmm7, %xmm0 + movaps %xmm0, 12 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm1 + unpcklps %xmm7, %xmm3 + + unpcklps %xmm3, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L91 + ALIGN_4 + +.L100: + testl $2, M + je .L110 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L105 + ALIGN_4 + +.L102: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L106 + ALIGN_4 + +.L108: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + pshufd $1, %xmm4, %xmm6 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm3 + + subss %xmm4, %xmm1 + subss %xmm6, %xmm3 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movaps 0 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm3, 1 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + movaps %xmm0, 0 * SIZE(BB) + pshufd $0x00, %xmm3, %xmm0 + movaps %xmm0, 4 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) + movss %xmm3, 1 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L110: + testl $1, M + je .L119 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movss 0 * SIZE(AA), %xmm0 + movss 4 * SIZE(AA), %xmm1 + movss 0 * SIZE(BB), %xmm2 + movss 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L115 + ALIGN_4 + +.L112: + mulss %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 32 * SIZE(BB), %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm0, %xmm5 + movss 2 * SIZE(AA), %xmm0 + mulss 8 * SIZE(BB), %xmm0 + addss %xmm0, %xmm6 + movss 3 * SIZE(AA), %xmm0 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + movss 48 * SIZE(BB), %xmm3 + mulss 20 * SIZE(BB), %xmm1 + addss %xmm1, %xmm5 + movss 6 * SIZE(AA), %xmm1 + mulss 24 * SIZE(BB), %xmm1 + addss %xmm1, %xmm6 + movss 7 * SIZE(AA), %xmm1 + mulss 28 * SIZE(BB), %xmm1 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 4 * SIZE(BB), %xmm2 + + addl $ 1 * SIZE, AA + addl $ 4 * SIZE, BB + decl %eax + jg .L116 + ALIGN_4 + +.L118: + addss %xmm5, %xmm4 + addss %xmm7, %xmm6 + addss %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(B), %xmm1 + subss %xmm4, %xmm1 +#else + movss 0 * SIZE(AA), %xmm0 + subss %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + mulss 0 * SIZE(AA), %xmm1 +#endif + +#if defined(RN) || defined(RT) + mulss 0 * SIZE(B), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + movaps %xmm0, 0 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) +#else + movss %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L119: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (B, %eax, SIZE), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L40: + testl $2, N + je .L80 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $1 + BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L45 + ALIGN_4 + +.L42: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, %ecx + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L50 + ALIGN_4 + +.L46: + movsd 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + addl $2 * SIZE, B + addl $8 * SIZE, %ecx + decl %eax + jne .L46 + ALIGN_4 + +.L50: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + PREFETCHW 3 * SIZE(CO1) + PREFETCHW 3 * SIZE(CO1, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm3 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 32 * SIZE(AA), %xmm0 + +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + mulps %xmm1, %xmm2 + mulps 36 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 40 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 20 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm2 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm3 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $1 + BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 4 * SIZE(B), %xmm5 +#ifdef movsd + xorps %xmm7, %xmm7 +#endif + movsd 6 * SIZE(B), %xmm7 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 + subps %xmm0, %xmm5 + subps %xmm2, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 +#endif + +#ifdef LN + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm1 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm1 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm7 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm7 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm7 + + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + movlps %xmm5, 4 * SIZE(B) + movlps %xmm7, 6 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm2 + movaps %xmm0, 8 * SIZE(BB) + movaps %xmm2, 12 * SIZE(BB) + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm2 + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm2, 20 * SIZE(BB) + + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm2 + movaps %xmm0, 24 * SIZE(BB) + movaps %xmm2, 28 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm1, 4 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm1 + unpcklps %xmm7, %xmm3 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movhps %xmm2, 2 * SIZE(CO1, LDC, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L51 + ALIGN_4 + +.L60: + testl $2, M + je .L70 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 2 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 +#endif + +#ifdef LN + movaps 0 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm2 + movaps %xmm0, 8 * SIZE(BB) + movaps %xmm2, 12 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) + movlps %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm3, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 0 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L70: + testl $1, M + je .L79 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movss 0 * SIZE(AA), %xmm0 + movss 4 * SIZE(AA), %xmm1 + movss 0 * SIZE(BB), %xmm2 + movss 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulss %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 20 * SIZE(BB), %xmm0 + addss %xmm3, %xmm4 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm5 + movss 3 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + mulss 36 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 40 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 5 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm2 + mulss 44 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 52 * SIZE(BB), %xmm1 + addss %xmm3, %xmm4 + movss 56 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 7 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 60 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + + addl $ 1 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addss %xmm6, %xmm4 + addss %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm4 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 + + subps %xmm4, %xmm1 +#else + movss 0 * SIZE(AA), %xmm0 + movss 1 * SIZE(AA), %xmm1 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm1 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) + movss %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + pshufd $1, %xmm1, %xmm3 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm3, 0 * SIZE(CO1, LDC) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L80: + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L999 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $2 + BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1, %eax + jle .L05 + ALIGN_4 + +.L02: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L05: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1, %eax + BRANCH + jle .L10 + + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + addl $4 * SIZE, B + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + + PREFETCHW 3 * SIZE(CO1) + PREFETCHW 3 * SIZE(CO1, LDC) + PREFETCHW 3 * SIZE(CO1, LDC, 2) + PREFETCHW 3 * SIZE(CO1, %eax) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(0 * 16) + KERNEL2(0 * 16) + KERNEL3(0 * 16) + KERNEL4(0 * 16) + KERNEL5(0 * 16) + KERNEL6(0 * 16) + KERNEL7(0 * 16) + KERNEL8(0 * 16) + + addl $128 * SIZE, BB + addl $32 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $2 + BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm3 + movaps 8 * SIZE(B), %xmm5 + movaps 12 * SIZE(B), %xmm7 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 + subps %xmm0, %xmm5 + subps %xmm2, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm1 + movaps 8 * SIZE(AA), %xmm2 + movaps 12 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 + subps %xmm6, %xmm2 + subps %xmm7, %xmm3 +#endif + +#ifdef LN + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm1 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm1 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm7 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm7 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm7 + + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm3 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm3 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm3 + + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm0 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm0 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + movaps %xmm5, 8 * SIZE(B) + movaps %xmm7, 12 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm2 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm6 + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm2, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm2 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 + movaps %xmm0, 32 * SIZE(BB) + movaps %xmm2, 36 * SIZE(BB) + movaps %xmm4, 40 * SIZE(BB) + movaps %xmm6, 44 * SIZE(BB) + + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm2 + pshufd $0xaa, %xmm7, %xmm4 + pshufd $0xff, %xmm7, %xmm6 + movaps %xmm0, 48 * SIZE(BB) + movaps %xmm2, 52 * SIZE(BB) + movaps %xmm4, 56 * SIZE(BB) + movaps %xmm6, 60 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm1, 4 * SIZE(AA) + movaps %xmm2, 8 * SIZE(AA) + movaps %xmm3, 12 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movhps %xmm2, 2 * SIZE(CO1, LDC, 1) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 2 * SIZE(CO1, LDC, 2) + movlps %xmm6, 0 * SIZE(CO1, %eax, 1) + movhps %xmm6, 2 * SIZE(CO1, %eax, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 2 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO1, %eax, 1) + movhps %xmm3, 2 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $16 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + testl $2, M + je .L30 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 68 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 72 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 76 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 96 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 84 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 88 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 92 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 112 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 100 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 104 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 108 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 128 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 116 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 120 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 124 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 144 * SIZE(BB), %xmm3 + + addl $ 16 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 16 * SIZE(BB), %xmm2 + + addl $ 2 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $1 + BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm3 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 2 * SIZE(AA), %xmm1 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 4 * SIZE(AA), %xmm2 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 6 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 + subps %xmm6, %xmm2 + subps %xmm7, %xmm3 +#endif + +#ifdef LN + movaps 0 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm3 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm3 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm3 + + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm0 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm0 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm2 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm6 + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm2, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) + movlps %xmm1, 2 * SIZE(AA) + movlps %xmm2, 4 * SIZE(AA) + movlps %xmm3, 6 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movlps %xmm6, 0 * SIZE(CO1, %eax, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L30: + testl $1, M + je .L39 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movss 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movss 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movss 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 8 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 1 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm3 + addss %xmm3, %xmm4 + movss 20 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm5 + movss 24 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + movss 36 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 40 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 44 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 3 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm3 + addss %xmm3, %xmm4 + movss 52 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm5 + movss 56 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + mulss 60 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + + mulss %xmm1, %xmm2 + addss %xmm2, %xmm4 + movss 68 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + addss %xmm2, %xmm5 + movss 72 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + mulss 76 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 96 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 5 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm3 + addss %xmm3, %xmm4 + movss 84 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + addss %xmm3, %xmm5 + movss 88 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + mulss 92 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 112 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm2 + addss %xmm2, %xmm4 + movss 100 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + addss %xmm2, %xmm5 + movss 104 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + mulss 108 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 128 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 7 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm3 + addss %xmm3, %xmm4 + movss 116 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + addss %xmm3, %xmm5 + movss 120 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + mulss 124 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 144 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 8 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 16 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 1 * SIZE(AA), %xmm0 + + addl $ 1 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (AA, %eax, SIZE), AA + + sall $2 + BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + unpcklps %xmm5, %xmm4 + + movaps 0 * SIZE(B), %xmm1 + + subps %xmm4, %xmm1 +#else + movss 0 * SIZE(AA), %xmm0 + movss 1 * SIZE(AA), %xmm1 + movss 2 * SIZE(AA), %xmm2 + movss 3 * SIZE(AA), %xmm3 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm1 + subss %xmm6, %xmm2 + subss %xmm7, %xmm3 +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm3 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm3 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm3 + + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm3 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm0 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm0 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm0 + + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) + movss %xmm1, 1 * SIZE(AA) + movss %xmm2, 2 * SIZE(AA) + movss %xmm3, 3 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm2, 0 * SIZE(CO1, LDC, 1) + movss %xmm0, 0 * SIZE(CO1, LDC, 2) + movss %xmm6, 0 * SIZE(CO1, %eax, 1) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO1, LDC, 1) + movss %xmm2, 0 * SIZE(CO1, LDC, 2) + movss %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L999: + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_RT_8x2_sse.S b/kernel/x86/trsm_kernel_RT_8x2_sse.S new file mode 100644 index 0000000..6bc1d21 --- /dev/null +++ b/kernel/x86/trsm_kernel_RT_8x2_sse.S @@ -0,0 +1,3607 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_A 20 + STACK + ARGS(%esi) +#define STACK_B 24 + STACK + ARGS(%esi) +#define STACK_C 28 + STACK + ARGS(%esi) +#define STACK_LDC 32 + STACK + ARGS(%esi) +#define STACK_OFFT 36 + STACK + ARGS(%esi) + +#define TRMASK 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#ifdef HAVE_3DNOW +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 10 + 8) +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#if !defined(HAVE_SSE2) || defined(OPTERON) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE, %esp + andl $-STACK_ALIGN, %esp + + STACK_TOUCHING + + movss STACK_M, %xmm0 + movl STACK_N, %eax + movss STACK_K, %xmm1 + movss STACK_A, %xmm2 + movl STACK_B, B + movss STACK_C, %xmm3 + movl STACK_LDC, LDC + movss STACK_OFFT, %xmm4 + + movss %xmm1, K + movl %eax, N + movss %xmm0, M + movss %xmm2, A + movss %xmm3, C + movl %esi, OLD_STACK + movss %xmm4, OFFSET + movss %xmm4, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LN) || defined(LT) + movl $0x3f800000, 0 + TRMASK # 1.0 + movl $0x00000000, 4 + TRMASK # 0.0 + movl $0x3f800000, 8 + TRMASK # 1.0 + movl $0x00000000, 12 + TRMASK # 0.0 +#endif + + testl $1, N + jle .L100 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L103 + ALIGN_4 + +.L102: + movsd 0 * SIZE(B), %xmm3 + movhps 2 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm7 + movhps 6 * SIZE(B), %xmm7 + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm3, %xmm2 + shufps $0xaa, %xmm2, %xmm2 + shufps $0xff, %xmm3, %xmm3 + + movaps %xmm7, %xmm4 + shufps $0x00, %xmm4, %xmm4 + movaps %xmm7, %xmm5 + shufps $0x55, %xmm5, %xmm5 + movaps %xmm7, %xmm6 + shufps $0xaa, %xmm6, %xmm6 + shufps $0xff, %xmm7, %xmm7 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + + decl %eax + BRANCH + jne .L102 + ALIGN_2 + +.L103: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L105 + ALIGN_2 + +.L104: + movss 0 * SIZE(B), %xmm0 + + shufps $0x00, %xmm0, %xmm0 + + movaps %xmm0, 0 * SIZE(BB) + + addl $1 * SIZE, B + addl $4 * SIZE, BB + + decl %eax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $3, %ebx # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $3 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $3 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + + PREFETCHW 7 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + ALIGN_2 + +.L111: + mulps %xmm2, %xmm0 + mulps 4 * SIZE(AA), %xmm2 + addps %xmm0, %xmm4 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm0 + mulps 12 * SIZE(AA), %xmm2 + addps %xmm0, %xmm5 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm1 + mulps 20 * SIZE(AA), %xmm2 + addps %xmm1, %xmm4 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm1 + mulps 28 * SIZE(AA), %xmm2 + addps %xmm1, %xmm5 + movaps 48 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm3, %xmm0 + mulps 36 * SIZE(AA), %xmm3 + addps %xmm0, %xmm4 + movaps 40 * SIZE(AA), %xmm0 + addps %xmm3, %xmm6 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm0 + mulps 44 * SIZE(AA), %xmm3 + addps %xmm0, %xmm5 + movaps 64 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm1 + mulps 52 * SIZE(AA), %xmm3 + addps %xmm1, %xmm4 + movaps 56 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm1 + mulps 60 * SIZE(AA), %xmm3 + addps %xmm1, %xmm5 + movaps 80 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + addl $64 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L111 + ALIGN_2 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + +.L113: + movaps 0 * SIZE(BB), %xmm2 + movaps 0 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm0 + addps %xmm0, %xmm4 + mulps 4 * SIZE(AA), %xmm2 + addps %xmm2, %xmm6 + + addl $8 * SIZE, AA + addl $4 * SIZE, BB + subl $1, %eax + jg .L113 + ALIGN_4 + +.L114: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $8, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 8), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + movsd 4 * SIZE(B), %xmm5 + movhps 6 * SIZE(B), %xmm5 + + subps %xmm4, %xmm2 + subps %xmm6, %xmm5 + + xorps %xmm0, %xmm0 + + movaps %xmm2, %xmm3 + unpcklps %xmm0, %xmm2 + unpckhps %xmm0, %xmm3 + + movaps %xmm5, %xmm7 + unpcklps %xmm0, %xmm5 + unpckhps %xmm0, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm6, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 63 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 62 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movsd 60 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 58 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 56 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 54 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 52 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 50 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 48 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + + movss 45 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 44 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 42 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 40 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 36 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 34 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 32 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 27 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 26 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 24 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 18 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 16 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 9 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 8 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movsd 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 4 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 6 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 9 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 10 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 12 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 14 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 18 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 19 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 20 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 22 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 27 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 28 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 30 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 36 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 37 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 38 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 45 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 46 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 54 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 55 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 63 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm7 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + mulps %xmm6, %xmm1 +#endif + +#if defined(LN) || defined(LT) + shufps $0x88, %xmm3, %xmm2 + shufps $0x88, %xmm7, %xmm5 + + movlps %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + movlps %xmm5, 4 * SIZE(B) + movhps %xmm5, 6 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 +#else + movaps %xmm5, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm5, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm5, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm5, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm1, 4 * SIZE(AA) +#endif + +#ifdef LN + subl $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm5, 4 * SIZE(CO1) + movhps %xmm5, 6 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 4 * SIZE(CO1) + movhps %xmm1, 6 * SIZE(CO1) +#endif + +#ifndef LN + addl $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 8), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $8, KK + movl BORIG, B +#endif + +#ifdef LT + addl $8, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $3 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L110 + ALIGN_2 + +.L130: + testl $4, M + jle .L150 + +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $2 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + movhps 2 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movsd 16 * SIZE(AA), %xmm1 + movhps 18 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L132 + ALIGN_2 + +.L131: + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + mulps 4 * SIZE(BB), %xmm0 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + mulps 8 * SIZE(BB), %xmm0 + addps %xmm0, %xmm6 + movaps 12 * SIZE(AA), %xmm0 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + mulps 20 * SIZE(BB), %xmm1 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + mulps 24 * SIZE(BB), %xmm1 + addps %xmm1, %xmm6 + movaps 28 * SIZE(AA), %xmm1 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L131 + ALIGN_2 + +.L132: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L134 + +.L133: + movaps 0 * SIZE(BB), %xmm2 + movaps 0 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L133 + ALIGN_4 + +.L134: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 + + xorps %xmm5, %xmm5 + + movaps %xmm2, %xmm3 + unpcklps %xmm5, %xmm2 + unpckhps %xmm5, %xmm3 +#else + movaps 0 * SIZE(AA), %xmm0 + subps %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 15 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 14 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 12 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 10 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 8 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 5 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 4 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movsd 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 5 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 6 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 10 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 11 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 15 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + mulps %xmm6, %xmm0 +#endif + +#ifdef RT + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + mulps %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) + shufps $0x88, %xmm3, %xmm2 + + movlps %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L150: + testl $2, M + jle .L170 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $1 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L152 + ALIGN_2 + +.L151: + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L151 + ALIGN_2 + +.L152: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L154 + +.L153: + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L153 + ALIGN_4 + +.L154: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + shufps $1, %xmm5, %xmm5 + + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm1 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movaps 0 * SIZE(AA), %xmm4 + + movaps %xmm4, %xmm6 + shufps $0xff, %xmm6, %xmm6 + mulss %xmm6, %xmm1 + + movaps %xmm4, %xmm6 + shufps $0xaa, %xmm6, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm0 + mulss %xmm4, %xmm0 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + mulss %xmm4, %xmm0 + movaps %xmm4, %xmm6 + shufps $0x55, %xmm6, %xmm6 + mulss %xmm0, %xmm6 + subss %xmm6, %xmm1 + movaps %xmm4, %xmm6 + shufps $0xff, %xmm6, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + mulps %xmm6, %xmm0 +#endif + +#ifdef RT + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + mulps %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm0, 0 * SIZE(B) + movss %xmm1, 1 * SIZE(B) + + shufps $0x00, %xmm0, %xmm0 + shufps $0x00, %xmm1, %xmm1 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 1 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L170: + testl $1, M + jle .L179 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movss 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movss 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L172 + ALIGN_2 + +.L171: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + mulss 4 * SIZE(BB), %xmm0 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 2 * SIZE(AA), %xmm0 + mulss 8 * SIZE(BB), %xmm0 + addss %xmm0, %xmm6 + movss 3 * SIZE(AA), %xmm0 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + mulss 20 * SIZE(BB), %xmm1 + movss 48 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 6 * SIZE(AA), %xmm1 + mulss 24 * SIZE(BB), %xmm1 + addss %xmm1, %xmm6 + movss 7 * SIZE(AA), %xmm1 + mulss 28 * SIZE(BB), %xmm1 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L171 + ALIGN_2 + +.L172: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L174 + +.L173: + movss 0 * SIZE(AA), %xmm0 + movss 0 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L173 + ALIGN_4 + +.L174: + addss %xmm5, %xmm4 + addss %xmm7, %xmm6 + addss %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(B), %xmm1 + subss %xmm4, %xmm1 +#else + movss 0 * SIZE(AA), %xmm0 + subss %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + mulss 0 * SIZE(AA), %xmm1 +#endif + +#if defined(RN) || defined(RT) + mulss 0 * SIZE(B), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + + shufps $0x00, %xmm1, %xmm1 + movaps %xmm1, 0 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) +#else + movss %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 +.L179: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (B, %eax, SIZE), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L100: + movl N, %eax + sarl $1, %eax # j = (n >> 1) + movl %eax, J + jle .L999 + ALIGN_2 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $1 + BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + ALIGN_4 + +.L02: + movsd 0 * SIZE(B), %xmm3 + movhps 2 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm7 + movhps 6 * SIZE(B), %xmm7 + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm3, %xmm2 + shufps $0xaa, %xmm2, %xmm2 + shufps $0xff, %xmm3, %xmm3 + + movaps %xmm7, %xmm4 + shufps $0x00, %xmm4, %xmm4 + movaps %xmm7, %xmm5 + shufps $0x55, %xmm5, %xmm5 + movaps %xmm7, %xmm6 + shufps $0xaa, %xmm6, %xmm6 + shufps $0xff, %xmm7, %xmm7 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + decl %eax + BRANCH + jne .L02 + ALIGN_2 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + ALIGN_2 + +.L04: + movsd 0 * SIZE(B), %xmm3 + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + addl $2 * SIZE, B + addl $8 * SIZE, BB + + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $3, %ebx + jle .L30 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $3 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $3 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + + PREFETCHW 7 * SIZE(CO1) + PREFETCHW 7 * SIZE(CO1, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L12 + ALIGN_2 + +.L11: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 16 * SIZE(AA), %xmm0 + + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 8 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 16 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 36 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 48 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 48 * SIZE(AA), %xmm0 + + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 40 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 44 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 56 * SIZE(AA), %xmm1 + + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 48 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 52 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 64 * SIZE(AA), %xmm0 + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 60 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 72 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 72 * SIZE(AA), %xmm1 + + addl $64 * SIZE, BB + addl $64 * SIZE, AA + decl %eax + jne .L11 + ALIGN_2 + +.L12: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + movaps 4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm1 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm1, %xmm5 + movaps 4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm1 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm1, %xmm7 + + addl $8 * SIZE, AA + addl $8 * SIZE, BB + subl $1, %eax + jg .L13 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $8, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 8), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm0 + + movaps %xmm6, %xmm1 + unpcklps %xmm7, %xmm6 + unpckhps %xmm7, %xmm1 + + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + movsd 4 * SIZE(B), %xmm3 + movhps 6 * SIZE(B), %xmm3 + movsd 8 * SIZE(B), %xmm5 + movhps 10 * SIZE(B), %xmm5 + movsd 12 * SIZE(B), %xmm7 + movhps 14 * SIZE(B), %xmm7 + + subps %xmm4, %xmm2 + subps %xmm0, %xmm3 + subps %xmm6, %xmm5 + subps %xmm1, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm1 + movaps 8 * SIZE(AA), %xmm2 + movaps 12 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm6, %xmm1 + subps %xmm5, %xmm2 + subps %xmm7, %xmm3 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 63 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 62 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movsd 60 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 58 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 56 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 54 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 52 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 50 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 48 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + + movss 45 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 44 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 42 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 40 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 36 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 34 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 32 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 27 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 26 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 24 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 18 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 16 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 9 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 8 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movsd 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 4 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 6 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 9 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 10 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 12 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 14 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 18 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 19 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 20 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 22 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 27 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 28 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 30 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 36 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 37 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 38 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 45 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 46 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 54 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 55 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 63 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm7 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + mulps %xmm6, %xmm1 + + movss 1 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm0, %xmm5 + mulps %xmm1, %xmm6 + + subps %xmm5, %xmm2 + subps %xmm6, %xmm3 + + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 + mulps %xmm6, %xmm3 +#endif + +#ifdef RT + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 + mulps %xmm6, %xmm3 + + movss 2 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm2, %xmm5 + mulps %xmm3, %xmm6 + + subps %xmm5, %xmm0 + subps %xmm6, %xmm1 + + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + mulps %xmm6, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + movlps %xmm3, 4 * SIZE(B) + movhps %xmm3, 6 * SIZE(B) + movlps %xmm5, 8 * SIZE(B) + movhps %xmm5, 10 * SIZE(B) + movlps %xmm7, 12 * SIZE(B) + movhps %xmm7, 14 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm6 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm3, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm3, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 +#else + movaps %xmm5, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm5, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm5, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm5, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + movaps %xmm0, 32 * SIZE(BB) + movaps %xmm1, 36 * SIZE(BB) + movaps %xmm4, 40 * SIZE(BB) + movaps %xmm6, 44 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm1 + pshufd $0xaa, %xmm7, %xmm4 + pshufd $0xff, %xmm7, %xmm6 +#else + movaps %xmm7, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm7, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm7, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm7, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + movaps %xmm0, 48 * SIZE(BB) + movaps %xmm1, 52 * SIZE(BB) + movaps %xmm4, 56 * SIZE(BB) + movaps %xmm6, 60 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm1, 4 * SIZE(AA) + movaps %xmm2, 8 * SIZE(AA) + movaps %xmm3, 12 * SIZE(AA) +#endif + +#ifdef LN + subl $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, %xmm0 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm3, %xmm0 + + movaps %xmm5, %xmm4 + shufps $0x88, %xmm7, %xmm5 + shufps $0xdd, %xmm7, %xmm4 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm5, 4 * SIZE(CO1) + movhps %xmm5, 6 * SIZE(CO1) + movlps %xmm0, 0 * SIZE(CO1, LDC) + movhps %xmm0, 2 * SIZE(CO1, LDC) + movlps %xmm4, 4 * SIZE(CO1, LDC) + movhps %xmm4, 6 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 4 * SIZE(CO1) + movhps %xmm1, 6 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm2, 2 * SIZE(CO1, LDC) + movlps %xmm3, 4 * SIZE(CO1, LDC) + movhps %xmm3, 6 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 8), AA +#ifdef LT + addl $16 * SIZE, B +#endif +#endif + +#ifdef LN + subl $8, KK + movl BORIG, B +#endif + +#ifdef LT + addl $8, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $3 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L30: + testl $4, M + jle .L50 + +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $2 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L32 + ALIGN_2 + +.L31: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 8 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 36 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 40 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 20 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm2 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L31 + ALIGN_2 + +.L32: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L34 + +.L33: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L33 + ALIGN_4 + +.L34: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm0 + + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + movsd 4 * SIZE(B), %xmm3 + movhps 6 * SIZE(B), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm0, %xmm3 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm2 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 15 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 14 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 12 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 10 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 8 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 5 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 4 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movsd 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 5 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 6 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 10 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 11 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 15 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + + movss 1 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm0, %xmm5 + subps %xmm5, %xmm2 + + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 +#endif + +#ifdef RT + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 + + movss 2 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm2, %xmm5 + + subps %xmm5, %xmm0 + + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + movlps %xmm3, 4 * SIZE(B) + movhps %xmm3, 6 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm6 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm3, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm3, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm2, 4 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, %xmm0 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm3, %xmm0 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm0, 0 * SIZE(CO1, LDC) + movhps %xmm0, 2 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm2, 2 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L50: + testl $2, M + jle .L70 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $1 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L52 + ALIGN_2 + +.L51: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L51 + ALIGN_2 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L53 + ALIGN_4 + +.L54: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm4 + + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 2 * SIZE(AA), %xmm2 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 3 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 3 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + + movss 1 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm0, %xmm5 + subps %xmm5, %xmm2 + + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 +#endif + +#ifdef RT + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 + + movss 2 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm2, %xmm5 + + subps %xmm5, %xmm0 + + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) + movlps %xmm2, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, %xmm0 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm3, %xmm0 + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm0, 0 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L70: + testl $1, M + jle .L99 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movss 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movss 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L72 + ALIGN_2 + +.L71: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 20 * SIZE(BB), %xmm0 + addss %xmm3, %xmm4 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm5 + movss 3 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + mulss 36 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 40 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 5 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm2 + mulss 44 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 52 * SIZE(BB), %xmm1 + addss %xmm3, %xmm4 + movss 56 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 7 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 60 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L71 + ALIGN_2 + +.L72: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L74 + +.L73: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L73 + ALIGN_4 + +.L74: + addss %xmm6, %xmm4 + addss %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm4 + +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 +#else + movss 0 * SIZE(AA), %xmm0 + movss 1 * SIZE(AA), %xmm2 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + mulss %xmm6, %xmm0 + + movss 1 * SIZE(B), %xmm6 + movaps %xmm6, %xmm5 + + mulss %xmm0, %xmm5 + subss %xmm5, %xmm2 + + movss 3 * SIZE(B), %xmm6 + mulss %xmm6, %xmm2 +#endif + +#ifdef RT + movss 3 * SIZE(B), %xmm6 + mulss %xmm6, %xmm2 + + movss 2 * SIZE(B), %xmm6 + movaps %xmm6, %xmm5 + + mulss %xmm2, %xmm5 + subss %xmm5, %xmm0 + + movss 0 * SIZE(B), %xmm6 + mulss %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd %xmm2, 0 * SIZE(B) + + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) + movss %xmm2, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, %xmm0 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm3, %xmm0 + + movss %xmm2, 0 * SIZE(CO1) + movss %xmm0, 0 * SIZE(CO1, LDC) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm2, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L99: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/xaxpy.S b/kernel/x86/xaxpy.S new file mode 100644 index 0000000..554aa0c --- /dev/null +++ b/kernel/x86/xaxpy.S @@ -0,0 +1,356 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esp) +#define STACK_ALPHA_I 32 + STACK + ARGS(%esp) +#define STACK_X 48 + STACK + ARGS(%esp) +#define STACK_INCX 52 + STACK + ARGS(%esp) +#define STACK_Y 56 + STACK + ARGS(%esp) +#define STACK_INCY 60 + STACK + ARGS(%esp) + +#define M %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#ifndef CONJ +#define ADD1 fsubrp +#define ADD2 faddp +#else +#define ADD1 faddp +#define ADD2 fsubrp +#endif + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + FLD STACK_ALPHA_I + FLD STACK_ALPHA_R + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + + testl M, M + jle .L40 + + cmpl $2 * SIZE, INCX + jne .L14 + cmpl $2 * SIZE, INCY + jne .L14 + + movl M, %eax + sarl $2, %eax + jle .L15 + ALIGN_3 + +.L16: + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + FLD 2 * SIZE(X) + fmul %st(1), %st + FLD 3 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 2 * SIZE(Y) + faddp %st, %st(1) + FST 2 * SIZE(Y) + + FLD 2 * SIZE(X) + fmul %st(2), %st + FLD 3 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 3 * SIZE(Y) + faddp %st, %st(1) + FST 3 * SIZE(Y) + + FLD 4 * SIZE(X) + fmul %st(1), %st + FLD 5 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 4 * SIZE(Y) + faddp %st, %st(1) + FST 4 * SIZE(Y) + + FLD 4 * SIZE(X) + fmul %st(2), %st + FLD 5 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 5 * SIZE(Y) + faddp %st, %st(1) + FST 5 * SIZE(Y) + + FLD 6 * SIZE(X) + fmul %st(1), %st + FLD 7 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 6 * SIZE(Y) + faddp %st, %st(1) + FST 6 * SIZE(Y) + + FLD 6 * SIZE(X) + fmul %st(2), %st + FLD 7 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 7 * SIZE(Y) + faddp %st, %st(1) + FST 7 * SIZE(Y) + +#ifdef HAVE_3DNOW + prefetch 20 * SIZE(X) + prefetchw 20 * SIZE(Y) +#endif + + addl $8 * SIZE, X + addl $8 * SIZE, Y + decl %eax + jg .L16 + ALIGN_3 + +.L15: + movl M, %eax + andl $3, %eax + jle .L40 + ALIGN_3 + +.L22: + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + decl %eax + jg .L22 + jmp .L40 + ALIGN_3 + +.L14: + movl M, %eax + sarl $2, %eax + jle .L28 + ALIGN_3 + +.L29: + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + decl %eax + jg .L29 + ALIGN_3 + +.L28: + movl M, %eax + andl $3, %eax + jle .L40 + ALIGN_3 + +.L35: + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + decl %eax + jg .L35 + ALIGN_3 + +.L40: + ffreep %st(0) + ffreep %st(0) + xorl %eax,%eax + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/xdot.S b/kernel/x86/xdot.S new file mode 100644 index 0000000..4a5af46 --- /dev/null +++ b/kernel/x86/xdot.S @@ -0,0 +1,331 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#if defined(F_INTERFACE) && defined(RETURN_BY_STACK) +#define RESULT 4 + STACK + ARGS(%esp) +#define STACK_N 8 + STACK + ARGS(%esp) +#define STACK_X 12 + STACK + ARGS(%esp) +#define STACK_INCX 16 + STACK + ARGS(%esp) +#define STACK_Y 20 + STACK + ARGS(%esp) +#define STACK_INCY 24 + STACK + ARGS(%esp) +#else +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) +#endif + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + +#ifdef F_INTERFACE + movl (N),N + movl (INCX),INCX + movl (INCY),INCY +#endif + + testl N, N + jle .L88 + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + + fldz + fldz + fldz + fldz + + cmpl $2 * SIZE, INCX + jne .L14 + cmpl $2 * SIZE, INCY + jne .L14 + + movl N, %eax + sarl $1, %eax + jle .L15 + ALIGN_3 + +.L16: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + FLD 2 * SIZE(X) + + FLD 2 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 3 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 3 * SIZE(X) + + FLD 2 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 3 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + decl %eax + jg .L16 + ALIGN_3 + +.L15: + movl N, %eax + andl $1, %eax + jle .L27 + ALIGN_3 + +.L22: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + jmp .L27 + ALIGN_3 + +.L14: +#ifdef F_INTERFACE + testl INCX, INCX # if (incx < 0) + jge .L28 + + movl N, %eax + decl %eax + imull INCX, %eax + subl %eax, X + ALIGN_3 + +.L28: + testl INCY, INCY # if (incy < 0) + jge .L29 + + movl N, %eax + decl %eax + imull INCY, %eax + subl %eax, Y + ALIGN_3 + +.L29: +#endif + + movl N, %eax + sarl $1, %eax + jle .L30 + ALIGN_3 + + +.L31: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + addl INCX, X + + FLD 0 * SIZE(X) + addl INCY, Y + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + addl INCX, X + addl INCY, Y + + decl %eax + jg .L31 + ALIGN_3 + +.L30: + movl N, %eax + andl $1, %eax + jle .L27 + ALIGN_3 + +.L37: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + ALIGN_3 + +.L27: +#if defined(F_INTERFACE) && defined(RETURN_BY_STACK) + movl RESULT, %eax +#endif + +#ifndef CONJ + fsubp %st, %st(3) + faddp %st, %st(1) +#else + faddp %st, %st(3) + fsubp %st, %st(1) +#endif + +#if defined(F_INTERFACE) && defined(RETURN_BY_STACK) + FST 1 * SIZE(%eax) + FST 0 * SIZE(%eax) +#else + fxch %st(1) +#endif + + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L88: +#if defined(F_INTERFACE) && defined(RETURN_BY_STACK) + movl RESULT, %eax +#endif + + fldz + fldz + +#if defined(F_INTERFACE) && defined(RETURN_BY_STACK) + FST 1 * SIZE(%eax) + FST 0 * SIZE(%eax) +#endif + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/xgemm3m_kernel_2x2.S b/kernel/x86/xgemm3m_kernel_2x2.S new file mode 100644 index 0000000..b844875 --- /dev/null +++ b/kernel/x86/xgemm3m_kernel_2x2.S @@ -0,0 +1,796 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#define PREFETCHSIZE (5 + 4 * 10) +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 32 + STACK + ARGS(%esp) +#define A 48 + STACK + ARGS(%esp) +#define ARG_B 52 + STACK + ARGS(%esp) +#define C 56 + STACK + ARGS(%esp) +#define ARG_LDC 60 + STACK + ARGS(%esp) + +#define I %esi +#define B %ebx +#define CO %edi +#define AO %edx +#define BO %ecx +#define LDC %ebp + +#define PREFETCH_OFFSET 48 + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(TRMMKERNEL) && !defined(LEFT) + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + + movl ARG_LDC, LDC + movl ARG_B, B + + addl $8 * SIZE, A + addl $8 * SIZE, B + + sall $ZBASE_SHIFT, LDC + + movl N, %eax + sarl $1, %eax + movl %eax, J + je .L30 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl A, AO + + movl C, CO + lea (, LDC, 2), %eax + addl %eax, C + + movl M, I + sarl $1, I + je .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B, BO +#else + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 2), AO + leal (B, %eax, 2), BO +#endif + + fldz + fldz + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) + prefetchw 2 * SIZE(CO, LDC, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) + prefetchnta 2 * SIZE(CO, LDC, 1) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + FLD -6 * SIZE(AO) + + FLD -6 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + + FLD -5 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) + + FLD -4 * SIZE(AO) + + FLD -4 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -3 * SIZE(BO) + fmul %st, %st(2) + + FLD -3 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + FLD -2 * SIZE(AO) + + FLD -2 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -1 * SIZE(BO) + fmul %st, %st(2) + + FLD -1 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + addl $8 * SIZE,AO + addl $8 * SIZE,BO + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $3, %eax + je .L18 + ALIGN_4 + +.L16: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + addl $2 * SIZE,AO + addl $2 * SIZE,BO + + decl %eax + jne .L16 + ALIGN_4 + +.L18: + FLD ALPHA_I + FLD ALPHA_R + + fld %st(2) + fmul %st(1), %st + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + fld %st(3) + fmul %st(1), %st + + FLD 2 * SIZE(CO) + faddp %st, %st(1) + FST 2 * SIZE(CO) + + fld %st(4) + fmul %st(1), %st + + FLD 0 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 0 * SIZE(CO, LDC) + + fmul %st(5), %st + + FLD 2 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 2 * SIZE(CO, LDC) + + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD 1 * SIZE(CO) + faddp %st, %st(1) + FST 1 * SIZE(CO) + + FLD 3 * SIZE(CO) + faddp %st, %st(1) + FST 3 * SIZE(CO) + + FLD 1 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 1 * SIZE(CO, LDC) + + FLD 3 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 3 * SIZE(CO, LDC) + + addl $4 * SIZE, CO + decl I + jne .L11 + ALIGN_4 + +.L20: + movl M, %eax + andl $1, %eax + je .L29 + ALIGN_4 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B, BO +#else + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal ( B, %eax, 2), BO +#endif + + fldz + fldz + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + + FLD -6 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -4 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + + FLD -2 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $4 * SIZE,AO + addl $8 * SIZE,BO + + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $3, %eax + je .L28 + ALIGN_4 + +.L26: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $1 * SIZE,AO + addl $2 * SIZE,BO + + decl %eax + jne .L26 + ALIGN_4 + +.L28: + FLD ALPHA_I + FLD ALPHA_R + + fld %st(2) + fmul %st(1), %st + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + fmul %st(3), %st + + FLD 0 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 0 * SIZE(CO, LDC) + + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD 1 * SIZE(CO) + faddp %st, %st(1) + FST 1 * SIZE(CO) + + FLD 1 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 1 * SIZE(CO, LDC) + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movl BO, B + decl J + jne .L01 + ALIGN_4 + +.L30: + movl N, %eax + testl $1, %eax + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl A, AO + + movl C, CO + addl LDC, C + + movl M, I + sarl $1, I + je .L40 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B, BO +#else + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 2), AO + leal ( B, %eax, 1), BO +#endif + + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(BO) + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + FLD -6 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(BO) + FLD -4 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + FLD -2 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $8 * SIZE,AO + addl $4 * SIZE,BO + + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $3, %eax + je .L38 + ALIGN_4 + +.L36: + FLD -8 * SIZE(BO) + + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $2 * SIZE,AO + addl $1 * SIZE,BO + + decl %eax + jne .L36 + ALIGN_4 + +.L38: + FLD ALPHA_I + FLD ALPHA_R + + fld %st(2) + fmul %st(1), %st + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + fmul %st(3), %st + + FLD 2 * SIZE(CO) + faddp %st, %st(1) + FST 2 * SIZE(CO) + + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD 1 * SIZE(CO) + faddp %st, %st(1) + FST 1 * SIZE(CO) + + FLD 3 * SIZE(CO) + faddp %st, %st(1) + FST 3 * SIZE(CO) + + addl $4 * SIZE, CO + decl I + jne .L31 + ALIGN_4 + +.L40: + movl M, %eax + andl $1, %eax + je .L49 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B, BO +#else + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal ( B, %eax, 1), BO +#endif + + fldz + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -7 * SIZE(AO) + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -6 * SIZE(AO) + FLD -6 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -5 * SIZE(AO) + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addl $4 * SIZE,AO + addl $4 * SIZE,BO + + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $3, %eax + je .L48 + ALIGN_4 + +.L46: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addl $1 * SIZE,AO + addl $1 * SIZE,BO + + decl %eax + jne .L46 + ALIGN_4 + +.L48: + FLD ALPHA_I + FLD ALPHA_R + + fmul %st(2), %st + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + fmulp %st(1), %st + + FLD 1 * SIZE(CO) + faddp %st, %st(1) + FST 1 * SIZE(CO) + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $1, KK +#endif + + movl BO, B + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/xgemm_kernel_1x1.S b/kernel/x86/xgemm_kernel_1x1.S new file mode 100644 index 0000000..b401bd2 --- /dev/null +++ b/kernel/x86/xgemm_kernel_1x1.S @@ -0,0 +1,374 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#define PREFETCHSIZE (5 + 4 * 10) +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 32 + STACK + ARGS(%esp) +#define A 48 + STACK + ARGS(%esp) +#define ARG_B 52 + STACK + ARGS(%esp) +#define C 56 + STACK + ARGS(%esp) +#define ARG_LDC 60 + STACK + ARGS(%esp) +#define OFFSET 64 + STACK + ARGS(%esp) + +#define I %esi +#define B %ebx +#define CO %edi +#define AO %edx +#define BO %ecx +#define LDC %ebp + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 faddp +#define ADD2 fsubrp +#define ADD3 faddp +#define ADD4 faddp +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 faddp +#define ADD2 faddp +#define ADD3 fsubrp +#define ADD4 faddp +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 faddp +#define ADD2 faddp +#define ADD3 faddp +#define ADD4 fsubrp +#else +#define ADD1 faddp +#define ADD2 fsubrp +#define ADD3 fsubrp +#define ADD4 fsubrp +#endif + +#define PREFETCH_OFFSET 48 + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(TRMMKERNEL) && !defined(LEFT) + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + + movl ARG_LDC, LDC + movl ARG_B, B + + addl $8 * SIZE, A + addl $8 * SIZE, B + + sall $ZBASE_SHIFT, LDC + + cmpl $0, M + jle .L999 + + movl N, %eax + movl %eax, J + testl %eax, %eax + jle .L999 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl A, AO + + movl C, CO + addl LDC, C + + movl M, I + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B, BO +#else + movl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal (B, %eax, 1), BO +#endif + + fldz + fldz + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + FLD -6 * SIZE(AO) + + FLD -6 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + + FLD -5 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) + + FLD -4 * SIZE(AO) + + FLD -4 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -3 * SIZE(BO) + fmul %st, %st(2) + + FLD -3 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + FLD -2 * SIZE(AO) + + FLD -2 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -1 * SIZE(BO) + fmul %st, %st(2) + + FLD -1 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + addl $8 * SIZE,AO + addl $8 * SIZE,BO + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $3, %eax + je .L18 + ALIGN_4 + +.L16: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + addl $2 * SIZE,AO + addl $2 * SIZE,BO + + decl %eax + jne .L16 + ALIGN_4 + +.L18: + faddp %st, %st(3) + faddp %st, %st(1) + +#ifndef TRMMKERNEL + FLD ALPHA_R + fld %st + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + FLD ALPHA_I + fmul %st, %st(3) + fmulp %st, %st(4) + + fsubp %st, %st(2) + faddp %st, %st(2) + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + FLD 1 * SIZE(CO) + faddp %st, %st(1) + FST 1 * SIZE(CO) +#else + FST 1 * SIZE(CO) + FST 0 * SIZE(CO) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + sall $ZBASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal (BO, %eax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, CO + decl I + jne .L11 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $1, KK +#endif + + movl BO, B + decl J + jne .L01 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/xgemv_n.S b/kernel/x86/xgemv_n.S new file mode 100644 index 0000000..0bf4445 --- /dev/null +++ b/kernel/x86/xgemv_n.S @@ -0,0 +1,350 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef PENTIUM +#define P 32 +#endif + +#if defined(PENTIUM4) || defined(ATHLON) +#define P (DTB_ENTRIES / 2) +#endif + +#ifndef P +#define P DTB_ENTRIES +#endif + +#define STACK 16 +#define ARGS 16 + +#define PLDA_M 0 + STACK(%esp) +#define XP 4 + STACK(%esp) +#define MIN_N 8 + STACK(%esp) +#define IS 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) + +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 32 + STACK + ARGS(%esp) +#define A 48 + STACK + ARGS(%esp) +#define LDA 52 + STACK + ARGS(%esp) +#define X 56 + STACK + ARGS(%esp) +#define INCX 60 + STACK + ARGS(%esp) +#define Y 64 + STACK + ARGS(%esp) +#define INCY 68 + STACK + ARGS(%esp) +#define BUFFER 72 + STACK + ARGS(%esp) + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + FLD ALPHA_I + FLD ALPHA_R + + movl X, %edi + + movl LDA, %ebx + sall $ZBASE_SHIFT, %ebx + + movl $0, IS + + movl M, %ecx + movl N, %esi + + test %ecx, %ecx + jle .L79 # goto END + test %esi, %esi + jle .L79 # goto END + + movl INCY, %eax + sall $ZBASE_SHIFT, %eax + movl %eax, INCY + + movl LDA, %eax + imull $P, %eax # P * lda + subl M ,%eax # P * lda - m + sall $ZBASE_SHIFT, %eax + movl %eax, PLDA_M + ALIGN_2 + +.L32: + movl IS, %esi + movl $P, %edx + movl N, %eax + subl %esi,%eax # n - is + cmpl %edx, %eax +#ifdef PENTIUM + jle .L33 + movl %edx, %eax +.L33: +#else + cmovg %edx, %eax +#endif + + movl %eax, MIN_N + + sall $ZBASE_SHIFT, %esi + leal (%edi, %esi, 1), %esi + movl %esi, XP + + movl INCX, %edx + cmpl $1, %edx + je .L34 # if incx == 1 goto L34 + + movl BUFFER, %esi + movl %esi, XP # xp = buffer + + sall $ZBASE_SHIFT, %edx + sarl $1,%eax + jle .L35 + ALIGN_2 + +.L36: + FLD 0 * SIZE(%edi) + FLD 1 * SIZE(%edi) + addl %edx,%edi # x += incx + FLD 0 * SIZE(%edi) + FLD 1 * SIZE(%edi) + addl %edx,%edi # x += incx + + FST 3 * SIZE(%esi) + FST 2 * SIZE(%esi) + FST 1 * SIZE(%esi) + FST 0 * SIZE(%esi) + + addl $4 * SIZE, %esi # xp += 4 + decl %eax + jg .L36 + ALIGN_3 + +.L35: + movl MIN_N, %eax + andl $1, %eax + jle .L34 + + FLD 0 * SIZE(%edi) + FLD 1 * SIZE(%edi) + addl %edx,%edi # x += incx + FST 1 * SIZE(%esi) + FST 0 * SIZE(%esi) + ALIGN_3 + +/* Main Routine */ +.L34: + movl Y, %ecx # c_offset + movl M, %ebp # j = m + ALIGN_3 + +.L61: + movl A, %edx # a_offset = a + fldz + addl $2 * SIZE, A # a++ + fldz + movl XP,%esi + fldz + movl MIN_N,%eax + fldz + FLD (%esi) # bt1 = *(b_offset + 0) + sarl $1, %eax + jle .L64 + ALIGN_3 + +.L65: +#ifdef PENTIUM4 + prefetchnta 16 * SIZE(%esi) +#endif + + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += bt1 + FLD 2 * SIZE(%esi) # bt1 = *(b_offset + 2) + + addl $2 * SIZE, %esi # b_offset += 2 + addl %ebx, %edx # a_offset += lda + + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += bt1 + FLD 2 * SIZE(%esi) # bt1 = *(b_offset + 2) + + addl $2 * SIZE, %esi # b_offset += 2 + addl %ebx, %edx # a_offset += lda + + decl %eax + jg .L65 + +.L64: + movl MIN_N, %eax + andl $1, %eax + jle .L70 + ALIGN_2 + +.L71: + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += bt1 + fldz + ALIGN_2 + +.L70: + ffreep %st(0) + +#ifndef XCONJ +#ifndef CONJ + fsubp %st, %st(3) + faddp %st, %st(1) +#else + faddp %st, %st(3) + faddp %st, %st(1) +#endif +#else +#ifndef CONJ + faddp %st, %st(3) + fsubp %st, %st(1) +#else + fsubp %st, %st(3) + fsubp %st, %st(1) +#endif +#endif + + fld %st(0) # ct4 = ct2 + fmul %st(4) + fld %st(2) + fmul %st(4) + fsubp %st, %st(1) + + movl INCY, %eax + + FLD 0 * SIZE(%ecx) + faddp %st, %st(1) + FST 0 * SIZE(%ecx) + + fmul %st(2) + fxch %st(1) + fmul %st(3) + faddp %st, %st(1) + + FLD 1 * SIZE(%ecx) + faddp %st, %st(1) + FST 1 * SIZE(%ecx) + + addl %eax, %ecx + decl %ebp + jg .L61 + +.L60: + movl PLDA_M, %esi + addl %esi, A # a += P * lda - m + addl $P, IS + movl N, %esi + cmpl %esi,IS + jl .L32 + +.L79: + ffreep %st(0) + ffreep %st(0) + + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/xgemv_t.S b/kernel/x86/xgemv_t.S new file mode 100644 index 0000000..1397a10 --- /dev/null +++ b/kernel/x86/xgemv_t.S @@ -0,0 +1,369 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef PENTIUM +#define P 88 +#endif + +#ifndef P +#define P 400 +#endif + +#define STACK 16 +#define ARGS 24 + +#define NLDA 0 + STACK(%esp) +#define XP 4 + STACK(%esp) +#define MIN_M 8 + STACK(%esp) +#define J 12 + STACK(%esp) +#define IS 16 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) + +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 32 + STACK + ARGS(%esp) +#define A 48 + STACK + ARGS(%esp) +#define LDA 52 + STACK + ARGS(%esp) +#define X 56 + STACK + ARGS(%esp) +#define INCX 60 + STACK + ARGS(%esp) +#define Y 64 + STACK + ARGS(%esp) +#define INCY 68 + STACK + ARGS(%esp) +#define BUFFER 72 + STACK + ARGS(%esp) + + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + FLD ALPHA_I + FLD ALPHA_R + + movl X, %edi # X + + movl $0, IS + + movl M, %ebx + movl N, %ecx + testl %ebx, %ebx + jle .L79 + + testl %ecx, %ecx + jle .L79 + + movl INCX, %esi + sall $ZBASE_SHIFT, %esi + movl %esi, INCX + + movl INCY, %esi + sall $ZBASE_SHIFT, %esi + movl %esi, INCY + + movl LDA, %ebx + + movl N, %eax + imull %ebx, %eax + movl $P, %esi + subl %eax, %esi + sall $ZBASE_SHIFT, %esi + movl %esi, NLDA + + movl %ebx, %esi + sall $ZBASE_SHIFT, %esi + movl %esi, LDA + ALIGN_2 + +.L32: + movl IS, %esi + + movl $P, %edx + movl M, %eax + subl %esi, %eax + cmpl %edx, %eax +#ifdef PENTIUM + jle .L33 + movl %edx, %eax +.L33: +#else + cmovg %edx, %eax +#endif + movl %eax, MIN_M + + movl IS, %ecx + sall $ZBASE_SHIFT, %ecx + leal (%edi, %ecx, 1), %ecx # xp = x + is + movl INCX, %ebx + movl %ecx, XP + cmpl $2 * SIZE, %ebx + je .L34 + + movl BUFFER, %esi + movl MIN_M, %eax + movl %esi, XP + sarl $1, %eax + jle .L35 + + ALIGN_3 + +.L36: + FLD 0 * SIZE(%edi) + FLD 1 * SIZE(%edi) + addl %ebx,%edi # x += incx + FLD 0 * SIZE(%edi) + FLD 1 * SIZE(%edi) + addl %ebx,%edi # x += incx + + FST 3 * SIZE(%esi) + FST 2 * SIZE(%esi) + FST 1 * SIZE(%esi) + FST 0 * SIZE(%esi) + + addl $4 * SIZE, %esi # xp += 4 + decl %eax + jg .L36 + ALIGN_3 + +.L35: + movl MIN_M, %eax + andl $1,%eax + jle .L34 + + FLD 0 * SIZE(%edi) + FLD 1 * SIZE(%edi) + addl %ebx,%edi # x += incx + FST 1 * SIZE(%esi) + FST 0 * SIZE(%esi) + ALIGN_3 + +/* Main Routine */ + +.L34: + movl Y, %ebp # coffset = y + + movl N, %ecx + testl %ecx, %ecx + jle .L60 + ALIGN_2 + +.L61: + movl A, %ebx # a_offset = a + fldz # ct1 = ZERO + movl LDA, %edx + fldz # ct1 = ZERO + + addl %ebx, %edx + fldz # ct1 = ZERO + movl %edx, A + fldz # ct1 = ZERO + + movl XP, %esi + + FLD (%esi) # bt1 = *(b_offset + 0) + + movl MIN_M, %eax + sarl $1, %eax + jle .L64 + ALIGN_3 + +#define PRESIZE 8 + +.L65: +#ifdef HAS_PREFETCH + prefetcht0 PRESIZE * SIZE(%ebx) + prefetcht0 PRESIZE * SIZE(%esi) +#endif + + FLD 0 * SIZE(%ebx) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(%ebx) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(%ebx) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 1 * SIZE(%ebx) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += bt1 + FLD 2 * SIZE(%esi) # bt1 = *(b_offset + 1) + + FLD 2 * SIZE(%ebx) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 3 * SIZE(%ebx) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 3 * SIZE(%esi) # bt1 = *(b_offset + 1) + + FLD 2 * SIZE(%ebx) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 3 * SIZE(%ebx) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += bt1 + FLD 4 * SIZE(%esi) # bt1 = *(b_offset + 1) + + addl $4 * SIZE, %esi + addl $4 * SIZE, %ebx + decl %eax + jg .L65 + ALIGN_3 + +.L64: + movl MIN_M, %eax + andl $1, %eax + jle .L70 + ALIGN_3 + +.L71: + FLD 0 * SIZE(%ebx) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(%ebx) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(%ebx) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 1 * SIZE(%ebx) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += bt1 + fldz + ALIGN_3 + +.L70: + ffreep %st(0) + +#ifndef XCONJ +#ifndef CONJ + fsubp %st, %st(3) + faddp %st, %st(1) +#else + faddp %st, %st(3) + faddp %st, %st(1) +#endif +#else +#ifndef CONJ + faddp %st, %st(3) + fsubp %st, %st(1) +#else + fsubp %st, %st(3) + fsubp %st, %st(1) +#endif +#endif + + fld %st(0) # ct4 = ct2 + fmul %st(4) + fld %st(2) + fmul %st(4) + fsubp %st, %st(1) + + FLD 0 * SIZE(%ebp) + faddp %st, %st(1) + FST 0 * SIZE(%ebp) + + fmul %st(2) + fxch %st(1) + fmul %st(3) + faddp %st, %st(1) + + FLD 1 * SIZE(%ebp) + faddp %st, %st(1) + FST 1 * SIZE(%ebp) + addl INCY, %ebp + + decl %ecx + jg .L61 + ALIGN_3 + +.L60: + movl A, %ebx + addl NLDA, %ebx + movl %ebx, A + + addl $P, IS + movl M, %esi + cmpl %esi, IS + jl .L32 + ALIGN_3 + +.L79: + ffreep %st(0) + ffreep %st(0) + + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE + diff --git a/kernel/x86/xtrsm_kernel_LT_1x1.S b/kernel/x86/xtrsm_kernel_LT_1x1.S new file mode 100644 index 0000000..e05266f --- /dev/null +++ b/kernel/x86/xtrsm_kernel_LT_1x1.S @@ -0,0 +1,493 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#define PREFETCHSIZE (5 + 4 * 10) +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 32 + STACK + ARGS(%esp) +#define A 48 + STACK + ARGS(%esp) +#define ARG_B 52 + STACK + ARGS(%esp) +#define C 56 + STACK + ARGS(%esp) +#define ARG_LDC 60 + STACK + ARGS(%esp) +#define OFFSET 64 + STACK + ARGS(%esp) + +#define I %esi +#define B %ebx +#define CO %edi +#define AO %edx +#define BO %ecx +#define LDC %ebp + +#ifndef CONJ +#define ADD1 faddp +#define ADD2 fsubrp +#define ADD3 faddp +#define ADD4 faddp +#elif defined(LN) || defined(LT) +#define ADD1 faddp +#define ADD2 faddp +#define ADD3 fsubrp +#define ADD4 faddp +#else +#define ADD1 faddp +#define ADD2 faddp +#define ADD3 faddp +#define ADD4 fsubrp +#endif + +#define PREFETCH_OFFSET 48 + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_LDC, LDC + movl ARG_B, B + + sall $ZBASE_SHIFT, LDC + + addl $8 * SIZE, A + addl $8 * SIZE, B + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + cmpl $0, M + jle .L999 + + movl N, %eax + movl %eax, J + testl %eax, %eax + jle .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movl A, AO +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, I + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + movl AORIG, AO + leal (AO, %eax, 1), AO + leal (B, %eax, 1), BO +#else + movl B, BO +#endif + + fldz + fldz + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + FLD -6 * SIZE(AO) + + FLD -6 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + + FLD -5 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) + + FLD -4 * SIZE(AO) + + FLD -4 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -3 * SIZE(BO) + fmul %st, %st(2) + + FLD -3 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + FLD -2 * SIZE(AO) + + FLD -2 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -1 * SIZE(BO) + fmul %st, %st(2) + + FLD -1 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + addl $8 * SIZE,AO + addl $8 * SIZE,BO + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L18 + ALIGN_4 + +.L16: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + addl $2 * SIZE,AO + addl $2 * SIZE,BO + + decl %eax + jne .L16 + ALIGN_4 + +.L18: + faddp %st, %st(3) + faddp %st, %st(1) + + fxch %st(1) + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + sall $ZBASE_SHIFT, %eax + + movl AORIG, AO + leal (AO, %eax, 1), AO + leal (B, %eax, 1), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(2) +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(AO) + fmul %st(1), %st + FLD -8 * SIZE(AO) + fmul %st(3), %st + FLD -7 * SIZE(AO) + fmulp %st, %st(3) + FLD -7 * SIZE(AO) + fmulp %st, %st(4) +#endif + +#if defined(RN) || defined(RT) + FLD -8 * SIZE(BO) + fmul %st(1), %st + FLD -8 * SIZE(BO) + fmul %st(3), %st + FLD -7 * SIZE(BO) + fmulp %st, %st(3) + FLD -7 * SIZE(BO) + fmulp %st, %st(4) +#endif + +#ifndef CONJ + faddp %st, %st(2) + fsubp %st, %st(2) +#else + fsubp %st, %st(2) + faddp %st, %st(2) +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -7 * SIZE(BO) + fxch %st(1) + fld %st + FST -8 * SIZE(BO) +#else + fld %st + FST -7 * SIZE(AO) + fxch %st(1) + fld %st + FST -8 * SIZE(AO) +#endif + +#ifdef LN + subl $2 * SIZE, CO +#endif + + FST 0 * SIZE(CO) + FST 1 * SIZE(CO) + +#ifndef LN + addl $2 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal (BO, %eax, 1), BO +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl I + jne .L11 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + leal (B, %eax, 1), B +#endif + +#if defined(LT) || defined(RN) + movl BO, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + + decl J + jne .L01 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zamax.S b/kernel/x86/zamax.S new file mode 100644 index 0000000..3056c1e --- /dev/null +++ b/kernel/x86/zamax.S @@ -0,0 +1,261 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + + PROLOGUE + +#define M %ebx +#define INCX %esi +#define X %ecx +#define I %edx + +#ifndef USE_MIN +#define FMOV fcmovbe +#else +#define FMOV fcmovnbe +#endif + +#include "l1param.h" + + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_M, M + movl STACK_INCX, INCX + movl STACK_X, X + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + sall $ZBASE_SHIFT, INCX + + fldz + + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + fstp %st(0) + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + addl INCX, X + decl M + jle .L999 + + cmpl $2 * SIZE, INCX + jne .L40 + + movl M, I + sarl $2, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 2 * SIZE(X) + fabs + FLD 3 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 4 * SIZE(X) + fabs + FLD 5 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 6 * SIZE(X) + fabs + FLD 7 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + addl $8 * SIZE, X + + decl I + jg .L10 + ALIGN_4 + +.L20: + movl M, I + andl $3, I + jle .L999 + ALIGN_4 + + +.L21: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + addl $2 * SIZE, X + decl I + jg .L21 + jmp .L999 + ALIGN_4 + +.L40: + movl M, I + sarl $2, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addl INCX, X + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addl INCX, X + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addl INCX, X + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addl INCX, X + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + decl I + jg .L50 + ALIGN_4 + +.L60: + movl M, I + andl $3, I + jle .L999 + ALIGN_4 + + +.L61: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + addl INCX, X + decl I + jg .L61 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + ret + + EPILOGUE diff --git a/kernel/x86/zamax_sse.S b/kernel/x86/zamax_sse.S new file mode 100644 index 0000000..60dd25b --- /dev/null +++ b/kernel/x86/zamax_sse.S @@ -0,0 +1,387 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define RET %eax +#define M %ebx +#define X %ecx +#define INCX %edx +#define I %esi +#define MM %ebp +#define XX %edi +#define TEMP %ebx + +#ifdef USE_MIN +#define maxps minps +#define maxss minss +#endif + +#ifndef HAVE_SSE2 +#define pxor xorps +#define movsd movlps +#endif + +#include "l1param.h" + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + + pxor %xmm0, %xmm0 + pxor %xmm7, %xmm7 + xor RET, RET + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + sall $ZBASE_SHIFT, INCX + movl M, MM + movl X, XX + +#ifdef USE_ABS +#ifndef HAVE_SSE2 + subl $8, %esp + movl $0x7fffffff, (%esp) + movss (%esp), %xmm7 + shufps $0, %xmm7, %xmm7 + addl $8, %esp +#else + cmpeqps %xmm7, %xmm7 + psrld $1, %xmm7 /* Generate USE_ABS */ +#endif +#endif + + movss 0 * SIZE(XX), %xmm0 + movss 1 * SIZE(XX), %xmm1 + addl INCX, XX + decl MM + +#ifdef USE_ABS + andps %xmm7, %xmm0 + andps %xmm7, %xmm1 +#endif + addps %xmm1, %xmm0 + shufps $0, %xmm0, %xmm0 + cmpl $2 * SIZE, INCX + jne .L70 + +.L30: + movl MM, I + sarl $3, I + jle .L35 + ALIGN_4 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm1 + movhps 2 * SIZE(XX), %xmm1 + movsd 4 * SIZE(XX), %xmm2 + movhps 6 * SIZE(XX), %xmm2 + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm3 + addps %xmm3, %xmm1 + maxps %xmm1, %xmm0 +#endif + + movsd 8 * SIZE(XX), %xmm1 + movhps 10 * SIZE(XX), %xmm1 + movsd 12 * SIZE(XX), %xmm2 + movhps 14 * SIZE(XX), %xmm2 + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm3 +#endif + + addps %xmm3, %xmm1 + maxps %xmm1, %xmm0 + + addl $16 * SIZE, XX + decl I + jg .L31 + ALIGN_4 + +.L35: + andl $7, MM + jle .L40 + + testl $4, MM + je .L36 + + movsd 0 * SIZE(XX), %xmm1 + movhps 2 * SIZE(XX), %xmm1 + movsd 4 * SIZE(XX), %xmm2 + movhps 6 * SIZE(XX), %xmm2 + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm3 +#endif + addps %xmm3, %xmm1 + maxps %xmm1, %xmm0 + + addl $8 * SIZE, XX + ALIGN_3 + +.L36: + testl $2, MM + je .L37 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + movss 2 * SIZE(XX), %xmm3 + movss 3 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 +#endif + addps %xmm2, %xmm1 + addps %xmm4, %xmm3 + maxss %xmm1, %xmm0 + maxss %xmm3, %xmm0 + addl $4 * SIZE, XX + ALIGN_3 + +.L37: + testl $1, MM + je .L40 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 +#endif + addps %xmm2, %xmm1 + maxss %xmm1, %xmm0 + ALIGN_4 + +.L40: + movaps %xmm0, %xmm1 + movhlps %xmm0, %xmm0 + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + maxss %xmm1, %xmm0 + jmp .L999 + ALIGN_4 + +.L70: + movl MM, I + sarl $3, I + jle .L75 + ALIGN_4 + +.L71: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm1 + addl INCX, XX + movhps 0 * SIZE(XX), %xmm1 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm2 + addl INCX, XX + movhps 0 * SIZE(XX), %xmm2 + addl INCX, XX + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm3 +#endif + addps %xmm3, %xmm1 + maxps %xmm1, %xmm0 + + movsd 0 * SIZE(XX), %xmm1 + addl INCX, XX + movhps 0 * SIZE(XX), %xmm1 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm2 + addl INCX, XX + movhps 0 * SIZE(XX), %xmm2 + addl INCX, XX + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm3 +#endif + addps %xmm3, %xmm1 + maxps %xmm1, %xmm0 + decl I + jg .L71 + ALIGN_4 + +.L75: + andl $7, MM + jle .L80 + + testl $4, MM + je .L76 + + movsd 0 * SIZE(XX), %xmm1 + addl INCX, XX + movhps 0 * SIZE(XX), %xmm1 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm2 + addl INCX, XX + movhps 0 * SIZE(XX), %xmm2 + addl INCX, XX + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm3 +#endif + addps %xmm3, %xmm1 + maxps %xmm1, %xmm0 + ALIGN_3 + +.L76: + testl $2, MM + je .L77 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + addl INCX, XX + movss 0 * SIZE(XX), %xmm3 + movss 1 * SIZE(XX), %xmm4 + addl INCX, XX + +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 +#endif + addps %xmm2, %xmm1 + addps %xmm4, %xmm3 + maxss %xmm1, %xmm0 + maxss %xmm3, %xmm0 + ALIGN_3 + +.L77: + testl $1, MM + je .L80 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 +#endif + addps %xmm2, %xmm1 + maxss %xmm1, %xmm0 + ALIGN_4 + +.L80: + movaps %xmm0, %xmm1 + movhlps %xmm0, %xmm0 + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + maxss %xmm1, %xmm0 + ALIGN_4 + +.L999: + subl $8, %esp + movss %xmm0, (%esp) + flds (%esp) + addl $8, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zamax_sse2.S b/kernel/x86/zamax_sse2.S new file mode 100644 index 0000000..50adffb --- /dev/null +++ b/kernel/x86/zamax_sse2.S @@ -0,0 +1,373 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define RET %eax +#define M %ebx +#define X %ecx +#define INCX %edx +#define I %esi +#define MM %ebp +#define XX %edi +#define TEMP %ebx + +#ifdef USE_MIN +#define maxpd minpd +#define maxsd minsd +#endif + +#include "l1param.h" + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + + pxor %xmm0, %xmm0 + pxor %xmm7, %xmm7 + xor RET, RET + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + sall $ZBASE_SHIFT, INCX + movl M, MM + movl X, XX + + cmpeqpd %xmm7, %xmm7 + psrlq $1, %xmm7 + + movsd 0 * SIZE(XX), %xmm0 + movsd 1 * SIZE(XX), %xmm1 + addl INCX, XX + decl MM + andpd %xmm7, %xmm0 + andpd %xmm7, %xmm1 + addpd %xmm1, %xmm0 + unpcklpd %xmm0, %xmm0 + cmpl $2 * SIZE, INCX + jne .L60 + + movl MM, I + sarl $3, I + jle .L25 + ALIGN_4 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + movhpd 2 * SIZE(XX), %xmm1 + movhpd 3 * SIZE(XX), %xmm2 + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + + movsd 4 * SIZE(XX), %xmm3 + movsd 5 * SIZE(XX), %xmm4 + movhpd 6 * SIZE(XX), %xmm3 + movhpd 7 * SIZE(XX), %xmm4 + + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + addpd %xmm4, %xmm3 + maxpd %xmm3, %xmm0 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(XX) +#endif + + movsd 8 * SIZE(XX), %xmm1 + movsd 9 * SIZE(XX), %xmm2 + movhpd 10 * SIZE(XX), %xmm1 + movhpd 11 * SIZE(XX), %xmm2 + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + + movsd 12 * SIZE(XX), %xmm3 + movsd 13 * SIZE(XX), %xmm4 + movhpd 14 * SIZE(XX), %xmm3 + movhpd 15 * SIZE(XX), %xmm4 + + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + addpd %xmm4, %xmm3 + maxpd %xmm3, %xmm0 + + addl $16 * SIZE, XX + decl I + jg .L21 + ALIGN_4 + +.L25: + andl $7, MM + jle .L30 + + testl $4, MM + je .L26 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + movhpd 2 * SIZE(XX), %xmm1 + movhpd 3 * SIZE(XX), %xmm2 + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + + movsd 4 * SIZE(XX), %xmm3 + movsd 5 * SIZE(XX), %xmm4 + movhpd 6 * SIZE(XX), %xmm3 + movhpd 7 * SIZE(XX), %xmm4 + + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + addpd %xmm4, %xmm3 + maxpd %xmm3, %xmm0 + addl $8 * SIZE, XX + ALIGN_3 + +.L26: + testl $2, MM + je .L27 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + movhpd 2 * SIZE(XX), %xmm1 + movhpd 3 * SIZE(XX), %xmm2 + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + + addl $4 * SIZE, XX + ALIGN_3 + +.L27: + testl $1, MM + je .L30 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxsd %xmm1, %xmm0 + ALIGN_4 + +.L30: + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + jmp .L999 + ALIGN_3 + +.L60: + movl MM, I + sarl $3, I + jle .L65 + ALIGN_4 + +.L61: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm1 + movhpd 1 * SIZE(XX), %xmm2 + addl INCX, XX + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + + movsd 0 * SIZE(XX), %xmm3 + movsd 1 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm3 + movhpd 1 * SIZE(XX), %xmm4 + addl INCX, XX + + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + addpd %xmm4, %xmm3 + maxpd %xmm3, %xmm0 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm1 + movhpd 1 * SIZE(XX), %xmm2 + addl INCX, XX + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + + movsd 0 * SIZE(XX), %xmm3 + movsd 1 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm3 + movhpd 1 * SIZE(XX), %xmm4 + addl INCX, XX + + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + addpd %xmm4, %xmm3 + maxpd %xmm3, %xmm0 + + decl I + jg .L61 + ALIGN_4 + +.L65: + andl $7, MM + jle .L70 + + testl $4, MM + je .L66 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm1 + movhpd 1 * SIZE(XX), %xmm2 + addl INCX, XX + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + + movsd 0 * SIZE(XX), %xmm3 + movsd 1 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm3 + movhpd 1 * SIZE(XX), %xmm4 + addl INCX, XX + + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + addpd %xmm4, %xmm3 + maxpd %xmm3, %xmm0 + ALIGN_3 + +.L66: + testl $2, MM + je .L67 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm1 + movhpd 1 * SIZE(XX), %xmm2 + addl INCX, XX + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + ALIGN_3 + +.L67: + testl $1, MM + je .L70 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxsd %xmm1, %xmm0 + ALIGN_3 + +.L70: + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + ALIGN_4 + +.L999: + subl $8, %esp + movsd %xmm0, (%esp) + fldl (%esp) + addl $8, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zasum.S b/kernel/x86/zasum.S new file mode 100644 index 0000000..84b8f60 --- /dev/null +++ b/kernel/x86/zasum.S @@ -0,0 +1,228 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define M %edx +#define X %ecx +#define INCX %esi + +#define I %eax + +#include "l1param.h" + + PROLOGUE + + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + + fldz + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + sall $ZBASE_SHIFT, INCX + + fldz + fldz + fldz + cmpl $SIZE * 2, INCX + jne .L40 + + movl M, I + sarl $2, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + FLD 2 * SIZE(X) + fabs + FLD 3 * SIZE(X) + fabs + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 4 * SIZE(X) + fabs + FLD 5 * SIZE(X) + fabs + FLD 6 * SIZE(X) + fabs + FLD 7 * SIZE(X) + fabs + + addl $8 * SIZE, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decl I + jg .L10 + ALIGN_4 + +.L20: + movl M, I + andl $3, I + jle .L998 + ALIGN_4 + + +.L21: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st,%st(3) + faddp %st,%st(1) + addl $2 * SIZE, X + decl I + jg .L21 + jmp .L998 + ALIGN_4 + +.L40: + movl M, I + sarl $2, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + addl INCX, X + fabs + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + addl INCX, X + fabs + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + addl INCX, X + fabs + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + addl INCX, X + fabs + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decl I + jg .L50 + ALIGN_4 + +.L60: + movl M, I + andl $3, I + jle .L998 + ALIGN_4 + + +.L61: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + addl INCX, X + fabs + faddp %st,%st(3) + faddp %st,%st(1) + decl I + jg .L61 + ALIGN_4 + +.L998: + faddp %st,%st(2) + faddp %st,%st(1) + faddp %st,%st(1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + ret + + EPILOGUE diff --git a/kernel/x86/zasum_sse.S b/kernel/x86/zasum_sse.S new file mode 100644 index 0000000..ff8230c --- /dev/null +++ b/kernel/x86/zasum_sse.S @@ -0,0 +1,341 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define I %eax +#define M %ecx +#define X %esi +#define INCX %ebx + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + +#ifdef HAVE_SSE2 + pcmpeqb %xmm3, %xmm3 + psrld $1, %xmm3 +#else + movl $0x7fffffff, STACK_M + movss STACK_M, %xmm3 + shufps $0, %xmm3, %xmm3 +#endif + + sall $ZBASE_SHIFT, INCX + + cmpl $2 * SIZE, INCX + jne .L100 + + subl $-32 * SIZE, X + addl M, M + + cmpl $3, M + jle .L18 + + testl $4, X + je .L05 + movss -32 * SIZE(X), %xmm0 + andps %xmm3, %xmm0 + addl $SIZE, X + decl M + jle .L999 + ALIGN_3 + +.L05: + testl $8, X + je .L10 + + movsd -32 * SIZE(X), %xmm1 + andps %xmm3, %xmm1 + addl $2 * SIZE, X + subl $2, M + jle .L999 + ALIGN_3 + +.L10: + movl M, I + sarl $5, I + jle .L14 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -24 * SIZE(X), %xmm6 + movaps -20 * SIZE(X), %xmm7 + + decl I + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + + andps %xmm3, %xmm5 + addps %xmm5, %xmm1 + movaps -12 * SIZE(X), %xmm5 + + andps %xmm3, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(X), %xmm6 + + andps %xmm3, %xmm7 + addps %xmm7, %xmm1 + movaps -4 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + andps %xmm3, %xmm5 + addps %xmm5, %xmm1 + movaps 4 * SIZE(X), %xmm5 + + andps %xmm3, %xmm6 + addps %xmm6, %xmm0 + movaps 8 * SIZE(X), %xmm6 + + andps %xmm3, %xmm7 + addps %xmm7, %xmm1 + movaps 12 * SIZE(X), %xmm7 + + subl $-32 * SIZE, X + decl I + jg .L11 + ALIGN_3 + +.L12: + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + + andps %xmm3, %xmm5 + addps %xmm5, %xmm1 + movaps -12 * SIZE(X), %xmm5 + + andps %xmm3, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(X), %xmm6 + + andps %xmm3, %xmm7 + addps %xmm7, %xmm1 + movaps -4 * SIZE(X), %xmm7 + + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + andps %xmm3, %xmm5 + addps %xmm5, %xmm1 + + andps %xmm3, %xmm6 + addps %xmm6, %xmm0 + andps %xmm3, %xmm7 + addps %xmm7, %xmm1 + + addl $32 * SIZE, X + ALIGN_3 + +.L14: + testl $16, M + je .L16 + + movaps -32 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + + movaps -28 * SIZE(X), %xmm5 + andps %xmm3, %xmm5 + addps %xmm5, %xmm1 + + movaps -24 * SIZE(X), %xmm6 + andps %xmm3, %xmm6 + addps %xmm6, %xmm0 + + movaps -20 * SIZE(X), %xmm7 + andps %xmm3, %xmm7 + addps %xmm7, %xmm1 + + addl $16 * SIZE, X + ALIGN_3 + +.L16: + testl $8, M + je .L17 + + movaps -32 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + + movaps -28 * SIZE(X), %xmm5 + andps %xmm3, %xmm5 + addps %xmm5, %xmm1 + + addl $8 * SIZE, X + ALIGN_3 + +.L17: + testl $4, M + je .L18 + + movaps -32 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + addl $4 * SIZE, X + ALIGN_3 + +.L18: + testl $2, M + je .L19 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addps %xmm4, %xmm1 + addl $2 * SIZE, X + ALIGN_3 + +.L19: + testl $1, M + je .L999 + + movss -32 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + jmp .L999 + ALIGN_4 + +.L100: + movl M, I + sarl $2, I + jle .L105 + ALIGN_4 + +.L101: + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X + + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X + + andps %xmm3, %xmm5 + addps %xmm5, %xmm1 + + decl I + jg .L101 + ALIGN_4 + +.L105: +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + andl $3, M + jle .L999 + ALIGN_4 + +.L106: + movsd (X), %xmm4 + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + addl INCX, X + decl M + jg .L106 + ALIGN_4 + +.L999: + addps %xmm1, %xmm0 + +#ifndef HAVE_SSE3 + movhlps %xmm0, %xmm1 + addps %xmm1, %xmm0 + + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + addss %xmm1, %xmm0 +#else + haddps %xmm0, %xmm0 + haddps %xmm0, %xmm0 +#endif + + movss %xmm0, STACK_M + flds STACK_M + + popl %ebx + popl %esi + ret + + EPILOGUE diff --git a/kernel/x86/zasum_sse2.S b/kernel/x86/zasum_sse2.S new file mode 100644 index 0000000..b7dbc15 --- /dev/null +++ b/kernel/x86/zasum_sse2.S @@ -0,0 +1,320 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define I %eax +#define M %ecx +#define X %esi +#define INCX %ebx + +#define xmm8 xmm4 +#define xmm9 xmm5 +#define xmm10 xmm6 +#define xmm11 xmm7 + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + pcmpeqb %xmm3, %xmm3 + psrlq $1, %xmm3 + + sall $ZBASE_SHIFT, INCX + + cmpl $2 * SIZE, INCX + jne .L40 + + subl $-16 * SIZE, X + addl M, M + + testl $SIZE, X + je .L05 + + movsd -16 * SIZE(X), %xmm0 + addl $SIZE, X + + andps %xmm3, %xmm0 + subl $1, M + jle .L999 + ALIGN_3 + +.L05: + movl M, I + sarl $4, I + jle .L20 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + decl I + jle .L11 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + movaps -8 * SIZE(X), %xmm4 + + andps %xmm3, %xmm5 + addpd %xmm5, %xmm1 + movaps -6 * SIZE(X), %xmm5 + + andps %xmm3, %xmm6 + addpd %xmm6, %xmm0 + movaps -4 * SIZE(X), %xmm6 + + andps %xmm3, %xmm7 + addpd %xmm7, %xmm1 + movaps -2 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + andps %xmm3, %xmm5 + addpd %xmm5, %xmm1 + movaps 2 * SIZE(X), %xmm5 + + andps %xmm3, %xmm6 + addpd %xmm6, %xmm0 + movaps 4 * SIZE(X), %xmm6 + + andps %xmm3, %xmm7 + addpd %xmm7, %xmm1 + movaps 6 * SIZE(X), %xmm7 + + subl $-16 * SIZE, X + decl I + jg .L10 + ALIGN_4 + +.L11: + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + movaps -8 * SIZE(X), %xmm4 + + andps %xmm3, %xmm5 + addpd %xmm5, %xmm1 + movaps -6 * SIZE(X), %xmm5 + + andps %xmm3, %xmm6 + addpd %xmm6, %xmm0 + movaps -4 * SIZE(X), %xmm6 + + andps %xmm3, %xmm7 + addpd %xmm7, %xmm1 + movaps -2 * SIZE(X), %xmm7 + + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + andps %xmm3, %xmm5 + addpd %xmm5, %xmm1 + andps %xmm3, %xmm6 + addpd %xmm6, %xmm0 + andps %xmm3, %xmm7 + addpd %xmm7, %xmm1 + + subl $-16 * SIZE, X + ALIGN_3 + +.L20: + andl $15, M + jle .L999 + + testl $8, M + je .L21 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + andps %xmm3, %xmm5 + addpd %xmm5, %xmm1 + andps %xmm3, %xmm6 + addpd %xmm6, %xmm0 + andps %xmm3, %xmm7 + addpd %xmm7, %xmm1 + + addl $8 * SIZE, X + ALIGN_3 + +.L21: + testl $4, M + je .L22 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + andps %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + addl $4 * SIZE, X + ALIGN_3 + +.L22: + testl $2, M + je .L23 + + movaps -16 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + addl $2 * SIZE, X + +.L23: + testl $1, M + je .L999 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -16 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addsd %xmm4, %xmm0 + jmp .L999 + ALIGN_3 + +.L40: + movl M, I + sarl $2, I + jle .L60 + ALIGN_4 + +.L50: + movsd 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addl INCX, X + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + + movsd 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addl INCX, X + andps %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movsd 0 * SIZE(X), %xmm6 + movhps 1 * SIZE(X), %xmm6 + addl INCX, X + andps %xmm3, %xmm6 + addpd %xmm6, %xmm0 + + movsd 0 * SIZE(X), %xmm7 + movhps 1 * SIZE(X), %xmm7 + addl INCX, X + andps %xmm3, %xmm7 + addpd %xmm7, %xmm1 + + decl I + jg .L50 + ALIGN_4 + +.L60: + andl $3, M + jle .L999 + ALIGN_4 + + +.L61: + movsd 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + addl INCX, X + decl M + jg .L61 + ALIGN_4 + +.L999: + addpd %xmm1, %xmm0 + +#ifndef HAVE_SSE3 + movaps %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + addsd %xmm1, %xmm0 +#else + haddpd %xmm0, %xmm0 +#endif + + movsd %xmm0, STACK_M + fldl STACK_M + + popl %ebx + popl %esi + ret + + EPILOGUE diff --git a/kernel/x86/zaxpy.S b/kernel/x86/zaxpy.S new file mode 100644 index 0000000..0894f5d --- /dev/null +++ b/kernel/x86/zaxpy.S @@ -0,0 +1,348 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define STACK_ALPHA_R 16 + STACK + ARGS(%esp) +#define STACK_ALPHA_I 24 + STACK + ARGS(%esp) +#define STACK_X 32 + STACK + ARGS(%esp) +#define STACK_INCX 36 + STACK + ARGS(%esp) +#define STACK_Y 40 + STACK + ARGS(%esp) +#define STACK_INCY 44 + STACK + ARGS(%esp) +#else +#define STACK_ALPHA_R 16 + STACK + ARGS(%esp) +#define STACK_ALPHA_I 20 + STACK + ARGS(%esp) +#define STACK_X 24 + STACK + ARGS(%esp) +#define STACK_INCX 28 + STACK + ARGS(%esp) +#define STACK_Y 32 + STACK + ARGS(%esp) +#define STACK_INCY 36 + STACK + ARGS(%esp) +#endif + +#define M %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#ifndef CONJ +#define ADD1 fsubrp +#define ADD2 faddp +#else +#define ADD1 faddp +#define ADD2 fsubrp +#endif + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + FLD STACK_ALPHA_I + FLD STACK_ALPHA_R + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + addl INCX, INCX + addl INCY, INCY + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + testl M, M + jle .L40 + + cmpl $2 * SIZE, INCX + jne .L14 + cmpl $2 * SIZE, INCY + jne .L14 + + movl M, %eax + sarl $2, %eax + jle .L15 + ALIGN_3 + +.L16: + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FADD 0 * SIZE(Y) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FADD 1 * SIZE(Y) + FST 1 * SIZE(Y) + + FLD 2 * SIZE(X) + fmul %st(1), %st + FLD 3 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FADD 2 * SIZE(Y) + FST 2 * SIZE(Y) + + FLD 2 * SIZE(X) + fmul %st(2), %st + FLD 3 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FADD 3 * SIZE(Y) + FST 3 * SIZE(Y) + + FLD 4 * SIZE(X) + fmul %st(1), %st + FLD 5 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FADD 4 * SIZE(Y) + FST 4 * SIZE(Y) + + FLD 4 * SIZE(X) + fmul %st(2), %st + FLD 5 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FADD 5 * SIZE(Y) + FST 5 * SIZE(Y) + + FLD 6 * SIZE(X) + fmul %st(1), %st + FLD 7 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FADD 6 * SIZE(Y) + FST 6 * SIZE(Y) + + FLD 6 * SIZE(X) + fmul %st(2), %st + FLD 7 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FADD 7 * SIZE(Y) + FST 7 * SIZE(Y) + +#ifdef HAVE_3DNOW + prefetch 20 * SIZE(X) + prefetchw 20 * SIZE(Y) +#endif + + addl $8 * SIZE, X + addl $8 * SIZE, Y + decl %eax + jg .L16 + ALIGN_3 + +.L15: + movl M, %eax + andl $3, %eax + jle .L40 + ALIGN_3 + +.L22: + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FADD 0 * SIZE(Y) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FADD 1 * SIZE(Y) + FST 1 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + decl %eax + jg .L22 + jmp .L40 + ALIGN_3 + +.L14: + movl M, %eax + sarl $2, %eax + jle .L28 + ALIGN_3 + +.L29: + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FADD 0 * SIZE(Y) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FADD 1 * SIZE(Y) + FST 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FADD 0 * SIZE(Y) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FADD 1 * SIZE(Y) + FST 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FADD 0 * SIZE(Y) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FADD 1 * SIZE(Y) + FST 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FADD 0 * SIZE(Y) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FADD 1 * SIZE(Y) + FST 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + decl %eax + jg .L29 + ALIGN_3 + +.L28: + movl M, %eax + andl $3, %eax + jle .L40 + ALIGN_3 + +.L35: + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FADD 0 * SIZE(Y) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FADD 1 * SIZE(Y) + FST 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + decl %eax + jg .L35 + ALIGN_3 + +.L40: + ffreep %st(0) + ffreep %st(0) + xorl %eax,%eax + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/zaxpy_sse.S b/kernel/x86/zaxpy_sse.S new file mode 100644 index 0000000..edd9929 --- /dev/null +++ b/kernel/x86/zaxpy_sse.S @@ -0,0 +1,3103 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esp) +#define STACK_ALPHA_I 20 + STACK + ARGS(%esp) +#define STACK_X 24 + STACK + ARGS(%esp) +#define STACK_INCX 28 + STACK + ARGS(%esp) +#define STACK_Y 32 + STACK + ARGS(%esp) +#define STACK_INCY 36 + STACK + ARGS(%esp) + +#define M %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx +#define YY %ebp + +#define ALPHA_R %xmm6 +#define ALPHA_I %xmm7 + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl STACK_M, M + movss STACK_ALPHA_R, ALPHA_R + movss STACK_ALPHA_I, ALPHA_I + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + + testl M, M + jle .L999 + + cmpl $2 * SIZE, INCX + jne .L100 + cmpl $2 * SIZE, INCY + jne .L100 + +#ifdef HAVE_SSE2 + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 +#else + movl $0x80000000, STACK_M + movss STACK_M, %xmm5 + shufps $0x11, %xmm5, %xmm5 +#endif + + shufps $0, ALPHA_R, ALPHA_R + shufps $0, ALPHA_I, ALPHA_I + +#ifndef CONJ + shufps $0xb1, %xmm5, %xmm5 + xorps %xmm5, ALPHA_I +#else + xorps %xmm5, ALPHA_R +#endif + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + testl $2 * SIZE, Y + je .L10 + +#ifndef HAVE_SSE2 + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(X), %xmm0 +#ifndef HAVE_SSE2 + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(Y), %xmm1 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + + addps %xmm5, %xmm0 + addps %xmm1, %xmm0 + + movlps %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + decl M + jle .L999 + ALIGN_2 + +.L10: + testl $SIZE, Y + jne .L50 + + testl $3 * SIZE, X + jne .L20 + + movl M, %eax + sarl $4, %eax + jle .L15 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + decl %eax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -16 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -12 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -24 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -8 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -20 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -4 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -16 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps 0 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -12 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -12 * SIZE(Y) + movaps 4 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -8 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -8 * SIZE(Y) + movaps 8 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -4 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -4 * SIZE(Y) + movaps 12 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L11 + ALIGN_3 + +.L12: + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -16 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -12 * SIZE(X), %xmm1 + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -24 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -8 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -20 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -4 * SIZE(X), %xmm3 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -16 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -12 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -12 * SIZE(Y) + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -8 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -8 * SIZE(Y) + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -4 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L15: + testl $8, M + jle .L16 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -24 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -20 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_2 + +.L16: + testl $4, M + jle .L17 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_2 + +.L17: + testl $2, M + jle .L18 + + movaps -32 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_2 + +.L18: + testl $1, M + jle .L999 + +#ifndef HAVE_SSE2 + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + +#ifndef HAVE_SSE2 + xorps %xmm1, %xmm1 + movlps -32 * SIZE(Y), %xmm1 +#else + movsd -32 * SIZE(Y), %xmm1 +#endif + addps %xmm1, %xmm0 + addps %xmm5, %xmm0 + movlps %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L20: +#ifdef ALIGNED_ACCESS + + testl $2 * SIZE, X + jne .L30 + + subl $1 * SIZE, X + + movaps -32 * SIZE(X), %xmm0 + + movl M, %eax + sarl $4, %eax + jle .L25 + + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + decl %eax + jle .L22 + ALIGN_3 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -16 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -12 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -24 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -8 * SIZE(X), %xmm2 + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -20 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -4 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -16 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps 0 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -12 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -12 * SIZE(Y) + movaps 4 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -8 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -8 * SIZE(Y) + movaps 8 * SIZE(X), %xmm2 + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -4 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -4 * SIZE(Y) + movaps 12 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L21 + ALIGN_3 + +.L22: + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -16 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -12 * SIZE(X), %xmm1 + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -24 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -8 * SIZE(X), %xmm2 + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -20 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -4 * SIZE(X), %xmm3 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -16 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps 0 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -12 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -12 * SIZE(Y) + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -8 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -8 * SIZE(Y) + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -4 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L25: + testl $8, M + jle .L26 + + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -20 * SIZE(X), %xmm3 + movaps -16 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -24 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -20 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_2 + +.L26: + testl $4, M + jle .L27 + + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_2 + +.L27: + testl $2, M + jle .L28 + + movaps -28 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movaps %xmm1, %xmm0 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_2 + +.L28: + testl $1, M + jle .L999 + + PSHUFD2($0x06, %xmm0, %xmm5) + PSHUFD2($0x09, %xmm0, %xmm0) + + mulps ALPHA_I, %xmm5 + mulps ALPHA_R, %xmm0 + +#ifndef HAVE_SSE2 + xorps %xmm1, %xmm1 + movlps -32 * SIZE(Y), %xmm1 +#else + movsd -32 * SIZE(Y), %xmm1 +#endif + addps %xmm1, %xmm0 + addps %xmm5, %xmm0 + + movlps %xmm0, -32 * SIZE(Y) + + jmp .L999 + ALIGN_3 + +.L30: + testl $1 * SIZE, X + jne .L40 +#endif + + movl M, %eax + sarl $4, %eax + jle .L35 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + movsd -24 * SIZE(X), %xmm2 + movhps -22 * SIZE(X), %xmm2 + movsd -20 * SIZE(X), %xmm3 + movhps -18 * SIZE(X), %xmm3 + + decl %eax + jle .L32 + ALIGN_3 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movsd -16 * SIZE(X), %xmm0 + movhps -14 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movsd -12 * SIZE(X), %xmm1 + movhps -10 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -24 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movsd -8 * SIZE(X), %xmm2 + movhps -6 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -20 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movsd -4 * SIZE(X), %xmm3 + movhps -2 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -16 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movsd 0 * SIZE(X), %xmm0 + movhps 2 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -12 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -12 * SIZE(Y) + movsd 4 * SIZE(X), %xmm1 + movhps 6 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -8 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -8 * SIZE(Y) + movsd 8 * SIZE(X), %xmm2 + movhps 10 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -4 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -4 * SIZE(Y) + movsd 12 * SIZE(X), %xmm3 + movhps 14 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L31 + ALIGN_3 + +.L32: + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movsd -16 * SIZE(X), %xmm0 + movhps -14 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movsd -12 * SIZE(X), %xmm1 + movhps -10 * SIZE(X), %xmm1 + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -24 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movsd -8 * SIZE(X), %xmm2 + movhps -6 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -20 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movsd -4 * SIZE(X), %xmm3 + movhps -2 * SIZE(X), %xmm3 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -16 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -12 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -12 * SIZE(Y) + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -8 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -8 * SIZE(Y) + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -4 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L35: + testl $8, M + jle .L36 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movsd -24 * SIZE(X), %xmm2 + movhps -22 * SIZE(X), %xmm2 + movsd -20 * SIZE(X), %xmm3 + movhps -18 * SIZE(X), %xmm3 + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -24 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -20 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_2 + +.L36: + testl $4, M + jle .L37 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_2 + +.L37: + testl $2, M + jle .L38 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_2 + +.L38: + testl $1, M + jle .L999 + + movsd -32 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movlps %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 + +#ifdef ALIGNED_ACCESS + +.L40: + subl $3 * SIZE, X + + movaps -32 * SIZE(X), %xmm0 + + movl M, %eax + sarl $4, %eax + jle .L45 + + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + decl %eax + jle .L42 + ALIGN_3 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -16 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -12 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -24 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -8 * SIZE(X), %xmm2 + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -20 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -4 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -16 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps 0 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -12 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -12 * SIZE(Y) + movaps 4 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -8 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -8 * SIZE(Y) + movaps 8 * SIZE(X), %xmm2 + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -4 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -4 * SIZE(Y) + movaps 12 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L41 + ALIGN_3 + +.L42: + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -16 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -12 * SIZE(X), %xmm1 + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -24 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -8 * SIZE(X), %xmm2 + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -20 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -4 * SIZE(X), %xmm3 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -16 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps 0 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -12 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -12 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -8 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -8 * SIZE(Y) + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -4 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L45: + testl $8, M + jle .L46 + + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -20 * SIZE(X), %xmm3 + movaps -16 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -24 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -20 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_2 + +.L46: + testl $4, M + jle .L47 + + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_2 + +.L47: + testl $2, M + jle .L48 + + movaps -28 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movaps %xmm1, %xmm0 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_2 + +.L48: + testl $1, M + jle .L999 + + movaps -28 * SIZE(X), %xmm1 + movsd -32 * SIZE(Y), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + + addps %xmm5, %xmm0 + addps %xmm2, %xmm0 + movlps %xmm0, -32 * SIZE(Y) + + jmp .L999 + ALIGN_3 +#endif + +.L50: + xorps %xmm0, %xmm0 + + subl $1 * SIZE, Y + + testl $3 * SIZE, X + jne .L60 + + movl M, %eax + sarl $4, %eax + jle .L55 + + movaps -32 * SIZE(X), %xmm1 + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + + decl %eax + jle .L52 + ALIGN_3 + +.L51: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -20 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -16 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -12 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -8 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -4 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + movaps 0 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + movaps 4 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + movaps 8 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L51 + ALIGN_3 + +.L52: + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -20 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -16 * SIZE(X), %xmm1 + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -12 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -8 * SIZE(X), %xmm3 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -4 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L55: + testl $8, M + jle .L56 + + movaps -32 * SIZE(X), %xmm1 + movaps -28 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -24 * SIZE(X), %xmm3 + movaps -20 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_2 + +.L56: + testl $4, M + jle .L57 + + movaps -32 * SIZE(X), %xmm1 + movaps -28 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_2 + +.L57: + testl $2, M + jle .L58 + + movaps -32 * SIZE(X), %xmm1 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, %xmm0 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_2 + +.L58: + testl $1, M + jle .L59 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(X), %xmm1 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L59: + shufps $0x93, %xmm0, %xmm0 + + addss -32 * SIZE(Y), %xmm0 + movss %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L60: +#ifdef ALIGNED_ACCESS + + testl $2 * SIZE, X + jne .L70 + + subl $1 * SIZE, X + + movaps -32 * SIZE(X), %xmm1 + + movl M, %eax + sarl $4, %eax + jle .L65 + + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + + decl %eax + jle .L62 + ALIGN_3 + +.L61: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -20 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -16 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -12 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -8 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -4 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + movaps 0 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + movaps 4 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + movaps 8 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L61 + ALIGN_3 + +.L62: + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -20 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -16 * SIZE(X), %xmm1 + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -12 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -8 * SIZE(X), %xmm3 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -4 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + movaps 0 * SIZE(X), %xmm1 + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L65: + testl $8, M + jle .L66 + + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -20 * SIZE(X), %xmm0 + movaps -16 * SIZE(X), %xmm1 + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_2 + +.L66: + testl $4, M + jle .L67 + + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_2 + +.L67: + testl $2, M + jle .L68 + + movaps -28 * SIZE(X), %xmm2 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movaps %xmm1, %xmm0 + movaps %xmm2, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_2 + +.L68: + testl $1, M + jle .L69 + + movaps -28 * SIZE(X), %xmm2 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + movlps %xmm0, -32 * SIZE(Y) + movhlps %xmm0, %xmm0 + movss %xmm0, -30 * SIZE(Y) + jmp .L999 + +.L69: + shufps $0x93, %xmm0, %xmm0 + + addss -32 * SIZE(Y), %xmm0 + movss %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L70: + testl $1 * SIZE, X + jne .L80 +#endif + + movl M, %eax + sarl $4, %eax + jle .L75 + + movsd -32 * SIZE(X), %xmm1 + movhps -30 * SIZE(X), %xmm1 + movsd -28 * SIZE(X), %xmm2 + movhps -26 * SIZE(X), %xmm2 + movsd -24 * SIZE(X), %xmm3 + movhps -22 * SIZE(X), %xmm3 + + decl %eax + jle .L72 + ALIGN_3 + +.L71: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movsd -20 * SIZE(X), %xmm0 + movhps -18 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movsd -16 * SIZE(X), %xmm1 + movhps -14 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movsd -12 * SIZE(X), %xmm2 + movhps -10 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movsd -8 * SIZE(X), %xmm3 + movhps -6 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movsd -4 * SIZE(X), %xmm0 + movhps -2 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + movsd 0 * SIZE(X), %xmm1 + movhps 2 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + movsd 4 * SIZE(X), %xmm2 + movhps 6 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + movsd 8 * SIZE(X), %xmm3 + movhps 10 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L71 + ALIGN_3 + +.L72: + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movsd -20 * SIZE(X), %xmm0 + movhps -18 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movsd -16 * SIZE(X), %xmm1 + movhps -14 * SIZE(X), %xmm1 + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movsd -12 * SIZE(X), %xmm2 + movhps -10 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movsd -8 * SIZE(X), %xmm3 + movhps -6 * SIZE(X), %xmm3 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movsd -4 * SIZE(X), %xmm0 + movhps -2 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L75: + testl $8, M + jle .L76 + + movsd -32 * SIZE(X), %xmm1 + movhps -30 * SIZE(X), %xmm1 + movsd -28 * SIZE(X), %xmm2 + movhps -26 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movsd -24 * SIZE(X), %xmm3 + movhps -22 * SIZE(X), %xmm3 + movsd -20 * SIZE(X), %xmm0 + movhps -18 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_2 + +.L76: + testl $4, M + jle .L77 + + movsd -32 * SIZE(X), %xmm1 + movhps -30 * SIZE(X), %xmm1 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movsd -28 * SIZE(X), %xmm2 + movhps -26 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_2 + +.L77: + testl $2, M + jle .L78 + + movsd -32 * SIZE(X), %xmm1 + movhps -30 * SIZE(X), %xmm1 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, %xmm0 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_2 + +.L78: + testl $1, M + jle .L79 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(X), %xmm1 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L79: + shufps $0x93, %xmm0, %xmm0 + + addss -32 * SIZE(Y), %xmm0 + movss %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 + +#ifdef ALIGNED_ACCESS + +.L80: + subl $3 * SIZE, X + + movaps -32 * SIZE(X), %xmm1 + + movl M, %eax + sarl $4, %eax + jle .L85 + + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + + decl %eax + jle .L82 + ALIGN_3 + +.L81: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -20 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -16 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -12 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -8 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -4 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + movaps 0 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + movaps 4 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + movaps 8 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L81 + ALIGN_3 + +.L82: + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -20 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -16 * SIZE(X), %xmm1 + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -12 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -8 * SIZE(X), %xmm3 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -4 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + movaps 0 * SIZE(X), %xmm1 + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L85: + testl $8, M + jle .L86 + + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -20 * SIZE(X), %xmm0 + movaps -16 * SIZE(X), %xmm1 + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_2 + +.L86: + testl $4, M + jle .L87 + + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_2 + +.L87: + testl $2, M + jle .L88 + + movaps -28 * SIZE(X), %xmm2 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movaps %xmm1, %xmm0 + movaps %xmm2, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_2 + +.L88: + testl $1, M + jle .L89 + + movaps -28 * SIZE(X), %xmm2 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + movlps %xmm0, -32 * SIZE(Y) + movhlps %xmm0, %xmm0 + movss %xmm0, -30 * SIZE(Y) + jmp .L999 + +.L89: + shufps $0x93, %xmm0, %xmm0 + + addss -32 * SIZE(Y), %xmm0 + movss %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 +#endif + +.L100: + shufps $0, ALPHA_R, ALPHA_R + shufps $0, ALPHA_I, ALPHA_I + +#ifndef CONJ + xorps %xmm5, %xmm5 + subps ALPHA_I, %xmm5 + + unpcklps ALPHA_R, %xmm5 + unpcklps ALPHA_I, ALPHA_R + movaps %xmm5, ALPHA_I +#else + xorps %xmm5, %xmm5 + subps ALPHA_R, %xmm5 + + unpcklps ALPHA_I, ALPHA_R + unpcklps %xmm5, ALPHA_I +#endif + + movl Y, YY + + movl M, %eax + sarl $3, %eax + jle .L105 + ALIGN_3 + +.L102: + movsd (X), %xmm0 + addl INCX, X + movhps (X), %xmm0 + addl INCX, X + movsd (X), %xmm2 + addl INCX, X + movhps (X), %xmm2 + addl INCX, X + +#ifdef HAVE_SSE3 + movshdup %xmm0, %xmm1 + movsldup %xmm0, %xmm0 + movshdup %xmm2, %xmm3 + movsldup %xmm2, %xmm2 +#else + movaps %xmm0, %xmm1 + shufps $0xa0, %xmm0, %xmm0 + shufps $0xf5, %xmm1, %xmm1 + + movaps %xmm2, %xmm3 + shufps $0xa0, %xmm2, %xmm2 + shufps $0xf5, %xmm3, %xmm3 +#endif + + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm1 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm3 + + movsd (Y), %xmm4 + addl INCY, Y + movhps (Y), %xmm4 + addl INCY, Y + movsd (Y), %xmm5 + addl INCY, Y + movhps (Y), %xmm5 + addl INCY, Y + + addps %xmm0, %xmm4 + addps %xmm1, %xmm4 + addps %xmm2, %xmm5 + addps %xmm3, %xmm5 + + movsd %xmm4, (YY) + addl INCY, YY + movhps %xmm4, (YY) + addl INCY, YY + movsd %xmm5, (YY) + addl INCY, YY + movhps %xmm5, (YY) + addl INCY, YY + + movsd (X), %xmm0 + addl INCX, X + movhps (X), %xmm0 + addl INCX, X + movsd (X), %xmm2 + addl INCX, X + movhps (X), %xmm2 + addl INCX, X + +#ifdef HAVE_SSE3 + movshdup %xmm0, %xmm1 + movsldup %xmm0, %xmm0 + movshdup %xmm2, %xmm3 + movsldup %xmm2, %xmm2 +#else + movaps %xmm0, %xmm1 + shufps $0xa0, %xmm0, %xmm0 + shufps $0xf5, %xmm1, %xmm1 + + movaps %xmm2, %xmm3 + shufps $0xa0, %xmm2, %xmm2 + shufps $0xf5, %xmm3, %xmm3 +#endif + + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm1 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm3 + + movsd (Y), %xmm4 + addl INCY, Y + movhps (Y), %xmm4 + addl INCY, Y + movsd (Y), %xmm5 + addl INCY, Y + movhps (Y), %xmm5 + addl INCY, Y + + addps %xmm0, %xmm4 + addps %xmm1, %xmm4 + addps %xmm2, %xmm5 + addps %xmm3, %xmm5 + + movsd %xmm4, (YY) + addl INCY, YY + movhps %xmm4, (YY) + addl INCY, YY + movsd %xmm5, (YY) + addl INCY, YY + movhps %xmm5, (YY) + addl INCY, YY + + decl %eax + jg .L102 + ALIGN_3 + +.L105: + testl $4, M + jle .L106 + + movsd (X), %xmm0 + addl INCX, X + movhps (X), %xmm0 + addl INCX, X + movsd (X), %xmm2 + addl INCX, X + movhps (X), %xmm2 + addl INCX, X + +#ifdef HAVE_SSE3 + movshdup %xmm0, %xmm1 + movsldup %xmm0, %xmm0 + movshdup %xmm2, %xmm3 + movsldup %xmm2, %xmm2 +#else + movaps %xmm0, %xmm1 + shufps $0xa0, %xmm0, %xmm0 + shufps $0xf5, %xmm1, %xmm1 + + movaps %xmm2, %xmm3 + shufps $0xa0, %xmm2, %xmm2 + shufps $0xf5, %xmm3, %xmm3 +#endif + + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm1 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm3 + + movsd (Y), %xmm4 + addl INCY, Y + movhps (Y), %xmm4 + addl INCY, Y + movsd (Y), %xmm5 + addl INCY, Y + movhps (Y), %xmm5 + addl INCY, Y + + addps %xmm0, %xmm4 + addps %xmm1, %xmm4 + addps %xmm2, %xmm5 + addps %xmm3, %xmm5 + + movsd %xmm4, (YY) + addl INCY, YY + movhps %xmm4, (YY) + addl INCY, YY + movsd %xmm5, (YY) + addl INCY, YY + movhps %xmm5, (YY) + addl INCY, YY + ALIGN_3 + +.L106: + testl $2, M + jle .L107 + + movsd (X), %xmm0 + addl INCX, X + movhps (X), %xmm0 + addl INCX, X + +#ifdef HAVE_SSE3 + movshdup %xmm0, %xmm1 + movsldup %xmm0, %xmm0 +#else + movaps %xmm0, %xmm1 + shufps $0xa0, %xmm0, %xmm0 + shufps $0xf5, %xmm1, %xmm1 +#endif + + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm1 + + movsd (Y), %xmm4 + addl INCY, Y + movhps (Y), %xmm4 + addl INCY, Y + + addps %xmm0, %xmm4 + addps %xmm1, %xmm4 + + movsd %xmm4, (YY) + addl INCY, YY + movhps %xmm4, (YY) + addl INCY, YY + ALIGN_3 + +.L107: + testl $1, M + jle .L999 + + movsd (X), %xmm0 + +#ifdef HAVE_SSE3 + movshdup %xmm0, %xmm1 + movsldup %xmm0, %xmm0 +#else + movaps %xmm0, %xmm1 + shufps $0xa0, %xmm0, %xmm0 + shufps $0xf5, %xmm1, %xmm1 +#endif + + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm1 + + movsd (Y), %xmm4 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm4 + + movsd %xmm4, (Y) + ALIGN_3 + +.L999: + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE + diff --git a/kernel/x86/zaxpy_sse2.S b/kernel/x86/zaxpy_sse2.S new file mode 100644 index 0000000..40afdc3 --- /dev/null +++ b/kernel/x86/zaxpy_sse2.S @@ -0,0 +1,1522 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esp) +#define STACK_ALPHA_I 24 + STACK + ARGS(%esp) +#define STACK_X 32 + STACK + ARGS(%esp) +#define STACK_INCX 36 + STACK + ARGS(%esp) +#define STACK_Y 40 + STACK + ARGS(%esp) +#define STACK_INCY 44 + STACK + ARGS(%esp) + +#define M %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx +#define YY %ebp + +#define ALPHA_R %xmm6 +#define ALPHA_I %xmm7 + +#if defined(HAVE_SSE3) && !defined(CORE_OPTERON) +#define MOVDDUP(a, b, c) movddup a(b), c +#define MOVDDUP2(a, b, c) movddup a##b, c +#else +#define MOVDDUP(a, b, c) movlpd a(b), c;movhpd a(b), c +#define MOVDDUP2(a, b, c) movlpd a##b, c;movhpd a##b, c +#endif + +#include "l1param.h" + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_M, M + movsd STACK_ALPHA_R, %xmm0 + movsd STACK_ALPHA_I, %xmm1 + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + + testl M, M + jle .L999 + + cmpl $2 * SIZE, INCX + jne .L50 + cmpl $2 * SIZE, INCY + jne .L50 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + +#ifdef HAVE_SSE3 + movddup %xmm0, ALPHA_R + movddup %xmm1, ALPHA_I +#else + pshufd $0x44, %xmm0, ALPHA_R + pshufd $0x44, %xmm1, ALPHA_I +#endif + +#ifndef CONJ + shufps $0x0c, %xmm5, %xmm5 + xorpd %xmm5, ALPHA_I +#else + shufps $0xc0, %xmm5, %xmm5 + xorpd %xmm5, ALPHA_R +#endif + + testl $SIZE, Y + jne .L30 + + testl $SIZE, X + jne .L20 + + movl M, %eax + sarl $3, %eax + jle .L15 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + decl %eax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -8 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movaps -6 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd -12 * SIZE(Y), %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movaps -4 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd -10 * SIZE(Y), %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movaps -2 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd -8 * SIZE(Y), %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movaps 0 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd -6 * SIZE(Y), %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -6 * SIZE(Y) + movaps 2 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd -4 * SIZE(Y), %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movaps 4 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd -2 * SIZE(Y), %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -2 * SIZE(Y) + movaps 6 * SIZE(X), %xmm3 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + decl %eax + jg .L11 + ALIGN_3 + +.L12: + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -8 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movaps -6 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd -12 * SIZE(Y), %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movaps -4 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd -10 * SIZE(Y), %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movaps -2 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd -8 * SIZE(Y), %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd -6 * SIZE(Y), %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -6 * SIZE(Y) + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd -4 * SIZE(Y), %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd -2 * SIZE(Y), %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -2 * SIZE(Y) + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + ALIGN_3 + +.L15: + movl M, %eax + andl $4, %eax + jle .L16 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd -12 * SIZE(Y), %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd -10 * SIZE(Y), %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L16: + movl M, %eax + andl $2, %eax + jle .L17 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L17: + movl M, %eax + andl $1, %eax + jle .L999 + + movaps -16 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L20: + movl M, %eax + sarl $3, %eax + jle .L25 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + movsd -12 * SIZE(X), %xmm2 + movhps -11 * SIZE(X), %xmm2 + movsd -10 * SIZE(X), %xmm3 + movhps -9 * SIZE(X), %xmm3 + + decl %eax + jle .L22 + ALIGN_3 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movsd -8 * SIZE(X), %xmm0 + movhps -7 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movsd -6 * SIZE(X), %xmm1 + movhps -5 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd -12 * SIZE(Y), %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movsd -4 * SIZE(X), %xmm2 + movhps -3 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd -10 * SIZE(Y), %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movsd -2 * SIZE(X), %xmm3 + movhps -1 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd -8 * SIZE(Y), %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd -6 * SIZE(Y), %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -6 * SIZE(Y) + movsd 2 * SIZE(X), %xmm1 + movhps 3 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd -4 * SIZE(Y), %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movsd 4 * SIZE(X), %xmm2 + movhps 5 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd -2 * SIZE(Y), %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -2 * SIZE(Y) + movsd 6 * SIZE(X), %xmm3 + movhps 7 * SIZE(X), %xmm3 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + decl %eax + jg .L21 + ALIGN_3 + +.L22: + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movsd -8 * SIZE(X), %xmm0 + movhps -7 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movsd -6 * SIZE(X), %xmm1 + movhps -5 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd -12 * SIZE(Y), %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movsd -4 * SIZE(X), %xmm2 + movhps -3 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd -10 * SIZE(Y), %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movsd -2 * SIZE(X), %xmm3 + movhps -1 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd -8 * SIZE(Y), %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd -6 * SIZE(Y), %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -6 * SIZE(Y) + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd -4 * SIZE(Y), %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd -2 * SIZE(Y), %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -2 * SIZE(Y) + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + ALIGN_3 + +.L25: + movl M, %eax + andl $4, %eax + jle .L26 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movsd -12 * SIZE(X), %xmm2 + movhps -11 * SIZE(X), %xmm2 + movsd -10 * SIZE(X), %xmm3 + movhps -9 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd -12 * SIZE(Y), %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd -10 * SIZE(Y), %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L26: + movl M, %eax + andl $2, %eax + jle .L27 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L27: + movl M, %eax + andl $1, %eax + jle .L999 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L30: + testl $SIZE, X + jne .L40 + + movaps -16 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + xorps %xmm0, %xmm0 + SHUFPD_1 %xmm1, %xmm0 + + xorps %xmm4, %xmm4 + movhps -16 * SIZE(Y), %xmm4 + + addpd %xmm0, %xmm4 + movhps %xmm4, -16 * SIZE(Y) + movaps %xmm1, %xmm0 + + addl $2 * SIZE, X + addl $1 * SIZE, Y + decl M + jle .L39 + + movl M, %eax + sarl $3, %eax + jle .L35 + + movaps -16 * SIZE(X), %xmm1 + movaps -14 * SIZE(X), %xmm2 + movaps -12 * SIZE(X), %xmm3 + + decl %eax + jle .L32 + ALIGN_3 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -10 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movaps -8 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movaps -6 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movaps -4 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + addpd -8 * SIZE(Y), %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movaps -2 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + addpd -6 * SIZE(Y), %xmm1 + movaps %xmm1, -6 * SIZE(Y) + movaps 0 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + addpd -4 * SIZE(Y), %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movaps 2 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + addpd -2 * SIZE(Y), %xmm3 + movaps %xmm3, -2 * SIZE(Y) + movaps 4 * SIZE(X), %xmm3 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + decl %eax + jg .L31 + ALIGN_3 + +.L32: + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -10 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movaps -8 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movaps -6 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movaps -4 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + addpd -8 * SIZE(Y), %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movaps -2 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + addpd -6 * SIZE(Y), %xmm1 + movaps %xmm1, -6 * SIZE(Y) + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + addpd -4 * SIZE(Y), %xmm2 + movaps %xmm2, -4 * SIZE(Y) + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + addpd -2 * SIZE(Y), %xmm3 + movaps %xmm3, -2 * SIZE(Y) + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + ALIGN_3 + +.L35: + movl M, %eax + andl $4, %eax + jle .L36 + + movaps -16 * SIZE(X), %xmm1 + movaps -14 * SIZE(X), %xmm2 + movaps -12 * SIZE(X), %xmm3 + movaps -10 * SIZE(X), %xmm4 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + pshufd $0x4e, %xmm4, %xmm5 + mulpd ALPHA_R, %xmm4 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm4 + SHUFPD_1 %xmm4, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L36: + movl M, %eax + andl $2, %eax + jle .L37 + + movaps -16 * SIZE(X), %xmm1 + movaps -14 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movaps %xmm2, %xmm0 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L37: + movl M, %eax + andl $1, %eax + jle .L39 + + movaps -16 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, %xmm0 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L39: + SHUFPD_1 %xmm0, %xmm0 + + addsd -16 * SIZE(Y), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L40: + movsd -16 * SIZE(X), %xmm1 + movhps -15 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + xorps %xmm0, %xmm0 + SHUFPD_1 %xmm1, %xmm0 + + xorps %xmm4, %xmm4 + movhps -16 * SIZE(Y), %xmm4 + + addpd %xmm0, %xmm4 + movhps %xmm4, -16 * SIZE(Y) + movaps %xmm1, %xmm0 + + addl $2 * SIZE, X + addl $1 * SIZE, Y + decl M + jle .L49 + + movl M, %eax + sarl $3, %eax + jle .L45 + + movsd -16 * SIZE(X), %xmm1 + movhps -15 * SIZE(X), %xmm1 + movsd -14 * SIZE(X), %xmm2 + movhps -13 * SIZE(X), %xmm2 + movsd -12 * SIZE(X), %xmm3 + movhps -11 * SIZE(X), %xmm3 + + decl %eax + jle .L42 + ALIGN_3 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movsd -10 * SIZE(X), %xmm0 + movhps -9 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movsd -8 * SIZE(X), %xmm1 + movhps -7 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movsd -6 * SIZE(X), %xmm2 + movhps -5 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movsd -4 * SIZE(X), %xmm3 + movhps -3 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + addpd -8 * SIZE(Y), %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movsd -2 * SIZE(X), %xmm0 + movhps -1 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + addpd -6 * SIZE(Y), %xmm1 + movaps %xmm1, -6 * SIZE(Y) + movsd 0 * SIZE(X), %xmm1 + movhps 1 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + addpd -4 * SIZE(Y), %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movsd 2 * SIZE(X), %xmm2 + movhps 3 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + addpd -2 * SIZE(Y), %xmm3 + movaps %xmm3, -2 * SIZE(Y) + movsd 4 * SIZE(X), %xmm3 + movhps 5 * SIZE(X), %xmm3 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + decl %eax + jg .L41 + ALIGN_3 + +.L42: + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movsd -10 * SIZE(X), %xmm0 + movhps -9 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movsd -8 * SIZE(X), %xmm1 + movhps -7 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movsd -6 * SIZE(X), %xmm2 + movhps -5 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movsd -4 * SIZE(X), %xmm3 + movhps -3 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + addpd -8 * SIZE(Y), %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movsd -2 * SIZE(X), %xmm0 + movhps -1 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + addpd -6 * SIZE(Y), %xmm1 + movaps %xmm1, -6 * SIZE(Y) + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + addpd -4 * SIZE(Y), %xmm2 + movaps %xmm2, -4 * SIZE(Y) + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + addpd -2 * SIZE(Y), %xmm3 + movaps %xmm3, -2 * SIZE(Y) + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + ALIGN_3 + +.L45: + movl M, %eax + andl $4, %eax + jle .L46 + + movsd -16 * SIZE(X), %xmm1 + movhps -15 * SIZE(X), %xmm1 + movsd -14 * SIZE(X), %xmm2 + movhps -13 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movsd -12 * SIZE(X), %xmm3 + movhps -11 * SIZE(X), %xmm3 + movsd -10 * SIZE(X), %xmm4 + movhps -9 * SIZE(X), %xmm4 + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + pshufd $0x4e, %xmm4, %xmm5 + mulpd ALPHA_R, %xmm4 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm4 + SHUFPD_1 %xmm4, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L46: + movl M, %eax + andl $2, %eax + jle .L47 + + movsd -16 * SIZE(X), %xmm1 + movhps -15 * SIZE(X), %xmm1 + movsd -14 * SIZE(X), %xmm2 + movhps -13 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movaps %xmm2, %xmm0 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L47: + movl M, %eax + andl $1, %eax + jle .L49 + + movsd -16 * SIZE(X), %xmm1 + movhps -15 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, %xmm0 + addl $2 * SIZE, Y + ALIGN_3 + +.L49: + SHUFPD_1 %xmm0, %xmm0 + + addsd -16 * SIZE(Y), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L50: +#ifndef CONJ + movaps %xmm0, ALPHA_R + + pxor ALPHA_I, ALPHA_I + subsd %xmm1, ALPHA_I + + unpcklpd ALPHA_R, ALPHA_I + unpcklpd %xmm1, ALPHA_R +#else + movaps %xmm0, ALPHA_R + movaps %xmm1, ALPHA_I + + pxor %xmm5, %xmm5 + subsd %xmm0, %xmm5 + + unpcklpd %xmm5, ALPHA_I + unpcklpd %xmm1, ALPHA_R +#endif + + movl Y, YY + movl M, %eax + sarl $2, %eax + jle .L55 + + MOVDDUP( 0 * SIZE, X, %xmm0) + MOVDDUP( 1 * SIZE, X, %xmm1) + addl INCX, X + MOVDDUP( 0 * SIZE, X, %xmm2) + MOVDDUP( 1 * SIZE, X, %xmm3) + addl INCX, X + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + addl INCY, Y + movsd 0 * SIZE(Y), %xmm5 + movhpd 1 * SIZE(Y), %xmm5 + addl INCY, Y + + decl %eax + jle .L52 + ALIGN_3 + +.L51: + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm1 + mulpd ALPHA_I, %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm2, %xmm5 + addpd %xmm1, %xmm4 + addpd %xmm3, %xmm5 + + movlpd %xmm4, 0 * SIZE(YY) + movhpd %xmm4, 1 * SIZE(YY) + addl INCY, YY + movlpd %xmm5, 0 * SIZE(YY) + movhpd %xmm5, 1 * SIZE(YY) + addl INCY, YY + + MOVDDUP( 0 * SIZE, X, %xmm0) + MOVDDUP( 1 * SIZE, X, %xmm1) + addl INCX, X + MOVDDUP( 0 * SIZE, X, %xmm2) + MOVDDUP( 1 * SIZE, X, %xmm3) + addl INCX, X + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + addl INCY, Y + movsd 0 * SIZE(Y), %xmm5 + movhpd 1 * SIZE(Y), %xmm5 + addl INCY, Y + + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm1 + mulpd ALPHA_I, %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm2, %xmm5 + addpd %xmm1, %xmm4 + addpd %xmm3, %xmm5 + + movlpd %xmm4, 0 * SIZE(YY) + movhpd %xmm4, 1 * SIZE(YY) + addl INCY, YY + movlpd %xmm5, 0 * SIZE(YY) + movhpd %xmm5, 1 * SIZE(YY) + addl INCY, YY + + MOVDDUP( 0 * SIZE, X, %xmm0) + MOVDDUP( 1 * SIZE, X, %xmm1) + addl INCX, X + MOVDDUP( 0 * SIZE, X, %xmm2) + MOVDDUP( 1 * SIZE, X, %xmm3) + addl INCX, X + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + addl INCY, Y + movsd 0 * SIZE(Y), %xmm5 + movhpd 1 * SIZE(Y), %xmm5 + addl INCY, Y + + decl %eax + jg .L51 + ALIGN_3 + +.L52: + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm1 + mulpd ALPHA_I, %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm2, %xmm5 + addpd %xmm1, %xmm4 + addpd %xmm3, %xmm5 + + movlpd %xmm4, 0 * SIZE(YY) + movhpd %xmm4, 1 * SIZE(YY) + addl INCY, YY + movlpd %xmm5, 0 * SIZE(YY) + movhpd %xmm5, 1 * SIZE(YY) + addl INCY, YY + + MOVDDUP( 0 * SIZE, X, %xmm0) + MOVDDUP( 1 * SIZE, X, %xmm1) + addl INCX, X + MOVDDUP( 0 * SIZE, X, %xmm2) + MOVDDUP( 1 * SIZE, X, %xmm3) + addl INCX, X + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + addl INCY, Y + movsd 0 * SIZE(Y), %xmm5 + movhpd 1 * SIZE(Y), %xmm5 + addl INCY, Y + + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm1 + mulpd ALPHA_I, %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm2, %xmm5 + addpd %xmm1, %xmm4 + addpd %xmm3, %xmm5 + + movlpd %xmm4, 0 * SIZE(YY) + movhpd %xmm4, 1 * SIZE(YY) + addl INCY, YY + movlpd %xmm5, 0 * SIZE(YY) + movhpd %xmm5, 1 * SIZE(YY) + addl INCY, YY + ALIGN_3 + +.L55: + movl M, %eax + andl $2, %eax + jle .L57 + + MOVDDUP( 0 * SIZE, X, %xmm0) + MOVDDUP( 1 * SIZE, X, %xmm1) + addl INCX, X + MOVDDUP( 0 * SIZE, X, %xmm2) + MOVDDUP( 1 * SIZE, X, %xmm3) + addl INCX, X + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + addl INCY, Y + movsd 0 * SIZE(Y), %xmm5 + movhpd 1 * SIZE(Y), %xmm5 + addl INCY, Y + + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm1 + mulpd ALPHA_I, %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm2, %xmm5 + addpd %xmm1, %xmm4 + addpd %xmm3, %xmm5 + + movlpd %xmm4, 0 * SIZE(YY) + movhpd %xmm4, 1 * SIZE(YY) + addl INCY, YY + movlpd %xmm5, 0 * SIZE(YY) + movhpd %xmm5, 1 * SIZE(YY) + addl INCY, YY + ALIGN_3 + +.L57: + movl M, %eax + andl $1, %eax + jle .L999 + + MOVDDUP( 0 * SIZE, X, %xmm0) + MOVDDUP( 1 * SIZE, X, %xmm1) + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm1 + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm4 + + movlpd %xmm4, 0 * SIZE(YY) + movhpd %xmm4, 1 * SIZE(YY) + ALIGN_3 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zcopy.S b/kernel/x86/zcopy.S new file mode 100644 index 0000000..153853e --- /dev/null +++ b/kernel/x86/zcopy.S @@ -0,0 +1,250 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define M 4 + STACK + ARGS(%esp) +#define X 8 + STACK + ARGS(%esp) +#define INCX 12 + STACK + ARGS(%esp) +#define Y 16 + STACK + ARGS(%esp) +#define INCY 20 + STACK + ARGS(%esp) + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl M, %ebx + movl X, %ecx + movl INCX, %esi + movl Y, %edx + movl INCY, %edi + + testl %ebx, %ebx # if m == 0 goto End + jle .L999 + + sall $ZBASE_SHIFT, %esi + sall $ZBASE_SHIFT, %edi + + cmpl $2 * SIZE, %esi # if incx != 1 + jne .L100 + cmpl $2 * SIZE, %edi # if incy != 1 + jne .L100 + + movl %ebx, %eax # i = m + sarl $2, %eax + jle .L20 + ALIGN_2 + +.L11: +#if defined(DOUBLE) || defined(XDOUBLE) + FLD 7 * SIZE(%ecx) + FLD 6 * SIZE(%ecx) + FLD 5 * SIZE(%ecx) + FLD 4 * SIZE(%ecx) + FLD 3 * SIZE(%ecx) + FLD 2 * SIZE(%ecx) + FLD 1 * SIZE(%ecx) + FLD 0 * SIZE(%ecx) + + FST 0 * SIZE(%edx) + FST 1 * SIZE(%edx) + FST 2 * SIZE(%edx) + FST 3 * SIZE(%edx) + FST 4 * SIZE(%edx) + FST 5 * SIZE(%edx) + FST 6 * SIZE(%edx) + FST 7 * SIZE(%edx) +#else + fldl 6 * SIZE(%ecx) + fldl 4 * SIZE(%ecx) + fldl 2 * SIZE(%ecx) + fldl 0 * SIZE(%ecx) + + fstpl 0 * SIZE(%edx) + fstpl 2 * SIZE(%edx) + fstpl 4 * SIZE(%edx) + fstpl 6 * SIZE(%edx) +#endif + + addl $8 * SIZE, %ecx + addl $8 * SIZE, %edx + decl %eax + jg .L11 + ALIGN_2 + +.L20: + movl %ebx, %eax # i = m + andl $3, %eax + jle .L99 + ALIGN_2 + +.L21: +#if defined(DOUBLE) || defined(XDOUBLE) + FLD 1 * SIZE(%ecx) + FLD 0 * SIZE(%ecx) + FST 0 * SIZE(%edx) + FST 1 * SIZE(%edx) +#else + fldl 0 * SIZE(%ecx) + fstpl 0 * SIZE(%edx) +#endif + + addl $2 * SIZE, %ecx + addl $2 * SIZE, %edx + decl %eax + jg .L21 + +.L99: + xorl %eax,%eax + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L100: + movl %ebx, %eax + sarl $2, %eax + jle .L120 + ALIGN_2 + +.L111: +#if defined(DOUBLE) || defined(XDOUBLE) + FLD 0 * SIZE(%ecx) + FLD 1 * SIZE(%ecx) + addl %esi, %ecx + + FLD 0 * SIZE(%ecx) + FLD 1 * SIZE(%ecx) + addl %esi, %ecx + + FLD 0 * SIZE(%ecx) + FLD 1 * SIZE(%ecx) + addl %esi, %ecx + + FLD 0 * SIZE(%ecx) + FLD 1 * SIZE(%ecx) + addl %esi, %ecx + + fxch %st(7) + FST 0 * SIZE(%edx) + fxch %st(5) + FST 1 * SIZE(%edx) + addl %edi, %edx + + fxch %st(3) + FST 0 * SIZE(%edx) + fxch %st(1) + FST 1 * SIZE(%edx) + addl %edi, %edx + + FST 0 * SIZE(%edx) + FST 1 * SIZE(%edx) + addl %edi, %edx + + FST 0 * SIZE(%edx) + FST 1 * SIZE(%edx) + addl %edi, %edx +#else + fldl 0 * SIZE(%ecx) + addl %esi, %ecx + fldl 0 * SIZE(%ecx) + addl %esi, %ecx + fldl 0 * SIZE(%ecx) + addl %esi, %ecx + fldl 0 * SIZE(%ecx) + addl %esi, %ecx + + fxch %st(3) + fstpl 0 * SIZE(%edx) + addl %edi, %edx + + fxch %st(1) + fstpl 0 * SIZE(%edx) + addl %edi, %edx + + fstpl 0 * SIZE(%edx) + addl %edi, %edx + + fstpl 0 * SIZE(%edx) + addl %edi, %edx +#endif + + decl %eax + jg .L111 + +.L120: + movl %ebx, %eax + andl $3, %eax + jle .L999 + ALIGN_2 + +.L121: + FLD 0 * SIZE(%ecx) + FLD 1 * SIZE(%ecx) + addl %esi, %ecx + + fxch %st(1) + + FST 0 * SIZE(%edx) + FST 1 * SIZE(%edx) + addl %edi, %edx + + decl %eax + jg .L121 + +.L999: + xorl %eax,%eax + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/zcopy_sse.S b/kernel/x86/zcopy_sse.S new file mode 100644 index 0000000..8393005 --- /dev/null +++ b/kernel/x86/zcopy_sse.S @@ -0,0 +1,994 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) + +#define M %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#include "l1param.h" + +#ifdef OPTERON +#define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addpd OFFSET(ADDR), REG +#else +#define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG +#endif + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + + cmpl $2 * SIZE, INCX + jne .L100 + cmpl $2 * SIZE, INCY + jne .L100 + + cmpl $3, M + jle .L106 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + addl M, M + + testl $SIZE, Y + je .L05 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl M + ALIGN_4 + +.L05: + testl $2 * SIZE, Y + je .L10 + + movsd -32 * SIZE(X), %xmm0 + movlps %xmm0, -32 * SIZE(Y) + addl $2 * SIZE, X + addl $2 * SIZE, Y + subl $2, M + jle .L19 + ALIGN_4 + +.L10: + testl $3 * SIZE, X + jne .L20 + + movl M, %eax + sarl $5, %eax + jle .L13 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + movaps -16 * SIZE(X), %xmm4 + movaps -12 * SIZE(X), %xmm5 + movaps -8 * SIZE(X), %xmm6 + movaps -4 * SIZE(X), %xmm7 + + decl %eax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps %xmm0, -32 * SIZE(Y) + LOAD( 0 * SIZE, X, %xmm0) + movaps %xmm1, -28 * SIZE(Y) + LOAD( 4 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm2, -24 * SIZE(Y) + LOAD( 8 * SIZE, X, %xmm2) + movaps %xmm3, -20 * SIZE(Y) + LOAD(12 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps %xmm4,-16 * SIZE(Y) + LOAD(16 * SIZE, X, %xmm4) + movaps %xmm5,-12 * SIZE(Y) + LOAD(20 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm6, -8 * SIZE(Y) + LOAD(24 * SIZE, X, %xmm6) + movaps %xmm7, -4 * SIZE(Y) + LOAD(28 * SIZE, X, %xmm7) + + subl $-32 * SIZE, Y + subl $-32 * SIZE, X + decl %eax + jg .L11 + ALIGN_3 + +.L12: + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + movaps %xmm4, -16 * SIZE(Y) + movaps %xmm5, -12 * SIZE(Y) + movaps %xmm6, -8 * SIZE(Y) + movaps %xmm7, -4 * SIZE(Y) + + subl $-32 * SIZE, Y + subl $-32 * SIZE, X + ALIGN_3 + +.L13: + testl $16, M + jle .L14 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L14: + testl $8, M + jle .L15 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L15: + testl $4, M + jle .L16 + + movaps -32 * SIZE(X), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L16: + testl $2, M + jle .L17 + + movsd -32 * SIZE(X), %xmm0 + movlps %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L17: + testl $1, M + jle .L19 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + ALIGN_3 + +.L19: + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L20: + testl $SIZE, X + jne .L30 + + movhps -32 * SIZE(X), %xmm0 + + movl M, %eax + sarl $5, %eax + jle .L23 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + movaps -22 * SIZE(X), %xmm3 + movaps -18 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -10 * SIZE(X), %xmm6 + movaps -6 * SIZE(X), %xmm7 + + decl %eax + jle .L22 + ALIGN_4 + +.L21: + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -2 * SIZE(X), %xmm0 + + shufps $0x4e, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps 2 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + shufps $0x4e, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps 6 * SIZE(X), %xmm2 + + shufps $0x4e, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps 10 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + shufps $0x4e, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + movaps 14 * SIZE(X), %xmm4 + + shufps $0x4e, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + movaps 18 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + shufps $0x4e, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + movaps 22 * SIZE(X), %xmm6 + + shufps $0x4e, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + movaps 26 * SIZE(X), %xmm7 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L21 + ALIGN_3 + +.L22: + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -2 * SIZE(X), %xmm0 + + shufps $0x4e, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + shufps $0x4e, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + shufps $0x4e, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + shufps $0x4e, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + shufps $0x4e, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + shufps $0x4e, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + shufps $0x4e, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L23: + testl $16, M + jle .L24 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + movaps -22 * SIZE(X), %xmm3 + movaps -18 * SIZE(X), %xmm4 + + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + shufps $0x4e, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + shufps $0x4e, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + shufps $0x4e, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L24: + testl $8, M + jle .L25 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + + shufps $0x4e, %xmm1, %xmm0 + shufps $0x4e, %xmm2, %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L25: + testl $4, M + jle .L26 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L26: + testl $2, M + jle .L27 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + + movsd %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L27: + testl $1, M + jle .L29 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addl $SIZE, Y + ALIGN_3 + +.L29: + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L30: + testl $2 * SIZE, X + jne .L40 + + movaps -33 * SIZE(X), %xmm0 + + movl M, %eax + sarl $5, %eax + jle .L33 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + movaps -21 * SIZE(X), %xmm3 + movaps -17 * SIZE(X), %xmm4 + movaps -13 * SIZE(X), %xmm5 + movaps -9 * SIZE(X), %xmm6 + movaps -5 * SIZE(X), %xmm7 + + decl %eax + jle .L32 + ALIGN_4 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -1 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps 3 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps 7 * SIZE(X), %xmm2 + + movss %xmm4, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps 11 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + movaps 15 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + movaps 19 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + shufps $0x39, %xmm6, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + movaps 23 * SIZE(X), %xmm6 + + movss %xmm0, %xmm7 + shufps $0x39, %xmm7, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + movaps 27 * SIZE(X), %xmm7 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L31 + ALIGN_3 + +.L32: + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -1 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movss %xmm7, %xmm6 + shufps $0x39, %xmm6, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + shufps $0x39, %xmm7, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L33: + testl $16, M + jle .L34 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + movaps -21 * SIZE(X), %xmm3 + movaps -17 * SIZE(X), %xmm4 + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L34: + testl $8, M + jle .L35 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L35: + testl $4, M + jle .L36 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L36: + testl $2, M + jle .L37 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L37: + testl $1, M + jle .L39 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addl $SIZE, Y + ALIGN_3 + +.L39: + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L40: + movaps -35 * SIZE(X), %xmm0 + + movl M, %eax + sarl $5, %eax + jle .L43 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + movaps -23 * SIZE(X), %xmm3 + movaps -19 * SIZE(X), %xmm4 + movaps -15 * SIZE(X), %xmm5 + movaps -11 * SIZE(X), %xmm6 + movaps -7 * SIZE(X), %xmm7 + + decl %eax + jle .L42 + ALIGN_4 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -3 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps 1 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps 5 * SIZE(X), %xmm2 + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps 9 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + movaps 13 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + movaps 17 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + movaps 21 * SIZE(X), %xmm6 + + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + movaps 25 * SIZE(X), %xmm7 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L41 + ALIGN_3 + +.L42: + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -3 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L43: + testl $16, M + jle .L44 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + movaps -23 * SIZE(X), %xmm3 + movaps -19 * SIZE(X), %xmm4 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L44: + testl $8, M + jle .L45 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L45: + testl $4, M + jle .L46 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L46: + testl $2, M + jle .L47 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L47: + testl $1, M + jle .L49 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addl $SIZE, Y + ALIGN_3 + +.L49: + popl %ebx + popl %esi + popl %edi + ret + ALIGN_4 + +.L100: + movl M, %eax + sarl $3, %eax + jle .L105 + ALIGN_3 + +.L102: + movsd (X), %xmm0 + addl INCX, X + movhps (X), %xmm0 + addl INCX, X + movsd (X), %xmm1 + addl INCX, X + movhps (X), %xmm1 + addl INCX, X + movsd (X), %xmm2 + addl INCX, X + movhps (X), %xmm2 + addl INCX, X + movsd (X), %xmm3 + addl INCX, X + movhps (X), %xmm3 + addl INCX, X + + movsd %xmm0, (Y) + addl INCY, Y + movhps %xmm0, (Y) + addl INCY, Y + movsd %xmm1, (Y) + addl INCY, Y + movhps %xmm1, (Y) + addl INCY, Y + movsd %xmm2, (Y) + addl INCY, Y + movhps %xmm2, (Y) + addl INCY, Y + movsd %xmm3, (Y) + addl INCY, Y + movhps %xmm3, (Y) + addl INCY, Y + + decl %eax + jg .L102 + ALIGN_3 + +.L105: + testl $4, M + jle .L106 + + movsd (X), %xmm0 + addl INCX, X + movhps (X), %xmm0 + addl INCX, X + movsd (X), %xmm1 + addl INCX, X + movhps (X), %xmm1 + addl INCX, X + + movsd %xmm0, (Y) + addl INCY, Y + movhps %xmm0, (Y) + addl INCY, Y + movsd %xmm1, (Y) + addl INCY, Y + movhps %xmm1, (Y) + addl INCY, Y + ALIGN_3 + +.L106: + testl $2, M + jle .L107 + + movsd (X), %xmm0 + addl INCX, X + movhps (X), %xmm0 + addl INCX, X + + movsd %xmm0, (Y) + addl INCY, Y + movhps %xmm0, (Y) + addl INCY, Y + ALIGN_3 + +.L107: + testl $1, M + jle .L999 + + movsd (X), %xmm0 + movsd %xmm0, (Y) + ALIGN_3 + +.L999: + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/zcopy_sse2.S b/kernel/x86/zcopy_sse2.S new file mode 100644 index 0000000..f936a34 --- /dev/null +++ b/kernel/x86/zcopy_sse2.S @@ -0,0 +1,668 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) + +#define M %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#define xmm8 xmm0 +#define xmm9 xmm1 +#define xmm10 xmm2 +#define xmm11 xmm3 +#define xmm12 xmm4 +#define xmm13 xmm5 +#define xmm14 xmm6 +#define xmm15 xmm7 + +#include "l1param.h" + +#ifdef OPTERON +#define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addpd OFFSET(ADDR), REG +#else +#define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG +#endif + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + + cmpl $2 * SIZE, INCX + jne .L50 + cmpl $2 * SIZE, INCY + jne .L50 + + addl M, M + +#ifdef ALIGNED_ACCESS + testl $SIZE, Y +#else + testl $SIZE, X +#endif + je .L10 + + movsd (X), %xmm0 + movsd %xmm0, (Y) + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl M + jle .L19 + ALIGN_4 + +.L10: + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + +#ifdef ALIGNED_ACCESS + testl $SIZE, X +#else + testl $SIZE, Y +#endif + jne .L20 + + movl M, %eax + sarl $4, %eax + jle .L13 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + movaps -8 * SIZE(X), %xmm4 + movaps -6 * SIZE(X), %xmm5 + movaps -4 * SIZE(X), %xmm6 + movaps -2 * SIZE(X), %xmm7 + + decl %eax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps %xmm0, -16 * SIZE(Y) + LOAD( 0 * SIZE, X, %xmm0) + movaps %xmm1, -14 * SIZE(Y) + LOAD( 2 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm2, -12 * SIZE(Y) + LOAD( 4 * SIZE, X, %xmm2) + movaps %xmm3, -10 * SIZE(Y) + LOAD( 6 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps %xmm4, -8 * SIZE(Y) + LOAD( 8 * SIZE, X, %xmm4) + movaps %xmm5, -6 * SIZE(Y) + LOAD(10 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm6, -4 * SIZE(Y) + LOAD(12 * SIZE, X, %xmm6) + movaps %xmm7, -2 * SIZE(Y) + LOAD(14 * SIZE, X, %xmm7) + + subl $-16 * SIZE, Y + subl $-16 * SIZE, X + decl %eax + jg .L11 + ALIGN_3 + +.L12: + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, -12 * SIZE(Y) + movaps %xmm3, -10 * SIZE(Y) + movaps %xmm4, -8 * SIZE(Y) + movaps %xmm5, -6 * SIZE(Y) + movaps %xmm6, -4 * SIZE(Y) + movaps %xmm7, -2 * SIZE(Y) + + subl $-16 * SIZE, Y + subl $-16 * SIZE, X + ALIGN_3 + +.L13: + testl $8, M + jle .L14 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, -12 * SIZE(Y) + movaps %xmm3, -10 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L14: + testl $4, M + jle .L15 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L15: + testl $2, M + jle .L16 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L16: + testl $1, M + jle .L19 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L19: + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L20: +#ifdef ALIGNED_ACCESS + + movhps -16 * SIZE(X), %xmm0 + + movl M, %eax + sarl $4, %eax + jle .L23 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + movaps -11 * SIZE(X), %xmm3 + movaps -9 * SIZE(X), %xmm4 + movaps -7 * SIZE(X), %xmm5 + movaps -5 * SIZE(X), %xmm6 + movaps -3 * SIZE(X), %xmm7 + + decl %eax + jle .L22 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm1, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + LOAD(-1 * SIZE, X, %xmm0) + + SHUFPD_1 %xmm2, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + LOAD( 1 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm3, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + LOAD( 3 * SIZE, X, %xmm2) + + SHUFPD_1 %xmm4, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + LOAD( 5 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm5, %xmm4 + movaps %xmm4, -8 * SIZE(Y) + LOAD( 7 * SIZE, X, %xmm4) + + SHUFPD_1 %xmm6, %xmm5 + movaps %xmm5, -6 * SIZE(Y) + LOAD( 9 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm7, %xmm6 + movaps %xmm6, -4 * SIZE(Y) + LOAD(11 * SIZE, X, %xmm6) + + SHUFPD_1 %xmm0, %xmm7 + movaps %xmm7, -2 * SIZE(Y) + LOAD(13 * SIZE, X, %xmm7) + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + decl %eax + jg .L21 + ALIGN_3 + +.L22: + SHUFPD_1 %xmm1, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + LOAD(-1 * SIZE, X, %xmm0) + + SHUFPD_1 %xmm2, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + SHUFPD_1 %xmm3, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + SHUFPD_1 %xmm4, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + SHUFPD_1 %xmm5, %xmm4 + movaps %xmm4, -8 * SIZE(Y) + SHUFPD_1 %xmm6, %xmm5 + movaps %xmm5, -6 * SIZE(Y) + + SHUFPD_1 %xmm7, %xmm6 + movaps %xmm6, -4 * SIZE(Y) + SHUFPD_1 %xmm0, %xmm7 + movaps %xmm7, -2 * SIZE(Y) + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + ALIGN_3 + +.L23: + testl $8, M + jle .L24 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + movaps -11 * SIZE(X), %xmm3 + movaps -9 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm1, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + SHUFPD_1 %xmm2, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + SHUFPD_1 %xmm3, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + SHUFPD_1 %xmm4, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L24: + testl $4, M + jle .L25 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm1, %xmm0 + SHUFPD_1 %xmm2, %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, %xmm0 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L25: + testl $2, M + jle .L26 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L26: + testl $1, M + jle .L29 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L29: + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +#else + + movl M, %eax + sarl $4, %eax + jle .L23 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + movaps -8 * SIZE(X), %xmm4 + movaps -6 * SIZE(X), %xmm5 + movaps -4 * SIZE(X), %xmm6 + movaps -2 * SIZE(X), %xmm7 + + decl %eax + jle .L22 + ALIGN_3 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + LOAD( 0 * SIZE, X, %xmm0) + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + LOAD( 2 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movlps %xmm2, -12 * SIZE(Y) + movhps %xmm2, -11 * SIZE(Y) + LOAD( 4 * SIZE, X, %xmm2) + movlps %xmm3, -10 * SIZE(Y) + movhps %xmm3, -9 * SIZE(Y) + LOAD( 6 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movlps %xmm4, -8 * SIZE(Y) + movhps %xmm4, -7 * SIZE(Y) + LOAD( 8 * SIZE, X, %xmm4) + movlps %xmm5, -6 * SIZE(Y) + movhps %xmm5, -5 * SIZE(Y) + LOAD(10 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movlps %xmm6, -4 * SIZE(Y) + movhps %xmm6, -3 * SIZE(Y) + LOAD(12 * SIZE, X, %xmm6) + movlps %xmm7, -2 * SIZE(Y) + movhps %xmm7, -1 * SIZE(Y) + LOAD(14 * SIZE, X, %xmm7) + + subl $-16 * SIZE, Y + subl $-16 * SIZE, X + decl %eax + jg .L21 + ALIGN_3 + +.L22: + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + movlps %xmm2, -12 * SIZE(Y) + movhps %xmm2, -11 * SIZE(Y) + movlps %xmm3, -10 * SIZE(Y) + movhps %xmm3, -9 * SIZE(Y) + movlps %xmm4, -8 * SIZE(Y) + movhps %xmm4, -7 * SIZE(Y) + movlps %xmm5, -6 * SIZE(Y) + movhps %xmm5, -5 * SIZE(Y) + movlps %xmm6, -4 * SIZE(Y) + movhps %xmm6, -3 * SIZE(Y) + movlps %xmm7, -2 * SIZE(Y) + movhps %xmm7, -1 * SIZE(Y) + + subl $-16 * SIZE, Y + subl $-16 * SIZE, X + ALIGN_3 + +.L23: + testl $8, M + jle .L24 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + movaps -14 * SIZE(X), %xmm1 + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + movaps -12 * SIZE(X), %xmm2 + movlps %xmm2, -12 * SIZE(Y) + movhps %xmm2, -11 * SIZE(Y) + movaps -10 * SIZE(X), %xmm3 + movlps %xmm3, -10 * SIZE(Y) + movhps %xmm3, -9 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L24: + testl $4, M + jle .L25 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + movaps -14 * SIZE(X), %xmm1 + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L25: + testl $2, M + jle .L26 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L26: + testl $1, M + jle .L29 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L29: + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 +#endif + +.L50: + movl M, %eax + sarl $2, %eax + jle .L55 + ALIGN_3 + +.L51: + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + addl INCX, X + + movsd 0 * SIZE(X), %xmm1 + movhps 1 * SIZE(X), %xmm1 + addl INCX, X + + movsd 0 * SIZE(X), %xmm2 + movhps 1 * SIZE(X), %xmm2 + addl INCX, X + + movsd 0 * SIZE(X), %xmm3 + movhps 1 * SIZE(X), %xmm3 + addl INCX, X + + + movlps %xmm0, 0 * SIZE(Y) + movhps %xmm0, 1 * SIZE(Y) + addl INCY, Y + + movlps %xmm1, 0 * SIZE(Y) + movhps %xmm1, 1 * SIZE(Y) + addl INCY, Y + + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 1 * SIZE(Y) + addl INCY, Y + + movlps %xmm3, 0 * SIZE(Y) + movhps %xmm3, 1 * SIZE(Y) + addl INCY, Y + + decl %eax + jg .L51 + ALIGN_3 + +.L55: + movl M, %eax + andl $3, %eax + jle .L57 + ALIGN_3 + +.L56: + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + addl INCX, X + + movlps %xmm0, 0 * SIZE(Y) + movhps %xmm0, 1 * SIZE(Y) + addl INCY, Y + + decl %eax + jg .L56 + ALIGN_3 + +.L57: + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/zdot.S b/kernel/x86/zdot.S new file mode 100644 index 0000000..aa4481f --- /dev/null +++ b/kernel/x86/zdot.S @@ -0,0 +1,310 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#if defined(DOUBLE) || defined(XDOUBLE) +#define RESULT 4 + STACK + ARGS(%esp) +#define STACK_N 8 + STACK + ARGS(%esp) +#define STACK_X 12 + STACK + ARGS(%esp) +#define STACK_INCX 16 + STACK + ARGS(%esp) +#define STACK_Y 20 + STACK + ARGS(%esp) +#define STACK_INCY 24 + STACK + ARGS(%esp) +#else +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) +#endif + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + +#ifdef F_INTERFACE + movl (N),N + movl (INCX),INCX + movl (INCY),INCY +#endif + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + testl N, N + jle .L88 + + addl INCX, INCX + fldz + addl INCY, INCY + fldz + + leal (, INCX, SIZE), INCX + fldz + leal (, INCY, SIZE), INCY + fldz + + cmpl $2 * SIZE, INCX + jne .L14 + cmpl $2 * SIZE, INCY + jne .L14 + + movl N, %eax + sarl $1, %eax + jle .L15 + ALIGN_3 + +.L16: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(Y) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1), %st + faddp %st, %st(4) + + FMUL 1 * SIZE(Y) + faddp %st, %st(4) + FLD 2 * SIZE(X) + + FLD 2 * SIZE(Y) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 3 * SIZE(Y) + faddp %st, %st(2) + FLD 3 * SIZE(X) + + FLD 2 * SIZE(Y) + fmul %st(1), %st + faddp %st, %st(4) + + FMUL 3 * SIZE(Y) + faddp %st, %st(4) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + decl %eax + jg .L16 + ALIGN_3 + +.L15: + movl N, %eax + andl $1, %eax + jle .L27 + ALIGN_3 + +.L22: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(Y) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1), %st + faddp %st, %st(4) + + FMUL 1 * SIZE(Y) + faddp %st, %st(4) + jmp .L27 + ALIGN_3 + +.L14: + movl N, %eax + sarl $1, %eax + jle .L30 + ALIGN_3 + +.L31: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(Y) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1), %st + faddp %st, %st(4) + + FMUL 1 * SIZE(Y) + faddp %st, %st(4) + addl INCX, X + + FLD 0 * SIZE(X) + addl INCY, Y + + FLD 0 * SIZE(Y) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(Y) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1), %st + faddp %st, %st(4) + + FMUL 1 * SIZE(Y) + faddp %st, %st(4) + addl INCX, X + addl INCY, Y + + decl %eax + jg .L31 + ALIGN_3 + +.L30: + movl N, %eax + andl $1, %eax + jle .L27 + ALIGN_3 + +.L37: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(Y) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1), %st + faddp %st, %st(4) + + FMUL 1 * SIZE(Y) + faddp %st, %st(4) + ALIGN_3 + +.L27: +#if defined(DOUBLE) || defined(XDOUBLE) + movl RESULT, %eax +#endif + +#ifndef CONJ + fsubp %st, %st(3) + faddp %st, %st(1) +#else + faddp %st, %st(3) + fsubp %st, %st(1) +#endif + +#if !defined(DOUBLE) && !defined(XDOUBLE) + subl $2 * SIZE, %esp + FST 1 * SIZE(%esp) + FST 0 * SIZE(%esp) + movl 0 * SIZE(%esp), %eax + movl 1 * SIZE(%esp), %edx + addl $2 * SIZE, %esp +#else + FST 1 * SIZE(%eax) + FST 0 * SIZE(%eax) +#endif + + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L88: +#if defined(DOUBLE) || defined(XDOUBLE) + movl RESULT, %eax +#endif + + fldz + fldz + +#if !defined(DOUBLE) && !defined(XDOUBLE) + xor %eax, %eax + xor %edx, %edx +#else + FST 1 * SIZE(%eax) + FST 0 * SIZE(%eax) +#endif + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/zdot_amd.S b/kernel/x86/zdot_amd.S new file mode 100644 index 0000000..97a1e72 --- /dev/null +++ b/kernel/x86/zdot_amd.S @@ -0,0 +1,377 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#if !defined(DOUBLE) && !defined(XDOUBLE) +#define RESULT 4 + STACK + ARGS(%esp) +#define STACK_N 8 + STACK + ARGS(%esp) +#define STACK_X 12 + STACK + ARGS(%esp) +#define STACK_INCX 16 + STACK + ARGS(%esp) +#define STACK_Y 20 + STACK + ARGS(%esp) +#define STACK_INCY 24 + STACK + ARGS(%esp) +#else +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) +#endif + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + +#if defined(F_INTERFACE) + movl (N),N + movl (INCX),INCX + movl (INCY),INCY +#endif + + testl N, N + jle .L88 + + fldz + fldz + fldz + fldz + + addl INCX, INCX + addl INCY, INCY + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + cmpl $2 * SIZE, INCX + jne .L14 + cmpl $2 * SIZE, INCY + jne .L14 + + movl N, %eax + sarl $2, %eax + jle .L15 + + FLD 0 * SIZE(X) + ALIGN_3 + +.L16: + FLD 0 * SIZE(Y) + PADDING fmul %st(1) + faddp %st, %st(2) + + FMUL 1 * SIZE(Y) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + PADDING fmul %st(1) + faddp %st, %st(4) + + FMUL 1 * SIZE(Y) + faddp %st, %st(4) + FLD 2 * SIZE(X) + + FLD 2 * SIZE(Y) + PADDING fmul %st(1) + faddp %st, %st(2) + + FMUL 3 * SIZE(Y) + faddp %st, %st(2) + FLD 3 * SIZE(X) + + FLD 2 * SIZE(Y) + PADDING fmul %st(1) + faddp %st, %st(4) + + FMUL 3 * SIZE(Y) + faddp %st, %st(4) + FLD 4 * SIZE(X) + + FLD 4 * SIZE(Y) + PADDING fmul %st(1) + faddp %st, %st(2) + + FMUL 5 * SIZE(Y) + faddp %st, %st(2) + FLD 5 * SIZE(X) + + FLD 4 * SIZE(Y) + PADDING fmul %st(1) + faddp %st, %st(4) + + FMUL 5 * SIZE(Y) + faddp %st, %st(4) + FLD 6 * SIZE(X) + + FLD 6 * SIZE(Y) + PADDING fmul %st(1) + faddp %st, %st(2) + + FMUL 7 * SIZE(Y) + faddp %st, %st(2) + FLD 7 * SIZE(X) + + FLD 6 * SIZE(Y) + PADDING fmul %st(1) + faddp %st, %st(4) + + FMUL 7 * SIZE(Y) + faddp %st, %st(4) + FLD 8 * SIZE(X) + + prefetch 32 * SIZE(X) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + decl %eax + jg .L16 + ffreep %st(0) + ALIGN_3 + +.L15: + movl N, %eax + andl $3, %eax + jle .L27 + ALIGN_3 + +.L22: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FMUL 1 * SIZE(Y) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FMUL 1 * SIZE(Y) + faddp %st, %st(4) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + + decl %eax + jg .L22 + jmp .L27 + ALIGN_3 + +.L14: +#ifdef F_INTERFACE + testl INCX, INCX # if (incx < 0) + jge .L28 + + movl N, %eax + decl %eax + imull INCX, %eax + subl %eax, X + ALIGN_3 + +.L28: + testl INCY, INCY # if (incy < 0) + jge .L29 + + movl N, %eax + decl %eax + imull INCY, %eax + subl %eax, Y + ALIGN_3 + +.L29: +#endif + + movl N, %eax + sarl $1, %eax + jle .L30 + ALIGN_3 + + +.L31: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FMUL 1 * SIZE(Y) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FMUL 1 * SIZE(Y) + faddp %st, %st(4) + addl INCX, X + + FLD 0 * SIZE(X) + addl INCY, Y + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FMUL 1 * SIZE(Y) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FMUL 1 * SIZE(Y) + faddp %st, %st(4) + addl INCX, X + addl INCY, Y + + decl %eax + jg .L31 + ALIGN_3 + +.L30: + movl N, %eax + andl $1, %eax + jle .L27 + ALIGN_3 + +.L37: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FMUL 1 * SIZE(Y) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FMUL 1 * SIZE(Y) + faddp %st, %st(4) + ALIGN_3 + +.L27: +#ifndef CONJ + fsubp %st, %st(3) + faddp %st, %st(1) +#else + faddp %st, %st(3) + fsubp %st, %st(1) +#endif + +#if !defined(DOUBLE) && !defined(XDOUBLE) + subl $2 * SIZE, %esp + FST 1 * SIZE(%esp) + FST 0 * SIZE(%esp) + movl 0 * SIZE(%esp), %eax + movl 1 * SIZE(%esp), %edx + addl $2 * SIZE, %esp +#else + movl RESULT, %eax + + FST 1 * SIZE(%eax) + FST 0 * SIZE(%eax) +#endif + + popl %ebx + popl %esi + popl %edi +#if defined(F_INTERFACE) && defined(F_PATHSCALE) + ret $0x4 +#else + ret +#endif + + ALIGN_3 + +.L88: +#if !defined(DOUBLE) && !defined(XDOUBLE) + xor %eax, %eax + xor %edx, %edx +#else + movl RESULT, %eax + + fldz + fldz + + FST 1 * SIZE(%eax) + FST 0 * SIZE(%eax) +#endif + + popl %ebx + popl %esi + popl %edi +#if defined(F_INTERFACE) && defined(F_PATHSCALE) + ret $0x4 +#else + ret +#endif + + EPILOGUE diff --git a/kernel/x86/zdot_sse.S b/kernel/x86/zdot_sse.S new file mode 100644 index 0000000..cc22964 --- /dev/null +++ b/kernel/x86/zdot_sse.S @@ -0,0 +1,3457 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + testl N, N + jle .L999 + + cmpl $2 * SIZE, INCX + jne .L200 + cmpl $2 * SIZE, INCY + jne .L200 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + testl $SIZE, X + jne .L50 + +.L0x: + testl $2 * SIZE, X + je .L10 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 +#ifdef movsd + xorps %xmm6, %xmm6 +#endif + movsd -32 * SIZE(Y), %xmm0 + + PSHUFD2($0xb1, %xmm0, %xmm1) + mulps %xmm4, %xmm0 + mulps %xmm4, %xmm1 + addl $2 * SIZE, X + addl $2 * SIZE, Y + decl N + ALIGN_3 + +.L10: + testl $3 * SIZE, Y + jne .L20 + + movl N, %eax + sarl $4, %eax + jle .L15 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm6 + movaps -28 * SIZE(X), %xmm5 + movaps -28 * SIZE(Y), %xmm7 + + decl %eax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -24 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -24 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -20 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -20 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -16 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -16 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -12 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -12 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -8 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -4 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -4 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps 0 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps 0 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps 4 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps 4 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L11 + ALIGN_3 + +.L12: + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -24 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -24 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -20 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -20 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -16 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -16 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -12 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -12 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -8 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -4 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -4 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L15: + testl $8, N + jle .L16 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm6 + movaps -28 * SIZE(X), %xmm5 + movaps -28 * SIZE(Y), %xmm7 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -24 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -24 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -20 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -20 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L16: + testl $4, N + jle .L17 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm6 + movaps -28 * SIZE(X), %xmm5 + movaps -28 * SIZE(Y), %xmm7 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L17: + testl $2, N + jle .L18 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm6 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L18: + testl $1, N + jle .L98 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 +#ifdef movsd + xorps %xmm6, %xmm6 +#endif + movsd -32 * SIZE(Y), %xmm6 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + jmp .L98 + ALIGN_3 + +.L20: +#ifdef ALIGNED_ACCESS + + testl $2 * SIZE, Y + jne .L30 + + movaps -33 * SIZE(Y), %xmm6 + addl $3 * SIZE, Y + + shufps $0xb1, %xmm1, %xmm1 + + movl N, %eax + sarl $4, %eax + jle .L25 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm7 + + decl %eax + jle .L22 + ALIGN_3 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps -24 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -20 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps -16 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -8 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -12 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -4 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps -8 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -4 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps 4 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps 0 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L21 + ALIGN_3 + +.L22: + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps -24 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -20 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps -16 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -8 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -12 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -4 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps -8 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + movaps -4 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L25: + testl $8, N + jle .L26 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps -24 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + movaps -20 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L26: + testl $4, N + jle .L27 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L27: + testl $2, N + jle .L28 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + + movaps %xmm7, %xmm6 + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L28: + testl $1, N + jle .L29 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 + + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + ALIGN_3 + +.L29: + shufps $0xb1, %xmm1, %xmm1 + jmp .L98 + ALIGN_3 + +.L30: + testl $SIZE, Y + jne .L40 +#endif + + movl N, %eax + sarl $4, %eax + jle .L35 + + movaps -32 * SIZE(X), %xmm4 + movsd -32 * SIZE(Y), %xmm6 + movhps -30 * SIZE(Y), %xmm6 + + movaps -28 * SIZE(X), %xmm5 + movsd -28 * SIZE(Y), %xmm7 + movhps -26 * SIZE(Y), %xmm7 + + decl %eax + jle .L32 + ALIGN_3 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd -24 * SIZE(Y), %xmm6 + movhps -22 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -24 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd -20 * SIZE(Y), %xmm7 + movhps -18 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -20 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd -16 * SIZE(Y), %xmm6 + movhps -14 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -16 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd -12 * SIZE(Y), %xmm7 + movhps -10 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -12 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd -8 * SIZE(Y), %xmm6 + movhps -6 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -8 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd -4 * SIZE(Y), %xmm7 + movhps -2 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -4 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd 0 * SIZE(Y), %xmm6 + movhps 2 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps 0 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd 4 * SIZE(Y), %xmm7 + movhps 6 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps 4 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L31 + ALIGN_3 + +.L32: + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd -24 * SIZE(Y), %xmm6 + movhps -22 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -24 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd -20 * SIZE(Y), %xmm7 + movhps -18 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -20 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd -16 * SIZE(Y), %xmm6 + movhps -14 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -16 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd -12 * SIZE(Y), %xmm7 + movhps -10 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -12 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd -8 * SIZE(Y), %xmm6 + movhps -6 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -8 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd -4 * SIZE(Y), %xmm7 + movhps -2 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -4 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L35: + testl $8, N + jle .L36 + + movaps -32 * SIZE(X), %xmm4 + movsd -32 * SIZE(Y), %xmm6 + movhps -30 * SIZE(Y), %xmm6 + + movaps -28 * SIZE(X), %xmm5 + movsd -28 * SIZE(Y), %xmm7 + movhps -26 * SIZE(Y), %xmm7 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd -24 * SIZE(Y), %xmm6 + movhps -22 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -24 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd -20 * SIZE(Y), %xmm7 + movhps -18 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -20 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L36: + testl $4, N + jle .L37 + + movaps -32 * SIZE(X), %xmm4 + movsd -32 * SIZE(Y), %xmm6 + movhps -30 * SIZE(Y), %xmm6 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + + movaps -28 * SIZE(X), %xmm5 + movsd -28 * SIZE(Y), %xmm7 + movhps -26 * SIZE(Y), %xmm7 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L37: + testl $2, N + jle .L38 + + movaps -32 * SIZE(X), %xmm4 + movsd -32 * SIZE(Y), %xmm6 + movhps -30 * SIZE(Y), %xmm6 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L38: + testl $1, N + jle .L98 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 +#ifdef movsd + xorps %xmm6, %xmm6 +#endif + movsd -32 * SIZE(Y), %xmm6 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + jmp .L98 + ALIGN_3 + +#ifdef ALIGNED_ACCESS +.L40: + movaps -35 * SIZE(Y), %xmm6 + addl $1 * SIZE, Y + + shufps $0xb1, %xmm1, %xmm1 + + movl N, %eax + sarl $4, %eax + jle .L45 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm7 + + decl %eax + jle .L42 + ALIGN_3 + +.L41: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps -24 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -20 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps -16 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -8 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -12 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -4 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps -8 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -4 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps 4 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps 0 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L41 + ALIGN_3 + +.L42: + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps -24 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -20 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps -16 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -8 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -12 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -4 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps -8 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + movaps -4 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L45: + testl $8, N + jle .L46 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps -24 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + movaps -20 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L46: + testl $4, N + jle .L47 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L47: + testl $2, N + jle .L48 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + + movaps %xmm7, %xmm6 + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L48: + testl $1, N + jle .L49 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 + movss -32 * SIZE(Y), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + ALIGN_3 + +.L49: + shufps $0xb1, %xmm1, %xmm1 + jmp .L98 + ALIGN_3 +#endif + +.L50: + testl $SIZE, Y + jne .L70 + +#ifdef ALIGNED_ACCESS + + testl $2 * SIZE, Y + je .L50x + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(X), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(Y), %xmm4 + + PSHUFD2($0xb1, %xmm0, %xmm1) + mulps %xmm4, %xmm0 + mulps %xmm4, %xmm1 + addl $2 * SIZE, X + addl $2 * SIZE, Y + + decl N + ALIGN_3 + +.L50x: + testl $2 * SIZE, X + jne .L60 + + movaps -33 * SIZE(X), %xmm6 + addl $3 * SIZE, X + + shufps $0xb1, %xmm1, %xmm1 + + movl N, %eax + sarl $4, %eax + jle .L55 + + movaps -32 * SIZE(Y), %xmm4 + movaps -28 * SIZE(Y), %xmm5 + movaps -32 * SIZE(X), %xmm7 + + decl %eax + jle .L52 + ALIGN_3 + +.L51: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps -24 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -20 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps -16 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -8 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -12 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -4 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps -8 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -4 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps 4 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps 0 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L51 + ALIGN_3 + +.L52: + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps -24 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -20 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps -16 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -8 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -12 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -4 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps -8 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + movaps -4 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L55: + testl $8, N + jle .L56 + + movaps -32 * SIZE(Y), %xmm4 + movaps -28 * SIZE(Y), %xmm5 + movaps -32 * SIZE(X), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps -24 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + movaps -20 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L56: + testl $4, N + jle .L57 + + movaps -32 * SIZE(Y), %xmm4 + movaps -28 * SIZE(Y), %xmm5 + movaps -32 * SIZE(X), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L57: + testl $2, N + jle .L58 + + movaps -32 * SIZE(Y), %xmm4 + movaps -32 * SIZE(X), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + + movaps %xmm7, %xmm6 + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L58: + testl $1, N + jle .L98 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(Y), %xmm4 + + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + jmp .L98 + ALIGN_3 + +.L60: + movaps -35 * SIZE(X), %xmm6 + addl $1 * SIZE, X + + shufps $0xb1, %xmm1, %xmm1 + + movl N, %eax + sarl $4, %eax + jle .L65 + + movaps -32 * SIZE(Y), %xmm4 + movaps -28 * SIZE(Y), %xmm5 + movaps -32 * SIZE(X), %xmm7 + + decl %eax + jle .L62 + ALIGN_3 + +.L61: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps -24 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -20 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps -16 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -8 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -12 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -4 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps -8 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -4 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps 4 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps 0 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L61 + ALIGN_3 + +.L62: + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps -24 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -20 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps -16 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -8 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -12 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -4 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps -8 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + movaps -4 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L65: + testl $8, N + jle .L66 + + movaps -32 * SIZE(Y), %xmm4 + movaps -28 * SIZE(Y), %xmm5 + movaps -32 * SIZE(X), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps -24 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + movaps -20 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L66: + testl $4, N + jle .L67 + + movaps -32 * SIZE(Y), %xmm4 + movaps -28 * SIZE(Y), %xmm5 + movaps -32 * SIZE(X), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L67: + testl $2, N + jle .L68 + + movaps -32 * SIZE(Y), %xmm4 + movaps -32 * SIZE(X), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + + movaps %xmm7, %xmm6 + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L68: + testl $1, N + jle .L98 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(Y), %xmm4 + movss -32 * SIZE(X), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + jmp .L98 + ALIGN_3 + +#else + + testl $2 * SIZE, Y + je .L50x + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(Y), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 + + PSHUFD2($0xb1, %xmm0, %xmm1) + mulps %xmm4, %xmm0 + mulps %xmm4, %xmm1 + addl $2 * SIZE, X + addl $2 * SIZE, Y + + decl N + ALIGN_3 + +.L50x: + movl N, %eax + sarl $4, %eax + jle .L55 + + movaps -32 * SIZE(Y), %xmm4 + movlps -32 * SIZE(X), %xmm6 + movhps -30 * SIZE(X), %xmm6 + movaps -28 * SIZE(Y), %xmm5 + movlps -28 * SIZE(X), %xmm7 + movhps -26 * SIZE(X), %xmm7 + + decl %eax + jle .L52 + ALIGN_3 + +.L51: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + PSHUFD2($0xb1, %xmm4, %xmm3) + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movlps -24 * SIZE(X), %xmm6 + movhps -22 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm5, %xmm3) + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movlps -20 * SIZE(X), %xmm7 + movhps -18 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm4, %xmm3) + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movlps -16 * SIZE(X), %xmm6 + movhps -14 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm5, %xmm3) + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movlps -12 * SIZE(X), %xmm7 + movhps -10 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm4, %xmm3) + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -8 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movlps -8 * SIZE(X), %xmm6 + movhps -6 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm5, %xmm3) + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movlps -20 * SIZE(X), %xmm7 + movhps -18 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm4, %xmm3) + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movlps 0 * SIZE(X), %xmm6 + movhps 2 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm5, %xmm3) + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps 4 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movlps 4 * SIZE(X), %xmm7 + movhps 6 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L51 + ALIGN_3 + +.L52: + PSHUFD2($0xb1, %xmm4, %xmm3) + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movlps -24 * SIZE(X), %xmm6 + movhps -22 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm5, %xmm3) + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movlps -20 * SIZE(X), %xmm7 + movhps -18 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm4, %xmm3) + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movlps -16 * SIZE(X), %xmm6 + movhps -14 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm5, %xmm3) + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movlps -12 * SIZE(X), %xmm7 + movhps -10 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm4, %xmm3) + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -8 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movlps -8 * SIZE(X), %xmm6 + movhps -6 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm5, %xmm3) + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movlps -20 * SIZE(X), %xmm7 + movhps -18 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm4, %xmm3) + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm5, %xmm3) + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L55: + testl $8, N + jle .L56 + + movaps -32 * SIZE(Y), %xmm4 + movlps -32 * SIZE(X), %xmm6 + movhps -30 * SIZE(X), %xmm6 + + movaps -28 * SIZE(Y), %xmm5 + movlps -28 * SIZE(X), %xmm7 + movhps -26 * SIZE(X), %xmm7 + + PSHUFD2($0xb1, %xmm4, %xmm3) + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movlps -24 * SIZE(X), %xmm6 + movhps -22 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm5, %xmm3) + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movlps -20 * SIZE(X), %xmm7 + movhps -18 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm4, %xmm3) + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm5, %xmm3) + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L56: + testl $4, N + jle .L57 + + movaps -32 * SIZE(Y), %xmm4 + movlps -32 * SIZE(X), %xmm6 + movhps -30 * SIZE(X), %xmm6 + + PSHUFD2($0xb1, %xmm4, %xmm3) + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + + movaps -28 * SIZE(Y), %xmm5 + movlps -28 * SIZE(X), %xmm7 + movhps -26 * SIZE(X), %xmm7 + + PSHUFD2($0xb1, %xmm5, %xmm3) + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L57: + testl $2, N + jle .L58 + + movaps -32 * SIZE(Y), %xmm4 + movlps -32 * SIZE(X), %xmm6 + movhps -30 * SIZE(X), %xmm6 + + PSHUFD2($0xb1, %xmm4, %xmm3) + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L58: + testl $1, N + jle .L98 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(Y), %xmm4 +#ifdef movsd + xorps %xmm6, %xmm6 +#endif + movsd -32 * SIZE(X), %xmm6 + + PSHUFD2($0xb1, %xmm4, %xmm3) + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + jmp .L98 + ALIGN_3 +#endif + +.L70: + testl $2 * SIZE, Y + je .L70x + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 + addl $2 * SIZE, X +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(Y), %xmm1 + addl $2 * SIZE, Y + + PSHUFD2($0xb1, %xmm1, %xmm0) + shufps $0xb1, %xmm4, %xmm4 + + mulps %xmm4, %xmm0 + mulps %xmm4, %xmm1 + decl N + ALIGN_3 + +.L70x: + testl $2 * SIZE, X + jne .L80 + + movaps -33 * SIZE(X), %xmm4 + addl $3 * SIZE, X + movaps -33 * SIZE(Y), %xmm6 + addl $3 * SIZE, Y + + movl N, %eax + sarl $4, %eax + jle .L75 + + movaps -32 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm7 + + decl %eax + jle .L72 + ALIGN_3 + +.L71: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -28 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -28 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -24 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -24 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -20 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -20 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -16 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -16 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -12 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -12 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -8 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -8 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -4 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -4 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps 0 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps 0 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L71 + ALIGN_3 + +.L72: + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -28 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -28 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -24 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -24 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -20 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -20 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -16 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -16 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -12 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -12 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -8 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -8 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -4 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -4 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L75: + testl $8, N + jle .L76 + + movaps -32 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -28 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -28 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -24 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -24 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -20 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -20 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L76: + testl $4, N + jle .L77 + + movaps -32 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -28 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -28 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L77: + testl $2, N + jle .L78 + + movaps -32 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + + movaps %xmm5, %xmm4 + movaps %xmm7, %xmm6 + ALIGN_3 + +.L78: + testl $1, N + jle .L79 + + xorps %xmm5, %xmm5 + movss %xmm5, %xmm4 + movss %xmm5, %xmm6 + + shufps $0x24, %xmm4, %xmm4 + PSHUFD2($0x18, %xmm6, %xmm3) + shufps $0x24, %xmm6, %xmm6 + + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + ALIGN_3 + +.L79: + shufps $0x39, %xmm0, %xmm0 + shufps $0x39, %xmm1, %xmm1 + jmp .L98 + ALIGN_3 + +.L80: + movsd -33 * SIZE(X), %xmm4 + movhps -31 * SIZE(X), %xmm4 + addl $3 * SIZE, X + movaps -33 * SIZE(Y), %xmm6 + addl $3 * SIZE, Y + + movl N, %eax + sarl $4, %eax + jle .L85 + + movsd -32 * SIZE(X), %xmm5 + movhps -30 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm7 + + decl %eax + jle .L82 + ALIGN_3 + +.L81: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -28 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movsd -28 * SIZE(X), %xmm4 + movhps -26 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -24 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movsd -24 * SIZE(X), %xmm5 + movhps -22 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -20 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movsd -20 * SIZE(X), %xmm4 + movhps -18 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -16 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movsd -16 * SIZE(X), %xmm5 + movhps -14 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -12 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movsd -12 * SIZE(X), %xmm4 + movhps -10 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -8 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movsd -8 * SIZE(X), %xmm5 + movhps -6 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -4 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movsd -4 * SIZE(X), %xmm4 + movhps -2 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps 0 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movsd 0 * SIZE(X), %xmm5 + movhps 2 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L81 + ALIGN_3 + +.L82: + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -28 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movsd -28 * SIZE(X), %xmm4 + movhps -26 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -24 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movsd -24 * SIZE(X), %xmm5 + movhps -22 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -20 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movsd -20 * SIZE(X), %xmm4 + movhps -18 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -16 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movsd -16 * SIZE(X), %xmm5 + movhps -14 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -12 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movsd -12 * SIZE(X), %xmm4 + movhps -10 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -8 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movsd -8 * SIZE(X), %xmm5 + movhps -6 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -4 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movsd -4 * SIZE(X), %xmm4 + movhps -2 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L85: + testl $8, N + jle .L86 + + movsd -32 * SIZE(X), %xmm5 + movhps -30 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -28 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movsd -28 * SIZE(X), %xmm4 + movhps -26 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -24 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movsd -24 * SIZE(X), %xmm5 + movhps -22 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -20 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movsd -20 * SIZE(X), %xmm4 + movhps -18 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L86: + testl $4, N + jle .L87 + + movsd -32 * SIZE(X), %xmm5 + movhps -30 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -28 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movsd -28 * SIZE(X), %xmm4 + movhps -26 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L87: + testl $2, N + jle .L88 + + movsd -32 * SIZE(X), %xmm5 + movhps -30 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + + movaps %xmm5, %xmm4 + movaps %xmm7, %xmm6 + ALIGN_3 + +.L88: + testl $1, N + jle .L89 + + xorps %xmm5, %xmm5 + movss %xmm5, %xmm4 + movss %xmm5, %xmm6 + + shufps $0x24, %xmm4, %xmm4 + PSHUFD2($0x18, %xmm6, %xmm3) + shufps $0x24, %xmm6, %xmm6 + + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + ALIGN_3 + +.L89: + shufps $0x39, %xmm0, %xmm0 + shufps $0x39, %xmm1, %xmm1 + jmp .L98 + ALIGN_3 + +.L200: + movl N, %eax + sarl $4, %eax + jle .L205 + + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X + movsd (Y), %xmm6 + addl INCY, Y + movhps (Y), %xmm6 + addl INCY, Y + + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X + movsd (Y), %xmm7 + addl INCY, Y + movhps (Y), %xmm7 + addl INCY, Y + + decl %eax + jle .L204 + ALIGN_3 + +.L203: + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd (Y), %xmm6 + addl INCY, Y + movhps (Y), %xmm6 + addl INCY, Y + mulps %xmm4, %xmm3 + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd (Y), %xmm7 + addl INCY, Y + movhps (Y), %xmm7 + addl INCY, Y + + mulps %xmm5, %xmm3 + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd (Y), %xmm6 + addl INCY, Y + movhps (Y), %xmm6 + addl INCY, Y + mulps %xmm4, %xmm3 + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd (Y), %xmm7 + addl INCY, Y + movhps (Y), %xmm7 + addl INCY, Y + + mulps %xmm5, %xmm3 + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd (Y), %xmm6 + addl INCY, Y + movhps (Y), %xmm6 + addl INCY, Y + mulps %xmm4, %xmm3 + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd (Y), %xmm7 + addl INCY, Y + movhps (Y), %xmm7 + addl INCY, Y + + mulps %xmm5, %xmm3 + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd (Y), %xmm6 + addl INCY, Y + movhps (Y), %xmm6 + addl INCY, Y + mulps %xmm4, %xmm3 + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd (Y), %xmm7 + addl INCY, Y + movhps (Y), %xmm7 + addl INCY, Y + mulps %xmm5, %xmm3 + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X + addps %xmm3, %xmm1 + + decl %eax + jg .L203 + ALIGN_3 + +.L204: + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd (Y), %xmm6 + addl INCY, Y + movhps (Y), %xmm6 + addl INCY, Y + mulps %xmm4, %xmm3 + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd (Y), %xmm7 + addl INCY, Y + movhps (Y), %xmm7 + addl INCY, Y + + mulps %xmm5, %xmm3 + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd (Y), %xmm6 + addl INCY, Y + movhps (Y), %xmm6 + addl INCY, Y + mulps %xmm4, %xmm3 + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd (Y), %xmm7 + addl INCY, Y + movhps (Y), %xmm7 + addl INCY, Y + + mulps %xmm5, %xmm3 + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd (Y), %xmm6 + addl INCY, Y + movhps (Y), %xmm6 + addl INCY, Y + mulps %xmm4, %xmm3 + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd (Y), %xmm7 + addl INCY, Y + movhps (Y), %xmm7 + addl INCY, Y + + mulps %xmm5, %xmm3 + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + ALIGN_3 + +.L205: + testl $8, N + jle .L206 + + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X + movsd (Y), %xmm6 + addl INCY, Y + movhps (Y), %xmm6 + addl INCY, Y + + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X + movsd (Y), %xmm7 + addl INCY, Y + movhps (Y), %xmm7 + addl INCY, Y + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd (Y), %xmm6 + addl INCY, Y + movhps (Y), %xmm6 + addl INCY, Y + mulps %xmm4, %xmm3 + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd (Y), %xmm7 + addl INCY, Y + movhps (Y), %xmm7 + addl INCY, Y + + mulps %xmm5, %xmm3 + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + ALIGN_3 + +.L206: + testl $4, N + jle .L207 + + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X + movsd (Y), %xmm6 + addl INCY, Y + movhps (Y), %xmm6 + addl INCY, Y + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X + movsd (Y), %xmm7 + addl INCY, Y + movhps (Y), %xmm7 + addl INCY, Y + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + ALIGN_3 + +.L207: + testl $2, N + jle .L208 + + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X + movsd (Y), %xmm6 + addl INCY, Y + movhps (Y), %xmm6 + addl INCY, Y + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + ALIGN_3 + +.L208: + testl $1, N + jle .L98 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd (X), %xmm4 +#ifdef movsd + xorps %xmm6, %xmm6 +#endif + movsd (Y), %xmm6 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + ALIGN_3 + +.L98: + movhlps %xmm0, %xmm2 + movhlps %xmm1, %xmm3 + + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + PSHUFD2($1, %xmm0, %xmm2) + PSHUFD2($1, %xmm1, %xmm3) + +#ifndef CONJ + subss %xmm2, %xmm0 + addss %xmm3, %xmm1 +#else + addss %xmm2, %xmm0 + subss %xmm3, %xmm1 +#endif + ALIGN_4 + +.L999: + subl $2 * SIZE, %esp + movss %xmm0, 0 * SIZE(%esp) + movss %xmm1, 1 * SIZE(%esp) + movl 0 * SIZE(%esp), %eax + movl 1 * SIZE(%esp), %edx + addl $2 * SIZE, %esp + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/zdot_sse2.S b/kernel/x86/zdot_sse2.S new file mode 100644 index 0000000..6304f01 --- /dev/null +++ b/kernel/x86/zdot_sse2.S @@ -0,0 +1,1543 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define RESULT 4 + STACK + ARGS(%esp) +#define STACK_N 8 + STACK + ARGS(%esp) +#define STACK_X 12 + STACK + ARGS(%esp) +#define STACK_INCX 16 + STACK + ARGS(%esp) +#define STACK_Y 20 + STACK + ARGS(%esp) +#define STACK_INCY 24 + STACK + ARGS(%esp) + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#include "l1param.h" + +#undef movsd + +#ifndef OPTERON +#define movlps movsd +#endif + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + cmpl $0, N + jle .L999 + + cmpl $2 * SIZE, INCX + jne .L50 + cmpl $2 * SIZE, INCY + jne .L50 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + testl $SIZE, Y + jne .L30 + + testl $SIZE, X + jne .L20 + + movl N, %eax + sarl $3, %eax + jle .L15 + + movaps -16 * SIZE(X), %xmm4 + movaps -16 * SIZE(Y), %xmm6 + movaps -14 * SIZE(X), %xmm5 + movaps -14 * SIZE(Y), %xmm7 + + decl %eax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -12 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -12 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -10 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps -10 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -8 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -8 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -6 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps -6 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -4 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -4 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -2 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps -2 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps 0 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps 0 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps 2 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps 2 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + decl %eax + jg .L11 + ALIGN_3 + +.L12: + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -12 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -12 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -10 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps -10 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -8 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -8 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -6 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps -6 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -4 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -4 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -2 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps -2 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + ALIGN_3 + +.L15: + testl $4, N + jle .L16 + + movaps -16 * SIZE(X), %xmm4 + movaps -16 * SIZE(Y), %xmm6 + movaps -14 * SIZE(X), %xmm5 + movaps -14 * SIZE(Y), %xmm7 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -12 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -12 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -10 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps -10 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L16: + testl $2, N + jle .L17 + + movaps -16 * SIZE(X), %xmm4 + movaps -16 * SIZE(Y), %xmm6 + movaps -14 * SIZE(X), %xmm5 + movaps -14 * SIZE(Y), %xmm7 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L17: + testl $1, N + jle .L98 + + movaps -16 * SIZE(X), %xmm4 + movaps -16 * SIZE(Y), %xmm6 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + jmp .L98 + ALIGN_3 + +.L20: + movl N, %eax + sarl $3, %eax + jle .L25 + + movlps -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + movaps -16 * SIZE(Y), %xmm6 + + movlps -14 * SIZE(X), %xmm5 + movhps -13 * SIZE(X), %xmm5 + movaps -14 * SIZE(Y), %xmm7 + + decl %eax + jle .L22 + ALIGN_3 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -12 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movlps -12 * SIZE(X), %xmm4 + movhps -11 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -10 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movlps -10 * SIZE(X), %xmm5 + movhps -9 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -8 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movlps -8 * SIZE(X), %xmm4 + movhps -7 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -6 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movlps -6 * SIZE(X), %xmm5 + movhps -5 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -4 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movlps -4 * SIZE(X), %xmm4 + movhps -3 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -2 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movlps -2 * SIZE(X), %xmm5 + movhps -1 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps 0 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps 2 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movlps 2 * SIZE(X), %xmm5 + movhps 3 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + decl %eax + jg .L21 + ALIGN_3 + +.L22: + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -12 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movlps -12 * SIZE(X), %xmm4 + movhps -11 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -10 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movlps -10 * SIZE(X), %xmm5 + movhps -9 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -8 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movlps -8 * SIZE(X), %xmm4 + movhps -7 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -6 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movlps -6 * SIZE(X), %xmm5 + movhps -5 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -4 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movlps -4 * SIZE(X), %xmm4 + movhps -3 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -2 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movlps -2 * SIZE(X), %xmm5 + movhps -1 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + ALIGN_3 + +.L25: + testl $4, N + jle .L26 + + movlps -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + movaps -16 * SIZE(Y), %xmm6 + + movlps -14 * SIZE(X), %xmm5 + movhps -13 * SIZE(X), %xmm5 + movaps -14 * SIZE(Y), %xmm7 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -12 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movlps -12 * SIZE(X), %xmm4 + movhps -11 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -10 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movlps -10 * SIZE(X), %xmm5 + movhps -9 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L26: + testl $2, N + jle .L27 + + movlps -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + movaps -16 * SIZE(Y), %xmm6 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + + movlps -14 * SIZE(X), %xmm5 + movhps -13 * SIZE(X), %xmm5 + movaps -14 * SIZE(Y), %xmm7 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L27: + testl $1, N + jle .L98 + + movlps -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + movaps -16 * SIZE(Y), %xmm6 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + jmp .L98 + ALIGN_3 + +.L30: + testl $SIZE, X + jne .L40 + + movl N, %eax + sarl $3, %eax + jle .L35 + + movlps -16 * SIZE(Y), %xmm4 + movhps -15 * SIZE(Y), %xmm4 + movaps -16 * SIZE(X), %xmm6 + + movlps -14 * SIZE(Y), %xmm5 + movhps -13 * SIZE(Y), %xmm5 + movaps -14 * SIZE(X), %xmm7 + + decl %eax + jle .L32 + ALIGN_3 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -12 * SIZE(X), %xmm6 + mulpd %xmm4, %xmm3 + movlps -12 * SIZE(Y), %xmm4 + movhps -11 * SIZE(Y), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -10 * SIZE(X), %xmm7 + mulpd %xmm5, %xmm3 + movlps -10 * SIZE(Y), %xmm5 + movhps -9 * SIZE(Y), %xmm5 + addpd %xmm3, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -8 * SIZE(X), %xmm6 + mulpd %xmm4, %xmm3 + movlps -8 * SIZE(Y), %xmm4 + movhps -7 * SIZE(Y), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -6 * SIZE(X), %xmm7 + mulpd %xmm5, %xmm3 + movlps -6 * SIZE(Y), %xmm5 + movhps -5 * SIZE(Y), %xmm5 + addpd %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -4 * SIZE(X), %xmm6 + mulpd %xmm4, %xmm3 + movlps -4 * SIZE(Y), %xmm4 + movhps -3 * SIZE(Y), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -2 * SIZE(X), %xmm7 + mulpd %xmm5, %xmm3 + movlps -2 * SIZE(Y), %xmm5 + movhps -1 * SIZE(Y), %xmm5 + addpd %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps 0 * SIZE(X), %xmm6 + mulpd %xmm4, %xmm3 + movlps 0 * SIZE(Y), %xmm4 + movhps 1 * SIZE(Y), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps 2 * SIZE(X), %xmm7 + mulpd %xmm5, %xmm3 + movlps 2 * SIZE(Y), %xmm5 + movhps 3 * SIZE(Y), %xmm5 + addpd %xmm3, %xmm1 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + decl %eax + jg .L31 + ALIGN_3 + +.L32: + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -12 * SIZE(X), %xmm6 + mulpd %xmm4, %xmm3 + movlps -12 * SIZE(Y), %xmm4 + movhps -11 * SIZE(Y), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -10 * SIZE(X), %xmm7 + mulpd %xmm5, %xmm3 + movlps -10 * SIZE(Y), %xmm5 + movhps -9 * SIZE(Y), %xmm5 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -8 * SIZE(X), %xmm6 + mulpd %xmm4, %xmm3 + movlps -8 * SIZE(Y), %xmm4 + movhps -7 * SIZE(Y), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -6 * SIZE(X), %xmm7 + mulpd %xmm5, %xmm3 + movlps -6 * SIZE(Y), %xmm5 + movhps -5 * SIZE(Y), %xmm5 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -4 * SIZE(X), %xmm6 + mulpd %xmm4, %xmm3 + movlps -4 * SIZE(Y), %xmm4 + movhps -3 * SIZE(Y), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -2 * SIZE(X), %xmm7 + mulpd %xmm5, %xmm3 + movlps -2 * SIZE(Y), %xmm5 + movhps -1 * SIZE(Y), %xmm5 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + ALIGN_3 + +.L35: + testl $4, N + jle .L36 + + movlps -16 * SIZE(Y), %xmm4 + movhps -15 * SIZE(Y), %xmm4 + movaps -16 * SIZE(X), %xmm6 + + movlps -14 * SIZE(Y), %xmm5 + movhps -13 * SIZE(Y), %xmm5 + movaps -14 * SIZE(X), %xmm7 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -12 * SIZE(X), %xmm6 + mulpd %xmm4, %xmm3 + movlps -12 * SIZE(Y), %xmm4 + movhps -11 * SIZE(Y), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -10 * SIZE(X), %xmm7 + mulpd %xmm5, %xmm3 + movlps -10 * SIZE(Y), %xmm5 + movhps -9 * SIZE(Y), %xmm5 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L36: + testl $2, N + jle .L37 + + movlps -16 * SIZE(Y), %xmm4 + movhps -15 * SIZE(Y), %xmm4 + movaps -16 * SIZE(X), %xmm6 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + + movlps -14 * SIZE(Y), %xmm5 + movhps -13 * SIZE(Y), %xmm5 + movaps -14 * SIZE(X), %xmm7 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L37: + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + + testl $1, N + jle .L98 + + movlps -16 * SIZE(Y), %xmm4 + movhps -15 * SIZE(Y), %xmm4 + movaps -16 * SIZE(X), %xmm6 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + SHUFPD_1 %xmm3, %xmm3 + addpd %xmm3, %xmm1 + jmp .L98 + ALIGN_3 + +.L40: + movhps -16 * SIZE(X), %xmm4 + addl $SIZE, X + movhps -16 * SIZE(Y), %xmm6 + addl $SIZE, Y + + movl N, %eax + sarl $3, %eax + jle .L45 + + movaps -16 * SIZE(X), %xmm5 + movaps -16 * SIZE(Y), %xmm7 + + decl %eax + jle .L42 + ALIGN_3 + +.L41: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd %xmm7, %xmm6 + pshufd $0x4e, %xmm6, %xmm3 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -14 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -14 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + movsd %xmm6, %xmm7 + pshufd $0x4e, %xmm7, %xmm3 + movsd %xmm4, %xmm5 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -12 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps -12 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + movsd %xmm7, %xmm6 + pshufd $0x4e, %xmm6, %xmm3 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -10 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -10 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + movsd %xmm6, %xmm7 + pshufd $0x4e, %xmm7, %xmm3 + movsd %xmm4, %xmm5 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -8 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps -8 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + movsd %xmm7, %xmm6 + pshufd $0x4e, %xmm6, %xmm3 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -6 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -6 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + movsd %xmm6, %xmm7 + pshufd $0x4e, %xmm7, %xmm3 + movsd %xmm4, %xmm5 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -4 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps -4 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + movsd %xmm7, %xmm6 + pshufd $0x4e, %xmm6, %xmm3 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -2 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -2 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + movsd %xmm6, %xmm7 + pshufd $0x4e, %xmm7, %xmm3 + movsd %xmm4, %xmm5 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps 0 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps 0 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + decl %eax + jg .L41 + ALIGN_3 + +.L42: + movsd %xmm7, %xmm6 + pshufd $0x4e, %xmm6, %xmm3 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -14 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -14 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + movsd %xmm6, %xmm7 + pshufd $0x4e, %xmm7, %xmm3 + movsd %xmm4, %xmm5 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -12 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps -12 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + movsd %xmm7, %xmm6 + pshufd $0x4e, %xmm6, %xmm3 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -10 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -10 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + movsd %xmm6, %xmm7 + pshufd $0x4e, %xmm7, %xmm3 + movsd %xmm4, %xmm5 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -8 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps -8 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + movsd %xmm7, %xmm6 + pshufd $0x4e, %xmm6, %xmm3 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -6 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -6 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + movsd %xmm6, %xmm7 + pshufd $0x4e, %xmm7, %xmm3 + movsd %xmm4, %xmm5 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -4 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps -4 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + movsd %xmm7, %xmm6 + pshufd $0x4e, %xmm6, %xmm3 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -2 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -2 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + movsd %xmm6, %xmm7 + pshufd $0x4e, %xmm7, %xmm3 + movsd %xmm4, %xmm5 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + ALIGN_3 + +.L45: + testl $4, N + jle .L46 + + movaps -16 * SIZE(X), %xmm5 + movaps -16 * SIZE(Y), %xmm7 + + movsd %xmm7, %xmm6 + pshufd $0x4e, %xmm6, %xmm3 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -14 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -14 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + movsd %xmm6, %xmm7 + pshufd $0x4e, %xmm7, %xmm3 + movsd %xmm4, %xmm5 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -12 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps -12 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + movsd %xmm7, %xmm6 + pshufd $0x4e, %xmm6, %xmm3 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -10 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -10 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + movsd %xmm6, %xmm7 + pshufd $0x4e, %xmm7, %xmm3 + movsd %xmm4, %xmm5 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L46: + testl $2, N + jle .L47 + + movaps -16 * SIZE(X), %xmm5 + movaps -16 * SIZE(Y), %xmm7 + + movsd %xmm7, %xmm6 + pshufd $0x4e, %xmm6, %xmm3 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -14 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -14 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + movsd %xmm6, %xmm7 + pshufd $0x4e, %xmm7, %xmm3 + movsd %xmm4, %xmm5 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L47: + testl $1, N + jle .L48 + + movlps -16 * SIZE(X), %xmm4 + movlps -16 * SIZE(Y), %xmm6 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + ALIGN_3 + +.L48: + SHUFPD_1 %xmm0, %xmm0 + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm2, %xmm2 + SHUFPD_1 %xmm3, %xmm3 + jmp .L98 + ALIGN_3 + +.L50: + movl N, %eax + sarl $3, %eax + jle .L55 + + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addl INCX, X + movlps 0 * SIZE(Y), %xmm6 + movhps 1 * SIZE(Y), %xmm6 + addl INCY, Y + + movlps 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addl INCX, X + movlps 0 * SIZE(Y), %xmm7 + movhps 1 * SIZE(Y), %xmm7 + addl INCY, Y + + decl %eax + jle .L54 + ALIGN_3 + +.L53: + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movlps 0 * SIZE(Y), %xmm6 + movhps 1 * SIZE(Y), %xmm6 + addl INCY, Y + mulpd %xmm4, %xmm3 + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movlps 0 * SIZE(Y), %xmm7 + movhps 1 * SIZE(Y), %xmm7 + addl INCY, Y + mulpd %xmm5, %xmm3 + movlps 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movlps 0 * SIZE(Y), %xmm6 + movhps 1 * SIZE(Y), %xmm6 + addl INCY, Y + mulpd %xmm4, %xmm3 + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movlps 0 * SIZE(Y), %xmm7 + movhps 1 * SIZE(Y), %xmm7 + addl INCY, Y + mulpd %xmm5, %xmm3 + movlps 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movlps 0 * SIZE(Y), %xmm6 + movhps 1 * SIZE(Y), %xmm6 + addl INCY, Y + mulpd %xmm4, %xmm3 + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movlps 0 * SIZE(Y), %xmm7 + movhps 1 * SIZE(Y), %xmm7 + addl INCY, Y + mulpd %xmm5, %xmm3 + movlps 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movlps 0 * SIZE(Y), %xmm6 + movhps 1 * SIZE(Y), %xmm6 + addl INCY, Y + mulpd %xmm4, %xmm3 + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movlps 0 * SIZE(Y), %xmm7 + movhps 1 * SIZE(Y), %xmm7 + addl INCY, Y + mulpd %xmm5, %xmm3 + movlps 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addl INCX, X + addpd %xmm3, %xmm1 + + decl %eax + jg .L53 + ALIGN_3 + +.L54: + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movlps 0 * SIZE(Y), %xmm6 + movhps 1 * SIZE(Y), %xmm6 + addl INCY, Y + mulpd %xmm4, %xmm3 + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movlps 0 * SIZE(Y), %xmm7 + movhps 1 * SIZE(Y), %xmm7 + addl INCY, Y + mulpd %xmm5, %xmm3 + movlps 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movlps 0 * SIZE(Y), %xmm6 + movhps 1 * SIZE(Y), %xmm6 + addl INCY, Y + mulpd %xmm4, %xmm3 + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movlps 0 * SIZE(Y), %xmm7 + movhps 1 * SIZE(Y), %xmm7 + addl INCY, Y + mulpd %xmm5, %xmm3 + movlps 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movlps 0 * SIZE(Y), %xmm6 + movhps 1 * SIZE(Y), %xmm6 + addl INCY, Y + mulpd %xmm4, %xmm3 + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movlps 0 * SIZE(Y), %xmm7 + movhps 1 * SIZE(Y), %xmm7 + addl INCY, Y + mulpd %xmm5, %xmm3 + movlps 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + ALIGN_3 + +.L55: + testl $4, N + jle .L56 + + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addl INCX, X + movlps 0 * SIZE(Y), %xmm6 + movhps 1 * SIZE(Y), %xmm6 + addl INCY, Y + + movlps 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addl INCX, X + movlps 0 * SIZE(Y), %xmm7 + movhps 1 * SIZE(Y), %xmm7 + addl INCY, Y + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movlps 0 * SIZE(Y), %xmm6 + movhps 1 * SIZE(Y), %xmm6 + addl INCY, Y + mulpd %xmm4, %xmm3 + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movlps 0 * SIZE(Y), %xmm7 + movhps 1 * SIZE(Y), %xmm7 + addl INCY, Y + mulpd %xmm5, %xmm3 + movlps 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + ALIGN_3 + +.L56: + testl $2, N + jle .L57 + + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addl INCX, X + movlps 0 * SIZE(Y), %xmm6 + movhps 1 * SIZE(Y), %xmm6 + addl INCY, Y + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + + movlps 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addl INCX, X + movlps 0 * SIZE(Y), %xmm7 + movhps 1 * SIZE(Y), %xmm7 + addl INCY, Y + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + ALIGN_3 + +.L57: + testl $1, N + jle .L98 + + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + movlps 0 * SIZE(Y), %xmm6 + movhps 1 * SIZE(Y), %xmm6 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + ALIGN_3 + +.L98: + pshufd $0x4e, %xmm0, %xmm2 + pshufd $0x4e, %xmm1, %xmm3 + +#ifndef CONJ + subsd %xmm2, %xmm0 + addsd %xmm3, %xmm1 +#else + addsd %xmm2, %xmm0 + subsd %xmm3, %xmm1 +#endif + +.L999: + movl RESULT, %eax + + movlps %xmm0, 0 * SIZE(%eax) + movlps %xmm1, 1 * SIZE(%eax) + + popl %ebx + popl %esi + popl %edi + ret + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_1x4_athlon.S b/kernel/x86/zgemm3m_kernel_1x4_athlon.S new file mode 100644 index 0000000..c57a8cb --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_1x4_athlon.S @@ -0,0 +1,979 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define I 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define STACK_A 32 + STACK + ARGS(%esp) +#define STACK_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define STACK_LDC 44 + STACK + ARGS(%esp) +#else +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 20 + STACK + ARGS(%esp) +#define STACK_A 24 + STACK + ARGS(%esp) +#define STACK_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define STACK_LDC 36 + STACK + ARGS(%esp) +#endif + +#define A %edx +#define B %ecx +#define B_ORIG %ebx +#define LDC %ebp + +#define PREFETCHSIZE (5 + 8 * 10) + +/* + + A hint of scheduling is received from following URL + + http://www.netlib.org/atlas/atlas-comm/msg00260.html + + Julian's code is still faster than mine, since Athlon has big + defect ... So this is a sample coding and please don't expect too + much. + +*/ + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(TRMMKERNEL) && !defined(LEFT) + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + + movl STACK_B, B_ORIG + movl STACK_LDC, LDC + + sall $ZBASE_SHIFT, LDC + + subl $-16 * SIZE, B_ORIG + subl $-16 * SIZE, STACK_A + + movl M, %eax + testl %eax, %eax + jle .L999 + + movl N, %eax + testl %eax, %eax + jle .L999 + + movl K, %eax + testl %eax, %eax + jle .L999 + + movl N, %eax + sarl $2, %eax + movl %eax, J + je .L20 + ALIGN_3 + +.L11: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl STACK_A, A + movl C, %edi + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B_ORIG, B +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (B_ORIG, %eax, 4), B +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + jle .L13 + ALIGN_4 + +.L12: + movl -16 * SIZE(B), %esi + movl -8 * SIZE(B), %esi + movl 0 * SIZE(B), %esi + movl 8 * SIZE(B), %esi + movl 16 * SIZE(B), %esi + movl 24 * SIZE(B), %esi + movl 32 * SIZE(B), %esi + movl 40 * SIZE(B), %esi + subl $-64 * SIZE, B + decl %eax + jne .L12 + ALIGN_3 + +.L13: + movl M, %esi + movl %esi, I + ALIGN_3 + +.L14: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B_ORIG, B +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (A, %eax, 1), A + leal (B_ORIG, %eax, 4), B +#endif + + leal (%edi, LDC, 2), %eax + + fldz + fldz + fldz + fldz + + FLD -8 * SIZE(A) + FLD -16 * SIZE(A) + FLD -16 * SIZE(B) + + movl $32 * SIZE, %esi + +#ifdef HAVE_3DNOW + prefetchw 1 * SIZE(%edi) + prefetchw 2 * SIZE(%edi, LDC) + prefetchw 1 * SIZE(%eax) + prefetchw 2 * SIZE(%eax, LDC) +#elif defined(HAVE_SSE) + prefetcht0 1 * SIZE(%edi) + prefetcht0 1 * SIZE(%edi, LDC) + prefetcht0 1 * SIZE(%eax) + prefetcht0 1 * SIZE(%eax, LDC) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L16 + ALIGN_3 + +.L15: + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD -15 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD -14 * SIZE(B) + +#if L1_DATA_LINESIZE == 32 +#ifdef HAVE_3DNOW + PADDING prefetch (PREFETCHSIZE - 4) * SIZE(A) +#elif defined(HAVE_SSE) + PADDING prefetcht0 (PREFETCHSIZE - 4) * SIZE(A) +#endif +#endif + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL -13 * SIZE(B) + + faddp %st, %st(5) + FLD -15 * SIZE(A) + FLD -12 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD -11 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD -10 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL -9 * SIZE(B) + + faddp %st, %st(5) + FLD -14 * SIZE(A) + FLD -8 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD -7 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD -6 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL -5 * SIZE(B) + + faddp %st, %st(5) + FLD -13 * SIZE(A) + FLD -4 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD -3 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD -2 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL -1 * SIZE(B) + + faddp %st, %st(5) + FLD -12 * SIZE(A) + FLD 0 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD 1 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD 2 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL 3 * SIZE(B) + + faddp %st, %st(5) + FLD -11 * SIZE(A) + FLD 4 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD 5 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD 6 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL 7 * SIZE(B) + + faddp %st, %st(5) + FLD -10 * SIZE(A) + FLD 8 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD 9 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD 10 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL 11 * SIZE(B) + + faddp %st, %st(5) + FLD -9 * SIZE(A) + FLD 12 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD 13 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD 14 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL 15 * SIZE(B) + + faddp %st, %st(5) + FLD 0 * SIZE(A) + +#ifdef HAVE_3DNOW + PADDING prefetch PREFETCHSIZE * SIZE(A) +#elif defined(HAVE_SSE) + PADDING prefetcht0 PREFETCHSIZE * SIZE(A) +#endif + + addl $8 * SIZE, A + fxch %st(1) + addl $32 * SIZE, B + + FLD -16 * SIZE(B) + decl %eax + jne .L15 + ALIGN_4 + +.L16: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $7, %eax + je .L19 + ALIGN_4 + +.L17: + fmul %st(1), %st + faddp %st, %st(3) + + FLD -15 * SIZE(B) + fmul %st(1), %st + faddp %st, %st(4) + + FLD -14 * SIZE(B) + fmul %st(1), %st + faddp %st, %st(5) + + FMUL -13 * SIZE(B) + faddp %st, %st(5) + FLD -15 * SIZE(A) + FLD -12 * SIZE(B) + + addl $1 * SIZE,A + addl $4 * SIZE,B + + decl %eax + jne .L17 + ALIGN_4 + +.L19: + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + + leal (%edi, LDC, 2), %eax + + FLD ALPHA_I + FLD ALPHA_R + + fld %st(2) + fmul %st(1), %st + + FLD 0 * SIZE(%edi) + faddp %st, %st(1) + FST 0 * SIZE(%edi) + + fld %st(3) + fmul %st(1), %st + + FLD 0 * SIZE(%edi, LDC) + faddp %st, %st(1) + FST 0 * SIZE(%edi, LDC) + + fld %st(4) + fmul %st(1), %st + + FLD 0 * SIZE(%eax) + faddp %st, %st(1) + FST 0 * SIZE(%eax) + + fmul %st(5), %st + + FLD 0 * SIZE(%eax, LDC) + faddp %st, %st(1) + FST 0 * SIZE(%eax, LDC) + + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD 1 * SIZE(%edi) + faddp %st, %st(1) + FST 1 * SIZE(%edi) + + FLD 1 * SIZE(%edi, LDC) + faddp %st, %st(1) + FST 1 * SIZE(%edi, LDC) + + FLD 1 * SIZE(%eax) + faddp %st, %st(1) + FST 1 * SIZE(%eax) + + FLD 1 * SIZE(%eax, LDC) + faddp %st, %st(1) + FST 1 * SIZE(%eax, LDC) + + addl $2 * SIZE, %edi + decl I + jne .L14 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leal (, LDC, 4), %eax + addl %eax, C + movl B, B_ORIG + decl J + jne .L11 + ALIGN_4 + +.L20: + movl N, %eax + andl $2, %eax + je .L30 + ALIGN_3 + +.L21: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl STACK_A, A + movl C, %edi + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B_ORIG, B +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (B_ORIG, %eax, 2), B +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + jle .L23 + ALIGN_4 + +.L22: + movl -16 * SIZE(B), %esi + movl -8 * SIZE(B), %esi + movl 0 * SIZE(B), %esi + movl 8 * SIZE(B), %esi + subl $-32 * SIZE, B + decl %eax + jne .L22 + ALIGN_3 + +.L23: + movl M, %esi + movl %esi, I + ALIGN_3 + +.L24: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B_ORIG, B +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (A, %eax, 1), A + leal (B_ORIG, %eax, 2), B +#endif + + fldz + fldz + fldz + fldz + + FLD -16 * SIZE(A) + FLD -16 * SIZE(B) + + prefetchw 1 * SIZE(%edi) + prefetchw 1 * SIZE(%edi, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L26 + ALIGN_3 + +.L25: + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -15 * SIZE(B) + faddp %st, %st(2) + + FLD -15 * SIZE(A) + FLD -14 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -13 * SIZE(B) + faddp %st, %st(4) + + FLD -14 * SIZE(A) + FLD -12 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -11 * SIZE(B) + faddp %st, %st(2) + + FLD -13 * SIZE(A) + FLD -10 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -9 * SIZE(B) + faddp %st, %st(4) + + FLD -12 * SIZE(A) + FLD -8 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -7 * SIZE(B) + faddp %st, %st(2) + + FLD -11 * SIZE(A) + FLD -6 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -5 * SIZE(B) + faddp %st, %st(4) + + FLD -10 * SIZE(A) + FLD -4 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -3 * SIZE(B) + faddp %st, %st(2) + + FLD -9 * SIZE(A) + FLD -2 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -1 * SIZE(B) + faddp %st, %st(4) + + FLD -8 * SIZE(A) + FLD 0 * SIZE(B) + + addl $ 8 * SIZE, A + subl $-16 * SIZE, B + + decl %eax + jne .L25 + ALIGN_4 + +.L26: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $7, %eax + je .L29 + ALIGN_4 + +.L27: + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -15 * SIZE(B) + faddp %st, %st(2) + + FLD -15 * SIZE(A) + FLD -14 * SIZE(B) + + addl $1 * SIZE,A + addl $2 * SIZE,B + + decl %eax + jne .L27 + ALIGN_4 + +.L29: + ffreep %st(0) + ffreep %st(0) + + faddp %st, %st(2) + faddp %st, %st(2) + + FLD ALPHA_I + FLD ALPHA_R + + fld %st(2) + fmul %st(1), %st + + FLD 0 * SIZE(%edi) + faddp %st, %st(1) + FST 0 * SIZE(%edi) + + fmul %st(3), %st + + FLD 0 * SIZE(%edi, LDC) + faddp %st, %st(1) + FST 0 * SIZE(%edi, LDC) + + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD 1 * SIZE(%edi) + faddp %st, %st(1) + FST 1 * SIZE(%edi) + + FLD 1 * SIZE(%edi, LDC) + faddp %st, %st(1) + FST 1 * SIZE(%edi, LDC) + + addl $2 * SIZE, %edi + decl I + jne .L24 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + addl %eax, C + movl B, B_ORIG + ALIGN_4 + +.L30: + movl N, %eax + andl $1, %eax + je .L999 + ALIGN_3 + +.L31: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl STACK_A, A + movl C, %edi + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B_ORIG, B +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (B_ORIG, %eax, 1), B +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $5, %eax + jle .L33 + ALIGN_4 + +.L32: + movl -16 * SIZE(B), %esi + movl -8 * SIZE(B), %esi + movl 0 * SIZE(B), %esi + movl 8 * SIZE(B), %esi + subl $-32 * SIZE, B + decl %eax + jne .L32 + ALIGN_3 + +.L33: + movl M, %esi + movl %esi, I + ALIGN_3 + +.L34: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B_ORIG, B +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (A, %eax, 1), A + leal (B_ORIG, %eax, 1), B +#endif + + fldz + fldz + fldz + fldz + + prefetchw 1 * SIZE(%edi) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L36 + ALIGN_3 + +.L35: + FLD -16 * SIZE(A) + FMUL -16 * SIZE(B) + faddp %st, %st(1) + + FLD -15 * SIZE(A) + FMUL -15 * SIZE(B) + faddp %st, %st(2) + + FLD -14 * SIZE(A) + FMUL -14 * SIZE(B) + faddp %st, %st(3) + + FLD -13 * SIZE(A) + FMUL -13 * SIZE(B) + faddp %st, %st(4) + + FLD -12 * SIZE(A) + FMUL -12 * SIZE(B) + faddp %st, %st(1) + + FLD -11 * SIZE(A) + FMUL -11 * SIZE(B) + faddp %st, %st(2) + + FLD -10 * SIZE(A) + FMUL -10 * SIZE(B) + faddp %st, %st(3) + + FLD -9 * SIZE(A) + FMUL -9 * SIZE(B) + faddp %st, %st(4) + + addl $8 * SIZE, A + addl $8 * SIZE, B + + decl %eax + jne .L35 + ALIGN_4 + +.L36: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $7, %eax + je .L39 + ALIGN_4 + +.L37: + FLD -16 * SIZE(A) + FMUL -16 * SIZE(B) + faddp %st, %st(1) + + addl $1 * SIZE,A + addl $1 * SIZE,B + decl %eax + jne .L37 + ALIGN_4 + +.L39: + faddp %st, %st(2) + faddp %st, %st(2) + faddp %st, %st(1) + + FLD ALPHA_I + FLD ALPHA_R + + fmul %st(2), %st + + FLD 0 * SIZE(%edi) + faddp %st, %st(1) + FST 0 * SIZE(%edi) + + fmulp %st, %st(1) + + FLD 1 * SIZE(%edi) + faddp %st, %st(1) + FST 1 * SIZE(%edi) + + addl $2 * SIZE, %edi + decl I + jne .L34 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $1, KK +#endif + + addl LDC, C + movl B, B_ORIG + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_2x2_atom.S b/kernel/x86/zgemm3m_kernel_2x2_atom.S new file mode 100644 index 0000000..ee918bf --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_2x2_atom.S @@ -0,0 +1,734 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 84 + +#define AA %edx +#define BB %ecx +#define CO1 %esi +#define LDC %ebp +#define B %edi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax +#ifndef LEFT + negl %eax +#endif + movl %eax, KK +#endif + + sall $ZBASE_SHIFT, LDC + + movl N, %eax + sarl $1, %eax + movl %eax, J + jle .L30 + ALIGN_2 + +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sall $BASE_SHIFT + 1, %eax + leal (B, %eax), %eax + movl %eax, BX + + movl C, CO1 # coffset = c + leal (, LDC, 2), %eax + addl %eax, C + + movl A, AA # aoffset = a + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + + movl BX, %eax + prefetcht0 0 * SIZE(%eax) + subl $-8 * SIZE, BX + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + xorps %xmm4, %xmm4 + prefetcht0 3 * SIZE(CO1) + xorps %xmm5, %xmm5 + prefetcht0 3 * SIZE(CO1, LDC) + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 0 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 1 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 3 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 3 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 4 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 2 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 3 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 5 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 5 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 6 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 4 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 5 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 7 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 7 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 8 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 6 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 7 * SIZE(BB), %xmm3 + + addl $8 * SIZE, BB + addl $8 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 0 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 1 * SIZE(BB), %xmm3 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + movsd ALPHA_R, %xmm0 + movsd ALPHA_I, %xmm1 + + addsd %xmm2, %xmm6 + addsd %xmm3, %xmm7 + + movaps %xmm4, %xmm2 + mulsd %xmm0, %xmm4 + mulsd %xmm1, %xmm2 + + movaps %xmm6, %xmm3 + mulsd %xmm0, %xmm6 + mulsd %xmm1, %xmm3 + + addsd 0 * SIZE(CO1), %xmm4 + addsd 1 * SIZE(CO1), %xmm2 + addsd 2 * SIZE(CO1), %xmm6 + addsd 3 * SIZE(CO1), %xmm3 + + movlps %xmm4, 0 * SIZE(CO1) + movlps %xmm2, 1 * SIZE(CO1) + movlps %xmm6, 2 * SIZE(CO1) + movlps %xmm3, 3 * SIZE(CO1) + + movaps %xmm5, %xmm2 + mulsd %xmm0, %xmm5 + mulsd %xmm1, %xmm2 + + movaps %xmm7, %xmm3 + mulsd %xmm0, %xmm7 + mulsd %xmm1, %xmm3 + + addsd 0 * SIZE(CO1, LDC), %xmm5 + addsd 1 * SIZE(CO1, LDC), %xmm2 + addsd 2 * SIZE(CO1, LDC), %xmm7 + addsd 3 * SIZE(CO1, LDC), %xmm3 + + movlps %xmm5, 0 * SIZE(CO1, LDC) + movlps %xmm2, 1 * SIZE(CO1, LDC) + movlps %xmm7, 2 * SIZE(CO1, LDC) + movlps %xmm3, 3 * SIZE(CO1, LDC) + + addl $4 * SIZE, CO1 + decl %ebx + jg .L11 + ALIGN_4 + +.L20: + movl M, %ebx + testl $1, %ebx + jle .L29 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L25 + ALIGN_4 + +.L22: + addsd %xmm2, %xmm4 + movsd 0 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 1 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 2 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 3 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 2 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 5 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 3 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 6 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 7 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + addsd %xmm2, %xmm4 + movsd 0 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 1 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + movsd ALPHA_R, %xmm0 + movsd ALPHA_I, %xmm1 + + addsd %xmm2, %xmm4 + addsd %xmm3, %xmm5 + + movaps %xmm4, %xmm2 + mulsd %xmm0, %xmm4 + mulsd %xmm1, %xmm2 + + movaps %xmm5, %xmm3 + mulsd %xmm0, %xmm5 + mulsd %xmm1, %xmm3 + + addsd 0 * SIZE(CO1), %xmm4 + addsd 1 * SIZE(CO1), %xmm2 + addsd 0 * SIZE(CO1, LDC), %xmm5 + addsd 1 * SIZE(CO1, LDC), %xmm3 + + movlps %xmm4, 0 * SIZE(CO1) + movlps %xmm2, 1 * SIZE(CO1) + movlps %xmm5, 0 * SIZE(CO1, LDC) + movlps %xmm3, 1 * SIZE(CO1, LDC) + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movl BB, B + decl J + jg .L10 + ALIGN_4 + +.L30: + testl $1, N + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, CO1 + addl LDC, C + + movl A, AA + + movl M, %ebx + sarl $1, %ebx + jle .L40 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + + movsd 0 * SIZE(BB), %xmm1 + xorps %xmm0, %xmm0 + prefetcht0 3 * SIZE(CO1) + xorps %xmm2, %xmm2 + xorps %xmm4, %xmm4 + xorps %xmm6, %xmm6 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addsd %xmm0, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 3 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 2 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 4 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 5 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 3 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 6 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 7 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 4 * SIZE(BB), %xmm1 + + addl $8 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + addsd %xmm0, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 1 * SIZE(BB), %xmm1 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + addsd %xmm0, %xmm4 + addsd %xmm2, %xmm6 + + movsd ALPHA_R, %xmm0 + movsd ALPHA_I, %xmm1 + + movaps %xmm4, %xmm2 + mulsd %xmm0, %xmm4 + mulsd %xmm1, %xmm2 + + movaps %xmm6, %xmm3 + mulsd %xmm0, %xmm6 + mulsd %xmm1, %xmm3 + + addsd 0 * SIZE(CO1), %xmm4 + addsd 1 * SIZE(CO1), %xmm2 + addsd 2 * SIZE(CO1), %xmm6 + addsd 3 * SIZE(CO1), %xmm3 + + movlps %xmm4, 0 * SIZE(CO1) + movlps %xmm2, 1 * SIZE(CO1) + movlps %xmm6, 2 * SIZE(CO1) + movlps %xmm3, 3 * SIZE(CO1) + + addl $4 * SIZE, CO1 + decl %ebx + jg .L31 + ALIGN_4 + +.L40: + movl M, %ebx + testl $1, %ebx + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movsd 0 * SIZE(BB), %xmm2 + xorps %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L45 + ALIGN_4 + +.L42: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm5 + movsd 2 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 3 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 3 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm5 + movsd 4 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addsd %xmm5, %xmm4 + + movsd ALPHA_R, %xmm0 + movsd ALPHA_I, %xmm1 + + movaps %xmm4, %xmm2 + mulsd %xmm0, %xmm4 + mulsd %xmm1, %xmm2 + + addsd 0 * SIZE(CO1), %xmm4 + addsd 1 * SIZE(CO1), %xmm2 + + movlps %xmm4, 0 * SIZE(CO1) + movlps %xmm2, 1 * SIZE(CO1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_2x2_coppermine.S b/kernel/x86/zgemm3m_kernel_2x2_coppermine.S new file mode 100644 index 0000000..674829f --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_2x2_coppermine.S @@ -0,0 +1,722 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define LDC 44 + STACK + ARGS(%esp) +#else +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 20 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define LDC 36 + STACK + ARGS(%esp) +#endif + +#define PREFETCH_OFFSET 48 + +#if defined(PENTIUM3) || defined(PENTIUMM) +#define REP rep +#else +#define REP rep +#endif + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(TRMMKERNEL) && !defined(LEFT) + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + + movl N, %eax # j = (n >> 1) # MEMORY + movl LDC, %ebp # ldc # MEMORY + movl B, %ebx + + sall $ZBASE_SHIFT, %ebp + + sarl $1, %eax + + leal 0(%ecx) , %ecx # NOP + movl %eax, J # j = (n >> 1) # MEMORY + test %eax, %eax + je .L8 # if !(n >> 1) goto .L8 + ALIGN_4 + +.L34: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl %ebx, BX + + movl M, %esi # m # MEMORY + movl A, %edx # a # MEMORY + movl C, %edi # C # MEMORY + sarl $1, %esi # i = (m >> 1) + je .L12 + ALIGN_4 + +.MainHead: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl %ebx, %ecx +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (%edx, %eax, 2), %edx + leal (%ebx, %eax, 2), %ecx +#endif + +#ifdef HAVE_SSE + movl BX, %eax + + prefetcht2 0 * SIZE(%eax) + prefetcht2 4 * SIZE(%eax) + +#if L2_SIZE > 262144 + + subl $-8 * SIZE, BX + +#elif L2_SIZE > 131072 + + prefetcht2 8 * SIZE(%eax) + prefetcht2 12 * SIZE(%eax) + + + subl $-16 * SIZE, BX +#else + prefetcht2 16 * SIZE(%eax) + prefetcht2 20 * SIZE(%eax) + prefetcht2 24 * SIZE(%eax) + prefetcht2 28 * SIZE(%eax) + + subl $-32 * SIZE, BX +#endif +#endif + + fldz + fldz + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + fldz + fldz + + FLD 4 * SIZE(%ecx) # b5 + FLD 4 * SIZE(%edx) # a5 + FLD 0 * SIZE(%ecx) # b1 + FLD 0 * SIZE(%edx) # a1 + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(%edi) + prefetchw 2 * SIZE(%edi, %ebp, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(%edi) + prefetchnta 2 * SIZE(%edi, %ebp, 1) +#endif + sarl $2, %eax + je .L16 + ALIGN_4 + +.MainLoop: +#if defined(HAVE_3DNOW) + prefetch (PREFETCH_OFFSET) * SIZE(%ecx) + nop +#elif defined(HAVE_SSE) + prefetchnta (PREFETCH_OFFSET) * SIZE(%ecx) +#ifdef CORE_KATMAI + prefetcht0 (PREFETCH_OFFSET) * SIZE(%edx) +#endif +#endif + + fmul %st, %st(1) + FMUL 1 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(4) + FLD 0 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(5) + FLD 1 * SIZE(%edx) + fmul %st, %st(1) + FMUL 1 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(6) + FLD 2 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(7) + FLD 2 * SIZE(%edx) + + fmul %st, %st(1) + FMUL 3 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(4) + FLD 2 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(5) + FLD 3 * SIZE(%edx) + fmul %st, %st(1) + FMUL 3 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(6) + FLD 8 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(7) + FLD 8 * SIZE(%edx) + fxch %st(2) + +#if !defined(HAVE_3DNOW) && defined(HAVE_SSE) && defined(DOUBLE) + prefetchnta (PREFETCH_OFFSET + 4) * SIZE(%ecx) +#ifdef CORE_KATMAI + prefetcht0 (PREFETCH_OFFSET + 4) * SIZE(%edx) +#endif +#endif + + fmul %st, %st(3) + FMUL 5 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(4) + FLD 4 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(5) + FLD 5 * SIZE(%edx) + fmul %st, %st(3) + FMUL 5 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(6) + FLD 6 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(7) + FLD 6 * SIZE(%edx) + + fmul %st, %st(3) + FMUL 7 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(4) + FLD 6 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(5) + FLD 7 * SIZE(%edx) + fmul %st, %st(3) + FMUL 7 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(6) + FLD 12 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(7) + FLD 12 * SIZE(%edx) + fxch %st(2) + + subl $-8 * SIZE, %ecx + subl $-8 * SIZE, %edx + decl %eax # l -- + jne .MainLoop + ALIGN_4 + +.L16: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $3, %eax + je .L21 + ALIGN_4 + +.SubLoop: + fmul %st, %st(1) + FMUL 1 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(4) + FLD 0 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(5) + FLD 1 * SIZE(%edx) + fmul %st, %st(1) + FMUL 1 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(6) + FLD 2 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(7) + FLD 2 * SIZE(%edx) + + addl $2 * SIZE,%ecx + addl $2 * SIZE,%edx + decl %eax + jne .SubLoop + ALIGN_4 + +.L21: + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + + FLD ALPHA_I + FLD ALPHA_R + + fld %st(2) + fmul %st(1), %st + + FLD 0 * SIZE(%edi) + faddp %st, %st(1) + FST 0 * SIZE(%edi) + + fld %st(3) + fmul %st(1), %st + + FLD 0 * SIZE(%edi, %ebp) + faddp %st, %st(1) + FST 0 * SIZE(%edi, %ebp) + + fld %st(4) + fmul %st(1), %st + + FLD 2 * SIZE(%edi) + faddp %st, %st(1) + FST 2 * SIZE(%edi) + + fmul %st(5), %st + + FLD 2 * SIZE(%edi, %ebp) + faddp %st, %st(1) + FST 2 * SIZE(%edi, %ebp) + + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD 1 * SIZE(%edi) + faddp %st, %st(1) + FST 1 * SIZE(%edi) + + FLD 1 * SIZE(%edi, %ebp) + faddp %st, %st(1) + FST 1 * SIZE(%edi, %ebp) + + FLD 3 * SIZE(%edi) + faddp %st, %st(1) + FST 3 * SIZE(%edi) + + FLD 3 * SIZE(%edi, %ebp) + faddp %st, %st(1) + FST 3 * SIZE(%edi, %ebp) + + addl $4 * SIZE, %edi + rep + decl %esi # i -- + rep + jne .MainHead + ALIGN_4 + +.L12: + movl M, %eax # m # MEMORY + andl $1, %eax + je .L27 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl %ebx, %ecx +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (%edx, %eax, 1), %edx + leal (%ebx, %eax, 2), %ecx +#endif + fldz + fldz + + FLD 0 * SIZE(%edx) # temp1 = *(aoffset + 0) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $1,%eax # k >> 1 # MEMORY + je .L54 + ALIGN_4 + +.L55: + FLD 0 * SIZE(%ecx) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(%ecx) # temp2 = *(boffset + 0) + faddp %st, %st(2) + FLD 1 * SIZE(%edx) # temp1 = *(aoffset + 0) + + FLD 2 * SIZE(%ecx) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 3 * SIZE(%ecx) # temp2 = *(boffset + 0) + faddp %st, %st(2) + FLD 2 * SIZE(%edx) # temp1 = *(aoffset + 0) + + addl $2 * SIZE, %edx + addl $4 * SIZE, %ecx + decl %eax + jne .L55 + ALIGN_4 + +.L54: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $1,%eax # k & 1 + je .L33 + ALIGN_4 + + FLD 0 * SIZE(%ecx) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(%ecx) # temp2 = *(boffset + 0) + faddp %st, %st(2) + FLD 1 * SIZE(%edx) # temp1 = *(aoffset + 0) + + addl $1 * SIZE, %edx + addl $2 * SIZE, %ecx + ALIGN_4 + +.L33: + ffreep %st(0) + + FLD ALPHA_I + FLD ALPHA_R + + fld %st(2) + fmul %st(1), %st + + FLD 0 * SIZE(%edi) + faddp %st, %st(1) + FST 0 * SIZE(%edi) + + fmul %st(3), %st + + FLD 0 * SIZE(%edi, %ebp) + faddp %st, %st(1) + FST 0 * SIZE(%edi, %ebp) + + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD 1 * SIZE(%edi) + faddp %st, %st(1) + FST 1 * SIZE(%edi) + + FLD 1 * SIZE(%edi, %ebp) + faddp %st, %st(1) + FST 1 * SIZE(%edi, %ebp) + ALIGN_4 + +.L27: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + lea (, %ebp, 2), %eax + addl %eax, C # C + 2 * ldc # MEMORY + movl %ecx, %ebx # b # MEMORY + decl J # j-- # MEMORY + jne .L34 + ALIGN_4 + +.L8: + movl N, %eax # n # MEMORY + andl $1, %eax + je .End + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, %edi # c # MEMORY + movl A, %edx # a # MEMORY + + movl M, %esi # m # MEMORY + sarl $1, %esi # m >> 1 + je .L36 + ALIGN_4 + +.L46: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl %ebx, %ecx +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (%edx, %eax, 2), %edx + leal (%ebx, %eax, 1), %ecx +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + fldz + sarl $1, %eax + fldz + FLD 0 * SIZE(%ecx) # temp1 = *(boffset + 0) + + je .L56 + ALIGN_4 + +.L57: + FLD 0 * SIZE(%edx) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(%edx) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 1 * SIZE(%ecx) # temp1 = *(boffset + 0) + + FLD 2 * SIZE(%edx) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 3 * SIZE(%edx) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 2 * SIZE(%ecx) # temp1 = *(boffset + 0) + + addl $4 * SIZE,%edx + addl $2 * SIZE,%ecx + dec %eax + jne .L57 + ALIGN_4 + +.L56: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $1, %eax + je .L45 + ALIGN_4 + + FLD 0 * SIZE(%edx) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(%edx) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 3 * SIZE(%ecx) # temp1 = *(boffset + 0) + + addl $2 * SIZE,%edx + addl $1 * SIZE,%ecx + ALIGN_4 + +.L45: + ffreep %st(0) + + FLD ALPHA_I + FLD ALPHA_R + + fld %st(2) + fmul %st(1), %st + + FLD 0 * SIZE(%edi) + faddp %st, %st(1) + FST 0 * SIZE(%edi) + + fmul %st(3), %st + + FLD 2 * SIZE(%edi) + faddp %st, %st(1) + FST 2 * SIZE(%edi) + + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD 1 * SIZE(%edi) + faddp %st, %st(1) + FST 1 * SIZE(%edi) + + FLD 3 * SIZE(%edi) + faddp %st, %st(1) + FST 3 * SIZE(%edi) + + addl $4 * SIZE, %edi + + decl %esi # i -- + jne .L46 + ALIGN_4 + +.L36: + movl M, %eax # m # MEMORY + andl $1, %eax # m & 1 + je .End + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl %ebx, %ecx +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (%edx, %eax, 1), %edx + leal (%ebx, %eax, 1), %ecx +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + fldz + ALIGN_3 + +.L51: + FLD (%edx) + FMUL (%ecx) + addl $1 * SIZE,%edx + addl $1 * SIZE,%ecx + faddp %st,%st(1) + decl %eax + jne .L51 + + FLD ALPHA_I + FLD ALPHA_R + + fmul %st(2), %st + + FLD 0 * SIZE(%edi) + faddp %st, %st(1) + FST 0 * SIZE(%edi) + + fmulp %st, %st(1) + + FLD 1 * SIZE(%edi) + faddp %st, %st(1) + FST 1 * SIZE(%edi) + ALIGN_4 + +.End: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_2x4_barcelona.S b/kernel/x86/zgemm3m_kernel_2x4_barcelona.S new file mode 100644 index 0000000..7822094 --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_2x4_barcelona.S @@ -0,0 +1,1291 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define OLD_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define OLD_LDC 44 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define B %edi +#define LDC %ebp +#define AO %edx +#define BO %ecx +#define CO %esi +#define I %ebx + +#define movsd movlps +#define movapd movups +#define movlpd movlps +#define movhpd movhps + +#define PREFETCH prefetch +#define PREFETCHSIZE (8 * 7 + 0) + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm1; \ + mulpd -14 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm1, %xmm4; \ + movapd -12 * SIZE(BO, %eax, 4), %xmm1; \ + addpd %xmm0, %xmm5; \ + movddup -15 * SIZE(AO, %eax, 2), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd -14 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm0, %xmm7; \ + movddup -14 * SIZE(AO, %eax, 2), %xmm0 + +#define KERNEL2(address) \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm0, %xmm1; \ + mulpd -10 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm1, %xmm4; \ + movapd -8 * SIZE(BO, %eax, 4), %xmm1; \ + addpd %xmm0, %xmm5; \ + movddup -13 * SIZE(AO, %eax, 2), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd -10 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm0, %xmm7; \ + movddup -12 * SIZE(AO, %eax, 2), %xmm0 + +#define KERNEL3(address) \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm0, %xmm1; \ + mulpd -6 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm1, %xmm4; \ + movapd -4 * SIZE(BO, %eax, 4), %xmm1; \ + addpd %xmm0, %xmm5; \ + movddup -11 * SIZE(AO, %eax, 2), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd -6 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm0, %xmm7; \ + movddup -10 * SIZE(AO, %eax, 2), %xmm0 + +#define KERNEL4(address) \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm0, %xmm1; \ + mulpd -2 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm1, %xmm4; \ + movapd (BO, %eax, 4), %xmm1; \ + addpd %xmm0, %xmm5; \ + movddup -9 * SIZE(AO, %eax, 2), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd -2 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm0, %xmm7; \ + movddup (AO, %eax, 2), %xmm0 + +#define KERNEL5(address) \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm3, %xmm1; \ + mulpd 2 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm1, %xmm4; \ + movapd 4 * SIZE(BO, %eax, 4), %xmm1; \ + addpd %xmm3, %xmm5; \ + movddup -7 * SIZE(AO, %eax, 2), %xmm3; \ + mulpd %xmm3, %xmm2; \ + mulpd 2 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm3, %xmm7; \ + movddup -6 * SIZE(AO, %eax, 2), %xmm3 + +#define KERNEL6(address) \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm3, %xmm1; \ + mulpd 6 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm1, %xmm4; \ + movapd 8 * SIZE(BO, %eax, 4), %xmm1; \ + addpd %xmm3, %xmm5; \ + movddup -5 * SIZE(AO, %eax, 2), %xmm3; \ + mulpd %xmm3, %xmm2; \ + mulpd 6 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm3, %xmm7; \ + movddup -4 * SIZE(AO, %eax, 2), %xmm3 + +#define KERNEL7(address) \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm3, %xmm1; \ + mulpd 10 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm1, %xmm4; \ + movapd 12 * SIZE(BO, %eax, 4), %xmm1; \ + addpd %xmm3, %xmm5; \ + movddup -3 * SIZE(AO, %eax, 2), %xmm3; \ + mulpd %xmm3, %xmm2; \ + mulpd 10 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm3, %xmm7; \ + movddup -2 * SIZE(AO, %eax, 2), %xmm3 + +#define KERNEL8(address) \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm3, %xmm1; \ + mulpd 14 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm1, %xmm4; \ + movapd 16 * SIZE(BO, %eax, 4), %xmm1; \ + addpd %xmm3, %xmm5; \ + movddup -1 * SIZE(AO, %eax, 2), %xmm3; \ + mulpd %xmm3, %xmm2; \ + mulpd 14 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm3, %xmm7; \ + movddup 8 * SIZE(AO, %eax, 2), %xmm3; \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2 + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl OLD_B, B + movl OLD_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax + +#ifndef LEFT + negl %eax +#endif + + movl %eax, KK +#endif + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + + sall $ZBASE_SHIFT, LDC + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_2 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + leal GEMM_DEFAULT_Q * GEMM_DEFAULT_UNROLL_N * SIZE(B), %eax + movl %eax, BX + + movl C, CO # coffset = c + movl A, AO # aoffset = a + + movl M, I + sarl $1, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BO +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (B, %eax, 4), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm1 + pxor %xmm4, %xmm4 + movddup -8 * SIZE(AO), %xmm3 + + leal (LDC, LDC, 2), %eax + + prefetchw 1 * SIZE(CO) + pxor %xmm5, %xmm5 + prefetchw 3 * SIZE(CO, LDC) + pxor %xmm6, %xmm6 + prefetchw 1 * SIZE(CO, LDC, 2) + pxor %xmm7, %xmm7 + prefetchw 3 * SIZE(CO, %eax) + movapd %xmm1, %xmm2 + + movl BX, %eax + prefetch -16 * SIZE(%eax) + addl $8 * SIZE, %eax + movl %eax, BX + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + + andl $-8, %eax + + leal (, %eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 4), BO + negl %eax + NOBRANCH + je .L15 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + BRANCH + jl .L12 + ALIGN_3 + +.L15: + movups ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + je .L18 + + leal (, %eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 4), BO + negl %eax + ALIGN_3 + +.L17: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO, %eax, 4), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BO, %eax, 4), %xmm1 + addpd %xmm0, %xmm5 + movddup -15 * SIZE(AO, %eax, 2), %xmm0 + mulpd %xmm0, %xmm2 + mulpd -14 * SIZE(BO, %eax, 4), %xmm0 + addpd %xmm0, %xmm7 + movddup -14 * SIZE(AO, %eax, 2), %xmm0 + addpd %xmm2, %xmm6 + movapd %xmm1, %xmm2 + + addl $SIZE, %eax + jl .L17 + ALIGN_4 + +.L18: + leal (CO, LDC, 2), %eax + + movsd 0 * SIZE(CO), %xmm0 + movhps 1 * SIZE(CO), %xmm0 + movsd 0 * SIZE(CO, LDC), %xmm1 + movhps 1 * SIZE(CO, LDC), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(CO) + movhps %xmm0, 1 * SIZE(CO) + movlps %xmm1, 0 * SIZE(CO, LDC) + movhps %xmm1, 1 * SIZE(CO, LDC) + + movsd 2 * SIZE(CO), %xmm0 + movhps 3 * SIZE(CO), %xmm0 + movsd 2 * SIZE(CO, LDC), %xmm1 + movhps 3 * SIZE(CO, LDC), %xmm1 + + pshufd $0x44, %xmm6, %xmm2 + unpckhpd %xmm6, %xmm6 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm1 + + movlps %xmm0, 2 * SIZE(CO) + movhps %xmm0, 3 * SIZE(CO) + movlps %xmm1, 2 * SIZE(CO, LDC) + movhps %xmm1, 3 * SIZE(CO, LDC) + + movsd 0 * SIZE(%eax), %xmm0 + movhps 1 * SIZE(%eax), %xmm0 + movsd 0 * SIZE(%eax, LDC), %xmm1 + movhps 1 * SIZE(%eax, LDC), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%eax) + movhps %xmm0, 1 * SIZE(%eax) + movlps %xmm1, 0 * SIZE(%eax, LDC) + movhps %xmm1, 1 * SIZE(%eax, LDC) + + movsd 2 * SIZE(%eax), %xmm0 + movhps 3 * SIZE(%eax), %xmm0 + movsd 2 * SIZE(%eax, LDC), %xmm1 + movhps 3 * SIZE(%eax, LDC), %xmm1 + + pshufd $0x44, %xmm7, %xmm2 + unpckhpd %xmm7, %xmm7 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm7 + addpd %xmm7, %xmm1 + + movlps %xmm0, 2 * SIZE(%eax) + movhps %xmm0, 3 * SIZE(%eax) + movlps %xmm1, 2 * SIZE(%eax, LDC) + movhps %xmm1, 3 * SIZE(%eax, LDC) + + addl $4 * SIZE, %esi # coffset += 2 + + decl I # i -- + jg .L11 + ALIGN_4 + +.L20: + movl M, I + testl $1, I # i = (m >> 2) + jle .L29 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BO +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AO, %eax, 1), AO + leal (B, %eax, 4), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm1 + movddup -8 * SIZE(AO), %xmm3 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulpd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm5 + movddup -15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm6 + movapd -8 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm7 + movddup -14 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm4 + movapd -4 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm5 + movddup -13 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -2 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm6 + movapd (BO), %xmm1 + addpd %xmm0, %xmm7 + movddup -12 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm4 + movapd 4 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm5 + movddup -11 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm6 + movapd 8 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm7 + movddup -10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd 10 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm4 + movapd 12 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm5 + movddup -9 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd 14 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm6 + movapd 16 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm7 + movddup -8 * SIZE(AO), %xmm0 + + subl $ -8 * SIZE, AO + subl $-32 * SIZE, BO + decl %eax + jne .L22 + ALIGN_4 + +.L25: + movups ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + +.L26: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm5 + movddup -15 * SIZE(AO), %xmm0 + + addl $1 * SIZE, AO + addl $4 * SIZE, BO + decl %eax + jg .L26 + ALIGN_4 + +.L28: + leal (CO, LDC, 2), %eax + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + movsd 0 * SIZE(CO), %xmm0 + movhps 1 * SIZE(CO), %xmm0 + movsd 0 * SIZE(CO, LDC), %xmm1 + movhps 1 * SIZE(CO, LDC), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(CO) + movhps %xmm0, 1 * SIZE(CO) + movlps %xmm1, 0 * SIZE(CO, LDC) + movhps %xmm1, 1 * SIZE(CO, LDC) + + movsd 0 * SIZE(%eax), %xmm0 + movhps 1 * SIZE(%eax), %xmm0 + movsd 0 * SIZE(%eax, LDC), %xmm1 + movhps 1 * SIZE(%eax, LDC), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%eax) + movhps %xmm0, 1 * SIZE(%eax) + movlps %xmm1, 0 * SIZE(%eax, LDC) + movhps %xmm1, 1 * SIZE(%eax, LDC) + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + movl BO, B + + leal (, LDC, 4), %eax + addl %eax, C # c += 4 * ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L30: + testl $2, N + je .L60 + ALIGN_2 + +.L31: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, CO # coffset = c + movl A, AO # aoffset = a + + movl M, I + sarl $1, I # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BO +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (B, %eax, 2), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + pxor %xmm4, %xmm4 + prefetchw 1 * SIZE(CO) + pxor %xmm5, %xmm5 + prefetchw 1 * SIZE(CO, LDC) + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AO) + mulpd -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm4 + mulpd -16 * SIZE(BO), %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm5 + + mulpd -14 * SIZE(BO), %xmm0 + movddup -13 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm6 + mulpd -14 * SIZE(BO), %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm7 + + mulpd -12 * SIZE(BO), %xmm0 + movddup -11 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm4 + mulpd -12 * SIZE(BO), %xmm1 + movddup -10 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm5 + + mulpd -10 * SIZE(BO), %xmm0 + movddup -9 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm6 + mulpd -10 * SIZE(BO), %xmm1 + movddup -8 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm7 + + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AO) + + mulpd -8 * SIZE(BO), %xmm0 + movddup -7 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm4 + mulpd -8 * SIZE(BO), %xmm1 + movddup -6 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm5 + + mulpd -6 * SIZE(BO), %xmm0 + movddup -5 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm6 + mulpd -6 * SIZE(BO), %xmm1 + movddup -4 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm7 + + mulpd -4 * SIZE(BO), %xmm0 + movddup -3 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm4 + mulpd -4 * SIZE(BO), %xmm1 + movddup -2 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm5 + + mulpd -2 * SIZE(BO), %xmm0 + movddup -1 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm6 + mulpd -2 * SIZE(BO), %xmm1 + movddup 0 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm7 + + subl $-16 * SIZE, AO + subl $-16 * SIZE, BO + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movups ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm4 + mulpd -16 * SIZE(BO), %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm5 + + addl $2 * SIZE, AO + addl $2 * SIZE, BO + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + movsd 0 * SIZE(CO), %xmm0 + movhps 1 * SIZE(CO), %xmm0 + movsd 0 * SIZE(CO, LDC), %xmm1 + movhps 1 * SIZE(CO, LDC), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(CO) + movhps %xmm0, 1 * SIZE(CO) + movlps %xmm1, 0 * SIZE(CO, LDC) + movhps %xmm1, 1 * SIZE(CO, LDC) + + movsd 2 * SIZE(CO), %xmm0 + movhps 3 * SIZE(CO), %xmm0 + movsd 2 * SIZE(CO, LDC), %xmm1 + movhps 3 * SIZE(CO, LDC), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 2 * SIZE(CO) + movhps %xmm0, 3 * SIZE(CO) + movlps %xmm1, 2 * SIZE(CO, LDC) + movhps %xmm1, 3 * SIZE(CO, LDC) + + addl $4 * SIZE, %esi # coffset += 2 + + decl I # i -- + jg .L41 + ALIGN_4 + +.L50: + movl M, I + testl $1, I # i = (m >> 2) + jle .L59 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BO +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AO, %eax, 1), AO + leal (B, %eax, 2), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulpd -16 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -15 * SIZE(AO), %xmm0 + + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -14 * SIZE(AO), %xmm0 + + mulpd -12 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -13 * SIZE(AO), %xmm0 + + mulpd -10 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -12 * SIZE(AO), %xmm0 + + mulpd -8 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -11 * SIZE(AO), %xmm0 + + mulpd -6 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -10 * SIZE(AO), %xmm0 + + mulpd -4 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -9 * SIZE(AO), %xmm0 + + mulpd -2 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -8 * SIZE(AO), %xmm0 + + subl $ -8 * SIZE, AO + subl $-16 * SIZE, BO + + decl %eax + jne .L52 + ALIGN_4 + +.L55: + movups ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + +.L56: + mulpd -16 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -15 * SIZE(AO), %xmm0 + + subl $-1 * SIZE, AO + subl $-2 * SIZE, BO + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + addpd %xmm5, %xmm4 + + movsd 0 * SIZE(CO), %xmm0 + movhps 1 * SIZE(CO), %xmm0 + movsd 0 * SIZE(CO, LDC), %xmm1 + movhps 1 * SIZE(CO, LDC), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(CO) + movhps %xmm0, 1 * SIZE(CO) + movlps %xmm1, 0 * SIZE(CO, LDC) + movhps %xmm1, 1 * SIZE(CO, LDC) + ALIGN_4 + +.L59: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movl BO, B + + leal (, LDC, 2), %eax + addl %eax, C # c += 4 * ldc + ALIGN_4 + +.L60: + testl $1, N + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, CO # coffset = c + movl A, AO # aoffset = a + + movl M, I + sarl $1, I # i = (m >> 2) + jle .L80 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BO +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (B, %eax, 1), BO +#endif + + movddup -16 * SIZE(BO), %xmm0 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + prefetchw 1 * SIZE(CO) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + mulpd -16 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -15 * SIZE(BO), %xmm0 + + mulpd -14 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -14 * SIZE(BO), %xmm0 + + mulpd -12 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -13 * SIZE(BO), %xmm0 + + mulpd -10 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -12 * SIZE(BO), %xmm0 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + mulpd -8 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -11 * SIZE(BO), %xmm0 + + mulpd -6 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -10 * SIZE(BO), %xmm0 + + mulpd -4 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -9 * SIZE(BO), %xmm0 + + mulpd -2 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -8 * SIZE(BO), %xmm0 + + subl $-16 * SIZE, AO + subl $ -8 * SIZE, BO + decl %eax + jne .L72 + ALIGN_4 + +.L75: + movups ALPHA, %xmm3 +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + mulpd -16 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -15 * SIZE(BO), %xmm0 + + addl $2 * SIZE, AO + addl $1 * SIZE, BO + decl %eax + jg .L76 + ALIGN_4 + +.L78: + movsd 0 * SIZE(CO), %xmm0 + movhps 1 * SIZE(CO), %xmm0 + movsd 2 * SIZE(CO), %xmm1 + movhps 3 * SIZE(CO), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(CO) + movhps %xmm0, 1 * SIZE(CO) + movlps %xmm1, 2 * SIZE(CO) + movhps %xmm1, 3 * SIZE(CO) + + addl $4 * SIZE, %esi # coffset += 2 + + decl I # i -- + jg .L71 + ALIGN_4 + +.L80: + movl M, I + testl $1, I # i = (m >> 2) + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BO +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AO, %eax, 1), AO + leal (B, %eax, 1), BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L85 + ALIGN_4 + +.L82: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + mulpd -16 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movapd -14 * SIZE(AO), %xmm0 + + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm5 + movapd -12 * SIZE(AO), %xmm0 + + mulpd -12 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm6 + movapd -10 * SIZE(AO), %xmm0 + + mulpd -10 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm7 + movapd -8 * SIZE(AO), %xmm0 + + subl $-8 * SIZE, AO + subl $-8 * SIZE, BO + decl %eax + jne .L82 + ALIGN_4 + +.L85: + movups ALPHA, %xmm3 +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L88 + +.L86: + mulsd -16 * SIZE(BO), %xmm0 + addsd %xmm0, %xmm4 + movsd -15 * SIZE(AO), %xmm0 + + addl $1 * SIZE, AO + addl $1 * SIZE, BO + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + + haddpd %xmm4, %xmm4 + + movsd 0 * SIZE(CO), %xmm0 + movhps 1 * SIZE(CO), %xmm0 + + unpcklpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm0 + + movlps %xmm0, 0 * SIZE(CO) + movhps %xmm0, 1 * SIZE(CO) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_2x4_opteron.S b/kernel/x86/zgemm3m_kernel_2x4_opteron.S new file mode 100644 index 0000000..8e93a28 --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_2x4_opteron.S @@ -0,0 +1,1803 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define OLD_M 4 + STACK + ARGS(%esi) +#define OLD_N 8 + STACK + ARGS(%esi) +#define OLD_K 12 + STACK + ARGS(%esi) +#define OLD_ALPHA_R 16 + STACK + ARGS(%esi) +#define OLD_ALPHA_I 24 + STACK + ARGS(%esi) +#define OLD_A 32 + STACK + ARGS(%esi) +#define OLD_B 36 + STACK + ARGS(%esi) +#define OLD_C 40 + STACK + ARGS(%esi) +#define OLD_LDC 44 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define BX 40(%esp) +#define OLD_STACK 44(%esp) +#define OFFSET 48(%esp) +#define KK 52(%esp) +#define KKK 56(%esp) +#define BUFFER 128(%esp) + +#if defined(OPTERON) || defined(BARCELONA) +#define movsd movlpd +#endif + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHSIZE (8 * 10 + 4) +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE, %esp + andl $-1024, %esp # align stack + + STACK_TOUCHING + + movl OLD_M, %ebx + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + movsd OLD_ALPHA_R, %xmm0 + movhps OLD_ALPHA_I, %xmm0 + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK +#ifdef TRMMKERNEL + movss OLD_OFFT, %xmm4 +#endif + + movl OLD_B, %edi + movl OLD_C, %ebx + movapd %xmm0, ALPHA + + movl %ebx, C + movl OLD_LDC, LDC +#ifdef TRMMKERNEL + movss %xmm4, OFFSET + movss %xmm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + sall $ZBASE_SHIFT, LDC + + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_2 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + movl K, %eax + leal BUFFER, %ecx + sarl $1, %eax + jle .L05 + ALIGN_4 + +.L02: +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(%edi) + + movq 0 * SIZE(%edi), %mm0 + movq 1 * SIZE(%edi), %mm1 + movq 2 * SIZE(%edi), %mm2 + movq 3 * SIZE(%edi), %mm3 + movq 4 * SIZE(%edi), %mm4 + movq 5 * SIZE(%edi), %mm5 + movq 6 * SIZE(%edi), %mm6 + movq 7 * SIZE(%edi), %mm7 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm0, 1 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + movq %mm1, 3 * SIZE(%ecx) + movq %mm2, 4 * SIZE(%ecx) + movq %mm2, 5 * SIZE(%ecx) + movq %mm3, 6 * SIZE(%ecx) + movq %mm3, 7 * SIZE(%ecx) + + movq %mm4, 8 * SIZE(%ecx) + movq %mm4, 9 * SIZE(%ecx) + movq %mm5, 10 * SIZE(%ecx) + movq %mm5, 11 * SIZE(%ecx) + movq %mm6, 12 * SIZE(%ecx) + movq %mm6, 13 * SIZE(%ecx) + movq %mm7, 14 * SIZE(%ecx) + movq %mm7, 15 * SIZE(%ecx) + + addl $ 8 * SIZE, %edi + addl $16 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L05: + movl K, %eax + andl $1, %eax + BRANCH + jle .L10 + + movq 0 * SIZE(%edi), %mm0 + movq 1 * SIZE(%edi), %mm1 + movq 2 * SIZE(%edi), %mm2 + movq 3 * SIZE(%edi), %mm3 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm0, 1 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + movq %mm1, 3 * SIZE(%ecx) + movq %mm2, 4 * SIZE(%ecx) + movq %mm2, 5 * SIZE(%ecx) + movq %mm3, 6 * SIZE(%ecx) + movq %mm3, 7 * SIZE(%ecx) + + addl $4 * SIZE, %edi + ALIGN_4 + +.L10: + movl %edi, BX + + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + + movl BX, %eax + + prefetchnta 0 * SIZE(%eax) + prefetchnta 8 * SIZE(%eax) + + subl $-8 * SIZE, BX + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + + prefetchw 1 * SIZE(%esi) + prefetchw 1 * SIZE(%esi, LDC) + prefetchw 1 * SIZE(%esi, LDC, 2) + prefetchw 1 * SIZE(%esi, %eax) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + +#if 1 + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) + + addl $128 * 4 * SIZE, BB + addl $128 * 1 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB + ALIGN_4 +#else + + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addl $64 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 +#endif + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 8 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhps 3 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 2 * SIZE(%esi) + movhps %xmm1, 3 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 1 * SIZE(%esi, LDC), %xmm0 + movsd 2 * SIZE(%esi, LDC), %xmm1 + movhps 3 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 1 * SIZE(%esi, LDC) + movlps %xmm1, 2 * SIZE(%esi, LDC) + movhps %xmm1, 3 * SIZE(%esi, LDC) + + movsd 0 * SIZE(%esi, LDC, 2), %xmm0 + movhps 1 * SIZE(%esi, LDC, 2), %xmm0 + movsd 2 * SIZE(%esi, LDC, 2), %xmm1 + movhps 3 * SIZE(%esi, LDC, 2), %xmm1 + + pshufd $0x44, %xmm6, %xmm2 + unpckhpd %xmm6, %xmm6 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC, 2) + movhps %xmm0, 1 * SIZE(%esi, LDC, 2) + movlps %xmm1, 2 * SIZE(%esi, LDC, 2) + movhps %xmm1, 3 * SIZE(%esi, LDC, 2) + + movsd 0 * SIZE(%esi, %eax), %xmm0 + movhps 1 * SIZE(%esi, %eax), %xmm0 + movsd 2 * SIZE(%esi, %eax), %xmm1 + movhps 3 * SIZE(%esi, %eax), %xmm1 + + pshufd $0x44, %xmm7, %xmm2 + unpckhpd %xmm7, %xmm7 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm7 + addpd %xmm7, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, %eax) + movhps %xmm0, 1 * SIZE(%esi, %eax) + movlps %xmm1, 2 * SIZE(%esi, %eax) + movhps %xmm1, 3 * SIZE(%esi, %eax) + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L29 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(AA), %xmm0 + movsd 4 * SIZE(AA), %xmm1 + movsd 0 * SIZE(BB), %xmm2 + movsd 8 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movsd 2 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movsd 4 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movsd 1 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm4 + movsd 10 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm5 + movsd 12 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movsd 2 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movsd 18 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movsd 20 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 22 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movsd 32 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movsd 3 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm4 + movsd 26 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm5 + movsd 28 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + mulsd 30 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movsd 40 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movsd 8 * SIZE(AA), %xmm0 +#if defined(OPTERON) || defined(BARCELONA) + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) +#endif + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm4 + movsd 34 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm5 + movsd 36 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + mulsd 38 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movsd 48 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movsd 5 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm4 + movsd 42 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm5 + movsd 44 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + mulsd 46 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movsd 56 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm7 + movsd 6 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm4 + movsd 50 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm5 + movsd 52 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + mulsd 54 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movsd 64 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movsd 7 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm4 + movsd 58 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm5 + movsd 60 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + mulsd 62 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movsd 72 * SIZE(BB), %xmm3 + addl $64 * SIZE, BB + addsd %xmm1, %xmm7 + movsd 12 * SIZE(AA), %xmm1 + addl $8 * SIZE, AA + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + +.L26: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movsd 2 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movsd 4 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movsd 8 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC), %xmm1 + movhps 1 * SIZE(%esi, LDC), %xmm1 + + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 0 * SIZE(%esi, LDC) + movhps %xmm1, 1 * SIZE(%esi, LDC) + + movsd 0 * SIZE(%esi, LDC, 2), %xmm0 + movhps 1 * SIZE(%esi, LDC, 2), %xmm0 + movsd 0 * SIZE(%esi, %eax), %xmm1 + movhps 1 * SIZE(%esi, %eax), %xmm1 + + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 + + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm3, %xmm7 + addpd %xmm7, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC, 2) + movhps %xmm0, 1 * SIZE(%esi, LDC, 2) + movlps %xmm1, 0 * SIZE(%esi, %eax) + movhps %xmm1, 1 * SIZE(%esi, %eax) + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leal (, LDC, 4), %eax + addl %eax, C # c += 4 * ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L30: + testl $2, N + je .L60 + ALIGN_2 + +.L31: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + movl K, %eax + leal BUFFER, %ecx + sarl $2, %eax + jle .L35 + ALIGN_4 + +.L32: +#ifdef PENTIUM4 +#ifdef HAVE_SSE3 + movddup 0 * SIZE(%edi), %xmm0 + movddup 1 * SIZE(%edi), %xmm1 + movddup 2 * SIZE(%edi), %xmm2 + movddup 3 * SIZE(%edi), %xmm3 + movddup 4 * SIZE(%edi), %xmm4 + movddup 5 * SIZE(%edi), %xmm5 + movddup 6 * SIZE(%edi), %xmm6 + movddup 7 * SIZE(%edi), %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) +#else + movsd 0 * SIZE(%edi), %xmm0 + movsd 1 * SIZE(%edi), %xmm1 + movsd 2 * SIZE(%edi), %xmm2 + movsd 3 * SIZE(%edi), %xmm3 + movsd 4 * SIZE(%edi), %xmm4 + movsd 5 * SIZE(%edi), %xmm5 + movsd 6 * SIZE(%edi), %xmm6 + movsd 7 * SIZE(%edi), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpckhpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpckhpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpckhpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpckhpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) +#endif + prefetcht0 80 * SIZE(%edi) + prefetcht1 112 * SIZE(%ecx) +#endif + +#if defined(OPTERON) || defined(BARCELONA) +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(%edi) + + movq 0 * SIZE(%edi), %mm0 + movq 1 * SIZE(%edi), %mm1 + movq 2 * SIZE(%edi), %mm2 + movq 3 * SIZE(%edi), %mm3 + movq 4 * SIZE(%edi), %mm4 + movq 5 * SIZE(%edi), %mm5 + movq 6 * SIZE(%edi), %mm6 + movq 7 * SIZE(%edi), %mm7 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm0, 1 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + movq %mm1, 3 * SIZE(%ecx) + movq %mm2, 4 * SIZE(%ecx) + movq %mm2, 5 * SIZE(%ecx) + movq %mm3, 6 * SIZE(%ecx) + movq %mm3, 7 * SIZE(%ecx) + + movq %mm4, 8 * SIZE(%ecx) + movq %mm4, 9 * SIZE(%ecx) + movq %mm5, 10 * SIZE(%ecx) + movq %mm5, 11 * SIZE(%ecx) + movq %mm6, 12 * SIZE(%ecx) + movq %mm6, 13 * SIZE(%ecx) + movq %mm7, 14 * SIZE(%ecx) + movq %mm7, 15 * SIZE(%ecx) +#endif + addl $ 8 * SIZE, %edi + addl $16 * SIZE, %ecx + decl %eax + jne .L32 + ALIGN_2 + +.L35: + movl K, %eax + andl $3, %eax + BRANCH + jle .L40 + ALIGN_2 + +.L36: +#ifdef PENTIUM4 +#ifdef HAVE_SSE3 + movddup 0 * SIZE(%edi), %xmm0 + movddup 1 * SIZE(%edi), %xmm1 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) +#else + movsd 0 * SIZE(%edi), %xmm0 + movsd 1 * SIZE(%edi), %xmm1 + + unpcklpd %xmm0, %xmm0 + unpckhpd %xmm1, %xmm1 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) +#endif +#endif + +#if defined(OPTERON) || defined(BARCELONA) + movq 0 * SIZE(%edi), %mm0 + movq 1 * SIZE(%edi), %mm1 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm0, 1 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + movq %mm1, 3 * SIZE(%ecx) +#endif + addl $2 * SIZE, %edi + addl $4 * SIZE, %ecx + decl %eax + jne .L36 + ALIGN_4 + +.L40: + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + +#ifdef HAVE_3DNOW + prefetchw 2 * SIZE(%esi) + prefetchw 2 * SIZE(%esi, LDC) +#endif + +#ifdef PENTIUM4 + prefetchnta 4 * SIZE(%esi) + prefetchnta 4 * SIZE(%esi, LDC) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + mulpd %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) +#endif + mulpd %xmm1, %xmm2 + mulpd 18 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movapd 10 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm2 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movapd 12 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm3 + mulpd 26 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 14 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm3 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhps 3 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 2 * SIZE(%esi) + movhps %xmm1, 3 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 1 * SIZE(%esi, LDC), %xmm0 + movsd 2 * SIZE(%esi, LDC), %xmm1 + movhps 3 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 1 * SIZE(%esi, LDC) + movlps %xmm1, 2 * SIZE(%esi, LDC) + movhps %xmm1, 3 * SIZE(%esi, LDC) + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L41 + ALIGN_4 + +.L50: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L59 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(AA), %xmm0 + movsd 4 * SIZE(AA), %xmm1 + movsd 0 * SIZE(BB), %xmm2 + movsd 8 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulsd %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movsd 2 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm3 + mulsd 10 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd 12 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd 3 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movsd 8 * SIZE(AA), %xmm0 + + mulsd %xmm1, %xmm2 + mulsd 18 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movsd 20 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movsd 5 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm2 + mulsd 22 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movsd 32 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movsd 6 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm3 + mulsd 26 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movsd 28 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movsd 7 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm3 + mulsd 30 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movsd 40 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm7 + movsd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + +.L56: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC), %xmm1 + movhps 1 * SIZE(%esi, LDC), %xmm1 + + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 0 * SIZE(%esi, LDC) + movhps %xmm1, 1 * SIZE(%esi, LDC) + + ALIGN_4 + +.L59: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + addl %eax, C # c += 4 * ldc + ALIGN_4 + +.L60: + testl $1, N + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + leal BUFFER, %ecx + sarl $3, %eax + jle .L65 + ALIGN_4 + +.L62: +#ifdef PENTIUM4 +#ifdef HAVE_SSE3 + movddup 0 * SIZE(%edi), %xmm0 + movddup 1 * SIZE(%edi), %xmm1 + movddup 2 * SIZE(%edi), %xmm2 + movddup 3 * SIZE(%edi), %xmm3 + movddup 4 * SIZE(%edi), %xmm4 + movddup 5 * SIZE(%edi), %xmm5 + movddup 6 * SIZE(%edi), %xmm6 + movddup 7 * SIZE(%edi), %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) +#else + movsd 0 * SIZE(%edi), %xmm0 + movsd 1 * SIZE(%edi), %xmm1 + movsd 2 * SIZE(%edi), %xmm2 + movsd 3 * SIZE(%edi), %xmm3 + movsd 4 * SIZE(%edi), %xmm4 + movsd 5 * SIZE(%edi), %xmm5 + movsd 6 * SIZE(%edi), %xmm6 + movsd 7 * SIZE(%edi), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpckhpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpckhpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpckhpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpckhpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) +#endif + prefetcht1 80 * SIZE(%edi) + prefetcht0 112 * SIZE(%ecx) +#endif + +#if defined(OPTERON) || defined(BARCELONA) +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(%edi) + + movq 0 * SIZE(%edi), %mm0 + movq 1 * SIZE(%edi), %mm1 + movq 2 * SIZE(%edi), %mm2 + movq 3 * SIZE(%edi), %mm3 + movq 4 * SIZE(%edi), %mm4 + movq 5 * SIZE(%edi), %mm5 + movq 6 * SIZE(%edi), %mm6 + movq 7 * SIZE(%edi), %mm7 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm0, 1 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + movq %mm1, 3 * SIZE(%ecx) + movq %mm2, 4 * SIZE(%ecx) + movq %mm2, 5 * SIZE(%ecx) + movq %mm3, 6 * SIZE(%ecx) + movq %mm3, 7 * SIZE(%ecx) + + movq %mm4, 8 * SIZE(%ecx) + movq %mm4, 9 * SIZE(%ecx) + movq %mm5, 10 * SIZE(%ecx) + movq %mm5, 11 * SIZE(%ecx) + movq %mm6, 12 * SIZE(%ecx) + movq %mm6, 13 * SIZE(%ecx) + movq %mm7, 14 * SIZE(%ecx) + movq %mm7, 15 * SIZE(%ecx) +#endif + addl $ 8 * SIZE, %edi + addl $16 * SIZE, %ecx + decl %eax + jne .L62 + ALIGN_2 + +.L65: + movl K, %eax + andl $7, %eax + BRANCH + jle .L70 + ALIGN_2 + +.L66: +#ifdef PENTIUM4 +#ifdef HAVE_SSE3 + movddup 0 * SIZE(%edi), %xmm0 + movapd %xmm0, 0 * SIZE(%ecx) +#else + movsd 0 * SIZE(%edi), %xmm0 + unpcklpd %xmm0, %xmm0 + movapd %xmm0, 0 * SIZE(%ecx) +#endif +#endif + +#if defined(OPTERON) || defined(BARCELONA) + movq 0 * SIZE(%edi), %mm0 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm0, 1 * SIZE(%ecx) +#endif + addl $1 * SIZE, %edi + addl $2 * SIZE, %ecx + decl %eax + jne .L66 + ALIGN_4 + +.L70: + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L80 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + +#ifdef HAVE_3DNOW + prefetchw 2 * SIZE(%esi) +#endif + +#ifdef PENTIUM4 + prefetchnta 2 * SIZE(%esi) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movapd 16 * SIZE(BB), %xmm2 + + movapd 2 * SIZE(AA), %xmm0 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + + movapd 16 * SIZE(AA), %xmm0 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) +#endif + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movapd 24 * SIZE(BB), %xmm3 + + movapd 10 * SIZE(AA), %xmm1 + mulpd 10 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(AA), %xmm0 + movapd 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhps 3 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 2 * SIZE(%esi) + movhps %xmm1, 3 * SIZE(%esi) + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L71 + ALIGN_4 + +.L80: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(AA), %xmm0 + movsd 4 * SIZE(AA), %xmm1 + movsd 0 * SIZE(BB), %xmm2 + movsd 8 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L85 + ALIGN_4 + +.L82: + mulsd %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movsd 1 * SIZE(AA), %xmm0 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 2 * SIZE(AA), %xmm0 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm6 + movsd 3 * SIZE(AA), %xmm0 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm7 + movsd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm3 + movsd 5 * SIZE(AA), %xmm1 + mulsd 10 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movsd 6 * SIZE(AA), %xmm1 + mulsd 12 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm6 + movsd 7 * SIZE(AA), %xmm1 + mulsd 14 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm7 + movsd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L88 + +.L86: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movsd 2 * SIZE(BB), %xmm2 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addsd %xmm5, %xmm4 + addsd %xmm7, %xmm6 + addsd %xmm6, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + ALIGN_4 + +.L999: + movl OLD_STACK, %esp + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_2x4_penryn.S b/kernel/x86/zgemm3m_kernel_2x4_penryn.S new file mode 100644 index 0000000..3920649 --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_2x4_penryn.S @@ -0,0 +1,1344 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define C1 %esi +#define I %ebx + +#ifdef NANO +#define PREFETCHSIZE (8 * 3 + 4) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + +#ifndef PREFETCH +#define PREFETCH prefetcht0 +#endif + +#ifndef PREFETCHW +#define PREFETCHW prefetcht2 +#endif + +#ifndef PREFETCHB +#define PREFETCHB prefetcht2 +#endif + +#ifndef PREFETCHSIZE +#define PREFETCHSIZE (8 * 21 + 4) +#endif + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax +#ifndef LEFT + negl %eax +#endif + movl %eax, KK +#endif + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + + sall $ZBASE_SHIFT, LDC + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sall $BASE_SHIFT + 2, %eax + leal (B, %eax), %eax + movl %eax, BX + + movl C, C1 + movl A, AA + + movl M, I + sarl $1, I + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + movl BX, %eax + PREFETCHB -16 * SIZE(%eax) + subl $-8 * SIZE, BX + + leal (C1, LDC, 2), %eax + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + PREFETCHW 1 * SIZE(C1) + pxor %xmm5, %xmm5 + PREFETCHW 1 * SIZE(C1, LDC) + pxor %xmm6, %xmm6 + PREFETCHW 1 * SIZE(%eax) + pxor %xmm7, %xmm7 + PREFETCHW 1 * SIZE(%eax, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addpd %xmm3, %xmm7 + movaps -14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -10 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -6 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -2 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 0 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + addpd %xmm3, %xmm7 + movaps 2 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 6 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 10 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 16 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + subl $-32 * SIZE, BB + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + + subl $1, %eax + BRANCH + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + addpd %xmm3, %xmm7 + movaps -14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + + movaps -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + addpd %xmm2, %xmm6 + addpd %xmm3, %xmm7 + + movups ALPHA, %xmm3 + + movaps %xmm4, %xmm0 + movsd %xmm5, %xmm4 + movsd %xmm0, %xmm5 + + movaps %xmm6, %xmm0 + movsd %xmm7, %xmm6 + movsd %xmm0, %xmm7 + + leal (C1, LDC, 2), %eax + + movsd 0 * SIZE(C1), %xmm0 + movhps 1 * SIZE(C1), %xmm0 + movsd 2 * SIZE(C1), %xmm1 + movhps 3 * SIZE(C1), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(C1) + movhps %xmm0, 1 * SIZE(C1) + movlps %xmm1, 2 * SIZE(C1) + movhps %xmm1, 3 * SIZE(C1) + + movsd 0 * SIZE(C1, LDC), %xmm0 + movhps 1 * SIZE(C1, LDC), %xmm0 + movsd 2 * SIZE(C1, LDC), %xmm1 + movhps 3 * SIZE(C1, LDC), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(C1, LDC) + movhps %xmm0, 1 * SIZE(C1, LDC) + movlps %xmm1, 2 * SIZE(C1, LDC) + movhps %xmm1, 3 * SIZE(C1, LDC) + + movsd 0 * SIZE(%eax), %xmm0 + movhps 1 * SIZE(%eax), %xmm0 + movsd 2 * SIZE(%eax), %xmm1 + movhps 3 * SIZE(%eax), %xmm1 + + pshufd $0x44, %xmm6, %xmm2 + unpckhpd %xmm6, %xmm6 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm1 + + movlps %xmm0, 0 * SIZE(%eax) + movhps %xmm0, 1 * SIZE(%eax) + movlps %xmm1, 2 * SIZE(%eax) + movhps %xmm1, 3 * SIZE(%eax) + + movsd 0 * SIZE(%eax, LDC), %xmm0 + movhps 1 * SIZE(%eax, LDC), %xmm0 + movsd 2 * SIZE(%eax, LDC), %xmm1 + movhps 3 * SIZE(%eax, LDC), %xmm1 + + pshufd $0x44, %xmm7, %xmm2 + unpckhpd %xmm7, %xmm7 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm7 + addpd %xmm7, %xmm1 + + movlps %xmm0, 0 * SIZE(%eax, LDC) + movhps %xmm0, 1 * SIZE(%eax, LDC) + movlps %xmm1, 2 * SIZE(%eax, LDC) + movhps %xmm1, 3 * SIZE(%eax, LDC) + + addl $4 * SIZE, C1 + decl I + jg .L11 + ALIGN_4 + +.L20: + movl M, I + testl $1, I + jle .L29 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 4), BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + movaps -14 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -10 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -14 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps -8 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps -6 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -4 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -2 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -12 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 0 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 2 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps 6 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -10 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 8 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 10 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps 12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps 14 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -8 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 18 * SIZE(BB), %xmm3 + + subl $ -8 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L28 + ALIGN_4 + +.L26: + pshufd $0x44, %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -10 * SIZE(BB), %xmm3 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + movups ALPHA, %xmm3 + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + leal (C1, LDC, 2), %eax + + movsd 0 * SIZE(C1), %xmm0 + movhps 1 * SIZE(C1), %xmm0 + movsd 0 * SIZE(C1, LDC), %xmm1 + movhps 1 * SIZE(C1, LDC), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(C1) + movhps %xmm0, 1 * SIZE(C1) + movlps %xmm1, 0 * SIZE(C1, LDC) + movhps %xmm1, 1 * SIZE(C1, LDC) + + movsd 0 * SIZE(%eax), %xmm0 + movhps 1 * SIZE(%eax), %xmm0 + movsd 0 * SIZE(%eax, LDC), %xmm1 + movhps 1 * SIZE(%eax, LDC), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%eax) + movhps %xmm0, 1 * SIZE(%eax) + movlps %xmm1, 0 * SIZE(%eax, LDC) + movhps %xmm1, 1 * SIZE(%eax, LDC) + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + movl BB, B + + leal (, LDC, 4), %eax + addl %eax, C + decl J + jg .L01 + ALIGN_4 + +.L30: + movl N, %eax + testl $2, %eax + jle .L50 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, C1 + movl A, AA + + movl M, I + sarl $1, I + jle .L40 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + PREFETCHW 1 * SIZE(C1) + pxor %xmm6, %xmm6 + PREFETCHW 1 * SIZE(C1, LDC) + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -14 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -10 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -6 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -2 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps 0 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L38 + ALIGN_4 + +.L36: + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -14 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + movups ALPHA, %xmm3 + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + movaps %xmm4, %xmm0 + movsd %xmm5, %xmm4 + movsd %xmm0, %xmm5 + + movsd 0 * SIZE(C1), %xmm0 + movhps 1 * SIZE(C1), %xmm0 + movsd 2 * SIZE(C1), %xmm1 + movhps 3 * SIZE(C1), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(C1) + movhps %xmm0, 1 * SIZE(C1) + movlps %xmm1, 2 * SIZE(C1) + movhps %xmm1, 3 * SIZE(C1) + + movsd 0 * SIZE(C1, LDC), %xmm0 + movhps 1 * SIZE(C1, LDC), %xmm0 + movsd 2 * SIZE(C1, LDC), %xmm1 + movhps 3 * SIZE(C1, LDC), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(C1, LDC) + movhps %xmm0, 1 * SIZE(C1, LDC) + movlps %xmm1, 2 * SIZE(C1, LDC) + movhps %xmm1, 3 * SIZE(C1, LDC) + + addl $4 * SIZE, C1 + decl I + jg .L31 + ALIGN_4 + +.L40: + movl M, I + testl $1, I + jle .L49 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 2), BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -14 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -12 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -10 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -12 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -8 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -6 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -10 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -4 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -2 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -8 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L48 + ALIGN_4 + +.L46: + pshufd $0x44, %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + movups ALPHA, %xmm3 + + addpd %xmm5, %xmm4 + + movsd 0 * SIZE(C1), %xmm0 + movhps 1 * SIZE(C1), %xmm0 + movsd 0 * SIZE(C1, LDC), %xmm1 + movhps 1 * SIZE(C1, LDC), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(C1) + movhps %xmm0, 1 * SIZE(C1) + movlps %xmm1, 0 * SIZE(C1, LDC) + movhps %xmm1, 1 * SIZE(C1, LDC) + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movl BB, B + + leal (, LDC, 2), %eax + addl %eax, C + ALIGN_4 + +.L50: + movl N, %eax + testl $1, %eax + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, C1 + movl A, AA + + movl M, I + sarl $1, I + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + PREFETCHW 1 * SIZE(C1) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -14 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -12 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -10 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -8 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + subl $-16 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L58 + ALIGN_4 + +.L56: + pshufd $0x44, %xmm1, %xmm2 + movsd -15 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + movups ALPHA, %xmm3 + + addpd %xmm5, %xmm4 + + movsd 0 * SIZE(C1), %xmm0 + movhps 1 * SIZE(C1), %xmm0 + movsd 2 * SIZE(C1), %xmm1 + movhps 3 * SIZE(C1), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(C1) + movhps %xmm0, 1 * SIZE(C1) + movlps %xmm1, 2 * SIZE(C1) + movhps %xmm1, 3 * SIZE(C1) + + addl $4 * SIZE, C1 + decl I + jg .L51 + ALIGN_4 + +.L60: + movl M, I + testl $1, I + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movaps -12 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movaps -10 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movaps -8 * SIZE(BB), %xmm2 + + subl $-8 * SIZE, AA + subl $-8 * SIZE, BB + + subl $1, %eax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulsd %xmm0, %xmm2 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd -15 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + movups ALPHA, %xmm3 + + addpd %xmm5, %xmm4 + + haddpd %xmm4, %xmm4 + + movsd 0 * SIZE(C1), %xmm0 + movhps 1 * SIZE(C1), %xmm0 + + pshufd $0x44, %xmm4, %xmm2 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(C1) + movhps %xmm0, 1 * SIZE(C1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_2x4_prescott.S b/kernel/x86/zgemm3m_kernel_2x4_prescott.S new file mode 100644 index 0000000..a32e0ae --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_2x4_prescott.S @@ -0,0 +1,1590 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#if defined(PENTIUM4) || defined(PENTIUMM) +#define PREFETCH_R (8 * 4) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addpd %xmm2, %xmm4; \ + movddup 1 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movddup 2 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 3 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + addpd %xmm2, %xmm7; \ + movddup 4 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 5 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movddup 6 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 7 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + addpd %xmm2, %xmm7; \ + movddup 16 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL3(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 9 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movddup 10 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 11 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + addpd %xmm3, %xmm7; \ + movddup 12 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL4(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 13 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movddup 14 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 15 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + addpd %xmm3, %xmm7; \ + movddup 24 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL5(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 17 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movddup 18 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 19 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + addpd %xmm2, %xmm7 + +#define KERNEL6(address) \ + movddup 20 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 21 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movddup 22 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 23 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + addpd %xmm2, %xmm7; \ + movddup 32 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL7(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 25 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movddup 26 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 27 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + addpd %xmm3, %xmm7; \ + movddup 28 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 29 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movddup 30 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 31 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + addpd %xmm3, %xmm7; \ + movddup 40 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax +#ifndef LEFT + negl %eax +#endif + movl %eax, KK +#endif + + sall $ZBASE_SHIFT, LDC + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_2 + +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sall $BASE_SHIFT + 2, %eax + leal (B, %eax), %eax + movl %eax, BX + + movl C, %esi # coffset = c + movl A, AA # aoffset = a + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), BB +#endif + + movl BX, %eax + prefetcht2 0 * SIZE(%eax) + subl $-4 * SIZE, BX + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + +#ifdef PENTIUM4 + prefetchnta 3 * SIZE(%esi) + prefetchnta 3 * SIZE(%esi, LDC, 1) + prefetchnta 3 * SIZE(%esi, LDC, 2) + prefetchnta 3 * SIZE(%esi, %eax, 1) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + +#ifdef CORE_PRESCOTT + andl $-8, %eax + sall $4, %eax + je .L15 + +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) +#if 1 + cmpl $128 * 8, %eax + jle .L12 + KERNEL1(16 * 8) + KERNEL2(16 * 8) + KERNEL3(16 * 8) + KERNEL4(16 * 8) + KERNEL5(16 * 8) + KERNEL6(16 * 8) + KERNEL7(16 * 8) + KERNEL8(16 * 8) + cmpl $128 * 9, %eax + jle .L12 + KERNEL1(16 * 9) + KERNEL2(16 * 9) + KERNEL3(16 * 9) + KERNEL4(16 * 9) + KERNEL5(16 * 9) + KERNEL6(16 * 9) + KERNEL7(16 * 9) + KERNEL8(16 * 9) + cmpl $128 * 10, %eax + jle .L12 + KERNEL1(16 * 10) + KERNEL2(16 * 10) + KERNEL3(16 * 10) + KERNEL4(16 * 10) + KERNEL5(16 * 10) + KERNEL6(16 * 10) + KERNEL7(16 * 10) + KERNEL8(16 * 10) + cmpl $128 * 11, %eax + jle .L12 + KERNEL1(16 * 11) + KERNEL2(16 * 11) + KERNEL3(16 * 11) + KERNEL4(16 * 11) + KERNEL5(16 * 11) + KERNEL6(16 * 11) + KERNEL7(16 * 11) + KERNEL8(16 * 11) + cmpl $128 * 12, %eax + jle .L12 + KERNEL1(16 * 12) + KERNEL2(16 * 12) + KERNEL3(16 * 12) + KERNEL4(16 * 12) + KERNEL5(16 * 12) + KERNEL6(16 * 12) + KERNEL7(16 * 12) + KERNEL8(16 * 12) + cmpl $128 * 13, %eax + jle .L12 + KERNEL1(16 * 13) + KERNEL2(16 * 13) + KERNEL3(16 * 13) + KERNEL4(16 * 13) + KERNEL5(16 * 13) + KERNEL6(16 * 13) + KERNEL7(16 * 13) + KERNEL8(16 * 13) + cmpl $128 * 14, %eax + jle .L12 + KERNEL1(16 * 14) + KERNEL2(16 * 14) + KERNEL3(16 * 14) + KERNEL4(16 * 14) + KERNEL5(16 * 14) + KERNEL6(16 * 14) + KERNEL7(16 * 14) + KERNEL8(16 * 14) + cmpl $128 * 15, %eax + jle .L12 + KERNEL1(16 * 15) + KERNEL2(16 * 15) + KERNEL3(16 * 15) + KERNEL4(16 * 15) + KERNEL5(16 * 15) + KERNEL6(16 * 15) + KERNEL7(16 * 15) + KERNEL8(16 * 15) +#else + addl $32 * 4 * SIZE, AA + addl $32 * 8 * SIZE, BB + subl $128 * 8, %eax + jg .L1X +#endif + +.L12: + leal (AA, %eax, 1), AA # * 16 + leal (BB, %eax, 2), BB # * 64 + +#else + + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 5 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 6 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 7 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 16 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm4 + movddup 9 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm5 + movddup 10 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm6 + movddup 11 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + movapd 6 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm4 + movddup 13 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm5 + movddup 14 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm6 + movddup 15 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movddup 24 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm4 + movddup 17 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm5 + movddup 18 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm6 + movddup 19 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movddup 20 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm4 + movddup 21 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm5 + movddup 22 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm6 + movddup 23 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movddup 32 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 25 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movddup 26 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 27 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 14 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 28 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 29 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movddup 30 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 31 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 40 * SIZE(BB), %xmm3 + + addl $32 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 +#endif + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movsd 0 + ALPHA, %xmm3 + movhps 8 + ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhps 3 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 2 * SIZE(%esi) + movhps %xmm1, 3 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 1 * SIZE(%esi, LDC), %xmm0 + movsd 2 * SIZE(%esi, LDC), %xmm1 + movhps 3 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 1 * SIZE(%esi, LDC) + movlps %xmm1, 2 * SIZE(%esi, LDC) + movhps %xmm1, 3 * SIZE(%esi, LDC) + + movsd 0 * SIZE(%esi, LDC, 2), %xmm0 + movhps 1 * SIZE(%esi, LDC, 2), %xmm0 + movsd 2 * SIZE(%esi, LDC, 2), %xmm1 + movhps 3 * SIZE(%esi, LDC, 2), %xmm1 + + pshufd $0x44, %xmm6, %xmm2 + unpckhpd %xmm6, %xmm6 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC, 2) + movhps %xmm0, 1 * SIZE(%esi, LDC, 2) + movlps %xmm1, 2 * SIZE(%esi, LDC, 2) + movhps %xmm1, 3 * SIZE(%esi, LDC, 2) + + movsd 0 * SIZE(%esi, %eax), %xmm0 + movhps 1 * SIZE(%esi, %eax), %xmm0 + movsd 2 * SIZE(%esi, %eax), %xmm1 + movhps 3 * SIZE(%esi, %eax), %xmm1 + + pshufd $0x44, %xmm7, %xmm2 + unpckhpd %xmm7, %xmm7 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm7 + addpd %xmm7, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, %eax) + movhps %xmm0, 1 * SIZE(%esi, %eax) + movlps %xmm1, 2 * SIZE(%esi, %eax) + movhps %xmm1, 3 * SIZE(%esi, %eax) + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L11 + ALIGN_3 + +.L20: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L29 + + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 4), BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 1 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movddup 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 3 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movddup 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 18 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 5 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 22 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movddup 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 26 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 7 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 30 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movddup 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd 34 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 36 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 9 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 38 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 48 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movddup 10 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 42 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 44 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 11 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 46 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 56 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movddup 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 50 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 52 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 13 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 54 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 64 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movddup 14 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 58 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 60 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 15 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 62 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 72 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movddup 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movsd 0 + ALPHA, %xmm3 + movhps 8 + ALPHA, %xmm3 + andl $15, %eax # if (k & 1) + BRANCH + je .L28 + +.L26: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + + decl %eax + jg .L26 + ALIGN_4 + +.L28: + leal (%esi, LDC, 1), %eax + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC), %xmm1 + movhps 1 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 0 * SIZE(%esi, LDC) + movhps %xmm1, 1 * SIZE(%esi, LDC) + + movsd 0 * SIZE(%esi, LDC, 2), %xmm0 + movhps 1 * SIZE(%esi, LDC, 2), %xmm0 + movsd 0 * SIZE(%esi, %eax), %xmm1 + movhps 1 * SIZE(%esi, %eax), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC, 2) + movhps %xmm0, 1 * SIZE(%esi, LDC, 2) + movlps %xmm1, 0 * SIZE(%esi, %eax) + movhps %xmm1, 1 * SIZE(%esi, %eax) + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leal (, LDC, 4), %eax + movl BB, B + addl %eax, C # c += 4 * ldc + decl J # j -- + jg .L10 + ALIGN_4 + +.L30: + testl $2, N + je .L60 + + movl C, %esi # coffset = c + movl A, AA # aoffset = a + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef HAVE_3DNOW + prefetchw 2 * SIZE(%esi) + prefetchw 2 * SIZE(%esi, LDC) +#endif + +#ifdef PENTIUM4 + prefetchnta 3 * SIZE(%esi) + prefetchnta 3 * SIZE(%esi, LDC) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 5 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 6 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 6 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 7 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 16 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 9 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm5 + movddup 10 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 11 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 13 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 14 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm5 + movddup 14 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 15 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 24 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movsd 0 + ALPHA, %xmm3 + movhps 8 + ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhps 3 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 2 * SIZE(%esi) + movhps %xmm1, 3 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 1 * SIZE(%esi, LDC), %xmm0 + movsd 2 * SIZE(%esi, LDC), %xmm1 + movhps 3 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 1 * SIZE(%esi, LDC) + movlps %xmm1, 2 * SIZE(%esi, LDC) + movhps %xmm1, 3 * SIZE(%esi, LDC) + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L41 + ALIGN_4 + +.L50: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L59 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + movddup 1 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 2 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movddup 3 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movddup 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movddup 5 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm0 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 6 * SIZE(AA), %xmm0 + mulpd 12 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movddup 7 * SIZE(AA), %xmm0 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movddup 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + movddup 9 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm4 + mulpd 18 * SIZE(BB), %xmm1 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 10 * SIZE(AA), %xmm1 + mulpd 20 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movddup 11 * SIZE(AA), %xmm1 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movddup 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + movddup 13 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 26 * SIZE(BB), %xmm1 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 14 * SIZE(AA), %xmm1 + mulpd 28 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movddup 15 * SIZE(AA), %xmm1 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movddup 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movsd 0 + ALPHA, %xmm3 + movhps 8 + ALPHA, %xmm3 + andl $15, %eax # if (k & 1) + BRANCH + je .L58 + +.L56: + mulpd %xmm0, %xmm2 + movddup 1 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC), %xmm1 + movhps 1 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 0 * SIZE(%esi, LDC) + movhps %xmm1, 1 * SIZE(%esi, LDC) + ALIGN_4 + +.L59: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + movl BB, B + addl %eax, C # c += 4 * ldc + ALIGN_4 + +.L60: + testl $1, N + je .L999 + + movl C, %esi # coffset = c + movl A, AA # aoffset = a + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L80 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 4 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef PENTIUM4 + prefetchnta 3 * SIZE(%esi) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm2, %xmm0 + movddup 1 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm4 + movapd 16 * SIZE(AA), %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd 4 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd 6 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm7 + movddup 8 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm1 + movddup 5 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm4 + movapd 24 * SIZE(AA), %xmm1 + mulpd 10 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm5 + movddup 6 * SIZE(BB), %xmm3 + mulpd 12 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm6 + movddup 7 * SIZE(BB), %xmm3 + mulpd 14 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movsd 0 + ALPHA, %xmm3 + movhps 8 + ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + mulpd %xmm2, %xmm0 + movddup 1 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm4 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhps 3 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 2 * SIZE(%esi) + movhps %xmm1, 3 * SIZE(%esi) + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L71 + ALIGN_4 + +.L80: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + je .L85 + ALIGN_4 + +.L82: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm1 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movsd 0 + ALPHA, %xmm3 + movhps 8 + ALPHA, %xmm3 + andl $15, %eax # if (k & 1) + BRANCH + je .L88 + +.L86: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + + haddpd %xmm4, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_4x2_core2.S b/kernel/x86/zgemm3m_kernel_4x2_core2.S new file mode 100644 index 0000000..0c01de8 --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_4x2_core2.S @@ -0,0 +1,1328 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define OLD_M 4 + STACK + ARGS(%esi) +#define OLD_N 8 + STACK + ARGS(%esi) +#define OLD_K 12 + STACK + ARGS(%esi) +#define OLD_ALPHA_R 16 + STACK + ARGS(%esi) +#define OLD_ALPHA_I 24 + STACK + ARGS(%esi) +#define OLD_A 32 + STACK + ARGS(%esi) +#define OLD_B 36 + STACK + ARGS(%esi) +#define OLD_C 40 + STACK + ARGS(%esi) +#define OLD_LDC 44 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define BX 40(%esp) +#define OLD_STACK 44(%esp) +#define OFFSET 48(%esp) +#define KK 52(%esp) +#define KKK 56(%esp) +#define BUFFER 256(%esp) + +#define PREFETCH_R (8 * 16 + 0) +#define PREFETCH_W (PREFETCH_R * 2) + +#define PREFETCHSIZE (8 * 7 + 4) +#define PREFETCH prefetcht0 + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define C1 %esi +#define I %ebx + + PROLOGUE + PROFCODE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + movl %esp, %esi # save old stack + + subl $512 + LOCAL_BUFFER_SIZE, %esp + andl $-4096, %esp # align stack + + STACK_TOUCHING + + movl OLD_M, %ebx + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + movsd OLD_ALPHA_R, %xmm0 + movhps OLD_ALPHA_I, %xmm0 + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl OLD_B, B + movl OLD_C, %ebx + + movaps %xmm0, ALPHA + movl %ebx, C + movl OLD_LDC, LDC + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + + sall $ZBASE_SHIFT, LDC + + sarl $1, %eax + movl %eax, J + jle .L40 + ALIGN_4 + +.L01: + leal 16 * SIZE + BUFFER, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sarl $2, %eax + jle .L05 + ALIGN_4 + +.L02: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + movapd %xmm2, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L02 + ALIGN_4 + +.L05: + movl K, %eax + andl $3, %eax + BRANCH + jle .L10 + ALIGN_4 + +.L06: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + addl $2 * SIZE, B + addl $4 * SIZE, BB + decl %eax + jne .L06 + ALIGN_4 + +.L10: + movl B, BX + + movl C, C1 + movl A, AA + movl M, I + sarl $2, I + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal 16 * SIZE + BUFFER, BB +#else + leal 16 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm3 + pxor %xmm6, %xmm6 + prefetcht0 3 * SIZE(C1) + pxor %xmm7, %xmm7 + prefetcht0 3 * SIZE(C1, LDC) + movapd %xmm1, %xmm2 + + movl BX, %eax + prefetcht0 (%eax) + subl $-8 * SIZE, %eax + movl %eax, BX + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movapd -12 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm7 + PADDING; + movapd %xmm2, %xmm1 + + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd -10 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm0 + addpd %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm6 + movapd -8 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + PADDING; + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + PADDING; + movapd %xmm1, %xmm2 + + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd -6 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd -6 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm6 + movapd -4 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm1 + movapd -4 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm7 + PADDING; + movapd %xmm2, %xmm1 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm4 + movapd -2 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm5 + movapd -2 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm6 + PADDING; + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm3, %xmm2 + movapd 8 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm7 + PADDING; + movapd %xmm1, %xmm2 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm1 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm7 + PADDING; + movapd %xmm2, %xmm1 + + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 6 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm0 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm6 + movapd 8 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + PADDING; + movapd %xmm1, %xmm2 + + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd 10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd 10 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm6 + movapd 12 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm1 + movapd 12 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm7 + PADDING; + movapd %xmm2, %xmm1 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm4 + movapd 14 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm3 + subl $-32 * SIZE, BB + addpd %xmm3, %xmm5 + movapd 14 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm6 + movapd -16 * SIZE(BB), %xmm1 + mulpd %xmm3, %xmm2 + movapd 24 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm7 + PADDING; + movapd %xmm1, %xmm2 + + subl $-32 * SIZE, AA + decl %eax + BRANCH + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + movapd -12 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhps 3 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 2 * SIZE(%esi) + movhps %xmm1, 3 * SIZE(%esi) + + movsd 4 * SIZE(%esi), %xmm0 + movhps 5 * SIZE(%esi), %xmm0 + movsd 6 * SIZE(%esi), %xmm1 + movhps 7 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm6, %xmm2 + unpckhpd %xmm6, %xmm6 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm1 + + movlps %xmm0, 4 * SIZE(%esi) + movhps %xmm0, 5 * SIZE(%esi) + movlps %xmm1, 6 * SIZE(%esi) + movhps %xmm1, 7 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 1 * SIZE(%esi, LDC), %xmm0 + movsd 2 * SIZE(%esi, LDC), %xmm1 + movhps 3 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 1 * SIZE(%esi, LDC) + movlps %xmm1, 2 * SIZE(%esi, LDC) + movhps %xmm1, 3 * SIZE(%esi, LDC) + + movsd 4 * SIZE(%esi, LDC), %xmm0 + movhps 5 * SIZE(%esi, LDC), %xmm0 + movsd 6 * SIZE(%esi, LDC), %xmm1 + movhps 7 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x44, %xmm7, %xmm2 + unpckhpd %xmm7, %xmm7 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm7 + addpd %xmm7, %xmm1 + + movlps %xmm0, 4 * SIZE(%esi, LDC) + movhps %xmm0, 5 * SIZE(%esi, LDC) + movlps %xmm1, 6 * SIZE(%esi, LDC) + movhps %xmm1, 7 * SIZE(%esi, LDC) + + addl $8 * SIZE, C1 + decl I + jg .L11 + ALIGN_4 + +.L20: + movl M, I + testl $2, I + jle .L30 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 16 * SIZE + BUFFER, BB +#else + leal 16 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movapd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax + addl $2, %eax + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm6 + movapd 0 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm7 + movapd -12 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -6 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd -4 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -2 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 0 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm1 + mulpd 2 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm4 + movapd 4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm5 + movapd -6 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 6 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm6 + movapd 16 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm7 + movapd -4 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 10 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm5 + movapd -2 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 14 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm7 + movapd 8 * SIZE(AA), %xmm2 + + subl $-16 * SIZE, AA + addl $ 32 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhps 3 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 2 * SIZE(%esi) + movhps %xmm1, 3 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 1 * SIZE(%esi, LDC), %xmm0 + movsd 2 * SIZE(%esi, LDC), %xmm1 + movhps 3 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 1 * SIZE(%esi, LDC) + movlps %xmm1, 2 * SIZE(%esi, LDC) + movhps %xmm1, 3 * SIZE(%esi, LDC) + + addl $4 * SIZE, C1 + ALIGN_4 + +.L30: + movl M, I + testl $1, I + jle .L39 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 16 * SIZE + BUFFER, BB +#else + leal 16 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movsd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -12 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movsd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + mulsd %xmm0, %xmm1 + mulsd -14 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm4 + movsd -12 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm5 + movsd -15 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm1 + mulsd -10 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm6 + movsd 0 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm7 + movsd -14 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd -6 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd -4 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd -13 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd -2 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movsd 8 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movsd -8 * SIZE(AA), %xmm0 + mulsd %xmm2, %xmm1 + mulsd 2 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm4 + movsd 4 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm5 + movsd -11 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm1 + mulsd 6 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm6 + movsd 16 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm7 + movsd -10 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm3 + mulsd 10 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm4 + movsd 12 * SIZE(BB), %xmm3 + addsd %xmm2, %xmm5 + movsd -9 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm3 + mulsd 14 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm6 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm2, %xmm7 + movsd -4 * SIZE(AA), %xmm2 + + subl $-8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulsd %xmm0, %xmm1 + mulsd -14 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm4 + movsd -12 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm5 + movsd -15 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC), %xmm1 + movhps 1 * SIZE(%esi, LDC), %xmm1 + + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 0 * SIZE(%esi, LDC) + movhps %xmm1, 1 * SIZE(%esi, LDC) + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + addl %eax, C + decl J + jg .L01 + ALIGN_4 + +.L40: + movl N, %eax + testl $1, %eax + jle .L999 + ALIGN_4 + +.L41: + leal 16 * SIZE + BUFFER, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sarl $3, %eax + jle .L45 + ALIGN_4 + +.L42: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + movapd %xmm2, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: + movl K, %eax + andl $7, %eax + BRANCH + jle .L50 + ALIGN_4 + +.L46: + movddup -16 * SIZE(B), %xmm0 + + movapd %xmm0, -16 * SIZE(BB) + addl $1 * SIZE, B + addl $2 * SIZE, BB + decl %eax + jne .L46 + ALIGN_4 + +.L50: + movl C, C1 + movl A, AA + movl M, I + sarl $2, I + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 16 * SIZE + BUFFER, BB +#else + leal 16 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movapd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + prefetcht0 3 * SIZE(C1) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm4 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm6 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + mulpd -10 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm5 + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm7 + movapd -12 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm2 + mulpd -6 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm4 + movapd -4 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm6 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm2 + mulpd -2 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm5 + movapd 8 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm7 + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm3, %xmm0 + mulpd 2 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm6 + movapd -6 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm0 + mulpd 6 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm5 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movapd -4 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm2 + mulpd 10 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm4 + movapd 12 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm6 + movapd -2 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm2 + mulpd 14 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm5 + movapd 24 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm7 + movapd 8 * SIZE(BB), %xmm3 + + addl $ 32 * SIZE, AA + subl $-16 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm4 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm6 + movapd -14 * SIZE(BB), %xmm1 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhps 3 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 2 * SIZE(%esi) + movhps %xmm1, 3 * SIZE(%esi) + + movsd 4 * SIZE(%esi), %xmm0 + movhps 5 * SIZE(%esi), %xmm0 + movsd 6 * SIZE(%esi), %xmm1 + movhps 7 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm6, %xmm2 + unpckhpd %xmm6, %xmm6 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm1 + + movlps %xmm0, 4 * SIZE(%esi) + movhps %xmm0, 5 * SIZE(%esi) + movlps %xmm1, 6 * SIZE(%esi) + movhps %xmm1, 7 * SIZE(%esi) + + addl $8 * SIZE, C1 + decl I + jg .L51 + ALIGN_4 + +.L60: + movl M, I + testl $2, I + jle .L70 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 16 * SIZE + BUFFER, BB +#else + leal 16 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + movapd -8 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: + mulpd %xmm0, %xmm1 + movapd -14 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movapd -12 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd -10 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm2, %xmm3 + movapd -6 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm4 + movapd -6 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd -4 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm5 + movapd -4 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd -2 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm4 + movapd -2 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd 8 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + decl %eax + jne .L62 + ALIGN_4 + +.L65: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulpd %xmm0, %xmm1 + movapd -14 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + addpd %xmm5, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhps 3 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 2 * SIZE(%esi) + movhps %xmm1, 3 * SIZE(%esi) + + addl $4 * SIZE, C1 + ALIGN_4 + +.L70: + movl M, I + testl $1, I + jle .L79 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 16 * SIZE + BUFFER, BB +#else + leal 16 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + + movsd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -8 * SIZE(BB), %xmm3 + movsd -12 * SIZE(AA), %xmm2 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax + addl $1, %eax + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -14 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -14 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm5 + movsd -12 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -13 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -10 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -8 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm5 + movsd -0 * SIZE(BB), %xmm1 + mulsd %xmm2, %xmm3 + movsd -11 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm4 + movsd -6 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -10 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm5 + movsd -4 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -9 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm4 + movsd -2 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -4 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm5 + movsd 8 * SIZE(BB), %xmm3 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -14 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addsd %xmm5, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + + unpcklpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + ALIGN_4 + +.L79: + addl LDC, C + ALIGN_4 + + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_4x2_northwood.S b/kernel/x86/zgemm3m_kernel_4x2_northwood.S new file mode 100644 index 0000000..fb7d639 --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_4x2_northwood.S @@ -0,0 +1,1522 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE (8 * 4) + +#if !defined(HAVE_SSE2) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 24 + STACK + ARGS(%esi) +#define STACK_A 32 + STACK + ARGS(%esi) +#define STACK_B 36 + STACK + ARGS(%esi) +#define STACK_C 40 + STACK + ARGS(%esi) +#define STACK_LDC 44 + STACK + ARGS(%esi) +#define STACK_OFFT 48 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define BX 40(%esp) +#define OLD_STACK 44(%esp) +#define OFFSET 48(%esp) +#define KK 52(%esp) +#define KKK 56(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define AA %edx +#define BB %ecx + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \ + movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \ + addpd %xmm0, %xmm5; \ + movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL4(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL5(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \ + addpd %xmm0, %xmm5; \ + movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 20 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL6(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 32 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL7(address) \ + movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 28 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 40 * SIZE + (address) * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movsd STACK_ALPHA_R, %xmm0 + movhps STACK_ALPHA_I, %xmm0 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC +#ifdef TRMMKERNEL + movd STACK_OFFT, %mm4 +#endif + + movaps %xmm0, ALPHA + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK +#ifdef TRMMKERNEL + movd %mm4, OFFSET + movd %mm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + sall $ZBASE_SHIFT, LDC + + sarl $1, %eax # j = (n >> 1) + movl %eax, J + jle .L100 + ALIGN_2 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + movl K, %eax + sarl $2, %eax + jle .L03 + ALIGN_2 + +.L02: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + subl $-16 * SIZE, %ecx + decl %eax + BRANCH + jne .L02 + ALIGN_2 + +.L03: + movl K, %eax + andl $3, %eax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + + addl $2 * SIZE, B + addl $4 * SIZE, %ecx + decl %eax + BRANCH + jne .L04 + ALIGN_4 + +.L05: + movl B, BX + + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + NOBRANCH + jle .L30 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movapd 0 * SIZE + BUFFER, %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE + BUFFER, %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#endif + + prefetchnta 3 * SIZE(%esi) + prefetchnta 3 * SIZE(%esi, LDC) + + movl BX, %eax + prefetcht2 0 * SIZE(%eax) + subl $-8 * SIZE, %eax + movl %eax, BX + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + + +#ifdef PENTIUM4 + andl $-8, %eax + NOBRANCH + je .L12 + sall $3, %eax + .align 8 + +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $64 * 1, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $64 * 2, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $64 * 3, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $64 * 4, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $64 * 5, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $64 * 6, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $64 * 7, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $64 * 4 * SIZE, AA + addl $64 * 4 * SIZE, BB + subl $64 * 8, %eax + BRANCH + jg .L1X + +.L11: + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB + +#else + sarl $3, %eax + je .L12 + +.L11: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + + addl $32 * SIZE, %ecx + addl $32 * SIZE, %edx + decl %eax + jne .L11 +#endif + +.L12: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 0 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L13 + ALIGN_4 + +.L14: + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhps 3 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 2 * SIZE(%esi) + movhps %xmm1, 3 * SIZE(%esi) + + movsd 4 * SIZE(%esi), %xmm0 + movhps 5 * SIZE(%esi), %xmm0 + movsd 6 * SIZE(%esi), %xmm1 + movhps 7 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm6, %xmm2 + unpckhpd %xmm6, %xmm6 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm1 + + movlps %xmm0, 4 * SIZE(%esi) + movhps %xmm0, 5 * SIZE(%esi) + movlps %xmm1, 6 * SIZE(%esi) + movhps %xmm1, 7 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 1 * SIZE(%esi, LDC), %xmm0 + movsd 2 * SIZE(%esi, LDC), %xmm1 + movhps 3 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 1 * SIZE(%esi, LDC) + movlps %xmm1, 2 * SIZE(%esi, LDC) + movhps %xmm1, 3 * SIZE(%esi, LDC) + + movsd 4 * SIZE(%esi, LDC), %xmm0 + movhps 5 * SIZE(%esi, LDC), %xmm0 + movsd 6 * SIZE(%esi, LDC), %xmm1 + movhps 7 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x44, %xmm7, %xmm2 + unpckhpd %xmm7, %xmm7 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm7 + addpd %xmm7, %xmm1 + + movlps %xmm0, 4 * SIZE(%esi, LDC) + movhps %xmm0, 5 * SIZE(%esi, LDC) + movlps %xmm1, 6 * SIZE(%esi, LDC) + movhps %xmm1, 7 * SIZE(%esi, LDC) + + addl $8 * SIZE, %esi + decl %ebx # i -- + BRANCH + jg .L10 + ALIGN_2 + +.L30: + movl M, %ebx + testl $2, %ebx + jle .L50 + + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, %ecx + + movapd 0 * SIZE + BUFFER, %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE + BUFFER, %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + + pxor %xmm7, %xmm7 +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax + addl $2, %eax + movl %eax, KKK +#endif + sarl $3, %eax + je .L32 + +.L31: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd 18 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movapd 10 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movapd 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 26 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 14 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + BRANCH + decl %eax + jne .L31 + +.L32: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L34 + +.L33: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L33 + ALIGN_4 + +.L34: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhps 3 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 2 * SIZE(%esi) + movhps %xmm1, 3 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 1 * SIZE(%esi, LDC), %xmm0 + movsd 2 * SIZE(%esi, LDC), %xmm1 + movhps 3 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 1 * SIZE(%esi, LDC) + movlps %xmm1, 2 * SIZE(%esi, LDC) + movhps %xmm1, 3 * SIZE(%esi, LDC) + + addl $4 * SIZE, %esi # coffset += 4 + ALIGN_2 + +.L50: + movl M, %ebx + testl $1, %ebx + jle .L99 + + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, %ecx + + movapd 0 * SIZE + BUFFER, %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE + BUFFER, %xmm3 + pxor %xmm6, %xmm6 + movsd 4 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movsd 4 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L52 + +.L51: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 2 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd 10 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd 12 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd 3 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm2 + mulsd 18 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movsd 20 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movsd 5 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm2 + mulsd 22 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movsd 32 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movsd 6 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + mulsd 26 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movsd 28 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movsd 7 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + mulsd 30 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movsd 40 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movsd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + BRANCH + decl %eax + jne .L51 + +.L52: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L53 + ALIGN_4 + +.L54: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC), %xmm1 + movhps 1 * SIZE(%esi, LDC), %xmm1 + + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 0 * SIZE(%esi, LDC) + movhps %xmm1, 1 * SIZE(%esi, LDC) + ALIGN_2 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + addl %eax, C # c += 2 * ldc + BRANCH + decl J # j -- + jg .L01 + ALIGN_2 + +.L100: + movl N, %eax + testl $1, %eax + jle .L999 + ALIGN_2 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + + movl K, %eax + sarl $3, %eax + jle .L103 + ALIGN_4 + +.L102: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + BRANCH + jne .L102 + ALIGN_2 + +.L103: + movl K, %eax + andl $7, %eax + BRANCH + jle .L105 + ALIGN_2 + +.L104: + movsd 0 * SIZE(B), %xmm0 + + unpcklpd %xmm0, %xmm0 + + movapd %xmm0, 0 * SIZE(%ecx) + + addl $1 * SIZE, B + addl $2 * SIZE, %ecx + decl %eax + jne .L104 + ALIGN_4 + +.L105: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movapd 0 * SIZE + BUFFER, %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE + BUFFER, %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 +#endif + + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L112 + +.L111: + mulpd %xmm2, %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + movapd 2 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm0 + mulpd 6 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm5 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 10 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm4 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm6 + movapd 6 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 14 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm5 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movapd 16 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm0 + mulpd 18 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm4 + movapd 20 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm6 + movapd 10 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm0 + mulpd 22 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm5 + movapd 32 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movapd 12 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm1 + mulpd 26 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm4 + movapd 28 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm6 + movapd 14 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm1 + mulpd 30 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm5 + movapd 40 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movapd 24 * SIZE(BB), %xmm3 + + addl $32 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L111 + +.L112: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + +.L113: + mulpd %xmm2, %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + movapd 2 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L113 + ALIGN_4 + +.L114: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhps 3 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 2 * SIZE(%esi) + movhps %xmm1, 3 * SIZE(%esi) + + movsd 4 * SIZE(%esi), %xmm0 + movhps 5 * SIZE(%esi), %xmm0 + movsd 6 * SIZE(%esi), %xmm1 + movhps 7 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm6, %xmm2 + unpckhpd %xmm6, %xmm6 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm1 + + movlps %xmm0, 4 * SIZE(%esi) + movhps %xmm0, 5 * SIZE(%esi) + movlps %xmm1, 6 * SIZE(%esi) + movhps %xmm1, 7 * SIZE(%esi) + + addl $8 * SIZE, %esi # coffset += 4 + BRANCH + decl %ebx # i -- + jg .L110 + ALIGN_2 + +.L130: + movl M, %ebx + testl $2, %ebx + jle .L150 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movapd 0 * SIZE + BUFFER, %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE + BUFFER, %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 +#endif + + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L132 + +.L131: + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm1 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + BRANCH + decl %eax + jne .L131 + +.L132: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L134 + +.L133: + movapd 0 * SIZE(AA), %xmm0 + mulpd 0 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + + addl $2 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L133 + ALIGN_4 + +.L134: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhps 3 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 2 * SIZE(%esi) + movhps %xmm1, 3 * SIZE(%esi) + + addl $4 * SIZE, %esi + ALIGN_2 + +.L150: + movl M, %ebx + testl $1, %ebx + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movapd 0 * SIZE + BUFFER, %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE + BUFFER, %xmm3 + pxor %xmm6, %xmm6 + movapd 4 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 4 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax + addl $1, %eax + movl %eax, KKK +#endif + sarl $3, %eax + je .L152 + +.L151: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + mulsd 2 * SIZE(BB), %xmm0 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + movsd 3 * SIZE(AA), %xmm0 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + movsd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm3 + movsd 5 * SIZE(AA), %xmm1 + addsd %xmm3, %xmm4 + mulsd 10 * SIZE(BB), %xmm1 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm4 + movsd 6 * SIZE(AA), %xmm1 + mulsd 12 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm4 + movsd 7 * SIZE(AA), %xmm1 + mulsd 14 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm4 + movsd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $16 * SIZE, BB + BRANCH + decl %eax + jne .L151 + +.L152: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L154 + +.L153: + movsd 0 * SIZE(AA), %xmm0 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + + addl $1 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L153 + ALIGN_4 + +.L154: + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + + unpcklpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_2 + + + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_4x4_barcelona.S b/kernel/x86/zgemm3m_kernel_4x4_barcelona.S new file mode 100644 index 0000000..29158df --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_4x4_barcelona.S @@ -0,0 +1,2153 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 + +#define OLD_M 4 + STACK(%esi) +#define OLD_N 8 + STACK(%esi) +#define OLD_K 12 + STACK(%esi) +#define OLD_ALPHA_R 16 + STACK(%esi) +#define OLD_ALPHA_I 20 + STACK(%esi) +#define OLD_A 24 + STACK(%esi) +#define OLD_B 28 + STACK(%esi) +#define OLD_C 32 + STACK(%esi) +#define OLD_LDC 36 + STACK(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define BUFFER 128(%esp) + +#define PREFETCH prefetch +#define PREFETCHSIZE (16 * 17 + 0) + +#define RPREFETCHSIZE (16 * 9 + 0) +#define WPREFETCHSIZE (16 * 9 + 0) + +#define AA %edx +#define BB %ecx +#define LDC %ebp + +#if defined(OPTERON) || defined(BARCELONA) +#define movsd movlps +#endif + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addps %xmm2, %xmm4; \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 1 * SIZE(AA); \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + subl $128 + LOCAL_BUFFER_SIZE, %esp + movl OLD_M, %ebx + andl $-1024, %esp # align stack + + STACK_TOUCHING + + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + movss OLD_ALPHA_R, %xmm0 + movss OLD_ALPHA_I, %xmm1 + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl OLD_B, %edi + movl OLD_C, %ebx + + unpcklps %xmm1, %xmm0 + movlhps %xmm0, %xmm0 + + movaps %xmm0, ALPHA + + movl %ebx, C + movl OLD_LDC, LDC +#ifdef TRMMKERNEL + movss %xmm4, OFFSET + movss %xmm4, KK +#ifndef LEFT + negl KK +#endif +#endif + sall $ZBASE_SHIFT, LDC + + sarl $2, %eax + movl %eax, J + jle .L40 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + + movl K, %eax + sarl $1, %eax + jle .L05 + ALIGN_4 + +.L02: + prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) + + movaps 0 * SIZE(%edi), %xmm3 + movaps 4 * SIZE(%edi), %xmm7 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + prefetchw (WPREFETCHSIZE + 16) * SIZE(%ecx) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) + + addl $ 8 * SIZE, %edi + subl $-32 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L05: + movl K, %eax + andl $1, %eax + BRANCH + jle .L10 + + movaps 0 * SIZE(%edi), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + + addl $4 * SIZE, %edi + ALIGN_4 + +.L10: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + leal (%esi, LDC, 2), %eax + + prefetchw 3 * SIZE(%esi) + prefetchw 3 * SIZE(%esi, LDC) + prefetchw 3 * SIZE(%eax) + prefetchw 3 * SIZE(%eax, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $128 * 8 * SIZE, BB + addl $128 * 2 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 2 * SIZE(%esi, LDC), %xmm0 + movsd 4 * SIZE(%esi, LDC), %xmm1 + movhps 6 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x50, %xmm5, %xmm2 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 2 * SIZE(%esi, LDC) + movlps %xmm1, 4 * SIZE(%esi, LDC) + movhps %xmm1, 6 * SIZE(%esi, LDC) + + movsd 0 * SIZE(%esi, LDC, 2), %xmm0 + movhps 2 * SIZE(%esi, LDC, 2), %xmm0 + movsd 4 * SIZE(%esi, LDC, 2), %xmm1 + movhps 6 * SIZE(%esi, LDC, 2), %xmm1 + + pshufd $0x50, %xmm6, %xmm2 + pshufd $0xfa, %xmm6, %xmm6 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm6 + + addps %xmm2, %xmm0 + addps %xmm6, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC, 2) + movhps %xmm0, 2 * SIZE(%esi, LDC, 2) + movlps %xmm1, 4 * SIZE(%esi, LDC, 2) + movhps %xmm1, 6 * SIZE(%esi, LDC, 2) + + movsd 0 * SIZE(%esi, %eax), %xmm0 + movhps 2 * SIZE(%esi, %eax), %xmm0 + movsd 4 * SIZE(%esi, %eax), %xmm1 + movhps 6 * SIZE(%esi, %eax), %xmm1 + + pshufd $0x50, %xmm7, %xmm2 + pshufd $0xfa, %xmm7, %xmm7 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm7 + + addps %xmm2, %xmm0 + addps %xmm7, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, %eax) + movhps %xmm0, 2 * SIZE(%esi, %eax) + movlps %xmm1, 4 * SIZE(%esi, %eax) + movhps %xmm1, 6 * SIZE(%esi, %eax) + + addl $8 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + testl $2, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movsd 16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movsd 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 32 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movsd 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movsd 48 * SIZE(BB), %xmm3 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 36 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movsd 40 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movsd 44 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 64 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 52 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movsd 56 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movsd 60 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movsd 80 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 68 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movsd 72 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movsd 76 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movsd 96 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 84 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movsd 88 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movsd 92 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movsd 112 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 100 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movsd 104 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movsd 108 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movsd 128 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 116 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movsd 120 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movsd 124 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movsd 144 * SIZE(BB), %xmm3 + + addl $ 16 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movsd 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 16 * SIZE(BB), %xmm2 + + addl $ 2 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC), %xmm1 + movhps 2 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x50, %xmm4, %xmm4 + pshufd $0x50, %xmm5, %xmm5 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + + addps %xmm4, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 0 * SIZE(%esi, LDC) + movhps %xmm1, 2 * SIZE(%esi, LDC) + + movsd 0 * SIZE(%esi, LDC, 2), %xmm0 + movhps 2 * SIZE(%esi, LDC, 2), %xmm0 + movsd 0 * SIZE(%esi, %eax), %xmm1 + movhps 2 * SIZE(%esi, %eax), %xmm1 + + pshufd $0x50, %xmm6, %xmm6 + pshufd $0x50, %xmm7, %xmm7 + + mulps %xmm3, %xmm6 + mulps %xmm3, %xmm7 + + addps %xmm6, %xmm0 + addps %xmm7, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC, 2) + movhps %xmm0, 2 * SIZE(%esi, LDC, 2) + movlps %xmm1, 0 * SIZE(%esi, %eax) + movhps %xmm1, 2 * SIZE(%esi, %eax) + + addl $4 * SIZE, %esi # coffset += 2 + ALIGN_4 + +.L30: + testl $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB + leal (BB, %eax, 8), BB +#endif + + movss 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movss 4 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movss 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movss 16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 8 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 1 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm3 + addss %xmm3, %xmm4 + movss 20 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm5 + movss 24 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + movss 36 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 40 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 44 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 3 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm3 + addss %xmm3, %xmm4 + movss 52 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm5 + movss 56 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + mulss 60 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + + mulss %xmm1, %xmm2 + addss %xmm2, %xmm4 + movss 68 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + addss %xmm2, %xmm5 + movss 72 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + mulss 76 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 96 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 5 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm3 + addss %xmm3, %xmm4 + movss 84 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + addss %xmm3, %xmm5 + movss 88 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + mulss 92 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 112 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm2 + addss %xmm2, %xmm4 + movss 100 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + addss %xmm2, %xmm5 + movss 104 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + mulss 108 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 128 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 7 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm3 + addss %xmm3, %xmm4 + movss 116 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + addss %xmm3, %xmm5 + movss 120 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + mulss 124 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 144 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 8 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 16 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 1 * SIZE(AA), %xmm0 + + addl $ 1 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + leal (%esi, LDC, 2), %eax + + movsd (%esi), %xmm0 + movhps (%esi, LDC), %xmm0 + movsd (%eax), %xmm1 + movhps (%eax, LDC), %xmm1 + + shufps $0, %xmm5, %xmm4 + mulps %xmm3, %xmm4 + addps %xmm4, %xmm0 + shufps $0, %xmm7, %xmm6 + mulps %xmm3, %xmm6 + addps %xmm6, %xmm1 + + movlps %xmm0, (%esi) + movhps %xmm0, (%esi, LDC) + movlps %xmm1, (%eax) + movhps %xmm1, (%eax, LDC) + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leal (, LDC, 4), %eax + addl %eax, C # c += 4 * ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L40: + testl $2, N + je .L80 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + leal BUFFER, %ecx + sarl $2, %eax + jle .L45 + ALIGN_4 + +.L42: + prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) + + movaps 0 * SIZE(%edi), %xmm3 + movaps 4 * SIZE(%edi), %xmm7 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + prefetchw (WPREFETCHSIZE + 16) * SIZE(%ecx) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) + + addl $ 8 * SIZE, %edi + subl $-32 * SIZE, %ecx + decl %eax + jne .L42 + ALIGN_4 + +.L45: + movl K, %eax + andl $3, %eax + BRANCH + jle .L50 + ALIGN_4 + +.L46: + movsd 0 * SIZE(%edi), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + + addl $2 * SIZE, %edi + addl $8 * SIZE, %ecx + decl %eax + jne .L46 + ALIGN_4 + +.L50: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + prefetchw 3 * SIZE(%esi) + prefetchw 3 * SIZE(%esi, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm3 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 32 * SIZE(AA), %xmm0 + +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + mulps %xmm1, %xmm2 + mulps 36 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 40 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 20 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm2 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm3 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 2 * SIZE(%esi, LDC), %xmm0 + movsd 4 * SIZE(%esi, LDC), %xmm1 + movhps 6 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x50, %xmm5, %xmm2 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 2 * SIZE(%esi, LDC) + movlps %xmm1, 4 * SIZE(%esi, LDC) + movhps %xmm1, 6 * SIZE(%esi, LDC) + + addl $8 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L51 + ALIGN_4 + +.L60: + testl $2, M + je .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movsd 0 * SIZE(AA), %xmm0 + movsd 8 * SIZE(AA), %xmm1 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movsd 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 32 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movsd 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movsd 48 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movsd 40 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movsd 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movsd 64 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 56 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movsd 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movsd 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC), %xmm1 + movhps 2 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x50, %xmm4, %xmm4 + pshufd $0x50, %xmm5, %xmm5 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + + addps %xmm4, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 0 * SIZE(%esi, LDC) + movhps %xmm1, 2 * SIZE(%esi, LDC) + + addl $4 * SIZE, %esi # coffset += 2 + ALIGN_4 + +.L70: + testl $1, M + je .L79 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movss 0 * SIZE(AA), %xmm0 + movss 4 * SIZE(AA), %xmm1 + movss 0 * SIZE(BB), %xmm2 + movss 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulss %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 20 * SIZE(BB), %xmm0 + addss %xmm3, %xmm4 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm5 + movss 3 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + mulss 36 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 40 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 5 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm2 + mulss 44 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 52 * SIZE(BB), %xmm1 + addss %xmm3, %xmm4 + movss 56 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 7 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 60 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + + addl $ 1 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addss %xmm6, %xmm4 + addss %xmm7, %xmm5 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 2 * SIZE(%esi, LDC), %xmm0 + movsd 4 * SIZE(%esi, LDC), %xmm1 + movhps 6 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x50, %xmm5, %xmm2 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 2 * SIZE(%esi, LDC) + movlps %xmm1, 4 * SIZE(%esi, LDC) + movhps %xmm1, 6 * SIZE(%esi, LDC) + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leal (, LDC, 2), %eax + addl %eax, C + ALIGN_4 + +.L80: + testl $1, N + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + + movl K, %eax + sarl $3, %eax + jle .L85 + ALIGN_4 + +.L82: + prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) + + movups 0 * SIZE(%edi), %xmm3 + movups 4 * SIZE(%edi), %xmm7 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + prefetchw (WPREFETCHSIZE + 16) * SIZE(%ecx) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) + + addl $ 8 * SIZE, %edi + subl $-32 * SIZE, %ecx + decl %eax + jne .L82 + ALIGN_4 + +.L85: + movl K, %eax + andl $7, %eax + BRANCH + jle .L90 + ALIGN_4 + +.L86: + movss 0 * SIZE(%edi), %xmm3 + pshufd $0x00, %xmm3, %xmm0 + movaps %xmm0, 0 * SIZE(%ecx) + + addl $1 * SIZE, %edi + addl $4 * SIZE, %ecx + decl %eax + jne .L86 + ALIGN_4 + +.L90: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + prefetchw 3 * SIZE(%esi) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + mulps 8 * SIZE(BB), %xmm0 + addps %xmm0, %xmm6 + movaps 12 * SIZE(AA), %xmm0 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 48 * SIZE(BB), %xmm3 + mulps 20 * SIZE(BB), %xmm1 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + mulps 24 * SIZE(BB), %xmm1 + addps %xmm1, %xmm6 + movaps 28 * SIZE(AA), %xmm1 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(AA), %xmm0 + movaps 4 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + addl $8 * SIZE, %esi # coffset += 2 + + decl %ebx # i -- + jg .L91 + ALIGN_4 + +.L100: + testl $2, M + je .L110 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movsd 0 * SIZE(AA), %xmm0 + movsd 8 * SIZE(AA), %xmm1 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L105 + ALIGN_4 + +.L102: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + movsd 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + movsd 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movsd 48 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L105: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movsd 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L106 + ALIGN_4 + +.L108: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + pshufd $0x50, %xmm4, %xmm2 + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + + addl $4 * SIZE, %esi # coffset += 2 + ALIGN_4 + +.L110: + testl $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movss 0 * SIZE(AA), %xmm0 + movss 4 * SIZE(AA), %xmm1 + movss 0 * SIZE(BB), %xmm2 + movss 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L115 + ALIGN_4 + +.L112: + mulss %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 32 * SIZE(BB), %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm0, %xmm5 + movss 2 * SIZE(AA), %xmm0 + mulss 8 * SIZE(BB), %xmm0 + addss %xmm0, %xmm6 + movss 3 * SIZE(AA), %xmm0 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + movss 48 * SIZE(BB), %xmm3 + mulss 20 * SIZE(BB), %xmm1 + addss %xmm1, %xmm5 + movss 6 * SIZE(AA), %xmm1 + mulss 24 * SIZE(BB), %xmm1 + addss %xmm1, %xmm6 + movss 7 * SIZE(AA), %xmm1 + mulss 28 * SIZE(BB), %xmm1 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 4 * SIZE(BB), %xmm2 + + addl $ 1 * SIZE, AA + addl $ 4 * SIZE, BB + decl %eax + jg .L116 + ALIGN_4 + +.L118: + addss %xmm5, %xmm4 + addss %xmm7, %xmm6 + addss %xmm6, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + + pshufd $0x50, %xmm4, %xmm2 + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + ALIGN_4 + +.L999: + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_4x4_opteron.S b/kernel/x86/zgemm3m_kernel_4x4_opteron.S new file mode 100644 index 0000000..511fc8b --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_4x4_opteron.S @@ -0,0 +1,2532 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 + +#define OLD_M 4 + STACK(%esi) +#define OLD_N 8 + STACK(%esi) +#define OLD_K 12 + STACK(%esi) +#define OLD_ALPHA_R 16 + STACK(%esi) +#define OLD_ALPHA_I 20 + STACK(%esi) +#define OLD_A 24 + STACK(%esi) +#define OLD_B 28 + STACK(%esi) +#define OLD_C 32 + STACK(%esi) +#define OLD_LDC 36 + STACK(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define BX 40(%esp) +#define OLD_STACK 44(%esp) +#define OFFSET 48(%esp) +#define KK 52(%esp) +#define KKK 56(%esp) +#define BUFFER 128(%esp) + + +#ifdef ATHLON +#define PREFETCH prefetch +#define PREFETCHSIZE 64 +#endif + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHSIZE (16 * 10 + 8) +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp + +#if defined(OPTERON) || defined(BARCELONA) +#define movsd movlps +#endif + +#if defined(OPTERON) || defined(BARCELONA) +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 1 * SIZE(AA); \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; +#endif + +#ifdef PENTIUM4 +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1 +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + subl $128 + LOCAL_BUFFER_SIZE, %esp + movl OLD_M, %ebx + andl $-1024, %esp # align stack + + STACK_TOUCHING + + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + movss OLD_ALPHA_R, %xmm0 + movss OLD_ALPHA_I, %xmm1 + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl OLD_B, %edi + movl OLD_C, %ebx + + unpcklps %xmm1, %xmm0 + movlhps %xmm0, %xmm0 + + movaps %xmm0, ALPHA + + movl %ebx, C + movl OLD_LDC, LDC + + sall $ZBASE_SHIFT, LDC + + sarl $2, %eax + movl %eax, J + jle .L40 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + + movl K, %eax + sarl $1, %eax + jle .L05 + ALIGN_4 + +.L02: +#ifdef HAVE_SSE2 + movss 0 * SIZE(%edi), %xmm0 + movss 1 * SIZE(%edi), %xmm1 + movss 2 * SIZE(%edi), %xmm2 + movss 3 * SIZE(%edi), %xmm3 + movss 4 * SIZE(%edi), %xmm4 + movss 5 * SIZE(%edi), %xmm5 + movss 6 * SIZE(%edi), %xmm6 + movss 7 * SIZE(%edi), %xmm7 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) +#else + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + movd 4 * SIZE(%edi), %mm4 + movd 5 * SIZE(%edi), %mm5 + movd 6 * SIZE(%edi), %mm6 + movd 7 * SIZE(%edi), %mm7 + + movd %mm0, 0 * SIZE(%ecx) + movd %mm0, 1 * SIZE(%ecx) + movd %mm0, 2 * SIZE(%ecx) + movd %mm0, 3 * SIZE(%ecx) + movd %mm1, 4 * SIZE(%ecx) + movd %mm1, 5 * SIZE(%ecx) + movd %mm1, 6 * SIZE(%ecx) + movd %mm1, 7 * SIZE(%ecx) + movd %mm2, 8 * SIZE(%ecx) + movd %mm2, 9 * SIZE(%ecx) + movd %mm2, 10 * SIZE(%ecx) + movd %mm2, 11 * SIZE(%ecx) + movd %mm3, 12 * SIZE(%ecx) + movd %mm3, 13 * SIZE(%ecx) + movd %mm3, 14 * SIZE(%ecx) + movd %mm3, 15 * SIZE(%ecx) + movd %mm4, 16 * SIZE(%ecx) + movd %mm4, 17 * SIZE(%ecx) + movd %mm4, 18 * SIZE(%ecx) + movd %mm4, 19 * SIZE(%ecx) + movd %mm5, 20 * SIZE(%ecx) + movd %mm5, 21 * SIZE(%ecx) + movd %mm5, 22 * SIZE(%ecx) + movd %mm5, 23 * SIZE(%ecx) + movd %mm6, 24 * SIZE(%ecx) + movd %mm6, 25 * SIZE(%ecx) + movd %mm6, 26 * SIZE(%ecx) + movd %mm6, 27 * SIZE(%ecx) + movd %mm7, 28 * SIZE(%ecx) + movd %mm7, 29 * SIZE(%ecx) + movd %mm7, 30 * SIZE(%ecx) + movd %mm7, 31 * SIZE(%ecx) +#endif + +#ifdef PENTIUM4 + prefetcht2 112 * SIZE(%ecx) +#endif + +#if defined(OPTERON) || defined(BARCELONA) + prefetchnta 80 * SIZE(%edi) + prefetchw 112 * SIZE(%ecx) + prefetchw 120 * SIZE(%ecx) +#endif + + addl $ 8 * SIZE, %edi + addl $32 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L05: + movl K, %eax + andl $1, %eax + BRANCH + jle .L10 + + +#ifdef HAVE_SSE2 + movss 0 * SIZE(%edi), %xmm0 + movss 1 * SIZE(%edi), %xmm1 + movss 2 * SIZE(%edi), %xmm2 + movss 3 * SIZE(%edi), %xmm3 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) +#else + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + + movd %mm0, 0 * SIZE(%ecx) + movd %mm0, 1 * SIZE(%ecx) + movd %mm0, 2 * SIZE(%ecx) + movd %mm0, 3 * SIZE(%ecx) + movd %mm1, 4 * SIZE(%ecx) + movd %mm1, 5 * SIZE(%ecx) + movd %mm1, 6 * SIZE(%ecx) + movd %mm1, 7 * SIZE(%ecx) + movd %mm2, 8 * SIZE(%ecx) + movd %mm2, 9 * SIZE(%ecx) + movd %mm2, 10 * SIZE(%ecx) + movd %mm2, 11 * SIZE(%ecx) + movd %mm3, 12 * SIZE(%ecx) + movd %mm3, 13 * SIZE(%ecx) + movd %mm3, 14 * SIZE(%ecx) + movd %mm3, 15 * SIZE(%ecx) +#endif + addl $4 * SIZE, %edi + ALIGN_4 + +.L10: + movl %edi, BX + + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + + movl BX, %eax + +#ifdef HAVE_SSE + + prefetcht2 0 * SIZE(%eax) + prefetcht2 4 * SIZE(%eax) + +#if L2_SIZE > 262144 + + subl $-8 * SIZE, BX + +#elif L2_SIZE > 131072 + + prefetcht2 8 * SIZE(%eax) + prefetcht2 12 * SIZE(%eax) + + + subl $-16 * SIZE, BX +#else + prefetcht2 16 * SIZE(%eax) + prefetcht2 20 * SIZE(%eax) + prefetcht2 24 * SIZE(%eax) + prefetcht2 28 * SIZE(%eax) + + subl $-32 * SIZE, BX +#endif +#endif + + movaps 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + +#if defined(OPTERON) || defined(BARCELONA) + prefetchw 4 * SIZE(%esi) + prefetchw 4 * SIZE(%esi, LDC) + prefetchw 4 * SIZE(%esi, LDC, 2) + prefetchw 4 * SIZE(%esi, %eax) +#endif + +#ifdef PENTIUM4 + prefetchnta 4 * SIZE(%esi) + prefetchnta 4 * SIZE(%esi, LDC) + prefetchnta 4 * SIZE(%esi, LDC, 2) + prefetchnta 4 * SIZE(%esi, %eax) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + +#if 1 + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $128 * 8 * SIZE, BB + addl $128 * 2 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB + ALIGN_4 +#else + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + + addl $128 * SIZE, BB + addl $32 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 +#endif + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 2 * SIZE(%esi, LDC), %xmm0 + movsd 4 * SIZE(%esi, LDC), %xmm1 + movhps 6 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x50, %xmm5, %xmm2 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 2 * SIZE(%esi, LDC) + movlps %xmm1, 4 * SIZE(%esi, LDC) + movhps %xmm1, 6 * SIZE(%esi, LDC) + + movsd 0 * SIZE(%esi, LDC, 2), %xmm0 + movhps 2 * SIZE(%esi, LDC, 2), %xmm0 + movsd 4 * SIZE(%esi, LDC, 2), %xmm1 + movhps 6 * SIZE(%esi, LDC, 2), %xmm1 + + pshufd $0x50, %xmm6, %xmm2 + pshufd $0xfa, %xmm6, %xmm6 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm6 + + addps %xmm2, %xmm0 + addps %xmm6, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC, 2) + movhps %xmm0, 2 * SIZE(%esi, LDC, 2) + movlps %xmm1, 4 * SIZE(%esi, LDC, 2) + movhps %xmm1, 6 * SIZE(%esi, LDC, 2) + + movsd 0 * SIZE(%esi, %eax), %xmm0 + movhps 2 * SIZE(%esi, %eax), %xmm0 + movsd 4 * SIZE(%esi, %eax), %xmm1 + movhps 6 * SIZE(%esi, %eax), %xmm1 + + pshufd $0x50, %xmm7, %xmm2 + pshufd $0xfa, %xmm7, %xmm7 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm7 + + addps %xmm2, %xmm0 + addps %xmm7, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, %eax) + movhps %xmm0, 2 * SIZE(%esi, %eax) + movlps %xmm1, 4 * SIZE(%esi, %eax) + movhps %xmm1, 6 * SIZE(%esi, %eax) + + addl $8 * SIZE, %esi # coffset += 2 + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + testl $2, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movsd 16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movsd 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 32 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movsd 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movsd 48 * SIZE(BB), %xmm3 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 36 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movsd 40 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movsd 44 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 64 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 52 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movsd 56 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movsd 60 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movsd 80 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 68 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movsd 72 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movsd 76 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movsd 96 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 84 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movsd 88 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movsd 92 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movsd 112 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 100 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movsd 104 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movsd 108 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movsd 128 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 116 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movsd 120 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movsd 124 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movsd 144 * SIZE(BB), %xmm3 + + addl $ 16 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movsd 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 16 * SIZE(BB), %xmm2 + + addl $ 2 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC), %xmm1 + movhps 2 * SIZE(%esi, LDC), %xmm1 + + shufps $0x50, %xmm4, %xmm4 + shufps $0x50, %xmm5, %xmm5 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + + addps %xmm4, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 0 * SIZE(%esi, LDC) + movhps %xmm1, 2 * SIZE(%esi, LDC) + + movsd 0 * SIZE(%esi, LDC, 2), %xmm0 + movhps 2 * SIZE(%esi, LDC, 2), %xmm0 + movsd 0 * SIZE(%esi, %eax), %xmm1 + movhps 2 * SIZE(%esi, %eax), %xmm1 + + shufps $0x50, %xmm6, %xmm6 + shufps $0x50, %xmm7, %xmm7 + + mulps %xmm3, %xmm6 + mulps %xmm3, %xmm7 + + addps %xmm6, %xmm0 + addps %xmm7, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC, 2) + movhps %xmm0, 2 * SIZE(%esi, LDC, 2) + movlps %xmm1, 0 * SIZE(%esi, %eax) + movhps %xmm1, 2 * SIZE(%esi, %eax) + + addl $4 * SIZE, %esi # coffset += 2 + ALIGN_4 + +.L30: + testl $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB + leal (BB, %eax, 8), BB +#endif + + movss 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movss 4 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movss 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movss 16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 8 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 1 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm3 + addss %xmm3, %xmm4 + movss 20 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm5 + movss 24 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + movss 36 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 40 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 44 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 3 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm3 + addss %xmm3, %xmm4 + movss 52 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm5 + movss 56 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + mulss 60 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + + mulss %xmm1, %xmm2 + addss %xmm2, %xmm4 + movss 68 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + addss %xmm2, %xmm5 + movss 72 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + mulss 76 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 96 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 5 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm3 + addss %xmm3, %xmm4 + movss 84 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + addss %xmm3, %xmm5 + movss 88 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + mulss 92 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 112 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm2 + addss %xmm2, %xmm4 + movss 100 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + addss %xmm2, %xmm5 + movss 104 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + mulss 108 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 128 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 7 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm3 + addss %xmm3, %xmm4 + movss 116 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + addss %xmm3, %xmm5 + movss 120 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + mulss 124 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 144 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 8 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 16 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 1 * SIZE(AA), %xmm0 + + addl $ 1 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + leal (LDC, LDC, 2), %eax + + movsd (%esi), %xmm0 + movhps (%esi, LDC), %xmm0 + + shufps $0, %xmm5, %xmm4 + mulps %xmm3, %xmm4 + addps %xmm4, %xmm0 + + movlps %xmm0, (%esi) + movhps %xmm0, (%esi, LDC) + + movsd (%esi, LDC, 2), %xmm0 + movhps (%esi, %eax), %xmm0 + + shufps $0, %xmm7, %xmm6 + mulps %xmm3, %xmm6 + addps %xmm6, %xmm0 + + movlps %xmm0, (%esi, LDC, 2) + movhps %xmm0, (%esi, %eax) + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leal (, LDC, 4), %eax + addl %eax, C # c += 4 * ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L40: + testl $2, N + je .L80 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + leal BUFFER, %ecx + sarl $2, %eax + jle .L45 + ALIGN_4 + +.L42: + prefetchnta 80 * SIZE(%edi) + +#if defined(OPTERON) || defined(BARCELONA) + prefetchw 112 * SIZE(%ecx) + prefetchw 120 * SIZE(%ecx) +#endif + +#ifdef PENTIUM4 + prefetcht1 112 * SIZE(%ecx) +#endif + +#ifdef HAVE_SSE2 + movss 0 * SIZE(%edi), %xmm0 + movss 1 * SIZE(%edi), %xmm1 + movss 2 * SIZE(%edi), %xmm2 + movss 3 * SIZE(%edi), %xmm3 + movss 4 * SIZE(%edi), %xmm4 + movss 5 * SIZE(%edi), %xmm5 + movss 6 * SIZE(%edi), %xmm6 + movss 7 * SIZE(%edi), %xmm7 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) +#else + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + movd 4 * SIZE(%edi), %mm4 + movd 5 * SIZE(%edi), %mm5 + movd 6 * SIZE(%edi), %mm6 + movd 7 * SIZE(%edi), %mm7 + + movd %mm0, 0 * SIZE(%ecx) + movd %mm0, 1 * SIZE(%ecx) + movd %mm0, 2 * SIZE(%ecx) + movd %mm0, 3 * SIZE(%ecx) + movd %mm1, 4 * SIZE(%ecx) + movd %mm1, 5 * SIZE(%ecx) + movd %mm1, 6 * SIZE(%ecx) + movd %mm1, 7 * SIZE(%ecx) + movd %mm2, 8 * SIZE(%ecx) + movd %mm2, 9 * SIZE(%ecx) + movd %mm2, 10 * SIZE(%ecx) + movd %mm2, 11 * SIZE(%ecx) + movd %mm3, 12 * SIZE(%ecx) + movd %mm3, 13 * SIZE(%ecx) + movd %mm3, 14 * SIZE(%ecx) + movd %mm3, 15 * SIZE(%ecx) + movd %mm4, 16 * SIZE(%ecx) + movd %mm4, 17 * SIZE(%ecx) + movd %mm4, 18 * SIZE(%ecx) + movd %mm4, 19 * SIZE(%ecx) + movd %mm5, 20 * SIZE(%ecx) + movd %mm5, 21 * SIZE(%ecx) + movd %mm5, 22 * SIZE(%ecx) + movd %mm5, 23 * SIZE(%ecx) + movd %mm6, 24 * SIZE(%ecx) + movd %mm6, 25 * SIZE(%ecx) + movd %mm6, 26 * SIZE(%ecx) + movd %mm6, 27 * SIZE(%ecx) + movd %mm7, 28 * SIZE(%ecx) + movd %mm7, 29 * SIZE(%ecx) + movd %mm7, 30 * SIZE(%ecx) + movd %mm7, 31 * SIZE(%ecx) +#endif + addl $ 8 * SIZE, %edi + addl $32 * SIZE, %ecx + decl %eax + jne .L42 + ALIGN_4 + +.L45: + movl K, %eax + andl $3, %eax + BRANCH + jle .L50 + ALIGN_4 + +.L46: + +#ifdef HAVE_SSE2 + movss 0 * SIZE(%edi), %xmm0 + movss 1 * SIZE(%edi), %xmm1 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) +#else + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + + movd %mm0, 0 * SIZE(%ecx) + movd %mm0, 1 * SIZE(%ecx) + movd %mm0, 2 * SIZE(%ecx) + movd %mm0, 3 * SIZE(%ecx) + movd %mm1, 4 * SIZE(%ecx) + movd %mm1, 5 * SIZE(%ecx) + movd %mm1, 6 * SIZE(%ecx) + movd %mm1, 7 * SIZE(%ecx) +#endif + addl $2 * SIZE, %edi + addl $8 * SIZE, %ecx + decl %eax + jne .L46 + ALIGN_4 + +.L50: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#ifdef HAVE_3DNOW + prefetchw 4 * SIZE(%esi) + prefetchw 4 * SIZE(%esi, LDC) +#elif defined(HAVE_SSE) || defined(HAVE_SSE2) + prefetcht2 4 * SIZE(%esi) + prefetcht2 4 * SIZE(%esi, LDC) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm3 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 32 * SIZE(AA), %xmm0 + +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + mulps %xmm1, %xmm2 + mulps 36 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 40 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 20 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm2 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm3 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 2 * SIZE(%esi, LDC), %xmm0 + movsd 4 * SIZE(%esi, LDC), %xmm1 + movhps 6 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x50, %xmm5, %xmm2 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 2 * SIZE(%esi, LDC) + movlps %xmm1, 4 * SIZE(%esi, LDC) + movhps %xmm1, 6 * SIZE(%esi, LDC) + + addl $8 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L51 + ALIGN_4 + +.L60: + testl $2, M + je .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movsd 0 * SIZE(AA), %xmm0 + movsd 8 * SIZE(AA), %xmm1 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movsd 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 32 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movsd 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movsd 48 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movsd 40 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movsd 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movsd 64 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 56 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movsd 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movsd 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC), %xmm1 + movhps 2 * SIZE(%esi, LDC), %xmm1 + + shufps $0x50, %xmm4, %xmm4 + shufps $0x50, %xmm5, %xmm5 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + + addps %xmm4, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 0 * SIZE(%esi, LDC) + movhps %xmm1, 2 * SIZE(%esi, LDC) + + addl $4 * SIZE, %esi + ALIGN_4 + +.L70: + testl $1, M + je .L79 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movss 0 * SIZE(AA), %xmm0 + movss 4 * SIZE(AA), %xmm1 + movss 0 * SIZE(BB), %xmm2 + movss 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulss %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 20 * SIZE(BB), %xmm0 + addss %xmm3, %xmm4 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm5 + movss 3 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + mulss 36 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 40 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 5 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm2 + mulss 44 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 52 * SIZE(BB), %xmm1 + addss %xmm3, %xmm4 + movss 56 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 7 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 60 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + + addl $ 1 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addss %xmm6, %xmm4 + addss %xmm7, %xmm5 + + movsd (%esi), %xmm0 + movhps (%esi, LDC), %xmm0 + + shufps $0, %xmm5, %xmm4 + mulps %xmm3, %xmm4 + addps %xmm4, %xmm0 + + movlps %xmm0, (%esi) + movhps %xmm0, (%esi, LDC) + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leal (, LDC, 2), %eax + addl %eax, C + ALIGN_4 + +.L80: + testl $1, N + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + leal BUFFER, %ecx + sarl $3, %eax + jle .L85 + ALIGN_4 + +.L82: + prefetchnta 80 * SIZE(%edi) + +#if defined(OPTERON) || defined(BARCELONA) + prefetchw 112 * SIZE(%ecx) + prefetchw 120 * SIZE(%ecx) +#endif + +#ifdef PENTIUM4 + prefetcht1 112 * SIZE(%ecx) +#endif + +#ifdef HAVE_SSE2 + movss 0 * SIZE(%edi), %xmm0 + movss 1 * SIZE(%edi), %xmm1 + movss 2 * SIZE(%edi), %xmm2 + movss 3 * SIZE(%edi), %xmm3 + movss 4 * SIZE(%edi), %xmm4 + movss 5 * SIZE(%edi), %xmm5 + movss 6 * SIZE(%edi), %xmm6 + movss 7 * SIZE(%edi), %xmm7 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) +#else + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + movd 4 * SIZE(%edi), %mm4 + movd 5 * SIZE(%edi), %mm5 + movd 6 * SIZE(%edi), %mm6 + movd 7 * SIZE(%edi), %mm7 + + movd %mm0, 0 * SIZE(%ecx) + movd %mm0, 1 * SIZE(%ecx) + movd %mm0, 2 * SIZE(%ecx) + movd %mm0, 3 * SIZE(%ecx) + movd %mm1, 4 * SIZE(%ecx) + movd %mm1, 5 * SIZE(%ecx) + movd %mm1, 6 * SIZE(%ecx) + movd %mm1, 7 * SIZE(%ecx) + movd %mm2, 8 * SIZE(%ecx) + movd %mm2, 9 * SIZE(%ecx) + movd %mm2, 10 * SIZE(%ecx) + movd %mm2, 11 * SIZE(%ecx) + movd %mm3, 12 * SIZE(%ecx) + movd %mm3, 13 * SIZE(%ecx) + movd %mm3, 14 * SIZE(%ecx) + movd %mm3, 15 * SIZE(%ecx) + movd %mm4, 16 * SIZE(%ecx) + movd %mm4, 17 * SIZE(%ecx) + movd %mm4, 18 * SIZE(%ecx) + movd %mm4, 19 * SIZE(%ecx) + movd %mm5, 20 * SIZE(%ecx) + movd %mm5, 21 * SIZE(%ecx) + movd %mm5, 22 * SIZE(%ecx) + movd %mm5, 23 * SIZE(%ecx) + movd %mm6, 24 * SIZE(%ecx) + movd %mm6, 25 * SIZE(%ecx) + movd %mm6, 26 * SIZE(%ecx) + movd %mm6, 27 * SIZE(%ecx) + movd %mm7, 28 * SIZE(%ecx) + movd %mm7, 29 * SIZE(%ecx) + movd %mm7, 30 * SIZE(%ecx) + movd %mm7, 31 * SIZE(%ecx) +#endif + addl $ 8 * SIZE, %edi + addl $32 * SIZE, %ecx + decl %eax + jne .L82 + ALIGN_4 + +.L85: + movl K, %eax + andl $7, %eax + BRANCH + jle .L90 + ALIGN_4 + +.L86: + +#ifdef HAVE_SSE2 + movss 0 * SIZE(%edi), %xmm0 + shufps $0, %xmm0, %xmm0 + movaps %xmm0, 0 * SIZE(%ecx) +#else + movd 0 * SIZE(%edi), %mm0 + movd %mm0, 0 * SIZE(%ecx) + movd %mm0, 1 * SIZE(%ecx) + movd %mm0, 2 * SIZE(%ecx) + movd %mm0, 3 * SIZE(%ecx) +#endif + addl $1 * SIZE, %edi + addl $4 * SIZE, %ecx + decl %eax + jne .L86 + ALIGN_4 + +.L90: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#ifdef HAVE_3DNOW + prefetchw 4 * SIZE(%esi) +#elif defined(HAVE_SSE) || defined(HAVE_SSE2) + prefetcht2 4 * SIZE(%esi) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + mulps 8 * SIZE(BB), %xmm0 + addps %xmm0, %xmm6 + movaps 12 * SIZE(AA), %xmm0 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 48 * SIZE(BB), %xmm3 + mulps 20 * SIZE(BB), %xmm1 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + mulps 24 * SIZE(BB), %xmm1 + addps %xmm1, %xmm6 + movaps 28 * SIZE(AA), %xmm1 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(AA), %xmm0 + movaps 4 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + addl $8 * SIZE, %esi # coffset += 2 + + decl %ebx # i -- + jg .L91 + ALIGN_4 + +.L100: + testl $2, M + je .L110 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movsd 0 * SIZE(AA), %xmm0 + movsd 8 * SIZE(AA), %xmm1 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L105 + ALIGN_4 + +.L102: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + movsd 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + movsd 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movsd 48 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L105: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movsd 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L106 + ALIGN_4 + +.L108: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + shufps $0x50, %xmm4, %xmm4 + mulps %xmm3, %xmm4 + addps %xmm4, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + addl $4 * SIZE, %esi + ALIGN_4 + +.L110: + testl $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movss 0 * SIZE(AA), %xmm0 + movss 4 * SIZE(AA), %xmm1 + movss 0 * SIZE(BB), %xmm2 + movss 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L115 + ALIGN_4 + +.L112: + mulss %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 32 * SIZE(BB), %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm0, %xmm5 + movss 2 * SIZE(AA), %xmm0 + mulss 8 * SIZE(BB), %xmm0 + addss %xmm0, %xmm6 + movss 3 * SIZE(AA), %xmm0 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + movss 48 * SIZE(BB), %xmm3 + mulss 20 * SIZE(BB), %xmm1 + addss %xmm1, %xmm5 + movss 6 * SIZE(AA), %xmm1 + mulss 24 * SIZE(BB), %xmm1 + addss %xmm1, %xmm6 + movss 7 * SIZE(AA), %xmm1 + mulss 28 * SIZE(BB), %xmm1 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 4 * SIZE(BB), %xmm2 + + addl $ 1 * SIZE, AA + addl $ 4 * SIZE, BB + decl %eax + jg .L116 + ALIGN_4 + +.L118: + addss %xmm5, %xmm4 + addss %xmm7, %xmm6 + addss %xmm6, %xmm4 + + movsd (%esi), %xmm0 + + shufps $0, %xmm5, %xmm4 + mulps %xmm3, %xmm4 + addps %xmm4, %xmm0 + + movlps %xmm0, (%esi) + ALIGN_4 + +.L999: + EMMS + + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_4x4_penryn.S b/kernel/x86/zgemm3m_kernel_4x4_penryn.S new file mode 100644 index 0000000..802298c --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_4x4_penryn.S @@ -0,0 +1,1780 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define PREFETCH_R (8 * 4) + +#define PREFETCHSIZE (8 * 17 + 4) +#define PREFETCH prefetcht0 + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define C1 %esi +#define I %ebx + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax +#ifndef LEFT + negl %eax +#endif + movl %eax, KK +#endif + + subl $-32 * SIZE, A + subl $-32 * SIZE, B + + sall $ZBASE_SHIFT, LDC + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L40 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sall $BASE_SHIFT + 2, %eax + leal (B, %eax), %eax + movl %eax, BX + + movl C, C1 + movl A, AA + + movl M, I + sarl $2, I + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + + movl BX, %eax + + prefetcht2 -32 * SIZE(%eax) + subl $-16 * SIZE, BX + + leal (C1, LDC, 2), %eax + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(C1) + pxor %xmm5, %xmm5 + prefetcht0 3 * SIZE(C1, LDC) + pxor %xmm6, %xmm6 + prefetcht0 3 * SIZE(%eax) + pxor %xmm7, %xmm7 + prefetcht0 3 * SIZE(%eax, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + subl $-32 * SIZE, BB + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + subl $-32 * SIZE, AA + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -32 * SIZE(AA), %xmm0 + + subl $1, %eax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + addps %xmm3, %xmm6 + addps %xmm2, %xmm7 + + movddup ALPHA, %xmm3 + + pshufd $0x39, %xmm5, %xmm2 + pshufd $0x4e, %xmm6, %xmm0 + pshufd $0x93, %xmm7, %xmm7 + + movaps %xmm4, %xmm6 + unpcklps %xmm0, %xmm4 + unpckhps %xmm0, %xmm6 + + movaps %xmm2, %xmm1 + unpcklps %xmm7, %xmm2 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm5 + unpcklps %xmm2, %xmm4 + unpckhps %xmm2, %xmm5 + + movaps %xmm6, %xmm7 + unpcklps %xmm1, %xmm6 + unpckhps %xmm1, %xmm7 + + pshufd $0x93, %xmm5, %xmm5 + pshufd $0x4e, %xmm6, %xmm6 + pshufd $0x39, %xmm7, %xmm7 + + leal (C1, LDC, 2), %eax + + movsd 0 * SIZE(C1), %xmm0 + movhps 2 * SIZE(C1), %xmm0 + movsd 4 * SIZE(C1), %xmm1 + movhps 6 * SIZE(C1), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(C1) + movhps %xmm0, 2 * SIZE(C1) + movlps %xmm1, 4 * SIZE(C1) + movhps %xmm1, 6 * SIZE(C1) + + movsd 0 * SIZE(C1, LDC), %xmm0 + movhps 2 * SIZE(C1, LDC), %xmm0 + movsd 4 * SIZE(C1, LDC), %xmm1 + movhps 6 * SIZE(C1, LDC), %xmm1 + + pshufd $0x50, %xmm5, %xmm2 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(C1, LDC) + movhps %xmm0, 2 * SIZE(C1, LDC) + movlps %xmm1, 4 * SIZE(C1, LDC) + movhps %xmm1, 6 * SIZE(C1, LDC) + + movsd 0 * SIZE(%eax), %xmm0 + movhps 2 * SIZE(%eax), %xmm0 + movsd 4 * SIZE(%eax), %xmm1 + movhps 6 * SIZE(%eax), %xmm1 + + pshufd $0x50, %xmm6, %xmm2 + pshufd $0xfa, %xmm6, %xmm6 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm6 + + addps %xmm2, %xmm0 + addps %xmm6, %xmm1 + + movlps %xmm0, 0 * SIZE(%eax) + movhps %xmm0, 2 * SIZE(%eax) + movlps %xmm1, 4 * SIZE(%eax) + movhps %xmm1, 6 * SIZE(%eax) + + movsd 0 * SIZE(%eax, LDC), %xmm0 + movhps 2 * SIZE(%eax, LDC), %xmm0 + movsd 4 * SIZE(%eax, LDC), %xmm1 + movhps 6 * SIZE(%eax, LDC), %xmm1 + + pshufd $0x50, %xmm7, %xmm2 + pshufd $0xfa, %xmm7, %xmm7 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm7 + + addps %xmm2, %xmm0 + addps %xmm7, %xmm1 + + movlps %xmm0, 0 * SIZE(%eax, LDC) + movhps %xmm0, 2 * SIZE(%eax, LDC) + movlps %xmm1, 4 * SIZE(%eax, LDC) + movhps %xmm1, 6 * SIZE(%eax, LDC) + + addl $8 * SIZE, C1 + decl I + jg .L11 + ALIGN_4 + +.L20: + movl M, I + testl $2, I + jle .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + + pshufd $0xee, %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + pshufd $0xfa, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + + pshufd $0xee, %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + pshufd $0xfa, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + + pshufd $0xee, %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + pshufd $0xfa, %xmm1, %xmm3 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + + pshufd $0xee, %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + pshufd $0xfa, %xmm1, %xmm3 + movaps 0 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + subl $-16 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L28 + ALIGN_4 + +.L26: + pshufd $0x44, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + movddup ALPHA, %xmm3 + + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + leal (C1, LDC, 2), %eax + + movsd 0 * SIZE(C1), %xmm0 + movhps 2 * SIZE(C1), %xmm0 + movsd 0 * SIZE(C1, LDC), %xmm1 + movhps 2 * SIZE(C1, LDC), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(C1) + movhps %xmm0, 2 * SIZE(C1) + movlps %xmm1, 0 * SIZE(C1, LDC) + movhps %xmm1, 2 * SIZE(C1, LDC) + + movsd 0 * SIZE(%eax), %xmm0 + movhps 2 * SIZE(%eax), %xmm0 + movsd 0 * SIZE(%eax, LDC), %xmm1 + movhps 2 * SIZE(%eax, LDC), %xmm1 + + pshufd $0x50, %xmm5, %xmm2 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%eax) + movhps %xmm0, 2 * SIZE(%eax) + movlps %xmm1, 0 * SIZE(%eax, LDC) + movhps %xmm1, 2 * SIZE(%eax, LDC) + + addl $4 * SIZE, C1 + ALIGN_4 + +.L30: + movl M, I + testl $1, I + jle .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps 0 * SIZE(BB), %xmm1 + + subl $ -8 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L38 + ALIGN_4 + +.L36: + pshufd $0x00, %xmm0, %xmm2 + movss -31 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + movddup ALPHA, %xmm3 + + leal (C1, LDC, 2), %eax + + movsd (C1), %xmm0 + movhps (C1, LDC), %xmm0 + movsd (%eax), %xmm1 + movhps (%eax, LDC), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, (C1) + movhps %xmm0, (C1, LDC) + movlps %xmm1, (%eax) + movhps %xmm1, (%eax, LDC) + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + movl BB, B + + leal (, LDC, 4), %eax + addl %eax, C + decl J + jg .L01 + ALIGN_4 + +.L40: + movl N, %eax + testl $2, %eax + jle .L70 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, C1 + movl A, AA + + movl M, I + sarl $2, I + jle .L50 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(C1) + pxor %xmm5, %xmm5 + prefetcht0 3 * SIZE(C1, LDC) + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps 0 * SIZE(AA), %xmm0 + + subl $-32 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L48 + ALIGN_4 + +.L46: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addps %xmm2, %xmm4 + addps %xmm3, %xmm5 + + movddup ALPHA, %xmm3 + + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movsd 0 * SIZE(C1), %xmm0 + movhps 2 * SIZE(C1), %xmm0 + movsd 4 * SIZE(C1), %xmm1 + movhps 6 * SIZE(C1), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(C1) + movhps %xmm0, 2 * SIZE(C1) + movlps %xmm1, 4 * SIZE(C1) + movhps %xmm1, 6 * SIZE(C1) + + movsd 0 * SIZE(C1, LDC), %xmm0 + movhps 2 * SIZE(C1, LDC), %xmm0 + movsd 4 * SIZE(C1, LDC), %xmm1 + movhps 6 * SIZE(C1, LDC), %xmm1 + + pshufd $0x50, %xmm5, %xmm2 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(C1, LDC) + movhps %xmm0, 2 * SIZE(C1, LDC) + movlps %xmm1, 4 * SIZE(C1, LDC) + movhps %xmm1, 6 * SIZE(C1, LDC) + + addl $8 * SIZE, C1 + decl I + jg .L41 + ALIGN_4 + +.L50: + movl M, I + testl $2, I + jle .L60 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm3, %xmm3 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L58 + ALIGN_4 + +.L56: + pshufd $0x44, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addps %xmm3, %xmm4 + addps %xmm5, %xmm4 + + movddup ALPHA, %xmm3 + + movsd 0 * SIZE(C1), %xmm0 + movhps 2 * SIZE(C1), %xmm0 + movsd 0 * SIZE(C1, LDC), %xmm1 + movhps 2 * SIZE(C1, LDC), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(C1) + movhps %xmm0, 2 * SIZE(C1) + movlps %xmm1, 0 * SIZE(C1, LDC) + movhps %xmm1, 2 * SIZE(C1, LDC) + + addl $4 * SIZE, C1 + ALIGN_4 + +.L60: + movl M, I + testl $1, I + jle .L69 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 2), BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd -32 * SIZE(BB), %xmm1 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -28 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -26 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -24 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -22 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -20 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -18 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -16 * SIZE(BB), %xmm1 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L68 + ALIGN_4 + +.L66: + pshufd $0x00, %xmm0, %xmm2 + movss -31 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + movddup ALPHA, %xmm3 + + addps %xmm5, %xmm4 + + movsd (C1), %xmm0 + movhps (C1, LDC), %xmm0 + + pshufd $0x50, %xmm4, %xmm2 + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, (C1) + movhps %xmm0, (C1, LDC) + ALIGN_4 + +.L69: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movl BB, B + + leal (, LDC, 2), %eax + addl %eax, C + ALIGN_4 + +.L70: + movl N, %eax + testl $1, %eax + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, C1 + movl A, AA + + movl M, I + sarl $2, I + jle .L80 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movsd -32 * SIZE(BB), %xmm1 + + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(C1) + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -26 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + subl $-32 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L78 + ALIGN_4 + +.L76: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + movss -31 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + movddup ALPHA, %xmm3 + + addps %xmm2, %xmm4 + addps %xmm5, %xmm4 + + movsd 0 * SIZE(C1), %xmm0 + movhps 2 * SIZE(C1), %xmm0 + movsd 4 * SIZE(C1), %xmm1 + movhps 6 * SIZE(C1), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(C1) + movhps %xmm0, 2 * SIZE(C1) + movlps %xmm1, 4 * SIZE(C1) + movhps %xmm1, 6 * SIZE(C1) + + addl $8 * SIZE, C1 + decl I + jg .L71 + ALIGN_4 + +.L80: + movl M, I + testl $2, I + jle .L90 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm3, %xmm3 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L85 + ALIGN_4 + +.L82: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -22 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -26 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -20 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -18 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + subl $-16 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L82 + ALIGN_4 + +.L85: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L88 + ALIGN_4 + +.L86: + pshufd $0x00, %xmm1, %xmm2 + movss -31 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + movddup ALPHA, %xmm3 + + addps %xmm5, %xmm4 + + movsd 0 * SIZE(C1), %xmm0 + movhps 2 * SIZE(C1), %xmm0 + + pshufd $0x50, %xmm4, %xmm2 + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(C1) + movhps %xmm0, 2 * SIZE(C1) + + addl $4 * SIZE, C1 + ALIGN_4 + +.L90: + movl M, I + testl $1, I + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd -32 * SIZE(BB), %xmm1 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -28 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -28 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -26 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -26 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -24 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -24 * SIZE(BB), %xmm1 + + subl $-8 * SIZE, AA + subl $-8 * SIZE, BB + + subl $1, %eax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulss %xmm0, %xmm1 + movss -31 * SIZE(AA), %xmm0 + addss %xmm1, %xmm4 + movss -31 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: + movddup ALPHA, %xmm3 + + haddps %xmm4, %xmm4 + + movsd 0 * SIZE(C1), %xmm0 + + pshufd $0x50, %xmm4, %xmm2 + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(C1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_4x4_prescott.S b/kernel/x86/zgemm3m_kernel_4x4_prescott.S new file mode 100644 index 0000000..3d602e3 --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_4x4_prescott.S @@ -0,0 +1,2060 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 + +#define OLD_M 4 + STACK(%esi) +#define OLD_N 8 + STACK(%esi) +#define OLD_K 12 + STACK(%esi) +#define OLD_ALPHA_R 16 + STACK(%esi) +#define OLD_ALPHA_I 20 + STACK(%esi) +#define OLD_A 24 + STACK(%esi) +#define OLD_B 28 + STACK(%esi) +#define OLD_C 32 + STACK(%esi) +#define OLD_LDC 36 + STACK(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define BUFFER 128(%esp) + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#ifdef PENTIUMM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA); \ + addps %xmm2, %xmm4; \ + movshdup 0 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movsldup 4 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm6; \ + movshdup 4 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + movaps 4 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ + addps %xmm2, %xmm7; \ + movsldup 8 * SIZE + 2 * (address) * SIZE(BB), %xmm2 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movshdup 8 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movsldup 12 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm6; \ + movshdup 12 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + movaps 8 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ + addps %xmm2, %xmm7; \ + movsldup 32 * SIZE + 2 * (address) * SIZE(BB), %xmm2 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movshdup 16 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movsldup 20 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm6; \ + movshdup 20 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + movaps 12 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ + addps %xmm3, %xmm7; \ + movsldup 24 * SIZE + 2 * (address) * SIZE(BB), %xmm3 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movshdup 24 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movsldup 28 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm6; \ + movshdup 28 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + movaps 32 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ + addps %xmm3, %xmm7; \ + movsldup 48 * SIZE + 2 * (address) * SIZE(BB), %xmm3 + +#define KERNEL5(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movshdup 32 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movsldup 36 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm6; \ + movshdup 36 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + movaps 20 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ + addps %xmm2, %xmm7 + +#define KERNEL6(address) \ + movsldup 40 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movshdup 40 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movsldup 44 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm6; \ + movshdup 44 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + movaps 24 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ + addps %xmm2, %xmm7; \ + movsldup 64 * SIZE + 2 * (address) * SIZE(BB), %xmm2 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movshdup 48 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movsldup 52 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm6; \ + movshdup 52 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + movaps 28 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ + addps %xmm3, %xmm7; \ + movsldup 56 * SIZE + 2 * (address) * SIZE(BB), %xmm3 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movshdup 56 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movsldup 60 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm6; \ + movshdup 60 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + movaps 48 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ + addps %xmm3, %xmm7; \ + movsldup 80 * SIZE + 2 * (address) * SIZE(BB), %xmm3 + + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + subl $128 + LOCAL_BUFFER_SIZE, %esp + movl OLD_M, %ebx + andl $-1024, %esp # align stack + + STACK_TOUCHING + + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + movss OLD_ALPHA_R, %xmm0 + movss OLD_ALPHA_I, %xmm1 + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl OLD_B, %edi + movl OLD_C, %ebx + + unpcklps %xmm1, %xmm0 + movlhps %xmm0, %xmm0 + + movaps %xmm0, ALPHA + + movl %ebx, C + movl OLD_LDC, LDC +#ifdef TRMMKERNEL + movss %xmm4, OFFSET + movss %xmm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + sall $ZBASE_SHIFT, LDC + + sarl $2, %eax + movl %eax, J + jle .L40 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + + movl K, %eax + sarl $2, %eax + jle .L05 + ALIGN_4 + +.L02: + movddup 0 * SIZE(%edi), %xmm0 + movddup 2 * SIZE(%edi), %xmm1 + movddup 4 * SIZE(%edi), %xmm2 + movddup 6 * SIZE(%edi), %xmm3 + movddup 8 * SIZE(%edi), %xmm4 + movddup 10 * SIZE(%edi), %xmm5 + movddup 12 * SIZE(%edi), %xmm6 + movddup 14 * SIZE(%edi), %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) + +# prefetcht1 128 * SIZE(%ecx) + prefetcht0 112 * SIZE(%edi) + + addl $16 * SIZE, %edi + addl $32 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L05: + movl K, %eax + andl $3, %eax + BRANCH + jle .L10 + ALIGN_2 + +.L06: + movddup 0 * SIZE(%edi), %xmm0 + movddup 2 * SIZE(%edi), %xmm1 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + + addl $4 * SIZE, %edi + addl $8 * SIZE, %ecx + decl %eax + jne .L06 + ALIGN_4 + +.L10: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsldup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movsldup 16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + + prefetchnta 4 * SIZE(%esi) + prefetchnta 4 * SIZE(%esi, LDC) + prefetchnta 4 * SIZE(%esi, LDC, 2) + prefetchnta 4 * SIZE(%esi, %eax) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + +#if 1 + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) +#if 1 + cmpl $128 * 8, %eax + jle .L12 + KERNEL1(32 * 8) + KERNEL2(32 * 8) + KERNEL3(32 * 8) + KERNEL4(32 * 8) + KERNEL5(32 * 8) + KERNEL6(32 * 8) + KERNEL7(32 * 8) + KERNEL8(32 * 8) + cmpl $128 * 9, %eax + jle .L12 + KERNEL1(32 * 9) + KERNEL2(32 * 9) + KERNEL3(32 * 9) + KERNEL4(32 * 9) + KERNEL5(32 * 9) + KERNEL6(32 * 9) + KERNEL7(32 * 9) + KERNEL8(32 * 9) + cmpl $128 * 10, %eax + jle .L12 + KERNEL1(32 * 10) + KERNEL2(32 * 10) + KERNEL3(32 * 10) + KERNEL4(32 * 10) + KERNEL5(32 * 10) + KERNEL6(32 * 10) + KERNEL7(32 * 10) + KERNEL8(32 * 10) + cmpl $128 * 11, %eax + jle .L12 + KERNEL1(32 * 11) + KERNEL2(32 * 11) + KERNEL3(32 * 11) + KERNEL4(32 * 11) + KERNEL5(32 * 11) + KERNEL6(32 * 11) + KERNEL7(32 * 11) + KERNEL8(32 * 11) + cmpl $128 * 12, %eax + jle .L12 + KERNEL1(32 * 12) + KERNEL2(32 * 12) + KERNEL3(32 * 12) + KERNEL4(32 * 12) + KERNEL5(32 * 12) + KERNEL6(32 * 12) + KERNEL7(32 * 12) + KERNEL8(32 * 12) + cmpl $128 * 13, %eax + jle .L12 + KERNEL1(32 * 13) + KERNEL2(32 * 13) + KERNEL3(32 * 13) + KERNEL4(32 * 13) + KERNEL5(32 * 13) + KERNEL6(32 * 13) + KERNEL7(32 * 13) + KERNEL8(32 * 13) + cmpl $128 * 14, %eax + jle .L12 + KERNEL1(32 * 14) + KERNEL2(32 * 14) + KERNEL3(32 * 14) + KERNEL4(32 * 14) + KERNEL5(32 * 14) + KERNEL6(32 * 14) + KERNEL7(32 * 14) + KERNEL8(32 * 14) + cmpl $128 * 15, %eax + jle .L12 + KERNEL1(32 * 15) + KERNEL2(32 * 15) + KERNEL3(32 * 15) + KERNEL4(32 * 15) + KERNEL5(32 * 15) + KERNEL6(32 * 15) + KERNEL7(32 * 15) + KERNEL8(32 * 15) +#else + addl $128 * 4 * SIZE, BB + addl $128 * 2 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 +#endif + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB + ALIGN_4 +#else + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L12 + ALIGN_4 +#endif + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 0 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movsldup 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movshdup 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsldup 8 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 2 * SIZE(%esi, LDC), %xmm0 + movsd 4 * SIZE(%esi, LDC), %xmm1 + movhps 6 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x50, %xmm5, %xmm2 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 2 * SIZE(%esi, LDC) + movlps %xmm1, 4 * SIZE(%esi, LDC) + movhps %xmm1, 6 * SIZE(%esi, LDC) + + movsd 0 * SIZE(%esi, LDC, 2), %xmm0 + movhps 2 * SIZE(%esi, LDC, 2), %xmm0 + movsd 4 * SIZE(%esi, LDC, 2), %xmm1 + movhps 6 * SIZE(%esi, LDC, 2), %xmm1 + + pshufd $0x50, %xmm6, %xmm2 + pshufd $0xfa, %xmm6, %xmm6 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm6 + + addps %xmm2, %xmm0 + addps %xmm6, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC, 2) + movhps %xmm0, 2 * SIZE(%esi, LDC, 2) + movlps %xmm1, 4 * SIZE(%esi, LDC, 2) + movhps %xmm1, 6 * SIZE(%esi, LDC, 2) + + movsd 0 * SIZE(%esi, %eax), %xmm0 + movhps 2 * SIZE(%esi, %eax), %xmm0 + movsd 4 * SIZE(%esi, %eax), %xmm1 + movhps 6 * SIZE(%esi, %eax), %xmm1 + + pshufd $0x50, %xmm7, %xmm2 + pshufd $0xfa, %xmm7, %xmm7 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm7 + + addps %xmm2, %xmm0 + addps %xmm7, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, %eax) + movhps %xmm0, 2 * SIZE(%esi, %eax) + movlps %xmm1, 4 * SIZE(%esi, %eax) + movhps %xmm1, 6 * SIZE(%esi, %eax) + + addl $8 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + testl $2, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 12 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 32 * SIZE(BB), %xmm2 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm0, %xmm3 + movddup 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 28 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm0, %xmm3 + movddup 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movsd 48 * SIZE(BB), %xmm3 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 36 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm1, %xmm2 + movddup 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movsd 40 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 44 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm1, %xmm2 + movddup 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movsd 64 * SIZE(BB), %xmm2 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 52 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 56 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 60 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC), %xmm1 + movhps 2 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 0 * SIZE(%esi, LDC) + movhps %xmm1, 2 * SIZE(%esi, LDC) + + movsd 0 * SIZE(%esi, LDC, 2), %xmm0 + movhps 2 * SIZE(%esi, LDC, 2), %xmm0 + movsd 0 * SIZE(%esi, %eax), %xmm1 + movhps 2 * SIZE(%esi, %eax), %xmm1 + + pshufd $0x50, %xmm5, %xmm2 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC, 2) + movhps %xmm0, 2 * SIZE(%esi, LDC, 2) + movlps %xmm1, 0 * SIZE(%esi, %eax) + movhps %xmm1, 2 * SIZE(%esi, %eax) + + addl $4 * SIZE, %esi # coffset += 2 + ALIGN_4 + +.L30: + testl $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + + movss 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movss 4 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + shufps $0, %xmm0, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + movhps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 8 * SIZE(BB), %xmm2 + shufps $0, %xmm0, %xmm0 + movhps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movss 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movhps 20 * SIZE(BB), %xmm3 + shufps $0, %xmm0, %xmm0 + movsd 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + movss 3 * SIZE(AA), %xmm0 + addps %xmm3, %xmm4 + movsd 24 * SIZE(BB), %xmm3 + shufps $0, %xmm0, %xmm0 + movhps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movss 8 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movsd 48 * SIZE(BB), %xmm3 + shufps $0, %xmm1, %xmm1 + movhps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movss 5 * SIZE(AA), %xmm1 + addps %xmm2, %xmm4 + movsd 40 * SIZE(BB), %xmm2 + shufps $0, %xmm1, %xmm1 + movhps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movss 6 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movsd 64 * SIZE(BB), %xmm2 + shufps $0, %xmm1, %xmm1 + movhps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movss 7 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 56 * SIZE(BB), %xmm3 + shufps $0, %xmm1, %xmm1 + movhps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movss 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 80 * SIZE(BB), %xmm3 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + shufps $0, %xmm0, %xmm0 + movhps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 8 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + leal (LDC, LDC, 2), %eax + + addps %xmm5, %xmm4 + + movsd (%esi), %xmm0 + movhps (%esi, LDC), %xmm0 + movsd (%esi, LDC, 2), %xmm1 + movhps (%esi, %eax), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, (%esi) + movhps %xmm0, (%esi, LDC) + movlps %xmm1, (%esi, LDC, 2) + movhps %xmm1, (%esi, %eax) + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leal (, LDC, 4), %eax + addl %eax, C # c += 4 * ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L40: + testl $2, N + je .L80 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + leal BUFFER, %ecx + sarl $3, %eax + jle .L45 + ALIGN_4 + +.L42: + movddup 0 * SIZE(%edi), %xmm0 + movddup 2 * SIZE(%edi), %xmm1 + movddup 4 * SIZE(%edi), %xmm2 + movddup 6 * SIZE(%edi), %xmm3 + movddup 8 * SIZE(%edi), %xmm4 + movddup 10 * SIZE(%edi), %xmm5 + movddup 12 * SIZE(%edi), %xmm6 + movddup 14 * SIZE(%edi), %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) + +# prefetcht1 128 * SIZE(%ecx) + prefetcht0 112 * SIZE(%edi) + + addl $16 * SIZE, %edi + addl $32 * SIZE, %ecx + decl %eax + jne .L42 + ALIGN_4 + +.L45: + movl K, %eax + andl $7, %eax + BRANCH + jle .L50 + ALIGN_4 + +.L46: + movddup 0 * SIZE(%edi), %xmm0 + movaps %xmm0, 0 * SIZE(%ecx) + + addl $2 * SIZE, %edi + addl $4 * SIZE, %ecx + decl %eax + jne .L46 + ALIGN_4 + +.L50: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsldup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movsldup 16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + prefetcht2 4 * SIZE(%esi) + prefetcht2 4 * SIZE(%esi, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + movshdup 0 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsldup 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsldup 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 12 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsldup 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsldup 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movshdup 16 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsldup 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movshdup 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsldup 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movshdup 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 28 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsldup 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movshdup 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 48 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsldup 48 * SIZE(BB), %xmm3 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 0 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsldup 4 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 2 * SIZE(%esi, LDC), %xmm0 + movsd 4 * SIZE(%esi, LDC), %xmm1 + movhps 6 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x50, %xmm5, %xmm2 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 2 * SIZE(%esi, LDC) + movlps %xmm1, 4 * SIZE(%esi, LDC) + movhps %xmm1, 6 * SIZE(%esi, LDC) + + addl $8 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L51 + ALIGN_4 + +.L60: + testl $2, M + je .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + movddup 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 12 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 32 * SIZE(BB), %xmm2 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 28 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 48 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + addps %xmm5, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC), %xmm1 + movhps 2 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 0 * SIZE(%esi, LDC) + movhps %xmm1, 2 * SIZE(%esi, LDC) + + addl $4 * SIZE, %esi + ALIGN_4 + +.L70: + testl $1, M + je .L79 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + movss 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movss 4 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + movss 1 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + shufps $0, %xmm0, %xmm0 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movss 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + shufps $0, %xmm0, %xmm0 + movsd 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movss 3 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + shufps $0, %xmm0, %xmm0 + movsd 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movss 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 32 * SIZE(BB), %xmm2 + shufps $0, %xmm1, %xmm1 + mulps %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + shufps $0, %xmm1, %xmm1 + movsd 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movss 6 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + shufps $0, %xmm1, %xmm1 + movsd 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movss 7 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + shufps $0, %xmm1, %xmm1 + movsd 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movss 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 48 * SIZE(BB), %xmm3 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + + addl $ 1 * SIZE, AA + addl $ 4 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm5, %xmm4 + + movsd (%esi), %xmm0 + movhps (%esi, LDC), %xmm0 + + pshufd $0x50, %xmm4, %xmm2 + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, (%esi) + movhps %xmm0, (%esi, LDC) + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leal (, LDC, 2), %eax + addl %eax, C + ALIGN_4 + +.L80: + testl $1, N + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + leal BUFFER, %ecx + sarl $3, %eax + jle .L85 + ALIGN_4 + +.L82: + movss 0 * SIZE(%edi), %xmm0 + movss 1 * SIZE(%edi), %xmm1 + movss 2 * SIZE(%edi), %xmm2 + movss 3 * SIZE(%edi), %xmm3 + movss 4 * SIZE(%edi), %xmm4 + movss 5 * SIZE(%edi), %xmm5 + movss 6 * SIZE(%edi), %xmm6 + movss 7 * SIZE(%edi), %xmm7 + + movss %xmm0, 0 * SIZE(%ecx) + movss %xmm0, 1 * SIZE(%ecx) + movss %xmm1, 2 * SIZE(%ecx) + movss %xmm1, 3 * SIZE(%ecx) + movss %xmm2, 4 * SIZE(%ecx) + movss %xmm2, 5 * SIZE(%ecx) + movss %xmm3, 6 * SIZE(%ecx) + movss %xmm3, 7 * SIZE(%ecx) + movss %xmm4, 8 * SIZE(%ecx) + movss %xmm4, 9 * SIZE(%ecx) + movss %xmm5, 10 * SIZE(%ecx) + movss %xmm5, 11 * SIZE(%ecx) + movss %xmm6, 12 * SIZE(%ecx) + movss %xmm6, 13 * SIZE(%ecx) + movss %xmm7, 14 * SIZE(%ecx) + movss %xmm7, 15 * SIZE(%ecx) + +# prefetcht1 128 * SIZE(%ecx) + prefetcht0 112 * SIZE(%edi) + + addl $ 8 * SIZE, %edi + addl $16 * SIZE, %ecx + decl %eax + jne .L82 + ALIGN_4 + +.L85: + movl K, %eax + andl $7, %eax + BRANCH + jle .L90 + ALIGN_4 + +.L86: + movss 0 * SIZE(%edi), %xmm0 + movss %xmm0, 0 * SIZE(%ecx) + movss %xmm0, 1 * SIZE(%ecx) + + addl $1 * SIZE, %edi + addl $2 * SIZE, %ecx + decl %eax + jne .L86 + ALIGN_4 + +.L90: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 1), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + movaps 16 * SIZE(AA), %xmm1 + movddup 8 * SIZE(BB), %xmm3 + +#ifdef HAVE_3DNOW + prefetchw 4 * SIZE(%esi) +#elif defined(HAVE_SSE) || defined(HAVE_SSE2) + prefetcht2 4 * SIZE(%esi) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm2, %xmm4 + movddup 2 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movddup 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 12 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movddup 6 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movddup 16 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movddup 10 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movddup 12 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 28 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movddup 14 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 48 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movddup 24 * SIZE(BB), %xmm3 + + addl $32 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movddup 2 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: + addps %xmm5, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + addl $8 * SIZE, %esi + decl %ebx # i -- + jg .L91 + ALIGN_4 + +.L100: + testl $2, M + je .L110 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 1), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movsd 0 * SIZE(AA), %xmm0 + movsd 0 * SIZE(BB), %xmm2 + movsd 8 * SIZE(AA), %xmm1 + movsd 8 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L105 + ALIGN_4 + +.L102: + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm2, %xmm4 + movsd 2 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 6 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 16 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 10 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 12 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 14 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L105: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L106 + ALIGN_4 + +.L108: + addps %xmm5, %xmm4 + movhlps %xmm4, %xmm5 + addps %xmm5, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + pshufd $0x50, %xmm4, %xmm2 + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + + addl $4 * SIZE, %esi # coffset += 2 + ALIGN_4 + +.L110: + testl $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + + movss 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movss 0 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + movss 4 * SIZE(AA), %xmm1 + movss 8 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L115 + ALIGN_4 + +.L112: + mulss %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 2 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + movss 2 * SIZE(AA), %xmm0 + addss %xmm2, %xmm5 + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + movss 3 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 6 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + movss 8 * SIZE(AA), %xmm0 + addss %xmm2, %xmm5 + movss 16 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + movss 10 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + movss 6 * SIZE(AA), %xmm1 + addss %xmm3, %xmm5 + movss 12 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + movss 7 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + movss 14 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + movss 12 * SIZE(AA), %xmm1 + addss %xmm3, %xmm5 + movss 24 * SIZE(BB), %xmm3 + + addl $ 8 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 2 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L116 + ALIGN_4 + +.L118: + addss %xmm5, %xmm4 + + movsd (%esi), %xmm0 + + pshufd $0x50, %xmm4, %xmm2 + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, (%esi) + ALIGN_4 + +.L999: + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_8x2_core2.S b/kernel/x86/zgemm3m_kernel_8x2_core2.S new file mode 100644 index 0000000..9a28c8e --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_8x2_core2.S @@ -0,0 +1,1628 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 20 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define BUFFER 512(%esp) + +#define PREFETCH_R (8 * 16 + 0) +#define PREFETCH_W (PREFETCH_R * 2) + +#define PREFETCHSIZE (8 * 16 + 4) +#define PREFETCH prefetcht0 + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define C1 %esi +#define I %ebx + +#ifdef OPTERON +#define MOVSD movlps +#else +#define MOVSD movsd +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $512 + LOCAL_BUFFER_SIZE, %esp + andl $-4096, %esp # align stack + + STACK_TOUCHING + + movl STACK_M, %ebx + movl STACK_N, %eax + movl STACK_K, %ecx + movl STACK_A, %edx + movss STACK_ALPHA_R, %xmm0 + movss STACK_ALPHA_I, %xmm1 + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl STACK_B, B + movl STACK_C, %ebx + + unpcklps %xmm1, %xmm0 + + movlps %xmm0, 0 + ALPHA + movlps %xmm0, 8 + ALPHA + + movl %ebx, C + movl STACK_LDC, LDC + + subl $-32 * SIZE, A + subl $-32 * SIZE, B + + sall $ZBASE_SHIFT, LDC + + sarl $1, %eax + movl %eax, J + jle .L50 + ALIGN_4 + +.L01: + leal 32 * SIZE + BUFFER, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sarl $2, %eax + jle .L05 + ALIGN_4 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + movss -32 * SIZE(B), %xmm0 + movss -31 * SIZE(B), %xmm1 + movss -30 * SIZE(B), %xmm2 + movss -29 * SIZE(B), %xmm3 + movss -28 * SIZE(B), %xmm4 + movss -27 * SIZE(B), %xmm5 + movss -26 * SIZE(B), %xmm6 + movss -25 * SIZE(B), %xmm7 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BB) + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + prefetcht0 (PREFETCH_W + 16) * SIZE(BB) + movaps %xmm0, -32 * SIZE(BB) + movaps %xmm1, -28 * SIZE(BB) + movaps %xmm2, -24 * SIZE(BB) + movaps %xmm3, -20 * SIZE(BB) + movaps %xmm4, -16 * SIZE(BB) + movaps %xmm5, -12 * SIZE(BB) + movaps %xmm6, -8 * SIZE(BB) + movaps %xmm7, -4 * SIZE(BB) + + addl $ 8 * SIZE, B + subl $-32 * SIZE, BB + decl %eax + jne .L02 + ALIGN_4 + +.L05: + movl K, %eax + andl $3, %eax + BRANCH + jle .L10 + ALIGN_4 + +.L06: + movss -32 * SIZE(B), %xmm0 + movss -31 * SIZE(B), %xmm1 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + + movaps %xmm0, -32 * SIZE(BB) + movaps %xmm1, -28 * SIZE(BB) + addl $2 * SIZE, B + addl $8 * SIZE, BB + decl %eax + jne .L06 + ALIGN_4 + +.L10: + movl C, C1 + movl A, AA + movl M, I + sarl $3, I + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -16 * SIZE(AA), %xmm3 + pxor %xmm6, %xmm6 + prefetcht0 7 * SIZE(C1) + pxor %xmm7, %xmm7 + prefetcht0 7 * SIZE(C1, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $8, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + addps %xmm0, %xmm5 + movaps -28 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + + movaps -24 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + addps %xmm0, %xmm5 + movaps -20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps 0 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + + movaps -16 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps -12 * SIZE(AA), %xmm3 + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm1 + movaps -8 * SIZE(AA), %xmm3 + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + + movaps -8 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps -4 * SIZE(AA), %xmm3 + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm1 + movaps 16 * SIZE(AA), %xmm3 + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + movaps 0 * SIZE(BB), %xmm1 + + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps 4 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + + movaps 8 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps 12 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + + movaps 16 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm4 + movaps 20 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 20 * SIZE(AA), %xmm3 + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm2, %xmm6 + movaps 24 * SIZE(AA), %xmm3 + addps %xmm1, %xmm7 + + movaps 24 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm4 + movaps 28 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 28 * SIZE(AA), %xmm3 + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm1 + subl $-64 * SIZE, BB + movaps 48 * SIZE(AA), %xmm3 + subl $-64 * SIZE, AA + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + movaps -32 * SIZE(BB), %xmm1 + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + addps %xmm0, %xmm5 + movaps -28 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + movaps -24 * SIZE(BB), %xmm1 + + addl $8 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + movaps ALPHA, %xmm3 + + MOVSD 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + MOVSD 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + MOVSD 8 * SIZE(%esi), %xmm0 + movhps 10 * SIZE(%esi), %xmm0 + MOVSD 12 * SIZE(%esi), %xmm1 + movhps 14 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm6, %xmm2 + pshufd $0xfa, %xmm6, %xmm6 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm6 + + addps %xmm2, %xmm0 + addps %xmm6, %xmm1 + + movlps %xmm0, 8 * SIZE(%esi) + movhps %xmm0, 10 * SIZE(%esi) + movlps %xmm1, 12 * SIZE(%esi) + movhps %xmm1, 14 * SIZE(%esi) + + MOVSD 0 * SIZE(%esi, LDC), %xmm0 + movhps 2 * SIZE(%esi, LDC), %xmm0 + MOVSD 4 * SIZE(%esi, LDC), %xmm1 + movhps 6 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x50, %xmm5, %xmm2 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 2 * SIZE(%esi, LDC) + movlps %xmm1, 4 * SIZE(%esi, LDC) + movhps %xmm1, 6 * SIZE(%esi, LDC) + + MOVSD 8 * SIZE(%esi, LDC), %xmm0 + movhps 10 * SIZE(%esi, LDC), %xmm0 + MOVSD 12 * SIZE(%esi, LDC), %xmm1 + movhps 14 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x50, %xmm7, %xmm2 + pshufd $0xfa, %xmm7, %xmm7 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm7 + + addps %xmm2, %xmm0 + addps %xmm7, %xmm1 + + movlps %xmm0, 8 * SIZE(%esi, LDC) + movhps %xmm0, 10 * SIZE(%esi, LDC) + movlps %xmm1, 12 * SIZE(%esi, LDC) + movhps %xmm1, 14 * SIZE(%esi, LDC) + + addl $16 * SIZE, C1 + decl I + jg .L11 + ALIGN_4 + +.L20: + movl M, I + testl $4, I + jle .L30 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movaps -16 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movaps -16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BB), %xmm0 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + addps %xmm0, %xmm5 + movaps -28 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BB), %xmm0 + addps %xmm1, %xmm6 + movaps 0 * SIZE(BB), %xmm1 + addps %xmm0, %xmm7 + movaps -24 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps -12 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps -8 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps -20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps -4 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 0 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + mulps 4 * SIZE(BB), %xmm2 + addps %xmm1, %xmm4 + movaps 8 * SIZE(BB), %xmm1 + addps %xmm2, %xmm5 + movaps -12 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm1 + mulps 12 * SIZE(BB), %xmm2 + addps %xmm1, %xmm6 + movaps 32 * SIZE(BB), %xmm1 + addps %xmm2, %xmm7 + movaps -8 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm3 + mulps 20 * SIZE(BB), %xmm2 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm2, %xmm5 + movaps -4 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm3 + mulps 28 * SIZE(BB), %xmm2 + addps %xmm3, %xmm6 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm2, %xmm7 + movaps 16 * SIZE(AA), %xmm2 + + subl $-32 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BB), %xmm0 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + addps %xmm0, %xmm5 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 2 * SIZE(%esi, LDC), %xmm0 + movsd 4 * SIZE(%esi, LDC), %xmm1 + movhps 6 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x50, %xmm5, %xmm2 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 2 * SIZE(%esi, LDC) + movlps %xmm1, 4 * SIZE(%esi, LDC) + movhps %xmm1, 6 * SIZE(%esi, LDC) + + addl $8 * SIZE, C1 + ALIGN_4 + +.L30: + movl M, I + testl $2, I + jle .L40 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -24 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movsd -16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BB), %xmm0 + addps %xmm1, %xmm4 + movsd -24 * SIZE(BB), %xmm1 + addps %xmm0, %xmm5 + movsd -30 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BB), %xmm0 + addps %xmm1, %xmm6 + movsd 0 * SIZE(BB), %xmm1 + addps %xmm0, %xmm7 + movsd -28 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps -12 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movsd -8 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movsd -26 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps -4 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movsd 16 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movsd -16 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + mulps 4 * SIZE(BB), %xmm2 + addps %xmm1, %xmm4 + movsd 8 * SIZE(BB), %xmm1 + addps %xmm2, %xmm5 + movsd -22 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm1 + mulps 12 * SIZE(BB), %xmm2 + addps %xmm1, %xmm6 + movsd 32 * SIZE(BB), %xmm1 + addps %xmm2, %xmm7 + movsd -20 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm3 + mulps 20 * SIZE(BB), %xmm2 + addps %xmm3, %xmm4 + movsd 24 * SIZE(BB), %xmm3 + addps %xmm2, %xmm5 + movsd -18 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm3 + mulps 28 * SIZE(BB), %xmm2 + addps %xmm3, %xmm6 + movsd 48 * SIZE(BB), %xmm3 + addps %xmm2, %xmm7 + movsd -8 * SIZE(AA), %xmm2 + + subl $-16 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BB), %xmm0 + addps %xmm1, %xmm4 + movsd -24 * SIZE(BB), %xmm1 + addps %xmm0, %xmm5 + movsd -30 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + pshufd $0x50, %xmm4, %xmm2 + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 2 * SIZE(%esi, LDC), %xmm0 + + pshufd $0x50, %xmm5, %xmm2 + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 2 * SIZE(%esi, LDC) + + addl $4 * SIZE, %esi + ALIGN_4 + +.L40: + movl M, I + testl $1, I + jle .L49 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB /* because it's doubled */ +#endif + + movss -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movss -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movss -28 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movss -16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + mulss %xmm0, %xmm1 + mulss -28 * SIZE(BB), %xmm0 + addss %xmm1, %xmm4 + movss -24 * SIZE(BB), %xmm1 + addss %xmm0, %xmm5 + movss -31 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm1 + mulss -20 * SIZE(BB), %xmm0 + addss %xmm1, %xmm6 + movss 0 * SIZE(BB), %xmm1 + addss %xmm0, %xmm7 + movss -30 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss -12 * SIZE(BB), %xmm0 + addss %xmm3, %xmm4 + movss -8 * SIZE(BB), %xmm3 + addss %xmm0, %xmm5 + movss -29 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss -4 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 16 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss -24 * SIZE(AA), %xmm0 + mulss %xmm2, %xmm1 + mulss 4 * SIZE(BB), %xmm2 + addss %xmm1, %xmm4 + movss 8 * SIZE(BB), %xmm1 + addss %xmm2, %xmm5 + movss -27 * SIZE(AA), %xmm2 + mulss %xmm2, %xmm1 + mulss 12 * SIZE(BB), %xmm2 + addss %xmm1, %xmm6 + movss 32 * SIZE(BB), %xmm1 + addss %xmm2, %xmm7 + movss -26 * SIZE(AA), %xmm2 + mulss %xmm2, %xmm3 + mulss 20 * SIZE(BB), %xmm2 + addss %xmm3, %xmm4 + movss 24 * SIZE(BB), %xmm3 + addss %xmm2, %xmm5 + movss -25 * SIZE(AA), %xmm2 + mulss %xmm2, %xmm3 + mulss 28 * SIZE(BB), %xmm2 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm2, %xmm7 + movss -20 * SIZE(AA), %xmm2 + + subl $-8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L48 + ALIGN_4 + +.L46: + mulss %xmm0, %xmm1 + mulss -28 * SIZE(BB), %xmm0 + addss %xmm1, %xmm4 + movss -24 * SIZE(BB), %xmm1 + addss %xmm0, %xmm5 + movss -31 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addss %xmm6, %xmm4 + addss %xmm7, %xmm5 + + movsd (%esi), %xmm0 + movhps (%esi, LDC), %xmm0 + + shufps $0, %xmm5, %xmm4 + + mulps %xmm3, %xmm4 + addps %xmm4, %xmm0 + + movlps %xmm0, (%esi) + movhps %xmm0, (%esi, LDC) + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + addl %eax, C + decl J + jg .L01 + ALIGN_4 + +.L50: + movl N, %eax + testl $1, %eax + jle .L999 + ALIGN_4 + +.L51: + leal 32 * SIZE + BUFFER, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sarl $3, %eax + jle .L55 + ALIGN_4 + +.L52: + movss -32 * SIZE(B), %xmm0 + movss -31 * SIZE(B), %xmm1 + movss -30 * SIZE(B), %xmm2 + movss -29 * SIZE(B), %xmm3 + movss -28 * SIZE(B), %xmm4 + movss -27 * SIZE(B), %xmm5 + movss -26 * SIZE(B), %xmm6 + movss -25 * SIZE(B), %xmm7 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, -32 * SIZE(BB) + movaps %xmm1, -28 * SIZE(BB) + movaps %xmm2, -24 * SIZE(BB) + movaps %xmm3, -20 * SIZE(BB) + movaps %xmm4, -16 * SIZE(BB) + movaps %xmm5, -12 * SIZE(BB) + movaps %xmm6, -8 * SIZE(BB) + movaps %xmm7, -4 * SIZE(BB) + + addl $ 8 * SIZE, B + subl $-32 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: + movl K, %eax + andl $7, %eax + BRANCH + jle .L60 + ALIGN_4 + +.L56: + movss -32 * SIZE(B), %xmm0 + shufps $0, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(BB) + + addl $1 * SIZE, B + addl $4 * SIZE, BB + decl %eax + jne .L56 + ALIGN_4 + +.L60: + movl C, C1 + movl A, AA + movl M, I + sarl $3, I + jle .L70 + ALIGN_4 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB /* because it's doubled */ +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movaps -16 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movaps -16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + prefetcht0 3 * SIZE(C1) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $8, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm1, %xmm0 + mulps -28 * SIZE(AA), %xmm1 + addps %xmm0, %xmm4 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm1, %xmm6 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + mulps -20 * SIZE(AA), %xmm1 + addps %xmm0, %xmm5 + movaps 0 * SIZE(AA), %xmm0 + addps %xmm1, %xmm7 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm2 + mulps -12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm4 + movaps -8 * SIZE(AA), %xmm2 + addps %xmm1, %xmm6 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm2 + mulps -4 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 16 * SIZE(AA), %xmm2 + addps %xmm1, %xmm7 + movaps 0 * SIZE(BB), %xmm1 + mulps %xmm3, %xmm0 + mulps 4 * SIZE(AA), %xmm3 + addps %xmm0, %xmm4 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm3, %xmm6 + movaps -12 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm0 + mulps 12 * SIZE(AA), %xmm3 + addps %xmm0, %xmm5 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps -8 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm2 + mulps 20 * SIZE(AA), %xmm3 + addps %xmm2, %xmm4 + movaps 24 * SIZE(AA), %xmm2 + addps %xmm3, %xmm6 + movaps -4 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm2 + mulps 28 * SIZE(AA), %xmm3 + addps %xmm2, %xmm5 + movaps 48 * SIZE(AA), %xmm2 + addps %xmm3, %xmm7 + movaps 16 * SIZE(BB), %xmm3 + + addl $ 64 * SIZE, AA + subl $-32 * SIZE, BB + decl %eax + jne .L62 + ALIGN_4 + +.L65: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm1, %xmm0 + mulps -28 * SIZE(AA), %xmm1 + addps %xmm0, %xmm4 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm1, %xmm6 + movaps -28 * SIZE(BB), %xmm1 + + addl $8 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + movsd 8 * SIZE(%esi), %xmm0 + movhps 10 * SIZE(%esi), %xmm0 + movsd 12 * SIZE(%esi), %xmm1 + movhps 14 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm6, %xmm2 + pshufd $0xfa, %xmm6, %xmm6 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm6 + + addps %xmm2, %xmm0 + addps %xmm6, %xmm1 + + movlps %xmm0, 8 * SIZE(%esi) + movhps %xmm0, 10 * SIZE(%esi) + movlps %xmm1, 12 * SIZE(%esi) + movhps %xmm1, 14 * SIZE(%esi) + + addl $16 * SIZE, C1 + decl I + jg .L61 + ALIGN_4 + +.L70: + movl M, I + testl $4, I + jle .L80 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB /* because it's doubled */ +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movaps -16 * SIZE(AA), %xmm2 + movaps -16 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm1 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm1, %xmm5 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm1 + movaps -20 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm1 + movaps 0 * SIZE(AA), %xmm0 + addps %xmm1, %xmm5 + movaps 0 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + movaps -12 * SIZE(AA), %xmm2 + addps %xmm3, %xmm4 + movaps -12 * SIZE(BB), %xmm3 + mulps %xmm2, %xmm3 + movaps -8 * SIZE(AA), %xmm2 + addps %xmm3, %xmm5 + movaps -8 * SIZE(BB), %xmm3 + mulps %xmm2, %xmm3 + movaps -4 * SIZE(AA), %xmm2 + addps %xmm3, %xmm4 + movaps -4 * SIZE(BB), %xmm3 + mulps %xmm2, %xmm3 + movaps 16 * SIZE(AA), %xmm2 + addps %xmm3, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + + subl $-32 * SIZE, AA + subl $-32 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm5, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + addl $8 * SIZE, %esi + ALIGN_4 + +.L80: + movl M, I + testl $2, I + jle .L90 + +.L81: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB /* because it's doubled */ +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -16 * SIZE(BB), %xmm3 + movsd -24 * SIZE(AA), %xmm2 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L85 + ALIGN_4 + +.L82: + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm1 + movsd -28 * SIZE(AA), %xmm0 + addps %xmm1, %xmm5 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm1 + movsd -26 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm1 + movsd -16 * SIZE(AA), %xmm0 + addps %xmm1, %xmm5 + movsd -0 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + movsd -22 * SIZE(AA), %xmm2 + addps %xmm3, %xmm4 + movsd -12 * SIZE(BB), %xmm3 + mulps %xmm2, %xmm3 + movsd -20 * SIZE(AA), %xmm2 + addps %xmm3, %xmm5 + movsd -8 * SIZE(BB), %xmm3 + mulps %xmm2, %xmm3 + movsd -18 * SIZE(AA), %xmm2 + addps %xmm3, %xmm4 + movsd -4 * SIZE(BB), %xmm3 + mulps %xmm2, %xmm3 + movsd -8 * SIZE(AA), %xmm2 + addps %xmm3, %xmm5 + movsd 16 * SIZE(BB), %xmm3 + + subl $-16 * SIZE, AA + subl $-32 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L88 + ALIGN_4 + +.L86: + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -28 * SIZE(BB), %xmm1 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addps %xmm5, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + pshufd $0x50, %xmm4, %xmm2 + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + + addl $4 * SIZE, %esi + ALIGN_4 + +.L90: + movl M, I + testl $1, I + jle .L99 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movss -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movss -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movss -16 * SIZE(BB), %xmm3 + movss -28 * SIZE(AA), %xmm2 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + mulss %xmm0, %xmm1 + movss -31 * SIZE(AA), %xmm0 + addss %xmm1, %xmm4 + movss -28 * SIZE(BB), %xmm1 + mulss %xmm0, %xmm1 + movss -30 * SIZE(AA), %xmm0 + addss %xmm1, %xmm5 + movss -24 * SIZE(BB), %xmm1 + mulss %xmm0, %xmm1 + movss -29 * SIZE(AA), %xmm0 + addss %xmm1, %xmm4 + movss -20 * SIZE(BB), %xmm1 + mulss %xmm0, %xmm1 + movss -24 * SIZE(AA), %xmm0 + addss %xmm1, %xmm5 + movss -0 * SIZE(BB), %xmm1 + mulss %xmm2, %xmm3 + movss -27 * SIZE(AA), %xmm2 + addss %xmm3, %xmm4 + movss -12 * SIZE(BB), %xmm3 + mulss %xmm2, %xmm3 + movss -26 * SIZE(AA), %xmm2 + addss %xmm3, %xmm5 + movss -8 * SIZE(BB), %xmm3 + mulss %xmm2, %xmm3 + movss -25 * SIZE(AA), %xmm2 + addss %xmm3, %xmm4 + movss -4 * SIZE(BB), %xmm3 + mulss %xmm2, %xmm3 + movss -20 * SIZE(AA), %xmm2 + addss %xmm3, %xmm5 + movss 16 * SIZE(BB), %xmm3 + + subl $ -8 * SIZE, AA + subl $-32 * SIZE, BB + decl %eax + jne .L92 + ALIGN_4 + +.L95: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulss %xmm0, %xmm1 + movss -31 * SIZE(AA), %xmm0 + addss %xmm1, %xmm4 + movss -28 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: + addss %xmm5, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + + pshufd $0x50, %xmm4, %xmm2 + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + ALIGN_4 + +.L99: + addl LDC, C + ALIGN_4 + + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_8x2_sse.S b/kernel/x86/zgemm3m_kernel_8x2_sse.S new file mode 100644 index 0000000..ea66dc1 --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_8x2_sse.S @@ -0,0 +1,2803 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 20 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define PREFETCHSIZE 48 /* for PIII */ + +#define AA %edx +#define BB %ecx + +#ifdef HAVE_SSE2 +#define MOVSD movsd +#define XORPS pxor +#else +#define MOVSD movlps +#define XORPS xorps +#endif + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + mulps 4 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm4; \ + movaps 0 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm5; \ + movaps 4 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ + mulps %xmm0, %xmm2; \ + mulps 4 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 8 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * SIZE * 2(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm4; \ + movaps 8 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm5; \ + movaps 12 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * SIZE * 2(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm1, %xmm3; \ + mulps 20 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm4; \ + movaps 16 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm5; \ + movaps 20 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ + mulps %xmm1, %xmm3; \ + mulps 20 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 24 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * SIZE * 2(AA), %xmm1 + +#define KERNEL4(address) \ + mulps %xmm1, %xmm3; \ + mulps 28 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm4; \ + movaps 24 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm5; \ + movaps 28 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ + mulps %xmm1, %xmm3; \ + mulps 28 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * SIZE * 2(AA), %xmm1 + +#define KERNEL5(address) \ + mulps %xmm0, %xmm2; \ + mulps 36 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm4; \ + movaps 32 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm5; \ + movaps 36 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ + mulps %xmm0, %xmm2; \ + mulps 36 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 40 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 40 * SIZE + (address) * SIZE * 2(AA), %xmm0 + +#define KERNEL6(address) \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm4; \ + movaps 40 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm5; \ + movaps 44 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 64 * SIZE + (address) * SIZE * 2(AA), %xmm0 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm3; \ + mulps 52 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm4; \ + movaps 48 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm5; \ + movaps 52 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ + mulps %xmm1, %xmm3; \ + mulps 52 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 56 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 56 * SIZE + (address) * SIZE * 2(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + mulps 60 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm4; \ + movaps 56 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm5; \ + movaps 60 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ + mulps %xmm1, %xmm3; \ + mulps 60 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 80 * SIZE + (address) * SIZE * 2(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movss STACK_ALPHA_R, %xmm0 + movss STACK_ALPHA_I, %xmm1 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC +#ifdef TRMMKERNEL + movd STACK_OFFT, %mm4 +#endif + + unpcklps %xmm1, %xmm0 + + movlps %xmm0, 0 + ALPHA + movlps %xmm0, 8 + ALPHA + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK +#ifdef TRMMKERNEL + movd %mm4, OFFSET + movd %mm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + sall $ZBASE_SHIFT, LDC + + sarl $1, %eax # j = (n >> 1) + movl %eax, J + jle .L100 + ALIGN_2 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + movl K, %eax + leal BUFFER, %ecx + sarl $2, %eax + jle .L03 + ALIGN_4 + +.L02: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + movss 4 * SIZE(B), %xmm4 + movss 5 * SIZE(B), %xmm5 + movss 6 * SIZE(B), %xmm6 + movss 7 * SIZE(B), %xmm7 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $32 * SIZE, %ecx + decl %eax + BRANCH + jne .L02 + ALIGN_2 + +.L03: + movl K, %eax + andl $3, %eax + BRANCH + jle .L05 + ALIGN_2 + +.L04: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + addl $2 * SIZE, B + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + addl $8 * SIZE, %ecx + decl %eax + jne .L04 + ALIGN_4 + +.L05: + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + sarl $3, %ebx # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: +#ifdef PENTIUM4 +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 16 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + + prefetchnta 7 * SIZE(%esi) + prefetchnta 7 * SIZE(%esi, %ebp) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $8, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + andl $-8, %eax + NOBRANCH + je .L12 + sall $3, %eax + +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $64 * 1, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $64 * 2, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $64 * 3, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $64 * 4, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $64 * 5, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $64 * 6, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $64 * 7, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $64 * 8 * SIZE, AA + addl $64 * 8 * SIZE, BB + subl $64 * 8, %eax + BRANCH + jg .L1X + +.L11: + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 8 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + + prefetchnta 8 * SIZE(%esi) + prefetchnta 8 * SIZE(%esi, %ebp) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $8, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L12 + ALIGN_2 + +.L11: +#ifdef CORE_KATMAI + prefetcht0 PREFETCHSIZE * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 16 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 8 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 16 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 24) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 32) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 36 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 48 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 48 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 40) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 40 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 44 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 56 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 48) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 48 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 52 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 64 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 56) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 60 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 72 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 72 * SIZE(AA), %xmm1 + + addl $64 * SIZE, BB + addl $64 * SIZE, AA + decl %eax + jne .L11 + ALIGN_2 +#endif + +.L12: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + movaps 4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm1 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm1, %xmm5 + movaps 4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm1 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm1, %xmm7 + + addl $8 * SIZE, AA + addl $8 * SIZE, BB + subl $1, %eax + jg .L13 + ALIGN_4 + +.L14: + MOVSD 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + MOVSD 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm4, %xmm2 +#else + movaps %xmm4, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + shufps $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + MOVSD 8 * SIZE(%esi), %xmm0 + movhps 10 * SIZE(%esi), %xmm0 + MOVSD 12 * SIZE(%esi), %xmm1 + movhps 14 * SIZE(%esi), %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm6, %xmm2 +#else + movaps %xmm6, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + shufps $0xfa, %xmm6, %xmm6 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm6 + + addps %xmm2, %xmm0 + addps %xmm6, %xmm1 + + movlps %xmm0, 8 * SIZE(%esi) + movhps %xmm0, 10 * SIZE(%esi) + movlps %xmm1, 12 * SIZE(%esi) + movhps %xmm1, 14 * SIZE(%esi) + + MOVSD 0 * SIZE(%esi, LDC), %xmm0 + movhps 2 * SIZE(%esi, LDC), %xmm0 + MOVSD 4 * SIZE(%esi, LDC), %xmm1 + movhps 6 * SIZE(%esi, LDC), %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm5, %xmm2 +#else + movaps %xmm5, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + shufps $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 2 * SIZE(%esi, LDC) + movlps %xmm1, 4 * SIZE(%esi, LDC) + movhps %xmm1, 6 * SIZE(%esi, LDC) + + MOVSD 8 * SIZE(%esi, LDC), %xmm0 + movhps 10 * SIZE(%esi, LDC), %xmm0 + MOVSD 12 * SIZE(%esi, LDC), %xmm1 + movhps 14 * SIZE(%esi, LDC), %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm7, %xmm2 +#else + movaps %xmm7, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + shufps $0xfa, %xmm7, %xmm7 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm7 + + addps %xmm2, %xmm0 + addps %xmm7, %xmm1 + + movlps %xmm0, 8 * SIZE(%esi, LDC) + movhps %xmm0, 10 * SIZE(%esi, LDC) + movlps %xmm1, 12 * SIZE(%esi, LDC) + movhps %xmm1, 14 * SIZE(%esi, LDC) + + addl $16 * SIZE, %esi + BRANCH + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L30: + movl M, %ebx + andl $7, %ebx + jle .L99 + + testl $4, %ebx + jle .L50 + +#if (L1_DATA_LINESIZE == 64) +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 16 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L32 + ALIGN_2 + +.L31: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 8 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 36 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 40 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 20 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm2 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L31 + ALIGN_2 + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 8 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L32 + ALIGN_2 + +.L31: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 16 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 20 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 48 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 44 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 72 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L31 + ALIGN_2 +#endif + +.L32: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L34 + +.L33: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L33 + ALIGN_4 + +.L34: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + MOVSD 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + MOVSD 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm4, %xmm2 +#else + movaps %xmm4, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + shufps $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + MOVSD 0 * SIZE(%esi, LDC), %xmm0 + movhps 2 * SIZE(%esi, LDC), %xmm0 + MOVSD 4 * SIZE(%esi, LDC), %xmm1 + movhps 6 * SIZE(%esi, LDC), %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm5, %xmm2 +#else + movaps %xmm5, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + shufps $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 2 * SIZE(%esi, LDC) + movlps %xmm1, 4 * SIZE(%esi, LDC) + movhps %xmm1, 6 * SIZE(%esi, LDC) + + addl $8 * SIZE, %esi + ALIGN_2 + +.L50: + testl $2, %ebx + jle .L70 + + +#if (L1_DATA_LINESIZE == 64) +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + MOVSD 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + MOVSD 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + MOVSD 16 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + MOVSD 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + MOVSD 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + MOVSD 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + MOVSD 16 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + MOVSD 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L52 + ALIGN_2 + +.L51: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + MOVSD 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + MOVSD 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + MOVSD 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + MOVSD 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + MOVSD 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + MOVSD 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + MOVSD 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + MOVSD 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + MOVSD 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + MOVSD 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + MOVSD 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + MOVSD 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + MOVSD 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + MOVSD 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + MOVSD 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + MOVSD 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + MOVSD 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + MOVSD 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + MOVSD 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + MOVSD 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + MOVSD 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + MOVSD 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + MOVSD 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + MOVSD 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L51 + ALIGN_2 + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + MOVSD 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + MOVSD 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + MOVSD 8 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + MOVSD 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + MOVSD 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + MOVSD 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + MOVSD 8 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + MOVSD 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L52 + ALIGN_2 + +.L51: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + MOVSD 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + MOVSD 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + MOVSD 16 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + MOVSD 12 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + MOVSD 4 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + MOVSD 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + MOVSD 20 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + MOVSD 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + MOVSD 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + MOVSD 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + MOVSD 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + MOVSD 40 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + MOVSD 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + MOVSD 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + MOVSD 48 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + MOVSD 44 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + MOVSD 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + MOVSD 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + MOVSD 52 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + MOVSD 14 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + MOVSD 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + MOVSD 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + MOVSD 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + MOVSD 72 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L51 + ALIGN_2 +#endif + +.L52: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + MOVSD 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + MOVSD 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + MOVSD 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L53 + ALIGN_4 + +.L54: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + MOVSD 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm4, %xmm2 +#else + movaps %xmm4, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + + MOVSD 0 * SIZE(%esi, LDC), %xmm0 + movhps 2 * SIZE(%esi, LDC), %xmm0 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm5, %xmm2 +#else + movaps %xmm5, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 2 * SIZE(%esi, LDC) + + addl $4 * SIZE, %esi + ALIGN_2 + +.L70: + testl $1, %ebx + jle .L99 + +#if (L1_DATA_LINESIZE == 64) +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movss 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movss 16 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB /* because it's doubled */ + + movss 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movss 16 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L72 + ALIGN_2 + +.L71: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 20 * SIZE(BB), %xmm0 + addss %xmm3, %xmm4 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm5 + movss 3 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + mulss 36 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 40 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 5 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm2 + mulss 44 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 52 * SIZE(BB), %xmm1 + addss %xmm3, %xmm4 + movss 56 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 7 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 60 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L71 + ALIGN_2 + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movss 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movss 8 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB /* because it's doubled */ + + movss 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movss 8 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L72 + ALIGN_2 + +.L71: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 16 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm2 + mulss 20 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 3 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 40 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + mulss 36 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 48 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 5 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 44 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 56 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm2 + mulss 52 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 64 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 7 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 60 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 72 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L71 + ALIGN_2 +#endif + +.L72: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L74 + +.L73: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L73 + ALIGN_4 + +.L74: + addss %xmm6, %xmm4 + addss %xmm7, %xmm5 + + MOVSD 0 * SIZE(%esi), %xmm0 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm4, %xmm2 +#else + movaps %xmm4, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + + MOVSD 0 * SIZE(%esi, LDC), %xmm0 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm5, %xmm2 +#else + movaps %xmm5, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + ALIGN_2 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + addl %eax, C # c += 2 * ldc + BRANCH + decl J # j -- + jg .L01 + ALIGN_2 + +.L100: + movl N, %eax + testl $1, %eax + jle .L999 + ALIGN_2 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + movl K, %eax + leal BUFFER, %ecx + sarl $3, %eax + jle .L103 + ALIGN_4 + +.L102: + prefetchnta 96 * SIZE(B) + + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + movss 4 * SIZE(B), %xmm4 + movss 5 * SIZE(B), %xmm5 + movss 6 * SIZE(B), %xmm6 + movss 7 * SIZE(B), %xmm7 + addl $ 8 * SIZE, B + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) + addl $32 * SIZE, %ecx + + decl %eax + BRANCH + jne .L102 + ALIGN_2 + +.L103: + movl K, %eax + andl $7, %eax + BRANCH + jle .L105 + ALIGN_2 + +.L104: + movss 0 * SIZE(B), %xmm0 + addl $1 * SIZE, B + + shufps $0, %xmm0, %xmm0 + + movaps %xmm0, 0 * SIZE(%ecx) + addl $4 * SIZE, %ecx + decl %eax + jne .L104 + ALIGN_4 + +.L105: + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + sarl $3, %ebx # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#if (L1_DATA_LINESIZE == 64) +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 16 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $8, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L112 + ALIGN_2 + +.L111: + mulps %xmm2, %xmm0 + mulps 4 * SIZE(AA), %xmm2 + addps %xmm0, %xmm4 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm0 + mulps 12 * SIZE(AA), %xmm2 + addps %xmm0, %xmm6 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm1 + mulps 20 * SIZE(AA), %xmm2 + addps %xmm1, %xmm4 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm1 + mulps 28 * SIZE(AA), %xmm2 + addps %xmm1, %xmm6 + movaps 48 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm3, %xmm0 + mulps 36 * SIZE(AA), %xmm3 + addps %xmm0, %xmm4 + movaps 40 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm0 + mulps 44 * SIZE(AA), %xmm3 + addps %xmm0, %xmm6 + movaps 64 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm1 + mulps 52 * SIZE(AA), %xmm3 + addps %xmm1, %xmm4 + movaps 56 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm1 + mulps 60 * SIZE(AA), %xmm3 + addps %xmm1, %xmm6 + movaps 80 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + addl $64 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L111 + ALIGN_2 + +#else + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 8 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $8, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L112 + ALIGN_2 + +.L111: + mulps %xmm2, %xmm0 + mulps 4 * SIZE(AA), %xmm2 + addps %xmm0, %xmm4 + movaps 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm1 + mulps 12 * SIZE(AA), %xmm2 + addps %xmm1, %xmm6 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 16 * SIZE(BB), %xmm2 + mulps %xmm3, %xmm0 + mulps 20 * SIZE(AA), %xmm3 + addps %xmm0, %xmm4 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 12 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm1 + mulps 28 * SIZE(AA), %xmm3 + addps %xmm1, %xmm6 + movaps 40 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm2, %xmm0 + mulps 36 * SIZE(AA), %xmm2 + addps %xmm0, %xmm4 + movaps 48 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 20 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm1 + mulps 44 * SIZE(AA), %xmm2 + addps %xmm1, %xmm6 + movaps 56 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm3, %xmm0 + mulps 52 * SIZE(AA), %xmm3 + addps %xmm0, %xmm4 + movaps 64 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm1 + mulps 60 * SIZE(AA), %xmm3 + addps %xmm1, %xmm6 + movaps 72 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 40 * SIZE(BB), %xmm3 + + addl $64 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L111 + ALIGN_2 +#endif + +.L112: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + +.L113: + movaps 0 * SIZE(BB), %xmm2 + movaps 0 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm0 + addps %xmm0, %xmm4 + mulps 4 * SIZE(AA), %xmm2 + addps %xmm2, %xmm5 + + addl $8 * SIZE, AA + addl $4 * SIZE, BB + subl $1, %eax + jg .L113 + ALIGN_4 + +.L114: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + MOVSD 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + MOVSD 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm4, %xmm2 +#else + movaps %xmm4, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + shufps $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + MOVSD 8 * SIZE(%esi), %xmm0 + movhps 10 * SIZE(%esi), %xmm0 + MOVSD 12 * SIZE(%esi), %xmm1 + movhps 14 * SIZE(%esi), %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm5, %xmm2 +#else + movaps %xmm5, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + shufps $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 8 * SIZE(%esi) + movhps %xmm0, 10 * SIZE(%esi) + movlps %xmm1, 12 * SIZE(%esi) + movhps %xmm1, 14 * SIZE(%esi) + + addl $16 * SIZE, %esi + BRANCH + decl %ebx # i -- + jg .L110 + ALIGN_2 + +.L130: + movl M, %ebx + andl $7, %ebx + jle .L999 + + testl $4, %ebx + jle .L150 + +#if (L1_DATA_LINESIZE == 64) +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 16 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L132 + ALIGN_2 + +.L131: + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + mulps 4 * SIZE(BB), %xmm0 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + mulps 8 * SIZE(BB), %xmm0 + addps %xmm0, %xmm6 + movaps 12 * SIZE(AA), %xmm0 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + mulps 20 * SIZE(BB), %xmm1 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + mulps 24 * SIZE(BB), %xmm1 + addps %xmm1, %xmm6 + movaps 28 * SIZE(AA), %xmm1 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L131 + ALIGN_2 + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 8 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L132 + ALIGN_2 + +.L131: + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + mulps 4 * SIZE(BB), %xmm0 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 16 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm3 + movaps 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + mulps 12 * SIZE(BB), %xmm1 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + mulps %xmm0, %xmm2 + movaps 20 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + mulps 20 * SIZE(BB), %xmm0 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm3 + movaps 28 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + mulps 28 * SIZE(BB), %xmm1 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L131 + ALIGN_2 +#endif + +.L132: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L134 + +.L133: + movaps 0 * SIZE(BB), %xmm2 + movaps 0 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L133 + ALIGN_4 + +.L134: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + + MOVSD 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + MOVSD 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm4, %xmm2 +#else + movaps %xmm4, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + shufps $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + addl $8 * SIZE, %esi + ALIGN_2 + +.L150: + testl $2, %ebx + jle .L170 + +#if (L1_DATA_LINESIZE == 64) +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + MOVSD 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + MOVSD 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + MOVSD 16 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + MOVSD 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB /* because it's doubled */ + + MOVSD 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + MOVSD 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + MOVSD 16 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + MOVSD 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L152 + ALIGN_2 + +.L151: + mulps %xmm0, %xmm2 + MOVSD 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + MOVSD 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + MOVSD 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + MOVSD 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + MOVSD 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + MOVSD 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + MOVSD 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + MOVSD 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + MOVSD 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + MOVSD 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + MOVSD 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + MOVSD 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + MOVSD 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + MOVSD 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + MOVSD 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + MOVSD 48 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L151 + ALIGN_2 + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + MOVSD 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + MOVSD 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + MOVSD 8 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + MOVSD 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB /* because it's doubled */ + + MOVSD 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + MOVSD 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + MOVSD 8 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + MOVSD 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L152 + ALIGN_2 + +.L151: + mulps %xmm0, %xmm2 + MOVSD 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + MOVSD 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + MOVSD 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + MOVSD 16 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + MOVSD 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm6 + MOVSD 12 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + MOVSD 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + MOVSD 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + MOVSD 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm4 + MOVSD 20 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + MOVSD 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + MOVSD 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + MOVSD 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + MOVSD 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + MOVSD 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + MOVSD 40 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L151 + ALIGN_2 +#endif + +.L152: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L154 + +.L153: + mulps %xmm0, %xmm2 + MOVSD 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + MOVSD 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L153 + ALIGN_4 + +.L154: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + + MOVSD 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm4, %xmm2 +#else + movaps %xmm4, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + + addl $4 * SIZE, %esi + ALIGN_2 + +.L170: + testl $1, %ebx + jle .L999 + +#if (L1_DATA_LINESIZE == 64) +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movss 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movss 16 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movss 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movss 16 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L172 + ALIGN_2 + +.L171: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + mulss 4 * SIZE(BB), %xmm0 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 2 * SIZE(AA), %xmm0 + mulss 8 * SIZE(BB), %xmm0 + addss %xmm0, %xmm6 + movss 3 * SIZE(AA), %xmm0 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + mulss 20 * SIZE(BB), %xmm1 + movss 48 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 6 * SIZE(AA), %xmm1 + mulss 24 * SIZE(BB), %xmm1 + addss %xmm1, %xmm6 + movss 7 * SIZE(AA), %xmm1 + mulss 28 * SIZE(BB), %xmm1 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L171 + ALIGN_2 + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movss 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movss 8 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movss 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movss 8 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L172 + ALIGN_2 + +.L171: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + mulss 4 * SIZE(BB), %xmm0 + movss 16 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + movss 3 * SIZE(AA), %xmm0 + addss %xmm3, %xmm6 + mulss 12 * SIZE(BB), %xmm0 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + movss 5 * SIZE(AA), %xmm1 + addss %xmm2, %xmm4 + mulss 20 * SIZE(BB), %xmm1 + movss 32 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + movss 7 * SIZE(AA), %xmm1 + addss %xmm3, %xmm6 + mulss 28 * SIZE(BB), %xmm1 + movss 40 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L171 + ALIGN_2 +#endif + +.L172: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L174 + +.L173: + movss 0 * SIZE(AA), %xmm0 + movss 0 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L173 + ALIGN_4 + +.L174: + addss %xmm5, %xmm4 + addss %xmm7, %xmm6 + addss %xmm6, %xmm4 + + MOVSD 0 * SIZE(%esi), %xmm0 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm4, %xmm2 +#else + movaps %xmm4, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_2 + + + EPILOGUE diff --git a/kernel/x86/zgemm_beta.S b/kernel/x86/zgemm_beta.S new file mode 100644 index 0000000..c36e7c5 --- /dev/null +++ b/kernel/x86/zgemm_beta.S @@ -0,0 +1,242 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define BETA_R 16 + STACK + ARGS(%esp) +#define BETA_I 24 + STACK + ARGS(%esp) +#define C 48 + STACK + ARGS(%esp) +#define LDC 52 + STACK + ARGS(%esp) +#else +#define BETA_R 16 + STACK + ARGS(%esp) +#define BETA_I 20 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define LDC 44 + STACK + ARGS(%esp) +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + + PROFCODE + + movl M, %ebp + movl N, %ecx + movl LDC, %edx + movl C, %edi + + FLD BETA_R + FLD BETA_I + + testl %ebp, %ebp # if n <= 0 goto End + jle .L83 + testl %ecx, %ecx # if m <= 0 goto End + jle .L83 + + fld %st(1) + fabs + fld %st(1) + fabs + faddp %st, %st(1) + + sall $ZBASE_SHIFT, %edx + + ftst + fnstsw %ax + andb $68, %ah +#ifndef C_SUN + ffreep %st(0) +#else + .byte 0xdf + .byte 0xc0 +#endif + je .L71 + ALIGN_2 + +.L53: + movl %edi, %esi # c_offset1 = c_offset + addl %edx, %edi # c_offset += ldc + + movl %ebp, %eax + sarl $2, %eax + jle .L56 + ALIGN_2 + +.L57: +#if defined(HAS_PREFETCH) && defined(PENTIUM3) + prefetchnta 16 * SIZE(%esi) + prefetchnta 24 * SIZE(%esi) +#endif + + FSTU 0 * SIZE(%esi) # c_offset1 + FSTU 1 * SIZE(%esi) + FSTU 2 * SIZE(%esi) + FSTU 3 * SIZE(%esi) + FSTU 4 * SIZE(%esi) + FSTU 5 * SIZE(%esi) + FSTU 6 * SIZE(%esi) + FSTU 7 * SIZE(%esi) + addl $8 * SIZE, %esi # c_offset1 += 8 + decl %eax # i-- + jg .L57 + ALIGN_2 + +.L56: + movl %ebp, %eax + andl $3, %eax + jle .L62 + ALIGN_2 + +.L63: + FSTU 0 * SIZE(%esi) + FSTU 1 * SIZE(%esi) + addl $2 * SIZE,%esi + decl %eax + jg .L63 + ALIGN_2 + +.L62: + decl %ecx # j -- + jg .L53 + jmp .L83 + ALIGN_3 + +.L71: + movl %edi, %esi + addl %edx, %edi # c_offset += ldc + + + movl %ebp, %eax + sarl $1, %eax + jle .L84 + ALIGN_3 + +.L85: +#if defined(HAS_PREFETCH) && defined(PENTIUM3) + prefetchnta 16 * SIZE(%esi) +#endif + fld %st(0) + FMUL 0 * SIZE(%esi) + fld %st(2) + FMUL 1 * SIZE(%esi) + faddp %st,%st(1) + fld %st(2) + FMUL 0 * SIZE(%esi) + fld %st(2) + FMUL 1 * SIZE(%esi) + fsubrp %st,%st(1) + + FST 0 * SIZE(%esi) + FST 1 * SIZE(%esi) + + fld %st(0) + FMUL 2 * SIZE(%esi) + fld %st(2) + FMUL 3 * SIZE(%esi) + faddp %st,%st(1) + fld %st(2) + FMUL 2 * SIZE(%esi) + fld %st(2) + FMUL 3 * SIZE(%esi) + fsubrp %st,%st(1) + + FST 2 * SIZE(%esi) + FST 3 * SIZE(%esi) + addl $4 * SIZE, %esi + + decl %eax + jg .L85 + ALIGN_3 + +.L84: + movl %ebp, %eax + andl $1, %eax + jle .L74 + ALIGN_3 + +.L75: +#if defined(HAS_PREFETCH) && defined(PENTIUM3) + prefetchnta 16 * SIZE(%esi) +#endif + + fld %st(0) + FMUL 0 * SIZE(%esi) + fld %st(2) + FMUL 1 * SIZE(%esi) + faddp %st,%st(1) + fld %st(2) + FMUL 0 * SIZE(%esi) + fld %st(2) + FMUL 1 * SIZE(%esi) + fsubrp %st,%st(1) + + FST 0 * SIZE(%esi) + FST 1 * SIZE(%esi) + ALIGN_2 + +.L74: + decl %ecx + jg .L71 + ALIGN_2 + +.L83: +#ifndef C_SUN + ffreep %st(0) + ffreep %st(0) +#else + .byte 0xdf + .byte 0xc0 + .byte 0xdf + .byte 0xc0 +#endif + + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_1x1.S b/kernel/x86/zgemm_kernel_1x1.S new file mode 100644 index 0000000..117b245 --- /dev/null +++ b/kernel/x86/zgemm_kernel_1x1.S @@ -0,0 +1,450 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define BX 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_N 8 + STACK + ARGS(%esp) +#define STACK_K 12 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define STACK_A 32 + STACK + ARGS(%esp) +#define STACK_B 36 + STACK + ARGS(%esp) +#define STACK_C 40 + STACK + ARGS(%esp) +#define STACK_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) +#else +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 20 + STACK + ARGS(%esp) +#define STACK_A 24 + STACK + ARGS(%esp) +#define STACK_B 28 + STACK + ARGS(%esp) +#define STACK_C 32 + STACK + ARGS(%esp) +#define STACK_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) +#endif + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#define M %esi +#define K %edi + +#define A %ebx +#define B %ecx +#define C %edx +#define LDC %ebp + +#if defined(TRMMKERNEL) && !defined(LEFT) + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + + movl STACK_K, K + movl STACK_LDC, LDC + sall $ZBASE_SHIFT, LDC + + cmpl $0, STACK_N + jle .L29 + cmpl $0, STACK_M + jle .L29 + ALIGN_4 + +.L30: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl %ebx, BX + + movl STACK_A, A + movl STACK_C, C + movl STACK_M, M + ALIGN_4 + +.L34: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl STACK_B, B +#else + movl STACK_B, B + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (A, %eax, 2), A + leal (B, %eax, 2), B +#endif + +#ifdef HAVE_SSE + movl BX, %eax + + prefetcht2 0 * SIZE(%eax) + prefetcht2 4 * SIZE(%eax) + +#if L2_SIZE > 262144 + + subl $-8 * SIZE, BX + +#elif L2_SIZE > 131072 + + prefetcht2 8 * SIZE(%eax) + prefetcht2 12 * SIZE(%eax) + + + subl $-16 * SIZE, BX +#else + prefetcht2 16 * SIZE(%eax) + prefetcht2 20 * SIZE(%eax) + prefetcht2 24 * SIZE(%eax) + prefetcht2 28 * SIZE(%eax) + + subl $-32 * SIZE, BX +#endif +#endif + + fldz + fldz + fldz + fldz + + FLD 4 * SIZE(B) # B5 + FLD 4 * SIZE(A) # A5 + FLD 0 * SIZE(B) # B0 + FLD 0 * SIZE(A) # A0 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + +#ifdef HAVE_SSE + prefetcht2 2 * SIZE(C) +#endif + sarl $2, %eax + je .L37 + ALIGN_4 + +#define PREFETCH_OFFSET 40 + +.L38: +#ifdef HAVE_SSE + prefetchnta (PREFETCH_OFFSET) * SIZE(B) +#ifdef CORE_KATMAI + prefetcht0 (PREFETCH_OFFSET) * SIZE(A) +#endif +#endif + fmul %st, %st(1) + FMUL 1 * SIZE(B) + fxch %st(1) + faddp %st, %st(5) + FLD 0 * SIZE(B) + fxch %st(1) +#if defined(NN) || defined(CN) + faddp %st, %st(4) +#else + fsubrp %st, %st(4) +#endif + FLD 1 * SIZE(A) + fmul %st, %st(1) + FMUL 1 * SIZE(B) + fxch %st(1) +#if defined(NN) || defined(NC) + faddp %st, %st(7) +#else + fsubrp %st, %st(7) +#endif + FLD 2 * SIZE(B) + fxch %st(1) +#if defined(NN) || defined(CC) + fsubrp %st, %st(6) +#else + faddp %st, %st(6) +#endif + FLD 2 * SIZE(A) + + fmul %st, %st(1) + FMUL 3 * SIZE(B) + fxch %st(1) + faddp %st, %st(5) + FLD 2 * SIZE(B) + fxch %st(1) +#if defined(NN) || defined(CN) + faddp %st, %st(4) +#else + fsubrp %st, %st(4) +#endif + FLD 3 * SIZE(A) + fmul %st, %st(1) + FMUL 3 * SIZE(B) + fxch %st(1) +#if defined(NN) || defined(NC) + faddp %st, %st(7) +#else + fsubrp %st, %st(7) +#endif + FLD 8 * SIZE(B) + fxch %st(1) +#if defined(NN) || defined(CC) + fsubrp %st, %st(6) +#else + faddp %st, %st(6) +#endif + FLD 8 * SIZE(A) + fxch %st(2) + +#ifdef HAVE_SSE +#ifdef DOUBLE + prefetchnta (PREFETCH_OFFSET + 4) * SIZE(B) +#ifdef CORE_KATMAI + prefetcht0 (PREFETCH_OFFSET + 4) * SIZE(A) +#endif +#endif +#endif + + fmul %st, %st(3) + FMUL 5 * SIZE(B) + fxch %st(3) + faddp %st, %st(5) + FLD 4 * SIZE(B) + fxch %st(3) +#if defined(NN) || defined(CN) + faddp %st, %st(4) +#else + fsubrp %st, %st(4) +#endif + FLD 5 * SIZE(A) + fmul %st, %st(3) + FMUL 5 * SIZE(B) + fxch %st(3) +#if defined(NN) || defined(NC) + faddp %st, %st(7) +#else + fsubrp %st, %st(7) +#endif + FLD 6 * SIZE(B) + fxch %st(3) +#if defined(NN) || defined(CC) + fsubrp %st, %st(6) +#else + faddp %st, %st(6) +#endif + FLD 6 * SIZE(A) + + fmul %st, %st(3) + FMUL 7 * SIZE(B) + fxch %st(3) + faddp %st, %st(5) + FLD 6 * SIZE(B) + fxch %st(3) +#if defined(NN) || defined(CN) + faddp %st, %st(4) +#else + fsubrp %st, %st(4) +#endif + FLD 7 * SIZE(A) + fmul %st, %st(3) + FMUL 7 * SIZE(B) + fxch %st(3) +#if defined(NN) || defined(NC) + faddp %st, %st(7) +#else + fsubrp %st, %st(7) +#endif + FLD 12 * SIZE(B) + fxch %st(3) +#if defined(NN) || defined(CC) + fsubrp %st, %st(6) +#else + faddp %st, %st(6) +#endif + FLD 12 * SIZE(A) + fxch %st(2) + + subl $-8 * SIZE, B + subl $-8 * SIZE, A + decl %eax + jg .L38 + ALIGN_4 + +.L37: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $3, %eax + jle .L43 + ALIGN_2 + +.L54: + fmul %st, %st(1) + FMUL 1 * SIZE(B) + fxch %st(1) + faddp %st, %st(5) + + FLD 0 * SIZE(B) + fxch %st(1) +#if defined(NN) || defined(CN) + faddp %st, %st(4) +#else + fsubrp %st, %st(4) +#endif + + FLD 1 * SIZE(A) + fmul %st, %st(1) + FMUL 1 * SIZE(B) + fxch %st(1) +#if defined(NN) || defined(NC) + faddp %st, %st(7) +#else + fsubrp %st, %st(7) +#endif + FLD 2 * SIZE(B) + fxch %st(1) +#if defined(NN) || defined(CC) + fsubrp %st, %st(6) +#else + faddp %st, %st(6) +#endif + FLD 2 * SIZE(A) + + addl $2 * SIZE, A + addl $2 * SIZE, B + decl %eax + jg .L54 + ALIGN_3 + +.L43: + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + + FLD ALPHA_R + fxch %st(3) + FLD ALPHA_I + fxch %st(5) + + faddp %st, %st(2) # ctemp3 += ctemp4 + faddp %st, %st(2) # ctemp1 += ctemp2 + + fld %st(0) # copy ctemp2 + fmul %st(4), %st # ctemp3 *= alpha_i + fld %st(2) # copy ctemp1 + fmul %st(4), %st # ctemp1 *= alpha_r + fsubp %st, %st(1) # ctemp2 -= ctemp4 + +#ifndef TRMMKERNEL + FADD 0 * SIZE(C) +#endif + FST 0 * SIZE(C) + + fmulp %st, %st(2) # ctemp3 *= alpha_i + fmulp %st, %st(2) # ctemp1 *= alpha_r + faddp %st, %st(1) # ctemp1 += ctemp3 + +#ifndef TRMMKERNEL + FADD 1 * SIZE(C) +#endif + FST 1 * SIZE(C) + addl $2 * SIZE, C + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (A, %eax, 2), A + leal (B, %eax, 2), B +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + decl M + jg .L34 + ALIGN_2 + +.L33: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $1, KK +#endif + + movl B, STACK_B + addl LDC, STACK_C + decl STACK_N + jg .L30 + ALIGN_2 + +.L29: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_1x1_atom.S b/kernel/x86/zgemm_kernel_1x1_atom.S new file mode 100644 index 0000000..5d276b9 --- /dev/null +++ b/kernel/x86/zgemm_kernel_1x1_atom.S @@ -0,0 +1,351 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 84 + +#define AA %edx +#define BB %ecx +#define CO1 %esi +#define LDC %ebp +#define B %edi + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADDSD1 addsd +#define ADDSD2 addsd +#define ADDSD3 addsd +#define ADDSD4 subsd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADDSD1 addsd +#define ADDSD2 subsd +#define ADDSD3 addsd +#define ADDSD4 addsd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADDSD1 addsd +#define ADDSD2 addsd +#define ADDSD3 subsd +#define ADDSD4 addsd +#else +#define ADDSD1 addsd +#define ADDSD2 subsd +#define ADDSD3 subsd +#define ADDSD4 subsd +#endif + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax +#ifndef LEFT + negl %eax +#endif + movl %eax, KK +#endif + + sall $ZBASE_SHIFT, LDC + + movl M, %ebx + testl %ebx, %ebx + jle .L999 + + movl N, %eax + testl %eax, %eax + movl %eax, J + jle .L999 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl B, BX + + movl C, CO1 + addl LDC, C + + movl A, AA + + movl M, %ebx + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + + movl BX, %eax + prefetcht0 0 * SIZE(%eax) + subl $-8 * SIZE, BX + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + xorps %xmm4, %xmm4 + prefetcht0 1 * SIZE(CO1) + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + ADDSD3 %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 0 * SIZE(BB), %xmm0 + ADDSD4 %xmm3, %xmm7 + mulsd 1 * SIZE(BB), %xmm1 + + ADDSD1 %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 0 * SIZE(BB), %xmm2 + ADDSD2 %xmm1, %xmm5 + mulsd 1 * SIZE(BB), %xmm3 + + ADDSD3 %xmm2, %xmm6 + movsd 3 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 2 * SIZE(BB), %xmm0 + ADDSD4 %xmm3, %xmm7 + mulsd 3 * SIZE(BB), %xmm1 + + ADDSD1 %xmm0, %xmm4 + movsd 4 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 2 * SIZE(BB), %xmm2 + ADDSD2 %xmm1, %xmm5 + mulsd 3 * SIZE(BB), %xmm3 + + ADDSD3 %xmm2, %xmm6 + movsd 5 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 4 * SIZE(BB), %xmm0 + ADDSD4 %xmm3, %xmm7 + mulsd 5 * SIZE(BB), %xmm1 + + ADDSD1 %xmm0, %xmm4 + movsd 6 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 4 * SIZE(BB), %xmm2 + ADDSD2 %xmm1, %xmm5 + mulsd 5 * SIZE(BB), %xmm3 + + ADDSD3 %xmm2, %xmm6 + movsd 7 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 6 * SIZE(BB), %xmm0 + ADDSD4 %xmm3, %xmm7 + mulsd 7 * SIZE(BB), %xmm1 + + ADDSD1 %xmm0, %xmm4 + movsd 8 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 6 * SIZE(BB), %xmm2 + ADDSD2 %xmm1, %xmm5 + mulsd 7 * SIZE(BB), %xmm3 + + addl $8 * SIZE, BB + addl $8 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + ADDSD3 %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 0 * SIZE(BB), %xmm0 + ADDSD4 %xmm3, %xmm7 + mulsd 1 * SIZE(BB), %xmm1 + + ADDSD1 %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 0 * SIZE(BB), %xmm2 + ADDSD2 %xmm1, %xmm5 + mulsd 1 * SIZE(BB), %xmm3 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + movsd ALPHA_R, %xmm0 + movsd ALPHA_I, %xmm1 + + ADDSD3 %xmm2, %xmm6 + ADDSD4 %xmm3, %xmm7 + + addsd %xmm7, %xmm4 + addsd %xmm5, %xmm6 + + movaps %xmm4, %xmm5 + movaps %xmm6, %xmm7 + + mulsd %xmm0, %xmm4 + mulsd %xmm1, %xmm5 + mulsd %xmm1, %xmm6 + mulsd %xmm0, %xmm7 + + subsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + +#ifndef TRMMKERNEL + addsd 0 * SIZE(CO1), %xmm4 + addsd 1 * SIZE(CO1), %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, CO1 + decl %ebx + jg .L10 + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $1, KK +#endif + + movl BB, B + decl J + jg .L01 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_1x2.S b/kernel/x86/zgemm_kernel_1x2.S new file mode 100644 index 0000000..0f98069 --- /dev/null +++ b/kernel/x86/zgemm_kernel_1x2.S @@ -0,0 +1,813 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define I 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define STACK_A 32 + STACK + ARGS(%esp) +#define STACK_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define STACK_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) +#else +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 20 + STACK + ARGS(%esp) +#define STACK_A 24 + STACK + ARGS(%esp) +#define STACK_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define STACK_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) +#endif + +#define A %edx +#define B %ecx +#define BB %ebx +#define LDC %ebp +#define BX %esi + +#define ADD1 faddp + +#if defined(NN) || defined(CN) +#define ADD2 faddp +#else +#define ADD2 fsubrp +#endif + +#if defined(NN) || defined(CC) +#define ADD3 fsubrp +#else +#define ADD3 faddp +#endif + +#if defined(NN) || defined(NC) +#define ADD4 faddp +#else +#define ADD4 fsubrp +#endif + +#define PREFETCHSIZE (8 * 5 + 4) + +#define AOFFSET 1 +#define BOFFSET 1 + +#ifdef HAVE_3DNOW +#define PREFETCH prefetch +#else +#define PREFETCH prefetcht0 +#endif + +#define KERNEL \ + PREFETCH PREFETCHSIZE * SIZE + AOFFSET(A, %eax, 2);\ + fmul %st(1), %st;\ + ADD1 %st, %st(4);\ + FLD -15 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD2 %st, %st(5);\ + FLD -14 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(6);\ + FMUL -13 * SIZE + BOFFSET(B, %eax, 4);\ + ADD2 %st, %st(6);\ + FLD -15 * SIZE + AOFFSET(A, %eax, 2);\ + FLD -15 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(4);\ + FLD -16 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD4 %st, %st(5);\ + FLD -13 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(6);\ + FMUL -14 * SIZE + BOFFSET(B, %eax, 4);\ + ADD4 %st, %st(6);\ + FLD -14 * SIZE + AOFFSET(A, %eax, 2);\ + FLD -12 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(4);\ + FLD -11 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD2 %st, %st(5);\ + FLD -10 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(6);\ + FMUL -9 * SIZE + BOFFSET(B, %eax, 4);\ + ADD2 %st, %st(6);\ + FLD -13 * SIZE + AOFFSET(A, %eax, 2);\ + FLD -11 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(4);\ + FLD -12 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD4 %st, %st(5);\ + FLD -9 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(6);\ + FMUL -10 * SIZE + BOFFSET(B, %eax, 4);\ + ADD4 %st, %st(6);\ + FLD -12 * SIZE + AOFFSET(A, %eax, 2);\ + FLD -8 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(4);\ + FLD -7 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD2 %st, %st(5);\ + FLD -6 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(6);\ + FMUL -5 * SIZE + BOFFSET(B, %eax, 4);\ + ADD2 %st, %st(6);\ + FLD -11 * SIZE + AOFFSET(A, %eax, 2);\ + FLD -7 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(4);\ + FLD -8 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD4 %st, %st(5);\ + FLD -5 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(6);\ + FMUL -6 * SIZE + BOFFSET(B, %eax, 4);\ + ADD4 %st, %st(6);\ + FLD -10 * SIZE + AOFFSET(A, %eax, 2);\ + FLD -4 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(4);\ + FLD -3 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD2 %st, %st(5);\ + FLD -2 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(6);\ + FMUL -1 * SIZE + BOFFSET(B, %eax, 4);\ + ADD2 %st, %st(6);\ + FLD -9 * SIZE + AOFFSET(A, %eax, 2);\ + FLD -3 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(4);\ + FLD -4 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD4 %st, %st(5);\ + FLD -1 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(6);\ + FMUL -2 * SIZE + BOFFSET(B, %eax, 4);\ + ADD4 %st, %st(6);\ + FLD 8 * SIZE + AOFFSET(A, %eax, 2);\ + fxch %st(1);\ + FLD 0 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(4);\ + FLD 1 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + AOFFSET(A, %eax, 2);\ + ADD2 %st, %st(5);\ + FLD 2 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(6);\ + FMUL 3 * SIZE + BOFFSET(B, %eax, 4);\ + ADD2 %st, %st(6);\ + FLD -7 * SIZE + AOFFSET(A, %eax, 2);\ + FLD 1 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(4);\ + FLD 0 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD4 %st, %st(5);\ + FLD 3 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(6);\ + FMUL 2 * SIZE + BOFFSET(B, %eax, 4);\ + ADD4 %st, %st(6);\ + FLD -6 * SIZE + AOFFSET(A, %eax, 2);\ + FLD 4 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(4);\ + FLD 5 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD2 %st, %st(5);\ + FLD 6 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(6);\ + FMUL 7 * SIZE + BOFFSET(B, %eax, 4);\ + ADD2 %st, %st(6);\ + FLD -5 * SIZE + AOFFSET(A, %eax, 2);\ + FLD 5 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(4);\ + FLD 4 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD4 %st, %st(5);\ + FLD 7 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(6);\ + FMUL 6 * SIZE + BOFFSET(B, %eax, 4);\ + ADD4 %st, %st(6);\ + FLD -4 * SIZE + AOFFSET(A, %eax, 2);\ + FLD 8 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(4);\ + FLD 9 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD2 %st, %st(5);\ + FLD 10 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(6);\ + FMUL 11 * SIZE + BOFFSET(B, %eax, 4);\ + ADD2 %st, %st(6);\ + FLD -3 * SIZE + AOFFSET(A, %eax, 2);\ + FLD 9 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(4);\ + FLD 8 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD4 %st, %st(5);\ + FLD 11 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(6);\ + FMUL 10 * SIZE + BOFFSET(B, %eax, 4);\ + ADD4 %st, %st(6);\ + FLD -2 * SIZE + AOFFSET(A, %eax, 2);\ + FLD 12 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(4);\ + FLD 13 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD2 %st, %st(5);\ + FLD 14 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(6);\ + FMUL 15 * SIZE + BOFFSET(B, %eax, 4);\ + ADD2 %st, %st(6);\ + FLD -1 * SIZE + AOFFSET(A, %eax, 2);\ + FLD 13 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(4);\ + FLD 12 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD4 %st, %st(5);\ + FLD 15 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(6);\ + FMUL 14 * SIZE + BOFFSET(B, %eax, 4);\ + ADD4 %st, %st(6);\ + FLD 16 * SIZE + AOFFSET(A, %eax, 2);\ + fxch %st(2);\ + FLD 0 * SIZE + BOFFSET(BB, %eax, 4);\ + subl $-8 * SIZE, %eax + +/* + + A hint of scheduling is received from following URL + + http://www.netlib.org/atlas/atlas-comm/msg00260.html + +*/ + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(TRMMKERNEL) && !defined(LEFT) + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + + movl STACK_LDC, LDC + sall $ZBASE_SHIFT, LDC + + subl $(AOFFSET - 16 * SIZE), STACK_A + subl $(BOFFSET - 16 * SIZE), STACK_B + + movl M, %eax + testl %eax, %eax + jle .L999 + + movl N, %eax + testl %eax, %eax + jle .L999 + + movl K, %eax + testl %eax, %eax + jle .L999 + + movl N, %eax + sarl $1, %eax + movl %eax, J + je .L20 + ALIGN_3 + +.L11: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl STACK_A, A + movl STACK_B, B + movl C, %edi + + movl K, BX + sall $ZBASE_SHIFT + 1, BX + addl B, BX + + movl M, %eax + movl %eax, I + ALIGN_3 + +.L14: + prefetchnta -16 * SIZE + BOFFSET(BX) + prefetchnta -8 * SIZE + BOFFSET(BX) + subl $-16 * SIZE, BX + + movl STACK_B, B + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (A, %eax, 2), A + leal (B, %eax, 4), B +#endif + + fldz + fldz + fldz + fldz + + FLD 0 * SIZE + AOFFSET(A) + FLD -8 * SIZE + AOFFSET(A) + FLD -16 * SIZE + AOFFSET(A) + FLD -16 * SIZE + BOFFSET(B) + +#ifdef HAVE_3DNOW + prefetchw 1 * SIZE(%edi) + prefetchw 2 * SIZE(%edi, LDC) +#elif defined(HAVE_SSE) + prefetcht0 1 * SIZE(%edi) + prefetcht0 2 * SIZE(%edi, LDC) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + + andl $-8, %eax + + leal (, %eax, SIZE), %eax + leal (A, %eax, 2), A + leal 16 * SIZE(B, %eax, 4), BB + leal (B, %eax, 4), B + negl %eax + NOBRANCH + je .L16 + ALIGN_4 + +.L15: + KERNEL + jge .L16 + KERNEL + jge .L16 + KERNEL + jge .L16 + KERNEL + jl .L15 + ALIGN_4 + +.L16: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $7, %eax + je .L19 + ALIGN_4 + + +.L17: + fmul %st(1), %st + ADD1 %st, %st(4) + FLD -15 * SIZE + BOFFSET(B) + + fmul %st(1), %st + ADD2 %st, %st(5) + FLD -14 * SIZE + BOFFSET(B) + + fmul %st(1), %st + ADD1 %st, %st(6) + FMUL -13 * SIZE + BOFFSET(B) + + ADD2 %st, %st(6) + FLD -15 * SIZE + AOFFSET(A) + FLD -15 * SIZE + BOFFSET(B) + + fmul %st(1), %st + ADD3 %st, %st(4) + FLD -16 * SIZE + BOFFSET(B) + + fmul %st(1), %st + ADD4 %st, %st(5) + FLD -13 * SIZE + BOFFSET(B) + + fmul %st(1), %st + ADD3 %st, %st(6) + FMUL -14 * SIZE + BOFFSET(B) + + ADD4 %st, %st(6) + FLD -14 * SIZE + AOFFSET(A) + FLD -12 * SIZE + BOFFSET(B) + + addl $2 * SIZE,A + addl $4 * SIZE,B + + decl %eax + jne .L17 + ALIGN_4 + +.L19: + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + + FLD ALPHA_R + fmul %st(1), %st + FLD ALPHA_I + fmul %st(3), %st + fsubrp %st, %st(1) + fxch %st(2) + FMUL ALPHA_R + fxch %st(1) + FMUL ALPHA_I + faddp %st, %st(1) + +#ifndef TRMMKERNEL + FADD 1 * SIZE(%edi) + FST 1 * SIZE(%edi) + FADD 0 * SIZE(%edi) + FST 0 * SIZE(%edi) +#else + FST 1 * SIZE(%edi) + FST 0 * SIZE(%edi) +#endif + + FLD ALPHA_R + fmul %st(1), %st + FLD ALPHA_I + fmul %st(3), %st + fsubrp %st, %st(1) + fxch %st(2) + FMUL ALPHA_R + fxch %st(1) + FMUL ALPHA_I + faddp %st, %st(1) + +#ifndef TRMMKERNEL + FADD 1 * SIZE(%edi,LDC) + FST 1 * SIZE(%edi,LDC) + FADD 0 * SIZE(%edi,LDC) + FST 0 * SIZE(%edi,LDC) +#else + FST 1 * SIZE(%edi,LDC) + FST 0 * SIZE(%edi,LDC) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (A, %eax, 2), A + leal (B, %eax, 4), B +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, %edi + decl I + jne .L14 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + addl %eax, C + movl B, STACK_B + decl J + jne .L11 + ALIGN_4 + +.L20: + movl N, %eax + andl $1, %eax + je .L999 + ALIGN_3 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl STACK_A, A + movl STACK_B, B + movl C, %edi + + movl M, %eax + movl %eax, I + ALIGN_3 + +.L24: + movl STACK_B, B +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (A, %eax, 2), A + leal (B, %eax, 2), B +#endif + + fldz + fldz + fldz + fldz + + FLD -16 * SIZE + AOFFSET(A) + FLD -16 * SIZE + BOFFSET(B) + + prefetchw 1 * SIZE(%edi) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L26 + ALIGN_3 + +.L25: + fmul %st(1), %st + PADDING + ADD1 %st, %st(2) + FMUL -15 * SIZE + BOFFSET(B) + + ADD2 %st, %st(2) + FLD -15 * SIZE + AOFFSET(A) + FLD -16 * SIZE + BOFFSET(B) + + fmul %st(1), %st + PADDING + ADD4 %st, %st(4) + FMUL -15 * SIZE + BOFFSET(B) + + ADD3 %st, %st(4) + FLD -14 * SIZE + AOFFSET(A) + FLD -14 * SIZE + BOFFSET(B) + + fmul %st(1), %st + PADDING + ADD1 %st, %st(2) + FMUL -13 * SIZE + BOFFSET(B) + + ADD2 %st, %st(2) + FLD -13 * SIZE + AOFFSET(A) + FLD -14 * SIZE + BOFFSET(B) + + fmul %st(1), %st + PADDING + ADD4 %st, %st(4) + FMUL -13 * SIZE + BOFFSET(B) + + ADD3 %st, %st(4) + FLD -12 * SIZE + AOFFSET(A) + FLD -12 * SIZE + BOFFSET(B) + + fmul %st(1), %st + PADDING + ADD1 %st, %st(2) + FMUL -11 * SIZE + BOFFSET(B) + + ADD2 %st, %st(2) + FLD -11 * SIZE + AOFFSET(A) + FLD -12 * SIZE + BOFFSET(B) + + fmul %st(1), %st + PADDING + ADD4 %st, %st(4) + FMUL -11 * SIZE + BOFFSET(B) + + ADD3 %st, %st(4) + FLD -10 * SIZE + AOFFSET(A) + FLD -10 * SIZE + BOFFSET(B) + + fmul %st(1), %st + PADDING + ADD1 %st, %st(2) + FMUL -9 * SIZE + BOFFSET(B) + + ADD2 %st, %st(2) + FLD -9 * SIZE + AOFFSET(A) + FLD -10 * SIZE + BOFFSET(B) + + fmul %st(1), %st + PADDING + ADD4 %st, %st(4) + FMUL -9 * SIZE + BOFFSET(B) + + ADD3 %st, %st(4) + FLD -8 * SIZE + AOFFSET(A) + FLD -8 * SIZE + BOFFSET(B) + + addl $8 * SIZE,A + addl $8 * SIZE,B + + decl %eax + jne .L25 + ALIGN_4 + +.L26: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $3, %eax + je .L29 + ALIGN_4 + +.L27: + fmul %st(1), %st + PADDING + ADD1 %st, %st(2) + FMUL -15 * SIZE + BOFFSET(B) + + ADD2 %st, %st(2) + FLD -15 * SIZE + AOFFSET(A) + FLD -16 * SIZE + BOFFSET(B) + + fmul %st(1), %st + PADDING + ADD4 %st, %st(4) + FMUL -15 * SIZE + BOFFSET(B) + + ADD3 %st, %st(4) + FLD -14 * SIZE + AOFFSET(A) + FLD -14 * SIZE + BOFFSET(B) + + addl $2 * SIZE,A + addl $2 * SIZE,B + + decl %eax + jne .L27 + ALIGN_4 + +.L29: + ffreep %st(0) + ffreep %st(0) + + faddp %st, %st(3) + faddp %st, %st(1) + + fxch %st(1) + + FLD ALPHA_R + fmul %st(1), %st + FLD ALPHA_I + fmul %st(3), %st + fsubrp %st, %st(1) + fxch %st(2) + FMUL ALPHA_R + fxch %st(1) + FMUL ALPHA_I + faddp %st, %st(1) + +#ifndef TRMMKERNEL + FADD 1 * SIZE(%edi) + FST 1 * SIZE(%edi) + FADD 0 * SIZE(%edi) + FST 0 * SIZE(%edi) +#else + FST 1 * SIZE(%edi) + FST 0 * SIZE(%edi) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (A, %eax, 2), A + leal (B, %eax, 2), B +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, %edi + decl I + jne .L24 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $1, KK +#endif + + addl LDC, C + movl B, STACK_B + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_1x2_3dnow.S b/kernel/x86/zgemm_kernel_1x2_3dnow.S new file mode 100644 index 0000000..3699bb2 --- /dev/null +++ b/kernel/x86/zgemm_kernel_1x2_3dnow.S @@ -0,0 +1,958 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define OLD_M 4 + STACK + ARGS(%esi) +#define OLD_N 8 + STACK + ARGS(%esi) +#define OLD_K 12 + STACK + ARGS(%esi) +#define OLD_ALPHA_R 16 + STACK + ARGS(%esi) +#define OLD_ALPHA_I 20 + STACK + ARGS(%esi) +#define OLD_A 24 + STACK + ARGS(%esi) +#define OLD_B 28 + STACK + ARGS(%esi) +#define OLD_C 32 + STACK + ARGS(%esi) +#define OLD_LDC 36 + STACK + ARGS(%esi) +#define OLD_OFFSET 40 + STACK + ARGS(%esi) + +#define GAMMA_R 0(%esp) +#define GAMMA_I 8(%esp) +#define ALPHA 16(%esp) +#define K 24(%esp) +#define N 28(%esp) +#define M 32(%esp) +#define A 36(%esp) +#define C 40(%esp) +#define J 44(%esp) +#define OLD_STACK 48(%esp) +#define OFFSET 52(%esp) +#define KK 56(%esp) +#define KKK 60(%esp) +#define BUFFER 128(%esp) + +#define AA %edx +#define BB %ecx + +#define PREFETCHSIZE (16 * 2 + 6) + +#define AOFFSET -32 +#define BOFFSET 128 + +/* + + A hint of scheduling is received from following URL + +https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=flat&viewmonth=200309&viewday=11 + +*/ + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + subl $128 + LOCAL_BUFFER_SIZE, %esp + movl OLD_M, %ebx + andl $-1024, %esp # align stack + + STACK_TOUCHING + + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + subl $AOFFSET * SIZE, %edx + movl %edx, A + movl %esi, OLD_STACK + + testl %ebx, %ebx + jle .L999 + + movl OLD_B, %edi + movl OLD_C, %ebx + + EMMS + + movd OLD_ALPHA_R, %mm0 + movd OLD_ALPHA_I, %mm1 + + movd %mm0, 0 + ALPHA + movd %mm1, 4 + ALPHA + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + movl $0x3f800000, 0 + GAMMA_R + movl $0x3f800000, 4 + GAMMA_R + movl $0xbf800000, 0 + GAMMA_I + movl $0x3f800000, 4 + GAMMA_I +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + movl $0x3f800000, 0 + GAMMA_R + movl $0x3f800000, 4 + GAMMA_R + movl $0x3f800000, 0 + GAMMA_I + movl $0xbf800000, 4 + GAMMA_I +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + movl $0x3f800000, 0 + GAMMA_R + movl $0xbF800000, 4 + GAMMA_R + movl $0x3f800000, 0 + GAMMA_I + movl $0x3F800000, 4 + GAMMA_I +#else + movl $0x3f800000, 0 + GAMMA_R + movl $0xbf800000, 4 + GAMMA_R + movl $0xbf800000, 0 + GAMMA_I + movl $0xbf800000, 4 + GAMMA_I +#endif + movl %ebx, C + movl OLD_LDC, %ebp + leal (, %ebp, SIZE * 2), %ebp + +#ifdef TRMMKERNEL + movl OLD_OFFSET, %eax + movl %eax, OFFSET +#ifndef LEFT + negl %eax + movl %eax, KK +#endif +#endif + + movl N, %eax + sarl $1, %eax + movl %eax, J # j = n + jle .L20 + ALIGN_4 + +.L01: +/* Copying to Sub Buffer */ + leal BUFFER, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sarl $2, %eax + jle .L03 + ALIGN_4 + +.L02: + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + movd 4 * SIZE(%edi), %mm4 + movd 5 * SIZE(%edi), %mm5 + movd 6 * SIZE(%edi), %mm6 + movd 7 * SIZE(%edi), %mm7 + + prefetchnta 72 * SIZE(%edi) + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + punpckldq %mm4, %mm4 + punpckldq %mm5, %mm5 + punpckldq %mm6, %mm6 + punpckldq %mm7, %mm7 + + movq %mm0, 0 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + movq %mm4, 8 * SIZE(BB) + movq %mm5, 10 * SIZE(BB) + movq %mm6, 12 * SIZE(BB) + movq %mm7, 14 * SIZE(BB) + + movd 8 * SIZE(%edi), %mm0 + movd 9 * SIZE(%edi), %mm1 + movd 10 * SIZE(%edi), %mm2 + movd 11 * SIZE(%edi), %mm3 + movd 12 * SIZE(%edi), %mm4 + movd 13 * SIZE(%edi), %mm5 + movd 14 * SIZE(%edi), %mm6 + movd 15 * SIZE(%edi), %mm7 + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + punpckldq %mm4, %mm4 + punpckldq %mm5, %mm5 + punpckldq %mm6, %mm6 + punpckldq %mm7, %mm7 + + movq %mm0, 16 * SIZE(BB) + movq %mm1, 18 * SIZE(BB) + movq %mm2, 20 * SIZE(BB) + movq %mm3, 22 * SIZE(BB) + movq %mm4, 24 * SIZE(BB) + movq %mm5, 26 * SIZE(BB) + movq %mm6, 28 * SIZE(BB) + movq %mm7, 30 * SIZE(BB) + + addl $16 * SIZE, %edi + addl $32 * SIZE, BB + decl %eax + jne .L02 + ALIGN_4 + +.L03: + movl K, %eax + andl $3, %eax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + + movq %mm0, 0 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + + addl $4 * SIZE, %edi + addl $8 * SIZE, BB + decl %eax + jne .L04 + ALIGN_4 + +.L10: + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + ALIGN_4 + +.L11: + leal - BOFFSET * SIZE + BUFFER, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + + movq ( 0 + AOFFSET) * SIZE(AA), %mm0 + pxor %mm4, %mm4 + movq ( 16 + AOFFSET) * SIZE(AA), %mm1 + pxor %mm5, %mm5 + PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2 + pxor %mm6, %mm6 + PADDING movq ( 16 + BOFFSET) * SIZE(BB), %mm3 + pxor %mm7, %mm7 + + prefetchw 2 * SIZE(%esi) + prefetchw 2 * SIZE(%esi, %ebp) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + je .L15 + ALIGN_4 + +.L12: + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 2 + BOFFSET) * SIZE(BB), %mm2 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA) + + PADDING movq ( 8 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 2 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 10 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movq ( 12 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 32 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 4 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 18 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm5 + PADDING movq ( 20 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 24 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 22 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 6 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 26 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm5 + PADDING movq ( 28 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 48 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 30 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 8 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 34 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movq ( 36 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 40 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 38 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 10 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 42 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movq ( 44 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 64 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 46 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 12 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 50 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm5 + PADDING movq ( 52 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 56 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 54 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 14 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 58 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm5 + PADDING movq ( 60 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 80 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 62 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 32 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 66 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm5 + PADDING movq ( 68 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 72 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 70 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 18 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 74 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm5 + PADDING movq ( 76 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 96 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 78 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 20 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 82 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm5 + PADDING movq ( 84 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 88 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 86 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 22 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 90 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm5 + PADDING movq ( 92 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movq (112 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 94 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 24 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 98 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm5 + PADDING movq (100 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movq (104 + BOFFSET) * SIZE(BB), %mm2 + pfmul (102 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 26 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movq (106 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm5 + PADDING movq (108 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movq (128 + BOFFSET) * SIZE(BB), %mm2 + pfmul (110 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 28 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movq (114 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm5 + PADDING movq (116 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movq (120 + BOFFSET) * SIZE(BB), %mm3 + pfmul (118 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 30 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movq (122 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm5 + PADDING movq (124 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movq (144 + BOFFSET) * SIZE(BB), %mm3 + pfmul (126 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 48 + AOFFSET) * SIZE(AA), %mm1 + + subl $-32 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L12 + ALIGN_3 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $15, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 2 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 8 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 2 + AOFFSET) * SIZE(AA), %mm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + movq GAMMA_R, %mm0 + movq GAMMA_I, %mm1 + movq ALPHA, %mm2 + + pswapd %mm5, %mm5 + pswapd %mm7, %mm7 + + pfmul %mm0, %mm4 + pfmul %mm1, %mm5 + pfmul %mm0, %mm6 + pfmul %mm1, %mm7 + + pfadd %mm5, %mm4 + pfadd %mm7, %mm6 + + pswapd %mm4, %mm5 + pswapd %mm6, %mm7 + pfmul %mm2, %mm4 + pfmul %mm2, %mm6 + pfmul %mm2, %mm5 + pfmul %mm2, %mm7 + + pfpnacc %mm5, %mm4 + pfpnacc %mm7, %mm6 + +#ifndef TRMMKERNEL + pfadd (%esi), %mm4 + pfadd (%esi, %ebp), %mm6 +#endif + movq %mm4, (%esi) + movq %mm6, (%esi, %ebp) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, %esi + decl %ebx + jg .L11 + ALIGN_4 + +.L19: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, %ebp, 2), %eax + addl %eax, C # c += ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L20: + movl N, %eax + andl $1, %eax + jle .L999 + ALIGN_4 + +.L21: +/* Copying to Sub Buffer */ + movl K, %eax + leal BUFFER, BB + sarl $2, %eax + jle .L25 + ALIGN_4 + +.L22: + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + movd 4 * SIZE(%edi), %mm4 + movd 5 * SIZE(%edi), %mm5 + movd 6 * SIZE(%edi), %mm6 + movd 7 * SIZE(%edi), %mm7 + + prefetchnta 72 * SIZE(%edi) + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + punpckldq %mm4, %mm4 + punpckldq %mm5, %mm5 + punpckldq %mm6, %mm6 + punpckldq %mm7, %mm7 + + movq %mm0, 0 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + movq %mm4, 8 * SIZE(BB) + movq %mm5, 10 * SIZE(BB) + movq %mm6, 12 * SIZE(BB) + movq %mm7, 14 * SIZE(BB) + + addl $ 8 * SIZE, %edi + addl $16 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: + movl K, %eax + andl $3, %eax + BRANCH + jle .L30 + ALIGN_4 + +.L26: + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + + movd %mm0, 0 * SIZE(BB) + movd %mm0, 1 * SIZE(BB) + movd %mm1, 2 * SIZE(BB) + movd %mm1, 3 * SIZE(BB) + + addl $2 * SIZE, %edi + addl $4 * SIZE, BB + decl %eax + jne .L26 + ALIGN_4 + +.L30: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + ALIGN_3 + +.L31: + leal - BOFFSET * SIZE + BUFFER, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + movq ( 0 + AOFFSET) * SIZE(AA), %mm0 + pxor %mm4, %mm4 + movq ( 16 + AOFFSET) * SIZE(AA), %mm1 + pxor %mm5, %mm5 + PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2 + pxor %mm6, %mm6 + PADDING movq ( 16 + BOFFSET) * SIZE(BB), %mm3 + pxor %mm7, %mm7 + + prefetchw 2 * SIZE(%esi) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + je .L35 + ALIGN_4 + +.L32: + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movq ( 2 + AOFFSET) * SIZE(AA), %mm0 + + PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA) + + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 8 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 4 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 12 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 10 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movq ( 6 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 32 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 8 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 20 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 18 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movq ( 10 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 24 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 22 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 12 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 28 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 26 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movq ( 14 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 48 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 30 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 32 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 36 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 34 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movq ( 18 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 40 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 38 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 20 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 44 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 42 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movq ( 22 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 64 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 46 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 24 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 52 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 50 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movq ( 26 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 56 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 54 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 28 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 60 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 58 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movq ( 30 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 80 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 62 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 48 + AOFFSET) * SIZE(AA), %mm1 + + subl $-32 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L32 + ALIGN_3 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $15, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2 + + pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movq ( 2 + AOFFSET) * SIZE(AA), %mm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + pfadd %mm6, %mm4 + pfadd %mm7, %mm5 + + movq ALPHA, %mm2 + pswapd %mm5, %mm5 + + pfmul GAMMA_R, %mm4 + pfmul GAMMA_I, %mm5 + + pfadd %mm5, %mm4 + + pswapd %mm4, %mm5 + pfmul %mm2, %mm4 + pfmul %mm2, %mm5 + pfpnacc %mm5, %mm4 + +#ifndef TRMMKERNEL + pfadd 0 * SIZE(%esi), %mm4 +#endif + movq %mm4, 0 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + jg .L31 + ALIGN_4 + +.L999: + EMMS + + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_1x2_barcelona.S b/kernel/x86/zgemm_kernel_1x2_barcelona.S new file mode 100644 index 0000000..f71b095 --- /dev/null +++ b/kernel/x86/zgemm_kernel_1x2_barcelona.S @@ -0,0 +1,728 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define OLD_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define OLD_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define B %edi +#define LDC %ebp +#define AO %edx +#define BO %ecx +#define CO %esi +#define I %ebx + +#define movsd movlps +#define movapd movups +#define movlpd movlps +#define movhpd movhps + +#define PREFETCH prefetch +#define PREFETCHSIZE (8 * 7 + 0) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 addpd +#define ADD2 addpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 addpd +#define ADD2 subpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 subpd +#define ADD2 addpd +#else +#define ADD1 subpd +#define ADD2 subpd +#endif + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm1; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO, %eax, 2); \ + mulpd -14 * SIZE(BO, %eax, 4), %xmm0; \ + ADD1 %xmm1, %xmm4; \ + movapd -12 * SIZE(BO, %eax, 4), %xmm1; \ + ADD1 %xmm0, %xmm6; \ + movddup -15 * SIZE(AO, %eax, 2), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd -14 * SIZE(BO, %eax, 4), %xmm0; \ + ADD2 %xmm0, %xmm7; \ + movddup -14 * SIZE(AO, %eax, 2), %xmm0 + +#define KERNEL2(address) \ + ADD2 %xmm2, %xmm5; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm0, %xmm1; \ + mulpd -10 * SIZE(BO, %eax, 4), %xmm0; \ + ADD1 %xmm1, %xmm4; \ + movapd -8 * SIZE(BO, %eax, 4), %xmm1; \ + ADD1 %xmm0, %xmm6; \ + movddup -13 * SIZE(AO, %eax, 2), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd -10 * SIZE(BO, %eax, 4), %xmm0; \ + ADD2 %xmm0, %xmm7; \ + movddup -12 * SIZE(AO, %eax, 2), %xmm0 + +#define KERNEL3(address) \ + ADD2 %xmm2, %xmm5; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm0, %xmm1; \ + mulpd -6 * SIZE(BO, %eax, 4), %xmm0; \ + ADD1 %xmm1, %xmm4; \ + movapd -4 * SIZE(BO, %eax, 4), %xmm1; \ + ADD1 %xmm0, %xmm6; \ + movddup -11 * SIZE(AO, %eax, 2), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd -6 * SIZE(BO, %eax, 4), %xmm0; \ + ADD2 %xmm0, %xmm7; \ + movddup -10 * SIZE(AO, %eax, 2), %xmm0 + +#define KERNEL4(address) \ + ADD2 %xmm2, %xmm5; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm0, %xmm1; \ + mulpd -2 * SIZE(BO, %eax, 4), %xmm0; \ + ADD1 %xmm1, %xmm4; \ + movapd (BO, %eax, 4), %xmm1; \ + ADD1 %xmm0, %xmm6; \ + movddup -9 * SIZE(AO, %eax, 2), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd -2 * SIZE(BO, %eax, 4), %xmm0; \ + ADD2 %xmm0, %xmm7; \ + movddup (AO, %eax, 2), %xmm0 + +#define KERNEL5(address) \ + ADD2 %xmm2, %xmm5; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm3, %xmm1; \ + mulpd 2 * SIZE(BO, %eax, 4), %xmm3; \ + ADD1 %xmm1, %xmm4; \ + movapd 4 * SIZE(BO, %eax, 4), %xmm1; \ + ADD1 %xmm3, %xmm6; \ + movddup -7 * SIZE(AO, %eax, 2), %xmm3; \ + mulpd %xmm3, %xmm2; \ + mulpd 2 * SIZE(BO, %eax, 4), %xmm3; \ + ADD2 %xmm3, %xmm7; \ + movddup -6 * SIZE(AO, %eax, 2), %xmm3 + +#define KERNEL6(address) \ + ADD2 %xmm2, %xmm5; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm3, %xmm1; \ + mulpd 6 * SIZE(BO, %eax, 4), %xmm3; \ + ADD1 %xmm1, %xmm4; \ + movapd 8 * SIZE(BO, %eax, 4), %xmm1; \ + ADD1 %xmm3, %xmm6; \ + movddup -5 * SIZE(AO, %eax, 2), %xmm3; \ + mulpd %xmm3, %xmm2; \ + mulpd 6 * SIZE(BO, %eax, 4), %xmm3; \ + ADD2 %xmm3, %xmm7; \ + movddup -4 * SIZE(AO, %eax, 2), %xmm3 + +#define KERNEL7(address) \ + ADD2 %xmm2, %xmm5; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm3, %xmm1; \ + mulpd 10 * SIZE(BO, %eax, 4), %xmm3; \ + ADD1 %xmm1, %xmm4; \ + movapd 12 * SIZE(BO, %eax, 4), %xmm1; \ + ADD1 %xmm3, %xmm6; \ + movddup -3 * SIZE(AO, %eax, 2), %xmm3; \ + mulpd %xmm3, %xmm2; \ + mulpd 10 * SIZE(BO, %eax, 4), %xmm3; \ + ADD2 %xmm3, %xmm7; \ + movddup -2 * SIZE(AO, %eax, 2), %xmm3 + +#define KERNEL8(address) \ + ADD2 %xmm2, %xmm5; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm3, %xmm1; \ + mulpd 14 * SIZE(BO, %eax, 4), %xmm3; \ + ADD1 %xmm1, %xmm4; \ + movapd 16 * SIZE(BO, %eax, 4), %xmm1; \ + ADD1 %xmm3, %xmm6; \ + movddup -1 * SIZE(AO, %eax, 2), %xmm3; \ + mulpd %xmm3, %xmm2; \ + mulpd 14 * SIZE(BO, %eax, 4), %xmm3; \ + ADD2 %xmm3, %xmm7; \ + movddup 8 * SIZE(AO, %eax, 2), %xmm3; \ + ADD2 %xmm2, %xmm5; \ + movapd %xmm1, %xmm2 + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl OLD_B, B + movl OLD_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax + +#ifndef LEFT + negl %eax +#endif + + movl %eax, KK +#endif + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + + sall $ZBASE_SHIFT, LDC + + movl N, %eax + sarl $1, %eax + movl %eax, J # j = n + jle .L100 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + leal GEMM_DEFAULT_Q * GEMM_DEFAULT_UNROLL_N * SIZE(B), %eax + movl %eax, BX + + movl C, CO + movl A, AO + movl M, I + testl I, I + jle .L100 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BO +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (B, %eax, 4), BO +#endif + + movl BX, %eax + + prefetcht2 0 * SIZE(%eax) + + subl $-8 * SIZE, BX + + movddup -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm1 + pxor %xmm4, %xmm4 + movddup -8 * SIZE(AO), %xmm3 + pxor %xmm5, %xmm5 + + prefetchw 1 * SIZE(CO) + pxor %xmm6, %xmm6 + prefetchw 1 * SIZE(CO, LDC) + pxor %xmm7, %xmm7 + movapd %xmm1, %xmm2 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + + andl $-8, %eax + + leal (, %eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 4), BO + negl %eax + NOBRANCH + je .L15 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + BRANCH + jl .L12 + ALIGN_3 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + + leal (, %eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 4), BO + negl %eax + ALIGN_4 + +.L16: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO, %eax, 4), %xmm0 + ADD1 %xmm1, %xmm4 + movapd -12 * SIZE(BO, %eax, 4), %xmm1 + ADD1 %xmm0, %xmm6 + movddup -15 * SIZE(AO, %eax, 2), %xmm0 + mulpd %xmm0, %xmm2 + mulpd -14 * SIZE(BO, %eax, 4), %xmm0 + ADD2 %xmm0, %xmm7 + movddup -14 * SIZE(AO, %eax, 2), %xmm0 + ADD2 %xmm2, %xmm5 + movapd %xmm1, %xmm2 + + addl $SIZE, %eax + jl .L16 + ALIGN_4 + +.L14: +#ifndef TRMMKERNEL + movupd 0 * SIZE(CO), %xmm0 + movupd 0 * SIZE(CO, LDC), %xmm1 +#endif + + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) + addsubpd %xmm5, %xmm4 + addsubpd %xmm7, %xmm6 + + pshufd $0x4e, %xmm4, %xmm5 + pshufd $0x4e, %xmm6, %xmm7 +#else + addsubpd %xmm4, %xmm5 + addsubpd %xmm6, %xmm7 + + movapd %xmm5, %xmm4 + pshufd $0x4e, %xmm5, %xmm5 + movapd %xmm7, %xmm6 + pshufd $0x4e, %xmm7, %xmm7 +#endif + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm5 + mulpd %xmm2, %xmm6 + mulpd %xmm3, %xmm7 + + addsubpd %xmm5, %xmm4 + addsubpd %xmm7, %xmm6 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm6 +#endif + + movlpd %xmm4, 0 * SIZE(CO) + movhpd %xmm4, 1 * SIZE(CO) + movlpd %xmm6, 0 * SIZE(CO, LDC) + movhpd %xmm6, 1 * SIZE(CO, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, CO # coffset += 4 + decl I # i -- + jg .L10 + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movl BO, B + + leal (, LDC, 2), %eax + addl %eax, C # c += ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L100: + movl N, %eax + andl $1, %eax + jle .L500 + ALIGN_4 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, CO + movl A, AO + + movl M, I + testl %ebx, I + jle .L500 + ALIGN_4 + +.L110: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BO +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (B, %eax, 2), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + pxor %xmm4, %xmm4 + movddup -15 * SIZE(AO), %xmm1 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + prefetchw 1 * SIZE(CO) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L112 + ALIGN_4 + +.L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + mulpd -16 * SIZE(BO), %xmm0 + ADD1 %xmm0, %xmm4 + movddup -14 * SIZE(AO), %xmm0 + mulpd -16 * SIZE(BO), %xmm1 + ADD2 %xmm1, %xmm5 + movddup -13 * SIZE(AO), %xmm1 + + mulpd -14 * SIZE(BO), %xmm0 + ADD1 %xmm0, %xmm6 + movddup -12 * SIZE(AO), %xmm0 + mulpd -14 * SIZE(BO), %xmm1 + ADD2 %xmm1, %xmm7 + movddup -11 * SIZE(AO), %xmm1 + + mulpd -12 * SIZE(BO), %xmm0 + ADD1 %xmm0, %xmm4 + movddup -10 * SIZE(AO), %xmm0 + mulpd -12 * SIZE(BO), %xmm1 + ADD2 %xmm1, %xmm5 + movddup -9 * SIZE(AO), %xmm1 + + mulpd -10 * SIZE(BO), %xmm0 + ADD1 %xmm0, %xmm6 + movddup -8 * SIZE(AO), %xmm0 + mulpd -10 * SIZE(BO), %xmm1 + ADD2 %xmm1, %xmm7 + movddup -7 * SIZE(AO), %xmm1 + + mulpd -8 * SIZE(BO), %xmm0 + ADD1 %xmm0, %xmm4 + movddup -6 * SIZE(AO), %xmm0 + mulpd -8 * SIZE(BO), %xmm1 + ADD2 %xmm1, %xmm5 + movddup -5 * SIZE(AO), %xmm1 + + mulpd -6 * SIZE(BO), %xmm0 + ADD1 %xmm0, %xmm6 + movddup -4 * SIZE(AO), %xmm0 + mulpd -6 * SIZE(BO), %xmm1 + ADD2 %xmm1, %xmm7 + movddup -3 * SIZE(AO), %xmm1 + + mulpd -4 * SIZE(BO), %xmm0 + ADD1 %xmm0, %xmm4 + movddup -2 * SIZE(AO), %xmm0 + mulpd -4 * SIZE(BO), %xmm1 + ADD2 %xmm1, %xmm5 + movddup -1 * SIZE(AO), %xmm1 + + mulpd -2 * SIZE(BO), %xmm0 + ADD1 %xmm0, %xmm6 + movddup 0 * SIZE(AO), %xmm0 + mulpd -2 * SIZE(BO), %xmm1 + ADD2 %xmm1, %xmm7 + movddup 1 * SIZE(AO), %xmm1 + + subl $-16 * SIZE, AO + subl $-16 * SIZE, BO + decl %eax + jne .L111 + ALIGN_4 + +.L112: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + ALIGN_4 + +.L113: + mulpd -16 * SIZE(BO), %xmm0 + ADD1 %xmm0, %xmm4 + movddup -14 * SIZE(AO), %xmm0 + mulpd -16 * SIZE(BO), %xmm1 + ADD2 %xmm1, %xmm5 + movddup -13 * SIZE(AO), %xmm1 + + addl $2 * SIZE, AO + addl $2 * SIZE, BO + decl %eax + jg .L113 + ALIGN_4 + +.L114: +#ifndef TRMMKERNEL + movupd 0 * SIZE(CO), %xmm0 +#endif + + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + SHUFPD_1 %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) + addsubpd %xmm5, %xmm4 + pshufd $0x4e, %xmm4, %xmm5 +#else + addsubpd %xmm4, %xmm5 + movapd %xmm5, %xmm4 + pshufd $0x4e, %xmm5, %xmm5 +#endif + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm5 + + addsubpd %xmm5, %xmm4 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm4 +#endif + + movlpd %xmm4, 0 * SIZE(CO) + movhpd %xmm4, 1 * SIZE(CO) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, CO # coffset += 4 + decl I # i -- + jg .L110 + ALIGN_4 + +.L500: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_1x2_penryn.S b/kernel/x86/zgemm_kernel_1x2_penryn.S new file mode 100644 index 0000000..8493619 --- /dev/null +++ b/kernel/x86/zgemm_kernel_1x2_penryn.S @@ -0,0 +1,701 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#ifdef NANO +#define PREFETCHSIZE (8 * 3 + 4) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + +#ifdef NEHALEM +#define PREFETCHSIZE (8 * 1 - 4) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + +#ifndef PREFETCH +#define PREFETCH prefetcht0 +#endif + +#ifndef PREFETCHW +#define PREFETCHW prefetcht0 +#endif + +#ifndef PREFETCHB +#define PREFETCHB prefetcht0 +#endif + +#ifndef PREFETCHSIZE +#define PREFETCHSIZE (8 * 13 + 4) +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define C1 %esi +#define I %ebx + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 addpd +#define ADD2 addpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 addpd +#define ADD2 addpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 addpd +#define ADD2 addpd +#else +#define ADD1 addpd +#define ADD2 subpd +#endif + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax +#ifndef LEFT + negl %eax +#endif + movl %eax, KK +#endif + + movl M, %ebx + testl %ebx, %ebx + jle .L999 + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + + sall $ZBASE_SHIFT, LDC + + movl N, %eax + sarl $1, %eax + movl %eax, J + jle .L20 + ALIGN_2 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl B, BX + + movl C, C1 # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + movl BX, %eax + PREFETCHB -16 * SIZE(%eax) + subl $-8 * SIZE, %eax + movl %eax, BX + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + xorps %xmm4, %xmm4 + PREFETCHW 1 * SIZE(C1) + xorps %xmm5, %xmm5 + PREFETCHW 3 * SIZE(C1, LDC) + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + ADD1 %xmm3, %xmm6 + movaps -14 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps -10 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps -6 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps -2 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 0 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + movaps 2 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 4 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps 6 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 8 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps 10 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 12 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps 14 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 16 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + subl $-32 * SIZE, BB + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + + subl $1, %eax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + ADD1 %xmm3, %xmm6 + movaps -14 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + + movaps -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + ADD1 %xmm3, %xmm6 + pcmpeqb %xmm0, %xmm0 + ADD2 %xmm2, %xmm7 + psllq $63, %xmm0 + + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + shufps $0x40, %xmm0, %xmm0 + + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0x04, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + shufps $0x40, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#endif + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 1 * SIZE(C1), %xmm0 + movsd 0 * SIZE(C1, LDC), %xmm1 + movhpd 1 * SIZE(C1, LDC), %xmm1 +#endif + + haddpd %xmm5, %xmm4 + haddpd %xmm7, %xmm6 + + pshufd $0x4e, %xmm4, %xmm5 + pshufd $0x4e, %xmm6, %xmm7 + + mulpd %xmm2, %xmm4 + mulpd %xmm2, %xmm6 + + mulpd %xmm3, %xmm5 + mulpd %xmm3, %xmm7 + + addsubpd %xmm5, %xmm4 + addsubpd %xmm7, %xmm6 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm6 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 1 * SIZE(C1) + movsd %xmm6, 0 * SIZE(C1, LDC) + movhpd %xmm6, 1 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, C1 # coffset += 4 + decl %ebx # i -- + jg .L10 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movl BB, B + + leal (, LDC, 2), %eax + addl %eax, C # c += ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L20: + movl N, %eax + testl $1, %eax + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, C1 # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + ALIGN_4 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + prefetcht0 1 * SIZE(C1) + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -14 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm6 + movaps -12 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -10 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm6 + movaps -8 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -6 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm6 + movaps -4 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -2 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm6 + movaps 0 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L28 + ALIGN_4 + +.L26: + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -14 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addpd %xmm6, %xmm4 + pcmpeqb %xmm0, %xmm0 + addpd %xmm7, %xmm5 + psllq $63, %xmm0 + + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + shufps $0x40, %xmm0, %xmm0 + + pxor %xmm0, %xmm4 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0x04, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + shufps $0x40, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 +#endif + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 1 * SIZE(C1), %xmm0 +#endif + + haddpd %xmm5, %xmm4 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm5 + addsubpd %xmm5, %xmm4 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 1 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, C1 + decl %ebx # i -- + jg .L21 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_1x2_sse2.S b/kernel/x86/zgemm_kernel_1x2_sse2.S new file mode 100644 index 0000000..63fc30a --- /dev/null +++ b/kernel/x86/zgemm_kernel_1x2_sse2.S @@ -0,0 +1,909 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 24 + STACK + ARGS(%esi) +#define STACK_A 32 + STACK + ARGS(%esi) +#define STACK_B 36 + STACK + ARGS(%esi) +#define STACK_C 40 + STACK + ARGS(%esi) +#define STACK_LDC 44 + STACK + ARGS(%esi) +#define STACK_OFFT 48 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define ALPHA_R 16(%esp) +#define ALPHA_I 32(%esp) +#define K 48(%esp) +#define N 52(%esp) +#define M 56(%esp) +#define A 60(%esp) +#define C 64(%esp) +#define J 68(%esp) +#define OLD_STACK 72(%esp) +#define OFFSET 76(%esp) +#define KK 80(%esp) +#define KKK 84(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#endif + +#define PREFETCHSIZE (8 * 10 + 4) + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi + + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movl STACK_M, %ebx + movl STACK_N, %eax + movl STACK_K, %ecx + movl STACK_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl STACK_B, B + movl STACK_C, %ebx +#ifdef TRMMKERNEL + movss STACK_OFFT, %xmm4 +#endif + + movlpd STACK_ALPHA_R, %xmm0 + movlpd STACK_ALPHA_I, %xmm1 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 # Generate mask + pxor %xmm2, %xmm2 + + movlpd %xmm0, 0 + ALPHA_R + movlpd %xmm0, 8 + ALPHA_R + + movlpd %xmm1, 8 + ALPHA_I + xorpd %xmm7, %xmm1 + movlpd %xmm1, 0 + ALPHA_I + + movlpd %xmm2, 0 + POSINV + movlpd %xmm7, 8 + POSINV + + movl %ebx, C + movl STACK_LDC, LDC + +#ifdef TRMMKERNEL + movss %xmm4, OFFSET + movss %xmm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + sall $ZBASE_SHIFT, LDC + + sarl $1, %eax + movl %eax, J # j = n + jle .L100 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + + movapd POSINV, %xmm7 + + movl K, %eax + sarl $1, %eax + jle .L03 + ALIGN_4 + +.L02: + prefetchnta 56 * SIZE(B) + + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + movlpd 4 * SIZE(B), %xmm4 + movlpd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm6 + movlpd 7 * SIZE(B), %xmm7 + + movlpd %xmm0, 0 * SIZE(BB) + movlpd %xmm0, 1 * SIZE(BB) + movlpd %xmm1, 2 * SIZE(BB) + movlpd %xmm1, 3 * SIZE(BB) + movlpd %xmm2, 4 * SIZE(BB) + movlpd %xmm2, 5 * SIZE(BB) + movlpd %xmm3, 6 * SIZE(BB) + movlpd %xmm3, 7 * SIZE(BB) + movlpd %xmm4, 8 * SIZE(BB) + movlpd %xmm4, 9 * SIZE(BB) + movlpd %xmm5, 10 * SIZE(BB) + movlpd %xmm5, 11 * SIZE(BB) + movlpd %xmm6, 12 * SIZE(BB) + movlpd %xmm6, 13 * SIZE(BB) + movlpd %xmm7, 14 * SIZE(BB) + movlpd %xmm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + subl $-16 * SIZE, BB + + decl %eax + jne .L02 + ALIGN_4 + +.L03: + movl K, %eax + andl $1, %eax + BRANCH + jle .L05 + + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + + movlpd %xmm0, 0 * SIZE(BB) + movlpd %xmm0, 1 * SIZE(BB) + movlpd %xmm1, 2 * SIZE(BB) + movlpd %xmm1, 3 * SIZE(BB) + movlpd %xmm2, 4 * SIZE(BB) + movlpd %xmm2, 5 * SIZE(BB) + movlpd %xmm3, 6 * SIZE(BB) + movlpd %xmm3, 7 * SIZE(BB) + + addl $4 * SIZE, B + ALIGN_4 + +.L05: + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + testl %ebx, %ebx + jle .L100 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + prefetchw 2 * SIZE(%esi) + prefetchw 2 * SIZE(%esi, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + +#if 1 + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) + + addl $128 * 4 * SIZE, BB + addl $128 * 1 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB + ALIGN_4 +#else + + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addl $64 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L11 + ALIGN_4 +#endif + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 8 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: + movapd POSINV, %xmm1 + movapd ALPHA_R, %xmm2 + movapd ALPHA_I, %xmm3 + + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm1, %xmm5 + xorpd %xmm1, %xmm7 +#else + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 +#endif + +#ifndef TRMMKERNEL + movlpd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + movlpd 0 * SIZE(%esi, LDC), %xmm1 + movhpd 1 * SIZE(%esi, LDC), %xmm1 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 +#else + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 +#endif + + pshufd $0x4e, %xmm4, %xmm5 + pshufd $0x4e, %xmm6, %xmm7 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm5 + mulpd %xmm2, %xmm6 + mulpd %xmm3, %xmm7 + + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm6 +#endif + + movlpd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + movlpd %xmm6, 0 * SIZE(%esi, LDC) + movhpd %xmm6, 1 * SIZE(%esi, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + jg .L10 + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + addl %eax, C # c += ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L100: + movl N, %eax + andl $1, %eax + jle .L500 + ALIGN_4 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + movapd POSINV, %xmm7 + + movl K, %eax + sarl $2, %eax + jle .L103 + ALIGN_4 + +.L102: + prefetchnta 56 * SIZE(B) + + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + movlpd 4 * SIZE(B), %xmm4 + movlpd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm6 + movlpd 7 * SIZE(B), %xmm7 + + movlpd %xmm0, 0 * SIZE(BB) + movlpd %xmm0, 1 * SIZE(BB) + movlpd %xmm1, 2 * SIZE(BB) + movlpd %xmm1, 3 * SIZE(BB) + movlpd %xmm2, 4 * SIZE(BB) + movlpd %xmm2, 5 * SIZE(BB) + movlpd %xmm3, 6 * SIZE(BB) + movlpd %xmm3, 7 * SIZE(BB) + movlpd %xmm4, 8 * SIZE(BB) + movlpd %xmm4, 9 * SIZE(BB) + movlpd %xmm5, 10 * SIZE(BB) + movlpd %xmm5, 11 * SIZE(BB) + movlpd %xmm6, 12 * SIZE(BB) + movlpd %xmm6, 13 * SIZE(BB) + movlpd %xmm7, 14 * SIZE(BB) + movlpd %xmm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + subl $-16 * SIZE, %ecx + decl %eax + jne .L102 + ALIGN_4 + +.L103: + movl K, %eax + andl $3, %eax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + + movlpd %xmm0, 0 * SIZE(BB) + movlpd %xmm0, 1 * SIZE(BB) + movlpd %xmm1, 2 * SIZE(BB) + movlpd %xmm1, 3 * SIZE(BB) + + addl $2 * SIZE, B + addl $4 * SIZE, %ecx + decl %eax + jne .L104 + ALIGN_4 + +.L105: + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + testl %ebx, %ebx + jle .L500 + ALIGN_4 + +.L110: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L112 + ALIGN_4 + +.L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd 18 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movapd 10 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movapd 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 26 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 14 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L111 + ALIGN_4 + +.L112: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + ALIGN_4 + +.L113: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L113 + ALIGN_4 + +.L114: + movapd POSINV, %xmm1 + movapd ALPHA_R, %xmm2 + movapd ALPHA_I, %xmm3 + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + SHUFPD_1 %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm1, %xmm5 +#else + xorpd %xmm1, %xmm4 +#endif + +#ifndef TRMMKERNEL + movlpd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 +#else + addpd %xmm5, %xmm4 +#endif + + pshufd $0x4e, %xmm4, %xmm5 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm5 + + addpd %xmm5, %xmm4 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm4 +#endif + + movlpd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + jg .L110 + ALIGN_4 + +.L500: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_1x2_sse3.S b/kernel/x86/zgemm_kernel_1x2_sse3.S new file mode 100644 index 0000000..70e6400 --- /dev/null +++ b/kernel/x86/zgemm_kernel_1x2_sse3.S @@ -0,0 +1,857 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#ifdef PENTIUM4 +#define PREFETCH_R (8 * 4) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef PENTIUMM +#define PREFETCH_R (8 * 4) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADDSUB addpd +#else +#define ADDSUB subpd +#endif + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addpd %xmm2, %xmm4; \ + movddup 1 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movddup 2 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 3 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + ADDSUB %xmm2, %xmm7; \ + movddup 4 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 5 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movddup 6 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 7 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + ADDSUB %xmm2, %xmm7; \ + movddup 16 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL3(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 9 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movddup 10 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 11 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + ADDSUB %xmm3, %xmm7; \ + movddup 12 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL4(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 13 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movddup 14 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 15 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + ADDSUB %xmm3, %xmm7; \ + movddup 24 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL5(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 17 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movddup 18 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 19 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + ADDSUB %xmm2, %xmm7; \ + movddup 20 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL6(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 21 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movddup 22 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 23 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + ADDSUB %xmm2, %xmm7 + +#define KERNEL7(address) \ + movddup 32 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 25 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movddup 26 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 27 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + ADDSUB %xmm3, %xmm7; \ + movddup 28 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 29 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movddup 30 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 31 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + ADDSUB %xmm3, %xmm7; \ + movddup 40 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax +#ifndef LEFT + negl %eax +#endif + movl %eax, KK +#endif + + sall $ZBASE_SHIFT, LDC + + movl N, %eax + sarl $1, %eax + movl %eax, J # j = n + jle .L100 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl B, BX + + movl C, %esi # coffset = c + movl A, AA # aoffset = a + + movl M, %ebx + testl %ebx, %ebx + jle .L100 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), BB +#endif + + movl BX, %eax + + prefetcht2 (PREFETCH_R + 0) * SIZE(%eax) + prefetcht2 (PREFETCH_R + 16) * SIZE(%eax) + + subl $-8 * SIZE, BX + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef PENTIUM4 + prefetchnta 3 * SIZE(%esi) + prefetchnta 3 * SIZE(%esi, LDC) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + +#ifdef CORE_PRESCOTT + andl $-8, %eax + sall $4, %eax + je .L12 + +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + cmpl $128 * 1, %eax + jle .L11 + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpl $128 * 2, %eax + jle .L11 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + cmpl $128 * 3, %eax + jle .L11 + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpl $128 * 4, %eax + jle .L11 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + cmpl $128 * 5, %eax + jle .L11 + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpl $128 * 6, %eax + jle .L11 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + cmpl $128 * 7, %eax + jle .L11 + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) +#if 1 + cmpl $128 * 8, %eax + jle .L11 + KERNEL1(16 * 8) + KERNEL2(16 * 8) + KERNEL3(16 * 8) + KERNEL4(16 * 8) + KERNEL5(16 * 8) + KERNEL6(16 * 8) + KERNEL7(16 * 8) + KERNEL8(16 * 8) + cmpl $128 * 9, %eax + jle .L11 + KERNEL1(16 * 9) + KERNEL2(16 * 9) + KERNEL3(16 * 9) + KERNEL4(16 * 9) + KERNEL5(16 * 9) + KERNEL6(16 * 9) + KERNEL7(16 * 9) + KERNEL8(16 * 9) + cmpl $128 * 10, %eax + jle .L11 + KERNEL1(16 * 10) + KERNEL2(16 * 10) + KERNEL3(16 * 10) + KERNEL4(16 * 10) + KERNEL5(16 * 10) + KERNEL6(16 * 10) + KERNEL7(16 * 10) + KERNEL8(16 * 10) + cmpl $128 * 11, %eax + jle .L11 + KERNEL1(16 * 11) + KERNEL2(16 * 11) + KERNEL3(16 * 11) + KERNEL4(16 * 11) + KERNEL5(16 * 11) + KERNEL6(16 * 11) + KERNEL7(16 * 11) + KERNEL8(16 * 11) + cmpl $128 * 12, %eax + jle .L11 + KERNEL1(16 * 12) + KERNEL2(16 * 12) + KERNEL3(16 * 12) + KERNEL4(16 * 12) + KERNEL5(16 * 12) + KERNEL6(16 * 12) + KERNEL7(16 * 12) + KERNEL8(16 * 12) + cmpl $128 * 13, %eax + jle .L11 + KERNEL1(16 * 13) + KERNEL2(16 * 13) + KERNEL3(16 * 13) + KERNEL4(16 * 13) + KERNEL5(16 * 13) + KERNEL6(16 * 13) + KERNEL7(16 * 13) + KERNEL8(16 * 13) + cmpl $128 * 14, %eax + jle .L11 + KERNEL1(16 * 14) + KERNEL2(16 * 14) + KERNEL3(16 * 14) + KERNEL4(16 * 14) + KERNEL5(16 * 14) + KERNEL6(16 * 14) + KERNEL7(16 * 14) + KERNEL8(16 * 14) + cmpl $128 * 15, %eax + jle .L11 + KERNEL1(16 * 15) + KERNEL2(16 * 15) + KERNEL3(16 * 15) + KERNEL4(16 * 15) + KERNEL5(16 * 15) + KERNEL6(16 * 15) + KERNEL7(16 * 15) + KERNEL8(16 * 15) +#else + addl $32 * 4 * SIZE, AA + addl $32 * 8 * SIZE, BB + subl $128 * 8, %eax + jg .L1X +#endif + +.L11: + leal (AA, %eax, 1), AA # * 16 + leal (BB, %eax, 2), BB # * 64 + +#else + + sarl $3, %eax + je .L12 + ALIGN_4 + +.L11: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addl $32 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L11 + ALIGN_4 +#endif + +.L12: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movddup ALPHA_R, %xmm1 + movddup ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + ADDSUB %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: + pcmpeqb %xmm0, %xmm0 + SHUFPD_1 %xmm5, %xmm5 + psllq $63, %xmm0 + SHUFPD_1 %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + shufps $0x04, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0x40, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + shufps $0x40, %xmm0, %xmm0 + + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 +#else + shufps $0x40, %xmm0, %xmm0 + + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 +#endif + + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + + movaps %xmm4, %xmm5 + movaps %xmm6, %xmm7 + + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm6 + + mulpd %xmm3, %xmm5 + mulpd %xmm3, %xmm7 + + addsubpd %xmm5, %xmm4 + addsubpd %xmm7, %xmm6 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC), %xmm2 + movhpd 1 * SIZE(%esi, LDC), %xmm2 + + addpd %xmm0, %xmm4 + addpd %xmm2, %xmm6 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + movsd %xmm6, 0 * SIZE(%esi, LDC) + movhpd %xmm6, 1 * SIZE(%esi, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + jg .L10 + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + movl BB, B + addl %eax, C # c += ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L100: + movl N, %eax + testl $1, %eax + jle .L500 + + movl C, %esi # coffset = c + movl A, AA # aoffset = a + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + testl %ebx, %ebx + jle .L500 + ALIGN_4 + +L110: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef PENTIUM4 + prefetchnta 4 * SIZE(%esi) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je L112 + ALIGN_4 + +L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 4 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 5 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 6 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm5 + movddup 6 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 7 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 16 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm7 + movddup 16 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 9 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm5 + movddup 10 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 11 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 12 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 13 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 14 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm5 + movddup 14 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 15 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 24 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm7 + movddup 24 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne L111 + ALIGN_4 + +L112: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movddup ALPHA_R, %xmm1 + movddup ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je L114 + ALIGN_4 + +L113: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg L113 + ALIGN_4 + +L114: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + pcmpeqb %xmm0, %xmm0 + SHUFPD_1 %xmm5, %xmm5 + psllq $63, %xmm0 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + shufps $0x04, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0x40, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + shufps $0x40, %xmm0, %xmm0 + + pxor %xmm0, %xmm4 +#else + shufps $0x40, %xmm0, %xmm0 + + pxor %xmm0, %xmm4 +#endif + + addpd %xmm5, %xmm4 + + movaps %xmm4, %xmm5 + + SHUFPD_1 %xmm5, %xmm5 + + mulpd %xmm1, %xmm4 + mulpd %xmm3, %xmm5 + + addsubpd %xmm5, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + + addpd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + jg L110 + ALIGN_4 + +.L500: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_2x1_core2.S b/kernel/x86/zgemm_kernel_2x1_core2.S new file mode 100644 index 0000000..3ed5342 --- /dev/null +++ b/kernel/x86/zgemm_kernel_2x1_core2.S @@ -0,0 +1,695 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 24 + STACK + ARGS(%esi) +#define STACK_A 32 + STACK + ARGS(%esi) +#define STACK_B 36 + STACK + ARGS(%esi) +#define STACK_C 40 + STACK + ARGS(%esi) +#define STACK_LDC 44 + STACK + ARGS(%esi) +#define STACK_OFFT 48 + STACK + ARGS(%esi) + +#define ALPHA_R 16(%esp) +#define ALPHA_I 32(%esp) +#define K 48(%esp) +#define N 52(%esp) +#define M 56(%esp) +#define A 60(%esp) +#define C 64(%esp) +#define J 68(%esp) +#define BX 72(%esp) +#define OLD_STACK 76(%esp) +#define OFFSET 80(%esp) +#define KK 84(%esp) +#define KKK 88(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define PREFETCH_R (8 * 16 + 0) +#define PREFETCH_W (PREFETCH_R * 2) + +#define PREFETCHSIZE (8 * 16 + 4) +#define PREFETCH prefetcht0 + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx +#define C1 %esi + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 addpd +#define ADD2 addpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 addpd +#define ADD2 subpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 subpd +#define ADD2 addpd +#else +#define ADD1 subpd +#define ADD2 subpd +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC +#ifdef TRMMKERNEL + movd STACK_OFFT, %mm4 +#endif + + movsd STACK_ALPHA_R, %xmm0 + movsd STACK_ALPHA_I, %xmm1 + + movddup %xmm0, %xmm0 + movddup %xmm1, %xmm1 + + movapd %xmm0, ALPHA_R + movapd %xmm1, ALPHA_I + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK +#ifdef TRMMKERNEL + movd %mm4, OFFSET + movd %mm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + sall $ZBASE_SHIFT, LDC + + movl %eax, J # j = n + testl %eax, %eax + jle .L999 + ALIGN_2 + +.L01: + leal 16 * SIZE + BUFFER, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sarl $2, %eax + jle .L03 + ALIGN_2 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + movapd %xmm2, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) + + addl $ 8 * SIZE, B + subl $-16 * SIZE, BB + decl %eax + jne .L02 + ALIGN_2 + +.L03: + movl K, %eax + andl $3, %eax + BRANCH + jle .L05 + ALIGN_2 + +.L04: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + + addl $ 2 * SIZE, B + addl $ 4 * SIZE, BB + decl %eax + jne .L04 + ALIGN_4 + +.L05: + movl B, BX + + movl C, C1 # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal 16 * SIZE + BUFFER, BB +#else + + leal 16 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm3 + pxor %xmm6, %xmm6 + prefetcht0 3 * SIZE(C1) + pxor %xmm7, %xmm7 + movapd %xmm1, %xmm2 + + movl BX, %eax + prefetcht0 (%eax) + subl $-8 * SIZE, %eax + movl %eax, BX + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + ADD2 %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + ADD1 %xmm2, %xmm6 + movapd -12 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + ADD2 %xmm1, %xmm7 + PADDING; + movapd %xmm2, %xmm1 + + mulpd %xmm0, %xmm2 + ADD1 %xmm2, %xmm4 + movapd -10 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm0 + ADD2 %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm6 + movapd -8 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + PADDING; + movapd 0 * SIZE(AA), %xmm0 + ADD2 %xmm2, %xmm7 + PADDING; + movapd %xmm1, %xmm2 + + mulpd %xmm3, %xmm1 + ADD1 %xmm1, %xmm4 + movapd -6 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + ADD2 %xmm3, %xmm5 + movapd -6 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + ADD1 %xmm2, %xmm6 + movapd -4 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm1 + movapd -4 * SIZE(AA), %xmm3 + ADD2 %xmm1, %xmm7 + PADDING; + movapd %xmm2, %xmm1 + + mulpd %xmm3, %xmm2 + ADD1 %xmm2, %xmm4 + movapd -2 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm3 + ADD2 %xmm3, %xmm5 + movapd -2 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm1 + ADD1 %xmm1, %xmm6 + PADDING; + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm3, %xmm2 + movapd 8 * SIZE(AA), %xmm3 + ADD2 %xmm2, %xmm7 + PADDING; + movapd %xmm1, %xmm2 + + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm4 + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + ADD2 %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + ADD1 %xmm2, %xmm6 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm1 + movapd 4 * SIZE(AA), %xmm0 + ADD2 %xmm1, %xmm7 + PADDING; + movapd %xmm2, %xmm1 + + mulpd %xmm0, %xmm2 + ADD1 %xmm2, %xmm4 + movapd 6 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm0 + ADD2 %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm6 + movapd 8 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movapd 16 * SIZE(AA), %xmm0 + ADD2 %xmm2, %xmm7 + PADDING; + movapd %xmm1, %xmm2 + + mulpd %xmm3, %xmm1 + ADD1 %xmm1, %xmm4 + movapd 10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + ADD2 %xmm3, %xmm5 + movapd 10 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + ADD1 %xmm2, %xmm6 + movapd 12 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm1 + movapd 12 * SIZE(AA), %xmm3 + ADD2 %xmm1, %xmm7 + PADDING; + movapd %xmm2, %xmm1 + + mulpd %xmm3, %xmm2 + ADD1 %xmm2, %xmm4 + movapd 14 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm3 + subl $-32 * SIZE, BB + ADD2 %xmm3, %xmm5 + movapd 14 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm1 + ADD1 %xmm1, %xmm6 + movapd -16 * SIZE(BB), %xmm1 + mulpd %xmm3, %xmm2 + movapd 24 * SIZE(AA), %xmm3 + ADD2 %xmm2, %xmm7 + PADDING; + movapd %xmm1, %xmm2 + + subl $-32 * SIZE, AA + decl %eax + BRANCH + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm3 + mulpd %xmm0, %xmm1 + movapd -14 * SIZE(AA), %xmm0 + ADD2 %xmm1, %xmm5 + movapd -12 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + ADD1 %xmm2, %xmm6 + mulpd %xmm0, %xmm3 + movapd -12 * SIZE(AA), %xmm0 + ADD2 %xmm3, %xmm7 + movapd %xmm1, %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + movapd ALPHA_R, %xmm2 + movapd ALPHA_I, %xmm3 + + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + addsubpd %xmm5, %xmm4 + addsubpd %xmm7, %xmm6 + + movapd %xmm4, %xmm5 + movapd %xmm6, %xmm7 +#else + addsubpd %xmm4, %xmm5 + addsubpd %xmm6, %xmm7 + + movapd %xmm5, %xmm4 + movapd %xmm7, %xmm6 +#endif + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 1 * SIZE(C1), %xmm0 + movsd 2 * SIZE(C1), %xmm1 + movhpd 3 * SIZE(C1), %xmm1 +#endif + + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + + mulpd %xmm2, %xmm4 + mulpd %xmm2, %xmm6 + + mulpd %xmm3, %xmm5 + mulpd %xmm3, %xmm7 + + addsubpd %xmm5, %xmm4 + addsubpd %xmm7, %xmm6 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm6 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 1 * SIZE(C1) + movsd %xmm6, 2 * SIZE(C1) + movhpd %xmm6, 3 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $4 * SIZE, C1 # coffset += 4 + decl %ebx # i -- + jg .L10 + +.L20: + movl M, %ebx + testl $1, %ebx + je .L29 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal 16 * SIZE + BUFFER, %ecx +#else + + leal 16 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movapd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax + addl $1, %eax + movl %eax, KKK +#endif + + sarl $3, %eax + jle .L22 + +.L21: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + ADD1 %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + ADD2 %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BB), %xmm0 + ADD1 %xmm1, %xmm6 + movapd 0 * SIZE(BB), %xmm1 + ADD2 %xmm0, %xmm7 + movapd -12 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -6 * SIZE(BB), %xmm0 + ADD1 %xmm3, %xmm4 + movapd -4 * SIZE(BB), %xmm3 + ADD2 %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -2 * SIZE(BB), %xmm0 + ADD1 %xmm3, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + ADD2 %xmm0, %xmm7 + movapd 0 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm1 + mulpd 2 * SIZE(BB), %xmm2 + ADD1 %xmm1, %xmm4 + movapd 4 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + movapd -6 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 6 * SIZE(BB), %xmm2 + ADD1 %xmm1, %xmm6 + movapd 16 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + movapd -4 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 10 * SIZE(BB), %xmm2 + ADD1 %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm5 + movapd -2 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 14 * SIZE(BB), %xmm2 + ADD1 %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + movapd 8 * SIZE(AA), %xmm2 + + subl $-16 * SIZE, AA + addl $ 32 * SIZE, BB + decl %eax # l-- + jg .L21 + ALIGN_2 + +.L22: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # l = (k & 3) + jle .L24 + ALIGN_2 + +.L23: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + ADD1 %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + ADD2 %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax # l-- + jg .L23 + +.L24: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + movapd ALPHA_R, %xmm2 + movapd ALPHA_I, %xmm3 + + SHUFPD_1 %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + addsubpd %xmm5, %xmm4 + movapd %xmm4, %xmm5 +#else + addsubpd %xmm4, %xmm5 + movapd %xmm5, %xmm4 +#endif + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(C1), %xmm0 + movhpd 1 * SIZE(C1), %xmm0 +#endif + + SHUFPD_1 %xmm5, %xmm5 + + mulpd %xmm2, %xmm4 + + mulpd %xmm3, %xmm5 + + addsubpd %xmm5, %xmm4 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 1 * SIZE(C1) + ALIGN_2 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $1, KK +#endif + + addl LDC, C # c += ldc + decl J # j -- + jg .L01 + +.L999: + movl OLD_STACK, %esp + + emms + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_2 + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_2x1_sse2.S b/kernel/x86/zgemm_kernel_2x1_sse2.S new file mode 100644 index 0000000..3ef96d1 --- /dev/null +++ b/kernel/x86/zgemm_kernel_2x1_sse2.S @@ -0,0 +1,824 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE (8 * 4) + +#if !defined(HAVE_SSE2) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 24 + STACK + ARGS(%esi) +#define STACK_A 32 + STACK + ARGS(%esi) +#define STACK_B 36 + STACK + ARGS(%esi) +#define STACK_C 40 + STACK + ARGS(%esi) +#define STACK_LDC 44 + STACK + ARGS(%esi) +#define STACK_OFFT 48 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define ALPHA_R 16(%esp) +#define ALPHA_I 32(%esp) +#define K 48(%esp) +#define N 52(%esp) +#define M 56(%esp) +#define A 60(%esp) +#define C 64(%esp) +#define J 68(%esp) +#define BX 72(%esp) +#define OLD_STACK 76(%esp) +#define OFFSET 80(%esp) +#define KK 84(%esp) +#define KKK 88(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx + +#define KERNEL1(address) \ + movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL4(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL5(address) \ + movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 20 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL6(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 32 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL7(address) \ + movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 28 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 40 * SIZE + (address) * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC +#ifdef TRMMKERNEL + movd STACK_OFFT, %mm4 +#endif + + movsd STACK_ALPHA_R, %xmm0 + movsd STACK_ALPHA_I, %xmm1 + + pxor %xmm7, %xmm7 + cmpeqpd %xmm7, %xmm7 + psllq $63, %xmm7 # Generate mask + pxor %xmm2, %xmm2 + + movsd %xmm0, 0 + ALPHA_R + movsd %xmm0, 8 + ALPHA_R + + movsd %xmm1, 8 + ALPHA_I + xorpd %xmm7, %xmm1 + movsd %xmm1, 0 + ALPHA_I + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + movsd %xmm7, 0 + POSINV + movsd %xmm2, 8 + POSINV +#else + movsd %xmm2, 0 + POSINV + movsd %xmm7, 8 + POSINV +#endif + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK +#ifdef TRMMKERNEL + movd %mm4, OFFSET + movd %mm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + sall $ZBASE_SHIFT, LDC + movl %eax, J # j = n + testl %eax, %eax + jle .L999 + ALIGN_2 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + movapd POSINV, %xmm7 + + movl K, %eax + sarl $2, %eax + jle .L03 + ALIGN_2 + +.L02: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 +#else + xorpd %xmm7, %xmm0 + xorpd %xmm7, %xmm2 +#endif + + movapd %xmm0, 0 * SIZE(BB) + movapd %xmm1, 2 * SIZE(BB) + movapd %xmm2, 4 * SIZE(BB) + movapd %xmm3, 6 * SIZE(BB) + + movsd 4 * SIZE(B), %xmm0 + movsd 5 * SIZE(B), %xmm1 + movsd 6 * SIZE(B), %xmm2 + movsd 7 * SIZE(B), %xmm3 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 +#else + xorpd %xmm7, %xmm0 + xorpd %xmm7, %xmm2 +#endif + + movapd %xmm0, 8 * SIZE(BB) + movapd %xmm1, 10 * SIZE(BB) + movapd %xmm2, 12 * SIZE(BB) + movapd %xmm3, 14 * SIZE(BB) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L02 + ALIGN_2 + +.L03: + movl K, %eax + andl $3, %eax + BRANCH + jle .L05 + ALIGN_2 + +.L04: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm1 +#else + xorpd %xmm7, %xmm0 +#endif + + movapd %xmm0, 0 * SIZE(BB) + movapd %xmm1, 2 * SIZE(BB) + + addl $ 2 * SIZE, B + addl $ 4 * SIZE, BB + decl %eax + jne .L04 + ALIGN_4 + +.L05: + movl B, BX + + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L10: + movl BX, %eax + + prefetcht2 0 * SIZE(%eax) + + subl $-8 * SIZE, BX + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movapd 0 * SIZE + BUFFER, %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE + BUFFER, %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#endif + + prefetchnta 3 * SIZE(%esi) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + andl $-8, %eax + NOBRANCH + je .L12 + sall $3, %eax + +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $64 * 1, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $64 * 2, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $64 * 3, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $64 * 4, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $64 * 5, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $64 * 6, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $64 * 7, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $64 * 4 * SIZE, AA + addl $64 * 4 * SIZE, BB + subl $64 * 8, %eax + BRANCH + jg .L1X + +.L11: + leal (BB, %eax, 4), BB + leal (AA, %eax, 4), AA + +.L12: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 0 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm1 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm1 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm7 + + addl $4 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L13 + +.L14: + movapd ALPHA_R, %xmm2 + movapd ALPHA_I, %xmm3 + + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 +#else + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 +#endif + + movapd %xmm4, %xmm5 + movapd %xmm6, %xmm7 + + SHUFPD_1 %xmm4, %xmm4 + SHUFPD_1 %xmm6, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhpd 3 * SIZE(%esi), %xmm1 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm6 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + movsd %xmm6, 2 * SIZE(%esi) + movhpd %xmm6, 3 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $4 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + jg .L10 + +.L50: + movl M, %ebx + testl $1, %ebx + je .L99 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, %ecx + + movapd 0 * SIZE + BUFFER, %xmm1 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE + BUFFER, %xmm2 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movapd 0 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 +#endif + + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax + addl $1, %eax + movl %eax, KKK +#endif + + sarl $2, %eax # l = (k >> 2) + jle .L52 + +.L51: + mulpd %xmm0, %xmm1 + movapd 2 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm4 + movapd 16 * SIZE(BB), %xmm1 + + mulpd %xmm0, %xmm3 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm5 + movapd 4 * SIZE(BB), %xmm3 + + mulpd %xmm0, %xmm3 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + + addpd %xmm0, %xmm5 + movapd 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 10 * SIZE(BB), %xmm0 + + addpd %xmm2, %xmm4 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + + movapd 12 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 24 * SIZE(BB), %xmm2 + + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm5 + movapd 8 * SIZE(AA), %xmm0 + + addl $ 8 * SIZE, AA # aoffset += 2 + addl $16 * SIZE, BB # boffset1 += 4 + + decl %eax # l-- + jg .L51 + ALIGN_2 + +.L52: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $3, %eax # l = (k & 3) + jle .L54 + ALIGN_2 + +.L53: + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA # aoffset += 2 + addl $4 * SIZE, BB # boffset1 += 4 + decl %eax # l-- + jg .L53 + +.L54: + movapd ALPHA_R, %xmm2 + movapd ALPHA_I, %xmm3 + + SHUFPD_1 %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 +#else + addpd %xmm5, %xmm4 +#endif + + movapd %xmm4, %xmm5 + + SHUFPD_1 %xmm4, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm5, %xmm4 + +#ifndef TRMMKERNEL + SHUFPD_2 %xmm4, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + + addpd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_2 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $1, KK +#endif + + addl LDC, C # c += ldc + decl J # j -- + jg .L01 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_2 + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_2x2_barcelona.S b/kernel/x86/zgemm_kernel_2x2_barcelona.S new file mode 100644 index 0000000..2ad6893 --- /dev/null +++ b/kernel/x86/zgemm_kernel_2x2_barcelona.S @@ -0,0 +1,1363 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 20 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define ALPHA_R 16(%esp) +#define ALPHA_I 32(%esp) +#define K 48(%esp) +#define N 52(%esp) +#define M 56(%esp) +#define A 60(%esp) +#define C 64(%esp) +#define J 68(%esp) +#define OLD_STACK 72(%esp) +#define OFFSET 76(%esp) +#define KK 80(%esp) +#define KKK 84(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define PREFETCH prefetch +#define PREFETCHSIZE (16 * 17 + 0) + +#define RPREFETCHSIZE (16 * 9 + 0) +#define WPREFETCHSIZE (16 * 9 + 0) + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addps %xmm2, %xmm4; \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 1 * SIZE(AA); \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movl STACK_M, %ebx + movl STACK_N, %eax + movl STACK_K, %ecx + movl STACK_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl STACK_B, %edi + movl STACK_C, %ebx +#ifdef TRMMKERNEL + movss STACK_OFFT, %xmm4 +#endif + + movss STACK_ALPHA_R, %xmm0 + movss STACK_ALPHA_I, %xmm1 + + xorps %xmm7, %xmm7 + cmpeqps %xmm7, %xmm7 + pslld $31, %xmm7 # Generate mask + xorps %xmm2, %xmm2 + + shufps $0, %xmm0, %xmm0 + + movaps %xmm0, 0 + ALPHA_R + movss %xmm1, 4 + ALPHA_I + movss %xmm1, 12 + ALPHA_I + xorps %xmm7, %xmm1 + movss %xmm1, 0 + ALPHA_I + movss %xmm1, 8 + ALPHA_I + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + movss %xmm7, 0 + POSINV + movss %xmm2, 4 + POSINV + movss %xmm7, 8 + POSINV + movss %xmm2, 12 + POSINV +#else + movss %xmm2, 0 + POSINV + movss %xmm7, 4 + POSINV + movss %xmm2, 8 + POSINV + movss %xmm7, 12 + POSINV +#endif + + EMMS + + movl %ebx, C + movl STACK_LDC, LDC + +#ifdef TRMMKERNEL + movss %xmm4, OFFSET + movss %xmm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + sall $ZBASE_SHIFT, LDC + movl %eax, J # j = n + sarl $1, J + jle .L100 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + + movaps POSINV, %xmm7 + + movl K, %eax + sarl $1, %eax + jle .L03 + ALIGN_4 + +.L02: + prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) + + movaps 0 * SIZE(%edi), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 + xorps %xmm7, %xmm3 +#else + xorps %xmm7, %xmm0 + xorps %xmm7, %xmm2 +#endif + + prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + + movaps 4 * SIZE(%edi), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 + xorps %xmm7, %xmm3 +#else + xorps %xmm7, %xmm0 + xorps %xmm7, %xmm2 +#endif + + prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) + + movaps %xmm0, 16 * SIZE(%ecx) + movaps %xmm1, 20 * SIZE(%ecx) + movaps %xmm2, 24 * SIZE(%ecx) + movaps %xmm3, 28 * SIZE(%ecx) + + addl $ 8 * SIZE, %edi + subl $-32 * SIZE, %ecx + + decl %eax + jne .L02 + ALIGN_4 + +.L03: + movl K, %eax + andl $1, %eax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movaps 0 * SIZE(%edi), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 + xorps %xmm7, %xmm3 +#else + xorps %xmm7, %xmm0 + xorps %xmm7, %xmm2 +#endif + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + + addl $ 4 * SIZE, %edi + ALIGN_4 + +.L05: + movl C, %esi + movl A, %edx + movl M, %ebx + sarl $1, %ebx + jle .L30 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + prefetchw 3 * SIZE(%esi) + prefetchw 3 * SIZE(%esi, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $128 * 8 * SIZE, BB + addl $128 * 2 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 +#else + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 +#endif + + movaps %xmm4, %xmm5 + movaps %xmm6, %xmm7 + + shufps $0xb1, %xmm4, %xmm4 + shufps $0xb1, %xmm6, %xmm6 + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + mulps %xmm1, %xmm7 + mulps %xmm3, %xmm6 + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#ifndef TRMMKERNEL + shufps $0xe4, %xmm0, %xmm0 + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + shufps $0xe4, %xmm2, %xmm2 + movsd 0 * SIZE(%esi, LDC), %xmm2 + movhps 2 * SIZE(%esi, LDC), %xmm2 + + addps %xmm0, %xmm4 + addps %xmm2, %xmm6 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + movlps %xmm6, 0 * SIZE(%esi, LDC) + movhps %xmm6, 2 * SIZE(%esi, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $4 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + jg .L10 + ALIGN_4 + +.L30: + movl M, %ebx + andl $1, %ebx + jle .L99 + ALIGN_4 + +.L40: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movsd 0 * SIZE(AA), %xmm0 + movsd 8 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L42 + ALIGN_4 + +.L41: + mulps %xmm0, %xmm2 + prefetcht1 (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 2 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movsd 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 44 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 6 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + mulps 60 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movsd 16 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) + addps %xmm2, %xmm4 + movaps 68 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 72 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + mulps 76 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 96 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movsd 10 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 84 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 88 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + mulps 92 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 112 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movsd 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 100 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 104 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + mulps 108 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 128 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movsd 14 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 116 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 120 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + mulps 124 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 144 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movsd 24 * SIZE(AA), %xmm1 + addl $ 16 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L41 + ALIGN_4 + +.L42: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L44 + ALIGN_4 + +.L43: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 2 * SIZE(AA), %xmm0 + + addl $ 2 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L43 + ALIGN_4 + +.L44: + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 +#else + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 +#endif + + movaps %xmm4, %xmm5 + movaps %xmm6, %xmm7 + + shufps $0xb1, %xmm4, %xmm4 + shufps $0xb1, %xmm6, %xmm6 + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + mulps %xmm1, %xmm7 + mulps %xmm3, %xmm6 + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#ifndef TRMMKERNEL + shufps $0xe4, %xmm4, %xmm4 + shufps $0xe4, %xmm6, %xmm6 + + movsd 0 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC), %xmm2 + + addps %xmm0, %xmm4 + addps %xmm2, %xmm6 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + movlps %xmm6, 0 * SIZE(%esi, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (LDC, LDC), %eax + addl %eax, C # c += 2 * ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L100: + movl N, %eax + andl $1, %eax + jle .L999 + ALIGN_4 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + movaps POSINV, %xmm7 + + movl K, %eax + sarl $2, %eax + jle .L103 + ALIGN_4 + +.L102: + prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) + + movaps 0 * SIZE(%edi), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 + xorps %xmm7, %xmm3 +#else + xorps %xmm7, %xmm0 + xorps %xmm7, %xmm2 +#endif + + prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + + movaps 4 * SIZE(%edi), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 + xorps %xmm7, %xmm3 +#else + xorps %xmm7, %xmm0 + xorps %xmm7, %xmm2 +#endif + + prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) + + movaps %xmm0, 16 * SIZE(%ecx) + movaps %xmm1, 20 * SIZE(%ecx) + movaps %xmm2, 24 * SIZE(%ecx) + movaps %xmm3, 28 * SIZE(%ecx) + + addl $ 8 * SIZE, B + subl $-32 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L103: + movl K, %eax + andl $3, %eax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movsd 0 * SIZE(%edi), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 +#else + xorps %xmm7, %xmm0 +#endif + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + + addl $ 2 * SIZE, %edi + addl $ 8 * SIZE, %ecx + decl %eax + jne .L104 + ALIGN_4 + +.L105: + movl C, %esi + movl A, AA + movl M, %ebx + sarl $1, %ebx + jle .L130 + ALIGN_4 + +.L110: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + prefetchw 3 * SIZE(%esi) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L112 + ALIGN_4 + +.L111: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movaps 12 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 28 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 48 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $ 32 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L111 + ALIGN_4 + +.L112: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + ALIGN_4 + +.L113: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jg .L113 + ALIGN_4 + +.L114: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm5, %xmm4 +#else + addps %xmm5, %xmm4 +#endif + + movaps %xmm4, %xmm5 + + shufps $0xb1, %xmm4, %xmm4 + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + + addps %xmm5, %xmm4 + +#ifndef TRMMKERNEL + shufps $0xe4, %xmm4, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $4 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + jg .L110 + ALIGN_4 + +.L130: + movl M, %ebx + andl $1, %ebx + jle .L999 + ALIGN_4 + +.L140: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L142 + ALIGN_4 + +.L141: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $ 16 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L141 + ALIGN_4 + +.L142: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L144 + ALIGN_4 + +.L143: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movsd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L143 + ALIGN_4 + +.L144: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm5, %xmm4 +#else + addps %xmm5, %xmm4 +#endif + + movaps %xmm4, %xmm5 + + shufps $0xb1, %xmm4, %xmm4 + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + + addps %xmm5, %xmm4 + +#ifndef TRMMKERNEL + shufps $0xe4, %xmm4, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + ALIGN_4 + +.L999: + EMMS + + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_2x2_penryn.S b/kernel/x86/zgemm_kernel_2x2_penryn.S new file mode 100644 index 0000000..edd89b1 --- /dev/null +++ b/kernel/x86/zgemm_kernel_2x2_penryn.S @@ -0,0 +1,1210 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 20 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#ifdef NANO +#define PREFETCHSIZE (16 * 3 + 8) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + +#ifdef NEHALEM +#define PREFETCHSIZE (16 * 1 + 8) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + +#ifndef PREFETCH +#define PREFETCH prefetcht0 +#endif + +#ifndef PREFETCHW +#define PREFETCHW prefetcht0 +#endif + +#ifndef PREFETCHB +#define PREFETCHB prefetcht0 +#endif + +#ifndef PREFETCHSIZE +#define PREFETCHSIZE (16 * 13 + 8) +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define C1 %esi +#define I %ebx + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 addps +#define ADD2 addps +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 addps +#define ADD2 addps +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 addps +#define ADD2 addps +#else +#define ADD1 addps +#define ADD2 subps +#endif + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax +#ifndef LEFT + negl %eax +#endif + movl %eax, KK +#endif + + subl $-32 * SIZE, A + subl $-32 * SIZE, B + + sall $ZBASE_SHIFT, LDC + + movl N, %eax + sarl $1, %eax + movl %eax, J + jle .L30 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl B, BX + + movl C, C1 + movl A, AA + + movl M, %ebx + sarl $1, %ebx + jle .L20 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + + movl BX, %eax + PREFETCHB -32 * SIZE(%eax) + subl $-16 * SIZE, %eax + movl %eax, BX + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + xorps %xmm4, %xmm4 + PREFETCHW 3 * SIZE(C1) + xorps %xmm5, %xmm5 + PREFETCHW 7 * SIZE(C1, LDC) + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + subl $-32 * SIZE, BB + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + subl $-32 * SIZE, AA + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -32 * SIZE(AA), %xmm0 + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + + decl %eax + jg .L16 + ALIGN_4 + +.L18: + ADD2 %xmm2, %xmm7 + pcmpeqb %xmm0, %xmm0 + ADD1 %xmm3, %xmm6 + psllq $63, %xmm0 + + movsd ALPHA_R, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + pshufd $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#endif + + haddps %xmm5, %xmm4 + haddps %xmm7, %xmm6 + + shufps $0xd8, %xmm4, %xmm4 + shufps $0xd8, %xmm6, %xmm6 + + movaps %xmm4, %xmm5 + shufps $0xe4, %xmm6, %xmm4 + shufps $0xe4, %xmm5, %xmm6 + + pshufd $0x00, %xmm3, %xmm2 + pshufd $0x55, %xmm3, %xmm3 + + pshufd $0xb1, %xmm4, %xmm5 + pshufd $0xb1, %xmm6, %xmm7 + + mulps %xmm2, %xmm4 + mulps %xmm3, %xmm5 + + mulps %xmm2, %xmm6 + mulps %xmm3, %xmm7 + + addsubps %xmm5, %xmm4 + addsubps %xmm7, %xmm6 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(C1), %xmm2 + movhps 2 * SIZE(C1), %xmm2 + movsd 0 * SIZE(C1, LDC), %xmm3 + movhps 2 * SIZE(C1, LDC), %xmm3 + + addps %xmm2, %xmm4 + addps %xmm3, %xmm6 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhps %xmm4, 2 * SIZE(C1) + movsd %xmm6, 0 * SIZE(C1, LDC) + movhps %xmm6, 2 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $4 * SIZE, C1 + decl %ebx + jg .L10 + ALIGN_4 + +.L20: + movl M, %ebx + testl $1, %ebx + jle .L29 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -26 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -22 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -18 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps 0 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -16 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + subl $-32 * SIZE, BB + + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L28 + ALIGN_4 + +.L26: + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addps %xmm2, %xmm6 + addps %xmm3, %xmm7 + + movsd ALPHA_R, %xmm3 + + pshufd $0xb1, %xmm5, %xmm5 + pcmpeqb %xmm0, %xmm0 + pshufd $0xb1, %xmm7, %xmm7 + psllq $63, %xmm0 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 + + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 +#else + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 + + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 +#endif + + pshufd $0x00, %xmm3, %xmm2 + pshufd $0x55, %xmm3, %xmm3 + + pshufd $0xb1, %xmm4, %xmm5 + pshufd $0xb1, %xmm6, %xmm7 + + mulps %xmm2, %xmm4 + mulps %xmm3, %xmm5 + + mulps %xmm2, %xmm6 + mulps %xmm3, %xmm7 + + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 + + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(C1), %xmm2 + movsd 0 * SIZE(C1, LDC), %xmm3 + + addps %xmm2, %xmm4 + addps %xmm3, %xmm6 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movsd %xmm6, 0 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, C1 + ALIGN_2 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + movl BB, B + + leal (, LDC, 2), %eax + addl %eax, C + + decl J + jg .L01 + ALIGN_4 + +.L30: + movl N, %eax + testl $1, %eax + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, C1 + movl A, AA + + movl M, %ebx + sarl $1, %ebx + jle .L40 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(C1) + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps 0 * SIZE(AA), %xmm0 + + subl $-32 * SIZE, AA + subl $-16 * SIZE, BB + + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movsd -32 * SIZE(BB), %xmm1 + andl $7, %eax + BRANCH + je .L38 + ALIGN_4 + +.L36: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + + decl %eax + jg .L36 + ALIGN_4 + +.L38: + addps %xmm2, %xmm4 + addps %xmm3, %xmm5 + + movsd ALPHA_R, %xmm3 + + pshufd $0xb1, %xmm5, %xmm5 + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + pxor %xmm0, %xmm5 + subps %xmm5, %xmm4 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + pxor %xmm0, %xmm5 + addps %xmm5, %xmm4 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + pxor %xmm0, %xmm4 + addps %xmm5, %xmm4 +#else + pxor %xmm0, %xmm4 + subps %xmm5, %xmm4 +#endif + + pshufd $0x00, %xmm3, %xmm2 + pshufd $0x55, %xmm3, %xmm3 + + pshufd $0xb1, %xmm4, %xmm5 + + mulps %xmm2, %xmm4 + mulps %xmm3, %xmm5 + + pxor %xmm0, %xmm5 + subps %xmm5, %xmm4 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(C1), %xmm2 + movhps 2 * SIZE(C1), %xmm2 + + addps %xmm2, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhps %xmm4, 2 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $4 * SIZE, C1 + decl %ebx + jg .L31 + ALIGN_4 + +.L40: + movl M, %ebx + testl $1, %ebx + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -26 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -26 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -22 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -22 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -18 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -18 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -16 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L48 + ALIGN_4 + +.L46: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addps %xmm2, %xmm4 + addps %xmm3, %xmm5 + + movsd ALPHA_R, %xmm3 + + pshufd $0xb1, %xmm5, %xmm5 + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + pxor %xmm0, %xmm5 + subps %xmm5, %xmm4 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + pxor %xmm0, %xmm5 + addps %xmm5, %xmm4 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + pxor %xmm0, %xmm4 + addps %xmm5, %xmm4 +#else + pxor %xmm0, %xmm4 + subps %xmm5, %xmm4 +#endif + + pshufd $0x00, %xmm3, %xmm2 + pshufd $0x55, %xmm3, %xmm3 + + pshufd $0xb1, %xmm4, %xmm5 + + mulps %xmm2, %xmm4 + mulps %xmm3, %xmm5 + + pxor %xmm0, %xmm5 + subps %xmm5, %xmm4 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(C1), %xmm2 + addps %xmm2, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_2x2_sse.S b/kernel/x86/zgemm_kernel_2x2_sse.S new file mode 100644 index 0000000..fad42cc --- /dev/null +++ b/kernel/x86/zgemm_kernel_2x2_sse.S @@ -0,0 +1,1562 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 20 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define ALPHA_R 16(%esp) +#define ALPHA_I 32(%esp) +#define K 48(%esp) +#define N 52(%esp) +#define M 56(%esp) +#define A 60(%esp) +#define C 64(%esp) +#define J 68(%esp) +#define OLD_STACK 72(%esp) +#define OFFSET 76(%esp) +#define KK 80(%esp) +#define KKK 84(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#ifdef ATHLON +#define PREFETCHSIZE 64 +#define WPREFETCHSIZE 80 +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#endif + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCHSIZE (16 * 10 + 8) +#define WPREFETCHSIZE 112 +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE 168 +#endif + +#if defined(OPTERON) || !defined(HAVE_SSE2) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + +#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 1 * SIZE(AA); \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; +#endif + +#ifdef PENTIUM4 +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1 +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movl STACK_M, %ebx + movl STACK_N, %eax + movl STACK_K, %ecx + movl STACK_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl STACK_B, %edi + movl STACK_C, %ebx +#ifdef TRMMKERNEL + movss STACK_OFFT, %xmm4 +#endif + + movss STACK_ALPHA_R, %xmm0 + movss STACK_ALPHA_I, %xmm1 + + xorps %xmm7, %xmm7 + cmpeqps %xmm7, %xmm7 + pslld $31, %xmm7 # Generate mask + xorps %xmm2, %xmm2 + + shufps $0, %xmm0, %xmm0 + + movaps %xmm0, 0 + ALPHA_R + movss %xmm1, 4 + ALPHA_I + movss %xmm1, 12 + ALPHA_I + xorps %xmm7, %xmm1 + movss %xmm1, 0 + ALPHA_I + movss %xmm1, 8 + ALPHA_I + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + movss %xmm7, 0 + POSINV + movss %xmm2, 4 + POSINV + movss %xmm7, 8 + POSINV + movss %xmm2, 12 + POSINV +#else + movss %xmm2, 0 + POSINV + movss %xmm7, 4 + POSINV + movss %xmm2, 8 + POSINV + movss %xmm7, 12 + POSINV +#endif + + EMMS + + movl %ebx, C + movl STACK_LDC, LDC + +#ifdef TRMMKERNEL + movss %xmm4, OFFSET + movss %xmm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + sall $ZBASE_SHIFT, LDC + movl %eax, J # j = n + sarl $1, J + jle .L100 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + movaps POSINV, %xmm7 + + movl K, %eax + sarl $1, %eax + jle .L03 + ALIGN_4 + +.L02: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 + xorps %xmm7, %xmm3 +#else + xorps %xmm7, %xmm0 + xorps %xmm7, %xmm2 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + movss 4 * SIZE(B), %xmm0 + movss 5 * SIZE(B), %xmm1 + movss 6 * SIZE(B), %xmm2 + movss 7 * SIZE(B), %xmm3 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 + xorps %xmm7, %xmm3 +#else + xorps %xmm7, %xmm0 + xorps %xmm7, %xmm2 +#endif + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm2, 24 * SIZE(BB) + movaps %xmm3, 28 * SIZE(BB) + +#ifdef PENTIUM4 + prefetcht1 104 * SIZE(BB) +#endif + + addl $ 8 * SIZE, %edi + addl $32 * SIZE, %ecx + + decl %eax + jne .L02 + ALIGN_4 + +.L03: + movl K, %eax + andl $1, %eax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 + xorps %xmm7, %xmm3 +#else + xorps %xmm7, %xmm0 + xorps %xmm7, %xmm2 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + addl $ 4 * SIZE, %edi + ALIGN_4 + +.L05: + movl C, %esi + movl A, %edx + movl M, %ebx + sarl $1, %ebx + jle .L30 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#if defined(OPTERON) || defined(BARCELONA) + prefetchw 4 * SIZE(%esi) + prefetchw 4 * SIZE(%esi, LDC) +#endif + +#ifdef PENTIUM4 + prefetchnta 4 * SIZE(%esi) + prefetchnta 4 * SIZE(%esi, LDC) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + + +#if 1 + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $128 * 8 * SIZE, BB + addl $128 * 2 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB + ALIGN_4 +#else + sarl $3, %eax + je .L15 + ALIGN_4 + +.L11: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + + addl $ 32 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L11 + ALIGN_4 +#endif + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 +#else + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 +#endif + + movaps %xmm4, %xmm5 + movaps %xmm6, %xmm7 + + shufps $0xb1, %xmm4, %xmm4 + shufps $0xb1, %xmm6, %xmm6 + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + mulps %xmm1, %xmm7 + mulps %xmm3, %xmm6 + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#ifndef TRMMKERNEL + shufps $0xe4, %xmm0, %xmm0 + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + shufps $0xe4, %xmm2, %xmm2 + movsd 0 * SIZE(%esi, LDC), %xmm2 + movhps 2 * SIZE(%esi, LDC), %xmm2 + + addps %xmm0, %xmm4 + addps %xmm2, %xmm6 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + movlps %xmm6, 0 * SIZE(%esi, LDC) + movhps %xmm6, 2 * SIZE(%esi, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $4 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + jg .L10 + ALIGN_4 + +.L30: + movl M, %ebx + andl $1, %ebx + jle .L99 + ALIGN_4 + +.L40: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L42 + ALIGN_4 + +.L41: + mulps %xmm0, %xmm2 + prefetcht1 (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 2 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movsd 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 44 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 6 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + mulps 60 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movsd 16 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + addps %xmm2, %xmm4 + movaps 68 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 72 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + mulps 76 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 96 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movsd 10 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 84 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 88 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + mulps 92 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 112 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movsd 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 100 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 104 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + mulps 108 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 128 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movsd 14 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 116 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 120 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + mulps 124 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 144 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movsd 24 * SIZE(AA), %xmm1 + addl $ 16 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L41 + ALIGN_4 + +.L42: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L44 + ALIGN_4 + +.L43: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 2 * SIZE(AA), %xmm0 + + addl $ 2 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L43 + ALIGN_4 + +.L44: + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 +#else + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 +#endif + + movaps %xmm4, %xmm5 + movaps %xmm6, %xmm7 + + shufps $0xb1, %xmm4, %xmm4 + shufps $0xb1, %xmm6, %xmm6 + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + mulps %xmm1, %xmm7 + mulps %xmm3, %xmm6 + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#ifndef TRMMKERNEL + shufps $0xe4, %xmm4, %xmm4 + shufps $0xe4, %xmm6, %xmm6 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(%esi), %xmm0 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(%esi, LDC), %xmm2 + + addps %xmm0, %xmm4 + addps %xmm2, %xmm6 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + movlps %xmm6, 0 * SIZE(%esi, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (LDC, LDC), %eax + addl %eax, C # c += 2 * ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L100: + movl N, %eax + andl $1, %eax + jle .L999 + ALIGN_4 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + movaps POSINV, %xmm7 + + movl K, %eax + sarl $2, %eax + jle .L103 + ALIGN_4 + +.L102: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 + xorps %xmm7, %xmm3 +#else + xorps %xmm7, %xmm0 + xorps %xmm7, %xmm2 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + movss 4 * SIZE(B), %xmm0 + movss 5 * SIZE(B), %xmm1 + movss 6 * SIZE(B), %xmm2 + movss 7 * SIZE(B), %xmm3 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 + xorps %xmm7, %xmm3 +#else + xorps %xmm7, %xmm0 + xorps %xmm7, %xmm2 +#endif + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm2, 24 * SIZE(BB) + movaps %xmm3, 28 * SIZE(BB) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L103: + movl K, %eax + andl $3, %eax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 +#else + xorps %xmm7, %xmm0 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + addl $ 2 * SIZE, %edi + addl $ 8 * SIZE, %ecx + decl %eax + jne .L104 + ALIGN_4 + +.L105: + movl C, %esi + movl A, AA + movl M, %ebx + sarl $1, %ebx + jle .L130 + ALIGN_4 + +.L110: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#if defined(OPTERON) || defined(BARCELONA) + prefetchw 4 * SIZE(%esi) +#endif + +#ifdef PENTIUM4 + prefetchnta 4 * SIZE(%esi) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L112 + ALIGN_4 + +.L111: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movaps 12 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 28 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 48 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $ 32 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L111 + ALIGN_4 + +.L112: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + ALIGN_4 + +.L113: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jg .L113 + ALIGN_4 + +.L114: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm5, %xmm4 +#else + addps %xmm5, %xmm4 +#endif + + movaps %xmm4, %xmm5 + + shufps $0xb1, %xmm4, %xmm4 + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + + addps %xmm5, %xmm4 + +#ifndef TRMMKERNEL + shufps $0xe4, %xmm4, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $4 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + jg .L110 + ALIGN_4 + +.L130: + movl M, %ebx + andl $1, %ebx + jle .L999 + ALIGN_4 + +.L140: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L142 + ALIGN_4 + +.L141: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $ 16 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L141 + ALIGN_4 + +.L142: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L144 + ALIGN_4 + +.L143: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movsd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L143 + ALIGN_4 + +.L144: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm5, %xmm4 +#else + addps %xmm5, %xmm4 +#endif + + movaps %xmm4, %xmm5 + + shufps $0xb1, %xmm4, %xmm4 + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + + addps %xmm5, %xmm4 + +#ifndef TRMMKERNEL + shufps $0xe4, %xmm4, %xmm4 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(%esi), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + ALIGN_4 + +.L999: + EMMS + + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_2x2_sse3.S b/kernel/x86/zgemm_kernel_2x2_sse3.S new file mode 100644 index 0000000..23afa8f --- /dev/null +++ b/kernel/x86/zgemm_kernel_2x2_sse3.S @@ -0,0 +1,1365 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 20 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define ALPHA_R 16(%esp) +#define ALPHA_I 32(%esp) +#define K 48(%esp) +#define N 52(%esp) +#define M 56(%esp) +#define A 60(%esp) +#define C 64(%esp) +#define J 68(%esp) +#define OLD_STACK 72(%esp) +#define OFFSET 76(%esp) +#define KK 80(%esp) +#define KKK 84(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 168 +#endif + +#ifdef PENTIUMM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 168 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADDSUB addps +#else +#define ADDSUB subps +#endif + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (address) * SIZE(AA); \ + addps %xmm2, %xmm4; \ + movshdup 0 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movsldup 4 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm6; \ + movshdup 4 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + movaps 4 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ + ADDSUB %xmm2, %xmm7; \ + movsldup 8 * SIZE + 2 * (address) * SIZE(BB), %xmm2 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movshdup 8 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movsldup 12 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm6; \ + movshdup 12 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + movaps 8 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ + ADDSUB %xmm2, %xmm7; \ + movsldup 32 * SIZE + 2 * (address) * SIZE(BB), %xmm2 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movshdup 16 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movsldup 20 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm6; \ + movshdup 20 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + movaps 12 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ + ADDSUB %xmm3, %xmm7; \ + movsldup 24 * SIZE + 2 * (address) * SIZE(BB), %xmm3 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movshdup 24 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movsldup 28 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm6; \ + movshdup 28 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + movaps 32 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ + ADDSUB %xmm3, %xmm7; \ + movsldup 48 * SIZE + 2 * (address) * SIZE(BB), %xmm3 + +#define KERNEL5(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movshdup 32 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movsldup 36 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm6; \ + movshdup 36 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + movaps 20 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ + ADDSUB %xmm2, %xmm7; \ + movsldup 40 * SIZE + 2 * (address) * SIZE(BB), %xmm2 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movshdup 40 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movsldup 44 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm6; \ + movshdup 44 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + movaps 24 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ + ADDSUB %xmm2, %xmm7; \ + movsldup 64 * SIZE + 2 * (address) * SIZE(BB), %xmm2 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movshdup 48 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movsldup 52 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm6; \ + movshdup 52 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + movaps 28 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ + ADDSUB %xmm3, %xmm7; \ + movsldup 56 * SIZE + 2 * (address) * SIZE(BB), %xmm3 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movshdup 56 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movsldup 60 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm6; \ + movshdup 60 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + movaps 48 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ + ADDSUB %xmm3, %xmm7; \ + movsldup 80 * SIZE + 2 * (address) * SIZE(BB), %xmm3 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE, %esp + andl $-1024, %esp # align stack + + STACK_TOUCHING + + movl STACK_M, %ebx + movl STACK_N, %eax + movl STACK_K, %ecx + movl STACK_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl STACK_B, %edi + movl STACK_C, %ebx +#ifdef TRMMKERNEL + movss STACK_OFFT, %xmm4 +#endif + + movss STACK_ALPHA_R, %xmm0 + movss STACK_ALPHA_I, %xmm1 + + pxor %xmm7, %xmm7 + cmpeqps %xmm7, %xmm7 + pslld $31, %xmm7 # Generate mask + + shufps $0, %xmm0, %xmm0 + movaps %xmm0, 0 + ALPHA_R + + movss %xmm1, 4 + ALPHA_I + movss %xmm1, 12 + ALPHA_I + xorps %xmm7, %xmm1 + movss %xmm1, 0 + ALPHA_I + movss %xmm1, 8 + ALPHA_I + + movl %ebx, C + movl STACK_LDC, LDC + +#ifdef TRMMKERNEL + movss %xmm4, OFFSET + movss %xmm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + sall $ZBASE_SHIFT, LDC + movl %eax, J # j = n + sarl $1, J + jle .L100 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + + movl K, %eax + sarl $2, %eax + jle .L03 + ALIGN_4 + +.L02: + movddup 0 * SIZE(B), %xmm0 + movddup 2 * SIZE(B), %xmm1 + movddup 4 * SIZE(B), %xmm2 + movddup 6 * SIZE(B), %xmm3 + movddup 8 * SIZE(B), %xmm4 + movddup 10 * SIZE(B), %xmm5 + movddup 12 * SIZE(B), %xmm6 + movddup 14 * SIZE(B), %xmm7 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + +# prefetcht1 128 * SIZE(%ecx) + prefetcht0 112 * SIZE(%edi) + + addl $16 * SIZE, B + addl $32 * SIZE, BB + + decl %eax + jne .L02 + ALIGN_4 + +.L03: + movl K, %eax + andl $3, %eax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movddup 0 * SIZE(B), %xmm0 + movddup 2 * SIZE(B), %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + addl $4 * SIZE, B + addl $8 * SIZE, BB + decl %eax + jne .L04 + ALIGN_4 + +.L05: + movl C, %esi + movl A, %edx + movl M, %ebx + sarl $1, %ebx + jle .L30 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsldup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movsldup 16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + prefetchnta 4 * SIZE(%esi) + prefetchnta 4 * SIZE(%esi, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + + +#if 1 + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) +#if 1 + cmpl $128 * 8, %eax + jle .L12 + KERNEL1(32 * 8) + KERNEL2(32 * 8) + KERNEL3(32 * 8) + KERNEL4(32 * 8) + KERNEL5(32 * 8) + KERNEL6(32 * 8) + KERNEL7(32 * 8) + KERNEL8(32 * 8) + cmpl $128 * 9, %eax + jle .L12 + KERNEL1(32 * 9) + KERNEL2(32 * 9) + KERNEL3(32 * 9) + KERNEL4(32 * 9) + KERNEL5(32 * 9) + KERNEL6(32 * 9) + KERNEL7(32 * 9) + KERNEL8(32 * 9) + cmpl $128 * 10, %eax + jle .L12 + KERNEL1(32 * 10) + KERNEL2(32 * 10) + KERNEL3(32 * 10) + KERNEL4(32 * 10) + KERNEL5(32 * 10) + KERNEL6(32 * 10) + KERNEL7(32 * 10) + KERNEL8(32 * 10) + cmpl $128 * 11, %eax + jle .L12 + KERNEL1(32 * 11) + KERNEL2(32 * 11) + KERNEL3(32 * 11) + KERNEL4(32 * 11) + KERNEL5(32 * 11) + KERNEL6(32 * 11) + KERNEL7(32 * 11) + KERNEL8(32 * 11) + cmpl $128 * 12, %eax + jle .L12 + KERNEL1(32 * 12) + KERNEL2(32 * 12) + KERNEL3(32 * 12) + KERNEL4(32 * 12) + KERNEL5(32 * 12) + KERNEL6(32 * 12) + KERNEL7(32 * 12) + KERNEL8(32 * 12) + cmpl $128 * 13, %eax + jle .L12 + KERNEL1(32 * 13) + KERNEL2(32 * 13) + KERNEL3(32 * 13) + KERNEL4(32 * 13) + KERNEL5(32 * 13) + KERNEL6(32 * 13) + KERNEL7(32 * 13) + KERNEL8(32 * 13) + cmpl $128 * 14, %eax + jle .L12 + KERNEL1(32 * 14) + KERNEL2(32 * 14) + KERNEL3(32 * 14) + KERNEL4(32 * 14) + KERNEL5(32 * 14) + KERNEL6(32 * 14) + KERNEL7(32 * 14) + KERNEL8(32 * 14) + cmpl $128 * 15, %eax + jle .L12 + KERNEL1(32 * 15) + KERNEL2(32 * 15) + KERNEL3(32 * 15) + KERNEL4(32 * 15) + KERNEL5(32 * 15) + KERNEL6(32 * 15) + KERNEL7(32 * 15) + KERNEL8(32 * 15) +#else + addl $128 * 4 * SIZE, BB + addl $128 * 2 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 +#endif + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB + ALIGN_4 +#else + sarl $3, %eax + je .L15 + ALIGN_4 + +.L11: + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L11 + ALIGN_4 +#endif + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 0 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + ADDSUB %xmm2, %xmm5 + movsldup 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movshdup 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm7 + movsldup 8 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + + addsubps %xmm5, %xmm4 + addsubps %xmm7, %xmm6 + + movaps %xmm4, %xmm5 + movaps %xmm6, %xmm7 + + shufps $0xb1, %xmm4, %xmm4 + shufps $0xb1, %xmm6, %xmm6 +#else + shufps $0xb1, %xmm4, %xmm4 + shufps $0xb1, %xmm6, %xmm6 + + addsubps %xmm4, %xmm5 + addsubps %xmm6, %xmm7 + + movaps %xmm5, %xmm4 + movaps %xmm7, %xmm6 + + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 +#endif + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + mulps %xmm1, %xmm7 + mulps %xmm3, %xmm6 + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#ifndef TRMMKERNEL + shufps $0xe4, %xmm0, %xmm0 + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + shufps $0xe4, %xmm2, %xmm2 + movsd 0 * SIZE(%esi, LDC), %xmm2 + movhps 2 * SIZE(%esi, LDC), %xmm2 + + addps %xmm0, %xmm4 + addps %xmm2, %xmm6 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + movsd %xmm6, 0 * SIZE(%esi, LDC) + movhps %xmm6, 2 * SIZE(%esi, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $4 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + jg .L10 + ALIGN_4 + +.L30: + movl M, %ebx + andl $1, %ebx + jle .L99 + ALIGN_4 + +.L40: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L42 + ALIGN_4 + +.L41: + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 12 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 32 * SIZE(BB), %xmm2 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm0, %xmm3 + movddup 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 28 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm0, %xmm3 + movddup 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movsd 48 * SIZE(BB), %xmm3 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 36 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm1, %xmm2 + movddup 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movsd 40 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 44 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm1, %xmm2 + movddup 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movsd 64 * SIZE(BB), %xmm2 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 52 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 56 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 60 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L41 + ALIGN_4 + +.L42: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L44 + ALIGN_4 + +.L43: + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L43 + ALIGN_4 + +.L44: + movaps %xmm4, %xmm6 + movlhps %xmm5, %xmm4 + movhlps %xmm6, %xmm5 + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + cmpeqps %xmm7, %xmm7 + pslld $31, %xmm7 + xorps %xmm7, %xmm5 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0xb1, %xmm5, %xmm5 + + addsubps %xmm5, %xmm4 + + movaps %xmm4, %xmm5 + + shufps $0xb1, %xmm4, %xmm4 +#else + shufps $0xb1, %xmm4, %xmm4 + + addsubps %xmm4, %xmm5 + + movaps %xmm5, %xmm4 + + shufps $0xb1, %xmm5, %xmm5 +#endif + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + + addps %xmm5, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 0 * SIZE(%esi, LDC), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 0 * SIZE(%esi, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (LDC, LDC), %eax + addl %eax, C # c += 2 * ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L100: + movl N, %eax + andl $1, %eax + jle .L999 + ALIGN_4 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + + movl K, %eax + sarl $3, %eax + jle .L103 + ALIGN_4 + +.L102: + movddup 0 * SIZE(B), %xmm0 + movddup 2 * SIZE(B), %xmm1 + movddup 4 * SIZE(B), %xmm2 + movddup 6 * SIZE(B), %xmm3 + movddup 8 * SIZE(B), %xmm4 + movddup 10 * SIZE(B), %xmm5 + movddup 12 * SIZE(B), %xmm6 + movddup 14 * SIZE(B), %xmm7 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + prefetcht0 104 * SIZE(B) + + addl $16 * SIZE, B + addl $32 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L103: + movl K, %eax + andl $7, %eax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movddup 0 * SIZE(B), %xmm0 + + movaps %xmm0, 0 * SIZE(BB) + + addl $ 2 * SIZE, %edi + addl $ 4 * SIZE, %ecx + decl %eax + jne .L104 + ALIGN_4 + +.L105: + movl C, %esi + movl A, AA + movl M, %ebx + sarl $1, %ebx + jle .L130 + ALIGN_4 + +.L110: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsldup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movsldup 16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef PENTIUM4 + prefetchnta 4 * SIZE(%esi) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L112 + ALIGN_4 + +.L111: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + movshdup 0 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm5 + movsldup 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 8 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm5 + movsldup 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 12 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm5 + movsldup 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 32 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm5 + movsldup 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movshdup 16 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm5 + movsldup 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movshdup 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 24 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm5 + movsldup 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movshdup 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 28 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm5 + movsldup 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movshdup 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 48 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm5 + movsldup 48 * SIZE(BB), %xmm3 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L111 + ALIGN_4 + +.L112: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + ALIGN_4 + +.L113: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 0 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm5 + movsldup 4 * SIZE(BB), %xmm2 + + addl $ 4 * SIZE, AA + addl $ 4 * SIZE, BB + decl %eax + jg .L113 + ALIGN_4 + +.L114: +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + shufps $0xb1, %xmm5, %xmm5 + + addsubps %xmm5, %xmm4 + + movaps %xmm4, %xmm5 + + shufps $0xb1, %xmm4, %xmm4 +#else + shufps $0xb1, %xmm4, %xmm4 + + addsubps %xmm4, %xmm5 + + movaps %xmm5, %xmm4 + + shufps $0xb1, %xmm5, %xmm5 +#endif + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + + addps %xmm5, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $4 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + jg .L110 + ALIGN_4 + +.L130: + movl M, %ebx + andl $1, %ebx + jle .L999 + ALIGN_4 + +.L140: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L142 + ALIGN_4 + +.L141: + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + movddup 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 12 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 32 * SIZE(BB), %xmm2 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 28 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 48 * SIZE(BB), %xmm3 + + addl $ 16 * SIZE, AA + addl $ 32 * SIZE, BB + decl %eax + jne .L141 + ALIGN_4 + +.L142: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L144 + ALIGN_4 + +.L143: + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L143 + ALIGN_4 + +.L144: + addps %xmm5, %xmm4 + + movhlps %xmm4, %xmm5 + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + cmpeqps %xmm7, %xmm7 + pslld $31, %xmm7 + xorps %xmm7, %xmm5 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0xb1, %xmm5, %xmm5 + + addsubps %xmm5, %xmm4 + + movaps %xmm4, %xmm5 + + shufps $0xb1, %xmm4, %xmm4 +#else + shufps $0xb1, %xmm4, %xmm4 + + addsubps %xmm4, %xmm5 + + movaps %xmm5, %xmm4 + + shufps $0xb1, %xmm5, %xmm5 +#endif + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + + addps %xmm5, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + addps %xmm0, %xmm4 +#endif + movsd %xmm4, 0 * SIZE(%esi) + ALIGN_4 + +.L999: + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_4x1_core2.S b/kernel/x86/zgemm_kernel_4x1_core2.S new file mode 100644 index 0000000..ca232e4 --- /dev/null +++ b/kernel/x86/zgemm_kernel_4x1_core2.S @@ -0,0 +1,872 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 20 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define ALPHA_R 16(%esp) +#define ALPHA_I 32(%esp) +#define K 48(%esp) +#define N 52(%esp) +#define M 56(%esp) +#define A 60(%esp) +#define C 64(%esp) +#define J 68(%esp) +#define OLD_STACK 72(%esp) +#define TEMP 76(%esp) +#define OFFSET 80(%esp) +#define KK 84(%esp) +#define KKK 88(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp +#define C1 %esi + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define PREFETCH_R (8 * 16 + 0) +#define PREFETCH_W (PREFETCH_R * 2) + +#define PREFETCHSIZE (8 * 16 + 4) +#define PREFETCH prefetcht0 + +#define AA %edx +#define BB %ecx + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADDSUB addps +#else +#define ADDSUB subps +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC +#ifdef TRMMKERNEL + movd STACK_OFFT, %mm4 +#endif + + movd %mm1, K + movd %mm0, M + movl %eax, N + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK +#ifdef TRMMKERNEL + movd %mm4, OFFSET + movd %mm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + subl $-32 * SIZE, A + subl $-32 * SIZE, B + + leal (, LDC, SIZE * 2), LDC + + movss STACK_ALPHA_R, %xmm0 + movss STACK_ALPHA_I, %xmm1 + + pcmpeqb %xmm7, %xmm7 + pslld $31, %xmm7 # Generate mask + shufps $0, %xmm0, %xmm0 + + movaps %xmm0, 0 + ALPHA_R + movss %xmm1, 4 + ALPHA_I + movss %xmm1, 12 + ALPHA_I + xorps %xmm7, %xmm1 + movss %xmm1, 0 + ALPHA_I + movss %xmm1, 8 + ALPHA_I + + movl %eax, J # j = n + testl %eax, %eax + jle .L999 + +.L01: + leal 32 * SIZE + BUFFER, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sarl $2, %eax + jle .L03 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + + movss -32 * SIZE(B), %xmm0 + movss -31 * SIZE(B), %xmm1 + movss -30 * SIZE(B), %xmm2 + movss -29 * SIZE(B), %xmm3 + movss -28 * SIZE(B), %xmm4 + movss -27 * SIZE(B), %xmm5 + movss -26 * SIZE(B), %xmm6 + movss -25 * SIZE(B), %xmm7 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BB) + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + prefetcht0 (PREFETCH_W + 16) * SIZE(BB) + movaps %xmm0, -32 * SIZE(BB) + movaps %xmm1, -28 * SIZE(BB) + movaps %xmm2, -24 * SIZE(BB) + movaps %xmm3, -20 * SIZE(BB) + movaps %xmm4, -16 * SIZE(BB) + movaps %xmm5, -12 * SIZE(BB) + movaps %xmm6, -8 * SIZE(BB) + movaps %xmm7, -4 * SIZE(BB) + + addl $ 8 * SIZE, B + subl $-32 * SIZE, BB + decl %eax + jne .L02 + +.L03: + movl K, %eax + andl $3, %eax + BRANCH + jle .L05 + +.L04: + movss -32 * SIZE(B), %xmm0 + movss -31 * SIZE(B), %xmm1 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + + movaps %xmm0, -32 * SIZE(BB) + movaps %xmm1, -28 * SIZE(BB) + addl $2 * SIZE, B + addl $8 * SIZE, BB + decl %eax + jne .L04 + ALIGN_4 + +.L05: + movl C, C1 # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -16 * SIZE(AA), %xmm3 + pxor %xmm6, %xmm6 + prefetcht0 7 * SIZE(C1) + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + ADDSUB %xmm0, %xmm5 + movaps -28 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + ADDSUB %xmm1, %xmm7 + + movaps -24 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + ADDSUB %xmm0, %xmm5 + movaps -20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps 0 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + ADDSUB %xmm1, %xmm7 + + movaps -16 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm3 + ADDSUB %xmm3, %xmm5 + movaps -12 * SIZE(AA), %xmm3 + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm1 + movaps -8 * SIZE(AA), %xmm3 + addps %xmm2, %xmm6 + ADDSUB %xmm1, %xmm7 + + movaps -8 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm3 + ADDSUB %xmm3, %xmm5 + movaps -4 * SIZE(AA), %xmm3 + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm1 + movaps 16 * SIZE(AA), %xmm3 + addps %xmm2, %xmm6 + ADDSUB %xmm1, %xmm7 + movaps 0 * SIZE(BB), %xmm1 + + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps 4 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + ADDSUB %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + ADDSUB %xmm1, %xmm7 + + movaps 8 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps 12 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + ADDSUB %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + ADDSUB %xmm1, %xmm7 + + movaps 16 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm4 + movaps 20 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm3 + ADDSUB %xmm3, %xmm5 + movaps 20 * SIZE(AA), %xmm3 + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm2, %xmm6 + movaps 24 * SIZE(AA), %xmm3 + ADDSUB %xmm1, %xmm7 + + movaps 24 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm4 + movaps 28 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm3 + ADDSUB %xmm3, %xmm5 + movaps 28 * SIZE(AA), %xmm3 + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm1 + subl $-64 * SIZE, BB + movaps 48 * SIZE(AA), %xmm3 + subl $-64 * SIZE, AA + addps %xmm2, %xmm6 + ADDSUB %xmm1, %xmm7 + movaps -32 * SIZE(BB), %xmm1 + + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + +.L16: + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + ADDSUB %xmm0, %xmm5 + movaps -28 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + ADDSUB %xmm1, %xmm7 + movaps -24 * SIZE(BB), %xmm1 + + addl $8 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L16 + +.L18: + movaps ALPHA_R, %xmm0 + movaps ALPHA_I, %xmm1 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + + addsubps %xmm5, %xmm4 + addsubps %xmm7, %xmm6 + + movaps %xmm4, %xmm5 + movaps %xmm6, %xmm7 + + shufps $0xb1, %xmm4, %xmm4 + shufps $0xb1, %xmm6, %xmm6 +#else + shufps $0xb1, %xmm4, %xmm4 + shufps $0xb1, %xmm6, %xmm6 + + addsubps %xmm4, %xmm5 + addsubps %xmm6, %xmm7 + + movaps %xmm5, %xmm4 + movaps %xmm7, %xmm6 + + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 +#endif + + mulps %xmm0, %xmm5 + mulps %xmm1, %xmm4 + + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm6 + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(C1), %xmm2 + movhps 2 * SIZE(C1), %xmm2 + movsd 4 * SIZE(C1), %xmm3 + movhps 6 * SIZE(C1), %xmm3 + + addps %xmm2, %xmm4 + addps %xmm3, %xmm6 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhps %xmm4, 2 * SIZE(C1) + movsd %xmm6, 4 * SIZE(C1) + movhps %xmm6, 6 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $8 * SIZE, C1 + decl %ebx + jg .L10 + ALIGN_2 + +.L20: + movl M, %ebx + testl $2, %ebx + jle .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal 32 * SIZE + BUFFER, BB +#else + + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movaps -16 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movaps -16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BB), %xmm0 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + ADDSUB %xmm0, %xmm5 + movaps -28 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BB), %xmm0 + addps %xmm1, %xmm6 + movaps 0 * SIZE(BB), %xmm1 + ADDSUB %xmm0, %xmm7 + movaps -24 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps -12 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps -8 * SIZE(BB), %xmm3 + ADDSUB %xmm0, %xmm5 + movaps -20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps -4 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + ADDSUB %xmm0, %xmm7 + movaps 0 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + mulps 4 * SIZE(BB), %xmm2 + addps %xmm1, %xmm4 + movaps 8 * SIZE(BB), %xmm1 + ADDSUB %xmm2, %xmm5 + movaps -12 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm1 + mulps 12 * SIZE(BB), %xmm2 + addps %xmm1, %xmm6 + movaps 32 * SIZE(BB), %xmm1 + ADDSUB %xmm2, %xmm7 + movaps -8 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm3 + mulps 20 * SIZE(BB), %xmm2 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + ADDSUB %xmm2, %xmm5 + movaps -4 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm3 + mulps 28 * SIZE(BB), %xmm2 + addps %xmm3, %xmm6 + movaps 48 * SIZE(BB), %xmm3 + ADDSUB %xmm2, %xmm7 + movaps 16 * SIZE(AA), %xmm2 + + subl $-32 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L22 + ALIGN_2 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + +.L26: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BB), %xmm0 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + ADDSUB %xmm0, %xmm5 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L26 + +.L28: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movaps ALPHA_R, %xmm0 + movaps ALPHA_I, %xmm1 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0xb1, %xmm5, %xmm5 + addsubps %xmm5, %xmm4 + movaps %xmm4, %xmm5 + shufps $0xb1, %xmm4, %xmm4 +#else + shufps $0xb1, %xmm4, %xmm4 + addsubps %xmm4, %xmm5 + movaps %xmm5, %xmm4 + shufps $0xb1, %xmm5, %xmm5 +#endif + + mulps %xmm0, %xmm5 + mulps %xmm1, %xmm4 + + addps %xmm5, %xmm4 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(C1), %xmm2 + movhps 2 * SIZE(C1), %xmm2 + + addps %xmm2, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhps %xmm4, 2 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + addl $4 * SIZE, C1 + ALIGN_2 + +.L30: + testl $1, %ebx + jle .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal 32 * SIZE + BUFFER, BB +#else + + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -24 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movsd -16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax + addl $1, %eax + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BB), %xmm0 + addps %xmm1, %xmm4 + movsd -24 * SIZE(BB), %xmm1 + ADDSUB %xmm0, %xmm5 + movsd -30 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BB), %xmm0 + addps %xmm1, %xmm6 + movsd 0 * SIZE(BB), %xmm1 + ADDSUB %xmm0, %xmm7 + movsd -28 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps -12 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movsd -8 * SIZE(BB), %xmm3 + ADDSUB %xmm0, %xmm5 + movsd -26 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps -4 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movsd 16 * SIZE(BB), %xmm3 + ADDSUB %xmm0, %xmm7 + movsd -16 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + mulps 4 * SIZE(BB), %xmm2 + addps %xmm1, %xmm4 + movsd 8 * SIZE(BB), %xmm1 + ADDSUB %xmm2, %xmm5 + movsd -22 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm1 + mulps 12 * SIZE(BB), %xmm2 + addps %xmm1, %xmm6 + movsd 32 * SIZE(BB), %xmm1 + ADDSUB %xmm2, %xmm7 + movsd -20 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm3 + mulps 20 * SIZE(BB), %xmm2 + addps %xmm3, %xmm4 + movsd 24 * SIZE(BB), %xmm3 + ADDSUB %xmm2, %xmm5 + movsd -18 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm3 + mulps 28 * SIZE(BB), %xmm2 + addps %xmm3, %xmm6 + movsd 48 * SIZE(BB), %xmm3 + ADDSUB %xmm2, %xmm7 + movsd -8 * SIZE(AA), %xmm2 + + subl $-16 * SIZE, AA + addl $ 64 * SIZE, BB + + decl %eax + jne .L32 + ALIGN_2 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L38 + +.L36: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BB), %xmm0 + addps %xmm1, %xmm4 + movsd -24 * SIZE(BB), %xmm1 + ADDSUB %xmm0, %xmm5 + movsd -30 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L36 + +.L38: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movaps ALPHA_R, %xmm0 + movaps ALPHA_I, %xmm1 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0xb1, %xmm5, %xmm5 + addsubps %xmm5, %xmm4 + movaps %xmm4, %xmm5 + shufps $0xb1, %xmm4, %xmm4 +#else + shufps $0xb1, %xmm4, %xmm4 + addsubps %xmm4, %xmm5 + movaps %xmm5, %xmm4 + shufps $0xb1, %xmm5, %xmm5 +#endif + + mulps %xmm0, %xmm5 + mulps %xmm1, %xmm4 + + addps %xmm5, %xmm4 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(C1), %xmm2 + addps %xmm2, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_2 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $1, KK +#endif + + addl LDC, C # c += ldc + decl J # j -- + jg .L01 + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_4x1_sse.S b/kernel/x86/zgemm_kernel_4x1_sse.S new file mode 100644 index 0000000..6c51463 --- /dev/null +++ b/kernel/x86/zgemm_kernel_4x1_sse.S @@ -0,0 +1,1508 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 20 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define ALPHA_R 16(%esp) +#define ALPHA_I 32(%esp) +#define K 48(%esp) +#define N 52(%esp) +#define M 56(%esp) +#define A 60(%esp) +#define C 64(%esp) +#define J 68(%esp) +#define OLD_STACK 72(%esp) +#define TEMP 76(%esp) +#define OFFSET 80(%esp) +#define KK 84(%esp) +#define KKK 88(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define AA %edx +#define BB %ecx + +#if !defined(HAVE_SSE2) || defined(OPTERON) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + mulps 4 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm4; \ + movaps 0 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm5; \ + movaps 4 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ + mulps %xmm0, %xmm2; \ + mulps 4 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 8 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * SIZE * 2(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm4; \ + movaps 8 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm5; \ + movaps 12 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * SIZE * 2(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm1, %xmm3; \ + mulps 20 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm4; \ + movaps 16 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm5; \ + movaps 20 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ + mulps %xmm1, %xmm3; \ + mulps 20 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 24 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * SIZE * 2(AA), %xmm1 + +#define KERNEL4(address) \ + mulps %xmm1, %xmm3; \ + mulps 28 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm4; \ + movaps 24 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm5; \ + movaps 28 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ + mulps %xmm1, %xmm3; \ + mulps 28 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * SIZE * 2(AA), %xmm1 + +#define KERNEL5(address) \ + mulps %xmm0, %xmm2; \ + mulps 36 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm4; \ + movaps 32 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm5; \ + movaps 36 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ + mulps %xmm0, %xmm2; \ + mulps 36 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 40 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 40 * SIZE + (address) * SIZE * 2(AA), %xmm0 + +#define KERNEL6(address) \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm4; \ + movaps 40 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm5; \ + movaps 44 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 64 * SIZE + (address) * SIZE * 2(AA), %xmm0 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm3; \ + mulps 52 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm4; \ + movaps 48 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm5; \ + movaps 52 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ + mulps %xmm1, %xmm3; \ + mulps 52 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 56 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 56 * SIZE + (address) * SIZE * 2(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + mulps 60 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm4; \ + movaps 56 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm5; \ + movaps 60 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ + mulps %xmm1, %xmm3; \ + mulps 60 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 80 * SIZE + (address) * SIZE * 2(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC +#ifdef TRMMKERNEL + movd STACK_OFFT, %mm4 +#endif + + movd %mm1, K + movd %mm0, M + movl %eax, N + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK +#ifdef TRMMKERNEL + movd %mm4, OFFSET + movd %mm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + leal (, LDC, SIZE * 2), LDC + + movss STACK_ALPHA_R, %xmm0 + movss STACK_ALPHA_I, %xmm1 + +#ifdef HAVE_SSE2 + pxor %xmm7, %xmm7 + cmpeqps %xmm7, %xmm7 + pslld $31, %xmm7 # Generate mask +#else + movl $0x80000000, TEMP + movss TEMP, %xmm7 + shufps $0, %xmm7, %xmm7 +#endif + xorps %xmm2, %xmm2 + + shufps $0, %xmm0, %xmm0 + + movaps %xmm0, 0 + ALPHA_R + movss %xmm1, 4 + ALPHA_I + movss %xmm1, 12 + ALPHA_I + xorps %xmm7, %xmm1 + movss %xmm1, 0 + ALPHA_I + movss %xmm1, 8 + ALPHA_I + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + movss %xmm7, 0 + POSINV + movss %xmm2, 4 + POSINV + movss %xmm7, 8 + POSINV + movss %xmm2, 12 + POSINV +#else + movss %xmm2, 0 + POSINV + movss %xmm7, 4 + POSINV + movss %xmm2, 8 + POSINV + movss %xmm7, 12 + POSINV +#endif + + movl %eax, J # j = n + testl %eax, %eax + jle .L999 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + movaps POSINV, %xmm7 + + movl K, %eax + sarl $2, %eax + jle .L03 + +.L02: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 + xorps %xmm7, %xmm3 +#else + xorps %xmm7, %xmm0 + xorps %xmm7, %xmm2 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + movss 4 * SIZE(B), %xmm0 + movss 5 * SIZE(B), %xmm1 + movss 6 * SIZE(B), %xmm2 + movss 7 * SIZE(B), %xmm3 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 + xorps %xmm7, %xmm3 +#else + xorps %xmm7, %xmm0 + xorps %xmm7, %xmm2 +#endif + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm2, 24 * SIZE(BB) + movaps %xmm3, 28 * SIZE(BB) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + decl %eax + jne .L02 + +.L03: + movl K, %eax + andl $3, %eax + BRANCH + jle .L05 + +.L04: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 +#else + xorps %xmm7, %xmm0 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + addl $2 * SIZE, B + addl $8 * SIZE, BB + decl %eax + jne .L04 + ALIGN_4 + +.L05: + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L10: + +#ifdef PENTIUM4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#endif + + prefetchnta 8 * SIZE(%esi) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + andl $-8, %eax + je .L12 + sall $3, %eax + +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $64 * 1, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $64 * 2, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $64 * 3, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $64 * 4, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $64 * 5, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $64 * 6, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $64 * 7, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $128 * 4 * SIZE, AA + addl $128 * 4 * SIZE, BB + subl $ 64 * 8, %eax + BRANCH + jg .L1X + +.L11: + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB + +#else + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + prefetcht0 8 * SIZE(%esi) + je .L12 + ALIGN_4 + +#define PREFETCHSIZE 48 + +.L11: +#ifdef CORE_KATMAI + prefetcht0 PREFETCHSIZE * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 16 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 8 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 16 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 24) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 32) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 36 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 48 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 48 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 40) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 40 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 44 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 56 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 48) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 48 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 52 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 64 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 56) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 60 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 72 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 72 * SIZE(AA), %xmm1 + + addl $64 * SIZE, BB + addl $64 * SIZE, AA + decl %eax + jne .L11 +#endif + +.L12: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 8 * SIZE(AA), %xmm0 + + addl $8 * SIZE, AA # aoffset += 8 + addl $8 * SIZE, BB # boffset1 += 8 + + decl %eax + jg .L13 + +.L14: + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 +#else + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 +#endif + + movaps %xmm4, %xmm5 + movaps %xmm6, %xmm7 + + shufps $0xb1, %xmm4, %xmm4 + shufps $0xb1, %xmm6, %xmm6 + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + mulps %xmm1, %xmm7 + mulps %xmm3, %xmm6 + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + + shufps $0xe4, %xmm4, %xmm4 + shufps $0xe4, %xmm6, %xmm6 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm2 + movhps 6 * SIZE(%esi), %xmm2 + + addps %xmm0, %xmm4 + addps %xmm2, %xmm6 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + movsd %xmm6, 4 * SIZE(%esi) + movhps %xmm6, 6 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $8 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L50: + movl M, %ebx + testl $2, %ebx + jle .L70 + + +#if (L1_DATA_LINESIZE == 64) + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L52 + ALIGN_4 + +.L51: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 8 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 36 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 40 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 20 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm2 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L51 + ALIGN_2 + +#else + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L52 + ALIGN_4 + +.L51: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 16 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 20 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 48 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 44 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 72 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L51 +#endif + +.L52: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA # aoffset += 8 + addl $8 * SIZE, BB # boffset1 += 8 + decl %eax + jg .L53 + +.L54: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm5, %xmm4 +#else + addps %xmm5, %xmm4 +#endif + + movaps %xmm4, %xmm5 + + shufps $0xb1, %xmm4, %xmm4 + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + + addps %xmm5, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + addl $4 * SIZE, %esi # coffset += 4 + ALIGN_2 + +.L70: + testl $1, %ebx + jle .L99 + + +#if (L1_DATA_LINESIZE == 64) + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax + addl $1, %eax + movl %eax, KKK +#endif + sarl $3, %eax + je .L72 + ALIGN_4 + +.L71: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L71 + ALIGN_2 + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#endif + + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax + addl $1, %eax + movl %eax, KKK +#endif + sarl $3, %eax + je .L72 + ALIGN_4 + +.L71: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 16 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 12 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 20 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 40 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 48 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 44 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 52 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 72 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L71 + ALIGN_2 +#endif + +.L72: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L74 + +.L73: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA # aoffset += 8 + addl $8 * SIZE, BB # boffset1 += 8 + decl %eax + jg .L73 + +.L74: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm5, %xmm4 +#else + addps %xmm5, %xmm4 +#endif + + movaps %xmm4, %xmm5 + + shufps $0xb1, %xmm4, %xmm4 + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + + addps %xmm5, %xmm4 + +#ifndef TRMMKERNEL +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(%esi), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + ALIGN_2 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $1, KK +#endif + + addl LDC, C # c += ldc + decl J # j -- + jg .L01 + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_ncopy_2.S b/kernel/x86/zgemm_ncopy_2.S new file mode 100644 index 0000000..bc80b47 --- /dev/null +++ b/kernel/x86/zgemm_ncopy_2.S @@ -0,0 +1,268 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 8 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_N 8 + STACK + ARGS(%esp) +#define STACK_A 12 + STACK + ARGS(%esp) +#define STACK_LDA 16 + STACK + ARGS(%esp) +#define STACK_B 20 + STACK + ARGS(%esp) + +#define I %eax +#define J %ecx +#define LDA %edx +#define A %edi +#define A1 %ebx +#define A2 %ebp +#define B %esi + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl STACK_A, A + movl STACK_LDA, LDA + movl STACK_B, B + sall $ZBASE_SHIFT, LDA + + movl STACK_N, J + sarl $1, J + je .L20 + ALIGN_3 + +.L21: + movl A, A1 + leal (A1, LDA), A2 + leal (A, LDA, 2), A + + movl STACK_M, I + sarl $1, I + je .L24 + ALIGN_3 + +.L25: +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(A1), %mm0 + MMXLOAD 1 * SIZE(A1), %mm1 + MMXLOAD 0 * SIZE(A2), %mm2 + MMXLOAD 1 * SIZE(A2), %mm3 + + MMXLOAD 2 * SIZE(A1), %mm4 + MMXLOAD 3 * SIZE(A1), %mm5 + MMXLOAD 2 * SIZE(A2), %mm6 + MMXLOAD 3 * SIZE(A2), %mm7 + + MMXSTORE %mm0, 0 * SIZE(B) + MMXSTORE %mm1, 1 * SIZE(B) + MMXSTORE %mm2, 2 * SIZE(B) + MMXSTORE %mm3, 3 * SIZE(B) + + MMXSTORE %mm4, 4 * SIZE(B) + MMXSTORE %mm5, 5 * SIZE(B) + MMXSTORE %mm6, 6 * SIZE(B) + MMXSTORE %mm7, 7 * SIZE(B) +#else + FLD 3 * SIZE(A2) + FLD 2 * SIZE(A2) + FLD 3 * SIZE(A1) + FLD 2 * SIZE(A1) + FLD 1 * SIZE(A2) + FLD 0 * SIZE(A2) + FLD 1 * SIZE(A1) + FLD 0 * SIZE(A1) + + FST 0 * SIZE(B) + FST 1 * SIZE(B) + FST 2 * SIZE(B) + FST 3 * SIZE(B) + FST 4 * SIZE(B) + FST 5 * SIZE(B) + FST 6 * SIZE(B) + FST 7 * SIZE(B) +#endif + addl $4 * SIZE, A1 + addl $4 * SIZE, A2 + addl $8 * SIZE, B + decl I + jne .L25 + ALIGN_3 + +.L24: + movl STACK_M, I + andl $1, I + jle .L30 + ALIGN_3 + +.L31: +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(A1), %mm0 + MMXLOAD 1 * SIZE(A1), %mm1 + MMXLOAD 0 * SIZE(A2), %mm2 + MMXLOAD 1 * SIZE(A2), %mm3 + MMXSTORE %mm0, 0 * SIZE(B) + MMXSTORE %mm1, 1 * SIZE(B) + MMXSTORE %mm2, 2 * SIZE(B) + MMXSTORE %mm3, 3 * SIZE(B) +#else + FLD 1 * SIZE(A2) + FLD 0 * SIZE(A2) + FLD 1 * SIZE(A1) + FLD 0 * SIZE(A1) + FST 0 * SIZE(B) + FST 1 * SIZE(B) + FST 2 * SIZE(B) + FST 3 * SIZE(B) +#endif + addl $2 * SIZE, A1 + addl $2 * SIZE, A2 + addl $4 * SIZE, B + decl I + jne .L31 + ALIGN_3 + +.L30: + decl J + jne .L21 + ALIGN_3 + +.L20: + movl A, A1 + movl STACK_N, J + andl $1, J + jle .L38 + ALIGN_3 + +.L39: + movl STACK_M, I + sarl $2, I + je .L42 + ALIGN_3 + +.L43: +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(A1), %mm0 + MMXLOAD 1 * SIZE(A1), %mm1 + MMXLOAD 2 * SIZE(A1), %mm2 + MMXLOAD 3 * SIZE(A1), %mm3 + MMXLOAD 4 * SIZE(A1), %mm4 + MMXLOAD 5 * SIZE(A1), %mm5 + MMXLOAD 6 * SIZE(A1), %mm6 + MMXLOAD 7 * SIZE(A1), %mm7 + + MMXSTORE %mm0, 0 * SIZE(B) + MMXSTORE %mm1, 1 * SIZE(B) + MMXSTORE %mm2, 2 * SIZE(B) + MMXSTORE %mm3, 3 * SIZE(B) + MMXSTORE %mm4, 4 * SIZE(B) + MMXSTORE %mm5, 5 * SIZE(B) + MMXSTORE %mm6, 6 * SIZE(B) + MMXSTORE %mm7, 7 * SIZE(B) +#else + FLD 7 * SIZE(A1) + FLD 6 * SIZE(A1) + FLD 5 * SIZE(A1) + FLD 4 * SIZE(A1) + FLD 3 * SIZE(A1) + FLD 2 * SIZE(A1) + FLD 1 * SIZE(A1) + FLD 0 * SIZE(A1) + + FST 0 * SIZE(B) + FST 1 * SIZE(B) + FST 2 * SIZE(B) + FST 3 * SIZE(B) + FST 4 * SIZE(B) + FST 5 * SIZE(B) + FST 6 * SIZE(B) + FST 7 * SIZE(B) +#endif + + addl $8 * SIZE, A1 + addl $8 * SIZE, B + decl I + jne .L43 + ALIGN_3 + +.L42: + movl STACK_M, I + andl $3, I + jle .L38 + ALIGN_3 + +.L49: +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(A1), %mm0 + MMXLOAD 1 * SIZE(A1), %mm1 + MMXSTORE %mm0, 0 * SIZE(B) + MMXSTORE %mm1, 1 * SIZE(B) +#else + FLD 1 * SIZE(A1) + FLD 0 * SIZE(A1) + FST 0 * SIZE(B) + FST 1 * SIZE(B) +#endif + addl $2 * SIZE, A1 + addl $2 * SIZE, B + decl I + jne .L49 + ALIGN_3 + +.L38: + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_tcopy_2.S b/kernel/x86/zgemm_tcopy_2.S new file mode 100644 index 0000000..f9a601d --- /dev/null +++ b/kernel/x86/zgemm_tcopy_2.S @@ -0,0 +1,174 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 8 + +#define J 0 + STACK(%esp) +#define BOFFSET2 4 + STACK(%esp) + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_N 8 + STACK + ARGS(%esp) +#define STACK_A 12 + STACK + ARGS(%esp) +#define STACK_LDA 16 + STACK + ARGS(%esp) +#define STACK_B 20 + STACK + ARGS(%esp) + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#define A %ebp +#define A1 %edx +#define LDA %ecx +#define B %edi +#define I %ebx +#define B1 %eax +#define M4 %esi + + EMMS + + movl STACK_A, A + movl STACK_B, B + + movl STACK_M, %ebx + movl STACK_N, %eax + movl STACK_LDA, LDA + sall $ZBASE_SHIFT, LDA + + andl $-2, %eax + addl %eax, %eax + imull %ebx, %eax # m * ( n & ~1) + leal (B, %eax, SIZE), %eax # boffset2 = b + m * (n & ~1) + movl %eax, BOFFSET2 + + movl STACK_M, M4 + sall $ZBASE_SHIFT + 1, M4 + + testl %ebx, %ebx # if !(m & 1) goto L28 + movl %ebx, J + jle .L999 + ALIGN_4 + +.L39: + movl A, A1 + addl LDA, A + movl B, B1 + addl $4 * SIZE, B + + movl STACK_N, I + sarl $1, I + jle .L32 + ALIGN_4 + +.L36: +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(A1), %mm0 + MMXLOAD 1 * SIZE(A1), %mm1 + MMXLOAD 2 * SIZE(A1), %mm2 + MMXLOAD 3 * SIZE(A1), %mm3 + + MMXSTORE %mm0, 0 * SIZE(B1) + MMXSTORE %mm1, 1 * SIZE(B1) + MMXSTORE %mm2, 2 * SIZE(B1) + MMXSTORE %mm3, 3 * SIZE(B1) +#else + FLD 3 * SIZE(A1) + FLD 2 * SIZE(A1) + FLD 1 * SIZE(A1) + FLD 0 * SIZE(A1) + + FST 0 * SIZE(B1) + FST 1 * SIZE(B1) + FST 2 * SIZE(B1) + FST 3 * SIZE(B1) +#endif + addl $4 * SIZE, A1 + addl M4, B1 + decl I + jne .L36 + ALIGN_4 + +.L32: + movl STACK_N, I + andl $1, I + jle .L99 + ALIGN_4 + + movl BOFFSET2, B1 + +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(A1), %mm0 + MMXLOAD 1 * SIZE(A1), %mm1 + + MMXSTORE %mm0, 0 * SIZE(B1) + MMXSTORE %mm1, 1 * SIZE(B1) +#else + FLD 1 * SIZE(A1) + FLD 0 * SIZE(A1) + + FST 0 * SIZE(B1) + FST 1 * SIZE(B1) +#endif + addl $2 * SIZE, BOFFSET2 + ALIGN_4 + +.L99: + decl J + jne .L39 + ALIGN_4 + +.L999: + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS,%esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemv_n.S b/kernel/x86/zgemv_n.S new file mode 100644 index 0000000..8e2b2b8 --- /dev/null +++ b/kernel/x86/zgemv_n.S @@ -0,0 +1,367 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef PENTIUM +#define P 32 +#endif + +#if defined(PENTIUM4) || defined(ATHLON) +#define P ((DTB_ENTRIES) >> 1) +#endif + +#ifndef P +#define P DTB_ENTRIES +#endif + +#define STACK 16 +#define ARGS 16 + +#define PLDA_M 0 + STACK(%esp) +#define XP 4 + STACK(%esp) +#define MIN_N 8 + STACK(%esp) +#define IS 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define LDA 36 + STACK + ARGS(%esp) +#define X 40 + STACK + ARGS(%esp) +#define INCX 44 + STACK + ARGS(%esp) +#define Y 48 + STACK + ARGS(%esp) +#define INCY 52 + STACK + ARGS(%esp) +#define BUFFER 56 + STACK + ARGS(%esp) +#else +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 20 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define LDA 28 + STACK + ARGS(%esp) +#define X 32 + STACK + ARGS(%esp) +#define INCX 36 + STACK + ARGS(%esp) +#define Y 40 + STACK + ARGS(%esp) +#define INCY 44 + STACK + ARGS(%esp) +#define BUFFER 48 + STACK + ARGS(%esp) +#endif + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + FLD ALPHA_I + FLD ALPHA_R + + movl X, %edi + + movl LDA, %ebx + addl %ebx, %ebx # lda *= 2 + leal 0(,%ebx,SIZE),%ebx # EBX : lda + + movl $0, IS + + movl M, %ecx + movl N, %esi + + test %ecx, %ecx + jle .L79 # goto END + test %esi, %esi + jle .L79 # goto END + + movl INCY, %eax + addl %eax, %eax # incy *= 2 + leal (,%eax,SIZE),%eax + movl %eax, INCY + + movl LDA, %eax + imull $P, %eax # P * lda + subl M ,%eax # P * lda - m + leal (, %eax, SIZE), %eax + addl %eax, %eax + movl %eax, PLDA_M + ALIGN_2 + +.L32: + movl IS, %esi + movl $P, %edx + movl N, %eax + subl %esi,%eax # n - is + cmpl %edx, %eax +#ifdef PENTIUM + jle .L33 + movl %edx, %eax +.L33: +#else + cmovg %edx, %eax +#endif + + movl %eax, MIN_N + movl INCX, %edx + addl %edx, %edx + + addl %esi, %esi + leal (%edi, %esi, SIZE), %esi # xp = x + is + movl %esi, XP + cmpl $2, %edx + je .L34 # if incx == 1 goto L34 + + movl BUFFER, %esi + leal (, %edx, SIZE), %edx + movl %esi, XP # xp = buffer + sarl $1,%eax + jle .L35 + ALIGN_2 + +.L36: + FLD 0 * SIZE(%edi) + FLD 1 * SIZE(%edi) + addl %edx,%edi # x += incx + FLD 0 * SIZE(%edi) + FLD 1 * SIZE(%edi) + addl %edx,%edi # x += incx + + FST 3 * SIZE(%esi) + FST 2 * SIZE(%esi) + FST 1 * SIZE(%esi) + FST 0 * SIZE(%esi) + + addl $4 * SIZE, %esi # xp += 4 + decl %eax + jg .L36 + ALIGN_3 + +.L35: + movl MIN_N, %eax + andl $1, %eax + jle .L34 + + FLD 0 * SIZE(%edi) + FLD 1 * SIZE(%edi) + addl %edx,%edi # x += incx + FST 1 * SIZE(%esi) + FST 0 * SIZE(%esi) + ALIGN_3 + +/* Main Routine */ +.L34: + movl Y, %ecx # c_offset + movl M, %ebp # j = m + ALIGN_3 + +.L61: + movl A, %edx # a_offset = a + fldz + addl $2 * SIZE, A # a++ + fldz + movl XP,%esi + fldz + movl MIN_N,%eax + fldz + FLD (%esi) # bt1 = *(b_offset + 0) + sarl $1, %eax + jle .L64 + ALIGN_3 + +.L65: +#ifdef PENTIUM4 + prefetchnta 16 * SIZE(%esi) +#endif + + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FMUL 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FMUL 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) + faddp %st, %st(4) # ct4 += bt1 + FLD 2 * SIZE(%esi) # bt1 = *(b_offset + 2) + + addl $2 * SIZE, %esi # b_offset += 2 + addl %ebx, %edx # a_offset += lda + + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FMUL 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FMUL 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) + faddp %st, %st(4) # ct4 += bt1 + FLD 2 * SIZE(%esi) # bt1 = *(b_offset + 2) + + addl $2 * SIZE, %esi # b_offset += 2 + addl %ebx, %edx # a_offset += lda + + decl %eax + jg .L65 + +.L64: + movl MIN_N, %eax + andl $1, %eax + jle .L70 + ALIGN_2 + +.L71: + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FMUL 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FMUL 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) + faddp %st, %st(4) # ct4 += bt1 + fldz + ALIGN_2 + +.L70: +#ifndef C_SUN + ffreep %st(0) +#else + .byte 0xdf + .byte 0xc0 +#endif + +#ifndef XCONJ +#ifndef CONJ + fsubp %st, %st(3) + faddp %st, %st(1) +#else + faddp %st, %st(3) + faddp %st, %st(1) +#endif +#else +#ifndef CONJ + faddp %st, %st(3) + fsubp %st, %st(1) +#else + fsubp %st, %st(3) + fsubp %st, %st(1) +#endif +#endif + + fld %st(0) # ct4 = ct2 + fmul %st(4), %st + fld %st(2) + fmul %st(4), %st + fsubp %st, %st(1) + + movl INCY, %eax + + FADD 0 * SIZE(%ecx) + FST 0 * SIZE(%ecx) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + faddp %st, %st(1) + + FADD 1 * SIZE(%ecx) + FST 1 * SIZE(%ecx) + + addl %eax, %ecx + decl %ebp + jg .L61 + +.L60: + movl PLDA_M, %esi + addl %esi, A # a += P * lda - m + addl $P, IS + movl N, %esi + cmpl %esi,IS + jl .L32 + +.L79: +#ifndef C_SUN + ffreep %st(0) + ffreep %st(0) +#else + .byte 0xdf + .byte 0xc0 + .byte 0xdf + .byte 0xc0 +#endif + + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemv_n_atom.S b/kernel/x86/zgemv_n_atom.S new file mode 100644 index 0000000..3dba030 --- /dev/null +++ b/kernel/x86/zgemv_n_atom.S @@ -0,0 +1,545 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ATOM +#define PREFETCH prefetchnta +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 6) +#endif + +#define STACKSIZE 16 + +#define M 4 + STACKSIZE(%esp) +#define N 8 + STACKSIZE(%esp) +#define ALPHA_R 16 + STACKSIZE(%esp) +#define ALPHA_I 24 + STACKSIZE(%esp) +#define A 32 + STACKSIZE(%esp) +#define STACK_LDA 36 + STACKSIZE(%esp) +#define STACK_X 40 + STACKSIZE(%esp) +#define STACK_INCX 44 + STACKSIZE(%esp) +#define Y 48 + STACKSIZE(%esp) +#define STACK_INCY 52 + STACKSIZE(%esp) +#define BUFFER 56 + STACKSIZE(%esp) + +#define I %eax +#define J %ebx + +#define INCX %ecx +#define INCY J + +#define A1 %esi +#define X %edx +#define Y1 %edi +#define LDA %ebp + +#if !defined(CONJ) && !defined(XCONJ) +#define ADD1 addsd +#define ADD2 addsd +#define ADD3 subsd +#define ADD4 addsd +#endif + +#if defined(CONJ) && !defined(XCONJ) +#define ADD1 addsd +#define ADD2 addsd +#define ADD3 addsd +#define ADD4 subsd +#endif + +#if !defined(CONJ) && defined(XCONJ) +#define ADD1 addsd +#define ADD2 subsd +#define ADD3 addsd +#define ADD4 addsd +#endif + +#if defined(CONJ) && defined(XCONJ) +#define ADD1 addsd +#define ADD2 subsd +#define ADD3 subsd +#define ADD4 subsd +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_LDA, LDA + movl STACK_X, X + movl STACK_INCX, INCX + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, LDA + + subl $-16 * SIZE, A + + cmpl $0, N + jle .L999 + cmpl $0, M + jle .L999 + + movl BUFFER, Y1 + + movl N, J + + pxor %xmm7, %xmm7 + + movl M, %eax + addl $8, %eax + sarl $3, %eax + ALIGN_3 + +.L01: + movapd %xmm7, 0 * SIZE(Y1) + movapd %xmm7, 2 * SIZE(Y1) + movapd %xmm7, 4 * SIZE(Y1) + movapd %xmm7, 6 * SIZE(Y1) + movapd %xmm7, 8 * SIZE(Y1) + movapd %xmm7, 10 * SIZE(Y1) + movapd %xmm7, 12 * SIZE(Y1) + movapd %xmm7, 14 * SIZE(Y1) + subl $-16 * SIZE, Y1 + decl %eax + jg .L01 + ALIGN_3 + +.L10: + movl BUFFER, Y1 + addl $16 * SIZE, Y1 + + movl A, A1 + addl LDA, A + + movsd 0 * SIZE(X), %xmm6 + movsd 1 * SIZE(X), %xmm7 + addl INCX, X + + movapd %xmm6, %xmm2 + mulsd ALPHA_R, %xmm6 + mulsd ALPHA_I, %xmm2 + movapd %xmm7, %xmm3 + mulsd ALPHA_I, %xmm3 + mulsd ALPHA_R, %xmm7 + +#ifndef XCONJ + subsd %xmm3, %xmm6 + addsd %xmm2, %xmm7 +#else + addsd %xmm3, %xmm6 + subsd %xmm2, %xmm7 +#endif + + movsd -16 * SIZE(Y1), %xmm0 + movsd -15 * SIZE(Y1), %xmm1 + ALIGN_3 + + movl M, I + sarl $2, I + jle .L15 + + movsd -16 * SIZE(A1), %xmm2 + movsd -15 * SIZE(A1), %xmm3 + + movapd %xmm2, %xmm4 + mulsd %xmm6, %xmm2 + mulsd %xmm7, %xmm4 + + decl I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) +#endif + + movapd %xmm3, %xmm5 + mulsd %xmm7, %xmm3 + ADD1 %xmm2, %xmm0 + movsd -14 * SIZE(A1), %xmm2 + mulsd %xmm6, %xmm5 + ADD2 %xmm4, %xmm1 + + movapd %xmm2, %xmm4 + mulsd %xmm6, %xmm2 + ADD3 %xmm3, %xmm0 + movsd -13 * SIZE(A1), %xmm3 + ADD4 %xmm5, %xmm1 + mulsd %xmm7, %xmm4 + + movlpd %xmm0, -16 * SIZE(Y1) + movsd -14 * SIZE(Y1), %xmm0 + movlpd %xmm1, -15 * SIZE(Y1) + movsd -13 * SIZE(Y1), %xmm1 + + movapd %xmm3, %xmm5 + mulsd %xmm7, %xmm3 + ADD1 %xmm2, %xmm0 + movsd -12 * SIZE(A1), %xmm2 + mulsd %xmm6, %xmm5 + ADD2 %xmm4, %xmm1 + + movapd %xmm2, %xmm4 + mulsd %xmm6, %xmm2 + ADD3 %xmm3, %xmm0 + movsd -11 * SIZE(A1), %xmm3 + mulsd %xmm7, %xmm4 + ADD4 %xmm5, %xmm1 + + movlpd %xmm0, -14 * SIZE(Y1) + movsd -12 * SIZE(Y1), %xmm0 + movlpd %xmm1, -13 * SIZE(Y1) + movsd -11 * SIZE(Y1), %xmm1 + + movapd %xmm3, %xmm5 + mulsd %xmm7, %xmm3 + ADD1 %xmm2, %xmm0 + movsd -10 * SIZE(A1), %xmm2 + mulsd %xmm6, %xmm5 + ADD2 %xmm4, %xmm1 + + movapd %xmm2, %xmm4 + mulsd %xmm6, %xmm2 + ADD3 %xmm3, %xmm0 + movsd -9 * SIZE(A1), %xmm3 + ADD4 %xmm5, %xmm1 + mulsd %xmm7, %xmm4 + + movlpd %xmm0, -12 * SIZE(Y1) + movsd -10 * SIZE(Y1), %xmm0 + movlpd %xmm1, -11 * SIZE(Y1) + movsd -9 * SIZE(Y1), %xmm1 + + movapd %xmm3, %xmm5 + mulsd %xmm7, %xmm3 + ADD1 %xmm2, %xmm0 + movsd -8 * SIZE(A1), %xmm2 + mulsd %xmm6, %xmm5 + ADD2 %xmm4, %xmm1 + + movapd %xmm2, %xmm4 + mulsd %xmm6, %xmm2 + ADD3 %xmm3, %xmm0 + movsd -7 * SIZE(A1), %xmm3 + mulsd %xmm7, %xmm4 + ADD4 %xmm5, %xmm1 + + movlpd %xmm0, -10 * SIZE(Y1) + movsd -8 * SIZE(Y1), %xmm0 + movlpd %xmm1, -9 * SIZE(Y1) + movsd -7 * SIZE(Y1), %xmm1 + + subl $-8 * SIZE, A1 + subl $-8 * SIZE, Y1 + + subl $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + movapd %xmm3, %xmm5 + mulsd %xmm7, %xmm3 + ADD1 %xmm2, %xmm0 + movsd -14 * SIZE(A1), %xmm2 + mulsd %xmm6, %xmm5 + ADD2 %xmm4, %xmm1 + + movapd %xmm2, %xmm4 + mulsd %xmm6, %xmm2 + ADD3 %xmm3, %xmm0 + movsd -13 * SIZE(A1), %xmm3 + ADD4 %xmm5, %xmm1 + mulsd %xmm7, %xmm4 + + movlpd %xmm0, -16 * SIZE(Y1) + movsd -14 * SIZE(Y1), %xmm0 + movlpd %xmm1, -15 * SIZE(Y1) + movsd -13 * SIZE(Y1), %xmm1 + + movapd %xmm3, %xmm5 + mulsd %xmm7, %xmm3 + ADD1 %xmm2, %xmm0 + movsd -12 * SIZE(A1), %xmm2 + mulsd %xmm6, %xmm5 + ADD2 %xmm4, %xmm1 + + movapd %xmm2, %xmm4 + mulsd %xmm6, %xmm2 + ADD3 %xmm3, %xmm0 + movsd -11 * SIZE(A1), %xmm3 + mulsd %xmm7, %xmm4 + ADD4 %xmm5, %xmm1 + + movlpd %xmm0, -14 * SIZE(Y1) + movsd -12 * SIZE(Y1), %xmm0 + movlpd %xmm1, -13 * SIZE(Y1) + movsd -11 * SIZE(Y1), %xmm1 + + movapd %xmm3, %xmm5 + mulsd %xmm7, %xmm3 + ADD1 %xmm2, %xmm0 + movsd -10 * SIZE(A1), %xmm2 + mulsd %xmm6, %xmm5 + ADD2 %xmm4, %xmm1 + + movapd %xmm2, %xmm4 + mulsd %xmm6, %xmm2 + ADD3 %xmm3, %xmm0 + movsd -9 * SIZE(A1), %xmm3 + ADD4 %xmm5, %xmm1 + mulsd %xmm7, %xmm4 + + movlpd %xmm0, -12 * SIZE(Y1) + movsd -10 * SIZE(Y1), %xmm0 + movlpd %xmm1, -11 * SIZE(Y1) + movsd -9 * SIZE(Y1), %xmm1 + + movapd %xmm3, %xmm5 + mulsd %xmm7, %xmm3 + ADD1 %xmm2, %xmm0 + mulsd %xmm6, %xmm5 + ADD2 %xmm4, %xmm1 + + ADD3 %xmm3, %xmm0 + ADD4 %xmm5, %xmm1 + + movlpd %xmm0, -10 * SIZE(Y1) + movsd -8 * SIZE(Y1), %xmm0 + movlpd %xmm1, -9 * SIZE(Y1) + movsd -7 * SIZE(Y1), %xmm1 + + subl $-8 * SIZE, A1 + subl $-8 * SIZE, Y1 + ALIGN_3 + +.L15: + testl $2, M + je .L17 + + movsd -16 * SIZE(A1), %xmm2 + movsd -15 * SIZE(A1), %xmm3 + + movapd %xmm2, %xmm4 + mulsd %xmm6, %xmm2 + mulsd %xmm7, %xmm4 + + movapd %xmm3, %xmm5 + mulsd %xmm7, %xmm3 + ADD1 %xmm2, %xmm0 + movsd -14 * SIZE(A1), %xmm2 + mulsd %xmm6, %xmm5 + ADD2 %xmm4, %xmm1 + + movapd %xmm2, %xmm4 + mulsd %xmm6, %xmm2 + ADD3 %xmm3, %xmm0 + movsd -13 * SIZE(A1), %xmm3 + ADD4 %xmm5, %xmm1 + mulsd %xmm7, %xmm4 + + movlpd %xmm0, -16 * SIZE(Y1) + movsd -14 * SIZE(Y1), %xmm0 + movlpd %xmm1, -15 * SIZE(Y1) + movsd -13 * SIZE(Y1), %xmm1 + + movapd %xmm3, %xmm5 + mulsd %xmm7, %xmm3 + ADD1 %xmm2, %xmm0 + mulsd %xmm6, %xmm5 + ADD2 %xmm4, %xmm1 + + ADD3 %xmm3, %xmm0 + ADD4 %xmm5, %xmm1 + + movlpd %xmm0, -14 * SIZE(Y1) + movsd -12 * SIZE(Y1), %xmm0 + movlpd %xmm1, -13 * SIZE(Y1) + movsd -11 * SIZE(Y1), %xmm1 + + addl $4 * SIZE, A1 + addl $4 * SIZE, Y1 + ALIGN_3 + +.L17: + testl $1, M + je .L19 + + movsd -16 * SIZE(A1), %xmm2 + movsd -15 * SIZE(A1), %xmm3 + + movapd %xmm2, %xmm4 + mulsd %xmm6, %xmm2 + mulsd %xmm7, %xmm4 + + movapd %xmm3, %xmm5 + mulsd %xmm7, %xmm3 + ADD1 %xmm2, %xmm0 + mulsd %xmm6, %xmm5 + ADD2 %xmm4, %xmm1 + + ADD3 %xmm3, %xmm0 + ADD4 %xmm5, %xmm1 + + movlpd %xmm0, -16 * SIZE(Y1) + movlpd %xmm1, -15 * SIZE(Y1) + ALIGN_3 + +.L19: + decl J + jg .L10 + ALIGN_4 + +.L990: + movl Y, Y1 + movl BUFFER, X + movl STACK_INCY, INCY + + movl Y1, A1 + sall $ZBASE_SHIFT, INCY + + movl M, %eax + sarl $2, %eax + jle .L994 + ALIGN_3 + +.L992: + movsd 0 * SIZE(Y1), %xmm0 + movsd 1 * SIZE(Y1), %xmm1 + addl INCY, Y1 + + movsd 0 * SIZE(Y1), %xmm2 + movsd 1 * SIZE(Y1), %xmm3 + addl INCY, Y1 + + movsd 0 * SIZE(Y1), %xmm4 + movsd 1 * SIZE(Y1), %xmm5 + addl INCY, Y1 + + movsd 0 * SIZE(Y1), %xmm6 + movsd 1 * SIZE(Y1), %xmm7 + addl INCY, Y1 + + addsd 0 * SIZE(X), %xmm0 + addsd 1 * SIZE(X), %xmm1 + addsd 2 * SIZE(X), %xmm2 + addsd 3 * SIZE(X), %xmm3 + addsd 4 * SIZE(X), %xmm4 + addsd 5 * SIZE(X), %xmm5 + addsd 6 * SIZE(X), %xmm6 + addsd 7 * SIZE(X), %xmm7 + + movlpd %xmm0, 0 * SIZE(A1) + movlpd %xmm1, 1 * SIZE(A1) + addl INCY, A1 + + movlpd %xmm2, 0 * SIZE(A1) + movlpd %xmm3, 1 * SIZE(A1) + addl INCY, A1 + + movlpd %xmm4, 0 * SIZE(A1) + movlpd %xmm5, 1 * SIZE(A1) + addl INCY, A1 + + movlpd %xmm6, 0 * SIZE(A1) + movlpd %xmm7, 1 * SIZE(A1) + addl INCY, A1 + + addl $8 * SIZE, X + decl %eax + jg .L992 + ALIGN_3 + +.L994: + testl $2, M + jle .L996 + + movsd 0 * SIZE(Y1), %xmm0 + movsd 1 * SIZE(Y1), %xmm1 + addl INCY, Y1 + + movsd 0 * SIZE(Y1), %xmm2 + movsd 1 * SIZE(Y1), %xmm3 + addl INCY, Y1 + + addsd 0 * SIZE(X), %xmm0 + addsd 1 * SIZE(X), %xmm1 + addsd 2 * SIZE(X), %xmm2 + addsd 3 * SIZE(X), %xmm3 + + movlpd %xmm0, 0 * SIZE(A1) + movlpd %xmm1, 1 * SIZE(A1) + addl INCY, A1 + + movlpd %xmm2, 0 * SIZE(A1) + movlpd %xmm3, 1 * SIZE(A1) + addl INCY, A1 + + addl $4 * SIZE, X + ALIGN_3 + +.L996: + testl $1, M + jle .L999 + + movsd 0 * SIZE(Y1), %xmm0 + movsd 1 * SIZE(Y1), %xmm1 + + addsd 0 * SIZE(X), %xmm0 + addsd 1 * SIZE(X), %xmm1 + + movlpd %xmm0, 0 * SIZE(A1) + movlpd %xmm1, 1 * SIZE(A1) + ALIGN_3 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemv_n_sse.S b/kernel/x86/zgemv_n_sse.S new file mode 100644 index 0000000..340b9d3 --- /dev/null +++ b/kernel/x86/zgemv_n_sse.S @@ -0,0 +1,604 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef movsd +#undef movsd +#endif + +#ifdef PENTIUM3 +#ifdef HAVE_SSE +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 2) +#endif +#define movsd movlps +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 2) +#endif + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 7) +#endif + +#ifdef OPTERON +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 3) +#define movsd movlps +#endif + +#ifdef BARCELONA +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 5) +#endif + +#ifdef ATOM +#define PREFETCH prefetchnta +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 6) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (16 * 4) +#endif + +#define STACKSIZE 16 + +#define M 4 + STACKSIZE(%esp) +#define N 8 + STACKSIZE(%esp) +#define ALPHA_R 16 + STACKSIZE(%esp) +#define ALPHA_I 20 + STACKSIZE(%esp) +#define A 24 + STACKSIZE(%esp) +#define STACK_LDA 28 + STACKSIZE(%esp) +#define STACK_X 32 + STACKSIZE(%esp) +#define STACK_INCX 36 + STACKSIZE(%esp) +#define Y 40 + STACKSIZE(%esp) +#define STACK_INCY 44 + STACKSIZE(%esp) +#define BUFFER 48 + STACKSIZE(%esp) + +#define I %eax +#define J %ebx + +#define INCX %ecx +#define INCY J + +#define A1 %esi +#define X %edx +#define Y1 %edi +#define LDA %ebp + +#undef SUBPS + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) +#define SUBPS subps +#else +#define SUBPS addps +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_LDA, LDA + movl STACK_X, X + movl STACK_INCX, INCX + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, LDA + + subl $-32 * SIZE, A + + cmpl $0, N + jle .L999 + cmpl $0, M + jle .L999 + + movl BUFFER, Y1 + + movl N, J + + xorps %xmm7, %xmm7 + + movl M, %eax + addl $8, %eax + sarl $3, %eax + ALIGN_3 + +.L01: + movaps %xmm7, 0 * SIZE(Y1) + movaps %xmm7, 4 * SIZE(Y1) + movaps %xmm7, 8 * SIZE(Y1) + movaps %xmm7, 12 * SIZE(Y1) + subl $-16 * SIZE, Y1 + decl %eax + jg .L01 + ALIGN_3 + +.L10: + movl BUFFER, Y1 + addl $32 * SIZE, Y1 + + movl A, A1 + addl LDA, A + + movsd (X), %xmm7 + addl INCX, X + +#ifdef HAVE_SSE2 + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 +#else + subl $8, %esp + movl $0x00000000, 0(%esp) + movl $0x80000000, 4(%esp) + movlps (%esp), %xmm5 + addl $8, %esp + movlhps %xmm5, %xmm5 +#endif + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm7, %xmm6 + pshufd $0x55, %xmm7, %xmm7 +#else + movaps %xmm7, %xmm6 + shufps $0x00, %xmm6, %xmm6 + shufps $0x55, %xmm7, %xmm7 +#endif + +#ifdef HAVE_SSE3 + movddup ALPHA_R, %xmm3 +#else + movsd ALPHA_R, %xmm3 + + movlhps %xmm3, %xmm3 +#endif + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm3, %xmm4 +#else + movaps %xmm3, %xmm4 + shufps $0xb1, %xmm4, %xmm4 +#endif + + +#ifndef XCONJ + xorps %xmm5, %xmm7 +#else + xorps %xmm5, %xmm6 +#endif + + mulps %xmm3, %xmm6 + mulps %xmm4, %xmm7 + +#ifndef XCONJ + subps %xmm7, %xmm6 +#else + addps %xmm7, %xmm6 +#endif + +#ifdef HAVE_SSE2 + pshufd $0x55, %xmm6, %xmm7 + pshufd $0x00, %xmm6, %xmm6 +#else + movaps %xmm6, %xmm7 + shufps $0x55, %xmm7, %xmm7 + shufps $0x00, %xmm6, %xmm6 +#endif + +#ifndef CONJ + xorps %xmm5, %xmm7 +#else + xorps %xmm5, %xmm6 +#endif + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + ALIGN_3 + + movl M, I + sarl $3, I + jle .L15 + + movsd -32 * SIZE(A1), %xmm2 + movhps -30 * SIZE(A1), %xmm2 + movsd -28 * SIZE(A1), %xmm4 + movhps -26 * SIZE(A1), %xmm4 + + decl I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) +#endif + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm2, %xmm3 +#else + movaps %xmm2, %xmm3 + shufps $0xb1, %xmm3, %xmm3 +#endif + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + movsd -24 * SIZE(A1), %xmm2 + movhps -22 * SIZE(A1), %xmm2 +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm4, %xmm5 +#else + movaps %xmm4, %xmm5 + shufps $0xb1, %xmm5, %xmm5 +#endif + mulps %xmm6, %xmm4 + addps %xmm4, %xmm1 + movsd -20 * SIZE(A1), %xmm4 + movhps -18 * SIZE(A1), %xmm4 + + mulps %xmm7, %xmm3 + SUBPS %xmm3, %xmm0 + movaps %xmm0, -32 * SIZE(Y1) + movaps -24 * SIZE(Y1), %xmm0 + mulps %xmm7, %xmm5 + SUBPS %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y1) + movaps -20 * SIZE(Y1), %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm2, %xmm3 +#else + movaps %xmm2, %xmm3 + shufps $0xb1, %xmm3, %xmm3 +#endif + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + movsd -16 * SIZE(A1), %xmm2 + movhps -14 * SIZE(A1), %xmm2 +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm4, %xmm5 +#else + movaps %xmm4, %xmm5 + shufps $0xb1, %xmm5, %xmm5 +#endif + mulps %xmm6, %xmm4 + addps %xmm4, %xmm1 + movsd -12 * SIZE(A1), %xmm4 + movhps -10 * SIZE(A1), %xmm4 + + mulps %xmm7, %xmm3 + SUBPS %xmm3, %xmm0 + movaps %xmm0, -24 * SIZE(Y1) + movaps -16 * SIZE(Y1), %xmm0 + mulps %xmm7, %xmm5 + SUBPS %xmm5, %xmm1 + movaps %xmm1, -20 * SIZE(Y1) + movaps -12 * SIZE(Y1), %xmm1 + + subl $-16 * SIZE, A1 + subl $-16 * SIZE, Y1 + + subl $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm2, %xmm3 +#else + movaps %xmm2, %xmm3 + shufps $0xb1, %xmm3, %xmm3 +#endif + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + movsd -24 * SIZE(A1), %xmm2 + movhps -22 * SIZE(A1), %xmm2 +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm4, %xmm5 +#else + movaps %xmm4, %xmm5 + shufps $0xb1, %xmm5, %xmm5 +#endif + mulps %xmm6, %xmm4 + addps %xmm4, %xmm1 + movsd -20 * SIZE(A1), %xmm4 + movhps -18 * SIZE(A1), %xmm4 + + mulps %xmm7, %xmm3 + SUBPS %xmm3, %xmm0 + movaps %xmm0, -32 * SIZE(Y1) + movaps -24 * SIZE(Y1), %xmm0 + mulps %xmm7, %xmm5 + SUBPS %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y1) + movaps -20 * SIZE(Y1), %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm2, %xmm3 +#else + movaps %xmm2, %xmm3 + shufps $0xb1, %xmm3, %xmm3 +#endif + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm4, %xmm5 +#else + movaps %xmm4, %xmm5 + shufps $0xb1, %xmm5, %xmm5 +#endif + mulps %xmm6, %xmm4 + addps %xmm4, %xmm1 + + mulps %xmm7, %xmm3 + SUBPS %xmm3, %xmm0 + movaps %xmm0, -24 * SIZE(Y1) + movaps -16 * SIZE(Y1), %xmm0 + mulps %xmm7, %xmm5 + SUBPS %xmm5, %xmm1 + movaps %xmm1, -20 * SIZE(Y1) + movaps -12 * SIZE(Y1), %xmm1 + + subl $-16 * SIZE, A1 + subl $-16 * SIZE, Y1 + ALIGN_3 + +.L15: + testl $4, M + je .L17 + + movsd -32 * SIZE(A1), %xmm2 + movhps -30 * SIZE(A1), %xmm2 + movsd -28 * SIZE(A1), %xmm4 + movhps -26 * SIZE(A1), %xmm4 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm2, %xmm3 +#else + movaps %xmm2, %xmm3 + shufps $0xb1, %xmm3, %xmm3 +#endif + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm4, %xmm5 +#else + movaps %xmm4, %xmm5 + shufps $0xb1, %xmm5, %xmm5 +#endif + mulps %xmm6, %xmm4 + addps %xmm4, %xmm1 + + mulps %xmm7, %xmm3 + SUBPS %xmm3, %xmm0 + movaps %xmm0, -32 * SIZE(Y1) + movaps -24 * SIZE(Y1), %xmm0 + mulps %xmm7, %xmm5 + SUBPS %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y1) + movaps -20 * SIZE(Y1), %xmm1 + + addl $8 * SIZE, A1 + addl $8 * SIZE, Y1 + ALIGN_3 + +.L17: + testl $2, M + je .L18 + + movsd -32 * SIZE(A1), %xmm2 + movhps -30 * SIZE(A1), %xmm2 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm2, %xmm3 +#else + movaps %xmm2, %xmm3 + shufps $0xb1, %xmm3, %xmm3 +#endif + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + mulps %xmm7, %xmm3 + SUBPS %xmm3, %xmm0 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, %xmm0 + + addl $4 * SIZE, A1 + addl $4 * SIZE, Y1 + ALIGN_3 + +.L18: + testl $1, M + je .L19 + +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A1), %xmm2 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm2, %xmm3 +#else + movaps %xmm2, %xmm3 + shufps $0xb1, %xmm3, %xmm3 +#endif + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + mulps %xmm7, %xmm3 + SUBPS %xmm3, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + ALIGN_3 + +.L19: + decl J + jg .L10 + ALIGN_4 + +.L990: + movl Y, Y1 + movl BUFFER, X + + movl STACK_INCY, INCY + sall $ZBASE_SHIFT, INCY + + movl M, %eax + sarl $3, %eax + jle .L994 + ALIGN_3 + +.L992: + movsd (Y1), %xmm0 + movhps (Y1, INCY), %xmm0 + + addps 0 * SIZE(X), %xmm0 + + movlps %xmm0, (Y1) + movhps %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + movsd (Y1), %xmm0 + movhps (Y1, INCY), %xmm0 + + addps 4 * SIZE(X), %xmm0 + + movlps %xmm0, (Y1) + movhps %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + movsd (Y1), %xmm0 + movhps (Y1, INCY), %xmm0 + + addps 8 * SIZE(X), %xmm0 + + movlps %xmm0, (Y1) + movhps %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + movsd (Y1), %xmm0 + movhps (Y1, INCY), %xmm0 + + addps 12 * SIZE(X), %xmm0 + + movlps %xmm0, (Y1) + movhps %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + addl $16 * SIZE, X + decl %eax + jg .L992 + ALIGN_3 + +.L994: + testl $4, M + jle .L995 + + movsd (Y1), %xmm0 + movhps (Y1, INCY), %xmm0 + + addps 0 * SIZE(X), %xmm0 + + movlps %xmm0, (Y1) + movhps %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + movsd (Y1), %xmm0 + movhps (Y1, INCY), %xmm0 + + addps 4 * SIZE(X), %xmm0 + + movlps %xmm0, (Y1) + movhps %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + addl $8 * SIZE, X + ALIGN_3 + +.L995: + testl $2, M + jle .L996 + + movsd (Y1), %xmm0 + movhps (Y1, INCY), %xmm0 + + addps 0 * SIZE(X), %xmm0 + + movlps %xmm0, (Y1) + movhps %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + addl $4 * SIZE, X + ALIGN_3 + +.L996: + testl $1, M + jle .L999 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd (Y1), %xmm0 + + addps 0 * SIZE(X), %xmm0 + + movlps %xmm0, (Y1) + ALIGN_3 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemv_n_sse2.S b/kernel/x86/zgemv_n_sse2.S new file mode 100644 index 0000000..441fbb0 --- /dev/null +++ b/kernel/x86/zgemv_n_sse2.S @@ -0,0 +1,467 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 2) +#endif + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 7) +#endif + +#ifdef OPTERON +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (8 * 3) +#define movsd movlps +#endif + +#ifdef BARCELONA +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (8 * 5) +#endif + +#ifdef ATOM +#define PREFETCH prefetchnta +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 6) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 4) +#endif + +#define STACKSIZE 16 + +#define M 4 + STACKSIZE(%esp) +#define N 8 + STACKSIZE(%esp) +#define ALPHA_R 16 + STACKSIZE(%esp) +#define ALPHA_I 24 + STACKSIZE(%esp) +#define A 32 + STACKSIZE(%esp) +#define STACK_LDA 36 + STACKSIZE(%esp) +#define STACK_X 40 + STACKSIZE(%esp) +#define STACK_INCX 44 + STACKSIZE(%esp) +#define Y 48 + STACKSIZE(%esp) +#define STACK_INCY 52 + STACKSIZE(%esp) +#define BUFFER 56 + STACKSIZE(%esp) + +#define I %eax +#define J %ebx + +#define INCX %ecx +#define INCY J + +#define A1 %esi +#define X %edx +#define Y1 %edi +#define LDA %ebp + +#undef SUBPD + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) +#define SUBPD subpd +#else +#define SUBPD addpd +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_LDA, LDA + movl STACK_X, X + movl STACK_INCX, INCX + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, LDA + + subl $-16 * SIZE, A + + cmpl $0, N + jle .L999 + cmpl $0, M + jle .L999 + + movl BUFFER, Y1 + + movl N, J + + pxor %xmm7, %xmm7 + + movl M, %eax + addl $8, %eax + sarl $3, %eax + ALIGN_3 + +.L01: + movapd %xmm7, 0 * SIZE(Y1) + movapd %xmm7, 2 * SIZE(Y1) + movapd %xmm7, 4 * SIZE(Y1) + movapd %xmm7, 6 * SIZE(Y1) + movapd %xmm7, 8 * SIZE(Y1) + movapd %xmm7, 10 * SIZE(Y1) + movapd %xmm7, 12 * SIZE(Y1) + movapd %xmm7, 14 * SIZE(Y1) + subl $-16 * SIZE, Y1 + decl %eax + jg .L01 + ALIGN_3 + +.L10: + movl BUFFER, Y1 + addl $16 * SIZE, Y1 + + movl A, A1 + addl LDA, A + + movsd 0 * SIZE(X), %xmm6 + movhpd 1 * SIZE(X), %xmm6 + addl INCX, X + + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + shufps $0xc0, %xmm5, %xmm5 + + pshufd $0x4e, %xmm6, %xmm7 + +#ifdef HAVE_SSE3 + movddup ALPHA_R, %xmm3 + movddup ALPHA_I, %xmm4 +#else + movsd ALPHA_R, %xmm3 + movsd ALPHA_I, %xmm4 + + unpcklpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 +#endif + + xorpd %xmm5, %xmm7 + + mulpd %xmm3, %xmm6 + mulpd %xmm4, %xmm7 + +#ifndef XCONJ + subpd %xmm7, %xmm6 +#else + addpd %xmm7, %xmm6 +#endif + + pshufd $0xee, %xmm6, %xmm7 + pshufd $0x44, %xmm6, %xmm6 + +#ifndef CONJ + xorpd %xmm5, %xmm7 +#else + xorpd %xmm5, %xmm6 +#endif + + movapd -16 * SIZE(Y1), %xmm0 + movapd -14 * SIZE(Y1), %xmm1 + ALIGN_3 + + movl M, I + sarl $2, I + jle .L15 + + movsd -16 * SIZE(A1), %xmm2 + movhpd -15 * SIZE(A1), %xmm2 + movsd -14 * SIZE(A1), %xmm4 + movhpd -13 * SIZE(A1), %xmm4 + + decl I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) +#endif + + pshufd $0x4e, %xmm2, %xmm3 + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + movsd -12 * SIZE(A1), %xmm2 + movhpd -11 * SIZE(A1), %xmm2 + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm6, %xmm4 + addpd %xmm4, %xmm1 + movsd -10 * SIZE(A1), %xmm4 + movhpd -9 * SIZE(A1), %xmm4 + + mulpd %xmm7, %xmm3 + SUBPD %xmm3, %xmm0 + movapd %xmm0, -16 * SIZE(Y1) + movapd -12 * SIZE(Y1), %xmm0 + mulpd %xmm7, %xmm5 + SUBPD %xmm5, %xmm1 + movapd %xmm1, -14 * SIZE(Y1) + movapd -10 * SIZE(Y1), %xmm1 + + pshufd $0x4e, %xmm2, %xmm3 + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + movsd -8 * SIZE(A1), %xmm2 + movhpd -7 * SIZE(A1), %xmm2 + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm6, %xmm4 + addpd %xmm4, %xmm1 + movsd -6 * SIZE(A1), %xmm4 + movhpd -5 * SIZE(A1), %xmm4 + + mulpd %xmm7, %xmm3 + SUBPD %xmm3, %xmm0 + movapd %xmm0, -12 * SIZE(Y1) + movapd -8 * SIZE(Y1), %xmm0 + mulpd %xmm7, %xmm5 + SUBPD %xmm5, %xmm1 + movapd %xmm1, -10 * SIZE(Y1) + movapd -6 * SIZE(Y1), %xmm1 + + subl $-8 * SIZE, A1 + subl $-8 * SIZE, Y1 + + subl $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + pshufd $0x4e, %xmm2, %xmm3 + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + movsd -12 * SIZE(A1), %xmm2 + movhpd -11 * SIZE(A1), %xmm2 + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm6, %xmm4 + addpd %xmm4, %xmm1 + movsd -10 * SIZE(A1), %xmm4 + movhpd -9 * SIZE(A1), %xmm4 + + mulpd %xmm7, %xmm3 + SUBPD %xmm3, %xmm0 + movapd %xmm0, -16 * SIZE(Y1) + movapd -12 * SIZE(Y1), %xmm0 + mulpd %xmm7, %xmm5 + SUBPD %xmm5, %xmm1 + movapd %xmm1, -14 * SIZE(Y1) + movapd -10 * SIZE(Y1), %xmm1 + + pshufd $0x4e, %xmm2, %xmm3 + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm6, %xmm4 + addpd %xmm4, %xmm1 + + mulpd %xmm7, %xmm3 + SUBPD %xmm3, %xmm0 + movapd %xmm0, -12 * SIZE(Y1) + movapd -8 * SIZE(Y1), %xmm0 + mulpd %xmm7, %xmm5 + SUBPD %xmm5, %xmm1 + movapd %xmm1, -10 * SIZE(Y1) + movapd -6 * SIZE(Y1), %xmm1 + + subl $-8 * SIZE, A1 + subl $-8 * SIZE, Y1 + ALIGN_3 + +.L15: + testl $2, M + je .L17 + + movsd -16 * SIZE(A1), %xmm2 + movhpd -15 * SIZE(A1), %xmm2 + movsd -14 * SIZE(A1), %xmm4 + movhpd -13 * SIZE(A1), %xmm4 + + pshufd $0x4e, %xmm2, %xmm3 + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm6, %xmm4 + addpd %xmm4, %xmm1 + + mulpd %xmm7, %xmm3 + SUBPD %xmm3, %xmm0 + movapd %xmm0, -16 * SIZE(Y1) + mulpd %xmm7, %xmm5 + SUBPD %xmm5, %xmm1 + movapd %xmm1, -14 * SIZE(Y1) + + movapd -12 * SIZE(Y1), %xmm0 + + addl $4 * SIZE, A1 + addl $4 * SIZE, Y1 + ALIGN_3 + +.L17: + testl $1, M + je .L19 + + movsd -16 * SIZE(A1), %xmm2 + movhpd -15 * SIZE(A1), %xmm2 + + pshufd $0x4e, %xmm2, %xmm3 + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm7, %xmm3 + SUBPD %xmm3, %xmm0 + + movapd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +.L19: + decl J + jg .L10 + ALIGN_4 + +.L990: + movl Y, Y1 + movl BUFFER, X + + movl STACK_INCY, INCY + sall $ZBASE_SHIFT, INCY + + movl M, %eax + sarl $2, %eax + jle .L994 + ALIGN_3 + +.L992: + movsd 0 * SIZE(Y1), %xmm0 + movhpd 1 * SIZE(Y1), %xmm0 + + addpd 0 * SIZE(X), %xmm0 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addl INCY, Y1 + + movsd 0 * SIZE(Y1), %xmm0 + movhpd 1 * SIZE(Y1), %xmm0 + + addpd 2 * SIZE(X), %xmm0 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addl INCY, Y1 + + movsd 0 * SIZE(Y1), %xmm0 + movhpd 1 * SIZE(Y1), %xmm0 + + addpd 4 * SIZE(X), %xmm0 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addl INCY, Y1 + + movsd 0 * SIZE(Y1), %xmm0 + movhpd 1 * SIZE(Y1), %xmm0 + + addpd 6 * SIZE(X), %xmm0 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addl INCY, Y1 + + addl $8 * SIZE, X + decl %eax + jg .L992 + ALIGN_3 + +.L994: + testl $2, M + jle .L996 + + movsd 0 * SIZE(Y1), %xmm0 + movhpd 1 * SIZE(Y1), %xmm0 + + addpd 0 * SIZE(X), %xmm0 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addl INCY, Y1 + + movsd 0 * SIZE(Y1), %xmm0 + movhpd 1 * SIZE(Y1), %xmm0 + + addpd 2 * SIZE(X), %xmm0 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addl INCY, Y1 + + addl $4 * SIZE, X + ALIGN_3 + +.L996: + testl $1, M + jle .L999 + + movsd 0 * SIZE(Y1), %xmm0 + movhpd 1 * SIZE(Y1), %xmm0 + + addpd 0 * SIZE(X), %xmm0 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + ALIGN_3 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemv_t.S b/kernel/x86/zgemv_t.S new file mode 100644 index 0000000..452794c --- /dev/null +++ b/kernel/x86/zgemv_t.S @@ -0,0 +1,386 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef PENTIUM +#define P 88 +#endif + +#ifndef P +#define P 400 +#endif + +#define STACK 16 +#define ARGS 24 + +#define NLDA 0 + STACK(%esp) +#define XP 4 + STACK(%esp) +#define MIN_M 8 + STACK(%esp) +#define J 12 + STACK(%esp) +#define IS 16 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define LDA 36 + STACK + ARGS(%esp) +#define X 40 + STACK + ARGS(%esp) +#define INCX 44 + STACK + ARGS(%esp) +#define Y 48 + STACK + ARGS(%esp) +#define INCY 52 + STACK + ARGS(%esp) +#define BUFFER 56 + STACK + ARGS(%esp) +#else +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 20 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define LDA 28 + STACK + ARGS(%esp) +#define X 32 + STACK + ARGS(%esp) +#define INCX 36 + STACK + ARGS(%esp) +#define Y 40 + STACK + ARGS(%esp) +#define INCY 44 + STACK + ARGS(%esp) +#define BUFFER 48 + STACK + ARGS(%esp) +#endif + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + FLD ALPHA_I + FLD ALPHA_R + + movl X, %edi # X + + movl $0, IS + + movl M, %ebx + movl N, %ecx + testl %ebx, %ebx + jle .L79 + + testl %ecx, %ecx + jle .L79 + + movl INCX, %esi + addl %esi, %esi + leal (,%esi,SIZE), %esi + movl %esi, INCX + + movl INCY, %esi + addl %esi, %esi + leal (, %esi, SIZE), %esi + movl %esi, INCY + + movl LDA, %ebx + + movl N, %eax + imull %ebx, %eax + movl $P, %esi + subl %eax, %esi + leal (, %esi, SIZE), %esi + addl %esi, %esi + movl %esi, NLDA + + leal (,%ebx,SIZE), %esi + addl %esi, %esi + movl %esi, LDA + ALIGN_2 + +.L32: + movl IS, %esi + + movl $P, %edx + movl M, %eax + subl %esi, %eax + cmpl %edx, %eax +#ifdef PENTIUM + jle .L33 + movl %edx, %eax +.L33: +#else + cmovg %edx, %eax +#endif + movl %eax, MIN_M + + movl IS, %ecx + addl %ecx, %ecx + leal (%edi,%ecx,SIZE), %ecx # xp = x + is + movl INCX, %ebx + movl %ecx, XP + cmpl $2 * SIZE, %ebx + je .L34 + + movl BUFFER, %esi + movl MIN_M, %eax + movl %esi, XP + sarl $1, %eax + jle .L35 + + ALIGN_3 + +.L36: + FLD 0 * SIZE(%edi) + FLD 1 * SIZE(%edi) + addl %ebx,%edi # x += incx + FLD 0 * SIZE(%edi) + FLD 1 * SIZE(%edi) + addl %ebx,%edi # x += incx + + FST 3 * SIZE(%esi) + FST 2 * SIZE(%esi) + FST 1 * SIZE(%esi) + FST 0 * SIZE(%esi) + + addl $4 * SIZE, %esi # xp += 4 + decl %eax + jg .L36 + ALIGN_3 + +.L35: + movl MIN_M, %eax + andl $1,%eax + jle .L34 + + FLD 0 * SIZE(%edi) + FLD 1 * SIZE(%edi) + addl %ebx,%edi # x += incx + FST 1 * SIZE(%esi) + FST 0 * SIZE(%esi) + ALIGN_3 + +/* Main Routine */ + +.L34: + movl Y, %ebp # coffset = y + + movl N, %ecx + testl %ecx, %ecx + jle .L60 + ALIGN_2 + +.L61: + movl A, %ebx # a_offset = a + fldz # ct1 = ZERO + movl LDA, %edx + fldz # ct1 = ZERO + + addl %ebx, %edx + fldz # ct1 = ZERO + movl %edx, A + fldz # ct1 = ZERO + + movl XP, %esi + + FLD (%esi) # bt1 = *(b_offset + 0) + + movl MIN_M, %eax + sarl $1, %eax + jle .L64 + ALIGN_3 + +#define PRESIZE 8 + +.L65: +#ifdef HAS_PREFETCH + prefetcht0 PRESIZE * SIZE(%ebx) + prefetcht0 PRESIZE * SIZE(%esi) +#endif + + FLD 0 * SIZE(%ebx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FMUL 1 * SIZE(%ebx) # bt1 *= *(a_offset + 1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(%ebx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FMUL 1 * SIZE(%ebx) # bt1 *= *(a_offset + 1) + faddp %st, %st(4) # ct4 += bt1 + FLD 2 * SIZE(%esi) # bt1 = *(b_offset + 1) + + FLD 2 * SIZE(%ebx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FMUL 3 * SIZE(%ebx) # bt1 *= *(a_offset + 1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 3 * SIZE(%esi) # bt1 = *(b_offset + 1) + + FLD 2 * SIZE(%ebx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FMUL 3 * SIZE(%ebx) # bt1 *= *(a_offset + 1) + faddp %st, %st(4) # ct4 += bt1 + FLD 4 * SIZE(%esi) # bt1 = *(b_offset + 1) + + addl $4 * SIZE, %esi + addl $4 * SIZE, %ebx + decl %eax + jg .L65 + ALIGN_3 + +.L64: + movl MIN_M, %eax + andl $1, %eax + jle .L70 + ALIGN_3 + +.L71: + FLD 0 * SIZE(%ebx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FMUL 1 * SIZE(%ebx) # bt1 *= *(a_offset + 1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(%ebx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FMUL 1 * SIZE(%ebx) # bt1 *= *(a_offset + 1) + faddp %st, %st(4) # ct4 += bt1 + fldz + ALIGN_3 + +.L70: +#ifndef C_SUN + ffreep %st(0) +#else + .byte 0xdf + .byte 0xc0 +#endif + +#ifndef XCONJ +#ifndef CONJ + fsubp %st, %st(3) + faddp %st, %st(1) +#else + faddp %st, %st(3) + faddp %st, %st(1) +#endif +#else +#ifndef CONJ + faddp %st, %st(3) + fsubp %st, %st(1) +#else + fsubp %st, %st(3) + fsubp %st, %st(1) +#endif +#endif + + fld %st(0) # ct4 = ct2 + fmul %st(4), %st + fld %st(2) + fmul %st(4), %st + fsubp %st, %st(1) + + FADD 0 * SIZE(%ebp) + FST 0 * SIZE(%ebp) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + faddp %st, %st(1) + + FADD 1 * SIZE(%ebp) + FST 1 * SIZE(%ebp) + addl INCY, %ebp + + decl %ecx + jg .L61 + ALIGN_3 + +.L60: + movl A, %ebx + addl NLDA, %ebx + movl %ebx, A + + addl $P, IS + movl M, %esi + cmpl %esi, IS + jl .L32 + ALIGN_3 + +.L79: +#ifndef C_SUN + ffreep %st(0) + ffreep %st(0) +#else + .byte 0xdf + .byte 0xc0 + .byte 0xdf + .byte 0xc0 +#endif + + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE + diff --git a/kernel/x86/zgemv_t_atom.S b/kernel/x86/zgemv_t_atom.S new file mode 100644 index 0000000..6f0dee0 --- /dev/null +++ b/kernel/x86/zgemv_t_atom.S @@ -0,0 +1,445 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ATOM +#define PREFETCH prefetchnta +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 6) +#endif + +#define STACKSIZE 16 + +#define M 4 + STACKSIZE(%esp) +#define N 8 + STACKSIZE(%esp) +#define ALPHA_R 16 + STACKSIZE(%esp) +#define ALPHA_I 24 + STACKSIZE(%esp) +#define A 32 + STACKSIZE(%esp) +#define STACK_LDA 36 + STACKSIZE(%esp) +#define STACK_X 40 + STACKSIZE(%esp) +#define STACK_INCX 44 + STACKSIZE(%esp) +#define Y 48 + STACKSIZE(%esp) +#define STACK_INCY 52 + STACKSIZE(%esp) +#define BUFFER 56 + STACKSIZE(%esp) + +#define I %eax +#define J %ebx + +#define INCX J +#define INCY %ecx + +#define A1 %esi +#define X %edx +#define Y1 %edi +#define LDA %ebp + +#if !defined(CONJ) && !defined(XCONJ) +#define ADD1 addsd +#define ADD2 addsd +#define ADD3 subsd +#define ADD4 addsd +#endif + +#if defined(CONJ) && !defined(XCONJ) +#define ADD1 addsd +#define ADD2 addsd +#define ADD3 addsd +#define ADD4 subsd +#endif + +#if !defined(CONJ) && defined(XCONJ) +#define ADD1 addsd +#define ADD2 subsd +#define ADD3 addsd +#define ADD4 addsd +#endif + +#if defined(CONJ) && defined(XCONJ) +#define ADD1 addsd +#define ADD2 subsd +#define ADD3 subsd +#define ADD4 subsd +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_LDA, LDA + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_INCY, INCY + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + sall $ZBASE_SHIFT, LDA + + subl $-16 * SIZE, A + + cmpl $0, N + jle .L999 + cmpl $0, M + jle .L999 + + movl BUFFER, Y1 + + movl M, I + sarl $2, I + jle .L05 + ALIGN_4 + +.L02: + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + addl INCX, X + + movsd 0 * SIZE(X), %xmm1 + movhpd 1 * SIZE(X), %xmm1 + addl INCX, X + + movsd 0 * SIZE(X), %xmm2 + movhpd 1 * SIZE(X), %xmm2 + addl INCX, X + + movsd 0 * SIZE(X), %xmm3 + movhpd 1 * SIZE(X), %xmm3 + addl INCX, X + + movapd %xmm0, 0 * SIZE(Y1) + movapd %xmm1, 2 * SIZE(Y1) + movapd %xmm2, 4 * SIZE(Y1) + movapd %xmm3, 6 * SIZE(Y1) + + addl $8 * SIZE, Y1 + decl I + jg .L02 + ALIGN_4 + +.L05: + movl M, I + andl $3, I + jle .L10 + ALIGN_2 + +.L06: + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + addl INCX, X + + movapd %xmm0, 0 * SIZE(Y1) + addl $2 * SIZE, Y1 + decl I + jg .L06 + ALIGN_4 + +.L10: + movl Y, Y1 + + movl N, J + ALIGN_3 + +.L11: + movl BUFFER, X + addl $16 * SIZE, X + + movl A, A1 + addl LDA, A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + movsd -16 * SIZE(X), %xmm2 + movsd -15 * SIZE(X), %xmm3 + + movl M, I + sarl $2, I + jle .L15 + + movsd -16 * SIZE(A1), %xmm4 + movsd -15 * SIZE(A1), %xmm5 + + movapd %xmm4, %xmm6 + mulsd %xmm2, %xmm4 + mulsd %xmm3, %xmm6 + + decl I + jle .L13 + ALIGN_4 + +.L12: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(A1) +#endif + + movapd %xmm5, %xmm7 + mulsd %xmm3, %xmm5 + movsd -13 * SIZE(X), %xmm3 + ADD1 %xmm4, %xmm0 + movsd -14 * SIZE(A1), %xmm4 + mulsd %xmm2, %xmm7 + movsd -14 * SIZE(X), %xmm2 + ADD2 %xmm6, %xmm1 + + movapd %xmm4, %xmm6 + mulsd %xmm2, %xmm4 + ADD3 %xmm5, %xmm0 + movsd -13 * SIZE(A1), %xmm5 + mulsd %xmm3, %xmm6 + ADD4 %xmm7, %xmm1 + + movapd %xmm5, %xmm7 + mulsd %xmm3, %xmm5 + movsd -11 * SIZE(X), %xmm3 + ADD1 %xmm4, %xmm0 + movsd -12 * SIZE(A1), %xmm4 + mulsd %xmm2, %xmm7 + movsd -12 * SIZE(X), %xmm2 + ADD2 %xmm6, %xmm1 + + movapd %xmm4, %xmm6 + mulsd %xmm2, %xmm4 + ADD3 %xmm5, %xmm0 + movsd -11 * SIZE(A1), %xmm5 + mulsd %xmm3, %xmm6 + ADD4 %xmm7, %xmm1 + + movapd %xmm5, %xmm7 + mulsd %xmm3, %xmm5 + movsd -9 * SIZE(X), %xmm3 + ADD1 %xmm4, %xmm0 + movsd -10 * SIZE(A1), %xmm4 + mulsd %xmm2, %xmm7 + movsd -10 * SIZE(X), %xmm2 + ADD2 %xmm6, %xmm1 + + movapd %xmm4, %xmm6 + mulsd %xmm2, %xmm4 + ADD3 %xmm5, %xmm0 + movsd -9 * SIZE(A1), %xmm5 + mulsd %xmm3, %xmm6 + ADD4 %xmm7, %xmm1 + + movapd %xmm5, %xmm7 + mulsd %xmm3, %xmm5 + movsd -7 * SIZE(X), %xmm3 + ADD1 %xmm4, %xmm0 + movsd -8 * SIZE(A1), %xmm4 + mulsd %xmm2, %xmm7 + movsd -8 * SIZE(X), %xmm2 + ADD2 %xmm6, %xmm1 + + movapd %xmm4, %xmm6 + mulsd %xmm2, %xmm4 + ADD3 %xmm5, %xmm0 + movsd -7 * SIZE(A1), %xmm5 + mulsd %xmm3, %xmm6 + ADD4 %xmm7, %xmm1 + + addl $8 * SIZE, A1 + addl $8 * SIZE, X + + decl I + jg .L12 + ALIGN_4 + +.L13: + movapd %xmm5, %xmm7 + mulsd %xmm3, %xmm5 + movsd -13 * SIZE(X), %xmm3 + ADD1 %xmm4, %xmm0 + movsd -14 * SIZE(A1), %xmm4 + mulsd %xmm2, %xmm7 + movsd -14 * SIZE(X), %xmm2 + ADD2 %xmm6, %xmm1 + + movapd %xmm4, %xmm6 + mulsd %xmm2, %xmm4 + ADD3 %xmm5, %xmm0 + movsd -13 * SIZE(A1), %xmm5 + mulsd %xmm3, %xmm6 + ADD4 %xmm7, %xmm1 + + movapd %xmm5, %xmm7 + mulsd %xmm3, %xmm5 + movsd -11 * SIZE(X), %xmm3 + ADD1 %xmm4, %xmm0 + movsd -12 * SIZE(A1), %xmm4 + mulsd %xmm2, %xmm7 + movsd -12 * SIZE(X), %xmm2 + ADD2 %xmm6, %xmm1 + + movapd %xmm4, %xmm6 + mulsd %xmm2, %xmm4 + ADD3 %xmm5, %xmm0 + movsd -11 * SIZE(A1), %xmm5 + mulsd %xmm3, %xmm6 + ADD4 %xmm7, %xmm1 + + movapd %xmm5, %xmm7 + mulsd %xmm3, %xmm5 + movsd -9 * SIZE(X), %xmm3 + ADD1 %xmm4, %xmm0 + movsd -10 * SIZE(A1), %xmm4 + mulsd %xmm2, %xmm7 + movsd -10 * SIZE(X), %xmm2 + ADD2 %xmm6, %xmm1 + + movapd %xmm4, %xmm6 + mulsd %xmm2, %xmm4 + ADD3 %xmm5, %xmm0 + movsd -9 * SIZE(A1), %xmm5 + mulsd %xmm3, %xmm6 + ADD4 %xmm7, %xmm1 + + movapd %xmm5, %xmm7 + mulsd %xmm3, %xmm5 + movsd -7 * SIZE(X), %xmm3 + ADD1 %xmm4, %xmm0 + mulsd %xmm2, %xmm7 + movsd -8 * SIZE(X), %xmm2 + ADD2 %xmm6, %xmm1 + + ADD3 %xmm5, %xmm0 + ADD4 %xmm7, %xmm1 + + addl $8 * SIZE, A1 + addl $8 * SIZE, X + ALIGN_4 + +.L15: + testl $2, M + jle .L17 + + movsd -16 * SIZE(A1), %xmm4 + movsd -15 * SIZE(A1), %xmm5 + + movapd %xmm4, %xmm6 + mulsd %xmm2, %xmm4 + mulsd %xmm3, %xmm6 + + movapd %xmm5, %xmm7 + mulsd %xmm3, %xmm5 + movsd -13 * SIZE(X), %xmm3 + ADD1 %xmm4, %xmm0 + movsd -14 * SIZE(A1), %xmm4 + mulsd %xmm2, %xmm7 + movsd -14 * SIZE(X), %xmm2 + ADD2 %xmm6, %xmm1 + + movapd %xmm4, %xmm6 + mulsd %xmm2, %xmm4 + ADD3 %xmm5, %xmm0 + movsd -13 * SIZE(A1), %xmm5 + mulsd %xmm3, %xmm6 + ADD4 %xmm7, %xmm1 + + movapd %xmm5, %xmm7 + mulsd %xmm3, %xmm5 + movsd -11 * SIZE(X), %xmm3 + ADD1 %xmm4, %xmm0 + mulsd %xmm2, %xmm7 + movsd -12 * SIZE(X), %xmm2 + ADD2 %xmm6, %xmm1 + + ADD3 %xmm5, %xmm0 + ADD4 %xmm7, %xmm1 + + addl $4 * SIZE, A1 + ALIGN_4 + +.L17: + testl $1, M + jle .L18 + + movsd -16 * SIZE(A1), %xmm4 + movsd -15 * SIZE(A1), %xmm5 + + movapd %xmm4, %xmm6 + mulsd %xmm2, %xmm4 + mulsd %xmm3, %xmm6 + + movapd %xmm5, %xmm7 + mulsd %xmm3, %xmm5 + ADD1 %xmm4, %xmm0 + mulsd %xmm2, %xmm7 + ADD2 %xmm6, %xmm1 + + ADD3 %xmm5, %xmm0 + ADD4 %xmm7, %xmm1 + ALIGN_4 + +.L18: + movsd 0 * SIZE(Y1), %xmm4 + movapd %xmm0, %xmm2 + mulsd ALPHA_R, %xmm0 + movsd 1 * SIZE(Y1), %xmm5 + movapd %xmm1, %xmm3 + mulsd ALPHA_R, %xmm1 + + mulsd ALPHA_I, %xmm2 + mulsd ALPHA_I, %xmm3 + + addsd %xmm2, %xmm1 + subsd %xmm3, %xmm0 + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + + movlpd %xmm0, 0 * SIZE(Y1) + movlpd %xmm1, 1 * SIZE(Y1) + + addl INCY, Y1 + + decl J + jg .L11 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemv_t_sse.S b/kernel/x86/zgemv_t_sse.S new file mode 100644 index 0000000..4312ed1 --- /dev/null +++ b/kernel/x86/zgemv_t_sse.S @@ -0,0 +1,522 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef movsd +#undef movsd +#endif + +#ifdef PENTIUM3 +#ifdef HAVE_SSE +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 2) +#endif +#define movsd movlps +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 2) +#endif + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 7) +#endif + +#ifdef OPTERON +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 3) +#define movsd movlps +#endif + +#ifdef BARCELONA +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 5) +#endif + +#ifdef ATOM +#define PREFETCH prefetchnta +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 6) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (16 * 4) +#endif + +#define STACKSIZE 16 + +#define M 4 + STACKSIZE(%esp) +#define N 8 + STACKSIZE(%esp) +#define ALPHA_R 16 + STACKSIZE(%esp) +#define ALPHA_I 20 + STACKSIZE(%esp) +#define A 24 + STACKSIZE(%esp) +#define STACK_LDA 28 + STACKSIZE(%esp) +#define STACK_X 32 + STACKSIZE(%esp) +#define STACK_INCX 36 + STACKSIZE(%esp) +#define Y 40 + STACKSIZE(%esp) +#define STACK_INCY 44 + STACKSIZE(%esp) +#define BUFFER 48 + STACKSIZE(%esp) + +#define I %eax +#define J %ebx + +#define INCX J +#define INCY %ecx + +#define A1 %esi +#define X %edx +#define Y1 %edi +#define LDA %ebp + +#undef SUBPS + +#ifndef CONJ +#define SUBPS addps +#else +#define SUBPS subps +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_LDA, LDA + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_INCY, INCY + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, LDA + sall $ZBASE_SHIFT, INCY + + subl $-32 * SIZE, A + + cmpl $0, N + jle .L999 + cmpl $0, M + jle .L999 + + movl BUFFER, Y1 + + movl M, I + sarl $2, I + jle .L05 + ALIGN_4 + +.L02: + movsd (X), %xmm0 + addl INCX, X + movhps (X), %xmm0 + addl INCX, X + + movsd (X), %xmm1 + addl INCX, X + movhps (X), %xmm1 + addl INCX, X + + movsd (X), %xmm2 + addl INCX, X + movhps (X), %xmm2 + addl INCX, X + + movsd (X), %xmm3 + addl INCX, X + movhps (X), %xmm3 + addl INCX, X + + movaps %xmm0, 0 * SIZE(Y1) + movaps %xmm1, 4 * SIZE(Y1) + movaps %xmm2, 8 * SIZE(Y1) + movaps %xmm3, 12 * SIZE(Y1) + + addl $16 * SIZE, Y1 + decl I + jg .L02 + ALIGN_4 + +.L05: + movl M, I + andl $3, I + jle .L10 + ALIGN_2 + +.L06: + movsd (X), %xmm0 + addl INCX, X + + movlps %xmm0, (Y1) + addl $2 * SIZE, Y1 + decl I + jg .L06 + ALIGN_4 + +.L10: + movl Y, Y1 + + movl N, J + ALIGN_3 + +.L11: + movl BUFFER, X + addl $32 * SIZE, X + + movl A, A1 + addl LDA, A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + movaps -32 * SIZE(X), %xmm2 + movaps -28 * SIZE(X), %xmm3 + + movl M, I + sarl $3, I + jle .L15 + + movsd -32 * SIZE(A1), %xmm4 + movhps -30 * SIZE(A1), %xmm4 + movsd -28 * SIZE(A1), %xmm6 + movhps -26 * SIZE(A1), %xmm6 + + decl I + jle .L13 + ALIGN_4 + +.L12: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(A1) +#endif + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm4, %xmm5 +#else + movaps %xmm4, %xmm5 + shufps $0xb1, %xmm5, %xmm5 +#endif + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm2, %xmm5 + movaps -24 * SIZE(X), %xmm2 + SUBPS %xmm5, %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm6, %xmm7 +#else + movaps %xmm6, %xmm7 + shufps $0xb1, %xmm7, %xmm7 +#endif + mulps %xmm3, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm3, %xmm7 + movaps -20 * SIZE(X), %xmm3 + SUBPS %xmm7, %xmm1 + + movsd -24 * SIZE(A1), %xmm4 + movhps -22 * SIZE(A1), %xmm4 + movsd -20 * SIZE(A1), %xmm6 + movhps -18 * SIZE(A1), %xmm6 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm4, %xmm5 +#else + movaps %xmm4, %xmm5 + shufps $0xb1, %xmm5, %xmm5 +#endif + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm2, %xmm5 + movaps -16 * SIZE(X), %xmm2 + SUBPS %xmm5, %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm6, %xmm7 +#else + movaps %xmm6, %xmm7 + shufps $0xb1, %xmm7, %xmm7 +#endif + mulps %xmm3, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm3, %xmm7 + movaps -12 * SIZE(X), %xmm3 + SUBPS %xmm7, %xmm1 + + movsd -16 * SIZE(A1), %xmm4 + movhps -14 * SIZE(A1), %xmm4 + movsd -12 * SIZE(A1), %xmm6 + movhps -10 * SIZE(A1), %xmm6 + + addl $16 * SIZE, A1 + addl $16 * SIZE, X + + decl I + jg .L12 + ALIGN_4 + +.L13: +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm4, %xmm5 +#else + movaps %xmm4, %xmm5 + shufps $0xb1, %xmm5, %xmm5 +#endif + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm2, %xmm5 + movaps -24 * SIZE(X), %xmm2 + SUBPS %xmm5, %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm6, %xmm7 +#else + movaps %xmm6, %xmm7 + shufps $0xb1, %xmm7, %xmm7 +#endif + mulps %xmm3, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm3, %xmm7 + movaps -20 * SIZE(X), %xmm3 + SUBPS %xmm7, %xmm1 + + movsd -24 * SIZE(A1), %xmm4 + movhps -22 * SIZE(A1), %xmm4 + movsd -20 * SIZE(A1), %xmm6 + movhps -18 * SIZE(A1), %xmm6 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm4, %xmm5 +#else + movaps %xmm4, %xmm5 + shufps $0xb1, %xmm5, %xmm5 +#endif + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm2, %xmm5 + movaps -16 * SIZE(X), %xmm2 + SUBPS %xmm5, %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm6, %xmm7 +#else + movaps %xmm6, %xmm7 + shufps $0xb1, %xmm7, %xmm7 +#endif + mulps %xmm3, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm3, %xmm7 + movaps -12 * SIZE(X), %xmm3 + SUBPS %xmm7, %xmm1 + + addl $16 * SIZE, A1 + addl $16 * SIZE, X + ALIGN_4 + +.L15: + testl $4, M + jle .L17 + + movsd -32 * SIZE(A1), %xmm4 + movhps -30 * SIZE(A1), %xmm4 + movsd -28 * SIZE(A1), %xmm6 + movhps -26 * SIZE(A1), %xmm6 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm4, %xmm5 +#else + movaps %xmm4, %xmm5 + shufps $0xb1, %xmm5, %xmm5 +#endif + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm2, %xmm5 + movaps -24 * SIZE(X), %xmm2 + SUBPS %xmm5, %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm6, %xmm7 +#else + movaps %xmm6, %xmm7 + shufps $0xb1, %xmm7, %xmm7 +#endif + mulps %xmm3, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm3, %xmm7 + movaps -20 * SIZE(X), %xmm3 + SUBPS %xmm7, %xmm1 + + addl $8 * SIZE, A1 + addl $8 * SIZE, X + ALIGN_4 + +.L17: + testl $2, M + jle .L18 + + movsd -32 * SIZE(A1), %xmm4 + movhps -30 * SIZE(A1), %xmm4 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm4, %xmm5 +#else + movaps %xmm4, %xmm5 + shufps $0xb1, %xmm5, %xmm5 +#endif + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm2, %xmm5 + SUBPS %xmm5, %xmm1 + movaps %xmm3, %xmm2 + + addl $4 * SIZE, A1 + ALIGN_4 + +.L18: + testl $1, M + jle .L19 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm4 + shufps $0x44, %xmm2, %xmm2 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm4, %xmm5 +#else + movaps %xmm4, %xmm5 + shufps $0xb1, %xmm5, %xmm5 +#endif + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm2, %xmm5 + SUBPS %xmm5, %xmm1 + ALIGN_4 + +.L19: +#ifdef HAVE_SSE2 + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 +#else + subl $8, %esp + movl $0x00000000, 0(%esp) + movl $0x80000000, 4(%esp) + movlps (%esp), %xmm5 + addl $8, %esp + movlhps %xmm5, %xmm5 +#endif + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorps %xmm5, %xmm0 +#else + xorps %xmm5, %xmm1 +#endif + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 + haddps %xmm0, %xmm0 +#else + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + addps %xmm2, %xmm0 + + movhlps %xmm0, %xmm1 + + addps %xmm1, %xmm0 +#endif + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm0, %xmm1 +#else + movaps %xmm0, %xmm1 + shufps $0xb1, %xmm1, %xmm1 +#endif + + movsd ALPHA_R, %xmm7 + movlhps %xmm7, %xmm7 + + mulps %xmm7, %xmm0 + mulps %xmm7, %xmm1 + + xorps %xmm5, %xmm0 + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 +#else + movaps %xmm0, %xmm2 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm1, %xmm2 + + addps %xmm2, %xmm0 +#endif + + movsd 0 * SIZE(Y1), %xmm4 + + shufps $0xd8, %xmm0, %xmm0 + addps %xmm4, %xmm0 + + movlps %xmm0, 0 * SIZE(Y1) + addl INCY, Y1 + + decl J + jg .L11 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemv_t_sse2.S b/kernel/x86/zgemv_t_sse2.S new file mode 100644 index 0000000..78ca14c --- /dev/null +++ b/kernel/x86/zgemv_t_sse2.S @@ -0,0 +1,404 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 2) +#endif + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 7) +#endif + +#ifdef OPTERON +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (8 * 3) +#define movsd movlps +#endif + +#ifdef BARCELONA +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (8 * 5) +#endif + +#ifdef ATOM +#define PREFETCH prefetchnta +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 6) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 4) +#endif + +#define STACKSIZE 16 + +#define M 4 + STACKSIZE(%esp) +#define N 8 + STACKSIZE(%esp) +#define ALPHA_R 16 + STACKSIZE(%esp) +#define ALPHA_I 24 + STACKSIZE(%esp) +#define A 32 + STACKSIZE(%esp) +#define STACK_LDA 36 + STACKSIZE(%esp) +#define STACK_X 40 + STACKSIZE(%esp) +#define STACK_INCX 44 + STACKSIZE(%esp) +#define Y 48 + STACKSIZE(%esp) +#define STACK_INCY 52 + STACKSIZE(%esp) +#define BUFFER 56 + STACKSIZE(%esp) + +#define I %eax +#define J %ebx + +#define INCX J +#define INCY %ecx + +#define A1 %esi +#define X %edx +#define Y1 %edi +#define LDA %ebp + +#undef SUBPD + +#ifndef CONJ +#define SUBPD addpd +#else +#define SUBPD subpd +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_LDA, LDA + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_INCY, INCY + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + sall $ZBASE_SHIFT, LDA + + subl $-16 * SIZE, A + + cmpl $0, N + jle .L999 + cmpl $0, M + jle .L999 + + movl BUFFER, Y1 + + movl M, I + sarl $2, I + jle .L05 + ALIGN_4 + +.L02: + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + addl INCX, X + + movsd 0 * SIZE(X), %xmm1 + movhpd 1 * SIZE(X), %xmm1 + addl INCX, X + + movsd 0 * SIZE(X), %xmm2 + movhpd 1 * SIZE(X), %xmm2 + addl INCX, X + + movsd 0 * SIZE(X), %xmm3 + movhpd 1 * SIZE(X), %xmm3 + addl INCX, X + + movapd %xmm0, 0 * SIZE(Y1) + movapd %xmm1, 2 * SIZE(Y1) + movapd %xmm2, 4 * SIZE(Y1) + movapd %xmm3, 6 * SIZE(Y1) + + addl $8 * SIZE, Y1 + decl I + jg .L02 + ALIGN_4 + +.L05: + movl M, I + andl $3, I + jle .L10 + ALIGN_2 + +.L06: + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + addl INCX, X + + movapd %xmm0, 0 * SIZE(Y1) + addl $2 * SIZE, Y1 + decl I + jg .L06 + ALIGN_4 + +.L10: + movl Y, Y1 + + movl N, J + ALIGN_3 + +.L11: + movl BUFFER, X + addl $16 * SIZE, X + + movl A, A1 + addl LDA, A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + movapd -16 * SIZE(X), %xmm2 + movapd -14 * SIZE(X), %xmm3 + + movl M, I + sarl $2, I + jle .L15 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + movsd -14 * SIZE(A1), %xmm6 + movhpd -13 * SIZE(A1), %xmm6 + + decl I + jle .L13 + ALIGN_4 + +.L12: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(A1) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm2, %xmm4 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A1), %xmm4 + movhpd -11 * SIZE(A1), %xmm4 + mulpd %xmm2, %xmm5 + SUBPD %xmm5, %xmm1 + movapd -12 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm0 + movsd -10 * SIZE(A1), %xmm6 + movhpd -9 * SIZE(A1), %xmm6 + mulpd %xmm3, %xmm7 + SUBPD %xmm7, %xmm1 + movapd -10 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm2, %xmm4 + addpd %xmm4, %xmm0 + movsd -8 * SIZE(A1), %xmm4 + movhpd -7 * SIZE(A1), %xmm4 + mulpd %xmm2, %xmm5 + movapd -8 * SIZE(X), %xmm2 + SUBPD %xmm5, %xmm1 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm0 + movsd -6 * SIZE(A1), %xmm6 + movhpd -5 * SIZE(A1), %xmm6 + mulpd %xmm3, %xmm7 + movapd -6 * SIZE(X), %xmm3 + SUBPD %xmm7, %xmm1 + + addl $8 * SIZE, A1 + addl $8 * SIZE, X + + decl I + jg .L12 + ALIGN_4 + +.L13: + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm2, %xmm4 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A1), %xmm4 + movhpd -11 * SIZE(A1), %xmm4 + mulpd %xmm2, %xmm5 + SUBPD %xmm5, %xmm1 + movapd -12 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm0 + movsd -10 * SIZE(A1), %xmm6 + movhpd -9 * SIZE(A1), %xmm6 + mulpd %xmm3, %xmm7 + SUBPD %xmm7, %xmm1 + movapd -10 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm2, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm2, %xmm5 + movapd -8 * SIZE(X), %xmm2 + SUBPD %xmm5, %xmm1 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm3, %xmm7 + movapd -6 * SIZE(X), %xmm3 + SUBPD %xmm7, %xmm1 + + addl $8 * SIZE, A1 + addl $8 * SIZE, X + ALIGN_4 + +.L15: + testl $2, M + jle .L17 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + movsd -14 * SIZE(A1), %xmm6 + movhpd -13 * SIZE(A1), %xmm6 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm2, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm2, %xmm5 + movapd -12 * SIZE(X), %xmm2 + SUBPD %xmm5, %xmm1 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm3, %xmm7 + SUBPD %xmm7, %xmm1 + + addl $4 * SIZE, A1 + ALIGN_4 + +.L17: + testl $1, M + jle .L18 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm2, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm2, %xmm5 + SUBPD %xmm5, %xmm1 + ALIGN_4 + +.L18: + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + shufps $0xc0, %xmm5, %xmm5 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorpd %xmm5, %xmm0 +#else + xorpd %xmm5, %xmm1 +#endif + +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 +#else + movapd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm2 + + addpd %xmm2, %xmm0 +#endif + + pshufd $0x4e, %xmm0, %xmm1 + +#ifdef HAVE_SSE3 + movddup ALPHA_R, %xmm6 + movddup ALPHA_I, %xmm7 +#else + movsd ALPHA_R, %xmm6 + movsd ALPHA_I, %xmm7 + + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 +#endif + + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm1 + + xorpd %xmm5, %xmm1 + + subpd %xmm1, %xmm0 + + movsd 0 * SIZE(Y1), %xmm4 + movhpd 1 * SIZE(Y1), %xmm4 + + addpd %xmm4, %xmm0 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addl INCY, Y1 + + decl J + jg .L11 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/znrm2.S b/kernel/x86/znrm2.S new file mode 100644 index 0000000..c645b57 --- /dev/null +++ b/kernel/x86/znrm2.S @@ -0,0 +1,228 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define M %edx +#define X %ecx +#define INCX %esi + +#define I %eax + +#include "l1param.h" + + PROLOGUE + + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + +#ifdef F_INTERFACE + movl (M), %ebx + movl (INCX), INCX +#endif + + fldz + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + sall $ZBASE_SHIFT, INCX + fldz + fldz + fldz + cmpl $SIZE * 2, INCX + jne .L40 + + movl M, I + sarl $2, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + fmul %st(0), %st + FLD 2 * SIZE(X) + fmul %st(0), %st + FLD 3 * SIZE(X) + fmul %st(0), %st + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 4 * SIZE(X) + fmul %st(0), %st + FLD 5 * SIZE(X) + fmul %st(0), %st + FLD 6 * SIZE(X) + fmul %st(0), %st + FLD 7 * SIZE(X) + fmul %st(0), %st + + addl $8 * SIZE, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decl I + jg .L10 + ALIGN_4 + +.L20: + movl M, I + andl $3, I + jle .L998 + ALIGN_4 + + +.L21: + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + fmul %st(0), %st + faddp %st,%st(3) + faddp %st,%st(1) + addl $2 * SIZE, X + decl I + jg .L21 + jmp .L998 + ALIGN_4 + +.L40: + movl M, I + sarl $2, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + addl INCX, X + fmul %st(0), %st + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + addl INCX, X + fmul %st(0), %st + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + addl INCX, X + fmul %st(0), %st + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + addl INCX, X + fmul %st(0), %st + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decl I + jg .L50 + ALIGN_4 + +.L60: + movl M, I + andl $3, I + jle .L998 + ALIGN_4 + + +.L61: + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + addl INCX, X + fmul %st(0), %st + faddp %st,%st(3) + faddp %st,%st(1) + decl I + jg .L61 + ALIGN_4 + +.L998: + faddp %st,%st(2) + faddp %st,%st(1) + faddp %st,%st(1) + ALIGN_4 + +.L999: + fsqrt + popl %ebx + popl %esi + ret + + EPILOGUE diff --git a/kernel/x86/znrm2_sse.S b/kernel/x86/znrm2_sse.S new file mode 100644 index 0000000..95ca9fd --- /dev/null +++ b/kernel/x86/znrm2_sse.S @@ -0,0 +1,465 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define M %edx +#define X %ecx +#define INCX %esi + +#define I %eax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + + pxor %xmm0, %xmm0 + testl M, M + jle .L999 + pxor %xmm1, %xmm1 + testl INCX, INCX + jle .L999 + + sall $ZBASE_SHIFT, INCX + + cmpl $2 * SIZE, INCX + jne .L40 + + addl M, M + + subl $-32 * SIZE, X + + testl $SIZE, X + je .L05 + + movss -32 * SIZE(X), %xmm0 + cvtss2sd %xmm0, %xmm0 + mulsd %xmm0, %xmm0 + + addl $SIZE, X + decl M + jle .L998 + ALIGN_3 + +.L05: + movl M, I + sarl $4, I + jle .L13 + + movsd -32 * SIZE(X), %xmm4 + movsd -30 * SIZE(X), %xmm5 + movsd -28 * SIZE(X), %xmm6 + movsd -26 * SIZE(X), %xmm7 + + decl I + jle .L12 + ALIGN_3 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + cvtps2pd %xmm4, %xmm2 + movsd -24 * SIZE(X), %xmm4 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm5, %xmm3 + movsd -22 * SIZE(X), %xmm5 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm6, %xmm2 + movsd -20 * SIZE(X), %xmm6 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm7, %xmm3 + movsd -18 * SIZE(X), %xmm7 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm4, %xmm2 + movsd -16 * SIZE(X), %xmm4 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm5, %xmm3 + movsd -14 * SIZE(X), %xmm5 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm6, %xmm2 + movsd -12 * SIZE(X), %xmm6 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm7, %xmm3 + movsd -10 * SIZE(X), %xmm7 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + subl $-16 * SIZE, X + decl I + jg .L10 + ALIGN_3 + +.L12: + cvtps2pd %xmm4, %xmm2 + movsd -24 * SIZE(X), %xmm4 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm5, %xmm3 + movsd -22 * SIZE(X), %xmm5 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm6, %xmm2 + movsd -20 * SIZE(X), %xmm6 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm7, %xmm3 + movsd -18 * SIZE(X), %xmm7 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm4, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm5, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm6, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm7, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + subl $-16 * SIZE, X + ALIGN_4 + +.L13: + testl $8, M + je .L14 + + movsd -32 * SIZE(X), %xmm4 + + cvtps2pd %xmm4, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + movsd -30 * SIZE(X), %xmm5 + + cvtps2pd %xmm5, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + movsd -28 * SIZE(X), %xmm6 + + cvtps2pd %xmm6, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + movsd -26 * SIZE(X), %xmm7 + + cvtps2pd %xmm7, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + addl $8 * SIZE, X + ALIGN_3 + +.L14: + testl $4, M + je .L15 + + movsd -32 * SIZE(X), %xmm4 + cvtps2pd %xmm4, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + movsd -30 * SIZE(X), %xmm5 + cvtps2pd %xmm5, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + addl $4 * SIZE, X + ALIGN_3 + +.L15: + testl $2, M + je .L16 + + movsd -32 * SIZE(X), %xmm4 + cvtps2pd %xmm4, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + addl $2 * SIZE, X + ALIGN_3 + +.L16: + testl $1, M + je .L998 + + movss -32 * SIZE(X), %xmm4 + cvtss2sd %xmm4, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm1 + jmp .L998 + ALIGN_4 + +.L40: + movl M, I + sarl $3, I + jle .L43 + + movsd (X), %xmm4 + addl INCX, X + movsd (X), %xmm5 + addl INCX, X + movsd (X), %xmm6 + addl INCX, X + movsd (X), %xmm7 + addl INCX, X + + decl I + jle .L42 + ALIGN_3 + +.L41: + cvtps2pd %xmm4, %xmm2 + movsd (X), %xmm4 + addl INCX, X + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm5, %xmm3 + movsd (X), %xmm5 + addl INCX, X + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm6, %xmm2 + movsd (X), %xmm6 + addl INCX, X + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm7, %xmm3 + movsd (X), %xmm7 + addl INCX, X + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm4, %xmm2 + movsd (X), %xmm4 + addl INCX, X + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm5, %xmm3 + movsd (X), %xmm5 + addl INCX, X + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm6, %xmm2 + movsd (X), %xmm6 + addl INCX, X + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm7, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + movsd (X), %xmm7 + addl INCX, X + + decl I + jg .L41 + ALIGN_3 + +.L42: + cvtps2pd %xmm4, %xmm2 + movsd (X), %xmm4 + addl INCX, X + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm5, %xmm3 + movsd (X), %xmm5 + addl INCX, X + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm6, %xmm2 + movsd (X), %xmm6 + addl INCX, X + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm7, %xmm3 + movsd (X), %xmm7 + addl INCX, X + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm4, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm5, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm6, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm7, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + ALIGN_4 + +.L43: + testl $4, M + je .L44 + + movsd (X), %xmm4 + addl INCX, X + + cvtps2pd %xmm4, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + movsd (X), %xmm5 + addl INCX, X + + cvtps2pd %xmm5, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + movsd (X), %xmm6 + addl INCX, X + + cvtps2pd %xmm6, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + movsd (X), %xmm7 + addl INCX, X + + cvtps2pd %xmm7, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + ALIGN_3 + +.L44: + testl $2, M + je .L45 + + movsd (X), %xmm4 + addl INCX, X + cvtps2pd %xmm4, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + movsd (X), %xmm5 + addl INCX, X + cvtps2pd %xmm5, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + ALIGN_3 + +.L45: + testl $1, M + je .L998 + + movsd (X), %xmm4 + cvtps2pd %xmm4, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + ALIGN_4 + +.L998: + addpd %xmm1, %xmm0 + +#ifndef HAVE_SSE3 + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + addsd %xmm1, %xmm0 +#else + haddpd %xmm0, %xmm0 +#endif + ALIGN_4 + +.L999: + sqrtsd %xmm0, %xmm0 + + cvtsd2ss %xmm0, %xmm0 + + movss %xmm0, STACK_M + flds STACK_M + + popl %ebx + popl %esi + ret + + EPILOGUE diff --git a/kernel/x86/zrot.S b/kernel/x86/zrot.S new file mode 100644 index 0000000..7ac984e --- /dev/null +++ b/kernel/x86/zrot.S @@ -0,0 +1,407 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) +#define STACK_C 24 + STACK + ARGS(%esp) +#ifdef XDOUBLE +#define STACK_S 40 + STACK + ARGS(%esp) +#elif defined DOUBLE +#define STACK_S 32 + STACK + ARGS(%esp) +#else +#define STACK_S 28 + STACK + ARGS(%esp) +#endif + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#define I %eax + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCH_SIZE 144 +#endif + +#ifdef OPTERON +#define PREFETCH prefetchw +#define PREFETCH_SIZE 144 +#endif + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + FLD STACK_S + FLD STACK_C + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + + testl N, N + jle .L999 + + cmpl $2 * SIZE, INCX + jne .L50 + cmpl $2 * SIZE, INCY + jne .L50 + + movl N, I + sarl $1, I + jle .L15 + ALIGN_4 + +.L10: +#ifdef PENTIUM4 + PREFETCH (PREFETCH_SIZE + 0) * SIZE(X) +#endif +#ifdef OPTERON + PREFETCH (PREFETCH_SIZE + 0) * SIZE(X) +#endif + + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 1 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 1 * SIZE(Y) + +#ifdef PENTIUM4 + PREFETCH (PREFETCH_SIZE + 0) * SIZE(Y) +#endif +#ifdef OPTERON + PREFETCH (PREFETCH_SIZE + 0) * SIZE(Y) +#endif + + FLD 2 * SIZE(X) + FLD 2 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 2 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 2 * SIZE(Y) + + FLD 3 * SIZE(X) + FLD 3 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 3 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 3 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + + decl I + jg .L10 + ALIGN_4 + +.L15: + movl N, I + andl $1, I + jle .L999 + ALIGN_4 + +.L16: + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 1 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 1 * SIZE(Y) + jmp .L999 + ALIGN_4 + +.L50: + movl N, I + sarl $1, I + jle .L55 + ALIGN_4 + +.L51: + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 1 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 1 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + decl I + jg .L51 + ALIGN_4 + +.L55: + movl N, I + andl $1, I + jle .L999 + ALIGN_4 + +.L56: + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 1 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 1 * SIZE(Y) + ALIGN_4 + +.L999: + ffreep %st(0) + ffreep %st(0) + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/zrot_sse.S b/kernel/x86/zrot_sse.S new file mode 100644 index 0000000..d8d0100 --- /dev/null +++ b/kernel/x86/zrot_sse.S @@ -0,0 +1,1391 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) +#define STACK_C 24 + STACK + ARGS(%esp) +#define STACK_S 28 + STACK + ARGS(%esp) + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#define I %eax + +#include "l1param.h" + +#define C %xmm6 +#define S %xmm7 + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + + movss STACK_C, C + movss STACK_S, S + + shufps $0x0, C, C + shufps $0x0, S, S + + cmpl $0, N + jle .L999 + + cmpl $2 * SIZE, INCX + jne .L50 + cmpl $2 * SIZE, INCY + jne .L50 + + testl $2 * SIZE, X + je .L10 + +#ifndef HAVE_SSE2 + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 +#endif + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + addl $2 * SIZE, X + addl $2 * SIZE, Y + decl N + jle .L999 + +.L10: + testl $1 * SIZE, X + jne .L30 + + testl $3 * SIZE, Y + jne .L20 + + movl N, I + sarl $4, I + jle .L14 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps 0 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 0 * SIZE(Y) + + movaps 4 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 4 * SIZE(X) + movaps %xmm2, 4 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps 8 * SIZE(Y), %xmm1 + movaps 8 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 8 * SIZE(X) + movaps %xmm2, 8 * SIZE(Y) + + movaps 12 * SIZE(Y), %xmm1 + movaps 12 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 12 * SIZE(X) + movaps %xmm2, 12 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps 16 * SIZE(Y), %xmm1 + movaps 16 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 16 * SIZE(X) + movaps %xmm2, 16 * SIZE(Y) + + movaps 20 * SIZE(Y), %xmm1 + movaps 20 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 20 * SIZE(X) + movaps %xmm2, 20 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps 24 * SIZE(Y), %xmm1 + movaps 24 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 24 * SIZE(X) + movaps %xmm2, 24 * SIZE(Y) + + movaps 28 * SIZE(Y), %xmm1 + movaps 28 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 28 * SIZE(X) + movaps %xmm2, 28 * SIZE(Y) + + addl $32 * SIZE, X + addl $32 * SIZE, Y + + decl I + jg .L11 + ALIGN_3 + +.L14: + testl $15, N + jle .L999 + + testl $8, N + jle .L15 + + movaps 0 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 0 * SIZE(Y) + + movaps 4 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 4 * SIZE(X) + movaps %xmm2, 4 * SIZE(Y) + + movaps 8 * SIZE(Y), %xmm1 + movaps 8 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 8 * SIZE(X) + movaps %xmm2, 8 * SIZE(Y) + + movaps 12 * SIZE(Y), %xmm1 + movaps 12 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 12 * SIZE(X) + movaps %xmm2, 12 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L15: + testl $4, N + jle .L16 + + movaps 0 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 0 * SIZE(Y) + + movaps 4 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 4 * SIZE(X) + movaps %xmm2, 4 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L16: + testl $2, N + jle .L17 + + movaps 0 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 0 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L17: + testl $1, N + jle .L999 + +#ifndef HAVE_SSE2 + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L20: + movl N, I + sarl $4, I + jle .L24 + ALIGN_3 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + movsd 4 * SIZE(Y), %xmm1 + movhps 6 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 4 * SIZE(X) + movlps %xmm2, 4 * SIZE(Y) + movhps %xmm2, 6 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movsd 8 * SIZE(Y), %xmm1 + movhps 10 * SIZE(Y), %xmm1 + movaps 8 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 8 * SIZE(X) + movlps %xmm2, 8 * SIZE(Y) + movhps %xmm2, 10 * SIZE(Y) + + movsd 12 * SIZE(Y), %xmm1 + movhps 14 * SIZE(Y), %xmm1 + movaps 12 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 12 * SIZE(X) + movlps %xmm2, 12 * SIZE(Y) + movhps %xmm2, 14 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd 16 * SIZE(Y), %xmm1 + movhps 18 * SIZE(Y), %xmm1 + movaps 16 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 16 * SIZE(X) + movlps %xmm2, 16 * SIZE(Y) + movhps %xmm2, 18 * SIZE(Y) + + movsd 20 * SIZE(Y), %xmm1 + movhps 22 * SIZE(Y), %xmm1 + movaps 20 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 20 * SIZE(X) + movlps %xmm2, 20 * SIZE(Y) + movhps %xmm2, 22 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movsd 24 * SIZE(Y), %xmm1 + movhps 26 * SIZE(Y), %xmm1 + movaps 24 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 24 * SIZE(X) + movlps %xmm2, 24 * SIZE(Y) + movhps %xmm2, 26 * SIZE(Y) + + movsd 28 * SIZE(Y), %xmm1 + movhps 30 * SIZE(Y), %xmm1 + movaps 28 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 28 * SIZE(X) + movlps %xmm2, 28 * SIZE(Y) + movhps %xmm2, 30 * SIZE(Y) + + addl $32 * SIZE, X + addl $32 * SIZE, Y + + decl I + jg .L21 + ALIGN_3 + +.L24: + testl $15, N + jle .L999 + + testl $8, N + jle .L25 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + movsd 4 * SIZE(Y), %xmm1 + movhps 6 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 4 * SIZE(X) + movlps %xmm2, 4 * SIZE(Y) + movhps %xmm2, 6 * SIZE(Y) + + movsd 8 * SIZE(Y), %xmm1 + movhps 10 * SIZE(Y), %xmm1 + movaps 8 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 8 * SIZE(X) + movlps %xmm2, 8 * SIZE(Y) + movhps %xmm2, 10 * SIZE(Y) + + movsd 12 * SIZE(Y), %xmm1 + movhps 14 * SIZE(Y), %xmm1 + movaps 12 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 12 * SIZE(X) + movlps %xmm2, 12 * SIZE(Y) + movhps %xmm2, 14 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L25: + testl $4, N + jle .L26 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + movsd 4 * SIZE(Y), %xmm1 + movhps 6 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 4 * SIZE(X) + movlps %xmm2, 4 * SIZE(Y) + movhps %xmm2, 6 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L26: + testl $2, N + jle .L27 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L27: + testl $1, N + jle .L999 + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L30: + movl N, I + sarl $4, I + jle .L34 + ALIGN_3 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhps 2 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 2 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + movsd 4 * SIZE(Y), %xmm1 + movhps 6 * SIZE(Y), %xmm1 + movsd 4 * SIZE(X), %xmm0 + movhps 6 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 4 * SIZE(X) + movhps %xmm0, 6 * SIZE(X) + movlps %xmm2, 4 * SIZE(Y) + movhps %xmm2, 6 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movsd 8 * SIZE(Y), %xmm1 + movhps 10 * SIZE(Y), %xmm1 + movsd 8 * SIZE(X), %xmm0 + movhps 10 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 8 * SIZE(X) + movhps %xmm0, 10 * SIZE(X) + movlps %xmm2, 8 * SIZE(Y) + movhps %xmm2, 10 * SIZE(Y) + + movsd 12 * SIZE(Y), %xmm1 + movhps 14 * SIZE(Y), %xmm1 + movsd 12 * SIZE(X), %xmm0 + movhps 14 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 12 * SIZE(X) + movhps %xmm0, 14 * SIZE(X) + movlps %xmm2, 12 * SIZE(Y) + movhps %xmm2, 14 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd 16 * SIZE(Y), %xmm1 + movhps 18 * SIZE(Y), %xmm1 + movsd 16 * SIZE(X), %xmm0 + movhps 18 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 16 * SIZE(X) + movhps %xmm0, 18 * SIZE(X) + movlps %xmm2, 16 * SIZE(Y) + movhps %xmm2, 18 * SIZE(Y) + + movsd 20 * SIZE(Y), %xmm1 + movhps 22 * SIZE(Y), %xmm1 + movsd 20 * SIZE(X), %xmm0 + movhps 22 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 20 * SIZE(X) + movhps %xmm0, 22 * SIZE(X) + movlps %xmm2, 20 * SIZE(Y) + movhps %xmm2, 22 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movsd 24 * SIZE(Y), %xmm1 + movhps 26 * SIZE(Y), %xmm1 + movsd 24 * SIZE(X), %xmm0 + movhps 26 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 24 * SIZE(X) + movhps %xmm0, 26 * SIZE(X) + movlps %xmm2, 24 * SIZE(Y) + movhps %xmm2, 26 * SIZE(Y) + + movsd 28 * SIZE(Y), %xmm1 + movhps 30 * SIZE(Y), %xmm1 + movsd 28 * SIZE(X), %xmm0 + movhps 30 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 28 * SIZE(X) + movhps %xmm0, 30 * SIZE(X) + movlps %xmm2, 28 * SIZE(Y) + movhps %xmm2, 30 * SIZE(Y) + + addl $32 * SIZE, X + addl $32 * SIZE, Y + + decl I + jg .L31 + ALIGN_3 + +.L34: + testl $15, N + jle .L999 + + testl $8, N + jle .L35 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhps 2 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 2 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + movsd 4 * SIZE(Y), %xmm1 + movhps 6 * SIZE(Y), %xmm1 + movsd 4 * SIZE(X), %xmm0 + movhps 6 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 4 * SIZE(X) + movhps %xmm0, 6 * SIZE(X) + movlps %xmm2, 4 * SIZE(Y) + movhps %xmm2, 6 * SIZE(Y) + + movsd 8 * SIZE(Y), %xmm1 + movhps 10 * SIZE(Y), %xmm1 + movsd 8 * SIZE(X), %xmm0 + movhps 10 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 8 * SIZE(X) + movhps %xmm0, 10 * SIZE(X) + movlps %xmm2, 8 * SIZE(Y) + movhps %xmm2, 10 * SIZE(Y) + + movsd 12 * SIZE(Y), %xmm1 + movhps 14 * SIZE(Y), %xmm1 + movsd 12 * SIZE(X), %xmm0 + movhps 14 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 12 * SIZE(X) + movhps %xmm0, 14 * SIZE(X) + movlps %xmm2, 12 * SIZE(Y) + movhps %xmm2, 14 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L35: + testl $4, N + jle .L36 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhps 2 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 2 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + movsd 4 * SIZE(Y), %xmm1 + movhps 6 * SIZE(Y), %xmm1 + movsd 4 * SIZE(X), %xmm0 + movhps 6 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 4 * SIZE(X) + movhps %xmm0, 6 * SIZE(X) + movlps %xmm2, 4 * SIZE(Y) + movhps %xmm2, 6 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L36: + testl $2, N + jle .L37 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhps 2 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 2 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L37: + testl $1, N + jle .L999 + +#ifndef HAVE_SSE2 + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 +#endif + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + ALIGN_3 + +.L50: + movl N, I + sarl $2, I + jle .L55 + ALIGN_3 + +.L53: + movsd (Y), %xmm1 + movhps (Y, INCY), %xmm1 + movsd (X), %xmm0 + movhps (X, INCX), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, (X) + movhps %xmm0, (X, INCX) + movlps %xmm2, (Y) + movhps %xmm2, (Y, INCY) + + leal (X, INCX, 2), X + leal (Y, INCY, 2), Y + + movsd (Y), %xmm1 + movhps (Y, INCY), %xmm1 + movsd (X), %xmm0 + movhps (X, INCX), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, (X) + movhps %xmm0, (X, INCX) + movlps %xmm2, (Y) + movhps %xmm2, (Y, INCY) + + leal (X, INCX, 2), X + leal (Y, INCY, 2), Y + + decl I + jg .L53 + ALIGN_3 + +.L55: +#ifndef HAVE_SSE2 + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 +#endif + + movl N, I + andl $3, I + jle .L999 + ALIGN_3 + +.L56: + movsd (Y), %xmm1 + movsd (X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, (X) + movlps %xmm2, (Y) + + addl INCX, X + addl INCY, Y + + decl I + jg .L56 + ALIGN_3 + +.L999: + popl %ebx + popl %esi + popl %edi + + ret + + EPILOGUE diff --git a/kernel/x86/zrot_sse2.S b/kernel/x86/zrot_sse2.S new file mode 100644 index 0000000..7787f45 --- /dev/null +++ b/kernel/x86/zrot_sse2.S @@ -0,0 +1,1665 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) +#define STACK_C 24 + STACK + ARGS(%esp) +#define STACK_S 32 + STACK + ARGS(%esp) + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#define I %eax + +#include "l1param.h" + +#define C %xmm6 +#define S %xmm7 + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + + movsd STACK_C, C + movsd STACK_S, S + + pshufd $0x44, C, C + pshufd $0x44, S, S + + cmpl $0, N + jle .L999 + + cmpl $2 * SIZE, INCX + jne .L50 + cmpl $2 * SIZE, INCY + jne .L50 + +.L10: + testl $SIZE, X + jne .L30 + + testl $SIZE, Y + jne .L20 + + movl N, I + sarl $3, I + jle .L14 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + + movapd 2 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movapd %xmm2, 2 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movapd 4 * SIZE(Y), %xmm1 + movapd 4 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 4 * SIZE(X) + movapd %xmm2, 4 * SIZE(Y) + + movapd 6 * SIZE(Y), %xmm1 + movapd 6 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 6 * SIZE(X) + movapd %xmm2, 6 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movapd 8 * SIZE(Y), %xmm1 + movapd 8 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 8 * SIZE(X) + movapd %xmm2, 8 * SIZE(Y) + + movapd 10 * SIZE(Y), %xmm1 + movapd 10 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 10 * SIZE(X) + movapd %xmm2, 10 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movapd 12 * SIZE(Y), %xmm1 + movapd 12 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 12 * SIZE(X) + movapd %xmm2, 12 * SIZE(Y) + + movapd 14 * SIZE(Y), %xmm1 + movapd 14 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 14 * SIZE(X) + movapd %xmm2, 14 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + + decl I + jg .L11 + ALIGN_3 + +.L14: + testl $7, N + jle .L999 + + testl $4, N + jle .L15 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + + movapd 2 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movapd %xmm2, 2 * SIZE(Y) + + movapd 4 * SIZE(Y), %xmm1 + movapd 4 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 4 * SIZE(X) + movapd %xmm2, 4 * SIZE(Y) + + movapd 6 * SIZE(Y), %xmm1 + movapd 6 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 6 * SIZE(X) + movapd %xmm2, 6 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L15: + testl $2, N + jle .L16 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + + movapd 2 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movapd %xmm2, 2 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L16: + testl $1, N + jle .L999 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L20: + movapd -1 * SIZE(Y), %xmm1 + + movl N, I + sarl $3, I + jle .L24 + ALIGN_3 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movapd 1 * SIZE(Y), %xmm4 + movapd 0 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movlpd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + + movapd 3 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm1, %xmm4 + movapd %xmm4, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm4 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm4, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movlpd %xmm2, 2 * SIZE(Y) + movhpd %xmm2, 3 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movapd 5 * SIZE(Y), %xmm4 + movapd 4 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 4 * SIZE(X) + movlpd %xmm2, 4 * SIZE(Y) + movhpd %xmm2, 5 * SIZE(Y) + + movapd 7 * SIZE(Y), %xmm1 + movapd 6 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm1, %xmm4 + movapd %xmm4, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm4 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm4, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 6 * SIZE(X) + movlpd %xmm2, 6 * SIZE(Y) + movhpd %xmm2, 7 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movapd 9 * SIZE(Y), %xmm4 + movapd 8 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 8 * SIZE(X) + movlpd %xmm2, 8 * SIZE(Y) + movhpd %xmm2, 9 * SIZE(Y) + + movapd 11 * SIZE(Y), %xmm1 + movapd 10 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm1, %xmm4 + movapd %xmm4, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm4 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm4, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 10 * SIZE(X) + movlpd %xmm2, 10 * SIZE(Y) + movhpd %xmm2, 11 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movapd 13 * SIZE(Y), %xmm4 + movapd 12 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 12 * SIZE(X) + movlpd %xmm2, 12 * SIZE(Y) + movhpd %xmm2, 13 * SIZE(Y) + + movapd 15 * SIZE(Y), %xmm1 + movapd 14 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm1, %xmm4 + movapd %xmm4, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm4 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm4, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 14 * SIZE(X) + movlpd %xmm2, 14 * SIZE(Y) + movhpd %xmm2, 15 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + decl I + jg .L21 + ALIGN_3 + +.L24: + testl $7, N + jle .L999 + + testl $4, N + jle .L25 + + movapd 1 * SIZE(Y), %xmm4 + movapd 0 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movlpd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + + movapd 3 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm1, %xmm4 + movapd %xmm4, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm4 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm4, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movlpd %xmm2, 2 * SIZE(Y) + movhpd %xmm2, 3 * SIZE(Y) + + movapd 5 * SIZE(Y), %xmm4 + movapd 4 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 4 * SIZE(X) + movlpd %xmm2, 4 * SIZE(Y) + movhpd %xmm2, 5 * SIZE(Y) + + movapd 7 * SIZE(Y), %xmm1 + movapd 6 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm1, %xmm4 + movapd %xmm4, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm4 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm4, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 6 * SIZE(X) + movlpd %xmm2, 6 * SIZE(Y) + movhpd %xmm2, 7 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L25: + testl $2, N + jle .L26 + + movapd 1 * SIZE(Y), %xmm4 + movapd 0 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movlpd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + + movapd 3 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm1, %xmm4 + movapd %xmm4, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm4 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm4, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movlpd %xmm2, 2 * SIZE(Y) + movhpd %xmm2, 3 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L26: + testl $1, N + jle .L999 + + movapd 1 * SIZE(Y), %xmm4 + movapd 0 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movlpd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L30: + testl $SIZE, Y + jne .L40 + + movapd -1 * SIZE(X), %xmm0 + + movl N, I + sarl $3, I + jle .L34 + ALIGN_3 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movapd 1 * SIZE(X), %xmm4 + movapd 0 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm4, %xmm0 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, 0 * SIZE(X) + movhpd %xmm0, 1 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + + movapd 3 * SIZE(X), %xmm0 + movapd 2 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm4 + movapd %xmm1, %xmm2 + movapd %xmm4, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm4 + subpd %xmm3, %xmm2 + + movlpd %xmm4, 2 * SIZE(X) + movhpd %xmm4, 3 * SIZE(X) + movapd %xmm2, 2 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movapd 5 * SIZE(X), %xmm4 + movapd 4 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm4, %xmm0 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, 4 * SIZE(X) + movhpd %xmm0, 5 * SIZE(X) + movapd %xmm2, 4 * SIZE(Y) + + movapd 7 * SIZE(X), %xmm0 + movapd 6 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm4 + movapd %xmm1, %xmm2 + movapd %xmm4, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm4 + subpd %xmm3, %xmm2 + + movlpd %xmm4, 6 * SIZE(X) + movhpd %xmm4, 7 * SIZE(X) + movapd %xmm2, 6 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movapd 9 * SIZE(X), %xmm4 + movapd 8 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm4, %xmm0 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, 8 * SIZE(X) + movhpd %xmm0, 9 * SIZE(X) + movapd %xmm2, 8 * SIZE(Y) + + movapd 11 * SIZE(X), %xmm0 + movapd 10 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm4 + movapd %xmm1, %xmm2 + movapd %xmm4, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm4 + subpd %xmm3, %xmm2 + + movlpd %xmm4, 10 * SIZE(X) + movhpd %xmm4, 11 * SIZE(X) + movapd %xmm2, 10 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movapd 13 * SIZE(X), %xmm4 + movapd 12 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm4, %xmm0 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, 12 * SIZE(X) + movhpd %xmm0, 13 * SIZE(X) + movapd %xmm2, 12 * SIZE(Y) + + movapd 15 * SIZE(X), %xmm0 + movapd 14 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm4 + movapd %xmm1, %xmm2 + movapd %xmm4, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm4 + subpd %xmm3, %xmm2 + + movlpd %xmm4, 14 * SIZE(X) + movhpd %xmm4, 15 * SIZE(X) + movapd %xmm2, 14 * SIZE(Y) + + addl $16 * SIZE, Y + addl $16 * SIZE, X + decl I + jg .L31 + ALIGN_3 + +.L34: + testl $7, N + jle .L999 + + testl $4, N + jle .L35 + + movapd 1 * SIZE(X), %xmm4 + movapd 0 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm4, %xmm0 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, 0 * SIZE(X) + movhpd %xmm0, 1 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + + movapd 3 * SIZE(X), %xmm0 + movapd 2 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm4 + movapd %xmm1, %xmm2 + movapd %xmm4, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm4 + subpd %xmm3, %xmm2 + + movlpd %xmm4, 2 * SIZE(X) + movhpd %xmm4, 3 * SIZE(X) + movapd %xmm2, 2 * SIZE(Y) + + movapd 5 * SIZE(X), %xmm4 + movapd 4 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm4, %xmm0 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, 4 * SIZE(X) + movhpd %xmm0, 5 * SIZE(X) + movapd %xmm2, 4 * SIZE(Y) + + movapd 7 * SIZE(X), %xmm0 + movapd 6 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm4 + movapd %xmm1, %xmm2 + movapd %xmm4, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm4 + subpd %xmm3, %xmm2 + + movlpd %xmm4, 6 * SIZE(X) + movhpd %xmm4, 7 * SIZE(X) + movapd %xmm2, 6 * SIZE(Y) + + addl $8 * SIZE, Y + addl $8 * SIZE, X + ALIGN_3 + +.L35: + testl $2, N + jle .L36 + + movapd 1 * SIZE(X), %xmm4 + movapd 0 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm4, %xmm0 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, 0 * SIZE(X) + movhpd %xmm0, 1 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + + movapd 3 * SIZE(X), %xmm0 + movapd 2 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm4 + movapd %xmm1, %xmm2 + movapd %xmm4, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm4 + subpd %xmm3, %xmm2 + + movlpd %xmm4, 2 * SIZE(X) + movhpd %xmm4, 3 * SIZE(X) + movapd %xmm2, 2 * SIZE(Y) + + addl $4 * SIZE, Y + addl $4 * SIZE, X + ALIGN_3 + +.L36: + testl $1, N + jle .L999 + + movapd 1 * SIZE(X), %xmm4 + movapd 0 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm4, %xmm0 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, 0 * SIZE(X) + movhpd %xmm0, 1 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L40: + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulsd C, %xmm0 + mulsd S, %xmm1 + + mulsd C, %xmm2 + mulsd S, %xmm3 + + addsd %xmm1, %xmm0 + subsd %xmm3, %xmm2 + + movsd %xmm0, 0 * SIZE(X) + movsd %xmm2, 0 * SIZE(Y) + addl $1 * SIZE, Y + addl $1 * SIZE, X + + decl N + jle .L47 + + movl N, I + sarl $3, I + jle .L44 + ALIGN_3 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + + movapd 2 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movapd %xmm2, 2 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movapd 4 * SIZE(Y), %xmm1 + movapd 4 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 4 * SIZE(X) + movapd %xmm2, 4 * SIZE(Y) + + movapd 6 * SIZE(Y), %xmm1 + movapd 6 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 6 * SIZE(X) + movapd %xmm2, 6 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movapd 8 * SIZE(Y), %xmm1 + movapd 8 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 8 * SIZE(X) + movapd %xmm2, 8 * SIZE(Y) + + movapd 10 * SIZE(Y), %xmm1 + movapd 10 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 10 * SIZE(X) + movapd %xmm2, 10 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movapd 12 * SIZE(Y), %xmm1 + movapd 12 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 12 * SIZE(X) + movapd %xmm2, 12 * SIZE(Y) + + movapd 14 * SIZE(Y), %xmm1 + movapd 14 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 14 * SIZE(X) + movapd %xmm2, 14 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + + decl I + jg .L41 + ALIGN_3 + +.L44: + testl $4, N + jle .L45 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + + movapd 2 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movapd %xmm2, 2 * SIZE(Y) + + movapd 4 * SIZE(Y), %xmm1 + movapd 4 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 4 * SIZE(X) + movapd %xmm2, 4 * SIZE(Y) + + movapd 6 * SIZE(Y), %xmm1 + movapd 6 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 6 * SIZE(X) + movapd %xmm2, 6 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L45: + testl $2, N + jle .L46 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + + movapd 2 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movapd %xmm2, 2 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L46: + testl $1, N + jle .L47 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + + addl $2 * SIZE, Y + addl $2 * SIZE, X + ALIGN_3 + +.L47: + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulsd C, %xmm0 + mulsd S, %xmm1 + + mulsd C, %xmm2 + mulsd S, %xmm3 + + addsd %xmm1, %xmm0 + subsd %xmm3, %xmm2 + + movsd %xmm0, 0 * SIZE(X) + movsd %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L50: + movl N, I + sarl $2, I + jle .L55 + ALIGN_3 + +.L53: + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, 0 * SIZE(X) + movhpd %xmm0, 1 * SIZE(X) + movlpd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, 0 * SIZE(X) + movhpd %xmm0, 1 * SIZE(X) + movlpd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, 0 * SIZE(X) + movhpd %xmm0, 1 * SIZE(X) + movlpd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, 0 * SIZE(X) + movhpd %xmm0, 1 * SIZE(X) + movlpd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + decl I + jg .L53 + ALIGN_3 + +.L55: + movl N, I + andl $3, I + jle .L999 + ALIGN_3 + +.L56: + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, 0 * SIZE(X) + movhpd %xmm0, 1 * SIZE(X) + movlpd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + decl I + jg .L56 + ALIGN_3 + +.L999: + popl %ebx + popl %esi + popl %edi + + ret + + EPILOGUE diff --git a/kernel/x86/zscal.S b/kernel/x86/zscal.S new file mode 100644 index 0000000..7505cea --- /dev/null +++ b/kernel/x86/zscal.S @@ -0,0 +1,318 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 + +#define STACK_N 4 + STACK(%esp) +#ifdef XDOUBLE +#define ALPHA_R 16 + STACK(%esp) +#define ALPHA_I 32 + STACK(%esp) +#define STACK_X 48 + STACK(%esp) +#define STACK_INCX 52 + STACK(%esp) +#elif defined(DOUBLE) +#define ALPHA_R 16 + STACK(%esp) +#define ALPHA_I 24 + STACK(%esp) +#define STACK_X 32 + STACK(%esp) +#define STACK_INCX 36 + STACK(%esp) +#else +#define ALPHA_R 16 + STACK(%esp) +#define ALPHA_I 20 + STACK(%esp) +#define STACK_X 24 + STACK(%esp) +#define STACK_INCX 28 + STACK(%esp) +#endif + +#define N %esi +#define X %edx +#define INCX %ebx + +#define I %ecx + + PROLOGUE + + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + + sall $ZBASE_SHIFT, INCX + + FLD ALPHA_R + FLD ALPHA_I + + testl N, N + jle .L999 + + fld %st(1) + fabs + fld %st(1) + fabs + faddp %st, %st(1) + + fldz + fcomip %st(1), %st + ffreep %st(0) + jne .L30 + + EMMS + + pxor %mm0, %mm0 + + cmpl $2 * SIZE, INCX + jne .L20 + + movl N, I + sarl $2, I + jle .L15 + ALIGN_4 + +.L12: +#ifdef XDOUBLE + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) + movq %mm0, 32(X) + movq %mm0, 40(X) + movq %mm0, 48(X) + movq %mm0, 56(X) + movq %mm0, 64(X) + movq %mm0, 72(X) + movq %mm0, 80(X) + movq %mm0, 88(X) + movq %mm0, 96(X) + movq %mm0, 104(X) + movq %mm0, 112(X) + movq %mm0, 120(X) +#elif defined(DOUBLE) + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) + movq %mm0, 32(X) + movq %mm0, 40(X) + movq %mm0, 48(X) + movq %mm0, 56(X) +#else + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) +#endif + + addl $8 * SIZE, X + decl I + jg .L12 + ALIGN_3 + +.L15: + movl N, I + andl $3, I + jle .L18 + ALIGN_2 + +.L16: +#ifdef XDOUBLE + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) +#elif defined(DOUBLE) + movq %mm0, 0(X) + movq %mm0, 8(X) +#else + movq %mm0, 0(X) +#endif + + addl $2 * SIZE, X + decl I + jg .L16 + +.L18: + EMMS + + xorl %eax, %eax + popl %ebx + popl %esi + ret + ALIGN_2 + +.L20: + movl N, I + sarl $2, I + jle .L25 + ALIGN_3 + +.L22: +#ifdef XDOUBLE + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) + addl INCX, X + + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) + addl INCX, X + + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) + addl INCX, X + + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) + addl INCX, X + +#elif defined(DOUBLE) + movq %mm0, 0(X) + movq %mm0, 8(X) + addl INCX, X + + movq %mm0, 0(X) + movq %mm0, 8(X) + addl INCX, X + + movq %mm0, 0(X) + movq %mm0, 8(X) + addl INCX, X + + movq %mm0, 0(X) + movq %mm0, 8(X) + addl INCX, X +#else + movq %mm0, 0(X) + addl INCX, X + + movq %mm0, 0(X) + addl INCX, X + + movq %mm0, 0(X) + addl INCX, X + + movq %mm0, 0(X) + addl INCX, X +#endif + + decl I + jg .L22 + ALIGN_3 + +.L25: + movl N, I + andl $3, I + jle .L28 + ALIGN_3 + +.L26: +#ifdef XDOUBLE + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) + addl INCX, X +#elif defined(DOUBLE) + movq %mm0, 0(X) + movq %mm0, 8(X) + addl INCX, X +#else + movq %mm0, 0(X) + addl INCX, X +#endif + + decl I + jg .L26 + +.L28: + EMMS + + xorl %eax, %eax + popl %ebx + popl %esi + ret + ALIGN_3 + +.L30: + movl N, I + ALIGN_2 + +.L32: + FLD 0 * SIZE(X) + fmul %st(1),%st + FLD 1 * SIZE(X) + fmul %st(3),%st + faddp %st,%st(1) + + FLD 0 * SIZE(X) + fmul %st(3),%st + FLD 1 * SIZE(X) + fmul %st(3),%st + fsubrp %st,%st(1) + + FST 0 * SIZE(X) + FST 1 * SIZE(X) + addl INCX, X + decl I + jg .L32 + ALIGN_2 + +.L999: + ffreep %st(0) + ffreep %st(0) + + xorl %eax,%eax + popl %ebx + popl %esi + ret + + EPILOGUE diff --git a/kernel/x86/zscal_sse.S b/kernel/x86/zscal_sse.S new file mode 100644 index 0000000..849d787 --- /dev/null +++ b/kernel/x86/zscal_sse.S @@ -0,0 +1,1389 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esp) +#define STACK_ALPHA_I 20 + STACK + ARGS(%esp) +#define STACK_X 24 + STACK + ARGS(%esp) +#define STACK_INCX 28 + STACK + ARGS(%esp) + +#define M %ebx +#define X %ecx +#define INCX %edx +#define I %esi +#define XX %edi +#define FLAG %ebp + +#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) +#define USE_PSHUFD +#else +#define USE_PSHUFD_HALF +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + + movss STACK_ALPHA_R, %xmm0 + movss STACK_ALPHA_I, %xmm1 + + sall $ZBASE_SHIFT, INCX + xor FLAG, FLAG + + testl M, M + jle .L999 + + xorps %xmm7, %xmm7 + comiss %xmm0, %xmm7 + jne .L100 # Alpha_r != ZERO + + comiss %xmm1, %xmm7 + jne .L100 # Alpha_i != ZERO + +/* Alpha == ZERO */ + cmpl $2 * SIZE, INCX + jne .L50 + +/* INCX == 1 */ + cmpl $3, M + jle .L13 + + testl $4, X + je .L05 + movss %xmm7, 0 * SIZE(X) + addl $SIZE, X + movl $1, FLAG + decl M + ALIGN_3 + +.L05: + testl $8, X + je .L06 + + movlps %xmm7, 0 * SIZE(X) + addl $2 * SIZE, X + subl $1, M + ALIGN_3 +.L06: + + movl M, I # rcx = n + sarl $3, I + jle .L12 + ALIGN_4 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm7, 0 * SIZE(X) + movaps %xmm7, 4 * SIZE(X) + movaps %xmm7, 8 * SIZE(X) + movaps %xmm7, 12 * SIZE(X) + addl $16 * SIZE, X + decl I + jg .L11 + ALIGN_4 + +.L12: + testl $7, M + je .L19 + testl $4, M + je .L13 + + movaps %xmm7, 0 * SIZE(X) + movaps %xmm7, 4 * SIZE(X) + addl $8 * SIZE, X + ALIGN_3 + +.L13: + testl $2, M + je .L14 + + movlps %xmm7, 0 * SIZE(X) + movhps %xmm7, 2 * SIZE(X) + addl $4 * SIZE, X + ALIGN_3 + +.L14: + testl $1, M + je .L19 + + movlps %xmm7, 0 * SIZE(X) + addl $2 * SIZE, X + ALIGN_3 + +.L19: + testl $1, FLAG + je .L999 + + movss %xmm7, 0 * SIZE(X) + jmp .L999 + ALIGN_4 + +/* incx != 1 */ +.L50: + movl M, I # rcx = n + sarl $2, I + jle .L52 + ALIGN_4 + +.L51: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd %xmm7, 0 * SIZE(X) + addl INCX, X + movsd %xmm7, 0 * SIZE(X) + addl INCX, X + movsd %xmm7, 0 * SIZE(X) + addl INCX, X + movsd %xmm7, 0 * SIZE(X) + addl INCX, X + decl I + jg .L51 + ALIGN_4 + +.L52: + testl $2, M + je .L53 + + movsd %xmm7, 0 * SIZE(X) + addl INCX, X + movsd %xmm7, 0 * SIZE(X) + addl INCX, X + ALIGN_3 + +.L53: + testl $1, M + je .L999 + + movsd %xmm7, 0 * SIZE(X) + jmp .L999 + ALIGN_4 + +/* Alpha != ZERO */ + +.L100: + testl $SIZE, X + jne .L130 + + cmpl $2 * SIZE, INCX + jne .L120 + + movaps %xmm0, %xmm6 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm1, %xmm1 + subps %xmm1, %xmm7 + unpcklps %xmm1, %xmm7 + + subl $-32 * SIZE, X + + testl $2 * SIZE, X + je .L105 + + movsd -32 * SIZE(X), %xmm0 + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movlps %xmm0, -32 * SIZE(X) + addl $2 * SIZE, X + decl M + jle .L999 + ALIGN_3 + +.L105: + movl M, I + sarl $4, I + jle .L115 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + decl I + jle .L112 + ALIGN_4 + +.L111: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(X) + movaps -16 * SIZE(X), %xmm0 + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(X) + movaps -12 * SIZE(X), %xmm1 + + PSHUFD2( $0xb1, %xmm2, %xmm5) + mulps %xmm6, %xmm2 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(X) + movaps -8 * SIZE(X), %xmm2 + + PSHUFD2( $0xb1, %xmm3, %xmm5) + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(X) + movaps -4 * SIZE(X), %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(X) + movaps 0 * SIZE(X), %xmm0 + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movaps %xmm1, -12 * SIZE(X) + movaps 4 * SIZE(X), %xmm1 + + PSHUFD2( $0xb1, %xmm2, %xmm5) + mulps %xmm6, %xmm2 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm2 + movaps %xmm2, -8 * SIZE(X) + movaps 8 * SIZE(X), %xmm2 + + PSHUFD2( $0xb1, %xmm3, %xmm5) + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm3 + movaps %xmm3, -4 * SIZE(X) + movaps 12 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + decl I + jg .L111 + ALIGN_4 + +.L112: + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(X) + movaps -16 * SIZE(X), %xmm0 + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(X) + movaps -12 * SIZE(X), %xmm1 + + PSHUFD2( $0xb1, %xmm2, %xmm5) + mulps %xmm6, %xmm2 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(X) + movaps -8 * SIZE(X), %xmm2 + + PSHUFD2( $0xb1, %xmm3, %xmm5) + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(X) + movaps -4 * SIZE(X), %xmm3 + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(X) + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movaps %xmm1, -12 * SIZE(X) + + PSHUFD2( $0xb1, %xmm2, %xmm5) + mulps %xmm6, %xmm2 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm2 + movaps %xmm2, -8 * SIZE(X) + + PSHUFD2( $0xb1, %xmm3, %xmm5) + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm3 + movaps %xmm3, -4 * SIZE(X) + + subl $-32 * SIZE, X + ALIGN_4 + +.L115: + testl $8, M + je .L116 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(X) + + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + PSHUFD2( $0xb1, %xmm2, %xmm5) + mulps %xmm6, %xmm2 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(X) + + PSHUFD2( $0xb1, %xmm3, %xmm5) + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(X) + + addl $16 * SIZE, X + ALIGN_3 + +.L116: + testl $4, M + je .L117 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(X) + + addl $8 * SIZE, X + ALIGN_3 + +.L117: + testl $2, M + je .L118 + + movaps -32 * SIZE(X), %xmm0 + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + addl $4 * SIZE, X + ALIGN_3 + +.L118: + testl $1, M + je .L999 + + movsd -32 * SIZE(X), %xmm0 + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movlps %xmm0, -32 * SIZE(X) + jmp .L999 + ALIGN_3 + +.L120: + PSHUFD2($0, %xmm0, %xmm6) + PSHUFD2($0, %xmm1, %xmm1) + subps %xmm1, %xmm7 + unpcklps %xmm1, %xmm7 + + movl X, XX + + movl M, I + sarl $3, I + jle .L125 + + movsd (X), %xmm0 + addl INCX, X + movhps (X), %xmm0 + addl INCX, X + + movsd (X), %xmm1 + addl INCX, X + movhps (X), %xmm1 + addl INCX, X + + movsd (X), %xmm2 + addl INCX, X + movhps (X), %xmm2 + addl INCX, X + + movsd (X), %xmm3 + addl INCX, X + movhps (X), %xmm3 + addl INCX, X + + decl I + jle .L122 + ALIGN_4 + +.L121: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movlps %xmm0, (XX) + addl INCX, XX + movhps %xmm0, (XX) + addl INCX, XX + + movsd (X), %xmm0 + addl INCX, X + movhps (X), %xmm0 + addl INCX, X + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movlps %xmm1, (XX) + addl INCX, XX + movhps %xmm1, (XX) + addl INCX, XX + + movsd (X), %xmm1 + addl INCX, X + movhps (X), %xmm1 + addl INCX, X + + PSHUFD2( $0xb1, %xmm2, %xmm5) + mulps %xmm6, %xmm2 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm2 + + movlps %xmm2, (XX) + addl INCX, XX + movhps %xmm2, (XX) + addl INCX, XX + + movsd (X), %xmm2 + addl INCX, X + movhps (X), %xmm2 + addl INCX, X + + PSHUFD2( $0xb1, %xmm3, %xmm5) + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm3 + + movlps %xmm3, (XX) + addl INCX, XX + movhps %xmm3, (XX) + addl INCX, XX + + movsd (X), %xmm3 + addl INCX, X + movhps (X), %xmm3 + addl INCX, X + + decl I + jg .L121 + ALIGN_4 + +.L122: + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movlps %xmm0, (XX) + addl INCX, XX + movhps %xmm0, (XX) + addl INCX, XX + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movlps %xmm1, (XX) + addl INCX, XX + movhps %xmm1, (XX) + addl INCX, XX + + PSHUFD2( $0xb1, %xmm2, %xmm5) + mulps %xmm6, %xmm2 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm2 + + movlps %xmm2, (XX) + addl INCX, XX + movhps %xmm2, (XX) + addl INCX, XX + + PSHUFD2( $0xb1, %xmm3, %xmm5) + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm3 + + movlps %xmm3, (XX) + addl INCX, XX + movhps %xmm3, (XX) + addl INCX, XX + ALIGN_4 + +.L125: + testl $4, M + je .L127 + + movsd (X), %xmm0 + addl INCX, X + movhps (X), %xmm0 + addl INCX, X + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movlps %xmm0, (XX) + addl INCX, XX + movhps %xmm0, (XX) + addl INCX, XX + + movsd (X), %xmm1 + addl INCX, X + movhps (X), %xmm1 + addl INCX, X + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movlps %xmm1, (XX) + addl INCX, XX + movhps %xmm1, (XX) + addl INCX, XX + ALIGN_3 + +.L127: + testl $2, M + je .L128 + + movsd (X), %xmm0 + addl INCX, X + movhps (X), %xmm0 + addl INCX, X + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movlps %xmm0, (XX) + addl INCX, XX + movhps %xmm0, (XX) + addl INCX, XX + ALIGN_3 + +.L128: + testl $1, M + je .L999 + + movsd (X), %xmm0 + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movlps %xmm0, (XX) + jmp .L999 + ALIGN_3 + +.L130: + cmpl $2 * SIZE, INCX + jne .L120 + +#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) + + PSHUFD2($0, %xmm0, %xmm6) + PSHUFD2($0, %xmm1, %xmm1) + subps %xmm1, %xmm7 + unpcklps %xmm1, %xmm7 + + subl $-31 * SIZE, X + + testl $2 * SIZE, X + je .L130x + + movsd -31 * SIZE(X), %xmm0 + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movlps %xmm0, -31 * SIZE(X) + addl $2 * SIZE, X + decl M + jle .L999 + ALIGN_3 + +.L130x: + shufps $0xb1, %xmm7, %xmm7 + + movaps -32 * SIZE(X), %xmm0 + movaps %xmm0, %xmm4 + + movl M, I + sarl $4, I + jle .L135 + + movaps -28 * SIZE(X), %xmm1 + + + decl I + jle .L132 + ALIGN_4 + +.L131: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm1, %xmm0 + PSHUFD2($0x1b, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movaps %xmm0, %xmm2 + movss %xmm4, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + movaps -24 * SIZE(X), %xmm0 + + movss %xmm0, %xmm1 + PSHUFD2($0x1b, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm1, %xmm4 + movss %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(X) + + movaps -20 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + PSHUFD2($0x1b, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movaps %xmm0, %xmm2 + movss %xmm4, %xmm0 + movaps %xmm0, -24 * SIZE(X) + + movaps -16 * SIZE(X), %xmm0 + + movss %xmm0, %xmm1 + PSHUFD2($0x1b, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm1, %xmm4 + movss %xmm2, %xmm1 + movaps %xmm1, -20 * SIZE(X) + + movaps -12 * SIZE(X), %xmm1 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm1, %xmm0 + PSHUFD2($0x1b, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movaps %xmm0, %xmm2 + movss %xmm4, %xmm0 + movaps %xmm0, -16 * SIZE(X) + + movaps -8 * SIZE(X), %xmm0 + + movss %xmm0, %xmm1 + PSHUFD2($0x1b, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm1, %xmm4 + movss %xmm2, %xmm1 + movaps %xmm1, -12 * SIZE(X) + + movaps -4 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + PSHUFD2($0x1b, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movaps %xmm0, %xmm2 + movss %xmm4, %xmm0 + movaps %xmm0, -8 * SIZE(X) + + movaps 0 * SIZE(X), %xmm0 + + movss %xmm0, %xmm1 + PSHUFD2($0x1b, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm1, %xmm4 + movss %xmm2, %xmm1 + movaps %xmm1, -4 * SIZE(X) + + movaps 4 * SIZE(X), %xmm1 + + subl $-32 * SIZE, X + decl I + jg .L131 + ALIGN_4 + +.L132: + movss %xmm1, %xmm0 + PSHUFD2($0x1b, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movaps %xmm0, %xmm2 + movss %xmm4, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + movaps -24 * SIZE(X), %xmm0 + + movss %xmm0, %xmm1 + PSHUFD2($0x1b, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm1, %xmm4 + movss %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(X) + + movaps -20 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + PSHUFD2($0x1b, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movaps %xmm0, %xmm2 + movss %xmm4, %xmm0 + movaps %xmm0, -24 * SIZE(X) + + movaps -16 * SIZE(X), %xmm0 + + movss %xmm0, %xmm1 + PSHUFD2($0x1b, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm1, %xmm4 + movss %xmm2, %xmm1 + movaps %xmm1, -20 * SIZE(X) + + movaps -12 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + PSHUFD2($0x1b, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movaps %xmm0, %xmm2 + movss %xmm4, %xmm0 + movaps %xmm0, -16 * SIZE(X) + + movaps -8 * SIZE(X), %xmm0 + + movss %xmm0, %xmm1 + PSHUFD2($0x1b, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm1, %xmm4 + movss %xmm2, %xmm1 + movaps %xmm1, -12 * SIZE(X) + + movaps -4 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + PSHUFD2($0x1b, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movaps %xmm0, %xmm2 + movss %xmm4, %xmm0 + movaps %xmm0, -8 * SIZE(X) + + movaps 0 * SIZE(X), %xmm0 + + movss %xmm0, %xmm1 + PSHUFD2($0x1b, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm1, %xmm4 + movss %xmm2, %xmm1 + movaps %xmm1, -4 * SIZE(X) + + subl $-32 * SIZE, X + ALIGN_4 + +.L135: + testl $8, M + je .L136 + + movaps -28 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + PSHUFD2($0x1b, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movaps %xmm0, %xmm2 + movss %xmm4, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + movaps -24 * SIZE(X), %xmm0 + + movss %xmm0, %xmm1 + PSHUFD2($0x1b, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm1, %xmm4 + movss %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(X) + + movaps -20 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + PSHUFD2($0x1b, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movaps %xmm0, %xmm2 + movss %xmm4, %xmm0 + movaps %xmm0, -24 * SIZE(X) + + movaps -16 * SIZE(X), %xmm0 + + movss %xmm0, %xmm1 + PSHUFD2($0x1b, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm1, %xmm4 + movss %xmm2, %xmm1 + movaps %xmm1, -20 * SIZE(X) + + addl $16 * SIZE, X + ALIGN_3 + +.L136: + testl $4, M + je .L137 + + movaps -28 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + PSHUFD2($0x1b, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movaps %xmm0, %xmm2 + movss %xmm4, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + movaps -24 * SIZE(X), %xmm0 + + movss %xmm0, %xmm1 + PSHUFD2($0x1b, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm1, %xmm4 + movss %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(X) + + addl $8 * SIZE, X + ALIGN_3 + +.L137: + testl $2, M + je .L138 + + movaps -28 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + PSHUFD2($0x1b, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movaps %xmm0, %xmm2 + movss %xmm4, %xmm0 + movaps %xmm0, -32 * SIZE(X) + movaps %xmm2, %xmm4 + movaps %xmm1, %xmm0 + + addl $4 * SIZE, X + ALIGN_3 + +.L138: + movss %xmm4, -32 * SIZE(X) + + testl $1, M + je .L999 + + PSHUFD2( $0x1b, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + PSHUFD1( $0x39, %xmm0) + + movlps %xmm0, -31 * SIZE(X) + jmp .L999 + ALIGN_3 + + +#else + + PSHUFD2($0, %xmm0, %xmm6) + PSHUFD2($0, %xmm1, %xmm1) + subps %xmm1, %xmm7 + unpcklps %xmm1, %xmm7 + + subl $-32 * SIZE, X + + testl $2 * SIZE, X + je .L130x + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(X), %xmm0 + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movlps %xmm0, -32 * SIZE(X) + addl $2 * SIZE, X + decl M + jle .L999 + ALIGN_3 + +.L130x: + movl M, I + sarl $4, I + jle .L135 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + movsd -24 * SIZE(X), %xmm2 + movhps -22 * SIZE(X), %xmm2 + movsd -20 * SIZE(X), %xmm3 + movhps -18 * SIZE(X), %xmm3 + + decl I + jle .L132 + ALIGN_4 + +.L131: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movlps %xmm0, -32 * SIZE(X) + movhps %xmm0, -30 * SIZE(X) + movsd -16 * SIZE(X), %xmm0 + movhps -14 * SIZE(X), %xmm0 + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movlps %xmm1, -28 * SIZE(X) + movhps %xmm1, -26 * SIZE(X) + movsd -12 * SIZE(X), %xmm1 + movhps -10 * SIZE(X), %xmm1 + + PSHUFD2( $0xb1, %xmm2, %xmm5) + mulps %xmm6, %xmm2 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm2 + movlps %xmm2, -24 * SIZE(X) + movhps %xmm2, -22 * SIZE(X) + movsd -8 * SIZE(X), %xmm2 + movhps -6 * SIZE(X), %xmm2 + + PSHUFD2( $0xb1, %xmm3, %xmm5) + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm3 + movlps %xmm3, -20 * SIZE(X) + movhps %xmm3, -18 * SIZE(X) + movsd -4 * SIZE(X), %xmm3 + movhps -2 * SIZE(X), %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movlps %xmm0, -16 * SIZE(X) + movhps %xmm0, -14 * SIZE(X) + movsd 0 * SIZE(X), %xmm0 + movhps 2 * SIZE(X), %xmm0 + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movlps %xmm1, -12 * SIZE(X) + movhps %xmm1, -10 * SIZE(X) + movsd 4 * SIZE(X), %xmm1 + movhps 6 * SIZE(X), %xmm1 + + PSHUFD2( $0xb1, %xmm2, %xmm5) + mulps %xmm6, %xmm2 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm2 + movlps %xmm2, -8 * SIZE(X) + movhps %xmm2, -6 * SIZE(X) + movsd 8 * SIZE(X), %xmm2 + movhps 10 * SIZE(X), %xmm2 + + PSHUFD2( $0xb1, %xmm3, %xmm5) + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm3 + movlps %xmm3, -4 * SIZE(X) + movhps %xmm3, -2 * SIZE(X) + movsd 12 * SIZE(X), %xmm3 + movhps 14 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + decl I + jg .L131 + ALIGN_4 + +.L132: + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movlps %xmm0, -32 * SIZE(X) + movhps %xmm0, -30 * SIZE(X) + movsd -16 * SIZE(X), %xmm0 + movhps -14 * SIZE(X), %xmm0 + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movlps %xmm1, -28 * SIZE(X) + movhps %xmm1, -26 * SIZE(X) + movsd -12 * SIZE(X), %xmm1 + movhps -10 * SIZE(X), %xmm1 + + PSHUFD2( $0xb1, %xmm2, %xmm5) + mulps %xmm6, %xmm2 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm2 + movlps %xmm2, -24 * SIZE(X) + movhps %xmm2, -22 * SIZE(X) + movsd -8 * SIZE(X), %xmm2 + movhps -6 * SIZE(X), %xmm2 + + PSHUFD2( $0xb1, %xmm3, %xmm5) + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm3 + movlps %xmm3, -20 * SIZE(X) + movhps %xmm3, -18 * SIZE(X) + movsd -4 * SIZE(X), %xmm3 + movhps -2 * SIZE(X), %xmm3 + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movlps %xmm0, -16 * SIZE(X) + movhps %xmm0, -14 * SIZE(X) + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movlps %xmm1, -12 * SIZE(X) + movhps %xmm1, -10 * SIZE(X) + + PSHUFD2( $0xb1, %xmm2, %xmm5) + mulps %xmm6, %xmm2 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm2 + movlps %xmm2, -8 * SIZE(X) + movhps %xmm2, -6 * SIZE(X) + + PSHUFD2( $0xb1, %xmm3, %xmm5) + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm3 + movlps %xmm3, -4 * SIZE(X) + movhps %xmm3, -2 * SIZE(X) + + subl $-32 * SIZE, X + ALIGN_4 + +.L135: + testl $8, M + je .L136 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movlps %xmm0, -32 * SIZE(X) + movhps %xmm0, -30 * SIZE(X) + + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movlps %xmm1, -28 * SIZE(X) + movhps %xmm1, -26 * SIZE(X) + + movsd -24 * SIZE(X), %xmm2 + movhps -22 * SIZE(X), %xmm2 + + PSHUFD2( $0xb1, %xmm2, %xmm5) + mulps %xmm6, %xmm2 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm2 + movlps %xmm2, -24 * SIZE(X) + movhps %xmm2, -22 * SIZE(X) + + movsd -20 * SIZE(X), %xmm3 + movhps -18 * SIZE(X), %xmm3 + + PSHUFD2( $0xb1, %xmm3, %xmm5) + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm3 + movlps %xmm3, -20 * SIZE(X) + movhps %xmm3, -18 * SIZE(X) + + addl $16 * SIZE, X + ALIGN_3 + +.L136: + testl $4, M + je .L137 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movlps %xmm0, -32 * SIZE(X) + movhps %xmm0, -30 * SIZE(X) + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movlps %xmm1, -28 * SIZE(X) + movhps %xmm1, -26 * SIZE(X) + + addl $8 * SIZE, X + ALIGN_3 + +.L137: + testl $2, M + je .L138 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movlps %xmm0, -32 * SIZE(X) + movhps %xmm0, -30 * SIZE(X) + + addl $4 * SIZE, X + ALIGN_3 + +.L138: + testl $1, M + je .L999 + + movsd -32 * SIZE(X), %xmm0 + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movlps %xmm0, -32 * SIZE(X) + ALIGN_3 +#endif + +.L999: + xorl %eax, %eax + popl %ebp + popl %ebx + popl %esi + popl %edi + + ret + + EPILOGUE diff --git a/kernel/x86/zscal_sse2.S b/kernel/x86/zscal_sse2.S new file mode 100644 index 0000000..5b1da61 --- /dev/null +++ b/kernel/x86/zscal_sse2.S @@ -0,0 +1,1745 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esp) +#define STACK_ALPHA_I 24 + STACK + ARGS(%esp) +#define STACK_X 32 + STACK + ARGS(%esp) +#define STACK_INCX 36 + STACK + ARGS(%esp) + +#define M %ebx +#define X %ecx +#define INCX %edx +#define I %esi +#define XX %edi +#define FLAG %ebp + +#include "l1param.h" + +#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) +#define USE_PSHUFD +#else +#define USE_PSHUFD_HALF +#endif + + +#define xmm8 xmm0 +#define xmm9 xmm1 +#define xmm10 xmm2 +#define xmm11 xmm3 +#define xmm12 xmm4 +#define xmm13 xmm5 +#define xmm14 xmm6 +#define xmm15 xmm7 + + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + + movsd STACK_ALPHA_R, %xmm0 + movsd STACK_ALPHA_I, %xmm1 + + sall $ZBASE_SHIFT, INCX + xor FLAG, FLAG + + testl M, M + jle .L999 + + xorps %xmm7, %xmm7 + comisd %xmm0, %xmm7 + jne .L100 + + comisd %xmm1, %xmm7 + jne .L100 + +/* Alpha == ZERO */ + cmpl $2 * SIZE, INCX + jne .L20 + +/* INCX == 1 */ + testl $SIZE, X + je .L05 + + movsd %xmm7, 0 * SIZE(X) + addl $SIZE, X + movl $1, FLAG + decl M + jle .L19 + ALIGN_3 +.L05: + + movl M, I # rcx = n + sarl $3, I + jle .L12 + ALIGN_4 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm7, 0 * SIZE(X) + movaps %xmm7, 2 * SIZE(X) + movaps %xmm7, 4 * SIZE(X) + movaps %xmm7, 6 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm7, 8 * SIZE(X) + movaps %xmm7, 10 * SIZE(X) + movaps %xmm7, 12 * SIZE(X) + movaps %xmm7, 14 * SIZE(X) + + addl $16 * SIZE, X + decl I + jg .L11 + ALIGN_4 + +.L12: + testl $4, M + je .L13 + + movaps %xmm7, 0 * SIZE(X) + movaps %xmm7, 2 * SIZE(X) + movaps %xmm7, 4 * SIZE(X) + movaps %xmm7, 6 * SIZE(X) + addl $8 * SIZE, X + ALIGN_3 + +.L13: + testl $2, M + je .L14 + + movaps %xmm7, 0 * SIZE(X) + movaps %xmm7, 2 * SIZE(X) + addl $4 * SIZE, X + ALIGN_3 + +.L14: + testl $1, M + je .L19 + movaps %xmm7, 0 * SIZE(X) + addl $2 * SIZE, X + ALIGN_3 + +.L19: + testl $1, FLAG + je .L999 + + movsd %xmm7, 0 * SIZE(X) + jmp .L999 + ALIGN_4 + +/* incx != 1 */ +.L20: + testl $SIZE, X + jne .L30 + +/* Aligned Mode */ + movl M, I # rcx = n + sarl $2, I + jle .L22 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm7, (X) + addl INCX, X + movaps %xmm7, (X) + addl INCX, X + movaps %xmm7, (X) + addl INCX, X + movaps %xmm7, (X) + addl INCX, X + decl I + jg .L21 + ALIGN_4 + +.L22: + testl $3, M + je .L999 + + testl $2, M + je .L23 + + movaps %xmm7, (X) + addl INCX, X + movaps %xmm7, (X) + addl INCX, X + ALIGN_3 + +.L23: + testl $1, M + je .L999 + + movaps %xmm7, (X) + jmp .L999 + ALIGN_4 + + +/* Unaligned Mode */ +.L30: + movl M, I # rcx = n + sarl $2, I + jle .L32 + ALIGN_4 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movlps %xmm7, 0 * SIZE(X) + movlps %xmm7, 1 * SIZE(X) + addl INCX, X + movlps %xmm7, 0 * SIZE(X) + movlps %xmm7, 1 * SIZE(X) + addl INCX, X + movlps %xmm7, 0 * SIZE(X) + movlps %xmm7, 1 * SIZE(X) + addl INCX, X + movlps %xmm7, 0 * SIZE(X) + movlps %xmm7, 1 * SIZE(X) + addl INCX, X + decl I + jg .L31 + ALIGN_4 + +.L32: + testl $3, M + je .L999 + + testl $2, M + je .L33 + + movlps %xmm7, 0 * SIZE(X) + movlps %xmm7, 1 * SIZE(X) + addl INCX, X + movlps %xmm7, 0 * SIZE(X) + movlps %xmm7, 1 * SIZE(X) + addl INCX, X + ALIGN_3 + +.L33: + testl $1, M + je .L999 + + movlps %xmm7, 0 * SIZE(X) + movlps %xmm7, 1 * SIZE(X) + jmp .L999 + ALIGN_4 + +/* Alpha != ZERO */ +.L100: + testl $SIZE, X + jne .L200 + +#ifdef HAVE_SSE3 + movddup %xmm0, %xmm6 +#else + pshufd $0x44, %xmm0, %xmm6 +#endif + + xorps %xmm7, %xmm7 + subsd %xmm1, %xmm7 + movlhps %xmm1, %xmm7 + + cmpl $2 * SIZE, INCX + jne .L120 + + subl $-16 * SIZE, X + + movl M, I + sarl $3, I + jle .L115 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + decl I + jle .L112 + ALIGN_4 + +.L111: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm0, %xmm5 +#else + movsd -15 * SIZE(X), %xmm5 + movhps -16 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(X) + movaps -8 * SIZE(X), %xmm0 + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm1, %xmm5 +#else + movsd -13 * SIZE(X), %xmm5 + movhps -14 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movaps %xmm1, -14 * SIZE(X) + movaps -6 * SIZE(X), %xmm1 + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm2, %xmm5 +#else + movsd -11 * SIZE(X), %xmm5 + movhps -12 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movaps %xmm2, -12 * SIZE(X) + movaps -4 * SIZE(X), %xmm2 + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm3, %xmm5 +#else + movsd -9 * SIZE(X), %xmm5 + movhps -10 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movaps %xmm3, -10 * SIZE(X) + movaps -2 * SIZE(X), %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm0, %xmm5 +#else + movsd -7 * SIZE(X), %xmm5 + movhps -8 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movaps %xmm0, -8 * SIZE(X) + movaps 0 * SIZE(X), %xmm0 + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm1, %xmm5 +#else + movsd -5 * SIZE(X), %xmm5 + movhps -6 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movaps %xmm1, -6 * SIZE(X) + movaps 2 * SIZE(X), %xmm1 + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm2, %xmm5 +#else + movsd -3 * SIZE(X), %xmm5 + movhps -4 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movaps %xmm2, -4 * SIZE(X) + movaps 4 * SIZE(X), %xmm2 + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm3, %xmm5 +#else + movsd -1 * SIZE(X), %xmm5 + movhps -2 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movaps %xmm3, -2 * SIZE(X) + movaps 6 * SIZE(X), %xmm3 + + subl $-16 * SIZE, X + decl I + jg .L111 + ALIGN_4 + +.L112: +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm0, %xmm5 +#else + movsd -15 * SIZE(X), %xmm5 + movhps -16 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(X) + movaps -8 * SIZE(X), %xmm0 + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm1, %xmm5 +#else + movsd -13 * SIZE(X), %xmm5 + movhps -14 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movaps %xmm1, -14 * SIZE(X) + movaps -6 * SIZE(X), %xmm1 + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm2, %xmm5 +#else + movsd -11 * SIZE(X), %xmm5 + movhps -12 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movaps %xmm2, -12 * SIZE(X) + movaps -4 * SIZE(X), %xmm2 + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm3, %xmm5 +#else + movsd -9 * SIZE(X), %xmm5 + movhps -10 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movaps %xmm3, -10 * SIZE(X) + movaps -2 * SIZE(X), %xmm3 + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm0, %xmm5 +#else + movsd -7 * SIZE(X), %xmm5 + movhps -8 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movaps %xmm0, -8 * SIZE(X) + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm1, %xmm5 +#else + movsd -5 * SIZE(X), %xmm5 + movhps -6 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movaps %xmm1, -6 * SIZE(X) + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm2, %xmm5 +#else + movsd -3 * SIZE(X), %xmm5 + movhps -4 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movaps %xmm2, -4 * SIZE(X) + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm3, %xmm5 +#else + movsd -1 * SIZE(X), %xmm5 + movhps -2 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movaps %xmm3, -2 * SIZE(X) + + subl $-16 * SIZE, X + ALIGN_3 + +.L115: + testl $7, M + je .L999 + + testl $4, M + je .L116 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(X) + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movaps %xmm1, -14 * SIZE(X) + + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movaps %xmm2, -12 * SIZE(X) + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movaps %xmm3, -10 * SIZE(X) + + addl $8 * SIZE, X + ALIGN_3 + +.L116: + testl $2, M + je .L117 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(X) + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movaps %xmm1, -14 * SIZE(X) + + addl $4 * SIZE, X + ALIGN_3 + +.L117: + testl $1, M + je .L999 + + movaps -16 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + + movaps %xmm0, -16 * SIZE(X) + jmp .L999 + ALIGN_3 + +.L120: + movl X, XX + + movl M, I + sarl $3, I + jle .L125 + + movaps (X), %xmm0 + addl INCX, X + movaps (X), %xmm1 + addl INCX, X + movaps (X), %xmm2 + addl INCX, X + movaps (X), %xmm3 + addl INCX, X + + decl I + jle .L122 + ALIGN_4 + +.L121: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movaps %xmm0, (XX) + addl INCX, XX + movaps (X), %xmm0 + addl INCX, X + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movaps %xmm1, (XX) + addl INCX, XX + movaps (X), %xmm1 + addl INCX, X + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movaps %xmm2, (XX) + addl INCX, XX + movaps (X), %xmm2 + addl INCX, X + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movaps %xmm3, (XX) + addl INCX, XX + movaps (X), %xmm3 + addl INCX, X + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movaps %xmm0, (XX) + addl INCX, XX + movaps (X), %xmm0 + addl INCX, X + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movaps %xmm1, (XX) + addl INCX, XX + movaps (X), %xmm1 + addl INCX, X + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movaps %xmm2, (XX) + addl INCX, XX + movaps (X), %xmm2 + addl INCX, X + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movaps %xmm3, (XX) + addl INCX, XX + movaps (X), %xmm3 + addl INCX, X + + decl I + jg .L121 + ALIGN_4 + +.L122: + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movaps %xmm0, (XX) + addl INCX, XX + movaps (X), %xmm0 + addl INCX, X + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movaps %xmm1, (XX) + addl INCX, XX + movaps (X), %xmm1 + addl INCX, X + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movaps %xmm2, (XX) + addl INCX, XX + movaps (X), %xmm2 + addl INCX, X + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movaps %xmm3, (XX) + addl INCX, XX + movaps (X), %xmm3 + addl INCX, X + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movaps %xmm0, (XX) + addl INCX, XX + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movaps %xmm1, (XX) + addl INCX, XX + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movaps %xmm2, (XX) + addl INCX, XX + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movaps %xmm3, (XX) + addl INCX, XX + ALIGN_3 + +.L125: + testl $7, M + je .L999 + + testl $4, M + je .L126 + + movaps (X), %xmm0 + addl INCX, X + movaps (X), %xmm1 + addl INCX, X + + movaps (X), %xmm2 + addl INCX, X + movaps (X), %xmm3 + addl INCX, X + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movaps %xmm0, (XX) + addl INCX, XX + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movaps %xmm1, (XX) + addl INCX, XX + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movaps %xmm2, (XX) + addl INCX, XX + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movaps %xmm3, (XX) + addl INCX, XX + ALIGN_3 + +.L126: + testl $2, M + je .L127 + + movaps (X), %xmm0 + addl INCX, X + movaps (X), %xmm1 + addl INCX, X + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movaps %xmm0, (XX) + addl INCX, XX + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movaps %xmm1, (XX) + addl INCX, XX + ALIGN_3 + +.L127: + testl $1, M + je .L999 + + movaps (X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + + movaps %xmm0, (XX) + jmp .L999 + ALIGN_3 + +.L200: + cmpl $2 * SIZE, INCX + jne .L220 + +#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) + +#ifdef HAVE_SSE3 + movddup %xmm0, %xmm6 +#else + pshufd $0x44, %xmm0, %xmm6 +#endif + pxor %xmm7, %xmm7 + subsd %xmm1, %xmm7 + movlhps %xmm1, %xmm7 + shufpd $1, %xmm7, %xmm7 + + movhps 0 * SIZE(X), %xmm0 + movaps 1 * SIZE(X), %xmm1 + subl $-16 * SIZE, X + + unpckhpd %xmm0, %xmm0 + mulsd %xmm6, %xmm0 + movaps %xmm1, %xmm5 + mulsd %xmm7, %xmm5 + subsd %xmm5, %xmm0 + movlps %xmm0, -16 * SIZE(X) + + decl M + + movl M, I + sarl $3, I + jle .L205 + + movaps -13 * SIZE(X), %xmm2 + movaps -11 * SIZE(X), %xmm3 + + decl I + jle .L202 + ALIGN_4 + +.L201: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, %xmm5 + SHUFPD_1 %xmm2, %xmm0 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -15 * SIZE(X) + movaps -9 * SIZE(X), %xmm0 + + movaps %xmm2, %xmm5 + SHUFPD_1 %xmm3, %xmm1 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -13 * SIZE(X) + movaps -7 * SIZE(X), %xmm1 + + movaps %xmm3, %xmm5 + SHUFPD_1 %xmm0, %xmm2 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -11 * SIZE(X) + movaps -5 * SIZE(X), %xmm2 + + movaps %xmm0, %xmm5 + SHUFPD_1 %xmm1, %xmm3 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -9 * SIZE(X) + movaps -3 * SIZE(X), %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm1, %xmm5 + SHUFPD_1 %xmm2, %xmm0 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -7 * SIZE(X) + movaps -1 * SIZE(X), %xmm0 + + movaps %xmm2, %xmm5 + SHUFPD_1 %xmm3, %xmm1 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -5 * SIZE(X) + movaps 1 * SIZE(X), %xmm1 + + movaps %xmm3, %xmm5 + SHUFPD_1 %xmm0, %xmm2 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -3 * SIZE(X) + movaps 3 * SIZE(X), %xmm2 + + movaps %xmm0, %xmm5 + SHUFPD_1 %xmm1, %xmm3 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -1 * SIZE(X) + movaps 5 * SIZE(X), %xmm3 + + subl $-16 * SIZE, X + decl I + jg .L201 + ALIGN_4 + +.L202: + movaps %xmm1, %xmm5 + SHUFPD_1 %xmm2, %xmm0 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -15 * SIZE(X) + movaps -9 * SIZE(X), %xmm0 + + movaps %xmm2, %xmm5 + SHUFPD_1 %xmm3, %xmm1 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -13 * SIZE(X) + movaps -7 * SIZE(X), %xmm1 + + movaps %xmm3, %xmm5 + SHUFPD_1 %xmm0, %xmm2 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -11 * SIZE(X) + movaps -5 * SIZE(X), %xmm2 + + movaps %xmm0, %xmm5 + SHUFPD_1 %xmm1, %xmm3 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -9 * SIZE(X) + movaps -3 * SIZE(X), %xmm3 + + movaps %xmm1, %xmm5 + SHUFPD_1 %xmm2, %xmm0 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -7 * SIZE(X) + movaps -1 * SIZE(X), %xmm0 + + movaps %xmm2, %xmm5 + SHUFPD_1 %xmm3, %xmm1 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -5 * SIZE(X) + movaps 1 * SIZE(X), %xmm1 + + movaps %xmm3, %xmm5 + SHUFPD_1 %xmm0, %xmm2 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -3 * SIZE(X) + + movaps %xmm0, %xmm5 + SHUFPD_1 %xmm1, %xmm3 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -1 * SIZE(X) + + subl $-16 * SIZE, X + ALIGN_3 + +.L205: + testl $4, M + je .L206 + + movaps -13 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm5 + SHUFPD_1 %xmm2, %xmm0 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -15 * SIZE(X) + + movaps -11 * SIZE(X), %xmm3 + + movaps %xmm2, %xmm5 + SHUFPD_1 %xmm3, %xmm1 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -13 * SIZE(X) + + movaps -9 * SIZE(X), %xmm0 + + movaps %xmm3, %xmm5 + SHUFPD_1 %xmm0, %xmm2 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -11 * SIZE(X) + + movaps -7 * SIZE(X), %xmm1 + + movaps %xmm0, %xmm5 + SHUFPD_1 %xmm1, %xmm3 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -9 * SIZE(X) + + addl $8 * SIZE, X + ALIGN_3 + +.L206: + testl $2, M + je .L207 + + movaps -13 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm5 + SHUFPD_1 %xmm2, %xmm0 + + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -15 * SIZE(X) + + movaps -11 * SIZE(X), %xmm3 + + movaps %xmm2, %xmm5 + SHUFPD_1 %xmm3, %xmm1 + + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -13 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addl $4 * SIZE, X + ALIGN_3 + +.L207: + testl $1, M + je .L208 + + movaps -13 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm5 + SHUFPD_1 %xmm2, %xmm0 + + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -15 * SIZE(X) + + movaps %xmm1, %xmm0 + movaps %xmm2, %xmm1 + addl $2 * SIZE, X + ALIGN_3 + +.L208: + unpckhpd %xmm0, %xmm0 + mulsd %xmm6, %xmm1 + mulsd %xmm7, %xmm0 + addsd %xmm1, %xmm0 + movlps %xmm0, -15 * SIZE(X) + jmp .L999 + ALIGN_3 + +#else + +#ifdef HAVE_SSE3 + movddup %xmm0, %xmm6 +#else + pshufd $0x44, %xmm0, %xmm6 +#endif + pxor %xmm7, %xmm7 + subsd %xmm1, %xmm7 + movlhps %xmm1, %xmm7 + + subl $-16 * SIZE, X + + movl M, I + sarl $3, I + jle .L205 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + movsd -12 * SIZE(X), %xmm2 + movhps -11 * SIZE(X), %xmm2 + movsd -10 * SIZE(X), %xmm3 + movhps -9 * SIZE(X), %xmm3 + + decl I + jle .L202 + ALIGN_4 + +.L201: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movlps %xmm0, -16 * SIZE(X) + movhps %xmm0, -15 * SIZE(X) + movsd -8 * SIZE(X), %xmm0 + movhps -7 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movlps %xmm1, -14 * SIZE(X) + movhps %xmm1, -13 * SIZE(X) + movsd -6 * SIZE(X), %xmm1 + movhps -5 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movlps %xmm2, -12 * SIZE(X) + movhps %xmm2, -11 * SIZE(X) + movsd -4 * SIZE(X), %xmm2 + movhps -3 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movlps %xmm3, -10 * SIZE(X) + movhps %xmm3, -9 * SIZE(X) + movsd -2 * SIZE(X), %xmm3 + movhps -1 * SIZE(X), %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movlps %xmm0, -8 * SIZE(X) + movhps %xmm0, -7 * SIZE(X) + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movlps %xmm1, -6 * SIZE(X) + movhps %xmm1, -5 * SIZE(X) + movsd 2 * SIZE(X), %xmm1 + movhps 3 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movlps %xmm2, -4 * SIZE(X) + movhps %xmm2, -3 * SIZE(X) + movsd 4 * SIZE(X), %xmm2 + movhps 5 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movlps %xmm3, -2 * SIZE(X) + movhps %xmm3, -1 * SIZE(X) + movsd 6 * SIZE(X), %xmm3 + movhps 7 * SIZE(X), %xmm3 + + subl $-16 * SIZE, X + decl I + jg .L201 + ALIGN_4 + +.L202: + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movlps %xmm0, -16 * SIZE(X) + movhps %xmm0, -15 * SIZE(X) + movsd -8 * SIZE(X), %xmm0 + movhps -7 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movlps %xmm1, -14 * SIZE(X) + movhps %xmm1, -13 * SIZE(X) + movsd -6 * SIZE(X), %xmm1 + movhps -5 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movlps %xmm2, -12 * SIZE(X) + movhps %xmm2, -11 * SIZE(X) + movsd -4 * SIZE(X), %xmm2 + movhps -3 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movlps %xmm3, -10 * SIZE(X) + movhps %xmm3, -9 * SIZE(X) + movsd -2 * SIZE(X), %xmm3 + movhps -1 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movlps %xmm0, -8 * SIZE(X) + movhps %xmm0, -7 * SIZE(X) + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movlps %xmm1, -6 * SIZE(X) + movhps %xmm1, -5 * SIZE(X) + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movlps %xmm2, -4 * SIZE(X) + movhps %xmm2, -3 * SIZE(X) + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movlps %xmm3, -2 * SIZE(X) + movhps %xmm3, -1 * SIZE(X) + + subl $-16 * SIZE, X + ALIGN_3 + +.L205: + testl $7, M + je .L999 + + testl $4, M + je .L206 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movlps %xmm0, -16 * SIZE(X) + movhps %xmm0, -15 * SIZE(X) + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movlps %xmm1, -14 * SIZE(X) + movhps %xmm1, -13 * SIZE(X) + + movsd -12 * SIZE(X), %xmm2 + movhps -11 * SIZE(X), %xmm2 + movsd -10 * SIZE(X), %xmm3 + movhps -9 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movlps %xmm2, -12 * SIZE(X) + movhps %xmm2, -11 * SIZE(X) + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movlps %xmm3, -10 * SIZE(X) + movhps %xmm3, -9 * SIZE(X) + + addl $8 * SIZE, X + ALIGN_3 + +.L206: + testl $2, M + je .L207 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movlps %xmm0, -16 * SIZE(X) + movhps %xmm0, -15 * SIZE(X) + + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movlps %xmm1, -14 * SIZE(X) + movhps %xmm1, -13 * SIZE(X) + + addl $4 * SIZE, X + ALIGN_3 + +.L207: + testl $1, M + je .L999 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + + movlps %xmm0, -16 * SIZE(X) + movhps %xmm0, -15 * SIZE(X) + jmp .L999 + ALIGN_3 + +#endif + +.L220: +#ifdef HAVE_SSE3 + movddup %xmm0, %xmm6 +#else + pshufd $0x44, %xmm0, %xmm6 +#endif + pxor %xmm7, %xmm7 + subsd %xmm1, %xmm7 + movlhps %xmm1, %xmm7 + + movl X, XX + + movl M, I + sarl $3, I + jle .L225 + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + addl INCX, X + movsd 0 * SIZE(X), %xmm1 + movhps 1 * SIZE(X), %xmm1 + addl INCX, X + movsd 0 * SIZE(X), %xmm2 + movhps 1 * SIZE(X), %xmm2 + addl INCX, X + movsd 0 * SIZE(X), %xmm3 + movhps 1 * SIZE(X), %xmm3 + addl INCX, X + + decl I + jle .L222 + ALIGN_4 + +.L221: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movlps %xmm0, 0 * SIZE(XX) + movhps %xmm0, 1 * SIZE(XX) + addl INCX, XX + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + addl INCX, X + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movlps %xmm1, 0 * SIZE(XX) + movhps %xmm1, 1 * SIZE(XX) + addl INCX, XX + movsd 0 * SIZE(X), %xmm1 + movhps 1 * SIZE(X), %xmm1 + addl INCX, X + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movlps %xmm2, 0 * SIZE(XX) + movhps %xmm2, 1 * SIZE(XX) + addl INCX, XX + movsd 0 * SIZE(X), %xmm2 + movhps 1 * SIZE(X), %xmm2 + addl INCX, X + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movlps %xmm3, 0 * SIZE(XX) + movhps %xmm3, 1 * SIZE(XX) + addl INCX, XX + movsd 0 * SIZE(X), %xmm3 + movhps 1 * SIZE(X), %xmm3 + addl INCX, X + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movlps %xmm0, 0 * SIZE(XX) + movhps %xmm0, 1 * SIZE(XX) + addl INCX, XX + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + addl INCX, X + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movlps %xmm1, 0 * SIZE(XX) + movhps %xmm1, 1 * SIZE(XX) + addl INCX, XX + movsd 0 * SIZE(X), %xmm1 + movhps 1 * SIZE(X), %xmm1 + addl INCX, X + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movlps %xmm2, 0 * SIZE(XX) + movhps %xmm2, 1 * SIZE(XX) + addl INCX, XX + movsd 0 * SIZE(X), %xmm2 + movhps 1 * SIZE(X), %xmm2 + addl INCX, X + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movlps %xmm3, 0 * SIZE(XX) + movhps %xmm3, 1 * SIZE(XX) + addl INCX, XX + movsd 0 * SIZE(X), %xmm3 + movhps 1 * SIZE(X), %xmm3 + addl INCX, X + + decl I + jg .L221 + ALIGN_4 + +.L222: + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movlps %xmm0, 0 * SIZE(XX) + movhps %xmm0, 1 * SIZE(XX) + addl INCX, XX + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + addl INCX, X + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movlps %xmm1, 0 * SIZE(XX) + movhps %xmm1, 1 * SIZE(XX) + addl INCX, XX + movsd 0 * SIZE(X), %xmm1 + movhps 1 * SIZE(X), %xmm1 + addl INCX, X + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movlps %xmm2, 0 * SIZE(XX) + movhps %xmm2, 1 * SIZE(XX) + addl INCX, XX + movsd 0 * SIZE(X), %xmm2 + movhps 1 * SIZE(X), %xmm2 + addl INCX, X + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movlps %xmm3, 0 * SIZE(XX) + movhps %xmm3, 1 * SIZE(XX) + addl INCX, XX + movsd 0 * SIZE(X), %xmm3 + movhps 1 * SIZE(X), %xmm3 + addl INCX, X + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movlps %xmm0, 0 * SIZE(XX) + movhps %xmm0, 1 * SIZE(XX) + addl INCX, XX + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movlps %xmm1, 0 * SIZE(XX) + movhps %xmm1, 1 * SIZE(XX) + addl INCX, XX + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movlps %xmm2, 0 * SIZE(XX) + movhps %xmm2, 1 * SIZE(XX) + addl INCX, XX + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movlps %xmm3, 0 * SIZE(XX) + movhps %xmm3, 1 * SIZE(XX) + addl INCX, XX + ALIGN_3 + +.L225: + testl $7, M + je .L999 + + testl $4, M + je .L226 + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + addl INCX, X + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movlps %xmm0, 0 * SIZE(XX) + movhps %xmm0, 1 * SIZE(XX) + addl INCX, XX + + movsd 0 * SIZE(X), %xmm1 + movhps 1 * SIZE(X), %xmm1 + addl INCX, X + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movlps %xmm1, 0 * SIZE(XX) + movhps %xmm1, 1 * SIZE(XX) + addl INCX, XX + + movsd 0 * SIZE(X), %xmm2 + movhps 1 * SIZE(X), %xmm2 + addl INCX, X + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movlps %xmm2, 0 * SIZE(XX) + movhps %xmm2, 1 * SIZE(XX) + addl INCX, XX + + movsd 0 * SIZE(X), %xmm3 + movhps 1 * SIZE(X), %xmm3 + addl INCX, X + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movlps %xmm3, 0 * SIZE(XX) + movhps %xmm3, 1 * SIZE(XX) + addl INCX, XX + ALIGN_3 + +.L226: + testl $2, M + je .L227 + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + addl INCX, X + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movlps %xmm0, 0 * SIZE(XX) + movhps %xmm0, 1 * SIZE(XX) + addl INCX, XX + + movsd 0 * SIZE(X), %xmm1 + movhps 1 * SIZE(X), %xmm1 + addl INCX, X + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movlps %xmm1, 0 * SIZE(XX) + movhps %xmm1, 1 * SIZE(XX) + addl INCX, XX + ALIGN_3 + +.L227: + testl $1, M + je .L999 + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + + movlps %xmm0, 0 * SIZE(XX) + movhps %xmm0, 1 * SIZE(XX) + ALIGN_3 + +.L999: + xorl %eax, %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE + diff --git a/kernel/x86/zswap.S b/kernel/x86/zswap.S new file mode 100644 index 0000000..ca4660f --- /dev/null +++ b/kernel/x86/zswap.S @@ -0,0 +1,248 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define N 4 + STACK + ARGS(%esp) +#ifdef XDOUBLE +#define X 48 + STACK + ARGS(%esp) +#define INCX 52 + STACK + ARGS(%esp) +#define Y 56 + STACK + ARGS(%esp) +#define INCY 60 + STACK + ARGS(%esp) +#elif defined(DOUBLE) +#define X 32 + STACK + ARGS(%esp) +#define INCX 36 + STACK + ARGS(%esp) +#define Y 40 + STACK + ARGS(%esp) +#define INCY 44 + STACK + ARGS(%esp) +#else +#define X 24 + STACK + ARGS(%esp) +#define INCX 28 + STACK + ARGS(%esp) +#define Y 32 + STACK + ARGS(%esp) +#define INCY 36 + STACK + ARGS(%esp) +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl N, %edx + movl X, %esi + movl Y, %edi + movl INCX, %ebx + movl INCY, %ecx + + sall $ZBASE_SHIFT, %ebx + sall $ZBASE_SHIFT, %ecx + + cmpl $2 * SIZE, %ebx + jne .L14 + cmpl $2 * SIZE, %ecx + jne .L14 + + movl %edx, %eax + sarl $1, %eax + jle .L15 + ALIGN_3 + +.L16: +#if defined(DOUBLE) || defined(XDOUBLE) + FLD 3 * SIZE(%esi) + FLD 2 * SIZE(%esi) + FLD 1 * SIZE(%esi) + FLD 0 * SIZE(%esi) + FLD 3 * SIZE(%edi) + FLD 2 * SIZE(%edi) + FLD 1 * SIZE(%edi) + FLD 0 * SIZE(%edi) + + FST 0 * SIZE(%esi) + FST 1 * SIZE(%esi) + FST 2 * SIZE(%esi) + FST 3 * SIZE(%esi) + FST 0 * SIZE(%edi) + FST 1 * SIZE(%edi) + FST 2 * SIZE(%edi) + FST 3 * SIZE(%edi) +#else + fldl 2 * SIZE(%esi) + fldl 0 * SIZE(%esi) + fldl 2 * SIZE(%edi) + fldl 0 * SIZE(%edi) + + fstpl 0 * SIZE(%esi) + fstpl 2 * SIZE(%esi) + fstpl 0 * SIZE(%edi) + fstpl 2 * SIZE(%edi) +#endif + addl $4 * SIZE, %esi + addl $4 * SIZE, %edi + decl %eax + jg .L16 + ALIGN_3 + +.L15: + movl %edx, %eax + andl $1, %eax + jle .L27 + ALIGN_3 + +.L22: +#if defined(DOUBLE) || defined(XDOUBLE) + FLD 1 * SIZE(%esi) + FLD 0 * SIZE(%esi) + FLD 1 * SIZE(%edi) + FLD 0 * SIZE(%edi) + FST 0 * SIZE(%esi) + FST 1 * SIZE(%esi) + FST 0 * SIZE(%edi) + FST 1 * SIZE(%edi) +#else + fldl 0 * SIZE(%esi) + fldl 0 * SIZE(%edi) + fstpl 0 * SIZE(%esi) + fstpl 0 * SIZE(%edi) +#endif + + jmp .L27 + ALIGN_3 + +/* INCX != 1 or INCY != 1 */ + +.L14: + movl %edx, %eax + sarl $1, %eax + jle .L28 + ALIGN_2 + +.L29: +#if defined(DOUBLE) || defined(XDOUBLE) + FLD 1 * SIZE(%esi) + FLD 0 * SIZE(%esi) + addl %ebx, %esi + FLD 1 * SIZE(%esi) + FLD 0 * SIZE(%esi) + + FLD 1 * SIZE(%edi) + FLD 0 * SIZE(%edi) + addl %ecx, %edi + FLD 1 * SIZE(%edi) + FLD 0 * SIZE(%edi) + + FST 0 * SIZE(%esi) + FST 1 * SIZE(%esi) + subl %ebx, %esi + FST 0 * SIZE(%esi) + FST 1 * SIZE(%esi) + leal (%esi, %ebx, 2), %esi + + FST 0 * SIZE(%edi) + FST 1 * SIZE(%edi) + subl %ecx, %edi + FST 0 * SIZE(%edi) + FST 1 * SIZE(%edi) + leal (%edi, %ecx, 2), %edi +#else + fldl 0 * SIZE(%esi) + addl %ebx, %esi + fldl 0 * SIZE(%esi) + + fldl 0 * SIZE(%edi) + addl %ecx, %edi + fldl 0 * SIZE(%edi) + + fstpl 0 * SIZE(%esi) + subl %ebx, %esi + fstpl 0 * SIZE(%esi) + leal (%esi, %ebx, 2), %esi + + fstpl 0 * SIZE(%edi) + subl %ecx, %edi + fstpl 0 * SIZE(%edi) + leal (%edi, %ecx, 2), %edi +#endif + + decl %eax + jg .L29 + ALIGN_3 + +.L28: + movl %edx, %eax + andl $1, %eax + jle .L27 + ALIGN_3 + +.L35: +#if defined(DOUBLE) || defined(XDOUBLE) + FLD 1 * SIZE(%esi) + FLD 0 * SIZE(%esi) + FLD 1 * SIZE(%edi) + FLD 0 * SIZE(%edi) + FST 0 * SIZE(%esi) + FST 1 * SIZE(%esi) + FST 0 * SIZE(%edi) + FST 1 * SIZE(%edi) +#else + fldl 0 * SIZE(%esi) + fldl 0 * SIZE(%edi) + fstpl 0 * SIZE(%esi) + fstpl 0 * SIZE(%edi) +#endif + ALIGN_3 + +.L27: + xorl %eax,%eax + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zswap_sse.S b/kernel/x86/zswap_sse.S new file mode 100644 index 0000000..24d0001 --- /dev/null +++ b/kernel/x86/zswap_sse.S @@ -0,0 +1,1112 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 24 + STACK + ARGS(%esp) +#define STACK_INCX 28 + STACK + ARGS(%esp) +#define STACK_Y 32 + STACK + ARGS(%esp) +#define STACK_INCY 36 + STACK + ARGS(%esp) + +#define M %edx +#define X %esi +#define Y %edi +#define INCX %ebx +#define INCY %ecx + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_Y, Y + movl STACK_INCX, INCX + movl STACK_INCY, INCY + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + + testl M, M + jle .L19 + + cmpl $2 * SIZE, INCX + jne .L50 + cmpl $2 * SIZE, INCY + jne .L50 + + addl M, M + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + cmpl $3, M + jle .L16 + + testl $SIZE, Y + je .L05 + + movss -32 * SIZE(X), %xmm0 + movss -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + movss %xmm0, -32 * SIZE(Y) + + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl M + ALIGN_3 + +.L05: + testl $2 * SIZE, Y + je .L10 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm1 + + movlps %xmm1, -32 * SIZE(X) + movlps %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + subl $2, M + jle .L19 + ALIGN_3 + +.L10: + cmpl $3, M + jle .L16 + + testl $2 * SIZE, X + jne .L30 + + testl $1 * SIZE, X + jne .L20 + + movl M, %eax + sarl $5, %eax + jle .L13 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + movaps -28 * SIZE(X), %xmm0 + movaps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -28 * SIZE(Y) + movaps %xmm1, -28 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -24 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movaps %xmm0, -24 * SIZE(Y) + movaps %xmm1, -24 * SIZE(X) + + movaps -20 * SIZE(X), %xmm0 + movaps -20 * SIZE(Y), %xmm1 + + movaps %xmm0, -20 * SIZE(Y) + movaps %xmm1, -20 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -8 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movaps %xmm0, -8 * SIZE(Y) + movaps %xmm1, -8 * SIZE(X) + + movaps -4 * SIZE(X), %xmm0 + movaps -4 * SIZE(Y), %xmm1 + + movaps %xmm0, -4 * SIZE(Y) + movaps %xmm1, -4 * SIZE(X) + + subl $-32 * SIZE, Y + subl $-32 * SIZE, X + + decl %eax + jg .L11 + ALIGN_3 + +.L13: + testl $16, M + jle .L14 + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + movaps -28 * SIZE(X), %xmm0 + movaps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -28 * SIZE(Y) + movaps %xmm1, -28 * SIZE(X) + + movaps -24 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movaps %xmm0, -24 * SIZE(Y) + movaps %xmm1, -24 * SIZE(X) + + movaps -20 * SIZE(X), %xmm0 + movaps -20 * SIZE(Y), %xmm1 + + movaps %xmm0, -20 * SIZE(Y) + movaps %xmm1, -20 * SIZE(X) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L14: + testl $8, M + jle .L15 + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + movaps -28 * SIZE(X), %xmm0 + movaps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -28 * SIZE(Y) + movaps %xmm1, -28 * SIZE(X) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L15: + testl $4, M + jle .L16 + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L16: + testl $2, M + jle .L17 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm1 + + movlps %xmm1, -32 * SIZE(X) + addl $2 * SIZE, X + movlps %xmm0, -32 * SIZE(Y) + addl $2 * SIZE, Y + ALIGN_3 + +.L17: + testl $1, M + jle .L19 + + movss -32 * SIZE(X), %xmm0 + movss -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + movss %xmm0, -32 * SIZE(Y) + ALIGN_3 + +.L19: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L20: + movaps -33 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + PSHUFD2($0x39, %xmm1, %xmm3) + movlps %xmm3, -31 * SIZE(X) + + subl $3, M + + movl M, %eax + sarl $5, %eax + jle .L23 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps -25 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -25 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -21 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -21 * SIZE(X) + + movaps -17 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -17 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -13 * SIZE(X), %xmm2 + movaps -12 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -13 * SIZE(X) + + movaps -9 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -5 * SIZE(X), %xmm2 + movaps -4 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -5 * SIZE(X) + + movaps -1 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -1 * SIZE(X) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L21 + ALIGN_3 + +.L23: + testl $16, M + jle .L24 + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps -25 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -25 * SIZE(X) + + movaps -21 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -21 * SIZE(X) + + movaps -17 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -17 * SIZE(X) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L24: + testl $8, M + jle .L25 + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps -25 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -25 * SIZE(X) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L25: + testl $4, M + jle .L26 + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L26: + PSHUFD2($0x39, %xmm0, %xmm2) + PSHUFD1($0xff, %xmm0) + + movlps %xmm2, -32 * SIZE(Y) + movss %xmm0, -30 * SIZE(Y) + + testl $2, M + jle .L27 + + movsd -29 * SIZE(X), %xmm0 + movsd -29 * SIZE(Y), %xmm1 + + movlps %xmm0, -29 * SIZE(Y) + movlps %xmm1, -29 * SIZE(X) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L27: + testl $1, M + jle .L29 + + movss -29 * SIZE(X), %xmm0 + movss -29 * SIZE(Y), %xmm1 + + movss %xmm0, -29 * SIZE(Y) + movss %xmm1, -29 * SIZE(X) + ALIGN_3 + +.L29: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L30: + testl $1 * SIZE, X + jne .L40 + + movhps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movlps %xmm1, -32 * SIZE(X) + subl $2, M + + movl M, %eax + sarl $5, %eax + jle .L33 + ALIGN_4 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps -26 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -26 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -22 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -22 * SIZE(X) + + movaps -18 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -18 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -14 * SIZE(X), %xmm2 + movaps -12 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -14 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -10 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -6 * SIZE(X), %xmm2 + movaps -4 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -6 * SIZE(X) + + movaps -2 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -2 * SIZE(X) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L31 + ALIGN_3 + +.L33: + testl $16, M + jle .L34 + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps -26 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -26 * SIZE(X) + + movaps -22 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -22 * SIZE(X) + + movaps -18 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -18 * SIZE(X) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L34: + testl $8, M + jle .L35 + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps -26 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -26 * SIZE(X) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L35: + testl $4, M + jle .L36 + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L36: + movhps %xmm0, -32 * SIZE(Y) + + testl $2, M + jle .L37 + + movsd -30 * SIZE(X), %xmm0 + movsd -30 * SIZE(Y), %xmm1 + + movlps %xmm0, -30 * SIZE(Y) + movlps %xmm1, -30 * SIZE(X) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L37: + testl $1, M + jle .L39 + + movss -30 * SIZE(X), %xmm0 + movss -30 * SIZE(Y), %xmm1 + + movss %xmm0, -30 * SIZE(Y) + movss %xmm1, -30 * SIZE(X) + ALIGN_3 + +.L39: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L40: + movaps -35 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + + subl $3, M + + movl M, %eax + sarl $5, %eax + jle .L43 + ALIGN_4 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps -27 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -27 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -23 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -23 * SIZE(X) + + movaps -19 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -19 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -15 * SIZE(X), %xmm2 + movaps -12 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -11 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -11 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -7 * SIZE(X), %xmm2 + movaps -4 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -7 * SIZE(X) + + movaps -3 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -3 * SIZE(X) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L41 + ALIGN_3 + +.L43: + testl $16, M + jle .L44 + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps -27 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -27 * SIZE(X) + + movaps -23 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -23 * SIZE(X) + + movaps -19 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -19 * SIZE(X) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L44: + testl $8, M + jle .L45 + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps -27 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -27 * SIZE(X) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L45: + testl $4, M + jle .L46 + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L46: + movsd -31 * SIZE(X), %xmm2 + + PSHUFD2($0x39, %xmm1, %xmm1) + movlps %xmm1, -31 * SIZE(X) + + PSHUFD1($0xff, %xmm0) + + movss %xmm0, -32 * SIZE(Y) + movlps %xmm2, -31 * SIZE(Y) + + addl $3 * SIZE, X + addl $3 * SIZE, Y + + testl $2, M + jle .L47 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm1 + + movlps %xmm0, -32 * SIZE(Y) + movlps %xmm1, -32 * SIZE(X) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L47: + testl $1, M + jle .L49 + + movss -32 * SIZE(X), %xmm0 + movss -32 * SIZE(Y), %xmm1 + + movss %xmm0, -32 * SIZE(Y) + movss %xmm1, -32 * SIZE(X) + ALIGN_3 + +.L49: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L50: + movl M, %eax + sarl $2, %eax + jle .L55 + ALIGN_3 + +.L51: + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movlps %xmm1, (X) + addl INCX, X + movlps %xmm0, (Y) + addl INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movlps %xmm1, (X) + addl INCX, X + movlps %xmm0, (Y) + addl INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movlps %xmm1, (X) + addl INCX, X + movlps %xmm0, (Y) + addl INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movlps %xmm1, (X) + addl INCX, X + movlps %xmm0, (Y) + addl INCY, Y + + decl %eax + jg .L51 + ALIGN_3 + +.L55: + movl M, %eax + andl $3, %eax + jle .L57 + ALIGN_3 + +.L56: + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movlps %xmm1, (X) + addl INCX, X + movlps %xmm0, (Y) + addl INCY, Y + + decl %eax + jg .L56 + ALIGN_3 + +.L57: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zswap_sse2.S b/kernel/x86/zswap_sse2.S new file mode 100644 index 0000000..d900ea5 --- /dev/null +++ b/kernel/x86/zswap_sse2.S @@ -0,0 +1,978 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 32 + STACK + ARGS(%esp) +#define STACK_INCX 36 + STACK + ARGS(%esp) +#define STACK_Y 40 + STACK + ARGS(%esp) +#define STACK_INCY 44 + STACK + ARGS(%esp) + +#define M %edx +#define X %esi +#define Y %edi +#define INCX %ebx +#define INCY %ecx + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_Y, Y + movl STACK_INCX, INCX + movl STACK_INCY, INCY + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + + testl M, M + jle .L19 + + cmpl $2 * SIZE, INCX + jne .L50 + cmpl $2 * SIZE, INCY + jne .L50 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + testl $SIZE, Y + jne .L30 + + testl $SIZE, X + jne .L20 + + movl M, %eax + sarl $3, %eax + jle .L13 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -10 * SIZE(Y), %xmm1 + + movaps %xmm0, -10 * SIZE(Y) + movaps %xmm1, -10 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -8 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movaps %xmm0, -8 * SIZE(Y) + movaps %xmm1, -8 * SIZE(X) + + movaps -6 * SIZE(X), %xmm0 + movaps -6 * SIZE(Y), %xmm1 + + movaps %xmm0, -6 * SIZE(Y) + movaps %xmm1, -6 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -4 * SIZE(X), %xmm0 + movaps -4 * SIZE(Y), %xmm1 + + movaps %xmm0, -4 * SIZE(Y) + movaps %xmm1, -4 * SIZE(X) + + movaps -2 * SIZE(X), %xmm0 + movaps -2 * SIZE(Y), %xmm1 + + movaps %xmm0, -2 * SIZE(Y) + movaps %xmm1, -2 * SIZE(X) + + subl $-16 * SIZE, Y + subl $-16 * SIZE, X + + decl %eax + jg .L11 + ALIGN_3 + +.L13: + testl $4, M + jle .L14 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -10 * SIZE(Y), %xmm1 + + movaps %xmm0, -10 * SIZE(Y) + movaps %xmm1, -10 * SIZE(X) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L14: + testl $2, M + jle .L15 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L15: + testl $1, M + jle .L19 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L19: + xorl %eax,%eax + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L20: + movhps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movlps %xmm1, -16 * SIZE(X) + decl M + jle .L29 + + movl M, %eax + sarl $3, %eax + jle .L23 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -13 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -11 * SIZE(X), %xmm2 + movaps -10 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -12 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -11 * SIZE(X) + + movaps -9 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -10 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -7 * SIZE(X), %xmm2 + movaps -6 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -7 * SIZE(X) + + movaps -5 * SIZE(X), %xmm0 + movaps -4 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -6 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -5 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -3 * SIZE(X), %xmm2 + movaps -2 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -4 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -3 * SIZE(X) + + movaps -1 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -2 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -1 * SIZE(X) + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + decl %eax + jg .L21 + ALIGN_3 + +.L23: + testl $4, M + jle .L24 + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -13 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(X) + + movaps -11 * SIZE(X), %xmm2 + movaps -10 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -12 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -11 * SIZE(X) + + movaps -9 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -10 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(X) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L24: + testl $2, M + jle .L25 + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -13 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(X) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L25: + testl $1, M + jle .L29 + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L29: + movaps -15 * SIZE(X), %xmm2 + + movhps %xmm1, -15 * SIZE(X) + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L30: + testl $SIZE, X + jne .L40 + + movhps -16 * SIZE(Y), %xmm0 + movaps -16 * SIZE(X), %xmm1 + + movlps %xmm1, -16 * SIZE(Y) + decl M + jle .L39 + + movl M, %eax + sarl $3, %eax + jle .L33 + ALIGN_4 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -15 * SIZE(Y), %xmm2 + movaps -14 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(X) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(Y) + + movaps -13 * SIZE(Y), %xmm0 + movaps -12 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(X) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -11 * SIZE(Y), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -12 * SIZE(X) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -11 * SIZE(Y) + + movaps -9 * SIZE(Y), %xmm0 + movaps -8 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -10 * SIZE(X) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -7 * SIZE(Y), %xmm2 + movaps -6 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -8 * SIZE(X) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -7 * SIZE(Y) + + movaps -5 * SIZE(Y), %xmm0 + movaps -4 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -6 * SIZE(X) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -5 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -3 * SIZE(Y), %xmm2 + movaps -2 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -4 * SIZE(X) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -3 * SIZE(Y) + + movaps -1 * SIZE(Y), %xmm0 + movaps 0 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -2 * SIZE(X) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -1 * SIZE(Y) + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + decl %eax + jg .L31 + ALIGN_3 + +.L33: + testl $4, M + jle .L34 + + movaps -15 * SIZE(Y), %xmm2 + movaps -14 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(X) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(Y) + + movaps -13 * SIZE(Y), %xmm0 + movaps -12 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(X) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(Y) + + movaps -11 * SIZE(Y), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -12 * SIZE(X) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -11 * SIZE(Y) + + movaps -9 * SIZE(Y), %xmm0 + movaps -8 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -10 * SIZE(X) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L34: + testl $2, M + jle .L35 + + movaps -15 * SIZE(Y), %xmm2 + movaps -14 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(X) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(Y) + + movaps -13 * SIZE(Y), %xmm0 + movaps -12 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(X) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L35: + testl $1, M + jle .L39 + + movaps -15 * SIZE(Y), %xmm2 + movaps -14 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(Y) + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L39: + movaps -15 * SIZE(Y), %xmm2 + + movhps %xmm1, -15 * SIZE(Y) + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(X) + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L40: + movsd -16 * SIZE(X), %xmm0 + movsd -16 * SIZE(Y), %xmm1 + + movlps %xmm0, -16 * SIZE(Y) + movlps %xmm1, -16 * SIZE(X) + + addl $SIZE, X + addl $SIZE, Y + decl M + jle .L49 + + movl M, %eax + sarl $3, %eax + jle .L43 + ALIGN_3 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -10 * SIZE(Y), %xmm1 + + movaps %xmm0, -10 * SIZE(Y) + movaps %xmm1, -10 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -8 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movaps %xmm0, -8 * SIZE(Y) + movaps %xmm1, -8 * SIZE(X) + + movaps -6 * SIZE(X), %xmm0 + movaps -6 * SIZE(Y), %xmm1 + + movaps %xmm0, -6 * SIZE(Y) + movaps %xmm1, -6 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -4 * SIZE(X), %xmm0 + movaps -4 * SIZE(Y), %xmm1 + + movaps %xmm0, -4 * SIZE(Y) + movaps %xmm1, -4 * SIZE(X) + + movaps -2 * SIZE(X), %xmm0 + movaps -2 * SIZE(Y), %xmm1 + + movaps %xmm0, -2 * SIZE(Y) + movaps %xmm1, -2 * SIZE(X) + + subl $-16 * SIZE, Y + subl $-16 * SIZE, X + + decl %eax + jg .L41 + ALIGN_3 + +.L43: + testl $4, M + jle .L44 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -10 * SIZE(Y), %xmm1 + + movaps %xmm0, -10 * SIZE(Y) + movaps %xmm1, -10 * SIZE(X) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L44: + testl $2, M + jle .L45 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L45: + testl $1, M + jle .L49 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L49: + movsd -16 * SIZE(X), %xmm0 + movsd -16 * SIZE(Y), %xmm1 + + movlps %xmm0, -16 * SIZE(Y) + movlps %xmm1, -16 * SIZE(X) + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L50: + testl $SIZE, X + jne .L60 + testl $SIZE, Y + jne .L60 + + movl M, %eax + sarl $2, %eax + jle .L55 + ALIGN_3 + +.L51: + movaps (X), %xmm0 + movaps (Y), %xmm1 + + movaps %xmm1, (X) + addl INCX, X + movaps %xmm0, (Y) + addl INCY, Y + + movaps (X), %xmm0 + movaps (Y), %xmm1 + + movaps %xmm1, (X) + addl INCX, X + movaps %xmm0, (Y) + addl INCY, Y + + movaps (X), %xmm0 + movaps (Y), %xmm1 + + movaps %xmm1, (X) + addl INCX, X + movaps %xmm0, (Y) + addl INCY, Y + + movaps (X), %xmm0 + movaps (Y), %xmm1 + + movaps %xmm1, (X) + addl INCX, X + movaps %xmm0, (Y) + addl INCY, Y + + decl %eax + jg .L51 + ALIGN_3 + +.L55: + movl M, %eax + andl $3, %eax + jle .L57 + ALIGN_3 + +.L56: + movaps (X), %xmm0 + movaps (Y), %xmm1 + + movaps %xmm1, (X) + addl INCX, X + movaps %xmm0, (Y) + addl INCY, Y + + decl %eax + jg .L56 + ALIGN_3 + +.L57: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L60: + movl M, %eax + sarl $2, %eax + jle .L65 + ALIGN_3 + +.L61: + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + + movlps %xmm1, 0 * SIZE(X) + movhps %xmm1, 1 * SIZE(X) + addl INCX, X + movlps %xmm0, 0 * SIZE(Y) + movhps %xmm0, 1 * SIZE(Y) + addl INCY, Y + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + + movlps %xmm1, 0 * SIZE(X) + movhps %xmm1, 1 * SIZE(X) + addl INCX, X + movlps %xmm0, 0 * SIZE(Y) + movhps %xmm0, 1 * SIZE(Y) + addl INCY, Y + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + + movlps %xmm1, 0 * SIZE(X) + movhps %xmm1, 1 * SIZE(X) + addl INCX, X + movlps %xmm0, 0 * SIZE(Y) + movhps %xmm0, 1 * SIZE(Y) + addl INCY, Y + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + + movlps %xmm1, 0 * SIZE(X) + movhps %xmm1, 1 * SIZE(X) + addl INCX, X + movlps %xmm0, 0 * SIZE(Y) + movhps %xmm0, 1 * SIZE(Y) + addl INCY, Y + + decl %eax + jg .L61 + ALIGN_3 + +.L65: + movl M, %eax + andl $3, %eax + jle .L67 + ALIGN_3 + +.L66: + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + + movlps %xmm1, 0 * SIZE(X) + movhps %xmm1, 1 * SIZE(X) + addl INCX, X + movlps %xmm0, 0 * SIZE(Y) + movhps %xmm0, 1 * SIZE(Y) + addl INCY, Y + + decl %eax + jg .L66 + ALIGN_3 + +.L67: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LN_2x1_core2.S b/kernel/x86/ztrsm_kernel_LN_2x1_core2.S new file mode 100644 index 0000000..1d3107a --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LN_2x1_core2.S @@ -0,0 +1,1057 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE (8 * 4) + +#if !defined(HAVE_SSE2) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 24 + STACK + ARGS(%esi) +#define STACK_A 32 + STACK + ARGS(%esi) +#define STACK_B 36 + STACK + ARGS(%esi) +#define STACK_C 40 + STACK + ARGS(%esi) +#define STACK_LDC 44 + STACK + ARGS(%esi) +#define STACK_OFFT 48 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx +#define CO1 %esi + +#define ADD1 addpd +#define ADD2 addpd + +#ifndef CONJ +#define NN +#else +#if defined(LN) || defined(LT) +#define CN +#else +#define NC +#endif +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC + movd STACK_OFFT, %mm4 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 # Generate mask + pxor %xmm2, %xmm2 + + movsd %xmm2, 0 + POSINV + movsd %xmm7, 8 + POSINV + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK + movd %mm4, OFFSET + movd %mm4, KK + + sall $ZBASE_SHIFT, LDC + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + movl %eax, J # j = n + testl %eax, %eax + jle .L999 + ALIGN_2 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal 16 * SIZE + BUFFER, BB + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 2), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + ALIGN_2 + +.L02: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + movapd %xmm2, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) + + addl $ 8 * SIZE, B + subl $-16 * SIZE, BB + decl %eax + jne .L02 + ALIGN_2 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + ALIGN_2 + +.L04: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + + addl $ 2 * SIZE, B + addl $ 4 * SIZE, BB + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + + movl C, CO1 + +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + testl $1, %ebx + je .L50 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movapd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L52 + +.L51: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + ADD1 %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + ADD2 %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BB), %xmm0 + ADD1 %xmm1, %xmm6 + movapd 0 * SIZE(BB), %xmm1 + ADD2 %xmm0, %xmm7 + movapd -12 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -6 * SIZE(BB), %xmm0 + ADD1 %xmm3, %xmm4 + movapd -4 * SIZE(BB), %xmm3 + ADD2 %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -2 * SIZE(BB), %xmm0 + ADD1 %xmm3, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + ADD2 %xmm0, %xmm7 + movapd 0 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm1 + mulpd 2 * SIZE(BB), %xmm2 + ADD1 %xmm1, %xmm4 + movapd 4 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + movapd -6 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 6 * SIZE(BB), %xmm2 + ADD1 %xmm1, %xmm6 + movapd 16 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + movapd -4 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 10 * SIZE(BB), %xmm2 + ADD1 %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm5 + movapd -2 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 14 * SIZE(BB), %xmm2 + ADD1 %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + movapd 8 * SIZE(AA), %xmm2 + + subl $-16 * SIZE, AA + addl $ 32 * SIZE, BB + decl %eax # l-- + jg .L51 + ALIGN_2 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # l = (k & 3) + jle .L54 + ALIGN_2 + +.L53: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + ADD1 %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + ADD2 %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax # l-- + jg .L53 + +.L54: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + sall $ZBASE_SHIFT, %eax + addl %eax, AA + addl %eax, B + leal (BB, %eax, 2), BB +#endif + + movapd POSINV, %xmm1 + + SHUFPD_1 %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm1, %xmm5 +#else + xorpd %xmm1, %xmm4 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 +#else + addpd %xmm5, %xmm4 +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm5 + + subpd %xmm4, %xmm5 +#else + movapd -16 * SIZE(AA), %xmm5 + + subpd %xmm4, %xmm5 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#ifdef LN + movddup -16 * SIZE(AA), %xmm2 + movddup -15 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm2 + movddup -15 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm2 + movddup -15 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm2 + movddup -15 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movsd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm5, -16 * SIZE(B) + + movddup %xmm5, %xmm4 + unpckhpd %xmm5, %xmm5 + + movapd %xmm4, -16 * SIZE(BB) + movapd %xmm5, -14 * SIZE(BB) +#else + movapd %xmm5, -16 * SIZE(AA) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L50: + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L99 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm3 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchnta -4 * SIZE(CO1) +#else + prefetchnta 4 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + ADD2 %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + ADD1 %xmm2, %xmm6 + ADD2 %xmm1, %xmm7 + + movapd -12 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm4 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + ADD2 %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 0 * SIZE(AA), %xmm0 + ADD1 %xmm2, %xmm6 + ADD2 %xmm1, %xmm7 + + movapd -8 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + ADD1 %xmm1, %xmm4 + movapd -6 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + ADD2 %xmm3, %xmm5 + movapd -6 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + movapd -4 * SIZE(AA), %xmm3 + ADD1 %xmm2, %xmm6 + ADD2 %xmm1, %xmm7 + + movapd -4 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + ADD1 %xmm1, %xmm4 + movapd -2 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + ADD2 %xmm3, %xmm5 + movapd -2 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + movapd 8 * SIZE(AA), %xmm3 + ADD1 %xmm2, %xmm6 + ADD2 %xmm1, %xmm7 + movapd 0 * SIZE(BB), %xmm1 + + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm4 + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + ADD2 %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 4 * SIZE(AA), %xmm0 + ADD1 %xmm2, %xmm6 + ADD2 %xmm1, %xmm7 + + movapd 4 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm4 + movapd 6 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + ADD2 %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 16 * SIZE(AA), %xmm0 + ADD1 %xmm2, %xmm6 + ADD2 %xmm1, %xmm7 + + movapd 8 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + ADD1 %xmm1, %xmm4 + movapd 10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + ADD2 %xmm3, %xmm5 + movapd 10 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + ADD1 %xmm2, %xmm6 + movapd 12 * SIZE(AA), %xmm3 + ADD2 %xmm1, %xmm7 + + movapd 12 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + ADD1 %xmm1, %xmm4 + movapd 14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + ADD2 %xmm3, %xmm5 + movapd 14 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + subl $-32 * SIZE, BB + movapd 24 * SIZE(AA), %xmm3 + subl $-32 * SIZE, AA + ADD1 %xmm2, %xmm6 + ADD2 %xmm1, %xmm7 + movapd -16 * SIZE(BB), %xmm1 + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L16: + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm3 + mulpd %xmm0, %xmm1 + movapd -14 * SIZE(AA), %xmm0 + ADD2 %xmm1, %xmm5 + movapd -12 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + ADD1 %xmm2, %xmm6 + mulpd %xmm0, %xmm3 + movapd -12 * SIZE(AA), %xmm0 + ADD2 %xmm3, %xmm7 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + addl %eax, B + leal (BB, %eax, 2), BB +#endif + + movapd POSINV, %xmm1 + + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm1, %xmm5 + xorpd %xmm1, %xmm7 +#else + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 +#else + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm5 + movapd -14 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd -16 * SIZE(AA), %xmm5 + movapd -14 * SIZE(AA), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#ifdef LN + movddup -10 * SIZE(AA), %xmm2 + movddup -9 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 + + movddup -12 * SIZE(AA), %xmm2 + movddup -11 * SIZE(AA), %xmm3 + + movapd %xmm7, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm5 + + movddup -16 * SIZE(AA), %xmm2 + movddup -15 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm2 + movddup -15 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 + + movddup -14 * SIZE(AA), %xmm2 + movddup -13 * SIZE(AA), %xmm3 + + movapd %xmm5, %xmm4 + pshufd $0x4e, %xmm5, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm7 + subpd %xmm6, %xmm7 + + movddup -10 * SIZE(AA), %xmm2 + movddup -9 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm2 + movddup -15 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm2 + movddup -15 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + + movsd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + movsd %xmm7, 2 * SIZE(CO1) + movhpd %xmm7, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm5, -16 * SIZE(B) + movapd %xmm7, -14 * SIZE(B) + + movddup %xmm5, %xmm4 + unpckhpd %xmm5, %xmm5 + movddup %xmm7, %xmm6 + unpckhpd %xmm7, %xmm7 + + movapd %xmm4, -16 * SIZE(BB) + movapd %xmm5, -14 * SIZE(BB) + movapd %xmm6, -12 * SIZE(BB) + movapd %xmm7, -10 * SIZE(BB) +#else + movapd %xmm5, -16 * SIZE(AA) + movapd %xmm7, -14 * SIZE(AA) + +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + + +.L99: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + + decl J # j -- + jg .L01 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LN_2x1_sse2.S b/kernel/x86/ztrsm_kernel_LN_2x1_sse2.S new file mode 100644 index 0000000..7aef336 --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LN_2x1_sse2.S @@ -0,0 +1,1163 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE (8 * 4) + +#if !defined(HAVE_SSE2) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 24 + STACK + ARGS(%esi) +#define STACK_A 32 + STACK + ARGS(%esi) +#define STACK_B 36 + STACK + ARGS(%esi) +#define STACK_C 40 + STACK + ARGS(%esi) +#define STACK_LDC 44 + STACK + ARGS(%esi) +#define STACK_OFFT 48 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx +#define CO1 %esi + +#define KERNEL1(address) \ + movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL4(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL5(address) \ + movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 20 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL6(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 32 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL7(address) \ + movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 28 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 40 * SIZE + (address) * SIZE(AA), %xmm1 + +#ifndef CONJ +#define NN +#else +#if defined(LN) || defined(LT) +#define CN +#else +#define NC +#endif +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC + movd STACK_OFFT, %mm4 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 # Generate mask + pxor %xmm2, %xmm2 + + movsd %xmm2, 0 + POSINV + movsd %xmm7, 8 + POSINV + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK + movd %mm4, OFFSET + movd %mm4, KK + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + movl %eax, J # j = n + testl %eax, %eax + jle .L999 + ALIGN_2 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 2), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + ALIGN_2 + +.L02: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(BB) + movapd %xmm1, 2 * SIZE(BB) + movapd %xmm2, 4 * SIZE(BB) + movapd %xmm3, 6 * SIZE(BB) + movapd %xmm4, 8 * SIZE(BB) + movapd %xmm5, 10 * SIZE(BB) + movapd %xmm6, 12 * SIZE(BB) + movapd %xmm7, 14 * SIZE(BB) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L02 + ALIGN_2 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + ALIGN_2 + +.L04: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + + movapd %xmm0, 0 * SIZE(BB) + movapd %xmm1, 2 * SIZE(BB) + + addl $ 2 * SIZE, B + addl $ 4 * SIZE, BB + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + + movl C, CO1 + +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + testl $1, %ebx + je .L50 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, %ecx + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax # l = (k >> 2) + jle .L52 + +.L51: + mulpd %xmm0, %xmm1 + movapd 2 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm4 + movapd 16 * SIZE(BB), %xmm1 + + mulpd %xmm0, %xmm3 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm5 + movapd 4 * SIZE(BB), %xmm3 + + mulpd %xmm0, %xmm3 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + + addpd %xmm0, %xmm5 + movapd 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 10 * SIZE(BB), %xmm0 + + addpd %xmm2, %xmm4 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + + movapd 12 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 24 * SIZE(BB), %xmm2 + + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm5 + movapd 8 * SIZE(AA), %xmm0 + + addl $ 8 * SIZE, AA # aoffset += 2 + addl $16 * SIZE, BB # boffset1 += 4 + + decl %eax # l-- + jg .L51 + ALIGN_2 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # l = (k & 3) + jle .L54 + ALIGN_2 + +.L53: + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA # aoffset += 2 + addl $4 * SIZE, BB # boffset1 += 4 + decl %eax # l-- + jg .L53 + +.L54: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + addl %eax, AA + addl %eax, B + leal (BB, %eax, 2), BB +#endif + + movapd POSINV, %xmm1 + + SHUFPD_1 %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm1, %xmm5 +#else + xorpd %xmm1, %xmm4 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 +#else + addpd %xmm5, %xmm4 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm5 + + subpd %xmm4, %xmm5 +#else + movapd 0 * SIZE(AA), %xmm5 + + subpd %xmm4, %xmm5 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#ifdef LN + movsd 0 * SIZE(AA), %xmm2 + movhpd 0 * SIZE(AA), %xmm2 + movsd 1 * SIZE(AA), %xmm3 + movhpd 1 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm2 + movhpd 0 * SIZE(AA), %xmm2 + movsd 1 * SIZE(AA), %xmm3 + movhpd 1 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RN + movsd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movsd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RT + movsd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movsd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movsd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm5, 0 * SIZE(B) + + movsd %xmm5, 0 * SIZE(BB) + movsd %xmm5, 1 * SIZE(BB) + movhpd %xmm5, 2 * SIZE(BB) + movhpd %xmm5, 3 * SIZE(BB) +#else + movapd %xmm5, 0 * SIZE(AA) + +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L50: + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L99 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchnta -4 * SIZE(CO1) +#else + prefetchnta 4 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $-8, %eax + NOBRANCH + je .L12 + sall $3, %eax + +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $64 * 1, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $64 * 2, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $64 * 3, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $64 * 4, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $64 * 5, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $64 * 6, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $64 * 7, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $64 * 4 * SIZE, AA + addl $64 * 4 * SIZE, BB + subl $64 * 8, %eax + BRANCH + jg .L1X + +.L11: + leal (BB, %eax, 4), BB + leal (AA, %eax, 4), AA + +.L12: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 0 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm1 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm1 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm7 + + addl $4 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L13 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + addl %eax, B + leal (BB, %eax, 2), BB +#endif + + movapd POSINV, %xmm1 + + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm1, %xmm5 + xorpd %xmm1, %xmm7 +#else + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 +#else + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm5 + movapd 2 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm5 + movapd 2 * SIZE(AA), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#ifdef LN + movsd 6 * SIZE(AA), %xmm2 + movhpd 6 * SIZE(AA), %xmm2 + movsd 7 * SIZE(AA), %xmm3 + movhpd 7 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 + + movsd 4 * SIZE(AA), %xmm2 + movhpd 4 * SIZE(AA), %xmm2 + movsd 5 * SIZE(AA), %xmm3 + movhpd 5 * SIZE(AA), %xmm3 + + movapd %xmm7, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm5 + + movsd 0 * SIZE(AA), %xmm2 + movhpd 0 * SIZE(AA), %xmm2 + movsd 1 * SIZE(AA), %xmm3 + movhpd 1 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm2 + movhpd 0 * SIZE(AA), %xmm2 + movsd 1 * SIZE(AA), %xmm3 + movhpd 1 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 + + movsd 2 * SIZE(AA), %xmm2 + movhpd 2 * SIZE(AA), %xmm2 + movsd 3 * SIZE(AA), %xmm3 + movhpd 3 * SIZE(AA), %xmm3 + + movapd %xmm5, %xmm4 + pshufd $0x4e, %xmm5, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm7 + subpd %xmm6, %xmm7 + + movsd 6 * SIZE(AA), %xmm2 + movhpd 6 * SIZE(AA), %xmm2 + movsd 7 * SIZE(AA), %xmm3 + movhpd 7 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 +#endif + +#ifdef RN + movsd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movsd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef RT + movsd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movsd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + + movsd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + movsd %xmm7, 2 * SIZE(CO1) + movhpd %xmm7, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm5, 0 * SIZE(B) + movapd %xmm7, 2 * SIZE(B) + + movsd %xmm5, 0 * SIZE(BB) + movsd %xmm5, 1 * SIZE(BB) + movhpd %xmm5, 2 * SIZE(BB) + movhpd %xmm5, 3 * SIZE(BB) + movsd %xmm7, 4 * SIZE(BB) + movsd %xmm7, 5 * SIZE(BB) + movhpd %xmm7, 6 * SIZE(BB) + movhpd %xmm7, 7 * SIZE(BB) +#else + movapd %xmm5, 0 * SIZE(AA) + movapd %xmm7, 2 * SIZE(AA) + +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + +.L99: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + + decl J # j -- + jg .L01 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S new file mode 100644 index 0000000..e5949aa --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S @@ -0,0 +1,1966 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef ATOM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 84 +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (16 * 2) +#endif + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx +#define CO1 %esi + +#define ADD1 addps +#define ADD2 addps + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + movl M, %ebx + testl %ebx, %ebx + jle .L999 + + subl $-32 * SIZE, A + subl $-32 * SIZE, B + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + movl %eax, J + sarl $1, J + jle .L100 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + andl $1, %ebx + jle .L30 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L42 + ALIGN_4 + +.L41: + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -26 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -22 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -18 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps 0 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -16 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + subl $-32 * SIZE, BB + decl %eax + jne .L41 + ALIGN_4 + +.L42: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L44 + ALIGN_4 + +.L43: + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L43 + ALIGN_4 + +.L44: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + + addps %xmm2, %xmm6 + addps %xmm3, %xmm7 + + pshufd $0xb1, %xmm5, %xmm5 + pcmpeqb %xmm0, %xmm0 + pshufd $0xb1, %xmm7, %xmm7 + psllq $63, %xmm0 + +#ifndef CONJ + shufps $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#else +#if defined(LN) || defined(LT) + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 +#else + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#endif +#endif + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(LT) + unpcklpd %xmm6, %xmm4 + + movaps -32 * SIZE(BB), %xmm2 + + subps %xmm4, %xmm2 +#else + movsd -32 * SIZE(AA), %xmm1 + movsd -30 * SIZE(AA), %xmm5 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm5 +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm5 + subps %xmm2, %xmm5 + + movaps -28 * SIZE(BB), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef RT + movaps -28 * SIZE(BB), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm1 + subps %xmm2, %xmm1 + + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, -32 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO1, LDC) +#else + movlps %xmm1, -32 * SIZE(AA) + movlps %xmm5, -30 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm5, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L30: + movl M, %ebx + sarl $1, %ebx + jle .L99 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + +#ifdef LN + pxor %xmm4, %xmm4 + prefetcht0 -4 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 -4 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 +#else + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 3 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L11: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + subl $-32 * SIZE, BB + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + subl $-32 * SIZE, AA + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -32 * SIZE(AA), %xmm0 + + decl %eax + jne .L11 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + + ADD2 %xmm2, %xmm7 + pcmpeqb %xmm0, %xmm0 + ADD1 %xmm3, %xmm6 + psllq $63, %xmm0 + +#ifndef CONJ + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 + + shufps $0xb1, %xmm0, %xmm0 +#else +#if defined(LN) || defined(LT) + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#else + pshufd $0xb1, %xmm0, %xmm1 + + pxor %xmm1, %xmm5 + pxor %xmm1, %xmm7 +#endif +#endif + + haddps %xmm5, %xmm4 + haddps %xmm7, %xmm6 + + shufps $0xd8, %xmm4, %xmm4 + shufps $0xd8, %xmm6, %xmm6 + + movaps %xmm4, %xmm5 + shufps $0xe4, %xmm6, %xmm4 + shufps $0xe4, %xmm5, %xmm6 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + + movaps -32 * SIZE(BB), %xmm2 + movaps -28 * SIZE(BB), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm5, %xmm3 +#else + movaps -32 * SIZE(AA), %xmm1 + movaps -28 * SIZE(AA), %xmm5 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm5 +#endif + +#ifdef LN + movaps -28 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm2 + subps %xmm1, %xmm2 + + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm3 + subps %xmm1, %xmm3 + + movaps -28 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm5 + subps %xmm2, %xmm5 + + movaps -28 * SIZE(BB), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef RT + movaps -28 * SIZE(BB), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm1 + subps %xmm2, %xmm1 + + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, -32 * SIZE(BB) + movaps %xmm3, -28 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm3, 2 * SIZE(CO1, LDC) +#else + movaps %xmm1, -32 * SIZE(AA) + movaps %xmm5, -28 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + + movlps %xmm5, 0 * SIZE(CO1, LDC) + movhps %xmm5, 2 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx + jg .L10 + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L100: + movl N, %eax + andl $1, %eax + jle .L999 + +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + andl $1, %ebx + jle .L130 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L142 + ALIGN_4 + +.L141: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -26 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -26 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -22 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -22 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -18 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -18 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -16 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + decl %eax + jne .L141 + ALIGN_4 + +.L142: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L144 + ALIGN_4 + +.L143: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L143 + ALIGN_4 + +.L144: +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), BB +#endif + + addps %xmm2, %xmm4 + addps %xmm3, %xmm5 + + pshufd $0xb1, %xmm5, %xmm5 + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#ifndef CONJ + shufps $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 +#else +#if defined(LN) || defined(LT) + pxor %xmm0, %xmm4 +#else + pxor %xmm0, %xmm5 +#endif +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BB), %xmm2 + + subps %xmm4, %xmm2 +#else + movsd -32 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, -32 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) +#else + movlps %xmm1, -32 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L130: + movl M, %ebx + sarl $1, %ebx + jle .L149 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + movhps -30 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 +#ifdef LN + prefetcht0 -4 * SIZE(CO1) +#else + prefetcht0 3 * SIZE(CO1) +#endif + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + ALIGN_4 + +.L111: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps 0 * SIZE(AA), %xmm0 + + subl $-32 * SIZE, AA + subl $-16 * SIZE, BB + + decl %eax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + ALIGN_4 + +.L113: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L113 + ALIGN_4 + +.L114: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + + addps %xmm2, %xmm4 + addps %xmm3, %xmm5 + + pshufd $0xb1, %xmm5, %xmm5 + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#ifndef CONJ + shufps $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 +#else +#if defined(LN) || defined(LT) + pxor %xmm0, %xmm4 +#else + pxor %xmm0, %xmm5 +#endif +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + + movsd -32 * SIZE(BB), %xmm2 + movsd -30 * SIZE(BB), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm5, %xmm3 +#else + movaps -32 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm1 +#endif + +#ifdef LN + movaps -28 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm2 + subps %xmm1, %xmm2 + + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm3 + subps %xmm1, %xmm3 + + movaps -28 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, -32 * SIZE(BB) + movlps %xmm3, -30 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) +#else + movaps %xmm1, -32 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L110 + ALIGN_4 + +.L149: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_sse.S b/kernel/x86/ztrsm_kernel_LN_2x2_sse.S new file mode 100644 index 0000000..f77a06d --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LN_2x2_sse.S @@ -0,0 +1,2201 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 48(%esp) +#define KK 52(%esp) +#define KKK 56(%esp) +#define AORIG 60(%esp) +#define BORIG 64(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx +#define CO1 %esi + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCHSIZE (16 * 10 + 8) +#define WPREFETCHSIZE 112 +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#endif + +#if defined(PENTIUM4) || defined(PENTIUMM) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 168 +#define PREFETCHW prefetcht0 +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 168 +#define PREFETCHW prefetcht0 +#endif + +#if defined(OPTERON) || !defined(HAVE_SSE2) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addps %xmm2, %xmm4; \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movl STACK_M, %ebx + movl STACK_N, %eax + movl STACK_K, %ecx + movl STACK_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl STACK_B, %edi + movl STACK_C, %ebx + movss STACK_OFFT, %xmm4 + + xorps %xmm7, %xmm7 + pcmpeqb %xmm7, %xmm7 + pslld $31, %xmm7 + xorps %xmm2, %xmm2 + +#ifndef CONJ + movss %xmm7, 0 + POSINV + movss %xmm2, 4 + POSINV + movss %xmm7, 8 + POSINV + movss %xmm2, 12 + POSINV +#else + movss %xmm2, 0 + POSINV + movss %xmm7, 4 + POSINV + movss %xmm2, 8 + POSINV + movss %xmm7, 12 + POSINV +#endif + + EMMS + + movl %ebx, C + movl STACK_LDC, LDC + + movss %xmm4, OFFSET + movss %xmm4, KK + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + movl %eax, J + sarl $1, J + jle .L100 + ALIGN_4 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 4), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1, %eax + jle .L03 + ALIGN_4 + +.L02: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + + decl %eax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1, %eax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + addl $ 4 * SIZE, B + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + + movl C, CO1 + +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + andl $1, %ebx + jle .L30 + ALIGN_4 + +.L40: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB # boffset1 = boffset + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $3 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L42 + ALIGN_4 + +.L41: + mulps %xmm0, %xmm2 + prefetcht1 (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 2 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movsd 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 44 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 6 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + mulps 60 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movsd 16 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + addps %xmm2, %xmm4 + movaps 68 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 72 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + mulps 76 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 96 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movsd 10 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 84 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 88 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + mulps 92 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 112 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movsd 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 100 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 104 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + mulps 108 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 128 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movsd 14 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 116 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 120 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + mulps 124 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 144 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movsd 24 * SIZE(AA), %xmm1 + addl $ 16 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L41 + ALIGN_4 + +.L42: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L44 + ALIGN_4 + +.L43: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 2 * SIZE(AA), %xmm0 + + addl $ 2 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L43 + ALIGN_4 + +.L44: + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#else + xorps %xmm0, %xmm4 + xorps %xmm0, %xmm6 +#endif +#else + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#endif + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm6, %xmm4 + + movaps 0 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 +#else +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AA), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(AA), %xmm5 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm5 +#endif + +#if defined(LN) || defined(LT) + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm5 + subps %xmm2, %xmm5 + + movaps 4 * SIZE(B), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm1 + subps %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm5, 12 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO1, LDC) +#else + movlps %xmm1, 0 * SIZE(AA) + movlps %xmm5, 2 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm5, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L30: + movl M, %ebx + sarl $1, %ebx + jle .L99 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB # boffset1 = boffset + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $3 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + + PREFETCHW -4 * SIZE(CO1) + PREFETCHW -4 * SIZE(CO1, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L11: + KERNEL1(0 * 16) + KERNEL2(0 * 16) + KERNEL3(0 * 16) + KERNEL4(0 * 16) + KERNEL5(0 * 16) + KERNEL6(0 * 16) + KERNEL7(0 * 16) + KERNEL8(0 * 16) + + addl $ 32 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L11 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#else + xorps %xmm0, %xmm4 + xorps %xmm0, %xmm6 +#endif +#else + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#endif + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + + movaps 0 * SIZE(B), %xmm2 + movaps 4 * SIZE(B), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm5, %xmm3 +#else + movaps 0 * SIZE(AA), %xmm1 + movaps 4 * SIZE(AA), %xmm5 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm5 +#endif + +#ifdef LN + movaps 4 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm5 + subps %xmm2, %xmm5 + + movaps 4 * SIZE(B), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm1 + subps %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm5, 12 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm5 + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm5, 28 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm3, 2 * SIZE(CO1, LDC) +#else + movaps %xmm1, 0 * SIZE(AA) + movaps %xmm5, 4 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + + movlps %xmm5, 0 * SIZE(CO1, LDC) + movhps %xmm5, 2 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx + jg .L10 + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L100: + movl N, %eax + andl $1, %eax + jle .L999 + ALIGN_4 + +.L101: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 4), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L103 + ALIGN_4 + +.L102: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L103: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L105 + ALIGN_4 + +.L104: +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + addl $ 2 * SIZE, %edi + addl $ 8 * SIZE, %ecx + decl %eax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + + movl C, CO1 + +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + andl $1, %ebx + jle .L130 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB # boffset1 = boffset + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L142 + ALIGN_4 + +.L141: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $ 16 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L141 + ALIGN_4 + +.L142: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L144 + ALIGN_4 + +.L143: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movsd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L143 + ALIGN_4 + +.L144: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm4 +#endif +#else + xorps %xmm0, %xmm5 +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + addl %eax, AA + addl %eax, B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 +#else +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) +#else + movlps %xmm1, 0 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L130: + movl M, %ebx + sarl $1, %ebx + jle .L149 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB # boffset1 = boffset + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + PREFETCHW -4 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + ALIGN_4 + +.L111: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movaps 12 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 28 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 48 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $ 32 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + ALIGN_4 + +.L113: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jg .L113 + ALIGN_4 + +.L114: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm4 +#endif +#else + xorps %xmm0, %xmm5 +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm5, %xmm3 +#else + movaps 0 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm1 +#endif + +#ifdef LN + movaps 4 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 8 * SIZE(BB) + movaps %xmm1, 12 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) +#else + movaps %xmm1, 0 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L110 + ALIGN_4 + +.L149: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + EMMS + + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LN_4x1_sse.S b/kernel/x86/ztrsm_kernel_LN_4x1_sse.S new file mode 100644 index 0000000..877a3ba --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LN_4x1_sse.S @@ -0,0 +1,1893 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 48(%esp) +#define KK 52(%esp) +#define KKK 56(%esp) +#define AORIG 60(%esp) +#define BORIG 64(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx +#define CO1 %esi + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#if !defined(HAVE_SSE2) || defined(OPTERON) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movl STACK_M, %ebx + movl STACK_N, %eax + movl STACK_K, %ecx + movl STACK_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl STACK_B, %edi + movl STACK_C, %ebx + movss STACK_OFFT, %xmm4 + +#ifndef CONJ + movl $0x80000000, 0 + POSINV + movl $0x00000000, 4 + POSINV + movl $0x80000000, 8 + POSINV + movl $0x00000000, 12 + POSINV +#else + movl $0x00000000, 0 + POSINV + movl $0x80000000, 4 + POSINV + movl $0x00000000, 8 + POSINV + movl $0x80000000, 12 + POSINV +#endif + + movl %ebx, C + movl STACK_LDC, LDC + + movss %xmm4, OFFSET + movss %xmm4, KK + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + movl %eax, J # j = n + testl %eax, %eax + jle .L999 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 4), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + +.L02: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + movss 4 * SIZE(B), %xmm0 + movss 5 * SIZE(B), %xmm1 + movss 6 * SIZE(B), %xmm2 + movss 7 * SIZE(B), %xmm3 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm2, 24 * SIZE(BB) + movaps %xmm3, 28 * SIZE(BB) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + decl %eax + jne .L02 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + +.L04: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + addl $2 * SIZE, B + addl $8 * SIZE, BB + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + testl $1, %ebx + jle .L50 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L72 + ALIGN_4 + +.L71: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 16 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 12 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 20 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 40 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 48 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 44 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 52 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 72 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L71 + ALIGN_2 + +.L72: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + je .L74 + +.L73: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA # aoffset += 8 + addl $8 * SIZE, BB # boffset1 += 8 + decl %eax + jg .L73 + +.L74: + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm4 +#endif +#else + xorps %xmm0, %xmm5 +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef movsd + xorps %xmm5, %xmm5 +#endif +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm5 +#else + movsd 0 * SIZE(AA), %xmm5 +#endif + + subps %xmm4, %xmm5 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AA), %xmm1 +#else + movsd 0 * SIZE(B), %xmm1 +#endif + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps POSINV, %xmm5 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm5 + + addps %xmm3, %xmm5 + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm5, 0 * SIZE(B) + + movaps %xmm5, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm5, %xmm1 + shufps $0x55, %xmm1, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) +#else + movlps %xmm5, 0 * SIZE(AA) +#endif + + movlps %xmm5, 0 * SIZE(CO1) + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L50: + movl M, %ebx + testl $2, %ebx + jle .L70 + +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L52 + ALIGN_4 + +.L51: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 16 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 20 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 48 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 44 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 72 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L51 + ALIGN_4 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA # aoffset += 8 + addl $8 * SIZE, BB # boffset1 += 8 + decl %eax + jg .L53 + +.L54: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm4 +#endif +#else + xorps %xmm0, %xmm5 +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm5 + movhps 2 * SIZE(B), %xmm5 +#else + movaps 0 * SIZE(AA), %xmm5 +#endif + + subps %xmm4, %xmm5 + +#if defined(LN) || defined(LT) + movhlps %xmm5, %xmm4 +#endif + +#ifdef LN +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 6 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm4, %xmm4 + +#ifndef CONJ + xorps POSINV, %xmm4 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + + addps %xmm3, %xmm4 + + movsd 4 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm4, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm5 + subps %xmm3, %xmm5 + + + movsd 0 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps POSINV, %xmm5 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef LT +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps POSINV, %xmm5 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm5 + + addps %xmm3, %xmm5 + + movsd 2 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm5, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm4 + subps %xmm3, %xmm4 + + movsd 6 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm4, %xmm4 + +#ifndef CONJ + xorps POSINV, %xmm4 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + + addps %xmm3, %xmm4 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(B), %xmm1 + movhps 2 * SIZE(B), %xmm1 + + movaps %xmm1, %xmm2 + shufps $0x44, %xmm2, %xmm2 + movaps %xmm1, %xmm3 + shufps $0x11, %xmm2, %xmm3 + + movaps %xmm5, %xmm4 + shufps $0xa0, %xmm4, %xmm4 + shufps $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm2, %xmm4 + mulps %xmm3, %xmm5 + + addps %xmm4, %xmm5 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlhps %xmm4, %xmm5 + + movsd %xmm5, 0 * SIZE(B) + movhps %xmm5, 2 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + pshufd $0xaa, %xmm5, %xmm2 + pshufd $0xff, %xmm5, %xmm3 +#else + movaps %xmm5, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm5, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm5, %xmm2 + shufps $0xaa, %xmm2, %xmm2 + movaps %xmm5, %xmm3 + shufps $0xff, %xmm3, %xmm3 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) +#else + movaps %xmm5, 0 * SIZE(AA) +#endif + + movsd %xmm5, 0 * SIZE(CO1) + movhps %xmm5, 2 * SIZE(CO1) + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L70: + movl M, %ebx + sarl $2, %ebx + jle .L99 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $2 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + prefetcht0 8 * SIZE(CO1) + je .L12 + ALIGN_4 + +#define PREFETCHSIZE 48 + +.L11: +#ifdef CORE_KATMAI + prefetcht0 PREFETCHSIZE * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 16 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 8 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 16 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 24) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 32) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 36 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 48 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 48 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 40) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 40 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 44 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 56 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 48) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 48 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 52 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 64 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 56) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 60 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 72 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 72 * SIZE(AA), %xmm1 + + addl $64 * SIZE, BB + addl $64 * SIZE, AA + decl %eax + jne .L11 + +.L12: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 8 * SIZE(AA), %xmm0 + + addl $8 * SIZE, AA # aoffset += 8 + addl $8 * SIZE, BB # boffset1 += 8 + + decl %eax + jg .L13 + +.L14: + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#else + xorps %xmm0, %xmm4 + xorps %xmm0, %xmm6 +#endif +#else + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#endif + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm5 + movhps 2 * SIZE(B), %xmm5 + movsd 4 * SIZE(B), %xmm7 + movhps 6 * SIZE(B), %xmm7 +#else + movaps 0 * SIZE(AA), %xmm5 + movaps 4 * SIZE(AA), %xmm7 +#endif + + subps %xmm4, %xmm5 + subps %xmm6, %xmm7 + +#if defined(LN) || defined(LT) + movhlps %xmm5, %xmm4 + movhlps %xmm7, %xmm6 +#endif + +#ifdef LN +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 30 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm6, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps POSINV, %xmm6 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm6 + + addps %xmm3, %xmm6 + + movsd 28 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm6, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm6, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm7 + subps %xmm3, %xmm7 + + movsd 26 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm6, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm6, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm4 + subps %xmm3, %xmm4 + + movsd 24 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm6, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm6, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm5 + subps %xmm3, %xmm5 + + movsd 20 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm7, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps POSINV, %xmm7 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm7 + + addps %xmm3, %xmm7 + + movsd 18 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm7, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm7, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm4 + subps %xmm3, %xmm4 + + movsd 16 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm7, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm7, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm5 + subps %xmm3, %xmm5 + + movsd 10 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm4, %xmm4 + +#ifndef CONJ + xorps POSINV, %xmm4 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + + addps %xmm3, %xmm4 + + movsd 8 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm4, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm5 + subps %xmm3, %xmm5 + + movsd 0 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps POSINV, %xmm5 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps POSINV, %xmm5 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm5 + + addps %xmm3, %xmm5 + + movsd 2 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm5, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm4 + subps %xmm3, %xmm4 + + movsd 4 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm5, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm7 + subps %xmm3, %xmm7 + + movsd 6 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm5, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm6 + subps %xmm3, %xmm6 + + movsd 10 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm4, %xmm4 + +#ifndef CONJ + xorps POSINV, %xmm4 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + + addps %xmm3, %xmm4 + + movsd 12 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm4, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm7 + subps %xmm3, %xmm7 + + movsd 14 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm4, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm6 + subps %xmm3, %xmm6 + + movsd 20 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm7, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps POSINV, %xmm7 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm7 + + addps %xmm3, %xmm7 + + movsd 22 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm7, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm7, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm6 + subps %xmm3, %xmm6 + + movsd 30 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm6, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps POSINV, %xmm6 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm6 + + addps %xmm3, %xmm6 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(B), %xmm1 + movhps 2 * SIZE(B), %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0x44, %xmm1, %xmm2 + pshufd $0x11, %xmm1, %xmm3 + + pshufd $0xa0, %xmm5, %xmm4 + pshufd $0xf5, %xmm5, %xmm5 + pshufd $0xa0, %xmm7, %xmm6 + pshufd $0xf5, %xmm7, %xmm7 +#else + movaps %xmm1, %xmm2 + shufps $0x44, %xmm2, %xmm2 + movaps %xmm1, %xmm3 + shufps $0x11, %xmm3, %xmm3 + + movaps %xmm5, %xmm4 + shufps $0xa0, %xmm4, %xmm4 + shufps $0xf5, %xmm5, %xmm5 + movaps %xmm7, %xmm6 + shufps $0xa0, %xmm6, %xmm6 + shufps $0xf5, %xmm7, %xmm7 +#endif + +#ifndef CONJ + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#else + xorps %xmm0, %xmm4 + xorps %xmm0, %xmm6 +#endif + + mulps %xmm2, %xmm4 + mulps %xmm3, %xmm5 + mulps %xmm2, %xmm6 + mulps %xmm3, %xmm7 + + addps %xmm4, %xmm5 + addps %xmm6, %xmm7 +#endif + +#ifdef LN + subl $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlhps %xmm4, %xmm5 + movlhps %xmm6, %xmm7 + + movsd %xmm5, 0 * SIZE(B) + movhps %xmm5, 2 * SIZE(B) + movsd %xmm7, 4 * SIZE(B) + movhps %xmm7, 6 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + pshufd $0xaa, %xmm5, %xmm2 + pshufd $0xff, %xmm5, %xmm3 +#else + movaps %xmm5, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm5, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm5, %xmm2 + shufps $0xaa, %xmm2, %xmm2 + movaps %xmm5, %xmm3 + shufps $0xff, %xmm3, %xmm3 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm1 + pshufd $0xaa, %xmm7, %xmm2 + pshufd $0xff, %xmm7, %xmm3 +#else + movaps %xmm7, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm7, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm7, %xmm2 + shufps $0xaa, %xmm2, %xmm2 + movaps %xmm7, %xmm3 + shufps $0xff, %xmm3, %xmm3 +#endif + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm2, 24 * SIZE(BB) + movaps %xmm3, 28 * SIZE(BB) +#else + movaps %xmm5, 0 * SIZE(AA) + movaps %xmm7, 4 * SIZE(AA) +#endif + + movlps %xmm5, 0 * SIZE(CO1) + movhps %xmm5, 2 * SIZE(CO1) + movlps %xmm7, 4 * SIZE(CO1) + movhps %xmm7, 6 * SIZE(CO1) + +#ifndef LN + addl $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L99: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LT_1x1.S b/kernel/x86/ztrsm_kernel_LT_1x1.S new file mode 100644 index 0000000..5b13a54 --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LT_1x1.S @@ -0,0 +1,493 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define KK 0 + STACK(%esp) +#define KKK 4 + STACK(%esp) +#define AORIG 8 + STACK(%esp) + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_N 8 + STACK + ARGS(%esp) +#define STACK_K 12 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define STACK_A 32 + STACK + ARGS(%esp) +#define STACK_B 36 + STACK + ARGS(%esp) +#define STACK_C 40 + STACK + ARGS(%esp) +#define STACK_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) +#else +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 20 + STACK + ARGS(%esp) +#define STACK_A 24 + STACK + ARGS(%esp) +#define STACK_B 28 + STACK + ARGS(%esp) +#define STACK_C 32 + STACK + ARGS(%esp) +#define STACK_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) +#endif + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#define M %esi +#define K %edi + +#define A %ebx +#define B %ecx +#define C %edx +#define LDC %ebp + + movl STACK_K, K + movl STACK_LDC, LDC + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl STACK_M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, STACK_C + imull K, %eax + addl %eax, STACK_A +#endif + +#ifdef RT + movl STACK_N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, STACK_B + + movl STACK_N, %eax + imull LDC, %eax + addl %eax, STACK_C +#endif + +#ifdef RN + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + +#ifdef RT + movl STACK_N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + cmpl $0, STACK_N + jle .L29 + cmpl $0, STACK_M + jle .L29 + ALIGN_4 + +.L30: +#if defined(LT) || defined(RN) + movl STACK_A, A +#else + movl STACK_A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, STACK_B +#endif + +#ifdef RT + subl LDC, STACK_C +#endif + movl STACK_C, C +#ifndef RT + addl LDC, STACK_C +#endif + + movl STACK_M, M + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + ALIGN_4 + +.L34: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + movl AORIG, A + movl STACK_B, B + addl %eax, A + addl %eax, B +#else + movl STACK_B, B +#endif + + fldz + fldz + fldz + fldz + + FLD 4 * SIZE(B) # B5 + FLD 4 * SIZE(A) # A5 + FLD 0 * SIZE(B) # B0 + FLD 0 * SIZE(A) # A0 + +#ifdef HAVE_SSE + prefetcht2 2 * SIZE(C) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L37 + ALIGN_4 + +#define PREFETCH_OFFSET 40 + +.L38: +#ifdef HAVE_SSE + prefetchnta (PREFETCH_OFFSET) * SIZE(B) +#if (L2_SIZE == 524288) + prefetcht0 (PREFETCH_OFFSET) * SIZE(A) +#endif +#endif + fmul %st, %st(1) + FMUL 1 * SIZE(B) + fxch %st(1) + faddp %st, %st(5) + FLD 0 * SIZE(B) + fxch %st(1) + faddp %st, %st(4) + FLD 1 * SIZE(A) + fmul %st, %st(1) + FMUL 1 * SIZE(B) + fxch %st(1) + faddp %st, %st(7) + FLD 2 * SIZE(B) + fxch %st(1) + faddp %st, %st(6) + FLD 2 * SIZE(A) + + fmul %st, %st(1) + FMUL 3 * SIZE(B) + fxch %st(1) + faddp %st, %st(5) + FLD 2 * SIZE(B) + fxch %st(1) + faddp %st, %st(4) + FLD 3 * SIZE(A) + fmul %st, %st(1) + FMUL 3 * SIZE(B) + fxch %st(1) + faddp %st, %st(7) + FLD 8 * SIZE(B) + fxch %st(1) + faddp %st, %st(6) + FLD 8 * SIZE(A) + fxch %st(2) + +#ifdef HAVE_SSE +#ifdef DOUBLE + prefetchnta (PREFETCH_OFFSET + 4) * SIZE(B) +#if (L2_SIZE == 524288) + prefetcht0 (PREFETCH_OFFSET + 4) * SIZE(A) +#endif +#endif +#endif + + fmul %st, %st(3) + FMUL 5 * SIZE(B) + fxch %st(3) + faddp %st, %st(5) + FLD 4 * SIZE(B) + fxch %st(3) + faddp %st, %st(4) + FLD 5 * SIZE(A) + fmul %st, %st(3) + FMUL 5 * SIZE(B) + fxch %st(3) + faddp %st, %st(7) + FLD 6 * SIZE(B) + fxch %st(3) + faddp %st, %st(6) + FLD 6 * SIZE(A) + + fmul %st, %st(3) + FMUL 7 * SIZE(B) + fxch %st(3) + faddp %st, %st(5) + FLD 6 * SIZE(B) + fxch %st(3) + faddp %st, %st(4) + FLD 7 * SIZE(A) + fmul %st, %st(3) + FMUL 7 * SIZE(B) + fxch %st(3) + faddp %st, %st(7) + FLD 12 * SIZE(B) + fxch %st(3) + faddp %st, %st(6) + FLD 12 * SIZE(A) + fxch %st(2) + + subl $-8 * SIZE, B + subl $-8 * SIZE, A + decl %eax + jg .L38 + ALIGN_4 + +.L37: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + jle .L43 + ALIGN_2 + +.L54: + fmul %st, %st(1) + FMUL 1 * SIZE(B) + fxch %st(1) + faddp %st, %st(5) + + FLD 0 * SIZE(B) + fxch %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(A) + fmul %st, %st(1) + FMUL 1 * SIZE(B) + fxch %st(1) + faddp %st, %st(7) + FLD 2 * SIZE(B) + fxch %st(1) + faddp %st, %st(6) + FLD 2 * SIZE(A) + + addl $2 * SIZE, A + addl $2 * SIZE, B + decl %eax + jg .L54 + ALIGN_3 + +.L43: + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + +#if defined(LN) || defined(LT) +#ifndef CONJ + faddp %st, %st(3) # ctemp3 += ctemp4 + fsubp %st, %st(1) # ctemp1 += ctemp2 +#else + fsubp %st, %st(3) # ctemp1 += ctemp2 + faddp %st, %st(1) # ctemp3 += ctemp4 +#endif +#endif + +#if defined(RN) || defined(RT) +#ifndef CONJ + faddp %st, %st(3) # ctemp3 += ctemp4 + fsubp %st, %st(1) # ctemp1 += ctemp2 +#else + fsubrp %st, %st(3) # ctemp1 += ctemp2 + faddp %st, %st(1) # ctemp3 += ctemp4 +#endif +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + sall $ZBASE_SHIFT, %eax + + movl AORIG, A + movl STACK_B, B + addl %eax, A + addl %eax, B +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(B) + fsubp %st, %st(1) + FLD 1 * SIZE(B) + fsubp %st, %st(2) +#else + FLD 0 * SIZE(A) + fsubp %st, %st(1) + FLD 1 * SIZE(A) + fsubp %st, %st(2) +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(A) + fmul %st(1), %st + FLD 0 * SIZE(A) + fmul %st(3), %st + FLD 1 * SIZE(A) + fmulp %st, %st(3) + FLD 1 * SIZE(A) + fmulp %st, %st(4) +#endif + +#if defined(RN) || defined(RT) + FLD 0 * SIZE(B) + fmul %st(1), %st + FLD 0 * SIZE(B) + fmul %st(3), %st + FLD 1 * SIZE(B) + fmulp %st, %st(3) + FLD 1 * SIZE(B) + fmulp %st, %st(4) +#endif + +#ifndef CONJ + faddp %st, %st(2) + fsubp %st, %st(2) +#else + fsubp %st, %st(2) + faddp %st, %st(2) +#endif + +#ifdef LN + subl $2 * SIZE, C +#endif + +#if defined(LN) || defined(LT) + FSTU 1 * SIZE(B) + fxch %st(1) + FSTU 0 * SIZE(B) +#else + FSTU 1 * SIZE(A) + fxch %st(1) + FSTU 0 * SIZE(A) +#endif + FST 0 * SIZE(C) + FST 1 * SIZE(C) + +#ifndef LN + addl $2 * SIZE, C +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, A + addl %eax, B +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl M + jg .L34 + ALIGN_2 + +.L33: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, STACK_B +#endif +#if defined(LT) || defined(RN) + movl B, STACK_B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + + decl STACK_N + jg .L30 + ALIGN_2 + +.L29: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LT_1x1_atom.S b/kernel/x86/ztrsm_kernel_LT_1x1_atom.S new file mode 100644 index 0000000..bc0d03e --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LT_1x1_atom.S @@ -0,0 +1,453 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 8 + 3) + +#ifndef CONJ +#define ADDSD1 addsd +#define ADDSD2 addsd +#define ADDSD3 addsd +#define ADDSD4 subsd + +#elif defined(LN) || defined(LT) +#define ADDSD1 addsd +#define ADDSD2 addsd +#define ADDSD3 subsd +#define ADDSD4 addsd +#else +#define ADDSD1 addsd +#define ADDSD2 subsd +#define ADDSD3 addsd +#define ADDSD4 addsd +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + testl %eax, %eax + movl %eax, J # j = n + jle .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + testl %ebx, %ebx + jle .L99 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + xorps %xmm4, %xmm4 + prefetcht0 1 * SIZE(CO1) + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + ADDSD3 %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 0 * SIZE(BB), %xmm0 + ADDSD4 %xmm3, %xmm7 + mulsd 1 * SIZE(BB), %xmm1 + + ADDSD1 %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 0 * SIZE(BB), %xmm2 + ADDSD2 %xmm1, %xmm5 + mulsd 1 * SIZE(BB), %xmm3 + + ADDSD3 %xmm2, %xmm6 + movsd 3 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 2 * SIZE(BB), %xmm0 + ADDSD4 %xmm3, %xmm7 + mulsd 3 * SIZE(BB), %xmm1 + + ADDSD1 %xmm0, %xmm4 + movsd 4 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 2 * SIZE(BB), %xmm2 + ADDSD2 %xmm1, %xmm5 + mulsd 3 * SIZE(BB), %xmm3 + + ADDSD3 %xmm2, %xmm6 + movsd 5 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 4 * SIZE(BB), %xmm0 + ADDSD4 %xmm3, %xmm7 + mulsd 5 * SIZE(BB), %xmm1 + + ADDSD1 %xmm0, %xmm4 + movsd 6 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 4 * SIZE(BB), %xmm2 + ADDSD2 %xmm1, %xmm5 + mulsd 5 * SIZE(BB), %xmm3 + + ADDSD3 %xmm2, %xmm6 + movsd 7 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 6 * SIZE(BB), %xmm0 + ADDSD4 %xmm3, %xmm7 + mulsd 7 * SIZE(BB), %xmm1 + + ADDSD1 %xmm0, %xmm4 + movsd 8 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 6 * SIZE(BB), %xmm2 + ADDSD2 %xmm1, %xmm5 + mulsd 7 * SIZE(BB), %xmm3 + + addl $8 * SIZE, BB + addl $8 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + ADDSD3 %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 0 * SIZE(BB), %xmm0 + ADDSD4 %xmm3, %xmm7 + mulsd 1 * SIZE(BB), %xmm1 + + ADDSD1 %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 0 * SIZE(BB), %xmm2 + ADDSD2 %xmm1, %xmm5 + mulsd 1 * SIZE(BB), %xmm3 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + ADDSD3 %xmm2, %xmm6 + ADDSD4 %xmm3, %xmm7 + + addsd %xmm7, %xmm4 + addsd %xmm5, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + movsd 1 * SIZE(BB), %xmm1 +#else + movsd 0 * SIZE(AA), %xmm0 + movsd 1 * SIZE(AA), %xmm1 +#endif + + subsd %xmm4, %xmm0 + subsd %xmm6, %xmm1 + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AA), %xmm6 + movaps %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm7 + movaps %xmm1, %xmm4 + + mulsd %xmm6, %xmm0 + mulsd %xmm6, %xmm1 + mulsd %xmm7, %xmm5 + mulsd %xmm7, %xmm4 + + ADDSD4 %xmm4, %xmm0 + ADDSD3 %xmm5, %xmm1 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BB), %xmm6 + movaps %xmm0, %xmm5 + movsd 1 * SIZE(BB), %xmm7 + movaps %xmm1, %xmm4 + + mulsd %xmm6, %xmm0 + mulsd %xmm6, %xmm1 + mulsd %xmm7, %xmm5 + mulsd %xmm7, %xmm4 + + ADDSD4 %xmm4, %xmm0 + ADDSD2 %xmm5, %xmm1 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm1, 1 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm1, 1 * SIZE(AA) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA + addl %eax, BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S new file mode 100644 index 0000000..b01498f --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S @@ -0,0 +1,969 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 2) +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + +#define ADD1 addpd +#define ADD2 addpd + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + movl M, %ebx + testl %ebx, %ebx + jle .L999 + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $1, %eax + movl %eax, J # j = n + jle .L100 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + +#ifdef LN + pxor %xmm4, %xmm4 + prefetcht0 -2 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 -2 * SIZE(CO1, LDC) +#else + pxor %xmm4, %xmm4 + prefetcht0 1 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 1 * SIZE(CO1, LDC) +#endif + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + ADD1 %xmm3, %xmm6 + movaps -14 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps -10 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps -6 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps -2 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 0 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + ADD1 %xmm3, %xmm6 + movaps 2 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 4 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps 6 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 8 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps 10 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 12 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps 14 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 16 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + subl $-32 * SIZE, BB + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + + subl $1, %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + ADD1 %xmm3, %xmm6 + movaps -14 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + + movaps -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + + ADD1 %xmm3, %xmm6 + pcmpeqb %xmm1, %xmm1 + ADD2 %xmm2, %xmm7 + psllq $63, %xmm1 + +#ifndef CONJ + pshufd $0x40, %xmm1, %xmm0 + shufps $0x04, %xmm1, %xmm1 + + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm1, %xmm0 +#else + pshufd $0x04, %xmm1, %xmm0 +#endif + shufps $0x40, %xmm1, %xmm1 + + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#endif + + haddpd %xmm5, %xmm4 + haddpd %xmm7, %xmm6 + + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BB), %xmm5 + movapd -14 * SIZE(BB), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd -16 * SIZE(AA), %xmm5 + movapd -14 * SIZE(AA), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AA), %xmm2 + movddup -15 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(BB), %xmm2 + movddup -15 * SIZE(BB), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 + + movddup -14 * SIZE(BB), %xmm2 + movddup -13 * SIZE(BB), %xmm3 + + movapd %xmm5, %xmm4 + pshufd $0x4e, %xmm5, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm7 + subpd %xmm6, %xmm7 + + movddup -10 * SIZE(BB), %xmm2 + movddup -9 * SIZE(BB), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 +#endif + +#ifdef RT + movddup -10 * SIZE(BB), %xmm2 + movddup -9 * SIZE(BB), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 + + movddup -12 * SIZE(BB), %xmm2 + movddup -11 * SIZE(BB), %xmm3 + + movapd %xmm7, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm5 + + movddup -16 * SIZE(BB), %xmm2 + movddup -15 * SIZE(BB), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movlpd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + + movlpd %xmm7, 0 * SIZE(CO1, LDC) + movhpd %xmm7, 1 * SIZE(CO1, LDC) + +#if defined(LN) || defined(LT) + movapd %xmm5, -16 * SIZE(BB) + movapd %xmm7, -14 * SIZE(BB) +#else + movapd %xmm5, -16 * SIZE(AA) + movapd %xmm7, -14 * SIZE(AA) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L100: + movl N, %eax + testl $1, %eax + jle .L999 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + ALIGN_4 + +L110: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 +#ifdef LN + prefetcht0 -2 * SIZE(CO1) +#else + prefetcht0 1 * SIZE(CO1) +#endif + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je L115 + ALIGN_4 + +L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -14 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm6 + movaps -12 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -10 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm6 + movaps -8 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -6 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm6 + movaps -4 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -2 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm6 + movaps 0 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne L112 + ALIGN_4 + +L115: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je L118 + ALIGN_4 + +L116: + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -14 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg L116 + ALIGN_4 + +L118: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), BB +#endif + + addpd %xmm6, %xmm4 + pcmpeqb %xmm1, %xmm1 + addpd %xmm7, %xmm5 + psllq $63, %xmm1 + +#ifndef CONJ + pshufd $0x40, %xmm1, %xmm0 + shufps $0x04, %xmm1, %xmm1 + + pxor %xmm0, %xmm4 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm1, %xmm0 +#else + pshufd $0x04, %xmm1, %xmm0 +#endif + shufps $0x40, %xmm1, %xmm1 + + pxor %xmm0, %xmm5 +#endif + + haddpd %xmm5, %xmm4 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BB), %xmm5 + subpd %xmm4, %xmm5 +#else + movapd -16 * SIZE(AA), %xmm5 + subpd %xmm4, %xmm5 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AA), %xmm2 + movddup -15 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#if defined(RN) || defined(RT) + movddup -16 * SIZE(BB), %xmm2 + movddup -15 * SIZE(BB), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movlpd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm5, -16 * SIZE(BB) +#else + movapd %xmm5, -16 * SIZE(AA) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA + addl %eax, BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg L110 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_sse2.S b/kernel/x86/ztrsm_kernel_LT_1x2_sse2.S new file mode 100644 index 0000000..fdeecc7 --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LT_1x2_sse2.S @@ -0,0 +1,1328 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 24 + STACK + ARGS(%esi) +#define STACK_A 32 + STACK + ARGS(%esi) +#define STACK_B 36 + STACK + ARGS(%esi) +#define STACK_C 40 + STACK + ARGS(%esi) +#define STACK_LDC 44 + STACK + ARGS(%esi) +#define STACK_OFFT 48 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#else +#define PREFETCH prefetcht0 +#endif + +#define PREFETCHSIZE (8 * 10 + 4) + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + +#ifndef CONJ +#define NN +#else +#if defined(LN) || defined(LT) +#define CN +#else +#define NC +#endif +#endif + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movl STACK_M, %ebx + movl STACK_N, %eax + movl STACK_K, %ecx + movl STACK_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl STACK_B, B + movl STACK_C, %ebx + movss STACK_OFFT, %xmm4 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 # Generate mask + pxor %xmm2, %xmm2 + + movlpd %xmm2, 0 + POSINV + movlpd %xmm7, 8 + POSINV + + movl %ebx, C + movl STACK_LDC, LDC + + movss %xmm4, OFFSET + movss %xmm4, KK + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $1, %eax + movl %eax, J # j = n + jle .L100 + ALIGN_4 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 2), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1, %eax + jle .L03 + ALIGN_4 + +.L02: + prefetchnta 56 * SIZE(B) + + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + movlpd 4 * SIZE(B), %xmm4 + movlpd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm6 + movlpd 7 * SIZE(B), %xmm7 + + movlpd %xmm0, 0 * SIZE(BB) + movlpd %xmm0, 1 * SIZE(BB) + movlpd %xmm1, 2 * SIZE(BB) + movlpd %xmm1, 3 * SIZE(BB) + movlpd %xmm2, 4 * SIZE(BB) + movlpd %xmm2, 5 * SIZE(BB) + movlpd %xmm3, 6 * SIZE(BB) + movlpd %xmm3, 7 * SIZE(BB) + movlpd %xmm4, 8 * SIZE(BB) + movlpd %xmm4, 9 * SIZE(BB) + movlpd %xmm5, 10 * SIZE(BB) + movlpd %xmm5, 11 * SIZE(BB) + movlpd %xmm6, 12 * SIZE(BB) + movlpd %xmm6, 13 * SIZE(BB) + movlpd %xmm7, 14 * SIZE(BB) + movlpd %xmm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + subl $-16 * SIZE, BB + + decl %eax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1, %eax + BRANCH + jle .L05 + + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + + movlpd %xmm0, 0 * SIZE(BB) + movlpd %xmm0, 1 * SIZE(BB) + movlpd %xmm1, 2 * SIZE(BB) + movlpd %xmm1, 3 * SIZE(BB) + movlpd %xmm2, 4 * SIZE(BB) + movlpd %xmm2, 5 * SIZE(BB) + movlpd %xmm3, 6 * SIZE(BB) + movlpd %xmm3, 7 * SIZE(BB) + + addl $4 * SIZE, B + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + leal (, LDC, 2), %eax + subl %eax, C +#endif + + movl C, CO1 + +#ifndef RT + leal (, LDC, 2), %eax + addl %eax, C +#endif + + movl M, %ebx + testl %ebx, %ebx + jle .L100 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchw -2 * SIZE(CO1) + prefetchw -2 * SIZE(CO1, LDC) +#else + prefetchw 2 * SIZE(CO1) + prefetchw 2 * SIZE(CO1, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + +#if 1 + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) + + addl $128 * 4 * SIZE, BB + addl $128 * 1 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB + ALIGN_4 +#else + + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addl $64 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L11 + ALIGN_4 +#endif + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 8 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + addl %eax, AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + + movapd POSINV, %xmm1 + + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm1, %xmm5 + xorpd %xmm1, %xmm7 +#else + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 +#else + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm5 + movapd 2 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm5 + movapd 2 * SIZE(AA), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movlpd 0 * SIZE(AA), %xmm2 + movhpd 0 * SIZE(AA), %xmm2 + movlpd 1 * SIZE(AA), %xmm3 + movhpd 1 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movlpd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 + + movlpd 2 * SIZE(B), %xmm2 + movhpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + movhpd 3 * SIZE(B), %xmm3 + + movapd %xmm5, %xmm4 + pshufd $0x4e, %xmm5, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm7 + subpd %xmm6, %xmm7 + + movlpd 6 * SIZE(B), %xmm2 + movhpd 6 * SIZE(B), %xmm2 + movlpd 7 * SIZE(B), %xmm3 + movhpd 7 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 +#endif + +#ifdef RT + movlpd 6 * SIZE(B), %xmm2 + movhpd 6 * SIZE(B), %xmm2 + movlpd 7 * SIZE(B), %xmm3 + movhpd 7 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 + + movlpd 4 * SIZE(B), %xmm2 + movhpd 4 * SIZE(B), %xmm2 + movlpd 5 * SIZE(B), %xmm3 + movhpd 5 * SIZE(B), %xmm3 + + movapd %xmm7, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm5 + + movlpd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movlpd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movlpd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + + movlpd %xmm7, 0 * SIZE(CO1, LDC) + movhpd %xmm7, 1 * SIZE(CO1, LDC) + +#if defined(LN) || defined(LT) + movapd %xmm5, 0 * SIZE(B) + movapd %xmm7, 2 * SIZE(B) + + movlpd %xmm5, 0 * SIZE(BB) + movlpd %xmm5, 1 * SIZE(BB) + movhpd %xmm5, 2 * SIZE(BB) + movhpd %xmm5, 3 * SIZE(BB) + movlpd %xmm7, 4 * SIZE(BB) + movlpd %xmm7, 5 * SIZE(BB) + movhpd %xmm7, 6 * SIZE(BB) + movhpd %xmm7, 7 * SIZE(BB) +#else + movapd %xmm5, 0 * SIZE(AA) + movapd %xmm7, 2 * SIZE(AA) + +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L100: + movl N, %eax + andl $1, %eax + jle .L500 + ALIGN_4 + +.L101: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 2), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L103 + ALIGN_4 + +.L102: + prefetchnta 56 * SIZE(B) + + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + movlpd 4 * SIZE(B), %xmm4 + movlpd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm6 + movlpd 7 * SIZE(B), %xmm7 + + movlpd %xmm0, 0 * SIZE(BB) + movlpd %xmm0, 1 * SIZE(BB) + movlpd %xmm1, 2 * SIZE(BB) + movlpd %xmm1, 3 * SIZE(BB) + movlpd %xmm2, 4 * SIZE(BB) + movlpd %xmm2, 5 * SIZE(BB) + movlpd %xmm3, 6 * SIZE(BB) + movlpd %xmm3, 7 * SIZE(BB) + movlpd %xmm4, 8 * SIZE(BB) + movlpd %xmm4, 9 * SIZE(BB) + movlpd %xmm5, 10 * SIZE(BB) + movlpd %xmm5, 11 * SIZE(BB) + movlpd %xmm6, 12 * SIZE(BB) + movlpd %xmm6, 13 * SIZE(BB) + movlpd %xmm7, 14 * SIZE(BB) + movlpd %xmm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + subl $-16 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L103: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + + movlpd %xmm0, 0 * SIZE(BB) + movlpd %xmm0, 1 * SIZE(BB) + movlpd %xmm1, 2 * SIZE(BB) + movlpd %xmm1, 3 * SIZE(BB) + + addl $2 * SIZE, B + addl $4 * SIZE, BB + decl %eax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + + movl C, CO1 + +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + testl %ebx, %ebx + jle .L199 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + +#ifdef LN + prefetchw -2 * SIZE(CO1) +#else + prefetchw 2 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + ALIGN_4 + +.L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd 18 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movapd 10 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movapd 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 26 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 14 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + ALIGN_4 + +.L113: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L113 + ALIGN_4 + +.L114: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + addl %eax, AA + addl %eax, B + leal (BB, %eax, 2), BB +#endif + + movapd POSINV, %xmm1 + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + SHUFPD_1 %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm1, %xmm5 +#else + xorpd %xmm1, %xmm4 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 +#else + addpd %xmm5, %xmm4 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm5 + subpd %xmm4, %xmm5 +#else + movapd 0 * SIZE(AA), %xmm5 + subpd %xmm4, %xmm5 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movlpd 0 * SIZE(AA), %xmm2 + movhpd 0 * SIZE(AA), %xmm2 + movlpd 1 * SIZE(AA), %xmm3 + movhpd 1 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movlpd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movlpd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movlpd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm5, 0 * SIZE(B) + + movlpd %xmm5, 0 * SIZE(BB) + movlpd %xmm5, 1 * SIZE(BB) + movhpd %xmm5, 2 * SIZE(BB) + movhpd %xmm5, 3 * SIZE(BB) +#else + movapd %xmm5, 0 * SIZE(AA) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L110 + ALIGN_4 + +.L199: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L500: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_sse3.S b/kernel/x86/ztrsm_kernel_LT_1x2_sse3.S new file mode 100644 index 0000000..29103ba --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LT_1x2_sse3.S @@ -0,0 +1,965 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#ifdef PENTIUM4 +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef PENTIUMM +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + +#define ADDSUB addpd + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addpd %xmm2, %xmm4; \ + movddup 1 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movddup 2 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 3 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + ADDSUB %xmm2, %xmm7; \ + movddup 4 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 5 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movddup 6 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 7 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + ADDSUB %xmm2, %xmm7; \ + movddup 16 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL3(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 9 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movddup 10 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 11 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + ADDSUB %xmm3, %xmm7; \ + movddup 12 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL4(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 13 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movddup 14 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 15 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + ADDSUB %xmm3, %xmm7; \ + movddup 24 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL5(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 17 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movddup 18 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 19 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + ADDSUB %xmm2, %xmm7; \ + movddup 20 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL6(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 21 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movddup 22 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 23 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + ADDSUB %xmm2, %xmm7 + +#define KERNEL7(address) \ + movddup 32 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 25 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movddup 26 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 27 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + ADDSUB %xmm3, %xmm7; \ + movddup 28 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 29 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movddup 30 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 31 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + ADDSUB %xmm3, %xmm7; \ + movddup 40 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $1, %eax + movl %eax, J # j = n + jle .L100 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + testl %ebx, %ebx + jle .L100 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchnta -2 * SIZE(CO1) + prefetchnta -2 * SIZE(CO1, LDC, 1) +#else + prefetchnta 2 * SIZE(CO1) + prefetchnta 2 * SIZE(CO1, LDC, 1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L12 + ALIGN_4 + +.L11: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addl $32 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L11 + ALIGN_4 + +.L12: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + ADDSUB %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + + pcmpeqb %xmm1, %xmm1 + psllq $63, %xmm1 + + shufps $0x40, %xmm1, %xmm1 + + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#ifndef CONJ + xorpd %xmm1, %xmm5 + xorpd %xmm1, %xmm7 + + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 +#else +#if defined(LN) || defined(LT) + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 +#else + xorpd %xmm1, %xmm5 + xorpd %xmm1, %xmm7 +#endif + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BB), %xmm5 + movapd 2 * SIZE(BB), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm5 + movapd 2 * SIZE(AA), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movddup 0 * SIZE(AA), %xmm2 + movddup 1 * SIZE(AA), %xmm3 + + movapd %xmm5, %xmm4 + movapd %xmm7, %xmm6 + + SHUFPD_1 %xmm4, %xmm4 + SHUFPD_1 %xmm6, %xmm6 + + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef RN + movddup 0 * SIZE(BB), %xmm2 + movddup 1 * SIZE(BB), %xmm3 + + movapd %xmm5, %xmm4 + SHUFPD_1 %xmm4, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 + + movddup 2 * SIZE(BB), %xmm2 + movddup 3 * SIZE(BB), %xmm3 + + movapd %xmm5, %xmm4 + movapd %xmm5, %xmm6 + SHUFPD_1 %xmm6, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm7 + subpd %xmm6, %xmm7 + + movddup 6 * SIZE(BB), %xmm2 + movddup 7 * SIZE(BB), %xmm3 + + movapd %xmm7, %xmm6 + SHUFPD_1 %xmm6, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 +#endif + +#ifdef RT + movddup 6 * SIZE(BB), %xmm2 + movddup 7 * SIZE(BB), %xmm3 + + movapd %xmm7, %xmm6 + SHUFPD_1 %xmm6, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 + + movddup 4 * SIZE(BB), %xmm2 + movddup 5 * SIZE(BB), %xmm3 + + movapd %xmm7, %xmm4 + movapd %xmm7, %xmm6 + SHUFPD_1 %xmm6, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm5 + + movddup 0 * SIZE(BB), %xmm2 + movddup 1 * SIZE(BB), %xmm3 + + movapd %xmm5, %xmm4 + SHUFPD_1 %xmm4, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movlpd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + + movlpd %xmm7, 0 * SIZE(CO1, LDC) + movhpd %xmm7, 1 * SIZE(CO1, LDC) + +#if defined(LN) || defined(LT) + movapd %xmm5, 0 * SIZE(BB) + movapd %xmm7, 2 * SIZE(BB) +#else + movapd %xmm5, 0 * SIZE(AA) + movapd %xmm7, 2 * SIZE(AA) + +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L100: + movl N, %eax + testl $1, %eax + jle .L500 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + testl %ebx, %ebx + jle .L500 + ALIGN_4 + +L110: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchnta -2 * SIZE(CO1) +#else + prefetchnta 2 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je L112 + ALIGN_4 + +L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 4 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 5 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 6 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm5 + movddup 6 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 7 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 16 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm7 + movddup 16 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 9 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm5 + movddup 10 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 11 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 12 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 13 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 14 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm5 + movddup 14 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 15 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 24 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm7 + movddup 24 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne L111 + ALIGN_4 + +L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je L114 + ALIGN_4 + +L113: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg L113 + ALIGN_4 + +L114: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), BB +#endif + + pcmpeqb %xmm1, %xmm1 + psllq $63, %xmm1 + + shufps $0x40, %xmm1, %xmm1 + + SHUFPD_1 %xmm5, %xmm5 + +#ifndef CONJ + xorpd %xmm1, %xmm5 + + subpd %xmm5, %xmm4 +#else +#if defined(LN) || defined(LT) + xorpd %xmm1, %xmm4 +#else + xorpd %xmm1, %xmm5 +#endif + addpd %xmm5, %xmm4 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BB), %xmm5 + subpd %xmm4, %xmm5 +#else + movapd 0 * SIZE(AA), %xmm5 + subpd %xmm4, %xmm5 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movddup 0 * SIZE(AA), %xmm2 + movddup 1 * SIZE(AA), %xmm3 + + movapd %xmm5, %xmm4 + SHUFPD_1 %xmm4, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#if defined(RN) || defined(RT) + movddup 0 * SIZE(BB), %xmm2 + movddup 1 * SIZE(BB), %xmm3 + + movapd %xmm5, %xmm4 + SHUFPD_1 %xmm4, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movlpd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm5, 0 * SIZE(BB) +#else + movapd %xmm5, 0 * SIZE(AA) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA + addl %eax, BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg L110 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L500: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LT_2x1_core2.S b/kernel/x86/ztrsm_kernel_LT_2x1_core2.S new file mode 100644 index 0000000..4674654 --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LT_2x1_core2.S @@ -0,0 +1,1056 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE (8 * 4) + +#if !defined(HAVE_SSE2) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 24 + STACK + ARGS(%esi) +#define STACK_A 32 + STACK + ARGS(%esi) +#define STACK_B 36 + STACK + ARGS(%esi) +#define STACK_C 40 + STACK + ARGS(%esi) +#define STACK_LDC 44 + STACK + ARGS(%esi) +#define STACK_OFFT 48 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx +#define CO1 %esi + +#define ADD1 addpd +#define ADD2 addpd + +#ifndef CONJ +#define NN +#else +#if defined(LN) || defined(LT) +#define CN +#else +#define NC +#endif +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC + movd STACK_OFFT, %mm4 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 # Generate mask + pxor %xmm2, %xmm2 + + movsd %xmm2, 0 + POSINV + movsd %xmm7, 8 + POSINV + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK + movd %mm4, OFFSET + movd %mm4, KK + + sall $ZBASE_SHIFT, LDC + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + movl %eax, J # j = n + testl %eax, %eax + jle .L999 + ALIGN_2 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal 16 * SIZE + BUFFER, BB + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 2), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + ALIGN_2 + +.L02: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + movapd %xmm2, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) + + addl $ 8 * SIZE, B + subl $-16 * SIZE, BB + decl %eax + jne .L02 + ALIGN_2 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + ALIGN_2 + +.L04: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + + addl $ 2 * SIZE, B + addl $ 4 * SIZE, BB + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + + movl C, CO1 + +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm3 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchnta -4 * SIZE(CO1) +#else + prefetchnta 4 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + ADD2 %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + ADD1 %xmm2, %xmm6 + ADD2 %xmm1, %xmm7 + + movapd -12 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm4 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + ADD2 %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 0 * SIZE(AA), %xmm0 + ADD1 %xmm2, %xmm6 + ADD2 %xmm1, %xmm7 + + movapd -8 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + ADD1 %xmm1, %xmm4 + movapd -6 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + ADD2 %xmm3, %xmm5 + movapd -6 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + movapd -4 * SIZE(AA), %xmm3 + ADD1 %xmm2, %xmm6 + ADD2 %xmm1, %xmm7 + + movapd -4 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + ADD1 %xmm1, %xmm4 + movapd -2 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + ADD2 %xmm3, %xmm5 + movapd -2 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + movapd 8 * SIZE(AA), %xmm3 + ADD1 %xmm2, %xmm6 + ADD2 %xmm1, %xmm7 + movapd 0 * SIZE(BB), %xmm1 + + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm4 + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + ADD2 %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 4 * SIZE(AA), %xmm0 + ADD1 %xmm2, %xmm6 + ADD2 %xmm1, %xmm7 + + movapd 4 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm4 + movapd 6 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + ADD2 %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 16 * SIZE(AA), %xmm0 + ADD1 %xmm2, %xmm6 + ADD2 %xmm1, %xmm7 + + movapd 8 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + ADD1 %xmm1, %xmm4 + movapd 10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + ADD2 %xmm3, %xmm5 + movapd 10 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + ADD1 %xmm2, %xmm6 + movapd 12 * SIZE(AA), %xmm3 + ADD2 %xmm1, %xmm7 + + movapd 12 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + ADD1 %xmm1, %xmm4 + movapd 14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + ADD2 %xmm3, %xmm5 + movapd 14 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + subl $-32 * SIZE, BB + movapd 24 * SIZE(AA), %xmm3 + subl $-32 * SIZE, AA + ADD1 %xmm2, %xmm6 + ADD2 %xmm1, %xmm7 + movapd -16 * SIZE(BB), %xmm1 + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L16: + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm3 + mulpd %xmm0, %xmm1 + movapd -14 * SIZE(AA), %xmm0 + ADD2 %xmm1, %xmm5 + movapd -12 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + ADD1 %xmm2, %xmm6 + mulpd %xmm0, %xmm3 + movapd -12 * SIZE(AA), %xmm0 + ADD2 %xmm3, %xmm7 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + addl %eax, B + leal (BB, %eax, 2), BB +#endif + + movapd POSINV, %xmm1 + + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm1, %xmm5 + xorpd %xmm1, %xmm7 +#else + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 +#else + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm5 + movapd -14 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd -16 * SIZE(AA), %xmm5 + movapd -14 * SIZE(AA), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#ifdef LN + movddup -10 * SIZE(AA), %xmm2 + movddup -9 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 + + movddup -12 * SIZE(AA), %xmm2 + movddup -11 * SIZE(AA), %xmm3 + + movapd %xmm7, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm5 + + movddup -16 * SIZE(AA), %xmm2 + movddup -15 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm2 + movddup -15 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 + + movddup -14 * SIZE(AA), %xmm2 + movddup -13 * SIZE(AA), %xmm3 + + movapd %xmm5, %xmm4 + pshufd $0x4e, %xmm5, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm7 + subpd %xmm6, %xmm7 + + movddup -10 * SIZE(AA), %xmm2 + movddup -9 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm2 + movddup -15 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm2 + movddup -15 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + + movsd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + movsd %xmm7, 2 * SIZE(CO1) + movhpd %xmm7, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm5, -16 * SIZE(B) + movapd %xmm7, -14 * SIZE(B) + + movddup %xmm5, %xmm4 + unpckhpd %xmm5, %xmm5 + movddup %xmm7, %xmm6 + unpckhpd %xmm7, %xmm7 + + movapd %xmm4, -16 * SIZE(BB) + movapd %xmm5, -14 * SIZE(BB) + movapd %xmm6, -12 * SIZE(BB) + movapd %xmm7, -10 * SIZE(BB) +#else + movapd %xmm5, -16 * SIZE(AA) + movapd %xmm7, -14 * SIZE(AA) + +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + +.L50: + movl M, %ebx + testl $1, %ebx + je .L99 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movapd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L52 + +.L51: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + ADD1 %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + ADD2 %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BB), %xmm0 + ADD1 %xmm1, %xmm6 + movapd 0 * SIZE(BB), %xmm1 + ADD2 %xmm0, %xmm7 + movapd -12 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -6 * SIZE(BB), %xmm0 + ADD1 %xmm3, %xmm4 + movapd -4 * SIZE(BB), %xmm3 + ADD2 %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -2 * SIZE(BB), %xmm0 + ADD1 %xmm3, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + ADD2 %xmm0, %xmm7 + movapd 0 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm1 + mulpd 2 * SIZE(BB), %xmm2 + ADD1 %xmm1, %xmm4 + movapd 4 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + movapd -6 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 6 * SIZE(BB), %xmm2 + ADD1 %xmm1, %xmm6 + movapd 16 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + movapd -4 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 10 * SIZE(BB), %xmm2 + ADD1 %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm5 + movapd -2 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 14 * SIZE(BB), %xmm2 + ADD1 %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + movapd 8 * SIZE(AA), %xmm2 + + subl $-16 * SIZE, AA + addl $ 32 * SIZE, BB + decl %eax # l-- + jg .L51 + ALIGN_2 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # l = (k & 3) + jle .L54 + ALIGN_2 + +.L53: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + ADD1 %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + ADD2 %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax # l-- + jg .L53 + +.L54: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + sall $ZBASE_SHIFT, %eax + addl %eax, AA + addl %eax, B + leal (BB, %eax, 2), BB +#endif + + movapd POSINV, %xmm1 + + SHUFPD_1 %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm1, %xmm5 +#else + xorpd %xmm1, %xmm4 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 +#else + addpd %xmm5, %xmm4 +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm5 + + subpd %xmm4, %xmm5 +#else + movapd -16 * SIZE(AA), %xmm5 + + subpd %xmm4, %xmm5 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#ifdef LN + movddup -16 * SIZE(AA), %xmm2 + movddup -15 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm2 + movddup -15 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm2 + movddup -15 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm2 + movddup -15 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movsd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm5, -16 * SIZE(B) + + movddup %xmm5, %xmm4 + unpckhpd %xmm5, %xmm5 + + movapd %xmm4, -16 * SIZE(BB) + movapd %xmm5, -14 * SIZE(BB) +#else + movapd %xmm5, -16 * SIZE(AA) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + + decl J # j -- + jg .L01 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LT_2x1_sse2.S b/kernel/x86/ztrsm_kernel_LT_2x1_sse2.S new file mode 100644 index 0000000..77f3026 --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LT_2x1_sse2.S @@ -0,0 +1,1164 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE (8 * 4) + +#if !defined(HAVE_SSE2) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 24 + STACK + ARGS(%esi) +#define STACK_A 32 + STACK + ARGS(%esi) +#define STACK_B 36 + STACK + ARGS(%esi) +#define STACK_C 40 + STACK + ARGS(%esi) +#define STACK_LDC 44 + STACK + ARGS(%esi) +#define STACK_OFFT 48 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx +#define CO1 %esi + +#define KERNEL1(address) \ + movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL4(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL5(address) \ + movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 20 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL6(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 32 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL7(address) \ + movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 28 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 40 * SIZE + (address) * SIZE(AA), %xmm1 + +#ifndef CONJ +#define NN +#else +#if defined(LN) || defined(LT) +#define CN +#else +#define NC +#endif +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC + movd STACK_OFFT, %mm4 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 # Generate mask + pxor %xmm2, %xmm2 + + movsd %xmm2, 0 + POSINV + movsd %xmm7, 8 + POSINV + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK + movd %mm4, OFFSET + movd %mm4, KK + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + movl %eax, J # j = n + testl %eax, %eax + jle .L999 + ALIGN_2 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 2), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + ALIGN_2 + +.L02: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(BB) + movapd %xmm1, 2 * SIZE(BB) + movapd %xmm2, 4 * SIZE(BB) + movapd %xmm3, 6 * SIZE(BB) + movapd %xmm4, 8 * SIZE(BB) + movapd %xmm5, 10 * SIZE(BB) + movapd %xmm6, 12 * SIZE(BB) + movapd %xmm7, 14 * SIZE(BB) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L02 + ALIGN_2 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + ALIGN_2 + +.L04: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + + movapd %xmm0, 0 * SIZE(BB) + movapd %xmm1, 2 * SIZE(BB) + + addl $ 2 * SIZE, B + addl $ 4 * SIZE, BB + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + + movl C, CO1 + +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchnta -4 * SIZE(CO1) +#else + prefetchnta 4 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $-8, %eax + NOBRANCH + je .L12 + sall $3, %eax + +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $64 * 1, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $64 * 2, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $64 * 3, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $64 * 4, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $64 * 5, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $64 * 6, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $64 * 7, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $64 * 4 * SIZE, AA + addl $64 * 4 * SIZE, BB + subl $64 * 8, %eax + BRANCH + jg .L1X + +.L11: + leal (BB, %eax, 4), BB + leal (AA, %eax, 4), AA + +.L12: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 0 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm1 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm1 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm7 + + addl $4 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L13 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + addl %eax, B + leal (BB, %eax, 2), BB +#endif + + movapd POSINV, %xmm1 + + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm1, %xmm5 + xorpd %xmm1, %xmm7 +#else + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 +#else + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm5 + movapd 2 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm5 + movapd 2 * SIZE(AA), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + + +#ifdef LN + movsd 6 * SIZE(AA), %xmm2 + movhpd 6 * SIZE(AA), %xmm2 + movsd 7 * SIZE(AA), %xmm3 + movhpd 7 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 + + movsd 4 * SIZE(AA), %xmm2 + movhpd 4 * SIZE(AA), %xmm2 + movsd 5 * SIZE(AA), %xmm3 + movhpd 5 * SIZE(AA), %xmm3 + + movapd %xmm7, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm5 + + movsd 0 * SIZE(AA), %xmm2 + movhpd 0 * SIZE(AA), %xmm2 + movsd 1 * SIZE(AA), %xmm3 + movhpd 1 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm2 + movhpd 0 * SIZE(AA), %xmm2 + movsd 1 * SIZE(AA), %xmm3 + movhpd 1 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 + + movsd 2 * SIZE(AA), %xmm2 + movhpd 2 * SIZE(AA), %xmm2 + movsd 3 * SIZE(AA), %xmm3 + movhpd 3 * SIZE(AA), %xmm3 + + movapd %xmm5, %xmm4 + pshufd $0x4e, %xmm5, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm7 + subpd %xmm6, %xmm7 + + movsd 6 * SIZE(AA), %xmm2 + movhpd 6 * SIZE(AA), %xmm2 + movsd 7 * SIZE(AA), %xmm3 + movhpd 7 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 +#endif + +#ifdef RN + movsd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movsd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef RT + movsd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movsd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + + movsd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + movsd %xmm7, 2 * SIZE(CO1) + movhpd %xmm7, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm5, 0 * SIZE(B) + movapd %xmm7, 2 * SIZE(B) + + movsd %xmm5, 0 * SIZE(BB) + movsd %xmm5, 1 * SIZE(BB) + movhpd %xmm5, 2 * SIZE(BB) + movhpd %xmm5, 3 * SIZE(BB) + movsd %xmm7, 4 * SIZE(BB) + movsd %xmm7, 5 * SIZE(BB) + movhpd %xmm7, 6 * SIZE(BB) + movhpd %xmm7, 7 * SIZE(BB) +#else + movapd %xmm5, 0 * SIZE(AA) + movapd %xmm7, 2 * SIZE(AA) + +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + +.L50: + movl M, %ebx + testl $1, %ebx + je .L99 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, %ecx + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax # l = (k >> 2) + jle .L52 + +.L51: + mulpd %xmm0, %xmm1 + movapd 2 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm4 + movapd 16 * SIZE(BB), %xmm1 + + mulpd %xmm0, %xmm3 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm5 + movapd 4 * SIZE(BB), %xmm3 + + mulpd %xmm0, %xmm3 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + + addpd %xmm0, %xmm5 + movapd 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 10 * SIZE(BB), %xmm0 + + addpd %xmm2, %xmm4 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + + movapd 12 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 24 * SIZE(BB), %xmm2 + + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm5 + movapd 8 * SIZE(AA), %xmm0 + + addl $ 8 * SIZE, AA # aoffset += 2 + addl $16 * SIZE, BB # boffset1 += 4 + + decl %eax # l-- + jg .L51 + ALIGN_2 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # l = (k & 3) + jle .L54 + ALIGN_2 + +.L53: + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA # aoffset += 2 + addl $4 * SIZE, BB # boffset1 += 4 + decl %eax # l-- + jg .L53 + +.L54: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + addl %eax, AA + addl %eax, B + leal (BB, %eax, 2), BB +#endif + + movapd POSINV, %xmm1 + + SHUFPD_1 %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm1, %xmm5 +#else + xorpd %xmm1, %xmm4 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 +#else + addpd %xmm5, %xmm4 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm5 + + subpd %xmm4, %xmm5 +#else + movapd 0 * SIZE(AA), %xmm5 + + subpd %xmm4, %xmm5 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#ifdef LN + movsd 0 * SIZE(AA), %xmm2 + movhpd 0 * SIZE(AA), %xmm2 + movsd 1 * SIZE(AA), %xmm3 + movhpd 1 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm2 + movhpd 0 * SIZE(AA), %xmm2 + movsd 1 * SIZE(AA), %xmm3 + movhpd 1 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RN + movsd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movsd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RT + movsd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movsd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movsd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm5, 0 * SIZE(B) + + movsd %xmm5, 0 * SIZE(BB) + movsd %xmm5, 1 * SIZE(BB) + movhpd %xmm5, 2 * SIZE(BB) + movhpd %xmm5, 3 * SIZE(BB) +#else + movapd %xmm5, 0 * SIZE(AA) + +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + + decl J # j -- + jg .L01 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S new file mode 100644 index 0000000..3668ee2 --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S @@ -0,0 +1,1966 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef ATOM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 84 +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (16 * 2) +#endif + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx +#define CO1 %esi + +#define ADD1 addps +#define ADD2 addps + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + movl M, %ebx + testl %ebx, %ebx + jle .L999 + + subl $-32 * SIZE, A + subl $-32 * SIZE, B + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + movl %eax, J + sarl $1, J + jle .L100 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx + jle .L30 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + +#ifdef LN + pxor %xmm4, %xmm4 + prefetcht0 -4 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 -4 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 +#else + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 3 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L11: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + subl $-32 * SIZE, BB + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + subl $-32 * SIZE, AA + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -32 * SIZE(AA), %xmm0 + + decl %eax + jne .L11 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + + ADD2 %xmm2, %xmm7 + pcmpeqb %xmm0, %xmm0 + ADD1 %xmm3, %xmm6 + psllq $63, %xmm0 + +#ifndef CONJ + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 + + shufps $0xb1, %xmm0, %xmm0 +#else +#if defined(LN) || defined(LT) + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#else + pshufd $0xb1, %xmm0, %xmm1 + + pxor %xmm1, %xmm5 + pxor %xmm1, %xmm7 +#endif +#endif + + haddps %xmm5, %xmm4 + haddps %xmm7, %xmm6 + + shufps $0xd8, %xmm4, %xmm4 + shufps $0xd8, %xmm6, %xmm6 + + movaps %xmm4, %xmm5 + shufps $0xe4, %xmm6, %xmm4 + shufps $0xe4, %xmm5, %xmm6 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + + movaps -32 * SIZE(BB), %xmm2 + movaps -28 * SIZE(BB), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm5, %xmm3 +#else + movaps -32 * SIZE(AA), %xmm1 + movaps -28 * SIZE(AA), %xmm5 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm5 +#endif + +#ifdef LN + movaps -28 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm2 + subps %xmm1, %xmm2 + + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm3 + subps %xmm1, %xmm3 + + movaps -28 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm5 + subps %xmm2, %xmm5 + + movaps -28 * SIZE(BB), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef RT + movaps -28 * SIZE(BB), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm1 + subps %xmm2, %xmm1 + + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, -32 * SIZE(BB) + movaps %xmm3, -28 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm3, 2 * SIZE(CO1, LDC) +#else + movaps %xmm1, -32 * SIZE(AA) + movaps %xmm5, -28 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + + movlps %xmm5, 0 * SIZE(CO1, LDC) + movhps %xmm5, 2 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx + jg .L10 + ALIGN_4 + +.L30: + movl M, %ebx + andl $1, %ebx + jle .L99 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L42 + ALIGN_4 + +.L41: + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -26 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -22 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -18 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps 0 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -16 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + subl $-32 * SIZE, BB + decl %eax + jne .L41 + ALIGN_4 + +.L42: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L44 + ALIGN_4 + +.L43: + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L43 + ALIGN_4 + +.L44: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + + addps %xmm2, %xmm6 + addps %xmm3, %xmm7 + + pshufd $0xb1, %xmm5, %xmm5 + pcmpeqb %xmm0, %xmm0 + pshufd $0xb1, %xmm7, %xmm7 + psllq $63, %xmm0 + +#ifndef CONJ + shufps $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#else +#if defined(LN) || defined(LT) + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 +#else + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#endif +#endif + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(LT) + unpcklpd %xmm6, %xmm4 + + movaps -32 * SIZE(BB), %xmm2 + + subps %xmm4, %xmm2 +#else + movsd -32 * SIZE(AA), %xmm1 + movsd -30 * SIZE(AA), %xmm5 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm5 +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm5 + subps %xmm2, %xmm5 + + movaps -28 * SIZE(BB), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef RT + movaps -28 * SIZE(BB), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm1 + subps %xmm2, %xmm1 + + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, -32 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO1, LDC) +#else + movlps %xmm1, -32 * SIZE(AA) + movlps %xmm5, -30 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm5, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L100: + movl N, %eax + andl $1, %eax + jle .L999 + +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + movhps -30 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 +#ifdef LN + prefetcht0 -4 * SIZE(CO1) +#else + prefetcht0 3 * SIZE(CO1) +#endif + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + ALIGN_4 + +.L111: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps 0 * SIZE(AA), %xmm0 + + subl $-32 * SIZE, AA + subl $-16 * SIZE, BB + + decl %eax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + ALIGN_4 + +.L113: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L113 + ALIGN_4 + +.L114: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + + addps %xmm2, %xmm4 + addps %xmm3, %xmm5 + + pshufd $0xb1, %xmm5, %xmm5 + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#ifndef CONJ + shufps $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 +#else +#if defined(LN) || defined(LT) + pxor %xmm0, %xmm4 +#else + pxor %xmm0, %xmm5 +#endif +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + + movsd -32 * SIZE(BB), %xmm2 + movsd -30 * SIZE(BB), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm5, %xmm3 +#else + movaps -32 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm1 +#endif + +#ifdef LN + movaps -28 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm2 + subps %xmm1, %xmm2 + + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm3 + subps %xmm1, %xmm3 + + movaps -28 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, -32 * SIZE(BB) + movlps %xmm3, -30 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) +#else + movaps %xmm1, -32 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L110 + ALIGN_4 + +.L130: + movl M, %ebx + andl $1, %ebx + jle .L149 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L142 + ALIGN_4 + +.L141: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -26 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -26 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -22 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -22 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -18 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -18 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -16 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + decl %eax + jne .L141 + ALIGN_4 + +.L142: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L144 + ALIGN_4 + +.L143: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L143 + ALIGN_4 + +.L144: +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), BB +#endif + + addps %xmm2, %xmm4 + addps %xmm3, %xmm5 + + pshufd $0xb1, %xmm5, %xmm5 + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#ifndef CONJ + shufps $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 +#else +#if defined(LN) || defined(LT) + pxor %xmm0, %xmm4 +#else + pxor %xmm0, %xmm5 +#endif +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BB), %xmm2 + + subps %xmm4, %xmm2 +#else + movsd -32 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, -32 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) +#else + movlps %xmm1, -32 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L149: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_sse.S b/kernel/x86/ztrsm_kernel_LT_2x2_sse.S new file mode 100644 index 0000000..84d40dd --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LT_2x2_sse.S @@ -0,0 +1,2201 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 48(%esp) +#define KK 52(%esp) +#define KKK 56(%esp) +#define AORIG 60(%esp) +#define BORIG 64(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx +#define CO1 %esi + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCHSIZE (16 * 10 + 8) +#define WPREFETCHSIZE 112 +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#endif + +#if defined(PENTIUM4) || defined(PENTIUMM) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 168 +#define PREFETCHW prefetcht0 +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 168 +#define PREFETCHW prefetcht0 +#endif + +#if defined(OPTERON) || !defined(HAVE_SSE2) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addps %xmm2, %xmm4; \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movl STACK_M, %ebx + movl STACK_N, %eax + movl STACK_K, %ecx + movl STACK_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl STACK_B, %edi + movl STACK_C, %ebx + movss STACK_OFFT, %xmm4 + + xorps %xmm7, %xmm7 + pcmpeqb %xmm7, %xmm7 + pslld $31, %xmm7 + xorps %xmm2, %xmm2 + +#ifndef CONJ + movss %xmm7, 0 + POSINV + movss %xmm2, 4 + POSINV + movss %xmm7, 8 + POSINV + movss %xmm2, 12 + POSINV +#else + movss %xmm2, 0 + POSINV + movss %xmm7, 4 + POSINV + movss %xmm2, 8 + POSINV + movss %xmm7, 12 + POSINV +#endif + + EMMS + + movl %ebx, C + movl STACK_LDC, LDC + + movss %xmm4, OFFSET + movss %xmm4, KK + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + movl %eax, J + sarl $1, J + jle .L100 + ALIGN_4 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 4), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1, %eax + jle .L03 + ALIGN_4 + +.L02: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + + decl %eax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1, %eax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + addl $ 4 * SIZE, B + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + + movl C, CO1 + +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $1, %ebx + jle .L30 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB # boffset1 = boffset + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $3 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + + PREFETCHW 3 * SIZE(CO1) + PREFETCHW 3 * SIZE(CO1, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L11: + KERNEL1(0 * 16) + KERNEL2(0 * 16) + KERNEL3(0 * 16) + KERNEL4(0 * 16) + KERNEL5(0 * 16) + KERNEL6(0 * 16) + KERNEL7(0 * 16) + KERNEL8(0 * 16) + + addl $ 32 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L11 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#else + xorps %xmm0, %xmm4 + xorps %xmm0, %xmm6 +#endif +#else + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#endif + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + + movaps 0 * SIZE(B), %xmm2 + movaps 4 * SIZE(B), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm5, %xmm3 +#else + movaps 0 * SIZE(AA), %xmm1 + movaps 4 * SIZE(AA), %xmm5 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm5 +#endif + +#ifdef LN + movaps 4 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm5 + subps %xmm2, %xmm5 + + movaps 4 * SIZE(B), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm1 + subps %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm5, 12 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm5 + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm5, 28 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm3, 2 * SIZE(CO1, LDC) +#else + movaps %xmm1, 0 * SIZE(AA) + movaps %xmm5, 4 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + + movlps %xmm5, 0 * SIZE(CO1, LDC) + movhps %xmm5, 2 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx + jg .L10 + ALIGN_4 + +.L30: + movl M, %ebx + andl $1, %ebx + jle .L99 + ALIGN_4 + +.L40: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB # boffset1 = boffset + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $3 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L42 + ALIGN_4 + +.L41: + mulps %xmm0, %xmm2 + prefetcht1 (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 2 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movsd 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 44 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 6 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + mulps 60 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movsd 16 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + addps %xmm2, %xmm4 + movaps 68 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 72 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + mulps 76 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 96 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movsd 10 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 84 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 88 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + mulps 92 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 112 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movsd 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 100 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 104 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + mulps 108 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 128 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movsd 14 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 116 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 120 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + mulps 124 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 144 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movsd 24 * SIZE(AA), %xmm1 + addl $ 16 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L41 + ALIGN_4 + +.L42: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L44 + ALIGN_4 + +.L43: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 2 * SIZE(AA), %xmm0 + + addl $ 2 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L43 + ALIGN_4 + +.L44: + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#else + xorps %xmm0, %xmm4 + xorps %xmm0, %xmm6 +#endif +#else + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#endif + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm6, %xmm4 + + movaps 0 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 +#else +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AA), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(AA), %xmm5 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm5 +#endif + +#if defined(LN) || defined(LT) + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm5 + subps %xmm2, %xmm5 + + movaps 4 * SIZE(B), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm1 + subps %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm5, 12 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO1, LDC) +#else + movlps %xmm1, 0 * SIZE(AA) + movlps %xmm5, 2 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm5, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L100: + movl N, %eax + andl $1, %eax + jle .L999 + ALIGN_4 + +.L101: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 4), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L103 + ALIGN_4 + +.L102: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L103: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L105 + ALIGN_4 + +.L104: +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + addl $ 2 * SIZE, %edi + addl $ 8 * SIZE, %ecx + decl %eax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + + movl C, CO1 + +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $1, %ebx + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB # boffset1 = boffset + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + PREFETCHW 3 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + ALIGN_4 + +.L111: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movaps 12 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 28 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 48 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $ 32 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + ALIGN_4 + +.L113: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jg .L113 + ALIGN_4 + +.L114: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm4 +#endif +#else + xorps %xmm0, %xmm5 +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm5, %xmm3 +#else + movaps 0 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm1 +#endif + +#ifdef LN + movaps 4 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 8 * SIZE(BB) + movaps %xmm1, 12 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) +#else + movaps %xmm1, 0 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L110 + ALIGN_4 + +.L130: + movl M, %ebx + andl $1, %ebx + jle .L149 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB # boffset1 = boffset + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L142 + ALIGN_4 + +.L141: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $ 16 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L141 + ALIGN_4 + +.L142: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L144 + ALIGN_4 + +.L143: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movsd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L143 + ALIGN_4 + +.L144: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm4 +#endif +#else + xorps %xmm0, %xmm5 +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + addl %eax, AA + addl %eax, B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 +#else +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) +#else + movlps %xmm1, 0 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L149: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + EMMS + + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LT_4x1_sse.S b/kernel/x86/ztrsm_kernel_LT_4x1_sse.S new file mode 100644 index 0000000..4f324bc --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LT_4x1_sse.S @@ -0,0 +1,1898 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 48(%esp) +#define KK 52(%esp) +#define KKK 56(%esp) +#define AORIG 60(%esp) +#define BORIG 64(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx +#define CO1 %esi + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#if !defined(HAVE_SSE2) || defined(OPTERON) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movl STACK_M, %ebx + movl STACK_N, %eax + movl STACK_K, %ecx + movl STACK_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl STACK_B, %edi + movl STACK_C, %ebx + movss STACK_OFFT, %xmm4 + +#ifndef CONJ + movl $0x80000000, 0 + POSINV + movl $0x00000000, 4 + POSINV + movl $0x80000000, 8 + POSINV + movl $0x00000000, 12 + POSINV +#else + movl $0x00000000, 0 + POSINV + movl $0x80000000, 4 + POSINV + movl $0x00000000, 8 + POSINV + movl $0x80000000, 12 + POSINV +#endif + + movl %ebx, C + movl STACK_LDC, LDC + + movss %xmm4, OFFSET + movss %xmm4, KK + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + movl %eax, J # j = n + testl %eax, %eax + jle .L999 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 4), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + +.L02: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + movss 4 * SIZE(B), %xmm0 + movss 5 * SIZE(B), %xmm1 + movss 6 * SIZE(B), %xmm2 + movss 7 * SIZE(B), %xmm3 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm2, 24 * SIZE(BB) + movaps %xmm3, 28 * SIZE(BB) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + decl %eax + jne .L02 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + +.L04: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + addl $2 * SIZE, B + addl $8 * SIZE, BB + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $2, %ebx + jle .L50 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $2 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + prefetcht0 8 * SIZE(CO1) + je .L12 + ALIGN_4 + +#define PREFETCHSIZE 48 + +.L11: +#ifdef CORE_KATMAI + prefetcht0 PREFETCHSIZE * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 16 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 8 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 16 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 24) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 32) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 36 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 48 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 48 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 40) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 40 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 44 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 56 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 48) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 48 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 52 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 64 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 56) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 60 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 72 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 72 * SIZE(AA), %xmm1 + + addl $64 * SIZE, BB + addl $64 * SIZE, AA + decl %eax + jne .L11 + +.L12: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 8 * SIZE(AA), %xmm0 + + addl $8 * SIZE, AA # aoffset += 8 + addl $8 * SIZE, BB # boffset1 += 8 + + decl %eax + jg .L13 + +.L14: + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#else + xorps %xmm0, %xmm4 + xorps %xmm0, %xmm6 +#endif +#else + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#endif + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm5 + movhps 2 * SIZE(B), %xmm5 + movsd 4 * SIZE(B), %xmm7 + movhps 6 * SIZE(B), %xmm7 +#else + movaps 0 * SIZE(AA), %xmm5 + movaps 4 * SIZE(AA), %xmm7 +#endif + + subps %xmm4, %xmm5 + subps %xmm6, %xmm7 + +#if defined(LN) || defined(LT) + movhlps %xmm5, %xmm4 + movhlps %xmm7, %xmm6 +#endif + +#ifdef LN +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 30 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm6, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps POSINV, %xmm6 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm6 + + addps %xmm3, %xmm6 + + movsd 28 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm6, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm6, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm7 + subps %xmm3, %xmm7 + + movsd 26 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm6, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm6, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm4 + subps %xmm3, %xmm4 + + movsd 24 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm6, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm6, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm5 + subps %xmm3, %xmm5 + + movsd 20 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm7, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps POSINV, %xmm7 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm7 + + addps %xmm3, %xmm7 + + movsd 18 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm7, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm7, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm4 + subps %xmm3, %xmm4 + + movsd 16 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm7, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm7, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm5 + subps %xmm3, %xmm5 + + movsd 10 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm4, %xmm4 + +#ifndef CONJ + xorps POSINV, %xmm4 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + + addps %xmm3, %xmm4 + + movsd 8 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm4, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm5 + subps %xmm3, %xmm5 + + movsd 0 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps POSINV, %xmm5 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef LT +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps POSINV, %xmm5 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm5 + + addps %xmm3, %xmm5 + + movsd 2 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm5, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm4 + subps %xmm3, %xmm4 + + movsd 4 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm5, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm7 + subps %xmm3, %xmm7 + + movsd 6 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm5, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm6 + subps %xmm3, %xmm6 + + movsd 10 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm4, %xmm4 + +#ifndef CONJ + xorps POSINV, %xmm4 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + + addps %xmm3, %xmm4 + + movsd 12 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm4, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm7 + subps %xmm3, %xmm7 + + movsd 14 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm4, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm6 + subps %xmm3, %xmm6 + + movsd 20 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm7, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps POSINV, %xmm7 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm7 + + addps %xmm3, %xmm7 + + movsd 22 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm7, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm7, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm6 + subps %xmm3, %xmm6 + + movsd 30 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm6, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps POSINV, %xmm6 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm6 + + addps %xmm3, %xmm6 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(B), %xmm1 + movhps 2 * SIZE(B), %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0x44, %xmm1, %xmm2 + pshufd $0x11, %xmm1, %xmm3 + + pshufd $0xa0, %xmm5, %xmm4 + pshufd $0xf5, %xmm5, %xmm5 + pshufd $0xa0, %xmm7, %xmm6 + pshufd $0xf5, %xmm7, %xmm7 +#else + movaps %xmm1, %xmm2 + shufps $0x44, %xmm2, %xmm2 + movaps %xmm1, %xmm3 + shufps $0x11, %xmm3, %xmm3 + + movaps %xmm5, %xmm4 + shufps $0xa0, %xmm4, %xmm4 + shufps $0xf5, %xmm5, %xmm5 + movaps %xmm7, %xmm6 + shufps $0xa0, %xmm6, %xmm6 + shufps $0xf5, %xmm7, %xmm7 +#endif + +#ifndef CONJ + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#else + xorps %xmm0, %xmm4 + xorps %xmm0, %xmm6 +#endif + + mulps %xmm2, %xmm4 + mulps %xmm3, %xmm5 + mulps %xmm2, %xmm6 + mulps %xmm3, %xmm7 + + addps %xmm4, %xmm5 + addps %xmm6, %xmm7 +#endif + +#ifdef LN + subl $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlhps %xmm4, %xmm5 + movlhps %xmm6, %xmm7 + + movlps %xmm5, 0 * SIZE(B) + movhps %xmm5, 2 * SIZE(B) + movlps %xmm7, 4 * SIZE(B) + movhps %xmm7, 6 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + pshufd $0xaa, %xmm5, %xmm2 + pshufd $0xff, %xmm5, %xmm3 +#else + movaps %xmm5, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm5, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm5, %xmm2 + shufps $0xaa, %xmm2, %xmm2 + movaps %xmm5, %xmm3 + shufps $0xff, %xmm3, %xmm3 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm1 + pshufd $0xaa, %xmm7, %xmm2 + pshufd $0xff, %xmm7, %xmm3 +#else + movaps %xmm7, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm7, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm7, %xmm2 + shufps $0xaa, %xmm2, %xmm2 + movaps %xmm7, %xmm3 + shufps $0xff, %xmm3, %xmm3 +#endif + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm2, 24 * SIZE(BB) + movaps %xmm3, 28 * SIZE(BB) +#else + movaps %xmm5, 0 * SIZE(AA) + movaps %xmm7, 4 * SIZE(AA) +#endif + + movlps %xmm5, 0 * SIZE(CO1) + movhps %xmm5, 2 * SIZE(CO1) + movlps %xmm7, 4 * SIZE(CO1) + movhps %xmm7, 6 * SIZE(CO1) + +#ifndef LN + addl $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L50: + movl M, %ebx + testl $2, %ebx + jle .L70 + +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L52 + ALIGN_4 + +.L51: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 16 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 20 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 48 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 44 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 72 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L51 + ALIGN_4 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA # aoffset += 8 + addl $8 * SIZE, BB # boffset1 += 8 + decl %eax + jg .L53 + +.L54: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm4 +#endif +#else + xorps %xmm0, %xmm5 +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm5 + movhps 2 * SIZE(B), %xmm5 +#else + movaps 0 * SIZE(AA), %xmm5 +#endif + + subps %xmm4, %xmm5 + +#if defined(LN) || defined(LT) + movhlps %xmm5, %xmm4 +#endif + +#ifdef LN +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 6 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm4, %xmm4 + +#ifndef CONJ + xorps POSINV, %xmm4 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + + addps %xmm3, %xmm4 + + movsd 4 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm4, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm5 + subps %xmm3, %xmm5 + + + movsd 0 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps POSINV, %xmm5 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef LT +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps POSINV, %xmm5 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm5 + + addps %xmm3, %xmm5 + + movsd 2 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm5, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm4 + subps %xmm3, %xmm4 + + movsd 6 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm4, %xmm4 + +#ifndef CONJ + xorps POSINV, %xmm4 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + + addps %xmm3, %xmm4 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(B), %xmm1 + movhps 2 * SIZE(B), %xmm1 + + movaps %xmm1, %xmm2 + shufps $0x44, %xmm2, %xmm2 + movaps %xmm1, %xmm3 + shufps $0x11, %xmm2, %xmm3 + + movaps %xmm5, %xmm4 + shufps $0xa0, %xmm4, %xmm4 + shufps $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm2, %xmm4 + mulps %xmm3, %xmm5 + + addps %xmm4, %xmm5 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlhps %xmm4, %xmm5 + + movlps %xmm5, 0 * SIZE(B) + movhps %xmm5, 2 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + pshufd $0xaa, %xmm5, %xmm2 + pshufd $0xff, %xmm5, %xmm3 +#else + movaps %xmm5, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm5, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm5, %xmm2 + shufps $0xaa, %xmm2, %xmm2 + movaps %xmm5, %xmm3 + shufps $0xff, %xmm3, %xmm3 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) +#else + movaps %xmm5, 0 * SIZE(AA) +#endif + + movlps %xmm5, 0 * SIZE(CO1) + movhps %xmm5, 2 * SIZE(CO1) + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L70: + movl M, %ebx + testl $1, %ebx + jle .L99 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + + movaps 0 * SIZE(BB), %xmm2 + + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L72 + ALIGN_4 + +.L71: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 16 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 12 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 20 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 40 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 48 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 44 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 52 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 72 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L71 + ALIGN_2 + +.L72: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + je .L74 + +.L73: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA # aoffset += 8 + addl $8 * SIZE, BB # boffset1 += 8 + decl %eax + jg .L73 + +.L74: + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm4 +#endif +#else + xorps %xmm0, %xmm5 +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef movsd + xorps %xmm5, %xmm5 +#endif +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm5 +#else + movsd 0 * SIZE(AA), %xmm5 +#endif + subps %xmm4, %xmm5 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AA), %xmm1 +#else + movsd 0 * SIZE(B), %xmm1 +#endif + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps POSINV, %xmm5 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm5 + + addps %xmm3, %xmm5 + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm5, 0 * SIZE(B) + + movaps %xmm5, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm5, %xmm1 + shufps $0x55, %xmm1, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) +#else + movlps %xmm5, 0 * SIZE(AA) +#endif + + movlps %xmm5, 0 * SIZE(CO1) + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L99: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S new file mode 100644 index 0000000..1306416 --- /dev/null +++ b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S @@ -0,0 +1,969 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 2) +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + +#define ADD1 addpd +#define ADD2 addpd + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + movl M, %ebx + testl %ebx, %ebx + jle .L999 + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + testl $1, %eax + jle .L100 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + ALIGN_4 + +L110: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 +#ifdef LN + prefetcht0 -2 * SIZE(CO1) +#else + prefetcht0 1 * SIZE(CO1) +#endif + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je L115 + ALIGN_4 + +L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -14 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm6 + movaps -12 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -10 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm6 + movaps -8 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -6 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm6 + movaps -4 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -2 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm6 + movaps 0 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne L112 + ALIGN_4 + +L115: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je L118 + ALIGN_4 + +L116: + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -14 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg L116 + ALIGN_4 + +L118: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), BB +#endif + + addpd %xmm6, %xmm4 + pcmpeqb %xmm1, %xmm1 + addpd %xmm7, %xmm5 + psllq $63, %xmm1 + +#ifndef CONJ + pshufd $0x40, %xmm1, %xmm0 + shufps $0x04, %xmm1, %xmm1 + + pxor %xmm0, %xmm4 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm1, %xmm0 +#else + pshufd $0x04, %xmm1, %xmm0 +#endif + shufps $0x40, %xmm1, %xmm1 + + pxor %xmm0, %xmm5 +#endif + + haddpd %xmm5, %xmm4 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BB), %xmm5 + subpd %xmm4, %xmm5 +#else + movapd -16 * SIZE(AA), %xmm5 + subpd %xmm4, %xmm5 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AA), %xmm2 + movddup -15 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#if defined(RN) || defined(RT) + movddup -16 * SIZE(BB), %xmm2 + movddup -15 * SIZE(BB), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movlpd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm5, -16 * SIZE(BB) +#else + movapd %xmm5, -16 * SIZE(AA) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA + addl %eax, BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg L110 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L100: + movl N, %eax + sarl $1, %eax + movl %eax, J # j = n + jle .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + +#ifdef LN + pxor %xmm4, %xmm4 + prefetcht0 -2 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 -2 * SIZE(CO1, LDC) +#else + pxor %xmm4, %xmm4 + prefetcht0 1 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 1 * SIZE(CO1, LDC) +#endif + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + ADD1 %xmm3, %xmm6 + movaps -14 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps -10 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps -6 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps -2 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 0 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + ADD1 %xmm3, %xmm6 + movaps 2 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 4 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps 6 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 8 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps 10 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 12 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps 14 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 16 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + subl $-32 * SIZE, BB + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + + subl $1, %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + ADD1 %xmm3, %xmm6 + movaps -14 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + + movaps -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + + ADD1 %xmm3, %xmm6 + pcmpeqb %xmm1, %xmm1 + ADD2 %xmm2, %xmm7 + psllq $63, %xmm1 + +#ifndef CONJ + pshufd $0x40, %xmm1, %xmm0 + shufps $0x04, %xmm1, %xmm1 + + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm1, %xmm0 +#else + pshufd $0x04, %xmm1, %xmm0 +#endif + shufps $0x40, %xmm1, %xmm1 + + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#endif + + haddpd %xmm5, %xmm4 + haddpd %xmm7, %xmm6 + + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BB), %xmm5 + movapd -14 * SIZE(BB), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd -16 * SIZE(AA), %xmm5 + movapd -14 * SIZE(AA), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AA), %xmm2 + movddup -15 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(BB), %xmm2 + movddup -15 * SIZE(BB), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 + + movddup -14 * SIZE(BB), %xmm2 + movddup -13 * SIZE(BB), %xmm3 + + movapd %xmm5, %xmm4 + pshufd $0x4e, %xmm5, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm7 + subpd %xmm6, %xmm7 + + movddup -10 * SIZE(BB), %xmm2 + movddup -9 * SIZE(BB), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 +#endif + +#ifdef RT + movddup -10 * SIZE(BB), %xmm2 + movddup -9 * SIZE(BB), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 + + movddup -12 * SIZE(BB), %xmm2 + movddup -11 * SIZE(BB), %xmm3 + + movapd %xmm7, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm5 + + movddup -16 * SIZE(BB), %xmm2 + movddup -15 * SIZE(BB), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movlpd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + + movlpd %xmm7, 0 * SIZE(CO1, LDC) + movhpd %xmm7, 1 * SIZE(CO1, LDC) + +#if defined(LN) || defined(LT) + movapd %xmm5, -16 * SIZE(BB) + movapd %xmm7, -14 * SIZE(BB) +#else + movapd %xmm5, -16 * SIZE(AA) + movapd %xmm7, -14 * SIZE(AA) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_sse2.S b/kernel/x86/ztrsm_kernel_RT_1x2_sse2.S new file mode 100644 index 0000000..8824868 --- /dev/null +++ b/kernel/x86/ztrsm_kernel_RT_1x2_sse2.S @@ -0,0 +1,1325 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 24 + STACK + ARGS(%esi) +#define STACK_A 32 + STACK + ARGS(%esi) +#define STACK_B 36 + STACK + ARGS(%esi) +#define STACK_C 40 + STACK + ARGS(%esi) +#define STACK_LDC 44 + STACK + ARGS(%esi) +#define STACK_OFFT 48 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#else +#define PREFETCH prefetcht0 +#endif + +#define PREFETCHSIZE (8 * 10 + 4) + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + +#ifndef CONJ +#define NN +#else +#if defined(LN) || defined(LT) +#define CN +#else +#define NC +#endif +#endif + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movl STACK_M, %ebx + movl STACK_N, %eax + movl STACK_K, %ecx + movl STACK_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl STACK_B, B + movl STACK_C, %ebx + movss STACK_OFFT, %xmm4 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 # Generate mask + pxor %xmm2, %xmm2 + + movlpd %xmm2, 0 + POSINV + movlpd %xmm7, 8 + POSINV + + movl %ebx, C + movl STACK_LDC, LDC + + movss %xmm4, OFFSET + movss %xmm4, KK + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + andl $1, %eax + jle .L100 + ALIGN_4 + +.L101: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 2), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L103 + ALIGN_4 + +.L102: + prefetchnta 56 * SIZE(B) + + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + movlpd 4 * SIZE(B), %xmm4 + movlpd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm6 + movlpd 7 * SIZE(B), %xmm7 + + movlpd %xmm0, 0 * SIZE(BB) + movlpd %xmm0, 1 * SIZE(BB) + movlpd %xmm1, 2 * SIZE(BB) + movlpd %xmm1, 3 * SIZE(BB) + movlpd %xmm2, 4 * SIZE(BB) + movlpd %xmm2, 5 * SIZE(BB) + movlpd %xmm3, 6 * SIZE(BB) + movlpd %xmm3, 7 * SIZE(BB) + movlpd %xmm4, 8 * SIZE(BB) + movlpd %xmm4, 9 * SIZE(BB) + movlpd %xmm5, 10 * SIZE(BB) + movlpd %xmm5, 11 * SIZE(BB) + movlpd %xmm6, 12 * SIZE(BB) + movlpd %xmm6, 13 * SIZE(BB) + movlpd %xmm7, 14 * SIZE(BB) + movlpd %xmm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + subl $-16 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L103: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + + movlpd %xmm0, 0 * SIZE(BB) + movlpd %xmm0, 1 * SIZE(BB) + movlpd %xmm1, 2 * SIZE(BB) + movlpd %xmm1, 3 * SIZE(BB) + + addl $2 * SIZE, B + addl $4 * SIZE, BB + decl %eax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + + movl C, CO1 + +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + testl %ebx, %ebx + jle .L199 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + +#ifdef LN + prefetchw -2 * SIZE(CO1) +#else + prefetchw 2 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + ALIGN_4 + +.L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd 18 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movapd 10 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movapd 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 26 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 14 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + ALIGN_4 + +.L113: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L113 + ALIGN_4 + +.L114: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + addl %eax, AA + addl %eax, B + leal (BB, %eax, 2), BB +#endif + + movapd POSINV, %xmm1 + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + SHUFPD_1 %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm1, %xmm5 +#else + xorpd %xmm1, %xmm4 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 +#else + addpd %xmm5, %xmm4 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm5 + subpd %xmm4, %xmm5 +#else + movapd 0 * SIZE(AA), %xmm5 + subpd %xmm4, %xmm5 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movlpd 0 * SIZE(AA), %xmm2 + movhpd 0 * SIZE(AA), %xmm2 + movlpd 1 * SIZE(AA), %xmm3 + movhpd 1 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movlpd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movlpd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movlpd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm5, 0 * SIZE(B) + + movlpd %xmm5, 0 * SIZE(BB) + movlpd %xmm5, 1 * SIZE(BB) + movhpd %xmm5, 2 * SIZE(BB) + movhpd %xmm5, 3 * SIZE(BB) +#else + movapd %xmm5, 0 * SIZE(AA) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L110 + ALIGN_4 + +.L199: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L100: + movl N, %eax + sarl $1, %eax + movl %eax, J # j = n + jle .L500 + ALIGN_4 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 2), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1, %eax + jle .L03 + ALIGN_4 + +.L02: + prefetchnta 56 * SIZE(B) + + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + movlpd 4 * SIZE(B), %xmm4 + movlpd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm6 + movlpd 7 * SIZE(B), %xmm7 + + movlpd %xmm0, 0 * SIZE(BB) + movlpd %xmm0, 1 * SIZE(BB) + movlpd %xmm1, 2 * SIZE(BB) + movlpd %xmm1, 3 * SIZE(BB) + movlpd %xmm2, 4 * SIZE(BB) + movlpd %xmm2, 5 * SIZE(BB) + movlpd %xmm3, 6 * SIZE(BB) + movlpd %xmm3, 7 * SIZE(BB) + movlpd %xmm4, 8 * SIZE(BB) + movlpd %xmm4, 9 * SIZE(BB) + movlpd %xmm5, 10 * SIZE(BB) + movlpd %xmm5, 11 * SIZE(BB) + movlpd %xmm6, 12 * SIZE(BB) + movlpd %xmm6, 13 * SIZE(BB) + movlpd %xmm7, 14 * SIZE(BB) + movlpd %xmm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + subl $-16 * SIZE, BB + + decl %eax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1, %eax + BRANCH + jle .L05 + + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + + movlpd %xmm0, 0 * SIZE(BB) + movlpd %xmm0, 1 * SIZE(BB) + movlpd %xmm1, 2 * SIZE(BB) + movlpd %xmm1, 3 * SIZE(BB) + movlpd %xmm2, 4 * SIZE(BB) + movlpd %xmm2, 5 * SIZE(BB) + movlpd %xmm3, 6 * SIZE(BB) + movlpd %xmm3, 7 * SIZE(BB) + + addl $4 * SIZE, B + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + leal (, LDC, 2), %eax + subl %eax, C +#endif + + movl C, CO1 + +#ifndef RT + leal (, LDC, 2), %eax + addl %eax, C +#endif + + movl M, %ebx + testl %ebx, %ebx + jle .L100 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchw -2 * SIZE(CO1) + prefetchw -2 * SIZE(CO1, LDC) +#else + prefetchw 2 * SIZE(CO1) + prefetchw 2 * SIZE(CO1, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + +#if 1 + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) + + addl $128 * 4 * SIZE, BB + addl $128 * 1 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB + ALIGN_4 +#else + + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addl $64 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L11 + ALIGN_4 +#endif + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 8 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + addl %eax, AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + + movapd POSINV, %xmm1 + + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm1, %xmm5 + xorpd %xmm1, %xmm7 +#else + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 +#else + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm5 + movapd 2 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm5 + movapd 2 * SIZE(AA), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movlpd 0 * SIZE(AA), %xmm2 + movhpd 0 * SIZE(AA), %xmm2 + movlpd 1 * SIZE(AA), %xmm3 + movhpd 1 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movlpd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 + + movlpd 2 * SIZE(B), %xmm2 + movhpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + movhpd 3 * SIZE(B), %xmm3 + + movapd %xmm5, %xmm4 + pshufd $0x4e, %xmm5, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm7 + subpd %xmm6, %xmm7 + + movlpd 6 * SIZE(B), %xmm2 + movhpd 6 * SIZE(B), %xmm2 + movlpd 7 * SIZE(B), %xmm3 + movhpd 7 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 +#endif + +#ifdef RT + movlpd 6 * SIZE(B), %xmm2 + movhpd 6 * SIZE(B), %xmm2 + movlpd 7 * SIZE(B), %xmm3 + movhpd 7 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 + + movlpd 4 * SIZE(B), %xmm2 + movhpd 4 * SIZE(B), %xmm2 + movlpd 5 * SIZE(B), %xmm3 + movhpd 5 * SIZE(B), %xmm3 + + movapd %xmm7, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm5 + + movlpd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movlpd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movlpd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + + movlpd %xmm7, 0 * SIZE(CO1, LDC) + movhpd %xmm7, 1 * SIZE(CO1, LDC) + +#if defined(LN) || defined(LT) + movapd %xmm5, 0 * SIZE(B) + movapd %xmm7, 2 * SIZE(B) + + movlpd %xmm5, 0 * SIZE(BB) + movlpd %xmm5, 1 * SIZE(BB) + movhpd %xmm5, 2 * SIZE(BB) + movhpd %xmm5, 3 * SIZE(BB) + movlpd %xmm7, 4 * SIZE(BB) + movlpd %xmm7, 5 * SIZE(BB) + movhpd %xmm7, 6 * SIZE(BB) + movhpd %xmm7, 7 * SIZE(BB) +#else + movapd %xmm5, 0 * SIZE(AA) + movapd %xmm7, 2 * SIZE(AA) + +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + + +.L500: + movl OLD_STACK, %esp + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_sse3.S b/kernel/x86/ztrsm_kernel_RT_1x2_sse3.S new file mode 100644 index 0000000..8b7bf6b --- /dev/null +++ b/kernel/x86/ztrsm_kernel_RT_1x2_sse3.S @@ -0,0 +1,965 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#ifdef PENTIUM4 +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef PENTIUMM +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + +#define ADDSUB addpd + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addpd %xmm2, %xmm4; \ + movddup 1 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movddup 2 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 3 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + ADDSUB %xmm2, %xmm7; \ + movddup 4 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 5 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movddup 6 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 7 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + ADDSUB %xmm2, %xmm7; \ + movddup 16 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL3(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 9 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movddup 10 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 11 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + ADDSUB %xmm3, %xmm7; \ + movddup 12 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL4(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 13 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movddup 14 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 15 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + ADDSUB %xmm3, %xmm7; \ + movddup 24 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL5(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 17 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movddup 18 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 19 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + ADDSUB %xmm2, %xmm7; \ + movddup 20 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL6(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 21 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movddup 22 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 23 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + ADDSUB %xmm2, %xmm7 + +#define KERNEL7(address) \ + movddup 32 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 25 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movddup 26 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 27 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + ADDSUB %xmm3, %xmm7; \ + movddup 28 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 29 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movddup 30 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 31 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + ADDSUB %xmm3, %xmm7; \ + movddup 40 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + testl $1, %eax + jle .L100 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + testl %ebx, %ebx + jle .L500 + ALIGN_4 + +L110: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchnta -2 * SIZE(CO1) +#else + prefetchnta 2 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je L112 + ALIGN_4 + +L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 4 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 5 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 6 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm5 + movddup 6 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 7 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 16 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm7 + movddup 16 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 9 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm5 + movddup 10 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 11 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 12 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 13 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 14 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm5 + movddup 14 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 15 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 24 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm7 + movddup 24 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne L111 + ALIGN_4 + +L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je L114 + ALIGN_4 + +L113: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg L113 + ALIGN_4 + +L114: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), BB +#endif + + pcmpeqb %xmm1, %xmm1 + psllq $63, %xmm1 + + shufps $0x40, %xmm1, %xmm1 + + SHUFPD_1 %xmm5, %xmm5 + +#ifndef CONJ + xorpd %xmm1, %xmm5 + + subpd %xmm5, %xmm4 +#else +#if defined(LN) || defined(LT) + xorpd %xmm1, %xmm4 +#else + xorpd %xmm1, %xmm5 +#endif + addpd %xmm5, %xmm4 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BB), %xmm5 + subpd %xmm4, %xmm5 +#else + movapd 0 * SIZE(AA), %xmm5 + subpd %xmm4, %xmm5 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movddup 0 * SIZE(AA), %xmm2 + movddup 1 * SIZE(AA), %xmm3 + + movapd %xmm5, %xmm4 + SHUFPD_1 %xmm4, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#if defined(RN) || defined(RT) + movddup 0 * SIZE(BB), %xmm2 + movddup 1 * SIZE(BB), %xmm3 + + movapd %xmm5, %xmm4 + SHUFPD_1 %xmm4, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movlpd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm5, 0 * SIZE(BB) +#else + movapd %xmm5, 0 * SIZE(AA) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA + addl %eax, BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg L110 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L100: + movl N, %eax + sarl $1, %eax + movl %eax, J # j = n + jle .L500 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + testl %ebx, %ebx + jle .L500 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetcht0 -2 * SIZE(CO1) + prefetcht0 -2 * SIZE(CO1, LDC, 1) +#else + prefetchnta 2 * SIZE(CO1) + prefetchnta 2 * SIZE(CO1, LDC, 1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L12 + ALIGN_4 + +.L11: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addl $32 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L11 + ALIGN_4 + +.L12: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + ADDSUB %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + + pcmpeqb %xmm1, %xmm1 + psllq $63, %xmm1 + + shufps $0x40, %xmm1, %xmm1 + + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#ifndef CONJ + xorpd %xmm1, %xmm5 + xorpd %xmm1, %xmm7 + + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 +#else +#if defined(LN) || defined(LT) + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 +#else + xorpd %xmm1, %xmm5 + xorpd %xmm1, %xmm7 +#endif + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BB), %xmm5 + movapd 2 * SIZE(BB), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm5 + movapd 2 * SIZE(AA), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movddup 0 * SIZE(AA), %xmm2 + movddup 1 * SIZE(AA), %xmm3 + + movapd %xmm5, %xmm4 + movapd %xmm7, %xmm6 + + SHUFPD_1 %xmm4, %xmm4 + SHUFPD_1 %xmm6, %xmm6 + + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef RN + movddup 0 * SIZE(BB), %xmm2 + movddup 1 * SIZE(BB), %xmm3 + + movapd %xmm5, %xmm4 + SHUFPD_1 %xmm4, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 + + movddup 2 * SIZE(BB), %xmm2 + movddup 3 * SIZE(BB), %xmm3 + + movapd %xmm5, %xmm4 + movapd %xmm5, %xmm6 + SHUFPD_1 %xmm6, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm7 + subpd %xmm6, %xmm7 + + movddup 6 * SIZE(BB), %xmm2 + movddup 7 * SIZE(BB), %xmm3 + + movapd %xmm7, %xmm6 + SHUFPD_1 %xmm6, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 +#endif + +#ifdef RT + movddup 6 * SIZE(BB), %xmm2 + movddup 7 * SIZE(BB), %xmm3 + + movapd %xmm7, %xmm6 + SHUFPD_1 %xmm6, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 + + movddup 4 * SIZE(BB), %xmm2 + movddup 5 * SIZE(BB), %xmm3 + + movapd %xmm7, %xmm4 + movapd %xmm7, %xmm6 + SHUFPD_1 %xmm6, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm5 + + movddup 0 * SIZE(BB), %xmm2 + movddup 1 * SIZE(BB), %xmm3 + + movapd %xmm5, %xmm4 + SHUFPD_1 %xmm4, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movlpd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + + movlpd %xmm7, 0 * SIZE(CO1, LDC) + movhpd %xmm7, 1 * SIZE(CO1, LDC) + +#if defined(LN) || defined(LT) + movapd %xmm5, 0 * SIZE(BB) + movapd %xmm7, 2 * SIZE(BB) +#else + movapd %xmm5, 0 * SIZE(AA) + movapd %xmm7, 2 * SIZE(AA) + +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L500: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S new file mode 100644 index 0000000..ebff425 --- /dev/null +++ b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S @@ -0,0 +1,1966 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef ATOM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 84 +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (16 * 2) +#endif + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx +#define CO1 %esi + +#define ADD1 addps +#define ADD2 addps + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + movl M, %ebx + testl %ebx, %ebx + jle .L999 + + subl $-32 * SIZE, A + subl $-32 * SIZE, B + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + andl $1, %eax + jle .L100 + +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + movhps -30 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 +#ifdef LN + prefetcht0 -4 * SIZE(CO1) +#else + prefetcht0 3 * SIZE(CO1) +#endif + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + ALIGN_4 + +.L111: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps 0 * SIZE(AA), %xmm0 + + subl $-32 * SIZE, AA + subl $-16 * SIZE, BB + + decl %eax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + ALIGN_4 + +.L113: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L113 + ALIGN_4 + +.L114: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + + addps %xmm2, %xmm4 + addps %xmm3, %xmm5 + + pshufd $0xb1, %xmm5, %xmm5 + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#ifndef CONJ + shufps $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 +#else +#if defined(LN) || defined(LT) + pxor %xmm0, %xmm4 +#else + pxor %xmm0, %xmm5 +#endif +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + + movsd -32 * SIZE(BB), %xmm2 + movsd -30 * SIZE(BB), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm5, %xmm3 +#else + movaps -32 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm1 +#endif + +#ifdef LN + movaps -28 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm2 + subps %xmm1, %xmm2 + + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm3 + subps %xmm1, %xmm3 + + movaps -28 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, -32 * SIZE(BB) + movlps %xmm3, -30 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) +#else + movaps %xmm1, -32 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L110 + ALIGN_4 + +.L130: + movl M, %ebx + andl $1, %ebx + jle .L149 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L142 + ALIGN_4 + +.L141: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -26 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -26 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -22 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -22 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -18 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -18 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -16 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + decl %eax + jne .L141 + ALIGN_4 + +.L142: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L144 + ALIGN_4 + +.L143: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L143 + ALIGN_4 + +.L144: +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), BB +#endif + + addps %xmm2, %xmm4 + addps %xmm3, %xmm5 + + pshufd $0xb1, %xmm5, %xmm5 + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#ifndef CONJ + shufps $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 +#else +#if defined(LN) || defined(LT) + pxor %xmm0, %xmm4 +#else + pxor %xmm0, %xmm5 +#endif +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BB), %xmm2 + + subps %xmm4, %xmm2 +#else + movsd -32 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, -32 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) +#else + movlps %xmm1, -32 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L149: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L100: + movl N, %eax + movl %eax, J + sarl $1, J + jle .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx + jle .L30 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + +#ifdef LN + pxor %xmm4, %xmm4 + prefetcht0 -4 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 -4 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 +#else + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 3 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L11: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + subl $-32 * SIZE, BB + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + subl $-32 * SIZE, AA + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -32 * SIZE(AA), %xmm0 + + decl %eax + jne .L11 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + + ADD2 %xmm2, %xmm7 + pcmpeqb %xmm0, %xmm0 + ADD1 %xmm3, %xmm6 + psllq $63, %xmm0 + +#ifndef CONJ + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 + + shufps $0xb1, %xmm0, %xmm0 +#else +#if defined(LN) || defined(LT) + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#else + pshufd $0xb1, %xmm0, %xmm1 + + pxor %xmm1, %xmm5 + pxor %xmm1, %xmm7 +#endif +#endif + + haddps %xmm5, %xmm4 + haddps %xmm7, %xmm6 + + shufps $0xd8, %xmm4, %xmm4 + shufps $0xd8, %xmm6, %xmm6 + + movaps %xmm4, %xmm5 + shufps $0xe4, %xmm6, %xmm4 + shufps $0xe4, %xmm5, %xmm6 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + + movaps -32 * SIZE(BB), %xmm2 + movaps -28 * SIZE(BB), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm5, %xmm3 +#else + movaps -32 * SIZE(AA), %xmm1 + movaps -28 * SIZE(AA), %xmm5 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm5 +#endif + +#ifdef LN + movaps -28 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm2 + subps %xmm1, %xmm2 + + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm3 + subps %xmm1, %xmm3 + + movaps -28 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm5 + subps %xmm2, %xmm5 + + movaps -28 * SIZE(BB), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef RT + movaps -28 * SIZE(BB), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm1 + subps %xmm2, %xmm1 + + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, -32 * SIZE(BB) + movaps %xmm3, -28 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm3, 2 * SIZE(CO1, LDC) +#else + movaps %xmm1, -32 * SIZE(AA) + movaps %xmm5, -28 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + + movlps %xmm5, 0 * SIZE(CO1, LDC) + movhps %xmm5, 2 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx + jg .L10 + ALIGN_4 + +.L30: + movl M, %ebx + andl $1, %ebx + jle .L99 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L42 + ALIGN_4 + +.L41: + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -26 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -22 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -18 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps 0 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -16 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + subl $-32 * SIZE, BB + decl %eax + jne .L41 + ALIGN_4 + +.L42: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L44 + ALIGN_4 + +.L43: + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L43 + ALIGN_4 + +.L44: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + + addps %xmm2, %xmm6 + addps %xmm3, %xmm7 + + pshufd $0xb1, %xmm5, %xmm5 + pcmpeqb %xmm0, %xmm0 + pshufd $0xb1, %xmm7, %xmm7 + psllq $63, %xmm0 + +#ifndef CONJ + shufps $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#else +#if defined(LN) || defined(LT) + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 +#else + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#endif +#endif + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(LT) + unpcklpd %xmm6, %xmm4 + + movaps -32 * SIZE(BB), %xmm2 + + subps %xmm4, %xmm2 +#else + movsd -32 * SIZE(AA), %xmm1 + movsd -30 * SIZE(AA), %xmm5 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm5 +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm5 + subps %xmm2, %xmm5 + + movaps -28 * SIZE(BB), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef RT + movaps -28 * SIZE(BB), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm1 + subps %xmm2, %xmm1 + + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, -32 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO1, LDC) +#else + movlps %xmm1, -32 * SIZE(AA) + movlps %xmm5, -30 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm5, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_sse.S b/kernel/x86/ztrsm_kernel_RT_2x2_sse.S new file mode 100644 index 0000000..bce0b02 --- /dev/null +++ b/kernel/x86/ztrsm_kernel_RT_2x2_sse.S @@ -0,0 +1,2202 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 48(%esp) +#define KK 52(%esp) +#define KKK 56(%esp) +#define AORIG 60(%esp) +#define BORIG 64(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx +#define CO1 %esi + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCHSIZE (16 * 10 + 8) +#define WPREFETCHSIZE 112 +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#endif + +#if defined(PENTIUM4) || defined(PENTIUMM) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 168 +#define PREFETCHW prefetcht0 +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 168 +#define PREFETCHW prefetcht0 +#endif + +#if defined(OPTERON) || !defined(HAVE_SSE2) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addps %xmm2, %xmm4; \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movl STACK_M, %ebx + movl STACK_N, %eax + movl STACK_K, %ecx + movl STACK_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl STACK_B, %edi + movl STACK_C, %ebx + movss STACK_OFFT, %xmm4 + + xorps %xmm7, %xmm7 + pcmpeqb %xmm7, %xmm7 + pslld $31, %xmm7 + xorps %xmm2, %xmm2 + +#ifndef CONJ + movss %xmm7, 0 + POSINV + movss %xmm2, 4 + POSINV + movss %xmm7, 8 + POSINV + movss %xmm2, 12 + POSINV +#else + movss %xmm2, 0 + POSINV + movss %xmm7, 4 + POSINV + movss %xmm2, 8 + POSINV + movss %xmm7, 12 + POSINV +#endif + + EMMS + + movl %ebx, C + movl STACK_LDC, LDC + + movss %xmm4, OFFSET + movss %xmm4, KK + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + andl $1, %eax + jle .L100 + ALIGN_4 + +.L101: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 4), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L103 + ALIGN_4 + +.L102: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L103: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L105 + ALIGN_4 + +.L104: +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + addl $ 2 * SIZE, %edi + addl $ 8 * SIZE, %ecx + decl %eax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + + movl C, CO1 + +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $1, %ebx + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB # boffset1 = boffset + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + PREFETCHW 3 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + ALIGN_4 + +.L111: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movaps 12 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 28 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 48 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $ 32 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + ALIGN_4 + +.L113: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jg .L113 + ALIGN_4 + +.L114: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm4 +#endif +#else + xorps %xmm0, %xmm5 +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm5, %xmm3 +#else + movaps 0 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm1 +#endif + +#ifdef LN + movaps 4 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 8 * SIZE(BB) + movaps %xmm1, 12 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) +#else + movaps %xmm1, 0 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L110 + ALIGN_4 + +.L130: + movl M, %ebx + andl $1, %ebx + jle .L149 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB # boffset1 = boffset + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L142 + ALIGN_4 + +.L141: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $ 16 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L141 + ALIGN_4 + +.L142: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L144 + ALIGN_4 + +.L143: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movsd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L143 + ALIGN_4 + +.L144: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm4 +#endif +#else + xorps %xmm0, %xmm5 +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + addl %eax, AA + addl %eax, B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 +#else +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) +#else + movlps %xmm1, 0 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L149: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L100: + movl N, %eax + movl %eax, J + sarl $1, J + jle .L999 + ALIGN_4 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 4), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1, %eax + jle .L03 + ALIGN_4 + +.L02: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + + decl %eax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1, %eax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + addl $ 4 * SIZE, B + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + + movl C, CO1 + +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $1, %ebx + jle .L30 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB # boffset1 = boffset + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $3 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + + PREFETCHW 3 * SIZE(CO1) + PREFETCHW 3 * SIZE(CO1, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L11: + KERNEL1(0 * 16) + KERNEL2(0 * 16) + KERNEL3(0 * 16) + KERNEL4(0 * 16) + KERNEL5(0 * 16) + KERNEL6(0 * 16) + KERNEL7(0 * 16) + KERNEL8(0 * 16) + + addl $ 32 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L11 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#else + xorps %xmm0, %xmm4 + xorps %xmm0, %xmm6 +#endif +#else + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#endif + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + + movaps 0 * SIZE(B), %xmm2 + movaps 4 * SIZE(B), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm5, %xmm3 +#else + movaps 0 * SIZE(AA), %xmm1 + movaps 4 * SIZE(AA), %xmm5 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm5 +#endif + +#ifdef LN + movaps 4 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm5 + subps %xmm2, %xmm5 + + movaps 4 * SIZE(B), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm1 + subps %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm5, 12 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm5 + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm5, 28 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm3, 2 * SIZE(CO1, LDC) +#else + movaps %xmm1, 0 * SIZE(AA) + movaps %xmm5, 4 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + + movlps %xmm5, 0 * SIZE(CO1, LDC) + movhps %xmm5, 2 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx + jg .L10 + ALIGN_4 + +.L30: + movl M, %ebx + andl $1, %ebx + jle .L99 + ALIGN_4 + +.L40: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB # boffset1 = boffset + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $3 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L42 + ALIGN_4 + +.L41: + mulps %xmm0, %xmm2 + prefetcht1 (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 2 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movsd 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 44 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 6 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + mulps 60 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movsd 16 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + addps %xmm2, %xmm4 + movaps 68 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 72 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + mulps 76 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 96 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movsd 10 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 84 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 88 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + mulps 92 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 112 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movsd 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 100 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 104 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + mulps 108 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 128 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movsd 14 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 116 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 120 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + mulps 124 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 144 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movsd 24 * SIZE(AA), %xmm1 + addl $ 16 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L41 + ALIGN_4 + +.L42: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L44 + ALIGN_4 + +.L43: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 2 * SIZE(AA), %xmm0 + + addl $ 2 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L43 + ALIGN_4 + +.L44: + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#else + xorps %xmm0, %xmm4 + xorps %xmm0, %xmm6 +#endif +#else + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#endif + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm6, %xmm4 + + movaps 0 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 +#else +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AA), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(AA), %xmm5 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm5 +#endif + +#if defined(LN) || defined(LT) + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm5 + subps %xmm2, %xmm5 + + movaps 4 * SIZE(B), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm1 + subps %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm5, 12 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO1, LDC) +#else + movlps %xmm1, 0 * SIZE(AA) + movlps %xmm5, 2 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm5, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + + +.L999: + EMMS + + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86_64/._KERNEL b/kernel/x86_64/._KERNEL new file mode 100644 index 0000000..9abfa3d Binary files /dev/null and b/kernel/x86_64/._KERNEL differ diff --git a/kernel/x86_64/._KERNEL.ATOM b/kernel/x86_64/._KERNEL.ATOM new file mode 100644 index 0000000..9aff5be Binary files /dev/null and b/kernel/x86_64/._KERNEL.ATOM differ diff --git a/kernel/x86_64/._KERNEL.BARCELONA b/kernel/x86_64/._KERNEL.BARCELONA new file mode 100644 index 0000000..2da9c1b Binary files /dev/null and b/kernel/x86_64/._KERNEL.BARCELONA differ diff --git a/kernel/x86_64/._KERNEL.CORE2 b/kernel/x86_64/._KERNEL.CORE2 new file mode 100644 index 0000000..cab6696 Binary files /dev/null and b/kernel/x86_64/._KERNEL.CORE2 differ diff --git a/kernel/x86_64/._KERNEL.DUNNINGTON b/kernel/x86_64/._KERNEL.DUNNINGTON new file mode 100644 index 0000000..a85c8a6 Binary files /dev/null and b/kernel/x86_64/._KERNEL.DUNNINGTON differ diff --git a/kernel/x86_64/._KERNEL.NANO b/kernel/x86_64/._KERNEL.NANO new file mode 100644 index 0000000..6a90e29 Binary files /dev/null and b/kernel/x86_64/._KERNEL.NANO differ diff --git a/kernel/x86_64/._KERNEL.NEHALEM b/kernel/x86_64/._KERNEL.NEHALEM new file mode 100644 index 0000000..b988cad Binary files /dev/null and b/kernel/x86_64/._KERNEL.NEHALEM differ diff --git a/kernel/x86_64/._KERNEL.OPTERON b/kernel/x86_64/._KERNEL.OPTERON new file mode 100644 index 0000000..2d5f3b2 Binary files /dev/null and b/kernel/x86_64/._KERNEL.OPTERON differ diff --git a/kernel/x86_64/._KERNEL.OPTERON_SSE3 b/kernel/x86_64/._KERNEL.OPTERON_SSE3 new file mode 100644 index 0000000..a79e1b2 Binary files /dev/null and b/kernel/x86_64/._KERNEL.OPTERON_SSE3 differ diff --git a/kernel/x86_64/._KERNEL.PENRYN b/kernel/x86_64/._KERNEL.PENRYN new file mode 100644 index 0000000..6dc39fd Binary files /dev/null and b/kernel/x86_64/._KERNEL.PENRYN differ diff --git a/kernel/x86_64/._KERNEL.PRESCOTT b/kernel/x86_64/._KERNEL.PRESCOTT new file mode 100644 index 0000000..659c001 Binary files /dev/null and b/kernel/x86_64/._KERNEL.PRESCOTT differ diff --git a/kernel/x86_64/._Makefile b/kernel/x86_64/._Makefile new file mode 100644 index 0000000..bb080dd Binary files /dev/null and b/kernel/x86_64/._Makefile differ diff --git a/kernel/x86_64/._amax.S b/kernel/x86_64/._amax.S new file mode 100644 index 0000000..17cc97b Binary files /dev/null and b/kernel/x86_64/._amax.S differ diff --git a/kernel/x86_64/._amax_atom.S b/kernel/x86_64/._amax_atom.S new file mode 100644 index 0000000..c4020ab Binary files /dev/null and b/kernel/x86_64/._amax_atom.S differ diff --git a/kernel/x86_64/._amax_sse.S b/kernel/x86_64/._amax_sse.S new file mode 100644 index 0000000..0d9a2b8 Binary files /dev/null and b/kernel/x86_64/._amax_sse.S differ diff --git a/kernel/x86_64/._amax_sse2.S b/kernel/x86_64/._amax_sse2.S new file mode 100644 index 0000000..5cd6c4e Binary files /dev/null and b/kernel/x86_64/._amax_sse2.S differ diff --git a/kernel/x86_64/._asum.S b/kernel/x86_64/._asum.S new file mode 100644 index 0000000..3c6d31d Binary files /dev/null and b/kernel/x86_64/._asum.S differ diff --git a/kernel/x86_64/._asum_atom.S b/kernel/x86_64/._asum_atom.S new file mode 100644 index 0000000..64b4d61 Binary files /dev/null and b/kernel/x86_64/._asum_atom.S differ diff --git a/kernel/x86_64/._asum_sse.S b/kernel/x86_64/._asum_sse.S new file mode 100644 index 0000000..63417dc Binary files /dev/null and b/kernel/x86_64/._asum_sse.S differ diff --git a/kernel/x86_64/._asum_sse2.S b/kernel/x86_64/._asum_sse2.S new file mode 100644 index 0000000..dd122cf Binary files /dev/null and b/kernel/x86_64/._asum_sse2.S differ diff --git a/kernel/x86_64/._axpy.S b/kernel/x86_64/._axpy.S new file mode 100644 index 0000000..bff7926 Binary files /dev/null and b/kernel/x86_64/._axpy.S differ diff --git a/kernel/x86_64/._axpy_atom.S b/kernel/x86_64/._axpy_atom.S new file mode 100644 index 0000000..c9a3f70 Binary files /dev/null and b/kernel/x86_64/._axpy_atom.S differ diff --git a/kernel/x86_64/._axpy_sse.S b/kernel/x86_64/._axpy_sse.S new file mode 100644 index 0000000..e12c045 Binary files /dev/null and b/kernel/x86_64/._axpy_sse.S differ diff --git a/kernel/x86_64/._axpy_sse2.S b/kernel/x86_64/._axpy_sse2.S new file mode 100644 index 0000000..380ba45 Binary files /dev/null and b/kernel/x86_64/._axpy_sse2.S differ diff --git a/kernel/x86_64/._builtin_stinit.S b/kernel/x86_64/._builtin_stinit.S new file mode 100644 index 0000000..3dfd3f5 Binary files /dev/null and b/kernel/x86_64/._builtin_stinit.S differ diff --git a/kernel/x86_64/._cabs.S b/kernel/x86_64/._cabs.S new file mode 100644 index 0000000..5011a14 Binary files /dev/null and b/kernel/x86_64/._cabs.S differ diff --git a/kernel/x86_64/._cgemv_n.S b/kernel/x86_64/._cgemv_n.S new file mode 100644 index 0000000..eb129ac Binary files /dev/null and b/kernel/x86_64/._cgemv_n.S differ diff --git a/kernel/x86_64/._cgemv_t.S b/kernel/x86_64/._cgemv_t.S new file mode 100644 index 0000000..ac034a8 Binary files /dev/null and b/kernel/x86_64/._cgemv_t.S differ diff --git a/kernel/x86_64/._copy.S b/kernel/x86_64/._copy.S new file mode 100644 index 0000000..eb6a37e Binary files /dev/null and b/kernel/x86_64/._copy.S differ diff --git a/kernel/x86_64/._copy_sse.S b/kernel/x86_64/._copy_sse.S new file mode 100644 index 0000000..efe5d95 Binary files /dev/null and b/kernel/x86_64/._copy_sse.S differ diff --git a/kernel/x86_64/._copy_sse2.S b/kernel/x86_64/._copy_sse2.S new file mode 100644 index 0000000..58468ec Binary files /dev/null and b/kernel/x86_64/._copy_sse2.S differ diff --git a/kernel/x86_64/._dgemm_ncopy_2.S b/kernel/x86_64/._dgemm_ncopy_2.S new file mode 100644 index 0000000..2a9dc17 Binary files /dev/null and b/kernel/x86_64/._dgemm_ncopy_2.S differ diff --git a/kernel/x86_64/._dgemm_ncopy_4.S b/kernel/x86_64/._dgemm_ncopy_4.S new file mode 100644 index 0000000..85faf35 Binary files /dev/null and b/kernel/x86_64/._dgemm_ncopy_4.S differ diff --git a/kernel/x86_64/._dgemm_ncopy_8.S b/kernel/x86_64/._dgemm_ncopy_8.S new file mode 100644 index 0000000..ad9fd4a Binary files /dev/null and b/kernel/x86_64/._dgemm_ncopy_8.S differ diff --git a/kernel/x86_64/._dgemm_tcopy_2.S b/kernel/x86_64/._dgemm_tcopy_2.S new file mode 100644 index 0000000..b0476f4 Binary files /dev/null and b/kernel/x86_64/._dgemm_tcopy_2.S differ diff --git a/kernel/x86_64/._dgemm_tcopy_4.S b/kernel/x86_64/._dgemm_tcopy_4.S new file mode 100644 index 0000000..5f3f468 Binary files /dev/null and b/kernel/x86_64/._dgemm_tcopy_4.S differ diff --git a/kernel/x86_64/._dgemm_tcopy_8.S b/kernel/x86_64/._dgemm_tcopy_8.S new file mode 100644 index 0000000..84cac5b Binary files /dev/null and b/kernel/x86_64/._dgemm_tcopy_8.S differ diff --git a/kernel/x86_64/._dgemv_n.S b/kernel/x86_64/._dgemv_n.S new file mode 100644 index 0000000..b1f032d Binary files /dev/null and b/kernel/x86_64/._dgemv_n.S differ diff --git a/kernel/x86_64/._dgemv_n_atom.S b/kernel/x86_64/._dgemv_n_atom.S new file mode 100644 index 0000000..820e786 Binary files /dev/null and b/kernel/x86_64/._dgemv_n_atom.S differ diff --git a/kernel/x86_64/._dgemv_t.S b/kernel/x86_64/._dgemv_t.S new file mode 100644 index 0000000..4bddb5a Binary files /dev/null and b/kernel/x86_64/._dgemv_t.S differ diff --git a/kernel/x86_64/._dgemv_t_atom.S b/kernel/x86_64/._dgemv_t_atom.S new file mode 100644 index 0000000..a0aba56 Binary files /dev/null and b/kernel/x86_64/._dgemv_t_atom.S differ diff --git a/kernel/x86_64/._dot.S b/kernel/x86_64/._dot.S new file mode 100644 index 0000000..72bd592 Binary files /dev/null and b/kernel/x86_64/._dot.S differ diff --git a/kernel/x86_64/._dot_atom.S b/kernel/x86_64/._dot_atom.S new file mode 100644 index 0000000..4342cfa Binary files /dev/null and b/kernel/x86_64/._dot_atom.S differ diff --git a/kernel/x86_64/._dot_sse.S b/kernel/x86_64/._dot_sse.S new file mode 100644 index 0000000..7c5cc05 Binary files /dev/null and b/kernel/x86_64/._dot_sse.S differ diff --git a/kernel/x86_64/._dot_sse2.S b/kernel/x86_64/._dot_sse2.S new file mode 100644 index 0000000..48cc8ec Binary files /dev/null and b/kernel/x86_64/._dot_sse2.S differ diff --git a/kernel/x86_64/._gemm_beta.S b/kernel/x86_64/._gemm_beta.S new file mode 100644 index 0000000..8c991ae Binary files /dev/null and b/kernel/x86_64/._gemm_beta.S differ diff --git a/kernel/x86_64/._gemm_kernel_2x8_nehalem.S b/kernel/x86_64/._gemm_kernel_2x8_nehalem.S new file mode 100644 index 0000000..8f2a254 Binary files /dev/null and b/kernel/x86_64/._gemm_kernel_2x8_nehalem.S differ diff --git a/kernel/x86_64/._gemm_kernel_4x2_atom.S b/kernel/x86_64/._gemm_kernel_4x2_atom.S new file mode 100644 index 0000000..3d5c1a9 Binary files /dev/null and b/kernel/x86_64/._gemm_kernel_4x2_atom.S differ diff --git a/kernel/x86_64/._gemm_kernel_4x4_barcelona.S b/kernel/x86_64/._gemm_kernel_4x4_barcelona.S new file mode 100644 index 0000000..516e364 Binary files /dev/null and b/kernel/x86_64/._gemm_kernel_4x4_barcelona.S differ diff --git a/kernel/x86_64/._gemm_kernel_4x4_core2.S b/kernel/x86_64/._gemm_kernel_4x4_core2.S new file mode 100644 index 0000000..c7a2ba7 Binary files /dev/null and b/kernel/x86_64/._gemm_kernel_4x4_core2.S differ diff --git a/kernel/x86_64/._gemm_kernel_4x4_penryn.S b/kernel/x86_64/._gemm_kernel_4x4_penryn.S new file mode 100644 index 0000000..3e64cd0 Binary files /dev/null and b/kernel/x86_64/._gemm_kernel_4x4_penryn.S differ diff --git a/kernel/x86_64/._gemm_kernel_4x4_sse2.S b/kernel/x86_64/._gemm_kernel_4x4_sse2.S new file mode 100644 index 0000000..2d3b7ae Binary files /dev/null and b/kernel/x86_64/._gemm_kernel_4x4_sse2.S differ diff --git a/kernel/x86_64/._gemm_kernel_4x4_sse3.S b/kernel/x86_64/._gemm_kernel_4x4_sse3.S new file mode 100644 index 0000000..6b03bb3 Binary files /dev/null and b/kernel/x86_64/._gemm_kernel_4x4_sse3.S differ diff --git a/kernel/x86_64/._gemm_kernel_4x8_nano.S b/kernel/x86_64/._gemm_kernel_4x8_nano.S new file mode 100644 index 0000000..437577e Binary files /dev/null and b/kernel/x86_64/._gemm_kernel_4x8_nano.S differ diff --git a/kernel/x86_64/._gemm_kernel_4x8_nehalem.S b/kernel/x86_64/._gemm_kernel_4x8_nehalem.S new file mode 100644 index 0000000..6cb3176 Binary files /dev/null and b/kernel/x86_64/._gemm_kernel_4x8_nehalem.S differ diff --git a/kernel/x86_64/._gemm_kernel_8x4_barcelona.S b/kernel/x86_64/._gemm_kernel_8x4_barcelona.S new file mode 100644 index 0000000..ebd7b86 Binary files /dev/null and b/kernel/x86_64/._gemm_kernel_8x4_barcelona.S differ diff --git a/kernel/x86_64/._gemm_kernel_8x4_core2.S b/kernel/x86_64/._gemm_kernel_8x4_core2.S new file mode 100644 index 0000000..917891f Binary files /dev/null and b/kernel/x86_64/._gemm_kernel_8x4_core2.S differ diff --git a/kernel/x86_64/._gemm_kernel_8x4_penryn.S b/kernel/x86_64/._gemm_kernel_8x4_penryn.S new file mode 100644 index 0000000..f5fb2fb Binary files /dev/null and b/kernel/x86_64/._gemm_kernel_8x4_penryn.S differ diff --git a/kernel/x86_64/._gemm_kernel_8x4_sse.S b/kernel/x86_64/._gemm_kernel_8x4_sse.S new file mode 100644 index 0000000..2680165 Binary files /dev/null and b/kernel/x86_64/._gemm_kernel_8x4_sse.S differ diff --git a/kernel/x86_64/._gemm_kernel_8x4_sse3.S b/kernel/x86_64/._gemm_kernel_8x4_sse3.S new file mode 100644 index 0000000..a3901b3 Binary files /dev/null and b/kernel/x86_64/._gemm_kernel_8x4_sse3.S differ diff --git a/kernel/x86_64/._gemm_ncopy_2.S b/kernel/x86_64/._gemm_ncopy_2.S new file mode 100644 index 0000000..f2983ef Binary files /dev/null and b/kernel/x86_64/._gemm_ncopy_2.S differ diff --git a/kernel/x86_64/._gemm_ncopy_4.S b/kernel/x86_64/._gemm_ncopy_4.S new file mode 100644 index 0000000..df895de Binary files /dev/null and b/kernel/x86_64/._gemm_ncopy_4.S differ diff --git a/kernel/x86_64/._gemm_ncopy_4_opteron.S b/kernel/x86_64/._gemm_ncopy_4_opteron.S new file mode 100644 index 0000000..bec5662 Binary files /dev/null and b/kernel/x86_64/._gemm_ncopy_4_opteron.S differ diff --git a/kernel/x86_64/._gemm_tcopy_2.S b/kernel/x86_64/._gemm_tcopy_2.S new file mode 100644 index 0000000..7b32ceb Binary files /dev/null and b/kernel/x86_64/._gemm_tcopy_2.S differ diff --git a/kernel/x86_64/._gemm_tcopy_4.S b/kernel/x86_64/._gemm_tcopy_4.S new file mode 100644 index 0000000..30d78cc Binary files /dev/null and b/kernel/x86_64/._gemm_tcopy_4.S differ diff --git a/kernel/x86_64/._gemm_tcopy_4_opteron.S b/kernel/x86_64/._gemm_tcopy_4_opteron.S new file mode 100644 index 0000000..5a06f76 Binary files /dev/null and b/kernel/x86_64/._gemm_tcopy_4_opteron.S differ diff --git a/kernel/x86_64/._iamax.S b/kernel/x86_64/._iamax.S new file mode 100644 index 0000000..c8e25b7 Binary files /dev/null and b/kernel/x86_64/._iamax.S differ diff --git a/kernel/x86_64/._iamax_sse.S b/kernel/x86_64/._iamax_sse.S new file mode 100644 index 0000000..6e02805 Binary files /dev/null and b/kernel/x86_64/._iamax_sse.S differ diff --git a/kernel/x86_64/._iamax_sse2.S b/kernel/x86_64/._iamax_sse2.S new file mode 100644 index 0000000..e07e65a Binary files /dev/null and b/kernel/x86_64/._iamax_sse2.S differ diff --git a/kernel/x86_64/._izamax.S b/kernel/x86_64/._izamax.S new file mode 100644 index 0000000..fcd2389 Binary files /dev/null and b/kernel/x86_64/._izamax.S differ diff --git a/kernel/x86_64/._izamax_sse.S b/kernel/x86_64/._izamax_sse.S new file mode 100644 index 0000000..755905e Binary files /dev/null and b/kernel/x86_64/._izamax_sse.S differ diff --git a/kernel/x86_64/._izamax_sse2.S b/kernel/x86_64/._izamax_sse2.S new file mode 100644 index 0000000..5aedcc3 Binary files /dev/null and b/kernel/x86_64/._izamax_sse2.S differ diff --git a/kernel/x86_64/._lsame.S b/kernel/x86_64/._lsame.S new file mode 100644 index 0000000..49cdbeb Binary files /dev/null and b/kernel/x86_64/._lsame.S differ diff --git a/kernel/x86_64/._mcount.S b/kernel/x86_64/._mcount.S new file mode 100644 index 0000000..0a01444 Binary files /dev/null and b/kernel/x86_64/._mcount.S differ diff --git a/kernel/x86_64/._nrm2.S b/kernel/x86_64/._nrm2.S new file mode 100644 index 0000000..e3015ad Binary files /dev/null and b/kernel/x86_64/._nrm2.S differ diff --git a/kernel/x86_64/._nrm2_sse.S b/kernel/x86_64/._nrm2_sse.S new file mode 100644 index 0000000..906ac0b Binary files /dev/null and b/kernel/x86_64/._nrm2_sse.S differ diff --git a/kernel/x86_64/._qconjg.S b/kernel/x86_64/._qconjg.S new file mode 100644 index 0000000..8709042 Binary files /dev/null and b/kernel/x86_64/._qconjg.S differ diff --git a/kernel/x86_64/._qdot.S b/kernel/x86_64/._qdot.S new file mode 100644 index 0000000..38151d8 Binary files /dev/null and b/kernel/x86_64/._qdot.S differ diff --git a/kernel/x86_64/._qgemm_kernel_2x2.S b/kernel/x86_64/._qgemm_kernel_2x2.S new file mode 100644 index 0000000..6e00fb1 Binary files /dev/null and b/kernel/x86_64/._qgemm_kernel_2x2.S differ diff --git a/kernel/x86_64/._qgemv_n.S b/kernel/x86_64/._qgemv_n.S new file mode 100644 index 0000000..f1426ef Binary files /dev/null and b/kernel/x86_64/._qgemv_n.S differ diff --git a/kernel/x86_64/._qgemv_t.S b/kernel/x86_64/._qgemv_t.S new file mode 100644 index 0000000..8955f9b Binary files /dev/null and b/kernel/x86_64/._qgemv_t.S differ diff --git a/kernel/x86_64/._qtrsm_kernel_LN_2x2.S b/kernel/x86_64/._qtrsm_kernel_LN_2x2.S new file mode 100644 index 0000000..58c8a44 Binary files /dev/null and b/kernel/x86_64/._qtrsm_kernel_LN_2x2.S differ diff --git a/kernel/x86_64/._qtrsm_kernel_LT_2x2.S b/kernel/x86_64/._qtrsm_kernel_LT_2x2.S new file mode 100644 index 0000000..84a78cc Binary files /dev/null and b/kernel/x86_64/._qtrsm_kernel_LT_2x2.S differ diff --git a/kernel/x86_64/._qtrsm_kernel_RT_2x2.S b/kernel/x86_64/._qtrsm_kernel_RT_2x2.S new file mode 100644 index 0000000..010fde5 Binary files /dev/null and b/kernel/x86_64/._qtrsm_kernel_RT_2x2.S differ diff --git a/kernel/x86_64/._rot.S b/kernel/x86_64/._rot.S new file mode 100644 index 0000000..898d103 Binary files /dev/null and b/kernel/x86_64/._rot.S differ diff --git a/kernel/x86_64/._rot_sse.S b/kernel/x86_64/._rot_sse.S new file mode 100644 index 0000000..9d2bfe6 Binary files /dev/null and b/kernel/x86_64/._rot_sse.S differ diff --git a/kernel/x86_64/._rot_sse2.S b/kernel/x86_64/._rot_sse2.S new file mode 100644 index 0000000..13c5719 Binary files /dev/null and b/kernel/x86_64/._rot_sse2.S differ diff --git a/kernel/x86_64/._scal.S b/kernel/x86_64/._scal.S new file mode 100644 index 0000000..f8e5ed4 Binary files /dev/null and b/kernel/x86_64/._scal.S differ diff --git a/kernel/x86_64/._scal_atom.S b/kernel/x86_64/._scal_atom.S new file mode 100644 index 0000000..eff66e1 Binary files /dev/null and b/kernel/x86_64/._scal_atom.S differ diff --git a/kernel/x86_64/._scal_sse.S b/kernel/x86_64/._scal_sse.S new file mode 100644 index 0000000..a66442f Binary files /dev/null and b/kernel/x86_64/._scal_sse.S differ diff --git a/kernel/x86_64/._scal_sse2.S b/kernel/x86_64/._scal_sse2.S new file mode 100644 index 0000000..d1c8a96 Binary files /dev/null and b/kernel/x86_64/._scal_sse2.S differ diff --git a/kernel/x86_64/._sgemv_n.S b/kernel/x86_64/._sgemv_n.S new file mode 100644 index 0000000..3c773f4 Binary files /dev/null and b/kernel/x86_64/._sgemv_n.S differ diff --git a/kernel/x86_64/._sgemv_t.S b/kernel/x86_64/._sgemv_t.S new file mode 100644 index 0000000..857d8c0 Binary files /dev/null and b/kernel/x86_64/._sgemv_t.S differ diff --git a/kernel/x86_64/._staticbuffer.S b/kernel/x86_64/._staticbuffer.S new file mode 100644 index 0000000..f9917b1 Binary files /dev/null and b/kernel/x86_64/._staticbuffer.S differ diff --git a/kernel/x86_64/._swap.S b/kernel/x86_64/._swap.S new file mode 100644 index 0000000..457f75f Binary files /dev/null and b/kernel/x86_64/._swap.S differ diff --git a/kernel/x86_64/._swap_sse.S b/kernel/x86_64/._swap_sse.S new file mode 100644 index 0000000..b698058 Binary files /dev/null and b/kernel/x86_64/._swap_sse.S differ diff --git a/kernel/x86_64/._swap_sse2.S b/kernel/x86_64/._swap_sse2.S new file mode 100644 index 0000000..f8a0a99 Binary files /dev/null and b/kernel/x86_64/._swap_sse2.S differ diff --git a/kernel/x86_64/._symv_L_sse.S b/kernel/x86_64/._symv_L_sse.S new file mode 100644 index 0000000..0670db1 Binary files /dev/null and b/kernel/x86_64/._symv_L_sse.S differ diff --git a/kernel/x86_64/._symv_L_sse2.S b/kernel/x86_64/._symv_L_sse2.S new file mode 100644 index 0000000..a418b36 Binary files /dev/null and b/kernel/x86_64/._symv_L_sse2.S differ diff --git a/kernel/x86_64/._symv_U_sse.S b/kernel/x86_64/._symv_U_sse.S new file mode 100644 index 0000000..a10e729 Binary files /dev/null and b/kernel/x86_64/._symv_U_sse.S differ diff --git a/kernel/x86_64/._symv_U_sse2.S b/kernel/x86_64/._symv_U_sse2.S new file mode 100644 index 0000000..aed718c Binary files /dev/null and b/kernel/x86_64/._symv_U_sse2.S differ diff --git a/kernel/x86_64/._trsm_kernel_LN_2x8_nehalem.S b/kernel/x86_64/._trsm_kernel_LN_2x8_nehalem.S new file mode 100644 index 0000000..5637457 Binary files /dev/null and b/kernel/x86_64/._trsm_kernel_LN_2x8_nehalem.S differ diff --git a/kernel/x86_64/._trsm_kernel_LN_4x2_atom.S b/kernel/x86_64/._trsm_kernel_LN_4x2_atom.S new file mode 100644 index 0000000..db1fa2b Binary files /dev/null and b/kernel/x86_64/._trsm_kernel_LN_4x2_atom.S differ diff --git a/kernel/x86_64/._trsm_kernel_LN_4x4_barcelona.S b/kernel/x86_64/._trsm_kernel_LN_4x4_barcelona.S new file mode 100644 index 0000000..9211223 Binary files /dev/null and b/kernel/x86_64/._trsm_kernel_LN_4x4_barcelona.S differ diff --git a/kernel/x86_64/._trsm_kernel_LN_4x4_core2.S b/kernel/x86_64/._trsm_kernel_LN_4x4_core2.S new file mode 100644 index 0000000..fe1c3b1 Binary files /dev/null and b/kernel/x86_64/._trsm_kernel_LN_4x4_core2.S differ diff --git a/kernel/x86_64/._trsm_kernel_LN_4x4_penryn.S b/kernel/x86_64/._trsm_kernel_LN_4x4_penryn.S new file mode 100644 index 0000000..6aa827b Binary files /dev/null and b/kernel/x86_64/._trsm_kernel_LN_4x4_penryn.S differ diff --git a/kernel/x86_64/._trsm_kernel_LN_4x4_sse2.S b/kernel/x86_64/._trsm_kernel_LN_4x4_sse2.S new file mode 100644 index 0000000..027b8bb Binary files /dev/null and b/kernel/x86_64/._trsm_kernel_LN_4x4_sse2.S differ diff --git a/kernel/x86_64/._trsm_kernel_LN_4x4_sse3.S b/kernel/x86_64/._trsm_kernel_LN_4x4_sse3.S new file mode 100644 index 0000000..611c54a Binary files /dev/null and b/kernel/x86_64/._trsm_kernel_LN_4x4_sse3.S differ diff --git a/kernel/x86_64/._trsm_kernel_LN_4x8_nehalem.S b/kernel/x86_64/._trsm_kernel_LN_4x8_nehalem.S new file mode 100644 index 0000000..5716e99 Binary files /dev/null and b/kernel/x86_64/._trsm_kernel_LN_4x8_nehalem.S differ diff --git a/kernel/x86_64/._trsm_kernel_LN_8x4_sse.S b/kernel/x86_64/._trsm_kernel_LN_8x4_sse.S new file mode 100644 index 0000000..fb149b9 Binary files /dev/null and b/kernel/x86_64/._trsm_kernel_LN_8x4_sse.S differ diff --git a/kernel/x86_64/._trsm_kernel_LT_2x8_nehalem.S b/kernel/x86_64/._trsm_kernel_LT_2x8_nehalem.S new file mode 100644 index 0000000..74ffb17 Binary files /dev/null and b/kernel/x86_64/._trsm_kernel_LT_2x8_nehalem.S differ diff --git a/kernel/x86_64/._trsm_kernel_LT_4x2_atom.S b/kernel/x86_64/._trsm_kernel_LT_4x2_atom.S new file mode 100644 index 0000000..8f34a94 Binary files /dev/null and b/kernel/x86_64/._trsm_kernel_LT_4x2_atom.S differ diff --git a/kernel/x86_64/._trsm_kernel_LT_4x4_barcelona.S b/kernel/x86_64/._trsm_kernel_LT_4x4_barcelona.S new file mode 100644 index 0000000..8edf874 Binary files /dev/null and b/kernel/x86_64/._trsm_kernel_LT_4x4_barcelona.S differ diff --git a/kernel/x86_64/._trsm_kernel_LT_4x4_core2.S b/kernel/x86_64/._trsm_kernel_LT_4x4_core2.S new file mode 100644 index 0000000..e94d711 Binary files /dev/null and b/kernel/x86_64/._trsm_kernel_LT_4x4_core2.S differ diff --git a/kernel/x86_64/._trsm_kernel_LT_4x4_penryn.S b/kernel/x86_64/._trsm_kernel_LT_4x4_penryn.S new file mode 100644 index 0000000..0708ffd Binary files /dev/null and b/kernel/x86_64/._trsm_kernel_LT_4x4_penryn.S differ diff --git a/kernel/x86_64/._trsm_kernel_LT_4x4_sse2.S b/kernel/x86_64/._trsm_kernel_LT_4x4_sse2.S new file mode 100644 index 0000000..a61a072 Binary files /dev/null and b/kernel/x86_64/._trsm_kernel_LT_4x4_sse2.S differ diff --git a/kernel/x86_64/._trsm_kernel_LT_4x4_sse3.S b/kernel/x86_64/._trsm_kernel_LT_4x4_sse3.S new file mode 100644 index 0000000..082d296 Binary files /dev/null and b/kernel/x86_64/._trsm_kernel_LT_4x4_sse3.S differ diff --git a/kernel/x86_64/._trsm_kernel_LT_4x8_nehalem.S b/kernel/x86_64/._trsm_kernel_LT_4x8_nehalem.S new file mode 100644 index 0000000..647eb19 Binary files /dev/null and b/kernel/x86_64/._trsm_kernel_LT_4x8_nehalem.S differ diff --git a/kernel/x86_64/._trsm_kernel_LT_8x4_sse.S b/kernel/x86_64/._trsm_kernel_LT_8x4_sse.S new file mode 100644 index 0000000..78b5e94 Binary files /dev/null and b/kernel/x86_64/._trsm_kernel_LT_8x4_sse.S differ diff --git a/kernel/x86_64/._trsm_kernel_RT_2x8_nehalem.S b/kernel/x86_64/._trsm_kernel_RT_2x8_nehalem.S new file mode 100644 index 0000000..1962b88 Binary files /dev/null and b/kernel/x86_64/._trsm_kernel_RT_2x8_nehalem.S differ diff --git a/kernel/x86_64/._trsm_kernel_RT_4x2_atom.S b/kernel/x86_64/._trsm_kernel_RT_4x2_atom.S new file mode 100644 index 0000000..9e8f02c Binary files /dev/null and b/kernel/x86_64/._trsm_kernel_RT_4x2_atom.S differ diff --git a/kernel/x86_64/._trsm_kernel_RT_4x4_barcelona.S b/kernel/x86_64/._trsm_kernel_RT_4x4_barcelona.S new file mode 100644 index 0000000..c958480 Binary files /dev/null and b/kernel/x86_64/._trsm_kernel_RT_4x4_barcelona.S differ diff --git a/kernel/x86_64/._trsm_kernel_RT_4x4_core2.S b/kernel/x86_64/._trsm_kernel_RT_4x4_core2.S new file mode 100644 index 0000000..4ff392b Binary files /dev/null and b/kernel/x86_64/._trsm_kernel_RT_4x4_core2.S differ diff --git a/kernel/x86_64/._trsm_kernel_RT_4x4_penryn.S b/kernel/x86_64/._trsm_kernel_RT_4x4_penryn.S new file mode 100644 index 0000000..c9fc801 Binary files /dev/null and b/kernel/x86_64/._trsm_kernel_RT_4x4_penryn.S differ diff --git a/kernel/x86_64/._trsm_kernel_RT_4x4_sse2.S b/kernel/x86_64/._trsm_kernel_RT_4x4_sse2.S new file mode 100644 index 0000000..9974308 Binary files /dev/null and b/kernel/x86_64/._trsm_kernel_RT_4x4_sse2.S differ diff --git a/kernel/x86_64/._trsm_kernel_RT_4x4_sse3.S b/kernel/x86_64/._trsm_kernel_RT_4x4_sse3.S new file mode 100644 index 0000000..3727c7e Binary files /dev/null and b/kernel/x86_64/._trsm_kernel_RT_4x4_sse3.S differ diff --git a/kernel/x86_64/._trsm_kernel_RT_4x8_nehalem.S b/kernel/x86_64/._trsm_kernel_RT_4x8_nehalem.S new file mode 100644 index 0000000..6d4c282 Binary files /dev/null and b/kernel/x86_64/._trsm_kernel_RT_4x8_nehalem.S differ diff --git a/kernel/x86_64/._trsm_kernel_RT_8x4_sse.S b/kernel/x86_64/._trsm_kernel_RT_8x4_sse.S new file mode 100644 index 0000000..33663a6 Binary files /dev/null and b/kernel/x86_64/._trsm_kernel_RT_8x4_sse.S differ diff --git a/kernel/x86_64/._xdot.S b/kernel/x86_64/._xdot.S new file mode 100644 index 0000000..7202827 Binary files /dev/null and b/kernel/x86_64/._xdot.S differ diff --git a/kernel/x86_64/._xgemm3m_kernel_2x2.S b/kernel/x86_64/._xgemm3m_kernel_2x2.S new file mode 100644 index 0000000..806d298 Binary files /dev/null and b/kernel/x86_64/._xgemm3m_kernel_2x2.S differ diff --git a/kernel/x86_64/._xgemm_kernel_1x1.S b/kernel/x86_64/._xgemm_kernel_1x1.S new file mode 100644 index 0000000..826a3cf Binary files /dev/null and b/kernel/x86_64/._xgemm_kernel_1x1.S differ diff --git a/kernel/x86_64/._xgemv_n.S b/kernel/x86_64/._xgemv_n.S new file mode 100644 index 0000000..25ba842 Binary files /dev/null and b/kernel/x86_64/._xgemv_n.S differ diff --git a/kernel/x86_64/._xgemv_t.S b/kernel/x86_64/._xgemv_t.S new file mode 100644 index 0000000..facef58 Binary files /dev/null and b/kernel/x86_64/._xgemv_t.S differ diff --git a/kernel/x86_64/._xtrsm_kernel_LT_1x1.S b/kernel/x86_64/._xtrsm_kernel_LT_1x1.S new file mode 100644 index 0000000..c2505ba Binary files /dev/null and b/kernel/x86_64/._xtrsm_kernel_LT_1x1.S differ diff --git a/kernel/x86_64/._zamax.S b/kernel/x86_64/._zamax.S new file mode 100644 index 0000000..746512c Binary files /dev/null and b/kernel/x86_64/._zamax.S differ diff --git a/kernel/x86_64/._zamax_atom.S b/kernel/x86_64/._zamax_atom.S new file mode 100644 index 0000000..4bca3fc Binary files /dev/null and b/kernel/x86_64/._zamax_atom.S differ diff --git a/kernel/x86_64/._zamax_sse.S b/kernel/x86_64/._zamax_sse.S new file mode 100644 index 0000000..e579a4b Binary files /dev/null and b/kernel/x86_64/._zamax_sse.S differ diff --git a/kernel/x86_64/._zamax_sse2.S b/kernel/x86_64/._zamax_sse2.S new file mode 100644 index 0000000..4184580 Binary files /dev/null and b/kernel/x86_64/._zamax_sse2.S differ diff --git a/kernel/x86_64/._zasum.S b/kernel/x86_64/._zasum.S new file mode 100644 index 0000000..2c726bd Binary files /dev/null and b/kernel/x86_64/._zasum.S differ diff --git a/kernel/x86_64/._zasum_atom.S b/kernel/x86_64/._zasum_atom.S new file mode 100644 index 0000000..87be227 Binary files /dev/null and b/kernel/x86_64/._zasum_atom.S differ diff --git a/kernel/x86_64/._zasum_sse.S b/kernel/x86_64/._zasum_sse.S new file mode 100644 index 0000000..457df89 Binary files /dev/null and b/kernel/x86_64/._zasum_sse.S differ diff --git a/kernel/x86_64/._zasum_sse2.S b/kernel/x86_64/._zasum_sse2.S new file mode 100644 index 0000000..51f691d Binary files /dev/null and b/kernel/x86_64/._zasum_sse2.S differ diff --git a/kernel/x86_64/._zaxpy.S b/kernel/x86_64/._zaxpy.S new file mode 100644 index 0000000..c7cc05d Binary files /dev/null and b/kernel/x86_64/._zaxpy.S differ diff --git a/kernel/x86_64/._zaxpy_atom.S b/kernel/x86_64/._zaxpy_atom.S new file mode 100644 index 0000000..5a6a50b Binary files /dev/null and b/kernel/x86_64/._zaxpy_atom.S differ diff --git a/kernel/x86_64/._zaxpy_sse.S b/kernel/x86_64/._zaxpy_sse.S new file mode 100644 index 0000000..568bd12 Binary files /dev/null and b/kernel/x86_64/._zaxpy_sse.S differ diff --git a/kernel/x86_64/._zaxpy_sse2.S b/kernel/x86_64/._zaxpy_sse2.S new file mode 100644 index 0000000..8fe842a Binary files /dev/null and b/kernel/x86_64/._zaxpy_sse2.S differ diff --git a/kernel/x86_64/._zcopy.S b/kernel/x86_64/._zcopy.S new file mode 100644 index 0000000..85e91e9 Binary files /dev/null and b/kernel/x86_64/._zcopy.S differ diff --git a/kernel/x86_64/._zcopy_sse.S b/kernel/x86_64/._zcopy_sse.S new file mode 100644 index 0000000..6e863da Binary files /dev/null and b/kernel/x86_64/._zcopy_sse.S differ diff --git a/kernel/x86_64/._zcopy_sse2.S b/kernel/x86_64/._zcopy_sse2.S new file mode 100644 index 0000000..f72225a Binary files /dev/null and b/kernel/x86_64/._zcopy_sse2.S differ diff --git a/kernel/x86_64/._zdot.S b/kernel/x86_64/._zdot.S new file mode 100644 index 0000000..d8c032c Binary files /dev/null and b/kernel/x86_64/._zdot.S differ diff --git a/kernel/x86_64/._zdot_atom.S b/kernel/x86_64/._zdot_atom.S new file mode 100644 index 0000000..327672c Binary files /dev/null and b/kernel/x86_64/._zdot_atom.S differ diff --git a/kernel/x86_64/._zdot_sse.S b/kernel/x86_64/._zdot_sse.S new file mode 100644 index 0000000..f9a2cd5 Binary files /dev/null and b/kernel/x86_64/._zdot_sse.S differ diff --git a/kernel/x86_64/._zdot_sse2.S b/kernel/x86_64/._zdot_sse2.S new file mode 100644 index 0000000..fcd83c1 Binary files /dev/null and b/kernel/x86_64/._zdot_sse2.S differ diff --git a/kernel/x86_64/._zgemm3m_kernel_2x8_nehalem.S b/kernel/x86_64/._zgemm3m_kernel_2x8_nehalem.S new file mode 100644 index 0000000..3968c92 Binary files /dev/null and b/kernel/x86_64/._zgemm3m_kernel_2x8_nehalem.S differ diff --git a/kernel/x86_64/._zgemm3m_kernel_4x2_atom.S b/kernel/x86_64/._zgemm3m_kernel_4x2_atom.S new file mode 100644 index 0000000..df47a41 Binary files /dev/null and b/kernel/x86_64/._zgemm3m_kernel_4x2_atom.S differ diff --git a/kernel/x86_64/._zgemm3m_kernel_4x4_barcelona.S b/kernel/x86_64/._zgemm3m_kernel_4x4_barcelona.S new file mode 100644 index 0000000..269cd70 Binary files /dev/null and b/kernel/x86_64/._zgemm3m_kernel_4x4_barcelona.S differ diff --git a/kernel/x86_64/._zgemm3m_kernel_4x4_core2.S b/kernel/x86_64/._zgemm3m_kernel_4x4_core2.S new file mode 100644 index 0000000..bcd6cb5 Binary files /dev/null and b/kernel/x86_64/._zgemm3m_kernel_4x4_core2.S differ diff --git a/kernel/x86_64/._zgemm3m_kernel_4x4_penryn.S b/kernel/x86_64/._zgemm3m_kernel_4x4_penryn.S new file mode 100644 index 0000000..dc41dca Binary files /dev/null and b/kernel/x86_64/._zgemm3m_kernel_4x4_penryn.S differ diff --git a/kernel/x86_64/._zgemm3m_kernel_4x4_sse2.S b/kernel/x86_64/._zgemm3m_kernel_4x4_sse2.S new file mode 100644 index 0000000..ed438b7 Binary files /dev/null and b/kernel/x86_64/._zgemm3m_kernel_4x4_sse2.S differ diff --git a/kernel/x86_64/._zgemm3m_kernel_4x4_sse3.S b/kernel/x86_64/._zgemm3m_kernel_4x4_sse3.S new file mode 100644 index 0000000..7437c28 Binary files /dev/null and b/kernel/x86_64/._zgemm3m_kernel_4x4_sse3.S differ diff --git a/kernel/x86_64/._zgemm3m_kernel_4x8_nehalem.S b/kernel/x86_64/._zgemm3m_kernel_4x8_nehalem.S new file mode 100644 index 0000000..18b0342 Binary files /dev/null and b/kernel/x86_64/._zgemm3m_kernel_4x8_nehalem.S differ diff --git a/kernel/x86_64/._zgemm3m_kernel_8x4_barcelona.S b/kernel/x86_64/._zgemm3m_kernel_8x4_barcelona.S new file mode 100644 index 0000000..caed9a6 Binary files /dev/null and b/kernel/x86_64/._zgemm3m_kernel_8x4_barcelona.S differ diff --git a/kernel/x86_64/._zgemm3m_kernel_8x4_core2.S b/kernel/x86_64/._zgemm3m_kernel_8x4_core2.S new file mode 100644 index 0000000..941baa5 Binary files /dev/null and b/kernel/x86_64/._zgemm3m_kernel_8x4_core2.S differ diff --git a/kernel/x86_64/._zgemm3m_kernel_8x4_penryn.S b/kernel/x86_64/._zgemm3m_kernel_8x4_penryn.S new file mode 100644 index 0000000..486ff31 Binary files /dev/null and b/kernel/x86_64/._zgemm3m_kernel_8x4_penryn.S differ diff --git a/kernel/x86_64/._zgemm3m_kernel_8x4_sse.S b/kernel/x86_64/._zgemm3m_kernel_8x4_sse.S new file mode 100644 index 0000000..49c44c2 Binary files /dev/null and b/kernel/x86_64/._zgemm3m_kernel_8x4_sse.S differ diff --git a/kernel/x86_64/._zgemm3m_kernel_8x4_sse3.S b/kernel/x86_64/._zgemm3m_kernel_8x4_sse3.S new file mode 100644 index 0000000..6848118 Binary files /dev/null and b/kernel/x86_64/._zgemm3m_kernel_8x4_sse3.S differ diff --git a/kernel/x86_64/._zgemm_beta.S b/kernel/x86_64/._zgemm_beta.S new file mode 100644 index 0000000..b994b8d Binary files /dev/null and b/kernel/x86_64/._zgemm_beta.S differ diff --git a/kernel/x86_64/._zgemm_kernel_1x4_nehalem.S b/kernel/x86_64/._zgemm_kernel_1x4_nehalem.S new file mode 100644 index 0000000..69939be Binary files /dev/null and b/kernel/x86_64/._zgemm_kernel_1x4_nehalem.S differ diff --git a/kernel/x86_64/._zgemm_kernel_2x1_atom.S b/kernel/x86_64/._zgemm_kernel_2x1_atom.S new file mode 100644 index 0000000..a0eb721 Binary files /dev/null and b/kernel/x86_64/._zgemm_kernel_2x1_atom.S differ diff --git a/kernel/x86_64/._zgemm_kernel_2x2_barcelona.S b/kernel/x86_64/._zgemm_kernel_2x2_barcelona.S new file mode 100644 index 0000000..19fa641 Binary files /dev/null and b/kernel/x86_64/._zgemm_kernel_2x2_barcelona.S differ diff --git a/kernel/x86_64/._zgemm_kernel_2x2_core2.S b/kernel/x86_64/._zgemm_kernel_2x2_core2.S new file mode 100644 index 0000000..403281f Binary files /dev/null and b/kernel/x86_64/._zgemm_kernel_2x2_core2.S differ diff --git a/kernel/x86_64/._zgemm_kernel_2x2_penryn.S b/kernel/x86_64/._zgemm_kernel_2x2_penryn.S new file mode 100644 index 0000000..4a1fbfe Binary files /dev/null and b/kernel/x86_64/._zgemm_kernel_2x2_penryn.S differ diff --git a/kernel/x86_64/._zgemm_kernel_2x2_sse2.S b/kernel/x86_64/._zgemm_kernel_2x2_sse2.S new file mode 100644 index 0000000..c90268d Binary files /dev/null and b/kernel/x86_64/._zgemm_kernel_2x2_sse2.S differ diff --git a/kernel/x86_64/._zgemm_kernel_2x2_sse3.S b/kernel/x86_64/._zgemm_kernel_2x2_sse3.S new file mode 100644 index 0000000..bac6e50 Binary files /dev/null and b/kernel/x86_64/._zgemm_kernel_2x2_sse3.S differ diff --git a/kernel/x86_64/._zgemm_kernel_2x4_nehalem.S b/kernel/x86_64/._zgemm_kernel_2x4_nehalem.S new file mode 100644 index 0000000..db5484a Binary files /dev/null and b/kernel/x86_64/._zgemm_kernel_2x4_nehalem.S differ diff --git a/kernel/x86_64/._zgemm_kernel_4x2_barcelona.S b/kernel/x86_64/._zgemm_kernel_4x2_barcelona.S new file mode 100644 index 0000000..93ba405 Binary files /dev/null and b/kernel/x86_64/._zgemm_kernel_4x2_barcelona.S differ diff --git a/kernel/x86_64/._zgemm_kernel_4x2_core2.S b/kernel/x86_64/._zgemm_kernel_4x2_core2.S new file mode 100644 index 0000000..8688b89 Binary files /dev/null and b/kernel/x86_64/._zgemm_kernel_4x2_core2.S differ diff --git a/kernel/x86_64/._zgemm_kernel_4x2_penryn.S b/kernel/x86_64/._zgemm_kernel_4x2_penryn.S new file mode 100644 index 0000000..5888cc3 Binary files /dev/null and b/kernel/x86_64/._zgemm_kernel_4x2_penryn.S differ diff --git a/kernel/x86_64/._zgemm_kernel_4x2_sse.S b/kernel/x86_64/._zgemm_kernel_4x2_sse.S new file mode 100644 index 0000000..7bcd65e Binary files /dev/null and b/kernel/x86_64/._zgemm_kernel_4x2_sse.S differ diff --git a/kernel/x86_64/._zgemm_kernel_4x2_sse3.S b/kernel/x86_64/._zgemm_kernel_4x2_sse3.S new file mode 100644 index 0000000..d643e8a Binary files /dev/null and b/kernel/x86_64/._zgemm_kernel_4x2_sse3.S differ diff --git a/kernel/x86_64/._zgemm_ncopy_1.S b/kernel/x86_64/._zgemm_ncopy_1.S new file mode 100644 index 0000000..2fd9f2d Binary files /dev/null and b/kernel/x86_64/._zgemm_ncopy_1.S differ diff --git a/kernel/x86_64/._zgemm_ncopy_2.S b/kernel/x86_64/._zgemm_ncopy_2.S new file mode 100644 index 0000000..2b31202 Binary files /dev/null and b/kernel/x86_64/._zgemm_ncopy_2.S differ diff --git a/kernel/x86_64/._zgemm_tcopy_1.S b/kernel/x86_64/._zgemm_tcopy_1.S new file mode 100644 index 0000000..0060bca Binary files /dev/null and b/kernel/x86_64/._zgemm_tcopy_1.S differ diff --git a/kernel/x86_64/._zgemm_tcopy_2.S b/kernel/x86_64/._zgemm_tcopy_2.S new file mode 100644 index 0000000..d1127e4 Binary files /dev/null and b/kernel/x86_64/._zgemm_tcopy_2.S differ diff --git a/kernel/x86_64/._zgemv_n.S b/kernel/x86_64/._zgemv_n.S new file mode 100644 index 0000000..0658d4b Binary files /dev/null and b/kernel/x86_64/._zgemv_n.S differ diff --git a/kernel/x86_64/._zgemv_n_atom.S b/kernel/x86_64/._zgemv_n_atom.S new file mode 100644 index 0000000..cc344ba Binary files /dev/null and b/kernel/x86_64/._zgemv_n_atom.S differ diff --git a/kernel/x86_64/._zgemv_n_dup.S b/kernel/x86_64/._zgemv_n_dup.S new file mode 100644 index 0000000..97e1862 Binary files /dev/null and b/kernel/x86_64/._zgemv_n_dup.S differ diff --git a/kernel/x86_64/._zgemv_t.S b/kernel/x86_64/._zgemv_t.S new file mode 100644 index 0000000..b3ef4e4 Binary files /dev/null and b/kernel/x86_64/._zgemv_t.S differ diff --git a/kernel/x86_64/._zgemv_t_atom.S b/kernel/x86_64/._zgemv_t_atom.S new file mode 100644 index 0000000..c472ca7 Binary files /dev/null and b/kernel/x86_64/._zgemv_t_atom.S differ diff --git a/kernel/x86_64/._zgemv_t_dup.S b/kernel/x86_64/._zgemv_t_dup.S new file mode 100644 index 0000000..b4203f5 Binary files /dev/null and b/kernel/x86_64/._zgemv_t_dup.S differ diff --git a/kernel/x86_64/._znrm2.S b/kernel/x86_64/._znrm2.S new file mode 100644 index 0000000..5d2232a Binary files /dev/null and b/kernel/x86_64/._znrm2.S differ diff --git a/kernel/x86_64/._znrm2_sse.S b/kernel/x86_64/._znrm2_sse.S new file mode 100644 index 0000000..7174d5a Binary files /dev/null and b/kernel/x86_64/._znrm2_sse.S differ diff --git a/kernel/x86_64/._zrot.S b/kernel/x86_64/._zrot.S new file mode 100644 index 0000000..5a0154f Binary files /dev/null and b/kernel/x86_64/._zrot.S differ diff --git a/kernel/x86_64/._zrot_sse.S b/kernel/x86_64/._zrot_sse.S new file mode 100644 index 0000000..cd20866 Binary files /dev/null and b/kernel/x86_64/._zrot_sse.S differ diff --git a/kernel/x86_64/._zrot_sse2.S b/kernel/x86_64/._zrot_sse2.S new file mode 100644 index 0000000..c0ae314 Binary files /dev/null and b/kernel/x86_64/._zrot_sse2.S differ diff --git a/kernel/x86_64/._zscal.S b/kernel/x86_64/._zscal.S new file mode 100644 index 0000000..235ff8f Binary files /dev/null and b/kernel/x86_64/._zscal.S differ diff --git a/kernel/x86_64/._zscal_atom.S b/kernel/x86_64/._zscal_atom.S new file mode 100644 index 0000000..2cc92b7 Binary files /dev/null and b/kernel/x86_64/._zscal_atom.S differ diff --git a/kernel/x86_64/._zscal_sse.S b/kernel/x86_64/._zscal_sse.S new file mode 100644 index 0000000..7583a19 Binary files /dev/null and b/kernel/x86_64/._zscal_sse.S differ diff --git a/kernel/x86_64/._zscal_sse2.S b/kernel/x86_64/._zscal_sse2.S new file mode 100644 index 0000000..e46bb80 Binary files /dev/null and b/kernel/x86_64/._zscal_sse2.S differ diff --git a/kernel/x86_64/._zswap.S b/kernel/x86_64/._zswap.S new file mode 100644 index 0000000..7f8927d Binary files /dev/null and b/kernel/x86_64/._zswap.S differ diff --git a/kernel/x86_64/._zswap_sse.S b/kernel/x86_64/._zswap_sse.S new file mode 100644 index 0000000..8a89657 Binary files /dev/null and b/kernel/x86_64/._zswap_sse.S differ diff --git a/kernel/x86_64/._zswap_sse2.S b/kernel/x86_64/._zswap_sse2.S new file mode 100644 index 0000000..650ee47 Binary files /dev/null and b/kernel/x86_64/._zswap_sse2.S differ diff --git a/kernel/x86_64/._zsymv_L_sse.S b/kernel/x86_64/._zsymv_L_sse.S new file mode 100644 index 0000000..950910b Binary files /dev/null and b/kernel/x86_64/._zsymv_L_sse.S differ diff --git a/kernel/x86_64/._zsymv_L_sse2.S b/kernel/x86_64/._zsymv_L_sse2.S new file mode 100644 index 0000000..3c92147 Binary files /dev/null and b/kernel/x86_64/._zsymv_L_sse2.S differ diff --git a/kernel/x86_64/._zsymv_U_sse.S b/kernel/x86_64/._zsymv_U_sse.S new file mode 100644 index 0000000..a5b0995 Binary files /dev/null and b/kernel/x86_64/._zsymv_U_sse.S differ diff --git a/kernel/x86_64/._zsymv_U_sse2.S b/kernel/x86_64/._zsymv_U_sse2.S new file mode 100644 index 0000000..3128514 Binary files /dev/null and b/kernel/x86_64/._zsymv_U_sse2.S differ diff --git a/kernel/x86_64/._ztrsm_kernel_LN_2x1_atom.S b/kernel/x86_64/._ztrsm_kernel_LN_2x1_atom.S new file mode 100644 index 0000000..947506e Binary files /dev/null and b/kernel/x86_64/._ztrsm_kernel_LN_2x1_atom.S differ diff --git a/kernel/x86_64/._ztrsm_kernel_LN_2x2_core2.S b/kernel/x86_64/._ztrsm_kernel_LN_2x2_core2.S new file mode 100644 index 0000000..cc1a483 Binary files /dev/null and b/kernel/x86_64/._ztrsm_kernel_LN_2x2_core2.S differ diff --git a/kernel/x86_64/._ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86_64/._ztrsm_kernel_LN_2x2_penryn.S new file mode 100644 index 0000000..d6e479f Binary files /dev/null and b/kernel/x86_64/._ztrsm_kernel_LN_2x2_penryn.S differ diff --git a/kernel/x86_64/._ztrsm_kernel_LN_2x2_sse2.S b/kernel/x86_64/._ztrsm_kernel_LN_2x2_sse2.S new file mode 100644 index 0000000..cab1a74 Binary files /dev/null and b/kernel/x86_64/._ztrsm_kernel_LN_2x2_sse2.S differ diff --git a/kernel/x86_64/._ztrsm_kernel_LN_2x2_sse3.S b/kernel/x86_64/._ztrsm_kernel_LN_2x2_sse3.S new file mode 100644 index 0000000..15a11c1 Binary files /dev/null and b/kernel/x86_64/._ztrsm_kernel_LN_2x2_sse3.S differ diff --git a/kernel/x86_64/._ztrsm_kernel_LN_2x4_nehalem.S b/kernel/x86_64/._ztrsm_kernel_LN_2x4_nehalem.S new file mode 100644 index 0000000..e7bee10 Binary files /dev/null and b/kernel/x86_64/._ztrsm_kernel_LN_2x4_nehalem.S differ diff --git a/kernel/x86_64/._ztrsm_kernel_LN_4x2_sse.S b/kernel/x86_64/._ztrsm_kernel_LN_4x2_sse.S new file mode 100644 index 0000000..4ad2791 Binary files /dev/null and b/kernel/x86_64/._ztrsm_kernel_LN_4x2_sse.S differ diff --git a/kernel/x86_64/._ztrsm_kernel_LT_1x4_nehalem.S b/kernel/x86_64/._ztrsm_kernel_LT_1x4_nehalem.S new file mode 100644 index 0000000..d706291 Binary files /dev/null and b/kernel/x86_64/._ztrsm_kernel_LT_1x4_nehalem.S differ diff --git a/kernel/x86_64/._ztrsm_kernel_LT_2x1_atom.S b/kernel/x86_64/._ztrsm_kernel_LT_2x1_atom.S new file mode 100644 index 0000000..0f5a22d Binary files /dev/null and b/kernel/x86_64/._ztrsm_kernel_LT_2x1_atom.S differ diff --git a/kernel/x86_64/._ztrsm_kernel_LT_2x2_core2.S b/kernel/x86_64/._ztrsm_kernel_LT_2x2_core2.S new file mode 100644 index 0000000..354b5d0 Binary files /dev/null and b/kernel/x86_64/._ztrsm_kernel_LT_2x2_core2.S differ diff --git a/kernel/x86_64/._ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86_64/._ztrsm_kernel_LT_2x2_penryn.S new file mode 100644 index 0000000..1de41ae Binary files /dev/null and b/kernel/x86_64/._ztrsm_kernel_LT_2x2_penryn.S differ diff --git a/kernel/x86_64/._ztrsm_kernel_LT_2x2_sse2.S b/kernel/x86_64/._ztrsm_kernel_LT_2x2_sse2.S new file mode 100644 index 0000000..e8da742 Binary files /dev/null and b/kernel/x86_64/._ztrsm_kernel_LT_2x2_sse2.S differ diff --git a/kernel/x86_64/._ztrsm_kernel_LT_2x2_sse3.S b/kernel/x86_64/._ztrsm_kernel_LT_2x2_sse3.S new file mode 100644 index 0000000..97743b9 Binary files /dev/null and b/kernel/x86_64/._ztrsm_kernel_LT_2x2_sse3.S differ diff --git a/kernel/x86_64/._ztrsm_kernel_LT_2x4_nehalem.S b/kernel/x86_64/._ztrsm_kernel_LT_2x4_nehalem.S new file mode 100644 index 0000000..dcbaccf Binary files /dev/null and b/kernel/x86_64/._ztrsm_kernel_LT_2x4_nehalem.S differ diff --git a/kernel/x86_64/._ztrsm_kernel_LT_4x2_sse.S b/kernel/x86_64/._ztrsm_kernel_LT_4x2_sse.S new file mode 100644 index 0000000..9344f5f Binary files /dev/null and b/kernel/x86_64/._ztrsm_kernel_LT_4x2_sse.S differ diff --git a/kernel/x86_64/._ztrsm_kernel_RT_1x4_nehalem.S b/kernel/x86_64/._ztrsm_kernel_RT_1x4_nehalem.S new file mode 100644 index 0000000..4b5406c Binary files /dev/null and b/kernel/x86_64/._ztrsm_kernel_RT_1x4_nehalem.S differ diff --git a/kernel/x86_64/._ztrsm_kernel_RT_2x2_core2.S b/kernel/x86_64/._ztrsm_kernel_RT_2x2_core2.S new file mode 100644 index 0000000..9a17b58 Binary files /dev/null and b/kernel/x86_64/._ztrsm_kernel_RT_2x2_core2.S differ diff --git a/kernel/x86_64/._ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86_64/._ztrsm_kernel_RT_2x2_penryn.S new file mode 100644 index 0000000..cc57544 Binary files /dev/null and b/kernel/x86_64/._ztrsm_kernel_RT_2x2_penryn.S differ diff --git a/kernel/x86_64/._ztrsm_kernel_RT_2x2_sse2.S b/kernel/x86_64/._ztrsm_kernel_RT_2x2_sse2.S new file mode 100644 index 0000000..87ef432 Binary files /dev/null and b/kernel/x86_64/._ztrsm_kernel_RT_2x2_sse2.S differ diff --git a/kernel/x86_64/._ztrsm_kernel_RT_2x2_sse3.S b/kernel/x86_64/._ztrsm_kernel_RT_2x2_sse3.S new file mode 100644 index 0000000..110d208 Binary files /dev/null and b/kernel/x86_64/._ztrsm_kernel_RT_2x2_sse3.S differ diff --git a/kernel/x86_64/._ztrsm_kernel_RT_2x4_nehalem.S b/kernel/x86_64/._ztrsm_kernel_RT_2x4_nehalem.S new file mode 100644 index 0000000..b33e30e Binary files /dev/null and b/kernel/x86_64/._ztrsm_kernel_RT_2x4_nehalem.S differ diff --git a/kernel/x86_64/._ztrsm_kernel_RT_4x2_sse.S b/kernel/x86_64/._ztrsm_kernel_RT_4x2_sse.S new file mode 100644 index 0000000..f150475 Binary files /dev/null and b/kernel/x86_64/._ztrsm_kernel_RT_4x2_sse.S differ diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL new file mode 100644 index 0000000..3d980f9 --- /dev/null +++ b/kernel/x86_64/KERNEL @@ -0,0 +1,456 @@ +ifndef SAMAXKERNEL +SAMAXKERNEL = amax_sse.S +endif + +ifndef DAMAXKERNEL +DAMAXKERNEL = amax_sse2.S +endif + +ifndef QAMAXKERNEL +QAMAXKERNEL = amax.S +endif + +ifndef CAMAXKERNEL +CAMAXKERNEL = zamax_sse.S +endif + +ifndef ZAMAXKERNEL +ZAMAXKERNEL = zamax_sse2.S +endif + +ifndef XAMAXKERNEL +XAMAXKERNEL = zamax.S +endif + +ifndef SASUMKERNEL +SASUMKERNEL = asum_sse.S +endif + +ifndef DASUMKERNEL +DASUMKERNEL = asum_sse2.S +endif + +ifndef CASUMKERNEL +CASUMKERNEL = zasum_sse.S +endif + +ifndef ZASUMKERNEL +ZASUMKERNEL = zasum_sse2.S +endif + +ifndef QASUMKERNEL +QASUMKERNEL = asum.S +endif + +ifndef XASUMKERNEL +XASUMKERNEL = zasum.S +endif + +ifndef SAMINKERNEL +SAMINKERNEL = amax_sse.S +endif + +ifndef DAMINKERNEL +DAMINKERNEL = amax_sse2.S +endif + +ifndef QAMINKERNEL +QAMINKERNEL = amax.S +endif + +ifndef CAMINKERNEL +CAMINKERNEL = zamax_sse.S +endif + +ifndef ZAMINKERNEL +ZAMINKERNEL = zamax_sse2.S +endif + +ifndef XAMINKERNEL +XAMINKERNEL = zamax.S +endif + +ifndef SAXPYKERNEL +SAXPYKERNEL = axpy_sse.S +endif + +ifndef DAXPYKERNEL +DAXPYKERNEL = axpy_sse2.S +endif + +ifndef CAXPYKERNEL +CAXPYKERNEL = zaxpy_sse.S +endif + +ifndef ZAXPYKERNEL +ZAXPYKERNEL = zaxpy_sse2.S +endif + +ifndef QAXPYKERNEL +QAXPYKERNEL = axpy.S +endif + +ifndef XAXPYKERNEL +XAXPYKERNEL = zaxpy.S +endif + +ifndef SCOPYKERNEL +SCOPYKERNEL = copy_sse.S +endif + +ifndef DCOPYKERNEL +DCOPYKERNEL = copy_sse2.S +endif + +ifndef CCOPYKERNEL +CCOPYKERNEL = zcopy_sse.S +endif + +ifndef ZCOPYKERNEL +ZCOPYKERNEL = zcopy_sse2.S +endif + +ifndef QCOPYKERNEL +QCOPYKERNEL = copy.S +endif + +ifndef XCOPYKERNEL +XCOPYKERNEL = zcopy.S +endif + +ifndef SDOTKERNEL +SDOTKERNEL = dot_sse.S +endif + +ifndef DDOTKERNEL +DDOTKERNEL = dot_sse2.S +endif + +ifndef CDOTKERNEL +CDOTKERNEL = zdot_sse.S +endif + +ifndef ZDOTKERNEL +ZDOTKERNEL = zdot_sse2.S +endif + +ifndef QDOTKERNEL +QDOTKERNEL = dot.S +endif + +ifndef XDOTKERNEL +XDOTKERNEL = zdot.S +endif + +ifndef ISAMAXKERNEL +ISAMAXKERNEL = iamax_sse.S +endif + +ifndef IDAMAXKERNEL +IDAMAXKERNEL = iamax_sse2.S +endif + +ifndef IQAMAXKERNEL +IQAMAXKERNEL = iamax.S +endif + +ifndef ICAMAXKERNEL +ICAMAXKERNEL = izamax_sse.S +endif + +ifndef IZAMAXKERNEL +IZAMAXKERNEL = izamax_sse2.S +endif + +ifndef IXAMAXKERNEL +IXAMAXKERNEL = izamax.S +endif + +ifndef ISAMINKERNEL +ISAMINKERNEL = iamax_sse.S +endif + +ifndef IDAMINKERNEL +IDAMINKERNEL = iamax_sse2.S +endif + +ifndef IQAMINKERNEL +IQAMINKERNEL = iamax.S +endif + +ifndef ICAMINKERNEL +ICAMINKERNEL = izamax_sse.S +endif + +ifndef IZAMINKERNEL +IZAMINKERNEL = izamax_sse2.S +endif + +ifndef IXAMINKERNEL +IXAMINKERNEL = izamax.S +endif + +ifndef ISMAXKERNEL +ISMAXKERNEL = iamax_sse.S +endif + +ifndef IDMAXKERNEL +IDMAXKERNEL = iamax_sse2.S +endif + +ifndef IQMAXKERNEL +IQMAXKERNEL = iamax.S +endif + +ifndef ISMINKERNEL +ISMINKERNEL = iamax_sse.S +endif + +ifndef IDMINKERNEL +IDMINKERNEL = iamax_sse2.S +endif + +ifndef IQMINKERNEL +IQMINKERNEL = iamax.S +endif + +ifndef SMAXKERNEL +SMAXKERNEL = amax_sse.S +endif + +ifndef DMAXKERNEL +DMAXKERNEL = amax_sse2.S +endif + +ifndef QMAXKERNEL +QMAXKERNEL = amax.S +endif + +ifndef SMINKERNEL +SMINKERNEL = amax_sse.S +endif + +ifndef DMINKERNEL +DMINKERNEL = amax_sse2.S +endif + +ifndef QMINKERNEL +QMINKERNEL = amax.S +endif + +ifndef SNRM2KERNEL +SNRM2KERNEL = nrm2_sse.S +endif + +ifndef DNRM2KERNEL +DNRM2KERNEL = nrm2.S +endif + +ifndef QNRM2KERNEL +QNRM2KERNEL = nrm2.S +endif + +ifndef CNRM2KERNEL +CNRM2KERNEL = znrm2_sse.S +endif + +ifndef ZNRM2KERNEL +ZNRM2KERNEL = znrm2.S +endif + +ifndef XNRM2KERNEL +XNRM2KERNEL = znrm2.S +endif + +ifndef SROTKERNEL +SROTKERNEL = rot_sse.S +endif + +ifndef DROTKERNEL +DROTKERNEL = rot_sse2.S +endif + +ifndef QROTKERNEL +QROTKERNEL = rot.S +endif + +ifndef CROTKERNEL +CROTKERNEL = zrot_sse.S +endif + +ifndef ZROTKERNEL +ZROTKERNEL = zrot_sse2.S +endif + +ifndef XROTKERNEL +XROTKERNEL = zrot.S +endif + +ifndef SSCALKERNEL +SSCALKERNEL = scal_sse.S +endif + +ifndef DSCALKERNEL +DSCALKERNEL = scal_sse2.S +endif + +ifndef CSCALKERNEL +CSCALKERNEL = zscal_sse.S +endif + +ifndef ZSCALKERNEL +ZSCALKERNEL = zscal_sse2.S +endif + +ifndef ASCALKERNEL +QSCALKERNEL = scal.S +endif + +ifndef XSCALKERNEL +XSCALKERNEL = zscal.S +endif + +ifndef SSWAPKERNEL +SSWAPKERNEL = swap_sse.S +endif + +ifndef DSWAPKERNEL +DSWAPKERNEL = swap_sse2.S +endif + +ifndef CSWAPKERNEL +CSWAPKERNEL = zswap_sse.S +endif + +ifndef ZSWAPKERNEL +ZSWAPKERNEL = zswap_sse2.S +endif + +ifndef QSWAPKERNEL +QSWAPKERNEL = swap.S +endif + +ifndef XSWAPKERNEL +XSWAPKERNEL = zswap.S +endif + +ifndef SSYMV_U_KERNEL +SSYMV_U_KERNEL = symv_U_sse.S +endif + +ifndef SSYMV_L_KERNEL +SSYMV_L_KERNEL = symv_L_sse.S +endif + +ifndef DSYMV_U_KERNEL +DSYMV_U_KERNEL = symv_U_sse2.S +endif + +ifndef DSYMV_L_KERNEL +DSYMV_L_KERNEL = symv_L_sse2.S +endif + +ifndef ZSYMV_U_KERNEL +ZSYMV_U_KERNEL = zsymv_U_sse2.S +endif + +ifndef ZSYMV_L_KERNEL +ZSYMV_L_KERNEL = zsymv_L_sse2.S +endif + +ifndef ZHEMV_U_KERNEL +ZHEMV_U_KERNEL = zsymv_U_sse2.S +endif + +ifndef ZHEMV_L_KERNEL +ZHEMV_L_KERNEL = zsymv_L_sse2.S +endif + +GEMVDEP = ../l2param.h + +ifndef SGEMVNKERNEL +SGEMVNKERNEL = sgemv_n.S +endif + +ifndef SGEMVTKERNEL +SGEMVTKERNEL = sgemv_t.S +endif + +ifndef DGEMVNKERNEL +DGEMVNKERNEL = dgemv_n.S +endif + +ifndef DGEMVTKERNEL +DGEMVTKERNEL = dgemv_t.S +endif + +ifndef CGEMVNKERNEL +CGEMVNKERNEL = cgemv_n.S +endif + +ifndef CGEMVTKERNEL +CGEMVTKERNEL = cgemv_t.S +endif + +ifndef ZGEMVNKERNEL +ZGEMVNKERNEL = zgemv_n.S +endif + +ifndef ZGEMVTKERNEL +ZGEMVTKERNEL = zgemv_t.S +endif + +ifndef QGEMVNKERNEL +QGEMVNKERNEL = qgemv_n.S +endif + +ifndef QGEMVTKERNEL +QGEMVTKERNEL = qgemv_t.S +endif + +ifndef XGEMVNKERNEL +XGEMVNKERNEL = xgemv_n.S +endif + +ifndef XGEMVTKERNEL +XGEMVTKERNEL = xgemv_t.S +endif + +QGEMMKERNEL = qgemm_kernel_2x2.S +QGEMMINCOPY = +QGEMMITCOPY = +QGEMMONCOPY = ../generic/gemm_ncopy_2.c +QGEMMOTCOPY = ../generic/gemm_tcopy_2.c +QGEMMINCOPYOBJ = +QGEMMITCOPYOBJ = +QGEMMONCOPYOBJ = qgemm_oncopy$(TSUFFIX).$(SUFFIX) +QGEMMOTCOPYOBJ = qgemm_otcopy$(TSUFFIX).$(SUFFIX) + +XGEMMKERNEL = xgemm_kernel_1x1.S +XGEMMINCOPY = +XGEMMITCOPY = +XGEMMONCOPY = ../generic/zgemm_ncopy_1.c +XGEMMOTCOPY = ../generic/zgemm_tcopy_1.c +XGEMMINCOPYOBJ = +XGEMMITCOPYOBJ = +XGEMMONCOPYOBJ = xgemm_oncopy$(TSUFFIX).$(SUFFIX) +XGEMMOTCOPYOBJ = xgemm_otcopy$(TSUFFIX).$(SUFFIX) + +SGEMM_BETA = gemm_beta.S +DGEMM_BETA = gemm_beta.S +CGEMM_BETA = zgemm_beta.S +ZGEMM_BETA = zgemm_beta.S +QGEMM_BETA = ../generic/gemm_beta.c +XGEMM_BETA = ../generic/zgemm_beta.c + +QTRSMKERNEL_LN = qtrsm_kernel_LN_2x2.S +QTRSMKERNEL_LT = qtrsm_kernel_LT_2x2.S +QTRSMKERNEL_RN = qtrsm_kernel_LT_2x2.S +QTRSMKERNEL_RT = qtrsm_kernel_RT_2x2.S + +XTRSMKERNEL_LN = xtrsm_kernel_LT_1x1.S +XTRSMKERNEL_LT = xtrsm_kernel_LT_1x1.S +XTRSMKERNEL_RN = xtrsm_kernel_LT_1x1.S +XTRSMKERNEL_RT = xtrsm_kernel_LT_1x1.S + +XGEMM3MKERNEL = xgemm3m_kernel_2x2.S diff --git a/kernel/x86_64/KERNEL.ATOM b/kernel/x86_64/KERNEL.ATOM new file mode 100644 index 0000000..cfbd05a --- /dev/null +++ b/kernel/x86_64/KERNEL.ATOM @@ -0,0 +1,85 @@ +DAMAXKERNEL = amax_atom.S +ZAMAXKERNEL = zamax_atom.S + +DAMINKERNEL = amax_atom.S +ZAMINKERNEL = zamax_atom.S + +DASUMKERNEL = asum_atom.S +ZASUMKERNEL = zasum_atom.S + +DAXPYKERNEL = axpy_atom.S +ZAXPYKERNEL = zaxpy_atom.S + +DDOTKERNEL = dot_atom.S +ZDOTKERNEL = zdot_atom.S + +DMAXKERNEL = amax_atom.S +DMINKERNEL = amax_atom.S + +DSCALKERNEL = scal_atom.S +ZSCALKERNEL = zscal_atom.S + +DGEMVNKERNEL = dgemv_n_atom.S +DGEMVTKERNEL = dgemv_t_atom.S +ZGEMVNKERNEL = zgemv_n_atom.S +ZGEMVTKERNEL = zgemv_t_atom.S + +SGEMMKERNEL = gemm_kernel_8x4_penryn.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = gemm_ncopy_4.S +SGEMMOTCOPY = gemm_tcopy_4.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_4x2_atom.S +DGEMMINCOPY = gemm_ncopy_4.S +DGEMMITCOPY = gemm_tcopy_4.S +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x2_penryn.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = zgemm_ncopy_2.S +CGEMMOTCOPY = zgemm_tcopy_2.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x1_atom.S +ZGEMMINCOPY = zgemm_ncopy_2.S +ZGEMMITCOPY = zgemm_tcopy_2.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x2_atom.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x2_atom.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x2_atom.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x2_atom.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x1_atom.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x1_atom.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x1_atom.S +ZTRSMKERNEL_RT = ztrsm_kernel_LT_2x1_atom.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_core2.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x2_atom.S diff --git a/kernel/x86_64/KERNEL.BARCELONA b/kernel/x86_64/KERNEL.BARCELONA new file mode 100644 index 0000000..051a522 --- /dev/null +++ b/kernel/x86_64/KERNEL.BARCELONA @@ -0,0 +1,62 @@ +ZGEMVNKERNEL = zgemv_n_dup.S +ZGEMVTKERNEL = zgemv_t_dup.S + +SGEMMKERNEL = gemm_kernel_8x4_barcelona.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = gemm_ncopy_4_opteron.S +SGEMMOTCOPY = gemm_tcopy_4_opteron.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_4x4_barcelona.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4_opteron.S +DGEMMOTCOPY = gemm_tcopy_4_opteron.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = zgemm_ncopy_2.S +CGEMMOTCOPY = zgemm_tcopy_2.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = zgemm_ncopy_2.S +ZGEMMOTCOPY = zgemm_tcopy_2.S +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S diff --git a/kernel/x86_64/KERNEL.CORE2 b/kernel/x86_64/KERNEL.CORE2 new file mode 100644 index 0000000..8a07e80 --- /dev/null +++ b/kernel/x86_64/KERNEL.CORE2 @@ -0,0 +1,60 @@ +SGEMMKERNEL = gemm_kernel_8x4_core2.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = gemm_ncopy_4.S +SGEMMOTCOPY = gemm_tcopy_4.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_4x4_core2.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4.S +DGEMMOTCOPY = gemm_tcopy_4.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x2_core2.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = zgemm_ncopy_2.S +CGEMMOTCOPY = zgemm_tcopy_2.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x2_core2.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = zgemm_ncopy_2.S +ZGEMMOTCOPY = zgemm_tcopy_2.S +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x4_core2.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x4_core2.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x4_core2.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x4_core2.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_core2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_core2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_core2.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_core2.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_core2.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_core2.S + diff --git a/kernel/x86_64/KERNEL.DUNNINGTON b/kernel/x86_64/KERNEL.DUNNINGTON new file mode 100644 index 0000000..b96daa0 --- /dev/null +++ b/kernel/x86_64/KERNEL.DUNNINGTON @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_8x4_penryn.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = gemm_ncopy_4.S +SGEMMOTCOPY = gemm_tcopy_4.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_4x4_penryn.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4.S +DGEMMOTCOPY = gemm_tcopy_4.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x2_penryn.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = zgemm_ncopy_2.S +CGEMMOTCOPY = zgemm_tcopy_2.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x2_penryn.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = zgemm_ncopy_2.S +ZGEMMOTCOPY = zgemm_tcopy_2.S +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x4_penryn.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x4_penryn.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x4_penryn.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x4_penryn.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_penryn.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_penryn.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_penryn.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_penryn.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_penryn.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_penryn.S diff --git a/kernel/x86_64/KERNEL.NANO b/kernel/x86_64/KERNEL.NANO new file mode 100644 index 0000000..0b771a4 --- /dev/null +++ b/kernel/x86_64/KERNEL.NANO @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_4x8_nano.S +SGEMMINCOPY = gemm_ncopy_4.S +SGEMMITCOPY = gemm_tcopy_4.S +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_4x4_penryn.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4.S +DGEMMOTCOPY = gemm_tcopy_4.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S +CGEMMINCOPY = zgemm_ncopy_2.S +CGEMMITCOPY = zgemm_tcopy_2.S +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x2_penryn.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = zgemm_ncopy_2.S +ZGEMMOTCOPY = zgemm_tcopy_2.S +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S +STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S +STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S +STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x4_penryn.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x4_penryn.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x4_penryn.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x4_penryn.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_penryn.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_penryn.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_penryn.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_penryn.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_core2.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_core2.S diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM new file mode 100644 index 0000000..58a8832 --- /dev/null +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_4x8_nehalem.S +SGEMMINCOPY = gemm_ncopy_4.S +SGEMMITCOPY = gemm_tcopy_4.S +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x8_nehalem.S +DGEMMINCOPY = dgemm_ncopy_2.S +DGEMMITCOPY = dgemm_tcopy_2.S +DGEMMONCOPY = dgemm_ncopy_8.S +DGEMMOTCOPY = dgemm_tcopy_8.S +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S +CGEMMINCOPY = zgemm_ncopy_2.S +CGEMMITCOPY = zgemm_tcopy_2.S +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S +ZGEMMINCOPY = zgemm_ncopy_1.S +ZGEMMITCOPY = zgemm_tcopy_1.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S +STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S +STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S +STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S + +CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S diff --git a/kernel/x86_64/KERNEL.OPTERON b/kernel/x86_64/KERNEL.OPTERON new file mode 100644 index 0000000..27fb785 --- /dev/null +++ b/kernel/x86_64/KERNEL.OPTERON @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_8x4_sse.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = gemm_ncopy_4_opteron.S +SGEMMOTCOPY = gemm_tcopy_4_opteron.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_4x4_sse2.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4_opteron.S +DGEMMOTCOPY = gemm_tcopy_4_opteron.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x2_sse.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = zgemm_ncopy_2.S +CGEMMOTCOPY = zgemm_tcopy_2.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x2_sse2.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = zgemm_ncopy_2.S +ZGEMMOTCOPY = zgemm_tcopy_2.S +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x4_sse2.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x4_sse2.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x4_sse2.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x4_sse2.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse2.S diff --git a/kernel/x86_64/KERNEL.OPTERON_SSE3 b/kernel/x86_64/KERNEL.OPTERON_SSE3 new file mode 100644 index 0000000..565daf3 --- /dev/null +++ b/kernel/x86_64/KERNEL.OPTERON_SSE3 @@ -0,0 +1,62 @@ +ZGEMVNKERNEL = zgemv_n_dup.S +ZGEMVTKERNEL = zgemv_t_dup.S + +SGEMMKERNEL = gemm_kernel_8x4_sse.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = gemm_ncopy_4_opteron.S +SGEMMOTCOPY = gemm_tcopy_4_opteron.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_4x4_sse2.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4_opteron.S +DGEMMOTCOPY = gemm_tcopy_4_opteron.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x2_sse.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = zgemm_ncopy_2.S +CGEMMOTCOPY = zgemm_tcopy_2.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x2_sse2.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = zgemm_ncopy_2.S +ZGEMMOTCOPY = zgemm_tcopy_2.S +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x4_sse2.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x4_sse2.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x4_sse2.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x4_sse2.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse2.S diff --git a/kernel/x86_64/KERNEL.PENRYN b/kernel/x86_64/KERNEL.PENRYN new file mode 100644 index 0000000..b96daa0 --- /dev/null +++ b/kernel/x86_64/KERNEL.PENRYN @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_8x4_penryn.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = gemm_ncopy_4.S +SGEMMOTCOPY = gemm_tcopy_4.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_4x4_penryn.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4.S +DGEMMOTCOPY = gemm_tcopy_4.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x2_penryn.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = zgemm_ncopy_2.S +CGEMMOTCOPY = zgemm_tcopy_2.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x2_penryn.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = zgemm_ncopy_2.S +ZGEMMOTCOPY = zgemm_tcopy_2.S +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x4_penryn.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x4_penryn.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x4_penryn.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x4_penryn.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_penryn.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_penryn.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_penryn.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_penryn.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_penryn.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_penryn.S diff --git a/kernel/x86_64/KERNEL.PRESCOTT b/kernel/x86_64/KERNEL.PRESCOTT new file mode 100644 index 0000000..e155531 --- /dev/null +++ b/kernel/x86_64/KERNEL.PRESCOTT @@ -0,0 +1,63 @@ +ZGEMVNKERNEL = zgemv_n_dup.S +ZGEMVTKERNEL = zgemv_t_dup.S + +SGEMMKERNEL = gemm_kernel_8x4_sse3.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = gemm_ncopy_4.S +SGEMMOTCOPY = gemm_tcopy_4.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_4x4_sse3.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4.S +DGEMMOTCOPY = gemm_tcopy_4.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x2_sse3.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = zgemm_ncopy_2.S +CGEMMOTCOPY = zgemm_tcopy_2.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x2_sse3.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = zgemm_ncopy_2.S +ZGEMMOTCOPY = zgemm_tcopy_2.S +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x4_sse3.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x4_sse3.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x4_sse3.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x4_sse3.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse3.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse3.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse3.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse3.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S + diff --git a/kernel/x86_64/Makefile b/kernel/x86_64/Makefile new file mode 100644 index 0000000..efae70d --- /dev/null +++ b/kernel/x86_64/Makefile @@ -0,0 +1,2 @@ +clean :: + diff --git a/kernel/x86_64/amax.S b/kernel/x86_64/amax.S new file mode 100644 index 0000000..d096d88 --- /dev/null +++ b/kernel/x86_64/amax.S @@ -0,0 +1,307 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#define I %rax + +#ifndef USE_MIN +#define FMOV fcmovbe +#else +#define FMOV fcmovnbe +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + salq $BASE_SHIFT, INCX + + fldz + + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + ffreep %st + + FLD (X) +#ifdef USE_ABS + fabs +#endif + addq INCX, X + decq M + jle .L999 + + cmpq $SIZE, INCX + jne .L40 + + movq M, I + sarq $3, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 1 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 2 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 3 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 4 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 5 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 6 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 7 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + addq $8 * SIZE, X + + decq I + jg .L10 + ALIGN_4 + +.L20: + movq M, I + andq $7, I + jle .L999 + ALIGN_4 + + +.L21: + FLD 0 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + addq $1 * SIZE, X + decq I + jg .L21 + jmp .L999 + ALIGN_4 + +.L40: + movq M, I + sarq $3, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + decq I + jg .L50 + ALIGN_4 + +.L60: + movq M, I + andq $7, I + jle .L999 + ALIGN_4 + + +.L61: + FLD 0 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + addq INCX, X + decq I + jg .L61 + ALIGN_4 + +.L999: + ret + + EPILOGUE diff --git a/kernel/x86_64/amax_atom.S b/kernel/x86_64/amax_atom.S new file mode 100644 index 0000000..fa7b9a3 --- /dev/null +++ b/kernel/x86_64/amax_atom.S @@ -0,0 +1,460 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#ifdef USE_MIN +#define maxsd minsd +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + xorps %xmm0, %xmm0 + leaq (, INCX, SIZE), INCX + + testq M, M + jle .L999 + + testq INCX, INCX + jle .L999 + +#ifdef USE_ABS + pcmpeqb %xmm15, %xmm15 + psrlq $1, %xmm15 +#endif + + movsd (X), %xmm0 + addq INCX, X + +#ifdef USE_ABS + andps %xmm15, %xmm0 +#endif + decq M + jle .L999 + + movaps %xmm0, %xmm1 + movaps %xmm0, %xmm2 + movaps %xmm0, %xmm3 + + cmpq $SIZE, INCX + jne .L20 + + movq M, I + sarq $3, I + jle .L15 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + movsd 2 * SIZE(X), %xmm6 + movsd 3 * SIZE(X), %xmm7 + + movsd 4 * SIZE(X), %xmm8 + movsd 5 * SIZE(X), %xmm9 + movsd 6 * SIZE(X), %xmm10 + movsd 7 * SIZE(X), %xmm11 + + decq I + jle .L13 + ALIGN_4 + +.L12: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxsd %xmm4, %xmm1 + movsd 8 * SIZE(X), %xmm4 + +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxsd %xmm5, %xmm2 + movsd 9 * SIZE(X), %xmm5 + +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxsd %xmm6, %xmm1 + movsd 10 * SIZE(X), %xmm6 + +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxsd %xmm7, %xmm2 + movsd 11 * SIZE(X), %xmm7 + +#ifdef USE_ABS + andps %xmm15, %xmm8 +#endif + maxsd %xmm8, %xmm1 + movsd 12 * SIZE(X), %xmm8 + +#ifdef USE_ABS + andps %xmm15, %xmm9 +#endif + maxsd %xmm9, %xmm2 + movsd 13 * SIZE(X), %xmm9 + +#ifdef USE_ABS + andps %xmm15, %xmm10 +#endif + maxsd %xmm10, %xmm1 + movsd 14 * SIZE(X), %xmm10 + +#ifdef USE_ABS + andps %xmm15, %xmm11 +#endif + maxsd %xmm11, %xmm2 + movsd 15 * SIZE(X), %xmm11 + + addq $8 * SIZE, X + decq I + jg .L12 + ALIGN_4 + +.L13: +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxsd %xmm4, %xmm0 +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxsd %xmm5, %xmm1 +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxsd %xmm6, %xmm2 +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxsd %xmm7, %xmm3 + +#ifdef USE_ABS + andps %xmm15, %xmm8 +#endif + maxsd %xmm8, %xmm0 +#ifdef USE_ABS + andps %xmm15, %xmm9 +#endif + maxsd %xmm9, %xmm1 +#ifdef USE_ABS + andps %xmm15, %xmm10 +#endif + maxsd %xmm10, %xmm2 +#ifdef USE_ABS + andps %xmm15, %xmm11 +#endif + maxsd %xmm11, %xmm3 + + addq $8 * SIZE, X + ALIGN_4 + +.L15: + testq $4, M + jle .L17 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + movsd 2 * SIZE(X), %xmm6 + movsd 3 * SIZE(X), %xmm7 + +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxsd %xmm4, %xmm0 +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxsd %xmm5, %xmm1 +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxsd %xmm6, %xmm2 +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxsd %xmm7, %xmm3 + + addq $4 * SIZE, X + ALIGN_3 + +.L17: + testq $2, M + jle .L18 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxsd %xmm4, %xmm1 +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxsd %xmm5, %xmm2 + addq $2 * SIZE, X + ALIGN_3 + +.L18: + testq $1, M + jle .L998 + + movsd 0 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxsd %xmm4, %xmm3 + jmp .L998 + ALIGN_3 + +.L20: + movq M, I + sarq $3, I + jle .L25 + + movsd (X), %xmm4 + addq INCX, X + movsd (X), %xmm5 + addq INCX, X + movsd (X), %xmm6 + addq INCX, X + movsd (X), %xmm7 + addq INCX, X + + movsd (X), %xmm8 + addq INCX, X + movsd (X), %xmm9 + addq INCX, X + movsd (X), %xmm10 + addq INCX, X + movsd (X), %xmm11 + + decq I + jle .L23 + ALIGN_4 + +.L22: +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + addq INCX, X + maxsd %xmm4, %xmm1 + movsd (X), %xmm4 + +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + addq INCX, X + maxsd %xmm5, %xmm2 + movsd (X), %xmm5 + +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + addq INCX, X + maxsd %xmm6, %xmm1 + movsd (X), %xmm6 + +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + addq INCX, X + maxsd %xmm7, %xmm2 + movsd (X), %xmm7 + +#ifdef USE_ABS + andps %xmm15, %xmm8 +#endif + addq INCX, X + maxsd %xmm8, %xmm1 + movsd (X), %xmm8 + +#ifdef USE_ABS + andps %xmm15, %xmm9 +#endif + addq INCX, X + maxsd %xmm9, %xmm2 + movsd (X), %xmm9 + +#ifdef USE_ABS + andps %xmm15, %xmm10 +#endif + addq INCX, X + maxsd %xmm10, %xmm1 + movsd (X), %xmm10 + +#ifdef USE_ABS + andps %xmm15, %xmm11 +#endif + addq INCX, X + maxsd %xmm11, %xmm2 + movsd (X), %xmm11 + + decq I + jg .L22 + ALIGN_4 + +.L23: +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + addq INCX, X + maxsd %xmm4, %xmm0 +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxsd %xmm5, %xmm1 +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxsd %xmm6, %xmm2 +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxsd %xmm7, %xmm3 + +#ifdef USE_ABS + andps %xmm15, %xmm8 +#endif + maxsd %xmm8, %xmm0 +#ifdef USE_ABS + andps %xmm15, %xmm9 +#endif + maxsd %xmm9, %xmm1 +#ifdef USE_ABS + andps %xmm15, %xmm10 +#endif + maxsd %xmm10, %xmm2 +#ifdef USE_ABS + andps %xmm15, %xmm11 +#endif + maxsd %xmm11, %xmm3 + ALIGN_4 + +.L25: + testq $4, M + jle .L27 + + movsd (X), %xmm4 + addq INCX, X + movsd (X), %xmm5 + addq INCX, X + movsd (X), %xmm6 + addq INCX, X + movsd (X), %xmm7 + addq INCX, X + +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxsd %xmm4, %xmm0 +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxsd %xmm5, %xmm1 +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxsd %xmm6, %xmm2 +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxsd %xmm7, %xmm3 + ALIGN_3 + +.L27: + testq $2, M + jle .L28 + + movsd (X), %xmm4 + addq INCX, X + movsd (X), %xmm5 + addq INCX, X + +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxsd %xmm4, %xmm1 +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxsd %xmm5, %xmm2 + ALIGN_3 + +.L28: + testq $1, M + jle .L998 + + movsd (X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxsd %xmm4, %xmm3 + ALIGN_3 + +.L998: + maxsd %xmm1, %xmm0 + maxsd %xmm3, %xmm2 + maxsd %xmm2, %xmm0 + ALIGN_4 + +.L999: + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/amax_sse.S b/kernel/x86_64/amax_sse.S new file mode 100644 index 0000000..22b8b16 --- /dev/null +++ b/kernel/x86_64/amax_sse.S @@ -0,0 +1,475 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#ifdef USE_MIN +#define maxps minps +#define maxss minss +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + xorps %xmm0, %xmm0 + leaq (, INCX, SIZE), INCX + + testq M, M + jle .L999 + +#ifdef USE_ABS + pcmpeqb %xmm15, %xmm15 + psrld $1, %xmm15 +#endif + + movss (X), %xmm0 + shufps $0, %xmm0, %xmm0 +#ifdef USE_ABS + andps %xmm15, %xmm0 +#endif + movaps %xmm0, %xmm1 + movaps %xmm0, %xmm2 + movaps %xmm0, %xmm3 + addq INCX, X + decq M + jle .L999 + + cmpq $SIZE, INCX + jne .L40 + + subq $-32 * SIZE, X + + cmpq $3, M + jle .L17 + + testq $SIZE, X + je .L05 + + movss -32 * SIZE(X), %xmm1 + shufps $0, %xmm1, %xmm1 +#ifdef USE_ABS + andps %xmm15, %xmm1 +#endif + decq M + addq $SIZE, X + ALIGN_3 + +.L05: + testq $2 * SIZE, X + je .L06 + + movsd -32 * SIZE(X), %xmm2 + unpcklps %xmm2, %xmm2 +#ifdef USE_ABS + andps %xmm15, %xmm2 +#endif + subq $2, M + addq $2 * SIZE, X + ALIGN_3 + +.L06: + movq M, I + sarq $5, I + jle .L15 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -24 * SIZE(X), %xmm6 + movaps -20 * SIZE(X), %xmm7 + + decq I + jle .L12 + ALIGN_4 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxps %xmm5, %xmm1 + movaps -12 * SIZE(X), %xmm5 + +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxps %xmm6, %xmm2 + movaps -8 * SIZE(X), %xmm6 + +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxps %xmm7, %xmm3 + movaps -4 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxps %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxps %xmm5, %xmm1 + movaps 4 * SIZE(X), %xmm5 + +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxps %xmm6, %xmm2 + movaps 8 * SIZE(X), %xmm6 + +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxps %xmm7, %xmm3 + movaps 12 * SIZE(X), %xmm7 + + subq $-32 * SIZE, X + decq I + jg .L11 + ALIGN_4 + +.L12: +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxps %xmm5, %xmm1 + movaps -12 * SIZE(X), %xmm5 + +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxps %xmm6, %xmm2 + movaps -8 * SIZE(X), %xmm6 + +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxps %xmm7, %xmm3 + movaps -4 * SIZE(X), %xmm7 + +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxps %xmm4, %xmm0 + +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxps %xmm5, %xmm1 + +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxps %xmm6, %xmm2 + +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxps %xmm7, %xmm3 + + subq $-32 * SIZE, X + ALIGN_3 + + +.L15: + testq $16, M + je .L16 + + movaps -32 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxps %xmm4, %xmm0 + + movaps -28 * SIZE(X), %xmm5 +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxps %xmm5, %xmm1 + + movaps -24 * SIZE(X), %xmm6 +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxps %xmm6, %xmm2 + + movaps -20 * SIZE(X), %xmm7 +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxps %xmm7, %xmm3 + + addq $16 * SIZE, X + ALIGN_3 + +.L16: + testq $8, M + je .L17 + + movaps -32 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxps %xmm4, %xmm0 + + movaps -28 * SIZE(X), %xmm5 +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxps %xmm5, %xmm1 + addq $8 * SIZE, X + ALIGN_3 + +.L17: + testq $4, M + je .L18 + + movaps -32 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxps %xmm4, %xmm2 + addq $4 * SIZE, X + ALIGN_3 + +.L18: + testq $2, M + je .L19 + + movsd -32 * SIZE(X), %xmm4 + unpcklps %xmm4, %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxps %xmm4, %xmm3 + addq $2 * SIZE, X + ALIGN_3 + +.L19: + testq $1, M + je .L998 + + movss -32 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxss %xmm4, %xmm0 + jmp .L998 + ALIGN_3 + +.L40: + movq M, I + sarq $3, I + jle .L45 + ALIGN_4 + +.L41: + movss (X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss (X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxss %xmm5, %xmm1 + + movss (X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxss %xmm6, %xmm2 + + movss (X), %xmm7 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxss %xmm7, %xmm3 + + movss (X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss (X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxss %xmm5, %xmm1 + + movss (X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxss %xmm6, %xmm2 + + movss (X), %xmm7 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxss %xmm7, %xmm3 + + decq I + jg .L41 + ALIGN_4 + +.L45: + testq $4, M + je .L46 + + movss (X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss (X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxss %xmm5, %xmm1 + + movss (X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxss %xmm6, %xmm2 + + movss (X), %xmm7 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxss %xmm7, %xmm3 + ALIGN_3 + +.L46: + testq $2, M + je .L47 + + movss (X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss (X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxss %xmm5, %xmm1 + ALIGN_3 + +.L47: + testq $1, M + je .L998 + + movss (X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxss %xmm4, %xmm2 + ALIGN_4 + +.L998: + maxps %xmm1, %xmm0 + maxps %xmm3, %xmm2 + maxps %xmm2, %xmm0 + movaps %xmm0, %xmm1 + movhlps %xmm0, %xmm0 + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + maxss %xmm1, %xmm0 + ALIGN_4 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/amax_sse2.S b/kernel/x86_64/amax_sse2.S new file mode 100644 index 0000000..033e8e1 --- /dev/null +++ b/kernel/x86_64/amax_sse2.S @@ -0,0 +1,498 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#ifdef USE_MIN +#define maxpd minpd +#define maxsd minsd +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + xorps %xmm0, %xmm0 + leaq (, INCX, SIZE), INCX + + testq M, M + jle .L999 + +#ifdef USE_ABS + pcmpeqb %xmm15, %xmm15 + psrlq $1, %xmm15 +#endif + + movsd (X), %xmm0 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm0 +#endif + unpcklpd %xmm0, %xmm0 + movaps %xmm0, %xmm1 + movaps %xmm0, %xmm2 + movaps %xmm0, %xmm3 + decq M + jle .L999 + + cmpq $SIZE, INCX + jne .L40 + + subq $-16 * SIZE, X + + testq $SIZE, X + je .L05 + + movsd -16 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + unpcklpd %xmm4, %xmm4 + maxpd %xmm4, %xmm3 + addq $SIZE, X + decq M + jle .L998 + ALIGN_3 + +.L05: + movq M, I + sarq $4, I + jle .L15 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + decq I + jle .L12 + ALIGN_4 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + movaps -8 * SIZE(X), %xmm4 + +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + movaps -6 * SIZE(X), %xmm5 + +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + movaps -4 * SIZE(X), %xmm6 + +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + movaps -2 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + movaps 2 * SIZE(X), %xmm5 + +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + movaps 4 * SIZE(X), %xmm6 + +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + movaps 6 * SIZE(X), %xmm7 + + subq $-16 * SIZE, X + decq I + jg .L11 + ALIGN_4 + +.L12: +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + movaps -8 * SIZE(X), %xmm4 + +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + movaps -6 * SIZE(X), %xmm5 + +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + movaps -4 * SIZE(X), %xmm6 + +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + movaps -2 * SIZE(X), %xmm7 + +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + + subq $-16 * SIZE, X + ALIGN_4 + +.L15: + testq $8, M + jle .L16 + + movaps -16 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movaps -14 * SIZE(X), %xmm5 +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movaps -12 * SIZE(X), %xmm6 +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + + movaps -10 * SIZE(X), %xmm7 +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + addq $8 * SIZE, X + ALIGN_3 + +.L16: + testq $4, M + jle .L17 + + movaps -16 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movaps -14 * SIZE(X), %xmm5 +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + addq $4 * SIZE, X + ALIGN_3 + +.L17: + testq $2, M + jle .L18 + + movaps -16 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm2 + addq $2 * SIZE, X + ALIGN_3 + +.L18: + testq $1, M + jle .L998 + + movsd -16 * SIZE(X), %xmm4 + unpcklpd %xmm4, %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm3 + jmp .L998 + ALIGN_3 + +.L40: + movq M, I + sarq $4, I + jle .L45 + ALIGN_4 + +.L41: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd (X), %xmm4 + addq INCX, X + movhps (X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd (X), %xmm5 + addq INCX, X + movhps (X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movsd (X), %xmm6 + addq INCX, X + movhps (X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + + movsd (X), %xmm7 + addq INCX, X + movhps (X), %xmm7 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd (X), %xmm4 + addq INCX, X + movhps (X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd (X), %xmm5 + addq INCX, X + movhps (X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movsd (X), %xmm6 + addq INCX, X + movhps (X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + + movsd (X), %xmm7 + addq INCX, X + movhps (X), %xmm7 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + + decq I + jg .L41 + ALIGN_4 + +.L45: + andq $15, M + jle .L998 + + testq $8, M + je .L46 + + movsd (X), %xmm4 + addq INCX, X + movhps (X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd (X), %xmm5 + addq INCX, X + movhps (X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movsd (X), %xmm6 + addq INCX, X + movhps (X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + + movsd (X), %xmm7 + addq INCX, X + movhps (X), %xmm7 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + ALIGN_3 + +.L46: + testq $4, M + je .L47 + + movsd (X), %xmm4 + addq INCX, X + movhps (X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd (X), %xmm5 + addq INCX, X + movhps (X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + ALIGN_3 + +.L47: + testq $2, M + je .L48 + + movsd (X), %xmm6 + addq INCX, X + movhps (X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + ALIGN_3 + +.L48: + testq $1, M + je .L998 + + movsd (X), %xmm7 + unpcklpd %xmm7, %xmm7 +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + ALIGN_4 + +.L998: + maxpd %xmm1, %xmm0 + maxpd %xmm3, %xmm2 + maxpd %xmm2, %xmm0 + movaps %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + ALIGN_4 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/asum.S b/kernel/x86_64/asum.S new file mode 100644 index 0000000..13c6f4f --- /dev/null +++ b/kernel/x86_64/asum.S @@ -0,0 +1,197 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define X ARG2 +#define INCX ARG3 + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + fldz + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + salq $BASE_SHIFT, INCX + + fldz + fldz + fldz + cmpq $SIZE, INCX + jne .L40 + + movq M, I + sarq $3, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + FLD 2 * SIZE(X) + fabs + FLD 3 * SIZE(X) + fabs + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 4 * SIZE(X) + fabs + FLD 5 * SIZE(X) + fabs + FLD 6 * SIZE(X) + fabs + FLD 7 * SIZE(X) + fabs + + addq $8 * SIZE, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decq I + jg .L10 + ALIGN_4 + +.L20: + andq $7, M + jle .L998 + ALIGN_4 + +.L21: + FLD (X) + fabs + faddp %st,%st(1) + addq $1 * SIZE, X + decq M + jg .L21 + jmp .L998 + ALIGN_4 + +.L40: + movq M, I + sarq $3, I + jle .L60 + ALIGN_4 + +.L50: + FLD (X) + addq INCX, X + fabs + FLD (X) + addq INCX, X + fabs + FLD (X) + addq INCX, X + fabs + FLD (X) + addq INCX, X + fabs + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD (X) + addq INCX, X + fabs + FLD (X) + addq INCX, X + fabs + FLD (X) + addq INCX, X + fabs + FLD (X) + addq INCX, X + fabs + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decq I + jg .L50 + ALIGN_4 + +.L60: + andq $7, M + jle .L998 + ALIGN_4 + + +.L61: + FLD (X) + addq INCX, X + fabs + faddp %st,%st(1) + decq M + jg .L61 + ALIGN_4 + +.L998: + faddp %st,%st(2) + faddp %st,%st(1) + faddp %st,%st(1) + ALIGN_4 + +.L999: + ret + + EPILOGUE diff --git a/kernel/x86_64/asum_atom.S b/kernel/x86_64/asum_atom.S new file mode 100644 index 0000000..b6ea65f --- /dev/null +++ b/kernel/x86_64/asum_atom.S @@ -0,0 +1,433 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + xorps %xmm0, %xmm0 + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + pcmpeqb %xmm15, %xmm15 + psrlq $1, %xmm15 + + salq $BASE_SHIFT, INCX + xorps %xmm13, %xmm13 + + cmpq $SIZE, INCX + jne .L20 + + testq $SIZE, X + je .L05 + + movsd (X), %xmm0 + addq $SIZE, X + andps %xmm15, %xmm0 + decq M + jle .L999 + ALIGN_3 + +.L05: + subq $-16 * SIZE, X + + movq M, I + sarq $4, I + jle .L12 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + movaps -8 * SIZE(X), %xmm8 + movaps -6 * SIZE(X), %xmm9 + movaps -4 * SIZE(X), %xmm10 + movaps -2 * SIZE(X), %xmm11 + + decq I + jle .L11 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + andps %xmm15, %xmm4 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm4, %xmm12 + addsd %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + andps %xmm15, %xmm5 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm5, %xmm13 + addsd %xmm5, %xmm2 + movaps 2 * SIZE(X), %xmm5 + + andps %xmm15, %xmm6 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm6, %xmm12 + addsd %xmm6, %xmm0 + movaps 4 * SIZE(X), %xmm6 + + andps %xmm15, %xmm7 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm7, %xmm13 + addsd %xmm7, %xmm2 + movaps 6 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + andps %xmm15, %xmm8 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm8, %xmm12 + addsd %xmm8, %xmm0 + movaps 8 * SIZE(X), %xmm8 + + andps %xmm15, %xmm9 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm9, %xmm13 + addsd %xmm9, %xmm2 + movaps 10 * SIZE(X), %xmm9 + + andps %xmm15, %xmm10 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm10, %xmm12 + addsd %xmm10, %xmm0 + movaps 12 * SIZE(X), %xmm10 + + andps %xmm15, %xmm11 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm11, %xmm13 + addsd %xmm11, %xmm2 + movaps 14 * SIZE(X), %xmm11 + + subq $-16 * SIZE, X + decq I + jg .L10 + ALIGN_4 + +.L11: + andps %xmm15, %xmm4 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm4, %xmm12 + addsd %xmm4, %xmm0 + + andps %xmm15, %xmm5 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm5, %xmm13 + addsd %xmm5, %xmm2 + + andps %xmm15, %xmm6 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm6, %xmm12 + addsd %xmm6, %xmm0 + + andps %xmm15, %xmm7 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm7, %xmm13 + addsd %xmm7, %xmm2 + + andps %xmm15, %xmm8 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm8, %xmm12 + addsd %xmm8, %xmm0 + + andps %xmm15, %xmm9 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm9, %xmm13 + addsd %xmm9, %xmm2 + + andps %xmm15, %xmm10 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm10, %xmm12 + addsd %xmm10, %xmm0 + + andps %xmm15, %xmm11 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm11, %xmm13 + addsd %xmm11, %xmm2 + + addsd %xmm13, %xmm3 + subq $-16 * SIZE, X + ALIGN_3 + +.L12: + andq $15, M + jle .L998 + + testq $8, M + je .L13 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + addq $8 * SIZE, X + + andps %xmm15, %xmm4 + pshufd $0x4e, %xmm4, %xmm12 + addsd %xmm4, %xmm0 + andps %xmm15, %xmm5 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm5, %xmm13 + addsd %xmm5, %xmm2 + addsd %xmm13, %xmm3 + andps %xmm15, %xmm6 + pshufd $0x4e, %xmm6, %xmm12 + addsd %xmm6, %xmm0 + andps %xmm15, %xmm7 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm7, %xmm13 + addsd %xmm7, %xmm2 + addsd %xmm13, %xmm3 + ALIGN_3 + +.L13: + testq $4, M + je .L14 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + addq $4 * SIZE, X + + andps %xmm15, %xmm4 + pshufd $0x4e, %xmm4, %xmm12 + addsd %xmm4, %xmm0 + andps %xmm15, %xmm5 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm5, %xmm13 + addsd %xmm5, %xmm2 + addsd %xmm13, %xmm3 + ALIGN_3 + +.L14: + testq $2, M + je .L15 + + movaps -16 * SIZE(X), %xmm4 + addq $2 * SIZE, X + andps %xmm15, %xmm4 + + pshufd $0x4e, %xmm4, %xmm5 + addsd %xmm4, %xmm2 + addsd %xmm5, %xmm3 + ALIGN_3 + +.L15: + testq $1, M + je .L998 + + movsd -16 * SIZE(X), %xmm4 + andps %xmm15, %xmm4 + addsd %xmm4, %xmm0 + jmp .L998 + ALIGN_3 + +.L20: + movq M, I + sarq $3, I + jle .L25 + + movsd (X), %xmm4 + addq INCX, X + movsd (X), %xmm5 + addq INCX, X + movsd (X), %xmm6 + addq INCX, X + movsd (X), %xmm7 + addq INCX, X + + movsd (X), %xmm8 + addq INCX, X + movsd (X), %xmm9 + addq INCX, X + movsd (X), %xmm10 + addq INCX, X + movsd (X), %xmm11 + + decq I + jle .L23 + ALIGN_4 + +.L22: + andps %xmm15, %xmm4 + addq INCX, X + addsd %xmm4, %xmm0 + movsd (X), %xmm4 + andps %xmm15, %xmm5 + addq INCX, X + addsd %xmm5, %xmm1 + movsd (X), %xmm5 + andps %xmm15, %xmm6 + addq INCX, X + addsd %xmm6, %xmm2 + movsd (X), %xmm6 + andps %xmm15, %xmm7 + addq INCX, X + addsd %xmm7, %xmm3 + movsd (X), %xmm7 + + andps %xmm15, %xmm8 + addq INCX, X + addsd %xmm8, %xmm0 + movsd (X), %xmm8 + andps %xmm15, %xmm9 + addq INCX, X + addsd %xmm9, %xmm1 + movsd (X), %xmm9 + andps %xmm15, %xmm10 + addq INCX, X + addsd %xmm10, %xmm2 + movsd (X), %xmm10 + andps %xmm15, %xmm11 + addq INCX, X + addsd %xmm11, %xmm3 + movsd (X), %xmm11 + + decq I + jg .L22 + ALIGN_4 + +.L23: + andps %xmm15, %xmm4 + addq INCX, X + addsd %xmm4, %xmm0 + andps %xmm15, %xmm5 + addsd %xmm5, %xmm1 + andps %xmm15, %xmm6 + addsd %xmm6, %xmm2 + andps %xmm15, %xmm7 + addsd %xmm7, %xmm3 + + andps %xmm15, %xmm8 + addsd %xmm8, %xmm0 + andps %xmm15, %xmm9 + addsd %xmm9, %xmm1 + andps %xmm15, %xmm10 + addsd %xmm10, %xmm2 + andps %xmm15, %xmm11 + addsd %xmm11, %xmm3 + ALIGN_3 + +.L25: + andq $7, M + jle .L998 + + testq $4, M + je .L26 + + movsd (X), %xmm4 + addq INCX, X + movsd (X), %xmm5 + addq INCX, X + movsd (X), %xmm6 + andps %xmm15, %xmm4 + addsd %xmm4, %xmm0 + addq INCX, X + movsd (X), %xmm7 + andps %xmm15, %xmm5 + addsd %xmm5, %xmm1 + addq INCX, X + + andps %xmm15, %xmm6 + addsd %xmm6, %xmm2 + andps %xmm15, %xmm7 + addsd %xmm7, %xmm3 + ALIGN_3 + +.L26: + testq $2, M + je .L27 + + movsd (X), %xmm4 + addq INCX, X + movsd (X), %xmm5 + addq INCX, X + + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + ALIGN_3 + +.L27: + testq $1, M + je .L998 + + movsd (X), %xmm4 + andps %xmm15, %xmm4 + addsd %xmm4, %xmm0 + ALIGN_3 + +.L998: + addsd %xmm1, %xmm0 + addsd %xmm3, %xmm2 + addsd %xmm2, %xmm0 + ALIGN_4 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE + diff --git a/kernel/x86_64/asum_sse.S b/kernel/x86_64/asum_sse.S new file mode 100644 index 0000000..840e193 --- /dev/null +++ b/kernel/x86_64/asum_sse.S @@ -0,0 +1,345 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + xorps %xmm0, %xmm0 + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + pcmpeqb %xmm15, %xmm15 + psrld $1, %xmm15 + + leaq (, INCX, SIZE), INCX + + cmpq $SIZE, INCX + jne .L100 + + subq $-32 * SIZE, X + + cmpq $3, M + jle .L18 + + testq $4, X + je .L05 + movss -32 * SIZE(X), %xmm0 + andps %xmm15, %xmm0 + addq $SIZE, X + decq M + jle .L998 + ALIGN_3 + +.L05: + testq $8, X + je .L10 + + movsd -32 * SIZE(X), %xmm1 + andps %xmm15, %xmm1 + addq $2 * SIZE, X + subq $2, M + jle .L998 + ALIGN_3 + +.L10: + movq M, I + sarq $5, I + jle .L14 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -24 * SIZE(X), %xmm6 + movaps -20 * SIZE(X), %xmm7 + + movaps -16 * SIZE(X), %xmm8 + movaps -12 * SIZE(X), %xmm9 + movaps -8 * SIZE(X), %xmm10 + movaps -4 * SIZE(X), %xmm11 + decq I + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + andps %xmm15, %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + andps %xmm15, %xmm5 + addps %xmm5, %xmm1 + movaps 4 * SIZE(X), %xmm5 + + andps %xmm15, %xmm6 + addps %xmm6, %xmm2 + movaps 8 * SIZE(X), %xmm6 + + andps %xmm15, %xmm7 + addps %xmm7, %xmm3 + movaps 12 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + andps %xmm15, %xmm8 + addps %xmm8, %xmm0 + movaps 16 * SIZE(X), %xmm8 + + andps %xmm15, %xmm9 + addps %xmm9, %xmm1 + movaps 20 * SIZE(X), %xmm9 + + andps %xmm15, %xmm10 + addps %xmm10, %xmm2 + movaps 24 * SIZE(X), %xmm10 + + andps %xmm15, %xmm11 + addps %xmm11, %xmm3 + movaps 28 * SIZE(X), %xmm11 + + subq $-32 * SIZE, X + decq I + jg .L11 + ALIGN_3 + +.L12: + andps %xmm15, %xmm4 + addps %xmm4, %xmm0 + andps %xmm15, %xmm5 + addps %xmm5, %xmm1 + + andps %xmm15, %xmm6 + addps %xmm6, %xmm2 + andps %xmm15, %xmm7 + addps %xmm7, %xmm3 + + andps %xmm15, %xmm8 + addps %xmm8, %xmm0 + andps %xmm15, %xmm9 + addps %xmm9, %xmm1 + + andps %xmm15, %xmm10 + addps %xmm10, %xmm2 + andps %xmm15, %xmm11 + addps %xmm11, %xmm3 + + subq $-32 * SIZE, X + ALIGN_3 + +.L14: + testq $16, M + je .L16 + + movaps -32 * SIZE(X), %xmm4 + andps %xmm15, %xmm4 + addps %xmm4, %xmm0 + + movaps -28 * SIZE(X), %xmm5 + andps %xmm15, %xmm5 + addps %xmm5, %xmm1 + + movaps -24 * SIZE(X), %xmm4 + andps %xmm15, %xmm4 + addps %xmm4, %xmm0 + + movaps -20 * SIZE(X), %xmm5 + andps %xmm15, %xmm5 + addps %xmm5, %xmm1 + + addq $16 * SIZE, X + ALIGN_3 + +.L16: + testq $8, M + je .L17 + + movaps -32 * SIZE(X), %xmm4 + andps %xmm15, %xmm4 + addps %xmm4, %xmm0 + + movaps -28 * SIZE(X), %xmm5 + andps %xmm15, %xmm5 + addps %xmm5, %xmm1 + + addq $8 * SIZE, X + ALIGN_3 + +.L17: + testq $4, M + je .L18 + + movaps -32 * SIZE(X), %xmm6 + andps %xmm15, %xmm6 + addps %xmm6, %xmm2 + addq $4 * SIZE, X + ALIGN_3 + +.L18: + testq $2, M + je .L19 + +#ifdef movsd + xorps %xmm7, %xmm7 +#endif + movsd -32 * SIZE(X), %xmm7 + andps %xmm15, %xmm7 + addps %xmm7, %xmm3 + addq $2 * SIZE, X + ALIGN_3 + +.L19: + testq $1, M + je .L998 + + movss -32 * SIZE(X), %xmm6 + andps %xmm15, %xmm6 + addps %xmm6, %xmm2 + jmp .L998 + ALIGN_4 + +.L100: + movq M, I + sarq $3, I + jle .L105 + ALIGN_4 + +.L101: + movss 0 * SIZE(X), %xmm4 + addq INCX, X + andps %xmm15, %xmm4 + addss %xmm4, %xmm0 + + movss 0 * SIZE(X), %xmm5 + addq INCX, X + andps %xmm15, %xmm5 + addss %xmm5, %xmm1 + + movss 0 * SIZE(X), %xmm6 + addq INCX, X + andps %xmm15, %xmm6 + addss %xmm6, %xmm2 + + movss 0 * SIZE(X), %xmm7 + addq INCX, X + andps %xmm15, %xmm7 + addss %xmm7, %xmm3 + + movss 0 * SIZE(X), %xmm8 + addq INCX, X + andps %xmm15, %xmm8 + addss %xmm8, %xmm0 + + movss 0 * SIZE(X), %xmm4 + addq INCX, X + andps %xmm15, %xmm4 + addss %xmm4, %xmm1 + + movss 0 * SIZE(X), %xmm5 + addq INCX, X + andps %xmm15, %xmm5 + addss %xmm5, %xmm2 + + movss 0 * SIZE(X), %xmm6 + addq INCX, X + andps %xmm15, %xmm6 + addss %xmm6, %xmm3 + + decq I + jg .L101 + ALIGN_4 + +.L105: + andq $7, M + jle .L998 + ALIGN_4 + +.L106: + movss 0 * SIZE(X), %xmm4 + andps %xmm15, %xmm4 + addps %xmm4, %xmm0 + addq INCX, X + decq M + jg .L106 + ALIGN_4 + +.L998: + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + +#ifndef HAVE_SSE3 + movhlps %xmm0, %xmm1 + addps %xmm1, %xmm0 + + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + addss %xmm1, %xmm0 +#else + haddps %xmm0, %xmm0 + haddps %xmm0, %xmm0 +#endif + ALIGN_4 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/asum_sse2.S b/kernel/x86_64/asum_sse2.S new file mode 100644 index 0000000..7286fc0 --- /dev/null +++ b/kernel/x86_64/asum_sse2.S @@ -0,0 +1,311 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + xorps %xmm0, %xmm0 + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + pcmpeqb %xmm15, %xmm15 + psrlq $1, %xmm15 + + salq $BASE_SHIFT, INCX + + subq $-16 * SIZE, X + + cmpq $SIZE, INCX + jne .L40 + + testq $SIZE, X + je .L05 + + movsd -16 * SIZE(X), %xmm0 + addq $SIZE, X + + andps %xmm15, %xmm0 + subq $1, M + jle .L999 + ALIGN_3 + +.L05: + movq M, I + sarq $4, I + jle .L20 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + movaps -8 * SIZE(X), %xmm8 + movaps -6 * SIZE(X), %xmm9 + movaps -4 * SIZE(X), %xmm10 + movaps -2 * SIZE(X), %xmm11 + + decq I + jle .L11 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + andps %xmm15, %xmm4 + addpd %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + andps %xmm15, %xmm5 + addpd %xmm5, %xmm1 + movaps 2 * SIZE(X), %xmm5 + + andps %xmm15, %xmm6 + addpd %xmm6, %xmm2 + movaps 4 * SIZE(X), %xmm6 + + andps %xmm15, %xmm7 + addpd %xmm7, %xmm3 + movaps 6 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + andps %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movaps 8 * SIZE(X), %xmm8 + + andps %xmm15, %xmm9 + addpd %xmm9, %xmm1 + movaps 10 * SIZE(X), %xmm9 + + andps %xmm15, %xmm10 + addpd %xmm10, %xmm2 + movaps 12 * SIZE(X), %xmm10 + + andps %xmm15, %xmm11 + addpd %xmm11, %xmm3 + movaps 14 * SIZE(X), %xmm11 + + subq $-16 * SIZE, X + decq I + jg .L10 + ALIGN_4 + +.L11: + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + addpd %xmm6, %xmm2 + addpd %xmm7, %xmm3 + + andps %xmm15, %xmm8 + andps %xmm15, %xmm9 + andps %xmm15, %xmm10 + andps %xmm15, %xmm11 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm1 + addpd %xmm10, %xmm2 + addpd %xmm11, %xmm3 + + subq $-16 * SIZE, X + ALIGN_3 + +.L20: + andq $15, M + jle .L998 + + testq $8, M + je .L21 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + addpd %xmm6, %xmm2 + addpd %xmm7, %xmm3 + addq $8 * SIZE, X + ALIGN_3 + +.L21: + testq $4, M + je .L22 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + + addq $4 * SIZE, X + ALIGN_3 + +.L22: + testq $2, M + je .L23 + + movaps -16 * SIZE(X), %xmm6 + andps %xmm15, %xmm6 + addpd %xmm6, %xmm3 + addq $2 * SIZE, X + +.L23: + testq $1, M + je .L998 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -16 * SIZE(X), %xmm4 + andps %xmm15, %xmm4 + addsd %xmm4, %xmm0 + jmp .L998 + ALIGN_3 + +.L40: + movq M, I + sarq $3, I + jle .L60 + ALIGN_4 + +.L50: + movsd -16 * SIZE(X), %xmm4 + addq INCX, X + movhpd -16 * SIZE(X), %xmm4 + addq INCX, X + andps %xmm15, %xmm4 + addpd %xmm4, %xmm0 + + movsd -16 * SIZE(X), %xmm5 + addq INCX, X + movhpd -16 * SIZE(X), %xmm5 + addq INCX, X + andps %xmm15, %xmm5 + addpd %xmm5, %xmm1 + + movsd -16 * SIZE(X), %xmm6 + addq INCX, X + movhpd -16 * SIZE(X), %xmm6 + addq INCX, X + andps %xmm15, %xmm6 + addpd %xmm6, %xmm2 + + movsd -16 * SIZE(X), %xmm7 + addq INCX, X + movhpd -16 * SIZE(X), %xmm7 + addq INCX, X + andps %xmm15, %xmm7 + addpd %xmm7, %xmm3 + + decq I + jg .L50 + ALIGN_4 + +.L60: +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + andq $7, M + jle .L998 + ALIGN_4 + +.L61: + movsd -16 * SIZE(X), %xmm4 + andps %xmm15, %xmm4 + addpd %xmm4, %xmm0 + addq INCX, X + decq M + jg .L61 + ALIGN_4 + +.L998: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + ALIGN_4 + +.L999: +#ifndef HAVE_SSE3 + movhlps %xmm0, %xmm1 + addsd %xmm1, %xmm0 +#else + haddpd %xmm0, %xmm0 +#endif + + RESTOREREGISTERS + + ret + + EPILOGUE + diff --git a/kernel/x86_64/axpy.S b/kernel/x86_64/axpy.S new file mode 100644 index 0000000..478cc88 --- /dev/null +++ b/kernel/x86_64/axpy.S @@ -0,0 +1,224 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG4 /* rsi */ +#define INCX ARG5 /* rdx */ +#define Y ARG6 /* rcx */ +#define INCY ARG2 /* r8 */ + +#define ALPHA 8(%rsp) + +#include "l1param.h" + + PROLOGUE + PROFCODE + + movq 24(%rsp), INCY + + FLD ALPHA + + salq $BASE_SHIFT, INCX + salq $BASE_SHIFT, INCY + + testq M, M + jle .L40 + + cmpq $SIZE, INCX + jne .L14 + cmpq $SIZE, INCY + jne .L14 + + movq M, %rax + sarq $3, %rax + jle .L15 + ALIGN_3 + +#define PRESIZE 33 + +.L16: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fmul %st(1),%st + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + fmul %st(1),%st + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + FLD 2 * SIZE(X) + fmul %st(1),%st + FLD 2 * SIZE(Y) + faddp %st, %st(1) + FST 2 * SIZE(Y) + + FLD 3 * SIZE(X) + fmul %st(1),%st + FLD 3 * SIZE(Y) + faddp %st, %st(1) + FST 3 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + FLD 4 * SIZE(X) + fmul %st(1),%st + FLD 4 * SIZE(Y) + faddp %st, %st(1) + FST 4 * SIZE(Y) + + FLD 5 * SIZE(X) + fmul %st(1),%st + FLD 5 * SIZE(Y) + faddp %st, %st(1) + FST 5 * SIZE(Y) + + FLD 6 * SIZE(X) + fmul %st(1),%st + FLD 6 * SIZE(Y) + faddp %st, %st(1) + FST 6 * SIZE(Y) + + FLD 7 * SIZE(X) + fmul %st(1),%st + FLD 7 * SIZE(Y) + faddp %st, %st(1) + FST 7 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + decq %rax + jg .L16 + ALIGN_3 + +.L15: + movq M, %rax + andq $7, %rax + jle .L40 + ALIGN_3 + +.L22: + FLD 0 * SIZE(X) + fmul %st(1),%st + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + addq $SIZE, X + addq $SIZE, Y + decq %rax + jg .L22 + jmp .L40 + ALIGN_3 + +.L14: + movq M, %rax + sarq $2, %rax + jle .L28 + ALIGN_3 + +.L29: + FLD (X) + fmul %st(1),%st + FLD (Y) + faddp %st, %st(1) + FST (Y) + addq INCX, X + addq INCY, Y + + FLD (X) + fmul %st(1),%st + FLD (Y) + faddp %st, %st(1) + FST (Y) + addq INCX, X + addq INCY, Y + + FLD (X) + fmul %st(1),%st + FLD (Y) + faddp %st, %st(1) + FST (Y) + addq INCX, X + addq INCY, Y + + FLD (X) + fmul %st(1),%st + FLD (Y) + faddp %st, %st(1) + FST (Y) + addq INCX, X + addq INCY, Y + + decq %rax + jg .L29 + ALIGN_3 + +.L28: + movq M, %rax + andq $3, %rax + jle .L40 + ALIGN_3 + +.L35: + FLD (X) + fmul %st(1),%st + FLD (Y) + faddp %st, %st(1) + FST (Y) + addq INCX, X + addq INCY, Y + + decq %rax + jg .L35 + +.L40: + ffreep %st(0) + ret + + EPILOGUE diff --git a/kernel/x86_64/axpy_atom.S b/kernel/x86_64/axpy_atom.S new file mode 100644 index 0000000..a786329 --- /dev/null +++ b/kernel/x86_64/axpy_atom.S @@ -0,0 +1,555 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#define Y ARG6 +#define INCY ARG2 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#define INCY %r10 +#endif + +#define YY %r11 +#define ALPHA %xmm15 + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifndef WINDOWS_ABI +#ifndef XDOUBLE + movq 8(%rsp), INCY +#else + movq 24(%rsp), INCY +#endif + movaps %xmm0, ALPHA +#else + movaps %xmm3, ALPHA + + movq 40(%rsp), X + movq 48(%rsp), INCX + movq 56(%rsp), Y + movq 64(%rsp), INCY +#endif + + SAVEREGISTERS + + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY + + testq M, M + jle .L29 + + cmpq $SIZE, INCX + jne .L20 + cmpq $SIZE, INCY + jne .L20 + + movq M, %rax + sarq $3, %rax + jle .L13 + + movsd 0 * SIZE(X), %xmm0 + movsd 1 * SIZE(X), %xmm1 + movsd 2 * SIZE(X), %xmm2 + movsd 3 * SIZE(X), %xmm3 + + movsd 0 * SIZE(Y), %xmm4 + movsd 1 * SIZE(Y), %xmm5 + movsd 2 * SIZE(Y), %xmm6 + movsd 3 * SIZE(Y), %xmm7 + + movsd 4 * SIZE(X), %xmm8 + mulsd ALPHA, %xmm0 + movsd 5 * SIZE(X), %xmm9 + mulsd ALPHA, %xmm1 + movsd 6 * SIZE(X), %xmm10 + mulsd ALPHA, %xmm2 + movsd 7 * SIZE(X), %xmm11 + mulsd ALPHA, %xmm3 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + addsd %xmm4, %xmm0 + movsd 4 * SIZE(Y), %xmm4 + addsd %xmm5, %xmm1 + movsd 5 * SIZE(Y), %xmm5 + addsd %xmm6, %xmm2 + movsd 6 * SIZE(Y), %xmm6 + addsd %xmm7, %xmm3 + movsd 7 * SIZE(Y), %xmm7 + + movsd %xmm0, 0 * SIZE(Y) + mulsd ALPHA, %xmm8 + movsd 8 * SIZE(X), %xmm0 + + movsd %xmm1, 1 * SIZE(Y) + mulsd ALPHA, %xmm9 + movsd 9 * SIZE(X), %xmm1 + + movsd %xmm2, 2 * SIZE(Y) + mulsd ALPHA, %xmm10 + movsd 10 * SIZE(X), %xmm2 + + movsd %xmm3, 3 * SIZE(Y) + mulsd ALPHA, %xmm11 + movsd 11 * SIZE(X), %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + addsd %xmm4, %xmm8 + movsd 8 * SIZE(Y), %xmm4 + addsd %xmm5, %xmm9 + movsd 9 * SIZE(Y), %xmm5 + addsd %xmm6, %xmm10 + movsd 10 * SIZE(Y), %xmm6 + addsd %xmm7, %xmm11 + movsd 11 * SIZE(Y), %xmm7 + + movsd %xmm8, 4 * SIZE(Y) + mulsd ALPHA, %xmm0 + movsd 12 * SIZE(X), %xmm8 + + movsd %xmm9, 5 * SIZE(Y) + mulsd ALPHA, %xmm1 + movsd 13 * SIZE(X), %xmm9 + + movsd %xmm10, 6 * SIZE(Y) + mulsd ALPHA, %xmm2 + movsd 14 * SIZE(X), %xmm10 + + movsd %xmm11, 7 * SIZE(Y) + mulsd ALPHA, %xmm3 + movsd 15 * SIZE(X), %xmm11 + + addq $8 * SIZE, Y + addq $8 * SIZE, X + decq %rax + jg .L11 + ALIGN_3 + +.L12: + addsd %xmm4, %xmm0 + movsd 4 * SIZE(Y), %xmm4 + addsd %xmm5, %xmm1 + movsd 5 * SIZE(Y), %xmm5 + addsd %xmm6, %xmm2 + movsd 6 * SIZE(Y), %xmm6 + addsd %xmm7, %xmm3 + movsd 7 * SIZE(Y), %xmm7 + + movsd %xmm0, 0 * SIZE(Y) + mulsd ALPHA, %xmm8 + movsd %xmm1, 1 * SIZE(Y) + mulsd ALPHA, %xmm9 + movsd %xmm2, 2 * SIZE(Y) + mulsd ALPHA, %xmm10 + movsd %xmm3, 3 * SIZE(Y) + mulsd ALPHA, %xmm11 + + addsd %xmm4, %xmm8 + addsd %xmm5, %xmm9 + addsd %xmm6, %xmm10 + addsd %xmm7, %xmm11 + + movsd %xmm8, 4 * SIZE(Y) + movsd %xmm9, 5 * SIZE(Y) + movsd %xmm10, 6 * SIZE(Y) + movsd %xmm11, 7 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L13: + movq M, %rax + andq $4, %rax + jle .L15 + ALIGN_3 + + movsd 0 * SIZE(X), %xmm0 + movsd 1 * SIZE(X), %xmm1 + movsd 2 * SIZE(X), %xmm2 + movsd 3 * SIZE(X), %xmm3 + + movsd 0 * SIZE(Y), %xmm4 + mulsd ALPHA, %xmm0 + movsd 1 * SIZE(Y), %xmm5 + mulsd ALPHA, %xmm1 + movsd 2 * SIZE(Y), %xmm6 + mulsd ALPHA, %xmm2 + movsd 3 * SIZE(Y), %xmm7 + mulsd ALPHA, %xmm3 + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + addsd %xmm6, %xmm2 + addsd %xmm7, %xmm3 + + movsd %xmm0, 0 * SIZE(Y) + movsd %xmm1, 1 * SIZE(Y) + movsd %xmm2, 2 * SIZE(Y) + movsd %xmm3, 3 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L15: + movq M, %rax + andq $2, %rax + jle .L16 + ALIGN_3 + + movsd 0 * SIZE(X), %xmm0 + movsd 0 * SIZE(Y), %xmm4 + movsd 1 * SIZE(X), %xmm1 + movsd 1 * SIZE(Y), %xmm5 + + mulsd ALPHA, %xmm0 + mulsd ALPHA, %xmm1 + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + + movsd %xmm0, 0 * SIZE(Y) + movsd %xmm1, 1 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L16: + movq M, %rax + andq $1, %rax + jle .L19 + ALIGN_3 + + movsd 0 * SIZE(X), %xmm0 + mulsd ALPHA, %xmm0 + addsd 0 * SIZE(Y), %xmm0 + + movsd %xmm0, 0 * SIZE(Y) + addq $SIZE, Y + ALIGN_3 + +.L19: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +.L20: + movq Y, YY + + movq M, %rax + sarq $3, %rax + jle .L23 + + movsd (X), %xmm0 + addq INCX, X + movsd (X), %xmm1 + addq INCX, X + movsd (X), %xmm2 + addq INCX, X + movsd (X), %xmm3 + addq INCX, X + + movsd (Y), %xmm4 + addq INCY, Y + movsd (Y), %xmm5 + addq INCY, Y + movsd (Y), %xmm6 + addq INCY, Y + movsd (Y), %xmm7 + addq INCY, Y + + movsd (X), %xmm8 + addq INCX, X + mulsd ALPHA, %xmm0 + movsd (X), %xmm9 + addq INCX, X + mulsd ALPHA, %xmm1 + movsd (X), %xmm10 + addq INCX, X + mulsd ALPHA, %xmm2 + movsd (X), %xmm11 + addq INCX, X + mulsd ALPHA, %xmm3 + + decq %rax + jle .L22 + ALIGN_3 + +.L21: + addsd %xmm4, %xmm0 + movsd (Y), %xmm4 + addq INCY, Y + addsd %xmm5, %xmm1 + movsd (Y), %xmm5 + addq INCY, Y + + addsd %xmm6, %xmm2 + movsd (Y), %xmm6 + addq INCY, Y + addsd %xmm7, %xmm3 + movsd (Y), %xmm7 + addq INCY, Y + + movsd %xmm0, (YY) + addq INCY, YY + movsd (X), %xmm0 + addq INCX, X + mulsd ALPHA, %xmm8 + + movsd %xmm1, (YY) + addq INCY, YY + movsd (X), %xmm1 + addq INCX, X + mulsd ALPHA, %xmm9 + + movsd %xmm2, (YY) + addq INCY, YY + movsd (X), %xmm2 + addq INCX, X + mulsd ALPHA, %xmm10 + + movsd %xmm3, (YY) + addq INCY, YY + movsd (X), %xmm3 + addq INCX, X + mulsd ALPHA, %xmm11 + + addsd %xmm4, %xmm8 + movsd (Y), %xmm4 + addq INCY, Y + addsd %xmm5, %xmm9 + movsd (Y), %xmm5 + addq INCY, Y + + addsd %xmm6, %xmm10 + movsd (Y), %xmm6 + addq INCY, Y + addsd %xmm7, %xmm11 + movsd (Y), %xmm7 + addq INCY, Y + + movsd %xmm8, (YY) + addq INCY, YY + movsd (X), %xmm8 + addq INCX, X + mulsd ALPHA, %xmm0 + + movsd %xmm9, (YY) + addq INCY, YY + movsd (X), %xmm9 + addq INCX, X + mulsd ALPHA, %xmm1 + + movsd %xmm10, (YY) + addq INCY, YY + movsd (X), %xmm10 + addq INCX, X + mulsd ALPHA, %xmm2 + + movsd %xmm11, (YY) + addq INCY, YY + movsd (X), %xmm11 + addq INCX, X + mulsd ALPHA, %xmm3 + + decq %rax + jg .L21 + ALIGN_3 + +.L22: + addsd %xmm4, %xmm0 + movsd (Y), %xmm4 + addq INCY, Y + addsd %xmm5, %xmm1 + movsd (Y), %xmm5 + addq INCY, Y + addsd %xmm6, %xmm2 + movsd (Y), %xmm6 + addq INCY, Y + addsd %xmm7, %xmm3 + movsd (Y), %xmm7 + addq INCY, Y + + movsd %xmm0, (YY) + addq INCY, YY + mulsd ALPHA, %xmm8 + + movsd %xmm1, (YY) + addq INCY, YY + mulsd ALPHA, %xmm9 + + movsd %xmm2, (YY) + addq INCY, YY + mulsd ALPHA, %xmm10 + + movsd %xmm3, (YY) + addq INCY, YY + mulsd ALPHA, %xmm11 + + addsd %xmm4, %xmm8 + addsd %xmm5, %xmm9 + addsd %xmm6, %xmm10 + addsd %xmm7, %xmm11 + + movsd %xmm8, (YY) + addq INCY, YY + movsd %xmm9, (YY) + addq INCY, YY + movsd %xmm10, (YY) + addq INCY, YY + movsd %xmm11, (YY) + addq INCY, YY + ALIGN_3 + +.L23: + movq M, %rax + andq $4, %rax + jle .L25 + ALIGN_3 + + movsd (X), %xmm0 + addq INCX, X + movsd (Y), %xmm4 + addq INCY, Y + movsd (X), %xmm1 + addq INCX, X + movsd (Y), %xmm5 + addq INCY, Y + + movsd (X), %xmm2 + addq INCX, X + mulsd ALPHA, %xmm0 + movsd (Y), %xmm6 + addq INCY, Y + mulsd ALPHA, %xmm1 + movsd (X), %xmm3 + addq INCX, X + mulsd ALPHA, %xmm2 + movsd (Y), %xmm7 + addq INCY, Y + mulsd ALPHA, %xmm3 + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + addsd %xmm6, %xmm2 + addsd %xmm7, %xmm3 + + movsd %xmm0, (YY) + addq INCY, YY + movsd %xmm1, (YY) + addq INCY, YY + movsd %xmm2, (YY) + addq INCY, YY + movsd %xmm3, (YY) + addq INCY, YY + ALIGN_3 + +.L25: + movq M, %rax + andq $2, %rax + jle .L26 + ALIGN_3 + + movsd (X), %xmm0 + addq INCX, X + movsd (Y), %xmm4 + addq INCY, Y + movsd (X), %xmm1 + addq INCX, X + movsd (Y), %xmm5 + addq INCY, Y + + mulsd ALPHA, %xmm0 + mulsd ALPHA, %xmm1 + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + + movsd %xmm0, (YY) + addq INCY, YY + movsd %xmm1, (YY) + addq INCY, YY + ALIGN_3 + +.L26: + movq M, %rax + andq $1, %rax + jle .L29 + ALIGN_3 + + movsd (X), %xmm0 + mulsd ALPHA, %xmm0 + addsd (Y), %xmm0 + + movsd %xmm0, (YY) + addq $SIZE, Y + ALIGN_3 + +.L29: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/axpy_sse.S b/kernel/x86_64/axpy_sse.S new file mode 100644 index 0000000..23c2ec5 --- /dev/null +++ b/kernel/x86_64/axpy_sse.S @@ -0,0 +1,1576 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#define Y ARG6 +#define INCY ARG2 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#define INCY %r10 +#endif + +#define YY %r11 +#define ALPHA %xmm15 + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifndef WINDOWS_ABI +#ifndef XDOUBLE + movq 8(%rsp), INCY +#else + movq 24(%rsp), INCY +#endif + movaps %xmm0, ALPHA +#else + movaps %xmm3, ALPHA + + movq 40(%rsp), X + movq 48(%rsp), INCX + movq 56(%rsp), Y + movq 64(%rsp), INCY +#endif + + SAVEREGISTERS + + shufps $0, ALPHA, ALPHA + + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY + + testq M, M + jle .L19 + + cmpq $SIZE, INCX + jne .L50 + cmpq $SIZE, INCY + jne .L50 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + cmpq $3, M + jle .L16 + + testq $SIZE, Y + je .L00 + + movss -32 * SIZE(X), %xmm0 + mulss ALPHA, %xmm0 + addss -32 * SIZE(Y), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq M + jle .L19 + ALIGN_3 + +.L00: + testq $SIZE * 2, Y + je .L10 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm4 + mulps ALPHA, %xmm0 + addps %xmm4, %xmm0 + movsd %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + subq $2, M + jle .L19 + ALIGN_3 + +.L10: + testq $SIZE * 3, X + jne .L20 + + movq M, %rax + sarq $5, %rax + jle .L13 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + decq %rax + jle .L12 + ALIGN_4 + +.L11: + movaps -16 * SIZE(X), %xmm4 + movaps -12 * SIZE(X), %xmm5 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -8 * SIZE(X), %xmm6 + movaps -4 * SIZE(X), %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps 0 * SIZE(X), %xmm0 + movaps 4 * SIZE(X), %xmm1 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulps ALPHA, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + mulps ALPHA, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movaps 8 * SIZE(X), %xmm2 + movaps 12 * SIZE(X), %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulps ALPHA, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + mulps ALPHA, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L11 + ALIGN_3 + +.L12: + movaps -16 * SIZE(X), %xmm4 + movaps -12 * SIZE(X), %xmm5 + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -8 * SIZE(X), %xmm6 + movaps -4 * SIZE(X), %xmm7 + + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + mulps ALPHA, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + mulps ALPHA, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + mulps ALPHA, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + mulps ALPHA, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L13: + movq M, %rax + andq $16, %rax + jle .L14 + ALIGN_3 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L14: + movq M, %rax + andq $8, %rax + jle .L15 + ALIGN_3 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L15: + movq M, %rax + andq $4, %rax + jle .L16 + ALIGN_3 + + movaps -32 * SIZE(X), %xmm0 + + mulps ALPHA, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L16: + movq M, %rax + andq $2, %rax + jle .L17 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm4 + + mulps ALPHA, %xmm0 + addps %xmm4, %xmm0 + + movsd %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L17: + movq M, %rax + andq $1, %rax + jle .L19 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + mulss ALPHA, %xmm0 + addss -32 * SIZE(Y), %xmm0 + + movss %xmm0, -32 * SIZE(Y) + ALIGN_3 + +.L19: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +.L20: + +#ifdef ALIGNED_ACCESS + + testq $SIZE, X + jne .L30 + + movhps -32 * SIZE(X), %xmm0 + + movq M, %rax + sarq $5, %rax + jle .L23 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + movaps -22 * SIZE(X), %xmm3 + movaps -18 * SIZE(X), %xmm4 + + decq %rax + jle .L22 + ALIGN_4 + +.L21: + movaps -14 * SIZE(X), %xmm5 + movaps -10 * SIZE(X), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + SHUFPD_1 %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -6 * SIZE(X), %xmm7 + movaps -2 * SIZE(X), %xmm0 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm3, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + SHUFPD_1 %xmm4, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps 2 * SIZE(X), %xmm1 + movaps 6 * SIZE(X), %xmm2 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm5, %xmm4 + mulps ALPHA, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + SHUFPD_1 %xmm6, %xmm5 + mulps ALPHA, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movaps 10 * SIZE(X), %xmm3 + movaps 14 * SIZE(X), %xmm4 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm7, %xmm6 + mulps ALPHA, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + SHUFPD_1 %xmm0, %xmm7 + mulps ALPHA, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L21 + ALIGN_3 + +.L22: + movaps -14 * SIZE(X), %xmm5 + movaps -10 * SIZE(X), %xmm6 + + SHUFPD_1 %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + SHUFPD_1 %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -6 * SIZE(X), %xmm7 + movaps -2 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm3, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + SHUFPD_1 %xmm4, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + SHUFPD_1 %xmm5, %xmm4 + mulps ALPHA, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + SHUFPD_1 %xmm6, %xmm5 + mulps ALPHA, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + SHUFPD_1 %xmm7, %xmm6 + mulps ALPHA, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + SHUFPD_1 %xmm0, %xmm7 + mulps ALPHA, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L23: + movq M, %rax + andq $16, %rax + jle .L24 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + movaps -22 * SIZE(X), %xmm3 + movaps -18 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm1, %xmm0 + SHUFPD_1 %xmm2, %xmm1 + SHUFPD_1 %xmm3, %xmm2 + SHUFPD_1 %xmm4, %xmm3 + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L24: + movq M, %rax + andq $8, %rax + jle .L25 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + SHUFPD_1 %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L25: + movq M, %rax + andq $4, %rax + jle .L26 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm1, %xmm0 + mulps ALPHA, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L26: + movq M, %rax + andq $2, %rax + jle .L27 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm4 + + mulps ALPHA, %xmm0 + addps %xmm4, %xmm0 + + movsd %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L27: + movq M, %rax + andq $1, %rax + jle .L29 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + mulss ALPHA, %xmm0 + addss -32 * SIZE(Y), %xmm0 + + movss %xmm0, -32 * SIZE(Y) + addq $SIZE, Y + ALIGN_3 + +.L29: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +.L30: + testq $2 * SIZE, X + jne .L40 + + movaps -33 * SIZE(X), %xmm0 + + movq M, %rax + sarq $5, %rax + jle .L33 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + movaps -21 * SIZE(X), %xmm3 + movaps -17 * SIZE(X), %xmm4 + + decq %rax + jle .L32 + ALIGN_4 + +.L31: + movaps -13 * SIZE(X), %xmm5 + movaps -9 * SIZE(X), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -5 * SIZE(X), %xmm7 + movaps -1 * SIZE(X), %xmm0 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps 3 * SIZE(X), %xmm1 + movaps 7 * SIZE(X), %xmm2 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm5, %xmm4 + SHUFPS_39 %xmm4, %xmm4 + mulps ALPHA, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + SHUFPS_39 %xmm5, %xmm5 + mulps ALPHA, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movaps 11 * SIZE(X), %xmm3 + movaps 15 * SIZE(X), %xmm4 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + SHUFPS_39 %xmm6, %xmm6 + mulps ALPHA, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + SHUFPS_39 %xmm7, %xmm7 + mulps ALPHA, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L31 + ALIGN_3 + +.L32: + movaps -13 * SIZE(X), %xmm5 + movaps -9 * SIZE(X), %xmm6 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -5 * SIZE(X), %xmm7 + movaps -1 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movss %xmm5, %xmm4 + SHUFPS_39 %xmm4, %xmm4 + mulps ALPHA, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + SHUFPS_39 %xmm5, %xmm5 + mulps ALPHA, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movss %xmm7, %xmm6 + SHUFPS_39 %xmm6, %xmm6 + mulps ALPHA, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + SHUFPS_39 %xmm7, %xmm7 + mulps ALPHA, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L33: + movq M, %rax + andq $16, %rax + jle .L34 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + movaps -21 * SIZE(X), %xmm3 + movaps -17 * SIZE(X), %xmm4 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + + movss %xmm4, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L34: + movq M, %rax + andq $8, %rax + jle .L35 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L35: + movq M, %rax + andq $4, %rax + jle .L36 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + mulps ALPHA, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L36: + movq M, %rax + andq $2, %rax + jle .L37 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm4 + + mulps ALPHA, %xmm0 + addps %xmm4, %xmm0 + + movsd %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L37: + movq M, %rax + andq $1, %rax + jle .L39 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + mulss ALPHA, %xmm0 + addss -32 * SIZE(Y), %xmm0 + + movss %xmm0, -32 * SIZE(Y) + addq $SIZE, Y + ALIGN_3 + +.L39: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +.L40: + movaps -35 * SIZE(X), %xmm0 + + movq M, %rax + sarq $5, %rax + jle .L43 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + movaps -23 * SIZE(X), %xmm3 + movaps -19 * SIZE(X), %xmm4 + + decq %rax + jle .L42 + ALIGN_4 + +.L41: + movaps -15 * SIZE(X), %xmm5 + movaps -11 * SIZE(X), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -7 * SIZE(X), %xmm7 + movaps -3 * SIZE(X), %xmm0 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps 1 * SIZE(X), %xmm1 + movaps 5 * SIZE(X), %xmm2 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps ALPHA, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps ALPHA, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movaps 9 * SIZE(X), %xmm3 + movaps 13 * SIZE(X), %xmm4 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + mulps ALPHA, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + mulps ALPHA, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L41 + ALIGN_3 + +.L42: + movaps -15 * SIZE(X), %xmm5 + movaps -11 * SIZE(X), %xmm6 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -7 * SIZE(X), %xmm7 + movaps -3 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps ALPHA, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps ALPHA, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + mulps ALPHA, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + mulps ALPHA, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L43: + movq M, %rax + andq $16, %rax + jle .L44 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + movaps -23 * SIZE(X), %xmm3 + movaps -19 * SIZE(X), %xmm4 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L44: + movq M, %rax + andq $8, %rax + jle .L45 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L45: + movq M, %rax + andq $4, %rax + jle .L46 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + mulps ALPHA, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L46: + movq M, %rax + andq $2, %rax + jle .L47 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm4 + + mulps ALPHA, %xmm0 + addps %xmm4, %xmm0 + + movsd %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L47: + movq M, %rax + andq $1, %rax + jle .L49 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + mulss ALPHA, %xmm0 + addss -32 * SIZE(Y), %xmm0 + + movss %xmm0, -32 * SIZE(Y) + addq $SIZE, Y + ALIGN_3 + +.L49: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + +#else + + movq M, %rax + sarq $5, %rax + jle .L23 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + movsd -24 * SIZE(X), %xmm2 + movhps -22 * SIZE(X), %xmm2 + movsd -20 * SIZE(X), %xmm3 + movhps -18 * SIZE(X), %xmm3 + + decq %rax + jle .L22 + ALIGN_4 + +.L21: + movsd -16 * SIZE(X), %xmm4 + movhps -14 * SIZE(X), %xmm4 + movsd -12 * SIZE(X), %xmm5 + movhps -10 * SIZE(X), %xmm5 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movsd -8 * SIZE(X), %xmm6 + movhps -6 * SIZE(X), %xmm6 + movsd -4 * SIZE(X), %xmm7 + movhps -2 * SIZE(X), %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movsd 0 * SIZE(X), %xmm0 + movhps 2 * SIZE(X), %xmm0 + movsd 4 * SIZE(X), %xmm1 + movhps 6 * SIZE(X), %xmm1 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulps ALPHA, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + mulps ALPHA, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movsd 8 * SIZE(X), %xmm2 + movhps 10 * SIZE(X), %xmm2 + movsd 12 * SIZE(X), %xmm3 + movhps 14 * SIZE(X), %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulps ALPHA, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + mulps ALPHA, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L21 + ALIGN_3 + +.L22: + movsd -16 * SIZE(X), %xmm4 + movhps -14 * SIZE(X), %xmm4 + movsd -12 * SIZE(X), %xmm5 + movhps -10 * SIZE(X), %xmm5 + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movsd -8 * SIZE(X), %xmm6 + movhps -6 * SIZE(X), %xmm6 + movsd -4 * SIZE(X), %xmm7 + movhps -2 * SIZE(X), %xmm7 + + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + mulps ALPHA, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + mulps ALPHA, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + mulps ALPHA, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + mulps ALPHA, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L23: + movq M, %rax + andq $16, %rax + jle .L24 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movsd -24 * SIZE(X), %xmm2 + movhps -22 * SIZE(X), %xmm2 + movsd -20 * SIZE(X), %xmm3 + movhps -18 * SIZE(X), %xmm3 + + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L24: + movq M, %rax + andq $8, %rax + jle .L25 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L25: + movq M, %rax + andq $4, %rax + jle .L26 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + + mulps ALPHA, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L26: + movq M, %rax + andq $2, %rax + jle .L27 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm4 + + mulps ALPHA, %xmm0 + addps %xmm4, %xmm0 + + movsd %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L27: + movq M, %rax + andq $1, %rax + jle .L29 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + mulss ALPHA, %xmm0 + addss -32 * SIZE(Y), %xmm0 + + movss %xmm0, -32 * SIZE(Y) + addq $SIZE, Y + ALIGN_3 + +.L29: + xorq %rax,%rax + + RESTOREREGISTERS + + ret +#endif + ALIGN_3 + + +.L50: + movq M, %rax + movq Y, YY + sarq $3, %rax + jle .L55 + ALIGN_3 + +.L51: + movss (X), %xmm0 + addq INCX, X + mulss ALPHA, %xmm0 + movss (YY), %xmm6 + addq INCY, YY + addss %xmm6, %xmm0 + + movss (X), %xmm1 + addq INCX, X + mulss ALPHA, %xmm1 + movss (YY), %xmm6 + addq INCY, YY + addss %xmm6, %xmm1 + + movss (X), %xmm2 + addq INCX, X + mulss ALPHA, %xmm2 + movss (YY), %xmm6 + addq INCY, YY + addss %xmm6, %xmm2 + + movss (X), %xmm3 + addq INCX, X + mulss ALPHA, %xmm3 + movss (YY), %xmm6 + addq INCY, YY + addss %xmm6, %xmm3 + + movss %xmm0, (Y) + addq INCY, Y + movss %xmm1, (Y) + addq INCY, Y + movss %xmm2, (Y) + addq INCY, Y + movss %xmm3, (Y) + addq INCY, Y + + movss (X), %xmm0 + addq INCX, X + mulss ALPHA, %xmm0 + movss (YY), %xmm6 + addq INCY, YY + addss %xmm6, %xmm0 + + movss (X), %xmm1 + addq INCX, X + mulss ALPHA, %xmm1 + movss (YY), %xmm6 + addq INCY, YY + addss %xmm6, %xmm1 + + movss (X), %xmm2 + addq INCX, X + mulss ALPHA, %xmm2 + movss (YY), %xmm6 + addq INCY, YY + addss %xmm6, %xmm2 + + movss (X), %xmm3 + addq INCX, X + mulss ALPHA, %xmm3 + movss (YY), %xmm6 + addq INCY, YY + addss %xmm6, %xmm3 + + movss %xmm0, (Y) + addq INCY, Y + movss %xmm1, (Y) + addq INCY, Y + movss %xmm2, (Y) + addq INCY, Y + movss %xmm3, (Y) + addq INCY, Y + + decq %rax + jg .L51 + ALIGN_3 + +.L55: + movq M, %rax + andq $7, %rax + jle .L59 + ALIGN_3 + +.L56: + movss (X), %xmm0 + addq INCX, X + mulss ALPHA, %xmm0 + movss (Y), %xmm6 + addss %xmm6, %xmm0 + movss %xmm0, (Y) + addq INCY, Y + decq %rax + jg .L56 + ALIGN_3 + +.L59: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + + + EPILOGUE diff --git a/kernel/x86_64/axpy_sse2.S b/kernel/x86_64/axpy_sse2.S new file mode 100644 index 0000000..5546029 --- /dev/null +++ b/kernel/x86_64/axpy_sse2.S @@ -0,0 +1,906 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#define Y ARG6 +#define INCY ARG2 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#define INCY %r10 +#endif + +#define YY %r11 +#define ALPHA %xmm15 + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifndef WINDOWS_ABI +#ifndef XDOUBLE + movq 8(%rsp), INCY +#else + movq 24(%rsp), INCY +#endif + movaps %xmm0, ALPHA +#else + movaps %xmm3, ALPHA + + movq 40(%rsp), X + movq 48(%rsp), INCX + movq 56(%rsp), Y + movq 64(%rsp), INCY +#endif + + SAVEREGISTERS + + unpcklpd ALPHA, ALPHA + + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY + + testq M, M + jle .L47 + + cmpq $SIZE, INCX + jne .L40 + cmpq $SIZE, INCY + jne .L40 + + testq $SIZE, Y + je .L10 + + movsd (X), %xmm0 + mulsd ALPHA, %xmm0 + addsd (Y), %xmm0 + movsd %xmm0, (Y) + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq M + jle .L19 + ALIGN_4 + +.L10: + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + testq $SIZE, X + jne .L20 + + movq M, %rax + sarq $4, %rax + jle .L13 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: + movaps -8 * SIZE(X), %xmm4 + movaps -6 * SIZE(X), %xmm5 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movaps -4 * SIZE(X), %xmm6 + movaps -2 * SIZE(X), %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + movaps 0 * SIZE(X), %xmm0 + movaps 2 * SIZE(X), %xmm1 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulpd ALPHA, %xmm4 + addpd -8 * SIZE(Y), %xmm4 + movaps %xmm4, -8 * SIZE(Y) + + mulpd ALPHA, %xmm5 + addpd -6 * SIZE(Y), %xmm5 + movaps %xmm5, -6 * SIZE(Y) + + movaps 4 * SIZE(X), %xmm2 + movaps 6 * SIZE(X), %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulpd ALPHA, %xmm6 + addpd -4 * SIZE(Y), %xmm6 + movaps %xmm6, -4 * SIZE(Y) + + mulpd ALPHA, %xmm7 + addpd -2 * SIZE(Y), %xmm7 + movaps %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + decq %rax + jg .L11 + ALIGN_3 + +.L12: + movaps -8 * SIZE(X), %xmm4 + movaps -6 * SIZE(X), %xmm5 + + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movaps -4 * SIZE(X), %xmm6 + movaps -2 * SIZE(X), %xmm7 + + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + mulpd ALPHA, %xmm4 + addpd -8 * SIZE(Y), %xmm4 + movaps %xmm4, -8 * SIZE(Y) + + mulpd ALPHA, %xmm5 + addpd -6 * SIZE(Y), %xmm5 + movaps %xmm5, -6 * SIZE(Y) + + mulpd ALPHA, %xmm6 + addpd -4 * SIZE(Y), %xmm6 + movaps %xmm6, -4 * SIZE(Y) + + mulpd ALPHA, %xmm7 + addpd -2 * SIZE(Y), %xmm7 + movaps %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + ALIGN_3 + +.L13: + movq M, %rax + andq $8, %rax + jle .L14 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, -12 * SIZE(Y) + movaps %xmm3, -10 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L14: + movq M, %rax + andq $4, %rax + jle .L15 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + + mulpd ALPHA, %xmm0 + mulpd ALPHA, %xmm1 + + addpd -16 * SIZE(Y), %xmm0 + addpd -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L15: + movq M, %rax + andq $2, %rax + jle .L16 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L16: + movq M, %rax + andq $1, %rax + jle .L19 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + mulsd ALPHA, %xmm0 + addsd -16 * SIZE(Y), %xmm0 + + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L19: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +.L20: +#ifdef ALIGNED_ACCESS + + movhps -16 * SIZE(X), %xmm0 + + movq M, %rax + sarq $4, %rax + jle .L23 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + movaps -11 * SIZE(X), %xmm3 + + decq %rax + jle .L22 + ALIGN_4 + +.L21: + movaps -9 * SIZE(X), %xmm4 + movaps -7 * SIZE(X), %xmm5 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm1, %xmm0 + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + SHUFPD_1 %xmm2, %xmm1 + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movaps -5 * SIZE(X), %xmm6 + movaps -3 * SIZE(X), %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm3, %xmm2 + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + SHUFPD_1 %xmm4, %xmm3 + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + movaps -1 * SIZE(X), %xmm0 + movaps 1 * SIZE(X), %xmm1 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm5, %xmm4 + mulpd ALPHA, %xmm4 + addpd -8 * SIZE(Y), %xmm4 + movaps %xmm4, -8 * SIZE(Y) + + SHUFPD_1 %xmm6, %xmm5 + mulpd ALPHA, %xmm5 + addpd -6 * SIZE(Y), %xmm5 + movaps %xmm5, -6 * SIZE(Y) + + movaps 3 * SIZE(X), %xmm2 + movaps 5 * SIZE(X), %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm7, %xmm6 + mulpd ALPHA, %xmm6 + addpd -4 * SIZE(Y), %xmm6 + movaps %xmm6, -4 * SIZE(Y) + + SHUFPD_1 %xmm0, %xmm7 + mulpd ALPHA, %xmm7 + addpd -2 * SIZE(Y), %xmm7 + movaps %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + decq %rax + jg .L21 + ALIGN_3 + +.L22: + movaps -9 * SIZE(X), %xmm4 + movaps -7 * SIZE(X), %xmm5 + + SHUFPD_1 %xmm1, %xmm0 + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -1 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm2, %xmm1 + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movaps -5 * SIZE(X), %xmm6 + movaps -3 * SIZE(X), %xmm7 + + SHUFPD_1 %xmm3, %xmm2 + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + SHUFPD_1 %xmm4, %xmm3 + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + SHUFPD_1 %xmm5, %xmm4 + mulpd ALPHA, %xmm4 + addpd -8 * SIZE(Y), %xmm4 + movaps %xmm4, -8 * SIZE(Y) + + SHUFPD_1 %xmm6, %xmm5 + mulpd ALPHA, %xmm5 + addpd -6 * SIZE(Y), %xmm5 + movaps %xmm5, -6 * SIZE(Y) + + SHUFPD_1 %xmm7, %xmm6 + mulpd ALPHA, %xmm6 + addpd -4 * SIZE(Y), %xmm6 + movaps %xmm6, -4 * SIZE(Y) + + SHUFPD_1 %xmm0, %xmm7 + mulpd ALPHA, %xmm7 + addpd -2 * SIZE(Y), %xmm7 + movaps %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + +.L23: + movq M, %rax + andq $8, %rax + jle .L24 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + movaps -11 * SIZE(X), %xmm3 + movaps -9 * SIZE(X), %xmm8 + + SHUFPD_1 %xmm1, %xmm0 + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + SHUFPD_1 %xmm2, %xmm1 + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + SHUFPD_1 %xmm3, %xmm2 + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + SHUFPD_1 %xmm8, %xmm3 + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + movaps %xmm8, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L24: + movq M, %rax + andq $4, %rax + jle .L25 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm1, %xmm0 + SHUFPD_1 %xmm2, %xmm1 + + mulpd ALPHA, %xmm0 + mulpd ALPHA, %xmm1 + + addpd -16 * SIZE(Y), %xmm0 + addpd -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, %xmm0 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L25: + movq M, %rax + andq $2, %rax + jle .L26 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + SHUFPD_1 %xmm1, %xmm0 + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + + movaps %xmm0, -16 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L26: + movq M, %rax + andq $1, %rax + jle .L29 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + mulsd ALPHA, %xmm0 + addsd -16 * SIZE(Y), %xmm0 + + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L29: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +#else + movq M, %rax + sarq $4, %rax + jle .L23 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + movsd -12 * SIZE(X), %xmm2 + movhps -11 * SIZE(X), %xmm2 + movsd -10 * SIZE(X), %xmm3 + movhps -9 * SIZE(X), %xmm3 + + decq %rax + jle .L22 + ALIGN_3 + +.L21: + movsd -8 * SIZE(X), %xmm4 + movhps -7 * SIZE(X), %xmm4 + movsd -6 * SIZE(X), %xmm5 + movhps -5 * SIZE(X), %xmm5 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movsd -4 * SIZE(X), %xmm6 + movhps -3 * SIZE(X), %xmm6 + movsd -2 * SIZE(X), %xmm7 + movhps -1 * SIZE(X), %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + movsd 2 * SIZE(X), %xmm1 + movhps 3 * SIZE(X), %xmm1 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulpd ALPHA, %xmm4 + addpd -8 * SIZE(Y), %xmm4 + movaps %xmm4, -8 * SIZE(Y) + + mulpd ALPHA, %xmm5 + addpd -6 * SIZE(Y), %xmm5 + movaps %xmm5, -6 * SIZE(Y) + + movsd 4 * SIZE(X), %xmm2 + movhps 5 * SIZE(X), %xmm2 + movsd 6 * SIZE(X), %xmm3 + movhps 7 * SIZE(X), %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulpd ALPHA, %xmm6 + addpd -4 * SIZE(Y), %xmm6 + movaps %xmm6, -4 * SIZE(Y) + + mulpd ALPHA, %xmm7 + addpd -2 * SIZE(Y), %xmm7 + movaps %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + decq %rax + jg .L21 + ALIGN_3 + +.L22: + movsd -8 * SIZE(X), %xmm4 + movhps -7 * SIZE(X), %xmm4 + movsd -6 * SIZE(X), %xmm5 + movhps -5 * SIZE(X), %xmm5 + + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movsd -4 * SIZE(X), %xmm6 + movhps -3 * SIZE(X), %xmm6 + movsd -2 * SIZE(X), %xmm7 + movhps -1 * SIZE(X), %xmm7 + + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + mulpd ALPHA, %xmm4 + addpd -8 * SIZE(Y), %xmm4 + movaps %xmm4, -8 * SIZE(Y) + + mulpd ALPHA, %xmm5 + addpd -6 * SIZE(Y), %xmm5 + movaps %xmm5, -6 * SIZE(Y) + + mulpd ALPHA, %xmm6 + addpd -4 * SIZE(Y), %xmm6 + movaps %xmm6, -4 * SIZE(Y) + + mulpd ALPHA, %xmm7 + addpd -2 * SIZE(Y), %xmm7 + movaps %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + ALIGN_3 + +.L23: + movq M, %rax + andq $8, %rax + jle .L24 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + movsd -12 * SIZE(X), %xmm2 + movhps -11 * SIZE(X), %xmm2 + movsd -10 * SIZE(X), %xmm3 + movhps -9 * SIZE(X), %xmm3 + + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, -12 * SIZE(Y) + movaps %xmm3, -10 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L24: + movq M, %rax + andq $4, %rax + jle .L25 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + + mulpd ALPHA, %xmm0 + mulpd ALPHA, %xmm1 + + addpd -16 * SIZE(Y), %xmm0 + addpd -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L25: + movq M, %rax + andq $2, %rax + jle .L26 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L26: + movq M, %rax + andq $1, %rax + jle .L29 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + mulsd ALPHA, %xmm0 + addsd -16 * SIZE(Y), %xmm0 + + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L29: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 +#endif + +.L40: + movq Y, YY + movq M, %rax + sarq $3, %rax + jle .L45 + ALIGN_3 + +.L41: + movsd 0 * SIZE(X), %xmm0 + addq INCX, X + movhpd 0 * SIZE(X), %xmm0 + addq INCX, X + mulpd ALPHA, %xmm0 + + movsd 0 * SIZE(YY), %xmm6 + addq INCY, YY + movhpd 0 * SIZE(YY), %xmm6 + addq INCY, YY + addpd %xmm6, %xmm0 + + movsd 0 * SIZE(X), %xmm1 + addq INCX, X + movhpd 0 * SIZE(X), %xmm1 + addq INCX, X + mulpd ALPHA, %xmm1 + + movsd 0 * SIZE(YY), %xmm6 + addq INCY, YY + movhpd 0 * SIZE(YY), %xmm6 + addq INCY, YY + addpd %xmm6, %xmm1 + + movsd 0 * SIZE(X), %xmm2 + addq INCX, X + movhpd 0 * SIZE(X), %xmm2 + addq INCX, X + mulpd ALPHA, %xmm2 + + movsd 0 * SIZE(YY), %xmm6 + addq INCY, YY + movhpd 0 * SIZE(YY), %xmm6 + addq INCY, YY + addpd %xmm6, %xmm2 + + movsd 0 * SIZE(X), %xmm3 + addq INCX, X + movhpd 0 * SIZE(X), %xmm3 + addq INCX, X + mulpd ALPHA, %xmm3 + + movsd 0 * SIZE(YY), %xmm6 + addq INCY, YY + movhpd 0 * SIZE(YY), %xmm6 + addq INCY, YY + addpd %xmm6, %xmm3 + + movsd %xmm0, 0 * SIZE(Y) + addq INCY, Y + movhpd %xmm0, 0 * SIZE(Y) + addq INCY, Y + movsd %xmm1, 0 * SIZE(Y) + addq INCY, Y + movhpd %xmm1, 0 * SIZE(Y) + addq INCY, Y + movsd %xmm2, 0 * SIZE(Y) + addq INCY, Y + movhpd %xmm2, 0 * SIZE(Y) + addq INCY, Y + movsd %xmm3, 0 * SIZE(Y) + addq INCY, Y + movhpd %xmm3, 0 * SIZE(Y) + addq INCY, Y + + decq %rax + jg .L41 + ALIGN_3 + +.L45: + movq M, %rax + andq $7, %rax + jle .L47 + ALIGN_3 + +.L46: + movsd (X), %xmm0 + addq INCX, X + mulsd %xmm15, %xmm0 + addsd (Y), %xmm0 + movsd %xmm0, (Y) + addq INCY, Y + decq %rax + jg .L46 + ALIGN_3 + +.L47: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/builtin_stinit.S b/kernel/x86_64/builtin_stinit.S new file mode 100644 index 0000000..c05a1c5 --- /dev/null +++ b/kernel/x86_64/builtin_stinit.S @@ -0,0 +1,61 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + PROFCODE + + cmpq $4096, %rax + jle .L999 + ALIGN_3 + +.L01: + subq $4096, %rax + subq $4096, %rsp + movq $0, (%rsp) + cmpq $4096, %rax + jg .L01 + ALIGN_3 + +.L999: + subq %rax, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/cabs.S b/kernel/x86_64/cabs.S new file mode 100644 index 0000000..0b1a911 --- /dev/null +++ b/kernel/x86_64/cabs.S @@ -0,0 +1,70 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + PROFCODE + +#ifdef DOUBLE + movsd 0 * SIZE(ARG1), %xmm0 + movsd 1 * SIZE(ARG1), %xmm1 + pcmpeqb %xmm4, %xmm4 + + psrlq $1, %xmm4 + andpd %xmm4, %xmm0 + andpd %xmm4, %xmm1 + addpd %xmm1, %xmm0 +#else + movss 0 * SIZE(ARG1), %xmm0 + movss 1 * SIZE(ARG1), %xmm1 + pcmpeqb %xmm4, %xmm4 + + psrld $1, %xmm4 + andps %xmm4, %xmm0 + andps %xmm4, %xmm1 + addps %xmm1, %xmm0 +#endif + +#if !defined(DOUBLE) && defined(NEED_F2CCONV) + cvtss2sd %xmm0, %xmm0 +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/cgemv_n.S b/kernel/x86_64/cgemv_n.S new file mode 100644 index 0000000..77e9b3d --- /dev/null +++ b/kernel/x86_64/cgemv_n.S @@ -0,0 +1,4302 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#if GEMV_UNROLL < 2 +#undef GEMV_UNROLL +#define GEMV_UNROLL 2 +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_INCX 8 + STACKSIZE(%rsp) +#define OLD_Y 16 + STACKSIZE(%rsp) +#define OLD_INCY 24 + STACKSIZE(%rsp) +#define OLD_BUFFER 32 + STACKSIZE(%rsp) +#define ALPHA 48 (%rsp) + +#define M %rdi +#define N %rsi +#define A %rcx +#define LDA %r8 +#define X %r9 +#define INCX %rdx +#define Y %rbp +#define INCY %r10 + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_LDA 56 + STACKSIZE(%rsp) +#define OLD_X 64 + STACKSIZE(%rsp) +#define OLD_INCX 72 + STACKSIZE(%rsp) +#define OLD_Y 80 + STACKSIZE(%rsp) +#define OLD_INCY 88 + STACKSIZE(%rsp) +#define OLD_BUFFER 96 + STACKSIZE(%rsp) +#define ALPHA 224 (%rsp) + +#define M %rcx +#define N %rdx +#define A %r8 +#define LDA %r9 +#define X %rdi +#define INCX %rsi +#define Y %rbp +#define INCY %r10 + +#endif + +#define I %rax +#define A1 %r11 +#define A2 %r12 + +#define Y1 %r13 +#define BUFFER %r14 + +#ifdef ALIGNED_ACCESS +#define MM %r15 +#else +#define MM M +#endif + +#undef SUBPS + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) +#define SUBPS subps +#else +#define SUBPS addps +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X + + movaps %xmm3, %xmm0 + movss OLD_ALPHA_I, %xmm1 +#endif + + movq OLD_INCX, INCX + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + salq $ZBASE_SHIFT, LDA + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + unpcklps %xmm1, %xmm0 + + movlps %xmm0, ALPHA + + testq M, M + jle .L999 + testq N, N + jle .L999 + ALIGN_3 + + subq $-32 * SIZE, A + + movq BUFFER, Y1 + + pxor %xmm4, %xmm4 + + movq M, %rax + addq $8, %rax + sarq $3, %rax + ALIGN_3 + +.L01: + movaps %xmm4, 0 * SIZE(Y1) + movaps %xmm4, 4 * SIZE(Y1) + movaps %xmm4, 8 * SIZE(Y1) + movaps %xmm4, 12 * SIZE(Y1) + + subq $-16 * SIZE, Y1 + decq %rax + jg .L01 + ALIGN_3 + +.L10: +#ifdef ALIGNED_ACCESS + movq M, MM + + movq A, %rax + andq $4 * SIZE - 1, %rax + leaq 2 * SIZE(BUFFER), A1 + leaq -1(M), A2 + + cmpq $2 * SIZE, %rax + cmovge A1, BUFFER + cmovge A2, MM + + testq $SIZE, A + jne .L200 + + testq $2 * SIZE, LDA + jne .L100 +#endif + +#if GEMV_UNROLL >= 4 + cmpq $4, N + jl .L20 + ALIGN_3 + +.L11: + subq $4, N + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + movsd (X), %xmm9 + addq INCX, X + movsd (X), %xmm11 + addq INCX, X + movsd (X), %xmm13 + addq INCX, X + movsd (X), %xmm15 + addq INCX, X + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm6 +#else + movsd ALPHA, %xmm6 + unpcklpd %xmm6, %xmm6 +#endif + + pshufd $0xb1, %xmm6, %xmm5 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + + pshufd $0x00, %xmm9, %xmm8 + pshufd $0x55, %xmm9, %xmm9 + pshufd $0x00, %xmm11, %xmm10 + pshufd $0x55, %xmm11, %xmm11 + pshufd $0x00, %xmm13, %xmm12 + pshufd $0x55, %xmm13, %xmm13 + pshufd $0x00, %xmm15, %xmm14 + pshufd $0x55, %xmm15, %xmm15 + +#ifndef XCONJ + xorps %xmm7, %xmm9 + xorps %xmm7, %xmm11 + xorps %xmm7, %xmm13 + xorps %xmm7, %xmm15 +#else + xorps %xmm7, %xmm8 + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm12 + xorps %xmm7, %xmm14 +#endif + + mulps %xmm6, %xmm8 + mulps %xmm5, %xmm9 + mulps %xmm6, %xmm10 + mulps %xmm5, %xmm11 + mulps %xmm6, %xmm12 + mulps %xmm5, %xmm13 + mulps %xmm6, %xmm14 + mulps %xmm5, %xmm15 + +#ifndef XCONJ + subps %xmm9, %xmm8 + subps %xmm11, %xmm10 + subps %xmm13, %xmm12 + subps %xmm15, %xmm14 +#else + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + addps %xmm13, %xmm12 + addps %xmm15, %xmm14 +#endif + + pshufd $0x55, %xmm8, %xmm9 + pshufd $0x00, %xmm8, %xmm8 + pshufd $0x55, %xmm10, %xmm11 + pshufd $0x00, %xmm10, %xmm10 + pshufd $0x55, %xmm12, %xmm13 + pshufd $0x00, %xmm12, %xmm12 + pshufd $0x55, %xmm14, %xmm15 + pshufd $0x00, %xmm14, %xmm14 + +#ifndef CONJ + xorps %xmm7, %xmm9 + xorps %xmm7, %xmm11 + xorps %xmm7, %xmm13 + xorps %xmm7, %xmm15 +#else + xorps %xmm7, %xmm8 + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm12 + xorps %xmm7, %xmm14 +#endif + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L1X + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A1, LDA), %xmm6 + + movsd -32 * SIZE(Y1), %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movsd -32 * SIZE(A2), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movsd -32 * SIZE(A2, LDA), %xmm6 + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 +.L1X: +#endif + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + movaps -24 * SIZE(Y1), %xmm2 + movaps -20 * SIZE(Y1), %xmm3 + + movq MM, I + sarq $3, I + jle .L15 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + MOVUPS_A1(-28 * SIZE, A1, %xmm6) + + decq I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) +#endif + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-24 * SIZE, A1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm8, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-20 * SIZE, A1, %xmm6) + + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm9, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm2 + MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm8, %xmm6 + addps %xmm6, %xmm3 + MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm6) + + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm9, %xmm7 + SUBPS %xmm7, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) +#endif + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm10, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2(-24 * SIZE, A1, LDA, 1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A2(-20 * SIZE, A1, LDA, 1, %xmm6) + + mulps %xmm11, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm10, %xmm4 + addps %xmm4, %xmm2 + MOVUPS_A1(-32 * SIZE, A2, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm3 + MOVUPS_A1(-28 * SIZE, A2, %xmm6) + + mulps %xmm11, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) +#endif + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-24 * SIZE, A2, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-20 * SIZE, A2, %xmm6) + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm2 + MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm3 + MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm6) + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) +#endif + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2(-24 * SIZE, A2, LDA, 1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A2(-20 * SIZE, A2, LDA, 1, %xmm6) + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm2 + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm3 + MOVUPS_A1(-12 * SIZE, A1, %xmm6) + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) - 128 + PREOFFSET(Y1) +#endif + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-24 * SIZE, A1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm8, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-20 * SIZE, A1, %xmm6) + + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm9, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm2 + MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm8, %xmm6 + addps %xmm6, %xmm3 + MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm6) + + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm9, %xmm7 + SUBPS %xmm7, %xmm3 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm10, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2(-24 * SIZE, A1, LDA, 1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A2(-20 * SIZE, A1, LDA, 1, %xmm6) + + mulps %xmm11, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm10, %xmm4 + addps %xmm4, %xmm2 + MOVUPS_A1(-32 * SIZE, A2, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm3 + MOVUPS_A1(-28 * SIZE, A2, %xmm6) + + mulps %xmm11, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm3 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-24 * SIZE, A2, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-20 * SIZE, A2, %xmm6) + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm2 + MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm3 + MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm6) + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2(-24 * SIZE, A2, LDA, 1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A2(-20 * SIZE, A2, LDA, 1, %xmm6) + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm2 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm3 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm3 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L15: + testq $4, MM + je .L17 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + MOVUPS_A1(-28 * SIZE, A1, %xmm6) + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm8, %xmm6 + addps %xmm6, %xmm1 + + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm9, %xmm7 + SUBPS %xmm7, %xmm1 + + MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4) + MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm6) + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm10, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm1 + + mulps %xmm11, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm1 + + MOVUPS_A1(-32 * SIZE, A2, %xmm4) + MOVUPS_A1(-28 * SIZE, A2, %xmm6) + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm4) + MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm6) + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm1 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L17: + testq $2, MM + je .L18 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm6) + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm6) + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L18: + testq $1, MM + je .L19 + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A1, LDA), %xmm6 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movsd -32 * SIZE(A2), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movsd -32 * SIZE(A2, LDA), %xmm6 + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + ALIGN_3 + +.L19: + cmpq $4, N + jge .L11 + ALIGN_3 + +.L20: +#endif + + cmpq $2, N + jl .L30 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L21: +#endif + + subq $2, N + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + movsd (X), %xmm13 + addq INCX, X + movsd (X), %xmm15 + addq INCX, X + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm8 +#else + movsd ALPHA, %xmm8 + unpcklpd %xmm8, %xmm8 +#endif + + pshufd $0xb1, %xmm8, %xmm9 + + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + + pshufd $0x00, %xmm13, %xmm12 + pshufd $0x55, %xmm13, %xmm13 + pshufd $0x00, %xmm15, %xmm14 + pshufd $0x55, %xmm15, %xmm15 + +#ifndef XCONJ + xorps %xmm11, %xmm13 + xorps %xmm11, %xmm15 +#else + xorps %xmm11, %xmm12 + xorps %xmm11, %xmm14 +#endif + + mulps %xmm8, %xmm12 + mulps %xmm9, %xmm13 + mulps %xmm8, %xmm14 + mulps %xmm9, %xmm15 + +#ifndef XCONJ + subps %xmm13, %xmm12 + subps %xmm15, %xmm14 +#else + addps %xmm13, %xmm12 + addps %xmm15, %xmm14 +#endif + + pshufd $0x55, %xmm12, %xmm13 + pshufd $0x00, %xmm12, %xmm12 + pshufd $0x55, %xmm14, %xmm15 + pshufd $0x00, %xmm14, %xmm14 + +#ifndef CONJ + xorps %xmm11, %xmm13 + xorps %xmm11, %xmm15 +#else + xorps %xmm11, %xmm12 + xorps %xmm11, %xmm14 +#endif + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L2X + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A2), %xmm6 + + movsd -32 * SIZE(Y1), %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + pshufd $0xb1, %xmm6, %xmm7 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 +.L2X: +#endif + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + movaps -24 * SIZE(Y1), %xmm2 + movaps -20 * SIZE(Y1), %xmm3 + ALIGN_3 + + movq MM, I + sarq $3, I + jle .L25 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + MOVUPS_A1(-28 * SIZE, A1, %xmm6) + MOVUPS_A1(-24 * SIZE, A1, %xmm8) + MOVUPS_A1(-20 * SIZE, A1, %xmm10) + + decq I + jle .L24 + ALIGN_3 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-28 * SIZE, A2, %xmm6) + + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm2 + MOVUPS_A1(-24 * SIZE, A2, %xmm8) + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + MOVUPS_A1(-20 * SIZE, A2, %xmm10) + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm13, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm13, %xmm11 + SUBPS %xmm11, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-12 * SIZE, A1, %xmm6) + + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm14, %xmm8 + addps %xmm8, %xmm2 + MOVUPS_A1( -8 * SIZE, A1, %xmm8) + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm3 + MOVUPS_A1( -4 * SIZE, A1, %xmm10) + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm15, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm15, %xmm11 + SUBPS %xmm11, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) +#endif + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L23 + ALIGN_3 + +.L24: + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-28 * SIZE, A2, %xmm6) + + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm2 + MOVUPS_A1(-24 * SIZE, A2, %xmm8) + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + MOVUPS_A1(-20 * SIZE, A2, %xmm10) + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm13, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm13, %xmm11 + SUBPS %xmm11, %xmm3 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm1 + + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm14, %xmm8 + addps %xmm8, %xmm2 + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm3 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm15, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm15, %xmm11 + SUBPS %xmm11, %xmm3 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L25: + testq $4, MM + je .L27 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + MOVUPS_A1(-28 * SIZE, A1, %xmm6) + MOVUPS_A1(-32 * SIZE, A2, %xmm8) + MOVUPS_A1(-28 * SIZE, A2, %xmm10) + + pshufd $0xb1, %xmm4, %xmm5 + pshufd $0xb1, %xmm6, %xmm7 + pshufd $0xb1, %xmm8, %xmm9 + pshufd $0xb1, %xmm10, %xmm11 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm14, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm1 + + mulps %xmm15, %xmm9 + SUBPS %xmm9, %xmm0 + mulps %xmm15, %xmm11 + SUBPS %xmm11, %xmm1 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L27: + testq $2, MM + je .L28 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + MOVUPS_A1(-32 * SIZE, A2, %xmm6) + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L28: + testq $1, MM +#if GEMV_UNROLL == 2 + je .L29 +#else + je .L30 +#endif + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A2), %xmm6 + + pshufd $0xb1, %xmm4, %xmm5 + pshufd $0xb1, %xmm6, %xmm7 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L29: + cmpq $2, N + jge .L21 +#endif + ALIGN_3 + +.L30: + cmpq $1, N + jl .L990 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + + movsd (X), %xmm13 + addq INCX, X + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm8 +#else + movsd ALPHA, %xmm8 + unpcklpd %xmm8, %xmm8 +#endif + + pshufd $0xb1, %xmm8, %xmm9 + + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + + pshufd $0x00, %xmm13, %xmm12 + pshufd $0x55, %xmm13, %xmm13 + +#ifndef XCONJ + xorps %xmm11, %xmm13 +#else + xorps %xmm11, %xmm12 +#endif + + mulps %xmm8, %xmm12 + mulps %xmm9, %xmm13 + +#ifndef XCONJ + subps %xmm13, %xmm12 +#else + addps %xmm13, %xmm12 +#endif + + pshufd $0x55, %xmm12, %xmm13 + pshufd $0x00, %xmm12, %xmm12 + +#ifndef CONJ + xorps %xmm11, %xmm13 +#else + xorps %xmm11, %xmm12 +#endif + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L3X + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(Y1), %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 +.L3X: +#endif + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + movaps -24 * SIZE(Y1), %xmm2 + movaps -20 * SIZE(Y1), %xmm3 + ALIGN_3 + + movq MM, I + sarq $3, I + jle .L35 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + MOVUPS_A1(-28 * SIZE, A1, %xmm6) + MOVUPS_A1(-24 * SIZE, A1, %xmm8) + MOVUPS_A1(-20 * SIZE, A1, %xmm10) + + decq I + jle .L34 + ALIGN_3 + +.L33: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-12 * SIZE, A1, %xmm6) + + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm2 + MOVUPS_A1( -8 * SIZE, A1, %xmm8) + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + MOVUPS_A1( -4 * SIZE, A1, %xmm10) + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm13, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm13, %xmm11 + SUBPS %xmm11, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) +#endif + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L33 + ALIGN_3 + +.L34: + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm2 + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm13, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm13, %xmm11 + SUBPS %xmm11, %xmm3 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L35: + testq $4, MM + je .L37 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + MOVUPS_A1(-28 * SIZE, A1, %xmm6) + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $8 * SIZE, A1 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L37: + testq $2, MM + je .L38 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + + pshufd $0xb1, %xmm4, %xmm5 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L38: + testq $1, MM + je .L990 + + movsd -32 * SIZE(A1), %xmm4 + + pshufd $0xb1, %xmm4, %xmm5 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + +#ifdef ALIGNED_ACCESS + jmp .L990 + ALIGN_3 + +.L100: +#if GEMV_UNROLL >= 4 + cmpq $4, N + jl .L110 + ALIGN_3 + +.L101: + subq $4, N + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + movsd (X), %xmm9 + addq INCX, X + movsd (X), %xmm11 + addq INCX, X + movsd (X), %xmm13 + addq INCX, X + movsd (X), %xmm15 + addq INCX, X + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm6 +#else + movsd ALPHA, %xmm6 + unpcklpd %xmm6, %xmm6 +#endif + + pshufd $0xb1, %xmm6, %xmm5 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + + pshufd $0x00, %xmm9, %xmm8 + pshufd $0x55, %xmm9, %xmm9 + pshufd $0x00, %xmm11, %xmm10 + pshufd $0x55, %xmm11, %xmm11 + pshufd $0x00, %xmm13, %xmm12 + pshufd $0x55, %xmm13, %xmm13 + pshufd $0x00, %xmm15, %xmm14 + pshufd $0x55, %xmm15, %xmm15 + +#ifndef XCONJ + xorps %xmm7, %xmm9 + xorps %xmm7, %xmm11 + xorps %xmm7, %xmm13 + xorps %xmm7, %xmm15 +#else + xorps %xmm7, %xmm8 + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm12 + xorps %xmm7, %xmm14 +#endif + + mulps %xmm6, %xmm8 + mulps %xmm5, %xmm9 + mulps %xmm6, %xmm10 + mulps %xmm5, %xmm11 + mulps %xmm6, %xmm12 + mulps %xmm5, %xmm13 + mulps %xmm6, %xmm14 + mulps %xmm5, %xmm15 + +#ifndef XCONJ + subps %xmm9, %xmm8 + subps %xmm11, %xmm10 + subps %xmm13, %xmm12 + subps %xmm15, %xmm14 +#else + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + addps %xmm13, %xmm12 + addps %xmm15, %xmm14 +#endif + + pshufd $0x55, %xmm8, %xmm9 + pshufd $0x00, %xmm8, %xmm8 + pshufd $0x55, %xmm10, %xmm11 + pshufd $0x00, %xmm10, %xmm10 + pshufd $0x55, %xmm12, %xmm13 + pshufd $0x00, %xmm12, %xmm12 + pshufd $0x55, %xmm14, %xmm15 + pshufd $0x00, %xmm14, %xmm14 + +#ifndef CONJ + xorps %xmm7, %xmm9 + xorps %xmm7, %xmm11 + xorps %xmm7, %xmm13 + xorps %xmm7, %xmm15 +#else + xorps %xmm7, %xmm8 + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm12 + xorps %xmm7, %xmm14 +#endif + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L10X + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A1, LDA), %xmm6 + + movsd -32 * SIZE(Y1), %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movsd -32 * SIZE(A2), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movsd -32 * SIZE(A2, LDA), %xmm6 + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 +.L10X: +#endif + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + movaps -24 * SIZE(Y1), %xmm2 + movaps -20 * SIZE(Y1), %xmm3 + + movq MM, I + sarq $3, I + jle .L105 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + MOVUPS_A1(-28 * SIZE, A1, %xmm6) + + decq I + jle .L104 + ALIGN_3 + +.L103: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) +#endif + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-24 * SIZE, A1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm8, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-20 * SIZE, A1, %xmm6) + + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm9, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm2 + movsd -32 * SIZE(A1, LDA), %xmm4 + movhps -30 * SIZE(A1, LDA), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm8, %xmm6 + addps %xmm6, %xmm3 + movsd -28 * SIZE(A1, LDA), %xmm6 + movhps -26 * SIZE(A1, LDA), %xmm6 + + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm9, %xmm7 + SUBPS %xmm7, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) +#endif + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm10, %xmm4 + addps %xmm4, %xmm0 + movsd -24 * SIZE(A1, LDA), %xmm4 + movhps -22 * SIZE(A1, LDA), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm1 + movsd -20 * SIZE(A1, LDA), %xmm6 + movhps -18 * SIZE(A1, LDA), %xmm6 + + mulps %xmm11, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm10, %xmm4 + addps %xmm4, %xmm2 + MOVUPS_A1(-32 * SIZE, A2, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm3 + MOVUPS_A1(-28 * SIZE, A2, %xmm6) + + mulps %xmm11, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) +#endif + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-24 * SIZE, A2, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-20 * SIZE, A2, %xmm6) + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm2 + movsd -32 * SIZE(A2, LDA), %xmm4 + movhps -30 * SIZE(A2, LDA), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm3 + movsd -28 * SIZE(A2, LDA), %xmm6 + movhps -26 * SIZE(A2, LDA), %xmm6 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) +#endif + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + movsd -24 * SIZE(A2, LDA), %xmm4 + movhps -22 * SIZE(A2, LDA), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm1 + movsd -20 * SIZE(A2, LDA), %xmm6 + movhps -18 * SIZE(A2, LDA), %xmm6 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm2 + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm3 + MOVUPS_A1(-12 * SIZE, A1, %xmm6) + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) - 128 + PREOFFSET(Y1) +#endif + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L103 + ALIGN_3 + +.L104: + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-24 * SIZE, A1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm8, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-20 * SIZE, A1, %xmm6) + + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm9, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm2 + movsd -32 * SIZE(A1, LDA), %xmm4 + movhps -30 * SIZE(A1, LDA), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm8, %xmm6 + addps %xmm6, %xmm3 + movsd -28 * SIZE(A1, LDA), %xmm6 + movhps -26 * SIZE(A1, LDA), %xmm6 + + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm9, %xmm7 + SUBPS %xmm7, %xmm3 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm10, %xmm4 + addps %xmm4, %xmm0 + movsd -24 * SIZE(A1, LDA), %xmm4 + movhps -22 * SIZE(A1, LDA), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm1 + movsd -20 * SIZE(A1, LDA), %xmm6 + movhps -18 * SIZE(A1, LDA), %xmm6 + + mulps %xmm11, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm10, %xmm4 + addps %xmm4, %xmm2 + MOVUPS_A1(-32 * SIZE, A2, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm3 + MOVUPS_A1(-28 * SIZE, A2, %xmm6) + + mulps %xmm11, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm3 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-24 * SIZE, A2, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-20 * SIZE, A2, %xmm6) + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm2 + movsd -32 * SIZE(A2, LDA), %xmm4 + movhps -30 * SIZE(A2, LDA), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm3 + movsd -28 * SIZE(A2, LDA), %xmm6 + movhps -26 * SIZE(A2, LDA), %xmm6 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + movsd -24 * SIZE(A2, LDA), %xmm4 + movhps -22 * SIZE(A2, LDA), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm1 + movsd -20 * SIZE(A2, LDA), %xmm6 + movhps -18 * SIZE(A2, LDA), %xmm6 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm2 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm3 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm3 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L105: + testq $4, MM + je .L107 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + MOVUPS_A1(-28 * SIZE, A1, %xmm6) + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movsd -32 * SIZE(A1, LDA), %xmm4 + movhps -30 * SIZE(A1, LDA), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm8, %xmm6 + addps %xmm6, %xmm1 + movsd -28 * SIZE(A1, LDA), %xmm6 + movhps -26 * SIZE(A1, LDA), %xmm6 + + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm9, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm10, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-28 * SIZE, A2, %xmm6) + + mulps %xmm11, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movsd -32 * SIZE(A2, LDA), %xmm4 + movhps -30 * SIZE(A2, LDA), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + movsd -28 * SIZE(A2, LDA), %xmm6 + movhps -26 * SIZE(A2, LDA), %xmm6 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm1 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L107: + testq $2, MM + je .L108 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + movsd -32 * SIZE(A1, LDA), %xmm6 + movhps -30 * SIZE(A1, LDA), %xmm6 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movsd -32 * SIZE(A2, LDA), %xmm6 + movhps -30 * SIZE(A2, LDA), %xmm6 + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L108: + testq $1, MM + je .L109 + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A1, LDA), %xmm6 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movsd -32 * SIZE(A2), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movsd -32 * SIZE(A2, LDA), %xmm6 + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + ALIGN_3 + +.L109: + cmpq $4, N + jge .L101 + ALIGN_3 + +.L110: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L120 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L111: +#endif + + subq $2, N + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + movsd (X), %xmm13 + addq INCX, X + movsd (X), %xmm15 + addq INCX, X + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm8 +#else + movsd ALPHA, %xmm8 + unpcklpd %xmm8, %xmm8 +#endif + + pshufd $0xb1, %xmm8, %xmm9 + + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + + pshufd $0x00, %xmm13, %xmm12 + pshufd $0x55, %xmm13, %xmm13 + pshufd $0x00, %xmm15, %xmm14 + pshufd $0x55, %xmm15, %xmm15 + +#ifndef XCONJ + xorps %xmm11, %xmm13 + xorps %xmm11, %xmm15 +#else + xorps %xmm11, %xmm12 + xorps %xmm11, %xmm14 +#endif + + mulps %xmm8, %xmm12 + mulps %xmm9, %xmm13 + mulps %xmm8, %xmm14 + mulps %xmm9, %xmm15 + +#ifndef XCONJ + subps %xmm13, %xmm12 + subps %xmm15, %xmm14 +#else + addps %xmm13, %xmm12 + addps %xmm15, %xmm14 +#endif + + pshufd $0x55, %xmm12, %xmm13 + pshufd $0x00, %xmm12, %xmm12 + pshufd $0x55, %xmm14, %xmm15 + pshufd $0x00, %xmm14, %xmm14 + +#ifndef CONJ + xorps %xmm11, %xmm13 + xorps %xmm11, %xmm15 +#else + xorps %xmm11, %xmm12 + xorps %xmm11, %xmm14 +#endif + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L11X + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A2), %xmm6 + + movsd -32 * SIZE(Y1), %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + pshufd $0xb1, %xmm6, %xmm7 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 +.L11X: +#endif + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + movaps -24 * SIZE(Y1), %xmm2 + movaps -20 * SIZE(Y1), %xmm3 + ALIGN_3 + + movq MM, I + sarq $3, I + jle .L115 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + MOVUPS_A1(-28 * SIZE, A1, %xmm6) + MOVUPS_A1(-24 * SIZE, A1, %xmm8) + MOVUPS_A1(-20 * SIZE, A1, %xmm10) + + decq I + jle .L114 + ALIGN_3 + +.L113: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movsd -32 * SIZE(A2), %xmm4 + movhps -30 * SIZE(A2), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + movsd -28 * SIZE(A2), %xmm6 + movhps -26 * SIZE(A2), %xmm6 + + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm2 + movsd -24 * SIZE(A2), %xmm8 + movhps -22 * SIZE(A2), %xmm8 + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + movsd -20 * SIZE(A2), %xmm10 + movhps -18 * SIZE(A2), %xmm10 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm13, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm13, %xmm11 + SUBPS %xmm11, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-12 * SIZE, A1, %xmm6) + + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm14, %xmm8 + addps %xmm8, %xmm2 + MOVUPS_A1( -8 * SIZE, A1, %xmm8) + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm3 + MOVUPS_A1( -4 * SIZE, A1, %xmm10) + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm15, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm15, %xmm11 + SUBPS %xmm11, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) +#endif + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L113 + ALIGN_3 + +.L114: + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movsd -32 * SIZE(A2), %xmm4 + movhps -30 * SIZE(A2), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + movsd -28 * SIZE(A2), %xmm6 + movhps -26 * SIZE(A2), %xmm6 + + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm2 + movsd -24 * SIZE(A2), %xmm8 + movhps -22 * SIZE(A2), %xmm8 + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + movsd -20 * SIZE(A2), %xmm10 + movhps -18 * SIZE(A2), %xmm10 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm13, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm13, %xmm11 + SUBPS %xmm11, %xmm3 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm1 + + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm14, %xmm8 + addps %xmm8, %xmm2 + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm3 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm15, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm15, %xmm11 + SUBPS %xmm11, %xmm3 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L115: + testq $4, MM + je .L117 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + MOVUPS_A1(-28 * SIZE, A1, %xmm6) + movsd -32 * SIZE(A2), %xmm8 + movhps -30 * SIZE(A2), %xmm8 + movsd -28 * SIZE(A2), %xmm10 + movhps -26 * SIZE(A2), %xmm10 + + pshufd $0xb1, %xmm4, %xmm5 + pshufd $0xb1, %xmm6, %xmm7 + pshufd $0xb1, %xmm8, %xmm9 + pshufd $0xb1, %xmm10, %xmm11 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm14, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm1 + + mulps %xmm15, %xmm9 + SUBPS %xmm9, %xmm0 + mulps %xmm15, %xmm11 + SUBPS %xmm11, %xmm1 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L117: + testq $2, MM + je .L118 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + movsd -32 * SIZE(A2), %xmm6 + movhps -30 * SIZE(A2), %xmm6 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L118: + testq $1, MM +#if GEMV_UNROLL == 2 + je .L119 +#else + je .L120 +#endif + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A2), %xmm6 + + pshufd $0xb1, %xmm4, %xmm5 + pshufd $0xb1, %xmm6, %xmm7 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L119: + cmpq $2, N + jge .L111 +#endif + ALIGN_3 + +.L120: +#endif + + cmpq $1, N + jl .L990 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + + movsd (X), %xmm13 + addq INCX, X + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm8 +#else + movsd ALPHA, %xmm8 + unpcklpd %xmm8, %xmm8 +#endif + + pshufd $0xb1, %xmm8, %xmm9 + + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + + pshufd $0x00, %xmm13, %xmm12 + pshufd $0x55, %xmm13, %xmm13 + +#ifndef XCONJ + xorps %xmm11, %xmm13 +#else + xorps %xmm11, %xmm12 +#endif + + mulps %xmm8, %xmm12 + mulps %xmm9, %xmm13 + +#ifndef XCONJ + subps %xmm13, %xmm12 +#else + addps %xmm13, %xmm12 +#endif + + pshufd $0x55, %xmm12, %xmm13 + pshufd $0x00, %xmm12, %xmm12 + +#ifndef CONJ + xorps %xmm11, %xmm13 +#else + xorps %xmm11, %xmm12 +#endif + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L12X + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(Y1), %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 +.L12X: +#endif + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + movaps -24 * SIZE(Y1), %xmm2 + movaps -20 * SIZE(Y1), %xmm3 + ALIGN_3 + + movq MM, I + sarq $3, I + jle .L125 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + MOVUPS_A1(-28 * SIZE, A1, %xmm6) + MOVUPS_A1(-24 * SIZE, A1, %xmm8) + MOVUPS_A1(-20 * SIZE, A1, %xmm10) + + decq I + jle .L124 + ALIGN_3 + +.L123: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-12 * SIZE, A1, %xmm6) + + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm2 + MOVUPS_A1( -8 * SIZE, A1, %xmm8) + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + MOVUPS_A1( -4 * SIZE, A1, %xmm10) + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm13, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm13, %xmm11 + SUBPS %xmm11, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) +#endif + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L123 + ALIGN_3 + +.L124: + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm2 + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm13, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm13, %xmm11 + SUBPS %xmm11, %xmm3 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L125: + testq $4, MM + je .L127 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + MOVUPS_A1(-28 * SIZE, A1, %xmm6) + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $8 * SIZE, A1 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L127: + testq $2, MM + je .L128 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + + pshufd $0xb1, %xmm4, %xmm5 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L128: + testq $1, MM + je .L990 + + movsd -32 * SIZE(A1), %xmm4 + + pshufd $0xb1, %xmm4, %xmm5 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + jmp .L990 + ALIGN_3 + +.L200: + testq $2 * SIZE, LDA + jne .L300 + + cmpq $2, N + jl .L210 + ALIGN_3 + +.L201: + subq $2, N + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + movsd (X), %xmm13 + addq INCX, X + movsd (X), %xmm15 + addq INCX, X + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm8 +#else + movsd ALPHA, %xmm8 + unpcklpd %xmm8, %xmm8 +#endif + + pshufd $0xb1, %xmm8, %xmm9 + + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + + pshufd $0x00, %xmm13, %xmm12 + pshufd $0x55, %xmm13, %xmm13 + pshufd $0x00, %xmm15, %xmm14 + pshufd $0x55, %xmm15, %xmm15 + +#ifndef XCONJ + xorps %xmm11, %xmm13 + xorps %xmm11, %xmm15 +#else + xorps %xmm11, %xmm12 + xorps %xmm11, %xmm14 +#endif + + mulps %xmm8, %xmm12 + mulps %xmm9, %xmm13 + mulps %xmm8, %xmm14 + mulps %xmm9, %xmm15 + +#ifndef XCONJ + subps %xmm13, %xmm12 + subps %xmm15, %xmm14 +#else + addps %xmm13, %xmm12 + addps %xmm15, %xmm14 +#endif + + pshufd $0x55, %xmm12, %xmm13 + pshufd $0x00, %xmm12, %xmm12 + pshufd $0x55, %xmm14, %xmm15 + pshufd $0x00, %xmm14, %xmm14 + +#ifndef CONJ + xorps %xmm11, %xmm13 + xorps %xmm11, %xmm15 +#else + xorps %xmm11, %xmm12 + xorps %xmm11, %xmm14 +#endif + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L20X + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A2), %xmm6 + + movsd -32 * SIZE(Y1), %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + pshufd $0xb1, %xmm6, %xmm7 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 +.L20X: +#endif + + movaps -33 * SIZE(A1), %xmm4 + movaps -33 * SIZE(A2), %xmm6 + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + movaps -24 * SIZE(Y1), %xmm2 + movaps -20 * SIZE(Y1), %xmm3 + + movq MM, I + sarq $3, I + jle .L205 + + movaps -29 * SIZE(A1), %xmm8 + movaps -25 * SIZE(A1), %xmm9 + movaps -21 * SIZE(A1), %xmm10 + + decq I + jle .L204 + ALIGN_3 + +.L203: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + movss %xmm8, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + + movss %xmm9, %xmm8 + shufps $0x39, %xmm8, %xmm8 + pshufd $0xb1, %xmm8, %xmm7 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm1 + movaps -29 * SIZE(A2), %xmm8 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + movss %xmm10, %xmm9 + shufps $0x39, %xmm9, %xmm9 + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + movaps -25 * SIZE(A2), %xmm9 + + movss %xmm4, %xmm10 + shufps $0x39, %xmm10, %xmm10 + pshufd $0xb1, %xmm10, %xmm7 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + movaps -21 * SIZE(A2), %xmm10 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm5 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + movaps -17 * SIZE(A2), %xmm6 + + movss %xmm9, %xmm8 + shufps $0x39, %xmm8, %xmm8 + pshufd $0xb1, %xmm8, %xmm7 + mulps %xmm14, %xmm8 + addps %xmm8, %xmm1 + movaps -13 * SIZE(A1), %xmm8 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + movss %xmm10, %xmm9 + shufps $0x39, %xmm9, %xmm9 + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm2 + movaps -9 * SIZE(A1), %xmm9 + + movss %xmm6, %xmm10 + shufps $0x39, %xmm10, %xmm10 + pshufd $0xb1, %xmm10, %xmm7 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm3 + movaps -5 * SIZE(A1), %xmm10 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) +#endif + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L203 + ALIGN_3 + +.L204: + movss %xmm8, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + + movss %xmm9, %xmm8 + shufps $0x39, %xmm8, %xmm8 + pshufd $0xb1, %xmm8, %xmm7 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm1 + movaps -29 * SIZE(A2), %xmm8 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + movss %xmm10, %xmm9 + shufps $0x39, %xmm9, %xmm9 + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + movaps -25 * SIZE(A2), %xmm9 + + movss %xmm4, %xmm10 + shufps $0x39, %xmm10, %xmm10 + pshufd $0xb1, %xmm10, %xmm7 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + movaps -21 * SIZE(A2), %xmm10 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm5 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + movaps -17 * SIZE(A2), %xmm6 + + movss %xmm9, %xmm8 + shufps $0x39, %xmm8, %xmm8 + pshufd $0xb1, %xmm8, %xmm7 + mulps %xmm14, %xmm8 + addps %xmm8, %xmm1 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + movss %xmm10, %xmm9 + shufps $0x39, %xmm9, %xmm9 + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm2 + + movss %xmm6, %xmm10 + shufps $0x39, %xmm10, %xmm10 + pshufd $0xb1, %xmm10, %xmm7 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm3 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm3 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L205: + testq $4, MM + je .L207 + + movaps -29 * SIZE(A1), %xmm8 + movaps -25 * SIZE(A1), %xmm9 + movaps -29 * SIZE(A2), %xmm10 + movaps -25 * SIZE(A2), %xmm11 + + movss %xmm8, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + + movss %xmm9, %xmm8 + shufps $0x39, %xmm8, %xmm8 + pshufd $0xb1, %xmm8, %xmm7 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm1 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + movss %xmm10, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm5 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + + movss %xmm11, %xmm10 + shufps $0x39, %xmm10, %xmm10 + pshufd $0xb1, %xmm10, %xmm7 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm1 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + + movaps %xmm9, %xmm4 + movaps %xmm11, %xmm6 + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L207: + testq $2, MM + je .L208 + + movaps -29 * SIZE(A1), %xmm8 + movaps -29 * SIZE(A2), %xmm9 + + movss %xmm8, %xmm4 + shufps $0x39, %xmm4, %xmm4 + movss %xmm9, %xmm6 + shufps $0x39, %xmm6, %xmm6 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L208: + testq $1, MM + je .L209 + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A2), %xmm6 + + pshufd $0xb1, %xmm4, %xmm5 + pshufd $0xb1, %xmm6, %xmm7 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + ALIGN_3 + +.L209: + cmpq $2, N + jge .L201 + ALIGN_3 + +.L210: + cmpq $1, N + jl .L990 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + + movsd (X), %xmm13 + addq INCX, X + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm8 +#else + movsd ALPHA, %xmm8 + unpcklpd %xmm8, %xmm8 +#endif + + pshufd $0xb1, %xmm8, %xmm9 + + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + + pshufd $0x00, %xmm13, %xmm12 + pshufd $0x55, %xmm13, %xmm13 + +#ifndef XCONJ + xorps %xmm11, %xmm13 +#else + xorps %xmm11, %xmm12 +#endif + + mulps %xmm8, %xmm12 + mulps %xmm9, %xmm13 + +#ifndef XCONJ + subps %xmm13, %xmm12 +#else + addps %xmm13, %xmm12 +#endif + + pshufd $0x55, %xmm12, %xmm13 + pshufd $0x00, %xmm12, %xmm12 + +#ifndef CONJ + xorps %xmm11, %xmm13 +#else + xorps %xmm11, %xmm12 +#endif + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L21X + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(Y1), %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 +.L21X: +#endif + + movaps -33 * SIZE(A1), %xmm4 + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + movaps -24 * SIZE(Y1), %xmm2 + movaps -20 * SIZE(Y1), %xmm3 + + movq MM, I + sarq $3, I + jle .L215 + + movaps -29 * SIZE(A1), %xmm6 + movaps -25 * SIZE(A1), %xmm8 + movaps -21 * SIZE(A1), %xmm10 + + decq I + jle .L214 + ALIGN_3 + +.L213: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + movaps -13 * SIZE(A1), %xmm6 + + movss %xmm10, %xmm8 + shufps $0x39, %xmm8, %xmm8 + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm2 + movaps -9 * SIZE(A1), %xmm8 + + movss %xmm4, %xmm10 + shufps $0x39, %xmm10, %xmm10 + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + movaps -5 * SIZE(A1), %xmm10 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm13, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm13, %xmm11 + SUBPS %xmm11, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) +#endif + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L213 + ALIGN_3 + +.L214: + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + + movss %xmm10, %xmm8 + shufps $0x39, %xmm8, %xmm8 + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm2 + + movss %xmm4, %xmm10 + shufps $0x39, %xmm10, %xmm10 + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm13, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm13, %xmm11 + SUBPS %xmm11, %xmm3 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L215: + testq $4, MM + je .L217 + + movaps -29 * SIZE(A1), %xmm6 + movaps -25 * SIZE(A1), %xmm8 + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + movaps %xmm8, %xmm4 + + addq $8 * SIZE, A1 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L217: + testq $2, MM + je .L218 + + movaps -29 * SIZE(A1), %xmm6 + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L218: + testq $1, MM + je .L990 + + movsd -32 * SIZE(A1), %xmm4 + + pshufd $0xb1, %xmm4, %xmm5 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + jmp .L990 + ALIGN_3 + +.L300: + cmpq $2, N + jl .L310 + ALIGN_3 + +.L301: + subq $2, N + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + movsd (X), %xmm13 + addq INCX, X + movsd (X), %xmm15 + addq INCX, X + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm8 +#else + movsd ALPHA, %xmm8 + unpcklpd %xmm8, %xmm8 +#endif + + pshufd $0xb1, %xmm8, %xmm9 + + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + + pshufd $0x00, %xmm13, %xmm12 + pshufd $0x55, %xmm13, %xmm13 + pshufd $0x00, %xmm15, %xmm14 + pshufd $0x55, %xmm15, %xmm15 + +#ifndef XCONJ + xorps %xmm11, %xmm13 + xorps %xmm11, %xmm15 +#else + xorps %xmm11, %xmm12 + xorps %xmm11, %xmm14 +#endif + + mulps %xmm8, %xmm12 + mulps %xmm9, %xmm13 + mulps %xmm8, %xmm14 + mulps %xmm9, %xmm15 + +#ifndef XCONJ + subps %xmm13, %xmm12 + subps %xmm15, %xmm14 +#else + addps %xmm13, %xmm12 + addps %xmm15, %xmm14 +#endif + + pshufd $0x55, %xmm12, %xmm13 + pshufd $0x00, %xmm12, %xmm12 + pshufd $0x55, %xmm14, %xmm15 + pshufd $0x00, %xmm14, %xmm14 + +#ifndef CONJ + xorps %xmm11, %xmm13 + xorps %xmm11, %xmm15 +#else + xorps %xmm11, %xmm12 + xorps %xmm11, %xmm14 +#endif + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L30X + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A2), %xmm6 + + movsd -32 * SIZE(Y1), %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + pshufd $0xb1, %xmm6, %xmm7 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 +.L30X: +#endif + + movaps -33 * SIZE(A1), %xmm4 + movaps -35 * SIZE(A2), %xmm6 + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + movaps -24 * SIZE(Y1), %xmm2 + movaps -20 * SIZE(Y1), %xmm3 + + movq MM, I + sarq $3, I + jle .L305 + + movaps -29 * SIZE(A1), %xmm8 + movaps -25 * SIZE(A1), %xmm9 + movaps -21 * SIZE(A1), %xmm10 + + decq I + jle .L304 + ALIGN_3 + +.L303: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + movss %xmm8, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + + movss %xmm9, %xmm8 + shufps $0x39, %xmm8, %xmm8 + pshufd $0xb1, %xmm8, %xmm7 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm1 + movaps -31 * SIZE(A2), %xmm8 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + movss %xmm10, %xmm9 + shufps $0x39, %xmm9, %xmm9 + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + movaps -27 * SIZE(A2), %xmm9 + + movss %xmm4, %xmm10 + shufps $0x39, %xmm10, %xmm10 + pshufd $0xb1, %xmm10, %xmm7 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + movaps -23 * SIZE(A2), %xmm10 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + movss %xmm8, %xmm6 + shufps $0x93, %xmm8, %xmm6 + pshufd $0xb1, %xmm6, %xmm5 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + movaps -19 * SIZE(A2), %xmm6 + + movss %xmm9, %xmm8 + shufps $0x93, %xmm9, %xmm8 + pshufd $0xb1, %xmm8, %xmm7 + mulps %xmm14, %xmm8 + addps %xmm8, %xmm1 + movaps -13 * SIZE(A1), %xmm8 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + movss %xmm10, %xmm9 + shufps $0x93, %xmm10, %xmm9 + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm2 + movaps -9 * SIZE(A1), %xmm9 + + movss %xmm6, %xmm10 + shufps $0x93, %xmm6, %xmm10 + pshufd $0xb1, %xmm10, %xmm7 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm3 + movaps -5 * SIZE(A1), %xmm10 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) +#endif + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L303 + ALIGN_3 + +.L304: + movss %xmm8, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + + movss %xmm9, %xmm8 + shufps $0x39, %xmm8, %xmm8 + pshufd $0xb1, %xmm8, %xmm7 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm1 + movaps -31 * SIZE(A2), %xmm8 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + movss %xmm10, %xmm9 + shufps $0x39, %xmm9, %xmm9 + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + movaps -27 * SIZE(A2), %xmm9 + + movss %xmm4, %xmm10 + shufps $0x39, %xmm10, %xmm10 + pshufd $0xb1, %xmm10, %xmm7 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + movaps -23 * SIZE(A2), %xmm10 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + + movss %xmm8, %xmm6 + shufps $0x93, %xmm8, %xmm6 + pshufd $0xb1, %xmm6, %xmm5 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + movaps -19 * SIZE(A2), %xmm6 + + movss %xmm9, %xmm8 + shufps $0x93, %xmm9, %xmm8 + pshufd $0xb1, %xmm8, %xmm7 + mulps %xmm14, %xmm8 + addps %xmm8, %xmm1 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + movss %xmm10, %xmm9 + shufps $0x93, %xmm10, %xmm9 + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm2 + + movss %xmm6, %xmm10 + shufps $0x93, %xmm6, %xmm10 + pshufd $0xb1, %xmm10, %xmm7 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm3 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm3 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L305: + testq $4, MM + je .L307 + + movaps -29 * SIZE(A1), %xmm8 + movaps -25 * SIZE(A1), %xmm9 + movaps -31 * SIZE(A2), %xmm10 + movaps -27 * SIZE(A2), %xmm11 + + movss %xmm8, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + + movss %xmm9, %xmm8 + shufps $0x39, %xmm8, %xmm8 + pshufd $0xb1, %xmm8, %xmm7 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm1 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + movss %xmm10, %xmm6 + shufps $0x93, %xmm10, %xmm6 + pshufd $0xb1, %xmm6, %xmm5 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + + movss %xmm11, %xmm10 + shufps $0x93, %xmm11, %xmm10 + pshufd $0xb1, %xmm10, %xmm7 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm1 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + + movaps %xmm9, %xmm4 + movaps %xmm11, %xmm6 + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L307: + testq $2, MM + je .L308 + + movaps -29 * SIZE(A1), %xmm8 + movaps -31 * SIZE(A2), %xmm9 + + movss %xmm8, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + + movss %xmm9, %xmm6 + shufps $0x93, %xmm9, %xmm6 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L308: + testq $1, MM + je .L309 + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A2), %xmm6 + + pshufd $0xb1, %xmm4, %xmm5 + pshufd $0xb1, %xmm6, %xmm7 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + ALIGN_3 + +.L309: + cmpq $2, N + jge .L301 + ALIGN_3 + +.L310: + cmpq $1, N + jl .L990 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + + movsd (X), %xmm13 + addq INCX, X + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm8 +#else + movsd ALPHA, %xmm8 + unpcklpd %xmm8, %xmm8 +#endif + + pshufd $0xb1, %xmm8, %xmm9 + + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + + pshufd $0x00, %xmm13, %xmm12 + pshufd $0x55, %xmm13, %xmm13 + +#ifndef XCONJ + xorps %xmm11, %xmm13 +#else + xorps %xmm11, %xmm12 +#endif + + mulps %xmm8, %xmm12 + mulps %xmm9, %xmm13 + +#ifndef XCONJ + subps %xmm13, %xmm12 +#else + addps %xmm13, %xmm12 +#endif + + pshufd $0x55, %xmm12, %xmm13 + pshufd $0x00, %xmm12, %xmm12 + +#ifndef CONJ + xorps %xmm11, %xmm13 +#else + xorps %xmm11, %xmm12 +#endif + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L31X + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(Y1), %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 +.L31X: +#endif + + movaps -33 * SIZE(A1), %xmm4 + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + movaps -24 * SIZE(Y1), %xmm2 + movaps -20 * SIZE(Y1), %xmm3 + + movq MM, I + sarq $3, I + jle .L315 + + movaps -29 * SIZE(A1), %xmm6 + movaps -25 * SIZE(A1), %xmm8 + movaps -21 * SIZE(A1), %xmm10 + + decq I + jle .L314 + ALIGN_3 + +.L313: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + movaps -13 * SIZE(A1), %xmm6 + + movss %xmm10, %xmm8 + shufps $0x39, %xmm8, %xmm8 + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm2 + movaps -9 * SIZE(A1), %xmm8 + + movss %xmm4, %xmm10 + shufps $0x39, %xmm10, %xmm10 + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + movaps -5 * SIZE(A1), %xmm10 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm13, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm13, %xmm11 + SUBPS %xmm11, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) +#endif + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L313 + ALIGN_3 + +.L314: + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + + movss %xmm10, %xmm8 + shufps $0x39, %xmm8, %xmm8 + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm2 + + movss %xmm4, %xmm10 + shufps $0x39, %xmm10, %xmm10 + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm13, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm13, %xmm11 + SUBPS %xmm11, %xmm3 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L315: + testq $4, MM + je .L317 + + movaps -29 * SIZE(A1), %xmm6 + movaps -25 * SIZE(A1), %xmm8 + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + movaps %xmm8, %xmm4 + + addq $8 * SIZE, A1 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L317: + testq $2, MM + je .L318 + + movaps -29 * SIZE(A1), %xmm6 + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L318: + testq $1, MM + je .L990 + + movsd -32 * SIZE(A1), %xmm4 + + pshufd $0xb1, %xmm4, %xmm5 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) +#endif + ALIGN_3 + +.L990: + movq Y, Y1 + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L991 + + movsd (Y), %xmm0 + addq INCY, Y + + movsd (BUFFER), %xmm1 + addq $2 * SIZE, BUFFER + + addps %xmm1, %xmm0 + + movlps %xmm0, (Y1) + addq INCY, Y1 + ALIGN_3 + +.L991: +#endif + movq MM, %rax + sarq $3, %rax + jle .L994 + ALIGN_3 + +.L992: + movsd (Y), %xmm0 + addq INCY, Y + movhps (Y), %xmm0 + addq INCY, Y + + movsd (Y), %xmm1 + addq INCY, Y + movhps (Y), %xmm1 + addq INCY, Y + + movsd (Y), %xmm2 + addq INCY, Y + movhps (Y), %xmm2 + addq INCY, Y + + movsd (Y), %xmm3 + addq INCY, Y + movhps (Y), %xmm3 + addq INCY, Y + + addps 0 * SIZE(BUFFER), %xmm0 + addps 4 * SIZE(BUFFER), %xmm1 + addps 8 * SIZE(BUFFER), %xmm2 + addps 12 * SIZE(BUFFER), %xmm3 + + movlps %xmm0, (Y1) + addq INCY, Y1 + movhps %xmm0, (Y1) + addq INCY, Y1 + + movlps %xmm1, (Y1) + addq INCY, Y1 + movhps %xmm1, (Y1) + addq INCY, Y1 + + movlps %xmm2, (Y1) + addq INCY, Y1 + movhps %xmm2, (Y1) + addq INCY, Y1 + + movlps %xmm3, (Y1) + addq INCY, Y1 + movhps %xmm3, (Y1) + addq INCY, Y1 + + addq $16 * SIZE, BUFFER + decq %rax + jg .L992 + ALIGN_3 + +.L994: + testq $7, MM + jle .L999 + + testq $4, MM + jle .L995 + + movsd (Y), %xmm0 + addq INCY, Y + movhps (Y), %xmm0 + addq INCY, Y + + movsd (Y), %xmm1 + addq INCY, Y + movhps (Y), %xmm1 + addq INCY, Y + + addps 0 * SIZE(BUFFER), %xmm0 + addps 4 * SIZE(BUFFER), %xmm1 + + movlps %xmm0, (Y1) + addq INCY, Y1 + movhps %xmm0, (Y1) + addq INCY, Y1 + + movlps %xmm1, (Y1) + addq INCY, Y1 + movhps %xmm1, (Y1) + addq INCY, Y1 + + addq $8 * SIZE, BUFFER + ALIGN_3 + +.L995: + testq $2, MM + jle .L996 + + movsd (Y), %xmm0 + addq INCY, Y + movhps (Y), %xmm0 + addq INCY, Y + + addps 0 * SIZE(BUFFER), %xmm0 + + movlps %xmm0, (Y1) + addq INCY, Y1 + movhps %xmm0, (Y1) + addq INCY, Y1 + + addq $4 * SIZE, BUFFER + ALIGN_3 + +.L996: + testq $1, MM + jle .L999 + + movsd (Y), %xmm0 + + addps 0 * SIZE(BUFFER), %xmm0 + + movlps %xmm0, (Y1) + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/cgemv_t.S b/kernel/x86_64/cgemv_t.S new file mode 100644 index 0000000..c268e4f --- /dev/null +++ b/kernel/x86_64/cgemv_t.S @@ -0,0 +1,4378 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#if GEMV_UNROLL < 2 +#undef GEMV_UNROLL +#define GEMV_UNROLL 2 +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_INCX 8 + STACKSIZE(%rsp) +#define OLD_Y 16 + STACKSIZE(%rsp) +#define OLD_INCY 24 + STACKSIZE(%rsp) +#define OLD_BUFFER 32 + STACKSIZE(%rsp) +#define ALPHA 48 (%rsp) + +#define M %rdi +#define N %rsi +#define A %rcx +#define LDA %r8 +#define X %r9 +#define INCX %rdx +#define Y %rbp +#define INCY %r10 + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_LDA 56 + STACKSIZE(%rsp) +#define OLD_X 64 + STACKSIZE(%rsp) +#define OLD_INCX 72 + STACKSIZE(%rsp) +#define OLD_Y 80 + STACKSIZE(%rsp) +#define OLD_INCY 88 + STACKSIZE(%rsp) +#define OLD_BUFFER 96 + STACKSIZE(%rsp) +#define ALPHA 224 (%rsp) + +#define M %rcx +#define N %rdx +#define A %r8 +#define LDA %r9 +#define X %rdi +#define INCX %rsi +#define Y %rbp +#define INCY %r10 + +#endif + +#define I %rax +#define A1 %r11 +#define A2 %r12 + +#define X1 %rbx +#define Y1 %r13 +#define BUFFER %r14 + +#ifdef ALIGNED_ACCESS +#define MM %r15 +#else +#define MM M +#endif + +#undef SUBPS + +#ifndef CONJ +#define SUBPS addps +#else +#define SUBPS subps +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X + + movaps %xmm3, %xmm0 + movss OLD_ALPHA_I, %xmm1 +#endif + + movq OLD_INCX, INCX + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + salq $ZBASE_SHIFT, LDA + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + unpcklps %xmm1, %xmm0 + + movlps %xmm0, ALPHA + + testq M, M + jle .L999 + testq N, N + jle .L999 + + subq $-32 * SIZE, A + + movq BUFFER, X1 + +#ifdef ALIGNED_ACCESS + movq M, MM + movq A, %rax + andq $4 * SIZE - 1, %rax + cmpq $2 * SIZE, %rax + + jl .L0X + + movsd (X), %xmm0 + addq INCX, X + movlps %xmm0, 2 * SIZE(X1) + + addq $2 * SIZE, BUFFER + addq $4 * SIZE, X1 + decq MM + +.L0X: +#endif + + movq MM, I + sarq $3, I + jle .L05 + ALIGN_4 + +.L02: + movsd (X), %xmm0 + addq INCX, X + movhps (X), %xmm0 + addq INCX, X + + movsd (X), %xmm1 + addq INCX, X + movhps (X), %xmm1 + addq INCX, X + + movsd (X), %xmm2 + addq INCX, X + movhps (X), %xmm2 + addq INCX, X + + movsd (X), %xmm3 + addq INCX, X + movhps (X), %xmm3 + addq INCX, X + + movaps %xmm0, 0 * SIZE(X1) + movaps %xmm1, 4 * SIZE(X1) + movaps %xmm2, 8 * SIZE(X1) + movaps %xmm3, 12 * SIZE(X1) + + addq $16 * SIZE, X1 + decq I + jg .L02 + ALIGN_4 + +.L05: + movq MM, I + andq $7, I + jle .L10 + ALIGN_2 + +.L06: + movsd (X), %xmm0 + addq INCX, X + movlps %xmm0, 0 * SIZE(X1) + addq $2 * SIZE, X1 + decq I + jg .L06 + ALIGN_4 + +.L10: + movq Y, Y1 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + jne .L200 + + testq $2 * SIZE, LDA + jne .L100 +#endif + +#if GEMV_UNROLL >= 4 + cmpq $4, N + jl .L20 + ALIGN_3 + +.L11: + subq $4, N + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L1X + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm9, %xmm9 +#endif + movsd -32 * SIZE(A1, LDA), %xmm9 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd -32 * SIZE(A2), %xmm10 +#ifdef movsd + xorps %xmm11, %xmm11 +#endif + movsd -32 * SIZE(A2, LDA), %xmm11 + +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd -32 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm7 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 +.L1X: +#endif + + movaps -32 * SIZE(X1), %xmm12 + movaps -28 * SIZE(X1), %xmm13 + +#ifdef PREFETCHW + PREFETCHW 7 * SIZE(Y1) +#endif + + movq MM, I + sarq $3, I + jle .L15 + + MOVUPS_A1(-32 * SIZE, A1, %xmm8) + MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm9) + MOVUPS_A1(-32 * SIZE, A2, %xmm10) + MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm11) + + decq I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) +#endif + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-28 * SIZE, A1, %xmm8) + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm9) + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + MOVUPS_A1(-28 * SIZE, A2, %xmm10) + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm11) + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + movaps -24 * SIZE(X1), %xmm12 + SUBPS %xmm15, %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) +#endif + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-24 * SIZE, A1, %xmm8) + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm13, %xmm9 + addps %xmm9, %xmm2 + MOVUPS_A2(-24 * SIZE, A1, LDA, 1, %xmm9) + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm13, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm4 + MOVUPS_A1(-24 * SIZE, A2, %xmm10) + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm6 + MOVUPS_A2(-24 * SIZE, A2, LDA, 1, %xmm11) + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm13, %xmm15 + movaps -20 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) +#endif + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-20 * SIZE, A1, %xmm8) + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + MOVUPS_A2(-20 * SIZE, A1, LDA, 1, %xmm9) + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + MOVUPS_A1(-20 * SIZE, A2, %xmm10) + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + MOVUPS_A2(-20 * SIZE, A2, LDA, 1, %xmm11) + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + movaps -16 * SIZE(X1), %xmm12 + SUBPS %xmm15, %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) +#endif + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm13, %xmm9 + addps %xmm9, %xmm2 + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9) + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm13, %xmm15 + SUBPS %xmm15, %xmm3 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(X1) +#endif + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm4 + MOVUPS_A1(-16 * SIZE, A2, %xmm10) + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm6 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm11) + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm13, %xmm15 + movaps -12 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm7 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, X1 + + subq $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-28 * SIZE, A1, %xmm8) + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm9) + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + MOVUPS_A1(-28 * SIZE, A2, %xmm10) + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm11) + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + movaps -24 * SIZE(X1), %xmm12 + SUBPS %xmm15, %xmm7 + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-24 * SIZE, A1, %xmm8) + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm13, %xmm9 + addps %xmm9, %xmm2 + MOVUPS_A2(-24 * SIZE, A1, LDA, 1, %xmm9) + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm13, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm4 + MOVUPS_A1(-24 * SIZE, A2, %xmm10) + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm6 + MOVUPS_A2(-24 * SIZE, A2, LDA, 1, %xmm11) + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm13, %xmm15 + movaps -20 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm7 + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-20 * SIZE, A1, %xmm8) + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + MOVUPS_A2(-20 * SIZE, A1, LDA, 1, %xmm9) + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + MOVUPS_A1(-20 * SIZE, A2, %xmm10) + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + MOVUPS_A2(-20 * SIZE, A2, LDA, 1, %xmm11) + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + movaps -16 * SIZE(X1), %xmm12 + SUBPS %xmm15, %xmm7 + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm13, %xmm9 + addps %xmm9, %xmm2 + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm13, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm4 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm6 + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm13, %xmm15 + movaps -12 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm7 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, X1 + ALIGN_3 + +.L15: + testq $4, MM + je .L17 + + MOVUPS_A1(-32 * SIZE, A1, %xmm8) + MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm9) + MOVUPS_A1(-32 * SIZE, A2, %xmm10) + MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm11) + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-28 * SIZE, A1, %xmm8) + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm9) + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + MOVUPS_A1(-28 * SIZE, A2, %xmm10) + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm11) + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + movaps -24 * SIZE(X1), %xmm12 + SUBPS %xmm15, %xmm7 + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm13, %xmm9 + addps %xmm9, %xmm2 + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm13, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm4 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm6 + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm13, %xmm15 + movaps -20 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm7 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_3 + +.L17: + testq $2, MM + je .L18 + + MOVUPS_A1(-32 * SIZE, A1, %xmm8) + MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm9) + MOVUPS_A1(-32 * SIZE, A2, %xmm10) + MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm11) + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm7 + + movaps %xmm13, %xmm12 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L18: + testq $1, MM + je .L19 + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm9, %xmm9 +#endif + movsd -32 * SIZE(A1, LDA), %xmm9 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd -32 * SIZE(A2), %xmm10 +#ifdef movsd + xorps %xmm11, %xmm11 +#endif + movsd -32 * SIZE(A2, LDA), %xmm11 + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm7 + ALIGN_3 + +.L19: + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorps %xmm11, %xmm0 + xorps %xmm11, %xmm2 + xorps %xmm11, %xmm4 + xorps %xmm11, %xmm6 +#else + xorps %xmm11, %xmm1 + xorps %xmm11, %xmm3 + xorps %xmm11, %xmm5 + xorps %xmm11, %xmm7 +#endif + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 + haddps %xmm3, %xmm2 + haddps %xmm2, %xmm0 + + haddps %xmm5, %xmm4 + haddps %xmm7, %xmm6 + haddps %xmm6, %xmm4 +#else + + movaps %xmm0, %xmm8 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm8 + + movaps %xmm2, %xmm9 + unpcklps %xmm3, %xmm2 + unpckhps %xmm3, %xmm9 + + movaps %xmm4, %xmm10 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm10 + + movaps %xmm6, %xmm12 + unpcklps %xmm7, %xmm6 + unpckhps %xmm7, %xmm12 + + addps %xmm8, %xmm0 + addps %xmm9, %xmm2 + addps %xmm10, %xmm4 + addps %xmm12, %xmm6 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 + movhlps %xmm4, %xmm5 + movhlps %xmm6, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + + movlhps %xmm2, %xmm0 + movlhps %xmm6, %xmm4 +#endif + + pshufd $0xb1, %xmm0, %xmm1 + pshufd $0xb1, %xmm4, %xmm5 + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm15 +#else + movsd ALPHA, %xmm15 + pshufd $0x44, %xmm15, %xmm15 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm4 + mulps %xmm15, %xmm5 + + xorps %xmm11, %xmm0 + xorps %xmm11, %xmm4 + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 + haddps %xmm5, %xmm4 +#else + movaps %xmm0, %xmm2 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm1, %xmm2 + + movaps %xmm4, %xmm6 + shufps $0x88, %xmm5, %xmm4 + shufps $0xdd, %xmm5, %xmm6 + + addps %xmm2, %xmm0 + addps %xmm6, %xmm4 +#endif + + movsd (Y), %xmm2 + addq INCY, Y + movhps (Y), %xmm2 + addq INCY, Y + movsd (Y), %xmm6 + addq INCY, Y + movhps (Y), %xmm6 + addq INCY, Y + + shufps $0xd8, %xmm0, %xmm0 + shufps $0xd8, %xmm4, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm6, %xmm4 + + movlps %xmm0, (Y1) + addq INCY, Y1 + movhps %xmm0, (Y1) + addq INCY, Y1 + movlps %xmm4, (Y1) + addq INCY, Y1 + movhps %xmm4, (Y1) + addq INCY, Y1 + + cmpq $4, N + jge .L11 + ALIGN_3 + +.L20: +#endif + + cmpq $2, N + jl .L30 +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L21: +#endif + subq $2, N + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L2X + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm9, %xmm9 +#endif + movsd -32 * SIZE(A2), %xmm9 + +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd -32 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 +.L2X: +#endif + + movaps -32 * SIZE(X1), %xmm12 + movaps -28 * SIZE(X1), %xmm13 + +#if (GEMV_UNROLL == 2) && defined(PREFETCHW) + PREFETCHW 3 * SIZE(Y1) +#endif + + movq MM, I + sarq $3, I + jle .L25 + + MOVUPS_A1(-32 * SIZE, A1, %xmm8) + MOVUPS_A1(-32 * SIZE, A2, %xmm9) + MOVUPS_A1(-28 * SIZE, A1, %xmm10) + MOVUPS_A1(-28 * SIZE, A2, %xmm11) + + decq I + jle .L24 + ALIGN_3 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-24 * SIZE, A1, %xmm8) + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + MOVUPS_A1(-24 * SIZE, A2, %xmm9) + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + movaps -24 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + MOVUPS_A1(-20 * SIZE, A1, %xmm10) + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + pshufd $0xb1, %xmm11, %xmm7 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm2 + MOVUPS_A1(-20 * SIZE, A2, %xmm11) + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + + movaps -20 * SIZE(X1), %xmm13 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + MOVUPS_A1(-16 * SIZE, A2, %xmm9) + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + movaps -16 * SIZE(X1), %xmm12 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) +#endif + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm10) + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + pshufd $0xb1, %xmm11, %xmm7 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm2 + MOVUPS_A1(-12 * SIZE, A2, %xmm11) + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + + movaps -12 * SIZE(X1), %xmm13 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, X1 + + subq $1, I + BRANCH + jg .L23 + ALIGN_3 + +.L24: + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-24 * SIZE, A1, %xmm8) + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + MOVUPS_A1(-24 * SIZE, A2, %xmm9) + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + movaps -24 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + MOVUPS_A1(-20 * SIZE, A1, %xmm10) + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + pshufd $0xb1, %xmm11, %xmm7 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm2 + MOVUPS_A1(-20 * SIZE, A2, %xmm11) + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + + movaps -20 * SIZE(X1), %xmm13 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + movaps -16 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + pshufd $0xb1, %xmm11, %xmm7 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm2 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + + movaps -12 * SIZE(X1), %xmm13 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, X1 + ALIGN_3 + +.L25: + testq $4, MM + je .L27 + + MOVUPS_A1(-32 * SIZE, A1, %xmm8) + MOVUPS_A1(-32 * SIZE, A2, %xmm9) + MOVUPS_A1(-28 * SIZE, A1, %xmm10) + MOVUPS_A1(-28 * SIZE, A2, %xmm11) + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + movaps -24 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + pshufd $0xb1, %xmm11, %xmm7 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm2 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + + movaps -20 * SIZE(X1), %xmm13 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + ALIGN_3 + +.L27: + testq $2, MM + je .L28 + + MOVUPS_A1(-32 * SIZE, A1, %xmm8) + MOVUPS_A1(-32 * SIZE, A2, %xmm9) + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + movaps %xmm13, %xmm12 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L28: + testq $1, MM + je .L29 + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm9, %xmm9 +#endif + movsd -32 * SIZE(A2), %xmm9 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + ALIGN_3 + +.L29: + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorps %xmm5, %xmm0 + xorps %xmm5, %xmm2 +#else + xorps %xmm5, %xmm1 + xorps %xmm5, %xmm3 +#endif + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 + haddps %xmm3, %xmm2 + haddps %xmm2, %xmm0 +#else + movaps %xmm0, %xmm8 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm8 + + movaps %xmm2, %xmm4 + unpcklps %xmm3, %xmm2 + unpckhps %xmm3, %xmm4 + + addps %xmm8, %xmm0 + addps %xmm4, %xmm2 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + movlhps %xmm2, %xmm0 +#endif + + pshufd $0xb1, %xmm0, %xmm1 + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm15 +#else + movsd ALPHA, %xmm15 + pshufd $0x44, %xmm15, %xmm15 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + + xorps %xmm5, %xmm0 + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 +#else + movaps %xmm0, %xmm2 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm1, %xmm2 + + addps %xmm2, %xmm0 +#endif + + movsd (Y), %xmm12 + addq INCY, Y + movhps (Y), %xmm12 + addq INCY, Y + + shufps $0xd8, %xmm0, %xmm0 + + addps %xmm12, %xmm0 + + movlps %xmm0, (Y1) + addq INCY, Y1 + movhps %xmm0, (Y1) + addq INCY, Y1 + +#if GEMV_UNROLL == 2 + cmpq $2, N + jge .L21 +#endif + ALIGN_3 + +.L30: + cmpq $1, N + jl .L999 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L3X + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd -32 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_3 +.L3X: +#endif + + movaps -32 * SIZE(X1), %xmm12 + movaps -28 * SIZE(X1), %xmm13 + + movq MM, I + sarq $3, I + jle .L35 + + MOVUPS_A1(-32 * SIZE, A1, %xmm8) + MOVUPS_A1(-28 * SIZE, A1, %xmm10) + + decq I + jle .L34 + ALIGN_3 + +.L33: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-24 * SIZE, A1, %xmm8) + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + movaps -24 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + MOVUPS_A1(-20 * SIZE, A1, %xmm10) + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + movaps -20 * SIZE(X1), %xmm13 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + movaps -16 * SIZE(X1), %xmm12 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) +#endif + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm10) + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + movaps -12 * SIZE(X1), %xmm13 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, X1 + + subq $1, I + BRANCH + jg .L33 + ALIGN_3 + +.L34: + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-24 * SIZE, A1, %xmm8) + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + movaps -24 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + MOVUPS_A1(-20 * SIZE, A1, %xmm10) + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + movaps -20 * SIZE(X1), %xmm13 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + movaps -16 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + movaps -12 * SIZE(X1), %xmm13 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, X1 + ALIGN_3 + +.L35: + testq $4, MM + je .L37 + + MOVUPS_A1(-32 * SIZE, A1, %xmm8) + MOVUPS_A1(-28 * SIZE, A1, %xmm10) + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + movaps -24 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + movaps -20 * SIZE(X1), %xmm13 + + addq $8 * SIZE, A1 + ALIGN_3 + +.L37: + testq $2, MM + je .L38 + + MOVUPS_A1(-32 * SIZE, A1, %xmm8) + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + movaps %xmm13, %xmm12 + + addq $4 * SIZE, A1 + ALIGN_3 + +.L38: + testq $1, MM + je .L39 + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + ALIGN_3 + +.L39: + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorps %xmm5, %xmm0 +#else + xorps %xmm5, %xmm1 +#endif + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 + haddps %xmm0, %xmm0 +#else + movaps %xmm0, %xmm8 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm8 + + addps %xmm8, %xmm0 + + movhlps %xmm0, %xmm1 + + addps %xmm1, %xmm0 +#endif + + pshufd $0xb1, %xmm0, %xmm1 + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm15 +#else + movsd ALPHA, %xmm15 + pshufd $0x44, %xmm15, %xmm15 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + + xorps %xmm5, %xmm0 + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 +#else + movaps %xmm0, %xmm2 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm1, %xmm2 + + addps %xmm2, %xmm0 +#endif + + movsd (Y), %xmm12 + addq INCY, Y + + shufps $0xd8, %xmm0, %xmm0 + + addps %xmm12, %xmm0 + + movlps %xmm0, (Y1) + addq INCY, Y1 +#ifdef ALIGNED_ACCESS + jmp .L999 + ALIGN_3 + +.L100: + +#if GEMV_UNROLL >= 4 + cmpq $4, N + jl .L110 + ALIGN_3 + +.L101: + subq $4, N + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L10X + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm9, %xmm9 +#endif + movsd -32 * SIZE(A1, LDA), %xmm9 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd -32 * SIZE(A2), %xmm10 +#ifdef movsd + xorps %xmm11, %xmm11 +#endif + movsd -32 * SIZE(A2, LDA), %xmm11 + +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd -32 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm7 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 +.L10X: +#endif + + movaps -32 * SIZE(X1), %xmm12 + movaps -28 * SIZE(X1), %xmm13 + +#ifdef PREFETCHW + PREFETCHW 7 * SIZE(Y1) +#endif + + movq MM, I + sarq $3, I + jle .L105 + + movaps -32 * SIZE(A1), %xmm8 + movsd -32 * SIZE(A1, LDA), %xmm9 + movhps -30 * SIZE(A1, LDA), %xmm9 + + movaps -32 * SIZE(A2), %xmm10 + movsd -32 * SIZE(A2, LDA), %xmm11 + movhps -30 * SIZE(A2, LDA), %xmm11 + + decq I + jle .L104 + ALIGN_3 + +.L103: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) +#endif + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + movaps -28 * SIZE(A1), %xmm8 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + movsd -28 * SIZE(A1, LDA), %xmm9 + movhps -26 * SIZE(A1, LDA), %xmm9 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + movaps -28 * SIZE(A2), %xmm10 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + movsd -28 * SIZE(A2, LDA), %xmm11 + movhps -26 * SIZE(A2, LDA), %xmm11 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + movaps -24 * SIZE(X1), %xmm12 + SUBPS %xmm15, %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) +#endif + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -24 * SIZE(A1), %xmm8 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm13, %xmm9 + addps %xmm9, %xmm2 + movsd -24 * SIZE(A1, LDA), %xmm9 + movhps -22 * SIZE(A1, LDA), %xmm9 + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm13, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm4 + movaps -24 * SIZE(A2), %xmm10 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm6 + movsd -24 * SIZE(A2, LDA), %xmm11 + movhps -22 * SIZE(A2, LDA), %xmm11 + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm13, %xmm15 + movaps -20 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) +#endif + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + movaps -20 * SIZE(A1), %xmm8 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + movsd -20 * SIZE(A1, LDA), %xmm9 + movhps -18 * SIZE(A1, LDA), %xmm9 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + movaps -20 * SIZE(A2), %xmm10 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + movsd -20 * SIZE(A2, LDA), %xmm11 + movhps -18 * SIZE(A2, LDA), %xmm11 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + movaps -16 * SIZE(X1), %xmm12 + SUBPS %xmm15, %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) +#endif + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -16 * SIZE(A1), %xmm8 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm13, %xmm9 + addps %xmm9, %xmm2 + movsd -16 * SIZE(A1, LDA), %xmm9 + movhps -14 * SIZE(A1, LDA), %xmm9 + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm13, %xmm15 + SUBPS %xmm15, %xmm3 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(X1) +#endif + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm4 + movaps -16 * SIZE(A2), %xmm10 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm6 + movsd -16 * SIZE(A2, LDA), %xmm11 + movhps -14 * SIZE(A2, LDA), %xmm11 + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm13, %xmm15 + movaps -12 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm7 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, X1 + + subq $1, I + BRANCH + jg .L103 + ALIGN_3 + +.L104: + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + movaps -28 * SIZE(A1), %xmm8 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + movsd -28 * SIZE(A1, LDA), %xmm9 + movhps -26 * SIZE(A1, LDA), %xmm9 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + movaps -28 * SIZE(A2), %xmm10 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + movsd -28 * SIZE(A2, LDA), %xmm11 + movhps -26 * SIZE(A2, LDA), %xmm11 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + movaps -24 * SIZE(X1), %xmm12 + SUBPS %xmm15, %xmm7 + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -24 * SIZE(A1), %xmm8 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm13, %xmm9 + addps %xmm9, %xmm2 + movsd -24 * SIZE(A1, LDA), %xmm9 + movhps -22 * SIZE(A1, LDA), %xmm9 + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm13, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm4 + movaps -24 * SIZE(A2), %xmm10 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm6 + movsd -24 * SIZE(A2, LDA), %xmm11 + movhps -22 * SIZE(A2, LDA), %xmm11 + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm13, %xmm15 + movaps -20 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm7 + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + movaps -20 * SIZE(A1), %xmm8 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + movsd -20 * SIZE(A1, LDA), %xmm9 + movhps -18 * SIZE(A1, LDA), %xmm9 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + movaps -20 * SIZE(A2), %xmm10 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + movsd -20 * SIZE(A2, LDA), %xmm11 + movhps -18 * SIZE(A2, LDA), %xmm11 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + movaps -16 * SIZE(X1), %xmm12 + SUBPS %xmm15, %xmm7 + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm13, %xmm9 + addps %xmm9, %xmm2 + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm13, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm4 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm6 + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm13, %xmm15 + movaps -12 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm7 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, X1 + ALIGN_3 + +.L105: + testq $4, MM + je .L107 + + movaps -32 * SIZE(A1), %xmm8 + movsd -32 * SIZE(A1, LDA), %xmm9 + movhps -30 * SIZE(A1, LDA), %xmm9 + + movaps -32 * SIZE(A2), %xmm10 + movsd -32 * SIZE(A2, LDA), %xmm11 + movhps -30 * SIZE(A2, LDA), %xmm11 + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + movaps -28 * SIZE(A1), %xmm8 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + movsd -28 * SIZE(A1, LDA), %xmm9 + movhps -26 * SIZE(A1, LDA), %xmm9 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + movaps -28 * SIZE(A2), %xmm10 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + movsd -28 * SIZE(A2, LDA), %xmm11 + movhps -26 * SIZE(A2, LDA), %xmm11 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + movaps -24 * SIZE(X1), %xmm12 + SUBPS %xmm15, %xmm7 + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm13, %xmm9 + addps %xmm9, %xmm2 + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm13, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm4 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm6 + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm13, %xmm15 + movaps -20 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm7 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_3 + +.L107: + testq $2, MM + je .L108 + + movaps -32 * SIZE(A1), %xmm8 + movsd -32 * SIZE(A1, LDA), %xmm9 + movhps -30 * SIZE(A1, LDA), %xmm9 + + movaps -32 * SIZE(A2), %xmm10 + movsd -32 * SIZE(A2, LDA), %xmm11 + movhps -30 * SIZE(A2, LDA), %xmm11 + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm7 + + movaps %xmm13, %xmm12 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L108: + testq $1, MM + je .L109 + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm9, %xmm9 +#endif + movsd -32 * SIZE(A1, LDA), %xmm9 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd -32 * SIZE(A2), %xmm10 +#ifdef movsd + xorps %xmm11, %xmm11 +#endif + movsd -32 * SIZE(A2, LDA), %xmm11 + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm7 + ALIGN_3 + +.L109: + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorps %xmm11, %xmm0 + xorps %xmm11, %xmm2 + xorps %xmm11, %xmm4 + xorps %xmm11, %xmm6 +#else + xorps %xmm11, %xmm1 + xorps %xmm11, %xmm3 + xorps %xmm11, %xmm5 + xorps %xmm11, %xmm7 +#endif + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 + haddps %xmm3, %xmm2 + haddps %xmm2, %xmm0 + + haddps %xmm5, %xmm4 + haddps %xmm7, %xmm6 + haddps %xmm6, %xmm4 +#else + + movaps %xmm0, %xmm8 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm8 + + movaps %xmm2, %xmm9 + unpcklps %xmm3, %xmm2 + unpckhps %xmm3, %xmm9 + + movaps %xmm4, %xmm10 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm10 + + movaps %xmm6, %xmm11 + unpcklps %xmm7, %xmm6 + unpckhps %xmm7, %xmm11 + + addps %xmm8, %xmm0 + addps %xmm9, %xmm2 + addps %xmm10, %xmm4 + addps %xmm11, %xmm6 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 + movhlps %xmm4, %xmm5 + movhlps %xmm6, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + + movlhps %xmm2, %xmm0 + movlhps %xmm6, %xmm4 +#endif + + pshufd $0xb1, %xmm0, %xmm1 + pshufd $0xb1, %xmm4, %xmm5 + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm15 +#else + movsd ALPHA, %xmm15 + pshufd $0x44, %xmm15, %xmm15 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm4 + mulps %xmm15, %xmm5 + + xorps %xmm11, %xmm0 + xorps %xmm11, %xmm4 + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 + haddps %xmm5, %xmm4 +#else + movaps %xmm0, %xmm2 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm1, %xmm2 + + movaps %xmm4, %xmm6 + shufps $0x88, %xmm5, %xmm4 + shufps $0xdd, %xmm5, %xmm6 + + addps %xmm2, %xmm0 + addps %xmm6, %xmm4 +#endif + + movsd (Y), %xmm2 + addq INCY, Y + movhps (Y), %xmm2 + addq INCY, Y + movsd (Y), %xmm6 + addq INCY, Y + movhps (Y), %xmm6 + addq INCY, Y + + shufps $0xd8, %xmm0, %xmm0 + shufps $0xd8, %xmm4, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm6, %xmm4 + + movlps %xmm0, (Y1) + addq INCY, Y1 + movhps %xmm0, (Y1) + addq INCY, Y1 + movlps %xmm4, (Y1) + addq INCY, Y1 + movhps %xmm4, (Y1) + addq INCY, Y1 + + cmpq $4, N + jge .L101 + ALIGN_3 + +.L110: +#endif + + cmpq $2, N + jl .L120 +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L111: +#endif + subq $2, N + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L11X + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm9, %xmm9 +#endif + movsd -32 * SIZE(A2), %xmm9 + +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd -32 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 +.L11X: +#endif + + movaps -32 * SIZE(X1), %xmm12 + movaps -28 * SIZE(X1), %xmm13 + +#if (GEMV_UNROLL == 2) && defined(PREFETCHW) + PREFETCHW 3 * SIZE(Y1) +#endif + + movq MM, I + sarq $3, I + jle .L115 + + movaps -32 * SIZE(A1), %xmm8 + movsd -32 * SIZE(A2), %xmm9 + movhps -30 * SIZE(A2), %xmm9 + + movaps -28 * SIZE(A1), %xmm10 + movsd -28 * SIZE(A2), %xmm11 + movhps -26 * SIZE(A2), %xmm11 + + decq I + jle .L114 + ALIGN_3 + +.L113: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + movaps -24 * SIZE(A1), %xmm8 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + movsd -24 * SIZE(A2), %xmm9 + movhps -22 * SIZE(A2), %xmm9 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + movaps -24 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + movaps -20 * SIZE(A1), %xmm10 + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + pshufd $0xb1, %xmm11, %xmm7 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm2 + movsd -20 * SIZE(A2), %xmm11 + movhps -18 * SIZE(A2), %xmm11 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + + movaps -20 * SIZE(X1), %xmm13 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + movaps -16 * SIZE(A1), %xmm8 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + movsd -16 * SIZE(A2), %xmm9 + movhps -14 * SIZE(A2), %xmm9 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + movaps -16 * SIZE(X1), %xmm12 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) +#endif + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + movaps -12 * SIZE(A1), %xmm10 + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + pshufd $0xb1, %xmm11, %xmm7 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm2 + movsd -12 * SIZE(A2), %xmm11 + movhps -10 * SIZE(A2), %xmm11 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + + movaps -12 * SIZE(X1), %xmm13 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, X1 + + subq $1, I + BRANCH + jg .L113 + ALIGN_3 + +.L114: + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + movaps -24 * SIZE(A1), %xmm8 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + movsd -24 * SIZE(A2), %xmm9 + movhps -22 * SIZE(A2), %xmm9 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + movaps -24 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + movaps -20 * SIZE(A1), %xmm10 + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + pshufd $0xb1, %xmm11, %xmm7 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm2 + movsd -20 * SIZE(A2), %xmm11 + movhps -18 * SIZE(A2), %xmm11 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + + movaps -20 * SIZE(X1), %xmm13 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + movaps -16 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + pshufd $0xb1, %xmm11, %xmm7 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm2 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + + movaps -12 * SIZE(X1), %xmm13 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, X1 + ALIGN_3 + +.L115: + testq $4, MM + je .L117 + + movaps -32 * SIZE(A1), %xmm8 + movsd -32 * SIZE(A2), %xmm9 + movhps -30 * SIZE(A2), %xmm9 + + movaps -28 * SIZE(A1), %xmm10 + movsd -28 * SIZE(A2), %xmm11 + movhps -26 * SIZE(A2), %xmm11 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + movaps -24 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + pshufd $0xb1, %xmm11, %xmm7 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm2 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + + movaps -20 * SIZE(X1), %xmm13 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + ALIGN_3 + +.L117: + testq $2, MM + je .L118 + + movaps -32 * SIZE(A1), %xmm8 + movsd -32 * SIZE(A2), %xmm9 + movhps -30 * SIZE(A2), %xmm9 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + movaps %xmm13, %xmm12 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L118: + testq $1, MM + je .L119 + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm9, %xmm9 +#endif + movsd -32 * SIZE(A2), %xmm9 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + ALIGN_3 + +.L119: + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorps %xmm5, %xmm0 + xorps %xmm5, %xmm2 +#else + xorps %xmm5, %xmm1 + xorps %xmm5, %xmm3 +#endif + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 + haddps %xmm3, %xmm2 + haddps %xmm2, %xmm0 +#else + movaps %xmm0, %xmm8 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm8 + + movaps %xmm2, %xmm4 + unpcklps %xmm3, %xmm2 + unpckhps %xmm3, %xmm4 + + addps %xmm8, %xmm0 + addps %xmm4, %xmm2 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + movlhps %xmm2, %xmm0 +#endif + + pshufd $0xb1, %xmm0, %xmm1 + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm15 +#else + movsd ALPHA, %xmm15 + pshufd $0x44, %xmm15, %xmm15 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + + xorps %xmm5, %xmm0 + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 +#else + movaps %xmm0, %xmm2 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm1, %xmm2 + + addps %xmm2, %xmm0 +#endif + + movsd (Y), %xmm12 + addq INCY, Y + movhps (Y), %xmm12 + addq INCY, Y + + shufps $0xd8, %xmm0, %xmm0 + + addps %xmm12, %xmm0 + + movlps %xmm0, (Y1) + addq INCY, Y1 + movhps %xmm0, (Y1) + addq INCY, Y1 + +#if GEMV_UNROLL == 2 + cmpq $2, N + jge .L111 +#endif + ALIGN_3 + +.L120: + cmpq $1, N + jl .L999 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L12X + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd -32 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_3 +.L12X: +#endif + + movaps -32 * SIZE(X1), %xmm12 + movaps -28 * SIZE(X1), %xmm13 + + movq MM, I + sarq $3, I + jle .L125 + + MOVUPS_A1(-32 * SIZE, A1, %xmm8) + MOVUPS_A1(-28 * SIZE, A1, %xmm10) + + decq I + jle .L124 + ALIGN_3 + +.L123: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-24 * SIZE, A1, %xmm8) + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + movaps -24 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + MOVUPS_A1(-20 * SIZE, A1, %xmm10) + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + movaps -20 * SIZE(X1), %xmm13 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + movaps -16 * SIZE(X1), %xmm12 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) +#endif + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm10) + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + movaps -12 * SIZE(X1), %xmm13 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, X1 + + subq $1, I + BRANCH + jg .L123 + ALIGN_3 + +.L124: + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-24 * SIZE, A1, %xmm8) + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + movaps -24 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + MOVUPS_A1(-20 * SIZE, A1, %xmm10) + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + movaps -20 * SIZE(X1), %xmm13 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + movaps -16 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + movaps -12 * SIZE(X1), %xmm13 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, X1 + ALIGN_3 + +.L125: + testq $4, MM + je .L127 + + MOVUPS_A1(-32 * SIZE, A1, %xmm8) + MOVUPS_A1(-28 * SIZE, A1, %xmm10) + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + movaps -24 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + movaps -20 * SIZE(X1), %xmm13 + + addq $8 * SIZE, A1 + ALIGN_3 + +.L127: + testq $2, MM + je .L128 + + MOVUPS_A1(-32 * SIZE, A1, %xmm8) + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + movaps %xmm13, %xmm12 + + addq $4 * SIZE, A1 + ALIGN_3 + +.L128: + testq $1, MM + je .L129 + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + ALIGN_3 + +.L129: + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorps %xmm5, %xmm0 +#else + xorps %xmm5, %xmm1 +#endif + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 + haddps %xmm0, %xmm0 +#else + movaps %xmm0, %xmm8 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm8 + + addps %xmm8, %xmm0 + + movhlps %xmm0, %xmm1 + + addps %xmm1, %xmm0 +#endif + + pshufd $0xb1, %xmm0, %xmm1 + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm15 +#else + movsd ALPHA, %xmm15 + pshufd $0x44, %xmm15, %xmm15 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + + xorps %xmm5, %xmm0 + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 +#else + movaps %xmm0, %xmm2 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm1, %xmm2 + + addps %xmm2, %xmm0 +#endif + + movsd (Y), %xmm12 + addq INCY, Y + + shufps $0xd8, %xmm0, %xmm0 + + addps %xmm12, %xmm0 + + movlps %xmm0, (Y1) + addq INCY, Y1 + jmp .L999 + ALIGN_3 + + +.L200: + testq $2 * SIZE, LDA + jne .L300 + + cmpq $2, N + jl .L210 + ALIGN_3 + +.L201: + subq $2, N + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L20X + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm9, %xmm9 +#endif + movsd -32 * SIZE(A2), %xmm9 + +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd -32 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 +.L20X: +#endif + + movaps -33 * SIZE(A1), %xmm4 + movaps -33 * SIZE(A2), %xmm5 + + movaps -32 * SIZE(X1), %xmm12 + movaps -28 * SIZE(X1), %xmm13 + +#ifdef PREFETCHW + PREFETCHW 3 * SIZE(Y1) +#endif + + movq MM, I + sarq $3, I + jle .L205 + + movaps -29 * SIZE(A1), %xmm6 + movaps -29 * SIZE(A2), %xmm7 + + decq I + jle .L204 + ALIGN_3 + +.L203: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -25 * SIZE(A1), %xmm4 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm7, %xmm5 + shufps $0x39, %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm14 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm2 + movaps -25 * SIZE(A2), %xmm5 + mulps %xmm12, %xmm14 + movaps -24 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm3 + + movss %xmm4, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm14 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm0 + movaps -21 * SIZE(A1), %xmm6 + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm5, %xmm7 + shufps $0x39, %xmm7, %xmm7 + pshufd $0xb1, %xmm7, %xmm14 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm2 + movaps -21 * SIZE(A2), %xmm7 + mulps %xmm13, %xmm14 + movaps -20 * SIZE(X1), %xmm13 + SUBPS %xmm14, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm7, %xmm5 + shufps $0x39, %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm14 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm2 + movaps -17 * SIZE(A2), %xmm5 + mulps %xmm12, %xmm14 + movaps -16 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm3 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) +#endif + + movss %xmm4, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm14 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm0 + movaps -13 * SIZE(A1), %xmm6 + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm5, %xmm7 + shufps $0x39, %xmm7, %xmm7 + pshufd $0xb1, %xmm7, %xmm14 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm2 + movaps -13 * SIZE(A2), %xmm7 + mulps %xmm13, %xmm14 + movaps -12 * SIZE(X1), %xmm13 + SUBPS %xmm14, %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, X1 + + subq $1, I + BRANCH + jg .L203 + ALIGN_3 + +.L204: + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -25 * SIZE(A1), %xmm4 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm7, %xmm5 + shufps $0x39, %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm14 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm2 + movaps -25 * SIZE(A2), %xmm5 + mulps %xmm12, %xmm14 + movaps -24 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm3 + + movss %xmm4, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm14 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm0 + movaps -21 * SIZE(A1), %xmm6 + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm5, %xmm7 + shufps $0x39, %xmm7, %xmm7 + pshufd $0xb1, %xmm7, %xmm14 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm2 + movaps -21 * SIZE(A2), %xmm7 + mulps %xmm13, %xmm14 + movaps -20 * SIZE(X1), %xmm13 + SUBPS %xmm14, %xmm3 + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm7, %xmm5 + shufps $0x39, %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm14 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm2 + movaps -17 * SIZE(A2), %xmm5 + mulps %xmm12, %xmm14 + movaps -16 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm3 + + movss %xmm4, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm14 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm5, %xmm7 + shufps $0x39, %xmm7, %xmm7 + pshufd $0xb1, %xmm7, %xmm14 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm2 + mulps %xmm13, %xmm14 + movaps -12 * SIZE(X1), %xmm13 + SUBPS %xmm14, %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, X1 + ALIGN_3 + +.L205: + testq $4, MM + je .L207 + + movaps -29 * SIZE(A1), %xmm6 + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movaps -29 * SIZE(A2), %xmm7 + + movss %xmm7, %xmm5 + shufps $0x39, %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm14 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm2 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm3 + + movaps -25 * SIZE(A1), %xmm8 + + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm14 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + + movaps -25 * SIZE(A2), %xmm9 + + movss %xmm9, %xmm7 + shufps $0x39, %xmm7, %xmm7 + pshufd $0xb1, %xmm7, %xmm14 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm2 + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm3 + + movaps %xmm8, %xmm4 + movaps %xmm9, %xmm5 + + movaps -24 * SIZE(X1), %xmm12 + movaps -20 * SIZE(X1), %xmm13 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + ALIGN_3 + +.L207: + testq $2, MM + je .L208 + + movaps -29 * SIZE(A1), %xmm6 + movaps -29 * SIZE(A2), %xmm7 + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm7, %xmm5 + shufps $0x39, %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm14 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm2 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm3 + + movaps %xmm6, %xmm4 + movaps %xmm7, %xmm5 + movaps %xmm13, %xmm12 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L208: + testq $1, MM + je .L209 + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm9, %xmm9 +#endif + movsd -32 * SIZE(A2), %xmm9 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + ALIGN_3 + +.L209: + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorps %xmm5, %xmm0 + xorps %xmm5, %xmm2 +#else + xorps %xmm5, %xmm1 + xorps %xmm5, %xmm3 +#endif + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 + haddps %xmm3, %xmm2 + haddps %xmm2, %xmm0 +#else + movaps %xmm0, %xmm8 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm8 + + movaps %xmm2, %xmm4 + unpcklps %xmm3, %xmm2 + unpckhps %xmm3, %xmm4 + + addps %xmm8, %xmm0 + addps %xmm4, %xmm2 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + movlhps %xmm2, %xmm0 +#endif + + pshufd $0xb1, %xmm0, %xmm1 + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm15 +#else + movsd ALPHA, %xmm15 + pshufd $0x44, %xmm15, %xmm15 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + + xorps %xmm5, %xmm0 + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 +#else + movaps %xmm0, %xmm2 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm1, %xmm2 + + addps %xmm2, %xmm0 +#endif + + movsd (Y), %xmm12 + addq INCY, Y + movhps (Y), %xmm12 + addq INCY, Y + + shufps $0xd8, %xmm0, %xmm0 + + addps %xmm12, %xmm0 + + movlps %xmm0, (Y1) + addq INCY, Y1 + movhps %xmm0, (Y1) + addq INCY, Y1 + + cmpq $2, N + jge .L201 + ALIGN_3 + +.L210: + cmpq $1, N + jl .L999 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L21X + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd -32 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_3 +.L21X: +#endif + + movaps -33 * SIZE(A1), %xmm4 + + movaps -32 * SIZE(X1), %xmm12 + movaps -28 * SIZE(X1), %xmm13 + + movq MM, I + sarq $3, I + jle .L215 + + movaps -29 * SIZE(A1), %xmm5 + movaps -25 * SIZE(A1), %xmm6 + movaps -21 * SIZE(A1), %xmm7 + + decq I + jle .L214 + ALIGN_3 + +.L213: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + mulps %xmm12, %xmm14 + movaps -24 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm1 + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm15 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + movaps -13 * SIZE(A1), %xmm5 + mulps %xmm13, %xmm15 + movaps -20 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm1 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) +#endif + + movss %xmm7, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm14 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm0 + movaps -9 * SIZE(A1), %xmm6 + mulps %xmm12, %xmm14 + movaps -16 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm1 + + movss %xmm4, %xmm7 + shufps $0x39, %xmm7, %xmm7 + pshufd $0xb1, %xmm7, %xmm15 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm0 + movaps -5 * SIZE(A1), %xmm7 + mulps %xmm13, %xmm15 + movaps -12 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm1 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, X1 + + subq $1, I + BRANCH + jg .L213 + ALIGN_3 + +.L214: + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + mulps %xmm12, %xmm14 + movaps -24 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm1 + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm15 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm13, %xmm15 + movaps -20 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm1 + + movss %xmm7, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm14 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm12, %xmm14 + movaps -16 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm1 + + movss %xmm4, %xmm7 + shufps $0x39, %xmm7, %xmm7 + pshufd $0xb1, %xmm7, %xmm15 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm13, %xmm15 + movaps -12 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm1 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, X1 + ALIGN_3 + +.L215: + testq $4, MM + je .L217 + + movaps -29 * SIZE(A1), %xmm5 + movaps -25 * SIZE(A1), %xmm6 + + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm15 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm13, %xmm15 + SUBPS %xmm15, %xmm1 + + movaps -24 * SIZE(X1), %xmm12 + movaps -20 * SIZE(X1), %xmm13 + movaps %xmm6, %xmm4 + + addq $8 * SIZE, A1 + ALIGN_3 + +.L217: + testq $2, MM + je .L218 + + movaps -29 * SIZE(A1), %xmm5 + + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movaps %xmm13, %xmm12 + + addq $4 * SIZE, A1 + ALIGN_3 + +.L218: + testq $1, MM + je .L219 + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + ALIGN_3 + +.L219: + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorps %xmm5, %xmm0 +#else + xorps %xmm5, %xmm1 +#endif + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 + haddps %xmm0, %xmm0 +#else + movaps %xmm0, %xmm8 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm8 + + addps %xmm8, %xmm0 + + movhlps %xmm0, %xmm1 + + addps %xmm1, %xmm0 +#endif + + pshufd $0xb1, %xmm0, %xmm1 + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm15 +#else + movsd ALPHA, %xmm15 + pshufd $0x44, %xmm15, %xmm15 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + + xorps %xmm5, %xmm0 + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 +#else + movaps %xmm0, %xmm2 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm1, %xmm2 + + addps %xmm2, %xmm0 +#endif + + movsd (Y), %xmm12 + addq INCY, Y + + shufps $0xd8, %xmm0, %xmm0 + + addps %xmm12, %xmm0 + + movlps %xmm0, (Y1) + addq INCY, Y1 + jmp .L999 + +.L300: + cmpq $2, N + jl .L310 + ALIGN_3 + +.L301: + subq $2, N + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L30X + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm9, %xmm9 +#endif + movsd -32 * SIZE(A2), %xmm9 + +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd -32 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 +.L30X: +#endif + + movaps -33 * SIZE(A1), %xmm4 + movaps -35 * SIZE(A2), %xmm5 + + movaps -32 * SIZE(X1), %xmm12 + movaps -28 * SIZE(X1), %xmm13 + +#ifdef PREFETCHW + PREFETCHW 3 * SIZE(Y1) +#endif + + movq MM, I + sarq $3, I + jle .L305 + + movaps -29 * SIZE(A1), %xmm6 + movaps -31 * SIZE(A2), %xmm7 + + decq I + jle .L304 + ALIGN_3 + +.L303: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -25 * SIZE(A1), %xmm4 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm7, %xmm5 + shufps $0x93, %xmm7, %xmm5 + pshufd $0xb1, %xmm5, %xmm14 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm2 + movaps -27 * SIZE(A2), %xmm5 + mulps %xmm12, %xmm14 + movaps -24 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm3 + + movss %xmm4, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm14 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm0 + movaps -21 * SIZE(A1), %xmm6 + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm5, %xmm7 + shufps $0x93, %xmm5, %xmm7 + pshufd $0xb1, %xmm7, %xmm14 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm2 + movaps -23 * SIZE(A2), %xmm7 + mulps %xmm13, %xmm14 + movaps -20 * SIZE(X1), %xmm13 + SUBPS %xmm14, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm7, %xmm5 + shufps $0x93, %xmm7, %xmm5 + pshufd $0xb1, %xmm5, %xmm14 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm2 + movaps -19 * SIZE(A2), %xmm5 + mulps %xmm12, %xmm14 + movaps -16 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm3 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) +#endif + + movss %xmm4, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm14 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm0 + movaps -13 * SIZE(A1), %xmm6 + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm5, %xmm7 + shufps $0x93, %xmm5, %xmm7 + pshufd $0xb1, %xmm7, %xmm14 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm2 + movaps -15 * SIZE(A2), %xmm7 + mulps %xmm13, %xmm14 + movaps -12 * SIZE(X1), %xmm13 + SUBPS %xmm14, %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, X1 + + subq $1, I + BRANCH + jg .L303 + ALIGN_3 + +.L304: + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -25 * SIZE(A1), %xmm4 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm7, %xmm5 + shufps $0x93, %xmm7, %xmm5 + pshufd $0xb1, %xmm5, %xmm14 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm2 + movaps -27 * SIZE(A2), %xmm5 + mulps %xmm12, %xmm14 + movaps -24 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm3 + + movss %xmm4, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm14 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm0 + movaps -21 * SIZE(A1), %xmm6 + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm5, %xmm7 + shufps $0x93, %xmm5, %xmm7 + pshufd $0xb1, %xmm7, %xmm14 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm2 + movaps -23 * SIZE(A2), %xmm7 + mulps %xmm13, %xmm14 + movaps -20 * SIZE(X1), %xmm13 + SUBPS %xmm14, %xmm3 + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm7, %xmm5 + shufps $0x93, %xmm7, %xmm5 + pshufd $0xb1, %xmm5, %xmm14 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm2 + movaps -19 * SIZE(A2), %xmm5 + mulps %xmm12, %xmm14 + movaps -16 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm3 + + movss %xmm4, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm14 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm5, %xmm7 + shufps $0x93, %xmm5, %xmm7 + pshufd $0xb1, %xmm7, %xmm14 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm2 + mulps %xmm13, %xmm14 + movaps -12 * SIZE(X1), %xmm13 + SUBPS %xmm14, %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, X1 + ALIGN_3 + +.L305: + testq $4, MM + je .L307 + + movaps -29 * SIZE(A1), %xmm6 + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movaps -31 * SIZE(A2), %xmm7 + + movss %xmm7, %xmm5 + shufps $0x93, %xmm7, %xmm5 + pshufd $0xb1, %xmm5, %xmm14 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm2 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm3 + + movaps -25 * SIZE(A1), %xmm8 + + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm14 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + + movaps -27 * SIZE(A2), %xmm9 + + movss %xmm9, %xmm7 + shufps $0x93, %xmm9, %xmm7 + pshufd $0xb1, %xmm7, %xmm14 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm2 + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm3 + + movaps %xmm8, %xmm4 + movaps %xmm9, %xmm5 + + movaps -24 * SIZE(X1), %xmm12 + movaps -20 * SIZE(X1), %xmm13 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + ALIGN_3 + +.L307: + testq $2, MM + je .L308 + + movaps -29 * SIZE(A1), %xmm6 + movaps -31 * SIZE(A2), %xmm7 + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm7, %xmm5 + shufps $0x93, %xmm7, %xmm5 + pshufd $0xb1, %xmm5, %xmm14 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm2 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm3 + + movaps %xmm6, %xmm4 + movaps %xmm7, %xmm5 + movaps %xmm13, %xmm12 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L308: + testq $1, MM + je .L309 + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm9, %xmm9 +#endif + movsd -32 * SIZE(A2), %xmm9 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + ALIGN_3 + +.L309: + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorps %xmm5, %xmm0 + xorps %xmm5, %xmm2 +#else + xorps %xmm5, %xmm1 + xorps %xmm5, %xmm3 +#endif + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 + haddps %xmm3, %xmm2 + haddps %xmm2, %xmm0 +#else + movaps %xmm0, %xmm8 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm8 + + movaps %xmm2, %xmm4 + unpcklps %xmm3, %xmm2 + unpckhps %xmm3, %xmm4 + + addps %xmm8, %xmm0 + addps %xmm4, %xmm2 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + movlhps %xmm2, %xmm0 +#endif + + pshufd $0xb1, %xmm0, %xmm1 + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm15 +#else + movsd ALPHA, %xmm15 + pshufd $0x44, %xmm15, %xmm15 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + + xorps %xmm5, %xmm0 + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 +#else + movaps %xmm0, %xmm2 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm1, %xmm2 + + addps %xmm2, %xmm0 +#endif + + movsd (Y), %xmm12 + addq INCY, Y + movhps (Y), %xmm12 + addq INCY, Y + + shufps $0xd8, %xmm0, %xmm0 + + addps %xmm12, %xmm0 + + movlps %xmm0, (Y1) + addq INCY, Y1 + movhps %xmm0, (Y1) + addq INCY, Y1 + + cmpq $2, N + jge .L301 + ALIGN_3 + +.L310: + cmpq $1, N + jl .L999 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L31X + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd -32 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_3 +.L31X: +#endif + + movaps -33 * SIZE(A1), %xmm4 + + movaps -32 * SIZE(X1), %xmm12 + movaps -28 * SIZE(X1), %xmm13 + + movq MM, I + sarq $3, I + jle .L315 + + movaps -29 * SIZE(A1), %xmm5 + movaps -25 * SIZE(A1), %xmm6 + movaps -21 * SIZE(A1), %xmm7 + + decq I + jle .L314 + ALIGN_3 + +.L313: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + mulps %xmm12, %xmm14 + movaps -24 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm1 + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm15 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + movaps -13 * SIZE(A1), %xmm5 + mulps %xmm13, %xmm15 + movaps -20 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm1 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) +#endif + + movss %xmm7, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm14 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm0 + movaps -9 * SIZE(A1), %xmm6 + mulps %xmm12, %xmm14 + movaps -16 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm1 + + movss %xmm4, %xmm7 + shufps $0x39, %xmm7, %xmm7 + pshufd $0xb1, %xmm7, %xmm15 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm0 + movaps -5 * SIZE(A1), %xmm7 + mulps %xmm13, %xmm15 + movaps -12 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm1 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, X1 + + subq $1, I + BRANCH + jg .L313 + ALIGN_3 + +.L314: + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + mulps %xmm12, %xmm14 + movaps -24 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm1 + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm15 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm13, %xmm15 + movaps -20 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm1 + + movss %xmm7, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm14 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm12, %xmm14 + movaps -16 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm1 + + movss %xmm4, %xmm7 + shufps $0x39, %xmm7, %xmm7 + pshufd $0xb1, %xmm7, %xmm15 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm13, %xmm15 + movaps -12 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm1 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, X1 + ALIGN_3 + +.L315: + testq $4, MM + je .L317 + + movaps -29 * SIZE(A1), %xmm5 + movaps -25 * SIZE(A1), %xmm6 + + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm15 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm13, %xmm15 + SUBPS %xmm15, %xmm1 + + movaps -24 * SIZE(X1), %xmm12 + movaps -20 * SIZE(X1), %xmm13 + movaps %xmm6, %xmm4 + + addq $8 * SIZE, A1 + ALIGN_3 + +.L317: + testq $2, MM + je .L318 + + movaps -29 * SIZE(A1), %xmm5 + + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movaps %xmm13, %xmm12 + + addq $4 * SIZE, A1 + ALIGN_3 + +.L318: + testq $1, MM + je .L319 + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + ALIGN_3 + +.L319: + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorps %xmm5, %xmm0 +#else + xorps %xmm5, %xmm1 +#endif + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 + haddps %xmm0, %xmm0 +#else + movaps %xmm0, %xmm8 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm8 + + addps %xmm8, %xmm0 + + movhlps %xmm0, %xmm1 + + addps %xmm1, %xmm0 +#endif + + pshufd $0xb1, %xmm0, %xmm1 + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm15 +#else + movsd ALPHA, %xmm15 + pshufd $0x44, %xmm15, %xmm15 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + + xorps %xmm5, %xmm0 + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 +#else + movaps %xmm0, %xmm2 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm1, %xmm2 + + addps %xmm2, %xmm0 +#endif + + movsd (Y), %xmm12 + addq INCY, Y + + shufps $0xd8, %xmm0, %xmm0 + + addps %xmm12, %xmm0 + + movlps %xmm0, (Y1) + addq INCY, Y1 +#endif + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/copy.S b/kernel/x86_64/copy.S new file mode 100644 index 0000000..bb66d10 --- /dev/null +++ b/kernel/x86_64/copy.S @@ -0,0 +1,366 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#define FLAG ARG6 +#else +#define INCY %r10 +#define FLAG %r11 +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY +#endif + + EMMS + + testq N, N # if m == 0 goto End + jle .L999 + + salq $BASE_SHIFT, INCX + salq $BASE_SHIFT, INCY + + cmpq $SIZE, INCX # if incx != 1 + jne .L100 + cmpq $SIZE, INCY # if incy != 1 + jne .L100 + + movq N, %rax # i = m + sarq $3, %rax + jle .L20 + ALIGN_2 + +.L11: +#ifdef XDOUBLE + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movq 0(X), %mm0 + movq 8(X), %mm1 + + movq %mm0, 0(Y) + movq %mm1, 8(Y) + + movq 16(X), %mm2 + movq 24(X), %mm3 + + movq %mm2, 16(Y) + movq %mm3, 24(Y) + + movq 32(X), %mm4 + movq 40(X), %mm5 + + movq %mm4, 32(Y) + movq %mm5, 40(Y) + + movq 48(X), %mm6 + movq 56(X), %mm7 + + movq %mm6, 48(Y) + movq %mm7, 56(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movq 64(X), %mm0 + movq 72(X), %mm1 + + movq %mm0, 64(Y) + movq %mm1, 72(Y) + + movq 80(X), %mm2 + movq 88(X), %mm3 + + movq %mm2, 80(Y) + movq %mm3, 88(Y) + + movq 96(X), %mm4 + movq 104(X), %mm5 + + movq %mm4, 96(Y) + movq %mm5, 104(Y) + + movq 112(X), %mm6 + movq 120(X), %mm7 + + movq %mm6, 112(Y) + movq %mm7, 120(Y) +#elif defined(DOUBLE) + + movq 0(X), %mm0 + movq 8(X), %mm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movq %mm0, 0(Y) + movq %mm1, 8(Y) + + movq 16(X), %mm2 + movq 24(X), %mm3 + + movq %mm2, 16(Y) + movq %mm3, 24(Y) + + movq 32(X), %mm4 + movq 40(X), %mm5 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movq %mm4, 32(Y) + movq %mm5, 40(Y) + + movq 48(X), %mm6 + movq 56(X), %mm7 + + movq %mm6, 48(Y) + movq %mm7, 56(Y) +#else + movq 0 * SIZE(X), %mm0 + movq 2 * SIZE(X), %mm2 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movq %mm0, 0 * SIZE(Y) + movq %mm2, 2 * SIZE(Y) + + movq 4 * SIZE(X), %mm4 + movq 6 * SIZE(X), %mm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movq %mm4, 4 * SIZE(Y) + movq %mm6, 6 * SIZE(Y) +#endif + + addq $8 * SIZE, X + addq $8 * SIZE, Y + decq %rax + jg .L11 + ALIGN_2 + +.L20: + movq N, %rax + andq $7, %rax + jle .L99 + ALIGN_2 + +.L21: +#ifdef XDOUBLE + movq 0(X), %mm0 + movq 8(X), %mm1 + movq %mm0, 0(Y) + movq %mm1, 8(Y) +#else + MOVQ (X), %mm0 + MOVQ %mm0, (Y) +#endif + + addq $SIZE, X + addq $SIZE, Y + decq %rax + jg .L21 + +.L99: + xorq %rax,%rax + EMMS + ret + ALIGN_3 + +.L100: + movq N, %rax + sarq $3, %rax + jle .L120 + ALIGN_2 + +.L111: +#ifdef XDOUBLE + movq 0(X), %mm0 + movq 8(X), %mm1 + addq INCX, X + + movq 0(X), %mm2 + movq 8(X), %mm3 + addq INCX, X + + movq 0(X), %mm4 + movq 8(X), %mm5 + addq INCX, X + + movq 0(X), %mm6 + movq 8(X), %mm7 + addq INCX, X + + movq %mm0, 0(Y) + movq %mm1, 8(Y) + addq INCY, Y + + movq %mm2, 0(Y) + movq %mm3, 8(Y) + addq INCY, Y + + movq %mm4, 0(Y) + movq %mm5, 8(Y) + addq INCY, Y + + movq %mm6, 0(Y) + movq %mm7, 8(Y) + addq INCY, Y + + movq 0(X), %mm0 + movq 8(X), %mm1 + addq INCX, X + + movq 0(X), %mm2 + movq 8(X), %mm3 + addq INCX, X + + movq 0(X), %mm4 + movq 8(X), %mm5 + addq INCX, X + + movq 0(X), %mm6 + movq 8(X), %mm7 + addq INCX, X + + movq %mm0, 0(Y) + movq %mm1, 8(Y) + addq INCY, Y + + movq %mm2, 0(Y) + movq %mm3, 8(Y) + addq INCY, Y + + movq %mm4, 0(Y) + movq %mm5, 8(Y) + addq INCY, Y + + movq %mm6, 0(Y) + movq %mm7, 8(Y) + addq INCY, Y +#else + MOVQ (X), %mm0 + addq INCX, X + MOVQ (X), %mm1 + addq INCX, X + MOVQ (X), %mm2 + addq INCX, X + MOVQ (X), %mm3 + addq INCX, X + MOVQ (X), %mm4 + addq INCX, X + MOVQ (X), %mm5 + addq INCX, X + MOVQ (X), %mm6 + addq INCX, X + MOVQ (X), %mm7 + addq INCX, X + + MOVQ %mm0, (Y) + addq INCY, Y + MOVQ %mm1, (Y) + addq INCY, Y + MOVQ %mm2, (Y) + addq INCY, Y + MOVQ %mm3, (Y) + addq INCY, Y + MOVQ %mm4, (Y) + addq INCY, Y + MOVQ %mm5, (Y) + addq INCY, Y + MOVQ %mm6, (Y) + addq INCY, Y + MOVQ %mm7, (Y) + addq INCY, Y +#endif + + decq %rax + jg .L111 + +.L120: + movq N, %rax + andq $7, %rax + jle .L999 + ALIGN_2 + +.L121: +#ifdef XDOUBLE + movq 0(X), %mm0 + movq 8(X), %mm1 + movq %mm0, 0(Y) + movq %mm1, 8(Y) +#else + MOVQ (X), %mm0 + MOVQ %mm0, (Y) +#endif + addq INCX, X + addq INCY, Y + + decq %rax + jg .L121 + +.L999: + xorq %rax,%rax + EMMS + ret + + EPILOGUE + diff --git a/kernel/x86_64/copy_sse.S b/kernel/x86_64/copy_sse.S new file mode 100644 index 0000000..e949172 --- /dev/null +++ b/kernel/x86_64/copy_sse.S @@ -0,0 +1,959 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#include "l1param.h" + +#ifdef OPTERON +#define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addps OFFSET(ADDR), REG +#else +#define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG +#endif + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY +#endif + + SAVEREGISTERS + + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY + + cmpq $SIZE, INCX + jne .L50 + cmpq $SIZE, INCY + jne .L50 + + cmpq $3, M + jle .L55 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + testq $SIZE, Y + je .L05 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq M + ALIGN_4 + +.L05: + testq $2 * SIZE, Y + je .L10 + + movsd -32 * SIZE(X), %xmm0 + movlps %xmm0, -32 * SIZE(Y) + addq $2 * SIZE, X + addq $2 * SIZE, Y + subq $2, M + jle .L19 + ALIGN_4 + +.L10: + testq $3 * SIZE, X + jne .L20 + + movq M, %rax + sarq $5, %rax + jle .L13 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + movaps -16 * SIZE(X), %xmm4 + movaps -12 * SIZE(X), %xmm5 + movaps -8 * SIZE(X), %xmm6 + movaps -4 * SIZE(X), %xmm7 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps %xmm0, -32 * SIZE(Y) + LOAD( 0 * SIZE, X, %xmm0) + movaps %xmm1, -28 * SIZE(Y) + LOAD( 4 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm2, -24 * SIZE(Y) + LOAD( 8 * SIZE, X, %xmm2) + movaps %xmm3, -20 * SIZE(Y) + LOAD(12 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps %xmm4,-16 * SIZE(Y) + LOAD(16 * SIZE, X, %xmm4) + movaps %xmm5,-12 * SIZE(Y) + LOAD(20 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm6, -8 * SIZE(Y) + LOAD(24 * SIZE, X, %xmm6) + movaps %xmm7, -4 * SIZE(Y) + LOAD(28 * SIZE, X, %xmm7) + + subq $-32 * SIZE, Y + subq $-32 * SIZE, X + decq %rax + jg .L11 + ALIGN_3 + +.L12: + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + movaps %xmm4, -16 * SIZE(Y) + movaps %xmm5, -12 * SIZE(Y) + movaps %xmm6, -8 * SIZE(Y) + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, Y + subq $-32 * SIZE, X + ALIGN_3 + +.L13: + testq $16, M + jle .L14 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L14: + testq $8, M + jle .L15 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L15: + testq $4, M + jle .L16 + + movaps -32 * SIZE(X), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L16: + testq $2, M + jle .L17 + + movsd -32 * SIZE(X), %xmm0 + movlps %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L17: + testq $1, M + jle .L19 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + ALIGN_3 + +.L19: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + + +.L20: + testq $SIZE, X + jne .L30 + + movhps -32 * SIZE(X), %xmm0 + + movq M, %rax + sarq $5, %rax + jle .L23 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + movaps -22 * SIZE(X), %xmm3 + movaps -18 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -10 * SIZE(X), %xmm6 + movaps -6 * SIZE(X), %xmm7 + + decq %rax + jle .L22 + ALIGN_4 + +.L21: + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -2 * SIZE(X), %xmm0 + + shufps $0x4e, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps 2 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + shufps $0x4e, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps 6 * SIZE(X), %xmm2 + + shufps $0x4e, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps 10 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + shufps $0x4e, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + movaps 14 * SIZE(X), %xmm4 + + shufps $0x4e, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + movaps 18 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + shufps $0x4e, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + movaps 22 * SIZE(X), %xmm6 + + shufps $0x4e, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + movaps 26 * SIZE(X), %xmm7 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L21 + ALIGN_3 + +.L22: + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -2 * SIZE(X), %xmm0 + + shufps $0x4e, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + shufps $0x4e, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + shufps $0x4e, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + shufps $0x4e, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + shufps $0x4e, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + shufps $0x4e, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + shufps $0x4e, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L23: + testq $16, M + jle .L24 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + movaps -22 * SIZE(X), %xmm3 + movaps -18 * SIZE(X), %xmm4 + + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + shufps $0x4e, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + shufps $0x4e, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + shufps $0x4e, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L24: + testq $8, M + jle .L25 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + + shufps $0x4e, %xmm1, %xmm0 + shufps $0x4e, %xmm2, %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L25: + testq $4, M + jle .L26 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L26: + testq $2, M + jle .L27 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + + movsd %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L27: + testq $1, M + jle .L29 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addq $SIZE, Y + ALIGN_3 + +.L29: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +.L30: + testq $2 * SIZE, X + jne .L40 + + movaps -33 * SIZE(X), %xmm0 + + movq M, %rax + sarq $5, %rax + jle .L33 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + movaps -21 * SIZE(X), %xmm3 + movaps -17 * SIZE(X), %xmm4 + movaps -13 * SIZE(X), %xmm5 + movaps -9 * SIZE(X), %xmm6 + movaps -5 * SIZE(X), %xmm7 + + decq %rax + jle .L32 + ALIGN_4 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -1 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps 3 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps 7 * SIZE(X), %xmm2 + + movss %xmm4, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps 11 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + movaps 15 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + movaps 19 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + shufps $0x39, %xmm6, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + movaps 23 * SIZE(X), %xmm6 + + movss %xmm0, %xmm7 + shufps $0x39, %xmm7, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + movaps 27 * SIZE(X), %xmm7 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L31 + ALIGN_3 + +.L32: + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -1 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movss %xmm7, %xmm6 + shufps $0x39, %xmm6, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + shufps $0x39, %xmm7, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L33: + testq $16, M + jle .L34 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + movaps -21 * SIZE(X), %xmm3 + movaps -17 * SIZE(X), %xmm4 + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L34: + testq $8, M + jle .L35 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L35: + testq $4, M + jle .L36 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L36: + testq $2, M + jle .L37 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L37: + testq $1, M + jle .L39 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addq $SIZE, Y + ALIGN_3 + +.L39: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +.L40: + movaps -35 * SIZE(X), %xmm0 + + movq M, %rax + sarq $5, %rax + jle .L43 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + movaps -23 * SIZE(X), %xmm3 + movaps -19 * SIZE(X), %xmm4 + movaps -15 * SIZE(X), %xmm5 + movaps -11 * SIZE(X), %xmm6 + movaps -7 * SIZE(X), %xmm7 + + decq %rax + jle .L42 + ALIGN_4 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -3 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps 1 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps 5 * SIZE(X), %xmm2 + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps 9 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + movaps 13 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + movaps 17 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + movaps 21 * SIZE(X), %xmm6 + + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + movaps 25 * SIZE(X), %xmm7 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L41 + ALIGN_3 + +.L42: + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -3 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L43: + testq $16, M + jle .L44 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + movaps -23 * SIZE(X), %xmm3 + movaps -19 * SIZE(X), %xmm4 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L44: + testq $8, M + jle .L45 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L45: + testq $4, M + jle .L46 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L46: + testq $2, M + jle .L47 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L47: + testq $1, M + jle .L49 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addq $SIZE, Y + ALIGN_3 + +.L49: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_4 + +.L50: + movq M, %rax + sarq $3, %rax + jle .L55 + ALIGN_3 + +.L51: + movss (X), %xmm0 + addq INCX, X + movss (X), %xmm1 + addq INCX, X + movss (X), %xmm2 + addq INCX, X + movss (X), %xmm3 + addq INCX, X + movss (X), %xmm4 + addq INCX, X + movss (X), %xmm5 + addq INCX, X + movss (X), %xmm6 + addq INCX, X + movss (X), %xmm7 + addq INCX, X + + movss %xmm0, (Y) + addq INCY, Y + movss %xmm1, (Y) + addq INCY, Y + movss %xmm2, (Y) + addq INCY, Y + movss %xmm3, (Y) + addq INCY, Y + movss %xmm4, (Y) + addq INCY, Y + movss %xmm5, (Y) + addq INCY, Y + movss %xmm6, (Y) + addq INCY, Y + movss %xmm7, (Y) + addq INCY, Y + + decq %rax + jg .L51 + ALIGN_3 + +.L55: + movq M, %rax + andq $7, %rax + jle .L57 + ALIGN_3 + +.L56: + movss (X), %xmm0 + addq INCX, X + movss %xmm0, (Y) + addq INCY, Y + decq %rax + jg .L56 + ALIGN_3 + +.L57: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/copy_sse2.S b/kernel/x86_64/copy_sse2.S new file mode 100644 index 0000000..200daaf --- /dev/null +++ b/kernel/x86_64/copy_sse2.S @@ -0,0 +1,650 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#include "l1param.h" + +#ifdef OPTERON +#define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addpd OFFSET(ADDR), REG +#else +#define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG +#endif + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY +#endif + + SAVEREGISTERS + + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY + + cmpq $SIZE, INCX + jne .L40 + cmpq $SIZE, INCY + jne .L40 + +#ifdef ALIGNED_ACCESS + testq $SIZE, Y +#else + testq $SIZE, X +#endif + je .L10 + + movsd (X), %xmm0 + movsd %xmm0, (Y) + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq M + jle .L19 + ALIGN_4 + +.L10: + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + +#ifdef ALIGNED_ACCESS + testq $SIZE, X +#else + testq $SIZE, Y +#endif + jne .L20 + + movq M, %rax + sarq $4, %rax + jle .L13 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + movaps -8 * SIZE(X), %xmm4 + movaps -6 * SIZE(X), %xmm5 + movaps -4 * SIZE(X), %xmm6 + movaps -2 * SIZE(X), %xmm7 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps %xmm0, -16 * SIZE(Y) + LOAD( 0 * SIZE, X, %xmm0) + movaps %xmm1, -14 * SIZE(Y) + LOAD( 2 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm2, -12 * SIZE(Y) + LOAD( 4 * SIZE, X, %xmm2) + movaps %xmm3, -10 * SIZE(Y) + LOAD( 6 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps %xmm4, -8 * SIZE(Y) + LOAD( 8 * SIZE, X, %xmm4) + movaps %xmm5, -6 * SIZE(Y) + LOAD(10 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm6, -4 * SIZE(Y) + LOAD(12 * SIZE, X, %xmm6) + movaps %xmm7, -2 * SIZE(Y) + LOAD(14 * SIZE, X, %xmm7) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + decq %rax + jg .L11 + ALIGN_3 + +.L12: + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, -12 * SIZE(Y) + movaps %xmm3, -10 * SIZE(Y) + movaps %xmm4, -8 * SIZE(Y) + movaps %xmm5, -6 * SIZE(Y) + movaps %xmm6, -4 * SIZE(Y) + movaps %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + ALIGN_3 + +.L13: + testq $8, M + jle .L14 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, -12 * SIZE(Y) + movaps %xmm3, -10 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L14: + testq $4, M + jle .L15 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L15: + testq $2, M + jle .L16 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L16: + testq $1, M + jle .L19 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L19: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +.L20: +#ifdef ALIGNED_ACCESS + + movhps -16 * SIZE(X), %xmm0 + + movq M, %rax + sarq $4, %rax + jle .L23 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + movaps -11 * SIZE(X), %xmm3 + movaps -9 * SIZE(X), %xmm4 + movaps -7 * SIZE(X), %xmm5 + movaps -5 * SIZE(X), %xmm6 + movaps -3 * SIZE(X), %xmm7 + + decq %rax + jle .L22 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm1, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + LOAD(-1 * SIZE, X, %xmm0) + + SHUFPD_1 %xmm2, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + LOAD( 1 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm3, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + LOAD( 3 * SIZE, X, %xmm2) + + SHUFPD_1 %xmm4, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + LOAD( 5 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm5, %xmm4 + movaps %xmm4, -8 * SIZE(Y) + LOAD( 7 * SIZE, X, %xmm4) + + SHUFPD_1 %xmm6, %xmm5 + movaps %xmm5, -6 * SIZE(Y) + LOAD( 9 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm7, %xmm6 + movaps %xmm6, -4 * SIZE(Y) + LOAD(11 * SIZE, X, %xmm6) + + SHUFPD_1 %xmm0, %xmm7 + movaps %xmm7, -2 * SIZE(Y) + LOAD(13 * SIZE, X, %xmm7) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + decq %rax + jg .L21 + ALIGN_3 + +.L22: + SHUFPD_1 %xmm1, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + LOAD(-1 * SIZE, X, %xmm0) + + SHUFPD_1 %xmm2, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + SHUFPD_1 %xmm3, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + SHUFPD_1 %xmm4, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + SHUFPD_1 %xmm5, %xmm4 + movaps %xmm4, -8 * SIZE(Y) + SHUFPD_1 %xmm6, %xmm5 + movaps %xmm5, -6 * SIZE(Y) + + SHUFPD_1 %xmm7, %xmm6 + movaps %xmm6, -4 * SIZE(Y) + SHUFPD_1 %xmm0, %xmm7 + movaps %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + +.L23: + testq $8, M + jle .L24 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + movaps -11 * SIZE(X), %xmm3 + movaps -9 * SIZE(X), %xmm8 + + SHUFPD_1 %xmm1, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + SHUFPD_1 %xmm2, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + SHUFPD_1 %xmm3, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + SHUFPD_1 %xmm8, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + movaps %xmm8, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L24: + testq $4, M + jle .L25 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm1, %xmm0 + SHUFPD_1 %xmm2, %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, %xmm0 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L25: + testq $2, M + jle .L26 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L26: + testq $1, M + jle .L29 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L29: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +#else + + movq M, %rax + sarq $4, %rax + jle .L23 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + movaps -8 * SIZE(X), %xmm4 + movaps -6 * SIZE(X), %xmm5 + movaps -4 * SIZE(X), %xmm6 + movaps -2 * SIZE(X), %xmm7 + + decq %rax + jle .L22 + ALIGN_3 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + LOAD( 0 * SIZE, X, %xmm0) + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + LOAD( 2 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movlps %xmm2, -12 * SIZE(Y) + movhps %xmm2, -11 * SIZE(Y) + LOAD( 4 * SIZE, X, %xmm2) + movlps %xmm3, -10 * SIZE(Y) + movhps %xmm3, -9 * SIZE(Y) + LOAD( 6 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movlps %xmm4, -8 * SIZE(Y) + movhps %xmm4, -7 * SIZE(Y) + LOAD( 8 * SIZE, X, %xmm4) + movlps %xmm5, -6 * SIZE(Y) + movhps %xmm5, -5 * SIZE(Y) + LOAD(10 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movlps %xmm6, -4 * SIZE(Y) + movhps %xmm6, -3 * SIZE(Y) + LOAD(12 * SIZE, X, %xmm6) + movlps %xmm7, -2 * SIZE(Y) + movhps %xmm7, -1 * SIZE(Y) + LOAD(14 * SIZE, X, %xmm7) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + decq %rax + jg .L21 + ALIGN_3 + +.L22: + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + movlps %xmm2, -12 * SIZE(Y) + movhps %xmm2, -11 * SIZE(Y) + movlps %xmm3, -10 * SIZE(Y) + movhps %xmm3, -9 * SIZE(Y) + movlps %xmm4, -8 * SIZE(Y) + movhps %xmm4, -7 * SIZE(Y) + movlps %xmm5, -6 * SIZE(Y) + movhps %xmm5, -5 * SIZE(Y) + movlps %xmm6, -4 * SIZE(Y) + movhps %xmm6, -3 * SIZE(Y) + movlps %xmm7, -2 * SIZE(Y) + movhps %xmm7, -1 * SIZE(Y) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + ALIGN_3 + +.L23: + testq $8, M + jle .L24 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + movaps -14 * SIZE(X), %xmm1 + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + movaps -12 * SIZE(X), %xmm2 + movlps %xmm2, -12 * SIZE(Y) + movhps %xmm2, -11 * SIZE(Y) + movaps -10 * SIZE(X), %xmm3 + movlps %xmm3, -10 * SIZE(Y) + movhps %xmm3, -9 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L24: + testq $4, M + jle .L25 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + movaps -14 * SIZE(X), %xmm1 + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L25: + testq $2, M + jle .L26 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L26: + testq $1, M + jle .L29 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L29: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +#endif + +.L40: + movq M, %rax + sarq $3, %rax + jle .L45 + ALIGN_3 + +.L41: + movsd (X), %xmm0 + addq INCX, X + movhps (X), %xmm0 + addq INCX, X + movsd (X), %xmm1 + addq INCX, X + movhps (X), %xmm1 + addq INCX, X + movsd (X), %xmm2 + addq INCX, X + movhps (X), %xmm2 + addq INCX, X + movsd (X), %xmm3 + addq INCX, X + movhps (X), %xmm3 + addq INCX, X + + movlps %xmm0, (Y) + addq INCY, Y + movhps %xmm0, (Y) + addq INCY, Y + movlps %xmm1, (Y) + addq INCY, Y + movhps %xmm1, (Y) + addq INCY, Y + movlps %xmm2, (Y) + addq INCY, Y + movhps %xmm2, (Y) + addq INCY, Y + movlps %xmm3, (Y) + addq INCY, Y + movhps %xmm3, (Y) + addq INCY, Y + + decq %rax + jg .L41 + ALIGN_3 + +.L45: + movq M, %rax + andq $7, %rax + jle .L47 + ALIGN_3 + +.L46: + movsd (X), %xmm0 + addq INCX, X + movlps %xmm0, (Y) + addq INCY, Y + decq %rax + jg .L46 + ALIGN_3 + +.L47: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/dgemm_ncopy_2.S b/kernel/x86_64/dgemm_ncopy_2.S new file mode 100644 index 0000000..2724cfe --- /dev/null +++ b/kernel/x86_64/dgemm_ncopy_2.S @@ -0,0 +1,597 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef NEHALEM +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifndef MOVAPS +#define MOVAPS movaps +#endif + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define I %r9 + +#else + +#define STACKSIZE 256 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 32 + STACKSIZE(%rsp) + +#define B %r14 +#define I %r15 + +#endif + +#define J %r10 +#define AO1 %r11 +#define AO2 %r12 +#define MM %r13 + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %r15 + pushq %r14 +#endif + pushq %r13 + pushq %r12 + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + + movq OLD_B, B +#endif + + leaq (,LDA, SIZE), LDA + subq $-16 * SIZE, B + + movq M, MM + leaq -1(M), %rax + testq $SIZE, A + cmovne %rax, MM + + testq $SIZE, LDA + jne .L50 + + movq N, J + sarq $1, J + jle .L30 + ALIGN_4 + +.L21: + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + testq $SIZE, A + je .L22 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-2 * SIZE, B + ALIGN_3 + +.L22: + movq MM, I + sarq $3, I + jle .L24 + ALIGN_4 + +.L23: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO1) +#endif + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 0 * SIZE(AO2), %xmm1 + MOVAPS 2 * SIZE(AO1), %xmm2 + MOVAPS 2 * SIZE(AO2), %xmm3 + + movaps %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm4, -14 * SIZE(B) + movaps %xmm2, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO2) +#endif + + MOVAPS 4 * SIZE(AO1), %xmm0 + MOVAPS 4 * SIZE(AO2), %xmm1 + MOVAPS 6 * SIZE(AO1), %xmm2 + MOVAPS 6 * SIZE(AO2), %xmm3 + + movaps %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm4 + + movaps %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B) +#endif + + movaps %xmm0, -8 * SIZE(B) + movaps %xmm4, -6 * SIZE(B) + movaps %xmm2, -4 * SIZE(B) + movaps %xmm6, -2 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-16 * SIZE, B + + decq I + jg .L23 + ALIGN_4 + +.L24: + testq $4, MM + jle .L26 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 0 * SIZE(AO2), %xmm1 + MOVAPS 2 * SIZE(AO1), %xmm2 + MOVAPS 2 * SIZE(AO2), %xmm3 + + movaps %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm4 + + movaps %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm6 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm4, -14 * SIZE(B) + movaps %xmm2, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_4 + +.L26: + testq $2, MM + jle .L28 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 0 * SIZE(AO2), %xmm1 + + movaps %xmm0, %xmm2 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm2 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-4 * SIZE, B + ALIGN_4 + +.L28: + testq $1, MM + jle .L29 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(B) + subq $-2 * SIZE, B + ALIGN_4 + +.L29: + decq J + jg .L21 + ALIGN_4 + +.L30: + testq $1, N + jle .L999 + +.L30x: + movq A, AO1 + + testq $SIZE, A + jne .L35 + + movq M, I + sarq $3, I + jle .L32 + ALIGN_4 + +.L31: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) +#endif + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 2 * SIZE(AO1), %xmm1 + MOVAPS 4 * SIZE(AO1), %xmm2 + MOVAPS 6 * SIZE(AO1), %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm1, -14 * SIZE(B) + movaps %xmm2, -12 * SIZE(B) + movaps %xmm3, -10 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, B + + decq I + jg .L31 + ALIGN_4 + +.L32: + testq $4, M + jle .L33 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 2 * SIZE(AO1), %xmm1 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm1, -14 * SIZE(B) + + addq $4 * SIZE, AO1 + subq $-4 * SIZE, B + ALIGN_4 + +.L33: + testq $2, M + jle .L34 + + MOVAPS 0 * SIZE(AO1), %xmm0 + + movaps %xmm0, -16 * SIZE(B) + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B + ALIGN_4 + +.L34: + testq $1, M + jle .L999 + + movsd 0 * SIZE(AO1), %xmm0 + movlpd %xmm0, -16 * SIZE(B) + jmp .L999 + ALIGN_4 + +.L35: + movaps -1 * SIZE(AO1), %xmm0 + + movq M, I + sarq $3, I + jle .L36 + ALIGN_4 + +.L36: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) +#endif + + MOVAPS 1 * SIZE(AO1), %xmm1 + MOVAPS 3 * SIZE(AO1), %xmm2 + MOVAPS 5 * SIZE(AO1), %xmm3 + MOVAPS 7 * SIZE(AO1), %xmm4 + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm2, %xmm1 + shufpd $1, %xmm3, %xmm2 + shufpd $1, %xmm4, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm1, -14 * SIZE(B) + movaps %xmm2, -12 * SIZE(B) + movaps %xmm3, -10 * SIZE(B) + + movaps %xmm4, %xmm0 + + addq $8 * SIZE, AO1 + subq $-8 * SIZE, B + + decq I + jg .L36 + ALIGN_4 + +.L37: + testq $4, M + jle .L38 + + MOVAPS 1 * SIZE(AO1), %xmm1 + MOVAPS 3 * SIZE(AO1), %xmm2 + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm2, %xmm1 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm1, -14 * SIZE(B) + + movaps %xmm2, %xmm0 + + addq $4 * SIZE, AO1 + addq $4 * SIZE, B + ALIGN_4 + +.L38: + testq $2, M + jle .L39 + + MOVAPS 1 * SIZE(AO1), %xmm1 + + shufpd $1, %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(B) + + movaps %xmm1, %xmm0 + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B + ALIGN_4 + +.L39: + testq $1, M + jle .L999 + + movhpd %xmm0, -16 * SIZE(B) + jmp .L999 + ALIGN_4 + +.L50: + movq N, J + sarq $1, J + jle .L30 + ALIGN_4 + +.L61: + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + testq $SIZE, A + je .L62 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-2 * SIZE, B + ALIGN_3 + +.L62: + MOVAPS -1 * SIZE(AO2), %xmm5 + + movq MM, I + sarq $3, I + jle .L64 + ALIGN_4 + +.L63: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO1) +#endif + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 1 * SIZE(AO2), %xmm1 + MOVAPS 2 * SIZE(AO1), %xmm2 + MOVAPS 3 * SIZE(AO2), %xmm3 + + movsd %xmm0, %xmm5 + shufpd $1, %xmm1, %xmm0 + movsd %xmm2, %xmm1 + shufpd $1, %xmm3, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movaps %xmm5, -16 * SIZE(B) + movaps %xmm0, -14 * SIZE(B) + movaps %xmm1, -12 * SIZE(B) + movaps %xmm2, -10 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO2) +#endif + + MOVAPS 4 * SIZE(AO1), %xmm0 + MOVAPS 5 * SIZE(AO2), %xmm1 + MOVAPS 6 * SIZE(AO1), %xmm2 + MOVAPS 7 * SIZE(AO2), %xmm5 + + movsd %xmm0, %xmm3 + shufpd $1, %xmm1, %xmm0 + movsd %xmm2, %xmm1 + shufpd $1, %xmm5, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movaps %xmm3, -8 * SIZE(B) + movaps %xmm0, -6 * SIZE(B) + movaps %xmm1, -4 * SIZE(B) + movaps %xmm2, -2 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-16 * SIZE, B + + decq I + jg .L63 + ALIGN_4 + +.L64: + testq $4, MM + jle .L66 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 1 * SIZE(AO2), %xmm1 + MOVAPS 2 * SIZE(AO1), %xmm2 + MOVAPS 3 * SIZE(AO2), %xmm3 + + movsd %xmm0, %xmm5 + shufpd $1, %xmm1, %xmm0 + movsd %xmm2, %xmm1 + shufpd $1, %xmm3, %xmm2 + + movaps %xmm5, -16 * SIZE(B) + movaps %xmm0, -14 * SIZE(B) + movaps %xmm1, -12 * SIZE(B) + movaps %xmm2, -10 * SIZE(B) + + movaps %xmm3, %xmm5 + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_4 + +.L66: + testq $2, MM + jle .L68 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 1 * SIZE(AO2), %xmm1 + + movsd %xmm0, %xmm5 + shufpd $1, %xmm1, %xmm0 + + movaps %xmm5, -16 * SIZE(B) + movaps %xmm0, -14 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-4 * SIZE, B + ALIGN_4 + +.L68: + testq $1, MM + jle .L69 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(B) + subq $-2 * SIZE, B + ALIGN_4 + +.L69: + decq J + jg .L61 + + testq $1, N + jne .L30 + ALIGN_4 + +.L999: +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + + addq $STACKSIZE, %rsp +#endif + + popq %r12 + popq %r13 + +#ifdef WINDOWS_ABI + popq %r14 + popq %r15 +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/dgemm_ncopy_4.S b/kernel/x86_64/dgemm_ncopy_4.S new file mode 100644 index 0000000..52115bd --- /dev/null +++ b/kernel/x86_64/dgemm_ncopy_4.S @@ -0,0 +1,1237 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(PENTIUM4) || defined(GENERIC) +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifdef ATOM +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifdef NANO +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifdef OPTERON +#define PREFETCHSIZE 16 +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#endif + +#ifdef GENERIC +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define I %r9 + +#else + +#define STACKSIZE 256 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 32 + STACKSIZE(%rsp) + +#define B %r14 +#define I %r15 + +#endif + +#define J %r10 +#define AO1 %r11 +#define AO2 %r12 +#define MM %r13 + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %r15 + pushq %r14 +#endif + pushq %r13 + pushq %r12 + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + movups %xmm7, 16(%rsp) + + movq OLD_B, B +#endif + + leaq (,LDA, SIZE), LDA + subq $-16 * SIZE, B + + movq M, MM + leaq -1(M), %rax + testq $SIZE, A + cmovne %rax, MM + + testq $SIZE, LDA + jne .L50 + + movq N, J + sarq $2, J + jle .L20 + ALIGN_4 + +.L11: + movq A, AO1 + leaq (A, LDA, 2), AO2 + leaq (A, LDA, 4), A + + testq $SIZE, A + je .L12 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO2), %xmm2 + movsd 0 * SIZE(AO2, LDA), %xmm3 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm2, -14 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-4 * SIZE, B + ALIGN_3 + +.L12: + movq MM, I + sarq $3, I + jle .L14 + ALIGN_4 + +.L13: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1) +#endif + + movapd 0 * SIZE(AO1), %xmm0 + movapd 0 * SIZE(AO1, LDA), %xmm1 + movapd 0 * SIZE(AO2), %xmm2 + movapd 0 * SIZE(AO2, LDA), %xmm3 + + movapd %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movapd %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm2, -14 * SIZE(B) + movapd %xmm4, -12 * SIZE(B) + movapd %xmm6, -10 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA) +#endif + + movapd 2 * SIZE(AO1), %xmm0 + movapd 2 * SIZE(AO1, LDA), %xmm1 + movapd 2 * SIZE(AO2), %xmm2 + movapd 2 * SIZE(AO2, LDA), %xmm3 + + movapd %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movapd %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B) +#endif + + movapd %xmm0, -8 * SIZE(B) + movapd %xmm2, -6 * SIZE(B) + movapd %xmm4, -4 * SIZE(B) + movapd %xmm6, -2 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2) +#endif + + movapd 4 * SIZE(AO1), %xmm0 + movapd 4 * SIZE(AO1, LDA), %xmm1 + movapd 4 * SIZE(AO2), %xmm2 + movapd 4 * SIZE(AO2, LDA), %xmm3 + + movapd %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movapd %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 16) * SIZE(B) +#endif + + movapd %xmm0, 0 * SIZE(B) + movapd %xmm2, 2 * SIZE(B) + movapd %xmm4, 4 * SIZE(B) + movapd %xmm6, 6 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2, LDA) +#endif + + movapd 6 * SIZE(AO1), %xmm0 + movapd 6 * SIZE(AO1, LDA), %xmm1 + movapd 6 * SIZE(AO2), %xmm2 + movapd 6 * SIZE(AO2, LDA), %xmm3 + + movapd %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movapd %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 24) * SIZE(B) +#endif + + movapd %xmm0, 8 * SIZE(B) + movapd %xmm2, 10 * SIZE(B) + movapd %xmm4, 12 * SIZE(B) + movapd %xmm6, 14 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-32 * SIZE, B + + decq I + jg .L13 + ALIGN_4 + +.L14: + testq $4, MM + jle .L16 + + movapd 0 * SIZE(AO1), %xmm0 + movapd 0 * SIZE(AO1, LDA), %xmm1 + movapd 0 * SIZE(AO2), %xmm2 + movapd 0 * SIZE(AO2, LDA), %xmm3 + + movapd %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movapd %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm2, -14 * SIZE(B) + movapd %xmm4, -12 * SIZE(B) + movapd %xmm6, -10 * SIZE(B) + + movapd 2 * SIZE(AO1), %xmm0 + movapd 2 * SIZE(AO1, LDA), %xmm1 + movapd 2 * SIZE(AO2), %xmm2 + movapd 2 * SIZE(AO2, LDA), %xmm3 + + movapd %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movapd %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + + movapd %xmm0, -8 * SIZE(B) + movapd %xmm2, -6 * SIZE(B) + movapd %xmm4, -4 * SIZE(B) + movapd %xmm6, -2 * SIZE(B) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-16 * SIZE, B + ALIGN_4 + +.L16: + testq $2, MM + jle .L18 + + movapd 0 * SIZE(AO1), %xmm0 + movapd 0 * SIZE(AO1, LDA), %xmm1 + movapd 0 * SIZE(AO2), %xmm2 + movapd 0 * SIZE(AO2, LDA), %xmm3 + + movapd %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movapd %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm2, -14 * SIZE(B) + movapd %xmm4, -12 * SIZE(B) + movapd %xmm6, -10 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_4 + +.L18: + testq $1, MM + jle .L19 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO2), %xmm2 + movsd 0 * SIZE(AO2, LDA), %xmm3 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm2, -14 * SIZE(B) + subq $-4 * SIZE, B + ALIGN_4 + +.L19: + decq J + jg .L11 + ALIGN_4 + +.L20: + testq $2, N + jle .L30 + + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + testq $SIZE, A + je .L22 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + movapd %xmm0, -16 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-2 * SIZE, B + ALIGN_3 + +.L22: + movq MM, I + sarq $3, I + jle .L24 + ALIGN_4 + +.L23: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO1) +#endif + + movapd 0 * SIZE(AO1), %xmm0 + movapd 0 * SIZE(AO2), %xmm1 + movapd 2 * SIZE(AO1), %xmm2 + movapd 2 * SIZE(AO2), %xmm3 + + movapd %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movapd %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm4, -14 * SIZE(B) + movapd %xmm2, -12 * SIZE(B) + movapd %xmm6, -10 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO2) +#endif + + movapd 4 * SIZE(AO1), %xmm0 + movapd 4 * SIZE(AO2), %xmm1 + movapd 6 * SIZE(AO1), %xmm2 + movapd 6 * SIZE(AO2), %xmm3 + + movapd %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movapd %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B) +#endif + + movapd %xmm0, -8 * SIZE(B) + movapd %xmm4, -6 * SIZE(B) + movapd %xmm2, -4 * SIZE(B) + movapd %xmm6, -2 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-16 * SIZE, B + + decq I + jg .L23 + ALIGN_4 + +.L24: + testq $4, MM + jle .L26 + + movapd 0 * SIZE(AO1), %xmm0 + movapd 0 * SIZE(AO2), %xmm1 + movapd 2 * SIZE(AO1), %xmm2 + movapd 2 * SIZE(AO2), %xmm3 + + movapd %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm4 + + movapd %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm6 + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm4, -14 * SIZE(B) + movapd %xmm2, -12 * SIZE(B) + movapd %xmm6, -10 * SIZE(B) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_4 + +.L26: + testq $2, MM + jle .L28 + + movapd 0 * SIZE(AO1), %xmm0 + movapd 0 * SIZE(AO2), %xmm1 + + movapd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm2 + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm2, -14 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-4 * SIZE, B + ALIGN_4 + +.L28: + testq $1, MM + jle .L30 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + movapd %xmm0, -16 * SIZE(B) + subq $-2 * SIZE, B + ALIGN_4 + +.L30: + testq $1, N + jle .L999 + + movq A, AO1 + + testq $SIZE, A + jne .L35 + + movq MM, I + sarq $3, I + jle .L32 + ALIGN_4 + +.L31: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) +#endif + + movapd 0 * SIZE(AO1), %xmm0 + movapd 2 * SIZE(AO1), %xmm1 + movapd 4 * SIZE(AO1), %xmm2 + movapd 6 * SIZE(AO1), %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm1, -14 * SIZE(B) + movapd %xmm2, -12 * SIZE(B) + movapd %xmm3, -10 * SIZE(B) + + addq $8 * SIZE, AO1 + subq $-8 * SIZE, B + + decq I + jg .L31 + ALIGN_4 + +.L32: + testq $4, MM + jle .L33 + + movapd 0 * SIZE(AO1), %xmm0 + movapd 2 * SIZE(AO1), %xmm1 + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm1, -14 * SIZE(B) + + addq $4 * SIZE, AO1 + subq $-4 * SIZE, B + ALIGN_4 + +.L33: + testq $2, MM + jle .L34 + + movapd 0 * SIZE(AO1), %xmm0 + + movapd %xmm0, -16 * SIZE(B) + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B + ALIGN_4 + +.L34: + testq $1, MM + jle .L999 + + movsd 0 * SIZE(AO1), %xmm0 + + movlpd %xmm0, -16 * SIZE(B) + jmp .L999 + ALIGN_4 + +.L35: + movapd -1 * SIZE(AO1), %xmm0 + + movq MM, I + sarq $3, I + jle .L36 + ALIGN_4 + +.L36: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) +#endif + + movapd 1 * SIZE(AO1), %xmm1 + movapd 3 * SIZE(AO1), %xmm2 + movapd 5 * SIZE(AO1), %xmm3 + movapd 7 * SIZE(AO1), %xmm4 + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm2, %xmm1 + shufpd $1, %xmm3, %xmm2 + shufpd $1, %xmm4, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm1, -14 * SIZE(B) + movapd %xmm2, -12 * SIZE(B) + movapd %xmm3, -10 * SIZE(B) + + movapd %xmm4, %xmm0 + + addq $8 * SIZE, AO1 + subq $-8 * SIZE, B + + decq I + jg .L36 + ALIGN_4 + +.L37: + testq $4, MM + jle .L38 + + movapd 1 * SIZE(AO1), %xmm1 + movapd 3 * SIZE(AO1), %xmm2 + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm2, %xmm1 + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm1, -14 * SIZE(B) + + movapd %xmm2, %xmm0 + + addq $4 * SIZE, AO1 + addq $4 * SIZE, B + ALIGN_4 + +.L38: + testq $2, MM + jle .L39 + + movapd 1 * SIZE(AO1), %xmm1 + + shufpd $1, %xmm1, %xmm0 + + movapd %xmm0, -16 * SIZE(B) + + movapd %xmm1, %xmm0 + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B + ALIGN_4 + +.L39: + testq $1, MM + jle .L999 + + shufpd $1, %xmm0, %xmm0 + + movlpd %xmm0, -16 * SIZE(B) + jmp .L999 + ALIGN_4 + +.L50: + movq N, J + sarq $2, J + jle .L60 + ALIGN_4 + +.L51: + movq A, AO1 + leaq (A, LDA, 2), AO2 + leaq (A, LDA, 4), A + + testq $SIZE, A + je .L52 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO2), %xmm2 + movsd 0 * SIZE(AO2, LDA), %xmm3 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm2, -14 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-4 * SIZE, B + ALIGN_3 + +.L52: + movapd -1 * SIZE(AO1, LDA), %xmm5 + movapd -1 * SIZE(AO2, LDA), %xmm7 + + movq MM, I + sarq $3, I + jle .L54 + ALIGN_4 + +.L53: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1) +#endif + + movapd 0 * SIZE(AO1), %xmm0 + movapd 1 * SIZE(AO1, LDA), %xmm1 + movapd 0 * SIZE(AO2), %xmm2 + movapd 1 * SIZE(AO2, LDA), %xmm3 + + movsd %xmm0, %xmm5 + movsd %xmm2, %xmm7 + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm3, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movapd %xmm5, -16 * SIZE(B) + movapd %xmm7, -14 * SIZE(B) + movapd %xmm0, -12 * SIZE(B) + movapd %xmm2, -10 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA) +#endif + + movapd 2 * SIZE(AO1), %xmm0 + movapd 3 * SIZE(AO1, LDA), %xmm5 + movapd 2 * SIZE(AO2), %xmm2 + movapd 3 * SIZE(AO2, LDA), %xmm7 + + movsd %xmm0, %xmm1 + movsd %xmm2, %xmm3 + shufpd $1, %xmm5, %xmm0 + shufpd $1, %xmm7, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B) +#endif + + movapd %xmm1, -8 * SIZE(B) + movapd %xmm3, -6 * SIZE(B) + movapd %xmm0, -4 * SIZE(B) + movapd %xmm2, -2 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2) +#endif + + movapd 4 * SIZE(AO1), %xmm0 + movapd 5 * SIZE(AO1, LDA), %xmm1 + movapd 4 * SIZE(AO2), %xmm2 + movapd 5 * SIZE(AO2, LDA), %xmm3 + + movsd %xmm0, %xmm5 + movsd %xmm2, %xmm7 + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm3, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 16) * SIZE(B) +#endif + + movapd %xmm5, 0 * SIZE(B) + movapd %xmm7, 2 * SIZE(B) + movapd %xmm0, 4 * SIZE(B) + movapd %xmm2, 6 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2, LDA) +#endif + + movapd 6 * SIZE(AO1), %xmm0 + movapd 7 * SIZE(AO1, LDA), %xmm5 + movapd 6 * SIZE(AO2), %xmm2 + movapd 7 * SIZE(AO2, LDA), %xmm7 + + movsd %xmm0, %xmm1 + movsd %xmm2, %xmm3 + shufpd $1, %xmm5, %xmm0 + shufpd $1, %xmm7, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 24) * SIZE(B) +#endif + + movapd %xmm1, 8 * SIZE(B) + movapd %xmm3, 10 * SIZE(B) + movapd %xmm0, 12 * SIZE(B) + movapd %xmm2, 14 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-32 * SIZE, B + + decq I + jg .L53 + ALIGN_4 + +.L54: + testq $4, MM + jle .L56 + + movapd 0 * SIZE(AO1), %xmm0 + movapd 1 * SIZE(AO1, LDA), %xmm1 + movapd 0 * SIZE(AO2), %xmm2 + movapd 1 * SIZE(AO2, LDA), %xmm3 + + movsd %xmm0, %xmm5 + shufpd $1, %xmm1, %xmm0 + movsd %xmm2, %xmm7 + shufpd $1, %xmm3, %xmm2 + + movapd %xmm5, -16 * SIZE(B) + movapd %xmm7, -14 * SIZE(B) + movapd %xmm0, -12 * SIZE(B) + movapd %xmm2, -10 * SIZE(B) + + movapd 2 * SIZE(AO1), %xmm0 + movapd 3 * SIZE(AO1, LDA), %xmm5 + movapd 2 * SIZE(AO2), %xmm2 + movapd 3 * SIZE(AO2, LDA), %xmm7 + + movsd %xmm0, %xmm1 + shufpd $1, %xmm5, %xmm0 + movsd %xmm2, %xmm3 + shufpd $1, %xmm7, %xmm2 + + movapd %xmm1, -8 * SIZE(B) + movapd %xmm3, -6 * SIZE(B) + movapd %xmm0, -4 * SIZE(B) + movapd %xmm2, -2 * SIZE(B) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-16 * SIZE, B + ALIGN_4 + +.L56: + testq $2, MM + jle .L58 + + movapd 0 * SIZE(AO1), %xmm0 + movapd 1 * SIZE(AO1, LDA), %xmm1 + movapd 0 * SIZE(AO2), %xmm2 + movapd 1 * SIZE(AO2, LDA), %xmm3 + + movsd %xmm0, %xmm5 + movsd %xmm2, %xmm7 + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm3, %xmm2 + + movapd %xmm5, -16 * SIZE(B) + movapd %xmm7, -14 * SIZE(B) + movapd %xmm0, -12 * SIZE(B) + movapd %xmm2, -10 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_4 + +.L58: + testq $1, MM + jle .L59 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO2), %xmm2 + movsd 0 * SIZE(AO2, LDA), %xmm3 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm2, -14 * SIZE(B) + subq $-4 * SIZE, B + ALIGN_4 + +.L59: + decq J + jg .L51 + ALIGN_4 + +.L60: + testq $2, N + jle .L70 + + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + testq $SIZE, A + je .L62 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + movapd %xmm0, -16 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-2 * SIZE, B + ALIGN_3 + +.L62: + movapd -1 * SIZE(AO2), %xmm5 + + movq MM, I + sarq $3, I + jle .L64 + ALIGN_4 + +.L63: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO1) +#endif + + movapd 0 * SIZE(AO1), %xmm0 + movapd 1 * SIZE(AO2), %xmm1 + movapd 2 * SIZE(AO1), %xmm2 + movapd 3 * SIZE(AO2), %xmm3 + + movsd %xmm0, %xmm5 + shufpd $1, %xmm1, %xmm0 + movsd %xmm2, %xmm1 + shufpd $1, %xmm3, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movapd %xmm5, -16 * SIZE(B) + movapd %xmm0, -14 * SIZE(B) + movapd %xmm1, -12 * SIZE(B) + movapd %xmm2, -10 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO2) +#endif + + movapd 4 * SIZE(AO1), %xmm0 + movapd 5 * SIZE(AO2), %xmm1 + movapd 6 * SIZE(AO1), %xmm2 + movapd 7 * SIZE(AO2), %xmm5 + + movsd %xmm0, %xmm3 + shufpd $1, %xmm1, %xmm0 + movsd %xmm2, %xmm1 + shufpd $1, %xmm5, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movapd %xmm3, -8 * SIZE(B) + movapd %xmm0, -6 * SIZE(B) + movapd %xmm1, -4 * SIZE(B) + movapd %xmm2, -2 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-16 * SIZE, B + + decq I + jg .L63 + ALIGN_4 + +.L64: + testq $4, MM + jle .L66 + + movapd 0 * SIZE(AO1), %xmm0 + movapd 1 * SIZE(AO2), %xmm1 + movapd 2 * SIZE(AO1), %xmm2 + movapd 3 * SIZE(AO2), %xmm3 + + movsd %xmm0, %xmm5 + shufpd $1, %xmm1, %xmm0 + movsd %xmm2, %xmm1 + shufpd $1, %xmm3, %xmm2 + + movapd %xmm5, -16 * SIZE(B) + movapd %xmm0, -14 * SIZE(B) + movapd %xmm1, -12 * SIZE(B) + movapd %xmm2, -10 * SIZE(B) + + movaps %xmm3, %xmm5 + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_4 + +.L66: + testq $2, MM + jle .L68 + + movapd 0 * SIZE(AO1), %xmm0 + movapd 1 * SIZE(AO2), %xmm1 + + movsd %xmm0, %xmm5 + shufpd $1, %xmm1, %xmm0 + + movapd %xmm5, -16 * SIZE(B) + movapd %xmm0, -14 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-4 * SIZE, B + ALIGN_4 + +.L68: + testq $1, MM + jle .L70 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + movapd %xmm0, -16 * SIZE(B) + subq $-2 * SIZE, B + ALIGN_4 + +.L70: + testq $1, N + jle .L999 + + movq A, AO1 + + testq $SIZE, A + jne .L75 + + movq MM, I + sarq $3, I + jle .L72 + ALIGN_4 + +.L71: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) +#endif + + movapd 0 * SIZE(AO1), %xmm0 + movapd 2 * SIZE(AO1), %xmm2 + movapd 4 * SIZE(AO1), %xmm4 + movapd 6 * SIZE(AO1), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm2, -14 * SIZE(B) + movapd %xmm4, -12 * SIZE(B) + movapd %xmm6, -10 * SIZE(B) + + addq $8 * SIZE, AO1 + subq $-8 * SIZE, B + + decq I + jg .L71 + ALIGN_4 + +.L72: + testq $4, MM + jle .L73 + + movapd 0 * SIZE(AO1), %xmm0 + movapd 2 * SIZE(AO1), %xmm2 + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm2, -14 * SIZE(B) + + addq $4 * SIZE, AO1 + subq $-4 * SIZE, B + ALIGN_4 + +.L73: + testq $2, MM + jle .L74 + + movapd 0 * SIZE(AO1), %xmm0 + + movapd %xmm0, -16 * SIZE(B) + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B + ALIGN_4 + +.L74: + testq $1, MM + jle .L999 + + movsd 0 * SIZE(AO1), %xmm0 + + movlpd %xmm0, -16 * SIZE(B) + jmp .L999 + ALIGN_4 + +.L75: + movapd -1 * SIZE(AO1), %xmm0 + + movq MM, I + sarq $3, I + jle .L76 + ALIGN_4 + +.L76: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) +#endif + + movapd 1 * SIZE(AO1), %xmm1 + movapd 3 * SIZE(AO1), %xmm2 + movapd 5 * SIZE(AO1), %xmm3 + movapd 7 * SIZE(AO1), %xmm4 + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm2, %xmm1 + shufpd $1, %xmm3, %xmm2 + shufpd $1, %xmm4, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm1, -14 * SIZE(B) + movapd %xmm2, -12 * SIZE(B) + movapd %xmm3, -10 * SIZE(B) + + movapd %xmm4, %xmm0 + + addq $8 * SIZE, AO1 + subq $-8 * SIZE, B + + decq I + jg .L76 + ALIGN_4 + +.L77: + testq $4, MM + jle .L78 + + movapd 1 * SIZE(AO1), %xmm1 + movapd 3 * SIZE(AO1), %xmm2 + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm2, %xmm1 + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm1, -14 * SIZE(B) + + movapd %xmm2, %xmm0 + + addq $4 * SIZE, AO1 + addq $4 * SIZE, B + ALIGN_4 + +.L78: + testq $2, MM + jle .L79 + + movapd 1 * SIZE(AO1), %xmm1 + + shufpd $1, %xmm1, %xmm0 + + movapd %xmm0, -16 * SIZE(B) + + movapd %xmm1, %xmm0 + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B + ALIGN_4 + +.L79: + testq $1, MM + jle .L999 + + shufpd $1, %xmm0, %xmm0 + + movlpd %xmm0, -16 * SIZE(B) + ALIGN_4 + +.L999: +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + movups 16(%rsp), %xmm7 + + addq $STACKSIZE, %rsp +#endif + + popq %r12 + popq %r13 + +#ifdef WINDOWS_ABI + popq %r14 + popq %r15 +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/dgemm_ncopy_8.S b/kernel/x86_64/dgemm_ncopy_8.S new file mode 100644 index 0000000..5d36272 --- /dev/null +++ b/kernel/x86_64/dgemm_ncopy_8.S @@ -0,0 +1,2002 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef NEHALEM +#define PREFETCHSIZE 12 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifndef MOVAPS +#define MOVAPS movaps +#endif + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define AO1 %r9 +#define AO2 %r10 +#define LDA3 %r11 +#define J %r12 +#define MM %r13 + +#else + +#define STACKSIZE 128 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 32 + STACKSIZE(%rsp) + +#define B %r15 + +#define AO1 %r10 +#define AO2 %r11 +#define LDA3 %r12 +#define J %r13 +#define MM %r14 + +#endif + +#define I %rax + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %r15 + pushq %r14 +#endif + pushq %r13 + pushq %r12 + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + movups %xmm7, 16(%rsp) + movups %xmm8, 32(%rsp) + movups %xmm9, 48(%rsp) + movups %xmm10, 64(%rsp) + movups %xmm11, 80(%rsp) + movups %xmm12, 96(%rsp) + + movq OLD_B, B +#endif + + leaq (,LDA, SIZE), LDA + leaq (LDA, LDA, 2), LDA3 + subq $-16 * SIZE, B + + movq M, MM + leaq -1(M), %rax + testq $SIZE, A + cmovne %rax, MM + + testq $SIZE, LDA + jne .L50 + + movq N, J + sarq $3, J + jle .L20 + ALIGN_4 + +.L11: + movq A, AO1 + leaq (A, LDA, 4), AO2 + leaq (A, LDA, 8), A + + testq $SIZE, A + je .L12 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO1, LDA, 2), %xmm2 + movsd 0 * SIZE(AO1, LDA3), %xmm3 + + movsd 0 * SIZE(AO2), %xmm4 + movsd 0 * SIZE(AO2, LDA), %xmm5 + movsd 0 * SIZE(AO2, LDA, 2), %xmm6 + movsd 0 * SIZE(AO2, LDA3), %xmm7 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + unpcklpd %xmm5, %xmm4 + unpcklpd %xmm7, %xmm6 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + movaps %xmm4, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_3 + +.L12: + movq MM, I + sarq $3, I + jle .L14 + ALIGN_4 + +.L13: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1) +#endif + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 0 * SIZE(AO1, LDA), %xmm1 + MOVAPS 0 * SIZE(AO1, LDA, 2), %xmm2 + MOVAPS 0 * SIZE(AO1, LDA3), %xmm3 + + movaps %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA) +#endif + + MOVAPS 0 * SIZE(AO2), %xmm4 + MOVAPS 0 * SIZE(AO2, LDA), %xmm5 + MOVAPS 0 * SIZE(AO2, LDA, 2), %xmm6 + MOVAPS 0 * SIZE(AO2, LDA3), %xmm7 + + movaps %xmm4, %xmm10 + unpcklpd %xmm5, %xmm4 + movaps %xmm6, %xmm11 + unpcklpd %xmm7, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) +#endif + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + movaps %xmm4, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + + unpckhpd %xmm1, %xmm8 + unpckhpd %xmm3, %xmm9 + unpckhpd %xmm5, %xmm10 + unpckhpd %xmm7, %xmm11 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 8) * SIZE(B) +#endif + + movaps %xmm8, -8 * SIZE(B) + movaps %xmm9, -6 * SIZE(B) + movaps %xmm10, -4 * SIZE(B) + movaps %xmm11, -2 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA, 2) +#endif + + MOVAPS 2 * SIZE(AO1), %xmm0 + MOVAPS 2 * SIZE(AO1, LDA), %xmm1 + MOVAPS 2 * SIZE(AO1, LDA, 2), %xmm2 + MOVAPS 2 * SIZE(AO1, LDA3), %xmm3 + + movaps %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA3) +#endif + + MOVAPS 2 * SIZE(AO2), %xmm4 + MOVAPS 2 * SIZE(AO2, LDA), %xmm5 + MOVAPS 2 * SIZE(AO2, LDA, 2), %xmm6 + MOVAPS 2 * SIZE(AO2, LDA3), %xmm7 + + movaps %xmm4, %xmm10 + unpcklpd %xmm5, %xmm4 + movaps %xmm6, %xmm11 + unpcklpd %xmm7, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 16) * SIZE(B) +#endif + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm2, 2 * SIZE(B) + movaps %xmm4, 4 * SIZE(B) + movaps %xmm6, 6 * SIZE(B) + + unpckhpd %xmm1, %xmm8 + unpckhpd %xmm3, %xmm9 + unpckhpd %xmm5, %xmm10 + unpckhpd %xmm7, %xmm11 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 24) * SIZE(B) +#endif + + movaps %xmm8, 8 * SIZE(B) + movaps %xmm9, 10 * SIZE(B) + movaps %xmm10, 12 * SIZE(B) + movaps %xmm11, 14 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2) +#endif + + MOVAPS 4 * SIZE(AO1), %xmm0 + MOVAPS 4 * SIZE(AO1, LDA), %xmm1 + MOVAPS 4 * SIZE(AO1, LDA, 2), %xmm2 + MOVAPS 4 * SIZE(AO1, LDA3), %xmm3 + + movaps %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2, LDA) +#endif + + MOVAPS 4 * SIZE(AO2), %xmm4 + MOVAPS 4 * SIZE(AO2, LDA), %xmm5 + MOVAPS 4 * SIZE(AO2, LDA, 2), %xmm6 + MOVAPS 4 * SIZE(AO2, LDA3), %xmm7 + + movaps %xmm4, %xmm10 + unpcklpd %xmm5, %xmm4 + movaps %xmm6, %xmm11 + unpcklpd %xmm7, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 32) * SIZE(B) +#endif + + movaps %xmm0, 16 * SIZE(B) + movaps %xmm2, 18 * SIZE(B) + movaps %xmm4, 20 * SIZE(B) + movaps %xmm6, 22 * SIZE(B) + + unpckhpd %xmm1, %xmm8 + unpckhpd %xmm3, %xmm9 + unpckhpd %xmm5, %xmm10 + unpckhpd %xmm7, %xmm11 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 40) * SIZE(B) +#endif + + movaps %xmm8, 24 * SIZE(B) + movaps %xmm9, 26 * SIZE(B) + movaps %xmm10, 28 * SIZE(B) + movaps %xmm11, 30 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2, LDA, 2) +#endif + + MOVAPS 6 * SIZE(AO1), %xmm0 + MOVAPS 6 * SIZE(AO1, LDA), %xmm1 + MOVAPS 6 * SIZE(AO1, LDA, 2), %xmm2 + MOVAPS 6 * SIZE(AO1, LDA3), %xmm3 + + movaps %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2, LDA3) +#endif + + MOVAPS 6 * SIZE(AO2), %xmm4 + MOVAPS 6 * SIZE(AO2, LDA), %xmm5 + MOVAPS 6 * SIZE(AO2, LDA, 2), %xmm6 + MOVAPS 6 * SIZE(AO2, LDA3), %xmm7 + + movaps %xmm4, %xmm10 + unpcklpd %xmm5, %xmm4 + movaps %xmm6, %xmm11 + unpcklpd %xmm7, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 48) * SIZE(B) +#endif + + movaps %xmm0, 32 * SIZE(B) + movaps %xmm2, 34 * SIZE(B) + movaps %xmm4, 36 * SIZE(B) + movaps %xmm6, 38 * SIZE(B) + + unpckhpd %xmm1, %xmm8 + unpckhpd %xmm3, %xmm9 + unpckhpd %xmm5, %xmm10 + unpckhpd %xmm7, %xmm11 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 56) * SIZE(B) +#endif + + movaps %xmm8, 40 * SIZE(B) + movaps %xmm9, 42 * SIZE(B) + movaps %xmm10, 44 * SIZE(B) + movaps %xmm11, 46 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-64 * SIZE, B + + decq I + jg .L13 + ALIGN_4 + +.L14: + testq $4, MM + jle .L16 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 0 * SIZE(AO1, LDA), %xmm1 + MOVAPS 0 * SIZE(AO1, LDA, 2), %xmm2 + MOVAPS 0 * SIZE(AO1, LDA3), %xmm3 + + MOVAPS 0 * SIZE(AO2), %xmm4 + MOVAPS 0 * SIZE(AO2, LDA), %xmm5 + MOVAPS 0 * SIZE(AO2, LDA, 2), %xmm6 + MOVAPS 0 * SIZE(AO2, LDA3), %xmm7 + + movaps %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + + movaps %xmm4, %xmm10 + unpcklpd %xmm5, %xmm4 + movaps %xmm6, %xmm11 + unpcklpd %xmm7, %xmm6 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + movaps %xmm4, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + + unpckhpd %xmm1, %xmm8 + unpckhpd %xmm3, %xmm9 + unpckhpd %xmm5, %xmm10 + unpckhpd %xmm7, %xmm11 + + movaps %xmm8, -8 * SIZE(B) + movaps %xmm9, -6 * SIZE(B) + movaps %xmm10, -4 * SIZE(B) + movaps %xmm11, -2 * SIZE(B) + + MOVAPS 2 * SIZE(AO1), %xmm0 + MOVAPS 2 * SIZE(AO1, LDA), %xmm1 + MOVAPS 2 * SIZE(AO1, LDA, 2), %xmm2 + MOVAPS 2 * SIZE(AO1, LDA3), %xmm3 + + MOVAPS 2 * SIZE(AO2), %xmm4 + MOVAPS 2 * SIZE(AO2, LDA), %xmm5 + MOVAPS 2 * SIZE(AO2, LDA, 2), %xmm6 + MOVAPS 2 * SIZE(AO2, LDA3), %xmm7 + + movaps %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + + movaps %xmm4, %xmm10 + unpcklpd %xmm5, %xmm4 + movaps %xmm6, %xmm11 + unpcklpd %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm2, 2 * SIZE(B) + movaps %xmm4, 4 * SIZE(B) + movaps %xmm6, 6 * SIZE(B) + + unpckhpd %xmm1, %xmm8 + unpckhpd %xmm3, %xmm9 + unpckhpd %xmm5, %xmm10 + unpckhpd %xmm7, %xmm11 + + movaps %xmm8, 8 * SIZE(B) + movaps %xmm9, 10 * SIZE(B) + movaps %xmm10, 12 * SIZE(B) + movaps %xmm11, 14 * SIZE(B) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-32 * SIZE, B + ALIGN_4 + +.L16: + testq $2, MM + jle .L18 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 0 * SIZE(AO1, LDA), %xmm1 + MOVAPS 0 * SIZE(AO1, LDA, 2), %xmm2 + MOVAPS 0 * SIZE(AO1, LDA3), %xmm3 + + MOVAPS 0 * SIZE(AO2), %xmm4 + MOVAPS 0 * SIZE(AO2, LDA), %xmm5 + MOVAPS 0 * SIZE(AO2, LDA, 2), %xmm6 + MOVAPS 0 * SIZE(AO2, LDA3), %xmm7 + + movaps %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + + movaps %xmm4, %xmm10 + unpcklpd %xmm5, %xmm4 + movaps %xmm6, %xmm11 + unpcklpd %xmm7, %xmm6 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + movaps %xmm4, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + + unpckhpd %xmm1, %xmm8 + unpckhpd %xmm3, %xmm9 + unpckhpd %xmm5, %xmm10 + unpckhpd %xmm7, %xmm11 + + movaps %xmm8, -8 * SIZE(B) + movaps %xmm9, -6 * SIZE(B) + movaps %xmm10, -4 * SIZE(B) + movaps %xmm11, -2 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-16 * SIZE, B + ALIGN_4 + +.L18: + testq $1, MM + jle .L19 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO1, LDA, 2), %xmm2 + movsd 0 * SIZE(AO1, LDA3), %xmm3 + + movsd 0 * SIZE(AO2), %xmm4 + movsd 0 * SIZE(AO2, LDA), %xmm5 + movsd 0 * SIZE(AO2, LDA, 2), %xmm6 + movsd 0 * SIZE(AO2, LDA3), %xmm7 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + unpcklpd %xmm5, %xmm4 + unpcklpd %xmm7, %xmm6 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + movaps %xmm4, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + + subq $-8 * SIZE, B + ALIGN_4 + +.L19: + decq J + jg .L11 + ALIGN_4 + +.L20: + testq $4, N + jle .L30 + + movq A, AO1 + leaq (A, LDA, 2), AO2 + leaq (A, LDA, 4), A + + testq $SIZE, A + je .L22 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO2), %xmm2 + movsd 0 * SIZE(AO2, LDA), %xmm3 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-4 * SIZE, B + ALIGN_3 + +.L22: + movq MM, I + sarq $3, I + jle .L24 + ALIGN_4 + +.L23: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO1) +#endif + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 0 * SIZE(AO1, LDA), %xmm1 + MOVAPS 0 * SIZE(AO2), %xmm2 + MOVAPS 0 * SIZE(AO2, LDA), %xmm3 + + movaps %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) +#endif + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + movaps %xmm4, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO1, LDA) +#endif + + MOVAPS 2 * SIZE(AO1), %xmm0 + MOVAPS 2 * SIZE(AO1, LDA), %xmm1 + MOVAPS 2 * SIZE(AO2), %xmm2 + MOVAPS 2 * SIZE(AO2, LDA), %xmm3 + + movaps %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 8) * SIZE(B) +#endif + + movaps %xmm0, -8 * SIZE(B) + movaps %xmm2, -6 * SIZE(B) + movaps %xmm4, -4 * SIZE(B) + movaps %xmm6, -2 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO2) +#endif + + MOVAPS 4 * SIZE(AO1), %xmm0 + MOVAPS 4 * SIZE(AO1, LDA), %xmm1 + MOVAPS 4 * SIZE(AO2), %xmm2 + MOVAPS 4 * SIZE(AO2, LDA), %xmm3 + + movaps %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 16) * SIZE(B) +#endif + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm2, 2 * SIZE(B) + movaps %xmm4, 4 * SIZE(B) + movaps %xmm6, 6 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO2, LDA) +#endif + + MOVAPS 6 * SIZE(AO1), %xmm0 + MOVAPS 6 * SIZE(AO1, LDA), %xmm1 + MOVAPS 6 * SIZE(AO2), %xmm2 + MOVAPS 6 * SIZE(AO2, LDA), %xmm3 + + movaps %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 24) * SIZE(B) +#endif + + movaps %xmm0, 8 * SIZE(B) + movaps %xmm2, 10 * SIZE(B) + movaps %xmm4, 12 * SIZE(B) + movaps %xmm6, 14 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-32 * SIZE, B + + decq I + jg .L23 + ALIGN_4 + +.L24: + testq $4, MM + jle .L26 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 0 * SIZE(AO1, LDA), %xmm1 + MOVAPS 0 * SIZE(AO2), %xmm2 + MOVAPS 0 * SIZE(AO2, LDA), %xmm3 + + movaps %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + movaps %xmm4, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + + MOVAPS 2 * SIZE(AO1), %xmm0 + MOVAPS 2 * SIZE(AO1, LDA), %xmm1 + MOVAPS 2 * SIZE(AO2), %xmm2 + MOVAPS 2 * SIZE(AO2, LDA), %xmm3 + + movaps %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + + movaps %xmm0, -8 * SIZE(B) + movaps %xmm2, -6 * SIZE(B) + movaps %xmm4, -4 * SIZE(B) + movaps %xmm6, -2 * SIZE(B) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-16 * SIZE, B + ALIGN_4 + +.L26: + testq $2, MM + jle .L28 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 0 * SIZE(AO1, LDA), %xmm1 + MOVAPS 0 * SIZE(AO2), %xmm2 + MOVAPS 0 * SIZE(AO2, LDA), %xmm3 + + movaps %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + movaps %xmm4, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_4 + +.L28: + testq $1, MM + jle .L30 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO2), %xmm2 + movsd 0 * SIZE(AO2, LDA), %xmm3 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + subq $-4 * SIZE, B + ALIGN_4 + +.L30: + testq $2, N + jle .L40 + + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + testq $SIZE, A + je .L32 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-2 * SIZE, B + ALIGN_3 + +.L32: + movq MM, I + sarq $3, I + jle .L34 + ALIGN_4 + +.L33: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) +#endif + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 0 * SIZE(AO2), %xmm1 + MOVAPS 2 * SIZE(AO1), %xmm2 + MOVAPS 2 * SIZE(AO2), %xmm3 + + movaps %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) +#endif + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm4, -14 * SIZE(B) + movaps %xmm2, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 4 * SIZE(AO2) +#endif + + MOVAPS 4 * SIZE(AO1), %xmm0 + MOVAPS 4 * SIZE(AO2), %xmm1 + MOVAPS 6 * SIZE(AO1), %xmm2 + MOVAPS 6 * SIZE(AO2), %xmm3 + + movaps %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 8) * SIZE(B) +#endif + + movaps %xmm0, -8 * SIZE(B) + movaps %xmm4, -6 * SIZE(B) + movaps %xmm2, -4 * SIZE(B) + movaps %xmm6, -2 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-16 * SIZE, B + + decq I + jg .L33 + ALIGN_4 + +.L34: + testq $4, MM + jle .L36 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 0 * SIZE(AO2), %xmm1 + MOVAPS 2 * SIZE(AO1), %xmm2 + MOVAPS 2 * SIZE(AO2), %xmm3 + + movaps %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm4 + + movaps %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm6 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm4, -14 * SIZE(B) + movaps %xmm2, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_4 + +.L36: + testq $2, MM + jle .L38 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 0 * SIZE(AO2), %xmm1 + + movaps %xmm0, %xmm2 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm2 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-4 * SIZE, B + ALIGN_4 + +.L38: + testq $1, MM + jle .L40 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(B) + subq $-2 * SIZE, B + ALIGN_4 + +.L40: + testq $1, N + jle .L999 + + movq A, AO1 + + testq $SIZE, A + jne .L45 + + movq MM, I + sarq $3, I + jle .L42 + ALIGN_4 + +.L41: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 8 * SIZE(AO1) +#endif + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 2 * SIZE(AO1), %xmm1 + MOVAPS 4 * SIZE(AO1), %xmm2 + MOVAPS 6 * SIZE(AO1), %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) +#endif + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm1, -14 * SIZE(B) + movaps %xmm2, -12 * SIZE(B) + movaps %xmm3, -10 * SIZE(B) + + addq $8 * SIZE, AO1 + subq $-8 * SIZE, B + + decq I + jg .L41 + ALIGN_4 + +.L42: + testq $4, MM + jle .L43 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 2 * SIZE(AO1), %xmm1 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm1, -14 * SIZE(B) + + addq $4 * SIZE, AO1 + subq $-4 * SIZE, B + ALIGN_4 + +.L43: + testq $2, MM + jle .L44 + + MOVAPS 0 * SIZE(AO1), %xmm0 + + movaps %xmm0, -16 * SIZE(B) + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B + ALIGN_4 + +.L44: + testq $1, MM + jle .L999 + + movsd 0 * SIZE(AO1), %xmm0 + + movlpd %xmm0, -16 * SIZE(B) + jmp .L999 + ALIGN_4 + +.L45: + MOVAPS -1 * SIZE(AO1), %xmm0 + + movq M, I + sarq $3, I + jle .L46 + ALIGN_4 + +.L46: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 8 * SIZE(AO1) +#endif + + MOVAPS 1 * SIZE(AO1), %xmm1 + MOVAPS 3 * SIZE(AO1), %xmm2 + MOVAPS 5 * SIZE(AO1), %xmm3 + MOVAPS 7 * SIZE(AO1), %xmm4 + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm2, %xmm1 + shufpd $1, %xmm3, %xmm2 + shufpd $1, %xmm4, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) +#endif + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm1, -14 * SIZE(B) + movaps %xmm2, -12 * SIZE(B) + movaps %xmm3, -10 * SIZE(B) + + movaps %xmm4, %xmm0 + + addq $8 * SIZE, AO1 + subq $-8 * SIZE, B + + decq I + jg .L46 + ALIGN_4 + +.L47: + testq $4, M + jle .L48 + + MOVAPS 1 * SIZE(AO1), %xmm1 + MOVAPS 3 * SIZE(AO1), %xmm2 + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm2, %xmm1 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm1, -14 * SIZE(B) + + movaps %xmm2, %xmm0 + + addq $4 * SIZE, AO1 + addq $4 * SIZE, B + ALIGN_4 + +.L48: + testq $2, M + jle .L49 + + MOVAPS 1 * SIZE(AO1), %xmm1 + + shufpd $1, %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(B) + + movaps %xmm1, %xmm0 + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B + ALIGN_4 + +.L49: + testq $1, M + jle .L999 + + shufpd $1, %xmm0, %xmm0 + + movlpd %xmm0, -16 * SIZE(B) + jmp .L999 + ALIGN_4 + +.L50: + movq N, J + sarq $3, J + jle .L60 + ALIGN_4 + +.L51: + movq A, AO1 + leaq (A, LDA, 4), AO2 + leaq (A, LDA, 8), A + + testq $SIZE, A + je .L52 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO1, LDA, 2), %xmm2 + movsd 0 * SIZE(AO1, LDA3), %xmm3 + movsd 0 * SIZE(AO2), %xmm4 + movsd 0 * SIZE(AO2, LDA), %xmm5 + movsd 0 * SIZE(AO2, LDA, 2), %xmm6 + movsd 0 * SIZE(AO2, LDA3), %xmm7 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + unpcklpd %xmm5, %xmm4 + unpcklpd %xmm7, %xmm6 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + movaps %xmm4, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_3 + +.L52: + MOVAPS -1 * SIZE(AO1, LDA), %xmm9 + MOVAPS -1 * SIZE(AO1, LDA3), %xmm10 + MOVAPS -1 * SIZE(AO2, LDA), %xmm11 + MOVAPS -1 * SIZE(AO2, LDA3), %xmm12 + + movq MM, I + sarq $3, I + jle .L54 + ALIGN_4 + +.L53: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1) +#endif + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 1 * SIZE(AO1, LDA), %xmm1 + MOVAPS 0 * SIZE(AO1, LDA, 2), %xmm2 + MOVAPS 1 * SIZE(AO1, LDA3), %xmm3 + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA) +#endif + + MOVAPS 0 * SIZE(AO2), %xmm4 + MOVAPS 1 * SIZE(AO2, LDA), %xmm5 + MOVAPS 0 * SIZE(AO2, LDA, 2), %xmm6 + MOVAPS 1 * SIZE(AO2, LDA3), %xmm7 + + movsd %xmm0, %xmm9 + movsd %xmm2, %xmm10 + movsd %xmm4, %xmm11 + movsd %xmm6, %xmm12 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) +#endif + + movaps %xmm9, -16 * SIZE(B) + movaps %xmm10, -14 * SIZE(B) + movaps %xmm11, -12 * SIZE(B) + movaps %xmm12, -10 * SIZE(B) + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm3, %xmm2 + shufpd $1, %xmm5, %xmm4 + shufpd $1, %xmm7, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 8) * SIZE(B) +#endif + + movaps %xmm0, -8 * SIZE(B) + movaps %xmm2, -6 * SIZE(B) + movaps %xmm4, -4 * SIZE(B) + movaps %xmm6, -2 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA, 2) +#endif + + MOVAPS 2 * SIZE(AO1), %xmm0 + MOVAPS 3 * SIZE(AO1, LDA), %xmm9 + MOVAPS 2 * SIZE(AO1, LDA, 2), %xmm2 + MOVAPS 3 * SIZE(AO1, LDA3), %xmm10 + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA3) +#endif + + MOVAPS 2 * SIZE(AO2), %xmm4 + MOVAPS 3 * SIZE(AO2, LDA), %xmm11 + MOVAPS 2 * SIZE(AO2, LDA, 2), %xmm6 + MOVAPS 3 * SIZE(AO2, LDA3), %xmm12 + + movsd %xmm0, %xmm1 + movsd %xmm2, %xmm3 + movsd %xmm4, %xmm5 + movsd %xmm6, %xmm7 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 16) * SIZE(B) +#endif + + movaps %xmm1, 0 * SIZE(B) + movaps %xmm3, 2 * SIZE(B) + movaps %xmm5, 4 * SIZE(B) + movaps %xmm7, 6 * SIZE(B) + + shufpd $1, %xmm9, %xmm0 + shufpd $1, %xmm10, %xmm2 + shufpd $1, %xmm11, %xmm4 + shufpd $1, %xmm12, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 24) * SIZE(B) +#endif + + movaps %xmm0, 8 * SIZE(B) + movaps %xmm2, 10 * SIZE(B) + movaps %xmm4, 12 * SIZE(B) + movaps %xmm6, 14 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2) +#endif + + MOVAPS 4 * SIZE(AO1), %xmm0 + MOVAPS 5 * SIZE(AO1, LDA), %xmm1 + MOVAPS 4 * SIZE(AO1, LDA, 2), %xmm2 + MOVAPS 5 * SIZE(AO1, LDA3), %xmm3 + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2, LDA) +#endif + + MOVAPS 4 * SIZE(AO2), %xmm4 + MOVAPS 5 * SIZE(AO2, LDA), %xmm5 + MOVAPS 4 * SIZE(AO2, LDA, 2), %xmm6 + MOVAPS 5 * SIZE(AO2, LDA3), %xmm7 + + movsd %xmm0, %xmm9 + movsd %xmm2, %xmm10 + movsd %xmm4, %xmm11 + movsd %xmm6, %xmm12 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 32) * SIZE(B) +#endif + + movaps %xmm9, 16 * SIZE(B) + movaps %xmm10, 18 * SIZE(B) + movaps %xmm11, 20 * SIZE(B) + movaps %xmm12, 22 * SIZE(B) + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm3, %xmm2 + shufpd $1, %xmm5, %xmm4 + shufpd $1, %xmm7, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B) +#endif + + movaps %xmm0, 24 * SIZE(B) + movaps %xmm2, 26 * SIZE(B) + movaps %xmm4, 28 * SIZE(B) + movaps %xmm6, 30 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2, LDA, 2) +#endif + + MOVAPS 6 * SIZE(AO1), %xmm0 + MOVAPS 7 * SIZE(AO1, LDA), %xmm9 + MOVAPS 6 * SIZE(AO1, LDA, 2), %xmm2 + MOVAPS 7 * SIZE(AO1, LDA3), %xmm10 + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2, LDA3) +#endif + + MOVAPS 6 * SIZE(AO2), %xmm4 + MOVAPS 7 * SIZE(AO2, LDA), %xmm11 + MOVAPS 6 * SIZE(AO2, LDA, 2), %xmm6 + MOVAPS 7 * SIZE(AO2, LDA3), %xmm12 + + movsd %xmm0, %xmm1 + movsd %xmm2, %xmm3 + movsd %xmm4, %xmm5 + movsd %xmm6, %xmm7 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 40) * SIZE(B) +#endif + + movaps %xmm1, 32 * SIZE(B) + movaps %xmm3, 34 * SIZE(B) + movaps %xmm5, 36 * SIZE(B) + movaps %xmm7, 38 * SIZE(B) + + shufpd $1, %xmm9, %xmm0 + shufpd $1, %xmm10, %xmm2 + shufpd $1, %xmm11, %xmm4 + shufpd $1, %xmm12, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 48) * SIZE(B) +#endif + movaps %xmm0, 40 * SIZE(B) + movaps %xmm2, 42 * SIZE(B) + movaps %xmm4, 44 * SIZE(B) + movaps %xmm6, 46 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-64 * SIZE, B + + decq I + jg .L53 + ALIGN_4 + +.L54: + testq $4, MM + jle .L56 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 1 * SIZE(AO1, LDA), %xmm1 + MOVAPS 0 * SIZE(AO1, LDA, 2), %xmm2 + MOVAPS 1 * SIZE(AO1, LDA3), %xmm3 + MOVAPS 0 * SIZE(AO2), %xmm4 + MOVAPS 1 * SIZE(AO2, LDA), %xmm5 + MOVAPS 0 * SIZE(AO2, LDA, 2), %xmm6 + MOVAPS 1 * SIZE(AO2, LDA3), %xmm7 + + movsd %xmm0, %xmm9 + movsd %xmm2, %xmm10 + movsd %xmm4, %xmm11 + movsd %xmm6, %xmm12 + + movaps %xmm9, -16 * SIZE(B) + movaps %xmm10, -14 * SIZE(B) + movaps %xmm11, -12 * SIZE(B) + movaps %xmm12, -10 * SIZE(B) + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm3, %xmm2 + shufpd $1, %xmm5, %xmm4 + shufpd $1, %xmm7, %xmm6 + + movaps %xmm0, -8 * SIZE(B) + movaps %xmm2, -6 * SIZE(B) + movaps %xmm4, -4 * SIZE(B) + movaps %xmm6, -2 * SIZE(B) + + MOVAPS 2 * SIZE(AO1), %xmm0 + MOVAPS 3 * SIZE(AO1, LDA), %xmm9 + MOVAPS 2 * SIZE(AO1, LDA, 2), %xmm2 + MOVAPS 3 * SIZE(AO1, LDA3), %xmm10 + MOVAPS 2 * SIZE(AO2), %xmm4 + MOVAPS 3 * SIZE(AO2, LDA), %xmm11 + MOVAPS 2 * SIZE(AO2, LDA, 2), %xmm6 + MOVAPS 3 * SIZE(AO2, LDA3), %xmm12 + + movsd %xmm0, %xmm1 + movsd %xmm2, %xmm3 + movsd %xmm4, %xmm5 + movsd %xmm6, %xmm7 + + movaps %xmm1, 0 * SIZE(B) + movaps %xmm3, 2 * SIZE(B) + movaps %xmm5, 4 * SIZE(B) + movaps %xmm7, 6 * SIZE(B) + + shufpd $1, %xmm9, %xmm0 + shufpd $1, %xmm10, %xmm2 + shufpd $1, %xmm11, %xmm4 + shufpd $1, %xmm12, %xmm6 + + movaps %xmm0, 8 * SIZE(B) + movaps %xmm2, 10 * SIZE(B) + movaps %xmm4, 12 * SIZE(B) + movaps %xmm6, 14 * SIZE(B) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-32 * SIZE, B + ALIGN_4 + +.L56: + testq $2, MM + jle .L58 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 1 * SIZE(AO1, LDA), %xmm1 + MOVAPS 0 * SIZE(AO1, LDA, 2), %xmm2 + MOVAPS 1 * SIZE(AO1, LDA3), %xmm3 + MOVAPS 0 * SIZE(AO2), %xmm4 + MOVAPS 1 * SIZE(AO2, LDA), %xmm5 + MOVAPS 0 * SIZE(AO2, LDA, 2), %xmm6 + MOVAPS 1 * SIZE(AO2, LDA3), %xmm7 + + movsd %xmm0, %xmm9 + movsd %xmm2, %xmm10 + movsd %xmm4, %xmm11 + movsd %xmm6, %xmm12 + + movaps %xmm9, -16 * SIZE(B) + movaps %xmm10, -14 * SIZE(B) + movaps %xmm11, -12 * SIZE(B) + movaps %xmm12, -10 * SIZE(B) + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm3, %xmm2 + shufpd $1, %xmm5, %xmm4 + shufpd $1, %xmm7, %xmm6 + + movaps %xmm0, -8 * SIZE(B) + movaps %xmm2, -6 * SIZE(B) + movaps %xmm4, -4 * SIZE(B) + movaps %xmm6, -2 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-16 * SIZE, B + ALIGN_4 + +.L58: + testq $1, MM + jle .L59 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO1, LDA, 2), %xmm2 + movsd 0 * SIZE(AO1, LDA3), %xmm3 + movsd 0 * SIZE(AO2), %xmm4 + movsd 0 * SIZE(AO2, LDA), %xmm5 + movsd 0 * SIZE(AO2, LDA, 2), %xmm6 + movsd 0 * SIZE(AO2, LDA3), %xmm7 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + unpcklpd %xmm5, %xmm4 + unpcklpd %xmm7, %xmm6 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + movaps %xmm4, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + + subq $-8 * SIZE, B + ALIGN_4 + +.L59: + decq J + jg .L51 + ALIGN_4 + +.L60: + testq $4, N + jle .L70 + + movq A, AO1 + leaq (A, LDA, 2), AO2 + leaq (A, LDA, 4), A + + testq $SIZE, A + je .L62 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO2), %xmm2 + movsd 0 * SIZE(AO2, LDA), %xmm3 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-4 * SIZE, B + ALIGN_3 + +.L62: + movaps -1 * SIZE(AO1, LDA), %xmm5 + movaps -1 * SIZE(AO2, LDA), %xmm7 + + movq MM, I + sarq $3, I + jle .L64 + ALIGN_4 + +.L63: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO1) +#endif + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 1 * SIZE(AO1, LDA), %xmm1 + MOVAPS 0 * SIZE(AO2), %xmm2 + MOVAPS 1 * SIZE(AO2, LDA), %xmm3 + + movsd %xmm0, %xmm5 + movsd %xmm2, %xmm7 + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm3, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) +#endif + + movaps %xmm5, -16 * SIZE(B) + movaps %xmm7, -14 * SIZE(B) + movaps %xmm0, -12 * SIZE(B) + movaps %xmm2, -10 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO1, LDA) +#endif + + MOVAPS 2 * SIZE(AO1), %xmm0 + MOVAPS 3 * SIZE(AO1, LDA), %xmm5 + MOVAPS 2 * SIZE(AO2), %xmm2 + MOVAPS 3 * SIZE(AO2, LDA), %xmm7 + + movsd %xmm0, %xmm1 + movsd %xmm2, %xmm3 + shufpd $1, %xmm5, %xmm0 + shufpd $1, %xmm7, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 8) * SIZE(B) +#endif + + movaps %xmm1, -8 * SIZE(B) + movaps %xmm3, -6 * SIZE(B) + movaps %xmm0, -4 * SIZE(B) + movaps %xmm2, -2 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO2) +#endif + + MOVAPS 4 * SIZE(AO1), %xmm0 + MOVAPS 5 * SIZE(AO1, LDA), %xmm1 + MOVAPS 4 * SIZE(AO2), %xmm2 + MOVAPS 5 * SIZE(AO2, LDA), %xmm3 + + movsd %xmm0, %xmm5 + movsd %xmm2, %xmm7 + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm3, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 16) * SIZE(B) +#endif + + movaps %xmm5, 0 * SIZE(B) + movaps %xmm7, 2 * SIZE(B) + movaps %xmm0, 4 * SIZE(B) + movaps %xmm2, 6 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO2, LDA) +#endif + + MOVAPS 6 * SIZE(AO1), %xmm0 + MOVAPS 7 * SIZE(AO1, LDA), %xmm5 + MOVAPS 6 * SIZE(AO2), %xmm2 + MOVAPS 7 * SIZE(AO2, LDA), %xmm7 + + movsd %xmm0, %xmm1 + movsd %xmm2, %xmm3 + shufpd $1, %xmm5, %xmm0 + shufpd $1, %xmm7, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 24) * SIZE(B) +#endif + + movaps %xmm1, 8 * SIZE(B) + movaps %xmm3, 10 * SIZE(B) + movaps %xmm0, 12 * SIZE(B) + movaps %xmm2, 14 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-32 * SIZE, B + + decq I + jg .L63 + ALIGN_4 + +.L64: + testq $4, MM + jle .L66 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 1 * SIZE(AO1, LDA), %xmm1 + MOVAPS 0 * SIZE(AO2), %xmm2 + MOVAPS 1 * SIZE(AO2, LDA), %xmm3 + + movsd %xmm0, %xmm5 + shufpd $1, %xmm1, %xmm0 + movsd %xmm2, %xmm7 + shufpd $1, %xmm3, %xmm2 + + movaps %xmm5, -16 * SIZE(B) + movaps %xmm7, -14 * SIZE(B) + movaps %xmm0, -12 * SIZE(B) + movaps %xmm2, -10 * SIZE(B) + + MOVAPS 2 * SIZE(AO1), %xmm0 + MOVAPS 3 * SIZE(AO1, LDA), %xmm5 + MOVAPS 2 * SIZE(AO2), %xmm2 + MOVAPS 3 * SIZE(AO2, LDA), %xmm7 + + movsd %xmm0, %xmm1 + shufpd $1, %xmm5, %xmm0 + movsd %xmm2, %xmm3 + shufpd $1, %xmm7, %xmm2 + + movaps %xmm1, -8 * SIZE(B) + movaps %xmm3, -6 * SIZE(B) + movaps %xmm0, -4 * SIZE(B) + movaps %xmm2, -2 * SIZE(B) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-16 * SIZE, B + ALIGN_4 + +.L66: + testq $2, MM + jle .L68 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 1 * SIZE(AO1, LDA), %xmm1 + MOVAPS 0 * SIZE(AO2), %xmm2 + MOVAPS 1 * SIZE(AO2, LDA), %xmm3 + + movsd %xmm0, %xmm5 + movsd %xmm2, %xmm7 + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm3, %xmm2 + + movaps %xmm5, -16 * SIZE(B) + movaps %xmm7, -14 * SIZE(B) + movaps %xmm0, -12 * SIZE(B) + movaps %xmm2, -10 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_4 + +.L68: + testq $1, MM + jle .L70 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO2), %xmm2 + movsd 0 * SIZE(AO2, LDA), %xmm3 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + subq $-4 * SIZE, B + ALIGN_4 + +.L70: + testq $2, N + jle .L80 + + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + testq $SIZE, A + je .L72 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-2 * SIZE, B + ALIGN_3 + +.L72: + MOVAPS -1 * SIZE(AO2), %xmm5 + + movq MM, I + sarq $3, I + jle .L74 + ALIGN_4 + +.L73: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) +#endif + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 1 * SIZE(AO2), %xmm1 + MOVAPS 2 * SIZE(AO1), %xmm2 + MOVAPS 3 * SIZE(AO2), %xmm3 + + movsd %xmm0, %xmm5 + shufpd $1, %xmm1, %xmm0 + movsd %xmm2, %xmm1 + shufpd $1, %xmm3, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) +#endif + + movaps %xmm5, -16 * SIZE(B) + movaps %xmm0, -14 * SIZE(B) + movaps %xmm1, -12 * SIZE(B) + movaps %xmm2, -10 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 4 * SIZE(AO2) +#endif + + MOVAPS 4 * SIZE(AO1), %xmm0 + MOVAPS 5 * SIZE(AO2), %xmm1 + MOVAPS 6 * SIZE(AO1), %xmm2 + MOVAPS 7 * SIZE(AO2), %xmm5 + + movsd %xmm0, %xmm3 + shufpd $1, %xmm1, %xmm0 + movsd %xmm2, %xmm1 + shufpd $1, %xmm5, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 8) * SIZE(B) +#endif + + movaps %xmm3, -8 * SIZE(B) + movaps %xmm0, -6 * SIZE(B) + movaps %xmm1, -4 * SIZE(B) + movaps %xmm2, -2 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-16 * SIZE, B + + decq I + jg .L73 + ALIGN_4 + +.L74: + testq $4, MM + jle .L76 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 1 * SIZE(AO2), %xmm1 + MOVAPS 2 * SIZE(AO1), %xmm2 + MOVAPS 3 * SIZE(AO2), %xmm3 + + movsd %xmm0, %xmm5 + shufpd $1, %xmm1, %xmm0 + movsd %xmm2, %xmm1 + shufpd $1, %xmm3, %xmm2 + + movaps %xmm5, -16 * SIZE(B) + movaps %xmm0, -14 * SIZE(B) + movaps %xmm1, -12 * SIZE(B) + movaps %xmm2, -10 * SIZE(B) + + movaps %xmm3, %xmm5 + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_4 + +.L76: + testq $2, MM + jle .L78 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 1 * SIZE(AO2), %xmm1 + + movsd %xmm0, %xmm5 + shufpd $1, %xmm1, %xmm0 + + movaps %xmm5, -16 * SIZE(B) + movaps %xmm0, -14 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-4 * SIZE, B + ALIGN_4 + +.L78: + testq $1, MM + jle .L80 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(B) + subq $-2 * SIZE, B + ALIGN_4 + +.L80: + testq $1, N + jle .L999 + + movq A, AO1 + + testq $SIZE, A + jne .L85 + + movq MM, I + sarq $3, I + jle .L82 + ALIGN_4 + +.L81: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 8 * SIZE(AO1) +#endif + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 2 * SIZE(AO1), %xmm2 + MOVAPS 4 * SIZE(AO1), %xmm4 + MOVAPS 6 * SIZE(AO1), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) +#endif + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + movaps %xmm4, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + + addq $8 * SIZE, AO1 + subq $-8 * SIZE, B + + decq I + jg .L81 + ALIGN_4 + +.L82: + testq $4, MM + jle .L83 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 2 * SIZE(AO1), %xmm2 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + + addq $4 * SIZE, AO1 + subq $-4 * SIZE, B + ALIGN_4 + +.L83: + testq $2, MM + jle .L84 + + MOVAPS 0 * SIZE(AO1), %xmm0 + + movaps %xmm0, -16 * SIZE(B) + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B + ALIGN_4 + +.L84: + testq $1, MM + jle .L999 + + movsd 0 * SIZE(AO1), %xmm0 + + movlpd %xmm0, -16 * SIZE(B) + jmp .L999 + ALIGN_4 + +.L85: + MOVAPS -1 * SIZE(AO1), %xmm0 + + movq M, I + sarq $3, I + jle .L86 + ALIGN_4 + +.L86: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 8 * SIZE(AO1) +#endif + + MOVAPS 1 * SIZE(AO1), %xmm1 + MOVAPS 3 * SIZE(AO1), %xmm2 + MOVAPS 5 * SIZE(AO1), %xmm3 + MOVAPS 7 * SIZE(AO1), %xmm4 + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm2, %xmm1 + shufpd $1, %xmm3, %xmm2 + shufpd $1, %xmm4, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) +#endif + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm1, -14 * SIZE(B) + movaps %xmm2, -12 * SIZE(B) + movaps %xmm3, -10 * SIZE(B) + + movaps %xmm4, %xmm0 + + addq $8 * SIZE, AO1 + subq $-8 * SIZE, B + + decq I + jg .L86 + ALIGN_4 + +.L87: + testq $4, M + jle .L88 + + MOVAPS 1 * SIZE(AO1), %xmm1 + MOVAPS 3 * SIZE(AO1), %xmm2 + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm2, %xmm1 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm1, -14 * SIZE(B) + + movaps %xmm2, %xmm0 + + addq $4 * SIZE, AO1 + addq $4 * SIZE, B + ALIGN_4 + +.L88: + testq $2, M + jle .L89 + + MOVAPS 1 * SIZE(AO1), %xmm1 + + shufpd $1, %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(B) + + movaps %xmm1, %xmm0 + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B + ALIGN_4 + +.L89: + testq $1, M + jle .L999 + + shufpd $1, %xmm0, %xmm0 + + movlpd %xmm0, -16 * SIZE(B) + ALIGN_4 + +.L999: +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + movups 16(%rsp), %xmm7 + movups 32(%rsp), %xmm8 + movups 48(%rsp), %xmm9 + movups 64(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + movups 96(%rsp), %xmm12 + + addq $STACKSIZE, %rsp +#endif + + popq %r12 + popq %r13 + +#ifdef WINDOWS_ABI + popq %r14 + popq %r15 +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/dgemm_tcopy_2.S b/kernel/x86_64/dgemm_tcopy_2.S new file mode 100644 index 0000000..06e5999 --- /dev/null +++ b/kernel/x86_64/dgemm_tcopy_2.S @@ -0,0 +1,334 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(PENTIUM4) || defined(GENERIC) +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifdef NEHALEM +#define PREFETCHSIZE 12 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define MOVUPS_A movups +#endif + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifdef OPTERON +#define PREFETCHSIZE 16 +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#endif + +#ifdef MOVUPS_A +#define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS +#define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS +#else +#define MOVUPS_A1(OFF, ADDR, REGS) movsd OFF(ADDR), REGS; movhps OFF + 8(ADDR), REGS +#define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) movsd OFF(ADDR, BASE, SCALE), REGS; movhps OFF + 8(ADDR, BASE, SCALE), REGS +#endif + +#ifndef WINDOWS_ABI + +#define N ARG1 /* rsi */ +#define M ARG2 /* rdi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define AO1 %r9 +#define AO2 %r10 +#define LDA3 %r11 +#define M8 %r12 + +#else + +#define N ARG1 /* rdx */ +#define M ARG2 /* rcx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 40(%rsp) + +#define B %r12 + +#define AO1 %rsi +#define AO2 %rdi +#define LDA3 %r10 +#define M8 %r11 +#endif + +#define I %rax +#define B0 %rbp +#define B3 %r13 + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %rdi + pushq %rsi +#endif + + pushq %r12 + pushq %r13 + pushq %rbp + +#ifdef WINDOWS_ABI + movq OLD_B, B +#endif + + subq $-16 * SIZE, B + + movq M, B3 + andq $-2, B3 + imulq N, B3 + + leaq (B, B3, SIZE), B3 + + leaq (,LDA, SIZE), LDA + leaq (LDA, LDA, 2), LDA3 + + leaq (, N, SIZE), M8 + + cmpq $2, N + jl .L40 + ALIGN_4 + +.L31: + subq $2, N + + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + movq B, B0 + addq $4 * SIZE, B + + movq M, I + sarq $3, I + jle .L34 + ALIGN_4 + +.L33: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO1) +#endif + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A1(0 * SIZE, AO2, %xmm2) + MOVUPS_A1(2 * SIZE, AO2, %xmm3) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm2, -14 * SIZE(B0) + movaps %xmm1, -16 * SIZE(B0, M8, 2) + movaps %xmm3, -14 * SIZE(B0, M8, 2) + + leaq (B0, M8, 4), B0 + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO2) +#endif + + MOVUPS_A1(4 * SIZE, AO1, %xmm0) + MOVUPS_A1(6 * SIZE, AO1, %xmm1) + MOVUPS_A1(4 * SIZE, AO2, %xmm2) + MOVUPS_A1(6 * SIZE, AO2, %xmm3) + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm2, -14 * SIZE(B0) + movaps %xmm1, -16 * SIZE(B0, M8, 2) + movaps %xmm3, -14 * SIZE(B0, M8, 2) + + leaq (B0, M8, 4), B0 + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + + decq I + jg .L33 + ALIGN_4 + +.L34: + testq $4, M + jle .L36 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A1(0 * SIZE, AO2, %xmm2) + MOVUPS_A1(2 * SIZE, AO2, %xmm3) + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm2, -14 * SIZE(B0) + movaps %xmm1, -16 * SIZE(B0, M8, 2) + movaps %xmm3, -14 * SIZE(B0, M8, 2) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + leaq (B0, M8, 4), B0 + ALIGN_4 + +.L36: + testq $2, M + jle .L38 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(0 * SIZE, AO2, %xmm1) + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm1, -14 * SIZE(B0) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + leaq (B0, M8, 2), B0 + ALIGN_4 + +.L38: + testq $1, M + jle .L39 + + movsd 0 * SIZE(AO1), %xmm0 + movhpd 0 * SIZE(AO2), %xmm0 + + movaps %xmm0, -16 * SIZE(B3) + subq $-2 * SIZE, B3 + ALIGN_4 + +.L39: + cmpq $2, N + jge .L31 + ALIGN_4 + +.L40: + cmpq $1, N + jl .L999 + + movq A, AO1 + movq B, B0 + + movq M, I + sarq $3, I + jle .L44 + ALIGN_4 + +.L43: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) +#endif + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A1(4 * SIZE, AO1, %xmm2) + MOVUPS_A1(6 * SIZE, AO1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + addq $8 * SIZE, AO1 + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm1, -16 * SIZE(B0, M8, 2) + leaq (B0, M8, 4), B0 + movaps %xmm2, -16 * SIZE(B0) + movaps %xmm3, -16 * SIZE(B0, M8, 2) + leaq (B0, M8, 4), B0 + + decq I + jg .L43 + ALIGN_4 + +.L44: + testq $4, M + jle .L45 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + + addq $4 * SIZE, AO1 + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm1, -16 * SIZE(B0, M8, 2) + leaq (B0, M8, 4), B0 + ALIGN_4 + +.L45: + testq $2, M + jle .L46 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + + movaps %xmm0, -16 * SIZE(B0) + + addq $2 * SIZE, AO1 + ALIGN_4 + +.L46: + testq $1, M + jle .L999 + + movsd 0 * SIZE(AO1), %xmm0 + + movlpd %xmm0, -16 * SIZE(B3) + ALIGN_4 + +.L999: + popq %rbp + popq %r13 + popq %r12 + +#ifdef WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/dgemm_tcopy_4.S b/kernel/x86_64/dgemm_tcopy_4.S new file mode 100644 index 0000000..8b81c41 --- /dev/null +++ b/kernel/x86_64/dgemm_tcopy_4.S @@ -0,0 +1,516 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(PENTIUM4) || defined(GENERIC) +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifdef NEHALEM +#define PREFETCHSIZE 12 +#define PREFETCH prefetcht0 +#define MOVUPS_A movups +#endif + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifdef OPTERON +#define PREFETCHSIZE 16 +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#endif + +#ifdef MOVUPS_A +#define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS +#define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS +#else +#define MOVUPS_A1(OFF, ADDR, REGS) movsd OFF(ADDR), REGS; movhps OFF + 8(ADDR), REGS +#define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) movsd OFF(ADDR, BASE, SCALE), REGS; movhps OFF + 8(ADDR, BASE, SCALE), REGS +#endif + +#ifndef WINDOWS_ABI + +#define N ARG1 /* rsi */ +#define M ARG2 /* rdi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define AO1 %r9 +#define AO2 %r10 +#define LDA3 %r11 +#define M8 %r12 + +#else + +#define STACKSIZE 256 + +#define N ARG1 /* rdx */ +#define M ARG2 /* rcx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 64 + 32 + STACKSIZE(%rsp) + +#define B %r12 + +#define AO1 %rsi +#define AO2 %rdi +#define LDA3 %r10 +#define M8 %r11 +#endif + +#define I %rax + +#define B0 %rbp +#define B2 %r14 +#define B3 %r15 + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %rdi + pushq %rsi +#endif + + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + +#ifdef WINDOWS_ABI + movq OLD_B, B +#endif + + subq $-16 * SIZE, B + + movq M, B2 + movq M, B3 + + andq $-4, B2 + andq $-2, B3 + + imulq N, B2 + imulq N, B3 + + leaq (B, B2, SIZE), B2 + leaq (B, B3, SIZE), B3 + + leaq (,LDA, SIZE), LDA + leaq (LDA, LDA, 2), LDA3 + + leaq (, N, SIZE), M8 + + cmpq $4, N + jl .L30 + ALIGN_4 + +.L21: + subq $4, N + + movq A, AO1 + leaq (A, LDA, 2), AO2 + leaq (A, LDA, 4), A + + movq B, B0 + addq $16 * SIZE, B + + movq M, I + sarq $3, I + jle .L24 + ALIGN_4 + +.L23: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1) +#endif + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A1(4 * SIZE, AO1, %xmm2) + MOVUPS_A1(6 * SIZE, AO1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm1, -14 * SIZE(B0) + movaps %xmm2, -16 * SIZE(B0, M8, 4) + movaps %xmm3, -14 * SIZE(B0, M8, 4) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA) +#endif + + MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0) + MOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1) + MOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2) + MOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B) +#endif + + movaps %xmm0, -12 * SIZE(B0) + movaps %xmm1, -10 * SIZE(B0) + movaps %xmm2, -12 * SIZE(B0, M8, 4) + movaps %xmm3, -10 * SIZE(B0, M8, 4) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2) +#endif + + MOVUPS_A1(0 * SIZE, AO2, %xmm0) + MOVUPS_A1(2 * SIZE, AO2, %xmm1) + MOVUPS_A1(4 * SIZE, AO2, %xmm2) + MOVUPS_A1(6 * SIZE, AO2, %xmm3) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 16) * SIZE(B) +#endif + + movaps %xmm0, -8 * SIZE(B0) + movaps %xmm1, -6 * SIZE(B0) + movaps %xmm2, -8 * SIZE(B0, M8, 4) + movaps %xmm3, -6 * SIZE(B0, M8, 4) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2, LDA) +#endif + + MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0) + MOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1) + MOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2) + MOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 24) * SIZE(B) +#endif + + movaps %xmm0, -4 * SIZE(B0) + movaps %xmm1, -2 * SIZE(B0) + movaps %xmm2, -4 * SIZE(B0, M8, 4) + movaps %xmm3, -2 * SIZE(B0, M8, 4) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + leaq (B0, M8, 8), B0 + + decq I + jg .L23 + ALIGN_4 + +.L24: + testq $4, M + jle .L26 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2) + MOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3) + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm1, -14 * SIZE(B0) + movaps %xmm2, -12 * SIZE(B0) + movaps %xmm3, -10 * SIZE(B0) + + MOVUPS_A1(0 * SIZE, AO2, %xmm0) + MOVUPS_A1(2 * SIZE, AO2, %xmm1) + MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2) + MOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3) + + movaps %xmm0, -8 * SIZE(B0) + movaps %xmm1, -6 * SIZE(B0) + movaps %xmm2, -4 * SIZE(B0) + movaps %xmm3, -2 * SIZE(B0) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + leaq (B0, M8, 4), B0 + ALIGN_4 + +.L26: + testq $2, M + jle .L28 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1) + MOVUPS_A1(0 * SIZE, AO2, %xmm2) + MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm3) + + movaps %xmm0, -16 * SIZE(B2) + movaps %xmm1, -14 * SIZE(B2) + movaps %xmm2, -12 * SIZE(B2) + movaps %xmm3, -10 * SIZE(B2) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-8 * SIZE, B2 + ALIGN_4 + +.L28: + testq $1, M + jle .L29 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO2), %xmm2 + movsd 0 * SIZE(AO2, LDA), %xmm3 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + + movaps %xmm0, -16 * SIZE(B3) + movaps %xmm2, -14 * SIZE(B3) + subq $-4 * SIZE, B3 + ALIGN_4 + +.L29: + cmpq $4, N + jge .L21 + ALIGN_4 + +.L30: + cmpq $2, N + jl .L40 + + subq $2, N + + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + movq B, B0 + addq $8 * SIZE, B + + movq M, I + sarq $3, I + jle .L34 + ALIGN_4 + +.L33: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO1) +#endif + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A1(4 * SIZE, AO1, %xmm2) + MOVUPS_A1(6 * SIZE, AO1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm1, -14 * SIZE(B0) + movaps %xmm2, -16 * SIZE(B0, M8, 4) + movaps %xmm3, -14 * SIZE(B0, M8, 4) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO2) +#endif + + MOVUPS_A1(0 * SIZE, AO2, %xmm0) + MOVUPS_A1(2 * SIZE, AO2, %xmm1) + MOVUPS_A1(4 * SIZE, AO2, %xmm2) + MOVUPS_A1(6 * SIZE, AO2, %xmm3) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B) +#endif + + movaps %xmm0, -12 * SIZE(B0) + movaps %xmm1, -10 * SIZE(B0) + movaps %xmm2, -12 * SIZE(B0, M8, 4) + movaps %xmm3, -10 * SIZE(B0, M8, 4) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + leaq (B0, M8, 8), B0 + + decq I + jg .L33 + ALIGN_4 + +.L34: + testq $4, M + jle .L36 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A1(0 * SIZE, AO2, %xmm2) + MOVUPS_A1(2 * SIZE, AO2, %xmm3) + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm1, -14 * SIZE(B0) + movaps %xmm2, -12 * SIZE(B0) + movaps %xmm3, -10 * SIZE(B0) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + leaq (B0, M8, 4), B0 + ALIGN_4 + +.L36: + testq $2, M + jle .L38 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(0 * SIZE, AO2, %xmm1) + + movaps %xmm0, -16 * SIZE(B2) + movaps %xmm1, -14 * SIZE(B2) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-4 * SIZE, B2 + ALIGN_4 + +.L38: + testq $1, M + jle .L40 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(B3) + subq $-2 * SIZE, B3 + ALIGN_4 + +.L40: + cmpq $1, N + jl .L999 + + movq A, AO1 + + movq B, B0 + + movq M, I + sarq $3, I + jle .L44 + ALIGN_4 + +.L43: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) +#endif + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A1(4 * SIZE, AO1, %xmm2) + MOVUPS_A1(6 * SIZE, AO1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm1, -14 * SIZE(B0) + movaps %xmm2, -16 * SIZE(B0, M8, 4) + movaps %xmm3, -14 * SIZE(B0, M8, 4) + + addq $8 * SIZE, AO1 + leaq (B0, M8, 8), B0 + + decq I + jg .L43 + ALIGN_4 + +.L44: + testq $4, M + jle .L45 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm1, -14 * SIZE(B0) + + addq $4 * SIZE, AO1 + leaq (B0, M8, 4), B0 + ALIGN_4 + +.L45: + testq $2, M + jle .L46 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + + movaps %xmm0, -16 * SIZE(B2) + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B2 + ALIGN_4 + +.L46: + testq $1, M + jle .L999 + + movsd 0 * SIZE(AO1), %xmm0 + + movlpd %xmm0, -16 * SIZE(B3) + jmp .L999 + ALIGN_4 + +.L999: + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 + +#ifdef WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/dgemm_tcopy_8.S b/kernel/x86_64/dgemm_tcopy_8.S new file mode 100644 index 0000000..9760337 --- /dev/null +++ b/kernel/x86_64/dgemm_tcopy_8.S @@ -0,0 +1,780 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef NEHALEM +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define MOVUPS_A movups +#endif + +#ifdef MOVUPS_A +#define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS +#define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS +#else +#define MOVUPS_A1(OFF, ADDR, REGS) movsd OFF(ADDR), REGS; movhps OFF + 8(ADDR), REGS +#define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) movsd OFF(ADDR, BASE, SCALE), REGS; movhps OFF + 8(ADDR, BASE, SCALE), REGS +#endif + +#ifndef WINDOWS_ABI + +#define N ARG1 /* rsi */ +#define M ARG2 /* rdi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define AO1 %r9 +#define AO2 %r10 +#define LDA3 %r11 +#define M8 %r12 + +#else + +#define N ARG1 /* rdx */ +#define M ARG2 /* rcx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 56(%rsp) + +#define B %r12 + +#define AO1 %rsi +#define AO2 %rdi +#define LDA3 %r10 +#define M8 %r11 +#endif + +#define I %rax + +#define B0 %rbp +#define B1 %r13 +#define B2 %r14 +#define B3 %r15 + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %rdi + pushq %rsi +#endif + + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + +#ifdef WINDOWS_ABI + movq OLD_B, B +#endif + + subq $-16 * SIZE, B + + movq M, B1 + movq M, B2 + movq M, B3 + + andq $-8, B1 + andq $-4, B2 + andq $-2, B3 + + imulq N, B1 + imulq N, B2 + imulq N, B3 + + leaq (B, B1, SIZE), B1 + leaq (B, B2, SIZE), B2 + leaq (B, B3, SIZE), B3 + + leaq (,LDA, SIZE), LDA + leaq (LDA, LDA, 2), LDA3 + + leaq (, N, SIZE), M8 + + cmpq $8, N + jl .L20 + ALIGN_4 + +.L11: + subq $8, N + + movq A, AO1 + leaq (A, LDA, 4), AO2 + leaq (A, LDA, 8), A + + movq B, B0 + addq $64 * SIZE, B + + movq M, I + sarq $3, I + jle .L14 + ALIGN_4 + +.L13: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1) +#endif + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A1(4 * SIZE, AO1, %xmm2) + MOVUPS_A1(6 * SIZE, AO1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW 48 * SIZE(B0) +#endif + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm1, -14 * SIZE(B0) + movaps %xmm2, -12 * SIZE(B0) + movaps %xmm3, -10 * SIZE(B0) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA) +#endif + + MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0) + MOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1) + MOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2) + MOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW 56 * SIZE(B0) +#endif + + movaps %xmm0, -8 * SIZE(B0) + movaps %xmm1, -6 * SIZE(B0) + movaps %xmm2, -4 * SIZE(B0) + movaps %xmm3, -2 * SIZE(B0) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA, 2) +#endif + + MOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm0) + MOVUPS_A2(2 * SIZE, AO1, LDA, 2, %xmm1) + MOVUPS_A2(4 * SIZE, AO1, LDA, 2, %xmm2) + MOVUPS_A2(6 * SIZE, AO1, LDA, 2, %xmm3) + +#ifdef PREFETCHW + PREFETCHW 64 * SIZE(B0) +#endif + + movaps %xmm0, 0 * SIZE(B0) + movaps %xmm1, 2 * SIZE(B0) + movaps %xmm2, 4 * SIZE(B0) + movaps %xmm3, 6 * SIZE(B0) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA3) +#endif + + MOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm0) + MOVUPS_A2(2 * SIZE, AO1, LDA3, 1, %xmm1) + MOVUPS_A2(4 * SIZE, AO1, LDA3, 1, %xmm2) + MOVUPS_A2(6 * SIZE, AO1, LDA3, 1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW 72 * SIZE(B0) +#endif + + movaps %xmm0, 8 * SIZE(B0) + movaps %xmm1, 10 * SIZE(B0) + movaps %xmm2, 12 * SIZE(B0) + movaps %xmm3, 14 * SIZE(B0) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2) +#endif + + MOVUPS_A1(0 * SIZE, AO2, %xmm0) + MOVUPS_A1(2 * SIZE, AO2, %xmm1) + MOVUPS_A1(4 * SIZE, AO2, %xmm2) + MOVUPS_A1(6 * SIZE, AO2, %xmm3) + +#ifdef PREFETCHW + PREFETCHW 80 * SIZE(B0) +#endif + + movaps %xmm0, 16 * SIZE(B0) + movaps %xmm1, 18 * SIZE(B0) + movaps %xmm2, 20 * SIZE(B0) + movaps %xmm3, 22 * SIZE(B0) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2, LDA) +#endif + + MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0) + MOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1) + MOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2) + MOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW 88 * SIZE(B0) +#endif + + movaps %xmm0, 24 * SIZE(B0) + movaps %xmm1, 26 * SIZE(B0) + movaps %xmm2, 28 * SIZE(B0) + movaps %xmm3, 30 * SIZE(B0) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2, LDA, 2) +#endif + + MOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm0) + MOVUPS_A2(2 * SIZE, AO2, LDA, 2, %xmm1) + MOVUPS_A2(4 * SIZE, AO2, LDA, 2, %xmm2) + MOVUPS_A2(6 * SIZE, AO2, LDA, 2, %xmm3) + +#ifdef PREFETCHW + PREFETCHW 96 * SIZE(B0) +#endif + + movaps %xmm0, 32 * SIZE(B0) + movaps %xmm1, 34 * SIZE(B0) + movaps %xmm2, 36 * SIZE(B0) + movaps %xmm3, 38 * SIZE(B0) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2, LDA3) +#endif + + MOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm0) + MOVUPS_A2(2 * SIZE, AO2, LDA3, 1, %xmm1) + MOVUPS_A2(4 * SIZE, AO2, LDA3, 1, %xmm2) + MOVUPS_A2(6 * SIZE, AO2, LDA3, 1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW 104 * SIZE(B0) +#endif + + movaps %xmm0, 40 * SIZE(B0) + movaps %xmm1, 42 * SIZE(B0) + movaps %xmm2, 44 * SIZE(B0) + movaps %xmm3, 46 * SIZE(B0) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + leaq (B0, M8, 8), B0 + + decq I + jg .L13 + ALIGN_4 + +.L14: + testq $4, M + jle .L16 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2) + MOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3) + + movaps %xmm0, -16 * SIZE(B1) + movaps %xmm1, -14 * SIZE(B1) + movaps %xmm2, -12 * SIZE(B1) + movaps %xmm3, -10 * SIZE(B1) + + MOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm0) + MOVUPS_A2(2 * SIZE, AO1, LDA, 2, %xmm1) + MOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm2) + MOVUPS_A2(2 * SIZE, AO1, LDA3, 1, %xmm3) + + movaps %xmm0, -8 * SIZE(B1) + movaps %xmm1, -6 * SIZE(B1) + movaps %xmm2, -4 * SIZE(B1) + movaps %xmm3, -2 * SIZE(B1) + + MOVUPS_A1(0 * SIZE, AO2, %xmm0) + MOVUPS_A1(2 * SIZE, AO2, %xmm1) + MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2) + MOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3) + + movaps %xmm0, 0 * SIZE(B1) + movaps %xmm1, 2 * SIZE(B1) + movaps %xmm2, 4 * SIZE(B1) + movaps %xmm3, 6 * SIZE(B1) + + MOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm0) + MOVUPS_A2(2 * SIZE, AO2, LDA, 2, %xmm1) + MOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm2) + MOVUPS_A2(2 * SIZE, AO2, LDA3, 1, %xmm3) + + movaps %xmm0, 8 * SIZE(B1) + movaps %xmm1, 10 * SIZE(B1) + movaps %xmm2, 12 * SIZE(B1) + movaps %xmm3, 14 * SIZE(B1) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-32 * SIZE, B1 + ALIGN_4 + +.L16: + testq $2, M + jle .L18 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1) + MOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm2) + MOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm3) + + movaps %xmm0, -16 * SIZE(B2) + movaps %xmm1, -14 * SIZE(B2) + movaps %xmm2, -12 * SIZE(B2) + movaps %xmm3, -10 * SIZE(B2) + + MOVUPS_A1(0 * SIZE, AO2, %xmm0) + MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm1) + MOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm2) + MOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm3) + + movaps %xmm0, -8 * SIZE(B2) + movaps %xmm1, -6 * SIZE(B2) + movaps %xmm2, -4 * SIZE(B2) + movaps %xmm3, -2 * SIZE(B2) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-16 * SIZE, B2 + ALIGN_4 + +.L18: + testq $1, M + jle .L19 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO1, LDA, 2), %xmm2 + movsd 0 * SIZE(AO1, LDA3), %xmm3 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + + movaps %xmm0, -16 * SIZE(B3) + movaps %xmm2, -14 * SIZE(B3) + + movsd 0 * SIZE(AO2), %xmm0 + movsd 0 * SIZE(AO2, LDA), %xmm1 + movsd 0 * SIZE(AO2, LDA, 2), %xmm2 + movsd 0 * SIZE(AO2, LDA3), %xmm3 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + + movaps %xmm0, -12 * SIZE(B3) + movaps %xmm2, -10 * SIZE(B3) + + subq $-8 * SIZE, B3 + ALIGN_4 + +.L19: + cmpq $8, N + jge .L11 + ALIGN_4 + +.L20: + cmpq $4, N + jl .L30 + + subq $4, N + + movq A, AO1 + leaq (A, LDA, 2), AO2 + leaq (A, LDA, 4), A + + movq B, B0 + addq $32 * SIZE, B + + movq M, I + sarq $3, I + jle .L24 + ALIGN_4 + +.L23: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1) +#endif + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A1(4 * SIZE, AO1, %xmm2) + MOVUPS_A1(6 * SIZE, AO1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW 16 * SIZE(B0) +#endif + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm1, -14 * SIZE(B0) + movaps %xmm2, -12 * SIZE(B0) + movaps %xmm3, -10 * SIZE(B0) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA) +#endif + + MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0) + MOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1) + MOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2) + MOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW 24 * SIZE(B0) +#endif + + movaps %xmm0, -8 * SIZE(B0) + movaps %xmm1, -6 * SIZE(B0) + movaps %xmm2, -4 * SIZE(B0) + movaps %xmm3, -2 * SIZE(B0) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA, 2) +#endif + + MOVUPS_A1(0 * SIZE, AO2, %xmm0) + MOVUPS_A1(2 * SIZE, AO2, %xmm1) + MOVUPS_A1(4 * SIZE, AO2, %xmm2) + MOVUPS_A1(6 * SIZE, AO2, %xmm3) + +#ifdef PREFETCHW + PREFETCHW 32 * SIZE(B0) +#endif + + movaps %xmm0, 0 * SIZE(B0) + movaps %xmm1, 2 * SIZE(B0) + movaps %xmm2, 4 * SIZE(B0) + movaps %xmm3, 6 * SIZE(B0) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA3) +#endif + + MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0) + MOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1) + MOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2) + MOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW 40 * SIZE(B0) +#endif + + movaps %xmm0, 8 * SIZE(B0) + movaps %xmm1, 10 * SIZE(B0) + movaps %xmm2, 12 * SIZE(B0) + movaps %xmm3, 14 * SIZE(B0) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + leaq (B0, M8, 8), B0 + + decq I + jg .L23 + ALIGN_4 + +.L24: + testq $4, M + jle .L26 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2) + MOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3) + + movaps %xmm0, -16 * SIZE(B1) + movaps %xmm1, -14 * SIZE(B1) + movaps %xmm2, -12 * SIZE(B1) + movaps %xmm3, -10 * SIZE(B1) + + MOVUPS_A1(0 * SIZE, AO2, %xmm0) + MOVUPS_A1(2 * SIZE, AO2, %xmm1) + MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2) + MOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3) + + movaps %xmm0, -8 * SIZE(B1) + movaps %xmm1, -6 * SIZE(B1) + movaps %xmm2, -4 * SIZE(B1) + movaps %xmm3, -2 * SIZE(B1) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-16 * SIZE, B1 + ALIGN_4 + +.L26: + testq $2, M + jle .L28 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1) + MOVUPS_A1(0 * SIZE, AO2, %xmm2) + MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm3) + + movaps %xmm0, -16 * SIZE(B2) + movaps %xmm1, -14 * SIZE(B2) + movaps %xmm2, -12 * SIZE(B2) + movaps %xmm3, -10 * SIZE(B2) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-8 * SIZE, B2 + ALIGN_4 + +.L28: + testq $1, M + jle .L30 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO2), %xmm2 + movsd 0 * SIZE(AO2, LDA), %xmm3 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + + movaps %xmm0, -16 * SIZE(B3) + movaps %xmm2, -14 * SIZE(B3) + subq $-4 * SIZE, B3 + ALIGN_4 + +.L30: + cmpq $2, N + jl .L40 + + subq $2, N + + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + movq B, B0 + addq $16 * SIZE, B + + movq M, I + sarq $3, I + jle .L34 + ALIGN_4 + +.L33: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1) +#endif + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A1(4 * SIZE, AO1, %xmm2) + MOVUPS_A1(6 * SIZE, AO1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW 0 * SIZE(B0) +#endif + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm1, -14 * SIZE(B0) + movaps %xmm2, -12 * SIZE(B0) + movaps %xmm3, -10 * SIZE(B0) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2) +#endif + + MOVUPS_A1(0 * SIZE, AO2, %xmm0) + MOVUPS_A1(2 * SIZE, AO2, %xmm1) + MOVUPS_A1(4 * SIZE, AO2, %xmm2) + MOVUPS_A1(6 * SIZE, AO2, %xmm3) + +#ifdef PREFETCHW + PREFETCHW 8 * SIZE(B0) +#endif + + movaps %xmm0, -8 * SIZE(B0) + movaps %xmm1, -6 * SIZE(B0) + movaps %xmm2, -4 * SIZE(B0) + movaps %xmm3, -2 * SIZE(B0) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + leaq (B0, M8, 8), B0 + + decq I + jg .L33 + ALIGN_4 + +.L34: + testq $4, M + jle .L36 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A1(0 * SIZE, AO2, %xmm2) + MOVUPS_A1(2 * SIZE, AO2, %xmm3) + + movaps %xmm0, -16 * SIZE(B1) + movaps %xmm1, -14 * SIZE(B1) + movaps %xmm2, -12 * SIZE(B1) + movaps %xmm3, -10 * SIZE(B1) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-8 * SIZE, B1 + ALIGN_4 + +.L36: + testq $2, M + jle .L38 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(0 * SIZE, AO2, %xmm1) + + movaps %xmm0, -16 * SIZE(B2) + movaps %xmm1, -14 * SIZE(B2) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-4 * SIZE, B2 + ALIGN_4 + +.L38: + testq $1, M + jle .L40 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(B3) + subq $-2 * SIZE, B3 + ALIGN_4 + +.L40: + cmpq $1, N + jl .L999 + + movq A, AO1 + + movq B, B0 + + movq M, I + sarq $3, I + jle .L44 + ALIGN_4 + +.L43: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 8 * SIZE(AO1) +#endif + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A1(4 * SIZE, AO1, %xmm2) + MOVUPS_A1(6 * SIZE, AO1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW -8 * SIZE(B0) +#endif + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm1, -14 * SIZE(B0) + movaps %xmm2, -12 * SIZE(B0) + movaps %xmm3, -10 * SIZE(B0) + + addq $8 * SIZE, AO1 + leaq (B0, M8, 8), B0 + + decq I + jg .L43 + ALIGN_4 + +.L44: + testq $4, M + jle .L45 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + + movaps %xmm0, -16 * SIZE(B1) + movaps %xmm1, -14 * SIZE(B1) + + addq $4 * SIZE, AO1 + subq $-4 * SIZE, B1 + ALIGN_4 + +.L45: + testq $2, M + jle .L46 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + + movaps %xmm0, -16 * SIZE(B2) + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B2 + ALIGN_4 + +.L46: + testq $1, M + jle .L999 + + movsd 0 * SIZE(AO1), %xmm0 + + movlpd %xmm0, -16 * SIZE(B3) + jmp .L999 + ALIGN_4 + +.L999: + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 + +#ifdef WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/dgemv_n.S b/kernel/x86_64/dgemv_n.S new file mode 100644 index 0000000..3c3cdfb --- /dev/null +++ b/kernel/x86_64/dgemv_n.S @@ -0,0 +1,2843 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#if GEMV_UNROLL < 2 +#undef GEMV_UNROLL +#define GEMV_UNROLL 2 +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_A %rcx +#define OLD_LDA %r8 +#define STACK_INCX 8 + STACKSIZE(%rsp) +#define STACK_Y 16 + STACKSIZE(%rsp) +#define STACK_INCY 24 + STACKSIZE(%rsp) +#define STACK_BUFFER 32 + STACKSIZE(%rsp) +#define ALPHA 48 (%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_M %rcx +#define OLD_N %rdx +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_LDA 48 + STACKSIZE(%rsp) +#define OLD_X 56 + STACKSIZE(%rsp) +#define STACK_INCX 64 + STACKSIZE(%rsp) +#define STACK_Y 72 + STACKSIZE(%rsp) +#define STACK_INCY 80 + STACKSIZE(%rsp) +#define STACK_BUFFER 88 + STACKSIZE(%rsp) +#define ALPHA 224 (%rsp) + +#endif + +#define LDA %r8 +#define X %r9 + +#define INCX %rsi +#define INCY %rdi + +#define M %r10 +#define N %r11 +#define A %r12 +#define Y %r14 +#define BUFFER %r13 + +#define I %rax +#define A1 %rbx +#define A2 %rcx +#define LDA3 %rdx +#define Y1 %rbp + +#ifdef ALIGNED_ACCESS +#define MM %r15 +#else +#define MM M +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_M, M + movq OLD_N, N + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X +#else + movq OLD_M, M + movq OLD_N, N + movq OLD_A, A + movq OLD_LDA, LDA +#endif + + movq STACK_INCX, INCX + movq STACK_Y, Y + movq STACK_INCY, INCY + movq STACK_BUFFER, BUFFER + +#ifndef WINDOWS_ABI + movsd %xmm0, ALPHA +#else + movsd %xmm3, ALPHA +#endif + + leaq -1(INCY), %rax + + leaq (,INCX, SIZE), INCX + leaq (,INCY, SIZE), INCY + leaq (,LDA, SIZE), LDA + + leaq (LDA, LDA, 2), LDA3 + + subq $-16 * SIZE, A + +#ifdef ALIGNED_ACCESS + leaq -1 (M), MM + testq $SIZE, A + cmoveq M, MM +#endif + + testq N, N # if n <= 0 goto END + jle .L999 + testq M, M # if n <= 0 goto END + jle .L999 + +#if !defined(COPY_FORCE) && !defined(ALIGNED_ACCESS) +#ifndef NOCOPY_UNALIGNED + movq Y, Y1 + andq $0xf, Y1 + orq Y1, %rax +#endif + testq %rax, %rax + cmoveq Y, BUFFER + je .L10 +#endif + + movq BUFFER, Y1 + + pxor %xmm4, %xmm4 + + movq M, %rax + addq $16, %rax + sarq $4, %rax + ALIGN_3 + +.L01: + movapd %xmm4, 0 * SIZE(Y1) + movapd %xmm4, 2 * SIZE(Y1) + movapd %xmm4, 4 * SIZE(Y1) + movapd %xmm4, 6 * SIZE(Y1) + movapd %xmm4, 8 * SIZE(Y1) + movapd %xmm4, 10 * SIZE(Y1) + movapd %xmm4, 12 * SIZE(Y1) + movapd %xmm4, 14 * SIZE(Y1) + subq $-16 * SIZE, Y1 + decq %rax + jg .L01 + ALIGN_3 + +.L10: + +#ifdef ALIGNED_ACCESS + leaq SIZE(BUFFER), %rax + testq $SIZE, A + cmovne %rax, BUFFER + + testq $SIZE, LDA + jne .L50 +#endif + +#if GEMV_UNROLL >= 8 + + cmpq $8, N + jl .L20 + ALIGN_3 + +.L11: + subq $8, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 4), A2 + leaq (A, LDA, 8), A + +#ifdef HAVE_SSE3 + movddup (X), %xmm8 + addq INCX, X + movddup (X), %xmm9 + addq INCX, X + movddup (X), %xmm10 + addq INCX, X + movddup (X), %xmm11 + addq INCX, X + movddup (X), %xmm12 + addq INCX, X + movddup (X), %xmm13 + addq INCX, X + movddup (X), %xmm14 + addq INCX, X + movddup (X), %xmm15 + addq INCX, X + + movddup ALPHA, %xmm0 +#else + movsd (X), %xmm8 + unpcklpd %xmm8, %xmm8 + addq INCX, X + movsd (X), %xmm9 + unpcklpd %xmm9, %xmm9 + addq INCX, X + movsd (X), %xmm10 + unpcklpd %xmm10, %xmm10 + addq INCX, X + movsd (X), %xmm11 + unpcklpd %xmm11, %xmm11 + addq INCX, X + movsd (X), %xmm12 + unpcklpd %xmm12, %xmm12 + addq INCX, X + movsd (X), %xmm13 + unpcklpd %xmm13, %xmm13 + addq INCX, X + movsd (X), %xmm14 + unpcklpd %xmm14, %xmm14 + addq INCX, X + movsd (X), %xmm15 + unpcklpd %xmm15, %xmm15 + addq INCX, X + + movsd ALPHA, %xmm0 + unpcklpd %xmm0, %xmm0 +#endif + + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L1X + + movsd -16 * SIZE(A1), %xmm4 + movsd -16 * SIZE(A1, LDA), %xmm5 + movsd -16 * SIZE(A1, LDA, 2), %xmm6 + movsd -16 * SIZE(A1, LDA3), %xmm7 + + movsd -16 * SIZE(Y1), %xmm0 + + mulsd %xmm8, %xmm4 + addsd %xmm4, %xmm0 + movsd -16 * SIZE(A2), %xmm4 + mulsd %xmm9, %xmm5 + addsd %xmm5, %xmm0 + movsd -16 * SIZE(A2, LDA), %xmm5 + mulsd %xmm10, %xmm6 + addsd %xmm6, %xmm0 + movsd -16 * SIZE(A2, LDA, 2), %xmm6 + mulsd %xmm11, %xmm7 + addsd %xmm7, %xmm0 + movsd -16 * SIZE(A2, LDA3), %xmm7 + + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + mulsd %xmm13, %xmm5 + addsd %xmm5, %xmm0 + mulsd %xmm14, %xmm6 + addsd %xmm6, %xmm0 + mulsd %xmm15, %xmm7 + addsd %xmm7, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, Y1 + ALIGN_3 + +.L1X: +#endif + + movq MM, I + sarq $3, I + jle .L15 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-14 * SIZE, A1, %xmm5) + MOVUPS_A1(-12 * SIZE, A1, %xmm6) + MOVUPS_A1(-10 * SIZE, A1, %xmm7) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + + decq I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) + mulpd %xmm8, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5) + + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm6) + mulpd %xmm8, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA, 1) +#endif + + mulpd %xmm9, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm4) + mulpd %xmm9, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm5) + + mulpd %xmm9, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-12 * SIZE, A1, LDA, 2, %xmm6) + mulpd %xmm9, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-10 * SIZE, A1, LDA, 2, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA, 2) +#endif + + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm4) + mulpd %xmm10, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm5) + + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-12 * SIZE, A1, LDA3, 1, %xmm6) + mulpd %xmm10, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-10 * SIZE, A1, LDA3, 1, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA3) +#endif + + mulpd %xmm11, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + mulpd %xmm11, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A1(-14 * SIZE, A2, %xmm5) + + mulpd %xmm11, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1(-12 * SIZE, A2, %xmm6) + mulpd %xmm11, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A1(-10 * SIZE, A2, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) +#endif + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) + mulpd %xmm12, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) + + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6) + mulpd %xmm12, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA, 1) +#endif + + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm4) + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm5) + + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-12 * SIZE, A2, LDA, 2, %xmm6) + mulpd %xmm13, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-10 * SIZE, A2, LDA, 2, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA, 2) +#endif + + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm4) + mulpd %xmm14, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm5) + + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-12 * SIZE, A2, LDA3, 1, %xmm6) + mulpd %xmm14, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-10 * SIZE, A2, LDA3, 1, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA3) +#endif + + mulpd %xmm15, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1( -8 * SIZE, A1, %xmm4) + mulpd %xmm15, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A1( -6 * SIZE, A1, %xmm5) + + mulpd %xmm15, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1( -4 * SIZE, A1, %xmm6) + mulpd %xmm15, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A1( -2 * SIZE, A1, %xmm7) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) + mulpd %xmm8, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5) + + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm6) + mulpd %xmm8, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm7) + + mulpd %xmm9, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm4) + mulpd %xmm9, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm5) + + mulpd %xmm9, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-12 * SIZE, A1, LDA, 2, %xmm6) + mulpd %xmm9, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-10 * SIZE, A1, LDA, 2, %xmm7) + + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm4) + mulpd %xmm10, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm5) + + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-12 * SIZE, A1, LDA3, 1, %xmm6) + mulpd %xmm10, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-10 * SIZE, A1, LDA3, 1, %xmm7) + + mulpd %xmm11, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + mulpd %xmm11, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A1(-14 * SIZE, A2, %xmm5) + + mulpd %xmm11, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1(-12 * SIZE, A2, %xmm6) + mulpd %xmm11, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A1(-10 * SIZE, A2, %xmm7) + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) + mulpd %xmm12, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) + + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6) + mulpd %xmm12, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7) + + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm4) + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm5) + + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-12 * SIZE, A2, LDA, 2, %xmm6) + mulpd %xmm13, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-10 * SIZE, A2, LDA, 2, %xmm7) + + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm4) + mulpd %xmm14, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm5) + + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-12 * SIZE, A2, LDA3, 1, %xmm6) + mulpd %xmm14, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-10 * SIZE, A2, LDA3, 1, %xmm7) + + mulpd %xmm15, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + mulpd %xmm15, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + + mulpd %xmm15, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + mulpd %xmm15, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L15: + testq $4, MM + je .L16 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-14 * SIZE, A1, %xmm5) + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm6) + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm7) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm4) + mulpd %xmm8, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm5) + + mulpd %xmm9, %xmm6 + addpd %xmm6, %xmm0 + MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm6) + mulpd %xmm9, %xmm7 + addpd %xmm7, %xmm1 + MOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm7) + + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + mulpd %xmm10, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A1(-14 * SIZE, A2, %xmm5) + + mulpd %xmm11, %xmm6 + addpd %xmm6, %xmm0 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm6) + mulpd %xmm11, %xmm7 + addpd %xmm7, %xmm1 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm7) + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm4) + mulpd %xmm12, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm5) + + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm0 + MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm6) + mulpd %xmm13, %xmm7 + addpd %xmm7, %xmm1 + MOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm7) + + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm14, %xmm5 + addpd %xmm5, %xmm1 + + mulpd %xmm15, %xmm6 + addpd %xmm6, %xmm0 + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + mulpd %xmm15, %xmm7 + addpd %xmm7, %xmm1 + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L16: + testq $2, MM + je .L17 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm5) + MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm6) + MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm7) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + mulpd %xmm9, %xmm5 + addpd %xmm5, %xmm0 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm5) + + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm0 + MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm6) + mulpd %xmm11, %xmm7 + addpd %xmm7, %xmm0 + MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm7) + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm0 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm15, %xmm7 + addpd %xmm7, %xmm0 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L17: + testq $1, MM + je .L18 + + movsd -16 * SIZE(A1), %xmm4 + movsd -16 * SIZE(A1, LDA), %xmm5 + movsd -16 * SIZE(A1, LDA, 2), %xmm6 + movsd -16 * SIZE(A1, LDA3), %xmm7 + + movsd -16 * SIZE(Y1), %xmm0 + + mulsd %xmm8, %xmm4 + addsd %xmm4, %xmm0 + movsd -16 * SIZE(A2), %xmm4 + mulsd %xmm9, %xmm5 + addsd %xmm5, %xmm0 + movsd -16 * SIZE(A2, LDA), %xmm5 + mulsd %xmm10, %xmm6 + addsd %xmm6, %xmm0 + movsd -16 * SIZE(A2, LDA, 2), %xmm6 + mulsd %xmm11, %xmm7 + addsd %xmm7, %xmm0 + movsd -16 * SIZE(A2, LDA3), %xmm7 + + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + mulsd %xmm13, %xmm5 + addsd %xmm5, %xmm0 + mulsd %xmm14, %xmm6 + addsd %xmm6, %xmm0 + mulsd %xmm15, %xmm7 + addsd %xmm7, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +.L18: + cmpq $8, N + jge .L11 + ALIGN_3 + +.L20: +#endif + +#if GEMV_UNROLL >= 4 + + cmpq $4, N + jl .L30 + +#if GEMV_UNROLL == 4 + ALIGN_3 + +.L21: +#endif + + subq $4, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + +#ifdef HAVE_SSE3 + movddup (X), %xmm12 + addq INCX, X + movddup (X), %xmm13 + addq INCX, X + movddup (X), %xmm14 + addq INCX, X + movddup (X), %xmm15 + addq INCX, X + + movddup ALPHA, %xmm0 +#else + movsd (X), %xmm12 + unpcklpd %xmm12, %xmm12 + addq INCX, X + movsd (X), %xmm13 + unpcklpd %xmm13, %xmm13 + addq INCX, X + movsd (X), %xmm14 + unpcklpd %xmm14, %xmm14 + addq INCX, X + movsd (X), %xmm15 + unpcklpd %xmm15, %xmm15 + addq INCX, X + + movsd ALPHA, %xmm0 + unpcklpd %xmm0, %xmm0 +#endif + + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L2X + + movsd -16 * SIZE(A1), %xmm4 + movsd -16 * SIZE(A1, LDA), %xmm5 + movsd -16 * SIZE(A2), %xmm6 + movsd -16 * SIZE(A2, LDA), %xmm7 + + movsd -16 * SIZE(Y1), %xmm0 + + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + mulsd %xmm13, %xmm5 + addsd %xmm5, %xmm0 + mulsd %xmm14, %xmm6 + addsd %xmm6, %xmm0 + mulsd %xmm15, %xmm7 + addsd %xmm7, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, Y1 + ALIGN_3 + +.L2X: +#endif + + movq MM, I + sarq $3, I + jle .L25 + + MOVUPS_A1(-16 * SIZE, A1, %xmm0) + MOVUPS_A1(-14 * SIZE, A1, %xmm1) + MOVUPS_A1(-12 * SIZE, A1, %xmm2) + MOVUPS_A1(-10 * SIZE, A1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm10) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm11) + + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5) + MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm6) + MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm7) + + decq I + jle .L24 + ALIGN_3 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm0 + addpd %xmm0, %xmm8 + MOVUPS_A1(-16 * SIZE, A2, %xmm0) + mulpd %xmm12, %xmm1 + addpd %xmm1, %xmm9 + MOVUPS_A1(-14 * SIZE, A2, %xmm1) + + mulpd %xmm12, %xmm2 + addpd %xmm2, %xmm10 + MOVUPS_A1(-12 * SIZE, A2, %xmm2) + mulpd %xmm12, %xmm3 + addpd %xmm3, %xmm11 + MOVUPS_A1(-10 * SIZE, A2, %xmm3) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) +#endif + + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm8 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm9 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) + + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm10 + MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6) + mulpd %xmm13, %xmm7 + addpd %xmm7, %xmm11 + MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + mulpd %xmm14, %xmm0 + addpd %xmm0, %xmm8 + MOVUPS_A1( -8 * SIZE, A1, %xmm0) + mulpd %xmm14, %xmm1 + addpd %xmm1, %xmm9 + MOVUPS_A1( -6 * SIZE, A1, %xmm1) + + mulpd %xmm14, %xmm2 + addpd %xmm2, %xmm10 + MOVUPS_A1( -4 * SIZE, A1, %xmm2) + mulpd %xmm14, %xmm3 + addpd %xmm3, %xmm11 + MOVUPS_A1( -2 * SIZE, A1, %xmm3) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) +#endif + + mulpd %xmm15, %xmm4 + addpd %xmm4, %xmm8 + MOVUPS_A2( -8 * SIZE, A1, LDA, 1, %xmm4) + mulpd %xmm15, %xmm5 + addpd %xmm5, %xmm9 + MOVUPS_A2( -6 * SIZE, A1, LDA, 1, %xmm5) + + mulpd %xmm15, %xmm6 + addpd %xmm6, %xmm10 + MOVUPS_A2( -4 * SIZE, A1, LDA, 1, %xmm6) + mulpd %xmm15, %xmm7 + addpd %xmm7, %xmm11 + MOVUPS_A2( -2 * SIZE, A1, LDA, 1, %xmm7) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm8) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm9) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm10) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L23 + ALIGN_3 + +.L24: + mulpd %xmm12, %xmm0 + addpd %xmm0, %xmm8 + MOVUPS_A1(-16 * SIZE, A2, %xmm0) + mulpd %xmm12, %xmm1 + addpd %xmm1, %xmm9 + MOVUPS_A1(-14 * SIZE, A2, %xmm1) + + mulpd %xmm12, %xmm2 + addpd %xmm2, %xmm10 + MOVUPS_A1(-12 * SIZE, A2, %xmm2) + mulpd %xmm12, %xmm3 + addpd %xmm3, %xmm11 + MOVUPS_A1(-10 * SIZE, A2, %xmm3) + + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm8 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm9 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) + + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm10 + MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6) + mulpd %xmm13, %xmm7 + addpd %xmm7, %xmm11 + MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7) + + mulpd %xmm14, %xmm0 + addpd %xmm0, %xmm8 + mulpd %xmm14, %xmm1 + addpd %xmm1, %xmm9 + + mulpd %xmm14, %xmm2 + addpd %xmm2, %xmm10 + mulpd %xmm14, %xmm3 + addpd %xmm3, %xmm11 + + mulpd %xmm15, %xmm4 + addpd %xmm4, %xmm8 + MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + mulpd %xmm15, %xmm5 + addpd %xmm5, %xmm9 + MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + + mulpd %xmm15, %xmm6 + addpd %xmm6, %xmm10 + MOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + mulpd %xmm15, %xmm7 + addpd %xmm7, %xmm11 + MOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L25: + testq $4, MM + je .L26 + + MOVUPS_A1(-16 * SIZE, A1, %xmm0) + MOVUPS_A1(-14 * SIZE, A1, %xmm1) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + + mulpd %xmm12, %xmm0 + addpd %xmm0, %xmm8 + mulpd %xmm12, %xmm1 + addpd %xmm1, %xmm9 + + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5) + + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm8 + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm9 + + MOVUPS_A1(-16 * SIZE, A2, %xmm0) + MOVUPS_A1(-14 * SIZE, A2, %xmm1) + + mulpd %xmm14, %xmm0 + addpd %xmm0, %xmm8 + mulpd %xmm14, %xmm1 + addpd %xmm1, %xmm9 + + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) + + mulpd %xmm15, %xmm4 + addpd %xmm4, %xmm8 + mulpd %xmm15, %xmm5 + addpd %xmm5, %xmm9 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L26: + testq $2, MM + je .L27 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9) + MOVUPS_A1(-16 * SIZE, A2, %xmm10) + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm11) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm0 + mulpd %xmm14, %xmm10 + addpd %xmm10, %xmm0 + mulpd %xmm15, %xmm11 + addpd %xmm11, %xmm0 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L27: + testq $1, MM +#if GEMV_UNROLL == 4 + je .L28 +#else + je .L30 +#endif + + movsd -16 * SIZE(Y1), %xmm0 + + movsd -16 * SIZE(A1), %xmm8 + movsd -16 * SIZE(A1, LDA), %xmm9 + movsd -16 * SIZE(A2), %xmm10 + movsd -16 * SIZE(A2, LDA), %xmm11 + + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm0 + mulsd %xmm13, %xmm9 + addsd %xmm9, %xmm0 + mulsd %xmm14, %xmm10 + addsd %xmm10, %xmm0 + mulsd %xmm15, %xmm11 + addsd %xmm11, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +#if GEMV_UNROLL == 4 +.L28: + cmpq $4, N + jge .L21 + ALIGN_3 + +#endif + +.L30: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L40 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L31: +#endif + + subq $2, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA), A2 + leaq (A, LDA, 2), A + +#ifdef HAVE_SSE3 + movddup (X), %xmm12 + addq INCX, X + movddup (X), %xmm13 + addq INCX, X + + movddup ALPHA, %xmm0 +#else + movsd (X), %xmm12 + unpcklpd %xmm12, %xmm12 + addq INCX, X + movsd (X), %xmm13 + unpcklpd %xmm13, %xmm13 + addq INCX, X + + movsd ALPHA, %xmm0 + unpcklpd %xmm0, %xmm0 +#endif + + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L3X + + movsd -16 * SIZE(A1), %xmm4 + movsd -16 * SIZE(A2), %xmm5 + + movsd -16 * SIZE(Y1), %xmm0 + + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + mulsd %xmm13, %xmm5 + addsd %xmm5, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, Y1 + ALIGN_3 + +.L3X: +#endif + + movq MM, I + sarq $3, I + jle .L35 + + MOVUPS_A1(-16 * SIZE, A1, %xmm0) + MOVUPS_A1(-14 * SIZE, A1, %xmm1) + MOVUPS_A1(-12 * SIZE, A1, %xmm2) + MOVUPS_A1(-10 * SIZE, A1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm10) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm11) + + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + MOVUPS_A1(-14 * SIZE, A2, %xmm5) + MOVUPS_A1(-12 * SIZE, A2, %xmm6) + MOVUPS_A1(-10 * SIZE, A2, %xmm7) + + decq I + jle .L34 + ALIGN_3 + +.L33: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm0 + addpd %xmm0, %xmm8 + MOVUPS_A1( -8 * SIZE, A1, %xmm0) + mulpd %xmm12, %xmm1 + addpd %xmm1, %xmm9 + MOVUPS_A1( -6 * SIZE, A1, %xmm1) + + mulpd %xmm12, %xmm2 + addpd %xmm2, %xmm10 + MOVUPS_A1( -4 * SIZE, A1, %xmm2) + mulpd %xmm12, %xmm3 + addpd %xmm3, %xmm11 + MOVUPS_A1( -2 * SIZE, A1, %xmm3) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) +#endif + + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm8 + MOVUPS_A1( -8 * SIZE, A2, %xmm4) + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm9 + MOVUPS_A1( -6 * SIZE, A2, %xmm5) + + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm10 + MOVUPS_A1( -4 * SIZE, A2, %xmm6) + mulpd %xmm13, %xmm7 + addpd %xmm7, %xmm11 + MOVUPS_A1( -2 * SIZE, A2, %xmm7) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm8) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm9) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm10) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L33 + ALIGN_3 + +.L34: + mulpd %xmm12, %xmm0 + addpd %xmm0, %xmm8 + mulpd %xmm12, %xmm1 + addpd %xmm1, %xmm9 + mulpd %xmm12, %xmm2 + addpd %xmm2, %xmm10 + mulpd %xmm12, %xmm3 + addpd %xmm3, %xmm11 + + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm8 + MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm9 + MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm10 + MOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + mulpd %xmm13, %xmm7 + addpd %xmm7, %xmm11 + MOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L35: + testq $4, MM + je .L36 + + + MOVUPS_A1(-16 * SIZE, A1, %xmm0) + MOVUPS_A1(-14 * SIZE, A1, %xmm1) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + + mulpd %xmm12, %xmm0 + addpd %xmm0, %xmm8 + mulpd %xmm12, %xmm1 + addpd %xmm1, %xmm9 + + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + MOVUPS_A1(-14 * SIZE, A2, %xmm5) + + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm8 + MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm9 + MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L36: + testq $2, MM + je .L37 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A1(-16 * SIZE, A2, %xmm9) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm0 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L37: + testq $1, MM +#if GEMV_UNROLL == 2 + je .L38 +#else + je .L40 +#endif + + movsd -16 * SIZE(Y1), %xmm0 + + movsd -16 * SIZE(A1), %xmm8 + movsd -16 * SIZE(A2), %xmm9 + + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm0 + mulsd %xmm13, %xmm9 + addsd %xmm9, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +#if GEMV_UNROLL == 2 +.L38: + cmpq $2, N + jge .L31 + ALIGN_3 + +#endif + +.L40: + cmpq $1, N + jl .L900 +#endif + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + +#ifdef HAVE_SSE3 + movddup (X), %xmm12 + addq INCX, X + + movddup ALPHA, %xmm0 +#else + movsd (X), %xmm12 + unpcklpd %xmm12, %xmm12 + addq INCX, X + + movsd ALPHA, %xmm0 + unpcklpd %xmm0, %xmm0 +#endif + + mulpd %xmm0, %xmm12 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L4X + + movsd -16 * SIZE(A1), %xmm4 + movsd -16 * SIZE(Y1), %xmm0 + + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + + addq $SIZE, A1 + addq $SIZE, Y1 + ALIGN_3 + +.L4X: +#endif + + movq MM, I + sarq $3, I + jle .L45 + + MOVUPS_A1(-16 * SIZE, A1, %xmm0) + MOVUPS_A1(-14 * SIZE, A1, %xmm1) + MOVUPS_A1(-12 * SIZE, A1, %xmm2) + MOVUPS_A1(-10 * SIZE, A1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm10) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm11) + + decq I + jle .L44 + ALIGN_3 + +.L43: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm0 + addpd %xmm0, %xmm8 + MOVUPS_A1( -8 * SIZE, A1, %xmm0) + mulpd %xmm12, %xmm1 + addpd %xmm1, %xmm9 + MOVUPS_A1( -6 * SIZE, A1, %xmm1) + + mulpd %xmm12, %xmm2 + addpd %xmm2, %xmm10 + MOVUPS_A1( -4 * SIZE, A1, %xmm2) + mulpd %xmm12, %xmm3 + addpd %xmm3, %xmm11 + MOVUPS_A1( -2 * SIZE, A1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 8 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm8) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm9) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm10) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L43 + ALIGN_3 + +.L44: + mulpd %xmm12, %xmm0 + addpd %xmm0, %xmm8 + MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + mulpd %xmm12, %xmm1 + addpd %xmm1, %xmm9 + MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + mulpd %xmm12, %xmm2 + addpd %xmm2, %xmm10 + MOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + mulpd %xmm12, %xmm3 + addpd %xmm3, %xmm11 + MOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L45: + testq $4, MM + je .L46 + + MOVUPS_A1(-16 * SIZE, A1, %xmm0) + MOVUPS_A1(-14 * SIZE, A1, %xmm1) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + + mulpd %xmm12, %xmm0 + addpd %xmm0, %xmm8 + MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + mulpd %xmm12, %xmm1 + addpd %xmm1, %xmm9 + MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L46: + testq $2, MM + je .L47 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L47: + testq $1, MM + je .L900 + + movsd -16 * SIZE(Y1), %xmm0 + movsd -16 * SIZE(A1), %xmm8 + + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +#ifdef ALIGNED_ACCESS + jmp .L900 + ALIGN_3 + +.L50: +#if GEMV_UNROLL >= 4 + + cmpq $4, N + jl .L60 + ALIGN_3 + +.L51: + + subq $4, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + +#ifdef HAVE_SSE3 + movddup (X), %xmm12 + addq INCX, X + movddup (X), %xmm13 + addq INCX, X + movddup (X), %xmm14 + addq INCX, X + movddup (X), %xmm15 + addq INCX, X + + movddup ALPHA, %xmm0 +#else + movsd (X), %xmm12 + unpcklpd %xmm12, %xmm12 + addq INCX, X + movsd (X), %xmm13 + unpcklpd %xmm13, %xmm13 + addq INCX, X + movsd (X), %xmm14 + unpcklpd %xmm14, %xmm14 + addq INCX, X + movsd (X), %xmm15 + unpcklpd %xmm15, %xmm15 + addq INCX, X + + movsd ALPHA, %xmm0 + unpcklpd %xmm0, %xmm0 +#endif + + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 + + testq $SIZE, A + je .L5X + + movsd -16 * SIZE(A1), %xmm4 + movsd -16 * SIZE(A1, LDA), %xmm5 + movsd -16 * SIZE(A2), %xmm6 + movsd -16 * SIZE(A2, LDA), %xmm7 + + movsd -16 * SIZE(Y1), %xmm0 + + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + mulsd %xmm13, %xmm5 + addsd %xmm5, %xmm0 + mulsd %xmm14, %xmm6 + addsd %xmm6, %xmm0 + mulsd %xmm15, %xmm7 + addsd %xmm7, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, Y1 + ALIGN_3 + +.L5X: + movhpd -16 * SIZE(A1, LDA), %xmm8 + movhpd -16 * SIZE(A2, LDA), %xmm9 + + movq MM, I + sarq $3, I + jle .L55 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-14 * SIZE, A1, %xmm5) + MOVUPS_A1(-12 * SIZE, A1, %xmm6) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + + decq I + jle .L54 + ALIGN_3 + +.L53: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-10 * SIZE, A1, %xmm7) + mulpd %xmm12, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm4) + + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm5) + mulpd %xmm12, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm6) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET + 8(A1, LDA) +#endif + + shufpd $1, %xmm4, %xmm8 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8) + shufpd $1, %xmm5, %xmm4 + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm1 + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + + shufpd $1, %xmm6, %xmm5 + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm2 + MOVUPS_A1(-14 * SIZE, A2, %xmm5) + shufpd $1, %xmm8, %xmm6 + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm3 + MOVUPS_A1(-12 * SIZE, A2, %xmm6) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-10 * SIZE, A2, %xmm7) + mulpd %xmm14, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm4) + + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm5) + mulpd %xmm14, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm6) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET + 8(A2, LDA) +#endif + + shufpd $1, %xmm4, %xmm9 + mulpd %xmm15, %xmm9 + addpd %xmm9, %xmm0 + MOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9) + shufpd $1, %xmm5, %xmm4 + mulpd %xmm15, %xmm4 + addpd %xmm4, %xmm1 + MOVUPS_A1( -8 * SIZE, A1, %xmm4) + + shufpd $1, %xmm6, %xmm5 + mulpd %xmm15, %xmm5 + addpd %xmm5, %xmm2 + MOVUPS_A1( -6 * SIZE, A1, %xmm5) + shufpd $1, %xmm9, %xmm6 + mulpd %xmm15, %xmm6 + addpd %xmm6, %xmm3 + MOVUPS_A1( -4 * SIZE, A1, %xmm6) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L53 + ALIGN_3 + +.L54: + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-10 * SIZE, A1, %xmm7) + mulpd %xmm12, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm4) + + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm5) + mulpd %xmm12, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm6) + + shufpd $1, %xmm4, %xmm8 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8) + shufpd $1, %xmm5, %xmm4 + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm1 + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + + shufpd $1, %xmm6, %xmm5 + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm2 + MOVUPS_A1(-14 * SIZE, A2, %xmm5) + shufpd $1, %xmm8, %xmm6 + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm3 + MOVUPS_A1(-12 * SIZE, A2, %xmm6) + + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-10 * SIZE, A2, %xmm7) + mulpd %xmm14, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm4) + + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm5) + mulpd %xmm14, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm6) + + shufpd $1, %xmm4, %xmm9 + mulpd %xmm15, %xmm9 + addpd %xmm9, %xmm0 + MOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9) + + shufpd $1, %xmm5, %xmm4 + mulpd %xmm15, %xmm4 + addpd %xmm4, %xmm1 + shufpd $1, %xmm6, %xmm5 + mulpd %xmm15, %xmm5 + addpd %xmm5, %xmm2 + shufpd $1, %xmm9, %xmm6 + mulpd %xmm15, %xmm6 + addpd %xmm6, %xmm3 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L55: + testq $4, MM + je .L56 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-14 * SIZE, A1, %xmm5) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm12, %xmm5 + addpd %xmm5, %xmm1 + + MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm6) + MOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm7) + + shufpd $1, %xmm6, %xmm8 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movaps %xmm7, %xmm8 + shufpd $1, %xmm7, %xmm6 + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm1 + + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + MOVUPS_A1(-14 * SIZE, A2, %xmm5) + + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm14, %xmm5 + addpd %xmm5, %xmm1 + + MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm6) + MOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm7) + + shufpd $1, %xmm6, %xmm9 + mulpd %xmm15, %xmm9 + addpd %xmm9, %xmm0 + movaps %xmm7, %xmm9 + shufpd $1, %xmm7, %xmm6 + mulpd %xmm15, %xmm6 + addpd %xmm6, %xmm1 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L56: + testq $2, MM + je .L57 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5) + MOVUPS_A1(-16 * SIZE, A2, %xmm6) + MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + shufpd $1, %xmm5, %xmm8 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movaps %xmm5, %xmm8 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm0 + shufpd $1, %xmm7, %xmm9 + mulpd %xmm15, %xmm9 + addpd %xmm9, %xmm0 + movaps %xmm7, %xmm9 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L57: + testq $1, MM + je .L58 + + movsd -16 * SIZE(Y1), %xmm0 + + movsd -16 * SIZE(A1), %xmm4 + shufpd $1, %xmm8, %xmm8 + movsd -16 * SIZE(A2), %xmm6 + shufpd $1, %xmm9, %xmm9 + + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + mulsd %xmm13, %xmm8 + addsd %xmm8, %xmm0 + mulsd %xmm14, %xmm6 + addsd %xmm6, %xmm0 + mulsd %xmm15, %xmm9 + addsd %xmm9, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +.L58: + cmpq $4, N + jge .L51 + ALIGN_3 + +.L60: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L70 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L61: +#endif + + subq $2, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA), A2 + leaq (A, LDA, 2), A + +#ifdef HAVE_SSE3 + movddup (X), %xmm12 + addq INCX, X + movddup (X), %xmm13 + addq INCX, X + + movddup ALPHA, %xmm0 +#else + movsd (X), %xmm12 + unpcklpd %xmm12, %xmm12 + addq INCX, X + movsd (X), %xmm13 + unpcklpd %xmm13, %xmm13 + addq INCX, X + + movsd ALPHA, %xmm0 + unpcklpd %xmm0, %xmm0 +#endif + + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + + testq $SIZE, A + je .L6X + + movsd -16 * SIZE(A1), %xmm4 + movsd -16 * SIZE(A2), %xmm5 + + movsd -16 * SIZE(Y1), %xmm0 + + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + mulsd %xmm13, %xmm5 + addsd %xmm5, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, Y1 + ALIGN_3 + +.L6X: + movhpd -16 * SIZE(A2), %xmm8 + + movq MM, I + sarq $3, I + jle .L65 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-14 * SIZE, A1, %xmm5) + MOVUPS_A1(-12 * SIZE, A1, %xmm6) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + + decq I + jle .L64 + ALIGN_3 + +.L63: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-10 * SIZE, A1, %xmm7) + mulpd %xmm12, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A1(-15 * SIZE, A2, %xmm4) + + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1(-13 * SIZE, A2, %xmm5) + mulpd %xmm12, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A1(-11 * SIZE, A2, %xmm6) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET + 8(A2) +#endif + + shufpd $1, %xmm4, %xmm8 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1( -9 * SIZE, A2, %xmm8) + shufpd $1, %xmm5, %xmm4 + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm1 + MOVUPS_A1( -8 * SIZE, A1, %xmm4) + + shufpd $1, %xmm6, %xmm5 + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm2 + MOVUPS_A1( -6 * SIZE, A1, %xmm5) + shufpd $1, %xmm8, %xmm6 + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm3 + MOVUPS_A1( -4 * SIZE, A1, %xmm6) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L63 + ALIGN_3 + +.L64: + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-10 * SIZE, A1, %xmm7) + mulpd %xmm12, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A1(-15 * SIZE, A2, %xmm4) + + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1(-13 * SIZE, A2, %xmm5) + mulpd %xmm12, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A1(-11 * SIZE, A2, %xmm6) + + shufpd $1, %xmm4, %xmm8 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1( -9 * SIZE, A2, %xmm8) + shufpd $1, %xmm5, %xmm4 + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm1 + + shufpd $1, %xmm6, %xmm5 + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm2 + shufpd $1, %xmm8, %xmm6 + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm3 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L65: + testq $4, MM + je .L66 + + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-14 * SIZE, A1, %xmm5) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm12, %xmm5 + addpd %xmm5, %xmm1 + + MOVUPS_A1(-15 * SIZE, A2, %xmm6) + MOVUPS_A1(-13 * SIZE, A2, %xmm7) + + shufpd $1, %xmm6, %xmm8 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movaps %xmm7, %xmm8 + shufpd $1, %xmm7, %xmm6 + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm1 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L66: + testq $2, MM + je .L67 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-15 * SIZE, A2, %xmm5) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + shufpd $1, %xmm5, %xmm8 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movaps %xmm5, %xmm8 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L67: + testq $1, MM +#if GEMV_UNROLL == 2 + je .L68 +#else + je .L70 +#endif + + movsd -16 * SIZE(Y1), %xmm0 + + movsd -16 * SIZE(A1), %xmm4 + shufpd $1, %xmm8, %xmm8 + + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + mulsd %xmm13, %xmm8 + addsd %xmm8, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +#if GEMV_UNROLL == 2 +.L68: + cmpq $2, N + jge .L61 + ALIGN_3 + +#endif + +.L70: + cmpq $1, N + jl .L900 + +#endif + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + +#ifdef HAVE_SSE3 + movddup (X), %xmm12 + addq INCX, X + + movddup ALPHA, %xmm0 +#else + movsd (X), %xmm12 + unpcklpd %xmm12, %xmm12 + addq INCX, X + + movsd ALPHA, %xmm0 + unpcklpd %xmm0, %xmm0 +#endif + + mulpd %xmm0, %xmm12 + + testq $SIZE, A + je .L7X + + movsd -16 * SIZE(A1), %xmm4 + movsd -16 * SIZE(Y1), %xmm0 + + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + + addq $SIZE, A1 + addq $SIZE, Y1 + ALIGN_3 + +.L7X: + + movq MM, I + sarq $3, I + jle .L75 + + MOVUPS_A1(-16 * SIZE, A1, %xmm0) + MOVUPS_A1(-14 * SIZE, A1, %xmm1) + MOVUPS_A1(-12 * SIZE, A1, %xmm2) + MOVUPS_A1(-10 * SIZE, A1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm10) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm11) + + decq I + jle .L74 + ALIGN_3 + +.L73: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm0 + addpd %xmm0, %xmm8 + MOVUPS_A1( -8 * SIZE, A1, %xmm0) + mulpd %xmm12, %xmm1 + addpd %xmm1, %xmm9 + MOVUPS_A1( -6 * SIZE, A1, %xmm1) + + mulpd %xmm12, %xmm2 + addpd %xmm2, %xmm10 + MOVUPS_A1( -4 * SIZE, A1, %xmm2) + mulpd %xmm12, %xmm3 + addpd %xmm3, %xmm11 + MOVUPS_A1( -2 * SIZE, A1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 8 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm8) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm9) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm10) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L73 + ALIGN_3 + +.L74: + mulpd %xmm12, %xmm0 + addpd %xmm0, %xmm8 + MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + mulpd %xmm12, %xmm1 + addpd %xmm1, %xmm9 + MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + mulpd %xmm12, %xmm2 + addpd %xmm2, %xmm10 + MOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + mulpd %xmm12, %xmm3 + addpd %xmm3, %xmm11 + MOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L75: + testq $4, MM + je .L76 + + MOVUPS_A1(-16 * SIZE, A1, %xmm0) + MOVUPS_A1(-14 * SIZE, A1, %xmm1) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + + mulpd %xmm12, %xmm0 + addpd %xmm0, %xmm8 + MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + mulpd %xmm12, %xmm1 + addpd %xmm1, %xmm9 + MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L76: + testq $2, MM + je .L77 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L77: + testq $1, MM + je .L900 + + movsd -16 * SIZE(Y1), %xmm0 + movsd -16 * SIZE(A1), %xmm8 + + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) +#endif + ALIGN_3 + + +.L900: +#ifndef COPY_FORCE + cmpq Y, BUFFER + je .L999 +#endif + + cmpq $SIZE, INCY + jne .L950 + + testq $SIZE, Y + je .L910 + + movsd (Y), %xmm0 + addsd (BUFFER), %xmm0 + movsd %xmm0, (Y) + + addq $SIZE, Y + addq $SIZE, BUFFER + + decq M + jle .L999 + ALIGN_4 + +.L910: + testq $SIZE, BUFFER + jne .L920 + + movq M, %rax + sarq $3, %rax + jle .L914 + ALIGN_3 + +.L912: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y) +#endif + + movapd 0 * SIZE(Y), %xmm0 + movapd 2 * SIZE(Y), %xmm1 + movapd 4 * SIZE(Y), %xmm2 + movapd 6 * SIZE(Y), %xmm3 + + movapd 0 * SIZE(BUFFER), %xmm4 + movapd 2 * SIZE(BUFFER), %xmm5 + movapd 4 * SIZE(BUFFER), %xmm6 + movapd 6 * SIZE(BUFFER), %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 + PREOFFSET(BUFFER) +#endif + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + addpd %xmm6, %xmm2 + addpd %xmm7, %xmm3 + + movapd %xmm0, 0 * SIZE(Y) + movapd %xmm1, 2 * SIZE(Y) + movapd %xmm2, 4 * SIZE(Y) + movapd %xmm3, 6 * SIZE(Y) + + addq $8 * SIZE, Y + addq $8 * SIZE, BUFFER + + decq %rax + jg .L912 + ALIGN_3 + +.L914: + testq $7, M + jle .L999 + + testq $4, M + jle .L915 + + movapd 0 * SIZE(Y), %xmm0 + movapd 2 * SIZE(Y), %xmm1 + + movapd 0 * SIZE(BUFFER), %xmm4 + movapd 2 * SIZE(BUFFER), %xmm5 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + + movapd %xmm0, 0 * SIZE(Y) + movapd %xmm1, 2 * SIZE(Y) + + addq $4 * SIZE, Y + addq $4 * SIZE, BUFFER + ALIGN_3 + +.L915: + testq $2, M + jle .L916 + + movapd (Y), %xmm0 + + movapd (BUFFER), %xmm4 + + addpd %xmm4, %xmm0 + + movapd %xmm0, (Y) + + addq $2 * SIZE, Y + addq $2 * SIZE, BUFFER + ALIGN_3 + +.L916: + testq $1, M + jle .L999 + + movsd (Y), %xmm0 + + movsd 0 * SIZE(BUFFER), %xmm4 + + addsd %xmm4, %xmm0 + + movlpd %xmm0, (Y) + ALIGN_3 + + jmp .L999 + ALIGN_4 + +.L920: + movapd -1 * SIZE(BUFFER), %xmm4 + + movq M, %rax + sarq $3, %rax + jle .L924 + ALIGN_3 + +.L922: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y) +#endif + + movapd 0 * SIZE(Y), %xmm0 + movapd 2 * SIZE(Y), %xmm1 + movapd 4 * SIZE(Y), %xmm2 + movapd 6 * SIZE(Y), %xmm3 + + movapd 1 * SIZE(BUFFER), %xmm5 + movapd 3 * SIZE(BUFFER), %xmm6 + movapd 5 * SIZE(BUFFER), %xmm7 + movapd 7 * SIZE(BUFFER), %xmm8 + + shufpd $1, %xmm5, %xmm4 + shufpd $1, %xmm6, %xmm5 + shufpd $1, %xmm7, %xmm6 + shufpd $1, %xmm8, %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 + PREOFFSET(BUFFER) +#endif + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + addpd %xmm6, %xmm2 + addpd %xmm7, %xmm3 + + movapd %xmm0, 0 * SIZE(Y) + movapd %xmm1, 2 * SIZE(Y) + movapd %xmm2, 4 * SIZE(Y) + movapd %xmm3, 6 * SIZE(Y) + + movapd %xmm8, %xmm4 + + addq $8 * SIZE, Y + addq $8 * SIZE, BUFFER + + decq %rax + jg .L922 + ALIGN_3 + +.L924: + testq $7, M + jle .L999 + + testq $4, M + jle .L925 + + movapd 0 * SIZE(Y), %xmm0 + movapd 2 * SIZE(Y), %xmm1 + + movapd 1 * SIZE(BUFFER), %xmm5 + movapd 3 * SIZE(BUFFER), %xmm6 + + shufpd $1, %xmm5, %xmm4 + shufpd $1, %xmm6, %xmm5 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + + movapd %xmm0, 0 * SIZE(Y) + movapd %xmm1, 2 * SIZE(Y) + + movapd %xmm6, %xmm4 + + addq $4 * SIZE, Y + addq $4 * SIZE, BUFFER + ALIGN_3 + +.L925: + testq $2, M + jle .L926 + + movapd (Y), %xmm0 + + movapd 1 * SIZE(BUFFER), %xmm5 + + shufpd $1, %xmm5, %xmm4 + + addpd %xmm4, %xmm0 + + movapd %xmm0, (Y) + + movaps %xmm5, %xmm4 + + addq $2 * SIZE, Y + addq $2 * SIZE, BUFFER + ALIGN_3 + +.L926: + testq $1, M + jle .L999 + + movsd (Y), %xmm0 + + shufpd $1, %xmm4, %xmm4 + + addsd %xmm4, %xmm0 + + movlpd %xmm0, (Y) + ALIGN_3 + + jmp .L999 + ALIGN_4 + +.L950: + testq $SIZE, BUFFER + je .L960 + + movsd (Y), %xmm0 + addsd (BUFFER), %xmm0 + movsd %xmm0, (Y) + + addq INCY, Y + addq $SIZE, BUFFER + + decq M + jle .L999 + ALIGN_4 + +.L960: + movq Y, Y1 + + movq M, %rax + sarq $3, %rax + jle .L964 + ALIGN_3 + +.L962: + movsd (Y), %xmm0 + addq INCY, Y + movhpd (Y), %xmm0 + addq INCY, Y + + movapd 0 * SIZE(BUFFER), %xmm4 + + movsd (Y), %xmm1 + addq INCY, Y + movhpd (Y), %xmm1 + addq INCY, Y + + movapd 2 * SIZE(BUFFER), %xmm5 + + movsd (Y), %xmm2 + addq INCY, Y + movhpd (Y), %xmm2 + addq INCY, Y + + movapd 4 * SIZE(BUFFER), %xmm6 + + addpd %xmm4, %xmm0 + + movsd (Y), %xmm3 + addq INCY, Y + movhpd (Y), %xmm3 + addq INCY, Y + + movapd 6 * SIZE(BUFFER), %xmm7 + + addpd %xmm5, %xmm1 + + movlpd %xmm0, (Y1) + addq INCY, Y1 + movhpd %xmm0, (Y1) + addq INCY, Y1 + + addpd %xmm6, %xmm2 + + movlpd %xmm1, (Y1) + addq INCY, Y1 + movhpd %xmm1, (Y1) + addq INCY, Y1 + + addpd %xmm7, %xmm3 + + movlpd %xmm2, (Y1) + addq INCY, Y1 + movhpd %xmm2, (Y1) + addq INCY, Y1 + movlpd %xmm3, (Y1) + addq INCY, Y1 + movhpd %xmm3, (Y1) + addq INCY, Y1 + + addq $8 * SIZE, BUFFER + decq %rax + jg .L962 + ALIGN_3 + +.L964: + testq $7, M + jle .L999 + + testq $4, M + jle .L965 + + movsd (Y), %xmm0 + addq INCY, Y + movhpd (Y), %xmm0 + addq INCY, Y + + movapd 0 * SIZE(BUFFER), %xmm4 + + movsd (Y), %xmm1 + addq INCY, Y + movhpd (Y), %xmm1 + addq INCY, Y + + movapd 2 * SIZE(BUFFER), %xmm5 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + + movlpd %xmm0, (Y1) + addq INCY, Y1 + movhpd %xmm0, (Y1) + addq INCY, Y1 + movlpd %xmm1, (Y1) + addq INCY, Y1 + movhpd %xmm1, (Y1) + addq INCY, Y1 + + addq $4 * SIZE, BUFFER + ALIGN_3 + +.L965: + testq $2, M + jle .L966 + + movsd (Y), %xmm0 + addq INCY, Y + movhpd (Y), %xmm0 + addq INCY, Y + + movapd 0 * SIZE(BUFFER), %xmm4 + + addpd %xmm4, %xmm0 + + movlpd %xmm0, (Y1) + addq INCY, Y1 + movhpd %xmm0, (Y1) + addq INCY, Y1 + + addq $2 * SIZE, BUFFER + ALIGN_3 + +.L966: + testq $1, M + jle .L999 + + movsd (Y), %xmm0 + + movsd 0 * SIZE(BUFFER), %xmm4 + + addsd %xmm4, %xmm0 + + movlpd %xmm0, (Y1) + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + + ret + EPILOGUE diff --git a/kernel/x86_64/dgemv_n_atom.S b/kernel/x86_64/dgemv_n_atom.S new file mode 100644 index 0000000..27a763a --- /dev/null +++ b/kernel/x86_64/dgemv_n_atom.S @@ -0,0 +1,788 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#define PREFETCH prefetchnta +#define PREFETCHW prefetcht0 +#define PREFETCH_SIZE (8 * 6) + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_INCX 8 + STACKSIZE(%rsp) +#define OLD_Y 16 + STACKSIZE(%rsp) +#define OLD_INCY 24 + STACKSIZE(%rsp) +#define OLD_BUFFER 32 + STACKSIZE(%rsp) +#define STACK_ALPHA 48 (%rsp) + +#define M %rdi +#define N %rsi +#define A %rcx +#define LDA %r8 +#define X %r9 +#define INCX %rdx +#define Y %rbp +#define INCY %r10 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_LDA 48 + STACKSIZE(%rsp) +#define OLD_X 56 + STACKSIZE(%rsp) +#define OLD_INCX 64 + STACKSIZE(%rsp) +#define OLD_Y 72 + STACKSIZE(%rsp) +#define OLD_INCY 80 + STACKSIZE(%rsp) +#define OLD_BUFFER 88 + STACKSIZE(%rsp) + +#define STACK_ALPHA 224 (%rsp) + +#define M %rcx +#define N %rdx +#define A %r8 +#define LDA %r9 +#define X %rdi +#define INCX %rsi +#define Y %rbp +#define INCY %r10 + +#endif + +#define I %rax +#define J %r11 +#define A1 %r12 +#define A2 %r13 +#define Y1 %r14 +#define BUFFER %r15 +#define MM %rbx + +#define ALPHA %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X +#endif + movq OLD_INCX, INCX + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + +#ifndef WINDOWS_ABI + movsd %xmm0, STACK_ALPHA +#else + movsd %xmm3, STACK_ALPHA +#endif + + leaq (,INCX, SIZE), INCX + leaq (,INCY, SIZE), INCY + leaq (,LDA, SIZE), LDA + + testq N, N + jle .L999 + testq M, M + jle .L999 + + cmpq $SIZE, INCY + cmoveq Y, BUFFER + je .L10 + + movq BUFFER, Y1 + xorps %xmm4, %xmm4 + + movq M, %rax + addq $7, %rax + sarq $3, %rax + ALIGN_3 + +.L01: + movsd %xmm4, 0 * SIZE(Y1) + movsd %xmm4, 1 * SIZE(Y1) + movsd %xmm4, 2 * SIZE(Y1) + movsd %xmm4, 3 * SIZE(Y1) + movsd %xmm4, 4 * SIZE(Y1) + movsd %xmm4, 5 * SIZE(Y1) + movsd %xmm4, 6 * SIZE(Y1) + movsd %xmm4, 7 * SIZE(Y1) + + addq $8 * SIZE, Y1 + decq %rax + jg .L01 + ALIGN_3 + +.L10: + movq N, J + sarq $1, J + jle .L20 + ALIGN_3 + +.L11: + movq BUFFER, Y1 + + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + movsd STACK_ALPHA, %xmm0 + + movsd (X), %xmm14 + addq INCX, X + movsd (X), %xmm15 + addq INCX, X + + mulsd %xmm0, %xmm14 + mulsd %xmm0, %xmm15 + + movq M, I + sarq $3, I + jle .L15 + + movsd 0 * SIZE(A1), %xmm0 + movsd 1 * SIZE(A1), %xmm1 + movsd 2 * SIZE(A1), %xmm2 + movsd 3 * SIZE(A1), %xmm3 + + movsd 0 * SIZE(A2), %xmm4 + movsd 1 * SIZE(A2), %xmm5 + movsd 2 * SIZE(A2), %xmm6 + movsd 3 * SIZE(A2), %xmm7 + + movsd 0 * SIZE(Y1), %xmm8 + mulsd %xmm14, %xmm0 + movsd 1 * SIZE(Y1), %xmm9 + mulsd %xmm14, %xmm1 + movsd 2 * SIZE(Y1), %xmm10 + mulsd %xmm14, %xmm2 + movsd 3 * SIZE(Y1), %xmm11 + mulsd %xmm14, %xmm3 + + decq I + jle .L14 + ALIGN_3 + +.L13: + PREFETCH PREFETCH_SIZE * SIZE(A1) + mulsd %xmm15, %xmm4 + PREFETCH PREFETCH_SIZE * SIZE(A2) + addsd %xmm0, %xmm8 + movsd 4 * SIZE(A1), %xmm0 + + mulsd %xmm15, %xmm5 + addsd %xmm1, %xmm9 + movsd 5 * SIZE(A1), %xmm1 + + mulsd %xmm15, %xmm6 + addsd %xmm2, %xmm10 + movsd 6 * SIZE(A1), %xmm2 + + mulsd %xmm15, %xmm7 + addsd %xmm3, %xmm11 + movsd 7 * SIZE(A1), %xmm3 + + addsd %xmm4, %xmm8 + mulsd %xmm14, %xmm0 + movsd 4 * SIZE(A2), %xmm4 + + addsd %xmm5, %xmm9 + mulsd %xmm14, %xmm1 + movsd 5 * SIZE(A2), %xmm5 + + addsd %xmm6, %xmm10 + mulsd %xmm14, %xmm2 + movsd 6 * SIZE(A2), %xmm6 + + addsd %xmm7, %xmm11 + mulsd %xmm14, %xmm3 + movsd 7 * SIZE(A2), %xmm7 + + movsd %xmm8, 0 * SIZE(Y1) + movsd 4 * SIZE(Y1), %xmm8 + movsd %xmm9, 1 * SIZE(Y1) + movsd 5 * SIZE(Y1), %xmm9 + + movsd %xmm10, 2 * SIZE(Y1) + movsd 6 * SIZE(Y1), %xmm10 + movsd %xmm11, 3 * SIZE(Y1) + movsd 7 * SIZE(Y1), %xmm11 + + mulsd %xmm15, %xmm4 + addsd %xmm0, %xmm8 + movsd 8 * SIZE(A1), %xmm0 + + mulsd %xmm15, %xmm5 + addsd %xmm1, %xmm9 + movsd 9 * SIZE(A1), %xmm1 + + mulsd %xmm15, %xmm6 + addsd %xmm2, %xmm10 + movsd 10 * SIZE(A1), %xmm2 + + mulsd %xmm15, %xmm7 + addq $8 * SIZE, A2 + addsd %xmm3, %xmm11 + movsd 11 * SIZE(A1), %xmm3 + + mulsd %xmm14, %xmm0 + addsd %xmm4, %xmm8 + movsd 0 * SIZE(A2), %xmm4 + + mulsd %xmm14, %xmm1 + addq $8 * SIZE, Y1 + addsd %xmm5, %xmm9 + movsd 1 * SIZE(A2), %xmm5 + + mulsd %xmm14, %xmm2 + addq $8 * SIZE, A1 + addsd %xmm6, %xmm10 + movsd 2 * SIZE(A2), %xmm6 + + mulsd %xmm14, %xmm3 + decq I + addsd %xmm7, %xmm11 + movsd 3 * SIZE(A2), %xmm7 + + movsd %xmm8, -4 * SIZE(Y1) + movsd 0 * SIZE(Y1), %xmm8 + movsd %xmm9, -3 * SIZE(Y1) + movsd 1 * SIZE(Y1), %xmm9 + + movsd %xmm10,-2 * SIZE(Y1) + movsd 2 * SIZE(Y1), %xmm10 + movsd %xmm11,-1 * SIZE(Y1) + movsd 3 * SIZE(Y1), %xmm11 + jg .L13 + ALIGN_3 + +.L14: + mulsd %xmm15, %xmm4 + addsd %xmm0, %xmm8 + movsd 4 * SIZE(A1), %xmm0 + + mulsd %xmm15, %xmm5 + addsd %xmm1, %xmm9 + movsd 5 * SIZE(A1), %xmm1 + + mulsd %xmm15, %xmm6 + addsd %xmm2, %xmm10 + movsd 6 * SIZE(A1), %xmm2 + + mulsd %xmm15, %xmm7 + addsd %xmm3, %xmm11 + movsd 7 * SIZE(A1), %xmm3 + + addsd %xmm4, %xmm8 + mulsd %xmm14, %xmm0 + movsd 4 * SIZE(A2), %xmm4 + + addsd %xmm5, %xmm9 + mulsd %xmm14, %xmm1 + movsd 5 * SIZE(A2), %xmm5 + + addsd %xmm6, %xmm10 + mulsd %xmm14, %xmm2 + movsd 6 * SIZE(A2), %xmm6 + + addsd %xmm7, %xmm11 + mulsd %xmm14, %xmm3 + movsd 7 * SIZE(A2), %xmm7 + + movsd %xmm8, 0 * SIZE(Y1) + movsd 4 * SIZE(Y1), %xmm8 + movsd %xmm9, 1 * SIZE(Y1) + movsd 5 * SIZE(Y1), %xmm9 + + movsd %xmm10, 2 * SIZE(Y1) + movsd 6 * SIZE(Y1), %xmm10 + movsd %xmm11, 3 * SIZE(Y1) + movsd 7 * SIZE(Y1), %xmm11 + + mulsd %xmm15, %xmm4 + addsd %xmm0, %xmm8 + + mulsd %xmm15, %xmm5 + addsd %xmm1, %xmm9 + + mulsd %xmm15, %xmm6 + addsd %xmm2, %xmm10 + + mulsd %xmm15, %xmm7 + addq $8 * SIZE, A2 + addsd %xmm3, %xmm11 + + mulsd %xmm14, %xmm0 + addsd %xmm4, %xmm8 + + mulsd %xmm14, %xmm1 + addq $8 * SIZE, Y1 + addsd %xmm5, %xmm9 + + mulsd %xmm14, %xmm2 + addq $8 * SIZE, A1 + addsd %xmm6, %xmm10 + + mulsd %xmm14, %xmm3 + addsd %xmm7, %xmm11 + + movsd %xmm8, -4 * SIZE(Y1) + movsd %xmm9, -3 * SIZE(Y1) + movsd %xmm10,-2 * SIZE(Y1) + movsd %xmm11,-1 * SIZE(Y1) + ALIGN_3 + +.L15: + testq $4, M + je .L17 + + movsd 0 * SIZE(A1), %xmm0 + movsd 1 * SIZE(A1), %xmm1 + movsd 2 * SIZE(A1), %xmm2 + movsd 3 * SIZE(A1), %xmm3 + + movsd 0 * SIZE(A2), %xmm4 + movsd 1 * SIZE(A2), %xmm5 + movsd 2 * SIZE(A2), %xmm6 + movsd 3 * SIZE(A2), %xmm7 + + movsd 0 * SIZE(Y1), %xmm8 + mulsd %xmm14, %xmm0 + movsd 1 * SIZE(Y1), %xmm9 + mulsd %xmm14, %xmm1 + movsd 2 * SIZE(Y1), %xmm10 + mulsd %xmm14, %xmm2 + movsd 3 * SIZE(Y1), %xmm11 + mulsd %xmm14, %xmm3 + + mulsd %xmm15, %xmm4 + addsd %xmm0, %xmm8 + mulsd %xmm15, %xmm5 + addsd %xmm1, %xmm9 + mulsd %xmm15, %xmm6 + addsd %xmm2, %xmm10 + mulsd %xmm15, %xmm7 + addsd %xmm3, %xmm11 + + addsd %xmm4, %xmm8 + addsd %xmm5, %xmm9 + addsd %xmm6, %xmm10 + addsd %xmm7, %xmm11 + + movsd %xmm8, 0 * SIZE(Y1) + movsd %xmm9, 1 * SIZE(Y1) + movsd %xmm10, 2 * SIZE(Y1) + movsd %xmm11, 3 * SIZE(Y1) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L17: + testq $2, M + je .L18 + + movsd 0 * SIZE(A1), %xmm0 + movsd 1 * SIZE(A1), %xmm1 + + movsd 0 * SIZE(A2), %xmm4 + movsd 1 * SIZE(A2), %xmm5 + + mulsd %xmm14, %xmm0 + movsd 0 * SIZE(Y1), %xmm8 + mulsd %xmm14, %xmm1 + movsd 1 * SIZE(Y1), %xmm9 + mulsd %xmm15, %xmm4 + mulsd %xmm15, %xmm5 + + addsd %xmm0, %xmm8 + addsd %xmm1, %xmm9 + addsd %xmm4, %xmm8 + addsd %xmm5, %xmm9 + + movsd %xmm8, 0 * SIZE(Y1) + movsd %xmm9, 1 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L18: + testq $1, M + je .L19 + + movsd 0 * SIZE(Y1), %xmm8 + + movsd 0 * SIZE(A1), %xmm0 + movsd 0 * SIZE(A2), %xmm4 + + mulsd %xmm14, %xmm0 + mulsd %xmm15, %xmm4 + + addsd %xmm0, %xmm8 + addsd %xmm4, %xmm8 + + movsd %xmm8, 0 * SIZE(Y1) + ALIGN_3 + +.L19: + decq J + jg .L11 + ALIGN_3 + +.L20: + testq $1, N + je .L990 + + movq BUFFER, Y1 + movq A, A1 + + movsd (X), %xmm14 + mulsd STACK_ALPHA, %xmm14 + + movq M, I + sarq $3, I + jle .L25 + + movsd 0 * SIZE(A1), %xmm0 + movsd 1 * SIZE(A1), %xmm1 + movsd 2 * SIZE(A1), %xmm2 + movsd 3 * SIZE(A1), %xmm3 + + movsd 4 * SIZE(A1), %xmm4 + movsd 5 * SIZE(A1), %xmm5 + movsd 6 * SIZE(A1), %xmm6 + movsd 7 * SIZE(A1), %xmm7 + + movsd 0 * SIZE(Y1), %xmm8 + mulsd %xmm14, %xmm0 + movsd 1 * SIZE(Y1), %xmm9 + mulsd %xmm14, %xmm1 + movsd 2 * SIZE(Y1), %xmm10 + mulsd %xmm14, %xmm2 + movsd 3 * SIZE(Y1), %xmm11 + mulsd %xmm14, %xmm3 + + decq I + jle .L24 + ALIGN_3 + +.L23: + PREFETCH PREFETCH_SIZE * SIZE(A1) + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(A1), %xmm0 + addsd %xmm1, %xmm9 + movsd 9 * SIZE(A1), %xmm1 + addsd %xmm2, %xmm10 + movsd 10 * SIZE(A1), %xmm2 + addsd %xmm3, %xmm11 + movsd 11 * SIZE(A1), %xmm3 + + movsd %xmm8, 0 * SIZE(Y1) + movsd 4 * SIZE(Y1), %xmm8 + mulsd %xmm14, %xmm4 + movsd %xmm9, 1 * SIZE(Y1) + movsd 5 * SIZE(Y1), %xmm9 + mulsd %xmm14, %xmm5 + + movsd %xmm10, 2 * SIZE(Y1) + movsd 6 * SIZE(Y1), %xmm10 + mulsd %xmm14, %xmm6 + + movsd %xmm11, 3 * SIZE(Y1) + movsd 7 * SIZE(Y1), %xmm11 + mulsd %xmm14, %xmm7 + + addsd %xmm4, %xmm8 + movsd 12 * SIZE(A1), %xmm4 + addsd %xmm5, %xmm9 + movsd 13 * SIZE(A1), %xmm5 + addsd %xmm6, %xmm10 + movsd 14 * SIZE(A1), %xmm6 + addsd %xmm7, %xmm11 + movsd 15 * SIZE(A1), %xmm7 + + movsd %xmm8, 4 * SIZE(Y1) + movsd 8 * SIZE(Y1), %xmm8 + mulsd %xmm14, %xmm0 + movsd %xmm9, 5 * SIZE(Y1) + movsd 9 * SIZE(Y1), %xmm9 + mulsd %xmm14, %xmm1 + movsd %xmm10, 6 * SIZE(Y1) + movsd 10 * SIZE(Y1), %xmm10 + mulsd %xmm14, %xmm2 + movsd %xmm11, 7 * SIZE(Y1) + movsd 11 * SIZE(Y1), %xmm11 + mulsd %xmm14, %xmm3 + + addq $8 * SIZE, Y1 + addq $8 * SIZE, A1 + + decq I + jg .L23 + ALIGN_3 + +.L24: + addsd %xmm0, %xmm8 + addsd %xmm1, %xmm9 + addsd %xmm2, %xmm10 + addsd %xmm3, %xmm11 + + mulsd %xmm14, %xmm4 + movsd %xmm8, 0 * SIZE(Y1) + movsd 4 * SIZE(Y1), %xmm8 + + mulsd %xmm14, %xmm5 + movsd %xmm9, 1 * SIZE(Y1) + movsd 5 * SIZE(Y1), %xmm9 + + mulsd %xmm14, %xmm6 + movsd %xmm10, 2 * SIZE(Y1) + movsd 6 * SIZE(Y1), %xmm10 + + mulsd %xmm14, %xmm7 + movsd %xmm11, 3 * SIZE(Y1) + movsd 7 * SIZE(Y1), %xmm11 + + addsd %xmm4, %xmm8 + addsd %xmm5, %xmm9 + addsd %xmm6, %xmm10 + addsd %xmm7, %xmm11 + + movsd %xmm8, 4 * SIZE(Y1) + movsd %xmm9, 5 * SIZE(Y1) + movsd %xmm10, 6 * SIZE(Y1) + movsd %xmm11, 7 * SIZE(Y1) + + addq $8 * SIZE, Y1 + addq $8 * SIZE, A1 + ALIGN_3 + +.L25: + testq $4, M + je .L27 + + movsd 0 * SIZE(A1), %xmm0 + movsd 1 * SIZE(A1), %xmm1 + movsd 2 * SIZE(A1), %xmm2 + movsd 3 * SIZE(A1), %xmm3 + + movsd 0 * SIZE(Y1), %xmm8 + mulsd %xmm14, %xmm0 + movsd 1 * SIZE(Y1), %xmm9 + mulsd %xmm14, %xmm1 + movsd 2 * SIZE(Y1), %xmm10 + mulsd %xmm14, %xmm2 + movsd 3 * SIZE(Y1), %xmm11 + mulsd %xmm14, %xmm3 + + addsd %xmm0, %xmm8 + addsd %xmm1, %xmm9 + addsd %xmm2, %xmm10 + addsd %xmm3, %xmm11 + + movsd %xmm8, 0 * SIZE(Y1) + movsd %xmm9, 1 * SIZE(Y1) + movsd %xmm10, 2 * SIZE(Y1) + movsd %xmm11, 3 * SIZE(Y1) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L27: + testq $2, M + je .L28 + + movsd 0 * SIZE(A1), %xmm0 + movsd 1 * SIZE(A1), %xmm1 + + mulsd %xmm14, %xmm0 + movsd 0 * SIZE(Y1), %xmm8 + mulsd %xmm14, %xmm1 + movsd 1 * SIZE(Y1), %xmm9 + + addsd %xmm0, %xmm8 + addsd %xmm1, %xmm9 + + movsd %xmm8, 0 * SIZE(Y1) + movsd %xmm9, 1 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L28: + testq $1, M + je .L990 + + movsd 0 * SIZE(Y1), %xmm8 + movsd 0 * SIZE(A1), %xmm0 + + mulsd %xmm14, %xmm0 + addsd %xmm0, %xmm8 + + movsd %xmm8, 0 * SIZE(Y1) + ALIGN_3 + +.L990: + cmpq $SIZE, INCY + je .L999 + + movq Y, Y1 + + movq M, %rax + sarq $2, %rax + jle .L994 + ALIGN_3 + +.L992: + movsd (Y), %xmm0 + addq INCY, Y + movsd (Y), %xmm1 + addq INCY, Y + movsd (Y), %xmm2 + addq INCY, Y + movsd (Y), %xmm3 + addq INCY, Y + + addsd 0 * SIZE(BUFFER), %xmm0 + addsd 1 * SIZE(BUFFER), %xmm1 + addsd 2 * SIZE(BUFFER), %xmm2 + addsd 3 * SIZE(BUFFER), %xmm3 + addq $4 * SIZE, BUFFER + + movsd %xmm0, (Y1) + addq INCY, Y1 + movsd %xmm1, (Y1) + addq INCY, Y1 + movsd %xmm2, (Y1) + addq INCY, Y1 + movsd %xmm3, (Y1) + addq INCY, Y1 + + decq %rax + jg .L992 + ALIGN_3 + +.L994: + testq $2, M + jle .L996 + + movsd (Y), %xmm0 + addq INCY, Y + movsd (Y), %xmm1 + addq INCY, Y + + addsd 0 * SIZE(BUFFER), %xmm0 + addsd 1 * SIZE(BUFFER), %xmm1 + addq $2 * SIZE, BUFFER + + movsd %xmm0, (Y1) + addq INCY, Y1 + movsd %xmm1, (Y1) + addq INCY, Y1 + ALIGN_3 + +.L996: + testq $1, M + jle .L999 + + movsd (Y), %xmm0 + + addsd (BUFFER), %xmm0 + + movsd %xmm0, (Y1) + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + EPILOGUE diff --git a/kernel/x86_64/dgemv_t.S b/kernel/x86_64/dgemv_t.S new file mode 100644 index 0000000..0719207 --- /dev/null +++ b/kernel/x86_64/dgemv_t.S @@ -0,0 +1,2490 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#if GEMV_UNROLL < 2 +#undef GEMV_UNROLL +#define GEMV_UNROLL 2 +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_A %rcx +#define OLD_LDA %r8 +#define STACK_INCX 8 + STACKSIZE(%rsp) +#define STACK_Y 16 + STACKSIZE(%rsp) +#define STACK_INCY 24 + STACKSIZE(%rsp) +#define STACK_BUFFER 32 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_M %rcx +#define OLD_N %rdx +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_LDA 48 + STACKSIZE(%rsp) +#define OLD_X 56 + STACKSIZE(%rsp) +#define STACK_INCX 64 + STACKSIZE(%rsp) +#define STACK_Y 72 + STACKSIZE(%rsp) +#define STACK_INCY 80 + STACKSIZE(%rsp) +#define STACK_BUFFER 88 + STACKSIZE(%rsp) + +#endif + +#define LDA %r8 +#define X %r9 + +#define INCX %rsi +#define INCY %rdi + +#define M %r10 +#define N %r11 +#define A %r12 +#define Y %r14 +#define BUFFER %r13 + +#define I %rax +#define A1 %rbx +#define A2 %rcx +#define LDA3 %rdx +#define Y1 %rbp +#define X1 %r15 + +#ifdef ALIGNED_ACCESS +#define MM INCX +#else +#define MM M +#endif + +#define ALPHA %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_M, M + movq OLD_N, N + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X +#else + movq OLD_M, M + movq OLD_N, N + movq OLD_A, A + movq OLD_LDA, LDA +#endif + + movq STACK_INCX, INCX + movq STACK_Y, Y + movq STACK_INCY, INCY + movq STACK_BUFFER, BUFFER + + leaq -1(INCX), %rax + + leaq (,LDA, SIZE), LDA + leaq (,INCX, SIZE), INCX + leaq (,INCY, SIZE), INCY + + leaq (LDA, LDA, 2), LDA3 + + subq $-16 * SIZE, A + +#ifdef HAVE_SSE3 +#ifndef WINDOWS_ABI + movddup %xmm0, ALPHA +#else + movddup %xmm3, ALPHA +#endif +#else +#ifndef WINDOWS_ABI + movapd %xmm0, ALPHA +#else + movapd %xmm3, ALPHA +#endif + unpcklpd ALPHA, ALPHA +#endif + + testq M, M + jle .L999 + testq N, N + jle .L999 + + movq BUFFER, X1 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L01 + + movsd (X), %xmm0 + addq INCX, X + + movsd %xmm0, 1 * SIZE(BUFFER) + addq $1 * SIZE, BUFFER + addq $2 * SIZE, X1 + decq M + jle .L10 + ALIGN_4 + +.L01: +#endif + + movq M, I + sarq $3, I + jle .L05 + ALIGN_4 + +.L02: + movsd (X), %xmm0 + addq INCX, X + movhpd (X), %xmm0 + addq INCX, X + + movsd (X), %xmm1 + addq INCX, X + movhpd (X), %xmm1 + addq INCX, X + + movsd (X), %xmm2 + addq INCX, X + movhpd (X), %xmm2 + addq INCX, X + + movsd (X), %xmm3 + addq INCX, X + movhpd (X), %xmm3 + addq INCX, X + + movapd %xmm0, 0 * SIZE(X1) + movapd %xmm1, 2 * SIZE(X1) + movapd %xmm2, 4 * SIZE(X1) + movapd %xmm3, 6 * SIZE(X1) + + addq $8 * SIZE, X1 + decq I + jg .L02 + ALIGN_4 + +.L05: + movq M, I + andq $7, I + jle .L10 + ALIGN_2 + +.L06: + movsd (X), %xmm0 + addq INCX, X + movsd %xmm0, 0 * SIZE(X1) + addq $SIZE, X1 + decq I + jg .L06 + ALIGN_4 + +.L10: + movq Y, Y1 + +#ifdef ALIGNED_ACCESS + testq $SIZE, LDA + jne .L50 +#endif + +#if GEMV_UNROLL >= 8 + cmpq $8, N + jl .L20 + ALIGN_3 + +.L11: + subq $8, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 4), A2 + leaq (A1, LDA, 8), A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifdef PREFETCHW + PREFETCHW 7 * SIZE(Y1) +#endif + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L1X + + movsd -16 * SIZE(X1), %xmm12 + + movsd -16 * SIZE(A1), %xmm8 + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm0 + movsd -16 * SIZE(A1, LDA), %xmm9 + mulsd %xmm12, %xmm9 + addsd %xmm9, %xmm1 + movsd -16 * SIZE(A1, LDA, 2), %xmm10 + mulsd %xmm12, %xmm10 + addsd %xmm10, %xmm2 + movsd -16 * SIZE(A1, LDA3), %xmm11 + mulsd %xmm12, %xmm11 + addsd %xmm11, %xmm3 + movsd -16 * SIZE(A2), %xmm8 + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm4 + movsd -16 * SIZE(A2, LDA), %xmm9 + mulsd %xmm12, %xmm9 + addsd %xmm9, %xmm5 + movsd -16 * SIZE(A2, LDA, 2), %xmm10 + mulsd %xmm12, %xmm10 + addsd %xmm10, %xmm6 + movsd -16 * SIZE(A2, LDA3), %xmm11 + mulsd %xmm12, %xmm11 + addsd %xmm11, %xmm7 + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, X1 + ALIGN_3 + +.L1X: +#endif + + movq M, I + sarq $3, I + jle .L15 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9) + MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm10) + MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm11) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + decq I + jle .L13 + ALIGN_4 + +.L12: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm10) + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) +#endif + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-14 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm5 + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm10) + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm7 + MOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA, 2) +#endif + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-14 * SIZE, A2, %xmm8) + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm9) + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm10) + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + MOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA3) +#endif + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-12 * SIZE, A1, %xmm8) + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm5 + MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-12 * SIZE, A1, LDA, 2, %xmm10) + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm7 + MOVUPS_A2(-12 * SIZE, A1, LDA3, 1, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) +#endif + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-12 * SIZE, A2, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-12 * SIZE, A2, LDA, 2, %xmm10) + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + MOVUPS_A2(-12 * SIZE, A2, LDA3, 1, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) +#endif + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-10 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm5 + MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-10 * SIZE, A1, LDA, 2, %xmm10) + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-8 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm7 + MOVUPS_A2(-10 * SIZE, A1, LDA3, 1, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA, 2) +#endif + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-10 * SIZE, A2, %xmm8) + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm9) + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-10 * SIZE, A2, LDA, 2, %xmm10) + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + MOVUPS_A2(-10 * SIZE, A2, LDA3, 1, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA3) +#endif + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-8 * SIZE, A1, %xmm8) + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm5 + MOVUPS_A2(-8 * SIZE, A1, LDA, 1, %xmm9) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(X1) +#endif + + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-8 * SIZE, A1, LDA, 2, %xmm10) + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-6 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm7 + MOVUPS_A2(-8 * SIZE, A1, LDA3, 1, %xmm11) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + + decq I + jg .L12 + ALIGN_4 + +.L13: + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm10) + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm11) + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-14 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm5 + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm10) + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm7 + MOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm11) + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-14 * SIZE, A2, %xmm8) + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm9) + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm10) + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + MOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm11) + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-12 * SIZE, A1, %xmm8) + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm5 + MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-12 * SIZE, A1, LDA, 2, %xmm10) + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm7 + MOVUPS_A2(-12 * SIZE, A1, LDA3, 1, %xmm11) + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-12 * SIZE, A2, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-12 * SIZE, A2, LDA, 2, %xmm10) + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + MOVUPS_A2(-12 * SIZE, A2, LDA3, 1, %xmm11) + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-10 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm5 + MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-10 * SIZE, A1, LDA, 2, %xmm10) + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm7 + MOVUPS_A2(-10 * SIZE, A1, LDA3, 1, %xmm11) + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-10 * SIZE, A2, %xmm8) + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm9) + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-10 * SIZE, A2, LDA, 2, %xmm10) + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + MOVUPS_A2(-10 * SIZE, A2, LDA3, 1, %xmm11) + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm5 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm7 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L15: + testq $4, M + jle .L16 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9) + MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm10) + MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm11) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm10) + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm11) + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-14 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm5 + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm10) + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm7 + MOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm11) + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-14 * SIZE, A2, %xmm8) + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm9) + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm10) + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + MOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm11) + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm5 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm7 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L16: + testq $2, M + jle .L17 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9) + MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm10) + MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm11) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm10) + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm11) + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm5 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm7 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L17: + testq $1, M + je .L18 + + movsd -16 * SIZE(X1), %xmm12 + + movsd -16 * SIZE(A1), %xmm8 + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm0 + movsd -16 * SIZE(A1, LDA), %xmm9 + mulsd %xmm12, %xmm9 + addsd %xmm9, %xmm1 + movsd -16 * SIZE(A1, LDA, 2), %xmm10 + mulsd %xmm12, %xmm10 + addsd %xmm10, %xmm2 + movsd -16 * SIZE(A1, LDA3), %xmm11 + mulsd %xmm12, %xmm11 + addsd %xmm11, %xmm3 + movsd -16 * SIZE(A2), %xmm8 + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm4 + movsd -16 * SIZE(A2, LDA), %xmm9 + mulsd %xmm12, %xmm9 + addsd %xmm9, %xmm5 + movsd -16 * SIZE(A2, LDA, 2), %xmm10 + mulsd %xmm12, %xmm10 + addsd %xmm10, %xmm6 + movsd -16 * SIZE(A2, LDA3), %xmm11 + mulsd %xmm12, %xmm11 + addsd %xmm11, %xmm7 + ALIGN_4 + +.L18: +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 + haddpd %xmm3, %xmm2 + haddpd %xmm5, %xmm4 + haddpd %xmm7, %xmm6 +#else + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm9 + + movapd %xmm4, %xmm10 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm10 + + movapd %xmm6, %xmm11 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm11 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 + addpd %xmm10, %xmm4 + addpd %xmm11, %xmm6 +#endif + + mulpd ALPHA, %xmm0 + mulpd ALPHA, %xmm2 + mulpd ALPHA, %xmm4 + mulpd ALPHA, %xmm6 + + cmpq $SIZE, INCY + jne .L19 + + movsd 0 * SIZE(Y), %xmm8 + movhpd 1 * SIZE(Y), %xmm8 + movsd 2 * SIZE(Y), %xmm9 + movhpd 3 * SIZE(Y), %xmm9 + movsd 4 * SIZE(Y), %xmm10 + movhpd 5 * SIZE(Y), %xmm10 + movsd 6 * SIZE(Y), %xmm11 + movhpd 7 * SIZE(Y), %xmm11 + addq $8 * SIZE, Y + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 + addpd %xmm10, %xmm4 + addpd %xmm11, %xmm6 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + movlpd %xmm2, 2 * SIZE(Y1) + movhpd %xmm2, 3 * SIZE(Y1) + movlpd %xmm4, 4 * SIZE(Y1) + movhpd %xmm4, 5 * SIZE(Y1) + movlpd %xmm6, 6 * SIZE(Y1) + movhpd %xmm6, 7 * SIZE(Y1) + addq $8 * SIZE, Y1 + + cmpq $8, N + jge .L11 + jmp .L20 + ALIGN_4 + +.L19: + movsd (Y), %xmm8 + addq INCY, Y + movhpd (Y), %xmm8 + addq INCY, Y + movsd (Y), %xmm9 + addq INCY, Y + movhpd (Y), %xmm9 + addq INCY, Y + movsd (Y), %xmm10 + addq INCY, Y + movhpd (Y), %xmm10 + addq INCY, Y + movsd (Y), %xmm11 + addq INCY, Y + movhpd (Y), %xmm11 + addq INCY, Y + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 + addpd %xmm10, %xmm4 + addpd %xmm11, %xmm6 + + movlpd %xmm0, (Y1) + addq INCY, Y1 + movhpd %xmm0, (Y1) + addq INCY, Y1 + movlpd %xmm2, (Y1) + addq INCY, Y1 + movhpd %xmm2, (Y1) + addq INCY, Y1 + movlpd %xmm4, (Y1) + addq INCY, Y1 + movhpd %xmm4, (Y1) + addq INCY, Y1 + movlpd %xmm6, (Y1) + addq INCY, Y1 + movhpd %xmm6, (Y1) + addq INCY, Y1 + + cmpq $8, N + jge .L11 + ALIGN_4 + +.L20: +#endif + +#if GEMV_UNROLL >= 4 + + cmpq $4, N + jl .L30 + +#if GEMV_UNROLL == 4 + ALIGN_3 + +.L21: +#endif + + subq $4, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#if (GEMV_UNROLL == 4 ) && defined(PREFETCHW) + PREFETCHW 3 * SIZE(Y1) +#endif + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L2X + + movsd -16 * SIZE(X1), %xmm12 + + movsd -16 * SIZE(A1), %xmm8 + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm0 + movsd -16 * SIZE(A1, LDA), %xmm9 + mulsd %xmm12, %xmm9 + addsd %xmm9, %xmm1 + movsd -16 * SIZE(A2), %xmm10 + mulsd %xmm12, %xmm10 + addsd %xmm10, %xmm2 + movsd -16 * SIZE(A2, LDA), %xmm11 + mulsd %xmm12, %xmm11 + addsd %xmm11, %xmm3 + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, X1 + ALIGN_3 + +.L2X: +#endif + + movq M, I + sarq $3, I + jle .L25 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9) + MOVUPS_A1(-16 * SIZE, A2, %xmm10) + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm11) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + decq I + jle .L23 + ALIGN_4 + +.L22: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-14 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A1(-14 * SIZE, A2, %xmm10) + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm3 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) +#endif + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm8) + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A1(-12 * SIZE, A2, %xmm10) + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm3 + MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-10 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A1(-10 * SIZE, A2, %xmm10) + mulpd %xmm12, %xmm11 + MOVUPS_XL1( -8 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm3 + MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) +#endif + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1( -8 * SIZE, A1, %xmm8) + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2( -8 * SIZE, A1, LDA, 1, %xmm9) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) +#endif + + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A1( -8 * SIZE, A2, %xmm10) + mulpd %xmm13, %xmm11 + MOVUPS_XL1( -6 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm3 + MOVUPS_A2( -8 * SIZE, A2, LDA, 1, %xmm11) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + + decq I + jg .L22 + ALIGN_4 + +.L23: + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-14 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A1(-14 * SIZE, A2, %xmm10) + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm3 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm11) + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm8) + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A1(-12 * SIZE, A2, %xmm10) + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm3 + MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm11) + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-10 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A1(-10 * SIZE, A2, %xmm10) + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm11) + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L25: + testq $4, M + jle .L26 + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A1(-16 * SIZE, A2, %xmm10) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm11) + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + MOVUPS_A1(-14 * SIZE, A1, %xmm8) + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A1(-14 * SIZE, A2, %xmm10) + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm11) + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L26: + testq $2, M + jle .L27 + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A1(-16 * SIZE, A2, %xmm10) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm11) + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L27: + testq $1, M + je .L28 + + movsd -16 * SIZE(X1), %xmm12 + + movsd -16 * SIZE(A1), %xmm8 + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm0 + movsd -16 * SIZE(A1, LDA), %xmm9 + mulsd %xmm12, %xmm9 + addsd %xmm9, %xmm1 + movsd -16 * SIZE(A2), %xmm10 + mulsd %xmm12, %xmm10 + addsd %xmm10, %xmm2 + movsd -16 * SIZE(A2, LDA), %xmm11 + mulsd %xmm12, %xmm11 + addsd %xmm11, %xmm3 + ALIGN_4 + +.L28: +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 + haddpd %xmm3, %xmm2 +#else + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm9 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 +#endif + + mulpd ALPHA, %xmm0 + mulpd ALPHA, %xmm2 + + cmpq $SIZE, INCY + jne .L29 + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + movsd 2 * SIZE(Y), %xmm5 + movhpd 3 * SIZE(Y), %xmm5 + addq $4 * SIZE, Y + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm2 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + movlpd %xmm2, 2 * SIZE(Y1) + movhpd %xmm2, 3 * SIZE(Y1) + addq $4 * SIZE, Y1 + +#if GEMV_UNROLL == 4 + cmpq $4, N + jge .L21 +#endif + jmp .L30 + ALIGN_4 + +.L29: + movsd (Y), %xmm4 + addq INCY, Y + movhpd (Y), %xmm4 + addq INCY, Y + movsd (Y), %xmm5 + addq INCY, Y + movhpd (Y), %xmm5 + addq INCY, Y + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm2 + + movlpd %xmm0, (Y1) + addq INCY, Y1 + movhpd %xmm0, (Y1) + addq INCY, Y1 + movlpd %xmm2, (Y1) + addq INCY, Y1 + movhpd %xmm2, (Y1) + addq INCY, Y1 + +#if GEMV_UNROLL == 4 + cmpq $4, N + jge .L21 +#endif + ALIGN_4 + +.L30: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L40 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L31: +#endif + + subq $2, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#if (GEMV_UNROLL == 2 ) && defined(PREFETCHW) + PREFETCHW 2 * SIZE(Y1) +#endif + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L3X + + movsd -16 * SIZE(X1), %xmm12 + + movsd -16 * SIZE(A1), %xmm8 + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm0 + movsd -16 * SIZE(A2), %xmm9 + mulsd %xmm12, %xmm9 + addsd %xmm9, %xmm1 + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, X1 + ALIGN_3 + +.L3X: +#endif + + movq M, I + sarq $3, I + jle .L35 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A1(-16 * SIZE, A2, %xmm9) + MOVUPS_A1(-14 * SIZE, A1, %xmm10) + MOVUPS_A1(-14 * SIZE, A2, %xmm11) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + decq I + jle .L33 + ALIGN_4 + +.L32: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm9, %xmm1 + MOVUPS_A1(-12 * SIZE, A2, %xmm9) + + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A1(-10 * SIZE, A1, %xmm10) + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm3 + MOVUPS_A1(-10 * SIZE, A2, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) +#endif + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1( -8 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + MOVUPS_XL1( -8 * SIZE, X1, %xmm12) + addpd %xmm9, %xmm1 + MOVUPS_A1( -8 * SIZE, A2, %xmm9) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) +#endif + + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A1( -6 * SIZE, A1, %xmm10) + mulpd %xmm13, %xmm11 + MOVUPS_XL1( -6 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm3 + MOVUPS_A1( -6 * SIZE, A2, %xmm11) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + + decq I + jg .L32 + ALIGN_4 + +.L33: + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm9, %xmm1 + MOVUPS_A1(-12 * SIZE, A2, %xmm9) + + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A1(-10 * SIZE, A1, %xmm10) + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm3 + MOVUPS_A1(-10 * SIZE, A2, %xmm11) + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L35: + testq $4, M + jle .L36 + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm9) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + MOVUPS_A1(-14 * SIZE, A1, %xmm10) + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A1(-14 * SIZE, A2, %xmm11) + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L36: + testq $2, M + jle .L37 + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm9) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L37: + testq $1, M + je .L38 + + movsd -16 * SIZE(X1), %xmm12 + + movsd -16 * SIZE(A1), %xmm8 + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm0 + movsd -16 * SIZE(A2), %xmm9 + mulsd %xmm12, %xmm9 + addsd %xmm9, %xmm1 + ALIGN_4 + +.L38: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 +#else + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + addpd %xmm8, %xmm0 +#endif + + mulpd ALPHA, %xmm0 + + movsd (Y), %xmm4 + addq INCY, Y + movhpd (Y), %xmm4 + addq INCY, Y + + addpd %xmm4, %xmm0 + + movlpd %xmm0, (Y1) + addq INCY, Y1 + movhpd %xmm0, (Y1) + addq INCY, Y1 + +#if GEMV_UNROLL == 2 + cmpq $2, N + jge .L31 +#endif + ALIGN_4 + +.L40: + cmpq $1, N + jl .L999 + +#endif + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L4X + + movsd -16 * SIZE(X1), %xmm12 + movsd -16 * SIZE(A1), %xmm8 + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm0 + + addq $SIZE, A1 + addq $SIZE, X1 + ALIGN_3 + +.L4X: +#endif + + movq M, I + sarq $3, I + jle .L45 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A1(-14 * SIZE, A1, %xmm9) + MOVUPS_A1(-12 * SIZE, A1, %xmm10) + MOVUPS_A1(-10 * SIZE, A1, %xmm11) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + decq I + jle .L43 + ALIGN_4 + +.L42: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm8 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm8, %xmm0 + MOVUPS_A1( -8 * SIZE, A1, %xmm8) + mulpd %xmm13, %xmm9 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm9, %xmm2 + MOVUPS_A1( -6 * SIZE, A1, %xmm9) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(X1) +#endif + + mulpd %xmm12, %xmm10 + MOVUPS_XL1( -8 * SIZE, X1, %xmm12) + addpd %xmm10, %xmm0 + MOVUPS_A1( -4 * SIZE, A1, %xmm10) + mulpd %xmm13, %xmm11 + MOVUPS_XL1( -6 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm2 + MOVUPS_A1( -2 * SIZE, A1, %xmm11) + + addq $8 * SIZE, A1 + addq $8 * SIZE, X1 + + decq I + jg .L42 + ALIGN_4 + +.L43: + mulpd %xmm12, %xmm8 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm8, %xmm0 + mulpd %xmm13, %xmm9 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm9, %xmm2 + + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm0 + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm2 + + addq $8 * SIZE, A1 + addq $8 * SIZE, X1 + ALIGN_4 + +.L45: + testq $4, M + jle .L46 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A1(-14 * SIZE, A1, %xmm9) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm2 + + addq $4 * SIZE, A1 + addq $4 * SIZE, X1 + ALIGN_4 + +.L46: + testq $2, M + jle .L47 + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_4 + +.L47: + testq $1, M + je .L48 + + movsd -16 * SIZE(X1), %xmm12 + + movsd -16 * SIZE(A1), %xmm8 + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm0 + ALIGN_4 + +.L48: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + addpd %xmm1, %xmm0 + +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 +#else + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + addsd %xmm8, %xmm0 +#endif + + mulsd ALPHA, %xmm0 + + movsd (Y), %xmm4 + addq INCY, Y + + addsd %xmm4, %xmm0 + + movlpd %xmm0, (Y1) + addq INCY, Y1 + +#ifdef ALIGNED_ACCESS + jmp .L999 + ALIGN_4 + +.L50: +#if GEMV_UNROLL >= 4 + + cmpq $4, N + jl .L60 + ALIGN_3 + +.L51: + subq $4, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifdef PREFETCHW + PREFETCHW 3 * SIZE(Y1) +#endif + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L5X + + movsd -16 * SIZE(X1), %xmm12 + + movsd -16 * SIZE(A1), %xmm4 + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + movsd -16 * SIZE(A1, LDA), %xmm5 + mulsd %xmm12, %xmm5 + addsd %xmm5, %xmm1 + movsd -16 * SIZE(A2), %xmm6 + mulsd %xmm12, %xmm6 + addsd %xmm6, %xmm2 + movsd -16 * SIZE(A2, LDA), %xmm7 + mulsd %xmm12, %xmm7 + addsd %xmm7, %xmm3 + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, X1 + ALIGN_3 + +.L5X: +#endif + + movhpd -16 * SIZE(A1, LDA), %xmm8 + movhpd -16 * SIZE(A2, LDA), %xmm9 + + movq M, I + sarq $3, I + jle .L55 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5) + MOVUPS_A1(-16 * SIZE, A2, %xmm6) + MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + decq I + jle .L53 + ALIGN_4 + +.L52: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-14 * SIZE, A1, %xmm4) + shufpd $1, %xmm5, %xmm8 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm1 + MOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm8) + + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1(-14 * SIZE, A2, %xmm6) + shufpd $1, %xmm7, %xmm9 + mulpd %xmm12, %xmm9 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm9, %xmm3 + MOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm9) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET + 8(A1, LDA) +#endif + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm4) + shufpd $1, %xmm8, %xmm5 + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm5) + + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1(-12 * SIZE, A2, %xmm6) + shufpd $1, %xmm9, %xmm7 + mulpd %xmm13, %xmm7 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm7, %xmm3 + MOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-10 * SIZE, A1, %xmm4) + shufpd $1, %xmm5, %xmm8 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm1 + MOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8) + + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1(-10 * SIZE, A2, %xmm6) + shufpd $1, %xmm7, %xmm9 + mulpd %xmm12, %xmm9 + MOVUPS_XL1(-8 * SIZE, X1, %xmm12) + addpd %xmm9, %xmm3 + MOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET + 8(A2, LDA) +#endif + + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-8 * SIZE, A1, %xmm4) + shufpd $1, %xmm8, %xmm5 + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-7 * SIZE, A1, LDA, 1, %xmm5) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET + 8(X1) +#endif + + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1(-8 * SIZE, A2, %xmm6) + shufpd $1, %xmm9, %xmm7 + mulpd %xmm13, %xmm7 + MOVUPS_XL1(-6 * SIZE, X1, %xmm13) + addpd %xmm7, %xmm3 + MOVUPS_A2(-7 * SIZE, A2, LDA, 1, %xmm7) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + + decq I + jg .L52 + ALIGN_4 + +.L53: + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-14 * SIZE, A1, %xmm4) + shufpd $1, %xmm5, %xmm8 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm1 + MOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm8) + + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1(-14 * SIZE, A2, %xmm6) + shufpd $1, %xmm7, %xmm9 + mulpd %xmm12, %xmm9 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm9, %xmm3 + MOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm9) + + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm4) + shufpd $1, %xmm8, %xmm5 + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm5) + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1(-12 * SIZE, A2, %xmm6) + shufpd $1, %xmm9, %xmm7 + mulpd %xmm13, %xmm7 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm7, %xmm3 + MOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm7) + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-10 * SIZE, A1, %xmm4) + shufpd $1, %xmm5, %xmm8 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm1 + MOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8) + + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1(-10 * SIZE, A2, %xmm6) + shufpd $1, %xmm7, %xmm9 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm3 + MOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9) + + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm0 + shufpd $1, %xmm8, %xmm5 + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm1 + + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm2 + shufpd $1, %xmm9, %xmm7 + mulpd %xmm13, %xmm7 + addpd %xmm7, %xmm3 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L55: + testq $4, M + jle .L56 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5) + MOVUPS_A1(-16 * SIZE, A2, %xmm6) + MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-14 * SIZE, A1, %xmm4) + + shufpd $1, %xmm5, %xmm8 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm1 + MOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm8) + + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1(-14 * SIZE, A2, %xmm6) + + shufpd $1, %xmm7, %xmm9 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm3 + MOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm9) + + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm0 + shufpd $1, %xmm8, %xmm5 + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm1 + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm2 + shufpd $1, %xmm9, %xmm7 + mulpd %xmm13, %xmm7 + addpd %xmm7, %xmm3 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L56: + testq $2, M + jle .L57 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5) + MOVUPS_A1(-16 * SIZE, A2, %xmm6) + MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + shufpd $1, %xmm5, %xmm8 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm1 + movaps %xmm5, %xmm8 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm2 + shufpd $1, %xmm7, %xmm9 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm3 + movaps %xmm7, %xmm9 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L57: + testq $1, M + je .L58 + + movsd -16 * SIZE(X1), %xmm12 + + movsd -16 * SIZE(A1), %xmm4 + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + shufpd $1, %xmm8, %xmm8 + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm1 + movsd -16 * SIZE(A2), %xmm6 + mulsd %xmm12, %xmm6 + addsd %xmm6, %xmm2 + shufpd $1, %xmm9, %xmm9 + mulsd %xmm12, %xmm9 + addsd %xmm9, %xmm3 + ALIGN_4 + +.L58: +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 + haddpd %xmm3, %xmm2 +#else + movapd %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm4 + + movapd %xmm2, %xmm5 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm5 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm2 +#endif + + mulpd ALPHA, %xmm0 + mulpd ALPHA, %xmm2 + + cmpq $SIZE, INCY + jne .L59 + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + movsd 2 * SIZE(Y), %xmm5 + movhpd 3 * SIZE(Y), %xmm5 + addq $4 * SIZE, Y + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm2 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + movlpd %xmm2, 2 * SIZE(Y1) + movhpd %xmm2, 3 * SIZE(Y1) + addq $4 * SIZE, Y1 + + cmpq $4, N + jge .L51 + jmp .L60 + ALIGN_4 + +.L59: + movsd (Y), %xmm4 + addq INCY, Y + movhpd (Y), %xmm4 + addq INCY, Y + movsd (Y), %xmm5 + addq INCY, Y + movhpd (Y), %xmm5 + addq INCY, Y + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm2 + + movlpd %xmm0, (Y1) + addq INCY, Y1 + movhpd %xmm0, (Y1) + addq INCY, Y1 + movlpd %xmm2, (Y1) + addq INCY, Y1 + movhpd %xmm2, (Y1) + addq INCY, Y1 + cmpq $4, N + jge .L51 + ALIGN_4 + +.L60: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L70 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L61: +#endif + + subq $2, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#if (GEMV_UNROLL == 2 ) && defined(PREFETCHW) + PREFETCHW 2 * SIZE(Y1) +#endif + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L6X + + movsd -16 * SIZE(X1), %xmm12 + + movsd -16 * SIZE(A1), %xmm4 + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + movsd -16 * SIZE(A2), %xmm5 + mulsd %xmm12, %xmm5 + addsd %xmm5, %xmm1 + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, X1 + ALIGN_3 + +.L6X: +#endif + + movhpd -16 * SIZE(A2), %xmm8 + + movq M, I + sarq $3, I + jle .L65 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-15 * SIZE, A2, %xmm5) + MOVUPS_A1(-14 * SIZE, A1, %xmm6) + MOVUPS_A1(-13 * SIZE, A2, %xmm7) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + decq I + jle .L63 + ALIGN_4 + +.L62: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm4) + shufpd $1, %xmm5, %xmm8 + mulpd %xmm12, %xmm8 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm8, %xmm1 + MOVUPS_A1(-11 * SIZE, A2, %xmm9) + + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm0 + MOVUPS_A1(-10 * SIZE, A1, %xmm6) + shufpd $1, %xmm7, %xmm5 + mulpd %xmm13, %xmm5 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm5, %xmm1 + MOVUPS_A1( -9 * SIZE, A2, %xmm8) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET + 8(A2) +#endif + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-8 * SIZE, A1, %xmm4) + shufpd $1, %xmm9, %xmm7 + mulpd %xmm12, %xmm7 + MOVUPS_XL1(-8 * SIZE, X1, %xmm12) + addpd %xmm7, %xmm1 + MOVUPS_A1(-7 * SIZE, A2, %xmm5) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET + 8(X1) +#endif + + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm0 + MOVUPS_A1(-6 * SIZE, A1, %xmm6) + shufpd $1, %xmm8, %xmm9 + mulpd %xmm13, %xmm9 + MOVUPS_XL1(-6 * SIZE, X1, %xmm13) + addpd %xmm9, %xmm1 + MOVUPS_A1(-5 * SIZE, A2, %xmm7) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + + decq I + jg .L62 + ALIGN_4 + +.L63: + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm4) + shufpd $1, %xmm5, %xmm8 + mulpd %xmm12, %xmm8 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm8, %xmm1 + MOVUPS_A1(-11 * SIZE, A2, %xmm9) + + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm0 + MOVUPS_A1(-10 * SIZE, A1, %xmm6) + shufpd $1, %xmm7, %xmm5 + mulpd %xmm13, %xmm5 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm5, %xmm1 + MOVUPS_A1( -9 * SIZE, A2, %xmm8) + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + shufpd $1, %xmm9, %xmm7 + mulpd %xmm12, %xmm7 + addpd %xmm7, %xmm1 + + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm0 + shufpd $1, %xmm8, %xmm9 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L65: + testq $4, M + jle .L66 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-15 * SIZE, A2, %xmm5) + MOVUPS_A1(-14 * SIZE, A1, %xmm6) + MOVUPS_A1(-13 * SIZE, A2, %xmm7) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + shufpd $1, %xmm5, %xmm8 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm1 + + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm0 + shufpd $1, %xmm7, %xmm5 + movaps %xmm7, %xmm8 + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm1 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L66: + testq $2, M + jle .L67 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-15 * SIZE, A2, %xmm5) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + shufpd $1, %xmm5, %xmm8 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm1 + movaps %xmm5, %xmm8 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L67: + testq $1, M + je .L68 + + movsd -16 * SIZE(X1), %xmm12 + + movsd -16 * SIZE(A1), %xmm4 + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + shufpd $1, %xmm8, %xmm8 + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm1 + ALIGN_4 + +.L68: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 +#else + movapd %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm4 + + addpd %xmm4, %xmm0 +#endif + + mulpd ALPHA, %xmm0 + + movsd (Y), %xmm4 + addq INCY, Y + movhpd (Y), %xmm4 + addq INCY, Y + + addpd %xmm4, %xmm0 + + movlpd %xmm0, (Y1) + addq INCY, Y1 + movhpd %xmm0, (Y1) + addq INCY, Y1 + +#if GEMV_UNROLL == 2 + cmpq $2, N + jge .L61 +#endif + ALIGN_4 + +.L70: + cmpq $1, N + jl .L999 + +#endif + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L7X + + movsd -16 * SIZE(X1), %xmm12 + + movsd -16 * SIZE(A1), %xmm4 + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + + addq $SIZE, A1 + addq $SIZE, X1 + ALIGN_3 + +.L7X: +#endif + movq M, I + sarq $3, I + jle .L75 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-14 * SIZE, A1, %xmm5) + MOVUPS_A1(-12 * SIZE, A1, %xmm6) + MOVUPS_A1(-10 * SIZE, A1, %xmm7) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + decq I + jle .L73 + ALIGN_4 + +.L72: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm4 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm4, %xmm0 + MOVUPS_A1( -8 * SIZE, A1, %xmm4) + mulpd %xmm13, %xmm5 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm5, %xmm2 + MOVUPS_A1( -6 * SIZE, A1, %xmm5) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(X1) +#endif + + mulpd %xmm12, %xmm6 + MOVUPS_XL1( -8 * SIZE, X1, %xmm12) + addpd %xmm6, %xmm0 + MOVUPS_A1( -4 * SIZE, A1, %xmm6) + mulpd %xmm13, %xmm7 + MOVUPS_XL1( -6 * SIZE, X1, %xmm13) + addpd %xmm7, %xmm2 + MOVUPS_A1( -2 * SIZE, A1, %xmm7) + + addq $8 * SIZE, A1 + addq $8 * SIZE, X1 + + decq I + jg .L72 + ALIGN_4 + +.L73: + mulpd %xmm12, %xmm4 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm4, %xmm0 + mulpd %xmm13, %xmm5 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm5, %xmm2 + + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm13, %xmm7 + addpd %xmm7, %xmm2 + + addq $8 * SIZE, A1 + addq $8 * SIZE, X1 + ALIGN_4 + +.L75: + testq $4, M + jle .L76 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-14 * SIZE, A1, %xmm5) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm2 + + addq $4 * SIZE, A1 + addq $4 * SIZE, X1 + ALIGN_4 + +.L76: + testq $2, M + jle .L77 + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_4 + +.L77: + testq $1, M + je .L78 + + movsd -16 * SIZE(X1), %xmm12 + + movsd -16 * SIZE(A1), %xmm4 + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + ALIGN_4 + +.L78: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + addpd %xmm1, %xmm0 + +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 +#else + movapd %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm4 + + addsd %xmm4, %xmm0 +#endif + + mulsd ALPHA, %xmm0 + + movsd (Y), %xmm4 + addq INCY, Y + + addsd %xmm4, %xmm0 + + movlpd %xmm0, (Y1) + addq INCY, Y1 +#endif + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + ALIGN_4 + + EPILOGUE diff --git a/kernel/x86_64/dgemv_t_atom.S b/kernel/x86_64/dgemv_t_atom.S new file mode 100644 index 0000000..246bdd3 --- /dev/null +++ b/kernel/x86_64/dgemv_t_atom.S @@ -0,0 +1,686 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#define PREFETCH prefetchnta +#define PREFETCHW prefetcht0 +#define PREFETCH_SIZE (8 * 6) + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_INCX 8 + STACKSIZE(%rsp) +#define OLD_Y 16 + STACKSIZE(%rsp) +#define OLD_INCY 24 + STACKSIZE(%rsp) +#define OLD_BUFFER 32 + STACKSIZE(%rsp) + +#define M %rdi +#define N %rsi +#define A %rcx +#define LDA %r8 +#define X %r9 +#define INCX %rdx +#define Y %rbp +#define INCY %r10 +#define BUFFER %rbx + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_LDA 48 + STACKSIZE(%rsp) +#define OLD_X 56 + STACKSIZE(%rsp) +#define OLD_INCX 64 + STACKSIZE(%rsp) +#define OLD_Y 72 + STACKSIZE(%rsp) +#define OLD_INCY 80 + STACKSIZE(%rsp) +#define OLD_BUFFER 88 + STACKSIZE(%rsp) + +#define M %rcx +#define N %rdx +#define A %rdi +#define LDA %r8 +#define X %r9 +#define INCX %rsi +#define Y %rbp +#define INCY %r10 +#define BUFFER %rbx + +#endif + +#define I %rax +#define J %r11 +#define A1 %r12 +#define A2 %r13 +#define X1 %r14 +#define Y1 %r15 + +#define ALPHA %xmm3 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X +#endif + + movq OLD_INCX, INCX + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + leaq (,INCX, SIZE), INCX + leaq (,INCY, SIZE), INCY + leaq (, LDA, SIZE), LDA + +#ifndef WINDOWS_ABI + movapd %xmm0, ALPHA +#endif + + movq Y, Y1 + + testq M, M + jle .L999 + testq N, N + jle .L999 + + cmpq $SIZE, INCX + cmoveq X, BUFFER + je .L10 + + movq BUFFER, X1 + + movq M, I + sarq $3, I + jle .L05 + ALIGN_3 + +.L02: + movsd (X), %xmm0 + addq INCX, X + movsd (X), %xmm1 + addq INCX, X + + movsd (X), %xmm2 + addq INCX, X + movsd (X), %xmm8 + addq INCX, X + + movsd (X), %xmm4 + addq INCX, X + movsd (X), %xmm5 + addq INCX, X + + movsd (X), %xmm6 + addq INCX, X + movsd (X), %xmm7 + addq INCX, X + + movsd %xmm0, 0 * SIZE(X1) + movsd %xmm1, 1 * SIZE(X1) + movsd %xmm2, 2 * SIZE(X1) + movsd %xmm8, 3 * SIZE(X1) + movsd %xmm4, 4 * SIZE(X1) + movsd %xmm5, 5 * SIZE(X1) + movsd %xmm6, 6 * SIZE(X1) + movsd %xmm7, 7 * SIZE(X1) + + addq $8 * SIZE, X1 + decq I + jg .L02 + ALIGN_3 + +.L05: + movq M, I + andq $7, I + jle .L10 + ALIGN_3 + +.L06: + movsd (X), %xmm0 + addq INCX, X + + movsd %xmm0, (X1) + addq $SIZE, X1 + + decq I + jg .L06 + ALIGN_3 + +.L10: + movq N, J + sarq $1, J + jle .L20 + ALIGN_3 + +.L11: + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + movq BUFFER, X1 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + PREFETCHW 1 * SIZE(X1) + + movq M, I + sarq $3, I + jle .L14 + + movsd 0 * SIZE(X1), %xmm4 + movsd 0 * SIZE(A1), %xmm8 + movsd 0 * SIZE(A2), %xmm12 + + movsd 1 * SIZE(X1), %xmm5 + movsd 1 * SIZE(A1), %xmm9 + movsd 1 * SIZE(A2), %xmm13 + + movsd 2 * SIZE(X1), %xmm6 + movsd 2 * SIZE(A1), %xmm10 + movsd 2 * SIZE(A2), %xmm14 + + movsd 3 * SIZE(X1), %xmm7 + mulsd %xmm4, %xmm8 + movsd 3 * SIZE(A1), %xmm11 + mulsd %xmm4, %xmm12 + movsd 4 * SIZE(X1), %xmm4 + + mulsd %xmm5, %xmm9 + movsd 3 * SIZE(A2), %xmm15 + mulsd %xmm5, %xmm13 + movsd 5 * SIZE(X1), %xmm5 + + decq I + jle .L13 + ALIGN_3 + +.L12: + PREFETCH PREFETCH_SIZE * SIZE(A1) + addsd %xmm8, %xmm0 + PREFETCH PREFETCH_SIZE * SIZE(A2) + mulsd %xmm6, %xmm10 + movsd 4 * SIZE(A1), %xmm8 + addsd %xmm12, %xmm1 + movsd 4 * SIZE(A2), %xmm12 + mulsd %xmm6, %xmm14 + movsd 6 * SIZE(X1), %xmm6 + + addsd %xmm9, %xmm0 + movsd 5 * SIZE(A1), %xmm9 + mulsd %xmm7, %xmm11 + addsd %xmm13, %xmm1 + movsd 5 * SIZE(A2), %xmm13 + mulsd %xmm7, %xmm15 + movsd 7 * SIZE(X1), %xmm7 + + addsd %xmm10, %xmm0 + movsd 6 * SIZE(A1), %xmm10 + mulsd %xmm4, %xmm8 + addsd %xmm14, %xmm1 + movsd 6 * SIZE(A2), %xmm14 + mulsd %xmm4, %xmm12 + movsd 8 * SIZE(X1), %xmm4 + + addsd %xmm11, %xmm0 + movsd 7 * SIZE(A1), %xmm11 + mulsd %xmm5, %xmm9 + addsd %xmm15, %xmm1 + movsd 7 * SIZE(A2), %xmm15 + mulsd %xmm5, %xmm13 + movsd 9 * SIZE(X1), %xmm5 + + addsd %xmm8, %xmm0 + movsd 8 * SIZE(A1), %xmm8 + mulsd %xmm6, %xmm10 + addq $8 * SIZE, X1 + addsd %xmm12, %xmm1 + movsd 8 * SIZE(A2), %xmm12 + mulsd %xmm6, %xmm14 + movsd 2 * SIZE(X1), %xmm6 + + addsd %xmm9, %xmm0 + movsd 9 * SIZE(A1), %xmm9 + mulsd %xmm7, %xmm11 + addq $8 * SIZE, A2 + addsd %xmm13, %xmm1 + movsd 1 * SIZE(A2), %xmm13 + mulsd %xmm7, %xmm15 + movsd 3 * SIZE(X1), %xmm7 + + addsd %xmm10, %xmm0 + movsd 10 * SIZE(A1), %xmm10 + mulsd %xmm4, %xmm8 + addq $8 * SIZE, A1 + addsd %xmm14, %xmm1 + movsd 2 * SIZE(A2), %xmm14 + mulsd %xmm4, %xmm12 + movsd 4 * SIZE(X1), %xmm4 + + addsd %xmm11, %xmm0 + movsd 3 * SIZE(A1), %xmm11 + mulsd %xmm5, %xmm9 + decq I + addsd %xmm15, %xmm1 + movsd 3 * SIZE(A2), %xmm15 + mulsd %xmm5, %xmm13 + movsd 5 * SIZE(X1), %xmm5 + + jg .L12 + ALIGN_3 + +.L13: + addsd %xmm8, %xmm0 + movsd 4 * SIZE(A1), %xmm8 + mulsd %xmm6, %xmm10 + addsd %xmm12, %xmm1 + movsd 4 * SIZE(A2), %xmm12 + mulsd %xmm6, %xmm14 + movsd 6 * SIZE(X1), %xmm6 + + addsd %xmm9, %xmm0 + movsd 5 * SIZE(A1), %xmm9 + mulsd %xmm7, %xmm11 + addsd %xmm13, %xmm1 + movsd 5 * SIZE(A2), %xmm13 + mulsd %xmm7, %xmm15 + movsd 7 * SIZE(X1), %xmm7 + + addsd %xmm10, %xmm0 + movsd 6 * SIZE(A1), %xmm10 + mulsd %xmm4, %xmm8 + addsd %xmm14, %xmm1 + movsd 6 * SIZE(A2), %xmm14 + mulsd %xmm4, %xmm12 + + addsd %xmm11, %xmm0 + movsd 7 * SIZE(A1), %xmm11 + mulsd %xmm5, %xmm9 + addsd %xmm15, %xmm1 + movsd 7 * SIZE(A2), %xmm15 + mulsd %xmm5, %xmm13 + + addsd %xmm8, %xmm0 + mulsd %xmm6, %xmm10 + addsd %xmm12, %xmm1 + mulsd %xmm6, %xmm14 + + addsd %xmm9, %xmm0 + mulsd %xmm7, %xmm11 + addsd %xmm13, %xmm1 + mulsd %xmm7, %xmm15 + + addsd %xmm10, %xmm0 + addq $8 * SIZE, A1 + addsd %xmm14, %xmm1 + addq $8 * SIZE, A2 + addsd %xmm11, %xmm0 + addq $8 * SIZE, X1 + addsd %xmm15, %xmm1 + ALIGN_4 + +.L14: + testq $4, M + je .L16 + + movsd 0 * SIZE(X1), %xmm4 + movsd 0 * SIZE(A1), %xmm8 + movsd 0 * SIZE(A2), %xmm12 + + movsd 1 * SIZE(X1), %xmm5 + movsd 1 * SIZE(A1), %xmm9 + movsd 1 * SIZE(A2), %xmm13 + + movsd 2 * SIZE(X1), %xmm6 + movsd 2 * SIZE(A1), %xmm10 + movsd 2 * SIZE(A2), %xmm14 + + movsd 3 * SIZE(X1), %xmm7 + movsd 3 * SIZE(A1), %xmm11 + movsd 3 * SIZE(A2), %xmm15 + + mulsd %xmm4, %xmm8 + mulsd %xmm4, %xmm12 + mulsd %xmm5, %xmm9 + mulsd %xmm5, %xmm13 + + addsd %xmm8, %xmm0 + addsd %xmm12, %xmm1 + addsd %xmm9, %xmm0 + addsd %xmm13, %xmm1 + + mulsd %xmm6, %xmm10 + mulsd %xmm6, %xmm14 + mulsd %xmm7, %xmm11 + mulsd %xmm7, %xmm15 + + addsd %xmm10, %xmm0 + addsd %xmm14, %xmm1 + addsd %xmm11, %xmm0 + addsd %xmm15, %xmm1 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L16: + testq $2, M + je .L17 + + movsd 0 * SIZE(X1), %xmm4 + movsd 0 * SIZE(A1), %xmm8 + movsd 0 * SIZE(A2), %xmm12 + + movsd 1 * SIZE(X1), %xmm5 + movsd 1 * SIZE(A1), %xmm9 + movsd 1 * SIZE(A2), %xmm13 + + mulsd %xmm4, %xmm8 + mulsd %xmm4, %xmm12 + mulsd %xmm5, %xmm9 + mulsd %xmm5, %xmm13 + + addsd %xmm8, %xmm0 + addsd %xmm12, %xmm1 + addsd %xmm9, %xmm0 + addsd %xmm13, %xmm1 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + + ALIGN_4 + +.L17: + testq $1, M + je .L19 + + movsd 0 * SIZE(X1), %xmm4 + + movsd 0 * SIZE(A1), %xmm8 + movsd 0 * SIZE(A2), %xmm12 + + mulsd %xmm4, %xmm8 + mulsd %xmm4, %xmm12 + + addsd %xmm8, %xmm0 + addsd %xmm12, %xmm1 + ALIGN_4 + +.L19: + mulsd ALPHA, %xmm0 + addsd (Y), %xmm0 + addq INCY, Y + + mulsd ALPHA, %xmm1 + addsd (Y), %xmm1 + addq INCY, Y + + movsd %xmm0, (Y1) + addq INCY, Y1 + movsd %xmm1, (Y1) + addq INCY, Y1 + + decq J + jg .L11 + ALIGN_3 + +.L20: + testq $1, N + jle .L999 + + movq A, A1 + movq BUFFER, X1 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + movq M, I + sarq $3, I + jle .L24 + + movsd 0 * SIZE(X1), %xmm4 + movsd 0 * SIZE(A1), %xmm8 + movsd 1 * SIZE(X1), %xmm5 + movsd 1 * SIZE(A1), %xmm9 + + movsd 2 * SIZE(X1), %xmm6 + movsd 2 * SIZE(A1), %xmm10 + movsd 3 * SIZE(X1), %xmm7 + movsd 3 * SIZE(A1), %xmm11 + + mulsd %xmm4, %xmm8 + movsd 4 * SIZE(X1), %xmm4 + mulsd %xmm5, %xmm9 + movsd 5 * SIZE(X1), %xmm5 + mulsd %xmm6, %xmm10 + movsd 6 * SIZE(X1), %xmm6 + mulsd %xmm7, %xmm11 + movsd 7 * SIZE(X1), %xmm7 + + decq I + jle .L23 + ALIGN_3 + +.L22: + PREFETCH PREFETCH_SIZE * SIZE(A1) + addsd %xmm8, %xmm0 + movsd 4 * SIZE(A1), %xmm8 + addsd %xmm9, %xmm0 + movsd 5 * SIZE(A1), %xmm9 + addsd %xmm10, %xmm0 + movsd 6 * SIZE(A1), %xmm10 + addsd %xmm11, %xmm0 + movsd 7 * SIZE(A1), %xmm11 + + mulsd %xmm4, %xmm8 + movsd 8 * SIZE(X1), %xmm4 + mulsd %xmm5, %xmm9 + movsd 9 * SIZE(X1), %xmm5 + mulsd %xmm6, %xmm10 + movsd 10 * SIZE(X1), %xmm6 + mulsd %xmm7, %xmm11 + movsd 11 * SIZE(X1), %xmm7 + + addsd %xmm8, %xmm0 + movsd 8 * SIZE(A1), %xmm8 + addsd %xmm9, %xmm1 + movsd 9 * SIZE(A1), %xmm9 + addsd %xmm10, %xmm1 + movsd 10 * SIZE(A1), %xmm10 + addsd %xmm11, %xmm0 + movsd 11 * SIZE(A1), %xmm11 + + mulsd %xmm4, %xmm8 + movsd 12 * SIZE(X1), %xmm4 + mulsd %xmm5, %xmm9 + movsd 13 * SIZE(X1), %xmm5 + mulsd %xmm6, %xmm10 + movsd 14 * SIZE(X1), %xmm6 + mulsd %xmm7, %xmm11 + movsd 15 * SIZE(X1), %xmm7 + + addq $8 * SIZE, A1 + addq $8 * SIZE, X1 + decq I + jg .L22 + ALIGN_3 + +.L23: + addsd %xmm8, %xmm0 + movsd 4 * SIZE(A1), %xmm8 + addsd %xmm9, %xmm1 + movsd 5 * SIZE(A1), %xmm9 + addsd %xmm10, %xmm0 + movsd 6 * SIZE(A1), %xmm10 + addsd %xmm11, %xmm1 + movsd 7 * SIZE(A1), %xmm11 + + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm9 + mulsd %xmm6, %xmm10 + mulsd %xmm7, %xmm11 + + addsd %xmm8, %xmm0 + addsd %xmm9, %xmm1 + addsd %xmm10, %xmm0 + addq $8 * SIZE, A1 + addsd %xmm11, %xmm1 + addq $8 * SIZE, X1 + ALIGN_4 + +.L24: + testq $4, M + je .L26 + + movsd 0 * SIZE(X1), %xmm4 + movsd 0 * SIZE(A1), %xmm8 + movsd 1 * SIZE(X1), %xmm5 + movsd 1 * SIZE(A1), %xmm9 + + movsd 2 * SIZE(X1), %xmm6 + movsd 2 * SIZE(A1), %xmm10 + movsd 3 * SIZE(X1), %xmm7 + movsd 3 * SIZE(A1), %xmm11 + + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm9 + mulsd %xmm6, %xmm10 + mulsd %xmm7, %xmm11 + + addsd %xmm8, %xmm0 + addsd %xmm9, %xmm1 + addsd %xmm10, %xmm0 + addq $4 * SIZE, A1 + addsd %xmm11, %xmm1 + addq $4 * SIZE, X1 + ALIGN_4 + +.L26: + testq $2, M + je .L27 + + movsd 0 * SIZE(X1), %xmm4 + movsd 0 * SIZE(A1), %xmm8 + movsd 1 * SIZE(X1), %xmm5 + movsd 1 * SIZE(A1), %xmm9 + + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm9 + addsd %xmm8, %xmm0 + addq $2 * SIZE, A1 + addsd %xmm9, %xmm1 + addq $2 * SIZE, X1 + ALIGN_4 + +.L27: + testq $1, M + je .L29 + + movsd 0 * SIZE(X1), %xmm4 + movsd 0 * SIZE(A1), %xmm8 + + mulsd %xmm4, %xmm8 + addsd %xmm8, %xmm0 + ALIGN_4 + +.L29: + addsd %xmm1, %xmm0 + + mulsd ALPHA, %xmm0 + + addsd (Y), %xmm0 + movsd %xmm0, (Y1) + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + ALIGN_3 + + EPILOGUE diff --git a/kernel/x86_64/dot.S b/kernel/x86_64/dot.S new file mode 100644 index 0000000..e63d9cd --- /dev/null +++ b/kernel/x86_64/dot.S @@ -0,0 +1,184 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#define INCY ARG5 /* r8 */ + +#include "l1param.h" + + PROLOGUE + PROFCODE + + salq $BASE_SHIFT, INCX + salq $BASE_SHIFT, INCY + + fldz + fldz + fldz + fldz + + cmpq $SIZE, INCX + jne .L14 + cmpq $SIZE, INCY + jne .L14 + + movq N, %rax + sarq $2, %rax + jle .L15 + ALIGN_3 + +.L16: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + fmulp %st, %st(1) + faddp %st,%st(1) + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st,%st(2) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + FLD 2 * SIZE(X) + FLD 2 * SIZE(Y) + fmulp %st, %st(1) + faddp %st,%st(3) + FLD 3 * SIZE(X) + FLD 3 * SIZE(Y) + fmulp %st, %st(1) + faddp %st,%st(4) + addq $4 * SIZE, X + addq $4 * SIZE, Y + decq %rax + jg .L16 + ALIGN_3 + +.L15: + movq N, %rax + andq $3, %rax + jle .L27 + ALIGN_3 + +.L22: + FLD (X) + addq $SIZE, X + FLD (Y) + fmulp %st, %st(1) + addq $SIZE, Y + faddp %st,%st(1) + decq %rax + jg .L22 + + jmp .L27 + ALIGN_3 + +.L14: + movq N, %rax + sarq $2, %rax + jle .L30 + ALIGN_3 + +.L31: + FLD (X) + addq INCX, X + FLD (Y) + fmulp %st, %st(1) + addq INCY, Y + faddp %st,%st(1) + + FLD (X) + addq INCX, X + FLD (Y) + fmulp %st, %st(1) + addq INCY, Y + faddp %st,%st(2) + + FLD (X) + addq INCX, X + FLD (Y) + fmulp %st, %st(1) + addq INCY, Y + faddp %st,%st(3) + + FLD (X) + addq INCX, X + FLD (Y) + fmulp %st, %st(1) + addq INCY, Y + faddp %st,%st(4) + + decq %rax + jg .L31 + ALIGN_3 + +.L30: + movq N, %rax + andq $3, %rax + jle .L27 + ALIGN_3 + +.L37: + FLD (X) + addq INCX, X + FLD (Y) + fmulp %st, %st(1) + addq INCY, Y + faddp %st, %st(1) + decq %rax + jg .L37 + ALIGN_3 + +.L27: + faddp %st,%st(2) + faddp %st,%st(2) + faddp %st,%st(1) + ret + + EPILOGUE diff --git a/kernel/x86_64/dot_atom.S b/kernel/x86_64/dot_atom.S new file mode 100644 index 0000000..bc67b28 --- /dev/null +++ b/kernel/x86_64/dot_atom.S @@ -0,0 +1,299 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY +#endif + + SAVEREGISTERS + + leaq (, INCX, SIZE), INCX + pxor %xmm0, %xmm0 + leaq (, INCY, SIZE), INCY + pxor %xmm1, %xmm1 + + pxor %xmm2, %xmm2 + cmpq $0, N + pxor %xmm3, %xmm3 + jle .L999 + + cmpq $SIZE, INCX + jne .L50 + cmpq $SIZE, INCY + jne .L50 + + movq N, %rax + sarq $3, %rax + jle .L14 + + movsd 0 * SIZE(X), %xmm4 + movsd 0 * SIZE(Y), %xmm8 + movsd 1 * SIZE(X), %xmm5 + movsd 1 * SIZE(Y), %xmm9 + + movsd 2 * SIZE(X), %xmm6 + mulsd %xmm8, %xmm4 + movsd 2 * SIZE(Y), %xmm10 + mulsd %xmm9, %xmm5 + movsd 3 * SIZE(X), %xmm7 + mulsd %xmm10, %xmm6 + movsd 3 * SIZE(Y), %xmm11 + mulsd %xmm11, %xmm7 + + decq %rax + jle .L12 + + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + addsd %xmm4, %xmm0 + movsd 4 * SIZE(X), %xmm4 + addsd %xmm5, %xmm1 + movsd 4 * SIZE(Y), %xmm8 + addsd %xmm6, %xmm2 + movsd 5 * SIZE(X), %xmm5 + addsd %xmm7, %xmm3 + movsd 5 * SIZE(Y), %xmm9 + + movsd 6 * SIZE(X), %xmm6 + mulsd %xmm8, %xmm4 + movsd 6 * SIZE(Y), %xmm10 + mulsd %xmm9, %xmm5 + movsd 7 * SIZE(X), %xmm7 + mulsd %xmm10, %xmm6 + movsd 7 * SIZE(Y), %xmm11 + mulsd %xmm11, %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + addsd %xmm4, %xmm0 + movsd 8 * SIZE(X), %xmm4 + addsd %xmm5, %xmm1 + movsd 8 * SIZE(Y), %xmm8 + addsd %xmm6, %xmm2 + movsd 9 * SIZE(X), %xmm5 + addsd %xmm7, %xmm3 + movsd 9 * SIZE(Y), %xmm9 + + movsd 10 * SIZE(X), %xmm6 + mulsd %xmm8, %xmm4 + movsd 10 * SIZE(Y), %xmm10 + mulsd %xmm9, %xmm5 + movsd 11 * SIZE(X), %xmm7 + mulsd %xmm10, %xmm6 + movsd 11 * SIZE(Y), %xmm11 + mulsd %xmm11, %xmm7 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + + decq %rax + jg .L11 + ALIGN_3 + +.L12: + addsd %xmm4, %xmm0 + movsd 4 * SIZE(X), %xmm4 + addsd %xmm5, %xmm1 + movsd 4 * SIZE(Y), %xmm8 + addsd %xmm6, %xmm2 + movsd 5 * SIZE(X), %xmm5 + addsd %xmm7, %xmm3 + movsd 5 * SIZE(Y), %xmm9 + + movsd 6 * SIZE(X), %xmm6 + mulsd %xmm8, %xmm4 + movsd 6 * SIZE(Y), %xmm10 + mulsd %xmm9, %xmm5 + movsd 7 * SIZE(X), %xmm7 + mulsd %xmm10, %xmm6 + movsd 7 * SIZE(Y), %xmm11 + mulsd %xmm11, %xmm7 + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + addsd %xmm6, %xmm2 + addsd %xmm7, %xmm3 + + addq $ 8 * SIZE, X + addq $ 8 * SIZE, Y + ALIGN_3 + +.L14: + testq $7, N + jle .L999 + + testq $4, N + jle .L16 + + movsd 0 * SIZE(X), %xmm4 + movsd 0 * SIZE(Y), %xmm8 + movsd 1 * SIZE(X), %xmm5 + movsd 1 * SIZE(Y), %xmm9 + + movsd 2 * SIZE(X), %xmm6 + mulsd %xmm8, %xmm4 + movsd 2 * SIZE(Y), %xmm10 + mulsd %xmm9, %xmm5 + movsd 3 * SIZE(X), %xmm7 + mulsd %xmm10, %xmm6 + movsd 3 * SIZE(Y), %xmm11 + mulsd %xmm11, %xmm7 + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + addsd %xmm6, %xmm2 + addsd %xmm7, %xmm3 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L16: + testq $2, N + jle .L17 + + movsd 0 * SIZE(X), %xmm4 + movsd 0 * SIZE(Y), %xmm8 + movsd 1 * SIZE(X), %xmm5 + movsd 1 * SIZE(Y), %xmm9 + + mulsd %xmm8, %xmm4 + mulsd %xmm9, %xmm5 + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L17: + testq $1, N + jle .L999 + + movsd 0 * SIZE(X), %xmm4 + movsd 0 * SIZE(Y), %xmm8 + + mulsd %xmm8, %xmm4 + addsd %xmm4, %xmm0 + jmp .L999 + ALIGN_3 + +.L50: + movq N, %rax + sarq $2, %rax + jle .L55 + ALIGN_3 + +.L53: + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + mulsd 0 * SIZE(Y), %xmm4 + addq INCY, Y + movsd 0 * SIZE(X), %xmm5 + addq INCX, X + mulsd 0 * SIZE(Y), %xmm5 + addq INCY, Y + movsd 0 * SIZE(X), %xmm6 + addq INCX, X + mulsd 0 * SIZE(Y), %xmm6 + addq INCY, Y + movsd 0 * SIZE(X), %xmm7 + addq INCX, X + mulsd 0 * SIZE(Y), %xmm7 + addq INCY, Y + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + addsd %xmm6, %xmm2 + addsd %xmm7, %xmm3 + + decq %rax + jg .L53 + ALIGN_3 + +.L55: + movq N, %rax + andq $3, %rax + jle .L999 + ALIGN_3 + +.L56: + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + mulsd 0 * SIZE(Y), %xmm4 + addq INCY, Y + addsd %xmm4, %xmm0 + decq %rax + jg .L56 + ALIGN_3 + +.L999: + addsd %xmm1, %xmm0 + addsd %xmm3, %xmm2 + addsd %xmm2, %xmm0 + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/dot_sse.S b/kernel/x86_64/dot_sse.S new file mode 100644 index 0000000..cc866a9 --- /dev/null +++ b/kernel/x86_64/dot_sse.S @@ -0,0 +1,1293 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY +#endif + + SAVEREGISTERS + + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + cmpq $0, N + jle .L999 + + cmpq $SIZE, INCX + jne .L50 + cmpq $SIZE, INCY + jne .L50 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + cmpq $3, N + jle .L17 + + testq $SIZE, Y + je .L05 + + movss -32 * SIZE(X), %xmm0 + mulss -32 * SIZE(Y), %xmm0 + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq N + ALIGN_2 + +.L05: + testq $2 * SIZE, Y + je .L10 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(Y), %xmm1 + mulps %xmm4, %xmm1 + addq $2 * SIZE, X + addq $2 * SIZE, Y + subq $2, N + jle .L999 + ALIGN_2 + +.L10: +#ifdef ALIGNED_ACCESS + testq $2 * SIZE, X + jne .L30 + + testq $SIZE, X + jne .L20 +#else + testq $3 * SIZE, X + jne .L20 +#endif + + movq N, %rax + sarq $5, %rax + jle .L14 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -24 * SIZE(X), %xmm6 + movaps -20 * SIZE(X), %xmm7 + + movaps -16 * SIZE(X), %xmm8 + movaps -12 * SIZE(X), %xmm9 + movaps -8 * SIZE(X), %xmm10 + movaps -4 * SIZE(X), %xmm11 + + decq %rax + jle .L12 + + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps 4 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps 8 * SIZE(X), %xmm6 + + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps 12 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulps -16 * SIZE(Y), %xmm8 + addps %xmm8, %xmm0 + movaps 16 * SIZE(X), %xmm8 + + mulps -12 * SIZE(Y), %xmm9 + addps %xmm9, %xmm1 + movaps 20 * SIZE(X), %xmm9 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulps -8 * SIZE(Y), %xmm10 + addps %xmm10, %xmm2 + movaps 24 * SIZE(X), %xmm10 + + mulps -4 * SIZE(Y), %xmm11 + addps %xmm11, %xmm3 + movaps 28 * SIZE(X), %xmm11 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L11 + ALIGN_3 + +.L12: + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + mulps -16 * SIZE(Y), %xmm8 + addps %xmm8, %xmm0 + mulps -12 * SIZE(Y), %xmm9 + addps %xmm9, %xmm1 + mulps -8 * SIZE(Y), %xmm10 + addps %xmm10, %xmm2 + mulps -4 * SIZE(Y), %xmm11 + addps %xmm11, %xmm3 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L14: + testq $31, N + jle .L999 + + testq $16, N + jle .L15 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -24 * SIZE(X), %xmm6 + movaps -20 * SIZE(X), %xmm7 + + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L15: + testq $8, N + jle .L16 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L16: + testq $4, N + jle .L17 + + movaps -32 * SIZE(X), %xmm4 + mulps -32 * SIZE(Y), %xmm4 + + addps %xmm4, %xmm2 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L17: + testq $2, N + jle .L18 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(Y), %xmm8 + + mulps %xmm8, %xmm4 + addps %xmm4, %xmm3 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L18: + testq $1, N + jle .L999 + + movss -32 * SIZE(X), %xmm4 + mulss -32 * SIZE(Y), %xmm4 + addss %xmm4, %xmm0 + jmp .L999 + ALIGN_3 + +.L20: +#ifdef ALIGNED_ACCESS + + movaps -33 * SIZE(X), %xmm4 + addq $3 * SIZE, X + + movq N, %rax + sarq $5, %rax + jle .L24 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + movaps -24 * SIZE(X), %xmm7 + + movaps -20 * SIZE(X), %xmm8 + movaps -16 * SIZE(X), %xmm9 + movaps -12 * SIZE(X), %xmm10 + movaps -8 * SIZE(X), %xmm11 + + decq %rax + jle .L22 + + ALIGN_3 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm5, %xmm4 + pshufd $0x39, %xmm4, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -4 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + pshufd $0x39, %xmm5, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps 0 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + pshufd $0x39, %xmm6, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps 4 * SIZE(X), %xmm6 + + movss %xmm8, %xmm7 + pshufd $0x39, %xmm7, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps 8 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm9, %xmm8 + pshufd $0x39, %xmm8, %xmm8 + mulps -16 * SIZE(Y), %xmm8 + addps %xmm8, %xmm0 + movaps 12 * SIZE(X), %xmm8 + + movss %xmm10, %xmm9 + pshufd $0x39, %xmm9, %xmm9 + mulps -12 * SIZE(Y), %xmm9 + addps %xmm9, %xmm1 + movaps 16 * SIZE(X), %xmm9 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm11, %xmm10 + pshufd $0x39, %xmm10, %xmm10 + mulps -8 * SIZE(Y), %xmm10 + addps %xmm10, %xmm2 + movaps 20 * SIZE(X), %xmm10 + + movss %xmm4, %xmm11 + pshufd $0x39, %xmm11, %xmm11 + mulps -4 * SIZE(Y), %xmm11 + addps %xmm11, %xmm3 + movaps 24 * SIZE(X), %xmm11 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L21 + ALIGN_3 + +.L22: + movss %xmm5, %xmm4 + pshufd $0x39, %xmm4, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -4 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + pshufd $0x39, %xmm5, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + movss %xmm7, %xmm6 + pshufd $0x39, %xmm6, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + + movss %xmm8, %xmm7 + pshufd $0x39, %xmm7, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + movss %xmm9, %xmm8 + pshufd $0x39, %xmm8, %xmm8 + mulps -16 * SIZE(Y), %xmm8 + addps %xmm8, %xmm0 + + movss %xmm10, %xmm9 + pshufd $0x39, %xmm9, %xmm9 + mulps -12 * SIZE(Y), %xmm9 + addps %xmm9, %xmm1 + + movss %xmm11, %xmm10 + pshufd $0x39, %xmm10, %xmm10 + mulps -8 * SIZE(Y), %xmm10 + addps %xmm10, %xmm2 + + movss %xmm4, %xmm11 + pshufd $0x39, %xmm11, %xmm11 + mulps -4 * SIZE(Y), %xmm11 + addps %xmm11, %xmm3 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L24: + testq $31, N + jle .L999 + + testq $16, N + jle .L25 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + movaps -24 * SIZE(X), %xmm7 + + movss %xmm5, %xmm4 + pshufd $0x39, %xmm4, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + pshufd $0x39, %xmm5, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + movss %xmm7, %xmm6 + pshufd $0x39, %xmm6, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + + movss %xmm4, %xmm7 + pshufd $0x39, %xmm7, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L25: + testq $8, N + jle .L26 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + + movss %xmm5, %xmm4 + pshufd $0x39, %xmm4, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + + movss %xmm6, %xmm5 + pshufd $0x39, %xmm5, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm6, %xmm4 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L26: + testq $4, N + jle .L27 + + movaps -32 * SIZE(X), %xmm5 + movss %xmm5, %xmm4 + pshufd $0x39, %xmm4, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm2 + movaps %xmm5, %xmm4 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L27: + testq $2, N + jle .L28 + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(Y), %xmm8 + + pshufd $0x39, %xmm4, %xmm5 + + mulps %xmm8, %xmm5 + addps %xmm5, %xmm3 + movhlps %xmm4, %xmm4 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L28: + testq $1, N + jle .L999 + + pshufd $0x39, %xmm4, %xmm4 + mulss -32 * SIZE(Y), %xmm4 + addss %xmm4, %xmm0 + jmp .L999 + ALIGN_3 + +.L30: + testq $SIZE, X + jne .L40 + + movhps -32 * SIZE(X), %xmm4 + addq $2 * SIZE, X + + movq N, %rax + sarq $5, %rax + jle .L34 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + movaps -24 * SIZE(X), %xmm7 + movaps -20 * SIZE(X), %xmm8 + + movaps -16 * SIZE(X), %xmm9 + movaps -12 * SIZE(X), %xmm10 + movaps -8 * SIZE(X), %xmm11 + + decq %rax + jle .L32 + + ALIGN_3 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -4 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps 0 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm7, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps 4 * SIZE(X), %xmm6 + + SHUFPD_1 %xmm8, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps 8 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm9, %xmm8 + mulps -16 * SIZE(Y), %xmm8 + addps %xmm8, %xmm0 + movaps 12 * SIZE(X), %xmm8 + + SHUFPD_1 %xmm10, %xmm9 + mulps -12 * SIZE(Y), %xmm9 + addps %xmm9, %xmm1 + movaps 16 * SIZE(X), %xmm9 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm11, %xmm10 + mulps -8 * SIZE(Y), %xmm10 + addps %xmm10, %xmm2 + movaps 20 * SIZE(X), %xmm10 + + SHUFPD_1 %xmm4, %xmm11 + mulps -4 * SIZE(Y), %xmm11 + addps %xmm11, %xmm3 + movaps 24 * SIZE(X), %xmm11 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L31 + ALIGN_3 + +.L32: + SHUFPD_1 %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -4 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + SHUFPD_1 %xmm7, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + + SHUFPD_1 %xmm8, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + SHUFPD_1 %xmm9, %xmm8 + mulps -16 * SIZE(Y), %xmm8 + addps %xmm8, %xmm0 + + SHUFPD_1 %xmm10, %xmm9 + mulps -12 * SIZE(Y), %xmm9 + addps %xmm9, %xmm1 + + SHUFPD_1 %xmm11, %xmm10 + mulps -8 * SIZE(Y), %xmm10 + addps %xmm10, %xmm2 + + SHUFPD_1 %xmm4, %xmm11 + mulps -4 * SIZE(Y), %xmm11 + addps %xmm11, %xmm3 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L34: + testq $31, N + jle .L999 + + testq $16, N + jle .L35 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + movaps -24 * SIZE(X), %xmm7 + + SHUFPD_1 %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + SHUFPD_1 %xmm7, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + + SHUFPD_1 %xmm4, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L35: + testq $8, N + jle .L36 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + + SHUFPD_1 %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + SHUFPD_1 %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movapd %xmm6, %xmm4 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L36: + testq $4, N + jle .L37 + + movaps -32 * SIZE(X), %xmm5 + + SHUFPD_1 %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps %xmm5, %xmm4 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L37: + testq $2, N + jle .L38 + + xorps %xmm5, %xmm5 + movhlps %xmm4, %xmm5 + + mulps -32 * SIZE(Y), %xmm5 + addps %xmm5, %xmm0 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L38: + testq $1, N + jle .L999 + + movss -34 * SIZE(X), %xmm4 + mulss -32 * SIZE(Y), %xmm4 + addss %xmm4, %xmm0 + jmp .L999 + ALIGN_3 + +.L40: + movaps -35 * SIZE(X), %xmm4 + addq $SIZE, X + + movq N, %rax + sarq $5, %rax + jle .L44 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + movaps -24 * SIZE(X), %xmm7 + + movaps -20 * SIZE(X), %xmm8 + movaps -16 * SIZE(X), %xmm9 + movaps -12 * SIZE(X), %xmm10 + movaps -8 * SIZE(X), %xmm11 + + decq %rax + jle .L42 + + ALIGN_3 + +.L41: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -4 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps 0 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps 4 * SIZE(X), %xmm6 + + movss %xmm8, %xmm7 + shufps $0x93, %xmm8, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps 8 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm9, %xmm8 + shufps $0x93, %xmm9, %xmm8 + mulps -16 * SIZE(Y), %xmm8 + addps %xmm8, %xmm0 + movaps 12 * SIZE(X), %xmm8 + + movss %xmm10, %xmm9 + shufps $0x93, %xmm10, %xmm9 + mulps -12 * SIZE(Y), %xmm9 + addps %xmm9, %xmm1 + movaps 16 * SIZE(X), %xmm9 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm11, %xmm10 + shufps $0x93, %xmm11, %xmm10 + mulps -8 * SIZE(Y), %xmm10 + addps %xmm10, %xmm2 + movaps 20 * SIZE(X), %xmm10 + + movss %xmm4, %xmm11 + shufps $0x93, %xmm4, %xmm11 + mulps -4 * SIZE(Y), %xmm11 + addps %xmm11, %xmm3 + movaps 24 * SIZE(X), %xmm11 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L41 + ALIGN_3 + +.L42: + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -4 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + + movss %xmm8, %xmm7 + shufps $0x93, %xmm8, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + movss %xmm9, %xmm8 + shufps $0x93, %xmm9, %xmm8 + mulps -16 * SIZE(Y), %xmm8 + addps %xmm8, %xmm0 + + movss %xmm10, %xmm9 + shufps $0x93, %xmm10, %xmm9 + mulps -12 * SIZE(Y), %xmm9 + addps %xmm9, %xmm1 + + movss %xmm11, %xmm10 + shufps $0x93, %xmm11, %xmm10 + mulps -8 * SIZE(Y), %xmm10 + addps %xmm10, %xmm2 + + movss %xmm4, %xmm11 + shufps $0x93, %xmm4, %xmm11 + mulps -4 * SIZE(Y), %xmm11 + addps %xmm11, %xmm3 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L44: + testq $31, N + jle .L999 + + testq $16, N + jle .L45 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + movaps -24 * SIZE(X), %xmm7 + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + + movss %xmm4, %xmm7 + shufps $0x93, %xmm4, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L45: + testq $8, N + jle .L46 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm6, %xmm4 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L46: + testq $4, N + jle .L47 + + movaps -32 * SIZE(X), %xmm5 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm2 + movaps %xmm5, %xmm4 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L47: + testq $2, N + jle .L48 + + movaps -32 * SIZE(X), %xmm5 +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(Y), %xmm8 + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + + mulps %xmm8, %xmm4 + addps %xmm4, %xmm3 + movlhps %xmm5, %xmm4 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L48: + testq $1, N + jle .L999 + + pshufd $0x93, %xmm4, %xmm4 + mulss -32 * SIZE(Y), %xmm4 + addss %xmm4, %xmm0 + jmp .L999 + ALIGN_4 + +#else + movq N, %rax + sarq $5, %rax + jle .L24 + + movlps -32 * SIZE(X), %xmm4 + movhps -30 * SIZE(X), %xmm4 + movlps -28 * SIZE(X), %xmm5 + movhps -26 * SIZE(X), %xmm5 + movlps -24 * SIZE(X), %xmm6 + movhps -22 * SIZE(X), %xmm6 + movlps -20 * SIZE(X), %xmm7 + movhps -18 * SIZE(X), %xmm7 + + movlps -16 * SIZE(X), %xmm8 + movhps -14 * SIZE(X), %xmm8 + movlps -12 * SIZE(X), %xmm9 + movhps -10 * SIZE(X), %xmm9 + movlps -8 * SIZE(X), %xmm10 + movhps -6 * SIZE(X), %xmm10 + movlps -4 * SIZE(X), %xmm11 + movhps -2 * SIZE(X), %xmm11 + + decq %rax + jle .L22 + + ALIGN_3 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movlps 0 * SIZE(X), %xmm4 + movhps 2 * SIZE(X), %xmm4 + + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movlps 4 * SIZE(X), %xmm5 + movhps 6 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movlps 8 * SIZE(X), %xmm6 + movhps 10 * SIZE(X), %xmm6 + + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movlps 12 * SIZE(X), %xmm7 + movhps 14 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulps -16 * SIZE(Y), %xmm8 + addps %xmm8, %xmm0 + movlps 16 * SIZE(X), %xmm8 + movhps 18 * SIZE(X), %xmm8 + + mulps -12 * SIZE(Y), %xmm9 + addps %xmm9, %xmm1 + movlps 20 * SIZE(X), %xmm9 + movhps 22 * SIZE(X), %xmm9 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulps -8 * SIZE(Y), %xmm10 + addps %xmm10, %xmm2 + movlps 24 * SIZE(X), %xmm10 + movhps 26 * SIZE(X), %xmm10 + + mulps -4 * SIZE(Y), %xmm11 + addps %xmm11, %xmm3 + movlps 28 * SIZE(X), %xmm11 + movhps 30 * SIZE(X), %xmm11 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L21 + ALIGN_3 + +.L22: + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + mulps -16 * SIZE(Y), %xmm8 + addps %xmm8, %xmm0 + mulps -12 * SIZE(Y), %xmm9 + addps %xmm9, %xmm1 + mulps -8 * SIZE(Y), %xmm10 + addps %xmm10, %xmm2 + mulps -4 * SIZE(Y), %xmm11 + addps %xmm11, %xmm3 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L24: + testq $31, N + jle .L999 + + testq $16, N + jle .L25 + + movlps -32 * SIZE(X), %xmm4 + movhps -30 * SIZE(X), %xmm4 + movlps -28 * SIZE(X), %xmm5 + movhps -26 * SIZE(X), %xmm5 + movlps -24 * SIZE(X), %xmm6 + movhps -22 * SIZE(X), %xmm6 + movlps -20 * SIZE(X), %xmm7 + movhps -18 * SIZE(X), %xmm7 + + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L25: + testq $8, N + jle .L26 + + movlps -32 * SIZE(X), %xmm4 + movhps -30 * SIZE(X), %xmm4 + movlps -28 * SIZE(X), %xmm5 + movhps -26 * SIZE(X), %xmm5 + + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L26: + testq $4, N + jle .L27 + + movlps -32 * SIZE(X), %xmm4 + movhps -30 * SIZE(X), %xmm4 + mulps -32 * SIZE(Y), %xmm4 + + addps %xmm4, %xmm2 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L27: + testq $2, N + jle .L28 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(Y), %xmm8 + + mulps %xmm8, %xmm4 + addps %xmm4, %xmm3 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L28: + testq $1, N + jle .L999 + + movss -32 * SIZE(X), %xmm4 + mulss -32 * SIZE(Y), %xmm4 + addss %xmm4, %xmm0 + jmp .L999 + ALIGN_3 +#endif + + +.L50: + movq N, %rax + sarq $2, %rax + jle .L55 + ALIGN_3 + +.L53: + movss 0 * SIZE(X), %xmm4 + addq INCX, X + mulss 0 * SIZE(Y), %xmm4 + addq INCY, Y + movss 0 * SIZE(X), %xmm5 + addq INCX, X + mulss 0 * SIZE(Y), %xmm5 + addq INCY, Y + movss 0 * SIZE(X), %xmm6 + addq INCX, X + mulss 0 * SIZE(Y), %xmm6 + addq INCY, Y + movss 0 * SIZE(X), %xmm7 + addq INCX, X + mulss 0 * SIZE(Y), %xmm7 + addq INCY, Y + + addss %xmm4, %xmm0 + addss %xmm5, %xmm1 + addss %xmm6, %xmm2 + addss %xmm7, %xmm3 + + decq %rax + jg .L53 + ALIGN_3 + +.L55: + movq N, %rax + andq $3, %rax + jle .L999 + ALIGN_3 + +.L56: + movss 0 * SIZE(X), %xmm4 + addq INCX, X + mulss 0 * SIZE(Y), %xmm4 + addq INCY, Y + addss %xmm4, %xmm0 + decq %rax + jg .L56 + ALIGN_3 + +.L999: + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + +#ifndef HAVE_SSE3 + movhlps %xmm0, %xmm1 + addps %xmm1, %xmm0 + + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + addss %xmm1, %xmm0 +#else + haddps %xmm0, %xmm0 + haddps %xmm0, %xmm0 +#endif + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/dot_sse2.S b/kernel/x86_64/dot_sse2.S new file mode 100644 index 0000000..875bf4e --- /dev/null +++ b/kernel/x86_64/dot_sse2.S @@ -0,0 +1,714 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY +#endif + + SAVEREGISTERS + + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + cmpq $0, N + jle .L999 + + cmpq $SIZE, INCX + jne .L50 + cmpq $SIZE, INCY + jne .L50 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + testq $SIZE, Y + je .L10 + + movsd -16 * SIZE(X), %xmm0 + mulsd -16 * SIZE(Y), %xmm0 + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq N + ALIGN_2 + +.L10: + testq $SIZE, X + jne .L20 + + movq N, %rax + sarq $4, %rax + jle .L14 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + movaps -8 * SIZE(X), %xmm8 + movaps -6 * SIZE(X), %xmm9 + movaps -4 * SIZE(X), %xmm10 + movaps -2 * SIZE(X), %xmm11 + + decq %rax + jle .L12 + + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + movaps 2 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + movaps 4 * SIZE(X), %xmm6 + + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + movaps 6 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulpd -8 * SIZE(Y), %xmm8 + addpd %xmm8, %xmm0 + movaps 8 * SIZE(X), %xmm8 + + mulpd -6 * SIZE(Y), %xmm9 + addpd %xmm9, %xmm1 + movaps 10 * SIZE(X), %xmm9 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulpd -4 * SIZE(Y), %xmm10 + addpd %xmm10, %xmm2 + movaps 12 * SIZE(X), %xmm10 + + mulpd -2 * SIZE(Y), %xmm11 + addpd %xmm11, %xmm3 + movaps 14 * SIZE(X), %xmm11 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + decq %rax + jg .L11 + ALIGN_3 + +.L12: + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + + mulpd -8 * SIZE(Y), %xmm8 + addpd %xmm8, %xmm0 + mulpd -6 * SIZE(Y), %xmm9 + addpd %xmm9, %xmm1 + mulpd -4 * SIZE(Y), %xmm10 + addpd %xmm10, %xmm2 + mulpd -2 * SIZE(Y), %xmm11 + addpd %xmm11, %xmm3 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + +.L14: + testq $15, N + jle .L999 + + testq $8, N + jle .L15 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L15: + testq $4, N + jle .L16 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L16: + testq $2, N + jle .L17 + + movaps -16 * SIZE(X), %xmm4 + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L17: + testq $1, N + jle .L999 + + movsd -16 * SIZE(X), %xmm4 + mulsd -16 * SIZE(Y), %xmm4 + addsd %xmm4, %xmm0 + jmp .L999 + ALIGN_3 + +.L20: + +#ifdef ALIGNED_ACCESS + + movhps -16 * SIZE(X), %xmm4 + addq $SIZE, X + + movq N, %rax + sarq $4, %rax + jle .L24 + + movaps -16 * SIZE(X), %xmm5 + movaps -14 * SIZE(X), %xmm6 + movaps -12 * SIZE(X), %xmm7 + movaps -10 * SIZE(X), %xmm8 + + movaps -8 * SIZE(X), %xmm9 + movaps -6 * SIZE(X), %xmm10 + movaps -4 * SIZE(X), %xmm11 + + decq %rax + jle .L22 + + ALIGN_3 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm5, %xmm4 + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movaps -2 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + movaps 0 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm7, %xmm6 + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + movaps 2 * SIZE(X), %xmm6 + + SHUFPD_1 %xmm8, %xmm7 + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + movaps 4 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm9, %xmm8 + mulpd -8 * SIZE(Y), %xmm8 + addpd %xmm8, %xmm0 + movaps 6 * SIZE(X), %xmm8 + + SHUFPD_1 %xmm10, %xmm9 + mulpd -6 * SIZE(Y), %xmm9 + addpd %xmm9, %xmm1 + movaps 8 * SIZE(X), %xmm9 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm11, %xmm10 + mulpd -4 * SIZE(Y), %xmm10 + addpd %xmm10, %xmm2 + movaps 10 * SIZE(X), %xmm10 + + SHUFPD_1 %xmm4, %xmm11 + mulpd -2 * SIZE(Y), %xmm11 + addpd %xmm11, %xmm3 + movaps 12 * SIZE(X), %xmm11 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + decq %rax + jg .L21 + ALIGN_3 + +.L22: + SHUFPD_1 %xmm5, %xmm4 + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movaps -2 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + + SHUFPD_1 %xmm7, %xmm6 + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + + SHUFPD_1 %xmm8, %xmm7 + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + + SHUFPD_1 %xmm9, %xmm8 + mulpd -8 * SIZE(Y), %xmm8 + addpd %xmm8, %xmm0 + + SHUFPD_1 %xmm10, %xmm9 + mulpd -6 * SIZE(Y), %xmm9 + addpd %xmm9, %xmm1 + + SHUFPD_1 %xmm11, %xmm10 + mulpd -4 * SIZE(Y), %xmm10 + addpd %xmm10, %xmm2 + + SHUFPD_1 %xmm4, %xmm11 + mulpd -2 * SIZE(Y), %xmm11 + addpd %xmm11, %xmm3 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + +.L24: + testq $15, N + jle .L999 + + testq $8, N + jle .L25 + + movaps -16 * SIZE(X), %xmm5 + movaps -14 * SIZE(X), %xmm6 + movaps -12 * SIZE(X), %xmm7 + + SHUFPD_1 %xmm5, %xmm4 + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movaps -10 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + + SHUFPD_1 %xmm7, %xmm6 + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + + SHUFPD_1 %xmm4, %xmm7 + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L25: + testq $4, N + jle .L26 + + movaps -16 * SIZE(X), %xmm5 + movaps -14 * SIZE(X), %xmm6 + + SHUFPD_1 %xmm5, %xmm4 + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + SHUFPD_1 %xmm6, %xmm5 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + movapd %xmm6, %xmm4 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L26: + testq $2, N + jle .L27 + + movaps -16 * SIZE(X), %xmm5 + + SHUFPD_1 %xmm5, %xmm4 + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movapd %xmm5, %xmm4 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L27: + testq $1, N + jle .L999 + + SHUFPD_1 %xmm4, %xmm4 + mulsd -16 * SIZE(Y), %xmm4 + addsd %xmm4, %xmm0 + jmp .L999 + ALIGN_3 + +#else + + movq N, %rax + sarq $4, %rax + jle .L24 + + movlps -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + movlps -14 * SIZE(X), %xmm5 + movhps -13 * SIZE(X), %xmm5 + movlps -12 * SIZE(X), %xmm6 + movhps -11 * SIZE(X), %xmm6 + movlps -10 * SIZE(X), %xmm7 + movhps -9 * SIZE(X), %xmm7 + + movlps -8 * SIZE(X), %xmm8 + movhps -7 * SIZE(X), %xmm8 + movlps -6 * SIZE(X), %xmm9 + movhps -5 * SIZE(X), %xmm9 + movlps -4 * SIZE(X), %xmm10 + movhps -3 * SIZE(X), %xmm10 + movlps -2 * SIZE(X), %xmm11 + movhps -1 * SIZE(X), %xmm11 + + decq %rax + jle .L22 + + ALIGN_3 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + movlps 2 * SIZE(X), %xmm5 + movhps 3 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + movlps 4 * SIZE(X), %xmm6 + movhps 5 * SIZE(X), %xmm6 + + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + movlps 6 * SIZE(X), %xmm7 + movhps 7 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulpd -8 * SIZE(Y), %xmm8 + addpd %xmm8, %xmm0 + movlps 8 * SIZE(X), %xmm8 + movhps 9 * SIZE(X), %xmm8 + + mulpd -6 * SIZE(Y), %xmm9 + addpd %xmm9, %xmm1 + movlps 10 * SIZE(X), %xmm9 + movhps 11 * SIZE(X), %xmm9 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulpd -4 * SIZE(Y), %xmm10 + addpd %xmm10, %xmm2 + movlps 12 * SIZE(X), %xmm10 + movhps 13 * SIZE(X), %xmm10 + + mulpd -2 * SIZE(Y), %xmm11 + addpd %xmm11, %xmm3 + movlps 14 * SIZE(X), %xmm11 + movhps 15 * SIZE(X), %xmm11 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + decq %rax + jg .L21 + ALIGN_3 + +.L22: + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + + mulpd -8 * SIZE(Y), %xmm8 + addpd %xmm8, %xmm0 + mulpd -6 * SIZE(Y), %xmm9 + addpd %xmm9, %xmm1 + mulpd -4 * SIZE(Y), %xmm10 + addpd %xmm10, %xmm2 + mulpd -2 * SIZE(Y), %xmm11 + addpd %xmm11, %xmm3 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + +.L24: + testq $15, N + jle .L999 + + testq $8, N + jle .L25 + + movlps -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + movlps -14 * SIZE(X), %xmm5 + movhps -13 * SIZE(X), %xmm5 + movlps -12 * SIZE(X), %xmm6 + movhps -11 * SIZE(X), %xmm6 + movlps -10 * SIZE(X), %xmm7 + movhps -9 * SIZE(X), %xmm7 + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L25: + testq $4, N + jle .L26 + + movlps -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + movlps -14 * SIZE(X), %xmm5 + movhps -13 * SIZE(X), %xmm5 + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L26: + testq $2, N + jle .L27 + + movlps -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L27: + testq $1, N + jle .L999 + + movsd -16 * SIZE(X), %xmm4 + mulsd -16 * SIZE(Y), %xmm4 + addsd %xmm4, %xmm0 + jmp .L999 + ALIGN_3 +#endif + +.L50: + movq N, %rax + sarq $2, %rax + jle .L55 + ALIGN_3 + +.L53: + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + mulsd 0 * SIZE(Y), %xmm4 + addq INCY, Y + movsd 0 * SIZE(X), %xmm5 + addq INCX, X + mulsd 0 * SIZE(Y), %xmm5 + addq INCY, Y + movsd 0 * SIZE(X), %xmm6 + addq INCX, X + mulsd 0 * SIZE(Y), %xmm6 + addq INCY, Y + movsd 0 * SIZE(X), %xmm7 + addq INCX, X + mulsd 0 * SIZE(Y), %xmm7 + addq INCY, Y + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + addsd %xmm6, %xmm2 + addsd %xmm7, %xmm3 + + decq %rax + jg .L53 + ALIGN_3 + +.L55: + movq N, %rax + andq $3, %rax + jle .L999 + ALIGN_3 + +.L56: + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + mulsd 0 * SIZE(Y), %xmm4 + addq INCY, Y + addsd %xmm4, %xmm0 + decq %rax + jg .L56 + ALIGN_3 + +.L999: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#ifndef HAVE_SSE3 + pshufd $0xe, %xmm0, %xmm1 + addsd %xmm1, %xmm0 +#else + haddpd %xmm0, %xmm0 +#endif + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_beta.S b/kernel/x86_64/gemm_beta.S new file mode 100644 index 0000000..461df50 --- /dev/null +++ b/kernel/x86_64/gemm_beta.S @@ -0,0 +1,239 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI + +#define M ARG1 +#define N ARG2 +#define C ARG3 +#define LDC ARG4 +#define C1 ARG5 + +#define STACK_C 16(%rsp) +#define STACK_LDC 24(%rsp) + +#else + +#define STACKSIZE 256 + +#define M ARG1 +#define N ARG2 +#define C ARG3 +#define LDC ARG4 +#define C1 %r10 + +#define STACK_C 72 + STACKSIZE(%rsp) +#define STACK_LDC 80 + STACKSIZE(%rsp) + +#endif + +#define I %rax + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + movups %xmm7, 16(%rsp) + movups %xmm8, 32(%rsp) + movups %xmm9, 48(%rsp) + movups %xmm10, 64(%rsp) + movups %xmm11, 80(%rsp) + movups %xmm12, 96(%rsp) + movups %xmm13, 112(%rsp) + movups %xmm14, 128(%rsp) + movups %xmm15, 144(%rsp) + + movaps %xmm3, %xmm0 +#endif + + movq STACK_C, C + movq STACK_LDC, LDC + + pxor %xmm1, %xmm1 + + test M, M + jle .L999 + test N, N + jle .L999 + +#ifdef DOUBLE + ucomisd %xmm1, %xmm0 +#else + ucomiss %xmm1, %xmm0 +#endif + jne .L201 + ALIGN_4 + +.L101: + movq C, C1 + leaq (C, LDC, SIZE), C + + movq M, I + sarq $3, I + jle .L103 + ALIGN_4 + +.L102: +#ifdef OPTERON + prefetchw 32 * SIZE(C1) +#endif + + MOVSD %xmm0, 0 * SIZE(C1) + MOVSD %xmm0, 1 * SIZE(C1) + MOVSD %xmm0, 2 * SIZE(C1) + MOVSD %xmm0, 3 * SIZE(C1) + MOVSD %xmm0, 4 * SIZE(C1) + MOVSD %xmm0, 5 * SIZE(C1) + MOVSD %xmm0, 6 * SIZE(C1) + MOVSD %xmm0, 7 * SIZE(C1) + addq $8 * SIZE, C1 + decq I + jg .L102 + ALIGN_4 + +.L103: + movq M, I + andq $7, I + jle .L105 + ALIGN_4 + +.L104: + MOVSD %xmm0, 0 * SIZE(C1) + addq $SIZE, C1 + decq I + jg .L104 + ALIGN_4 + +.L105: + decq N + jg .L101 + jmp .L999 + ALIGN_3 + +.L201: + movq C, C1 # c_offset = c + leaq (C, LDC, SIZE), C # c += ldc + movq M, I + sarq $3, I + jle .L203 + ALIGN_4 + +.L202: +#ifdef OPTERON + prefetchw 32 * SIZE(C1) +#endif + + MOVSD 0 * SIZE(C1), %xmm8 + MOVSD 1 * SIZE(C1), %xmm9 + MOVSD 2 * SIZE(C1), %xmm10 + MOVSD 3 * SIZE(C1), %xmm11 + MOVSD 4 * SIZE(C1), %xmm12 + MOVSD 5 * SIZE(C1), %xmm13 + MOVSD 6 * SIZE(C1), %xmm14 + MOVSD 7 * SIZE(C1), %xmm15 + + MULSD %xmm0, %xmm8 + MULSD %xmm0, %xmm9 + MULSD %xmm0, %xmm10 + MULSD %xmm0, %xmm11 + MULSD %xmm0, %xmm12 + MULSD %xmm0, %xmm13 + MULSD %xmm0, %xmm14 + MULSD %xmm0, %xmm15 + + MOVSD %xmm8, 0 * SIZE(C1) + MOVSD %xmm9, 1 * SIZE(C1) + MOVSD %xmm10, 2 * SIZE(C1) + MOVSD %xmm11, 3 * SIZE(C1) + MOVSD %xmm12, 4 * SIZE(C1) + MOVSD %xmm13, 5 * SIZE(C1) + MOVSD %xmm14, 6 * SIZE(C1) + MOVSD %xmm15, 7 * SIZE(C1) + + addq $8 * SIZE, C1 + decq I + jg .L202 + ALIGN_4 + +.L203: + movq M, I + andq $7, I + jle .L205 + ALIGN_4 + +.L204: + MOVSD 0 * SIZE(C1), %xmm8 + MULSD %xmm0, %xmm8 + MOVSD %xmm8, 0 * SIZE(C1) + addq $SIZE, C1 + decq I + jg .L204 + ALIGN_4 + +.L205: + decq N + jg .L201 + ALIGN_3 + +.L999: + xorq %rax, %rax + +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + movups 16(%rsp), %xmm7 + movups 32(%rsp), %xmm8 + movups 48(%rsp), %xmm9 + movups 64(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + movups 96(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups 128(%rsp), %xmm14 + movups 144(%rsp), %xmm15 + + addq $STACKSIZE, %rsp +#endif + + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_2x8_nehalem.S b/kernel/x86_64/gemm_kernel_2x8_nehalem.S new file mode 100644 index 0000000..24e66d7 --- /dev/null +++ b/kernel/x86_64/gemm_kernel_2x8_nehalem.S @@ -0,0 +1,1849 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#define INC32 %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA 48(%rsp) +#define J 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define ALPHA 224(%rsp) +#define J 232(%rsp) +#define OFFSET 240(%rsp) +#define KK 248(%rsp) +#define KKK 256(%rsp) + +#endif + +#define PREFETCHSIZE 4 +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + +#endif + + movlps %xmm0, ALPHA + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + leaq (, LDC, SIZE), LDC + +#ifdef TRMMKERNEL + movq %r11, OFFSET +#ifndef LEFT + negq %r11 +#endif + movq %r11, KK +#endif + + movq N, J + sarq $3, J + NOBRANCH + jle .L30 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 4), CO2 + movq A, AO + + movq K, %rax + salq $BASE_SHIFT + 3, %rax + leaq (B, %rax), BB + + movq M, I + sarq $1, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: + prefetcht2 -16 * SIZE(BB) + subq $-8 * SIZE, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + PADDING + xorps %xmm4, %xmm4 + + leaq (LDC, LDC, 2), %rax + + PADDING + xorps %xmm8, %xmm8 + prefetcht0 1 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 3 * SIZE(CO1, LDC, 1) + PADDING + xorps %xmm10, %xmm10 + prefetcht0 1 * SIZE(CO1, LDC, 2) + PADDING + xorps %xmm11, %xmm11 + prefetcht0 3 * SIZE(CO1, %rax, 1) + + movaps -16 * SIZE(AO), %xmm0 + + PADDING + xorps %xmm12, %xmm12 + prefetcht0 1 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht0 3 * SIZE(CO2, LDC, 1) + xorps %xmm14, %xmm14 + prefetcht0 1 * SIZE(CO2, LDC, 2) + xorps %xmm15, %xmm15 + prefetcht0 3 * SIZE(CO2, %rax, 1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $8, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + movaps -14 * SIZE(AO), %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm1, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm14 + movaps -6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm12 + movaps 0 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps 2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm1, %xmm8 + movaps 4 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps 6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + movaps -10 * SIZE(AO), %xmm5 + mulpd %xmm0, %xmm4 + + addpd %xmm1, %xmm12 + movaps 8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm14 + movaps 10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm5, %xmm3 + PADDING; + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm8 + movaps 12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + PADDING; + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm10 + movaps 14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm5, %xmm3 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm5, %xmm4 + + subq $-32 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addpd %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L18: + addpd %xmm1, %xmm12 + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + mulpd %xmm7, %xmm8 + shufpd $2, %xmm0, %xmm9 + mulpd %xmm7, %xmm9 + + addpd %xmm2, %xmm13 + movaps %xmm10, %xmm0 + shufpd $2, %xmm11, %xmm10 + mulpd %xmm7, %xmm10 + shufpd $2, %xmm0, %xmm11 + mulpd %xmm7, %xmm11 + + addpd %xmm3, %xmm14 + movaps %xmm12, %xmm0 + shufpd $2, %xmm13, %xmm12 + mulpd %xmm7, %xmm12 + shufpd $2, %xmm0, %xmm13 + mulpd %xmm7, %xmm13 + + addpd %xmm4, %xmm15 + movaps %xmm14, %xmm0 + shufpd $2, %xmm15, %xmm14 + mulpd %xmm7, %xmm14 + shufpd $2, %xmm0, %xmm15 + mulpd %xmm7, %xmm15 + + movq CO1, %rax + orq LDC, %rax + testq $15, %rax + NOBRANCH + jne .L18x + + leaq (LDC, LDC, 2), %rax + +#ifndef TRMMKERNEL + movups (CO1), %xmm0 + movups (CO1, LDC, 1), %xmm1 + movups (CO1, LDC, 2), %xmm2 + movups (CO1, %rax, 1), %xmm3 + + movups (CO2), %xmm4 + movups (CO2, LDC, 1), %xmm5 + movups (CO2, LDC, 2), %xmm6 + movups (CO2, %rax, 1), %xmm7 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm9 + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 + + addpd %xmm4, %xmm12 + addpd %xmm5, %xmm13 + addpd %xmm6, %xmm14 + addpd %xmm7, %xmm15 +#endif + + movaps %xmm8, (CO1) + movaps %xmm9, (CO1, LDC, 1) + movaps %xmm10, (CO1, LDC, 2) + movaps %xmm11, (CO1, %rax, 1) + + movaps %xmm12, (CO2) + movaps %xmm13, (CO2, LDC, 1) + movaps %xmm14, (CO2, LDC, 2) + movaps %xmm15, (CO2, %rax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + decq I + BRANCH + jg .L11 + jmp .L20 + ALIGN_4 + +.L18x: + leaq (LDC, LDC, 2), %rax + +#ifndef TRMMKERNEL + movups (CO1), %xmm0 + movups (CO1, LDC, 1), %xmm1 + movups (CO1, LDC, 2), %xmm2 + movups (CO1, %rax, 1), %xmm3 + movups (CO2), %xmm4 + movups (CO2, LDC, 1), %xmm5 + movups (CO2, LDC, 2), %xmm6 + movups (CO2, %rax, 1), %xmm7 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm9 + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 + addpd %xmm4, %xmm12 + addpd %xmm5, %xmm13 + addpd %xmm6, %xmm14 + addpd %xmm7, %xmm15 +#endif + + movups %xmm8, (CO1) + movups %xmm9, (CO1, LDC, 1) + movups %xmm10, (CO1, LDC, 2) + movups %xmm11, (CO1, %rax, 1) + + movups %xmm12, (CO2) + movups %xmm13, (CO2, LDC, 1) + movups %xmm14, (CO2, LDC, 2) + movups %xmm15, (CO2, %rax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + decq I + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $1, M + BRANCH + jle .L29 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $8, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 0 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps 2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps 4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps 6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps 10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps 12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps 14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 16 * SIZE(BO), %xmm1 + + subq $ -4 * SIZE, AO + subq $-32 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm10 + mulpd %xmm7, %xmm11 + + leaq (LDC, LDC, 2), %rax + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + movhps (CO1, LDC, 1), %xmm0 + movsd (CO1, LDC, 2), %xmm1 + movhps (CO1, %rax, 1), %xmm1 + movsd (CO2), %xmm2 + movhps (CO2, LDC, 1), %xmm2 + movsd (CO2, LDC, 2), %xmm3 + movhps (CO2, %rax, 1), %xmm3 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm9 + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, (CO1, LDC, 1) + movsd %xmm9, (CO1, LDC, 2) + movhps %xmm9, (CO1, %rax, 1) + + movsd %xmm10, (CO2) + movhps %xmm10, (CO2, LDC, 1) + movsd %xmm11, (CO2, LDC, 2) + movhps %xmm11, (CO2, %rax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $8, KK +#endif + + movq BO, B + + leaq (C, LDC, 8), C + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L30: + testq $4, N + jle .L50 + ALIGN_4 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 + movq A, AO + + movq M, I + sarq $1, I + NOBRANCH + jle .L40 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht0 2 * SIZE(CO2) + xorps %xmm11, %xmm11 + prefetcht0 2 * SIZE(CO2, LDC, 1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: + addpd %xmm1, %xmm8 + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm10 + addpd %xmm4, %xmm11 + + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + mulpd %xmm7, %xmm8 + shufpd $2, %xmm0, %xmm9 + mulpd %xmm7, %xmm9 + + movaps %xmm10, %xmm0 + shufpd $2, %xmm11, %xmm10 + mulpd %xmm7, %xmm10 + shufpd $2, %xmm0, %xmm11 + mulpd %xmm7, %xmm11 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + + movsd 0 * SIZE(CO1, LDC, 1), %xmm1 + movhps 1 * SIZE(CO1, LDC, 1), %xmm1 + + movsd 0 * SIZE(CO2), %xmm2 + movhps 1 * SIZE(CO2), %xmm2 + + movsd 0 * SIZE(CO2, LDC, 1), %xmm3 + movhps 1 * SIZE(CO2, LDC, 1), %xmm3 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm9 + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO1, LDC, 1) + movhps %xmm9, 1 * SIZE(CO1, LDC, 1) + + movsd %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 0 * SIZE(CO2, LDC, 1) + movhps %xmm11, 1 * SIZE(CO2, LDC, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + decq I + BRANCH + jg .L31 + ALIGN_4 + +.L40: + testq $1, M + BRANCH + jle .L49 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -4 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 0 * SIZE(BO), %xmm1 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm10, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm11, %xmm9 + mulpd %xmm7, %xmm9 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + movhps (CO1, LDC, 1), %xmm0 + movsd (CO2), %xmm1 + movhps (CO2, LDC, 1), %xmm1 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm9 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, (CO1, LDC, 1) + movsd %xmm9, (CO2) + movhps %xmm9, (CO2, LDC, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + movq BO, B + + leaq (C, LDC, 4), C + ALIGN_4 + +.L50: + testq $2, N + jle .L70 + ALIGN_4 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC), CO2 + movq A, AO + + movq M, I + sarq $1, I + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO2) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm10 + movaps -14 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L52 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + ALIGN_3 + +.L55: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm1, %xmm8 + addpd %xmm2, %xmm9 + + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + mulpd %xmm7, %xmm8 + shufpd $2, %xmm0, %xmm9 + mulpd %xmm7, %xmm9 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm1 + movhps 1 * SIZE(CO2), %xmm1 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm9 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhps %xmm9, 1 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L69 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -10 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -8 * SIZE(BO), %xmm1 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L62 + ALIGN_3 + +.L65: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: + addpd %xmm9, %xmm8 + mulpd %xmm7, %xmm8 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + movhps (CO2), %xmm0 + + addpd %xmm0, %xmm8 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, (CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L69: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + movq BO, B + + leaq (C, LDC, 2), C + ALIGN_4 + +.L70: + testq $1, N + jle .L999 + ALIGN_4 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + movq A, AO + + movq M, I + sarq $1, I + NOBRANCH + jle .L80 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_3 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movddup -16 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm9 + movddup -15 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm9 + movddup -13 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L72 + + addpd %xmm9, %xmm8 + ALIGN_3 + +.L75: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + addpd %xmm1, %xmm8 + movddup -16 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_4 + +.L78: + addpd %xmm1, %xmm8 + mulpd %xmm7, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + + addpd %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + decq I + BRANCH + jg .L71 + ALIGN_4 + +.L80: + testq $1, M + BRANCH + jle .L999 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifndef TRMMKERNEL + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 +#else + movsd -16 * SIZE(AO), %xmm0 + movhps -15 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movsd -16 * SIZE(BO), %xmm1 + movhps -15 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 +#endif + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L85 + ALIGN_3 + +.L82: + mulpd %xmm0, %xmm1 +#ifndef TRMMKERNEL + movaps -14 * SIZE(AO), %xmm0 +#else + movsd -14 * SIZE(AO), %xmm0 + movhps -13 * SIZE(AO), %xmm0 +#endif + addpd %xmm1, %xmm8 +#ifndef TRMMKERNEL + movaps -14 * SIZE(BO), %xmm1 +#else + movsd -14 * SIZE(BO), %xmm1 + movhps -13 * SIZE(BO), %xmm1 +#endif + + mulpd %xmm0, %xmm1 +#ifndef TRMMKERNEL + movaps -12 * SIZE(AO), %xmm0 +#else + movsd -12 * SIZE(AO), %xmm0 + movhps -11 * SIZE(AO), %xmm0 +#endif + addpd %xmm1, %xmm9 +#ifndef TRMMKERNEL + movaps -12 * SIZE(BO), %xmm1 +#else + movsd -12 * SIZE(BO), %xmm1 + movhps -11 * SIZE(BO), %xmm1 +#endif + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L82 + + addpd %xmm9, %xmm8 + ALIGN_3 + +.L85: + movsd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_3 + +.L86: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm8 + movsd -15 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L86 + ALIGN_4 + +.L88: + haddpd %xmm8, %xmm8 + mulsd %xmm7, %xmm8 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + + addsd %xmm0, %xmm8 +#endif + + movsd %xmm8, (CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_4x2_atom.S b/kernel/x86_64/gemm_kernel_4x2_atom.S new file mode 100644 index 0000000..47b16ce --- /dev/null +++ b/kernel/x86_64/gemm_kernel_4x2_atom.S @@ -0,0 +1,1385 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbx +#define BB %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KKK 64(%rsp) +#define KK 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define ALPHA 224(%rsp) +#define OFFSET 232(%rsp) +#define KK 240(%rsp) +#define KKK 248(%rsp) + +#endif + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 8 + 3) + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + +#endif + + movsd %xmm0, ALPHA + +#ifdef TRMMKERNEL + movsd %xmm4, OFFSET + movsd %xmm4, KK +#ifndef LEFT + negq KK +#endif +#endif + + leaq (, LDC, SIZE), LDC + + movq N, J + sarq $1, J + jle .L40 + ALIGN_4 + +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + leaq (C, LDC, 2), C + + movq A, AO + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + prefetcht0 0 * SIZE(BB) + subq $-8 * SIZE, BB + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + prefetcht0 3 * SIZE(CO1) + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + prefetcht0 3 * SIZE(CO2) + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 12 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 13 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 6 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 14 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 15 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + subq $-16 * SIZE, AO + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + addq $ 8 * SIZE, BO + + addsd %xmm4, %xmm10 + movsd 1 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + decq %rax + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 0 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 2 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 1 * SIZE(BO), %xmm3 + + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + BRANCH + je .L19 + ALIGN_4 + +.L16: + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L16 + ALIGN_4 + +.L19: + movsd ALPHA, %xmm5 + + addsd %xmm2, %xmm13 + mulsd %xmm5, %xmm8 + addsd %xmm7, %xmm14 + mulsd %xmm5, %xmm10 + addsd %xmm6, %xmm15 + mulsd %xmm5, %xmm12 + mulsd %xmm5, %xmm14 + + mulsd %xmm5, %xmm9 + mulsd %xmm5, %xmm11 + mulsd %xmm5, %xmm13 + mulsd %xmm5, %xmm15 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addsd 0 * SIZE(CO1), %xmm8 + addsd 1 * SIZE(CO1), %xmm10 + addsd 2 * SIZE(CO1), %xmm12 + addsd 3 * SIZE(CO1), %xmm14 + + addsd 0 * SIZE(CO2), %xmm9 + addsd 1 * SIZE(CO2), %xmm11 + addsd 2 * SIZE(CO2), %xmm13 + addsd 3 * SIZE(CO2), %xmm15 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm10, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movsd %xmm14, 3 * SIZE(CO1) + + movsd %xmm9, 0 * SIZE(CO2) + movsd %xmm11, 1 * SIZE(CO2) + movsd %xmm13, 2 * SIZE(CO2) + movsd %xmm15, 3 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + jle .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + movsd 3 * SIZE(AO), %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm8 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + addsd %xmm7, %xmm10 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 6 * SIZE(BO), %xmm1 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + addsd %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 8 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm8 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + addsd %xmm7, %xmm10 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 9 * SIZE(BO), %xmm3 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA, %xmm7 + + andq $3, %rax + BRANCH + BRANCH + je .L29 + ALIGN_4 + +.L26: + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + mulsd %xmm3, %xmm2 + addsd %xmm0, %xmm8 + movsd 2 * SIZE(AO), %xmm0 + + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + addsd %xmm4, %xmm10 + movsd 3 * SIZE(AO), %xmm4 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L26 + ALIGN_4 + +.L29: + addsd %xmm2, %xmm9 + mulsd %xmm7, %xmm8 + addsd %xmm6, %xmm11 + mulsd %xmm7, %xmm10 + mulsd %xmm7, %xmm9 + mulsd %xmm7, %xmm11 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addsd 0 * SIZE(CO1), %xmm8 + addsd 1 * SIZE(CO1), %xmm10 + + addsd 0 * SIZE(CO2), %xmm9 + addsd 1 * SIZE(CO2), %xmm11 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm10, 1 * SIZE(CO1) + + movsd %xmm9, 0 * SIZE(CO2) + movsd %xmm11, 1 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L30: + testq $1, M + je .L39 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm7, %xmm7 + movsd 1 * SIZE(AO), %xmm2 + xorps %xmm5, %xmm5 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + sarq $2, %rax + je .L35 + ALIGN_4 + +.L32: + addsd %xmm5, %xmm8 + movsd 2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm7, %xmm9 + movsd 3 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm3 + movsd 2 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 4 * SIZE(BO), %xmm1 + mulsd %xmm2, %xmm5 + + addsd %xmm3, %xmm9 + movsd 5 * SIZE(BO), %xmm3 + mulsd %xmm2, %xmm7 + movsd 3 * SIZE(AO), %xmm2 + + addsd %xmm5, %xmm8 + movsd 6 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + + addsd %xmm7, %xmm9 + movsd 7 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm3 + movsd 4 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 8 * SIZE(BO), %xmm1 + mulsd %xmm2, %xmm5 + + addsd %xmm3, %xmm9 + movsd 9 * SIZE(BO), %xmm3 + mulsd %xmm2, %xmm7 + movsd 5 * SIZE(AO), %xmm2 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + addsd %xmm5, %xmm8 + addsd %xmm7, %xmm9 + movsd ALPHA, %xmm7 + + andq $3, %rax + BRANCH + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulsd %xmm0, %xmm1 + addq $2 * SIZE, BO + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 0 * SIZE(BO), %xmm1 + addsd %xmm3, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + + addq $1 * SIZE, AO + decq %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: + mulsd %xmm7, %xmm8 + mulsd %xmm7, %xmm9 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addsd 0 * SIZE(CO1), %xmm8 + addsd 0 * SIZE(CO2), %xmm9 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movq BO, B + decq J # j -- + jg .L10 + ALIGN_4 + +.L40: + testq $1, N + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + addq LDC, C + + movq A, AO + + movq M, I + sarq $2, I + jle .L50 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(AO), %xmm1 + xorps %xmm11, %xmm11 + movsd 2 * SIZE(AO), %xmm2 + xorps %xmm13, %xmm13 + movsd 3 * SIZE(AO), %xmm3 + xorps %xmm15, %xmm15 + + movsd 0 * SIZE(BO), %xmm4 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm5 + xorps %xmm10, %xmm10 + prefetcht0 3 * SIZE(CO1) + xorps %xmm12, %xmm12 + xorps %xmm14, %xmm14 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + sarq $2, %rax + je .L45 + ALIGN_4 + +.L42: + addsd %xmm9, %xmm8 + movsd 4 * SIZE(AO), %xmm9 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm11, %xmm10 + movsd 5 * SIZE(AO), %xmm11 + mulsd %xmm4, %xmm1 + + addsd %xmm13, %xmm12 + movsd 6 * SIZE(AO), %xmm13 + mulsd %xmm4, %xmm2 + + addsd %xmm15, %xmm14 + movsd 7 * SIZE(AO), %xmm15 + mulsd %xmm4, %xmm3 + movsd 2 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm9 + + addsd %xmm1, %xmm10 + movsd 9 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm11 + + addsd %xmm2, %xmm12 + movsd 10 * SIZE(AO), %xmm2 + mulsd %xmm5, %xmm13 + + addsd %xmm3, %xmm14 + movsd 11 * SIZE(AO), %xmm3 + mulsd %xmm5, %xmm15 + movsd 3 * SIZE(BO), %xmm5 + + addsd %xmm9, %xmm8 + movsd 12 * SIZE(AO), %xmm9 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + addsd %xmm11, %xmm10 + movsd 13 * SIZE(AO), %xmm11 + mulsd %xmm4, %xmm1 + + addsd %xmm13, %xmm12 + movsd 14 * SIZE(AO), %xmm13 + mulsd %xmm4, %xmm2 + + addsd %xmm15, %xmm14 + movsd 15 * SIZE(AO), %xmm15 + mulsd %xmm4, %xmm3 + movsd 4 * SIZE(BO), %xmm4 + subq $-16 * SIZE, AO + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm9 + + addsd %xmm1, %xmm10 + movsd 1 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm11 + addq $ 4 * SIZE, BO + + addsd %xmm2, %xmm12 + movsd 2 * SIZE(AO), %xmm2 + mulsd %xmm5, %xmm13 + decq %rax + + addsd %xmm3, %xmm14 + movsd 3 * SIZE(AO), %xmm3 + mulsd %xmm5, %xmm15 + movsd 1 * SIZE(BO), %xmm5 + + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + movsd ALPHA, %xmm7 + + addsd %xmm9, %xmm8 + addsd %xmm11, %xmm10 + addsd %xmm13, %xmm12 + addsd %xmm15, %xmm14 + + andq $3, %rax + BRANCH + BRANCH + je .L49 + ALIGN_4 + +.L46: + mulsd %xmm4, %xmm0 + mulsd %xmm4, %xmm1 + mulsd %xmm4, %xmm2 + mulsd %xmm4, %xmm3 + movsd 1 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm10 + movsd 5 * SIZE(AO), %xmm1 + addsd %xmm2, %xmm12 + movsd 6 * SIZE(AO), %xmm2 + addsd %xmm3, %xmm14 + movsd 7 * SIZE(AO), %xmm3 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + decq %rax + BRANCH + jg .L46 + ALIGN_4 + +.L49: + mulsd %xmm7, %xmm8 + mulsd %xmm7, %xmm10 + mulsd %xmm7, %xmm12 + mulsd %xmm7, %xmm14 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addsd 0 * SIZE(CO1), %xmm8 + addsd 1 * SIZE(CO1), %xmm10 + addsd 2 * SIZE(CO1), %xmm12 + addsd 3 * SIZE(CO1), %xmm14 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm10, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movsd %xmm14, 3 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + + decq I # i -- + jg .L41 + ALIGN_4 + +.L50: + testq $2, M + jle .L60 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm1 + xorps %xmm3, %xmm3 + + movsd 0 * SIZE(BO), %xmm4 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm5 + xorps %xmm10, %xmm10 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + sarq $2, %rax + je .L55 + ALIGN_4 + +.L52: + addsd %xmm2, %xmm8 + movsd 2 * SIZE(AO), %xmm2 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm3, %xmm10 + movsd 3 * SIZE(AO), %xmm3 + mulsd %xmm4, %xmm1 + movsd 2 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm2 + addq $8 * SIZE, AO + + addsd %xmm1, %xmm10 + movsd -3 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm3 + movsd 3 * SIZE(BO), %xmm5 + + addsd %xmm2, %xmm8 + movsd -2 * SIZE(AO), %xmm2 + mulsd %xmm4, %xmm0 + addq $4 * SIZE, BO + + addsd %xmm3, %xmm10 + movsd -1 * SIZE(AO), %xmm3 + mulsd %xmm4, %xmm1 + movsd 0 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm2 + decq %rax + + addsd %xmm1, %xmm10 + movsd 1 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm3 + movsd 1 * SIZE(BO), %xmm5 + + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA, %xmm7 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm10 + + andq $3, %rax + BRANCH + BRANCH + je .L59 + ALIGN_4 + +.L56: + mulsd %xmm4, %xmm0 + mulsd %xmm4, %xmm1 + movsd 1 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 2 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm10 + movsd 3 * SIZE(AO), %xmm1 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + decq %rax + BRANCH + jg .L56 + ALIGN_4 + +.L59: + mulsd %xmm7, %xmm8 + mulsd %xmm7, %xmm10 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addsd 0 * SIZE(CO1), %xmm8 + addsd 1 * SIZE(CO1), %xmm10 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm10, 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + ALIGN_4 + +.L60: + testq $1, M + je .L999 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm5, %xmm5 + movsd 1 * SIZE(AO), %xmm2 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm9, %xmm9 + movsd 2 * SIZE(AO), %xmm4 + movsd 3 * SIZE(AO), %xmm6 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + sarq $2, %rax + je .L65 + ALIGN_4 + +.L62: + addsd %xmm5, %xmm8 + movsd 2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + movsd 4 * SIZE(AO), %xmm0 + + addsd %xmm7, %xmm9 + movsd 3 * SIZE(BO), %xmm7 + mulsd %xmm2, %xmm3 + movsd 5 * SIZE(AO), %xmm2 + + addsd %xmm1, %xmm8 + movsd 4 * SIZE(BO), %xmm1 + mulsd %xmm4, %xmm5 + movsd 6 * SIZE(AO), %xmm4 + + addsd %xmm3, %xmm9 + movsd 5 * SIZE(BO), %xmm3 + mulsd %xmm6, %xmm7 + movsd 7 * SIZE(AO), %xmm6 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + decq %rax + jne .L62 + + addsd %xmm5, %xmm8 + addsd %xmm7, %xmm9 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA, %xmm7 + + andq $3, %rax + BRANCH + BRANCH + je .L68 + ALIGN_4 + +.L66: + movsd 0 * SIZE(AO), %xmm0 + movsd 0 * SIZE(BO), %xmm1 + + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm8 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + decq %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: + addsd %xmm9, %xmm8 + + mulsd %xmm7, %xmm8 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addsd 0 * SIZE(CO1), %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_4x4_barcelona.S b/kernel/x86_64/gemm_kernel_4x4_barcelona.S new file mode 100644 index 0000000..f7015c0 --- /dev/null +++ b/kernel/x86_64/gemm_kernel_4x4_barcelona.S @@ -0,0 +1,2093 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %r12 +#define BB %rbp +#define J %rbx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define ALPHA 224(%rsp) +#define OFFSET 232(%rsp) +#define KK 240(%rsp) +#define KKK 248(%rsp) + +#endif + +#define movapd movaps +#define movupd movups + +#define KERNEL1(xx) \ + mulpd %xmm1, %xmm0 ;\ + addpd %xmm0, %xmm8 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + +#define KERNEL2(xx) \ + mulpd %xmm1, %xmm0 ;\ + addpd %xmm0, %xmm8 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ +/*A*/ movapd (AO, %rax, 4), %xmm6 ;\ + movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + addpd %xmm1, %xmm14 ;\ +/**/ movddup (BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL3(xx) \ + mulpd %xmm5, %xmm4 ;\ + addpd %xmm4, %xmm8 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ + addpd %xmm5, %xmm14 ;\ + movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL4(xx) \ + mulpd %xmm5, %xmm4 ;\ + addpd %xmm4, %xmm8 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ +/*A*/ movapd 8 * SIZE(AO, %rax, 4), %xmm7 ;\ + movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + addpd %xmm5, %xmm14 ;\ +/**/ movddup 8 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm6, %xmm2 + +#define KERNEL5(xx) \ + mulpd %xmm1, %xmm6 ;\ + addpd %xmm6, %xmm8 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ + movapd %xmm2, %xmm6 ;\ + addpd %xmm1, %xmm12 ;\ + movddup 2 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm6, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 3 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm6 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm10 ;\ + movapd 4 * SIZE(AO, %rax, 4), %xmm6 ;\ + addpd %xmm1, %xmm14 ;\ + movddup 4 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm6, %xmm2 + +#define KERNEL6(xx) \ + mulpd %xmm1, %xmm6 ;\ + addpd %xmm6, %xmm8 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ + movapd %xmm2, %xmm6 ;\ + addpd %xmm1, %xmm12 ;\ +/*A*/ movapd 16 * SIZE(AO, %rax, 4), %xmm0 ;\ + movddup 6 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm6, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 7 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm6 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm10 ;\ + addpd %xmm1, %xmm14 ;\ +/**/ movddup 16 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 9 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm7, %xmm2 + +#define KERNEL7(xx) \ + mulpd %xmm5, %xmm7 ;\ + addpd %xmm7, %xmm8 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ + movapd %xmm2, %xmm7 ;\ + addpd %xmm5, %xmm12 ;\ + movddup 10 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm7, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 11 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm7 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm10 ;\ + movapd 12 * SIZE(AO, %rax, 4), %xmm7 ;\ + addpd %xmm5, %xmm14 ;\ + movddup 12 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 13 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm7, %xmm2 + +#define KERNEL8(xx) \ + mulpd %xmm5, %xmm7 ;\ + addpd %xmm7, %xmm8 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ + movapd %xmm2, %xmm7 ;\ + addpd %xmm5, %xmm12 ;\ +/*A*/ movapd 24 * SIZE(AO, %rax, 4), %xmm4 ;\ + movddup 14 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm7, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 15 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm7 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm10 ;\ + addpd %xmm5, %xmm14 ;\ +/**/ movddup 24 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 ;\ + addq $8 * SIZE, %rax ;\ + +#define KERNEL_SUB1(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -14 * SIZE(BO), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -13 * SIZE(BO), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd -12 * SIZE(AO), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup -12 * SIZE(BO), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -11 * SIZE(BO), %xmm3 ;\ + movapd %xmm0, %xmm2 + +#define KERNEL_SUB2(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -10 * SIZE(BO), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -9 * SIZE(BO), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd (AO), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup (BO), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -7 * SIZE(BO), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL_SUB3(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -6 * SIZE(BO), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -5 * SIZE(BO), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + movapd -4 * SIZE(AO), %xmm4 ;\ + addpd %xmm5, %xmm14 ;\ + movddup -4 * SIZE(BO), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -3 * SIZE(BO), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL_SUB4(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -2 * SIZE(BO), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -1 * SIZE(BO), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + addpd %xmm5, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 1 * SIZE(BO), %xmm3 ;\ + movapd %xmm0, %xmm2 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + +#ifdef TRMMKERNEL + movsd %xmm12, OFFSET + movsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + ALIGN_4 + +.L01: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 2), CO2 # coffset2 = c + ldc + + leaq (C, LDC, 4), C # c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO # aoffset = a + + movq K, %rax + salq $BASE_SHIFT + 2, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movddup -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + movddup -15 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + movapd -8 * SIZE(AO), %xmm4 + xorps %xmm11, %xmm11 + movddup -8 * SIZE(BO), %xmm5 + xorps %xmm12, %xmm12 + + prefetchw 3 * SIZE(CO1) + xorps %xmm13, %xmm13 + prefetchw 7 * SIZE(CO1, LDC) + xorps %xmm14, %xmm14 + prefetchw 3 * SIZE(CO2) + xorps %xmm15, %xmm15 + prefetchw 7 * SIZE(CO2, LDC) + movapd %xmm0, %xmm2 + + prefetch -16 * SIZE(BB) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_4 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + jl .L12 + ALIGN_4 + +.L15: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + testq $4, %rax + je .L16 + ALIGN_4 + + KERNEL_SUB1(16 * 0) + KERNEL_SUB2(16 * 0) + KERNEL_SUB3(16 * 0) + KERNEL_SUB4(16 * 0) + + subq $-16 * SIZE, BO + subq $-16 * SIZE, AO + ALIGN_4 + +.L16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L19 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L17: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd %xmm2, %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm3, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm9 + movapd %xmm0, %xmm2 + addpd %xmm3, %xmm13 + movddup -13 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm10 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm14 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm3, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm11 + addpd %xmm3, %xmm15 + movddup -11 * SIZE(BO, %rax, 4), %xmm3 + movapd %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L17 + ALIGN_4 + +.L19: + prefetch -8 * SIZE(BB) + subq $-16 * SIZE, BB + +#ifndef TRMMKERNEL + movupd (CO1), %xmm0 + movupd 2 * SIZE(CO1), %xmm1 +#endif + + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm12 +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm12 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movhps %xmm12, 3 * SIZE(CO1) + +#ifndef TRMMKERNEL + movupd (CO1, LDC), %xmm2 + movupd 2 * SIZE(CO1, LDC), %xmm3 +#endif + + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm13 +#ifndef TRMMKERNEL + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm13 +#endif + + movsd %xmm9, (CO1, LDC) + movhps %xmm9, 1 * SIZE(CO1, LDC) + movsd %xmm13, 2 * SIZE(CO1, LDC) + movhps %xmm13, 3 * SIZE(CO1, LDC) + +#ifndef TRMMKERNEL + movupd (CO2), %xmm0 + movupd 2 * SIZE(CO2), %xmm1 +#endif + + mulpd %xmm7, %xmm10 + mulpd %xmm7, %xmm14 +#ifndef TRMMKERNEL + addpd %xmm0, %xmm10 + addpd %xmm1, %xmm14 +#endif + + movsd %xmm10, (CO2) + movhps %xmm10, 1 * SIZE(CO2) + movsd %xmm14, 2 * SIZE(CO2) + movhps %xmm14, 3 * SIZE(CO2) + +#ifndef TRMMKERNEL + movupd (CO2, LDC), %xmm2 + movupd 2 * SIZE(CO2, LDC), %xmm3 +#endif + + mulpd %xmm7, %xmm11 + mulpd %xmm7, %xmm15 +#ifndef TRMMKERNEL + addpd %xmm2, %xmm11 + addpd %xmm3, %xmm15 +#endif + + movsd %xmm11, (CO2, LDC) + movhps %xmm11, 1 * SIZE(CO2, LDC) + movsd %xmm15, 2 * SIZE(CO2, LDC) + movhps %xmm15, 3 * SIZE(CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $3, M + je .L39 + + testq $2, M + je .L30 + ALIGN_4 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movapd -12 * SIZE(AO), %xmm2 + xorps %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + xorps %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm5 + xorps %xmm11, %xmm11 + movddup -8 * SIZE(BO), %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L26 + ALIGN_4 + +.L22: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -13 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -10 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -9 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup (BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -8 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -7 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm8 + movddup -6 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm9 + movddup -5 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm10 + movddup -4 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + movapd -10 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm5, %xmm11 + movddup -3 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm8 + movddup -2 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm9 + movddup -1 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm10 + movddup 8 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + movapd -4 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm5, %xmm11 + movddup 1 * SIZE(BO, %rax, 4), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L22 + ALIGN_4 + +.L26: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L29 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L27: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -13 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 4), %xmm5 + + addq $SIZE, %rax + jl .L27 + ALIGN_4 + +.L29: +#ifndef TRMMKERNEL + movupd (CO1), %xmm0 + movupd (CO1, LDC), %xmm2 + movupd (CO2), %xmm4 + movupd (CO2, LDC), %xmm6 +#endif + + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm10 + mulpd %xmm7, %xmm11 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm6, %xmm11 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, 1 * SIZE(CO1) + movsd %xmm9, (CO1, LDC) + movhps %xmm9, 1 * SIZE(CO1, LDC) + + movsd %xmm10, (CO2) + movhps %xmm10, 1 * SIZE(CO2) + movsd %xmm11, (CO2, LDC) + movhps %xmm11, 1 * SIZE(CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movddup -14 * SIZE(AO), %xmm2 + xorps %xmm9, %xmm9 + movddup -15 * SIZE(AO), %xmm4 + xorps %xmm10, %xmm10 + movapd -16 * SIZE(BO), %xmm1 + xorps %xmm11, %xmm11 + movapd -8 * SIZE(BO), %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L36 + ALIGN_4 + +.L32: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO, %rax, 4), %xmm0 + addpd %xmm1, %xmm8 + movapd -12 * SIZE(BO, %rax, 4), %xmm1 + addpd %xmm0, %xmm9 + movddup -12 * SIZE(AO, %rax, 1), %xmm0 + mulpd %xmm4, %xmm1 + mulpd -10 * SIZE(BO, %rax, 4), %xmm4 + addpd %xmm1, %xmm10 + movapd (BO, %rax, 4), %xmm1 + addpd %xmm4, %xmm11 + movddup -11 * SIZE(AO, %rax, 1), %xmm4 + mulpd %xmm2, %xmm3 + mulpd -6 * SIZE(BO, %rax, 4), %xmm2 + addpd %xmm3, %xmm8 + movapd -4 * SIZE(BO, %rax, 4), %xmm3 + addpd %xmm2, %xmm9 + movddup -13 * SIZE(AO, %rax, 1), %xmm2 + mulpd %xmm2, %xmm3 + mulpd -2 * SIZE(BO, %rax, 4), %xmm2 + addpd %xmm3, %xmm10 + movapd 8 * SIZE(BO, %rax, 4), %xmm3 + addpd %xmm2, %xmm11 + movddup -10 * SIZE(AO, %rax, 1), %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L32 + ALIGN_4 + +.L36: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L38 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L37: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO, %rax, 4), %xmm0 + addpd %xmm1, %xmm8 + movapd -12 * SIZE(BO, %rax, 4), %xmm1 + addpd %xmm0, %xmm9 + movddup -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L37 + ALIGN_4 + +.L38: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + movhps (CO1, LDC), %xmm0 + movsd (CO2), %xmm1 + movhps (CO2, LDC), %xmm1 +#endif + + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm9 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm9 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, (CO1, LDC) + movsd %xmm9, (CO2) + movhps %xmm9, (CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + movq BO, B + + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $3, N + je .L999 + + testq $2, N + je .L80 + ALIGN_4 + +.L41: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + movddup -16 * SIZE(BO), %xmm1 + movddup -15 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + movddup -12 * SIZE(BO), %xmm3 + xorps %xmm9, %xmm9 + movapd -16 * SIZE(AO), %xmm0 + xorps %xmm12, %xmm12 + movapd -8 * SIZE(AO), %xmm4 + xorps %xmm13, %xmm13 + prefetchw 3 * SIZE(CO1) + movapd %xmm0, %xmm2 + prefetchw 3 * SIZE(CO2) + + prefetch -16 * SIZE(BB) + subq $-8 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L56 + ALIGN_4 + +.L52: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -13 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + mulpd %xmm1, %xmm0 + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd (AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -8 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -10 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -11 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm4, %xmm2 + mulpd %xmm3, %xmm4 + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm4, %xmm8 + movapd -4 * SIZE(AO, %rax, 4), %xmm4 + addpd %xmm3, %xmm12 + movddup -10 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -9 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm4, %xmm2 + mulpd %xmm3, %xmm4 + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm4, %xmm8 + movapd 8 * SIZE(AO, %rax, 4), %xmm4 + addpd %xmm3, %xmm12 + movddup -4 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -7 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L52 + ALIGN_4 + +.L56: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L59 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L57: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -13 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L57 + ALIGN_4 + +.L59: +#ifndef TRMMKERNEL + movupd (CO1), %xmm0 + movupd 2 * SIZE(CO1), %xmm1 + movupd (CO2), %xmm2 + movupd 2 * SIZE(CO2), %xmm3 +#endif + + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm12 + mulpd %xmm7, %xmm13 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm12 + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm13 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movhps %xmm12, 3 * SIZE(CO1) + + movsd %xmm9, (CO2) + movhps %xmm9, 1 * SIZE(CO2) + movsd %xmm13, 2 * SIZE(CO2) + movhps %xmm13, 3 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + ALIGN_4 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movapd -12 * SIZE(AO), %xmm2 + xorps %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + xorps %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm3 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L66 + ALIGN_4 + +.L62: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm9 + movddup -13 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -8 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm11 + movddup -11 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm8 + movddup -10 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm2, %xmm3 + movapd -10 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm3, %xmm9 + movddup -9 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm10 + movddup -8 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm2, %xmm3 + movapd -4 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm3, %xmm11 + movddup -7 * SIZE(BO, %rax, 2), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L62 + ALIGN_4 + +.L66: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L69 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L67: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm9 + movddup -13 * SIZE(BO, %rax, 2), %xmm3 + + addq $SIZE, %rax + jl .L67 + ALIGN_4 + +.L69: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#ifndef TRMMKERNEL + movupd (CO1), %xmm0 + movupd (CO2), %xmm2 +#endif + + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm9 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm9 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, 1 * SIZE(CO1) + movsd %xmm9, (CO2) + movhps %xmm9, 1 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L70: + testq $1, M + je .L79 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movddup -15 * SIZE(AO), %xmm1 + xorps %xmm9, %xmm9 + movddup -14 * SIZE(AO), %xmm2 + xorps %xmm10, %xmm10 + movddup -13 * SIZE(AO), %xmm3 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L76 + ALIGN_4 + +.L72: + mulpd -16 * SIZE(BO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -12 * SIZE(AO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(BO, %rax, 2), %xmm1 + addpd %xmm1, %xmm9 + movddup -11 * SIZE(AO, %rax, 1), %xmm1 + + mulpd -12 * SIZE(BO, %rax, 2), %xmm2 + addpd %xmm2, %xmm10 + movddup -10 * SIZE(AO, %rax, 1), %xmm2 + + mulpd -10 * SIZE(BO, %rax, 2), %xmm3 + addpd %xmm3, %xmm11 + movddup -9 * SIZE(AO, %rax, 1), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L72 + ALIGN_4 + +.L76: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L78 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L77: + mulpd -16 * SIZE(BO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L77 + ALIGN_4 + +.L78: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + movhps (CO2), %xmm0 +#endif + + mulpd %xmm7, %xmm8 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, (CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + movq BO, B + + leaq (C, LDC, 2), C + ALIGN_4 + +.L80: + testq $1, N + je .L999 + ALIGN_4 + +.L81: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + + movapd -8 * SIZE(AO), %xmm2 + xorps %xmm8, %xmm8 + movapd -16 * SIZE(AO), %xmm0 + xorps %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + xorps %xmm12, %xmm12 + movddup -14 * SIZE(BO), %xmm3 + xorps %xmm13, %xmm13 + movddup -15 * SIZE(BO), %xmm5 + + prefetchw 3 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L96 + ALIGN_4 + +.L92: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -12 * SIZE(BO, %rax, 1), %xmm1 + mulpd %xmm5, %xmm0 + mulpd -10 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm0, %xmm9 + movapd (AO, %rax, 4), %xmm0 + addpd %xmm5, %xmm13 + movddup -13 * SIZE(BO, %rax, 1), %xmm5 + mulpd %xmm3, %xmm2 + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm8 + movapd -4 * SIZE(AO, %rax, 4), %xmm2 + addpd %xmm3, %xmm12 + movddup -10 * SIZE(BO, %rax, 1), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + movapd 8 * SIZE(AO, %rax, 4), %xmm2 + addpd %xmm5, %xmm13 + movddup -11 * SIZE(BO, %rax, 1), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L92 + ALIGN_4 + +.L96: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L99 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L97: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -15 * SIZE(BO, %rax, 1), %xmm1 + + addq $SIZE, %rax + jl .L97 + ALIGN_4 + +.L99: + addpd %xmm9, %xmm8 + addpd %xmm13, %xmm12 + +#ifndef TRMMKERNEL + movupd (CO1), %xmm0 + movupd 2 * SIZE(CO1), %xmm1 +#endif + + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm12 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm12 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movhps %xmm12, 3 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + ALIGN_4 + +.L101: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + movddup -16 * SIZE(BO), %xmm0 + xorps %xmm8, %xmm8 + movddup -15 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + movddup -14 * SIZE(BO), %xmm2 + xorps %xmm10, %xmm10 + movddup -13 * SIZE(BO), %xmm3 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L106 + ALIGN_4 + +.L102: + mulpd -16 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -12 * SIZE(BO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(AO, %rax, 2), %xmm1 + addpd %xmm1, %xmm9 + movddup -11 * SIZE(BO, %rax, 1), %xmm1 + + mulpd -12 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm2, %xmm10 + movddup -10 * SIZE(BO, %rax, 1), %xmm2 + + mulpd -10 * SIZE(AO, %rax, 2), %xmm3 + addpd %xmm3, %xmm11 + movddup -9 * SIZE(BO, %rax, 1), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L102 + ALIGN_4 + +.L106: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L109 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L107: + movddup -16 * SIZE(BO, %rax, 1), %xmm0 + mulpd -16 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + + addq $SIZE, %rax + jl .L107 + ALIGN_4 + +.L109: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + +#ifndef TRMMKERNEL + movupd (CO1), %xmm0 +#endif + + mulpd %xmm7, %xmm8 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + + ALIGN_4 + +.L110: + testq $1, M + je .L999 + ALIGN_4 + +.L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movapd -14 * SIZE(AO), %xmm1 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L116 + ALIGN_4 + +.L112: + mulpd -16 * SIZE(BO, %rax, 1), %xmm0 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(BO, %rax, 1), %xmm1 + addpd %xmm1, %xmm9 + movapd -10 * SIZE(AO, %rax, 1), %xmm1 + + addq $4 * SIZE, %rax + BRANCH + jl .L112 + ALIGN_4 + +.L116: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L118 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L117: + mulsd -16 * SIZE(BO, %rax, 1), %xmm0 + addsd %xmm0, %xmm8 + movsd -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L117 + ALIGN_4 + +.L118: + addpd %xmm9, %xmm8 + haddpd %xmm8, %xmm8 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 +#endif + + mulsd %xmm7, %xmm8 + +#ifndef TRMMKERNEL + addsd %xmm0, %xmm8 +#endif + + movsd %xmm8, (CO1) + ALIGN_4 + +.L999: + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_4x4_core2.S b/kernel/x86_64/gemm_kernel_4x4_core2.S new file mode 100644 index 0000000..fa79fe0 --- /dev/null +++ b/kernel/x86_64/gemm_kernel_4x4_core2.S @@ -0,0 +1,2221 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCH_W (PREFETCH_R * 4) + +#define PREFETCHSIZE (8 * 13 + 5) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + +#endif + + movq %rsp, %r15 # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + unpcklpd %xmm0, %xmm0 + movapd %xmm0, ALPHA + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + + leaq (, LDC, SIZE), LDC + +#ifdef TRMMKERNEL + movsd %xmm12, OFFSET + movsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq N, J + sarq $2, J + NOBRANCH + jle .L40 + ALIGN_4 + +.L01: +/* Copying to Sub Buffer */ + leaq 16 * SIZE + BUFFER, BO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movapd -16 * SIZE(B), %xmm0 + movapd -8 * SIZE(B), %xmm4 + + movq K, %rax + sarq $2, %rax + NOBRANCH + jle .L05 + ALIGN_3 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + prefetcht0 (PREFETCH_R + 8) * SIZE(B) + + movapd -14 * SIZE(B), %xmm1 + movapd -12 * SIZE(B), %xmm2 + movapd -10 * SIZE(B), %xmm3 + movapd -6 * SIZE(B), %xmm5 + movapd -4 * SIZE(B), %xmm6 + movapd -2 * SIZE(B), %xmm7 + + movddup %xmm0, %xmm8 + movapd %xmm8, -16 * SIZE(BO) + unpckhpd %xmm0, %xmm0 + movapd %xmm0, -14 * SIZE(BO) + movapd 0 * SIZE(B), %xmm0 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) + movddup %xmm1, %xmm9 + movapd %xmm9, -12 * SIZE(BO) + unpckhpd %xmm1, %xmm1 + movapd %xmm1, -10 * SIZE(BO) + movddup %xmm2, %xmm10 + movapd %xmm10, -8 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 8) * SIZE(BO) + unpckhpd %xmm2, %xmm2 + movapd %xmm2, -6 * SIZE(BO) + movddup %xmm3, %xmm11 + movapd %xmm11, -4 * SIZE(BO) + unpckhpd %xmm3, %xmm3 + movapd %xmm3, -2 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 16) * SIZE(BO) + + movddup %xmm4, %xmm12 + movapd %xmm12, 0 * SIZE(BO) + unpckhpd %xmm4, %xmm4 + movapd %xmm4, 2 * SIZE(BO) + movapd 8 * SIZE(B), %xmm4 + movddup %xmm5, %xmm13 + movapd %xmm13, 4 * SIZE(BO) + unpckhpd %xmm5, %xmm5 + movapd %xmm5, 6 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 24) * SIZE(BO) + movddup %xmm6, %xmm14 + movapd %xmm14, 8 * SIZE(BO) + unpckhpd %xmm6, %xmm6 + movapd %xmm6, 10 * SIZE(BO) + movddup %xmm7, %xmm15 + movapd %xmm15, 12 * SIZE(BO) + unpckhpd %xmm7, %xmm7 + movapd %xmm7, 14 * SIZE(BO) + + subq $-32 * SIZE, BO + subq $-16 * SIZE, B + decq %rax + BRANCH + jne .L02 + ALIGN_3 + +.L05: + movq K, %rax + andq $3, %rax + BRANCH + BRANCH + jle .L10 + ALIGN_3 + +.L06: + movapd -14 * SIZE(B), %xmm1 + + movddup %xmm0, %xmm8 + unpckhpd %xmm0, %xmm0 + movddup %xmm1, %xmm9 + unpckhpd %xmm1, %xmm1 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm0, -14 * SIZE(BO) + movapd -12 * SIZE(B), %xmm0 + + movapd %xmm9, -12 * SIZE(BO) + movapd %xmm1, -10 * SIZE(BO) + + addq $4 * SIZE, B + addq $8 * SIZE, BO + decq %rax + BRANCH + jne .L06 + ALIGN_4 + +.L10: + leaq (PREFETCH_R + 0) * SIZE(B), BB + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 19 * SIZE + BUFFER, BO +#else + leaq 19 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + prefetcht2 (BB) + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -19 * SIZE(BO), %xmm6 + movaps -17 * SIZE(BO), %xmm7 + + pxor %xmm2, %xmm2 + prefetcht0 3 * SIZE(CO1) + pxor %xmm3, %xmm3 + pxor %xmm4, %xmm4 + prefetcht0 7 * SIZE(CO2) + pxor %xmm5, %xmm5 + + movapd %xmm2, %xmm8 + movapd %xmm2, %xmm9 + movapd %xmm2, %xmm10 + prefetcht0 3 * SIZE(CO1, LDC, 2) + movapd %xmm2, %xmm11 + + movapd %xmm2, %xmm12 + movapd %xmm2, %xmm13 + prefetcht0 7 * SIZE(CO2, LDC, 2) + movapd %xmm2, %xmm14 + movapd %xmm2, %xmm15 + subq $-16 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_4 + +.L12: + PADDING; + addpd %xmm2, %xmm10 + movaps -15 * SIZE(BO), %xmm2 + PADDING; + addpd %xmm3, %xmm14 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + addpd %xmm4, %xmm11 + movaps -13 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm5 + + addpd %xmm6, %xmm8 + movaps -11 * SIZE(BO), %xmm6 + addpd %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm7, %xmm9 + movaps -9 * SIZE(BO), %xmm7 + addpd %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm10 + movaps -7 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + addpd %xmm4, %xmm11 + movaps -5 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm5 + + addpd %xmm6, %xmm8 + movaps -3 * SIZE(BO), %xmm6 + addpd %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm7, %xmm9 + movaps -1 * SIZE(BO), %xmm7 + addpd %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm10 + movaps 1 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + addpd %xmm4, %xmm11 + movaps 3 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + PADDING + movaps %xmm7, %xmm5 + mulpd %xmm1, %xmm5 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm0, %xmm7 + + addpd %xmm6, %xmm8 + movaps 5 * SIZE(BO), %xmm6 + addpd %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm7, %xmm9 + movaps 7 * SIZE(BO), %xmm7 + addpd %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm10 + movaps 9 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + addpd %xmm4, %xmm11 + subq $-16 * SIZE, AO + movaps 11 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm5 + + addpd %xmm6, %xmm8 + movaps 13 * SIZE(BO), %xmm6 + addpd %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm7, %xmm9 + movaps 15 * SIZE(BO), %xmm7 + addpd %xmm5, %xmm13 + subq $-32 * SIZE, BO + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movaps -14 * SIZE(AO), %xmm1 + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: + prefetcht2 -8 * SIZE(BB) + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addpd %xmm2, %xmm10 + movaps -15 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + addpd %xmm4, %xmm11 + movaps -13 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm5 + + addpd %xmm6, %xmm8 + movaps -11 * SIZE(BO), %xmm6 + addpd %xmm3, %xmm12 + addq $4 * SIZE, AO + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm7, %xmm9 + movaps -9 * SIZE(BO), %xmm7 + addpd %xmm5, %xmm13 + addq $8 * SIZE, BO + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movaps -14 * SIZE(AO), %xmm1 + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: + movddup ALPHA, %xmm7 + + addpd %xmm2, %xmm10 + mulpd %xmm7, %xmm8 + addpd %xmm3, %xmm14 + mulpd %xmm7, %xmm12 + addpd %xmm4, %xmm11 + mulpd %xmm7, %xmm9 + addpd %xmm5, %xmm15 + mulpd %xmm7, %xmm13 + + mulpd %xmm7, %xmm10 + mulpd %xmm7, %xmm14 + mulpd %xmm7, %xmm11 + mulpd %xmm7, %xmm15 + + movq CO1, %rax + orq LDC, %rax + testq $15, %rax + NOBRANCH + jne .L18x + +#ifndef TRMMKERNEL + addpd 0 * SIZE(CO1), %xmm8 + addpd 2 * SIZE(CO1), %xmm12 + addpd 0 * SIZE(CO2), %xmm9 + addpd 2 * SIZE(CO2), %xmm13 + + addpd 0 * SIZE(CO1, LDC, 2), %xmm10 + addpd 2 * SIZE(CO1, LDC, 2), %xmm14 + addpd 0 * SIZE(CO2, LDC, 2), %xmm11 + addpd 2 * SIZE(CO2, LDC, 2), %xmm15 +#endif + + movapd %xmm8, 0 * SIZE(CO1) + movapd %xmm12, 2 * SIZE(CO1) + movapd %xmm9, 0 * SIZE(CO2) + movapd %xmm13, 2 * SIZE(CO2) + + movapd %xmm10, 0 * SIZE(CO1, LDC, 2) + movapd %xmm14, 2 * SIZE(CO1, LDC, 2) + movapd %xmm11, 0 * SIZE(CO2, LDC, 2) + movapd %xmm15, 2 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + BRANCH + jg .L11 + jmp .L20 + ALIGN_4 + +.L18x: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + movsd 0 * SIZE(CO2), %xmm2 + movhpd 1 * SIZE(CO2), %xmm2 + movsd 2 * SIZE(CO2), %xmm3 + movhpd 3 * SIZE(CO2), %xmm3 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm4 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm4 + movsd 2 * SIZE(CO1, LDC, 2), %xmm5 + movhpd 3 * SIZE(CO1, LDC, 2), %xmm5 + movsd 0 * SIZE(CO2, LDC, 2), %xmm6 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm6 + movsd 2 * SIZE(CO2, LDC, 2), %xmm7 + movhpd 3 * SIZE(CO2, LDC, 2), %xmm7 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm12 + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm13 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm14 + addpd %xmm6, %xmm11 + addpd %xmm7, %xmm15 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + movsd %xmm13, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) + + movsd %xmm10, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm10, 1 * SIZE(CO1, LDC, 2) + movsd %xmm14, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm14, 3 * SIZE(CO1, LDC, 2) + movsd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm11, 1 * SIZE(CO2, LDC, 2) + movsd %xmm15, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + jle .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + movapd %xmm8, %xmm2 + movapd %xmm9, %xmm3 + movapd %xmm10, %xmm4 + movapd %xmm11, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_4 + +.L21: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm2, %xmm8 + movapd -16 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd -14 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm4, %xmm10 + movapd -12 * SIZE(BO), %xmm4 + mulpd %xmm0, %xmm4 + addpd %xmm5, %xmm11 + movapd -10 * SIZE(BO), %xmm5 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movapd -8 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd -6 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm4, %xmm10 + movapd -4 * SIZE(BO), %xmm4 + mulpd %xmm0, %xmm4 + addpd %xmm5, %xmm11 + movapd -2 * SIZE(BO), %xmm5 + mulpd %xmm0, %xmm5 + movapd -12 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movapd 0 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd 2 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm4, %xmm10 + movapd 4 * SIZE(BO), %xmm4 + mulpd %xmm0, %xmm4 + addpd %xmm5, %xmm11 + movapd 6 * SIZE(BO), %xmm5 + mulpd %xmm0, %xmm5 + movapd -10 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movapd 8 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd 10 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm4, %xmm10 + movapd 12 * SIZE(BO), %xmm4 + mulpd %xmm0, %xmm4 + addpd %xmm5, %xmm11 + movapd 14 * SIZE(BO), %xmm5 + mulpd %xmm0, %xmm5 + movapd -8 * SIZE(AO), %xmm0 + + subq $ -8 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + BRANCH + jg .L21 + ALIGN_4 + +.L25: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + addpd %xmm2, %xmm8 + movapd -16 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd -14 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm4, %xmm10 + movapd -12 * SIZE(BO), %xmm4 + mulpd %xmm0, %xmm4 + addpd %xmm5, %xmm11 + movapd -10 * SIZE(BO), %xmm5 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movhpd 1 * SIZE(CO2), %xmm2 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm4 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm4 + movsd 0 * SIZE(CO2, LDC, 2), %xmm6 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm6 +#endif + + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm10 + mulpd %xmm7, %xmm11 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm6, %xmm11 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + + movsd %xmm10, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm10, 1 * SIZE(CO1, LDC, 2) + movsd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm11, 1 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $1, M + BRANCH + jle .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + movsd -16 * SIZE(AO), %xmm0 + + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + movapd %xmm8, %xmm2 + movapd %xmm9, %xmm3 + movapd %xmm10, %xmm4 + movapd %xmm11, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_4 + +.L31: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addsd %xmm2, %xmm8 + movsd -16 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd -14 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm4, %xmm10 + movsd -12 * SIZE(BO), %xmm4 + mulsd %xmm0, %xmm4 + addsd %xmm5, %xmm11 + movsd -10 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + movsd -15 * SIZE(AO), %xmm0 + + addsd %xmm2, %xmm8 + movsd -8 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd -6 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm4, %xmm10 + movsd -4 * SIZE(BO), %xmm4 + mulsd %xmm0, %xmm4 + addsd %xmm5, %xmm11 + movsd -2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + movsd -14 * SIZE(AO), %xmm0 + + addsd %xmm2, %xmm8 + movsd 0 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd 2 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm4, %xmm10 + movsd 4 * SIZE(BO), %xmm4 + mulsd %xmm0, %xmm4 + addsd %xmm5, %xmm11 + movsd 6 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + movsd -13 * SIZE(AO), %xmm0 + + addsd %xmm2, %xmm8 + movsd 8 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd 10 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm4, %xmm10 + movsd 12 * SIZE(BO), %xmm4 + mulsd %xmm0, %xmm4 + addsd %xmm5, %xmm11 + movsd 14 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + movsd -12 * SIZE(AO), %xmm0 + + subq $ -4 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + BRANCH + jg .L31 + ALIGN_4 + +.L35: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + addsd %xmm2, %xmm8 + movsd -16 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd -14 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm4, %xmm10 + movsd -12 * SIZE(BO), %xmm4 + mulsd %xmm0, %xmm4 + addsd %xmm5, %xmm11 + movsd -10 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + movsd -15 * SIZE(AO), %xmm0 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movsd 0 * SIZE(CO1, LDC, 2), %xmm4 + movsd 0 * SIZE(CO2, LDC, 2), %xmm6 +#endif + + mulsd %xmm7, %xmm8 + mulsd %xmm7, %xmm9 + mulsd %xmm7, %xmm10 + mulsd %xmm7, %xmm11 + +#ifndef TRMMKERNEL + addsd %xmm0, %xmm8 + addsd %xmm2, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm6, %xmm11 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movsd %xmm10, 0 * SIZE(CO1, LDC, 2) + movsd %xmm11, 0 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + leaq (C, LDC, 4), C + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L40: + testq $2, N + BRANCH + jle .L80 + ALIGN_4 + +.L41: +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $3, %rax + jle .L43 + + addq %rax, %rax + ALIGN_4 + +.L42: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + movddup -14 * SIZE(B), %xmm10 + movddup -13 * SIZE(B), %xmm11 + movddup -12 * SIZE(B), %xmm12 + movddup -11 * SIZE(B), %xmm13 + movddup -10 * SIZE(B), %xmm14 + movddup -9 * SIZE(B), %xmm15 + + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + movapd %xmm10, 4 * SIZE(BO) + movapd %xmm11, 6 * SIZE(BO) + movapd %xmm12, 8 * SIZE(BO) + movapd %xmm13, 10 * SIZE(BO) + movapd %xmm14, 12 * SIZE(BO) + movapd %xmm15, 14 * SIZE(BO) + + addq $8 * SIZE, B + addq $16 * SIZE, BO + + subq $1, %rax + jne .L42 + ALIGN_4 + +.L43: + movq K, %rax + andq $7, %rax + BRANCH + jle .L45 + ALIGN_4 + +.L44: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + + addq $2 * SIZE, B + addq $4 * SIZE, BO + subq $1, %rax + jne .L44 + ALIGN_4 + +.L45: + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L50: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + + prefetcht0 3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht0 3 * SIZE(CO2) + pxor %xmm13, %xmm13 + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd %xmm8, %xmm2 + movapd %xmm8, %xmm3 + movapd %xmm8, %xmm4 + movapd %xmm8, %xmm5 + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L55 + ALIGN_4 + +.L51: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm2, %xmm8 + movapd -16 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + movapd -14 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movapd -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm8 + movapd -12 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + movapd -10 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movapd -6 * SIZE(AO), %xmm1 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + addpd %xmm2, %xmm8 + movapd -8 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + movapd -6 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movapd -2 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm8 + movapd -4 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + movapd -2 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movapd 2 * SIZE(AO), %xmm1 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jg .L51 + ALIGN_4 + +.L55: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L58 + ALIGN_4 + +.L56: + addpd %xmm2, %xmm8 + movapd -16 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm4, %xmm9 + movapd -14 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movapd -10 * SIZE(AO), %xmm1 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + movsd 0 * SIZE(CO2), %xmm2 + movhpd 1 * SIZE(CO2), %xmm2 + movsd 2 * SIZE(CO2), %xmm3 + movhpd 3 * SIZE(CO2), %xmm3 +#endif + + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm12 + mulpd %xmm7, %xmm13 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm12 + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm13 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + movsd %xmm13, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + subq $1, I + jg .L50 + ALIGN_4 + +.L60: + testq $2, M + jle .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm9, %xmm9 + movapd -14 * SIZE(AO), %xmm1 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + movapd %xmm8, %xmm2 + movapd %xmm8, %xmm3 + movapd %xmm8, %xmm4 + movapd %xmm8, %xmm5 + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L65 + ALIGN_4 + +.L61: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm2, %xmm8 + movapd -16 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd -14 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + movapd -12 * SIZE(AO), %xmm0 + addpd %xmm4, %xmm10 + movapd -12 * SIZE(BO), %xmm4 + mulpd %xmm1, %xmm4 + addpd %xmm5, %xmm11 + movapd -10 * SIZE(BO), %xmm5 + mulpd %xmm1, %xmm5 + movapd -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm8 + movapd -8 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd -6 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + movapd -8 * SIZE(AO), %xmm0 + addpd %xmm4, %xmm10 + movapd -4 * SIZE(BO), %xmm4 + mulpd %xmm1, %xmm4 + addpd %xmm5, %xmm11 + movapd -2 * SIZE(BO), %xmm5 + mulpd %xmm1, %xmm5 + movapd -6 * SIZE(AO), %xmm1 + + subq $ -8 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jg .L61 + ALIGN_4 + +.L65: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L68 + ALIGN_4 + +.L66: + addpd %xmm2, %xmm8 + movapd -16 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd -14 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + movapd -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L66 + ALIGN_4 + +.L68: + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movhpd 1 * SIZE(CO2), %xmm2 +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm9 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm9 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L70: + testq $1, M + jle .L79 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movsd -15 * SIZE(AO), %xmm1 + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + movapd %xmm8, %xmm2 + movapd %xmm8, %xmm3 + movapd %xmm8, %xmm4 + movapd %xmm8, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L75 + ALIGN_4 + +.L71: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm2, %xmm8 + movsd -16 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd -14 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + movsd -14 * SIZE(AO), %xmm0 + addsd %xmm4, %xmm10 + movsd -12 * SIZE(BO), %xmm4 + mulsd %xmm1, %xmm4 + addsd %xmm5, %xmm11 + movsd -10 * SIZE(BO), %xmm5 + mulsd %xmm1, %xmm5 + movsd -13 * SIZE(AO), %xmm1 + + addsd %xmm2, %xmm8 + movsd -8 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd -6 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + movsd -12 * SIZE(AO), %xmm0 + addsd %xmm4, %xmm10 + movsd -4 * SIZE(BO), %xmm4 + mulsd %xmm1, %xmm4 + addsd %xmm5, %xmm11 + movsd -2 * SIZE(BO), %xmm5 + mulsd %xmm1, %xmm5 + movsd -11 * SIZE(AO), %xmm1 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jg .L71 + ALIGN_4 + +.L75: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L78 + ALIGN_4 + +.L76: + addsd %xmm2, %xmm8 + movsd -16 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd -14 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + movsd -15 * SIZE(AO), %xmm0 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L76 + ALIGN_4 + +.L78: + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 +#endif + + addsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + + mulsd %xmm7, %xmm8 + mulsd %xmm7, %xmm9 + +#ifndef TRMMKERNEL + addsd %xmm0, %xmm8 + addsd %xmm2, %xmm9 +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + leaq (C, LDC, 2), C + ALIGN_4 + +.L80: + testq $1, N + BRANCH + jle .L999 + ALIGN_4 + +.L81: +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $4, %rax + jle .L83 + + addq %rax, %rax + ALIGN_4 + +.L82: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + movddup -14 * SIZE(B), %xmm10 + movddup -13 * SIZE(B), %xmm11 + movddup -12 * SIZE(B), %xmm12 + movddup -11 * SIZE(B), %xmm13 + movddup -10 * SIZE(B), %xmm14 + movddup -9 * SIZE(B), %xmm15 + + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + movapd %xmm10, 4 * SIZE(BO) + movapd %xmm11, 6 * SIZE(BO) + movapd %xmm12, 8 * SIZE(BO) + movapd %xmm13, 10 * SIZE(BO) + movapd %xmm14, 12 * SIZE(BO) + movapd %xmm15, 14 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-16 * SIZE, BO + subq $1, %rax + jne .L82 + ALIGN_4 + +.L83: + movq K, %rax + andq $15, %rax + BRANCH + jle .L85 + ALIGN_4 + +.L84: + movddup -16 * SIZE(B), %xmm8 + + movapd %xmm8, 0 * SIZE(BO) + + addq $1 * SIZE, B + addq $2 * SIZE, BO + subq $1, %rax + jne .L84 + ALIGN_4 + +.L85: + movq C, CO1 + movq A, AO + + movq M, I + sarq $2, I + jle .L100 + ALIGN_4 + +.L90: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + movapd -16 * SIZE(BO), %xmm4 + pxor %xmm9, %xmm9 + movapd -14 * SIZE(BO), %xmm5 + pxor %xmm12, %xmm12 + movapd -12 * SIZE(BO), %xmm6 + pxor %xmm13, %xmm13 + movapd -10 * SIZE(BO), %xmm7 + + movapd %xmm8, %xmm0 + prefetcht0 3 * SIZE(CO1) + movapd %xmm8, %xmm1 + movapd %xmm8, %xmm2 + movapd %xmm8, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L95 + ALIGN_4 + +.L91: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm0, %xmm8 + movapd -16 * SIZE(AO), %xmm0 + mulpd %xmm4, %xmm0 + addpd %xmm1, %xmm12 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm4, %xmm1 + movapd -8 * SIZE(BO), %xmm4 + addpd %xmm2, %xmm9 + movapd -12 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + addpd %xmm3, %xmm13 + movapd -10 * SIZE(AO), %xmm3 + mulpd %xmm5, %xmm3 + movapd -6 * SIZE(BO), %xmm5 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + addpd %xmm0, %xmm8 + movapd -8 * SIZE(AO), %xmm0 + mulpd %xmm6, %xmm0 + addpd %xmm1, %xmm12 + movapd -6 * SIZE(AO), %xmm1 + mulpd %xmm6, %xmm1 + movapd -4 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm9 + movapd -4 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + addpd %xmm3, %xmm13 + movapd -2 * SIZE(AO), %xmm3 + mulpd %xmm7, %xmm3 + movapd -2 * SIZE(BO), %xmm7 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + jg .L91 + ALIGN_4 + +.L95: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L98 + ALIGN_4 + +.L96: + addpd %xmm0, %xmm8 + movapd -16 * SIZE(AO), %xmm0 + mulpd %xmm4, %xmm0 + addpd %xmm1, %xmm12 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm4, %xmm1 + movapd -14 * SIZE(BO), %xmm4 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L96 + ALIGN_4 + +.L98: + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm12 + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm13 + + addpd %xmm9, %xmm8 + addpd %xmm13, %xmm12 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 +#endif + + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm12 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm12 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + subq $1, I + jg .L90 + ALIGN_4 + +.L100: + testq $2, M + jle .L110 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + movapd -16 * SIZE(BO), %xmm4 + pxor %xmm9, %xmm9 + movapd -14 * SIZE(BO), %xmm5 + pxor %xmm10, %xmm10 + movapd -12 * SIZE(BO), %xmm6 + pxor %xmm11, %xmm11 + movapd -10 * SIZE(BO), %xmm7 + + movapd %xmm8, %xmm0 + movapd %xmm8, %xmm1 + movapd %xmm8, %xmm2 + movapd %xmm8, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L105 + ALIGN_4 + +.L101: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm0, %xmm8 + movapd -16 * SIZE(AO), %xmm0 + mulpd %xmm4, %xmm0 + movapd -8 * SIZE(BO), %xmm4 + addpd %xmm1, %xmm9 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm5, %xmm1 + movapd -6 * SIZE(BO), %xmm5 + addpd %xmm2, %xmm10 + movapd -12 * SIZE(AO), %xmm2 + mulpd %xmm6, %xmm2 + movapd -4 * SIZE(BO), %xmm6 + addpd %xmm3, %xmm11 + movapd -10 * SIZE(AO), %xmm3 + mulpd %xmm7, %xmm3 + movapd -2 * SIZE(BO), %xmm7 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + jg .L101 + ALIGN_4 + +.L105: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L108 + ALIGN_4 + +.L106: + addpd %xmm0, %xmm8 + movapd -16 * SIZE(AO), %xmm0 + mulpd %xmm4, %xmm0 + movapd -14 * SIZE(BO), %xmm4 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L106 + ALIGN_4 + +.L108: + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm9 + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + addpd %xmm9, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 +#endif + + mulpd %xmm7, %xmm8 +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L110: + testq $1, M + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + movsd -16 * SIZE(BO), %xmm4 + pxor %xmm9, %xmm9 + movsd -14 * SIZE(BO), %xmm5 + pxor %xmm10, %xmm10 + movsd -12 * SIZE(BO), %xmm6 + pxor %xmm11, %xmm11 + movsd -10 * SIZE(BO), %xmm7 + + movapd %xmm8, %xmm0 + movapd %xmm8, %xmm1 + movapd %xmm8, %xmm2 + movapd %xmm8, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L115 + ALIGN_4 + +.L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm0, %xmm8 + movsd -16 * SIZE(AO), %xmm0 + mulpd %xmm4, %xmm0 + movsd -8 * SIZE(BO), %xmm4 + addpd %xmm1, %xmm9 + movsd -15 * SIZE(AO), %xmm1 + mulpd %xmm5, %xmm1 + movsd -6 * SIZE(BO), %xmm5 + addpd %xmm2, %xmm10 + movsd -14 * SIZE(AO), %xmm2 + mulpd %xmm6, %xmm2 + movsd -4 * SIZE(BO), %xmm6 + addpd %xmm3, %xmm11 + movsd -13 * SIZE(AO), %xmm3 + mulpd %xmm7, %xmm3 + movsd -2 * SIZE(BO), %xmm7 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + jg .L111 + ALIGN_4 + +.L115: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L118 + ALIGN_4 + +.L116: + addsd %xmm0, %xmm8 + movsd -16 * SIZE(AO), %xmm0 + mulsd %xmm4, %xmm0 + movsd -14 * SIZE(BO), %xmm4 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L116 + ALIGN_4 + +.L118: + addsd %xmm0, %xmm8 + addsd %xmm1, %xmm9 + addsd %xmm2, %xmm10 + addsd %xmm3, %xmm11 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 +#endif + addsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + + addsd %xmm9, %xmm8 + + mulsd %xmm7, %xmm8 +#ifndef TRMMKERNEL + addsd %xmm0, %xmm8 +#endif + movsd %xmm8, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %r15, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_4x4_penryn.S b/kernel/x86_64/gemm_kernel_4x4_penryn.S new file mode 100644 index 0000000..3179c7d --- /dev/null +++ b/kernel/x86_64/gemm_kernel_4x4_penryn.S @@ -0,0 +1,2072 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#define PREA %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA 48(%rsp) +#define J 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define ALPHA 224(%rsp) +#define J 232(%rsp) +#define OFFSET 240(%rsp) +#define KK 248(%rsp) +#define KKK 256(%rsp) + +#endif + +#ifdef NANO +#define PREFETCHSIZE (8 * 2 + 4) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + +#ifdef DUNNINGTON +#define PREFETCHSIZE (8 * 97 + 4) +#define PREFETCHB prefetcht2 +#endif + +#ifndef PREFETCH +#define PREFETCH prefetcht0 +#endif + +#ifndef PREFETCHW +#define PREFETCHW prefetcht2 +#endif + +#ifndef PREFETCHB +#define PREFETCHB prefetcht0 +#endif + +#ifndef PREFETCHSIZE +#define PREFETCHSIZE (8 * 17 + 4) +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + +#endif + + movlps %xmm0, ALPHA + + subq $-16 * SIZE, A + subq $-17 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + leaq (, LDC, SIZE), LDC + +#ifdef TRMMKERNEL + movq %r11, OFFSET +#ifndef LEFT + negq %r11 +#endif + movq %r11, KK +#endif + + movq N, J + sarq $2, J + NOBRANCH + jle .L40 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + + movq K, %rax + salq $BASE_SHIFT + 2, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + xorpd %xmm3, %xmm3 + movaps -14 * SIZE(AO), %xmm1 + xorpd %xmm4, %xmm4 + movaps -17 * SIZE(BO), %xmm2 + + PREFETCHB -16 * SIZE(BB) + + xorpd %xmm5, %xmm5 + xorpd %xmm6, %xmm6 + + PREFETCHW 3 * SIZE(CO1) + movaps %xmm4, %xmm8 + movaps %xmm4, %xmm9 + PREFETCHW 7 * SIZE(CO2) + movaps %xmm4, %xmm10 + movaps %xmm4, %xmm11 + + PREFETCHW 3 * SIZE(CO1, LDC, 2) + movaps %xmm4, %xmm12 + movaps %xmm4, %xmm13 + PREFETCHW 7 * SIZE(CO2, LDC, 2) + movapd %xmm4, %xmm14 + movapd %xmm4, %xmm15 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm3, %xmm11 + movaps -15 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -13 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps -11 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -9 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps -7 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movapd %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movapd %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + PADDING + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + addpd %xmm2, %xmm9 + movaps -5 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + subq $-16 * SIZE, AO + movaps -3 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -1 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + subq $-16 * SIZE, BO + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -14 * SIZE(AO), %xmm1 + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: + PREFETCHB -8 * SIZE(BB) +#ifdef DUNNINGTON + PREFETCHB 0 * SIZE(BB) + PREFETCHB 8 * SIZE(BB) +#endif + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addpd %xmm3, %xmm11 + movaps -15 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -13 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L18: + movddup ALPHA, %xmm1 + +#ifndef DUNNINGTON + subq $-16 * SIZE, BB +#else + subq $-32 * SIZE, BB +#endif + + addpd %xmm3, %xmm11 + addpd %xmm4, %xmm15 + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + + movaps %xmm8, %xmm0 + movsd %xmm9, %xmm8 + mulpd %xmm1, %xmm8 + movsd %xmm0, %xmm9 + mulpd %xmm1, %xmm9 + + movaps %xmm10, %xmm0 + movsd %xmm11, %xmm10 + mulpd %xmm1, %xmm10 + movsd %xmm0, %xmm11 + mulpd %xmm1, %xmm11 + + movaps %xmm12, %xmm0 + movsd %xmm13, %xmm12 + mulpd %xmm1, %xmm12 + movsd %xmm0, %xmm13 + mulpd %xmm1, %xmm13 + + movaps %xmm14, %xmm0 + movsd %xmm15, %xmm14 + mulpd %xmm1, %xmm14 + movsd %xmm0, %xmm15 + mulpd %xmm1, %xmm15 + + movq CO1, %rax + orq LDC, %rax + testq $15, %rax + NOBRANCH + jne .L18x + +#ifndef TRMMKERNEL + addpd 0 * SIZE(CO1), %xmm8 + addpd 2 * SIZE(CO1), %xmm12 + addpd 0 * SIZE(CO2), %xmm9 + addpd 2 * SIZE(CO2), %xmm13 + + addpd 0 * SIZE(CO1, LDC, 2), %xmm10 + addpd 2 * SIZE(CO1, LDC, 2), %xmm14 + addpd 0 * SIZE(CO2, LDC, 2), %xmm11 + addpd 2 * SIZE(CO2, LDC, 2), %xmm15 +#endif + + movaps %xmm8, 0 * SIZE(CO1) + movaps %xmm12, 2 * SIZE(CO1) + movaps %xmm9, 0 * SIZE(CO2) + movaps %xmm13, 2 * SIZE(CO2) + + movaps %xmm10, 0 * SIZE(CO1, LDC, 2) + movaps %xmm14, 2 * SIZE(CO1, LDC, 2) + movaps %xmm11, 0 * SIZE(CO2, LDC, 2) + movaps %xmm15, 2 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + + decq I # i -- + BRANCH + jg .L11 + jmp .L20 + ALIGN_4 + +.L18x: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + movsd 0 * SIZE(CO2), %xmm2 + movhpd 1 * SIZE(CO2), %xmm2 + movsd 2 * SIZE(CO2), %xmm3 + movhpd 3 * SIZE(CO2), %xmm3 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm4 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm4 + movsd 2 * SIZE(CO1, LDC, 2), %xmm5 + movhpd 3 * SIZE(CO1, LDC, 2), %xmm5 + movsd 0 * SIZE(CO2, LDC, 2), %xmm6 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm6 + movsd 2 * SIZE(CO2, LDC, 2), %xmm7 + movhpd 3 * SIZE(CO2, LDC, 2), %xmm7 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm12 + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm13 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm14 + addpd %xmm6, %xmm11 + addpd %xmm7, %xmm15 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + movsd %xmm13, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) + + movsd %xmm10, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm10, 1 * SIZE(CO1, LDC, 2) + movsd %xmm14, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm14, 3 * SIZE(CO1, LDC, 2) + movsd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm11, 1 * SIZE(CO2, LDC, 2) + movsd %xmm15, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + jle .L30 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -17 * SIZE(BO), %xmm2 + movaps -15 * SIZE(BO), %xmm3 + + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + + movaps %xmm3, %xmm8 + movaps %xmm3, %xmm9 + movaps %xmm3, %xmm10 + movaps %xmm3, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_4 + +.L22: + addpd %xmm3, %xmm11 + movaps -15 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -13 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -11 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -9 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -7 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -5 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -3 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + subq $ -8 * SIZE, AO + + addpd %xmm2, %xmm9 + movaps -1 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + addpd %xmm3, %xmm11 + movaps -15 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -13 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: + addpd %xmm3, %xmm11 + addpd %xmm5, %xmm10 + + movddup ALPHA, %xmm3 + + movaps %xmm8, %xmm0 + movsd %xmm9, %xmm8 + mulpd %xmm3, %xmm8 + movsd %xmm0, %xmm9 + mulpd %xmm3, %xmm9 + + movaps %xmm10, %xmm0 + movsd %xmm11, %xmm10 + mulpd %xmm3, %xmm10 + movsd %xmm0, %xmm11 + mulpd %xmm3, %xmm11 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movhpd 1 * SIZE(CO2), %xmm2 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm4 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm4 + movsd 0 * SIZE(CO2, LDC, 2), %xmm6 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm6 + + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm6, %xmm11 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + + movsd %xmm10, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm10, 1 * SIZE(CO1, LDC, 2) + movsd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm11, 1 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $1, M + BRANCH + jle .L39 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + addq %rax, AO + leaq (BO, %rax, 4), BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movaps -17 * SIZE(BO), %xmm2 + movaps -15 * SIZE(BO), %xmm3 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -15 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -13 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -11 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm10 + movaps -9 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm11 + movaps -7 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -13 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -5 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -3 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -12 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm10 + movaps -1 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm11 + movaps 1 * SIZE(BO), %xmm3 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -15 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -13 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -11 * SIZE(BO), %xmm3 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: + movddup ALPHA, %xmm3 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 0 * SIZE(CO2), %xmm0 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm1 + movhpd 0 * SIZE(CO2, LDC, 2), %xmm1 +#endif + + mulpd %xmm3, %xmm8 + mulpd %xmm3, %xmm9 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm9 +#endif + + movlpd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 0 * SIZE(CO2) + + movlpd %xmm9, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm9, 0 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + addq %rax, AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + movq BO, B + + leaq (C, LDC, 4), C + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L40: + testq $2, N + BRANCH + jle .L80 + + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + PREFETCHB -16 * SIZE(BB) + subq $-4 * SIZE, BB + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -17 * SIZE(BO), %xmm2 + + PREFETCHW 3 * SIZE(CO1) + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + PREFETCHW 3 * SIZE(CO2) + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_4 + +.L52: + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -15 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -13 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -11 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -9 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -15 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_4 + +.L58: + movddup ALPHA, %xmm3 + + movaps %xmm8, %xmm0 + movsd %xmm9, %xmm8 + mulpd %xmm3, %xmm8 + movsd %xmm0, %xmm9 + mulpd %xmm3, %xmm9 + + movaps %xmm12, %xmm0 + movsd %xmm13, %xmm12 + mulpd %xmm3, %xmm12 + movsd %xmm0, %xmm13 + mulpd %xmm3, %xmm13 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + movsd 0 * SIZE(CO2), %xmm2 + movhpd 1 * SIZE(CO2), %xmm2 + movsd 2 * SIZE(CO2), %xmm3 + movhpd 3 * SIZE(CO2), %xmm3 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm12 + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm13 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + movsd %xmm13, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + BRANCH + jle .L70 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movaps -17 * SIZE(BO), %xmm2 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -15 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm11 + addpd %xmm7, %xmm10 + movaps -13 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -11 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm11 + addpd %xmm7, %xmm10 + movaps -9 * SIZE(BO), %xmm2 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -15 * SIZE(BO), %xmm2 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + movddup ALPHA, %xmm3 + + movaps %xmm8, %xmm0 + movsd %xmm9, %xmm8 + mulpd %xmm3, %xmm8 + movsd %xmm0, %xmm9 + mulpd %xmm3, %xmm9 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movhpd 1 * SIZE(CO2), %xmm2 + + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm9 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L70: + testq $1, M + BRANCH + jle .L79 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + addq %rax, AO + leaq (BO, %rax, 2), BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movaps -17 * SIZE(BO), %xmm2 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -15 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -15 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -14 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm9 + movaps -13 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -13 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -11 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -12 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm9 + movaps -9 * SIZE(BO), %xmm2 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -15 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -15 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_4 + +.L78: + movddup ALPHA, %xmm3 + + addpd %xmm9, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 0 * SIZE(CO2), %xmm0 +#endif + + mulpd %xmm3, %xmm8 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 +#endif + + movlpd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + addq %rax, AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + leaq (C, LDC, 2), C + movq BO, B + ALIGN_4 + +.L80: + testq $1, N + BRANCH + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + movq A, AO + + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + addq %rax, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movsd -17 * SIZE(BO), %xmm2 + + PREFETCHW 3 * SIZE(CO1) + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L95 + ALIGN_4 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -16 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -14 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -13 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps 2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + subq $-16 * SIZE, AO + subq $ -4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -16 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L96 + ALIGN_4 + +.L98: + movddup ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 +#endif + + mulpd %xmm3, %xmm8 + mulpd %xmm3, %xmm12 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm12 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + addq %rax, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + decq I + BRANCH + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + BRANCH + jle .L110 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + addq %rax, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -17 * SIZE(BO), %xmm2 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L105 + ALIGN_4 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + movsd -16 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -14 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + pshufd $0x44, %xmm2, %xmm3 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm9 + + pshufd $0x44, %xmm2, %xmm3 + movsd -14 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -10 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + pshufd $0x44, %xmm2, %xmm3 + movsd -13 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -8 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm9 + + subq $-8 * SIZE, AO + subq $-4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L102 + ALIGN_4 + +.L105: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + pshufd $0x44, %xmm2, %xmm3 + movsd -16 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -14 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L106 + ALIGN_4 + +.L108: + addpd %xmm9, %xmm8 + + movddup ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 +#endif + + mulpd %xmm3, %xmm8 + + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + addq %rax, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + ALIGN_4 + +.L110: + testq $1, M + BRANCH + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + addq %rax, AO + addq %rax, BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movsd -17 * SIZE(BO), %xmm2 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L115 + ALIGN_4 + +.L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -15 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -14 * SIZE(AO), %xmm0 + movsd -15 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -13 * SIZE(AO), %xmm0 + movsd -14 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -12 * SIZE(AO), %xmm0 + movsd -13 * SIZE(BO), %xmm2 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -15 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L116 + ALIGN_4 + +.L118: + movddup ALPHA, %xmm3 + + addpd %xmm9, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 +#endif + + mulsd %xmm3, %xmm8 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 +#endif + + movlpd %xmm8, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_4x4_sse2.S b/kernel/x86_64/gemm_kernel_4x4_sse2.S new file mode 100644 index 0000000..1060197 --- /dev/null +++ b/kernel/x86_64/gemm_kernel_4x4_sse2.S @@ -0,0 +1,2707 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define BUFFER 256(%rsp) + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (8 * 9 + 4) +#define movsd movlps +#define movapd movaps +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 13 + 4) +#define movapd movaps +#endif + +#ifndef GENERIC +#define KERNEL1(xx) \ + mulpd %xmm0, %xmm1 ;\ + addpd %xmm1, %xmm8 ;\ + movaps -16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulpd %xmm0, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd -14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm0, %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ + mulpd -10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ + addpd %xmm5, %xmm10 ;\ + movapd -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm0, %xmm11 ;\ + movapd -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 + +#define KERNEL2(xx) \ + mulpd %xmm2, %xmm1 ;\ + addpd %xmm1, %xmm12 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulpd %xmm2, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd -6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm2, %xmm5 ;\ + mulpd -10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ + addpd %xmm5, %xmm14 ;\ + movapd -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm2, %xmm15 ;\ + movapd -6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 + +#define KERNEL3(xx) \ + mulpd %xmm4, %xmm7 ;\ + addpd %xmm7, %xmm8 ;\ + movapd -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulpd %xmm4, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd -6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm4, %xmm5 ;\ + mulpd -2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ + addpd %xmm5, %xmm10 ;\ + movapd -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm4, %xmm11 ;\ + movapd -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 + +#define KERNEL4(xx) \ + mulpd %xmm6, %xmm7 ;\ + addpd %xmm7, %xmm12 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulpd %xmm6, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm6, %xmm5 ;\ + mulpd -2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ + addpd %xmm6, %xmm15 ;\ + movapd -2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 + +#define KERNEL5(xx) \ + mulpd %xmm0, %xmm1 ;\ + addpd %xmm1, %xmm8 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulpd %xmm0, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm0, %xmm5 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ + addpd %xmm5, %xmm10 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm0, %xmm11 ;\ + movapd 0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 + +#define KERNEL6(xx) \ + mulpd %xmm2, %xmm1 ;\ + addpd %xmm1, %xmm12 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulpd %xmm2, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm2, %xmm5 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm2, %xmm15 ;\ + movapd 2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 + +#define KERNEL7(xx) \ + mulpd %xmm4, %xmm7 ;\ + addpd %xmm7, %xmm8 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulpd %xmm4, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm4, %xmm5 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ + addpd %xmm5, %xmm10 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm4, %xmm11 ;\ + movapd 4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 + +#define KERNEL8(xx) \ + mulpd %xmm6, %xmm7 ;\ + addpd %xmm7, %xmm12 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulpd %xmm6, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm6, %xmm5 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm6, %xmm15 ;\ + movapd 6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 + +#else + +#define KERNEL1(xx) \ + mulpd %xmm0, %xmm1 ;\ + addpd %xmm1, %xmm8 ;\ + movapd -16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulpd %xmm0, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd -14 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm0, %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + mulpd -10 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ + addpd %xmm5, %xmm10 ;\ + movapd -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm0, %xmm11 ;\ + movapd -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 + +#define KERNEL2(xx) \ + mulpd %xmm2, %xmm1 ;\ + addpd %xmm1, %xmm12 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulpd %xmm2, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd -6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm2, %xmm5 ;\ + mulpd -10 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ + addpd %xmm5, %xmm14 ;\ + movapd -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm2, %xmm15 ;\ + movapd -6 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 + +#define KERNEL3(xx) \ + mulpd %xmm4, %xmm7 ;\ + addpd %xmm7, %xmm8 ;\ + movapd -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulpd %xmm4, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd -6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm4, %xmm5 ;\ + mulpd -2 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ + addpd %xmm5, %xmm10 ;\ + movapd -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm4, %xmm11 ;\ + movapd -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 + +#define KERNEL4(xx) \ + mulpd %xmm6, %xmm7 ;\ + addpd %xmm7, %xmm12 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulpd %xmm6, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm6, %xmm5 ;\ + mulpd -2 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm6, %xmm15 ;\ + movapd -2 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 + +#define KERNEL5(xx) \ + mulpd %xmm0, %xmm1 ;\ + addpd %xmm1, %xmm8 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulpd %xmm0, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm0, %xmm5 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ + addpd %xmm5, %xmm10 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm0, %xmm11 ;\ + movapd 0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 + +#define KERNEL6(xx) \ + mulpd %xmm2, %xmm1 ;\ + addpd %xmm1, %xmm12 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulpd %xmm2, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm2, %xmm5 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm2, %xmm15 ;\ + movapd 2 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 + +#define KERNEL7(xx) \ + mulpd %xmm4, %xmm7 ;\ + addpd %xmm7, %xmm8 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulpd %xmm4, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm4, %xmm5 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ + addpd %xmm5, %xmm10 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm4, %xmm11 ;\ + movapd 4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 + +#define KERNEL8(xx) \ + mulpd %xmm6, %xmm7 ;\ + addpd %xmm7, %xmm12 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulpd %xmm6, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm6, %xmm5 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm6, %xmm15 ;\ + movapd 6 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + +#endif + + EMMS + + movq %rsp, %rbx # save old stack + subq $256 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A + + unpcklpd %xmm0, %xmm0 + movapd %xmm0, ALPHA + + leaq (, LDC, SIZE), LDC + +#ifdef TRMMKERNEL + movsd %xmm12, OFFSET + movsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + ALIGN_3 + +.L01: +/* Copying to Sub Buffer */ + leaq 16 * SIZE + BUFFER, BO + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $2, %rax + jle .L03 + ALIGN_3 + + +#define RPREFETCHSIZE (8 * 7 + 4) +#define WPREFETCHSIZE (8 * 8 + 4) + +.L02: + PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq %mm0, -16 * SIZE(BO) + movq %mm0, -15 * SIZE(BO) + movq 1 * SIZE(B), %mm1 + movq %mm1, -14 * SIZE(BO) + movq %mm1, -13 * SIZE(BO) + + movq 2 * SIZE(B), %mm2 + movq %mm2, -12 * SIZE(BO) + movq %mm2, -11 * SIZE(BO) + movq 3 * SIZE(B), %mm3 + movq %mm3, -10 * SIZE(BO) + movq %mm3, -9 * SIZE(BO) + + PREFETCHW (WPREFETCHSIZE + 0) * SIZE(BO) + + movq 4 * SIZE(B), %mm4 + movq %mm4, -8 * SIZE(BO) + movq %mm4, -7 * SIZE(BO) + movq 5 * SIZE(B), %mm5 + movq %mm5, -6 * SIZE(BO) + movq %mm5, -5 * SIZE(BO) + + PREFETCHW (WPREFETCHSIZE + 8) * SIZE(BO) + + movq 6 * SIZE(B), %mm6 + movq %mm6, -4 * SIZE(BO) + movq %mm6, -3 * SIZE(BO) + movq 7 * SIZE(B), %mm7 + movq %mm7, -2 * SIZE(BO) + movq %mm7, -1 * SIZE(BO) + + PREFETCH (RPREFETCHSIZE + 8) * SIZE(B) + + movq 8 * SIZE(B), %mm0 + movq %mm0, 0 * SIZE(BO) + movq %mm0, 1 * SIZE(BO) + movq 9 * SIZE(B), %mm1 + movq %mm1, 2 * SIZE(BO) + movq %mm1, 3 * SIZE(BO) + + movq 10 * SIZE(B), %mm2 + movq %mm2, 4 * SIZE(BO) + movq %mm2, 5 * SIZE(BO) + movq 11 * SIZE(B), %mm3 + movq %mm3, 6 * SIZE(BO) + movq %mm3, 7 * SIZE(BO) + + PREFETCHW (WPREFETCHSIZE + 16) * SIZE(BO) + + movq 12 * SIZE(B), %mm4 + movq %mm4, 8 * SIZE(BO) + movq %mm4, 9 * SIZE(BO) + movq 13 * SIZE(B), %mm5 + movq %mm5, 10 * SIZE(BO) + movq %mm5, 11 * SIZE(BO) + + PREFETCHW (WPREFETCHSIZE + 24) * SIZE(BO) + + movq 14 * SIZE(B), %mm6 + movq %mm6, 12 * SIZE(BO) + movq %mm6, 13 * SIZE(BO) + movq 15 * SIZE(B), %mm7 + movq %mm7, 14 * SIZE(BO) + movq %mm7, 15 * SIZE(BO) + + addq $ 32 * SIZE, BO + subq $-16 * SIZE, B + + subq $1, %rax + jne .L02 + ALIGN_3 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_3 + +.L04: + movq 0 * SIZE(B), %mm0 + movq %mm0, -16 * SIZE(BO) + movq %mm0, -15 * SIZE(BO) + movq 1 * SIZE(B), %mm1 + movq %mm1, -14 * SIZE(BO) + movq %mm1, -13 * SIZE(BO) + + movq 2 * SIZE(B), %mm2 + movq %mm2, -12 * SIZE(BO) + movq %mm2, -11 * SIZE(BO) + movq 3 * SIZE(B), %mm3 + movq %mm3, -10 * SIZE(BO) + movq %mm3, -9 * SIZE(BO) + + addq $4 * SIZE, B + addq $8 * SIZE, BO + subq $1, %rax + jne .L04 + ALIGN_3 + +.L10: + movq A, AO # aoffset = a + + leaq (RPREFETCHSIZE + 0) * SIZE(B), BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_3 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm1 + pxor %xmm8, %xmm8 + movapd -14 * SIZE(AO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movapd -12 * SIZE(AO), %xmm4 + movapd -12 * SIZE(BO), %xmm5 + pxor %xmm10, %xmm10 + movapd -10 * SIZE(AO), %xmm6 + movapd -8 * SIZE(BO), %xmm7 + pxor %xmm11, %xmm11 + + PREFETCHW 3 * SIZE(CO1) + pxor %xmm12, %xmm12 + PREFETCHW 7 * SIZE(CO2) + pxor %xmm13, %xmm13 + PREFETCHW 3 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + PREFETCHW 7 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 + + PREFETCH 0 * SIZE(BB) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + +#ifndef GENERIC + andq $-8, %rax + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + BRANCH + jl .L12 + ALIGN_3 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + testq $4, %rax + je .L16 + xorq %rax, %rax + ALIGN_3 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addq $32 * SIZE, BO + addq $16 * SIZE, AO + ALIGN_3 + +#else + sarq $2, %rax + NOBRANCH + jle .L16 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addq $ 32 * SIZE, BO + subq $-16 * SIZE, AO + decq %rax + BRANCH + jg .L12 +#endif + +.L16: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L19 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + ALIGN_3 + +.L17: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movapd -14 * SIZE(BO, %rax, 8), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movapd -12 * SIZE(BO, %rax, 8), %xmm1 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BO, %rax, 8), %xmm0 + addpd %xmm1, %xmm10 + movapd -16 * SIZE(BO, %rax, 8), %xmm1 + addpd %xmm0, %xmm11 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm12 + movapd -14 * SIZE(BO, %rax, 8), %xmm1 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm13 + movapd -12 * SIZE(BO, %rax, 8), %xmm1 + mulpd %xmm2, %xmm1 + mulpd -10 * SIZE(BO, %rax, 8), %xmm2 + addpd %xmm1, %xmm14 + movapd -8 * SIZE(BO, %rax, 8), %xmm1 + addpd %xmm2, %xmm15 + movapd -10 * SIZE(AO, %rax, 4), %xmm2 + + addq $SIZE, %rax + jl .L17 + ALIGN_3 + +.L19: + PREFETCH 8 * SIZE(BB) + subq $-12 * SIZE, BB + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movsd 0 * SIZE(CO2), %xmm2 + movhpd 1 * SIZE(CO2), %xmm2 + movsd 2 * SIZE(CO2), %xmm3 + movhpd 3 * SIZE(CO2), %xmm3 +#endif + + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm10 + mulpd %xmm7, %xmm11 + + mulpd %xmm7, %xmm12 + mulpd %xmm7, %xmm13 + mulpd %xmm7, %xmm14 + mulpd %xmm7, %xmm15 + +#ifndef TRMMKERNEL + movlpd 0 * SIZE(CO1, LDC, 2), %xmm4 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm4 + movlpd 2 * SIZE(CO1, LDC, 2), %xmm5 + movhpd 3 * SIZE(CO1, LDC, 2), %xmm5 + + movlpd 0 * SIZE(CO2, LDC, 2), %xmm6 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm6 + movlpd 2 * SIZE(CO2, LDC, 2), %xmm7 + movhpd 3 * SIZE(CO2, LDC, 2), %xmm7 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm12 + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm13 +#endif + + movlpd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movlpd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + + movlpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + movlpd %xmm13, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) + +#ifndef TRMMKERNEL + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm14 + addpd %xmm6, %xmm11 + addpd %xmm7, %xmm15 +#endif + + movlpd %xmm10, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm10, 1 * SIZE(CO1, LDC, 2) + movlpd %xmm14, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm14, 3 * SIZE(CO1, LDC, 2) + + movlpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm11, 1 * SIZE(CO2, LDC, 2) + movlpd %xmm15, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + BRANCH + jg .L11 + ALIGN_3 + +.L20: + testq $3, M + je .L39 + + testq $2, M + je .L30 + ALIGN_3 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movapd 8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + + movapd 16 * SIZE(BO), %xmm5 + movapd 24 * SIZE(BO), %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L25 + ALIGN_3 + +.L22: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movapd 2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movapd 4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm10 + movapd 32 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm11 + movapd -14 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm8 + movapd 10 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm9 + movapd 12 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BO), %xmm0 + addpd %xmm3, %xmm10 + movapd 40 * SIZE(BO), %xmm3 + addpd %xmm0, %xmm11 + movapd -12 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm8 + movapd 18 * SIZE(BO), %xmm5 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movapd 20 * SIZE(BO), %xmm5 + mulpd %xmm0, %xmm5 + mulpd 22 * SIZE(BO), %xmm0 + addpd %xmm5, %xmm10 + movapd 48 * SIZE(BO), %xmm5 + addpd %xmm0, %xmm11 + movapd -10 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm7 + addpd %xmm7, %xmm8 + movapd 26 * SIZE(BO), %xmm7 + mulpd %xmm0, %xmm7 + addpd %xmm7, %xmm9 + movapd 28 * SIZE(BO), %xmm7 + mulpd %xmm0, %xmm7 + mulpd 30 * SIZE(BO), %xmm0 + addpd %xmm7, %xmm10 + movapd 56 * SIZE(BO), %xmm7 + addpd %xmm0, %xmm11 + movapd 0 * SIZE(AO), %xmm0 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm8 + movapd 34 * SIZE(BO), %xmm1 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm9 + movapd 36 * SIZE(BO), %xmm1 + mulpd %xmm2, %xmm1 + mulpd 38 * SIZE(BO), %xmm2 + addpd %xmm1, %xmm10 + movapd 64 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + movapd -6 * SIZE(AO), %xmm2 + + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm8 + movapd 42 * SIZE(BO), %xmm3 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm9 + movapd 44 * SIZE(BO), %xmm3 + mulpd %xmm2, %xmm3 + mulpd 46 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm10 + movapd 72 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm11 + movapd -4 * SIZE(AO), %xmm2 + + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm8 + movapd 50 * SIZE(BO), %xmm5 + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm9 + movapd 52 * SIZE(BO), %xmm5 + mulpd %xmm2, %xmm5 + mulpd 54 * SIZE(BO), %xmm2 + addpd %xmm5, %xmm10 + movapd 80 * SIZE(BO), %xmm5 + addpd %xmm2, %xmm11 + movapd -2 * SIZE(AO), %xmm2 + + mulpd %xmm2, %xmm7 + addpd %xmm7, %xmm8 + movapd 58 * SIZE(BO), %xmm7 + mulpd %xmm2, %xmm7 + addpd %xmm7, %xmm9 + movapd 60 * SIZE(BO), %xmm7 + mulpd %xmm2, %xmm7 + mulpd 62 * SIZE(BO), %xmm2 + addpd %xmm7, %xmm10 + movapd 88 * SIZE(BO), %xmm7 + addpd %xmm2, %xmm11 + movapd 8 * SIZE(AO), %xmm2 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L22 + ALIGN_3 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L29 + ALIGN_3 + +.L26: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movapd 2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movapd 4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm10 + movapd 8 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm11 + movapd -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_3 + +.L29: +#ifndef TRMMKERNEL + movlpd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movlpd 0 * SIZE(CO2), %xmm2 + movhpd 1 * SIZE(CO2), %xmm2 + + movlpd 0 * SIZE(CO1, LDC, 2), %xmm4 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm4 + movlpd 0 * SIZE(CO2, LDC, 2), %xmm6 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm6 +#endif + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm10 + mulpd %xmm7, %xmm11 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm6, %xmm11 +#endif + + movlpd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movlpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + movlpd %xmm10, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm10, 1 * SIZE(CO1, LDC, 2) + movlpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm11, 1 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_3 + +.L30: + testq $1, M + je .L39 + ALIGN_3 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movsd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movsd -8 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movsd 8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + + movsd 16 * SIZE(BO), %xmm5 + movsd 24 * SIZE(BO), %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L35 + ALIGN_3 + +.L32: + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movsd 2 * SIZE(BO), %xmm1 + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm9 + movsd 4 * SIZE(BO), %xmm1 + mulsd %xmm0, %xmm1 + mulsd 6 * SIZE(BO), %xmm0 + addsd %xmm1, %xmm10 + movsd 32 * SIZE(BO), %xmm1 + addsd %xmm0, %xmm11 + movsd -15 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm8 + movsd 10 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm9 + movsd 12 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BO), %xmm0 + addsd %xmm3, %xmm10 + movsd 40 * SIZE(BO), %xmm3 + addsd %xmm0, %xmm11 + movsd -14 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm5 + addsd %xmm5, %xmm8 + movsd 18 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + addsd %xmm5, %xmm9 + movsd 20 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + mulsd 22 * SIZE(BO), %xmm0 + addsd %xmm5, %xmm10 + movsd 48 * SIZE(BO), %xmm5 + addsd %xmm0, %xmm11 + movsd -13 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm7 + addsd %xmm7, %xmm8 + movsd 26 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm7 + addsd %xmm7, %xmm9 + movsd 28 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm7 + mulsd 30 * SIZE(BO), %xmm0 + addsd %xmm7, %xmm10 + movsd 56 * SIZE(BO), %xmm7 + addsd %xmm0, %xmm11 + movsd -12 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm8 + movsd 34 * SIZE(BO), %xmm1 + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm9 + movsd 36 * SIZE(BO), %xmm1 + mulsd %xmm0, %xmm1 + mulsd 38 * SIZE(BO), %xmm0 + addsd %xmm1, %xmm10 + movsd 64 * SIZE(BO), %xmm1 + addsd %xmm0, %xmm11 + movsd -11 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm8 + movsd 42 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm9 + movsd 44 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + mulsd 46 * SIZE(BO), %xmm0 + addsd %xmm3, %xmm10 + movsd 72 * SIZE(BO), %xmm3 + addsd %xmm0, %xmm11 + movsd -10 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm5 + addsd %xmm5, %xmm8 + movsd 50 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + addsd %xmm5, %xmm9 + movsd 52 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + mulsd 54 * SIZE(BO), %xmm0 + addsd %xmm5, %xmm10 + movsd 80 * SIZE(BO), %xmm5 + addsd %xmm0, %xmm11 + movsd -9 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm7 + addsd %xmm7, %xmm8 + movsd 58 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm7 + addsd %xmm7, %xmm9 + movsd 60 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm7 + mulsd 62 * SIZE(BO), %xmm0 + addsd %xmm7, %xmm10 + movsd 88 * SIZE(BO), %xmm7 + addsd %xmm0, %xmm11 + movsd -8 * SIZE(AO), %xmm0 + + addq $ 8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L32 + ALIGN_3 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm8 + movsd 2 * SIZE(BO), %xmm1 + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm9 + movsd 4 * SIZE(BO), %xmm1 + mulsd %xmm0, %xmm1 + mulsd 6 * SIZE(BO), %xmm0 + addsd %xmm1, %xmm10 + movsd 8 * SIZE(BO), %xmm1 + addsd %xmm0, %xmm11 + movsd -15 * SIZE(AO), %xmm0 + + addq $1 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_3 + +.L38: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movsd 0 * SIZE(CO1, LDC, 2), %xmm4 + movsd 0 * SIZE(CO2, LDC, 2), %xmm6 +#endif + + mulsd %xmm7, %xmm8 + mulsd %xmm7, %xmm9 + mulsd %xmm7, %xmm10 + mulsd %xmm7, %xmm11 + +#ifndef TRMMKERNEL + addsd %xmm0, %xmm8 + addsd %xmm2, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm6, %xmm11 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movsd %xmm10, 0 * SIZE(CO1, LDC, 2) + movsd %xmm11, 0 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_3 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leaq (C, LDC, 4), C # c += 4 * ldc + decq J # j -- + jg .L01 + ALIGN_3 + +.L40: + testq $3, N + je .L999 + + testq $2, N + je .L80 + ALIGN_4 + +.L41: +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $2, %rax + jle .L43 + ALIGN_3 + +.L42: + PREFETCH 56 * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + movq 4 * SIZE(B), %mm4 + movq 5 * SIZE(B), %mm5 + movq 6 * SIZE(B), %mm6 + movq 7 * SIZE(B), %mm7 + + addq $ 8 * SIZE, B + addq $16 * SIZE, BO + + movq %mm0, -16 * SIZE(BO) + movq %mm0, -15 * SIZE(BO) + movq %mm1, -14 * SIZE(BO) + movq %mm1, -13 * SIZE(BO) + movq %mm2, -12 * SIZE(BO) + movq %mm2, -11 * SIZE(BO) + movq %mm3, -10 * SIZE(BO) + movq %mm3, -9 * SIZE(BO) + movq %mm4, -8 * SIZE(BO) + movq %mm4, -7 * SIZE(BO) + movq %mm5, -6 * SIZE(BO) + movq %mm5, -5 * SIZE(BO) + movq %mm6, -4 * SIZE(BO) + movq %mm6, -3 * SIZE(BO) + movq %mm7, -2 * SIZE(BO) + movq %mm7, -1 * SIZE(BO) + + decq %rax + jne .L42 + ALIGN_3 + +.L43: + movq K, %rax + andq $3, %rax + BRANCH + jle .L50 + ALIGN_3 + +.L44: + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + + movq %mm0, 0 * SIZE(BO) + movq %mm0, 1 * SIZE(BO) + movq %mm1, 2 * SIZE(BO) + movq %mm1, 3 * SIZE(BO) + + addq $2 * SIZE, B + addq $4 * SIZE, BO + decq %rax + jne .L44 + ALIGN_3 + +.L50: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_3 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm12, %xmm12 + movapd 8 * SIZE(BO), %xmm3 + pxor %xmm13, %xmm13 + + movapd 0 * SIZE(AO), %xmm4 + movapd 16 * SIZE(BO), %xmm5 + movapd 8 * SIZE(AO), %xmm6 + movapd 24 * SIZE(BO), %xmm7 + + PREFETCHW 4 * SIZE(CO1) + PREFETCHW 4 * SIZE(CO2) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L55 + ALIGN_3 + +.L52: + mulpd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd 0 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -14 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm12 + movapd 4 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm13 + movapd -12 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd 4 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm12 + movapd 32 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm13 + movapd 16 * SIZE(AO), %xmm0 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm2, %xmm3 + mulpd 10 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm8 + movapd 8 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm9 + movapd -6 * SIZE(AO), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 10 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd 12 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm13 + movapd -4 * SIZE(AO), %xmm2 + + mulpd %xmm2, %xmm3 + mulpd 14 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm8 + movapd 12 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm9 + movapd -2 * SIZE(AO), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 14 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd 40 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm13 + movapd 24 * SIZE(AO), %xmm2 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + mulpd %xmm4, %xmm5 + mulpd 18 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm8 + movapd 16 * SIZE(BO), %xmm5 + addpd %xmm4, %xmm9 + movapd 2 * SIZE(AO), %xmm4 + mulpd %xmm4, %xmm5 + mulpd 18 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm12 + movapd 20 * SIZE(BO), %xmm5 + addpd %xmm4, %xmm13 + movapd 4 * SIZE(AO), %xmm4 + + mulpd %xmm4, %xmm5 + mulpd 22 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm8 + movapd 20 * SIZE(BO), %xmm5 + addpd %xmm4, %xmm9 + movapd 6 * SIZE(AO), %xmm4 + mulpd %xmm4, %xmm5 + mulpd 22 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm12 + movapd 48 * SIZE(BO), %xmm5 + addpd %xmm4, %xmm13 + movapd 32 * SIZE(AO), %xmm4 + + PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) + mulpd %xmm6, %xmm7 + mulpd 26 * SIZE(BO), %xmm6 + addpd %xmm7, %xmm8 + movapd 24 * SIZE(BO), %xmm7 + addpd %xmm6, %xmm9 + movapd 10 * SIZE(AO), %xmm6 + mulpd %xmm6, %xmm7 + mulpd 26 * SIZE(BO), %xmm6 + addpd %xmm7, %xmm12 + movapd 28 * SIZE(BO), %xmm7 + addpd %xmm6, %xmm13 + movapd 12 * SIZE(AO), %xmm6 + + mulpd %xmm6, %xmm7 + mulpd 30 * SIZE(BO), %xmm6 + addpd %xmm7, %xmm8 + movapd 28 * SIZE(BO), %xmm7 + addpd %xmm6, %xmm9 + movapd 14 * SIZE(AO), %xmm6 + mulpd %xmm6, %xmm7 + mulpd 30 * SIZE(BO), %xmm6 + addpd %xmm7, %xmm12 + movapd 56 * SIZE(BO), %xmm7 + addpd %xmm6, %xmm13 + movapd 40 * SIZE(AO), %xmm6 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L52 + ALIGN_3 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L59 + ALIGN_3 + +.L56: + movapd 0 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm9 + movapd -14 * SIZE(AO), %xmm0 + movapd 0 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm12 + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm13 + movapd -12 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L56 + ALIGN_3 + +.L59: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + movsd 0 * SIZE(CO2), %xmm2 + movhpd 1 * SIZE(CO2), %xmm2 + movsd 2 * SIZE(CO2), %xmm3 + movhpd 3 * SIZE(CO2), %xmm3 +#endif + + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm12 + mulpd %xmm7, %xmm13 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm12 + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm13 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + movsd %xmm13, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L51 + ALIGN_3 + +.L60: + testq $2, M + je .L70 + ALIGN_3 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movapd 8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + + movapd 16 * SIZE(BO), %xmm5 + movapd 24 * SIZE(BO), %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L65 + ALIGN_3 + +.L62: + mulpd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd 4 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -14 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm10 + movapd 32 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm11 + movapd -12 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BO), %xmm0 + addpd %xmm3, %xmm8 + movapd 12 * SIZE(BO), %xmm3 + addpd %xmm0, %xmm9 + movapd -10 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BO), %xmm0 + addpd %xmm3, %xmm10 + movapd 40 * SIZE(BO), %xmm3 + addpd %xmm0, %xmm11 + movapd 0 * SIZE(AO), %xmm0 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm2, %xmm5 + mulpd 18 * SIZE(BO), %xmm2 + addpd %xmm5, %xmm8 + movapd 20 * SIZE(BO), %xmm5 + addpd %xmm2, %xmm9 + movapd -6 * SIZE(AO), %xmm2 + + mulpd %xmm2, %xmm5 + mulpd 22 * SIZE(BO), %xmm2 + addpd %xmm5, %xmm10 + movapd 48 * SIZE(BO), %xmm5 + addpd %xmm2, %xmm11 + movapd -4 * SIZE(AO), %xmm2 + + mulpd %xmm2, %xmm7 + mulpd 26 * SIZE(BO), %xmm2 + addpd %xmm7, %xmm8 + movapd 28 * SIZE(BO), %xmm7 + addpd %xmm2, %xmm9 + movapd -2 * SIZE(AO), %xmm2 + + mulpd %xmm2, %xmm7 + mulpd 30 * SIZE(BO), %xmm2 + addpd %xmm7, %xmm10 + movapd 56 * SIZE(BO), %xmm7 + addpd %xmm2, %xmm11 + movapd 8 * SIZE(AO), %xmm2 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L62 + ALIGN_3 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L69 + ALIGN_3 + +.L66: + mulpd %xmm0, %xmm1 + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd 4 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_3 + +.L69: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movhpd 1 * SIZE(CO2), %xmm2 +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm9 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm9 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_3 + +.L70: + testq $1, M + je .L79 + ALIGN_3 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movsd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movsd -12 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movsd 8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + + movsd 16 * SIZE(BO), %xmm5 + movsd 24 * SIZE(BO), %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L75 + ALIGN_3 + +.L72: + mulsd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulsd 2 * SIZE(BO), %xmm0 + addsd %xmm1, %xmm8 + movsd 4 * SIZE(BO), %xmm1 + addsd %xmm0, %xmm9 + movsd -15 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm1 + mulsd 6 * SIZE(BO), %xmm0 + addsd %xmm1, %xmm10 + movsd 32 * SIZE(BO), %xmm1 + addsd %xmm0, %xmm11 + movsd -14 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm3 + mulsd 10 * SIZE(BO), %xmm0 + addsd %xmm3, %xmm8 + movsd 12 * SIZE(BO), %xmm3 + addsd %xmm0, %xmm9 + movsd -13 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BO), %xmm0 + addsd %xmm3, %xmm10 + movsd 40 * SIZE(BO), %xmm3 + addsd %xmm0, %xmm11 + movsd -8 * SIZE(AO), %xmm0 + + mulsd %xmm2, %xmm5 + mulsd 18 * SIZE(BO), %xmm2 + addsd %xmm5, %xmm8 + movsd 20 * SIZE(BO), %xmm5 + addsd %xmm2, %xmm9 + movsd -11 * SIZE(AO), %xmm2 + + mulsd %xmm2, %xmm5 + mulsd 22 * SIZE(BO), %xmm2 + addsd %xmm5, %xmm10 + movsd 48 * SIZE(BO), %xmm5 + addsd %xmm2, %xmm11 + movsd -10 * SIZE(AO), %xmm2 + + mulsd %xmm2, %xmm7 + mulsd 26 * SIZE(BO), %xmm2 + addsd %xmm7, %xmm8 + movsd 28 * SIZE(BO), %xmm7 + addsd %xmm2, %xmm9 + movsd -9 * SIZE(AO), %xmm2 + + mulsd %xmm2, %xmm7 + mulsd 30 * SIZE(BO), %xmm2 + addsd %xmm7, %xmm10 + movsd 56 * SIZE(BO), %xmm7 + addsd %xmm2, %xmm11 + movsd -4 * SIZE(AO), %xmm2 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L72 + ALIGN_3 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + mulsd %xmm0, %xmm1 + mulsd 2 * SIZE(BO), %xmm0 + addsd %xmm1, %xmm8 + addsd %xmm0, %xmm9 + movsd -15 * SIZE(AO), %xmm0 + movsd 4 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_3 + +.L78: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 +#endif + + addsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + + mulsd %xmm7, %xmm8 + mulsd %xmm7, %xmm9 + +#ifndef TRMMKERNEL + addsd %xmm0, %xmm8 + addsd %xmm2, %xmm9 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_3 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leaq (C, LDC, 2), C + ALIGN_3 + +.L80: + testq $1, N + je .L999 + ALIGN_4 + +.L81: +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $3, %rax + jle .L83 + ALIGN_3 + +.L82: + PREFETCH 56 * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + movq 4 * SIZE(B), %mm4 + movq 5 * SIZE(B), %mm5 + movq 6 * SIZE(B), %mm6 + movq 7 * SIZE(B), %mm7 + + addq $ 8 * SIZE, B + addq $16 * SIZE, BO + + movq %mm0, -16 * SIZE(BO) + movq %mm0, -15 * SIZE(BO) + movq %mm1, -14 * SIZE(BO) + movq %mm1, -13 * SIZE(BO) + movq %mm2, -12 * SIZE(BO) + movq %mm2, -11 * SIZE(BO) + movq %mm3, -10 * SIZE(BO) + movq %mm3, -9 * SIZE(BO) + movq %mm4, -8 * SIZE(BO) + movq %mm4, -7 * SIZE(BO) + movq %mm5, -6 * SIZE(BO) + movq %mm5, -5 * SIZE(BO) + movq %mm6, -4 * SIZE(BO) + movq %mm6, -3 * SIZE(BO) + movq %mm7, -2 * SIZE(BO) + movq %mm7, -1 * SIZE(BO) + + decq %rax + jne .L82 + ALIGN_3 + +.L83: + movq K, %rax + andq $7, %rax + BRANCH + jle .L90 + ALIGN_3 + +.L84: + movq 0 * SIZE(B), %mm0 + + movq %mm0, 0 * SIZE(BO) + movq %mm0, 1 * SIZE(BO) + + addq $1 * SIZE, B + addq $2 * SIZE, BO + decq %rax + jne .L84 + ALIGN_3 + +.L90: + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_3 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movapd 8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + + movapd 0 * SIZE(AO), %xmm4 + movapd 8 * SIZE(AO), %xmm6 + + PREFETCHW 4 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L95 + ALIGN_3 + +.L92: + mulpd %xmm1, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd -14 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movapd 2 * SIZE(BO), %xmm1 + mulpd %xmm1, %xmm0 + mulpd -10 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm10 + movapd 16 * SIZE(AO), %xmm0 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + addpd %xmm1, %xmm11 + movapd 4 * SIZE(BO), %xmm1 + mulpd %xmm1, %xmm2 + mulpd -6 * SIZE(AO), %xmm1 + addpd %xmm2, %xmm8 + movapd -4 * SIZE(AO), %xmm2 + addpd %xmm1, %xmm9 + movapd 6 * SIZE(BO), %xmm1 + mulpd %xmm1, %xmm2 + mulpd -2 * SIZE(AO), %xmm1 + addpd %xmm2, %xmm10 + movapd 24 * SIZE(AO), %xmm2 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm1, %xmm11 + movapd 16 * SIZE(BO), %xmm1 + mulpd %xmm3, %xmm4 + mulpd 2 * SIZE(AO), %xmm3 + addpd %xmm4, %xmm8 + movapd 4 * SIZE(AO), %xmm4 + addpd %xmm3, %xmm9 + movapd 10 * SIZE(BO), %xmm3 + mulpd %xmm3, %xmm4 + mulpd 6 * SIZE(AO), %xmm3 + addpd %xmm4, %xmm10 + movapd 32 * SIZE(AO), %xmm4 + PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) + addpd %xmm3, %xmm11 + movapd 12 * SIZE(BO), %xmm3 + mulpd %xmm3, %xmm6 + mulpd 10 * SIZE(AO), %xmm3 + addpd %xmm6, %xmm8 + movapd 12 * SIZE(AO), %xmm6 + addpd %xmm3, %xmm9 + movapd 14 * SIZE(BO), %xmm3 + mulpd %xmm3, %xmm6 + mulpd 14 * SIZE(AO), %xmm3 + addpd %xmm6, %xmm10 + movapd 40 * SIZE(AO), %xmm6 + addpd %xmm3, %xmm11 + movapd 24 * SIZE(BO), %xmm3 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L92 + ALIGN_3 + +.L95: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L99 + ALIGN_3 + +.L96: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movapd 2 * SIZE(BO), %xmm1 + + addq $4 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_3 + +.L99: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm9 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm9 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L91 + ALIGN_3 + +.L100: + testq $2, M + je .L110 + ALIGN_3 + +.L101: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movapd 8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L105 + ALIGN_3 + +.L102: + mulpd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movapd -14 * SIZE(AO), %xmm0 + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd 16 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -12 * SIZE(AO), %xmm0 + mulpd 4 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm10 + movapd -10 * SIZE(AO), %xmm0 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm11 + movapd 0 * SIZE(AO), %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm2, %xmm3 + movapd -6 * SIZE(AO), %xmm2 + mulpd 10 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm8 + movapd 24 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm9 + movapd -4 * SIZE(AO), %xmm2 + mulpd 12 * SIZE(BO), %xmm2 + addpd %xmm2, %xmm10 + movapd -2 * SIZE(AO), %xmm2 + mulpd 14 * SIZE(BO), %xmm2 + addpd %xmm2, %xmm11 + movapd 8 * SIZE(AO), %xmm2 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L102 + ALIGN_3 + +.L105: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L109 + ALIGN_3 + +.L106: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movapd -14 * SIZE(AO), %xmm0 + movapd 2 * SIZE(BO), %xmm1 + + addq $2 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L106 + ALIGN_3 + +.L109: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + + mulpd %xmm7, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + + addpd %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + addq $2 * SIZE, CO1 # coffset += 4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + ALIGN_3 + +.L110: + testq $1, M + je .L999 + ALIGN_3 + +.L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movsd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movsd -12 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movsd 8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L115 + ALIGN_3 + +.L112: + mulsd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movsd -15 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm8 + movsd 16 * SIZE(BO), %xmm1 + mulsd 2 * SIZE(BO), %xmm0 + addsd %xmm0, %xmm9 + movsd -14 * SIZE(AO), %xmm0 + mulsd 4 * SIZE(BO), %xmm0 + addsd %xmm0, %xmm10 + movsd -13 * SIZE(AO), %xmm0 + mulsd 6 * SIZE(BO), %xmm0 + addsd %xmm0, %xmm11 + movsd -8 * SIZE(AO), %xmm0 + mulsd %xmm2, %xmm3 + movsd -11 * SIZE(AO), %xmm2 + addsd %xmm3, %xmm8 + movsd 24 * SIZE(BO), %xmm3 + mulsd 10 * SIZE(BO), %xmm2 + addsd %xmm2, %xmm9 + movsd -10 * SIZE(AO), %xmm2 + mulsd 12 * SIZE(BO), %xmm2 + addsd %xmm2, %xmm10 + movsd -9 * SIZE(AO), %xmm2 + mulsd 14 * SIZE(BO), %xmm2 + addsd %xmm2, %xmm11 + movsd -4 * SIZE(AO), %xmm2 + + addq $ 8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L112 + ALIGN_3 + +.L115: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_3 + +.L116: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm8 + movsd 2 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_3 + +.L118: + addsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + addsd %xmm9, %xmm8 + + mulsd %xmm7, %xmm8 +#ifndef TRMMKERNEL + addsd 0 * SIZE(CO1), %xmm8 +#endif + movsd %xmm8, 0 * SIZE(CO1) + ALIGN_3 + +.L999: + movq %rbx, %rsp + + EMMS + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_4x4_sse3.S b/kernel/x86_64/gemm_kernel_4x4_sse3.S new file mode 100644 index 0000000..8cbe6ed --- /dev/null +++ b/kernel/x86_64/gemm_kernel_4x4_sse3.S @@ -0,0 +1,2561 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbx +#define BB %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KKK 64(%rsp) +#define KK 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define ALPHA 224(%rsp) +#define OFFSET 232(%rsp) +#define KK 240(%rsp) +#define KKK 248(%rsp) + +#endif + +#define PREFETCH prefetcht1 +#define PREFETCHSIZE (16 * 12 + 3) +#define PREFETCH_R (4 * 4 + 0) + +#define KERNEL1(address) \ + mulpd %xmm8, %xmm9 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ + addpd %xmm9, %xmm0;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm1;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm2;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm3;\ + movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL2(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm4;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm5;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm6;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm7;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL3(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm0;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm1;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm2;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm3;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL4(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm4;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm5;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm6;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm7;\ + movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL5(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm0;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm1;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm2;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm3;\ + movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL6(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm4;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm5;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm6;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm7;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL7(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm0;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm1;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm2;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm3;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL8(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm4;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm5;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm6;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm7;\ + movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL9(address) \ + mulpd %xmm12, %xmm13;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ + addpd %xmm13, %xmm0;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm1;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm2;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm3;\ + movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL10(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm4;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm5;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm6;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm7;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL11(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm0;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm1;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm2;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm3;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL12(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm4;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm5;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm6;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm7;\ + movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL13(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm0;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm1;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm2;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm3;\ + movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL14(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm4;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm5;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm6;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm7;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL15(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm0;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm1;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm2;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm3;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL16(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm4;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm5;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm6;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm7;\ + movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + +#endif + + movsd %xmm0, ALPHA + +#ifdef TRMMKERNEL + movsd %xmm4, OFFSET + movsd %xmm4, KK +#ifndef LEFT + negq KK +#endif +#endif + + leaq (, LDC, SIZE), LDC + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + ALIGN_4 + +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq K, %rax + salq $BASE_SHIFT + 2, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(AO), %xmm12 + pxor %xmm4, %xmm4 + movddup 16 * SIZE(BO), %xmm13 + pxor %xmm5, %xmm5 + movapd 24 * SIZE(AO), %xmm14 + pxor %xmm6, %xmm6 + movddup 24 * SIZE(BO), %xmm15 + pxor %xmm7, %xmm7 + + prefetchnta 3 * SIZE(CO1) + prefetchnta 3 * SIZE(CO2) + prefetchnta 3 * SIZE(CO1, LDC, 2) + prefetchnta 3 * SIZE(CO2, LDC, 2) + + prefetcht0 0 * SIZE(BB) + subq $-8 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + +#if 1 + andq $-8, %rax + salq $4, %rax + NOBRANCH + je .L15 + +.L1X: + KERNEL1 (16 * 0) + KERNEL2 (16 * 0) + KERNEL3 (16 * 0) + KERNEL4 (16 * 0) + KERNEL5 (16 * 0) + KERNEL6 (16 * 0) + KERNEL7 (16 * 0) + KERNEL8 (16 * 0) + KERNEL9 (16 * 0) + KERNEL10(16 * 0) + KERNEL11(16 * 0) + KERNEL12(16 * 0) + KERNEL13(16 * 0) + KERNEL14(16 * 0) + KERNEL15(16 * 0) + KERNEL16(16 * 0) + cmpq $128 * 1, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 1) + KERNEL2 (16 * 1) + KERNEL3 (16 * 1) + KERNEL4 (16 * 1) + KERNEL5 (16 * 1) + KERNEL6 (16 * 1) + KERNEL7 (16 * 1) + KERNEL8 (16 * 1) + KERNEL9 (16 * 1) + KERNEL10(16 * 1) + KERNEL11(16 * 1) + KERNEL12(16 * 1) + KERNEL13(16 * 1) + KERNEL14(16 * 1) + KERNEL15(16 * 1) + KERNEL16(16 * 1) + cmpq $128 * 2, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 2) + KERNEL2 (16 * 2) + KERNEL3 (16 * 2) + KERNEL4 (16 * 2) + KERNEL5 (16 * 2) + KERNEL6 (16 * 2) + KERNEL7 (16 * 2) + KERNEL8 (16 * 2) + KERNEL9 (16 * 2) + KERNEL10(16 * 2) + KERNEL11(16 * 2) + KERNEL12(16 * 2) + KERNEL13(16 * 2) + KERNEL14(16 * 2) + KERNEL15(16 * 2) + KERNEL16(16 * 2) + cmpq $128 * 3, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 3) + KERNEL2 (16 * 3) + KERNEL3 (16 * 3) + KERNEL4 (16 * 3) + KERNEL5 (16 * 3) + KERNEL6 (16 * 3) + KERNEL7 (16 * 3) + KERNEL8 (16 * 3) + KERNEL9 (16 * 3) + KERNEL10(16 * 3) + KERNEL11(16 * 3) + KERNEL12(16 * 3) + KERNEL13(16 * 3) + KERNEL14(16 * 3) + KERNEL15(16 * 3) + KERNEL16(16 * 3) + cmpq $128 * 4, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 4) + KERNEL2 (16 * 4) + KERNEL3 (16 * 4) + KERNEL4 (16 * 4) + KERNEL5 (16 * 4) + KERNEL6 (16 * 4) + KERNEL7 (16 * 4) + KERNEL8 (16 * 4) + KERNEL9 (16 * 4) + KERNEL10(16 * 4) + KERNEL11(16 * 4) + KERNEL12(16 * 4) + KERNEL13(16 * 4) + KERNEL14(16 * 4) + KERNEL15(16 * 4) + KERNEL16(16 * 4) + cmpq $128 * 5, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 5) + KERNEL2 (16 * 5) + KERNEL3 (16 * 5) + KERNEL4 (16 * 5) + KERNEL5 (16 * 5) + KERNEL6 (16 * 5) + KERNEL7 (16 * 5) + KERNEL8 (16 * 5) + KERNEL9 (16 * 5) + KERNEL10(16 * 5) + KERNEL11(16 * 5) + KERNEL12(16 * 5) + KERNEL13(16 * 5) + KERNEL14(16 * 5) + KERNEL15(16 * 5) + KERNEL16(16 * 5) + cmpq $128 * 6, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 6) + KERNEL2 (16 * 6) + KERNEL3 (16 * 6) + KERNEL4 (16 * 6) + KERNEL5 (16 * 6) + KERNEL6 (16 * 6) + KERNEL7 (16 * 6) + KERNEL8 (16 * 6) + KERNEL9 (16 * 6) + KERNEL10(16 * 6) + KERNEL11(16 * 6) + KERNEL12(16 * 6) + KERNEL13(16 * 6) + KERNEL14(16 * 6) + KERNEL15(16 * 6) + KERNEL16(16 * 6) + cmpq $128 * 7, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 7) + KERNEL2 (16 * 7) + KERNEL3 (16 * 7) + KERNEL4 (16 * 7) + KERNEL5 (16 * 7) + KERNEL6 (16 * 7) + KERNEL7 (16 * 7) + KERNEL8 (16 * 7) + KERNEL9 (16 * 7) + KERNEL10(16 * 7) + KERNEL11(16 * 7) + KERNEL12(16 * 7) + KERNEL13(16 * 7) + KERNEL14(16 * 7) + KERNEL15(16 * 7) + KERNEL16(16 * 7) + + addq $32 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $128 * 8, %rax + BRANCH + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 2), BO # * 64 + +#else + sarq $3, %rax + je .L15 + ALIGN_4 + +.L12: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm5 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm6 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm7 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm5 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm6 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm7 + + movddup 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + + movddup 8 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm7 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm7 + movddup 40 * SIZE(BO), %xmm11 + + mulpd %xmm12, %xmm13 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm13, %xmm0 + movddup 17 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm1 + movddup 18 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm2 + movddup 19 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 18 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm3 + + movddup 16 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm4 + movddup 17 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm5 + movddup 18 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm6 + movddup 19 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 20 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm7 + + movddup 20 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm0 + movddup 21 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm1 + movddup 22 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm2 + movddup 23 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 22 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm3 + + movddup 20 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm4 + movddup 21 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm5 + movddup 22 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm6 + movddup 23 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 48 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm7 + movddup 48 * SIZE(BO), %xmm13 + + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm0 + movddup 25 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm1 + movddup 26 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm2 + movddup 27 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 26 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm3 + + movddup 24 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm4 + movddup 25 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm5 + movddup 26 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm6 + movddup 27 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 28 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm7 + + movddup 28 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm0 + movddup 29 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm1 + movddup 30 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm2 + movddup 31 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 30 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm3 + + movddup 28 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm4 + movddup 29 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm5 + movddup 30 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm6 + movddup 31 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 56 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm7 + movddup 56 * SIZE(BO), %xmm15 + + addq $32 * SIZE, BO + addq $32 * SIZE, AO + decq %rax + BRANCH + jne .L12 +#endif + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movddup ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + BRANCH + je .L19 + ALIGN_4 + +.L16: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 2 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 3 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm7 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + BRANCH + jg .L16 + ALIGN_4 + +.L19: + mulpd %xmm15, %xmm0 + mulpd %xmm15, %xmm4 + mulpd %xmm15, %xmm1 + mulpd %xmm15, %xmm5 + + testq $15, CO1 + NOBRANCH + jne .L19x + testq $15, LDC + NOBRANCH + jne .L19x + + mulpd %xmm15, %xmm2 + mulpd %xmm15, %xmm3 + mulpd %xmm15, %xmm6 + mulpd %xmm15, %xmm7 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd 0 * SIZE(CO1), %xmm0 + addpd 2 * SIZE(CO1), %xmm4 + addpd 0 * SIZE(CO2), %xmm1 + addpd 2 * SIZE(CO2), %xmm5 + + addpd 0 * SIZE(CO1, LDC, 2), %xmm2 + addpd 2 * SIZE(CO1, LDC, 2), %xmm6 + addpd 0 * SIZE(CO2, LDC, 2), %xmm3 + addpd 2 * SIZE(CO2, LDC, 2), %xmm7 +#endif + + movapd %xmm0, 0 * SIZE(CO1) + movapd %xmm4, 2 * SIZE(CO1) + movapd %xmm1, 0 * SIZE(CO2) + movapd %xmm5, 2 * SIZE(CO2) + + movapd %xmm2, 0 * SIZE(CO1, LDC, 2) + movapd %xmm6, 2 * SIZE(CO1, LDC, 2) + movapd %xmm3, 0 * SIZE(CO2, LDC, 2) + movapd %xmm7, 2 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + + decq I # i -- + jg .L11 + jmp .L20 + ALIGN_4 + +.L19x: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + movsd 2 * SIZE(CO1), %xmm9 + movhpd 3 * SIZE(CO1), %xmm9 + + movsd 0 * SIZE(CO2), %xmm10 + movhpd 1 * SIZE(CO2), %xmm10 + movsd 2 * SIZE(CO2), %xmm11 + movhpd 3 * SIZE(CO2), %xmm11 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm4 + addpd %xmm10, %xmm1 + addpd %xmm11, %xmm5 +#endif + + mulpd %xmm15, %xmm2 + mulpd %xmm15, %xmm3 + mulpd %xmm15, %xmm6 + mulpd %xmm15, %xmm7 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1, LDC, 2), %xmm12 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm12 + movsd 2 * SIZE(CO1, LDC, 2), %xmm13 + movhpd 3 * SIZE(CO1, LDC, 2), %xmm13 + + movsd 0 * SIZE(CO2, LDC, 2), %xmm14 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm14 + movsd 2 * SIZE(CO2, LDC, 2), %xmm15 + movhpd 3 * SIZE(CO2, LDC, 2), %xmm15 + + addpd %xmm12, %xmm2 + addpd %xmm13, %xmm6 + addpd %xmm14, %xmm3 + addpd %xmm15, %xmm7 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm4, 2 * SIZE(CO1) + movhpd %xmm4, 3 * SIZE(CO1) + + movsd %xmm1, 0 * SIZE(CO2) + movhpd %xmm1, 1 * SIZE(CO2) + movsd %xmm5, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) + + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) + movsd %xmm6, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm6, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm3, 1 * SIZE(CO2, LDC, 2) + movsd %xmm7, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + je .L30 + ALIGN_4 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 17 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm1 + movddup 18 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm2 + movddup 19 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm3 + movddup 20 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 21 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm1 + movddup 22 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm2 + movddup 23 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm3 + movddup 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 25 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 26 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 27 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 28 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 29 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 30 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 31 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 40 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movddup ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L29 + ALIGN_4 + +.L26: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L29: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + movsd 0 * SIZE(CO2), %xmm10 + movhpd 1 * SIZE(CO2), %xmm10 + movsd 0 * SIZE(CO1, LDC, 2), %xmm12 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm12 + movsd 0 * SIZE(CO2, LDC, 2), %xmm14 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm14 +#endif + + mulpd %xmm15, %xmm0 + mulpd %xmm15, %xmm1 + mulpd %xmm15, %xmm2 + mulpd %xmm15, %xmm3 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm8, %xmm0 + addpd %xmm10, %xmm1 + addpd %xmm12, %xmm2 + addpd %xmm14, %xmm3 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movhpd %xmm1, 1 * SIZE(CO2) + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm3, 1 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $1, M + je .L39 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movddup 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movapd 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movddup 3 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movapd 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movapd 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movddup 8 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movapd 24 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movapd 18 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movddup 5 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movapd 20 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movapd 22 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movddup 6 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movapd 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movapd 26 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movddup 7 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movapd 28 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movapd 30 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movddup 12 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movapd 40 * SIZE(BO), %xmm11 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movddup ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhpd 0 * SIZE(CO2), %xmm8 + movsd 0 * SIZE(CO1, LDC, 2), %xmm9 + movhpd 0 * SIZE(CO2, LDC, 2), %xmm9 +#endif + mulpd %xmm15, %xmm0 + mulpd %xmm15, %xmm1 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 0 * SIZE(CO2) + movsd %xmm1, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm1, 0 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leaq (C, LDC, 4), C # c += 4 * ldc + movq BO, B + decq J # j -- + jg .L10 + ALIGN_4 + +.L40: + testq $2, N + je .L80 + ALIGN_4 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + prefetcht0 0 * SIZE(BB) + subq $-4 * SIZE, BB + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + +#ifdef HAVE_3DNOW + prefetchw 4 * SIZE(CO1) + prefetchw 4 * SIZE(CO2) +#else + prefetchnta 4 * SIZE(CO1) + prefetchnta 4 * SIZE(CO2) +#endif + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L55 + ALIGN_4 + +.L52: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm5 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm5 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm5 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm5 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 18 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 8 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 20 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 22 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 24 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 26 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 28 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 30 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 24 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movddup ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L59 + ALIGN_4 + +.L56: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L56 + ALIGN_4 + +.L59: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + movsd 2 * SIZE(CO1), %xmm9 + movhpd 3 * SIZE(CO1), %xmm9 + movsd 0 * SIZE(CO2), %xmm10 + movhpd 1 * SIZE(CO2), %xmm10 + movsd 2 * SIZE(CO2), %xmm11 + movhpd 3 * SIZE(CO2), %xmm11 +#endif + + mulpd %xmm15, %xmm0 + mulpd %xmm15, %xmm1 + mulpd %xmm15, %xmm4 + mulpd %xmm15, %xmm5 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm4 + addpd %xmm10, %xmm1 + addpd %xmm11, %xmm5 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm4, 2 * SIZE(CO1) + movhpd %xmm4, 3 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movhpd %xmm1, 1 * SIZE(CO2) + movsd %xmm5, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + ALIGN_4 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movddup ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L69 + ALIGN_4 + +.L66: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L69: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + movsd 0 * SIZE(CO2), %xmm10 + movhpd 1 * SIZE(CO2), %xmm10 +#endif + + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + mulpd %xmm15, %xmm0 + mulpd %xmm15, %xmm1 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm8, %xmm0 + addpd %xmm10, %xmm1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movhpd %xmm1, 1 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L70: + testq $1, M + je .L79 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movddup 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + movapd 16 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movddup 2 * SIZE(AO), %xmm8 + mulpd 4 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm2 + movddup 3 * SIZE(AO), %xmm8 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm3 + movddup 8 * SIZE(AO), %xmm8 + mulpd %xmm10, %xmm11 + movddup 5 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm0 + mulpd 10 * SIZE(BO), %xmm10 + movapd 24 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm1 + movddup 6 * SIZE(AO), %xmm10 + mulpd 12 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm2 + movddup 7 * SIZE(AO), %xmm10 + mulpd 14 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm3 + movddup 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movddup ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhpd 0 * SIZE(CO2), %xmm8 +#endif + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + + mulpd %xmm15, %xmm0 +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm8, %xmm0 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leaq (C, LDC, 2), C + movq BO, B + ALIGN_4 + +.L80: + testq $1, N + je .L999 + ALIGN_4 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + movq A, AO + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 4 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifdef HAVE_3DNOW + prefetchw 4 * SIZE(CO1) +#else + prefetchnta 4 * SIZE(CO1) +#endif + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulpd %xmm9, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm8 + mulpd 6 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm2 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 10 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm0 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 14 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm2 + movapd 24 * SIZE(AO), %xmm10 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm9, %xmm3 + movddup 8 * SIZE(BO), %xmm9 + mulpd %xmm11, %xmm8 + mulpd 18 * SIZE(AO), %xmm11 + addpd %xmm8, %xmm0 + movapd 20 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 5 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm8 + mulpd 22 * SIZE(AO), %xmm11 + addpd %xmm8, %xmm2 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 6 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm10 + mulpd 26 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm0 + movapd 28 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 7 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm10 + mulpd 30 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm2 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movddup ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L99 + ALIGN_4 + +.L96: + mulpd %xmm9, %xmm8 + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 1 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L99: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + movsd 2 * SIZE(CO1), %xmm9 + movhpd 3 * SIZE(CO1), %xmm9 +#endif + + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + mulpd %xmm15, %xmm0 + mulpd %xmm15, %xmm1 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + ALIGN_4 + +.L101: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 4 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L105 + ALIGN_4 + +.L102: + mulpd %xmm9, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movddup 1 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm0 + mulpd 2 * SIZE(AO), %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd 4 * SIZE(AO), %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd 6 * SIZE(AO), %xmm9 + addpd %xmm9, %xmm3 + movddup 8 * SIZE(BO), %xmm9 + mulpd %xmm11, %xmm10 + movddup 5 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm0 + mulpd 10 * SIZE(AO), %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 6 * SIZE(BO), %xmm11 + mulpd 12 * SIZE(AO), %xmm11 + addpd %xmm11, %xmm2 + movddup 7 * SIZE(BO), %xmm11 + mulpd 14 * SIZE(AO), %xmm11 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $ 8 * SIZE, BO + decq %rax + jne .L102 + ALIGN_4 + +.L105: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movddup ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L109 + ALIGN_4 + +.L106: + mulpd %xmm9, %xmm8 + movddup 1 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm0 + movapd 2 * SIZE(AO), %xmm8 + + addq $2 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L106 + ALIGN_4 + +.L109: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 +#endif + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + + mulpd %xmm15, %xmm0 +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm8, %xmm0 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L110: + testq $1, M + je .L999 + ALIGN_4 + +.L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movsd 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movsd 4 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 0 * SIZE(AO), %xmm9 + movapd 0 * SIZE(BO), %xmm8 + movapd 4 * SIZE(AO), %xmm11 + movapd 4 * SIZE(BO), %xmm10 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulpd %xmm9, %xmm8 + movapd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + mulpd 2 * SIZE(BO), %xmm9 + movapd 8 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm1 + movapd 8 * SIZE(AO), %xmm9 + mulpd %xmm11, %xmm10 + movapd 6 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm0 + mulpd 6 * SIZE(BO), %xmm11 + movapd 12 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm1 + movapd 12 * SIZE(AO), %xmm11 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movddup ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulsd 0 * SIZE(BO), %xmm9 + addsd %xmm9, %xmm0 + movsd 1 * SIZE(AO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 +#endif + + addpd %xmm1, %xmm0 + haddpd %xmm0, %xmm0 + mulsd %xmm15, %xmm0 +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addsd %xmm8, %xmm0 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_4x8_nano.S b/kernel/x86_64/gemm_kernel_4x8_nano.S new file mode 100644 index 0000000..4d81405 --- /dev/null +++ b/kernel/x86_64/gemm_kernel_4x8_nano.S @@ -0,0 +1,2479 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi + +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define BUFFER 256(%rsp) + +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define RPREFETCHSIZE (16 * 4) +#define PREFETCHSIZE (16 * 8 + 8) + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + shufps $0, %xmm0, %xmm0 + movaps %xmm0, ALPHA + +#ifdef TRMMKERNEL + movsd %xmm4, OFFSET + movsd %xmm4, KK +#ifndef LEFT + negq KK +#endif +#endif + + subq $-32 * SIZE, A + + salq $BASE_SHIFT, LDC + + movq N, J + sarq $3, J + jle .L40 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + leaq 32 * SIZE + BUFFER, BO + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm3 + movaps 8 * SIZE(B), %xmm5 + movaps 12 * SIZE(B), %xmm7 + + movq K, %rax + sarq $1, %rax + jle .L03 + ALIGN_4 + +.L02: + PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) + + pshufd $0x50, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(BO) + pshufd $0xfa, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(BO) + + movaps 16 * SIZE(B), %xmm1 + + pshufd $0x50, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(BO) + pshufd $0xfa, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(BO) + + movaps 20 * SIZE(B), %xmm3 + + pshufd $0x50, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(BO) + pshufd $0xfa, %xmm5, %xmm5 + movaps %xmm5, -12 * SIZE(BO) + + movaps 24 * SIZE(B), %xmm5 + + pshufd $0x50, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(BO) + pshufd $0xfa, %xmm7, %xmm7 + movaps %xmm7, -4 * SIZE(BO) + + movaps 28 * SIZE(B), %xmm7 + + addq $16 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L02 + ALIGN_4 + +.L03: + movq K, %rax + andq $1, %rax + BRANCH + jle .L10 + + pshufd $0x50, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(BO) + pshufd $0xfa, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(BO) + + pshufd $0x50, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(BO) + pshufd $0xfa, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-16 * SIZE, BO + ALIGN_4 + +.L10: + movq C, CO1 + leaq (C, LDC, 4), CO2 + movq A, AO + + leaq (RPREFETCHSIZE + 0) * SIZE(B), BB + + movq M, I + sarq $2, I + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + salq $BASE_SHIFT + 1, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + leaq (LDC, LDC, 2), %rax + + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm1 + + pxor %xmm8, %xmm8 + PREFETCHW 3 * SIZE(CO1) + pxor %xmm9, %xmm9 + PREFETCHW 5 * SIZE(CO1, LDC, 1) + pxor %xmm10, %xmm10 + PREFETCHW 3 * SIZE(CO1, LDC, 2) + pxor %xmm11, %xmm11 + PREFETCHW 5 * SIZE(CO1, %rax) + + pxor %xmm12, %xmm12 + PREFETCHW 3 * SIZE(CO2) + pxor %xmm13, %xmm13 + PREFETCHW 5 * SIZE(CO2, LDC, 1) + pxor %xmm14, %xmm14 + PREFETCHW 3 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 + PREFETCHW 5 * SIZE(CO2, %rax) + + PREFETCH -32 * SIZE(BB) + addq $16 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $8, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L16 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0)(AO) + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + movaps %xmm0, %xmm2 + movaps -16 * SIZE(AO), %xmm0 + addps %xmm3, %xmm9 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm10 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm11 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm12 + movaps -20 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm13 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm14 + movaps -16 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + movaps -28 * SIZE(AO), %xmm2 + addps %xmm3, %xmm15 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm9 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm10 + movaps -8 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm11 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm12 + movaps -4 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm13 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm14 + movaps 0 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + movaps -24 * SIZE(AO), %xmm2 + addps %xmm3, %xmm15 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm8 + movaps 4 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm9 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm10 + movaps 8 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm11 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm12 + movaps 12 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm13 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm14 + movaps 16 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + movaps -20 * SIZE(AO), %xmm2 + addps %xmm3, %xmm15 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm8 + movaps 20 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm9 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm10 + movaps 24 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm11 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm12 + movaps 28 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm13 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm14 + movaps 32 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm15 + + subq $-16 * SIZE, AO + addq $ 64 * SIZE, BO + decq %rax + BRANCH + jg .L12 + +.L16: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + je .L18 + ALIGN_4 + +.L17: + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm10 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm11 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm12 + movaps -20 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm13 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm14 + movaps -16 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AO), %xmm0 + addps %xmm3, %xmm15 + + addq $ 4 * SIZE, AO + subq $-16 * SIZE, BO + decq %rax + jg .L17 + ALIGN_4 + +.L18: + leaq (LDC, LDC, 2), %rax + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm11 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1, LDC, 1), %xmm0 + movsd 0 * SIZE(CO1, LDC, 1), %xmm1 + movhps 2 * SIZE(CO1), %xmm1 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm2 + movhps 2 * SIZE(CO1, %rax), %xmm2 + movsd 0 * SIZE(CO1, %rax), %xmm3 + movhps 2 * SIZE(CO1, LDC, 2), %xmm3 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm9 + addps %xmm2, %xmm10 + addps %xmm3, %xmm11 +#endif + + mulps %xmm7, %xmm12 + mulps %xmm7, %xmm13 + mulps %xmm7, %xmm14 + mulps %xmm7, %xmm15 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO2), %xmm4 + movhps 2 * SIZE(CO2, LDC, 1), %xmm4 + movsd 0 * SIZE(CO2, LDC, 1), %xmm5 + movhps 2 * SIZE(CO2), %xmm5 + + movsd 0 * SIZE(CO2, LDC, 2), %xmm6 + movhps 2 * SIZE(CO2, %rax), %xmm6 + movsd 0 * SIZE(CO2, %rax), %xmm7 + movhps 2 * SIZE(CO2, LDC, 2), %xmm7 + + addps %xmm4, %xmm12 + addps %xmm5, %xmm13 + addps %xmm6, %xmm14 + addps %xmm7, %xmm15 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1, LDC, 1) + movlps %xmm9, 0 * SIZE(CO1, LDC, 1) + movhps %xmm9, 2 * SIZE(CO1) + + movlps %xmm10, 0 * SIZE(CO1, LDC, 2) + movhps %xmm10, 2 * SIZE(CO1, %rax) + movlps %xmm11, 0 * SIZE(CO1, %rax) + movhps %xmm11, 2 * SIZE(CO1, LDC, 2) + + movlps %xmm12, 0 * SIZE(CO2) + movhps %xmm12, 2 * SIZE(CO2, LDC, 1) + movlps %xmm13, 0 * SIZE(CO2, LDC, 1) + movhps %xmm13, 2 * SIZE(CO2) + + movlps %xmm14, 0 * SIZE(CO2, LDC, 2) + movhps %xmm14, 2 * SIZE(CO2, %rax) + movlps %xmm15, 0 * SIZE(CO2, %rax) + movhps %xmm15, 2 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT + 1, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + decq I + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + salq $BASE_SHIFT + 1, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movddup -32 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $8, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L26 + ALIGN_3 + +.L22: + PREFETCH (PREFETCHSIZE + 0)(AO) + + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps -16 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movddup -30 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -8 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + mulps -4 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps 0 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movddup -28 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps 4 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps 8 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + mulps 12 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps 16 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movddup -26 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps 20 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps 24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + mulps 28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps 32 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movddup -24 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + BRANCH + jg .L22 + +.L26: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + je .L28 + ALIGN_4 + +.L27: + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps -16 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movddup -30 * SIZE(AO), %xmm0 + + addq $ 2 * SIZE, AO + subq $-16 * SIZE, BO + decq %rax + jg .L27 + ALIGN_4 + +.L28: + leaq (LDC, LDC, 2), %rax + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm11 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + movhps (CO1, LDC, 1), %xmm0 + movsd (CO1, LDC, 2), %xmm1 + movhps (CO1, %rax), %xmm1 + + movsd (CO2), %xmm2 + movhps (CO2, LDC, 1), %xmm2 + movsd (CO2, LDC, 2), %xmm3 + movhps (CO2, %rax), %xmm3 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm9 + addps %xmm2, %xmm10 + addps %xmm3, %xmm11 +#endif + + movlps %xmm8, (CO1) + movhps %xmm8, (CO1, LDC, 1) + + movlps %xmm9, (CO1, LDC, 2) + movhps %xmm9, (CO1, %rax) + + movlps %xmm10, (CO2) + movhps %xmm10, (CO2, LDC, 1) + + movlps %xmm11, (CO2, LDC, 2) + movhps %xmm11, (CO2, %rax) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT + 1, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + addq %rax, %rax + leaq (BO, %rax, 8), BO +#endif + + movss -32 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $8, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L36 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0)(AO) + + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps -16 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movss -31 * SIZE(AO), %xmm0 + + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -8 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + mulps -4 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps 0 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movss -30 * SIZE(AO), %xmm0 + + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps 4 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps 8 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + mulps 12 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps 16 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movss -29 * SIZE(AO), %xmm0 + + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps 20 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps 24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + mulps 28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps 32 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movss -28 * SIZE(AO), %xmm0 + + subq $-4 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + BRANCH + jg .L32 + +.L36: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + je .L38 + ALIGN_4 + +.L37: + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps -16 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movss -31 * SIZE(AO), %xmm0 + + addq $ 1 * SIZE, AO + subq $-16 * SIZE, BO + decq %rax + jg .L37 + ALIGN_4 + +.L38: + leaq (LDC, LDC, 2), %rax + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm11 + + movhlps %xmm8, %xmm12 + movhlps %xmm9, %xmm13 + movhlps %xmm10, %xmm14 + movhlps %xmm11, %xmm15 + +#ifndef TRMMKERNEL + addss (CO1), %xmm8 + addss (CO1, LDC, 1), %xmm12 + addss (CO1, LDC, 2), %xmm9 + addss (CO1, %rax), %xmm13 + + addss (CO2), %xmm10 + addss (CO2, LDC, 1), %xmm14 + addss (CO2, LDC, 2), %xmm11 + addss (CO2, %rax), %xmm15 +#endif + + movss %xmm8, (CO1) + movss %xmm12, (CO1, LDC, 1) + + movss %xmm9, (CO1, LDC, 2) + movss %xmm13, (CO1, %rax) + + movss %xmm10, (CO2) + movss %xmm14, (CO2, LDC, 1) + + movss %xmm11, (CO2, LDC, 2) + movss %xmm15, (CO2, %rax) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + addq %rax, %rax + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $8, KK +#endif + leaq (C, LDC, 8), C + decq J + jg .L01 + ALIGN_4 + +.L40: + testq $4, N + jle .L80 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + leaq 32 * SIZE + BUFFER, BO + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm3 + movaps 8 * SIZE(B), %xmm5 + movaps 12 * SIZE(B), %xmm7 + + movq K, %rax + sarq $2, %rax + jle .L43 + ALIGN_4 + +.L42: + PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) + + pshufd $0x50, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(BO) + pshufd $0xfa, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(BO) + + movaps 16 * SIZE(B), %xmm1 + + pshufd $0x50, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(BO) + pshufd $0xfa, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(BO) + + movaps 20 * SIZE(B), %xmm3 + + pshufd $0x50, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(BO) + pshufd $0xfa, %xmm5, %xmm5 + movaps %xmm5, -12 * SIZE(BO) + + movaps 24 * SIZE(B), %xmm5 + + pshufd $0x50, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(BO) + pshufd $0xfa, %xmm7, %xmm7 + movaps %xmm7, -4 * SIZE(BO) + + movaps 28 * SIZE(B), %xmm7 + + addq $16 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L42 + ALIGN_4 + +.L43: + movq K, %rax + andq $3, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L45: + pshufd $0x50, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(BO) + pshufd $0xfa, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(BO) + movaps 4 * SIZE(B), %xmm1 + + addq $ 4 * SIZE, B + subq $-8 * SIZE, BO + decq %rax + jne .L45 + ALIGN_4 + +.L50: + movq C, CO1 + leaq (C, LDC, 2), CO2 + movq A, AO + + movq M, I + sarq $2, I + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + salq $BASE_SHIFT + 1, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm1 + + pxor %xmm8, %xmm8 + PREFETCHW 3 * SIZE(CO1) + pxor %xmm9, %xmm9 + PREFETCHW 5 * SIZE(CO1, LDC) + pxor %xmm10, %xmm10 + PREFETCHW 3 * SIZE(CO2) + pxor %xmm11, %xmm11 + PREFETCHW 5 * SIZE(CO2, LDC) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L56 + ALIGN_3 + +.L52: + PREFETCH (PREFETCHSIZE + 0)(AO) + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm10 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AO), %xmm0 + addps %xmm3, %xmm11 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm10 + movaps -16 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + movaps -24 * SIZE(AO), %xmm0 + addps %xmm3, %xmm11 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm10 + movaps -8 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + movaps -20 * SIZE(AO), %xmm0 + addps %xmm3, %xmm11 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm10 + movaps 0 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + movaps -16 * SIZE(AO), %xmm0 + addps %xmm3, %xmm11 + + subq $-16 * SIZE, AO + subq $-32 * SIZE, BO + decq %rax + BRANCH + jg .L52 + +.L56: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + je .L58 + ALIGN_4 + +.L57: + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm10 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AO), %xmm0 + addps %xmm3, %xmm11 + + addq $ 4 * SIZE, AO + subq $-8 * SIZE, BO + decq %rax + jg .L57 + ALIGN_4 + +.L58: + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm11 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1, LDC), %xmm0 + movsd 0 * SIZE(CO1, LDC), %xmm1 + movhps 2 * SIZE(CO1), %xmm1 + + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2, LDC), %xmm2 + movsd 0 * SIZE(CO2, LDC), %xmm3 + movhps 2 * SIZE(CO2), %xmm3 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm9 + addps %xmm2, %xmm10 + addps %xmm3, %xmm11 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1, LDC) + movlps %xmm9, 0 * SIZE(CO1, LDC) + movhps %xmm9, 2 * SIZE(CO1) + + movlps %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2, LDC) + movlps %xmm11, 0 * SIZE(CO2, LDC) + movhps %xmm11, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT + 1, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + decq I + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + salq $BASE_SHIFT + 1, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movddup -32 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L66 + ALIGN_3 + +.L62: + PREFETCH (PREFETCHSIZE + 0)(AO) + + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movddup -30 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movddup -28 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm1 + mulps -12 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -8 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movddup -26 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm1 + mulps -4 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps 0 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movddup -24 * SIZE(AO), %xmm0 + + subq $ -8 * SIZE, AO + subq $-32 * SIZE, BO + decq %rax + BRANCH + jg .L62 + +.L66: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + je .L68 + ALIGN_4 + +.L67: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movddup -30 * SIZE(AO), %xmm0 + + addq $ 2 * SIZE, AO + subq $-8 * SIZE, BO + decq %rax + jg .L67 + ALIGN_4 + +.L68: + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + movhps (CO1, LDC), %xmm0 + + movsd (CO2), %xmm1 + movhps (CO2, LDC), %xmm1 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm9 +#endif + + movlps %xmm8, (CO1) + movhps %xmm8, (CO1, LDC) + + movlps %xmm9, (CO2) + movhps %xmm9, (CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT + 1, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L70: + testq $1, M + je .L79 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movss -32 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L76 + ALIGN_3 + +.L72: + PREFETCH (PREFETCHSIZE + 0)(AO) + + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movss -31 * SIZE(AO), %xmm0 + + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movss -30 * SIZE(AO), %xmm0 + + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + mulps -12 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -8 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movss -29 * SIZE(AO), %xmm0 + + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + mulps -4 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps 0 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movss -28 * SIZE(AO), %xmm0 + + subq $ -4 * SIZE, AO + subq $-32 * SIZE, BO + decq %rax + BRANCH + jg .L72 + +.L76: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + je .L78 + ALIGN_4 + +.L77: + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movss -31 * SIZE(AO), %xmm0 + + addq $ 1 * SIZE, AO + subq $-8 * SIZE, BO + decq %rax + jg .L77 + ALIGN_4 + +.L78: + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + + movhlps %xmm8, %xmm10 + movhlps %xmm9, %xmm11 + +#ifndef TRMMKERNEL + addss (CO1), %xmm8 + addss (CO1, LDC), %xmm10 + addss (CO2), %xmm9 + addss (CO2, LDC), %xmm11 +#endif + + movss %xmm8, (CO1) + movss %xmm10, (CO1, LDC) + + movss %xmm9, (CO2) + movss %xmm11, (CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + leaq (C, LDC, 4), C + ALIGN_4 + +.L80: + testq $2, N + jle .L120 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + leaq 32 * SIZE + BUFFER, BO + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm3 + + movq K, %rax + sarq $2, %rax + jle .L83 + ALIGN_4 + +.L82: + pshufd $0x50, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(BO) + pshufd $0xfa, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(BO) + + movaps 8 * SIZE(B), %xmm1 + + pshufd $0x50, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(BO) + pshufd $0xfa, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(BO) + + movaps 12 * SIZE(B), %xmm3 + + addq $ 8 * SIZE, B + subq $-16 * SIZE, BO + + decq %rax + jne .L82 + ALIGN_4 + +.L83: + movq K, %rax + andq $3, %rax + BRANCH + jle .L90 + ALIGN_4 + +.L85: + pshufd $0x50, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(BO) + movsd 2 * SIZE(B), %xmm1 + + addq $ 2 * SIZE, B + subq $-4 * SIZE, BO + decq %rax + jne .L85 + ALIGN_4 + +.L90: + movq C, CO1 + leaq (C, LDC), CO2 + movq A, AO + + movq M, I + sarq $2, I + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm1 + + pxor %xmm8, %xmm8 + PREFETCHW 3 * SIZE(CO1) + pxor %xmm9, %xmm9 + PREFETCHW 3 * SIZE(CO2) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L96 + ALIGN_3 + +.L92: + PREFETCH (PREFETCHSIZE + 0)(AO) + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + movaps -28 * SIZE(AO), %xmm0 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + movaps -24 * SIZE(AO), %xmm0 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + movaps -20 * SIZE(AO), %xmm0 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + decq %rax + BRANCH + jg .L92 + +.L96: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + je .L98 + ALIGN_4 + +.L97: + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + movaps -28 * SIZE(AO), %xmm0 + + addq $ 4 * SIZE, AO + subq $-4 * SIZE, BO + decq %rax + jg .L97 + ALIGN_4 + +.L98: + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + movsd 0 * SIZE(CO2), %xmm1 + movhps 2 * SIZE(CO1), %xmm1 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm9 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO2) + movlps %xmm9, 0 * SIZE(CO2) + movhps %xmm9, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + decq I + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movddup -32 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L106 + ALIGN_3 + +.L102: + PREFETCH (PREFETCHSIZE + 0)(AO) + + mulps %xmm0, %xmm1 + movddup -30 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm1 + movddup -28 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm1 + movddup -26 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm1 + movddup -24 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + + subq $ -8 * SIZE, AO + subq $-16 * SIZE, BO + decq %rax + BRANCH + jg .L102 + +.L106: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + je .L108 + ALIGN_4 + +.L107: + mulps %xmm0, %xmm1 + movddup -30 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + + addq $ 2 * SIZE, AO + subq $-4 * SIZE, BO + decq %rax + jg .L107 + ALIGN_4 + +.L108: + mulps %xmm7, %xmm8 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + movhps (CO2), %xmm0 + + addps %xmm0, %xmm8 +#endif + + movlps %xmm8, (CO1) + movhps %xmm8, (CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L110: + testq $1, M + je .L119 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movss -32 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L116 + ALIGN_3 + +.L112: + PREFETCH (PREFETCHSIZE + 0)(AO) + + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + movss -30 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + movss -29 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + movss -28 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + decq %rax + BRANCH + jg .L112 + +.L116: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + je .L118 + ALIGN_4 + +.L117: + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + + addq $ 1 * SIZE, AO + subq $-4 * SIZE, BO + decq %rax + jg .L117 + ALIGN_4 + +.L118: + mulps %xmm7, %xmm8 + movhlps %xmm8, %xmm9 + +#ifndef TRMMKERNEL + addss (CO1), %xmm8 + addss (CO2), %xmm9 +#endif + + movss %xmm8, (CO1) + movss %xmm9, (CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L119: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leaq (C, LDC, 2), C + ALIGN_4 + +.L120: + testq $1, N + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + leaq 32 * SIZE + BUFFER, BO + + movsd 0 * SIZE(B), %xmm1 + movhps 2 * SIZE(B), %xmm1 + + movq K, %rax + sarq $2, %rax + jle .L123 + ALIGN_4 + +.L122: + pshufd $0x50, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(BO) + pshufd $0xfa, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(BO) + + movsd 4 * SIZE(B), %xmm1 + movhps 6 * SIZE(B), %xmm1 + + addq $ 4 * SIZE, B + subq $-8 * SIZE, BO + + decq %rax + jne .L122 + ALIGN_4 + +.L123: + movq K, %rax + andq $3, %rax + BRANCH + jle .L130 + ALIGN_4 + +.L125: + pshufd $0x50, %xmm1, %xmm0 + movlps %xmm0, -32 * SIZE(BO) + movss 1 * SIZE(B), %xmm1 + + addq $ 1 * SIZE, B + subq $-2 * SIZE, BO + decq %rax + jne .L125 + ALIGN_4 + +.L130: + movq C, CO1 + movq A, AO + + movq M, I + sarq $2, I + jle .L140 + ALIGN_4 + +.L131: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + movddup -32 * SIZE(BO), %xmm1 + + pxor %xmm8, %xmm8 + PREFETCHW 3 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L136 + ALIGN_3 + +.L132: + PREFETCH (PREFETCHSIZE + 0)(AO) + + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movddup -30 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm1 + movaps -24 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movddup -28 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm1 + movaps -20 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movddup -26 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movddup -24 * SIZE(BO), %xmm1 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + decq %rax + BRANCH + jg .L132 + +.L136: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + je .L138 + ALIGN_4 + +.L137: + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movddup -30 * SIZE(BO), %xmm1 + + addq $ 4 * SIZE, AO + subq $-2 * SIZE, BO + decq %rax + jg .L137 + ALIGN_4 + +.L138: + mulps %xmm7, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + + addps %xmm0, %xmm8 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + decq I + jg .L131 + ALIGN_4 + +.L140: + testq $2, M + je .L150 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movddup -32 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L146 + ALIGN_3 + +.L142: + PREFETCH (PREFETCHSIZE + 0)(AO) + + mulps %xmm0, %xmm1 + movddup -30 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movsd -30 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm1 + movddup -28 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movsd -28 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm1 + movddup -26 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movsd -26 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm1 + movddup -24 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movsd -24 * SIZE(BO), %xmm1 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + decq %rax + BRANCH + jg .L142 + +.L146: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + je .L148 + ALIGN_4 + +.L147: + mulps %xmm0, %xmm1 + movddup -30 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movsd -30 * SIZE(BO), %xmm1 + + addq $ 2 * SIZE, AO + subq $-2 * SIZE, BO + decq %rax + jg .L147 + ALIGN_4 + +.L148: + mulps %xmm7, %xmm8 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + + addps %xmm0, %xmm8 +#endif + + movlps %xmm8, (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + ALIGN_4 + +.L150: + testq $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movss -32 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movss -32 * SIZE(BO), %xmm1 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L156 + ALIGN_3 + +.L152: + PREFETCH (PREFETCHSIZE + 0)(AO) + + mulss %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addss %xmm1, %xmm8 + movss -30 * SIZE(BO), %xmm1 + + mulss %xmm0, %xmm1 + movss -30 * SIZE(AO), %xmm0 + addss %xmm1, %xmm8 + movss -28 * SIZE(BO), %xmm1 + + mulss %xmm0, %xmm1 + movss -29 * SIZE(AO), %xmm0 + addss %xmm1, %xmm8 + movss -26 * SIZE(BO), %xmm1 + + mulss %xmm0, %xmm1 + movss -28 * SIZE(AO), %xmm0 + addss %xmm1, %xmm8 + movss -24 * SIZE(BO), %xmm1 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + decq %rax + BRANCH + jg .L152 + +.L156: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + je .L158 + ALIGN_4 + +.L157: + mulss %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addss %xmm1, %xmm8 + movss -30 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + jg .L157 + ALIGN_4 + +.L158: + mulss %xmm7, %xmm8 + +#ifndef TRMMKERNEL + addss (CO1), %xmm8 +#endif + + movss %xmm8, (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L999: + movq %rbx, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_4x8_nehalem.S b/kernel/x86_64/gemm_kernel_4x8_nehalem.S new file mode 100644 index 0000000..5d02ac6 --- /dev/null +++ b/kernel/x86_64/gemm_kernel_4x8_nehalem.S @@ -0,0 +1,2397 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %rbp + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rdx +#define BB %r12 + +#define PREA %r10 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA 48(%rsp) +#define J 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define ALPHA 224(%rsp) +#define J 232(%rsp) +#define OFFSET 240(%rsp) +#define KK 248(%rsp) +#define KKK 256(%rsp) + +#endif + +#define PREFETCHSIZE 8 +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + +#endif + + unpcklps %xmm0, %xmm0 + movlps %xmm0, ALPHA + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + salq $BASE_SHIFT, LDC + +#ifdef TRMMKERNEL + movq %r11, OFFSET +#ifndef LEFT + negq %r11 +#endif + movq %r11, KK +#endif + + movq N, J + sarq $3, J + NOBRANCH + jle .L40 + ALIGN_4 + +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 4), CO2 + movq A, AO + + movq K, %rax + salq $BASE_SHIFT + 3, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: + prefetcht2 -32 * SIZE(BB) + subq $-16 * SIZE, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + leaq (LDC, LDC, 2), %rax + + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + PADDING + xorps %xmm4, %xmm4 + + PADDING + xorps %xmm8, %xmm8 + prefetcht0 3 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 7 * SIZE(CO1, LDC, 1) + PADDING + xorps %xmm10, %xmm10 + prefetcht0 3 * SIZE(CO1, LDC, 2) + PADDING + xorps %xmm11, %xmm11 + prefetcht0 7 * SIZE(CO1, %rax, 1) + + movaps -32 * SIZE(AO), %xmm0 + + PADDING + xorps %xmm12, %xmm12 + prefetcht0 3 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht0 7 * SIZE(CO2, LDC, 1) + xorps %xmm14, %xmm14 + prefetcht0 3 * SIZE(CO2, LDC, 2) + xorps %xmm15, %xmm15 + prefetcht0 7 * SIZE(CO2, %rax, 1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $8, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + movaps -28 * SIZE(AO), %xmm7 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + addps %xmm1, %xmm12 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm7, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm7, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm6 + + addps %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm7, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm7, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm7, %xmm3 + mulps %xmm7, %xmm4 + + addps %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + movaps -20 * SIZE(AO), %xmm7 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + addps %xmm1, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm7, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm7, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm6 + + addps %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + subq $-32 * SIZE, BO + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm7, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm7, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm7, %xmm3 + movaps -16 * SIZE(AO), %xmm0 + mulps %xmm7, %xmm4 + + subq $-16 * SIZE, AO + decq %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addps %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: + addps %xmm1, %xmm12 + addps %xmm2, %xmm13 + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + + movaps %xmm9, %xmm4 + shufps $0xd8, %xmm8, %xmm9 + shufps $0xd8, %xmm11, %xmm8 + shufps $0xd8, %xmm10, %xmm11 + shufps $0xd8, %xmm4, %xmm10 + + movaps %xmm8, %xmm4 + shufps $0xd8, %xmm10, %xmm8 + shufps $0xd8, %xmm4, %xmm10 + movaps %xmm9, %xmm5 + shufps $0xd8, %xmm11, %xmm9 + shufps $0xd8, %xmm5, %xmm11 + + movaps %xmm13, %xmm4 + shufps $0xd8, %xmm12, %xmm13 + shufps $0xd8, %xmm15, %xmm12 + shufps $0xd8, %xmm14, %xmm15 + shufps $0xd8, %xmm4, %xmm14 + + movaps %xmm12, %xmm4 + shufps $0xd8, %xmm14, %xmm12 + shufps $0xd8, %xmm4, %xmm14 + movaps %xmm13, %xmm5 + shufps $0xd8, %xmm15, %xmm13 + shufps $0xd8, %xmm5, %xmm15 + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm11 + + mulps %xmm7, %xmm12 + mulps %xmm7, %xmm13 + mulps %xmm7, %xmm14 + mulps %xmm7, %xmm15 + + leaq (LDC, LDC, 2), %rax + +#ifndef TRMMKERNEL + movups (CO1), %xmm0 + movups (CO1, LDC, 1), %xmm1 + movups (CO1, LDC, 2), %xmm2 + movups (CO1, %rax, 1), %xmm3 + + movups (CO2), %xmm4 + movups (CO2, LDC, 1), %xmm5 + movups (CO2, LDC, 2), %xmm6 + movups (CO2, %rax, 1), %xmm7 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm9 + addps %xmm2, %xmm10 + addps %xmm3, %xmm11 + addps %xmm4, %xmm12 + addps %xmm5, %xmm13 + addps %xmm6, %xmm14 + addps %xmm7, %xmm15 +#endif + + movups %xmm8, (CO1) + movups %xmm9, (CO1, LDC, 1) + movups %xmm10, (CO1, LDC, 2) + movups %xmm11, (CO1, %rax, 1) + + movups %xmm12, (CO2) + movups %xmm13, (CO2, LDC, 1) + movups %xmm14, (CO2, LDC, 2) + movups %xmm15, (CO2, %rax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + decq I + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + jle .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $8, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(BO), %xmm5 + + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -8 * SIZE(BO), %xmm5 + + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps 0 * SIZE(BO), %xmm5 + + movddup -24 * SIZE(AO), %xmm0 + + subq $-32 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + addps %xmm3, %xmm10 + addps %xmm4, %xmm11 + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm11 + + leaq (LDC, LDC, 2), %rax + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + movhps (CO1, LDC, 1), %xmm0 + movsd (CO1, LDC, 2), %xmm1 + movhps (CO1, %rax, 1), %xmm1 + + movsd (CO2), %xmm2 + movhps (CO2, LDC, 1), %xmm2 + movsd (CO2, LDC, 2), %xmm3 + movhps (CO2, %rax, 1), %xmm3 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm9 + addps %xmm2, %xmm10 + addps %xmm3, %xmm11 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, (CO1, LDC, 1) + movsd %xmm9, (CO1, LDC, 2) + movhps %xmm9, (CO1, %rax, 1) + + movsd %xmm10, (CO2) + movhps %xmm10, (CO2, LDC, 1) + movsd %xmm11, (CO2, LDC, 2) + movhps %xmm11, (CO2, %rax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L30: + testq $1, M + BRANCH + jle .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + xorps %xmm8, %xmm8 + xorps %xmm12, %xmm12 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $8, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -28 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -20 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -16 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -12 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -4 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + subq $-32 * SIZE, BO + subq $ -4 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -28 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm12 + + pshufd $0xff, %xmm8, %xmm11 + pshufd $0xaa, %xmm8, %xmm10 + pshufd $0x55, %xmm8, %xmm9 + pshufd $0x00, %xmm8, %xmm8 + + pshufd $0xff, %xmm12, %xmm15 + pshufd $0xaa, %xmm12, %xmm14 + pshufd $0x55, %xmm12, %xmm13 + pshufd $0x00, %xmm12, %xmm12 + + leaq (LDC, LDC, 2), %rax + +#ifndef TRMMKERNEL + addss (CO1), %xmm8 + addss (CO1, LDC, 1), %xmm9 + addss (CO1, LDC, 2), %xmm10 + addss (CO1, %rax, 1), %xmm11 + + addss (CO2), %xmm12 + addss (CO2, LDC, 1), %xmm13 + addss (CO2, LDC, 2), %xmm14 + addss (CO2, %rax, 1), %xmm15 +#endif + + movss %xmm8, (CO1) + movss %xmm9, (CO1, LDC, 1) + movss %xmm10, (CO1, LDC, 2) + movss %xmm11, (CO1, %rax, 1) + + movss %xmm12, (CO2) + movss %xmm13, (CO2, LDC, 1) + movss %xmm14, (CO2, LDC, 2) + movss %xmm15, (CO2, %rax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $8, KK +#endif + + movq BO, B + + leaq (C, LDC, 8), C + + subq $1, J + BRANCH + jg .L10 + ALIGN_4 + +.L40: + testq $4, N + jle .L70 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 + movq A, AO + + movq M, I + sarq $2, I + NOBRANCH + jle .L50 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht2 4 * SIZE(CO2) + xorps %xmm11, %xmm11 + prefetcht2 4 * SIZE(CO2, LDC, 1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + addps %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_3 + +.L48: + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + addps %xmm3, %xmm10 + addps %xmm4, %xmm11 + + movaps %xmm9, %xmm4 + shufps $0xd8, %xmm8, %xmm9 + shufps $0xd8, %xmm11, %xmm8 + shufps $0xd8, %xmm10, %xmm11 + shufps $0xd8, %xmm4, %xmm10 + + movaps %xmm8, %xmm4 + shufps $0xd8, %xmm10, %xmm8 + shufps $0xd8, %xmm4, %xmm10 + movaps %xmm9, %xmm5 + shufps $0xd8, %xmm11, %xmm9 + shufps $0xd8, %xmm5, %xmm11 + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm11 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO1, LDC, 1), %xmm1 + movhps 2 * SIZE(CO1, LDC, 1), %xmm1 + + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 + movsd 0 * SIZE(CO2, LDC, 1), %xmm3 + movhps 2 * SIZE(CO2, LDC, 1), %xmm3 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm9 + addps %xmm2, %xmm10 + addps %xmm3, %xmm11 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO1, LDC, 1) + movhps %xmm9, 2 * SIZE(CO1, LDC, 1) + + movsd %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + movsd %xmm11, 0 * SIZE(CO2, LDC, 1) + movhps %xmm11, 2 * SIZE(CO2, LDC, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + decq I + BRANCH + jg .L41 + ALIGN_4 + +.L50: + testq $2, M + BRANCH + jle .L60 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -16 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -24 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L52 + ALIGN_3 + +.L55: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_3 + +.L58: + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + movhps (CO1, LDC, 1), %xmm0 + movsd (CO2), %xmm1 + movhps (CO2, LDC, 1), %xmm1 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm9 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, (CO1, LDC, 1) + movsd %xmm9, (CO2) + movhps %xmm9, (CO2, LDC, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L69 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movaps -20 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + subq $-16 * SIZE, BO + subq $ -4 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L62 + addps %xmm9, %xmm8 + ALIGN_3 + +.L65: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_3 + +.L68: + addps %xmm2, %xmm8 + mulps %xmm7, %xmm8 + + pshufd $0xff, %xmm8, %xmm11 + pshufd $0xaa, %xmm8, %xmm10 + pshufd $0x55, %xmm8, %xmm9 + pshufd $0x00, %xmm8, %xmm8 + +#ifndef TRMMKERNEL + addss (CO1), %xmm8 + addss (CO1, LDC, 1), %xmm9 + addss (CO2), %xmm10 + addss (CO2, LDC, 1), %xmm11 +#endif + + movss %xmm8, (CO1) + movss %xmm9, (CO1, LDC, 1) + movss %xmm10, (CO2) + movss %xmm11, (CO2, LDC, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + ALIGN_4 + +.L69: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + movq BO, B + + leaq (C, LDC, 4), C + ALIGN_4 + +.L70: + testq $2, N + jle .L100 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC), CO2 + movq A, AO + + movq M, I + sarq $2, I + NOBRANCH + jle .L80 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd -32 * SIZE(BO), %xmm3 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO2) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_3 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -28 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -26 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -24 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L72 + ALIGN_3 + +.L75: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_3 + +.L78: + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm1 + movhps 2 * SIZE(CO2), %xmm1 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm9 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhps %xmm9, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + decq I + BRANCH + jg .L71 + ALIGN_4 + +.L80: + testq $2, M + BRANCH + jle .L90 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd -32 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L85 + ALIGN_3 + +.L82: + addps %xmm1, %xmm8 + movsd -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -30 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -28 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -26 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -24 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L82 + ALIGN_3 + +.L85: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_3 + +.L86: + addps %xmm1, %xmm8 + movsd -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L86 + ALIGN_3 + +.L88: + addps %xmm1, %xmm8 + mulps %xmm7, %xmm8 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + movhps (CO2), %xmm0 + + addps %xmm0, %xmm8 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, (CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L90: + testq $1, M + BRANCH + jle .L99 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L95 + ALIGN_3 + +.L92: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movsd -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movsd -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movsd -26 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L92 + addps %xmm9, %xmm8 + ALIGN_3 + +.L95: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_3 + +.L96: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movsd -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L96 + ALIGN_3 + +.L98: + addps %xmm2, %xmm8 + mulps %xmm7, %xmm8 + + pshufd $0x55, %xmm8, %xmm9 + pshufd $0x00, %xmm8, %xmm8 + +#ifndef TRMMKERNEL + addss (CO1), %xmm8 + addss (CO2), %xmm9 +#endif + + movss %xmm8, (CO1) + movss %xmm9, (CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + movq BO, B + + leaq (C, LDC, 2), C + ALIGN_4 + +.L100: + testq $1, N + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + movq A, AO + + movq M, I + sarq $2, I + NOBRANCH + jle .L110 + ALIGN_4 + +.L101: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm3 + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L105 + ALIGN_3 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -31 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -29 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -28 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $ -4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L102 + ALIGN_3 + +.L105: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L108 + ALIGN_3 + +.L106: + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -31 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L106 + ALIGN_3 + +.L108: + addps %xmm1, %xmm8 + + mulps %xmm7, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + + addps %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + decq I + BRANCH + jg .L101 + ALIGN_4 + +.L110: + testq $2, M + BRANCH + jle .L120 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L115 + ALIGN_3 + +.L112: + addps %xmm1, %xmm8 + movss -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -31 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -30 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -29 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -24 * SIZE(AO), %xmm0 + + subq $-4 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L112 + ALIGN_3 + +.L115: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_3 + +.L116: + addps %xmm1, %xmm8 + movss -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L116 + ALIGN_3 + +.L118: + addps %xmm1, %xmm8 + mulps %xmm7, %xmm8 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + addps %xmm0, %xmm8 +#endif + + movsd %xmm8, (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + ALIGN_4 + +.L120: + testq $1, M + BRANCH + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + + xorps %xmm2, %xmm2 + movss -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L125 + ALIGN_3 + +.L122: + addss %xmm2, %xmm8 + movss -32 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -31 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -31 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -30 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -30 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -29 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -29 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -28 * SIZE(AO), %xmm0 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L122 + ALIGN_3 + +.L125: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L128 + ALIGN_3 + +.L126: + addss %xmm2, %xmm8 + movss -32 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -31 * SIZE(AO), %xmm0 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L126 + ALIGN_3 + +.L128: + addps %xmm2, %xmm8 + + mulps %xmm7, %xmm8 + +#ifndef TRMMKERNEL + addss (CO1), %xmm8 +#endif + + movss %xmm8, (CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_8x4_barcelona.S b/kernel/x86_64/gemm_kernel_8x4_barcelona.S new file mode 100644 index 0000000..b40c8ba --- /dev/null +++ b/kernel/x86_64/gemm_kernel_8x4_barcelona.S @@ -0,0 +1,3253 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %r12 +#define BB %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH prefetch +#define PREFETCHSIZE (16 * 17 + 0) + +#define RPREFETCHSIZE (16 * 4 + 0) +#define WPREFETCHSIZE (16 * 9 + 0) + +#define KERNEL1(xx) \ + mulps %xmm1, %xmm0 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm8 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO, %rax, 4) ;\ + movaps %xmm2, %xmm0 ;\ + addps %xmm1, %xmm12 ;\ + movaps -24 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm0, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps -20 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm0 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm10 ;\ + movaps -24 * SIZE(AO, %rax, 4), %xmm0 ;\ + addps %xmm1, %xmm14 ;\ + movaps -16 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps -12 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm0, %xmm2 + +#define KERNEL2(xx) \ + mulps %xmm1, %xmm0 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm8 ;\ + movaps %xmm2, %xmm0 ;\ + addps %xmm1, %xmm12 ;\ + movaps -8 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm0, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps -4 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm0 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm10 ;\ + addps %xmm1, %xmm14 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 4 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm4, %xmm2 + +#define KERNEL3(xx) \ + mulps %xmm5, %xmm4 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm8 ;\ + movaps 32 * SIZE(BO, %rax, 8), %xmm1 ;\ + movaps %xmm2, %xmm4 ;\ + addps %xmm5, %xmm12 ;\ + movaps 8 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm4, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 12 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm4 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm10 ;\ + movaps -8 * SIZE(AO, %rax, 4), %xmm4 ;\ + addps %xmm5, %xmm14 ;\ + movaps 16 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 20 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm4, %xmm2 + +#define KERNEL4(xx) \ + mulps %xmm5, %xmm4 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ + movaps (AO, %rax, 4), %xmm6 ;\ + addps %xmm4, %xmm8 ;\ + movaps %xmm2, %xmm4 ;\ + addps %xmm5, %xmm12 ;\ + movaps 24 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm4, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 28 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm4 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm10 ;\ + addps %xmm5, %xmm14 ;\ + movaps 64 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 36 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm6, %xmm2 + +#define KERNEL5(xx) \ + mulps %xmm1, %xmm6 ;\ + mulps 4 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm6, %xmm8 ;\ + movaps %xmm2, %xmm6 ;\ + addps %xmm1, %xmm12 ;\ + movaps 40 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps 16 * SIZE(AO, %rax, 4), %xmm7 ;\ + movaps %xmm6, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 44 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm6 ;\ + mulps 4 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm6, %xmm10 ;\ + movaps 8 * SIZE(AO, %rax, 4), %xmm6 ;\ + addps %xmm1, %xmm14 ;\ + movaps 48 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 52 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm6, %xmm2 + +#define KERNEL6(xx) \ + mulps %xmm1, %xmm6 ;\ + mulps 12 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm6, %xmm8 ;\ + movaps %xmm2, %xmm6 ;\ + addps %xmm1, %xmm12 ;\ + movaps 56 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm6, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 60 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm6 ;\ + mulps 12 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm6, %xmm10 ;\ + movaps 32 * SIZE(AO, %rax, 4), %xmm0 ;\ + addps %xmm1, %xmm14 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 68 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm7, %xmm2 + +#define KERNEL7(xx) \ + mulps %xmm5, %xmm7 ;\ + mulps 20 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm7, %xmm8 ;\ + movaps 96 * SIZE(BO, %rax, 8), %xmm1 ;\ + movaps %xmm2, %xmm7 ;\ + addps %xmm5, %xmm12 ;\ + movaps 72 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm7, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 76 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm7 ;\ + mulps 20 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm7, %xmm10 ;\ + movaps 24 * SIZE(AO, %rax, 4), %xmm7 ;\ + addps %xmm5, %xmm14 ;\ + movaps 80 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 84 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm7, %xmm2 + +#define KERNEL8(xx) \ + mulps %xmm5, %xmm7 ;\ + mulps 28 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm7, %xmm8 ;\ + movaps %xmm2, %xmm7 ;\ + addps %xmm5, %xmm12 ;\ + movaps 88 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm7, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 92 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm7 ;\ + mulps 28 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm7, %xmm10 ;\ + movaps 48 * SIZE(AO, %rax, 4), %xmm4 ;\ + addps %xmm5, %xmm14 ;\ + movaps 128 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 100 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm0, %xmm2 ;\ + addq $16 * SIZE, %rax + +#define KERNEL_SUB1(xx) \ + mulps %xmm1, %xmm0 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm8 ;\ + movaps %xmm2, %xmm0 ;\ + addps %xmm1, %xmm12 ;\ + movaps -24 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm0, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps -20 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm0 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm10 ;\ + movaps -24 * SIZE(AO, %rax, 4), %xmm0 ;\ + addps %xmm1, %xmm14 ;\ + movaps -16 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps -12 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm0, %xmm2 + +#define KERNEL_SUB2(xx) \ + mulps %xmm1, %xmm0 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm8 ;\ + movaps %xmm2, %xmm0 ;\ + addps %xmm1, %xmm12 ;\ + movaps -8 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm0, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps -4 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm0 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm10 ;\ + movaps (AO, %rax, 4), %xmm0 ;\ + addps %xmm1, %xmm14 ;\ + movaps 32 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 4 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm4, %xmm2 + +#define KERNEL_SUB3(xx) \ + mulps %xmm5, %xmm4 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm8 ;\ + movaps %xmm2, %xmm4 ;\ + addps %xmm5, %xmm12 ;\ + movaps 8 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm4, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 12 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm4 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm10 ;\ + movaps -8 * SIZE(AO, %rax, 4), %xmm4 ;\ + addps %xmm5, %xmm14 ;\ + movaps 16 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 20 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm4, %xmm2 + +#define KERNEL_SUB4(xx) \ + mulps %xmm5, %xmm4 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm8 ;\ + movaps %xmm2, %xmm4 ;\ + addps %xmm5, %xmm12 ;\ + movaps 24 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm4, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 28 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm4 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm10 ;\ + addps %xmm5, %xmm14 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 36 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm0, %xmm2 + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) && !defined(TRMMKERNEL) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + +#else + movq 72(%rsp), LDC +#ifdef TRMMKERNEL + movsd 80(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + shufps $0, %xmm0, %xmm0 + movaps %xmm0, ALPHA + +#ifdef TRMMKERNEL + movsd %xmm12, OFFSET + movsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + subq $-32 * SIZE, A + + leaq (, LDC, SIZE), LDC + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L50 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + prefetch (RPREFETCHSIZE + 0) * SIZE(B) + + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + movaps 8 * SIZE(B), %xmm11 + movaps 12 * SIZE(B), %xmm15 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + prefetchw (WPREFETCHSIZE + 16) * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + prefetchw (WPREFETCHSIZE + 32) * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm0 + pshufd $0x55, %xmm11, %xmm1 + pshufd $0xaa, %xmm11, %xmm2 + pshufd $0xff, %xmm11, %xmm3 + + prefetchw (WPREFETCHSIZE + 48) * SIZE(BO) + + pshufd $0x00, %xmm15, %xmm4 + pshufd $0x55, %xmm15, %xmm5 + pshufd $0xaa, %xmm15, %xmm6 + pshufd $0xff, %xmm15, %xmm7 + + movaps %xmm0, 32 * SIZE(BO) + movaps %xmm1, 36 * SIZE(BO) + movaps %xmm2, 40 * SIZE(BO) + movaps %xmm3, 44 * SIZE(BO) + movaps %xmm4, 48 * SIZE(BO) + movaps %xmm5, 52 * SIZE(BO) + movaps %xmm6, 56 * SIZE(BO) + movaps %xmm7, 60 * SIZE(BO) + + addq $16 * SIZE, B + addq $64 * SIZE, BO + + decq %rax + jne .L02 + ALIGN_4 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + + leaq (RPREFETCHSIZE + 0) * SIZE(B), BB + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + movaps -28 * SIZE(BO), %xmm3 + xorps %xmm9, %xmm9 + movaps -16 * SIZE(AO), %xmm4 + xorps %xmm10, %xmm10 + movaps 0 * SIZE(BO), %xmm5 + xorps %xmm11, %xmm11 + + prefetch -20 * SIZE(BB) + + prefetchw 3 * SIZE(CO1) + xorps %xmm12, %xmm12 + prefetchw 7 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetchw 3 * SIZE(CO1, LDC, 2) + xorps %xmm14, %xmm14 + prefetchw 7 * SIZE(CO2, LDC, 2) + xorps %xmm15, %xmm15 + movaps %xmm0, %xmm2 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + andq $-8, %rax + + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_3 + +.L12: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + BRANCH + jl .L12 + ALIGN_4 + +.L15: + prefetch 16 * SIZE(BB) + subq $-32 * SIZE, BB + + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + testq $4, %rax + je .L16 + xorq %rax, %rax + ALIGN_3 + + KERNEL_SUB1(32 * 0) + KERNEL_SUB2(32 * 0) + KERNEL_SUB3(32 * 0) + KERNEL_SUB4(32 * 0) + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + ALIGN_3 + +.L16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L18 + + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + ALIGN_4 + +.L17: + mulps %xmm1, %xmm0 + mulps -28 * SIZE(AO, %rax, 4), %xmm1 + addps %xmm0, %xmm8 + movaps %xmm2, %xmm0 + addps %xmm1, %xmm12 + movaps -24 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm3, %xmm2 + mulps -28 * SIZE(AO, %rax, 4), %xmm3 + addps %xmm2, %xmm9 + movaps %xmm0, %xmm2 + addps %xmm3, %xmm13 + movaps -20 * SIZE(BO, %rax, 8), %xmm3 + mulps %xmm1, %xmm0 + mulps -28 * SIZE(AO, %rax, 4), %xmm1 + addps %xmm0, %xmm10 + movaps -24 * SIZE(AO, %rax, 4), %xmm0 + addps %xmm1, %xmm14 + movaps -16 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm3, %xmm2 + mulps -28 * SIZE(AO, %rax, 4), %xmm3 + addps %xmm2, %xmm11 + addps %xmm3, %xmm15 + movaps -12 * SIZE(BO, %rax, 8), %xmm3 + movaps %xmm0, %xmm2 + + addq $SIZE * 2, %rax + jl .L17 + ALIGN_4 + +.L18: +#ifndef TRMMKERNEL + movups 0 * SIZE(CO1), %xmm0 + movups 4 * SIZE(CO1), %xmm1 + movups 0 * SIZE(CO2), %xmm2 + movups 4 * SIZE(CO2), %xmm3 +#endif + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm11 + + mulps %xmm7, %xmm12 + mulps %xmm7, %xmm13 + mulps %xmm7, %xmm14 + mulps %xmm7, %xmm15 + +#ifndef TRMMKERNEL + movups 0 * SIZE(CO1, LDC, 2), %xmm4 + movups 4 * SIZE(CO1, LDC, 2), %xmm5 + movups 0 * SIZE(CO2, LDC, 2), %xmm6 + movups 4 * SIZE(CO2, LDC, 2), %xmm7 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm12 + addps %xmm2, %xmm9 + addps %xmm3, %xmm13 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + + movsd %xmm9, 0 * SIZE(CO2) + movhps %xmm9, 2 * SIZE(CO2) + movsd %xmm13, 4 * SIZE(CO2) + movhps %xmm13, 6 * SIZE(CO2) + +#ifndef TRMMKERNEL + addps %xmm4, %xmm10 + addps %xmm5, %xmm14 + addps %xmm6, %xmm11 + addps %xmm7, %xmm15 +#endif + + movsd %xmm10, 0 * SIZE(CO1, LDC, 2) + movhps %xmm10, 2 * SIZE(CO1, LDC, 2) + movsd %xmm14, 4 * SIZE(CO1, LDC, 2) + movhps %xmm14, 6 * SIZE(CO1, LDC, 2) + + movsd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhps %xmm11, 2 * SIZE(CO2, LDC, 2) + movsd %xmm15, 4 * SIZE(CO2, LDC, 2) + movhps %xmm15, 6 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $4, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps -28 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps -24 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + mulps 44 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps -20 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + mulps 60 * SIZE(BO), %xmm8 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm8, %xmm3 + movaps 0 * SIZE(AO), %xmm8 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 76 * SIZE(BO), %xmm10 + addps %xmm9, %xmm2 + movaps 128 * SIZE(BO), %xmm9 + addps %xmm10, %xmm3 + movaps -12 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + mulps 92 * SIZE(BO), %xmm10 + addps %xmm11, %xmm2 + movaps 144 * SIZE(BO), %xmm11 + addps %xmm10, %xmm3 + movaps -8 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + mulps 108 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 160 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps -4 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + mulps 124 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 176 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 16 * SIZE(AO), %xmm10 + + addq $ 32 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps -28 * SIZE(AO), %xmm8 + + addq $ 4 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L28: + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm2 + mulps %xmm15, %xmm3 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 0 * SIZE(CO2), %xmm10 + movhps 2 * SIZE(CO2), %xmm10 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm12 + movhps 2 * SIZE(CO1, LDC, 2), %xmm12 + movsd 0 * SIZE(CO2, LDC, 2), %xmm14 + movhps 2 * SIZE(CO2, LDC, 2), %xmm14 + + addps %xmm8, %xmm0 + addps %xmm10, %xmm1 + addps %xmm12, %xmm2 + addps %xmm14, %xmm3 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 2 * SIZE(CO2) + + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 2 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhps %xmm3, 2 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $2, M + je .L40 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -24 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movsd 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movsd 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsd 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movsd 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movsd 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movsd 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd -28 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movsd 80 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movsd 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movsd 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm2 + movsd 44 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + movsd -26 * SIZE(AO), %xmm8 + addps %xmm13, %xmm3 + movsd 96 * SIZE(BO), %xmm13 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movsd 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movsd 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm2 + movsd 60 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + movsd -16 * SIZE(AO), %xmm8 + addps %xmm15, %xmm3 + movsd 112 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movsd 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movsd 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm2 + movsd 76 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movsd -22 * SIZE(AO), %xmm10 + addps %xmm9, %xmm3 + movsd 128 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movsd 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movsd 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movsd 92 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movsd -20 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movsd 144 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movsd 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movsd 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movsd 108 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd -18 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movsd 160 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movsd 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movsd 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movsd 124 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd -8 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movsd 176 * SIZE(BO), %xmm15 + + addq $ 16 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movsd 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsd 16 * SIZE(BO), %xmm9 + + addq $ 2 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm2 + mulps %xmm15, %xmm3 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm8 + movsd 0 * SIZE(CO2), %xmm10 + movsd 0 * SIZE(CO1, LDC, 2), %xmm12 + movsd 0 * SIZE(CO2, LDC, 2), %xmm14 + + addps %xmm8, %xmm0 + addps %xmm10, %xmm1 + addps %xmm12, %xmm2 + addps %xmm14, %xmm3 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L40: + testq $1, M + je .L49 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO + leaq (BO, %rax, 8), BO +#endif + + movss -32 * SIZE(AO), %xmm8 + movss -28 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + movss 32 * SIZE(BO), %xmm13 + movss 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L45 + ALIGN_4 + +.L42: + mulss %xmm8, %xmm9 + addss %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 4 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + addss %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + addss %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addss %xmm9, %xmm3 + movss 64 * SIZE(BO), %xmm9 + + mulss %xmm8, %xmm11 + addss %xmm11, %xmm0 + movss 20 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + addss %xmm11, %xmm1 + movss 24 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + addss %xmm11, %xmm2 + movss 28 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + movss -30 * SIZE(AO), %xmm8 + addss %xmm11, %xmm3 + movss 80 * SIZE(BO), %xmm11 + + mulss %xmm8, %xmm13 + addss %xmm13, %xmm0 + movss 36 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + addss %xmm13, %xmm1 + movss 40 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + addss %xmm13, %xmm2 + movss 44 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + movss -29 * SIZE(AO), %xmm8 + addss %xmm13, %xmm3 + movss 96 * SIZE(BO), %xmm13 + + mulss %xmm8, %xmm15 + addss %xmm15, %xmm0 + movss 52 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + addss %xmm15, %xmm1 + movss 56 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + addss %xmm15, %xmm2 + movss 60 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + movss -24 * SIZE(AO), %xmm8 + addss %xmm15, %xmm3 + movss 112 * SIZE(BO), %xmm15 + + mulss %xmm10, %xmm9 + addss %xmm9, %xmm0 + movss 68 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + addss %xmm9, %xmm1 + movss 72 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + addss %xmm9, %xmm2 + movss 76 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + movss -27 * SIZE(AO), %xmm10 + addss %xmm9, %xmm3 + movss 128 * SIZE(BO), %xmm9 + + mulss %xmm10, %xmm11 + addss %xmm11, %xmm0 + movss 84 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + addss %xmm11, %xmm1 + movss 88 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + addss %xmm11, %xmm2 + movss 92 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + movss -26 * SIZE(AO), %xmm10 + addss %xmm11, %xmm3 + movss 144 * SIZE(BO), %xmm11 + + mulss %xmm10, %xmm13 + addss %xmm13, %xmm0 + movss 100 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + addss %xmm13, %xmm1 + movss 104 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + addss %xmm13, %xmm2 + movss 108 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + movss -25 * SIZE(AO), %xmm10 + addss %xmm13, %xmm3 + movss 160 * SIZE(BO), %xmm13 + + mulss %xmm10, %xmm15 + addss %xmm15, %xmm0 + movss 116 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + addss %xmm15, %xmm1 + movss 120 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + addss %xmm15, %xmm2 + movss 124 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + movss -20 * SIZE(AO), %xmm10 + addss %xmm15, %xmm3 + movss 176 * SIZE(BO), %xmm15 + + addq $ 8 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_4 + +.L46: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movss 16 * SIZE(BO), %xmm9 + + addq $ 1 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L46 + ALIGN_4 + +.L48: + mulss %xmm15, %xmm0 + mulss %xmm15, %xmm1 + mulss %xmm15, %xmm2 + mulss %xmm15, %xmm3 + +#ifndef TRMMKERNEL + movss 0 * SIZE(CO1), %xmm8 + movss 0 * SIZE(CO2), %xmm10 + movss 0 * SIZE(CO1, LDC, 2), %xmm12 + movss 0 * SIZE(CO2, LDC, 2), %xmm14 + + addss %xmm8, %xmm0 + addss %xmm10, %xmm1 + addss %xmm12, %xmm2 + addss %xmm14, %xmm3 +#endif + + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO2) + movss %xmm2, 0 * SIZE(CO1, LDC, 2) + movss %xmm3, 0 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + leaq (C, LDC, 4), C # c += 4 * ldc + decq J # j -- + jg .L01 + +.L50: + testq $2, N + je .L100 + +.L51: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L53 + ALIGN_4 + +.L52: + prefetch (RPREFETCHSIZE + 0) * SIZE(B) + + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + prefetchw (WPREFETCHSIZE + 16) * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L52 + ALIGN_4 + +.L53: + movq K, %rax + andq $3, %rax + BRANCH + jle .L60 + ALIGN_4 + +.L54: + movsd 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + addq $ 2 * SIZE, B + addq $ 8 * SIZE, BO + decq %rax + jne .L54 + ALIGN_4 + +.L60: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L70 + ALIGN_4 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + movaps 0 * SIZE(AO), %xmm12 + movaps 16 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + prefetchw 4 * SIZE(CO1) + xorps %xmm4, %xmm4 + prefetchw 4 * SIZE(CO2) + xorps %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps -24 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -20 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 16 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps -12 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps -8 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps -4 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 48 * SIZE(AO), %xmm10 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 32 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 4 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 8 * SIZE(AO), %xmm12 + + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 12 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 64 * SIZE(AO), %xmm12 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 48 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 20 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 24 * SIZE(AO), %xmm14 + + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 28 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 80 * SIZE(AO), %xmm14 + + addq $64 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps -24 * SIZE(AO), %xmm8 + + addq $8 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L68: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + + movsd 0 * SIZE(CO2), %xmm10 + movhps 2 * SIZE(CO2), %xmm10 + movsd 4 * SIZE(CO2), %xmm11 + movhps 6 * SIZE(CO2), %xmm11 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm5 + +#ifndef TRMMKERNEL + addps %xmm8, %xmm0 + addps %xmm9, %xmm4 + addps %xmm10, %xmm1 + addps %xmm11, %xmm5 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm4, 4 * SIZE(CO1) + movhps %xmm4, 6 * SIZE(CO1) + + movsd %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 2 * SIZE(CO2) + movsd %xmm5, 4 * SIZE(CO2) + movhps %xmm5, 6 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L61 + ALIGN_4 + +.L70: + testq $4, M + je .L80 + + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps -24 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 20 * SIZE(BO), %xmm8 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm8, %xmm1 + movaps -20 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 0 * SIZE(AO), %xmm8 + + mulps %xmm10, %xmm13 + mulps 36 * SIZE(BO), %xmm10 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm10, %xmm1 + movaps -12 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + mulps 44 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps -8 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 52 * SIZE(BO), %xmm10 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm10, %xmm1 + movaps -4 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 60 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 16 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 0 * SIZE(CO2), %xmm10 + movhps 2 * SIZE(CO2), %xmm10 +#endif + + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + +#ifndef TRMMKERNEL + addps %xmm8, %xmm0 + addps %xmm10, %xmm1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L80: + testq $2, M + je .L90 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -24 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L85 + ALIGN_4 + +.L82: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movsd 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movsd 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -28 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsd 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movsd 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd -26 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movsd 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movsd 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd -16 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movsd 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movsd 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd -22 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movsd 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movsd 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd -20 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movsd 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movsd 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd -18 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movsd 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movsd 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd -8 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movsd 112 * SIZE(BO), %xmm15 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L82 + ALIGN_4 + +.L85: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_4 + +.L86: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L86 + ALIGN_4 + +.L88: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm8 + movsd 0 * SIZE(CO2), %xmm10 +#endif + + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + +#ifndef TRMMKERNEL + addps %xmm8, %xmm0 + addps %xmm10, %xmm1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L90: + testq $1, M + je .L99 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movss -32 * SIZE(AO), %xmm8 + movss -28 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + movss 32 * SIZE(BO), %xmm13 + movss 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movss 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movss 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movss -29 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movss 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movss 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movss -24 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movss 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movss 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movss -27 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movss 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movss 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movss -26 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movss 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movss 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movss -25 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movss 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movss 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movss -20 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movss 112 * SIZE(BO), %xmm15 + + addq $ 8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L98: +#ifndef TRMMKERNEL + movss 0 * SIZE(CO1), %xmm8 + movss 0 * SIZE(CO2), %xmm10 +#endif + + addss %xmm2, %xmm0 + addss %xmm3, %xmm1 + mulss %xmm15, %xmm0 + mulss %xmm15, %xmm1 + +#ifndef TRMMKERNEL + addss %xmm8, %xmm0 + addss %xmm10, %xmm1 +#endif + + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leaq (C, LDC, 2), C # c += 4 * ldc + ALIGN_4 + + +.L100: + testq $1, N + je .L999 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $3, %rax + jle .L103 + ALIGN_4 + + +.L102: + prefetch (RPREFETCHSIZE + 0) * SIZE(B) + + movups 0 * SIZE(B), %xmm3 + movups 4 * SIZE(B), %xmm7 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + prefetchw (WPREFETCHSIZE + 16) * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L102 + ALIGN_4 + +.L103: + movq K, %rax + andq $7, %rax + BRANCH + jle .L110 + ALIGN_4 + +.L104: + movss 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + + movaps %xmm0, 0 * SIZE(BO) + + addq $ 1 * SIZE, B + addq $ 4 * SIZE, BO + decq %rax + jne .L104 + ALIGN_4 + +.L110: + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L120 + ALIGN_4 + +.L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + movaps 0 * SIZE(AO), %xmm12 + movaps 16 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + prefetchw 4 * SIZE(CO1) + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulps %xmm9, %xmm8 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps -28 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps -24 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + + mulps %xmm9, %xmm8 + mulps -20 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps 32 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm9, %xmm10 + mulps -12 * SIZE(AO), %xmm9 + addps %xmm10, %xmm0 + movaps -8 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movaps 12 * SIZE(BO), %xmm9 + + mulps %xmm9, %xmm10 + mulps -4 * SIZE(AO), %xmm9 + addps %xmm10, %xmm0 + movaps 48 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movaps 32 * SIZE(BO), %xmm9 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm11, %xmm12 + mulps 4 * SIZE(AO), %xmm11 + addps %xmm12, %xmm0 + movaps 8 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movaps 20 * SIZE(BO), %xmm11 + + mulps %xmm11, %xmm12 + mulps 12 * SIZE(AO), %xmm11 + addps %xmm12, %xmm0 + movaps 64 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm11, %xmm14 + mulps 20 * SIZE(AO), %xmm11 + addps %xmm14, %xmm0 + movaps 24 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movaps 28 * SIZE(BO), %xmm11 + + mulps %xmm11, %xmm14 + mulps 28 * SIZE(AO), %xmm11 + addps %xmm14, %xmm0 + movaps 80 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movaps 48 * SIZE(BO), %xmm11 + + addq $64 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulps %xmm9, %xmm8 + mulps -28 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps -24 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + + addq $8 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 +#ifndef TRMMKERNEL + addps %xmm8, %xmm0 + addps %xmm9, %xmm4 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm4, 4 * SIZE(CO1) + movhps %xmm4, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L111 + ALIGN_4 + +.L120: + testq $4, M + je .L130 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L125 + ALIGN_4 + +.L122: + mulps %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps -28 * SIZE(AO), %xmm8 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 32 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -24 * SIZE(AO), %xmm8 + mulps 8 * SIZE(BO), %xmm8 + addps %xmm8, %xmm2 + movaps -20 * SIZE(AO), %xmm8 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm8, %xmm3 + movaps 0 * SIZE(AO), %xmm8 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + movaps -12 * SIZE(AO), %xmm10 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 48 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps -8 * SIZE(AO), %xmm10 + mulps 24 * SIZE(BO), %xmm10 + addps %xmm10, %xmm2 + movaps -4 * SIZE(AO), %xmm10 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm10, %xmm3 + movaps 16 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L122 + ALIGN_4 + +.L125: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L128 + ALIGN_4 + +.L126: + mulps %xmm8, %xmm9 + movaps -28 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L126 + ALIGN_4 + +.L128: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 +#endif + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + mulps %xmm15, %xmm0 +#ifndef TRMMKERNEL + addps %xmm8, %xmm0 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L130: + testq $2, M + je .L140 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -24 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L135 + ALIGN_4 + +.L132: + mulps %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -28 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + movsd -26 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 12 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + movsd -16 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 32 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + movsd -22 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 20 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd -20 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 24 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd -18 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 28 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd -8 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 48 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L132 + ALIGN_4 + +.L135: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L138 + ALIGN_4 + +.L136: + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L136 + ALIGN_4 + +.L138: + addps %xmm1, %xmm0 + mulps %xmm15, %xmm0 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm8 + addps %xmm8, %xmm0 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L140: + testq $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movss -32 * SIZE(AO), %xmm8 + movss -28 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L145 + ALIGN_4 + +.L142: + mulss %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss -31 * SIZE(AO), %xmm8 + mulss 4 * SIZE(BO), %xmm8 + addss %xmm9, %xmm0 + movss 32 * SIZE(BO), %xmm9 + addss %xmm8, %xmm1 + movss -30 * SIZE(AO), %xmm8 + mulss 8 * SIZE(BO), %xmm8 + addss %xmm8, %xmm2 + movss -29 * SIZE(AO), %xmm8 + mulss 12 * SIZE(BO), %xmm8 + addss %xmm8, %xmm3 + movss -24 * SIZE(AO), %xmm8 + mulss %xmm10, %xmm11 + movss -27 * SIZE(AO), %xmm10 + mulss 20 * SIZE(BO), %xmm10 + addss %xmm11, %xmm0 + movss 48 * SIZE(BO), %xmm11 + addss %xmm10, %xmm1 + movss -26 * SIZE(AO), %xmm10 + mulss 24 * SIZE(BO), %xmm10 + addss %xmm10, %xmm2 + movss -25 * SIZE(AO), %xmm10 + mulss 28 * SIZE(BO), %xmm10 + addss %xmm10, %xmm3 + movss -20 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L142 + ALIGN_4 + +.L145: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movss ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L148 + ALIGN_4 + +.L146: + mulss %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addss %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L146 + ALIGN_4 + +.L148: + addss %xmm1, %xmm0 + addss %xmm3, %xmm2 + addss %xmm2, %xmm0 + + mulss %xmm15, %xmm0 + +#ifndef TRMMKERNEL + movss 0 * SIZE(CO1), %xmm8 + addss %xmm8, %xmm0 +#endif + movss %xmm0, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %rbx, %rsp + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_8x4_core2.S b/kernel/x86_64/gemm_kernel_8x4_core2.S new file mode 100644 index 0000000..285d644 --- /dev/null +++ b/kernel/x86_64/gemm_kernel_8x4_core2.S @@ -0,0 +1,2615 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH_R (16 * 4 + 0) +#define PREFETCH_W (PREFETCH_R * 2) + +#define PREFETCHSIZE (16 * 13 + 10) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + +#endif + + movq %rsp, %r15 # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + shufps $0, %xmm0, %xmm0 + movaps %xmm0, ALPHA + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + +#ifdef TRMMKERNEL + movsd %xmm12, OFFSET + movsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq OLD_M, M + movq OLD_N, N + + leaq (, LDC, SIZE), LDC + + movq N, J + sarq $2, J + jle .L50 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq 32 * SIZE + BUFFER, BO + + movaps -32 * SIZE(B), %xmm3 + + movq K, %rax + sarq $2, %rax + jle .L05 + ALIGN_4 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + movaps -28 * SIZE(B), %xmm7 + movaps -24 * SIZE(B), %xmm11 + movaps -20 * SIZE(B), %xmm15 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) + pshufd $0x00, %xmm3, %xmm0 + movaps %xmm0, -32 * SIZE(BO) + pshufd $0x55, %xmm3, %xmm1 + movaps %xmm1, -28 * SIZE(BO) + pshufd $0xaa, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(BO) + pshufd $0xff, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(BO) + + movaps -16 * SIZE(B), %xmm3 + + prefetcht0 (PREFETCH_W + 16) * SIZE(BO) + pshufd $0x00, %xmm7, %xmm4 + movaps %xmm4, -16 * SIZE(BO) + pshufd $0x55, %xmm7, %xmm5 + movaps %xmm5, -12 * SIZE(BO) + pshufd $0xaa, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(BO) + pshufd $0xff, %xmm7, %xmm7 + movaps %xmm7, -4 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 32) * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm8 + movaps %xmm8, 0 * SIZE(BO) + pshufd $0x55, %xmm11, %xmm9 + movaps %xmm9, 4 * SIZE(BO) + pshufd $0xaa, %xmm11, %xmm10 + movaps %xmm10, 8 * SIZE(BO) + pshufd $0xff, %xmm11, %xmm11 + movaps %xmm11, 12 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 48) * SIZE(BO) + + pshufd $0x00, %xmm15, %xmm12 + movaps %xmm12, 16 * SIZE(BO) + pshufd $0x55, %xmm15, %xmm13 + movaps %xmm13, 20 * SIZE(BO) + pshufd $0xaa, %xmm15, %xmm14 + movaps %xmm14, 24 * SIZE(BO) + pshufd $0xff, %xmm15, %xmm15 + movaps %xmm15, 28 * SIZE(BO) + + subq $-16 * SIZE, B + subq $-64 * SIZE, BO + subq $1, %rax + jne .L02 + ALIGN_4 + +.L05: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L06: + movaps -32 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm1, -28 * SIZE(BO) + movaps %xmm2, -24 * SIZE(BO) + movaps %xmm3, -20 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + subq $1, %rax + jne .L06 + ALIGN_4 + +.L10: + leaq (PREFETCH_R + 0) * SIZE(B), BB + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $3, I + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 40 * SIZE + BUFFER, BO +#else + leaq 40 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + movaps -32 * SIZE(AO), %xmm0 + pxor %xmm9, %xmm9 + movaps -28 * SIZE(AO), %xmm1 + pxor %xmm10, %xmm10 + movaps -40 * SIZE(BO), %xmm6 + pxor %xmm11, %xmm11 + movaps -36 * SIZE(BO), %xmm7 + + prefetcht2 -32 * SIZE(BB) + + prefetcht0 7 * SIZE(CO1) + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + prefetcht0 7 * SIZE(CO2) + pxor %xmm14, %xmm14 + pxor %xmm15, %xmm15 + + prefetcht0 7 * SIZE(CO1, LDC, 2) + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + prefetcht0 7 * SIZE(CO2, LDC, 2) + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + subq $-16 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L15 + ALIGN_4 + +.L12: + addps %xmm2, %xmm10 + movaps -32 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + PADDING; + movaps %xmm6, %xmm3 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + movaps -28 * SIZE(BO), %xmm4 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps -24 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + movaps -20 * SIZE(BO), %xmm7 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -20 * SIZE(AO), %xmm1 + + addps %xmm2, %xmm10 + movaps -16 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + movaps -12 * SIZE(BO), %xmm4 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps -8 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + movaps -4 * SIZE(BO), %xmm7 + addps %xmm5, %xmm13 + PADDING; + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -12 * SIZE(AO), %xmm1 + + addps %xmm2, %xmm10 + movaps 0 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + PADDING; + movaps %xmm6, %xmm3 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + movaps 4 * SIZE(BO), %xmm4 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps 8 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + movaps 12 * SIZE(BO), %xmm7 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -4 * SIZE(AO), %xmm1 + + addps %xmm2, %xmm10 + movaps 16 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + movaps 20 * SIZE(BO), %xmm4 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps 24 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + subq $-32 * SIZE, AO + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + movaps 28 * SIZE(BO), %xmm7 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -32 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -28 * SIZE(AO), %xmm1 + + subq $-64 * SIZE, BO + subq $1, %rax + BRANCH + jg .L12 + ALIGN_4 + +.L15: + prefetcht2 -16 * SIZE(BB) + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + addps %xmm2, %xmm10 + movaps -32 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + movaps -28 * SIZE(BO), %xmm4 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps -24 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + addq $8 * SIZE, AO + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + movaps -20 * SIZE(BO), %xmm7 + addps %xmm5, %xmm13 + addq $16 * SIZE, BO + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -32 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -28 * SIZE(AO), %xmm1 + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L18: + movaps ALPHA, %xmm7 + + addps %xmm2, %xmm10 + addps %xmm3, %xmm14 + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 + movsd 4 * SIZE(CO2), %xmm3 + movhps 6 * SIZE(CO2), %xmm3 +#endif + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm11 + mulps %xmm7, %xmm12 + mulps %xmm7, %xmm13 + mulps %xmm7, %xmm14 + mulps %xmm7, %xmm15 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1, LDC, 2), %xmm4 + movhps 2 * SIZE(CO1, LDC, 2), %xmm4 + movsd 4 * SIZE(CO1, LDC, 2), %xmm5 + movhps 6 * SIZE(CO1, LDC, 2), %xmm5 + movsd 0 * SIZE(CO2, LDC, 2), %xmm6 + movhps 2 * SIZE(CO2, LDC, 2), %xmm6 + movsd 4 * SIZE(CO2, LDC, 2), %xmm7 + movhps 6 * SIZE(CO2, LDC, 2), %xmm7 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm12 + addps %xmm2, %xmm9 + addps %xmm3, %xmm13 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + movlps %xmm9, 0 * SIZE(CO2) + movhps %xmm9, 2 * SIZE(CO2) + movlps %xmm13, 4 * SIZE(CO2) + movhps %xmm13, 6 * SIZE(CO2) + +#ifndef TRMMKERNEL + addps %xmm4, %xmm10 + addps %xmm5, %xmm14 + addps %xmm6, %xmm11 + addps %xmm7, %xmm15 +#endif + + movlps %xmm10, 0 * SIZE(CO1, LDC, 2) + movhps %xmm10, 2 * SIZE(CO1, LDC, 2) + movlps %xmm14, 4 * SIZE(CO1, LDC, 2) + movhps %xmm14, 6 * SIZE(CO1, LDC, 2) + movlps %xmm11, 0 * SIZE(CO2, LDC, 2) + movhps %xmm11, 2 * SIZE(CO2, LDC, 2) + movlps %xmm15, 4 * SIZE(CO2, LDC, 2) + movhps %xmm15, 6 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 + subq $1, I + jg .L11 + ALIGN_4 + +.L20: + testq $4, M + jle .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L25 + ALIGN_4 + +.L21: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + movaps -24 * SIZE(BO), %xmm4 + movaps -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movaps -28 * SIZE(AO), %xmm0 + movaps -16 * SIZE(BO), %xmm2 + movaps -12 * SIZE(BO), %xmm3 + movaps -8 * SIZE(BO), %xmm4 + movaps -4 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movaps -24 * SIZE(AO), %xmm0 + movaps 0 * SIZE(BO), %xmm2 + movaps 4 * SIZE(BO), %xmm3 + movaps 8 * SIZE(BO), %xmm4 + movaps 12 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movaps -20 * SIZE(AO), %xmm0 + movaps 16 * SIZE(BO), %xmm2 + movaps 20 * SIZE(BO), %xmm3 + movaps 24 * SIZE(BO), %xmm4 + movaps 28 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + subq $-16 * SIZE, AO + subq $-64 * SIZE, BO + subq $1, %rax + jg .L21 + ALIGN_4 + +.L25: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L28 + ALIGN_4 + +.L26: + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + movaps -24 * SIZE(BO), %xmm4 + movaps -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + addq $ 4 * SIZE, AO + addq $16 * SIZE, BO + subq $1, %rax + jg .L26 + ALIGN_4 + +.L28: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm4 + movhps 2 * SIZE(CO1, LDC, 2), %xmm4 + movsd 0 * SIZE(CO2, LDC, 2), %xmm6 + movhps 2 * SIZE(CO2, LDC, 2), %xmm6 +#endif + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm11 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 + addps %xmm2, %xmm9 + addps %xmm4, %xmm10 + addps %xmm6, %xmm11 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm9, 0 * SIZE(CO2) + movhps %xmm9, 2 * SIZE(CO2) + + movlps %xmm10, 0 * SIZE(CO1, LDC, 2) + movhps %xmm10, 2 * SIZE(CO1, LDC, 2) + movlps %xmm11, 0 * SIZE(CO2, LDC, 2) + movhps %xmm11, 2 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + subq $1, I + ALIGN_4 + +.L30: + testq $2, M + jle .L40 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + pxor %xmm14, %xmm14 + pxor %xmm15, %xmm15 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L35 + ALIGN_4 + +.L31: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm2 + movsd -28 * SIZE(BO), %xmm3 + movsd -24 * SIZE(BO), %xmm4 + movsd -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movsd -30 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + movsd -12 * SIZE(BO), %xmm3 + movsd -8 * SIZE(BO), %xmm4 + movsd -4 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movsd -28 * SIZE(AO), %xmm0 + movsd 0 * SIZE(BO), %xmm2 + movsd 4 * SIZE(BO), %xmm3 + movsd 8 * SIZE(BO), %xmm4 + movsd 12 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movsd -26 * SIZE(AO), %xmm0 + movsd 16 * SIZE(BO), %xmm2 + movsd 20 * SIZE(BO), %xmm3 + movsd 24 * SIZE(BO), %xmm4 + movsd 28 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-64 * SIZE, BO + subq $1, %rax + jg .L31 + ALIGN_4 + +.L35: + movsd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L38 + ALIGN_4 + +.L36: + movsd -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm2 + movsd -28 * SIZE(BO), %xmm3 + movsd -24 * SIZE(BO), %xmm4 + movsd -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + addq $ 2 * SIZE, AO + addq $16 * SIZE, BO + subq $1, %rax + jg .L36 + ALIGN_4 + +.L38: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movsd 0 * SIZE(CO1, LDC, 2), %xmm4 + movsd 0 * SIZE(CO2, LDC, 2), %xmm6 +#endif + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm11 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 + addps %xmm2, %xmm9 + addps %xmm4, %xmm10 + addps %xmm6, %xmm11 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movlps %xmm9, 0 * SIZE(CO2) + movlps %xmm10, 0 * SIZE(CO1, LDC, 2) + movlps %xmm11, 0 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L40: + testq $1, M + jle .L49 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + pxor %xmm14, %xmm14 + pxor %xmm15, %xmm15 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L45 + ALIGN_4 + +.L41: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movss -32 * SIZE(AO), %xmm0 + movss -32 * SIZE(BO), %xmm2 + movss -28 * SIZE(BO), %xmm3 + movss -24 * SIZE(BO), %xmm4 + movss -20 * SIZE(BO), %xmm5 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + mulss %xmm0, %xmm5 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + addss %xmm4, %xmm10 + addss %xmm5, %xmm11 + + movss -31 * SIZE(AO), %xmm0 + movss -16 * SIZE(BO), %xmm2 + movss -12 * SIZE(BO), %xmm3 + movss -8 * SIZE(BO), %xmm4 + movss -4 * SIZE(BO), %xmm5 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + mulss %xmm0, %xmm5 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + addss %xmm4, %xmm10 + addss %xmm5, %xmm11 + + movss -30 * SIZE(AO), %xmm0 + movss 0 * SIZE(BO), %xmm2 + movss 4 * SIZE(BO), %xmm3 + movss 8 * SIZE(BO), %xmm4 + movss 12 * SIZE(BO), %xmm5 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + mulss %xmm0, %xmm5 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + addss %xmm4, %xmm10 + addss %xmm5, %xmm11 + + movss -29 * SIZE(AO), %xmm0 + movss 16 * SIZE(BO), %xmm2 + movss 20 * SIZE(BO), %xmm3 + movss 24 * SIZE(BO), %xmm4 + movss 28 * SIZE(BO), %xmm5 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + mulss %xmm0, %xmm5 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + addss %xmm4, %xmm10 + addss %xmm5, %xmm11 + + subq $ -4 * SIZE, AO + subq $-64 * SIZE, BO + subq $1, %rax + jg .L41 + ALIGN_4 + +.L45: + movss ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L48 + ALIGN_4 + +.L46: + movss -32 * SIZE(AO), %xmm0 + movss -32 * SIZE(BO), %xmm2 + movss -28 * SIZE(BO), %xmm3 + movss -24 * SIZE(BO), %xmm4 + movss -20 * SIZE(BO), %xmm5 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + mulss %xmm0, %xmm5 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + addss %xmm4, %xmm10 + addss %xmm5, %xmm11 + + addq $ 1 * SIZE, AO + addq $16 * SIZE, BO + subq $1, %rax + jg .L46 + ALIGN_4 + +.L48: +#ifndef TRMMKERNEL + movss 0 * SIZE(CO1), %xmm0 + movss 0 * SIZE(CO2), %xmm2 + movss 0 * SIZE(CO1, LDC, 2), %xmm4 + movss 0 * SIZE(CO2, LDC, 2), %xmm6 +#endif + + mulss %xmm7, %xmm8 + mulss %xmm7, %xmm9 + mulss %xmm7, %xmm10 + mulss %xmm7, %xmm11 + +#ifndef TRMMKERNEL + addss %xmm0, %xmm8 + addss %xmm2, %xmm9 + addss %xmm4, %xmm10 + addss %xmm6, %xmm11 +#endif + + movss %xmm8, 0 * SIZE(CO1) + movss %xmm9, 0 * SIZE(CO2) + movss %xmm10, 0 * SIZE(CO1, LDC, 2) + movss %xmm11, 0 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leaq (C, LDC, 4), C + subq $1, J + jg .L01 + ALIGN_4 + +.L50: + testq $2, N + jle .L100 + ALIGN_4 + +.L51: +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $3, %rax + jle .L53 + + addq %rax, %rax + ALIGN_4 + +.L52: + movaps -32 * SIZE(B), %xmm3 + movaps -28 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + prefetcht0 (PREFETCH_W + 16) * SIZE(BO) + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + subq $1, %rax + jne .L52 + ALIGN_4 + +.L53: + movq K, %rax + andq $7, %rax + BRANCH + jle .L55 + ALIGN_4 + +.L54: + movss -32 * SIZE(B), %xmm8 + movss -31 * SIZE(B), %xmm9 + + shufps $0, %xmm8, %xmm8 + shufps $0, %xmm9, %xmm9 + + movaps %xmm8, 0 * SIZE(BO) + movaps %xmm9, 4 * SIZE(BO) + + addq $2 * SIZE, B + addq $8 * SIZE, BO + subq $1, %rax + jne .L54 + ALIGN_4 + +.L55: + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO # aoffset = a + + movq M, I + sarq $3, I + jle .L70 + ALIGN_4 + +.L60: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + + prefetcht0 7 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht0 7 * SIZE(CO2) + pxor %xmm13, %xmm13 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L65 + ALIGN_4 + +.L61: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + + movaps -32 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -28 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + addps %xmm4, %xmm9 + addps %xmm5, %xmm13 + + movaps -24 * SIZE(AO), %xmm0 + movaps -20 * SIZE(AO), %xmm1 + + movaps -24 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -20 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + addps %xmm4, %xmm9 + addps %xmm5, %xmm13 + + movaps -16 * SIZE(AO), %xmm0 + movaps -12 * SIZE(AO), %xmm1 + + movaps -16 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -12 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + addps %xmm4, %xmm9 + addps %xmm5, %xmm13 + + movaps -8 * SIZE(AO), %xmm0 + movaps -4 * SIZE(AO), %xmm1 + + movaps -8 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -4 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + addps %xmm4, %xmm9 + addps %xmm5, %xmm13 + + subq $-32 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jg .L61 + ALIGN_4 + +.L65: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L68 + ALIGN_4 + +.L66: + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + + movaps -32 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -28 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + addps %xmm4, %xmm9 + addps %xmm5, %xmm13 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L66 + ALIGN_4 + +.L68: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 + movsd 4 * SIZE(CO2), %xmm3 + movhps 6 * SIZE(CO2), %xmm3 +#endif + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm12 + mulps %xmm7, %xmm13 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 + addps %xmm1, %xmm12 + addps %xmm2, %xmm9 + addps %xmm3, %xmm13 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + movlps %xmm9, 0 * SIZE(CO2) + movhps %xmm9, 2 * SIZE(CO2) + movlps %xmm13, 4 * SIZE(CO2) + movhps %xmm13, 6 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 + subq $1, I + jg .L60 + ALIGN_4 + +.L70: + testq $4, M + jle .L80 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L75 + ALIGN_4 + +.L71: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + movaps -32 * SIZE(BO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + movaps -24 * SIZE(BO), %xmm4 + movaps -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movaps -24 * SIZE(AO), %xmm0 + movaps -20 * SIZE(AO), %xmm1 + movaps -16 * SIZE(BO), %xmm2 + movaps -12 * SIZE(BO), %xmm3 + movaps -8 * SIZE(BO), %xmm4 + movaps -4 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + subq $-16 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jg .L71 + ALIGN_4 + +.L75: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L78 + ALIGN_4 + +.L76: + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 +#endif + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 + addps %xmm2, %xmm9 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm9, 0 * SIZE(CO2) + movhps %xmm9, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L80: + testq $2, M + jle .L90 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L85 + ALIGN_4 + +.L81: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -32 * SIZE(AO), %xmm0 + movsd -30 * SIZE(AO), %xmm1 + movsd -32 * SIZE(BO), %xmm2 + movsd -28 * SIZE(BO), %xmm3 + movsd -24 * SIZE(BO), %xmm4 + movsd -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movsd -28 * SIZE(AO), %xmm0 + movsd -26 * SIZE(AO), %xmm1 + movsd -16 * SIZE(BO), %xmm2 + movsd -12 * SIZE(BO), %xmm3 + movsd -8 * SIZE(BO), %xmm4 + movsd -4 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jg .L81 + ALIGN_4 + +.L85: + movsd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L88 + ALIGN_4 + +.L86: + movsd -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm2 + movsd -28 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L86 + ALIGN_4 + +.L88: + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 +#endif + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 + addps %xmm2, %xmm9 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movlps %xmm9, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L90: + testq $1, M + jle .L99 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L95 + ALIGN_4 + +.L91: + + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movss -32 * SIZE(AO), %xmm0 + movss -31 * SIZE(AO), %xmm1 + movss -32 * SIZE(BO), %xmm2 + movss -28 * SIZE(BO), %xmm3 + movss -24 * SIZE(BO), %xmm4 + movss -20 * SIZE(BO), %xmm5 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm1, %xmm4 + mulss %xmm1, %xmm5 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + addss %xmm4, %xmm10 + addss %xmm5, %xmm11 + + movss -30 * SIZE(AO), %xmm0 + movss -29 * SIZE(AO), %xmm1 + movss -16 * SIZE(BO), %xmm2 + movss -12 * SIZE(BO), %xmm3 + movss -8 * SIZE(BO), %xmm4 + movss -4 * SIZE(BO), %xmm5 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm1, %xmm4 + mulss %xmm1, %xmm5 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + addss %xmm4, %xmm10 + addss %xmm5, %xmm11 + + subq $ -4 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jg .L91 + ALIGN_4 + +.L95: + movss ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L98 + ALIGN_4 + +.L96: + movss -32 * SIZE(AO), %xmm0 + movss -32 * SIZE(BO), %xmm2 + movss -28 * SIZE(BO), %xmm3 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L96 + ALIGN_4 + +.L98: + addss %xmm10, %xmm8 + addss %xmm11, %xmm9 + +#ifndef TRMMKERNEL + movss 0 * SIZE(CO1), %xmm0 + movss 0 * SIZE(CO2), %xmm2 +#endif + + mulss %xmm7, %xmm8 + mulss %xmm7, %xmm9 + +#ifndef TRMMKERNEL + addss %xmm0, %xmm8 + addss %xmm2, %xmm9 +#endif + + movss %xmm8, 0 * SIZE(CO1) + movss %xmm9, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leaq (C, LDC, 2), C + ALIGN_4 + + + +.L100: + testq $1, N + jle .L999 + ALIGN_4 + +.L101: +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $4, %rax + jle .L103 + + addq %rax, %rax + ALIGN_4 + +.L102: + movss -32 * SIZE(B), %xmm0 + movss -31 * SIZE(B), %xmm1 + movss -30 * SIZE(B), %xmm2 + movss -29 * SIZE(B), %xmm3 + movss -28 * SIZE(B), %xmm4 + movss -27 * SIZE(B), %xmm5 + movss -26 * SIZE(B), %xmm6 + movss -25 * SIZE(B), %xmm7 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-32 * SIZE, BO + subq $1, %rax + jne .L102 + ALIGN_4 + +.L103: + movq K, %rax + andq $15, %rax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movss -32 * SIZE(B), %xmm8 + + shufps $0, %xmm8, %xmm8 + + movaps %xmm8, 0 * SIZE(BO) + + addq $1 * SIZE, B + addq $4 * SIZE, BO + subq $1, %rax + jne .L104 + ALIGN_4 + +.L105: + movq C, CO1 + movq A, AO + + movq M, I + sarq $3, I + jle .L120 + ALIGN_4 + +.L110: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + + prefetcht0 7 * SIZE(CO1) + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L115 + ALIGN_4 + +.L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + movaps -32 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + + movaps -24 * SIZE(AO), %xmm0 + movaps -20 * SIZE(AO), %xmm1 + movaps -28 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + addps %xmm2, %xmm9 + addps %xmm3, %xmm13 + + movaps -16 * SIZE(AO), %xmm0 + movaps -12 * SIZE(AO), %xmm1 + movaps -24 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + + movaps -8 * SIZE(AO), %xmm0 + movaps -4 * SIZE(AO), %xmm1 + movaps -20 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + addps %xmm2, %xmm9 + addps %xmm3, %xmm13 + + subq $-32 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jg .L111 + ALIGN_4 + +.L115: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L118 + ALIGN_4 + +.L116: + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + + movaps -32 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + + addq $8 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L116 + ALIGN_4 + +.L118: + addps %xmm9, %xmm8 + addps %xmm13, %xmm12 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 +#endif + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm12 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 + addps %xmm1, %xmm12 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 + subq $1, I + jg .L110 + ALIGN_4 + +.L120: + testq $4, M + jle .L130 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L125 + ALIGN_4 + +.L121: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + movaps -32 * SIZE(BO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + + movaps -24 * SIZE(AO), %xmm0 + movaps -20 * SIZE(AO), %xmm1 + movaps -24 * SIZE(BO), %xmm2 + movaps -20 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm2, %xmm10 + addps %xmm3, %xmm11 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jg .L121 + ALIGN_4 + +.L125: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L128 + ALIGN_4 + +.L126: + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm2 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm8 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L126 + ALIGN_4 + +.L128: + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + addps %xmm9, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 +#endif + + mulps %xmm7, %xmm8 +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L130: + testq $2, M + jle .L140 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L135 + ALIGN_4 + +.L131: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -32 * SIZE(AO), %xmm0 + movsd -30 * SIZE(AO), %xmm1 + movsd -32 * SIZE(BO), %xmm2 + movsd -28 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + + movsd -28 * SIZE(AO), %xmm0 + movsd -26 * SIZE(AO), %xmm1 + movsd -24 * SIZE(BO), %xmm2 + movsd -20 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + addps %xmm2, %xmm10 + addps %xmm3, %xmm11 + + subq $ -8 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jg .L131 + ALIGN_4 + +.L135: + movsd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L138 + ALIGN_4 + +.L136: + movsd -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm2 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm8 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L136 + ALIGN_4 + +.L138: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 +#endif + + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + addps %xmm9, %xmm8 + + mulps %xmm7, %xmm8 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 +#endif + movlps %xmm8, 0 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + addq $2 * SIZE, CO1 + ALIGN_4 + +.L140: + testq $1, M + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L145 + ALIGN_4 + +.L141: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movss -32 * SIZE(AO), %xmm0 + movss -31 * SIZE(AO), %xmm1 + movss -32 * SIZE(BO), %xmm2 + movss -28 * SIZE(BO), %xmm3 + + mulss %xmm0, %xmm2 + mulss %xmm1, %xmm3 + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + + movss -30 * SIZE(AO), %xmm0 + movss -29 * SIZE(AO), %xmm1 + movss -24 * SIZE(BO), %xmm2 + movss -20 * SIZE(BO), %xmm3 + + mulss %xmm0, %xmm2 + mulss %xmm1, %xmm3 + addss %xmm2, %xmm10 + addss %xmm3, %xmm11 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jg .L141 + ALIGN_4 + +.L145: + movss ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L148 + ALIGN_4 + +.L146: + movss -32 * SIZE(AO), %xmm0 + movss -32 * SIZE(BO), %xmm2 + + mulss %xmm0, %xmm2 + addss %xmm2, %xmm8 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L146 + ALIGN_4 + +.L148: +#ifndef TRMMKERNEL + movss 0 * SIZE(CO1), %xmm0 +#endif + + addss %xmm10, %xmm8 + addss %xmm11, %xmm9 + addss %xmm9, %xmm8 + mulss %xmm7, %xmm8 + +#ifndef TRMMKERNEL + addss %xmm0, %xmm8 +#endif + + movss %xmm8, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %r15, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_8x4_penryn.S b/kernel/x86_64/gemm_kernel_8x4_penryn.S new file mode 100644 index 0000000..68ca5fc --- /dev/null +++ b/kernel/x86_64/gemm_kernel_8x4_penryn.S @@ -0,0 +1,2515 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#define PREA %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA 48(%rsp) +#define J 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define ALPHA 224(%rsp) +#define J 232(%rsp) +#define OFFSET 240(%rsp) +#define KK 248(%rsp) +#define KKK 256(%rsp) + +#endif + +#define PREFETCHSIZE (8 * 17 + 4) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + +#endif + + unpcklps %xmm0, %xmm0 + movlps %xmm0, ALPHA + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + salq $BASE_SHIFT, LDC + +#ifdef TRMMKERNEL + movq %r11, OFFSET +#ifndef LEFT + negq %r11 +#endif + movq %r11, KK +#endif + + movq N, J + sarq $2, J + NOBRANCH + jle .L50 + ALIGN_4 + +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + + movq K, %rax + salq $BASE_SHIFT + 2, %rax + leaq (B, %rax), BB + + movq M, I + sarq $3, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorpd %xmm3, %xmm3 + movaps -28 * SIZE(AO), %xmm1 + xorpd %xmm4, %xmm4 + movaps -32 * SIZE(BO), %xmm2 + + xorpd %xmm5, %xmm5 + prefetcht0 -32 * SIZE(BB) + xorpd %xmm6, %xmm6 + + prefetcht2 7 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + prefetcht2 7 * SIZE(CO2) + movapd %xmm4, %xmm10 + movapd %xmm4, %xmm11 + + prefetcht2 7 * SIZE(CO1, LDC, 2) + movapd %xmm4, %xmm12 + movaps %xmm4, %xmm13 + prefetcht2 7 * SIZE(CO2, LDC, 2) + movaps %xmm4, %xmm14 + movaps %xmm4, %xmm15 + + subq $-24 * SIZE, BB + + leaq (PREFETCHSIZE + 0) * SIZE(AO), PREA + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH -32 * SIZE(PREA) + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -20 * SIZE(AO), %xmm1 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -12 * SIZE(AO), %xmm1 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + PREFETCH -16 * SIZE(PREA) + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -20 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -4 * SIZE(AO), %xmm1 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -16 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps 0 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps 4 * SIZE(AO), %xmm1 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + PREFETCH 0 * SIZE(PREA) + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps 8 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps 12 * SIZE(AO), %xmm1 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps 16 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps 20 * SIZE(AO), %xmm1 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + PREFETCH 16 * SIZE(PREA) + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -4 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps 24 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps 28 * SIZE(AO), %xmm1 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + subq $-64 * SIZE, AO + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps 0 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + subq $-32 * SIZE, BO + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -32 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -28 * SIZE(AO), %xmm1 + + subq $-64 * SIZE, PREA + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: + prefetcht0 -16 * SIZE(BB) + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -20 * SIZE(AO), %xmm1 + + addq $8 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + + movddup ALPHA, %xmm3 + + movaps %xmm9, %xmm4 + shufps $0xd8, %xmm8, %xmm9 + shufps $0xd8, %xmm11, %xmm8 + shufps $0xd8, %xmm10, %xmm11 + shufps $0xd8, %xmm4, %xmm10 + + movaps %xmm8, %xmm4 + shufps $0xd8, %xmm10, %xmm8 + shufps $0xd8, %xmm4, %xmm10 + movaps %xmm9, %xmm5 + shufps $0xd8, %xmm11, %xmm9 + shufps $0xd8, %xmm5, %xmm11 + + movaps %xmm13, %xmm4 + shufps $0xd8, %xmm12, %xmm13 + shufps $0xd8, %xmm15, %xmm12 + shufps $0xd8, %xmm14, %xmm15 + shufps $0xd8, %xmm4, %xmm14 + + movaps %xmm12, %xmm4 + shufps $0xd8, %xmm14, %xmm12 + shufps $0xd8, %xmm4, %xmm14 + movaps %xmm13, %xmm5 + shufps $0xd8, %xmm15, %xmm13 + shufps $0xd8, %xmm5, %xmm15 + + mulps %xmm3, %xmm8 + mulps %xmm3, %xmm9 + mulps %xmm3, %xmm10 + mulps %xmm3, %xmm11 + + mulps %xmm3, %xmm12 + mulps %xmm3, %xmm13 + mulps %xmm3, %xmm14 + mulps %xmm3, %xmm15 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 + movsd 4 * SIZE(CO2), %xmm3 + movhps 6 * SIZE(CO2), %xmm3 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm4 + movhps 2 * SIZE(CO1, LDC, 2), %xmm4 + movsd 4 * SIZE(CO1, LDC, 2), %xmm5 + movhps 6 * SIZE(CO1, LDC, 2), %xmm5 + movsd 0 * SIZE(CO2, LDC, 2), %xmm6 + movhps 2 * SIZE(CO2, LDC, 2), %xmm6 + movsd 4 * SIZE(CO2, LDC, 2), %xmm7 + movhps 6 * SIZE(CO2, LDC, 2), %xmm7 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm12 + addps %xmm2, %xmm9 + addps %xmm3, %xmm13 + addps %xmm4, %xmm10 + addps %xmm5, %xmm14 + addps %xmm6, %xmm11 + addps %xmm7, %xmm15 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhps %xmm9, 2 * SIZE(CO2) + movsd %xmm13, 4 * SIZE(CO2) + movhps %xmm13, 6 * SIZE(CO2) + + movsd %xmm10, 0 * SIZE(CO1, LDC, 2) + movhps %xmm10, 2 * SIZE(CO1, LDC, 2) + movsd %xmm14, 4 * SIZE(CO1, LDC, 2) + movhps %xmm14, 6 * SIZE(CO1, LDC, 2) + movsd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhps %xmm11, 2 * SIZE(CO2, LDC, 2) + movsd %xmm15, 4 * SIZE(CO2, LDC, 2) + movhps %xmm15, 6 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 8), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 + decq I + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $4, M + BRANCH + jle .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + movaps -32 * SIZE(BO), %xmm2 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + + movaps %xmm4, %xmm8 + movaps %xmm4, %xmm9 + movaps %xmm4, %xmm10 + movaps %xmm4, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + addps %xmm6, %xmm10 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + addps %xmm4, %xmm11 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + addps %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + addps %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm6, %xmm10 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + addps %xmm4, %xmm11 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + addps %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + addps %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm6, %xmm10 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + addps %xmm4, %xmm11 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + addps %xmm2, %xmm8 + movaps -20 * SIZE(BO), %xmm2 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + addps %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm6, %xmm10 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + addps %xmm4, %xmm11 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + addps %xmm2, %xmm8 + movaps -16 * SIZE(BO), %xmm2 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + subq $-16 * SIZE, AO + addps %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -32 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L28 + ALIGN_3 + +.L26: + addps %xmm6, %xmm10 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + addps %xmm4, %xmm11 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + addps %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + addps %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: + addps %xmm6, %xmm10 + addps %xmm4, %xmm11 + + movddup ALPHA, %xmm3 + + movaps %xmm9, %xmm4 + shufps $0xd8, %xmm8, %xmm9 + shufps $0xd8, %xmm11, %xmm8 + shufps $0xd8, %xmm10, %xmm11 + shufps $0xd8, %xmm4, %xmm10 + + movaps %xmm8, %xmm4 + shufps $0xd8, %xmm10, %xmm8 + shufps $0xd8, %xmm4, %xmm10 + movaps %xmm9, %xmm5 + shufps $0xd8, %xmm11, %xmm9 + shufps $0xd8, %xmm5, %xmm11 + + mulps %xmm3, %xmm8 + mulps %xmm3, %xmm9 + mulps %xmm3, %xmm10 + mulps %xmm3, %xmm11 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm4 + movhps 2 * SIZE(CO1, LDC, 2), %xmm4 + movsd 0 * SIZE(CO2, LDC, 2), %xmm6 + movhps 2 * SIZE(CO2, LDC, 2), %xmm6 + + addps %xmm0, %xmm8 + addps %xmm2, %xmm9 + addps %xmm4, %xmm10 + addps %xmm6, %xmm11 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhps %xmm9, 2 * SIZE(CO2) + + movsd %xmm10, 0 * SIZE(CO1, LDC, 2) + movhps %xmm10, 2 * SIZE(CO1, LDC, 2) + movsd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhps %xmm11, 2 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L30: + testq $2, M + BRANCH + jle .L40 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + movaps -32 * SIZE(BO), %xmm2 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + + movaps %xmm4, %xmm8 + movaps %xmm4, %xmm9 + movaps %xmm4, %xmm10 + movaps %xmm4, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x44, %xmm0, %xmm1 + addps %xmm3, %xmm8 + pshufd $0x50, %xmm2, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm4, %xmm9 + pshufd $0xfa, %xmm2, %xmm4 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm4 + + pshufd $0xee, %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + addps %xmm3, %xmm10 + pshufd $0x50, %xmm2, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm2, %xmm4 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm4 + + pshufd $0x44, %xmm0, %xmm1 + addps %xmm3, %xmm8 + pshufd $0x50, %xmm2, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm4, %xmm9 + pshufd $0xfa, %xmm2, %xmm4 + movaps -20 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm4 + + pshufd $0xee, %xmm0, %xmm1 + movaps -24 * SIZE(AO), %xmm0 + addps %xmm3, %xmm10 + pshufd $0x50, %xmm2, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm2, %xmm4 + movaps -16 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm4 + + subq $-8 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L38 + ALIGN_3 + +.L36: + pshufd $0x44, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + pshufd $0x50, %xmm2, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm4, %xmm9 + pshufd $0xfa, %xmm2, %xmm4 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm4 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: + movddup ALPHA, %xmm2 + + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + addps %xmm3, %xmm8 + addps %xmm4, %xmm9 + + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 0 * SIZE(CO2), %xmm0 + movsd 0 * SIZE(CO1, LDC, 2), %xmm1 + movhps 0 * SIZE(CO2, LDC, 2), %xmm1 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm9 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 0 * SIZE(CO2) + movsd %xmm9, 0 * SIZE(CO1, LDC, 2) + movhps %xmm9, 0 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L40: + testq $1, M + BRANCH + jle .L49 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + movss -30 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm9 + movaps -24 * SIZE(BO), %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + movss -29 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm8 + movaps -20 * SIZE(BO), %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + movss -28 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm9 + movaps -16 * SIZE(BO), %xmm2 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L48 + ALIGN_3 + +.L46: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_3 + +.L48: + movddup ALPHA, %xmm2 + addps %xmm9, %xmm8 + mulps %xmm2, %xmm8 + + pshufd $0xff, %xmm8, %xmm11 + pshufd $0xaa, %xmm8, %xmm10 + pshufd $0x55, %xmm8, %xmm9 + pshufd $0x00, %xmm8, %xmm8 + +#ifndef TRMMKERNEL + addss 0 * SIZE(CO1), %xmm8 + addss 0 * SIZE(CO2), %xmm9 + addss 0 * SIZE(CO1, LDC, 2), %xmm10 + addss 0 * SIZE(CO2, LDC, 2), %xmm11 +#endif + + movss %xmm8, 0 * SIZE(CO1) + movss %xmm9, 0 * SIZE(CO2) + movss %xmm10, 0 * SIZE(CO1, LDC, 2) + movss %xmm11, 0 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + movq BO, B + + leaq (C, LDC, 4), C + + subq $1, J + BRANCH + jg .L10 + ALIGN_4 + +.L50: + testq $2, N + jle .L90 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + + movq M, I + sarq $3, I + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO + leaq (BO, %rax, 2), BO +#endif + + prefetcht2 -32 * SIZE(BB) + subq $-8 * SIZE, BB + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + movaps -28 * SIZE(AO), %xmm1 + xorps %xmm4, %xmm4 + movaps -32 * SIZE(BO), %xmm2 + + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + + prefetcht0 7 * SIZE(CO1) + movaps %xmm4, %xmm8 + movaps %xmm4, %xmm9 + prefetcht0 7 * SIZE(CO2) + movaps %xmm4, %xmm10 + movaps %xmm4, %xmm11 + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm5, %xmm10 + pshufd $0x00, %xmm2, %xmm5 + mulps %xmm1, %xmm5 + addps %xmm6, %xmm11 + pshufd $0x55, %xmm2, %xmm6 + mulps %xmm1, %xmm6 + movaps -20 * SIZE(AO), %xmm1 + + addps %xmm3, %xmm8 + pshufd $0xaa, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0xff, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + + addps %xmm5, %xmm10 + pshufd $0xaa, %xmm2, %xmm5 + mulps %xmm1, %xmm5 + addps %xmm6, %xmm11 + pshufd $0xff, %xmm2, %xmm6 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm6 + movaps -12 * SIZE(AO), %xmm1 + + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + + addps %xmm5, %xmm10 + pshufd $0x00, %xmm2, %xmm5 + mulps %xmm1, %xmm5 + addps %xmm6, %xmm11 + pshufd $0x55, %xmm2, %xmm6 + mulps %xmm1, %xmm6 + movaps -4 * SIZE(AO), %xmm1 + + addps %xmm3, %xmm8 + pshufd $0xaa, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0xff, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps 0 * SIZE(AO), %xmm0 + + addps %xmm5, %xmm10 + pshufd $0xaa, %xmm2, %xmm5 + mulps %xmm1, %xmm5 + addps %xmm6, %xmm11 + pshufd $0xff, %xmm2, %xmm6 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm6 + movaps 4 * SIZE(AO), %xmm1 + + subq $-32 * SIZE, AO + subq $ -8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L52 + ALIGN_3 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm5, %xmm10 + pshufd $0x00, %xmm2, %xmm5 + mulps %xmm1, %xmm5 + addps %xmm6, %xmm11 + pshufd $0x55, %xmm2, %xmm6 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm6 + movaps -20 * SIZE(AO), %xmm1 + + addq $8 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_3 + +.L58: + movddup ALPHA, %xmm7 + + addps %xmm3, %xmm8 + addps %xmm4, %xmm9 + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm11 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 + movsd 4 * SIZE(CO2), %xmm3 + movhps 6 * SIZE(CO2), %xmm3 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm10 + addps %xmm2, %xmm9 + addps %xmm3, %xmm11 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm10, 4 * SIZE(CO1) + movhps %xmm10, 6 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhps %xmm9, 2 * SIZE(CO2) + movsd %xmm11, 4 * SIZE(CO2) + movhps %xmm11, 6 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 8), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $4, M + BRANCH + jle .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + movaps -32 * SIZE(BO), %xmm2 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm3, %xmm10 + pshufd $0xaa, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xff, %xmm2, %xmm4 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm3, %xmm10 + pshufd $0xaa, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xff, %xmm2, %xmm4 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L62 + ALIGN_3 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L68 + ALIGN_3 + +.L66: + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_3 + +.L68: + movddup ALPHA, %xmm7 + + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + addps %xmm3, %xmm8 + addps %xmm4, %xmm9 + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 + + addps %xmm0, %xmm8 + addps %xmm2, %xmm9 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhps %xmm9, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L70: + testq $2, M + BRANCH + jle .L80 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + movaps -32 * SIZE(BO), %xmm2 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_3 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm3, %xmm8 + pshufd $0x44, %xmm0, %xmm1 + pshufd $0x50, %xmm2, %xmm3 + mulps %xmm1, %xmm3 + + addps %xmm3, %xmm9 + pshufd $0xee, %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + pshufd $0xfa, %xmm2, %xmm3 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm3, %xmm8 + pshufd $0x44, %xmm0, %xmm1 + pshufd $0x50, %xmm2, %xmm3 + mulps %xmm1, %xmm3 + + addps %xmm3, %xmm9 + pshufd $0xee, %xmm0, %xmm1 + movaps -24 * SIZE(AO), %xmm0 + pshufd $0xfa, %xmm2, %xmm3 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm3 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L72 + ALIGN_3 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L78 + ALIGN_3 + +.L76: + addps %xmm3, %xmm8 + pshufd $0x44, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + pshufd $0x50, %xmm2, %xmm3 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm3 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_3 + +.L78: + movddup ALPHA, %xmm2 + + addps %xmm9, %xmm8 + addps %xmm3, %xmm8 + + mulps %xmm2, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 0 * SIZE(CO2), %xmm0 + + addps %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L80: + testq $1, M + BRANCH + jle .L89 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movsd -32 * SIZE(BO), %xmm2 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L85 + ALIGN_3 + +.L82: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm8 + movsd -30 * SIZE(BO), %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + movss -30 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm9 + movsd -28 * SIZE(BO), %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + movss -29 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm8 + movsd -26 * SIZE(BO), %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + movss -28 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm9 + movsd -24 * SIZE(BO), %xmm2 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L82 + ALIGN_3 + +.L85: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L88 + ALIGN_3 + +.L86: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm8 + movsd -30 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L86 + ALIGN_3 + +.L88: + movddup ALPHA, %xmm2 + addps %xmm9, %xmm8 + mulps %xmm2, %xmm8 + + pshufd $0x55, %xmm8, %xmm9 + pshufd $0x00, %xmm8, %xmm8 + +#ifndef TRMMKERNEL + addss 0 * SIZE(CO1), %xmm8 + addss 0 * SIZE(CO2), %xmm9 +#endif + + movss %xmm8, 0 * SIZE(CO1) + movss %xmm9, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L89: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + movq BO, B + + leaq (C, LDC, 2), C + ALIGN_4 + +.L90: + testq $1, N + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + movq A, AO + + movq M, I + sarq $3, I + NOBRANCH + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO + leaq (BO, %rax, 1), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -28 * SIZE(AO), %xmm1 + xorps %xmm9, %xmm9 + movsd -32 * SIZE(BO), %xmm2 + xorps %xmm10, %xmm10 + prefetcht0 7 * SIZE(CO1) + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L95 + ALIGN_3 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm3, %xmm0 + addps %xmm0, %xmm8 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm9 + movaps -20 * SIZE(AO), %xmm1 + + pshufd $0x55, %xmm2, %xmm3 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm3, %xmm0 + addps %xmm0, %xmm10 + movaps -16 * SIZE(AO), %xmm0 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm11 + movaps -12 * SIZE(AO), %xmm1 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm3, %xmm0 + addps %xmm0, %xmm8 + movaps -8 * SIZE(AO), %xmm0 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm9 + movaps -4 * SIZE(AO), %xmm1 + + pshufd $0x55, %xmm2, %xmm3 + movsd -28 * SIZE(BO), %xmm2 + mulps %xmm3, %xmm0 + addps %xmm0, %xmm10 + movaps 0 * SIZE(AO), %xmm0 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm11 + movaps 4 * SIZE(AO), %xmm1 + + subq $-32 * SIZE, AO + subq $ -4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L92 + ALIGN_3 + +.L95: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_3 + +.L96: + pshufd $0x00, %xmm2, %xmm3 + movss -31 * SIZE(BO), %xmm2 + mulps %xmm3, %xmm0 + addps %xmm0, %xmm8 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm9 + movaps -20 * SIZE(AO), %xmm1 + + addq $8 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L96 + ALIGN_3 + +.L98: + movddup ALPHA, %xmm7 + + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm9 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm9, 4 * SIZE(CO1) + movhps %xmm9, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 8), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 + decq I + BRANCH + jg .L91 + ALIGN_4 + +.L100: + testq $4, M + BRANCH + jle .L110 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movsd -32 * SIZE(BO), %xmm2 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L105 + ALIGN_3 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + pshufd $0x55, %xmm2, %xmm3 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm3 + movaps -24 * SIZE(AO), %xmm0 + addps %xmm3, %xmm9 + + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + movaps -20 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + pshufd $0x55, %xmm2, %xmm3 + movsd -28 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm3 + movaps -16 * SIZE(AO), %xmm0 + addps %xmm3, %xmm9 + + subq $-16 * SIZE, AO + subq $ -4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L102 + ALIGN_3 + +.L105: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L108 + ALIGN_3 + +.L106: + pshufd $0x00, %xmm2, %xmm3 + movss -31 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L106 + ALIGN_3 + +.L108: + movddup ALPHA, %xmm7 + + addps %xmm9, %xmm8 + mulps %xmm7, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + + addps %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + ALIGN_4 + +.L110: + testq $2, M + BRANCH + jle .L120 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + movsd -32 * SIZE(BO), %xmm2 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L115 + ALIGN_3 + +.L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + pshufd $0x55, %xmm2, %xmm3 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm3 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + movsd -26 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + pshufd $0x55, %xmm2, %xmm3 + movsd -28 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm3 + movsd -24 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + subq $-8 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L112 + ALIGN_3 + +.L115: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L118 + ALIGN_3 + +.L116: + pshufd $0x00, %xmm2, %xmm3 + movss -31 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L116 + ALIGN_3 + +.L118: + movddup ALPHA, %xmm2 + + mulps %xmm2, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + + addps %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + ALIGN_4 + +.L120: + testq $1, M + BRANCH + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + + movss -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movss -32 * SIZE(BO), %xmm2 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L125 + ALIGN_3 + +.L122: + mulss %xmm0, %xmm2 + movss -31 * SIZE(AO), %xmm0 + addss %xmm2, %xmm8 + movss -31 * SIZE(BO), %xmm2 + + mulss %xmm0, %xmm2 + movss -30 * SIZE(AO), %xmm0 + addss %xmm2, %xmm9 + movss -30 * SIZE(BO), %xmm2 + + mulss %xmm0, %xmm2 + movss -29 * SIZE(AO), %xmm0 + addss %xmm2, %xmm8 + movss -29 * SIZE(BO), %xmm2 + + mulss %xmm0, %xmm2 + movss -28 * SIZE(AO), %xmm0 + addss %xmm2, %xmm9 + movss -28 * SIZE(BO), %xmm2 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L122 + ALIGN_3 + +.L125: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L128 + ALIGN_3 + +.L126: + mulss %xmm0, %xmm2 + movss -31 * SIZE(AO), %xmm0 + addss %xmm2, %xmm8 + movss -31 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L126 + ALIGN_3 + +.L128: + movss ALPHA, %xmm2 + addss %xmm9, %xmm8 + mulss %xmm2, %xmm8 + +#ifndef TRMMKERNEL + addss 0 * SIZE(CO1), %xmm8 +#endif + + movss %xmm8, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_8x4_sse.S b/kernel/x86_64/gemm_kernel_8x4_sse.S new file mode 100644 index 0000000..218cb04 --- /dev/null +++ b/kernel/x86_64/gemm_kernel_8x4_sse.S @@ -0,0 +1,3446 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi + +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define BUFFER 256(%rsp) + +#ifdef OPTERON +#define movsd movlps +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 9 + 8) +#endif + +#if defined(GENERIC) || defined(NANO) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 5 + 8) +#endif + +#define RPREFETCHSIZE (8 * 7 + 4) +#define WPREFETCHSIZE (8 * 8 + 4) + +#ifndef GENERIC +#define KERNEL1(xx) \ + mulps %xmm0, %xmm1 ;\ + addps %xmm1, %xmm8 ;\ + movaps -32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm0, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps -28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm0, %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ + mulps -20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ + addps %xmm5, %xmm10 ;\ + movaps -24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm0, %xmm11 ;\ + movaps -16 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 + +#define KERNEL2(xx) \ + mulps %xmm2, %xmm1 ;\ + addps %xmm1, %xmm12 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm2, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm2, %xmm5 ;\ + mulps -20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ + addps %xmm5, %xmm14 ;\ + movaps -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm2, %xmm15 ;\ + movaps -12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 + +#define KERNEL3(xx) \ + mulps %xmm4, %xmm7 ;\ + addps %xmm7, %xmm8 ;\ + movaps -16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulps %xmm4, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm4, %xmm5 ;\ + mulps -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ + addps %xmm5, %xmm10 ;\ + movaps -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm4, %xmm11 ;\ + movaps -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 + +#define KERNEL4(xx) \ + mulps %xmm6, %xmm7 ;\ + addps %xmm7, %xmm12 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulps %xmm6, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm6, %xmm5 ;\ + mulps -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ + addps %xmm5, %xmm14 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ + addps %xmm6, %xmm15 ;\ + movaps -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 + +#define KERNEL5(xx) \ + mulps %xmm0, %xmm1 ;\ + addps %xmm1, %xmm8 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm0, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm0, %xmm5 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ + addps %xmm5, %xmm10 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm0, %xmm11 ;\ + movaps 0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 + +#define KERNEL6(xx) \ + mulps %xmm2, %xmm1 ;\ + addps %xmm1, %xmm12 ;\ + movaps 32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm2, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm2, %xmm5 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ + addps %xmm5, %xmm14 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm2, %xmm15 ;\ + movaps 4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 + +#define KERNEL7(xx) \ + mulps %xmm4, %xmm7 ;\ + addps %xmm7, %xmm8 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulps %xmm4, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm4, %xmm5 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ + addps %xmm5, %xmm10 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm4, %xmm11 ;\ + movaps 8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 + +#define KERNEL8(xx) \ + mulps %xmm6, %xmm7 ;\ + addps %xmm7, %xmm12 ;\ + movaps 48 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulps %xmm6, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 36 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm6, %xmm5 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ + addps %xmm5, %xmm14 ;\ + movaps 40 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm6, %xmm15 ;\ + movaps 12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 + +#else +#define KERNEL1(xx) \ + mulps %xmm0, %xmm1 ;\ + addps %xmm1, %xmm8 ;\ + movaps -32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulps %xmm0, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps -28 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm0, %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + mulps -20 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ + addps %xmm5, %xmm10 ;\ + movaps -24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm0, %xmm11 ;\ + movaps -16 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 + +#define KERNEL2(xx) \ + mulps %xmm2, %xmm1 ;\ + addps %xmm1, %xmm12 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulps %xmm2, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm2, %xmm5 ;\ + mulps -20 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ + addps %xmm5, %xmm14 ;\ + movaps -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm2, %xmm15 ;\ + movaps -12 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 ;\ + +#define KERNEL3(xx) \ + mulps %xmm4, %xmm7 ;\ + addps %xmm7, %xmm8 ;\ + movaps -16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulps %xmm4, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm4, %xmm5 ;\ + mulps -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ + addps %xmm5, %xmm10 ;\ + movaps -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm4, %xmm11 ;\ + movaps -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 + +#define KERNEL4(xx) \ + mulps %xmm6, %xmm7 ;\ + addps %xmm7, %xmm12 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulps %xmm6, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm6, %xmm5 ;\ + mulps -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ + addps %xmm5, %xmm14 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm6, %xmm15 ;\ + movaps -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 + +#define KERNEL5(xx) \ + mulps %xmm0, %xmm1 ;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addps %xmm1, %xmm8 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulps %xmm0, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm0, %xmm5 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ + addps %xmm5, %xmm10 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm0, %xmm11 ;\ + movaps 0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 + +#define KERNEL6(xx) \ + mulps %xmm2, %xmm1 ;\ + addps %xmm1, %xmm12 ;\ + movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulps %xmm2, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm2, %xmm5 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ + addps %xmm5, %xmm14 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm2, %xmm15 ;\ + movaps 4 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 + +#define KERNEL7(xx) \ + mulps %xmm4, %xmm7 ;\ + addps %xmm7, %xmm8 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulps %xmm4, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm4, %xmm5 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ + addps %xmm5, %xmm10 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm4, %xmm11 ;\ + movaps 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 + +#define KERNEL8(xx) \ + mulps %xmm6, %xmm7 ;\ + addps %xmm7, %xmm12 ;\ + movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulps %xmm6, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm6, %xmm5 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ + addps %xmm5, %xmm14 ;\ + movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm6, %xmm15 ;\ + movaps 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 + +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + +#endif + + EMMS + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + shufps $0, %xmm0, %xmm0 + movaps %xmm0, ALPHA + +#ifdef TRMMKERNEL + movsd %xmm4, OFFSET + movsd %xmm4, KK +#ifndef LEFT + negq KK +#endif +#endif + + subq $-32 * SIZE, A + + leaq (, LDC, SIZE), LDC + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L50 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movd 0 * SIZE(B), %mm0 + + movq K, %rax + sarq $2, %rax + jle .L03 + + addq %rax, %rax + ALIGN_4 + +.L02: + PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) + + movd 1 * SIZE(B), %mm1 + movd 2 * SIZE(B), %mm2 + movd 3 * SIZE(B), %mm3 + movd 4 * SIZE(B), %mm4 + movd 5 * SIZE(B), %mm5 + movd 6 * SIZE(B), %mm6 + movd 7 * SIZE(B), %mm7 + + PREFETCHW (WPREFETCHSIZE + 0) * SIZE(BO) + + punpckldq %mm0, %mm0 + movq %mm0, 0 * SIZE(BO) + movq %mm0, 2 * SIZE(BO) + punpckldq %mm1, %mm1 + movd 8 * SIZE(B), %mm0 + movq %mm1, 4 * SIZE(BO) + movq %mm1, 6 * SIZE(BO) + punpckldq %mm2, %mm2 + movq %mm2, 8 * SIZE(BO) + movq %mm2, 10 * SIZE(BO) + punpckldq %mm3, %mm3 + movq %mm3, 12 * SIZE(BO) + movq %mm3, 14 * SIZE(BO) + + PREFETCHW (WPREFETCHSIZE + 16) * SIZE(BO) + + punpckldq %mm4, %mm4 + movq %mm4, 16 * SIZE(BO) + movq %mm4, 18 * SIZE(BO) + punpckldq %mm5, %mm5 + movq %mm5, 20 * SIZE(BO) + movq %mm5, 22 * SIZE(BO) + punpckldq %mm6, %mm6 + movq %mm6, 24 * SIZE(BO) + movq %mm6, 26 * SIZE(BO) + punpckldq %mm7, %mm7 + movq %mm7, 28 * SIZE(BO) + movq %mm7, 30 * SIZE(BO) + + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L02 + ALIGN_4 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movd 0 * SIZE(B), %mm0 + movd 1 * SIZE(B), %mm1 + movd 2 * SIZE(B), %mm2 + movd 3 * SIZE(B), %mm3 + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + + movq %mm0, 0 * SIZE(BO) + movq %mm0, 2 * SIZE(BO) + movq %mm1, 4 * SIZE(BO) + movq %mm1, 6 * SIZE(BO) + movq %mm2, 8 * SIZE(BO) + movq %mm2, 10 * SIZE(BO) + movq %mm3, 12 * SIZE(BO) + movq %mm3, 14 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + leaq (RPREFETCHSIZE + 0) * SIZE(B), BB + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + movaps -28 * SIZE(AO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + xorps %xmm9, %xmm9 + movaps -24 * SIZE(AO), %xmm4 + movaps -24 * SIZE(BO), %xmm5 + xorps %xmm10, %xmm10 + movaps -20 * SIZE(AO), %xmm6 + movaps -16 * SIZE(BO), %xmm7 + xorps %xmm11, %xmm11 + + PREFETCHW 7 * SIZE(CO1) + xorps %xmm12, %xmm12 + PREFETCHW 15 * SIZE(CO2) + xorps %xmm13, %xmm13 + PREFETCHW 7 * SIZE(CO1, LDC, 2) + xorps %xmm14, %xmm14 + PREFETCHW 15 * SIZE(CO2, LDC, 2) + xorps %xmm15, %xmm15 + PREFETCH -32 * SIZE(BB) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif +#ifndef GENERIC + andq $-8, %rax + + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + BRANCH + jl .L12 + ALIGN_3 + +.L15: + PREFETCH -16 * SIZE(BB) + subq $-16 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + testq $4, %rax + je .L16 + xorq %rax, %rax + ALIGN_3 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addq $64 * SIZE, BO + addq $32 * SIZE, AO + ALIGN_3 +#else + sarq $2, %rax + NOBRANCH + jle .L16 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addq $ 64 * SIZE, BO + subq $-32 * SIZE, AO + decq %rax + BRANCH + jg .L12 +#endif + +.L16: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L18 + + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + ALIGN_4 + +.L17: + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO, %rax, 8), %xmm0 + addps %xmm1, %xmm10 + movaps -32 * SIZE(BO, %rax, 8), %xmm1 + addps %xmm0, %xmm11 + movaps -24 * SIZE(AO, %rax, 4), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm12 + movaps -28 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm13 + movaps -24 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm2, %xmm1 + mulps -20 * SIZE(BO, %rax, 8), %xmm2 + addps %xmm1, %xmm14 + movaps -16 * SIZE(BO, %rax, 8), %xmm1 + addps %xmm2, %xmm15 + movaps -20 * SIZE(AO, %rax, 4), %xmm2 + + addq $SIZE * 2, %rax + jl .L17 + ALIGN_4 + +.L18: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 + movsd 4 * SIZE(CO2), %xmm3 + movhps 6 * SIZE(CO2), %xmm3 +#endif + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm11 + + mulps %xmm7, %xmm12 + mulps %xmm7, %xmm13 + mulps %xmm7, %xmm14 + mulps %xmm7, %xmm15 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1, LDC, 2), %xmm4 + movhps 2 * SIZE(CO1, LDC, 2), %xmm4 + movsd 4 * SIZE(CO1, LDC, 2), %xmm5 + movhps 6 * SIZE(CO1, LDC, 2), %xmm5 + + movsd 0 * SIZE(CO2, LDC, 2), %xmm6 + movhps 2 * SIZE(CO2, LDC, 2), %xmm6 + movsd 4 * SIZE(CO2, LDC, 2), %xmm7 + movhps 6 * SIZE(CO2, LDC, 2), %xmm7 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm12 + addps %xmm2, %xmm9 + addps %xmm3, %xmm13 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + + movlps %xmm9, 0 * SIZE(CO2) + movhps %xmm9, 2 * SIZE(CO2) + movlps %xmm13, 4 * SIZE(CO2) + movhps %xmm13, 6 * SIZE(CO2) + +#ifndef TRMMKERNEL + addps %xmm4, %xmm10 + addps %xmm5, %xmm14 + addps %xmm6, %xmm11 + addps %xmm7, %xmm15 +#endif + + movlps %xmm10, 0 * SIZE(CO1, LDC, 2) + movhps %xmm10, 2 * SIZE(CO1, LDC, 2) + movlps %xmm14, 4 * SIZE(CO1, LDC, 2) + movhps %xmm14, 6 * SIZE(CO1, LDC, 2) + + movlps %xmm11, 0 * SIZE(CO2, LDC, 2) + movhps %xmm11, 2 * SIZE(CO2, LDC, 2) + movlps %xmm15, 4 * SIZE(CO2, LDC, 2) + movhps %xmm15, 6 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $4, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps -28 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps -24 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + mulps 44 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps -20 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + mulps 60 * SIZE(BO), %xmm8 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm8, %xmm3 + movaps 0 * SIZE(AO), %xmm8 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 76 * SIZE(BO), %xmm10 + addps %xmm9, %xmm2 + movaps 128 * SIZE(BO), %xmm9 + addps %xmm10, %xmm3 + movaps -12 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + mulps 92 * SIZE(BO), %xmm10 + addps %xmm11, %xmm2 + movaps 144 * SIZE(BO), %xmm11 + addps %xmm10, %xmm3 + movaps -8 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + mulps 108 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 160 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps -4 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + mulps 124 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 176 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 16 * SIZE(AO), %xmm10 + + addq $ 32 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps -28 * SIZE(AO), %xmm8 + + addq $ 4 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L28: + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm2 + mulps %xmm15, %xmm3 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 0 * SIZE(CO2), %xmm10 + movhps 2 * SIZE(CO2), %xmm10 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm12 + movhps 2 * SIZE(CO1, LDC, 2), %xmm12 + movsd 0 * SIZE(CO2, LDC, 2), %xmm14 + movhps 2 * SIZE(CO2, LDC, 2), %xmm14 + + addps %xmm8, %xmm0 + addps %xmm10, %xmm1 + addps %xmm12, %xmm2 + addps %xmm14, %xmm3 +#endif + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 2 * SIZE(CO2) + + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 2 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO2, LDC, 2) + movhps %xmm3, 2 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $2, M + je .L40 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -24 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd -28 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + movsd -26 * SIZE(AO), %xmm8 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + movsd -16 * SIZE(AO), %xmm8 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm2 + movaps 76 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movsd -22 * SIZE(AO), %xmm10 + addps %xmm9, %xmm3 + movaps 128 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movaps 92 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movsd -20 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movaps 144 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 108 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd -18 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 160 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 124 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd -8 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 176 * SIZE(BO), %xmm15 + + addq $ 16 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 16 * SIZE(BO), %xmm9 + + addq $ 2 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm2 + mulps %xmm15, %xmm3 + +#ifndef TRMMKERNEL +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd 0 * SIZE(CO1), %xmm8 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 0 * SIZE(CO2), %xmm10 +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd 0 * SIZE(CO1, LDC, 2), %xmm12 +#ifdef movsd + xorps %xmm14, %xmm14 +#endif + movsd 0 * SIZE(CO2, LDC, 2), %xmm14 + + addps %xmm8, %xmm0 + addps %xmm10, %xmm1 + addps %xmm12, %xmm2 + addps %xmm14, %xmm3 +#endif + + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO2) + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L40: + testq $1, M + je .L49 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO + leaq (BO, %rax, 8), BO +#endif + + movss -32 * SIZE(AO), %xmm8 + movss -28 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + movss 32 * SIZE(BO), %xmm13 + movss 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L45 + ALIGN_4 + +.L42: + mulss %xmm8, %xmm9 + addss %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 4 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + addss %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + addss %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addss %xmm9, %xmm3 + movss 64 * SIZE(BO), %xmm9 + + mulss %xmm8, %xmm11 + addss %xmm11, %xmm0 + movss 20 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + addss %xmm11, %xmm1 + movss 24 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + addss %xmm11, %xmm2 + movss 28 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + movss -30 * SIZE(AO), %xmm8 + addss %xmm11, %xmm3 + movss 80 * SIZE(BO), %xmm11 + + mulss %xmm8, %xmm13 + addss %xmm13, %xmm0 + movss 36 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + addss %xmm13, %xmm1 + movss 40 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + addss %xmm13, %xmm2 + movss 44 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + movss -29 * SIZE(AO), %xmm8 + addss %xmm13, %xmm3 + movss 96 * SIZE(BO), %xmm13 + + mulss %xmm8, %xmm15 + addss %xmm15, %xmm0 + movss 52 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + addss %xmm15, %xmm1 + movss 56 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + addss %xmm15, %xmm2 + movss 60 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + movss -24 * SIZE(AO), %xmm8 + addss %xmm15, %xmm3 + movss 112 * SIZE(BO), %xmm15 + + mulss %xmm10, %xmm9 + addss %xmm9, %xmm0 + movss 68 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + addss %xmm9, %xmm1 + movss 72 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + addss %xmm9, %xmm2 + movss 76 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + movss -27 * SIZE(AO), %xmm10 + addss %xmm9, %xmm3 + movss 128 * SIZE(BO), %xmm9 + + mulss %xmm10, %xmm11 + addss %xmm11, %xmm0 + movss 84 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + addss %xmm11, %xmm1 + movss 88 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + addss %xmm11, %xmm2 + movss 92 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + movss -26 * SIZE(AO), %xmm10 + addss %xmm11, %xmm3 + movss 144 * SIZE(BO), %xmm11 + + mulss %xmm10, %xmm13 + addss %xmm13, %xmm0 + movss 100 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + addss %xmm13, %xmm1 + movss 104 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + addss %xmm13, %xmm2 + movss 108 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + movss -25 * SIZE(AO), %xmm10 + addss %xmm13, %xmm3 + movss 160 * SIZE(BO), %xmm13 + + mulss %xmm10, %xmm15 + addss %xmm15, %xmm0 + movss 116 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + addss %xmm15, %xmm1 + movss 120 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + addss %xmm15, %xmm2 + movss 124 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + movss -20 * SIZE(AO), %xmm10 + addss %xmm15, %xmm3 + movss 176 * SIZE(BO), %xmm15 + + addq $ 8 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_4 + +.L46: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movss 16 * SIZE(BO), %xmm9 + + addq $ 1 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L46 + ALIGN_4 + +.L48: + mulss %xmm15, %xmm0 + mulss %xmm15, %xmm1 + mulss %xmm15, %xmm2 + mulss %xmm15, %xmm3 + +#ifndef TRMMKERNEL + movss 0 * SIZE(CO1), %xmm8 + movss 0 * SIZE(CO2), %xmm10 + movss 0 * SIZE(CO1, LDC, 2), %xmm12 + movss 0 * SIZE(CO2, LDC, 2), %xmm14 + + addss %xmm8, %xmm0 + addss %xmm10, %xmm1 + addss %xmm12, %xmm2 + addss %xmm14, %xmm3 +#endif + + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO2) + movss %xmm2, 0 * SIZE(CO1, LDC, 2) + movss %xmm3, 0 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + leaq (C, LDC, 4), C # c += 4 * ldc + decq J # j -- + jg .L01 + +.L50: + testq $2, N + je .L100 + +.L51: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L53 + ALIGN_4 + +.L52: +#if defined(PENTIUM4) || defined(GENERIC) + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + movss 4 * SIZE(B), %xmm4 + movss 5 * SIZE(B), %xmm5 + movss 6 * SIZE(B), %xmm6 + movss 7 * SIZE(B), %xmm7 + + PREFETCH 32 * SIZE(B) + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH 32 * SIZE(B) + + movd 0 * SIZE(B), %mm0 + movd 1 * SIZE(B), %mm1 + movd 2 * SIZE(B), %mm2 + movd 3 * SIZE(B), %mm3 + movd 4 * SIZE(B), %mm4 + movd 5 * SIZE(B), %mm5 + movd 6 * SIZE(B), %mm6 + movd 7 * SIZE(B), %mm7 + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + punpckldq %mm4, %mm4 + punpckldq %mm5, %mm5 + punpckldq %mm6, %mm6 + punpckldq %mm7, %mm7 + + movq %mm0, 0 * SIZE(BO) + movq %mm0, 2 * SIZE(BO) + movq %mm1, 4 * SIZE(BO) + movq %mm1, 6 * SIZE(BO) + movq %mm2, 8 * SIZE(BO) + movq %mm2, 10 * SIZE(BO) + movq %mm3, 12 * SIZE(BO) + movq %mm3, 14 * SIZE(BO) + movq %mm4, 16 * SIZE(BO) + movq %mm4, 18 * SIZE(BO) + movq %mm5, 20 * SIZE(BO) + movq %mm5, 22 * SIZE(BO) + movq %mm6, 24 * SIZE(BO) + movq %mm6, 26 * SIZE(BO) + movq %mm7, 28 * SIZE(BO) + movq %mm7, 30 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO +#endif + + decq %rax + jne .L52 + ALIGN_4 + +.L53: + movq K, %rax + andq $3, %rax + BRANCH + jle .L60 + ALIGN_4 + +.L54: +#if defined(PENTIUM4) || defined(GENERIC) + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + movd 0 * SIZE(B), %mm0 + movd 1 * SIZE(B), %mm1 + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + + movq %mm0, 0 * SIZE(BO) + movq %mm0, 2 * SIZE(BO) + movq %mm1, 4 * SIZE(BO) + movq %mm1, 6 * SIZE(BO) +#endif + + addq $ 2 * SIZE, B + addq $ 8 * SIZE, BO + decq %rax + jne .L54 + ALIGN_4 + +.L60: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L70 + ALIGN_4 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + movaps 0 * SIZE(AO), %xmm12 + movaps 16 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + PREFETCHW 7 * SIZE(CO1) + xorps %xmm4, %xmm4 + PREFETCHW 7 * SIZE(CO2) + xorps %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps -24 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -20 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 16 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps -12 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps -8 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps -4 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 48 * SIZE(AO), %xmm10 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 32 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 4 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 8 * SIZE(AO), %xmm12 + + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 12 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 64 * SIZE(AO), %xmm12 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 48 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 20 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 24 * SIZE(AO), %xmm14 + + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 28 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 80 * SIZE(AO), %xmm14 + + addq $64 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps -24 * SIZE(AO), %xmm8 + + addq $8 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L68: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + + movsd 0 * SIZE(CO2), %xmm10 + movhps 2 * SIZE(CO2), %xmm10 + movsd 4 * SIZE(CO2), %xmm11 + movhps 6 * SIZE(CO2), %xmm11 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm5 + +#ifndef TRMMKERNEL + addps %xmm8, %xmm0 + addps %xmm9, %xmm4 + addps %xmm10, %xmm1 + addps %xmm11, %xmm5 +#endif + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm4, 4 * SIZE(CO1) + movhps %xmm4, 6 * SIZE(CO1) + + movlps %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 2 * SIZE(CO2) + movlps %xmm5, 4 * SIZE(CO2) + movhps %xmm5, 6 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L61 + ALIGN_4 + +.L70: + testq $4, M + je .L80 + + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps -24 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 20 * SIZE(BO), %xmm8 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm8, %xmm1 + movaps -20 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 0 * SIZE(AO), %xmm8 + + mulps %xmm10, %xmm13 + mulps 36 * SIZE(BO), %xmm10 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm10, %xmm1 + movaps -12 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + mulps 44 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps -8 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 52 * SIZE(BO), %xmm10 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm10, %xmm1 + movaps -4 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 60 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 16 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 0 * SIZE(CO2), %xmm10 + movhps 2 * SIZE(CO2), %xmm10 +#endif + + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + +#ifndef TRMMKERNEL + addps %xmm8, %xmm0 + addps %xmm10, %xmm1 +#endif + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L80: + testq $2, M + je .L90 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -24 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L85 + ALIGN_4 + +.L82: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -28 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd -26 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd -16 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd -22 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd -20 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd -18 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd -8 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L82 + ALIGN_4 + +.L85: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_4 + +.L86: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L86 + ALIGN_4 + +.L88: +#ifndef TRMMKERNEL +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd 0 * SIZE(CO1), %xmm8 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 0 * SIZE(CO2), %xmm10 +#endif + + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + +#ifndef TRMMKERNEL + addps %xmm8, %xmm0 + addps %xmm10, %xmm1 +#endif + + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L90: + testq $1, M + je .L99 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movss -32 * SIZE(AO), %xmm8 + movss -28 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + movss 32 * SIZE(BO), %xmm13 + movss 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movss 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movss 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movss -29 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movss 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movss 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movss -24 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movss 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movss 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movss -27 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movss 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movss 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movss -26 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movss 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movss 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movss -25 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movss 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movss 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movss -20 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movss 112 * SIZE(BO), %xmm15 + + addq $ 8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L98: +#ifndef TRMMKERNEL + movss 0 * SIZE(CO1), %xmm8 + movss 0 * SIZE(CO2), %xmm10 +#endif + + addss %xmm2, %xmm0 + addss %xmm3, %xmm1 + mulss %xmm15, %xmm0 + mulss %xmm15, %xmm1 + +#ifndef TRMMKERNEL + addss %xmm8, %xmm0 + addss %xmm10, %xmm1 +#endif + + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leaq (C, LDC, 2), C # c += 4 * ldc + ALIGN_4 + + +.L100: + testq $1, N + je .L999 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $3, %rax + jle .L103 + ALIGN_4 + + +.L102: +#if defined(PENTIUM4) || defined(GENERIC) + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + movss 4 * SIZE(B), %xmm4 + movss 5 * SIZE(B), %xmm5 + movss 6 * SIZE(B), %xmm6 + movss 7 * SIZE(B), %xmm7 + + PREFETCH 32 * SIZE(B) + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH 32 * SIZE(B) + + movd 0 * SIZE(B), %mm0 + movd 1 * SIZE(B), %mm1 + movd 2 * SIZE(B), %mm2 + movd 3 * SIZE(B), %mm3 + movd 4 * SIZE(B), %mm4 + movd 5 * SIZE(B), %mm5 + movd 6 * SIZE(B), %mm6 + movd 7 * SIZE(B), %mm7 + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + punpckldq %mm4, %mm4 + punpckldq %mm5, %mm5 + punpckldq %mm6, %mm6 + punpckldq %mm7, %mm7 + + movq %mm0, 0 * SIZE(BO) + movq %mm0, 2 * SIZE(BO) + movq %mm1, 4 * SIZE(BO) + movq %mm1, 6 * SIZE(BO) + movq %mm2, 8 * SIZE(BO) + movq %mm2, 10 * SIZE(BO) + movq %mm3, 12 * SIZE(BO) + movq %mm3, 14 * SIZE(BO) + movq %mm4, 16 * SIZE(BO) + movq %mm4, 18 * SIZE(BO) + movq %mm5, 20 * SIZE(BO) + movq %mm5, 22 * SIZE(BO) + movq %mm6, 24 * SIZE(BO) + movq %mm6, 26 * SIZE(BO) + movq %mm7, 28 * SIZE(BO) + movq %mm7, 30 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO +#endif + + decq %rax + jne .L102 + ALIGN_4 + +.L103: + movq K, %rax + andq $7, %rax + BRANCH + jle .L110 + ALIGN_4 + +.L104: +#if defined(PENTIUM4) || defined(GENERIC) + movss 0 * SIZE(B), %xmm0 + shufps $0, %xmm0, %xmm0 + movaps %xmm0, 0 * SIZE(BO) +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + movd 0 * SIZE(B), %mm0 + punpckldq %mm0, %mm0 + movq %mm0, 0 * SIZE(BO) + movq %mm0, 2 * SIZE(BO) +#endif + + addq $ 1 * SIZE, B + addq $ 4 * SIZE, BO + decq %rax + jne .L104 + ALIGN_4 + +.L110: + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L120 + ALIGN_4 + +.L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + movaps 0 * SIZE(AO), %xmm12 + movaps 16 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + PREFETCHW 7 * SIZE(CO1) + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulps %xmm9, %xmm8 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps -28 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps -24 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + + mulps %xmm9, %xmm8 + mulps -20 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps 32 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm9, %xmm10 + mulps -12 * SIZE(AO), %xmm9 + addps %xmm10, %xmm0 + movaps -8 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movaps 12 * SIZE(BO), %xmm9 + + mulps %xmm9, %xmm10 + mulps -4 * SIZE(AO), %xmm9 + addps %xmm10, %xmm0 + movaps 48 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movaps 32 * SIZE(BO), %xmm9 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm11, %xmm12 + mulps 4 * SIZE(AO), %xmm11 + addps %xmm12, %xmm0 + movaps 8 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movaps 20 * SIZE(BO), %xmm11 + + mulps %xmm11, %xmm12 + mulps 12 * SIZE(AO), %xmm11 + addps %xmm12, %xmm0 + movaps 64 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm11, %xmm14 + mulps 20 * SIZE(AO), %xmm11 + addps %xmm14, %xmm0 + movaps 24 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movaps 28 * SIZE(BO), %xmm11 + + mulps %xmm11, %xmm14 + mulps 28 * SIZE(AO), %xmm11 + addps %xmm14, %xmm0 + movaps 80 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movaps 48 * SIZE(BO), %xmm11 + + addq $64 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulps %xmm9, %xmm8 + mulps -28 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps -24 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + + addq $8 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 +#ifndef TRMMKERNEL + addps %xmm8, %xmm0 + addps %xmm9, %xmm4 +#endif + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm4, 4 * SIZE(CO1) + movhps %xmm4, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L111 + ALIGN_4 + +.L120: + testq $4, M + je .L130 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L125 + ALIGN_4 + +.L122: + mulps %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps -28 * SIZE(AO), %xmm8 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 32 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -24 * SIZE(AO), %xmm8 + mulps 8 * SIZE(BO), %xmm8 + addps %xmm8, %xmm2 + movaps -20 * SIZE(AO), %xmm8 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm8, %xmm3 + movaps 0 * SIZE(AO), %xmm8 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + movaps -12 * SIZE(AO), %xmm10 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 48 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps -8 * SIZE(AO), %xmm10 + mulps 24 * SIZE(BO), %xmm10 + addps %xmm10, %xmm2 + movaps -4 * SIZE(AO), %xmm10 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm10, %xmm3 + movaps 16 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L122 + ALIGN_4 + +.L125: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L128 + ALIGN_4 + +.L126: + mulps %xmm8, %xmm9 + movaps -28 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L126 + ALIGN_4 + +.L128: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 +#endif + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + mulps %xmm15, %xmm0 +#ifndef TRMMKERNEL + addps %xmm8, %xmm0 +#endif + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L130: + testq $2, M + je .L140 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -24 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L135 + ALIGN_4 + +.L132: + mulps %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -28 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + movsd -26 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 12 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + movsd -16 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 32 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + movsd -22 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd -20 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd -18 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movaps 28 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd -8 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movaps 48 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L132 + ALIGN_4 + +.L135: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L138 + ALIGN_4 + +.L136: + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L136 + ALIGN_4 + +.L138: + addps %xmm1, %xmm0 + mulps %xmm15, %xmm0 + +#ifndef TRMMKERNEL +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd 0 * SIZE(CO1), %xmm8 + addps %xmm8, %xmm0 +#endif + + movlps %xmm0, 0 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L140: + testq $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movss -32 * SIZE(AO), %xmm8 + movss -28 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L145 + ALIGN_4 + +.L142: + mulss %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss -31 * SIZE(AO), %xmm8 + mulss 4 * SIZE(BO), %xmm8 + addss %xmm9, %xmm0 + movss 32 * SIZE(BO), %xmm9 + addss %xmm8, %xmm1 + movss -30 * SIZE(AO), %xmm8 + mulss 8 * SIZE(BO), %xmm8 + addss %xmm8, %xmm2 + movss -29 * SIZE(AO), %xmm8 + mulss 12 * SIZE(BO), %xmm8 + addss %xmm8, %xmm3 + movss -24 * SIZE(AO), %xmm8 + mulss %xmm10, %xmm11 + movss -27 * SIZE(AO), %xmm10 + mulss 20 * SIZE(BO), %xmm10 + addss %xmm11, %xmm0 + movss 48 * SIZE(BO), %xmm11 + addss %xmm10, %xmm1 + movss -26 * SIZE(AO), %xmm10 + mulss 24 * SIZE(BO), %xmm10 + addss %xmm10, %xmm2 + movss -25 * SIZE(AO), %xmm10 + mulss 28 * SIZE(BO), %xmm10 + addss %xmm10, %xmm3 + movss -20 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L142 + ALIGN_4 + +.L145: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movss ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L148 + ALIGN_4 + +.L146: + mulss %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addss %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L146 + ALIGN_4 + +.L148: + addss %xmm1, %xmm0 + addss %xmm3, %xmm2 + addss %xmm2, %xmm0 + + mulss %xmm15, %xmm0 + +#ifndef TRMMKERNEL + movss 0 * SIZE(CO1), %xmm8 + addss %xmm8, %xmm0 +#endif + movss %xmm0, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %rbx, %rsp + + EMMS + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_8x4_sse3.S b/kernel/x86_64/gemm_kernel_8x4_sse3.S new file mode 100644 index 0000000..c7954fe --- /dev/null +++ b/kernel/x86_64/gemm_kernel_8x4_sse3.S @@ -0,0 +1,3022 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %r12 +#define BO %r13 +#define CO1 %r14 +#define CO2 %r15 +#define BB %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 320 + +#define KERNEL1(address) \ + mulps %xmm8, %xmm9; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AO); \ + addps %xmm9, %xmm0; \ + movshdup 0 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm1; \ + movsldup 4 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm2; \ + movshdup 4 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + movaps 4 * SIZE + (address) * SIZE(AO), %xmm8; \ + addps %xmm9, %xmm3; \ + movsldup 0 * SIZE + (address) * SIZE(BO), %xmm9 + +#define KERNEL2(address) \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm4; \ + movshdup 0 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm5; \ + movsldup 4 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm6; \ + movshdup 4 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + movaps 8 * SIZE + (address) * SIZE(AO), %xmm8; \ + addps %xmm9, %xmm7; \ + movsldup 8 * SIZE + (address) * SIZE(BO), %xmm9 + +#define KERNEL3(address) \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm0; \ + movshdup 8 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm1; \ + movsldup 12 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm2; \ + movshdup 12 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + movaps 12 * SIZE + (address) * SIZE(AO), %xmm8; \ + addps %xmm9, %xmm3; \ + movsldup 8 * SIZE + (address) * SIZE(BO), %xmm9 + +#define KERNEL4(address) \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm4; \ + movshdup 8 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm5; \ + movsldup 12 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm6; \ + movshdup 12 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + movaps 64 * SIZE + (address) * SIZE(AO), %xmm8; \ + addps %xmm9, %xmm7; \ + movsldup 64 * SIZE + (address) * SIZE(BO), %xmm9 + +#define KERNEL5(address) \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm0; \ + movshdup 16 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm1; \ + movsldup 20 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm2; \ + movshdup 20 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + movaps 20 * SIZE + (address) * SIZE(AO), %xmm10; \ + addps %xmm11, %xmm3; \ + movsldup 16 * SIZE + (address) * SIZE(BO), %xmm11 + +#define KERNEL6(address) \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm4; \ + movshdup 16 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm5; \ + movsldup 20 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm6; \ + movshdup 20 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + movaps 24 * SIZE + (address) * SIZE(AO), %xmm10; \ + addps %xmm11, %xmm7; \ + movsldup 24 * SIZE + (address) * SIZE(BO), %xmm11 + +#define KERNEL7(address) \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm0; \ + movshdup 24 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm1; \ + movsldup 28 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm2; \ + movshdup 28 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + movaps 28 * SIZE + (address) * SIZE(AO), %xmm10; \ + addps %xmm11, %xmm3; \ + movsldup 24 * SIZE + (address) * SIZE(BO), %xmm11 + +#define KERNEL8(address) \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm4; \ + movshdup 24 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm5; \ + movsldup 28 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm6; \ + movshdup 28 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + movaps 80 * SIZE + (address) * SIZE(AO), %xmm10; \ + addps %xmm11, %xmm7; \ + movsldup 80 * SIZE + (address) * SIZE(BO), %xmm11 + +#define KERNEL9(address) \ + mulps %xmm12, %xmm13; \ + PREFETCH (PREFETCHSIZE + 32) * SIZE + (address) * SIZE(AO); \ + addps %xmm13, %xmm0; \ + movshdup 32 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm1; \ + movsldup 36 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm2; \ + movshdup 36 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + movaps 36 * SIZE + (address) * SIZE(AO), %xmm12; \ + addps %xmm13, %xmm3; \ + movsldup 32 * SIZE + (address) * SIZE(BO), %xmm13 + +#define KERNEL10(address) \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm4; \ + movshdup 32 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm5; \ + movsldup 36 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm6; \ + movshdup 36 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + movaps 40 * SIZE + (address) * SIZE(AO), %xmm12; \ + addps %xmm13, %xmm7; \ + movsldup 40 * SIZE + (address) * SIZE(BO), %xmm13 + +#define KERNEL11(address) \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm0; \ + movshdup 40 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm1; \ + movsldup 44 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm2; \ + movshdup 44 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + movaps 44 * SIZE + (address) * SIZE(AO), %xmm12; \ + addps %xmm13, %xmm3; \ + movsldup 40 * SIZE + (address) * SIZE(BO), %xmm13 + +#define KERNEL12(address) \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm4; \ + movshdup 40 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm5; \ + movsldup 44 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm6; \ + movshdup 44 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + movaps 96 * SIZE + (address) * SIZE(AO), %xmm12; \ + addps %xmm13, %xmm7; \ + movsldup 96 * SIZE + (address) * SIZE(BO), %xmm13 + +#define KERNEL13(address) \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm0; \ + movshdup 48 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm1; \ + movsldup 52 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm2; \ + movshdup 52 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + movaps 52 * SIZE + (address) * SIZE(AO), %xmm14; \ + addps %xmm15, %xmm3; \ + movsldup 48 * SIZE + (address) * SIZE(BO), %xmm15 + +#define KERNEL14(address) \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm4; \ + movshdup 48 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm5; \ + movsldup 52 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm6; \ + movshdup 52 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + movaps 56 * SIZE + (address) * SIZE(AO), %xmm14; \ + addps %xmm15, %xmm7; \ + movsldup 56 * SIZE + (address) * SIZE(BO), %xmm15 + +#define KERNEL15(address) \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm0; \ + movshdup 56 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm1; \ + movsldup 60 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm2; \ + movshdup 60 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + movaps 60 * SIZE + (address) * SIZE(AO), %xmm14; \ + addps %xmm15, %xmm3; \ + movsldup 56 * SIZE + (address) * SIZE(BO), %xmm15 + +#define KERNEL16(address) \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm4; \ + movshdup 56 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm5; \ + movsldup 60 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm6; \ + movshdup 60 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + movaps 112 * SIZE + (address) * SIZE(AO), %xmm14; \ + addps %xmm15, %xmm7; \ + movsldup 112 * SIZE + (address) * SIZE(BO), %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + shufps $0, %xmm0, %xmm0 + movaps %xmm0, ALPHA + +#ifdef TRMMKERNEL + movsd %xmm4, OFFSET + movsd %xmm4, KK +#ifndef LEFT + negq KK +#endif +#endif + + leaq (, LDC, SIZE), LDC + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L50 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + movddup 0 * SIZE(B), %xmm0 + movddup 2 * SIZE(B), %xmm1 + movddup 4 * SIZE(B), %xmm2 + movddup 6 * SIZE(B), %xmm3 + movddup 8 * SIZE(B), %xmm4 + movddup 10 * SIZE(B), %xmm5 + movddup 12 * SIZE(B), %xmm6 + movddup 14 * SIZE(B), %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + prefetcht1 128 * SIZE(BO) + prefetcht0 112 * SIZE(B) + + addq $16 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L02 + ALIGN_4 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movddup 0 * SIZE(B), %xmm0 + movddup 2 * SIZE(B), %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + addq $4 * SIZE, B + addq $8 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + leaq 112 * SIZE(B), BB + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L20 + ALIGN_4 + +.L11: + prefetcht0 0 * SIZE(BB) + prefetcht0 8 * SIZE(BB) + subq $-16 * SIZE, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movsldup 0 * SIZE(BO), %xmm9 + movsldup 16 * SIZE(BO), %xmm11 + movsldup 32 * SIZE(BO), %xmm13 + movsldup 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + + prefetchnta 8 * SIZE(CO1) + pxor %xmm4, %xmm4 + prefetchnta 8 * SIZE(CO2) + pxor %xmm5, %xmm5 + prefetchnta 8 * SIZE(CO1, LDC, 2) + pxor %xmm6, %xmm6 + prefetchnta 8 * SIZE(CO2, LDC, 2) + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + +#if 1 + andq $-8, %rax + salq $4, %rax + je .L15 + +.L1X: + KERNEL1 (64 * 0) + KERNEL2 (64 * 0) + KERNEL3 (64 * 0) + KERNEL4 (64 * 0) + KERNEL5 (64 * 0) + KERNEL6 (64 * 0) + KERNEL7 (64 * 0) + KERNEL8 (64 * 0) + KERNEL9 (64 * 0) + KERNEL10(64 * 0) + KERNEL11(64 * 0) + KERNEL12(64 * 0) + KERNEL13(64 * 0) + KERNEL14(64 * 0) + KERNEL15(64 * 0) + KERNEL16(64 * 0) + cmpq $128 * 1, %rax + NOBRANCH + jle .L12 + KERNEL1 (64 * 1) + KERNEL2 (64 * 1) + KERNEL3 (64 * 1) + KERNEL4 (64 * 1) + KERNEL5 (64 * 1) + KERNEL6 (64 * 1) + KERNEL7 (64 * 1) + KERNEL8 (64 * 1) + KERNEL9 (64 * 1) + KERNEL10(64 * 1) + KERNEL11(64 * 1) + KERNEL12(64 * 1) + KERNEL13(64 * 1) + KERNEL14(64 * 1) + KERNEL15(64 * 1) + KERNEL16(64 * 1) + cmpq $128 * 2, %rax + NOBRANCH + jle .L12 + KERNEL1 (64 * 2) + KERNEL2 (64 * 2) + KERNEL3 (64 * 2) + KERNEL4 (64 * 2) + KERNEL5 (64 * 2) + KERNEL6 (64 * 2) + KERNEL7 (64 * 2) + KERNEL8 (64 * 2) + KERNEL9 (64 * 2) + KERNEL10(64 * 2) + KERNEL11(64 * 2) + KERNEL12(64 * 2) + KERNEL13(64 * 2) + KERNEL14(64 * 2) + KERNEL15(64 * 2) + KERNEL16(64 * 2) + cmpq $128 * 3, %rax + NOBRANCH + jle .L12 + KERNEL1 (64 * 3) + KERNEL2 (64 * 3) + KERNEL3 (64 * 3) + KERNEL4 (64 * 3) + KERNEL5 (64 * 3) + KERNEL6 (64 * 3) + KERNEL7 (64 * 3) + KERNEL8 (64 * 3) + KERNEL9 (64 * 3) + KERNEL10(64 * 3) + KERNEL11(64 * 3) + KERNEL12(64 * 3) + KERNEL13(64 * 3) + KERNEL14(64 * 3) + KERNEL15(64 * 3) + KERNEL16(64 * 3) + cmpq $128 * 4, %rax + NOBRANCH + jle .L12 + KERNEL1 (64 * 4) + KERNEL2 (64 * 4) + KERNEL3 (64 * 4) + KERNEL4 (64 * 4) + KERNEL5 (64 * 4) + KERNEL6 (64 * 4) + KERNEL7 (64 * 4) + KERNEL8 (64 * 4) + KERNEL9 (64 * 4) + KERNEL10(64 * 4) + KERNEL11(64 * 4) + KERNEL12(64 * 4) + KERNEL13(64 * 4) + KERNEL14(64 * 4) + KERNEL15(64 * 4) + KERNEL16(64 * 4) + cmpq $128 * 5, %rax + NOBRANCH + jle .L12 + KERNEL1 (64 * 5) + KERNEL2 (64 * 5) + KERNEL3 (64 * 5) + KERNEL4 (64 * 5) + KERNEL5 (64 * 5) + KERNEL6 (64 * 5) + KERNEL7 (64 * 5) + KERNEL8 (64 * 5) + KERNEL9 (64 * 5) + KERNEL10(64 * 5) + KERNEL11(64 * 5) + KERNEL12(64 * 5) + KERNEL13(64 * 5) + KERNEL14(64 * 5) + KERNEL15(64 * 5) + KERNEL16(64 * 5) + cmpq $128 * 6, %rax + NOBRANCH + jle .L12 + KERNEL1 (64 * 6) + KERNEL2 (64 * 6) + KERNEL3 (64 * 6) + KERNEL4 (64 * 6) + KERNEL5 (64 * 6) + KERNEL6 (64 * 6) + KERNEL7 (64 * 6) + KERNEL8 (64 * 6) + KERNEL9 (64 * 6) + KERNEL10(64 * 6) + KERNEL11(64 * 6) + KERNEL12(64 * 6) + KERNEL13(64 * 6) + KERNEL14(64 * 6) + KERNEL15(64 * 6) + KERNEL16(64 * 6) + cmpq $128 * 7, %rax + NOBRANCH + jle .L12 + KERNEL1 (64 * 7) + KERNEL2 (64 * 7) + KERNEL3 (64 * 7) + KERNEL4 (64 * 7) + KERNEL5 (64 * 7) + KERNEL6 (64 * 7) + KERNEL7 (64 * 7) + KERNEL8 (64 * 7) + KERNEL9 (64 * 7) + KERNEL10(64 * 7) + KERNEL11(64 * 7) + KERNEL12(64 * 7) + KERNEL13(64 * 7) + KERNEL14(64 * 7) + KERNEL15(64 * 7) + KERNEL16(64 * 7) + + addq $64 * 8 * SIZE, AO + addq $64 * 8 * SIZE, BO + subq $128 * 8, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#else + sarq $3, %rax + je .L15 + ALIGN_4 + +.L12: + KERNEL1 (64 * 0) + KERNEL2 (64 * 0) + KERNEL3 (64 * 0) + KERNEL4 (64 * 0) + KERNEL5 (64 * 0) + KERNEL6 (64 * 0) + KERNEL7 (64 * 0) + KERNEL8 (64 * 0) + KERNEL9 (64 * 0) + KERNEL10(64 * 0) + KERNEL11(64 * 0) + KERNEL12(64 * 0) + KERNEL13(64 * 0) + KERNEL14(64 * 0) + KERNEL15(64 * 0) + KERNEL16(64 * 0) + + addq $64 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L12 +#endif + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsldup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm4 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm5 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm6 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm7 + movsldup 8 * SIZE(BO), %xmm9 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jg .L16 + ALIGN_4 + +.L18: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + mulps %xmm15, %xmm0 + movhps 2 * SIZE(CO1), %xmm8 + mulps %xmm15, %xmm1 + movsd 4 * SIZE(CO1), %xmm9 + mulps %xmm15, %xmm2 + movhps 6 * SIZE(CO1), %xmm9 + mulps %xmm15, %xmm3 + movsd 0 * SIZE(CO2), %xmm10 + mulps %xmm15, %xmm4 + movhps 2 * SIZE(CO2), %xmm10 + mulps %xmm15, %xmm5 + movsd 4 * SIZE(CO2), %xmm11 + mulps %xmm15, %xmm6 + movhps 6 * SIZE(CO2), %xmm11 + mulps %xmm15, %xmm7 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm12 + movhps 2 * SIZE(CO1, LDC, 2), %xmm12 + movsd 4 * SIZE(CO1, LDC, 2), %xmm13 + movhps 6 * SIZE(CO1, LDC, 2), %xmm13 + movsd 0 * SIZE(CO2, LDC, 2), %xmm14 + movhps 2 * SIZE(CO2, LDC, 2), %xmm14 + movsd 4 * SIZE(CO2, LDC, 2), %xmm15 + movhps 6 * SIZE(CO2, LDC, 2), %xmm15 + + addps %xmm8, %xmm0 + addps %xmm9, %xmm4 + addps %xmm10, %xmm1 + addps %xmm11, %xmm5 + addps %xmm12, %xmm2 + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + addps %xmm13, %xmm6 + movsd %xmm4, 4 * SIZE(CO1) + movhps %xmm4, 6 * SIZE(CO1) + addps %xmm14, %xmm3 + movsd %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 2 * SIZE(CO2) + addps %xmm15, %xmm7 + movsd %xmm5, 4 * SIZE(CO2) + movhps %xmm5, 6 * SIZE(CO2) +#else + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm2 + mulps %xmm15, %xmm3 + mulps %xmm15, %xmm4 + mulps %xmm15, %xmm5 + mulps %xmm15, %xmm6 + mulps %xmm15, %xmm7 + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm4, 4 * SIZE(CO1) + movhps %xmm4, 6 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 2 * SIZE(CO2) + movsd %xmm5, 4 * SIZE(CO2) + movhps %xmm5, 6 * SIZE(CO2) +#endif + + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 2 * SIZE(CO1, LDC, 2) + movsd %xmm6, 4 * SIZE(CO1, LDC, 2) + movhps %xmm6, 6 * SIZE(CO1, LDC, 2) + + movsd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhps %xmm3, 2 * SIZE(CO2, LDC, 2) + movsd %xmm7, 4 * SIZE(CO2, LDC, 2) + movhps %xmm7, 6 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $4, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movsldup 0 * SIZE(BO), %xmm9 + movsldup 16 * SIZE(BO), %xmm11 + movsldup 32 * SIZE(BO), %xmm13 + movsldup 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsldup 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movsldup 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsldup 64 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movshdup 16 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movsldup 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movshdup 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movaps 12 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movsldup 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movshdup 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movsldup 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movshdup 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movaps 32 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movsldup 80 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movshdup 32 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movsldup 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movshdup 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movaps 20 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movsldup 40 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movshdup 40 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movsldup 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movshdup 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movaps 24 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movsldup 96 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movshdup 48 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movsldup 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movshdup 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movaps 28 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movsldup 56 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movshdup 56 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movsldup 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movshdup 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movaps 48 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movsldup 112 * SIZE(BO), %xmm15 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsldup 8 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jg .L26 + ALIGN_4 + +.L28: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 0 * SIZE(CO2), %xmm10 + movhps 2 * SIZE(CO2), %xmm10 + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm2 + mulps %xmm15, %xmm3 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm12 + movhps 2 * SIZE(CO1, LDC, 2), %xmm12 + movsd 0 * SIZE(CO2, LDC, 2), %xmm14 + movhps 2 * SIZE(CO2, LDC, 2), %xmm14 + + addps %xmm8, %xmm0 + addps %xmm10, %xmm1 + addps %xmm12, %xmm2 + addps %xmm14, %xmm3 +#else + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm2 + mulps %xmm15, %xmm3 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 2 * SIZE(CO2) + + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 2 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhps %xmm3, 2 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $2, M + je .L40 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + movddup 8 * SIZE(AO), %xmm10 + movsd 0 * SIZE(BO), %xmm9 + movsd 32 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movsd 12 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsd 16 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movsd 20 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 6 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 24 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movsd 28 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 16 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movsd 64 * SIZE(BO), %xmm9 + addps %xmm11, %xmm0 + movsd 36 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 10 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 40 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movsd 44 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movsd 48 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movsd 52 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 14 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 56 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movsd 60 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 24 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movsd 96 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jg .L36 + ALIGN_4 + +.L38: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhps 0 * SIZE(CO2), %xmm8 + movsd 0 * SIZE(CO1, LDC, 2), %xmm9 + movhps 0 * SIZE(CO2, LDC, 2), %xmm9 +#endif + + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addps %xmm8, %xmm0 + addps %xmm9, %xmm1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO2) + movsd %xmm1, 0 * SIZE(CO1, LDC, 2) + movhps %xmm1, 0 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L40: + testq $1, M + je .L49 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + movsd 0 * SIZE(BO), %xmm9 + movsd 32 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L45 + ALIGN_4 + +.L42: + shufps $0, %xmm8, %xmm8 + movhps 4 * SIZE(BO), %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 8 * SIZE(BO), %xmm9 + shufps $0, %xmm8, %xmm8 + movhps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 16 * SIZE(BO), %xmm9 + shufps $0, %xmm8, %xmm8 + movhps 20 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 3 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 24 * SIZE(BO), %xmm9 + shufps $0, %xmm8, %xmm8 + movhps 28 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 64 * SIZE(BO), %xmm9 + shufps $0, %xmm10, %xmm10 + movhps 36 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movss 5 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 40 * SIZE(BO), %xmm11 + shufps $0, %xmm10, %xmm10 + movhps 44 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movss 6 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 48 * SIZE(BO), %xmm11 + shufps $0, %xmm10, %xmm10 + movhps 52 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movss 7 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 56 * SIZE(BO), %xmm11 + shufps $0, %xmm10, %xmm10 + movhps 60 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movss 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 96 * SIZE(BO), %xmm11 + + addq $ 8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_4 + +.L46: + shufps $0, %xmm8, %xmm8 + movhps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 8 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L46 + ALIGN_4 + +.L48: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movss 0 * SIZE(CO1), %xmm8 + movss 0 * SIZE(CO2), %xmm9 + movss 0 * SIZE(CO1, LDC, 2), %xmm10 + movss 0 * SIZE(CO2, LDC, 2), %xmm11 +#endif + + addps %xmm1, %xmm0 + + mulps %xmm15, %xmm0 + + movhlps %xmm0, %xmm1 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addss %xmm0, %xmm8 + psrlq $32, %xmm0 + addss %xmm0, %xmm9 + addss %xmm1, %xmm10 + psrlq $32, %xmm1 + addss %xmm1, %xmm11 + + movss %xmm8, 0 * SIZE(CO1) + movss %xmm9, 0 * SIZE(CO2) + movss %xmm10, 0 * SIZE(CO1, LDC, 2) + movss %xmm11, 0 * SIZE(CO2, LDC, 2) +#else + movss %xmm0, 0 * SIZE(CO1) + psrlq $32, %xmm0 + movss %xmm0, 0 * SIZE(CO2) + movss %xmm1, 0 * SIZE(CO1, LDC, 2) + psrlq $32, %xmm1 + movss %xmm1, 0 * SIZE(CO2, LDC, 2) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + leaq (C, LDC, 4), C # c += 4 * ldc + decq J # j -- + jg .L01 + +.L50: + testq $2, N + je .L100 + +.L51: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $3, %rax + jle .L53 + ALIGN_4 + +.L52: + movddup 0 * SIZE(B), %xmm0 + movddup 2 * SIZE(B), %xmm1 + movddup 4 * SIZE(B), %xmm2 + movddup 6 * SIZE(B), %xmm3 + movddup 8 * SIZE(B), %xmm4 + movddup 10 * SIZE(B), %xmm5 + movddup 12 * SIZE(B), %xmm6 + movddup 14 * SIZE(B), %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + prefetcht1 128 * SIZE(BO) + prefetcht0 112 * SIZE(B) + + addq $16 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L52 + ALIGN_4 + +.L53: + movq K, %rax + andq $7, %rax + BRANCH + jle .L60 + ALIGN_4 + +.L54: + movddup 0 * SIZE(B), %xmm0 + movaps %xmm0, 0 * SIZE(BO) + + addq $ 2 * SIZE, B + addq $ 4 * SIZE, BO + decq %rax + jne .L54 + ALIGN_4 + +.L60: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L70 + ALIGN_4 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movsldup 0 * SIZE(BO), %xmm9 + movsldup 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + prefetcht2 4 * SIZE(CO1) + pxor %xmm4, %xmm4 + prefetcht2 4 * SIZE(CO2) + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsldup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm4 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm5 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 12 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm4 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 64 * SIZE(AO), %xmm8 + addps %xmm9, %xmm5 + movsldup 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movshdup 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 20 * SIZE(AO), %xmm10 + addps %xmm9, %xmm1 + movsldup 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movshdup 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 24 * SIZE(AO), %xmm10 + addps %xmm9, %xmm5 + movsldup 12 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movshdup 12 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 28 * SIZE(AO), %xmm10 + addps %xmm9, %xmm1 + movsldup 12 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movshdup 12 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 80 * SIZE(AO), %xmm10 + addps %xmm9, %xmm5 + movsldup 32 * SIZE(BO), %xmm9 + mulps %xmm12, %xmm11 + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) + addps %xmm11, %xmm0 + movshdup 16 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 36 * SIZE(AO), %xmm12 + addps %xmm11, %xmm1 + movsldup 16 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm4 + movshdup 16 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 40 * SIZE(AO), %xmm12 + addps %xmm11, %xmm5 + movsldup 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm0 + movshdup 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 44 * SIZE(AO), %xmm12 + addps %xmm11, %xmm1 + movsldup 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm4 + movshdup 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 96 * SIZE(AO), %xmm12 + addps %xmm11, %xmm5 + movsldup 24 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm0 + movshdup 24 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 52 * SIZE(AO), %xmm14 + addps %xmm11, %xmm1 + movsldup 24 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm4 + movshdup 24 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 56 * SIZE(AO), %xmm14 + addps %xmm11, %xmm5 + movsldup 28 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm0 + movshdup 28 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 60 * SIZE(AO), %xmm14 + addps %xmm11, %xmm1 + movsldup 28 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm4 + movshdup 28 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 112 * SIZE(AO), %xmm14 + addps %xmm11, %xmm5 + movsldup 48 * SIZE(BO), %xmm11 + + addq $64 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsldup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm4 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm5 + movsldup 4 * SIZE(BO), %xmm9 + + addq $8 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L66 + ALIGN_4 + +.L68: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + + movsd 0 * SIZE(CO2), %xmm10 + movhps 2 * SIZE(CO2), %xmm10 + movsd 4 * SIZE(CO2), %xmm11 + movhps 6 * SIZE(CO2), %xmm11 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm5 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addps %xmm8, %xmm0 + addps %xmm9, %xmm4 + addps %xmm10, %xmm1 + addps %xmm11, %xmm5 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm4, 4 * SIZE(CO1) + movhps %xmm4, 6 * SIZE(CO1) + + movsd %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 2 * SIZE(CO2) + movsd %xmm5, 4 * SIZE(CO2) + movhps %xmm5, 6 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L61 + ALIGN_4 + +.L70: + testq $4, M + je .L80 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movsldup 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(AO), %xmm10 + movsldup 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsldup 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 12 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsldup 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 32 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsldup 32 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movshdup 16 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 20 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsldup 20 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movshdup 20 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 24 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movsldup 24 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movshdup 24 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 28 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsldup 28 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movshdup 28 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 48 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movsldup 48 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L76 + ALIGN_4 + +.L78: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 0 * SIZE(CO2), %xmm10 + movhps 2 * SIZE(CO2), %xmm10 +#endif + + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addps %xmm8, %xmm0 + addps %xmm10, %xmm1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L80: + testq $2, M + je .L90 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + movddup 8 * SIZE(AO), %xmm10 + movsd 0 * SIZE(BO), %xmm9 + movsd 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L85 + ALIGN_4 + +.L82: + shufps $0x50, %xmm9, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 6 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 12 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 16 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 32 * SIZE(BO), %xmm9 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 10 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 20 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 24 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 14 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 28 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 24 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 48 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L82 + ALIGN_4 + +.L85: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_4 + +.L86: + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L86 + ALIGN_4 + +.L88: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhps 0 * SIZE(CO2), %xmm8 +#endif + + addps %xmm1, %xmm0 + mulps %xmm15, %xmm0 +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addps %xmm8, %xmm0 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L90: + testq $1, M + je .L99 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + movsd 0 * SIZE(BO), %xmm9 + movsd 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + shufps $0, %xmm8, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + shufps $0, %xmm8, %xmm8 + mulps %xmm8, %xmm9 + movss 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + shufps $0, %xmm8, %xmm8 + mulps %xmm8, %xmm9 + movss 3 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 12 * SIZE(BO), %xmm9 + shufps $0, %xmm8, %xmm8 + mulps %xmm8, %xmm9 + movss 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 32 * SIZE(BO), %xmm9 + shufps $0, %xmm10, %xmm10 + mulps %xmm10, %xmm11 + movss 5 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 20 * SIZE(BO), %xmm11 + shufps $0, %xmm10, %xmm10 + mulps %xmm10, %xmm11 + movss 6 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 24 * SIZE(BO), %xmm11 + shufps $0, %xmm10, %xmm10 + mulps %xmm10, %xmm11 + movss 7 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 28 * SIZE(BO), %xmm11 + shufps $0, %xmm10, %xmm10 + mulps %xmm10, %xmm11 + movss 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 48 * SIZE(BO), %xmm11 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + shufps $0, %xmm8, %xmm8 + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L96 + ALIGN_4 + +.L98: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movss 0 * SIZE(CO1), %xmm8 + movss 0 * SIZE(CO2), %xmm9 + + addps %xmm1, %xmm0 + mulps %xmm15, %xmm0 + addss %xmm0, %xmm8 + psrlq $32, %xmm0 + addss %xmm0, %xmm9 + + movss %xmm8, 0 * SIZE(CO1) + movss %xmm9, 0 * SIZE(CO2) +#else + addps %xmm1, %xmm0 + mulps %xmm15, %xmm0 + + movss %xmm0, 0 * SIZE(CO1) + psrlq $32, %xmm0 + movss %xmm0, 0 * SIZE(CO2) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leaq (C, LDC, 2), C # c += 4 * ldc + ALIGN_4 + +.L100: + testq $1, N + je .L999 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $3, %rax + jle .L103 + ALIGN_4 + + +.L102: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + movss 4 * SIZE(B), %xmm4 + movss 5 * SIZE(B), %xmm5 + movss 6 * SIZE(B), %xmm6 + movss 7 * SIZE(B), %xmm7 + + movss %xmm0, 0 * SIZE(BO) + movss %xmm0, 1 * SIZE(BO) + movss %xmm1, 2 * SIZE(BO) + movss %xmm1, 3 * SIZE(BO) + movss %xmm2, 4 * SIZE(BO) + movss %xmm2, 5 * SIZE(BO) + movss %xmm3, 6 * SIZE(BO) + movss %xmm3, 7 * SIZE(BO) + movss %xmm4, 8 * SIZE(BO) + movss %xmm4, 9 * SIZE(BO) + movss %xmm5, 10 * SIZE(BO) + movss %xmm5, 11 * SIZE(BO) + movss %xmm6, 12 * SIZE(BO) + movss %xmm6, 13 * SIZE(BO) + movss %xmm7, 14 * SIZE(BO) + movss %xmm7, 15 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $16 * SIZE, BO + + decq %rax + jne .L102 + ALIGN_4 + +.L103: + movq K, %rax + andq $7, %rax + BRANCH + jle .L110 + ALIGN_4 + +.L104: + movss 0 * SIZE(B), %xmm0 + movss %xmm0, 0 * SIZE(BO) + movss %xmm0, 1 * SIZE(BO) + + addq $ 1 * SIZE, B + addq $ 2 * SIZE, BO + decq %rax + jne .L104 + ALIGN_4 + +.L110: + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L120 + ALIGN_4 + +.L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movddup 0 * SIZE(BO), %xmm9 + movddup 8 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + prefetchnta 8 * SIZE(CO1) + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movddup 2 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 12 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 64 * SIZE(AO), %xmm8 + addps %xmm9, %xmm5 + movddup 4 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 20 * SIZE(AO), %xmm10 + addps %xmm9, %xmm0 + movddup 4 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 24 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movddup 6 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 28 * SIZE(AO), %xmm10 + addps %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 80 * SIZE(AO), %xmm10 + addps %xmm9, %xmm5 + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) + movddup 8 * SIZE(BO), %xmm9 + mulps %xmm12, %xmm9 + movaps 36 * SIZE(AO), %xmm12 + addps %xmm9, %xmm0 + movddup 16 * SIZE(BO), %xmm9 + mulps %xmm12, %xmm11 + movaps 40 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movddup 10 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 44 * SIZE(AO), %xmm12 + addps %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 96 * SIZE(AO), %xmm12 + addps %xmm11, %xmm5 + movddup 12 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 52 * SIZE(AO), %xmm14 + addps %xmm11, %xmm0 + movddup 12 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 56 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movddup 14 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 60 * SIZE(AO), %xmm14 + addps %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 112 * SIZE(AO), %xmm14 + addps %xmm11, %xmm5 + movddup 24 * SIZE(BO), %xmm11 + + addq $64 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movddup 2 * SIZE(BO), %xmm9 + + addq $8 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + jg .L116 + ALIGN_4 + +.L118: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 +#endif + + addps %xmm1, %xmm0 + addps %xmm5, %xmm4 + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addps %xmm8, %xmm0 + addps %xmm9, %xmm4 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm4, 4 * SIZE(CO1) + movhps %xmm4, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L111 + ALIGN_4 + +.L120: + testq $4, M + je .L130 + + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + movaps 0 * SIZE(AO), %xmm8 + movddup 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(AO), %xmm10 + movddup 8 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L125 + ALIGN_4 + +.L122: + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movddup 2 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movddup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 12 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movddup 6 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 32 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movddup 16 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm11 + movaps 20 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movddup 10 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 24 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movddup 12 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 28 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movddup 14 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 48 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movddup 24 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L122 + ALIGN_4 + +.L125: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L128 + ALIGN_4 + +.L126: + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movddup 2 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + jg .L126 + ALIGN_4 + +.L128: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 +#endif + + addps %xmm1, %xmm0 + mulps %xmm15, %xmm0 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addps %xmm8, %xmm0 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L130: + testq $2, M + je .L140 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(AO), %xmm10 + movaps 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $4, %rax + je .L135 + ALIGN_4 + +.L132: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm8, %xmm1 + movaps 8 * SIZE(AO), %xmm8 + mulps 8 * SIZE(BO), %xmm8 + addps %xmm8, %xmm2 + movaps 12 * SIZE(AO), %xmm8 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + movaps 32 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm11 + movaps 20 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movaps 48 * SIZE(BO), %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm10, %xmm1 + movaps 24 * SIZE(AO), %xmm10 + mulps 24 * SIZE(BO), %xmm10 + addps %xmm10, %xmm2 + movaps 28 * SIZE(AO), %xmm10 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L132 + ALIGN_4 + +.L135: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $15, %rax # if (k & 1) + BRANCH + je .L138 + ALIGN_4 + +.L136: + movsd 0 * SIZE(AO), %xmm8 + movsd 0 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + jg .L136 + ALIGN_4 + +.L138: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 +#endif + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movhlps %xmm0, %xmm1 + addps %xmm1, %xmm0 + + mulps %xmm15, %xmm0 +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addps %xmm8, %xmm0 +#endif + movsd %xmm0, 0 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L140: + testq $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + movss 0 * SIZE(BO), %xmm9 + movss 8 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L145 + ALIGN_4 + +.L142: + mulss %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movss 1 * SIZE(AO), %xmm8 + mulss 2 * SIZE(BO), %xmm8 + addss %xmm9, %xmm0 + movss 16 * SIZE(BO), %xmm9 + addss %xmm8, %xmm1 + movss 2 * SIZE(AO), %xmm8 + mulss 4 * SIZE(BO), %xmm8 + addss %xmm8, %xmm2 + movss 3 * SIZE(AO), %xmm8 + mulss 6 * SIZE(BO), %xmm8 + addss %xmm8, %xmm3 + movss 8 * SIZE(AO), %xmm8 + mulss %xmm10, %xmm11 + movss 5 * SIZE(AO), %xmm10 + mulss 10 * SIZE(BO), %xmm10 + addss %xmm11, %xmm0 + movss 24 * SIZE(BO), %xmm11 + addss %xmm10, %xmm1 + movss 6 * SIZE(AO), %xmm10 + mulss 12 * SIZE(BO), %xmm10 + addss %xmm10, %xmm2 + movss 7 * SIZE(AO), %xmm10 + mulss 14 * SIZE(BO), %xmm10 + addss %xmm10, %xmm3 + movss 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L142 + ALIGN_4 + +.L145: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movss ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L148 + ALIGN_4 + +.L146: + movss 0 * SIZE(AO), %xmm8 + movss 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + jg .L146 + ALIGN_4 + +.L148: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movss 0 * SIZE(CO1), %xmm8 +#endif + addss %xmm1, %xmm0 + addss %xmm3, %xmm2 + addss %xmm2, %xmm0 + + mulss %xmm15, %xmm0 +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addss %xmm8, %xmm0 +#endif + movss %xmm0, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %rbx, %rsp + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_ncopy_2.S b/kernel/x86_64/gemm_ncopy_2.S new file mode 100644 index 0000000..72c2b9d --- /dev/null +++ b/kernel/x86_64/gemm_ncopy_2.S @@ -0,0 +1,290 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(NEHALEM) +#define RPREFETCHSIZE 12 +#define WPREFETCHSIZE (RPREFETCHSIZE * 2) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define I %r9 + +#else + +#define STACKSIZE 256 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 32 + STACKSIZE(%rsp) + +#define B %r14 +#define I %r15 + +#endif + +#define J %r10 +#define AO1 %r11 +#define AO2 %r12 +#define AO3 %r13 +#define AO4 %rax + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %r15 + pushq %r14 +#endif + pushq %r13 + pushq %r12 + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + movups %xmm7, 16(%rsp) + movups %xmm8, 32(%rsp) + movups %xmm9, 48(%rsp) + movups %xmm10, 64(%rsp) + movups %xmm11, 80(%rsp) + movups %xmm12, 96(%rsp) + movups %xmm13, 112(%rsp) + movups %xmm14, 128(%rsp) + movups %xmm15, 144(%rsp) + + movq OLD_B, B +#endif + + leaq (,LDA, SIZE), LDA # Scaling + + movq N, J + sarq $1, J + jle .L20 + ALIGN_4 + +.L12: + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + movq M, I + sarq $2, I + jle .L14 + ALIGN_4 + +.L13: +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss 0 * SIZE(AO2), %xmm1 + movss 1 * SIZE(AO1), %xmm2 + movss 1 * SIZE(AO2), %xmm3 + movss 2 * SIZE(AO1), %xmm4 + movss 2 * SIZE(AO2), %xmm5 + movss 3 * SIZE(AO1), %xmm6 + movss 3 * SIZE(AO2), %xmm7 + + movss %xmm0, 0 * SIZE(B) + movss %xmm1, 1 * SIZE(B) + movss %xmm2, 2 * SIZE(B) + movss %xmm3, 3 * SIZE(B) + movss %xmm4, 4 * SIZE(B) + movss %xmm5, 5 * SIZE(B) + movss %xmm6, 6 * SIZE(B) + movss %xmm7, 7 * SIZE(B) +#else + PREFETCH RPREFETCHSIZE * SIZE(AO1) + + movsd 0 * SIZE(AO1), %xmm0 + movhpd 0 * SIZE(AO2), %xmm0 + movsd 1 * SIZE(AO1), %xmm1 + movhpd 1 * SIZE(AO2), %xmm1 + + PREFETCH RPREFETCHSIZE * SIZE(AO2) + + movsd 2 * SIZE(AO1), %xmm2 + movhpd 2 * SIZE(AO2), %xmm2 + movsd 3 * SIZE(AO1), %xmm3 + movhpd 3 * SIZE(AO2), %xmm3 + + PREFETCHW WPREFETCHSIZE * SIZE(B) + + movapd %xmm0, 0 * SIZE(B) + movapd %xmm1, 2 * SIZE(B) + movapd %xmm2, 4 * SIZE(B) + movapd %xmm3, 6 * SIZE(B) +#endif + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-8 * SIZE, B + decq I + jg .L13 + ALIGN_4 + +.L14: + movq M, I + andq $3, I + jle .L16 + ALIGN_4 + +.L15: +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss 0 * SIZE(AO2), %xmm1 + + movss %xmm0, 0 * SIZE(B) + movss %xmm1, 1 * SIZE(B) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 0 * SIZE(AO2), %xmm0 + movapd %xmm0, 0 * SIZE(B) +#endif + + addq $SIZE, AO1 + addq $SIZE, AO2 + addq $2 * SIZE, B + decq I + jg .L15 + ALIGN_4 + +.L16: + decq J + jg .L12 + ALIGN_4 + +.L20: + testq $1, N + jle .L999 + + movq A, AO1 + + movq M, I + sarq $2, I + jle .L34 + ALIGN_4 + +.L33: +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss 1 * SIZE(AO1), %xmm1 + movss 2 * SIZE(AO1), %xmm2 + movss 3 * SIZE(AO1), %xmm3 + + movss %xmm0, 0 * SIZE(B) + movss %xmm1, 1 * SIZE(B) + movss %xmm2, 2 * SIZE(B) + movss %xmm3, 3 * SIZE(B) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + + movsd 2 * SIZE(AO1), %xmm1 + movhpd 3 * SIZE(AO1), %xmm1 + + movapd %xmm0, 0 * SIZE(B) + movapd %xmm1, 2 * SIZE(B) +#endif + + addq $4 * SIZE, AO1 + subq $-4 * SIZE, B + decq I + jg .L33 + ALIGN_4 + +.L34: + movq M, I + andq $3, I + jle .L999 + ALIGN_4 + +.L35: +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss %xmm0, 0 * SIZE(B) +#else + movsd 0 * SIZE(AO1), %xmm0 + movsd %xmm0, 0 * SIZE(B) +#endif + + addq $SIZE, AO1 + addq $1 * SIZE, B + decq I + jg .L35 + ALIGN_4 + + +.L999: +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + movups 16(%rsp), %xmm7 + movups 32(%rsp), %xmm8 + movups 48(%rsp), %xmm9 + movups 64(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + movups 96(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups 128(%rsp), %xmm14 + movups 144(%rsp), %xmm15 + + addq $STACKSIZE, %rsp +#endif + + popq %r12 + popq %r13 + +#ifdef WINDOWS_ABI + popq %r14 + popq %r15 +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_ncopy_4.S b/kernel/x86_64/gemm_ncopy_4.S new file mode 100644 index 0000000..a04542f --- /dev/null +++ b/kernel/x86_64/gemm_ncopy_4.S @@ -0,0 +1,470 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(PENTIUM4) || defined(GENERIC) +#define RPREFETCHSIZE 16 +#define WPREFETCHSIZE (RPREFETCHSIZE * 4) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#define RPREFETCHSIZE 12 +#define WPREFETCHSIZE (RPREFETCHSIZE * 4) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht2 +#endif + +#ifdef ATOM +#define RPREFETCHSIZE 16 +#define WPREFETCHSIZE (RPREFETCHSIZE * 4) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifdef NANO +#define RPREFETCHSIZE 16 +#define WPREFETCHSIZE (RPREFETCHSIZE * 4) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifdef BARCELONA +#define RPREFETCHSIZE 16 +#define WPREFETCHSIZE (RPREFETCHSIZE * 4) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#endif + +#ifdef GENERIC +#define RPREFETCHSIZE 16 +#define WPREFETCHSIZE (RPREFETCHSIZE * 4) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define I %r9 + +#else + +#define STACKSIZE 256 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 32 + STACKSIZE(%rsp) + +#define B %r14 +#define I %r15 + +#endif + +#define J %r10 +#define AO1 %r11 +#define AO2 %r12 +#define AO3 %r13 +#define AO4 %rax + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %r15 + pushq %r14 +#endif + pushq %r13 + pushq %r12 + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + movups %xmm7, 16(%rsp) + movups %xmm8, 32(%rsp) + movups %xmm9, 48(%rsp) + movups %xmm10, 64(%rsp) + movups %xmm11, 80(%rsp) + movups %xmm12, 96(%rsp) + movups %xmm13, 112(%rsp) + movups %xmm14, 128(%rsp) + movups %xmm15, 144(%rsp) + + movq OLD_B, B +#endif + + leaq (,LDA, SIZE), LDA # Scaling + + movq N, J + sarq $2, J + jle .L20 + ALIGN_4 + +.L12: + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), AO3 + leaq (AO2, LDA, 2), AO4 + leaq (A, LDA, 4), A + + movq M, I + sarq $2, I + jle .L14 + ALIGN_4 + +.L13: +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss 0 * SIZE(AO2), %xmm1 + movss 0 * SIZE(AO3), %xmm2 + movss 0 * SIZE(AO4), %xmm3 + + movss 1 * SIZE(AO1), %xmm4 + movss 1 * SIZE(AO2), %xmm5 + movss 1 * SIZE(AO3), %xmm6 + movss 1 * SIZE(AO4), %xmm7 + + movss 2 * SIZE(AO1), %xmm8 + movss 2 * SIZE(AO2), %xmm9 + movss 2 * SIZE(AO3), %xmm10 + movss 2 * SIZE(AO4), %xmm11 + + movss 3 * SIZE(AO1), %xmm12 + movss 3 * SIZE(AO2), %xmm13 + movss 3 * SIZE(AO3), %xmm14 + movss 3 * SIZE(AO4), %xmm15 + + movss %xmm0, 0 * SIZE(B) + movss %xmm1, 1 * SIZE(B) + movss %xmm2, 2 * SIZE(B) + movss %xmm3, 3 * SIZE(B) + movss %xmm4, 4 * SIZE(B) + movss %xmm5, 5 * SIZE(B) + movss %xmm6, 6 * SIZE(B) + movss %xmm7, 7 * SIZE(B) + + PREFETCH RPREFETCHSIZE * SIZE(AO1) + PREFETCH RPREFETCHSIZE * SIZE(AO2) + PREFETCH RPREFETCHSIZE * SIZE(AO3) + PREFETCH RPREFETCHSIZE * SIZE(AO4) + + PREFETCHW WPREFETCHSIZE * SIZE(B) + + movss %xmm8, 8 * SIZE(B) + movss %xmm9, 9 * SIZE(B) + movss %xmm10, 10 * SIZE(B) + movss %xmm11, 11 * SIZE(B) + movss %xmm12, 12 * SIZE(B) + movss %xmm13, 13 * SIZE(B) + movss %xmm14, 14 * SIZE(B) + movss %xmm15, 15 * SIZE(B) +#else + PREFETCH RPREFETCHSIZE * SIZE(AO1) + movsd 0 * SIZE(AO1), %xmm0 + movhpd 0 * SIZE(AO2), %xmm0 + movsd 1 * SIZE(AO1), %xmm2 + movhpd 1 * SIZE(AO2), %xmm2 + PREFETCH RPREFETCHSIZE * SIZE(AO2) + movsd 2 * SIZE(AO1), %xmm4 + movhpd 2 * SIZE(AO2), %xmm4 + movsd 3 * SIZE(AO1), %xmm6 + movhpd 3 * SIZE(AO2), %xmm6 + + PREFETCH RPREFETCHSIZE * SIZE(AO3) + movsd 0 * SIZE(AO3), %xmm1 + movhpd 0 * SIZE(AO4), %xmm1 + movsd 1 * SIZE(AO3), %xmm3 + movhpd 1 * SIZE(AO4), %xmm3 + PREFETCH RPREFETCHSIZE * SIZE(AO4) + movsd 2 * SIZE(AO3), %xmm5 + movhpd 2 * SIZE(AO4), %xmm5 + movsd 3 * SIZE(AO3), %xmm7 + movhpd 3 * SIZE(AO4), %xmm7 + + PREFETCHW WPREFETCHSIZE * SIZE(B) + movapd %xmm0, 0 * SIZE(B) + movapd %xmm1, 2 * SIZE(B) + movapd %xmm2, 4 * SIZE(B) + movapd %xmm3, 6 * SIZE(B) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) + PREFETCHW (WPREFETCHSIZE + 8) * SIZE(B) +#endif + movapd %xmm4, 8 * SIZE(B) + movapd %xmm5, 10 * SIZE(B) + movapd %xmm6, 12 * SIZE(B) + movapd %xmm7, 14 * SIZE(B) +#endif + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + addq $4 * SIZE, AO3 + addq $4 * SIZE, AO4 + + subq $-16 * SIZE, B + decq I + jg .L13 + ALIGN_4 + +.L14: + movq M, I + andq $3, I + jle .L16 + ALIGN_4 + +.L15: +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss 0 * SIZE(AO2), %xmm1 + movss 0 * SIZE(AO3), %xmm2 + movss 0 * SIZE(AO4), %xmm3 + + movss %xmm0, 0 * SIZE(B) + movss %xmm1, 1 * SIZE(B) + movss %xmm2, 2 * SIZE(B) + movss %xmm3, 3 * SIZE(B) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 0 * SIZE(AO2), %xmm0 + movsd 0 * SIZE(AO3), %xmm1 + movhpd 0 * SIZE(AO4), %xmm1 + + movapd %xmm0, 0 * SIZE(B) + movapd %xmm1, 2 * SIZE(B) +#endif + + addq $SIZE, AO1 + addq $SIZE, AO2 + addq $SIZE, AO3 + addq $SIZE, AO4 + addq $4 * SIZE, B + decq I + jg .L15 + ALIGN_4 + +.L16: + decq J + jg .L12 + ALIGN_4 + +.L20: + testq $2, N + jle .L30 + + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + movq M, I + sarq $2, I + jle .L24 + ALIGN_4 + +.L23: +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss 0 * SIZE(AO2), %xmm1 + movss 1 * SIZE(AO1), %xmm2 + movss 1 * SIZE(AO2), %xmm3 + movss 2 * SIZE(AO1), %xmm4 + movss 2 * SIZE(AO2), %xmm5 + movss 3 * SIZE(AO1), %xmm6 + movss 3 * SIZE(AO2), %xmm7 + + movss %xmm0, 0 * SIZE(B) + movss %xmm1, 1 * SIZE(B) + movss %xmm2, 2 * SIZE(B) + movss %xmm3, 3 * SIZE(B) + movss %xmm4, 4 * SIZE(B) + movss %xmm5, 5 * SIZE(B) + movss %xmm6, 6 * SIZE(B) + movss %xmm7, 7 * SIZE(B) + +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 0 * SIZE(AO2), %xmm0 + movsd 1 * SIZE(AO1), %xmm1 + movhpd 1 * SIZE(AO2), %xmm1 + + movsd 2 * SIZE(AO1), %xmm2 + movhpd 2 * SIZE(AO2), %xmm2 + movsd 3 * SIZE(AO1), %xmm3 + movhpd 3 * SIZE(AO2), %xmm3 + + movapd %xmm0, 0 * SIZE(B) + movapd %xmm1, 2 * SIZE(B) + movapd %xmm2, 4 * SIZE(B) + movapd %xmm3, 6 * SIZE(B) +#endif + + PREFETCH RPREFETCHSIZE * SIZE(AO1) + PREFETCH RPREFETCHSIZE * SIZE(AO2) + + PREFETCHW WPREFETCHSIZE * SIZE(B) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-8 * SIZE, B + decq I + jg .L23 + ALIGN_4 + +.L24: + movq M, I + andq $3, I + jle .L30 + ALIGN_4 + +.L25: +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss 0 * SIZE(AO2), %xmm1 + + movss %xmm0, 0 * SIZE(B) + movss %xmm1, 1 * SIZE(B) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 0 * SIZE(AO2), %xmm0 + + movapd %xmm0, 0 * SIZE(B) +#endif + + addq $SIZE, AO1 + addq $SIZE, AO2 + addq $2 * SIZE, B + decq I + jg .L25 + ALIGN_4 + +.L30: + testq $1, N + jle .L999 + + movq A, AO1 + + movq M, I + sarq $2, I + jle .L34 + ALIGN_4 + +.L33: +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss 1 * SIZE(AO1), %xmm1 + movss 2 * SIZE(AO1), %xmm2 + movss 3 * SIZE(AO1), %xmm3 + + movss %xmm0, 0 * SIZE(B) + movss %xmm1, 1 * SIZE(B) + movss %xmm2, 2 * SIZE(B) + movss %xmm3, 3 * SIZE(B) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + + movsd 2 * SIZE(AO1), %xmm1 + movhpd 3 * SIZE(AO1), %xmm1 + + movapd %xmm0, 0 * SIZE(B) + movapd %xmm1, 2 * SIZE(B) +#endif + + addq $4 * SIZE, AO1 + subq $-4 * SIZE, B + decq I + jg .L33 + ALIGN_4 + +.L34: + movq M, I + andq $3, I + jle .L999 + ALIGN_4 + +.L35: +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss %xmm0, 0 * SIZE(B) +#else + movsd 0 * SIZE(AO1), %xmm0 + movsd %xmm0, 0 * SIZE(B) +#endif + + addq $SIZE, AO1 + addq $1 * SIZE, B + decq I + jg .L35 + ALIGN_4 + + +.L999: +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + movups 16(%rsp), %xmm7 + movups 32(%rsp), %xmm8 + movups 48(%rsp), %xmm9 + movups 64(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + movups 96(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups 128(%rsp), %xmm14 + movups 144(%rsp), %xmm15 + + addq $STACKSIZE, %rsp +#endif + + popq %r12 + popq %r13 + +#ifdef WINDOWS_ABI + popq %r14 + popq %r15 +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_ncopy_4_opteron.S b/kernel/x86_64/gemm_ncopy_4_opteron.S new file mode 100644 index 0000000..edde7e2 --- /dev/null +++ b/kernel/x86_64/gemm_ncopy_4_opteron.S @@ -0,0 +1,388 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(BARCELONA) || defined(SHANGHAI) +#define RPREFETCHSIZE (12 + 4) +#define WPREFETCHSIZE (48 + 4) +#define MOVNTQ MOVQ +#else +#define RPREFETCHSIZE (12 + 4) +#define WPREFETCHSIZE (24 + 4) +#define MOVNTQ MOVQ +#endif + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define I %r9 +#else + +#define STACKSIZE 256 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 32 + STACKSIZE(%rsp) + +#define B %r14 +#define I %r15 + +#endif + +#define J %r10 +#define AO1 %r11 +#define AO2 %r12 +#define AO3 %r13 +#define AO4 %rax + +#if defined(BARCELONA) || defined(SHANGHAI) +#define RPREFETCH prefetch +#else +#define RPREFETCH prefetch +#endif + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %r15 + pushq %r14 +#endif + pushq %r13 + pushq %r12 + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + movups %xmm7, 16(%rsp) + movups %xmm8, 32(%rsp) + movups %xmm9, 48(%rsp) + movups %xmm10, 64(%rsp) + movups %xmm11, 80(%rsp) + movups %xmm12, 96(%rsp) + movups %xmm13, 112(%rsp) + movups %xmm14, 128(%rsp) + movups %xmm15, 144(%rsp) + + movq OLD_B, B +#endif + + EMMS + + leaq (,LDA, SIZE), LDA # Scaling + + movq N, J + sarq $2, J + jle .L20 + ALIGN_4 + +.L11: +#if 0 + movq A, AO1 + leaq (A, LDA, 1), AO2 + leaq (A, LDA, 2), AO3 + leaq (AO2, LDA, 2), AO4 + + movq M, I + sarq $4, I + jle .L13 + ALIGN_4 + +.L12: + MOVQ 0 * SIZE(AO1), %mm0 + addq $8 * SIZE, AO1 + MOVQ 0 * SIZE(AO2), %mm1 + addq $8 * SIZE, AO2 + MOVQ 0 * SIZE(AO3), %mm2 + addq $8 * SIZE, AO3 + MOVQ 0 * SIZE(AO4), %mm3 + addq $8 * SIZE, AO4 + + decq I + jg .L12 + ALIGN_4 + +.L13: +#endif + + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), AO3 + leaq (AO2, LDA, 2), AO4 + leaq (A, LDA, 4), A + + movq M, I + sarq $2, I + jle .L15 + ALIGN_4 + +.L14: + RPREFETCH (RPREFETCHSIZE) * SIZE(AO1) + + MOVQ 0 * SIZE(AO1), %mm0 + MOVNTQ %mm0, 0 * SIZE(B) + MOVQ 0 * SIZE(AO2), %mm1 + MOVNTQ %mm1, 1 * SIZE(B) + + RPREFETCH (RPREFETCHSIZE) * SIZE(AO2) + + MOVQ 0 * SIZE(AO3), %mm2 + MOVNTQ %mm2, 2 * SIZE(B) + MOVQ 0 * SIZE(AO4), %mm3 + MOVNTQ %mm3, 3 * SIZE(B) + + prefetchw (WPREFETCHSIZE + 0) * SIZE(B) + MOVQ 1 * SIZE(AO1), %mm4 + MOVNTQ %mm4, 4 * SIZE(B) + MOVQ 1 * SIZE(AO2), %mm5 + MOVNTQ %mm5, 5 * SIZE(B) + MOVQ 1 * SIZE(AO3), %mm6 + MOVNTQ %mm6, 6 * SIZE(B) + MOVQ 1 * SIZE(AO4), %mm7 + MOVNTQ %mm7, 7 * SIZE(B) + + RPREFETCH (RPREFETCHSIZE) * SIZE(AO3) + + MOVQ 2 * SIZE(AO1), %mm0 + MOVNTQ %mm0, 8 * SIZE(B) + MOVQ 2 * SIZE(AO2), %mm1 + MOVNTQ %mm1, 9 * SIZE(B) + + RPREFETCH (RPREFETCHSIZE) * SIZE(AO4) + + MOVQ 2 * SIZE(AO3), %mm2 + MOVNTQ %mm2, 10 * SIZE(B) + MOVQ 2 * SIZE(AO4), %mm3 + MOVNTQ %mm3, 11 * SIZE(B) + + prefetchw (WPREFETCHSIZE + 8) * SIZE(B) + MOVQ 3 * SIZE(AO1), %mm4 + MOVNTQ %mm4, 12 * SIZE(B) + MOVQ 3 * SIZE(AO2), %mm5 + MOVNTQ %mm5, 13 * SIZE(B) + MOVQ 3 * SIZE(AO3), %mm6 + MOVNTQ %mm6, 14 * SIZE(B) + MOVQ 3 * SIZE(AO4), %mm7 + MOVNTQ %mm7, 15 * SIZE(B) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + addq $4 * SIZE, AO3 + addq $4 * SIZE, AO4 + + subq $-16 * SIZE, B + decq I + jg .L14 + ALIGN_4 + +.L15: + movq M, I + andq $3, I + jle .L17 + ALIGN_4 + +.L16: + MOVQ 0 * SIZE(AO1), %mm0 + MOVQ 0 * SIZE(AO2), %mm1 + MOVQ 0 * SIZE(AO3), %mm2 + MOVQ 0 * SIZE(AO4), %mm3 + + MOVNTQ %mm0, 0 * SIZE(B) + MOVNTQ %mm1, 1 * SIZE(B) + MOVNTQ %mm2, 2 * SIZE(B) + MOVNTQ %mm3, 3 * SIZE(B) + + addq $SIZE, AO1 + addq $SIZE, AO2 + addq $SIZE, AO3 + addq $SIZE, AO4 + addq $4 * SIZE, B + decq I + jg .L16 + ALIGN_4 + +.L17: + decq J + jg .L11 + ALIGN_4 + +.L20: + testq $2, N + jle .L30 + + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + movq M, I + sarq $2, I + jle .L24 + ALIGN_4 + +.L23: + prefetch (RPREFETCHSIZE) * SIZE(AO1) + MOVQ 0 * SIZE(AO1), %mm0 + prefetch (RPREFETCHSIZE) * SIZE(AO2) + MOVQ 0 * SIZE(AO2), %mm1 + MOVQ 1 * SIZE(AO1), %mm2 + MOVQ 1 * SIZE(AO2), %mm3 + MOVQ 2 * SIZE(AO1), %mm4 + MOVQ 2 * SIZE(AO2), %mm5 + MOVQ 3 * SIZE(AO1), %mm6 + MOVQ 3 * SIZE(AO2), %mm7 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(B) + + MOVNTQ %mm0, 0 * SIZE(B) + MOVNTQ %mm1, 1 * SIZE(B) + MOVNTQ %mm2, 2 * SIZE(B) + MOVNTQ %mm3, 3 * SIZE(B) + MOVNTQ %mm4, 4 * SIZE(B) + MOVNTQ %mm5, 5 * SIZE(B) + MOVNTQ %mm6, 6 * SIZE(B) + MOVNTQ %mm7, 7 * SIZE(B) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-8 * SIZE, B + decq I + jg .L23 + ALIGN_4 + +.L24: + movq M, I + andq $3, I + jle .L30 + ALIGN_4 + +.L25: + MOVQ 0 * SIZE(AO1), %mm0 + MOVQ 0 * SIZE(AO2), %mm1 + + MOVNTQ %mm0, 0 * SIZE(B) + MOVNTQ %mm1, 1 * SIZE(B) + + addq $SIZE, AO1 + addq $SIZE, AO2 + addq $2 * SIZE, B + decq I + jg .L25 + ALIGN_4 + +.L30: + testq $1, N + jle .L999 + + movq A, AO1 + + movq M, I + sarq $2, I + jle .L34 + ALIGN_4 + +.L33: + MOVQ 0 * SIZE(AO1), %mm0 + MOVQ 1 * SIZE(AO1), %mm1 + MOVQ 2 * SIZE(AO1), %mm2 + MOVQ 3 * SIZE(AO1), %mm3 + + MOVNTQ %mm0, 0 * SIZE(B) + MOVNTQ %mm1, 1 * SIZE(B) + MOVNTQ %mm2, 2 * SIZE(B) + MOVNTQ %mm3, 3 * SIZE(B) + + addq $4 * SIZE, AO1 + subq $-4 * SIZE, B + decq I + jg .L33 + ALIGN_4 + +.L34: + movq M, I + andq $3, I + jle .L999 + ALIGN_4 + +.L35: + MOVQ 0 * SIZE(AO1), %mm0 + addq $SIZE, AO1 + + MOVNTQ %mm0, 0 * SIZE(B) + addq $1 * SIZE, B + decq I + jg .L35 + ALIGN_4 + + +.L999: + EMMS + +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + movups 16(%rsp), %xmm7 + movups 32(%rsp), %xmm8 + movups 48(%rsp), %xmm9 + movups 64(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + movups 96(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups 128(%rsp), %xmm14 + movups 144(%rsp), %xmm15 + + addq $STACKSIZE, %rsp +#endif + + popq %r12 + popq %r13 +#ifdef WINDOWS_ABI + popq %r14 + popq %r15 +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_tcopy_2.S b/kernel/x86_64/gemm_tcopy_2.S new file mode 100644 index 0000000..8bfaca2 --- /dev/null +++ b/kernel/x86_64/gemm_tcopy_2.S @@ -0,0 +1,276 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(NEHALEM) +#define RPREFETCHSIZE 12 +#define WPREFETCHSIZE (RPREFETCHSIZE * 2) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define I %r10 +#define J %rbp + +#define AO1 %r9 +#define AO2 %r15 +#define AO3 %r11 +#define AO4 %r14 +#define BO1 %r13 +#define M8 %rbx +#define BO %rax + +#else + +#define STACKSIZE 256 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 64 + STACKSIZE(%rsp) + +#define B %rdi + +#define I %r10 +#define J %r11 + +#define AO1 %r12 +#define AO2 %r13 +#define AO3 %r14 +#define AO4 %r15 + +#define BO1 %rsi +#define M8 %rbp +#define BO %rax + +#endif + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %rdi + pushq %rsi +#endif + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + pushq %rbx + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + movups %xmm7, 16(%rsp) + movups %xmm8, 32(%rsp) + movups %xmm9, 48(%rsp) + movups %xmm10, 64(%rsp) + movups %xmm11, 80(%rsp) + movups %xmm12, 96(%rsp) + movups %xmm13, 112(%rsp) + movups %xmm14, 128(%rsp) + movups %xmm15, 144(%rsp) + + movq OLD_B, B +#endif + + movq N, %rax + andq $-2, %rax + imulq M, %rax + + leaq (B, %rax, SIZE), BO1 + + leaq (, LDA, SIZE), LDA + leaq (, M, SIZE), M8 + + movq M, J + sarq $1, J + jle .L20 + ALIGN_4 + +.L11: + movq A, AO1 + leaq (A, LDA ), AO2 + leaq (A, LDA, 2), A + + movq B, BO + addq $4 * SIZE, B + + movq N, I + sarq $1, I + jle .L14 + ALIGN_4 + +.L12: +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + movhps 0 * SIZE(AO2), %xmm0 + + movaps %xmm0, 0 * SIZE(BO) +#else + PREFETCH RPREFETCHSIZE * SIZE(AO1) + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + PREFETCH RPREFETCHSIZE * SIZE(AO2) + movsd 0 * SIZE(AO2), %xmm1 + movhpd 1 * SIZE(AO2), %xmm1 + + PREFETCHW WPREFETCHSIZE * SIZE(BO) + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) +#endif + + leaq (BO, M8, 2), BO + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + decq I + jg .L12 + ALIGN_4 + +.L14: + testq $1, N + jle .L19 + +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss 0 * SIZE(AO2), %xmm1 + + movss %xmm0, 0 * SIZE(BO1) + movss %xmm1, 1 * SIZE(BO1) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 0 * SIZE(AO2), %xmm0 + + movapd %xmm0, 0 * SIZE(BO1) +#endif + + addq $2 * SIZE, BO1 + ALIGN_4 + +.L19: + decq J + jg .L11 + ALIGN_4 + +.L20: + testq $1, M + jle .L999 + ALIGN_4 + +.L31: + movq A, AO1 + movq B, BO + + movq N, I + sarq $1, I + jle .L33 + ALIGN_4 + +.L32: +#ifndef DOUBLE + movsd 0 * SIZE(AO1), %xmm0 + movsd %xmm0, 0 * SIZE(BO) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movapd %xmm0, 0 * SIZE(BO) +#endif + + addq $2 * SIZE, AO1 + leaq (BO, M8, 2), BO + decq I + jg .L32 + ALIGN_4 + +.L33: + testq $1, N + jle .L999 + +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss %xmm0, 0 * SIZE(BO1) +#else + movsd 0 * SIZE(AO1), %xmm0 + movsd %xmm0, 0 * SIZE(BO1) +#endif + addq $1 * SIZE, BO1 + ALIGN_4 + +.L999: +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + movups 16(%rsp), %xmm7 + movups 32(%rsp), %xmm8 + movups 48(%rsp), %xmm9 + movups 64(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + movups 96(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups 128(%rsp), %xmm14 + movups 144(%rsp), %xmm15 + + addq $STACKSIZE, %rsp +#endif + + popq %rbx + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 +#ifdef WINDOWS_ABI + popq %rsi + popq %rdi +#endif + + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_tcopy_4.S b/kernel/x86_64/gemm_tcopy_4.S new file mode 100644 index 0000000..877969f --- /dev/null +++ b/kernel/x86_64/gemm_tcopy_4.S @@ -0,0 +1,544 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(PENTIUM4) || defined(GENERIC) +#define RPREFETCHSIZE 16 +#define WPREFETCHSIZE (RPREFETCHSIZE * 4) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#define RPREFETCHSIZE 12 +#define WPREFETCHSIZE (RPREFETCHSIZE * 4) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht2 +#endif + +#ifdef ATOM +#define RPREFETCHSIZE 16 +#define WPREFETCHSIZE (RPREFETCHSIZE * 4) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifdef NANO +#define RPREFETCHSIZE 8 +#define WPREFETCHSIZE (RPREFETCHSIZE * 4) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifdef BARCELONA +#define RPREFETCHSIZE 8 +#define WPREFETCHSIZE (RPREFETCHSIZE * 4) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#endif + +#ifdef GENERIC +#define RPREFETCHSIZE 16 +#define WPREFETCHSIZE (RPREFETCHSIZE * 4) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define I %r10 +#define J %rbp + +#define AO1 %r9 +#define AO2 %r15 +#define AO3 %r11 +#define AO4 %r14 +#define BO1 %r13 +#define BO2 %r12 +#define M8 %rbx +#define BO %rax + +#else + +#define STACKSIZE 256 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 64 + STACKSIZE(%rsp) + +#define B %rdi + +#define I %r10 +#define J %r11 + +#define AO1 %r12 +#define AO2 %r13 +#define AO3 %r14 +#define AO4 %r15 + +#define BO1 %rsi +#define BO2 %rbx +#define M8 %rbp +#define BO %rax + +#endif + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %rdi + pushq %rsi +#endif + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + pushq %rbx + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + movups %xmm7, 16(%rsp) + movups %xmm8, 32(%rsp) + movups %xmm9, 48(%rsp) + movups %xmm10, 64(%rsp) + movups %xmm11, 80(%rsp) + movups %xmm12, 96(%rsp) + movups %xmm13, 112(%rsp) + movups %xmm14, 128(%rsp) + movups %xmm15, 144(%rsp) + + movq OLD_B, B +#endif + + movq N, %rax + movq N, %rbx + andq $-4, %rax + andq $-2, %rbx + imulq M, %rax + imulq M, %rbx + + leaq (B, %rax, SIZE), BO1 + leaq (B, %rbx, SIZE), BO2 + + leaq (, LDA, SIZE), LDA + leaq (, M, SIZE), M8 + + movq M, J + sarq $2, J + jle .L20 + ALIGN_4 + +.L11: + movq A, AO1 + leaq (A, LDA ), AO2 + leaq (A, LDA, 2), AO3 + leaq (AO2, LDA, 2), AO4 + leaq (A, LDA, 4), A + + movq B, BO + addq $16 * SIZE, B + + movq N, I + sarq $2, I + jle .L13 + ALIGN_4 + +.L12: +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + movhps 2 * SIZE(AO1), %xmm0 + movlps 0 * SIZE(AO2), %xmm1 + movhps 2 * SIZE(AO2), %xmm1 + + movlps 0 * SIZE(AO3), %xmm2 + movhps 2 * SIZE(AO3), %xmm2 + movlps 0 * SIZE(AO4), %xmm3 + movhps 2 * SIZE(AO4), %xmm3 + +#if defined(PENTIUM4) || defined(GENERIC) + PREFETCH RPREFETCHSIZE * SIZE(AO1) + PREFETCH RPREFETCHSIZE * SIZE(AO2) + PREFETCH RPREFETCHSIZE * SIZE(AO3) + PREFETCH RPREFETCHSIZE * SIZE(AO4) + + PREFETCHW WPREFETCHSIZE * SIZE(BO) +#endif + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) +#else + + PREFETCH RPREFETCHSIZE * SIZE(AO1) + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movsd 2 * SIZE(AO1), %xmm1 + movhpd 3 * SIZE(AO1), %xmm1 + + PREFETCH RPREFETCHSIZE * SIZE(AO2) + movsd 0 * SIZE(AO2), %xmm2 + movhpd 1 * SIZE(AO2), %xmm2 + movsd 2 * SIZE(AO2), %xmm3 + movhpd 3 * SIZE(AO2), %xmm3 + + PREFETCH RPREFETCHSIZE * SIZE(AO3) + movsd 0 * SIZE(AO3), %xmm4 + movhpd 1 * SIZE(AO3), %xmm4 + movsd 2 * SIZE(AO3), %xmm5 + movhpd 3 * SIZE(AO3), %xmm5 + + PREFETCH RPREFETCHSIZE * SIZE(AO4) + movsd 0 * SIZE(AO4), %xmm6 + movhpd 1 * SIZE(AO4), %xmm6 + movsd 2 * SIZE(AO4), %xmm7 + movhpd 3 * SIZE(AO4), %xmm7 + + PREFETCHW WPREFETCHSIZE * SIZE(BO) + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + movapd %xmm2, 4 * SIZE(BO) + movapd %xmm3, 6 * SIZE(BO) + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) + PREFETCHW (WPREFETCHSIZE + 8) * SIZE(B) +#endif + movapd %xmm4, 8 * SIZE(BO) + movapd %xmm5, 10 * SIZE(BO) + movapd %xmm6, 12 * SIZE(BO) + movapd %xmm7, 14 * SIZE(BO) +#endif + + leaq (BO, M8, 4), BO + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + addq $4 * SIZE, AO3 + addq $4 * SIZE, AO4 + decq I + jg .L12 + ALIGN_4 + +.L13: + testq $2, N + jle .L14 + +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + movhps 0 * SIZE(AO2), %xmm0 + + movlps 0 * SIZE(AO3), %xmm1 + movhps 0 * SIZE(AO4), %xmm1 + + movaps %xmm0, 0 * SIZE(BO1) + movaps %xmm1, 4 * SIZE(BO1) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + movhpd 1 * SIZE(AO2), %xmm1 + + movsd 0 * SIZE(AO3), %xmm2 + movhpd 1 * SIZE(AO3), %xmm2 + movsd 0 * SIZE(AO4), %xmm3 + movhpd 1 * SIZE(AO4), %xmm3 + + movapd %xmm0, 0 * SIZE(BO1) + movapd %xmm1, 2 * SIZE(BO1) + movapd %xmm2, 4 * SIZE(BO1) + movapd %xmm3, 6 * SIZE(BO1) +#endif + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + addq $2 * SIZE, AO3 + addq $2 * SIZE, AO4 + addq $8 * SIZE, BO1 + ALIGN_4 + +.L14: + testq $1, N + jle .L19 + +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss 0 * SIZE(AO2), %xmm1 + movss 0 * SIZE(AO3), %xmm2 + movss 0 * SIZE(AO4), %xmm3 + + movss %xmm0, 0 * SIZE(BO2) + movss %xmm1, 1 * SIZE(BO2) + movss %xmm2, 2 * SIZE(BO2) + movss %xmm3, 3 * SIZE(BO2) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 0 * SIZE(AO2), %xmm0 + movsd 0 * SIZE(AO3), %xmm1 + movhpd 0 * SIZE(AO4), %xmm1 + + movapd %xmm0, 0 * SIZE(BO2) + movapd %xmm1, 2 * SIZE(BO2) +#endif + + addq $4 * SIZE, BO2 + ALIGN_4 + +.L19: + decq J + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + jle .L30 + ALIGN_4 + +.L21: + movq A, AO1 + leaq (A, LDA ), AO2 + leaq (A, LDA, 2), A + + movq B, BO + addq $8 * SIZE, B + + movq N, I + sarq $2, I + jle .L23 + ALIGN_4 + +.L22: +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + movhps 2 * SIZE(AO1), %xmm0 + + movlps 0 * SIZE(AO2), %xmm1 + movhps 2 * SIZE(AO2), %xmm1 + +#if defined(PENTIUM4) || defined(GENERIC) + PREFETCH RPREFETCHSIZE * SIZE(AO1) + PREFETCH RPREFETCHSIZE * SIZE(AO2) + PREFETCHW WPREFETCHSIZE * SIZE(BO) +#endif + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movsd 2 * SIZE(AO1), %xmm1 + movhpd 3 * SIZE(AO1), %xmm1 + + movsd 0 * SIZE(AO2), %xmm2 + movhpd 1 * SIZE(AO2), %xmm2 + movsd 2 * SIZE(AO2), %xmm3 + movhpd 3 * SIZE(AO2), %xmm3 + +#if defined(PENTIUM4) || defined(GENERIC) + PREFETCH RPREFETCHSIZE * SIZE(AO1) + PREFETCH RPREFETCHSIZE * SIZE(AO2) + PREFETCHW WPREFETCHSIZE * SIZE(BO) +#endif + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + movapd %xmm2, 4 * SIZE(BO) + movapd %xmm3, 6 * SIZE(BO) +#endif + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + leaq (BO, M8, 4), BO + decq I + jg .L22 + ALIGN_4 + +.L23: + testq $2, N + jle .L24 + +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + movhps 0 * SIZE(AO2), %xmm0 + + movaps %xmm0, 0 * SIZE(BO1) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + movhpd 1 * SIZE(AO2), %xmm1 + + movapd %xmm0, 0 * SIZE(BO1) + movapd %xmm1, 2 * SIZE(BO1) +#endif + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + addq $4 * SIZE, BO1 + ALIGN_4 + +.L24: + testq $1, N + jle .L30 + +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss 0 * SIZE(AO2), %xmm1 + + movss %xmm0, 0 * SIZE(BO2) + movss %xmm1, 1 * SIZE(BO2) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 0 * SIZE(AO2), %xmm0 + + movapd %xmm0, 0 * SIZE(BO2) +#endif + addq $2 * SIZE, BO2 + ALIGN_4 + +.L30: + testq $1, M + jle .L999 + ALIGN_4 + +.L31: + movq A, AO1 + movq B, BO + + movq N, I + sarq $2, I + jle .L33 + ALIGN_4 + +.L32: +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + movhps 2 * SIZE(AO1), %xmm0 + + movaps %xmm0, 0 * SIZE(BO) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movsd 2 * SIZE(AO1), %xmm1 + movhpd 3 * SIZE(AO1), %xmm1 + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) +#endif + + addq $4 * SIZE, AO1 + leaq (BO, M8, 4), BO + decq I + jg .L32 + ALIGN_4 + +.L33: + testq $2, N + jle .L34 + +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + + movlps %xmm0, 0 * SIZE(BO1) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + + movapd %xmm0, 0 * SIZE(BO1) +#endif + + addq $2 * SIZE, AO1 + addq $2 * SIZE, BO1 + ALIGN_4 + +.L34: + testq $1, N + jle .L999 + +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss %xmm0, 0 * SIZE(BO2) +#else + movsd 0 * SIZE(AO1), %xmm0 + movsd %xmm0, 0 * SIZE(BO2) +#endif + addq $1 * SIZE, BO2 + ALIGN_4 + +.L999: +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + movups 16(%rsp), %xmm7 + movups 32(%rsp), %xmm8 + movups 48(%rsp), %xmm9 + movups 64(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + movups 96(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups 128(%rsp), %xmm14 + movups 144(%rsp), %xmm15 + + addq $STACKSIZE, %rsp +#endif + + popq %rbx + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 +#ifdef WINDOWS_ABI + popq %rsi + popq %rdi +#endif + + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_tcopy_4_opteron.S b/kernel/x86_64/gemm_tcopy_4_opteron.S new file mode 100644 index 0000000..459eeb8 --- /dev/null +++ b/kernel/x86_64/gemm_tcopy_4_opteron.S @@ -0,0 +1,476 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(BARCELONA) || defined(SHANGHAI) +#define RPREFETCHSIZE (12 + 4) +#define WPREFETCHSIZE (12 + 4) +#define MOVNTQ MOVQ +#else +#define RPREFETCHSIZE (12 + 4) +#define WPREFETCHSIZE (12 + 4) +#define MOVNTQ MOVQ +#endif + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define I %r10 +#define J %rbp + +#define AO1 %r9 +#define AO2 %r15 +#define AO3 %r11 +#define AO4 %r14 +#define BO1 %r13 +#define BO2 %r12 +#define M8 %rbx +#define BO %rax + +#else + +#define STACKSIZE 256 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 64 + STACKSIZE(%rsp) + +#define B %rdi + +#define I %r10 +#define J %r11 + +#define AO1 %r12 +#define AO2 %r13 +#define AO3 %r14 +#define AO4 %r15 + +#define BO1 %rsi +#define BO2 %rbx +#define M8 %rbp +#define BO %rax + +#endif + +#if defined(BARCELONA) || defined(SHANGHAI) +#define RPREFETCH prefetch +#else +#define RPREFETCH prefetch +#endif + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %rdi + pushq %rsi +#endif + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + pushq %rbx + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + movups %xmm7, 16(%rsp) + movups %xmm8, 32(%rsp) + movups %xmm9, 48(%rsp) + movups %xmm10, 64(%rsp) + movups %xmm11, 80(%rsp) + movups %xmm12, 96(%rsp) + movups %xmm13, 112(%rsp) + movups %xmm14, 128(%rsp) + movups %xmm15, 144(%rsp) + + movq OLD_B, B +#endif + + movq N, %rax + movq N, %rbx + andq $-4, %rax + andq $-2, %rbx + imulq M, %rax + imulq M, %rbx + + EMMS + + leaq (B, %rax, SIZE), BO1 + leaq (B, %rbx, SIZE), BO2 + + leaq (, LDA, SIZE), LDA + leaq (, M, SIZE), M8 + movq M, J + sarq $2, J + jle .L20 + ALIGN_4 + +.L11: +#if 0 + movq A, AO1 + leaq (A, LDA, 1), AO2 + leaq (A, LDA, 2), AO3 + leaq (AO2, LDA, 2), AO4 + + movq N, I + sarq $3, I + jle .L13 + ALIGN_4 + +.L12: + MOVQ 0 * SIZE(AO1), %mm0 + addq $8 * SIZE, AO1 + MOVQ 0 * SIZE(AO2), %mm1 + addq $8 * SIZE, AO2 + MOVQ 0 * SIZE(AO3), %mm2 + addq $8 * SIZE, AO3 + MOVQ 0 * SIZE(AO4), %mm3 + addq $8 * SIZE, AO4 + + decq I + jg .L12 + ALIGN_4 + +.L13: +#endif + + movq A, AO1 + leaq (A, LDA ), AO2 + leaq (A, LDA, 2), AO3 + leaq (AO2, LDA, 2), AO4 + leaq (A, LDA, 4), A + + movq B, BO + addq $16 * SIZE, B + + movq N, I + sarq $2, I + jle .L15 + ALIGN_4 + +.L14: + + RPREFETCH (RPREFETCHSIZE) * SIZE(AO1) + + MOVQ 0 * SIZE(AO1), %mm0 + MOVNTQ %mm0, 0 * SIZE(BO) + MOVQ 1 * SIZE(AO1), %mm1 + MOVNTQ %mm1, 1 * SIZE(BO) + + RPREFETCH (RPREFETCHSIZE) * SIZE(AO2) + + MOVQ 2 * SIZE(AO1), %mm2 + MOVNTQ %mm2, 2 * SIZE(BO) + MOVQ 3 * SIZE(AO1), %mm3 + MOVNTQ %mm3, 3 * SIZE(BO) + + prefetchw (WPREFETCHSIZE + 0) * SIZE(B) + MOVQ 0 * SIZE(AO2), %mm4 + MOVNTQ %mm4, 4 * SIZE(BO) + MOVQ 1 * SIZE(AO2), %mm5 + MOVNTQ %mm5, 5 * SIZE(BO) + MOVQ 2 * SIZE(AO2), %mm6 + MOVNTQ %mm6, 6 * SIZE(BO) + MOVQ 3 * SIZE(AO2), %mm7 + MOVNTQ %mm7, 7 * SIZE(BO) + + + RPREFETCH (RPREFETCHSIZE) * SIZE(AO3) + + MOVQ 0 * SIZE(AO3), %mm0 + MOVNTQ %mm0, 8 * SIZE(BO) + MOVQ 1 * SIZE(AO3), %mm1 + MOVNTQ %mm1, 9 * SIZE(BO) + + RPREFETCH (RPREFETCHSIZE) * SIZE(AO4) + + MOVQ 2 * SIZE(AO3), %mm2 + MOVNTQ %mm2, 10 * SIZE(BO) + MOVQ 3 * SIZE(AO3), %mm3 + MOVNTQ %mm3, 11 * SIZE(BO) + + prefetchw (WPREFETCHSIZE + 8) * SIZE(B) + MOVQ 0 * SIZE(AO4), %mm4 + MOVNTQ %mm4, 12 * SIZE(BO) + MOVQ 1 * SIZE(AO4), %mm5 + MOVNTQ %mm5, 13 * SIZE(BO) + MOVQ 2 * SIZE(AO4), %mm6 + MOVNTQ %mm6, 14 * SIZE(BO) + MOVQ 3 * SIZE(AO4), %mm7 + MOVNTQ %mm7, 15 * SIZE(BO) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + addq $4 * SIZE, AO3 + addq $4 * SIZE, AO4 + + leaq (BO, M8, 4), BO + decq I + jg .L14 + ALIGN_4 + +.L15: + testq $2, N + jle .L16 + + MOVQ 0 * SIZE(AO1), %mm0 + MOVQ 1 * SIZE(AO1), %mm1 + MOVQ 0 * SIZE(AO2), %mm2 + MOVQ 1 * SIZE(AO2), %mm3 + + MOVQ 0 * SIZE(AO3), %mm4 + MOVQ 1 * SIZE(AO3), %mm5 + MOVQ 0 * SIZE(AO4), %mm6 + MOVQ 1 * SIZE(AO4), %mm7 + + MOVNTQ %mm0, 0 * SIZE(BO1) + MOVNTQ %mm1, 1 * SIZE(BO1) + MOVNTQ %mm2, 2 * SIZE(BO1) + MOVNTQ %mm3, 3 * SIZE(BO1) + MOVNTQ %mm4, 4 * SIZE(BO1) + MOVNTQ %mm5, 5 * SIZE(BO1) + MOVNTQ %mm6, 6 * SIZE(BO1) + MOVNTQ %mm7, 7 * SIZE(BO1) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + addq $2 * SIZE, AO3 + addq $2 * SIZE, AO4 + addq $8 * SIZE, BO1 + ALIGN_4 + +.L16: + testq $1, N + jle .L19 + + MOVQ 0 * SIZE(AO1), %mm0 + MOVQ 0 * SIZE(AO2), %mm1 + MOVQ 0 * SIZE(AO3), %mm2 + MOVQ 0 * SIZE(AO4), %mm3 + + MOVNTQ %mm0, 0 * SIZE(BO2) + MOVNTQ %mm1, 1 * SIZE(BO2) + MOVNTQ %mm2, 2 * SIZE(BO2) + MOVNTQ %mm3, 3 * SIZE(BO2) + + addq $4 * SIZE, BO2 + ALIGN_4 + +.L19: + decq J + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + jle .L30 + ALIGN_4 + +.L21: + movq A, AO1 + leaq (A, LDA ), AO2 + leaq (A, LDA, 2), A + + movq B, BO + addq $8 * SIZE, B + + movq N, I + sarq $2, I + jle .L23 + ALIGN_4 + +.L22: + RPREFETCH (RPREFETCHSIZE) * SIZE(AO1) + MOVQ 0 * SIZE(AO1), %mm0 + MOVQ 1 * SIZE(AO1), %mm1 + MOVQ 2 * SIZE(AO1), %mm2 + MOVQ 3 * SIZE(AO1), %mm3 + + RPREFETCH (RPREFETCHSIZE) * SIZE(AO2) + MOVQ 0 * SIZE(AO2), %mm4 + MOVQ 1 * SIZE(AO2), %mm5 + MOVQ 2 * SIZE(AO2), %mm6 + MOVQ 3 * SIZE(AO2), %mm7 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(B) + MOVNTQ %mm0, 0 * SIZE(BO) + MOVNTQ %mm1, 1 * SIZE(BO) + MOVNTQ %mm2, 2 * SIZE(BO) + MOVNTQ %mm3, 3 * SIZE(BO) + MOVNTQ %mm4, 4 * SIZE(BO) + MOVNTQ %mm5, 5 * SIZE(BO) + MOVNTQ %mm6, 6 * SIZE(BO) + MOVNTQ %mm7, 7 * SIZE(BO) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + leaq (BO, M8, 4), BO + decq I + jg .L22 + ALIGN_4 + +.L23: + testq $2, N + jle .L24 + + MOVQ 0 * SIZE(AO1), %mm0 + MOVQ 1 * SIZE(AO1), %mm1 + MOVQ 0 * SIZE(AO2), %mm2 + MOVQ 1 * SIZE(AO2), %mm3 + + MOVNTQ %mm0, 0 * SIZE(BO1) + MOVNTQ %mm1, 1 * SIZE(BO1) + MOVNTQ %mm2, 2 * SIZE(BO1) + MOVNTQ %mm3, 3 * SIZE(BO1) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + addq $4 * SIZE, BO1 + ALIGN_4 + +.L24: + testq $1, N + jle .L30 + + MOVQ 0 * SIZE(AO1), %mm0 + MOVQ 0 * SIZE(AO2), %mm1 + + MOVNTQ %mm0, 0 * SIZE(BO2) + MOVNTQ %mm1, 1 * SIZE(BO2) + + addq $2 * SIZE, BO2 + ALIGN_4 + +.L30: + testq $1, M + jle .L999 + ALIGN_4 + +.L31: + movq A, AO1 + movq B, BO + + movq N, I + sarq $2, I + jle .L33 + ALIGN_4 + +.L32: + MOVQ 0 * SIZE(AO1), %mm0 + MOVQ 1 * SIZE(AO1), %mm1 + MOVQ 2 * SIZE(AO1), %mm2 + MOVQ 3 * SIZE(AO1), %mm3 + + MOVNTQ %mm0, 0 * SIZE(BO) + MOVNTQ %mm1, 1 * SIZE(BO) + MOVNTQ %mm2, 2 * SIZE(BO) + MOVNTQ %mm3, 3 * SIZE(BO) + + addq $4 * SIZE, AO1 + leaq (BO, M8, 4), BO + decq I + jg .L32 + ALIGN_4 + +.L33: + testq $2, N + jle .L34 + + MOVQ 0 * SIZE(AO1), %mm0 + MOVQ 1 * SIZE(AO1), %mm1 + + MOVNTQ %mm0, 0 * SIZE(BO1) + MOVNTQ %mm1, 1 * SIZE(BO1) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, BO1 + ALIGN_4 + +.L34: + testq $1, N + jle .L999 + + MOVQ 0 * SIZE(AO1), %mm0 + MOVNTQ %mm0, 0 * SIZE(BO2) + + addq $1 * SIZE, BO2 + ALIGN_4 + +.L999: + EMMS + +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + movups 16(%rsp), %xmm7 + movups 32(%rsp), %xmm8 + movups 48(%rsp), %xmm9 + movups 64(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + movups 96(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups 128(%rsp), %xmm14 + movups 144(%rsp), %xmm15 + + addq $STACKSIZE, %rsp +#endif + + popq %rbx + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 +#ifdef WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/iamax.S b/kernel/x86_64/iamax.S new file mode 100644 index 0000000..27637c5 --- /dev/null +++ b/kernel/x86_64/iamax.S @@ -0,0 +1,352 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define X ARG2 +#define INCX ARG3 + +#define RET %rax +#define I ARG4 +#define NUM %r10 + +#ifndef USE_MIN +#define FMOV fcmovbe +#define IMOV cmovnbe +#else +#define FMOV fcmovnbe +#define IMOV cmovb +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + salq $BASE_SHIFT, INCX + + fldz + xorq RET, RET + + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + ffreep %st + movq $2, NUM + movq $1, RET + + FLD (X) +#ifdef USE_ABS + fabs +#endif + addq INCX, X + decq M + jle .L999 + + cmpq $SIZE, INCX + jne .L40 + + movq M, I + sarq $3, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 1 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 2 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 3 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 4 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 5 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 6 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 7 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + addq $8 * SIZE, X + + decq I + jg .L10 + ALIGN_4 + +.L20: + movq M, I + andq $7, I + jle .L999 + ALIGN_4 + + +.L21: + FLD 0 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + + addq $1 * SIZE, X + incq NUM + decq I + jg .L21 + jmp .L999 + ALIGN_4 + +.L40: + movq M, I + sarq $3, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + decq I + jg .L50 + ALIGN_4 + +.L60: + movq M, I + andq $7, I + jle .L999 + ALIGN_4 + + +.L61: + FLD 0 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + addq INCX, X + decq I + jg .L61 + ALIGN_4 + +.L999: + ffreep %st + ret + + EPILOGUE diff --git a/kernel/x86_64/iamax_sse.S b/kernel/x86_64/iamax_sse.S new file mode 100644 index 0000000..8b7de07 --- /dev/null +++ b/kernel/x86_64/iamax_sse.S @@ -0,0 +1,1020 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define RET %rax +#define I ARG4 +#define XX %r10 +#define MM %r11 + +#ifdef USE_MIN +#define maxps minps +#define maxss minss +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + pxor %xmm0, %xmm0 /* Return Value(Float) */ + xor RET, RET /* Return Value(Int) */ + testq M, M + jle .L999 + leaq (, INCX, SIZE), INCX + testq INCX, INCX + jle .L999 + + movq M, MM + movq X, XX + +#ifdef USE_ABS + pcmpeqb %xmm15, %xmm15 + psrld $1, %xmm15 /* Generate USE_ABS */ +#endif + + movss (X), %xmm0 + addq INCX, X + decq M + shufps $0, %xmm0, %xmm0 +#ifdef USE_ABS + andps %xmm15, %xmm0 +#endif + movaps %xmm0, %xmm1 + movaps %xmm0, %xmm2 + movaps %xmm0, %xmm3 /* Generating "seed value" */ + cmpq $SIZE, INCX + jne .L80 /* Incx != 1 goto L80 */ + +/* Analigned Check */ + testq $3, X /* 00000011 */ + jne .L30 /* Purely Unaligned Mode */ + + cmpq $8, M + jle .L30 /* if M <= 8 goto Unaligned mode */ + + testq $4, X /* bit test 000100 */ + je .L05 + + movss 0 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxss %xmm4, %xmm0 + decq M + addq $SIZE, X + ALIGN_3 + +.L05: + testq $8, X + je .L06 + + movsd 0 * SIZE(X), %xmm4 + unpcklps %xmm4, %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxps %xmm4, %xmm1 + subq $2, M + addq $2 * SIZE, X + ALIGN_3 + +.L06: + movq M, I + sarq $4, I + jle .L15 + ALIGN_4 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps 0 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxps %xmm4, %xmm0 + + movaps 4 * SIZE(X), %xmm5 +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxps %xmm5, %xmm1 + + movaps 8 * SIZE(X), %xmm6 +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxps %xmm6, %xmm2 + + movaps 12 * SIZE(X), %xmm7 +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxps %xmm7, %xmm3 + + addq $16 * SIZE, X + decq I + jg .L11 + ALIGN_4 + +.L15: + andq $15, M + jle .L20 + + testq $8, M + je .L16 + + movaps 0 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxps %xmm4, %xmm0 + + movaps 4 * SIZE(X), %xmm5 +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxps %xmm5, %xmm1 + addq $8 * SIZE, X + ALIGN_3 + +.L16: + testq $4, M + je .L17 + + movaps 0 * SIZE(X), %xmm6 +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxps %xmm6, %xmm2 + addq $4 * SIZE, X + ALIGN_3 + +.L17: + testq $2, M + je .L18 + + movsd 0 * SIZE(X), %xmm7 + unpcklps %xmm7, %xmm7 +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxps %xmm7, %xmm3 + addq $2 * SIZE, X + +.L18: + testq $1, M + je .L20 + + movss 0 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxss %xmm4, %xmm0 + ALIGN_3 + +.L20: + movq XX, X + movq MM, M + + maxps %xmm1, %xmm0 + maxps %xmm3, %xmm2 + maxps %xmm2, %xmm0 + movaps %xmm0, %xmm1 + movhlps %xmm0, %xmm0 + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + maxss %xmm1, %xmm0 + shufps $0, %xmm0, %xmm0 + + testq $4, X + je .L21 + + movss 0 * SIZE(X), %xmm1 + + decq M + addq $SIZE, X + +#ifdef USE_ABS + andps %xmm15, %xmm1 +#endif + incq RET + comiss %xmm0, %xmm1 + je .L999 + ALIGN_3 + +.L21: + testq $8, X + je .L22 + + movss 0 * SIZE(X), %xmm1 + movss 1 * SIZE(X), %xmm2 + + subq $2, M + addq $2 * SIZE, X + +#ifdef USE_ABS + andps %xmm15, %xmm1 + andps %xmm15, %xmm2 +#endif + incq RET + comiss %xmm0, %xmm1 + je .L999 + incq RET + comiss %xmm0, %xmm2 + je .L999 + ALIGN_3 + +.L22: + movq M, I + sarq $3, I + jle .L25 + ALIGN_4 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps 0 * SIZE(X), %xmm1 +#ifdef USE_ABS + andps %xmm15, %xmm1 +#endif + cmpeqps %xmm0, %xmm1 + + movaps 4 * SIZE(X), %xmm3 +#ifdef USE_ABS + andps %xmm15, %xmm3 +#endif + cmpeqps %xmm0, %xmm3 + + orps %xmm3, %xmm1 +#ifndef C_SUN + movmskps %xmm1, %r11 +#else + .long 0xd9500f4c +#endif + testq $15, %r11 + jne .L24 + + addq $8 * SIZE, X + addq $8, RET + decq I + jg .L23 + jmp .L25 + ALIGN_3 + +.L24: + movss 0 * SIZE(X), %xmm1 + movss 1 * SIZE(X), %xmm2 + movss 2 * SIZE(X), %xmm3 + movss 3 * SIZE(X), %xmm4 + movss 4 * SIZE(X), %xmm5 + movss 5 * SIZE(X), %xmm6 + movss 6 * SIZE(X), %xmm7 + movss 7 * SIZE(X), %xmm8 +#ifdef USE_ABS + andps %xmm15, %xmm1 + andps %xmm15, %xmm2 + andps %xmm15, %xmm3 + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + andps %xmm15, %xmm8 +#endif + + addq $8 * SIZE, X + + incq RET + comiss %xmm0, %xmm1 + je .L999 + incq RET + comiss %xmm0, %xmm2 + je .L999 + incq RET + comiss %xmm0, %xmm3 + je .L999 + incq RET + comiss %xmm0, %xmm4 + je .L999 + incq RET + comiss %xmm0, %xmm5 + je .L999 + incq RET + comiss %xmm0, %xmm6 + je .L999 + incq RET + comiss %xmm0, %xmm7 + je .L999 + incq RET + jmp .L999 + ALIGN_4 + +.L25: + testq $4, M + je .L26 + + movss 0 * SIZE(X), %xmm1 + movss 1 * SIZE(X), %xmm2 + movss 2 * SIZE(X), %xmm3 + movss 3 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm1 + andps %xmm15, %xmm2 + andps %xmm15, %xmm3 + andps %xmm15, %xmm4 +#endif + addq $4 * SIZE, X + incq RET + comiss %xmm0, %xmm1 + je .L999 + incq RET + comiss %xmm0, %xmm2 + je .L999 + incq RET + comiss %xmm0, %xmm3 + je .L999 + incq RET + comiss %xmm0, %xmm4 + je .L999 + ALIGN_3 + +.L26: + testq $2, M + je .L27 + + movss 0 * SIZE(X), %xmm1 + movss 1 * SIZE(X), %xmm2 +#ifdef USE_ABS + andps %xmm15, %xmm1 + andps %xmm15, %xmm2 +#endif + addq $2 * SIZE, X + incq RET + comiss %xmm0, %xmm1 + je .L999 + incq RET + comiss %xmm0, %xmm2 + je .L999 + ALIGN_3 + +.L27: + incq RET + jmp .L999 + ALIGN_3 + +/* Unaligned Mode */ +.L30: + movq M, I + sarq $4, I + jle .L35 + ALIGN_4 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm4 + movhps 2 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxps %xmm4, %xmm0 + + movsd 4 * SIZE(X), %xmm5 + movhps 6 * SIZE(X), %xmm5 +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxps %xmm5, %xmm1 + + movsd 8 * SIZE(X), %xmm6 + movhps 10 * SIZE(X), %xmm6 +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxps %xmm6, %xmm2 + + movsd 12 * SIZE(X), %xmm7 + movhps 14 * SIZE(X), %xmm7 +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxps %xmm7, %xmm3 + + addq $16 * SIZE, X + decq I + jg .L31 + ALIGN_4 + +.L35: + andq $15, M + jle .L40 + + testq $8, M + je .L36 + + movsd 0 * SIZE(X), %xmm4 + movhps 2 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxps %xmm4, %xmm0 + + movsd 4 * SIZE(X), %xmm5 + movhps 6 * SIZE(X), %xmm5 +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxps %xmm5, %xmm1 + + addq $8 * SIZE, X + ALIGN_3 + +.L36: + testq $4, M + je .L37 + + movsd 0 * SIZE(X), %xmm6 + movhps 2 * SIZE(X), %xmm6 +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxps %xmm6, %xmm2 + addq $4 * SIZE, X + ALIGN_3 + +.L37: + testq $2, M + je .L38 + + movsd 0 * SIZE(X), %xmm7 + unpcklps %xmm7, %xmm7 +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxps %xmm7, %xmm3 + addq $2 * SIZE, X + +.L38: + testq $1, M + je .L40 + + movss 0 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxss %xmm4, %xmm0 + jmp .L40 + ALIGN_4 + +.L40: + movq XX, X + movq MM, M + + maxps %xmm1, %xmm0 + maxps %xmm3, %xmm2 + maxps %xmm2, %xmm0 + movaps %xmm0, %xmm1 + movhlps %xmm0, %xmm0 + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + maxss %xmm1, %xmm0 + shufps $0, %xmm0, %xmm0 + + movq M, I + sarq $3, I + jle .L45 + ALIGN_4 + +.L43: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm1 + movhps 2 * SIZE(X), %xmm1 +#ifdef USE_ABS + andps %xmm15, %xmm1 +#endif + cmpeqps %xmm0, %xmm1 + + movsd 4 * SIZE(X), %xmm3 + movhps 6 * SIZE(X), %xmm3 +#ifdef USE_ABS + andps %xmm15, %xmm3 +#endif + cmpeqps %xmm0, %xmm3 + + orps %xmm3, %xmm1 +#ifndef C_SUN + movmskps %xmm1, %r11 +#else + .long 0xd9500f4c +#endif + testq $15, %r11 + jne .L44 + + addq $8 * SIZE, X + addq $8, RET + decq I + jg .L43 + jmp .L45 + ALIGN_3 + +.L44: + movss 0 * SIZE(X), %xmm1 + movss 1 * SIZE(X), %xmm2 + movss 2 * SIZE(X), %xmm3 + movss 3 * SIZE(X), %xmm4 + movss 4 * SIZE(X), %xmm5 + movss 5 * SIZE(X), %xmm6 + movss 6 * SIZE(X), %xmm7 + movss 7 * SIZE(X), %xmm8 +#ifdef USE_ABS + andps %xmm15, %xmm1 + andps %xmm15, %xmm2 + andps %xmm15, %xmm3 + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + andps %xmm15, %xmm8 +#endif + + addq $8 * SIZE, X + + incq RET + comiss %xmm0, %xmm1 + je .L999 + incq RET + comiss %xmm0, %xmm2 + je .L999 + incq RET + comiss %xmm0, %xmm3 + je .L999 + incq RET + comiss %xmm0, %xmm4 + je .L999 + incq RET + comiss %xmm0, %xmm5 + je .L999 + incq RET + comiss %xmm0, %xmm6 + je .L999 + incq RET + comiss %xmm0, %xmm7 + je .L999 + incq RET + jmp .L999 + ALIGN_4 + +.L45: + testq $4, M + je .L46 + + movss 0 * SIZE(X), %xmm1 + movss 1 * SIZE(X), %xmm2 + movss 2 * SIZE(X), %xmm3 + movss 3 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm1 + andps %xmm15, %xmm2 + andps %xmm15, %xmm3 + andps %xmm15, %xmm4 +#endif + addq $4 * SIZE, X + incq RET + comiss %xmm0, %xmm1 + je .L999 + incq RET + comiss %xmm0, %xmm2 + je .L999 + incq RET + comiss %xmm0, %xmm3 + je .L999 + incq RET + comiss %xmm0, %xmm4 + je .L999 + ALIGN_3 + +.L46: + testq $2, M + je .L47 + + movss 0 * SIZE(X), %xmm1 + movss 1 * SIZE(X), %xmm2 +#ifdef USE_ABS + andps %xmm15, %xmm1 + andps %xmm15, %xmm2 +#endif + addq $2 * SIZE, X + incq RET + comiss %xmm0, %xmm1 + je .L999 + incq RET + comiss %xmm0, %xmm2 + je .L999 + ALIGN_3 + +.L47: + incq RET + jmp .L999 + ALIGN_3 + +.L80: + movq M, I + sarq $3, I + jle .L85 + ALIGN_4 + +.L81: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss 0 * SIZE(X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss 0 * SIZE(X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxss %xmm5, %xmm1 + + movss 0 * SIZE(X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxss %xmm6, %xmm2 + + movss 0 * SIZE(X), %xmm7 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxss %xmm7, %xmm3 + + movss 0 * SIZE(X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss 0 * SIZE(X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxss %xmm5, %xmm1 + + movss 0 * SIZE(X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxss %xmm6, %xmm2 + + movss 0 * SIZE(X), %xmm7 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxss %xmm7, %xmm3 + + decq I + jg .L81 + ALIGN_4 + +.L85: + andq $7, M + jle .L90 + + testq $4, M + je .L86 + + movss 0 * SIZE(X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss 0 * SIZE(X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxss %xmm5, %xmm1 + + movss 0 * SIZE(X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxss %xmm6, %xmm2 + + movss 0 * SIZE(X), %xmm7 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxss %xmm7, %xmm3 + ALIGN_3 + +.L86: + testq $2, M + je .L87 + + movss 0 * SIZE(X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss 0 * SIZE(X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxss %xmm5, %xmm1 + ALIGN_3 + +.L87: + testq $1, M + je .L90 + + movss 0 * SIZE(X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxss %xmm6, %xmm2 + ALIGN_4 + +.L90: + movq XX, X + movq MM, M + + maxss %xmm1, %xmm0 + maxss %xmm3, %xmm2 + maxss %xmm2, %xmm0 + shufps $0, %xmm0, %xmm0 + + movq M, I + sarq $3, I + jle .L95 + ALIGN_4 + +.L93: + movss 0 * SIZE(X), %xmm1 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm1 +#endif + cmpeqss %xmm0, %xmm1 + + movss 0 * SIZE(X), %xmm2 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm2 +#endif + cmpeqss %xmm0, %xmm2 + + movss 0 * SIZE(X), %xmm3 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm3 +#endif + cmpeqss %xmm0, %xmm3 + + movss 0 * SIZE(X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + cmpeqss %xmm0, %xmm4 + + movss 0 * SIZE(X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + cmpeqps %xmm0, %xmm5 + + movss 0 * SIZE(X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + cmpeqss %xmm0, %xmm6 + + movss 0 * SIZE(X), %xmm7 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + cmpeqss %xmm0, %xmm7 + + movss 0 * SIZE(X), %xmm8 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm8 +#endif + cmpeqss %xmm0, %xmm8 + + orps %xmm2, %xmm1 + orps %xmm4, %xmm3 + orps %xmm6, %xmm5 + orps %xmm8, %xmm7 + orps %xmm3, %xmm1 + orps %xmm7, %xmm5 + orps %xmm5, %xmm1 + +#ifndef C_SUN + movmskps %xmm1, %r11 +#else + .long 0xd9500f4c +#endif + testq $15, %r11 + jne .L94 + + addq $8, RET + decq I + jg .L93 + jmp .L95 + ALIGN_3 + +.L94: + subq INCX, X + movss 0 * SIZE(X), %xmm8 + subq INCX, X + movss 0 * SIZE(X), %xmm7 + subq INCX, X + movss 0 * SIZE(X), %xmm6 + subq INCX, X + movss 0 * SIZE(X), %xmm5 + subq INCX, X + movss 0 * SIZE(X), %xmm4 + subq INCX, X + movss 0 * SIZE(X), %xmm3 + subq INCX, X + movss 0 * SIZE(X), %xmm2 + subq INCX, X + movss 0 * SIZE(X), %xmm1 +#ifdef USE_ABS + andps %xmm15, %xmm1 + andps %xmm15, %xmm2 + andps %xmm15, %xmm3 + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + andps %xmm15, %xmm8 +#endif + incq RET + comiss %xmm0, %xmm1 + je .L999 + incq RET + comiss %xmm0, %xmm2 + je .L999 + incq RET + comiss %xmm0, %xmm3 + je .L999 + incq RET + comiss %xmm0, %xmm4 + je .L999 + incq RET + comiss %xmm0, %xmm5 + je .L999 + incq RET + comiss %xmm0, %xmm6 + je .L999 + incq RET + comiss %xmm0, %xmm7 + je .L999 + incq RET + jmp .L999 + ALIGN_4 + +.L95: + testq $4, M + je .L96 + + movss 0 * SIZE(X), %xmm1 + addq INCX, X + movss 0 * SIZE(X), %xmm2 + addq INCX, X + movss 0 * SIZE(X), %xmm3 + addq INCX, X + movss 0 * SIZE(X), %xmm4 + addq INCX, X + +#ifdef USE_ABS + andps %xmm15, %xmm1 + andps %xmm15, %xmm2 + andps %xmm15, %xmm3 + andps %xmm15, %xmm4 +#endif + incq RET + comiss %xmm0, %xmm1 + je .L999 + incq RET + comiss %xmm0, %xmm2 + je .L999 + incq RET + comiss %xmm0, %xmm3 + je .L999 + incq RET + comiss %xmm0, %xmm4 + je .L999 + ALIGN_3 + +.L96: + testq $2, M + je .L97 + + movss 0 * SIZE(X), %xmm1 + addq INCX, X + movss 0 * SIZE(X), %xmm2 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm1 + andps %xmm15, %xmm2 +#endif + incq RET + comiss %xmm0, %xmm1 + je .L999 + incq RET + comiss %xmm0, %xmm2 + je .L999 + ALIGN_3 + +.L97: + incq RET + ALIGN_3 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE + diff --git a/kernel/x86_64/iamax_sse2.S b/kernel/x86_64/iamax_sse2.S new file mode 100644 index 0000000..c17a81a --- /dev/null +++ b/kernel/x86_64/iamax_sse2.S @@ -0,0 +1,1136 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define RET %rax +#define I ARG4 +#define XX %r10 +#define MM %r11 + +#ifdef USE_MIN +#define maxpd minpd +#define maxsd minsd +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + pxor %xmm0, %xmm0 + xor RET, RET + testq M, M + jle .L999 + leaq (, INCX, SIZE), INCX + testq INCX, INCX + jle .L999 + + movq M, MM + movq X, XX + +#ifdef USE_ABS + pcmpeqb %xmm15, %xmm15 + psrlq $1, %xmm15 +#endif + + movsd (X), %xmm0 + addq INCX, X + decq M +#ifdef USE_ABS + andpd %xmm15, %xmm0 +#endif + unpcklpd %xmm0, %xmm0 + movapd %xmm0, %xmm1 + movapd %xmm0, %xmm2 + movapd %xmm0, %xmm3 + cmpq $SIZE, INCX + jne .L80 + +/* Analigned Check */ + cmpq $7, M + jle .L50 + + testq $7, X + jne .L50 # Purely Unaligned Mode + + testq $15, X # Checking for 128bit align + je .L05 + + movsd 0 * SIZE(X), %xmm4 +#ifdef USE_ABS + andpd %xmm15, %xmm4 +#endif + unpcklpd %xmm4, %xmm4 + maxpd %xmm4, %xmm3 + decq M + addq $SIZE, X + ALIGN_3 + +.L05: + movq M, I + sarq $4, I + jle .L15 + ALIGN_4 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movapd 0 * SIZE(X), %xmm4 +#ifdef USE_ABS + andpd %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movapd 2 * SIZE(X), %xmm5 +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movapd 4 * SIZE(X), %xmm6 +#ifdef USE_ABS + andpd %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + + movapd 6 * SIZE(X), %xmm7 +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movapd 8 * SIZE(X), %xmm4 +#ifdef USE_ABS + andpd %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movapd 10 * SIZE(X), %xmm5 +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movapd 12 * SIZE(X), %xmm6 +#ifdef USE_ABS + andpd %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + + movapd 14 * SIZE(X), %xmm7 +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + + addq $16 * SIZE, X + decq I + jg .L11 + ALIGN_4 + +.L15: + andq $15, M + jle .L20 + + testq $8, M + je .L16 + + movapd 0 * SIZE(X), %xmm4 +#ifdef USE_ABS + andpd %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movapd 2 * SIZE(X), %xmm5 +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movapd 4 * SIZE(X), %xmm6 +#ifdef USE_ABS + andpd %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + + movapd 6 * SIZE(X), %xmm7 +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + addq $8 * SIZE, X + ALIGN_3 + +.L16: + testq $4, M + je .L17 + + movapd 0 * SIZE(X), %xmm4 +#ifdef USE_ABS + andpd %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movapd 2 * SIZE(X), %xmm5 +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + addq $4 * SIZE, X + ALIGN_3 + +.L17: + testq $2, M + je .L18 + + movapd 0 * SIZE(X), %xmm6 +#ifdef USE_ABS + andpd %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + addq $2 * SIZE, X + +.L18: + testq $1, M + je .L20 + + movsd 0 * SIZE(X), %xmm7 +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + unpcklpd %xmm7, %xmm7 + maxpd %xmm7, %xmm3 + ALIGN_3 + +/* Finding Index */ +.L20: + movq XX, X + movq MM, M + + maxpd %xmm1, %xmm0 + maxpd %xmm3, %xmm2 + maxpd %xmm2, %xmm0 + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + unpcklpd %xmm0, %xmm0 + ALIGN_3 + + testq $15, X # Checking for 128bit align + je .L21 + + movsd 0 * SIZE(X), %xmm1 +#ifdef USE_ABS + andpd %xmm15, %xmm1 +#endif + incq RET + comisd %xmm0, %xmm1 + je .L999 + addq $SIZE, X + decq M + ALIGN_3 + +.L21: + movq M, I + sarq $3, I + jle .L25 + ALIGN_4 + +.L22: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movapd 0 * SIZE(X), %xmm1 +#ifdef USE_ABS + andpd %xmm15, %xmm1 +#endif + cmpeqpd %xmm0, %xmm1 + + movapd 2 * SIZE(X), %xmm3 +#ifdef USE_ABS + andpd %xmm15, %xmm3 +#endif + cmpeqpd %xmm0, %xmm3 + + movapd 4 * SIZE(X), %xmm5 +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + cmpeqpd %xmm0, %xmm5 + + movapd 6 * SIZE(X), %xmm7 +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + cmpeqpd %xmm0, %xmm7 + + orpd %xmm3, %xmm1 + orpd %xmm7, %xmm5 + orpd %xmm5, %xmm1 +#ifndef C_SUN + movmskpd %xmm1, %r11 +#else + .byte 0x66 + .long 0xd9500f4c +#endif + testq $3, %r11 + jne .L23 + + addq $8 * SIZE, X + addq $8, RET + decq I + jg .L22 + jmp .L25 + ALIGN_4 + +.L23: + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 + movsd 2 * SIZE(X), %xmm3 + movsd 3 * SIZE(X), %xmm4 + movsd 4 * SIZE(X), %xmm5 + movsd 5 * SIZE(X), %xmm6 + movsd 6 * SIZE(X), %xmm7 + movsd 7 * SIZE(X), %xmm8 +#ifdef USE_ABS + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 + andpd %xmm15, %xmm3 + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + andpd %xmm15, %xmm8 +#endif + + addq $8 * SIZE, X + + incq RET + comisd %xmm0, %xmm1 + je .L999 + incq RET + comisd %xmm0, %xmm2 + je .L999 + incq RET + comisd %xmm0, %xmm3 + je .L999 + incq RET + comisd %xmm0, %xmm4 + je .L999 + incq RET + comisd %xmm0, %xmm5 + je .L999 + incq RET + comisd %xmm0, %xmm6 + je .L999 + incq RET + comisd %xmm0, %xmm7 + je .L999 + incq RET + jmp .L999 + ALIGN_3 + +.L25: + testq $4, M + je .L27 + + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 + movsd 2 * SIZE(X), %xmm3 + movsd 3 * SIZE(X), %xmm4 +#ifdef USE_ABS + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 + andpd %xmm15, %xmm3 + andpd %xmm15, %xmm4 +#endif + addq $4 * SIZE, X + incq RET + comisd %xmm0, %xmm1 + je .L999 + incq RET + comisd %xmm0, %xmm2 + je .L999 + incq RET + comisd %xmm0, %xmm3 + je .L999 + incq RET + comisd %xmm0, %xmm4 + je .L999 + ALIGN_3 + +.L27: + testq $2, M + je .L28 + + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 +#ifdef USE_ABS + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 +#endif + addq $2 * SIZE, X + incq RET + comisd %xmm0, %xmm1 + je .L999 + incq RET + comisd %xmm0, %xmm2 + je .L999 + ALIGN_3 + +.L28: + incq RET + jmp .L999 + ALIGN_3 + +/* Unaligned Mode */ +.L50: + movq M, I + sarq $4, I + jle .L55 + ALIGN_4 + +.L51: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm4 +#ifdef USE_ABS + andpd %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 2 * SIZE(X), %xmm5 + movhpd 3 * SIZE(X), %xmm5 +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movsd 4 * SIZE(X), %xmm6 + movhpd 5 * SIZE(X), %xmm6 +#ifdef USE_ABS + andpd %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + + movsd 6 * SIZE(X), %xmm7 + movhpd 7 * SIZE(X), %xmm7 +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd 8 * SIZE(X), %xmm4 + movhpd 9 * SIZE(X), %xmm4 +#ifdef USE_ABS + andpd %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 10 * SIZE(X), %xmm5 + movhpd 11 * SIZE(X), %xmm5 +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movsd 12 * SIZE(X), %xmm6 + movhpd 13 * SIZE(X), %xmm6 +#ifdef USE_ABS + andpd %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + + movsd 14 * SIZE(X), %xmm7 + movhpd 15 * SIZE(X), %xmm7 +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + + addq $16 * SIZE, X + decq I + jg .L51 + ALIGN_4 + +.L55: + andq $15, M + jle .L60 + + testq $8, M + je .L56 + + movsd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm4 +#ifdef USE_ABS + andpd %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 2 * SIZE(X), %xmm5 + movhpd 3 * SIZE(X), %xmm5 +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movsd 4 * SIZE(X), %xmm6 + movhpd 5 * SIZE(X), %xmm6 +#ifdef USE_ABS + andpd %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + + movsd 6 * SIZE(X), %xmm7 + movhpd 7 * SIZE(X), %xmm7 +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + + addq $8 * SIZE, X + ALIGN_3 + +.L56: + testq $4, M + je .L57 + + movsd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm4 +#ifdef USE_ABS + andpd %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 2 * SIZE(X), %xmm5 + movhpd 3 * SIZE(X), %xmm5 +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + addq $4 * SIZE, X + ALIGN_3 + +.L57: + testq $2, M + je .L58 + + movsd 0 * SIZE(X), %xmm6 + movhpd 1 * SIZE(X), %xmm6 +#ifdef USE_ABS + andpd %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + addq $2 * SIZE, X + +.L58: + testq $1, M + je .L60 + + movsd 0 * SIZE(X), %xmm7 + unpcklpd %xmm7, %xmm7 +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + ALIGN_3 + +.L60: + movq XX, X + movq MM, M + + maxpd %xmm1, %xmm0 + maxpd %xmm3, %xmm2 + maxpd %xmm2, %xmm0 + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + unpcklpd %xmm0, %xmm0 + + movq M, I + sarq $3, I + jle .L65 + ALIGN_4 + +.L62: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm1 + movhpd 1 * SIZE(X), %xmm1 +#ifdef USE_ABS + andpd %xmm15, %xmm1 +#endif + cmpeqpd %xmm0, %xmm1 + + movsd 2 * SIZE(X), %xmm3 + movhpd 3 * SIZE(X), %xmm3 +#ifdef USE_ABS + andpd %xmm15, %xmm3 +#endif + cmpeqpd %xmm0, %xmm3 + + movsd 4 * SIZE(X), %xmm5 + movhpd 5 * SIZE(X), %xmm5 +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + cmpeqpd %xmm0, %xmm5 + + movsd 6 * SIZE(X), %xmm7 + movhpd 7 * SIZE(X), %xmm7 +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + cmpeqpd %xmm0, %xmm7 + + orpd %xmm3, %xmm1 + orpd %xmm7, %xmm5 + orpd %xmm5, %xmm1 +#ifndef C_SUN + movmskpd %xmm1, %r11 +#else + .byte 0x66 + .long 0xd9500f4c +#endif + testq $3, %r11 + jne .L63 + + addq $8 * SIZE, X + addq $8, RET + decq I + jg .L62 + jmp .L65 + ALIGN_4 + +.L63: + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 + movsd 2 * SIZE(X), %xmm3 + movsd 3 * SIZE(X), %xmm4 + movsd 4 * SIZE(X), %xmm5 + movsd 5 * SIZE(X), %xmm6 + movsd 6 * SIZE(X), %xmm7 + movsd 7 * SIZE(X), %xmm8 +#ifdef USE_ABS + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 + andpd %xmm15, %xmm3 + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + andpd %xmm15, %xmm8 +#endif + + addq $8 * SIZE, X + + incq RET + comisd %xmm0, %xmm1 + je .L999 + incq RET + comisd %xmm0, %xmm2 + je .L999 + incq RET + comisd %xmm0, %xmm3 + je .L999 + incq RET + comisd %xmm0, %xmm4 + je .L999 + incq RET + comisd %xmm0, %xmm5 + je .L999 + incq RET + comisd %xmm0, %xmm6 + je .L999 + incq RET + comisd %xmm0, %xmm7 + je .L999 + incq RET + jmp .L999 + ALIGN_3 + +.L65: + testq $4, M + je .L67 + + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 + movsd 2 * SIZE(X), %xmm3 + movsd 3 * SIZE(X), %xmm4 +#ifdef USE_ABS + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 + andpd %xmm15, %xmm3 + andpd %xmm15, %xmm4 +#endif + addq $4 * SIZE, X + incq RET + comisd %xmm0, %xmm1 + je .L999 + incq RET + comisd %xmm0, %xmm2 + je .L999 + incq RET + comisd %xmm0, %xmm3 + je .L999 + incq RET + comisd %xmm0, %xmm4 + je .L999 + ALIGN_3 + +.L67: + testq $2, M + je .L68 + + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 +#ifdef USE_ABS + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 +#endif + addq $2 * SIZE, X + incq RET + comisd %xmm0, %xmm1 + je .L999 + incq RET + comisd %xmm0, %xmm2 + je .L999 + ALIGN_3 + +.L68: + incq RET + jmp .L999 + ALIGN_4 + +.L80: + movq M, I + sarq $4, I + jle .L85 + ALIGN_4 + +.L81: + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + movhpd 0 * SIZE(X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 0 * SIZE(X), %xmm5 + addq INCX, X + movhpd 0 * SIZE(X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movsd 0 * SIZE(X), %xmm6 + addq INCX, X + movhpd 0 * SIZE(X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + + movsd 0 * SIZE(X), %xmm7 + addq INCX, X + movhpd 0 * SIZE(X), %xmm7 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + movhpd 0 * SIZE(X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 0 * SIZE(X), %xmm5 + addq INCX, X + movhpd 0 * SIZE(X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movsd 0 * SIZE(X), %xmm6 + addq INCX, X + movhpd 0 * SIZE(X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + + movsd 0 * SIZE(X), %xmm7 + addq INCX, X + movhpd 0 * SIZE(X), %xmm7 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + + decq I + jg .L81 + ALIGN_4 + +.L85: + andq $15, M + jle .L90 + + testq $8, M + je .L86 + + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + movhpd 0 * SIZE(X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 0 * SIZE(X), %xmm5 + addq INCX, X + movhpd 0 * SIZE(X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movsd 0 * SIZE(X), %xmm6 + addq INCX, X + movhpd 0 * SIZE(X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + + movsd 0 * SIZE(X), %xmm7 + addq INCX, X + movhpd 0 * SIZE(X), %xmm7 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + ALIGN_3 + +.L86: + testq $4, M + je .L87 + + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + movhpd 0 * SIZE(X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 0 * SIZE(X), %xmm5 + addq INCX, X + movhpd 0 * SIZE(X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + ALIGN_3 + +.L87: + testq $2, M + je .L88 + + movsd 0 * SIZE(X), %xmm6 + addq INCX, X + movhpd 0 * SIZE(X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + ALIGN_3 + +.L88: + testq $1, M + je .L90 + + movsd 0 * SIZE(X), %xmm7 + unpcklpd %xmm7, %xmm7 +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + + maxpd %xmm1, %xmm0 + maxpd %xmm3, %xmm2 + maxpd %xmm2, %xmm0 + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + ALIGN_4 + +.L90: + movq XX, X + movq MM, M + + maxpd %xmm1, %xmm0 + maxpd %xmm3, %xmm2 + maxpd %xmm2, %xmm0 + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + unpcklpd %xmm0, %xmm0 + + movq M, I + sarq $3, I + jle .L95 + ALIGN_4 + +.L92: + movsd 0 * SIZE(X), %xmm1 + addq INCX, X + movhpd 0 * SIZE(X), %xmm1 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm1 +#endif + cmpeqpd %xmm0, %xmm1 + + movsd 0 * SIZE(X), %xmm3 + addq INCX, X + movhpd 0 * SIZE(X), %xmm3 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm3 +#endif + cmpeqpd %xmm0, %xmm3 + + movsd 0 * SIZE(X), %xmm5 + addq INCX, X + movhpd 0 * SIZE(X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + cmpeqpd %xmm0, %xmm5 + + movsd 0 * SIZE(X), %xmm7 + addq INCX, X + movhpd 0 * SIZE(X), %xmm7 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + cmpeqpd %xmm0, %xmm7 + + orpd %xmm3, %xmm1 + orpd %xmm7, %xmm5 + orpd %xmm5, %xmm1 +#ifndef C_SUN + movmskpd %xmm1, %r11 +#else + .byte 0x66 + .long 0xd9500f4c +#endif + testq $3, %r11 + jne .L93 + + addq $8, RET + decq I + jg .L92 + jmp .L95 + ALIGN_4 + +.L93: + subq INCX, X + movsd 0 * SIZE(X), %xmm8 + subq INCX, X + movsd 0 * SIZE(X), %xmm7 + subq INCX, X + movsd 0 * SIZE(X), %xmm6 + subq INCX, X + movsd 0 * SIZE(X), %xmm5 + subq INCX, X + movsd 0 * SIZE(X), %xmm4 + subq INCX, X + movsd 0 * SIZE(X), %xmm3 + subq INCX, X + movsd 0 * SIZE(X), %xmm2 + subq INCX, X + movsd 0 * SIZE(X), %xmm1 +#ifdef USE_ABS + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 + andpd %xmm15, %xmm3 + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + andpd %xmm15, %xmm8 +#endif + + addq $8 * SIZE, X + + incq RET + comisd %xmm0, %xmm1 + je .L999 + incq RET + comisd %xmm0, %xmm2 + je .L999 + incq RET + comisd %xmm0, %xmm3 + je .L999 + incq RET + comisd %xmm0, %xmm4 + je .L999 + incq RET + comisd %xmm0, %xmm5 + je .L999 + incq RET + comisd %xmm0, %xmm6 + je .L999 + incq RET + comisd %xmm0, %xmm7 + je .L999 + incq RET + jmp .L999 + ALIGN_3 + +.L95: + testq $4, M + je .L97 + + movsd 0 * SIZE(X), %xmm1 + addq INCX, X + movsd 0 * SIZE(X), %xmm2 + addq INCX, X + movsd 0 * SIZE(X), %xmm3 + addq INCX, X + movsd 0 * SIZE(X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 + andpd %xmm15, %xmm3 + andpd %xmm15, %xmm4 +#endif + incq RET + comisd %xmm0, %xmm1 + je .L999 + incq RET + comisd %xmm0, %xmm2 + je .L999 + incq RET + comisd %xmm0, %xmm3 + je .L999 + incq RET + comisd %xmm0, %xmm4 + je .L999 + ALIGN_3 + +.L97: + testq $2, M + je .L98 + + movsd 0 * SIZE(X), %xmm1 + addq INCX, X + movsd 0 * SIZE(X), %xmm2 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 +#endif + incq RET + comisd %xmm0, %xmm1 + je .L999 + incq RET + comisd %xmm0, %xmm2 + je .L999 + ALIGN_3 + +.L98: + incq RET + ALIGN_3 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE + diff --git a/kernel/x86_64/izamax.S b/kernel/x86_64/izamax.S new file mode 100644 index 0000000..a77b06d --- /dev/null +++ b/kernel/x86_64/izamax.S @@ -0,0 +1,270 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define X ARG2 +#define INCX ARG3 + +#define I ARG4 +#define NUM %r10 +#define RET %rax + +#ifndef USE_MIN +#define FMOV fcmovbe +#define IMOV cmovnbe +#else +#define FMOV fcmovnb +#define IMOV cmovb +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + salq $ZBASE_SHIFT, INCX + + fldz + xorq RET, RET + + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + ffreep %st + movq $2, NUM + movq $1, RET + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + addq INCX, X + decq M + jle .L999 + + cmpq $2 * SIZE, INCX + jne .L40 + + movq M, I + sarq $2, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 2 * SIZE(X) + fabs + FLD 3 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 4 * SIZE(X) + fabs + FLD 5 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 6 * SIZE(X) + fabs + FLD 7 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + addq $8 * SIZE, X + + decq I + jg .L10 + ALIGN_4 + +.L20: + movq M, I + andq $3, I + jle .L999 + ALIGN_4 + +.L21: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + addq $2 * SIZE, X + decq I + jg .L21 + jmp .L999 + ALIGN_4 + +.L40: + movq M, I + sarq $2, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addq INCX, X + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addq INCX, X + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addq INCX, X + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addq INCX, X + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + decq I + jg .L50 + ALIGN_4 + +.L60: + movq M, I + andq $3, I + jle .L999 + ALIGN_4 + + +.L61: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + addq INCX, X + decq I + jg .L61 + ALIGN_4 + +.L999: + ffreep %st + ret + + EPILOGUE diff --git a/kernel/x86_64/izamax_sse.S b/kernel/x86_64/izamax_sse.S new file mode 100644 index 0000000..2dfeb93 --- /dev/null +++ b/kernel/x86_64/izamax_sse.S @@ -0,0 +1,554 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define RET %rax +#define I ARG4 +#define XX %r10 +#define MM %r11 + +#ifdef USE_MIN +#define maxps minps +#define maxss minss +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + pxor %xmm0, %xmm0 + xor RET, RET + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + salq $ZBASE_SHIFT, INCX + movq M, MM + movq X, XX + + pcmpeqb %xmm15, %xmm15 + psrld $1, %xmm15 + + movss 0 * SIZE(X), %xmm0 + movss 1 * SIZE(X), %xmm1 + addq INCX, X + decq M + andps %xmm15, %xmm0 + andps %xmm15, %xmm1 + addps %xmm1, %xmm0 + shufps $0, %xmm0, %xmm0 + movaps %xmm0, %xmm1 + cmpq $2 * SIZE, INCX + jne .L70 + +.L30: + movq M, I + sarq $3, I + jle .L35 + ALIGN_4 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm4 + movhps 2 * SIZE(X), %xmm4 + movsd 4 * SIZE(X), %xmm5 + movhps 6 * SIZE(X), %xmm5 + + movaps %xmm4, %xmm6 + + shufps $0x88, %xmm5, %xmm4 + shufps $0xdd, %xmm5, %xmm6 + + andps %xmm15, %xmm4 + andps %xmm15, %xmm6 + addps %xmm6, %xmm4 + maxps %xmm4, %xmm0 + + movsd 8 * SIZE(X), %xmm7 + movhps 10 * SIZE(X), %xmm7 + movsd 12 * SIZE(X), %xmm8 + movhps 14 * SIZE(X), %xmm8 + movaps %xmm7, %xmm9 + + shufps $0x88, %xmm8, %xmm7 + shufps $0xdd, %xmm8, %xmm9 + + andps %xmm15, %xmm7 + andps %xmm15, %xmm9 + addps %xmm9, %xmm7 + maxps %xmm7, %xmm0 + + addq $16 * SIZE, X + decq I + jg .L31 + ALIGN_4 + +.L35: + andq $7, M + jle .L40 + + testq $4, M + je .L36 + + movsd 0 * SIZE(X), %xmm4 + movhps 2 * SIZE(X), %xmm4 + movsd 4 * SIZE(X), %xmm5 + movhps 6 * SIZE(X), %xmm5 + movaps %xmm4, %xmm6 + + shufps $0x88, %xmm5, %xmm4 + shufps $0xdd, %xmm5, %xmm6 + + andps %xmm15, %xmm4 + andps %xmm15, %xmm6 + addps %xmm6, %xmm4 + maxps %xmm4, %xmm0 + + addq $8 * SIZE, X + ALIGN_3 + +.L36: + testq $2, M + je .L37 + + movss 0 * SIZE(X), %xmm4 + movss 1 * SIZE(X), %xmm5 + movss 2 * SIZE(X), %xmm6 + movss 3 * SIZE(X), %xmm7 + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + maxss %xmm4, %xmm0 + maxss %xmm6, %xmm1 + addq $4 * SIZE, X + ALIGN_3 + +.L37: + testq $1, M + je .L40 + + movss 0 * SIZE(X), %xmm4 + movss 1 * SIZE(X), %xmm5 + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + addps %xmm5, %xmm4 + maxss %xmm4, %xmm0 + ALIGN_4 + +.L40: + movq XX, X + movq MM, M + + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + movhlps %xmm0, %xmm0 + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + maxss %xmm1, %xmm0 + shufps $0, %xmm0, %xmm0 + + movq M, I + sarq $2, I + jle .L45 + ALIGN_4 + +.L41: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm1 + movhps 2 * SIZE(X), %xmm1 + movsd 4 * SIZE(X), %xmm2 + movhps 6 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + + andps %xmm15, %xmm1 + andps %xmm15, %xmm3 + addps %xmm3, %xmm1 + + cmpeqps %xmm0, %xmm1 +#ifndef C_SUN + movmskps %xmm1, %r11 +#else + .long 0xd9500f4c +#endif + testq $15, %r11 + jne .L43 + + addq $8 * SIZE, X + addq $4, RET + decq I + jg .L41 + jmp .L45 + ALIGN_4 + +.L43: + movss 0 * SIZE(X), %xmm1 + movss 1 * SIZE(X), %xmm2 + movss 2 * SIZE(X), %xmm3 + movss 3 * SIZE(X), %xmm4 + movss 4 * SIZE(X), %xmm5 + movss 5 * SIZE(X), %xmm6 + movss 6 * SIZE(X), %xmm7 + movss 7 * SIZE(X), %xmm8 + addq $8 * SIZE, X + + andps %xmm15, %xmm1 + andps %xmm15, %xmm2 + andps %xmm15, %xmm3 + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + andps %xmm15, %xmm8 + + addps %xmm2, %xmm1 + addps %xmm4, %xmm3 + addps %xmm6, %xmm5 + addps %xmm8, %xmm7 + + incq RET + comiss %xmm0, %xmm1 + je .L999 + incq RET + comiss %xmm0, %xmm3 + je .L999 + incq RET + comiss %xmm0, %xmm5 + je .L999 + incq RET + comiss %xmm0, %xmm7 + je .L999 + ALIGN_3 + +.L45: + testq $2, M + je .L47 + + movss 0 * SIZE(X), %xmm1 + movss 1 * SIZE(X), %xmm2 + movss 2 * SIZE(X), %xmm3 + movss 3 * SIZE(X), %xmm4 + addq $4 * SIZE, X + + andps %xmm15, %xmm1 + andps %xmm15, %xmm2 + andps %xmm15, %xmm3 + andps %xmm15, %xmm4 + addps %xmm2, %xmm1 + addps %xmm4, %xmm3 + + incq RET + comiss %xmm0, %xmm1 + je .L999 + incq RET + comiss %xmm0, %xmm3 + je .L999 + ALIGN_3 + +.L47: + incq RET + jmp .L999 + ALIGN_3 + +.L70: + movq M, I + sarq $3, I + jle .L75 + ALIGN_4 + +.L71: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + movhps 0 * SIZE(X), %xmm4 + addq INCX, X + movsd 0 * SIZE(X), %xmm5 + addq INCX, X + movhps 0 * SIZE(X), %xmm5 + addq INCX, X + + movaps %xmm4, %xmm6 + + shufps $0x88, %xmm5, %xmm4 + shufps $0xdd, %xmm5, %xmm6 + + andps %xmm15, %xmm4 + andps %xmm15, %xmm6 + addps %xmm6, %xmm4 + maxps %xmm4, %xmm0 + + movsd 0 * SIZE(X), %xmm7 + addq INCX, X + movhps 0 * SIZE(X), %xmm7 + addq INCX, X + movsd 0 * SIZE(X), %xmm8 + addq INCX, X + movhps 0 * SIZE(X), %xmm8 + addq INCX, X + movaps %xmm7, %xmm9 + + shufps $0x88, %xmm8, %xmm7 + shufps $0xdd, %xmm8, %xmm9 + + andps %xmm15, %xmm7 + andps %xmm15, %xmm9 + addps %xmm9, %xmm7 + maxps %xmm7, %xmm0 + + decq I + jg .L71 + ALIGN_4 + +.L75: + andq $7, M + jle .L80 + + testq $4, M + je .L76 + + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + movhps 0 * SIZE(X), %xmm4 + addq INCX, X + movsd 0 * SIZE(X), %xmm5 + addq INCX, X + movhps 0 * SIZE(X), %xmm5 + addq INCX, X + movaps %xmm4, %xmm6 + + shufps $0x88, %xmm5, %xmm4 + shufps $0xdd, %xmm5, %xmm6 + + andps %xmm15, %xmm4 + andps %xmm15, %xmm6 + addps %xmm6, %xmm4 + maxps %xmm4, %xmm0 + ALIGN_3 + +.L76: + testq $2, M + je .L77 + + movss 0 * SIZE(X), %xmm4 + movss 1 * SIZE(X), %xmm5 + addq INCX, X + movss 0 * SIZE(X), %xmm6 + movss 1 * SIZE(X), %xmm7 + addq INCX, X + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + maxss %xmm4, %xmm0 + maxss %xmm6, %xmm1 + ALIGN_3 + +.L77: + testq $1, M + je .L80 + + movss 0 * SIZE(X), %xmm4 + movss 1 * SIZE(X), %xmm5 + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + addps %xmm5, %xmm4 + maxss %xmm4, %xmm0 + ALIGN_4 + +.L80: + movq XX, X + movq MM, M + + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + movhlps %xmm0, %xmm0 + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + maxss %xmm1, %xmm0 + shufps $0, %xmm0, %xmm0 + + movq M, I + sarq $2, I + jle .L85 + ALIGN_4 + +.L81: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm1 + addq INCX, X + movhps 0 * SIZE(X), %xmm1 + addq INCX, X + movsd 0 * SIZE(X), %xmm2 + addq INCX, X + movhps 0 * SIZE(X), %xmm2 + addq INCX, X + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + + andps %xmm15, %xmm1 + andps %xmm15, %xmm3 + addps %xmm3, %xmm1 + + cmpeqps %xmm0, %xmm1 +#ifndef C_SUN + movmskps %xmm1, %r11 +#else + .long 0xd9500f4c +#endif + testq $15, %r11 + jne .L83 + + addq $4, RET + decq I + jg .L81 + jmp .L85 + ALIGN_4 + +.L83: + subq INCX, X + movss 0 * SIZE(X), %xmm7 + movss 1 * SIZE(X), %xmm8 + subq INCX, X + movss 0 * SIZE(X), %xmm5 + movss 1 * SIZE(X), %xmm6 + subq INCX, X + movss 0 * SIZE(X), %xmm3 + movss 1 * SIZE(X), %xmm4 + subq INCX, X + movss 0 * SIZE(X), %xmm1 + movss 1 * SIZE(X), %xmm2 + + andps %xmm15, %xmm1 + andps %xmm15, %xmm2 + andps %xmm15, %xmm3 + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + andps %xmm15, %xmm8 + + addps %xmm2, %xmm1 + addps %xmm4, %xmm3 + addps %xmm6, %xmm5 + addps %xmm8, %xmm7 + + incq RET + comiss %xmm0, %xmm1 + je .L999 + incq RET + comiss %xmm0, %xmm3 + je .L999 + incq RET + comiss %xmm0, %xmm5 + je .L999 + incq RET + comiss %xmm0, %xmm7 + je .L999 + ALIGN_3 + +.L85: + testq $2, M + je .L87 + + movss 0 * SIZE(X), %xmm1 + movss 1 * SIZE(X), %xmm2 + addq INCX, X + movss 0 * SIZE(X), %xmm3 + movss 1 * SIZE(X), %xmm4 + addq INCX, X + + andps %xmm15, %xmm1 + andps %xmm15, %xmm2 + andps %xmm15, %xmm3 + andps %xmm15, %xmm4 + addps %xmm2, %xmm1 + addps %xmm4, %xmm3 + + incq RET + comiss %xmm0, %xmm1 + je .L999 + incq RET + comiss %xmm0, %xmm3 + je .L999 + ALIGN_3 + +.L87: + incq RET + ALIGN_4 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/izamax_sse2.S b/kernel/x86_64/izamax_sse2.S new file mode 100644 index 0000000..4e66e53 --- /dev/null +++ b/kernel/x86_64/izamax_sse2.S @@ -0,0 +1,597 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define RET %rax +#define I ARG4 +#define XX %r10 +#define MM %r11 + +#ifdef USE_MIN +#define maxpd minpd +#define maxsd minsd +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + pxor %xmm0, %xmm0 + xor RET, RET + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + salq $ZBASE_SHIFT, INCX + movq M, MM + movq X, XX + + pcmpeqb %xmm15, %xmm15 + psrlq $1, %xmm15 + + movsd 0 * SIZE(X), %xmm0 + movsd 1 * SIZE(X), %xmm1 + addq INCX, X + decq M + andpd %xmm15, %xmm0 + andpd %xmm15, %xmm1 + addpd %xmm1, %xmm0 + unpcklpd %xmm0, %xmm0 + movapd %xmm0, %xmm1 + movapd %xmm0, %xmm2 + movapd %xmm0, %xmm3 + cmpq $2 * SIZE, INCX + jne .L60 + + movq M, I + sarq $3, I + jle .L25 + ALIGN_4 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + movhpd 2 * SIZE(X), %xmm4 + movhpd 3 * SIZE(X), %xmm5 + movsd 4 * SIZE(X), %xmm6 + movsd 5 * SIZE(X), %xmm7 + movhpd 6 * SIZE(X), %xmm6 + movhpd 7 * SIZE(X), %xmm7 + + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + maxpd %xmm4, %xmm0 + maxpd %xmm6, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd 8 * SIZE(X), %xmm4 + movsd 9 * SIZE(X), %xmm5 + movhpd 10 * SIZE(X), %xmm4 + movhpd 11 * SIZE(X), %xmm5 + movsd 12 * SIZE(X), %xmm6 + movsd 13 * SIZE(X), %xmm7 + movhpd 14 * SIZE(X), %xmm6 + movhpd 15 * SIZE(X), %xmm7 + + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + maxpd %xmm4, %xmm2 + maxpd %xmm6, %xmm3 + + addq $16 * SIZE, X + decq I + jg .L21 + ALIGN_4 + +.L25: + andq $7, M + jle .L30 + + testq $4, M + je .L26 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + movhpd 2 * SIZE(X), %xmm4 + movhpd 3 * SIZE(X), %xmm5 + movsd 4 * SIZE(X), %xmm6 + movsd 5 * SIZE(X), %xmm7 + movhpd 6 * SIZE(X), %xmm6 + movhpd 7 * SIZE(X), %xmm7 + + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + maxpd %xmm4, %xmm0 + maxpd %xmm6, %xmm1 + + addq $8 * SIZE, X + ALIGN_3 + +.L26: + testq $2, M + je .L27 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + movhpd 2 * SIZE(X), %xmm4 + movhpd 3 * SIZE(X), %xmm5 + addq $4 * SIZE, X + + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + addpd %xmm5, %xmm4 + maxpd %xmm4, %xmm0 + ALIGN_3 + +.L27: + testq $1, M + je .L30 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + addpd %xmm5, %xmm4 + maxsd %xmm4, %xmm2 + ALIGN_4 + +.L30: + movq XX, X + movq MM, M + + maxpd %xmm1, %xmm0 + maxpd %xmm3, %xmm2 + maxpd %xmm2, %xmm0 + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + unpcklpd %xmm0, %xmm0 + + movq M, I + sarq $2, I + jle .L35 + ALIGN_4 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 + movhpd 2 * SIZE(X), %xmm1 + movhpd 3 * SIZE(X), %xmm2 + movsd 4 * SIZE(X), %xmm3 + movsd 5 * SIZE(X), %xmm4 + movhpd 6 * SIZE(X), %xmm3 + movhpd 7 * SIZE(X), %xmm4 + + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 + andpd %xmm15, %xmm3 + andpd %xmm15, %xmm4 + + addpd %xmm2, %xmm1 + addpd %xmm4, %xmm3 + + cmpeqpd %xmm0, %xmm1 + cmpeqpd %xmm0, %xmm3 + + orpd %xmm3, %xmm1 +#ifndef C_SUN + movmskpd %xmm1, %r11 +#else + .byte 0x66 + .long 0xd9500f4c +#endif + testq $3, %r11 + jne .L33 + + addq $8 * SIZE, X + addq $4, RET + decq I + jg .L31 + jmp .L35 + ALIGN_4 + +.L33: + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 + movsd 2 * SIZE(X), %xmm3 + movsd 3 * SIZE(X), %xmm4 + movsd 4 * SIZE(X), %xmm5 + movsd 5 * SIZE(X), %xmm6 + movsd 6 * SIZE(X), %xmm7 + movsd 7 * SIZE(X), %xmm8 + addq $8 * SIZE, X + + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 + andpd %xmm15, %xmm3 + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + andpd %xmm15, %xmm8 + + addpd %xmm2, %xmm1 + addpd %xmm4, %xmm3 + addpd %xmm6, %xmm5 + addpd %xmm8, %xmm7 + + incq RET + comisd %xmm0, %xmm1 + je .L999 + incq RET + comisd %xmm0, %xmm3 + je .L999 + incq RET + comisd %xmm0, %xmm5 + je .L999 + incq RET + comisd %xmm0, %xmm7 + je .L999 + ALIGN_3 + +.L35: + testq $2, M + je .L36 + + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 + movsd 2 * SIZE(X), %xmm3 + movsd 3 * SIZE(X), %xmm4 + addq $4 * SIZE, X + + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 + andpd %xmm15, %xmm3 + andpd %xmm15, %xmm4 + + addpd %xmm2, %xmm1 + addpd %xmm4, %xmm3 + incq RET + comisd %xmm0, %xmm1 + je .L999 + incq RET + comisd %xmm0, %xmm3 + je .L999 + ALIGN_3 + +.L36: + incq RET + jmp .L999 + ALIGN_3 + +.L60: + movq M, I + sarq $3, I + jle .L65 + ALIGN_4 + +.L61: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + movhpd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm5 + addq INCX, X + movsd 0 * SIZE(X), %xmm6 + movsd 1 * SIZE(X), %xmm7 + addq INCX, X + movhpd 0 * SIZE(X), %xmm6 + movhpd 1 * SIZE(X), %xmm7 + addq INCX, X + + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + maxpd %xmm4, %xmm0 + maxpd %xmm6, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + movhpd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm5 + addq INCX, X + movsd 0 * SIZE(X), %xmm6 + movsd 1 * SIZE(X), %xmm7 + addq INCX, X + movhpd 0 * SIZE(X), %xmm6 + movhpd 1 * SIZE(X), %xmm7 + addq INCX, X + + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + maxpd %xmm4, %xmm2 + maxpd %xmm6, %xmm3 + + decq I + jg .L61 + ALIGN_4 + +.L65: + andq $7, M + jle .L70 + + testq $4, M + je .L66 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + movhpd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm5 + addq INCX, X + movsd 0 * SIZE(X), %xmm6 + movsd 1 * SIZE(X), %xmm7 + addq INCX, X + movhpd 0 * SIZE(X), %xmm6 + movhpd 1 * SIZE(X), %xmm7 + addq INCX, X + + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + maxpd %xmm4, %xmm0 + maxpd %xmm6, %xmm1 + ALIGN_3 + +.L66: + testq $2, M + je .L67 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + movhpd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm5 + addq INCX, X + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + addpd %xmm5, %xmm4 + maxpd %xmm4, %xmm2 + ALIGN_3 + +.L67: + testq $1, M + je .L70 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + addpd %xmm5, %xmm4 + maxsd %xmm4, %xmm3 + ALIGN_3 + +.L70: + movq XX, X + movq MM, M + + maxpd %xmm1, %xmm0 + maxpd %xmm3, %xmm2 + maxpd %xmm2, %xmm0 + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + unpcklpd %xmm0, %xmm0 + + movq M, I + sarq $2, I + jle .L75 + ALIGN_4 + +.L71: +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + prefetch PREFETCHSIZE * SIZE(X) +#endif + +#ifdef PENTIUM4 + prefetchnta PREFETCHSIZE * SIZE(X) +#endif + + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 + addq INCX, X + movhpd 0 * SIZE(X), %xmm1 + movhpd 1 * SIZE(X), %xmm2 + addq INCX, X + movsd 0 * SIZE(X), %xmm3 + movsd 1 * SIZE(X), %xmm4 + addq INCX, X + movhpd 0 * SIZE(X), %xmm3 + movhpd 1 * SIZE(X), %xmm4 + addq INCX, X + + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 + andpd %xmm15, %xmm3 + andpd %xmm15, %xmm4 + + addpd %xmm2, %xmm1 + addpd %xmm4, %xmm3 + + cmpeqpd %xmm0, %xmm1 + cmpeqpd %xmm0, %xmm3 + + orpd %xmm3, %xmm1 +#ifndef C_SUN + movmskpd %xmm1, %r11 +#else + .byte 0x66 + .long 0xd9500f4c +#endif + testq $3, %r11 + jne .L73 + + addq $4, RET + decq I + jg .L71 + jmp .L75 + ALIGN_4 + +.L73: + subq INCX, X + movsd 0 * SIZE(X), %xmm7 + movsd 1 * SIZE(X), %xmm8 + subq INCX, X + movsd 0 * SIZE(X), %xmm5 + movsd 1 * SIZE(X), %xmm6 + subq INCX, X + movsd 0 * SIZE(X), %xmm3 + movsd 1 * SIZE(X), %xmm4 + subq INCX, X + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 + + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 + andpd %xmm15, %xmm3 + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + andpd %xmm15, %xmm8 + + addpd %xmm2, %xmm1 + addpd %xmm4, %xmm3 + addpd %xmm6, %xmm5 + addpd %xmm8, %xmm7 + + incq RET + comisd %xmm0, %xmm1 + je .L999 + incq RET + comisd %xmm0, %xmm3 + je .L999 + incq RET + comisd %xmm0, %xmm5 + je .L999 + incq RET + comisd %xmm0, %xmm7 + je .L999 + ALIGN_3 + +.L75: + testq $2, M + je .L76 + + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 + addq INCX, X + movsd 0 * SIZE(X), %xmm3 + movsd 1 * SIZE(X), %xmm4 + addq INCX, X + + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 + andpd %xmm15, %xmm3 + andpd %xmm15, %xmm4 + + addpd %xmm2, %xmm1 + addpd %xmm4, %xmm3 + incq RET + comisd %xmm0, %xmm1 + je .L999 + incq RET + comisd %xmm0, %xmm3 + je .L999 + ALIGN_3 + +.L76: + incq RET + ALIGN_4 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/lsame.S b/kernel/x86_64/lsame.S new file mode 100644 index 0000000..8b1ca10 --- /dev/null +++ b/kernel/x86_64/lsame.S @@ -0,0 +1,72 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define X ARG1 /* rdi */ +#define Y ARG2 /* rsi */ +#define XX ARG3 +#define YY ARG4 + + PROLOGUE + PROFCODE + + movzbq (X), X + movzbq (Y), Y + + andq $255, X + andq $255, Y + + leaq -32(X), XX + leaq -32(Y), YY + + cmpq $97, X + cmovge XX, X + + cmpq $97,Y + cmovge YY, Y + + movq $0, %rax + movq $1, %r8 + + cmpq X, Y + cmoveq %r8, %rax + ret + + EPILOGUE diff --git a/kernel/x86_64/mcount.S b/kernel/x86_64/mcount.S new file mode 100644 index 0000000..2770e32 --- /dev/null +++ b/kernel/x86_64/mcount.S @@ -0,0 +1,46 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + + jmp _mcount + + EPILOGUE diff --git a/kernel/x86_64/nrm2.S b/kernel/x86_64/nrm2.S new file mode 100644 index 0000000..d375e8e --- /dev/null +++ b/kernel/x86_64/nrm2.S @@ -0,0 +1,206 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + fldz + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + salq $BASE_SHIFT, INCX + + fldz + fldz + fldz + cmpq $SIZE, INCX + jne .L40 + + movq M, I + sarq $3, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + fmul %st(0), %st + FLD 2 * SIZE(X) + fmul %st(0), %st + FLD 3 * SIZE(X) + fmul %st(0), %st + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 4 * SIZE(X) + fmul %st(0), %st + FLD 5 * SIZE(X) + fmul %st(0), %st + FLD 6 * SIZE(X) + fmul %st(0), %st + FLD 7 * SIZE(X) + fmul %st(0), %st + + addq $8 * SIZE, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decq I + jg .L10 + ALIGN_4 + +.L20: + andq $7, M + jle .L998 + ALIGN_4 + + +.L21: + FLD (X) + fmul %st(0), %st + faddp %st,%st(1) + addq $1 * SIZE, X + decq M + jg .L21 + jmp .L998 + ALIGN_4 + +.L40: + movq M, I + sarq $3, I + jle .L60 + ALIGN_4 + +.L50: + FLD (X) + addq INCX, X + fmul %st(0), %st + FLD (X) + addq INCX, X + fmul %st(0), %st + FLD (X) + addq INCX, X + fmul %st(0), %st + FLD (X) + addq INCX, X + fmul %st(0), %st + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD (X) + addq INCX, X + fmul %st(0), %st + FLD (X) + addq INCX, X + fmul %st(0), %st + FLD (X) + addq INCX, X + fmul %st(0), %st + FLD (X) + addq INCX, X + fmul %st(0), %st + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decq I + jg .L50 + ALIGN_4 + +.L60: + andq $7, M + jle .L998 + ALIGN_4 + + +.L61: + FLD (X) + addq INCX, X + fmul %st(0), %st + faddp %st,%st(1) + decq M + jg .L61 + ALIGN_4 + +.L998: + faddp %st,%st(2) + faddp %st,%st(1) + faddp %st,%st(1) + ALIGN_4 + +.L999: + fsqrt +#ifndef XDOUBLE + sub $2 * SIZE, %rsp + FST (%rsp) + MOVSD (%rsp), %xmm0 + add $2 * SIZE, %rsp +#endif + ret + + EPILOGUE + diff --git a/kernel/x86_64/nrm2_sse.S b/kernel/x86_64/nrm2_sse.S new file mode 100644 index 0000000..37762ab --- /dev/null +++ b/kernel/x86_64/nrm2_sse.S @@ -0,0 +1,316 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + pxor %xmm0, %xmm0 + testq M, M + jle .L999 + pxor %xmm1, %xmm1 + testq INCX, INCX + jle .L999 + + pxor %xmm2, %xmm2 + leaq (, INCX, SIZE), INCX + pxor %xmm3, %xmm3 + cmpq $SIZE, INCX + jne .L40 + + testq $SIZE, X + je .L05 + + movss 0 * SIZE(X), %xmm4 + cvtss2sd %xmm4, %xmm6 + mulsd %xmm6, %xmm6 + addsd %xmm6, %xmm3 + addq INCX, X + decq M + jle .L998 + ALIGN_3 + +.L05: + movq M, I + sarq $3, I + jle .L14 + + movsd 0 * SIZE(X), %xmm4 + movsd 2 * SIZE(X), %xmm5 + movsd 4 * SIZE(X), %xmm6 + movsd 6 * SIZE(X), %xmm7 + addq $8 * SIZE, X + decq I + jle .L12 + ALIGN_3 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + cvtps2pd %xmm4, %xmm8 + cvtps2pd %xmm5, %xmm9 + cvtps2pd %xmm6, %xmm10 + cvtps2pd %xmm7, %xmm11 + + movsd 0 * SIZE(X), %xmm4 + movsd 2 * SIZE(X), %xmm5 + movsd 4 * SIZE(X), %xmm6 + movsd 6 * SIZE(X), %xmm7 + + mulpd %xmm8, %xmm8 + mulpd %xmm9, %xmm9 + mulpd %xmm10, %xmm10 + mulpd %xmm11, %xmm11 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm1 + addpd %xmm10, %xmm2 + addpd %xmm11, %xmm3 + + addq $8 * SIZE, X + decq I + jg .L10 + ALIGN_3 + +.L12: + cvtps2pd %xmm4, %xmm8 + cvtps2pd %xmm5, %xmm9 + cvtps2pd %xmm6, %xmm10 + cvtps2pd %xmm7, %xmm11 + + mulpd %xmm8, %xmm8 + mulpd %xmm9, %xmm9 + mulpd %xmm10, %xmm10 + mulpd %xmm11, %xmm11 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm1 + addpd %xmm10, %xmm2 + addpd %xmm11, %xmm3 + ALIGN_3 + + +.L14: + testq $4, M + je .L15 + + movsd 0 * SIZE(X), %xmm4 + movsd 2 * SIZE(X), %xmm5 + cvtps2pd %xmm4, %xmm6 + cvtps2pd %xmm5, %xmm7 + mulpd %xmm6, %xmm6 + mulpd %xmm7, %xmm7 + addpd %xmm6, %xmm0 + addpd %xmm7, %xmm1 + addq $4 * SIZE, X + ALIGN_3 + +.L15: + testq $2, M + je .L16 + + movsd 0 * SIZE(X), %xmm4 + cvtps2pd %xmm4, %xmm6 + mulpd %xmm6, %xmm6 + addpd %xmm6, %xmm2 + addq $2 * SIZE, X + ALIGN_3 + +.L16: + testq $1, M + je .L998 + + movss 0 * SIZE(X), %xmm4 + cvtss2sd %xmm4, %xmm6 + mulsd %xmm6, %xmm6 + addsd %xmm6, %xmm3 + jmp .L998 + ALIGN_4 + +.L40: + movq M, I + sarq $3, I + jle .L44 + ALIGN_4 + +.L41: + movss (X), %xmm4 + addq INCX, X + movss (X), %xmm5 + addq INCX, X + movss (X), %xmm6 + addq INCX, X + movss (X), %xmm7 + addq INCX, X + movss (X), %xmm8 + addq INCX, X + movss (X), %xmm9 + addq INCX, X + movss (X), %xmm10 + addq INCX, X + movss (X), %xmm11 + addq INCX, X + + cvtss2sd %xmm4, %xmm4 + cvtss2sd %xmm5, %xmm5 + cvtss2sd %xmm6, %xmm6 + cvtss2sd %xmm7, %xmm7 + cvtss2sd %xmm8, %xmm8 + cvtss2sd %xmm9, %xmm9 + cvtss2sd %xmm10, %xmm10 + cvtss2sd %xmm11, %xmm11 + + mulsd %xmm4, %xmm4 + mulsd %xmm5, %xmm5 + mulsd %xmm6, %xmm6 + mulsd %xmm7, %xmm7 + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + addsd %xmm6, %xmm2 + addsd %xmm7, %xmm3 + + mulsd %xmm8, %xmm8 + mulsd %xmm9, %xmm9 + mulsd %xmm10, %xmm10 + mulsd %xmm11, %xmm11 + + addsd %xmm8, %xmm0 + addsd %xmm9, %xmm1 + addsd %xmm10, %xmm2 + addsd %xmm11, %xmm3 + + decq I + jg .L41 + ALIGN_3 + +.L44: + testq $4, M + je .L45 + + movss (X), %xmm4 + addq INCX, X + movss (X), %xmm5 + addq INCX, X + movss (X), %xmm6 + addq INCX, X + movss (X), %xmm7 + addq INCX, X + + cvtss2sd %xmm4, %xmm8 + cvtss2sd %xmm5, %xmm9 + cvtss2sd %xmm6, %xmm10 + cvtss2sd %xmm7, %xmm11 + + mulsd %xmm8, %xmm8 + mulsd %xmm9, %xmm9 + mulsd %xmm10, %xmm10 + mulsd %xmm11, %xmm11 + + addsd %xmm8, %xmm0 + addsd %xmm9, %xmm1 + addsd %xmm10, %xmm2 + addsd %xmm11, %xmm3 + ALIGN_3 + +.L45: + testq $2, M + je .L46 + + movss (X), %xmm4 + addq INCX, X + movss (X), %xmm5 + addq INCX, X + + cvtss2sd %xmm4, %xmm6 + cvtss2sd %xmm5, %xmm7 + mulsd %xmm6, %xmm6 + mulsd %xmm7, %xmm7 + addsd %xmm6, %xmm1 + addsd %xmm7, %xmm2 + ALIGN_3 + +.L46: + testq $1, M + je .L998 + + movss (X), %xmm4 + cvtss2sd %xmm4, %xmm6 + mulsd %xmm6, %xmm6 + addsd %xmm6, %xmm3 + ALIGN_4 + +.L998: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#ifndef HAVE_SSE3 + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + addsd %xmm1, %xmm0 +#else + haddpd %xmm0, %xmm0 +#endif + ALIGN_4 + +.L999: + sqrtsd %xmm0, %xmm0 + + cvtsd2ss %xmm0, %xmm0 + + RESTOREREGISTERS + + ret + + EPILOGUE + diff --git a/kernel/x86_64/qconjg.S b/kernel/x86_64/qconjg.S new file mode 100644 index 0000000..49ca766 --- /dev/null +++ b/kernel/x86_64/qconjg.S @@ -0,0 +1,54 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + PROFCODE + + fldz + FLD 1 * SIZE(ARG1) + fsubrp %st, %st(1) + FLD 0 * SIZE(ARG1) + + FST 0 * SIZE(ARG2) + FST 1 * SIZE(ARG2) + ret + + EPILOGUE diff --git a/kernel/x86_64/qdot.S b/kernel/x86_64/qdot.S new file mode 100644 index 0000000..c958fc5 --- /dev/null +++ b/kernel/x86_64/qdot.S @@ -0,0 +1,208 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#include "l1param.h" + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + sall $BASE_SHIFT, INCX + sall $BASE_SHIFT, INCY + + fldz + fldz + fldz + fldz + + cmpl $SIZE, INCX + jne .L14 + cmpl $SIZE, INCY + jne .L14 + + movl N, %eax + sarl $2, %eax + jle .L15 + ALIGN_3 + +.L16: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + fmulp %st, %st(1) + faddp %st,%st(1) + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st,%st(2) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + FLD 2 * SIZE(X) + FLD 2 * SIZE(Y) + fmulp %st, %st(1) + faddp %st,%st(3) + FLD 3 * SIZE(X) + FLD 3 * SIZE(Y) + fmulp %st, %st(1) + faddp %st,%st(4) + addl $4 * SIZE, X + addl $4 * SIZE, Y + decl %eax + jg .L16 + ALIGN_3 + +.L15: + movl N, %eax + andl $3, %eax + jle .L27 + ALIGN_3 + +.L22: + FLD (X) + addl $SIZE, X + FLD (Y) + fmulp %st, %st(1) + addl $SIZE, Y + faddp %st,%st(1) + decl %eax + jg .L22 + + jmp .L27 + ALIGN_3 + +.L14: + movl N, %eax + sarl $2, %eax + jle .L30 + ALIGN_3 + +.L31: + FLD (X) + addl INCX, X + FLD (Y) + fmulp %st, %st(1) + addl INCY, Y + faddp %st,%st(1) + + FLD (X) + addl INCX, X + FLD (Y) + fmulp %st, %st(1) + addl INCY, Y + faddp %st,%st(2) + + FLD (X) + addl INCX, X + FLD (Y) + fmulp %st, %st(1) + addl INCY, Y + faddp %st,%st(3) + + FLD (X) + addl INCX, X + FLD (Y) + fmulp %st, %st(1) + addl INCY, Y + faddp %st,%st(4) + + decl %eax + jg .L31 + ALIGN_3 + +.L30: + movl N, %eax + andl $3, %eax + jle .L27 + ALIGN_3 + +.L37: + FLD (X) + addl INCX, X + FLD (Y) + fmulp %st, %st(1) + addl INCY, Y + faddp %st, %st(1) + decl %eax + jg .L37 + ALIGN_3 + +.L27: + faddp %st,%st(2) + faddp %st,%st(2) + faddp %st,%st(1) + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86_64/qgemm_kernel_2x2.S b/kernel/x86_64/qgemm_kernel_2x2.S new file mode 100644 index 0000000..9db145b --- /dev/null +++ b/kernel/x86_64/qgemm_kernel_2x2.S @@ -0,0 +1,810 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define N ARG2 +#define K ARG3 +#define A ARG4 +#define B ARG5 +#define C ARG6 +#define LDC %r10 + +#define I %r12 +#define J %r13 +#define AO %r14 +#define BO %r15 +#define CO %rbp + +#define KK %r11 +#define KKK 48(%rsp) + +#define STACKSIZE 64 + +#define ALPHA 8 + STACKSIZE(%rsp) +#define OFFSET 32 + STACKSIZE(%rsp) + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#define PREFETCHSIZE (5 + 4 * 10) + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + movq 24 + STACKSIZE(%rsp), LDC + +#if defined(TRMMKERNEL) && !defined(LEFT) + movq OFFSET, %rax + negq %rax + movq %rax, KK +#endif + + addq $8 * SIZE, A + addq $8 * SIZE, B + + salq $BASE_SHIFT, LDC + + movq N, %rax + sarq $1, %rax + movq %rax, J + je .L30 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO + + movq C, CO + leaq (, LDC, 2), %rax + addq %rax, C + + movq M, I + sarq $1, I + je .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + fldz + fldz + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) + prefetchw 2 * SIZE(CO, LDC, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) + prefetchnta 2 * SIZE(CO, LDC, 1) +#endif + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + FLD -6 * SIZE(AO) + + FLD -6 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + + FLD -5 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) + + FLD -4 * SIZE(AO) + + FLD -4 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -3 * SIZE(BO) + fmul %st, %st(2) + + FLD -3 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + FLD -2 * SIZE(AO) + + FLD -2 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -1 * SIZE(BO) + fmul %st, %st(2) + + FLD -1 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + addq $8 * SIZE,AO + addq $8 * SIZE,BO + + decq %rax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + and $3, %rax + je .L18 + ALIGN_4 + +.L16: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + addq $2 * SIZE,AO + addq $2 * SIZE,BO + + decq %rax + jne .L16 + ALIGN_4 + +.L18: +#ifndef TRMMKERNEL + FLD ALPHA + + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + FLD 1 * SIZE(CO) + faddp %st, %st(1) + FST 1 * SIZE(CO) + + FLD 0 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 0 * SIZE(CO, LDC) + + FLD 1 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 1 * SIZE(CO, LDC) +#else + FST 0 * SIZE(CO) + FST 1 * SIZE(CO) + FST 0 * SIZE(CO, LDC) + FST 1 * SIZE(CO, LDC) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO + decq I + jne .L11 + ALIGN_4 + +.L20: + movq M, %rax + andq $1, %rax + je .L29 + ALIGN_4 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq ( B, %rax, 2), BO +#endif + + fldz + fldz + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + + FLD -6 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -4 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + + FLD -2 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $4 * SIZE,AO + addq $8 * SIZE,BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + and $3, %rax + je .L28 + ALIGN_4 + +.L26: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $1 * SIZE,AO + addq $2 * SIZE,BO + + decq %rax + jne .L26 + ALIGN_4 + +.L28: +#ifndef TRMMKERNEL + FLD ALPHA + + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + FLD 0 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 0 * SIZE(CO, LDC) +#else + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + movq BO, B + decq J + jne .L01 + ALIGN_4 + +.L30: + movq N, %rax + testq $1, %rax + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO + + movq C, CO + addq LDC, C + + movq M, I + sarq $1, I + je .L40 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq ( B, %rax, 1), BO +#endif + + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) +#endif + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(BO) + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + FLD -6 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(BO) + FLD -4 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + FLD -2 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $8 * SIZE,AO + addq $4 * SIZE,BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + and $3, %rax + je .L38 + ALIGN_4 + +.L36: + FLD -8 * SIZE(BO) + + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $2 * SIZE,AO + addq $1 * SIZE,BO + + decq %rax + jne .L36 + ALIGN_4 + +.L38: +#ifndef TRMMKERNEL + FLD ALPHA + + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + FLD 1 * SIZE(CO) + faddp %st, %st(1) + FST 1 * SIZE(CO) +#else + FST 0 * SIZE(CO) + FST 1 * SIZE(CO) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO + decq I + jne .L31 + ALIGN_4 + +.L40: + movq M, %rax + andq $1, %rax + je .L49 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq ( B, %rax, 1), BO +#endif + + fldz + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -7 * SIZE(AO) + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -6 * SIZE(AO) + FLD -6 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -5 * SIZE(AO) + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addq $4 * SIZE,AO + addq $4 * SIZE,BO + + decq %rax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + and $3, %rax + je .L48 + ALIGN_4 + +.L46: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addq $1 * SIZE,AO + addq $1 * SIZE,BO + + decq %rax + jne .L46 + ALIGN_4 + +.L48: +#ifndef TRMMKERNEL + FLD ALPHA + + fmulp %st, %st(1) + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) +#else + FST 0 * SIZE(CO) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $1, KK +#endif + + movq BO, B + ALIGN_4 + +.L999: + EMMS + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/qgemv_n.S b/kernel/x86_64/qgemv_n.S new file mode 100644 index 0000000..28415ec --- /dev/null +++ b/kernel/x86_64/qgemv_n.S @@ -0,0 +1,410 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#define P 32 + +#define STACKSIZE 80 + +#define ALPHA 8 + STACKSIZE(%rsp) +#define OLD_INCX 24 + STACKSIZE(%rsp) +#define OLD_Y 32 + STACKSIZE(%rsp) +#define OLD_INCY 40 + STACKSIZE(%rsp) +#define BUFFER 48 + STACKSIZE(%rsp) + +#define PLDA_M 56 (%rsp) +#define IS 64 (%rsp) + +#define M %rdi +#define N %rsi +#define A %rcx +#define LDA %r8 +#define X %r9 +#define INCX %rdx +#define Y %rbp +#define INCY %r10 + +#define TEMP %rax +#define I %rax +#define J %r11 +#define A1 %r12 +#define X1 %r13 +#define Y1 %r14 +#define XP %r15 +/* #define BUFFER %r15 */ +#define MIN_N %rbx + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + movq OLD_INCX, INCX + movq OLD_Y, Y + movq OLD_INCY, INCY + + FLD ALPHA + + salq $BASE_SHIFT, INCX + salq $BASE_SHIFT, INCY + + movq $0, IS + + test M, M + jle .L79 # goto END + test N, N + jle .L79 # goto END + + movq LDA, %rax + imulq $P, %rax # P * lda + subq M ,%rax # P * lda - m + salq $BASE_SHIFT, %rax + movq %rax, PLDA_M + + salq $BASE_SHIFT, LDA + ALIGN_2 + +.L32: + movq $P, %rax + movq N, MIN_N + subq IS, MIN_N + cmpq %rax, MIN_N + cmovg %rax, MIN_N + + movq IS, XP + salq $BASE_SHIFT, XP + leaq (X,XP, 1), XP + + cmpq $SIZE, INCX + je .L34 # if incx == 1 goto L34 + + movq BUFFER, XP + movq XP, X1 + + movq MIN_N, I + sarq $2,I + jle .L35 + ALIGN_2 + +.L36: + FLD (X) + addq INCX,X + FLD (X) + addq INCX,X + FLD (X) + addq INCX,X + FLD (X) + addq INCX,X + + FST 3 * SIZE(X1) + FST 2 * SIZE(X1) + FST 1 * SIZE(X1) + FST 0 * SIZE(X1) + + addq $4 * SIZE, X1 + decq I + jg .L36 + ALIGN_3 + +.L35: + movq MIN_N, I + andq $3, I + jle .L34 + ALIGN_2 + +.L42: + FLD (X) + addq INCX, X + FST (X1) + addq $SIZE, X1 + decq I + jg .L42 + ALIGN_3 + +/* Main Routine */ +.L34: + movq Y, Y1 + movq M, J + sarq $2, J + jle .L47 + ALIGN_2 + +.L48: + movq A, A1 # a_offset = a + fldz + addq $4 * SIZE, A # a += 4 + fldz + movq XP, X1 # b_offset = xp + fldz + movq MIN_N, I # i = min_n + fldz + FLD (X1) # bt1 = b_offset + sarq $1, I + jle .L51 + ALIGN_2 + +.L80: + FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + FLD 1 * SIZE(A1) # at1 = *(a_offset + 1) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(3) # ct2 += at1 + FLD 2 * SIZE(A1) # at1 = *(a_offset + 2) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + FLD 3 * SIZE(A1) # bt1 *= *(a_offset + 3) + + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += at1 + FLD 1 * SIZE(X1) # bt1 = b_offset + + addq LDA, A1 # a_offset += lda + FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + FLD 1 * SIZE(A1) # at1 = *(a_offset + 1) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(3) # ct2 += at1 + FLD 2 * SIZE(A1) # at1 = *(a_offset + 2) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + FLD 3 * SIZE(A1) # bt1 *= *(a_offset + 3) + + fmulp %st, %st(1) + addq LDA, A1 + faddp %st, %st(4) # ct4 += at1 + + FLD 2 * SIZE(X1) # bt1 = b_offset + addq $2 * SIZE, X1 # b_offset += 2 + + decq I + jg .L80 + +.L51: + movq MIN_N, I + andq $1, I + je .L57 + + FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(A1) # at1 = *(a_offset + 1) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(3) # ct2 += at1 + + FLD 2 * SIZE(A1) # at1 = *(a_offset + 2) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 3 * SIZE(A1) # bt1 *= *(a_offset + 3) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += at1 + fldz + ALIGN_2 + +.L57: + ffreep %st(0) + + fxch %st(4) + fmul %st, %st(4) + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fxch %st(4) + + FLD (Y1) + faddp %st, %st(1) + FST (Y1) + addq INCY, Y1 + + FLD (Y1) + faddp %st, %st(1) + FST (Y1) + addq INCY, Y1 + + FLD (Y1) + faddp %st, %st(1) + FST (Y1) + addq INCY, Y1 + + FLD (Y1) + faddp %st, %st(1) + FST (Y1) + addq INCY, Y1 + + decq J # j -- + jg .L48 + ALIGN_3 + +.L47: + movq M, J + andq $3, J # j = (m & 3) + jle .L60 + ALIGN_2 + +.L61: + movq A, A1 # a_offset = a + fldz + addq $SIZE, A # a++ + fldz + movq XP, X1 + fldz + fldz + movq MIN_N, I + sarq $3, I + jle .L64 + ALIGN_2 + +.L65: + FLD 0 * SIZE(X1) + FLD (A1) + fmulp %st, %st(1) + faddp %st, %st(1) + addq LDA, A1 + + FLD 1 * SIZE(X1) + FLD (A1) + fmulp %st, %st(1) + faddp %st, %st(2) + addq LDA ,A1 + + FLD 2 * SIZE(X1) + FLD (A1) + fmulp %st, %st(1) + faddp %st, %st(3) + addq LDA, A1 + + FLD 3 * SIZE(X1) + FLD (A1) + fmulp %st, %st(1) + faddp %st, %st(4) + addq LDA, A1 + + FLD 4 * SIZE(X1) + FLD (A1) + fmulp %st, %st(1) + faddp %st,%st(1) + addq LDA, A1 + + FLD 5 * SIZE(X1) + FLD (A1) + fmulp %st, %st(1) + faddp %st, %st(2) + addq LDA, A1 + + FLD 6 * SIZE(X1) + FLD (A1) + fmulp %st, %st(1) + faddp %st,%st(3) + addq LDA, A1 + + FLD 7 * SIZE(X1) + FLD (A1) + fmulp %st, %st(1) + faddp %st,%st(4) + addq LDA, A1 + + addq $8 * SIZE, X1 + decq I + jg .L65 + +.L64: + movq MIN_N,I + andq $7, I + jle .L70 + ALIGN_2 + +.L71: + FLD (X1) + addq $SIZE, X1 + FLD (A1) + fmulp %st, %st(1) + addq LDA, A1 # a_offset += lda + faddp %st, %st(1) + decq I + jg .L71 + ALIGN_2 + +.L70: + faddp %st, %st(1) + faddp %st, %st(1) + faddp %st, %st(1) + + fmul %st(1), %st + FLD (Y1) + faddp %st, %st(1) + FST (Y1) + addq INCY, Y1 + decq J + jg .L61 + +.L60: + addq PLDA_M, A + addq $P, IS + cmpq N, IS + jl .L32 + +.L79: + EMMS + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + addq $STACKSIZE, %rsp + ret + EPILOGUE diff --git a/kernel/x86_64/qgemv_t.S b/kernel/x86_64/qgemv_t.S new file mode 100644 index 0000000..9402f21 --- /dev/null +++ b/kernel/x86_64/qgemv_t.S @@ -0,0 +1,466 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#define STACKSIZE 80 +#define P 4096 + +#define ALPHA 8 + STACKSIZE(%rsp) +#define OLD_INCX 24 + STACKSIZE(%rsp) +#define OLD_Y 32 + STACKSIZE(%rsp) +#define OLD_INCY 40 + STACKSIZE(%rsp) +#define BUFFER 48 + STACKSIZE(%rsp) + +#define NLDA 56 (%rsp) +#define IS 64 (%rsp) +#define XP 72 (%rsp) + +#define M %rdi +#define N %rsi +#define A %rcx +#define LDA %r8 +#define X %r9 +#define INCX %rdx +#define Y %rbp +#define INCY %r10 + +#define TEMP %rax +#define I %rax +#define J %r11 +#define A1 %r12 +#define A2 %r15 +#define X1 %r13 +#define Y1 %r14 +#define MIN_M %rbx + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + movq OLD_INCX, INCX + movq OLD_Y, Y + movq OLD_INCY, INCY + + FLD ALPHA + + salq $BASE_SHIFT, INCX + salq $BASE_SHIFT, INCY + + movq $0, IS + + test M, M + jle .L79 # goto END + test N, N + jle .L79 # goto END + + movq N, %rax + imulq LDA, %rax + movq $P, NLDA + subq %rax, NLDA + salq $BASE_SHIFT, NLDA + + salq $BASE_SHIFT, LDA + ALIGN_2 + +.L32: + movq $P, %rax + movq M, MIN_M + subq IS , MIN_M + cmpq %rax, MIN_M + cmovg %rax, MIN_M + + movq IS, X1 + salq $BASE_SHIFT, X1 + leaq (X,X1, 1), X1 + + movq X1, XP + + cmpq $SIZE, INCX + je .L34 + + movq BUFFER, X1 + movq X1, XP + + movq MIN_M, I + sarq $2, I + jle .L35 + ALIGN_3 + +.L36: + FLD (X) + addq INCX, X + FST 0 * SIZE(X1) + + FLD (X) + addq INCX, X + FST 1 * SIZE(X1) + + FLD (X) + addq INCX, X + FST 2 * SIZE(X1) + + FLD (X) + addq INCX, X + FST 3 * SIZE(X1) + + addq $4 * SIZE, X1 + decq I + jg .L36 + ALIGN_3 + +.L35: + movq MIN_M, I + andq $3,I + jle .L34 + ALIGN_2 + +.L42: + FLD (X) + addq INCX, X + FST (X1) + addq $SIZE, X1 + decq I + jg .L42 + ALIGN_3 + +/* Main Routine */ + +.L34: + movq Y, Y1 # coffset = y + + movq N, J + sarq $2, J + jle .L47 + ALIGN_3 + +.L48: + movq A, A1 + leaq (A, LDA), A2 + leaq (A, LDA, 4), A + + fldz + fldz + fldz + fldz + + movq XP, X1 + FLD (X1) + + movq MIN_M, I + sarq $2,I + jle .L51 + ALIGN_3 + +.L80: + FLD 0 * SIZE(A1) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + + faddp %st,%st(2) # ct1 += at1 + FLD 0 * SIZE(A2) # at1 = *(a_offset2 + 0 * lda) + + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD 0 * SIZE(A1, LDA, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + + faddp %st,%st(4) + FLD 0 * SIZE(A2, LDA, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + + faddp %st,%st(4) + FLD 1 * SIZE(X1) + FLD 1 * SIZE(A1) # at = *(a_offset + 0 * lda) + + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + FLD 1 * SIZE(A2) # at1 = *(a_offset2 + 0 * lda) + + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + FLD 1 * SIZE(A1, LDA, 2) # at = *(a_offset + 2 * lda) + + fmul %st(1),%st + faddp %st,%st(4) + FLD 1 * SIZE(A2, LDA, 2) # at1 = *(a_offset2 + 2 * lda) + + fmulp %st, %st(1) + faddp %st,%st(4) + FLD 2 * SIZE(X1) + + FLD 2 * SIZE(A1) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + + FLD 2 * SIZE(A2) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD 2 * SIZE(A1, LDA, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + faddp %st,%st(4) + + FLD 2 * SIZE(A2, LDA, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + faddp %st,%st(4) + + FLD 3 * SIZE(X1) + FLD 3 * SIZE(A1) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + + faddp %st,%st(2) # ct1 += at1 + FLD 3 * SIZE(A2) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + + faddp %st,%st(3) # ct2 += at1 + FLD 3 * SIZE(A1, LDA, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + + faddp %st,%st(4) + FLD 3 * SIZE(A2, LDA, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + + addq $4 * SIZE, A1 + faddp %st,%st(4) + addq $4 * SIZE, A2 + + FLD 4 * SIZE(X1) + addq $4 * SIZE, X1 + + decq I + jg .L80 + ALIGN_3 + +.L51: + movq MIN_M, I + andq $3, I + je .L81 + ALIGN_3 + +.L52: + FLD (A1) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + + FLD (A2) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD (A1, LDA, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + faddp %st,%st(4) + + FLD (A2, LDA, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + faddp %st,%st(4) + FLD 1 * SIZE(X1) + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, X1 + decq I + jg .L52 + ALIGN_3 + +.L81: + ffreep %st(0) + + fxch %st(4) + fmul %st, %st(4) + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fxch %st(4) + + FLD (Y1) + faddp %st, %st(1) + FST (Y1) + addq INCY, Y1 + + FLD (Y1) + faddp %st, %st(1) + FST (Y1) + addq INCY, Y1 + + FLD (Y1) + faddp %st, %st(1) + FST (Y1) + addq INCY, Y1 + + FLD (Y1) + faddp %st, %st(1) + FST (Y1) + addq INCY, Y1 + + decq J + jg .L48 + ALIGN_3 + +.L47: + movq N, J + andq $3, J + jle .L60 + ALIGN_2 + +.L61: + movq A, A1 # a_offset = a + fldz # ct1 = ZERO + fldz # ct1 = ZERO + + addq LDA, A + fldz # ct1 = ZERO + fldz # ct1 = ZERO + + movq XP, X1 + + movq MIN_M, I + sarq $3,I + jle .L64 + ALIGN_3 + +.L65: + FLD 0 * SIZE(X1) + FLD 0 * SIZE(A1) + fmulp %st, %st(1) + faddp %st,%st(1) + + FLD 1 * SIZE(X1) + FLD 1 * SIZE(A1) + fmulp %st, %st(1) + faddp %st,%st(2) + + FLD 2 * SIZE(X1) + FLD 2 * SIZE(A1) + fmulp %st, %st(1) + faddp %st,%st(3) + + FLD 3 * SIZE(X1) + FLD 3 * SIZE(A1) + fmulp %st, %st(1) + faddp %st,%st(4) + + FLD 4 * SIZE(X1) + FLD 4 * SIZE(A1) + fmulp %st, %st(1) + faddp %st,%st(1) + + FLD 5 * SIZE(X1) + FLD 5 * SIZE(A1) + fmulp %st, %st(1) + faddp %st,%st(2) + + FLD 6 * SIZE(X1) + FLD 6 * SIZE(A1) + fmulp %st, %st(1) + faddp %st,%st(3) + + FLD 7 * SIZE(X1) + FLD 7 * SIZE(A1) + fmulp %st, %st(1) + faddp %st,%st(4) + + addq $8 * SIZE, X1 + addq $8 * SIZE, A1 + + decq I + jg .L65 + ALIGN_3 + +.L64: + movq MIN_M, I + andq $7, I + jle .L70 + ALIGN_3 + +.L71: + FLD (X1) + FLD (A1) + fmulp %st, %st(1) + faddp %st,%st(1) + + addq $SIZE, X1 + addq $SIZE, A1 + decq I + jg .L71 + ALIGN_3 + +.L70: + faddp %st, %st(1) + faddp %st, %st(1) + faddp %st, %st(1) + + fmul %st(1),%st + FLD (Y1) + faddp %st, %st(1) + FST (Y1) + addq INCY, Y1 + decq J + jg .L61 + ALIGN_3 + +.L60: + addq NLDA, A + + addq $P, IS + cmpq M, IS + jl .L32 + ALIGN_3 + +.L79: + EMMS + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + addq $STACKSIZE, %rsp + ret + EPILOGUE diff --git a/kernel/x86_64/qtrsm_kernel_LN_2x2.S b/kernel/x86_64/qtrsm_kernel_LN_2x2.S new file mode 100644 index 0000000..7093eba --- /dev/null +++ b/kernel/x86_64/qtrsm_kernel_LN_2x2.S @@ -0,0 +1,1234 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define N ARG2 +#define K ARG3 +#define A ARG4 +#define B ARG5 +#define C ARG6 +#define LDC %r10 + +#define I %r12 +#define J %r13 +#define AO %r14 +#define BO %r15 +#define CO %rbp + +#define KK %r11 +#define AORIG 48(%rsp) + +#define STACKSIZE 64 + +#define ALPHA 8 + STACKSIZE(%rsp) +#define OFFSET 32 + STACKSIZE(%rsp) + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#define PREFETCHSIZE (5 + 4 * 10) + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + movq 24 + STACKSIZE(%rsp), LDC + +#if defined(TRMMKERNEL) && !defined(LEFT) + movq OFFSET, %rax + negq %rax + movq %rax, KK +#endif + + addq $8 * SIZE, A + addq $8 * SIZE, B + + salq $BASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $BASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $BASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + movq OFFSET, %rax + negq %rax + movq %rax, KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, %rax + sarq $1, %rax + movq %rax, J + je .L30 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, %rax + movq %rax, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B +#endif + + lea (, LDC, 2), %rax + +#ifdef RT + subq %rax, C +#endif + movq C, CO +#ifndef RT + addq %rax, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, %rax + andq $1, %rax + je .L20 + ALIGN_4 + +.L21: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + fldz + fldz + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + + FLD -6 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -4 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + + FLD -2 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $4 * SIZE,AO + addq $8 * SIZE,BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + and $3, %rax + je .L28 + ALIGN_4 + +.L26: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $1 * SIZE,AO + addq $2 * SIZE,BO + + decq %rax + jne .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + salq $BASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(2) +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + + FLD -7 * SIZE(BO) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD -5 * SIZE(BO) + fmulp %st, %st(2) + + FLD -6 * SIZE(BO) + fmul %st(2), %st + + fsubrp %st, %st(1) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef LN + subq $1 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(1) + fld %st + FST -7 * SIZE(AO) +#endif + + FST 0 * SIZE(CO, LDC) + FST 0 * SIZE(CO) + +#ifndef LN + addq $1 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L20: + movq M, I + sarq $1, I + je .L29 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + fldz + fldz + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) + prefetchw 2 * SIZE(CO, LDC, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) + prefetchnta 2 * SIZE(CO, LDC, 1) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -6 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + + FLD -5 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) + + FLD -4 * SIZE(AO) + + FLD -4 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -3 * SIZE(BO) + fmul %st, %st(2) + + FLD -3 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + FLD -2 * SIZE(AO) + + FLD -2 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -1 * SIZE(BO) + fmul %st, %st(2) + + FLD -1 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + addq $8 * SIZE,AO + addq $8 * SIZE,BO + + decq %rax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + and $3, %rax + je .L18 + ALIGN_4 + +.L16: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + addq $2 * SIZE,AO + addq $2 * SIZE,BO + + decq %rax + jne .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + salq $BASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) + FLD -6 * SIZE(BO) + fsubp %st, %st(3) + FLD -5 * SIZE(BO) + fsubp %st, %st(4) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(3) + FLD -6 * SIZE(AO) + fsubp %st, %st(2) + FLD -5 * SIZE(AO) + fsubp %st, %st(4) +#endif + +#ifdef LN + FLD -5 * SIZE(AO) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD -6 * SIZE(AO) + fmul %st(3), %st + FLD -6 * SIZE(AO) + fmul %st(5), %st + + fsubrp %st, %st(3) + fsubrp %st, %st(1) + + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st(1), %st + FLD -7 * SIZE(AO) + fmul %st(3), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(3) + + FLD -5 * SIZE(AO) + fmul %st, %st(3) + fmulp %st, %st(4) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st(1), %st + FLD -7 * SIZE(BO) + fmul %st(4), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(2) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + fmulp %st, %st(4) +#endif + +#ifdef RT + FLD -5 * SIZE(BO) + fmul %st, %st(2) + fmulp %st, %st(4) + + FLD -6 * SIZE(BO) + fmul %st(2), %st + FLD -6 * SIZE(BO) + fmul %st(5), %st + + fsubrp %st, %st(4) + fsubrp %st, %st(1) + + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(3) +#endif + +#ifdef LN + subq $2 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) + fxch %st(2) + fld %st + FST -6 * SIZE(BO) + fxch %st(3) + fld %st + FST -5 * SIZE(BO) + + FST 1 * SIZE(CO, LDC) + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) + FST 1 * SIZE(CO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(2) + fld %st + FST -7 * SIZE(AO) + fxch %st(1) + fld %st + FST -6 * SIZE(AO) + fxch %st(3) + fld %st + FST -5 * SIZE(AO) + + FST 1 * SIZE(CO, LDC) + FST 1 * SIZE(CO) + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) +#endif + +#ifndef LN + addq $2 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + jne .L11 + ALIGN_4 + +.L29: +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J + jne .L01 + ALIGN_4 + +.L30: + movq N, %rax + testq $1, %rax + je .L999 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, %rax + movq %rax, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#ifdef RT + subq LDC, C +#endif + movq C, CO +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, %rax + andq $1, %rax + je .L40 + ALIGN_4 + +.L41: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + fldz + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -7 * SIZE(AO) + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -6 * SIZE(AO) + FLD -6 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -5 * SIZE(AO) + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addq $4 * SIZE,AO + addq $4 * SIZE,BO + + decq %rax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + and $3, %rax + je .L48 + ALIGN_4 + +.L46: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addq $1 * SIZE,AO + addq $1 * SIZE,BO + + decq %rax + jne .L46 + ALIGN_4 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + salq $BASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) +#endif + +#ifdef LN + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef RT + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef LN + subq $1 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) +#endif + + FST 0 * SIZE(CO) + +#ifndef LN + addq $1 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L40: + movq M, I + sarq $1, I + je .L49 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(BO) + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + FLD -6 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(BO) + FLD -4 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + FLD -2 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $8 * SIZE,AO + addq $4 * SIZE,BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + and $3, %rax + je .L38 + ALIGN_4 + +.L36: + FLD -8 * SIZE(BO) + + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $2 * SIZE,AO + addq $1 * SIZE,BO + + decq %rax + jne .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + salq $BASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(2) +#endif + +#ifdef LN + FLD -5 * SIZE(AO) + fmulp %st, %st(2) + + FLD -6 * SIZE(AO) + fmul %st(2), %st + + fsubrp %st, %st(1) + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmulp %st, %st(1) + + FLD -7 * SIZE(AO) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LN + subq $2 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(1) + fld %st + FST -7 * SIZE(AO) +#endif + + FST 1 * SIZE(CO) + FST 0 * SIZE(CO) + +#ifndef LN + addq $2 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + jne .L31 + ALIGN_4 + +.L49: +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/qtrsm_kernel_LT_2x2.S b/kernel/x86_64/qtrsm_kernel_LT_2x2.S new file mode 100644 index 0000000..d2a05a1 --- /dev/null +++ b/kernel/x86_64/qtrsm_kernel_LT_2x2.S @@ -0,0 +1,1234 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define N ARG2 +#define K ARG3 +#define A ARG4 +#define B ARG5 +#define C ARG6 +#define LDC %r10 + +#define I %r12 +#define J %r13 +#define AO %r14 +#define BO %r15 +#define CO %rbp + +#define KK %r11 +#define AORIG 48(%rsp) + +#define STACKSIZE 64 + +#define ALPHA 8 + STACKSIZE(%rsp) +#define OFFSET 32 + STACKSIZE(%rsp) + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#define PREFETCHSIZE (5 + 4 * 10) + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + movq 24 + STACKSIZE(%rsp), LDC + +#if defined(TRMMKERNEL) && !defined(LEFT) + movq OFFSET, %rax + negq %rax + movq %rax, KK +#endif + + addq $8 * SIZE, A + addq $8 * SIZE, B + + salq $BASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $BASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $BASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + movq OFFSET, %rax + negq %rax + movq %rax, KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, %rax + sarq $1, %rax + movq %rax, J + je .L30 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, %rax + movq %rax, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B +#endif + + lea (, LDC, 2), %rax + +#ifdef RT + subq %rax, C +#endif + movq C, CO +#ifndef RT + addq %rax, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $1, I + je .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + fldz + fldz + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) + prefetchw 2 * SIZE(CO, LDC, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) + prefetchnta 2 * SIZE(CO, LDC, 1) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -6 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + + FLD -5 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) + + FLD -4 * SIZE(AO) + + FLD -4 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -3 * SIZE(BO) + fmul %st, %st(2) + + FLD -3 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + FLD -2 * SIZE(AO) + + FLD -2 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -1 * SIZE(BO) + fmul %st, %st(2) + + FLD -1 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + addq $8 * SIZE,AO + addq $8 * SIZE,BO + + decq %rax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + and $3, %rax + je .L18 + ALIGN_4 + +.L16: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + addq $2 * SIZE,AO + addq $2 * SIZE,BO + + decq %rax + jne .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + salq $BASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) + FLD -6 * SIZE(BO) + fsubp %st, %st(3) + FLD -5 * SIZE(BO) + fsubp %st, %st(4) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(3) + FLD -6 * SIZE(AO) + fsubp %st, %st(2) + FLD -5 * SIZE(AO) + fsubp %st, %st(4) +#endif + +#ifdef LN + FLD -5 * SIZE(AO) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD -6 * SIZE(AO) + fmul %st(3), %st + FLD -6 * SIZE(AO) + fmul %st(5), %st + + fsubrp %st, %st(3) + fsubrp %st, %st(1) + + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st(1), %st + FLD -7 * SIZE(AO) + fmul %st(3), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(3) + + FLD -5 * SIZE(AO) + fmul %st, %st(3) + fmulp %st, %st(4) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st(1), %st + FLD -7 * SIZE(BO) + fmul %st(4), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(2) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + fmulp %st, %st(4) +#endif + +#ifdef RT + FLD -5 * SIZE(BO) + fmul %st, %st(2) + fmulp %st, %st(4) + + FLD -6 * SIZE(BO) + fmul %st(2), %st + FLD -6 * SIZE(BO) + fmul %st(5), %st + + fsubrp %st, %st(4) + fsubrp %st, %st(1) + + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(3) +#endif + +#ifdef LN + subq $2 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) + fxch %st(2) + fld %st + FST -6 * SIZE(BO) + fxch %st(3) + fld %st + FST -5 * SIZE(BO) + + FST 1 * SIZE(CO, LDC) + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) + FST 1 * SIZE(CO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(2) + fld %st + FST -7 * SIZE(AO) + fxch %st(1) + fld %st + FST -6 * SIZE(AO) + fxch %st(3) + fld %st + FST -5 * SIZE(AO) + + FST 1 * SIZE(CO, LDC) + FST 1 * SIZE(CO) + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) +#endif + +#ifndef LN + addq $2 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + jne .L11 + ALIGN_4 + +.L20: + movq M, %rax + andq $1, %rax + je .L29 + ALIGN_4 + +.L21: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + fldz + fldz + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + + FLD -6 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -4 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + + FLD -2 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $4 * SIZE,AO + addq $8 * SIZE,BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + and $3, %rax + je .L28 + ALIGN_4 + +.L26: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $1 * SIZE,AO + addq $2 * SIZE,BO + + decq %rax + jne .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + salq $BASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(2) +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + + FLD -7 * SIZE(BO) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD -5 * SIZE(BO) + fmulp %st, %st(2) + + FLD -6 * SIZE(BO) + fmul %st(2), %st + + fsubrp %st, %st(1) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef LN + subq $1 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(1) + fld %st + FST -7 * SIZE(AO) +#endif + + FST 0 * SIZE(CO, LDC) + FST 0 * SIZE(CO) + +#ifndef LN + addq $1 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J + jne .L01 + ALIGN_4 + +.L30: + movq N, %rax + testq $1, %rax + je .L999 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, %rax + movq %rax, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#ifdef RT + subq LDC, C +#endif + movq C, CO +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $1, I + je .L40 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(BO) + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + FLD -6 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(BO) + FLD -4 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + FLD -2 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $8 * SIZE,AO + addq $4 * SIZE,BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + and $3, %rax + je .L38 + ALIGN_4 + +.L36: + FLD -8 * SIZE(BO) + + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $2 * SIZE,AO + addq $1 * SIZE,BO + + decq %rax + jne .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + salq $BASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(2) +#endif + +#ifdef LN + FLD -5 * SIZE(AO) + fmulp %st, %st(2) + + FLD -6 * SIZE(AO) + fmul %st(2), %st + + fsubrp %st, %st(1) + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmulp %st, %st(1) + + FLD -7 * SIZE(AO) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LN + subq $2 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(1) + fld %st + FST -7 * SIZE(AO) +#endif + + FST 1 * SIZE(CO) + FST 0 * SIZE(CO) + +#ifndef LN + addq $2 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + jne .L31 + ALIGN_4 + +.L40: + movq M, %rax + andq $1, %rax + je .L49 + ALIGN_4 + +.L41: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + fldz + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -7 * SIZE(AO) + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -6 * SIZE(AO) + FLD -6 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -5 * SIZE(AO) + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addq $4 * SIZE,AO + addq $4 * SIZE,BO + + decq %rax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + and $3, %rax + je .L48 + ALIGN_4 + +.L46: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addq $1 * SIZE,AO + addq $1 * SIZE,BO + + decq %rax + jne .L46 + ALIGN_4 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + salq $BASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) +#endif + +#ifdef LN + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef RT + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef LN + subq $1 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) +#endif + + FST 0 * SIZE(CO) + +#ifndef LN + addq $1 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L49: +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/qtrsm_kernel_RT_2x2.S b/kernel/x86_64/qtrsm_kernel_RT_2x2.S new file mode 100644 index 0000000..288aa07 --- /dev/null +++ b/kernel/x86_64/qtrsm_kernel_RT_2x2.S @@ -0,0 +1,1234 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define N ARG2 +#define K ARG3 +#define A ARG4 +#define B ARG5 +#define C ARG6 +#define LDC %r10 + +#define I %r12 +#define J %r13 +#define AO %r14 +#define BO %r15 +#define CO %rbp + +#define KK %r11 +#define AORIG 48(%rsp) + +#define STACKSIZE 64 + +#define ALPHA 8 + STACKSIZE(%rsp) +#define OFFSET 32 + STACKSIZE(%rsp) + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#define PREFETCHSIZE (5 + 4 * 10) + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + movq 24 + STACKSIZE(%rsp), LDC + +#if defined(TRMMKERNEL) && !defined(LEFT) + movq OFFSET, %rax + negq %rax + movq %rax, KK +#endif + + addq $8 * SIZE, A + addq $8 * SIZE, B + + salq $BASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $BASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $BASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + movq OFFSET, %rax + negq %rax + movq %rax, KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, %rax + testq $1, %rax + je .L30 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, %rax + movq %rax, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#ifdef RT + subq LDC, C +#endif + movq C, CO +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $1, I + je .L40 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(BO) + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + FLD -6 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(BO) + FLD -4 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + FLD -2 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $8 * SIZE,AO + addq $4 * SIZE,BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + and $3, %rax + je .L38 + ALIGN_4 + +.L36: + FLD -8 * SIZE(BO) + + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $2 * SIZE,AO + addq $1 * SIZE,BO + + decq %rax + jne .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + salq $BASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(2) +#endif + +#ifdef LN + FLD -5 * SIZE(AO) + fmulp %st, %st(2) + + FLD -6 * SIZE(AO) + fmul %st(2), %st + + fsubrp %st, %st(1) + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmulp %st, %st(1) + + FLD -7 * SIZE(AO) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LN + subq $2 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(1) + fld %st + FST -7 * SIZE(AO) +#endif + + FST 1 * SIZE(CO) + FST 0 * SIZE(CO) + +#ifndef LN + addq $2 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + jne .L31 + ALIGN_4 + +.L40: + movq M, %rax + andq $1, %rax + je .L49 + ALIGN_4 + +.L41: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + fldz + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -7 * SIZE(AO) + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -6 * SIZE(AO) + FLD -6 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -5 * SIZE(AO) + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addq $4 * SIZE,AO + addq $4 * SIZE,BO + + decq %rax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + and $3, %rax + je .L48 + ALIGN_4 + +.L46: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addq $1 * SIZE,AO + addq $1 * SIZE,BO + + decq %rax + jne .L46 + ALIGN_4 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + salq $BASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) +#endif + +#ifdef LN + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef RT + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef LN + subq $1 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) +#endif + + FST 0 * SIZE(CO) + +#ifndef LN + addq $1 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L49: +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L30: + movq N, %rax + sarq $1, %rax + movq %rax, J + je .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, %rax + movq %rax, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B +#endif + + lea (, LDC, 2), %rax + +#ifdef RT + subq %rax, C +#endif + movq C, CO +#ifndef RT + addq %rax, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $1, I + je .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + fldz + fldz + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) + prefetchw 2 * SIZE(CO, LDC, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) + prefetchnta 2 * SIZE(CO, LDC, 1) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -6 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + + FLD -5 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) + + FLD -4 * SIZE(AO) + + FLD -4 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -3 * SIZE(BO) + fmul %st, %st(2) + + FLD -3 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + FLD -2 * SIZE(AO) + + FLD -2 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -1 * SIZE(BO) + fmul %st, %st(2) + + FLD -1 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + addq $8 * SIZE,AO + addq $8 * SIZE,BO + + decq %rax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + and $3, %rax + je .L18 + ALIGN_4 + +.L16: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + addq $2 * SIZE,AO + addq $2 * SIZE,BO + + decq %rax + jne .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + salq $BASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) + FLD -6 * SIZE(BO) + fsubp %st, %st(3) + FLD -5 * SIZE(BO) + fsubp %st, %st(4) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(3) + FLD -6 * SIZE(AO) + fsubp %st, %st(2) + FLD -5 * SIZE(AO) + fsubp %st, %st(4) +#endif + +#ifdef LN + FLD -5 * SIZE(AO) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD -6 * SIZE(AO) + fmul %st(3), %st + FLD -6 * SIZE(AO) + fmul %st(5), %st + + fsubrp %st, %st(3) + fsubrp %st, %st(1) + + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st(1), %st + FLD -7 * SIZE(AO) + fmul %st(3), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(3) + + FLD -5 * SIZE(AO) + fmul %st, %st(3) + fmulp %st, %st(4) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st(1), %st + FLD -7 * SIZE(BO) + fmul %st(4), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(2) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + fmulp %st, %st(4) +#endif + +#ifdef RT + FLD -5 * SIZE(BO) + fmul %st, %st(2) + fmulp %st, %st(4) + + FLD -6 * SIZE(BO) + fmul %st(2), %st + FLD -6 * SIZE(BO) + fmul %st(5), %st + + fsubrp %st, %st(4) + fsubrp %st, %st(1) + + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(3) +#endif + +#ifdef LN + subq $2 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) + fxch %st(2) + fld %st + FST -6 * SIZE(BO) + fxch %st(3) + fld %st + FST -5 * SIZE(BO) + + FST 1 * SIZE(CO, LDC) + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) + FST 1 * SIZE(CO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(2) + fld %st + FST -7 * SIZE(AO) + fxch %st(1) + fld %st + FST -6 * SIZE(AO) + fxch %st(3) + fld %st + FST -5 * SIZE(AO) + + FST 1 * SIZE(CO, LDC) + FST 1 * SIZE(CO) + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) +#endif + +#ifndef LN + addq $2 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + jne .L11 + ALIGN_4 + +.L20: + movq M, %rax + andq $1, %rax + je .L29 + ALIGN_4 + +.L21: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + fldz + fldz + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + + FLD -6 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -4 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + + FLD -2 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $4 * SIZE,AO + addq $8 * SIZE,BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + and $3, %rax + je .L28 + ALIGN_4 + +.L26: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $1 * SIZE,AO + addq $2 * SIZE,BO + + decq %rax + jne .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + salq $BASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(2) +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + + FLD -7 * SIZE(BO) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD -5 * SIZE(BO) + fmulp %st, %st(2) + + FLD -6 * SIZE(BO) + fmul %st(2), %st + + fsubrp %st, %st(1) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef LN + subq $1 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(1) + fld %st + FST -7 * SIZE(AO) +#endif + + FST 0 * SIZE(CO, LDC) + FST 0 * SIZE(CO) + +#ifndef LN + addq $1 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J + jne .L01 + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/rot.S b/kernel/x86_64/rot.S new file mode 100644 index 0000000..05e5aeb --- /dev/null +++ b/kernel/x86_64/rot.S @@ -0,0 +1,348 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 48(%rsp), INCY + FLD 72(%rsp) + FLD 56(%rsp) +#else + FLD 24(%rsp) + FLD 8(%rsp) +#endif + + salq $BASE_SHIFT, INCX + salq $BASE_SHIFT, INCY + + testq N, N + jle .L999 + + cmpq $SIZE, INCX + jne .L50 + cmpq $SIZE, INCY + jne .L50 + + movq N, I + sarq $2, I + jle .L15 + ALIGN_4 + +.L10: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 1 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 1 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + FLD 2 * SIZE(X) + FLD 2 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 2 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 2 * SIZE(Y) + + FLD 3 * SIZE(X) + FLD 3 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 3 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 3 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + + decq I + jg .L10 + ALIGN_4 + +.L15: + movq N, I + andq $3, I + jle .L999 + ALIGN_4 + +.L16: + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + addq $SIZE, X + addq $SIZE, Y + + decq I + jg .L16 + jmp .L999 + ALIGN_4 + +.L50: + movq N, I + sarq $2, I + jle .L55 + ALIGN_4 + +.L51: + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + decq I + jg .L51 + ALIGN_4 + +.L55: + movq N, I + andq $3, I + jle .L999 + ALIGN_4 + +.L56: + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + decq I + jg .L56 + ALIGN_4 + + +.L999: + ffreep %st + ffreep %st + ret + + EPILOGUE diff --git a/kernel/x86_64/rot_sse.S b/kernel/x86_64/rot_sse.S new file mode 100644 index 0000000..cb7e1b3 --- /dev/null +++ b/kernel/x86_64/rot_sse.S @@ -0,0 +1,1090 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#define C %xmm14 +#define S %xmm15 + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY + movss 48(%rsp), %xmm0 + movss 56(%rsp), %xmm1 +#endif + + SAVEREGISTERS + + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY + + pshufd $0x0, %xmm0, C + pshufd $0x0, %xmm1, S + + cmpq $0, N + jle .L999 + + cmpq $SIZE, INCX + jne .L50 + cmpq $SIZE, INCY + jne .L50 + + testq $SIZE, X + je .L05 + + movss 0 * SIZE(Y), %xmm1 + movss 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, 0 * SIZE(X) + movss %xmm2, 0 * SIZE(Y) + + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq N + jle .L999 + +.L05: + testq $2 * SIZE, X + je .L10 + + cmpq $1, N + je .L17 + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + subq $2, N + jle .L999 + ALIGN_2 + +.L10: + testq $3 * SIZE, Y + jne .L20 + + movq N, %rax + sarq $5, %rax + jle .L14 + + movaps 0 * SIZE(Y), %xmm1 + movaps 4 * SIZE(Y), %xmm3 + movaps 8 * SIZE(Y), %xmm9 + movaps 12 * SIZE(Y), %xmm11 + + movaps 0 * SIZE(X), %xmm0 + movaps 4 * SIZE(X), %xmm2 + movaps 8 * SIZE(X), %xmm8 + movaps 12 * SIZE(X), %xmm10 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + movaps 16 * SIZE(Y), %xmm1 + addps %xmm3, %xmm2 + movaps 20 * SIZE(Y), %xmm3 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps %xmm0, 0 * SIZE(X) + movaps 16 * SIZE(X), %xmm0 + movaps %xmm2, 4 * SIZE(X) + movaps 20 * SIZE(X), %xmm2 + movaps %xmm4, 0 * SIZE(Y) + movaps %xmm6, 4 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + movaps 24 * SIZE(Y), %xmm9 + addps %xmm11, %xmm10 + movaps 28 * SIZE(Y), %xmm11 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm8, 8 * SIZE(X) + movaps 24 * SIZE(X), %xmm8 + movaps %xmm10,12 * SIZE(X) + movaps 28 * SIZE(X), %xmm10 + movaps %xmm4, 8 * SIZE(Y) + movaps %xmm6, 12 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + movaps 32 * SIZE(Y), %xmm1 + addps %xmm3, %xmm2 + movaps 36 * SIZE(Y), %xmm3 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 16 * SIZE(X) + movaps 32 * SIZE(X), %xmm0 + movaps %xmm2, 20 * SIZE(X) + movaps 36 * SIZE(X), %xmm2 + movaps %xmm4, 16 * SIZE(Y) + movaps %xmm6, 20 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + movaps 40 * SIZE(Y), %xmm9 + addps %xmm11, %xmm10 + movaps 44 * SIZE(Y), %xmm11 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm8, 24 * SIZE(X) + movaps 40 * SIZE(X), %xmm8 + movaps %xmm10, 28 * SIZE(X) + movaps 44 * SIZE(X), %xmm10 + movaps %xmm4, 24 * SIZE(Y) + movaps %xmm6, 28 * SIZE(Y) + + addq $32 * SIZE, X + addq $32 * SIZE, Y + + decq %rax + jg .L11 + ALIGN_3 + +.L12: + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + movaps 16 * SIZE(Y), %xmm1 + addps %xmm3, %xmm2 + movaps 20 * SIZE(Y), %xmm3 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps 16 * SIZE(X), %xmm0 + movaps %xmm2, 4 * SIZE(X) + movaps 20 * SIZE(X), %xmm2 + + movaps %xmm4, 0 * SIZE(Y) + movaps %xmm6, 4 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + movaps 24 * SIZE(Y), %xmm9 + addps %xmm11, %xmm10 + movaps 28 * SIZE(Y), %xmm11 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm8, 8 * SIZE(X) + movaps 24 * SIZE(X), %xmm8 + movaps %xmm10,12 * SIZE(X) + movaps 28 * SIZE(X), %xmm10 + movaps %xmm4, 8 * SIZE(Y) + movaps %xmm6, 12 * SIZE(Y) + + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 16 * SIZE(X) + movaps %xmm2, 20 * SIZE(X) + movaps %xmm4, 16 * SIZE(Y) + movaps %xmm6, 20 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm8, 24 * SIZE(X) + movaps %xmm10, 28 * SIZE(X) + movaps %xmm4, 24 * SIZE(Y) + movaps %xmm6, 28 * SIZE(Y) + + addq $32 * SIZE, X + addq $32 * SIZE, Y + ALIGN_3 + +.L14: + testq $31, N + jle .L999 + + testq $16, N + jle .L15 + + movaps 0 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + movaps 4 * SIZE(Y), %xmm3 + movaps 4 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 4 * SIZE(X) + movaps %xmm4, 0 * SIZE(Y) + movaps %xmm6, 4 * SIZE(Y) + + movaps 8 * SIZE(Y), %xmm1 + movaps 8 * SIZE(X), %xmm0 + movaps 12 * SIZE(Y), %xmm3 + movaps 12 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 8 * SIZE(X) + movaps %xmm2, 12 * SIZE(X) + movaps %xmm4, 8 * SIZE(Y) + movaps %xmm6, 12 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L15: + testq $8, N + jle .L16 + + movaps 0 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + movaps 4 * SIZE(Y), %xmm3 + movaps 4 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 4 * SIZE(X) + movaps %xmm4, 0 * SIZE(Y) + movaps %xmm6, 4 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L16: + testq $4, N + jle .L17 + + movaps 0 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 0 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L17: + testq $2, N + jle .L18 + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L18: + testq $1, N + jle .L999 + + movss 0 * SIZE(Y), %xmm1 + movss 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, 0 * SIZE(X) + movss %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L20: + movq N, %rax + sarq $5, %rax + jle .L24 + ALIGN_3 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movsd 4 * SIZE(Y), %xmm3 + movhps 6 * SIZE(Y), %xmm3 + movaps 0 * SIZE(X), %xmm0 + movaps 4 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 4 * SIZE(X) + movlps %xmm4, 0 * SIZE(Y) + movhps %xmm4, 2 * SIZE(Y) + movlps %xmm6, 4 * SIZE(Y) + movhps %xmm6, 6 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movsd 8 * SIZE(Y), %xmm1 + movhps 10 * SIZE(Y), %xmm1 + movsd 12 * SIZE(Y), %xmm3 + movhps 14 * SIZE(Y), %xmm3 + movaps 8 * SIZE(X), %xmm0 + movaps 12 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 8 * SIZE(X) + movaps %xmm2, 12 * SIZE(X) + movlps %xmm4, 8 * SIZE(Y) + movhps %xmm4, 10 * SIZE(Y) + movlps %xmm6, 12 * SIZE(Y) + movhps %xmm6, 14 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd 16 * SIZE(Y), %xmm1 + movhps 18 * SIZE(Y), %xmm1 + movsd 20 * SIZE(Y), %xmm3 + movhps 22 * SIZE(Y), %xmm3 + movaps 16 * SIZE(X), %xmm0 + movaps 20 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 16 * SIZE(X) + movaps %xmm2, 20 * SIZE(X) + movlps %xmm4, 16 * SIZE(Y) + movhps %xmm4, 18 * SIZE(Y) + movlps %xmm6, 20 * SIZE(Y) + movhps %xmm6, 22 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movsd 24 * SIZE(Y), %xmm1 + movhps 26 * SIZE(Y), %xmm1 + movsd 28 * SIZE(Y), %xmm3 + movhps 30 * SIZE(Y), %xmm3 + movaps 24 * SIZE(X), %xmm0 + movaps 28 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 24 * SIZE(X) + movaps %xmm2, 28 * SIZE(X) + movlps %xmm4, 24 * SIZE(Y) + movhps %xmm4, 26 * SIZE(Y) + movlps %xmm6, 28 * SIZE(Y) + movhps %xmm6, 30 * SIZE(Y) + + addq $32 * SIZE, X + addq $32 * SIZE, Y + decq %rax + jg .L21 + ALIGN_3 + +.L24: + testq $31, N + jle .L999 + + testq $16, N + jle .L25 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movsd 4 * SIZE(Y), %xmm3 + movhps 6 * SIZE(Y), %xmm3 + movaps 0 * SIZE(X), %xmm0 + movaps 4 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 4 * SIZE(X) + movlps %xmm4, 0 * SIZE(Y) + movhps %xmm4, 2 * SIZE(Y) + movlps %xmm6, 4 * SIZE(Y) + movhps %xmm6, 6 * SIZE(Y) + + movsd 8 * SIZE(Y), %xmm1 + movhps 10 * SIZE(Y), %xmm1 + movsd 12 * SIZE(Y), %xmm3 + movhps 14 * SIZE(Y), %xmm3 + movaps 8 * SIZE(X), %xmm0 + movaps 12 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 8 * SIZE(X) + movaps %xmm2, 12 * SIZE(X) + movlps %xmm4, 8 * SIZE(Y) + movhps %xmm4, 10 * SIZE(Y) + movlps %xmm6, 12 * SIZE(Y) + movhps %xmm6, 14 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L25: + testq $8, N + jle .L26 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movsd 4 * SIZE(Y), %xmm3 + movhps 6 * SIZE(Y), %xmm3 + movaps 0 * SIZE(X), %xmm0 + movaps 4 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 4 * SIZE(X) + movlps %xmm4, 0 * SIZE(Y) + movhps %xmm4, 2 * SIZE(Y) + movlps %xmm6, 4 * SIZE(Y) + movhps %xmm6, 6 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + + +.L26: + testq $4, N + jle .L27 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L27: + testq $2, N + jle .L28 + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L28: + testq $1, N + jle .L999 + + movss 0 * SIZE(Y), %xmm1 + movss 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, 0 * SIZE(X) + movss %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + + +.L50: + movq N, %rax + sarq $2, %rax + jle .L55 + ALIGN_3 + +.L53: + movss (Y), %xmm1 + movss (X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, (X) + movss %xmm2, (Y) + + addq INCX, X + addq INCY, Y + + movss (Y), %xmm1 + movss (X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, (X) + movss %xmm2, (Y) + + addq INCX, X + addq INCY, Y + + movss (Y), %xmm1 + movss (X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, (X) + movss %xmm2, (Y) + + addq INCX, X + addq INCY, Y + + movss (Y), %xmm1 + movss (X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, (X) + movss %xmm2, (Y) + + addq INCX, X + addq INCY, Y + + decq %rax + jg .L53 + ALIGN_3 + +.L55: + movq N, %rax + andq $3, %rax + jle .L999 + ALIGN_3 + +.L56: + movss (Y), %xmm1 + movss (X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, (X) + movss %xmm2, (Y) + + addq INCX, X + addq INCY, Y + + decq %rax + jg .L56 + ALIGN_3 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/rot_sse2.S b/kernel/x86_64/rot_sse2.S new file mode 100644 index 0000000..5055547 --- /dev/null +++ b/kernel/x86_64/rot_sse2.S @@ -0,0 +1,986 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#define C %xmm14 +#define S %xmm15 + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY + movsd 48(%rsp), %xmm0 + movsd 56(%rsp), %xmm1 +#endif + + SAVEREGISTERS + + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY + + pshufd $0x44, %xmm0, C + pshufd $0x44, %xmm1, S + + cmpq $0, N + jle .L999 + + cmpq $SIZE, INCX + jne .L50 + cmpq $SIZE, INCY + jne .L50 + + testq $SIZE, X + je .L10 + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulsd C, %xmm0 + mulsd S, %xmm1 + + mulsd C, %xmm2 + mulsd S, %xmm3 + + addsd %xmm1, %xmm0 + subsd %xmm3, %xmm2 + + movsd %xmm0, 0 * SIZE(X) + movsd %xmm2, 0 * SIZE(Y) + + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq N + jle .L999 + ALIGN_2 + +.L10: + testq $SIZE, Y + jne .L20 + + movq N, %rax + sarq $4, %rax + jle .L14 + + movaps 0 * SIZE(Y), %xmm1 + movaps 2 * SIZE(Y), %xmm3 + movaps 4 * SIZE(Y), %xmm9 + movaps 6 * SIZE(Y), %xmm11 + + movaps 0 * SIZE(X), %xmm0 + movaps 2 * SIZE(X), %xmm2 + movaps 4 * SIZE(X), %xmm8 + movaps 6 * SIZE(X), %xmm10 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, %xmm4 + mulpd S, %xmm1 + movaps %xmm3, %xmm6 + mulpd S, %xmm3 + movaps %xmm0, %xmm5 + mulpd C, %xmm0 + movaps %xmm2, %xmm7 + mulpd C, %xmm2 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + movaps 8 * SIZE(Y), %xmm1 + addpd %xmm3, %xmm2 + movaps 10 * SIZE(Y), %xmm3 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps %xmm0, 0 * SIZE(X) + movaps 8 * SIZE(X), %xmm0 + movaps %xmm2, 2 * SIZE(X) + movaps 10 * SIZE(X), %xmm2 + movaps %xmm4, 0 * SIZE(Y) + movaps %xmm6, 2 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulpd S, %xmm9 + movaps %xmm8, %xmm5 + mulpd C, %xmm8 + movaps %xmm11, %xmm6 + mulpd S, %xmm11 + movaps %xmm10, %xmm7 + mulpd C, %xmm10 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm9, %xmm8 + movaps 12 * SIZE(Y), %xmm9 + addpd %xmm11, %xmm10 + movaps 14 * SIZE(Y), %xmm11 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm8, 4 * SIZE(X) + movaps 12 * SIZE(X), %xmm8 + movaps %xmm10,6 * SIZE(X) + movaps 14 * SIZE(X), %xmm10 + movaps %xmm4, 4 * SIZE(Y) + movaps %xmm6, 6 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm1, %xmm4 + mulpd S, %xmm1 + movaps %xmm3, %xmm6 + mulpd S, %xmm3 + movaps %xmm0, %xmm5 + mulpd C, %xmm0 + movaps %xmm2, %xmm7 + mulpd C, %xmm2 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + movaps 16 * SIZE(Y), %xmm1 + addpd %xmm3, %xmm2 + movaps 18 * SIZE(Y), %xmm3 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm0, 8 * SIZE(X) + movaps 16 * SIZE(X), %xmm0 + movaps %xmm2, 10 * SIZE(X) + movaps 18 * SIZE(X), %xmm2 + movaps %xmm4, 8 * SIZE(Y) + movaps %xmm6, 10 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps %xmm9, %xmm4 + mulpd S, %xmm9 + movaps %xmm8, %xmm5 + mulpd C, %xmm8 + movaps %xmm11, %xmm6 + mulpd S, %xmm11 + movaps %xmm10, %xmm7 + mulpd C, %xmm10 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm9, %xmm8 + movaps 20 * SIZE(Y), %xmm9 + addpd %xmm11, %xmm10 + movaps 22 * SIZE(Y), %xmm11 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm8, 12 * SIZE(X) + movaps 20 * SIZE(X), %xmm8 + movaps %xmm10, 14 * SIZE(X) + movaps 22 * SIZE(X), %xmm10 + movaps %xmm4, 12 * SIZE(Y) + movaps %xmm6, 14 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + + decq %rax + jg .L11 + ALIGN_3 + +.L12: + movaps %xmm1, %xmm4 + mulpd S, %xmm1 + movaps %xmm3, %xmm6 + mulpd S, %xmm3 + movaps %xmm0, %xmm5 + mulpd C, %xmm0 + movaps %xmm2, %xmm7 + mulpd C, %xmm2 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + movaps 8 * SIZE(Y), %xmm1 + addpd %xmm3, %xmm2 + movaps 10 * SIZE(Y), %xmm3 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps 8 * SIZE(X), %xmm0 + movaps %xmm2, 2 * SIZE(X) + movaps 10 * SIZE(X), %xmm2 + + movaps %xmm4, 0 * SIZE(Y) + movaps %xmm6, 2 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulpd S, %xmm9 + movaps %xmm8, %xmm5 + mulpd C, %xmm8 + movaps %xmm11, %xmm6 + mulpd S, %xmm11 + movaps %xmm10, %xmm7 + mulpd C, %xmm10 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm9, %xmm8 + movaps 12 * SIZE(Y), %xmm9 + addpd %xmm11, %xmm10 + movaps 14 * SIZE(Y), %xmm11 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm8, 4 * SIZE(X) + movaps 12 * SIZE(X), %xmm8 + movaps %xmm10,6 * SIZE(X) + movaps 14 * SIZE(X), %xmm10 + movaps %xmm4, 4 * SIZE(Y) + movaps %xmm6, 6 * SIZE(Y) + + movaps %xmm1, %xmm4 + mulpd S, %xmm1 + movaps %xmm3, %xmm6 + mulpd S, %xmm3 + movaps %xmm0, %xmm5 + mulpd C, %xmm0 + movaps %xmm2, %xmm7 + mulpd C, %xmm2 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm0, 8 * SIZE(X) + movaps %xmm2, 10 * SIZE(X) + movaps %xmm4, 8 * SIZE(Y) + movaps %xmm6, 10 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulpd S, %xmm9 + movaps %xmm8, %xmm5 + mulpd C, %xmm8 + movaps %xmm11, %xmm6 + mulpd S, %xmm11 + movaps %xmm10, %xmm7 + mulpd C, %xmm10 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm8, 12 * SIZE(X) + movaps %xmm10, 14 * SIZE(X) + movaps %xmm4, 12 * SIZE(Y) + movaps %xmm6, 14 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + + +.L14: + testq $15, N + jle .L999 + + testq $8, N + jle .L15 + + movaps 0 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + movaps 2 * SIZE(Y), %xmm3 + movaps 2 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 2 * SIZE(X) + movaps %xmm4, 0 * SIZE(Y) + movaps %xmm6, 2 * SIZE(Y) + + movaps 4 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + movaps 6 * SIZE(Y), %xmm3 + movaps 6 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm0, 4 * SIZE(X) + movaps %xmm2, 6 * SIZE(X) + movaps %xmm4, 4 * SIZE(Y) + movaps %xmm6, 6 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L15: + testq $4, N + jle .L16 + + movaps 0 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + movaps 2 * SIZE(Y), %xmm3 + movaps 2 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 2 * SIZE(X) + movaps %xmm4, 0 * SIZE(Y) + movaps %xmm6, 2 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L16: + testq $2, N + jle .L17 + + movaps 0 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 0 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L17: + testq $1, N + jle .L999 + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulsd C, %xmm0 + mulsd S, %xmm1 + + mulsd C, %xmm2 + mulsd S, %xmm3 + + addsd %xmm1, %xmm0 + subsd %xmm3, %xmm2 + + movsd %xmm0, 0 * SIZE(X) + movsd %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L20: + movaps -1 * SIZE(Y), %xmm1 + + movq N, %rax + sarq $4, %rax + jle .L24 + ALIGN_3 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps 1 * SIZE(Y), %xmm3 + movaps 3 * SIZE(Y), %xmm8 + movaps 0 * SIZE(X), %xmm0 + movaps 2 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm3, %xmm1 + SHUFPD_1 %xmm8, %xmm3 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 2 * SIZE(X) + + movlpd %xmm4, 0 * SIZE(Y) + movhps %xmm4, 1 * SIZE(Y) + movlpd %xmm6, 2 * SIZE(Y) + movhps %xmm6, 3 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps 5 * SIZE(Y), %xmm9 + movaps 7 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + movaps 6 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm9, %xmm8 + SHUFPD_1 %xmm1, %xmm9 + + movaps %xmm8, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm9, %xmm6 + movaps %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm8 + mulpd C, %xmm2 + mulpd S, %xmm9 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm0, 4 * SIZE(X) + movaps %xmm2, 6 * SIZE(X) + movlpd %xmm4, 4 * SIZE(Y) + movhps %xmm4, 5 * SIZE(Y) + movlpd %xmm6, 6 * SIZE(Y) + movhps %xmm6, 7 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps 9 * SIZE(Y), %xmm3 + movaps 11 * SIZE(Y), %xmm8 + movaps 8 * SIZE(X), %xmm0 + movaps 10 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm3, %xmm1 + SHUFPD_1 %xmm8, %xmm3 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm0, 8 * SIZE(X) + movaps %xmm2, 10 * SIZE(X) + movlpd %xmm4, 8 * SIZE(Y) + movhps %xmm4, 9 * SIZE(Y) + movlpd %xmm6, 10 * SIZE(Y) + movhps %xmm6, 11 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps 13 * SIZE(Y), %xmm9 + movaps 15 * SIZE(Y), %xmm1 + movaps 12 * SIZE(X), %xmm0 + movaps 14 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm9, %xmm8 + SHUFPD_1 %xmm1, %xmm9 + + movaps %xmm8, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm9, %xmm6 + movaps %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm8 + mulpd C, %xmm2 + mulpd S, %xmm9 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm0, 12 * SIZE(X) + movaps %xmm2, 14 * SIZE(X) + movlpd %xmm4, 12 * SIZE(Y) + movhps %xmm4, 13 * SIZE(Y) + movlpd %xmm6, 14 * SIZE(Y) + movhps %xmm6, 15 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + decq %rax + jg .L21 + ALIGN_3 + +.L24: + testq $15, N + jle .L999 + + testq $8, N + jle .L25 + + movaps 1 * SIZE(Y), %xmm3 + movaps 3 * SIZE(Y), %xmm8 + movaps 0 * SIZE(X), %xmm0 + movaps 2 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm3, %xmm1 + SHUFPD_1 %xmm8, %xmm3 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 2 * SIZE(X) + movlpd %xmm4, 0 * SIZE(Y) + movhps %xmm4, 1 * SIZE(Y) + movlpd %xmm6, 2 * SIZE(Y) + movhps %xmm6, 3 * SIZE(Y) + + movaps 5 * SIZE(Y), %xmm9 + movaps 7 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + movaps 6 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm9, %xmm8 + SHUFPD_1 %xmm1, %xmm9 + + movaps %xmm8, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm9, %xmm6 + movaps %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm8 + mulpd C, %xmm2 + mulpd S, %xmm9 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm0, 4 * SIZE(X) + movaps %xmm2, 6 * SIZE(X) + movlpd %xmm4, 4 * SIZE(Y) + movhps %xmm4, 5 * SIZE(Y) + movlpd %xmm6, 6 * SIZE(Y) + movhps %xmm6, 7 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L25: + testq $4, N + jle .L26 + + movaps 1 * SIZE(Y), %xmm3 + movaps 3 * SIZE(Y), %xmm8 + movaps 0 * SIZE(X), %xmm0 + movaps 2 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm3, %xmm1 + SHUFPD_1 %xmm8, %xmm3 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 2 * SIZE(X) + movlpd %xmm4, 0 * SIZE(Y) + movhps %xmm4, 1 * SIZE(Y) + movlpd %xmm6, 2 * SIZE(Y) + movhps %xmm6, 3 * SIZE(Y) + movaps %xmm8, %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L26: + testq $2, N + jle .L27 + + movaps 1 * SIZE(Y), %xmm4 + movaps 0 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlpd %xmm2, 0 * SIZE(Y) + movhps %xmm2, 1 * SIZE(Y) + movaps %xmm4, %xmm1 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L27: + testq $1, N + jle .L999 + + unpckhpd %xmm1, %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulsd C, %xmm0 + mulsd S, %xmm1 + + mulsd C, %xmm2 + mulsd S, %xmm3 + + addsd %xmm1, %xmm0 + subsd %xmm3, %xmm2 + + movsd %xmm0, 0 * SIZE(X) + movsd %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + + +.L50: + movq N, %rax + sarq $2, %rax + jle .L55 + ALIGN_3 + +.L53: + movsd (Y), %xmm1 + movhps (Y, INCY), %xmm1 + movsd (X), %xmm0 + movhps (X, INCX), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, (X) + movhps %xmm0, (X, INCX) + movlpd %xmm2, (Y) + movhps %xmm2, (Y, INCY) + + leaq (X, INCX, 2), X + leaq (Y, INCY, 2), Y + + movsd (Y), %xmm1 + movhps (Y, INCY), %xmm1 + movsd (X), %xmm0 + movhps (X, INCX), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, (X) + movhps %xmm0, (X, INCX) + movlpd %xmm2, (Y) + movhps %xmm2, (Y, INCY) + + leaq (X, INCX, 2), X + leaq (Y, INCY, 2), Y + + decq %rax + jg .L53 + ALIGN_3 + +.L55: + movq N, %rax + andq $3, %rax + jle .L999 + ALIGN_3 + +.L56: + movsd (Y), %xmm1 + movsd (X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulsd C, %xmm0 + mulsd S, %xmm1 + + mulsd C, %xmm2 + mulsd S, %xmm3 + + addsd %xmm1, %xmm0 + subsd %xmm3, %xmm2 + + movsd %xmm0, (X) + movsd %xmm2, (Y) + + addq INCX, X + addq INCY, Y + + decq %rax + jg .L56 + ALIGN_3 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/scal.S b/kernel/x86_64/scal.S new file mode 100644 index 0000000..1f8e4d4 --- /dev/null +++ b/kernel/x86_64/scal.S @@ -0,0 +1,302 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define X ARG4 +#define INCX ARG5 + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + FLD 8(%rsp) + + ftst + fnstsw %ax + andb $68, %ah + je .L300 + +/* Alpha == ZERO */ + cmpq $1, INCX + jne .L104 + + movq M, I + sarq $3, I + jle .L102 + ALIGN_4 + +.L101: + fld %st + FST 0 * SIZE(X) + fld %st + FST 1 * SIZE(X) + fld %st + FST 2 * SIZE(X) + fld %st + FST 3 * SIZE(X) + fld %st + FST 4 * SIZE(X) + fld %st + FST 5 * SIZE(X) + fld %st + FST 6 * SIZE(X) + fld %st + FST 7 * SIZE(X) + + addq $8 * SIZE, X + decq I + jg .L101 + ALIGN_4 + +.L102: + movq M, I + andq $7, I + jle .L999 + ALIGN_4 + +.L103: + fld %st + FST 0 * SIZE(X) + + addq $SIZE, X + decq I + jg .L103 + jmp .L999 + ALIGN_4 + +.L104: + salq $BASE_SHIFT, INCX + + movq M, I + sarq $3, I + jle .L106 + ALIGN_4 + +.L105: + fld %st + FST 0 * SIZE(X) + addq INCX, X + fld %st + FST 0 * SIZE(X) + addq INCX, X + fld %st + FST 0 * SIZE(X) + addq INCX, X + fld %st + FST 0 * SIZE(X) + addq INCX, X + fld %st + FST 0 * SIZE(X) + addq INCX, X + fld %st + FST 0 * SIZE(X) + addq INCX, X + fld %st + FST 0 * SIZE(X) + addq INCX, X + fld %st + FST 0 * SIZE(X) + addq INCX, X + + decq I + jg .L105 + ALIGN_4 + +.L106: + movq M, I + andq $7, I + jle .L999 + ALIGN_4 + +.L107: + fld %st + FST 0 * SIZE(X) + addq INCX, X + decq I + jg .L107 + jmp .L999 + ALIGN_4 + +/* Alpha != ZERO */ + +.L300: + cmpq $1,INCX + jne .L304 + + movq M, I + sarq $3, I + jle .L302 + ALIGN_4 + +.L301: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fmul %st(1), %st + FST 0 * SIZE(X) + + FLD 1 * SIZE(X) + fmul %st(1), %st + FST 1 * SIZE(X) + + FLD 2 * SIZE(X) + fmul %st(1), %st + FST 2 * SIZE(X) + + FLD 3 * SIZE(X) + fmul %st(1), %st + FST 3 * SIZE(X) + + FLD 4 * SIZE(X) + fmul %st(1), %st + FST 4 * SIZE(X) + + FLD 5 * SIZE(X) + fmul %st(1), %st + FST 5 * SIZE(X) + + FLD 6 * SIZE(X) + fmul %st(1), %st + FST 6 * SIZE(X) + + FLD 7 * SIZE(X) + fmul %st(1), %st + FST 7 * SIZE(X) + + addq $8 * SIZE, X + decq I + jg .L301 + ALIGN_4 + +.L302: + movq M, I + andq $7, I + jle .L999 + ALIGN_4 + +.L303: + FLD 0 * SIZE(X) + fmul %st(1), %st + FST 0 * SIZE(X) + addq $SIZE, X + decq I + jg .L303 + jmp .L999 + ALIGN_4 + +.L304: + salq $BASE_SHIFT, INCX + + movq M, I + sarq $3, I + jle .L306 + ALIGN_4 + +.L305: + FLD 0 * SIZE(X) + fmul %st(1), %st + FST 0 * SIZE(X) + addq INCX, X + + FLD 0 * SIZE(X) + fmul %st(1), %st + FST 0 * SIZE(X) + addq INCX, X + + FLD 0 * SIZE(X) + fmul %st(1), %st + FST 0 * SIZE(X) + addq INCX, X + + FLD 0 * SIZE(X) + fmul %st(1), %st + FST 0 * SIZE(X) + addq INCX, X + + FLD 0 * SIZE(X) + fmul %st(1), %st + FST 0 * SIZE(X) + addq INCX, X + + FLD 0 * SIZE(X) + fmul %st(1), %st + FST 0 * SIZE(X) + addq INCX, X + + FLD 0 * SIZE(X) + fmul %st(1), %st + FST 0 * SIZE(X) + addq INCX, X + + FLD 0 * SIZE(X) + fmul %st(1), %st + FST 0 * SIZE(X) + addq INCX, X + + decq I + jg .L305 + ALIGN_4 + +.L306: + movq M, I + andq $7, I + jle .L999 + ALIGN_4 + +.L307: + FLD 0 * SIZE(X) + fmul %st(1), %st + FST 0 * SIZE(X) + addq INCX, X + decq I + jg .L307 + ALIGN_4 + +.L999: + ffreep %st(0) + ret + + EPILOGUE diff --git a/kernel/x86_64/scal_atom.S b/kernel/x86_64/scal_atom.S new file mode 100644 index 0000000..ecc687c --- /dev/null +++ b/kernel/x86_64/scal_atom.S @@ -0,0 +1,446 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#endif + +#define XX %r10 +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), X + movq 48(%rsp), INCX + + movaps %xmm3, %xmm0 +#endif + + SAVEREGISTERS + + testq M, M + jle .L999 + + pxor %xmm1, %xmm1 + lea (, INCX, SIZE), INCX + comisd %xmm0, %xmm1 + jne .L100 + +/* Alpha == ZERO */ + cmpq $SIZE, INCX + jne .L50 + + movq M, I + sarq $3, I + jle .L12 + ALIGN_4 + +.L11: + movsd %xmm1, 0 * SIZE(X) + movsd %xmm1, 1 * SIZE(X) + movsd %xmm1, 2 * SIZE(X) + movsd %xmm1, 3 * SIZE(X) + + movsd %xmm1, 4 * SIZE(X) + movsd %xmm1, 5 * SIZE(X) + movsd %xmm1, 6 * SIZE(X) + movsd %xmm1, 7 * SIZE(X) + + addq $8 * SIZE, X + decq I + jg .L11 + ALIGN_4 + +.L12: + testq $4, M + je .L14 + + movsd %xmm1, 0 * SIZE(X) + movsd %xmm1, 1 * SIZE(X) + movsd %xmm1, 2 * SIZE(X) + movsd %xmm1, 3 * SIZE(X) + + addq $4 * SIZE, X + ALIGN_3 + +.L14: + testq $2, M + je .L15 + + movsd %xmm1, 0 * SIZE(X) + movsd %xmm1, 1 * SIZE(X) + + addq $2 * SIZE, X + ALIGN_3 + +.L15: + testq $1, M + je .L999 + + movsd %xmm1, 0 * SIZE(X) + jmp .L999 + ALIGN_4 + +.L50: + movq M, I + sarq $3, I + jle .L52 + ALIGN_4 + +.L51: + movsd %xmm1, 0 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + addq INCX, X + + decq I + jg .L51 + ALIGN_4 + +.L52: + testq $7, M + je .L999 + + testq $4, M + je .L53 + + movsd %xmm1, 0 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + addq INCX, X + ALIGN_3 + +.L53: + testq $2, M + je .L54 + + movsd %xmm1, 0 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + addq INCX, X + ALIGN_3 + +.L54: + testq $1, M + je .L999 + + movsd %xmm1, 0 * SIZE(X) + jmp .L999 + ALIGN_4 + +/* Alpha != ZERO */ + +.L100: + cmpq $SIZE, INCX + jne .L150 + + unpcklpd %xmm0, %xmm0 + + movq M, I + sarq $3, I + jle .L113 + + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 + movsd 2 * SIZE(X), %xmm3 + movsd 3 * SIZE(X), %xmm4 + movsd 4 * SIZE(X), %xmm5 + movsd 5 * SIZE(X), %xmm6 + movsd 6 * SIZE(X), %xmm7 + movsd 7 * SIZE(X), %xmm8 + + mulsd %xmm0, %xmm1 + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + + decq I + jle .L112 + ALIGN_4 + +.L111: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd %xmm1, 0 * SIZE(X) + movsd %xmm2, 1 * SIZE(X) + movsd %xmm3, 2 * SIZE(X) + movsd %xmm4, 3 * SIZE(X) + + movsd 8 * SIZE(X), %xmm1 + mulsd %xmm0, %xmm5 + movsd 9 * SIZE(X), %xmm2 + mulsd %xmm0, %xmm6 + movsd 10 * SIZE(X), %xmm3 + mulsd %xmm0, %xmm7 + movsd 11 * SIZE(X), %xmm4 + mulsd %xmm0, %xmm8 + + movsd %xmm5, 4 * SIZE(X) + movsd %xmm6, 5 * SIZE(X) + movsd %xmm7, 6 * SIZE(X) + movsd %xmm8, 7 * SIZE(X) + + movsd 12 * SIZE(X), %xmm5 + mulsd %xmm0, %xmm1 + movsd 13 * SIZE(X), %xmm6 + mulsd %xmm0, %xmm2 + movsd 14 * SIZE(X), %xmm7 + mulsd %xmm0, %xmm3 + movsd 15 * SIZE(X), %xmm8 + mulsd %xmm0, %xmm4 + + addq $8 * SIZE, X + decq I + jg .L111 + ALIGN_4 + +.L112: + movsd %xmm1, 0 * SIZE(X) + mulsd %xmm0, %xmm5 + movsd %xmm2, 1 * SIZE(X) + mulsd %xmm0, %xmm6 + movsd %xmm3, 2 * SIZE(X) + mulsd %xmm0, %xmm7 + movsd %xmm4, 3 * SIZE(X) + mulsd %xmm0, %xmm8 + + movsd %xmm5, 4 * SIZE(X) + movsd %xmm6, 5 * SIZE(X) + movsd %xmm7, 6 * SIZE(X) + movsd %xmm8, 7 * SIZE(X) + addq $8 * SIZE, X + ALIGN_3 + +.L113: + testq $4, M + je .L115 + + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 + movsd 2 * SIZE(X), %xmm3 + movsd 3 * SIZE(X), %xmm4 + + mulsd %xmm0, %xmm1 + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + + movsd %xmm1, 0 * SIZE(X) + movsd %xmm2, 1 * SIZE(X) + movsd %xmm3, 2 * SIZE(X) + movsd %xmm4, 3 * SIZE(X) + + addq $4 * SIZE, X + ALIGN_3 + +.L115: + testq $2, M + je .L116 + + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 + + mulsd %xmm0, %xmm1 + mulsd %xmm0, %xmm2 + + movsd %xmm1, 0 * SIZE(X) + movsd %xmm2, 1 * SIZE(X) + + addq $2 * SIZE, X + ALIGN_3 + +.L116: + testq $1, M + je .L999 + + movsd 0 * SIZE(X), %xmm1 + mulsd %xmm0, %xmm1 + movsd %xmm1, 0 * SIZE(X) + jmp .L999 + ALIGN_3 + +/* incx != 1 */ + +.L150: + movq X, XX + movq M, I # rcx = n + sarq $3, I # (n >> 3) + jle .L152 + ALIGN_4 + +.L151: + movsd 0 * SIZE(X), %xmm1 + addq INCX, X + movsd 0 * SIZE(X), %xmm2 + addq INCX, X + mulsd %xmm0, %xmm1 + movsd 0 * SIZE(X), %xmm3 + addq INCX, X + mulsd %xmm0, %xmm2 + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + mulsd %xmm0, %xmm3 + movsd 0 * SIZE(X), %xmm5 + addq INCX, X + mulsd %xmm0, %xmm4 + movsd 0 * SIZE(X), %xmm6 + addq INCX, X + mulsd %xmm0, %xmm5 + movsd 0 * SIZE(X), %xmm7 + addq INCX, X + mulsd %xmm0, %xmm6 + movsd 0 * SIZE(X), %xmm8 + addq INCX, X + mulsd %xmm0, %xmm7 + + movsd %xmm1, 0 * SIZE(XX) + addq INCX, XX + mulsd %xmm0, %xmm8 + movsd %xmm2, 0 * SIZE(XX) + addq INCX, XX + movsd %xmm3, 0 * SIZE(XX) + addq INCX, XX + movsd %xmm4, 0 * SIZE(XX) + addq INCX, XX + movsd %xmm5, 0 * SIZE(XX) + addq INCX, XX + movsd %xmm6, 0 * SIZE(XX) + addq INCX, XX + movsd %xmm7, 0 * SIZE(XX) + addq INCX, XX + movsd %xmm8, 0 * SIZE(XX) + addq INCX, XX + decq I + jg .L151 + ALIGN_4 + +.L152: + testq $7, M + je .L999 + + testq $4, M + je .L153 + + movsd 0 * SIZE(X), %xmm1 + addq INCX, X + movsd 0 * SIZE(X), %xmm2 + addq INCX, X + mulsd %xmm0, %xmm1 + movsd 0 * SIZE(X), %xmm3 + addq INCX, X + mulsd %xmm0, %xmm2 + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + mulsd %xmm0, %xmm3 + + movsd %xmm1, 0 * SIZE(XX) + addq INCX, XX + mulsd %xmm0, %xmm4 + movsd %xmm2, 0 * SIZE(XX) + addq INCX, XX + movsd %xmm3, 0 * SIZE(XX) + addq INCX, XX + movsd %xmm4, 0 * SIZE(XX) + addq INCX, XX + ALIGN_3 + +.L153: + testq $2, M + je .L154 + + movsd 0 * SIZE(X), %xmm1 + addq INCX, X + mulsd %xmm0, %xmm1 + movsd 0 * SIZE(X), %xmm2 + addq INCX, X + mulsd %xmm0, %xmm2 + + movsd %xmm1, 0 * SIZE(XX) + addq INCX, XX + movsd %xmm2, 0 * SIZE(XX) + addq INCX, XX + ALIGN_3 + +.L154: + testq $1, M + je .L999 + + movsd 0 * SIZE(X), %xmm1 + mulsd %xmm0, %xmm1 + movsd %xmm1, 0 * SIZE(X) + ALIGN_4 + +.L999: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/scal_sse.S b/kernel/x86_64/scal_sse.S new file mode 100644 index 0000000..323e8b9 --- /dev/null +++ b/kernel/x86_64/scal_sse.S @@ -0,0 +1,612 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#endif + +#define XX %r10 +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), X + movq 48(%rsp), INCX + + movaps %xmm3, %xmm0 +#endif + + SAVEREGISTERS + + testq M, M + jle .L999 + + lea (, INCX, SIZE), INCX + + pxor %xmm1, %xmm1 + comiss %xmm0, %xmm1 + shufps $0, %xmm0, %xmm0 + + jne .L100 # Alpha != ZERO + +/* Alpha == ZERO */ + cmpq $SIZE, INCX + jne .L50 + +/* INCX == 1 */ + cmpq $3, M + jle .L14 + + testq $4, X # aligned for double word? + je .L05 + + movss %xmm1, 0 * SIZE(X) + addq $SIZE, X + decq M + jle .L999 + ALIGN_3 + +.L05: + testq $8, X # aligned for quad word? + je .L06 + + movsd %xmm1, 0 * SIZE(X) + addq $2 * SIZE, X + subq $2, M + jle .L999 + ALIGN_3 + +.L06: + movq M, I + sarq $4, I + jle .L12 + ALIGN_4 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, 0 * SIZE(X) + movaps %xmm1, 4 * SIZE(X) + movaps %xmm1, 8 * SIZE(X) + movaps %xmm1, 12 * SIZE(X) + addq $16 * SIZE, X + decq I + jg .L11 + ALIGN_4 + +.L12: + testq $15, M + je .L999 + testq $8, M + je .L13 + + movaps %xmm1, 0 * SIZE(X) + movaps %xmm1, 4 * SIZE(X) + addq $8 * SIZE, X + ALIGN_3 + +.L13: + testq $4, M + je .L14 + + movaps %xmm1, 0 * SIZE(X) + addq $4 * SIZE, X + ALIGN_3 + +.L14: + testq $2, M + je .L15 + + movsd %xmm1, 0 * SIZE(X) + addq $2 * SIZE, X + ALIGN_3 + +.L15: + testq $1, M + je .L999 + + movss %xmm1, 0 * SIZE(X) + jmp .L999 + ALIGN_4 + +/* incx != 1 */ +.L50: + movq M, I # rcx = n + sarq $3, I # (n >> 3) + jle .L52 + ALIGN_4 + +.L51: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm1, (X) + addq INCX, X + movss %xmm1, (X) + addq INCX, X + movss %xmm1, (X) + addq INCX, X + movss %xmm1, (X) + addq INCX, X + movss %xmm1, (X) + addq INCX, X + movss %xmm1, (X) + addq INCX, X + movss %xmm1, (X) + addq INCX, X + movss %xmm1, (X) + addq INCX, X + + decq I + jg .L51 + ALIGN_4 + +.L52: + testq $7, M + je .L999 + + testq $4, M + je .L53 + + movss %xmm1, (X) + addq INCX, X + movss %xmm1, (X) + addq INCX, X + movss %xmm1, (X) + addq INCX, X + movss %xmm1, (X) + addq INCX, X + ALIGN_3 + +.L53: + testq $2, M + je .L54 + + movss %xmm1, (X) + addq INCX, X + movss %xmm1, (X) + addq INCX, X + ALIGN_3 + +.L54: + testq $1, M + je .L999 + + movss %xmm1, (X) + jmp .L999 + ALIGN_4 + +/* Alpha != ZERO */ + +.L100: + cmpq $SIZE, INCX + jne .L150 + + subq $-32 * SIZE, X + + cmpq $3, M + jle .L116 + + testq $SIZE, X + je .L105 + + movss -32 * SIZE(X), %xmm1 + mulss %xmm0, %xmm1 + movss %xmm1, -32 * SIZE(X) + addq $SIZE, X + decq M + jle .L999 + ALIGN_3 + +.L105: + testq $2 * SIZE, X + je .L110 + + movsd -32 * SIZE(X), %xmm1 + mulps %xmm0, %xmm1 + movsd %xmm1, -32 * SIZE(X) + addq $2 * SIZE, X + subq $2, M + jle .L999 + ALIGN_3 + +.L110: + movq M, I + sarq $5, I + jle .L113 + +#if defined(BARCELONA) || defined(SHANGHAI) + + movaps %xmm0, %xmm1 + mulps -32 * SIZE(X), %xmm1 + movaps %xmm0, %xmm2 + mulps -28 * SIZE(X), %xmm2 + movaps %xmm0, %xmm3 + mulps -24 * SIZE(X), %xmm3 + movaps %xmm0, %xmm4 + mulps -20 * SIZE(X), %xmm4 + movaps %xmm0, %xmm5 + mulps -16 * SIZE(X), %xmm5 + movaps %xmm0, %xmm6 + mulps -12 * SIZE(X), %xmm6 + movaps %xmm0, %xmm7 + mulps -8 * SIZE(X), %xmm7 + movaps %xmm0, %xmm8 + mulps -4 * SIZE(X), %xmm8 + + decq I + jle .L112 + ALIGN_4 + +.L111: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, -32 * SIZE(X) + movaps %xmm2, -28 * SIZE(X) + movaps %xmm3, -24 * SIZE(X) + movaps %xmm4, -20 * SIZE(X) + + movaps %xmm0, %xmm1 + mulps 0 * SIZE(X), %xmm1 + movaps %xmm0, %xmm2 + mulps 4 * SIZE(X), %xmm2 + movaps %xmm0, %xmm3 + mulps 8 * SIZE(X), %xmm3 + movaps %xmm0, %xmm4 + mulps 12 * SIZE(X), %xmm4 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm5, -16 * SIZE(X) + movaps %xmm6, -12 * SIZE(X) + movaps %xmm7, -8 * SIZE(X) + movaps %xmm8, -4 * SIZE(X) + + movaps %xmm0, %xmm5 + mulps 16 * SIZE(X), %xmm5 + movaps %xmm0, %xmm6 + mulps 20 * SIZE(X), %xmm6 + movaps %xmm0, %xmm7 + mulps 24 * SIZE(X), %xmm7 + movaps %xmm0, %xmm8 + mulps 28 * SIZE(X), %xmm8 + + subq $-32 * SIZE, X + decq I + jg .L111 + ALIGN_4 + +.L112: + movaps %xmm1, -32 * SIZE(X) + movaps %xmm2, -28 * SIZE(X) + movaps %xmm3, -24 * SIZE(X) + movaps %xmm4, -20 * SIZE(X) + + movaps %xmm5, -16 * SIZE(X) + movaps %xmm6, -12 * SIZE(X) + movaps %xmm7, -8 * SIZE(X) + movaps %xmm8, -4 * SIZE(X) + +#else + + movaps -32 * SIZE(X), %xmm1 + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + movaps -20 * SIZE(X), %xmm4 + movaps -16 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -8 * SIZE(X), %xmm7 + movaps -4 * SIZE(X), %xmm8 + decq I + jle .L112 + ALIGN_4 + +.L111: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulps %xmm0, %xmm1 + movaps %xmm1, -32 * SIZE(X) + movaps 0 * SIZE(X), %xmm1 + mulps %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(X) + movaps 4 * SIZE(X), %xmm2 + mulps %xmm0, %xmm3 + movaps %xmm3, -24 * SIZE(X) + movaps 8 * SIZE(X), %xmm3 + mulps %xmm0, %xmm4 + movaps %xmm4, -20 * SIZE(X) + movaps 12 * SIZE(X), %xmm4 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulps %xmm0, %xmm5 + movaps %xmm5, -16 * SIZE(X) + movaps 16 * SIZE(X), %xmm5 + mulps %xmm0, %xmm6 + movaps %xmm6, -12 * SIZE(X) + movaps 20 * SIZE(X), %xmm6 + mulps %xmm0, %xmm7 + movaps %xmm7, -8 * SIZE(X) + movaps 24 * SIZE(X), %xmm7 + mulps %xmm0, %xmm8 + movaps %xmm8, -4 * SIZE(X) + movaps 28 * SIZE(X), %xmm8 + + subq $-32 * SIZE, X + decq I + jg .L111 + ALIGN_4 + +.L112: + mulps %xmm0, %xmm1 + movaps %xmm1, -32 * SIZE(X) + mulps %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(X) + mulps %xmm0, %xmm3 + movaps %xmm3, -24 * SIZE(X) + mulps %xmm0, %xmm4 + movaps %xmm4, -20 * SIZE(X) + + mulps %xmm0, %xmm5 + movaps %xmm5, -16 * SIZE(X) + mulps %xmm0, %xmm6 + movaps %xmm6, -12 * SIZE(X) + mulps %xmm0, %xmm7 + movaps %xmm7, -8 * SIZE(X) + mulps %xmm0, %xmm8 + movaps %xmm8, -4 * SIZE(X) + +#endif + + subq $-32 * SIZE, X + ALIGN_3 + +.L113: + testq $31, M + je .L999 + + testq $16, M + je .L114 + + movaps -32 * SIZE(X), %xmm1 + movaps -28 * SIZE(X), %xmm3 + movaps -24 * SIZE(X), %xmm5 + movaps -20 * SIZE(X), %xmm7 + + mulps %xmm0, %xmm1 + movaps %xmm1, -32 * SIZE(X) + mulps %xmm0, %xmm3 + movaps %xmm3, -28 * SIZE(X) + mulps %xmm0, %xmm5 + movaps %xmm5, -24 * SIZE(X) + mulps %xmm0, %xmm7 + movaps %xmm7, -20 * SIZE(X) + + addq $16 * SIZE, X + ALIGN_3 + +.L114: + testq $8, M + je .L115 + + movaps -32 * SIZE(X), %xmm1 + movaps -28 * SIZE(X), %xmm3 + + mulps %xmm0, %xmm1 + movaps %xmm1, -32 * SIZE(X) + mulps %xmm0, %xmm3 + movaps %xmm3, -28 * SIZE(X) + addq $8 * SIZE, X + ALIGN_3 + +.L115: + testq $4, M + je .L116 + + movaps -32 * SIZE(X), %xmm1 + mulps %xmm0, %xmm1 + movaps %xmm1, -32 * SIZE(X) + addq $4 * SIZE, X + ALIGN_3 + +.L116: + testq $2, M + je .L117 + + movsd -32 * SIZE(X), %xmm1 + mulps %xmm0, %xmm1 + movsd %xmm1, -32 * SIZE(X) + addq $2 * SIZE, X + ALIGN_3 + +.L117: + testq $1, M + je .L999 + + movss -32 * SIZE(X), %xmm1 + mulss %xmm0, %xmm1 + movss %xmm1, -32 * SIZE(X) + jmp .L999 + ALIGN_3 + +/* incx != 1 */ + +.L150: + movq X, XX + movq M, I # rcx = n + sarq $3, I # (n >> 3) + jle .L152 + ALIGN_4 + +.L151: + movss (X), %xmm1 + addq INCX, X + movss (X), %xmm2 + addq INCX, X + movss (X), %xmm3 + addq INCX, X + movss (X), %xmm4 + addq INCX, X + movss (X), %xmm5 + addq INCX, X + movss (X), %xmm6 + addq INCX, X + movss (X), %xmm7 + addq INCX, X + movss (X), %xmm8 + addq INCX, X + + mulss %xmm0, %xmm1 + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + mulss %xmm0, %xmm5 + mulss %xmm0, %xmm6 + mulss %xmm0, %xmm7 + mulss %xmm0, %xmm8 + + movss %xmm1, (XX) + addq INCX, XX + movss %xmm2, (XX) + addq INCX, XX + movss %xmm3, (XX) + addq INCX, XX + movss %xmm4, (XX) + addq INCX, XX + movss %xmm5, (XX) + addq INCX, XX + movss %xmm6, (XX) + addq INCX, XX + movss %xmm7, (XX) + addq INCX, XX + movss %xmm8, (XX) + addq INCX, XX + decq I + jg .L151 + ALIGN_4 + +.L152: + testq $7, M + je .L999 + + testq $4, M + je .L153 + + movss (X), %xmm1 + addq INCX, X + movss (X), %xmm2 + addq INCX, X + movss (X), %xmm3 + addq INCX, X + movss (X), %xmm4 + addq INCX, X + + mulss %xmm0, %xmm1 + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + + movss %xmm1, (XX) + addq INCX, XX + movss %xmm2, (XX) + addq INCX, XX + movss %xmm3, (XX) + addq INCX, XX + movss %xmm4, (XX) + addq INCX, XX + ALIGN_3 + +.L153: + testq $2, M + je .L154 + + movss (X), %xmm1 + addq INCX, X + movss (X), %xmm2 + addq INCX, X + + mulss %xmm0, %xmm1 + mulss %xmm0, %xmm2 + + movss %xmm1, (XX) + addq INCX, XX + movss %xmm2, (XX) + addq INCX, XX + ALIGN_3 + +.L154: + testq $1, M + je .L999 + + movss (X), %xmm1 + mulss %xmm0, %xmm1 + movss %xmm1, (X) + ALIGN_4 + +.L999: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/scal_sse2.S b/kernel/x86_64/scal_sse2.S new file mode 100644 index 0000000..b0abb45 --- /dev/null +++ b/kernel/x86_64/scal_sse2.S @@ -0,0 +1,588 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#endif + +#define XX %r10 +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), X + movq 48(%rsp), INCX + + movaps %xmm3, %xmm0 +#endif + + SAVEREGISTERS + + testq M, M + jle .L999 + + leaq (, INCX, SIZE), INCX + + xorps %xmm1, %xmm1 + comisd %xmm0, %xmm1 + jne .L100 # Alpha != ZERO + +/* Alpha == ZERO */ + cmpq $SIZE, INCX + jne .L50 + +/* INCX == 1 */ + testq $15, X # aligned for quad word? + je .L05 + + movsd %xmm1, 0 * SIZE(X) + addq $SIZE, X + decq M + jle .L999 + ALIGN_3 +.L05: + +/* Aligned Mode */ + movq M, I # rcx = n + sarq $4, I + jle .L12 + ALIGN_4 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, 0 * SIZE(X) + movaps %xmm1, 2 * SIZE(X) + movaps %xmm1, 4 * SIZE(X) + movaps %xmm1, 6 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm1, 8 * SIZE(X) + movaps %xmm1, 10 * SIZE(X) + movaps %xmm1, 12 * SIZE(X) + movaps %xmm1, 14 * SIZE(X) + + addq $16 * SIZE, X + decq I + jg .L11 + ALIGN_4 + +.L12: + testq $15, M + je .L999 + testq $8, M + je .L13 + + movaps %xmm1, 0 * SIZE(X) + movaps %xmm1, 2 * SIZE(X) + movaps %xmm1, 4 * SIZE(X) + movaps %xmm1, 6 * SIZE(X) + addq $8 * SIZE, X + ALIGN_3 + +.L13: + testq $4, M + je .L14 + + movaps %xmm1, 0 * SIZE(X) + movaps %xmm1, 2 * SIZE(X) + addq $4 * SIZE, X + ALIGN_3 + +.L14: + testq $2, M + je .L15 + + movaps %xmm1, 0 * SIZE(X) + addq $2 * SIZE, X + ALIGN_3 + +.L15: + testq $1, M + je .L999 + + movsd %xmm1, 0 * SIZE(X) + jmp .L999 + ALIGN_4 + +.L50: + movq M, I + sarq $3, I + jle .L52 + ALIGN_4 + +.L51: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd %xmm1, (X) + addq INCX, X + movsd %xmm1, (X) + addq INCX, X + movsd %xmm1, (X) + addq INCX, X + movsd %xmm1, (X) + addq INCX, X + movsd %xmm1, (X) + addq INCX, X + movsd %xmm1, (X) + addq INCX, X + movsd %xmm1, (X) + addq INCX, X + movsd %xmm1, (X) + addq INCX, X + + decq I + jg .L51 + ALIGN_4 + +.L52: + testq $7, M + je .L999 + + testq $4, M + je .L53 + + movsd %xmm1, (X) + addq INCX, X + movsd %xmm1, (X) + addq INCX, X + movsd %xmm1, (X) + addq INCX, X + movsd %xmm1, (X) + addq INCX, X + ALIGN_3 + +.L53: + testq $2, M + je .L54 + + movsd %xmm1, (X) + addq INCX, X + movsd %xmm1, (X) + addq INCX, X + ALIGN_3 + +.L54: + testq $1, M + je .L999 + + movsd %xmm1, (X) + jmp .L999 + ALIGN_4 + +/* Alpha != ZERO */ + +.L100: + unpcklpd %xmm0, %xmm0 + + cmpq $SIZE, INCX + jne .L150 + + testq $SIZE, X + je .L105 + + movsd 0 * SIZE(X), %xmm1 + mulsd %xmm0, %xmm1 + movsd %xmm1, 0 * SIZE(X) + addq $SIZE, X + decq M + jle .L999 + ALIGN_3 +.L105: + subq $-16 * SIZE, X + + movq M, I # rcx = n + sarq $4, I + jle .L113 + +#if defined(BARCELONA) || defined(SHANGHAI) + + movaps %xmm0, %xmm1 + mulpd -16 * SIZE(X), %xmm1 + movaps %xmm0, %xmm2 + mulpd -14 * SIZE(X), %xmm2 + movaps %xmm0, %xmm3 + mulpd -12 * SIZE(X), %xmm3 + movaps %xmm0, %xmm4 + mulpd -10 * SIZE(X), %xmm4 + movaps %xmm0, %xmm5 + mulpd -8 * SIZE(X), %xmm5 + movaps %xmm0, %xmm6 + mulpd -6 * SIZE(X), %xmm6 + movaps %xmm0, %xmm7 + mulpd -4 * SIZE(X), %xmm7 + movaps %xmm0, %xmm8 + mulpd -2 * SIZE(X), %xmm8 + + decq I + jle .L112 + ALIGN_4 + +.L111: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, -16 * SIZE(X) + movaps %xmm2, -14 * SIZE(X) + movaps %xmm3, -12 * SIZE(X) + movaps %xmm4, -10 * SIZE(X) + + movaps %xmm0, %xmm1 + mulpd 0 * SIZE(X), %xmm1 + movaps %xmm0, %xmm2 + mulpd 2 * SIZE(X), %xmm2 + movaps %xmm0, %xmm3 + mulpd 4 * SIZE(X), %xmm3 + movaps %xmm0, %xmm4 + mulpd 6 * SIZE(X), %xmm4 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm5, -8 * SIZE(X) + movaps %xmm6, -6 * SIZE(X) + movaps %xmm7, -4 * SIZE(X) + movaps %xmm8, -2 * SIZE(X) + + movaps %xmm0, %xmm5 + mulpd 8 * SIZE(X), %xmm5 + movaps %xmm0, %xmm6 + mulpd 10 * SIZE(X), %xmm6 + movaps %xmm0, %xmm7 + mulpd 12 * SIZE(X), %xmm7 + movaps %xmm0, %xmm8 + mulpd 14 * SIZE(X), %xmm8 + + subq $-16 * SIZE, X + decq I + jg .L111 + ALIGN_4 + +.L112: + movaps %xmm1, -16 * SIZE(X) + movaps %xmm2, -14 * SIZE(X) + movaps %xmm3, -12 * SIZE(X) + movaps %xmm4, -10 * SIZE(X) + movaps %xmm5, -8 * SIZE(X) + movaps %xmm6, -6 * SIZE(X) + movaps %xmm7, -4 * SIZE(X) + movaps %xmm8, -2 * SIZE(X) + +#else + movaps -16 * SIZE(X), %xmm1 + movaps -14 * SIZE(X), %xmm2 + movaps -12 * SIZE(X), %xmm3 + movaps -10 * SIZE(X), %xmm4 + movaps -8 * SIZE(X), %xmm5 + movaps -6 * SIZE(X), %xmm6 + movaps -4 * SIZE(X), %xmm7 + movaps -2 * SIZE(X), %xmm8 + + decq I + jle .L112 + ALIGN_4 + +.L111: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulpd %xmm0, %xmm1 + movaps %xmm1, -16 * SIZE(X) + movaps 0 * SIZE(X), %xmm1 + mulpd %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(X) + movaps 2 * SIZE(X), %xmm2 + + mulpd %xmm0, %xmm3 + movaps %xmm3, -12 * SIZE(X) + movaps 4 * SIZE(X), %xmm3 + mulpd %xmm0, %xmm4 + movaps %xmm4, -10 * SIZE(X) + movaps 6 * SIZE(X), %xmm4 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulpd %xmm0, %xmm5 + movaps %xmm5, -8 * SIZE(X) + movaps 8 * SIZE(X), %xmm5 + mulpd %xmm0, %xmm6 + movaps %xmm6, -6 * SIZE(X) + movaps 10 * SIZE(X), %xmm6 + + mulpd %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(X) + movaps 12 * SIZE(X), %xmm7 + mulpd %xmm0, %xmm8 + movaps %xmm8, -2 * SIZE(X) + movaps 14 * SIZE(X), %xmm8 + + subq $-16 * SIZE, X + decq I + jg .L111 + ALIGN_4 + +.L112: + mulpd %xmm0, %xmm1 + movaps %xmm1, -16 * SIZE(X) + mulpd %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(X) + mulpd %xmm0, %xmm3 + movaps %xmm3, -12 * SIZE(X) + mulpd %xmm0, %xmm4 + movaps %xmm4, -10 * SIZE(X) + + mulpd %xmm0, %xmm5 + movaps %xmm5, -8 * SIZE(X) + mulpd %xmm0, %xmm6 + movaps %xmm6, -6 * SIZE(X) + mulpd %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(X) + mulpd %xmm0, %xmm8 + movaps %xmm8, -2 * SIZE(X) +#endif + + subq $-16 * SIZE, X + ALIGN_3 + +.L113: + testq $15, M + je .L999 + + testq $8, M + je .L114 + + movaps -16 * SIZE(X), %xmm1 + movaps -14 * SIZE(X), %xmm2 + movaps -12 * SIZE(X), %xmm3 + movaps -10 * SIZE(X), %xmm4 + + mulpd %xmm0, %xmm1 + movaps %xmm1, -16 * SIZE(X) + mulpd %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(X) + mulpd %xmm0, %xmm3 + movaps %xmm3, -12 * SIZE(X) + mulpd %xmm0, %xmm4 + movaps %xmm4, -10 * SIZE(X) + addq $8 * SIZE, X + ALIGN_3 + +.L114: + testq $4, M + je .L115 + + movaps -16 * SIZE(X), %xmm1 + movaps -14 * SIZE(X), %xmm2 + + mulpd %xmm0, %xmm1 + movaps %xmm1, -16 * SIZE(X) + mulpd %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(X) + addq $4 * SIZE, X + ALIGN_3 + +.L115: + testq $2, M + je .L116 + + movaps -16 * SIZE(X), %xmm1 + mulpd %xmm0, %xmm1 + movaps %xmm1, -16 * SIZE(X) + addq $2 * SIZE, X + ALIGN_3 + +.L116: + testq $1, M + je .L999 + + movsd -16 * SIZE(X), %xmm1 + mulsd %xmm0, %xmm1 + movsd %xmm1, -16 * SIZE(X) + jmp .L999 + ALIGN_3 + +/* incx != 1 */ + +.L150: + movq X, XX + movq M, I # rcx = n + sarq $3, I # (n >> 3) + jle .L152 + ALIGN_4 + +.L151: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + movsd (X), %xmm1 + addq INCX, X + movsd (X), %xmm2 + addq INCX, X + movsd (X), %xmm3 + addq INCX, X + movsd (X), %xmm4 + addq INCX, X + movsd (X), %xmm5 + addq INCX, X + movsd (X), %xmm6 + addq INCX, X + movsd (X), %xmm7 + addq INCX, X + movsd (X), %xmm8 + addq INCX, X + + mulsd %xmm0, %xmm1 + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + mulsd %xmm0, %xmm6 + mulsd %xmm0, %xmm7 + mulsd %xmm0, %xmm8 + + movsd %xmm1, (XX) + addq INCX, XX + movsd %xmm2, (XX) + addq INCX, XX + movsd %xmm3, (XX) + addq INCX, XX + movsd %xmm4, (XX) + addq INCX, XX + movsd %xmm5, (XX) + addq INCX, XX + movsd %xmm6, (XX) + addq INCX, XX + movsd %xmm7, (XX) + addq INCX, XX + movsd %xmm8, (XX) + addq INCX, XX + decq I + jg .L151 + ALIGN_4 + +.L152: + testq $7, M + je .L999 + + testq $4, M + je .L153 + + movsd (X), %xmm1 + addq INCX, X + movsd (X), %xmm2 + addq INCX, X + movsd (X), %xmm3 + addq INCX, X + movsd (X), %xmm4 + addq INCX, X + + mulsd %xmm0, %xmm1 + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + + movsd %xmm1, (XX) + addq INCX, XX + movsd %xmm2, (XX) + addq INCX, XX + movsd %xmm3, (XX) + addq INCX, XX + movsd %xmm4, (XX) + addq INCX, XX + ALIGN_3 + +.L153: + testq $2, M + je .L154 + + movsd (X), %xmm1 + addq INCX, X + movsd (X), %xmm2 + addq INCX, X + + mulsd %xmm0, %xmm1 + mulsd %xmm0, %xmm2 + + movsd %xmm1, (XX) + addq INCX, XX + movsd %xmm2, (XX) + addq INCX, XX + ALIGN_3 + +.L154: + testq $1, M + je .L999 + + movsd (X), %xmm1 + mulsd %xmm0, %xmm1 + movsd %xmm1, (X) + ALIGN_4 + +.L999: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/sgemv_n.S b/kernel/x86_64/sgemv_n.S new file mode 100644 index 0000000..ead2420 --- /dev/null +++ b/kernel/x86_64/sgemv_n.S @@ -0,0 +1,6018 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#if GEMV_UNROLL < 4 +#undef GEMV_UNROLL +#define GEMV_UNROLL 4 +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_A %rcx +#define OLD_LDA %r8 +#define STACK_INCX 8 + STACKSIZE(%rsp) +#define STACK_Y 16 + STACKSIZE(%rsp) +#define STACK_INCY 24 + STACKSIZE(%rsp) +#define STACK_BUFFER 32 + STACKSIZE(%rsp) +#define ALPHA 48 (%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_M %rcx +#define OLD_N %rdx +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_LDA 48 + STACKSIZE(%rsp) +#define OLD_X 56 + STACKSIZE(%rsp) +#define STACK_INCX 64 + STACKSIZE(%rsp) +#define STACK_Y 72 + STACKSIZE(%rsp) +#define STACK_INCY 80 + STACKSIZE(%rsp) +#define STACK_BUFFER 88 + STACKSIZE(%rsp) +#define ALPHA 224 (%rsp) + +#endif + +#define LDA %r8 +#define X %r9 + +#define INCX %rsi +#define INCY %rdi + +#define M %r10 +#define N %r11 +#define A %r12 +#define Y %r14 +#define BUFFER %r13 + +#define I %rax +#define A1 %rbx +#define A2 %rcx +#define LDA3 %rdx +#define Y1 %rbp + +#ifdef ALIGNED_ACCESS +#define MM %r15 +#else +#define MM M +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_M, M + movq OLD_N, N + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X +#else + movq OLD_M, M + movq OLD_N, N + movq OLD_A, A + movq OLD_LDA, LDA +#endif + + movq STACK_INCX, INCX + movq STACK_Y, Y + movq STACK_INCY, INCY + movq STACK_BUFFER, BUFFER + +#ifndef WINDOWS_ABI + movss %xmm0, ALPHA +#else + movss %xmm3, ALPHA +#endif + + leaq (,INCX, SIZE), INCX + leaq (,INCY, SIZE), INCY + leaq (,LDA, SIZE), LDA + + leaq (LDA, LDA, 2), LDA3 + +#ifdef ALIGNED_ACCESS + movq M, MM + testq $4 * SIZE - 1, A + je .L0X + cmpq $3, M + jle .L0X + + movq A, MM + sarq $BASE_SHIFT, MM + andq $3, MM + subq $4, MM + addq M, MM + +.L0X: +#endif + + testq N, N # if n <= 0 goto END + jle .L999 + testq M, M # if n <= 0 goto END + jle .L999 + + subq $-32 * SIZE, A + + movq BUFFER, Y1 + + pxor %xmm0, %xmm0 + + movq M, %rax +#ifdef ALIGNED_ACCESS + addq $19, %rax +#else + addq $16, %rax +#endif + sarq $4, %rax + ALIGN_3 + +.L01: + movaps %xmm0, 0 * SIZE(Y1) + movaps %xmm0, 4 * SIZE(Y1) + movaps %xmm0, 8 * SIZE(Y1) + movaps %xmm0, 12 * SIZE(Y1) + addq $16 * SIZE, Y1 + decq %rax + jg .L01 + ALIGN_3 + +.L10: +#ifdef ALIGNED_ACCESS + movq A, %rax + andq $4 * SIZE - 1, %rax + addq %rax, BUFFER + + testq $4 * SIZE - 1, LDA + jne .L100 +#endif + +#if GEMV_UNROLL >= 8 + + cmpq $8, N + jl .L20 + ALIGN_3 + +.L11: + subq $8, N + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 4), A2 + leaq (A, LDA, 8), A + + movss (X), %xmm8 + addq INCX, X + movss (X), %xmm9 + addq INCX, X + movss (X), %xmm10 + addq INCX, X + movss (X), %xmm11 + addq INCX, X + movss (X), %xmm12 + addq INCX, X + movss (X), %xmm13 + addq INCX, X + movss (X), %xmm14 + addq INCX, X + movss (X), %xmm15 + addq INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm8 + shufps $0, %xmm8, %xmm8 + mulss %xmm0, %xmm9 + shufps $0, %xmm9, %xmm9 + mulss %xmm0, %xmm10 + shufps $0, %xmm10, %xmm10 + mulss %xmm0, %xmm11 + shufps $0, %xmm11, %xmm11 + + mulss %xmm0, %xmm12 + shufps $0, %xmm12, %xmm12 + mulss %xmm0, %xmm13 + shufps $0, %xmm13, %xmm13 + mulss %xmm0, %xmm14 + shufps $0, %xmm14, %xmm14 + mulss %xmm0, %xmm15 + shufps $0, %xmm15, %xmm15 + +#ifdef ALIGNED_ACCESS + cmpq $3, M + jle .L17 + + testq $SIZE, A1 + je .L1X + + movss -32 * SIZE(A1), %xmm4 + movss -32 * SIZE(A1, LDA, 1), %xmm5 + movss -32 * SIZE(A1, LDA, 2), %xmm6 + movss -32 * SIZE(A1, LDA3, 1), %xmm7 + + movss -32 * SIZE(Y1), %xmm0 + + mulss %xmm8, %xmm4 + addss %xmm4, %xmm0 + movss -32 * SIZE(A2), %xmm4 + mulss %xmm9, %xmm5 + addss %xmm5, %xmm0 + movss -32 * SIZE(A2, LDA, 1), %xmm5 + mulss %xmm10, %xmm6 + addss %xmm6, %xmm0 + movss -32 * SIZE(A2, LDA, 2), %xmm6 + mulss %xmm11, %xmm7 + addss %xmm7, %xmm0 + movss -32 * SIZE(A2, LDA3, 1), %xmm7 + + mulss %xmm12, %xmm4 + addss %xmm4, %xmm0 + mulss %xmm13, %xmm5 + addss %xmm5, %xmm0 + mulss %xmm14, %xmm6 + addss %xmm6, %xmm0 + mulss %xmm15, %xmm7 + addss %xmm7, %xmm0 + + movss %xmm0, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L1X: + testq $2 * SIZE, A1 + je .L1XX + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A1, LDA, 1), %xmm5 + movsd -32 * SIZE(A1, LDA, 2), %xmm6 + movsd -32 * SIZE(A1, LDA3, 1), %xmm7 + + movsd -32 * SIZE(Y1), %xmm0 + + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movsd -32 * SIZE(A2), %xmm4 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movsd -32 * SIZE(A2, LDA, 1), %xmm5 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movsd -32 * SIZE(A2, LDA, 2), %xmm6 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movsd -32 * SIZE(A2, LDA3, 1), %xmm7 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + addps %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L1XX: +#endif + movq MM, I + sarq $4, I + jle .L15 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm4) + MOVUPS_A1 (-28 * SIZE, A1, %xmm5) + MOVUPS_A1 (-24 * SIZE, A1, %xmm6) + MOVUPS_A1 (-20 * SIZE, A1, %xmm7) + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + decq I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) +#endif + + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm4) + mulps %xmm8, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm5) + mulps %xmm8, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm6) + mulps %xmm8, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) +#endif + + mulps %xmm9, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2 (-32 * SIZE, A1, LDA, 2, %xmm4) + mulps %xmm9, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2 (-28 * SIZE, A1, LDA, 2, %xmm5) + mulps %xmm9, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2 (-24 * SIZE, A1, LDA, 2, %xmm6) + mulps %xmm9, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2 (-20 * SIZE, A1, LDA, 2, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA, 2) +#endif + + mulps %xmm10, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2 (-32 * SIZE, A1, LDA3, 1, %xmm4) + mulps %xmm10, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2 (-28 * SIZE, A1, LDA3, 1, %xmm5) + mulps %xmm10, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2 (-24 * SIZE, A1, LDA3, 1, %xmm6) + mulps %xmm10, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2 (-20 * SIZE, A1, LDA3, 1, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA3) +#endif + + mulps %xmm11, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1 (-32 * SIZE, A2, %xmm4) + mulps %xmm11, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A1 (-28 * SIZE, A2, %xmm5) + mulps %xmm11, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A1 (-24 * SIZE, A2, %xmm6) + mulps %xmm11, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A1 (-20 * SIZE, A2, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) +#endif + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm4) + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm5) + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2 (-24 * SIZE, A2, LDA, 1, %xmm6) + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2 (-20 * SIZE, A2, LDA, 1, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) +#endif + + mulps %xmm13, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm4) + mulps %xmm13, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2 (-28 * SIZE, A2, LDA, 2, %xmm5) + mulps %xmm13, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2 (-24 * SIZE, A2, LDA, 2, %xmm6) + mulps %xmm13, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2 (-20 * SIZE, A2, LDA, 2, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA, 2) +#endif + + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm4) + mulps %xmm14, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2 (-28 * SIZE, A2, LDA3, 1, %xmm5) + mulps %xmm14, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2 (-24 * SIZE, A2, LDA3, 1, %xmm6) + mulps %xmm14, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2 (-20 * SIZE, A2, LDA3, 1, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA3) +#endif + + mulps %xmm15, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1 (-16 * SIZE, A1, %xmm4) + mulps %xmm15, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A1 (-12 * SIZE, A1, %xmm5) + mulps %xmm15, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A1 ( -8 * SIZE, A1, %xmm6) + mulps %xmm15, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A1 ( -4 * SIZE, A1, %xmm7) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm4) + mulps %xmm8, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm5) + mulps %xmm8, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm6) + mulps %xmm8, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm7) + + mulps %xmm9, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2 (-32 * SIZE, A1, LDA, 2, %xmm4) + mulps %xmm9, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2 (-28 * SIZE, A1, LDA, 2, %xmm5) + mulps %xmm9, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2 (-24 * SIZE, A1, LDA, 2, %xmm6) + mulps %xmm9, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2 (-20 * SIZE, A1, LDA, 2, %xmm7) + + mulps %xmm10, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2 (-32 * SIZE, A1, LDA3, 1, %xmm4) + mulps %xmm10, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2 (-28 * SIZE, A1, LDA3, 1, %xmm5) + mulps %xmm10, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2 (-24 * SIZE, A1, LDA3, 1, %xmm6) + mulps %xmm10, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2 (-20 * SIZE, A1, LDA3, 1, %xmm7) + + mulps %xmm11, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1 (-32 * SIZE, A2, %xmm4) + mulps %xmm11, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A1 (-28 * SIZE, A2, %xmm5) + mulps %xmm11, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A1 (-24 * SIZE, A2, %xmm6) + mulps %xmm11, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A1 (-20 * SIZE, A2, %xmm7) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm4) + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm5) + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2 (-24 * SIZE, A2, LDA, 1, %xmm6) + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2 (-20 * SIZE, A2, LDA, 1, %xmm7) + + mulps %xmm13, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm4) + mulps %xmm13, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2 (-28 * SIZE, A2, LDA, 2, %xmm5) + mulps %xmm13, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2 (-24 * SIZE, A2, LDA, 2, %xmm6) + mulps %xmm13, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2 (-20 * SIZE, A2, LDA, 2, %xmm7) + + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm4) + mulps %xmm14, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2 (-28 * SIZE, A2, LDA3, 1, %xmm5) + mulps %xmm14, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2 (-24 * SIZE, A2, LDA3, 1, %xmm6) + mulps %xmm14, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2 (-20 * SIZE, A2, LDA3, 1, %xmm7) + + mulps %xmm15, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm15, %xmm5 + addps %xmm5, %xmm1 + mulps %xmm15, %xmm6 + addps %xmm6, %xmm2 + mulps %xmm15, %xmm7 + addps %xmm7, %xmm3 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L15: + testq $8, MM + je .L16 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm4) + MOVUPS_A1 (-28 * SIZE, A1, %xmm5) + + MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm6) + MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm7) + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2 (-32 * SIZE, A1, LDA, 2, %xmm4) + mulps %xmm8, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2 (-28 * SIZE, A1, LDA, 2, %xmm5) + + mulps %xmm9, %xmm6 + addps %xmm6, %xmm0 + MOVUPS_A2 (-32 * SIZE, A1, LDA3, 1, %xmm6) + mulps %xmm9, %xmm7 + addps %xmm7, %xmm1 + MOVUPS_A2 (-28 * SIZE, A1, LDA3, 1, %xmm7) + + mulps %xmm10, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1 (-32 * SIZE, A2, %xmm4) + mulps %xmm10, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A1 (-28 * SIZE, A2, %xmm5) + + mulps %xmm11, %xmm6 + addps %xmm6, %xmm0 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm6) + mulps %xmm11, %xmm7 + addps %xmm7, %xmm1 + MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm7) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm4) + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2 (-28 * SIZE, A2, LDA, 2, %xmm5) + + mulps %xmm13, %xmm6 + addps %xmm6, %xmm0 + MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm6) + mulps %xmm13, %xmm7 + addps %xmm7, %xmm1 + MOVUPS_A2 (-28 * SIZE, A2, LDA3, 1, %xmm7) + + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm14, %xmm5 + addps %xmm5, %xmm1 + + mulps %xmm15, %xmm6 + addps %xmm6, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + mulps %xmm15, %xmm7 + addps %xmm7, %xmm1 + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L16: + testq $4, MM + je .L17 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm4) + MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm5) + MOVUPS_A2 (-32 * SIZE, A1, LDA, 2, %xmm6) + MOVUPS_A2 (-32 * SIZE, A1, LDA3, 1, %xmm7) + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1 (-32 * SIZE, A2, %xmm4) + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm5) + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm6) + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm7) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + addps %xmm7, %xmm0 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L17: + testq $2, MM + je .L18 + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A1, LDA, 1), %xmm5 + movsd -32 * SIZE(A1, LDA, 2), %xmm6 + movsd -32 * SIZE(A1, LDA3, 1), %xmm7 + + movsd -32 * SIZE(Y1), %xmm0 + + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movsd -32 * SIZE(A2), %xmm4 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movsd -32 * SIZE(A2, LDA, 1), %xmm5 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movsd -32 * SIZE(A2, LDA, 2), %xmm6 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movsd -32 * SIZE(A2, LDA3, 1), %xmm7 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + addps %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L18: + testq $1, MM + je .L19 + + movss -32 * SIZE(A1), %xmm4 + movss -32 * SIZE(A1, LDA, 1), %xmm5 + movss -32 * SIZE(A1, LDA, 2), %xmm6 + movss -32 * SIZE(A1, LDA3, 1), %xmm7 + + movss -32 * SIZE(Y1), %xmm0 + + mulss %xmm8, %xmm4 + addss %xmm4, %xmm0 + movss -32 * SIZE(A2), %xmm4 + mulss %xmm9, %xmm5 + addss %xmm5, %xmm0 + movss -32 * SIZE(A2, LDA, 1), %xmm5 + mulss %xmm10, %xmm6 + addss %xmm6, %xmm0 + movss -32 * SIZE(A2, LDA, 2), %xmm6 + mulss %xmm11, %xmm7 + addss %xmm7, %xmm0 + movss -32 * SIZE(A2, LDA3, 1), %xmm7 + + mulss %xmm12, %xmm4 + addss %xmm4, %xmm0 + mulss %xmm13, %xmm5 + addss %xmm5, %xmm0 + mulss %xmm14, %xmm6 + addss %xmm6, %xmm0 + mulss %xmm15, %xmm7 + addss %xmm7, %xmm0 + + movss %xmm0, -32 * SIZE(Y1) + ALIGN_3 + +.L19: + cmpq $8, N + jge .L11 + ALIGN_3 + +.L20: +#endif + + cmpq $4, N + jl .L30 + +#if GEMV_UNROLL == 4 + ALIGN_3 + +.L21: +#endif + subq $4, N + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + movss (X), %xmm12 + addq INCX, X + movss (X), %xmm13 + addq INCX, X + movss (X), %xmm14 + addq INCX, X + movss (X), %xmm15 + addq INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm12 + mulss %xmm0, %xmm13 + mulss %xmm0, %xmm14 + mulss %xmm0, %xmm15 + + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + shufps $0, %xmm14, %xmm14 + shufps $0, %xmm15, %xmm15 + +#ifdef ALIGNED_ACCESS + cmpq $3, M + jle .L27 + + testq $SIZE, A1 + je .L2X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A1, LDA), %xmm1 + movss -32 * SIZE(A2), %xmm2 + movss -32 * SIZE(A2, LDA), %xmm3 + + movss -32 * SIZE(Y1), %xmm8 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + mulss %xmm14, %xmm2 + addss %xmm2, %xmm8 + mulss %xmm15, %xmm3 + addss %xmm3, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L2X: + testq $2 * SIZE, A1 + je .L2XX + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A1, LDA), %xmm1 + movsd -32 * SIZE(A2), %xmm2 + movsd -32 * SIZE(A2, LDA), %xmm3 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + mulps %xmm14, %xmm2 + addps %xmm2, %xmm8 + mulps %xmm15, %xmm3 + addps %xmm3, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L2XX: +#endif + + movq MM, I + sarq $4, I + jle .L25 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm8) + MOVUPS_A1 (-28 * SIZE, A1, %xmm9) + MOVUPS_A1 (-24 * SIZE, A1, %xmm10) + MOVUPS_A1 (-20 * SIZE, A1, %xmm11) + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4) + MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm5) + MOVUPS_A2(-24 * SIZE, A1, LDA, 1, %xmm6) + MOVUPS_A2(-20 * SIZE, A1, LDA, 1, %xmm7) + + decq I + jle .L24 + ALIGN_3 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm8) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_A1(-28 * SIZE, A2, %xmm9) + mulps %xmm12, %xmm10 + addps %xmm10, %xmm2 + MOVUPS_A1(-24 * SIZE, A2, %xmm10) + mulps %xmm12, %xmm11 + addps %xmm11, %xmm3 + MOVUPS_A1(-20 * SIZE, A2, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) +#endif + + mulps %xmm13, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm4) + mulps %xmm13, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm5) + mulps %xmm13, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2(-24 * SIZE, A2, LDA, 1, %xmm6) + mulps %xmm13, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2(-20 * SIZE, A2, LDA, 1, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm14, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1 (-16 * SIZE, A1, %xmm8) + mulps %xmm14, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_A1 (-12 * SIZE, A1, %xmm9) + mulps %xmm14, %xmm10 + addps %xmm10, %xmm2 + MOVUPS_A1 ( -8 * SIZE, A1, %xmm10) + mulps %xmm14, %xmm11 + addps %xmm11, %xmm3 + MOVUPS_A1 ( -4 * SIZE, A1, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) +#endif + + mulps %xmm15, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) + mulps %xmm15, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm5) + mulps %xmm15, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2( -8 * SIZE, A1, LDA, 1, %xmm6) + mulps %xmm15, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2( -4 * SIZE, A1, LDA, 1, %xmm7) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L23 + ALIGN_3 + +.L24: + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm8) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_A1(-28 * SIZE, A2, %xmm9) + mulps %xmm12, %xmm10 + addps %xmm10, %xmm2 + MOVUPS_A1(-24 * SIZE, A2, %xmm10) + mulps %xmm12, %xmm11 + addps %xmm11, %xmm3 + MOVUPS_A1(-20 * SIZE, A2, %xmm11) + + mulps %xmm13, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm4) + mulps %xmm13, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm5) + mulps %xmm13, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2(-24 * SIZE, A2, LDA, 1, %xmm6) + mulps %xmm13, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2(-20 * SIZE, A2, LDA, 1, %xmm7) + + mulps %xmm14, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm1 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm2 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm3 + + mulps %xmm15, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + mulps %xmm15, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + mulps %xmm15, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + mulps %xmm15, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L25: + testq $8, MM + je .L26 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm8) + MOVUPS_A1 (-28 * SIZE, A1, %xmm9) + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4) + MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm5) + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm8) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_A1(-28 * SIZE, A2, %xmm9) + + mulps %xmm13, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm4) + mulps %xmm13, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm5) + mulps %xmm13, %xmm6 + addps %xmm6, %xmm2 + + mulps %xmm14, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm1 + + mulps %xmm15, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + mulps %xmm15, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L26: + testq $4, MM + je .L27 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm8) + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4) + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm8) + + mulps %xmm13, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm4) + + mulps %xmm14, %xmm8 + addps %xmm8, %xmm0 + + mulps %xmm15, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L27: + testq $2, MM + je .L28 + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A1, LDA), %xmm1 + movsd -32 * SIZE(A2), %xmm2 + movsd -32 * SIZE(A2, LDA), %xmm3 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + mulps %xmm14, %xmm2 + addps %xmm2, %xmm8 + mulps %xmm15, %xmm3 + addps %xmm3, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L28: + testq $1, MM +#if GEMV_UNROLL == 4 + je .L29 +#else + je .L30 +#endif + + movss -32 * SIZE(Y1), %xmm8 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A1, LDA), %xmm1 + movss -32 * SIZE(A2), %xmm2 + movss -32 * SIZE(A2, LDA), %xmm3 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + mulss %xmm14, %xmm2 + addss %xmm2, %xmm8 + mulss %xmm15, %xmm3 + addss %xmm3, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + ALIGN_3 + +#if GEMV_UNROLL == 4 +.L29: + cmpq $4, N + jge .L21 +#endif + ALIGN_3 + +.L30: + testq N, N + jle .L990 + + cmpq $3, N + jne .L40 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + + movss (X), %xmm12 + addq INCX, X + movss (X), %xmm13 + addq INCX, X + movss (X), %xmm14 + addq INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm12 + mulss %xmm0, %xmm13 + mulss %xmm0, %xmm14 + + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + shufps $0, %xmm14, %xmm14 + +#ifdef ALIGNED_ACCESS + cmpq $3, M + jle .L37 + + testq $SIZE, A1 + je .L3X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A1, LDA), %xmm1 + movss -32 * SIZE(A2), %xmm2 + + movss -32 * SIZE(Y1), %xmm8 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + mulss %xmm14, %xmm2 + addss %xmm2, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L3X: + testq $2 * SIZE, A1 + je .L3XX + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A1, LDA), %xmm1 + movsd -32 * SIZE(A2), %xmm2 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + mulps %xmm14, %xmm2 + addps %xmm2, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L3XX: +#endif + + movq MM, I + sarq $4, I + jle .L35 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm8) + MOVUPS_A1 (-28 * SIZE, A1, %xmm9) + MOVUPS_A1 (-24 * SIZE, A1, %xmm10) + MOVUPS_A1 (-20 * SIZE, A1, %xmm11) + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4) + MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm5) + MOVUPS_A2(-24 * SIZE, A1, LDA, 1, %xmm6) + MOVUPS_A2(-20 * SIZE, A1, LDA, 1, %xmm7) + + decq I + jle .L34 + ALIGN_3 + +.L33: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm8) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_A1(-28 * SIZE, A2, %xmm9) + mulps %xmm12, %xmm10 + addps %xmm10, %xmm2 + MOVUPS_A1(-24 * SIZE, A2, %xmm10) + mulps %xmm12, %xmm11 + addps %xmm11, %xmm3 + MOVUPS_A1(-20 * SIZE, A2, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA) +#endif + + mulps %xmm13, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) + mulps %xmm13, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm5) + mulps %xmm13, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2( -8 * SIZE, A1, LDA, 1, %xmm6) + mulps %xmm13, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2( -4 * SIZE, A1, LDA, 1, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm14, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1 (-16 * SIZE, A1, %xmm8) + mulps %xmm14, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_A1 (-12 * SIZE, A1, %xmm9) + mulps %xmm14, %xmm10 + addps %xmm10, %xmm2 + MOVUPS_A1 ( -8 * SIZE, A1, %xmm10) + mulps %xmm14, %xmm11 + addps %xmm11, %xmm3 + MOVUPS_A1 ( -4 * SIZE, A1, %xmm11) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 3 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L33 + ALIGN_3 + +.L34: + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm8) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_A1(-28 * SIZE, A2, %xmm9) + mulps %xmm12, %xmm10 + addps %xmm10, %xmm2 + MOVUPS_A1(-24 * SIZE, A2, %xmm10) + mulps %xmm12, %xmm11 + addps %xmm11, %xmm3 + MOVUPS_A1(-20 * SIZE, A2, %xmm11) + + mulps %xmm13, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm1 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm2 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm3 + + mulps %xmm14, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + mulps %xmm14, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + mulps %xmm14, %xmm10 + addps %xmm10, %xmm2 + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + mulps %xmm14, %xmm11 + addps %xmm11, %xmm3 + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L35: + testq $8, MM + je .L36 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm8) + MOVUPS_A1 (-28 * SIZE, A1, %xmm9) + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4) + MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm5) + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm8) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_A1(-28 * SIZE, A2, %xmm9) + + mulps %xmm13, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm1 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm2 + + mulps %xmm14, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm1 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L36: + testq $4, MM + je .L37 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm8) + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4) + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm8) + + mulps %xmm13, %xmm4 + addps %xmm4, %xmm0 + + mulps %xmm14, %xmm8 + addps %xmm8, %xmm0 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L37: + testq $2, MM + je .L38 + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A1, LDA), %xmm1 + movsd -32 * SIZE(A2), %xmm2 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + mulps %xmm14, %xmm2 + addps %xmm2, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L38: + testq $1, MM + je .L990 + + movss -32 * SIZE(Y1), %xmm8 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A1, LDA), %xmm1 + movss -32 * SIZE(A2), %xmm2 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + mulss %xmm14, %xmm2 + addss %xmm2, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + jmp .L990 + ALIGN_3 + +.L40: + cmpq $2, N + jne .L50 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + movss (X), %xmm12 + addq INCX, X + movss (X), %xmm13 + addq INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm12 + mulss %xmm0, %xmm13 + + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + +#ifdef ALIGNED_ACCESS + cmpq $3, M + jle .L47 + + testq $SIZE, A1 + je .L4X + + movss -32 * SIZE(Y1), %xmm8 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A2), %xmm1 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L4X: + testq $2 * SIZE, A1 + je .L4XX + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A2), %xmm1 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L4XX: +#endif + + movq MM, I + sarq $4, I + jle .L45 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm8) + MOVUPS_A1 (-28 * SIZE, A1, %xmm9) + MOVUPS_A1 (-24 * SIZE, A1, %xmm10) + MOVUPS_A1 (-20 * SIZE, A1, %xmm11) + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_A1(-32 * SIZE, A2, %xmm4) + MOVUPS_A1(-28 * SIZE, A2, %xmm5) + MOVUPS_A1(-24 * SIZE, A2, %xmm6) + MOVUPS_A1(-20 * SIZE, A2, %xmm7) + + decq I + jle .L44 + ALIGN_3 + +.L43: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1 (-16 * SIZE, A1, %xmm8) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_A1 (-12 * SIZE, A1, %xmm9) + mulps %xmm12, %xmm10 + addps %xmm10, %xmm2 + MOVUPS_A1 ( -8 * SIZE, A1, %xmm10) + mulps %xmm12, %xmm11 + addps %xmm11, %xmm3 + MOVUPS_A1 ( -4 * SIZE, A1, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm13, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + mulps %xmm13, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A1(-12 * SIZE, A2, %xmm5) + mulps %xmm13, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A1( -8 * SIZE, A2, %xmm6) + mulps %xmm13, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A1( -4 * SIZE, A2, %xmm7) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L43 + ALIGN_3 + +.L44: + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm2 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm3 + + mulps %xmm13, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + mulps %xmm13, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + mulps %xmm13, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + mulps %xmm13, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L45: + testq $8, MM + je .L46 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm8) + MOVUPS_A1 (-28 * SIZE, A1, %xmm9) + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm4) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_A1(-28 * SIZE, A2, %xmm5) + + mulps %xmm13, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + mulps %xmm13, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L46: + testq $4, MM + je .L47 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm8) + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm4) + mulps %xmm13, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L47: + testq $2, MM + je .L48 + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A2), %xmm1 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L48: + testq $1, MM + je .L990 + + movss -32 * SIZE(Y1), %xmm8 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A2), %xmm1 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + jmp .L990 + ALIGN_3 + +.L50: + cmpq $1, N + jne .L990 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + + movss (X), %xmm12 + + mulss ALPHA, %xmm12 + shufps $0, %xmm12, %xmm12 + +#ifdef ALIGNED_ACCESS + cmpq $3, M + jle .L57 + + testq $SIZE, A1 + je .L5X + + movss -32 * SIZE(Y1), %xmm8 + movss -32 * SIZE(A1), %xmm0 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L5X: + testq $2 * SIZE, A1 + je .L5XX + + movsd -32 * SIZE(Y1), %xmm8 + movsd -32 * SIZE(A1), %xmm0 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L5XX: +#endif + + movq MM, I + sarq $4, I + jle .L55 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm8) + MOVUPS_A1 (-28 * SIZE, A1, %xmm9) + MOVUPS_A1 (-24 * SIZE, A1, %xmm10) + MOVUPS_A1 (-20 * SIZE, A1, %xmm11) + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + decq I + jle .L54 + ALIGN_3 + +.L53: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1 (-16 * SIZE, A1, %xmm8) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_A1 (-12 * SIZE, A1, %xmm9) + mulps %xmm12, %xmm10 + addps %xmm10, %xmm2 + MOVUPS_A1 ( -8 * SIZE, A1, %xmm10) + mulps %xmm12, %xmm11 + addps %xmm11, %xmm3 + MOVUPS_A1 ( -4 * SIZE, A1, %xmm11) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 8 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L53 + ALIGN_3 + +.L54: + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + mulps %xmm12, %xmm10 + addps %xmm10, %xmm2 + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + mulps %xmm12, %xmm11 + addps %xmm11, %xmm3 + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L55: + testq $8, MM + je .L56 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm8) + MOVUPS_A1 (-28 * SIZE, A1, %xmm9) + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L56: + testq $4, MM + je .L57 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm8) + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L57: + testq $2, MM + je .L58 + + movsd -32 * SIZE(Y1), %xmm8 + movsd -32 * SIZE(A1), %xmm0 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L58: + testq $1, MM + je .L990 + + movss -32 * SIZE(Y1), %xmm8 + movss -32 * SIZE(A1), %xmm0 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + +#ifdef ALIGNED_ACCESS + jmp .L990 + ALIGN_3 + +.L100: + testq $2 * SIZE - 1, LDA + jne .L200 + + cmpq $4, N + jl .L110 + ALIGN_3 + +.L101: + subq $4, N + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + movss (X), %xmm12 + addq INCX, X + movss (X), %xmm13 + addq INCX, X + movss (X), %xmm14 + addq INCX, X + movss (X), %xmm15 + addq INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm12 + mulss %xmm0, %xmm13 + mulss %xmm0, %xmm14 + mulss %xmm0, %xmm15 + + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + shufps $0, %xmm14, %xmm14 + shufps $0, %xmm15, %xmm15 + + cmpq $3, M + jle .L107 + + testq $SIZE, A1 + je .L10X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A1, LDA), %xmm1 + movss -32 * SIZE(A2), %xmm2 + movss -32 * SIZE(A2, LDA), %xmm3 + + movss -32 * SIZE(Y1), %xmm8 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + mulss %xmm14, %xmm2 + addss %xmm2, %xmm8 + mulss %xmm15, %xmm3 + addss %xmm3, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L10X: + testq $2 * SIZE, A1 + je .L10XX + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A1, LDA), %xmm1 + movsd -32 * SIZE(A2), %xmm2 + movsd -32 * SIZE(A2, LDA), %xmm3 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + mulps %xmm14, %xmm2 + addps %xmm2, %xmm8 + mulps %xmm15, %xmm3 + addps %xmm3, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L10XX: + movhps -32 * SIZE(A1, LDA), %xmm8 + movhps -32 * SIZE(A2, LDA), %xmm9 + + movq MM, I + sarq $4, I + jle .L105 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + movaps -24 * SIZE(A1), %xmm6 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + decq I + jle .L104 + ALIGN_3 + +.L103: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -30 * SIZE(A1, LDA), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -26 * SIZE(A1, LDA), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -22 * SIZE(A1, LDA), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) +#endif + + shufps $0x4e, %xmm4, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -18 * SIZE(A1, LDA), %xmm8 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -32 * SIZE(A2), %xmm4 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -28 * SIZE(A2), %xmm5 + shufps $0x4e, %xmm8, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -24 * SIZE(A2), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A2), %xmm7 + mulps %xmm14, %xmm5 + addps %xmm5, %xmm1 + movaps -30 * SIZE(A2, LDA), %xmm4 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm2 + movaps -26 * SIZE(A2, LDA), %xmm5 + mulps %xmm14, %xmm7 + addps %xmm7, %xmm3 + movaps -22 * SIZE(A2, LDA), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) +#endif + + shufps $0x4e, %xmm4, %xmm9 + mulps %xmm15, %xmm9 + addps %xmm9, %xmm0 + movaps -18 * SIZE(A2, LDA), %xmm9 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm15, %xmm4 + addps %xmm4, %xmm1 + movaps -16 * SIZE(A1), %xmm4 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm15, %xmm5 + addps %xmm5, %xmm2 + movaps -12 * SIZE(A1), %xmm5 + shufps $0x4e, %xmm9, %xmm6 + mulps %xmm15, %xmm6 + addps %xmm6, %xmm3 + movaps -8 * SIZE(A1), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L103 + ALIGN_3 + +.L104: + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -30 * SIZE(A1, LDA), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -26 * SIZE(A1, LDA), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -22 * SIZE(A1, LDA), %xmm6 + + shufps $0x4e, %xmm4, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -18 * SIZE(A1, LDA), %xmm8 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -32 * SIZE(A2), %xmm4 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -28 * SIZE(A2), %xmm5 + shufps $0x4e, %xmm8, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -24 * SIZE(A2), %xmm6 + + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A2), %xmm7 + mulps %xmm14, %xmm5 + addps %xmm5, %xmm1 + movaps -30 * SIZE(A2, LDA), %xmm4 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm2 + movaps -26 * SIZE(A2, LDA), %xmm5 + mulps %xmm14, %xmm7 + addps %xmm7, %xmm3 + movaps -22 * SIZE(A2, LDA), %xmm6 + + shufps $0x4e, %xmm4, %xmm9 + mulps %xmm15, %xmm9 + addps %xmm9, %xmm0 + movaps -18 * SIZE(A2, LDA), %xmm9 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm15, %xmm4 + addps %xmm4, %xmm1 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm15, %xmm5 + addps %xmm5, %xmm2 + shufps $0x4e, %xmm9, %xmm6 + mulps %xmm15, %xmm6 + addps %xmm6, %xmm3 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L105: + testq $8, MM + je .L106 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + movaps -30 * SIZE(A1, LDA), %xmm6 + movaps -26 * SIZE(A1, LDA), %xmm7 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -32 * SIZE(A2), %xmm4 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -28 * SIZE(A2), %xmm10 + + shufps $0x4e, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -30 * SIZE(A2, LDA), %xmm11 + shufps $0x4e, %xmm7, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm1 + movaps %xmm7, %xmm8 + movaps -26 * SIZE(A2, LDA), %xmm7 + + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm1 + + shufps $0x4e, %xmm11, %xmm9 + mulps %xmm15, %xmm9 + addps %xmm9, %xmm0 + shufps $0x4e, %xmm7, %xmm11 + mulps %xmm15, %xmm11 + addps %xmm11, %xmm1 + movaps %xmm7, %xmm9 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L106: + testq $4, MM + je .L107 + + movaps -32 * SIZE(A1), %xmm4 + movaps -30 * SIZE(A1, LDA), %xmm5 + movaps -32 * SIZE(A2), %xmm6 + movaps -30 * SIZE(A2, LDA), %xmm7 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + shufps $0x4e, %xmm5, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + shufps $0x4e, %xmm7, %xmm9 + mulps %xmm15, %xmm9 + addps %xmm9, %xmm0 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L107: + testq $2, MM + je .L108 + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A1, LDA), %xmm5 + movsd -32 * SIZE(A2), %xmm6 + movsd -32 * SIZE(A2, LDA), %xmm7 + + movsd -32 * SIZE(Y1), %xmm0 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + addps %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L108: + testq $1, MM + je .L109 + + movss -32 * SIZE(Y1), %xmm0 + + movss -32 * SIZE(A1), %xmm4 + movss -32 * SIZE(A1, LDA), %xmm5 + movss -32 * SIZE(A2), %xmm6 + movss -32 * SIZE(A2, LDA), %xmm7 + + mulss %xmm12, %xmm4 + addss %xmm4, %xmm0 + mulss %xmm13, %xmm5 + addss %xmm5, %xmm0 + mulss %xmm14, %xmm6 + addss %xmm6, %xmm0 + mulss %xmm15, %xmm7 + addss %xmm7, %xmm0 + + movss %xmm0, -32 * SIZE(Y1) + ALIGN_3 + +.L109: + cmpq $4, N + jge .L101 + ALIGN_3 + +.L110: + testq N, N + jle .L990 + + cmpq $3, N + jne .L120 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + movss (X), %xmm12 + addq INCX, X + movss (X), %xmm13 + addq INCX, X + movss (X), %xmm14 + addq INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm12 + mulss %xmm0, %xmm13 + mulss %xmm0, %xmm14 + + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + shufps $0, %xmm14, %xmm14 + + cmpq $3, M + jle .L117 + + testq $SIZE, A1 + je .L11X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A1, LDA), %xmm1 + movss -32 * SIZE(A2), %xmm2 + + movss -32 * SIZE(Y1), %xmm8 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + mulss %xmm14, %xmm2 + addss %xmm2, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L11X: + testq $2 * SIZE, A1 + je .L11XX + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A1, LDA), %xmm1 + movsd -32 * SIZE(A2), %xmm2 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + mulps %xmm14, %xmm2 + addps %xmm2, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L11XX: + movhps -32 * SIZE(A1, LDA), %xmm8 + movhps -32 * SIZE(A2, LDA), %xmm9 + + movq MM, I + sarq $4, I + jle .L115 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + movaps -24 * SIZE(A1), %xmm6 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + decq I + jle .L114 + ALIGN_3 + +.L113: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -30 * SIZE(A1, LDA), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -26 * SIZE(A1, LDA), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -22 * SIZE(A1, LDA), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA) +#endif + + shufps $0x4e, %xmm4, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -18 * SIZE(A1, LDA), %xmm8 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -32 * SIZE(A2), %xmm4 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -28 * SIZE(A2), %xmm5 + shufps $0x4e, %xmm8, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -24 * SIZE(A2), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A2), %xmm7 + mulps %xmm14, %xmm5 + addps %xmm5, %xmm1 + movaps -16 * SIZE(A1), %xmm4 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm2 + movaps -12 * SIZE(A1), %xmm5 + mulps %xmm14, %xmm7 + addps %xmm7, %xmm3 + movaps -8 * SIZE(A1), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 3 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L113 + ALIGN_3 + +.L114: + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -30 * SIZE(A1, LDA), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -26 * SIZE(A1, LDA), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -22 * SIZE(A1, LDA), %xmm6 + + shufps $0x4e, %xmm4, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -18 * SIZE(A1, LDA), %xmm8 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -32 * SIZE(A2), %xmm4 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -28 * SIZE(A2), %xmm5 + shufps $0x4e, %xmm8, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -24 * SIZE(A2), %xmm6 + + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A2), %xmm7 + mulps %xmm14, %xmm5 + addps %xmm5, %xmm1 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm2 + mulps %xmm14, %xmm7 + addps %xmm7, %xmm3 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L115: + testq $8, MM + je .L116 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + movaps -30 * SIZE(A1, LDA), %xmm6 + movaps -26 * SIZE(A1, LDA), %xmm7 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -32 * SIZE(A2), %xmm4 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -28 * SIZE(A2), %xmm10 + + shufps $0x4e, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + shufps $0x4e, %xmm7, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm1 + movaps %xmm7, %xmm8 + + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm1 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L116: + testq $4, MM + je .L117 + + movaps -32 * SIZE(A1), %xmm4 + movaps -30 * SIZE(A1, LDA), %xmm5 + movaps -32 * SIZE(A2), %xmm6 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + shufps $0x4e, %xmm5, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L117: + testq $2, MM + je .L118 + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A1, LDA), %xmm5 + movsd -32 * SIZE(A2), %xmm6 + + movsd -32 * SIZE(Y1), %xmm0 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L118: + testq $1, MM + je .L990 + + movss -32 * SIZE(Y1), %xmm0 + + movss -32 * SIZE(A1), %xmm4 + movss -32 * SIZE(A1, LDA), %xmm5 + movss -32 * SIZE(A2), %xmm6 + + mulss %xmm12, %xmm4 + addss %xmm4, %xmm0 + mulss %xmm13, %xmm5 + addss %xmm5, %xmm0 + mulss %xmm14, %xmm6 + addss %xmm6, %xmm0 + + movss %xmm0, -32 * SIZE(Y1) + jmp .L990 + ALIGN_3 + +.L120: + cmpq $2, N + jl .L130 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + movss (X), %xmm12 + addq INCX, X + movss (X), %xmm13 + addq INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm12 + mulss %xmm0, %xmm13 + + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + + cmpq $3, M + jle .L127 + + testq $SIZE, A1 + je .L12X + + movss -32 * SIZE(Y1), %xmm8 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A2), %xmm1 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L12X: + testq $2 * SIZE, A1 + je .L12XX + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A2), %xmm1 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L12XX: + movhps -32 * SIZE(A2), %xmm8 + + movq MM, I + sarq $4, I + jle .L125 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + movaps -24 * SIZE(A1), %xmm6 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + decq I + jle .L124 + ALIGN_3 + +.L123: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -30 * SIZE(A1, LDA), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -26 * SIZE(A1, LDA), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -22 * SIZE(A1, LDA), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) +#endif + + shufps $0x4e, %xmm4, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -18 * SIZE(A1, LDA), %xmm8 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -16 * SIZE(A1), %xmm4 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -12 * SIZE(A1), %xmm5 + shufps $0x4e, %xmm8, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -8 * SIZE(A1), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L123 + ALIGN_3 + +.L124: + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -30 * SIZE(A1, LDA), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -26 * SIZE(A1, LDA), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -22 * SIZE(A1, LDA), %xmm6 + + shufps $0x4e, %xmm4, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -18 * SIZE(A1, LDA), %xmm8 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + shufps $0x4e, %xmm8, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L125: + testq $8, MM + je .L126 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + movaps -30 * SIZE(A2), %xmm6 + movaps -26 * SIZE(A2), %xmm7 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + + shufps $0x4e, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + shufps $0x4e, %xmm7, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm1 + movaps %xmm7, %xmm8 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L126: + testq $4, MM + je .L127 + + movaps -32 * SIZE(A1), %xmm4 + movaps -30 * SIZE(A2), %xmm5 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + shufps $0x4e, %xmm5, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L127: + testq $2, MM + je .L128 + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A2), %xmm1 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L128: + testq $1, MM + je .L990 + + movss -32 * SIZE(Y1), %xmm8 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A2), %xmm1 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + jmp .L990 + ALIGN_3 + +.L130: + cmpq $1, N + jne .L990 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + + movss (X), %xmm12 + + mulss ALPHA, %xmm12 + shufps $0, %xmm12, %xmm12 + + cmpq $3, M + jle .L137 + + testq $SIZE, A1 + je .L13X + + movss -32 * SIZE(Y1), %xmm8 + movss -32 * SIZE(A1), %xmm0 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L13X: + testq $2 * SIZE, A1 + je .L13XX + + movsd -32 * SIZE(Y1), %xmm8 + movsd -32 * SIZE(A1), %xmm0 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L13XX: + movq MM, I + sarq $4, I + jle .L135 + + movaps -32 * SIZE(A1), %xmm8 + movaps -28 * SIZE(A1), %xmm9 + movaps -24 * SIZE(A1), %xmm10 + movaps -20 * SIZE(A1), %xmm11 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + decq I + jle .L134 + ALIGN_3 + +.L133: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + movaps -16 * SIZE(A1), %xmm8 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + movaps -12 * SIZE(A1), %xmm9 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm2 + movaps -8 * SIZE(A1), %xmm10 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm3 + movaps -4 * SIZE(A1), %xmm11 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 8 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L133 + ALIGN_3 + +.L134: + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + mulps %xmm12, %xmm10 + addps %xmm10, %xmm2 + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + mulps %xmm12, %xmm11 + addps %xmm11, %xmm3 + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L135: + testq $8, MM + je .L136 + + movaps -32 * SIZE(A1), %xmm8 + movaps -28 * SIZE(A1), %xmm9 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L136: + testq $4, MM + je .L137 + + movaps -32 * SIZE(A1), %xmm8 + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L137: + testq $2, MM + je .L138 + + movsd -32 * SIZE(Y1), %xmm8 + movsd -32 * SIZE(A1), %xmm0 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L138: + testq $1, MM + je .L990 + + movss -32 * SIZE(Y1), %xmm8 + movss -32 * SIZE(A1), %xmm0 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + jmp .L990 + ALIGN_3 + +.L200: + testq $2 * SIZE, LDA + jne .L300 + + cmpq $4, N + jl .L210 + ALIGN_3 + +.L201: + subq $4, N + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + movss (X), %xmm12 + addq INCX, X + movss (X), %xmm13 + addq INCX, X + movss (X), %xmm14 + addq INCX, X + movss (X), %xmm15 + addq INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm12 + mulss %xmm0, %xmm13 + mulss %xmm0, %xmm14 + mulss %xmm0, %xmm15 + + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + shufps $0, %xmm14, %xmm14 + shufps $0, %xmm15, %xmm15 + + cmpq $3, M + jle .L207 + + testq $SIZE, A1 + je .L20X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A1, LDA), %xmm1 + movss -32 * SIZE(A2), %xmm2 + movss -32 * SIZE(A2, LDA), %xmm3 + + movss -32 * SIZE(Y1), %xmm8 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + mulss %xmm14, %xmm2 + addss %xmm2, %xmm8 + mulss %xmm15, %xmm3 + addss %xmm3, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L20X: + testq $2 * SIZE, A1 + je .L20XX + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A1, LDA), %xmm1 + movsd -32 * SIZE(A2), %xmm2 + movsd -32 * SIZE(A2, LDA), %xmm3 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + mulps %xmm14, %xmm2 + addps %xmm2, %xmm8 + mulps %xmm15, %xmm3 + addps %xmm3, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L20XX: + movaps -33 * SIZE(A1, LDA), %xmm8 + movaps -34 * SIZE(A2), %xmm9 + movaps -35 * SIZE(A2, LDA), %xmm10 + + movq MM, I + sarq $4, I + jle .L205 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + movaps -24 * SIZE(A1), %xmm6 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + decq I + jle .L204 + ALIGN_3 + +.L203: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -29 * SIZE(A1, LDA), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -25 * SIZE(A1, LDA), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -21 * SIZE(A1, LDA), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) +#endif + + movss %xmm4, %xmm8 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -17 * SIZE(A1, LDA), %xmm8 + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -30 * SIZE(A2), %xmm4 + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -26 * SIZE(A2), %xmm5 + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -22 * SIZE(A2), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + shufps $0x4e, %xmm4, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + movaps -18 * SIZE(A2), %xmm9 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm1 + movaps -31 * SIZE(A2, LDA), %xmm4 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm14, %xmm5 + addps %xmm5, %xmm2 + movaps -27 * SIZE(A2, LDA), %xmm5 + shufps $0x4e, %xmm9, %xmm6 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm3 + movaps -23 * SIZE(A2, LDA), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) +#endif + + movss %xmm4, %xmm10 + shufps $0x93, %xmm4, %xmm10 + mulps %xmm15, %xmm10 + addps %xmm10, %xmm0 + movaps -19 * SIZE(A2, LDA), %xmm10 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps %xmm15, %xmm4 + addps %xmm4, %xmm1 + movaps -16 * SIZE(A1), %xmm4 + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps %xmm15, %xmm5 + addps %xmm5, %xmm2 + movaps -12 * SIZE(A1), %xmm5 + movss %xmm10, %xmm6 + shufps $0x93, %xmm10, %xmm6 + mulps %xmm15, %xmm6 + addps %xmm6, %xmm3 + movaps -8 * SIZE(A1), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L203 + ALIGN_3 + +.L204: + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -29 * SIZE(A1, LDA), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -25 * SIZE(A1, LDA), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -21 * SIZE(A1, LDA), %xmm6 + + movss %xmm4, %xmm8 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -17 * SIZE(A1, LDA), %xmm8 + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -30 * SIZE(A2), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -26 * SIZE(A2), %xmm5 + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -22 * SIZE(A2), %xmm6 + + shufps $0x4e, %xmm4, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + movaps -18 * SIZE(A2), %xmm9 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm1 + movaps -31 * SIZE(A2, LDA), %xmm4 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm14, %xmm5 + addps %xmm5, %xmm2 + movaps -27 * SIZE(A2, LDA), %xmm5 + shufps $0x4e, %xmm9, %xmm6 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm3 + movaps -23 * SIZE(A2, LDA), %xmm6 + + movss %xmm4, %xmm10 + shufps $0x93, %xmm4, %xmm10 + mulps %xmm15, %xmm10 + addps %xmm10, %xmm0 + movaps -19 * SIZE(A2, LDA), %xmm10 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps %xmm15, %xmm4 + addps %xmm4, %xmm1 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps %xmm15, %xmm5 + addps %xmm5, %xmm2 + movss %xmm10, %xmm6 + shufps $0x93, %xmm10, %xmm6 + mulps %xmm15, %xmm6 + addps %xmm6, %xmm3 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L205: + testq $8, MM + je .L206 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + movaps -29 * SIZE(A1, LDA), %xmm6 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -30 * SIZE(A2), %xmm4 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -26 * SIZE(A2), %xmm5 + + movss %xmm6, %xmm8 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -25 * SIZE(A1, LDA), %xmm8 + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm1 + movaps -31 * SIZE(A2, LDA), %xmm6 + + shufps $0x4e, %xmm4, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + movaps -27 * SIZE(A2, LDA), %xmm7 + + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm1 + movaps %xmm5, %xmm9 + + movss %xmm6, %xmm10 + shufps $0x93, %xmm6, %xmm10 + mulps %xmm15, %xmm10 + addps %xmm10, %xmm0 + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + mulps %xmm15, %xmm6 + addps %xmm6, %xmm1 + movaps %xmm7, %xmm10 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L206: + testq $4, MM + je .L207 + + movaps -32 * SIZE(A1), %xmm4 + movaps -29 * SIZE(A1, LDA), %xmm5 + movaps -30 * SIZE(A2), %xmm6 + movaps -31 * SIZE(A2, LDA), %xmm7 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movss %xmm5, %xmm8 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + + shufps $0x4e, %xmm6, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + movss %xmm7, %xmm10 + shufps $0x93, %xmm7, %xmm10 + mulps %xmm15, %xmm10 + addps %xmm10, %xmm0 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L207: + testq $2, MM + je .L208 + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A1, LDA), %xmm5 + movsd -32 * SIZE(A2), %xmm6 + movsd -32 * SIZE(A2, LDA), %xmm7 + + movsd -32 * SIZE(Y1), %xmm0 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + addps %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L208: + testq $1, MM + je .L209 + + movss -32 * SIZE(Y1), %xmm0 + + movss -32 * SIZE(A1), %xmm4 + movss -32 * SIZE(A1, LDA), %xmm5 + movss -32 * SIZE(A2), %xmm6 + movss -32 * SIZE(A2, LDA), %xmm7 + + mulss %xmm12, %xmm4 + addss %xmm4, %xmm0 + mulss %xmm13, %xmm5 + addss %xmm5, %xmm0 + mulss %xmm14, %xmm6 + addss %xmm6, %xmm0 + mulss %xmm15, %xmm7 + addss %xmm7, %xmm0 + + movss %xmm0, -32 * SIZE(Y1) + ALIGN_3 + +.L209: + cmpq $4, N + jge .L201 + ALIGN_3 + +.L210: + cmpq $3, N + jne .L220 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + movss (X), %xmm12 + addq INCX, X + movss (X), %xmm13 + addq INCX, X + movss (X), %xmm14 + addq INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm12 + mulss %xmm0, %xmm13 + mulss %xmm0, %xmm14 + + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + shufps $0, %xmm14, %xmm14 + + cmpq $3, M + jle .L217 + + testq $SIZE, A1 + je .L21X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A1, LDA), %xmm1 + movss -32 * SIZE(A2), %xmm2 + + movss -32 * SIZE(Y1), %xmm8 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + mulss %xmm14, %xmm2 + addss %xmm2, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L21X: + testq $2 * SIZE, A1 + je .L21XX + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A1, LDA), %xmm1 + movsd -32 * SIZE(A2), %xmm2 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + mulps %xmm14, %xmm2 + addps %xmm2, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L21XX: + movaps -33 * SIZE(A1, LDA), %xmm8 + movaps -34 * SIZE(A2), %xmm9 + movaps -35 * SIZE(A2, LDA), %xmm10 + + movq MM, I + sarq $4, I + jle .L215 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + movaps -24 * SIZE(A1), %xmm6 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + decq I + jle .L214 + ALIGN_3 + +.L213: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -29 * SIZE(A1, LDA), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -25 * SIZE(A1, LDA), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -21 * SIZE(A1, LDA), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA) +#endif + + movss %xmm4, %xmm8 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -17 * SIZE(A1, LDA), %xmm8 + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -30 * SIZE(A2), %xmm4 + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -26 * SIZE(A2), %xmm5 + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -22 * SIZE(A2), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2) +#endif + + shufps $0x4e, %xmm4, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + movaps -18 * SIZE(A2), %xmm9 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm1 + movaps -16 * SIZE(A1), %xmm4 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm14, %xmm5 + addps %xmm5, %xmm2 + movaps -12 * SIZE(A1), %xmm5 + shufps $0x4e, %xmm9, %xmm6 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm3 + movaps -8 * SIZE(A1), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 3 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L213 + ALIGN_3 + +.L214: + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -29 * SIZE(A1, LDA), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -25 * SIZE(A1, LDA), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -21 * SIZE(A1, LDA), %xmm6 + + movss %xmm4, %xmm8 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -17 * SIZE(A1, LDA), %xmm8 + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -30 * SIZE(A2), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -26 * SIZE(A2), %xmm5 + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -22 * SIZE(A2), %xmm6 + + shufps $0x4e, %xmm4, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + movaps -18 * SIZE(A2), %xmm9 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm1 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm14, %xmm5 + addps %xmm5, %xmm2 + shufps $0x4e, %xmm9, %xmm6 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm3 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L215: + testq $8, MM + je .L216 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + movaps -29 * SIZE(A1, LDA), %xmm6 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -30 * SIZE(A2), %xmm4 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -26 * SIZE(A2), %xmm5 + + movss %xmm6, %xmm8 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -25 * SIZE(A1, LDA), %xmm8 + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm1 + + shufps $0x4e, %xmm4, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm1 + movaps %xmm5, %xmm9 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L216: + testq $4, MM + je .L217 + + movaps -32 * SIZE(A1), %xmm4 + movaps -29 * SIZE(A1, LDA), %xmm5 + movaps -30 * SIZE(A2), %xmm6 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movss %xmm5, %xmm8 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + + shufps $0x4e, %xmm6, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + movss %xmm7, %xmm10 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L217: + testq $2, MM + je .L218 + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A1, LDA), %xmm5 + movsd -32 * SIZE(A2), %xmm6 + + movsd -32 * SIZE(Y1), %xmm0 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L218: + testq $1, MM + je .L990 + + movss -32 * SIZE(Y1), %xmm0 + + movss -32 * SIZE(A1), %xmm4 + movss -32 * SIZE(A1, LDA), %xmm5 + movss -32 * SIZE(A2), %xmm6 + + mulss %xmm12, %xmm4 + addss %xmm4, %xmm0 + mulss %xmm13, %xmm5 + addss %xmm5, %xmm0 + mulss %xmm14, %xmm6 + addss %xmm6, %xmm0 + + movss %xmm0, -32 * SIZE(Y1) + jmp .L990 + ALIGN_4 + +.L220: + testq N, N + jle .L990 + + cmpq $2, N + jne .L230 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + movss (X), %xmm12 + addq INCX, X + movss (X), %xmm13 + addq INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm12 + mulss %xmm0, %xmm13 + + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + + cmpq $3, M + jle .L227 + + testq $SIZE, A1 + je .L22X + + movss -32 * SIZE(Y1), %xmm9 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A2), %xmm1 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm9 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm9 + + movss %xmm9, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L22X: + testq $2 * SIZE, A1 + je .L22XX + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A2), %xmm1 + + movsd -32 * SIZE(Y1), %xmm9 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm9 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm9 + + movlps %xmm9, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L22XX: + movaps -33 * SIZE(A1, LDA), %xmm8 + + movq MM, I + sarq $4, I + jle .L225 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + movaps -24 * SIZE(A1), %xmm6 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + decq I + jle .L224 + ALIGN_3 + +.L223: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -29 * SIZE(A2), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -25 * SIZE(A2), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -21 * SIZE(A2), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) +#endif + + movss %xmm4, %xmm8 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -17 * SIZE(A2), %xmm8 + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -16 * SIZE(A1), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -12 * SIZE(A1), %xmm5 + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -8 * SIZE(A1), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L223 + ALIGN_3 + +.L224: + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -29 * SIZE(A2), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -25 * SIZE(A2), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -21 * SIZE(A2), %xmm6 + + movss %xmm4, %xmm8 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -17 * SIZE(A2), %xmm8 + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L225: + testq $8, MM + je .L226 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -29 * SIZE(A2), %xmm6 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -25 * SIZE(A2), %xmm7 + + movss %xmm6, %xmm8 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + + movss %xmm7, %xmm6 + shufps $0x39, %xmm6, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm1 + movaps %xmm7, %xmm8 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L226: + testq $4, MM + je .L227 + + movaps -32 * SIZE(A1), %xmm4 + movaps -29 * SIZE(A2), %xmm5 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movss %xmm5, %xmm8 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L227: + testq $2, MM + je .L228 + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A2), %xmm1 + + movsd -32 * SIZE(Y1), %xmm9 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm9 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm9 + + movlps %xmm9, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L228: + testq $1, MM + je .L990 + + movss -32 * SIZE(Y1), %xmm9 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A2), %xmm1 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm9 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm9 + + movss %xmm9, -32 * SIZE(Y1) + jmp .L990 + ALIGN_3 + +.L230: + cmpq $1, N + jne .L990 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + + movss (X), %xmm12 + + mulss ALPHA, %xmm12 + shufps $0, %xmm12, %xmm12 + + cmpq $3, M + jle .L237 + + testq $SIZE, A1 + je .L23X + + movss -32 * SIZE(Y1), %xmm8 + movss -32 * SIZE(A1), %xmm0 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L23X: + testq $2 * SIZE, A1 + je .L23XX + + movsd -32 * SIZE(Y1), %xmm8 + movsd -32 * SIZE(A1), %xmm0 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L23XX: + testq $2 * SIZE, A1 + jne .L230 + + movq MM, I + sarq $4, I + jle .L235 + + movaps -32 * SIZE(A1), %xmm8 + movaps -28 * SIZE(A1), %xmm9 + movaps -24 * SIZE(A1), %xmm10 + movaps -20 * SIZE(A1), %xmm11 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + decq I + jle .L234 + ALIGN_3 + +.L233: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + movaps -16 * SIZE(A1), %xmm8 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + movaps -12 * SIZE(A1), %xmm9 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm2 + movaps -8 * SIZE(A1), %xmm10 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm3 + movaps -4 * SIZE(A1), %xmm11 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 8 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L233 + ALIGN_3 + +.L234: + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + mulps %xmm12, %xmm10 + addps %xmm10, %xmm2 + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + mulps %xmm12, %xmm11 + addps %xmm11, %xmm3 + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L235: + testq $8, MM + je .L236 + + movaps -32 * SIZE(A1), %xmm8 + movaps -28 * SIZE(A1), %xmm9 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L236: + testq $4, MM + je .L237 + + movaps -32 * SIZE(A1), %xmm8 + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L237: + testq $2, MM + je .L238 + + movsd -32 * SIZE(Y1), %xmm8 + movsd -32 * SIZE(A1), %xmm0 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L238: + testq $1, MM + je .L990 + + movss -32 * SIZE(Y1), %xmm8 + movss -32 * SIZE(A1), %xmm0 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + jmp .L990 + ALIGN_4 + +.L300: + cmpq $4, N + jl .L310 + ALIGN_3 + +.L301: + subq $4, N + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + movss (X), %xmm12 + addq INCX, X + movss (X), %xmm13 + addq INCX, X + movss (X), %xmm14 + addq INCX, X + movss (X), %xmm15 + addq INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm12 + mulss %xmm0, %xmm13 + mulss %xmm0, %xmm14 + mulss %xmm0, %xmm15 + + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + shufps $0, %xmm14, %xmm14 + shufps $0, %xmm15, %xmm15 + + cmpq $3, M + jle .L307 + + testq $SIZE, A1 + je .L30X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A1, LDA), %xmm1 + movss -32 * SIZE(A2), %xmm2 + movss -32 * SIZE(A2, LDA), %xmm3 + + movss -32 * SIZE(Y1), %xmm8 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + mulss %xmm14, %xmm2 + addss %xmm2, %xmm8 + mulss %xmm15, %xmm3 + addss %xmm3, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L30X: + testq $2 * SIZE, A1 + je .L30XX + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A1, LDA), %xmm1 + movsd -32 * SIZE(A2), %xmm2 + movsd -32 * SIZE(A2, LDA), %xmm3 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + mulps %xmm14, %xmm2 + addps %xmm2, %xmm8 + mulps %xmm15, %xmm3 + addps %xmm3, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L30XX: + movaps -35 * SIZE(A1, LDA), %xmm8 + movaps -34 * SIZE(A2), %xmm9 + movaps -33 * SIZE(A2, LDA), %xmm10 + + movq MM, I + sarq $4, I + jle .L305 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + movaps -24 * SIZE(A1), %xmm6 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + decq I + jle .L304 + ALIGN_3 + +.L303: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -31 * SIZE(A1, LDA), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -27 * SIZE(A1, LDA), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -23 * SIZE(A1, LDA), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) +#endif + + movss %xmm4, %xmm8 + shufps $0x93, %xmm4, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -19 * SIZE(A1, LDA), %xmm8 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -30 * SIZE(A2), %xmm4 + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -26 * SIZE(A2), %xmm5 + movss %xmm8, %xmm6 + shufps $0x93, %xmm8, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -22 * SIZE(A2), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + shufps $0x4e, %xmm4, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + movaps -18 * SIZE(A2), %xmm9 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm1 + movaps -29 * SIZE(A2, LDA), %xmm4 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm14, %xmm5 + addps %xmm5, %xmm2 + movaps -25 * SIZE(A2, LDA), %xmm5 + shufps $0x4e, %xmm9, %xmm6 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm3 + movaps -21 * SIZE(A2, LDA), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) +#endif + + movss %xmm4, %xmm10 + shufps $0x39, %xmm10, %xmm10 + mulps %xmm15, %xmm10 + addps %xmm10, %xmm0 + movaps -17 * SIZE(A2, LDA), %xmm10 + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + mulps %xmm15, %xmm4 + addps %xmm4, %xmm1 + movaps -16 * SIZE(A1), %xmm4 + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + mulps %xmm15, %xmm5 + addps %xmm5, %xmm2 + movaps -12 * SIZE(A1), %xmm5 + movss %xmm10, %xmm6 + shufps $0x39, %xmm6, %xmm6 + mulps %xmm15, %xmm6 + addps %xmm6, %xmm3 + movaps -8 * SIZE(A1), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L303 + ALIGN_3 + +.L304: + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -31 * SIZE(A1, LDA), %xmm4 + + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -27 * SIZE(A1, LDA), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -23 * SIZE(A1, LDA), %xmm6 + + movss %xmm4, %xmm8 + shufps $0x93, %xmm4, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -19 * SIZE(A1, LDA), %xmm8 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -30 * SIZE(A2), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -26 * SIZE(A2), %xmm5 + movss %xmm8, %xmm6 + shufps $0x93, %xmm8, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -22 * SIZE(A2), %xmm6 + + shufps $0x4e, %xmm4, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + movaps -18 * SIZE(A2), %xmm9 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm1 + movaps -29 * SIZE(A2, LDA), %xmm4 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm14, %xmm5 + addps %xmm5, %xmm2 + movaps -25 * SIZE(A2, LDA), %xmm5 + shufps $0x4e, %xmm9, %xmm6 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm3 + movaps -21 * SIZE(A2, LDA), %xmm6 + + movss %xmm4, %xmm10 + shufps $0x39, %xmm10, %xmm10 + mulps %xmm15, %xmm10 + addps %xmm10, %xmm0 + movaps -17 * SIZE(A2, LDA), %xmm10 + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + mulps %xmm15, %xmm4 + addps %xmm4, %xmm1 + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + mulps %xmm15, %xmm5 + addps %xmm5, %xmm2 + movss %xmm10, %xmm6 + shufps $0x39, %xmm6, %xmm6 + mulps %xmm15, %xmm6 + addps %xmm6, %xmm3 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L305: + testq $8, MM + je .L306 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -31 * SIZE(A1, LDA), %xmm6 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -27 * SIZE(A1, LDA), %xmm7 + + movss %xmm6, %xmm8 + shufps $0x93, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -30 * SIZE(A2), %xmm4 + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm1 + movaps %xmm7, %xmm8 + movaps -26 * SIZE(A2), %xmm5 + + shufps $0x4e, %xmm4, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + movaps -29 * SIZE(A2, LDA), %xmm6 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm1 + movaps %xmm5, %xmm9 + movaps -25 * SIZE(A2, LDA), %xmm7 + + movss %xmm6, %xmm10 + shufps $0x39, %xmm10, %xmm10 + mulps %xmm15, %xmm10 + addps %xmm10, %xmm0 + movss %xmm7, %xmm6 + shufps $0x39, %xmm6, %xmm6 + mulps %xmm15, %xmm6 + addps %xmm6, %xmm1 + movaps %xmm7, %xmm10 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L306: + testq $4, MM + je .L307 + + movaps -32 * SIZE(A1), %xmm4 + movaps -31 * SIZE(A1, LDA), %xmm5 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -30 * SIZE(A2), %xmm6 + movss %xmm5, %xmm8 + shufps $0x93, %xmm5, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -29 * SIZE(A2, LDA), %xmm7 + + shufps $0x4e, %xmm6, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + movss %xmm7, %xmm10 + shufps $0x39, %xmm10, %xmm10 + mulps %xmm15, %xmm10 + addps %xmm10, %xmm0 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L307: + testq $2, MM + je .L308 + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A1, LDA), %xmm5 + movsd -32 * SIZE(A2), %xmm6 + movsd -32 * SIZE(A2, LDA), %xmm7 + + movsd -32 * SIZE(Y1), %xmm0 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + addps %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L308: + testq $1, MM + je .L309 + + movss -32 * SIZE(Y1), %xmm0 + + movss -32 * SIZE(A1), %xmm4 + movss -32 * SIZE(A1, LDA), %xmm5 + movss -32 * SIZE(A2), %xmm6 + movss -32 * SIZE(A2, LDA), %xmm7 + + mulss %xmm12, %xmm4 + addss %xmm4, %xmm0 + mulss %xmm13, %xmm5 + addss %xmm5, %xmm0 + mulss %xmm14, %xmm6 + addss %xmm6, %xmm0 + mulss %xmm15, %xmm7 + addss %xmm7, %xmm0 + + movss %xmm0, -32 * SIZE(Y1) + ALIGN_3 + +.L309: + cmpq $4, N + jge .L301 + ALIGN_3 + +.L310: + cmpq $3, N + jne .L320 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + movss (X), %xmm12 + addq INCX, X + movss (X), %xmm13 + addq INCX, X + movss (X), %xmm14 + addq INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm12 + mulss %xmm0, %xmm13 + mulss %xmm0, %xmm14 + + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + shufps $0, %xmm14, %xmm14 + + cmpq $3, M + jle .L317 + + testq $SIZE, A1 + je .L31X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A1, LDA), %xmm1 + movss -32 * SIZE(A2), %xmm2 + + movss -32 * SIZE(Y1), %xmm8 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + mulss %xmm14, %xmm2 + addss %xmm2, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L31X: + testq $2 * SIZE, A1 + je .L31XX + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A1, LDA), %xmm1 + movsd -32 * SIZE(A2), %xmm2 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + mulps %xmm14, %xmm2 + addps %xmm2, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L31XX: + movaps -35 * SIZE(A1, LDA), %xmm8 + movaps -34 * SIZE(A2), %xmm9 + movaps -33 * SIZE(A2, LDA), %xmm10 + + movq MM, I + sarq $4, I + jle .L315 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + movaps -24 * SIZE(A1), %xmm6 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + decq I + jle .L314 + ALIGN_3 + +.L313: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -31 * SIZE(A1, LDA), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -27 * SIZE(A1, LDA), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -23 * SIZE(A1, LDA), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA) +#endif + + movss %xmm4, %xmm8 + shufps $0x93, %xmm4, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -19 * SIZE(A1, LDA), %xmm8 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -30 * SIZE(A2), %xmm4 + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -26 * SIZE(A2), %xmm5 + movss %xmm8, %xmm6 + shufps $0x93, %xmm8, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -22 * SIZE(A2), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2) +#endif + + shufps $0x4e, %xmm4, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + movaps -18 * SIZE(A2), %xmm9 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm1 + movaps -16 * SIZE(A1), %xmm4 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm14, %xmm5 + addps %xmm5, %xmm2 + movaps -12 * SIZE(A1), %xmm5 + shufps $0x4e, %xmm9, %xmm6 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm3 + movaps -8 * SIZE(A1), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 3 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L313 + ALIGN_3 + +.L314: + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -31 * SIZE(A1, LDA), %xmm4 + + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -27 * SIZE(A1, LDA), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -23 * SIZE(A1, LDA), %xmm6 + + movss %xmm4, %xmm8 + shufps $0x93, %xmm4, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -19 * SIZE(A1, LDA), %xmm8 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -30 * SIZE(A2), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -26 * SIZE(A2), %xmm5 + movss %xmm8, %xmm6 + shufps $0x93, %xmm8, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -22 * SIZE(A2), %xmm6 + + shufps $0x4e, %xmm4, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + movaps -18 * SIZE(A2), %xmm9 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm1 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm14, %xmm5 + addps %xmm5, %xmm2 + shufps $0x4e, %xmm9, %xmm6 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm3 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L315: + testq $8, MM + je .L316 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -31 * SIZE(A1, LDA), %xmm6 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -27 * SIZE(A1, LDA), %xmm7 + + movss %xmm6, %xmm8 + shufps $0x93, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -30 * SIZE(A2), %xmm4 + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm1 + movaps %xmm7, %xmm8 + movaps -26 * SIZE(A2), %xmm5 + + shufps $0x4e, %xmm4, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm1 + movaps %xmm5, %xmm9 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L316: + testq $4, MM + je .L317 + + movaps -32 * SIZE(A1), %xmm4 + movaps -31 * SIZE(A1, LDA), %xmm5 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -30 * SIZE(A2), %xmm6 + movss %xmm5, %xmm8 + shufps $0x93, %xmm5, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + + shufps $0x4e, %xmm6, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L317: + testq $2, MM + je .L318 + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A1, LDA), %xmm5 + movsd -32 * SIZE(A2), %xmm6 + + movsd -32 * SIZE(Y1), %xmm0 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L318: + testq $1, MM + je .L990 + + movss -32 * SIZE(Y1), %xmm0 + + movss -32 * SIZE(A1), %xmm4 + movss -32 * SIZE(A1, LDA), %xmm5 + movss -32 * SIZE(A2), %xmm6 + + mulss %xmm12, %xmm4 + addss %xmm4, %xmm0 + mulss %xmm13, %xmm5 + addss %xmm5, %xmm0 + mulss %xmm14, %xmm6 + addss %xmm6, %xmm0 + + movss %xmm0, -32 * SIZE(Y1) + jmp .L990 + ALIGN_3 + +.L320: + cmpq $2, N + jne .L330 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 1), A2 + + movss (X), %xmm12 + addq INCX, X + movss (X), %xmm13 + addq INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm12 + mulss %xmm0, %xmm13 + + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + + cmpq $3, M + jle .L327 + + testq $SIZE, A1 + je .L32X + + movss -32 * SIZE(Y1), %xmm9 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A2), %xmm1 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm9 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm9 + + movss %xmm9, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L32X: + testq $2 * SIZE, A1 + je .L32XX + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A2), %xmm1 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L32XX: + movaps -35 * SIZE(A1, LDA), %xmm8 + + movq MM, I + sarq $4, I + jle .L325 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + movaps -24 * SIZE(A1), %xmm6 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + decq I + jle .L324 + ALIGN_3 + +.L323: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -31 * SIZE(A2), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -27 * SIZE(A2), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -23 * SIZE(A2), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) +#endif + + movss %xmm4, %xmm8 + shufps $0x93, %xmm4, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -19 * SIZE(A2), %xmm8 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -16 * SIZE(A1), %xmm4 + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -12 * SIZE(A1), %xmm5 + movss %xmm8, %xmm6 + shufps $0x93, %xmm8, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -8 * SIZE(A1), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L323 + ALIGN_3 + +.L324: + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -31 * SIZE(A2), %xmm4 + + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -27 * SIZE(A2), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -23 * SIZE(A2), %xmm6 + + movss %xmm4, %xmm8 + shufps $0x93, %xmm4, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -19 * SIZE(A2), %xmm8 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movss %xmm8, %xmm6 + shufps $0x93, %xmm8, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L325: + testq $8, MM + je .L326 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -31 * SIZE(A2), %xmm6 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -27 * SIZE(A2), %xmm7 + + movss %xmm6, %xmm8 + shufps $0x93, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm1 + movaps %xmm7, %xmm8 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L326: + testq $4, MM + je .L327 + + movaps -32 * SIZE(A1), %xmm4 + movaps -31 * SIZE(A2), %xmm5 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movss %xmm5, %xmm8 + shufps $0x93, %xmm5, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L327: + testq $2, MM + je .L328 + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A2), %xmm1 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L328: + testq $1, MM + je .L990 + + movss -32 * SIZE(Y1), %xmm8 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A2), %xmm1 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + jmp .L990 + ALIGN_3 + +.L330: + cmpq $1, N + jne .L990 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + + movss (X), %xmm12 + + mulss ALPHA, %xmm12 + shufps $0, %xmm12, %xmm12 + + cmpq $3, M + jle .L337 + + testq $SIZE, A1 + je .L33X + + movss -32 * SIZE(Y1), %xmm8 + movss -32 * SIZE(A1), %xmm0 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L33X: + testq $2 * SIZE, A1 + je .L33XX + + movsd -32 * SIZE(Y1), %xmm8 + movsd -32 * SIZE(A1), %xmm0 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L33XX: + movq MM, I + sarq $4, I + jle .L335 + + movaps -32 * SIZE(A1), %xmm8 + movaps -28 * SIZE(A1), %xmm9 + movaps -24 * SIZE(A1), %xmm10 + movaps -20 * SIZE(A1), %xmm11 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + decq I + jle .L334 + ALIGN_3 + +.L333: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + movaps -16 * SIZE(A1), %xmm8 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + movaps -12 * SIZE(A1), %xmm9 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm2 + movaps -8 * SIZE(A1), %xmm10 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm3 + movaps -4 * SIZE(A1), %xmm11 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 8 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L333 + ALIGN_3 + +.L334: + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + mulps %xmm12, %xmm10 + addps %xmm10, %xmm2 + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + mulps %xmm12, %xmm11 + addps %xmm11, %xmm3 + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L335: + testq $8, MM + je .L336 + + movaps -32 * SIZE(A1), %xmm8 + movaps -28 * SIZE(A1), %xmm9 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L336: + testq $4, MM + je .L337 + + movaps -32 * SIZE(A1), %xmm8 + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L337: + testq $2, MM + je .L338 + + movsd -32 * SIZE(Y1), %xmm8 + movsd -32 * SIZE(A1), %xmm0 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L338: + testq $1, MM + je .L990 + + movss -32 * SIZE(Y1), %xmm8 + movss -32 * SIZE(A1), %xmm0 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + jmp .L990 +#endif + ALIGN_4 + + +.L990: + movq Y, Y1 + + movq M, %rax + sarq $3, %rax + jle .L994 + ALIGN_3 +.L992: + movsd 0 * SIZE(BUFFER), %xmm0 + movhps 2 * SIZE(BUFFER), %xmm0 + movsd 4 * SIZE(BUFFER), %xmm4 + movhps 6 * SIZE(BUFFER), %xmm4 + + pshufd $0x01, %xmm0, %xmm1 + pshufd $0x02, %xmm0, %xmm2 + pshufd $0x03, %xmm0, %xmm3 + + pshufd $0x01, %xmm4, %xmm5 + pshufd $0x02, %xmm4, %xmm6 + pshufd $0x03, %xmm4, %xmm7 + + addss (Y), %xmm0 + addq INCY, Y + addss (Y), %xmm1 + addq INCY, Y + addss (Y), %xmm2 + addq INCY, Y + addss (Y), %xmm3 + addq INCY, Y + addss (Y), %xmm4 + addq INCY, Y + addss (Y), %xmm5 + addq INCY, Y + addss (Y), %xmm6 + addq INCY, Y + addss (Y), %xmm7 + addq INCY, Y + + movss %xmm0, (Y1) + addq INCY, Y1 + movss %xmm1, (Y1) + addq INCY, Y1 + movss %xmm2, (Y1) + addq INCY, Y1 + movss %xmm3, (Y1) + addq INCY, Y1 + movss %xmm4, (Y1) + addq INCY, Y1 + movss %xmm5, (Y1) + addq INCY, Y1 + movss %xmm6, (Y1) + addq INCY, Y1 + movss %xmm7, (Y1) + addq INCY, Y1 + + addq $8 * SIZE, BUFFER + decq %rax + jg .L992 + ALIGN_3 + +.L994: + testq $7, M + jle .L999 + + testq $4, M + jle .L995 + + movsd 0 * SIZE(BUFFER), %xmm0 + movhps 2 * SIZE(BUFFER), %xmm0 + + pshufd $0x01, %xmm0, %xmm1 + pshufd $0x02, %xmm0, %xmm2 + pshufd $0x03, %xmm0, %xmm3 + + addss (Y), %xmm0 + addq INCY, Y + addss (Y), %xmm1 + addq INCY, Y + addss (Y), %xmm2 + addq INCY, Y + addss (Y), %xmm3 + addq INCY, Y + + movss %xmm0, (Y1) + addq INCY, Y1 + movss %xmm1, (Y1) + addq INCY, Y1 + movss %xmm2, (Y1) + addq INCY, Y1 + movss %xmm3, (Y1) + addq INCY, Y1 + + addq $4 * SIZE, BUFFER + ALIGN_3 + +.L995: + testq $2, M + jle .L996 + + movsd (BUFFER), %xmm0 + + pshufd $0x01, %xmm0, %xmm1 + + addss (Y), %xmm0 + addq INCY, Y + addss (Y), %xmm1 + addq INCY, Y + + movss %xmm0, (Y1) + addq INCY, Y1 + movss %xmm1, (Y1) + addq INCY, Y1 + + addq $2 * SIZE, BUFFER + ALIGN_3 + +.L996: + testq $1, M + jle .L999 + + movss (BUFFER), %xmm0 + + addss (Y), %xmm0 + + movss %xmm0, (Y1) + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + + ret + EPILOGUE diff --git a/kernel/x86_64/sgemv_t.S b/kernel/x86_64/sgemv_t.S new file mode 100644 index 0000000..052ff1a --- /dev/null +++ b/kernel/x86_64/sgemv_t.S @@ -0,0 +1,6370 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#if GEMV_UNROLL < 4 +#undef GEMV_UNROLL +#define GEMV_UNROLL 4 +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_A %rcx +#define OLD_LDA %r8 +#define STACK_INCX 8 + STACKSIZE(%rsp) +#define STACK_Y 16 + STACKSIZE(%rsp) +#define STACK_INCY 24 + STACKSIZE(%rsp) +#define STACK_BUFFER 32 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_M %rcx +#define OLD_N %rdx +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_LDA 48 + STACKSIZE(%rsp) +#define OLD_X 56 + STACKSIZE(%rsp) +#define STACK_INCX 64 + STACKSIZE(%rsp) +#define STACK_Y 72 + STACKSIZE(%rsp) +#define STACK_INCY 80 + STACKSIZE(%rsp) +#define STACK_BUFFER 88 + STACKSIZE(%rsp) + +#endif + +#define LDA %r8 +#define X %r9 + +#define INCX %rsi +#define INCY %rdi + +#define M %r10 +#define N %r11 +#define A %r12 +#define Y %r14 +#define BUFFER %r13 + +#define I %rax +#define A1 %rbx +#define A2 %rcx +#define LDA3 %rdx +#define X1 %rbp + +#define Y1 INCX + +#ifdef ALIGNED_ACCESS +#define MM %r15 +#else +#define MM M +#endif + +#define ALPHA %xmm7 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_M, M + movq OLD_N, N + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X +#else + movq OLD_M, M + movq OLD_N, N + movq OLD_A, A + movq OLD_LDA, LDA +#endif + + movq STACK_INCX, INCX + movq STACK_Y, Y + movq STACK_INCY, INCY + movq STACK_BUFFER, BUFFER + +#ifndef WINDOWS_ABI + pshufd $0, %xmm0, ALPHA +#else + pshufd $0, %xmm3, ALPHA +#endif + + leaq (,INCX, SIZE), INCX + leaq (,INCY, SIZE), INCY + leaq (,LDA, SIZE), LDA + + leaq (LDA, LDA, 2), LDA3 + +#ifdef ALIGNED_ACCESS + movq M, MM + testq $4 * SIZE - 1, A + je .L0X + cmpq $3, M + jle .L0X + + movq A, MM + sarq $BASE_SHIFT, MM + andq $3, MM + subq $4, MM + addq M, MM + +.L0X: +#endif + + testq M, M + jle .L999 + testq N, N + jle .L999 + ALIGN_4 + + subq $-32 * SIZE, A + +#ifdef ALIGNED_ACCESS + movq A, %rax + andq $4 * SIZE - 1, %rax + addq %rax, BUFFER +#endif + + movq BUFFER, X1 + + movq M, I + sarq $3, I + jle .L05 + ALIGN_4 + +.L02: + movss (X), %xmm0 + addq INCX, X + movss (X), %xmm1 + addq INCX, X + + movss (X), %xmm2 + addq INCX, X + movss (X), %xmm3 + addq INCX, X + + movss (X), %xmm4 + addq INCX, X + movss (X), %xmm5 + addq INCX, X + + movss (X), %xmm6 + addq INCX, X + movss (X), %xmm8 + addq INCX, X + + movss %xmm0, 0 * SIZE(X1) + movss %xmm1, 1 * SIZE(X1) + movss %xmm2, 2 * SIZE(X1) + movss %xmm3, 3 * SIZE(X1) + movss %xmm4, 4 * SIZE(X1) + movss %xmm5, 5 * SIZE(X1) + movss %xmm6, 6 * SIZE(X1) + movss %xmm8, 7 * SIZE(X1) + + addq $8 * SIZE, X1 + decq I + jg .L02 + ALIGN_4 + +.L05: + movq M, I + andq $7, I + jle .L10 + ALIGN_2 + +.L06: + movss (X), %xmm0 + addq INCX, X + movss %xmm0, 0 * SIZE(X1) + addq $SIZE, X1 + decq I + jg .L06 + ALIGN_4 + +.L10: + movq Y, Y1 + +#ifdef ALIGNED_ACCESS + testq $4 * SIZE - 1, LDA + jne .L100 +#endif + +#if GEMV_UNROLL >= 8 + + cmpq $8, N + jl .L20 + ALIGN_3 + +.L11: + subq $8, N + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 4), A2 + leaq (A1, LDA, 8), A + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + +#ifdef ALIGNED_ACCESS + cmpq $3, M + jle .L17 + + testq $SIZE, A1 + je .L1X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA, 1), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A1, LDA, 2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + movss -32 * SIZE(A1, LDA3, 1), %xmm3 + mulss %xmm4, %xmm3 + addss %xmm3, %xmm11 + movss -32 * SIZE(A2), %xmm0 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm12 + movss -32 * SIZE(A2, LDA, 1), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm13 + movss -32 * SIZE(A2, LDA, 2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm14 + movss -32 * SIZE(A2, LDA3, 1), %xmm3 + mulss %xmm4, %xmm3 + addss %xmm3, %xmm15 + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, X1 + ALIGN_3 + +.L1X: + testq $2 * SIZE, A1 + je .L1XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA, 1), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A1, LDA, 2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd -32 * SIZE(A1, LDA3, 1), %xmm3 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + movsd -32 * SIZE(A2), %xmm0 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm12 + movsd -32 * SIZE(A2, LDA, 1), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm13 + movsd -32 * SIZE(A2, LDA, 2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm14 + movsd -32 * SIZE(A2, LDA3, 1), %xmm3 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm15 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 + +.L1XX: +#endif + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + +#ifdef PREFETCHW + PREFETCHW 8 * SIZE(Y1) +#endif + + movq MM, I + sarq $4, I + jle .L15 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A2 (-32 * SIZE, A1, LDA, 2, %xmm2) + MOVUPS_A2 (-32 * SIZE, A1, LDA3, 1, %xmm3) + + decq I + jle .L13 + ALIGN_4 + +.L12: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-32 * SIZE, A2, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm2) + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm3) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA, 1) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm12 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm13 + MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm14 + MOVUPS_A2 (-28 * SIZE, A1, LDA, 2, %xmm2) + mulps %xmm4, %xmm3 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm3, %xmm15 + MOVUPS_A2 (-28 * SIZE, A1, LDA3, 1, %xmm3) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA, 2) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A2, %xmm0) + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm1) + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A2 (-28 * SIZE, A2, LDA, 2, %xmm2) + mulps %xmm5, %xmm3 + addps %xmm3, %xmm11 + MOVUPS_A2 (-28 * SIZE, A2, LDA3, 1, %xmm3) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA3) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm12 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + mulps %xmm5, %xmm1 + addps %xmm1, %xmm13 + MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm5, %xmm2 + addps %xmm2, %xmm14 + MOVUPS_A2 (-24 * SIZE, A1, LDA, 2, %xmm2) + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm3, %xmm15 + MOVUPS_A2 (-24 * SIZE, A1, LDA3, 1, %xmm3) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A2, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-24 * SIZE, A2, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A2 (-24 * SIZE, A2, LDA, 2, %xmm2) + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + MOVUPS_A2 (-24 * SIZE, A2, LDA3, 1, %xmm3) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm12 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA, 1) +#endif + + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm13 + MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm14 + MOVUPS_A2 (-20 * SIZE, A1, LDA, 2, %xmm2) + mulps %xmm4, %xmm3 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm3, %xmm15 + MOVUPS_A2 (-20 * SIZE, A1, LDA3, 1, %xmm3) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA, 2) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A2, %xmm0) + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-20 * SIZE, A2, LDA, 1, %xmm1) + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A2 (-20 * SIZE, A2, LDA, 2, %xmm2) + mulps %xmm5, %xmm3 + addps %xmm3, %xmm11 + MOVUPS_A2 (-20 * SIZE, A2, LDA3, 1, %xmm3) + mulps %xmm5, %xmm0 + addps %xmm0, %xmm12 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA3) +#endif + + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + mulps %xmm5, %xmm1 + addps %xmm1, %xmm13 + MOVUPS_A2 (-16 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm5, %xmm2 + addps %xmm2, %xmm14 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) +#endif + + MOVUPS_A2 (-16 * SIZE, A1, LDA, 2, %xmm2) + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm3, %xmm15 + MOVUPS_A2 (-16 * SIZE, A1, LDA3, 1, %xmm3) + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + + decq I + jg .L12 + ALIGN_4 + +.L13: + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-32 * SIZE, A2, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm2) + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm3) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm12 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm13 + MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm14 + MOVUPS_A2 (-28 * SIZE, A1, LDA, 2, %xmm2) + mulps %xmm4, %xmm3 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm3, %xmm15 + MOVUPS_A2 (-28 * SIZE, A1, LDA3, 1, %xmm3) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A2, %xmm0) + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm1) + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A2 (-28 * SIZE, A2, LDA, 2, %xmm2) + mulps %xmm5, %xmm3 + addps %xmm3, %xmm11 + MOVUPS_A2 (-28 * SIZE, A2, LDA3, 1, %xmm3) + mulps %xmm5, %xmm0 + addps %xmm0, %xmm12 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + mulps %xmm5, %xmm1 + addps %xmm1, %xmm13 + MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm5, %xmm2 + addps %xmm2, %xmm14 + MOVUPS_A2 (-24 * SIZE, A1, LDA, 2, %xmm2) + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm3, %xmm15 + MOVUPS_A2 (-24 * SIZE, A1, LDA3, 1, %xmm3) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A2, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-24 * SIZE, A2, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A2 (-24 * SIZE, A2, LDA, 2, %xmm2) + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + MOVUPS_A2 (-24 * SIZE, A2, LDA3, 1, %xmm3) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm12 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm13 + MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm14 + MOVUPS_A2 (-20 * SIZE, A1, LDA, 2, %xmm2) + mulps %xmm4, %xmm3 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm3, %xmm15 + MOVUPS_A2 (-20 * SIZE, A1, LDA3, 1, %xmm3) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A2, %xmm0) + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-20 * SIZE, A2, LDA, 1, %xmm1) + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A2 (-20 * SIZE, A2, LDA, 2, %xmm2) + mulps %xmm5, %xmm3 + addps %xmm3, %xmm11 + MOVUPS_A2 (-20 * SIZE, A2, LDA3, 1, %xmm3) + mulps %xmm5, %xmm0 + addps %xmm0, %xmm12 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm13 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm14 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm3, %xmm15 + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + ALIGN_4 + +.L15: + testq $8, MM + jle .L16 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A2 (-32 * SIZE, A1, LDA, 2, %xmm2) + MOVUPS_A2 (-32 * SIZE, A1, LDA3, 1, %xmm3) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-32 * SIZE, A2, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm2) + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm3) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm12 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm13 + MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm14 + MOVUPS_A2 (-28 * SIZE, A1, LDA, 2, %xmm2) + mulps %xmm4, %xmm3 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm3, %xmm15 + MOVUPS_A2 (-28 * SIZE, A1, LDA3, 1, %xmm3) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A2, %xmm0) + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm1) + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A2 (-28 * SIZE, A2, LDA, 2, %xmm2) + mulps %xmm5, %xmm3 + addps %xmm3, %xmm11 + MOVUPS_A2 (-28 * SIZE, A2, LDA3, 1, %xmm3) + mulps %xmm5, %xmm0 + addps %xmm0, %xmm12 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm13 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm14 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm15 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L16: + testq $4, MM + jle .L17 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A2 (-32 * SIZE, A1, LDA, 2, %xmm2) + MOVUPS_A2 (-32 * SIZE, A1, LDA3, 1, %xmm3) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-32 * SIZE, A2, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm2) + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm3) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm12 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm13 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm14 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm15 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L17: + testq $2, MM + jle .L18 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA, 1), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A1, LDA, 2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd -32 * SIZE(A1, LDA3, 1), %xmm3 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + movsd -32 * SIZE(A2), %xmm0 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm12 + movsd -32 * SIZE(A2, LDA, 1), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm13 + movsd -32 * SIZE(A2, LDA, 2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm14 + movsd -32 * SIZE(A2, LDA3, 1), %xmm3 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm15 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L18: + testq $1, MM + jle .L19 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA, 1), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A1, LDA, 2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + movss -32 * SIZE(A1, LDA3, 1), %xmm3 + mulss %xmm4, %xmm3 + addss %xmm3, %xmm11 + movss -32 * SIZE(A2), %xmm0 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm12 + movss -32 * SIZE(A2, LDA, 1), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm13 + movss -32 * SIZE(A2, LDA, 2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm14 + movss -32 * SIZE(A2, LDA3, 1), %xmm3 + mulss %xmm4, %xmm3 + addss %xmm3, %xmm15 + ALIGN_4 + +.L19: +#ifdef HAVE_SSE3 + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + haddps %xmm10, %xmm8 + + pshufd $0x1, %xmm8, %xmm9 + pshufd $0x2, %xmm8, %xmm10 + pshufd $0x3, %xmm8, %xmm11 + + haddps %xmm13, %xmm12 + haddps %xmm15, %xmm14 + haddps %xmm14, %xmm12 + + pshufd $0x1, %xmm12, %xmm13 + pshufd $0x2, %xmm12, %xmm14 + pshufd $0x3, %xmm12, %xmm15 +#else + movaps %xmm8, %xmm0 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm0 + + movaps %xmm10, %xmm1 + unpcklps %xmm11, %xmm10 + unpckhps %xmm11, %xmm1 + + movaps %xmm8, %xmm9 + unpcklps %xmm10, %xmm8 + unpckhps %xmm10, %xmm9 + + movaps %xmm0, %xmm10 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm0, %xmm10 + addps %xmm10, %xmm8 + + pshufd $0x2, %xmm8, %xmm9 + pshufd $0x1, %xmm8, %xmm10 + pshufd $0x3, %xmm8, %xmm11 + + movaps %xmm12, %xmm0 + unpcklps %xmm13, %xmm12 + unpckhps %xmm13, %xmm0 + + movaps %xmm14, %xmm1 + unpcklps %xmm15, %xmm14 + unpckhps %xmm15, %xmm1 + + movaps %xmm12, %xmm13 + unpcklps %xmm14, %xmm12 + unpckhps %xmm14, %xmm13 + + movaps %xmm0, %xmm14 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm14 + + addps %xmm13, %xmm12 + addps %xmm0, %xmm14 + addps %xmm14, %xmm12 + + pshufd $0x2, %xmm12, %xmm13 + pshufd $0x1, %xmm12, %xmm14 + pshufd $0x3, %xmm12, %xmm15 +#endif + + mulss ALPHA, %xmm8 + mulss ALPHA, %xmm9 + mulss ALPHA, %xmm10 + mulss ALPHA, %xmm11 + mulss ALPHA, %xmm12 + mulss ALPHA, %xmm13 + mulss ALPHA, %xmm14 + mulss ALPHA, %xmm15 + + addss (Y), %xmm8 + addq INCY, Y + addss (Y), %xmm9 + addq INCY, Y + addss (Y), %xmm10 + addq INCY, Y + addss (Y), %xmm11 + addq INCY, Y + addss (Y), %xmm12 + addq INCY, Y + addss (Y), %xmm13 + addq INCY, Y + addss (Y), %xmm14 + addq INCY, Y + addss (Y), %xmm15 + addq INCY, Y + + movss %xmm8, (Y1) + addq INCY, Y1 + movss %xmm9, (Y1) + addq INCY, Y1 + movss %xmm10, (Y1) + addq INCY, Y1 + movss %xmm11, (Y1) + addq INCY, Y1 + movss %xmm12, (Y1) + addq INCY, Y1 + movss %xmm13, (Y1) + addq INCY, Y1 + movss %xmm14, (Y1) + addq INCY, Y1 + movss %xmm15, (Y1) + addq INCY, Y1 + + cmpq $8, N + jge .L11 + ALIGN_4 + +.L20: +#endif + + cmpq $4, N + jl .L30 + +#if GEMV_UNROLL == 4 + ALIGN_3 + +.L21: +#endif + subq $4, N + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifdef ALIGNED_ACCESS + cmpq $3, M + jle .L27 + + testq $SIZE, A1 + je .L2X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + movss -32 * SIZE(A2, LDA), %xmm3 + mulss %xmm4, %xmm3 + addss %xmm3, %xmm11 + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, X1 + ALIGN_3 + +.L2X: + testq $2 * SIZE, A1 + je .L2XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd -32 * SIZE(A2, LDA), %xmm3 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 + +.L2XX: +#endif + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + +#if (GEMV_UNROLL == 4) && defined(PREFETCHW) + PREFETCHW 4 * SIZE(Y1) +#endif + + movq MM, I + sarq $4, I + jle .L25 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-32 * SIZE, A2, %xmm2) + MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm3) + + MOVUPS_A1 (-28 * SIZE, A1, %xmm12) + MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm13) + MOVUPS_A1 (-28 * SIZE, A2, %xmm14) + MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm15) + + decq I + jle .L23 + ALIGN_4 + +.L22: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-24 * SIZE, A2, %xmm2) + mulps %xmm4, %xmm3 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm3, %xmm11 + MOVUPS_A2 (-24 * SIZE, A2, LDA, 1, %xmm3) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) +#endif + + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm13 + addps %xmm13, %xmm9 + MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm13) + mulps %xmm5, %xmm14 + addps %xmm14, %xmm10 + MOVUPS_A1 (-20 * SIZE, A2, %xmm14) + mulps %xmm5, %xmm15 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm15, %xmm11 + MOVUPS_A2 (-20 * SIZE, A2, LDA, 1, %xmm15) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-16 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-16 * SIZE, A2, %xmm2) + mulps %xmm4, %xmm3 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm3, %xmm11 + MOVUPS_A2 (-16 * SIZE, A2, LDA, 1, %xmm3) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) +#endif + + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + MOVUPS_A1 (-12 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm13 + addps %xmm13, %xmm9 + MOVUPS_A2 (-12 * SIZE, A1, LDA, 1, %xmm13) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) +#endif + + mulps %xmm5, %xmm14 + addps %xmm14, %xmm10 + MOVUPS_A1 (-12 * SIZE, A2, %xmm14) + mulps %xmm5, %xmm15 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm15, %xmm11 + MOVUPS_A2 (-12 * SIZE, A2, LDA, 1, %xmm15) + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + + decq I + jg .L22 + ALIGN_4 + +.L23: + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-24 * SIZE, A2, %xmm2) + mulps %xmm4, %xmm3 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm3, %xmm11 + MOVUPS_A2 (-24 * SIZE, A2, LDA, 1, %xmm3) + + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm13 + addps %xmm13, %xmm9 + MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm13) + mulps %xmm5, %xmm14 + addps %xmm14, %xmm10 + MOVUPS_A1 (-20 * SIZE, A2, %xmm14) + mulps %xmm5, %xmm15 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm15, %xmm11 + MOVUPS_A2 (-20 * SIZE, A2, LDA, 1, %xmm15) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + mulps %xmm4, %xmm3 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm3, %xmm11 + + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + mulps %xmm5, %xmm13 + addps %xmm13, %xmm9 + mulps %xmm5, %xmm14 + addps %xmm14, %xmm10 + mulps %xmm5, %xmm15 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm15, %xmm11 + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + ALIGN_4 + +.L25: + testq $8, MM + jle .L26 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A1 (-32 * SIZE, A2, %xmm2) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm3) + mulps %xmm4, %xmm3 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm3, %xmm11 + + MOVUPS_A1 (-28 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm13) + mulps %xmm5, %xmm13 + addps %xmm13, %xmm9 + MOVUPS_A1 (-28 * SIZE, A2, %xmm14) + mulps %xmm5, %xmm14 + addps %xmm14, %xmm10 + MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm15) + mulps %xmm5, %xmm15 + addps %xmm15, %xmm11 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L26: + testq $4, MM + jle .L27 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A1 (-32 * SIZE, A2, %xmm2) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm3) + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L27: + testq $2, MM + jle .L28 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd -32 * SIZE(A2, LDA), %xmm3 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L28: + testq $1, MM + jle .L29 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + movss -32 * SIZE(A2, LDA), %xmm3 + mulss %xmm4, %xmm3 + addss %xmm3, %xmm11 + ALIGN_4 + +.L29: +#ifdef HAVE_SSE3 + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + haddps %xmm10, %xmm8 + + pshufd $0x1, %xmm8, %xmm9 + pshufd $0x2, %xmm8, %xmm10 + pshufd $0x3, %xmm8, %xmm11 +#else + movaps %xmm8, %xmm0 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm0 + + movaps %xmm10, %xmm1 + unpcklps %xmm11, %xmm10 + unpckhps %xmm11, %xmm1 + + movaps %xmm8, %xmm9 + unpcklps %xmm10, %xmm8 + unpckhps %xmm10, %xmm9 + + movaps %xmm0, %xmm10 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm0, %xmm10 + addps %xmm10, %xmm8 + + pshufd $0x2, %xmm8, %xmm9 + pshufd $0x1, %xmm8, %xmm10 + pshufd $0x3, %xmm8, %xmm11 +#endif + + mulss ALPHA, %xmm8 + mulss ALPHA, %xmm9 + mulss ALPHA, %xmm10 + mulss ALPHA, %xmm11 + + addss (Y), %xmm8 + addq INCY, Y + addss (Y), %xmm9 + addq INCY, Y + addss (Y), %xmm10 + addq INCY, Y + addss (Y), %xmm11 + addq INCY, Y + + movss %xmm8, (Y1) + addq INCY, Y1 + movss %xmm9, (Y1) + addq INCY, Y1 + movss %xmm10, (Y1) + addq INCY, Y1 + movss %xmm11, (Y1) + addq INCY, Y1 + +#if GEMV_UNROLL == 4 + cmpq $4, N + jge .L21 +#endif + ALIGN_4 + +.L30: + cmpq $3, N + jne .L40 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + +#ifdef ALIGNED_ACCESS + cmpq $3, M + jle .L37 + + testq $SIZE, A1 + je .L3X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + movss -32 * SIZE(A2, LDA), %xmm3 + mulss %xmm4, %xmm3 + addss %xmm3, %xmm11 + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, X1 + ALIGN_3 + +.L3X: + testq $2 * SIZE, A1 + je .L3XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd -32 * SIZE(A2, LDA), %xmm3 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 + +.L3XX: +#endif + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + +#if (GEMV_UNROLL == 4) && defined(PREFETCHW) + PREFETCHW 4 * SIZE(Y1) +#endif + + movq MM, I + sarq $4, I + jle .L35 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-32 * SIZE, A2, %xmm2) + + MOVUPS_A1 (-28 * SIZE, A1, %xmm12) + MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm13) + MOVUPS_A1 (-28 * SIZE, A2, %xmm14) + + decq I + jle .L33 + ALIGN_4 + +.L32: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm2, %xmm10 + MOVUPS_A1 (-24 * SIZE, A2, %xmm2) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA) +#endif + + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm13 + addps %xmm13, %xmm9 + MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm13) + mulps %xmm5, %xmm14 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm14, %xmm10 + MOVUPS_A1 (-20 * SIZE, A2, %xmm14) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-16 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm2, %xmm10 + MOVUPS_A1 (-16 * SIZE, A2, %xmm2) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(X1) +#endif + + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + MOVUPS_A1 (-12 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm13 + addps %xmm13, %xmm9 + MOVUPS_A2 (-12 * SIZE, A1, LDA, 1, %xmm13) + mulps %xmm5, %xmm14 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm14, %xmm10 + MOVUPS_A1 (-12 * SIZE, A2, %xmm14) + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + + decq I + jg .L32 + ALIGN_4 + +.L33: + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm2, %xmm10 + MOVUPS_A1 (-24 * SIZE, A2, %xmm2) + + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm13 + addps %xmm13, %xmm9 + MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm13) + mulps %xmm5, %xmm14 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm14, %xmm10 + MOVUPS_A1 (-20 * SIZE, A2, %xmm14) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + mulps %xmm5, %xmm13 + addps %xmm13, %xmm9 + mulps %xmm5, %xmm14 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm14, %xmm10 + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + ALIGN_4 + +.L35: + testq $8, MM + jle .L36 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A1 (-32 * SIZE, A2, %xmm2) + mulps %xmm4, %xmm2 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm2, %xmm10 + + MOVUPS_A1 (-28 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm13) + mulps %xmm5, %xmm13 + addps %xmm13, %xmm9 + MOVUPS_A1 (-28 * SIZE, A2, %xmm14) + mulps %xmm5, %xmm14 + addps %xmm14, %xmm10 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L36: + testq $4, MM + jle .L37 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A1 (-32 * SIZE, A2, %xmm2) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L37: + testq $2, MM + jle .L38 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L38: + testq $1, MM + jle .L39 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + ALIGN_4 + +.L39: +#ifdef HAVE_SSE3 + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + haddps %xmm10, %xmm8 + + pshufd $0x1, %xmm8, %xmm9 + pshufd $0x2, %xmm8, %xmm10 +#else + movaps %xmm8, %xmm0 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm0 + + movaps %xmm10, %xmm1 + unpcklps %xmm11, %xmm10 + unpckhps %xmm11, %xmm1 + + movaps %xmm8, %xmm9 + unpcklps %xmm10, %xmm8 + unpckhps %xmm10, %xmm9 + + movaps %xmm0, %xmm10 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm0, %xmm10 + addps %xmm10, %xmm8 + + pshufd $0x2, %xmm8, %xmm9 + pshufd $0x1, %xmm8, %xmm10 +#endif + + mulss ALPHA, %xmm8 + mulss ALPHA, %xmm9 + mulss ALPHA, %xmm10 + + addss (Y), %xmm8 + addq INCY, Y + addss (Y), %xmm9 + addq INCY, Y + addss (Y), %xmm10 + addq INCY, Y + + movss %xmm8, (Y1) + addq INCY, Y1 + movss %xmm9, (Y1) + addq INCY, Y1 + movss %xmm10, (Y1) + addq INCY, Y1 + jmp .L999 + ALIGN_4 + +.L40: + cmpq $2, N + jne .L50 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifdef ALIGNED_ACCESS + cmpq $3, M + jle .L47 + + testq $SIZE, A1 + je .L4X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A2), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, X1 + ALIGN_3 + +.L4X: + testq $2 * SIZE, A1 + je .L4XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A2), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 + +.L4XX: +#endif + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + + movq MM, I + sarq $4, I + jle .L45 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A1 (-32 * SIZE, A2, %xmm1) + MOVUPS_A1 (-28 * SIZE, A1, %xmm12) + MOVUPS_A1 (-28 * SIZE, A2, %xmm13) + + decq I + jle .L43 + ALIGN_4 + +.L42: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm1 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm1, %xmm9 + MOVUPS_A1 (-24 * SIZE, A2, %xmm1) + + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm13 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm13, %xmm9 + MOVUPS_A1 (-20 * SIZE, A2, %xmm13) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm1 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm1, %xmm9 + MOVUPS_A1 (-16 * SIZE, A2, %xmm1) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) +#endif + + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + MOVUPS_A1 (-12 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm13 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm13, %xmm9 + MOVUPS_A1 (-12 * SIZE, A2, %xmm13) + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + + decq I + jg .L42 + ALIGN_4 + +.L43: + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm1 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm1, %xmm9 + MOVUPS_A1 (-24 * SIZE, A2, %xmm1) + + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm13 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm13, %xmm9 + MOVUPS_A1 (-20 * SIZE, A2, %xmm13) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm4, %xmm1 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm1, %xmm9 + + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + mulps %xmm5, %xmm13 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm13, %xmm9 + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + ALIGN_4 + +.L45: + testq $8, MM + jle .L46 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-32 * SIZE, A2, %xmm1) + mulps %xmm4, %xmm1 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm1, %xmm9 + + MOVUPS_A1 (-28 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + MOVUPS_A1 (-28 * SIZE, A2, %xmm13) + mulps %xmm5, %xmm13 + addps %xmm13, %xmm9 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L46: + testq $4, MM + jle .L47 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-32 * SIZE, A2, %xmm1) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L47: + testq $2, MM + jle .L48 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A2), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L48: + testq $1, MM + jle .L49 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A2), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + ALIGN_4 + +.L49: +#ifdef HAVE_SSE3 + haddps %xmm9, %xmm8 + haddps %xmm8, %xmm8 +#else + movaps %xmm8, %xmm10 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm10 + + addps %xmm10, %xmm8 + movhlps %xmm8, %xmm9 + addps %xmm9, %xmm8 +#endif + + pshufd $0x1, %xmm8, %xmm9 + + mulss ALPHA, %xmm8 + mulss ALPHA, %xmm9 + + addss (Y), %xmm8 + addq INCY, Y + addss (Y), %xmm9 + addq INCY, Y + + movss %xmm8, (Y1) + addq INCY, Y1 + movss %xmm9, (Y1) + addq INCY, Y1 + jmp .L999 + ALIGN_4 + +.L50: + cmpq $1, N + jne .L999 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifdef ALIGNED_ACCESS + cmpq $3, M + jle .L57 + + testq $SIZE, A1 + je .L5X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + + addq $1 * SIZE, A1 + addq $1 * SIZE, X1 + ALIGN_3 + +.L5X: + testq $2 * SIZE, A1 + je .L5XX + + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_3 + +.L5XX: +#endif + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + + movq MM, I + sarq $4, I + jle .L55 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A1 (-28 * SIZE, A1, %xmm12) + + decq I + jle .L53 + ALIGN_4 + +.L52: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + MOVUPS_A1 (-20 * SIZE, A1, %xmm12) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(X1) +#endif + + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + MOVUPS_A1 (-12 * SIZE, A1, %xmm12) + + addq $16 * SIZE, A1 + addq $16 * SIZE, X1 + + decq I + jg .L52 + ALIGN_4 + +.L53: + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + MOVUPS_A1 (-20 * SIZE, A1, %xmm12) + + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + + addq $16 * SIZE, A1 + addq $16 * SIZE, X1 + ALIGN_4 + +.L55: + testq $8, MM + jle .L56 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + + MOVUPS_A1 (-28 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm12 + addps %xmm12, %xmm9 + + addq $8 * SIZE, A1 + addq $8 * SIZE, X1 + ALIGN_4 + +.L56: + testq $4, MM + jle .L57 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + + addq $4 * SIZE, A1 + addq $4 * SIZE, X1 + ALIGN_4 + +.L57: + testq $2, MM + jle .L58 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_4 + +.L58: + testq $1, MM + jle .L59 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + ALIGN_4 + +.L59: + addps %xmm9, %xmm8 + +#ifdef HAVE_SSE3 + haddps %xmm8, %xmm8 + haddps %xmm8, %xmm8 +#else + pshufd $1, %xmm8, %xmm9 + pshufd $2, %xmm8, %xmm10 + pshufd $3, %xmm8, %xmm11 + + addss %xmm9, %xmm8 + addss %xmm11, %xmm10 + addss %xmm10, %xmm8 +#endif + + mulss ALPHA, %xmm8 + + addss (Y), %xmm8 + movss %xmm8, (Y1) + +#ifdef ALIGNED_ACCESS + jmp .L999 + ALIGN_4 + +.L100: + testq $2 * SIZE - 1, LDA + jne .L200 + + cmpq $4, N + jl .L110 + ALIGN_3 + +.L101: + subq $4, N + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + cmpq $3, M + jle .L107 + + testq $SIZE, A1 + je .L10X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + movss -32 * SIZE(A2, LDA), %xmm3 + mulss %xmm4, %xmm3 + addss %xmm3, %xmm11 + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, X1 + ALIGN_3 + +.L10X: + testq $2 * SIZE, A1 + je .L10XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd -32 * SIZE(A2, LDA), %xmm3 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 + +.L10XX: + MOVUPS_A2 (-34 * SIZE, A1, LDA, 1, %xmm12) + MOVUPS_A2 (-34 * SIZE, A2, LDA, 1, %xmm13) + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + +#ifdef PREFETCHW + PREFETCHW 4 * SIZE(Y1) +#endif + + movq MM, I + sarq $4, I + jle .L105 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-32 * SIZE, A2, %xmm2) + MOVUPS_A2 (-30 * SIZE, A2, LDA, 1, %xmm3) + + decq I + jle .L103 + ALIGN_4 + +.L102: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-28 * SIZE, A2, %xmm2) + shufps $0x4e, %xmm3, %xmm13 + mulps %xmm4, %xmm13 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm13, %xmm11 + MOVUPS_A2 (-26 * SIZE, A2, LDA, 1, %xmm13) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-22 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-24 * SIZE, A2, %xmm2) + shufps $0x4e, %xmm13, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + MOVUPS_A2 (-22 * SIZE, A2, LDA, 1, %xmm3) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-18 * SIZE, A1, LDA, 1, %xmm12) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-20 * SIZE, A2, %xmm2) + shufps $0x4e, %xmm3, %xmm13 + mulps %xmm4, %xmm13 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm13, %xmm11 + MOVUPS_A2 (-18 * SIZE, A2, LDA, 1, %xmm13) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-14 * SIZE, A1, LDA, 1, %xmm1) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) +#endif + + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-16 * SIZE, A2, %xmm2) + shufps $0x4e, %xmm13, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + MOVUPS_A2 (-14 * SIZE, A2, LDA, 1, %xmm3) + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + + decq I + jg .L102 + ALIGN_4 + +.L103: + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-28 * SIZE, A2, %xmm2) + shufps $0x4e, %xmm3, %xmm13 + mulps %xmm4, %xmm13 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm13, %xmm11 + MOVUPS_A2 (-26 * SIZE, A2, LDA, 1, %xmm13) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-22 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-24 * SIZE, A2, %xmm2) + shufps $0x4e, %xmm13, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + MOVUPS_A2 (-22 * SIZE, A2, LDA, 1, %xmm3) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-18 * SIZE, A1, LDA, 1, %xmm12) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-20 * SIZE, A2, %xmm2) + shufps $0x4e, %xmm3, %xmm13 + mulps %xmm4, %xmm13 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm13, %xmm11 + MOVUPS_A2 (-18 * SIZE, A2, LDA, 1, %xmm13) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + shufps $0x4e, %xmm13, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + ALIGN_4 + +.L105: + testq $8, MM + jle .L106 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-32 * SIZE, A2, %xmm2) + MOVUPS_A2 (-30 * SIZE, A2, LDA, 1, %xmm3) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-28 * SIZE, A2, %xmm2) + shufps $0x4e, %xmm3, %xmm13 + mulps %xmm4, %xmm13 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm13, %xmm11 + MOVUPS_A2 (-26 * SIZE, A2, LDA, 1, %xmm13) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + shufps $0x4e, %xmm13, %xmm3 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm11 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L106: + testq $4, MM + jle .L107 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + + MOVUPS_A1 (-32 * SIZE, A2, %xmm2) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A2 (-30 * SIZE, A2, LDA, 1, %xmm3) + shufps $0x4e, %xmm3, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm11 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L107: + testq $2, MM + jle .L108 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd -32 * SIZE(A2, LDA), %xmm3 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L108: + testq $1, MM + jle .L109 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + movss -32 * SIZE(A2, LDA), %xmm3 + mulss %xmm4, %xmm3 + addss %xmm3, %xmm11 + ALIGN_4 + +.L109: +#ifdef HAVE_SSE3 + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + haddps %xmm10, %xmm8 + + pshufd $0x1, %xmm8, %xmm9 + pshufd $0x2, %xmm8, %xmm10 + pshufd $0x3, %xmm8, %xmm11 +#else + movaps %xmm8, %xmm0 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm0 + + movaps %xmm10, %xmm1 + unpcklps %xmm11, %xmm10 + unpckhps %xmm11, %xmm1 + + movaps %xmm8, %xmm9 + unpcklps %xmm10, %xmm8 + unpckhps %xmm10, %xmm9 + + movaps %xmm0, %xmm10 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm0, %xmm10 + addps %xmm10, %xmm8 + + pshufd $0x2, %xmm8, %xmm9 + pshufd $0x1, %xmm8, %xmm10 + pshufd $0x3, %xmm8, %xmm11 +#endif + + mulss ALPHA, %xmm8 + mulss ALPHA, %xmm9 + mulss ALPHA, %xmm10 + mulss ALPHA, %xmm11 + + addss (Y), %xmm8 + addq INCY, Y + addss (Y), %xmm9 + addq INCY, Y + addss (Y), %xmm10 + addq INCY, Y + addss (Y), %xmm11 + addq INCY, Y + + movss %xmm8, (Y1) + addq INCY, Y1 + movss %xmm9, (Y1) + addq INCY, Y1 + movss %xmm10, (Y1) + addq INCY, Y1 + movss %xmm11, (Y1) + addq INCY, Y1 + + cmpq $4, N + jge .L101 + ALIGN_4 + +.L110: + cmpq $3, N + jne .L120 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + + cmpq $3, M + jle .L117 + + testq $SIZE, A1 + je .L11X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, X1 + ALIGN_3 + +.L11X: + testq $2 * SIZE, A1 + je .L11XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 + +.L11XX: + MOVUPS_A2 (-34 * SIZE, A1, LDA, 1, %xmm12) + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + + movq MM, I + sarq $4, I + jle .L115 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-32 * SIZE, A2, %xmm2) + + decq I + jle .L113 + ALIGN_4 + +.L112: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12) + mulps %xmm4, %xmm2 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm2, %xmm10 + MOVUPS_A1 (-28 * SIZE, A2, %xmm2) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-22 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm5, %xmm2 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm2, %xmm10 + MOVUPS_A1 (-24 * SIZE, A2, %xmm2) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-18 * SIZE, A1, LDA, 1, %xmm12) + mulps %xmm4, %xmm2 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm2, %xmm10 + MOVUPS_A1 (-20 * SIZE, A2, %xmm2) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(X1) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-14 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm5, %xmm2 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm2, %xmm10 + MOVUPS_A1 (-16 * SIZE, A2, %xmm2) + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + + decq I + jg .L112 + ALIGN_4 + +.L113: + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12) + mulps %xmm4, %xmm2 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm2, %xmm10 + MOVUPS_A1 (-28 * SIZE, A2, %xmm2) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-22 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm5, %xmm2 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm2, %xmm10 + MOVUPS_A1 (-24 * SIZE, A2, %xmm2) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-18 * SIZE, A1, LDA, 1, %xmm12) + mulps %xmm4, %xmm2 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm2, %xmm10 + MOVUPS_A1 (-20 * SIZE, A2, %xmm2) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + mulps %xmm5, %xmm2 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm2, %xmm10 + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + ALIGN_4 + +.L115: + testq $8, MM + jle .L116 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-32 * SIZE, A2, %xmm2) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12) + mulps %xmm4, %xmm2 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm2, %xmm10 + MOVUPS_A1 (-28 * SIZE, A2, %xmm2) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L116: + testq $4, MM + jle .L117 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + + MOVUPS_A1 (-32 * SIZE, A2, %xmm2) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L117: + testq $2, MM + jle .L118 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L118: + testq $1, MM + jle .L119 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + ALIGN_4 + +.L119: +#ifdef HAVE_SSE3 + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + haddps %xmm10, %xmm8 + + pshufd $0x1, %xmm8, %xmm9 + pshufd $0x2, %xmm8, %xmm10 +#else + movaps %xmm8, %xmm0 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm0 + + movaps %xmm10, %xmm1 + unpcklps %xmm11, %xmm10 + unpckhps %xmm11, %xmm1 + + movaps %xmm8, %xmm9 + unpcklps %xmm10, %xmm8 + unpckhps %xmm10, %xmm9 + + movaps %xmm0, %xmm10 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm0, %xmm10 + addps %xmm10, %xmm8 + + pshufd $0x2, %xmm8, %xmm9 + pshufd $0x1, %xmm8, %xmm10 +#endif + + mulss ALPHA, %xmm8 + mulss ALPHA, %xmm9 + mulss ALPHA, %xmm10 + + addss (Y), %xmm8 + addq INCY, Y + addss (Y), %xmm9 + addq INCY, Y + addss (Y), %xmm10 + + movss %xmm8, (Y1) + addq INCY, Y1 + movss %xmm9, (Y1) + addq INCY, Y1 + movss %xmm10, (Y1) + jmp .L999 + ALIGN_4 + +.L120: + cmpq $2, N + jne .L130 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + + cmpq $3, M + jle .L127 + + testq $SIZE, A1 + je .L12X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A2), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, X1 + ALIGN_3 + +.L12X: + testq $2 * SIZE, A1 + je .L12XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A2), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 + +.L12XX: + MOVUPS_A1 (-34 * SIZE, A2, %xmm12) + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + + movq MM, I + sarq $4, I + jle .L125 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A1 (-30 * SIZE, A2, %xmm1) + + decq I + jle .L123 + ALIGN_4 + +.L122: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-26 * SIZE, A2, %xmm12) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm1, %xmm9 + MOVUPS_A1 (-22 * SIZE, A2, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-18 * SIZE, A2, %xmm12) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm1, %xmm9 + MOVUPS_A1 (-14 * SIZE, A2, %xmm1) + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + + decq I + jg .L122 + ALIGN_4 + +.L123: + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-26 * SIZE, A2, %xmm12) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm1, %xmm9 + MOVUPS_A1 (-22 * SIZE, A2, %xmm1) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-18 * SIZE, A2, %xmm12) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm1, %xmm9 + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + ALIGN_4 + +.L125: + testq $8, MM + jle .L126 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A1 (-30 * SIZE, A2, %xmm1) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-26 * SIZE, A2, %xmm12) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L126: + testq $4, MM + jle .L127 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-30 * SIZE, A2, %xmm1) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L127: + testq $2, MM + jle .L128 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A2), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L128: + testq $1, MM + jle .L129 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A2), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + ALIGN_4 + +.L129: +#ifdef HAVE_SSE3 + haddps %xmm9, %xmm8 + haddps %xmm8, %xmm8 +#else + movaps %xmm8, %xmm10 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm10 + + addps %xmm10, %xmm8 + movhlps %xmm8, %xmm9 + addps %xmm9, %xmm8 +#endif + + pshufd $0x1, %xmm8, %xmm9 + + mulss ALPHA, %xmm8 + mulss ALPHA, %xmm9 + + addss (Y), %xmm8 + addq INCY, Y + addss (Y), %xmm9 + addq INCY, Y + + movss %xmm8, (Y1) + addq INCY, Y1 + movss %xmm9, (Y1) + addq INCY, Y1 + jmp .L999 + ALIGN_4 + +.L130: + cmpq $1, N + jne .L999 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + + cmpq $3, M + jle .L137 + + testq $SIZE, A1 + je .L13X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + + addq $1 * SIZE, A1 + addq $1 * SIZE, X1 + ALIGN_3 + +.L13X: + testq $2 * SIZE, A1 + je .L13XX + + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_3 + +.L13XX: + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + + movq MM, I + sarq $4, I + jle .L135 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A1 (-28 * SIZE, A1, %xmm12) + + decq I + jle .L133 + ALIGN_4 + +.L132: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + MOVUPS_A1 (-20 * SIZE, A1, %xmm12) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(X1) +#endif + + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + MOVUPS_A1 (-12 * SIZE, A1, %xmm12) + + addq $16 * SIZE, A1 + addq $16 * SIZE, X1 + + decq I + jg .L132 + ALIGN_4 + +.L133: + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + MOVUPS_A1 (-20 * SIZE, A1, %xmm12) + + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + + addq $16 * SIZE, A1 + addq $16 * SIZE, X1 + ALIGN_4 + +.L135: + testq $8, MM + jle .L136 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + + MOVUPS_A1 (-28 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm12 + addps %xmm12, %xmm9 + + addq $8 * SIZE, A1 + addq $8 * SIZE, X1 + ALIGN_4 + +.L136: + testq $4, MM + jle .L137 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + + addq $4 * SIZE, A1 + addq $4 * SIZE, X1 + ALIGN_4 + +.L137: + testq $2, MM + jle .L138 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_4 + +.L138: + testq $1, MM + jle .L139 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + ALIGN_4 + +.L139: + addps %xmm9, %xmm8 + +#ifdef HAVE_SSE3 + haddps %xmm8, %xmm8 + haddps %xmm8, %xmm8 +#else + pshufd $1, %xmm8, %xmm9 + pshufd $2, %xmm8, %xmm10 + pshufd $3, %xmm8, %xmm11 + + addss %xmm9, %xmm8 + addss %xmm11, %xmm10 + addss %xmm10, %xmm8 +#endif + + mulss ALPHA, %xmm8 + + addss (Y), %xmm8 + movss %xmm8, (Y1) + jmp .L999 + ALIGN_4 + +.L200: + testq $2 * SIZE, LDA + jne .L300 + + cmpq $4, N + jl .L210 + ALIGN_3 + +.L201: + subq $4, N + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + cmpq $3, M + jle .L207 + + testq $SIZE, A1 + je .L20X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + movss -32 * SIZE(A2, LDA), %xmm3 + mulss %xmm4, %xmm3 + addss %xmm3, %xmm11 + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, X1 + ALIGN_3 + +.L20X: + testq $2 * SIZE, A1 + je .L20XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd -32 * SIZE(A2, LDA), %xmm3 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 + +.L20XX: + movaps -33 * SIZE(A1, LDA), %xmm12 + movaps -34 * SIZE(A2), %xmm13 + movaps -35 * SIZE(A2, LDA), %xmm14 + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + +#ifdef PREFETCHW + PREFETCHW 4 * SIZE(Y1) +#endif + + movq MM, I + sarq $4, I + jle .L205 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-30 * SIZE, A2, %xmm2) + MOVUPS_A2 (-31 * SIZE, A2, LDA, 1, %xmm3) + + decq I + jle .L203 + ALIGN_4 + +.L202: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-26 * SIZE, A2, %xmm13) + movss %xmm3, %xmm14 + shufps $0x93, %xmm3, %xmm14 + mulps %xmm4, %xmm14 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm14, %xmm11 + MOVUPS_A2 (-27 * SIZE, A2, LDA, 1, %xmm14) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-21 * SIZE, A1, LDA, 1, %xmm1) + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-22 * SIZE, A2, %xmm2) + movss %xmm14, %xmm3 + shufps $0x93, %xmm14, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + MOVUPS_A2 (-23 * SIZE, A2, LDA, 1, %xmm3) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-17 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-18 * SIZE, A2, %xmm13) + movss %xmm3, %xmm14 + shufps $0x93, %xmm3, %xmm14 + mulps %xmm4, %xmm14 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm14, %xmm11 + MOVUPS_A2 (-19 * SIZE, A2, LDA, 1, %xmm14) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-13 * SIZE, A1, LDA, 1, %xmm1) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) +#endif + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-14 * SIZE, A2, %xmm2) + movss %xmm14, %xmm3 + shufps $0x93, %xmm14, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + MOVUPS_A2 (-15 * SIZE, A2, LDA, 1, %xmm3) + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + + decq I + jg .L202 + ALIGN_4 + +.L203: + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-26 * SIZE, A2, %xmm13) + movss %xmm3, %xmm14 + shufps $0x93, %xmm3, %xmm14 + mulps %xmm4, %xmm14 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm14, %xmm11 + MOVUPS_A2 (-27 * SIZE, A2, LDA, 1, %xmm14) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-21 * SIZE, A1, LDA, 1, %xmm1) + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-22 * SIZE, A2, %xmm2) + movss %xmm14, %xmm3 + shufps $0x93, %xmm14, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + MOVUPS_A2 (-23 * SIZE, A2, LDA, 1, %xmm3) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-17 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-18 * SIZE, A2, %xmm13) + movss %xmm3, %xmm14 + shufps $0x93, %xmm3, %xmm14 + mulps %xmm4, %xmm14 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm14, %xmm11 + MOVUPS_A2 (-19 * SIZE, A2, LDA, 1, %xmm14) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + movss %xmm14, %xmm3 + shufps $0x93, %xmm14, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + ALIGN_4 + +.L205: + testq $8, MM + jle .L206 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-30 * SIZE, A2, %xmm2) + MOVUPS_A2 (-31 * SIZE, A2, LDA, 1, %xmm3) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-26 * SIZE, A2, %xmm13) + movss %xmm3, %xmm14 + shufps $0x93, %xmm3, %xmm14 + mulps %xmm4, %xmm14 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm14, %xmm11 + MOVUPS_A2 (-27 * SIZE, A2, LDA, 1, %xmm14) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + movss %xmm14, %xmm3 + shufps $0x93, %xmm14, %xmm3 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm11 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L206: + testq $4, MM + jle .L207 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-30 * SIZE, A2, %xmm2) + MOVUPS_A2 (-31 * SIZE, A2, LDA, 1, %xmm3) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + movss %xmm3, %xmm14 + shufps $0x93, %xmm3, %xmm14 + mulps %xmm4, %xmm14 + addps %xmm14, %xmm11 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L207: + testq $2, MM + jle .L208 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd -32 * SIZE(A2, LDA), %xmm3 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L208: + testq $1, MM + jle .L209 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + movss -32 * SIZE(A2, LDA), %xmm3 + mulss %xmm4, %xmm3 + addss %xmm3, %xmm11 + ALIGN_4 + +.L209: +#ifdef HAVE_SSE3 + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + haddps %xmm10, %xmm8 + + pshufd $0x1, %xmm8, %xmm9 + pshufd $0x2, %xmm8, %xmm10 + pshufd $0x3, %xmm8, %xmm11 +#else + movaps %xmm8, %xmm0 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm0 + + movaps %xmm10, %xmm1 + unpcklps %xmm11, %xmm10 + unpckhps %xmm11, %xmm1 + + movaps %xmm8, %xmm9 + unpcklps %xmm10, %xmm8 + unpckhps %xmm10, %xmm9 + + movaps %xmm0, %xmm10 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm0, %xmm10 + addps %xmm10, %xmm8 + + pshufd $0x2, %xmm8, %xmm9 + pshufd $0x1, %xmm8, %xmm10 + pshufd $0x3, %xmm8, %xmm11 +#endif + + mulss ALPHA, %xmm8 + mulss ALPHA, %xmm9 + mulss ALPHA, %xmm10 + mulss ALPHA, %xmm11 + + addss (Y), %xmm8 + addq INCY, Y + addss (Y), %xmm9 + addq INCY, Y + addss (Y), %xmm10 + addq INCY, Y + addss (Y), %xmm11 + addq INCY, Y + + movss %xmm8, (Y1) + addq INCY, Y1 + movss %xmm9, (Y1) + addq INCY, Y1 + movss %xmm10, (Y1) + addq INCY, Y1 + movss %xmm11, (Y1) + addq INCY, Y1 + + cmpq $4, N + jge .L201 + ALIGN_4 + +.L210: + cmpq $3, N + jne .L220 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + + cmpq $3, M + jle .L217 + + testq $SIZE, A1 + je .L21X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, X1 + ALIGN_3 + +.L21X: + testq $2 * SIZE, A1 + je .L21XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 + +.L21XX: + movaps -33 * SIZE(A1, LDA), %xmm12 + movaps -34 * SIZE(A2), %xmm13 + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + +#ifdef PREFETCHW + PREFETCHW 4 * SIZE(Y1) +#endif + + movq MM, I + sarq $4, I + jle .L215 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-30 * SIZE, A2, %xmm2) + + decq I + jle .L213 + ALIGN_4 + +.L212: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-26 * SIZE, A2, %xmm13) + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-21 * SIZE, A1, LDA, 1, %xmm1) + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-22 * SIZE, A2, %xmm2) + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-17 * SIZE, A1, LDA, 1, %xmm12) + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm13, %xmm10 + MOVUPS_A1 (-18 * SIZE, A2, %xmm13) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(X1) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-13 * SIZE, A1, LDA, 1, %xmm1) + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm2, %xmm10 + MOVUPS_A1 (-14 * SIZE, A2, %xmm2) + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + + decq I + jg .L212 + ALIGN_4 + +.L213: + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-26 * SIZE, A2, %xmm13) + movss %xmm3, %xmm14 + shufps $0x93, %xmm3, %xmm14 + mulps %xmm4, %xmm14 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm14, %xmm11 + MOVUPS_A2 (-27 * SIZE, A2, LDA, 1, %xmm14) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-21 * SIZE, A1, LDA, 1, %xmm1) + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-22 * SIZE, A2, %xmm2) + movss %xmm14, %xmm3 + shufps $0x93, %xmm14, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + MOVUPS_A2 (-23 * SIZE, A2, LDA, 1, %xmm3) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-17 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-18 * SIZE, A2, %xmm13) + movss %xmm3, %xmm14 + shufps $0x93, %xmm3, %xmm14 + mulps %xmm4, %xmm14 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm14, %xmm11 + MOVUPS_A2 (-19 * SIZE, A2, LDA, 1, %xmm14) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + movss %xmm14, %xmm3 + shufps $0x93, %xmm14, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + ALIGN_4 + +.L215: + testq $8, MM + jle .L216 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-30 * SIZE, A2, %xmm2) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm13, %xmm10 + MOVUPS_A1 (-26 * SIZE, A2, %xmm13) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L216: + testq $4, MM + jle .L217 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-30 * SIZE, A2, %xmm2) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L217: + testq $2, MM + jle .L218 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L218: + testq $1, MM + jle .L219 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + ALIGN_4 + +.L219: +#ifdef HAVE_SSE3 + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + haddps %xmm10, %xmm8 + + pshufd $0x1, %xmm8, %xmm9 + pshufd $0x2, %xmm8, %xmm10 +#else + movaps %xmm8, %xmm0 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm0 + + movaps %xmm10, %xmm1 + unpcklps %xmm11, %xmm10 + unpckhps %xmm11, %xmm1 + + movaps %xmm8, %xmm9 + unpcklps %xmm10, %xmm8 + unpckhps %xmm10, %xmm9 + + movaps %xmm0, %xmm10 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm0, %xmm10 + addps %xmm10, %xmm8 + + pshufd $0x2, %xmm8, %xmm9 + pshufd $0x1, %xmm8, %xmm10 +#endif + + mulss ALPHA, %xmm8 + mulss ALPHA, %xmm9 + mulss ALPHA, %xmm10 + + addss (Y), %xmm8 + addq INCY, Y + addss (Y), %xmm9 + addq INCY, Y + addss (Y), %xmm10 + + movss %xmm8, (Y1) + addq INCY, Y1 + movss %xmm9, (Y1) + addq INCY, Y1 + movss %xmm10, (Y1) + jmp .L999 + ALIGN_4 + +.L220: + testq N, N + jle .L999 + + cmpq $2, N + jne .L230 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + + cmpq $3, M + jle .L227 + + testq $SIZE, A1 + je .L22X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A2), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, X1 + ALIGN_3 + +.L22X: + testq $2 * SIZE, A1 + je .L22XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A2), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 + +.L22XX: + movaps -33 * SIZE(A2), %xmm12 + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + + movq MM, I + sarq $4, I + jle .L225 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A1 (-29 * SIZE, A2, %xmm1) + + decq I + jle .L223 + ALIGN_4 + +.L222: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm2) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-25 * SIZE, A2, %xmm12) + + mulps %xmm5, %xmm2 + addps %xmm2, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm1, %xmm9 + MOVUPS_A1 (-21 * SIZE, A2, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm2) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-17 * SIZE, A2, %xmm12) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) +#endif + + mulps %xmm5, %xmm2 + addps %xmm2, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm1, %xmm9 + MOVUPS_A1 (-13 * SIZE, A2, %xmm1) + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + + decq I + jg .L222 + ALIGN_4 + +.L223: + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm2) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-25 * SIZE, A2, %xmm12) + + mulps %xmm5, %xmm2 + addps %xmm2, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm1, %xmm9 + MOVUPS_A1 (-21 * SIZE, A2, %xmm1) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm2) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-17 * SIZE, A2, %xmm12) + + mulps %xmm5, %xmm2 + addps %xmm2, %xmm8 + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm1, %xmm9 + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + ALIGN_4 + +.L225: + testq $8, MM + jle .L226 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A1 (-29 * SIZE, A2, %xmm1) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm2) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-25 * SIZE, A2, %xmm12) + + mulps %xmm5, %xmm2 + addps %xmm2, %xmm8 + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L226: + testq $4, MM + jle .L227 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A1 (-29 * SIZE, A2, %xmm1) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L227: + testq $2, MM + jle .L228 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A2), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L228: + testq $1, MM + jle .L229 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A2), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + ALIGN_4 + +.L229: +#ifdef HAVE_SSE3 + haddps %xmm9, %xmm8 + haddps %xmm8, %xmm8 +#else + movaps %xmm8, %xmm10 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm10 + + addps %xmm10, %xmm8 + movhlps %xmm8, %xmm9 + addps %xmm9, %xmm8 +#endif + + pshufd $0x1, %xmm8, %xmm9 + + mulss ALPHA, %xmm8 + mulss ALPHA, %xmm9 + + addss (Y), %xmm8 + addq INCY, Y + addss (Y), %xmm9 + addq INCY, Y + + movss %xmm8, (Y1) + addq INCY, Y1 + movss %xmm9, (Y1) + addq INCY, Y1 + jmp .L999 + ALIGN_4 + +.L230: + cmpq $1, N + jne .L999 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + + cmpq $3, M + jle .L237 + + testq $SIZE, A1 + je .L23X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + + addq $1 * SIZE, A1 + addq $1 * SIZE, X1 + ALIGN_3 + +.L23X: + testq $2 * SIZE, A1 + je .L23XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_3 + +.L23XX: + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + + + movq MM, I + sarq $4, I + jle .L235 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A1 (-28 * SIZE, A1, %xmm12) + + decq I + jle .L233 + ALIGN_4 + +.L232: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + MOVUPS_A1 (-20 * SIZE, A1, %xmm12) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(X1) +#endif + + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + MOVUPS_A1 (-12 * SIZE, A1, %xmm12) + + addq $16 * SIZE, A1 + addq $16 * SIZE, X1 + + decq I + jg .L232 + ALIGN_4 + +.L233: + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + MOVUPS_A1 (-20 * SIZE, A1, %xmm12) + + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + + addq $16 * SIZE, A1 + addq $16 * SIZE, X1 + ALIGN_4 + +.L235: + testq $8, MM + jle .L236 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + + MOVUPS_A1 (-28 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm12 + addps %xmm12, %xmm9 + + addq $8 * SIZE, A1 + addq $8 * SIZE, X1 + ALIGN_4 + +.L236: + testq $4, MM + jle .L237 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + + addq $4 * SIZE, A1 + addq $4 * SIZE, X1 + ALIGN_4 + +.L237: + testq $2, MM + jle .L238 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_4 + +.L238: + testq $1, MM + jle .L239 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + ALIGN_4 + +.L239: + addps %xmm9, %xmm8 + +#ifdef HAVE_SSE3 + haddps %xmm8, %xmm8 + haddps %xmm8, %xmm8 +#else + pshufd $1, %xmm8, %xmm9 + pshufd $2, %xmm8, %xmm10 + pshufd $3, %xmm8, %xmm11 + + addss %xmm9, %xmm8 + addss %xmm11, %xmm10 + addss %xmm10, %xmm8 +#endif + + mulss ALPHA, %xmm8 + + addss (Y), %xmm8 + movss %xmm8, (Y1) + jmp .L999 + ALIGN_4 + +.L300: + cmpq $4, N + jl .L310 + ALIGN_3 + +.L301: + subq $4, N + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + cmpq $3, M + jle .L307 + + testq $SIZE, A1 + je .L30X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + movss -32 * SIZE(A2, LDA), %xmm3 + mulss %xmm4, %xmm3 + addss %xmm3, %xmm11 + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, X1 + ALIGN_3 + +.L30X: + testq $2 * SIZE, A1 + je .L30XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd -32 * SIZE(A2, LDA), %xmm3 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 + +.L30XX: + movaps -35 * SIZE(A1, LDA), %xmm12 + movaps -34 * SIZE(A2), %xmm13 + movaps -33 * SIZE(A2, LDA), %xmm14 + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + +#ifdef PREFETCHW + PREFETCHW 4 * SIZE(Y1) +#endif + + movq MM, I + sarq $4, I + jle .L305 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-30 * SIZE, A2, %xmm2) + MOVUPS_A2 (-29 * SIZE, A2, LDA, 1, %xmm3) + + decq I + jle .L303 + ALIGN_4 + +.L302: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-26 * SIZE, A2, %xmm13) + movss %xmm3, %xmm14 + shufps $0x39, %xmm14, %xmm14 + mulps %xmm4, %xmm14 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm14, %xmm11 + MOVUPS_A2 (-25 * SIZE, A2, LDA, 1, %xmm14) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-23 * SIZE, A1, LDA, 1, %xmm1) + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-22 * SIZE, A2, %xmm2) + movss %xmm14, %xmm3 + shufps $0x39, %xmm3, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + MOVUPS_A2 (-21 * SIZE, A2, LDA, 1, %xmm3) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-19 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-18 * SIZE, A2, %xmm13) + movss %xmm3, %xmm14 + shufps $0x39, %xmm14, %xmm14 + mulps %xmm4, %xmm14 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm14, %xmm11 + MOVUPS_A2 (-17 * SIZE, A2, LDA, 1, %xmm14) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-15 * SIZE, A1, LDA, 1, %xmm1) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) +#endif + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-14 * SIZE, A2, %xmm2) + movss %xmm14, %xmm3 + shufps $0x39, %xmm3, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + MOVUPS_A2 (-13 * SIZE, A2, LDA, 1, %xmm3) + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + + decq I + jg .L302 + ALIGN_4 + +.L303: + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-26 * SIZE, A2, %xmm13) + movss %xmm3, %xmm14 + shufps $0x39, %xmm14, %xmm14 + mulps %xmm4, %xmm14 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm14, %xmm11 + MOVUPS_A2 (-25 * SIZE, A2, LDA, 1, %xmm14) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-23 * SIZE, A1, LDA, 1, %xmm1) + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-22 * SIZE, A2, %xmm2) + movss %xmm14, %xmm3 + shufps $0x39, %xmm3, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + MOVUPS_A2 (-21 * SIZE, A2, LDA, 1, %xmm3) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-19 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-18 * SIZE, A2, %xmm13) + movss %xmm3, %xmm14 + shufps $0x39, %xmm14, %xmm14 + mulps %xmm4, %xmm14 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm14, %xmm11 + MOVUPS_A2 (-17 * SIZE, A2, LDA, 1, %xmm14) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + movss %xmm14, %xmm3 + shufps $0x39, %xmm3, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + ALIGN_4 + +.L305: + testq $8, MM + jle .L306 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-30 * SIZE, A2, %xmm2) + MOVUPS_A2 (-29 * SIZE, A2, LDA, 1, %xmm3) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-26 * SIZE, A2, %xmm13) + movss %xmm3, %xmm14 + shufps $0x39, %xmm14, %xmm14 + mulps %xmm4, %xmm14 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm14, %xmm11 + MOVUPS_A2 (-25 * SIZE, A2, LDA, 1, %xmm14) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + movss %xmm14, %xmm3 + shufps $0x39, %xmm3, %xmm3 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm11 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L306: + testq $4, MM + jle .L307 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-30 * SIZE, A2, %xmm2) + MOVUPS_A2 (-29 * SIZE, A2, LDA, 1, %xmm3) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + movss %xmm3, %xmm14 + shufps $0x39, %xmm14, %xmm14 + mulps %xmm4, %xmm14 + addps %xmm14, %xmm11 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L307: + testq $2, MM + jle .L308 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd -32 * SIZE(A2, LDA), %xmm3 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L308: + testq $1, MM + jle .L309 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + movss -32 * SIZE(A2, LDA), %xmm3 + mulss %xmm4, %xmm3 + addss %xmm3, %xmm11 + ALIGN_4 + +.L309: +#ifdef HAVE_SSE3 + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + haddps %xmm10, %xmm8 + + pshufd $0x1, %xmm8, %xmm9 + pshufd $0x2, %xmm8, %xmm10 + pshufd $0x3, %xmm8, %xmm11 +#else + movaps %xmm8, %xmm0 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm0 + + movaps %xmm10, %xmm1 + unpcklps %xmm11, %xmm10 + unpckhps %xmm11, %xmm1 + + movaps %xmm8, %xmm9 + unpcklps %xmm10, %xmm8 + unpckhps %xmm10, %xmm9 + + movaps %xmm0, %xmm10 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm0, %xmm10 + addps %xmm10, %xmm8 + + pshufd $0x2, %xmm8, %xmm9 + pshufd $0x1, %xmm8, %xmm10 + pshufd $0x3, %xmm8, %xmm11 +#endif + + mulss ALPHA, %xmm8 + mulss ALPHA, %xmm9 + mulss ALPHA, %xmm10 + mulss ALPHA, %xmm11 + + addss (Y), %xmm8 + addq INCY, Y + addss (Y), %xmm9 + addq INCY, Y + addss (Y), %xmm10 + addq INCY, Y + addss (Y), %xmm11 + addq INCY, Y + + movss %xmm8, (Y1) + addq INCY, Y1 + movss %xmm9, (Y1) + addq INCY, Y1 + movss %xmm10, (Y1) + addq INCY, Y1 + movss %xmm11, (Y1) + addq INCY, Y1 + + cmpq $4, N + jge .L301 + ALIGN_4 + +.L310: + testq N, N + jle .L999 + + cmpq $3, N + jne .L320 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + + cmpq $3, M + jle .L317 + + testq $SIZE, A1 + je .L31X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, X1 + ALIGN_3 + +.L31X: + testq $2 * SIZE, A1 + je .L31XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 + +.L31XX: + movaps -35 * SIZE(A1, LDA), %xmm12 + movaps -34 * SIZE(A2), %xmm13 + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + + movq MM, I + sarq $4, I + jle .L315 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-30 * SIZE, A2, %xmm2) + + decq I + jle .L313 + ALIGN_4 + +.L312: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm13, %xmm10 + MOVUPS_A1 (-26 * SIZE, A2, %xmm13) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-23 * SIZE, A1, LDA, 1, %xmm1) + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm2, %xmm10 + MOVUPS_A1 (-22 * SIZE, A2, %xmm2) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-19 * SIZE, A1, LDA, 1, %xmm12) + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm13, %xmm10 + MOVUPS_A1 (-18 * SIZE, A2, %xmm13) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(X1) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-15 * SIZE, A1, LDA, 1, %xmm1) + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm2, %xmm10 + MOVUPS_A1 (-14 * SIZE, A2, %xmm2) + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + + decq I + jg .L312 + ALIGN_4 + +.L313: + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-26 * SIZE, A2, %xmm13) + movss %xmm3, %xmm14 + shufps $0x39, %xmm14, %xmm14 + mulps %xmm4, %xmm14 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm14, %xmm11 + MOVUPS_A2 (-25 * SIZE, A2, LDA, 1, %xmm14) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-23 * SIZE, A1, LDA, 1, %xmm1) + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-22 * SIZE, A2, %xmm2) + movss %xmm14, %xmm3 + shufps $0x39, %xmm3, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + MOVUPS_A2 (-21 * SIZE, A2, LDA, 1, %xmm3) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-19 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-18 * SIZE, A2, %xmm13) + movss %xmm3, %xmm14 + shufps $0x39, %xmm14, %xmm14 + mulps %xmm4, %xmm14 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm14, %xmm11 + MOVUPS_A2 (-17 * SIZE, A2, LDA, 1, %xmm14) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + movss %xmm14, %xmm3 + shufps $0x39, %xmm3, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + ALIGN_4 + +.L315: + testq $8, MM + jle .L316 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-30 * SIZE, A2, %xmm2) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm13, %xmm10 + MOVUPS_A1 (-26 * SIZE, A2, %xmm13) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L316: + testq $4, MM + jle .L317 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-30 * SIZE, A2, %xmm2) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L317: + testq $2, MM + jle .L318 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L318: + testq $1, MM + jle .L319 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + ALIGN_4 + +.L319: +#ifdef HAVE_SSE3 + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + haddps %xmm10, %xmm8 + + pshufd $0x1, %xmm8, %xmm9 + pshufd $0x2, %xmm8, %xmm10 +#else + movaps %xmm8, %xmm0 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm0 + + movaps %xmm10, %xmm1 + unpcklps %xmm11, %xmm10 + unpckhps %xmm11, %xmm1 + + movaps %xmm8, %xmm9 + unpcklps %xmm10, %xmm8 + unpckhps %xmm10, %xmm9 + + movaps %xmm0, %xmm10 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm0, %xmm10 + addps %xmm10, %xmm8 + + pshufd $0x2, %xmm8, %xmm9 + pshufd $0x1, %xmm8, %xmm10 +#endif + + mulss ALPHA, %xmm8 + mulss ALPHA, %xmm9 + mulss ALPHA, %xmm10 + + addss (Y), %xmm8 + addq INCY, Y + addss (Y), %xmm9 + addq INCY, Y + addss (Y), %xmm10 + + movss %xmm8, (Y1) + addq INCY, Y1 + movss %xmm9, (Y1) + addq INCY, Y1 + movss %xmm10, (Y1) + jmp .L999 + ALIGN_3 + +.L320: + cmpq $2, N + jne .L330 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + + cmpq $3, M + jle .L327 + + testq $SIZE, A1 + je .L32X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A2), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, X1 + ALIGN_3 + +.L32X: + testq $2 * SIZE, A1 + je .L32XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A2), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 + +.L32XX: + movaps -35 * SIZE(A2), %xmm12 + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + + movq MM, I + sarq $4, I + jle .L325 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A1 (-31 * SIZE, A2, %xmm1) + + decq I + jle .L323 + ALIGN_4 + +.L322: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-27 * SIZE, A2, %xmm12) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm1, %xmm9 + MOVUPS_A1 (-23 * SIZE, A2, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-19 * SIZE, A2, %xmm12) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm1, %xmm9 + MOVUPS_A1 (-15 * SIZE, A2, %xmm1) + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + + decq I + jg .L322 + ALIGN_4 + +.L323: + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-27 * SIZE, A2, %xmm12) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm1, %xmm9 + MOVUPS_A1 (-23 * SIZE, A2, %xmm1) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-19 * SIZE, A2, %xmm12) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm1, %xmm9 + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + ALIGN_4 + +.L325: + testq $8, MM + jle .L326 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A1 (-31 * SIZE, A2, %xmm1) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-27 * SIZE, A2, %xmm12) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L326: + testq $4, MM + jle .L327 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A1 (-31 * SIZE, A2, %xmm1) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L327: + testq $2, MM + jle .L328 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A2), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L328: + testq $1, MM + jle .L329 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A2), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + ALIGN_4 + +.L329: +#ifdef HAVE_SSE3 + haddps %xmm9, %xmm8 + haddps %xmm8, %xmm8 +#else + movaps %xmm8, %xmm10 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm10 + + addps %xmm10, %xmm8 + movhlps %xmm8, %xmm9 + addps %xmm9, %xmm8 +#endif + + pshufd $0x1, %xmm8, %xmm9 + + mulss ALPHA, %xmm8 + mulss ALPHA, %xmm9 + + addss (Y), %xmm8 + addq INCY, Y + addss (Y), %xmm9 + addq INCY, Y + + movss %xmm8, (Y1) + addq INCY, Y1 + movss %xmm9, (Y1) + addq INCY, Y1 + jmp .L999 + ALIGN_4 + +.L330: + cmpq $1, N + jne .L999 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + + cmpq $3, M + jle .L337 + + testq $SIZE, A1 + je .L33X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + + addq $1 * SIZE, A1 + addq $1 * SIZE, X1 + ALIGN_3 + +.L33X: + testq $2 * SIZE, A1 + je .L33XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_3 + +.L33XX: + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + + movq MM, I + sarq $4, I + jle .L335 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A1 (-28 * SIZE, A1, %xmm12) + + decq I + jle .L333 + ALIGN_4 + +.L332: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + MOVUPS_A1 (-20 * SIZE, A1, %xmm12) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(X1) +#endif + + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + MOVUPS_A1 (-12 * SIZE, A1, %xmm12) + + addq $16 * SIZE, A1 + addq $16 * SIZE, X1 + + decq I + jg .L332 + ALIGN_4 + +.L333: + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + MOVUPS_A1 (-20 * SIZE, A1, %xmm12) + + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + + addq $16 * SIZE, A1 + addq $16 * SIZE, X1 + ALIGN_4 + +.L335: + testq $8, MM + jle .L336 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + + MOVUPS_A1 (-28 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm12 + addps %xmm12, %xmm9 + + addq $8 * SIZE, A1 + addq $8 * SIZE, X1 + ALIGN_4 + +.L336: + testq $4, MM + jle .L337 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + + addq $4 * SIZE, A1 + addq $4 * SIZE, X1 + ALIGN_4 + +.L337: + testq $2, MM + jle .L338 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_4 + +.L338: + testq $1, MM + jle .L339 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + ALIGN_4 + +.L339: + addps %xmm9, %xmm8 + +#ifdef HAVE_SSE3 + haddps %xmm8, %xmm8 + haddps %xmm8, %xmm8 +#else + pshufd $1, %xmm8, %xmm9 + pshufd $2, %xmm8, %xmm10 + pshufd $3, %xmm8, %xmm11 + + addss %xmm9, %xmm8 + addss %xmm11, %xmm10 + addss %xmm10, %xmm8 +#endif + + mulss ALPHA, %xmm8 + + addss (Y), %xmm8 + movss %xmm8, (Y1) + + jmp .L999 +#endif + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + ALIGN_4 + + EPILOGUE diff --git a/kernel/x86_64/staticbuffer.S b/kernel/x86_64/staticbuffer.S new file mode 100644 index 0000000..7bbd23d --- /dev/null +++ b/kernel/x86_64/staticbuffer.S @@ -0,0 +1,45 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ALLOC_STATIC + .align 8 + .comm alloc_area, (NUM_BUFFERS * BUFFER_SIZE), 16384 +#endif diff --git a/kernel/x86_64/swap.S b/kernel/x86_64/swap.S new file mode 100644 index 0000000..50a7fb5 --- /dev/null +++ b/kernel/x86_64/swap.S @@ -0,0 +1,439 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define N ARG1 +#define X ARG4 +#define INCX ARG5 +#define Y ARG6 +#define INCY ARG2 +#else +#define N ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#define INCY %rbx +#endif + +#define XX %r10 +#define YY %r11 + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifndef WINDOWS_ABI +#ifndef XDOUBLE + movq 8(%rsp), INCY +#else + movq 24(%rsp), INCY +#endif +#else + pushq %rbx + + movq 48(%rsp), X + movq 56(%rsp), INCX + movq 64(%rsp), Y + movq 72(%rsp), INCY +#endif + + EMMS + + salq $BASE_SHIFT, INCX + salq $BASE_SHIFT, INCY + + cmpq $SIZE, INCX + jne .L14 + cmpq $SIZE, INCY + jne .L14 + + movq N, %rax + sarq $3, %rax + jle .L15 + ALIGN_3 + +.L16: +#ifdef XDOUBLE + movq 0(X), %mm0 + movq 8(X), %mm1 + movq 16(X), %mm2 + movq 24(X), %mm3 + movq 0(Y), %mm4 + movq 8(Y), %mm5 + movq 16(Y), %mm6 + movq 24(Y), %mm7 + + movq %mm4, 0(X) + movq %mm5, 8(X) + movq %mm6, 16(X) + movq %mm7, 24(X) + movq %mm0, 0(Y) + movq %mm1, 8(Y) + movq %mm2, 16(Y) + movq %mm3, 24(Y) + + movq 32(X), %mm0 + movq 40(X), %mm1 + movq 48(X), %mm2 + movq 56(X), %mm3 + movq 32(Y), %mm4 + movq 40(Y), %mm5 + movq 48(Y), %mm6 + movq 56(Y), %mm7 + + movq %mm4, 32(X) + movq %mm5, 40(X) + movq %mm6, 48(X) + movq %mm7, 56(X) + movq %mm0, 32(Y) + movq %mm1, 40(Y) + movq %mm2, 48(Y) + movq %mm3, 56(Y) + + movq 64(X), %mm0 + movq 72(X), %mm1 + movq 80(X), %mm2 + movq 88(X), %mm3 + movq 64(Y), %mm4 + movq 72(Y), %mm5 + movq 80(Y), %mm6 + movq 88(Y), %mm7 + + movq %mm4, 64(X) + movq %mm5, 72(X) + movq %mm6, 80(X) + movq %mm7, 88(X) + movq %mm0, 64(Y) + movq %mm1, 72(Y) + movq %mm2, 80(Y) + movq %mm3, 88(Y) + + movq 96(X), %mm0 + movq 104(X), %mm1 + movq 112(X), %mm2 + movq 120(X), %mm3 + movq 96(Y), %mm4 + movq 104(Y), %mm5 + movq 112(Y), %mm6 + movq 120(Y), %mm7 + + movq %mm4, 96(X) + movq %mm5, 104(X) + movq %mm6, 112(X) + movq %mm7, 120(X) + movq %mm0, 96(Y) + movq %mm1, 104(Y) + movq %mm2, 112(Y) + movq %mm3, 120(Y) + +#elif defined(DOUBLE) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movq 0 * SIZE(X), %mm0 + movq 1 * SIZE(X), %mm1 + movq 2 * SIZE(X), %mm2 + movq 3 * SIZE(X), %mm3 + movq 0 * SIZE(Y), %mm4 + movq 1 * SIZE(Y), %mm5 + movq 2 * SIZE(Y), %mm6 + movq 3 * SIZE(Y), %mm7 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movq %mm4, 0 * SIZE(X) + movq %mm5, 1 * SIZE(X) + movq %mm6, 2 * SIZE(X) + movq %mm7, 3 * SIZE(X) + movq %mm0, 0 * SIZE(Y) + movq %mm1, 1 * SIZE(Y) + movq %mm2, 2 * SIZE(Y) + movq %mm3, 3 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movq 4 * SIZE(X), %mm0 + movq 5 * SIZE(X), %mm1 + movq 6 * SIZE(X), %mm2 + movq 7 * SIZE(X), %mm3 + movq 4 * SIZE(Y), %mm4 + movq 5 * SIZE(Y), %mm5 + movq 6 * SIZE(Y), %mm6 + movq 7 * SIZE(Y), %mm7 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movq %mm4, 4 * SIZE(X) + movq %mm5, 5 * SIZE(X) + movq %mm6, 6 * SIZE(X) + movq %mm7, 7 * SIZE(X) + movq %mm0, 4 * SIZE(Y) + movq %mm1, 5 * SIZE(Y) + movq %mm2, 6 * SIZE(Y) + movq %mm3, 7 * SIZE(Y) + +#else +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + movq 0 * SIZE(X), %mm0 + movq 2 * SIZE(X), %mm1 + movq 4 * SIZE(X), %mm2 + movq 6 * SIZE(X), %mm3 + movq 0 * SIZE(Y), %mm4 + movq 2 * SIZE(Y), %mm5 + movq 4 * SIZE(Y), %mm6 + movq 6 * SIZE(Y), %mm7 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movq %mm4, 0 * SIZE(X) + movq %mm5, 2 * SIZE(X) + movq %mm6, 4 * SIZE(X) + movq %mm7, 6 * SIZE(X) + + movq %mm0, 0 * SIZE(Y) + movq %mm1, 2 * SIZE(Y) + movq %mm2, 4 * SIZE(Y) + movq %mm3, 6 * SIZE(Y) +#endif + + addq $8 * SIZE, X + addq $8 * SIZE, Y + decq %rax + jg .L16 + ALIGN_3 + +.L15: + movq N, %rax + andq $7, %rax + jle .L27 + ALIGN_3 + +.L22: + +#ifdef XDOUBLE + movq 0(X), %mm0 + movq 8(X), %mm1 + movq 0(Y), %mm4 + movq 8(Y), %mm5 + + movq %mm4, 0(X) + movq %mm5, 8(X) + movq %mm0, 0(Y) + movq %mm1, 8(Y) +#else + MOVQ 0 * SIZE(X), %mm0 + MOVQ 0 * SIZE(Y), %mm4 + MOVQ %mm4, 0 * SIZE(X) + MOVQ %mm0, 0 * SIZE(Y) +#endif + + addq $SIZE, X + addq $SIZE, Y + decq %rax + jg .L22 + jmp .L27 + ALIGN_3 + +/* INCX != 1 or INCY != 1 */ + +.L14: + movq N, %rax + movq X, XX + movq Y, YY + sarq $2, %rax + jle .L28 + ALIGN_2 + +.L29: +#ifdef XDOUBLE + movq 0(X), %mm0 + movq 8(X), %mm1 + addq INCX, X + movq 0(Y), %mm4 + movq 8(Y), %mm5 + addq INCY, Y + + movq %mm4, 0(XX) + movq %mm5, 8(XX) + addq INCX, XX + movq %mm0, 0(YY) + movq %mm1, 8(YY) + addq INCY, YY + + movq 0(X), %mm0 + movq 8(X), %mm1 + addq INCX, X + movq 0(Y), %mm4 + movq 8(Y), %mm5 + addq INCY, Y + + movq %mm4, 0(XX) + movq %mm5, 8(XX) + addq INCX, XX + movq %mm0, 0(YY) + movq %mm1, 8(YY) + addq INCY, YY + + movq 0(X), %mm0 + movq 8(X), %mm1 + addq INCX, X + movq 0(Y), %mm4 + movq 8(Y), %mm5 + addq INCY, Y + + movq %mm4, 0(XX) + movq %mm5, 8(XX) + addq INCX, XX + movq %mm0, 0(YY) + movq %mm1, 8(YY) + addq INCY, YY + + movq 0(X), %mm0 + movq 8(X), %mm1 + addq INCX, X + movq 0(Y), %mm4 + movq 8(Y), %mm5 + addq INCY, Y + + movq %mm4, 0(XX) + movq %mm5, 8(XX) + addq INCX, XX + movq %mm0, 0(YY) + movq %mm1, 8(YY) + addq INCY, YY +#else + MOVQ (X), %mm0 + addq INCX, X + MOVQ (X), %mm1 + addq INCX, X + MOVQ (X), %mm2 + addq INCX, X + MOVQ (X), %mm3 + addq INCX, X + + MOVQ (Y), %mm4 + addq INCY, Y + MOVQ (Y), %mm5 + addq INCY, Y + MOVQ (Y), %mm6 + addq INCY, Y + MOVQ (Y), %mm7 + addq INCY, Y + + MOVQ %mm4, (XX) + addq INCX, XX + MOVQ %mm5, (XX) + addq INCX, XX + MOVQ %mm6, (XX) + addq INCX, XX + MOVQ %mm7, (XX) + addq INCX, XX + + MOVQ %mm0, (YY) + addq INCY, YY + MOVQ %mm1, (YY) + addq INCY, YY + MOVQ %mm2, (YY) + addq INCY, YY + MOVQ %mm3, (YY) + addq INCY, YY +#endif + + decq %rax + jg .L29 + ALIGN_3 + +.L28: + movq N, %rax + andq $3, %rax + jle .L27 + ALIGN_3 + +.L35: +#ifdef XDOUBLE + movq 0(X), %mm0 + movq 8(X), %mm1 + movq 0(Y), %mm4 + movq 8(Y), %mm5 + + movq %mm4, 0(X) + movq %mm5, 8(X) + movq %mm0, 0(Y) + movq %mm1, 8(Y) +#else + MOVQ (X), %mm0 + MOVQ (Y), %mm4 + + MOVQ %mm4, (X) + MOVQ %mm0, (Y) +#endif + addq INCX, X + addq INCY, Y + + decq %rax + jg .L35 + ALIGN_3 + +.L27: + EMMS + xorq %rax,%rax + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + + EPILOGUE + diff --git a/kernel/x86_64/swap_sse.S b/kernel/x86_64/swap_sse.S new file mode 100644 index 0000000..5702870 --- /dev/null +++ b/kernel/x86_64/swap_sse.S @@ -0,0 +1,1160 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#define Y ARG6 +#define INCY ARG2 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#define INCY %rbx +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifndef WINDOWS_ABI + movq 8(%rsp), INCY +#else + pushq %rbx + + movq 48(%rsp), X + movq 56(%rsp), INCX + movq 64(%rsp), Y + movq 72(%rsp), INCY +#endif + + SAVEREGISTERS + + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY + + cmpq $SIZE, INCX + jne .L50 + cmpq $SIZE, INCY + jne .L50 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + cmpq $3, M + jle .L16 + + testq $SIZE, Y + je .L05 + + movss -32 * SIZE(X), %xmm0 + movss -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + movss %xmm0, -32 * SIZE(Y) + + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq M + ALIGN_3 + +.L05: + testq $2 * SIZE, Y + je .L10 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm1 + + movlps %xmm1, -32 * SIZE(X) + movlps %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + subq $2, M + jle .L19 + ALIGN_3 + +.L10: + cmpq $3, M + jle .L16 + + testq $2 * SIZE, X + jne .L30 + + testq $1 * SIZE, X + jne .L20 + + movq M, %rax + sarq $5, %rax + jle .L13 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + movaps -28 * SIZE(X), %xmm0 + movaps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -28 * SIZE(Y) + movaps %xmm1, -28 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -24 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movaps %xmm0, -24 * SIZE(Y) + movaps %xmm1, -24 * SIZE(X) + + movaps -20 * SIZE(X), %xmm0 + movaps -20 * SIZE(Y), %xmm1 + + movaps %xmm0, -20 * SIZE(Y) + movaps %xmm1, -20 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -8 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movaps %xmm0, -8 * SIZE(Y) + movaps %xmm1, -8 * SIZE(X) + + movaps -4 * SIZE(X), %xmm0 + movaps -4 * SIZE(Y), %xmm1 + + movaps %xmm0, -4 * SIZE(Y) + movaps %xmm1, -4 * SIZE(X) + + subq $-32 * SIZE, Y + subq $-32 * SIZE, X + + decq %rax + jg .L11 + ALIGN_3 + +.L13: + testq $16, M + jle .L14 + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + movaps -28 * SIZE(X), %xmm0 + movaps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -28 * SIZE(Y) + movaps %xmm1, -28 * SIZE(X) + + movaps -24 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movaps %xmm0, -24 * SIZE(Y) + movaps %xmm1, -24 * SIZE(X) + + movaps -20 * SIZE(X), %xmm0 + movaps -20 * SIZE(Y), %xmm1 + + movaps %xmm0, -20 * SIZE(Y) + movaps %xmm1, -20 * SIZE(X) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L14: + testq $8, M + jle .L15 + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + movaps -28 * SIZE(X), %xmm0 + movaps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -28 * SIZE(Y) + movaps %xmm1, -28 * SIZE(X) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L15: + testq $4, M + jle .L16 + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L16: + testq $2, M + jle .L17 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm1 + + movlps %xmm1, -32 * SIZE(X) + addq $2 * SIZE, X + movlps %xmm0, -32 * SIZE(Y) + addq $2 * SIZE, Y + ALIGN_3 + +.L17: + testq $1, M + jle .L19 + + movss -32 * SIZE(X), %xmm0 + movss -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + movss %xmm0, -32 * SIZE(Y) + ALIGN_3 + +.L19: + xorq %rax,%rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + ALIGN_3 + +.L20: + movaps -33 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + pshufd $0x39, %xmm1, %xmm3 + movlps %xmm3, -31 * SIZE(X) + + subq $3, M + + movq M, %rax + sarq $5, %rax + jle .L23 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps -25 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -25 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -21 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -21 * SIZE(X) + + movaps -17 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -17 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -13 * SIZE(X), %xmm2 + movaps -12 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -13 * SIZE(X) + + movaps -9 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -5 * SIZE(X), %xmm2 + movaps -4 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -5 * SIZE(X) + + movaps -1 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -1 * SIZE(X) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L21 + ALIGN_3 + +.L23: + testq $16, M + jle .L24 + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps -25 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -25 * SIZE(X) + + movaps -21 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -21 * SIZE(X) + + movaps -17 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -17 * SIZE(X) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L24: + testq $8, M + jle .L25 + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps -25 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -25 * SIZE(X) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L25: + testq $4, M + jle .L26 + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L26: + pshufd $0x39, %xmm0, %xmm2 + pshufd $0xff, %xmm0, %xmm0 + + movlps %xmm2, -32 * SIZE(Y) + movss %xmm0, -30 * SIZE(Y) + + testq $2, M + jle .L27 + + movsd -29 * SIZE(X), %xmm0 + movsd -29 * SIZE(Y), %xmm1 + + movlps %xmm0, -29 * SIZE(Y) + movlps %xmm1, -29 * SIZE(X) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L27: + testq $1, M + jle .L29 + + movss -29 * SIZE(X), %xmm0 + movss -29 * SIZE(Y), %xmm1 + + movss %xmm0, -29 * SIZE(Y) + movss %xmm1, -29 * SIZE(X) + ALIGN_3 + +.L29: + xorq %rax,%rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + ALIGN_3 + +.L30: + testq $1 * SIZE, X + jne .L40 + + movhps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movlps %xmm1, -32 * SIZE(X) + subq $2, M + + movq M, %rax + sarq $5, %rax + jle .L33 + ALIGN_4 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps -26 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -26 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -22 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -22 * SIZE(X) + + movaps -18 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -18 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -14 * SIZE(X), %xmm2 + movaps -12 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -14 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -10 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -6 * SIZE(X), %xmm2 + movaps -4 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -6 * SIZE(X) + + movaps -2 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -2 * SIZE(X) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L31 + ALIGN_3 + +.L33: + testq $16, M + jle .L34 + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps -26 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -26 * SIZE(X) + + movaps -22 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -22 * SIZE(X) + + movaps -18 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -18 * SIZE(X) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L34: + testq $8, M + jle .L35 + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps -26 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -26 * SIZE(X) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L35: + testq $4, M + jle .L36 + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L36: + movhps %xmm0, -32 * SIZE(Y) + + testq $2, M + jle .L37 + + movsd -30 * SIZE(X), %xmm0 + movsd -30 * SIZE(Y), %xmm1 + + movlps %xmm0, -30 * SIZE(Y) + movlps %xmm1, -30 * SIZE(X) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L37: + testq $1, M + jle .L39 + + movss -30 * SIZE(X), %xmm0 + movss -30 * SIZE(Y), %xmm1 + + movss %xmm0, -30 * SIZE(Y) + movss %xmm1, -30 * SIZE(X) + ALIGN_3 + +.L39: + xorq %rax,%rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + ALIGN_3 + +.L40: + movaps -35 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + + subq $3, M + + movq M, %rax + sarq $5, %rax + jle .L43 + ALIGN_4 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps -27 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -27 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -23 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -23 * SIZE(X) + + movaps -19 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -19 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -15 * SIZE(X), %xmm2 + movaps -12 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -11 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -11 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -7 * SIZE(X), %xmm2 + movaps -4 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -7 * SIZE(X) + + movaps -3 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -3 * SIZE(X) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L41 + ALIGN_3 + +.L43: + testq $16, M + jle .L44 + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps -27 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -27 * SIZE(X) + + movaps -23 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -23 * SIZE(X) + + movaps -19 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -19 * SIZE(X) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L44: + testq $8, M + jle .L45 + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps -27 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -27 * SIZE(X) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L45: + testq $4, M + jle .L46 + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L46: + movsd -31 * SIZE(X), %xmm2 + + pshufd $0x39, %xmm1, %xmm1 + movlps %xmm1, -31 * SIZE(X) + + pshufd $0xff, %xmm0, %xmm0 + + movss %xmm0, -32 * SIZE(Y) + movlps %xmm2, -31 * SIZE(Y) + + addq $3 * SIZE, X + addq $3 * SIZE, Y + + testq $2, M + jle .L47 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm1 + + movlps %xmm0, -32 * SIZE(Y) + movlps %xmm1, -32 * SIZE(X) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L47: + testq $1, M + jle .L49 + + movss -32 * SIZE(X), %xmm0 + movss -32 * SIZE(Y), %xmm1 + + movss %xmm0, -32 * SIZE(Y) + movss %xmm1, -32 * SIZE(X) + ALIGN_3 + +.L49: + xorq %rax,%rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + ALIGN_3 + +.L50: + movq M, %rax + sarq $3, %rax + jle .L55 + ALIGN_3 + +.L51: + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addq INCX, X + movss %xmm0, (Y) + addq INCY, Y + + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addq INCX, X + movss %xmm0, (Y) + addq INCY, Y + + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addq INCX, X + movss %xmm0, (Y) + addq INCY, Y + + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addq INCX, X + movss %xmm0, (Y) + addq INCY, Y + + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addq INCX, X + movss %xmm0, (Y) + addq INCY, Y + + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addq INCX, X + movss %xmm0, (Y) + addq INCY, Y + + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addq INCX, X + movss %xmm0, (Y) + addq INCY, Y + + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addq INCX, X + movss %xmm0, (Y) + addq INCY, Y + + decq %rax + jg .L51 + ALIGN_3 + +.L55: + movq M, %rax + andq $7, %rax + jle .L57 + ALIGN_3 + +.L56: + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + movss %xmm0, (Y) + + addq INCX, X + addq INCY, Y + decq %rax + jg .L56 + ALIGN_3 + +.L57: + xorq %rax, %rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + + EPILOGUE diff --git a/kernel/x86_64/swap_sse2.S b/kernel/x86_64/swap_sse2.S new file mode 100644 index 0000000..5f16419 --- /dev/null +++ b/kernel/x86_64/swap_sse2.S @@ -0,0 +1,585 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#define Y ARG6 +#define INCY ARG2 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#define INCY %rbx +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifndef WINDOWS_ABI + movq 8(%rsp), INCY +#else + pushq %rbx + + movq 48(%rsp), X + movq 56(%rsp), INCX + movq 64(%rsp), Y + movq 72(%rsp), INCY +#endif + + SAVEREGISTERS + + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY + + cmpq $SIZE, INCX + jne .L40 + cmpq $SIZE, INCY + jne .L40 + + testq $SIZE, Y + je .L10 + + movsd 0 * SIZE(X), %xmm0 + movsd 0 * SIZE(Y), %xmm8 + + movsd %xmm8, 0 * SIZE(X) + movsd %xmm0, 0 * SIZE(Y) + + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq M + jle .L19 + ALIGN_4 + +.L10: + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + testq $SIZE, X + jne .L20 + + movq M, %rax + sarq $4, %rax + jle .L13 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -10 * SIZE(Y), %xmm1 + + movaps %xmm0, -10 * SIZE(Y) + movaps %xmm1, -10 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -8 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movaps %xmm0, -8 * SIZE(Y) + movaps %xmm1, -8 * SIZE(X) + + movaps -6 * SIZE(X), %xmm0 + movaps -6 * SIZE(Y), %xmm1 + + movaps %xmm0, -6 * SIZE(Y) + movaps %xmm1, -6 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -4 * SIZE(X), %xmm0 + movaps -4 * SIZE(Y), %xmm1 + + movaps %xmm0, -4 * SIZE(Y) + movaps %xmm1, -4 * SIZE(X) + + movaps -2 * SIZE(X), %xmm0 + movaps -2 * SIZE(Y), %xmm1 + + movaps %xmm0, -2 * SIZE(Y) + movaps %xmm1, -2 * SIZE(X) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + + decq %rax + jg .L11 + ALIGN_3 + +.L13: + testq $8, M + jle .L14 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -10 * SIZE(Y), %xmm1 + + movaps %xmm0, -10 * SIZE(Y) + movaps %xmm1, -10 * SIZE(X) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L14: + testq $4, M + jle .L15 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L15: + testq $2, M + jle .L16 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L16: + testq $1, M + jle .L19 + + movsd -16 * SIZE(X), %xmm0 + movsd -16 * SIZE(Y), %xmm1 + + movlps %xmm1, -16 * SIZE(X) + movlps %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L19: + xorq %rax,%rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + ALIGN_3 + +.L20: + movhps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movlps %xmm1, -16 * SIZE(X) + decq M + jle .L29 + + movq M, %rax + sarq $4, %rax + jle .L23 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -13 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -11 * SIZE(X), %xmm2 + movaps -10 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -12 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -11 * SIZE(X) + + movaps -9 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -10 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -7 * SIZE(X), %xmm2 + movaps -6 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -7 * SIZE(X) + + movaps -5 * SIZE(X), %xmm0 + movaps -4 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -6 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -5 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -3 * SIZE(X), %xmm2 + movaps -2 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -4 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -3 * SIZE(X) + + movaps -1 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -2 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -1 * SIZE(X) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + decq %rax + jg .L21 + ALIGN_3 + +.L23: + testq $8, M + jle .L24 + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -13 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(X) + + movaps -11 * SIZE(X), %xmm2 + movaps -10 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -12 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -11 * SIZE(X) + + movaps -9 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -10 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(X) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L24: + testq $4, M + jle .L25 + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -13 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(X) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L25: + testq $2, M + jle .L26 + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L26: + testq $1, M + jle .L29 + + movhps %xmm0, -16 * SIZE(Y) + movhps -15 * SIZE(X), %xmm0 + movhps %xmm1, -15 * SIZE(X) + + addq $SIZE, X + addq $SIZE, Y + ALIGN_3 + +.L29: + movhps %xmm0, -16 * SIZE(Y) + + xorq %rax,%rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + ALIGN_3 + +.L40: + movq M, %rax + sarq $3, %rax + jle .L45 + ALIGN_3 + +.L41: + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addq INCX, X + movsd %xmm0, (Y) + addq INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addq INCX, X + movsd %xmm0, (Y) + addq INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addq INCX, X + movsd %xmm0, (Y) + addq INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addq INCX, X + movsd %xmm0, (Y) + addq INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addq INCX, X + movsd %xmm0, (Y) + addq INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addq INCX, X + movsd %xmm0, (Y) + addq INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addq INCX, X + movsd %xmm0, (Y) + addq INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addq INCX, X + movsd %xmm0, (Y) + addq INCY, Y + + decq %rax + jg .L41 + ALIGN_3 + +.L45: + movq M, %rax + andq $7, %rax + jle .L47 + ALIGN_3 + +.L46: + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + movsd %xmm0, (Y) + + addq INCX, X + addq INCY, Y + decq %rax + jg .L46 + ALIGN_3 + +.L47: + xorq %rax, %rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + + EPILOGUE diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S new file mode 100644 index 0000000..901a5ad --- /dev/null +++ b/kernel/x86_64/symv_L_sse.S @@ -0,0 +1,1029 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ATOM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#ifdef CORE2 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 20) +#endif + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 8) +#define movsd movlps +#endif + +#if defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 16) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 20) +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 80 + +#define OLD_Y 8 + STACKSIZE(%rsp) +#define OLD_INCY 16 + STACKSIZE(%rsp) +#define OLD_BUFFER 24 + STACKSIZE(%rsp) + +#define M ARG1 +#define N ARG2 +#define A ARG3 +#define LDA ARG4 +#define X ARG5 +#define INCX ARG6 + +#else + +#define STACKSIZE 256 + +#define OLD_LDA 40 + STACKSIZE(%rsp) +#define OLD_X 48 + STACKSIZE(%rsp) +#define OLD_INCX 56 + STACKSIZE(%rsp) +#define OLD_Y 64 + STACKSIZE(%rsp) +#define OLD_INCY 72 + STACKSIZE(%rsp) +#define OLD_BUFFER 80 + STACKSIZE(%rsp) + +#define M ARG1 +#define N ARG2 +#define A ARG4 +#define LDA ARG3 +#define X %rdi +#define INCX %rsi +#endif + +#define Y %r10 +#define INCY %r11 +#define BUFFER %r12 + +#define TEMP %rax +#define I %rax +#define A1 %rbx +#define A2 %rbp +#define XX %r13 +#define YY %r14 +#define IS %r15 +#define NEW_X BUFFER +#define NEW_Y X + +#define ALPHA %xmm0 + +#define atemp1 %xmm0 +#define atemp2 %xmm1 +#define atemp3 %xmm2 +#define atemp4 %xmm3 + +#define xsum1 %xmm4 +#define xsum2 %xmm5 +#define xsum3 %xmm6 +#define xsum4 %xmm7 + +#define xtemp1 %xmm8 +#define xtemp2 %xmm9 +#define yy1 %xmm10 +#define xt1 %xmm11 + +#define a1 %xmm12 +#define a2 %xmm13 +#define a3 %xmm14 +#define a4 %xmm15 + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_LDA, LDA + movq OLD_X, X + movq OLD_INCX, INCX + + movaps %xmm2, %xmm0 +#endif + + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + leaq (,INCX, SIZE), INCX + leaq (,INCY, SIZE), INCY + leaq (,LDA, SIZE), LDA + + testq M, M + jle .L999 + + shufps $0, ALPHA, ALPHA + + movq BUFFER, XX + + movq M, %rax + sarq $3, %rax + jle .L02 + ALIGN_3 + +.L01: + movss 0 * SIZE(X), %xmm1 + addq INCX, X + movss 0 * SIZE(X), %xmm2 + addq INCX, X + movss 0 * SIZE(X), %xmm3 + addq INCX, X + movss 0 * SIZE(X), %xmm4 + addq INCX, X + movss 0 * SIZE(X), %xmm5 + addq INCX, X + movss 0 * SIZE(X), %xmm6 + addq INCX, X + movss 0 * SIZE(X), %xmm7 + addq INCX, X + movss 0 * SIZE(X), %xmm8 + addq INCX, X + + mulss ALPHA, %xmm1 + mulss ALPHA, %xmm2 + mulss ALPHA, %xmm3 + mulss ALPHA, %xmm4 + mulss ALPHA, %xmm5 + mulss ALPHA, %xmm6 + mulss ALPHA, %xmm7 + mulss ALPHA, %xmm8 + + movss %xmm1, 0 * SIZE(XX) + movss %xmm2, 1 * SIZE(XX) + movss %xmm3, 2 * SIZE(XX) + movss %xmm4, 3 * SIZE(XX) + movss %xmm5, 4 * SIZE(XX) + movss %xmm6, 5 * SIZE(XX) + movss %xmm7, 6 * SIZE(XX) + movss %xmm8, 7 * SIZE(XX) + + addq $8 * SIZE, XX + decq %rax + jg .L01 + ALIGN_3 + +.L02: + movq M, %rax + andq $7, %rax + jle .L05 + ALIGN_3 + +.L03: + movss 0 * SIZE(X), %xmm1 + addq INCX, X + + mulss ALPHA, %xmm1 + + movss %xmm1, 0 * SIZE(XX) + + addq $1 * SIZE, XX + decq %rax + jg .L03 + ALIGN_3 + +.L05: + /* now we don't need original X */ + movq Y, NEW_Y + + addq $512, XX + andq $-512, XX + + cmpq $SIZE, INCY + je .L10 + + movq Y, YY + movq XX, NEW_Y + + movq M, %rax + sarq $3, %rax + jle .L07 + ALIGN_3 + +.L06: + movss 0 * SIZE(YY), %xmm0 + addq INCY, YY + movss 0 * SIZE(YY), %xmm1 + addq INCY, YY + movss 0 * SIZE(YY), %xmm2 + addq INCY, YY + movss 0 * SIZE(YY), %xmm3 + addq INCY, YY + movss 0 * SIZE(YY), %xmm4 + addq INCY, YY + movss 0 * SIZE(YY), %xmm5 + addq INCY, YY + movss 0 * SIZE(YY), %xmm6 + addq INCY, YY + movss 0 * SIZE(YY), %xmm7 + addq INCY, YY + + movss %xmm0, 0 * SIZE(XX) + movss %xmm1, 1 * SIZE(XX) + movss %xmm2, 2 * SIZE(XX) + movss %xmm3, 3 * SIZE(XX) + movss %xmm4, 4 * SIZE(XX) + movss %xmm5, 5 * SIZE(XX) + movss %xmm6, 6 * SIZE(XX) + movss %xmm7, 7 * SIZE(XX) + + addq $8 * SIZE, XX + decq %rax + jg .L06 + ALIGN_3 + +.L07: + movq M, %rax + andq $7, %rax + jle .L10 + ALIGN_3 + +.L08: + movss 0 * SIZE(YY), %xmm0 + addq INCY, YY + + movss %xmm0, 0 * SIZE(XX) + + addq $1 * SIZE, XX + decq %rax + jg .L08 + ALIGN_3 + +.L10: + xorq IS, IS # is = 0 + + cmpq $4, N + jl .L20 + ALIGN_3 + +.L11: + movq A, A1 + leaq (A, LDA, 2), A2 + leaq 4 * SIZE(A, LDA, 4), A + + leaq (NEW_X, IS, SIZE), XX + leaq 4 * SIZE(NEW_Y, IS, SIZE), YY + + movaps 0 * SIZE(XX), atemp4 + + movsd 0 * SIZE(A1), xsum1 + movhps 2 * SIZE(A1), xsum1 + mulps atemp4, xsum1 + + movss 1 * SIZE(A1), xsum2 + movss 1 * SIZE(A1, LDA, 1), a2 + movss 2 * SIZE(A1, LDA, 1), a3 + movss 3 * SIZE(A1, LDA, 1), a4 + unpcklps a3, xsum2 + unpcklps a4, a2 + unpcklps a2, xsum2 + mulps atemp4, xsum2 + + movss 2 * SIZE(A1), xsum3 + movss 2 * SIZE(A1, LDA, 1), a2 + movss 2 * SIZE(A2), a3 + movss 3 * SIZE(A2), a4 + unpcklps a3, xsum3 + unpcklps a4, a2 + unpcklps a2, xsum3 + mulps atemp4, xsum3 + + movss 3 * SIZE(A1), xsum4 + movss 3 * SIZE(A1, LDA, 1), a2 + movss 3 * SIZE(A2), a3 + movss 3 * SIZE(A2, LDA, 1), a4 + unpcklps a3, xsum4 + unpcklps a4, a2 + unpcklps a2, xsum4 + mulps atemp4, xsum4 + + pshufd $0x00, atemp4, atemp1 + pshufd $0x55, atemp4, atemp2 + pshufd $0xaa, atemp4, atemp3 + pshufd $0xff, atemp4, atemp4 + + movaps 4 * SIZE(XX), xtemp1 + movaps 8 * SIZE(XX), xtemp2 + + movsd 0 * SIZE(YY), yy1 + movhps 2 * SIZE(YY), yy1 + + movsd 4 * SIZE(A1), a1 + movhps 6 * SIZE(A1), a1 + movsd 4 * SIZE(A1, LDA, 1), a2 + movhps 6 * SIZE(A1, LDA, 1), a2 + movsd 4 * SIZE(A2), a3 + movhps 6 * SIZE(A2), a3 + movsd 4 * SIZE(A2, LDA, 1), a4 + movhps 6 * SIZE(A2, LDA, 1), a4 + + addq $4 * SIZE, XX + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + + movq M, I + subq IS, I + subq $4, I + sarq $4, I + jle .L14 + ALIGN_3 + +.L12: + movaps xtemp1, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movsd 4 * SIZE(A1), a1 + movhps 6 * SIZE(A1), a1 + + PREFETCH PREFETCHSIZE(A1) + + movaps xtemp1, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movsd 4 * SIZE(A1, LDA, 1), a2 + movhps 6 * SIZE(A1, LDA, 1), a2 + + movaps xtemp1, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + movsd 4 * SIZE(A2), a3 + movhps 6 * SIZE(A2), a3 + +#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON) + PREFETCH PREFETCHSIZE(XX) +#endif + + movaps xtemp1, xt1 + movaps 8 * SIZE(XX), xtemp1 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + movsd 4 * SIZE(A2, LDA, 1), a4 + movhps 6 * SIZE(A2, LDA, 1), a4 + + movlps yy1, 0 * SIZE(YY) + movhps yy1, 2 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhps 6 * SIZE(YY), yy1 + + movaps xtemp2, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movsd 8 * SIZE(A1), a1 + movhps 10 * SIZE(A1), a1 + + PREFETCH PREFETCHSIZE(A1, LDA, 1) + + movaps xtemp2, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movsd 8 * SIZE(A1, LDA, 1), a2 + movhps 10 * SIZE(A1, LDA, 1), a2 + + movaps xtemp2, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + movsd 8 * SIZE(A2), a3 + movhps 10 * SIZE(A2), a3 + + movaps xtemp2, xt1 + movaps 12 * SIZE(XX), xtemp2 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + movsd 8 * SIZE(A2, LDA, 1), a4 + movhps 10 * SIZE(A2, LDA, 1), a4 + + movlps yy1, 4 * SIZE(YY) + movhps yy1, 6 * SIZE(YY) + movsd 8 * SIZE(YY), yy1 + movhps 10 * SIZE(YY), yy1 + + + movaps xtemp1, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movsd 12 * SIZE(A1), a1 + movhps 14 * SIZE(A1), a1 + + PREFETCH PREFETCHSIZE(A2) + + movaps xtemp1, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movsd 12 * SIZE(A1, LDA, 1), a2 + movhps 14 * SIZE(A1, LDA, 1), a2 + + movaps xtemp1, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + movsd 12 * SIZE(A2), a3 + movhps 14 * SIZE(A2), a3 + +#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON) + PREFETCHW PREFETCHSIZE(YY) +#endif + + movaps xtemp1, xt1 + movaps 16 * SIZE(XX), xtemp1 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + movsd 12 * SIZE(A2, LDA, 1), a4 + movhps 14 * SIZE(A2, LDA, 1), a4 + + movlps yy1, 8 * SIZE(YY) + movhps yy1, 10 * SIZE(YY) + movsd 12 * SIZE(YY), yy1 + movhps 14 * SIZE(YY), yy1 + + movaps xtemp2, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movsd 16 * SIZE(A1), a1 + movhps 18 * SIZE(A1), a1 + + PREFETCH PREFETCHSIZE(A2, LDA, 1) + + movaps xtemp2, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movsd 16 * SIZE(A1, LDA, 1), a2 + movhps 18 * SIZE(A1, LDA, 1), a2 + + movaps xtemp2, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + movsd 16 * SIZE(A2), a3 + movhps 18 * SIZE(A2), a3 + + movaps xtemp2, xt1 + movaps 20 * SIZE(XX), xtemp2 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + movsd 16 * SIZE(A2, LDA, 1), a4 + movhps 18 * SIZE(A2, LDA, 1), a4 + + movlps yy1, 12 * SIZE(YY) + movhps yy1, 14 * SIZE(YY) + movsd 16 * SIZE(YY), yy1 + movhps 18 * SIZE(YY), yy1 + + addq $16 * SIZE, XX + addq $16 * SIZE, YY + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + + decq I + jg .L12 + ALIGN_3 + +.L14: + movq M, I + subq IS, I + subq $4, I + test $8, I + jle .L15 + + movaps xtemp1, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movsd 4 * SIZE(A1), a1 + movhps 6 * SIZE(A1), a1 + + movaps xtemp1, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movsd 4 * SIZE(A1, LDA, 1), a2 + movhps 6 * SIZE(A1, LDA, 1), a2 + + movaps xtemp1, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + movsd 4 * SIZE(A2), a3 + movhps 6 * SIZE(A2), a3 + + movaps xtemp1, xt1 + movaps 8 * SIZE(XX), xtemp1 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + movsd 4 * SIZE(A2, LDA, 1), a4 + movhps 6 * SIZE(A2, LDA, 1), a4 + + movlps yy1, 0 * SIZE(YY) + movhps yy1, 2 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhps 6 * SIZE(YY), yy1 + + movaps xtemp2, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movsd 8 * SIZE(A1), a1 + movhps 10 * SIZE(A1), a1 + + movaps xtemp2, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movsd 8 * SIZE(A1, LDA, 1), a2 + movhps 10 * SIZE(A1, LDA, 1), a2 + + movaps xtemp2, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + movsd 8 * SIZE(A2), a3 + movhps 10 * SIZE(A2), a3 + + movaps xtemp2, xt1 + movaps 12 * SIZE(XX), xtemp2 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + movsd 8 * SIZE(A2, LDA, 1), a4 + movhps 10 * SIZE(A2, LDA, 1), a4 + + movlps yy1, 4 * SIZE(YY) + movhps yy1, 6 * SIZE(YY) + movsd 8 * SIZE(YY), yy1 + movhps 10 * SIZE(YY), yy1 + + addq $8 * SIZE, XX + addq $8 * SIZE, YY + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + ALIGN_3 + +.L15: + test $4, I + jle .L17 + + movaps xtemp1, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movsd 4 * SIZE(A1), a1 + + movaps xtemp1, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movsd 4 * SIZE(A1, LDA, 1), a2 + + movaps xtemp1, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + movsd 4 * SIZE(A2), a3 + + movaps xtemp1, xt1 + movsd 4 * SIZE(XX), xtemp1 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + movsd 4 * SIZE(A2, LDA, 1), a4 + + movlps yy1, 0 * SIZE(YY) + movhps yy1, 2 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + + addq $4 * SIZE, XX + addq $4 * SIZE, YY + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L17: + testq $2, M + jle .L18 + + pxor xtemp2, xtemp2 + + movlhps xtemp2, a1 + movaps xtemp1, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movss 2 * SIZE(A1), a1 + + movlhps xtemp2, a2 + movaps xtemp1, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movss 2 * SIZE(A1, LDA, 1), a2 + + movlhps xtemp2, a3 + movaps xtemp1, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + movss 2 * SIZE(A2), a3 + + movlhps xtemp2, a4 + movaps xtemp1, xt1 + movss 2 * SIZE(XX), xtemp1 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + movss 2 * SIZE(A2, LDA, 1), a4 + + movlps yy1, 0 * SIZE(YY) + movss 2 * SIZE(YY), yy1 + + addq $2 * SIZE, XX + addq $2 * SIZE, YY + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + ALIGN_3 + +.L18: + testq $1, M + jle .L19 + + movss 0 * SIZE(XX), xtemp1 + + movss 0 * SIZE(YY), yy1 + + movss 0 * SIZE(A1), a1 + movss 0 * SIZE(A1, LDA, 1), a2 + movss 0 * SIZE(A2), a3 + movss 0 * SIZE(A2, LDA, 1), a4 + + movaps xtemp1, xt1 + mulss a1, xt1 + mulss atemp1, a1 + addss xt1, xsum1 + addss a1, yy1 + + movaps xtemp1, xt1 + mulss a2, xt1 + mulss atemp2, a2 + addss xt1, xsum2 + addss a2, yy1 + + movaps xtemp1, xt1 + mulss a3, xt1 + mulss atemp3, a3 + addss xt1, xsum3 + addss a3, yy1 + + movaps xtemp1, xt1 + mulss a4, xt1 + mulss atemp4, a4 + addss xt1, xsum4 + addss a4, yy1 + + movss yy1, 0 * SIZE(YY) + ALIGN_3 + +.L19: +#ifndef HAVE_SSE3 + movaps xsum1, xtemp1 + unpcklps xsum3, xsum1 + unpckhps xsum3, xtemp1 + + movaps xsum2, xtemp2 + unpcklps xsum4, xsum2 + unpckhps xsum4, xtemp2 + + movaps xsum1, xsum3 + unpcklps xsum2, xsum1 + unpckhps xsum2, xsum3 + + movaps xtemp1, xsum4 + unpcklps xtemp2, xtemp1 + unpckhps xtemp2, xsum4 + + addps xsum3, xsum1 + addps xtemp1, xsum4 + addps xsum4, xsum1 +#else + haddps xsum2, xsum1 + haddps xsum4, xsum3 + + haddps xsum3, xsum1 +#endif + + movsd 0 * SIZE(NEW_Y, IS, SIZE), yy1 + movhps 2 * SIZE(NEW_Y, IS, SIZE), yy1 + + addps xsum1, yy1 + + movsd yy1, 0 * SIZE(NEW_Y, IS, SIZE) + movhps yy1, 2 * SIZE(NEW_Y, IS, SIZE) + + addq $4, IS + + movq IS, I + addq $4, I + cmpq N, I + jle .L11 + ALIGN_3 + +.L20: + testq $2, N + jle .L30 + + movq A, A1 + leaq 2 * SIZE(A, LDA, 2), A + + movaps 0 * SIZE(NEW_X, IS, SIZE), atemp4 + +#if defined(OPTERON) + pxor xsum1, xsum1 +#endif + movsd 0 * SIZE(A1), xsum1 + mulps atemp4, xsum1 + + movss 1 * SIZE(A1), xsum2 + movss 1 * SIZE(A1, LDA, 1), a2 + unpcklps a2, xsum2 + mulps atemp4, xsum2 + + pshufd $0x00, atemp4, atemp1 + pshufd $0x55, atemp4, atemp2 + + testq $1, M + jle .L29 + + movss 2 * SIZE(A1), a1 + movss 2 * SIZE(A1, LDA, 1), a2 + movss 2 * SIZE(NEW_X, IS, SIZE), xtemp1 + movss 2 * SIZE(NEW_Y, IS, SIZE), yy1 + + movaps xtemp1, xt1 + mulss a1, xt1 + mulss atemp1, a1 + addss xt1, xsum1 + addps a1, yy1 + + movaps xtemp1, xt1 + mulss a2, xt1 + mulss atemp2, a2 + addss xt1, xsum2 + addss a2, yy1 + + movss yy1, 2 * SIZE(NEW_Y, IS, SIZE) + ALIGN_3 + +.L29: + +#ifndef HAVE_SSE3 + unpcklps xsum2, xsum1 + movhlps xsum1, xsum2 + addps xsum2, xsum1 +#else + haddps xsum2, xsum1 + haddps xsum1, xsum1 +#endif + + movsd 0 * SIZE(NEW_Y, IS, SIZE), yy1 + + addps xsum1, yy1 + + movlps yy1, 0 * SIZE(NEW_Y, IS, SIZE) + + addq $2, IS + ALIGN_3 + +.L30: + testq $1, N + jle .L990 + + movss 0 * SIZE(NEW_X, IS, SIZE), xsum1 + mulss 0 * SIZE(A), xsum1 + addss 0 * SIZE(NEW_Y, IS, SIZE), xsum1 + movss xsum1, 0 * SIZE(NEW_Y, IS, SIZE) + ALIGN_3 + +.L990: + cmpq $SIZE, INCY + je .L999 + + movq M, %rax + sarq $3, %rax + jle .L997 + ALIGN_3 + +.L996: + movss 0 * SIZE(NEW_Y), %xmm0 + movss 1 * SIZE(NEW_Y), %xmm1 + movss 2 * SIZE(NEW_Y), %xmm2 + movss 3 * SIZE(NEW_Y), %xmm3 + movss 4 * SIZE(NEW_Y), %xmm4 + movss 5 * SIZE(NEW_Y), %xmm5 + movss 6 * SIZE(NEW_Y), %xmm6 + movss 7 * SIZE(NEW_Y), %xmm7 + + movss %xmm0, 0 * SIZE(Y) + addq INCY, Y + movss %xmm1, 0 * SIZE(Y) + addq INCY, Y + movss %xmm2, 0 * SIZE(Y) + addq INCY, Y + movss %xmm3, 0 * SIZE(Y) + addq INCY, Y + movss %xmm4, 0 * SIZE(Y) + addq INCY, Y + movss %xmm5, 0 * SIZE(Y) + addq INCY, Y + movss %xmm6, 0 * SIZE(Y) + addq INCY, Y + movss %xmm7, 0 * SIZE(Y) + addq INCY, Y + + addq $8 * SIZE, NEW_Y + decq %rax + jg .L996 + ALIGN_3 + +.L997: + movq M, %rax + andq $7, %rax + jle .L999 + ALIGN_3 + +.L998: + movss 0 * SIZE(NEW_Y), %xmm0 + + movss %xmm0, 0 * SIZE(Y) + addq INCY, Y + + addq $1 * SIZE, NEW_Y + + decq %rax + jg .L998 + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + EPILOGUE diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S new file mode 100644 index 0000000..bfe7ebd --- /dev/null +++ b/kernel/x86_64/symv_L_sse2.S @@ -0,0 +1,978 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ATOM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#ifdef CORE2 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 20) +#endif + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 8) +#define movsd movlpd +#endif + +#if defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 16) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 24) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 20) +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 80 + +#define OLD_Y 8 + STACKSIZE(%rsp) +#define OLD_INCY 16 + STACKSIZE(%rsp) +#define OLD_BUFFER 24 + STACKSIZE(%rsp) + +#define M ARG1 +#define N ARG2 +#define A ARG3 +#define LDA ARG4 +#define X ARG5 +#define INCX ARG6 + +#else + +#define STACKSIZE 256 + +#define OLD_LDA 40 + STACKSIZE(%rsp) +#define OLD_X 48 + STACKSIZE(%rsp) +#define OLD_INCX 56 + STACKSIZE(%rsp) +#define OLD_Y 64 + STACKSIZE(%rsp) +#define OLD_INCY 72 + STACKSIZE(%rsp) +#define OLD_BUFFER 80 + STACKSIZE(%rsp) + +#define M ARG1 +#define N ARG2 +#define A ARG4 +#define LDA ARG3 +#define X %rdi +#define INCX %rsi + +#endif + +#define Y %r10 +#define INCY %r11 +#define BUFFER %r12 + +#define TEMP %rax +#define I %rax +#define A1 %rbx +#define A2 %rbp +#define XX %r13 +#define YY %r14 +#define IS %r15 +#define NEW_X BUFFER +#define NEW_Y X + +#define ALPHA %xmm0 + +#define xtemp1 %xmm0 +#define xtemp2 %xmm1 +#define yy1 %xmm2 +#define yy2 %xmm3 + +#define atemp1 %xmm4 +#define atemp2 %xmm5 +#define atemp3 %xmm6 +#define atemp4 %xmm7 + +#define xsum1 %xmm8 +#define xsum2 %xmm9 +#define xsum3 %xmm10 +#define xsum4 %xmm11 + +#define a1 %xmm12 +#define a2 %xmm13 +#define a3 %xmm14 +#define xt1 %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_LDA, LDA + movq OLD_X, X + movq OLD_INCX, INCX + + movaps %xmm2, %xmm0 +#endif + + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + leaq (,INCX, SIZE), INCX + leaq (,INCY, SIZE), INCY + leaq (,LDA, SIZE), LDA + + testq M, M + jle .L999 + + unpcklpd ALPHA, ALPHA + + movq BUFFER, XX + + movq M, %rax + sarq $3, %rax + jle .L02 + ALIGN_3 + +.L01: + movsd 0 * SIZE(X), %xmm1 + addq INCX, X + movhpd 0 * SIZE(X), %xmm1 + addq INCX, X + movsd 0 * SIZE(X), %xmm2 + addq INCX, X + movhpd 0 * SIZE(X), %xmm2 + addq INCX, X + movsd 0 * SIZE(X), %xmm3 + addq INCX, X + movhpd 0 * SIZE(X), %xmm3 + addq INCX, X + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + movhpd 0 * SIZE(X), %xmm4 + addq INCX, X + + mulpd ALPHA, %xmm1 + mulpd ALPHA, %xmm2 + mulpd ALPHA, %xmm3 + mulpd ALPHA, %xmm4 + + movapd %xmm1, 0 * SIZE(XX) + movapd %xmm2, 2 * SIZE(XX) + movapd %xmm3, 4 * SIZE(XX) + movapd %xmm4, 6 * SIZE(XX) + + addq $8 * SIZE, XX + decq %rax + jg .L01 + ALIGN_3 + +.L02: + movq M, %rax + andq $7, %rax + jle .L05 + ALIGN_3 + +.L03: + movsd 0 * SIZE(X), %xmm1 + addq INCX, X + + mulsd ALPHA, %xmm1 + + movlpd %xmm1, 0 * SIZE(XX) + + addq $1 * SIZE, XX + decq %rax + jg .L03 + ALIGN_3 + +.L05: + /* now we don't need original X */ + movq Y, NEW_Y + + addq $512, XX + andq $-512, XX + + cmpq $SIZE, INCY + je .L10 + + movq Y, YY + movq XX, NEW_Y + + movq M, %rax + sarq $3, %rax + jle .L07 + ALIGN_3 + +.L06: + movsd 0 * SIZE(YY), %xmm0 + addq INCY, YY + movhpd 0 * SIZE(YY), %xmm0 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm1 + addq INCY, YY + movhpd 0 * SIZE(YY), %xmm1 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm2 + addq INCY, YY + movhpd 0 * SIZE(YY), %xmm2 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm3 + addq INCY, YY + movhpd 0 * SIZE(YY), %xmm3 + addq INCY, YY + + movapd %xmm0, 0 * SIZE(XX) + movapd %xmm1, 2 * SIZE(XX) + movapd %xmm2, 4 * SIZE(XX) + movapd %xmm3, 6 * SIZE(XX) + + addq $8 * SIZE, XX + decq %rax + jg .L06 + ALIGN_3 + +.L07: + movq M, %rax + andq $7, %rax + jle .L10 + ALIGN_3 + +.L08: + movsd 0 * SIZE(YY), %xmm0 + addq INCY, YY + + movsd %xmm0, 0 * SIZE(XX) + + addq $1 * SIZE, XX + decq %rax + jg .L08 + ALIGN_3 + +.L10: + xorq IS, IS # is = 0 + + cmpq $4, N + jl .L20 + ALIGN_3 + +.L11: + movq A, A1 + leaq (A, LDA, 2), A2 + leaq 4 * SIZE(A, LDA, 4), A + + leaq (NEW_X, IS, SIZE), XX + leaq 4 * SIZE(NEW_Y, IS, SIZE), YY + + movapd 0 * SIZE(XX), atemp2 + movapd 2 * SIZE(XX), atemp4 + + movsd 0 * SIZE(A1), xsum1 + movhpd 1 * SIZE(A1), xsum1 + mulpd atemp2, xsum1 + + movsd 1 * SIZE(A1), xsum2 + movhpd 1 * SIZE(A1, LDA, 1), xsum2 + mulpd atemp2, xsum2 + + movsd 2 * SIZE(A1), xsum3 + movhpd 2 * SIZE(A1, LDA, 1), xsum3 + mulpd atemp2, xsum3 + + movsd 3 * SIZE(A1), xsum4 + movhpd 3 * SIZE(A1, LDA, 1), xsum4 + mulpd atemp2, xsum4 + + movsd 2 * SIZE(A1), a1 + movhpd 3 * SIZE(A1), a1 + mulpd atemp4, a1 + addpd a1, xsum1 + + movsd 2 * SIZE(A1, LDA, 1), a1 + movhpd 3 * SIZE(A1, LDA, 1), a1 + mulpd atemp4, a1 + addpd a1, xsum2 + + movsd 2 * SIZE(A2), a1 + movhpd 3 * SIZE(A2), a1 + mulpd atemp4, a1 + addpd a1, xsum3 + + movsd 3 * SIZE(A2), a1 + movhpd 3 * SIZE(A2, LDA, 1), a1 + mulpd atemp4, a1 + addpd a1, xsum4 + + movapd 4 * SIZE(XX), xtemp1 + movapd 6 * SIZE(XX), xtemp2 + + movsd 4 * SIZE(A1), a1 + movhpd 5 * SIZE(A1), a1 + movsd 6 * SIZE(A1), a2 + movhpd 7 * SIZE(A1), a2 + movsd 4 * SIZE(A1, LDA, 1), a3 + movhpd 5 * SIZE(A1, LDA, 1), a3 + + movsd 0 * SIZE(YY), yy1 + movhpd 1 * SIZE(YY), yy1 + movsd 2 * SIZE(YY), yy2 + movhpd 3 * SIZE(YY), yy2 + +#ifndef HAVE_SSE3 + movapd atemp2, atemp1 + unpcklpd atemp1, atemp1 + unpckhpd atemp2, atemp2 + movapd atemp4, atemp3 + unpcklpd atemp3, atemp3 + unpckhpd atemp4, atemp4 +#else + movddup atemp2, atemp1 + unpckhpd atemp2, atemp2 + movddup atemp4, atemp3 + unpckhpd atemp4, atemp4 +#endif + + addq $4 * SIZE, XX + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + + movq M, I + subq IS, I + subq $4, I + sarq $3, I + jle .L15 + ALIGN_3 + +.L12: + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + movsd 2 * SIZE(A1, LDA, 1), a1 + movhpd 3 * SIZE(A1, LDA, 1), a1 + + PREFETCH PREFETCHSIZE(A1) + + movapd xtemp2, xt1 + mulpd a2, xt1 + mulpd atemp1, a2 + addpd xt1, xsum1 + addpd a2, yy2 + movsd 0 * SIZE(A2), a2 + movhpd 1 * SIZE(A2), a2 + + movapd xtemp1, xt1 + mulpd a3, xt1 + mulpd atemp2, a3 + addpd xt1, xsum2 + addpd a3, yy1 + movsd 2 * SIZE(A2), a3 + movhpd 3 * SIZE(A2), a3 + +#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON) + PREFETCH PREFETCHSIZE(XX) +#endif + + movapd xtemp2, xt1 + mulpd a1, xt1 + mulpd atemp2, a1 + addpd xt1, xsum2 + addpd a1, yy2 + movsd 0 * SIZE(A2, LDA, 1), a1 + movhpd 1 * SIZE(A2, LDA, 1), a1 + + movapd xtemp1, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum3 + addpd a2, yy1 + movsd 2 * SIZE(A2, LDA, 1), a2 + movhpd 3 * SIZE(A2, LDA, 1), a2 + + PREFETCH PREFETCHSIZE(A1, LDA, 1) + + movapd xtemp2, xt1 + mulpd a3, xt1 + mulpd atemp3, a3 + addpd xt1, xsum3 + addpd a3, yy2 + movsd 4 * SIZE(A1), a3 + movhpd 5 * SIZE(A1), a3 + + movapd xtemp1, xt1 + movapd 4 * SIZE(XX), xtemp1 + mulpd a1, xt1 + mulpd atemp4, a1 + addpd xt1, xsum4 + addpd a1, yy1 + movsd 6 * SIZE(A1), a1 + movhpd 7 * SIZE(A1), a1 + + movapd xtemp2, xt1 + movapd 6 * SIZE(XX), xtemp2 + mulpd a2, xt1 + mulpd atemp4, a2 + addpd xt1, xsum4 + addpd a2, yy2 + movsd 4 * SIZE(A1, LDA, 1), a2 + movhpd 5 * SIZE(A1, LDA, 1), a2 + + movsd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhpd 5 * SIZE(YY), yy1 + + movsd yy2, 2 * SIZE(YY) + movhpd yy2, 3 * SIZE(YY) + movsd 6 * SIZE(YY), yy2 + movhpd 7 * SIZE(YY), yy2 + + movapd xtemp1, xt1 + mulpd a3, xt1 + mulpd atemp1, a3 + addpd xt1, xsum1 + addpd a3, yy1 + movsd 6 * SIZE(A1, LDA, 1), a3 + movhpd 7 * SIZE(A1, LDA, 1), a3 + + PREFETCH PREFETCHSIZE(A2) + + movapd xtemp2, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy2 + movsd 4 * SIZE(A2), a1 + movhpd 5 * SIZE(A2), a1 + + movapd xtemp1, xt1 + mulpd a2, xt1 + mulpd atemp2, a2 + addpd xt1, xsum2 + addpd a2, yy1 + movsd 6 * SIZE(A2), a2 + movhpd 7 * SIZE(A2), a2 + +#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON) + PREFETCHW PREFETCHSIZE(YY) +#endif + + movapd xtemp2, xt1 + mulpd a3, xt1 + mulpd atemp2, a3 + addpd xt1, xsum2 + addpd a3, yy2 + movsd 4 * SIZE(A2, LDA, 1), a3 + movhpd 5 * SIZE(A2, LDA, 1), a3 + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp3, a1 + addpd xt1, xsum3 + addpd a1, yy1 + movsd 6 * SIZE(A2, LDA, 1), a1 + movhpd 7 * SIZE(A2, LDA, 1), a1 + + PREFETCH PREFETCHSIZE(A2, LDA, 1) + + movapd xtemp2, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum3 + addpd a2, yy2 + movsd 10 * SIZE(A1), a2 + movhpd 11 * SIZE(A1), a2 + + movapd xtemp1, xt1 + movapd 8 * SIZE(XX), xtemp1 + mulpd a3, xt1 + mulpd atemp4, a3 + addpd xt1, xsum4 + addpd a3, yy1 + movsd 8 * SIZE(A1, LDA, 1), a3 + movhpd 9 * SIZE(A1, LDA, 1), a3 + + movapd xtemp2, xt1 + movapd 10 * SIZE(XX), xtemp2 + mulpd a1, xt1 + mulpd atemp4, a1 + addpd xt1, xsum4 + addpd a1, yy2 + movsd 8 * SIZE(A1), a1 + movhpd 9 * SIZE(A1), a1 + + movsd yy1, 4 * SIZE(YY) + movhpd yy1, 5 * SIZE(YY) + movsd 8 * SIZE(YY), yy1 + movhpd 9 * SIZE(YY), yy1 + + movsd yy2, 6 * SIZE(YY) + movhpd yy2, 7 * SIZE(YY) + movsd 10 * SIZE(YY), yy2 + movhpd 11 * SIZE(YY), yy2 + + addq $8 * SIZE, XX + addq $8 * SIZE, YY + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + + decq I + jg .L12 + ALIGN_3 + +.L15: + movq M, I + subq IS, I + subq $4, I + test $4, I + jle .L17 + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + movsd 2 * SIZE(A1, LDA, 1), a1 + movhpd 3 * SIZE(A1, LDA, 1), a1 + + movapd xtemp2, xt1 + mulpd a2, xt1 + mulpd atemp1, a2 + addpd xt1, xsum1 + addpd a2, yy2 + movsd 0 * SIZE(A2), a2 + movhpd 1 * SIZE(A2), a2 + + movapd xtemp1, xt1 + mulpd a3, xt1 + mulpd atemp2, a3 + addpd xt1, xsum2 + addpd a3, yy1 + movsd 2 * SIZE(A2), a3 + movhpd 3 * SIZE(A2), a3 + + movapd xtemp2, xt1 + mulpd a1, xt1 + mulpd atemp2, a1 + addpd xt1, xsum2 + addpd a1, yy2 + movsd 0 * SIZE(A2, LDA, 1), a1 + movhpd 1 * SIZE(A2, LDA, 1), a1 + + movapd xtemp1, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum3 + addpd a2, yy1 + movsd 2 * SIZE(A2, LDA, 1), a2 + movhpd 3 * SIZE(A2, LDA, 1), a2 + + movapd xtemp2, xt1 + mulpd a3, xt1 + mulpd atemp3, a3 + addpd xt1, xsum3 + addpd a3, yy2 + movsd 4 * SIZE(A1, LDA, 1), a3 + movhpd 5 * SIZE(A1, LDA, 1), a3 + + movapd xtemp1, xt1 + movapd 4 * SIZE(XX), xtemp1 + mulpd a1, xt1 + mulpd atemp4, a1 + addpd xt1, xsum4 + addpd a1, yy1 + movsd 4 * SIZE(A1), a1 + movhpd 5 * SIZE(A1), a1 + + movapd xtemp2, xt1 + movapd 6 * SIZE(XX), xtemp2 + mulpd a2, xt1 + mulpd atemp4, a2 + addpd xt1, xsum4 + addpd a2, yy2 + movsd 6 * SIZE(A1), a2 + movhpd 7 * SIZE(A1), a2 + + movsd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhpd 5 * SIZE(YY), yy1 + + movsd yy2, 2 * SIZE(YY) + movhpd yy2, 3 * SIZE(YY) + movsd 6 * SIZE(YY), yy2 + movhpd 7 * SIZE(YY), yy2 + + addq $4 * SIZE, XX + addq $4 * SIZE, YY + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L17: + testq $2, M + jle .L18 + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + movsd 0 * SIZE(A1, LDA, 1), a1 + movhpd 1 * SIZE(A1, LDA, 1), a1 + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp2, a1 + addpd xt1, xsum2 + addpd a1, yy1 + movsd 0 * SIZE(A2), a1 + movhpd 1 * SIZE(A2), a1 + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp3, a1 + addpd xt1, xsum3 + addpd a1, yy1 + movsd 0 * SIZE(A2, LDA, 1), a1 + movhpd 1 * SIZE(A2, LDA, 1), a1 + + movapd xtemp1, xt1 + movapd 2 * SIZE(XX), xtemp1 + mulpd a1, xt1 + mulpd atemp4, a1 + addpd xt1, xsum4 + addpd a1, yy1 + movsd 2 * SIZE(A1), a1 + + movsd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movsd 2 * SIZE(YY), yy1 + + addq $2 * SIZE, XX + addq $2 * SIZE, YY + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + ALIGN_3 + +.L18: + testq $1, M + jle .L19 + + movapd xtemp1, xt1 + mulsd a1, xt1 + mulsd atemp1, a1 + addsd xt1, xsum1 + addpd a1, yy1 + movsd 0 * SIZE(A1, LDA, 1), a1 + + movapd xtemp1, xt1 + mulsd a1, xt1 + mulsd atemp2, a1 + addsd xt1, xsum2 + addsd a1, yy1 + movsd 0 * SIZE(A2), a1 + + movapd xtemp1, xt1 + mulsd a1, xt1 + mulsd atemp3, a1 + addsd xt1, xsum3 + addsd a1, yy1 + movsd 0 * SIZE(A2, LDA, 1), a1 + + movapd xtemp1, xt1 + mulsd a1, xt1 + mulsd atemp4, a1 + addsd xt1, xsum4 + addsd a1, yy1 + + movsd yy1, 0 * SIZE(YY) + ALIGN_3 + +.L19: +#ifndef HAVE_SSE3 + movapd xsum1, atemp1 + movapd xsum3, atemp3 + + unpcklpd xsum2, xsum1 + unpcklpd xsum4, xsum3 + + unpckhpd xsum2, atemp1 + unpckhpd xsum4, atemp3 + + addpd atemp1, xsum1 + addpd atemp3, xsum3 +#else + haddpd xsum2, xsum1 + haddpd xsum4, xsum3 +#endif + + movsd 0 * SIZE(NEW_Y, IS, SIZE), yy1 + movhpd 1 * SIZE(NEW_Y, IS, SIZE), yy1 + movsd 2 * SIZE(NEW_Y, IS, SIZE), yy2 + movhpd 3 * SIZE(NEW_Y, IS, SIZE), yy2 + + addpd xsum1, yy1 + addpd xsum3, yy2 + + movsd yy1, 0 * SIZE(NEW_Y, IS, SIZE) + movhpd yy1, 1 * SIZE(NEW_Y, IS, SIZE) + movsd yy2, 2 * SIZE(NEW_Y, IS, SIZE) + movhpd yy2, 3 * SIZE(NEW_Y, IS, SIZE) + + addq $4, IS + + movq IS, I + addq $4, I + cmpq N, I + jle .L11 + ALIGN_3 + +.L20: + testq $2, N + jle .L30 + + movq A, A1 + leaq 2 * SIZE(A, LDA, 2), A + + movapd 0 * SIZE(NEW_X, IS, SIZE), atemp2 + + movsd 0 * SIZE(A1), xsum1 + movhpd 1 * SIZE(A1), xsum1 + mulpd atemp2, xsum1 + + movsd 1 * SIZE(A1), xsum2 + movhpd 1 * SIZE(A1, LDA, 1), xsum2 + mulpd atemp2, xsum2 + +#ifndef HAVE_SSE3 + movapd atemp2, atemp1 + unpcklpd atemp1, atemp1 +#else + movddup atemp2, atemp1 +#endif + unpckhpd atemp2, atemp2 + + testq $1, M + jle .L29 + + movsd 2 * SIZE(A1), a1 + movsd 2 * SIZE(A1, LDA, 1), a2 + movsd 2 * SIZE(NEW_X, IS, SIZE), xtemp1 + movsd 2 * SIZE(NEW_Y, IS, SIZE), yy1 + + movapd xtemp1, xt1 + mulsd a1, xt1 + mulsd atemp1, a1 + addsd xt1, xsum1 + addpd a1, yy1 + + movapd xtemp1, xt1 + mulsd a2, xt1 + mulsd atemp2, a2 + addsd xt1, xsum2 + addsd a2, yy1 + + movsd yy1, 2 * SIZE(NEW_Y, IS, SIZE) + ALIGN_3 + +.L29: +#ifndef HAVE_SSE3 + movapd xsum1, atemp1 + unpcklpd xsum2, xsum1 + unpckhpd xsum2, atemp1 + addpd atemp1, xsum1 +#else + haddpd xsum2, xsum1 +#endif + + movsd 0 * SIZE(NEW_Y, IS, SIZE), yy1 + movhpd 1 * SIZE(NEW_Y, IS, SIZE), yy1 + + addpd xsum1, yy1 + + movsd yy1, 0 * SIZE(NEW_Y, IS, SIZE) + movhpd yy1, 1 * SIZE(NEW_Y, IS, SIZE) + + addq $2, IS + ALIGN_3 + +.L30: + testq $1, N + jle .L990 + + movsd 0 * SIZE(A), xsum1 + movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1 + movsd 0 * SIZE(NEW_Y, IS, SIZE), yy1 + + mulsd atemp1, xsum1 + addsd xsum1, yy1 + movsd yy1, 0 * SIZE(NEW_Y, IS, SIZE) + ALIGN_3 + +.L990: + cmpq $SIZE, INCY + je .L999 + + movq M, %rax + sarq $3, %rax + jle .L997 + ALIGN_3 + +.L996: + movapd 0 * SIZE(NEW_Y), %xmm0 + movapd 2 * SIZE(NEW_Y), %xmm1 + movapd 4 * SIZE(NEW_Y), %xmm2 + movapd 6 * SIZE(NEW_Y), %xmm3 + + movsd %xmm0, 0 * SIZE(Y) + addq INCY, Y + movhpd %xmm0, 0 * SIZE(Y) + addq INCY, Y + movsd %xmm1, 0 * SIZE(Y) + addq INCY, Y + movhpd %xmm1, 0 * SIZE(Y) + addq INCY, Y + movsd %xmm2, 0 * SIZE(Y) + addq INCY, Y + movhpd %xmm2, 0 * SIZE(Y) + addq INCY, Y + movsd %xmm3, 0 * SIZE(Y) + addq INCY, Y + movhpd %xmm3, 0 * SIZE(Y) + addq INCY, Y + + addq $8 * SIZE, NEW_Y + decq %rax + jg .L996 + ALIGN_3 + +.L997: + movq M, %rax + andq $7, %rax + jle .L999 + ALIGN_3 + +.L998: + movsd 0 * SIZE(NEW_Y), %xmm0 + + movsd %xmm0, 0 * SIZE(Y) + addq INCY, Y + + addq $1 * SIZE, NEW_Y + + decq %rax + jg .L998 + ALIGN_3 + + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + EPILOGUE diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S new file mode 100644 index 0000000..2df76f1 --- /dev/null +++ b/kernel/x86_64/symv_U_sse.S @@ -0,0 +1,1059 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ATOM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#ifdef CORE2 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 20) +#endif + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 8) +#define movsd movlps +#endif + +#if defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 16) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 20) +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 80 + +#define OLD_Y 8 + STACKSIZE(%rsp) +#define OLD_INCY 16 + STACKSIZE(%rsp) +#define OLD_BUFFER 24 + STACKSIZE(%rsp) + +#define M ARG1 +#define IS ARG2 +#define A ARG3 +#define LDA ARG4 +#define X ARG5 +#define INCX ARG6 + +#else + +#define STACKSIZE 256 + +#define OLD_LDA 40 + STACKSIZE(%rsp) +#define OLD_X 48 + STACKSIZE(%rsp) +#define OLD_INCX 56 + STACKSIZE(%rsp) +#define OLD_Y 64 + STACKSIZE(%rsp) +#define OLD_INCY 72 + STACKSIZE(%rsp) +#define OLD_BUFFER 80 + STACKSIZE(%rsp) + +#define M ARG1 +#define IS ARG2 +#define A ARG4 +#define LDA ARG3 +#define X %rdi +#define INCX %rsi + +#endif + +#define Y %r10 +#define INCY %r11 +#define BUFFER %r12 + +#define TEMP %rax +#define I %rax +#define A1 %rbx +#define A2 %rbp +#define XX %r13 +#define YY %r14 +#define NEW_X BUFFER +#define NEW_Y X + +#define ALPHA %xmm0 + +#define atemp1 %xmm0 +#define atemp2 %xmm1 +#define atemp3 %xmm2 +#define atemp4 %xmm3 + +#define xsum1 %xmm4 +#define xsum2 %xmm5 +#define xsum3 %xmm6 +#define xsum4 %xmm7 + +#define xtemp1 %xmm8 +#define xtemp2 %xmm9 +#define yy1 %xmm10 +#define xt1 %xmm11 + +#define a1 %xmm12 +#define a2 %xmm13 +#define a3 %xmm14 +#define a4 %xmm15 + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_LDA, LDA + movq OLD_X, X + movq OLD_INCX, INCX + + movaps %xmm2, %xmm0 +#endif + + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + leaq (,INCX, SIZE), INCX + leaq (,INCY, SIZE), INCY + leaq (,LDA, SIZE), LDA + + testq M, M + jle .L999 + + negq IS + addq M, IS + + movq IS, TEMP + imulq LDA, TEMP + addq TEMP, A + + shufps $0, ALPHA, ALPHA + + movq BUFFER, XX + + movq M, %rax + sarq $3, %rax + jle .L02 + ALIGN_3 + +.L01: + movss 0 * SIZE(X), %xmm1 + addq INCX, X + movss 0 * SIZE(X), %xmm2 + addq INCX, X + movss 0 * SIZE(X), %xmm3 + addq INCX, X + movss 0 * SIZE(X), %xmm4 + addq INCX, X + movss 0 * SIZE(X), %xmm5 + addq INCX, X + movss 0 * SIZE(X), %xmm6 + addq INCX, X + movss 0 * SIZE(X), %xmm7 + addq INCX, X + movss 0 * SIZE(X), %xmm8 + addq INCX, X + + mulss ALPHA, %xmm1 + mulss ALPHA, %xmm2 + mulss ALPHA, %xmm3 + mulss ALPHA, %xmm4 + mulss ALPHA, %xmm5 + mulss ALPHA, %xmm6 + mulss ALPHA, %xmm7 + mulss ALPHA, %xmm8 + + movss %xmm1, 0 * SIZE(XX) + movss %xmm2, 1 * SIZE(XX) + movss %xmm3, 2 * SIZE(XX) + movss %xmm4, 3 * SIZE(XX) + movss %xmm5, 4 * SIZE(XX) + movss %xmm6, 5 * SIZE(XX) + movss %xmm7, 6 * SIZE(XX) + movss %xmm8, 7 * SIZE(XX) + + addq $8 * SIZE, XX + decq %rax + jg .L01 + ALIGN_3 + +.L02: + movq M, %rax + andq $7, %rax + jle .L05 + ALIGN_3 + +.L03: + movss 0 * SIZE(X), %xmm1 + addq INCX, X + + mulss ALPHA, %xmm1 + + movss %xmm1, 0 * SIZE(XX) + + addq $1 * SIZE, XX + decq %rax + jg .L03 + ALIGN_3 + +.L05: + /* now we don't need original X */ + movq Y, NEW_Y + + addq $512, XX + andq $-512, XX + + cmpq $SIZE, INCY + je .L10 + + movq Y, YY + movq XX, NEW_Y + + movq M, %rax + sarq $3, %rax + jle .L07 + ALIGN_3 + +.L06: + movss 0 * SIZE(YY), %xmm0 + addq INCY, YY + movss 0 * SIZE(YY), %xmm1 + addq INCY, YY + movss 0 * SIZE(YY), %xmm2 + addq INCY, YY + movss 0 * SIZE(YY), %xmm3 + addq INCY, YY + movss 0 * SIZE(YY), %xmm4 + addq INCY, YY + movss 0 * SIZE(YY), %xmm5 + addq INCY, YY + movss 0 * SIZE(YY), %xmm6 + addq INCY, YY + movss 0 * SIZE(YY), %xmm7 + addq INCY, YY + + movss %xmm0, 0 * SIZE(XX) + movss %xmm1, 1 * SIZE(XX) + movss %xmm2, 2 * SIZE(XX) + movss %xmm3, 3 * SIZE(XX) + movss %xmm4, 4 * SIZE(XX) + movss %xmm5, 5 * SIZE(XX) + movss %xmm6, 6 * SIZE(XX) + movss %xmm7, 7 * SIZE(XX) + + addq $8 * SIZE, XX + decq %rax + jg .L06 + ALIGN_3 + +.L07: + movq M, %rax + andq $7, %rax + jle .L10 + ALIGN_3 + +.L08: + movss 0 * SIZE(YY), %xmm0 + addq INCY, YY + + movss %xmm0, 0 * SIZE(XX) + + addq $1 * SIZE, XX + decq %rax + jg .L08 + ALIGN_3 + +.L10: + movq IS, I + addq $4, I + cmpq M, I + jg .L20 + ALIGN_3 + +.L11: + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + movaps 0 * SIZE(NEW_X, IS, SIZE), atemp4 + + pshufd $0x00, atemp4, atemp1 + pshufd $0x55, atemp4, atemp2 + pshufd $0xaa, atemp4, atemp3 + pshufd $0xff, atemp4, atemp4 + + pxor xsum1, xsum1 + pxor xsum2, xsum2 + pxor xsum3, xsum3 + pxor xsum4, xsum4 + + movaps 0 * SIZE(NEW_X), xtemp1 + movaps 4 * SIZE(NEW_X), xtemp2 + + movsd 0 * SIZE(A1), a1 + movhps 2 * SIZE(A1), a1 + movsd 0 * SIZE(A1, LDA, 1), a2 + movhps 2 * SIZE(A1, LDA, 1), a2 + movsd 0 * SIZE(A2), a3 + movhps 2 * SIZE(A2), a3 + movsd 0 * SIZE(A2, LDA, 1), a4 + movhps 2 * SIZE(A2, LDA, 1), a4 + + movsd 0 * SIZE(NEW_Y), yy1 + movhps 2 * SIZE(NEW_Y), yy1 + + movq NEW_X, XX + movq NEW_Y, YY + + movq IS, I + sarq $4, I + jle .L14 + ALIGN_3 + +.L12: + movaps xtemp1, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movsd 4 * SIZE(A1), a1 + movhps 6 * SIZE(A1), a1 + + PREFETCH PREFETCHSIZE(A1) + + movaps xtemp1, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movsd 4 * SIZE(A1, LDA, 1), a2 + movhps 6 * SIZE(A1, LDA, 1), a2 + + movaps xtemp1, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + movsd 4 * SIZE(A2), a3 + movhps 6 * SIZE(A2), a3 + +#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON) + PREFETCH PREFETCHSIZE(XX) +#endif + + movaps xtemp1, xt1 + movaps 8 * SIZE(XX), xtemp1 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + movsd 4 * SIZE(A2, LDA, 1), a4 + movhps 6 * SIZE(A2, LDA, 1), a4 + + movlps yy1, 0 * SIZE(YY) + movhps yy1, 2 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhps 6 * SIZE(YY), yy1 + + movaps xtemp2, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movsd 8 * SIZE(A1), a1 + movhps 10 * SIZE(A1), a1 + + PREFETCH PREFETCHSIZE(A1, LDA, 1) + + movaps xtemp2, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movsd 8 * SIZE(A1, LDA, 1), a2 + movhps 10 * SIZE(A1, LDA, 1), a2 + + movaps xtemp2, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + movsd 8 * SIZE(A2), a3 + movhps 10 * SIZE(A2), a3 + + movaps xtemp2, xt1 + movaps 12 * SIZE(XX), xtemp2 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + movsd 8 * SIZE(A2, LDA, 1), a4 + movhps 10 * SIZE(A2, LDA, 1), a4 + + movlps yy1, 4 * SIZE(YY) + movhps yy1, 6 * SIZE(YY) + movsd 8 * SIZE(YY), yy1 + movhps 10 * SIZE(YY), yy1 + + + movaps xtemp1, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movsd 12 * SIZE(A1), a1 + movhps 14 * SIZE(A1), a1 + + PREFETCH PREFETCHSIZE(A2) + + movaps xtemp1, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movsd 12 * SIZE(A1, LDA, 1), a2 + movhps 14 * SIZE(A1, LDA, 1), a2 + + movaps xtemp1, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + movsd 12 * SIZE(A2), a3 + movhps 14 * SIZE(A2), a3 + +#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON) + PREFETCHW PREFETCHSIZE(YY) +#endif + + movaps xtemp1, xt1 + movaps 16 * SIZE(XX), xtemp1 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + movsd 12 * SIZE(A2, LDA, 1), a4 + movhps 14 * SIZE(A2, LDA, 1), a4 + + movlps yy1, 8 * SIZE(YY) + movhps yy1, 10 * SIZE(YY) + movsd 12 * SIZE(YY), yy1 + movhps 14 * SIZE(YY), yy1 + + movaps xtemp2, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movsd 16 * SIZE(A1), a1 + movhps 18 * SIZE(A1), a1 + + PREFETCH PREFETCHSIZE(A2, LDA, 1) + + movaps xtemp2, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movsd 16 * SIZE(A1, LDA, 1), a2 + movhps 18 * SIZE(A1, LDA, 1), a2 + + movaps xtemp2, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + movsd 16 * SIZE(A2), a3 + movhps 18 * SIZE(A2), a3 + + movaps xtemp2, xt1 + movaps 20 * SIZE(XX), xtemp2 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + movsd 16 * SIZE(A2, LDA, 1), a4 + movhps 18 * SIZE(A2, LDA, 1), a4 + + movlps yy1, 12 * SIZE(YY) + movhps yy1, 14 * SIZE(YY) + movsd 16 * SIZE(YY), yy1 + movhps 18 * SIZE(YY), yy1 + + addq $16 * SIZE, XX + addq $16 * SIZE, YY + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + + decq I + jg .L12 + ALIGN_3 + +.L14: + testq $8, IS + jle .L15 + + movaps xtemp1, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movsd 4 * SIZE(A1), a1 + movhps 6 * SIZE(A1), a1 + + movaps xtemp1, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movsd 4 * SIZE(A1, LDA, 1), a2 + movhps 6 * SIZE(A1, LDA, 1), a2 + + movaps xtemp1, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + movsd 4 * SIZE(A2), a3 + movhps 6 * SIZE(A2), a3 + + movaps xtemp1, xt1 + movaps 8 * SIZE(XX), xtemp1 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + movsd 4 * SIZE(A2, LDA, 1), a4 + movhps 6 * SIZE(A2, LDA, 1), a4 + + movlps yy1, 0 * SIZE(YY) + movhps yy1, 2 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhps 6 * SIZE(YY), yy1 + + movaps xtemp2, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movsd 8 * SIZE(A1), a1 + movhps 10 * SIZE(A1), a1 + + movaps xtemp2, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movsd 8 * SIZE(A1, LDA, 1), a2 + movhps 10 * SIZE(A1, LDA, 1), a2 + + movaps xtemp2, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + movsd 8 * SIZE(A2), a3 + movhps 10 * SIZE(A2), a3 + + movaps xtemp2, xt1 + movaps 12 * SIZE(XX), xtemp2 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + movsd 8 * SIZE(A2, LDA, 1), a4 + movhps 10 * SIZE(A2, LDA, 1), a4 + + movlps yy1, 4 * SIZE(YY) + movhps yy1, 6 * SIZE(YY) + movsd 8 * SIZE(YY), yy1 + movhps 10 * SIZE(YY), yy1 + + addq $8 * SIZE, XX + addq $8 * SIZE, YY + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + ALIGN_3 + +.L15: + testq $4, IS + jle .L18 + + movaps xtemp1, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + + movaps xtemp1, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + + movaps xtemp1, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + + movaps xtemp1, xt1 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + + movlps yy1, 0 * SIZE(YY) + movhps yy1, 2 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhps 6 * SIZE(YY), yy1 + + addq $4 * SIZE, XX + addq $4 * SIZE, YY + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L18: + movaps 0 * SIZE(NEW_X, IS, SIZE), atemp1 + + movss 0 * SIZE(A1), a1 + movss 0 * SIZE(A1, LDA, 1), a2 + movss 0 * SIZE(A2), a3 + movss 0 * SIZE(A2, LDA, 1), a4 + + unpcklps a3, a1 + unpcklps a4, a2 + unpcklps a2, a1 + + mulps atemp1, a1 + addps a1, xsum1 + + movsd 0 * SIZE(A1, LDA, 1), a1 + movss 1 * SIZE(A2), a2 + movhps 1 * SIZE(A2, LDA, 1), a2 + + shufps $0x84, a2, a1 + + mulps atemp1, a1 + addps a1, xsum2 + + movsd 0 * SIZE(A2), a1 + movss 2 * SIZE(A2), a2 + movhps 2 * SIZE(A2, LDA, 1), a2 + + shufps $0x84, a2, a1 + + mulps atemp1, a1 + addps a1, xsum3 + + movsd 0 * SIZE(A2, LDA, 1), a1 + movhps 2 * SIZE(A2, LDA, 1), a1 + + mulps atemp1, a1 + addps a1, xsum4 + + +#ifndef HAVE_SSE3 + movaps xsum1, xtemp1 + unpcklps xsum3, xsum1 + unpckhps xsum3, xtemp1 + + movaps xsum2, xtemp2 + unpcklps xsum4, xsum2 + unpckhps xsum4, xtemp2 + + movaps xsum1, xsum3 + unpcklps xsum2, xsum1 + unpckhps xsum2, xsum3 + + movaps xtemp1, xsum4 + unpcklps xtemp2, xtemp1 + unpckhps xtemp2, xsum4 + + addps xsum3, xsum1 + addps xtemp1, xsum4 + addps xsum4, xsum1 +#else + haddps xsum2, xsum1 + haddps xsum4, xsum3 + + haddps xsum3, xsum1 +#endif + + addps xsum1, yy1 + + movlps yy1, 0 * SIZE(YY) + movhps yy1, 2 * SIZE(YY) + + addq $4, IS + + movq IS, I + addq $4, I + cmpq M, I + jle .L11 + ALIGN_3 + +.L20: + testq $2, M + jle .L30 + + movq A, A1 + leaq (A, LDA, 2), A + + movsd 0 * SIZE(NEW_X, IS, SIZE), atemp4 + + pshufd $0x00, atemp4, atemp1 + pshufd $0x55, atemp4, atemp2 + + pxor xsum1, xsum1 + pxor xsum2, xsum2 + + movaps 0 * SIZE(NEW_X), xtemp1 + + movsd 0 * SIZE(A1), a1 + movhps 2 * SIZE(A1), a1 + movsd 0 * SIZE(A1, LDA, 1), a2 + movhps 2 * SIZE(A1, LDA, 1), a2 + + movsd 0 * SIZE(NEW_Y), yy1 + movhps 2 * SIZE(NEW_Y), yy1 + + movq NEW_X, XX + movq NEW_Y, YY + + movq IS, I + sarq $2, I + jle .L28 + ALIGN_3 + +.L22: + movaps xtemp1, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movsd 4 * SIZE(A1), a1 + movhps 6 * SIZE(A1), a1 + + movaps xtemp1, xt1 + movaps 4 * SIZE(XX), xtemp1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movsd 4 * SIZE(A1, LDA, 1), a2 + movhps 6 * SIZE(A1, LDA, 1), a2 + + movlps yy1, 0 * SIZE(YY) + movhps yy1, 2 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhps 6 * SIZE(YY), yy1 + + addq $4 * SIZE, XX + addq $4 * SIZE, YY + addq $4 * SIZE, A1 + + decq I + jg .L22 + ALIGN_3 + +.L28: + movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1 + + movss 0 * SIZE(A1), a1 + movss 0 * SIZE(A1, LDA, 1), a2 + + unpcklps a2, a1 + + mulps atemp1, a1 + addps a1, xsum1 + + movsd 0 * SIZE(A1, LDA, 1), a1 + mulps atemp1, a1 + addps a1, xsum2 + +#ifndef HAVE_SSE3 + movhlps xsum1, xsum3 + movhlps xsum2, xsum4 + addps xsum3, xsum1 + addps xsum4, xsum2 + + unpcklps xsum2, xsum1 + movhlps xsum1, xsum2 + + addps xsum2, xsum1 +#else + haddps xsum2, xsum1 + haddps xsum1, xsum1 +#endif + + addps xsum1, yy1 + + movlps yy1, 0 * SIZE(YY) + + addq $2, IS + ALIGN_3 + +.L30: + testq $1, M + jle .L990 + + movq A, A1 + + movss 0 * SIZE(NEW_X, IS, SIZE), atemp1 + + pshufd $0x00, atemp1, atemp1 + + pxor xsum1, xsum1 + pxor xsum2, xsum2 + + movss 0 * SIZE(NEW_Y), yy1 + + movss 0 * SIZE(NEW_X), xtemp1 + movss 1 * SIZE(NEW_X), xtemp2 + + movss 0 * SIZE(A1), a1 + movss 1 * SIZE(A1), a2 + + movq NEW_X, XX + movq NEW_Y, YY + + movq IS, I + sarq $1, I + jle .L38 + ALIGN_3 + +.L32: + movaps xtemp1, xt1 + movss 2 * SIZE(XX), xtemp1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movss 2 * SIZE(A1), a1 + + movss yy1, 0 * SIZE(YY) + movss 1 * SIZE(YY), yy1 + + movaps xtemp2, xt1 + movss 3 * SIZE(XX), xtemp2 + mulps a2, xt1 + mulps atemp1, a2 + addps xt1, xsum1 + addps a2, yy1 + movss 3 * SIZE(A1), a2 + + movss yy1, 1 * SIZE(YY) + movss 2 * SIZE(YY), yy1 + + addq $2 * SIZE, XX + addq $2 * SIZE, YY + addq $2 * SIZE, A1 + + decq I + jg .L32 + ALIGN_3 + +.L38: + movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1 + + movss 0 * SIZE(A1), a1 + mulss atemp1, a1 + addss a1, xsum1 + +#ifndef HAVE_SSE3 + movhlps xsum1, xsum3 + movhlps xsum2, xsum4 + addps xsum3, xsum1 + addps xsum4, xsum2 + + unpcklps xsum2, xsum1 + movhlps xsum1, xsum2 + + addps xsum2, xsum1 +#else + addss xsum2, xsum1 +#endif + + addss xsum1, yy1 + + movss yy1, 0 * SIZE(YY) + + addq $2, IS + ALIGN_3 + +.L990: + cmpq $SIZE, INCY + je .L999 + + movq M, %rax + sarq $3, %rax + jle .L997 + ALIGN_3 + +.L996: + movss 0 * SIZE(NEW_Y), %xmm0 + movss 1 * SIZE(NEW_Y), %xmm1 + movss 2 * SIZE(NEW_Y), %xmm2 + movss 3 * SIZE(NEW_Y), %xmm3 + movss 4 * SIZE(NEW_Y), %xmm4 + movss 5 * SIZE(NEW_Y), %xmm5 + movss 6 * SIZE(NEW_Y), %xmm6 + movss 7 * SIZE(NEW_Y), %xmm7 + + movss %xmm0, 0 * SIZE(Y) + addq INCY, Y + movss %xmm1, 0 * SIZE(Y) + addq INCY, Y + movss %xmm2, 0 * SIZE(Y) + addq INCY, Y + movss %xmm3, 0 * SIZE(Y) + addq INCY, Y + movss %xmm4, 0 * SIZE(Y) + addq INCY, Y + movss %xmm5, 0 * SIZE(Y) + addq INCY, Y + movss %xmm6, 0 * SIZE(Y) + addq INCY, Y + movss %xmm7, 0 * SIZE(Y) + addq INCY, Y + + addq $8 * SIZE, NEW_Y + decq %rax + jg .L996 + ALIGN_3 + +.L997: + movq M, %rax + andq $7, %rax + jle .L999 + ALIGN_3 + +.L998: + movss 0 * SIZE(NEW_Y), %xmm0 + + movss %xmm0, 0 * SIZE(Y) + addq INCY, Y + + addq $1 * SIZE, NEW_Y + + decq %rax + jg .L998 + ALIGN_3 + + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + EPILOGUE diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S new file mode 100644 index 0000000..bbba0b4 --- /dev/null +++ b/kernel/x86_64/symv_U_sse2.S @@ -0,0 +1,976 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ATOM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#ifdef CORE2 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 20) +#endif + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 8) +#define movsd movlpd +#endif + +#if defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 16) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 24) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 20) +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 80 + +#define OLD_Y 8 + STACKSIZE(%rsp) +#define OLD_INCY 16 + STACKSIZE(%rsp) +#define OLD_BUFFER 24 + STACKSIZE(%rsp) + +#define M ARG1 +#define IS ARG2 +#define A ARG3 +#define LDA ARG4 +#define X ARG5 +#define INCX ARG6 + +#else + +#define STACKSIZE 256 + +#define OLD_LDA 40 + STACKSIZE(%rsp) +#define OLD_X 48 + STACKSIZE(%rsp) +#define OLD_INCX 56 + STACKSIZE(%rsp) +#define OLD_Y 64 + STACKSIZE(%rsp) +#define OLD_INCY 72 + STACKSIZE(%rsp) +#define OLD_BUFFER 80 + STACKSIZE(%rsp) + +#define M ARG1 +#define IS ARG2 +#define A ARG4 +#define LDA ARG3 +#define X %rdi +#define INCX %rsi + +#endif + +#define Y %r10 +#define INCY %r11 +#define BUFFER %r12 + +#define TEMP %rax +#define I %rax +#define A1 %rbx +#define A2 %rbp +#define XX %r13 +#define YY %r14 +#define NEW_X BUFFER +#define NEW_Y X + +#define ALPHA %xmm0 + +#define xtemp1 %xmm0 +#define xtemp2 %xmm1 +#define yy1 %xmm2 +#define yy2 %xmm3 + +#define atemp1 %xmm4 +#define atemp2 %xmm5 +#define atemp3 %xmm6 +#define atemp4 %xmm7 + +#define xsum1 %xmm8 +#define xsum2 %xmm9 +#define xsum3 %xmm10 +#define xsum4 %xmm11 + +#define a1 %xmm12 +#define a2 %xmm13 +#define a3 %xmm14 +#define xt1 %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_LDA, LDA + movq OLD_X, X + movq OLD_INCX, INCX + + movaps %xmm2, %xmm0 +#endif + + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + leaq (,INCX, SIZE), INCX + leaq (,INCY, SIZE), INCY + leaq (,LDA, SIZE), LDA + + testq M, M + jle .L999 + + negq IS + addq M, IS + + movq IS, TEMP + imulq LDA, TEMP + addq TEMP, A + + unpcklpd ALPHA, ALPHA + + movq BUFFER, XX + + movq M, %rax + sarq $3, %rax + jle .L02 + ALIGN_3 + +.L01: + movsd 0 * SIZE(X), %xmm1 + addq INCX, X + movhpd 0 * SIZE(X), %xmm1 + addq INCX, X + movsd 0 * SIZE(X), %xmm2 + addq INCX, X + movhpd 0 * SIZE(X), %xmm2 + addq INCX, X + movsd 0 * SIZE(X), %xmm3 + addq INCX, X + movhpd 0 * SIZE(X), %xmm3 + addq INCX, X + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + movhpd 0 * SIZE(X), %xmm4 + addq INCX, X + + mulpd ALPHA, %xmm1 + mulpd ALPHA, %xmm2 + mulpd ALPHA, %xmm3 + mulpd ALPHA, %xmm4 + + movapd %xmm1, 0 * SIZE(XX) + movapd %xmm2, 2 * SIZE(XX) + movapd %xmm3, 4 * SIZE(XX) + movapd %xmm4, 6 * SIZE(XX) + + addq $8 * SIZE, XX + decq %rax + jg .L01 + ALIGN_3 + +.L02: + movq M, %rax + andq $7, %rax + jle .L05 + ALIGN_3 + +.L03: + movsd 0 * SIZE(X), %xmm1 + addq INCX, X + + mulsd ALPHA, %xmm1 + + movlpd %xmm1, 0 * SIZE(XX) + + addq $1 * SIZE, XX + decq %rax + jg .L03 + ALIGN_3 + +.L05: + /* now we don't need original X */ + movq Y, NEW_Y + + addq $512, XX + andq $-512, XX + + cmpq $SIZE, INCY + je .L10 + + movq Y, YY + movq XX, NEW_Y + + movq M, %rax + sarq $3, %rax + jle .L07 + ALIGN_3 + +.L06: + movsd 0 * SIZE(YY), %xmm0 + addq INCY, YY + movhpd 0 * SIZE(YY), %xmm0 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm1 + addq INCY, YY + movhpd 0 * SIZE(YY), %xmm1 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm2 + addq INCY, YY + movhpd 0 * SIZE(YY), %xmm2 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm3 + addq INCY, YY + movhpd 0 * SIZE(YY), %xmm3 + addq INCY, YY + + movapd %xmm0, 0 * SIZE(XX) + movapd %xmm1, 2 * SIZE(XX) + movapd %xmm2, 4 * SIZE(XX) + movapd %xmm3, 6 * SIZE(XX) + + addq $8 * SIZE, XX + decq %rax + jg .L06 + ALIGN_3 + +.L07: + movq M, %rax + andq $7, %rax + jle .L10 + ALIGN_3 + +.L08: + movsd 0 * SIZE(YY), %xmm0 + addq INCY, YY + + movsd %xmm0, 0 * SIZE(XX) + + addq $1 * SIZE, XX + decq %rax + jg .L08 + ALIGN_3 + +.L10: + movq IS, I + addq $4, I + cmpq M, I + jg .L20 + ALIGN_3 + +.L11: + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + +#ifdef HAVE_SSE3 + movddup 0 * SIZE(NEW_X, IS, SIZE), atemp1 + movddup 1 * SIZE(NEW_X, IS, SIZE), atemp2 + movddup 2 * SIZE(NEW_X, IS, SIZE), atemp3 + movddup 3 * SIZE(NEW_X, IS, SIZE), atemp4 +#else + movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1 + movhpd 0 * SIZE(NEW_X, IS, SIZE), atemp1 + movsd 1 * SIZE(NEW_X, IS, SIZE), atemp2 + movhpd 1 * SIZE(NEW_X, IS, SIZE), atemp2 + movsd 2 * SIZE(NEW_X, IS, SIZE), atemp3 + movhpd 2 * SIZE(NEW_X, IS, SIZE), atemp3 + movsd 3 * SIZE(NEW_X, IS, SIZE), atemp4 + movhpd 3 * SIZE(NEW_X, IS, SIZE), atemp4 +#endif + + pxor xsum1, xsum1 + pxor xsum2, xsum2 + pxor xsum3, xsum3 + pxor xsum4, xsum4 + + movapd 0 * SIZE(NEW_X), xtemp1 + movapd 2 * SIZE(NEW_X), xtemp2 + + movsd 0 * SIZE(A1), a1 + movhpd 1 * SIZE(A1), a1 + movsd 2 * SIZE(A1), a2 + movhpd 3 * SIZE(A1), a2 + movsd 0 * SIZE(A1, LDA, 1), a3 + movhpd 1 * SIZE(A1, LDA, 1), a3 + + movsd 0 * SIZE(NEW_Y), yy1 + movhpd 1 * SIZE(NEW_Y), yy1 + movsd 2 * SIZE(NEW_Y), yy2 + movhpd 3 * SIZE(NEW_Y), yy2 + + movq NEW_X, XX + movq NEW_Y, YY + + movq IS, I + sarq $3, I + jle .L15 + ALIGN_3 + +.L12: + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + movsd 2 * SIZE(A1, LDA, 1), a1 + movhpd 3 * SIZE(A1, LDA, 1), a1 + + PREFETCH PREFETCHSIZE(A1) + + movapd xtemp2, xt1 + mulpd a2, xt1 + mulpd atemp1, a2 + addpd xt1, xsum1 + addpd a2, yy2 + movsd 0 * SIZE(A2), a2 + movhpd 1 * SIZE(A2), a2 + + movapd xtemp1, xt1 + mulpd a3, xt1 + mulpd atemp2, a3 + addpd xt1, xsum2 + addpd a3, yy1 + movsd 2 * SIZE(A2), a3 + movhpd 3 * SIZE(A2), a3 + +#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON) + PREFETCH PREFETCHSIZE(XX) +#endif + + movapd xtemp2, xt1 + mulpd a1, xt1 + mulpd atemp2, a1 + addpd xt1, xsum2 + addpd a1, yy2 + movsd 0 * SIZE(A2, LDA, 1), a1 + movhpd 1 * SIZE(A2, LDA, 1), a1 + + movapd xtemp1, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum3 + addpd a2, yy1 + movsd 2 * SIZE(A2, LDA, 1), a2 + movhpd 3 * SIZE(A2, LDA, 1), a2 + + PREFETCH PREFETCHSIZE(A1, LDA, 1) + + movapd xtemp2, xt1 + mulpd a3, xt1 + mulpd atemp3, a3 + addpd xt1, xsum3 + addpd a3, yy2 + movsd 4 * SIZE(A1), a3 + movhpd 5 * SIZE(A1), a3 + + movapd xtemp1, xt1 + movapd 4 * SIZE(XX), xtemp1 + mulpd a1, xt1 + mulpd atemp4, a1 + addpd xt1, xsum4 + addpd a1, yy1 + movsd 6 * SIZE(A1), a1 + movhpd 7 * SIZE(A1), a1 + + movapd xtemp2, xt1 + movapd 6 * SIZE(XX), xtemp2 + mulpd a2, xt1 + mulpd atemp4, a2 + addpd xt1, xsum4 + addpd a2, yy2 + movsd 4 * SIZE(A1, LDA, 1), a2 + movhpd 5 * SIZE(A1, LDA, 1), a2 + + movsd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhpd 5 * SIZE(YY), yy1 + + movsd yy2, 2 * SIZE(YY) + movhpd yy2, 3 * SIZE(YY) + movsd 6 * SIZE(YY), yy2 + movhpd 7 * SIZE(YY), yy2 + + movapd xtemp1, xt1 + mulpd a3, xt1 + mulpd atemp1, a3 + addpd xt1, xsum1 + addpd a3, yy1 + movsd 6 * SIZE(A1, LDA, 1), a3 + movhpd 7 * SIZE(A1, LDA, 1), a3 + + PREFETCH PREFETCHSIZE(A2) + + movapd xtemp2, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy2 + movsd 4 * SIZE(A2), a1 + movhpd 5 * SIZE(A2), a1 + + movapd xtemp1, xt1 + mulpd a2, xt1 + mulpd atemp2, a2 + addpd xt1, xsum2 + addpd a2, yy1 + movsd 6 * SIZE(A2), a2 + movhpd 7 * SIZE(A2), a2 + +#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON) + PREFETCHW PREFETCHSIZE(YY) +#endif + + movapd xtemp2, xt1 + mulpd a3, xt1 + mulpd atemp2, a3 + addpd xt1, xsum2 + addpd a3, yy2 + movsd 4 * SIZE(A2, LDA, 1), a3 + movhpd 5 * SIZE(A2, LDA, 1), a3 + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp3, a1 + addpd xt1, xsum3 + addpd a1, yy1 + movsd 6 * SIZE(A2, LDA, 1), a1 + movhpd 7 * SIZE(A2, LDA, 1), a1 + + PREFETCH PREFETCHSIZE(A2, LDA, 1) + + movapd xtemp2, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum3 + addpd a2, yy2 + movsd 10 * SIZE(A1), a2 + movhpd 11 * SIZE(A1), a2 + + movapd xtemp1, xt1 + movapd 8 * SIZE(XX), xtemp1 + mulpd a3, xt1 + mulpd atemp4, a3 + addpd xt1, xsum4 + addpd a3, yy1 + movsd 8 * SIZE(A1, LDA, 1), a3 + movhpd 9 * SIZE(A1, LDA, 1), a3 + + movapd xtemp2, xt1 + movapd 10 * SIZE(XX), xtemp2 + mulpd a1, xt1 + mulpd atemp4, a1 + addpd xt1, xsum4 + addpd a1, yy2 + movsd 8 * SIZE(A1), a1 + movhpd 9 * SIZE(A1), a1 + + movsd yy1, 4 * SIZE(YY) + movhpd yy1, 5 * SIZE(YY) + movsd 8 * SIZE(YY), yy1 + movhpd 9 * SIZE(YY), yy1 + + movsd yy2, 6 * SIZE(YY) + movhpd yy2, 7 * SIZE(YY) + movsd 10 * SIZE(YY), yy2 + movhpd 11 * SIZE(YY), yy2 + + addq $8 * SIZE, XX + addq $8 * SIZE, YY + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + + decq I + jg .L12 + ALIGN_3 + +.L15: + testq $4, IS + jle .L18 + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + movsd 2 * SIZE(A1, LDA, 1), a1 + movhpd 3 * SIZE(A1, LDA, 1), a1 + + movapd xtemp2, xt1 + mulpd a2, xt1 + mulpd atemp1, a2 + addpd xt1, xsum1 + addpd a2, yy2 + movsd 0 * SIZE(A2), a2 + movhpd 1 * SIZE(A2), a2 + + movapd xtemp1, xt1 + mulpd a3, xt1 + mulpd atemp2, a3 + addpd xt1, xsum2 + addpd a3, yy1 + movsd 2 * SIZE(A2), a3 + movhpd 3 * SIZE(A2), a3 + + movapd xtemp2, xt1 + mulpd a1, xt1 + mulpd atemp2, a1 + addpd xt1, xsum2 + addpd a1, yy2 + movsd 0 * SIZE(A2, LDA, 1), a1 + movhpd 1 * SIZE(A2, LDA, 1), a1 + + movapd xtemp1, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum3 + addpd a2, yy1 + movsd 2 * SIZE(A2, LDA, 1), a2 + movhpd 3 * SIZE(A2, LDA, 1), a2 + + movapd xtemp2, xt1 + mulpd a3, xt1 + mulpd atemp3, a3 + addpd xt1, xsum3 + addpd a3, yy2 + + movapd xtemp1, xt1 + movapd 4 * SIZE(XX), xtemp1 + mulpd a1, xt1 + mulpd atemp4, a1 + addpd xt1, xsum4 + addpd a1, yy1 + + movapd xtemp2, xt1 + movapd 6 * SIZE(XX), xtemp2 + mulpd a2, xt1 + mulpd atemp4, a2 + addpd xt1, xsum4 + addpd a2, yy2 + + movsd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhpd 5 * SIZE(YY), yy1 + + movsd yy2, 2 * SIZE(YY) + movhpd yy2, 3 * SIZE(YY) + movsd 6 * SIZE(YY), yy2 + movhpd 7 * SIZE(YY), yy2 + + addq $4 * SIZE, XX + addq $4 * SIZE, YY + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L18: + unpckhpd atemp2, atemp1 + unpckhpd atemp4, atemp3 + + movsd 0 * SIZE(A1), a1 + movhpd 0 * SIZE(A1, LDA, 1), a1 + mulpd atemp1, a1 + addpd a1, xsum1 + + movsd 0 * SIZE(A1, LDA, 1), a1 + movhpd 1 * SIZE(A1, LDA, 1), a1 + mulpd atemp1, a1 + addpd a1, xsum2 + + movsd 0 * SIZE(A2), a1 + movhpd 1 * SIZE(A2), a1 + mulpd atemp1, a1 + addpd a1, xsum3 + + movsd 0 * SIZE(A2, LDA, 1), a1 + movhpd 1 * SIZE(A2, LDA, 1), a1 + mulpd atemp1, a1 + addpd a1, xsum4 + + movsd 0 * SIZE(A2), a1 + movhpd 0 * SIZE(A2, LDA, 1), a1 + mulpd atemp3, a1 + addpd a1, xsum1 + + movsd 1 * SIZE(A2), a1 + movhpd 1 * SIZE(A2, LDA, 1), a1 + mulpd atemp3, a1 + addpd a1, xsum2 + + movsd 2 * SIZE(A2), a1 + movhpd 2 * SIZE(A2, LDA, 1), a1 + mulpd atemp3, a1 + addpd a1, xsum3 + + movsd 2 * SIZE(A2, LDA, 1), a1 + movhpd 3 * SIZE(A2, LDA, 1), a1 + mulpd atemp3, a1 + addpd a1, xsum4 + +#ifndef HAVE_SSE3 + movapd xsum1, atemp1 + movapd xsum3, atemp3 + + unpcklpd xsum2, xsum1 + unpcklpd xsum4, xsum3 + + unpckhpd xsum2, atemp1 + unpckhpd xsum4, atemp3 + + addpd atemp1, xsum1 + addpd atemp3, xsum3 +#else + haddpd xsum2, xsum1 + haddpd xsum4, xsum3 +#endif + + addpd xsum1, yy1 + addpd xsum3, yy2 + + movsd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movsd yy2, 2 * SIZE(YY) + movhpd yy2, 3 * SIZE(YY) + + addq $4, IS + + movq IS, I + addq $4, I + cmpq M, I + jle .L11 + ALIGN_3 + +.L20: + testq $2, M + je .L30 + ALIGN_3 + +.L21: + movq A, A1 + leaq (A, LDA, 2), A + +#ifdef HAVE_SSE3 + movddup 0 * SIZE(NEW_X, IS, SIZE), atemp1 + movddup 1 * SIZE(NEW_X, IS, SIZE), atemp2 +#else + movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1 + movhpd 0 * SIZE(NEW_X, IS, SIZE), atemp1 + movsd 1 * SIZE(NEW_X, IS, SIZE), atemp2 + movhpd 1 * SIZE(NEW_X, IS, SIZE), atemp2 +#endif + + pxor xsum1, xsum1 + pxor xsum2, xsum2 + + movapd 0 * SIZE(NEW_X), xtemp1 + + movsd 0 * SIZE(NEW_Y), yy1 + movhpd 1 * SIZE(NEW_Y), yy1 + + movsd 0 * SIZE(A1), a1 + movhpd 1 * SIZE(A1), a1 + movsd 0 * SIZE(A1, LDA, 1), a2 + movhpd 1 * SIZE(A1, LDA, 1), a2 + + movq NEW_X, XX + movq NEW_Y, YY + + movq IS, I + sarq $1, I + jle .L28 + ALIGN_3 + +.L22: + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + movsd 2 * SIZE(A1), a1 + movhpd 3 * SIZE(A1), a1 + + movapd xtemp1, xt1 + movapd 2 * SIZE(XX), xtemp1 + mulpd a2, xt1 + mulpd atemp2, a2 + addpd xt1, xsum2 + addpd a2, yy1 + movsd 2 * SIZE(A1, LDA, 1), a2 + movhpd 3 * SIZE(A1, LDA, 1), a2 + + movsd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movsd 2 * SIZE(YY), yy1 + movhpd 3 * SIZE(YY), yy1 + + addq $2 * SIZE, XX + addq $2 * SIZE, YY + addq $2 * SIZE, A1 + + decq I + jg .L22 + ALIGN_3 + +.L28: + unpckhpd atemp2, atemp1 + + movsd 0 * SIZE(A1), a1 + movhpd 0 * SIZE(A1, LDA, 1), a1 + mulpd atemp1, a1 + addpd a1, xsum1 + + movsd 0 * SIZE(A1, LDA, 1), a1 + movhpd 1 * SIZE(A1, LDA, 1), a1 + mulpd atemp1, a1 + addpd a1, xsum2 + +#ifndef HAVE_SSE3 + movapd xsum1, atemp1 + + unpcklpd xsum2, xsum1 + unpckhpd xsum2, atemp1 + + addpd atemp1, xsum1 +#else + haddpd xsum2, xsum1 +#endif + + addpd xsum1, yy1 + + movsd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + + addq $2, IS + ALIGN_3 + +.L30: + testq $1, M + je .L990 + ALIGN_3 + +.L31: + movq A, A1 + +#ifdef HAVE_SSE3 + movddup 0 * SIZE(NEW_X, IS, SIZE), atemp1 +#else + movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1 + movhpd 0 * SIZE(NEW_X, IS, SIZE), atemp1 +#endif + + pxor xsum1, xsum1 + + movsd 0 * SIZE(NEW_X), xtemp1 + movsd 0 * SIZE(NEW_Y), yy1 + movsd 0 * SIZE(A1), a1 + + movq NEW_X, XX + movq NEW_Y, YY + + movq IS, I + testq I, I + jle .L38 + ALIGN_3 + +.L32: + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + movsd 1 * SIZE(A1), a1 + + movsd 1 * SIZE(XX), xtemp1 + + movsd yy1, 0 * SIZE(YY) + movsd 1 * SIZE(YY), yy1 + + addq $1 * SIZE, XX + addq $1 * SIZE, YY + addq $1 * SIZE, A1 + + decq I + jg .L32 + ALIGN_3 + +.L38: + movsd 0 * SIZE(A1), a1 + mulsd atemp1, a1 + addsd a1, xsum1 + + addsd xsum1, yy1 + + movsd yy1, 0 * SIZE(YY) + ALIGN_3 + +.L990: + cmpq $SIZE, INCY + je .L999 + + movq M, %rax + sarq $3, %rax + jle .L997 + ALIGN_3 + +.L996: + movapd 0 * SIZE(NEW_Y), %xmm0 + movapd 2 * SIZE(NEW_Y), %xmm1 + movapd 4 * SIZE(NEW_Y), %xmm2 + movapd 6 * SIZE(NEW_Y), %xmm3 + + movsd %xmm0, 0 * SIZE(Y) + addq INCY, Y + movhpd %xmm0, 0 * SIZE(Y) + addq INCY, Y + movsd %xmm1, 0 * SIZE(Y) + addq INCY, Y + movhpd %xmm1, 0 * SIZE(Y) + addq INCY, Y + movsd %xmm2, 0 * SIZE(Y) + addq INCY, Y + movhpd %xmm2, 0 * SIZE(Y) + addq INCY, Y + movsd %xmm3, 0 * SIZE(Y) + addq INCY, Y + movhpd %xmm3, 0 * SIZE(Y) + addq INCY, Y + + addq $8 * SIZE, NEW_Y + decq %rax + jg .L996 + ALIGN_3 + +.L997: + movq M, %rax + andq $7, %rax + jle .L999 + ALIGN_3 + +.L998: + movsd 0 * SIZE(NEW_Y), %xmm0 + + movsd %xmm0, 0 * SIZE(Y) + addq INCY, Y + + addq $1 * SIZE, NEW_Y + + decq %rax + jg .L998 + ALIGN_3 + + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LN_2x8_nehalem.S b/kernel/x86_64/trsm_kernel_LN_2x8_nehalem.S new file mode 100644 index 0000000..d70bede --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LN_2x8_nehalem.S @@ -0,0 +1,3075 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCHSIZE (8 * 1 - 4) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movaps %xmm3, %xmm0 +#endif + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + leaq (, LDC, SIZE), LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $3, J + NOBRANCH + jle .L30 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 8), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 4), CO2 +#ifndef RT + leaq (C, LDC, 8), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 3, %rax + leaq (B, %rax), BB + + testq $1, M + BRANCH + jle .L20 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 0 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps 2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps 4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps 6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps 10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps 12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps 14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 16 * SIZE(BO), %xmm1 + + subq $ -4 * SIZE, AO + subq $-32 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm0 + movapd -14 * SIZE(BO), %xmm1 + movapd -12 * SIZE(BO), %xmm2 + movapd -10 * SIZE(BO), %xmm3 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + subpd %xmm10, %xmm2 + subpd %xmm11, %xmm3 + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 +#endif + +#if defined(RN) || defined(RT) + pshufd $0xe, %xmm3, %xmm7 + movaps %xmm3, %xmm6 + pshufd $0xe, %xmm2, %xmm5 + movaps %xmm2, %xmm4 + pshufd $0xe, %xmm1, %xmm3 + movaps %xmm1, %xmm2 + pshufd $0xe, %xmm0, %xmm1 +#endif + +#ifdef RN + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + movsd -15 * SIZE(BO), %xmm9 + mulsd %xmm0, %xmm9 + subsd %xmm9, %xmm1 + movsd -14 * SIZE(BO), %xmm10 + mulsd %xmm0, %xmm10 + subsd %xmm10, %xmm2 + movsd -13 * SIZE(BO), %xmm11 + mulsd %xmm0, %xmm11 + subsd %xmm11, %xmm3 + movsd -12 * SIZE(BO), %xmm12 + mulsd %xmm0, %xmm12 + subsd %xmm12, %xmm4 + movsd -11 * SIZE(BO), %xmm13 + mulsd %xmm0, %xmm13 + subsd %xmm13, %xmm5 + movsd -10 * SIZE(BO), %xmm14 + mulsd %xmm0, %xmm14 + subsd %xmm14, %xmm6 + movsd -9 * SIZE(BO), %xmm15 + mulsd %xmm0, %xmm15 + subsd %xmm15, %xmm7 + + movsd -7 * SIZE(BO), %xmm9 + mulsd %xmm9, %xmm1 + movsd -6 * SIZE(BO), %xmm10 + mulsd %xmm1, %xmm10 + subsd %xmm10, %xmm2 + movsd -5 * SIZE(BO), %xmm11 + mulsd %xmm1, %xmm11 + subsd %xmm11, %xmm3 + movsd -4 * SIZE(BO), %xmm12 + mulsd %xmm1, %xmm12 + subsd %xmm12, %xmm4 + movsd -3 * SIZE(BO), %xmm13 + mulsd %xmm1, %xmm13 + subsd %xmm13, %xmm5 + movsd -2 * SIZE(BO), %xmm14 + mulsd %xmm1, %xmm14 + subsd %xmm14, %xmm6 + movsd -1 * SIZE(BO), %xmm15 + mulsd %xmm1, %xmm15 + subsd %xmm15, %xmm7 + + movsd 2 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm2 + movsd 3 * SIZE(BO), %xmm11 + mulsd %xmm2, %xmm11 + subsd %xmm11, %xmm3 + movsd 4 * SIZE(BO), %xmm12 + mulsd %xmm2, %xmm12 + subsd %xmm12, %xmm4 + movsd 5 * SIZE(BO), %xmm13 + mulsd %xmm2, %xmm13 + subsd %xmm13, %xmm5 + movsd 6 * SIZE(BO), %xmm14 + mulsd %xmm2, %xmm14 + subsd %xmm14, %xmm6 + movsd 7 * SIZE(BO), %xmm15 + mulsd %xmm2, %xmm15 + subsd %xmm15, %xmm7 + + movsd 11 * SIZE(BO), %xmm11 + mulsd %xmm11, %xmm3 + movsd 12 * SIZE(BO), %xmm12 + mulsd %xmm3, %xmm12 + subsd %xmm12, %xmm4 + movsd 13 * SIZE(BO), %xmm13 + mulsd %xmm3, %xmm13 + subsd %xmm13, %xmm5 + movsd 14 * SIZE(BO), %xmm14 + mulsd %xmm3, %xmm14 + subsd %xmm14, %xmm6 + movsd 15 * SIZE(BO), %xmm15 + mulsd %xmm3, %xmm15 + subsd %xmm15, %xmm7 + + movsd 20 * SIZE(BO), %xmm12 + mulsd %xmm12, %xmm4 + movsd 21 * SIZE(BO), %xmm13 + mulsd %xmm4, %xmm13 + subsd %xmm13, %xmm5 + movsd 22 * SIZE(BO), %xmm14 + mulsd %xmm4, %xmm14 + subsd %xmm14, %xmm6 + movsd 23 * SIZE(BO), %xmm15 + mulsd %xmm4, %xmm15 + subsd %xmm15, %xmm7 + + movsd 29 * SIZE(BO), %xmm13 + mulsd %xmm13, %xmm5 + movsd 30 * SIZE(BO), %xmm14 + mulsd %xmm5, %xmm14 + subsd %xmm14, %xmm6 + movsd 31 * SIZE(BO), %xmm15 + mulsd %xmm5, %xmm15 + subsd %xmm15, %xmm7 + + movsd 38 * SIZE(BO), %xmm14 + mulsd %xmm14, %xmm6 + movsd 39 * SIZE(BO), %xmm15 + mulsd %xmm6, %xmm15 + subsd %xmm15, %xmm7 + + movsd 47 * SIZE(BO), %xmm15 + mulsd %xmm15, %xmm7 +#endif + +#ifdef RT + movsd 47 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm7 + movsd 46 * SIZE(BO), %xmm9 + mulsd %xmm7, %xmm9 + subsd %xmm9, %xmm6 + movsd 45 * SIZE(BO), %xmm10 + mulsd %xmm7, %xmm10 + subsd %xmm10, %xmm5 + movsd 44 * SIZE(BO), %xmm11 + mulsd %xmm7, %xmm11 + subsd %xmm11, %xmm4 + movsd 43 * SIZE(BO), %xmm12 + mulsd %xmm7, %xmm12 + subsd %xmm12, %xmm3 + movsd 42 * SIZE(BO), %xmm13 + mulsd %xmm7, %xmm13 + subsd %xmm13, %xmm2 + movsd 41 * SIZE(BO), %xmm14 + mulsd %xmm7, %xmm14 + subsd %xmm14, %xmm1 + movsd 40 * SIZE(BO), %xmm15 + mulsd %xmm7, %xmm15 + subsd %xmm15, %xmm0 + + movsd 38 * SIZE(BO), %xmm9 + mulsd %xmm9, %xmm6 + movsd 37 * SIZE(BO), %xmm10 + mulsd %xmm6, %xmm10 + subsd %xmm10, %xmm5 + movsd 36 * SIZE(BO), %xmm11 + mulsd %xmm6, %xmm11 + subsd %xmm11, %xmm4 + movsd 35 * SIZE(BO), %xmm12 + mulsd %xmm6, %xmm12 + subsd %xmm12, %xmm3 + movsd 34 * SIZE(BO), %xmm13 + mulsd %xmm6, %xmm13 + subsd %xmm13, %xmm2 + movsd 33 * SIZE(BO), %xmm14 + mulsd %xmm6, %xmm14 + subsd %xmm14, %xmm1 + movsd 32 * SIZE(BO), %xmm15 + mulsd %xmm6, %xmm15 + subsd %xmm15, %xmm0 + + movsd 29 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm5 + movsd 28 * SIZE(BO), %xmm11 + mulsd %xmm5, %xmm11 + subsd %xmm11, %xmm4 + movsd 27 * SIZE(BO), %xmm12 + mulsd %xmm5, %xmm12 + subsd %xmm12, %xmm3 + movsd 26 * SIZE(BO), %xmm13 + mulsd %xmm5, %xmm13 + subsd %xmm13, %xmm2 + movsd 25 * SIZE(BO), %xmm14 + mulsd %xmm5, %xmm14 + subsd %xmm14, %xmm1 + movsd 24 * SIZE(BO), %xmm15 + mulsd %xmm5, %xmm15 + subsd %xmm15, %xmm0 + + movsd 20 * SIZE(BO), %xmm11 + mulsd %xmm11, %xmm4 + movsd 19 * SIZE(BO), %xmm12 + mulsd %xmm4, %xmm12 + subsd %xmm12, %xmm3 + movsd 18 * SIZE(BO), %xmm13 + mulsd %xmm4, %xmm13 + subsd %xmm13, %xmm2 + movsd 17 * SIZE(BO), %xmm14 + mulsd %xmm4, %xmm14 + subsd %xmm14, %xmm1 + movsd 16 * SIZE(BO), %xmm15 + mulsd %xmm4, %xmm15 + subsd %xmm15, %xmm0 + + movsd 11 * SIZE(BO), %xmm12 + mulsd %xmm12, %xmm3 + movsd 10 * SIZE(BO), %xmm13 + mulsd %xmm3, %xmm13 + subsd %xmm13, %xmm2 + movsd 9 * SIZE(BO), %xmm14 + mulsd %xmm3, %xmm14 + subsd %xmm14, %xmm1 + movsd 8 * SIZE(BO), %xmm15 + mulsd %xmm3, %xmm15 + subsd %xmm15, %xmm0 + + movsd 2 * SIZE(BO), %xmm13 + mulsd %xmm13, %xmm2 + movsd 1 * SIZE(BO), %xmm14 + mulsd %xmm2, %xmm14 + subsd %xmm14, %xmm1 + movsd 0 * SIZE(BO), %xmm15 + mulsd %xmm2, %xmm15 + subsd %xmm15, %xmm0 + + movsd -7 * SIZE(BO), %xmm14 + mulsd %xmm14, %xmm1 + movsd -8 * SIZE(BO), %xmm15 + mulsd %xmm1, %xmm15 + subsd %xmm15, %xmm0 + + movsd -16 * SIZE(BO), %xmm15 + mulsd %xmm15, %xmm0 +#endif + +#if defined(RN) || defined(RT) + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm1 + unpcklpd %xmm3, %xmm1 + movaps %xmm4, %xmm2 + unpcklpd %xmm5, %xmm2 + movaps %xmm6, %xmm3 + unpcklpd %xmm7, %xmm3 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO1, LDC, 1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 2) + movhps %xmm1, 0 * SIZE(CO1, %rax, 1) + movsd %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 0 * SIZE(CO2, LDC, 1) + movsd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhps %xmm3, 0 * SIZE(CO2, %rax, 1) + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BO) + movapd %xmm1, -14 * SIZE(BO) + movapd %xmm2, -12 * SIZE(BO) + movapd %xmm3, -10 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L20: + movq M, I + sarq $1, I + NOBRANCH + jle .L29 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + prefetcht0 -16 * SIZE(BB) + subq $-8 * SIZE, BB + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + leaq (LDC, LDC, 2), %rax + + xorps %xmm8, %xmm8 + prefetcht0 -2 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 -3 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht0 -2 * SIZE(CO1, LDC, 2) + xorps %xmm11, %xmm11 + prefetcht0 -3 * SIZE(CO1, %rax, 1) + + xorps %xmm12, %xmm12 + prefetcht0 -2 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht0 -3 * SIZE(CO2, LDC, 1) + xorps %xmm14, %xmm14 + prefetcht0 -2 * SIZE(CO2, LDC, 2) + xorps %xmm15, %xmm15 + prefetcht0 -3 * SIZE(CO2, %rax, 1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm8 + movaps -12 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + movaps -14 * SIZE(AO), %xmm5 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm14 + movaps -6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm12 + movaps 0 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps 2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm8 + movaps 4 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps 6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm12 + movaps 8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + movaps -10 * SIZE(AO), %xmm5 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm14 + movaps 10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm8 + movaps 12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm10 + movaps 14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addq $32 * SIZE, BO + subq $-8 * SIZE, AO + decq %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addpd %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 8), BO +#endif + + addpd %xmm1, %xmm12 + addpd %xmm2, %xmm13 + addpd %xmm3, %xmm14 + addpd %xmm4, %xmm15 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm0 + shufpd $0, %xmm9, %xmm8 + shufpd $3, %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + shufpd $0, %xmm11, %xmm10 + shufpd $3, %xmm0, %xmm11 + + movaps %xmm12, %xmm0 + shufpd $0, %xmm13, %xmm12 + shufpd $3, %xmm0, %xmm13 + + movaps %xmm14, %xmm0 + shufpd $0, %xmm15, %xmm14 + shufpd $3, %xmm0, %xmm15 + + movapd -16 * SIZE(BO), %xmm0 + movapd -14 * SIZE(BO), %xmm2 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm6 + movapd -8 * SIZE(BO), %xmm1 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm5 + movapd -2 * SIZE(BO), %xmm7 +#else + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + shufpd $2, %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + shufpd $2, %xmm11, %xmm10 + shufpd $2, %xmm0, %xmm11 + + movaps %xmm12, %xmm0 + shufpd $2, %xmm13, %xmm12 + shufpd $2, %xmm0, %xmm13 + + movaps %xmm14, %xmm0 + shufpd $2, %xmm15, %xmm14 + shufpd $2, %xmm0, %xmm15 + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + movapd -8 * SIZE(AO), %xmm4 + movapd -6 * SIZE(AO), %xmm5 + movapd -4 * SIZE(AO), %xmm6 + movapd -2 * SIZE(AO), %xmm7 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + subpd %xmm10, %xmm2 + subpd %xmm11, %xmm3 + subpd %xmm12, %xmm4 + subpd %xmm13, %xmm5 + subpd %xmm14, %xmm6 + subpd %xmm15, %xmm7 + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 + + movddup -14 * SIZE(AO), %xmm12 + movapd %xmm12, %xmm13 + movapd %xmm12, %xmm14 + movapd %xmm12, %xmm15 + + mulpd %xmm1, %xmm12 + mulpd %xmm3, %xmm13 + mulpd %xmm5, %xmm14 + mulpd %xmm7, %xmm15 + + subpd %xmm12, %xmm0 + subpd %xmm13, %xmm2 + subpd %xmm14, %xmm4 + subpd %xmm15, %xmm6 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm6 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm6 + + movddup -15 * SIZE(AO), %xmm12 + movapd %xmm12, %xmm13 + movapd %xmm12, %xmm14 + movapd %xmm12, %xmm15 + + mulpd %xmm0, %xmm12 + mulpd %xmm2, %xmm13 + mulpd %xmm4, %xmm14 + mulpd %xmm6, %xmm15 + + subpd %xmm12, %xmm1 + subpd %xmm13, %xmm3 + subpd %xmm14, %xmm5 + subpd %xmm15, %xmm7 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm1 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm2 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm3 + movddup -12 * SIZE(BO), %xmm12 + mulpd %xmm0, %xmm12 + subpd %xmm12, %xmm4 + movddup -11 * SIZE(BO), %xmm13 + mulpd %xmm0, %xmm13 + subpd %xmm13, %xmm5 + movddup -10 * SIZE(BO), %xmm14 + mulpd %xmm0, %xmm14 + subpd %xmm14, %xmm6 + movddup -9 * SIZE(BO), %xmm15 + mulpd %xmm0, %xmm15 + subpd %xmm15, %xmm7 + + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm1 + movddup -6 * SIZE(BO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm2 + movddup -5 * SIZE(BO), %xmm11 + mulpd %xmm1, %xmm11 + subpd %xmm11, %xmm3 + movddup -4 * SIZE(BO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm4 + movddup -3 * SIZE(BO), %xmm13 + mulpd %xmm1, %xmm13 + subpd %xmm13, %xmm5 + movddup -2 * SIZE(BO), %xmm14 + mulpd %xmm1, %xmm14 + subpd %xmm14, %xmm6 + movddup -1 * SIZE(BO), %xmm15 + mulpd %xmm1, %xmm15 + subpd %xmm15, %xmm7 + + movddup 2 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm2 + movddup 3 * SIZE(BO), %xmm11 + mulpd %xmm2, %xmm11 + subpd %xmm11, %xmm3 + movddup 4 * SIZE(BO), %xmm12 + mulpd %xmm2, %xmm12 + subpd %xmm12, %xmm4 + movddup 5 * SIZE(BO), %xmm13 + mulpd %xmm2, %xmm13 + subpd %xmm13, %xmm5 + movddup 6 * SIZE(BO), %xmm14 + mulpd %xmm2, %xmm14 + subpd %xmm14, %xmm6 + movddup 7 * SIZE(BO), %xmm15 + mulpd %xmm2, %xmm15 + subpd %xmm15, %xmm7 + + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm12 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm4 + movddup 13 * SIZE(BO), %xmm13 + mulpd %xmm3, %xmm13 + subpd %xmm13, %xmm5 + movddup 14 * SIZE(BO), %xmm14 + mulpd %xmm3, %xmm14 + subpd %xmm14, %xmm6 + movddup 15 * SIZE(BO), %xmm15 + mulpd %xmm3, %xmm15 + subpd %xmm15, %xmm7 + + movddup 20 * SIZE(BO), %xmm12 + mulpd %xmm12, %xmm4 + movddup 21 * SIZE(BO), %xmm13 + mulpd %xmm4, %xmm13 + subpd %xmm13, %xmm5 + movddup 22 * SIZE(BO), %xmm14 + mulpd %xmm4, %xmm14 + subpd %xmm14, %xmm6 + movddup 23 * SIZE(BO), %xmm15 + mulpd %xmm4, %xmm15 + subpd %xmm15, %xmm7 + + movddup 29 * SIZE(BO), %xmm13 + mulpd %xmm13, %xmm5 + movddup 30 * SIZE(BO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm6 + movddup 31 * SIZE(BO), %xmm15 + mulpd %xmm5, %xmm15 + subpd %xmm15, %xmm7 + + movddup 38 * SIZE(BO), %xmm14 + mulpd %xmm14, %xmm6 + movddup 39 * SIZE(BO), %xmm15 + mulpd %xmm6, %xmm15 + subpd %xmm15, %xmm7 + + movddup 47 * SIZE(BO), %xmm15 + mulpd %xmm15, %xmm7 +#endif + +#ifdef RT + movddup 47 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm7 + movddup 46 * SIZE(BO), %xmm9 + mulpd %xmm7, %xmm9 + subpd %xmm9, %xmm6 + movddup 45 * SIZE(BO), %xmm10 + mulpd %xmm7, %xmm10 + subpd %xmm10, %xmm5 + movddup 44 * SIZE(BO), %xmm11 + mulpd %xmm7, %xmm11 + subpd %xmm11, %xmm4 + movddup 43 * SIZE(BO), %xmm12 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + movddup 42 * SIZE(BO), %xmm13 + mulpd %xmm7, %xmm13 + subpd %xmm13, %xmm2 + movddup 41 * SIZE(BO), %xmm14 + mulpd %xmm7, %xmm14 + subpd %xmm14, %xmm1 + movddup 40 * SIZE(BO), %xmm15 + mulpd %xmm7, %xmm15 + subpd %xmm15, %xmm0 + + movddup 38 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm6 + movddup 37 * SIZE(BO), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm5 + movddup 36 * SIZE(BO), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm4 + movddup 35 * SIZE(BO), %xmm12 + mulpd %xmm6, %xmm12 + subpd %xmm12, %xmm3 + movddup 34 * SIZE(BO), %xmm13 + mulpd %xmm6, %xmm13 + subpd %xmm13, %xmm2 + movddup 33 * SIZE(BO), %xmm14 + mulpd %xmm6, %xmm14 + subpd %xmm14, %xmm1 + movddup 32 * SIZE(BO), %xmm15 + mulpd %xmm6, %xmm15 + subpd %xmm15, %xmm0 + + movddup 29 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm5 + movddup 28 * SIZE(BO), %xmm11 + mulpd %xmm5, %xmm11 + subpd %xmm11, %xmm4 + movddup 27 * SIZE(BO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm3 + movddup 26 * SIZE(BO), %xmm13 + mulpd %xmm5, %xmm13 + subpd %xmm13, %xmm2 + movddup 25 * SIZE(BO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm1 + movddup 24 * SIZE(BO), %xmm15 + mulpd %xmm5, %xmm15 + subpd %xmm15, %xmm0 + + movddup 20 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm4 + movddup 19 * SIZE(BO), %xmm12 + mulpd %xmm4, %xmm12 + subpd %xmm12, %xmm3 + movddup 18 * SIZE(BO), %xmm13 + mulpd %xmm4, %xmm13 + subpd %xmm13, %xmm2 + movddup 17 * SIZE(BO), %xmm14 + mulpd %xmm4, %xmm14 + subpd %xmm14, %xmm1 + movddup 16 * SIZE(BO), %xmm15 + mulpd %xmm4, %xmm15 + subpd %xmm15, %xmm0 + + movddup 11 * SIZE(BO), %xmm12 + mulpd %xmm12, %xmm3 + movddup 10 * SIZE(BO), %xmm13 + mulpd %xmm3, %xmm13 + subpd %xmm13, %xmm2 + movddup 9 * SIZE(BO), %xmm14 + mulpd %xmm3, %xmm14 + subpd %xmm14, %xmm1 + movddup 8 * SIZE(BO), %xmm15 + mulpd %xmm3, %xmm15 + subpd %xmm15, %xmm0 + + movddup 2 * SIZE(BO), %xmm13 + mulpd %xmm13, %xmm2 + movddup 1 * SIZE(BO), %xmm14 + mulpd %xmm2, %xmm14 + subpd %xmm14, %xmm1 + movddup 0 * SIZE(BO), %xmm15 + mulpd %xmm2, %xmm15 + subpd %xmm15, %xmm0 + + movddup -7 * SIZE(BO), %xmm14 + mulpd %xmm14, %xmm1 + movddup -8 * SIZE(BO), %xmm15 + mulpd %xmm1, %xmm15 + subpd %xmm15, %xmm0 + + movddup -16 * SIZE(BO), %xmm15 + mulpd %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BO) + movapd %xmm2, -14 * SIZE(BO) + movapd %xmm4, -12 * SIZE(BO) + movapd %xmm6, -10 * SIZE(BO) + movapd %xmm1, -8 * SIZE(BO) + movapd %xmm3, -6 * SIZE(BO) + movapd %xmm5, -4 * SIZE(BO) + movapd %xmm7, -2 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) + movapd %xmm4, -8 * SIZE(AO) + movapd %xmm5 , -6 * SIZE(AO) + movapd %xmm6, -4 * SIZE(AO) + movapd %xmm7, -2 * SIZE(AO) +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 1 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movsd %xmm3, 1 * SIZE(CO1, LDC, 2) + movhps %xmm2, 0 * SIZE(CO1, %rax, 1) + movhps %xmm3, 1 * SIZE(CO1, %rax, 1) + + movsd %xmm4, 0 * SIZE(CO2) + movsd %xmm5, 1 * SIZE(CO2) + movhps %xmm4, 0 * SIZE(CO2, LDC, 1) + movhps %xmm5, 1 * SIZE(CO2, LDC, 1) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movsd %xmm7, 1 * SIZE(CO2, LDC, 2) + movhps %xmm6, 0 * SIZE(CO2, %rax, 1) + movhps %xmm7, 1 * SIZE(CO2, %rax, 1) +#else + movups %xmm0, 0 * SIZE(CO1) + movups %xmm1, 0 * SIZE(CO1, LDC, 1) + movups %xmm2, 0 * SIZE(CO1, LDC, 2) + movups %xmm3, 0 * SIZE(CO1, %rax, 1) + movups %xmm4, 0 * SIZE(CO2) + movups %xmm5, 0 * SIZE(CO2, LDC, 1) + movups %xmm6, 0 * SIZE(CO2, LDC, 2) + movups %xmm7, 0 * SIZE(CO2, %rax, 1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L11 + ALIGN_4 + +.L29: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 8), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $8, KK +#endif + +#ifdef RT + subq $8, KK +#endif + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L30: + testq $4, N + jle .L50 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + BRANCH + jle .L40 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -4 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 0 * SIZE(BO), %xmm1 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_4 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm0 + movapd -14 * SIZE(BO), %xmm1 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#if defined(RN) || defined(RT) + pshufd $0xe, %xmm1, %xmm3 + movaps %xmm1, %xmm2 + pshufd $0xe, %xmm0, %xmm1 +#endif + +#ifdef RN + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + movsd -15 * SIZE(BO), %xmm9 + mulsd %xmm0, %xmm9 + subsd %xmm9, %xmm1 + movsd -14 * SIZE(BO), %xmm10 + mulsd %xmm0, %xmm10 + subsd %xmm10, %xmm2 + movsd -13 * SIZE(BO), %xmm11 + mulsd %xmm0, %xmm11 + subsd %xmm11, %xmm3 + + movsd -11 * SIZE(BO), %xmm9 + mulsd %xmm9, %xmm1 + movsd -10 * SIZE(BO), %xmm10 + mulsd %xmm1, %xmm10 + subsd %xmm10, %xmm2 + movsd -9 * SIZE(BO), %xmm11 + mulsd %xmm1, %xmm11 + subsd %xmm11, %xmm3 + + movsd -6 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm2 + movsd -5 * SIZE(BO), %xmm11 + mulsd %xmm2, %xmm11 + subsd %xmm11, %xmm3 + + movsd -1 * SIZE(BO), %xmm11 + mulsd %xmm11, %xmm3 +#endif + +#ifdef RT + movsd -1 * SIZE(BO), %xmm12 + mulsd %xmm12, %xmm3 + movsd -2 * SIZE(BO), %xmm13 + mulsd %xmm3, %xmm13 + subsd %xmm13, %xmm2 + movsd -3 * SIZE(BO), %xmm14 + mulsd %xmm3, %xmm14 + subsd %xmm14, %xmm1 + movsd -4 * SIZE(BO), %xmm15 + mulsd %xmm3, %xmm15 + subsd %xmm15, %xmm0 + + movsd -6 * SIZE(BO), %xmm13 + mulsd %xmm13, %xmm2 + movsd -7 * SIZE(BO), %xmm14 + mulsd %xmm2, %xmm14 + subsd %xmm14, %xmm1 + movsd -8 * SIZE(BO), %xmm15 + mulsd %xmm2, %xmm15 + subsd %xmm15, %xmm0 + + movsd -11 * SIZE(BO), %xmm14 + mulsd %xmm14, %xmm1 + movsd -12 * SIZE(BO), %xmm15 + mulsd %xmm1, %xmm15 + subsd %xmm15, %xmm0 + + movsd -16 * SIZE(BO), %xmm15 + mulsd %xmm15, %xmm0 +#endif + +#if defined(RN) || defined(RT) + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm1 + unpcklpd %xmm3, %xmm1 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO1, LDC, 1) + movsd %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 0 * SIZE(CO2, LDC, 1) + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BO) + movapd %xmm1, -14 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L40: + movq M, I + sarq $1, I + NOBRANCH + jle .L49 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht0 2 * SIZE(CO2) + xorps %xmm11, %xmm11 + prefetcht0 2 * SIZE(CO2, LDC, 1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm1, %xmm8 + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm10 + addpd %xmm4, %xmm11 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm0 + shufpd $0, %xmm9, %xmm8 + shufpd $3, %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + shufpd $0, %xmm11, %xmm10 + shufpd $3, %xmm0, %xmm11 + + movapd -16 * SIZE(BO), %xmm0 + movapd -14 * SIZE(BO), %xmm2 + movapd -12 * SIZE(BO), %xmm1 + movapd -10 * SIZE(BO), %xmm3 +#else + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + shufpd $2, %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + shufpd $2, %xmm11, %xmm10 + shufpd $2, %xmm0, %xmm11 + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + subpd %xmm10, %xmm2 + subpd %xmm11, %xmm3 + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -14 * SIZE(AO), %xmm12 + movapd %xmm12, %xmm13 + + mulpd %xmm1, %xmm12 + mulpd %xmm3, %xmm13 + + subpd %xmm12, %xmm0 + subpd %xmm13, %xmm2 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm2 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm2 + + movddup -15 * SIZE(AO), %xmm12 + movapd %xmm12, %xmm13 + + mulpd %xmm0, %xmm12 + mulpd %xmm2, %xmm13 + + subpd %xmm12, %xmm1 + subpd %xmm13, %xmm3 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm1 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm2 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm3 + + movddup -11 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm1 + movddup -10 * SIZE(BO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm2 + movddup -9 * SIZE(BO), %xmm11 + mulpd %xmm1, %xmm11 + subpd %xmm11, %xmm3 + + movddup -6 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm2 + movddup -5 * SIZE(BO), %xmm11 + mulpd %xmm2, %xmm11 + subpd %xmm11, %xmm3 + + movddup -1 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm3 +#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm12 + mulpd %xmm12, %xmm3 + movddup -2 * SIZE(BO), %xmm13 + mulpd %xmm3, %xmm13 + subpd %xmm13, %xmm2 + movddup -3 * SIZE(BO), %xmm14 + mulpd %xmm3, %xmm14 + subpd %xmm14, %xmm1 + movddup -4 * SIZE(BO), %xmm15 + mulpd %xmm3, %xmm15 + subpd %xmm15, %xmm0 + + movddup -6 * SIZE(BO), %xmm13 + mulpd %xmm13, %xmm2 + movddup -7 * SIZE(BO), %xmm14 + mulpd %xmm2, %xmm14 + subpd %xmm14, %xmm1 + movddup -8 * SIZE(BO), %xmm15 + mulpd %xmm2, %xmm15 + subpd %xmm15, %xmm0 + + movddup -11 * SIZE(BO), %xmm14 + mulpd %xmm14, %xmm1 + movddup -12 * SIZE(BO), %xmm15 + mulpd %xmm1, %xmm15 + subpd %xmm15, %xmm0 + + movddup -16 * SIZE(BO), %xmm15 + mulpd %xmm15, %xmm0 +#endif + + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 1 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO2) + movsd %xmm3, 1 * SIZE(CO2) + movhps %xmm2, 0 * SIZE(CO2, LDC, 1) + movhps %xmm3, 1 * SIZE(CO2, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 1 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 0 * SIZE(CO2, LDC, 1) + movhps %xmm3, 1 * SIZE(CO2, LDC, 1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BO) + movapd %xmm2, -14 * SIZE(BO) + movapd %xmm1, -12 * SIZE(BO) + movapd %xmm3, -10 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L31 + ALIGN_4 + +.L49: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + ALIGN_4 + +.L50: + testq $2, N + jle .L70 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + BRANCH + jle .L60 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -10 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -8 * SIZE(BO), %xmm1 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L62 + ALIGN_3 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm0 +#else + movapd -16 * SIZE(AO), %xmm0 +#endif + + subpd %xmm8, %xmm0 + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#if defined(RN) || defined(RT) + pshufd $0xe, %xmm0, %xmm1 +#endif + +#ifdef RN + movsd -16 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm0 + movsd -15 * SIZE(BO), %xmm11 + mulsd %xmm0, %xmm11 + subsd %xmm11, %xmm1 + + movsd -13 * SIZE(BO), %xmm11 + mulsd %xmm11, %xmm1 +#endif + +#ifdef RT + movsd -13 * SIZE(BO), %xmm14 + mulsd %xmm14, %xmm1 + movsd -14 * SIZE(BO), %xmm15 + mulsd %xmm1, %xmm15 + subsd %xmm15, %xmm0 + + movsd -16 * SIZE(BO), %xmm15 + mulsd %xmm15, %xmm0 +#endif + +#if defined(RN) || defined(RT) + unpcklpd %xmm1, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + movq M, I + sarq $1, I + NOBRANCH + jle .L69 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO2) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm10 + movaps -14 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L52 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + ALIGN_3 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm1, %xmm8 + addpd %xmm2, %xmm9 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm0 + shufpd $0, %xmm9, %xmm8 + shufpd $3, %xmm0, %xmm9 + + movapd -16 * SIZE(BO), %xmm0 + movapd -14 * SIZE(BO), %xmm1 +#else + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + shufpd $2, %xmm0, %xmm9 + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm0 + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + movddup -15 * SIZE(AO), %xmm12 + mulpd %xmm0, %xmm12 + subpd %xmm12, %xmm1 + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm0 + movddup -15 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm1 + + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm1 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm14 + mulpd %xmm14, %xmm1 + movddup -14 * SIZE(BO), %xmm15 + mulpd %xmm1, %xmm15 + subpd %xmm15, %xmm0 + + movddup -16 * SIZE(BO), %xmm15 + mulpd %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO2) + movhps %xmm1, 1 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BO) + movapd %xmm1, -14 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L69: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L70: + testq $1, N + jle .L999 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + BRANCH + jle .L80 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movhps -15 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movsd -16 * SIZE(BO), %xmm1 + movhps -15 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L85 + ALIGN_3 + +.L82: + mulpd %xmm0, %xmm1 + movsd -14 * SIZE(AO), %xmm0 + movhps -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movsd -14 * SIZE(BO), %xmm1 + movhps -13 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movsd -12 * SIZE(AO), %xmm0 + movhps -11 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movsd -12 * SIZE(BO), %xmm1 + movhps -11 * SIZE(BO), %xmm1 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L82 + + addpd %xmm9, %xmm8 + ALIGN_3 + +.L85: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_3 + +.L86: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm8 + movsd -15 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L86 + ALIGN_4 + +.L88: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + haddpd %xmm8, %xmm8 + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BO), %xmm0 +#else + movsd -16 * SIZE(AO), %xmm0 +#endif + + subsd %xmm8, %xmm0 + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movsd -16 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(BO) +#else + movsd %xmm0, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + + +.L80: + movq M, I + sarq $1, I + NOBRANCH + jle .L89 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_3 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movddup -16 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm9 + movddup -15 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm9 + movddup -13 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L72 + + addpd %xmm9, %xmm8 + ALIGN_3 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + addpd %xmm1, %xmm8 + movddup -16 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_4 + +.L78: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + addpd %xmm1, %xmm8 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm0 +#else + movapd -16 * SIZE(AO), %xmm0 +#endif + + subpd %xmm8, %xmm0 + +#if defined(LN) || defined(LT) + pshufd $0xe, %xmm0, %xmm1 +#endif + +#ifdef LN + movsd -13 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm1 + movsd -14 * SIZE(AO), %xmm12 + mulsd %xmm1, %xmm12 + subsd %xmm12, %xmm0 + movsd -16 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + movsd -15 * SIZE(AO), %xmm12 + mulsd %xmm0, %xmm12 + subsd %xmm12, %xmm1 + movsd -13 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm1 +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm1, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movddup -16 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L71 + ALIGN_4 + +.L89: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LN_4x2_atom.S b/kernel/x86_64/trsm_kernel_LN_4x2_atom.S new file mode 100644 index 0000000..6ba2fc4 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LN_4x2_atom.S @@ -0,0 +1,2116 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbx +#define KK %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 8 + 3) + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + movq KK, OFFSET + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $1, J + jle .L40 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + je .L20 + +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm7, %xmm7 + movsd 1 * SIZE(AO), %xmm2 + xorps %xmm5, %xmm5 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L35 + ALIGN_4 + +.L32: + addsd %xmm5, %xmm8 + movsd 2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm7, %xmm9 + movsd 3 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm3 + movsd 2 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 4 * SIZE(BO), %xmm1 + mulsd %xmm2, %xmm5 + + addsd %xmm3, %xmm9 + movsd 5 * SIZE(BO), %xmm3 + mulsd %xmm2, %xmm7 + movsd 3 * SIZE(AO), %xmm2 + + addsd %xmm5, %xmm8 + movsd 6 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + + addsd %xmm7, %xmm9 + movsd 7 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm3 + movsd 4 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 8 * SIZE(BO), %xmm1 + mulsd %xmm2, %xmm5 + + addsd %xmm3, %xmm9 + movsd 9 * SIZE(BO), %xmm3 + mulsd %xmm2, %xmm7 + movsd 5 * SIZE(AO), %xmm2 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + addsd %xmm5, %xmm8 + addsd %xmm7, %xmm9 + + andq $3, %rax + BRANCH + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulsd %xmm0, %xmm1 + addq $2 * SIZE, BO + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 0 * SIZE(BO), %xmm1 + addsd %xmm3, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + + addq $1 * SIZE, AO + decq %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm1 + + subsd %xmm8, %xmm0 + subsd %xmm9, %xmm1 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm1 + + subsd %xmm8, %xmm0 + subsd %xmm9, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + mulsd %xmm8, %xmm1 +#endif + +#ifdef RN + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(BO), %xmm9 + mulsd %xmm0, %xmm9 + movsd 3 * SIZE(BO), %xmm13 + subsd %xmm9, %xmm1 + mulsd %xmm13, %xmm1 +#endif + +#ifdef RT + movsd 3 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm1 + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm1, %xmm9 + movsd 0 * SIZE(BO), %xmm13 + subsd %xmm9, %xmm0 + mulsd %xmm13, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm1, 1 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm1, 1 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L20: + testq $2, M + BRANCH + je .L30 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + movsd 3 * SIZE(AO), %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm8 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + addsd %xmm7, %xmm10 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 6 * SIZE(BO), %xmm1 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + addsd %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 8 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm8 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + addsd %xmm7, %xmm10 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 9 * SIZE(BO), %xmm3 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + je .L29 + ALIGN_4 + +.L26: + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + mulsd %xmm3, %xmm2 + addsd %xmm0, %xmm8 + movsd 2 * SIZE(AO), %xmm0 + + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + addsd %xmm4, %xmm10 + movsd 3 * SIZE(AO), %xmm4 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L26 + ALIGN_4 + +.L29: + addsd %xmm2, %xmm9 + addsd %xmm6, %xmm11 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm1 + movsd 2 * SIZE(BO), %xmm2 + movsd 3 * SIZE(BO), %xmm3 + + subsd %xmm8, %xmm0 + subsd %xmm9, %xmm1 + subsd %xmm10, %xmm2 + subsd %xmm11, %xmm3 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm2 + movsd 2 * SIZE(AO), %xmm1 + movsd 3 * SIZE(AO), %xmm3 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm9, %xmm1 + subsd %xmm11, %xmm3 +#endif + +#ifdef LN + movsd 3 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm2 + movsd 2 * SIZE(AO), %xmm9 + mulsd %xmm8, %xmm3 + movsd 0 * SIZE(AO), %xmm13 + + movaps %xmm9, %xmm10 + mulsd %xmm2, %xmm9 + mulsd %xmm3, %xmm10 + + subsd %xmm9, %xmm0 + subsd %xmm10, %xmm1 + + mulsd %xmm13, %xmm0 + mulsd %xmm13, %xmm1 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(AO), %xmm9 + mulsd %xmm8, %xmm1 + movsd 3 * SIZE(AO), %xmm13 + + movaps %xmm9, %xmm10 + mulsd %xmm0, %xmm9 + mulsd %xmm1, %xmm10 + + subsd %xmm9, %xmm2 + subsd %xmm10, %xmm3 + + mulsd %xmm13, %xmm2 + mulsd %xmm13, %xmm3 +#endif + +#ifdef RN + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm2 + movsd 3 * SIZE(BO), %xmm13 + + movaps %xmm9, %xmm10 + mulsd %xmm0, %xmm9 + mulsd %xmm2, %xmm10 + + subsd %xmm9, %xmm1 + subsd %xmm10, %xmm3 + + mulsd %xmm13, %xmm1 + mulsd %xmm13, %xmm3 +#endif + +#ifdef RT + movsd 3 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm1 + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm3 + movsd 0 * SIZE(BO), %xmm13 + + movaps %xmm9, %xmm10 + mulsd %xmm1, %xmm9 + mulsd %xmm3, %xmm10 + + subsd %xmm9, %xmm0 + subsd %xmm10, %xmm2 + + mulsd %xmm13, %xmm0 + mulsd %xmm13, %xmm2 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movsd %xmm3, 1 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm1, 1 * SIZE(BO) + movsd %xmm2, 2 * SIZE(BO) + movsd %xmm3, 3 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm2, 1 * SIZE(AO) + movsd %xmm1, 2 * SIZE(AO) + movsd %xmm3, 3 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + movq M, I + sarq $2, I + jle .L39 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + prefetcht0 0 * SIZE(BB) + subq $-8 * SIZE, BB + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + prefetcht0 3 * SIZE(CO1) + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + prefetcht0 3 * SIZE(CO2) + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + addsd %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + PREFETCH (PREFETCHSIZE + 0) * SIZE(BO) + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 12 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 13 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 6 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 14 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 15 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + subq $-16 * SIZE, AO + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + addq $ 8 * SIZE, BO + + addsd %xmm4, %xmm10 + movsd 1 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + decq %rax + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 0 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 2 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 1 * SIZE(BO), %xmm3 + + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + je .L19 + ALIGN_4 + +.L16: + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L16 + ALIGN_4 + +.L19: + addsd %xmm2, %xmm13 + addsd %xmm7, %xmm14 + addsd %xmm6, %xmm15 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm1 + movsd 2 * SIZE(BO), %xmm2 + movsd 3 * SIZE(BO), %xmm3 + movsd 4 * SIZE(BO), %xmm4 + movsd 5 * SIZE(BO), %xmm5 + movsd 6 * SIZE(BO), %xmm6 + movsd 7 * SIZE(BO), %xmm7 + + subsd %xmm8, %xmm0 + subsd %xmm9, %xmm1 + subsd %xmm10, %xmm2 + subsd %xmm11, %xmm3 + subsd %xmm12, %xmm4 + subsd %xmm13, %xmm5 + subsd %xmm14, %xmm6 + subsd %xmm15, %xmm7 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm2 + movsd 2 * SIZE(AO), %xmm4 + movsd 3 * SIZE(AO), %xmm6 + + movsd 4 * SIZE(AO), %xmm1 + movsd 5 * SIZE(AO), %xmm3 + movsd 6 * SIZE(AO), %xmm5 + movsd 7 * SIZE(AO), %xmm7 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm12, %xmm4 + subsd %xmm14, %xmm6 + subsd %xmm9, %xmm1 + subsd %xmm11, %xmm3 + subsd %xmm13, %xmm5 + subsd %xmm15, %xmm7 +#endif + +#ifdef LN + movsd 15 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm6 + movsd 14 * SIZE(AO), %xmm9 + mulsd %xmm8, %xmm7 + movsd 13 * SIZE(AO), %xmm11 + + movaps %xmm9, %xmm10 + movsd 12 * SIZE(AO), %xmm13 + mulsd %xmm6, %xmm9 + movsd 10 * SIZE(AO), %xmm8 + mulsd %xmm7, %xmm10 + subsd %xmm9, %xmm4 + movsd 9 * SIZE(AO), %xmm9 + subsd %xmm10, %xmm5 + + movaps %xmm11, %xmm12 + mulsd %xmm6, %xmm11 + mulsd %xmm7, %xmm12 + subsd %xmm11, %xmm2 + movsd 8 * SIZE(AO), %xmm11 + subsd %xmm12, %xmm3 + + movaps %xmm13, %xmm14 + mulsd %xmm6, %xmm13 + mulsd %xmm7, %xmm14 + subsd %xmm13, %xmm0 + subsd %xmm14, %xmm1 + + mulsd %xmm8, %xmm4 + mulsd %xmm8, %xmm5 + movsd 5 * SIZE(AO), %xmm8 + + movaps %xmm9, %xmm10 + mulsd %xmm4, %xmm9 + mulsd %xmm5, %xmm10 + subsd %xmm9, %xmm2 + movsd 4 * SIZE(AO), %xmm9 + subsd %xmm10, %xmm3 + + movaps %xmm11, %xmm12 + mulsd %xmm4, %xmm11 + mulsd %xmm5, %xmm12 + subsd %xmm11, %xmm0 + movsd 0 * SIZE(AO), %xmm11 + subsd %xmm12, %xmm1 + + mulsd %xmm8, %xmm2 + mulsd %xmm8, %xmm3 + + movaps %xmm9, %xmm10 + mulsd %xmm2, %xmm9 + mulsd %xmm3, %xmm10 + subsd %xmm9, %xmm0 + subsd %xmm10, %xmm1 + + mulsd %xmm11, %xmm0 + mulsd %xmm11, %xmm1 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(AO), %xmm9 + mulsd %xmm8, %xmm1 + + movsd 2 * SIZE(AO), %xmm11 + movaps %xmm9, %xmm10 + movsd 3 * SIZE(AO), %xmm13 + mulsd %xmm0, %xmm9 + movsd 5 * SIZE(AO), %xmm8 + mulsd %xmm1, %xmm10 + subsd %xmm9, %xmm2 + movsd 6 * SIZE(AO), %xmm9 + subsd %xmm10, %xmm3 + + movaps %xmm11, %xmm12 + mulsd %xmm0, %xmm11 + mulsd %xmm1, %xmm12 + subsd %xmm11, %xmm4 + movsd 7 * SIZE(AO), %xmm11 + subsd %xmm12, %xmm5 + + movaps %xmm13, %xmm14 + mulsd %xmm0, %xmm13 + mulsd %xmm1, %xmm14 + subsd %xmm13, %xmm6 + subsd %xmm14, %xmm7 + + mulsd %xmm8, %xmm2 + mulsd %xmm8, %xmm3 + movsd 10 * SIZE(AO), %xmm8 + + movaps %xmm9, %xmm10 + mulsd %xmm2, %xmm9 + mulsd %xmm3, %xmm10 + subsd %xmm9, %xmm4 + movsd 11 * SIZE(AO), %xmm9 + subsd %xmm10, %xmm5 + + movaps %xmm11, %xmm12 + mulsd %xmm2, %xmm11 + mulsd %xmm3, %xmm12 + subsd %xmm11, %xmm6 + subsd %xmm12, %xmm7 + + mulsd %xmm8, %xmm4 + mulsd %xmm8, %xmm5 + movsd 15 * SIZE(AO), %xmm8 + + movaps %xmm9, %xmm10 + mulsd %xmm4, %xmm9 + mulsd %xmm5, %xmm10 + subsd %xmm9, %xmm6 + subsd %xmm10, %xmm7 + + mulsd %xmm8, %xmm6 + mulsd %xmm8, %xmm7 +#endif + +#ifdef RN + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm2 + movsd 3 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm4 + mulsd %xmm8, %xmm6 + + movaps %xmm9, %xmm10 + movaps %xmm9, %xmm11 + movaps %xmm9, %xmm12 + + mulsd %xmm0, %xmm9 + mulsd %xmm2, %xmm10 + mulsd %xmm4, %xmm11 + mulsd %xmm6, %xmm12 + + subsd %xmm9, %xmm1 + subsd %xmm10, %xmm3 + subsd %xmm11, %xmm5 + subsd %xmm12, %xmm7 + + mulsd %xmm13, %xmm1 + mulsd %xmm13, %xmm3 + mulsd %xmm13, %xmm5 + mulsd %xmm13, %xmm7 +#endif + +#ifdef RT + movsd 3 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm1 + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm3 + movsd 0 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm5 + mulsd %xmm8, %xmm7 + + movaps %xmm9, %xmm10 + movaps %xmm9, %xmm11 + movaps %xmm9, %xmm12 + + mulsd %xmm1, %xmm9 + mulsd %xmm3, %xmm10 + mulsd %xmm5, %xmm11 + mulsd %xmm7, %xmm12 + + subsd %xmm9, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm11, %xmm4 + subsd %xmm12, %xmm6 + + mulsd %xmm13, %xmm0 + mulsd %xmm13, %xmm2 + mulsd %xmm13, %xmm4 + mulsd %xmm13, %xmm6 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + movsd %xmm4, 2 * SIZE(CO1) + movsd %xmm6, 3 * SIZE(CO1) + + movsd %xmm1, 0 * SIZE(CO2) + movsd %xmm3, 1 * SIZE(CO2) + movsd %xmm5, 2 * SIZE(CO2) + movsd %xmm7, 3 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm1, 1 * SIZE(BO) + movsd %xmm2, 2 * SIZE(BO) + movsd %xmm3, 3 * SIZE(BO) + movsd %xmm4, 4 * SIZE(BO) + movsd %xmm5, 5 * SIZE(BO) + movsd %xmm6, 6 * SIZE(BO) + movsd %xmm7, 7 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm2, 1 * SIZE(AO) + movsd %xmm4, 2 * SIZE(AO) + movsd %xmm6, 3 * SIZE(AO) + movsd %xmm1, 4 * SIZE(AO) + movsd %xmm3, 5 * SIZE(AO) + movsd %xmm5, 6 * SIZE(AO) + movsd %xmm7, 7 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J # j -- + jg .L10 + ALIGN_4 + +.L40: + testq $1, N + je .L999 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + je .L50 + +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm5, %xmm5 + movsd 1 * SIZE(AO), %xmm2 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm9, %xmm9 + movsd 2 * SIZE(AO), %xmm4 + movsd 3 * SIZE(AO), %xmm6 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L65 + ALIGN_4 + +.L62: + addsd %xmm5, %xmm8 + movsd 2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + movsd 4 * SIZE(AO), %xmm0 + + addsd %xmm7, %xmm9 + movsd 3 * SIZE(BO), %xmm7 + mulsd %xmm2, %xmm3 + movsd 5 * SIZE(AO), %xmm2 + + addsd %xmm1, %xmm8 + movsd 4 * SIZE(BO), %xmm1 + mulsd %xmm4, %xmm5 + movsd 6 * SIZE(AO), %xmm4 + + addsd %xmm3, %xmm9 + movsd 5 * SIZE(BO), %xmm3 + mulsd %xmm6, %xmm7 + movsd 7 * SIZE(AO), %xmm6 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + decq %rax + jne .L62 + + addsd %xmm5, %xmm8 + addsd %xmm7, %xmm9 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + je .L68 + ALIGN_4 + +.L66: + movsd 0 * SIZE(AO), %xmm0 + movsd 0 * SIZE(BO), %xmm1 + + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm8 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + decq %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: + addsd %xmm9, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + subsd %xmm8, %xmm0 +#else + movsd 0 * SIZE(AO), %xmm0 + subsd %xmm8, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L50: + testq $2, M + je .L60 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm1 + xorps %xmm3, %xmm3 + + movsd 0 * SIZE(BO), %xmm4 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm5 + xorps %xmm10, %xmm10 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L55 + ALIGN_4 + +.L52: + addsd %xmm2, %xmm8 + movsd 2 * SIZE(AO), %xmm2 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm3, %xmm10 + movsd 3 * SIZE(AO), %xmm3 + mulsd %xmm4, %xmm1 + movsd 2 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm2 + addq $8 * SIZE, AO + + addsd %xmm1, %xmm10 + movsd -3 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm3 + movsd 3 * SIZE(BO), %xmm5 + + addsd %xmm2, %xmm8 + movsd -2 * SIZE(AO), %xmm2 + mulsd %xmm4, %xmm0 + addq $4 * SIZE, BO + + addsd %xmm3, %xmm10 + movsd -1 * SIZE(AO), %xmm3 + mulsd %xmm4, %xmm1 + movsd 0 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm2 + decq %rax + + addsd %xmm1, %xmm10 + movsd 1 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm3 + movsd 1 * SIZE(BO), %xmm5 + + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm10 + + andq $3, %rax + BRANCH + je .L59 + ALIGN_4 + +.L56: + mulsd %xmm4, %xmm0 + mulsd %xmm4, %xmm1 + movsd 1 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 2 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm10 + movsd 3 * SIZE(AO), %xmm1 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + decq %rax + BRANCH + jg .L56 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm2 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm2 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 +#endif + +#ifdef LN + movsd 3 * SIZE(AO), %xmm8 + movsd 2 * SIZE(AO), %xmm9 + movsd 0 * SIZE(AO), %xmm11 + mulsd %xmm8, %xmm2 + mulsd %xmm2, %xmm9 + subsd %xmm9, %xmm0 + mulsd %xmm11,%xmm0 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm8 + movsd 1 * SIZE(AO), %xmm9 + movsd 3 * SIZE(AO), %xmm11 + mulsd %xmm8, %xmm0 + mulsd %xmm0, %xmm9 + subsd %xmm9, %xmm2 + mulsd %xmm11,%xmm2 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + mulsd %xmm8, %xmm2 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm2, 1 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm2, 1 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + movq M, I + sarq $2, I + jle .L69 + ALIGN_4 + +.L41: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(AO), %xmm1 + xorps %xmm11, %xmm11 + movsd 2 * SIZE(AO), %xmm2 + xorps %xmm13, %xmm13 + movsd 3 * SIZE(AO), %xmm3 + xorps %xmm15, %xmm15 + + movsd 0 * SIZE(BO), %xmm4 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm5 + xorps %xmm10, %xmm10 + prefetcht0 3 * SIZE(CO1) + xorps %xmm12, %xmm12 + xorps %xmm14, %xmm14 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L45 + ALIGN_4 + +.L42: + addsd %xmm9, %xmm8 + movsd 4 * SIZE(AO), %xmm9 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm11, %xmm10 + movsd 5 * SIZE(AO), %xmm11 + mulsd %xmm4, %xmm1 + + addsd %xmm13, %xmm12 + movsd 6 * SIZE(AO), %xmm13 + mulsd %xmm4, %xmm2 + + addsd %xmm15, %xmm14 + movsd 7 * SIZE(AO), %xmm15 + mulsd %xmm4, %xmm3 + movsd 2 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm9 + + addsd %xmm1, %xmm10 + movsd 9 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm11 + + addsd %xmm2, %xmm12 + movsd 10 * SIZE(AO), %xmm2 + mulsd %xmm5, %xmm13 + + addsd %xmm3, %xmm14 + movsd 11 * SIZE(AO), %xmm3 + mulsd %xmm5, %xmm15 + movsd 3 * SIZE(BO), %xmm5 + + addsd %xmm9, %xmm8 + movsd 12 * SIZE(AO), %xmm9 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + addsd %xmm11, %xmm10 + movsd 13 * SIZE(AO), %xmm11 + mulsd %xmm4, %xmm1 + + addsd %xmm13, %xmm12 + movsd 14 * SIZE(AO), %xmm13 + mulsd %xmm4, %xmm2 + + addsd %xmm15, %xmm14 + movsd 15 * SIZE(AO), %xmm15 + mulsd %xmm4, %xmm3 + movsd 4 * SIZE(BO), %xmm4 + subq $-16 * SIZE, AO + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm9 + + addsd %xmm1, %xmm10 + movsd 1 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm11 + addq $ 4 * SIZE, BO + + addsd %xmm2, %xmm12 + movsd 2 * SIZE(AO), %xmm2 + mulsd %xmm5, %xmm13 + decq %rax + + addsd %xmm3, %xmm14 + movsd 3 * SIZE(AO), %xmm3 + mulsd %xmm5, %xmm15 + movsd 1 * SIZE(BO), %xmm5 + + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + + addsd %xmm9, %xmm8 + addsd %xmm11, %xmm10 + addsd %xmm13, %xmm12 + addsd %xmm15, %xmm14 + + andq $3, %rax + BRANCH + BRANCH + je .L49 + ALIGN_4 + +.L46: + mulsd %xmm4, %xmm0 + mulsd %xmm4, %xmm1 + mulsd %xmm4, %xmm2 + mulsd %xmm4, %xmm3 + movsd 1 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm10 + movsd 5 * SIZE(AO), %xmm1 + addsd %xmm2, %xmm12 + movsd 6 * SIZE(AO), %xmm2 + addsd %xmm3, %xmm14 + movsd 7 * SIZE(AO), %xmm3 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + decq %rax + BRANCH + jg .L46 + ALIGN_4 + +.L49: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm2 + movsd 2 * SIZE(BO), %xmm4 + movsd 3 * SIZE(BO), %xmm6 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm12, %xmm4 + subsd %xmm14, %xmm6 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm2 + movsd 2 * SIZE(AO), %xmm4 + movsd 3 * SIZE(AO), %xmm6 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm12, %xmm4 + subsd %xmm14, %xmm6 +#endif + +#ifdef LN + movsd 15 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm6 + movsd 14 * SIZE(AO), %xmm9 + mulsd %xmm6, %xmm9 + movsd 13 * SIZE(AO), %xmm11 + subsd %xmm9, %xmm4 + movsd 12 * SIZE(AO), %xmm13 + mulsd %xmm6, %xmm11 + movsd 10 * SIZE(AO), %xmm8 + subsd %xmm11, %xmm2 + movsd 9 * SIZE(AO), %xmm9 + mulsd %xmm6, %xmm13 + movsd 8 * SIZE(AO), %xmm11 + subsd %xmm13, %xmm0 + + mulsd %xmm8, %xmm4 + movsd 5 * SIZE(AO), %xmm8 + mulsd %xmm4, %xmm9 + subsd %xmm9, %xmm2 + movsd 4 * SIZE(AO), %xmm9 + mulsd %xmm4, %xmm11 + subsd %xmm11, %xmm0 + movsd 0 * SIZE(AO), %xmm11 + mulsd %xmm8, %xmm2 + mulsd %xmm2, %xmm9 + subsd %xmm9, %xmm0 + mulsd %xmm11, %xmm0 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(AO), %xmm9 + mulsd %xmm0, %xmm9 + movsd 2 * SIZE(AO), %xmm11 + subsd %xmm9, %xmm2 + movsd 3 * SIZE(AO), %xmm13 + mulsd %xmm0, %xmm11 + movsd 5 * SIZE(AO), %xmm8 + subsd %xmm11, %xmm4 + movsd 6 * SIZE(AO), %xmm9 + mulsd %xmm0, %xmm13 + movsd 7 * SIZE(AO), %xmm11 + subsd %xmm13, %xmm6 + + mulsd %xmm8, %xmm2 + movsd 10 * SIZE(AO), %xmm8 + mulsd %xmm2, %xmm9 + subsd %xmm9, %xmm4 + movsd 11 * SIZE(AO), %xmm9 + mulsd %xmm2, %xmm11 + subsd %xmm11, %xmm6 + mulsd %xmm8, %xmm4 + movsd 15 * SIZE(AO), %xmm8 + mulsd %xmm4, %xmm9 + subsd %xmm9, %xmm6 + mulsd %xmm8, %xmm6 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + mulsd %xmm8, %xmm2 + mulsd %xmm8, %xmm4 + mulsd %xmm8, %xmm6 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + movsd %xmm4, 2 * SIZE(CO1) + movsd %xmm6, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm2, 1 * SIZE(BO) + movsd %xmm4, 2 * SIZE(BO) + movsd %xmm6, 3 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm2, 1 * SIZE(AO) + movsd %xmm4, 2 * SIZE(AO) + movsd %xmm6, 3 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L41 + ALIGN_4 + +.L69: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_2 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LN_4x4_barcelona.S b/kernel/x86_64/trsm_kernel_LN_4x4_barcelona.S new file mode 100644 index 0000000..4cdaff3 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LN_4x4_barcelona.S @@ -0,0 +1,3390 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %r12 +#define BB %rbp +#define J %rbx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#define OFFSET 48(%rsp) +#define AORIG 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define AORIG 232(%rsp) +#define KK 240(%rsp) +#define KKK 248(%rsp) + +#endif + +#define PREFETCH prefetch +#define PREFETCHSIZE (8 * 7 + 0) + +#define movlpd movsd +#define movapd movups +#define movupd movups + +#define KERNEL1(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO, %rax, 4) ;\ + addpd %xmm1, %xmm12 ;\ + movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + +#define KERNEL2(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + addpd %xmm1, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ +/**/ movddup (BO, %rax, 4), %xmm1 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL3(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ + addpd %xmm5, %xmm14 ;\ + movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL4(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ +/***/ movapd (AO, %rax, 4), %xmm6 ;\ + addpd %xmm4, %xmm10 ;\ + addpd %xmm5, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ + movddup 8 * SIZE(BO, %rax, 4), %xmm5 ;\ + movapd %xmm6, %xmm2 + +#define KERNEL5(xx) \ + mulpd %xmm1, %xmm6 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm8 ;\ + movapd %xmm2, %xmm6 ;\ + addpd %xmm1, %xmm12 ;\ + movddup 2 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ +/**/ movapd 8 * SIZE(AO, %rax, 4), %xmm7 ;\ + movapd %xmm6, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 3 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm6 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm10 ;\ + movapd 4 * SIZE(AO, %rax, 4), %xmm6 ;\ + addpd %xmm1, %xmm14 ;\ + movddup 4 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm6, %xmm2 + +#define KERNEL6(xx) \ + mulpd %xmm1, %xmm6 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm8 ;\ + movapd %xmm2, %xmm6 ;\ + addpd %xmm1, %xmm12 ;\ + movddup 6 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm6, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 7 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm6 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm10 ;\ +/***/ movapd 16 * SIZE(AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 9 * SIZE(BO, %rax, 4), %xmm3 ;\ + movddup 16 * SIZE(BO, %rax, 4), %xmm1 ;\ + movapd %xmm7, %xmm2 + +#define KERNEL7(xx) \ + mulpd %xmm5, %xmm7 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm8 ;\ + movapd %xmm2, %xmm7 ;\ + addpd %xmm5, %xmm12 ;\ + movddup 10 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm7, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 11 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm7 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm10 ;\ + movapd 12 * SIZE(AO, %rax, 4), %xmm7 ;\ + addpd %xmm5, %xmm14 ;\ + movddup 12 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 13 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm7, %xmm2 + +#define KERNEL8(xx) \ + mulpd %xmm5, %xmm7 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm8 ;\ + movapd %xmm2, %xmm7 ;\ + addpd %xmm5, %xmm12 ;\ + movddup 14 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm7, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 15 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm7 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm10 ;\ + addpd %xmm5, %xmm14 ;\ +/**/ movapd 24 * SIZE(AO, %rax, 4), %xmm4 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + movddup 24 * SIZE(BO, %rax, 4), %xmm5 ;\ + movapd %xmm0, %xmm2 ;\ + addq $8 * SIZE, %rax + +#define KERNEL_SUB1(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + +#define KERNEL_SUB2(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd (AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup (BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL_SUB3(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ + addpd %xmm5, %xmm14 ;\ + movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL_SUB4(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + addpd %xmm5, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm12 +#else + movq STACKSIZE + 8(%rsp), LDC + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movsd %xmm12, OFFSET + movsd %xmm12, KK + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 2, %rax + leaq (B, %rax), BB + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + je .L20 + +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 4), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movddup -14 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -15 * SIZE(AO), %xmm4 + pxor %xmm10, %xmm10 + movapd -16 * SIZE(BO), %xmm1 + pxor %xmm11, %xmm11 + movapd -8 * SIZE(BO), %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L36 + ALIGN_4 + +.L32: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO, %rax, 4), %xmm0 + addpd %xmm1, %xmm8 + movapd -12 * SIZE(BO, %rax, 4), %xmm1 + addpd %xmm0, %xmm9 + movddup -12 * SIZE(AO, %rax, 1), %xmm0 + mulpd %xmm4, %xmm1 + mulpd -10 * SIZE(BO, %rax, 4), %xmm4 + addpd %xmm1, %xmm10 + movapd (BO, %rax, 4), %xmm1 + addpd %xmm4, %xmm11 + movddup -11 * SIZE(AO, %rax, 1), %xmm4 + mulpd %xmm2, %xmm3 + mulpd -6 * SIZE(BO, %rax, 4), %xmm2 + addpd %xmm3, %xmm8 + movapd -4 * SIZE(BO, %rax, 4), %xmm3 + addpd %xmm2, %xmm9 + movddup -13 * SIZE(AO, %rax, 1), %xmm2 + mulpd %xmm2, %xmm3 + mulpd -2 * SIZE(BO, %rax, 4), %xmm2 + addpd %xmm3, %xmm10 + movapd 8 * SIZE(BO, %rax, 4), %xmm3 + addpd %xmm2, %xmm11 + movddup -10 * SIZE(AO, %rax, 1), %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L32 + ALIGN_4 + +.L36: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L38 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L37: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO, %rax, 4), %xmm0 + addpd %xmm1, %xmm8 + movapd -12 * SIZE(BO, %rax, 4), %xmm1 + addpd %xmm0, %xmm9 + movddup -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L37 + ALIGN_4 + +.L38: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + + subpd %xmm8, %xmm2 + subpd %xmm9, %xmm3 +#else + movapd -16 * SIZE(AO), %xmm2 + movapd -14 * SIZE(AO), %xmm3 + + subpd %xmm8, %xmm2 + subpd %xmm9, %xmm3 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd -16 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd -15 * SIZE(BO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + movsd -14 * SIZE(BO), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + movsd -13 * SIZE(BO), %xmm7 + mulsd %xmm2, %xmm7 + subsd %xmm7, %xmm1 + + movsd -11 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -10 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm3 + movsd -9 * SIZE(BO), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + + movsd -6 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd -5 * SIZE(BO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + + movsd -1 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm1 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef RT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd -1 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm1 + + movsd -2 * SIZE(BO), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + movsd -3 * SIZE(BO), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + movsd -4 * SIZE(BO), %xmm7 + mulsd %xmm1, %xmm7 + subsd %xmm7, %xmm2 + + movsd -6 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd -7 * SIZE(BO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm0 + movsd -8 * SIZE(BO), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + + movsd -11 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -12 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd -16 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 + +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + movlpd %xmm3, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) +#else + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + movlpd %xmm3, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, -16 * SIZE(BO) + movaps %xmm3, -14 * SIZE(BO) +#else + movaps %xmm2, -16 * SIZE(AO) + movaps %xmm3, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L20: + testq $2, M + je .L30 + ALIGN_4 + +.L21: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -12 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + movddup -8 * SIZE(BO), %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L26 + ALIGN_4 + +.L22: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -13 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -10 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -9 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup (BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -8 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -7 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm8 + movddup -6 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm9 + movddup -5 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm10 + movddup -4 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + movapd -10 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm5, %xmm11 + movddup -3 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm8 + movddup -2 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm9 + movddup -1 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm10 + movddup 8 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + movapd -4 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm5, %xmm11 + movddup 1 * SIZE(BO, %rax, 4), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L22 + ALIGN_4 + +.L26: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L29 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L27: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -13 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 4), %xmm5 + + addq $SIZE, %rax + jl .L27 + ALIGN_4 + +.L29: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + movapd -12 * SIZE(AO), %xmm4 + movapd -10 * SIZE(AO), %xmm6 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 + subpd %xmm10, %xmm4 + subpd %xmm11, %xmm6 +#endif + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm15, %xmm10 + subpd %xmm10, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm11, %xmm10 + subpd %xmm10, %xmm15 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm6 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + movddup -10 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm4 + movddup -9 * SIZE(BO), %xmm10 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + + movddup -5 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm6 + + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 +#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + + movddup -2 * SIZE(BO), %xmm9 + mulpd %xmm6, %xmm9 + subpd %xmm9, %xmm4 + movddup -3 * SIZE(BO), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + movddup -4 * SIZE(BO), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm0 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm2 + movddup -8 * SIZE(BO), %xmm10 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + movddup -12 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm9, 0 * SIZE(CO1) + movlpd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + + movlpd %xmm11, 0 * SIZE(CO1, LDC, 2) + movlpd %xmm15, 1 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movlpd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + + movlpd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + + movlpd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -16 * SIZE(BO) + movaps %xmm11, -14 * SIZE(BO) + movaps %xmm13, -12 * SIZE(BO) + movaps %xmm15, -10 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm2, -14 * SIZE(AO) + movaps %xmm4, -12 * SIZE(AO) + movaps %xmm6, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + movq M, I + sarq $2, I # i = (m >> 2) + jle .L39 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm8, %xmm8 + movddup -15 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm4 + pxor %xmm10, %xmm10 + movddup -8 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + +#ifndef LN + prefetchw 3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetchw 3 * SIZE(CO2) + pxor %xmm13, %xmm13 + prefetchw 3 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + prefetchw 3 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 + movapd %xmm0, %xmm2 +#else + prefetchw -8 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetchw -8 * SIZE(CO2) + pxor %xmm13, %xmm13 + prefetchw -8 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + prefetchw -8 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 + movapd %xmm0, %xmm2 +#endif + + prefetch -10 * SIZE(BB) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + + andq $-8, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_4 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + BRANCH + jl .L12 + ALIGN_4 + +.L15: + prefetch 14 * SIZE(BB) + subq $-16 * SIZE, BB + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + testq $4, %rax + je .L16 + xorq %rax, %rax + ALIGN_4 + + KERNEL_SUB1(16 * 0) + KERNEL_SUB2(16 * 0) + KERNEL_SUB3(16 * 0) + KERNEL_SUB4(16 * 0) + + subq $-16 * SIZE, BO + subq $-16 * SIZE, AO + ALIGN_4 + +.L16: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L19 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L17: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd %xmm2, %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm3, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm9 + movapd %xmm0, %xmm2 + addpd %xmm3, %xmm13 + movddup -13 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm10 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm14 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm3, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm11 + addpd %xmm3, %xmm15 + movddup -11 * SIZE(BO, %rax, 4), %xmm3 + movapd %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L17 + ALIGN_4 + +.L19: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd %xmm14, %xmm6 + unpcklpd %xmm15, %xmm14 + unpckhpd %xmm15, %xmm6 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + movapd -8 * SIZE(BO), %xmm1 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm5 + movapd -2 * SIZE(BO), %xmm7 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 + subpd %xmm12, %xmm1 + subpd %xmm14, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + movapd -8 * SIZE(AO), %xmm4 + movapd -6 * SIZE(AO), %xmm5 + movapd -4 * SIZE(AO), %xmm6 + movapd -2 * SIZE(AO), %xmm7 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 + subpd %xmm10, %xmm4 + subpd %xmm14, %xmm5 + subpd %xmm11, %xmm6 + subpd %xmm15, %xmm7 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm7, %xmm10 + subpd %xmm10, %xmm3 + + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm13 + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm15 + + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm9 + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm7, %xmm14 + subpd %xmm14, %xmm11 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm15 + + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm9 + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm11 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm15, %xmm10 + subpd %xmm10, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm11, %xmm10 + subpd %xmm10, %xmm15 + + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm9, %xmm12 + subpd %xmm12, %xmm1 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm3 + + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm9, %xmm14 + subpd %xmm14, %xmm5 + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm11, %xmm14 + subpd %xmm14, %xmm7 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm15, %xmm10 + subpd %xmm10, %xmm3 + + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm13, %xmm12 + subpd %xmm12, %xmm5 + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm7 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm1, %xmm9 + subpd %xmm9, %xmm3 + + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm6 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm1, %xmm11 + subpd %xmm11, %xmm7 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -10 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm4 + movddup -10 * SIZE(BO), %xmm9 + mulpd %xmm3, %xmm9 + subpd %xmm9, %xmm5 + + movddup -9 * SIZE(BO), %xmm10 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + movddup -9 * SIZE(BO), %xmm10 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm7 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -5 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm6 + movddup -5 * SIZE(BO), %xmm9 + mulpd %xmm5, %xmm9 + subpd %xmm9, %xmm7 + + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(BO), %xmm9 + mulpd %xmm6, %xmm9 + subpd %xmm9, %xmm4 + movddup -2 * SIZE(BO), %xmm9 + mulpd %xmm7, %xmm9 + subpd %xmm9, %xmm5 + + movddup -3 * SIZE(BO), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + movddup -3 * SIZE(BO), %xmm10 + mulpd %xmm7, %xmm10 + subpd %xmm10, %xmm3 + + movddup -4 * SIZE(BO), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm0 + movddup -4 * SIZE(BO), %xmm11 + mulpd %xmm7, %xmm11 + subpd %xmm11, %xmm1 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm2 + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm5, %xmm9 + subpd %xmm9, %xmm3 + + movddup -8 * SIZE(BO), %xmm10 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + movddup -8 * SIZE(BO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -12 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + movddup -12 * SIZE(BO), %xmm9 + mulpd %xmm3, %xmm9 + subpd %xmm9, %xmm1 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm9, 0 * SIZE(CO1) + movlpd %xmm13, 1 * SIZE(CO1) + movlpd %xmm1, 2 * SIZE(CO1) + movlpd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) + + movlpd %xmm11, 0 * SIZE(CO1, LDC, 2) + movlpd %xmm15, 1 * SIZE(CO1, LDC, 2) + movlpd %xmm3, 2 * SIZE(CO1, LDC, 2) + movlpd %xmm7, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movlpd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movlpd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movlpd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) + + movlpd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + movlpd %xmm5, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) + + movlpd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) + movlpd %xmm7, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -16 * SIZE(BO) + movaps %xmm11, -14 * SIZE(BO) + movaps %xmm13, -12 * SIZE(BO) + movaps %xmm15, -10 * SIZE(BO) + movaps %xmm1, -8 * SIZE(BO) + movaps %xmm3, -6 * SIZE(BO) + movaps %xmm5, -4 * SIZE(BO) + movaps %xmm7, -2 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm1, -14 * SIZE(AO) + movaps %xmm2, -12 * SIZE(AO) + movaps %xmm3, -10 * SIZE(AO) + movaps %xmm4, -8 * SIZE(AO) + movaps %xmm5, -6 * SIZE(AO) + movaps %xmm6, -4 * SIZE(AO) + movaps %xmm7, -2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $2, N + je .L80 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + je .L60 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 1), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movddup -15 * SIZE(AO), %xmm1 + pxor %xmm9, %xmm9 + movddup -14 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movddup -13 * SIZE(AO), %xmm3 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L76 + ALIGN_4 + +.L72: + mulpd -16 * SIZE(BO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -12 * SIZE(AO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(BO, %rax, 2), %xmm1 + addpd %xmm1, %xmm9 + movddup -11 * SIZE(AO, %rax, 1), %xmm1 + + mulpd -12 * SIZE(BO, %rax, 2), %xmm2 + addpd %xmm2, %xmm10 + movddup -10 * SIZE(AO, %rax, 1), %xmm2 + + mulpd -10 * SIZE(BO, %rax, 2), %xmm3 + addpd %xmm3, %xmm11 + movddup -9 * SIZE(AO, %rax, 1), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L72 + ALIGN_4 + +.L76: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L78 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L77: + mulpd -16 * SIZE(BO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L77 + ALIGN_4 + +.L78: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm2 +#else + movapd -16 * SIZE(AO), %xmm2 +#endif + + subpd %xmm8, %xmm2 + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm2 +#endif + +#ifdef RN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + mulsd -16 * SIZE(BO), %xmm2 + movsd -15 * SIZE(BO), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm0 + + mulsd -13 * SIZE(BO), %xmm0 + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef RT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + mulsd -13 * SIZE(BO), %xmm0 + + movlpd -14 * SIZE(BO), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm2 + + mulsd -16 * SIZE(BO), %xmm2 + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movaps %xmm2, -16 * SIZE(BO) +#else + movaps %xmm2, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + testq $2, M + je .L70 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 2), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -12 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L66 + ALIGN_4 + +.L62: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm9 + movddup -13 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -8 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm11 + movddup -11 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm8 + movddup -10 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm2, %xmm3 + movapd -10 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm3, %xmm9 + movddup -9 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm10 + movddup -8 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm2, %xmm3 + movapd -4 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm3, %xmm11 + movddup -7 * SIZE(BO, %rax, 2), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L62 + ALIGN_4 + +.L66: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L69 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L67: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm9 + movddup -13 * SIZE(BO, %rax, 2), %xmm3 + + addq $SIZE, %rax + jl .L67 + ALIGN_4 + +.L69: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 +#endif + + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + + movddup -14 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm9, 0 * SIZE(CO1) + movlpd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movlpd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -16 * SIZE(BO) + movaps %xmm13, -14 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm2, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + movq M, I + sarq $2, I # i = (m >> 2) + jle .L79 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 2), BO +#endif + + movddup -16 * SIZE(BO), %xmm1 + movddup -15 * SIZE(BO), %xmm5 + pxor %xmm8, %xmm8 + movddup -12 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm12, %xmm12 + movapd -8 * SIZE(AO), %xmm4 + pxor %xmm13, %xmm13 + +#ifndef LN + prefetchw 3 * SIZE(CO1) + movapd %xmm0, %xmm2 + prefetchw 3 * SIZE(CO2) +#else + prefetchw -8 * SIZE(CO1) + movapd %xmm0, %xmm2 + prefetchw -8 * SIZE(CO2) +#endif + + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L56 + ALIGN_4 + +.L52: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -13 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + mulpd %xmm1, %xmm0 + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd (AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -8 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -10 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -11 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm4, %xmm2 + mulpd %xmm3, %xmm4 + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm4, %xmm8 + movapd -4 * SIZE(AO, %rax, 4), %xmm4 + addpd %xmm3, %xmm12 + movddup -10 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -9 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm4, %xmm2 + mulpd %xmm3, %xmm4 + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm4, %xmm8 + movapd 8 * SIZE(AO, %rax, 4), %xmm4 + addpd %xmm3, %xmm12 + movddup -4 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -7 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L52 + ALIGN_4 + +.L56: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L59 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L57: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -13 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L57 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + movapd -12 * SIZE(BO), %xmm1 + movapd -10 * SIZE(BO), %xmm5 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 + subpd %xmm12, %xmm1 + subpd %xmm4, %xmm5 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm13 + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm9 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm9 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm9, %xmm12 + subpd %xmm12, %xmm1 + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm9, %xmm14 + subpd %xmm14, %xmm5 + + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm13, %xmm12 + subpd %xmm12, %xmm5 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm1, %xmm9 + subpd %xmm9, %xmm3 + + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -14 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + movddup -14 * SIZE(BO), %xmm9 + mulpd %xmm3, %xmm9 + subpd %xmm9, %xmm1 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm9, 0 * SIZE(CO1) + movlpd %xmm13, 1 * SIZE(CO1) + movlpd %xmm1, 2 * SIZE(CO1) + movlpd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movlpd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movlpd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movlpd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -16 * SIZE(BO) + movaps %xmm13,-14 * SIZE(BO) + movaps %xmm1, -12 * SIZE(BO) + movaps %xmm5, -10 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm1, -14 * SIZE(AO) + movaps %xmm2, -12 * SIZE(AO) + movaps %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L80: + testq $1, N + je .L999 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + je .L100 + +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (BO, %rax, SIZE), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -14 * SIZE(AO), %xmm1 + pxor %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L116 + ALIGN_4 + +.L112: + mulpd -16 * SIZE(BO, %rax, 1), %xmm0 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(BO, %rax, 1), %xmm1 + addpd %xmm1, %xmm9 + movapd -10 * SIZE(AO, %rax, 1), %xmm1 + + addq $4 * SIZE, %rax + BRANCH + jl .L112 + ALIGN_4 + +.L116: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L118 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L117: + mulsd -16 * SIZE(BO, %rax, 1), %xmm0 + addsd %xmm0, %xmm8 + movsd -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L117 + ALIGN_4 + +.L118: + addpd %xmm9, %xmm8 + haddpd %xmm8, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BO), %xmm10 + subsd %xmm8, %xmm10 +#else + movsd -16 * SIZE(AO), %xmm10 + subsd %xmm8, %xmm10 +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#if defined(RN) || defined(RT) + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + + movsd %xmm10, 0 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movlpd %xmm10, -16 * SIZE(BO) +#else + movlpd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + addq %rax, AO + addq %rax, BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L100: + testq $2, M + je .L110 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (BO, %rax, SIZE), BO +#endif + + movddup -16 * SIZE(BO), %xmm0 + pxor %xmm8, %xmm8 + movddup -15 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movddup -14 * SIZE(BO), %xmm2 + pxor %xmm10, %xmm10 + movddup -13 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L106 + ALIGN_4 + +.L102: + mulpd -16 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -12 * SIZE(BO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(AO, %rax, 2), %xmm1 + addpd %xmm1, %xmm9 + movddup -11 * SIZE(BO, %rax, 1), %xmm1 + + mulpd -12 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm2, %xmm10 + movddup -10 * SIZE(BO, %rax, 1), %xmm2 + + mulpd -10 * SIZE(AO, %rax, 2), %xmm3 + addpd %xmm3, %xmm11 + movddup -9 * SIZE(BO, %rax, 1), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L102 + ALIGN_4 + +.L106: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L109 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L107: + movddup -16 * SIZE(BO, %rax, 1), %xmm0 + mulpd -16 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + + addq $SIZE, %rax + jl .L107 + ALIGN_4 + +.L109: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm10 + subpd %xmm8, %xmm10 +#else + movapd -16 * SIZE(AO), %xmm10 + subpd %xmm8, %xmm10 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -14 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#else + movlpd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm10, -16 * SIZE(BO) +#else + movaps %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + addq %rax, BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + movq M, I + sarq $2, I # i = (m >> 2) + jle .L119 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (BO, %rax, SIZE), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + movddup -14 * SIZE(BO), %xmm3 + +#ifndef LN + prefetchw 3 * SIZE(CO1) +#else + prefetchw -8 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L96 + ALIGN_4 + +.L92: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm9 + movddup -12 * SIZE(BO, %rax, 1), %xmm1 + mulpd %xmm5, %xmm0 + mulpd -10 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm0, %xmm10 + movapd (AO, %rax, 4), %xmm0 + addpd %xmm5, %xmm11 + movddup -13 * SIZE(BO, %rax, 1), %xmm5 + mulpd %xmm3, %xmm2 + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm8 + movapd -4 * SIZE(AO, %rax, 4), %xmm2 + addpd %xmm3, %xmm9 + movddup -10 * SIZE(BO, %rax, 1), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm10 + movapd 8 * SIZE(AO, %rax, 4), %xmm2 + addpd %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 1), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L92 + ALIGN_4 + +.L96: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L99 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L97: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm9 + movddup -15 * SIZE(BO, %rax, 1), %xmm1 + + addq $SIZE, %rax + jl .L97 + ALIGN_4 +.L99: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm10 + movapd -14 * SIZE(BO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm9, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm10 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm9, %xmm11 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + movsd -2 * SIZE(AO), %xmm13 + mulsd %xmm9, %xmm13 + subsd %xmm13, %xmm11 + movsd -3 * SIZE(AO), %xmm14 + mulsd %xmm9, %xmm14 + subsd %xmm14, %xmm8 + movsd -4 * SIZE(AO), %xmm15 + mulsd %xmm9, %xmm15 + subsd %xmm15, %xmm10 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -7 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm8 + movsd -8 * SIZE(AO), %xmm14 + mulsd %xmm11, %xmm14 + subsd %xmm14, %xmm10 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -12 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + movsd -14 * SIZE(AO), %xmm14 + mulsd %xmm10, %xmm14 + subsd %xmm14, %xmm11 + movsd -13 * SIZE(AO), %xmm15 + mulsd %xmm10, %xmm15 + subsd %xmm15, %xmm9 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -10 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm11 + movsd -9 * SIZE(AO), %xmm14 + mulsd %xmm8, %xmm14 + subsd %xmm14, %xmm9 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -5 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movlpd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movlpd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movaps %xmm10, -16 * SIZE(BO) + movaps %xmm11, -14 * SIZE(BO) +#else + movaps %xmm10, -16 * SIZE(AO) + movaps %xmm11, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + addq %rax, BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L91 + ALIGN_4 + +.L119: +#ifdef LN + leaq (B, K, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LN_4x4_core2.S b/kernel/x86_64/trsm_kernel_LN_4x4_core2.S new file mode 100644 index 0000000..fc5284a --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LN_4x4_core2.S @@ -0,0 +1,3739 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define J 0(%rsp) +#define OFFSET 8(%rsp) +#define KK 16(%rsp) +#define KKK 24(%rsp) +#define AORIG 32(%rsp) +#define BORIG 40(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCH_W (PREFETCH_R) + +#define PREFETCHSIZE (8 * 17 + 2) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, %rax + + movq %rsp, %r15 # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq %rax, KK + movq %rax, OFFSET + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + +.L01: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq 16 * SIZE + BUFFER, BO + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + movapd -16 * SIZE(B), %xmm0 + movapd -14 * SIZE(B), %xmm1 + movapd -12 * SIZE(B), %xmm2 + movapd -10 * SIZE(B), %xmm3 + movapd -8 * SIZE(B), %xmm4 + movapd -6 * SIZE(B), %xmm5 + movapd -4 * SIZE(B), %xmm6 + movapd -2 * SIZE(B), %xmm7 + + prefetcht0 (PREFETCH_R + 8) * SIZE(B) + movddup %xmm0, %xmm8 + unpckhpd %xmm0, %xmm0 + movddup %xmm1, %xmm9 + unpckhpd %xmm1, %xmm1 + movddup %xmm2, %xmm10 + unpckhpd %xmm2, %xmm2 + movddup %xmm3, %xmm11 + unpckhpd %xmm3, %xmm3 + movddup %xmm4, %xmm12 + unpckhpd %xmm4, %xmm4 + movddup %xmm5, %xmm13 + unpckhpd %xmm5, %xmm5 + movddup %xmm6, %xmm14 + unpckhpd %xmm6, %xmm6 + movddup %xmm7, %xmm15 + unpckhpd %xmm7, %xmm7 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm0, -14 * SIZE(BO) + movapd %xmm9, -12 * SIZE(BO) + movapd %xmm1, -10 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 8) * SIZE(BO) + movapd %xmm10, -8 * SIZE(BO) + movapd %xmm2, -6 * SIZE(BO) + movapd %xmm11, -4 * SIZE(BO) + movapd %xmm3, -2 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 16) * SIZE(BO) + movapd %xmm12, 0 * SIZE(BO) + movapd %xmm4, 2 * SIZE(BO) + movapd %xmm13, 4 * SIZE(BO) + movapd %xmm5, 6 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 24) * SIZE(BO) + movapd %xmm14, 8 * SIZE(BO) + movapd %xmm6, 10 * SIZE(BO) + movapd %xmm15, 12 * SIZE(BO) + movapd %xmm7, 14 * SIZE(BO) + + subq $-16 * SIZE, B + subq $-32 * SIZE, BO + subq $1, %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movapd -16 * SIZE(B), %xmm0 + movapd -14 * SIZE(B), %xmm1 + + movddup %xmm0, %xmm8 + unpckhpd %xmm0, %xmm0 + movddup %xmm1, %xmm9 + unpckhpd %xmm1, %xmm1 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm0, -14 * SIZE(BO) + movapd %xmm9, -12 * SIZE(BO) + movapd %xmm1, -10 * SIZE(BO) + + addq $4 * SIZE, B + addq $8 * SIZE, BO + subq $1, %rax + jne .L04 + ALIGN_4 + +.L10: + leaq (PREFETCH_R + 0) * SIZE(B), BB + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 4), C +#endif + + testq $1, M + je .L20 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + movsd -12 * SIZE(BO), %xmm4 + movsd -10 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + movsd -15 * SIZE(AO), %xmm0 + movsd -8 * SIZE(BO), %xmm2 + movsd -6 * SIZE(BO), %xmm3 + movsd -4 * SIZE(BO), %xmm4 + movsd -2 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + movsd -14 * SIZE(AO), %xmm0 + movsd 0 * SIZE(BO), %xmm2 + movsd 2 * SIZE(BO), %xmm3 + movsd 4 * SIZE(BO), %xmm4 + movsd 6 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + movsd -13 * SIZE(AO), %xmm0 + movsd 8 * SIZE(BO), %xmm2 + movsd 10 * SIZE(BO), %xmm3 + movsd 12 * SIZE(BO), %xmm4 + movsd 14 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + subq $ -4 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + movsd -12 * SIZE(BO), %xmm4 + movsd -10 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm12 + movsd -15 * SIZE(B), %xmm13 + movsd -14 * SIZE(B), %xmm14 + movsd -13 * SIZE(B), %xmm15 +#else + movsd -16 * SIZE(AO), %xmm12 + movsd -15 * SIZE(AO), %xmm13 + movsd -14 * SIZE(AO), %xmm14 + movsd -13 * SIZE(AO), %xmm15 +#endif + + subsd %xmm8, %xmm12 + subsd %xmm9, %xmm13 + subsd %xmm10, %xmm14 + subsd %xmm11, %xmm15 + +#ifdef LN + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 + mulsd %xmm8, %xmm14 + mulsd %xmm8, %xmm15 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 + mulsd %xmm8, %xmm14 + mulsd %xmm8, %xmm15 +#endif + +#ifdef RN + mulsd -16 * SIZE(B), %xmm12 + movlpd -15 * SIZE(B), %xmm9 + mulsd %xmm12, %xmm9 + subsd %xmm9, %xmm13 + movlpd -14 * SIZE(B), %xmm10 + mulsd %xmm12, %xmm10 + subsd %xmm10, %xmm14 + movlpd -13 * SIZE(B), %xmm11 + mulsd %xmm12, %xmm11 + subsd %xmm11, %xmm15 + + mulsd -11 * SIZE(B), %xmm13 + movlpd -10 * SIZE(B), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm14 + movlpd -9 * SIZE(B), %xmm10 + mulsd %xmm13, %xmm10 + subsd %xmm10, %xmm15 + + mulsd -6 * SIZE(B), %xmm14 + movlpd -5 * SIZE(B), %xmm9 + mulsd %xmm14, %xmm9 + subsd %xmm9, %xmm15 + + mulsd -1 * SIZE(B), %xmm15 +#endif + +#ifdef RT + mulsd -1 * SIZE(B), %xmm15 + + movlpd -2 * SIZE(B), %xmm9 + mulsd %xmm15, %xmm9 + subsd %xmm9, %xmm14 + movlpd -3 * SIZE(B), %xmm10 + mulsd %xmm15, %xmm10 + subsd %xmm10, %xmm13 + movlpd -4 * SIZE(B), %xmm11 + mulsd %xmm15, %xmm11 + subsd %xmm11, %xmm12 + + mulsd -6 * SIZE(B), %xmm14 + + movlpd -7 * SIZE(B), %xmm9 + mulsd %xmm14, %xmm9 + subsd %xmm9, %xmm13 + movlpd -8 * SIZE(B), %xmm10 + mulsd %xmm14, %xmm10 + subsd %xmm10, %xmm12 + + mulsd -11 * SIZE(B), %xmm13 + + movlpd -12 * SIZE(B), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm12 + + mulsd -16 * SIZE(B), %xmm12 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm12, 0 * SIZE(CO1) + movsd %xmm13, 0 * SIZE(CO2) + movsd %xmm14, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 0 * SIZE(CO2, LDC, 2) + +#if defined(LN) || defined(LT) + movsd %xmm12, -16 * SIZE(B) + movsd %xmm13, -15 * SIZE(B) + movsd %xmm14, -14 * SIZE(B) + movsd %xmm15, -13 * SIZE(B) + + movsd %xmm12, -16 * SIZE(BO) + movsd %xmm12, -15 * SIZE(BO) + movsd %xmm13, -14 * SIZE(BO) + movsd %xmm13, -13 * SIZE(BO) + movsd %xmm14, -12 * SIZE(BO) + movsd %xmm14, -11 * SIZE(BO) + movsd %xmm15, -10 * SIZE(BO) + movsd %xmm15, -9 * SIZE(BO) +#else + movsd %xmm12, -16 * SIZE(AO) + movsd %xmm13, -15 * SIZE(AO) + movsd %xmm14, -14 * SIZE(AO) + movsd %xmm15, -13 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L20: + testq $2, M + je .L30 + ALIGN_4 + +.L21: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + movapd -14 * SIZE(AO), %xmm0 + movapd -8 * SIZE(BO), %xmm2 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm4 + movapd -2 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + movapd -12 * SIZE(AO), %xmm0 + movapd 0 * SIZE(BO), %xmm2 + movapd 2 * SIZE(BO), %xmm3 + movapd 4 * SIZE(BO), %xmm4 + movapd 6 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + movapd -10 * SIZE(AO), %xmm0 + movapd 8 * SIZE(BO), %xmm2 + movapd 10 * SIZE(BO), %xmm3 + movapd 12 * SIZE(BO), %xmm4 + movapd 14 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L29 + ALIGN_4 + +.L26: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jne .L26 + ALIGN_4 + +.L29: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm11 + movapd -12 * SIZE(B), %xmm13 + movapd -10 * SIZE(B), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + movapd -12 * SIZE(AO), %xmm4 + movapd -10 * SIZE(AO), %xmm6 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 + subpd %xmm10, %xmm4 + subpd %xmm11, %xmm6 +#endif + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm15 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(B), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -14 * SIZE(B), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + movddup -13 * SIZE(B), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm6 + + movddup -11 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + movddup -10 * SIZE(B), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm4 + movddup -9 * SIZE(B), %xmm10 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + + movddup -6 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm4 + + movddup -5 * SIZE(B), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm6 + + movddup -1 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm6 +#endif + +#ifdef RT + movddup -1 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm6 + + movddup -2 * SIZE(B), %xmm9 + mulpd %xmm6, %xmm9 + subpd %xmm9, %xmm4 + movddup -3 * SIZE(B), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + movddup -4 * SIZE(B), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm0 + + movddup -6 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm4 + movddup -7 * SIZE(B), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm2 + movddup -8 * SIZE(B), %xmm10 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + + movddup -11 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + movddup -12 * SIZE(B), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + + movsd %xmm11, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 1 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + + movsd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + movapd %xmm13, -12 * SIZE(B) + movapd %xmm15, -10 * SIZE(B) + + movddup %xmm9, %xmm8 + SHUFPD_3 %xmm9, %xmm9 + movddup %xmm11, %xmm10 + SHUFPD_3 %xmm11, %xmm11 + movddup %xmm13, %xmm12 + SHUFPD_3 %xmm13, %xmm13 + movddup %xmm15, %xmm14 + SHUFPD_3 %xmm15, %xmm15 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + movapd %xmm12, -8 * SIZE(BO) + movapd %xmm13, -6 * SIZE(BO) + movapd %xmm14, -4 * SIZE(BO) + movapd %xmm15, -2 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm2, -14 * SIZE(AO) + movapd %xmm4, -12 * SIZE(AO) + movapd %xmm6, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + movq M, I + sarq $2, I # i = (m >> 2) + jle .L39 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + prefetcht2 0 * SIZE(BB) + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifdef LN + prefetcht2 -3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht2 -3 * SIZE(CO2) + pxor %xmm13, %xmm13 + prefetcht2 -3 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + prefetcht2 -3 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 +#else + prefetcht2 3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht2 3 * SIZE(CO2) + pxor %xmm13, %xmm13 + prefetcht2 3 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + prefetcht2 3 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 +#endif + + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + + subq $-8 * SIZE, BB + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm2, %xmm10 + movapd -16 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm14 + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + movapd -14 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + movapd -10 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm14 + movapd -8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + addpd %xmm5, %xmm15 + movapd -6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + movapd -4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + movapd -2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + movapd -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm14 + movapd 0 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -6 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + addpd %xmm5, %xmm15 + movapd 2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + movapd 4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + movapd 6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm14 + movapd 8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -2 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + addpd %xmm5, %xmm15 + movapd 10 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + addq $32 * SIZE, BO + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + movapd -20 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + subq $-16 * SIZE, AO + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + movapd -18 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + subq $1, %rax + mulpd %xmm1, %xmm5 + + BRANCH + jg .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L19 + ALIGN_4 + +.L16: + movapd -16 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm10 + movapd -16 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm14 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + movapd -14 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + movapd -12 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + movapd -10 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L19: + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm14 + addpd %xmm4, %xmm11 + addpd %xmm5, %xmm15 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd %xmm14, %xmm6 + unpcklpd %xmm15, %xmm14 + unpckhpd %xmm15, %xmm6 + + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm11 + movapd -12 * SIZE(B), %xmm13 + movapd -10 * SIZE(B), %xmm15 + movapd -8 * SIZE(B), %xmm1 + movapd -6 * SIZE(B), %xmm3 + movapd -4 * SIZE(B), %xmm5 + movapd -2 * SIZE(B), %xmm7 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 + subpd %xmm12, %xmm1 + subpd %xmm14, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + movapd -8 * SIZE(AO), %xmm4 + movapd -6 * SIZE(AO), %xmm5 + movapd -4 * SIZE(AO), %xmm6 + movapd -2 * SIZE(AO), %xmm7 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 + subpd %xmm10, %xmm4 + subpd %xmm14, %xmm5 + subpd %xmm11, %xmm6 + subpd %xmm15, %xmm7 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + + movddup -3 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm15 + + movddup -4 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm11 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -7 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm15 + + movddup -8 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm11 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -12 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm3 + + movddup -13 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm7 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -10 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm3 + + movddup -9 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -5 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm7 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 +#endif + + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm3 + + movddup -14 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm5 + + movddup -13 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm7 + + movddup -11 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -10 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm5 + + movddup -9 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -5 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm7 + + movddup -1 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RT + movddup -1 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm5 + + movddup -3 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + + movddup -4 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm1 + + movddup -6 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -7 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm3 + + movddup -8 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm1 + + movddup -11 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -12 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm1 + + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movsd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) + + movsd %xmm11, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 1 * SIZE(CO1, LDC, 2) + movsd %xmm3, 2 * SIZE(CO1, LDC, 2) + movsd %xmm7, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) + + movsd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + movsd %xmm5, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) + movsd %xmm7, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + movapd %xmm13, -12 * SIZE(B) + movapd %xmm15, -10 * SIZE(B) + movapd %xmm1, -8 * SIZE(B) + movapd %xmm3, -6 * SIZE(B) + movapd %xmm5, -4 * SIZE(B) + movapd %xmm7, -2 * SIZE(B) + + movddup %xmm9, %xmm8 + SHUFPD_3 %xmm9, %xmm9 + movddup %xmm11, %xmm10 + SHUFPD_3 %xmm11, %xmm11 + movddup %xmm13, %xmm12 + SHUFPD_3 %xmm13, %xmm13 + movddup %xmm15, %xmm14 + SHUFPD_3 %xmm15, %xmm15 + movddup %xmm1, %xmm0 + SHUFPD_3 %xmm1, %xmm1 + movddup %xmm3, %xmm2 + SHUFPD_3 %xmm3, %xmm3 + movddup %xmm5, %xmm4 + SHUFPD_3 %xmm5, %xmm5 + movddup %xmm7, %xmm6 + SHUFPD_3 %xmm7, %xmm7 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + movapd %xmm12, -8 * SIZE(BO) + movapd %xmm13, -6 * SIZE(BO) + movapd %xmm14, -4 * SIZE(BO) + movapd %xmm15, -2 * SIZE(BO) + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + movapd %xmm2, 4 * SIZE(BO) + movapd %xmm3, 6 * SIZE(BO) + movapd %xmm4, 8 * SIZE(BO) + movapd %xmm5, 10 * SIZE(BO) + movapd %xmm6, 12 * SIZE(BO) + movapd %xmm7, 14 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) + movapd %xmm4, -8 * SIZE(AO) + movapd %xmm5, -6 * SIZE(AO) + movapd %xmm6, -4 * SIZE(AO) + movapd %xmm7, -2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $3, N + je .L999 + + testq $2, N + je .L80 + ALIGN_4 + +.L41: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L43 + ALIGN_4 + +.L42: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + movapd %xmm2, 4 * SIZE(BO) + movapd %xmm3, 6 * SIZE(BO) + movapd %xmm4, 8 * SIZE(BO) + movapd %xmm5, 10 * SIZE(BO) + movapd %xmm6, 12 * SIZE(BO) + movapd %xmm7, 14 * SIZE(BO) + + addq $8 * SIZE, B + addq $16 * SIZE, BO + subq $1, %rax + jne .L42 + ALIGN_4 + +.L43: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L44: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + + addq $2 * SIZE, B + addq $4 * SIZE, BO + decq %rax + jne .L44 + ALIGN_4 + +.L50: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 2), C +#endif + + testq $1, M + je .L60 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -16 * SIZE(AO), %xmm0 + movsd -15 * SIZE(AO), %xmm1 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + movsd -12 * SIZE(BO), %xmm4 + movsd -10 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm1, %xmm4 + mulsd %xmm1, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + movsd -14 * SIZE(AO), %xmm0 + movsd -13 * SIZE(AO), %xmm1 + movsd -8 * SIZE(BO), %xmm2 + movsd -6 * SIZE(BO), %xmm3 + movsd -4 * SIZE(BO), %xmm4 + movsd -2 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm1, %xmm4 + mulsd %xmm1, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L76 + ALIGN_4 + +.L78: + addsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm12 + movsd -15 * SIZE(B), %xmm13 +#else + movsd -16 * SIZE(AO), %xmm12 + movsd -15 * SIZE(AO), %xmm13 +#endif + + subsd %xmm8, %xmm12 + subsd %xmm9, %xmm13 + +#ifdef LN + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 +#endif + +#ifdef RN + mulsd -16 * SIZE(B), %xmm12 + movsd -15 * SIZE(B), %xmm9 + mulsd %xmm12, %xmm9 + subsd %xmm9, %xmm13 + + mulsd -13 * SIZE(B), %xmm13 +#endif + +#ifdef RT + mulsd -13 * SIZE(B), %xmm13 + + movlpd -14 * SIZE(B), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm12 + + mulsd -16 * SIZE(B), %xmm12 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm12, 0 * SIZE(CO1) + movsd %xmm13, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm12, -16 * SIZE(B) + movsd %xmm13, -15 * SIZE(B) + + movsd %xmm12, -16 * SIZE(BO) + movsd %xmm12, -15 * SIZE(BO) + movsd %xmm13, -14 * SIZE(BO) + movsd %xmm13, -13 * SIZE(BO) +#else + movsd %xmm12, -16 * SIZE(AO) + movsd %xmm13, -15 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + testq $2, M + je .L70 + ALIGN_4 + +.L61: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + movapd -8 * SIZE(BO), %xmm2 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm4 + movapd -2 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L69 + ALIGN_4 + +.L66: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L66 + ALIGN_4 + +.L69: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 +#endif + + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(B), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + + movddup -13 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 +#endif + +#ifdef RT + movddup -13 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + + movddup -14 * SIZE(B), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm13, -14 * SIZE(B) + + movddup %xmm9, %xmm8 + SHUFPD_3 %xmm9, %xmm9 + movddup %xmm13, %xmm12 + SHUFPD_3 %xmm13, %xmm13 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm12, -12 * SIZE(BO) + movapd %xmm13, -10 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm2, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + movq M, I + sarq $2, I # i = (m >> 2) + jle .L79 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + +#ifdef LN + prefetcht2 -3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht2 -3 * SIZE(CO2) + pxor %xmm13, %xmm13 +#else + prefetcht2 3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht2 3 * SIZE(CO2) + pxor %xmm13, %xmm13 +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + movapd -8 * SIZE(AO), %xmm0 + movapd -6 * SIZE(AO), %xmm1 + + movapd -8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + movapd -4 * SIZE(AO), %xmm0 + movapd -2 * SIZE(AO), %xmm1 + + movapd -4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L59 + ALIGN_4 + +.L56: + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L56 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm13 + movapd -12 * SIZE(B), %xmm1 + movapd -10 * SIZE(B), %xmm5 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 + subpd %xmm12, %xmm1 + subpd %xmm4, %xmm5 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm13 + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm9 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm9 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm9, %xmm12 + subpd %xmm12, %xmm1 + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm9, %xmm14 + subpd %xmm14, %xmm5 + + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm13, %xmm12 + subpd %xmm12, %xmm5 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(B), %xmm9 + movapd %xmm9, %xmm10 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm3 + + movddup -13 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 +#endif + +#ifdef RT + movddup -13 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -14 * SIZE(B), %xmm9 + movapd %xmm9, %xmm10 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm1 + + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movsd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm13, -14 * SIZE(B) + movapd %xmm1, -12 * SIZE(B) + movapd %xmm5, -10 * SIZE(B) + + movddup %xmm9, %xmm8 + SHUFPD_3 %xmm9, %xmm9 + movddup %xmm13, %xmm12 + SHUFPD_3 %xmm13, %xmm13 + movddup %xmm1, %xmm0 + SHUFPD_3 %xmm1, %xmm1 + movddup %xmm5, %xmm4 + SHUFPD_3 %xmm5, %xmm5 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm12, -12 * SIZE(BO) + movapd %xmm13, -10 * SIZE(BO) + movapd %xmm0, -8 * SIZE(BO) + movapd %xmm1, -6 * SIZE(BO) + movapd %xmm4, -4 * SIZE(BO) + movapd %xmm5, -2 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L80: + testq $1, N + je .L999 + ALIGN_4 + +.L81: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + jle .L83 + ALIGN_4 + +.L82: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + movapd %xmm2, 4 * SIZE(BO) + movapd %xmm3, 6 * SIZE(BO) + movapd %xmm4, 8 * SIZE(BO) + movapd %xmm5, 10 * SIZE(BO) + movapd %xmm6, 12 * SIZE(BO) + movapd %xmm7, 14 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-16 * SIZE, BO + subq $1, %rax + jne .L82 + ALIGN_4 + +.L83: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax + BRANCH + jle .L90 + ALIGN_4 + +.L84: + movddup -16 * SIZE(B), %xmm0 + + movapd %xmm0, 0 * SIZE(BO) + + addq $1 * SIZE, B + addq $2 * SIZE, BO + subq $1, %rax + jne .L84 + ALIGN_4 + +.L90: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + + testq $1, M + je .L100 + ALIGN_4 + +.L111: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L115 + ALIGN_4 + +.L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -16 * SIZE(AO), %xmm0 + movsd -15 * SIZE(AO), %xmm1 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm1, %xmm3 + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + + movsd -14 * SIZE(AO), %xmm0 + movsd -13 * SIZE(AO), %xmm1 + movsd -12 * SIZE(BO), %xmm2 + movsd -10 * SIZE(BO), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm1, %xmm3 + addsd %xmm2, %xmm10 + addsd %xmm3, %xmm11 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L116 + ALIGN_4 + +.L118: + addsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + addsd %xmm9, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm10 + subsd %xmm8, %xmm10 +#else + movsd -16 * SIZE(AO), %xmm10 + subsd %xmm8, %xmm10 +#endif + +#ifdef LN + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#ifdef RN + movsd -16 * SIZE(B), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef RT + movsd -16 * SIZE(B), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, -16 * SIZE(B) + + movlpd %xmm10, -16 * SIZE(BO) + movlpd %xmm10, -15 * SIZE(BO) +#else + movsd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $1 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L100: + testq $2, M + je .L110 + ALIGN_4 + +.L101: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L105 + ALIGN_4 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + movapd -12 * SIZE(BO), %xmm2 + movapd -10 * SIZE(BO), %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L109 + ALIGN_4 + +.L106: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm8 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L106 + ALIGN_4 + +.L109: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm10 + subpd %xmm8, %xmm10 +#else + movapd -16 * SIZE(AO), %xmm10 + subpd %xmm8, %xmm10 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -14 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm10, -16 * SIZE(B) + + movddup %xmm10, %xmm8 + SHUFPD_3 %xmm10, %xmm10 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm10, -14 * SIZE(BO) +#else + movapd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + movq M, I + sarq $2, I # i = (m >> 2) + jle .L119 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 +#ifdef LN + prefetcht2 -3 * SIZE(CO1) +#else + prefetcht2 3 * SIZE(CO1) +#endif + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L95 + ALIGN_4 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + movapd -14 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 + + movapd -8 * SIZE(AO), %xmm0 + movapd -6 * SIZE(AO), %xmm1 + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + movapd -4 * SIZE(AO), %xmm0 + movapd -2 * SIZE(AO), %xmm1 + movapd -10 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L99 + ALIGN_4 + +.L96: + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L96 + ALIGN_4 + +.L99: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm10 + movapd -14 * SIZE(B), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm9, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm10 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm9, %xmm11 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + movsd -2 * SIZE(AO), %xmm13 + mulsd %xmm9, %xmm13 + subsd %xmm13, %xmm11 + movsd -3 * SIZE(AO), %xmm14 + mulsd %xmm9, %xmm14 + subsd %xmm14, %xmm8 + movsd -4 * SIZE(AO), %xmm15 + mulsd %xmm9, %xmm15 + subsd %xmm15, %xmm10 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -7 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm8 + movsd -8 * SIZE(AO), %xmm14 + mulsd %xmm11, %xmm14 + subsd %xmm14, %xmm10 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -12 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + movsd -14 * SIZE(AO), %xmm14 + mulsd %xmm10, %xmm14 + subsd %xmm14, %xmm11 + movsd -13 * SIZE(AO), %xmm15 + mulsd %xmm10, %xmm15 + subsd %xmm15, %xmm9 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -10 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm11 + movsd -9 * SIZE(AO), %xmm14 + mulsd %xmm8, %xmm14 + subsd %xmm14, %xmm9 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -5 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm10, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + + movddup %xmm10, %xmm8 + SHUFPD_3 %xmm10, %xmm10 + movddup %xmm11, %xmm9 + SHUFPD_3 %xmm11, %xmm11 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm10, -14 * SIZE(BO) + movapd %xmm9, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) +#else + movapd %xmm10, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L91 + ALIGN_4 + +.L119: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + + +.L999: + movq %r15, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LN_4x4_penryn.S b/kernel/x86_64/trsm_kernel_LN_4x4_penryn.S new file mode 100644 index 0000000..09f9122 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LN_4x4_penryn.S @@ -0,0 +1,3425 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCHSIZE (8 * 21 + 6) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + leaq (, LDC, SIZE), LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $2, J + NOBRANCH + jle .L40 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 2, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + BRANCH + jle .L20 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movaps -16 * SIZE(BO), %xmm2 + movaps -14 * SIZE(BO), %xmm3 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -15 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -10 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm10 + movaps -8 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm11 + movaps -6 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -13 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -4 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -2 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -12 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm10 + movaps 0 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm11 + movaps 2 * SIZE(BO), %xmm3 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -15 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -10 * SIZE(BO), %xmm3 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(LT) + movaps -16 * SIZE(BO), %xmm12 + movaps -14 * SIZE(BO), %xmm13 +#else + movaps -16 * SIZE(AO), %xmm12 + movaps -14 * SIZE(AO), %xmm13 +#endif + + subpd %xmm8, %xmm12 + subpd %xmm9, %xmm13 + +#if defined(RN) || defined(RT) + movhlps %xmm13, %xmm15 + movsd %xmm13, %xmm14 + movhlps %xmm12, %xmm13 + movsd %xmm12, %xmm12 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm12 + mulpd %xmm8, %xmm13 +#endif + +#ifdef RN + mulsd -16 * SIZE(BO), %xmm12 + movlpd -15 * SIZE(BO), %xmm9 + mulsd %xmm12, %xmm9 + subsd %xmm9, %xmm13 + movlpd -14 * SIZE(BO), %xmm10 + mulsd %xmm12, %xmm10 + subsd %xmm10, %xmm14 + movlpd -13 * SIZE(BO), %xmm11 + mulsd %xmm12, %xmm11 + subsd %xmm11, %xmm15 + + mulsd -11 * SIZE(BO), %xmm13 + movlpd -10 * SIZE(BO), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm14 + movlpd -9 * SIZE(BO), %xmm10 + mulsd %xmm13, %xmm10 + subsd %xmm10, %xmm15 + + mulsd -6 * SIZE(BO), %xmm14 + movlpd -5 * SIZE(BO), %xmm9 + mulsd %xmm14, %xmm9 + subsd %xmm9, %xmm15 + + mulsd -1 * SIZE(BO), %xmm15 +#endif + +#ifdef RT + mulsd -1 * SIZE(BO), %xmm15 + + movlpd -2 * SIZE(BO), %xmm9 + mulsd %xmm15, %xmm9 + subsd %xmm9, %xmm14 + movlpd -3 * SIZE(BO), %xmm10 + mulsd %xmm15, %xmm10 + subsd %xmm10, %xmm13 + movlpd -4 * SIZE(BO), %xmm11 + mulsd %xmm15, %xmm11 + subsd %xmm11, %xmm12 + + mulsd -6 * SIZE(BO), %xmm14 + + movlpd -7 * SIZE(BO), %xmm9 + mulsd %xmm14, %xmm9 + subsd %xmm9, %xmm13 + movlpd -8 * SIZE(BO), %xmm10 + mulsd %xmm14, %xmm10 + subsd %xmm10, %xmm12 + + mulsd -11 * SIZE(BO), %xmm13 + + movlpd -12 * SIZE(BO), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm12 + + mulsd -16 * SIZE(BO), %xmm12 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm12, 0 * SIZE(CO1) + movhps %xmm12, 0 * SIZE(CO2) + movsd %xmm13, 0 * SIZE(CO1, LDC, 2) + movhps %xmm13, 0 * SIZE(CO2, LDC, 2) + + movaps %xmm12, -16 * SIZE(BO) + movaps %xmm13, -14 * SIZE(BO) +#else + movsd %xmm12, 0 * SIZE(CO1) + movsd %xmm13, 0 * SIZE(CO2) + movsd %xmm14, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 0 * SIZE(CO2, LDC, 2) + + movsd %xmm12, -16 * SIZE(AO) + movsd %xmm13, -15 * SIZE(AO) + movsd %xmm14, -14 * SIZE(AO) + movsd %xmm15, -13 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L20: + testq $2, M + BRANCH + jle .L30 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm3, %xmm3 + movaps -16 * SIZE(BO), %xmm2 + pxor %xmm5, %xmm5 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_4 + +.L22: + addpd %xmm3, %xmm11 + movaps -14 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -10 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -8 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -6 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -4 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -2 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + subq $ -8 * SIZE, AO + + addpd %xmm2, %xmm9 + movaps 0 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + addpd %xmm3, %xmm11 + movaps -14 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm3, %xmm11 + addpd %xmm5, %xmm10 + + movapd %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + movapd %xmm10, %xmm0 + movsd %xmm11, %xmm10 + movsd %xmm0, %xmm11 + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + movapd -12 * SIZE(AO), %xmm4 + movapd -10 * SIZE(AO), %xmm6 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 + subpd %xmm10, %xmm4 + subpd %xmm11, %xmm6 +#endif + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm15 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm6 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + movddup -10 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm4 + movddup -9 * SIZE(BO), %xmm10 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + + movddup -5 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm6 + + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 +#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + + movddup -2 * SIZE(BO), %xmm9 + mulpd %xmm6, %xmm9 + subpd %xmm9, %xmm4 + movddup -3 * SIZE(BO), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + movddup -4 * SIZE(BO), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm0 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm2 + movddup -8 * SIZE(BO), %xmm10 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + movddup -12 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + + movsd %xmm11, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 1 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + + movsd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) + movapd %xmm13, -12 * SIZE(BO) + movapd %xmm15, -10 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm2, -14 * SIZE(AO) + movapd %xmm4, -12 * SIZE(AO) + movapd %xmm6, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L39 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + prefetcht2 -16 * SIZE(BB) + subq $-8 * SIZE, BB + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm3, %xmm3 + movaps -14 * SIZE(AO), %xmm1 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BO), %xmm2 + + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + +#ifdef LN + prefetcht0 -4 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + prefetcht0 -4 * SIZE(CO2) + movapd %xmm4, %xmm10 + movapd %xmm4, %xmm11 + + prefetcht0 -4 * SIZE(CO1, LDC, 2) + movapd %xmm4, %xmm12 + movapd %xmm4, %xmm13 + prefetcht0 -4 * SIZE(CO2, LDC, 2) + movapd %xmm4, %xmm14 + movapd %xmm4, %xmm15 +#else + prefetcht0 3 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + prefetcht0 3 * SIZE(CO2) + movapd %xmm4, %xmm10 + movapd %xmm4, %xmm11 + + prefetcht0 3 * SIZE(CO1, LDC, 2) + movapd %xmm4, %xmm12 + movapd %xmm4, %xmm13 + prefetcht0 3 * SIZE(CO2, LDC, 2) + movapd %xmm4, %xmm14 + movapd %xmm4, %xmm15 +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + addpd %xmm3, %xmm11 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -8 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps -6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -4 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps -2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 0 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps 2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 4 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 6 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps 6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 8 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps 10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 12 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 14 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps 14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 16 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + subq $-32 * SIZE, AO + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -14 * SIZE(AO), %xmm1 + + subq $-32 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addpd %xmm3, %xmm11 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $4, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm3, %xmm11 + addpd %xmm4, %xmm15 + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + + movapd %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + movapd %xmm10, %xmm0 + movsd %xmm11, %xmm10 + movsd %xmm0, %xmm11 + + movapd %xmm12, %xmm0 + movsd %xmm13, %xmm12 + movsd %xmm0, %xmm13 + + movapd %xmm14, %xmm0 + movsd %xmm15, %xmm14 + movsd %xmm0, %xmm15 + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd %xmm14, %xmm6 + unpcklpd %xmm15, %xmm14 + unpckhpd %xmm15, %xmm6 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + movapd -8 * SIZE(BO), %xmm1 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm5 + movapd -2 * SIZE(BO), %xmm7 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 + subpd %xmm12, %xmm1 + subpd %xmm14, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + movapd -8 * SIZE(AO), %xmm4 + movapd -6 * SIZE(AO), %xmm5 + movapd -4 * SIZE(AO), %xmm6 + movapd -2 * SIZE(AO), %xmm7 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 + subpd %xmm10, %xmm4 + subpd %xmm14, %xmm5 + subpd %xmm11, %xmm6 + subpd %xmm15, %xmm7 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + + movddup -3 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm15 + + movddup -4 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm11 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -7 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm15 + + movddup -8 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm11 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -12 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm3 + + movddup -13 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm7 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -10 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm3 + + movddup -9 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -5 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm7 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 +#endif + + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm3 + + movddup -14 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm5 + + movddup -13 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm7 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -10 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm5 + + movddup -9 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -5 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm7 + + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm5 + + movddup -3 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + + movddup -4 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm1 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -7 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm3 + + movddup -8 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm1 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -12 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm1 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movsd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) + + movsd %xmm11, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 1 * SIZE(CO1, LDC, 2) + movsd %xmm3, 2 * SIZE(CO1, LDC, 2) + movsd %xmm7, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) + + movsd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + movsd %xmm5, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) + movsd %xmm7, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) + movapd %xmm13, -12 * SIZE(BO) + movapd %xmm15, -10 * SIZE(BO) + movapd %xmm1, -8 * SIZE(BO) + movapd %xmm3, -6 * SIZE(BO) + movapd %xmm5, -4 * SIZE(BO) + movapd %xmm7, -2 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) + movapd %xmm4, -8 * SIZE(AO) + movapd %xmm5, -6 * SIZE(AO) + movapd %xmm6, -4 * SIZE(AO) + movapd %xmm7, -2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L40: + testq $2, N + BRANCH + jle .L80 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + BRANCH + jle .L60 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movaps -16 * SIZE(BO), %xmm2 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -15 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -14 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -13 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -10 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -12 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm9 + movaps -8 * SIZE(BO), %xmm2 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -15 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_4 + +.L78: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm9, %xmm8 + movhlps %xmm8, %xmm9 + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BO), %xmm12 + movsd -15 * SIZE(BO), %xmm13 +#else + movsd -16 * SIZE(AO), %xmm12 + movsd -15 * SIZE(AO), %xmm13 +#endif + + subsd %xmm8, %xmm12 + subsd %xmm9, %xmm13 + +#ifdef LN + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 +#endif + +#ifdef RN + mulsd -16 * SIZE(BO), %xmm12 + movsd -15 * SIZE(BO), %xmm9 + mulsd %xmm12, %xmm9 + subsd %xmm9, %xmm13 + + mulsd -13 * SIZE(BO), %xmm13 +#endif + +#ifdef RT + mulsd -13 * SIZE(BO), %xmm13 + + movlpd -14 * SIZE(BO), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm12 + + mulsd -16 * SIZE(BO), %xmm12 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm12, 0 * SIZE(CO1) + movsd %xmm13, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm12, -16 * SIZE(BO) + movsd %xmm13, -15 * SIZE(BO) +#else + movsd %xmm12, -16 * SIZE(AO) + movsd %xmm13, -15 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + testq $2, M + BRANCH + jle .L70 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + movaps -16 * SIZE(BO), %xmm2 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm11 + addpd %xmm7, %xmm10 + movaps -12 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -10 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm11 + addpd %xmm7, %xmm10 + movaps -8 * SIZE(BO), %xmm2 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + movapd %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 +#endif + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + + movddup -14 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm13, -14 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm2, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L79 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + prefetcht2 -16 * SIZE(BB) + subq $-4 * SIZE, BB + + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -16 * SIZE(BO), %xmm2 + +#ifdef LN + prefetcht0 -4 * SIZE(CO1) + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + prefetcht0 -4 * SIZE(CO2) + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 +#else + prefetcht0 3 * SIZE(CO1) + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + prefetcht0 3 * SIZE(CO2) + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_4 + +.L52: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -14 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -10 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -8 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -14 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + movapd %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + movapd %xmm12, %xmm0 + movsd %xmm13, %xmm12 + movsd %xmm0, %xmm13 + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + movapd -12 * SIZE(BO), %xmm1 + movapd -10 * SIZE(BO), %xmm5 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 + subpd %xmm12, %xmm1 + subpd %xmm4, %xmm5 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm13 + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm9 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm9 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm9, %xmm12 + subpd %xmm12, %xmm1 + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm9, %xmm14 + subpd %xmm14, %xmm5 + + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm13, %xmm12 + subpd %xmm12, %xmm5 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(BO), %xmm9 + movapd %xmm9, %xmm10 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm3 + + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -14 * SIZE(BO), %xmm9 + movapd %xmm9, %xmm10 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm1 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movsd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm13, -14 * SIZE(BO) + movapd %xmm1, -12 * SIZE(BO) + movapd %xmm5, -10 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L80: + testq $1, N + BRANCH + jle .L999 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + BRANCH + jle .L90 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L115 + ALIGN_4 + +.L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -15 * SIZE(AO), %xmm0 + movsd -15 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -14 * SIZE(AO), %xmm0 + movsd -14 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -13 * SIZE(AO), %xmm0 + movsd -13 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -12 * SIZE(AO), %xmm0 + movsd -12 * SIZE(BO), %xmm2 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -15 * SIZE(AO), %xmm0 + movsd -15 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L116 + ALIGN_4 + +.L118: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + addpd %xmm9, %xmm8 + + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BO), %xmm10 + subsd %xmm8, %xmm10 +#else + movsd -16 * SIZE(AO), %xmm10 + subsd %xmm8, %xmm10 +#endif + +#ifdef LN + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#ifdef RN + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef RT + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, -16 * SIZE(BO) +#else + movsd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L90: + testq $2, M + BRANCH + jle .L110 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movsd -16 * SIZE(BO), %xmm2 + pxor %xmm9, %xmm9 + movhps -15 * SIZE(BO), %xmm2 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L105 + ALIGN_4 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -14 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + pshufd $0x44, %xmm2, %xmm3 + movsd -14 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm9 + + pshufd $0x44, %xmm2, %xmm3 + movsd -13 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -10 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + pshufd $0x44, %xmm2, %xmm3 + movsd -12 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -8 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm9 + + subq $-8 * SIZE, AO + subq $-4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + pshufd $0x44, %xmm2, %xmm3 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -14 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L106 + ALIGN_4 + +.L108: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + addpd %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm10 + subpd %xmm8, %xmm10 +#else + movapd -16 * SIZE(AO), %xmm10 + subpd %xmm8, %xmm10 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -14 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm10, -16 * SIZE(BO) +#else + movapd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L119 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movsd -16 * SIZE(BO), %xmm2 + +#ifdef LN + prefetcht0 -4 * SIZE(CO1) +#else + prefetcht0 3 * SIZE(CO1) +#endif + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L95 + ALIGN_4 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -14 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -13 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -12 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps 2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + subq $-16 * SIZE, AO + subq $ -4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L96 + ALIGN_4 + +.L98: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm10 + movapd -14 * SIZE(BO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm12, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm10 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm12, %xmm11 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + movsd -2 * SIZE(AO), %xmm13 + mulsd %xmm9, %xmm13 + subsd %xmm13, %xmm11 + movsd -3 * SIZE(AO), %xmm14 + mulsd %xmm9, %xmm14 + subsd %xmm14, %xmm8 + movsd -4 * SIZE(AO), %xmm15 + mulsd %xmm9, %xmm15 + subsd %xmm15, %xmm10 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -7 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm8 + movsd -8 * SIZE(AO), %xmm14 + mulsd %xmm11, %xmm14 + subsd %xmm14, %xmm10 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -12 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + movsd -14 * SIZE(AO), %xmm14 + mulsd %xmm10, %xmm14 + subsd %xmm14, %xmm11 + movsd -13 * SIZE(AO), %xmm15 + mulsd %xmm10, %xmm15 + subsd %xmm15, %xmm9 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -10 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm11 + movsd -9 * SIZE(AO), %xmm14 + mulsd %xmm8, %xmm14 + subsd %xmm14, %xmm9 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -5 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm10, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) +#else + movapd %xmm10, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L91 + ALIGN_4 + +.L119: +#ifdef LN + leaq (B, K, SIZE), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LN_4x4_sse2.S b/kernel/x86_64/trsm_kernel_LN_4x4_sse2.S new file mode 100644 index 0000000..ca0bfbd --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LN_4x4_sse2.S @@ -0,0 +1,4150 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define OFFSET 16(%rsp) +#define KK 24(%rsp) +#define KKK 32(%rsp) +#define AORIG 40(%rsp) +#define BORIG 48(%rsp) +#define BUFFER 128(%rsp) + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHNTA prefetchnta +#ifndef ALLOC_HUGETLB +#define PREFETCHSIZE (8 * 4 + 4) +#else +#define PREFETCHSIZE (8 * 2 + 4) +#endif +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 4 + 4) +#endif + +#define KERNEL1(xx) \ + mulpd %xmm8, %xmm9 ;\ + addpd %xmm9, %xmm0 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm11, %xmm1 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm8, %xmm13 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm8, %xmm3 ;\ + movapd 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL2(xx) \ + mulpd %xmm10, %xmm9 ;\ + addpd %xmm9, %xmm4 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm10, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm10, %xmm13 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm10, %xmm7 ;\ + movapd 10 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL3(xx) \ + mulpd %xmm12, %xmm15 ;\ + addpd %xmm15, %xmm0 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm12, %xmm11 ;\ + addpd %xmm11, %xmm1 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm12, %xmm13 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm12, %xmm3 ;\ + movapd 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL4(xx) \ + mulpd %xmm14, %xmm15 ;\ + addpd %xmm15, %xmm4 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm14, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm14, %xmm13 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm14, %xmm7 ;\ + movapd 14 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + +#define KERNEL5(xx) \ + mulpd %xmm8, %xmm9 ;\ + addpd %xmm9, %xmm0 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm11, %xmm1 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm8, %xmm13 ;\ + mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm8, %xmm3 ;\ + movapd 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL6(xx) \ + mulpd %xmm10, %xmm9 ;\ + addpd %xmm9, %xmm4 ;\ + movapd 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm10, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm10, %xmm13 ;\ + mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm10, %xmm7 ;\ + movapd 18 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL7(xx) \ + mulpd %xmm12, %xmm15 ;\ + addpd %xmm15, %xmm0 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm12, %xmm11 ;\ + addpd %xmm11, %xmm1 ;\ + movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm12, %xmm13 ;\ + mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm12, %xmm3 ;\ + movapd 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL8(xx) \ + mulpd %xmm14, %xmm15 ;\ + addpd %xmm15, %xmm4 ;\ + movapd 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm14, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 34 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm14, %xmm13 ;\ + mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm14, %xmm7 ;\ + movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movsd %xmm4, OFFSET + movsd %xmm4, KK + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + +.L01: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + + addq %rax, %rax + ALIGN_4 + +.L02: + PREFETCHNTA 40 * SIZE(B) + + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + addq $16 * SIZE, BO + addq $ 8 * SIZE, B + + movsd %xmm0, -16 * SIZE(BO) + movsd %xmm0, -15 * SIZE(BO) + movsd %xmm1, -14 * SIZE(BO) + movsd %xmm1, -13 * SIZE(BO) + movsd %xmm2, -12 * SIZE(BO) + movsd %xmm2, -11 * SIZE(BO) + movsd %xmm3, -10 * SIZE(BO) + movsd %xmm3, -9 * SIZE(BO) + movsd %xmm4, -8 * SIZE(BO) + movsd %xmm4, -7 * SIZE(BO) + movsd %xmm5, -6 * SIZE(BO) + movsd %xmm5, -5 * SIZE(BO) + movsd %xmm6, -4 * SIZE(BO) + movsd %xmm6, -3 * SIZE(BO) + movsd %xmm7, -2 * SIZE(BO) + movsd %xmm7, -1 * SIZE(BO) + + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm0, 1 * SIZE(BO) + movsd %xmm1, 2 * SIZE(BO) + movsd %xmm1, 3 * SIZE(BO) + movsd %xmm2, 4 * SIZE(BO) + movsd %xmm2, 5 * SIZE(BO) + movsd %xmm3, 6 * SIZE(BO) + movsd %xmm3, 7 * SIZE(BO) + + addq $4 * SIZE, B + addq $8 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 4), C +#endif + + testq $1, M + je .L20 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movsd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movsd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movsd 16 * SIZE(BO), %xmm13 + movsd 24 * SIZE(BO), %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm1 + movsd 4 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + mulsd 6 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm2 + movsd 32 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm3 + movsd 1 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm11 + addsd %xmm11, %xmm0 + movsd 10 * SIZE(BO), %xmm11 + mulsd %xmm8, %xmm11 + addsd %xmm11, %xmm1 + movsd 12 * SIZE(BO), %xmm11 + mulsd %xmm8, %xmm11 + mulsd 14 * SIZE(BO), %xmm8 + addsd %xmm11, %xmm2 + movsd 40 * SIZE(BO), %xmm11 + addsd %xmm8, %xmm3 + movsd 2 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm13 + addsd %xmm13, %xmm0 + movsd 18 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm13 + addsd %xmm13, %xmm1 + movsd 20 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm13 + mulsd 22 * SIZE(BO), %xmm8 + addsd %xmm13, %xmm2 + movsd 48 * SIZE(BO), %xmm13 + addsd %xmm8, %xmm3 + movsd 3 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm15 + addsd %xmm15, %xmm0 + movsd 26 * SIZE(BO), %xmm15 + mulsd %xmm8, %xmm15 + addsd %xmm15, %xmm1 + movsd 28 * SIZE(BO), %xmm15 + mulsd %xmm8, %xmm15 + mulsd 30 * SIZE(BO), %xmm8 + addsd %xmm15, %xmm2 + movsd 56 * SIZE(BO), %xmm15 + addsd %xmm8, %xmm3 + movsd 4 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm0 + movsd 34 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm1 + movsd 36 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + mulsd 38 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm2 + movsd 64 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm3 + movsd 5 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm11 + addsd %xmm11, %xmm0 + movsd 42 * SIZE(BO), %xmm11 + mulsd %xmm8, %xmm11 + addsd %xmm11, %xmm1 + movsd 44 * SIZE(BO), %xmm11 + mulsd %xmm8, %xmm11 + mulsd 46 * SIZE(BO), %xmm8 + addsd %xmm11, %xmm2 + movsd 72 * SIZE(BO), %xmm11 + addsd %xmm8, %xmm3 + movsd 6 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm13 + addsd %xmm13, %xmm0 + movsd 50 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm13 + addsd %xmm13, %xmm1 + movsd 52 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm13 + mulsd 54 * SIZE(BO), %xmm8 + addsd %xmm13, %xmm2 + movsd 80 * SIZE(BO), %xmm13 + addsd %xmm8, %xmm3 + movsd 7 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm15 + addsd %xmm15, %xmm0 + movsd 58 * SIZE(BO), %xmm15 + mulsd %xmm8, %xmm15 + addsd %xmm15, %xmm1 + movsd 60 * SIZE(BO), %xmm15 + mulsd %xmm8, %xmm15 + mulsd 62 * SIZE(BO), %xmm8 + addsd %xmm15, %xmm2 + movsd 88 * SIZE(BO), %xmm15 + addsd %xmm8, %xmm3 + movsd 8 * SIZE(AO), %xmm8 + + addq $ 8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm0 + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm1 + movsd 4 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + mulsd 6 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm2 + movsd 8 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm3 + movsd 1 * SIZE(AO), %xmm8 + + addq $1 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm4 + movsd 1 * SIZE(B), %xmm5 + movsd 2 * SIZE(B), %xmm6 + movsd 3 * SIZE(B), %xmm7 +#else + movsd 0 * SIZE(AO), %xmm4 + movsd 1 * SIZE(AO), %xmm5 + movsd 2 * SIZE(AO), %xmm6 + movsd 3 * SIZE(AO), %xmm7 +#endif + + subsd %xmm0, %xmm4 + subsd %xmm1, %xmm5 + subsd %xmm2, %xmm6 + subsd %xmm3, %xmm7 + +#ifdef LN + movsd 0 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + mulsd %xmm0, %xmm6 + mulsd %xmm0, %xmm7 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + mulsd %xmm0, %xmm6 + mulsd %xmm0, %xmm7 +#endif + +#ifdef RN + mulsd 0 * SIZE(B), %xmm4 + movlpd 1 * SIZE(B), %xmm1 + mulsd %xmm4, %xmm1 + subsd %xmm1, %xmm5 + movlpd 2 * SIZE(B), %xmm2 + mulsd %xmm4, %xmm2 + subsd %xmm2, %xmm6 + movlpd 3 * SIZE(B), %xmm3 + mulsd %xmm4, %xmm3 + subsd %xmm3, %xmm7 + + mulsd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm1 + mulsd %xmm5, %xmm1 + subsd %xmm1, %xmm6 + movlpd 7 * SIZE(B), %xmm2 + mulsd %xmm5, %xmm2 + subsd %xmm2, %xmm7 + + mulsd 10 * SIZE(B), %xmm6 + movlpd 11 * SIZE(B), %xmm1 + mulsd %xmm6, %xmm1 + subsd %xmm1, %xmm7 + + mulsd 15 * SIZE(B), %xmm7 +#endif + +#ifdef RT + mulsd 15 * SIZE(B), %xmm7 + + movlpd 14 * SIZE(B), %xmm1 + mulsd %xmm7, %xmm1 + subsd %xmm1, %xmm6 + movlpd 13 * SIZE(B), %xmm2 + mulsd %xmm7, %xmm2 + subsd %xmm2, %xmm5 + movlpd 12 * SIZE(B), %xmm3 + mulsd %xmm7, %xmm3 + subsd %xmm3, %xmm4 + + mulsd 10 * SIZE(B), %xmm6 + + movlpd 9 * SIZE(B), %xmm1 + mulsd %xmm6, %xmm1 + subsd %xmm1, %xmm5 + movlpd 8 * SIZE(B), %xmm2 + mulsd %xmm6, %xmm2 + subsd %xmm2, %xmm4 + + mulsd 5 * SIZE(B), %xmm5 + + movlpd 4 * SIZE(B), %xmm1 + mulsd %xmm5, %xmm1 + subsd %xmm1, %xmm4 + + mulsd 0 * SIZE(B), %xmm4 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm4, 0 * SIZE(CO1) + movsd %xmm5, 0 * SIZE(CO2) + movsd %xmm6, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 0 * SIZE(CO2, LDC, 2) + +#if defined(LN) || defined(LT) + movsd %xmm4, 0 * SIZE(B) + movsd %xmm5, 1 * SIZE(B) + movsd %xmm6, 2 * SIZE(B) + movsd %xmm7, 3 * SIZE(B) + + movsd %xmm4, 0 * SIZE(BO) + movsd %xmm4, 1 * SIZE(BO) + movsd %xmm5, 2 * SIZE(BO) + movsd %xmm5, 3 * SIZE(BO) + movsd %xmm6, 4 * SIZE(BO) + movsd %xmm6, 5 * SIZE(BO) + movsd %xmm7, 6 * SIZE(BO) + movsd %xmm7, 7 * SIZE(BO) +#else + movsd %xmm4, 0 * SIZE(AO) + movsd %xmm5, 1 * SIZE(AO) + movsd %xmm6, 2 * SIZE(AO) + movsd %xmm7, 3 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L20: + testq $2, M + je .L30 + ALIGN_4 + +.L21: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(BO), %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 32 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 2 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movapd 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm1 + movapd 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm11, %xmm2 + movapd 40 * SIZE(BO), %xmm11 + addpd %xmm8, %xmm3 + movapd 4 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm13 + addpd %xmm13, %xmm0 + movapd 18 * SIZE(BO), %xmm13 + mulpd %xmm8, %xmm13 + addpd %xmm13, %xmm1 + movapd 20 * SIZE(BO), %xmm13 + mulpd %xmm8, %xmm13 + mulpd 22 * SIZE(BO), %xmm8 + addpd %xmm13, %xmm2 + movapd 48 * SIZE(BO), %xmm13 + addpd %xmm8, %xmm3 + movapd 6 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm15 + addpd %xmm15, %xmm0 + movapd 26 * SIZE(BO), %xmm15 + mulpd %xmm8, %xmm15 + addpd %xmm15, %xmm1 + movapd 28 * SIZE(BO), %xmm15 + mulpd %xmm8, %xmm15 + mulpd 30 * SIZE(BO), %xmm8 + addpd %xmm15, %xmm2 + movapd 56 * SIZE(BO), %xmm15 + addpd %xmm8, %xmm3 + movapd 16 * SIZE(AO), %xmm8 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movapd 34 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm1 + movapd 36 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + mulpd 38 * SIZE(BO), %xmm10 + addpd %xmm9, %xmm2 + movapd 64 * SIZE(BO), %xmm9 + addpd %xmm10, %xmm3 + movapd 10 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movapd 42 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movapd 44 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + mulpd 46 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm2 + movapd 72 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm3 + movapd 12 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm13 + addpd %xmm13, %xmm0 + movapd 50 * SIZE(BO), %xmm13 + mulpd %xmm10, %xmm13 + addpd %xmm13, %xmm1 + movapd 52 * SIZE(BO), %xmm13 + mulpd %xmm10, %xmm13 + mulpd 54 * SIZE(BO), %xmm10 + addpd %xmm13, %xmm2 + movapd 80 * SIZE(BO), %xmm13 + addpd %xmm10, %xmm3 + movapd 14 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm15 + addpd %xmm15, %xmm0 + movapd 58 * SIZE(BO), %xmm15 + mulpd %xmm10, %xmm15 + addpd %xmm15, %xmm1 + movapd 60 * SIZE(BO), %xmm15 + mulpd %xmm10, %xmm15 + mulpd 62 * SIZE(BO), %xmm10 + addpd %xmm15, %xmm2 + movapd 88 * SIZE(BO), %xmm15 + addpd %xmm10, %xmm3 + movapd 24 * SIZE(AO), %xmm10 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L29 + ALIGN_4 + +.L26: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 8 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 2 * SIZE(AO), %xmm8 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L29: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm10 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm10 + + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm3 + movapd 4 * SIZE(B), %xmm5 + movapd 6 * SIZE(B), %xmm7 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm8, %xmm5 + subpd %xmm10, %xmm7 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm10 + movapd 4 * SIZE(AO), %xmm12 + movapd 6 * SIZE(AO), %xmm14 + + subpd %xmm0, %xmm8 + subpd %xmm1, %xmm10 + subpd %xmm2, %xmm12 + subpd %xmm3, %xmm14 +#endif + +#ifdef LN + movlpd 3 * SIZE(AO), %xmm0 + movhpd 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movlpd 2 * SIZE(AO), %xmm2 + movhpd 2 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + movlpd 2 * SIZE(AO), %xmm2 + movhpd 2 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm3 + + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 + + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm3, %xmm2 + subpd %xmm2, %xmm7 + + movlpd 3 * SIZE(AO), %xmm0 + movhpd 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movlpd 2 * SIZE(B), %xmm2 + movhpd 2 * SIZE(B), %xmm2 + mulpd %xmm8, %xmm2 + subpd %xmm2, %xmm12 + movlpd 3 * SIZE(B), %xmm3 + movhpd 3 * SIZE(B), %xmm3 + mulpd %xmm8, %xmm3 + subpd %xmm3, %xmm14 + + movlpd 5 * SIZE(B), %xmm0 + movhpd 5 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + movlpd 6 * SIZE(B), %xmm1 + movhpd 6 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm12 + movlpd 7 * SIZE(B), %xmm2 + movhpd 7 * SIZE(B), %xmm2 + mulpd %xmm10, %xmm2 + subpd %xmm2, %xmm14 + + movlpd 10 * SIZE(B), %xmm0 + movhpd 10 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm12 + + movlpd 11 * SIZE(B), %xmm1 + movhpd 11 * SIZE(B), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm14 + + movlpd 15 * SIZE(B), %xmm0 + movhpd 15 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm14 +#endif + +#ifdef RT + movlpd 15 * SIZE(B), %xmm0 + movhpd 15 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm14 + + movlpd 14 * SIZE(B), %xmm1 + movhpd 14 * SIZE(B), %xmm1 + mulpd %xmm14, %xmm1 + subpd %xmm1, %xmm12 + movlpd 13 * SIZE(B), %xmm2 + movhpd 13 * SIZE(B), %xmm2 + mulpd %xmm14, %xmm2 + subpd %xmm2, %xmm10 + movlpd 12 * SIZE(B), %xmm3 + movhpd 12 * SIZE(B), %xmm3 + mulpd %xmm14, %xmm3 + subpd %xmm3, %xmm8 + + movlpd 10 * SIZE(B), %xmm0 + movhpd 10 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm12 + movlpd 9 * SIZE(B), %xmm1 + movhpd 9 * SIZE(B), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm10 + movlpd 8 * SIZE(B), %xmm2 + movhpd 8 * SIZE(B), %xmm2 + mulpd %xmm12, %xmm2 + subpd %xmm2, %xmm8 + + movlpd 5 * SIZE(B), %xmm0 + movhpd 5 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + movlpd 4 * SIZE(B), %xmm1 + movhpd 4 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + + movsd %xmm12, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) + + movsd %xmm14, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + movapd %xmm5, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) + movlpd %xmm5, 8 * SIZE(BO) + movlpd %xmm5, 9 * SIZE(BO) + movhpd %xmm5, 10 * SIZE(BO) + movhpd %xmm5, 11 * SIZE(BO) + movlpd %xmm7, 12 * SIZE(BO) + movlpd %xmm7, 13 * SIZE(BO) + movhpd %xmm7, 14 * SIZE(BO) + movhpd %xmm7, 15 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm10, 2 * SIZE(AO) + movapd %xmm12, 4 * SIZE(AO) + movapd %xmm14, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + + +.L30: + movq M, I + sarq $2, I # i = (m >> 2) + jle .L39 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(BO), %xmm9 + movapd 2 * SIZE(BO), %xmm11 + movapd 4 * SIZE(BO), %xmm13 + movapd 8 * SIZE(BO), %xmm15 + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 2 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movapd 4 * SIZE(AO), %xmm12 + pxor %xmm2, %xmm2 + movapd 6 * SIZE(AO), %xmm14 + pxor %xmm3, %xmm3 + +#ifdef LN + PREFETCHW -4 * SIZE(CO1) + pxor %xmm4, %xmm4 + PREFETCHW -4 * SIZE(CO2) + pxor %xmm5, %xmm5 + PREFETCHW -4 * SIZE(CO1, LDC, 2) + pxor %xmm6, %xmm6 + PREFETCHW -4 * SIZE(CO2, LDC, 2) + pxor %xmm7, %xmm7 +#else + PREFETCHW 4 * SIZE(CO1) + pxor %xmm4, %xmm4 + PREFETCHW 4 * SIZE(CO2) + pxor %xmm5, %xmm5 + PREFETCHW 4 * SIZE(CO1, LDC, 2) + pxor %xmm6, %xmm6 + PREFETCHW 4 * SIZE(CO2, LDC, 2) + pxor %xmm7, %xmm7 +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-8, %rax + salq $4, %rax + je .L15 +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpq $64 * 2, %rax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpq $64 * 4, %rax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpq $64 * 6, %rax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) + + addq $16 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $64 * 8, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 4), BO # * 64 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L19 + ALIGN_4 + +.L16: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 0 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 4 * SIZE(AO), %xmm8 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm5 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + mulpd 6 * SIZE(BO), %xmm10 + addpd %xmm9, %xmm6 + movapd 8 * SIZE(BO), %xmm9 + addpd %xmm10, %xmm7 + movapd 6 * SIZE(AO), %xmm10 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L16 + ALIGN_4 + +.L19: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm10 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm10 + + movapd %xmm4, %xmm12 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm12 + + movapd %xmm6, %xmm14 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm14 + + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm3 + movapd 4 * SIZE(B), %xmm5 + movapd 6 * SIZE(B), %xmm7 + movapd 8 * SIZE(B), %xmm9 + movapd 10 * SIZE(B), %xmm11 + movapd 12 * SIZE(B), %xmm13 + movapd 14 * SIZE(B), %xmm15 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm8, %xmm5 + subpd %xmm10, %xmm7 + subpd %xmm4, %xmm9 + subpd %xmm6, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 + + movapd 8 * SIZE(AO), %xmm12 + movapd 10 * SIZE(AO), %xmm13 + movapd 12 * SIZE(AO), %xmm14 + movapd 14 * SIZE(AO), %xmm15 + + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm1, %xmm10 + subpd %xmm5, %xmm11 + subpd %xmm2, %xmm12 + subpd %xmm6, %xmm13 + subpd %xmm3, %xmm14 + subpd %xmm7, %xmm15 +#endif + +#ifdef LN + movlpd 15 * SIZE(AO), %xmm0 + movhpd 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm15 + + movlpd 14 * SIZE(AO), %xmm2 + movhpd 14 * SIZE(AO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + movlpd 14 * SIZE(AO), %xmm2 + movhpd 14 * SIZE(AO), %xmm2 + mulpd %xmm15, %xmm2 + subpd %xmm2, %xmm11 + + movlpd 13 * SIZE(AO), %xmm4 + movhpd 13 * SIZE(AO), %xmm4 + mulpd %xmm13, %xmm4 + subpd %xmm4, %xmm5 + movlpd 13 * SIZE(AO), %xmm4 + movhpd 13 * SIZE(AO), %xmm4 + mulpd %xmm15, %xmm4 + subpd %xmm4, %xmm7 + + movlpd 12 * SIZE(AO), %xmm6 + movhpd 12 * SIZE(AO), %xmm6 + mulpd %xmm13, %xmm6 + subpd %xmm6, %xmm1 + movlpd 12 * SIZE(AO), %xmm6 + movhpd 12 * SIZE(AO), %xmm6 + mulpd %xmm15, %xmm6 + subpd %xmm6, %xmm3 + + movlpd 10 * SIZE(AO), %xmm0 + movhpd 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + mulpd %xmm0, %xmm11 + + movlpd 9 * SIZE(AO), %xmm2 + movhpd 9 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm5 + movlpd 9 * SIZE(AO), %xmm2 + movhpd 9 * SIZE(AO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm7 + + movlpd 8 * SIZE(AO), %xmm4 + movhpd 8 * SIZE(AO), %xmm4 + mulpd %xmm9, %xmm4 + subpd %xmm4, %xmm1 + movlpd 8 * SIZE(AO), %xmm4 + movhpd 8 * SIZE(AO), %xmm4 + mulpd %xmm11, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 5 * SIZE(AO), %xmm0 + movhpd 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movlpd 4 * SIZE(AO), %xmm2 + movhpd 4 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + movlpd 4 * SIZE(AO), %xmm2 + movhpd 4 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm3 + + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 + + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm3, %xmm2 + subpd %xmm2, %xmm7 + + movlpd 2 * SIZE(AO), %xmm4 + movhpd 2 * SIZE(AO), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm9 + movlpd 2 * SIZE(AO), %xmm4 + movhpd 2 * SIZE(AO), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm11 + + movlpd 3 * SIZE(AO), %xmm6 + movhpd 3 * SIZE(AO), %xmm6 + mulpd %xmm1, %xmm6 + subpd %xmm6, %xmm13 + movlpd 3 * SIZE(AO), %xmm6 + movhpd 3 * SIZE(AO), %xmm6 + mulpd %xmm3, %xmm6 + subpd %xmm6, %xmm15 + + movlpd 5 * SIZE(AO), %xmm0 + movhpd 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movlpd 6 * SIZE(AO), %xmm2 + movhpd 6 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm9 + movlpd 6 * SIZE(AO), %xmm2 + movhpd 6 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm11 + + movlpd 7 * SIZE(AO), %xmm4 + movhpd 7 * SIZE(AO), %xmm4 + mulpd %xmm5, %xmm4 + subpd %xmm4, %xmm13 + movlpd 7 * SIZE(AO), %xmm4 + movhpd 7 * SIZE(AO), %xmm4 + mulpd %xmm7, %xmm4 + subpd %xmm4, %xmm15 + + movlpd 10 * SIZE(AO), %xmm0 + movhpd 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + mulpd %xmm0, %xmm11 + + movlpd 11 * SIZE(AO), %xmm2 + movhpd 11 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + movlpd 11 * SIZE(AO), %xmm2 + movhpd 11 * SIZE(AO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm15 + + movlpd 15 * SIZE(AO), %xmm0 + movhpd 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm15 +#endif + + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm9, %xmm1 + subpd %xmm1, %xmm11 + + movlpd 2 * SIZE(B), %xmm2 + movhpd 2 * SIZE(B), %xmm2 + mulpd %xmm8, %xmm2 + subpd %xmm2, %xmm12 + movlpd 2 * SIZE(B), %xmm2 + movhpd 2 * SIZE(B), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + + movlpd 3 * SIZE(B), %xmm3 + movhpd 3 * SIZE(B), %xmm3 + mulpd %xmm8, %xmm3 + subpd %xmm3, %xmm14 + movlpd 3 * SIZE(B), %xmm3 + movhpd 3 * SIZE(B), %xmm3 + mulpd %xmm9, %xmm3 + subpd %xmm3, %xmm15 + + movlpd 5 * SIZE(B), %xmm0 + movhpd 5 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movlpd 6 * SIZE(B), %xmm1 + movhpd 6 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm12 + movlpd 6 * SIZE(B), %xmm1 + movhpd 6 * SIZE(B), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm13 + + movlpd 7 * SIZE(B), %xmm2 + movhpd 7 * SIZE(B), %xmm2 + mulpd %xmm10, %xmm2 + subpd %xmm2, %xmm14 + movlpd 7 * SIZE(B), %xmm2 + movhpd 7 * SIZE(B), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm15 + + movlpd 10 * SIZE(B), %xmm0 + movhpd 10 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + + movlpd 11 * SIZE(B), %xmm1 + movhpd 11 * SIZE(B), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm14 + movlpd 11 * SIZE(B), %xmm1 + movhpd 11 * SIZE(B), %xmm1 + mulpd %xmm13, %xmm1 + subpd %xmm1, %xmm15 + + movlpd 15 * SIZE(B), %xmm0 + movhpd 15 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 +#endif + +#ifdef RT + movlpd 15 * SIZE(B), %xmm0 + movhpd 15 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 + + movlpd 14 * SIZE(B), %xmm1 + movhpd 14 * SIZE(B), %xmm1 + mulpd %xmm14, %xmm1 + subpd %xmm1, %xmm12 + movlpd 14 * SIZE(B), %xmm1 + movhpd 14 * SIZE(B), %xmm1 + mulpd %xmm15, %xmm1 + subpd %xmm1, %xmm13 + + movlpd 13 * SIZE(B), %xmm2 + movhpd 13 * SIZE(B), %xmm2 + mulpd %xmm14, %xmm2 + subpd %xmm2, %xmm10 + movlpd 13 * SIZE(B), %xmm2 + movhpd 13 * SIZE(B), %xmm2 + mulpd %xmm15, %xmm2 + subpd %xmm2, %xmm11 + + movlpd 12 * SIZE(B), %xmm3 + movhpd 12 * SIZE(B), %xmm3 + mulpd %xmm14, %xmm3 + subpd %xmm3, %xmm8 + movlpd 12 * SIZE(B), %xmm3 + movhpd 12 * SIZE(B), %xmm3 + mulpd %xmm15, %xmm3 + subpd %xmm3, %xmm9 + + movlpd 10 * SIZE(B), %xmm0 + movhpd 10 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + + movlpd 9 * SIZE(B), %xmm1 + movhpd 9 * SIZE(B), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm10 + movlpd 9 * SIZE(B), %xmm1 + movhpd 9 * SIZE(B), %xmm1 + mulpd %xmm13, %xmm1 + subpd %xmm1, %xmm11 + + movlpd 8 * SIZE(B), %xmm2 + movhpd 8 * SIZE(B), %xmm2 + mulpd %xmm12, %xmm2 + subpd %xmm2, %xmm8 + movlpd 8 * SIZE(B), %xmm2 + movhpd 8 * SIZE(B), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + + movlpd 5 * SIZE(B), %xmm0 + movhpd 5 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movlpd 4 * SIZE(B), %xmm1 + movhpd 4 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + movlpd 4 * SIZE(B), %xmm1 + movhpd 4 * SIZE(B), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm9 + + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movsd %xmm13, 3 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + movhpd %xmm9, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) + + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movsd %xmm11, 2 * SIZE(CO1, LDC, 2) + movsd %xmm15, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm11, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) + + movsd %xmm12, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) + movsd %xmm13, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm13, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm14, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) + movsd %xmm15, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + movapd %xmm5, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + movapd %xmm9, 8 * SIZE(B) + movapd %xmm11, 10 * SIZE(B) + movapd %xmm13, 12 * SIZE(B) + movapd %xmm15, 14 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) + movlpd %xmm5, 8 * SIZE(BO) + movlpd %xmm5, 9 * SIZE(BO) + movhpd %xmm5, 10 * SIZE(BO) + movhpd %xmm5, 11 * SIZE(BO) + movlpd %xmm7, 12 * SIZE(BO) + movlpd %xmm7, 13 * SIZE(BO) + movhpd %xmm7, 14 * SIZE(BO) + movhpd %xmm7, 15 * SIZE(BO) + movlpd %xmm9, 16 * SIZE(BO) + movlpd %xmm9, 17 * SIZE(BO) + movhpd %xmm9, 18 * SIZE(BO) + movhpd %xmm9, 19 * SIZE(BO) + movlpd %xmm11, 20 * SIZE(BO) + movlpd %xmm11, 21 * SIZE(BO) + movhpd %xmm11, 22 * SIZE(BO) + movhpd %xmm11, 23 * SIZE(BO) + movlpd %xmm13, 24 * SIZE(BO) + movlpd %xmm13, 25 * SIZE(BO) + movhpd %xmm13, 26 * SIZE(BO) + movhpd %xmm13, 27 * SIZE(BO) + movlpd %xmm15, 28 * SIZE(BO) + movlpd %xmm15, 29 * SIZE(BO) + movhpd %xmm15, 30 * SIZE(BO) + movhpd %xmm15, 31 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) + movapd %xmm12, 8 * SIZE(AO) + movapd %xmm13, 10 * SIZE(AO) + movapd %xmm14, 12 * SIZE(AO) + movapd %xmm15, 14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $3, N + je .L999 + + testq $2, N + je .L80 + ALIGN_4 + +.L41: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L43 + ALIGN_4 + +.L42: + PREFETCH 56 * SIZE(B) + + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + addq $ 8 * SIZE, B + addq $16 * SIZE, BO + + movsd %xmm0, -16 * SIZE(BO) + movsd %xmm0, -15 * SIZE(BO) + movsd %xmm1, -14 * SIZE(BO) + movsd %xmm1, -13 * SIZE(BO) + movsd %xmm2, -12 * SIZE(BO) + movsd %xmm2, -11 * SIZE(BO) + movsd %xmm3, -10 * SIZE(BO) + movsd %xmm3, -9 * SIZE(BO) + movsd %xmm4, -8 * SIZE(BO) + movsd %xmm4, -7 * SIZE(BO) + movsd %xmm5, -6 * SIZE(BO) + movsd %xmm5, -5 * SIZE(BO) + movsd %xmm6, -4 * SIZE(BO) + movsd %xmm6, -3 * SIZE(BO) + movsd %xmm7, -2 * SIZE(BO) + movsd %xmm7, -1 * SIZE(BO) + + decq %rax + jne .L42 + ALIGN_4 + +.L43: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L44: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm0, 1 * SIZE(BO) + movsd %xmm1, 2 * SIZE(BO) + movsd %xmm1, 3 * SIZE(BO) + + addq $2 * SIZE, B + addq $4 * SIZE, BO + decq %rax + jne .L44 + ALIGN_4 + +.L50: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 2), C +#endif + + testq $1, M + je .L60 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movsd 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movsd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movsd 16 * SIZE(BO), %xmm13 + movsd 24 * SIZE(BO), %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulsd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulsd 2 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm1 + movsd 1 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm9 + mulsd 6 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm2 + movsd 32 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm3 + movsd 2 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm11 + mulsd 10 * SIZE(BO), %xmm8 + addsd %xmm11, %xmm0 + movsd 12 * SIZE(BO), %xmm11 + addsd %xmm8, %xmm1 + movsd 3 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm11 + mulsd 14 * SIZE(BO), %xmm8 + addsd %xmm11, %xmm2 + movsd 40 * SIZE(BO), %xmm11 + addsd %xmm8, %xmm3 + movsd 8 * SIZE(AO), %xmm8 + + mulsd %xmm10, %xmm13 + mulsd 18 * SIZE(BO), %xmm10 + addsd %xmm13, %xmm0 + movsd 20 * SIZE(BO), %xmm13 + addsd %xmm10, %xmm1 + movsd 5 * SIZE(AO), %xmm10 + + mulsd %xmm10, %xmm13 + mulsd 22 * SIZE(BO), %xmm10 + addsd %xmm13, %xmm2 + movsd 48 * SIZE(BO), %xmm13 + addsd %xmm10, %xmm3 + movsd 6 * SIZE(AO), %xmm10 + + mulsd %xmm10, %xmm15 + mulsd 26 * SIZE(BO), %xmm10 + addsd %xmm15, %xmm0 + movsd 28 * SIZE(BO), %xmm15 + addsd %xmm10, %xmm1 + movsd 7 * SIZE(AO), %xmm10 + + mulsd %xmm10, %xmm15 + mulsd 30 * SIZE(BO), %xmm10 + addsd %xmm15, %xmm2 + movsd 56 * SIZE(BO), %xmm15 + addsd %xmm10, %xmm3 + movsd 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulsd %xmm8, %xmm9 + mulsd 2 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm0 + addsd %xmm8, %xmm1 + movsd 1 * SIZE(AO), %xmm8 + movsd 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addsd %xmm2, %xmm0 + addsd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm4 + movsd 1 * SIZE(B), %xmm5 +#else + movsd 0 * SIZE(AO), %xmm4 + movsd 1 * SIZE(AO), %xmm5 +#endif + + subsd %xmm0, %xmm4 + subsd %xmm1, %xmm5 + +#ifdef LN + movsd 0 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 +#endif + +#ifdef RN + mulsd 0 * SIZE(B), %xmm4 + movsd 1 * SIZE(B), %xmm1 + mulsd %xmm4, %xmm1 + subsd %xmm1, %xmm5 + + mulsd 3 * SIZE(B), %xmm5 +#endif + +#ifdef RT + mulsd 3 * SIZE(B), %xmm5 + + movlpd 2 * SIZE(B), %xmm1 + mulsd %xmm5, %xmm1 + subsd %xmm1, %xmm4 + + mulsd 0 * SIZE(B), %xmm4 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm4, 0 * SIZE(CO1) + movsd %xmm5, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm4, 0 * SIZE(B) + movsd %xmm5, 1 * SIZE(B) + + movsd %xmm4, 0 * SIZE(BO) + movsd %xmm4, 1 * SIZE(BO) + movsd %xmm5, 2 * SIZE(BO) + movsd %xmm5, 3 * SIZE(BO) +#else + movsd %xmm4, 0 * SIZE(AO) + movsd %xmm5, 1 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + testq $2, M + je .L70 + ALIGN_4 + +.L61: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(BO), %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 4 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 2 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 32 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 4 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm11 + mulpd 10 * SIZE(BO), %xmm8 + addpd %xmm11, %xmm0 + movapd 12 * SIZE(BO), %xmm11 + addpd %xmm8, %xmm1 + movapd 6 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm11 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm11, %xmm2 + movapd 40 * SIZE(BO), %xmm11 + addpd %xmm8, %xmm3 + movapd 16 * SIZE(AO), %xmm8 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm10, %xmm13 + mulpd 18 * SIZE(BO), %xmm10 + addpd %xmm13, %xmm0 + movapd 20 * SIZE(BO), %xmm13 + addpd %xmm10, %xmm1 + movapd 10 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm13 + mulpd 22 * SIZE(BO), %xmm10 + addpd %xmm13, %xmm2 + movapd 48 * SIZE(BO), %xmm13 + addpd %xmm10, %xmm3 + movapd 12 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm15 + mulpd 26 * SIZE(BO), %xmm10 + addpd %xmm15, %xmm0 + movapd 28 * SIZE(BO), %xmm15 + addpd %xmm10, %xmm1 + movapd 14 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm15 + mulpd 30 * SIZE(BO), %xmm10 + addpd %xmm15, %xmm2 + movapd 56 * SIZE(BO), %xmm15 + addpd %xmm10, %xmm3 + movapd 24 * SIZE(AO), %xmm10 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L69 + ALIGN_4 + +.L66: + mulpd %xmm8, %xmm9 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 4 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 2 * SIZE(AO), %xmm8 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L69: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm5 + + subpd %xmm0, %xmm1 + subpd %xmm8, %xmm5 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm10 + + subpd %xmm0, %xmm8 + subpd %xmm1, %xmm10 +#endif + + +#ifdef LN + movlpd 3 * SIZE(AO), %xmm0 + movhpd 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + + movlpd 2 * SIZE(AO), %xmm2 + movhpd 2 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + + movlpd 3 * SIZE(AO), %xmm0 + movhpd 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + + movlpd 3 * SIZE(B), %xmm0 + movhpd 3 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 +#endif + +#ifdef RT + movlpd 3 * SIZE(B), %xmm0 + movhpd 3 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + + movlpd 2 * SIZE(B), %xmm1 + movhpd 2 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm5, 4 * SIZE(BO) + movlpd %xmm5, 5 * SIZE(BO) + movhpd %xmm5, 6 * SIZE(BO) + movhpd %xmm5, 7 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm10, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + movq M, I + sarq $2, I # i = (m >> 2) + jle .L79 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + + movapd 16 * SIZE(AO), %xmm12 + movapd 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(AO), %xmm14 + movapd 24 * SIZE(BO), %xmm15 + +#ifdef LN + PREFETCHW -4 * SIZE(CO1) + PREFETCHW -4 * SIZE(CO2) +#else + PREFETCHW 4 * SIZE(CO1) + PREFETCHW 4 * SIZE(CO2) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L55 + ALIGN_4 + +.L52: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 2 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm4 + movapd 4 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm5 + movapd 4 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 4 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm4 + movapd 32 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm5 + movapd 32 * SIZE(AO), %xmm8 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm10, %xmm11 + mulpd 10 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm0 + movapd 8 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm1 + movapd 10 * SIZE(AO), %xmm10 + mulpd %xmm10, %xmm11 + mulpd 10 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm4 + movapd 12 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm5 + movapd 12 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm11 + mulpd 14 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm0 + movapd 12 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm1 + movapd 14 * SIZE(AO), %xmm10 + mulpd %xmm10, %xmm11 + mulpd 14 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm4 + movapd 40 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm5 + movapd 40 * SIZE(AO), %xmm10 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + mulpd %xmm12, %xmm13 + mulpd 18 * SIZE(BO), %xmm12 + addpd %xmm13, %xmm0 + movapd 16 * SIZE(BO), %xmm13 + addpd %xmm12, %xmm1 + movapd 18 * SIZE(AO), %xmm12 + mulpd %xmm12, %xmm13 + mulpd 18 * SIZE(BO), %xmm12 + addpd %xmm13, %xmm4 + movapd 20 * SIZE(BO), %xmm13 + addpd %xmm12, %xmm5 + movapd 20 * SIZE(AO), %xmm12 + + mulpd %xmm12, %xmm13 + mulpd 22 * SIZE(BO), %xmm12 + addpd %xmm13, %xmm0 + movapd 20 * SIZE(BO), %xmm13 + addpd %xmm12, %xmm1 + movapd 22 * SIZE(AO), %xmm12 + mulpd %xmm12, %xmm13 + mulpd 22 * SIZE(BO), %xmm12 + addpd %xmm13, %xmm4 + movapd 48 * SIZE(BO), %xmm13 + addpd %xmm12, %xmm5 + movapd 48 * SIZE(AO), %xmm12 + + PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) + mulpd %xmm14, %xmm15 + mulpd 26 * SIZE(BO), %xmm14 + addpd %xmm15, %xmm0 + movapd 24 * SIZE(BO), %xmm15 + addpd %xmm14, %xmm1 + movapd 26 * SIZE(AO), %xmm14 + mulpd %xmm14, %xmm15 + mulpd 26 * SIZE(BO), %xmm14 + addpd %xmm15, %xmm4 + movapd 28 * SIZE(BO), %xmm15 + addpd %xmm14, %xmm5 + movapd 28 * SIZE(AO), %xmm14 + + mulpd %xmm14, %xmm15 + mulpd 30 * SIZE(BO), %xmm14 + addpd %xmm15, %xmm0 + movapd 28 * SIZE(BO), %xmm15 + addpd %xmm14, %xmm1 + movapd 30 * SIZE(AO), %xmm14 + mulpd %xmm14, %xmm15 + mulpd 30 * SIZE(BO), %xmm14 + addpd %xmm15, %xmm4 + movapd 56 * SIZE(BO), %xmm15 + addpd %xmm14, %xmm5 + movapd 56 * SIZE(AO), %xmm14 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L59 + ALIGN_4 + +.L56: + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + movapd 2 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + movapd 4 * SIZE(AO), %xmm8 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L56 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm4, %xmm12 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm12 + + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm5 + movapd 4 * SIZE(B), %xmm9 + movapd 6 * SIZE(B), %xmm13 + + subpd %xmm0, %xmm1 + subpd %xmm8, %xmm5 + subpd %xmm4, %xmm9 + subpd %xmm12, %xmm13 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 + + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm1, %xmm10 + subpd %xmm5, %xmm11 +#endif + +#ifdef LN + movlpd 15 * SIZE(AO), %xmm0 + movhpd 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + movlpd 14 * SIZE(AO), %xmm2 + movhpd 14 * SIZE(AO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + movlpd 13 * SIZE(AO), %xmm4 + movhpd 13 * SIZE(AO), %xmm4 + mulpd %xmm13, %xmm4 + subpd %xmm4, %xmm5 + movlpd 12 * SIZE(AO), %xmm6 + movhpd 12 * SIZE(AO), %xmm6 + mulpd %xmm13, %xmm6 + subpd %xmm6, %xmm1 + + movlpd 10 * SIZE(AO), %xmm0 + movhpd 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + movlpd 9 * SIZE(AO), %xmm2 + movhpd 9 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm5 + movlpd 8 * SIZE(AO), %xmm4 + movhpd 8 * SIZE(AO), %xmm4 + mulpd %xmm9, %xmm4 + subpd %xmm4, %xmm1 + + movlpd 5 * SIZE(AO), %xmm0 + movhpd 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + movlpd 4 * SIZE(AO), %xmm2 + movhpd 4 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movlpd 2 * SIZE(AO), %xmm4 + movhpd 2 * SIZE(AO), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm9 + movlpd 3 * SIZE(AO), %xmm6 + movhpd 3 * SIZE(AO), %xmm6 + mulpd %xmm1, %xmm6 + subpd %xmm6, %xmm13 + + + movlpd 5 * SIZE(AO), %xmm0 + movhpd 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + + movlpd 6 * SIZE(AO), %xmm2 + movhpd 6 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm9 + movlpd 7 * SIZE(AO), %xmm4 + movhpd 7 * SIZE(AO), %xmm4 + mulpd %xmm5, %xmm4 + subpd %xmm4, %xmm13 + + movlpd 10 * SIZE(AO), %xmm0 + movhpd 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + movlpd 11 * SIZE(AO), %xmm2 + movhpd 11 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + + movlpd 15 * SIZE(AO), %xmm0 + movhpd 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm9, %xmm1 + subpd %xmm1, %xmm11 + + movlpd 3 * SIZE(B), %xmm0 + movhpd 3 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 +#endif + +#ifdef RT + movlpd 3 * SIZE(B), %xmm0 + movhpd 3 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movlpd 2 * SIZE(B), %xmm1 + movhpd 2 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + movlpd 2 * SIZE(B), %xmm1 + movhpd 2 * SIZE(B), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm9 + + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movsd %xmm13, 3 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + movhpd %xmm9, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + movapd %xmm9, 4 * SIZE(B) + movapd %xmm13, 6 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm5, 4 * SIZE(BO) + movlpd %xmm5, 5 * SIZE(BO) + movhpd %xmm5, 6 * SIZE(BO) + movhpd %xmm5, 7 * SIZE(BO) + movlpd %xmm9, 8 * SIZE(BO) + movlpd %xmm9, 9 * SIZE(BO) + movhpd %xmm9, 10 * SIZE(BO) + movhpd %xmm9, 11 * SIZE(BO) + movlpd %xmm13, 12 * SIZE(BO) + movlpd %xmm13, 13 * SIZE(BO) + movhpd %xmm13, 14 * SIZE(BO) + movhpd %xmm13, 15 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L80: + testq $1, N + je .L999 + ALIGN_4 + +.L81: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + jle .L83 + ALIGN_4 + +.L82: + PREFETCH 56 * SIZE(B) + + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + addq $ 8 * SIZE, B + addq $16 * SIZE, BO + + movsd %xmm0, -16 * SIZE(BO) + movsd %xmm0, -15 * SIZE(BO) + movsd %xmm1, -14 * SIZE(BO) + movsd %xmm1, -13 * SIZE(BO) + movsd %xmm2, -12 * SIZE(BO) + movsd %xmm2, -11 * SIZE(BO) + movsd %xmm3, -10 * SIZE(BO) + movsd %xmm3, -9 * SIZE(BO) + movsd %xmm4, -8 * SIZE(BO) + movsd %xmm4, -7 * SIZE(BO) + movsd %xmm5, -6 * SIZE(BO) + movsd %xmm5, -5 * SIZE(BO) + movsd %xmm6, -4 * SIZE(BO) + movsd %xmm6, -3 * SIZE(BO) + movsd %xmm7, -2 * SIZE(BO) + movsd %xmm7, -1 * SIZE(BO) + + decq %rax + jne .L82 + ALIGN_4 + +.L83: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax + BRANCH + jle .L90 + ALIGN_4 + +.L84: + movsd 0 * SIZE(B), %xmm0 + + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm0, 1 * SIZE(BO) + + addq $1 * SIZE, B + addq $2 * SIZE, BO + decq %rax + jne .L84 + ALIGN_4 + +.L90: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + + testq $1, M + je .L100 + ALIGN_4 + +.L111: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movsd 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movsd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulsd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movsd 1 * SIZE(AO), %xmm8 + addsd %xmm9, %xmm0 + movsd 16 * SIZE(BO), %xmm9 + mulsd 2 * SIZE(BO), %xmm8 + addsd %xmm8, %xmm1 + movsd 2 * SIZE(AO), %xmm8 + mulsd 4 * SIZE(BO), %xmm8 + addsd %xmm8, %xmm2 + movsd 3 * SIZE(AO), %xmm8 + mulsd 6 * SIZE(BO), %xmm8 + addsd %xmm8, %xmm3 + movsd 8 * SIZE(AO), %xmm8 + mulsd %xmm10, %xmm11 + movsd 5 * SIZE(AO), %xmm10 + addsd %xmm11, %xmm0 + movsd 24 * SIZE(BO), %xmm11 + mulsd 10 * SIZE(BO), %xmm10 + addsd %xmm10, %xmm1 + movsd 6 * SIZE(AO), %xmm10 + mulsd 12 * SIZE(BO), %xmm10 + addsd %xmm10, %xmm2 + movsd 7 * SIZE(AO), %xmm10 + mulsd 14 * SIZE(BO), %xmm10 + addsd %xmm10, %xmm3 + movsd 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulsd %xmm8, %xmm9 + movsd 1 * SIZE(AO), %xmm8 + addsd %xmm9, %xmm0 + movsd 2 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: + addsd %xmm2, %xmm0 + addsd %xmm3, %xmm1 + addsd %xmm1, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm2 + subsd %xmm0, %xmm2 +#else + movsd 0 * SIZE(AO), %xmm2 + subsd %xmm0, %xmm2 +#endif + +#ifdef LN + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef RN + movsd 0 * SIZE(B), %xmm0 + mulsd %xmm0, %xmm2 +#endif + +#ifdef RT + movsd 0 * SIZE(B), %xmm0 + mulsd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BO) + movlpd %xmm2, 1 * SIZE(BO) +#else + movsd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $1 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L100: + testq $2, M + je .L110 + ALIGN_4 + +.L101: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L105 + ALIGN_4 + +.L102: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movapd 2 * SIZE(AO), %xmm8 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 16 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 4 * SIZE(AO), %xmm8 + mulpd 4 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm2 + movapd 6 * SIZE(AO), %xmm8 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm3 + movapd 16 * SIZE(AO), %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + mulpd 10 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm0 + movapd 24 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm1 + movapd 12 * SIZE(AO), %xmm10 + mulpd 12 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm2 + movapd 14 * SIZE(AO), %xmm10 + mulpd 14 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm3 + movapd 24 * SIZE(AO), %xmm10 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L109 + ALIGN_4 + +.L106: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(AO), %xmm8 + movapd 2 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L106 + ALIGN_4 + +.L109: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm2 + subpd %xmm0, %xmm2 +#else + movapd 0 * SIZE(AO), %xmm2 + subpd %xmm0, %xmm2 +#endif + +#ifdef LN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 3 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 2 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef LT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(AO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 3 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BO) + movlpd %xmm2, 1 * SIZE(BO) + movhpd %xmm2, 2 * SIZE(BO) + movhpd %xmm2, 3 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + movq M, I + sarq $2, I # i = (m >> 2) + jle .L119 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(AO), %xmm12 + movapd 24 * SIZE(AO), %xmm14 + +#ifdef LN + PREFETCHW -4 * SIZE(CO1) +#else + PREFETCHW 4 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulpd %xmm9, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm8 + mulpd 6 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm2 + movapd 32 * SIZE(AO), %xmm8 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + addpd %xmm9, %xmm3 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 10 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm0 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movapd 6 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 14 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm2 + movapd 40 * SIZE(AO), %xmm10 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm9, %xmm3 + movapd 16 * SIZE(BO), %xmm9 + mulpd %xmm11, %xmm12 + mulpd 18 * SIZE(AO), %xmm11 + addpd %xmm12, %xmm0 + movapd 20 * SIZE(AO), %xmm12 + addpd %xmm11, %xmm1 + movapd 10 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm12 + mulpd 22 * SIZE(AO), %xmm11 + addpd %xmm12, %xmm2 + movapd 48 * SIZE(AO), %xmm12 + PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) + addpd %xmm11, %xmm3 + movapd 12 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm14 + mulpd 26 * SIZE(AO), %xmm11 + addpd %xmm14, %xmm0 + movapd 28 * SIZE(AO), %xmm14 + addpd %xmm11, %xmm1 + movapd 14 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm14 + mulpd 30 * SIZE(AO), %xmm11 + addpd %xmm14, %xmm2 + movapd 56 * SIZE(AO), %xmm14 + addpd %xmm11, %xmm3 + movapd 24 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L99 + ALIGN_4 + +.L96: + mulpd %xmm9, %xmm8 + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 2 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L99: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#else + movapd 0 * SIZE(AO), %xmm2 + movapd 2 * SIZE(AO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#endif + +#ifdef LN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 15 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 14 * SIZE(AO), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + movsd 13 * SIZE(AO), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + movsd 12 * SIZE(AO), %xmm7 + mulsd %xmm1, %xmm7 + subsd %xmm7, %xmm2 + + movsd 10 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 9 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm0 + movsd 8 * SIZE(AO), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 4 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef LT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(AO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + movsd 2 * SIZE(AO), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm2, %xmm7 + subsd %xmm7, %xmm1 + + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm3 + movsd 7 * SIZE(AO), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + + movsd 10 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 11 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + + movsd 15 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm1 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) + movsd %xmm3, 2 * SIZE(CO1) + movhpd %xmm3, 3 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) + movsd %xmm3, 2 * SIZE(CO1) + movhpd %xmm3, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BO) + movlpd %xmm2, 1 * SIZE(BO) + movhpd %xmm2, 2 * SIZE(BO) + movhpd %xmm2, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) + movapd %xmm3, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L91 + ALIGN_4 + +.L119: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq %rbx, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LN_4x4_sse3.S b/kernel/x86_64/trsm_kernel_LN_4x4_sse3.S new file mode 100644 index 0000000..66a5e40 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LN_4x4_sse3.S @@ -0,0 +1,3873 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbx +#define KK %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 272 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCH prefetcht1 +#define PREFETCHSIZE (16 * 12 + 3) +#define PREFETCH_R (4 * 4 + 0) + +#define KERNEL1(address) \ + mulpd %xmm8, %xmm9 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ + addpd %xmm9, %xmm0;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm1;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm2;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm3;\ + movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL2(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm4;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm5;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm6;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm7;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL3(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm0;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm1;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm2;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm3;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL4(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm4;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm5;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm6;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm7;\ + movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL5(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm0;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm1;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm2;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm3;\ + movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL6(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm4;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm5;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm6;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm7;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL7(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm0;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm1;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm2;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm3;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL8(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm4;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm5;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm6;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm7;\ + movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL9(address) \ + mulpd %xmm12, %xmm13;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ + addpd %xmm13, %xmm0;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm1;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm2;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm3;\ + movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL10(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm4;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm5;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm6;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm7;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL11(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm0;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm1;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm2;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm3;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL12(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm4;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm5;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm6;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm7;\ + movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL13(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm0;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm1;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm2;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm3;\ + movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL14(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm4;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm5;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm6;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm7;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL15(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm0;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm1;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm2;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm3;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL16(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm4;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm5;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm6;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm7;\ + movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + movq KK, OFFSET + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 2, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + je .L30 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movddup 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movapd 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movddup 3 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movapd 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movapd 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movddup 8 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movapd 24 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movapd 18 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movddup 5 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movapd 20 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movapd 22 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movddup 6 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movapd 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movapd 26 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movddup 7 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movapd 28 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movapd 30 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movddup 12 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movapd 40 * SIZE(BO), %xmm11 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm2 + movapd 2 * SIZE(BO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#else + movapd 0 * SIZE(AO), %xmm2 + movapd 2 * SIZE(AO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#endif + +#ifdef LN + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 0 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(BO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + movsd 2 * SIZE(BO), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + movsd 3 * SIZE(BO), %xmm7 + mulsd %xmm2, %xmm7 + subsd %xmm7, %xmm1 + + movsd 5 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 6 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm3 + movsd 7 * SIZE(BO), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + + movsd 10 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 11 * SIZE(BO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + + movsd 15 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm1 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef RT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 15 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 14 * SIZE(BO), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + movsd 13 * SIZE(BO), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + movsd 12 * SIZE(BO), %xmm7 + mulsd %xmm1, %xmm7 + subsd %xmm7, %xmm2 + + movsd 10 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 9 * SIZE(BO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm0 + movsd 8 * SIZE(BO), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + + movsd 5 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 4 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 + +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BO) + movapd %xmm3, 2 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) + movapd %xmm3, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $2, M + BRANCH + je .L20 + ALIGN_4 + +.L21: + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 17 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm1 + movddup 18 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm2 + movddup 19 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm3 + movddup 20 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 21 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm1 + movddup 22 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm2 + movddup 23 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm3 + movddup 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 25 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 26 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 27 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 28 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 29 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 30 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 31 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 40 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L29 + ALIGN_4 + +.L26: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L29: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm10 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm10 + + movapd 0 * SIZE(BO), %xmm1 + movapd 2 * SIZE(BO), %xmm3 + movapd 4 * SIZE(BO), %xmm5 + movapd 6 * SIZE(BO), %xmm7 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm8, %xmm5 + subpd %xmm10, %xmm7 +#else + + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm10 + movapd 4 * SIZE(AO), %xmm12 + movapd 6 * SIZE(AO), %xmm14 + + subpd %xmm0, %xmm8 + subpd %xmm1, %xmm10 + subpd %xmm2, %xmm12 + subpd %xmm3, %xmm14 +#endif + +#ifdef LN + movddup 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movddup 2 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + movddup 2 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm3 + + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 + + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm3, %xmm2 + subpd %xmm2, %xmm7 + + movddup 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movddup 2 * SIZE(BO), %xmm2 + mulpd %xmm8, %xmm2 + subpd %xmm2, %xmm12 + movddup 3 * SIZE(BO), %xmm3 + mulpd %xmm8, %xmm3 + subpd %xmm3, %xmm14 + + movddup 5 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + movddup 6 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm12 + movddup 7 * SIZE(BO), %xmm2 + mulpd %xmm10, %xmm2 + subpd %xmm2, %xmm14 + + movddup 10 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm12 + + movddup 11 * SIZE(BO), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm14 + + movddup 15 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm14 +#endif + +#ifdef RT + movddup 15 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm14 + + movddup 14 * SIZE(BO), %xmm1 + mulpd %xmm14, %xmm1 + subpd %xmm1, %xmm12 + movddup 13 * SIZE(BO), %xmm2 + mulpd %xmm14, %xmm2 + subpd %xmm2, %xmm10 + movddup 12 * SIZE(BO), %xmm3 + mulpd %xmm14, %xmm3 + subpd %xmm3, %xmm8 + + movddup 10 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm12 + movddup 9 * SIZE(BO), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm10 + movddup 8 * SIZE(BO), %xmm2 + mulpd %xmm12, %xmm2 + subpd %xmm2, %xmm8 + + movddup 5 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + movddup 4 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + + movsd %xmm12, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) + movsd %xmm14, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(BO) + movapd %xmm3, 2 * SIZE(BO) + movapd %xmm5, 4 * SIZE(BO) + movapd %xmm7, 6 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm10, 2 * SIZE(AO) + movapd %xmm12, 4 * SIZE(AO) + movapd %xmm14, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L20: + movq M, I + sarq $2, I # i = (m >> 2) + jle .L39 + ALIGN_4 + +.L11: + +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + prefetcht0 0 * SIZE(BB) + subq $-8 * SIZE, BB + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(AO), %xmm12 + movddup 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(AO), %xmm14 + movddup 24 * SIZE(BO), %xmm15 + +#ifdef LN + prefetchnta -4 * SIZE(CO1) + pxor %xmm4, %xmm4 + prefetchnta -4 * SIZE(CO2) + pxor %xmm5, %xmm5 + prefetchnta -4 * SIZE(CO1, LDC, 2) + pxor %xmm6, %xmm6 + prefetchnta -4 * SIZE(CO2, LDC, 2) + pxor %xmm7, %xmm7 +#else + prefetchnta 4 * SIZE(CO1) + pxor %xmm4, %xmm4 + prefetchnta 4 * SIZE(CO2) + pxor %xmm5, %xmm5 + prefetchnta 4 * SIZE(CO1, LDC, 2) + pxor %xmm6, %xmm6 + prefetchnta 4 * SIZE(CO2, LDC, 2) + pxor %xmm7, %xmm7 +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + +#if 1 + andq $-8, %rax + salq $4, %rax + je .L15 +.L1X: + KERNEL1 (16 * 0) + KERNEL2 (16 * 0) + KERNEL3 (16 * 0) + KERNEL4 (16 * 0) + KERNEL5 (16 * 0) + KERNEL6 (16 * 0) + KERNEL7 (16 * 0) + KERNEL8 (16 * 0) + KERNEL9 (16 * 0) + KERNEL10(16 * 0) + KERNEL11(16 * 0) + KERNEL12(16 * 0) + KERNEL13(16 * 0) + KERNEL14(16 * 0) + KERNEL15(16 * 0) + KERNEL16(16 * 0) + cmpq $128 * 1, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 1) + KERNEL2 (16 * 1) + KERNEL3 (16 * 1) + KERNEL4 (16 * 1) + KERNEL5 (16 * 1) + KERNEL6 (16 * 1) + KERNEL7 (16 * 1) + KERNEL8 (16 * 1) + KERNEL9 (16 * 1) + KERNEL10(16 * 1) + KERNEL11(16 * 1) + KERNEL12(16 * 1) + KERNEL13(16 * 1) + KERNEL14(16 * 1) + KERNEL15(16 * 1) + KERNEL16(16 * 1) + cmpq $128 * 2, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 2) + KERNEL2 (16 * 2) + KERNEL3 (16 * 2) + KERNEL4 (16 * 2) + KERNEL5 (16 * 2) + KERNEL6 (16 * 2) + KERNEL7 (16 * 2) + KERNEL8 (16 * 2) + KERNEL9 (16 * 2) + KERNEL10(16 * 2) + KERNEL11(16 * 2) + KERNEL12(16 * 2) + KERNEL13(16 * 2) + KERNEL14(16 * 2) + KERNEL15(16 * 2) + KERNEL16(16 * 2) + cmpq $128 * 3, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 3) + KERNEL2 (16 * 3) + KERNEL3 (16 * 3) + KERNEL4 (16 * 3) + KERNEL5 (16 * 3) + KERNEL6 (16 * 3) + KERNEL7 (16 * 3) + KERNEL8 (16 * 3) + KERNEL9 (16 * 3) + KERNEL10(16 * 3) + KERNEL11(16 * 3) + KERNEL12(16 * 3) + KERNEL13(16 * 3) + KERNEL14(16 * 3) + KERNEL15(16 * 3) + KERNEL16(16 * 3) + cmpq $128 * 4, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 4) + KERNEL2 (16 * 4) + KERNEL3 (16 * 4) + KERNEL4 (16 * 4) + KERNEL5 (16 * 4) + KERNEL6 (16 * 4) + KERNEL7 (16 * 4) + KERNEL8 (16 * 4) + KERNEL9 (16 * 4) + KERNEL10(16 * 4) + KERNEL11(16 * 4) + KERNEL12(16 * 4) + KERNEL13(16 * 4) + KERNEL14(16 * 4) + KERNEL15(16 * 4) + KERNEL16(16 * 4) + cmpq $128 * 5, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 5) + KERNEL2 (16 * 5) + KERNEL3 (16 * 5) + KERNEL4 (16 * 5) + KERNEL5 (16 * 5) + KERNEL6 (16 * 5) + KERNEL7 (16 * 5) + KERNEL8 (16 * 5) + KERNEL9 (16 * 5) + KERNEL10(16 * 5) + KERNEL11(16 * 5) + KERNEL12(16 * 5) + KERNEL13(16 * 5) + KERNEL14(16 * 5) + KERNEL15(16 * 5) + KERNEL16(16 * 5) + cmpq $128 * 6, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 6) + KERNEL2 (16 * 6) + KERNEL3 (16 * 6) + KERNEL4 (16 * 6) + KERNEL5 (16 * 6) + KERNEL6 (16 * 6) + KERNEL7 (16 * 6) + KERNEL8 (16 * 6) + KERNEL9 (16 * 6) + KERNEL10(16 * 6) + KERNEL11(16 * 6) + KERNEL12(16 * 6) + KERNEL13(16 * 6) + KERNEL14(16 * 6) + KERNEL15(16 * 6) + KERNEL16(16 * 6) + cmpq $128 * 7, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 7) + KERNEL2 (16 * 7) + KERNEL3 (16 * 7) + KERNEL4 (16 * 7) + KERNEL5 (16 * 7) + KERNEL6 (16 * 7) + KERNEL7 (16 * 7) + KERNEL8 (16 * 7) + KERNEL9 (16 * 7) + KERNEL10(16 * 7) + KERNEL11(16 * 7) + KERNEL12(16 * 7) + KERNEL13(16 * 7) + KERNEL14(16 * 7) + KERNEL15(16 * 7) + KERNEL16(16 * 7) + + addq $32 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $128 * 8, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 2), BO # * 64 + +#else + sarq $3, %rax + je .L15 + ALIGN_4 + +.L12: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm5 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm6 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm7 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm5 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm6 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm7 + + movddup 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + + movddup 8 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm7 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm7 + movddup 40 * SIZE(BO), %xmm11 + + mulpd %xmm12, %xmm13 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm13, %xmm0 + movddup 17 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm1 + movddup 18 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm2 + movddup 19 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 18 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm3 + + movddup 16 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm4 + movddup 17 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm5 + movddup 18 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm6 + movddup 19 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 20 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm7 + + movddup 20 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm0 + movddup 21 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm1 + movddup 22 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm2 + movddup 23 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 22 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm3 + + movddup 20 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm4 + movddup 21 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm5 + movddup 22 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm6 + movddup 23 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 48 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm7 + movddup 48 * SIZE(BO), %xmm13 + + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm0 + movddup 25 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm1 + movddup 26 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm2 + movddup 27 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 26 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm3 + + movddup 24 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm4 + movddup 25 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm5 + movddup 26 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm6 + movddup 27 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 28 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm7 + + movddup 28 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm0 + movddup 29 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm1 + movddup 30 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm2 + movddup 31 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 30 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm3 + + movddup 28 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm4 + movddup 29 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm5 + movddup 30 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm6 + movddup 31 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 56 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm7 + movddup 56 * SIZE(BO), %xmm15 + + addq $32 * SIZE, BO + addq $32 * SIZE, AO + decq %rax + BRANCH + jne .L12 +#endif + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L19 + ALIGN_4 + +.L16: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 2 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 3 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm7 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L16 + ALIGN_4 + +.L19: + +#if defined(LN) || defined(RT) + movq KK, %rax + subq $4, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm10 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm10 + + movapd %xmm4, %xmm12 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm12 + + movapd %xmm6, %xmm14 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm14 + + movapd 0 * SIZE(BO), %xmm1 + movapd 2 * SIZE(BO), %xmm3 + movapd 4 * SIZE(BO), %xmm5 + movapd 6 * SIZE(BO), %xmm7 + movapd 8 * SIZE(BO), %xmm9 + movapd 10 * SIZE(BO), %xmm11 + movapd 12 * SIZE(BO), %xmm13 + movapd 14 * SIZE(BO), %xmm15 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm8, %xmm5 + subpd %xmm10, %xmm7 + subpd %xmm4, %xmm9 + subpd %xmm6, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#else + + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 + + movapd 8 * SIZE(AO), %xmm12 + movapd 10 * SIZE(AO), %xmm13 + movapd 12 * SIZE(AO), %xmm14 + movapd 14 * SIZE(AO), %xmm15 + + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm1, %xmm10 + subpd %xmm5, %xmm11 + subpd %xmm2, %xmm12 + subpd %xmm6, %xmm13 + subpd %xmm3, %xmm14 + subpd %xmm7, %xmm15 +#endif + + +#ifdef LN + movddup 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm15 + + movddup 14 * SIZE(AO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + movddup 14 * SIZE(AO), %xmm2 + mulpd %xmm15, %xmm2 + subpd %xmm2, %xmm11 + + movddup 13 * SIZE(AO), %xmm4 + mulpd %xmm13, %xmm4 + subpd %xmm4, %xmm5 + movddup 13 * SIZE(AO), %xmm4 + mulpd %xmm15, %xmm4 + subpd %xmm4, %xmm7 + + movddup 12 * SIZE(AO), %xmm6 + mulpd %xmm13, %xmm6 + subpd %xmm6, %xmm1 + movddup 12 * SIZE(AO), %xmm6 + mulpd %xmm15, %xmm6 + subpd %xmm6, %xmm3 + + movddup 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + mulpd %xmm0, %xmm11 + + movddup 9 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm5 + movddup 9 * SIZE(AO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm7 + + movddup 8 * SIZE(AO), %xmm4 + mulpd %xmm9, %xmm4 + subpd %xmm4, %xmm1 + movddup 8 * SIZE(AO), %xmm4 + mulpd %xmm11, %xmm4 + subpd %xmm4, %xmm3 + + movddup 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movddup 4 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + movddup 4 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm3 + + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 + + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm3, %xmm2 + subpd %xmm2, %xmm7 + + movddup 2 * SIZE(AO), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm9 + movddup 2 * SIZE(AO), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm11 + + movddup 3 * SIZE(AO), %xmm6 + mulpd %xmm1, %xmm6 + subpd %xmm6, %xmm13 + movddup 3 * SIZE(AO), %xmm6 + mulpd %xmm3, %xmm6 + subpd %xmm6, %xmm15 + + movddup 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movddup 6 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm9 + movddup 6 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm11 + + movddup 7 * SIZE(AO), %xmm4 + mulpd %xmm5, %xmm4 + subpd %xmm4, %xmm13 + movddup 7 * SIZE(AO), %xmm4 + mulpd %xmm7, %xmm4 + subpd %xmm4, %xmm15 + + movddup 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + mulpd %xmm0, %xmm11 + + movddup 11 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + movddup 11 * SIZE(AO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm15 + + movddup 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm15 +#endif + + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm9, %xmm1 + subpd %xmm1, %xmm11 + + movddup 2 * SIZE(BO), %xmm2 + mulpd %xmm8, %xmm2 + subpd %xmm2, %xmm12 + movddup 2 * SIZE(BO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + + movddup 3 * SIZE(BO), %xmm3 + mulpd %xmm8, %xmm3 + subpd %xmm3, %xmm14 + movddup 3 * SIZE(BO), %xmm3 + mulpd %xmm9, %xmm3 + subpd %xmm3, %xmm15 + + movddup 5 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movddup 6 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm12 + movddup 6 * SIZE(BO), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm13 + + movddup 7 * SIZE(BO), %xmm2 + mulpd %xmm10, %xmm2 + subpd %xmm2, %xmm14 + movddup 7 * SIZE(BO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm15 + + movddup 10 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + + movddup 11 * SIZE(BO), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm14 + movddup 11 * SIZE(BO), %xmm1 + mulpd %xmm13, %xmm1 + subpd %xmm1, %xmm15 + + movddup 15 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 +#endif + +#ifdef RT + movddup 15 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 + + movddup 14 * SIZE(BO), %xmm1 + mulpd %xmm14, %xmm1 + subpd %xmm1, %xmm12 + movddup 14 * SIZE(BO), %xmm1 + mulpd %xmm15, %xmm1 + subpd %xmm1, %xmm13 + + movddup 13 * SIZE(BO), %xmm2 + mulpd %xmm14, %xmm2 + subpd %xmm2, %xmm10 + movddup 13 * SIZE(BO), %xmm2 + mulpd %xmm15, %xmm2 + subpd %xmm2, %xmm11 + + movddup 12 * SIZE(BO), %xmm3 + mulpd %xmm14, %xmm3 + subpd %xmm3, %xmm8 + movddup 12 * SIZE(BO), %xmm3 + mulpd %xmm15, %xmm3 + subpd %xmm3, %xmm9 + + movddup 10 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + + movddup 9 * SIZE(BO), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm10 + movddup 9 * SIZE(BO), %xmm1 + mulpd %xmm13, %xmm1 + subpd %xmm1, %xmm11 + + movddup 8 * SIZE(BO), %xmm2 + mulpd %xmm12, %xmm2 + subpd %xmm2, %xmm8 + movddup 8 * SIZE(BO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + + movddup 5 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movddup 4 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + movddup 4 * SIZE(BO), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm9 + + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movsd %xmm13, 3 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + movhpd %xmm9, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) + + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movsd %xmm11, 2 * SIZE(CO1, LDC, 2) + movsd %xmm15, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm11, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) + + movsd %xmm12, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) + movsd %xmm13, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm13, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm14, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) + movsd %xmm15, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(BO) + movapd %xmm3, 2 * SIZE(BO) + movapd %xmm5, 4 * SIZE(BO) + movapd %xmm7, 6 * SIZE(BO) + movapd %xmm9, 8 * SIZE(BO) + movapd %xmm11, 10 * SIZE(BO) + movapd %xmm13, 12 * SIZE(BO) + movapd %xmm15, 14 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) + movapd %xmm12, 8 * SIZE(AO) + movapd %xmm13, 10 * SIZE(AO) + movapd %xmm14, 12 * SIZE(AO) + movapd %xmm15, 14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq J # j -- + jg .L10 + ALIGN_4 + +.L40: + testq $2, N + je .L80 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + + testq $1, M + je .L70 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movddup 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + movapd 16 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movddup 2 * SIZE(AO), %xmm8 + mulpd 4 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm2 + movddup 3 * SIZE(AO), %xmm8 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm3 + movddup 8 * SIZE(AO), %xmm8 + mulpd %xmm10, %xmm11 + movddup 5 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm0 + mulpd 10 * SIZE(BO), %xmm10 + movapd 24 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm1 + movddup 6 * SIZE(AO), %xmm10 + mulpd 12 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm2 + movddup 7 * SIZE(AO), %xmm10 + mulpd 14 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm3 + movddup 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm2 + subpd %xmm0, %xmm2 +#else + movapd 0 * SIZE(AO), %xmm2 + subpd %xmm0, %xmm2 +#endif + +#ifdef LN + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef RN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 0 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(BO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 3 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef RT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 3 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + testq $2, M + je .L60 + ALIGN_4 + +.L61: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L69 + ALIGN_4 + +.L66: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L69: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd 0 * SIZE(BO), %xmm1 + movapd 2 * SIZE(BO), %xmm5 + + subpd %xmm0, %xmm1 + subpd %xmm8, %xmm5 +#else + + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm10 + + subpd %xmm0, %xmm8 + subpd %xmm1, %xmm10 +#endif + +#ifdef LN + movddup 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + movddup 2 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + + movddup 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + + movddup 3 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 +#endif + +#ifdef RT + movddup 3 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + + movddup 2 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(BO) + movapd %xmm5, 2 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm10, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + movq M, I + sarq $2, I # i = (m >> 2) + jle .L79 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + prefetcht0 0 * SIZE(BB) + subq $-4 * SIZE, BB + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + +#ifdef LN + prefetchnta -4 * SIZE(CO1) + prefetchnta -4 * SIZE(CO2) +#else + prefetchnta 4 * SIZE(CO1) + prefetchnta 4 * SIZE(CO2) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L55 + ALIGN_4 + +.L52: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm5 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm5 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm5 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm5 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 18 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 8 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 20 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 22 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 24 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 26 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 28 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 30 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 24 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L59 + ALIGN_4 + +.L56: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L56 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm4, %xmm12 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm12 + + movapd 0 * SIZE(BO), %xmm1 + movapd 2 * SIZE(BO), %xmm5 + movapd 4 * SIZE(BO), %xmm9 + movapd 6 * SIZE(BO), %xmm13 + + subpd %xmm0, %xmm1 + subpd %xmm8, %xmm5 + subpd %xmm4, %xmm9 + subpd %xmm12, %xmm13 +#else + + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 + + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm1, %xmm10 + subpd %xmm5, %xmm11 +#endif + + +#ifdef LN + movddup 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + + movddup 14 * SIZE(AO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + movddup 13 * SIZE(AO), %xmm4 + mulpd %xmm13, %xmm4 + subpd %xmm4, %xmm5 + movddup 12 * SIZE(AO), %xmm6 + mulpd %xmm13, %xmm6 + subpd %xmm6, %xmm1 + + movddup 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + movddup 9 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm5 + movddup 8 * SIZE(AO), %xmm4 + mulpd %xmm9, %xmm4 + subpd %xmm4, %xmm1 + + movddup 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + movddup 4 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 +#endif + + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movddup 2 * SIZE(AO), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm9 + movddup 3 * SIZE(AO), %xmm6 + mulpd %xmm1, %xmm6 + subpd %xmm6, %xmm13 + + movddup 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + + movddup 6 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm9 + movddup 7 * SIZE(AO), %xmm4 + mulpd %xmm5, %xmm4 + subpd %xmm4, %xmm13 + + movddup 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + + movddup 11 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + + movddup 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm9, %xmm1 + subpd %xmm1, %xmm11 + + movddup 3 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 +#endif + +#ifdef RT + movddup 3 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movddup 2 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + movddup 2 * SIZE(BO), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm9 + + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movsd %xmm13, 3 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + movhpd %xmm9, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(BO) + movapd %xmm5, 2 * SIZE(BO) + movapd %xmm9, 4 * SIZE(BO) + movapd %xmm13, 6 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L80: + testq $1, N + je .L999 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + je .L110 + ALIGN_4 + +.L111: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movsd 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movsd 4 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulpd %xmm9, %xmm8 + movapd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + mulpd 2 * SIZE(BO), %xmm9 + movapd 8 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm1 + movapd 8 * SIZE(AO), %xmm9 + mulpd %xmm11, %xmm10 + movapd 6 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm0 + mulpd 6 * SIZE(BO), %xmm11 + movapd 12 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm1 + movapd 12 * SIZE(AO), %xmm11 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulsd 0 * SIZE(BO), %xmm9 + addsd %xmm9, %xmm0 + movsd 1 * SIZE(AO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: + addpd %xmm1, %xmm0 + haddpd %xmm0, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm2 + subsd %xmm0, %xmm2 +#else + movsd 0 * SIZE(AO), %xmm2 + subsd %xmm0, %xmm2 +#endif + +#ifdef LN + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef RN + movsd 0 * SIZE(BO), %xmm0 + mulsd %xmm0, %xmm2 +#endif + +#ifdef RT + movsd 0 * SIZE(BO), %xmm0 + mulsd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(BO) +#else + movsd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + testq $2, M + je .L100 + ALIGN_4 + +.L101: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 4 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L105 + ALIGN_4 + +.L102: + mulpd %xmm9, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movddup 1 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm0 + mulpd 2 * SIZE(AO), %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd 4 * SIZE(AO), %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd 6 * SIZE(AO), %xmm9 + addpd %xmm9, %xmm3 + movddup 8 * SIZE(BO), %xmm9 + mulpd %xmm11, %xmm10 + movddup 5 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm0 + mulpd 10 * SIZE(AO), %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 6 * SIZE(BO), %xmm11 + mulpd 12 * SIZE(AO), %xmm11 + addpd %xmm11, %xmm2 + movddup 7 * SIZE(BO), %xmm11 + mulpd 14 * SIZE(AO), %xmm11 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $ 8 * SIZE, BO + decq %rax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L109 + ALIGN_4 + +.L106: + mulpd %xmm9, %xmm8 + movddup 1 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm0 + movapd 2 * SIZE(AO), %xmm8 + + addq $2 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L106 + ALIGN_4 + +.L109: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm2 + subpd %xmm0, %xmm2 +#else + movapd 0 * SIZE(AO), %xmm2 + subpd %xmm0, %xmm2 +#endif + +#ifdef LN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 3 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 2 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef LT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(AO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 3 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef RT + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L100: + movq M, I + sarq $2, I # i = (m >> 2) + jle .L119 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 4 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifdef LN + prefetchnta -4 * SIZE(CO1) +#else + prefetchnta 4 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulpd %xmm9, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm8 + mulpd 6 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm2 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 10 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm0 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 14 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm2 + movapd 24 * SIZE(AO), %xmm10 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm9, %xmm3 + movddup 8 * SIZE(BO), %xmm9 + mulpd %xmm11, %xmm8 + mulpd 18 * SIZE(AO), %xmm11 + addpd %xmm8, %xmm0 + movapd 20 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 5 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm8 + mulpd 22 * SIZE(AO), %xmm11 + addpd %xmm8, %xmm2 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 6 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm10 + mulpd 26 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm0 + movapd 28 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 7 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm10 + mulpd 30 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm2 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L99 + ALIGN_4 + +.L96: + mulpd %xmm9, %xmm8 + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 1 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L99: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm2 + movapd 2 * SIZE(BO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#else + movapd 0 * SIZE(AO), %xmm2 + movapd 2 * SIZE(AO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#endif + +#ifdef LN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 15 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 14 * SIZE(AO), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + movsd 13 * SIZE(AO), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + movsd 12 * SIZE(AO), %xmm7 + mulsd %xmm1, %xmm7 + subsd %xmm7, %xmm2 + + movsd 10 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 9 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm0 + movsd 8 * SIZE(AO), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 4 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef LT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(AO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + movsd 2 * SIZE(AO), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm2, %xmm7 + subsd %xmm7, %xmm1 + + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm3 + movsd 7 * SIZE(AO), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + + movsd 10 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 11 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + + movsd 15 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm1 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RT + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) + movsd %xmm3, 2 * SIZE(CO1) + movhpd %xmm3, 3 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) + movsd %xmm3, 2 * SIZE(CO1) + movhpd %xmm3, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BO) + movapd %xmm3, 2 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) + movapd %xmm3, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L91 + ALIGN_4 + +.L119: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_2 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LN_4x8_nehalem.S b/kernel/x86_64/trsm_kernel_LN_4x8_nehalem.S new file mode 100644 index 0000000..28c2ca0 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LN_4x8_nehalem.S @@ -0,0 +1,4847 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCHSIZE (16 * 1 + 4) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + leaq (, LDC, SIZE), LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $3, J + NOBRANCH + jle .L40 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 8), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 4), CO2 +#ifndef RT + leaq (C, LDC, 8), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 3, %rax + leaq (B, %rax), BB + + testq $1, M + BRANCH + jle .L20 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + xorps %xmm8, %xmm8 + xorps %xmm12, %xmm12 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -28 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -20 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -16 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -12 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -4 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + subq $-32 * SIZE, BO + subq $ -4 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -28 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 8), BO +#endif + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm4 + + subps %xmm8, %xmm0 + subps %xmm12, %xmm4 +#else + movsd -32 * SIZE(AO), %xmm0 + movhps -30 * SIZE(AO), %xmm0 + movsd -28 * SIZE(AO), %xmm4 + movhps -26 * SIZE(AO), %xmm4 + + subps %xmm8, %xmm0 + subps %xmm12, %xmm4 + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 + + pshufd $0xff, %xmm4, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + pshufd $0x55, %xmm4, %xmm5 + pshufd $0x00, %xmm4, %xmm4 +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm7 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm7 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm3 + + movaps -12 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm7 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 + + movaps -4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm7 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm7 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm7 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm7 + + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm7 +#endif + +#ifdef RT + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm7 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm4 + + movaps 24 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm0 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm4 + + movaps 16 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm0 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm4 + + movaps 8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm0 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm4 + + movaps 0 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm0 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm0 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm4, -28 * SIZE(BO) + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 + + pshufd $0xff, %xmm4, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + pshufd $0x55, %xmm4, %xmm5 + pshufd $0x00, %xmm4, %xmm4 +#else + unpcklps %xmm1, %xmm0 + unpcklps %xmm3, %xmm2 + unpcklps %xmm5, %xmm4 + unpcklps %xmm7, %xmm6 + + movlps %xmm0, -32 * SIZE(AO) + movlps %xmm2, -30 * SIZE(AO) + movlps %xmm4, -28 * SIZE(AO) + movlps %xmm6, -26 * SIZE(AO) +#endif + + movss %xmm0, (CO1) + movss %xmm1, (CO1, LDC, 1) + movss %xmm2, (CO1, LDC, 2) + movss %xmm3, (CO1, %rax, 1) + + movss %xmm4, (CO2) + movss %xmm5, (CO2, LDC, 1) + movss %xmm6, (CO2, LDC, 2) + movss %xmm7, (CO2, %rax, 1) + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L20: + testq $2, M + BRANCH + jle .L30 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(BO), %xmm5 + + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -8 * SIZE(BO), %xmm5 + + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps 0 * SIZE(BO), %xmm5 + + movddup -24 * SIZE(AO), %xmm0 + + subq $-32 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 8), BO +#endif + + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + addps %xmm3, %xmm10 + addps %xmm4, %xmm11 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + shufps $0x88, %xmm9, %xmm8 + shufps $0xdd, %xmm9, %xmm4 + + movaps %xmm10, %xmm5 + shufps $0x88, %xmm11, %xmm10 + shufps $0xdd, %xmm11, %xmm5 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm2 + movaps -24 * SIZE(BO), %xmm1 + movaps -20 * SIZE(BO), %xmm3 + + subps %xmm8, %xmm0 + subps %xmm4, %xmm1 + subps %xmm10, %xmm2 + subps %xmm5, %xmm3 +#else + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm2 + movaps -24 * SIZE(AO), %xmm4 + movaps -20 * SIZE(AO), %xmm6 + + subps %xmm8, %xmm0 + subps %xmm9, %xmm2 + subps %xmm10, %xmm4 + subps %xmm11, %xmm6 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 + movhlps %xmm4, %xmm5 + movhlps %xmm6, %xmm7 +#endif + +#ifdef LN + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm3 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm2 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm2 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm7 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm7 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -12 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm7 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + + movaps -4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm7 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm7 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm7 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm7 + + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm7 +#endif + +#ifdef RT + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm7 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm4 + + movaps 24 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm0 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm4 + + movaps 16 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm0 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm4 + + movaps 8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm0 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm4 + + movaps 0 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm0 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm2, -28 * SIZE(BO) + movaps %xmm1, -24 * SIZE(BO) + movaps %xmm3, -20 * SIZE(BO) + + movaps %xmm0, %xmm4 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm4 + + movaps %xmm2, %xmm5 + unpcklps %xmm3, %xmm2 + unpckhps %xmm3, %xmm5 + + movsd %xmm0, (CO1) + movhps %xmm0, (CO1, LDC, 1) + movsd %xmm4, (CO1, LDC, 2) + movhps %xmm4, (CO1, %rax, 1) + + movsd %xmm2, (CO2) + movhps %xmm2, (CO2, LDC, 1) + movsd %xmm5, (CO2, LDC, 2) + movhps %xmm5, (CO2, %rax, 1) +#else + movlhps %xmm1, %xmm0 + movlhps %xmm3, %xmm2 + movlhps %xmm5, %xmm4 + movlhps %xmm7, %xmm6 + + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm2, -28 * SIZE(AO) + movaps %xmm4, -24 * SIZE(AO) + movaps %xmm6, -20 * SIZE(AO) + + movsd %xmm0, (CO1) + movsd %xmm1, (CO1, LDC, 1) + movsd %xmm2, (CO1, LDC, 2) + movsd %xmm3, (CO1, %rax, 1) + + movsd %xmm4, (CO2) + movsd %xmm5, (CO2, LDC, 1) + movsd %xmm6, (CO2, LDC, 2) + movsd %xmm7, (CO2, %rax, 1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + movq M, I + sarq $2, I + NOBRANCH + jle .L39 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + prefetchnta -32 * SIZE(BB) + subq $-16 * SIZE, BB + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + leaq (LDC, LDC, 2), %rax + + xorps %xmm8, %xmm8 + prefetcht2 -4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 -4 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht2 -4 * SIZE(CO1, LDC, 2) + xorps %xmm11, %xmm11 + prefetcht2 -4 * SIZE(CO1, %rax, 1) + + xorps %xmm12, %xmm12 + prefetcht2 -4 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht2 -4 * SIZE(CO2, LDC, 1) + xorps %xmm14, %xmm14 + prefetcht2 -4 * SIZE(CO2, LDC, 2) + xorps %xmm15, %xmm15 + prefetcht2 -4 * SIZE(CO2, %rax, 1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm12 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + subq $-32 * SIZE, BO + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addps %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 8), BO +#endif + + addps %xmm1, %xmm12 + addps %xmm2, %xmm13 + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + shufps $0x88, %xmm9, %xmm8 + movaps %xmm10, %xmm5 + shufps $0x88, %xmm11, %xmm10 + shufps $0xdd, %xmm11, %xmm4 + shufps $0xdd, %xmm9, %xmm5 + + movaps %xmm8, %xmm6 + shufps $0x88, %xmm10, %xmm8 + shufps $0xdd, %xmm6, %xmm10 + + movaps %xmm4, %xmm9 + movaps %xmm5, %xmm11 + shufps $0x22, %xmm5, %xmm9 + shufps $0x77, %xmm4, %xmm11 + + movaps %xmm12, %xmm4 + shufps $0x88, %xmm13, %xmm12 + movaps %xmm14, %xmm5 + shufps $0x88, %xmm15, %xmm14 + shufps $0xdd, %xmm15, %xmm4 + shufps $0xdd, %xmm13, %xmm5 + + movaps %xmm12, %xmm6 + shufps $0x88, %xmm14, %xmm12 + shufps $0xdd, %xmm6, %xmm14 + + movaps %xmm4, %xmm13 + movaps %xmm5, %xmm15 + shufps $0x22, %xmm5, %xmm13 + shufps $0x77, %xmm4, %xmm15 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm4 + movaps -24 * SIZE(BO), %xmm1 + movaps -20 * SIZE(BO), %xmm5 + movaps -16 * SIZE(BO), %xmm2 + movaps -12 * SIZE(BO), %xmm6 + movaps -8 * SIZE(BO), %xmm3 + movaps -4 * SIZE(BO), %xmm7 + +#else + movaps %xmm9, %xmm4 + shufps $0xd8, %xmm8, %xmm9 + shufps $0xd8, %xmm11, %xmm8 + shufps $0xd8, %xmm10, %xmm11 + shufps $0xd8, %xmm4, %xmm10 + + movaps %xmm8, %xmm4 + shufps $0xd8, %xmm10, %xmm8 + shufps $0xd8, %xmm4, %xmm10 + movaps %xmm9, %xmm5 + shufps $0xd8, %xmm11, %xmm9 + shufps $0xd8, %xmm5, %xmm11 + + movaps %xmm13, %xmm4 + shufps $0xd8, %xmm12, %xmm13 + shufps $0xd8, %xmm15, %xmm12 + shufps $0xd8, %xmm14, %xmm15 + shufps $0xd8, %xmm4, %xmm14 + + movaps %xmm12, %xmm4 + shufps $0xd8, %xmm14, %xmm12 + shufps $0xd8, %xmm4, %xmm14 + movaps %xmm13, %xmm5 + shufps $0xd8, %xmm15, %xmm13 + shufps $0xd8, %xmm5, %xmm15 + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + movaps -24 * SIZE(AO), %xmm2 + movaps -20 * SIZE(AO), %xmm3 + movaps -16 * SIZE(AO), %xmm4 + movaps -12 * SIZE(AO), %xmm5 + movaps -8 * SIZE(AO), %xmm6 + movaps -4 * SIZE(AO), %xmm7 +#endif + + subps %xmm8, %xmm0 + subps %xmm9, %xmm1 + subps %xmm10, %xmm2 + subps %xmm11, %xmm3 + subps %xmm12, %xmm4 + subps %xmm13, %xmm5 + subps %xmm14, %xmm6 + subps %xmm15, %xmm7 + +#ifdef LN + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + mulps %xmm15, %xmm7 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm4 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + mulps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm4 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm4 + + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm7 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm7 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + mulps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm7 + + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + mulps %xmm15, %xmm7 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm7 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm7 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -12 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm7 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + + movaps -4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm7 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm7 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm7 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm7 + + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm7 +#endif + +#ifdef RT + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm7 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm4 + + movaps 24 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm0 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm4 + + movaps 16 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm0 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm4 + + movaps 8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm0 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm4 + + movaps 0 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm0 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm4, -28 * SIZE(BO) + movaps %xmm1, -24 * SIZE(BO) + movaps %xmm5, -20 * SIZE(BO) + movaps %xmm2, -16 * SIZE(BO) + movaps %xmm6, -12 * SIZE(BO) + movaps %xmm3, -8 * SIZE(BO) + movaps %xmm7, -4 * SIZE(BO) + + movaps %xmm0, %xmm8 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm8, %xmm1 + + movaps %xmm2, %xmm9 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm9, %xmm3 + + movaps %xmm0, %xmm8 + shufps $0x88, %xmm2, %xmm0 + movaps %xmm1, %xmm9 + shufps $0x22, %xmm3, %xmm1 + shufps $0xdd, %xmm2, %xmm8 + movaps %xmm8, %xmm2 + shufps $0x77, %xmm3, %xmm9 + movaps %xmm9, %xmm3 + + movaps %xmm4, %xmm8 + shufps $0x88, %xmm5, %xmm4 + shufps $0xdd, %xmm8, %xmm5 + + movaps %xmm6, %xmm9 + shufps $0x88, %xmm7, %xmm6 + shufps $0xdd, %xmm9, %xmm7 + + movaps %xmm4, %xmm8 + shufps $0x88, %xmm6, %xmm4 + movaps %xmm5, %xmm9 + shufps $0x22, %xmm7, %xmm5 + shufps $0xdd, %xmm6, %xmm8 + movaps %xmm8, %xmm6 + shufps $0x77, %xmm7, %xmm9 + movaps %xmm9, %xmm7 + +#else + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm1, -28 * SIZE(AO) + movaps %xmm2, -24 * SIZE(AO) + movaps %xmm3, -20 * SIZE(AO) + movaps %xmm4, -16 * SIZE(AO) + movaps %xmm5, -12 * SIZE(AO) + movaps %xmm6, -8 * SIZE(AO) + movaps %xmm7, -4 * SIZE(AO) +#endif + + leaq (LDC, LDC, 2), %rax + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 2 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %rax, 1) + movhps %xmm3, 2 * SIZE(CO1, %rax, 1) + + movsd %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movsd %xmm5, 0 * SIZE(CO2, LDC, 1) + movhps %xmm5, 2 * SIZE(CO2, LDC, 1) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhps %xmm6, 2 * SIZE(CO2, LDC, 2) + movsd %xmm7, 0 * SIZE(CO2, %rax, 1) + movhps %xmm7, 2 * SIZE(CO2, %rax, 1) + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L11 + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 8), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $8, KK +#endif + +#ifdef RT + subq $8, KK +#endif + + subq $1, J + BRANCH + jg .L10 + ALIGN_4 + +.L40: + testq $4, N + jle .L70 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + BRANCH + jle .L50 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movaps -20 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + subq $-16 * SIZE, BO + subq $ -4 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L62 + addps %xmm9, %xmm8 + ALIGN_3 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_3 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + addps %xmm2, %xmm8 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm0 + + subps %xmm8, %xmm0 +#else + movsd -32 * SIZE(AO), %xmm0 + movhps -30 * SIZE(AO), %xmm0 + + subps %xmm8, %xmm0 + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm3 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm0 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 +#else + unpcklps %xmm1, %xmm0 + unpcklps %xmm3, %xmm2 + + movlps %xmm0, -32 * SIZE(AO) + movlps %xmm2, -30 * SIZE(AO) +#endif + + movss %xmm0, (CO1) + movss %xmm1, (CO1, LDC, 1) + movss %xmm2, (CO2) + movss %xmm3, (CO2, LDC, 1) + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L50: + testq $2, M + BRANCH + jle .L60 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -16 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -24 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L52 + ALIGN_3 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_3 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + shufps $0x88, %xmm9, %xmm8 + shufps $0xdd, %xmm9, %xmm4 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm1 + + subps %xmm8, %xmm0 + subps %xmm4, %xmm1 +#else + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm2 + + subps %xmm8, %xmm0 + subps %xmm9, %xmm2 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 +#endif + +#ifdef LN + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm1, -28 * SIZE(BO) + + movaps %xmm0, %xmm4 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm4 + + movsd %xmm0, (CO1) + movhps %xmm0, (CO1, LDC, 1) + movsd %xmm4, (CO2) + movhps %xmm4, (CO2, LDC, 1) +#else + movlhps %xmm1, %xmm0 + movlhps %xmm3, %xmm2 + + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm2, -28 * SIZE(AO) + + movsd %xmm0, (CO1) + movsd %xmm1, (CO1, LDC, 1) + movsd %xmm2, (CO2) + movsd %xmm3, (CO2, LDC, 1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + movq M, I + sarq $2, I + NOBRANCH + jle .L69 + ALIGN_4 + +.L41: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 -4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 -4 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht2 -4 * SIZE(CO2) + xorps %xmm11, %xmm11 + prefetcht2 -4 * SIZE(CO2, LDC, 1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + addps %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_3 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + addps %xmm3, %xmm10 + addps %xmm4, %xmm11 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + shufps $0x88, %xmm9, %xmm8 + movaps %xmm10, %xmm5 + shufps $0x88, %xmm11, %xmm10 + shufps $0xdd, %xmm11, %xmm4 + shufps $0xdd, %xmm9, %xmm5 + + movaps %xmm8, %xmm6 + shufps $0x88, %xmm10, %xmm8 + shufps $0xdd, %xmm6, %xmm10 + + movaps %xmm4, %xmm9 + movaps %xmm5, %xmm11 + shufps $0x22, %xmm5, %xmm9 + shufps $0x77, %xmm4, %xmm11 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm1 + movaps -24 * SIZE(BO), %xmm2 + movaps -20 * SIZE(BO), %xmm3 +#else + movaps %xmm9, %xmm4 + shufps $0xd8, %xmm8, %xmm9 + shufps $0xd8, %xmm11, %xmm8 + shufps $0xd8, %xmm10, %xmm11 + shufps $0xd8, %xmm4, %xmm10 + + movaps %xmm8, %xmm4 + shufps $0xd8, %xmm10, %xmm8 + shufps $0xd8, %xmm4, %xmm10 + movaps %xmm9, %xmm5 + shufps $0xd8, %xmm11, %xmm9 + shufps $0xd8, %xmm5, %xmm11 + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + movaps -24 * SIZE(AO), %xmm2 + movaps -20 * SIZE(AO), %xmm3 +#endif + + subps %xmm8, %xmm0 + subps %xmm9, %xmm1 + subps %xmm10, %xmm2 + subps %xmm11, %xmm3 + +#ifdef LN + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm1, -28 * SIZE(BO) + movaps %xmm2, -24 * SIZE(BO) + movaps %xmm3, -20 * SIZE(BO) + + movaps %xmm0, %xmm8 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm8, %xmm1 + + movaps %xmm2, %xmm9 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm9, %xmm3 + + movaps %xmm0, %xmm8 + shufps $0x88, %xmm2, %xmm0 + movaps %xmm1, %xmm9 + shufps $0x22, %xmm3, %xmm1 + shufps $0xdd, %xmm2, %xmm8 + movaps %xmm8, %xmm2 + shufps $0x77, %xmm3, %xmm9 + movaps %xmm9, %xmm3 +#else + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm1, -28 * SIZE(AO) + movaps %xmm2, -24 * SIZE(AO) + movaps %xmm3, -20 * SIZE(AO) +#endif + + leaq (LDC, LDC, 2), %rax + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + movsd %xmm3, 0 * SIZE(CO2, LDC, 1) + movhps %xmm3, 2 * SIZE(CO2, LDC, 1) + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L41 + ALIGN_4 + +.L69: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + ALIGN_4 + +.L70: + testq $2, N + jle .L100 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + BRANCH + jle .L80 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L95 + ALIGN_3 + +.L92: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movsd -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movsd -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movsd -26 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L92 + addps %xmm9, %xmm8 + ALIGN_3 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_3 + +.L96: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movsd -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L96 + ALIGN_3 + +.L98: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + addps %xmm2, %xmm8 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BO), %xmm0 + + subps %xmm8, %xmm0 +#else + movsd -32 * SIZE(AO), %xmm0 + + subps %xmm8, %xmm0 +#endif + + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 + +#if defined(LN) || defined(LT) + movss -32 * SIZE(AO), %xmm8 + + mulss %xmm8, %xmm0 + mulss %xmm8, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movss %xmm0, -32 * SIZE(BO) + movss %xmm1, -31 * SIZE(BO) +#else + movss %xmm0, -32 * SIZE(AO) + movss %xmm1, -31 * SIZE(AO) +#endif + + movss %xmm0, (CO1) + movss %xmm1, (CO2) + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L80: + testq $2, M + BRANCH + jle .L90 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd -32 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L85 + ALIGN_3 + +.L82: + addps %xmm1, %xmm8 + movsd -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -30 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -28 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -26 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -24 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L82 + ALIGN_3 + +.L85: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_3 + +.L86: + addps %xmm1, %xmm8 + movsd -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L86 + ALIGN_3 + +.L88: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + addps %xmm1, %xmm8 + +#if defined(LN) || defined(LT) + pshufd $0xd8, %xmm8, %xmm8 + + movaps -32 * SIZE(BO), %xmm0 +#else + movaps -32 * SIZE(AO), %xmm0 +#endif + + subps %xmm8, %xmm0 + + movhlps %xmm0, %xmm1 + +#ifdef LN + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm0, -32 * SIZE(BO) + movlps %xmm1, -30 * SIZE(BO) + + unpcklps %xmm1, %xmm0 + + movlps %xmm0, (CO1) + movhps %xmm0, (CO2) +#else + movlps %xmm0, -32 * SIZE(AO) + movlps %xmm1, -30 * SIZE(AO) + + movsd %xmm0, (CO1) + movsd %xmm1, (CO2) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L90: + movq M, I + sarq $2, I + NOBRANCH + jle .L99 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd -32 * SIZE(BO), %xmm3 + + xorps %xmm8, %xmm8 + prefetcht2 -4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 -4 * SIZE(CO2) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_3 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -28 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -26 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -24 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L72 + ALIGN_3 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_3 + +.L78: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm4 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm2 + + subps %xmm8, %xmm0 + subps %xmm4, %xmm2 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 +#else + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm2 + + subps %xmm8, %xmm0 + subps %xmm9, %xmm2 +#endif + + +#ifdef LN + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm2 +#endif + +#ifdef RT + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm0, -32 * SIZE(BO) + movlps %xmm1, -30 * SIZE(BO) + movlps %xmm2, -28 * SIZE(BO) + movlps %xmm3, -26 * SIZE(BO) + + unpcklps %xmm1, %xmm0 + unpcklps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm2, 2 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + +#else + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm2, -28 * SIZE(AO) + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L71 + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L100: + testq $1, N + jle .L999 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + BRANCH + jle .L110 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm2, %xmm2 + movss -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L125 + ALIGN_3 + +.L122: + addss %xmm2, %xmm8 + movss -32 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -31 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -31 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -30 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -30 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -29 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -29 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -28 * SIZE(AO), %xmm0 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L122 + ALIGN_3 + +.L125: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L128 + ALIGN_3 + +.L126: + addss %xmm2, %xmm8 + movss -32 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -31 * SIZE(AO), %xmm0 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L126 + ALIGN_3 + +.L128: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + addss %xmm2, %xmm8 + +#if defined(LN) || defined(LT) + movss -32 * SIZE(BO), %xmm0 + + subss %xmm8, %xmm0 +#else + movss -32 * SIZE(AO), %xmm0 + + subss %xmm8, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss -32 * SIZE(AO), %xmm8 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BO), %xmm8 +#endif + + mulss %xmm8, %xmm0 + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm0, -32 * SIZE(BO) +#else + movss %xmm0, -32 * SIZE(AO) +#endif + + movss %xmm0, (CO1) + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + testq $2, M + BRANCH + jle .L120 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L115 + ALIGN_3 + +.L112: + addps %xmm1, %xmm8 + movss -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -31 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -30 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -29 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -24 * SIZE(AO), %xmm0 + + subq $-4 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L112 + ALIGN_3 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_3 + +.L116: + addps %xmm1, %xmm8 + movss -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L116 + ALIGN_3 + +.L118: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + addps %xmm1, %xmm8 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BO), %xmm0 + + subps %xmm8, %xmm0 + + pshufd $0x55, %xmm0, %xmm1 +#else + movsd -32 * SIZE(AO), %xmm0 + + subps %xmm8, %xmm0 +#endif + +#ifdef LN + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm1 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm1, %xmm0 + + movlps %xmm0, -32 * SIZE(BO) + + movlps %xmm0, 0 * SIZE(CO1) +#else + movlps %xmm0, -32 * SIZE(AO) + + movlps %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L120: + movq M, I + sarq $2, I + NOBRANCH + jle .L129 + ALIGN_4 + +.L101: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm3 + xorps %xmm8, %xmm8 + prefetcht2 -4 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L105 + ALIGN_3 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -31 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -29 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -28 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $ -4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L102 + ALIGN_3 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L108 + ALIGN_3 + +.L106: + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -31 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L106 + ALIGN_3 + +.L108: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + + addps %xmm1, %xmm8 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BO), %xmm0 + movhps -30 * SIZE(BO), %xmm0 + + subps %xmm8, %xmm0 + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 +#else + movaps -32 * SIZE(AO), %xmm0 + + subps %xmm8, %xmm0 +#endif + +#ifdef LN + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm0 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm0 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm3 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm3 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm3 + + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm1, %xmm0 + unpcklps %xmm3, %xmm2 + + movlps %xmm0, -32 * SIZE(BO) + movlps %xmm2, -30 * SIZE(BO) + + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm2, 2 * SIZE(CO1) +#else + movaps %xmm0, -32 * SIZE(AO) + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L101 + ALIGN_4 + +.L129: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LN_8x4_sse.S b/kernel/x86_64/trsm_kernel_LN_8x4_sse.S new file mode 100644 index 0000000..513572e --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LN_8x4_sse.S @@ -0,0 +1,5950 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define OFFSET 16(%rsp) +#define KK 24(%rsp) +#define KKK 32(%rsp) +#define AORIG 40(%rsp) +#define BORIG 48(%rsp) +#define BUFFER 128(%rsp) + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define movsd movlps +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifndef PREFETCH +#define PREFETCH prefetcht0 +#endif + +#ifndef PREFETCHW +#define PREFETCHW prefetcht0 +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + EMMS + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movsd %xmm4, OFFSET + movsd %xmm4, KK + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L50 + +.L01: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $2 + BASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + movaps 8 * SIZE(B), %xmm11 + movaps 12 * SIZE(B), %xmm15 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm8 + pshufd $0x55, %xmm11, %xmm9 + pshufd $0xaa, %xmm11, %xmm10 + pshufd $0xff, %xmm11, %xmm11 + + pshufd $0x00, %xmm15, %xmm12 + pshufd $0x55, %xmm15, %xmm13 + pshufd $0xaa, %xmm15, %xmm14 + pshufd $0xff, %xmm15, %xmm15 + + movaps %xmm8, 32 * SIZE(BO) + movaps %xmm9, 36 * SIZE(BO) + movaps %xmm10, 40 * SIZE(BO) + movaps %xmm11, 44 * SIZE(BO) + movaps %xmm12, 48 * SIZE(BO) + movaps %xmm13, 52 * SIZE(BO) + movaps %xmm14, 56 * SIZE(BO) + movaps %xmm15, 60 * SIZE(BO) + + addq $16 * SIZE, B + addq $64 * SIZE, BO + + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 4), C +#endif + + testq $1, M + je .L20 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (AO, %rax, SIZE), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + movss 32 * SIZE(BO), %xmm13 + movss 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L45 + ALIGN_4 + +.L42: + mulss %xmm8, %xmm9 + addss %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 4 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + addss %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + addss %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addss %xmm9, %xmm3 + movss 64 * SIZE(BO), %xmm9 + + mulss %xmm8, %xmm11 + addss %xmm11, %xmm0 + movss 20 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + addss %xmm11, %xmm1 + movss 24 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + addss %xmm11, %xmm2 + movss 28 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + movss 2 * SIZE(AO), %xmm8 + addss %xmm11, %xmm3 + movss 80 * SIZE(BO), %xmm11 + + mulss %xmm8, %xmm13 + addss %xmm13, %xmm0 + movss 36 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + addss %xmm13, %xmm1 + movss 40 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + addss %xmm13, %xmm2 + movss 44 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + movss 3 * SIZE(AO), %xmm8 + addss %xmm13, %xmm3 + movss 96 * SIZE(BO), %xmm13 + + mulss %xmm8, %xmm15 + addss %xmm15, %xmm0 + movss 52 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + addss %xmm15, %xmm1 + movss 56 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + addss %xmm15, %xmm2 + movss 60 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + movss 8 * SIZE(AO), %xmm8 + addss %xmm15, %xmm3 + movss 112 * SIZE(BO), %xmm15 + + mulss %xmm10, %xmm9 + addss %xmm9, %xmm0 + movss 68 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + addss %xmm9, %xmm1 + movss 72 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + addss %xmm9, %xmm2 + movss 76 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + movss 5 * SIZE(AO), %xmm10 + addss %xmm9, %xmm3 + movss 128 * SIZE(BO), %xmm9 + + mulss %xmm10, %xmm11 + addss %xmm11, %xmm0 + movss 84 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + addss %xmm11, %xmm1 + movss 88 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + addss %xmm11, %xmm2 + movss 92 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + movss 6 * SIZE(AO), %xmm10 + addss %xmm11, %xmm3 + movss 144 * SIZE(BO), %xmm11 + + mulss %xmm10, %xmm13 + addss %xmm13, %xmm0 + movss 100 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + addss %xmm13, %xmm1 + movss 104 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + addss %xmm13, %xmm2 + movss 108 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + movss 7 * SIZE(AO), %xmm10 + addss %xmm13, %xmm3 + movss 160 * SIZE(BO), %xmm13 + + mulss %xmm10, %xmm15 + addss %xmm15, %xmm0 + movss 116 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + addss %xmm15, %xmm1 + movss 120 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + addss %xmm15, %xmm2 + movss 124 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + movss 12 * SIZE(AO), %xmm10 + addss %xmm15, %xmm3 + movss 176 * SIZE(BO), %xmm15 + + addq $ 8 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_4 + +.L46: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movss 16 * SIZE(BO), %xmm9 + + addq $ 1 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L46 + ALIGN_4 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm2, %xmm0 + unpcklps %xmm3, %xmm1 + + unpcklps %xmm1, %xmm0 + + movapd 0 * SIZE(B), %xmm1 + subps %xmm0, %xmm1 +#else + movss 0 * SIZE(AO), %xmm8 + movss 1 * SIZE(AO), %xmm10 + movss 2 * SIZE(AO), %xmm12 + movss 3 * SIZE(AO), %xmm14 + + subss %xmm0, %xmm8 + subss %xmm1, %xmm10 + subss %xmm2, %xmm12 + subss %xmm3, %xmm14 +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm8, %xmm2 + subss %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm8, %xmm2 + subss %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm8, %xmm2 + subss %xmm2, %xmm14 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm10, %xmm2 + subss %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm10, %xmm2 + subss %xmm2, %xmm14 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm12, %xmm2 + subss %xmm2, %xmm14 + + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm2, %xmm14 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm2, %xmm14 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm14, %xmm2 + subss %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm14, %xmm2 + subss %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm14, %xmm2 + subss %xmm2, %xmm8 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm12, %xmm2 + subss %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm12, %xmm2 + subss %xmm2, %xmm8 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm10, %xmm2 + subss %xmm2, %xmm8 + + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm2, %xmm8 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm6, 12 * SIZE(BO) +#else + movss %xmm8, 0 * SIZE(AO) + movss %xmm10, 1 * SIZE(AO) + movss %xmm12, 2 * SIZE(AO) + movss %xmm14, 3 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm10, %xmm1 + unpckhps %xmm10, %xmm0 + + movaps %xmm5, %xmm7 + unpcklps %xmm11, %xmm5 + unpckhps %xmm11, %xmm7 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movaps %xmm0, %xmm11 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm11 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm10, 0 * SIZE(CO2) + movss %xmm0, 0 * SIZE(CO1, LDC, 2) + movss %xmm11, 0 * SIZE(CO2, LDC, 2) +#else + movss %xmm8, 0 * SIZE(CO1) + movss %xmm10, 0 * SIZE(CO2) + movss %xmm12, 0 * SIZE(CO1, LDC, 2) + movss %xmm14, 0 * SIZE(CO2, LDC, 2) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L20: + testq $2, M + je .L30 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 8 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm2 + movaps 76 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm9, %xmm3 + movaps 128 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movaps 92 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movaps 144 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 108 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 160 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 124 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 176 * SIZE(BO), %xmm15 + + addq $ 16 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 16 * SIZE(BO), %xmm9 + + addq $ 2 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $1 + BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm2, %xmm0 + unpcklps %xmm3, %xmm1 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movapd 0 * SIZE(B), %xmm1 + movapd 4 * SIZE(B), %xmm5 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 +#else +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd 0 * SIZE(AO), %xmm8 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 2 * SIZE(AO), %xmm10 +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd 4 * SIZE(AO), %xmm12 +#ifdef movsd + xorps %xmm14, %xmm14 +#endif + movsd 6 * SIZE(AO), %xmm14 + + subps %xmm0, %xmm8 + subps %xmm1, %xmm10 + subps %xmm2, %xmm12 + subps %xmm3, %xmm14 +#endif + +#ifdef LN + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm5 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm14 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm14 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm14 + + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm8 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm8 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm5, 4 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm6, 12 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm6, 28 * SIZE(BO) +#else + movlps %xmm8, 0 * SIZE(AO) + movlps %xmm10, 2 * SIZE(AO) + movlps %xmm12, 4 * SIZE(AO) + movlps %xmm14, 6 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm10, %xmm1 + unpckhps %xmm10, %xmm0 + + movaps %xmm5, %xmm7 + unpcklps %xmm11, %xmm5 + unpckhps %xmm11, %xmm7 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movaps %xmm0, %xmm11 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm11 + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO2) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movlps %xmm11, 0 * SIZE(CO2, LDC, 2) +#else + movlps %xmm8, 0 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO2) + movlps %xmm12, 0 * SIZE(CO1, LDC, 2) + movlps %xmm14, 0 * SIZE(CO2, LDC, 2) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $4, M + je .L40 + +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 4 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + mulps 44 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps 12 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + mulps 60 * SIZE(BO), %xmm8 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 76 * SIZE(BO), %xmm10 + addps %xmm9, %xmm2 + movaps 128 * SIZE(BO), %xmm9 + addps %xmm10, %xmm3 + movaps 20 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + mulps 92 * SIZE(BO), %xmm10 + addps %xmm11, %xmm2 + movaps 144 * SIZE(BO), %xmm11 + addps %xmm10, %xmm3 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + mulps 108 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 160 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps 28 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + mulps 124 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 176 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $ 32 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 4 * SIZE(AO), %xmm8 + + addq $ 4 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $2 + BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm5 + movaps 8 * SIZE(B), %xmm10 + movaps 12 * SIZE(B), %xmm11 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 + subps %xmm8, %xmm10 + subps %xmm3, %xmm11 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm10 + movaps 8 * SIZE(AO), %xmm12 + movaps 12 * SIZE(AO), %xmm14 + + subps %xmm0, %xmm8 + subps %xmm1, %xmm10 + subps %xmm2, %xmm12 + subps %xmm3, %xmm14 +#endif + +#ifdef LN + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm1 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm11 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm14 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm14 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm14 + + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm8 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm8 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm5, 4 * SIZE(B) + movaps %xmm10, 8 * SIZE(B) + movaps %xmm11, 12 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm6, 12 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm6, 28 * SIZE(BO) + + pshufd $0x00, %xmm10, %xmm2 + pshufd $0x55, %xmm10, %xmm3 + pshufd $0xaa, %xmm10, %xmm4 + pshufd $0xff, %xmm10, %xmm6 + movaps %xmm2, 32 * SIZE(BO) + movaps %xmm3, 36 * SIZE(BO) + movaps %xmm4, 40 * SIZE(BO) + movaps %xmm6, 44 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm2 + pshufd $0x55, %xmm11, %xmm3 + pshufd $0xaa, %xmm11, %xmm4 + pshufd $0xff, %xmm11, %xmm6 + movaps %xmm2, 48 * SIZE(BO) + movaps %xmm3, 52 * SIZE(BO) + movaps %xmm4, 56 * SIZE(BO) + movaps %xmm6, 60 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm10, 4 * SIZE(AO) + movaps %xmm12, 8 * SIZE(AO) + movaps %xmm14, 12 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm10, %xmm1 + unpckhps %xmm10, %xmm0 + + movaps %xmm5, %xmm7 + unpcklps %xmm11, %xmm5 + unpckhps %xmm11, %xmm7 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movaps %xmm0, %xmm11 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm11 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 2 * SIZE(CO1, LDC, 2) + movlps %xmm11, 0 * SIZE(CO2, LDC, 2) + movhps %xmm11, 2 * SIZE(CO2, LDC, 2) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + + movlps %xmm12, 0 * SIZE(CO1, LDC, 2) + movhps %xmm12, 2 * SIZE(CO1, LDC, 2) + movlps %xmm14, 0 * SIZE(CO2, LDC, 2) + movhps %xmm14, 2 * SIZE(CO2, LDC, 2) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L40: + movq M, I + sarq $3, I # i = (m >> 3) + jle .L49 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(BO), %xmm9 + movaps 4 * SIZE(BO), %xmm11 + movaps 8 * SIZE(BO), %xmm13 + movaps 16 * SIZE(BO), %xmm15 + + movaps 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movaps 4 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movaps 8 * SIZE(AO), %xmm12 + pxor %xmm2, %xmm2 + movaps 12 * SIZE(AO), %xmm14 + pxor %xmm3, %xmm3 + + PREFETCHW -8 * SIZE(CO1) + pxor %xmm4, %xmm4 + PREFETCHW -8 * SIZE(CO2) + pxor %xmm5, %xmm5 + PREFETCHW -8 * SIZE(CO1, LDC, 2) + pxor %xmm6, %xmm6 + PREFETCHW -8 * SIZE(CO2, LDC, 2) + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 4 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm13 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 8 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps 16 * SIZE(AO), %xmm8 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movaps 32 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm5 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm13 + mulps 12 * SIZE(BO), %xmm10 + addps %xmm13, %xmm6 + movaps 24 * SIZE(BO), %xmm13 + addps %xmm10, %xmm7 + movaps 20 * SIZE(AO), %xmm10 + mulps %xmm12, %xmm15 + addps %xmm15, %xmm0 + movaps 16 * SIZE(BO), %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm1 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm13 + mulps 28 * SIZE(BO), %xmm12 + addps %xmm13, %xmm2 + movaps 24 * SIZE(BO), %xmm13 + addps %xmm12, %xmm3 + movaps 24 * SIZE(AO), %xmm12 + mulps %xmm14, %xmm15 + addps %xmm15, %xmm4 + movaps 48 * SIZE(BO), %xmm15 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm5 + movaps 36 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm13 + mulps 28 * SIZE(BO), %xmm14 + addps %xmm13, %xmm6 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm14, %xmm7 + movaps 28 * SIZE(AO), %xmm14 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 32 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 36 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm13 + mulps 44 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movaps 64 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm5 + movaps 52 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm13 + mulps 44 * SIZE(BO), %xmm10 + addps %xmm13, %xmm6 + movaps 56 * SIZE(BO), %xmm13 + addps %xmm10, %xmm7 + movaps 36 * SIZE(AO), %xmm10 + mulps %xmm12, %xmm15 + addps %xmm15, %xmm0 + movaps 48 * SIZE(BO), %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm1 + movaps 52 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm13 + mulps 60 * SIZE(BO), %xmm12 + addps %xmm13, %xmm2 + movaps 56 * SIZE(BO), %xmm13 + addps %xmm12, %xmm3 + movaps 40 * SIZE(AO), %xmm12 + mulps %xmm14, %xmm15 + addps %xmm15, %xmm4 + movaps 80 * SIZE(BO), %xmm15 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm5 + movaps 68 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm13 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm13, %xmm6 + movaps 72 * SIZE(BO), %xmm13 + addps %xmm14, %xmm7 + movaps 44 * SIZE(AO), %xmm14 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jg .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 +.L16: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm5 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 12 * SIZE(BO), %xmm10 + addps %xmm9, %xmm6 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm10, %xmm7 + movaps 12 * SIZE(AO), %xmm10 + + addq $8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $8, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $2 + BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movaps %xmm4, %xmm9 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm9 + + movaps %xmm5, %xmm14 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm14 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm9, %xmm7 + unpcklps %xmm14, %xmm9 + unpckhps %xmm14, %xmm7 + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm5 + movaps 8 * SIZE(B), %xmm10 + movaps 12 * SIZE(B), %xmm11 + movaps 16 * SIZE(B), %xmm12 + movaps 20 * SIZE(B), %xmm13 + movaps 24 * SIZE(B), %xmm14 + movaps 28 * SIZE(B), %xmm15 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 + subps %xmm8, %xmm10 + subps %xmm3, %xmm11 + subps %xmm4, %xmm12 + subps %xmm6, %xmm13 + subps %xmm9, %xmm14 + subps %xmm7, %xmm15 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm9 + movaps 8 * SIZE(AO), %xmm10 + movaps 12 * SIZE(AO), %xmm11 + + movaps 16 * SIZE(AO), %xmm12 + movaps 20 * SIZE(AO), %xmm13 + movaps 24 * SIZE(AO), %xmm14 + movaps 28 * SIZE(AO), %xmm15 + + subps %xmm0, %xmm8 + subps %xmm4, %xmm9 + subps %xmm1, %xmm10 + subps %xmm5, %xmm11 + subps %xmm2, %xmm12 + subps %xmm6, %xmm13 + subps %xmm3, %xmm14 + subps %xmm7, %xmm15 +#endif + +#ifdef LN + movaps 60 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm15 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm12 + + movaps 56 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm1 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm12 + + movaps 48 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm1 + + movaps 44 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm12 + + movaps 40 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm1 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm12 + + movaps 32 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm1 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm1 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm15 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm15 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm11 + + movaps 20 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm15 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + + movaps 28 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm15 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm12 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm15 + + movaps 44 * SIZE(AO), %xmm7 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm15 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm15 + + movaps 60 * SIZE(AO), %xmm7 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm8, %xmm15 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm9, %xmm2 + subps %xmm2, %xmm11 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm9, %xmm2 + subps %xmm2, %xmm13 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm14 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm9, %xmm2 + subps %xmm2, %xmm15 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + mulps %xmm2, %xmm11 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm11, %xmm2 + subps %xmm2, %xmm13 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm14 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm11, %xmm2 + subps %xmm2, %xmm15 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + mulps %xmm2, %xmm13 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm14 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm13, %xmm2 + subps %xmm2, %xmm15 + + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 + mulps %xmm2, %xmm15 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 + mulps %xmm2, %xmm15 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm15, %xmm2 + subps %xmm2, %xmm13 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm15, %xmm2 + subps %xmm2, %xmm11 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm15, %xmm2 + subps %xmm2, %xmm9 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + mulps %xmm2, %xmm13 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm13, %xmm2 + subps %xmm2, %xmm11 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm13, %xmm2 + subps %xmm2, %xmm9 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + mulps %xmm2, %xmm11 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm11, %xmm2 + subps %xmm2, %xmm9 + + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 +#endif + +#ifdef LN + subq $8 * SIZE, CO1 + subq $8 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm5, 4 * SIZE(B) + movaps %xmm10, 8 * SIZE(B) + movaps %xmm11, 12 * SIZE(B) + movaps %xmm12, 16 * SIZE(B) + movaps %xmm13, 20 * SIZE(B) + movaps %xmm14, 24 * SIZE(B) + movaps %xmm15, 28 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm6, 12 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm6, 28 * SIZE(BO) + + pshufd $0x00, %xmm10, %xmm2 + pshufd $0x55, %xmm10, %xmm3 + pshufd $0xaa, %xmm10, %xmm4 + pshufd $0xff, %xmm10, %xmm6 + movaps %xmm2, 32 * SIZE(BO) + movaps %xmm3, 36 * SIZE(BO) + movaps %xmm4, 40 * SIZE(BO) + movaps %xmm6, 44 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm2 + pshufd $0x55, %xmm11, %xmm3 + pshufd $0xaa, %xmm11, %xmm4 + pshufd $0xff, %xmm11, %xmm6 + movaps %xmm2, 48 * SIZE(BO) + movaps %xmm3, 52 * SIZE(BO) + movaps %xmm4, 56 * SIZE(BO) + movaps %xmm6, 60 * SIZE(BO) + + pshufd $0x00, %xmm12, %xmm2 + pshufd $0x55, %xmm12, %xmm3 + pshufd $0xaa, %xmm12, %xmm4 + pshufd $0xff, %xmm12, %xmm6 + movaps %xmm2, 64 * SIZE(BO) + movaps %xmm3, 68 * SIZE(BO) + movaps %xmm4, 72 * SIZE(BO) + movaps %xmm6, 76 * SIZE(BO) + + pshufd $0x00, %xmm13, %xmm2 + pshufd $0x55, %xmm13, %xmm3 + pshufd $0xaa, %xmm13, %xmm4 + pshufd $0xff, %xmm13, %xmm6 + movaps %xmm2, 80 * SIZE(BO) + movaps %xmm3, 84 * SIZE(BO) + movaps %xmm4, 88 * SIZE(BO) + movaps %xmm6, 92 * SIZE(BO) + + pshufd $0x00, %xmm14, %xmm2 + pshufd $0x55, %xmm14, %xmm3 + pshufd $0xaa, %xmm14, %xmm4 + pshufd $0xff, %xmm14, %xmm6 + movaps %xmm2, 96 * SIZE(BO) + movaps %xmm3, 100 * SIZE(BO) + movaps %xmm4, 104 * SIZE(BO) + movaps %xmm6, 108 * SIZE(BO) + + pshufd $0x00, %xmm15, %xmm2 + pshufd $0x55, %xmm15, %xmm3 + pshufd $0xaa, %xmm15, %xmm4 + pshufd $0xff, %xmm15, %xmm6 + movaps %xmm2, 112 * SIZE(BO) + movaps %xmm3, 116 * SIZE(BO) + movaps %xmm4, 120 * SIZE(BO) + movaps %xmm6, 124 * SIZE(BO) + +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm9, 4 * SIZE(AO) + movaps %xmm10, 8 * SIZE(AO) + movaps %xmm11, 12 * SIZE(AO) + movaps %xmm12, 16 * SIZE(AO) + movaps %xmm13, 20 * SIZE(AO) + movaps %xmm14, 24 * SIZE(AO) + movaps %xmm15, 28 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm10, %xmm1 + unpckhps %xmm10, %xmm0 + + movaps %xmm5, %xmm7 + unpcklps %xmm11, %xmm5 + unpckhps %xmm11, %xmm7 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movaps %xmm0, %xmm11 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm11 + + movaps %xmm12, %xmm2 + unpcklps %xmm14, %xmm12 + unpckhps %xmm14, %xmm2 + + movaps %xmm13, %xmm7 + unpcklps %xmm15, %xmm13 + unpckhps %xmm15, %xmm7 + + movaps %xmm12, %xmm14 + unpcklps %xmm13, %xmm12 + unpckhps %xmm13, %xmm14 + + movaps %xmm2, %xmm15 + unpcklps %xmm7, %xmm2 + unpckhps %xmm7, %xmm15 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + + movlps %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + movlps %xmm14, 4 * SIZE(CO2) + movhps %xmm14, 6 * SIZE(CO2) + + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 2 * SIZE(CO1, LDC, 2) + movlps %xmm2, 4 * SIZE(CO1, LDC, 2) + movhps %xmm2, 6 * SIZE(CO1, LDC, 2) + + movlps %xmm11, 0 * SIZE(CO2, LDC, 2) + movhps %xmm11, 2 * SIZE(CO2, LDC, 2) + movlps %xmm15, 4 * SIZE(CO2, LDC, 2) + movhps %xmm15, 6 * SIZE(CO2, LDC, 2) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm9, 4 * SIZE(CO1) + movhps %xmm9, 6 * SIZE(CO1) + + movlps %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + movlps %xmm11, 4 * SIZE(CO2) + movhps %xmm11, 6 * SIZE(CO2) + + movlps %xmm12, 0 * SIZE(CO1, LDC, 2) + movhps %xmm12, 2 * SIZE(CO1, LDC, 2) + movlps %xmm13, 4 * SIZE(CO1, LDC, 2) + movhps %xmm13, 6 * SIZE(CO1, LDC, 2) + + movlps %xmm14, 0 * SIZE(CO2, LDC, 2) + movhps %xmm14, 2 * SIZE(CO2, LDC, 2) + movlps %xmm15, 4 * SIZE(CO2, LDC, 2) + movhps %xmm15, 6 * SIZE(CO2, LDC, 2) +#endif + +#ifndef LN + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#ifdef LT + addq $32 * SIZE, B +#endif +#endif + +#ifdef LN + subq $8, KK + movq BORIG, B +#endif + +#ifdef LT + addq $8, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $3 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L49: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq J # j -- + jg .L01 + +.L50: + testq $2, N + je .L100 + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $1 + BASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L53 + ALIGN_4 + +.L52: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L52 + ALIGN_4 + +.L53: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L60 + ALIGN_4 + +.L54: + movsd 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + addq $2 * SIZE, B + addq $8 * SIZE, BO + decq %rax + jne .L54 + ALIGN_4 + +.L60: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + leaq (C, LDC, 2), C +#endif + + testq $1, M + je .L70 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (AO, %rax, SIZE), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + movss 32 * SIZE(BO), %xmm13 + movss 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movss 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movss 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movss 3 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movss 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movss 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movss 8 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movss 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movss 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movss 5 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movss 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movss 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movss 6 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movss 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movss 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movss 7 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movss 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movss 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movss 12 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movss 112 * SIZE(BO), %xmm15 + + addq $ 8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L98: + addss %xmm2, %xmm0 + addss %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm1, %xmm0 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 + subps %xmm0, %xmm1 +#else + movss 0 * SIZE(AO), %xmm8 + movss 1 * SIZE(AO), %xmm10 + subss %xmm0, %xmm8 + subss %xmm1, %xmm10 +#endif + +#if defined(LN) || defined(LT) + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm8, %xmm2 + subss %xmm2, %xmm10 + + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm2, %xmm10 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm10, %xmm2 + subss %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm2, %xmm8 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) +#else + movss %xmm8, 0 * SIZE(AO) + movss %xmm10, 1 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm10, 0 * SIZE(CO1, LDC, 1) +#else + movss %xmm8, 0 * SIZE(CO1) + movss %xmm10, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (AO, %rax, SIZE), AO +#ifdef LT + addq $ 2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + testq $2, M + je .L80 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 8 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L85 + ALIGN_4 + +.L82: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_4 + +.L86: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L86 + ALIGN_4 + +.L88: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $1 + BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm2, %xmm0 + unpcklps %xmm3, %xmm1 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(B), %xmm5 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 +#else +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd 0 * SIZE(AO), %xmm8 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 2 * SIZE(AO), %xmm10 + + subps %xmm0, %xmm8 + subps %xmm1, %xmm10 +#endif + +#ifdef LN + movaps 0 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm5 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm5, 2 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) +#else + movlps %xmm8, 0 * SIZE(AO) + movlps %xmm10, 2 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $ 4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L80: + testq $4, M + je .L90 + +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 20 * SIZE(BO), %xmm8 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm8, %xmm1 + movaps 12 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + + mulps %xmm10, %xmm13 + mulps 36 * SIZE(BO), %xmm10 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm10, %xmm1 + movaps 20 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + mulps 44 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 52 * SIZE(BO), %xmm10 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm10, %xmm1 + movaps 28 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 60 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $1 + BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(B), %xmm5 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 4 * SIZE(B), %xmm10 +#ifdef movsd + xorps %xmm11, %xmm11 +#endif + movsd 6 * SIZE(B), %xmm11 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 + subps %xmm8, %xmm10 + subps %xmm3, %xmm11 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm10 + + subps %xmm0, %xmm8 + subps %xmm1, %xmm10 +#endif + +#ifdef LN + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm1 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm11 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm5, 2 * SIZE(B) + movlps %xmm10, 4 * SIZE(B) + movlps %xmm11, 6 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + pshufd $0x00, %xmm10, %xmm2 + pshufd $0x55, %xmm10, %xmm3 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm2 + pshufd $0x55, %xmm11, %xmm3 + movaps %xmm2, 24 * SIZE(BO) + movaps %xmm3, 28 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm10, 4 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) + movhps %xmm10, 2 * SIZE(CO1, LDC, 1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) + movhps %xmm10, 2 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $ 8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L90: + movq M, I + sarq $3, I # i = (m >> 3) + jle .L99 + ALIGN_4 + +.L61: +#ifdef LN + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + PREFETCHW -8 * SIZE(CO1) + pxor %xmm4, %xmm4 + PREFETCHW -8 * SIZE(CO2) + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 12 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 64 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 16 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps 20 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps 28 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 80 * SIZE(AO), %xmm10 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 32 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 36 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 40 * SIZE(AO), %xmm12 + + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 44 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + addps %xmm12, %xmm5 + movaps 96 * SIZE(BO), %xmm13 + movaps 96 * SIZE(AO), %xmm12 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 48 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 52 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 56 * SIZE(AO), %xmm14 + + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 60 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 112 * SIZE(AO), %xmm14 + + addq $64 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 8 * SIZE(AO), %xmm8 + + addq $8 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $8, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $1 + BASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movaps %xmm4, %xmm9 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm9 + + movaps %xmm5, %xmm14 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm14 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm9, %xmm7 + unpcklps %xmm14, %xmm9 + unpckhps %xmm14, %xmm7 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(B), %xmm5 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 4 * SIZE(B), %xmm10 +#ifdef movsd + xorps %xmm11, %xmm11 +#endif + movsd 6 * SIZE(B), %xmm11 +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd 8 * SIZE(B), %xmm12 +#ifdef movsd + xorps %xmm13, %xmm13 +#endif + movsd 10 * SIZE(B), %xmm13 +#ifdef movsd + xorps %xmm14, %xmm14 +#endif + movsd 12 * SIZE(B), %xmm14 +#ifdef movsd + xorps %xmm15, %xmm15 +#endif + movsd 14 * SIZE(B), %xmm15 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 + subps %xmm8, %xmm10 + subps %xmm3, %xmm11 + subps %xmm4, %xmm12 + subps %xmm6, %xmm13 + subps %xmm9, %xmm14 + subps %xmm7, %xmm15 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm9 + movaps 8 * SIZE(AO), %xmm10 + movaps 12 * SIZE(AO), %xmm11 + + subps %xmm0, %xmm8 + subps %xmm4, %xmm9 + subps %xmm1, %xmm10 + subps %xmm5, %xmm11 +#endif + +#ifdef LN + movaps 60 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm15 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm12 + + movaps 56 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm1 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm12 + + movaps 48 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm1 + + movaps 44 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm12 + + movaps 40 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm1 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm12 + + movaps 32 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm1 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm1 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm15 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm15 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm11 + + movaps 20 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm15 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + + movaps 28 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm15 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm12 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm15 + + movaps 44 * SIZE(AO), %xmm7 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm15 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm15 + + movaps 60 * SIZE(AO), %xmm7 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm8, %xmm15 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm9, %xmm2 + subps %xmm2, %xmm11 + + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + mulps %xmm2, %xmm11 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + mulps %xmm2, %xmm11 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm11, %xmm2 + subps %xmm2, %xmm9 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 +#endif + +#ifdef LN + subq $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm5, 2 * SIZE(B) + movlps %xmm10, 4 * SIZE(B) + movlps %xmm11, 6 * SIZE(B) + movlps %xmm12, 8 * SIZE(B) + movlps %xmm13, 10 * SIZE(B) + movlps %xmm14, 12 * SIZE(B) + movlps %xmm15, 14 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + pshufd $0x00, %xmm10, %xmm2 + pshufd $0x55, %xmm10, %xmm3 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm2 + pshufd $0x55, %xmm11, %xmm3 + movaps %xmm2, 24 * SIZE(BO) + movaps %xmm3, 28 * SIZE(BO) + + pshufd $0x00, %xmm12, %xmm2 + pshufd $0x55, %xmm12, %xmm3 + movaps %xmm2, 32 * SIZE(BO) + movaps %xmm3, 36 * SIZE(BO) + + pshufd $0x00, %xmm13, %xmm2 + pshufd $0x55, %xmm13, %xmm3 + movaps %xmm2, 40 * SIZE(BO) + movaps %xmm3, 44 * SIZE(BO) + + pshufd $0x00, %xmm14, %xmm2 + pshufd $0x55, %xmm14, %xmm3 + movaps %xmm2, 48 * SIZE(BO) + movaps %xmm3, 52 * SIZE(BO) + + pshufd $0x00, %xmm15, %xmm2 + pshufd $0x55, %xmm15, %xmm3 + movaps %xmm2, 56 * SIZE(BO) + movaps %xmm3, 60 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm9, 4 * SIZE(AO) + movaps %xmm10, 8 * SIZE(AO) + movaps %xmm11, 12 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + unpcklps %xmm14, %xmm12 + unpcklps %xmm15, %xmm13 + + movaps %xmm12, %xmm14 + unpcklps %xmm13, %xmm12 + unpckhps %xmm13, %xmm14 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) + movhps %xmm10, 2 * SIZE(CO1, LDC, 1) + movlps %xmm14, 4 * SIZE(CO1, LDC, 1) + movhps %xmm14, 6 * SIZE(CO1, LDC, 1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm9, 4 * SIZE(CO1) + movhps %xmm9, 6 * SIZE(CO1) + + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) + movhps %xmm10, 2 * SIZE(CO1, LDC, 1) + movlps %xmm11, 4 * SIZE(CO1, LDC, 1) + movhps %xmm11, 6 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addq $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $8, KK + movq BORIG, B +#endif + +#ifdef LT + addq $8, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $3 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L61 + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L100: + testq $1, N + je .L999 + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $BASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + jle .L103 + ALIGN_4 + +.L102: + movsd 0 * SIZE(B), %xmm3 + movhps 2 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm7 + movhps 6 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L102 + ALIGN_4 + +.L103: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax + BRANCH + jle .L110 + ALIGN_4 + +.L104: + movss 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + + movaps %xmm0, 0 * SIZE(BO) + + addq $ 1 * SIZE, B + addq $ 4 * SIZE, BO + decq %rax + jne .L104 + ALIGN_4 + +.L110: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + + testq $1, M + je .L120 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (AO, %rax, SIZE), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L145 + ALIGN_4 + +.L142: + mulss %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 1 * SIZE(AO), %xmm8 + mulss 4 * SIZE(BO), %xmm8 + addss %xmm9, %xmm0 + movss 32 * SIZE(BO), %xmm9 + addss %xmm8, %xmm1 + movss 2 * SIZE(AO), %xmm8 + mulss 8 * SIZE(BO), %xmm8 + addss %xmm8, %xmm2 + movss 3 * SIZE(AO), %xmm8 + mulss 12 * SIZE(BO), %xmm8 + addss %xmm8, %xmm3 + movss 8 * SIZE(AO), %xmm8 + mulss %xmm10, %xmm11 + movss 5 * SIZE(AO), %xmm10 + mulss 20 * SIZE(BO), %xmm10 + addss %xmm11, %xmm0 + movss 48 * SIZE(BO), %xmm11 + addss %xmm10, %xmm1 + movss 6 * SIZE(AO), %xmm10 + mulss 24 * SIZE(BO), %xmm10 + addss %xmm10, %xmm2 + movss 7 * SIZE(AO), %xmm10 + mulss 28 * SIZE(BO), %xmm10 + addss %xmm10, %xmm3 + movss 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L142 + ALIGN_4 + +.L145: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L148 + ALIGN_4 + +.L146: + mulss %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addss %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L146 + ALIGN_4 + +.L148: + addss %xmm1, %xmm0 + addss %xmm3, %xmm2 + addss %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(B), %xmm1 + subss %xmm0, %xmm1 +#else + movss 0 * SIZE(AO), %xmm8 + subps %xmm0, %xmm8 +#endif + +#if defined(LN) || defined(LT) + mulss 0 * SIZE(AO), %xmm1 +#endif + +#if defined(RN) || defined(RT) + mulss 0 * SIZE(B), %xmm8 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + movaps %xmm2, 0 * SIZE(BO) +#else + movss %xmm8, 0 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) +#else + movss %xmm8, 0 * SIZE(CO1) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $1 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L120: + testq $2, M + je .L130 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 8 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L135 + ALIGN_4 + +.L132: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 12 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 32 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movaps 28 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movaps 48 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L132 + ALIGN_4 + +.L135: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L138 + ALIGN_4 + +.L136: + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L136 + ALIGN_4 + +.L138: + addps %xmm1, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm2, %xmm0 + unpcklps %xmm3, %xmm1 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm5 + + subss %xmm0, %xmm1 + subss %xmm2, %xmm5 +#else +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd 0 * SIZE(AO), %xmm8 + + subps %xmm0, %xmm8 +#endif + +#ifdef LN + movaps 0 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm1 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm5 + + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm5 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm5, 1 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + movaps %xmm2, 0 * SIZE(BO) + pshufd $0x00, %xmm5, %xmm2 + movaps %xmm2, 4 * SIZE(BO) +#else + movlps %xmm8, 0 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + unpcklps %xmm5, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) +#else + movlps %xmm8, 0 * SIZE(CO1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L130: + testq $4, M + je .L140 + +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L125 + ALIGN_4 + +.L122: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(AO), %xmm8 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 32 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 8 * SIZE(AO), %xmm8 + mulps 8 * SIZE(BO), %xmm8 + addps %xmm8, %xmm2 + movaps 12 * SIZE(AO), %xmm8 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + movaps 20 * SIZE(AO), %xmm10 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 48 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps 24 * SIZE(AO), %xmm10 + mulps 24 * SIZE(BO), %xmm10 + addps %xmm10, %xmm2 + movaps 28 * SIZE(AO), %xmm10 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L122 + ALIGN_4 + +.L125: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L128 + ALIGN_4 + +.L126: + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L126 + ALIGN_4 + +.L128: + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm5 + movss 2 * SIZE(B), %xmm10 + movss 3 * SIZE(B), %xmm11 + + subss %xmm0, %xmm1 + subss %xmm2, %xmm5 + subss %xmm8, %xmm10 + subss %xmm3, %xmm11 +#else + movaps 0 * SIZE(AO), %xmm8 + + subps %xmm0, %xmm8 +#endif + +#ifdef LN + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm1 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm11 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm11 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm5, 1 * SIZE(B) + movss %xmm10, 2 * SIZE(B) + movss %xmm11, 3 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + movaps %xmm2, 0 * SIZE(BO) + pshufd $0x00, %xmm5, %xmm2 + movaps %xmm2, 4 * SIZE(BO) + pshufd $0x00, %xmm10, %xmm2 + movaps %xmm2, 8 * SIZE(BO) + pshufd $0x00, %xmm11, %xmm2 + movaps %xmm2, 12 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + unpcklps %xmm5, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L140: + movq M, I + sarq $3, I # i = (m >> 3) + jle .L149 + ALIGN_4 + +.L111: +#ifdef LN + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + PREFETCHW -8 * SIZE(CO1) + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulps %xmm9, %xmm8 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps 4 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + + mulps %xmm9, %xmm8 + mulps 12 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps 64 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm9, %xmm10 + mulps 20 * SIZE(AO), %xmm9 + addps %xmm10, %xmm0 + movaps 24 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movaps 12 * SIZE(BO), %xmm9 + + mulps %xmm9, %xmm10 + mulps 28 * SIZE(AO), %xmm9 + addps %xmm10, %xmm0 + movaps 80 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movaps 32 * SIZE(BO), %xmm9 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm11, %xmm12 + mulps 36 * SIZE(AO), %xmm11 + addps %xmm12, %xmm0 + movaps 40 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movaps 20 * SIZE(BO), %xmm11 + + mulps %xmm11, %xmm12 + mulps 44 * SIZE(AO), %xmm11 + addps %xmm12, %xmm0 + movaps 96 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm11, %xmm14 + mulps 52 * SIZE(AO), %xmm11 + addps %xmm14, %xmm0 + movaps 56 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movaps 28 * SIZE(BO), %xmm11 + + mulps %xmm11, %xmm14 + mulps 60 * SIZE(AO), %xmm11 + addps %xmm14, %xmm0 + movaps 112 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movaps 48 * SIZE(BO), %xmm11 + + addq $64 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulps %xmm9, %xmm8 + mulps 4 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + + addq $8 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $8, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 8), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movaps %xmm4, %xmm9 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm9 + + movaps %xmm5, %xmm14 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm14 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm9, %xmm7 + unpcklps %xmm14, %xmm9 + unpckhps %xmm14, %xmm7 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm5 + movss 2 * SIZE(B), %xmm10 + movss 3 * SIZE(B), %xmm11 + movss 4 * SIZE(B), %xmm12 + movss 5 * SIZE(B), %xmm13 + movss 6 * SIZE(B), %xmm14 + movss 7 * SIZE(B), %xmm15 + + subss %xmm0, %xmm1 + subss %xmm2, %xmm5 + subss %xmm8, %xmm10 + subss %xmm3, %xmm11 + subss %xmm4, %xmm12 + subss %xmm6, %xmm13 + subss %xmm9, %xmm14 + subss %xmm7, %xmm15 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm9 + + subps %xmm0, %xmm8 + subps %xmm4, %xmm9 +#endif + +#ifdef LN + movaps 60 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm15 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm12 + + movaps 56 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm1 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm12 + + movaps 48 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm1 + + movaps 44 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm12 + + movaps 40 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm1 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm12 + + movaps 32 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm1 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm1 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm15 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm15 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm11 + + movaps 20 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm15 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm11 + + movaps 28 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm15 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm12 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm15 + + movaps 44 * SIZE(AO), %xmm7 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm15 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm15 + + movaps 60 * SIZE(AO), %xmm7 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm8, %xmm15 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 +#endif + +#ifdef LN + subq $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm5, 1 * SIZE(B) + movss %xmm10, 2 * SIZE(B) + movss %xmm11, 3 * SIZE(B) + movss %xmm12, 4 * SIZE(B) + movss %xmm13, 5 * SIZE(B) + movss %xmm14, 6 * SIZE(B) + movss %xmm15, 7 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + movaps %xmm2, 0 * SIZE(BO) + pshufd $0x00, %xmm5, %xmm2 + movaps %xmm2, 4 * SIZE(BO) + pshufd $0x00, %xmm10, %xmm2 + movaps %xmm2, 8 * SIZE(BO) + pshufd $0x00, %xmm11, %xmm2 + movaps %xmm2, 12 * SIZE(BO) + + pshufd $0x00, %xmm12, %xmm2 + movaps %xmm2, 16 * SIZE(BO) + pshufd $0x00, %xmm13, %xmm2 + movaps %xmm2, 20 * SIZE(BO) + pshufd $0x00, %xmm14, %xmm2 + movaps %xmm2, 24 * SIZE(BO) + pshufd $0x00, %xmm15, %xmm2 + movaps %xmm2, 28 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm9, 4 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + unpcklps %xmm5, %xmm1 + + unpcklps %xmm14, %xmm12 + unpcklps %xmm15, %xmm13 + unpcklps %xmm13, %xmm12 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm9, 4 * SIZE(CO1) + movhps %xmm9, 6 * SIZE(CO1) +#endif + +#ifndef LN + addq $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $8, KK + movq BORIG, B +#endif + +#ifdef LT + addq $8, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $3 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L111 + ALIGN_4 + +.L149: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq %rbx, %rsp + EMMS + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LT_2x8_nehalem.S b/kernel/x86_64/trsm_kernel_LT_2x8_nehalem.S new file mode 100644 index 0000000..b04299a --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LT_2x8_nehalem.S @@ -0,0 +1,3077 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCHSIZE (8 * 1 - 4) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + + leaq (, LDC, SIZE), LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $3, J + NOBRANCH + jle .L30 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 8), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 4), CO2 +#ifndef RT + leaq (C, LDC, 8), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 3, %rax + leaq (B, %rax), BB + + movq M, I + sarq $1, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + prefetcht0 -16 * SIZE(BB) + subq $-8 * SIZE, BB + + xorps %xmm1, %xmm1 + movapd -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + leaq (LDC, LDC, 2), %rax + + xorps %xmm8, %xmm8 + prefetcht0 1 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht0 1 * SIZE(CO1, LDC, 2) + xorps %xmm11, %xmm11 + prefetcht0 2 * SIZE(CO1, %rax, 1) + + xorps %xmm12, %xmm12 + prefetcht0 1 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht0 2 * SIZE(CO2, LDC, 1) + xorps %xmm14, %xmm14 + prefetcht0 1 * SIZE(CO2, LDC, 2) + xorps %xmm15, %xmm15 + prefetcht0 2 * SIZE(CO2, %rax, 1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm8 + movaps -12 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + movaps -14 * SIZE(AO), %xmm5 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm14 + movaps -6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm12 + movaps 0 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps 2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm8 + movaps 4 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps 6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm12 + movaps 8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + movaps -10 * SIZE(AO), %xmm5 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm14 + movaps 10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm8 + movaps 12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm10 + movaps 14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addq $32 * SIZE, BO + subq $-8 * SIZE, AO + decq %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addpd %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 8), BO +#endif + + addpd %xmm1, %xmm12 + addpd %xmm2, %xmm13 + addpd %xmm3, %xmm14 + addpd %xmm4, %xmm15 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm0 + shufpd $0, %xmm9, %xmm8 + shufpd $3, %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + shufpd $0, %xmm11, %xmm10 + shufpd $3, %xmm0, %xmm11 + + movaps %xmm12, %xmm0 + shufpd $0, %xmm13, %xmm12 + shufpd $3, %xmm0, %xmm13 + + movaps %xmm14, %xmm0 + shufpd $0, %xmm15, %xmm14 + shufpd $3, %xmm0, %xmm15 + + movapd -16 * SIZE(BO), %xmm0 + movapd -14 * SIZE(BO), %xmm2 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm6 + movapd -8 * SIZE(BO), %xmm1 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm5 + movapd -2 * SIZE(BO), %xmm7 +#else + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + shufpd $2, %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + shufpd $2, %xmm11, %xmm10 + shufpd $2, %xmm0, %xmm11 + + movaps %xmm12, %xmm0 + shufpd $2, %xmm13, %xmm12 + shufpd $2, %xmm0, %xmm13 + + movaps %xmm14, %xmm0 + shufpd $2, %xmm15, %xmm14 + shufpd $2, %xmm0, %xmm15 + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + movapd -8 * SIZE(AO), %xmm4 + movapd -6 * SIZE(AO), %xmm5 + movapd -4 * SIZE(AO), %xmm6 + movapd -2 * SIZE(AO), %xmm7 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + subpd %xmm10, %xmm2 + subpd %xmm11, %xmm3 + subpd %xmm12, %xmm4 + subpd %xmm13, %xmm5 + subpd %xmm14, %xmm6 + subpd %xmm15, %xmm7 + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 + + movddup -14 * SIZE(AO), %xmm12 + movapd %xmm12, %xmm13 + movapd %xmm12, %xmm14 + movapd %xmm12, %xmm15 + + mulpd %xmm1, %xmm12 + mulpd %xmm3, %xmm13 + mulpd %xmm5, %xmm14 + mulpd %xmm7, %xmm15 + + subpd %xmm12, %xmm0 + subpd %xmm13, %xmm2 + subpd %xmm14, %xmm4 + subpd %xmm15, %xmm6 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm6 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm6 + + movddup -15 * SIZE(AO), %xmm12 + movapd %xmm12, %xmm13 + movapd %xmm12, %xmm14 + movapd %xmm12, %xmm15 + + mulpd %xmm0, %xmm12 + mulpd %xmm2, %xmm13 + mulpd %xmm4, %xmm14 + mulpd %xmm6, %xmm15 + + subpd %xmm12, %xmm1 + subpd %xmm13, %xmm3 + subpd %xmm14, %xmm5 + subpd %xmm15, %xmm7 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm1 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm2 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm3 + movddup -12 * SIZE(BO), %xmm12 + mulpd %xmm0, %xmm12 + subpd %xmm12, %xmm4 + movddup -11 * SIZE(BO), %xmm13 + mulpd %xmm0, %xmm13 + subpd %xmm13, %xmm5 + movddup -10 * SIZE(BO), %xmm14 + mulpd %xmm0, %xmm14 + subpd %xmm14, %xmm6 + movddup -9 * SIZE(BO), %xmm15 + mulpd %xmm0, %xmm15 + subpd %xmm15, %xmm7 + + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm1 + movddup -6 * SIZE(BO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm2 + movddup -5 * SIZE(BO), %xmm11 + mulpd %xmm1, %xmm11 + subpd %xmm11, %xmm3 + movddup -4 * SIZE(BO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm4 + movddup -3 * SIZE(BO), %xmm13 + mulpd %xmm1, %xmm13 + subpd %xmm13, %xmm5 + movddup -2 * SIZE(BO), %xmm14 + mulpd %xmm1, %xmm14 + subpd %xmm14, %xmm6 + movddup -1 * SIZE(BO), %xmm15 + mulpd %xmm1, %xmm15 + subpd %xmm15, %xmm7 + + movddup 2 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm2 + movddup 3 * SIZE(BO), %xmm11 + mulpd %xmm2, %xmm11 + subpd %xmm11, %xmm3 + movddup 4 * SIZE(BO), %xmm12 + mulpd %xmm2, %xmm12 + subpd %xmm12, %xmm4 + movddup 5 * SIZE(BO), %xmm13 + mulpd %xmm2, %xmm13 + subpd %xmm13, %xmm5 + movddup 6 * SIZE(BO), %xmm14 + mulpd %xmm2, %xmm14 + subpd %xmm14, %xmm6 + movddup 7 * SIZE(BO), %xmm15 + mulpd %xmm2, %xmm15 + subpd %xmm15, %xmm7 + + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm12 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm4 + movddup 13 * SIZE(BO), %xmm13 + mulpd %xmm3, %xmm13 + subpd %xmm13, %xmm5 + movddup 14 * SIZE(BO), %xmm14 + mulpd %xmm3, %xmm14 + subpd %xmm14, %xmm6 + movddup 15 * SIZE(BO), %xmm15 + mulpd %xmm3, %xmm15 + subpd %xmm15, %xmm7 + + movddup 20 * SIZE(BO), %xmm12 + mulpd %xmm12, %xmm4 + movddup 21 * SIZE(BO), %xmm13 + mulpd %xmm4, %xmm13 + subpd %xmm13, %xmm5 + movddup 22 * SIZE(BO), %xmm14 + mulpd %xmm4, %xmm14 + subpd %xmm14, %xmm6 + movddup 23 * SIZE(BO), %xmm15 + mulpd %xmm4, %xmm15 + subpd %xmm15, %xmm7 + + movddup 29 * SIZE(BO), %xmm13 + mulpd %xmm13, %xmm5 + movddup 30 * SIZE(BO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm6 + movddup 31 * SIZE(BO), %xmm15 + mulpd %xmm5, %xmm15 + subpd %xmm15, %xmm7 + + movddup 38 * SIZE(BO), %xmm14 + mulpd %xmm14, %xmm6 + movddup 39 * SIZE(BO), %xmm15 + mulpd %xmm6, %xmm15 + subpd %xmm15, %xmm7 + + movddup 47 * SIZE(BO), %xmm15 + mulpd %xmm15, %xmm7 +#endif + +#ifdef RT + movddup 47 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm7 + movddup 46 * SIZE(BO), %xmm9 + mulpd %xmm7, %xmm9 + subpd %xmm9, %xmm6 + movddup 45 * SIZE(BO), %xmm10 + mulpd %xmm7, %xmm10 + subpd %xmm10, %xmm5 + movddup 44 * SIZE(BO), %xmm11 + mulpd %xmm7, %xmm11 + subpd %xmm11, %xmm4 + movddup 43 * SIZE(BO), %xmm12 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + movddup 42 * SIZE(BO), %xmm13 + mulpd %xmm7, %xmm13 + subpd %xmm13, %xmm2 + movddup 41 * SIZE(BO), %xmm14 + mulpd %xmm7, %xmm14 + subpd %xmm14, %xmm1 + movddup 40 * SIZE(BO), %xmm15 + mulpd %xmm7, %xmm15 + subpd %xmm15, %xmm0 + + movddup 38 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm6 + movddup 37 * SIZE(BO), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm5 + movddup 36 * SIZE(BO), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm4 + movddup 35 * SIZE(BO), %xmm12 + mulpd %xmm6, %xmm12 + subpd %xmm12, %xmm3 + movddup 34 * SIZE(BO), %xmm13 + mulpd %xmm6, %xmm13 + subpd %xmm13, %xmm2 + movddup 33 * SIZE(BO), %xmm14 + mulpd %xmm6, %xmm14 + subpd %xmm14, %xmm1 + movddup 32 * SIZE(BO), %xmm15 + mulpd %xmm6, %xmm15 + subpd %xmm15, %xmm0 + + movddup 29 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm5 + movddup 28 * SIZE(BO), %xmm11 + mulpd %xmm5, %xmm11 + subpd %xmm11, %xmm4 + movddup 27 * SIZE(BO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm3 + movddup 26 * SIZE(BO), %xmm13 + mulpd %xmm5, %xmm13 + subpd %xmm13, %xmm2 + movddup 25 * SIZE(BO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm1 + movddup 24 * SIZE(BO), %xmm15 + mulpd %xmm5, %xmm15 + subpd %xmm15, %xmm0 + + movddup 20 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm4 + movddup 19 * SIZE(BO), %xmm12 + mulpd %xmm4, %xmm12 + subpd %xmm12, %xmm3 + movddup 18 * SIZE(BO), %xmm13 + mulpd %xmm4, %xmm13 + subpd %xmm13, %xmm2 + movddup 17 * SIZE(BO), %xmm14 + mulpd %xmm4, %xmm14 + subpd %xmm14, %xmm1 + movddup 16 * SIZE(BO), %xmm15 + mulpd %xmm4, %xmm15 + subpd %xmm15, %xmm0 + + movddup 11 * SIZE(BO), %xmm12 + mulpd %xmm12, %xmm3 + movddup 10 * SIZE(BO), %xmm13 + mulpd %xmm3, %xmm13 + subpd %xmm13, %xmm2 + movddup 9 * SIZE(BO), %xmm14 + mulpd %xmm3, %xmm14 + subpd %xmm14, %xmm1 + movddup 8 * SIZE(BO), %xmm15 + mulpd %xmm3, %xmm15 + subpd %xmm15, %xmm0 + + movddup 2 * SIZE(BO), %xmm13 + mulpd %xmm13, %xmm2 + movddup 1 * SIZE(BO), %xmm14 + mulpd %xmm2, %xmm14 + subpd %xmm14, %xmm1 + movddup 0 * SIZE(BO), %xmm15 + mulpd %xmm2, %xmm15 + subpd %xmm15, %xmm0 + + movddup -7 * SIZE(BO), %xmm14 + mulpd %xmm14, %xmm1 + movddup -8 * SIZE(BO), %xmm15 + mulpd %xmm1, %xmm15 + subpd %xmm15, %xmm0 + + movddup -16 * SIZE(BO), %xmm15 + mulpd %xmm15, %xmm0 +#endif + + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BO) + movapd %xmm2, -14 * SIZE(BO) + movapd %xmm4, -12 * SIZE(BO) + movapd %xmm6, -10 * SIZE(BO) + movapd %xmm1, -8 * SIZE(BO) + movapd %xmm3, -6 * SIZE(BO) + movapd %xmm5, -4 * SIZE(BO) + movapd %xmm7, -2 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) + movapd %xmm4, -8 * SIZE(AO) + movapd %xmm5 , -6 * SIZE(AO) + movapd %xmm6, -4 * SIZE(AO) + movapd %xmm7, -2 * SIZE(AO) +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 1 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movsd %xmm3, 1 * SIZE(CO1, LDC, 2) + movhps %xmm2, 0 * SIZE(CO1, %rax, 1) + movhps %xmm3, 1 * SIZE(CO1, %rax, 1) + + movsd %xmm4, 0 * SIZE(CO2) + movsd %xmm5, 1 * SIZE(CO2) + movhps %xmm4, 0 * SIZE(CO2, LDC, 1) + movhps %xmm5, 1 * SIZE(CO2, LDC, 1) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movsd %xmm7, 1 * SIZE(CO2, LDC, 2) + movhps %xmm6, 0 * SIZE(CO2, %rax, 1) + movhps %xmm7, 1 * SIZE(CO2, %rax, 1) +#else + movups %xmm0, 0 * SIZE(CO1) + movups %xmm1, 0 * SIZE(CO1, LDC, 1) + movups %xmm2, 0 * SIZE(CO1, LDC, 2) + movups %xmm3, 0 * SIZE(CO1, %rax, 1) + movups %xmm4, 0 * SIZE(CO2) + movups %xmm5, 0 * SIZE(CO2, LDC, 1) + movups %xmm6, 0 * SIZE(CO2, LDC, 2) + movups %xmm7, 0 * SIZE(CO2, %rax, 1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $1, M + BRANCH + jle .L29 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 0 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps 2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps 4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps 6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps 10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps 12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps 14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 16 * SIZE(BO), %xmm1 + + subq $ -4 * SIZE, AO + subq $-32 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm0 + movapd -14 * SIZE(BO), %xmm1 + movapd -12 * SIZE(BO), %xmm2 + movapd -10 * SIZE(BO), %xmm3 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + subpd %xmm10, %xmm2 + subpd %xmm11, %xmm3 + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 +#endif + +#if defined(RN) || defined(RT) + pshufd $0xe, %xmm3, %xmm7 + movaps %xmm3, %xmm6 + pshufd $0xe, %xmm2, %xmm5 + movaps %xmm2, %xmm4 + pshufd $0xe, %xmm1, %xmm3 + movaps %xmm1, %xmm2 + pshufd $0xe, %xmm0, %xmm1 +#endif + +#ifdef RN + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + movsd -15 * SIZE(BO), %xmm9 + mulsd %xmm0, %xmm9 + subsd %xmm9, %xmm1 + movsd -14 * SIZE(BO), %xmm10 + mulsd %xmm0, %xmm10 + subsd %xmm10, %xmm2 + movsd -13 * SIZE(BO), %xmm11 + mulsd %xmm0, %xmm11 + subsd %xmm11, %xmm3 + movsd -12 * SIZE(BO), %xmm12 + mulsd %xmm0, %xmm12 + subsd %xmm12, %xmm4 + movsd -11 * SIZE(BO), %xmm13 + mulsd %xmm0, %xmm13 + subsd %xmm13, %xmm5 + movsd -10 * SIZE(BO), %xmm14 + mulsd %xmm0, %xmm14 + subsd %xmm14, %xmm6 + movsd -9 * SIZE(BO), %xmm15 + mulsd %xmm0, %xmm15 + subsd %xmm15, %xmm7 + + movsd -7 * SIZE(BO), %xmm9 + mulsd %xmm9, %xmm1 + movsd -6 * SIZE(BO), %xmm10 + mulsd %xmm1, %xmm10 + subsd %xmm10, %xmm2 + movsd -5 * SIZE(BO), %xmm11 + mulsd %xmm1, %xmm11 + subsd %xmm11, %xmm3 + movsd -4 * SIZE(BO), %xmm12 + mulsd %xmm1, %xmm12 + subsd %xmm12, %xmm4 + movsd -3 * SIZE(BO), %xmm13 + mulsd %xmm1, %xmm13 + subsd %xmm13, %xmm5 + movsd -2 * SIZE(BO), %xmm14 + mulsd %xmm1, %xmm14 + subsd %xmm14, %xmm6 + movsd -1 * SIZE(BO), %xmm15 + mulsd %xmm1, %xmm15 + subsd %xmm15, %xmm7 + + movsd 2 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm2 + movsd 3 * SIZE(BO), %xmm11 + mulsd %xmm2, %xmm11 + subsd %xmm11, %xmm3 + movsd 4 * SIZE(BO), %xmm12 + mulsd %xmm2, %xmm12 + subsd %xmm12, %xmm4 + movsd 5 * SIZE(BO), %xmm13 + mulsd %xmm2, %xmm13 + subsd %xmm13, %xmm5 + movsd 6 * SIZE(BO), %xmm14 + mulsd %xmm2, %xmm14 + subsd %xmm14, %xmm6 + movsd 7 * SIZE(BO), %xmm15 + mulsd %xmm2, %xmm15 + subsd %xmm15, %xmm7 + + movsd 11 * SIZE(BO), %xmm11 + mulsd %xmm11, %xmm3 + movsd 12 * SIZE(BO), %xmm12 + mulsd %xmm3, %xmm12 + subsd %xmm12, %xmm4 + movsd 13 * SIZE(BO), %xmm13 + mulsd %xmm3, %xmm13 + subsd %xmm13, %xmm5 + movsd 14 * SIZE(BO), %xmm14 + mulsd %xmm3, %xmm14 + subsd %xmm14, %xmm6 + movsd 15 * SIZE(BO), %xmm15 + mulsd %xmm3, %xmm15 + subsd %xmm15, %xmm7 + + movsd 20 * SIZE(BO), %xmm12 + mulsd %xmm12, %xmm4 + movsd 21 * SIZE(BO), %xmm13 + mulsd %xmm4, %xmm13 + subsd %xmm13, %xmm5 + movsd 22 * SIZE(BO), %xmm14 + mulsd %xmm4, %xmm14 + subsd %xmm14, %xmm6 + movsd 23 * SIZE(BO), %xmm15 + mulsd %xmm4, %xmm15 + subsd %xmm15, %xmm7 + + movsd 29 * SIZE(BO), %xmm13 + mulsd %xmm13, %xmm5 + movsd 30 * SIZE(BO), %xmm14 + mulsd %xmm5, %xmm14 + subsd %xmm14, %xmm6 + movsd 31 * SIZE(BO), %xmm15 + mulsd %xmm5, %xmm15 + subsd %xmm15, %xmm7 + + movsd 38 * SIZE(BO), %xmm14 + mulsd %xmm14, %xmm6 + movsd 39 * SIZE(BO), %xmm15 + mulsd %xmm6, %xmm15 + subsd %xmm15, %xmm7 + + movsd 47 * SIZE(BO), %xmm15 + mulsd %xmm15, %xmm7 +#endif + +#ifdef RT + movsd 47 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm7 + movsd 46 * SIZE(BO), %xmm9 + mulsd %xmm7, %xmm9 + subsd %xmm9, %xmm6 + movsd 45 * SIZE(BO), %xmm10 + mulsd %xmm7, %xmm10 + subsd %xmm10, %xmm5 + movsd 44 * SIZE(BO), %xmm11 + mulsd %xmm7, %xmm11 + subsd %xmm11, %xmm4 + movsd 43 * SIZE(BO), %xmm12 + mulsd %xmm7, %xmm12 + subsd %xmm12, %xmm3 + movsd 42 * SIZE(BO), %xmm13 + mulsd %xmm7, %xmm13 + subsd %xmm13, %xmm2 + movsd 41 * SIZE(BO), %xmm14 + mulsd %xmm7, %xmm14 + subsd %xmm14, %xmm1 + movsd 40 * SIZE(BO), %xmm15 + mulsd %xmm7, %xmm15 + subsd %xmm15, %xmm0 + + movsd 38 * SIZE(BO), %xmm9 + mulsd %xmm9, %xmm6 + movsd 37 * SIZE(BO), %xmm10 + mulsd %xmm6, %xmm10 + subsd %xmm10, %xmm5 + movsd 36 * SIZE(BO), %xmm11 + mulsd %xmm6, %xmm11 + subsd %xmm11, %xmm4 + movsd 35 * SIZE(BO), %xmm12 + mulsd %xmm6, %xmm12 + subsd %xmm12, %xmm3 + movsd 34 * SIZE(BO), %xmm13 + mulsd %xmm6, %xmm13 + subsd %xmm13, %xmm2 + movsd 33 * SIZE(BO), %xmm14 + mulsd %xmm6, %xmm14 + subsd %xmm14, %xmm1 + movsd 32 * SIZE(BO), %xmm15 + mulsd %xmm6, %xmm15 + subsd %xmm15, %xmm0 + + movsd 29 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm5 + movsd 28 * SIZE(BO), %xmm11 + mulsd %xmm5, %xmm11 + subsd %xmm11, %xmm4 + movsd 27 * SIZE(BO), %xmm12 + mulsd %xmm5, %xmm12 + subsd %xmm12, %xmm3 + movsd 26 * SIZE(BO), %xmm13 + mulsd %xmm5, %xmm13 + subsd %xmm13, %xmm2 + movsd 25 * SIZE(BO), %xmm14 + mulsd %xmm5, %xmm14 + subsd %xmm14, %xmm1 + movsd 24 * SIZE(BO), %xmm15 + mulsd %xmm5, %xmm15 + subsd %xmm15, %xmm0 + + movsd 20 * SIZE(BO), %xmm11 + mulsd %xmm11, %xmm4 + movsd 19 * SIZE(BO), %xmm12 + mulsd %xmm4, %xmm12 + subsd %xmm12, %xmm3 + movsd 18 * SIZE(BO), %xmm13 + mulsd %xmm4, %xmm13 + subsd %xmm13, %xmm2 + movsd 17 * SIZE(BO), %xmm14 + mulsd %xmm4, %xmm14 + subsd %xmm14, %xmm1 + movsd 16 * SIZE(BO), %xmm15 + mulsd %xmm4, %xmm15 + subsd %xmm15, %xmm0 + + movsd 11 * SIZE(BO), %xmm12 + mulsd %xmm12, %xmm3 + movsd 10 * SIZE(BO), %xmm13 + mulsd %xmm3, %xmm13 + subsd %xmm13, %xmm2 + movsd 9 * SIZE(BO), %xmm14 + mulsd %xmm3, %xmm14 + subsd %xmm14, %xmm1 + movsd 8 * SIZE(BO), %xmm15 + mulsd %xmm3, %xmm15 + subsd %xmm15, %xmm0 + + movsd 2 * SIZE(BO), %xmm13 + mulsd %xmm13, %xmm2 + movsd 1 * SIZE(BO), %xmm14 + mulsd %xmm2, %xmm14 + subsd %xmm14, %xmm1 + movsd 0 * SIZE(BO), %xmm15 + mulsd %xmm2, %xmm15 + subsd %xmm15, %xmm0 + + movsd -7 * SIZE(BO), %xmm14 + mulsd %xmm14, %xmm1 + movsd -8 * SIZE(BO), %xmm15 + mulsd %xmm1, %xmm15 + subsd %xmm15, %xmm0 + + movsd -16 * SIZE(BO), %xmm15 + mulsd %xmm15, %xmm0 +#endif + +#if defined(RN) || defined(RT) + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm1 + unpcklpd %xmm3, %xmm1 + movaps %xmm4, %xmm2 + unpcklpd %xmm5, %xmm2 + movaps %xmm6, %xmm3 + unpcklpd %xmm7, %xmm3 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO1, LDC, 1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 2) + movhps %xmm1, 0 * SIZE(CO1, %rax, 1) + movsd %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 0 * SIZE(CO2, LDC, 1) + movsd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhps %xmm3, 0 * SIZE(CO2, %rax, 1) + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BO) + movapd %xmm1, -14 * SIZE(BO) + movapd %xmm2, -12 * SIZE(BO) + movapd %xmm3, -10 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 8), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $8, KK +#endif + +#ifdef RT + subq $8, KK +#endif + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L30: + testq $4, N + jle .L50 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $1, I + NOBRANCH + jle .L40 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht0 2 * SIZE(CO2) + xorps %xmm11, %xmm11 + prefetcht0 2 * SIZE(CO2, LDC, 1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm1, %xmm8 + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm10 + addpd %xmm4, %xmm11 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm0 + shufpd $0, %xmm9, %xmm8 + shufpd $3, %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + shufpd $0, %xmm11, %xmm10 + shufpd $3, %xmm0, %xmm11 + + movapd -16 * SIZE(BO), %xmm0 + movapd -14 * SIZE(BO), %xmm2 + movapd -12 * SIZE(BO), %xmm1 + movapd -10 * SIZE(BO), %xmm3 +#else + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + shufpd $2, %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + shufpd $2, %xmm11, %xmm10 + shufpd $2, %xmm0, %xmm11 + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + subpd %xmm10, %xmm2 + subpd %xmm11, %xmm3 + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -14 * SIZE(AO), %xmm12 + movapd %xmm12, %xmm13 + + mulpd %xmm1, %xmm12 + mulpd %xmm3, %xmm13 + + subpd %xmm12, %xmm0 + subpd %xmm13, %xmm2 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm2 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm2 + + movddup -15 * SIZE(AO), %xmm12 + movapd %xmm12, %xmm13 + + mulpd %xmm0, %xmm12 + mulpd %xmm2, %xmm13 + + subpd %xmm12, %xmm1 + subpd %xmm13, %xmm3 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm1 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm2 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm3 + + movddup -11 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm1 + movddup -10 * SIZE(BO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm2 + movddup -9 * SIZE(BO), %xmm11 + mulpd %xmm1, %xmm11 + subpd %xmm11, %xmm3 + + movddup -6 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm2 + movddup -5 * SIZE(BO), %xmm11 + mulpd %xmm2, %xmm11 + subpd %xmm11, %xmm3 + + movddup -1 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm3 +#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm12 + mulpd %xmm12, %xmm3 + movddup -2 * SIZE(BO), %xmm13 + mulpd %xmm3, %xmm13 + subpd %xmm13, %xmm2 + movddup -3 * SIZE(BO), %xmm14 + mulpd %xmm3, %xmm14 + subpd %xmm14, %xmm1 + movddup -4 * SIZE(BO), %xmm15 + mulpd %xmm3, %xmm15 + subpd %xmm15, %xmm0 + + movddup -6 * SIZE(BO), %xmm13 + mulpd %xmm13, %xmm2 + movddup -7 * SIZE(BO), %xmm14 + mulpd %xmm2, %xmm14 + subpd %xmm14, %xmm1 + movddup -8 * SIZE(BO), %xmm15 + mulpd %xmm2, %xmm15 + subpd %xmm15, %xmm0 + + movddup -11 * SIZE(BO), %xmm14 + mulpd %xmm14, %xmm1 + movddup -12 * SIZE(BO), %xmm15 + mulpd %xmm1, %xmm15 + subpd %xmm15, %xmm0 + + movddup -16 * SIZE(BO), %xmm15 + mulpd %xmm15, %xmm0 +#endif + + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 1 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO2) + movsd %xmm3, 1 * SIZE(CO2) + movhps %xmm2, 0 * SIZE(CO2, LDC, 1) + movhps %xmm3, 1 * SIZE(CO2, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 1 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 0 * SIZE(CO2, LDC, 1) + movhps %xmm3, 1 * SIZE(CO2, LDC, 1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BO) + movapd %xmm2, -14 * SIZE(BO) + movapd %xmm1, -12 * SIZE(BO) + movapd %xmm3, -10 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L31 + ALIGN_4 + +.L40: + testq $1, M + BRANCH + jle .L49 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -4 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 0 * SIZE(BO), %xmm1 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_4 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm0 + movapd -14 * SIZE(BO), %xmm1 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#if defined(RN) || defined(RT) + pshufd $0xe, %xmm1, %xmm3 + movaps %xmm1, %xmm2 + pshufd $0xe, %xmm0, %xmm1 +#endif + +#ifdef RN + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + movsd -15 * SIZE(BO), %xmm9 + mulsd %xmm0, %xmm9 + subsd %xmm9, %xmm1 + movsd -14 * SIZE(BO), %xmm10 + mulsd %xmm0, %xmm10 + subsd %xmm10, %xmm2 + movsd -13 * SIZE(BO), %xmm11 + mulsd %xmm0, %xmm11 + subsd %xmm11, %xmm3 + + movsd -11 * SIZE(BO), %xmm9 + mulsd %xmm9, %xmm1 + movsd -10 * SIZE(BO), %xmm10 + mulsd %xmm1, %xmm10 + subsd %xmm10, %xmm2 + movsd -9 * SIZE(BO), %xmm11 + mulsd %xmm1, %xmm11 + subsd %xmm11, %xmm3 + + movsd -6 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm2 + movsd -5 * SIZE(BO), %xmm11 + mulsd %xmm2, %xmm11 + subsd %xmm11, %xmm3 + + movsd -1 * SIZE(BO), %xmm11 + mulsd %xmm11, %xmm3 +#endif + +#ifdef RT + movsd -1 * SIZE(BO), %xmm12 + mulsd %xmm12, %xmm3 + movsd -2 * SIZE(BO), %xmm13 + mulsd %xmm3, %xmm13 + subsd %xmm13, %xmm2 + movsd -3 * SIZE(BO), %xmm14 + mulsd %xmm3, %xmm14 + subsd %xmm14, %xmm1 + movsd -4 * SIZE(BO), %xmm15 + mulsd %xmm3, %xmm15 + subsd %xmm15, %xmm0 + + movsd -6 * SIZE(BO), %xmm13 + mulsd %xmm13, %xmm2 + movsd -7 * SIZE(BO), %xmm14 + mulsd %xmm2, %xmm14 + subsd %xmm14, %xmm1 + movsd -8 * SIZE(BO), %xmm15 + mulsd %xmm2, %xmm15 + subsd %xmm15, %xmm0 + + movsd -11 * SIZE(BO), %xmm14 + mulsd %xmm14, %xmm1 + movsd -12 * SIZE(BO), %xmm15 + mulsd %xmm1, %xmm15 + subsd %xmm15, %xmm0 + + movsd -16 * SIZE(BO), %xmm15 + mulsd %xmm15, %xmm0 +#endif + +#if defined(RN) || defined(RT) + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm1 + unpcklpd %xmm3, %xmm1 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO1, LDC, 1) + movsd %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 0 * SIZE(CO2, LDC, 1) + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BO) + movapd %xmm1, -14 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L49: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + ALIGN_4 + +.L50: + testq $2, N + jle .L70 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $1, I + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO2) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm10 + movaps -14 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L52 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + ALIGN_3 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm1, %xmm8 + addpd %xmm2, %xmm9 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm0 + shufpd $0, %xmm9, %xmm8 + shufpd $3, %xmm0, %xmm9 + + movapd -16 * SIZE(BO), %xmm0 + movapd -14 * SIZE(BO), %xmm1 +#else + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + shufpd $2, %xmm0, %xmm9 + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm0 + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + movddup -15 * SIZE(AO), %xmm12 + mulpd %xmm0, %xmm12 + subpd %xmm12, %xmm1 + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm0 + movddup -15 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm1 + + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm1 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm14 + mulpd %xmm14, %xmm1 + movddup -14 * SIZE(BO), %xmm15 + mulpd %xmm1, %xmm15 + subpd %xmm15, %xmm0 + + movddup -16 * SIZE(BO), %xmm15 + mulpd %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO2) + movhps %xmm1, 1 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BO) + movapd %xmm1, -14 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L69 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -10 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -8 * SIZE(BO), %xmm1 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L62 + ALIGN_3 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm0 +#else + movapd -16 * SIZE(AO), %xmm0 +#endif + + subpd %xmm8, %xmm0 + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#if defined(RN) || defined(RT) + pshufd $0xe, %xmm0, %xmm1 +#endif + +#ifdef RN + movsd -16 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm0 + movsd -15 * SIZE(BO), %xmm11 + mulsd %xmm0, %xmm11 + subsd %xmm11, %xmm1 + + movsd -13 * SIZE(BO), %xmm11 + mulsd %xmm11, %xmm1 +#endif + +#ifdef RT + movsd -13 * SIZE(BO), %xmm14 + mulsd %xmm14, %xmm1 + movsd -14 * SIZE(BO), %xmm15 + mulsd %xmm1, %xmm15 + subsd %xmm15, %xmm0 + + movsd -16 * SIZE(BO), %xmm15 + mulsd %xmm15, %xmm0 +#endif + +#if defined(RN) || defined(RT) + unpcklpd %xmm1, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L69: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L70: + testq $1, N + jle .L999 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $1, I + NOBRANCH + jle .L80 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_3 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movddup -16 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm9 + movddup -15 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm9 + movddup -13 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L72 + + addpd %xmm9, %xmm8 + ALIGN_3 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + addpd %xmm1, %xmm8 + movddup -16 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_4 + +.L78: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + addpd %xmm1, %xmm8 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm0 +#else + movapd -16 * SIZE(AO), %xmm0 +#endif + + subpd %xmm8, %xmm0 + +#if defined(LN) || defined(LT) + pshufd $0xe, %xmm0, %xmm1 +#endif + +#ifdef LN + movsd -13 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm1 + movsd -14 * SIZE(AO), %xmm12 + mulsd %xmm1, %xmm12 + subsd %xmm12, %xmm0 + movsd -16 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + movsd -15 * SIZE(AO), %xmm12 + mulsd %xmm0, %xmm12 + subsd %xmm12, %xmm1 + movsd -13 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm1 +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm1, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movddup -16 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L71 + ALIGN_4 + +.L80: + testq $1, M + BRANCH + jle .L89 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movhps -15 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movsd -16 * SIZE(BO), %xmm1 + movhps -15 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L85 + ALIGN_3 + +.L82: + mulpd %xmm0, %xmm1 + movsd -14 * SIZE(AO), %xmm0 + movhps -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movsd -14 * SIZE(BO), %xmm1 + movhps -13 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movsd -12 * SIZE(AO), %xmm0 + movhps -11 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movsd -12 * SIZE(BO), %xmm1 + movhps -11 * SIZE(BO), %xmm1 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L82 + + addpd %xmm9, %xmm8 + ALIGN_3 + +.L85: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_3 + +.L86: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm8 + movsd -15 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L86 + ALIGN_4 + +.L88: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + haddpd %xmm8, %xmm8 + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BO), %xmm0 +#else + movsd -16 * SIZE(AO), %xmm0 +#endif + + subsd %xmm8, %xmm0 + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movsd -16 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(BO) +#else + movsd %xmm0, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L89: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LT_4x2_atom.S b/kernel/x86_64/trsm_kernel_LT_4x2_atom.S new file mode 100644 index 0000000..c6ad0a2 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LT_4x2_atom.S @@ -0,0 +1,2116 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbx +#define KK %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 8 + 3) + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + movq KK, OFFSET + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $1, J + jle .L40 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + prefetcht0 0 * SIZE(BB) + subq $-8 * SIZE, BB + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + prefetcht0 3 * SIZE(CO1) + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + prefetcht0 3 * SIZE(CO2) + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + addsd %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO) + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 12 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 13 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 6 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 14 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 15 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + subq $-16 * SIZE, AO + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + addq $ 8 * SIZE, BO + + addsd %xmm4, %xmm10 + movsd 1 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + decq %rax + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 0 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 2 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 1 * SIZE(BO), %xmm3 + + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + je .L19 + ALIGN_4 + +.L16: + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L16 + ALIGN_4 + +.L19: + addsd %xmm2, %xmm13 + addsd %xmm7, %xmm14 + addsd %xmm6, %xmm15 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm1 + movsd 2 * SIZE(BO), %xmm2 + movsd 3 * SIZE(BO), %xmm3 + movsd 4 * SIZE(BO), %xmm4 + movsd 5 * SIZE(BO), %xmm5 + movsd 6 * SIZE(BO), %xmm6 + movsd 7 * SIZE(BO), %xmm7 + + subsd %xmm8, %xmm0 + subsd %xmm9, %xmm1 + subsd %xmm10, %xmm2 + subsd %xmm11, %xmm3 + subsd %xmm12, %xmm4 + subsd %xmm13, %xmm5 + subsd %xmm14, %xmm6 + subsd %xmm15, %xmm7 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm2 + movsd 2 * SIZE(AO), %xmm4 + movsd 3 * SIZE(AO), %xmm6 + + movsd 4 * SIZE(AO), %xmm1 + movsd 5 * SIZE(AO), %xmm3 + movsd 6 * SIZE(AO), %xmm5 + movsd 7 * SIZE(AO), %xmm7 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm12, %xmm4 + subsd %xmm14, %xmm6 + subsd %xmm9, %xmm1 + subsd %xmm11, %xmm3 + subsd %xmm13, %xmm5 + subsd %xmm15, %xmm7 +#endif + +#ifdef LN + movsd 15 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm6 + movsd 14 * SIZE(AO), %xmm9 + mulsd %xmm8, %xmm7 + movsd 13 * SIZE(AO), %xmm11 + + movaps %xmm9, %xmm10 + movsd 12 * SIZE(AO), %xmm13 + mulsd %xmm6, %xmm9 + movsd 10 * SIZE(AO), %xmm8 + mulsd %xmm7, %xmm10 + subsd %xmm9, %xmm4 + movsd 9 * SIZE(AO), %xmm9 + subsd %xmm10, %xmm5 + + movaps %xmm11, %xmm12 + mulsd %xmm6, %xmm11 + mulsd %xmm7, %xmm12 + subsd %xmm11, %xmm2 + movsd 8 * SIZE(AO), %xmm11 + subsd %xmm12, %xmm3 + + movaps %xmm13, %xmm14 + mulsd %xmm6, %xmm13 + mulsd %xmm7, %xmm14 + subsd %xmm13, %xmm0 + subsd %xmm14, %xmm1 + + mulsd %xmm8, %xmm4 + mulsd %xmm8, %xmm5 + movsd 5 * SIZE(AO), %xmm8 + + movaps %xmm9, %xmm10 + mulsd %xmm4, %xmm9 + mulsd %xmm5, %xmm10 + subsd %xmm9, %xmm2 + movsd 4 * SIZE(AO), %xmm9 + subsd %xmm10, %xmm3 + + movaps %xmm11, %xmm12 + mulsd %xmm4, %xmm11 + mulsd %xmm5, %xmm12 + subsd %xmm11, %xmm0 + movsd 0 * SIZE(AO), %xmm11 + subsd %xmm12, %xmm1 + + mulsd %xmm8, %xmm2 + mulsd %xmm8, %xmm3 + + movaps %xmm9, %xmm10 + mulsd %xmm2, %xmm9 + mulsd %xmm3, %xmm10 + subsd %xmm9, %xmm0 + subsd %xmm10, %xmm1 + + mulsd %xmm11, %xmm0 + mulsd %xmm11, %xmm1 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(AO), %xmm9 + mulsd %xmm8, %xmm1 + + movsd 2 * SIZE(AO), %xmm11 + movaps %xmm9, %xmm10 + movsd 3 * SIZE(AO), %xmm13 + mulsd %xmm0, %xmm9 + movsd 5 * SIZE(AO), %xmm8 + mulsd %xmm1, %xmm10 + subsd %xmm9, %xmm2 + movsd 6 * SIZE(AO), %xmm9 + subsd %xmm10, %xmm3 + + movaps %xmm11, %xmm12 + mulsd %xmm0, %xmm11 + mulsd %xmm1, %xmm12 + subsd %xmm11, %xmm4 + movsd 7 * SIZE(AO), %xmm11 + subsd %xmm12, %xmm5 + + movaps %xmm13, %xmm14 + mulsd %xmm0, %xmm13 + mulsd %xmm1, %xmm14 + subsd %xmm13, %xmm6 + subsd %xmm14, %xmm7 + + mulsd %xmm8, %xmm2 + mulsd %xmm8, %xmm3 + movsd 10 * SIZE(AO), %xmm8 + + movaps %xmm9, %xmm10 + mulsd %xmm2, %xmm9 + mulsd %xmm3, %xmm10 + subsd %xmm9, %xmm4 + movsd 11 * SIZE(AO), %xmm9 + subsd %xmm10, %xmm5 + + movaps %xmm11, %xmm12 + mulsd %xmm2, %xmm11 + mulsd %xmm3, %xmm12 + subsd %xmm11, %xmm6 + subsd %xmm12, %xmm7 + + mulsd %xmm8, %xmm4 + mulsd %xmm8, %xmm5 + movsd 15 * SIZE(AO), %xmm8 + + movaps %xmm9, %xmm10 + mulsd %xmm4, %xmm9 + mulsd %xmm5, %xmm10 + subsd %xmm9, %xmm6 + subsd %xmm10, %xmm7 + + mulsd %xmm8, %xmm6 + mulsd %xmm8, %xmm7 +#endif + +#ifdef RN + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm2 + movsd 3 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm4 + mulsd %xmm8, %xmm6 + + movaps %xmm9, %xmm10 + movaps %xmm9, %xmm11 + movaps %xmm9, %xmm12 + + mulsd %xmm0, %xmm9 + mulsd %xmm2, %xmm10 + mulsd %xmm4, %xmm11 + mulsd %xmm6, %xmm12 + + subsd %xmm9, %xmm1 + subsd %xmm10, %xmm3 + subsd %xmm11, %xmm5 + subsd %xmm12, %xmm7 + + mulsd %xmm13, %xmm1 + mulsd %xmm13, %xmm3 + mulsd %xmm13, %xmm5 + mulsd %xmm13, %xmm7 +#endif + +#ifdef RT + movsd 3 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm1 + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm3 + movsd 0 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm5 + mulsd %xmm8, %xmm7 + + movaps %xmm9, %xmm10 + movaps %xmm9, %xmm11 + movaps %xmm9, %xmm12 + + mulsd %xmm1, %xmm9 + mulsd %xmm3, %xmm10 + mulsd %xmm5, %xmm11 + mulsd %xmm7, %xmm12 + + subsd %xmm9, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm11, %xmm4 + subsd %xmm12, %xmm6 + + mulsd %xmm13, %xmm0 + mulsd %xmm13, %xmm2 + mulsd %xmm13, %xmm4 + mulsd %xmm13, %xmm6 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + movsd %xmm4, 2 * SIZE(CO1) + movsd %xmm6, 3 * SIZE(CO1) + + movsd %xmm1, 0 * SIZE(CO2) + movsd %xmm3, 1 * SIZE(CO2) + movsd %xmm5, 2 * SIZE(CO2) + movsd %xmm7, 3 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm1, 1 * SIZE(BO) + movsd %xmm2, 2 * SIZE(BO) + movsd %xmm3, 3 * SIZE(BO) + movsd %xmm4, 4 * SIZE(BO) + movsd %xmm5, 5 * SIZE(BO) + movsd %xmm6, 6 * SIZE(BO) + movsd %xmm7, 7 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm2, 1 * SIZE(AO) + movsd %xmm4, 2 * SIZE(AO) + movsd %xmm6, 3 * SIZE(AO) + movsd %xmm1, 4 * SIZE(AO) + movsd %xmm3, 5 * SIZE(AO) + movsd %xmm5, 6 * SIZE(AO) + movsd %xmm7, 7 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + je .L30 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + movsd 3 * SIZE(AO), %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm8 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + addsd %xmm7, %xmm10 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 6 * SIZE(BO), %xmm1 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + addsd %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 8 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm8 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + addsd %xmm7, %xmm10 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 9 * SIZE(BO), %xmm3 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + je .L29 + ALIGN_4 + +.L26: + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + mulsd %xmm3, %xmm2 + addsd %xmm0, %xmm8 + movsd 2 * SIZE(AO), %xmm0 + + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + addsd %xmm4, %xmm10 + movsd 3 * SIZE(AO), %xmm4 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L26 + ALIGN_4 + +.L29: + addsd %xmm2, %xmm9 + addsd %xmm6, %xmm11 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm1 + movsd 2 * SIZE(BO), %xmm2 + movsd 3 * SIZE(BO), %xmm3 + + subsd %xmm8, %xmm0 + subsd %xmm9, %xmm1 + subsd %xmm10, %xmm2 + subsd %xmm11, %xmm3 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm2 + movsd 2 * SIZE(AO), %xmm1 + movsd 3 * SIZE(AO), %xmm3 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm9, %xmm1 + subsd %xmm11, %xmm3 +#endif + +#ifdef LN + movsd 3 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm2 + movsd 2 * SIZE(AO), %xmm9 + mulsd %xmm8, %xmm3 + movsd 0 * SIZE(AO), %xmm13 + + movaps %xmm9, %xmm10 + mulsd %xmm2, %xmm9 + mulsd %xmm3, %xmm10 + + subsd %xmm9, %xmm0 + subsd %xmm10, %xmm1 + + mulsd %xmm13, %xmm0 + mulsd %xmm13, %xmm1 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(AO), %xmm9 + mulsd %xmm8, %xmm1 + movsd 3 * SIZE(AO), %xmm13 + + movaps %xmm9, %xmm10 + mulsd %xmm0, %xmm9 + mulsd %xmm1, %xmm10 + + subsd %xmm9, %xmm2 + subsd %xmm10, %xmm3 + + mulsd %xmm13, %xmm2 + mulsd %xmm13, %xmm3 +#endif + +#ifdef RN + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm2 + movsd 3 * SIZE(BO), %xmm13 + + movaps %xmm9, %xmm10 + mulsd %xmm0, %xmm9 + mulsd %xmm2, %xmm10 + + subsd %xmm9, %xmm1 + subsd %xmm10, %xmm3 + + mulsd %xmm13, %xmm1 + mulsd %xmm13, %xmm3 +#endif + +#ifdef RT + movsd 3 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm1 + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm3 + movsd 0 * SIZE(BO), %xmm13 + + movaps %xmm9, %xmm10 + mulsd %xmm1, %xmm9 + mulsd %xmm3, %xmm10 + + subsd %xmm9, %xmm0 + subsd %xmm10, %xmm2 + + mulsd %xmm13, %xmm0 + mulsd %xmm13, %xmm2 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movsd %xmm3, 1 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm1, 1 * SIZE(BO) + movsd %xmm2, 2 * SIZE(BO) + movsd %xmm3, 3 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm2, 1 * SIZE(AO) + movsd %xmm1, 2 * SIZE(AO) + movsd %xmm3, 3 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm7, %xmm7 + movsd 1 * SIZE(AO), %xmm2 + xorps %xmm5, %xmm5 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L35 + ALIGN_4 + +.L32: + addsd %xmm5, %xmm8 + movsd 2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm7, %xmm9 + movsd 3 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm3 + movsd 2 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 4 * SIZE(BO), %xmm1 + mulsd %xmm2, %xmm5 + + addsd %xmm3, %xmm9 + movsd 5 * SIZE(BO), %xmm3 + mulsd %xmm2, %xmm7 + movsd 3 * SIZE(AO), %xmm2 + + addsd %xmm5, %xmm8 + movsd 6 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + + addsd %xmm7, %xmm9 + movsd 7 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm3 + movsd 4 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 8 * SIZE(BO), %xmm1 + mulsd %xmm2, %xmm5 + + addsd %xmm3, %xmm9 + movsd 9 * SIZE(BO), %xmm3 + mulsd %xmm2, %xmm7 + movsd 5 * SIZE(AO), %xmm2 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + addsd %xmm5, %xmm8 + addsd %xmm7, %xmm9 + + andq $3, %rax + BRANCH + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulsd %xmm0, %xmm1 + addq $2 * SIZE, BO + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 0 * SIZE(BO), %xmm1 + addsd %xmm3, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + + addq $1 * SIZE, AO + decq %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm1 + + subsd %xmm8, %xmm0 + subsd %xmm9, %xmm1 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm1 + + subsd %xmm8, %xmm0 + subsd %xmm9, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + mulsd %xmm8, %xmm1 +#endif + +#ifdef RN + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(BO), %xmm9 + mulsd %xmm0, %xmm9 + movsd 3 * SIZE(BO), %xmm13 + subsd %xmm9, %xmm1 + mulsd %xmm13, %xmm1 +#endif + +#ifdef RT + movsd 3 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm1 + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm1, %xmm9 + movsd 0 * SIZE(BO), %xmm13 + subsd %xmm9, %xmm0 + mulsd %xmm13, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm1, 1 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm1, 1 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J # j -- + jg .L10 + ALIGN_4 + +.L40: + testq $1, N + je .L999 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I + jle .L50 + ALIGN_4 + +.L41: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(AO), %xmm1 + xorps %xmm11, %xmm11 + movsd 2 * SIZE(AO), %xmm2 + xorps %xmm13, %xmm13 + movsd 3 * SIZE(AO), %xmm3 + xorps %xmm15, %xmm15 + + movsd 0 * SIZE(BO), %xmm4 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm5 + xorps %xmm10, %xmm10 + prefetcht0 3 * SIZE(CO1) + xorps %xmm12, %xmm12 + xorps %xmm14, %xmm14 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L45 + ALIGN_4 + +.L42: + addsd %xmm9, %xmm8 + movsd 4 * SIZE(AO), %xmm9 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm11, %xmm10 + movsd 5 * SIZE(AO), %xmm11 + mulsd %xmm4, %xmm1 + + addsd %xmm13, %xmm12 + movsd 6 * SIZE(AO), %xmm13 + mulsd %xmm4, %xmm2 + + addsd %xmm15, %xmm14 + movsd 7 * SIZE(AO), %xmm15 + mulsd %xmm4, %xmm3 + movsd 2 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm9 + + addsd %xmm1, %xmm10 + movsd 9 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm11 + + addsd %xmm2, %xmm12 + movsd 10 * SIZE(AO), %xmm2 + mulsd %xmm5, %xmm13 + + addsd %xmm3, %xmm14 + movsd 11 * SIZE(AO), %xmm3 + mulsd %xmm5, %xmm15 + movsd 3 * SIZE(BO), %xmm5 + + addsd %xmm9, %xmm8 + movsd 12 * SIZE(AO), %xmm9 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + addsd %xmm11, %xmm10 + movsd 13 * SIZE(AO), %xmm11 + mulsd %xmm4, %xmm1 + + addsd %xmm13, %xmm12 + movsd 14 * SIZE(AO), %xmm13 + mulsd %xmm4, %xmm2 + + addsd %xmm15, %xmm14 + movsd 15 * SIZE(AO), %xmm15 + mulsd %xmm4, %xmm3 + movsd 4 * SIZE(BO), %xmm4 + subq $-16 * SIZE, AO + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm9 + + addsd %xmm1, %xmm10 + movsd 1 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm11 + addq $ 4 * SIZE, BO + + addsd %xmm2, %xmm12 + movsd 2 * SIZE(AO), %xmm2 + mulsd %xmm5, %xmm13 + decq %rax + + addsd %xmm3, %xmm14 + movsd 3 * SIZE(AO), %xmm3 + mulsd %xmm5, %xmm15 + movsd 1 * SIZE(BO), %xmm5 + + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + + addsd %xmm9, %xmm8 + addsd %xmm11, %xmm10 + addsd %xmm13, %xmm12 + addsd %xmm15, %xmm14 + + andq $3, %rax + BRANCH + BRANCH + je .L49 + ALIGN_4 + +.L46: + mulsd %xmm4, %xmm0 + mulsd %xmm4, %xmm1 + mulsd %xmm4, %xmm2 + mulsd %xmm4, %xmm3 + movsd 1 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm10 + movsd 5 * SIZE(AO), %xmm1 + addsd %xmm2, %xmm12 + movsd 6 * SIZE(AO), %xmm2 + addsd %xmm3, %xmm14 + movsd 7 * SIZE(AO), %xmm3 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + decq %rax + BRANCH + jg .L46 + ALIGN_4 + +.L49: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm2 + movsd 2 * SIZE(BO), %xmm4 + movsd 3 * SIZE(BO), %xmm6 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm12, %xmm4 + subsd %xmm14, %xmm6 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm2 + movsd 2 * SIZE(AO), %xmm4 + movsd 3 * SIZE(AO), %xmm6 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm12, %xmm4 + subsd %xmm14, %xmm6 +#endif + +#ifdef LN + movsd 15 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm6 + movsd 14 * SIZE(AO), %xmm9 + mulsd %xmm6, %xmm9 + movsd 13 * SIZE(AO), %xmm11 + subsd %xmm9, %xmm4 + movsd 12 * SIZE(AO), %xmm13 + mulsd %xmm6, %xmm11 + movsd 10 * SIZE(AO), %xmm8 + subsd %xmm11, %xmm2 + movsd 9 * SIZE(AO), %xmm9 + mulsd %xmm6, %xmm13 + movsd 8 * SIZE(AO), %xmm11 + subsd %xmm13, %xmm0 + + mulsd %xmm8, %xmm4 + movsd 5 * SIZE(AO), %xmm8 + mulsd %xmm4, %xmm9 + subsd %xmm9, %xmm2 + movsd 4 * SIZE(AO), %xmm9 + mulsd %xmm4, %xmm11 + subsd %xmm11, %xmm0 + movsd 0 * SIZE(AO), %xmm11 + mulsd %xmm8, %xmm2 + mulsd %xmm2, %xmm9 + subsd %xmm9, %xmm0 + mulsd %xmm11, %xmm0 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(AO), %xmm9 + mulsd %xmm0, %xmm9 + movsd 2 * SIZE(AO), %xmm11 + subsd %xmm9, %xmm2 + movsd 3 * SIZE(AO), %xmm13 + mulsd %xmm0, %xmm11 + movsd 5 * SIZE(AO), %xmm8 + subsd %xmm11, %xmm4 + movsd 6 * SIZE(AO), %xmm9 + mulsd %xmm0, %xmm13 + movsd 7 * SIZE(AO), %xmm11 + subsd %xmm13, %xmm6 + + mulsd %xmm8, %xmm2 + movsd 10 * SIZE(AO), %xmm8 + mulsd %xmm2, %xmm9 + subsd %xmm9, %xmm4 + movsd 11 * SIZE(AO), %xmm9 + mulsd %xmm2, %xmm11 + subsd %xmm11, %xmm6 + mulsd %xmm8, %xmm4 + movsd 15 * SIZE(AO), %xmm8 + mulsd %xmm4, %xmm9 + subsd %xmm9, %xmm6 + mulsd %xmm8, %xmm6 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + mulsd %xmm8, %xmm2 + mulsd %xmm8, %xmm4 + mulsd %xmm8, %xmm6 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + movsd %xmm4, 2 * SIZE(CO1) + movsd %xmm6, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm2, 1 * SIZE(BO) + movsd %xmm4, 2 * SIZE(BO) + movsd %xmm6, 3 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm2, 1 * SIZE(AO) + movsd %xmm4, 2 * SIZE(AO) + movsd %xmm6, 3 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L41 + ALIGN_4 + +.L50: + testq $2, M + je .L60 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm1 + xorps %xmm3, %xmm3 + + movsd 0 * SIZE(BO), %xmm4 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm5 + xorps %xmm10, %xmm10 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L55 + ALIGN_4 + +.L52: + addsd %xmm2, %xmm8 + movsd 2 * SIZE(AO), %xmm2 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm3, %xmm10 + movsd 3 * SIZE(AO), %xmm3 + mulsd %xmm4, %xmm1 + movsd 2 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm2 + addq $8 * SIZE, AO + + addsd %xmm1, %xmm10 + movsd -3 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm3 + movsd 3 * SIZE(BO), %xmm5 + + addsd %xmm2, %xmm8 + movsd -2 * SIZE(AO), %xmm2 + mulsd %xmm4, %xmm0 + addq $4 * SIZE, BO + + addsd %xmm3, %xmm10 + movsd -1 * SIZE(AO), %xmm3 + mulsd %xmm4, %xmm1 + movsd 0 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm2 + decq %rax + + addsd %xmm1, %xmm10 + movsd 1 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm3 + movsd 1 * SIZE(BO), %xmm5 + + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm10 + + andq $3, %rax + BRANCH + je .L59 + ALIGN_4 + +.L56: + mulsd %xmm4, %xmm0 + mulsd %xmm4, %xmm1 + movsd 1 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 2 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm10 + movsd 3 * SIZE(AO), %xmm1 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + decq %rax + BRANCH + jg .L56 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm2 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm2 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 +#endif + +#ifdef LN + movsd 3 * SIZE(AO), %xmm8 + movsd 2 * SIZE(AO), %xmm9 + movsd 0 * SIZE(AO), %xmm11 + mulsd %xmm8, %xmm2 + mulsd %xmm2, %xmm9 + subsd %xmm9, %xmm0 + mulsd %xmm11,%xmm0 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm8 + movsd 1 * SIZE(AO), %xmm9 + movsd 3 * SIZE(AO), %xmm11 + mulsd %xmm8, %xmm0 + mulsd %xmm0, %xmm9 + subsd %xmm9, %xmm2 + mulsd %xmm11,%xmm2 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + mulsd %xmm8, %xmm2 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm2, 1 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm2, 1 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + testq $1, M + je .L69 + +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm5, %xmm5 + movsd 1 * SIZE(AO), %xmm2 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm9, %xmm9 + movsd 2 * SIZE(AO), %xmm4 + movsd 3 * SIZE(AO), %xmm6 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L65 + ALIGN_4 + +.L62: + addsd %xmm5, %xmm8 + movsd 2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + movsd 4 * SIZE(AO), %xmm0 + + addsd %xmm7, %xmm9 + movsd 3 * SIZE(BO), %xmm7 + mulsd %xmm2, %xmm3 + movsd 5 * SIZE(AO), %xmm2 + + addsd %xmm1, %xmm8 + movsd 4 * SIZE(BO), %xmm1 + mulsd %xmm4, %xmm5 + movsd 6 * SIZE(AO), %xmm4 + + addsd %xmm3, %xmm9 + movsd 5 * SIZE(BO), %xmm3 + mulsd %xmm6, %xmm7 + movsd 7 * SIZE(AO), %xmm6 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + decq %rax + jne .L62 + + addsd %xmm5, %xmm8 + addsd %xmm7, %xmm9 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + je .L68 + ALIGN_4 + +.L66: + movsd 0 * SIZE(AO), %xmm0 + movsd 0 * SIZE(BO), %xmm1 + + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm8 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + decq %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: + addsd %xmm9, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + subsd %xmm8, %xmm0 +#else + movsd 0 * SIZE(AO), %xmm0 + subsd %xmm8, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L69: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_2 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LT_4x4_barcelona.S b/kernel/x86_64/trsm_kernel_LT_4x4_barcelona.S new file mode 100644 index 0000000..b133bcf --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LT_4x4_barcelona.S @@ -0,0 +1,3396 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %r12 +#define BB %rbp +#define J %rbx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#define OFFSET 48(%rsp) +#define AORIG 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define AORIG 232(%rsp) +#define KK 240(%rsp) +#define KKK 248(%rsp) + +#endif + +#define PREFETCH prefetch +#define PREFETCHSIZE (8 * 7 + 0) + +#define movlpd movsd +#define movapd movups +#define movupd movups + +#define KERNEL1(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + + +#define KERNEL2(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ +/**/ movapd (AO, %rax, 4), %xmm6 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + addpd %xmm1, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ +/**/ movddup (BO, %rax, 4), %xmm1 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL3(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ + addpd %xmm5, %xmm14 ;\ + movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL4(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ +/**/ movapd 8 * SIZE(AO, %rax, 4), %xmm7 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + addpd %xmm5, %xmm14 ;\ +/**/ movddup 8 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm6, %xmm2 + +#define KERNEL5(xx) \ + mulpd %xmm1, %xmm6 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm8 ;\ + movapd %xmm2, %xmm6 ;\ + addpd %xmm1, %xmm12 ;\ + movddup 2 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm6, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 3 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm6 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm10 ;\ + movapd 4 * SIZE(AO, %rax, 4), %xmm6 ;\ + addpd %xmm1, %xmm14 ;\ + movddup 4 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm6, %xmm2 + +#define KERNEL6(xx) \ + mulpd %xmm1, %xmm6 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm8 ;\ + movapd %xmm2, %xmm6 ;\ +/***/ movapd 16 * SIZE(AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup 6 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm6, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 7 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm6 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm10 ;\ + addpd %xmm1, %xmm14 ;\ +/**/ movddup 16 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 9 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm7, %xmm2 + +#define KERNEL7(xx) \ + mulpd %xmm5, %xmm7 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm8 ;\ + movapd %xmm2, %xmm7 ;\ + addpd %xmm5, %xmm12 ;\ + movddup 10 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm7, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 11 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm7 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm10 ;\ + movapd 12 * SIZE(AO, %rax, 4), %xmm7 ;\ + addpd %xmm5, %xmm14 ;\ + movddup 12 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 13 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm7, %xmm2 + +#define KERNEL8(xx) \ + mulpd %xmm5, %xmm7 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm8 ;\ + movapd %xmm2, %xmm7 ;\ +/**/ movapd 24 * SIZE(AO, %rax, 4), %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup 14 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm7, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 15 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm7 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm10 ;\ + addpd %xmm5, %xmm14 ;\ +/**/ movddup 24 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 ;\ + addq $8 * SIZE, %rax + +#define KERNEL_SUB1(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + +#define KERNEL_SUB2(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd (AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup (BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL_SUB3(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ + addpd %xmm5, %xmm14 ;\ + movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL_SUB4(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + addpd %xmm5, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm12 +#else + movq STACKSIZE + 8(%rsp), LDC + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movsd %xmm12, OFFSET + movsd %xmm12, KK + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 2, %rax + leaq (B, %rax), BB + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm8, %xmm8 + movddup -15 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm4 + pxor %xmm10, %xmm10 + movddup -8 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + +#ifndef LN + prefetchw 3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetchw 7 * SIZE(CO2) + pxor %xmm13, %xmm13 + prefetchw 3 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + prefetchw 7 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 + movapd %xmm0, %xmm2 +#else + prefetchw -8 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetchw -8 * SIZE(CO2) + pxor %xmm13, %xmm13 + prefetchw -8 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + prefetchw -8 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 + movapd %xmm0, %xmm2 +#endif + + prefetch -16 * SIZE(BB) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + + andq $-8, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_4 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + BRANCH + jl .L12 + ALIGN_4 + +.L15: + prefetch -8 * SIZE(BB) + subq $-16 * SIZE, BB + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + testq $4, %rax + je .L16 + xorq %rax, %rax + ALIGN_4 + + KERNEL_SUB1(16 * 0) + KERNEL_SUB2(16 * 0) + KERNEL_SUB3(16 * 0) + KERNEL_SUB4(16 * 0) + + subq $-16 * SIZE, BO + subq $-16 * SIZE, AO + ALIGN_4 + +.L16: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L19 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L17: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd %xmm2, %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm3, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm9 + movapd %xmm0, %xmm2 + addpd %xmm3, %xmm13 + movddup -13 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm10 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm14 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm3, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm11 + addpd %xmm3, %xmm15 + movddup -11 * SIZE(BO, %rax, 4), %xmm3 + movapd %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L17 + ALIGN_4 + +.L19: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd %xmm14, %xmm6 + unpcklpd %xmm15, %xmm14 + unpckhpd %xmm15, %xmm6 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + movapd -8 * SIZE(BO), %xmm1 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm5 + movapd -2 * SIZE(BO), %xmm7 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 + subpd %xmm12, %xmm1 + subpd %xmm14, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + movapd -8 * SIZE(AO), %xmm4 + movapd -6 * SIZE(AO), %xmm5 + movapd -4 * SIZE(AO), %xmm6 + movapd -2 * SIZE(AO), %xmm7 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 + subpd %xmm10, %xmm4 + subpd %xmm14, %xmm5 + subpd %xmm11, %xmm6 + subpd %xmm15, %xmm7 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm7, %xmm10 + subpd %xmm10, %xmm3 + + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm13 + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm15 + + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm9 + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm7, %xmm14 + subpd %xmm14, %xmm11 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm15 + + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm9 + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm11 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm15, %xmm10 + subpd %xmm10, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm11, %xmm10 + subpd %xmm10, %xmm15 + + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm9, %xmm12 + subpd %xmm12, %xmm1 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm3 + + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm9, %xmm14 + subpd %xmm14, %xmm5 + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm11, %xmm14 + subpd %xmm14, %xmm7 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm15, %xmm10 + subpd %xmm10, %xmm3 + + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm13, %xmm12 + subpd %xmm12, %xmm5 + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm7 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm1, %xmm9 + subpd %xmm9, %xmm3 + + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm6 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm1, %xmm11 + subpd %xmm11, %xmm7 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -10 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm4 + movddup -10 * SIZE(BO), %xmm9 + mulpd %xmm3, %xmm9 + subpd %xmm9, %xmm5 + + movddup -9 * SIZE(BO), %xmm10 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + movddup -9 * SIZE(BO), %xmm10 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm7 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -5 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm6 + movddup -5 * SIZE(BO), %xmm9 + mulpd %xmm5, %xmm9 + subpd %xmm9, %xmm7 + + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(BO), %xmm9 + mulpd %xmm6, %xmm9 + subpd %xmm9, %xmm4 + movddup -2 * SIZE(BO), %xmm9 + mulpd %xmm7, %xmm9 + subpd %xmm9, %xmm5 + + movddup -3 * SIZE(BO), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + movddup -3 * SIZE(BO), %xmm10 + mulpd %xmm7, %xmm10 + subpd %xmm10, %xmm3 + + movddup -4 * SIZE(BO), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm0 + movddup -4 * SIZE(BO), %xmm11 + mulpd %xmm7, %xmm11 + subpd %xmm11, %xmm1 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm2 + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm5, %xmm9 + subpd %xmm9, %xmm3 + + movddup -8 * SIZE(BO), %xmm10 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + movddup -8 * SIZE(BO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -12 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + movddup -12 * SIZE(BO), %xmm9 + mulpd %xmm3, %xmm9 + subpd %xmm9, %xmm1 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm9, 0 * SIZE(CO1) + movlpd %xmm13, 1 * SIZE(CO1) + movlpd %xmm1, 2 * SIZE(CO1) + movlpd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) + + movlpd %xmm11, 0 * SIZE(CO1, LDC, 2) + movlpd %xmm15, 1 * SIZE(CO1, LDC, 2) + movlpd %xmm3, 2 * SIZE(CO1, LDC, 2) + movlpd %xmm7, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movlpd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movlpd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movlpd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) + + movlpd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + movlpd %xmm5, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) + + movlpd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) + movlpd %xmm7, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -16 * SIZE(BO) + movaps %xmm11, -14 * SIZE(BO) + movaps %xmm13, -12 * SIZE(BO) + movaps %xmm15, -10 * SIZE(BO) + movaps %xmm1, -8 * SIZE(BO) + movaps %xmm3, -6 * SIZE(BO) + movaps %xmm5, -4 * SIZE(BO) + movaps %xmm7, -2 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm1, -14 * SIZE(AO) + movaps %xmm2, -12 * SIZE(AO) + movaps %xmm3, -10 * SIZE(AO) + movaps %xmm4, -8 * SIZE(AO) + movaps %xmm5, -6 * SIZE(AO) + movaps %xmm6, -4 * SIZE(AO) + movaps %xmm7, -2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $3, M + je .L39 + + testq $2, M + je .L30 + ALIGN_4 + +.L21: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -12 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + movddup -8 * SIZE(BO), %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L26 + ALIGN_4 + +.L22: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -13 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -10 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -9 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup (BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -8 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -7 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm8 + movddup -6 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm9 + movddup -5 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm10 + movddup -4 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + movapd -10 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm5, %xmm11 + movddup -3 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm8 + movddup -2 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm9 + movddup -1 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm10 + movddup 8 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + movapd -4 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm5, %xmm11 + movddup 1 * SIZE(BO, %rax, 4), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L22 + ALIGN_4 + +.L26: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L29 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L27: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -13 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 4), %xmm5 + + addq $SIZE, %rax + jl .L27 + ALIGN_4 + +.L29: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + movapd -12 * SIZE(AO), %xmm4 + movapd -10 * SIZE(AO), %xmm6 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 + subpd %xmm10, %xmm4 + subpd %xmm11, %xmm6 +#endif + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm15, %xmm10 + subpd %xmm10, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm11, %xmm10 + subpd %xmm10, %xmm15 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm6 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + movddup -10 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm4 + movddup -9 * SIZE(BO), %xmm10 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + + movddup -5 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm6 + + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 +#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + + movddup -2 * SIZE(BO), %xmm9 + mulpd %xmm6, %xmm9 + subpd %xmm9, %xmm4 + movddup -3 * SIZE(BO), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + movddup -4 * SIZE(BO), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm0 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm2 + movddup -8 * SIZE(BO), %xmm10 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + movddup -12 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm9, 0 * SIZE(CO1) + movlpd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + + movlpd %xmm11, 0 * SIZE(CO1, LDC, 2) + movlpd %xmm15, 1 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movlpd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + + movlpd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + + movlpd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -16 * SIZE(BO) + movaps %xmm11, -14 * SIZE(BO) + movaps %xmm13, -12 * SIZE(BO) + movaps %xmm15, -10 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm2, -14 * SIZE(AO) + movaps %xmm4, -12 * SIZE(AO) + movaps %xmm6, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 4), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movddup -14 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -15 * SIZE(AO), %xmm4 + pxor %xmm10, %xmm10 + movapd -16 * SIZE(BO), %xmm1 + pxor %xmm11, %xmm11 + movapd -8 * SIZE(BO), %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L36 + ALIGN_4 + +.L32: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO, %rax, 4), %xmm0 + addpd %xmm1, %xmm8 + movapd -12 * SIZE(BO, %rax, 4), %xmm1 + addpd %xmm0, %xmm9 + movddup -12 * SIZE(AO, %rax, 1), %xmm0 + mulpd %xmm4, %xmm1 + mulpd -10 * SIZE(BO, %rax, 4), %xmm4 + addpd %xmm1, %xmm10 + movapd (BO, %rax, 4), %xmm1 + addpd %xmm4, %xmm11 + movddup -11 * SIZE(AO, %rax, 1), %xmm4 + mulpd %xmm2, %xmm3 + mulpd -6 * SIZE(BO, %rax, 4), %xmm2 + addpd %xmm3, %xmm8 + movapd -4 * SIZE(BO, %rax, 4), %xmm3 + addpd %xmm2, %xmm9 + movddup -13 * SIZE(AO, %rax, 1), %xmm2 + mulpd %xmm2, %xmm3 + mulpd -2 * SIZE(BO, %rax, 4), %xmm2 + addpd %xmm3, %xmm10 + movapd 8 * SIZE(BO, %rax, 4), %xmm3 + addpd %xmm2, %xmm11 + movddup -10 * SIZE(AO, %rax, 1), %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L32 + ALIGN_4 + +.L36: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L38 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L37: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO, %rax, 4), %xmm0 + addpd %xmm1, %xmm8 + movapd -12 * SIZE(BO, %rax, 4), %xmm1 + addpd %xmm0, %xmm9 + movddup -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L37 + ALIGN_4 + +.L38: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + + subpd %xmm8, %xmm2 + subpd %xmm9, %xmm3 +#else + movapd -16 * SIZE(AO), %xmm2 + movapd -14 * SIZE(AO), %xmm3 + + subpd %xmm8, %xmm2 + subpd %xmm9, %xmm3 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd -16 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd -15 * SIZE(BO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + movsd -14 * SIZE(BO), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + movsd -13 * SIZE(BO), %xmm7 + mulsd %xmm2, %xmm7 + subsd %xmm7, %xmm1 + + movsd -11 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -10 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm3 + movsd -9 * SIZE(BO), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + + movsd -6 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd -5 * SIZE(BO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + + movsd -1 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm1 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef RT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd -1 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm1 + + movsd -2 * SIZE(BO), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + movsd -3 * SIZE(BO), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + movsd -4 * SIZE(BO), %xmm7 + mulsd %xmm1, %xmm7 + subsd %xmm7, %xmm2 + + movsd -6 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd -7 * SIZE(BO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm0 + movsd -8 * SIZE(BO), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + + movsd -11 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -12 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd -16 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 + +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + movlpd %xmm3, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) +#else + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + movlpd %xmm3, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, -16 * SIZE(BO) + movaps %xmm3, -14 * SIZE(BO) +#else + movaps %xmm2, -16 * SIZE(AO) + movaps %xmm3, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $2, N + je .L80 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 2), BO +#endif + + movddup -16 * SIZE(BO), %xmm1 + movddup -15 * SIZE(BO), %xmm5 + pxor %xmm8, %xmm8 + movddup -12 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm12, %xmm12 + movapd -8 * SIZE(AO), %xmm4 + pxor %xmm13, %xmm13 + +#ifndef LN + prefetchw 3 * SIZE(CO1) + movapd %xmm0, %xmm2 + prefetchw 3 * SIZE(CO2) +#else + prefetchw -8 * SIZE(CO1) + movapd %xmm0, %xmm2 + prefetchw -8 * SIZE(CO2) +#endif + + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L56 + ALIGN_4 + +.L52: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -13 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + mulpd %xmm1, %xmm0 + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd (AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -8 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -10 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -11 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm4, %xmm2 + mulpd %xmm3, %xmm4 + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm4, %xmm8 + movapd -4 * SIZE(AO, %rax, 4), %xmm4 + addpd %xmm3, %xmm12 + movddup -10 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -9 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm4, %xmm2 + mulpd %xmm3, %xmm4 + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm4, %xmm8 + movapd 8 * SIZE(AO, %rax, 4), %xmm4 + addpd %xmm3, %xmm12 + movddup -4 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -7 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L52 + ALIGN_4 + +.L56: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L59 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L57: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -13 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L57 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + movapd -12 * SIZE(BO), %xmm1 + movapd -10 * SIZE(BO), %xmm5 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 + subpd %xmm12, %xmm1 + subpd %xmm4, %xmm5 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm13 + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm9 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm9 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm9, %xmm12 + subpd %xmm12, %xmm1 + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm9, %xmm14 + subpd %xmm14, %xmm5 + + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm13, %xmm12 + subpd %xmm12, %xmm5 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm1, %xmm9 + subpd %xmm9, %xmm3 + + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -14 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + movddup -14 * SIZE(BO), %xmm9 + mulpd %xmm3, %xmm9 + subpd %xmm9, %xmm1 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm9, 0 * SIZE(CO1) + movlpd %xmm13, 1 * SIZE(CO1) + movlpd %xmm1, 2 * SIZE(CO1) + movlpd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movlpd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movlpd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movlpd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -16 * SIZE(BO) + movaps %xmm13,-14 * SIZE(BO) + movaps %xmm1, -12 * SIZE(BO) + movaps %xmm5, -10 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm1, -14 * SIZE(AO) + movaps %xmm2, -12 * SIZE(AO) + movaps %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 2), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -12 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L66 + ALIGN_4 + +.L62: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm9 + movddup -13 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -8 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm11 + movddup -11 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm8 + movddup -10 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm2, %xmm3 + movapd -10 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm3, %xmm9 + movddup -9 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm10 + movddup -8 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm2, %xmm3 + movapd -4 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm3, %xmm11 + movddup -7 * SIZE(BO, %rax, 2), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L62 + ALIGN_4 + +.L66: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L69 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L67: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm9 + movddup -13 * SIZE(BO, %rax, 2), %xmm3 + + addq $SIZE, %rax + jl .L67 + ALIGN_4 + +.L69: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 +#endif + + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + + movddup -14 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm9, 0 * SIZE(CO1) + movlpd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movlpd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -16 * SIZE(BO) + movaps %xmm13, -14 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm2, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + testq $1, M + je .L79 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 1), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movddup -15 * SIZE(AO), %xmm1 + pxor %xmm9, %xmm9 + movddup -14 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movddup -13 * SIZE(AO), %xmm3 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L76 + ALIGN_4 + +.L72: + mulpd -16 * SIZE(BO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -12 * SIZE(AO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(BO, %rax, 2), %xmm1 + addpd %xmm1, %xmm9 + movddup -11 * SIZE(AO, %rax, 1), %xmm1 + + mulpd -12 * SIZE(BO, %rax, 2), %xmm2 + addpd %xmm2, %xmm10 + movddup -10 * SIZE(AO, %rax, 1), %xmm2 + + mulpd -10 * SIZE(BO, %rax, 2), %xmm3 + addpd %xmm3, %xmm11 + movddup -9 * SIZE(AO, %rax, 1), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L72 + ALIGN_4 + +.L76: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L78 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L77: + mulpd -16 * SIZE(BO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L77 + ALIGN_4 + +.L78: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm2 +#else + movapd -16 * SIZE(AO), %xmm2 +#endif + + subpd %xmm8, %xmm2 + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm2 +#endif + +#ifdef RN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + mulsd -16 * SIZE(BO), %xmm2 + movsd -15 * SIZE(BO), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm0 + + mulsd -13 * SIZE(BO), %xmm0 + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef RT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + mulsd -13 * SIZE(BO), %xmm0 + + movlpd -14 * SIZE(BO), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm2 + + mulsd -16 * SIZE(BO), %xmm2 + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movaps %xmm2, -16 * SIZE(BO) +#else + movaps %xmm2, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L80: + testq $1, N + je .L999 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (BO, %rax, SIZE), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + movddup -14 * SIZE(BO), %xmm3 + +#ifndef LN + prefetchw 3 * SIZE(CO1) +#else + prefetchw -8 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L96 + ALIGN_4 + +.L92: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm9 + movddup -12 * SIZE(BO, %rax, 1), %xmm1 + mulpd %xmm5, %xmm0 + mulpd -10 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm0, %xmm10 + movapd (AO, %rax, 4), %xmm0 + addpd %xmm5, %xmm11 + movddup -13 * SIZE(BO, %rax, 1), %xmm5 + mulpd %xmm3, %xmm2 + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm8 + movapd -4 * SIZE(AO, %rax, 4), %xmm2 + addpd %xmm3, %xmm9 + movddup -10 * SIZE(BO, %rax, 1), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm10 + movapd 8 * SIZE(AO, %rax, 4), %xmm2 + addpd %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 1), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L92 + ALIGN_4 + +.L96: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L99 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L97: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm9 + movddup -15 * SIZE(BO, %rax, 1), %xmm1 + + addq $SIZE, %rax + jl .L97 + ALIGN_4 +.L99: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm10 + movapd -14 * SIZE(BO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm9, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm10 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm9, %xmm11 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + movsd -2 * SIZE(AO), %xmm13 + mulsd %xmm9, %xmm13 + subsd %xmm13, %xmm11 + movsd -3 * SIZE(AO), %xmm14 + mulsd %xmm9, %xmm14 + subsd %xmm14, %xmm8 + movsd -4 * SIZE(AO), %xmm15 + mulsd %xmm9, %xmm15 + subsd %xmm15, %xmm10 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -7 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm8 + movsd -8 * SIZE(AO), %xmm14 + mulsd %xmm11, %xmm14 + subsd %xmm14, %xmm10 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -12 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + movsd -14 * SIZE(AO), %xmm14 + mulsd %xmm10, %xmm14 + subsd %xmm14, %xmm11 + movsd -13 * SIZE(AO), %xmm15 + mulsd %xmm10, %xmm15 + subsd %xmm15, %xmm9 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -10 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm11 + movsd -9 * SIZE(AO), %xmm14 + mulsd %xmm8, %xmm14 + subsd %xmm14, %xmm9 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -5 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movlpd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movlpd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movaps %xmm10, -16 * SIZE(BO) + movaps %xmm11, -14 * SIZE(BO) +#else + movaps %xmm10, -16 * SIZE(AO) + movaps %xmm11, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + addq %rax, BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (BO, %rax, SIZE), BO +#endif + + movddup -16 * SIZE(BO), %xmm0 + pxor %xmm8, %xmm8 + movddup -15 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movddup -14 * SIZE(BO), %xmm2 + pxor %xmm10, %xmm10 + movddup -13 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L106 + ALIGN_4 + +.L102: + mulpd -16 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -12 * SIZE(BO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(AO, %rax, 2), %xmm1 + addpd %xmm1, %xmm9 + movddup -11 * SIZE(BO, %rax, 1), %xmm1 + + mulpd -12 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm2, %xmm10 + movddup -10 * SIZE(BO, %rax, 1), %xmm2 + + mulpd -10 * SIZE(AO, %rax, 2), %xmm3 + addpd %xmm3, %xmm11 + movddup -9 * SIZE(BO, %rax, 1), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L102 + ALIGN_4 + +.L106: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L109 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L107: + movddup -16 * SIZE(BO, %rax, 1), %xmm0 + mulpd -16 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + + addq $SIZE, %rax + jl .L107 + ALIGN_4 + +.L109: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm10 + subpd %xmm8, %xmm10 +#else + movapd -16 * SIZE(AO), %xmm10 + subpd %xmm8, %xmm10 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -14 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#else + movlpd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm10, -16 * SIZE(BO) +#else + movaps %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + addq %rax, BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + testq $1, M + je .L119 + ALIGN_4 + +.L111: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (BO, %rax, SIZE), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -14 * SIZE(AO), %xmm1 + pxor %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L116 + ALIGN_4 + +.L112: + mulpd -16 * SIZE(BO, %rax, 1), %xmm0 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(BO, %rax, 1), %xmm1 + addpd %xmm1, %xmm9 + movapd -10 * SIZE(AO, %rax, 1), %xmm1 + + addq $4 * SIZE, %rax + BRANCH + jl .L112 + ALIGN_4 + +.L116: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L118 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L117: + mulsd -16 * SIZE(BO, %rax, 1), %xmm0 + addsd %xmm0, %xmm8 + movsd -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L117 + ALIGN_4 + +.L118: + addpd %xmm9, %xmm8 + haddpd %xmm8, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BO), %xmm10 + subsd %xmm8, %xmm10 +#else + movsd -16 * SIZE(AO), %xmm10 + subsd %xmm8, %xmm10 +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#if defined(RN) || defined(RT) + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + + movsd %xmm10, 0 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movlpd %xmm10, -16 * SIZE(BO) +#else + movlpd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + addq %rax, AO + addq %rax, BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L119: +#ifdef LN + leaq (B, K, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + + +.L999: + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LT_4x4_core2.S b/kernel/x86_64/trsm_kernel_LT_4x4_core2.S new file mode 100644 index 0000000..7864ec5 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LT_4x4_core2.S @@ -0,0 +1,3730 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define J 0(%rsp) +#define OFFSET 8(%rsp) +#define KK 16(%rsp) +#define KKK 24(%rsp) +#define AORIG 32(%rsp) +#define BORIG 40(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCH_W (PREFETCH_R) + +#define PREFETCHSIZE (8 * 17 + 2) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, %rax + + movq %rsp, %r15 # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq %rax, KK + movq %rax, OFFSET + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + +.L01: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq 16 * SIZE + BUFFER, BO + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + movapd -16 * SIZE(B), %xmm0 + movapd -14 * SIZE(B), %xmm1 + movapd -12 * SIZE(B), %xmm2 + movapd -10 * SIZE(B), %xmm3 + movapd -8 * SIZE(B), %xmm4 + movapd -6 * SIZE(B), %xmm5 + movapd -4 * SIZE(B), %xmm6 + movapd -2 * SIZE(B), %xmm7 + + prefetcht0 (PREFETCH_R + 8) * SIZE(B) + movddup %xmm0, %xmm8 + unpckhpd %xmm0, %xmm0 + movddup %xmm1, %xmm9 + unpckhpd %xmm1, %xmm1 + movddup %xmm2, %xmm10 + unpckhpd %xmm2, %xmm2 + movddup %xmm3, %xmm11 + unpckhpd %xmm3, %xmm3 + movddup %xmm4, %xmm12 + unpckhpd %xmm4, %xmm4 + movddup %xmm5, %xmm13 + unpckhpd %xmm5, %xmm5 + movddup %xmm6, %xmm14 + unpckhpd %xmm6, %xmm6 + movddup %xmm7, %xmm15 + unpckhpd %xmm7, %xmm7 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm0, -14 * SIZE(BO) + movapd %xmm9, -12 * SIZE(BO) + movapd %xmm1, -10 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 8) * SIZE(BO) + movapd %xmm10, -8 * SIZE(BO) + movapd %xmm2, -6 * SIZE(BO) + movapd %xmm11, -4 * SIZE(BO) + movapd %xmm3, -2 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 16) * SIZE(BO) + movapd %xmm12, 0 * SIZE(BO) + movapd %xmm4, 2 * SIZE(BO) + movapd %xmm13, 4 * SIZE(BO) + movapd %xmm5, 6 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 24) * SIZE(BO) + movapd %xmm14, 8 * SIZE(BO) + movapd %xmm6, 10 * SIZE(BO) + movapd %xmm15, 12 * SIZE(BO) + movapd %xmm7, 14 * SIZE(BO) + + subq $-16 * SIZE, B + subq $-32 * SIZE, BO + subq $1, %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movapd -16 * SIZE(B), %xmm0 + movapd -14 * SIZE(B), %xmm1 + + movddup %xmm0, %xmm8 + unpckhpd %xmm0, %xmm0 + movddup %xmm1, %xmm9 + unpckhpd %xmm1, %xmm1 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm0, -14 * SIZE(BO) + movapd %xmm9, -12 * SIZE(BO) + movapd %xmm1, -10 * SIZE(BO) + + addq $4 * SIZE, B + addq $8 * SIZE, BO + subq $1, %rax + jne .L04 + ALIGN_4 + +.L10: + leaq (PREFETCH_R + 0) * SIZE(B), BB + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 4), C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + prefetcht2 0 * SIZE(BB) + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + prefetcht2 3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht2 3 * SIZE(CO2) + pxor %xmm13, %xmm13 + prefetcht2 3 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + prefetcht2 3 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 + + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + + subq $-8 * SIZE, BB + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm2, %xmm10 + movapd -16 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm14 + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + movapd -14 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + movapd -10 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm14 + movapd -8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + addpd %xmm5, %xmm15 + movapd -6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + movapd -4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + movapd -2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + movapd -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm14 + movapd 0 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -6 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + addpd %xmm5, %xmm15 + movapd 2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + movapd 4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + movapd 6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm14 + movapd 8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -2 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + addpd %xmm5, %xmm15 + movapd 10 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + addq $32 * SIZE, BO + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + movapd -20 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + subq $-16 * SIZE, AO + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + movapd -18 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + subq $1, %rax + mulpd %xmm1, %xmm5 + + BRANCH + jg .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L19 + ALIGN_4 + +.L16: + movapd -16 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm10 + movapd -16 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm14 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + movapd -14 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + movapd -12 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + movapd -10 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L19: + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm14 + addpd %xmm4, %xmm11 + addpd %xmm5, %xmm15 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd %xmm14, %xmm6 + unpcklpd %xmm15, %xmm14 + unpckhpd %xmm15, %xmm6 + + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm11 + movapd -12 * SIZE(B), %xmm13 + movapd -10 * SIZE(B), %xmm15 + movapd -8 * SIZE(B), %xmm1 + movapd -6 * SIZE(B), %xmm3 + movapd -4 * SIZE(B), %xmm5 + movapd -2 * SIZE(B), %xmm7 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 + subpd %xmm12, %xmm1 + subpd %xmm14, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + movapd -8 * SIZE(AO), %xmm4 + movapd -6 * SIZE(AO), %xmm5 + movapd -4 * SIZE(AO), %xmm6 + movapd -2 * SIZE(AO), %xmm7 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 + subpd %xmm10, %xmm4 + subpd %xmm14, %xmm5 + subpd %xmm11, %xmm6 + subpd %xmm15, %xmm7 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + + movddup -3 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm15 + + movddup -4 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm11 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -7 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm15 + + movddup -8 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm11 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -12 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm3 + + movddup -13 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm7 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -10 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm3 + + movddup -9 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -5 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm7 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 +#endif + + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm3 + + movddup -14 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm5 + + movddup -13 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm7 + + movddup -11 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -10 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm5 + + movddup -9 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -5 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm7 + + movddup -1 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RT + movddup -1 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm5 + + movddup -3 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + + movddup -4 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm1 + + movddup -6 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -7 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm3 + + movddup -8 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm1 + + movddup -11 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -12 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm1 + + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movsd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) + + movsd %xmm11, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 1 * SIZE(CO1, LDC, 2) + movsd %xmm3, 2 * SIZE(CO1, LDC, 2) + movsd %xmm7, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) + + movsd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + movsd %xmm5, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) + movsd %xmm7, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + movapd %xmm13, -12 * SIZE(B) + movapd %xmm15, -10 * SIZE(B) + movapd %xmm1, -8 * SIZE(B) + movapd %xmm3, -6 * SIZE(B) + movapd %xmm5, -4 * SIZE(B) + movapd %xmm7, -2 * SIZE(B) + + movddup %xmm9, %xmm8 + SHUFPD_3 %xmm9, %xmm9 + movddup %xmm11, %xmm10 + SHUFPD_3 %xmm11, %xmm11 + movddup %xmm13, %xmm12 + SHUFPD_3 %xmm13, %xmm13 + movddup %xmm15, %xmm14 + SHUFPD_3 %xmm15, %xmm15 + movddup %xmm1, %xmm0 + SHUFPD_3 %xmm1, %xmm1 + movddup %xmm3, %xmm2 + SHUFPD_3 %xmm3, %xmm3 + movddup %xmm5, %xmm4 + SHUFPD_3 %xmm5, %xmm5 + movddup %xmm7, %xmm6 + SHUFPD_3 %xmm7, %xmm7 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + movapd %xmm12, -8 * SIZE(BO) + movapd %xmm13, -6 * SIZE(BO) + movapd %xmm14, -4 * SIZE(BO) + movapd %xmm15, -2 * SIZE(BO) + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + movapd %xmm2, 4 * SIZE(BO) + movapd %xmm3, 6 * SIZE(BO) + movapd %xmm4, 8 * SIZE(BO) + movapd %xmm5, 10 * SIZE(BO) + movapd %xmm6, 12 * SIZE(BO) + movapd %xmm7, 14 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) + movapd %xmm4, -8 * SIZE(AO) + movapd %xmm5, -6 * SIZE(AO) + movapd %xmm6, -4 * SIZE(AO) + movapd %xmm7, -2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $3, M + je .L39 + + testq $2, M + je .L30 + ALIGN_4 + +.L21: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + movapd -14 * SIZE(AO), %xmm0 + movapd -8 * SIZE(BO), %xmm2 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm4 + movapd -2 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + movapd -12 * SIZE(AO), %xmm0 + movapd 0 * SIZE(BO), %xmm2 + movapd 2 * SIZE(BO), %xmm3 + movapd 4 * SIZE(BO), %xmm4 + movapd 6 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + movapd -10 * SIZE(AO), %xmm0 + movapd 8 * SIZE(BO), %xmm2 + movapd 10 * SIZE(BO), %xmm3 + movapd 12 * SIZE(BO), %xmm4 + movapd 14 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L29 + ALIGN_4 + +.L26: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jne .L26 + ALIGN_4 + +.L29: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm11 + movapd -12 * SIZE(B), %xmm13 + movapd -10 * SIZE(B), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + movapd -12 * SIZE(AO), %xmm4 + movapd -10 * SIZE(AO), %xmm6 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 + subpd %xmm10, %xmm4 + subpd %xmm11, %xmm6 +#endif + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm15 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(B), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -14 * SIZE(B), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + movddup -13 * SIZE(B), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm6 + + movddup -11 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + movddup -10 * SIZE(B), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm4 + movddup -9 * SIZE(B), %xmm10 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + + movddup -6 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm4 + + movddup -5 * SIZE(B), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm6 + + movddup -1 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm6 +#endif + +#ifdef RT + movddup -1 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm6 + + movddup -2 * SIZE(B), %xmm9 + mulpd %xmm6, %xmm9 + subpd %xmm9, %xmm4 + movddup -3 * SIZE(B), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + movddup -4 * SIZE(B), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm0 + + movddup -6 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm4 + movddup -7 * SIZE(B), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm2 + movddup -8 * SIZE(B), %xmm10 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + + movddup -11 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + movddup -12 * SIZE(B), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + + movsd %xmm11, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 1 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + + movsd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + movapd %xmm13, -12 * SIZE(B) + movapd %xmm15, -10 * SIZE(B) + + movddup %xmm9, %xmm8 + SHUFPD_3 %xmm9, %xmm9 + movddup %xmm11, %xmm10 + SHUFPD_3 %xmm11, %xmm11 + movddup %xmm13, %xmm12 + SHUFPD_3 %xmm13, %xmm13 + movddup %xmm15, %xmm14 + SHUFPD_3 %xmm15, %xmm15 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + movapd %xmm12, -8 * SIZE(BO) + movapd %xmm13, -6 * SIZE(BO) + movapd %xmm14, -4 * SIZE(BO) + movapd %xmm15, -2 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm2, -14 * SIZE(AO) + movapd %xmm4, -12 * SIZE(AO) + movapd %xmm6, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + je .L39 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + movsd -12 * SIZE(BO), %xmm4 + movsd -10 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + movsd -15 * SIZE(AO), %xmm0 + movsd -8 * SIZE(BO), %xmm2 + movsd -6 * SIZE(BO), %xmm3 + movsd -4 * SIZE(BO), %xmm4 + movsd -2 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + movsd -14 * SIZE(AO), %xmm0 + movsd 0 * SIZE(BO), %xmm2 + movsd 2 * SIZE(BO), %xmm3 + movsd 4 * SIZE(BO), %xmm4 + movsd 6 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + movsd -13 * SIZE(AO), %xmm0 + movsd 8 * SIZE(BO), %xmm2 + movsd 10 * SIZE(BO), %xmm3 + movsd 12 * SIZE(BO), %xmm4 + movsd 14 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + subq $ -4 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + movsd -12 * SIZE(BO), %xmm4 + movsd -10 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm12 + movsd -15 * SIZE(B), %xmm13 + movsd -14 * SIZE(B), %xmm14 + movsd -13 * SIZE(B), %xmm15 +#else + movsd -16 * SIZE(AO), %xmm12 + movsd -15 * SIZE(AO), %xmm13 + movsd -14 * SIZE(AO), %xmm14 + movsd -13 * SIZE(AO), %xmm15 +#endif + + subsd %xmm8, %xmm12 + subsd %xmm9, %xmm13 + subsd %xmm10, %xmm14 + subsd %xmm11, %xmm15 + +#ifdef LN + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 + mulsd %xmm8, %xmm14 + mulsd %xmm8, %xmm15 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 + mulsd %xmm8, %xmm14 + mulsd %xmm8, %xmm15 +#endif + +#ifdef RN + mulsd -16 * SIZE(B), %xmm12 + movlpd -15 * SIZE(B), %xmm9 + mulsd %xmm12, %xmm9 + subsd %xmm9, %xmm13 + movlpd -14 * SIZE(B), %xmm10 + mulsd %xmm12, %xmm10 + subsd %xmm10, %xmm14 + movlpd -13 * SIZE(B), %xmm11 + mulsd %xmm12, %xmm11 + subsd %xmm11, %xmm15 + + mulsd -11 * SIZE(B), %xmm13 + movlpd -10 * SIZE(B), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm14 + movlpd -9 * SIZE(B), %xmm10 + mulsd %xmm13, %xmm10 + subsd %xmm10, %xmm15 + + mulsd -6 * SIZE(B), %xmm14 + movlpd -5 * SIZE(B), %xmm9 + mulsd %xmm14, %xmm9 + subsd %xmm9, %xmm15 + + mulsd -1 * SIZE(B), %xmm15 +#endif + +#ifdef RT + mulsd -1 * SIZE(B), %xmm15 + + movlpd -2 * SIZE(B), %xmm9 + mulsd %xmm15, %xmm9 + subsd %xmm9, %xmm14 + movlpd -3 * SIZE(B), %xmm10 + mulsd %xmm15, %xmm10 + subsd %xmm10, %xmm13 + movlpd -4 * SIZE(B), %xmm11 + mulsd %xmm15, %xmm11 + subsd %xmm11, %xmm12 + + mulsd -6 * SIZE(B), %xmm14 + + movlpd -7 * SIZE(B), %xmm9 + mulsd %xmm14, %xmm9 + subsd %xmm9, %xmm13 + movlpd -8 * SIZE(B), %xmm10 + mulsd %xmm14, %xmm10 + subsd %xmm10, %xmm12 + + mulsd -11 * SIZE(B), %xmm13 + + movlpd -12 * SIZE(B), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm12 + + mulsd -16 * SIZE(B), %xmm12 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm12, 0 * SIZE(CO1) + movsd %xmm13, 0 * SIZE(CO2) + movsd %xmm14, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 0 * SIZE(CO2, LDC, 2) + +#if defined(LN) || defined(LT) + movsd %xmm12, -16 * SIZE(B) + movsd %xmm13, -15 * SIZE(B) + movsd %xmm14, -14 * SIZE(B) + movsd %xmm15, -13 * SIZE(B) + + movsd %xmm12, -16 * SIZE(BO) + movsd %xmm12, -15 * SIZE(BO) + movsd %xmm13, -14 * SIZE(BO) + movsd %xmm13, -13 * SIZE(BO) + movsd %xmm14, -12 * SIZE(BO) + movsd %xmm14, -11 * SIZE(BO) + movsd %xmm15, -10 * SIZE(BO) + movsd %xmm15, -9 * SIZE(BO) +#else + movsd %xmm12, -16 * SIZE(AO) + movsd %xmm13, -15 * SIZE(AO) + movsd %xmm14, -14 * SIZE(AO) + movsd %xmm15, -13 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $3, N + je .L999 + + testq $2, N + je .L80 + ALIGN_4 + +.L41: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L43 + ALIGN_4 + +.L42: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + movapd %xmm2, 4 * SIZE(BO) + movapd %xmm3, 6 * SIZE(BO) + movapd %xmm4, 8 * SIZE(BO) + movapd %xmm5, 10 * SIZE(BO) + movapd %xmm6, 12 * SIZE(BO) + movapd %xmm7, 14 * SIZE(BO) + + addq $8 * SIZE, B + addq $16 * SIZE, BO + subq $1, %rax + jne .L42 + ALIGN_4 + +.L43: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L44: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + + addq $2 * SIZE, B + addq $4 * SIZE, BO + decq %rax + jne .L44 + ALIGN_4 + +.L50: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 2), C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + +#ifdef LN + prefetcht2 -3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht2 -3 * SIZE(CO2) + pxor %xmm13, %xmm13 +#else + prefetcht2 3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht2 3 * SIZE(CO2) + pxor %xmm13, %xmm13 +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + movapd -8 * SIZE(AO), %xmm0 + movapd -6 * SIZE(AO), %xmm1 + + movapd -8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + movapd -4 * SIZE(AO), %xmm0 + movapd -2 * SIZE(AO), %xmm1 + + movapd -4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L59 + ALIGN_4 + +.L56: + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L56 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm13 + movapd -12 * SIZE(B), %xmm1 + movapd -10 * SIZE(B), %xmm5 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 + subpd %xmm12, %xmm1 + subpd %xmm4, %xmm5 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm13 + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm9 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm9 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm9, %xmm12 + subpd %xmm12, %xmm1 + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm9, %xmm14 + subpd %xmm14, %xmm5 + + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm13, %xmm12 + subpd %xmm12, %xmm5 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(B), %xmm9 + movapd %xmm9, %xmm10 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm3 + + movddup -13 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 +#endif + +#ifdef RT + movddup -13 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -14 * SIZE(B), %xmm9 + movapd %xmm9, %xmm10 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm1 + + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movsd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm13, -14 * SIZE(B) + movapd %xmm1, -12 * SIZE(B) + movapd %xmm5, -10 * SIZE(B) + + movddup %xmm9, %xmm8 + SHUFPD_3 %xmm9, %xmm9 + movddup %xmm13, %xmm12 + SHUFPD_3 %xmm13, %xmm13 + movddup %xmm1, %xmm0 + SHUFPD_3 %xmm1, %xmm1 + movddup %xmm5, %xmm4 + SHUFPD_3 %xmm5, %xmm5 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm12, -12 * SIZE(BO) + movapd %xmm13, -10 * SIZE(BO) + movapd %xmm0, -8 * SIZE(BO) + movapd %xmm1, -6 * SIZE(BO) + movapd %xmm4, -4 * SIZE(BO) + movapd %xmm5, -2 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + ALIGN_4 + +.L61: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + movapd -8 * SIZE(BO), %xmm2 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm4 + movapd -2 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L69 + ALIGN_4 + +.L66: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L66 + ALIGN_4 + +.L69: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 +#endif + + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(B), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + + movddup -13 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 +#endif + +#ifdef RT + movddup -13 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + + movddup -14 * SIZE(B), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm13, -14 * SIZE(B) + + movddup %xmm9, %xmm8 + SHUFPD_3 %xmm9, %xmm9 + movddup %xmm13, %xmm12 + SHUFPD_3 %xmm13, %xmm13 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm12, -12 * SIZE(BO) + movapd %xmm13, -10 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm2, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + testq $1, M + je .L79 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -16 * SIZE(AO), %xmm0 + movsd -15 * SIZE(AO), %xmm1 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + movsd -12 * SIZE(BO), %xmm4 + movsd -10 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm1, %xmm4 + mulsd %xmm1, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + movsd -14 * SIZE(AO), %xmm0 + movsd -13 * SIZE(AO), %xmm1 + movsd -8 * SIZE(BO), %xmm2 + movsd -6 * SIZE(BO), %xmm3 + movsd -4 * SIZE(BO), %xmm4 + movsd -2 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm1, %xmm4 + mulsd %xmm1, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L76 + ALIGN_4 + +.L78: + addsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm12 + movsd -15 * SIZE(B), %xmm13 +#else + movsd -16 * SIZE(AO), %xmm12 + movsd -15 * SIZE(AO), %xmm13 +#endif + + subsd %xmm8, %xmm12 + subsd %xmm9, %xmm13 + +#ifdef LN + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 +#endif + +#ifdef RN + mulsd -16 * SIZE(B), %xmm12 + movsd -15 * SIZE(B), %xmm9 + mulsd %xmm12, %xmm9 + subsd %xmm9, %xmm13 + + mulsd -13 * SIZE(B), %xmm13 +#endif + +#ifdef RT + mulsd -13 * SIZE(B), %xmm13 + + movlpd -14 * SIZE(B), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm12 + + mulsd -16 * SIZE(B), %xmm12 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm12, 0 * SIZE(CO1) + movsd %xmm13, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm12, -16 * SIZE(B) + movsd %xmm13, -15 * SIZE(B) + + movsd %xmm12, -16 * SIZE(BO) + movsd %xmm12, -15 * SIZE(BO) + movsd %xmm13, -14 * SIZE(BO) + movsd %xmm13, -13 * SIZE(BO) +#else + movsd %xmm12, -16 * SIZE(AO) + movsd %xmm13, -15 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L80: + testq $1, N + je .L999 + ALIGN_4 + +.L81: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + jle .L83 + ALIGN_4 + +.L82: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + movapd %xmm2, 4 * SIZE(BO) + movapd %xmm3, 6 * SIZE(BO) + movapd %xmm4, 8 * SIZE(BO) + movapd %xmm5, 10 * SIZE(BO) + movapd %xmm6, 12 * SIZE(BO) + movapd %xmm7, 14 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-16 * SIZE, BO + subq $1, %rax + jne .L82 + ALIGN_4 + +.L83: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax + BRANCH + jle .L90 + ALIGN_4 + +.L84: + movddup -16 * SIZE(B), %xmm0 + + movapd %xmm0, 0 * SIZE(BO) + + addq $1 * SIZE, B + addq $2 * SIZE, BO + subq $1, %rax + jne .L84 + ALIGN_4 + +.L90: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 +#ifdef LN + prefetcht2 -3 * SIZE(CO1) +#else + prefetcht2 3 * SIZE(CO1) +#endif + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L95 + ALIGN_4 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + movapd -14 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 + + movapd -8 * SIZE(AO), %xmm0 + movapd -6 * SIZE(AO), %xmm1 + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + movapd -4 * SIZE(AO), %xmm0 + movapd -2 * SIZE(AO), %xmm1 + movapd -10 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L99 + ALIGN_4 + +.L96: + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L96 + ALIGN_4 + +.L99: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm10 + movapd -14 * SIZE(B), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm9, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm10 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm9, %xmm11 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + movsd -2 * SIZE(AO), %xmm13 + mulsd %xmm9, %xmm13 + subsd %xmm13, %xmm11 + movsd -3 * SIZE(AO), %xmm14 + mulsd %xmm9, %xmm14 + subsd %xmm14, %xmm8 + movsd -4 * SIZE(AO), %xmm15 + mulsd %xmm9, %xmm15 + subsd %xmm15, %xmm10 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -7 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm8 + movsd -8 * SIZE(AO), %xmm14 + mulsd %xmm11, %xmm14 + subsd %xmm14, %xmm10 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -12 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + movsd -14 * SIZE(AO), %xmm14 + mulsd %xmm10, %xmm14 + subsd %xmm14, %xmm11 + movsd -13 * SIZE(AO), %xmm15 + mulsd %xmm10, %xmm15 + subsd %xmm15, %xmm9 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -10 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm11 + movsd -9 * SIZE(AO), %xmm14 + mulsd %xmm8, %xmm14 + subsd %xmm14, %xmm9 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -5 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm10, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + + movddup %xmm10, %xmm8 + SHUFPD_3 %xmm10, %xmm10 + movddup %xmm11, %xmm9 + SHUFPD_3 %xmm11, %xmm11 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm10, -14 * SIZE(BO) + movapd %xmm9, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) +#else + movapd %xmm10, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + ALIGN_4 + +.L101: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L105 + ALIGN_4 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + movapd -12 * SIZE(BO), %xmm2 + movapd -10 * SIZE(BO), %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L109 + ALIGN_4 + +.L106: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm8 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L106 + ALIGN_4 + +.L109: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm10 + subpd %xmm8, %xmm10 +#else + movapd -16 * SIZE(AO), %xmm10 + subpd %xmm8, %xmm10 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -14 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm10, -16 * SIZE(B) + + movddup %xmm10, %xmm8 + SHUFPD_3 %xmm10, %xmm10 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm10, -14 * SIZE(BO) +#else + movapd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + testq $1, M + je .L119 + ALIGN_4 + +.L111: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L115 + ALIGN_4 + +.L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -16 * SIZE(AO), %xmm0 + movsd -15 * SIZE(AO), %xmm1 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm1, %xmm3 + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + + movsd -14 * SIZE(AO), %xmm0 + movsd -13 * SIZE(AO), %xmm1 + movsd -12 * SIZE(BO), %xmm2 + movsd -10 * SIZE(BO), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm1, %xmm3 + addsd %xmm2, %xmm10 + addsd %xmm3, %xmm11 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L116 + ALIGN_4 + +.L118: + addsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + addsd %xmm9, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm10 + subsd %xmm8, %xmm10 +#else + movsd -16 * SIZE(AO), %xmm10 + subsd %xmm8, %xmm10 +#endif + +#ifdef LN + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#ifdef RN + movsd -16 * SIZE(B), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef RT + movsd -16 * SIZE(B), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, -16 * SIZE(B) + + movlpd %xmm10, -16 * SIZE(BO) + movlpd %xmm10, -15 * SIZE(BO) +#else + movsd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $1 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L119: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + + +.L999: + movq %r15, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LT_4x4_penryn.S b/kernel/x86_64/trsm_kernel_LT_4x4_penryn.S new file mode 100644 index 0000000..77fc0c5 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LT_4x4_penryn.S @@ -0,0 +1,3424 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCHSIZE (8 * 21 + 6) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + leaq (, LDC, SIZE), LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $2, J + NOBRANCH + jle .L40 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 2, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + prefetcht2 -16 * SIZE(BB) + subq $-8 * SIZE, BB + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm3, %xmm3 + movaps -14 * SIZE(AO), %xmm1 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BO), %xmm2 + + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + +#ifdef LN + prefetcht0 -4 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + prefetcht0 -4 * SIZE(CO2) + movapd %xmm4, %xmm10 + movapd %xmm4, %xmm11 + + prefetcht0 -4 * SIZE(CO1, LDC, 2) + movapd %xmm4, %xmm12 + movapd %xmm4, %xmm13 + prefetcht0 -4 * SIZE(CO2, LDC, 2) + movapd %xmm4, %xmm14 + movapd %xmm4, %xmm15 +#else + prefetcht0 3 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + prefetcht0 3 * SIZE(CO2) + movapd %xmm4, %xmm10 + movapd %xmm4, %xmm11 + + prefetcht0 3 * SIZE(CO1, LDC, 2) + movapd %xmm4, %xmm12 + movapd %xmm4, %xmm13 + prefetcht0 3 * SIZE(CO2, LDC, 2) + movapd %xmm4, %xmm14 + movapd %xmm4, %xmm15 +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + addpd %xmm3, %xmm11 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -8 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps -6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -4 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps -2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 0 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps 2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 4 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 6 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps 6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 8 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps 10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 12 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 14 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps 14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 16 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + subq $-32 * SIZE, AO + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -14 * SIZE(AO), %xmm1 + + subq $-32 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addpd %xmm3, %xmm11 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $4, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm3, %xmm11 + addpd %xmm4, %xmm15 + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + + movapd %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + movapd %xmm10, %xmm0 + movsd %xmm11, %xmm10 + movsd %xmm0, %xmm11 + + movapd %xmm12, %xmm0 + movsd %xmm13, %xmm12 + movsd %xmm0, %xmm13 + + movapd %xmm14, %xmm0 + movsd %xmm15, %xmm14 + movsd %xmm0, %xmm15 + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd %xmm14, %xmm6 + unpcklpd %xmm15, %xmm14 + unpckhpd %xmm15, %xmm6 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + movapd -8 * SIZE(BO), %xmm1 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm5 + movapd -2 * SIZE(BO), %xmm7 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 + subpd %xmm12, %xmm1 + subpd %xmm14, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + movapd -8 * SIZE(AO), %xmm4 + movapd -6 * SIZE(AO), %xmm5 + movapd -4 * SIZE(AO), %xmm6 + movapd -2 * SIZE(AO), %xmm7 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 + subpd %xmm10, %xmm4 + subpd %xmm14, %xmm5 + subpd %xmm11, %xmm6 + subpd %xmm15, %xmm7 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + + movddup -3 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm15 + + movddup -4 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm11 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -7 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm15 + + movddup -8 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm11 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -12 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm3 + + movddup -13 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm7 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -10 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm3 + + movddup -9 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -5 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm7 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 +#endif + + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm3 + + movddup -14 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm5 + + movddup -13 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm7 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -10 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm5 + + movddup -9 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -5 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm7 + + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm5 + + movddup -3 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + + movddup -4 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm1 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -7 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm3 + + movddup -8 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm1 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -12 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm1 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movsd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) + + movsd %xmm11, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 1 * SIZE(CO1, LDC, 2) + movsd %xmm3, 2 * SIZE(CO1, LDC, 2) + movsd %xmm7, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) + + movsd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + movsd %xmm5, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) + movsd %xmm7, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) + movapd %xmm13, -12 * SIZE(BO) + movapd %xmm15, -10 * SIZE(BO) + movapd %xmm1, -8 * SIZE(BO) + movapd %xmm3, -6 * SIZE(BO) + movapd %xmm5, -4 * SIZE(BO) + movapd %xmm7, -2 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) + movapd %xmm4, -8 * SIZE(AO) + movapd %xmm5, -6 * SIZE(AO) + movapd %xmm6, -4 * SIZE(AO) + movapd %xmm7, -2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + jle .L30 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm3, %xmm3 + movaps -16 * SIZE(BO), %xmm2 + pxor %xmm5, %xmm5 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_4 + +.L22: + addpd %xmm3, %xmm11 + movaps -14 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -10 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -8 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -6 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -4 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -2 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + subq $ -8 * SIZE, AO + + addpd %xmm2, %xmm9 + movaps 0 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + addpd %xmm3, %xmm11 + movaps -14 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm3, %xmm11 + addpd %xmm5, %xmm10 + + movapd %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + movapd %xmm10, %xmm0 + movsd %xmm11, %xmm10 + movsd %xmm0, %xmm11 + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + movapd -12 * SIZE(AO), %xmm4 + movapd -10 * SIZE(AO), %xmm6 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 + subpd %xmm10, %xmm4 + subpd %xmm11, %xmm6 +#endif + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm15 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm6 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + movddup -10 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm4 + movddup -9 * SIZE(BO), %xmm10 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + + movddup -5 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm6 + + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 +#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + + movddup -2 * SIZE(BO), %xmm9 + mulpd %xmm6, %xmm9 + subpd %xmm9, %xmm4 + movddup -3 * SIZE(BO), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + movddup -4 * SIZE(BO), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm0 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm2 + movddup -8 * SIZE(BO), %xmm10 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + movddup -12 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + + movsd %xmm11, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 1 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + + movsd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) + movapd %xmm13, -12 * SIZE(BO) + movapd %xmm15, -10 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm2, -14 * SIZE(AO) + movapd %xmm4, -12 * SIZE(AO) + movapd %xmm6, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + BRANCH + jle .L39 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movaps -16 * SIZE(BO), %xmm2 + movaps -14 * SIZE(BO), %xmm3 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -15 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -10 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm10 + movaps -8 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm11 + movaps -6 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -13 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -4 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -2 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -12 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm10 + movaps 0 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm11 + movaps 2 * SIZE(BO), %xmm3 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -15 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -10 * SIZE(BO), %xmm3 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(LT) + movaps -16 * SIZE(BO), %xmm12 + movaps -14 * SIZE(BO), %xmm13 +#else + movaps -16 * SIZE(AO), %xmm12 + movaps -14 * SIZE(AO), %xmm13 +#endif + + subpd %xmm8, %xmm12 + subpd %xmm9, %xmm13 + +#if defined(RN) || defined(RT) + movhlps %xmm13, %xmm15 + movsd %xmm13, %xmm14 + movhlps %xmm12, %xmm13 + movsd %xmm12, %xmm12 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm12 + mulpd %xmm8, %xmm13 +#endif + +#ifdef RN + mulsd -16 * SIZE(BO), %xmm12 + movlpd -15 * SIZE(BO), %xmm9 + mulsd %xmm12, %xmm9 + subsd %xmm9, %xmm13 + movlpd -14 * SIZE(BO), %xmm10 + mulsd %xmm12, %xmm10 + subsd %xmm10, %xmm14 + movlpd -13 * SIZE(BO), %xmm11 + mulsd %xmm12, %xmm11 + subsd %xmm11, %xmm15 + + mulsd -11 * SIZE(BO), %xmm13 + movlpd -10 * SIZE(BO), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm14 + movlpd -9 * SIZE(BO), %xmm10 + mulsd %xmm13, %xmm10 + subsd %xmm10, %xmm15 + + mulsd -6 * SIZE(BO), %xmm14 + movlpd -5 * SIZE(BO), %xmm9 + mulsd %xmm14, %xmm9 + subsd %xmm9, %xmm15 + + mulsd -1 * SIZE(BO), %xmm15 +#endif + +#ifdef RT + mulsd -1 * SIZE(BO), %xmm15 + + movlpd -2 * SIZE(BO), %xmm9 + mulsd %xmm15, %xmm9 + subsd %xmm9, %xmm14 + movlpd -3 * SIZE(BO), %xmm10 + mulsd %xmm15, %xmm10 + subsd %xmm10, %xmm13 + movlpd -4 * SIZE(BO), %xmm11 + mulsd %xmm15, %xmm11 + subsd %xmm11, %xmm12 + + mulsd -6 * SIZE(BO), %xmm14 + + movlpd -7 * SIZE(BO), %xmm9 + mulsd %xmm14, %xmm9 + subsd %xmm9, %xmm13 + movlpd -8 * SIZE(BO), %xmm10 + mulsd %xmm14, %xmm10 + subsd %xmm10, %xmm12 + + mulsd -11 * SIZE(BO), %xmm13 + + movlpd -12 * SIZE(BO), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm12 + + mulsd -16 * SIZE(BO), %xmm12 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm12, 0 * SIZE(CO1) + movhps %xmm12, 0 * SIZE(CO2) + movsd %xmm13, 0 * SIZE(CO1, LDC, 2) + movhps %xmm13, 0 * SIZE(CO2, LDC, 2) + + movaps %xmm12, -16 * SIZE(BO) + movaps %xmm13, -14 * SIZE(BO) +#else + movsd %xmm12, 0 * SIZE(CO1) + movsd %xmm13, 0 * SIZE(CO2) + movsd %xmm14, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 0 * SIZE(CO2, LDC, 2) + + movsd %xmm12, -16 * SIZE(AO) + movsd %xmm13, -15 * SIZE(AO) + movsd %xmm14, -14 * SIZE(AO) + movsd %xmm15, -13 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L40: + testq $2, N + BRANCH + jle .L80 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + prefetcht2 -16 * SIZE(BB) + subq $-4 * SIZE, BB + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -16 * SIZE(BO), %xmm2 + +#ifdef LN + prefetcht0 -4 * SIZE(CO1) + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + prefetcht0 -4 * SIZE(CO2) + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 +#else + prefetcht0 3 * SIZE(CO1) + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + prefetcht0 3 * SIZE(CO2) + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_4 + +.L52: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -14 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -10 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -8 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -14 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + movapd %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + movapd %xmm12, %xmm0 + movsd %xmm13, %xmm12 + movsd %xmm0, %xmm13 + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + movapd -12 * SIZE(BO), %xmm1 + movapd -10 * SIZE(BO), %xmm5 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 + subpd %xmm12, %xmm1 + subpd %xmm4, %xmm5 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm13 + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm9 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm9 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm9, %xmm12 + subpd %xmm12, %xmm1 + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm9, %xmm14 + subpd %xmm14, %xmm5 + + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm13, %xmm12 + subpd %xmm12, %xmm5 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(BO), %xmm9 + movapd %xmm9, %xmm10 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm3 + + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -14 * SIZE(BO), %xmm9 + movapd %xmm9, %xmm10 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm1 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movsd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm13, -14 * SIZE(BO) + movapd %xmm1, -12 * SIZE(BO) + movapd %xmm5, -10 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + BRANCH + jle .L70 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + movaps -16 * SIZE(BO), %xmm2 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm11 + addpd %xmm7, %xmm10 + movaps -12 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -10 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm11 + addpd %xmm7, %xmm10 + movaps -8 * SIZE(BO), %xmm2 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + movapd %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 +#endif + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + + movddup -14 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm13, -14 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm2, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + testq $1, M + BRANCH + jle .L79 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movaps -16 * SIZE(BO), %xmm2 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -15 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -14 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -13 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -10 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -12 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm9 + movaps -8 * SIZE(BO), %xmm2 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -15 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_4 + +.L78: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm9, %xmm8 + movhlps %xmm8, %xmm9 + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BO), %xmm12 + movsd -15 * SIZE(BO), %xmm13 +#else + movsd -16 * SIZE(AO), %xmm12 + movsd -15 * SIZE(AO), %xmm13 +#endif + + subsd %xmm8, %xmm12 + subsd %xmm9, %xmm13 + +#ifdef LN + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 +#endif + +#ifdef RN + mulsd -16 * SIZE(BO), %xmm12 + movsd -15 * SIZE(BO), %xmm9 + mulsd %xmm12, %xmm9 + subsd %xmm9, %xmm13 + + mulsd -13 * SIZE(BO), %xmm13 +#endif + +#ifdef RT + mulsd -13 * SIZE(BO), %xmm13 + + movlpd -14 * SIZE(BO), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm12 + + mulsd -16 * SIZE(BO), %xmm12 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm12, 0 * SIZE(CO1) + movsd %xmm13, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm12, -16 * SIZE(BO) + movsd %xmm13, -15 * SIZE(BO) +#else + movsd %xmm12, -16 * SIZE(AO) + movsd %xmm13, -15 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L80: + testq $1, N + BRANCH + jle .L999 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L100 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movsd -16 * SIZE(BO), %xmm2 + +#ifdef LN + prefetcht0 -4 * SIZE(CO1) +#else + prefetcht0 3 * SIZE(CO1) +#endif + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L95 + ALIGN_4 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -14 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -13 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -12 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps 2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + subq $-16 * SIZE, AO + subq $ -4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L96 + ALIGN_4 + +.L98: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm10 + movapd -14 * SIZE(BO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm12, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm10 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm12, %xmm11 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + movsd -2 * SIZE(AO), %xmm13 + mulsd %xmm9, %xmm13 + subsd %xmm13, %xmm11 + movsd -3 * SIZE(AO), %xmm14 + mulsd %xmm9, %xmm14 + subsd %xmm14, %xmm8 + movsd -4 * SIZE(AO), %xmm15 + mulsd %xmm9, %xmm15 + subsd %xmm15, %xmm10 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -7 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm8 + movsd -8 * SIZE(AO), %xmm14 + mulsd %xmm11, %xmm14 + subsd %xmm14, %xmm10 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -12 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + movsd -14 * SIZE(AO), %xmm14 + mulsd %xmm10, %xmm14 + subsd %xmm14, %xmm11 + movsd -13 * SIZE(AO), %xmm15 + mulsd %xmm10, %xmm15 + subsd %xmm15, %xmm9 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -10 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm11 + movsd -9 * SIZE(AO), %xmm14 + mulsd %xmm8, %xmm14 + subsd %xmm14, %xmm9 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -5 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm10, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) +#else + movapd %xmm10, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + BRANCH + jle .L110 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movsd -16 * SIZE(BO), %xmm2 + pxor %xmm9, %xmm9 + movhps -15 * SIZE(BO), %xmm2 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L105 + ALIGN_4 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -14 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + pshufd $0x44, %xmm2, %xmm3 + movsd -14 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm9 + + pshufd $0x44, %xmm2, %xmm3 + movsd -13 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -10 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + pshufd $0x44, %xmm2, %xmm3 + movsd -12 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -8 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm9 + + subq $-8 * SIZE, AO + subq $-4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + pshufd $0x44, %xmm2, %xmm3 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -14 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L106 + ALIGN_4 + +.L108: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + addpd %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm10 + subpd %xmm8, %xmm10 +#else + movapd -16 * SIZE(AO), %xmm10 + subpd %xmm8, %xmm10 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -14 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm10, -16 * SIZE(BO) +#else + movapd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + testq $1, M + BRANCH + jle .L119 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L115 + ALIGN_4 + +.L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -15 * SIZE(AO), %xmm0 + movsd -15 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -14 * SIZE(AO), %xmm0 + movsd -14 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -13 * SIZE(AO), %xmm0 + movsd -13 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -12 * SIZE(AO), %xmm0 + movsd -12 * SIZE(BO), %xmm2 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -15 * SIZE(AO), %xmm0 + movsd -15 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L116 + ALIGN_4 + +.L118: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + addpd %xmm9, %xmm8 + + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BO), %xmm10 + subsd %xmm8, %xmm10 +#else + movsd -16 * SIZE(AO), %xmm10 + subsd %xmm8, %xmm10 +#endif + +#ifdef LN + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#ifdef RN + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef RT + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, -16 * SIZE(BO) +#else + movsd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L119: +#ifdef LN + leaq (B, K, SIZE), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LT_4x4_sse2.S b/kernel/x86_64/trsm_kernel_LT_4x4_sse2.S new file mode 100644 index 0000000..d50c8d5 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LT_4x4_sse2.S @@ -0,0 +1,4169 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define OFFSET 16(%rsp) +#define KK 24(%rsp) +#define KKK 32(%rsp) +#define AORIG 40(%rsp) +#define BORIG 48(%rsp) +#define BUFFER 128(%rsp) + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHNTA prefetchnta +#ifndef ALLOC_HUGETLB +#define PREFETCHSIZE (8 * 4 + 4) +#else +#define PREFETCHSIZE (8 * 2 + 4) +#endif +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 4 + 4) +#endif + +#ifdef OPTERON +#define movsd movlpd +#endif + +#define KERNEL1(xx) \ + mulpd %xmm8, %xmm9 ;\ + addpd %xmm9, %xmm0 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm11, %xmm1 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm8, %xmm13 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm8, %xmm3 ;\ + movapd 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL2(xx) \ + mulpd %xmm10, %xmm9 ;\ + addpd %xmm9, %xmm4 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm10, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm10, %xmm13 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm10, %xmm7 ;\ + movapd 10 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL3(xx) \ + mulpd %xmm12, %xmm15 ;\ + addpd %xmm15, %xmm0 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm12, %xmm11 ;\ + addpd %xmm11, %xmm1 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm12, %xmm13 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm12, %xmm3 ;\ + movapd 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL4(xx) \ + mulpd %xmm14, %xmm15 ;\ + addpd %xmm15, %xmm4 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm14, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm14, %xmm13 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm14, %xmm7 ;\ + movapd 14 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + +#define KERNEL5(xx) \ + mulpd %xmm8, %xmm9 ;\ + addpd %xmm9, %xmm0 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm11, %xmm1 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm8, %xmm13 ;\ + mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm8, %xmm3 ;\ + movapd 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL6(xx) \ + mulpd %xmm10, %xmm9 ;\ + addpd %xmm9, %xmm4 ;\ + movapd 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm10, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm10, %xmm13 ;\ + mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm10, %xmm7 ;\ + movapd 18 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL7(xx) \ + mulpd %xmm12, %xmm15 ;\ + addpd %xmm15, %xmm0 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm12, %xmm11 ;\ + addpd %xmm11, %xmm1 ;\ + movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm12, %xmm13 ;\ + mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm12, %xmm3 ;\ + movapd 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL8(xx) \ + mulpd %xmm14, %xmm15 ;\ + addpd %xmm15, %xmm4 ;\ + movapd 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm14, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 34 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm14, %xmm13 ;\ + mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm14, %xmm7 ;\ + movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movsd %xmm4, OFFSET + movsd %xmm4, KK + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + +.L01: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + + addq %rax, %rax + ALIGN_4 + +.L02: + PREFETCHNTA 40 * SIZE(B) + + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + addq $16 * SIZE, BO + addq $ 8 * SIZE, B + + movsd %xmm0, -16 * SIZE(BO) + movsd %xmm0, -15 * SIZE(BO) + movsd %xmm1, -14 * SIZE(BO) + movsd %xmm1, -13 * SIZE(BO) + movsd %xmm2, -12 * SIZE(BO) + movsd %xmm2, -11 * SIZE(BO) + movsd %xmm3, -10 * SIZE(BO) + movsd %xmm3, -9 * SIZE(BO) + movsd %xmm4, -8 * SIZE(BO) + movsd %xmm4, -7 * SIZE(BO) + movsd %xmm5, -6 * SIZE(BO) + movsd %xmm5, -5 * SIZE(BO) + movsd %xmm6, -4 * SIZE(BO) + movsd %xmm6, -3 * SIZE(BO) + movsd %xmm7, -2 * SIZE(BO) + movsd %xmm7, -1 * SIZE(BO) + + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm0, 1 * SIZE(BO) + movsd %xmm1, 2 * SIZE(BO) + movsd %xmm1, 3 * SIZE(BO) + movsd %xmm2, 4 * SIZE(BO) + movsd %xmm2, 5 * SIZE(BO) + movsd %xmm3, 6 * SIZE(BO) + movsd %xmm3, 7 * SIZE(BO) + + addq $4 * SIZE, B + addq $8 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 4), C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(BO), %xmm9 + movapd 2 * SIZE(BO), %xmm11 + movapd 4 * SIZE(BO), %xmm13 + movapd 8 * SIZE(BO), %xmm15 + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 2 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movapd 4 * SIZE(AO), %xmm12 + pxor %xmm2, %xmm2 + movapd 6 * SIZE(AO), %xmm14 + pxor %xmm3, %xmm3 + + PREFETCHW 4 * SIZE(CO1) + pxor %xmm4, %xmm4 + PREFETCHW 4 * SIZE(CO2) + pxor %xmm5, %xmm5 + PREFETCHW 4 * SIZE(CO1, LDC, 2) + pxor %xmm6, %xmm6 + PREFETCHW 4 * SIZE(CO2, LDC, 2) + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + +#if 1 + andq $-8, %rax + salq $4, %rax + je .L15 +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpq $64 * 2, %rax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpq $64 * 4, %rax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpq $64 * 6, %rax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) + + addq $16 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $64 * 8, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 4), BO # * 64 + ALIGN_4 + +#else + sarq $3, %rax + je .L15 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jg .L12 + ALIGN_4 +#endif + + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L19 + ALIGN_4 + +.L16: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 0 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 4 * SIZE(AO), %xmm8 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm5 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + mulpd 6 * SIZE(BO), %xmm10 + addpd %xmm9, %xmm6 + movapd 8 * SIZE(BO), %xmm9 + addpd %xmm10, %xmm7 + movapd 6 * SIZE(AO), %xmm10 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L16 + ALIGN_4 + +.L19: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm10 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm10 + + movapd %xmm4, %xmm12 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm12 + + movapd %xmm6, %xmm14 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm14 + + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm3 + movapd 4 * SIZE(B), %xmm5 + movapd 6 * SIZE(B), %xmm7 + movapd 8 * SIZE(B), %xmm9 + movapd 10 * SIZE(B), %xmm11 + movapd 12 * SIZE(B), %xmm13 + movapd 14 * SIZE(B), %xmm15 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm8, %xmm5 + subpd %xmm10, %xmm7 + subpd %xmm4, %xmm9 + subpd %xmm6, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 + + movapd 8 * SIZE(AO), %xmm12 + movapd 10 * SIZE(AO), %xmm13 + movapd 12 * SIZE(AO), %xmm14 + movapd 14 * SIZE(AO), %xmm15 + + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm1, %xmm10 + subpd %xmm5, %xmm11 + subpd %xmm2, %xmm12 + subpd %xmm6, %xmm13 + subpd %xmm3, %xmm14 + subpd %xmm7, %xmm15 +#endif + +#ifdef LN + movlpd 15 * SIZE(AO), %xmm0 + movhpd 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm15 + + movlpd 14 * SIZE(AO), %xmm2 + movhpd 14 * SIZE(AO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + movlpd 14 * SIZE(AO), %xmm2 + movhpd 14 * SIZE(AO), %xmm2 + mulpd %xmm15, %xmm2 + subpd %xmm2, %xmm11 + + movlpd 13 * SIZE(AO), %xmm4 + movhpd 13 * SIZE(AO), %xmm4 + mulpd %xmm13, %xmm4 + subpd %xmm4, %xmm5 + movlpd 13 * SIZE(AO), %xmm4 + movhpd 13 * SIZE(AO), %xmm4 + mulpd %xmm15, %xmm4 + subpd %xmm4, %xmm7 + + movlpd 12 * SIZE(AO), %xmm6 + movhpd 12 * SIZE(AO), %xmm6 + mulpd %xmm13, %xmm6 + subpd %xmm6, %xmm1 + movlpd 12 * SIZE(AO), %xmm6 + movhpd 12 * SIZE(AO), %xmm6 + mulpd %xmm15, %xmm6 + subpd %xmm6, %xmm3 + + movlpd 10 * SIZE(AO), %xmm0 + movhpd 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + mulpd %xmm0, %xmm11 + + movlpd 9 * SIZE(AO), %xmm2 + movhpd 9 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm5 + movlpd 9 * SIZE(AO), %xmm2 + movhpd 9 * SIZE(AO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm7 + + movlpd 8 * SIZE(AO), %xmm4 + movhpd 8 * SIZE(AO), %xmm4 + mulpd %xmm9, %xmm4 + subpd %xmm4, %xmm1 + movlpd 8 * SIZE(AO), %xmm4 + movhpd 8 * SIZE(AO), %xmm4 + mulpd %xmm11, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 5 * SIZE(AO), %xmm0 + movhpd 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movlpd 4 * SIZE(AO), %xmm2 + movhpd 4 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + movlpd 4 * SIZE(AO), %xmm2 + movhpd 4 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm3 + + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 + + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm3, %xmm2 + subpd %xmm2, %xmm7 + + movlpd 2 * SIZE(AO), %xmm4 + movhpd 2 * SIZE(AO), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm9 + movlpd 2 * SIZE(AO), %xmm4 + movhpd 2 * SIZE(AO), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm11 + + movlpd 3 * SIZE(AO), %xmm6 + movhpd 3 * SIZE(AO), %xmm6 + mulpd %xmm1, %xmm6 + subpd %xmm6, %xmm13 + movlpd 3 * SIZE(AO), %xmm6 + movhpd 3 * SIZE(AO), %xmm6 + mulpd %xmm3, %xmm6 + subpd %xmm6, %xmm15 + + movlpd 5 * SIZE(AO), %xmm0 + movhpd 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movlpd 6 * SIZE(AO), %xmm2 + movhpd 6 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm9 + movlpd 6 * SIZE(AO), %xmm2 + movhpd 6 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm11 + + movlpd 7 * SIZE(AO), %xmm4 + movhpd 7 * SIZE(AO), %xmm4 + mulpd %xmm5, %xmm4 + subpd %xmm4, %xmm13 + movlpd 7 * SIZE(AO), %xmm4 + movhpd 7 * SIZE(AO), %xmm4 + mulpd %xmm7, %xmm4 + subpd %xmm4, %xmm15 + + movlpd 10 * SIZE(AO), %xmm0 + movhpd 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + mulpd %xmm0, %xmm11 + + movlpd 11 * SIZE(AO), %xmm2 + movhpd 11 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + movlpd 11 * SIZE(AO), %xmm2 + movhpd 11 * SIZE(AO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm15 + + movlpd 15 * SIZE(AO), %xmm0 + movhpd 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm15 +#endif + + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm9, %xmm1 + subpd %xmm1, %xmm11 + + movlpd 2 * SIZE(B), %xmm2 + movhpd 2 * SIZE(B), %xmm2 + mulpd %xmm8, %xmm2 + subpd %xmm2, %xmm12 + movlpd 2 * SIZE(B), %xmm2 + movhpd 2 * SIZE(B), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + + movlpd 3 * SIZE(B), %xmm3 + movhpd 3 * SIZE(B), %xmm3 + mulpd %xmm8, %xmm3 + subpd %xmm3, %xmm14 + movlpd 3 * SIZE(B), %xmm3 + movhpd 3 * SIZE(B), %xmm3 + mulpd %xmm9, %xmm3 + subpd %xmm3, %xmm15 + + movlpd 5 * SIZE(B), %xmm0 + movhpd 5 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movlpd 6 * SIZE(B), %xmm1 + movhpd 6 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm12 + movlpd 6 * SIZE(B), %xmm1 + movhpd 6 * SIZE(B), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm13 + + movlpd 7 * SIZE(B), %xmm2 + movhpd 7 * SIZE(B), %xmm2 + mulpd %xmm10, %xmm2 + subpd %xmm2, %xmm14 + movlpd 7 * SIZE(B), %xmm2 + movhpd 7 * SIZE(B), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm15 + + movlpd 10 * SIZE(B), %xmm0 + movhpd 10 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + + movlpd 11 * SIZE(B), %xmm1 + movhpd 11 * SIZE(B), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm14 + movlpd 11 * SIZE(B), %xmm1 + movhpd 11 * SIZE(B), %xmm1 + mulpd %xmm13, %xmm1 + subpd %xmm1, %xmm15 + + movlpd 15 * SIZE(B), %xmm0 + movhpd 15 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 +#endif + +#ifdef RT + movlpd 15 * SIZE(B), %xmm0 + movhpd 15 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 + + movlpd 14 * SIZE(B), %xmm1 + movhpd 14 * SIZE(B), %xmm1 + mulpd %xmm14, %xmm1 + subpd %xmm1, %xmm12 + movlpd 14 * SIZE(B), %xmm1 + movhpd 14 * SIZE(B), %xmm1 + mulpd %xmm15, %xmm1 + subpd %xmm1, %xmm13 + + movlpd 13 * SIZE(B), %xmm2 + movhpd 13 * SIZE(B), %xmm2 + mulpd %xmm14, %xmm2 + subpd %xmm2, %xmm10 + movlpd 13 * SIZE(B), %xmm2 + movhpd 13 * SIZE(B), %xmm2 + mulpd %xmm15, %xmm2 + subpd %xmm2, %xmm11 + + movlpd 12 * SIZE(B), %xmm3 + movhpd 12 * SIZE(B), %xmm3 + mulpd %xmm14, %xmm3 + subpd %xmm3, %xmm8 + movlpd 12 * SIZE(B), %xmm3 + movhpd 12 * SIZE(B), %xmm3 + mulpd %xmm15, %xmm3 + subpd %xmm3, %xmm9 + + movlpd 10 * SIZE(B), %xmm0 + movhpd 10 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + + movlpd 9 * SIZE(B), %xmm1 + movhpd 9 * SIZE(B), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm10 + movlpd 9 * SIZE(B), %xmm1 + movhpd 9 * SIZE(B), %xmm1 + mulpd %xmm13, %xmm1 + subpd %xmm1, %xmm11 + + movlpd 8 * SIZE(B), %xmm2 + movhpd 8 * SIZE(B), %xmm2 + mulpd %xmm12, %xmm2 + subpd %xmm2, %xmm8 + movlpd 8 * SIZE(B), %xmm2 + movhpd 8 * SIZE(B), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + + movlpd 5 * SIZE(B), %xmm0 + movhpd 5 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movlpd 4 * SIZE(B), %xmm1 + movhpd 4 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + movlpd 4 * SIZE(B), %xmm1 + movhpd 4 * SIZE(B), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm9 + + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movsd %xmm13, 3 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + movhpd %xmm9, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) + + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movsd %xmm11, 2 * SIZE(CO1, LDC, 2) + movsd %xmm15, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm11, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) + + movsd %xmm12, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) + movsd %xmm13, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm13, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm14, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) + movsd %xmm15, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + movapd %xmm5, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + movapd %xmm9, 8 * SIZE(B) + movapd %xmm11, 10 * SIZE(B) + movapd %xmm13, 12 * SIZE(B) + movapd %xmm15, 14 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) + movlpd %xmm5, 8 * SIZE(BO) + movlpd %xmm5, 9 * SIZE(BO) + movhpd %xmm5, 10 * SIZE(BO) + movhpd %xmm5, 11 * SIZE(BO) + movlpd %xmm7, 12 * SIZE(BO) + movlpd %xmm7, 13 * SIZE(BO) + movhpd %xmm7, 14 * SIZE(BO) + movhpd %xmm7, 15 * SIZE(BO) + movlpd %xmm9, 16 * SIZE(BO) + movlpd %xmm9, 17 * SIZE(BO) + movhpd %xmm9, 18 * SIZE(BO) + movhpd %xmm9, 19 * SIZE(BO) + movlpd %xmm11, 20 * SIZE(BO) + movlpd %xmm11, 21 * SIZE(BO) + movhpd %xmm11, 22 * SIZE(BO) + movhpd %xmm11, 23 * SIZE(BO) + movlpd %xmm13, 24 * SIZE(BO) + movlpd %xmm13, 25 * SIZE(BO) + movhpd %xmm13, 26 * SIZE(BO) + movhpd %xmm13, 27 * SIZE(BO) + movlpd %xmm15, 28 * SIZE(BO) + movlpd %xmm15, 29 * SIZE(BO) + movhpd %xmm15, 30 * SIZE(BO) + movhpd %xmm15, 31 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) + movapd %xmm12, 8 * SIZE(AO) + movapd %xmm13, 10 * SIZE(AO) + movapd %xmm14, 12 * SIZE(AO) + movapd %xmm15, 14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $3, M + je .L39 + + testq $2, M + je .L30 + ALIGN_4 + +.L21: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(BO), %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 32 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 2 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movapd 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm1 + movapd 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm11, %xmm2 + movapd 40 * SIZE(BO), %xmm11 + addpd %xmm8, %xmm3 + movapd 4 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm13 + addpd %xmm13, %xmm0 + movapd 18 * SIZE(BO), %xmm13 + mulpd %xmm8, %xmm13 + addpd %xmm13, %xmm1 + movapd 20 * SIZE(BO), %xmm13 + mulpd %xmm8, %xmm13 + mulpd 22 * SIZE(BO), %xmm8 + addpd %xmm13, %xmm2 + movapd 48 * SIZE(BO), %xmm13 + addpd %xmm8, %xmm3 + movapd 6 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm15 + addpd %xmm15, %xmm0 + movapd 26 * SIZE(BO), %xmm15 + mulpd %xmm8, %xmm15 + addpd %xmm15, %xmm1 + movapd 28 * SIZE(BO), %xmm15 + mulpd %xmm8, %xmm15 + mulpd 30 * SIZE(BO), %xmm8 + addpd %xmm15, %xmm2 + movapd 56 * SIZE(BO), %xmm15 + addpd %xmm8, %xmm3 + movapd 16 * SIZE(AO), %xmm8 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movapd 34 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm1 + movapd 36 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + mulpd 38 * SIZE(BO), %xmm10 + addpd %xmm9, %xmm2 + movapd 64 * SIZE(BO), %xmm9 + addpd %xmm10, %xmm3 + movapd 10 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movapd 42 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movapd 44 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + mulpd 46 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm2 + movapd 72 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm3 + movapd 12 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm13 + addpd %xmm13, %xmm0 + movapd 50 * SIZE(BO), %xmm13 + mulpd %xmm10, %xmm13 + addpd %xmm13, %xmm1 + movapd 52 * SIZE(BO), %xmm13 + mulpd %xmm10, %xmm13 + mulpd 54 * SIZE(BO), %xmm10 + addpd %xmm13, %xmm2 + movapd 80 * SIZE(BO), %xmm13 + addpd %xmm10, %xmm3 + movapd 14 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm15 + addpd %xmm15, %xmm0 + movapd 58 * SIZE(BO), %xmm15 + mulpd %xmm10, %xmm15 + addpd %xmm15, %xmm1 + movapd 60 * SIZE(BO), %xmm15 + mulpd %xmm10, %xmm15 + mulpd 62 * SIZE(BO), %xmm10 + addpd %xmm15, %xmm2 + movapd 88 * SIZE(BO), %xmm15 + addpd %xmm10, %xmm3 + movapd 24 * SIZE(AO), %xmm10 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L29 + ALIGN_4 + +.L26: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 8 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 2 * SIZE(AO), %xmm8 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L29: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm10 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm10 + + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm3 + movapd 4 * SIZE(B), %xmm5 + movapd 6 * SIZE(B), %xmm7 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm8, %xmm5 + subpd %xmm10, %xmm7 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm10 + movapd 4 * SIZE(AO), %xmm12 + movapd 6 * SIZE(AO), %xmm14 + + subpd %xmm0, %xmm8 + subpd %xmm1, %xmm10 + subpd %xmm2, %xmm12 + subpd %xmm3, %xmm14 +#endif + +#ifdef LN + movlpd 3 * SIZE(AO), %xmm0 + movhpd 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movlpd 2 * SIZE(AO), %xmm2 + movhpd 2 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + movlpd 2 * SIZE(AO), %xmm2 + movhpd 2 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm3 + + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 + + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm3, %xmm2 + subpd %xmm2, %xmm7 + + movlpd 3 * SIZE(AO), %xmm0 + movhpd 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movlpd 2 * SIZE(B), %xmm2 + movhpd 2 * SIZE(B), %xmm2 + mulpd %xmm8, %xmm2 + subpd %xmm2, %xmm12 + movlpd 3 * SIZE(B), %xmm3 + movhpd 3 * SIZE(B), %xmm3 + mulpd %xmm8, %xmm3 + subpd %xmm3, %xmm14 + + movlpd 5 * SIZE(B), %xmm0 + movhpd 5 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + movlpd 6 * SIZE(B), %xmm1 + movhpd 6 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm12 + movlpd 7 * SIZE(B), %xmm2 + movhpd 7 * SIZE(B), %xmm2 + mulpd %xmm10, %xmm2 + subpd %xmm2, %xmm14 + + movlpd 10 * SIZE(B), %xmm0 + movhpd 10 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm12 + + movlpd 11 * SIZE(B), %xmm1 + movhpd 11 * SIZE(B), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm14 + + movlpd 15 * SIZE(B), %xmm0 + movhpd 15 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm14 +#endif + +#ifdef RT + movlpd 15 * SIZE(B), %xmm0 + movhpd 15 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm14 + + movlpd 14 * SIZE(B), %xmm1 + movhpd 14 * SIZE(B), %xmm1 + mulpd %xmm14, %xmm1 + subpd %xmm1, %xmm12 + movlpd 13 * SIZE(B), %xmm2 + movhpd 13 * SIZE(B), %xmm2 + mulpd %xmm14, %xmm2 + subpd %xmm2, %xmm10 + movlpd 12 * SIZE(B), %xmm3 + movhpd 12 * SIZE(B), %xmm3 + mulpd %xmm14, %xmm3 + subpd %xmm3, %xmm8 + + movlpd 10 * SIZE(B), %xmm0 + movhpd 10 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm12 + movlpd 9 * SIZE(B), %xmm1 + movhpd 9 * SIZE(B), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm10 + movlpd 8 * SIZE(B), %xmm2 + movhpd 8 * SIZE(B), %xmm2 + mulpd %xmm12, %xmm2 + subpd %xmm2, %xmm8 + + movlpd 5 * SIZE(B), %xmm0 + movhpd 5 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + movlpd 4 * SIZE(B), %xmm1 + movhpd 4 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + + movsd %xmm12, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) + + movsd %xmm14, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + movapd %xmm5, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) + movlpd %xmm5, 8 * SIZE(BO) + movlpd %xmm5, 9 * SIZE(BO) + movhpd %xmm5, 10 * SIZE(BO) + movhpd %xmm5, 11 * SIZE(BO) + movlpd %xmm7, 12 * SIZE(BO) + movlpd %xmm7, 13 * SIZE(BO) + movhpd %xmm7, 14 * SIZE(BO) + movhpd %xmm7, 15 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm10, 2 * SIZE(AO) + movapd %xmm12, 4 * SIZE(AO) + movapd %xmm14, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + je .L39 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movsd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movsd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movsd 16 * SIZE(BO), %xmm13 + movsd 24 * SIZE(BO), %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm1 + movsd 4 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + mulsd 6 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm2 + movsd 32 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm3 + movsd 1 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm11 + addsd %xmm11, %xmm0 + movsd 10 * SIZE(BO), %xmm11 + mulsd %xmm8, %xmm11 + addsd %xmm11, %xmm1 + movsd 12 * SIZE(BO), %xmm11 + mulsd %xmm8, %xmm11 + mulsd 14 * SIZE(BO), %xmm8 + addsd %xmm11, %xmm2 + movsd 40 * SIZE(BO), %xmm11 + addsd %xmm8, %xmm3 + movsd 2 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm13 + addsd %xmm13, %xmm0 + movsd 18 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm13 + addsd %xmm13, %xmm1 + movsd 20 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm13 + mulsd 22 * SIZE(BO), %xmm8 + addsd %xmm13, %xmm2 + movsd 48 * SIZE(BO), %xmm13 + addsd %xmm8, %xmm3 + movsd 3 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm15 + addsd %xmm15, %xmm0 + movsd 26 * SIZE(BO), %xmm15 + mulsd %xmm8, %xmm15 + addsd %xmm15, %xmm1 + movsd 28 * SIZE(BO), %xmm15 + mulsd %xmm8, %xmm15 + mulsd 30 * SIZE(BO), %xmm8 + addsd %xmm15, %xmm2 + movsd 56 * SIZE(BO), %xmm15 + addsd %xmm8, %xmm3 + movsd 4 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm0 + movsd 34 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm1 + movsd 36 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + mulsd 38 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm2 + movsd 64 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm3 + movsd 5 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm11 + addsd %xmm11, %xmm0 + movsd 42 * SIZE(BO), %xmm11 + mulsd %xmm8, %xmm11 + addsd %xmm11, %xmm1 + movsd 44 * SIZE(BO), %xmm11 + mulsd %xmm8, %xmm11 + mulsd 46 * SIZE(BO), %xmm8 + addsd %xmm11, %xmm2 + movsd 72 * SIZE(BO), %xmm11 + addsd %xmm8, %xmm3 + movsd 6 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm13 + addsd %xmm13, %xmm0 + movsd 50 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm13 + addsd %xmm13, %xmm1 + movsd 52 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm13 + mulsd 54 * SIZE(BO), %xmm8 + addsd %xmm13, %xmm2 + movsd 80 * SIZE(BO), %xmm13 + addsd %xmm8, %xmm3 + movsd 7 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm15 + addsd %xmm15, %xmm0 + movsd 58 * SIZE(BO), %xmm15 + mulsd %xmm8, %xmm15 + addsd %xmm15, %xmm1 + movsd 60 * SIZE(BO), %xmm15 + mulsd %xmm8, %xmm15 + mulsd 62 * SIZE(BO), %xmm8 + addsd %xmm15, %xmm2 + movsd 88 * SIZE(BO), %xmm15 + addsd %xmm8, %xmm3 + movsd 8 * SIZE(AO), %xmm8 + + addq $ 8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm0 + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm1 + movsd 4 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + mulsd 6 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm2 + movsd 8 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm3 + movsd 1 * SIZE(AO), %xmm8 + + addq $1 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm4 + movsd 1 * SIZE(B), %xmm5 + movsd 2 * SIZE(B), %xmm6 + movsd 3 * SIZE(B), %xmm7 +#else + movsd 0 * SIZE(AO), %xmm4 + movsd 1 * SIZE(AO), %xmm5 + movsd 2 * SIZE(AO), %xmm6 + movsd 3 * SIZE(AO), %xmm7 +#endif + + subsd %xmm0, %xmm4 + subsd %xmm1, %xmm5 + subsd %xmm2, %xmm6 + subsd %xmm3, %xmm7 + +#ifdef LN + movsd 0 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + mulsd %xmm0, %xmm6 + mulsd %xmm0, %xmm7 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + mulsd %xmm0, %xmm6 + mulsd %xmm0, %xmm7 +#endif + +#ifdef RN + mulsd 0 * SIZE(B), %xmm4 + movlpd 1 * SIZE(B), %xmm1 + mulsd %xmm4, %xmm1 + subsd %xmm1, %xmm5 + movlpd 2 * SIZE(B), %xmm2 + mulsd %xmm4, %xmm2 + subsd %xmm2, %xmm6 + movlpd 3 * SIZE(B), %xmm3 + mulsd %xmm4, %xmm3 + subsd %xmm3, %xmm7 + + mulsd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm1 + mulsd %xmm5, %xmm1 + subsd %xmm1, %xmm6 + movlpd 7 * SIZE(B), %xmm2 + mulsd %xmm5, %xmm2 + subsd %xmm2, %xmm7 + + mulsd 10 * SIZE(B), %xmm6 + movlpd 11 * SIZE(B), %xmm1 + mulsd %xmm6, %xmm1 + subsd %xmm1, %xmm7 + + mulsd 15 * SIZE(B), %xmm7 +#endif + +#ifdef RT + mulsd 15 * SIZE(B), %xmm7 + + movlpd 14 * SIZE(B), %xmm1 + mulsd %xmm7, %xmm1 + subsd %xmm1, %xmm6 + movlpd 13 * SIZE(B), %xmm2 + mulsd %xmm7, %xmm2 + subsd %xmm2, %xmm5 + movlpd 12 * SIZE(B), %xmm3 + mulsd %xmm7, %xmm3 + subsd %xmm3, %xmm4 + + mulsd 10 * SIZE(B), %xmm6 + + movlpd 9 * SIZE(B), %xmm1 + mulsd %xmm6, %xmm1 + subsd %xmm1, %xmm5 + movlpd 8 * SIZE(B), %xmm2 + mulsd %xmm6, %xmm2 + subsd %xmm2, %xmm4 + + mulsd 5 * SIZE(B), %xmm5 + + movlpd 4 * SIZE(B), %xmm1 + mulsd %xmm5, %xmm1 + subsd %xmm1, %xmm4 + + mulsd 0 * SIZE(B), %xmm4 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm4, 0 * SIZE(CO1) + movsd %xmm5, 0 * SIZE(CO2) + movsd %xmm6, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 0 * SIZE(CO2, LDC, 2) + +#if defined(LN) || defined(LT) + movsd %xmm4, 0 * SIZE(B) + movsd %xmm5, 1 * SIZE(B) + movsd %xmm6, 2 * SIZE(B) + movsd %xmm7, 3 * SIZE(B) + + movsd %xmm4, 0 * SIZE(BO) + movsd %xmm4, 1 * SIZE(BO) + movsd %xmm5, 2 * SIZE(BO) + movsd %xmm5, 3 * SIZE(BO) + movsd %xmm6, 4 * SIZE(BO) + movsd %xmm6, 5 * SIZE(BO) + movsd %xmm7, 6 * SIZE(BO) + movsd %xmm7, 7 * SIZE(BO) +#else + movsd %xmm4, 0 * SIZE(AO) + movsd %xmm5, 1 * SIZE(AO) + movsd %xmm6, 2 * SIZE(AO) + movsd %xmm7, 3 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $3, N + je .L999 + + testq $2, N + je .L80 + ALIGN_4 + +.L41: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L43 + ALIGN_4 + +.L42: + PREFETCH 56 * SIZE(B) + + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + addq $ 8 * SIZE, B + addq $16 * SIZE, BO + + movsd %xmm0, -16 * SIZE(BO) + movsd %xmm0, -15 * SIZE(BO) + movsd %xmm1, -14 * SIZE(BO) + movsd %xmm1, -13 * SIZE(BO) + movsd %xmm2, -12 * SIZE(BO) + movsd %xmm2, -11 * SIZE(BO) + movsd %xmm3, -10 * SIZE(BO) + movsd %xmm3, -9 * SIZE(BO) + movsd %xmm4, -8 * SIZE(BO) + movsd %xmm4, -7 * SIZE(BO) + movsd %xmm5, -6 * SIZE(BO) + movsd %xmm5, -5 * SIZE(BO) + movsd %xmm6, -4 * SIZE(BO) + movsd %xmm6, -3 * SIZE(BO) + movsd %xmm7, -2 * SIZE(BO) + movsd %xmm7, -1 * SIZE(BO) + + decq %rax + jne .L42 + ALIGN_4 + +.L43: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L44: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm0, 1 * SIZE(BO) + movsd %xmm1, 2 * SIZE(BO) + movsd %xmm1, 3 * SIZE(BO) + + addq $2 * SIZE, B + addq $4 * SIZE, BO + decq %rax + jne .L44 + ALIGN_4 + +.L50: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 2), C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + + movapd 16 * SIZE(AO), %xmm12 + movapd 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(AO), %xmm14 + movapd 24 * SIZE(BO), %xmm15 + + PREFETCHW 4 * SIZE(CO1) + PREFETCHW 4 * SIZE(CO2) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L55 + ALIGN_4 + +.L52: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 2 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm4 + movapd 4 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm5 + movapd 4 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 4 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm4 + movapd 32 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm5 + movapd 32 * SIZE(AO), %xmm8 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm10, %xmm11 + mulpd 10 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm0 + movapd 8 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm1 + movapd 10 * SIZE(AO), %xmm10 + mulpd %xmm10, %xmm11 + mulpd 10 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm4 + movapd 12 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm5 + movapd 12 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm11 + mulpd 14 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm0 + movapd 12 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm1 + movapd 14 * SIZE(AO), %xmm10 + mulpd %xmm10, %xmm11 + mulpd 14 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm4 + movapd 40 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm5 + movapd 40 * SIZE(AO), %xmm10 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + mulpd %xmm12, %xmm13 + mulpd 18 * SIZE(BO), %xmm12 + addpd %xmm13, %xmm0 + movapd 16 * SIZE(BO), %xmm13 + addpd %xmm12, %xmm1 + movapd 18 * SIZE(AO), %xmm12 + mulpd %xmm12, %xmm13 + mulpd 18 * SIZE(BO), %xmm12 + addpd %xmm13, %xmm4 + movapd 20 * SIZE(BO), %xmm13 + addpd %xmm12, %xmm5 + movapd 20 * SIZE(AO), %xmm12 + + mulpd %xmm12, %xmm13 + mulpd 22 * SIZE(BO), %xmm12 + addpd %xmm13, %xmm0 + movapd 20 * SIZE(BO), %xmm13 + addpd %xmm12, %xmm1 + movapd 22 * SIZE(AO), %xmm12 + mulpd %xmm12, %xmm13 + mulpd 22 * SIZE(BO), %xmm12 + addpd %xmm13, %xmm4 + movapd 48 * SIZE(BO), %xmm13 + addpd %xmm12, %xmm5 + movapd 48 * SIZE(AO), %xmm12 + + PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) + mulpd %xmm14, %xmm15 + mulpd 26 * SIZE(BO), %xmm14 + addpd %xmm15, %xmm0 + movapd 24 * SIZE(BO), %xmm15 + addpd %xmm14, %xmm1 + movapd 26 * SIZE(AO), %xmm14 + mulpd %xmm14, %xmm15 + mulpd 26 * SIZE(BO), %xmm14 + addpd %xmm15, %xmm4 + movapd 28 * SIZE(BO), %xmm15 + addpd %xmm14, %xmm5 + movapd 28 * SIZE(AO), %xmm14 + + mulpd %xmm14, %xmm15 + mulpd 30 * SIZE(BO), %xmm14 + addpd %xmm15, %xmm0 + movapd 28 * SIZE(BO), %xmm15 + addpd %xmm14, %xmm1 + movapd 30 * SIZE(AO), %xmm14 + mulpd %xmm14, %xmm15 + mulpd 30 * SIZE(BO), %xmm14 + addpd %xmm15, %xmm4 + movapd 56 * SIZE(BO), %xmm15 + addpd %xmm14, %xmm5 + movapd 56 * SIZE(AO), %xmm14 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L59 + ALIGN_4 + +.L56: + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + movapd 2 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + movapd 4 * SIZE(AO), %xmm8 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L56 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm4, %xmm12 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm12 + + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm5 + movapd 4 * SIZE(B), %xmm9 + movapd 6 * SIZE(B), %xmm13 + + subpd %xmm0, %xmm1 + subpd %xmm8, %xmm5 + subpd %xmm4, %xmm9 + subpd %xmm12, %xmm13 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 + + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm1, %xmm10 + subpd %xmm5, %xmm11 +#endif + +#ifdef LN + movlpd 15 * SIZE(AO), %xmm0 + movhpd 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + movlpd 14 * SIZE(AO), %xmm2 + movhpd 14 * SIZE(AO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + movlpd 13 * SIZE(AO), %xmm4 + movhpd 13 * SIZE(AO), %xmm4 + mulpd %xmm13, %xmm4 + subpd %xmm4, %xmm5 + movlpd 12 * SIZE(AO), %xmm6 + movhpd 12 * SIZE(AO), %xmm6 + mulpd %xmm13, %xmm6 + subpd %xmm6, %xmm1 + + movlpd 10 * SIZE(AO), %xmm0 + movhpd 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + movlpd 9 * SIZE(AO), %xmm2 + movhpd 9 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm5 + movlpd 8 * SIZE(AO), %xmm4 + movhpd 8 * SIZE(AO), %xmm4 + mulpd %xmm9, %xmm4 + subpd %xmm4, %xmm1 + + movlpd 5 * SIZE(AO), %xmm0 + movhpd 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + movlpd 4 * SIZE(AO), %xmm2 + movhpd 4 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movlpd 2 * SIZE(AO), %xmm4 + movhpd 2 * SIZE(AO), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm9 + movlpd 3 * SIZE(AO), %xmm6 + movhpd 3 * SIZE(AO), %xmm6 + mulpd %xmm1, %xmm6 + subpd %xmm6, %xmm13 + + + movlpd 5 * SIZE(AO), %xmm0 + movhpd 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + + movlpd 6 * SIZE(AO), %xmm2 + movhpd 6 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm9 + movlpd 7 * SIZE(AO), %xmm4 + movhpd 7 * SIZE(AO), %xmm4 + mulpd %xmm5, %xmm4 + subpd %xmm4, %xmm13 + + movlpd 10 * SIZE(AO), %xmm0 + movhpd 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + movlpd 11 * SIZE(AO), %xmm2 + movhpd 11 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + + movlpd 15 * SIZE(AO), %xmm0 + movhpd 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm9, %xmm1 + subpd %xmm1, %xmm11 + + movlpd 3 * SIZE(B), %xmm0 + movhpd 3 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 +#endif + +#ifdef RT + movlpd 3 * SIZE(B), %xmm0 + movhpd 3 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movlpd 2 * SIZE(B), %xmm1 + movhpd 2 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + movlpd 2 * SIZE(B), %xmm1 + movhpd 2 * SIZE(B), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm9 + + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movsd %xmm13, 3 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + movhpd %xmm9, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + movapd %xmm9, 4 * SIZE(B) + movapd %xmm13, 6 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm5, 4 * SIZE(BO) + movlpd %xmm5, 5 * SIZE(BO) + movhpd %xmm5, 6 * SIZE(BO) + movhpd %xmm5, 7 * SIZE(BO) + movlpd %xmm9, 8 * SIZE(BO) + movlpd %xmm9, 9 * SIZE(BO) + movhpd %xmm9, 10 * SIZE(BO) + movhpd %xmm9, 11 * SIZE(BO) + movlpd %xmm13, 12 * SIZE(BO) + movlpd %xmm13, 13 * SIZE(BO) + movhpd %xmm13, 14 * SIZE(BO) + movhpd %xmm13, 15 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + ALIGN_4 + +.L61: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(BO), %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 4 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 2 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 32 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 4 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm11 + mulpd 10 * SIZE(BO), %xmm8 + addpd %xmm11, %xmm0 + movapd 12 * SIZE(BO), %xmm11 + addpd %xmm8, %xmm1 + movapd 6 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm11 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm11, %xmm2 + movapd 40 * SIZE(BO), %xmm11 + addpd %xmm8, %xmm3 + movapd 16 * SIZE(AO), %xmm8 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm10, %xmm13 + mulpd 18 * SIZE(BO), %xmm10 + addpd %xmm13, %xmm0 + movapd 20 * SIZE(BO), %xmm13 + addpd %xmm10, %xmm1 + movapd 10 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm13 + mulpd 22 * SIZE(BO), %xmm10 + addpd %xmm13, %xmm2 + movapd 48 * SIZE(BO), %xmm13 + addpd %xmm10, %xmm3 + movapd 12 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm15 + mulpd 26 * SIZE(BO), %xmm10 + addpd %xmm15, %xmm0 + movapd 28 * SIZE(BO), %xmm15 + addpd %xmm10, %xmm1 + movapd 14 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm15 + mulpd 30 * SIZE(BO), %xmm10 + addpd %xmm15, %xmm2 + movapd 56 * SIZE(BO), %xmm15 + addpd %xmm10, %xmm3 + movapd 24 * SIZE(AO), %xmm10 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L69 + ALIGN_4 + +.L66: + mulpd %xmm8, %xmm9 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 4 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 2 * SIZE(AO), %xmm8 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L69: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm5 + + subpd %xmm0, %xmm1 + subpd %xmm8, %xmm5 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm10 + + subpd %xmm0, %xmm8 + subpd %xmm1, %xmm10 +#endif + + +#ifdef LN + movlpd 3 * SIZE(AO), %xmm0 + movhpd 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + + movlpd 2 * SIZE(AO), %xmm2 + movhpd 2 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + + movlpd 3 * SIZE(AO), %xmm0 + movhpd 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + + movlpd 3 * SIZE(B), %xmm0 + movhpd 3 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 +#endif + +#ifdef RT + movlpd 3 * SIZE(B), %xmm0 + movhpd 3 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + + movlpd 2 * SIZE(B), %xmm1 + movhpd 2 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm5, 4 * SIZE(BO) + movlpd %xmm5, 5 * SIZE(BO) + movhpd %xmm5, 6 * SIZE(BO) + movhpd %xmm5, 7 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm10, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + testq $1, M + je .L79 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movsd 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movsd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movsd 16 * SIZE(BO), %xmm13 + movsd 24 * SIZE(BO), %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulsd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulsd 2 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm1 + movsd 1 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm9 + mulsd 6 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm2 + movsd 32 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm3 + movsd 2 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm11 + mulsd 10 * SIZE(BO), %xmm8 + addsd %xmm11, %xmm0 + movsd 12 * SIZE(BO), %xmm11 + addsd %xmm8, %xmm1 + movsd 3 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm11 + mulsd 14 * SIZE(BO), %xmm8 + addsd %xmm11, %xmm2 + movsd 40 * SIZE(BO), %xmm11 + addsd %xmm8, %xmm3 + movsd 8 * SIZE(AO), %xmm8 + + mulsd %xmm10, %xmm13 + mulsd 18 * SIZE(BO), %xmm10 + addsd %xmm13, %xmm0 + movsd 20 * SIZE(BO), %xmm13 + addsd %xmm10, %xmm1 + movsd 5 * SIZE(AO), %xmm10 + + mulsd %xmm10, %xmm13 + mulsd 22 * SIZE(BO), %xmm10 + addsd %xmm13, %xmm2 + movsd 48 * SIZE(BO), %xmm13 + addsd %xmm10, %xmm3 + movsd 6 * SIZE(AO), %xmm10 + + mulsd %xmm10, %xmm15 + mulsd 26 * SIZE(BO), %xmm10 + addsd %xmm15, %xmm0 + movsd 28 * SIZE(BO), %xmm15 + addsd %xmm10, %xmm1 + movsd 7 * SIZE(AO), %xmm10 + + mulsd %xmm10, %xmm15 + mulsd 30 * SIZE(BO), %xmm10 + addsd %xmm15, %xmm2 + movsd 56 * SIZE(BO), %xmm15 + addsd %xmm10, %xmm3 + movsd 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulsd %xmm8, %xmm9 + mulsd 2 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm0 + addsd %xmm8, %xmm1 + movsd 1 * SIZE(AO), %xmm8 + movsd 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addsd %xmm2, %xmm0 + addsd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm4 + movsd 1 * SIZE(B), %xmm5 +#else + movsd 0 * SIZE(AO), %xmm4 + movsd 1 * SIZE(AO), %xmm5 +#endif + + subsd %xmm0, %xmm4 + subsd %xmm1, %xmm5 + +#ifdef LN + movsd 0 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 +#endif + +#ifdef RN + mulsd 0 * SIZE(B), %xmm4 + movsd 1 * SIZE(B), %xmm1 + mulsd %xmm4, %xmm1 + subsd %xmm1, %xmm5 + + mulsd 3 * SIZE(B), %xmm5 +#endif + +#ifdef RT + mulsd 3 * SIZE(B), %xmm5 + + movlpd 2 * SIZE(B), %xmm1 + mulsd %xmm5, %xmm1 + subsd %xmm1, %xmm4 + + mulsd 0 * SIZE(B), %xmm4 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm4, 0 * SIZE(CO1) + movsd %xmm5, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm4, 0 * SIZE(B) + movsd %xmm5, 1 * SIZE(B) + + movsd %xmm4, 0 * SIZE(BO) + movsd %xmm4, 1 * SIZE(BO) + movsd %xmm5, 2 * SIZE(BO) + movsd %xmm5, 3 * SIZE(BO) +#else + movsd %xmm4, 0 * SIZE(AO) + movsd %xmm5, 1 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L80: + testq $1, N + je .L999 + ALIGN_4 + +.L81: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + jle .L83 + ALIGN_4 + +.L82: + PREFETCH 56 * SIZE(B) + + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + addq $ 8 * SIZE, B + addq $16 * SIZE, BO + + movsd %xmm0, -16 * SIZE(BO) + movsd %xmm0, -15 * SIZE(BO) + movsd %xmm1, -14 * SIZE(BO) + movsd %xmm1, -13 * SIZE(BO) + movsd %xmm2, -12 * SIZE(BO) + movsd %xmm2, -11 * SIZE(BO) + movsd %xmm3, -10 * SIZE(BO) + movsd %xmm3, -9 * SIZE(BO) + movsd %xmm4, -8 * SIZE(BO) + movsd %xmm4, -7 * SIZE(BO) + movsd %xmm5, -6 * SIZE(BO) + movsd %xmm5, -5 * SIZE(BO) + movsd %xmm6, -4 * SIZE(BO) + movsd %xmm6, -3 * SIZE(BO) + movsd %xmm7, -2 * SIZE(BO) + movsd %xmm7, -1 * SIZE(BO) + + decq %rax + jne .L82 + ALIGN_4 + +.L83: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax + BRANCH + jle .L90 + ALIGN_4 + +.L84: + movsd 0 * SIZE(B), %xmm0 + + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm0, 1 * SIZE(BO) + + addq $1 * SIZE, B + addq $2 * SIZE, BO + decq %rax + jne .L84 + ALIGN_4 + +.L90: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(AO), %xmm12 + movapd 24 * SIZE(AO), %xmm14 + + PREFETCHW 4 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulpd %xmm9, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm8 + mulpd 6 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm2 + movapd 32 * SIZE(AO), %xmm8 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + addpd %xmm9, %xmm3 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 10 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm0 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movapd 6 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 14 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm2 + movapd 40 * SIZE(AO), %xmm10 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm9, %xmm3 + movapd 16 * SIZE(BO), %xmm9 + mulpd %xmm11, %xmm12 + mulpd 18 * SIZE(AO), %xmm11 + addpd %xmm12, %xmm0 + movapd 20 * SIZE(AO), %xmm12 + addpd %xmm11, %xmm1 + movapd 10 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm12 + mulpd 22 * SIZE(AO), %xmm11 + addpd %xmm12, %xmm2 + movapd 48 * SIZE(AO), %xmm12 + PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) + addpd %xmm11, %xmm3 + movapd 12 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm14 + mulpd 26 * SIZE(AO), %xmm11 + addpd %xmm14, %xmm0 + movapd 28 * SIZE(AO), %xmm14 + addpd %xmm11, %xmm1 + movapd 14 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm14 + mulpd 30 * SIZE(AO), %xmm11 + addpd %xmm14, %xmm2 + movapd 56 * SIZE(AO), %xmm14 + addpd %xmm11, %xmm3 + movapd 24 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L99 + ALIGN_4 + +.L96: + mulpd %xmm9, %xmm8 + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 2 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L99: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#else + movapd 0 * SIZE(AO), %xmm2 + movapd 2 * SIZE(AO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#endif + +#ifdef LN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 15 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 14 * SIZE(AO), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + movsd 13 * SIZE(AO), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + movsd 12 * SIZE(AO), %xmm7 + mulsd %xmm1, %xmm7 + subsd %xmm7, %xmm2 + + movsd 10 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 9 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm0 + movsd 8 * SIZE(AO), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 4 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef LT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(AO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + movsd 2 * SIZE(AO), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm2, %xmm7 + subsd %xmm7, %xmm1 + + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm3 + movsd 7 * SIZE(AO), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + + movsd 10 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 11 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + + movsd 15 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm1 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) + movsd %xmm3, 2 * SIZE(CO1) + movhpd %xmm3, 3 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) + movsd %xmm3, 2 * SIZE(CO1) + movhpd %xmm3, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BO) + movlpd %xmm2, 1 * SIZE(BO) + movhpd %xmm2, 2 * SIZE(BO) + movhpd %xmm2, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) + movapd %xmm3, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + ALIGN_4 + +.L101: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L105 + ALIGN_4 + +.L102: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movapd 2 * SIZE(AO), %xmm8 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 16 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 4 * SIZE(AO), %xmm8 + mulpd 4 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm2 + movapd 6 * SIZE(AO), %xmm8 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm3 + movapd 16 * SIZE(AO), %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + mulpd 10 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm0 + movapd 24 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm1 + movapd 12 * SIZE(AO), %xmm10 + mulpd 12 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm2 + movapd 14 * SIZE(AO), %xmm10 + mulpd 14 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm3 + movapd 24 * SIZE(AO), %xmm10 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L109 + ALIGN_4 + +.L106: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(AO), %xmm8 + movapd 2 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L106 + ALIGN_4 + +.L109: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm2 + subpd %xmm0, %xmm2 +#else + movapd 0 * SIZE(AO), %xmm2 + subpd %xmm0, %xmm2 +#endif + +#ifdef LN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 3 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 2 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef LT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(AO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 3 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BO) + movlpd %xmm2, 1 * SIZE(BO) + movhpd %xmm2, 2 * SIZE(BO) + movhpd %xmm2, 3 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + testq $1, M + je .L119 + ALIGN_4 + +.L111: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movsd 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movsd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulsd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movsd 1 * SIZE(AO), %xmm8 + addsd %xmm9, %xmm0 + movsd 16 * SIZE(BO), %xmm9 + mulsd 2 * SIZE(BO), %xmm8 + addsd %xmm8, %xmm1 + movsd 2 * SIZE(AO), %xmm8 + mulsd 4 * SIZE(BO), %xmm8 + addsd %xmm8, %xmm2 + movsd 3 * SIZE(AO), %xmm8 + mulsd 6 * SIZE(BO), %xmm8 + addsd %xmm8, %xmm3 + movsd 8 * SIZE(AO), %xmm8 + mulsd %xmm10, %xmm11 + movsd 5 * SIZE(AO), %xmm10 + addsd %xmm11, %xmm0 + movsd 24 * SIZE(BO), %xmm11 + mulsd 10 * SIZE(BO), %xmm10 + addsd %xmm10, %xmm1 + movsd 6 * SIZE(AO), %xmm10 + mulsd 12 * SIZE(BO), %xmm10 + addsd %xmm10, %xmm2 + movsd 7 * SIZE(AO), %xmm10 + mulsd 14 * SIZE(BO), %xmm10 + addsd %xmm10, %xmm3 + movsd 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulsd %xmm8, %xmm9 + movsd 1 * SIZE(AO), %xmm8 + addsd %xmm9, %xmm0 + movsd 2 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: + addsd %xmm2, %xmm0 + addsd %xmm3, %xmm1 + addsd %xmm1, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm2 + subsd %xmm0, %xmm2 +#else + movsd 0 * SIZE(AO), %xmm2 + subsd %xmm0, %xmm2 +#endif + +#ifdef LN + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef RN + movsd 0 * SIZE(B), %xmm0 + mulsd %xmm0, %xmm2 +#endif + +#ifdef RT + movsd 0 * SIZE(B), %xmm0 + mulsd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BO) + movlpd %xmm2, 1 * SIZE(BO) +#else + movsd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $1 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L119: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + + +.L999: + movq %rbx, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LT_4x4_sse3.S b/kernel/x86_64/trsm_kernel_LT_4x4_sse3.S new file mode 100644 index 0000000..266f442 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LT_4x4_sse3.S @@ -0,0 +1,3856 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbx +#define KK %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCH prefetcht1 +#define PREFETCHSIZE (16 * 12 + 3) +#define PREFETCH_R (4 * 4 + 0) + +#define KERNEL1(address) \ + mulpd %xmm8, %xmm9 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ + addpd %xmm9, %xmm0;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm1;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm2;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm3;\ + movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL2(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm4;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm5;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm6;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm7;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL3(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm0;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm1;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm2;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm3;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL4(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm4;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm5;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm6;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm7;\ + movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL5(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm0;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm1;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm2;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm3;\ + movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL6(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm4;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm5;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm6;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm7;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL7(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm0;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm1;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm2;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm3;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL8(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm4;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm5;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm6;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm7;\ + movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL9(address) \ + mulpd %xmm12, %xmm13;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ + addpd %xmm13, %xmm0;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm1;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm2;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm3;\ + movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL10(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm4;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm5;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm6;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm7;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL11(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm0;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm1;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm2;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm3;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL12(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm4;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm5;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm6;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm7;\ + movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL13(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm0;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm1;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm2;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm3;\ + movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL14(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm4;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm5;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm6;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm7;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL15(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm0;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm1;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm2;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm3;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL16(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm4;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm5;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm6;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm7;\ + movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + movq KK, OFFSET + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 2, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + prefetcht0 0 * SIZE(BB) + subq $-8 * SIZE, BB + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(AO), %xmm12 + movddup 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(AO), %xmm14 + movddup 24 * SIZE(BO), %xmm15 + + prefetchnta 4 * SIZE(CO1) + pxor %xmm4, %xmm4 + prefetchnta 4 * SIZE(CO2) + pxor %xmm5, %xmm5 + prefetchnta 4 * SIZE(CO1, LDC, 2) + pxor %xmm6, %xmm6 + prefetchnta 4 * SIZE(CO2, LDC, 2) + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + +#if 1 + andq $-8, %rax + salq $4, %rax + je .L15 +.L1X: + KERNEL1 (16 * 0) + KERNEL2 (16 * 0) + KERNEL3 (16 * 0) + KERNEL4 (16 * 0) + KERNEL5 (16 * 0) + KERNEL6 (16 * 0) + KERNEL7 (16 * 0) + KERNEL8 (16 * 0) + KERNEL9 (16 * 0) + KERNEL10(16 * 0) + KERNEL11(16 * 0) + KERNEL12(16 * 0) + KERNEL13(16 * 0) + KERNEL14(16 * 0) + KERNEL15(16 * 0) + KERNEL16(16 * 0) + cmpq $128 * 1, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 1) + KERNEL2 (16 * 1) + KERNEL3 (16 * 1) + KERNEL4 (16 * 1) + KERNEL5 (16 * 1) + KERNEL6 (16 * 1) + KERNEL7 (16 * 1) + KERNEL8 (16 * 1) + KERNEL9 (16 * 1) + KERNEL10(16 * 1) + KERNEL11(16 * 1) + KERNEL12(16 * 1) + KERNEL13(16 * 1) + KERNEL14(16 * 1) + KERNEL15(16 * 1) + KERNEL16(16 * 1) + cmpq $128 * 2, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 2) + KERNEL2 (16 * 2) + KERNEL3 (16 * 2) + KERNEL4 (16 * 2) + KERNEL5 (16 * 2) + KERNEL6 (16 * 2) + KERNEL7 (16 * 2) + KERNEL8 (16 * 2) + KERNEL9 (16 * 2) + KERNEL10(16 * 2) + KERNEL11(16 * 2) + KERNEL12(16 * 2) + KERNEL13(16 * 2) + KERNEL14(16 * 2) + KERNEL15(16 * 2) + KERNEL16(16 * 2) + cmpq $128 * 3, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 3) + KERNEL2 (16 * 3) + KERNEL3 (16 * 3) + KERNEL4 (16 * 3) + KERNEL5 (16 * 3) + KERNEL6 (16 * 3) + KERNEL7 (16 * 3) + KERNEL8 (16 * 3) + KERNEL9 (16 * 3) + KERNEL10(16 * 3) + KERNEL11(16 * 3) + KERNEL12(16 * 3) + KERNEL13(16 * 3) + KERNEL14(16 * 3) + KERNEL15(16 * 3) + KERNEL16(16 * 3) + cmpq $128 * 4, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 4) + KERNEL2 (16 * 4) + KERNEL3 (16 * 4) + KERNEL4 (16 * 4) + KERNEL5 (16 * 4) + KERNEL6 (16 * 4) + KERNEL7 (16 * 4) + KERNEL8 (16 * 4) + KERNEL9 (16 * 4) + KERNEL10(16 * 4) + KERNEL11(16 * 4) + KERNEL12(16 * 4) + KERNEL13(16 * 4) + KERNEL14(16 * 4) + KERNEL15(16 * 4) + KERNEL16(16 * 4) + cmpq $128 * 5, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 5) + KERNEL2 (16 * 5) + KERNEL3 (16 * 5) + KERNEL4 (16 * 5) + KERNEL5 (16 * 5) + KERNEL6 (16 * 5) + KERNEL7 (16 * 5) + KERNEL8 (16 * 5) + KERNEL9 (16 * 5) + KERNEL10(16 * 5) + KERNEL11(16 * 5) + KERNEL12(16 * 5) + KERNEL13(16 * 5) + KERNEL14(16 * 5) + KERNEL15(16 * 5) + KERNEL16(16 * 5) + cmpq $128 * 6, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 6) + KERNEL2 (16 * 6) + KERNEL3 (16 * 6) + KERNEL4 (16 * 6) + KERNEL5 (16 * 6) + KERNEL6 (16 * 6) + KERNEL7 (16 * 6) + KERNEL8 (16 * 6) + KERNEL9 (16 * 6) + KERNEL10(16 * 6) + KERNEL11(16 * 6) + KERNEL12(16 * 6) + KERNEL13(16 * 6) + KERNEL14(16 * 6) + KERNEL15(16 * 6) + KERNEL16(16 * 6) + cmpq $128 * 7, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 7) + KERNEL2 (16 * 7) + KERNEL3 (16 * 7) + KERNEL4 (16 * 7) + KERNEL5 (16 * 7) + KERNEL6 (16 * 7) + KERNEL7 (16 * 7) + KERNEL8 (16 * 7) + KERNEL9 (16 * 7) + KERNEL10(16 * 7) + KERNEL11(16 * 7) + KERNEL12(16 * 7) + KERNEL13(16 * 7) + KERNEL14(16 * 7) + KERNEL15(16 * 7) + KERNEL16(16 * 7) + + addq $32 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $128 * 8, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 2), BO # * 64 +#else + sarq $3, %rax + je .L15 + ALIGN_4 + +.L12: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm5 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm6 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm7 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm5 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm6 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm7 + + movddup 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + + movddup 8 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm7 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm7 + movddup 40 * SIZE(BO), %xmm11 + + mulpd %xmm12, %xmm13 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm13, %xmm0 + movddup 17 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm1 + movddup 18 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm2 + movddup 19 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 18 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm3 + + movddup 16 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm4 + movddup 17 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm5 + movddup 18 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm6 + movddup 19 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 20 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm7 + + movddup 20 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm0 + movddup 21 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm1 + movddup 22 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm2 + movddup 23 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 22 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm3 + + movddup 20 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm4 + movddup 21 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm5 + movddup 22 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm6 + movddup 23 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 48 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm7 + movddup 48 * SIZE(BO), %xmm13 + + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm0 + movddup 25 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm1 + movddup 26 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm2 + movddup 27 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 26 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm3 + + movddup 24 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm4 + movddup 25 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm5 + movddup 26 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm6 + movddup 27 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 28 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm7 + + movddup 28 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm0 + movddup 29 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm1 + movddup 30 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm2 + movddup 31 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 30 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm3 + + movddup 28 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm4 + movddup 29 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm5 + movddup 30 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm6 + movddup 31 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 56 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm7 + movddup 56 * SIZE(BO), %xmm15 + + addq $32 * SIZE, BO + addq $32 * SIZE, AO + decq %rax + BRANCH + jne .L12 +#endif + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L19 + ALIGN_4 + +.L16: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 2 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 3 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm7 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L16 + ALIGN_4 + +.L19: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $4, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm10 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm10 + + movapd %xmm4, %xmm12 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm12 + + movapd %xmm6, %xmm14 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm14 + + movapd 0 * SIZE(BO), %xmm1 + movapd 2 * SIZE(BO), %xmm3 + movapd 4 * SIZE(BO), %xmm5 + movapd 6 * SIZE(BO), %xmm7 + movapd 8 * SIZE(BO), %xmm9 + movapd 10 * SIZE(BO), %xmm11 + movapd 12 * SIZE(BO), %xmm13 + movapd 14 * SIZE(BO), %xmm15 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm8, %xmm5 + subpd %xmm10, %xmm7 + subpd %xmm4, %xmm9 + subpd %xmm6, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#else + + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 + + movapd 8 * SIZE(AO), %xmm12 + movapd 10 * SIZE(AO), %xmm13 + movapd 12 * SIZE(AO), %xmm14 + movapd 14 * SIZE(AO), %xmm15 + + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm1, %xmm10 + subpd %xmm5, %xmm11 + subpd %xmm2, %xmm12 + subpd %xmm6, %xmm13 + subpd %xmm3, %xmm14 + subpd %xmm7, %xmm15 +#endif + + +#ifdef LN + movddup 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm15 + + movddup 14 * SIZE(AO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + movddup 14 * SIZE(AO), %xmm2 + mulpd %xmm15, %xmm2 + subpd %xmm2, %xmm11 + + movddup 13 * SIZE(AO), %xmm4 + mulpd %xmm13, %xmm4 + subpd %xmm4, %xmm5 + movddup 13 * SIZE(AO), %xmm4 + mulpd %xmm15, %xmm4 + subpd %xmm4, %xmm7 + + movddup 12 * SIZE(AO), %xmm6 + mulpd %xmm13, %xmm6 + subpd %xmm6, %xmm1 + movddup 12 * SIZE(AO), %xmm6 + mulpd %xmm15, %xmm6 + subpd %xmm6, %xmm3 + + movddup 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + mulpd %xmm0, %xmm11 + + movddup 9 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm5 + movddup 9 * SIZE(AO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm7 + + movddup 8 * SIZE(AO), %xmm4 + mulpd %xmm9, %xmm4 + subpd %xmm4, %xmm1 + movddup 8 * SIZE(AO), %xmm4 + mulpd %xmm11, %xmm4 + subpd %xmm4, %xmm3 + + movddup 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movddup 4 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + movddup 4 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm3 + + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 + + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm3, %xmm2 + subpd %xmm2, %xmm7 + + movddup 2 * SIZE(AO), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm9 + movddup 2 * SIZE(AO), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm11 + + movddup 3 * SIZE(AO), %xmm6 + mulpd %xmm1, %xmm6 + subpd %xmm6, %xmm13 + movddup 3 * SIZE(AO), %xmm6 + mulpd %xmm3, %xmm6 + subpd %xmm6, %xmm15 + + movddup 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movddup 6 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm9 + movddup 6 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm11 + + movddup 7 * SIZE(AO), %xmm4 + mulpd %xmm5, %xmm4 + subpd %xmm4, %xmm13 + movddup 7 * SIZE(AO), %xmm4 + mulpd %xmm7, %xmm4 + subpd %xmm4, %xmm15 + + movddup 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + mulpd %xmm0, %xmm11 + + movddup 11 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + movddup 11 * SIZE(AO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm15 + + movddup 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm15 +#endif + + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm9, %xmm1 + subpd %xmm1, %xmm11 + + movddup 2 * SIZE(BO), %xmm2 + mulpd %xmm8, %xmm2 + subpd %xmm2, %xmm12 + movddup 2 * SIZE(BO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + + movddup 3 * SIZE(BO), %xmm3 + mulpd %xmm8, %xmm3 + subpd %xmm3, %xmm14 + movddup 3 * SIZE(BO), %xmm3 + mulpd %xmm9, %xmm3 + subpd %xmm3, %xmm15 + + movddup 5 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movddup 6 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm12 + movddup 6 * SIZE(BO), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm13 + + movddup 7 * SIZE(BO), %xmm2 + mulpd %xmm10, %xmm2 + subpd %xmm2, %xmm14 + movddup 7 * SIZE(BO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm15 + + movddup 10 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + + movddup 11 * SIZE(BO), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm14 + movddup 11 * SIZE(BO), %xmm1 + mulpd %xmm13, %xmm1 + subpd %xmm1, %xmm15 + + movddup 15 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 +#endif + +#ifdef RT + movddup 15 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 + + movddup 14 * SIZE(BO), %xmm1 + mulpd %xmm14, %xmm1 + subpd %xmm1, %xmm12 + movddup 14 * SIZE(BO), %xmm1 + mulpd %xmm15, %xmm1 + subpd %xmm1, %xmm13 + + movddup 13 * SIZE(BO), %xmm2 + mulpd %xmm14, %xmm2 + subpd %xmm2, %xmm10 + movddup 13 * SIZE(BO), %xmm2 + mulpd %xmm15, %xmm2 + subpd %xmm2, %xmm11 + + movddup 12 * SIZE(BO), %xmm3 + mulpd %xmm14, %xmm3 + subpd %xmm3, %xmm8 + movddup 12 * SIZE(BO), %xmm3 + mulpd %xmm15, %xmm3 + subpd %xmm3, %xmm9 + + movddup 10 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + + movddup 9 * SIZE(BO), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm10 + movddup 9 * SIZE(BO), %xmm1 + mulpd %xmm13, %xmm1 + subpd %xmm1, %xmm11 + + movddup 8 * SIZE(BO), %xmm2 + mulpd %xmm12, %xmm2 + subpd %xmm2, %xmm8 + movddup 8 * SIZE(BO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + + movddup 5 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movddup 4 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + movddup 4 * SIZE(BO), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm9 + + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movsd %xmm13, 3 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + movhpd %xmm9, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) + + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movsd %xmm11, 2 * SIZE(CO1, LDC, 2) + movsd %xmm15, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm11, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) + + movsd %xmm12, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) + movsd %xmm13, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm13, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm14, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) + movsd %xmm15, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(BO) + movapd %xmm3, 2 * SIZE(BO) + movapd %xmm5, 4 * SIZE(BO) + movapd %xmm7, 6 * SIZE(BO) + movapd %xmm9, 8 * SIZE(BO) + movapd %xmm11, 10 * SIZE(BO) + movapd %xmm13, 12 * SIZE(BO) + movapd %xmm15, 14 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) + movapd %xmm12, 8 * SIZE(AO) + movapd %xmm13, 10 * SIZE(AO) + movapd %xmm14, 12 * SIZE(AO) + movapd %xmm15, 14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + je .L30 + ALIGN_4 + +.L21: + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 17 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm1 + movddup 18 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm2 + movddup 19 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm3 + movddup 20 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 21 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm1 + movddup 22 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm2 + movddup 23 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm3 + movddup 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 25 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 26 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 27 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 28 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 29 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 30 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 31 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 40 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L29 + ALIGN_4 + +.L26: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L29: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm10 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm10 + + movapd 0 * SIZE(BO), %xmm1 + movapd 2 * SIZE(BO), %xmm3 + movapd 4 * SIZE(BO), %xmm5 + movapd 6 * SIZE(BO), %xmm7 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm8, %xmm5 + subpd %xmm10, %xmm7 +#else + + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm10 + movapd 4 * SIZE(AO), %xmm12 + movapd 6 * SIZE(AO), %xmm14 + + subpd %xmm0, %xmm8 + subpd %xmm1, %xmm10 + subpd %xmm2, %xmm12 + subpd %xmm3, %xmm14 +#endif + +#ifdef LN + movddup 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movddup 2 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + movddup 2 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm3 + + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 + + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm3, %xmm2 + subpd %xmm2, %xmm7 + + movddup 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movddup 2 * SIZE(BO), %xmm2 + mulpd %xmm8, %xmm2 + subpd %xmm2, %xmm12 + movddup 3 * SIZE(BO), %xmm3 + mulpd %xmm8, %xmm3 + subpd %xmm3, %xmm14 + + movddup 5 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + movddup 6 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm12 + movddup 7 * SIZE(BO), %xmm2 + mulpd %xmm10, %xmm2 + subpd %xmm2, %xmm14 + + movddup 10 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm12 + + movddup 11 * SIZE(BO), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm14 + + movddup 15 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm14 +#endif + +#ifdef RT + movddup 15 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm14 + + movddup 14 * SIZE(BO), %xmm1 + mulpd %xmm14, %xmm1 + subpd %xmm1, %xmm12 + movddup 13 * SIZE(BO), %xmm2 + mulpd %xmm14, %xmm2 + subpd %xmm2, %xmm10 + movddup 12 * SIZE(BO), %xmm3 + mulpd %xmm14, %xmm3 + subpd %xmm3, %xmm8 + + movddup 10 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm12 + movddup 9 * SIZE(BO), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm10 + movddup 8 * SIZE(BO), %xmm2 + mulpd %xmm12, %xmm2 + subpd %xmm2, %xmm8 + + movddup 5 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + movddup 4 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + + movsd %xmm12, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) + movsd %xmm14, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(BO) + movapd %xmm3, 2 * SIZE(BO) + movapd %xmm5, 4 * SIZE(BO) + movapd %xmm7, 6 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm10, 2 * SIZE(AO) + movapd %xmm12, 4 * SIZE(AO) + movapd %xmm14, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + je .L39 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movddup 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movapd 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movddup 3 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movapd 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movapd 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movddup 8 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movapd 24 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movapd 18 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movddup 5 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movapd 20 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movapd 22 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movddup 6 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movapd 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movapd 26 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movddup 7 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movapd 28 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movapd 30 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movddup 12 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movapd 40 * SIZE(BO), %xmm11 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm2 + movapd 2 * SIZE(BO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#else + movapd 0 * SIZE(AO), %xmm2 + movapd 2 * SIZE(AO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#endif + +#ifdef LN + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 0 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(BO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + movsd 2 * SIZE(BO), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + movsd 3 * SIZE(BO), %xmm7 + mulsd %xmm2, %xmm7 + subsd %xmm7, %xmm1 + + movsd 5 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 6 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm3 + movsd 7 * SIZE(BO), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + + movsd 10 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 11 * SIZE(BO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + + movsd 15 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm1 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef RT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 15 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 14 * SIZE(BO), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + movsd 13 * SIZE(BO), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + movsd 12 * SIZE(BO), %xmm7 + mulsd %xmm1, %xmm7 + subsd %xmm7, %xmm2 + + movsd 10 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 9 * SIZE(BO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm0 + movsd 8 * SIZE(BO), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + + movsd 5 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 4 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 + +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BO) + movapd %xmm3, 2 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) + movapd %xmm3, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq J # j -- + jg .L10 + ALIGN_4 + +.L40: + testq $2, N + je .L80 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + prefetcht0 0 * SIZE(BB) + subq $-4 * SIZE, BB + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + +#ifdef HAVE_3DNOW + prefetchw 4 * SIZE(CO1) + prefetchw 4 * SIZE(CO2) +#else + prefetchnta 4 * SIZE(CO1) + prefetchnta 4 * SIZE(CO2) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L55 + ALIGN_4 + +.L52: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm5 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm5 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm5 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm5 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 18 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 8 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 20 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 22 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 24 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 26 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 28 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 30 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 24 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L59 + ALIGN_4 + +.L56: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L56 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm4, %xmm12 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm12 + + movapd 0 * SIZE(BO), %xmm1 + movapd 2 * SIZE(BO), %xmm5 + movapd 4 * SIZE(BO), %xmm9 + movapd 6 * SIZE(BO), %xmm13 + + subpd %xmm0, %xmm1 + subpd %xmm8, %xmm5 + subpd %xmm4, %xmm9 + subpd %xmm12, %xmm13 +#else + + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 + + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm1, %xmm10 + subpd %xmm5, %xmm11 +#endif + + +#ifdef LN + movddup 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + + movddup 14 * SIZE(AO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + movddup 13 * SIZE(AO), %xmm4 + mulpd %xmm13, %xmm4 + subpd %xmm4, %xmm5 + movddup 12 * SIZE(AO), %xmm6 + mulpd %xmm13, %xmm6 + subpd %xmm6, %xmm1 + + movddup 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + movddup 9 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm5 + movddup 8 * SIZE(AO), %xmm4 + mulpd %xmm9, %xmm4 + subpd %xmm4, %xmm1 + + movddup 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + movddup 4 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 +#endif + + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movddup 2 * SIZE(AO), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm9 + movddup 3 * SIZE(AO), %xmm6 + mulpd %xmm1, %xmm6 + subpd %xmm6, %xmm13 + + movddup 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + + movddup 6 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm9 + movddup 7 * SIZE(AO), %xmm4 + mulpd %xmm5, %xmm4 + subpd %xmm4, %xmm13 + + movddup 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + + movddup 11 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + + movddup 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm9, %xmm1 + subpd %xmm1, %xmm11 + + movddup 3 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 +#endif + +#ifdef RT + movddup 3 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movddup 2 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + movddup 2 * SIZE(BO), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm9 + + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movsd %xmm13, 3 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + movhpd %xmm9, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(BO) + movapd %xmm5, 2 * SIZE(BO) + movapd %xmm9, 4 * SIZE(BO) + movapd %xmm13, 6 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + ALIGN_4 + +.L61: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L69 + ALIGN_4 + +.L66: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L69: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd 0 * SIZE(BO), %xmm1 + movapd 2 * SIZE(BO), %xmm5 + + subpd %xmm0, %xmm1 + subpd %xmm8, %xmm5 +#else + + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm10 + + subpd %xmm0, %xmm8 + subpd %xmm1, %xmm10 +#endif + +#ifdef LN + movddup 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + movddup 2 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + + movddup 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + + movddup 3 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 +#endif + +#ifdef RT + movddup 3 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + + movddup 2 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(BO) + movapd %xmm5, 2 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm10, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + testq $1, M + je .L79 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movddup 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + movapd 16 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movddup 2 * SIZE(AO), %xmm8 + mulpd 4 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm2 + movddup 3 * SIZE(AO), %xmm8 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm3 + movddup 8 * SIZE(AO), %xmm8 + mulpd %xmm10, %xmm11 + movddup 5 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm0 + mulpd 10 * SIZE(BO), %xmm10 + movapd 24 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm1 + movddup 6 * SIZE(AO), %xmm10 + mulpd 12 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm2 + movddup 7 * SIZE(AO), %xmm10 + mulpd 14 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm3 + movddup 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm2 + subpd %xmm0, %xmm2 +#else + movapd 0 * SIZE(AO), %xmm2 + subpd %xmm0, %xmm2 +#endif + +#ifdef LN + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef RN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 0 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(BO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 3 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef RT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 3 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L80: + testq $1, N + je .L999 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 4 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifdef HAVE_3DNOW + prefetchw 4 * SIZE(CO1) +#else + prefetchnta 4 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulpd %xmm9, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm8 + mulpd 6 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm2 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 10 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm0 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 14 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm2 + movapd 24 * SIZE(AO), %xmm10 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm9, %xmm3 + movddup 8 * SIZE(BO), %xmm9 + mulpd %xmm11, %xmm8 + mulpd 18 * SIZE(AO), %xmm11 + addpd %xmm8, %xmm0 + movapd 20 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 5 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm8 + mulpd 22 * SIZE(AO), %xmm11 + addpd %xmm8, %xmm2 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 6 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm10 + mulpd 26 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm0 + movapd 28 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 7 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm10 + mulpd 30 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm2 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L99 + ALIGN_4 + +.L96: + mulpd %xmm9, %xmm8 + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 1 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L99: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm2 + movapd 2 * SIZE(BO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#else + movapd 0 * SIZE(AO), %xmm2 + movapd 2 * SIZE(AO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#endif + +#ifdef LN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 15 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 14 * SIZE(AO), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + movsd 13 * SIZE(AO), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + movsd 12 * SIZE(AO), %xmm7 + mulsd %xmm1, %xmm7 + subsd %xmm7, %xmm2 + + movsd 10 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 9 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm0 + movsd 8 * SIZE(AO), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 4 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef LT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(AO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + movsd 2 * SIZE(AO), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm2, %xmm7 + subsd %xmm7, %xmm1 + + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm3 + movsd 7 * SIZE(AO), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + + movsd 10 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 11 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + + movsd 15 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm1 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RT + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) + movsd %xmm3, 2 * SIZE(CO1) + movhpd %xmm3, 3 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) + movsd %xmm3, 2 * SIZE(CO1) + movhpd %xmm3, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BO) + movapd %xmm3, 2 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) + movapd %xmm3, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + ALIGN_4 + +.L101: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 4 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L105 + ALIGN_4 + +.L102: + mulpd %xmm9, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movddup 1 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm0 + mulpd 2 * SIZE(AO), %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd 4 * SIZE(AO), %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd 6 * SIZE(AO), %xmm9 + addpd %xmm9, %xmm3 + movddup 8 * SIZE(BO), %xmm9 + mulpd %xmm11, %xmm10 + movddup 5 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm0 + mulpd 10 * SIZE(AO), %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 6 * SIZE(BO), %xmm11 + mulpd 12 * SIZE(AO), %xmm11 + addpd %xmm11, %xmm2 + movddup 7 * SIZE(BO), %xmm11 + mulpd 14 * SIZE(AO), %xmm11 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $ 8 * SIZE, BO + decq %rax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L109 + ALIGN_4 + +.L106: + mulpd %xmm9, %xmm8 + movddup 1 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm0 + movapd 2 * SIZE(AO), %xmm8 + + addq $2 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L106 + ALIGN_4 + +.L109: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm2 + subpd %xmm0, %xmm2 +#else + movapd 0 * SIZE(AO), %xmm2 + subpd %xmm0, %xmm2 +#endif + +#ifdef LN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 3 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 2 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef LT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(AO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 3 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef RT + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + testq $1, M + je .L119 + ALIGN_4 + +.L111: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm9 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm8 + pxor %xmm1, %xmm1 + movapd 4 * SIZE(AO), %xmm11 + pxor %xmm2, %xmm2 + movapd 4 * SIZE(BO), %xmm10 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulpd %xmm9, %xmm8 + movapd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + mulpd 2 * SIZE(BO), %xmm9 + movapd 8 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm1 + movapd 8 * SIZE(AO), %xmm9 + mulpd %xmm11, %xmm10 + movapd 6 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm0 + mulpd 6 * SIZE(BO), %xmm11 + movapd 12 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm1 + movapd 12 * SIZE(AO), %xmm11 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulsd 0 * SIZE(BO), %xmm9 + addsd %xmm9, %xmm0 + movsd 1 * SIZE(AO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: + addpd %xmm1, %xmm0 + haddpd %xmm0, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm2 + subsd %xmm0, %xmm2 +#else + movsd 0 * SIZE(AO), %xmm2 + subsd %xmm0, %xmm2 +#endif + +#ifdef LN + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef RN + movsd 0 * SIZE(BO), %xmm0 + mulsd %xmm0, %xmm2 +#endif + +#ifdef RT + movsd 0 * SIZE(BO), %xmm0 + mulsd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(BO) +#else + movsd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L119: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_2 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LT_4x8_nehalem.S b/kernel/x86_64/trsm_kernel_LT_4x8_nehalem.S new file mode 100644 index 0000000..917f8f9 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LT_4x8_nehalem.S @@ -0,0 +1,4847 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCHSIZE (16 * 1 + 4) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + leaq (, LDC, SIZE), LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $3, J + NOBRANCH + jle .L40 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 8), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 4), CO2 +#ifndef RT + leaq (C, LDC, 8), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 3, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + prefetchnta -32 * SIZE(BB) + subq $-16 * SIZE, BB + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + leaq (LDC, LDC, 2), %rax + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht2 4 * SIZE(CO1, LDC, 2) + xorps %xmm11, %xmm11 + prefetcht2 4 * SIZE(CO1, %rax, 1) + + xorps %xmm12, %xmm12 + prefetcht2 4 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht2 4 * SIZE(CO2, LDC, 1) + xorps %xmm14, %xmm14 + prefetcht2 4 * SIZE(CO2, LDC, 2) + xorps %xmm15, %xmm15 + prefetcht2 4 * SIZE(CO2, %rax, 1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm12 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + subq $-32 * SIZE, BO + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addps %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 8), BO +#endif + + addps %xmm1, %xmm12 + addps %xmm2, %xmm13 + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + shufps $0x88, %xmm9, %xmm8 + movaps %xmm10, %xmm5 + shufps $0x88, %xmm11, %xmm10 + shufps $0xdd, %xmm11, %xmm4 + shufps $0xdd, %xmm9, %xmm5 + + movaps %xmm8, %xmm6 + shufps $0x88, %xmm10, %xmm8 + shufps $0xdd, %xmm6, %xmm10 + + movaps %xmm4, %xmm9 + movaps %xmm5, %xmm11 + shufps $0x22, %xmm5, %xmm9 + shufps $0x77, %xmm4, %xmm11 + + movaps %xmm12, %xmm4 + shufps $0x88, %xmm13, %xmm12 + movaps %xmm14, %xmm5 + shufps $0x88, %xmm15, %xmm14 + shufps $0xdd, %xmm15, %xmm4 + shufps $0xdd, %xmm13, %xmm5 + + movaps %xmm12, %xmm6 + shufps $0x88, %xmm14, %xmm12 + shufps $0xdd, %xmm6, %xmm14 + + movaps %xmm4, %xmm13 + movaps %xmm5, %xmm15 + shufps $0x22, %xmm5, %xmm13 + shufps $0x77, %xmm4, %xmm15 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm4 + movaps -24 * SIZE(BO), %xmm1 + movaps -20 * SIZE(BO), %xmm5 + movaps -16 * SIZE(BO), %xmm2 + movaps -12 * SIZE(BO), %xmm6 + movaps -8 * SIZE(BO), %xmm3 + movaps -4 * SIZE(BO), %xmm7 + +#else + movaps %xmm9, %xmm4 + shufps $0xd8, %xmm8, %xmm9 + shufps $0xd8, %xmm11, %xmm8 + shufps $0xd8, %xmm10, %xmm11 + shufps $0xd8, %xmm4, %xmm10 + + movaps %xmm8, %xmm4 + shufps $0xd8, %xmm10, %xmm8 + shufps $0xd8, %xmm4, %xmm10 + movaps %xmm9, %xmm5 + shufps $0xd8, %xmm11, %xmm9 + shufps $0xd8, %xmm5, %xmm11 + + movaps %xmm13, %xmm4 + shufps $0xd8, %xmm12, %xmm13 + shufps $0xd8, %xmm15, %xmm12 + shufps $0xd8, %xmm14, %xmm15 + shufps $0xd8, %xmm4, %xmm14 + + movaps %xmm12, %xmm4 + shufps $0xd8, %xmm14, %xmm12 + shufps $0xd8, %xmm4, %xmm14 + movaps %xmm13, %xmm5 + shufps $0xd8, %xmm15, %xmm13 + shufps $0xd8, %xmm5, %xmm15 + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + movaps -24 * SIZE(AO), %xmm2 + movaps -20 * SIZE(AO), %xmm3 + movaps -16 * SIZE(AO), %xmm4 + movaps -12 * SIZE(AO), %xmm5 + movaps -8 * SIZE(AO), %xmm6 + movaps -4 * SIZE(AO), %xmm7 +#endif + + subps %xmm8, %xmm0 + subps %xmm9, %xmm1 + subps %xmm10, %xmm2 + subps %xmm11, %xmm3 + subps %xmm12, %xmm4 + subps %xmm13, %xmm5 + subps %xmm14, %xmm6 + subps %xmm15, %xmm7 + +#ifdef LN + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + mulps %xmm15, %xmm7 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm4 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + mulps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm4 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm4 + + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm7 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm7 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + mulps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm7 + + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + mulps %xmm15, %xmm7 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm7 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm7 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -12 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm7 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + + movaps -4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm7 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm7 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm7 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm7 + + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm7 +#endif + +#ifdef RT + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm7 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm4 + + movaps 24 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm0 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm4 + + movaps 16 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm0 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm4 + + movaps 8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm0 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm4 + + movaps 0 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm0 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm4, -28 * SIZE(BO) + movaps %xmm1, -24 * SIZE(BO) + movaps %xmm5, -20 * SIZE(BO) + movaps %xmm2, -16 * SIZE(BO) + movaps %xmm6, -12 * SIZE(BO) + movaps %xmm3, -8 * SIZE(BO) + movaps %xmm7, -4 * SIZE(BO) + + movaps %xmm0, %xmm8 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm8, %xmm1 + + movaps %xmm2, %xmm9 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm9, %xmm3 + + movaps %xmm0, %xmm8 + shufps $0x88, %xmm2, %xmm0 + movaps %xmm1, %xmm9 + shufps $0x22, %xmm3, %xmm1 + shufps $0xdd, %xmm2, %xmm8 + movaps %xmm8, %xmm2 + shufps $0x77, %xmm3, %xmm9 + movaps %xmm9, %xmm3 + + movaps %xmm4, %xmm8 + shufps $0x88, %xmm5, %xmm4 + shufps $0xdd, %xmm8, %xmm5 + + movaps %xmm6, %xmm9 + shufps $0x88, %xmm7, %xmm6 + shufps $0xdd, %xmm9, %xmm7 + + movaps %xmm4, %xmm8 + shufps $0x88, %xmm6, %xmm4 + movaps %xmm5, %xmm9 + shufps $0x22, %xmm7, %xmm5 + shufps $0xdd, %xmm6, %xmm8 + movaps %xmm8, %xmm6 + shufps $0x77, %xmm7, %xmm9 + movaps %xmm9, %xmm7 + +#else + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm1, -28 * SIZE(AO) + movaps %xmm2, -24 * SIZE(AO) + movaps %xmm3, -20 * SIZE(AO) + movaps %xmm4, -16 * SIZE(AO) + movaps %xmm5, -12 * SIZE(AO) + movaps %xmm6, -8 * SIZE(AO) + movaps %xmm7, -4 * SIZE(AO) +#endif + + leaq (LDC, LDC, 2), %rax + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 2 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %rax, 1) + movhps %xmm3, 2 * SIZE(CO1, %rax, 1) + + movsd %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movsd %xmm5, 0 * SIZE(CO2, LDC, 1) + movhps %xmm5, 2 * SIZE(CO2, LDC, 1) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhps %xmm6, 2 * SIZE(CO2, LDC, 2) + movsd %xmm7, 0 * SIZE(CO2, %rax, 1) + movhps %xmm7, 2 * SIZE(CO2, %rax, 1) + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + jle .L30 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(BO), %xmm5 + + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -8 * SIZE(BO), %xmm5 + + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps 0 * SIZE(BO), %xmm5 + + movddup -24 * SIZE(AO), %xmm0 + + subq $-32 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 8), BO +#endif + + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + addps %xmm3, %xmm10 + addps %xmm4, %xmm11 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + shufps $0x88, %xmm9, %xmm8 + shufps $0xdd, %xmm9, %xmm4 + + movaps %xmm10, %xmm5 + shufps $0x88, %xmm11, %xmm10 + shufps $0xdd, %xmm11, %xmm5 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm2 + movaps -24 * SIZE(BO), %xmm1 + movaps -20 * SIZE(BO), %xmm3 + + subps %xmm8, %xmm0 + subps %xmm4, %xmm1 + subps %xmm10, %xmm2 + subps %xmm5, %xmm3 +#else + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm2 + movaps -24 * SIZE(AO), %xmm4 + movaps -20 * SIZE(AO), %xmm6 + + subps %xmm8, %xmm0 + subps %xmm9, %xmm2 + subps %xmm10, %xmm4 + subps %xmm11, %xmm6 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 + movhlps %xmm4, %xmm5 + movhlps %xmm6, %xmm7 +#endif + +#ifdef LN + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm3 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm2 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm2 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm7 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm7 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -12 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm7 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + + movaps -4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm7 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm7 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm7 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm7 + + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm7 +#endif + +#ifdef RT + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm7 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm4 + + movaps 24 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm0 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm4 + + movaps 16 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm0 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm4 + + movaps 8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm0 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm4 + + movaps 0 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm0 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm2, -28 * SIZE(BO) + movaps %xmm1, -24 * SIZE(BO) + movaps %xmm3, -20 * SIZE(BO) + + movaps %xmm0, %xmm4 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm4 + + movaps %xmm2, %xmm5 + unpcklps %xmm3, %xmm2 + unpckhps %xmm3, %xmm5 + + movsd %xmm0, (CO1) + movhps %xmm0, (CO1, LDC, 1) + movsd %xmm4, (CO1, LDC, 2) + movhps %xmm4, (CO1, %rax, 1) + + movsd %xmm2, (CO2) + movhps %xmm2, (CO2, LDC, 1) + movsd %xmm5, (CO2, LDC, 2) + movhps %xmm5, (CO2, %rax, 1) +#else + movlhps %xmm1, %xmm0 + movlhps %xmm3, %xmm2 + movlhps %xmm5, %xmm4 + movlhps %xmm7, %xmm6 + + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm2, -28 * SIZE(AO) + movaps %xmm4, -24 * SIZE(AO) + movaps %xmm6, -20 * SIZE(AO) + + movsd %xmm0, (CO1) + movsd %xmm1, (CO1, LDC, 1) + movsd %xmm2, (CO1, LDC, 2) + movsd %xmm3, (CO1, %rax, 1) + + movsd %xmm4, (CO2) + movsd %xmm5, (CO2, LDC, 1) + movsd %xmm6, (CO2, LDC, 2) + movsd %xmm7, (CO2, %rax, 1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + BRANCH + jle .L39 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + xorps %xmm8, %xmm8 + xorps %xmm12, %xmm12 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -28 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -20 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -16 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -12 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -4 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + subq $-32 * SIZE, BO + subq $ -4 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -28 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 8), BO +#endif + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm4 + + subps %xmm8, %xmm0 + subps %xmm12, %xmm4 +#else + movsd -32 * SIZE(AO), %xmm0 + movhps -30 * SIZE(AO), %xmm0 + movsd -28 * SIZE(AO), %xmm4 + movhps -26 * SIZE(AO), %xmm4 + + subps %xmm8, %xmm0 + subps %xmm12, %xmm4 + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 + + pshufd $0xff, %xmm4, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + pshufd $0x55, %xmm4, %xmm5 + pshufd $0x00, %xmm4, %xmm4 +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm7 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm7 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm3 + + movaps -12 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm7 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 + + movaps -4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm7 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm7 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm7 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm7 + + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm7 +#endif + +#ifdef RT + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm7 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm4 + + movaps 24 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm0 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm4 + + movaps 16 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm0 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm4 + + movaps 8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm0 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm4 + + movaps 0 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm0 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm0 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm4, -28 * SIZE(BO) + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 + + pshufd $0xff, %xmm4, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + pshufd $0x55, %xmm4, %xmm5 + pshufd $0x00, %xmm4, %xmm4 +#else + unpcklps %xmm1, %xmm0 + unpcklps %xmm3, %xmm2 + unpcklps %xmm5, %xmm4 + unpcklps %xmm7, %xmm6 + + movlps %xmm0, -32 * SIZE(AO) + movlps %xmm2, -30 * SIZE(AO) + movlps %xmm4, -28 * SIZE(AO) + movlps %xmm6, -26 * SIZE(AO) +#endif + + movss %xmm0, (CO1) + movss %xmm1, (CO1, LDC, 1) + movss %xmm2, (CO1, LDC, 2) + movss %xmm3, (CO1, %rax, 1) + + movss %xmm4, (CO2) + movss %xmm5, (CO2, LDC, 1) + movss %xmm6, (CO2, LDC, 2) + movss %xmm7, (CO2, %rax, 1) + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 8), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $8, KK +#endif + +#ifdef RT + subq $8, KK +#endif + + subq $1, J + BRANCH + jg .L10 + ALIGN_4 + +.L40: + testq $4, N + jle .L70 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I + NOBRANCH + jle .L50 + ALIGN_4 + +.L41: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht2 4 * SIZE(CO2) + xorps %xmm11, %xmm11 + prefetcht2 4 * SIZE(CO2, LDC, 1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + addps %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_3 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + addps %xmm3, %xmm10 + addps %xmm4, %xmm11 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + shufps $0x88, %xmm9, %xmm8 + movaps %xmm10, %xmm5 + shufps $0x88, %xmm11, %xmm10 + shufps $0xdd, %xmm11, %xmm4 + shufps $0xdd, %xmm9, %xmm5 + + movaps %xmm8, %xmm6 + shufps $0x88, %xmm10, %xmm8 + shufps $0xdd, %xmm6, %xmm10 + + movaps %xmm4, %xmm9 + movaps %xmm5, %xmm11 + shufps $0x22, %xmm5, %xmm9 + shufps $0x77, %xmm4, %xmm11 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm1 + movaps -24 * SIZE(BO), %xmm2 + movaps -20 * SIZE(BO), %xmm3 +#else + movaps %xmm9, %xmm4 + shufps $0xd8, %xmm8, %xmm9 + shufps $0xd8, %xmm11, %xmm8 + shufps $0xd8, %xmm10, %xmm11 + shufps $0xd8, %xmm4, %xmm10 + + movaps %xmm8, %xmm4 + shufps $0xd8, %xmm10, %xmm8 + shufps $0xd8, %xmm4, %xmm10 + movaps %xmm9, %xmm5 + shufps $0xd8, %xmm11, %xmm9 + shufps $0xd8, %xmm5, %xmm11 + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + movaps -24 * SIZE(AO), %xmm2 + movaps -20 * SIZE(AO), %xmm3 +#endif + + subps %xmm8, %xmm0 + subps %xmm9, %xmm1 + subps %xmm10, %xmm2 + subps %xmm11, %xmm3 + +#ifdef LN + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm1, -28 * SIZE(BO) + movaps %xmm2, -24 * SIZE(BO) + movaps %xmm3, -20 * SIZE(BO) + + movaps %xmm0, %xmm8 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm8, %xmm1 + + movaps %xmm2, %xmm9 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm9, %xmm3 + + movaps %xmm0, %xmm8 + shufps $0x88, %xmm2, %xmm0 + movaps %xmm1, %xmm9 + shufps $0x22, %xmm3, %xmm1 + shufps $0xdd, %xmm2, %xmm8 + movaps %xmm8, %xmm2 + shufps $0x77, %xmm3, %xmm9 + movaps %xmm9, %xmm3 +#else + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm1, -28 * SIZE(AO) + movaps %xmm2, -24 * SIZE(AO) + movaps %xmm3, -20 * SIZE(AO) +#endif + + leaq (LDC, LDC, 2), %rax + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + movsd %xmm3, 0 * SIZE(CO2, LDC, 1) + movhps %xmm3, 2 * SIZE(CO2, LDC, 1) + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L41 + ALIGN_4 + +.L50: + testq $2, M + BRANCH + jle .L60 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -16 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -24 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L52 + ALIGN_3 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_3 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + shufps $0x88, %xmm9, %xmm8 + shufps $0xdd, %xmm9, %xmm4 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm1 + + subps %xmm8, %xmm0 + subps %xmm4, %xmm1 +#else + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm2 + + subps %xmm8, %xmm0 + subps %xmm9, %xmm2 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 +#endif + +#ifdef LN + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm1, -28 * SIZE(BO) + + movaps %xmm0, %xmm4 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm4 + + movsd %xmm0, (CO1) + movhps %xmm0, (CO1, LDC, 1) + movsd %xmm4, (CO2) + movhps %xmm4, (CO2, LDC, 1) +#else + movlhps %xmm1, %xmm0 + movlhps %xmm3, %xmm2 + + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm2, -28 * SIZE(AO) + + movsd %xmm0, (CO1) + movsd %xmm1, (CO1, LDC, 1) + movsd %xmm2, (CO2) + movsd %xmm3, (CO2, LDC, 1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L69 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movaps -20 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + subq $-16 * SIZE, BO + subq $ -4 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L62 + addps %xmm9, %xmm8 + ALIGN_3 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_3 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + addps %xmm2, %xmm8 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm0 + + subps %xmm8, %xmm0 +#else + movsd -32 * SIZE(AO), %xmm0 + movhps -30 * SIZE(AO), %xmm0 + + subps %xmm8, %xmm0 + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm3 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm0 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 +#else + unpcklps %xmm1, %xmm0 + unpcklps %xmm3, %xmm2 + + movlps %xmm0, -32 * SIZE(AO) + movlps %xmm2, -30 * SIZE(AO) +#endif + + movss %xmm0, (CO1) + movss %xmm1, (CO1, LDC, 1) + movss %xmm2, (CO2) + movss %xmm3, (CO2, LDC, 1) + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L69: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + ALIGN_4 + +.L70: + testq $2, N + jle .L100 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I + NOBRANCH + jle .L80 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd -32 * SIZE(BO), %xmm3 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO2) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_3 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -28 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -26 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -24 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L72 + ALIGN_3 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_3 + +.L78: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm4 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm2 + + subps %xmm8, %xmm0 + subps %xmm4, %xmm2 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 +#else + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm2 + + subps %xmm8, %xmm0 + subps %xmm9, %xmm2 +#endif + + +#ifdef LN + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm2 +#endif + +#ifdef RT + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm0, -32 * SIZE(BO) + movlps %xmm1, -30 * SIZE(BO) + movlps %xmm2, -28 * SIZE(BO) + movlps %xmm3, -26 * SIZE(BO) + + unpcklps %xmm1, %xmm0 + unpcklps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm2, 2 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + +#else + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm2, -28 * SIZE(AO) + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L71 + ALIGN_4 + +.L80: + testq $2, M + BRANCH + jle .L90 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd -32 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L85 + ALIGN_3 + +.L82: + addps %xmm1, %xmm8 + movsd -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -30 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -28 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -26 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -24 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L82 + ALIGN_3 + +.L85: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_3 + +.L86: + addps %xmm1, %xmm8 + movsd -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L86 + ALIGN_3 + +.L88: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + addps %xmm1, %xmm8 + +#if defined(LN) || defined(LT) + pshufd $0xd8, %xmm8, %xmm8 + + movaps -32 * SIZE(BO), %xmm0 +#else + movaps -32 * SIZE(AO), %xmm0 +#endif + + subps %xmm8, %xmm0 + + movhlps %xmm0, %xmm1 + +#ifdef LN + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm0, -32 * SIZE(BO) + movlps %xmm1, -30 * SIZE(BO) + + unpcklps %xmm1, %xmm0 + + movlps %xmm0, (CO1) + movhps %xmm0, (CO2) +#else + movlps %xmm0, -32 * SIZE(AO) + movlps %xmm1, -30 * SIZE(AO) + + movsd %xmm0, (CO1) + movsd %xmm1, (CO2) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L90: + testq $1, M + BRANCH + jle .L99 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L95 + ALIGN_3 + +.L92: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movsd -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movsd -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movsd -26 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L92 + addps %xmm9, %xmm8 + ALIGN_3 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_3 + +.L96: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movsd -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L96 + ALIGN_3 + +.L98: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + addps %xmm2, %xmm8 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BO), %xmm0 + + subps %xmm8, %xmm0 +#else + movsd -32 * SIZE(AO), %xmm0 + + subps %xmm8, %xmm0 +#endif + + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 + +#if defined(LN) || defined(LT) + movss -32 * SIZE(AO), %xmm8 + + mulss %xmm8, %xmm0 + mulss %xmm8, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movss %xmm0, -32 * SIZE(BO) + movss %xmm1, -31 * SIZE(BO) +#else + movss %xmm0, -32 * SIZE(AO) + movss %xmm1, -31 * SIZE(AO) +#endif + + movss %xmm0, (CO1) + movss %xmm1, (CO2) + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L100: + testq $1, N + jle .L999 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I + NOBRANCH + jle .L110 + ALIGN_4 + +.L101: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm3 + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L105 + ALIGN_3 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -31 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -29 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -28 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $ -4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L102 + ALIGN_3 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L108 + ALIGN_3 + +.L106: + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -31 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L106 + ALIGN_3 + +.L108: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + + addps %xmm1, %xmm8 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BO), %xmm0 + movhps -30 * SIZE(BO), %xmm0 + + subps %xmm8, %xmm0 + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 +#else + movaps -32 * SIZE(AO), %xmm0 + + subps %xmm8, %xmm0 +#endif + +#ifdef LN + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm0 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm0 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm3 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm3 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm3 + + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm1, %xmm0 + unpcklps %xmm3, %xmm2 + + movlps %xmm0, -32 * SIZE(BO) + movlps %xmm2, -30 * SIZE(BO) + + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm2, 2 * SIZE(CO1) +#else + movaps %xmm0, -32 * SIZE(AO) + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L101 + ALIGN_4 + +.L110: + testq $2, M + BRANCH + jle .L120 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L115 + ALIGN_3 + +.L112: + addps %xmm1, %xmm8 + movss -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -31 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -30 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -29 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -24 * SIZE(AO), %xmm0 + + subq $-4 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L112 + ALIGN_3 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_3 + +.L116: + addps %xmm1, %xmm8 + movss -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L116 + ALIGN_3 + +.L118: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + addps %xmm1, %xmm8 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BO), %xmm0 + + subps %xmm8, %xmm0 + + pshufd $0x55, %xmm0, %xmm1 +#else + movsd -32 * SIZE(AO), %xmm0 + + subps %xmm8, %xmm0 +#endif + +#ifdef LN + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm1 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm1, %xmm0 + + movlps %xmm0, -32 * SIZE(BO) + + movlps %xmm0, 0 * SIZE(CO1) +#else + movlps %xmm0, -32 * SIZE(AO) + + movlps %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L120: + testq $1, M + BRANCH + jle .L129 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm2, %xmm2 + movss -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L125 + ALIGN_3 + +.L122: + addss %xmm2, %xmm8 + movss -32 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -31 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -31 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -30 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -30 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -29 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -29 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -28 * SIZE(AO), %xmm0 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L122 + ALIGN_3 + +.L125: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L128 + ALIGN_3 + +.L126: + addss %xmm2, %xmm8 + movss -32 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -31 * SIZE(AO), %xmm0 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L126 + ALIGN_3 + +.L128: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + addss %xmm2, %xmm8 + +#if defined(LN) || defined(LT) + movss -32 * SIZE(BO), %xmm0 + + subss %xmm8, %xmm0 +#else + movss -32 * SIZE(AO), %xmm0 + + subss %xmm8, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss -32 * SIZE(AO), %xmm8 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BO), %xmm8 +#endif + + mulss %xmm8, %xmm0 + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm0, -32 * SIZE(BO) +#else + movss %xmm0, -32 * SIZE(AO) +#endif + + movss %xmm0, (CO1) + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L129: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LT_8x4_sse.S b/kernel/x86_64/trsm_kernel_LT_8x4_sse.S new file mode 100644 index 0000000..526a78c --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LT_8x4_sse.S @@ -0,0 +1,5949 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define OFFSET 16(%rsp) +#define KK 24(%rsp) +#define KKK 32(%rsp) +#define AORIG 40(%rsp) +#define BORIG 48(%rsp) +#define BUFFER 128(%rsp) + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define movsd movlps +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifndef PREFETCH +#define PREFETCH prefetcht0 +#endif + +#ifndef PREFETCHW +#define PREFETCHW prefetcht0 +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + EMMS + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movsd %xmm4, OFFSET + movsd %xmm4, KK + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L50 + +.L01: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $2 + BASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + movaps 8 * SIZE(B), %xmm11 + movaps 12 * SIZE(B), %xmm15 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm8 + pshufd $0x55, %xmm11, %xmm9 + pshufd $0xaa, %xmm11, %xmm10 + pshufd $0xff, %xmm11, %xmm11 + + pshufd $0x00, %xmm15, %xmm12 + pshufd $0x55, %xmm15, %xmm13 + pshufd $0xaa, %xmm15, %xmm14 + pshufd $0xff, %xmm15, %xmm15 + + movaps %xmm8, 32 * SIZE(BO) + movaps %xmm9, 36 * SIZE(BO) + movaps %xmm10, 40 * SIZE(BO) + movaps %xmm11, 44 * SIZE(BO) + movaps %xmm12, 48 * SIZE(BO) + movaps %xmm13, 52 * SIZE(BO) + movaps %xmm14, 56 * SIZE(BO) + movaps %xmm15, 60 * SIZE(BO) + + addq $16 * SIZE, B + addq $64 * SIZE, BO + + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 4), C +#endif + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(BO), %xmm9 + movaps 4 * SIZE(BO), %xmm11 + movaps 8 * SIZE(BO), %xmm13 + movaps 16 * SIZE(BO), %xmm15 + + movaps 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movaps 4 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movaps 8 * SIZE(AO), %xmm12 + pxor %xmm2, %xmm2 + movaps 12 * SIZE(AO), %xmm14 + pxor %xmm3, %xmm3 + + PREFETCHW 7 * SIZE(CO1) + pxor %xmm4, %xmm4 + PREFETCHW 7 * SIZE(CO2) + pxor %xmm5, %xmm5 + PREFETCHW 7 * SIZE(CO1, LDC, 2) + pxor %xmm6, %xmm6 + PREFETCHW 7 * SIZE(CO2, LDC, 2) + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 4 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm13 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 8 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps 16 * SIZE(AO), %xmm8 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movaps 32 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm5 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm13 + mulps 12 * SIZE(BO), %xmm10 + addps %xmm13, %xmm6 + movaps 24 * SIZE(BO), %xmm13 + addps %xmm10, %xmm7 + movaps 20 * SIZE(AO), %xmm10 + mulps %xmm12, %xmm15 + addps %xmm15, %xmm0 + movaps 16 * SIZE(BO), %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm1 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm13 + mulps 28 * SIZE(BO), %xmm12 + addps %xmm13, %xmm2 + movaps 24 * SIZE(BO), %xmm13 + addps %xmm12, %xmm3 + movaps 24 * SIZE(AO), %xmm12 + mulps %xmm14, %xmm15 + addps %xmm15, %xmm4 + movaps 48 * SIZE(BO), %xmm15 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm5 + movaps 36 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm13 + mulps 28 * SIZE(BO), %xmm14 + addps %xmm13, %xmm6 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm14, %xmm7 + movaps 28 * SIZE(AO), %xmm14 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 32 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 36 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm13 + mulps 44 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movaps 64 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm5 + movaps 52 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm13 + mulps 44 * SIZE(BO), %xmm10 + addps %xmm13, %xmm6 + movaps 56 * SIZE(BO), %xmm13 + addps %xmm10, %xmm7 + movaps 36 * SIZE(AO), %xmm10 + mulps %xmm12, %xmm15 + addps %xmm15, %xmm0 + movaps 48 * SIZE(BO), %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm1 + movaps 52 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm13 + mulps 60 * SIZE(BO), %xmm12 + addps %xmm13, %xmm2 + movaps 56 * SIZE(BO), %xmm13 + addps %xmm12, %xmm3 + movaps 40 * SIZE(AO), %xmm12 + mulps %xmm14, %xmm15 + addps %xmm15, %xmm4 + movaps 80 * SIZE(BO), %xmm15 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm5 + movaps 68 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm13 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm13, %xmm6 + movaps 72 * SIZE(BO), %xmm13 + addps %xmm14, %xmm7 + movaps 44 * SIZE(AO), %xmm14 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jg .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 +.L16: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm5 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 12 * SIZE(BO), %xmm10 + addps %xmm9, %xmm6 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm10, %xmm7 + movaps 12 * SIZE(AO), %xmm10 + + addq $8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $8, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $2 + BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movaps %xmm4, %xmm9 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm9 + + movaps %xmm5, %xmm14 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm14 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm9, %xmm7 + unpcklps %xmm14, %xmm9 + unpckhps %xmm14, %xmm7 + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm5 + movaps 8 * SIZE(B), %xmm10 + movaps 12 * SIZE(B), %xmm11 + movaps 16 * SIZE(B), %xmm12 + movaps 20 * SIZE(B), %xmm13 + movaps 24 * SIZE(B), %xmm14 + movaps 28 * SIZE(B), %xmm15 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 + subps %xmm8, %xmm10 + subps %xmm3, %xmm11 + subps %xmm4, %xmm12 + subps %xmm6, %xmm13 + subps %xmm9, %xmm14 + subps %xmm7, %xmm15 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm9 + movaps 8 * SIZE(AO), %xmm10 + movaps 12 * SIZE(AO), %xmm11 + + movaps 16 * SIZE(AO), %xmm12 + movaps 20 * SIZE(AO), %xmm13 + movaps 24 * SIZE(AO), %xmm14 + movaps 28 * SIZE(AO), %xmm15 + + subps %xmm0, %xmm8 + subps %xmm4, %xmm9 + subps %xmm1, %xmm10 + subps %xmm5, %xmm11 + subps %xmm2, %xmm12 + subps %xmm6, %xmm13 + subps %xmm3, %xmm14 + subps %xmm7, %xmm15 +#endif + +#ifdef LN + movaps 60 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm15 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm12 + + movaps 56 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm1 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm12 + + movaps 48 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm1 + + movaps 44 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm12 + + movaps 40 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm1 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm12 + + movaps 32 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm1 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm1 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm15 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm15 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm11 + + movaps 20 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm15 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + + movaps 28 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm15 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm12 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm15 + + movaps 44 * SIZE(AO), %xmm7 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm15 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm15 + + movaps 60 * SIZE(AO), %xmm7 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm8, %xmm15 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm9, %xmm2 + subps %xmm2, %xmm11 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm9, %xmm2 + subps %xmm2, %xmm13 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm14 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm9, %xmm2 + subps %xmm2, %xmm15 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + mulps %xmm2, %xmm11 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm11, %xmm2 + subps %xmm2, %xmm13 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm14 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm11, %xmm2 + subps %xmm2, %xmm15 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + mulps %xmm2, %xmm13 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm14 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm13, %xmm2 + subps %xmm2, %xmm15 + + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 + mulps %xmm2, %xmm15 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 + mulps %xmm2, %xmm15 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm15, %xmm2 + subps %xmm2, %xmm13 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm15, %xmm2 + subps %xmm2, %xmm11 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm15, %xmm2 + subps %xmm2, %xmm9 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + mulps %xmm2, %xmm13 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm13, %xmm2 + subps %xmm2, %xmm11 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm13, %xmm2 + subps %xmm2, %xmm9 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + mulps %xmm2, %xmm11 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm11, %xmm2 + subps %xmm2, %xmm9 + + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 +#endif + +#ifdef LN + subq $8 * SIZE, CO1 + subq $8 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm5, 4 * SIZE(B) + movaps %xmm10, 8 * SIZE(B) + movaps %xmm11, 12 * SIZE(B) + movaps %xmm12, 16 * SIZE(B) + movaps %xmm13, 20 * SIZE(B) + movaps %xmm14, 24 * SIZE(B) + movaps %xmm15, 28 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm6, 12 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm6, 28 * SIZE(BO) + + pshufd $0x00, %xmm10, %xmm2 + pshufd $0x55, %xmm10, %xmm3 + pshufd $0xaa, %xmm10, %xmm4 + pshufd $0xff, %xmm10, %xmm6 + movaps %xmm2, 32 * SIZE(BO) + movaps %xmm3, 36 * SIZE(BO) + movaps %xmm4, 40 * SIZE(BO) + movaps %xmm6, 44 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm2 + pshufd $0x55, %xmm11, %xmm3 + pshufd $0xaa, %xmm11, %xmm4 + pshufd $0xff, %xmm11, %xmm6 + movaps %xmm2, 48 * SIZE(BO) + movaps %xmm3, 52 * SIZE(BO) + movaps %xmm4, 56 * SIZE(BO) + movaps %xmm6, 60 * SIZE(BO) + + pshufd $0x00, %xmm12, %xmm2 + pshufd $0x55, %xmm12, %xmm3 + pshufd $0xaa, %xmm12, %xmm4 + pshufd $0xff, %xmm12, %xmm6 + movaps %xmm2, 64 * SIZE(BO) + movaps %xmm3, 68 * SIZE(BO) + movaps %xmm4, 72 * SIZE(BO) + movaps %xmm6, 76 * SIZE(BO) + + pshufd $0x00, %xmm13, %xmm2 + pshufd $0x55, %xmm13, %xmm3 + pshufd $0xaa, %xmm13, %xmm4 + pshufd $0xff, %xmm13, %xmm6 + movaps %xmm2, 80 * SIZE(BO) + movaps %xmm3, 84 * SIZE(BO) + movaps %xmm4, 88 * SIZE(BO) + movaps %xmm6, 92 * SIZE(BO) + + pshufd $0x00, %xmm14, %xmm2 + pshufd $0x55, %xmm14, %xmm3 + pshufd $0xaa, %xmm14, %xmm4 + pshufd $0xff, %xmm14, %xmm6 + movaps %xmm2, 96 * SIZE(BO) + movaps %xmm3, 100 * SIZE(BO) + movaps %xmm4, 104 * SIZE(BO) + movaps %xmm6, 108 * SIZE(BO) + + pshufd $0x00, %xmm15, %xmm2 + pshufd $0x55, %xmm15, %xmm3 + pshufd $0xaa, %xmm15, %xmm4 + pshufd $0xff, %xmm15, %xmm6 + movaps %xmm2, 112 * SIZE(BO) + movaps %xmm3, 116 * SIZE(BO) + movaps %xmm4, 120 * SIZE(BO) + movaps %xmm6, 124 * SIZE(BO) + +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm9, 4 * SIZE(AO) + movaps %xmm10, 8 * SIZE(AO) + movaps %xmm11, 12 * SIZE(AO) + movaps %xmm12, 16 * SIZE(AO) + movaps %xmm13, 20 * SIZE(AO) + movaps %xmm14, 24 * SIZE(AO) + movaps %xmm15, 28 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm10, %xmm1 + unpckhps %xmm10, %xmm0 + + movaps %xmm5, %xmm7 + unpcklps %xmm11, %xmm5 + unpckhps %xmm11, %xmm7 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movaps %xmm0, %xmm11 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm11 + + movaps %xmm12, %xmm2 + unpcklps %xmm14, %xmm12 + unpckhps %xmm14, %xmm2 + + movaps %xmm13, %xmm7 + unpcklps %xmm15, %xmm13 + unpckhps %xmm15, %xmm7 + + movaps %xmm12, %xmm14 + unpcklps %xmm13, %xmm12 + unpckhps %xmm13, %xmm14 + + movaps %xmm2, %xmm15 + unpcklps %xmm7, %xmm2 + unpckhps %xmm7, %xmm15 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + + movlps %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + movlps %xmm14, 4 * SIZE(CO2) + movhps %xmm14, 6 * SIZE(CO2) + + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 2 * SIZE(CO1, LDC, 2) + movlps %xmm2, 4 * SIZE(CO1, LDC, 2) + movhps %xmm2, 6 * SIZE(CO1, LDC, 2) + + movlps %xmm11, 0 * SIZE(CO2, LDC, 2) + movhps %xmm11, 2 * SIZE(CO2, LDC, 2) + movlps %xmm15, 4 * SIZE(CO2, LDC, 2) + movhps %xmm15, 6 * SIZE(CO2, LDC, 2) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm9, 4 * SIZE(CO1) + movhps %xmm9, 6 * SIZE(CO1) + + movlps %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + movlps %xmm11, 4 * SIZE(CO2) + movhps %xmm11, 6 * SIZE(CO2) + + movlps %xmm12, 0 * SIZE(CO1, LDC, 2) + movhps %xmm12, 2 * SIZE(CO1, LDC, 2) + movlps %xmm13, 4 * SIZE(CO1, LDC, 2) + movhps %xmm13, 6 * SIZE(CO1, LDC, 2) + + movlps %xmm14, 0 * SIZE(CO2, LDC, 2) + movhps %xmm14, 2 * SIZE(CO2, LDC, 2) + movlps %xmm15, 4 * SIZE(CO2, LDC, 2) + movhps %xmm15, 6 * SIZE(CO2, LDC, 2) +#endif + +#ifndef LN + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#ifdef LT + addq $32 * SIZE, B +#endif +#endif + +#ifdef LN + subq $8, KK + movq BORIG, B +#endif + +#ifdef LT + addq $8, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $3 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $4, M + je .L30 + +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 4 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + mulps 44 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps 12 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + mulps 60 * SIZE(BO), %xmm8 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 76 * SIZE(BO), %xmm10 + addps %xmm9, %xmm2 + movaps 128 * SIZE(BO), %xmm9 + addps %xmm10, %xmm3 + movaps 20 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + mulps 92 * SIZE(BO), %xmm10 + addps %xmm11, %xmm2 + movaps 144 * SIZE(BO), %xmm11 + addps %xmm10, %xmm3 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + mulps 108 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 160 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps 28 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + mulps 124 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 176 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $ 32 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 4 * SIZE(AO), %xmm8 + + addq $ 4 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $2 + BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm5 + movaps 8 * SIZE(B), %xmm10 + movaps 12 * SIZE(B), %xmm11 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 + subps %xmm8, %xmm10 + subps %xmm3, %xmm11 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm10 + movaps 8 * SIZE(AO), %xmm12 + movaps 12 * SIZE(AO), %xmm14 + + subps %xmm0, %xmm8 + subps %xmm1, %xmm10 + subps %xmm2, %xmm12 + subps %xmm3, %xmm14 +#endif + +#ifdef LN + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm1 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm11 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm14 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm14 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm14 + + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm8 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm8 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm5, 4 * SIZE(B) + movaps %xmm10, 8 * SIZE(B) + movaps %xmm11, 12 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm6, 12 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm6, 28 * SIZE(BO) + + pshufd $0x00, %xmm10, %xmm2 + pshufd $0x55, %xmm10, %xmm3 + pshufd $0xaa, %xmm10, %xmm4 + pshufd $0xff, %xmm10, %xmm6 + movaps %xmm2, 32 * SIZE(BO) + movaps %xmm3, 36 * SIZE(BO) + movaps %xmm4, 40 * SIZE(BO) + movaps %xmm6, 44 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm2 + pshufd $0x55, %xmm11, %xmm3 + pshufd $0xaa, %xmm11, %xmm4 + pshufd $0xff, %xmm11, %xmm6 + movaps %xmm2, 48 * SIZE(BO) + movaps %xmm3, 52 * SIZE(BO) + movaps %xmm4, 56 * SIZE(BO) + movaps %xmm6, 60 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm10, 4 * SIZE(AO) + movaps %xmm12, 8 * SIZE(AO) + movaps %xmm14, 12 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm10, %xmm1 + unpckhps %xmm10, %xmm0 + + movaps %xmm5, %xmm7 + unpcklps %xmm11, %xmm5 + unpckhps %xmm11, %xmm7 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movaps %xmm0, %xmm11 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm11 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 2 * SIZE(CO1, LDC, 2) + movlps %xmm11, 0 * SIZE(CO2, LDC, 2) + movhps %xmm11, 2 * SIZE(CO2, LDC, 2) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + + movlps %xmm12, 0 * SIZE(CO1, LDC, 2) + movhps %xmm12, 2 * SIZE(CO1, LDC, 2) + movlps %xmm14, 0 * SIZE(CO2, LDC, 2) + movhps %xmm14, 2 * SIZE(CO2, LDC, 2) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $2, M + je .L40 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 8 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm2 + movaps 76 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm9, %xmm3 + movaps 128 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movaps 92 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movaps 144 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 108 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 160 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 124 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 176 * SIZE(BO), %xmm15 + + addq $ 16 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 16 * SIZE(BO), %xmm9 + + addq $ 2 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $1 + BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm2, %xmm0 + unpcklps %xmm3, %xmm1 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movapd 0 * SIZE(B), %xmm1 + movapd 4 * SIZE(B), %xmm5 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 +#else +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd 0 * SIZE(AO), %xmm8 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 2 * SIZE(AO), %xmm10 +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd 4 * SIZE(AO), %xmm12 +#ifdef movsd + xorps %xmm14, %xmm14 +#endif + movsd 6 * SIZE(AO), %xmm14 + + subps %xmm0, %xmm8 + subps %xmm1, %xmm10 + subps %xmm2, %xmm12 + subps %xmm3, %xmm14 +#endif + +#ifdef LN + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm5 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm14 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm14 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm14 + + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm8 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm8 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm5, 4 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm6, 12 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm6, 28 * SIZE(BO) +#else + movlps %xmm8, 0 * SIZE(AO) + movlps %xmm10, 2 * SIZE(AO) + movlps %xmm12, 4 * SIZE(AO) + movlps %xmm14, 6 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm10, %xmm1 + unpckhps %xmm10, %xmm0 + + movaps %xmm5, %xmm7 + unpcklps %xmm11, %xmm5 + unpckhps %xmm11, %xmm7 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movaps %xmm0, %xmm11 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm11 + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO2) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movlps %xmm11, 0 * SIZE(CO2, LDC, 2) +#else + movlps %xmm8, 0 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO2) + movlps %xmm12, 0 * SIZE(CO1, LDC, 2) + movlps %xmm14, 0 * SIZE(CO2, LDC, 2) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L40: + testq $1, M + je .L49 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (AO, %rax, SIZE), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + movss 32 * SIZE(BO), %xmm13 + movss 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L45 + ALIGN_4 + +.L42: + mulss %xmm8, %xmm9 + addss %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 4 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + addss %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + addss %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addss %xmm9, %xmm3 + movss 64 * SIZE(BO), %xmm9 + + mulss %xmm8, %xmm11 + addss %xmm11, %xmm0 + movss 20 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + addss %xmm11, %xmm1 + movss 24 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + addss %xmm11, %xmm2 + movss 28 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + movss 2 * SIZE(AO), %xmm8 + addss %xmm11, %xmm3 + movss 80 * SIZE(BO), %xmm11 + + mulss %xmm8, %xmm13 + addss %xmm13, %xmm0 + movss 36 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + addss %xmm13, %xmm1 + movss 40 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + addss %xmm13, %xmm2 + movss 44 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + movss 3 * SIZE(AO), %xmm8 + addss %xmm13, %xmm3 + movss 96 * SIZE(BO), %xmm13 + + mulss %xmm8, %xmm15 + addss %xmm15, %xmm0 + movss 52 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + addss %xmm15, %xmm1 + movss 56 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + addss %xmm15, %xmm2 + movss 60 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + movss 8 * SIZE(AO), %xmm8 + addss %xmm15, %xmm3 + movss 112 * SIZE(BO), %xmm15 + + mulss %xmm10, %xmm9 + addss %xmm9, %xmm0 + movss 68 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + addss %xmm9, %xmm1 + movss 72 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + addss %xmm9, %xmm2 + movss 76 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + movss 5 * SIZE(AO), %xmm10 + addss %xmm9, %xmm3 + movss 128 * SIZE(BO), %xmm9 + + mulss %xmm10, %xmm11 + addss %xmm11, %xmm0 + movss 84 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + addss %xmm11, %xmm1 + movss 88 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + addss %xmm11, %xmm2 + movss 92 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + movss 6 * SIZE(AO), %xmm10 + addss %xmm11, %xmm3 + movss 144 * SIZE(BO), %xmm11 + + mulss %xmm10, %xmm13 + addss %xmm13, %xmm0 + movss 100 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + addss %xmm13, %xmm1 + movss 104 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + addss %xmm13, %xmm2 + movss 108 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + movss 7 * SIZE(AO), %xmm10 + addss %xmm13, %xmm3 + movss 160 * SIZE(BO), %xmm13 + + mulss %xmm10, %xmm15 + addss %xmm15, %xmm0 + movss 116 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + addss %xmm15, %xmm1 + movss 120 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + addss %xmm15, %xmm2 + movss 124 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + movss 12 * SIZE(AO), %xmm10 + addss %xmm15, %xmm3 + movss 176 * SIZE(BO), %xmm15 + + addq $ 8 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_4 + +.L46: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movss 16 * SIZE(BO), %xmm9 + + addq $ 1 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L46 + ALIGN_4 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm2, %xmm0 + unpcklps %xmm3, %xmm1 + + unpcklps %xmm1, %xmm0 + + movapd 0 * SIZE(B), %xmm1 + subps %xmm0, %xmm1 +#else + movss 0 * SIZE(AO), %xmm8 + movss 1 * SIZE(AO), %xmm10 + movss 2 * SIZE(AO), %xmm12 + movss 3 * SIZE(AO), %xmm14 + + subss %xmm0, %xmm8 + subss %xmm1, %xmm10 + subss %xmm2, %xmm12 + subss %xmm3, %xmm14 +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm8, %xmm2 + subss %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm8, %xmm2 + subss %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm8, %xmm2 + subss %xmm2, %xmm14 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm10, %xmm2 + subss %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm10, %xmm2 + subss %xmm2, %xmm14 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm12, %xmm2 + subss %xmm2, %xmm14 + + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm2, %xmm14 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm2, %xmm14 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm14, %xmm2 + subss %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm14, %xmm2 + subss %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm14, %xmm2 + subss %xmm2, %xmm8 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm12, %xmm2 + subss %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm12, %xmm2 + subss %xmm2, %xmm8 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm10, %xmm2 + subss %xmm2, %xmm8 + + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm2, %xmm8 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm6, 12 * SIZE(BO) +#else + movss %xmm8, 0 * SIZE(AO) + movss %xmm10, 1 * SIZE(AO) + movss %xmm12, 2 * SIZE(AO) + movss %xmm14, 3 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm10, %xmm1 + unpckhps %xmm10, %xmm0 + + movaps %xmm5, %xmm7 + unpcklps %xmm11, %xmm5 + unpckhps %xmm11, %xmm7 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movaps %xmm0, %xmm11 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm11 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm10, 0 * SIZE(CO2) + movss %xmm0, 0 * SIZE(CO1, LDC, 2) + movss %xmm11, 0 * SIZE(CO2, LDC, 2) +#else + movss %xmm8, 0 * SIZE(CO1) + movss %xmm10, 0 * SIZE(CO2) + movss %xmm12, 0 * SIZE(CO1, LDC, 2) + movss %xmm14, 0 * SIZE(CO2, LDC, 2) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L49: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq J # j -- + jg .L01 + +.L50: + testq $2, N + je .L100 + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $1 + BASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L53 + ALIGN_4 + +.L52: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L52 + ALIGN_4 + +.L53: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L60 + ALIGN_4 + +.L54: + movsd 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + addq $2 * SIZE, B + addq $8 * SIZE, BO + decq %rax + jne .L54 + ALIGN_4 + +.L60: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + leaq (C, LDC, 2), C +#endif + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L70 + ALIGN_4 + +.L61: +#ifdef LN + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + PREFETCHW 4 * SIZE(CO1) + pxor %xmm4, %xmm4 + PREFETCHW 4 * SIZE(CO2) + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 12 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 64 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 16 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps 20 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps 28 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 80 * SIZE(AO), %xmm10 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 32 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 36 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 40 * SIZE(AO), %xmm12 + + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 44 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + addps %xmm12, %xmm5 + movaps 96 * SIZE(BO), %xmm13 + movaps 96 * SIZE(AO), %xmm12 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 48 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 52 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 56 * SIZE(AO), %xmm14 + + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 60 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 112 * SIZE(AO), %xmm14 + + addq $64 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 8 * SIZE(AO), %xmm8 + + addq $8 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $8, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $1 + BASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movaps %xmm4, %xmm9 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm9 + + movaps %xmm5, %xmm14 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm14 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm9, %xmm7 + unpcklps %xmm14, %xmm9 + unpckhps %xmm14, %xmm7 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(B), %xmm5 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 4 * SIZE(B), %xmm10 +#ifdef movsd + xorps %xmm11, %xmm11 +#endif + movsd 6 * SIZE(B), %xmm11 +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd 8 * SIZE(B), %xmm12 +#ifdef movsd + xorps %xmm13, %xmm13 +#endif + movsd 10 * SIZE(B), %xmm13 +#ifdef movsd + xorps %xmm14, %xmm14 +#endif + movsd 12 * SIZE(B), %xmm14 +#ifdef movsd + xorps %xmm15, %xmm15 +#endif + movsd 14 * SIZE(B), %xmm15 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 + subps %xmm8, %xmm10 + subps %xmm3, %xmm11 + subps %xmm4, %xmm12 + subps %xmm6, %xmm13 + subps %xmm9, %xmm14 + subps %xmm7, %xmm15 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm9 + movaps 8 * SIZE(AO), %xmm10 + movaps 12 * SIZE(AO), %xmm11 + + subps %xmm0, %xmm8 + subps %xmm4, %xmm9 + subps %xmm1, %xmm10 + subps %xmm5, %xmm11 +#endif + +#ifdef LN + movaps 60 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm15 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm12 + + movaps 56 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm1 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm12 + + movaps 48 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm1 + + movaps 44 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm12 + + movaps 40 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm1 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm12 + + movaps 32 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm1 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm1 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm15 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm15 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm11 + + movaps 20 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm15 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + + movaps 28 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm15 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm12 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm15 + + movaps 44 * SIZE(AO), %xmm7 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm15 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm15 + + movaps 60 * SIZE(AO), %xmm7 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm8, %xmm15 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm9, %xmm2 + subps %xmm2, %xmm11 + + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + mulps %xmm2, %xmm11 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + mulps %xmm2, %xmm11 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm11, %xmm2 + subps %xmm2, %xmm9 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 +#endif + +#ifdef LN + subq $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm5, 2 * SIZE(B) + movlps %xmm10, 4 * SIZE(B) + movlps %xmm11, 6 * SIZE(B) + movlps %xmm12, 8 * SIZE(B) + movlps %xmm13, 10 * SIZE(B) + movlps %xmm14, 12 * SIZE(B) + movlps %xmm15, 14 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + pshufd $0x00, %xmm10, %xmm2 + pshufd $0x55, %xmm10, %xmm3 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm2 + pshufd $0x55, %xmm11, %xmm3 + movaps %xmm2, 24 * SIZE(BO) + movaps %xmm3, 28 * SIZE(BO) + + pshufd $0x00, %xmm12, %xmm2 + pshufd $0x55, %xmm12, %xmm3 + movaps %xmm2, 32 * SIZE(BO) + movaps %xmm3, 36 * SIZE(BO) + + pshufd $0x00, %xmm13, %xmm2 + pshufd $0x55, %xmm13, %xmm3 + movaps %xmm2, 40 * SIZE(BO) + movaps %xmm3, 44 * SIZE(BO) + + pshufd $0x00, %xmm14, %xmm2 + pshufd $0x55, %xmm14, %xmm3 + movaps %xmm2, 48 * SIZE(BO) + movaps %xmm3, 52 * SIZE(BO) + + pshufd $0x00, %xmm15, %xmm2 + pshufd $0x55, %xmm15, %xmm3 + movaps %xmm2, 56 * SIZE(BO) + movaps %xmm3, 60 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm9, 4 * SIZE(AO) + movaps %xmm10, 8 * SIZE(AO) + movaps %xmm11, 12 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + unpcklps %xmm14, %xmm12 + unpcklps %xmm15, %xmm13 + + movaps %xmm12, %xmm14 + unpcklps %xmm13, %xmm12 + unpckhps %xmm13, %xmm14 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) + movhps %xmm10, 2 * SIZE(CO1, LDC, 1) + movlps %xmm14, 4 * SIZE(CO1, LDC, 1) + movhps %xmm14, 6 * SIZE(CO1, LDC, 1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm9, 4 * SIZE(CO1) + movhps %xmm9, 6 * SIZE(CO1) + + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) + movhps %xmm10, 2 * SIZE(CO1, LDC, 1) + movlps %xmm11, 4 * SIZE(CO1, LDC, 1) + movhps %xmm11, 6 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addq $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $8, KK + movq BORIG, B +#endif + +#ifdef LT + addq $8, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $3 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L61 + ALIGN_4 + +.L70: + testq $4, M + je .L80 + +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 20 * SIZE(BO), %xmm8 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm8, %xmm1 + movaps 12 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + + mulps %xmm10, %xmm13 + mulps 36 * SIZE(BO), %xmm10 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm10, %xmm1 + movaps 20 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + mulps 44 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 52 * SIZE(BO), %xmm10 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm10, %xmm1 + movaps 28 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 60 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $1 + BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(B), %xmm5 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 4 * SIZE(B), %xmm10 +#ifdef movsd + xorps %xmm11, %xmm11 +#endif + movsd 6 * SIZE(B), %xmm11 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 + subps %xmm8, %xmm10 + subps %xmm3, %xmm11 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm10 + + subps %xmm0, %xmm8 + subps %xmm1, %xmm10 +#endif + +#ifdef LN + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm1 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm11 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm5, 2 * SIZE(B) + movlps %xmm10, 4 * SIZE(B) + movlps %xmm11, 6 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + pshufd $0x00, %xmm10, %xmm2 + pshufd $0x55, %xmm10, %xmm3 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm2 + pshufd $0x55, %xmm11, %xmm3 + movaps %xmm2, 24 * SIZE(BO) + movaps %xmm3, 28 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm10, 4 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) + movhps %xmm10, 2 * SIZE(CO1, LDC, 1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) + movhps %xmm10, 2 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $ 8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L80: + testq $2, M + je .L90 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 8 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L85 + ALIGN_4 + +.L82: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_4 + +.L86: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L86 + ALIGN_4 + +.L88: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $1 + BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm2, %xmm0 + unpcklps %xmm3, %xmm1 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(B), %xmm5 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 +#else +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd 0 * SIZE(AO), %xmm8 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 2 * SIZE(AO), %xmm10 + + subps %xmm0, %xmm8 + subps %xmm1, %xmm10 +#endif + +#ifdef LN + movaps 0 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm5 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm5, 2 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) +#else + movlps %xmm8, 0 * SIZE(AO) + movlps %xmm10, 2 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $ 4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L90: + testq $1, M + je .L99 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (AO, %rax, SIZE), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + movss 32 * SIZE(BO), %xmm13 + movss 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movss 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movss 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movss 3 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movss 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movss 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movss 8 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movss 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movss 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movss 5 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movss 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movss 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movss 6 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movss 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movss 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movss 7 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movss 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movss 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movss 12 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movss 112 * SIZE(BO), %xmm15 + + addq $ 8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L98: + addss %xmm2, %xmm0 + addss %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm1, %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 + subps %xmm0, %xmm1 +#else + movss 0 * SIZE(AO), %xmm8 + movss 1 * SIZE(AO), %xmm10 + subss %xmm0, %xmm8 + subss %xmm1, %xmm10 +#endif + +#if defined(LN) || defined(LT) + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm8, %xmm2 + subss %xmm2, %xmm10 + + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm2, %xmm10 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm10, %xmm2 + subss %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm2, %xmm8 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) +#else + movss %xmm8, 0 * SIZE(AO) + movss %xmm10, 1 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm10, 0 * SIZE(CO1, LDC, 1) +#else + movss %xmm8, 0 * SIZE(CO1) + movss %xmm10, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (AO, %rax, SIZE), AO +#ifdef LT + addq $ 2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L100: + testq $1, N + je .L999 + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $BASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + jle .L103 + ALIGN_4 + +.L102: + movsd 0 * SIZE(B), %xmm3 + movhps 2 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm7 + movhps 6 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L102 + ALIGN_4 + +.L103: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax + BRANCH + jle .L110 + ALIGN_4 + +.L104: + movss 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + + movaps %xmm0, 0 * SIZE(BO) + + addq $ 1 * SIZE, B + addq $ 4 * SIZE, BO + decq %rax + jne .L104 + ALIGN_4 + +.L110: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L120 + ALIGN_4 + +.L111: +#ifdef LN + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + PREFETCHW 4 * SIZE(CO1) + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulps %xmm9, %xmm8 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps 4 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + + mulps %xmm9, %xmm8 + mulps 12 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps 64 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm9, %xmm10 + mulps 20 * SIZE(AO), %xmm9 + addps %xmm10, %xmm0 + movaps 24 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movaps 12 * SIZE(BO), %xmm9 + + mulps %xmm9, %xmm10 + mulps 28 * SIZE(AO), %xmm9 + addps %xmm10, %xmm0 + movaps 80 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movaps 32 * SIZE(BO), %xmm9 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm11, %xmm12 + mulps 36 * SIZE(AO), %xmm11 + addps %xmm12, %xmm0 + movaps 40 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movaps 20 * SIZE(BO), %xmm11 + + mulps %xmm11, %xmm12 + mulps 44 * SIZE(AO), %xmm11 + addps %xmm12, %xmm0 + movaps 96 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm11, %xmm14 + mulps 52 * SIZE(AO), %xmm11 + addps %xmm14, %xmm0 + movaps 56 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movaps 28 * SIZE(BO), %xmm11 + + mulps %xmm11, %xmm14 + mulps 60 * SIZE(AO), %xmm11 + addps %xmm14, %xmm0 + movaps 112 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movaps 48 * SIZE(BO), %xmm11 + + addq $64 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulps %xmm9, %xmm8 + mulps 4 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + + addq $8 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $8, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 8), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movaps %xmm4, %xmm9 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm9 + + movaps %xmm5, %xmm14 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm14 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm9, %xmm7 + unpcklps %xmm14, %xmm9 + unpckhps %xmm14, %xmm7 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm5 + movss 2 * SIZE(B), %xmm10 + movss 3 * SIZE(B), %xmm11 + movss 4 * SIZE(B), %xmm12 + movss 5 * SIZE(B), %xmm13 + movss 6 * SIZE(B), %xmm14 + movss 7 * SIZE(B), %xmm15 + + subss %xmm0, %xmm1 + subss %xmm2, %xmm5 + subss %xmm8, %xmm10 + subss %xmm3, %xmm11 + subss %xmm4, %xmm12 + subss %xmm6, %xmm13 + subss %xmm9, %xmm14 + subss %xmm7, %xmm15 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm9 + + subps %xmm0, %xmm8 + subps %xmm4, %xmm9 +#endif + +#ifdef LN + movaps 60 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm15 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm12 + + movaps 56 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm1 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm12 + + movaps 48 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm1 + + movaps 44 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm12 + + movaps 40 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm1 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm12 + + movaps 32 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm1 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm1 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm15 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm15 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm11 + + movaps 20 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm15 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm11 + + movaps 28 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm15 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm12 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm15 + + movaps 44 * SIZE(AO), %xmm7 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm15 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm15 + + movaps 60 * SIZE(AO), %xmm7 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm8, %xmm15 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 +#endif + +#ifdef LN + subq $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm5, 1 * SIZE(B) + movss %xmm10, 2 * SIZE(B) + movss %xmm11, 3 * SIZE(B) + movss %xmm12, 4 * SIZE(B) + movss %xmm13, 5 * SIZE(B) + movss %xmm14, 6 * SIZE(B) + movss %xmm15, 7 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + movaps %xmm2, 0 * SIZE(BO) + pshufd $0x00, %xmm5, %xmm2 + movaps %xmm2, 4 * SIZE(BO) + pshufd $0x00, %xmm10, %xmm2 + movaps %xmm2, 8 * SIZE(BO) + pshufd $0x00, %xmm11, %xmm2 + movaps %xmm2, 12 * SIZE(BO) + + pshufd $0x00, %xmm12, %xmm2 + movaps %xmm2, 16 * SIZE(BO) + pshufd $0x00, %xmm13, %xmm2 + movaps %xmm2, 20 * SIZE(BO) + pshufd $0x00, %xmm14, %xmm2 + movaps %xmm2, 24 * SIZE(BO) + pshufd $0x00, %xmm15, %xmm2 + movaps %xmm2, 28 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm9, 4 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + unpcklps %xmm5, %xmm1 + + unpcklps %xmm14, %xmm12 + unpcklps %xmm15, %xmm13 + unpcklps %xmm13, %xmm12 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm9, 4 * SIZE(CO1) + movhps %xmm9, 6 * SIZE(CO1) +#endif + +#ifndef LN + addq $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $8, KK + movq BORIG, B +#endif + +#ifdef LT + addq $8, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $3 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L111 + ALIGN_4 + +.L120: + testq $4, M + je .L130 + +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L125 + ALIGN_4 + +.L122: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(AO), %xmm8 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 32 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 8 * SIZE(AO), %xmm8 + mulps 8 * SIZE(BO), %xmm8 + addps %xmm8, %xmm2 + movaps 12 * SIZE(AO), %xmm8 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + movaps 20 * SIZE(AO), %xmm10 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 48 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps 24 * SIZE(AO), %xmm10 + mulps 24 * SIZE(BO), %xmm10 + addps %xmm10, %xmm2 + movaps 28 * SIZE(AO), %xmm10 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L122 + ALIGN_4 + +.L125: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L128 + ALIGN_4 + +.L126: + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L126 + ALIGN_4 + +.L128: + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm5 + movss 2 * SIZE(B), %xmm10 + movss 3 * SIZE(B), %xmm11 + + subss %xmm0, %xmm1 + subss %xmm2, %xmm5 + subss %xmm8, %xmm10 + subss %xmm3, %xmm11 +#else + movaps 0 * SIZE(AO), %xmm8 + + subps %xmm0, %xmm8 +#endif + +#ifdef LN + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm1 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm11 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm11 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm5, 1 * SIZE(B) + movss %xmm10, 2 * SIZE(B) + movss %xmm11, 3 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + movaps %xmm2, 0 * SIZE(BO) + pshufd $0x00, %xmm5, %xmm2 + movaps %xmm2, 4 * SIZE(BO) + pshufd $0x00, %xmm10, %xmm2 + movaps %xmm2, 8 * SIZE(BO) + pshufd $0x00, %xmm11, %xmm2 + movaps %xmm2, 12 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + unpcklps %xmm5, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L130: + testq $2, M + je .L140 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 8 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L135 + ALIGN_4 + +.L132: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 12 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 32 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movaps 28 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movaps 48 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L132 + ALIGN_4 + +.L135: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L138 + ALIGN_4 + +.L136: + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L136 + ALIGN_4 + +.L138: + addps %xmm1, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm2, %xmm0 + unpcklps %xmm3, %xmm1 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm5 + + subss %xmm0, %xmm1 + subss %xmm2, %xmm5 +#else +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd 0 * SIZE(AO), %xmm8 + + subps %xmm0, %xmm8 +#endif + +#ifdef LN + movaps 0 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm1 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm5 + + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm5 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm5, 1 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + movaps %xmm2, 0 * SIZE(BO) + pshufd $0x00, %xmm5, %xmm2 + movaps %xmm2, 4 * SIZE(BO) +#else + movlps %xmm8, 0 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + unpcklps %xmm5, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) +#else + movlps %xmm8, 0 * SIZE(CO1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L140: + testq $1, M + je .L149 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (AO, %rax, SIZE), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L145 + ALIGN_4 + +.L142: + mulss %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 1 * SIZE(AO), %xmm8 + mulss 4 * SIZE(BO), %xmm8 + addss %xmm9, %xmm0 + movss 32 * SIZE(BO), %xmm9 + addss %xmm8, %xmm1 + movss 2 * SIZE(AO), %xmm8 + mulss 8 * SIZE(BO), %xmm8 + addss %xmm8, %xmm2 + movss 3 * SIZE(AO), %xmm8 + mulss 12 * SIZE(BO), %xmm8 + addss %xmm8, %xmm3 + movss 8 * SIZE(AO), %xmm8 + mulss %xmm10, %xmm11 + movss 5 * SIZE(AO), %xmm10 + mulss 20 * SIZE(BO), %xmm10 + addss %xmm11, %xmm0 + movss 48 * SIZE(BO), %xmm11 + addss %xmm10, %xmm1 + movss 6 * SIZE(AO), %xmm10 + mulss 24 * SIZE(BO), %xmm10 + addss %xmm10, %xmm2 + movss 7 * SIZE(AO), %xmm10 + mulss 28 * SIZE(BO), %xmm10 + addss %xmm10, %xmm3 + movss 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L142 + ALIGN_4 + +.L145: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L148 + ALIGN_4 + +.L146: + mulss %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addss %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L146 + ALIGN_4 + +.L148: + addss %xmm1, %xmm0 + addss %xmm3, %xmm2 + addss %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(B), %xmm1 + subss %xmm0, %xmm1 +#else + movss 0 * SIZE(AO), %xmm8 + subps %xmm0, %xmm8 +#endif + +#if defined(LN) || defined(LT) + mulss 0 * SIZE(AO), %xmm1 +#endif + +#if defined(RN) || defined(RT) + mulss 0 * SIZE(B), %xmm8 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + movaps %xmm2, 0 * SIZE(BO) +#else + movss %xmm8, 0 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) +#else + movss %xmm8, 0 * SIZE(CO1) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $1 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L149: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq %rbx, %rsp + EMMS + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_RT_2x8_nehalem.S b/kernel/x86_64/trsm_kernel_RT_2x8_nehalem.S new file mode 100644 index 0000000..8c7f92f --- /dev/null +++ b/kernel/x86_64/trsm_kernel_RT_2x8_nehalem.S @@ -0,0 +1,3077 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCHSIZE (8 * 1 - 4) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movaps %xmm3, %xmm0 +#endif + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + leaq (, LDC, SIZE), LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, N + jle .L30 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $1, I + NOBRANCH + jle .L80 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_3 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movddup -16 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm9 + movddup -15 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm9 + movddup -13 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L72 + + addpd %xmm9, %xmm8 + ALIGN_3 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + addpd %xmm1, %xmm8 + movddup -16 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_4 + +.L78: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + addpd %xmm1, %xmm8 + +#if defined(LN) || defined(LT) + movaps -16 * SIZE(BO), %xmm0 +#else + movaps -16 * SIZE(AO), %xmm0 +#endif + + subpd %xmm8, %xmm0 + +#if defined(LN) || defined(LT) + pshufd $0xe, %xmm0, %xmm1 +#endif + +#ifdef LN + movsd -13 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm1 + movsd -14 * SIZE(AO), %xmm12 + mulsd %xmm1, %xmm12 + subsd %xmm12, %xmm0 + movsd -16 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + movsd -15 * SIZE(AO), %xmm12 + mulsd %xmm0, %xmm12 + subsd %xmm12, %xmm1 + movsd -13 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm1 +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm1, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movddup -16 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movaps %xmm0, -16 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L71 + ALIGN_4 + +.L80: + testq $1, M + BRANCH + jle .L89 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movhps -15 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movsd -16 * SIZE(BO), %xmm1 + movhps -15 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L85 + ALIGN_3 + +.L82: + mulpd %xmm0, %xmm1 + movsd -14 * SIZE(AO), %xmm0 + movhps -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movsd -14 * SIZE(BO), %xmm1 + movhps -13 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movsd -12 * SIZE(AO), %xmm0 + movhps -11 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movsd -12 * SIZE(BO), %xmm1 + movhps -11 * SIZE(BO), %xmm1 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L82 + + addpd %xmm9, %xmm8 + ALIGN_3 + +.L85: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_3 + +.L86: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm8 + movsd -15 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L86 + ALIGN_4 + +.L88: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + haddpd %xmm8, %xmm8 + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BO), %xmm0 +#else + movsd -16 * SIZE(AO), %xmm0 +#endif + + subsd %xmm8, %xmm0 + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movsd -16 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(BO) +#else + movsd %xmm0, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L89: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L30: + testq $2, N + jle .L50 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $1, I + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO2) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm10 + movaps -14 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L52 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + ALIGN_3 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm1, %xmm8 + addpd %xmm2, %xmm9 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm0 + shufpd $0, %xmm9, %xmm8 + shufpd $3, %xmm0, %xmm9 + + movaps -16 * SIZE(BO), %xmm0 + movaps -14 * SIZE(BO), %xmm1 +#else + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + shufpd $2, %xmm0, %xmm9 + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm0 + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + movddup -15 * SIZE(AO), %xmm12 + mulpd %xmm0, %xmm12 + subpd %xmm12, %xmm1 + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm0 + movddup -15 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm1 + + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm1 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm14 + mulpd %xmm14, %xmm1 + movddup -14 * SIZE(BO), %xmm15 + mulpd %xmm1, %xmm15 + subpd %xmm15, %xmm0 + + movddup -16 * SIZE(BO), %xmm15 + mulpd %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO2) + movhps %xmm1, 1 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, -16 * SIZE(BO) + movaps %xmm1, -14 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm1, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L69 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -10 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -8 * SIZE(BO), %xmm1 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L62 + ALIGN_3 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movaps -16 * SIZE(BO), %xmm0 +#else + movaps -16 * SIZE(AO), %xmm0 +#endif + + subpd %xmm8, %xmm0 + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#if defined(RN) || defined(RT) + pshufd $0xe, %xmm0, %xmm1 +#endif + +#ifdef RN + movsd -16 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm0 + movsd -15 * SIZE(BO), %xmm11 + mulsd %xmm0, %xmm11 + subsd %xmm11, %xmm1 + + movsd -13 * SIZE(BO), %xmm11 + mulsd %xmm11, %xmm1 +#endif + +#ifdef RT + movsd -13 * SIZE(BO), %xmm14 + mulsd %xmm14, %xmm1 + movsd -14 * SIZE(BO), %xmm15 + mulsd %xmm1, %xmm15 + subsd %xmm15, %xmm0 + + movsd -16 * SIZE(BO), %xmm15 + mulsd %xmm15, %xmm0 +#endif + +#if defined(RN) || defined(RT) + unpcklpd %xmm1, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movaps %xmm0, -16 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L69: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L50: + testq $4, N + jle .L70 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $1, I + NOBRANCH + jle .L40 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht0 2 * SIZE(CO2) + xorps %xmm11, %xmm11 + prefetcht0 2 * SIZE(CO2, LDC, 1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm1, %xmm8 + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm10 + addpd %xmm4, %xmm11 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm0 + shufpd $0, %xmm9, %xmm8 + shufpd $3, %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + shufpd $0, %xmm11, %xmm10 + shufpd $3, %xmm0, %xmm11 + + movaps -16 * SIZE(BO), %xmm0 + movaps -14 * SIZE(BO), %xmm2 + movaps -12 * SIZE(BO), %xmm1 + movaps -10 * SIZE(BO), %xmm3 +#else + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + shufpd $2, %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + shufpd $2, %xmm11, %xmm10 + shufpd $2, %xmm0, %xmm11 + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -12 * SIZE(AO), %xmm2 + movaps -10 * SIZE(AO), %xmm3 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + subpd %xmm10, %xmm2 + subpd %xmm11, %xmm3 + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -14 * SIZE(AO), %xmm12 + movaps %xmm12, %xmm13 + + mulpd %xmm1, %xmm12 + mulpd %xmm3, %xmm13 + + subpd %xmm12, %xmm0 + subpd %xmm13, %xmm2 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm2 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm2 + + movddup -15 * SIZE(AO), %xmm12 + movaps %xmm12, %xmm13 + + mulpd %xmm0, %xmm12 + mulpd %xmm2, %xmm13 + + subpd %xmm12, %xmm1 + subpd %xmm13, %xmm3 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm1 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm2 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm3 + + movddup -11 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm1 + movddup -10 * SIZE(BO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm2 + movddup -9 * SIZE(BO), %xmm11 + mulpd %xmm1, %xmm11 + subpd %xmm11, %xmm3 + + movddup -6 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm2 + movddup -5 * SIZE(BO), %xmm11 + mulpd %xmm2, %xmm11 + subpd %xmm11, %xmm3 + + movddup -1 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm3 +#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm12 + mulpd %xmm12, %xmm3 + movddup -2 * SIZE(BO), %xmm13 + mulpd %xmm3, %xmm13 + subpd %xmm13, %xmm2 + movddup -3 * SIZE(BO), %xmm14 + mulpd %xmm3, %xmm14 + subpd %xmm14, %xmm1 + movddup -4 * SIZE(BO), %xmm15 + mulpd %xmm3, %xmm15 + subpd %xmm15, %xmm0 + + movddup -6 * SIZE(BO), %xmm13 + mulpd %xmm13, %xmm2 + movddup -7 * SIZE(BO), %xmm14 + mulpd %xmm2, %xmm14 + subpd %xmm14, %xmm1 + movddup -8 * SIZE(BO), %xmm15 + mulpd %xmm2, %xmm15 + subpd %xmm15, %xmm0 + + movddup -11 * SIZE(BO), %xmm14 + mulpd %xmm14, %xmm1 + movddup -12 * SIZE(BO), %xmm15 + mulpd %xmm1, %xmm15 + subpd %xmm15, %xmm0 + + movddup -16 * SIZE(BO), %xmm15 + mulpd %xmm15, %xmm0 +#endif + + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 1 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO2) + movsd %xmm3, 1 * SIZE(CO2) + movhps %xmm2, 0 * SIZE(CO2, LDC, 1) + movhps %xmm3, 1 * SIZE(CO2, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 1 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 0 * SIZE(CO2, LDC, 1) + movhps %xmm3, 1 * SIZE(CO2, LDC, 1) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, -16 * SIZE(BO) + movaps %xmm2, -14 * SIZE(BO) + movaps %xmm1, -12 * SIZE(BO) + movaps %xmm3, -10 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm1, -14 * SIZE(AO) + movaps %xmm2, -12 * SIZE(AO) + movaps %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L31 + ALIGN_4 + +.L40: + testq $1, M + BRANCH + jle .L49 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -4 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 0 * SIZE(BO), %xmm1 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_4 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(LT) + movaps -16 * SIZE(BO), %xmm0 + movaps -14 * SIZE(BO), %xmm1 +#else + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#if defined(RN) || defined(RT) + pshufd $0xe, %xmm1, %xmm3 + movaps %xmm1, %xmm2 + pshufd $0xe, %xmm0, %xmm1 +#endif + +#ifdef RN + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + movsd -15 * SIZE(BO), %xmm9 + mulsd %xmm0, %xmm9 + subsd %xmm9, %xmm1 + movsd -14 * SIZE(BO), %xmm10 + mulsd %xmm0, %xmm10 + subsd %xmm10, %xmm2 + movsd -13 * SIZE(BO), %xmm11 + mulsd %xmm0, %xmm11 + subsd %xmm11, %xmm3 + + movsd -11 * SIZE(BO), %xmm9 + mulsd %xmm9, %xmm1 + movsd -10 * SIZE(BO), %xmm10 + mulsd %xmm1, %xmm10 + subsd %xmm10, %xmm2 + movsd -9 * SIZE(BO), %xmm11 + mulsd %xmm1, %xmm11 + subsd %xmm11, %xmm3 + + movsd -6 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm2 + movsd -5 * SIZE(BO), %xmm11 + mulsd %xmm2, %xmm11 + subsd %xmm11, %xmm3 + + movsd -1 * SIZE(BO), %xmm11 + mulsd %xmm11, %xmm3 +#endif + +#ifdef RT + movsd -1 * SIZE(BO), %xmm12 + mulsd %xmm12, %xmm3 + movsd -2 * SIZE(BO), %xmm13 + mulsd %xmm3, %xmm13 + subsd %xmm13, %xmm2 + movsd -3 * SIZE(BO), %xmm14 + mulsd %xmm3, %xmm14 + subsd %xmm14, %xmm1 + movsd -4 * SIZE(BO), %xmm15 + mulsd %xmm3, %xmm15 + subsd %xmm15, %xmm0 + + movsd -6 * SIZE(BO), %xmm13 + mulsd %xmm13, %xmm2 + movsd -7 * SIZE(BO), %xmm14 + mulsd %xmm2, %xmm14 + subsd %xmm14, %xmm1 + movsd -8 * SIZE(BO), %xmm15 + mulsd %xmm2, %xmm15 + subsd %xmm15, %xmm0 + + movsd -11 * SIZE(BO), %xmm14 + mulsd %xmm14, %xmm1 + movsd -12 * SIZE(BO), %xmm15 + mulsd %xmm1, %xmm15 + subsd %xmm15, %xmm0 + + movsd -16 * SIZE(BO), %xmm15 + mulsd %xmm15, %xmm0 +#endif + +#if defined(RN) || defined(RT) + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm1 + unpcklpd %xmm3, %xmm1 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO1, LDC, 1) + movsd %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 0 * SIZE(CO2, LDC, 1) + +#if defined(LN) || defined(LT) + movaps %xmm0, -16 * SIZE(BO) + movaps %xmm1, -14 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm1, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L49: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + ALIGN_4 + +.L70: + movq N, J + sarq $3, J + NOBRANCH + jle .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 8), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 4), CO2 +#ifndef RT + leaq (C, LDC, 8), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 3, %rax + leaq (B, %rax), BB + + movq M, I + sarq $1, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + prefetcht0 -16 * SIZE(BB) + subq $-8 * SIZE, BB + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + leaq (LDC, LDC, 2), %rax + + xorps %xmm8, %xmm8 + prefetcht0 1 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht0 1 * SIZE(CO1, LDC, 2) + xorps %xmm11, %xmm11 + prefetcht0 2 * SIZE(CO1, %rax, 1) + + xorps %xmm12, %xmm12 + prefetcht0 1 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht0 2 * SIZE(CO2, LDC, 1) + xorps %xmm14, %xmm14 + prefetcht0 1 * SIZE(CO2, LDC, 2) + xorps %xmm15, %xmm15 + prefetcht0 2 * SIZE(CO2, %rax, 1) + + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm8 + movaps -12 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + movaps -14 * SIZE(AO), %xmm5 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm14 + movaps -6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm12 + movaps 0 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps 2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm8 + movaps 4 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps 6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm12 + movaps 8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + movaps -10 * SIZE(AO), %xmm5 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm14 + movaps 10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm8 + movaps 12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm10 + movaps 14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addq $32 * SIZE, BO + subq $-8 * SIZE, AO + decq %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addpd %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 8), BO +#endif + + addpd %xmm1, %xmm12 + addpd %xmm2, %xmm13 + addpd %xmm3, %xmm14 + addpd %xmm4, %xmm15 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm0 + shufpd $0, %xmm9, %xmm8 + shufpd $3, %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + shufpd $0, %xmm11, %xmm10 + shufpd $3, %xmm0, %xmm11 + + movaps %xmm12, %xmm0 + shufpd $0, %xmm13, %xmm12 + shufpd $3, %xmm0, %xmm13 + + movaps %xmm14, %xmm0 + shufpd $0, %xmm15, %xmm14 + shufpd $3, %xmm0, %xmm15 + + movaps -16 * SIZE(BO), %xmm0 + movaps -14 * SIZE(BO), %xmm2 + movaps -12 * SIZE(BO), %xmm4 + movaps -10 * SIZE(BO), %xmm6 + movaps -8 * SIZE(BO), %xmm1 + movaps -6 * SIZE(BO), %xmm3 + movaps -4 * SIZE(BO), %xmm5 + movaps -2 * SIZE(BO), %xmm7 +#else + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + shufpd $2, %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + shufpd $2, %xmm11, %xmm10 + shufpd $2, %xmm0, %xmm11 + + movaps %xmm12, %xmm0 + shufpd $2, %xmm13, %xmm12 + shufpd $2, %xmm0, %xmm13 + + movaps %xmm14, %xmm0 + shufpd $2, %xmm15, %xmm14 + shufpd $2, %xmm0, %xmm15 + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -12 * SIZE(AO), %xmm2 + movaps -10 * SIZE(AO), %xmm3 + + movaps -8 * SIZE(AO), %xmm4 + movaps -6 * SIZE(AO), %xmm5 + movaps -4 * SIZE(AO), %xmm6 + movaps -2 * SIZE(AO), %xmm7 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + subpd %xmm10, %xmm2 + subpd %xmm11, %xmm3 + subpd %xmm12, %xmm4 + subpd %xmm13, %xmm5 + subpd %xmm14, %xmm6 + subpd %xmm15, %xmm7 + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 + + movddup -14 * SIZE(AO), %xmm12 + movaps %xmm12, %xmm13 + movaps %xmm12, %xmm14 + movaps %xmm12, %xmm15 + + mulpd %xmm1, %xmm12 + mulpd %xmm3, %xmm13 + mulpd %xmm5, %xmm14 + mulpd %xmm7, %xmm15 + + subpd %xmm12, %xmm0 + subpd %xmm13, %xmm2 + subpd %xmm14, %xmm4 + subpd %xmm15, %xmm6 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm6 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm6 + + movddup -15 * SIZE(AO), %xmm12 + movaps %xmm12, %xmm13 + movaps %xmm12, %xmm14 + movaps %xmm12, %xmm15 + + mulpd %xmm0, %xmm12 + mulpd %xmm2, %xmm13 + mulpd %xmm4, %xmm14 + mulpd %xmm6, %xmm15 + + subpd %xmm12, %xmm1 + subpd %xmm13, %xmm3 + subpd %xmm14, %xmm5 + subpd %xmm15, %xmm7 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm1 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm2 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm3 + movddup -12 * SIZE(BO), %xmm12 + mulpd %xmm0, %xmm12 + subpd %xmm12, %xmm4 + movddup -11 * SIZE(BO), %xmm13 + mulpd %xmm0, %xmm13 + subpd %xmm13, %xmm5 + movddup -10 * SIZE(BO), %xmm14 + mulpd %xmm0, %xmm14 + subpd %xmm14, %xmm6 + movddup -9 * SIZE(BO), %xmm15 + mulpd %xmm0, %xmm15 + subpd %xmm15, %xmm7 + + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm1 + movddup -6 * SIZE(BO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm2 + movddup -5 * SIZE(BO), %xmm11 + mulpd %xmm1, %xmm11 + subpd %xmm11, %xmm3 + movddup -4 * SIZE(BO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm4 + movddup -3 * SIZE(BO), %xmm13 + mulpd %xmm1, %xmm13 + subpd %xmm13, %xmm5 + movddup -2 * SIZE(BO), %xmm14 + mulpd %xmm1, %xmm14 + subpd %xmm14, %xmm6 + movddup -1 * SIZE(BO), %xmm15 + mulpd %xmm1, %xmm15 + subpd %xmm15, %xmm7 + + movddup 2 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm2 + movddup 3 * SIZE(BO), %xmm11 + mulpd %xmm2, %xmm11 + subpd %xmm11, %xmm3 + movddup 4 * SIZE(BO), %xmm12 + mulpd %xmm2, %xmm12 + subpd %xmm12, %xmm4 + movddup 5 * SIZE(BO), %xmm13 + mulpd %xmm2, %xmm13 + subpd %xmm13, %xmm5 + movddup 6 * SIZE(BO), %xmm14 + mulpd %xmm2, %xmm14 + subpd %xmm14, %xmm6 + movddup 7 * SIZE(BO), %xmm15 + mulpd %xmm2, %xmm15 + subpd %xmm15, %xmm7 + + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm12 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm4 + movddup 13 * SIZE(BO), %xmm13 + mulpd %xmm3, %xmm13 + subpd %xmm13, %xmm5 + movddup 14 * SIZE(BO), %xmm14 + mulpd %xmm3, %xmm14 + subpd %xmm14, %xmm6 + movddup 15 * SIZE(BO), %xmm15 + mulpd %xmm3, %xmm15 + subpd %xmm15, %xmm7 + + movddup 20 * SIZE(BO), %xmm12 + mulpd %xmm12, %xmm4 + movddup 21 * SIZE(BO), %xmm13 + mulpd %xmm4, %xmm13 + subpd %xmm13, %xmm5 + movddup 22 * SIZE(BO), %xmm14 + mulpd %xmm4, %xmm14 + subpd %xmm14, %xmm6 + movddup 23 * SIZE(BO), %xmm15 + mulpd %xmm4, %xmm15 + subpd %xmm15, %xmm7 + + movddup 29 * SIZE(BO), %xmm13 + mulpd %xmm13, %xmm5 + movddup 30 * SIZE(BO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm6 + movddup 31 * SIZE(BO), %xmm15 + mulpd %xmm5, %xmm15 + subpd %xmm15, %xmm7 + + movddup 38 * SIZE(BO), %xmm14 + mulpd %xmm14, %xmm6 + movddup 39 * SIZE(BO), %xmm15 + mulpd %xmm6, %xmm15 + subpd %xmm15, %xmm7 + + movddup 47 * SIZE(BO), %xmm15 + mulpd %xmm15, %xmm7 +#endif + +#ifdef RT + movddup 47 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm7 + movddup 46 * SIZE(BO), %xmm9 + mulpd %xmm7, %xmm9 + subpd %xmm9, %xmm6 + movddup 45 * SIZE(BO), %xmm10 + mulpd %xmm7, %xmm10 + subpd %xmm10, %xmm5 + movddup 44 * SIZE(BO), %xmm11 + mulpd %xmm7, %xmm11 + subpd %xmm11, %xmm4 + movddup 43 * SIZE(BO), %xmm12 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + movddup 42 * SIZE(BO), %xmm13 + mulpd %xmm7, %xmm13 + subpd %xmm13, %xmm2 + movddup 41 * SIZE(BO), %xmm14 + mulpd %xmm7, %xmm14 + subpd %xmm14, %xmm1 + movddup 40 * SIZE(BO), %xmm15 + mulpd %xmm7, %xmm15 + subpd %xmm15, %xmm0 + + movddup 38 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm6 + movddup 37 * SIZE(BO), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm5 + movddup 36 * SIZE(BO), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm4 + movddup 35 * SIZE(BO), %xmm12 + mulpd %xmm6, %xmm12 + subpd %xmm12, %xmm3 + movddup 34 * SIZE(BO), %xmm13 + mulpd %xmm6, %xmm13 + subpd %xmm13, %xmm2 + movddup 33 * SIZE(BO), %xmm14 + mulpd %xmm6, %xmm14 + subpd %xmm14, %xmm1 + movddup 32 * SIZE(BO), %xmm15 + mulpd %xmm6, %xmm15 + subpd %xmm15, %xmm0 + + movddup 29 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm5 + movddup 28 * SIZE(BO), %xmm11 + mulpd %xmm5, %xmm11 + subpd %xmm11, %xmm4 + movddup 27 * SIZE(BO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm3 + movddup 26 * SIZE(BO), %xmm13 + mulpd %xmm5, %xmm13 + subpd %xmm13, %xmm2 + movddup 25 * SIZE(BO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm1 + movddup 24 * SIZE(BO), %xmm15 + mulpd %xmm5, %xmm15 + subpd %xmm15, %xmm0 + + movddup 20 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm4 + movddup 19 * SIZE(BO), %xmm12 + mulpd %xmm4, %xmm12 + subpd %xmm12, %xmm3 + movddup 18 * SIZE(BO), %xmm13 + mulpd %xmm4, %xmm13 + subpd %xmm13, %xmm2 + movddup 17 * SIZE(BO), %xmm14 + mulpd %xmm4, %xmm14 + subpd %xmm14, %xmm1 + movddup 16 * SIZE(BO), %xmm15 + mulpd %xmm4, %xmm15 + subpd %xmm15, %xmm0 + + movddup 11 * SIZE(BO), %xmm12 + mulpd %xmm12, %xmm3 + movddup 10 * SIZE(BO), %xmm13 + mulpd %xmm3, %xmm13 + subpd %xmm13, %xmm2 + movddup 9 * SIZE(BO), %xmm14 + mulpd %xmm3, %xmm14 + subpd %xmm14, %xmm1 + movddup 8 * SIZE(BO), %xmm15 + mulpd %xmm3, %xmm15 + subpd %xmm15, %xmm0 + + movddup 2 * SIZE(BO), %xmm13 + mulpd %xmm13, %xmm2 + movddup 1 * SIZE(BO), %xmm14 + mulpd %xmm2, %xmm14 + subpd %xmm14, %xmm1 + movddup 0 * SIZE(BO), %xmm15 + mulpd %xmm2, %xmm15 + subpd %xmm15, %xmm0 + + movddup -7 * SIZE(BO), %xmm14 + mulpd %xmm14, %xmm1 + movddup -8 * SIZE(BO), %xmm15 + mulpd %xmm1, %xmm15 + subpd %xmm15, %xmm0 + + movddup -16 * SIZE(BO), %xmm15 + mulpd %xmm15, %xmm0 +#endif + + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 1 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movsd %xmm3, 1 * SIZE(CO1, LDC, 2) + movhps %xmm2, 0 * SIZE(CO1, %rax, 1) + movhps %xmm3, 1 * SIZE(CO1, %rax, 1) + + movsd %xmm4, 0 * SIZE(CO2) + movsd %xmm5, 1 * SIZE(CO2) + movhps %xmm4, 0 * SIZE(CO2, LDC, 1) + movhps %xmm5, 1 * SIZE(CO2, LDC, 1) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movsd %xmm7, 1 * SIZE(CO2, LDC, 2) + movhps %xmm6, 0 * SIZE(CO2, %rax, 1) + movhps %xmm7, 1 * SIZE(CO2, %rax, 1) +#else + movups %xmm0, 0 * SIZE(CO1) + movups %xmm1, 0 * SIZE(CO1, LDC, 1) + movups %xmm2, 0 * SIZE(CO1, LDC, 2) + movups %xmm3, 0 * SIZE(CO1, %rax, 1) + movups %xmm4, 0 * SIZE(CO2) + movups %xmm5, 0 * SIZE(CO2, LDC, 1) + movups %xmm6, 0 * SIZE(CO2, LDC, 2) + movups %xmm7, 0 * SIZE(CO2, %rax, 1) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, -16 * SIZE(BO) + movaps %xmm2, -14 * SIZE(BO) + movaps %xmm4, -12 * SIZE(BO) + movaps %xmm6, -10 * SIZE(BO) + movaps %xmm1, -8 * SIZE(BO) + movaps %xmm3, -6 * SIZE(BO) + movaps %xmm5, -4 * SIZE(BO) + movaps %xmm7, -2 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm1, -14 * SIZE(AO) + movaps %xmm2, -12 * SIZE(AO) + movaps %xmm3, -10 * SIZE(AO) + movaps %xmm4, -8 * SIZE(AO) + movaps %xmm5 , -6 * SIZE(AO) + movaps %xmm6, -4 * SIZE(AO) + movaps %xmm7, -2 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $1, M + BRANCH + jle .L29 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 0 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps 2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps 4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps 6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps 10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps 12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps 14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 16 * SIZE(BO), %xmm1 + + subq $ -4 * SIZE, AO + subq $-32 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movaps -16 * SIZE(BO), %xmm0 + movaps -14 * SIZE(BO), %xmm1 + movaps -12 * SIZE(BO), %xmm2 + movaps -10 * SIZE(BO), %xmm3 +#else + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -12 * SIZE(AO), %xmm2 + movaps -10 * SIZE(AO), %xmm3 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + subpd %xmm10, %xmm2 + subpd %xmm11, %xmm3 + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 +#endif + +#if defined(RN) || defined(RT) + pshufd $0xe, %xmm3, %xmm7 + movaps %xmm3, %xmm6 + pshufd $0xe, %xmm2, %xmm5 + movaps %xmm2, %xmm4 + pshufd $0xe, %xmm1, %xmm3 + movaps %xmm1, %xmm2 + pshufd $0xe, %xmm0, %xmm1 +#endif + +#ifdef RN + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + movsd -15 * SIZE(BO), %xmm9 + mulsd %xmm0, %xmm9 + subsd %xmm9, %xmm1 + movsd -14 * SIZE(BO), %xmm10 + mulsd %xmm0, %xmm10 + subsd %xmm10, %xmm2 + movsd -13 * SIZE(BO), %xmm11 + mulsd %xmm0, %xmm11 + subsd %xmm11, %xmm3 + movsd -12 * SIZE(BO), %xmm12 + mulsd %xmm0, %xmm12 + subsd %xmm12, %xmm4 + movsd -11 * SIZE(BO), %xmm13 + mulsd %xmm0, %xmm13 + subsd %xmm13, %xmm5 + movsd -10 * SIZE(BO), %xmm14 + mulsd %xmm0, %xmm14 + subsd %xmm14, %xmm6 + movsd -9 * SIZE(BO), %xmm15 + mulsd %xmm0, %xmm15 + subsd %xmm15, %xmm7 + + movsd -7 * SIZE(BO), %xmm9 + mulsd %xmm9, %xmm1 + movsd -6 * SIZE(BO), %xmm10 + mulsd %xmm1, %xmm10 + subsd %xmm10, %xmm2 + movsd -5 * SIZE(BO), %xmm11 + mulsd %xmm1, %xmm11 + subsd %xmm11, %xmm3 + movsd -4 * SIZE(BO), %xmm12 + mulsd %xmm1, %xmm12 + subsd %xmm12, %xmm4 + movsd -3 * SIZE(BO), %xmm13 + mulsd %xmm1, %xmm13 + subsd %xmm13, %xmm5 + movsd -2 * SIZE(BO), %xmm14 + mulsd %xmm1, %xmm14 + subsd %xmm14, %xmm6 + movsd -1 * SIZE(BO), %xmm15 + mulsd %xmm1, %xmm15 + subsd %xmm15, %xmm7 + + movsd 2 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm2 + movsd 3 * SIZE(BO), %xmm11 + mulsd %xmm2, %xmm11 + subsd %xmm11, %xmm3 + movsd 4 * SIZE(BO), %xmm12 + mulsd %xmm2, %xmm12 + subsd %xmm12, %xmm4 + movsd 5 * SIZE(BO), %xmm13 + mulsd %xmm2, %xmm13 + subsd %xmm13, %xmm5 + movsd 6 * SIZE(BO), %xmm14 + mulsd %xmm2, %xmm14 + subsd %xmm14, %xmm6 + movsd 7 * SIZE(BO), %xmm15 + mulsd %xmm2, %xmm15 + subsd %xmm15, %xmm7 + + movsd 11 * SIZE(BO), %xmm11 + mulsd %xmm11, %xmm3 + movsd 12 * SIZE(BO), %xmm12 + mulsd %xmm3, %xmm12 + subsd %xmm12, %xmm4 + movsd 13 * SIZE(BO), %xmm13 + mulsd %xmm3, %xmm13 + subsd %xmm13, %xmm5 + movsd 14 * SIZE(BO), %xmm14 + mulsd %xmm3, %xmm14 + subsd %xmm14, %xmm6 + movsd 15 * SIZE(BO), %xmm15 + mulsd %xmm3, %xmm15 + subsd %xmm15, %xmm7 + + movsd 20 * SIZE(BO), %xmm12 + mulsd %xmm12, %xmm4 + movsd 21 * SIZE(BO), %xmm13 + mulsd %xmm4, %xmm13 + subsd %xmm13, %xmm5 + movsd 22 * SIZE(BO), %xmm14 + mulsd %xmm4, %xmm14 + subsd %xmm14, %xmm6 + movsd 23 * SIZE(BO), %xmm15 + mulsd %xmm4, %xmm15 + subsd %xmm15, %xmm7 + + movsd 29 * SIZE(BO), %xmm13 + mulsd %xmm13, %xmm5 + movsd 30 * SIZE(BO), %xmm14 + mulsd %xmm5, %xmm14 + subsd %xmm14, %xmm6 + movsd 31 * SIZE(BO), %xmm15 + mulsd %xmm5, %xmm15 + subsd %xmm15, %xmm7 + + movsd 38 * SIZE(BO), %xmm14 + mulsd %xmm14, %xmm6 + movsd 39 * SIZE(BO), %xmm15 + mulsd %xmm6, %xmm15 + subsd %xmm15, %xmm7 + + movsd 47 * SIZE(BO), %xmm15 + mulsd %xmm15, %xmm7 +#endif + +#ifdef RT + movsd 47 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm7 + movsd 46 * SIZE(BO), %xmm9 + mulsd %xmm7, %xmm9 + subsd %xmm9, %xmm6 + movsd 45 * SIZE(BO), %xmm10 + mulsd %xmm7, %xmm10 + subsd %xmm10, %xmm5 + movsd 44 * SIZE(BO), %xmm11 + mulsd %xmm7, %xmm11 + subsd %xmm11, %xmm4 + movsd 43 * SIZE(BO), %xmm12 + mulsd %xmm7, %xmm12 + subsd %xmm12, %xmm3 + movsd 42 * SIZE(BO), %xmm13 + mulsd %xmm7, %xmm13 + subsd %xmm13, %xmm2 + movsd 41 * SIZE(BO), %xmm14 + mulsd %xmm7, %xmm14 + subsd %xmm14, %xmm1 + movsd 40 * SIZE(BO), %xmm15 + mulsd %xmm7, %xmm15 + subsd %xmm15, %xmm0 + + movsd 38 * SIZE(BO), %xmm9 + mulsd %xmm9, %xmm6 + movsd 37 * SIZE(BO), %xmm10 + mulsd %xmm6, %xmm10 + subsd %xmm10, %xmm5 + movsd 36 * SIZE(BO), %xmm11 + mulsd %xmm6, %xmm11 + subsd %xmm11, %xmm4 + movsd 35 * SIZE(BO), %xmm12 + mulsd %xmm6, %xmm12 + subsd %xmm12, %xmm3 + movsd 34 * SIZE(BO), %xmm13 + mulsd %xmm6, %xmm13 + subsd %xmm13, %xmm2 + movsd 33 * SIZE(BO), %xmm14 + mulsd %xmm6, %xmm14 + subsd %xmm14, %xmm1 + movsd 32 * SIZE(BO), %xmm15 + mulsd %xmm6, %xmm15 + subsd %xmm15, %xmm0 + + movsd 29 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm5 + movsd 28 * SIZE(BO), %xmm11 + mulsd %xmm5, %xmm11 + subsd %xmm11, %xmm4 + movsd 27 * SIZE(BO), %xmm12 + mulsd %xmm5, %xmm12 + subsd %xmm12, %xmm3 + movsd 26 * SIZE(BO), %xmm13 + mulsd %xmm5, %xmm13 + subsd %xmm13, %xmm2 + movsd 25 * SIZE(BO), %xmm14 + mulsd %xmm5, %xmm14 + subsd %xmm14, %xmm1 + movsd 24 * SIZE(BO), %xmm15 + mulsd %xmm5, %xmm15 + subsd %xmm15, %xmm0 + + movsd 20 * SIZE(BO), %xmm11 + mulsd %xmm11, %xmm4 + movsd 19 * SIZE(BO), %xmm12 + mulsd %xmm4, %xmm12 + subsd %xmm12, %xmm3 + movsd 18 * SIZE(BO), %xmm13 + mulsd %xmm4, %xmm13 + subsd %xmm13, %xmm2 + movsd 17 * SIZE(BO), %xmm14 + mulsd %xmm4, %xmm14 + subsd %xmm14, %xmm1 + movsd 16 * SIZE(BO), %xmm15 + mulsd %xmm4, %xmm15 + subsd %xmm15, %xmm0 + + movsd 11 * SIZE(BO), %xmm12 + mulsd %xmm12, %xmm3 + movsd 10 * SIZE(BO), %xmm13 + mulsd %xmm3, %xmm13 + subsd %xmm13, %xmm2 + movsd 9 * SIZE(BO), %xmm14 + mulsd %xmm3, %xmm14 + subsd %xmm14, %xmm1 + movsd 8 * SIZE(BO), %xmm15 + mulsd %xmm3, %xmm15 + subsd %xmm15, %xmm0 + + movsd 2 * SIZE(BO), %xmm13 + mulsd %xmm13, %xmm2 + movsd 1 * SIZE(BO), %xmm14 + mulsd %xmm2, %xmm14 + subsd %xmm14, %xmm1 + movsd 0 * SIZE(BO), %xmm15 + mulsd %xmm2, %xmm15 + subsd %xmm15, %xmm0 + + movsd -7 * SIZE(BO), %xmm14 + mulsd %xmm14, %xmm1 + movsd -8 * SIZE(BO), %xmm15 + mulsd %xmm1, %xmm15 + subsd %xmm15, %xmm0 + + movsd -16 * SIZE(BO), %xmm15 + mulsd %xmm15, %xmm0 +#endif + +#if defined(RN) || defined(RT) + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm1 + unpcklpd %xmm3, %xmm1 + movaps %xmm4, %xmm2 + unpcklpd %xmm5, %xmm2 + movaps %xmm6, %xmm3 + unpcklpd %xmm7, %xmm3 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO1, LDC, 1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 2) + movhps %xmm1, 0 * SIZE(CO1, %rax, 1) + movsd %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 0 * SIZE(CO2, LDC, 1) + movsd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhps %xmm3, 0 * SIZE(CO2, %rax, 1) + +#if defined(LN) || defined(LT) + movaps %xmm0, -16 * SIZE(BO) + movaps %xmm1, -14 * SIZE(BO) + movaps %xmm2, -12 * SIZE(BO) + movaps %xmm3, -10 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm1, -14 * SIZE(AO) + movaps %xmm2, -12 * SIZE(AO) + movaps %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 8), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $8, KK +#endif + +#ifdef RT + subq $8, KK +#endif + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_RT_4x2_atom.S b/kernel/x86_64/trsm_kernel_RT_4x2_atom.S new file mode 100644 index 0000000..ae49c38 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_RT_4x2_atom.S @@ -0,0 +1,2116 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbx +#define KK %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 8 + 3) + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + movq KK, OFFSET + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, N + je .L40 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I + jle .L50 + ALIGN_4 + +.L41: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(AO), %xmm1 + xorps %xmm11, %xmm11 + movsd 2 * SIZE(AO), %xmm2 + xorps %xmm13, %xmm13 + movsd 3 * SIZE(AO), %xmm3 + xorps %xmm15, %xmm15 + + movsd 0 * SIZE(BO), %xmm4 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm5 + xorps %xmm10, %xmm10 + prefetcht0 3 * SIZE(CO1) + xorps %xmm12, %xmm12 + xorps %xmm14, %xmm14 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L45 + ALIGN_4 + +.L42: + addsd %xmm9, %xmm8 + movsd 4 * SIZE(AO), %xmm9 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm11, %xmm10 + movsd 5 * SIZE(AO), %xmm11 + mulsd %xmm4, %xmm1 + + addsd %xmm13, %xmm12 + movsd 6 * SIZE(AO), %xmm13 + mulsd %xmm4, %xmm2 + + addsd %xmm15, %xmm14 + movsd 7 * SIZE(AO), %xmm15 + mulsd %xmm4, %xmm3 + movsd 2 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm9 + + addsd %xmm1, %xmm10 + movsd 9 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm11 + + addsd %xmm2, %xmm12 + movsd 10 * SIZE(AO), %xmm2 + mulsd %xmm5, %xmm13 + + addsd %xmm3, %xmm14 + movsd 11 * SIZE(AO), %xmm3 + mulsd %xmm5, %xmm15 + movsd 3 * SIZE(BO), %xmm5 + + addsd %xmm9, %xmm8 + movsd 12 * SIZE(AO), %xmm9 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + addsd %xmm11, %xmm10 + movsd 13 * SIZE(AO), %xmm11 + mulsd %xmm4, %xmm1 + + addsd %xmm13, %xmm12 + movsd 14 * SIZE(AO), %xmm13 + mulsd %xmm4, %xmm2 + + addsd %xmm15, %xmm14 + movsd 15 * SIZE(AO), %xmm15 + mulsd %xmm4, %xmm3 + movsd 4 * SIZE(BO), %xmm4 + subq $-16 * SIZE, AO + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm9 + + addsd %xmm1, %xmm10 + movsd 1 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm11 + addq $ 4 * SIZE, BO + + addsd %xmm2, %xmm12 + movsd 2 * SIZE(AO), %xmm2 + mulsd %xmm5, %xmm13 + decq %rax + + addsd %xmm3, %xmm14 + movsd 3 * SIZE(AO), %xmm3 + mulsd %xmm5, %xmm15 + movsd 1 * SIZE(BO), %xmm5 + + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + + addsd %xmm9, %xmm8 + addsd %xmm11, %xmm10 + addsd %xmm13, %xmm12 + addsd %xmm15, %xmm14 + + andq $3, %rax + BRANCH + BRANCH + je .L49 + ALIGN_4 + +.L46: + mulsd %xmm4, %xmm0 + mulsd %xmm4, %xmm1 + mulsd %xmm4, %xmm2 + mulsd %xmm4, %xmm3 + movsd 1 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm10 + movsd 5 * SIZE(AO), %xmm1 + addsd %xmm2, %xmm12 + movsd 6 * SIZE(AO), %xmm2 + addsd %xmm3, %xmm14 + movsd 7 * SIZE(AO), %xmm3 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + decq %rax + BRANCH + jg .L46 + ALIGN_4 + +.L49: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm2 + movsd 2 * SIZE(BO), %xmm4 + movsd 3 * SIZE(BO), %xmm6 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm12, %xmm4 + subsd %xmm14, %xmm6 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm2 + movsd 2 * SIZE(AO), %xmm4 + movsd 3 * SIZE(AO), %xmm6 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm12, %xmm4 + subsd %xmm14, %xmm6 +#endif + +#ifdef LN + movsd 15 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm6 + movsd 14 * SIZE(AO), %xmm9 + mulsd %xmm6, %xmm9 + movsd 13 * SIZE(AO), %xmm11 + subsd %xmm9, %xmm4 + movsd 12 * SIZE(AO), %xmm13 + mulsd %xmm6, %xmm11 + movsd 10 * SIZE(AO), %xmm8 + subsd %xmm11, %xmm2 + movsd 9 * SIZE(AO), %xmm9 + mulsd %xmm6, %xmm13 + movsd 8 * SIZE(AO), %xmm11 + subsd %xmm13, %xmm0 + + mulsd %xmm8, %xmm4 + movsd 5 * SIZE(AO), %xmm8 + mulsd %xmm4, %xmm9 + subsd %xmm9, %xmm2 + movsd 4 * SIZE(AO), %xmm9 + mulsd %xmm4, %xmm11 + subsd %xmm11, %xmm0 + movsd 0 * SIZE(AO), %xmm11 + mulsd %xmm8, %xmm2 + mulsd %xmm2, %xmm9 + subsd %xmm9, %xmm0 + mulsd %xmm11, %xmm0 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(AO), %xmm9 + mulsd %xmm0, %xmm9 + movsd 2 * SIZE(AO), %xmm11 + subsd %xmm9, %xmm2 + movsd 3 * SIZE(AO), %xmm13 + mulsd %xmm0, %xmm11 + movsd 5 * SIZE(AO), %xmm8 + subsd %xmm11, %xmm4 + movsd 6 * SIZE(AO), %xmm9 + mulsd %xmm0, %xmm13 + movsd 7 * SIZE(AO), %xmm11 + subsd %xmm13, %xmm6 + + mulsd %xmm8, %xmm2 + movsd 10 * SIZE(AO), %xmm8 + mulsd %xmm2, %xmm9 + subsd %xmm9, %xmm4 + movsd 11 * SIZE(AO), %xmm9 + mulsd %xmm2, %xmm11 + subsd %xmm11, %xmm6 + mulsd %xmm8, %xmm4 + movsd 15 * SIZE(AO), %xmm8 + mulsd %xmm4, %xmm9 + subsd %xmm9, %xmm6 + mulsd %xmm8, %xmm6 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + mulsd %xmm8, %xmm2 + mulsd %xmm8, %xmm4 + mulsd %xmm8, %xmm6 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + movsd %xmm4, 2 * SIZE(CO1) + movsd %xmm6, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm2, 1 * SIZE(BO) + movsd %xmm4, 2 * SIZE(BO) + movsd %xmm6, 3 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm2, 1 * SIZE(AO) + movsd %xmm4, 2 * SIZE(AO) + movsd %xmm6, 3 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L41 + ALIGN_4 + +.L50: + testq $2, M + je .L60 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm1 + xorps %xmm3, %xmm3 + + movsd 0 * SIZE(BO), %xmm4 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm5 + xorps %xmm10, %xmm10 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L55 + ALIGN_4 + +.L52: + addsd %xmm2, %xmm8 + movsd 2 * SIZE(AO), %xmm2 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm3, %xmm10 + movsd 3 * SIZE(AO), %xmm3 + mulsd %xmm4, %xmm1 + movsd 2 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm2 + addq $8 * SIZE, AO + + addsd %xmm1, %xmm10 + movsd -3 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm3 + movsd 3 * SIZE(BO), %xmm5 + + addsd %xmm2, %xmm8 + movsd -2 * SIZE(AO), %xmm2 + mulsd %xmm4, %xmm0 + addq $4 * SIZE, BO + + addsd %xmm3, %xmm10 + movsd -1 * SIZE(AO), %xmm3 + mulsd %xmm4, %xmm1 + movsd 0 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm2 + decq %rax + + addsd %xmm1, %xmm10 + movsd 1 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm3 + movsd 1 * SIZE(BO), %xmm5 + + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm10 + + andq $3, %rax + BRANCH + je .L59 + ALIGN_4 + +.L56: + mulsd %xmm4, %xmm0 + mulsd %xmm4, %xmm1 + movsd 1 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 2 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm10 + movsd 3 * SIZE(AO), %xmm1 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + decq %rax + BRANCH + jg .L56 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm2 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm2 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 +#endif + +#ifdef LN + movsd 3 * SIZE(AO), %xmm8 + movsd 2 * SIZE(AO), %xmm9 + movsd 0 * SIZE(AO), %xmm11 + mulsd %xmm8, %xmm2 + mulsd %xmm2, %xmm9 + subsd %xmm9, %xmm0 + mulsd %xmm11,%xmm0 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm8 + movsd 1 * SIZE(AO), %xmm9 + movsd 3 * SIZE(AO), %xmm11 + mulsd %xmm8, %xmm0 + mulsd %xmm0, %xmm9 + subsd %xmm9, %xmm2 + mulsd %xmm11,%xmm2 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + mulsd %xmm8, %xmm2 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm2, 1 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm2, 1 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + testq $1, M + je .L69 + +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm5, %xmm5 + movsd 1 * SIZE(AO), %xmm2 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm9, %xmm9 + movsd 2 * SIZE(AO), %xmm4 + movsd 3 * SIZE(AO), %xmm6 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L65 + ALIGN_4 + +.L62: + addsd %xmm5, %xmm8 + movsd 2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + movsd 4 * SIZE(AO), %xmm0 + + addsd %xmm7, %xmm9 + movsd 3 * SIZE(BO), %xmm7 + mulsd %xmm2, %xmm3 + movsd 5 * SIZE(AO), %xmm2 + + addsd %xmm1, %xmm8 + movsd 4 * SIZE(BO), %xmm1 + mulsd %xmm4, %xmm5 + movsd 6 * SIZE(AO), %xmm4 + + addsd %xmm3, %xmm9 + movsd 5 * SIZE(BO), %xmm3 + mulsd %xmm6, %xmm7 + movsd 7 * SIZE(AO), %xmm6 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + decq %rax + jne .L62 + + addsd %xmm5, %xmm8 + addsd %xmm7, %xmm9 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + je .L68 + ALIGN_4 + +.L66: + movsd 0 * SIZE(AO), %xmm0 + movsd 0 * SIZE(BO), %xmm1 + + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm8 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + decq %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: + addsd %xmm9, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + subsd %xmm8, %xmm0 +#else + movsd 0 * SIZE(AO), %xmm0 + subsd %xmm8, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L69: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_2 + +.L40: + movq N, J + sarq $1, J + jle .L999 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + prefetcht0 0 * SIZE(BB) + subq $-8 * SIZE, BB + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + prefetcht0 3 * SIZE(CO1) + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + prefetcht0 3 * SIZE(CO2) + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + addsd %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + PREFETCH (PREFETCHSIZE + 0) * SIZE(BO) + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 12 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 13 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 6 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 14 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 15 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + subq $-16 * SIZE, AO + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + addq $ 8 * SIZE, BO + + addsd %xmm4, %xmm10 + movsd 1 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + decq %rax + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 0 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 2 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 1 * SIZE(BO), %xmm3 + + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + je .L19 + ALIGN_4 + +.L16: + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L16 + ALIGN_4 + +.L19: + addsd %xmm2, %xmm13 + addsd %xmm7, %xmm14 + addsd %xmm6, %xmm15 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm1 + movsd 2 * SIZE(BO), %xmm2 + movsd 3 * SIZE(BO), %xmm3 + movsd 4 * SIZE(BO), %xmm4 + movsd 5 * SIZE(BO), %xmm5 + movsd 6 * SIZE(BO), %xmm6 + movsd 7 * SIZE(BO), %xmm7 + + subsd %xmm8, %xmm0 + subsd %xmm9, %xmm1 + subsd %xmm10, %xmm2 + subsd %xmm11, %xmm3 + subsd %xmm12, %xmm4 + subsd %xmm13, %xmm5 + subsd %xmm14, %xmm6 + subsd %xmm15, %xmm7 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm2 + movsd 2 * SIZE(AO), %xmm4 + movsd 3 * SIZE(AO), %xmm6 + + movsd 4 * SIZE(AO), %xmm1 + movsd 5 * SIZE(AO), %xmm3 + movsd 6 * SIZE(AO), %xmm5 + movsd 7 * SIZE(AO), %xmm7 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm12, %xmm4 + subsd %xmm14, %xmm6 + subsd %xmm9, %xmm1 + subsd %xmm11, %xmm3 + subsd %xmm13, %xmm5 + subsd %xmm15, %xmm7 +#endif + +#ifdef LN + movsd 15 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm6 + movsd 14 * SIZE(AO), %xmm9 + mulsd %xmm8, %xmm7 + movsd 13 * SIZE(AO), %xmm11 + + movaps %xmm9, %xmm10 + movsd 12 * SIZE(AO), %xmm13 + mulsd %xmm6, %xmm9 + movsd 10 * SIZE(AO), %xmm8 + mulsd %xmm7, %xmm10 + subsd %xmm9, %xmm4 + movsd 9 * SIZE(AO), %xmm9 + subsd %xmm10, %xmm5 + + movaps %xmm11, %xmm12 + mulsd %xmm6, %xmm11 + mulsd %xmm7, %xmm12 + subsd %xmm11, %xmm2 + movsd 8 * SIZE(AO), %xmm11 + subsd %xmm12, %xmm3 + + movaps %xmm13, %xmm14 + mulsd %xmm6, %xmm13 + mulsd %xmm7, %xmm14 + subsd %xmm13, %xmm0 + subsd %xmm14, %xmm1 + + mulsd %xmm8, %xmm4 + mulsd %xmm8, %xmm5 + movsd 5 * SIZE(AO), %xmm8 + + movaps %xmm9, %xmm10 + mulsd %xmm4, %xmm9 + mulsd %xmm5, %xmm10 + subsd %xmm9, %xmm2 + movsd 4 * SIZE(AO), %xmm9 + subsd %xmm10, %xmm3 + + movaps %xmm11, %xmm12 + mulsd %xmm4, %xmm11 + mulsd %xmm5, %xmm12 + subsd %xmm11, %xmm0 + movsd 0 * SIZE(AO), %xmm11 + subsd %xmm12, %xmm1 + + mulsd %xmm8, %xmm2 + mulsd %xmm8, %xmm3 + + movaps %xmm9, %xmm10 + mulsd %xmm2, %xmm9 + mulsd %xmm3, %xmm10 + subsd %xmm9, %xmm0 + subsd %xmm10, %xmm1 + + mulsd %xmm11, %xmm0 + mulsd %xmm11, %xmm1 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(AO), %xmm9 + mulsd %xmm8, %xmm1 + + movsd 2 * SIZE(AO), %xmm11 + movaps %xmm9, %xmm10 + movsd 3 * SIZE(AO), %xmm13 + mulsd %xmm0, %xmm9 + movsd 5 * SIZE(AO), %xmm8 + mulsd %xmm1, %xmm10 + subsd %xmm9, %xmm2 + movsd 6 * SIZE(AO), %xmm9 + subsd %xmm10, %xmm3 + + movaps %xmm11, %xmm12 + mulsd %xmm0, %xmm11 + mulsd %xmm1, %xmm12 + subsd %xmm11, %xmm4 + movsd 7 * SIZE(AO), %xmm11 + subsd %xmm12, %xmm5 + + movaps %xmm13, %xmm14 + mulsd %xmm0, %xmm13 + mulsd %xmm1, %xmm14 + subsd %xmm13, %xmm6 + subsd %xmm14, %xmm7 + + mulsd %xmm8, %xmm2 + mulsd %xmm8, %xmm3 + movsd 10 * SIZE(AO), %xmm8 + + movaps %xmm9, %xmm10 + mulsd %xmm2, %xmm9 + mulsd %xmm3, %xmm10 + subsd %xmm9, %xmm4 + movsd 11 * SIZE(AO), %xmm9 + subsd %xmm10, %xmm5 + + movaps %xmm11, %xmm12 + mulsd %xmm2, %xmm11 + mulsd %xmm3, %xmm12 + subsd %xmm11, %xmm6 + subsd %xmm12, %xmm7 + + mulsd %xmm8, %xmm4 + mulsd %xmm8, %xmm5 + movsd 15 * SIZE(AO), %xmm8 + + movaps %xmm9, %xmm10 + mulsd %xmm4, %xmm9 + mulsd %xmm5, %xmm10 + subsd %xmm9, %xmm6 + subsd %xmm10, %xmm7 + + mulsd %xmm8, %xmm6 + mulsd %xmm8, %xmm7 +#endif + +#ifdef RN + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm2 + movsd 3 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm4 + mulsd %xmm8, %xmm6 + + movaps %xmm9, %xmm10 + movaps %xmm9, %xmm11 + movaps %xmm9, %xmm12 + + mulsd %xmm0, %xmm9 + mulsd %xmm2, %xmm10 + mulsd %xmm4, %xmm11 + mulsd %xmm6, %xmm12 + + subsd %xmm9, %xmm1 + subsd %xmm10, %xmm3 + subsd %xmm11, %xmm5 + subsd %xmm12, %xmm7 + + mulsd %xmm13, %xmm1 + mulsd %xmm13, %xmm3 + mulsd %xmm13, %xmm5 + mulsd %xmm13, %xmm7 +#endif + +#ifdef RT + movsd 3 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm1 + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm3 + movsd 0 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm5 + mulsd %xmm8, %xmm7 + + movaps %xmm9, %xmm10 + movaps %xmm9, %xmm11 + movaps %xmm9, %xmm12 + + mulsd %xmm1, %xmm9 + mulsd %xmm3, %xmm10 + mulsd %xmm5, %xmm11 + mulsd %xmm7, %xmm12 + + subsd %xmm9, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm11, %xmm4 + subsd %xmm12, %xmm6 + + mulsd %xmm13, %xmm0 + mulsd %xmm13, %xmm2 + mulsd %xmm13, %xmm4 + mulsd %xmm13, %xmm6 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + movsd %xmm4, 2 * SIZE(CO1) + movsd %xmm6, 3 * SIZE(CO1) + + movsd %xmm1, 0 * SIZE(CO2) + movsd %xmm3, 1 * SIZE(CO2) + movsd %xmm5, 2 * SIZE(CO2) + movsd %xmm7, 3 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm1, 1 * SIZE(BO) + movsd %xmm2, 2 * SIZE(BO) + movsd %xmm3, 3 * SIZE(BO) + movsd %xmm4, 4 * SIZE(BO) + movsd %xmm5, 5 * SIZE(BO) + movsd %xmm6, 6 * SIZE(BO) + movsd %xmm7, 7 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm2, 1 * SIZE(AO) + movsd %xmm4, 2 * SIZE(AO) + movsd %xmm6, 3 * SIZE(AO) + movsd %xmm1, 4 * SIZE(AO) + movsd %xmm3, 5 * SIZE(AO) + movsd %xmm5, 6 * SIZE(AO) + movsd %xmm7, 7 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + je .L30 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + movsd 3 * SIZE(AO), %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm8 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + addsd %xmm7, %xmm10 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 6 * SIZE(BO), %xmm1 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + addsd %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 8 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm8 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + addsd %xmm7, %xmm10 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 9 * SIZE(BO), %xmm3 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + je .L29 + ALIGN_4 + +.L26: + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + mulsd %xmm3, %xmm2 + addsd %xmm0, %xmm8 + movsd 2 * SIZE(AO), %xmm0 + + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + addsd %xmm4, %xmm10 + movsd 3 * SIZE(AO), %xmm4 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L26 + ALIGN_4 + +.L29: + addsd %xmm2, %xmm9 + addsd %xmm6, %xmm11 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm1 + movsd 2 * SIZE(BO), %xmm2 + movsd 3 * SIZE(BO), %xmm3 + + subsd %xmm8, %xmm0 + subsd %xmm9, %xmm1 + subsd %xmm10, %xmm2 + subsd %xmm11, %xmm3 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm2 + movsd 2 * SIZE(AO), %xmm1 + movsd 3 * SIZE(AO), %xmm3 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm9, %xmm1 + subsd %xmm11, %xmm3 +#endif + +#ifdef LN + movsd 3 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm2 + movsd 2 * SIZE(AO), %xmm9 + mulsd %xmm8, %xmm3 + movsd 0 * SIZE(AO), %xmm13 + + movaps %xmm9, %xmm10 + mulsd %xmm2, %xmm9 + mulsd %xmm3, %xmm10 + + subsd %xmm9, %xmm0 + subsd %xmm10, %xmm1 + + mulsd %xmm13, %xmm0 + mulsd %xmm13, %xmm1 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(AO), %xmm9 + mulsd %xmm8, %xmm1 + movsd 3 * SIZE(AO), %xmm13 + + movaps %xmm9, %xmm10 + mulsd %xmm0, %xmm9 + mulsd %xmm1, %xmm10 + + subsd %xmm9, %xmm2 + subsd %xmm10, %xmm3 + + mulsd %xmm13, %xmm2 + mulsd %xmm13, %xmm3 +#endif + +#ifdef RN + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm2 + movsd 3 * SIZE(BO), %xmm13 + + movaps %xmm9, %xmm10 + mulsd %xmm0, %xmm9 + mulsd %xmm2, %xmm10 + + subsd %xmm9, %xmm1 + subsd %xmm10, %xmm3 + + mulsd %xmm13, %xmm1 + mulsd %xmm13, %xmm3 +#endif + +#ifdef RT + movsd 3 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm1 + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm3 + movsd 0 * SIZE(BO), %xmm13 + + movaps %xmm9, %xmm10 + mulsd %xmm1, %xmm9 + mulsd %xmm3, %xmm10 + + subsd %xmm9, %xmm0 + subsd %xmm10, %xmm2 + + mulsd %xmm13, %xmm0 + mulsd %xmm13, %xmm2 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movsd %xmm3, 1 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm1, 1 * SIZE(BO) + movsd %xmm2, 2 * SIZE(BO) + movsd %xmm3, 3 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm2, 1 * SIZE(AO) + movsd %xmm1, 2 * SIZE(AO) + movsd %xmm3, 3 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm7, %xmm7 + movsd 1 * SIZE(AO), %xmm2 + xorps %xmm5, %xmm5 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L35 + ALIGN_4 + +.L32: + addsd %xmm5, %xmm8 + movsd 2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm7, %xmm9 + movsd 3 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm3 + movsd 2 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 4 * SIZE(BO), %xmm1 + mulsd %xmm2, %xmm5 + + addsd %xmm3, %xmm9 + movsd 5 * SIZE(BO), %xmm3 + mulsd %xmm2, %xmm7 + movsd 3 * SIZE(AO), %xmm2 + + addsd %xmm5, %xmm8 + movsd 6 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + + addsd %xmm7, %xmm9 + movsd 7 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm3 + movsd 4 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 8 * SIZE(BO), %xmm1 + mulsd %xmm2, %xmm5 + + addsd %xmm3, %xmm9 + movsd 9 * SIZE(BO), %xmm3 + mulsd %xmm2, %xmm7 + movsd 5 * SIZE(AO), %xmm2 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + addsd %xmm5, %xmm8 + addsd %xmm7, %xmm9 + + andq $3, %rax + BRANCH + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulsd %xmm0, %xmm1 + addq $2 * SIZE, BO + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 0 * SIZE(BO), %xmm1 + addsd %xmm3, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + + addq $1 * SIZE, AO + decq %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm1 + + subsd %xmm8, %xmm0 + subsd %xmm9, %xmm1 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm1 + + subsd %xmm8, %xmm0 + subsd %xmm9, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + mulsd %xmm8, %xmm1 +#endif + +#ifdef RN + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(BO), %xmm9 + mulsd %xmm0, %xmm9 + movsd 3 * SIZE(BO), %xmm13 + subsd %xmm9, %xmm1 + mulsd %xmm13, %xmm1 +#endif + +#ifdef RT + movsd 3 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm1 + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm1, %xmm9 + movsd 0 * SIZE(BO), %xmm13 + subsd %xmm9, %xmm0 + mulsd %xmm13, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm1, 1 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm1, 1 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J # j -- + jg .L10 + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_RT_4x4_barcelona.S b/kernel/x86_64/trsm_kernel_RT_4x4_barcelona.S new file mode 100644 index 0000000..400f60e --- /dev/null +++ b/kernel/x86_64/trsm_kernel_RT_4x4_barcelona.S @@ -0,0 +1,3393 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %r12 +#define BB %rbp +#define J %rbx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#define OFFSET 48(%rsp) +#define AORIG 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define AORIG 232(%rsp) +#define KK 240(%rsp) +#define KKK 248(%rsp) + +#endif + +#define PREFETCH prefetch +#define PREFETCHSIZE (8 * 7 + 0) + +#define movlpd movsd +#define movapd movups +#define movupd movups + +#define KERNEL1(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO, %rax, 4) ;\ + addpd %xmm1, %xmm12 ;\ + movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + +#define KERNEL2(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + addpd %xmm1, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ +/**/ movddup (BO, %rax, 4), %xmm1 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL3(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ + addpd %xmm5, %xmm14 ;\ + movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL4(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ +/***/ movapd (AO, %rax, 4), %xmm6 ;\ + addpd %xmm4, %xmm10 ;\ + addpd %xmm5, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ + movddup 8 * SIZE(BO, %rax, 4), %xmm5 ;\ + movapd %xmm6, %xmm2 + +#define KERNEL5(xx) \ + mulpd %xmm1, %xmm6 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm8 ;\ + movapd %xmm2, %xmm6 ;\ + addpd %xmm1, %xmm12 ;\ + movddup 2 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ +/**/ movapd 8 * SIZE(AO, %rax, 4), %xmm7 ;\ + movapd %xmm6, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 3 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm6 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm10 ;\ + movapd 4 * SIZE(AO, %rax, 4), %xmm6 ;\ + addpd %xmm1, %xmm14 ;\ + movddup 4 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm6, %xmm2 + +#define KERNEL6(xx) \ + mulpd %xmm1, %xmm6 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm8 ;\ + movapd %xmm2, %xmm6 ;\ + addpd %xmm1, %xmm12 ;\ + movddup 6 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm6, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 7 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm6 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm10 ;\ +/***/ movapd 16 * SIZE(AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 9 * SIZE(BO, %rax, 4), %xmm3 ;\ + movddup 16 * SIZE(BO, %rax, 4), %xmm1 ;\ + movapd %xmm7, %xmm2 + +#define KERNEL7(xx) \ + mulpd %xmm5, %xmm7 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm8 ;\ + movapd %xmm2, %xmm7 ;\ + addpd %xmm5, %xmm12 ;\ + movddup 10 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm7, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 11 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm7 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm10 ;\ + movapd 12 * SIZE(AO, %rax, 4), %xmm7 ;\ + addpd %xmm5, %xmm14 ;\ + movddup 12 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 13 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm7, %xmm2 + +#define KERNEL8(xx) \ + mulpd %xmm5, %xmm7 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm8 ;\ + movapd %xmm2, %xmm7 ;\ + addpd %xmm5, %xmm12 ;\ + movddup 14 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm7, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 15 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm7 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm10 ;\ + addpd %xmm5, %xmm14 ;\ +/**/ movapd 24 * SIZE(AO, %rax, 4), %xmm4 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + movddup 24 * SIZE(BO, %rax, 4), %xmm5 ;\ + movapd %xmm0, %xmm2 ;\ + addq $8 * SIZE, %rax + +#define KERNEL_SUB1(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + +#define KERNEL_SUB2(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd (AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup (BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL_SUB3(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ + addpd %xmm5, %xmm14 ;\ + movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL_SUB4(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + addpd %xmm5, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm12 +#else + movq STACKSIZE + 8(%rsp), LDC + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movsd %xmm12, OFFSET + movsd %xmm12, KK + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, N + je .L40 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (BO, %rax, SIZE), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + movddup -14 * SIZE(BO), %xmm3 + +#ifndef LN + prefetchw 3 * SIZE(CO1) +#else + prefetchw -8 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L96 + ALIGN_4 + +.L92: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm9 + movddup -12 * SIZE(BO, %rax, 1), %xmm1 + mulpd %xmm5, %xmm0 + mulpd -10 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm0, %xmm10 + movapd (AO, %rax, 4), %xmm0 + addpd %xmm5, %xmm11 + movddup -13 * SIZE(BO, %rax, 1), %xmm5 + mulpd %xmm3, %xmm2 + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm8 + movapd -4 * SIZE(AO, %rax, 4), %xmm2 + addpd %xmm3, %xmm9 + movddup -10 * SIZE(BO, %rax, 1), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm10 + movapd 8 * SIZE(AO, %rax, 4), %xmm2 + addpd %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 1), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L92 + ALIGN_4 + +.L96: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L99 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L97: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm9 + movddup -15 * SIZE(BO, %rax, 1), %xmm1 + + addq $SIZE, %rax + jl .L97 + ALIGN_4 +.L99: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm10 + movapd -14 * SIZE(BO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm9, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm10 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm9, %xmm11 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + movsd -2 * SIZE(AO), %xmm13 + mulsd %xmm9, %xmm13 + subsd %xmm13, %xmm11 + movsd -3 * SIZE(AO), %xmm14 + mulsd %xmm9, %xmm14 + subsd %xmm14, %xmm8 + movsd -4 * SIZE(AO), %xmm15 + mulsd %xmm9, %xmm15 + subsd %xmm15, %xmm10 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -7 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm8 + movsd -8 * SIZE(AO), %xmm14 + mulsd %xmm11, %xmm14 + subsd %xmm14, %xmm10 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -12 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + movsd -14 * SIZE(AO), %xmm14 + mulsd %xmm10, %xmm14 + subsd %xmm14, %xmm11 + movsd -13 * SIZE(AO), %xmm15 + mulsd %xmm10, %xmm15 + subsd %xmm15, %xmm9 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -10 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm11 + movsd -9 * SIZE(AO), %xmm14 + mulsd %xmm8, %xmm14 + subsd %xmm14, %xmm9 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -5 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movlpd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movlpd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movaps %xmm10, -16 * SIZE(BO) + movaps %xmm11, -14 * SIZE(BO) +#else + movaps %xmm10, -16 * SIZE(AO) + movaps %xmm11, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + addq %rax, BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (BO, %rax, SIZE), BO +#endif + + movddup -16 * SIZE(BO), %xmm0 + pxor %xmm8, %xmm8 + movddup -15 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movddup -14 * SIZE(BO), %xmm2 + pxor %xmm10, %xmm10 + movddup -13 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L106 + ALIGN_4 + +.L102: + mulpd -16 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -12 * SIZE(BO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(AO, %rax, 2), %xmm1 + addpd %xmm1, %xmm9 + movddup -11 * SIZE(BO, %rax, 1), %xmm1 + + mulpd -12 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm2, %xmm10 + movddup -10 * SIZE(BO, %rax, 1), %xmm2 + + mulpd -10 * SIZE(AO, %rax, 2), %xmm3 + addpd %xmm3, %xmm11 + movddup -9 * SIZE(BO, %rax, 1), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L102 + ALIGN_4 + +.L106: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L109 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L107: + movddup -16 * SIZE(BO, %rax, 1), %xmm0 + mulpd -16 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + + addq $SIZE, %rax + jl .L107 + ALIGN_4 + +.L109: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm10 + subpd %xmm8, %xmm10 +#else + movapd -16 * SIZE(AO), %xmm10 + subpd %xmm8, %xmm10 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -14 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#else + movlpd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm10, -16 * SIZE(BO) +#else + movaps %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + addq %rax, BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + testq $1, M + je .L119 + +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (BO, %rax, SIZE), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -14 * SIZE(AO), %xmm1 + pxor %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L116 + ALIGN_4 + +.L112: + mulpd -16 * SIZE(BO, %rax, 1), %xmm0 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(BO, %rax, 1), %xmm1 + addpd %xmm1, %xmm9 + movapd -10 * SIZE(AO, %rax, 1), %xmm1 + + addq $4 * SIZE, %rax + BRANCH + jl .L112 + ALIGN_4 + +.L116: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L118 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L117: + mulsd -16 * SIZE(BO, %rax, 1), %xmm0 + addsd %xmm0, %xmm8 + movsd -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L117 + ALIGN_4 + +.L118: + addpd %xmm9, %xmm8 + haddpd %xmm8, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BO), %xmm10 + subsd %xmm8, %xmm10 +#else + movsd -16 * SIZE(AO), %xmm10 + subsd %xmm8, %xmm10 +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#if defined(RN) || defined(RT) + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + + movsd %xmm10, 0 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movlpd %xmm10, -16 * SIZE(BO) +#else + movlpd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + addq %rax, AO + addq %rax, BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L119: +#ifdef LN + leaq (B, K, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L40: + testq $2, N + je .L80 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 2), BO +#endif + + movddup -16 * SIZE(BO), %xmm1 + movddup -15 * SIZE(BO), %xmm5 + pxor %xmm8, %xmm8 + movddup -12 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm12, %xmm12 + movapd -8 * SIZE(AO), %xmm4 + pxor %xmm13, %xmm13 + +#ifndef LN + prefetchw 3 * SIZE(CO1) + movapd %xmm0, %xmm2 + prefetchw 5 * SIZE(CO2) +#else + prefetchw -4 * SIZE(CO1) + movapd %xmm0, %xmm2 + prefetchw -4 * SIZE(CO2) +#endif + + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L56 + ALIGN_4 + +.L52: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -13 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + mulpd %xmm1, %xmm0 + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd (AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -8 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -10 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -11 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm4, %xmm2 + mulpd %xmm3, %xmm4 + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm4, %xmm8 + movapd -4 * SIZE(AO, %rax, 4), %xmm4 + addpd %xmm3, %xmm12 + movddup -10 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -9 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm4, %xmm2 + mulpd %xmm3, %xmm4 + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm4, %xmm8 + movapd 8 * SIZE(AO, %rax, 4), %xmm4 + addpd %xmm3, %xmm12 + movddup -4 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -7 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L52 + ALIGN_4 + +.L56: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L59 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L57: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -13 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L57 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + movapd -12 * SIZE(BO), %xmm1 + movapd -10 * SIZE(BO), %xmm5 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 + subpd %xmm12, %xmm1 + subpd %xmm4, %xmm5 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm13 + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm9 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm9 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm9, %xmm12 + subpd %xmm12, %xmm1 + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm9, %xmm14 + subpd %xmm14, %xmm5 + + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm13, %xmm12 + subpd %xmm12, %xmm5 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm1, %xmm9 + subpd %xmm9, %xmm3 + + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -14 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + movddup -14 * SIZE(BO), %xmm9 + mulpd %xmm3, %xmm9 + subpd %xmm9, %xmm1 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm9, 0 * SIZE(CO1) + movlpd %xmm13, 1 * SIZE(CO1) + movlpd %xmm1, 2 * SIZE(CO1) + movlpd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movlpd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movlpd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movlpd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -16 * SIZE(BO) + movaps %xmm13,-14 * SIZE(BO) + movaps %xmm1, -12 * SIZE(BO) + movaps %xmm5, -10 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm1, -14 * SIZE(AO) + movaps %xmm2, -12 * SIZE(AO) + movaps %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 2), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -12 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L66 + ALIGN_4 + +.L62: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm9 + movddup -13 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -8 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm11 + movddup -11 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm8 + movddup -10 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm2, %xmm3 + movapd -10 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm3, %xmm9 + movddup -9 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm10 + movddup -8 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm2, %xmm3 + movapd -4 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm3, %xmm11 + movddup -7 * SIZE(BO, %rax, 2), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L62 + ALIGN_4 + +.L66: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L69 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L67: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm9 + movddup -13 * SIZE(BO, %rax, 2), %xmm3 + + addq $SIZE, %rax + jl .L67 + ALIGN_4 + +.L69: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 +#endif + + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + + movddup -14 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm9, 0 * SIZE(CO1) + movlpd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movlpd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -16 * SIZE(BO) + movaps %xmm13, -14 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm2, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + testq $1, M + je .L79 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 1), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movddup -15 * SIZE(AO), %xmm1 + pxor %xmm9, %xmm9 + movddup -14 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movddup -13 * SIZE(AO), %xmm3 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L76 + ALIGN_4 + +.L72: + mulpd -16 * SIZE(BO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -12 * SIZE(AO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(BO, %rax, 2), %xmm1 + addpd %xmm1, %xmm9 + movddup -11 * SIZE(AO, %rax, 1), %xmm1 + + mulpd -12 * SIZE(BO, %rax, 2), %xmm2 + addpd %xmm2, %xmm10 + movddup -10 * SIZE(AO, %rax, 1), %xmm2 + + mulpd -10 * SIZE(BO, %rax, 2), %xmm3 + addpd %xmm3, %xmm11 + movddup -9 * SIZE(AO, %rax, 1), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L72 + ALIGN_4 + +.L76: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L78 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L77: + mulpd -16 * SIZE(BO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L77 + ALIGN_4 + +.L78: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm2 +#else + movapd -16 * SIZE(AO), %xmm2 +#endif + + subpd %xmm8, %xmm2 + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm2 +#endif + +#ifdef RN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + mulsd -16 * SIZE(BO), %xmm2 + movsd -15 * SIZE(BO), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm0 + + mulsd -13 * SIZE(BO), %xmm0 + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef RT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + mulsd -13 * SIZE(BO), %xmm0 + + movlpd -14 * SIZE(BO), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm2 + + mulsd -16 * SIZE(BO), %xmm2 + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movaps %xmm2, -16 * SIZE(BO) +#else + movaps %xmm2, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L80: + movq N, J + sarq $2, J # j = (n >> 2) + jle .L999 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 2, %rax + movq B, BB + subq %rax, BB + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm8, %xmm8 + movddup -15 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm4 + pxor %xmm10, %xmm10 + movddup -8 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + +#ifndef LN + prefetchw 3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetchw 5 * SIZE(CO2) + pxor %xmm13, %xmm13 + prefetchw 3 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + prefetchw 5 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 + movapd %xmm0, %xmm2 +#else + prefetchw -8 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetchw -8 * SIZE(CO2) + pxor %xmm13, %xmm13 + prefetchw -8 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + prefetchw -8 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 + movapd %xmm0, %xmm2 +#endif + + prefetch -16 * SIZE(BB) + prefetch -8 * SIZE(BB) + subq $-16 * SIZE, BB + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + + andq $-8, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_4 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + BRANCH + jl .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + testq $4, %rax + je .L16 + xorq %rax, %rax + ALIGN_4 + + KERNEL_SUB1(16 * 0) + KERNEL_SUB2(16 * 0) + KERNEL_SUB3(16 * 0) + KERNEL_SUB4(16 * 0) + + subq $-16 * SIZE, BO + subq $-16 * SIZE, AO + ALIGN_4 + +.L16: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L19 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L17: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd %xmm2, %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm3, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm9 + movapd %xmm0, %xmm2 + addpd %xmm3, %xmm13 + movddup -13 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm10 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm14 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm3, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm11 + addpd %xmm3, %xmm15 + movddup -11 * SIZE(BO, %rax, 4), %xmm3 + movapd %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L17 + ALIGN_4 + +.L19: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd %xmm14, %xmm6 + unpcklpd %xmm15, %xmm14 + unpckhpd %xmm15, %xmm6 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + movapd -8 * SIZE(BO), %xmm1 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm5 + movapd -2 * SIZE(BO), %xmm7 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 + subpd %xmm12, %xmm1 + subpd %xmm14, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + movapd -8 * SIZE(AO), %xmm4 + movapd -6 * SIZE(AO), %xmm5 + movapd -4 * SIZE(AO), %xmm6 + movapd -2 * SIZE(AO), %xmm7 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 + subpd %xmm10, %xmm4 + subpd %xmm14, %xmm5 + subpd %xmm11, %xmm6 + subpd %xmm15, %xmm7 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm7, %xmm10 + subpd %xmm10, %xmm3 + + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm13 + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm15 + + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm9 + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm7, %xmm14 + subpd %xmm14, %xmm11 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm15 + + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm9 + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm11 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm15, %xmm10 + subpd %xmm10, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm11, %xmm10 + subpd %xmm10, %xmm15 + + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm9, %xmm12 + subpd %xmm12, %xmm1 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm3 + + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm9, %xmm14 + subpd %xmm14, %xmm5 + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm11, %xmm14 + subpd %xmm14, %xmm7 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm15, %xmm10 + subpd %xmm10, %xmm3 + + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm13, %xmm12 + subpd %xmm12, %xmm5 + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm7 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm1, %xmm9 + subpd %xmm9, %xmm3 + + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm6 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm1, %xmm11 + subpd %xmm11, %xmm7 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -10 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm4 + movddup -10 * SIZE(BO), %xmm9 + mulpd %xmm3, %xmm9 + subpd %xmm9, %xmm5 + + movddup -9 * SIZE(BO), %xmm10 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + movddup -9 * SIZE(BO), %xmm10 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm7 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -5 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm6 + movddup -5 * SIZE(BO), %xmm9 + mulpd %xmm5, %xmm9 + subpd %xmm9, %xmm7 + + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(BO), %xmm9 + mulpd %xmm6, %xmm9 + subpd %xmm9, %xmm4 + movddup -2 * SIZE(BO), %xmm9 + mulpd %xmm7, %xmm9 + subpd %xmm9, %xmm5 + + movddup -3 * SIZE(BO), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + movddup -3 * SIZE(BO), %xmm10 + mulpd %xmm7, %xmm10 + subpd %xmm10, %xmm3 + + movddup -4 * SIZE(BO), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm0 + movddup -4 * SIZE(BO), %xmm11 + mulpd %xmm7, %xmm11 + subpd %xmm11, %xmm1 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm2 + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm5, %xmm9 + subpd %xmm9, %xmm3 + + movddup -8 * SIZE(BO), %xmm10 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + movddup -8 * SIZE(BO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -12 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + movddup -12 * SIZE(BO), %xmm9 + mulpd %xmm3, %xmm9 + subpd %xmm9, %xmm1 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm9, 0 * SIZE(CO1) + movlpd %xmm13, 1 * SIZE(CO1) + movlpd %xmm1, 2 * SIZE(CO1) + movlpd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) + + movlpd %xmm11, 0 * SIZE(CO1, LDC, 2) + movlpd %xmm15, 1 * SIZE(CO1, LDC, 2) + movlpd %xmm3, 2 * SIZE(CO1, LDC, 2) + movlpd %xmm7, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movlpd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movlpd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movlpd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) + + movlpd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + movlpd %xmm5, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) + + movlpd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) + movlpd %xmm7, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -16 * SIZE(BO) + movaps %xmm11, -14 * SIZE(BO) + movaps %xmm13, -12 * SIZE(BO) + movaps %xmm15, -10 * SIZE(BO) + movaps %xmm1, -8 * SIZE(BO) + movaps %xmm3, -6 * SIZE(BO) + movaps %xmm5, -4 * SIZE(BO) + movaps %xmm7, -2 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm1, -14 * SIZE(AO) + movaps %xmm2, -12 * SIZE(AO) + movaps %xmm3, -10 * SIZE(AO) + movaps %xmm4, -8 * SIZE(AO) + movaps %xmm5, -6 * SIZE(AO) + movaps %xmm6, -4 * SIZE(AO) + movaps %xmm7, -2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $3, M + je .L39 + + testq $2, M + je .L30 + ALIGN_4 + +.L21: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -12 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + movddup -8 * SIZE(BO), %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L26 + ALIGN_4 + +.L22: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -13 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -10 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -9 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup (BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -8 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -7 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm8 + movddup -6 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm9 + movddup -5 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm10 + movddup -4 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + movapd -10 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm5, %xmm11 + movddup -3 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm8 + movddup -2 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm9 + movddup -1 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm10 + movddup 8 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + movapd -4 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm5, %xmm11 + movddup 1 * SIZE(BO, %rax, 4), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L22 + ALIGN_4 + +.L26: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L29 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L27: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -13 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 4), %xmm5 + + addq $SIZE, %rax + jl .L27 + ALIGN_4 + +.L29: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + movapd -12 * SIZE(AO), %xmm4 + movapd -10 * SIZE(AO), %xmm6 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 + subpd %xmm10, %xmm4 + subpd %xmm11, %xmm6 +#endif + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm15, %xmm10 + subpd %xmm10, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm11, %xmm10 + subpd %xmm10, %xmm15 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm6 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + movddup -10 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm4 + movddup -9 * SIZE(BO), %xmm10 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + + movddup -5 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm6 + + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 +#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + + movddup -2 * SIZE(BO), %xmm9 + mulpd %xmm6, %xmm9 + subpd %xmm9, %xmm4 + movddup -3 * SIZE(BO), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + movddup -4 * SIZE(BO), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm0 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm2 + movddup -8 * SIZE(BO), %xmm10 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + movddup -12 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm9, 0 * SIZE(CO1) + movlpd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + + movlpd %xmm11, 0 * SIZE(CO1, LDC, 2) + movlpd %xmm15, 1 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movlpd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + + movlpd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + + movlpd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -16 * SIZE(BO) + movaps %xmm11, -14 * SIZE(BO) + movaps %xmm13, -12 * SIZE(BO) + movaps %xmm15, -10 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm2, -14 * SIZE(AO) + movaps %xmm4, -12 * SIZE(AO) + movaps %xmm6, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 4), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movddup -14 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -15 * SIZE(AO), %xmm4 + pxor %xmm10, %xmm10 + movapd -16 * SIZE(BO), %xmm1 + pxor %xmm11, %xmm11 + movapd -8 * SIZE(BO), %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L36 + ALIGN_4 + +.L32: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO, %rax, 4), %xmm0 + addpd %xmm1, %xmm8 + movapd -12 * SIZE(BO, %rax, 4), %xmm1 + addpd %xmm0, %xmm9 + movddup -12 * SIZE(AO, %rax, 1), %xmm0 + mulpd %xmm4, %xmm1 + mulpd -10 * SIZE(BO, %rax, 4), %xmm4 + addpd %xmm1, %xmm10 + movapd (BO, %rax, 4), %xmm1 + addpd %xmm4, %xmm11 + movddup -11 * SIZE(AO, %rax, 1), %xmm4 + mulpd %xmm2, %xmm3 + mulpd -6 * SIZE(BO, %rax, 4), %xmm2 + addpd %xmm3, %xmm8 + movapd -4 * SIZE(BO, %rax, 4), %xmm3 + addpd %xmm2, %xmm9 + movddup -13 * SIZE(AO, %rax, 1), %xmm2 + mulpd %xmm2, %xmm3 + mulpd -2 * SIZE(BO, %rax, 4), %xmm2 + addpd %xmm3, %xmm10 + movapd 8 * SIZE(BO, %rax, 4), %xmm3 + addpd %xmm2, %xmm11 + movddup -10 * SIZE(AO, %rax, 1), %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L32 + ALIGN_4 + +.L36: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L38 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L37: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO, %rax, 4), %xmm0 + addpd %xmm1, %xmm8 + movapd -12 * SIZE(BO, %rax, 4), %xmm1 + addpd %xmm0, %xmm9 + movddup -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L37 + ALIGN_4 + +.L38: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + + subpd %xmm8, %xmm2 + subpd %xmm9, %xmm3 +#else + movapd -16 * SIZE(AO), %xmm2 + movapd -14 * SIZE(AO), %xmm3 + + subpd %xmm8, %xmm2 + subpd %xmm9, %xmm3 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd -16 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd -15 * SIZE(BO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + movsd -14 * SIZE(BO), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + movsd -13 * SIZE(BO), %xmm7 + mulsd %xmm2, %xmm7 + subsd %xmm7, %xmm1 + + movsd -11 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -10 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm3 + movsd -9 * SIZE(BO), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + + movsd -6 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd -5 * SIZE(BO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + + movsd -1 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm1 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef RT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd -1 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm1 + + movsd -2 * SIZE(BO), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + movsd -3 * SIZE(BO), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + movsd -4 * SIZE(BO), %xmm7 + mulsd %xmm1, %xmm7 + subsd %xmm7, %xmm2 + + movsd -6 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd -7 * SIZE(BO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm0 + movsd -8 * SIZE(BO), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + + movsd -11 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -12 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd -16 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 + +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + movlpd %xmm3, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) +#else + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + movlpd %xmm3, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, -16 * SIZE(BO) + movaps %xmm3, -14 * SIZE(BO) +#else + movaps %xmm2, -16 * SIZE(AO) + movaps %xmm3, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq J # j -- + jg .L01 + ALIGN_4 + +.L999: + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_RT_4x4_core2.S b/kernel/x86_64/trsm_kernel_RT_4x4_core2.S new file mode 100644 index 0000000..89d07ce --- /dev/null +++ b/kernel/x86_64/trsm_kernel_RT_4x4_core2.S @@ -0,0 +1,3737 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define J 0(%rsp) +#define OFFSET 8(%rsp) +#define KK 16(%rsp) +#define KKK 24(%rsp) +#define AORIG 32(%rsp) +#define BORIG 40(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCH_W (PREFETCH_R) + +#define PREFETCHSIZE (8 * 17 + 2) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, %rax + + movq %rsp, %r15 # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq %rax, KK + movq %rax, OFFSET + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, N + je .L40 + ALIGN_4 + +.L81: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + jle .L83 + ALIGN_4 + +.L82: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + movapd %xmm2, 4 * SIZE(BO) + movapd %xmm3, 6 * SIZE(BO) + movapd %xmm4, 8 * SIZE(BO) + movapd %xmm5, 10 * SIZE(BO) + movapd %xmm6, 12 * SIZE(BO) + movapd %xmm7, 14 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-16 * SIZE, BO + subq $1, %rax + jne .L82 + ALIGN_4 + +.L83: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax + BRANCH + jle .L90 + ALIGN_4 + +.L84: + movddup -16 * SIZE(B), %xmm0 + + movapd %xmm0, 0 * SIZE(BO) + + addq $1 * SIZE, B + addq $2 * SIZE, BO + subq $1, %rax + jne .L84 + ALIGN_4 + +.L90: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 +#ifdef LN + prefetcht2 -3 * SIZE(CO1) +#else + prefetcht2 3 * SIZE(CO1) +#endif + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L95 + ALIGN_4 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + movapd -14 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 + + movapd -8 * SIZE(AO), %xmm0 + movapd -6 * SIZE(AO), %xmm1 + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + movapd -4 * SIZE(AO), %xmm0 + movapd -2 * SIZE(AO), %xmm1 + movapd -10 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L99 + ALIGN_4 + +.L96: + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L96 + ALIGN_4 + +.L99: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm10 + movapd -14 * SIZE(B), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm9, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm10 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm9, %xmm11 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + movsd -2 * SIZE(AO), %xmm13 + mulsd %xmm9, %xmm13 + subsd %xmm13, %xmm11 + movsd -3 * SIZE(AO), %xmm14 + mulsd %xmm9, %xmm14 + subsd %xmm14, %xmm8 + movsd -4 * SIZE(AO), %xmm15 + mulsd %xmm9, %xmm15 + subsd %xmm15, %xmm10 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -7 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm8 + movsd -8 * SIZE(AO), %xmm14 + mulsd %xmm11, %xmm14 + subsd %xmm14, %xmm10 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -12 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + movsd -14 * SIZE(AO), %xmm14 + mulsd %xmm10, %xmm14 + subsd %xmm14, %xmm11 + movsd -13 * SIZE(AO), %xmm15 + mulsd %xmm10, %xmm15 + subsd %xmm15, %xmm9 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -10 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm11 + movsd -9 * SIZE(AO), %xmm14 + mulsd %xmm8, %xmm14 + subsd %xmm14, %xmm9 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -5 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm10, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + + movddup %xmm10, %xmm8 + SHUFPD_3 %xmm10, %xmm10 + movddup %xmm11, %xmm9 + SHUFPD_3 %xmm11, %xmm11 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm10, -14 * SIZE(BO) + movapd %xmm9, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) +#else + movapd %xmm10, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + ALIGN_4 + +.L101: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L105 + ALIGN_4 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + movapd -12 * SIZE(BO), %xmm2 + movapd -10 * SIZE(BO), %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L109 + ALIGN_4 + +.L106: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm8 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L106 + ALIGN_4 + +.L109: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm10 + subpd %xmm8, %xmm10 +#else + movapd -16 * SIZE(AO), %xmm10 + subpd %xmm8, %xmm10 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -14 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm10, -16 * SIZE(B) + + movddup %xmm10, %xmm8 + SHUFPD_3 %xmm10, %xmm10 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm10, -14 * SIZE(BO) +#else + movapd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + testq $1, M + je .L119 + ALIGN_4 + +.L111: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L115 + ALIGN_4 + +.L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -16 * SIZE(AO), %xmm0 + movsd -15 * SIZE(AO), %xmm1 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm1, %xmm3 + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + + movsd -14 * SIZE(AO), %xmm0 + movsd -13 * SIZE(AO), %xmm1 + movsd -12 * SIZE(BO), %xmm2 + movsd -10 * SIZE(BO), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm1, %xmm3 + addsd %xmm2, %xmm10 + addsd %xmm3, %xmm11 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L116 + ALIGN_4 + +.L118: + addsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + addsd %xmm9, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm10 + subsd %xmm8, %xmm10 +#else + movsd -16 * SIZE(AO), %xmm10 + subsd %xmm8, %xmm10 +#endif + +#ifdef LN + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#ifdef RN + movsd -16 * SIZE(B), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef RT + movsd -16 * SIZE(B), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, -16 * SIZE(B) + + movlpd %xmm10, -16 * SIZE(BO) + movlpd %xmm10, -15 * SIZE(BO) +#else + movsd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $1 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L119: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L40: + testq $2, N + je .L80 + ALIGN_4 + +.L41: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L43 + ALIGN_4 + +.L42: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + movapd %xmm2, 4 * SIZE(BO) + movapd %xmm3, 6 * SIZE(BO) + movapd %xmm4, 8 * SIZE(BO) + movapd %xmm5, 10 * SIZE(BO) + movapd %xmm6, 12 * SIZE(BO) + movapd %xmm7, 14 * SIZE(BO) + + addq $8 * SIZE, B + addq $16 * SIZE, BO + subq $1, %rax + jne .L42 + ALIGN_4 + +.L43: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L44: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + + addq $2 * SIZE, B + addq $4 * SIZE, BO + decq %rax + jne .L44 + ALIGN_4 + +.L50: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 2), C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + +#ifdef LN + prefetcht2 -3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht2 -3 * SIZE(CO2) + pxor %xmm13, %xmm13 +#else + prefetcht2 3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht2 3 * SIZE(CO2) + pxor %xmm13, %xmm13 +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + movapd -8 * SIZE(AO), %xmm0 + movapd -6 * SIZE(AO), %xmm1 + + movapd -8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + movapd -4 * SIZE(AO), %xmm0 + movapd -2 * SIZE(AO), %xmm1 + + movapd -4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L59 + ALIGN_4 + +.L56: + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L56 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm13 + movapd -12 * SIZE(B), %xmm1 + movapd -10 * SIZE(B), %xmm5 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 + subpd %xmm12, %xmm1 + subpd %xmm4, %xmm5 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm13 + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm9 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm9 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm9, %xmm12 + subpd %xmm12, %xmm1 + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm9, %xmm14 + subpd %xmm14, %xmm5 + + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm13, %xmm12 + subpd %xmm12, %xmm5 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(B), %xmm9 + movapd %xmm9, %xmm10 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm3 + + movddup -13 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 +#endif + +#ifdef RT + movddup -13 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -14 * SIZE(B), %xmm9 + movapd %xmm9, %xmm10 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm1 + + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movsd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm13, -14 * SIZE(B) + movapd %xmm1, -12 * SIZE(B) + movapd %xmm5, -10 * SIZE(B) + + movddup %xmm9, %xmm8 + SHUFPD_3 %xmm9, %xmm9 + movddup %xmm13, %xmm12 + SHUFPD_3 %xmm13, %xmm13 + movddup %xmm1, %xmm0 + SHUFPD_3 %xmm1, %xmm1 + movddup %xmm5, %xmm4 + SHUFPD_3 %xmm5, %xmm5 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm12, -12 * SIZE(BO) + movapd %xmm13, -10 * SIZE(BO) + movapd %xmm0, -8 * SIZE(BO) + movapd %xmm1, -6 * SIZE(BO) + movapd %xmm4, -4 * SIZE(BO) + movapd %xmm5, -2 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + ALIGN_4 + +.L61: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + movapd -8 * SIZE(BO), %xmm2 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm4 + movapd -2 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L69 + ALIGN_4 + +.L66: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L66 + ALIGN_4 + +.L69: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 +#endif + + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(B), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + + movddup -13 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 +#endif + +#ifdef RT + movddup -13 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + + movddup -14 * SIZE(B), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm13, -14 * SIZE(B) + + movddup %xmm9, %xmm8 + SHUFPD_3 %xmm9, %xmm9 + movddup %xmm13, %xmm12 + SHUFPD_3 %xmm13, %xmm13 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm12, -12 * SIZE(BO) + movapd %xmm13, -10 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm2, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + testq $1, M + je .L79 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -16 * SIZE(AO), %xmm0 + movsd -15 * SIZE(AO), %xmm1 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + movsd -12 * SIZE(BO), %xmm4 + movsd -10 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm1, %xmm4 + mulsd %xmm1, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + movsd -14 * SIZE(AO), %xmm0 + movsd -13 * SIZE(AO), %xmm1 + movsd -8 * SIZE(BO), %xmm2 + movsd -6 * SIZE(BO), %xmm3 + movsd -4 * SIZE(BO), %xmm4 + movsd -2 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm1, %xmm4 + mulsd %xmm1, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L76 + ALIGN_4 + +.L78: + addsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm12 + movsd -15 * SIZE(B), %xmm13 +#else + movsd -16 * SIZE(AO), %xmm12 + movsd -15 * SIZE(AO), %xmm13 +#endif + + subsd %xmm8, %xmm12 + subsd %xmm9, %xmm13 + +#ifdef LN + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 +#endif + +#ifdef RN + mulsd -16 * SIZE(B), %xmm12 + movsd -15 * SIZE(B), %xmm9 + mulsd %xmm12, %xmm9 + subsd %xmm9, %xmm13 + + mulsd -13 * SIZE(B), %xmm13 +#endif + +#ifdef RT + mulsd -13 * SIZE(B), %xmm13 + + movlpd -14 * SIZE(B), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm12 + + mulsd -16 * SIZE(B), %xmm12 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm12, 0 * SIZE(CO1) + movsd %xmm13, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm12, -16 * SIZE(B) + movsd %xmm13, -15 * SIZE(B) + + movsd %xmm12, -16 * SIZE(BO) + movsd %xmm12, -15 * SIZE(BO) + movsd %xmm13, -14 * SIZE(BO) + movsd %xmm13, -13 * SIZE(BO) +#else + movsd %xmm12, -16 * SIZE(AO) + movsd %xmm13, -15 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L80: + movq N, J + sarq $2, J # j = (n >> 2) + jle .L999 + +.L01: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq 16 * SIZE + BUFFER, BO + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + movapd -16 * SIZE(B), %xmm0 + movapd -14 * SIZE(B), %xmm1 + movapd -12 * SIZE(B), %xmm2 + movapd -10 * SIZE(B), %xmm3 + movapd -8 * SIZE(B), %xmm4 + movapd -6 * SIZE(B), %xmm5 + movapd -4 * SIZE(B), %xmm6 + movapd -2 * SIZE(B), %xmm7 + + prefetcht0 (PREFETCH_R + 8) * SIZE(B) + movddup %xmm0, %xmm8 + unpckhpd %xmm0, %xmm0 + movddup %xmm1, %xmm9 + unpckhpd %xmm1, %xmm1 + movddup %xmm2, %xmm10 + unpckhpd %xmm2, %xmm2 + movddup %xmm3, %xmm11 + unpckhpd %xmm3, %xmm3 + movddup %xmm4, %xmm12 + unpckhpd %xmm4, %xmm4 + movddup %xmm5, %xmm13 + unpckhpd %xmm5, %xmm5 + movddup %xmm6, %xmm14 + unpckhpd %xmm6, %xmm6 + movddup %xmm7, %xmm15 + unpckhpd %xmm7, %xmm7 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm0, -14 * SIZE(BO) + movapd %xmm9, -12 * SIZE(BO) + movapd %xmm1, -10 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 8) * SIZE(BO) + movapd %xmm10, -8 * SIZE(BO) + movapd %xmm2, -6 * SIZE(BO) + movapd %xmm11, -4 * SIZE(BO) + movapd %xmm3, -2 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 16) * SIZE(BO) + movapd %xmm12, 0 * SIZE(BO) + movapd %xmm4, 2 * SIZE(BO) + movapd %xmm13, 4 * SIZE(BO) + movapd %xmm5, 6 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 24) * SIZE(BO) + movapd %xmm14, 8 * SIZE(BO) + movapd %xmm6, 10 * SIZE(BO) + movapd %xmm15, 12 * SIZE(BO) + movapd %xmm7, 14 * SIZE(BO) + + subq $-16 * SIZE, B + subq $-32 * SIZE, BO + subq $1, %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movapd -16 * SIZE(B), %xmm0 + movapd -14 * SIZE(B), %xmm1 + + movddup %xmm0, %xmm8 + unpckhpd %xmm0, %xmm0 + movddup %xmm1, %xmm9 + unpckhpd %xmm1, %xmm1 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm0, -14 * SIZE(BO) + movapd %xmm9, -12 * SIZE(BO) + movapd %xmm1, -10 * SIZE(BO) + + addq $4 * SIZE, B + addq $8 * SIZE, BO + subq $1, %rax + jne .L04 + ALIGN_4 + +.L10: + leaq (PREFETCH_R + 0) * SIZE(B), BB + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 4), C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + prefetcht2 0 * SIZE(BB) + +#ifdef LN + prefetcht2 -3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht2 -3 * SIZE(CO2) + pxor %xmm13, %xmm13 + prefetcht2 -3 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + prefetcht2 -3 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 +#else + prefetcht2 3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht2 3 * SIZE(CO2) + pxor %xmm13, %xmm13 + prefetcht2 3 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + prefetcht2 3 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 +#endif + + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + + subq $-8 * SIZE, BB + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm2, %xmm10 + movapd -16 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm14 + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + movapd -14 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + movapd -10 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm14 + movapd -8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + addpd %xmm5, %xmm15 + movapd -6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + movapd -4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + movapd -2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + movapd -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm14 + movapd 0 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -6 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + addpd %xmm5, %xmm15 + movapd 2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + movapd 4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + movapd 6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm14 + movapd 8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -2 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + addpd %xmm5, %xmm15 + movapd 10 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + addq $32 * SIZE, BO + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + movapd -20 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + subq $-16 * SIZE, AO + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + movapd -18 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + subq $1, %rax + mulpd %xmm1, %xmm5 + + BRANCH + jg .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L19 + ALIGN_4 + +.L16: + movapd -16 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm10 + movapd -16 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm14 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + movapd -14 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + movapd -12 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + movapd -10 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L19: + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm14 + addpd %xmm4, %xmm11 + addpd %xmm5, %xmm15 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd %xmm14, %xmm6 + unpcklpd %xmm15, %xmm14 + unpckhpd %xmm15, %xmm6 + + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm11 + movapd -12 * SIZE(B), %xmm13 + movapd -10 * SIZE(B), %xmm15 + movapd -8 * SIZE(B), %xmm1 + movapd -6 * SIZE(B), %xmm3 + movapd -4 * SIZE(B), %xmm5 + movapd -2 * SIZE(B), %xmm7 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 + subpd %xmm12, %xmm1 + subpd %xmm14, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + movapd -8 * SIZE(AO), %xmm4 + movapd -6 * SIZE(AO), %xmm5 + movapd -4 * SIZE(AO), %xmm6 + movapd -2 * SIZE(AO), %xmm7 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 + subpd %xmm10, %xmm4 + subpd %xmm14, %xmm5 + subpd %xmm11, %xmm6 + subpd %xmm15, %xmm7 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + + movddup -3 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm15 + + movddup -4 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm11 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -7 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm15 + + movddup -8 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm11 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -12 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm3 + + movddup -13 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm7 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -10 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm3 + + movddup -9 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -5 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm7 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 +#endif + + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm3 + + movddup -14 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm5 + + movddup -13 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm7 + + movddup -11 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -10 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm5 + + movddup -9 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -5 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm7 + + movddup -1 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RT + movddup -1 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm5 + + movddup -3 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + + movddup -4 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm1 + + movddup -6 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -7 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm3 + + movddup -8 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm1 + + movddup -11 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -12 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm1 + + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movsd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) + + movsd %xmm11, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 1 * SIZE(CO1, LDC, 2) + movsd %xmm3, 2 * SIZE(CO1, LDC, 2) + movsd %xmm7, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) + + movsd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + movsd %xmm5, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) + movsd %xmm7, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + movapd %xmm13, -12 * SIZE(B) + movapd %xmm15, -10 * SIZE(B) + movapd %xmm1, -8 * SIZE(B) + movapd %xmm3, -6 * SIZE(B) + movapd %xmm5, -4 * SIZE(B) + movapd %xmm7, -2 * SIZE(B) + + movddup %xmm9, %xmm8 + SHUFPD_3 %xmm9, %xmm9 + movddup %xmm11, %xmm10 + SHUFPD_3 %xmm11, %xmm11 + movddup %xmm13, %xmm12 + SHUFPD_3 %xmm13, %xmm13 + movddup %xmm15, %xmm14 + SHUFPD_3 %xmm15, %xmm15 + movddup %xmm1, %xmm0 + SHUFPD_3 %xmm1, %xmm1 + movddup %xmm3, %xmm2 + SHUFPD_3 %xmm3, %xmm3 + movddup %xmm5, %xmm4 + SHUFPD_3 %xmm5, %xmm5 + movddup %xmm7, %xmm6 + SHUFPD_3 %xmm7, %xmm7 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + movapd %xmm12, -8 * SIZE(BO) + movapd %xmm13, -6 * SIZE(BO) + movapd %xmm14, -4 * SIZE(BO) + movapd %xmm15, -2 * SIZE(BO) + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + movapd %xmm2, 4 * SIZE(BO) + movapd %xmm3, 6 * SIZE(BO) + movapd %xmm4, 8 * SIZE(BO) + movapd %xmm5, 10 * SIZE(BO) + movapd %xmm6, 12 * SIZE(BO) + movapd %xmm7, 14 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) + movapd %xmm4, -8 * SIZE(AO) + movapd %xmm5, -6 * SIZE(AO) + movapd %xmm6, -4 * SIZE(AO) + movapd %xmm7, -2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $3, M + je .L39 + + testq $2, M + je .L30 + ALIGN_4 + +.L21: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + movapd -14 * SIZE(AO), %xmm0 + movapd -8 * SIZE(BO), %xmm2 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm4 + movapd -2 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + movapd -12 * SIZE(AO), %xmm0 + movapd 0 * SIZE(BO), %xmm2 + movapd 2 * SIZE(BO), %xmm3 + movapd 4 * SIZE(BO), %xmm4 + movapd 6 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + movapd -10 * SIZE(AO), %xmm0 + movapd 8 * SIZE(BO), %xmm2 + movapd 10 * SIZE(BO), %xmm3 + movapd 12 * SIZE(BO), %xmm4 + movapd 14 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L29 + ALIGN_4 + +.L26: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jne .L26 + ALIGN_4 + +.L29: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm11 + movapd -12 * SIZE(B), %xmm13 + movapd -10 * SIZE(B), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + movapd -12 * SIZE(AO), %xmm4 + movapd -10 * SIZE(AO), %xmm6 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 + subpd %xmm10, %xmm4 + subpd %xmm11, %xmm6 +#endif + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm15 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(B), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -14 * SIZE(B), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + movddup -13 * SIZE(B), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm6 + + movddup -11 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + movddup -10 * SIZE(B), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm4 + movddup -9 * SIZE(B), %xmm10 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + + movddup -6 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm4 + + movddup -5 * SIZE(B), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm6 + + movddup -1 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm6 +#endif + +#ifdef RT + movddup -1 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm6 + + movddup -2 * SIZE(B), %xmm9 + mulpd %xmm6, %xmm9 + subpd %xmm9, %xmm4 + movddup -3 * SIZE(B), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + movddup -4 * SIZE(B), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm0 + + movddup -6 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm4 + movddup -7 * SIZE(B), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm2 + movddup -8 * SIZE(B), %xmm10 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + + movddup -11 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + movddup -12 * SIZE(B), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + + movsd %xmm11, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 1 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + + movsd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + movapd %xmm13, -12 * SIZE(B) + movapd %xmm15, -10 * SIZE(B) + + movddup %xmm9, %xmm8 + SHUFPD_3 %xmm9, %xmm9 + movddup %xmm11, %xmm10 + SHUFPD_3 %xmm11, %xmm11 + movddup %xmm13, %xmm12 + SHUFPD_3 %xmm13, %xmm13 + movddup %xmm15, %xmm14 + SHUFPD_3 %xmm15, %xmm15 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + movapd %xmm12, -8 * SIZE(BO) + movapd %xmm13, -6 * SIZE(BO) + movapd %xmm14, -4 * SIZE(BO) + movapd %xmm15, -2 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm2, -14 * SIZE(AO) + movapd %xmm4, -12 * SIZE(AO) + movapd %xmm6, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + je .L39 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + movsd -12 * SIZE(BO), %xmm4 + movsd -10 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + movsd -15 * SIZE(AO), %xmm0 + movsd -8 * SIZE(BO), %xmm2 + movsd -6 * SIZE(BO), %xmm3 + movsd -4 * SIZE(BO), %xmm4 + movsd -2 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + movsd -14 * SIZE(AO), %xmm0 + movsd 0 * SIZE(BO), %xmm2 + movsd 2 * SIZE(BO), %xmm3 + movsd 4 * SIZE(BO), %xmm4 + movsd 6 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + movsd -13 * SIZE(AO), %xmm0 + movsd 8 * SIZE(BO), %xmm2 + movsd 10 * SIZE(BO), %xmm3 + movsd 12 * SIZE(BO), %xmm4 + movsd 14 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + subq $ -4 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + movsd -12 * SIZE(BO), %xmm4 + movsd -10 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm12 + movsd -15 * SIZE(B), %xmm13 + movsd -14 * SIZE(B), %xmm14 + movsd -13 * SIZE(B), %xmm15 +#else + movsd -16 * SIZE(AO), %xmm12 + movsd -15 * SIZE(AO), %xmm13 + movsd -14 * SIZE(AO), %xmm14 + movsd -13 * SIZE(AO), %xmm15 +#endif + + subsd %xmm8, %xmm12 + subsd %xmm9, %xmm13 + subsd %xmm10, %xmm14 + subsd %xmm11, %xmm15 + +#ifdef LN + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 + mulsd %xmm8, %xmm14 + mulsd %xmm8, %xmm15 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 + mulsd %xmm8, %xmm14 + mulsd %xmm8, %xmm15 +#endif + +#ifdef RN + mulsd -16 * SIZE(B), %xmm12 + movlpd -15 * SIZE(B), %xmm9 + mulsd %xmm12, %xmm9 + subsd %xmm9, %xmm13 + movlpd -14 * SIZE(B), %xmm10 + mulsd %xmm12, %xmm10 + subsd %xmm10, %xmm14 + movlpd -13 * SIZE(B), %xmm11 + mulsd %xmm12, %xmm11 + subsd %xmm11, %xmm15 + + mulsd -11 * SIZE(B), %xmm13 + movlpd -10 * SIZE(B), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm14 + movlpd -9 * SIZE(B), %xmm10 + mulsd %xmm13, %xmm10 + subsd %xmm10, %xmm15 + + mulsd -6 * SIZE(B), %xmm14 + movlpd -5 * SIZE(B), %xmm9 + mulsd %xmm14, %xmm9 + subsd %xmm9, %xmm15 + + mulsd -1 * SIZE(B), %xmm15 +#endif + +#ifdef RT + mulsd -1 * SIZE(B), %xmm15 + + movlpd -2 * SIZE(B), %xmm9 + mulsd %xmm15, %xmm9 + subsd %xmm9, %xmm14 + movlpd -3 * SIZE(B), %xmm10 + mulsd %xmm15, %xmm10 + subsd %xmm10, %xmm13 + movlpd -4 * SIZE(B), %xmm11 + mulsd %xmm15, %xmm11 + subsd %xmm11, %xmm12 + + mulsd -6 * SIZE(B), %xmm14 + + movlpd -7 * SIZE(B), %xmm9 + mulsd %xmm14, %xmm9 + subsd %xmm9, %xmm13 + movlpd -8 * SIZE(B), %xmm10 + mulsd %xmm14, %xmm10 + subsd %xmm10, %xmm12 + + mulsd -11 * SIZE(B), %xmm13 + + movlpd -12 * SIZE(B), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm12 + + mulsd -16 * SIZE(B), %xmm12 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm12, 0 * SIZE(CO1) + movsd %xmm13, 0 * SIZE(CO2) + movsd %xmm14, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 0 * SIZE(CO2, LDC, 2) + +#if defined(LN) || defined(LT) + movsd %xmm12, -16 * SIZE(B) + movsd %xmm13, -15 * SIZE(B) + movsd %xmm14, -14 * SIZE(B) + movsd %xmm15, -13 * SIZE(B) + + movsd %xmm12, -16 * SIZE(BO) + movsd %xmm12, -15 * SIZE(BO) + movsd %xmm13, -14 * SIZE(BO) + movsd %xmm13, -13 * SIZE(BO) + movsd %xmm14, -12 * SIZE(BO) + movsd %xmm14, -11 * SIZE(BO) + movsd %xmm15, -10 * SIZE(BO) + movsd %xmm15, -9 * SIZE(BO) +#else + movsd %xmm12, -16 * SIZE(AO) + movsd %xmm13, -15 * SIZE(AO) + movsd %xmm14, -14 * SIZE(AO) + movsd %xmm15, -13 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq J # j -- + jg .L01 + ALIGN_4 + +.L999: + movq %r15, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_RT_4x4_penryn.S b/kernel/x86_64/trsm_kernel_RT_4x4_penryn.S new file mode 100644 index 0000000..a575d4c --- /dev/null +++ b/kernel/x86_64/trsm_kernel_RT_4x4_penryn.S @@ -0,0 +1,3426 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCHSIZE (8 * 21 + 6) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + leaq (, LDC, SIZE), LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, N + BRANCH + jle .L40 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L100 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movsd -16 * SIZE(BO), %xmm2 + +#ifdef LN + prefetcht0 -4 * SIZE(CO1) +#else + prefetcht0 3 * SIZE(CO1) +#endif + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L95 + ALIGN_4 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -14 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -13 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -12 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps 2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + subq $-16 * SIZE, AO + subq $ -4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L96 + ALIGN_4 + +.L98: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm10 + movapd -14 * SIZE(BO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm12, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm10 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm12, %xmm11 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + movsd -2 * SIZE(AO), %xmm13 + mulsd %xmm9, %xmm13 + subsd %xmm13, %xmm11 + movsd -3 * SIZE(AO), %xmm14 + mulsd %xmm9, %xmm14 + subsd %xmm14, %xmm8 + movsd -4 * SIZE(AO), %xmm15 + mulsd %xmm9, %xmm15 + subsd %xmm15, %xmm10 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -7 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm8 + movsd -8 * SIZE(AO), %xmm14 + mulsd %xmm11, %xmm14 + subsd %xmm14, %xmm10 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -12 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + movsd -14 * SIZE(AO), %xmm14 + mulsd %xmm10, %xmm14 + subsd %xmm14, %xmm11 + movsd -13 * SIZE(AO), %xmm15 + mulsd %xmm10, %xmm15 + subsd %xmm15, %xmm9 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -10 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm11 + movsd -9 * SIZE(AO), %xmm14 + mulsd %xmm8, %xmm14 + subsd %xmm14, %xmm9 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -5 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm10, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) +#else + movapd %xmm10, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + BRANCH + jle .L110 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movsd -16 * SIZE(BO), %xmm2 + pxor %xmm9, %xmm9 + movhps -15 * SIZE(BO), %xmm2 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L105 + ALIGN_4 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -14 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + pshufd $0x44, %xmm2, %xmm3 + movsd -14 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm9 + + pshufd $0x44, %xmm2, %xmm3 + movsd -13 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -10 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + pshufd $0x44, %xmm2, %xmm3 + movsd -12 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -8 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm9 + + subq $-8 * SIZE, AO + subq $-4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + pshufd $0x44, %xmm2, %xmm3 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -14 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L106 + ALIGN_4 + +.L108: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + addpd %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm10 + subpd %xmm8, %xmm10 +#else + movapd -16 * SIZE(AO), %xmm10 + subpd %xmm8, %xmm10 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -14 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm10, -16 * SIZE(BO) +#else + movapd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + testq $1, M + BRANCH + jle .L119 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L115 + ALIGN_4 + +.L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -15 * SIZE(AO), %xmm0 + movsd -15 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -14 * SIZE(AO), %xmm0 + movsd -14 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -13 * SIZE(AO), %xmm0 + movsd -13 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -12 * SIZE(AO), %xmm0 + movsd -12 * SIZE(BO), %xmm2 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -15 * SIZE(AO), %xmm0 + movsd -15 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L116 + ALIGN_4 + +.L118: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + addpd %xmm9, %xmm8 + + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BO), %xmm10 + subsd %xmm8, %xmm10 +#else + movsd -16 * SIZE(AO), %xmm10 + subsd %xmm8, %xmm10 +#endif + +#ifdef LN + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#ifdef RN + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef RT + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, -16 * SIZE(BO) +#else + movsd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L119: +#ifdef LN + leaq (B, K, SIZE), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L40: + testq $2, N + BRANCH + jle .L80 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + movq B, BB + subq %rax, BB + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + prefetcht2 -16 * SIZE(BB) + subq $-4 * SIZE, BB + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -16 * SIZE(BO), %xmm2 + +#ifdef LN + prefetcht0 -4 * SIZE(CO1) + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + prefetcht0 -4 * SIZE(CO2) + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 +#else + prefetcht0 3 * SIZE(CO1) + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + prefetcht0 3 * SIZE(CO2) + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_4 + +.L52: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -14 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -10 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -8 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -14 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + movapd %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + movapd %xmm12, %xmm0 + movsd %xmm13, %xmm12 + movsd %xmm0, %xmm13 + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + movapd -12 * SIZE(BO), %xmm1 + movapd -10 * SIZE(BO), %xmm5 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 + subpd %xmm12, %xmm1 + subpd %xmm4, %xmm5 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm13 + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm9 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm9 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm9, %xmm12 + subpd %xmm12, %xmm1 + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm9, %xmm14 + subpd %xmm14, %xmm5 + + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm13, %xmm12 + subpd %xmm12, %xmm5 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(BO), %xmm9 + movapd %xmm9, %xmm10 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm3 + + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -14 * SIZE(BO), %xmm9 + movapd %xmm9, %xmm10 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm1 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movsd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm13, -14 * SIZE(BO) + movapd %xmm1, -12 * SIZE(BO) + movapd %xmm5, -10 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + BRANCH + jle .L70 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + movaps -16 * SIZE(BO), %xmm2 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm11 + addpd %xmm7, %xmm10 + movaps -12 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -10 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm11 + addpd %xmm7, %xmm10 + movaps -8 * SIZE(BO), %xmm2 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + movapd %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 +#endif + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + + movddup -14 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm13, -14 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm2, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + testq $1, M + BRANCH + jle .L79 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movaps -16 * SIZE(BO), %xmm2 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -15 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -14 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -13 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -10 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -12 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm9 + movaps -8 * SIZE(BO), %xmm2 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -15 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_4 + +.L78: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm9, %xmm8 + movhlps %xmm8, %xmm9 + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BO), %xmm12 + movsd -15 * SIZE(BO), %xmm13 +#else + movsd -16 * SIZE(AO), %xmm12 + movsd -15 * SIZE(AO), %xmm13 +#endif + + subsd %xmm8, %xmm12 + subsd %xmm9, %xmm13 + +#ifdef LN + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 +#endif + +#ifdef RN + mulsd -16 * SIZE(BO), %xmm12 + movsd -15 * SIZE(BO), %xmm9 + mulsd %xmm12, %xmm9 + subsd %xmm9, %xmm13 + + mulsd -13 * SIZE(BO), %xmm13 +#endif + +#ifdef RT + mulsd -13 * SIZE(BO), %xmm13 + + movlpd -14 * SIZE(BO), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm12 + + mulsd -16 * SIZE(BO), %xmm12 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm12, 0 * SIZE(CO1) + movsd %xmm13, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm12, -16 * SIZE(BO) + movsd %xmm13, -15 * SIZE(BO) +#else + movsd %xmm12, -16 * SIZE(AO) + movsd %xmm13, -15 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L80: + movq N, J + sarq $2, J + NOBRANCH + jle .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 2, %rax + movq B, BB + subq %rax, BB + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + prefetcht2 -16 * SIZE(BB) + subq $-8 * SIZE, BB + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm3, %xmm3 + movaps -14 * SIZE(AO), %xmm1 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BO), %xmm2 + + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + +#ifdef LN + prefetcht0 -4 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + prefetcht0 -4 * SIZE(CO2) + movapd %xmm4, %xmm10 + movapd %xmm4, %xmm11 + + prefetcht0 -4 * SIZE(CO1, LDC, 2) + movapd %xmm4, %xmm12 + movapd %xmm4, %xmm13 + prefetcht0 -4 * SIZE(CO2, LDC, 2) + movapd %xmm4, %xmm14 + movapd %xmm4, %xmm15 +#else + prefetcht0 3 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + prefetcht0 3 * SIZE(CO2) + movapd %xmm4, %xmm10 + movapd %xmm4, %xmm11 + + prefetcht0 3 * SIZE(CO1, LDC, 2) + movapd %xmm4, %xmm12 + movapd %xmm4, %xmm13 + prefetcht0 3 * SIZE(CO2, LDC, 2) + movapd %xmm4, %xmm14 + movapd %xmm4, %xmm15 +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + addpd %xmm3, %xmm11 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -8 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps -6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -4 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps -2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 0 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps 2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 4 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 6 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps 6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 8 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps 10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 12 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 14 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps 14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 16 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + subq $-32 * SIZE, AO + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -14 * SIZE(AO), %xmm1 + + subq $-32 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addpd %xmm3, %xmm11 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $4, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm3, %xmm11 + addpd %xmm4, %xmm15 + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + + movapd %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + movapd %xmm10, %xmm0 + movsd %xmm11, %xmm10 + movsd %xmm0, %xmm11 + + movapd %xmm12, %xmm0 + movsd %xmm13, %xmm12 + movsd %xmm0, %xmm13 + + movapd %xmm14, %xmm0 + movsd %xmm15, %xmm14 + movsd %xmm0, %xmm15 + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd %xmm14, %xmm6 + unpcklpd %xmm15, %xmm14 + unpckhpd %xmm15, %xmm6 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + movapd -8 * SIZE(BO), %xmm1 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm5 + movapd -2 * SIZE(BO), %xmm7 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 + subpd %xmm12, %xmm1 + subpd %xmm14, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + movapd -8 * SIZE(AO), %xmm4 + movapd -6 * SIZE(AO), %xmm5 + movapd -4 * SIZE(AO), %xmm6 + movapd -2 * SIZE(AO), %xmm7 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 + subpd %xmm10, %xmm4 + subpd %xmm14, %xmm5 + subpd %xmm11, %xmm6 + subpd %xmm15, %xmm7 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + + movddup -3 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm15 + + movddup -4 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm11 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -7 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm15 + + movddup -8 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm11 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -12 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm3 + + movddup -13 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm7 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -10 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm3 + + movddup -9 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -5 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm7 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 +#endif + + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm3 + + movddup -14 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm5 + + movddup -13 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm7 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -10 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm5 + + movddup -9 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -5 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm7 + + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm5 + + movddup -3 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + + movddup -4 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm1 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -7 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm3 + + movddup -8 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm1 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -12 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm1 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movsd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) + + movsd %xmm11, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 1 * SIZE(CO1, LDC, 2) + movsd %xmm3, 2 * SIZE(CO1, LDC, 2) + movsd %xmm7, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) + + movsd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + movsd %xmm5, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) + movsd %xmm7, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) + movapd %xmm13, -12 * SIZE(BO) + movapd %xmm15, -10 * SIZE(BO) + movapd %xmm1, -8 * SIZE(BO) + movapd %xmm3, -6 * SIZE(BO) + movapd %xmm5, -4 * SIZE(BO) + movapd %xmm7, -2 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) + movapd %xmm4, -8 * SIZE(AO) + movapd %xmm5, -6 * SIZE(AO) + movapd %xmm6, -4 * SIZE(AO) + movapd %xmm7, -2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + jle .L30 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm3, %xmm3 + movaps -16 * SIZE(BO), %xmm2 + pxor %xmm5, %xmm5 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_4 + +.L22: + addpd %xmm3, %xmm11 + movaps -14 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -10 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -8 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -6 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -4 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -2 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + subq $ -8 * SIZE, AO + + addpd %xmm2, %xmm9 + movaps 0 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + addpd %xmm3, %xmm11 + movaps -14 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm3, %xmm11 + addpd %xmm5, %xmm10 + + movapd %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + movapd %xmm10, %xmm0 + movsd %xmm11, %xmm10 + movsd %xmm0, %xmm11 + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + movapd -12 * SIZE(AO), %xmm4 + movapd -10 * SIZE(AO), %xmm6 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 + subpd %xmm10, %xmm4 + subpd %xmm11, %xmm6 +#endif + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm15 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm6 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + movddup -10 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm4 + movddup -9 * SIZE(BO), %xmm10 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + + movddup -5 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm6 + + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 +#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + + movddup -2 * SIZE(BO), %xmm9 + mulpd %xmm6, %xmm9 + subpd %xmm9, %xmm4 + movddup -3 * SIZE(BO), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + movddup -4 * SIZE(BO), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm0 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm2 + movddup -8 * SIZE(BO), %xmm10 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + movddup -12 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + + movsd %xmm11, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 1 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + + movsd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) + movapd %xmm13, -12 * SIZE(BO) + movapd %xmm15, -10 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm2, -14 * SIZE(AO) + movapd %xmm4, -12 * SIZE(AO) + movapd %xmm6, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + BRANCH + jle .L39 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movaps -16 * SIZE(BO), %xmm2 + movaps -14 * SIZE(BO), %xmm3 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -15 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -10 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm10 + movaps -8 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm11 + movaps -6 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -13 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -4 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -2 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -12 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm10 + movaps 0 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm11 + movaps 2 * SIZE(BO), %xmm3 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -15 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -10 * SIZE(BO), %xmm3 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(LT) + movaps -16 * SIZE(BO), %xmm12 + movaps -14 * SIZE(BO), %xmm13 +#else + movaps -16 * SIZE(AO), %xmm12 + movaps -14 * SIZE(AO), %xmm13 +#endif + + subpd %xmm8, %xmm12 + subpd %xmm9, %xmm13 + +#if defined(RN) || defined(RT) + movhlps %xmm13, %xmm15 + movsd %xmm13, %xmm14 + movhlps %xmm12, %xmm13 + movsd %xmm12, %xmm12 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm12 + mulpd %xmm8, %xmm13 +#endif + +#ifdef RN + mulsd -16 * SIZE(BO), %xmm12 + movlpd -15 * SIZE(BO), %xmm9 + mulsd %xmm12, %xmm9 + subsd %xmm9, %xmm13 + movlpd -14 * SIZE(BO), %xmm10 + mulsd %xmm12, %xmm10 + subsd %xmm10, %xmm14 + movlpd -13 * SIZE(BO), %xmm11 + mulsd %xmm12, %xmm11 + subsd %xmm11, %xmm15 + + mulsd -11 * SIZE(BO), %xmm13 + movlpd -10 * SIZE(BO), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm14 + movlpd -9 * SIZE(BO), %xmm10 + mulsd %xmm13, %xmm10 + subsd %xmm10, %xmm15 + + mulsd -6 * SIZE(BO), %xmm14 + movlpd -5 * SIZE(BO), %xmm9 + mulsd %xmm14, %xmm9 + subsd %xmm9, %xmm15 + + mulsd -1 * SIZE(BO), %xmm15 +#endif + +#ifdef RT + mulsd -1 * SIZE(BO), %xmm15 + + movlpd -2 * SIZE(BO), %xmm9 + mulsd %xmm15, %xmm9 + subsd %xmm9, %xmm14 + movlpd -3 * SIZE(BO), %xmm10 + mulsd %xmm15, %xmm10 + subsd %xmm10, %xmm13 + movlpd -4 * SIZE(BO), %xmm11 + mulsd %xmm15, %xmm11 + subsd %xmm11, %xmm12 + + mulsd -6 * SIZE(BO), %xmm14 + + movlpd -7 * SIZE(BO), %xmm9 + mulsd %xmm14, %xmm9 + subsd %xmm9, %xmm13 + movlpd -8 * SIZE(BO), %xmm10 + mulsd %xmm14, %xmm10 + subsd %xmm10, %xmm12 + + mulsd -11 * SIZE(BO), %xmm13 + + movlpd -12 * SIZE(BO), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm12 + + mulsd -16 * SIZE(BO), %xmm12 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm12, 0 * SIZE(CO1) + movhps %xmm12, 0 * SIZE(CO2) + movsd %xmm13, 0 * SIZE(CO1, LDC, 2) + movhps %xmm13, 0 * SIZE(CO2, LDC, 2) + + movaps %xmm12, -16 * SIZE(BO) + movaps %xmm13, -14 * SIZE(BO) +#else + movsd %xmm12, 0 * SIZE(CO1) + movsd %xmm13, 0 * SIZE(CO2) + movsd %xmm14, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 0 * SIZE(CO2, LDC, 2) + + movsd %xmm12, -16 * SIZE(AO) + movsd %xmm13, -15 * SIZE(AO) + movsd %xmm14, -14 * SIZE(AO) + movsd %xmm15, -13 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_RT_4x4_sse2.S b/kernel/x86_64/trsm_kernel_RT_4x4_sse2.S new file mode 100644 index 0000000..07c978e --- /dev/null +++ b/kernel/x86_64/trsm_kernel_RT_4x4_sse2.S @@ -0,0 +1,4134 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define OFFSET 16(%rsp) +#define KK 24(%rsp) +#define KKK 32(%rsp) +#define AORIG 40(%rsp) +#define BORIG 48(%rsp) +#define BUFFER 128(%rsp) + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHNTA prefetchnta +#ifndef ALLOC_HUGETLB +#define PREFETCHSIZE (8 * 4 + 4) +#else +#define PREFETCHSIZE (8 * 2 + 4) +#endif +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 4 + 4) +#endif + +#ifdef OPTERON +#define movsd movlpd +#endif + +#define KERNEL1(xx) \ + mulpd %xmm8, %xmm9 ;\ + addpd %xmm9, %xmm0 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm11, %xmm1 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm8, %xmm13 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm8, %xmm3 ;\ + movapd 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL2(xx) \ + mulpd %xmm10, %xmm9 ;\ + addpd %xmm9, %xmm4 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm10, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm10, %xmm13 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm10, %xmm7 ;\ + movapd 10 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL3(xx) \ + mulpd %xmm12, %xmm15 ;\ + addpd %xmm15, %xmm0 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm12, %xmm11 ;\ + addpd %xmm11, %xmm1 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm12, %xmm13 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm12, %xmm3 ;\ + movapd 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL4(xx) \ + mulpd %xmm14, %xmm15 ;\ + addpd %xmm15, %xmm4 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm14, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm14, %xmm13 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm14, %xmm7 ;\ + movapd 14 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + +#define KERNEL5(xx) \ + mulpd %xmm8, %xmm9 ;\ + addpd %xmm9, %xmm0 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm11, %xmm1 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm8, %xmm13 ;\ + mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm8, %xmm3 ;\ + movapd 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL6(xx) \ + mulpd %xmm10, %xmm9 ;\ + addpd %xmm9, %xmm4 ;\ + movapd 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm10, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm10, %xmm13 ;\ + mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm10, %xmm7 ;\ + movapd 18 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL7(xx) \ + mulpd %xmm12, %xmm15 ;\ + addpd %xmm15, %xmm0 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm12, %xmm11 ;\ + addpd %xmm11, %xmm1 ;\ + movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm12, %xmm13 ;\ + mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm12, %xmm3 ;\ + movapd 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL8(xx) \ + mulpd %xmm14, %xmm15 ;\ + addpd %xmm15, %xmm4 ;\ + movapd 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm14, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 34 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm14, %xmm13 ;\ + mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm14, %xmm7 ;\ + movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movsd %xmm4, OFFSET + movsd %xmm4, KK + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, N + je .L40 + ALIGN_4 + +.L81: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + jle .L83 + ALIGN_4 + +.L82: + PREFETCH 56 * SIZE(B) + + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + addq $ 8 * SIZE, B + addq $16 * SIZE, BO + + movsd %xmm0, -16 * SIZE(BO) + movsd %xmm0, -15 * SIZE(BO) + movsd %xmm1, -14 * SIZE(BO) + movsd %xmm1, -13 * SIZE(BO) + movsd %xmm2, -12 * SIZE(BO) + movsd %xmm2, -11 * SIZE(BO) + movsd %xmm3, -10 * SIZE(BO) + movsd %xmm3, -9 * SIZE(BO) + movsd %xmm4, -8 * SIZE(BO) + movsd %xmm4, -7 * SIZE(BO) + movsd %xmm5, -6 * SIZE(BO) + movsd %xmm5, -5 * SIZE(BO) + movsd %xmm6, -4 * SIZE(BO) + movsd %xmm6, -3 * SIZE(BO) + movsd %xmm7, -2 * SIZE(BO) + movsd %xmm7, -1 * SIZE(BO) + + decq %rax + jne .L82 + ALIGN_4 + +.L83: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax + BRANCH + jle .L90 + ALIGN_4 + +.L84: + movsd 0 * SIZE(B), %xmm0 + + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm0, 1 * SIZE(BO) + + addq $1 * SIZE, B + addq $2 * SIZE, BO + decq %rax + jne .L84 + ALIGN_4 + +.L90: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(AO), %xmm12 + movapd 24 * SIZE(AO), %xmm14 + + PREFETCHW 4 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulpd %xmm9, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm8 + mulpd 6 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm2 + movapd 32 * SIZE(AO), %xmm8 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + addpd %xmm9, %xmm3 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 10 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm0 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movapd 6 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 14 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm2 + movapd 40 * SIZE(AO), %xmm10 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm9, %xmm3 + movapd 16 * SIZE(BO), %xmm9 + mulpd %xmm11, %xmm12 + mulpd 18 * SIZE(AO), %xmm11 + addpd %xmm12, %xmm0 + movapd 20 * SIZE(AO), %xmm12 + addpd %xmm11, %xmm1 + movapd 10 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm12 + mulpd 22 * SIZE(AO), %xmm11 + addpd %xmm12, %xmm2 + movapd 48 * SIZE(AO), %xmm12 + PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) + addpd %xmm11, %xmm3 + movapd 12 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm14 + mulpd 26 * SIZE(AO), %xmm11 + addpd %xmm14, %xmm0 + movapd 28 * SIZE(AO), %xmm14 + addpd %xmm11, %xmm1 + movapd 14 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm14 + mulpd 30 * SIZE(AO), %xmm11 + addpd %xmm14, %xmm2 + movapd 56 * SIZE(AO), %xmm14 + addpd %xmm11, %xmm3 + movapd 24 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L99 + ALIGN_4 + +.L96: + mulpd %xmm9, %xmm8 + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 2 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L99: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#else + movapd 0 * SIZE(AO), %xmm2 + movapd 2 * SIZE(AO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#endif + +#ifdef LN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 15 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 14 * SIZE(AO), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + movsd 13 * SIZE(AO), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + movsd 12 * SIZE(AO), %xmm7 + mulsd %xmm1, %xmm7 + subsd %xmm7, %xmm2 + + movsd 10 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 9 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm0 + movsd 8 * SIZE(AO), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 4 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef LT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(AO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + movsd 2 * SIZE(AO), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm2, %xmm7 + subsd %xmm7, %xmm1 + + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm3 + movsd 7 * SIZE(AO), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + + movsd 10 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 11 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + + movsd 15 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm1 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) + movsd %xmm3, 2 * SIZE(CO1) + movhpd %xmm3, 3 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) + movsd %xmm3, 2 * SIZE(CO1) + movhpd %xmm3, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BO) + movlpd %xmm2, 1 * SIZE(BO) + movhpd %xmm2, 2 * SIZE(BO) + movhpd %xmm2, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) + movapd %xmm3, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + ALIGN_4 + +.L101: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L105 + ALIGN_4 + +.L102: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movapd 2 * SIZE(AO), %xmm8 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 16 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 4 * SIZE(AO), %xmm8 + mulpd 4 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm2 + movapd 6 * SIZE(AO), %xmm8 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm3 + movapd 16 * SIZE(AO), %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + mulpd 10 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm0 + movapd 24 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm1 + movapd 12 * SIZE(AO), %xmm10 + mulpd 12 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm2 + movapd 14 * SIZE(AO), %xmm10 + mulpd 14 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm3 + movapd 24 * SIZE(AO), %xmm10 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L109 + ALIGN_4 + +.L106: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(AO), %xmm8 + movapd 2 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L106 + ALIGN_4 + +.L109: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm2 + subpd %xmm0, %xmm2 +#else + movapd 0 * SIZE(AO), %xmm2 + subpd %xmm0, %xmm2 +#endif + +#ifdef LN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 3 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 2 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef LT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(AO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 3 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BO) + movlpd %xmm2, 1 * SIZE(BO) + movhpd %xmm2, 2 * SIZE(BO) + movhpd %xmm2, 3 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + testq $1, M + je .L119 + ALIGN_4 + +.L111: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movsd 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movsd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulsd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movsd 1 * SIZE(AO), %xmm8 + addsd %xmm9, %xmm0 + movsd 16 * SIZE(BO), %xmm9 + mulsd 2 * SIZE(BO), %xmm8 + addsd %xmm8, %xmm1 + movsd 2 * SIZE(AO), %xmm8 + mulsd 4 * SIZE(BO), %xmm8 + addsd %xmm8, %xmm2 + movsd 3 * SIZE(AO), %xmm8 + mulsd 6 * SIZE(BO), %xmm8 + addsd %xmm8, %xmm3 + movsd 8 * SIZE(AO), %xmm8 + mulsd %xmm10, %xmm11 + movsd 5 * SIZE(AO), %xmm10 + addsd %xmm11, %xmm0 + movsd 24 * SIZE(BO), %xmm11 + mulsd 10 * SIZE(BO), %xmm10 + addsd %xmm10, %xmm1 + movsd 6 * SIZE(AO), %xmm10 + mulsd 12 * SIZE(BO), %xmm10 + addsd %xmm10, %xmm2 + movsd 7 * SIZE(AO), %xmm10 + mulsd 14 * SIZE(BO), %xmm10 + addsd %xmm10, %xmm3 + movsd 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulsd %xmm8, %xmm9 + movsd 1 * SIZE(AO), %xmm8 + addsd %xmm9, %xmm0 + movsd 2 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: + addsd %xmm2, %xmm0 + addsd %xmm3, %xmm1 + addsd %xmm1, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm2 + subsd %xmm0, %xmm2 +#else + movsd 0 * SIZE(AO), %xmm2 + subsd %xmm0, %xmm2 +#endif + +#ifdef LN + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef RN + movsd 0 * SIZE(B), %xmm0 + mulsd %xmm0, %xmm2 +#endif + +#ifdef RT + movsd 0 * SIZE(B), %xmm0 + mulsd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BO) + movlpd %xmm2, 1 * SIZE(BO) +#else + movsd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $1 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L119: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + + +.L40: + testq $2, N + je .L80 + ALIGN_4 + +.L41: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L43 + ALIGN_4 + +.L42: + PREFETCH 56 * SIZE(B) + + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + addq $ 8 * SIZE, B + addq $16 * SIZE, BO + + movsd %xmm0, -16 * SIZE(BO) + movsd %xmm0, -15 * SIZE(BO) + movsd %xmm1, -14 * SIZE(BO) + movsd %xmm1, -13 * SIZE(BO) + movsd %xmm2, -12 * SIZE(BO) + movsd %xmm2, -11 * SIZE(BO) + movsd %xmm3, -10 * SIZE(BO) + movsd %xmm3, -9 * SIZE(BO) + movsd %xmm4, -8 * SIZE(BO) + movsd %xmm4, -7 * SIZE(BO) + movsd %xmm5, -6 * SIZE(BO) + movsd %xmm5, -5 * SIZE(BO) + movsd %xmm6, -4 * SIZE(BO) + movsd %xmm6, -3 * SIZE(BO) + movsd %xmm7, -2 * SIZE(BO) + movsd %xmm7, -1 * SIZE(BO) + + decq %rax + jne .L42 + ALIGN_4 + +.L43: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L44: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm0, 1 * SIZE(BO) + movsd %xmm1, 2 * SIZE(BO) + movsd %xmm1, 3 * SIZE(BO) + + addq $2 * SIZE, B + addq $4 * SIZE, BO + decq %rax + jne .L44 + ALIGN_4 + +.L50: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 2), C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + + movapd 16 * SIZE(AO), %xmm12 + movapd 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(AO), %xmm14 + movapd 24 * SIZE(BO), %xmm15 + + PREFETCHW 4 * SIZE(CO1) + PREFETCHW 4 * SIZE(CO2) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L55 + ALIGN_4 + +.L52: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 2 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm4 + movapd 4 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm5 + movapd 4 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 4 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm4 + movapd 32 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm5 + movapd 32 * SIZE(AO), %xmm8 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm10, %xmm11 + mulpd 10 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm0 + movapd 8 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm1 + movapd 10 * SIZE(AO), %xmm10 + mulpd %xmm10, %xmm11 + mulpd 10 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm4 + movapd 12 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm5 + movapd 12 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm11 + mulpd 14 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm0 + movapd 12 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm1 + movapd 14 * SIZE(AO), %xmm10 + mulpd %xmm10, %xmm11 + mulpd 14 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm4 + movapd 40 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm5 + movapd 40 * SIZE(AO), %xmm10 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + mulpd %xmm12, %xmm13 + mulpd 18 * SIZE(BO), %xmm12 + addpd %xmm13, %xmm0 + movapd 16 * SIZE(BO), %xmm13 + addpd %xmm12, %xmm1 + movapd 18 * SIZE(AO), %xmm12 + mulpd %xmm12, %xmm13 + mulpd 18 * SIZE(BO), %xmm12 + addpd %xmm13, %xmm4 + movapd 20 * SIZE(BO), %xmm13 + addpd %xmm12, %xmm5 + movapd 20 * SIZE(AO), %xmm12 + + mulpd %xmm12, %xmm13 + mulpd 22 * SIZE(BO), %xmm12 + addpd %xmm13, %xmm0 + movapd 20 * SIZE(BO), %xmm13 + addpd %xmm12, %xmm1 + movapd 22 * SIZE(AO), %xmm12 + mulpd %xmm12, %xmm13 + mulpd 22 * SIZE(BO), %xmm12 + addpd %xmm13, %xmm4 + movapd 48 * SIZE(BO), %xmm13 + addpd %xmm12, %xmm5 + movapd 48 * SIZE(AO), %xmm12 + + PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) + mulpd %xmm14, %xmm15 + mulpd 26 * SIZE(BO), %xmm14 + addpd %xmm15, %xmm0 + movapd 24 * SIZE(BO), %xmm15 + addpd %xmm14, %xmm1 + movapd 26 * SIZE(AO), %xmm14 + mulpd %xmm14, %xmm15 + mulpd 26 * SIZE(BO), %xmm14 + addpd %xmm15, %xmm4 + movapd 28 * SIZE(BO), %xmm15 + addpd %xmm14, %xmm5 + movapd 28 * SIZE(AO), %xmm14 + + mulpd %xmm14, %xmm15 + mulpd 30 * SIZE(BO), %xmm14 + addpd %xmm15, %xmm0 + movapd 28 * SIZE(BO), %xmm15 + addpd %xmm14, %xmm1 + movapd 30 * SIZE(AO), %xmm14 + mulpd %xmm14, %xmm15 + mulpd 30 * SIZE(BO), %xmm14 + addpd %xmm15, %xmm4 + movapd 56 * SIZE(BO), %xmm15 + addpd %xmm14, %xmm5 + movapd 56 * SIZE(AO), %xmm14 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L59 + ALIGN_4 + +.L56: + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + movapd 2 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + movapd 4 * SIZE(AO), %xmm8 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L56 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm4, %xmm12 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm12 + + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm5 + movapd 4 * SIZE(B), %xmm9 + movapd 6 * SIZE(B), %xmm13 + + subpd %xmm0, %xmm1 + subpd %xmm8, %xmm5 + subpd %xmm4, %xmm9 + subpd %xmm12, %xmm13 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 + + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm1, %xmm10 + subpd %xmm5, %xmm11 +#endif + +#ifdef LN + movlpd 15 * SIZE(AO), %xmm0 + movhpd 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + movlpd 14 * SIZE(AO), %xmm2 + movhpd 14 * SIZE(AO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + movlpd 13 * SIZE(AO), %xmm4 + movhpd 13 * SIZE(AO), %xmm4 + mulpd %xmm13, %xmm4 + subpd %xmm4, %xmm5 + movlpd 12 * SIZE(AO), %xmm6 + movhpd 12 * SIZE(AO), %xmm6 + mulpd %xmm13, %xmm6 + subpd %xmm6, %xmm1 + + movlpd 10 * SIZE(AO), %xmm0 + movhpd 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + movlpd 9 * SIZE(AO), %xmm2 + movhpd 9 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm5 + movlpd 8 * SIZE(AO), %xmm4 + movhpd 8 * SIZE(AO), %xmm4 + mulpd %xmm9, %xmm4 + subpd %xmm4, %xmm1 + + movlpd 5 * SIZE(AO), %xmm0 + movhpd 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + movlpd 4 * SIZE(AO), %xmm2 + movhpd 4 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movlpd 2 * SIZE(AO), %xmm4 + movhpd 2 * SIZE(AO), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm9 + movlpd 3 * SIZE(AO), %xmm6 + movhpd 3 * SIZE(AO), %xmm6 + mulpd %xmm1, %xmm6 + subpd %xmm6, %xmm13 + + + movlpd 5 * SIZE(AO), %xmm0 + movhpd 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + + movlpd 6 * SIZE(AO), %xmm2 + movhpd 6 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm9 + movlpd 7 * SIZE(AO), %xmm4 + movhpd 7 * SIZE(AO), %xmm4 + mulpd %xmm5, %xmm4 + subpd %xmm4, %xmm13 + + movlpd 10 * SIZE(AO), %xmm0 + movhpd 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + movlpd 11 * SIZE(AO), %xmm2 + movhpd 11 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + + movlpd 15 * SIZE(AO), %xmm0 + movhpd 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm9, %xmm1 + subpd %xmm1, %xmm11 + + movlpd 3 * SIZE(B), %xmm0 + movhpd 3 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 +#endif + +#ifdef RT + movlpd 3 * SIZE(B), %xmm0 + movhpd 3 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movlpd 2 * SIZE(B), %xmm1 + movhpd 2 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + movlpd 2 * SIZE(B), %xmm1 + movhpd 2 * SIZE(B), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm9 + + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movsd %xmm13, 3 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + movhpd %xmm9, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + movapd %xmm9, 4 * SIZE(B) + movapd %xmm13, 6 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm5, 4 * SIZE(BO) + movlpd %xmm5, 5 * SIZE(BO) + movhpd %xmm5, 6 * SIZE(BO) + movhpd %xmm5, 7 * SIZE(BO) + movlpd %xmm9, 8 * SIZE(BO) + movlpd %xmm9, 9 * SIZE(BO) + movhpd %xmm9, 10 * SIZE(BO) + movhpd %xmm9, 11 * SIZE(BO) + movlpd %xmm13, 12 * SIZE(BO) + movlpd %xmm13, 13 * SIZE(BO) + movhpd %xmm13, 14 * SIZE(BO) + movhpd %xmm13, 15 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + ALIGN_4 + +.L61: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(BO), %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 4 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 2 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 32 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 4 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm11 + mulpd 10 * SIZE(BO), %xmm8 + addpd %xmm11, %xmm0 + movapd 12 * SIZE(BO), %xmm11 + addpd %xmm8, %xmm1 + movapd 6 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm11 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm11, %xmm2 + movapd 40 * SIZE(BO), %xmm11 + addpd %xmm8, %xmm3 + movapd 16 * SIZE(AO), %xmm8 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm10, %xmm13 + mulpd 18 * SIZE(BO), %xmm10 + addpd %xmm13, %xmm0 + movapd 20 * SIZE(BO), %xmm13 + addpd %xmm10, %xmm1 + movapd 10 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm13 + mulpd 22 * SIZE(BO), %xmm10 + addpd %xmm13, %xmm2 + movapd 48 * SIZE(BO), %xmm13 + addpd %xmm10, %xmm3 + movapd 12 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm15 + mulpd 26 * SIZE(BO), %xmm10 + addpd %xmm15, %xmm0 + movapd 28 * SIZE(BO), %xmm15 + addpd %xmm10, %xmm1 + movapd 14 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm15 + mulpd 30 * SIZE(BO), %xmm10 + addpd %xmm15, %xmm2 + movapd 56 * SIZE(BO), %xmm15 + addpd %xmm10, %xmm3 + movapd 24 * SIZE(AO), %xmm10 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L69 + ALIGN_4 + +.L66: + mulpd %xmm8, %xmm9 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 4 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 2 * SIZE(AO), %xmm8 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L69: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm5 + + subpd %xmm0, %xmm1 + subpd %xmm8, %xmm5 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm10 + + subpd %xmm0, %xmm8 + subpd %xmm1, %xmm10 +#endif + + +#ifdef LN + movlpd 3 * SIZE(AO), %xmm0 + movhpd 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + + movlpd 2 * SIZE(AO), %xmm2 + movhpd 2 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + + movlpd 3 * SIZE(AO), %xmm0 + movhpd 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + + movlpd 3 * SIZE(B), %xmm0 + movhpd 3 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 +#endif + +#ifdef RT + movlpd 3 * SIZE(B), %xmm0 + movhpd 3 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + + movlpd 2 * SIZE(B), %xmm1 + movhpd 2 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm5, 4 * SIZE(BO) + movlpd %xmm5, 5 * SIZE(BO) + movhpd %xmm5, 6 * SIZE(BO) + movhpd %xmm5, 7 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm10, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + testq $1, M + je .L79 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movsd 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movsd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movsd 16 * SIZE(BO), %xmm13 + movsd 24 * SIZE(BO), %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulsd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulsd 2 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm1 + movsd 1 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm9 + mulsd 6 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm2 + movsd 32 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm3 + movsd 2 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm11 + mulsd 10 * SIZE(BO), %xmm8 + addsd %xmm11, %xmm0 + movsd 12 * SIZE(BO), %xmm11 + addsd %xmm8, %xmm1 + movsd 3 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm11 + mulsd 14 * SIZE(BO), %xmm8 + addsd %xmm11, %xmm2 + movsd 40 * SIZE(BO), %xmm11 + addsd %xmm8, %xmm3 + movsd 8 * SIZE(AO), %xmm8 + + mulsd %xmm10, %xmm13 + mulsd 18 * SIZE(BO), %xmm10 + addsd %xmm13, %xmm0 + movsd 20 * SIZE(BO), %xmm13 + addsd %xmm10, %xmm1 + movsd 5 * SIZE(AO), %xmm10 + + mulsd %xmm10, %xmm13 + mulsd 22 * SIZE(BO), %xmm10 + addsd %xmm13, %xmm2 + movsd 48 * SIZE(BO), %xmm13 + addsd %xmm10, %xmm3 + movsd 6 * SIZE(AO), %xmm10 + + mulsd %xmm10, %xmm15 + mulsd 26 * SIZE(BO), %xmm10 + addsd %xmm15, %xmm0 + movsd 28 * SIZE(BO), %xmm15 + addsd %xmm10, %xmm1 + movsd 7 * SIZE(AO), %xmm10 + + mulsd %xmm10, %xmm15 + mulsd 30 * SIZE(BO), %xmm10 + addsd %xmm15, %xmm2 + movsd 56 * SIZE(BO), %xmm15 + addsd %xmm10, %xmm3 + movsd 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulsd %xmm8, %xmm9 + mulsd 2 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm0 + addsd %xmm8, %xmm1 + movsd 1 * SIZE(AO), %xmm8 + movsd 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addsd %xmm2, %xmm0 + addsd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm4 + movsd 1 * SIZE(B), %xmm5 +#else + movsd 0 * SIZE(AO), %xmm4 + movsd 1 * SIZE(AO), %xmm5 +#endif + + subsd %xmm0, %xmm4 + subsd %xmm1, %xmm5 + +#ifdef LN + movsd 0 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 +#endif + +#ifdef RN + mulsd 0 * SIZE(B), %xmm4 + movsd 1 * SIZE(B), %xmm1 + mulsd %xmm4, %xmm1 + subsd %xmm1, %xmm5 + + mulsd 3 * SIZE(B), %xmm5 +#endif + +#ifdef RT + mulsd 3 * SIZE(B), %xmm5 + + movlpd 2 * SIZE(B), %xmm1 + mulsd %xmm5, %xmm1 + subsd %xmm1, %xmm4 + + mulsd 0 * SIZE(B), %xmm4 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm4, 0 * SIZE(CO1) + movsd %xmm5, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm4, 0 * SIZE(B) + movsd %xmm5, 1 * SIZE(B) + + movsd %xmm4, 0 * SIZE(BO) + movsd %xmm4, 1 * SIZE(BO) + movsd %xmm5, 2 * SIZE(BO) + movsd %xmm5, 3 * SIZE(BO) +#else + movsd %xmm4, 0 * SIZE(AO) + movsd %xmm5, 1 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L80: + movq N, J + sarq $2, J # j = (n >> 2) + jle .L999 + +.L01: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + + addq %rax, %rax + ALIGN_4 + +.L02: + PREFETCHNTA 40 * SIZE(B) + + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + addq $16 * SIZE, BO + addq $ 8 * SIZE, B + + movsd %xmm0, -16 * SIZE(BO) + movsd %xmm0, -15 * SIZE(BO) + movsd %xmm1, -14 * SIZE(BO) + movsd %xmm1, -13 * SIZE(BO) + movsd %xmm2, -12 * SIZE(BO) + movsd %xmm2, -11 * SIZE(BO) + movsd %xmm3, -10 * SIZE(BO) + movsd %xmm3, -9 * SIZE(BO) + movsd %xmm4, -8 * SIZE(BO) + movsd %xmm4, -7 * SIZE(BO) + movsd %xmm5, -6 * SIZE(BO) + movsd %xmm5, -5 * SIZE(BO) + movsd %xmm6, -4 * SIZE(BO) + movsd %xmm6, -3 * SIZE(BO) + movsd %xmm7, -2 * SIZE(BO) + movsd %xmm7, -1 * SIZE(BO) + + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm0, 1 * SIZE(BO) + movsd %xmm1, 2 * SIZE(BO) + movsd %xmm1, 3 * SIZE(BO) + movsd %xmm2, 4 * SIZE(BO) + movsd %xmm2, 5 * SIZE(BO) + movsd %xmm3, 6 * SIZE(BO) + movsd %xmm3, 7 * SIZE(BO) + + addq $4 * SIZE, B + addq $8 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 4), C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(BO), %xmm9 + movapd 2 * SIZE(BO), %xmm11 + movapd 4 * SIZE(BO), %xmm13 + movapd 8 * SIZE(BO), %xmm15 + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 2 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movapd 4 * SIZE(AO), %xmm12 + pxor %xmm2, %xmm2 + movapd 6 * SIZE(AO), %xmm14 + pxor %xmm3, %xmm3 + + PREFETCHW 4 * SIZE(CO1) + pxor %xmm4, %xmm4 + PREFETCHW 4 * SIZE(CO2) + pxor %xmm5, %xmm5 + PREFETCHW 4 * SIZE(CO1, LDC, 2) + pxor %xmm6, %xmm6 + PREFETCHW 4 * SIZE(CO2, LDC, 2) + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-8, %rax + salq $4, %rax + je .L15 +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpq $64 * 2, %rax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpq $64 * 4, %rax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpq $64 * 6, %rax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) + + addq $16 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $64 * 8, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 4), BO # * 64 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L19 + ALIGN_4 + +.L16: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 0 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 4 * SIZE(AO), %xmm8 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm5 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + mulpd 6 * SIZE(BO), %xmm10 + addpd %xmm9, %xmm6 + movapd 8 * SIZE(BO), %xmm9 + addpd %xmm10, %xmm7 + movapd 6 * SIZE(AO), %xmm10 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L16 + ALIGN_4 + +.L19: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm10 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm10 + + movapd %xmm4, %xmm12 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm12 + + movapd %xmm6, %xmm14 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm14 + + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm3 + movapd 4 * SIZE(B), %xmm5 + movapd 6 * SIZE(B), %xmm7 + movapd 8 * SIZE(B), %xmm9 + movapd 10 * SIZE(B), %xmm11 + movapd 12 * SIZE(B), %xmm13 + movapd 14 * SIZE(B), %xmm15 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm8, %xmm5 + subpd %xmm10, %xmm7 + subpd %xmm4, %xmm9 + subpd %xmm6, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 + + movapd 8 * SIZE(AO), %xmm12 + movapd 10 * SIZE(AO), %xmm13 + movapd 12 * SIZE(AO), %xmm14 + movapd 14 * SIZE(AO), %xmm15 + + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm1, %xmm10 + subpd %xmm5, %xmm11 + subpd %xmm2, %xmm12 + subpd %xmm6, %xmm13 + subpd %xmm3, %xmm14 + subpd %xmm7, %xmm15 +#endif + +#ifdef LN + movlpd 15 * SIZE(AO), %xmm0 + movhpd 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm15 + + movlpd 14 * SIZE(AO), %xmm2 + movhpd 14 * SIZE(AO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + movlpd 14 * SIZE(AO), %xmm2 + movhpd 14 * SIZE(AO), %xmm2 + mulpd %xmm15, %xmm2 + subpd %xmm2, %xmm11 + + movlpd 13 * SIZE(AO), %xmm4 + movhpd 13 * SIZE(AO), %xmm4 + mulpd %xmm13, %xmm4 + subpd %xmm4, %xmm5 + movlpd 13 * SIZE(AO), %xmm4 + movhpd 13 * SIZE(AO), %xmm4 + mulpd %xmm15, %xmm4 + subpd %xmm4, %xmm7 + + movlpd 12 * SIZE(AO), %xmm6 + movhpd 12 * SIZE(AO), %xmm6 + mulpd %xmm13, %xmm6 + subpd %xmm6, %xmm1 + movlpd 12 * SIZE(AO), %xmm6 + movhpd 12 * SIZE(AO), %xmm6 + mulpd %xmm15, %xmm6 + subpd %xmm6, %xmm3 + + movlpd 10 * SIZE(AO), %xmm0 + movhpd 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + mulpd %xmm0, %xmm11 + + movlpd 9 * SIZE(AO), %xmm2 + movhpd 9 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm5 + movlpd 9 * SIZE(AO), %xmm2 + movhpd 9 * SIZE(AO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm7 + + movlpd 8 * SIZE(AO), %xmm4 + movhpd 8 * SIZE(AO), %xmm4 + mulpd %xmm9, %xmm4 + subpd %xmm4, %xmm1 + movlpd 8 * SIZE(AO), %xmm4 + movhpd 8 * SIZE(AO), %xmm4 + mulpd %xmm11, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 5 * SIZE(AO), %xmm0 + movhpd 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movlpd 4 * SIZE(AO), %xmm2 + movhpd 4 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + movlpd 4 * SIZE(AO), %xmm2 + movhpd 4 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm3 + + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 + + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm3, %xmm2 + subpd %xmm2, %xmm7 + + movlpd 2 * SIZE(AO), %xmm4 + movhpd 2 * SIZE(AO), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm9 + movlpd 2 * SIZE(AO), %xmm4 + movhpd 2 * SIZE(AO), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm11 + + movlpd 3 * SIZE(AO), %xmm6 + movhpd 3 * SIZE(AO), %xmm6 + mulpd %xmm1, %xmm6 + subpd %xmm6, %xmm13 + movlpd 3 * SIZE(AO), %xmm6 + movhpd 3 * SIZE(AO), %xmm6 + mulpd %xmm3, %xmm6 + subpd %xmm6, %xmm15 + + movlpd 5 * SIZE(AO), %xmm0 + movhpd 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movlpd 6 * SIZE(AO), %xmm2 + movhpd 6 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm9 + movlpd 6 * SIZE(AO), %xmm2 + movhpd 6 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm11 + + movlpd 7 * SIZE(AO), %xmm4 + movhpd 7 * SIZE(AO), %xmm4 + mulpd %xmm5, %xmm4 + subpd %xmm4, %xmm13 + movlpd 7 * SIZE(AO), %xmm4 + movhpd 7 * SIZE(AO), %xmm4 + mulpd %xmm7, %xmm4 + subpd %xmm4, %xmm15 + + movlpd 10 * SIZE(AO), %xmm0 + movhpd 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + mulpd %xmm0, %xmm11 + + movlpd 11 * SIZE(AO), %xmm2 + movhpd 11 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + movlpd 11 * SIZE(AO), %xmm2 + movhpd 11 * SIZE(AO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm15 + + movlpd 15 * SIZE(AO), %xmm0 + movhpd 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm15 +#endif + + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm9, %xmm1 + subpd %xmm1, %xmm11 + + movlpd 2 * SIZE(B), %xmm2 + movhpd 2 * SIZE(B), %xmm2 + mulpd %xmm8, %xmm2 + subpd %xmm2, %xmm12 + movlpd 2 * SIZE(B), %xmm2 + movhpd 2 * SIZE(B), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + + movlpd 3 * SIZE(B), %xmm3 + movhpd 3 * SIZE(B), %xmm3 + mulpd %xmm8, %xmm3 + subpd %xmm3, %xmm14 + movlpd 3 * SIZE(B), %xmm3 + movhpd 3 * SIZE(B), %xmm3 + mulpd %xmm9, %xmm3 + subpd %xmm3, %xmm15 + + movlpd 5 * SIZE(B), %xmm0 + movhpd 5 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movlpd 6 * SIZE(B), %xmm1 + movhpd 6 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm12 + movlpd 6 * SIZE(B), %xmm1 + movhpd 6 * SIZE(B), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm13 + + movlpd 7 * SIZE(B), %xmm2 + movhpd 7 * SIZE(B), %xmm2 + mulpd %xmm10, %xmm2 + subpd %xmm2, %xmm14 + movlpd 7 * SIZE(B), %xmm2 + movhpd 7 * SIZE(B), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm15 + + movlpd 10 * SIZE(B), %xmm0 + movhpd 10 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + + movlpd 11 * SIZE(B), %xmm1 + movhpd 11 * SIZE(B), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm14 + movlpd 11 * SIZE(B), %xmm1 + movhpd 11 * SIZE(B), %xmm1 + mulpd %xmm13, %xmm1 + subpd %xmm1, %xmm15 + + movlpd 15 * SIZE(B), %xmm0 + movhpd 15 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 +#endif + +#ifdef RT + movlpd 15 * SIZE(B), %xmm0 + movhpd 15 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 + + movlpd 14 * SIZE(B), %xmm1 + movhpd 14 * SIZE(B), %xmm1 + mulpd %xmm14, %xmm1 + subpd %xmm1, %xmm12 + movlpd 14 * SIZE(B), %xmm1 + movhpd 14 * SIZE(B), %xmm1 + mulpd %xmm15, %xmm1 + subpd %xmm1, %xmm13 + + movlpd 13 * SIZE(B), %xmm2 + movhpd 13 * SIZE(B), %xmm2 + mulpd %xmm14, %xmm2 + subpd %xmm2, %xmm10 + movlpd 13 * SIZE(B), %xmm2 + movhpd 13 * SIZE(B), %xmm2 + mulpd %xmm15, %xmm2 + subpd %xmm2, %xmm11 + + movlpd 12 * SIZE(B), %xmm3 + movhpd 12 * SIZE(B), %xmm3 + mulpd %xmm14, %xmm3 + subpd %xmm3, %xmm8 + movlpd 12 * SIZE(B), %xmm3 + movhpd 12 * SIZE(B), %xmm3 + mulpd %xmm15, %xmm3 + subpd %xmm3, %xmm9 + + movlpd 10 * SIZE(B), %xmm0 + movhpd 10 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + + movlpd 9 * SIZE(B), %xmm1 + movhpd 9 * SIZE(B), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm10 + movlpd 9 * SIZE(B), %xmm1 + movhpd 9 * SIZE(B), %xmm1 + mulpd %xmm13, %xmm1 + subpd %xmm1, %xmm11 + + movlpd 8 * SIZE(B), %xmm2 + movhpd 8 * SIZE(B), %xmm2 + mulpd %xmm12, %xmm2 + subpd %xmm2, %xmm8 + movlpd 8 * SIZE(B), %xmm2 + movhpd 8 * SIZE(B), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + + movlpd 5 * SIZE(B), %xmm0 + movhpd 5 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movlpd 4 * SIZE(B), %xmm1 + movhpd 4 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + movlpd 4 * SIZE(B), %xmm1 + movhpd 4 * SIZE(B), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm9 + + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movsd %xmm13, 3 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + movhpd %xmm9, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) + + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movsd %xmm11, 2 * SIZE(CO1, LDC, 2) + movsd %xmm15, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm11, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) + + movsd %xmm12, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) + movsd %xmm13, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm13, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm14, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) + movsd %xmm15, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + movapd %xmm5, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + movapd %xmm9, 8 * SIZE(B) + movapd %xmm11, 10 * SIZE(B) + movapd %xmm13, 12 * SIZE(B) + movapd %xmm15, 14 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) + movlpd %xmm5, 8 * SIZE(BO) + movlpd %xmm5, 9 * SIZE(BO) + movhpd %xmm5, 10 * SIZE(BO) + movhpd %xmm5, 11 * SIZE(BO) + movlpd %xmm7, 12 * SIZE(BO) + movlpd %xmm7, 13 * SIZE(BO) + movhpd %xmm7, 14 * SIZE(BO) + movhpd %xmm7, 15 * SIZE(BO) + movlpd %xmm9, 16 * SIZE(BO) + movlpd %xmm9, 17 * SIZE(BO) + movhpd %xmm9, 18 * SIZE(BO) + movhpd %xmm9, 19 * SIZE(BO) + movlpd %xmm11, 20 * SIZE(BO) + movlpd %xmm11, 21 * SIZE(BO) + movhpd %xmm11, 22 * SIZE(BO) + movhpd %xmm11, 23 * SIZE(BO) + movlpd %xmm13, 24 * SIZE(BO) + movlpd %xmm13, 25 * SIZE(BO) + movhpd %xmm13, 26 * SIZE(BO) + movhpd %xmm13, 27 * SIZE(BO) + movlpd %xmm15, 28 * SIZE(BO) + movlpd %xmm15, 29 * SIZE(BO) + movhpd %xmm15, 30 * SIZE(BO) + movhpd %xmm15, 31 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) + movapd %xmm12, 8 * SIZE(AO) + movapd %xmm13, 10 * SIZE(AO) + movapd %xmm14, 12 * SIZE(AO) + movapd %xmm15, 14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $3, M + je .L39 + + testq $2, M + je .L30 + ALIGN_4 + +.L21: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(BO), %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 32 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 2 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movapd 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm1 + movapd 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm11, %xmm2 + movapd 40 * SIZE(BO), %xmm11 + addpd %xmm8, %xmm3 + movapd 4 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm13 + addpd %xmm13, %xmm0 + movapd 18 * SIZE(BO), %xmm13 + mulpd %xmm8, %xmm13 + addpd %xmm13, %xmm1 + movapd 20 * SIZE(BO), %xmm13 + mulpd %xmm8, %xmm13 + mulpd 22 * SIZE(BO), %xmm8 + addpd %xmm13, %xmm2 + movapd 48 * SIZE(BO), %xmm13 + addpd %xmm8, %xmm3 + movapd 6 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm15 + addpd %xmm15, %xmm0 + movapd 26 * SIZE(BO), %xmm15 + mulpd %xmm8, %xmm15 + addpd %xmm15, %xmm1 + movapd 28 * SIZE(BO), %xmm15 + mulpd %xmm8, %xmm15 + mulpd 30 * SIZE(BO), %xmm8 + addpd %xmm15, %xmm2 + movapd 56 * SIZE(BO), %xmm15 + addpd %xmm8, %xmm3 + movapd 16 * SIZE(AO), %xmm8 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movapd 34 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm1 + movapd 36 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + mulpd 38 * SIZE(BO), %xmm10 + addpd %xmm9, %xmm2 + movapd 64 * SIZE(BO), %xmm9 + addpd %xmm10, %xmm3 + movapd 10 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movapd 42 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movapd 44 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + mulpd 46 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm2 + movapd 72 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm3 + movapd 12 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm13 + addpd %xmm13, %xmm0 + movapd 50 * SIZE(BO), %xmm13 + mulpd %xmm10, %xmm13 + addpd %xmm13, %xmm1 + movapd 52 * SIZE(BO), %xmm13 + mulpd %xmm10, %xmm13 + mulpd 54 * SIZE(BO), %xmm10 + addpd %xmm13, %xmm2 + movapd 80 * SIZE(BO), %xmm13 + addpd %xmm10, %xmm3 + movapd 14 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm15 + addpd %xmm15, %xmm0 + movapd 58 * SIZE(BO), %xmm15 + mulpd %xmm10, %xmm15 + addpd %xmm15, %xmm1 + movapd 60 * SIZE(BO), %xmm15 + mulpd %xmm10, %xmm15 + mulpd 62 * SIZE(BO), %xmm10 + addpd %xmm15, %xmm2 + movapd 88 * SIZE(BO), %xmm15 + addpd %xmm10, %xmm3 + movapd 24 * SIZE(AO), %xmm10 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L29 + ALIGN_4 + +.L26: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 8 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 2 * SIZE(AO), %xmm8 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L29: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm10 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm10 + + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm3 + movapd 4 * SIZE(B), %xmm5 + movapd 6 * SIZE(B), %xmm7 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm8, %xmm5 + subpd %xmm10, %xmm7 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm10 + movapd 4 * SIZE(AO), %xmm12 + movapd 6 * SIZE(AO), %xmm14 + + subpd %xmm0, %xmm8 + subpd %xmm1, %xmm10 + subpd %xmm2, %xmm12 + subpd %xmm3, %xmm14 +#endif + +#ifdef LN + movlpd 3 * SIZE(AO), %xmm0 + movhpd 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movlpd 2 * SIZE(AO), %xmm2 + movhpd 2 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + movlpd 2 * SIZE(AO), %xmm2 + movhpd 2 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm3 + + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 + + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm3, %xmm2 + subpd %xmm2, %xmm7 + + movlpd 3 * SIZE(AO), %xmm0 + movhpd 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movlpd 2 * SIZE(B), %xmm2 + movhpd 2 * SIZE(B), %xmm2 + mulpd %xmm8, %xmm2 + subpd %xmm2, %xmm12 + movlpd 3 * SIZE(B), %xmm3 + movhpd 3 * SIZE(B), %xmm3 + mulpd %xmm8, %xmm3 + subpd %xmm3, %xmm14 + + movlpd 5 * SIZE(B), %xmm0 + movhpd 5 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + movlpd 6 * SIZE(B), %xmm1 + movhpd 6 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm12 + movlpd 7 * SIZE(B), %xmm2 + movhpd 7 * SIZE(B), %xmm2 + mulpd %xmm10, %xmm2 + subpd %xmm2, %xmm14 + + movlpd 10 * SIZE(B), %xmm0 + movhpd 10 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm12 + + movlpd 11 * SIZE(B), %xmm1 + movhpd 11 * SIZE(B), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm14 + + movlpd 15 * SIZE(B), %xmm0 + movhpd 15 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm14 +#endif + +#ifdef RT + movlpd 15 * SIZE(B), %xmm0 + movhpd 15 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm14 + + movlpd 14 * SIZE(B), %xmm1 + movhpd 14 * SIZE(B), %xmm1 + mulpd %xmm14, %xmm1 + subpd %xmm1, %xmm12 + movlpd 13 * SIZE(B), %xmm2 + movhpd 13 * SIZE(B), %xmm2 + mulpd %xmm14, %xmm2 + subpd %xmm2, %xmm10 + movlpd 12 * SIZE(B), %xmm3 + movhpd 12 * SIZE(B), %xmm3 + mulpd %xmm14, %xmm3 + subpd %xmm3, %xmm8 + + movlpd 10 * SIZE(B), %xmm0 + movhpd 10 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm12 + movlpd 9 * SIZE(B), %xmm1 + movhpd 9 * SIZE(B), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm10 + movlpd 8 * SIZE(B), %xmm2 + movhpd 8 * SIZE(B), %xmm2 + mulpd %xmm12, %xmm2 + subpd %xmm2, %xmm8 + + movlpd 5 * SIZE(B), %xmm0 + movhpd 5 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + movlpd 4 * SIZE(B), %xmm1 + movhpd 4 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + + movsd %xmm12, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) + + movsd %xmm14, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + movapd %xmm5, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) + movlpd %xmm5, 8 * SIZE(BO) + movlpd %xmm5, 9 * SIZE(BO) + movhpd %xmm5, 10 * SIZE(BO) + movhpd %xmm5, 11 * SIZE(BO) + movlpd %xmm7, 12 * SIZE(BO) + movlpd %xmm7, 13 * SIZE(BO) + movhpd %xmm7, 14 * SIZE(BO) + movhpd %xmm7, 15 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm10, 2 * SIZE(AO) + movapd %xmm12, 4 * SIZE(AO) + movapd %xmm14, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + je .L39 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movsd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movsd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movsd 16 * SIZE(BO), %xmm13 + movsd 24 * SIZE(BO), %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm1 + movsd 4 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + mulsd 6 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm2 + movsd 32 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm3 + movsd 1 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm11 + addsd %xmm11, %xmm0 + movsd 10 * SIZE(BO), %xmm11 + mulsd %xmm8, %xmm11 + addsd %xmm11, %xmm1 + movsd 12 * SIZE(BO), %xmm11 + mulsd %xmm8, %xmm11 + mulsd 14 * SIZE(BO), %xmm8 + addsd %xmm11, %xmm2 + movsd 40 * SIZE(BO), %xmm11 + addsd %xmm8, %xmm3 + movsd 2 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm13 + addsd %xmm13, %xmm0 + movsd 18 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm13 + addsd %xmm13, %xmm1 + movsd 20 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm13 + mulsd 22 * SIZE(BO), %xmm8 + addsd %xmm13, %xmm2 + movsd 48 * SIZE(BO), %xmm13 + addsd %xmm8, %xmm3 + movsd 3 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm15 + addsd %xmm15, %xmm0 + movsd 26 * SIZE(BO), %xmm15 + mulsd %xmm8, %xmm15 + addsd %xmm15, %xmm1 + movsd 28 * SIZE(BO), %xmm15 + mulsd %xmm8, %xmm15 + mulsd 30 * SIZE(BO), %xmm8 + addsd %xmm15, %xmm2 + movsd 56 * SIZE(BO), %xmm15 + addsd %xmm8, %xmm3 + movsd 4 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm0 + movsd 34 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm1 + movsd 36 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + mulsd 38 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm2 + movsd 64 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm3 + movsd 5 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm11 + addsd %xmm11, %xmm0 + movsd 42 * SIZE(BO), %xmm11 + mulsd %xmm8, %xmm11 + addsd %xmm11, %xmm1 + movsd 44 * SIZE(BO), %xmm11 + mulsd %xmm8, %xmm11 + mulsd 46 * SIZE(BO), %xmm8 + addsd %xmm11, %xmm2 + movsd 72 * SIZE(BO), %xmm11 + addsd %xmm8, %xmm3 + movsd 6 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm13 + addsd %xmm13, %xmm0 + movsd 50 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm13 + addsd %xmm13, %xmm1 + movsd 52 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm13 + mulsd 54 * SIZE(BO), %xmm8 + addsd %xmm13, %xmm2 + movsd 80 * SIZE(BO), %xmm13 + addsd %xmm8, %xmm3 + movsd 7 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm15 + addsd %xmm15, %xmm0 + movsd 58 * SIZE(BO), %xmm15 + mulsd %xmm8, %xmm15 + addsd %xmm15, %xmm1 + movsd 60 * SIZE(BO), %xmm15 + mulsd %xmm8, %xmm15 + mulsd 62 * SIZE(BO), %xmm8 + addsd %xmm15, %xmm2 + movsd 88 * SIZE(BO), %xmm15 + addsd %xmm8, %xmm3 + movsd 8 * SIZE(AO), %xmm8 + + addq $ 8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm0 + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm1 + movsd 4 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + mulsd 6 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm2 + movsd 8 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm3 + movsd 1 * SIZE(AO), %xmm8 + + addq $1 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm4 + movsd 1 * SIZE(B), %xmm5 + movsd 2 * SIZE(B), %xmm6 + movsd 3 * SIZE(B), %xmm7 +#else + movsd 0 * SIZE(AO), %xmm4 + movsd 1 * SIZE(AO), %xmm5 + movsd 2 * SIZE(AO), %xmm6 + movsd 3 * SIZE(AO), %xmm7 +#endif + + subsd %xmm0, %xmm4 + subsd %xmm1, %xmm5 + subsd %xmm2, %xmm6 + subsd %xmm3, %xmm7 + +#ifdef LN + movsd 0 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + mulsd %xmm0, %xmm6 + mulsd %xmm0, %xmm7 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + mulsd %xmm0, %xmm6 + mulsd %xmm0, %xmm7 +#endif + +#ifdef RN + mulsd 0 * SIZE(B), %xmm4 + movlpd 1 * SIZE(B), %xmm1 + mulsd %xmm4, %xmm1 + subsd %xmm1, %xmm5 + movlpd 2 * SIZE(B), %xmm2 + mulsd %xmm4, %xmm2 + subsd %xmm2, %xmm6 + movlpd 3 * SIZE(B), %xmm3 + mulsd %xmm4, %xmm3 + subsd %xmm3, %xmm7 + + mulsd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm1 + mulsd %xmm5, %xmm1 + subsd %xmm1, %xmm6 + movlpd 7 * SIZE(B), %xmm2 + mulsd %xmm5, %xmm2 + subsd %xmm2, %xmm7 + + mulsd 10 * SIZE(B), %xmm6 + movlpd 11 * SIZE(B), %xmm1 + mulsd %xmm6, %xmm1 + subsd %xmm1, %xmm7 + + mulsd 15 * SIZE(B), %xmm7 +#endif + +#ifdef RT + mulsd 15 * SIZE(B), %xmm7 + + movlpd 14 * SIZE(B), %xmm1 + mulsd %xmm7, %xmm1 + subsd %xmm1, %xmm6 + movlpd 13 * SIZE(B), %xmm2 + mulsd %xmm7, %xmm2 + subsd %xmm2, %xmm5 + movlpd 12 * SIZE(B), %xmm3 + mulsd %xmm7, %xmm3 + subsd %xmm3, %xmm4 + + mulsd 10 * SIZE(B), %xmm6 + + movlpd 9 * SIZE(B), %xmm1 + mulsd %xmm6, %xmm1 + subsd %xmm1, %xmm5 + movlpd 8 * SIZE(B), %xmm2 + mulsd %xmm6, %xmm2 + subsd %xmm2, %xmm4 + + mulsd 5 * SIZE(B), %xmm5 + + movlpd 4 * SIZE(B), %xmm1 + mulsd %xmm5, %xmm1 + subsd %xmm1, %xmm4 + + mulsd 0 * SIZE(B), %xmm4 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm4, 0 * SIZE(CO1) + movsd %xmm5, 0 * SIZE(CO2) + movsd %xmm6, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 0 * SIZE(CO2, LDC, 2) + +#if defined(LN) || defined(LT) + movsd %xmm4, 0 * SIZE(B) + movsd %xmm5, 1 * SIZE(B) + movsd %xmm6, 2 * SIZE(B) + movsd %xmm7, 3 * SIZE(B) + + movsd %xmm4, 0 * SIZE(BO) + movsd %xmm4, 1 * SIZE(BO) + movsd %xmm5, 2 * SIZE(BO) + movsd %xmm5, 3 * SIZE(BO) + movsd %xmm6, 4 * SIZE(BO) + movsd %xmm6, 5 * SIZE(BO) + movsd %xmm7, 6 * SIZE(BO) + movsd %xmm7, 7 * SIZE(BO) +#else + movsd %xmm4, 0 * SIZE(AO) + movsd %xmm5, 1 * SIZE(AO) + movsd %xmm6, 2 * SIZE(AO) + movsd %xmm7, 3 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq J # j -- + jg .L01 + ALIGN_4 + +.L999: + movq %rbx, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_RT_4x4_sse3.S b/kernel/x86_64/trsm_kernel_RT_4x4_sse3.S new file mode 100644 index 0000000..f0e8bf9 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_RT_4x4_sse3.S @@ -0,0 +1,3844 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbx +#define KK %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define KKK 56(%rsp) +#define AORIG 64(%rsp) + +#else + +#define STACKSIZE 272 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define KKK 232(%rsp) +#define AORIG 240(%rsp) + +#endif + +#define PREFETCH prefetcht1 +#define PREFETCHSIZE (16 * 12 + 3) +#define PREFETCH_R (4 * 4 + 0) + +#define KERNEL1(address) \ + mulpd %xmm8, %xmm9 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ + addpd %xmm9, %xmm0;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm1;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm2;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm3;\ + movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL2(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm4;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm5;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm6;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm7;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL3(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm0;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm1;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm2;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm3;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL4(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm4;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm5;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm6;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm7;\ + movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL5(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm0;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm1;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm2;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm3;\ + movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL6(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm4;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm5;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm6;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm7;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL7(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm0;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm1;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm2;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm3;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL8(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm4;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm5;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm6;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm7;\ + movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL9(address) \ + mulpd %xmm12, %xmm13;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ + addpd %xmm13, %xmm0;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm1;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm2;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm3;\ + movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL10(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm4;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm5;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm6;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm7;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL11(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm0;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm1;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm2;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm3;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL12(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm4;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm5;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm6;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm7;\ + movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL13(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm0;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm1;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm2;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm3;\ + movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL14(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm4;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm5;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm6;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm7;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL15(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm0;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm1;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm2;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm3;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL16(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm4;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm5;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm6;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm7;\ + movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + movq KK, OFFSET + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, N + je .L80 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 4 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifdef HAVE_3DNOW + prefetchw 4 * SIZE(CO1) +#else + prefetchnta 4 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulpd %xmm9, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm8 + mulpd 6 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm2 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 10 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm0 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 14 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm2 + movapd 24 * SIZE(AO), %xmm10 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm9, %xmm3 + movddup 8 * SIZE(BO), %xmm9 + mulpd %xmm11, %xmm8 + mulpd 18 * SIZE(AO), %xmm11 + addpd %xmm8, %xmm0 + movapd 20 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 5 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm8 + mulpd 22 * SIZE(AO), %xmm11 + addpd %xmm8, %xmm2 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 6 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm10 + mulpd 26 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm0 + movapd 28 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 7 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm10 + mulpd 30 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm2 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L99 + ALIGN_4 + +.L96: + mulpd %xmm9, %xmm8 + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 1 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L99: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm2 + movapd 2 * SIZE(BO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#else + movapd 0 * SIZE(AO), %xmm2 + movapd 2 * SIZE(AO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#endif + +#ifdef LN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 15 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 14 * SIZE(AO), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + movsd 13 * SIZE(AO), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + movsd 12 * SIZE(AO), %xmm7 + mulsd %xmm1, %xmm7 + subsd %xmm7, %xmm2 + + movsd 10 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 9 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm0 + movsd 8 * SIZE(AO), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 4 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef LT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(AO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + movsd 2 * SIZE(AO), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm2, %xmm7 + subsd %xmm7, %xmm1 + + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm3 + movsd 7 * SIZE(AO), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + + movsd 10 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 11 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + + movsd 15 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm1 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RT + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) + movsd %xmm3, 2 * SIZE(CO1) + movhpd %xmm3, 3 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) + movsd %xmm3, 2 * SIZE(CO1) + movhpd %xmm3, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BO) + movapd %xmm3, 2 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) + movapd %xmm3, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + ALIGN_4 + +.L101: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 4 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L105 + ALIGN_4 + +.L102: + mulpd %xmm9, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movddup 1 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm0 + mulpd 2 * SIZE(AO), %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd 4 * SIZE(AO), %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd 6 * SIZE(AO), %xmm9 + addpd %xmm9, %xmm3 + movddup 8 * SIZE(BO), %xmm9 + mulpd %xmm11, %xmm10 + movddup 5 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm0 + mulpd 10 * SIZE(AO), %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 6 * SIZE(BO), %xmm11 + mulpd 12 * SIZE(AO), %xmm11 + addpd %xmm11, %xmm2 + movddup 7 * SIZE(BO), %xmm11 + mulpd 14 * SIZE(AO), %xmm11 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $ 8 * SIZE, BO + decq %rax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L109 + ALIGN_4 + +.L106: + mulpd %xmm9, %xmm8 + movddup 1 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm0 + movapd 2 * SIZE(AO), %xmm8 + + addq $2 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L106 + ALIGN_4 + +.L109: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm2 + subpd %xmm0, %xmm2 +#else + movapd 0 * SIZE(AO), %xmm2 + subpd %xmm0, %xmm2 +#endif + +#ifdef LN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 3 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 2 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef LT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(AO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 3 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef RT + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + testq $1, M + je .L119 + ALIGN_4 + +.L111: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movsd 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movsd 4 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulpd %xmm9, %xmm8 + movapd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + mulpd 2 * SIZE(BO), %xmm9 + movapd 8 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm1 + movapd 8 * SIZE(AO), %xmm9 + mulpd %xmm11, %xmm10 + movapd 6 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm0 + mulpd 6 * SIZE(BO), %xmm11 + movapd 12 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm1 + movapd 12 * SIZE(AO), %xmm11 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulsd 0 * SIZE(BO), %xmm9 + addsd %xmm9, %xmm0 + movsd 1 * SIZE(AO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: + addpd %xmm1, %xmm0 + haddpd %xmm0, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm2 + subsd %xmm0, %xmm2 +#else + movsd 0 * SIZE(AO), %xmm2 + subsd %xmm0, %xmm2 +#endif + +#ifdef LN + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef RN + movsd 0 * SIZE(BO), %xmm0 + mulsd %xmm0, %xmm2 +#endif + +#ifdef RT + movsd 0 * SIZE(BO), %xmm0 + mulsd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(BO) +#else + movsd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L119: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_2 + +.L80: + testq $2, N + je .L40 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + +#ifdef HAVE_3DNOW + prefetchw 4 * SIZE(CO1) + prefetchw 4 * SIZE(CO2) +#else + prefetchnta 4 * SIZE(CO1) + prefetchnta 4 * SIZE(CO2) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L55 + ALIGN_4 + +.L52: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm5 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm5 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm5 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm5 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 18 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 8 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 20 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 22 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 24 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 26 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 28 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 30 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 24 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L59 + ALIGN_4 + +.L56: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L56 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm4, %xmm12 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm12 + + movapd 0 * SIZE(BO), %xmm1 + movapd 2 * SIZE(BO), %xmm5 + movapd 4 * SIZE(BO), %xmm9 + movapd 6 * SIZE(BO), %xmm13 + + subpd %xmm0, %xmm1 + subpd %xmm8, %xmm5 + subpd %xmm4, %xmm9 + subpd %xmm12, %xmm13 +#else + + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 + + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm1, %xmm10 + subpd %xmm5, %xmm11 +#endif + + +#ifdef LN + movddup 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + + movddup 14 * SIZE(AO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + movddup 13 * SIZE(AO), %xmm4 + mulpd %xmm13, %xmm4 + subpd %xmm4, %xmm5 + movddup 12 * SIZE(AO), %xmm6 + mulpd %xmm13, %xmm6 + subpd %xmm6, %xmm1 + + movddup 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + movddup 9 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm5 + movddup 8 * SIZE(AO), %xmm4 + mulpd %xmm9, %xmm4 + subpd %xmm4, %xmm1 + + movddup 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + movddup 4 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 +#endif + + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movddup 2 * SIZE(AO), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm9 + movddup 3 * SIZE(AO), %xmm6 + mulpd %xmm1, %xmm6 + subpd %xmm6, %xmm13 + + movddup 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + + movddup 6 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm9 + movddup 7 * SIZE(AO), %xmm4 + mulpd %xmm5, %xmm4 + subpd %xmm4, %xmm13 + + movddup 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + + movddup 11 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + + movddup 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm9, %xmm1 + subpd %xmm1, %xmm11 + + movddup 3 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 +#endif + +#ifdef RT + movddup 3 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movddup 2 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + movddup 2 * SIZE(BO), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm9 + + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movsd %xmm13, 3 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + movhpd %xmm9, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(BO) + movapd %xmm5, 2 * SIZE(BO) + movapd %xmm9, 4 * SIZE(BO) + movapd %xmm13, 6 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + ALIGN_4 + +.L61: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L69 + ALIGN_4 + +.L66: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L69: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd 0 * SIZE(BO), %xmm1 + movapd 2 * SIZE(BO), %xmm5 + + subpd %xmm0, %xmm1 + subpd %xmm8, %xmm5 +#else + + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm10 + + subpd %xmm0, %xmm8 + subpd %xmm1, %xmm10 +#endif + +#ifdef LN + movddup 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + movddup 2 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + + movddup 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + + movddup 3 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 +#endif + +#ifdef RT + movddup 3 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + + movddup 2 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(BO) + movapd %xmm5, 2 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm10, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + testq $1, M + je .L79 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movddup 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + movapd 16 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movddup 2 * SIZE(AO), %xmm8 + mulpd 4 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm2 + movddup 3 * SIZE(AO), %xmm8 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm3 + movddup 8 * SIZE(AO), %xmm8 + mulpd %xmm10, %xmm11 + movddup 5 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm0 + mulpd 10 * SIZE(BO), %xmm10 + movapd 24 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm1 + movddup 6 * SIZE(AO), %xmm10 + mulpd 12 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm2 + movddup 7 * SIZE(AO), %xmm10 + mulpd 14 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm3 + movddup 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm2 + subpd %xmm0, %xmm2 +#else + movapd 0 * SIZE(AO), %xmm2 + subpd %xmm0, %xmm2 +#endif + +#ifdef LN + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef RN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 0 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(BO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 3 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef RT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 3 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L40: + movq N, J + sarq $2, J # j = (n >> 2) + jle .L999 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: + +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(AO), %xmm12 + movddup 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(AO), %xmm14 + movddup 24 * SIZE(BO), %xmm15 + + prefetchnta 4 * SIZE(CO1) + pxor %xmm4, %xmm4 + prefetchnta 4 * SIZE(CO2) + pxor %xmm5, %xmm5 + prefetchnta 4 * SIZE(CO1, LDC, 2) + pxor %xmm6, %xmm6 + prefetchnta 4 * SIZE(CO2, LDC, 2) + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + +#if 1 + andq $-8, %rax + salq $4, %rax + je .L15 +.L1X: + KERNEL1 (16 * 0) + KERNEL2 (16 * 0) + KERNEL3 (16 * 0) + KERNEL4 (16 * 0) + KERNEL5 (16 * 0) + KERNEL6 (16 * 0) + KERNEL7 (16 * 0) + KERNEL8 (16 * 0) + KERNEL9 (16 * 0) + KERNEL10(16 * 0) + KERNEL11(16 * 0) + KERNEL12(16 * 0) + KERNEL13(16 * 0) + KERNEL14(16 * 0) + KERNEL15(16 * 0) + KERNEL16(16 * 0) + cmpq $128 * 1, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 1) + KERNEL2 (16 * 1) + KERNEL3 (16 * 1) + KERNEL4 (16 * 1) + KERNEL5 (16 * 1) + KERNEL6 (16 * 1) + KERNEL7 (16 * 1) + KERNEL8 (16 * 1) + KERNEL9 (16 * 1) + KERNEL10(16 * 1) + KERNEL11(16 * 1) + KERNEL12(16 * 1) + KERNEL13(16 * 1) + KERNEL14(16 * 1) + KERNEL15(16 * 1) + KERNEL16(16 * 1) + cmpq $128 * 2, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 2) + KERNEL2 (16 * 2) + KERNEL3 (16 * 2) + KERNEL4 (16 * 2) + KERNEL5 (16 * 2) + KERNEL6 (16 * 2) + KERNEL7 (16 * 2) + KERNEL8 (16 * 2) + KERNEL9 (16 * 2) + KERNEL10(16 * 2) + KERNEL11(16 * 2) + KERNEL12(16 * 2) + KERNEL13(16 * 2) + KERNEL14(16 * 2) + KERNEL15(16 * 2) + KERNEL16(16 * 2) + cmpq $128 * 3, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 3) + KERNEL2 (16 * 3) + KERNEL3 (16 * 3) + KERNEL4 (16 * 3) + KERNEL5 (16 * 3) + KERNEL6 (16 * 3) + KERNEL7 (16 * 3) + KERNEL8 (16 * 3) + KERNEL9 (16 * 3) + KERNEL10(16 * 3) + KERNEL11(16 * 3) + KERNEL12(16 * 3) + KERNEL13(16 * 3) + KERNEL14(16 * 3) + KERNEL15(16 * 3) + KERNEL16(16 * 3) + cmpq $128 * 4, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 4) + KERNEL2 (16 * 4) + KERNEL3 (16 * 4) + KERNEL4 (16 * 4) + KERNEL5 (16 * 4) + KERNEL6 (16 * 4) + KERNEL7 (16 * 4) + KERNEL8 (16 * 4) + KERNEL9 (16 * 4) + KERNEL10(16 * 4) + KERNEL11(16 * 4) + KERNEL12(16 * 4) + KERNEL13(16 * 4) + KERNEL14(16 * 4) + KERNEL15(16 * 4) + KERNEL16(16 * 4) + cmpq $128 * 5, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 5) + KERNEL2 (16 * 5) + KERNEL3 (16 * 5) + KERNEL4 (16 * 5) + KERNEL5 (16 * 5) + KERNEL6 (16 * 5) + KERNEL7 (16 * 5) + KERNEL8 (16 * 5) + KERNEL9 (16 * 5) + KERNEL10(16 * 5) + KERNEL11(16 * 5) + KERNEL12(16 * 5) + KERNEL13(16 * 5) + KERNEL14(16 * 5) + KERNEL15(16 * 5) + KERNEL16(16 * 5) + cmpq $128 * 6, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 6) + KERNEL2 (16 * 6) + KERNEL3 (16 * 6) + KERNEL4 (16 * 6) + KERNEL5 (16 * 6) + KERNEL6 (16 * 6) + KERNEL7 (16 * 6) + KERNEL8 (16 * 6) + KERNEL9 (16 * 6) + KERNEL10(16 * 6) + KERNEL11(16 * 6) + KERNEL12(16 * 6) + KERNEL13(16 * 6) + KERNEL14(16 * 6) + KERNEL15(16 * 6) + KERNEL16(16 * 6) + cmpq $128 * 7, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 7) + KERNEL2 (16 * 7) + KERNEL3 (16 * 7) + KERNEL4 (16 * 7) + KERNEL5 (16 * 7) + KERNEL6 (16 * 7) + KERNEL7 (16 * 7) + KERNEL8 (16 * 7) + KERNEL9 (16 * 7) + KERNEL10(16 * 7) + KERNEL11(16 * 7) + KERNEL12(16 * 7) + KERNEL13(16 * 7) + KERNEL14(16 * 7) + KERNEL15(16 * 7) + KERNEL16(16 * 7) + + addq $32 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $128 * 8, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 2), BO # * 64 + +#else + sarq $3, %rax + je .L15 + ALIGN_4 + +.L12: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm5 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm6 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm7 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm5 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm6 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm7 + + movddup 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + + movddup 8 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm7 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm7 + movddup 40 * SIZE(BO), %xmm11 + + mulpd %xmm12, %xmm13 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm13, %xmm0 + movddup 17 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm1 + movddup 18 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm2 + movddup 19 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 18 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm3 + + movddup 16 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm4 + movddup 17 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm5 + movddup 18 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm6 + movddup 19 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 20 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm7 + + movddup 20 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm0 + movddup 21 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm1 + movddup 22 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm2 + movddup 23 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 22 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm3 + + movddup 20 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm4 + movddup 21 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm5 + movddup 22 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm6 + movddup 23 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 48 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm7 + movddup 48 * SIZE(BO), %xmm13 + + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm0 + movddup 25 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm1 + movddup 26 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm2 + movddup 27 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 26 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm3 + + movddup 24 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm4 + movddup 25 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm5 + movddup 26 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm6 + movddup 27 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 28 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm7 + + movddup 28 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm0 + movddup 29 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm1 + movddup 30 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm2 + movddup 31 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 30 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm3 + + movddup 28 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm4 + movddup 29 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm5 + movddup 30 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm6 + movddup 31 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 56 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm7 + movddup 56 * SIZE(BO), %xmm15 + + addq $32 * SIZE, BO + addq $32 * SIZE, AO + decq %rax + BRANCH + jne .L12 +#endif + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L19 + ALIGN_4 + +.L16: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 2 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 3 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm7 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L16 + ALIGN_4 + +.L19: + +#if defined(LN) || defined(RT) + movq KK, %rax + subq $4, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm10 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm10 + + movapd %xmm4, %xmm12 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm12 + + movapd %xmm6, %xmm14 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm14 + + movapd 0 * SIZE(BO), %xmm1 + movapd 2 * SIZE(BO), %xmm3 + movapd 4 * SIZE(BO), %xmm5 + movapd 6 * SIZE(BO), %xmm7 + movapd 8 * SIZE(BO), %xmm9 + movapd 10 * SIZE(BO), %xmm11 + movapd 12 * SIZE(BO), %xmm13 + movapd 14 * SIZE(BO), %xmm15 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm8, %xmm5 + subpd %xmm10, %xmm7 + subpd %xmm4, %xmm9 + subpd %xmm6, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#else + + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 + + movapd 8 * SIZE(AO), %xmm12 + movapd 10 * SIZE(AO), %xmm13 + movapd 12 * SIZE(AO), %xmm14 + movapd 14 * SIZE(AO), %xmm15 + + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm1, %xmm10 + subpd %xmm5, %xmm11 + subpd %xmm2, %xmm12 + subpd %xmm6, %xmm13 + subpd %xmm3, %xmm14 + subpd %xmm7, %xmm15 +#endif + + +#ifdef LN + movddup 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm15 + + movddup 14 * SIZE(AO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + movddup 14 * SIZE(AO), %xmm2 + mulpd %xmm15, %xmm2 + subpd %xmm2, %xmm11 + + movddup 13 * SIZE(AO), %xmm4 + mulpd %xmm13, %xmm4 + subpd %xmm4, %xmm5 + movddup 13 * SIZE(AO), %xmm4 + mulpd %xmm15, %xmm4 + subpd %xmm4, %xmm7 + + movddup 12 * SIZE(AO), %xmm6 + mulpd %xmm13, %xmm6 + subpd %xmm6, %xmm1 + movddup 12 * SIZE(AO), %xmm6 + mulpd %xmm15, %xmm6 + subpd %xmm6, %xmm3 + + movddup 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + mulpd %xmm0, %xmm11 + + movddup 9 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm5 + movddup 9 * SIZE(AO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm7 + + movddup 8 * SIZE(AO), %xmm4 + mulpd %xmm9, %xmm4 + subpd %xmm4, %xmm1 + movddup 8 * SIZE(AO), %xmm4 + mulpd %xmm11, %xmm4 + subpd %xmm4, %xmm3 + + movddup 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movddup 4 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + movddup 4 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm3 + + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 + + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm3, %xmm2 + subpd %xmm2, %xmm7 + + movddup 2 * SIZE(AO), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm9 + movddup 2 * SIZE(AO), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm11 + + movddup 3 * SIZE(AO), %xmm6 + mulpd %xmm1, %xmm6 + subpd %xmm6, %xmm13 + movddup 3 * SIZE(AO), %xmm6 + mulpd %xmm3, %xmm6 + subpd %xmm6, %xmm15 + + movddup 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movddup 6 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm9 + movddup 6 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm11 + + movddup 7 * SIZE(AO), %xmm4 + mulpd %xmm5, %xmm4 + subpd %xmm4, %xmm13 + movddup 7 * SIZE(AO), %xmm4 + mulpd %xmm7, %xmm4 + subpd %xmm4, %xmm15 + + movddup 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + mulpd %xmm0, %xmm11 + + movddup 11 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + movddup 11 * SIZE(AO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm15 + + movddup 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm15 +#endif + + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm9, %xmm1 + subpd %xmm1, %xmm11 + + movddup 2 * SIZE(BO), %xmm2 + mulpd %xmm8, %xmm2 + subpd %xmm2, %xmm12 + movddup 2 * SIZE(BO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + + movddup 3 * SIZE(BO), %xmm3 + mulpd %xmm8, %xmm3 + subpd %xmm3, %xmm14 + movddup 3 * SIZE(BO), %xmm3 + mulpd %xmm9, %xmm3 + subpd %xmm3, %xmm15 + + movddup 5 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movddup 6 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm12 + movddup 6 * SIZE(BO), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm13 + + movddup 7 * SIZE(BO), %xmm2 + mulpd %xmm10, %xmm2 + subpd %xmm2, %xmm14 + movddup 7 * SIZE(BO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm15 + + movddup 10 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + + movddup 11 * SIZE(BO), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm14 + movddup 11 * SIZE(BO), %xmm1 + mulpd %xmm13, %xmm1 + subpd %xmm1, %xmm15 + + movddup 15 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 +#endif + +#ifdef RT + movddup 15 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 + + movddup 14 * SIZE(BO), %xmm1 + mulpd %xmm14, %xmm1 + subpd %xmm1, %xmm12 + movddup 14 * SIZE(BO), %xmm1 + mulpd %xmm15, %xmm1 + subpd %xmm1, %xmm13 + + movddup 13 * SIZE(BO), %xmm2 + mulpd %xmm14, %xmm2 + subpd %xmm2, %xmm10 + movddup 13 * SIZE(BO), %xmm2 + mulpd %xmm15, %xmm2 + subpd %xmm2, %xmm11 + + movddup 12 * SIZE(BO), %xmm3 + mulpd %xmm14, %xmm3 + subpd %xmm3, %xmm8 + movddup 12 * SIZE(BO), %xmm3 + mulpd %xmm15, %xmm3 + subpd %xmm3, %xmm9 + + movddup 10 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + + movddup 9 * SIZE(BO), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm10 + movddup 9 * SIZE(BO), %xmm1 + mulpd %xmm13, %xmm1 + subpd %xmm1, %xmm11 + + movddup 8 * SIZE(BO), %xmm2 + mulpd %xmm12, %xmm2 + subpd %xmm2, %xmm8 + movddup 8 * SIZE(BO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + + movddup 5 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movddup 4 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + movddup 4 * SIZE(BO), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm9 + + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movsd %xmm13, 3 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + movhpd %xmm9, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) + + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movsd %xmm11, 2 * SIZE(CO1, LDC, 2) + movsd %xmm15, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm11, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) + + movsd %xmm12, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) + movsd %xmm13, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm13, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm14, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) + movsd %xmm15, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(BO) + movapd %xmm3, 2 * SIZE(BO) + movapd %xmm5, 4 * SIZE(BO) + movapd %xmm7, 6 * SIZE(BO) + movapd %xmm9, 8 * SIZE(BO) + movapd %xmm11, 10 * SIZE(BO) + movapd %xmm13, 12 * SIZE(BO) + movapd %xmm15, 14 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) + movapd %xmm12, 8 * SIZE(AO) + movapd %xmm13, 10 * SIZE(AO) + movapd %xmm14, 12 * SIZE(AO) + movapd %xmm15, 14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + je .L30 + ALIGN_4 + +.L21: + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 17 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm1 + movddup 18 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm2 + movddup 19 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm3 + movddup 20 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 21 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm1 + movddup 22 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm2 + movddup 23 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm3 + movddup 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 25 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 26 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 27 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 28 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 29 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 30 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 31 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 40 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L29 + ALIGN_4 + +.L26: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L29: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm10 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm10 + + movapd 0 * SIZE(BO), %xmm1 + movapd 2 * SIZE(BO), %xmm3 + movapd 4 * SIZE(BO), %xmm5 + movapd 6 * SIZE(BO), %xmm7 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm8, %xmm5 + subpd %xmm10, %xmm7 +#else + + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm10 + movapd 4 * SIZE(AO), %xmm12 + movapd 6 * SIZE(AO), %xmm14 + + subpd %xmm0, %xmm8 + subpd %xmm1, %xmm10 + subpd %xmm2, %xmm12 + subpd %xmm3, %xmm14 +#endif + +#ifdef LN + movddup 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movddup 2 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + movddup 2 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm3 + + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 + + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm3, %xmm2 + subpd %xmm2, %xmm7 + + movddup 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movddup 2 * SIZE(BO), %xmm2 + mulpd %xmm8, %xmm2 + subpd %xmm2, %xmm12 + movddup 3 * SIZE(BO), %xmm3 + mulpd %xmm8, %xmm3 + subpd %xmm3, %xmm14 + + movddup 5 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + movddup 6 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm12 + movddup 7 * SIZE(BO), %xmm2 + mulpd %xmm10, %xmm2 + subpd %xmm2, %xmm14 + + movddup 10 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm12 + + movddup 11 * SIZE(BO), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm14 + + movddup 15 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm14 +#endif + +#ifdef RT + movddup 15 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm14 + + movddup 14 * SIZE(BO), %xmm1 + mulpd %xmm14, %xmm1 + subpd %xmm1, %xmm12 + movddup 13 * SIZE(BO), %xmm2 + mulpd %xmm14, %xmm2 + subpd %xmm2, %xmm10 + movddup 12 * SIZE(BO), %xmm3 + mulpd %xmm14, %xmm3 + subpd %xmm3, %xmm8 + + movddup 10 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm12 + movddup 9 * SIZE(BO), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm10 + movddup 8 * SIZE(BO), %xmm2 + mulpd %xmm12, %xmm2 + subpd %xmm2, %xmm8 + + movddup 5 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + movddup 4 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + + movsd %xmm12, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) + movsd %xmm14, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(BO) + movapd %xmm3, 2 * SIZE(BO) + movapd %xmm5, 4 * SIZE(BO) + movapd %xmm7, 6 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm10, 2 * SIZE(AO) + movapd %xmm12, 4 * SIZE(AO) + movapd %xmm14, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + je .L39 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movddup 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movapd 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movddup 3 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movapd 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movapd 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movddup 8 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movapd 24 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movapd 18 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movddup 5 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movapd 20 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movapd 22 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movddup 6 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movapd 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movapd 26 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movddup 7 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movapd 28 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movapd 30 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movddup 12 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movapd 40 * SIZE(BO), %xmm11 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm2 + movapd 2 * SIZE(BO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#else + movapd 0 * SIZE(AO), %xmm2 + movapd 2 * SIZE(AO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#endif + +#ifdef LN + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 0 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(BO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + movsd 2 * SIZE(BO), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + movsd 3 * SIZE(BO), %xmm7 + mulsd %xmm2, %xmm7 + subsd %xmm7, %xmm1 + + movsd 5 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 6 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm3 + movsd 7 * SIZE(BO), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + + movsd 10 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 11 * SIZE(BO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + + movsd 15 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm1 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef RT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 15 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 14 * SIZE(BO), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + movsd 13 * SIZE(BO), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + movsd 12 * SIZE(BO), %xmm7 + mulsd %xmm1, %xmm7 + subsd %xmm7, %xmm2 + + movsd 10 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 9 * SIZE(BO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm0 + movsd 8 * SIZE(BO), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + + movsd 5 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 4 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 + +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BO) + movapd %xmm3, 2 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) + movapd %xmm3, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq J # j -- + jg .L10 + ALIGN_4 + + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_RT_4x8_nehalem.S b/kernel/x86_64/trsm_kernel_RT_4x8_nehalem.S new file mode 100644 index 0000000..ffac798 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_RT_4x8_nehalem.S @@ -0,0 +1,4847 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCHSIZE (16 * 1 + 4) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + leaq (, LDC, SIZE), LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, N + jle .L40 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I + NOBRANCH + jle .L110 + ALIGN_4 + +.L101: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm3 + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L105 + ALIGN_3 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -31 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -29 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -28 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $ -4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L102 + ALIGN_3 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L108 + ALIGN_3 + +.L106: + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -31 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L106 + ALIGN_3 + +.L108: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + + addps %xmm1, %xmm8 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BO), %xmm0 + movhps -30 * SIZE(BO), %xmm0 + + subps %xmm8, %xmm0 + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 +#else + movaps -32 * SIZE(AO), %xmm0 + + subps %xmm8, %xmm0 +#endif + +#ifdef LN + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm0 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm0 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm3 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm3 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm3 + + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm1, %xmm0 + unpcklps %xmm3, %xmm2 + + movlps %xmm0, -32 * SIZE(BO) + movlps %xmm2, -30 * SIZE(BO) + + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm2, 2 * SIZE(CO1) +#else + movaps %xmm0, -32 * SIZE(AO) + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L101 + ALIGN_4 + +.L110: + testq $2, M + BRANCH + jle .L120 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L115 + ALIGN_3 + +.L112: + addps %xmm1, %xmm8 + movss -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -31 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -30 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -29 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -24 * SIZE(AO), %xmm0 + + subq $-4 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L112 + ALIGN_3 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_3 + +.L116: + addps %xmm1, %xmm8 + movss -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L116 + ALIGN_3 + +.L118: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + addps %xmm1, %xmm8 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BO), %xmm0 + + subps %xmm8, %xmm0 + + pshufd $0x55, %xmm0, %xmm1 +#else + movsd -32 * SIZE(AO), %xmm0 + + subps %xmm8, %xmm0 +#endif + +#ifdef LN + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm1 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm1, %xmm0 + + movlps %xmm0, -32 * SIZE(BO) + + movlps %xmm0, 0 * SIZE(CO1) +#else + movlps %xmm0, -32 * SIZE(AO) + + movlps %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L120: + testq $1, M + BRANCH + jle .L129 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm2, %xmm2 + movss -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L125 + ALIGN_3 + +.L122: + addss %xmm2, %xmm8 + movss -32 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -31 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -31 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -30 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -30 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -29 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -29 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -28 * SIZE(AO), %xmm0 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L122 + ALIGN_3 + +.L125: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L128 + ALIGN_3 + +.L126: + addss %xmm2, %xmm8 + movss -32 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -31 * SIZE(AO), %xmm0 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L126 + ALIGN_3 + +.L128: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + addss %xmm2, %xmm8 + +#if defined(LN) || defined(LT) + movss -32 * SIZE(BO), %xmm0 + + subss %xmm8, %xmm0 +#else + movss -32 * SIZE(AO), %xmm0 + + subss %xmm8, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss -32 * SIZE(AO), %xmm8 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BO), %xmm8 +#endif + + mulss %xmm8, %xmm0 + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm0, -32 * SIZE(BO) +#else + movss %xmm0, -32 * SIZE(AO) +#endif + + movss %xmm0, (CO1) + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L129: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L40: + testq $2, N + jle .L70 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I + NOBRANCH + jle .L80 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd -32 * SIZE(BO), %xmm3 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO2) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_3 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -28 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -26 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -24 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L72 + ALIGN_3 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_3 + +.L78: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm4 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm2 + + subps %xmm8, %xmm0 + subps %xmm4, %xmm2 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 +#else + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm2 + + subps %xmm8, %xmm0 + subps %xmm9, %xmm2 +#endif + + +#ifdef LN + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm2 +#endif + +#ifdef RT + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm0, -32 * SIZE(BO) + movlps %xmm1, -30 * SIZE(BO) + movlps %xmm2, -28 * SIZE(BO) + movlps %xmm3, -26 * SIZE(BO) + + unpcklps %xmm1, %xmm0 + unpcklps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm2, 2 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + +#else + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm2, -28 * SIZE(AO) + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L71 + ALIGN_4 + +.L80: + testq $2, M + BRANCH + jle .L90 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd -32 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L85 + ALIGN_3 + +.L82: + addps %xmm1, %xmm8 + movsd -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -30 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -28 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -26 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -24 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L82 + ALIGN_3 + +.L85: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_3 + +.L86: + addps %xmm1, %xmm8 + movsd -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L86 + ALIGN_3 + +.L88: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + addps %xmm1, %xmm8 + +#if defined(LN) || defined(LT) + pshufd $0xd8, %xmm8, %xmm8 + + movaps -32 * SIZE(BO), %xmm0 +#else + movaps -32 * SIZE(AO), %xmm0 +#endif + + subps %xmm8, %xmm0 + + movhlps %xmm0, %xmm1 + +#ifdef LN + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm0, -32 * SIZE(BO) + movlps %xmm1, -30 * SIZE(BO) + + unpcklps %xmm1, %xmm0 + + movlps %xmm0, (CO1) + movhps %xmm0, (CO2) +#else + movlps %xmm0, -32 * SIZE(AO) + movlps %xmm1, -30 * SIZE(AO) + + movsd %xmm0, (CO1) + movsd %xmm1, (CO2) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L90: + testq $1, M + BRANCH + jle .L99 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L95 + ALIGN_3 + +.L92: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movsd -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movsd -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movsd -26 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L92 + addps %xmm9, %xmm8 + ALIGN_3 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_3 + +.L96: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movsd -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L96 + ALIGN_3 + +.L98: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + addps %xmm2, %xmm8 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BO), %xmm0 + + subps %xmm8, %xmm0 +#else + movsd -32 * SIZE(AO), %xmm0 + + subps %xmm8, %xmm0 +#endif + + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 + +#if defined(LN) || defined(LT) + movss -32 * SIZE(AO), %xmm8 + + mulss %xmm8, %xmm0 + mulss %xmm8, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movss %xmm0, -32 * SIZE(BO) + movss %xmm1, -31 * SIZE(BO) +#else + movss %xmm0, -32 * SIZE(AO) + movss %xmm1, -31 * SIZE(AO) +#endif + + movss %xmm0, (CO1) + movss %xmm1, (CO2) + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L70: + testq $4, N + jle .L100 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I + NOBRANCH + jle .L50 + ALIGN_4 + +.L41: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht2 4 * SIZE(CO2) + xorps %xmm11, %xmm11 + prefetcht2 4 * SIZE(CO2, LDC, 1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + addps %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_3 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + addps %xmm3, %xmm10 + addps %xmm4, %xmm11 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + shufps $0x88, %xmm9, %xmm8 + movaps %xmm10, %xmm5 + shufps $0x88, %xmm11, %xmm10 + shufps $0xdd, %xmm11, %xmm4 + shufps $0xdd, %xmm9, %xmm5 + + movaps %xmm8, %xmm6 + shufps $0x88, %xmm10, %xmm8 + shufps $0xdd, %xmm6, %xmm10 + + movaps %xmm4, %xmm9 + movaps %xmm5, %xmm11 + shufps $0x22, %xmm5, %xmm9 + shufps $0x77, %xmm4, %xmm11 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm1 + movaps -24 * SIZE(BO), %xmm2 + movaps -20 * SIZE(BO), %xmm3 +#else + movaps %xmm9, %xmm4 + shufps $0xd8, %xmm8, %xmm9 + shufps $0xd8, %xmm11, %xmm8 + shufps $0xd8, %xmm10, %xmm11 + shufps $0xd8, %xmm4, %xmm10 + + movaps %xmm8, %xmm4 + shufps $0xd8, %xmm10, %xmm8 + shufps $0xd8, %xmm4, %xmm10 + movaps %xmm9, %xmm5 + shufps $0xd8, %xmm11, %xmm9 + shufps $0xd8, %xmm5, %xmm11 + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + movaps -24 * SIZE(AO), %xmm2 + movaps -20 * SIZE(AO), %xmm3 +#endif + + subps %xmm8, %xmm0 + subps %xmm9, %xmm1 + subps %xmm10, %xmm2 + subps %xmm11, %xmm3 + +#ifdef LN + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm1, -28 * SIZE(BO) + movaps %xmm2, -24 * SIZE(BO) + movaps %xmm3, -20 * SIZE(BO) + + movaps %xmm0, %xmm8 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm8, %xmm1 + + movaps %xmm2, %xmm9 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm9, %xmm3 + + movaps %xmm0, %xmm8 + shufps $0x88, %xmm2, %xmm0 + movaps %xmm1, %xmm9 + shufps $0x22, %xmm3, %xmm1 + shufps $0xdd, %xmm2, %xmm8 + movaps %xmm8, %xmm2 + shufps $0x77, %xmm3, %xmm9 + movaps %xmm9, %xmm3 +#else + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm1, -28 * SIZE(AO) + movaps %xmm2, -24 * SIZE(AO) + movaps %xmm3, -20 * SIZE(AO) +#endif + + leaq (LDC, LDC, 2), %rax + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + movsd %xmm3, 0 * SIZE(CO2, LDC, 1) + movhps %xmm3, 2 * SIZE(CO2, LDC, 1) + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L41 + ALIGN_4 + +.L50: + testq $2, M + BRANCH + jle .L60 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -16 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -24 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L52 + ALIGN_3 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_3 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + shufps $0x88, %xmm9, %xmm8 + shufps $0xdd, %xmm9, %xmm4 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm1 + + subps %xmm8, %xmm0 + subps %xmm4, %xmm1 +#else + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm2 + + subps %xmm8, %xmm0 + subps %xmm9, %xmm2 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 +#endif + +#ifdef LN + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm1, -28 * SIZE(BO) + + movaps %xmm0, %xmm4 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm4 + + movsd %xmm0, (CO1) + movhps %xmm0, (CO1, LDC, 1) + movsd %xmm4, (CO2) + movhps %xmm4, (CO2, LDC, 1) +#else + movlhps %xmm1, %xmm0 + movlhps %xmm3, %xmm2 + + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm2, -28 * SIZE(AO) + + movsd %xmm0, (CO1) + movsd %xmm1, (CO1, LDC, 1) + movsd %xmm2, (CO2) + movsd %xmm3, (CO2, LDC, 1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L69 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movaps -20 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + subq $-16 * SIZE, BO + subq $ -4 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L62 + addps %xmm9, %xmm8 + ALIGN_3 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_3 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + addps %xmm2, %xmm8 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm0 + + subps %xmm8, %xmm0 +#else + movsd -32 * SIZE(AO), %xmm0 + movhps -30 * SIZE(AO), %xmm0 + + subps %xmm8, %xmm0 + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm3 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm0 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 +#else + unpcklps %xmm1, %xmm0 + unpcklps %xmm3, %xmm2 + + movlps %xmm0, -32 * SIZE(AO) + movlps %xmm2, -30 * SIZE(AO) +#endif + + movss %xmm0, (CO1) + movss %xmm1, (CO1, LDC, 1) + movss %xmm2, (CO2) + movss %xmm3, (CO2, LDC, 1) + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L69: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + ALIGN_4 + +.L100: + movq N, J + sarq $3, J + NOBRANCH + jle .L999 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 8), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 4), CO2 +#ifndef RT + leaq (C, LDC, 8), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 3, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + prefetchnta -32 * SIZE(BB) + subq $-16 * SIZE, BB + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + leaq (LDC, LDC, 2), %rax + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht2 4 * SIZE(CO1, LDC, 2) + xorps %xmm11, %xmm11 + prefetcht2 4 * SIZE(CO1, %rax, 1) + + xorps %xmm12, %xmm12 + prefetcht2 4 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht2 4 * SIZE(CO2, LDC, 1) + xorps %xmm14, %xmm14 + prefetcht2 4 * SIZE(CO2, LDC, 2) + xorps %xmm15, %xmm15 + prefetcht2 4 * SIZE(CO2, %rax, 1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm12 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + subq $-32 * SIZE, BO + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addps %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 8), BO +#endif + + addps %xmm1, %xmm12 + addps %xmm2, %xmm13 + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + shufps $0x88, %xmm9, %xmm8 + movaps %xmm10, %xmm5 + shufps $0x88, %xmm11, %xmm10 + shufps $0xdd, %xmm11, %xmm4 + shufps $0xdd, %xmm9, %xmm5 + + movaps %xmm8, %xmm6 + shufps $0x88, %xmm10, %xmm8 + shufps $0xdd, %xmm6, %xmm10 + + movaps %xmm4, %xmm9 + movaps %xmm5, %xmm11 + shufps $0x22, %xmm5, %xmm9 + shufps $0x77, %xmm4, %xmm11 + + movaps %xmm12, %xmm4 + shufps $0x88, %xmm13, %xmm12 + movaps %xmm14, %xmm5 + shufps $0x88, %xmm15, %xmm14 + shufps $0xdd, %xmm15, %xmm4 + shufps $0xdd, %xmm13, %xmm5 + + movaps %xmm12, %xmm6 + shufps $0x88, %xmm14, %xmm12 + shufps $0xdd, %xmm6, %xmm14 + + movaps %xmm4, %xmm13 + movaps %xmm5, %xmm15 + shufps $0x22, %xmm5, %xmm13 + shufps $0x77, %xmm4, %xmm15 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm4 + movaps -24 * SIZE(BO), %xmm1 + movaps -20 * SIZE(BO), %xmm5 + movaps -16 * SIZE(BO), %xmm2 + movaps -12 * SIZE(BO), %xmm6 + movaps -8 * SIZE(BO), %xmm3 + movaps -4 * SIZE(BO), %xmm7 + +#else + movaps %xmm9, %xmm4 + shufps $0xd8, %xmm8, %xmm9 + shufps $0xd8, %xmm11, %xmm8 + shufps $0xd8, %xmm10, %xmm11 + shufps $0xd8, %xmm4, %xmm10 + + movaps %xmm8, %xmm4 + shufps $0xd8, %xmm10, %xmm8 + shufps $0xd8, %xmm4, %xmm10 + movaps %xmm9, %xmm5 + shufps $0xd8, %xmm11, %xmm9 + shufps $0xd8, %xmm5, %xmm11 + + movaps %xmm13, %xmm4 + shufps $0xd8, %xmm12, %xmm13 + shufps $0xd8, %xmm15, %xmm12 + shufps $0xd8, %xmm14, %xmm15 + shufps $0xd8, %xmm4, %xmm14 + + movaps %xmm12, %xmm4 + shufps $0xd8, %xmm14, %xmm12 + shufps $0xd8, %xmm4, %xmm14 + movaps %xmm13, %xmm5 + shufps $0xd8, %xmm15, %xmm13 + shufps $0xd8, %xmm5, %xmm15 + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + movaps -24 * SIZE(AO), %xmm2 + movaps -20 * SIZE(AO), %xmm3 + movaps -16 * SIZE(AO), %xmm4 + movaps -12 * SIZE(AO), %xmm5 + movaps -8 * SIZE(AO), %xmm6 + movaps -4 * SIZE(AO), %xmm7 +#endif + + subps %xmm8, %xmm0 + subps %xmm9, %xmm1 + subps %xmm10, %xmm2 + subps %xmm11, %xmm3 + subps %xmm12, %xmm4 + subps %xmm13, %xmm5 + subps %xmm14, %xmm6 + subps %xmm15, %xmm7 + +#ifdef LN + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + mulps %xmm15, %xmm7 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm4 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + mulps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm4 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm4 + + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm7 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm7 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + mulps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm7 + + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + mulps %xmm15, %xmm7 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm7 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm7 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -12 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm7 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + + movaps -4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm7 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm7 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm7 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm7 + + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm7 +#endif + +#ifdef RT + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm7 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm4 + + movaps 24 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm0 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm4 + + movaps 16 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm0 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm4 + + movaps 8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm0 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm4 + + movaps 0 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm0 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm4, -28 * SIZE(BO) + movaps %xmm1, -24 * SIZE(BO) + movaps %xmm5, -20 * SIZE(BO) + movaps %xmm2, -16 * SIZE(BO) + movaps %xmm6, -12 * SIZE(BO) + movaps %xmm3, -8 * SIZE(BO) + movaps %xmm7, -4 * SIZE(BO) + + movaps %xmm0, %xmm8 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm8, %xmm1 + + movaps %xmm2, %xmm9 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm9, %xmm3 + + movaps %xmm0, %xmm8 + shufps $0x88, %xmm2, %xmm0 + movaps %xmm1, %xmm9 + shufps $0x22, %xmm3, %xmm1 + shufps $0xdd, %xmm2, %xmm8 + movaps %xmm8, %xmm2 + shufps $0x77, %xmm3, %xmm9 + movaps %xmm9, %xmm3 + + movaps %xmm4, %xmm8 + shufps $0x88, %xmm5, %xmm4 + shufps $0xdd, %xmm8, %xmm5 + + movaps %xmm6, %xmm9 + shufps $0x88, %xmm7, %xmm6 + shufps $0xdd, %xmm9, %xmm7 + + movaps %xmm4, %xmm8 + shufps $0x88, %xmm6, %xmm4 + movaps %xmm5, %xmm9 + shufps $0x22, %xmm7, %xmm5 + shufps $0xdd, %xmm6, %xmm8 + movaps %xmm8, %xmm6 + shufps $0x77, %xmm7, %xmm9 + movaps %xmm9, %xmm7 + +#else + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm1, -28 * SIZE(AO) + movaps %xmm2, -24 * SIZE(AO) + movaps %xmm3, -20 * SIZE(AO) + movaps %xmm4, -16 * SIZE(AO) + movaps %xmm5, -12 * SIZE(AO) + movaps %xmm6, -8 * SIZE(AO) + movaps %xmm7, -4 * SIZE(AO) +#endif + + leaq (LDC, LDC, 2), %rax + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 2 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %rax, 1) + movhps %xmm3, 2 * SIZE(CO1, %rax, 1) + + movsd %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movsd %xmm5, 0 * SIZE(CO2, LDC, 1) + movhps %xmm5, 2 * SIZE(CO2, LDC, 1) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhps %xmm6, 2 * SIZE(CO2, LDC, 2) + movsd %xmm7, 0 * SIZE(CO2, %rax, 1) + movhps %xmm7, 2 * SIZE(CO2, %rax, 1) + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + jle .L30 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(BO), %xmm5 + + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -8 * SIZE(BO), %xmm5 + + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps 0 * SIZE(BO), %xmm5 + + movddup -24 * SIZE(AO), %xmm0 + + subq $-32 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 8), BO +#endif + + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + addps %xmm3, %xmm10 + addps %xmm4, %xmm11 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + shufps $0x88, %xmm9, %xmm8 + shufps $0xdd, %xmm9, %xmm4 + + movaps %xmm10, %xmm5 + shufps $0x88, %xmm11, %xmm10 + shufps $0xdd, %xmm11, %xmm5 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm2 + movaps -24 * SIZE(BO), %xmm1 + movaps -20 * SIZE(BO), %xmm3 + + subps %xmm8, %xmm0 + subps %xmm4, %xmm1 + subps %xmm10, %xmm2 + subps %xmm5, %xmm3 +#else + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm2 + movaps -24 * SIZE(AO), %xmm4 + movaps -20 * SIZE(AO), %xmm6 + + subps %xmm8, %xmm0 + subps %xmm9, %xmm2 + subps %xmm10, %xmm4 + subps %xmm11, %xmm6 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 + movhlps %xmm4, %xmm5 + movhlps %xmm6, %xmm7 +#endif + +#ifdef LN + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm3 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm2 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm2 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm7 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm7 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -12 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm7 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + + movaps -4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm7 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm7 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm7 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm7 + + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm7 +#endif + +#ifdef RT + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm7 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm4 + + movaps 24 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm0 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm4 + + movaps 16 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm0 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm4 + + movaps 8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm0 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm4 + + movaps 0 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm0 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm2, -28 * SIZE(BO) + movaps %xmm1, -24 * SIZE(BO) + movaps %xmm3, -20 * SIZE(BO) + + movaps %xmm0, %xmm4 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm4 + + movaps %xmm2, %xmm5 + unpcklps %xmm3, %xmm2 + unpckhps %xmm3, %xmm5 + + movsd %xmm0, (CO1) + movhps %xmm0, (CO1, LDC, 1) + movsd %xmm4, (CO1, LDC, 2) + movhps %xmm4, (CO1, %rax, 1) + + movsd %xmm2, (CO2) + movhps %xmm2, (CO2, LDC, 1) + movsd %xmm5, (CO2, LDC, 2) + movhps %xmm5, (CO2, %rax, 1) +#else + movlhps %xmm1, %xmm0 + movlhps %xmm3, %xmm2 + movlhps %xmm5, %xmm4 + movlhps %xmm7, %xmm6 + + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm2, -28 * SIZE(AO) + movaps %xmm4, -24 * SIZE(AO) + movaps %xmm6, -20 * SIZE(AO) + + movsd %xmm0, (CO1) + movsd %xmm1, (CO1, LDC, 1) + movsd %xmm2, (CO1, LDC, 2) + movsd %xmm3, (CO1, %rax, 1) + + movsd %xmm4, (CO2) + movsd %xmm5, (CO2, LDC, 1) + movsd %xmm6, (CO2, LDC, 2) + movsd %xmm7, (CO2, %rax, 1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + BRANCH + jle .L39 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + xorps %xmm8, %xmm8 + xorps %xmm12, %xmm12 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -28 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -20 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -16 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -12 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -4 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + subq $-32 * SIZE, BO + subq $ -4 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -28 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 8), BO +#endif + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm4 + + subps %xmm8, %xmm0 + subps %xmm12, %xmm4 +#else + movsd -32 * SIZE(AO), %xmm0 + movhps -30 * SIZE(AO), %xmm0 + movsd -28 * SIZE(AO), %xmm4 + movhps -26 * SIZE(AO), %xmm4 + + subps %xmm8, %xmm0 + subps %xmm12, %xmm4 + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 + + pshufd $0xff, %xmm4, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + pshufd $0x55, %xmm4, %xmm5 + pshufd $0x00, %xmm4, %xmm4 +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm7 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm7 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm3 + + movaps -12 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm7 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 + + movaps -4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm7 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm7 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm7 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm7 + + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm7 +#endif + +#ifdef RT + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm7 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm4 + + movaps 24 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm0 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm4 + + movaps 16 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm0 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm4 + + movaps 8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm0 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm4 + + movaps 0 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm0 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm0 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm4, -28 * SIZE(BO) + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 + + pshufd $0xff, %xmm4, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + pshufd $0x55, %xmm4, %xmm5 + pshufd $0x00, %xmm4, %xmm4 +#else + unpcklps %xmm1, %xmm0 + unpcklps %xmm3, %xmm2 + unpcklps %xmm5, %xmm4 + unpcklps %xmm7, %xmm6 + + movlps %xmm0, -32 * SIZE(AO) + movlps %xmm2, -30 * SIZE(AO) + movlps %xmm4, -28 * SIZE(AO) + movlps %xmm6, -26 * SIZE(AO) +#endif + + movss %xmm0, (CO1) + movss %xmm1, (CO1, LDC, 1) + movss %xmm2, (CO1, LDC, 2) + movss %xmm3, (CO1, %rax, 1) + + movss %xmm4, (CO2) + movss %xmm5, (CO2, LDC, 1) + movss %xmm6, (CO2, LDC, 2) + movss %xmm7, (CO2, %rax, 1) + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 8), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $8, KK +#endif + +#ifdef RT + subq $8, KK +#endif + + subq $1, J + BRANCH + jg .L10 + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_RT_8x4_sse.S b/kernel/x86_64/trsm_kernel_RT_8x4_sse.S new file mode 100644 index 0000000..e96496f --- /dev/null +++ b/kernel/x86_64/trsm_kernel_RT_8x4_sse.S @@ -0,0 +1,5975 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define OFFSET 16(%rsp) +#define KK 24(%rsp) +#define KKK 32(%rsp) +#define AORIG 40(%rsp) +#define BORIG 48(%rsp) +#define BUFFER 128(%rsp) + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define movsd movlps +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifndef PREFETCH +#define PREFETCH prefetcht0 +#endif + +#ifndef PREFETCHW +#define PREFETCHW prefetcht0 +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movsd %xmm4, OFFSET + movsd %xmm4, KK + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, N + je .L50 + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $BASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + jle .L103 + ALIGN_4 + +.L102: + movsd 0 * SIZE(B), %xmm3 + movhps 2 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm7 + movhps 6 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L102 + ALIGN_4 + +.L103: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax + BRANCH + jle .L110 + ALIGN_4 + +.L104: + movss 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + + movaps %xmm0, 0 * SIZE(BO) + + addq $ 1 * SIZE, B + addq $ 4 * SIZE, BO + decq %rax + jne .L104 + ALIGN_4 + +.L110: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L120 + ALIGN_4 + +.L111: +#ifdef LN + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + PREFETCHW 4 * SIZE(CO1) + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulps %xmm9, %xmm8 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps 4 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + + mulps %xmm9, %xmm8 + mulps 12 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps 64 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm9, %xmm10 + mulps 20 * SIZE(AO), %xmm9 + addps %xmm10, %xmm0 + movaps 24 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movaps 12 * SIZE(BO), %xmm9 + + mulps %xmm9, %xmm10 + mulps 28 * SIZE(AO), %xmm9 + addps %xmm10, %xmm0 + movaps 80 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movaps 32 * SIZE(BO), %xmm9 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm11, %xmm12 + mulps 36 * SIZE(AO), %xmm11 + addps %xmm12, %xmm0 + movaps 40 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movaps 20 * SIZE(BO), %xmm11 + + mulps %xmm11, %xmm12 + mulps 44 * SIZE(AO), %xmm11 + addps %xmm12, %xmm0 + movaps 96 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm11, %xmm14 + mulps 52 * SIZE(AO), %xmm11 + addps %xmm14, %xmm0 + movaps 56 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movaps 28 * SIZE(BO), %xmm11 + + mulps %xmm11, %xmm14 + mulps 60 * SIZE(AO), %xmm11 + addps %xmm14, %xmm0 + movaps 112 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movaps 48 * SIZE(BO), %xmm11 + + addq $64 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulps %xmm9, %xmm8 + mulps 4 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + + addq $8 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $8, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 8), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movaps %xmm4, %xmm9 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm9 + + movaps %xmm5, %xmm14 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm14 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm9, %xmm7 + unpcklps %xmm14, %xmm9 + unpckhps %xmm14, %xmm7 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm5 + movss 2 * SIZE(B), %xmm10 + movss 3 * SIZE(B), %xmm11 + movss 4 * SIZE(B), %xmm12 + movss 5 * SIZE(B), %xmm13 + movss 6 * SIZE(B), %xmm14 + movss 7 * SIZE(B), %xmm15 + + subss %xmm0, %xmm1 + subss %xmm2, %xmm5 + subss %xmm8, %xmm10 + subss %xmm3, %xmm11 + subss %xmm4, %xmm12 + subss %xmm6, %xmm13 + subss %xmm9, %xmm14 + subss %xmm7, %xmm15 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm9 + + subps %xmm0, %xmm8 + subps %xmm4, %xmm9 +#endif + +#ifdef LN + movaps 60 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm15 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm12 + + movaps 56 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm1 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm12 + + movaps 48 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm1 + + movaps 44 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm12 + + movaps 40 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm1 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm12 + + movaps 32 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm1 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm1 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm15 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm15 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm11 + + movaps 20 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm15 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm11 + + movaps 28 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm15 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm12 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm15 + + movaps 44 * SIZE(AO), %xmm7 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm15 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm15 + + movaps 60 * SIZE(AO), %xmm7 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm8, %xmm15 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 +#endif + +#ifdef LN + subq $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm5, 1 * SIZE(B) + movss %xmm10, 2 * SIZE(B) + movss %xmm11, 3 * SIZE(B) + movss %xmm12, 4 * SIZE(B) + movss %xmm13, 5 * SIZE(B) + movss %xmm14, 6 * SIZE(B) + movss %xmm15, 7 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + movaps %xmm2, 0 * SIZE(BO) + pshufd $0x00, %xmm5, %xmm2 + movaps %xmm2, 4 * SIZE(BO) + pshufd $0x00, %xmm10, %xmm2 + movaps %xmm2, 8 * SIZE(BO) + pshufd $0x00, %xmm11, %xmm2 + movaps %xmm2, 12 * SIZE(BO) + + pshufd $0x00, %xmm12, %xmm2 + movaps %xmm2, 16 * SIZE(BO) + pshufd $0x00, %xmm13, %xmm2 + movaps %xmm2, 20 * SIZE(BO) + pshufd $0x00, %xmm14, %xmm2 + movaps %xmm2, 24 * SIZE(BO) + pshufd $0x00, %xmm15, %xmm2 + movaps %xmm2, 28 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm9, 4 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + unpcklps %xmm5, %xmm1 + + unpcklps %xmm14, %xmm12 + unpcklps %xmm15, %xmm13 + unpcklps %xmm13, %xmm12 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm9, 4 * SIZE(CO1) + movhps %xmm9, 6 * SIZE(CO1) +#endif + +#ifndef LN + addq $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $8, KK + movq BORIG, B +#endif + +#ifdef LT + addq $8, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $3 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L111 + ALIGN_4 + +.L120: + testq $4, M + je .L130 + +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L125 + ALIGN_4 + +.L122: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(AO), %xmm8 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 32 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 8 * SIZE(AO), %xmm8 + mulps 8 * SIZE(BO), %xmm8 + addps %xmm8, %xmm2 + movaps 12 * SIZE(AO), %xmm8 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + movaps 20 * SIZE(AO), %xmm10 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 48 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps 24 * SIZE(AO), %xmm10 + mulps 24 * SIZE(BO), %xmm10 + addps %xmm10, %xmm2 + movaps 28 * SIZE(AO), %xmm10 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L122 + ALIGN_4 + +.L125: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L128 + ALIGN_4 + +.L126: + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L126 + ALIGN_4 + +.L128: + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm5 + movss 2 * SIZE(B), %xmm10 + movss 3 * SIZE(B), %xmm11 + + subss %xmm0, %xmm1 + subss %xmm2, %xmm5 + subss %xmm8, %xmm10 + subss %xmm3, %xmm11 +#else + movaps 0 * SIZE(AO), %xmm8 + + subps %xmm0, %xmm8 +#endif + +#ifdef LN + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm1 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm11 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm11 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm5, 1 * SIZE(B) + movss %xmm10, 2 * SIZE(B) + movss %xmm11, 3 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + movaps %xmm2, 0 * SIZE(BO) + pshufd $0x00, %xmm5, %xmm2 + movaps %xmm2, 4 * SIZE(BO) + pshufd $0x00, %xmm10, %xmm2 + movaps %xmm2, 8 * SIZE(BO) + pshufd $0x00, %xmm11, %xmm2 + movaps %xmm2, 12 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + unpcklps %xmm5, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L130: + testq $2, M + je .L140 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + movhps 2 * SIZE(AO), %xmm8 + movsd 8 * SIZE(AO), %xmm10 + movhps 10 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L135 + ALIGN_4 + +.L132: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 12 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 32 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movaps 28 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movaps 48 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L132 + ALIGN_4 + +.L135: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L138 + ALIGN_4 + +.L136: + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L136 + ALIGN_4 + +.L138: + addps %xmm1, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm2, %xmm0 + unpcklps %xmm3, %xmm1 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm5 + + subss %xmm0, %xmm1 + subss %xmm2, %xmm5 +#else +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd 0 * SIZE(AO), %xmm8 + + subps %xmm0, %xmm8 +#endif + +#ifdef LN + movaps 0 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm1 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm5 + + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm5 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm5, 1 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + movaps %xmm2, 0 * SIZE(BO) + pshufd $0x00, %xmm5, %xmm2 + movaps %xmm2, 4 * SIZE(BO) +#else + movlps %xmm8, 0 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + unpcklps %xmm5, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) +#else + movlps %xmm8, 0 * SIZE(CO1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L140: + testq $1, M + je .L149 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (AO, %rax, SIZE), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L145 + ALIGN_4 + +.L142: + mulss %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 1 * SIZE(AO), %xmm8 + mulss 4 * SIZE(BO), %xmm8 + addss %xmm9, %xmm0 + movss 32 * SIZE(BO), %xmm9 + addss %xmm8, %xmm1 + movss 2 * SIZE(AO), %xmm8 + mulss 8 * SIZE(BO), %xmm8 + addss %xmm8, %xmm2 + movss 3 * SIZE(AO), %xmm8 + mulss 12 * SIZE(BO), %xmm8 + addss %xmm8, %xmm3 + movss 8 * SIZE(AO), %xmm8 + mulss %xmm10, %xmm11 + movss 5 * SIZE(AO), %xmm10 + mulss 20 * SIZE(BO), %xmm10 + addss %xmm11, %xmm0 + movss 48 * SIZE(BO), %xmm11 + addss %xmm10, %xmm1 + movss 6 * SIZE(AO), %xmm10 + mulss 24 * SIZE(BO), %xmm10 + addss %xmm10, %xmm2 + movss 7 * SIZE(AO), %xmm10 + mulss 28 * SIZE(BO), %xmm10 + addss %xmm10, %xmm3 + movss 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L142 + ALIGN_4 + +.L145: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L148 + ALIGN_4 + +.L146: + mulss %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addss %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L146 + ALIGN_4 + +.L148: + addss %xmm1, %xmm0 + addss %xmm3, %xmm2 + addss %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(B), %xmm1 + subss %xmm0, %xmm1 +#else + movss 0 * SIZE(AO), %xmm8 + subps %xmm0, %xmm8 +#endif + +#if defined(LN) || defined(LT) + mulss 0 * SIZE(AO), %xmm1 +#endif + +#if defined(RN) || defined(RT) + mulss 0 * SIZE(B), %xmm8 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + movaps %xmm2, 0 * SIZE(BO) +#else + movss %xmm8, 0 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) +#else + movss %xmm8, 0 * SIZE(CO1) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $1 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L149: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L50: + testq $2, N + je .L100 + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $1 + BASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L53 + ALIGN_4 + +.L52: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L52 + ALIGN_4 + +.L53: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L60 + ALIGN_4 + +.L54: + movsd 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + addq $2 * SIZE, B + addq $8 * SIZE, BO + decq %rax + jne .L54 + ALIGN_4 + +.L60: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + leaq (C, LDC, 2), C +#endif + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L70 + ALIGN_4 + +.L61: +#ifdef LN + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + PREFETCHW 4 * SIZE(CO1) + pxor %xmm4, %xmm4 + PREFETCHW 4 * SIZE(CO2) + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 12 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 64 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 16 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps 20 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps 28 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 80 * SIZE(AO), %xmm10 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 32 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 36 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 40 * SIZE(AO), %xmm12 + + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 44 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + addps %xmm12, %xmm5 + movaps 96 * SIZE(BO), %xmm13 + movaps 96 * SIZE(AO), %xmm12 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 48 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 52 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 56 * SIZE(AO), %xmm14 + + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 60 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 112 * SIZE(AO), %xmm14 + + addq $64 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 8 * SIZE(AO), %xmm8 + + addq $8 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $8, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $1 + BASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movaps %xmm4, %xmm9 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm9 + + movaps %xmm5, %xmm14 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm14 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm9, %xmm7 + unpcklps %xmm14, %xmm9 + unpckhps %xmm14, %xmm7 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(B), %xmm5 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 4 * SIZE(B), %xmm10 +#ifdef movsd + xorps %xmm11, %xmm11 +#endif + movsd 6 * SIZE(B), %xmm11 +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd 8 * SIZE(B), %xmm12 +#ifdef movsd + xorps %xmm13, %xmm13 +#endif + movsd 10 * SIZE(B), %xmm13 +#ifdef movsd + xorps %xmm14, %xmm14 +#endif + movsd 12 * SIZE(B), %xmm14 +#ifdef movsd + xorps %xmm15, %xmm15 +#endif + movsd 14 * SIZE(B), %xmm15 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 + subps %xmm8, %xmm10 + subps %xmm3, %xmm11 + subps %xmm4, %xmm12 + subps %xmm6, %xmm13 + subps %xmm9, %xmm14 + subps %xmm7, %xmm15 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm9 + movaps 8 * SIZE(AO), %xmm10 + movaps 12 * SIZE(AO), %xmm11 + + subps %xmm0, %xmm8 + subps %xmm4, %xmm9 + subps %xmm1, %xmm10 + subps %xmm5, %xmm11 +#endif + +#ifdef LN + movaps 60 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm15 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm12 + + movaps 56 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm1 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm12 + + movaps 48 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm1 + + movaps 44 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm12 + + movaps 40 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm1 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm12 + + movaps 32 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm1 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm1 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm15 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm15 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm11 + + movaps 20 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm15 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + + movaps 28 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm15 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm12 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm15 + + movaps 44 * SIZE(AO), %xmm7 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm15 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm15 + + movaps 60 * SIZE(AO), %xmm7 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm8, %xmm15 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm9, %xmm2 + subps %xmm2, %xmm11 + + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + mulps %xmm2, %xmm11 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + mulps %xmm2, %xmm11 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm11, %xmm2 + subps %xmm2, %xmm9 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 +#endif + +#ifdef LN + subq $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm5, 2 * SIZE(B) + movlps %xmm10, 4 * SIZE(B) + movlps %xmm11, 6 * SIZE(B) + movlps %xmm12, 8 * SIZE(B) + movlps %xmm13, 10 * SIZE(B) + movlps %xmm14, 12 * SIZE(B) + movlps %xmm15, 14 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + pshufd $0x00, %xmm10, %xmm2 + pshufd $0x55, %xmm10, %xmm3 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm2 + pshufd $0x55, %xmm11, %xmm3 + movaps %xmm2, 24 * SIZE(BO) + movaps %xmm3, 28 * SIZE(BO) + + pshufd $0x00, %xmm12, %xmm2 + pshufd $0x55, %xmm12, %xmm3 + movaps %xmm2, 32 * SIZE(BO) + movaps %xmm3, 36 * SIZE(BO) + + pshufd $0x00, %xmm13, %xmm2 + pshufd $0x55, %xmm13, %xmm3 + movaps %xmm2, 40 * SIZE(BO) + movaps %xmm3, 44 * SIZE(BO) + + pshufd $0x00, %xmm14, %xmm2 + pshufd $0x55, %xmm14, %xmm3 + movaps %xmm2, 48 * SIZE(BO) + movaps %xmm3, 52 * SIZE(BO) + + pshufd $0x00, %xmm15, %xmm2 + pshufd $0x55, %xmm15, %xmm3 + movaps %xmm2, 56 * SIZE(BO) + movaps %xmm3, 60 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm9, 4 * SIZE(AO) + movaps %xmm10, 8 * SIZE(AO) + movaps %xmm11, 12 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + unpcklps %xmm14, %xmm12 + unpcklps %xmm15, %xmm13 + + movaps %xmm12, %xmm14 + unpcklps %xmm13, %xmm12 + unpckhps %xmm13, %xmm14 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) + movhps %xmm10, 2 * SIZE(CO1, LDC, 1) + movlps %xmm14, 4 * SIZE(CO1, LDC, 1) + movhps %xmm14, 6 * SIZE(CO1, LDC, 1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm9, 4 * SIZE(CO1) + movhps %xmm9, 6 * SIZE(CO1) + + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) + movhps %xmm10, 2 * SIZE(CO1, LDC, 1) + movlps %xmm11, 4 * SIZE(CO1, LDC, 1) + movhps %xmm11, 6 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addq $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $8, KK + movq BORIG, B +#endif + +#ifdef LT + addq $8, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $3 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L61 + ALIGN_4 + +.L70: + testq $4, M + je .L80 + +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 20 * SIZE(BO), %xmm8 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm8, %xmm1 + movaps 12 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + + mulps %xmm10, %xmm13 + mulps 36 * SIZE(BO), %xmm10 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm10, %xmm1 + movaps 20 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + mulps 44 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 52 * SIZE(BO), %xmm10 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm10, %xmm1 + movaps 28 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 60 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $1 + BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(B), %xmm5 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 4 * SIZE(B), %xmm10 +#ifdef movsd + xorps %xmm11, %xmm11 +#endif + movsd 6 * SIZE(B), %xmm11 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 + subps %xmm8, %xmm10 + subps %xmm3, %xmm11 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm10 + + subps %xmm0, %xmm8 + subps %xmm1, %xmm10 +#endif + +#ifdef LN + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm1 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm11 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm5, 2 * SIZE(B) + movlps %xmm10, 4 * SIZE(B) + movlps %xmm11, 6 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + pshufd $0x00, %xmm10, %xmm2 + pshufd $0x55, %xmm10, %xmm3 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm2 + pshufd $0x55, %xmm11, %xmm3 + movaps %xmm2, 24 * SIZE(BO) + movaps %xmm3, 28 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm10, 4 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) + movhps %xmm10, 2 * SIZE(CO1, LDC, 1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) + movhps %xmm10, 2 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $ 8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L80: + testq $2, M + je .L90 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd 0 * SIZE(AO), %xmm8 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 8 * SIZE(AO), %xmm10 + +#ifdef movsd + xorps %xmm9, %xmm9 +#endif + movsd 0 * SIZE(BO), %xmm9 +#ifdef movsd + xorps %xmm11, %xmm11 +#endif + movsd 16 * SIZE(BO), %xmm11 +#ifdef movsd + xorps %xmm13, %xmm13 +#endif + movsd 32 * SIZE(BO), %xmm13 +#ifdef movsd + xorps %xmm15, %xmm15 +#endif + movsd 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L85 + ALIGN_4 + +.L82: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_4 + +.L86: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L86 + ALIGN_4 + +.L88: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $1 + BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm2, %xmm0 + unpcklps %xmm3, %xmm1 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(B), %xmm5 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 +#else +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd 0 * SIZE(AO), %xmm8 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 2 * SIZE(AO), %xmm10 + + subps %xmm0, %xmm8 + subps %xmm1, %xmm10 +#endif + +#ifdef LN + movaps 0 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm5 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm5, 2 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) +#else + movlps %xmm8, 0 * SIZE(AO) + movlps %xmm10, 2 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $ 4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L90: + testq $1, M + je .L99 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (AO, %rax, SIZE), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + movss 32 * SIZE(BO), %xmm13 + movss 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movss 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movss 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movss 3 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movss 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movss 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movss 8 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movss 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movss 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movss 5 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movss 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movss 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movss 6 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movss 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movss 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movss 7 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movss 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movss 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movss 12 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movss 112 * SIZE(BO), %xmm15 + + addq $ 8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L98: + addss %xmm2, %xmm0 + addss %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm1, %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 + subps %xmm0, %xmm1 +#else + movss 0 * SIZE(AO), %xmm8 + movss 1 * SIZE(AO), %xmm10 + subss %xmm0, %xmm8 + subss %xmm1, %xmm10 +#endif + +#if defined(LN) || defined(LT) + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm8, %xmm2 + subss %xmm2, %xmm10 + + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm2, %xmm10 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm10, %xmm2 + subss %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm2, %xmm8 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) +#else + movss %xmm8, 0 * SIZE(AO) + movss %xmm10, 1 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm10, 0 * SIZE(CO1, LDC, 1) +#else + movss %xmm8, 0 * SIZE(CO1) + movss %xmm10, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (AO, %rax, SIZE), AO +#ifdef LT + addq $ 2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L100: + movq N, J + sarq $2, J # j = (n >> 2) + jle .L999 + +.L01: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $2 + BASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + movaps 8 * SIZE(B), %xmm11 + movaps 12 * SIZE(B), %xmm15 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm8 + pshufd $0x55, %xmm11, %xmm9 + pshufd $0xaa, %xmm11, %xmm10 + pshufd $0xff, %xmm11, %xmm11 + + pshufd $0x00, %xmm15, %xmm12 + pshufd $0x55, %xmm15, %xmm13 + pshufd $0xaa, %xmm15, %xmm14 + pshufd $0xff, %xmm15, %xmm15 + + movaps %xmm8, 32 * SIZE(BO) + movaps %xmm9, 36 * SIZE(BO) + movaps %xmm10, 40 * SIZE(BO) + movaps %xmm11, 44 * SIZE(BO) + movaps %xmm12, 48 * SIZE(BO) + movaps %xmm13, 52 * SIZE(BO) + movaps %xmm14, 56 * SIZE(BO) + movaps %xmm15, 60 * SIZE(BO) + + addq $16 * SIZE, B + addq $64 * SIZE, BO + + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 4), C +#endif + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(BO), %xmm9 + movaps 4 * SIZE(BO), %xmm11 + movaps 8 * SIZE(BO), %xmm13 + movaps 16 * SIZE(BO), %xmm15 + + movaps 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movaps 4 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movaps 8 * SIZE(AO), %xmm12 + pxor %xmm2, %xmm2 + movaps 12 * SIZE(AO), %xmm14 + pxor %xmm3, %xmm3 + + PREFETCHW 7 * SIZE(CO1) + pxor %xmm4, %xmm4 + PREFETCHW 7 * SIZE(CO2) + pxor %xmm5, %xmm5 + PREFETCHW 7 * SIZE(CO1, LDC, 2) + pxor %xmm6, %xmm6 + PREFETCHW 7 * SIZE(CO2, LDC, 2) + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 4 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm13 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 8 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps 16 * SIZE(AO), %xmm8 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movaps 32 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm5 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm13 + mulps 12 * SIZE(BO), %xmm10 + addps %xmm13, %xmm6 + movaps 24 * SIZE(BO), %xmm13 + addps %xmm10, %xmm7 + movaps 20 * SIZE(AO), %xmm10 + mulps %xmm12, %xmm15 + addps %xmm15, %xmm0 + movaps 16 * SIZE(BO), %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm1 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm13 + mulps 28 * SIZE(BO), %xmm12 + addps %xmm13, %xmm2 + movaps 24 * SIZE(BO), %xmm13 + addps %xmm12, %xmm3 + movaps 24 * SIZE(AO), %xmm12 + mulps %xmm14, %xmm15 + addps %xmm15, %xmm4 + movaps 48 * SIZE(BO), %xmm15 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm5 + movaps 36 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm13 + mulps 28 * SIZE(BO), %xmm14 + addps %xmm13, %xmm6 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm14, %xmm7 + movaps 28 * SIZE(AO), %xmm14 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 32 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 36 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm13 + mulps 44 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movaps 64 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm5 + movaps 52 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm13 + mulps 44 * SIZE(BO), %xmm10 + addps %xmm13, %xmm6 + movaps 56 * SIZE(BO), %xmm13 + addps %xmm10, %xmm7 + movaps 36 * SIZE(AO), %xmm10 + mulps %xmm12, %xmm15 + addps %xmm15, %xmm0 + movaps 48 * SIZE(BO), %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm1 + movaps 52 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm13 + mulps 60 * SIZE(BO), %xmm12 + addps %xmm13, %xmm2 + movaps 56 * SIZE(BO), %xmm13 + addps %xmm12, %xmm3 + movaps 40 * SIZE(AO), %xmm12 + mulps %xmm14, %xmm15 + addps %xmm15, %xmm4 + movaps 80 * SIZE(BO), %xmm15 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm5 + movaps 68 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm13 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm13, %xmm6 + movaps 72 * SIZE(BO), %xmm13 + addps %xmm14, %xmm7 + movaps 44 * SIZE(AO), %xmm14 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jg .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 +.L16: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm5 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 12 * SIZE(BO), %xmm10 + addps %xmm9, %xmm6 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm10, %xmm7 + movaps 12 * SIZE(AO), %xmm10 + + addq $8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $8, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $2 + BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movaps %xmm4, %xmm9 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm9 + + movaps %xmm5, %xmm14 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm14 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm9, %xmm7 + unpcklps %xmm14, %xmm9 + unpckhps %xmm14, %xmm7 + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm5 + movaps 8 * SIZE(B), %xmm10 + movaps 12 * SIZE(B), %xmm11 + movaps 16 * SIZE(B), %xmm12 + movaps 20 * SIZE(B), %xmm13 + movaps 24 * SIZE(B), %xmm14 + movaps 28 * SIZE(B), %xmm15 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 + subps %xmm8, %xmm10 + subps %xmm3, %xmm11 + subps %xmm4, %xmm12 + subps %xmm6, %xmm13 + subps %xmm9, %xmm14 + subps %xmm7, %xmm15 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm9 + movaps 8 * SIZE(AO), %xmm10 + movaps 12 * SIZE(AO), %xmm11 + + movaps 16 * SIZE(AO), %xmm12 + movaps 20 * SIZE(AO), %xmm13 + movaps 24 * SIZE(AO), %xmm14 + movaps 28 * SIZE(AO), %xmm15 + + subps %xmm0, %xmm8 + subps %xmm4, %xmm9 + subps %xmm1, %xmm10 + subps %xmm5, %xmm11 + subps %xmm2, %xmm12 + subps %xmm6, %xmm13 + subps %xmm3, %xmm14 + subps %xmm7, %xmm15 +#endif + +#ifdef LN + movaps 60 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm15 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm12 + + movaps 56 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm1 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm12 + + movaps 48 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm1 + + movaps 44 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm12 + + movaps 40 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm1 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm12 + + movaps 32 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm1 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm1 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm15 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm15 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm11 + + movaps 20 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm15 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + + movaps 28 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm15 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm12 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm15 + + movaps 44 * SIZE(AO), %xmm7 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm15 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm15 + + movaps 60 * SIZE(AO), %xmm7 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm8, %xmm15 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm9, %xmm2 + subps %xmm2, %xmm11 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm9, %xmm2 + subps %xmm2, %xmm13 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm14 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm9, %xmm2 + subps %xmm2, %xmm15 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + mulps %xmm2, %xmm11 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm11, %xmm2 + subps %xmm2, %xmm13 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm14 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm11, %xmm2 + subps %xmm2, %xmm15 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + mulps %xmm2, %xmm13 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm14 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm13, %xmm2 + subps %xmm2, %xmm15 + + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 + mulps %xmm2, %xmm15 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 + mulps %xmm2, %xmm15 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm15, %xmm2 + subps %xmm2, %xmm13 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm15, %xmm2 + subps %xmm2, %xmm11 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm15, %xmm2 + subps %xmm2, %xmm9 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + mulps %xmm2, %xmm13 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm13, %xmm2 + subps %xmm2, %xmm11 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm13, %xmm2 + subps %xmm2, %xmm9 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + mulps %xmm2, %xmm11 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm11, %xmm2 + subps %xmm2, %xmm9 + + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 +#endif + +#ifdef LN + subq $8 * SIZE, CO1 + subq $8 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm5, 4 * SIZE(B) + movaps %xmm10, 8 * SIZE(B) + movaps %xmm11, 12 * SIZE(B) + movaps %xmm12, 16 * SIZE(B) + movaps %xmm13, 20 * SIZE(B) + movaps %xmm14, 24 * SIZE(B) + movaps %xmm15, 28 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm6, 12 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm6, 28 * SIZE(BO) + + pshufd $0x00, %xmm10, %xmm2 + pshufd $0x55, %xmm10, %xmm3 + pshufd $0xaa, %xmm10, %xmm4 + pshufd $0xff, %xmm10, %xmm6 + movaps %xmm2, 32 * SIZE(BO) + movaps %xmm3, 36 * SIZE(BO) + movaps %xmm4, 40 * SIZE(BO) + movaps %xmm6, 44 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm2 + pshufd $0x55, %xmm11, %xmm3 + pshufd $0xaa, %xmm11, %xmm4 + pshufd $0xff, %xmm11, %xmm6 + movaps %xmm2, 48 * SIZE(BO) + movaps %xmm3, 52 * SIZE(BO) + movaps %xmm4, 56 * SIZE(BO) + movaps %xmm6, 60 * SIZE(BO) + + pshufd $0x00, %xmm12, %xmm2 + pshufd $0x55, %xmm12, %xmm3 + pshufd $0xaa, %xmm12, %xmm4 + pshufd $0xff, %xmm12, %xmm6 + movaps %xmm2, 64 * SIZE(BO) + movaps %xmm3, 68 * SIZE(BO) + movaps %xmm4, 72 * SIZE(BO) + movaps %xmm6, 76 * SIZE(BO) + + pshufd $0x00, %xmm13, %xmm2 + pshufd $0x55, %xmm13, %xmm3 + pshufd $0xaa, %xmm13, %xmm4 + pshufd $0xff, %xmm13, %xmm6 + movaps %xmm2, 80 * SIZE(BO) + movaps %xmm3, 84 * SIZE(BO) + movaps %xmm4, 88 * SIZE(BO) + movaps %xmm6, 92 * SIZE(BO) + + pshufd $0x00, %xmm14, %xmm2 + pshufd $0x55, %xmm14, %xmm3 + pshufd $0xaa, %xmm14, %xmm4 + pshufd $0xff, %xmm14, %xmm6 + movaps %xmm2, 96 * SIZE(BO) + movaps %xmm3, 100 * SIZE(BO) + movaps %xmm4, 104 * SIZE(BO) + movaps %xmm6, 108 * SIZE(BO) + + pshufd $0x00, %xmm15, %xmm2 + pshufd $0x55, %xmm15, %xmm3 + pshufd $0xaa, %xmm15, %xmm4 + pshufd $0xff, %xmm15, %xmm6 + movaps %xmm2, 112 * SIZE(BO) + movaps %xmm3, 116 * SIZE(BO) + movaps %xmm4, 120 * SIZE(BO) + movaps %xmm6, 124 * SIZE(BO) + +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm9, 4 * SIZE(AO) + movaps %xmm10, 8 * SIZE(AO) + movaps %xmm11, 12 * SIZE(AO) + movaps %xmm12, 16 * SIZE(AO) + movaps %xmm13, 20 * SIZE(AO) + movaps %xmm14, 24 * SIZE(AO) + movaps %xmm15, 28 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm10, %xmm1 + unpckhps %xmm10, %xmm0 + + movaps %xmm5, %xmm7 + unpcklps %xmm11, %xmm5 + unpckhps %xmm11, %xmm7 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movaps %xmm0, %xmm11 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm11 + + movaps %xmm12, %xmm2 + unpcklps %xmm14, %xmm12 + unpckhps %xmm14, %xmm2 + + movaps %xmm13, %xmm7 + unpcklps %xmm15, %xmm13 + unpckhps %xmm15, %xmm7 + + movaps %xmm12, %xmm14 + unpcklps %xmm13, %xmm12 + unpckhps %xmm13, %xmm14 + + movaps %xmm2, %xmm15 + unpcklps %xmm7, %xmm2 + unpckhps %xmm7, %xmm15 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + + movlps %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + movlps %xmm14, 4 * SIZE(CO2) + movhps %xmm14, 6 * SIZE(CO2) + + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 2 * SIZE(CO1, LDC, 2) + movlps %xmm2, 4 * SIZE(CO1, LDC, 2) + movhps %xmm2, 6 * SIZE(CO1, LDC, 2) + + movlps %xmm11, 0 * SIZE(CO2, LDC, 2) + movhps %xmm11, 2 * SIZE(CO2, LDC, 2) + movlps %xmm15, 4 * SIZE(CO2, LDC, 2) + movhps %xmm15, 6 * SIZE(CO2, LDC, 2) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm9, 4 * SIZE(CO1) + movhps %xmm9, 6 * SIZE(CO1) + + movlps %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + movlps %xmm11, 4 * SIZE(CO2) + movhps %xmm11, 6 * SIZE(CO2) + + movlps %xmm12, 0 * SIZE(CO1, LDC, 2) + movhps %xmm12, 2 * SIZE(CO1, LDC, 2) + movlps %xmm13, 4 * SIZE(CO1, LDC, 2) + movhps %xmm13, 6 * SIZE(CO1, LDC, 2) + + movlps %xmm14, 0 * SIZE(CO2, LDC, 2) + movhps %xmm14, 2 * SIZE(CO2, LDC, 2) + movlps %xmm15, 4 * SIZE(CO2, LDC, 2) + movhps %xmm15, 6 * SIZE(CO2, LDC, 2) +#endif + +#ifndef LN + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#ifdef LT + addq $32 * SIZE, B +#endif +#endif + +#ifdef LN + subq $8, KK + movq BORIG, B +#endif + +#ifdef LT + addq $8, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $3 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $4, M + je .L30 + +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 4 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + mulps 44 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps 12 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + mulps 60 * SIZE(BO), %xmm8 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 76 * SIZE(BO), %xmm10 + addps %xmm9, %xmm2 + movaps 128 * SIZE(BO), %xmm9 + addps %xmm10, %xmm3 + movaps 20 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + mulps 92 * SIZE(BO), %xmm10 + addps %xmm11, %xmm2 + movaps 144 * SIZE(BO), %xmm11 + addps %xmm10, %xmm3 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + mulps 108 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 160 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps 28 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + mulps 124 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 176 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $ 32 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 4 * SIZE(AO), %xmm8 + + addq $ 4 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $2 + BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm5 + movaps 8 * SIZE(B), %xmm10 + movaps 12 * SIZE(B), %xmm11 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 + subps %xmm8, %xmm10 + subps %xmm3, %xmm11 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm10 + movaps 8 * SIZE(AO), %xmm12 + movaps 12 * SIZE(AO), %xmm14 + + subps %xmm0, %xmm8 + subps %xmm1, %xmm10 + subps %xmm2, %xmm12 + subps %xmm3, %xmm14 +#endif + +#ifdef LN + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm1 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm11 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm14 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm14 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm14 + + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm8 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm8 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm5, 4 * SIZE(B) + movaps %xmm10, 8 * SIZE(B) + movaps %xmm11, 12 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm6, 12 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm6, 28 * SIZE(BO) + + pshufd $0x00, %xmm10, %xmm2 + pshufd $0x55, %xmm10, %xmm3 + pshufd $0xaa, %xmm10, %xmm4 + pshufd $0xff, %xmm10, %xmm6 + movaps %xmm2, 32 * SIZE(BO) + movaps %xmm3, 36 * SIZE(BO) + movaps %xmm4, 40 * SIZE(BO) + movaps %xmm6, 44 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm2 + pshufd $0x55, %xmm11, %xmm3 + pshufd $0xaa, %xmm11, %xmm4 + pshufd $0xff, %xmm11, %xmm6 + movaps %xmm2, 48 * SIZE(BO) + movaps %xmm3, 52 * SIZE(BO) + movaps %xmm4, 56 * SIZE(BO) + movaps %xmm6, 60 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm10, 4 * SIZE(AO) + movaps %xmm12, 8 * SIZE(AO) + movaps %xmm14, 12 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm10, %xmm1 + unpckhps %xmm10, %xmm0 + + movaps %xmm5, %xmm7 + unpcklps %xmm11, %xmm5 + unpckhps %xmm11, %xmm7 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movaps %xmm0, %xmm11 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm11 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 2 * SIZE(CO1, LDC, 2) + movlps %xmm11, 0 * SIZE(CO2, LDC, 2) + movhps %xmm11, 2 * SIZE(CO2, LDC, 2) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + + movlps %xmm12, 0 * SIZE(CO1, LDC, 2) + movhps %xmm12, 2 * SIZE(CO1, LDC, 2) + movlps %xmm14, 0 * SIZE(CO2, LDC, 2) + movhps %xmm14, 2 * SIZE(CO2, LDC, 2) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $2, M + je .L40 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movaps 0 * SIZE(AO), %xmm8 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movaps 8 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm2 + movaps 76 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm9, %xmm3 + movaps 128 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movaps 92 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movaps 144 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 108 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 160 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 124 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 176 * SIZE(BO), %xmm15 + + addq $ 16 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 16 * SIZE(BO), %xmm9 + + addq $ 2 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $1 + BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm2, %xmm0 + unpcklps %xmm3, %xmm1 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movapd 0 * SIZE(B), %xmm1 + movapd 4 * SIZE(B), %xmm5 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 +#else +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd 0 * SIZE(AO), %xmm8 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 2 * SIZE(AO), %xmm10 +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd 4 * SIZE(AO), %xmm12 +#ifdef movsd + xorps %xmm14, %xmm14 +#endif + movsd 6 * SIZE(AO), %xmm14 + + subps %xmm0, %xmm8 + subps %xmm1, %xmm10 + subps %xmm2, %xmm12 + subps %xmm3, %xmm14 +#endif + +#ifdef LN + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm5 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm14 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm14 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm14 + + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm8 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm8 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm5, 4 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm6, 12 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm6, 28 * SIZE(BO) +#else + movlps %xmm8, 0 * SIZE(AO) + movlps %xmm10, 2 * SIZE(AO) + movlps %xmm12, 4 * SIZE(AO) + movlps %xmm14, 6 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm10, %xmm1 + unpckhps %xmm10, %xmm0 + + movaps %xmm5, %xmm7 + unpcklps %xmm11, %xmm5 + unpckhps %xmm11, %xmm7 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movaps %xmm0, %xmm11 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm11 + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO2) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movlps %xmm11, 0 * SIZE(CO2, LDC, 2) +#else + movlps %xmm8, 0 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO2) + movlps %xmm12, 0 * SIZE(CO1, LDC, 2) + movlps %xmm14, 0 * SIZE(CO2, LDC, 2) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L40: + testq $1, M + je .L49 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (AO, %rax, SIZE), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + movss 32 * SIZE(BO), %xmm13 + movss 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L45 + ALIGN_4 + +.L42: + mulss %xmm8, %xmm9 + addss %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 4 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + addss %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + addss %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addss %xmm9, %xmm3 + movss 64 * SIZE(BO), %xmm9 + + mulss %xmm8, %xmm11 + addss %xmm11, %xmm0 + movss 20 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + addss %xmm11, %xmm1 + movss 24 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + addss %xmm11, %xmm2 + movss 28 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + movss 2 * SIZE(AO), %xmm8 + addss %xmm11, %xmm3 + movss 80 * SIZE(BO), %xmm11 + + mulss %xmm8, %xmm13 + addss %xmm13, %xmm0 + movss 36 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + addss %xmm13, %xmm1 + movss 40 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + addss %xmm13, %xmm2 + movss 44 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + movss 3 * SIZE(AO), %xmm8 + addss %xmm13, %xmm3 + movss 96 * SIZE(BO), %xmm13 + + mulss %xmm8, %xmm15 + addss %xmm15, %xmm0 + movss 52 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + addss %xmm15, %xmm1 + movss 56 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + addss %xmm15, %xmm2 + movss 60 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + movss 8 * SIZE(AO), %xmm8 + addss %xmm15, %xmm3 + movss 112 * SIZE(BO), %xmm15 + + mulss %xmm10, %xmm9 + addss %xmm9, %xmm0 + movss 68 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + addss %xmm9, %xmm1 + movss 72 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + addss %xmm9, %xmm2 + movss 76 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + movss 5 * SIZE(AO), %xmm10 + addss %xmm9, %xmm3 + movss 128 * SIZE(BO), %xmm9 + + mulss %xmm10, %xmm11 + addss %xmm11, %xmm0 + movss 84 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + addss %xmm11, %xmm1 + movss 88 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + addss %xmm11, %xmm2 + movss 92 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + movss 6 * SIZE(AO), %xmm10 + addss %xmm11, %xmm3 + movss 144 * SIZE(BO), %xmm11 + + mulss %xmm10, %xmm13 + addss %xmm13, %xmm0 + movss 100 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + addss %xmm13, %xmm1 + movss 104 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + addss %xmm13, %xmm2 + movss 108 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + movss 7 * SIZE(AO), %xmm10 + addss %xmm13, %xmm3 + movss 160 * SIZE(BO), %xmm13 + + mulss %xmm10, %xmm15 + addss %xmm15, %xmm0 + movss 116 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + addss %xmm15, %xmm1 + movss 120 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + addss %xmm15, %xmm2 + movss 124 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + movss 12 * SIZE(AO), %xmm10 + addss %xmm15, %xmm3 + movss 176 * SIZE(BO), %xmm15 + + addq $ 8 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_4 + +.L46: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movss 16 * SIZE(BO), %xmm9 + + addq $ 1 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L46 + ALIGN_4 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm2, %xmm0 + unpcklps %xmm3, %xmm1 + + unpcklps %xmm1, %xmm0 + + movapd 0 * SIZE(B), %xmm1 + subps %xmm0, %xmm1 +#else + movss 0 * SIZE(AO), %xmm8 + movss 1 * SIZE(AO), %xmm10 + movss 2 * SIZE(AO), %xmm12 + movss 3 * SIZE(AO), %xmm14 + + subss %xmm0, %xmm8 + subss %xmm1, %xmm10 + subss %xmm2, %xmm12 + subss %xmm3, %xmm14 +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm8, %xmm2 + subss %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm8, %xmm2 + subss %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm8, %xmm2 + subss %xmm2, %xmm14 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm10, %xmm2 + subss %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm10, %xmm2 + subss %xmm2, %xmm14 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm12, %xmm2 + subss %xmm2, %xmm14 + + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm2, %xmm14 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm2, %xmm14 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm14, %xmm2 + subss %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm14, %xmm2 + subss %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm14, %xmm2 + subss %xmm2, %xmm8 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm12, %xmm2 + subss %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm12, %xmm2 + subss %xmm2, %xmm8 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm10, %xmm2 + subss %xmm2, %xmm8 + + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm2, %xmm8 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm6, 12 * SIZE(BO) +#else + movss %xmm8, 0 * SIZE(AO) + movss %xmm10, 1 * SIZE(AO) + movss %xmm12, 2 * SIZE(AO) + movss %xmm14, 3 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm10, %xmm1 + unpckhps %xmm10, %xmm0 + + movaps %xmm5, %xmm7 + unpcklps %xmm11, %xmm5 + unpckhps %xmm11, %xmm7 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movaps %xmm0, %xmm11 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm11 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm10, 0 * SIZE(CO2) + movss %xmm0, 0 * SIZE(CO1, LDC, 2) + movss %xmm11, 0 * SIZE(CO2, LDC, 2) +#else + movss %xmm8, 0 * SIZE(CO1) + movss %xmm10, 0 * SIZE(CO2) + movss %xmm12, 0 * SIZE(CO1, LDC, 2) + movss %xmm14, 0 * SIZE(CO2, LDC, 2) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L49: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq J # j -- + jg .L01 + + + +.L999: + movq %rbx, %rsp + EMMS + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/xdot.S b/kernel/x86_64/xdot.S new file mode 100644 index 0000000..966b499 --- /dev/null +++ b/kernel/x86_64/xdot.S @@ -0,0 +1,290 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define RESULT 4 + STACK + ARGS(%esp) +#define STACK_N 8 + STACK + ARGS(%esp) +#define STACK_X 12 + STACK + ARGS(%esp) +#define STACK_INCX 16 + STACK + ARGS(%esp) +#define STACK_Y 20 + STACK + ARGS(%esp) +#define STACK_INCY 24 + STACK + ARGS(%esp) + +#include "l1param.h" + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + testl N, N + jle .L88 + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + + fldz + fldz + fldz + fldz + + cmpl $2 * SIZE, INCX + jne .L14 + cmpl $2 * SIZE, INCY + jne .L14 + + movl N, %eax + sarl $1, %eax + jle .L15 + ALIGN_3 + +.L16: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + FLD 2 * SIZE(X) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + FLD 2 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 3 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 3 * SIZE(X) + + FLD 2 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 3 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + decl %eax + jg .L16 + ALIGN_3 + +.L15: + movl N, %eax + andl $1, %eax + jle .L27 + ALIGN_3 + +.L22: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + jmp .L27 + ALIGN_3 + +.L14: + movl N, %eax + sarl $1, %eax + jle .L30 + ALIGN_3 + + +.L31: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + addl INCX, X + + FLD 0 * SIZE(X) + addl INCY, Y + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + addl INCX, X + addl INCY, Y + + decl %eax + jg .L31 + ALIGN_3 + +.L30: + movl N, %eax + andl $1, %eax + jle .L27 + ALIGN_3 + +.L37: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + ALIGN_3 + +.L27: + movl RESULT, %eax + +#ifndef CONJ + fsubp %st, %st(3) + faddp %st, %st(1) +#else + faddp %st, %st(3) + fsubp %st, %st(1) +#endif + + FST 1 * SIZE(%eax) + FST 0 * SIZE(%eax) + + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L88: + movl RESULT, %eax + + fldz + fldz + + FST 1 * SIZE(%eax) + FST 0 * SIZE(%eax) + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86_64/xgemm3m_kernel_2x2.S b/kernel/x86_64/xgemm3m_kernel_2x2.S new file mode 100644 index 0000000..6d116a1 --- /dev/null +++ b/kernel/x86_64/xgemm3m_kernel_2x2.S @@ -0,0 +1,877 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define N ARG2 +#define K ARG3 +#define A ARG4 +#define B ARG5 +#define C ARG6 +#define LDC %r10 + +#define I %r12 +#define J %r13 +#define AO %r14 +#define BO %r15 +#define CO %rbp + +#define KK %r11 +#define KKK 48(%rsp) + +#define STACKSIZE 64 + +#define ALPHA_R 8 + STACKSIZE(%rsp) +#define ALPHA_I 24 + STACKSIZE(%rsp) +#define OFFSET 48 + STACKSIZE(%rsp) + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#define PREFETCHSIZE (5 + 4 * 10) + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + movq 40 + STACKSIZE(%rsp), LDC + +#if defined(TRMMKERNEL) && !defined(LEFT) + movq OFFSET, %rax + negq %rax + movq %rax, KK +#endif + + addq $8 * SIZE, A + addq $8 * SIZE, B + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + sarq $1, %rax + movq %rax, J + je .L30 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO + + movq C, CO + leaq (, LDC, 2), %rax + addq %rax, C + + movq M, I + sarq $1, I + je .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + fldz + fldz + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) + prefetchw 2 * SIZE(CO, LDC, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) + prefetchnta 2 * SIZE(CO, LDC, 1) +#endif + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + FLD -6 * SIZE(AO) + + FLD -6 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + + FLD -5 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) + + FLD -4 * SIZE(AO) + + FLD -4 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -3 * SIZE(BO) + fmul %st, %st(2) + + FLD -3 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + FLD -2 * SIZE(AO) + + FLD -2 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -1 * SIZE(BO) + fmul %st, %st(2) + + FLD -1 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + addq $8 * SIZE,AO + addq $8 * SIZE,BO + + decq %rax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + and $3, %rax + je .L18 + ALIGN_4 + +.L16: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + addq $2 * SIZE,AO + addq $2 * SIZE,BO + + decq %rax + jne .L16 + ALIGN_4 + +.L18: +#ifndef TRMMKERNEL + FLD ALPHA_I + FLD ALPHA_R + + fld %st(2) + fmul %st(1), %st + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + fld %st(3) + fmul %st(1), %st + + FLD 2 * SIZE(CO) + faddp %st, %st(1) + FST 2 * SIZE(CO) + + fld %st(4) + fmul %st(1), %st + + FLD 0 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 0 * SIZE(CO, LDC) + + fmul %st(5), %st + + FLD 2 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 2 * SIZE(CO, LDC) + + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD 1 * SIZE(CO) + faddp %st, %st(1) + FST 1 * SIZE(CO) + + FLD 3 * SIZE(CO) + faddp %st, %st(1) + FST 3 * SIZE(CO) + + FLD 1 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 1 * SIZE(CO, LDC) + + FLD 3 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 3 * SIZE(CO, LDC) +#else + FST 0 * SIZE(CO) + FST 1 * SIZE(CO) + FST 0 * SIZE(CO, LDC) + FST 1 * SIZE(CO, LDC) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO + decq I + jne .L11 + ALIGN_4 + +.L20: + movq M, %rax + andq $1, %rax + je .L29 + ALIGN_4 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq ( B, %rax, 2), BO +#endif + + fldz + fldz + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + + FLD -6 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -4 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + + FLD -2 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $4 * SIZE,AO + addq $8 * SIZE,BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + and $3, %rax + je .L28 + ALIGN_4 + +.L26: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $1 * SIZE,AO + addq $2 * SIZE,BO + + decq %rax + jne .L26 + ALIGN_4 + +.L28: +#ifndef TRMMKERNEL + FLD ALPHA_I + FLD ALPHA_R + + fld %st(2) + fmul %st(1), %st + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + fmul %st(3), %st + + FLD 0 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 0 * SIZE(CO, LDC) + + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD 1 * SIZE(CO) + faddp %st, %st(1) + FST 1 * SIZE(CO) + + FLD 1 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 1 * SIZE(CO, LDC) +#else + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + movq BO, B + decq J + jne .L01 + ALIGN_4 + +.L30: + movq N, %rax + testq $1, %rax + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO + + movq C, CO + addq LDC, C + + movq M, I + sarq $1, I + je .L40 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq ( B, %rax, 1), BO +#endif + + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) +#endif + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(BO) + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + FLD -6 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(BO) + FLD -4 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + FLD -2 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $8 * SIZE,AO + addq $4 * SIZE,BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + and $3, %rax + je .L38 + ALIGN_4 + +.L36: + FLD -8 * SIZE(BO) + + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $2 * SIZE,AO + addq $1 * SIZE,BO + + decq %rax + jne .L36 + ALIGN_4 + +.L38: +#ifndef TRMMKERNEL + FLD ALPHA_I + FLD ALPHA_R + + fld %st(2) + fmul %st(1), %st + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + fmul %st(3), %st + + FLD 2 * SIZE(CO) + faddp %st, %st(1) + FST 2 * SIZE(CO) + + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD 1 * SIZE(CO) + faddp %st, %st(1) + FST 1 * SIZE(CO) + + FLD 3 * SIZE(CO) + faddp %st, %st(1) + FST 3 * SIZE(CO) +#else + FST 0 * SIZE(CO) + FST 1 * SIZE(CO) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO + decq I + jne .L31 + ALIGN_4 + +.L40: + movq M, %rax + andq $1, %rax + je .L49 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq ( B, %rax, 1), BO +#endif + + fldz + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -7 * SIZE(AO) + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -6 * SIZE(AO) + FLD -6 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -5 * SIZE(AO) + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addq $4 * SIZE,AO + addq $4 * SIZE,BO + + decq %rax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + and $3, %rax + je .L48 + ALIGN_4 + +.L46: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addq $1 * SIZE,AO + addq $1 * SIZE,BO + + decq %rax + jne .L46 + ALIGN_4 + +.L48: +#ifndef TRMMKERNEL + FLD ALPHA_I + FLD ALPHA_R + + fmul %st(2), %st + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + fmulp %st, %st(1) + + FLD 1 * SIZE(CO) + faddp %st, %st(1) + FST 1 * SIZE(CO) +#else + FST 0 * SIZE(CO) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $1, KK +#endif + + movq BO, B + ALIGN_4 + +.L999: + EMMS + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/xgemm_kernel_1x1.S b/kernel/x86_64/xgemm_kernel_1x1.S new file mode 100644 index 0000000..164e618 --- /dev/null +++ b/kernel/x86_64/xgemm_kernel_1x1.S @@ -0,0 +1,374 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define N ARG2 +#define K ARG3 +#define A ARG4 +#define B ARG5 +#define C ARG6 +#define LDC %r10 + +#define I %r12 +#define J %r13 +#define AO %r14 +#define BO %r15 +#define CO %rbp + +#define STACKSIZE 64 + +#define ALPHA_R 8 + STACKSIZE(%rsp) +#define ALPHA_I 24 + STACKSIZE(%rsp) +#define OFFSET 48 + STACKSIZE(%rsp) + +#define KK %r11 +#define KKK 48(%rsp) + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#define PREFETCHSIZE (5 + 4 * 10) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 faddp +#define ADD2 fsubrp +#define ADD3 faddp +#define ADD4 faddp +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 faddp +#define ADD2 faddp +#define ADD3 fsubrp +#define ADD4 faddp +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 faddp +#define ADD2 faddp +#define ADD3 faddp +#define ADD4 fsubrp +#else +#define ADD1 faddp +#define ADD2 fsubrp +#define ADD3 fsubrp +#define ADD4 fsubrp +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + movq 40 + STACKSIZE(%rsp), LDC + +#if defined(TRMMKERNEL) && !defined(LEFT) + movq OFFSET, %rax + negq %rax + movq %rax, KK +#endif + + addq $8 * SIZE, A + addq $8 * SIZE, B + + salq $ZBASE_SHIFT, LDC + + cmpq $0, M + jle .L999 + + movq N, %rax + movq %rax, J + testq %rax, %rax + jle .L999 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO + + movq C, CO + addq LDC, C + + movq M, I + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + fldz + fldz + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) +#endif + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + FLD -6 * SIZE(AO) + + FLD -6 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + + FLD -5 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) + + FLD -4 * SIZE(AO) + + FLD -4 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -3 * SIZE(BO) + fmul %st, %st(2) + + FLD -3 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + FLD -2 * SIZE(AO) + + FLD -2 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -1 * SIZE(BO) + fmul %st, %st(2) + + FLD -1 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + addq $8 * SIZE,AO + addq $8 * SIZE,BO + + decq %rax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + and $3, %rax + je .L18 + ALIGN_4 + +.L16: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + addq $2 * SIZE,AO + addq $2 * SIZE,BO + + decq %rax + jne .L16 + ALIGN_4 + +.L18: + faddp %st, %st(3) + faddp %st, %st(1) + +#ifndef TRMMKERNEL + FLD ALPHA_R + fld %st + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + FLD ALPHA_I + fmul %st, %st(3) + fmulp %st, %st(4) + + fsubp %st, %st(2) + faddp %st, %st(2) + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + FLD 1 * SIZE(CO) + faddp %st, %st(1) + FST 1 * SIZE(CO) +#else + FST 1 * SIZE(CO) + FST 0 * SIZE(CO) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO + decq I + jne .L11 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $1, KK +#endif + + movq BO, B + decq J + jne .L01 + ALIGN_4 + +.L999: + EMMS + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/xgemv_n.S b/kernel/x86_64/xgemv_n.S new file mode 100644 index 0000000..db6d80a --- /dev/null +++ b/kernel/x86_64/xgemv_n.S @@ -0,0 +1,334 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#define P 32 + +#define STACKSIZE 80 + +#define ALPHA_R 8 + STACKSIZE(%rsp) +#define ALPHA_I 24 + STACKSIZE(%rsp) +#define OLD_INCX 40 + STACKSIZE(%rsp) +#define OLD_Y 48 + STACKSIZE(%rsp) +#define OLD_INCY 56 + STACKSIZE(%rsp) +#define BUFFER 64 + STACKSIZE(%rsp) + +#define PLDA_M 56 (%rsp) +#define IS 64 (%rsp) + +#define M %rdi +#define N %rsi +#define A %rcx +#define LDA %r8 +#define X %r9 +#define INCX %rdx +#define Y %rbp +#define INCY %r10 + +#define TEMP %rax +#define I %rax +#define J %r11 +#define A1 %r12 +#define X1 %r13 +#define Y1 %r14 +#define XP %r15 +#define MIN_N %rbx + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + movq OLD_INCX, INCX + movq OLD_Y, Y + movq OLD_INCY, INCY + + FLD ALPHA_I + FLD ALPHA_R + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + movq $0, IS + + test M, M + jle .L79 + test N, N + jle .L79 + + movq LDA, %rax + imulq $P, %rax # P * lda + subq M ,%rax # P * lda - m + salq $ZBASE_SHIFT, %rax + movq %rax, PLDA_M + + salq $ZBASE_SHIFT, LDA + ALIGN_2 + +.L32: + movq $P, %rax + movq N, MIN_N + subq IS, MIN_N + cmpq %rax, MIN_N + cmovg %rax, MIN_N + + movq IS, XP + salq $ZBASE_SHIFT, XP + leaq (X,XP, 1), XP + + cmpq $2 * SIZE, INCX + je .L34 + + movq BUFFER, XP + movq XP, X1 + + movq MIN_N, I + sarq $1, I + jle .L35 + ALIGN_2 + +.L36: + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addq INCX,X # x += incx + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addq INCX,X # x += incx + + FST 3 * SIZE(X1) + FST 2 * SIZE(X1) + FST 1 * SIZE(X1) + FST 0 * SIZE(X1) + + addq $4 * SIZE, X1 # xp += 4 + decq I + jg .L36 + ALIGN_3 + +.L35: + movq MIN_N, I + andq $1, I + jle .L34 + + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addq INCX,X # x += incx + FST 1 * SIZE(X1) + FST 0 * SIZE(X1) + ALIGN_3 + +/* Main Routine */ +.L34: + movq Y, Y1 # c_offset + movq M, J # j = m + ALIGN_3 + +.L61: + movq A, A1 # a_offset = a + addq $2 * SIZE, A # a++ + + fldz + fldz + fldz + fldz + + movq XP, X1 + FLD (X1) # bt1 = *(b_offset + 0) + + movq MIN_N, I + sarq $1, I + jle .L64 + ALIGN_3 + +.L65: + FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(X1) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += bt1 + FLD 2 * SIZE(X1) # bt1 = *(b_offset + 2) + + addq $2 * SIZE, X1 # b_offset += 2 + addq LDA, A1 # a_offset += lda + + FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(X1) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += bt1 + FLD 2 * SIZE(X1) # bt1 = *(b_offset + 2) + + addq $2 * SIZE, X1 # b_offset += 2 + addq LDA, A1 # a_offset += lda + + decq I + jg .L65 + +.L64: + movq MIN_N, I + andq $1, I + jle .L70 + ALIGN_2 + +.L71: + FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(X1) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += bt1 + fldz + ALIGN_2 + +.L70: + ffreep %st(0) + +#ifndef XCONJ +#ifndef CONJ + fsubp %st, %st(3) + faddp %st, %st(1) +#else + faddp %st, %st(3) + faddp %st, %st(1) +#endif +#else +#ifndef CONJ + faddp %st, %st(3) + fsubp %st, %st(1) +#else + fsubp %st, %st(3) + fsubp %st, %st(1) +#endif +#endif + + fld %st(0) # ct4 = ct2 + fmul %st(4) + fld %st(2) + fmul %st(4) + fsubp %st, %st(1) + + FLD 0 * SIZE(Y1) + faddp %st, %st(1) + FST 0 * SIZE(Y1) + + fmul %st(2) + fxch %st(1) + fmul %st(3) + faddp %st, %st(1) + + FLD 1 * SIZE(Y1) + faddp %st, %st(1) + FST 1 * SIZE(Y1) + + addq INCY, Y1 + decq J + jg .L61 + +.L60: + addq PLDA_M, A + addq $P, IS + cmpq N, IS + jl .L32 + +.L79: + ffreep %st + ffreep %st + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + addq $STACKSIZE, %rsp + ret + EPILOGUE diff --git a/kernel/x86_64/xgemv_t.S b/kernel/x86_64/xgemv_t.S new file mode 100644 index 0000000..c09dcf0 --- /dev/null +++ b/kernel/x86_64/xgemv_t.S @@ -0,0 +1,338 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#define STACKSIZE 80 +#define P 4096 + +#define ALPHA_R 8 + STACKSIZE(%rsp) +#define ALPHA_I 24 + STACKSIZE(%rsp) +#define OLD_INCX 40 + STACKSIZE(%rsp) +#define OLD_Y 48 + STACKSIZE(%rsp) +#define OLD_INCY 56 + STACKSIZE(%rsp) +#define BUFFER 64 + STACKSIZE(%rsp) + +#define NLDA 56 (%rsp) +#define IS 64 (%rsp) + +#define M %rdi +#define N %rsi +#define A %rcx +#define LDA %r8 +#define X %r9 +#define INCX %rdx +#define Y %rbp +#define INCY %r10 + +#define TEMP %rax +#define I %rax +#define J %r11 +#define A1 %r12 +#define XP %r15 +#define X1 %r13 +#define Y1 %r14 +#define MIN_M %rbx + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + movq OLD_INCX, INCX + movq OLD_Y, Y + movq OLD_INCY, INCY + + FLD ALPHA_I + FLD ALPHA_R + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + movq $0, IS + + test M, M + jle .L79 # goto END + test N, N + jle .L79 # goto END + + movq N, %rax + imulq LDA, %rax + movq $P, NLDA + subq %rax, NLDA + salq $ZBASE_SHIFT, NLDA + + salq $ZBASE_SHIFT, LDA + ALIGN_2 + +.L32: + movq $P, %rax + movq M, MIN_M + subq IS , MIN_M + cmpq %rax, MIN_M + cmovg %rax, MIN_M + + movq IS, X1 + salq $ZBASE_SHIFT, X1 + leaq (X,X1, 1), X1 + + movq X1, XP + + cmpq $2 * SIZE, INCX + je .L34 + + movq BUFFER, X1 + movq X1, XP + + movq MIN_M, I + sarq $1, I + jle .L35 + ALIGN_3 + +.L36: + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addq INCX,X # x += incx + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addq INCX,X # x += incx + + FST 3 * SIZE(X1) + FST 2 * SIZE(X1) + FST 1 * SIZE(X1) + FST 0 * SIZE(X1) + + addq $4 * SIZE, X1 # xp += 4 + decq I + jg .L36 + ALIGN_3 + +.L35: + movq MIN_M, I + andq $1,I + jle .L34 + + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addq INCX,X # x += incx + FST 1 * SIZE(X1) + FST 0 * SIZE(X1) + ALIGN_3 + +/* Main Routine */ + +.L34: + movq Y, Y1 # coffset = y + + movq N, J + ALIGN_2 + +.L61: + movq A, A1 # a_offset = a + fldz # ct1 = ZERO + fldz # ct1 = ZERO + + addq LDA, A + fldz # ct1 = ZERO + fldz # ct1 = ZERO + + movq XP, X1 + + FLD (X1) # bt1 = *(b_offset + 0) + + movq MIN_M, I + sarq $1, I + jle .L64 + ALIGN_3 + +.L65: + FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(X1) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += bt1 + FLD 2 * SIZE(X1) # bt1 = *(b_offset + 1) + + FLD 2 * SIZE(A1) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 3 * SIZE(A1) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 3 * SIZE(X1) # bt1 = *(b_offset + 1) + + FLD 2 * SIZE(A1) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 3 * SIZE(A1) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += bt1 + FLD 4 * SIZE(X1) # bt1 = *(b_offset + 1) + + addq $4 * SIZE, X1 + addq $4 * SIZE, A1 + decq I + jg .L65 + ALIGN_3 + +.L64: + movq MIN_M, I + andq $1, I + jle .L70 + ALIGN_3 + +.L71: + FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(X1) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += bt1 + fldz + ALIGN_3 + +.L70: + ffreep %st(0) + +#ifndef XCONJ +#ifndef CONJ + fsubp %st, %st(3) + faddp %st, %st(1) +#else + faddp %st, %st(3) + faddp %st, %st(1) +#endif +#else +#ifndef CONJ + faddp %st, %st(3) + fsubp %st, %st(1) +#else + fsubp %st, %st(3) + fsubp %st, %st(1) +#endif +#endif + + fld %st(0) # ct4 = ct2 + fmul %st(4) + fld %st(2) + fmul %st(4) + fsubp %st, %st(1) + + FLD 0 * SIZE(Y1) + faddp %st, %st(1) + FST 0 * SIZE(Y1) + + fmul %st(2) + fxch %st(1) + fmul %st(3) + faddp %st, %st(1) + + FLD 1 * SIZE(Y1) + faddp %st, %st(1) + FST 1 * SIZE(Y1) + addq INCY, Y1 + + decq J + jg .L61 + ALIGN_3 + +.L60: + addq NLDA, A + + addq $P, IS + cmpq M, IS + jl .L32 + ALIGN_3 + +.L79: + ffreep %st + ffreep %st + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + addq $STACKSIZE, %rsp + ret + EPILOGUE diff --git a/kernel/x86_64/xtrsm_kernel_LT_1x1.S b/kernel/x86_64/xtrsm_kernel_LT_1x1.S new file mode 100644 index 0000000..86d4a74 --- /dev/null +++ b/kernel/x86_64/xtrsm_kernel_LT_1x1.S @@ -0,0 +1,486 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define N ARG2 +#define K ARG3 +#define A ARG4 +#define B ARG5 +#define C ARG6 +#define LDC %r10 + +#define I %r12 +#define J %r13 +#define AO %r14 +#define BO %r15 +#define CO %rbp + +#define OFFSET 48 + STACKSIZE(%rsp) + +#define STACKSIZE 64 + +#define KK %r11 +#define AORIG 48(%rsp) + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#define PREFETCHSIZE (5 + 4 * 10) + +#ifndef CONJ +#define ADD1 faddp +#define ADD2 fsubrp +#define ADD3 faddp +#define ADD4 faddp +#elif defined(LN) || defined(LT) +#define ADD1 faddp +#define ADD2 faddp +#define ADD3 fsubrp +#define ADD4 faddp +#else +#define ADD1 faddp +#define ADD2 faddp +#define ADD3 faddp +#define ADD4 fsubrp +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + movq 40 + STACKSIZE(%rsp), LDC + + salq $ZBASE_SHIFT, LDC + + addq $8 * SIZE, A + addq $8 * SIZE, B + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + movq OFFSET, %rax + negq %rax + movq %rax, KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + cmpq $0, M + jle .L999 + + movq N, %rax + movq %rax, J + testq %rax, %rax + jle .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#ifdef RT + subq LDC, C +#endif + movq C, CO +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + movq M, I + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $ZBASE_SHIFT, %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + fldz + fldz + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + FLD -6 * SIZE(AO) + + FLD -6 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + + FLD -5 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) + + FLD -4 * SIZE(AO) + + FLD -4 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -3 * SIZE(BO) + fmul %st, %st(2) + + FLD -3 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + FLD -2 * SIZE(AO) + + FLD -2 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -1 * SIZE(BO) + fmul %st, %st(2) + + FLD -1 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + addq $8 * SIZE,AO + addq $8 * SIZE,BO + + decq %rax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + and $3, %rax + je .L18 + ALIGN_4 + +.L16: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + addq $2 * SIZE,AO + addq $2 * SIZE,BO + + decq %rax + jne .L16 + ALIGN_4 + +.L18: + faddp %st, %st(3) + faddp %st, %st(1) + + fxch %st(1) + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(2) +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(AO) + fmul %st(1), %st + FLD -8 * SIZE(AO) + fmul %st(3), %st + FLD -7 * SIZE(AO) + fmulp %st, %st(3) + FLD -7 * SIZE(AO) + fmulp %st, %st(4) +#endif + +#if defined(RN) || defined(RT) + FLD -8 * SIZE(BO) + fmul %st(1), %st + FLD -8 * SIZE(BO) + fmul %st(3), %st + FLD -7 * SIZE(BO) + fmulp %st, %st(3) + FLD -7 * SIZE(BO) + fmulp %st, %st(4) +#endif + +#ifndef CONJ + faddp %st, %st(2) + fsubp %st, %st(2) +#else + fsubp %st, %st(2) + faddp %st, %st(2) +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -7 * SIZE(BO) + fxch %st(1) + fld %st + FST -8 * SIZE(BO) +#else + fld %st + FST -7 * SIZE(AO) + fxch %st(1) + fld %st + FST -8 * SIZE(AO) +#endif + +#ifdef LN + subq $2 * SIZE, CO +#endif + + FST 0 * SIZE(CO) + FST 1 * SIZE(CO) + +#ifndef LN + addq $2 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + jne .L11 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + + decq J + jne .L01 + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zamax.S b/kernel/x86_64/zamax.S new file mode 100644 index 0000000..21d96b6 --- /dev/null +++ b/kernel/x86_64/zamax.S @@ -0,0 +1,241 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#define I %rax + +#ifndef USE_MIN +#define FMOV fcmovbe +#else +#define FMOV fcmovnbe +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + salq $ZBASE_SHIFT, INCX + + fldz + + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + ffreep %st + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + addq INCX, X + decq M + jle .L999 + + cmpq $2 * SIZE, INCX + jne .L40 + + movq M, I + sarq $2, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 2 * SIZE(X) + fabs + FLD 3 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 4 * SIZE(X) + fabs + FLD 5 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 6 * SIZE(X) + fabs + FLD 7 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + addq $8 * SIZE, X + + decq I + jg .L10 + ALIGN_4 + +.L20: + movq M, I + andq $3, I + jle .L999 + ALIGN_4 + + +.L21: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + addq $2 * SIZE, X + decq I + jg .L21 + jmp .L999 + ALIGN_4 + +.L40: + movq M, I + sarq $2, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addq INCX, X + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addq INCX, X + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addq INCX, X + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addq INCX, X + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + decq I + jg .L50 + ALIGN_4 + +.L60: + movq M, I + andq $3, I + jle .L999 + ALIGN_4 + +.L61: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + addq INCX, X + decq I + jg .L61 + ALIGN_4 + +.L999: + ret + + EPILOGUE diff --git a/kernel/x86_64/zamax_atom.S b/kernel/x86_64/zamax_atom.S new file mode 100644 index 0000000..3f67574 --- /dev/null +++ b/kernel/x86_64/zamax_atom.S @@ -0,0 +1,336 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#ifdef USE_MIN +#define maxsd minsd +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + xorps %xmm0, %xmm0 + salq $ZBASE_SHIFT, INCX + + testq M, M + jle .L999 + + testq INCX, INCX + jle .L999 + + pcmpeqb %xmm15, %xmm15 + psrlq $1, %xmm15 + + movsd 0 * SIZE(X), %xmm0 + movsd 1 * SIZE(X), %xmm4 + addq INCX, X + + andps %xmm15, %xmm0 + andps %xmm15, %xmm4 + + addsd %xmm4, %xmm0 + decq M + jle .L999 + + movaps %xmm0, %xmm1 + + cmpq $2 * SIZE, INCX + jne .L20 + + movq M, I + sarq $2, I + jle .L15 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + movsd 2 * SIZE(X), %xmm6 + movsd 3 * SIZE(X), %xmm7 + + movsd 4 * SIZE(X), %xmm8 + andps %xmm15, %xmm4 + movsd 5 * SIZE(X), %xmm9 + andps %xmm15, %xmm5 + movsd 6 * SIZE(X), %xmm10 + addsd %xmm4, %xmm5 + movsd 7 * SIZE(X), %xmm11 + decq I + jle .L13 + ALIGN_4 + +.L12: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + andps %xmm15, %xmm6 + movsd 8 * SIZE(X), %xmm4 + andps %xmm15, %xmm7 + addsd %xmm6, %xmm7 + movsd 10 * SIZE(X), %xmm6 + + maxsd %xmm5, %xmm0 + movsd 9 * SIZE(X), %xmm5 + andps %xmm15, %xmm8 + maxsd %xmm7, %xmm1 + movsd 11 * SIZE(X), %xmm7 + andps %xmm15, %xmm9 + addsd %xmm8, %xmm9 + movsd 12 * SIZE(X), %xmm8 + + andps %xmm15, %xmm10 + andps %xmm15, %xmm11 + addsd %xmm10, %xmm11 + movsd 14 * SIZE(X), %xmm10 + + maxsd %xmm9, %xmm0 + movsd 13 * SIZE(X), %xmm9 + andps %xmm15, %xmm4 + maxsd %xmm11, %xmm1 + movsd 15 * SIZE(X), %xmm11 + andps %xmm15, %xmm5 + addsd %xmm4, %xmm5 + + addq $8 * SIZE, X + decq I + jg .L12 + ALIGN_4 + +.L13: + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + addsd %xmm6, %xmm7 + + maxsd %xmm5, %xmm0 + andps %xmm15, %xmm8 + maxsd %xmm7, %xmm1 + andps %xmm15, %xmm9 + addsd %xmm8, %xmm9 + + andps %xmm15, %xmm10 + andps %xmm15, %xmm11 + addsd %xmm10, %xmm11 + + maxsd %xmm9, %xmm0 + maxsd %xmm11, %xmm1 + + addq $8 * SIZE, X + ALIGN_4 + +.L15: + testq $2, M + jle .L17 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + movsd 2 * SIZE(X), %xmm6 + movsd 3 * SIZE(X), %xmm7 + addq $4 * SIZE, X + + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + addsd %xmm4, %xmm5 + + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + addsd %xmm6, %xmm7 + + maxsd %xmm5, %xmm0 + maxsd %xmm7, %xmm1 + ALIGN_3 + +.L17: + testq $1, M + jle .L998 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + + addsd %xmm4, %xmm5 + maxsd %xmm5, %xmm0 + jmp .L998 + ALIGN_3 + +.L20: + movq M, I + sarq $2, I + jle .L25 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + movsd 0 * SIZE(X), %xmm6 + movsd 1 * SIZE(X), %xmm7 + addq INCX, X + + movsd 0 * SIZE(X), %xmm8 + andps %xmm15, %xmm4 + movsd 1 * SIZE(X), %xmm9 + addq INCX, X + andps %xmm15, %xmm5 + movsd 0 * SIZE(X), %xmm10 + addsd %xmm4, %xmm5 + movsd 1 * SIZE(X), %xmm11 + addq INCX, X + + decq I + jle .L23 + ALIGN_4 + +.L22: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + andps %xmm15, %xmm6 + movsd 0 * SIZE(X), %xmm4 + andps %xmm15, %xmm7 + addsd %xmm6, %xmm7 + + maxsd %xmm5, %xmm0 + movsd 1 * SIZE(X), %xmm5 + andps %xmm15, %xmm8 + addq INCX, X + maxsd %xmm7, %xmm1 + movsd 0 * SIZE(X), %xmm6 + andps %xmm15, %xmm9 + movsd 1 * SIZE(X), %xmm7 + addsd %xmm8, %xmm9 + addq INCX, X + + andps %xmm15, %xmm10 + movsd 0 * SIZE(X), %xmm8 + andps %xmm15, %xmm11 + addsd %xmm10, %xmm11 + + maxsd %xmm9, %xmm0 + movsd 1 * SIZE(X), %xmm9 + addq INCX, X + andps %xmm15, %xmm4 + movsd 0 * SIZE(X), %xmm10 + maxsd %xmm11, %xmm1 + movsd 1 * SIZE(X), %xmm11 + andps %xmm15, %xmm5 + addq INCX, X + addsd %xmm4, %xmm5 + + decq I + jg .L22 + ALIGN_4 + +.L23: + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + addsd %xmm6, %xmm7 + + maxsd %xmm5, %xmm0 + andps %xmm15, %xmm8 + maxsd %xmm7, %xmm1 + andps %xmm15, %xmm9 + addsd %xmm8, %xmm9 + + andps %xmm15, %xmm10 + andps %xmm15, %xmm11 + addsd %xmm10, %xmm11 + + maxsd %xmm9, %xmm0 + maxsd %xmm11, %xmm1 + ALIGN_4 + +.L25: + testq $2, M + jle .L27 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + movsd 0 * SIZE(X), %xmm6 + movsd 1 * SIZE(X), %xmm7 + addq INCX, X + + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + addsd %xmm4, %xmm5 + + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + addsd %xmm6, %xmm7 + + maxsd %xmm5, %xmm0 + maxsd %xmm7, %xmm1 + ALIGN_3 + +.L27: + testq $1, M + jle .L998 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + + addsd %xmm4, %xmm5 + maxsd %xmm5, %xmm0 + ALIGN_3 + +.L998: + maxsd %xmm1, %xmm0 + ALIGN_4 + +.L999: + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/zamax_sse.S b/kernel/x86_64/zamax_sse.S new file mode 100644 index 0000000..5566a35 --- /dev/null +++ b/kernel/x86_64/zamax_sse.S @@ -0,0 +1,309 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#ifdef USE_MIN +#define maxps minps +#define maxss minss +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + pxor %xmm0, %xmm0 + salq $ZBASE_SHIFT, INCX + + testq M, M + jle .L999 + + pcmpeqb %xmm15, %xmm15 + psrld $1, %xmm15 + + movss 0 * SIZE(X), %xmm0 + movss 1 * SIZE(X), %xmm1 + addq INCX, X + decq M + andps %xmm15, %xmm0 + andps %xmm15, %xmm1 + addps %xmm1, %xmm0 + shufps $0, %xmm0, %xmm0 + movaps %xmm0, %xmm1 + cmpq $2 * SIZE, INCX + jne .L40 + +.L30: + movq M, I + sarq $3, I + jle .L35 + ALIGN_4 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm4 + movhps 2 * SIZE(X), %xmm4 + movsd 4 * SIZE(X), %xmm5 + movhps 6 * SIZE(X), %xmm5 + + movaps %xmm4, %xmm6 + + shufps $0x88, %xmm5, %xmm4 + shufps $0xdd, %xmm5, %xmm6 + + andps %xmm15, %xmm4 + andps %xmm15, %xmm6 + addps %xmm6, %xmm4 + maxps %xmm4, %xmm0 + + movsd 8 * SIZE(X), %xmm7 + movhps 10 * SIZE(X), %xmm7 + movsd 12 * SIZE(X), %xmm8 + movhps 14 * SIZE(X), %xmm8 + movaps %xmm7, %xmm9 + + shufps $0x88, %xmm8, %xmm7 + shufps $0xdd, %xmm8, %xmm9 + + andps %xmm15, %xmm7 + andps %xmm15, %xmm9 + addps %xmm9, %xmm7 + maxps %xmm7, %xmm0 + + addq $16 * SIZE, X + decq I + jg .L31 + ALIGN_4 + +.L35: + andq $7, M + jle .L998 + + testq $4, M + je .L36 + + movsd 0 * SIZE(X), %xmm4 + movhps 2 * SIZE(X), %xmm4 + movsd 4 * SIZE(X), %xmm5 + movhps 6 * SIZE(X), %xmm5 + movaps %xmm4, %xmm6 + + shufps $0x88, %xmm5, %xmm4 + shufps $0xdd, %xmm5, %xmm6 + + andps %xmm15, %xmm4 + andps %xmm15, %xmm6 + addps %xmm6, %xmm4 + maxps %xmm4, %xmm0 + + addq $8 * SIZE, X + ALIGN_3 + +.L36: + testq $2, M + je .L37 + + movss 0 * SIZE(X), %xmm4 + movss 1 * SIZE(X), %xmm5 + movss 2 * SIZE(X), %xmm6 + movss 3 * SIZE(X), %xmm7 + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + maxss %xmm4, %xmm0 + maxss %xmm6, %xmm1 + addq $4 * SIZE, X + ALIGN_3 + +.L37: + testq $1, M + je .L998 + + movss 0 * SIZE(X), %xmm4 + movss 1 * SIZE(X), %xmm5 + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + addps %xmm5, %xmm4 + maxss %xmm4, %xmm0 + jmp .L998 + ALIGN_4 + + +.L40: + movq M, I + sarq $3, I + jle .L45 + ALIGN_4 + +.L41: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + movhps 0 * SIZE(X), %xmm4 + addq INCX, X + movsd 0 * SIZE(X), %xmm5 + addq INCX, X + movhps 0 * SIZE(X), %xmm5 + addq INCX, X + + movaps %xmm4, %xmm6 + + shufps $0x88, %xmm5, %xmm4 + shufps $0xdd, %xmm5, %xmm6 + + andps %xmm15, %xmm4 + andps %xmm15, %xmm6 + addps %xmm6, %xmm4 + maxps %xmm4, %xmm0 + + movsd 0 * SIZE(X), %xmm7 + addq INCX, X + movhps 0 * SIZE(X), %xmm7 + addq INCX, X + movsd 0 * SIZE(X), %xmm8 + addq INCX, X + movhps 0 * SIZE(X), %xmm8 + addq INCX, X + movaps %xmm7, %xmm9 + + shufps $0x88, %xmm8, %xmm7 + shufps $0xdd, %xmm8, %xmm9 + + andps %xmm15, %xmm7 + andps %xmm15, %xmm9 + addps %xmm9, %xmm7 + maxps %xmm7, %xmm0 + + decq I + jg .L41 + ALIGN_4 + +.L45: + andq $7, M + jle .L998 + + testq $4, M + je .L46 + + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + movhps 0 * SIZE(X), %xmm4 + addq INCX, X + movsd 0 * SIZE(X), %xmm5 + addq INCX, X + movhps 0 * SIZE(X), %xmm5 + addq INCX, X + movaps %xmm4, %xmm6 + + shufps $0x88, %xmm5, %xmm4 + shufps $0xdd, %xmm5, %xmm6 + + andps %xmm15, %xmm4 + andps %xmm15, %xmm6 + addps %xmm6, %xmm4 + maxps %xmm4, %xmm0 + ALIGN_3 + +.L46: + testq $2, M + je .L47 + + movss 0 * SIZE(X), %xmm4 + movss 1 * SIZE(X), %xmm5 + addq INCX, X + movss 0 * SIZE(X), %xmm6 + movss 1 * SIZE(X), %xmm7 + addq INCX, X + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + maxss %xmm4, %xmm0 + maxss %xmm6, %xmm1 + ALIGN_3 + +.L47: + testq $1, M + je .L998 + + movss 0 * SIZE(X), %xmm4 + movss 1 * SIZE(X), %xmm5 + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + addps %xmm5, %xmm4 + maxss %xmm4, %xmm0 + jmp .L998 + ALIGN_4 + +.L998: + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + movhlps %xmm0, %xmm0 + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + maxss %xmm1, %xmm0 + ALIGN_4 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/zamax_sse2.S b/kernel/x86_64/zamax_sse2.S new file mode 100644 index 0000000..eb8fd43 --- /dev/null +++ b/kernel/x86_64/zamax_sse2.S @@ -0,0 +1,341 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#ifdef USE_MIN +#define maxpd minpd +#define maxsd minsd +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + pxor %xmm0, %xmm0 + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + salq $ZBASE_SHIFT, INCX + + pcmpeqb %xmm15, %xmm15 + psrlq $1, %xmm15 + + movsd 0 * SIZE(X), %xmm0 + movsd 1 * SIZE(X), %xmm1 + addq INCX, X + decq M + andpd %xmm15, %xmm0 + andpd %xmm15, %xmm1 + addpd %xmm1, %xmm0 + unpcklpd %xmm0, %xmm0 + movapd %xmm0, %xmm1 + movapd %xmm0, %xmm2 + movapd %xmm0, %xmm3 + + cmpq $2 * SIZE, INCX + jne .L40 + +.L30: + movq M, I + sarq $3, I + jle .L35 + ALIGN_4 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + movhpd 2 * SIZE(X), %xmm4 + movhpd 3 * SIZE(X), %xmm5 + movsd 4 * SIZE(X), %xmm6 + movsd 5 * SIZE(X), %xmm7 + movhpd 6 * SIZE(X), %xmm6 + movhpd 7 * SIZE(X), %xmm7 + + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + addpd %xmm5, %xmm4 + maxpd %xmm4, %xmm0 + + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + addpd %xmm7, %xmm6 + maxpd %xmm6, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd 8 * SIZE(X), %xmm4 + movsd 9 * SIZE(X), %xmm5 + movhpd 10 * SIZE(X), %xmm4 + movhpd 11 * SIZE(X), %xmm5 + movsd 12 * SIZE(X), %xmm6 + movsd 13 * SIZE(X), %xmm7 + movhpd 14 * SIZE(X), %xmm6 + movhpd 15 * SIZE(X), %xmm7 + + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + addpd %xmm5, %xmm4 + maxpd %xmm4, %xmm2 + + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + addpd %xmm7, %xmm6 + maxpd %xmm6, %xmm3 + + addq $16 * SIZE, X + decq I + jg .L31 + ALIGN_4 + +.L35: + andq $7, M + jle .L998 + + testq $4, M + je .L36 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + movhpd 2 * SIZE(X), %xmm4 + movhpd 3 * SIZE(X), %xmm5 + movsd 4 * SIZE(X), %xmm6 + movsd 5 * SIZE(X), %xmm7 + movhpd 6 * SIZE(X), %xmm6 + movhpd 7 * SIZE(X), %xmm7 + + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + maxpd %xmm4, %xmm0 + maxpd %xmm6, %xmm1 + + addq $8 * SIZE, X + ALIGN_3 + +.L36: + testq $2, M + je .L37 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + movhpd 2 * SIZE(X), %xmm4 + movhpd 3 * SIZE(X), %xmm5 + addq $4 * SIZE, X + + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + addpd %xmm5, %xmm4 + maxpd %xmm4, %xmm0 + ALIGN_3 + +.L37: + testq $1, M + je .L998 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + addpd %xmm5, %xmm4 + maxsd %xmm4, %xmm2 + jmp .L998 + ALIGN_4 + + +.L40: + movq M, I + sarq $3, I + jle .L45 + ALIGN_4 + +.L41: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + movhpd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm5 + addq INCX, X + movsd 0 * SIZE(X), %xmm6 + movsd 1 * SIZE(X), %xmm7 + addq INCX, X + movhpd 0 * SIZE(X), %xmm6 + movhpd 1 * SIZE(X), %xmm7 + addq INCX, X + + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + addpd %xmm5, %xmm4 + maxpd %xmm4, %xmm0 + + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + addpd %xmm7, %xmm6 + maxpd %xmm6, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + movhpd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm5 + addq INCX, X + movsd 0 * SIZE(X), %xmm6 + movsd 1 * SIZE(X), %xmm7 + addq INCX, X + movhpd 0 * SIZE(X), %xmm6 + movhpd 1 * SIZE(X), %xmm7 + addq INCX, X + + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + addpd %xmm5, %xmm4 + maxpd %xmm4, %xmm2 + + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + addpd %xmm7, %xmm6 + maxpd %xmm6, %xmm3 + + decq I + jg .L41 + ALIGN_4 + +.L45: + andq $7, M + jle .L998 + + testq $4, M + je .L46 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + movhpd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm5 + addq INCX, X + movsd 0 * SIZE(X), %xmm6 + movsd 1 * SIZE(X), %xmm7 + addq INCX, X + movhpd 0 * SIZE(X), %xmm6 + movhpd 1 * SIZE(X), %xmm7 + addq INCX, X + + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + maxpd %xmm4, %xmm0 + maxpd %xmm6, %xmm1 + ALIGN_3 + +.L46: + testq $2, M + je .L47 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + movhpd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm5 + addq INCX, X + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + addpd %xmm5, %xmm4 + maxpd %xmm4, %xmm2 + ALIGN_3 + +.L47: + testq $1, M + je .L998 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + addpd %xmm5, %xmm4 + maxsd %xmm4, %xmm3 + jmp .L998 + ALIGN_4 + +.L998: + maxpd %xmm1, %xmm0 + maxpd %xmm3, %xmm2 + maxpd %xmm2, %xmm0 + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + ALIGN_4 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/zasum.S b/kernel/x86_64/zasum.S new file mode 100644 index 0000000..b94e49b --- /dev/null +++ b/kernel/x86_64/zasum.S @@ -0,0 +1,200 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define X ARG2 +#define INCX ARG3 + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + fldz + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + salq $ZBASE_SHIFT, INCX + + fldz + fldz + fldz + cmpq $SIZE * 2, INCX + jne .L40 + + movq M, I + sarq $2, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + FLD 2 * SIZE(X) + fabs + FLD 3 * SIZE(X) + fabs + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 4 * SIZE(X) + fabs + FLD 5 * SIZE(X) + fabs + FLD 6 * SIZE(X) + fabs + FLD 7 * SIZE(X) + fabs + + addq $8 * SIZE, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decq I + jg .L10 + ALIGN_4 + +.L20: + andq $3, M + jle .L998 + ALIGN_4 + + +.L21: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st,%st(3) + faddp %st,%st(1) + addq $2 * SIZE, X + decq M + jg .L21 + jmp .L998 + ALIGN_4 + +.L40: + movq M, I + sarq $2, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + addq INCX, X + fabs + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + addq INCX, X + fabs + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + addq INCX, X + fabs + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + addq INCX, X + fabs + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decq I + jg .L50 + ALIGN_4 + +.L60: + andq $3, M + jle .L998 + ALIGN_4 + + +.L61: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + addq INCX, X + fabs + faddp %st,%st(3) + faddp %st,%st(1) + decq M + jg .L61 + ALIGN_4 + +.L998: + faddp %st,%st(2) + faddp %st,%st(1) + faddp %st,%st(1) + ALIGN_4 + +.L999: + ret + + EPILOGUE diff --git a/kernel/x86_64/zasum_atom.S b/kernel/x86_64/zasum_atom.S new file mode 100644 index 0000000..ab83809 --- /dev/null +++ b/kernel/x86_64/zasum_atom.S @@ -0,0 +1,411 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + xorps %xmm0, %xmm0 + + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + pcmpeqb %xmm15, %xmm15 + psrlq $1, %xmm15 + + salq $ZBASE_SHIFT, INCX + xorps %xmm13, %xmm13 + + cmpq $2 * SIZE, INCX + jne .L20 + + addq M, M + + testq $SIZE, X + je .L05 + + movsd (X), %xmm0 + addq $SIZE, X + andps %xmm15, %xmm0 + decq M + ALIGN_3 + +.L05: + subq $-16 * SIZE, X + + movq M, I + sarq $4, I + jle .L12 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + movaps -8 * SIZE(X), %xmm8 + movaps -6 * SIZE(X), %xmm9 + movaps -4 * SIZE(X), %xmm10 + movaps -2 * SIZE(X), %xmm11 + + decq I + jle .L11 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + andps %xmm15, %xmm4 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm4, %xmm12 + addsd %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + andps %xmm15, %xmm5 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm5, %xmm13 + addsd %xmm5, %xmm2 + movaps 2 * SIZE(X), %xmm5 + + andps %xmm15, %xmm6 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm6, %xmm12 + addsd %xmm6, %xmm0 + movaps 4 * SIZE(X), %xmm6 + + andps %xmm15, %xmm7 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm7, %xmm13 + addsd %xmm7, %xmm2 + movaps 6 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + andps %xmm15, %xmm8 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm8, %xmm12 + addsd %xmm8, %xmm0 + movaps 8 * SIZE(X), %xmm8 + + andps %xmm15, %xmm9 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm9, %xmm13 + addsd %xmm9, %xmm2 + movaps 10 * SIZE(X), %xmm9 + + andps %xmm15, %xmm10 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm10, %xmm12 + addsd %xmm10, %xmm0 + movaps 12 * SIZE(X), %xmm10 + + andps %xmm15, %xmm11 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm11, %xmm13 + addsd %xmm11, %xmm2 + movaps 14 * SIZE(X), %xmm11 + + subq $-16 * SIZE, X + decq I + jg .L10 + ALIGN_4 + +.L11: + andps %xmm15, %xmm4 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm4, %xmm12 + addsd %xmm4, %xmm0 + + andps %xmm15, %xmm5 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm5, %xmm13 + addsd %xmm5, %xmm2 + + andps %xmm15, %xmm6 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm6, %xmm12 + addsd %xmm6, %xmm0 + + andps %xmm15, %xmm7 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm7, %xmm13 + addsd %xmm7, %xmm2 + + andps %xmm15, %xmm8 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm8, %xmm12 + addsd %xmm8, %xmm0 + + andps %xmm15, %xmm9 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm9, %xmm13 + addsd %xmm9, %xmm2 + + andps %xmm15, %xmm10 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm10, %xmm12 + addsd %xmm10, %xmm0 + + andps %xmm15, %xmm11 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm11, %xmm13 + addsd %xmm11, %xmm2 + + addsd %xmm13, %xmm3 + subq $-16 * SIZE, X + ALIGN_3 + +.L12: + andq $15, M + jle .L998 + + testq $8, M + je .L13 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + addq $8 * SIZE, X + + andps %xmm15, %xmm4 + pshufd $0x4e, %xmm4, %xmm12 + addsd %xmm4, %xmm0 + andps %xmm15, %xmm5 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm5, %xmm13 + addsd %xmm5, %xmm2 + addsd %xmm13, %xmm3 + andps %xmm15, %xmm6 + pshufd $0x4e, %xmm6, %xmm12 + addsd %xmm6, %xmm0 + andps %xmm15, %xmm7 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm7, %xmm13 + addsd %xmm7, %xmm2 + addsd %xmm13, %xmm3 + ALIGN_3 + +.L13: + testq $4, M + je .L14 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + addq $4 * SIZE, X + + andps %xmm15, %xmm4 + pshufd $0x4e, %xmm4, %xmm12 + addsd %xmm4, %xmm0 + andps %xmm15, %xmm5 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm5, %xmm13 + addsd %xmm5, %xmm2 + addsd %xmm13, %xmm3 + ALIGN_3 + +.L14: + testq $2, M + je .L15 + + movaps -16 * SIZE(X), %xmm4 + addq $2 * SIZE, X + andps %xmm15, %xmm4 + + pshufd $0x4e, %xmm4, %xmm5 + addsd %xmm4, %xmm2 + addsd %xmm5, %xmm3 + ALIGN_3 + +.L15: + testq $1, M + je .L998 + + movsd -16 * SIZE(X), %xmm4 + andps %xmm15, %xmm4 + addsd %xmm4, %xmm0 + jmp .L998 + ALIGN_3 + +.L20: + movq M, I + sarq $2, I + jle .L25 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + movsd 0 * SIZE(X), %xmm6 + movsd 1 * SIZE(X), %xmm7 + addq INCX, X + + movsd 0 * SIZE(X), %xmm8 + movsd 1 * SIZE(X), %xmm9 + addq INCX, X + movsd 0 * SIZE(X), %xmm10 + movsd 1 * SIZE(X), %xmm11 + + decq I + jle .L23 + ALIGN_4 + +.L22: + andps %xmm15, %xmm4 + addq INCX, X + addsd %xmm4, %xmm0 + movsd 0 * SIZE(X), %xmm4 + andps %xmm15, %xmm5 + addsd %xmm5, %xmm1 + movsd 1 * SIZE(X), %xmm5 + andps %xmm15, %xmm6 + addq INCX, X + addsd %xmm6, %xmm2 + movsd 0 * SIZE(X), %xmm6 + andps %xmm15, %xmm7 + addsd %xmm7, %xmm3 + movsd 1 * SIZE(X), %xmm7 + + andps %xmm15, %xmm8 + addq INCX, X + addsd %xmm8, %xmm0 + movsd 0 * SIZE(X), %xmm8 + andps %xmm15, %xmm9 + addsd %xmm9, %xmm1 + movsd 1 * SIZE(X), %xmm9 + andps %xmm15, %xmm10 + addq INCX, X + addsd %xmm10, %xmm2 + movsd 0 * SIZE(X), %xmm10 + andps %xmm15, %xmm11 + addsd %xmm11, %xmm3 + movsd 1 * SIZE(X), %xmm11 + + decq I + jg .L22 + ALIGN_4 + +.L23: + andps %xmm15, %xmm4 + addq INCX, X + addsd %xmm4, %xmm0 + andps %xmm15, %xmm5 + addsd %xmm5, %xmm1 + andps %xmm15, %xmm6 + addsd %xmm6, %xmm2 + andps %xmm15, %xmm7 + addsd %xmm7, %xmm3 + + andps %xmm15, %xmm8 + addsd %xmm8, %xmm0 + andps %xmm15, %xmm9 + addsd %xmm9, %xmm1 + andps %xmm15, %xmm10 + addsd %xmm10, %xmm2 + andps %xmm15, %xmm11 + addsd %xmm11, %xmm3 + ALIGN_3 + +.L25: + testq $2, M + je .L26 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + movsd 0 * SIZE(X), %xmm6 + andps %xmm15, %xmm4 + addsd %xmm4, %xmm0 + movsd 1 * SIZE(X), %xmm7 + andps %xmm15, %xmm5 + addsd %xmm5, %xmm1 + addq INCX, X + + andps %xmm15, %xmm6 + addsd %xmm6, %xmm2 + andps %xmm15, %xmm7 + addsd %xmm7, %xmm3 + ALIGN_3 + +.L26: + testq $1, M + je .L998 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + ALIGN_3 + +.L998: + addsd %xmm1, %xmm0 + addsd %xmm3, %xmm2 + addsd %xmm2, %xmm0 + ALIGN_4 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE + diff --git a/kernel/x86_64/zasum_sse.S b/kernel/x86_64/zasum_sse.S new file mode 100644 index 0000000..7f3d3d1 --- /dev/null +++ b/kernel/x86_64/zasum_sse.S @@ -0,0 +1,332 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + pxor %xmm0, %xmm0 + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + + pcmpeqb %xmm15, %xmm15 + psrld $1, %xmm15 + + salq $ZBASE_SHIFT, INCX + + cmpq $2 * SIZE, INCX + jne .L100 + + subq $-32 * SIZE, X + addq M, M + + cmpq $3, M + jle .L18 + + testq $4, X + je .L05 + movss -32 * SIZE(X), %xmm0 + andps %xmm15, %xmm0 + addq $SIZE, X + decq M + jle .L998 + ALIGN_3 + +.L05: + testq $8, X + je .L10 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(X), %xmm1 + andps %xmm15, %xmm1 + addq $2 * SIZE, X + subq $2, M + jle .L998 + ALIGN_3 + +.L10: + movq M, I + sarq $5, I + jle .L14 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -24 * SIZE(X), %xmm6 + movaps -20 * SIZE(X), %xmm7 + + movaps -16 * SIZE(X), %xmm8 + movaps -12 * SIZE(X), %xmm9 + movaps -8 * SIZE(X), %xmm10 + movaps -4 * SIZE(X), %xmm11 + decq I + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + andps %xmm15, %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + andps %xmm15, %xmm5 + addps %xmm5, %xmm1 + movaps 4 * SIZE(X), %xmm5 + + andps %xmm15, %xmm6 + addps %xmm6, %xmm2 + movaps 8 * SIZE(X), %xmm6 + + andps %xmm15, %xmm7 + addps %xmm7, %xmm3 + movaps 12 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + andps %xmm15, %xmm8 + addps %xmm8, %xmm0 + movaps 16 * SIZE(X), %xmm8 + + andps %xmm15, %xmm9 + addps %xmm9, %xmm1 + movaps 20 * SIZE(X), %xmm9 + + andps %xmm15, %xmm10 + addps %xmm10, %xmm2 + movaps 24 * SIZE(X), %xmm10 + + andps %xmm15, %xmm11 + addps %xmm11, %xmm3 + movaps 28 * SIZE(X), %xmm11 + + subq $-32 * SIZE, X + decq I + jg .L11 + ALIGN_3 + +.L12: + andps %xmm15, %xmm4 + addps %xmm4, %xmm0 + andps %xmm15, %xmm5 + addps %xmm5, %xmm1 + + andps %xmm15, %xmm6 + addps %xmm6, %xmm2 + andps %xmm15, %xmm7 + addps %xmm7, %xmm3 + + andps %xmm15, %xmm8 + addps %xmm8, %xmm0 + andps %xmm15, %xmm9 + addps %xmm9, %xmm1 + + andps %xmm15, %xmm10 + addps %xmm10, %xmm2 + andps %xmm15, %xmm11 + addps %xmm11, %xmm3 + + addq $32 * SIZE, X + ALIGN_3 + +.L14: + testq $31, M + jle .L998 + +.L15: + testq $16, M + je .L16 + + movaps -32 * SIZE(X), %xmm4 + andps %xmm15, %xmm4 + addps %xmm4, %xmm0 + + movaps -28 * SIZE(X), %xmm5 + andps %xmm15, %xmm5 + addps %xmm5, %xmm1 + + movaps -24 * SIZE(X), %xmm4 + andps %xmm15, %xmm4 + addps %xmm4, %xmm0 + + movaps -20 * SIZE(X), %xmm5 + andps %xmm15, %xmm5 + addps %xmm5, %xmm1 + + addq $16 * SIZE, X + ALIGN_3 + +.L16: + testq $8, M + je .L17 + + movaps -32 * SIZE(X), %xmm4 + andps %xmm15, %xmm4 + addps %xmm4, %xmm0 + + movaps -28 * SIZE(X), %xmm5 + andps %xmm15, %xmm5 + addps %xmm5, %xmm1 + + addq $8 * SIZE, X + ALIGN_3 + +.L17: + testq $4, M + je .L18 + + movaps -32 * SIZE(X), %xmm6 + andps %xmm15, %xmm6 + addps %xmm6, %xmm2 + addq $4 * SIZE, X + ALIGN_3 + +.L18: + testq $2, M + je .L19 + +#ifdef movsd + xorps %xmm7, %xmm7 +#endif + movsd -32 * SIZE(X), %xmm7 + andps %xmm15, %xmm7 + addps %xmm7, %xmm3 + addq $2 * SIZE, X + ALIGN_3 + +.L19: + testq $1, M + je .L998 + + movss -32 * SIZE(X), %xmm6 + andps %xmm15, %xmm6 + addps %xmm6, %xmm2 + jmp .L998 + ALIGN_4 + +.L100: + movq M, I + sarq $2, I + jle .L105 + ALIGN_4 + +.L101: + movsd (X), %xmm4 + addq INCX, X + movhps (X), %xmm4 + addq INCX, X + + andps %xmm15, %xmm4 + addps %xmm4, %xmm0 + + movsd (X), %xmm5 + addq INCX, X + movhps (X), %xmm5 + addq INCX, X + + andps %xmm15, %xmm5 + addps %xmm5, %xmm1 + + decq I + jg .L101 + ALIGN_4 + +.L105: +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + andq $3, M + jle .L998 + ALIGN_4 + +.L106: + movsd (X), %xmm4 + andps %xmm15, %xmm4 + addps %xmm4, %xmm0 + addq INCX, X + decq M + jg .L106 + ALIGN_4 + +.L998: + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + +#ifndef HAVE_SSE3 + movhlps %xmm0, %xmm1 + addps %xmm1, %xmm0 + + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + addss %xmm1, %xmm0 +#else + haddps %xmm0, %xmm0 + haddps %xmm0, %xmm0 +#endif + ALIGN_4 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/zasum_sse2.S b/kernel/x86_64/zasum_sse2.S new file mode 100644 index 0000000..9d0ec2e --- /dev/null +++ b/kernel/x86_64/zasum_sse2.S @@ -0,0 +1,318 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + xorps %xmm0, %xmm0 + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + pcmpeqb %xmm15, %xmm15 + psrlq $1, %xmm15 + + salq $ZBASE_SHIFT, INCX + + cmpq $2 * SIZE, INCX + jne .L40 + + subq $-16 * SIZE, X + addq M, M + + testq $SIZE, X + je .L05 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -16 * SIZE(X), %xmm0 + addq $SIZE, X + + andps %xmm15, %xmm0 + subq $1, M + jle .L999 + ALIGN_3 + +.L05: + movq M, I + sarq $4, I + jle .L20 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + movaps -8 * SIZE(X), %xmm8 + movaps -6 * SIZE(X), %xmm9 + movaps -4 * SIZE(X), %xmm10 + movaps -2 * SIZE(X), %xmm11 + + decq I + jle .L11 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + andps %xmm15, %xmm4 + addpd %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + andps %xmm15, %xmm5 + addpd %xmm5, %xmm1 + movaps 2 * SIZE(X), %xmm5 + + andps %xmm15, %xmm6 + addpd %xmm6, %xmm2 + movaps 4 * SIZE(X), %xmm6 + + andps %xmm15, %xmm7 + addpd %xmm7, %xmm3 + movaps 6 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + andps %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movaps 8 * SIZE(X), %xmm8 + + andps %xmm15, %xmm9 + addpd %xmm9, %xmm1 + movaps 10 * SIZE(X), %xmm9 + + andps %xmm15, %xmm10 + addpd %xmm10, %xmm2 + movaps 12 * SIZE(X), %xmm10 + + andps %xmm15, %xmm11 + addpd %xmm11, %xmm3 + movaps 14 * SIZE(X), %xmm11 + + subq $-16 * SIZE, X + decq I + jg .L10 + ALIGN_4 + +.L11: + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + addpd %xmm6, %xmm2 + addpd %xmm7, %xmm3 + + andps %xmm15, %xmm8 + andps %xmm15, %xmm9 + andps %xmm15, %xmm10 + andps %xmm15, %xmm11 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm1 + addpd %xmm10, %xmm2 + addpd %xmm11, %xmm3 + + subq $-16 * SIZE, X + ALIGN_3 + +.L20: + andq $15, M + jle .L998 + + testq $8, M + je .L21 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + addpd %xmm6, %xmm2 + addpd %xmm7, %xmm3 + addq $8 * SIZE, X + ALIGN_3 + +.L21: + testq $4, M + je .L22 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + + addq $4 * SIZE, X + ALIGN_3 + +.L22: + testq $2, M + je .L23 + + movaps -16 * SIZE(X), %xmm6 + andps %xmm15, %xmm6 + addpd %xmm6, %xmm3 + addq $2 * SIZE, X + +.L23: + testq $1, M + je .L998 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -16 * SIZE(X), %xmm4 + andps %xmm15, %xmm4 + addsd %xmm4, %xmm0 + jmp .L998 + ALIGN_3 + + +.L40: + movq M, I + sarq $2, I + jle .L60 + ALIGN_4 + +.L50: +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + prefetcht0 PREFETCHSIZE * SIZE(X) +#endif + +#ifdef PENTIUM4 + prefetchnta PREFETCHSIZE * SIZE(X) +#endif + + movsd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm4 + addq INCX, X + andpd %xmm15, %xmm4 + addpd %xmm4, %xmm0 + + movsd 0 * SIZE(X), %xmm5 + movhpd 1 * SIZE(X), %xmm5 + addq INCX, X + andpd %xmm15, %xmm5 + addpd %xmm5, %xmm1 + + movsd 0 * SIZE(X), %xmm6 + movhpd 1 * SIZE(X), %xmm6 + addq INCX, X + andpd %xmm15, %xmm6 + addpd %xmm6, %xmm2 + + movsd 0 * SIZE(X), %xmm7 + movhpd 1 * SIZE(X), %xmm7 + addq INCX, X + andpd %xmm15, %xmm7 + addpd %xmm7, %xmm3 + + decq I + jg .L50 + ALIGN_4 + +.L60: + andq $3, M + jle .L998 + ALIGN_4 + + +.L61: + movsd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm4 + andpd %xmm15, %xmm4 + addpd %xmm4, %xmm0 + addq INCX, X + decq M + jg .L61 + ALIGN_4 + +.L998: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#ifndef HAVE_SSE3 + movhlps %xmm0, %xmm1 + addsd %xmm1, %xmm0 +#else + haddpd %xmm0, %xmm0 +#endif + ALIGN_4 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/zaxpy.S b/kernel/x86_64/zaxpy.S new file mode 100644 index 0000000..266c147 --- /dev/null +++ b/kernel/x86_64/zaxpy.S @@ -0,0 +1,336 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG4 /* rsi */ +#define INCX ARG5 /* rdx */ +#define Y ARG6 /* rcx */ +#define INCY ARG2 /* r8 */ + +#ifndef CONJ +#define ADD1 fsubrp +#define ADD2 faddp +#else +#define ADD1 faddp +#define ADD2 fsubrp +#endif + +#define ALPHA_R 8(%rsp) +#define ALPHA_I 24(%rsp) + +#include "l1param.h" + + PROLOGUE + PROFCODE + + FLD ALPHA_I + FLD ALPHA_R + + movq 40(%rsp), INCY + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + testq M, M + jle .L40 + + cmpq $2 * SIZE, INCX + jne .L14 + cmpq $2 * SIZE, INCY + jne .L14 + + movq M, %rax + sarq $2, %rax + jle .L15 + ALIGN_3 + +.L16: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + FLD 2 * SIZE(X) + fmul %st(1), %st + FLD 3 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 2 * SIZE(Y) + faddp %st, %st(1) + FST 2 * SIZE(Y) + + FLD 2 * SIZE(X) + fmul %st(2), %st + FLD 3 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 3 * SIZE(Y) + faddp %st, %st(1) + FST 3 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + FLD 4 * SIZE(X) + fmul %st(1), %st + FLD 5 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 4 * SIZE(Y) + faddp %st, %st(1) + FST 4 * SIZE(Y) + + FLD 4 * SIZE(X) + fmul %st(2), %st + FLD 5 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 5 * SIZE(Y) + faddp %st, %st(1) + FST 5 * SIZE(Y) + + FLD 6 * SIZE(X) + fmul %st(1), %st + FLD 7 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 6 * SIZE(Y) + faddp %st, %st(1) + FST 6 * SIZE(Y) + + FLD 6 * SIZE(X) + fmul %st(2), %st + FLD 7 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 7 * SIZE(Y) + faddp %st, %st(1) + FST 7 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + decq %rax + jg .L16 + ALIGN_3 + +.L15: + movq M, %rax + andq $3, %rax + jle .L40 + ALIGN_3 + +.L22: + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + decq %rax + jg .L22 + jmp .L40 + ALIGN_3 + +.L14: + movq M, %rax + sarq $2, %rax + jle .L28 + ALIGN_3 + +.L29: + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + decq %rax + jg .L29 + ALIGN_3 + +.L28: + movq M, %rax + andq $3, %rax + jle .L40 + ALIGN_3 + +.L35: + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + decq %rax + jg .L35 + ALIGN_3 + +.L40: + ffreep %st(0) + ffreep %st(0) + ret + + EPILOGUE diff --git a/kernel/x86_64/zaxpy_atom.S b/kernel/x86_64/zaxpy_atom.S new file mode 100644 index 0000000..e623326 --- /dev/null +++ b/kernel/x86_64/zaxpy_atom.S @@ -0,0 +1,675 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#define Y ARG6 +#define INCY ARG2 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#define INCY %r10 +#endif + +#define YY %r11 +#define ALPHA_R %xmm14 +#define ALPHA_I %xmm15 + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifndef WINDOWS_ABI +#ifndef XDOUBLE + movq 8(%rsp), INCY +#else + movq 40(%rsp), INCY +#endif +#else + movaps %xmm3, %xmm0 + movsd 40(%rsp), %xmm1 + + movq 48(%rsp), X + movq 56(%rsp), INCX + movq 64(%rsp), Y + movq 72(%rsp), INCY +#endif + + SAVEREGISTERS + +#ifndef CONJ +#define ADD1 subsd +#define ADD2 addsd +#else +#define ADD1 addsd +#define ADD2 subsd +#endif + + salq $ZBASE_SHIFT, INCX + movaps %xmm0, ALPHA_R + salq $ZBASE_SHIFT, INCY + movaps %xmm1, ALPHA_I + + testq M, M + jle .L999 + + cmpq $2 * SIZE, INCX + jne .L20 + cmpq $2 * SIZE, INCY + jne .L20 + + movq M, %rax + sarq $2, %rax + jle .L15 + + movsd 0 * SIZE(X), %xmm0 + movsd 1 * SIZE(X), %xmm1 + movsd 0 * SIZE(Y), %xmm8 + movsd 1 * SIZE(Y), %xmm9 + + movsd 2 * SIZE(X), %xmm4 + movsd 3 * SIZE(X), %xmm5 + movsd 2 * SIZE(Y), %xmm10 + movsd 3 * SIZE(Y), %xmm11 + + movaps %xmm0, %xmm2 + mulsd ALPHA_R, %xmm0 + movaps %xmm1, %xmm3 + mulsd ALPHA_R, %xmm1 + mulsd ALPHA_I, %xmm3 + mulsd ALPHA_I, %xmm2 + + movaps %xmm4, %xmm6 + mulsd ALPHA_R, %xmm4 + addsd %xmm0, %xmm8 + movsd 4 * SIZE(X), %xmm0 + + movaps %xmm5, %xmm7 + mulsd ALPHA_R, %xmm5 + ADD2 %xmm1, %xmm9 + movsd 5 * SIZE(X), %xmm1 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulsd ALPHA_I, %xmm7 + movsd 4 * SIZE(Y), %xmm12 + ADD1 %xmm3, %xmm8 + + mulsd ALPHA_I, %xmm6 + movsd 5 * SIZE(Y), %xmm13 + addsd %xmm2, %xmm9 + + addsd %xmm4, %xmm10 + movsd 6 * SIZE(X), %xmm4 + movaps %xmm0, %xmm2 + mulsd ALPHA_R, %xmm0 + + ADD2 %xmm5, %xmm11 + movsd 7 * SIZE(X), %xmm5 + movaps %xmm1, %xmm3 + mulsd ALPHA_R, %xmm1 + + ADD1 %xmm7, %xmm10 + movsd %xmm8, 0 * SIZE(Y) + mulsd ALPHA_I, %xmm3 + + addsd %xmm6, %xmm11 + movsd %xmm9, 1 * SIZE(Y) + mulsd ALPHA_I, %xmm2 + + movaps %xmm4, %xmm6 + movsd %xmm10, 2 * SIZE(Y) + mulsd ALPHA_R, %xmm4 + movsd 6 * SIZE(Y), %xmm10 + addsd %xmm0, %xmm12 + movsd 8 * SIZE(X), %xmm0 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps %xmm5, %xmm7 + movsd %xmm11, 3 * SIZE(Y) + mulsd ALPHA_R, %xmm5 + movsd 7 * SIZE(Y), %xmm11 + ADD2 %xmm1, %xmm13 + movsd 9 * SIZE(X), %xmm1 + + mulsd ALPHA_I, %xmm7 + movsd 8 * SIZE(Y), %xmm8 + ADD1 %xmm3, %xmm12 + + mulsd ALPHA_I, %xmm6 + movsd 9 * SIZE(Y), %xmm9 + addsd %xmm2, %xmm13 + + movaps %xmm0, %xmm2 + mulsd ALPHA_R, %xmm0 + addsd %xmm4, %xmm10 + movsd 10 * SIZE(X), %xmm4 + + movaps %xmm1, %xmm3 + mulsd ALPHA_R, %xmm1 + ADD2 %xmm5, %xmm11 + movsd 11 * SIZE(X), %xmm5 + + mulsd ALPHA_I, %xmm3 + movsd %xmm12, 4 * SIZE(Y) + ADD1 %xmm7, %xmm10 + + mulsd ALPHA_I, %xmm2 + movsd %xmm13, 5 * SIZE(Y) + addsd %xmm6, %xmm11 + + movaps %xmm4, %xmm6 + movsd %xmm10, 6 * SIZE(Y) + mulsd ALPHA_R, %xmm4 + addsd %xmm0, %xmm8 + movsd 10 * SIZE(Y), %xmm10 + movsd 12 * SIZE(X), %xmm0 + + movaps %xmm5, %xmm7 + movsd %xmm11, 7 * SIZE(Y) + mulsd ALPHA_R, %xmm5 + movsd 11 * SIZE(Y), %xmm11 + ADD2 %xmm1, %xmm9 + movsd 13 * SIZE(X), %xmm1 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + decq %rax + jg .L11 + ALIGN_3 + +.L12: + mulsd ALPHA_I, %xmm7 + movsd 4 * SIZE(Y), %xmm12 + ADD1 %xmm3, %xmm8 + + mulsd ALPHA_I, %xmm6 + movsd 5 * SIZE(Y), %xmm13 + addsd %xmm2, %xmm9 + + addsd %xmm4, %xmm10 + movsd 6 * SIZE(X), %xmm4 + movaps %xmm0, %xmm2 + mulsd ALPHA_R, %xmm0 + + ADD2 %xmm5, %xmm11 + movsd 7 * SIZE(X), %xmm5 + movaps %xmm1, %xmm3 + mulsd ALPHA_R, %xmm1 + + ADD1 %xmm7, %xmm10 + movsd %xmm8, 0 * SIZE(Y) + mulsd ALPHA_I, %xmm3 + + addsd %xmm6, %xmm11 + movsd %xmm9, 1 * SIZE(Y) + mulsd ALPHA_I, %xmm2 + + movaps %xmm4, %xmm6 + movsd %xmm10, 2 * SIZE(Y) + mulsd ALPHA_R, %xmm4 + movsd 6 * SIZE(Y), %xmm10 + addsd %xmm0, %xmm12 + + movaps %xmm5, %xmm7 + movsd %xmm11, 3 * SIZE(Y) + mulsd ALPHA_R, %xmm5 + ADD2 %xmm1, %xmm13 + movsd 7 * SIZE(Y), %xmm11 + + mulsd ALPHA_I, %xmm7 + ADD1 %xmm3, %xmm12 + + mulsd ALPHA_I, %xmm6 + addsd %xmm2, %xmm13 + + movaps %xmm0, %xmm2 + mulsd ALPHA_R, %xmm0 + addsd %xmm4, %xmm10 + + movaps %xmm1, %xmm3 + mulsd ALPHA_R, %xmm1 + ADD2 %xmm5, %xmm11 + + mulsd ALPHA_I, %xmm3 + ADD1 %xmm7, %xmm10 + + addsd %xmm6, %xmm11 + mulsd ALPHA_I, %xmm2 + + movsd %xmm12, 4 * SIZE(Y) + movsd %xmm13, 5 * SIZE(Y) + movsd %xmm10, 6 * SIZE(Y) + movsd %xmm11, 7 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L15: + movq M, %rax + andq $2, %rax + jle .L17 + + movsd 0 * SIZE(X), %xmm0 + movsd 1 * SIZE(X), %xmm1 + movsd 2 * SIZE(X), %xmm4 + movsd 3 * SIZE(X), %xmm5 + + movaps %xmm0, %xmm2 + movsd 0 * SIZE(Y), %xmm8 + mulsd ALPHA_R, %xmm0 + movaps %xmm1, %xmm3 + movsd 1 * SIZE(Y), %xmm9 + mulsd ALPHA_R, %xmm1 + movsd 2 * SIZE(Y), %xmm10 + mulsd ALPHA_I, %xmm3 + movsd 3 * SIZE(Y), %xmm11 + mulsd ALPHA_I, %xmm2 + + movaps %xmm4, %xmm6 + mulsd ALPHA_R, %xmm4 + addsd %xmm0, %xmm8 + + movaps %xmm5, %xmm7 + mulsd ALPHA_R, %xmm5 + ADD2 %xmm1, %xmm9 + + mulsd ALPHA_I, %xmm7 + ADD1 %xmm3, %xmm8 + + mulsd ALPHA_I, %xmm6 + addsd %xmm2, %xmm9 + + addsd %xmm4, %xmm10 + movsd %xmm8, 0 * SIZE(Y) + ADD2 %xmm5, %xmm11 + movsd %xmm9, 1 * SIZE(Y) + ADD1 %xmm7, %xmm10 + addsd %xmm6, %xmm11 + + movsd %xmm10, 2 * SIZE(Y) + movsd %xmm11, 3 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L17: + movq M, %rax + andq $1, %rax + jle .L999 + + movsd 0 * SIZE(X), %xmm0 + movsd 1 * SIZE(X), %xmm1 + movsd 0 * SIZE(Y), %xmm8 + movsd 1 * SIZE(Y), %xmm9 + + movaps %xmm0, %xmm2 + mulsd ALPHA_R, %xmm0 + movaps %xmm1, %xmm3 + mulsd ALPHA_R, %xmm1 + mulsd ALPHA_I, %xmm3 + mulsd ALPHA_I, %xmm2 + + addsd %xmm0, %xmm8 + ADD2 %xmm1, %xmm9 + ADD1 %xmm3, %xmm8 + addsd %xmm2, %xmm9 + + movsd %xmm8, 0 * SIZE(Y) + movsd %xmm9, 1 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L20: + movq Y, YY + + movq M, %rax + sarq $2, %rax + jle .L25 + + movsd 0 * SIZE(X), %xmm0 + movsd 1 * SIZE(X), %xmm1 + addq INCX, X + movsd 0 * SIZE(Y), %xmm8 + movsd 1 * SIZE(Y), %xmm9 + addq INCY, Y + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + movsd 0 * SIZE(Y), %xmm10 + movsd 1 * SIZE(Y), %xmm11 + addq INCY, Y + + movaps %xmm0, %xmm2 + mulsd ALPHA_R, %xmm0 + movaps %xmm1, %xmm3 + mulsd ALPHA_R, %xmm1 + mulsd ALPHA_I, %xmm3 + mulsd ALPHA_I, %xmm2 + + movaps %xmm4, %xmm6 + mulsd ALPHA_R, %xmm4 + addsd %xmm0, %xmm8 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm5, %xmm7 + mulsd ALPHA_R, %xmm5 + ADD2 %xmm1, %xmm9 + movsd 1 * SIZE(X), %xmm1 + addq INCX, X + + decq %rax + jle .L22 + ALIGN_3 + +.L21: + mulsd ALPHA_I, %xmm7 + movsd 0 * SIZE(Y), %xmm12 + ADD1 %xmm3, %xmm8 + + mulsd ALPHA_I, %xmm6 + movsd 1 * SIZE(Y), %xmm13 + addsd %xmm2, %xmm9 + addq INCY, Y + + addsd %xmm4, %xmm10 + movsd 0 * SIZE(X), %xmm4 + movaps %xmm0, %xmm2 + mulsd ALPHA_R, %xmm0 + + ADD2 %xmm5, %xmm11 + movsd 1 * SIZE(X), %xmm5 + movaps %xmm1, %xmm3 + addq INCX, X + mulsd ALPHA_R, %xmm1 + + ADD1 %xmm7, %xmm10 + movsd %xmm8, 0 * SIZE(YY) + mulsd ALPHA_I, %xmm3 + + addsd %xmm6, %xmm11 + movsd %xmm9, 1 * SIZE(YY) + mulsd ALPHA_I, %xmm2 + addq INCY, YY + + movaps %xmm4, %xmm6 + movsd %xmm10, 0 * SIZE(YY) + mulsd ALPHA_R, %xmm4 + movsd 0 * SIZE(Y), %xmm10 + addsd %xmm0, %xmm12 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm5, %xmm7 + movsd %xmm11, 1 * SIZE(YY) + addq INCY, YY + mulsd ALPHA_R, %xmm5 + movsd 1 * SIZE(Y), %xmm11 + addq INCY, Y + ADD2 %xmm1, %xmm13 + movsd 1 * SIZE(X), %xmm1 + addq INCX, X + + mulsd ALPHA_I, %xmm7 + movsd 0 * SIZE(Y), %xmm8 + ADD1 %xmm3, %xmm12 + + mulsd ALPHA_I, %xmm6 + movsd 1 * SIZE(Y), %xmm9 + addsd %xmm2, %xmm13 + addq INCY, Y + + movaps %xmm0, %xmm2 + mulsd ALPHA_R, %xmm0 + addsd %xmm4, %xmm10 + movsd 0 * SIZE(X), %xmm4 + + movaps %xmm1, %xmm3 + mulsd ALPHA_R, %xmm1 + ADD2 %xmm5, %xmm11 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + + mulsd ALPHA_I, %xmm3 + movsd %xmm12, 0 * SIZE(YY) + ADD1 %xmm7, %xmm10 + + mulsd ALPHA_I, %xmm2 + movsd %xmm13, 1 * SIZE(YY) + addsd %xmm6, %xmm11 + addq INCY, YY + + movaps %xmm4, %xmm6 + movsd %xmm10, 0 * SIZE(YY) + mulsd ALPHA_R, %xmm4 + addsd %xmm0, %xmm8 + movsd 0 * SIZE(Y), %xmm10 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm5, %xmm7 + movsd %xmm11, 1 * SIZE(YY) + addq INCY, YY + mulsd ALPHA_R, %xmm5 + movsd 1 * SIZE(Y), %xmm11 + addq INCY, Y + ADD2 %xmm1, %xmm9 + movsd 1 * SIZE(X), %xmm1 + addq INCX, X + + decq %rax + jg .L21 + ALIGN_3 + +.L22: + mulsd ALPHA_I, %xmm7 + movsd 0 * SIZE(Y), %xmm12 + ADD1 %xmm3, %xmm8 + + mulsd ALPHA_I, %xmm6 + movsd 1 * SIZE(Y), %xmm13 + addsd %xmm2, %xmm9 + addq INCY, Y + + addsd %xmm4, %xmm10 + movsd 0 * SIZE(X), %xmm4 + movaps %xmm0, %xmm2 + mulsd ALPHA_R, %xmm0 + + ADD2 %xmm5, %xmm11 + movsd 1 * SIZE(X), %xmm5 + movaps %xmm1, %xmm3 + addq INCX, X + mulsd ALPHA_R, %xmm1 + + ADD1 %xmm7, %xmm10 + movsd %xmm8, 0 * SIZE(YY) + mulsd ALPHA_I, %xmm3 + + addsd %xmm6, %xmm11 + movsd %xmm9, 1 * SIZE(YY) + mulsd ALPHA_I, %xmm2 + addq INCY, YY + + movaps %xmm4, %xmm6 + movsd %xmm10, 0 * SIZE(YY) + mulsd ALPHA_R, %xmm4 + movsd 0 * SIZE(Y), %xmm10 + addsd %xmm0, %xmm12 + + movaps %xmm5, %xmm7 + movsd %xmm11, 1 * SIZE(YY) + mulsd ALPHA_R, %xmm5 + addq INCY, YY + ADD2 %xmm1, %xmm13 + movsd 1 * SIZE(Y), %xmm11 + + mulsd ALPHA_I, %xmm7 + addq INCY, Y + ADD1 %xmm3, %xmm12 + + mulsd ALPHA_I, %xmm6 + addsd %xmm2, %xmm13 + + movaps %xmm0, %xmm2 + mulsd ALPHA_R, %xmm0 + addsd %xmm4, %xmm10 + + movaps %xmm1, %xmm3 + mulsd ALPHA_R, %xmm1 + ADD2 %xmm5, %xmm11 + + mulsd ALPHA_I, %xmm3 + ADD1 %xmm7, %xmm10 + + addsd %xmm6, %xmm11 + mulsd ALPHA_I, %xmm2 + + movsd %xmm12, 0 * SIZE(YY) + movsd %xmm13, 1 * SIZE(YY) + addq INCY, YY + movsd %xmm10, 0 * SIZE(YY) + movsd %xmm11, 1 * SIZE(YY) + addq INCY, YY + ALIGN_3 + +.L25: + movq M, %rax + andq $2, %rax + jle .L27 + + movsd 0 * SIZE(X), %xmm0 + movsd 1 * SIZE(X), %xmm1 + addq INCX, X + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + + movaps %xmm0, %xmm2 + movsd 0 * SIZE(Y), %xmm8 + mulsd ALPHA_R, %xmm0 + movaps %xmm1, %xmm3 + movsd 1 * SIZE(Y), %xmm9 + addq INCY, Y + mulsd ALPHA_R, %xmm1 + movsd 0 * SIZE(Y), %xmm10 + mulsd ALPHA_I, %xmm3 + movsd 1 * SIZE(Y), %xmm11 + mulsd ALPHA_I, %xmm2 + addq INCY, Y + + movaps %xmm4, %xmm6 + mulsd ALPHA_R, %xmm4 + addsd %xmm0, %xmm8 + + movaps %xmm5, %xmm7 + mulsd ALPHA_R, %xmm5 + ADD2 %xmm1, %xmm9 + + mulsd ALPHA_I, %xmm7 + ADD1 %xmm3, %xmm8 + + mulsd ALPHA_I, %xmm6 + addsd %xmm2, %xmm9 + + addsd %xmm4, %xmm10 + movsd %xmm8, 0 * SIZE(YY) + ADD2 %xmm5, %xmm11 + movsd %xmm9, 1 * SIZE(YY) + ADD1 %xmm7, %xmm10 + addq INCY, YY + addsd %xmm6, %xmm11 + + movsd %xmm10, 0 * SIZE(YY) + movsd %xmm11, 1 * SIZE(YY) + addq INCY, YY + ALIGN_3 + +.L27: + movq M, %rax + andq $1, %rax + jle .L999 + + movsd 0 * SIZE(X), %xmm0 + movsd 1 * SIZE(X), %xmm1 + movsd 0 * SIZE(Y), %xmm8 + movsd 1 * SIZE(Y), %xmm9 + + movaps %xmm0, %xmm2 + mulsd ALPHA_R, %xmm0 + movaps %xmm1, %xmm3 + mulsd ALPHA_R, %xmm1 + mulsd ALPHA_I, %xmm3 + mulsd ALPHA_I, %xmm2 + + addsd %xmm0, %xmm8 + ADD2 %xmm1, %xmm9 + ADD1 %xmm3, %xmm8 + addsd %xmm2, %xmm9 + + movsd %xmm8, 0 * SIZE(YY) + movsd %xmm9, 1 * SIZE(YY) + ALIGN_3 + +.L999: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/zaxpy_sse.S b/kernel/x86_64/zaxpy_sse.S new file mode 100644 index 0000000..69cdeda --- /dev/null +++ b/kernel/x86_64/zaxpy_sse.S @@ -0,0 +1,3118 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#define Y ARG6 +#define INCY ARG2 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#define INCY %r10 +#endif + +#define YY %r11 + +#define ALPHA_R %xmm14 +#define ALPHA_I %xmm15 + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifndef WINDOWS_ABI + movq 8(%rsp), INCY +#else + movaps %xmm3, %xmm0 + movss 40(%rsp), %xmm1 + + movq 48(%rsp), X + movq 56(%rsp), INCX + movq 64(%rsp), Y + movq 72(%rsp), INCY +#endif + + SAVEREGISTERS + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + testq M, M + jle .L999 + + cmpq $2 * SIZE, INCX + jne .L100 + cmpq $2 * SIZE, INCY + jne .L100 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + + pshufd $0, %xmm0, ALPHA_R + pshufd $0, %xmm1, ALPHA_I + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 + xorpd %xmm7, ALPHA_I +#else + xorpd %xmm7, ALPHA_R +#endif + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + testq $2 * SIZE, Y + je .L10 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm1 + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + + addps %xmm8, %xmm0 + addps %xmm1, %xmm0 + + movlps %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + decq M + jle .L999 + ALIGN_2 + +.L10: + testq $SIZE, Y + jne .L50 + + testq $3 * SIZE, X + jne .L20 + + movq M, %rax + sarq $4, %rax + jle .L15 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: + movaps -16 * SIZE(X), %xmm4 + movaps -12 * SIZE(X), %xmm5 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -8 * SIZE(X), %xmm6 + movaps -4 * SIZE(X), %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps -24 * SIZE(Y), %xmm2 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps -20 * SIZE(Y), %xmm3 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps 0 * SIZE(X), %xmm0 + movaps 4 * SIZE(X), %xmm1 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps -16 * SIZE(Y), %xmm4 + addps %xmm8, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps -12 * SIZE(Y), %xmm5 + addps %xmm8, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movaps 8 * SIZE(X), %xmm2 + movaps 12 * SIZE(X), %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps -8 * SIZE(Y), %xmm6 + addps %xmm8, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps -4 * SIZE(Y), %xmm7 + addps %xmm8, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L11 + ALIGN_3 + +.L12: + movaps -16 * SIZE(X), %xmm4 + movaps -12 * SIZE(X), %xmm5 + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -8 * SIZE(X), %xmm6 + movaps -4 * SIZE(X), %xmm7 + + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps -24 * SIZE(Y), %xmm2 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps -20 * SIZE(Y), %xmm3 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps -16 * SIZE(Y), %xmm4 + addps %xmm8, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps -12 * SIZE(Y), %xmm5 + addps %xmm8, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps -8 * SIZE(Y), %xmm6 + addps %xmm8, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps -4 * SIZE(Y), %xmm7 + addps %xmm8, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L15: + testq $8, M + jle .L16 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps -24 * SIZE(Y), %xmm2 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps -20 * SIZE(Y), %xmm3 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_2 + +.L16: + testq $4, M + jle .L17 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_2 + +.L17: + testq $2, M + jle .L18 + + movaps -32 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_2 + +.L18: + testq $1, M + jle .L999 + + movsd -32 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + + movsd -32 * SIZE(Y), %xmm1 + addps %xmm1, %xmm0 + addps %xmm8, %xmm0 + movlps %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L20: +#ifdef ALIGNED_ACCESS + + testq $2 * SIZE, X + jne .L30 + + subq $1 * SIZE, X + + movaps -32 * SIZE(X), %xmm0 + + movq M, %rax + sarq $4, %rax + jle .L25 + + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + movaps -16 * SIZE(X), %xmm4 + + decq %rax + jle .L22 + ALIGN_3 + +.L21: + movaps -12 * SIZE(X), %xmm5 + movaps -8 * SIZE(X), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -4 * SIZE(X), %xmm7 + movaps 0 * SIZE(X), %xmm0 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps -24 * SIZE(Y), %xmm2 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps -20 * SIZE(Y), %xmm3 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps 4 * SIZE(X), %xmm1 + movaps 8 * SIZE(X), %xmm2 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm5, %xmm4 + SHUFPS_39 %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps -16 * SIZE(Y), %xmm4 + addps %xmm8, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + SHUFPS_39 %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps -12 * SIZE(Y), %xmm5 + addps %xmm8, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movaps 12 * SIZE(X), %xmm3 + movaps 16 * SIZE(X), %xmm4 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + SHUFPS_39 %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps -8 * SIZE(Y), %xmm6 + addps %xmm8, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + SHUFPS_39 %xmm7, %xmm7 + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps -4 * SIZE(Y), %xmm7 + addps %xmm8, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L21 + ALIGN_3 + +.L22: + movaps -12 * SIZE(X), %xmm5 + movaps -8 * SIZE(X), %xmm6 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -4 * SIZE(X), %xmm7 + movaps 0 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps -24 * SIZE(Y), %xmm2 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps -20 * SIZE(Y), %xmm3 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movss %xmm5, %xmm4 + SHUFPS_39 %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps -16 * SIZE(Y), %xmm4 + addps %xmm8, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + SHUFPS_39 %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps -12 * SIZE(Y), %xmm5 + addps %xmm8, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movss %xmm7, %xmm6 + SHUFPS_39 %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps -8 * SIZE(Y), %xmm6 + addps %xmm8, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + SHUFPS_39 %xmm7, %xmm7 + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps -4 * SIZE(Y), %xmm7 + addps %xmm8, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L25: + testq $8, M + jle .L26 + + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -20 * SIZE(X), %xmm3 + movaps -16 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps -24 * SIZE(Y), %xmm2 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps -20 * SIZE(Y), %xmm3 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_2 + +.L26: + testq $4, M + jle .L27 + + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_2 + +.L27: + testq $2, M + jle .L28 + + movaps -28 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movaps %xmm1, %xmm0 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_2 + +.L28: + testq $1, M + jle .L999 + + pshufd $0x06, %xmm0, %xmm8 + pshufd $0x09, %xmm0, %xmm0 + + mulps ALPHA_I, %xmm8 + mulps ALPHA_R, %xmm0 + + addps -32 * SIZE(Y), %xmm8 + addps %xmm8, %xmm0 + + movlps %xmm0, -32 * SIZE(Y) + + jmp .L999 + ALIGN_3 + +.L30: + testq $1 * SIZE, X + jne .L40 +#endif + + movq M, %rax + sarq $4, %rax + jle .L35 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + movsd -24 * SIZE(X), %xmm2 + movhps -22 * SIZE(X), %xmm2 + movsd -20 * SIZE(X), %xmm3 + movhps -18 * SIZE(X), %xmm3 + + decq %rax + jle .L32 + ALIGN_3 + +.L31: + movsd -16 * SIZE(X), %xmm4 + movhps -14 * SIZE(X), %xmm4 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movsd -12 * SIZE(X), %xmm5 + movhps -10 * SIZE(X), %xmm5 + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movsd -8 * SIZE(X), %xmm6 + movhps -6 * SIZE(X), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps -24 * SIZE(Y), %xmm2 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movsd -4 * SIZE(X), %xmm7 + movhps -2 * SIZE(X), %xmm7 + + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps -20 * SIZE(Y), %xmm3 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movsd 0 * SIZE(X), %xmm0 + movhps 2 * SIZE(X), %xmm0 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps -16 * SIZE(Y), %xmm4 + addps %xmm8, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movsd 4 * SIZE(X), %xmm1 + movhps 6 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps -12 * SIZE(Y), %xmm5 + addps %xmm8, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movsd 8 * SIZE(X), %xmm2 + movhps 10 * SIZE(X), %xmm2 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps -8 * SIZE(Y), %xmm6 + addps %xmm8, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movsd 12 * SIZE(X), %xmm3 + movhps 14 * SIZE(X), %xmm3 + + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps -4 * SIZE(Y), %xmm7 + addps %xmm8, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L31 + ALIGN_3 + +.L32: + movsd -16 * SIZE(X), %xmm4 + movhps -14 * SIZE(X), %xmm4 + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movsd -12 * SIZE(X), %xmm5 + movhps -10 * SIZE(X), %xmm5 + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movsd -8 * SIZE(X), %xmm6 + movhps -6 * SIZE(X), %xmm6 + + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps -24 * SIZE(Y), %xmm2 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movsd -4 * SIZE(X), %xmm7 + movhps -2 * SIZE(X), %xmm7 + + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps -20 * SIZE(Y), %xmm3 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps -16 * SIZE(Y), %xmm4 + addps %xmm8, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps -12 * SIZE(Y), %xmm5 + addps %xmm8, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps -8 * SIZE(Y), %xmm6 + addps %xmm8, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps -4 * SIZE(Y), %xmm7 + addps %xmm8, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L35: + testq $8, M + jle .L36 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movsd -24 * SIZE(X), %xmm2 + movhps -22 * SIZE(X), %xmm2 + movsd -20 * SIZE(X), %xmm3 + movhps -18 * SIZE(X), %xmm3 + + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps -24 * SIZE(Y), %xmm2 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps -20 * SIZE(Y), %xmm3 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_2 + +.L36: + testq $4, M + jle .L37 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_2 + +.L37: + testq $2, M + jle .L38 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_2 + +.L38: + testq $1, M + jle .L999 + + movsd -32 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + + movsd -32 * SIZE(Y), %xmm1 + + addps %xmm1, %xmm0 + addps %xmm8, %xmm0 + movlps %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 + +#ifdef ALIGNED_ACCESS + +.L40: + subq $3 * SIZE, X + + movaps -32 * SIZE(X), %xmm0 + + movq M, %rax + sarq $4, %rax + jle .L45 + + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + movaps -16 * SIZE(X), %xmm4 + + decq %rax + jle .L42 + ALIGN_3 + +.L41: + movaps -12 * SIZE(X), %xmm5 + movaps -8 * SIZE(X), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -4 * SIZE(X), %xmm7 + movaps 0 * SIZE(X), %xmm0 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps -24 * SIZE(Y), %xmm2 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps -20 * SIZE(Y), %xmm3 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps 4 * SIZE(X), %xmm1 + movaps 8 * SIZE(X), %xmm2 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps -16 * SIZE(Y), %xmm4 + addps %xmm8, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps -12 * SIZE(Y), %xmm5 + addps %xmm8, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movaps 12 * SIZE(X), %xmm3 + movaps 16 * SIZE(X), %xmm4 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps -8 * SIZE(Y), %xmm6 + addps %xmm8, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps -4 * SIZE(Y), %xmm7 + addps %xmm8, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L41 + ALIGN_3 + +.L42: + movaps -12 * SIZE(X), %xmm5 + movaps -8 * SIZE(X), %xmm6 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -4 * SIZE(X), %xmm7 + movaps 0 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps -24 * SIZE(Y), %xmm2 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps -20 * SIZE(Y), %xmm3 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps -16 * SIZE(Y), %xmm4 + addps %xmm8, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps -12 * SIZE(Y), %xmm5 + addps %xmm8, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps -8 * SIZE(Y), %xmm6 + addps %xmm8, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps -4 * SIZE(Y), %xmm7 + addps %xmm8, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L45: + testq $8, M + jle .L46 + + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -20 * SIZE(X), %xmm3 + movaps -16 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps -24 * SIZE(Y), %xmm2 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps -20 * SIZE(Y), %xmm3 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_2 + +.L46: + testq $4, M + jle .L47 + + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_2 + +.L47: + testq $2, M + jle .L48 + + movaps -28 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movaps %xmm1, %xmm0 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_2 + +.L48: + testq $1, M + jle .L999 + + movaps -28 * SIZE(X), %xmm1 + movsd -32 * SIZE(Y), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + + addps %xmm8, %xmm0 + addps %xmm2, %xmm0 + movlps %xmm0, -32 * SIZE(Y) + + jmp .L999 + ALIGN_3 +#endif + +.L50: + xorps %xmm0, %xmm0 + + subq $1 * SIZE, Y + + testq $3 * SIZE, X + jne .L60 + + movq M, %rax + sarq $4, %rax + jle .L55 + + movaps -32 * SIZE(X), %xmm1 + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + movaps -20 * SIZE(X), %xmm4 + + decq %rax + jle .L52 + ALIGN_3 + +.L51: + movaps -16 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -8 * SIZE(X), %xmm7 + movaps -4 * SIZE(X), %xmm0 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm4 + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps 0 * SIZE(X), %xmm1 + movaps 4 * SIZE(X), %xmm2 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm5 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm6 + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movaps 8 * SIZE(X), %xmm3 + movaps 12 * SIZE(X), %xmm4 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm7 + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm0 + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L51 + ALIGN_3 + +.L52: + movaps -16 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -8 * SIZE(X), %xmm7 + movaps -4 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm4 + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm5 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm6 + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm7 + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm0 + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L55: + testq $8, M + jle .L56 + + movaps -32 * SIZE(X), %xmm1 + movaps -28 * SIZE(X), %xmm2 + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -24 * SIZE(X), %xmm3 + movaps -20 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_2 + +.L56: + testq $4, M + jle .L57 + + movaps -32 * SIZE(X), %xmm1 + movaps -28 * SIZE(X), %xmm2 + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_2 + +.L57: + testq $2, M + jle .L58 + + movaps -32 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, %xmm0 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_2 + +.L58: + testq $1, M + jle .L59 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L59: + shufps $0x93, %xmm0, %xmm0 + + addss -32 * SIZE(Y), %xmm0 + movss %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L60: +#ifdef ALIGNED_ACCESS + + testq $2 * SIZE, X + jne .L70 + + subq $1 * SIZE, X + + movaps -32 * SIZE(X), %xmm1 + + movq M, %rax + sarq $4, %rax + jle .L65 + + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + + decq %rax + jle .L62 + ALIGN_3 + +.L61: + movaps -20 * SIZE(X), %xmm4 + movaps -16 * SIZE(X), %xmm5 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -12 * SIZE(X), %xmm6 + movaps -8 * SIZE(X), %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm4, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm5, %xmm4 + SHUFPS_39 %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm4 + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -4 * SIZE(X), %xmm0 + movaps 0 * SIZE(X), %xmm1 + + movss %xmm6, %xmm5 + SHUFPS_39 %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm5 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm7, %xmm6 + SHUFPS_39 %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm6 + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movaps 4 * SIZE(X), %xmm2 + movaps 8 * SIZE(X), %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm0, %xmm7 + SHUFPS_39 %xmm7, %xmm7 + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm7 + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm0 + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L61 + ALIGN_3 + +.L62: + movaps -20 * SIZE(X), %xmm4 + movaps -16 * SIZE(X), %xmm5 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -12 * SIZE(X), %xmm6 + movaps -8 * SIZE(X), %xmm7 + + movss %xmm4, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm5, %xmm4 + SHUFPS_39 %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm4 + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movss %xmm6, %xmm5 + SHUFPS_39 %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm5 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movaps -4 * SIZE(X), %xmm0 + movaps 0 * SIZE(X), %xmm1 + + movss %xmm7, %xmm6 + SHUFPS_39 %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm6 + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movss %xmm0, %xmm7 + SHUFPS_39 %xmm7, %xmm7 + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm7 + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm0 + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L65: + testq $8, M + jle .L66 + + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -20 * SIZE(X), %xmm0 + movaps -16 * SIZE(X), %xmm1 + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_2 + +.L66: + testq $4, M + jle .L67 + + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_2 + +.L67: + testq $2, M + jle .L68 + + movaps -28 * SIZE(X), %xmm2 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movaps %xmm1, %xmm0 + movaps %xmm2, %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_2 + +.L68: + testq $1, M + jle .L69 + + movaps -28 * SIZE(X), %xmm2 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + movlps %xmm0, -32 * SIZE(Y) + movhlps %xmm0, %xmm0 + movss %xmm0, -30 * SIZE(Y) + jmp .L999 + +.L69: + shufps $0x93, %xmm0, %xmm0 + + addss -32 * SIZE(Y), %xmm0 + movss %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L70: + testq $1 * SIZE, X + jne .L80 +#endif + + movq M, %rax + sarq $4, %rax + jle .L75 + + movsd -32 * SIZE(X), %xmm1 + movhps -30 * SIZE(X), %xmm1 + movsd -28 * SIZE(X), %xmm2 + movhps -26 * SIZE(X), %xmm2 + movsd -24 * SIZE(X), %xmm3 + movhps -22 * SIZE(X), %xmm3 + movsd -20 * SIZE(X), %xmm4 + movhps -18 * SIZE(X), %xmm4 + + decq %rax + jle .L72 + ALIGN_3 + +.L71: + movsd -16 * SIZE(X), %xmm5 + movhps -14 * SIZE(X), %xmm5 + movsd -12 * SIZE(X), %xmm6 + movhps -10 * SIZE(X), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movsd -8 * SIZE(X), %xmm7 + movhps -6 * SIZE(X), %xmm7 + movsd -4 * SIZE(X), %xmm0 + movhps -2 * SIZE(X), %xmm0 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm4 + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movsd 0 * SIZE(X), %xmm1 + movhps 2 * SIZE(X), %xmm1 + movsd 4 * SIZE(X), %xmm2 + movhps 6 * SIZE(X), %xmm2 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm5 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm6 + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movsd 8 * SIZE(X), %xmm3 + movhps 10 * SIZE(X), %xmm3 + movsd 12 * SIZE(X), %xmm4 + movhps 14 * SIZE(X), %xmm4 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm7 + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm0 + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L71 + ALIGN_3 + +.L72: + movsd -16 * SIZE(X), %xmm5 + movhps -14 * SIZE(X), %xmm5 + movsd -12 * SIZE(X), %xmm6 + movhps -10 * SIZE(X), %xmm6 + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movsd -8 * SIZE(X), %xmm7 + movhps -6 * SIZE(X), %xmm7 + movsd -4 * SIZE(X), %xmm0 + movhps -2 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm4 + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm5 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm6 + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm7 + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm0 + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L75: + testq $8, M + jle .L76 + + movsd -32 * SIZE(X), %xmm1 + movhps -30 * SIZE(X), %xmm1 + movsd -28 * SIZE(X), %xmm2 + movhps -26 * SIZE(X), %xmm2 + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movsd -24 * SIZE(X), %xmm3 + movhps -22 * SIZE(X), %xmm3 + movsd -20 * SIZE(X), %xmm0 + movhps -18 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_2 + +.L76: + testq $4, M + jle .L77 + + movsd -32 * SIZE(X), %xmm1 + movhps -30 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movsd -28 * SIZE(X), %xmm2 + movhps -26 * SIZE(X), %xmm2 + + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_2 + +.L77: + testq $2, M + jle .L78 + + movsd -32 * SIZE(X), %xmm1 + movhps -30 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, %xmm0 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_2 + +.L78: + testq $1, M + jle .L79 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L79: + shufps $0x93, %xmm0, %xmm0 + + addss -32 * SIZE(Y), %xmm0 + movss %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 + +#ifdef ALIGNED_ACCESS + +.L80: + subq $3 * SIZE, X + + movaps -32 * SIZE(X), %xmm1 + + movq M, %rax + sarq $4, %rax + jle .L85 + + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + + decq %rax + jle .L82 + ALIGN_3 + +.L81: + movaps -20 * SIZE(X), %xmm4 + movaps -16 * SIZE(X), %xmm5 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -12 * SIZE(X), %xmm6 + movaps -8 * SIZE(X), %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm4 + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -4 * SIZE(X), %xmm0 + movaps 0 * SIZE(X), %xmm1 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm5 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm6 + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movaps 4 * SIZE(X), %xmm2 + movaps 8 * SIZE(X), %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm7 + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm0 + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L81 + ALIGN_3 + +.L82: + movaps -20 * SIZE(X), %xmm4 + movaps -16 * SIZE(X), %xmm5 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -12 * SIZE(X), %xmm6 + movaps -8 * SIZE(X), %xmm7 + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm4 + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm5 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movaps -4 * SIZE(X), %xmm0 + movaps 0 * SIZE(X), %xmm1 + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm6 + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm7 + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm0 + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L85: + testq $8, M + jle .L86 + + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -20 * SIZE(X), %xmm0 + movaps -16 * SIZE(X), %xmm1 + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_2 + +.L86: + testq $4, M + jle .L87 + + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_2 + +.L87: + testq $2, M + jle .L88 + + movaps -28 * SIZE(X), %xmm2 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movaps %xmm1, %xmm0 + movaps %xmm2, %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_2 + +.L88: + testq $1, M + jle .L89 + + movaps -28 * SIZE(X), %xmm2 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + movlps %xmm0, -32 * SIZE(Y) + movhlps %xmm0, %xmm0 + movss %xmm0, -30 * SIZE(Y) + jmp .L999 + +.L89: + shufps $0x93, %xmm0, %xmm0 + + addss -32 * SIZE(Y), %xmm0 + movss %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 +#endif + +.L100: +#ifndef CONJ + pshufd $0, %xmm0, %xmm14 + pshufd $0, %xmm1, %xmm15 + + pxor %xmm13, %xmm13 + subps %xmm15, %xmm13 + + unpcklps %xmm14, %xmm13 + unpcklps %xmm15, %xmm14 + movaps %xmm13, %xmm15 +#else + pshufd $0, %xmm0, %xmm14 + pshufd $0, %xmm1, %xmm15 + + pxor %xmm13, %xmm13 + subps %xmm14, %xmm13 + + unpcklps %xmm15, %xmm14 + unpcklps %xmm13, %xmm15 +#endif + + movq Y, YY + + movq M, %rax + sarq $3, %rax + jle .L105 + ALIGN_3 + +.L102: + movsd (X), %xmm0 + addq INCX, X + movhps (X), %xmm0 + addq INCX, X + movsd (X), %xmm2 + addq INCX, X + movhps (X), %xmm2 + addq INCX, X + movsd (X), %xmm4 + addq INCX, X + movhps (X), %xmm4 + addq INCX, X + movsd (X), %xmm6 + addq INCX, X + movhps (X), %xmm6 + addq INCX, X + +#ifdef HAVE_SSE3 + movshdup %xmm0, %xmm1 + movsldup %xmm0, %xmm0 + movshdup %xmm2, %xmm3 + movsldup %xmm2, %xmm2 + movshdup %xmm4, %xmm5 + movsldup %xmm4, %xmm4 + movshdup %xmm6, %xmm7 + movsldup %xmm6, %xmm6 +#else + pshufd $0xf5, %xmm0, %xmm1 + shufps $0xa0, %xmm0, %xmm0 + pshufd $0xf5, %xmm2, %xmm3 + shufps $0xa0, %xmm2, %xmm2 + pshufd $0xf5, %xmm4, %xmm5 + shufps $0xa0, %xmm4, %xmm4 + pshufd $0xf5, %xmm6, %xmm7 + shufps $0xa0, %xmm6, %xmm6 +#endif + + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm1 + mulps %xmm14, %xmm2 + mulps %xmm15, %xmm3 + mulps %xmm14, %xmm4 + mulps %xmm15, %xmm5 + mulps %xmm14, %xmm6 + mulps %xmm15, %xmm7 + + movsd (Y), %xmm8 + addq INCY, Y + movhps (Y), %xmm8 + addq INCY, Y + movsd (Y), %xmm9 + addq INCY, Y + movhps (Y), %xmm9 + addq INCY, Y + movsd (Y), %xmm10 + addq INCY, Y + movhps (Y), %xmm10 + addq INCY, Y + movsd (Y), %xmm11 + addq INCY, Y + movhps (Y), %xmm11 + addq INCY, Y + + addps %xmm0, %xmm8 + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + addps %xmm7, %xmm11 + + movsd %xmm8, (YY) + addq INCY, YY + movhps %xmm8, (YY) + addq INCY, YY + movsd %xmm9, (YY) + addq INCY, YY + movhps %xmm9, (YY) + addq INCY, YY + movsd %xmm10, (YY) + addq INCY, YY + movhps %xmm10, (YY) + addq INCY, YY + movsd %xmm11, (YY) + addq INCY, YY + movhps %xmm11, (YY) + addq INCY, YY + + decq %rax + jg .L102 + ALIGN_3 + +.L105: + testq $4, M + jle .L106 + + movsd (X), %xmm0 + addq INCX, X + movhps (X), %xmm0 + addq INCX, X + movsd (X), %xmm2 + addq INCX, X + movhps (X), %xmm2 + addq INCX, X + +#ifdef HAVE_SSE3 + movshdup %xmm0, %xmm1 + movsldup %xmm0, %xmm0 + movshdup %xmm2, %xmm3 + movsldup %xmm2, %xmm2 +#else + pshufd $0xf5, %xmm0, %xmm1 + shufps $0xa0, %xmm0, %xmm0 + pshufd $0xf5, %xmm2, %xmm3 + shufps $0xa0, %xmm2, %xmm2 +#endif + + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm1 + mulps %xmm14, %xmm2 + mulps %xmm15, %xmm3 + + movsd (Y), %xmm8 + addq INCY, Y + movhps (Y), %xmm8 + addq INCY, Y + movsd (Y), %xmm9 + addq INCY, Y + movhps (Y), %xmm9 + addq INCY, Y + + addps %xmm0, %xmm8 + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + addps %xmm3, %xmm9 + + movsd %xmm8, (YY) + addq INCY, YY + movhps %xmm8, (YY) + addq INCY, YY + movsd %xmm9, (YY) + addq INCY, YY + movhps %xmm9, (YY) + addq INCY, YY + ALIGN_3 + +.L106: + testq $2, M + jle .L107 + + movsd (X), %xmm0 + addq INCX, X + movhps (X), %xmm0 + addq INCX, X + +#ifdef HAVE_SSE3 + movshdup %xmm0, %xmm1 + movsldup %xmm0, %xmm0 +#else + pshufd $0xf5, %xmm0, %xmm1 + shufps $0xa0, %xmm0, %xmm0 +#endif + + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm1 + + movsd (Y), %xmm8 + addq INCY, Y + movhps (Y), %xmm8 + addq INCY, Y + + addps %xmm0, %xmm8 + addps %xmm1, %xmm8 + + movsd %xmm8, (YY) + addq INCY, YY + movhps %xmm8, (YY) + addq INCY, YY + ALIGN_3 + +.L107: + testq $1, M + jle .L999 + + movsd (X), %xmm0 + +#ifdef HAVE_SSE3 + movshdup %xmm0, %xmm1 + movsldup %xmm0, %xmm0 +#else + pshufd $0xf5, %xmm0, %xmm1 + shufps $0xa0, %xmm0, %xmm0 +#endif + + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm1 + + movsd (Y), %xmm8 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm8 + + movsd %xmm8, (Y) + ALIGN_3 + +.L999: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE + diff --git a/kernel/x86_64/zaxpy_sse2.S b/kernel/x86_64/zaxpy_sse2.S new file mode 100644 index 0000000..f1616e3 --- /dev/null +++ b/kernel/x86_64/zaxpy_sse2.S @@ -0,0 +1,1793 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#define Y ARG6 +#define INCY ARG2 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#define INCY %r10 +#endif + +#define YY %r11 +#define ALPHA_R %xmm14 +#define ALPHA_I %xmm15 + +#define USE_PSHUFD + +#if defined(HAVE_SSE3) && !defined(CORE_OPTERON) +#define MOVDDUP(a, b, c) movddup a(b), c +#define MOVDDUP2(a, b, c) movddup a##b, c +#else +#define MOVDDUP(a, b, c) movlpd a(b), c;movhpd a(b), c +#define MOVDDUP2(a, b, c) movlpd a##b, c;movhpd a##b, c +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifndef WINDOWS_ABI + movq 8(%rsp), INCY +#else + movaps %xmm3, %xmm0 + movsd 40(%rsp), %xmm1 + + movq 48(%rsp), X + movq 56(%rsp), INCX + movq 64(%rsp), Y + movq 72(%rsp), INCY +#endif + + SAVEREGISTERS + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + testq M, M + jle .L999 + + cmpq $2 * SIZE, INCX + jne .L50 + cmpq $2 * SIZE, INCY + jne .L50 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifdef HAVE_SSE3 + movddup %xmm0, ALPHA_R + movddup %xmm1, ALPHA_I +#else + pshufd $0x44, %xmm0, ALPHA_R + pshufd $0x44, %xmm1, ALPHA_I +#endif + +#ifndef CONJ + shufps $0x0c, %xmm7, %xmm7 + xorpd %xmm7, ALPHA_I +#else + shufps $0xc0, %xmm7, %xmm7 + xorpd %xmm7, ALPHA_R +#endif + + testq $SIZE, Y + jne .L30 + + testq $SIZE, X + jne .L20 + + movq M, %rax + sarq $3, %rax + jle .L15 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: + movaps -8 * SIZE(X), %xmm4 + movaps -6 * SIZE(X), %xmm5 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm0, %xmm8 +#else + movsd -15 * SIZE(X), %xmm8 + movhps -16 * SIZE(X), %xmm8 +#endif + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm1, %xmm8 +#else + movsd -13 * SIZE(X), %xmm8 + movhps -14 * SIZE(X), %xmm8 +#endif + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm8, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movaps -4 * SIZE(X), %xmm6 + movaps -2 * SIZE(X), %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm2, %xmm8 +#else + movsd -11 * SIZE(X), %xmm8 + movhps -12 * SIZE(X), %xmm8 +#endif + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd -12 * SIZE(Y), %xmm2 + addpd %xmm8, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm3, %xmm8 +#else + movsd -9 * SIZE(X), %xmm8 + movhps -10 * SIZE(X), %xmm8 +#endif + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd -10 * SIZE(Y), %xmm3 + addpd %xmm8, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + movaps 0 * SIZE(X), %xmm0 + movaps 2 * SIZE(X), %xmm1 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm4, %xmm8 +#else + movsd -7 * SIZE(X), %xmm8 + movhps -8 * SIZE(X), %xmm8 +#endif + mulpd ALPHA_R, %xmm4 + mulpd ALPHA_I, %xmm8 + addpd -8 * SIZE(Y), %xmm4 + addpd %xmm8, %xmm4 + movaps %xmm4, -8 * SIZE(Y) + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm5, %xmm8 +#else + movsd -5 * SIZE(X), %xmm8 + movhps -6 * SIZE(X), %xmm8 +#endif + mulpd ALPHA_R, %xmm5 + mulpd ALPHA_I, %xmm8 + addpd -6 * SIZE(Y), %xmm5 + addpd %xmm8, %xmm5 + movaps %xmm5, -6 * SIZE(Y) + + movaps 4 * SIZE(X), %xmm2 + movaps 6 * SIZE(X), %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm6, %xmm8 +#else + movsd -3 * SIZE(X), %xmm8 + movhps -4 * SIZE(X), %xmm8 +#endif + mulpd ALPHA_R, %xmm6 + mulpd ALPHA_I, %xmm8 + addpd -4 * SIZE(Y), %xmm6 + addpd %xmm8, %xmm6 + movaps %xmm6, -4 * SIZE(Y) + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm7, %xmm8 +#else + movsd -1 * SIZE(X), %xmm8 + movhps -2 * SIZE(X), %xmm8 +#endif + mulpd ALPHA_R, %xmm7 + mulpd ALPHA_I, %xmm8 + addpd -2 * SIZE(Y), %xmm7 + addpd %xmm8, %xmm7 + movaps %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + decq %rax + jg .L11 + ALIGN_3 + +.L12: + movaps -8 * SIZE(X), %xmm4 + movaps -6 * SIZE(X), %xmm5 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm8, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movaps -4 * SIZE(X), %xmm6 + movaps -2 * SIZE(X), %xmm7 + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd -12 * SIZE(Y), %xmm2 + addpd %xmm8, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd -10 * SIZE(Y), %xmm3 + addpd %xmm8, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + pshufd $0x4e, %xmm4, %xmm8 + mulpd ALPHA_R, %xmm4 + mulpd ALPHA_I, %xmm8 + addpd -8 * SIZE(Y), %xmm4 + addpd %xmm8, %xmm4 + movaps %xmm4, -8 * SIZE(Y) + + pshufd $0x4e, %xmm5, %xmm8 + mulpd ALPHA_R, %xmm5 + mulpd ALPHA_I, %xmm8 + addpd -6 * SIZE(Y), %xmm5 + addpd %xmm8, %xmm5 + movaps %xmm5, -6 * SIZE(Y) + + pshufd $0x4e, %xmm6, %xmm8 + mulpd ALPHA_R, %xmm6 + mulpd ALPHA_I, %xmm8 + addpd -4 * SIZE(Y), %xmm6 + addpd %xmm8, %xmm6 + movaps %xmm6, -4 * SIZE(Y) + + pshufd $0x4e, %xmm7, %xmm8 + mulpd ALPHA_R, %xmm7 + mulpd ALPHA_I, %xmm8 + addpd -2 * SIZE(Y), %xmm7 + addpd %xmm8, %xmm7 + movaps %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + +.L15: + movq M, %rax + andq $4, %rax + jle .L16 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm8, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd -12 * SIZE(Y), %xmm2 + addpd %xmm8, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd -10 * SIZE(Y), %xmm3 + addpd %xmm8, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L16: + movq M, %rax + andq $2, %rax + jle .L17 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm8, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L17: + movq M, %rax + andq $1, %rax + jle .L999 + + movaps -16 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L20: + movq M, %rax + sarq $3, %rax + jle .L25 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + movsd -12 * SIZE(X), %xmm2 + movhps -11 * SIZE(X), %xmm2 + movsd -10 * SIZE(X), %xmm3 + movhps -9 * SIZE(X), %xmm3 + + decq %rax + jle .L22 + ALIGN_3 + +.L21: + movsd -8 * SIZE(X), %xmm4 + movhps -7 * SIZE(X), %xmm4 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + movsd -6 * SIZE(X), %xmm5 + movhps -5 * SIZE(X), %xmm5 + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm8, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movsd -4 * SIZE(X), %xmm6 + movhps -3 * SIZE(X), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd -12 * SIZE(Y), %xmm2 + addpd %xmm8, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + movsd -2 * SIZE(X), %xmm7 + movhps -1 * SIZE(X), %xmm7 + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd -10 * SIZE(Y), %xmm3 + addpd %xmm8, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm4, %xmm8 + mulpd ALPHA_R, %xmm4 + mulpd ALPHA_I, %xmm8 + addpd -8 * SIZE(Y), %xmm4 + addpd %xmm8, %xmm4 + movaps %xmm4, -8 * SIZE(Y) + + movsd 2 * SIZE(X), %xmm1 + movhps 3 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm5, %xmm8 + mulpd ALPHA_R, %xmm5 + mulpd ALPHA_I, %xmm8 + addpd -6 * SIZE(Y), %xmm5 + addpd %xmm8, %xmm5 + movaps %xmm5, -6 * SIZE(Y) + + movsd 4 * SIZE(X), %xmm2 + movhps 5 * SIZE(X), %xmm2 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm6, %xmm8 + mulpd ALPHA_R, %xmm6 + mulpd ALPHA_I, %xmm8 + addpd -4 * SIZE(Y), %xmm6 + addpd %xmm8, %xmm6 + movaps %xmm6, -4 * SIZE(Y) + + movsd 6 * SIZE(X), %xmm3 + movhps 7 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm7, %xmm8 + mulpd ALPHA_R, %xmm7 + mulpd ALPHA_I, %xmm8 + addpd -2 * SIZE(Y), %xmm7 + addpd %xmm8, %xmm7 + movaps %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + decq %rax + jg .L21 + ALIGN_3 + +.L22: + movsd -8 * SIZE(X), %xmm4 + movhps -7 * SIZE(X), %xmm4 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + movsd -6 * SIZE(X), %xmm5 + movhps -5 * SIZE(X), %xmm5 + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm8, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movsd -4 * SIZE(X), %xmm6 + movhps -3 * SIZE(X), %xmm6 + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd -12 * SIZE(Y), %xmm2 + addpd %xmm8, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + movsd -2 * SIZE(X), %xmm7 + movhps -1 * SIZE(X), %xmm7 + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd -10 * SIZE(Y), %xmm3 + addpd %xmm8, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + pshufd $0x4e, %xmm4, %xmm8 + mulpd ALPHA_R, %xmm4 + mulpd ALPHA_I, %xmm8 + addpd -8 * SIZE(Y), %xmm4 + addpd %xmm8, %xmm4 + movaps %xmm4, -8 * SIZE(Y) + + pshufd $0x4e, %xmm5, %xmm8 + mulpd ALPHA_R, %xmm5 + mulpd ALPHA_I, %xmm8 + addpd -6 * SIZE(Y), %xmm5 + addpd %xmm8, %xmm5 + movaps %xmm5, -6 * SIZE(Y) + + pshufd $0x4e, %xmm6, %xmm8 + mulpd ALPHA_R, %xmm6 + mulpd ALPHA_I, %xmm8 + addpd -4 * SIZE(Y), %xmm6 + addpd %xmm8, %xmm6 + movaps %xmm6, -4 * SIZE(Y) + + pshufd $0x4e, %xmm7, %xmm8 + mulpd ALPHA_R, %xmm7 + mulpd ALPHA_I, %xmm8 + addpd -2 * SIZE(Y), %xmm7 + addpd %xmm8, %xmm7 + movaps %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + +.L25: + movq M, %rax + andq $4, %rax + jle .L26 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm8, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movsd -12 * SIZE(X), %xmm2 + movhps -11 * SIZE(X), %xmm2 + movsd -10 * SIZE(X), %xmm3 + movhps -9 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd -12 * SIZE(Y), %xmm2 + addpd %xmm8, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd -10 * SIZE(Y), %xmm3 + addpd %xmm8, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L26: + movq M, %rax + andq $2, %rax + jle .L27 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm8, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L27: + movq M, %rax + andq $1, %rax + jle .L999 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L30: + testq $SIZE, X + jne .L40 + + movaps -16 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + xorps %xmm0, %xmm0 + SHUFPD_1 %xmm1, %xmm0 + + xorps %xmm4, %xmm4 + movhps -16 * SIZE(Y), %xmm4 + + addpd %xmm0, %xmm4 + movhps %xmm4, -16 * SIZE(Y) + movaps %xmm1, %xmm0 + + addq $2 * SIZE, X + addq $1 * SIZE, Y + decq M + jle .L39 + + movq M, %rax + sarq $3, %rax + jle .L35 + + movaps -16 * SIZE(X), %xmm1 + movaps -14 * SIZE(X), %xmm2 + movaps -12 * SIZE(X), %xmm3 + + decq %rax + jle .L32 + ALIGN_3 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -10 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movaps -8 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movaps -6 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movaps -4 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -8 * SIZE(Y), %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movaps -2 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + + addpd -6 * SIZE(Y), %xmm1 + movaps %xmm1, -6 * SIZE(Y) + movaps 0 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + + addpd -4 * SIZE(Y), %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movaps 2 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + + addpd -2 * SIZE(Y), %xmm3 + movaps %xmm3, -2 * SIZE(Y) + movaps 4 * SIZE(X), %xmm3 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + decq %rax + jg .L31 + ALIGN_3 + +.L32: + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -10 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movaps -8 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movaps -6 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movaps -4 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -8 * SIZE(Y), %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movaps -2 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + + addpd -6 * SIZE(Y), %xmm1 + movaps %xmm1, -6 * SIZE(Y) + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + + addpd -4 * SIZE(Y), %xmm2 + movaps %xmm2, -4 * SIZE(Y) + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + + addpd -2 * SIZE(Y), %xmm3 + movaps %xmm3, -2 * SIZE(Y) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + +.L35: + movq M, %rax + andq $4, %rax + jle .L36 + + movaps -16 * SIZE(X), %xmm1 + movaps -14 * SIZE(X), %xmm2 + movaps -12 * SIZE(X), %xmm3 + movaps -10 * SIZE(X), %xmm4 + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + pshufd $0x4e, %xmm4, %xmm8 + mulpd ALPHA_R, %xmm4 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm4 + SHUFPD_1 %xmm4, %xmm3 + + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movaps %xmm4, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L36: + movq M, %rax + andq $2, %rax + jle .L37 + + movaps -16 * SIZE(X), %xmm1 + movaps -14 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, %xmm0 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L37: + movq M, %rax + andq $1, %rax + jle .L39 + + movaps -16 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, %xmm0 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L39: + SHUFPD_1 %xmm0, %xmm0 + + addsd -16 * SIZE(Y), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L40: + movsd -16 * SIZE(X), %xmm1 + movhps -15 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + xorps %xmm0, %xmm0 + SHUFPD_1 %xmm1, %xmm0 + + xorps %xmm4, %xmm4 + movhps -16 * SIZE(Y), %xmm4 + + addpd %xmm0, %xmm4 + movhps %xmm4, -16 * SIZE(Y) + movaps %xmm1, %xmm0 + + addq $2 * SIZE, X + addq $1 * SIZE, Y + decq M + jle .L49 + + movq M, %rax + sarq $3, %rax + jle .L45 + + movsd -16 * SIZE(X), %xmm1 + movhps -15 * SIZE(X), %xmm1 + movsd -14 * SIZE(X), %xmm2 + movhps -13 * SIZE(X), %xmm2 + movsd -12 * SIZE(X), %xmm3 + movhps -11 * SIZE(X), %xmm3 + + decq %rax + jle .L42 + ALIGN_3 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movsd -10 * SIZE(X), %xmm0 + movhps -9 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movsd -8 * SIZE(X), %xmm1 + movhps -7 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movsd -6 * SIZE(X), %xmm2 + movhps -5 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movsd -4 * SIZE(X), %xmm3 + movhps -3 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -8 * SIZE(Y), %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movsd -2 * SIZE(X), %xmm0 + movhps -1 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + + addpd -6 * SIZE(Y), %xmm1 + movaps %xmm1, -6 * SIZE(Y) + movsd 0 * SIZE(X), %xmm1 + movhps 1 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + + addpd -4 * SIZE(Y), %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movsd 2 * SIZE(X), %xmm2 + movhps 3 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + + addpd -2 * SIZE(Y), %xmm3 + movaps %xmm3, -2 * SIZE(Y) + movsd 4 * SIZE(X), %xmm3 + movhps 5 * SIZE(X), %xmm3 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + decq %rax + jg .L41 + ALIGN_3 + +.L42: + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movsd -10 * SIZE(X), %xmm0 + movhps -9 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movsd -8 * SIZE(X), %xmm1 + movhps -7 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movsd -6 * SIZE(X), %xmm2 + movhps -5 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movsd -4 * SIZE(X), %xmm3 + movhps -3 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -8 * SIZE(Y), %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movsd -2 * SIZE(X), %xmm0 + movhps -1 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + + addpd -6 * SIZE(Y), %xmm1 + movaps %xmm1, -6 * SIZE(Y) + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + + addpd -4 * SIZE(Y), %xmm2 + movaps %xmm2, -4 * SIZE(Y) + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + + addpd -2 * SIZE(Y), %xmm3 + movaps %xmm3, -2 * SIZE(Y) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + +.L45: + movq M, %rax + andq $4, %rax + jle .L46 + + movsd -16 * SIZE(X), %xmm1 + movhps -15 * SIZE(X), %xmm1 + movsd -14 * SIZE(X), %xmm2 + movhps -13 * SIZE(X), %xmm2 + movsd -12 * SIZE(X), %xmm3 + movhps -11 * SIZE(X), %xmm3 + movsd -10 * SIZE(X), %xmm4 + movhps -9 * SIZE(X), %xmm4 + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + pshufd $0x4e, %xmm4, %xmm8 + mulpd ALPHA_R, %xmm4 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm4 + SHUFPD_1 %xmm4, %xmm3 + + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movaps %xmm4, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L46: + movq M, %rax + andq $2, %rax + jle .L47 + + movsd -16 * SIZE(X), %xmm1 + movhps -15 * SIZE(X), %xmm1 + movsd -14 * SIZE(X), %xmm2 + movhps -13 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, %xmm0 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L47: + movq M, %rax + andq $1, %rax + jle .L49 + + movsd -16 * SIZE(X), %xmm1 + movhps -15 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, %xmm0 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L49: + SHUFPD_1 %xmm0, %xmm0 + + addsd -16 * SIZE(Y), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L50: +#ifndef CONJ + movaps %xmm0, %xmm14 # a 0 + + pxor %xmm15, %xmm15 # 0 0 + subsd %xmm1, %xmm15 # -b 0 + + unpcklpd %xmm14, %xmm15 # -b a + unpcklpd %xmm1, %xmm14 # a b +#else + movaps %xmm0, %xmm14 # a 0 + movaps %xmm1, %xmm15 # b 0 + + pxor %xmm13, %xmm13 # 0 0 + subsd %xmm0, %xmm13 # -a 0 + + unpcklpd %xmm13, %xmm15 # b -a + unpcklpd %xmm1, %xmm14 # a b +#endif + + movq Y, YY + movq M, %rax + sarq $3, %rax + jle .L55 + + MOVDDUP( 0 * SIZE, X, %xmm0) + MOVDDUP( 1 * SIZE, X, %xmm1) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm2) + MOVDDUP( 1 * SIZE, X, %xmm3) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm4) + MOVDDUP( 1 * SIZE, X, %xmm5) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm6) + MOVDDUP( 1 * SIZE, X, %xmm7) + addq INCX, X + + movsd 0 * SIZE(Y), %xmm8 + movhpd 1 * SIZE(Y), %xmm8 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm9 + movhpd 1 * SIZE(Y), %xmm9 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm10 + movhpd 1 * SIZE(Y), %xmm10 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm11 + movhpd 1 * SIZE(Y), %xmm11 + addq INCY, Y + + mulpd %xmm14, %xmm0 + mulpd %xmm14, %xmm2 + mulpd %xmm14, %xmm4 + mulpd %xmm14, %xmm6 + + decq %rax + jle .L52 + ALIGN_3 + +.L51: + addpd %xmm0, %xmm8 + mulpd %xmm15, %xmm1 + addpd %xmm2, %xmm9 + mulpd %xmm15, %xmm3 + addpd %xmm4, %xmm10 + mulpd %xmm15, %xmm5 + addpd %xmm6, %xmm11 + mulpd %xmm15, %xmm7 + + addpd %xmm1, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm5, %xmm10 + addpd %xmm7, %xmm11 + + MOVDDUP( 0 * SIZE, X, %xmm0) + MOVDDUP( 1 * SIZE, X, %xmm1) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm2) + MOVDDUP( 1 * SIZE, X, %xmm3) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm4) + MOVDDUP( 1 * SIZE, X, %xmm5) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm6) + MOVDDUP( 1 * SIZE, X, %xmm7) + addq INCX, X + + mulpd %xmm14, %xmm0 + mulpd %xmm14, %xmm2 + mulpd %xmm14, %xmm4 + mulpd %xmm14, %xmm6 + + movlpd %xmm8, 0 * SIZE(YY) + movhpd %xmm8, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm9, 0 * SIZE(YY) + movhpd %xmm9, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm10, 0 * SIZE(YY) + movhpd %xmm10, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm11, 0 * SIZE(YY) + movhpd %xmm11, 1 * SIZE(YY) + addq INCY, YY + + movsd 0 * SIZE(Y), %xmm8 + movhpd 1 * SIZE(Y), %xmm8 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm9 + movhpd 1 * SIZE(Y), %xmm9 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm10 + movhpd 1 * SIZE(Y), %xmm10 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm11 + movhpd 1 * SIZE(Y), %xmm11 + addq INCY, Y + + addpd %xmm0, %xmm8 + mulpd %xmm15, %xmm1 + addpd %xmm2, %xmm9 + mulpd %xmm15, %xmm3 + addpd %xmm4, %xmm10 + mulpd %xmm15, %xmm5 + addpd %xmm6, %xmm11 + mulpd %xmm15, %xmm7 + + addpd %xmm1, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm5, %xmm10 + addpd %xmm7, %xmm11 + + MOVDDUP( 0 * SIZE, X, %xmm0) + MOVDDUP( 1 * SIZE, X, %xmm1) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm2) + MOVDDUP( 1 * SIZE, X, %xmm3) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm4) + MOVDDUP( 1 * SIZE, X, %xmm5) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm6) + MOVDDUP( 1 * SIZE, X, %xmm7) + addq INCX, X + + mulpd %xmm14, %xmm0 + mulpd %xmm14, %xmm2 + mulpd %xmm14, %xmm4 + mulpd %xmm14, %xmm6 + + movlpd %xmm8, 0 * SIZE(YY) + movhpd %xmm8, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm9, 0 * SIZE(YY) + movhpd %xmm9, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm10, 0 * SIZE(YY) + movhpd %xmm10, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm11, 0 * SIZE(YY) + movhpd %xmm11, 1 * SIZE(YY) + addq INCY, YY + + movsd 0 * SIZE(Y), %xmm8 + movhpd 1 * SIZE(Y), %xmm8 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm9 + movhpd 1 * SIZE(Y), %xmm9 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm10 + movhpd 1 * SIZE(Y), %xmm10 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm11 + movhpd 1 * SIZE(Y), %xmm11 + addq INCY, Y + + decq %rax + jg .L51 + ALIGN_3 + +.L52: + addpd %xmm0, %xmm8 + mulpd %xmm15, %xmm1 + addpd %xmm2, %xmm9 + mulpd %xmm15, %xmm3 + addpd %xmm4, %xmm10 + mulpd %xmm15, %xmm5 + addpd %xmm6, %xmm11 + mulpd %xmm15, %xmm7 + + addpd %xmm1, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm5, %xmm10 + addpd %xmm7, %xmm11 + + MOVDDUP( 0 * SIZE, X, %xmm0) + MOVDDUP( 1 * SIZE, X, %xmm1) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm2) + MOVDDUP( 1 * SIZE, X, %xmm3) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm4) + MOVDDUP( 1 * SIZE, X, %xmm5) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm6) + MOVDDUP( 1 * SIZE, X, %xmm7) + addq INCX, X + + mulpd %xmm14, %xmm0 + mulpd %xmm14, %xmm2 + mulpd %xmm14, %xmm4 + mulpd %xmm14, %xmm6 + + movlpd %xmm8, 0 * SIZE(YY) + movhpd %xmm8, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm9, 0 * SIZE(YY) + movhpd %xmm9, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm10, 0 * SIZE(YY) + movhpd %xmm10, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm11, 0 * SIZE(YY) + movhpd %xmm11, 1 * SIZE(YY) + addq INCY, YY + + movsd 0 * SIZE(Y), %xmm8 + movhpd 1 * SIZE(Y), %xmm8 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm9 + movhpd 1 * SIZE(Y), %xmm9 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm10 + movhpd 1 * SIZE(Y), %xmm10 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm11 + movhpd 1 * SIZE(Y), %xmm11 + addq INCY, Y + + addpd %xmm0, %xmm8 + mulpd %xmm15, %xmm1 + addpd %xmm2, %xmm9 + mulpd %xmm15, %xmm3 + addpd %xmm4, %xmm10 + mulpd %xmm15, %xmm5 + addpd %xmm6, %xmm11 + mulpd %xmm15, %xmm7 + + addpd %xmm1, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm5, %xmm10 + addpd %xmm7, %xmm11 + + movlpd %xmm8, 0 * SIZE(YY) + movhpd %xmm8, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm9, 0 * SIZE(YY) + movhpd %xmm9, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm10, 0 * SIZE(YY) + movhpd %xmm10, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm11, 0 * SIZE(YY) + movhpd %xmm11, 1 * SIZE(YY) + addq INCY, YY + ALIGN_3 + +.L55: + movq M, %rax + andq $4, %rax + jle .L56 + + MOVDDUP( 0 * SIZE, X, %xmm0) + MOVDDUP( 1 * SIZE, X, %xmm1) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm2) + MOVDDUP( 1 * SIZE, X, %xmm3) + addq INCX, X + + MOVDDUP( 0 * SIZE, X, %xmm4) + MOVDDUP( 1 * SIZE, X, %xmm5) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm6) + MOVDDUP( 1 * SIZE, X, %xmm7) + addq INCX, X + + movsd 0 * SIZE(Y), %xmm8 + movhpd 1 * SIZE(Y), %xmm8 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm9 + movhpd 1 * SIZE(Y), %xmm9 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm10 + movhpd 1 * SIZE(Y), %xmm10 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm11 + movhpd 1 * SIZE(Y), %xmm11 + addq INCY, Y + + mulpd %xmm14, %xmm0 + mulpd %xmm14, %xmm2 + mulpd %xmm14, %xmm4 + mulpd %xmm14, %xmm6 + + addpd %xmm0, %xmm8 + mulpd %xmm15, %xmm1 + addpd %xmm2, %xmm9 + mulpd %xmm15, %xmm3 + addpd %xmm4, %xmm10 + mulpd %xmm15, %xmm5 + addpd %xmm6, %xmm11 + mulpd %xmm15, %xmm7 + + addpd %xmm1, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm5, %xmm10 + addpd %xmm7, %xmm11 + + movlpd %xmm8, 0 * SIZE(YY) + movhpd %xmm8, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm9, 0 * SIZE(YY) + movhpd %xmm9, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm10, 0 * SIZE(YY) + movhpd %xmm10, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm11, 0 * SIZE(YY) + movhpd %xmm11, 1 * SIZE(YY) + addq INCY, YY + ALIGN_3 + +.L56: + movq M, %rax + andq $2, %rax + jle .L57 + + MOVDDUP( 0 * SIZE, X, %xmm0) + MOVDDUP( 1 * SIZE, X, %xmm1) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm2) + MOVDDUP( 1 * SIZE, X, %xmm3) + addq INCX, X + + movsd 0 * SIZE(Y), %xmm8 + movhpd 1 * SIZE(Y), %xmm8 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm9 + movhpd 1 * SIZE(Y), %xmm9 + addq INCY, Y + + mulpd %xmm14, %xmm0 + mulpd %xmm14, %xmm2 + mulpd %xmm15, %xmm1 + mulpd %xmm15, %xmm3 + + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm9 + addpd %xmm1, %xmm8 + addpd %xmm3, %xmm9 + + movlpd %xmm8, 0 * SIZE(YY) + movhpd %xmm8, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm9, 0 * SIZE(YY) + movhpd %xmm9, 1 * SIZE(YY) + addq INCY, YY + ALIGN_3 + +.L57: + movq M, %rax + andq $1, %rax + jle .L999 + + MOVDDUP( 0 * SIZE, X, %xmm0) + MOVDDUP( 1 * SIZE, X, %xmm1) + + movsd 0 * SIZE(Y), %xmm8 + movhpd 1 * SIZE(Y), %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm1 + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm8 + + movlpd %xmm8, 0 * SIZE(YY) + movhpd %xmm8, 1 * SIZE(YY) + ALIGN_3 + +.L999: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/zcopy.S b/kernel/x86_64/zcopy.S new file mode 100644 index 0000000..d76426b --- /dev/null +++ b/kernel/x86_64/zcopy.S @@ -0,0 +1,389 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#define FLAG ARG6 +#else +#define INCY %r10 +#define FLAG %r11 +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY +#endif + + EMMS + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + testq N, N # if m == 0 goto End + jle .L999 + + cmpq $2 * SIZE, INCX # if incx != 1 + jne .L100 + cmpq $2 * SIZE, INCY # if incy != 1 + jne .L100 + + movq N, %rax # i = m + sarq $2, %rax + jle .L20 + ALIGN_2 + +.L11: +#ifdef XDOUBLE + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movq 0(X), %mm0 + movq %mm0, 0(Y) + + movq 8(X), %mm1 + movq %mm1, 8(Y) + + movq 16(X), %mm2 + movq %mm2, 16(Y) + + movq 24(X), %mm3 + movq %mm3, 24(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movq 32(X), %mm4 + movq %mm4, 32(Y) + + movq 40(X), %mm5 + movq %mm5, 40(Y) + + movq 48(X), %mm6 + movq %mm6, 48(Y) + + movq 56(X), %mm7 + movq %mm7, 56(Y) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movq 64(X), %mm0 + movq %mm0, 64(Y) + + movq 72(X), %mm1 + movq %mm1, 72(Y) + + movq 80(X), %mm2 + movq %mm2, 80(Y) + + movq 88(X), %mm3 + movq %mm3, 88(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movq 96(X), %mm4 + movq %mm4, 96(Y) + + movq 104(X), %mm5 + movq %mm5, 104(Y) + + movq 112(X), %mm6 + movq %mm6, 112(Y) + + movq 120(X), %mm7 + movq %mm7, 120(Y) +#elif defined(DOUBLE) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movq 0 * SIZE(X), %mm0 + movq 1 * SIZE(X), %mm1 + + movq %mm0, 0 * SIZE(Y) + movq %mm1, 1 * SIZE(Y) + + movq 2 * SIZE(X), %mm2 + movq 3 * SIZE(X), %mm3 + + movq %mm2, 2 * SIZE(Y) + movq %mm3, 3 * SIZE(Y) + + movq 4 * SIZE(X), %mm4 + movq 5 * SIZE(X), %mm5 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movq %mm4, 4 * SIZE(Y) + movq %mm5, 5 * SIZE(Y) + + movq 6 * SIZE(X), %mm6 + movq 7 * SIZE(X), %mm7 + + movq %mm6, 6 * SIZE(Y) + movq %mm7, 7 * SIZE(Y) +#else + movq 0 * SIZE(X), %mm0 + movq 2 * SIZE(X), %mm2 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movq %mm0, 0 * SIZE(Y) + movq %mm2, 2 * SIZE(Y) + + movq 4 * SIZE(X), %mm4 + movq 6 * SIZE(X), %mm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movq %mm4, 4 * SIZE(Y) + movq %mm6, 6 * SIZE(Y) +#endif + addq $8 * SIZE, X + addq $8 * SIZE, Y + decq %rax + jg .L11 + ALIGN_2 + +.L20: + movq N, %rax # i = m + andq $3, %rax + jle .L99 + ALIGN_2 + +.L21: +#ifdef XDOUBLE + movq 0(X), %mm0 + movq %mm0, 0(Y) + movq 8(X), %mm1 + movq %mm1, 8(Y) + movq 16(X), %mm2 + movq %mm2, 16(Y) + movq 24(X), %mm3 + movq %mm3, 24(Y) +#elif defined(DOUBLE) + movq 0 * SIZE(X), %mm0 + movq %mm0, 0 * SIZE(Y) + movq 1 * SIZE(X), %mm1 + movq %mm1, 1 * SIZE(Y) +#else + movq 0 * SIZE(X), %mm0 + movq %mm0, 0 * SIZE(Y) +#endif + + addq $2 * SIZE, X + addq $2 * SIZE, Y + decq %rax + jg .L21 + +.L99: + xorq %rax,%rax + EMMS + ret + ALIGN_3 + +.L100: + movq N, %rax + sarq $2, %rax + jle .L120 + ALIGN_2 + +.L111: +#ifdef XDOUBLE + movq 0(X), %mm0 + movq %mm0, 0(Y) + movq 8(X), %mm1 + movq %mm1, 8(Y) + movq 16(X), %mm2 + movq %mm2, 16(Y) + movq 24(X), %mm3 + movq %mm3, 24(Y) + addq INCX, X + addq INCY, Y + + movq 0(X), %mm0 + movq %mm0, 0(Y) + movq 8(X), %mm1 + movq %mm1, 8(Y) + movq 16(X), %mm2 + movq %mm2, 16(Y) + movq 24(X), %mm3 + movq %mm3, 24(Y) + addq INCX, X + addq INCY, Y + + movq 0(X), %mm0 + movq %mm0, 0(Y) + movq 8(X), %mm1 + movq %mm1, 8(Y) + movq 16(X), %mm2 + movq %mm2, 16(Y) + movq 24(X), %mm3 + movq %mm3, 24(Y) + addq INCX, X + addq INCY, Y + + movq 0(X), %mm0 + movq %mm0, 0(Y) + movq 8(X), %mm1 + movq %mm1, 8(Y) + movq 16(X), %mm2 + movq %mm2, 16(Y) + movq 24(X), %mm3 + movq %mm3, 24(Y) + addq INCX, X + addq INCY, Y +#elif defined(DOUBLE) + movq 0 * SIZE(X), %mm0 + movq %mm0, 0 * SIZE(Y) + movq 1 * SIZE(X), %mm1 + movq %mm1, 1 * SIZE(Y) + addq INCX, X + addq INCY, Y + + movq 0 * SIZE(X), %mm2 + movq %mm2, 0 * SIZE(Y) + movq 1 * SIZE(X), %mm3 + movq %mm3, 1 * SIZE(Y) + addq INCX, X + addq INCY, Y + + movq 0 * SIZE(X), %mm4 + movq %mm4, 0 * SIZE(Y) + movq 1 * SIZE(X), %mm5 + movq %mm5, 1 * SIZE(Y) + addq INCX, X + addq INCY, Y + + movq 0 * SIZE(X), %mm6 + movq %mm6, 0 * SIZE(Y) + movq 1 * SIZE(X), %mm7 + movq %mm7, 1 * SIZE(Y) + addq INCX, X + addq INCY, Y +#else + movq 0 * SIZE(X), %mm0 + movq %mm0, 0 * SIZE(Y) + addq INCX, X + addq INCY, Y + + movq 0 * SIZE(X), %mm2 + movq %mm2, 0 * SIZE(Y) + addq INCX, X + addq INCY, Y + + movq 0 * SIZE(X), %mm4 + movq %mm4, 0 * SIZE(Y) + addq INCX, X + addq INCY, Y + + movq 0 * SIZE(X), %mm6 + movq %mm6, 0 * SIZE(Y) + addq INCX, X + addq INCY, Y +#endif + + decq %rax + jg .L111 + +.L120: + movq N, %rax + andq $3, %rax + jle .L999 + ALIGN_2 + +.L121: +#ifdef XDOUBLE + movq 0(X), %mm0 + movq %mm0, 0(Y) + movq 8(X), %mm1 + movq %mm1, 8(Y) + movq 16(X), %mm2 + movq %mm2, 16(Y) + movq 24(X), %mm3 + movq %mm3, 24(Y) + addq INCX, X + addq INCY, Y +#elif defined(DOUBLE) + movq 0 * SIZE(X), %mm0 + movq %mm0, 0 * SIZE(Y) + movq 1 * SIZE(X), %mm1 + movq %mm1, 1 * SIZE(Y) + addq INCX, X + addq INCY, Y +#else + movq 0 * SIZE(X), %mm0 + movq %mm0, 0 * SIZE(Y) + addq INCX, X + addq INCY, Y +#endif + + decq %rax + jg .L121 + +.L999: + xorq %rax,%rax + EMMS + ret + + EPILOGUE + diff --git a/kernel/x86_64/zcopy_sse.S b/kernel/x86_64/zcopy_sse.S new file mode 100644 index 0000000..91f283a --- /dev/null +++ b/kernel/x86_64/zcopy_sse.S @@ -0,0 +1,992 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#include "l1param.h" + +#ifdef OPTERON +#define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addps OFFSET(ADDR), REG +#else +#define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG +#endif + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY +#endif + + SAVEREGISTERS + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + cmpq $2 * SIZE, INCX + jne .L100 + cmpq $2 * SIZE, INCY + jne .L100 + + cmpq $3, M + jle .L106 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + addq M, M + + testq $SIZE, Y + je .L05 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq M + ALIGN_4 + +.L05: + testq $2 * SIZE, Y + je .L10 + + movsd -32 * SIZE(X), %xmm0 + movlps %xmm0, -32 * SIZE(Y) + addq $2 * SIZE, X + addq $2 * SIZE, Y + subq $2, M + jle .L19 + ALIGN_4 + +.L10: + testq $3 * SIZE, X + jne .L20 + + movq M, %rax + sarq $5, %rax + jle .L13 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + movaps -16 * SIZE(X), %xmm4 + movaps -12 * SIZE(X), %xmm5 + movaps -8 * SIZE(X), %xmm6 + movaps -4 * SIZE(X), %xmm7 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps %xmm0, -32 * SIZE(Y) + LOAD( 0 * SIZE, X, %xmm0) + movaps %xmm1, -28 * SIZE(Y) + LOAD( 4 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm2, -24 * SIZE(Y) + LOAD( 8 * SIZE, X, %xmm2) + movaps %xmm3, -20 * SIZE(Y) + LOAD(12 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps %xmm4,-16 * SIZE(Y) + LOAD(16 * SIZE, X, %xmm4) + movaps %xmm5,-12 * SIZE(Y) + LOAD(20 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm6, -8 * SIZE(Y) + LOAD(24 * SIZE, X, %xmm6) + movaps %xmm7, -4 * SIZE(Y) + LOAD(28 * SIZE, X, %xmm7) + + subq $-32 * SIZE, Y + subq $-32 * SIZE, X + decq %rax + jg .L11 + ALIGN_3 + +.L12: + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + movaps %xmm4, -16 * SIZE(Y) + movaps %xmm5, -12 * SIZE(Y) + movaps %xmm6, -8 * SIZE(Y) + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, Y + subq $-32 * SIZE, X + ALIGN_3 + +.L13: + testq $16, M + jle .L14 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L14: + testq $8, M + jle .L15 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L15: + testq $4, M + jle .L16 + + movaps -32 * SIZE(X), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L16: + testq $2, M + jle .L17 + + movsd -32 * SIZE(X), %xmm0 + movlps %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L17: + testq $1, M + jle .L19 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + ALIGN_3 + +.L19: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + + +.L20: + testq $SIZE, X + jne .L30 + + movhps -32 * SIZE(X), %xmm0 + + movq M, %rax + sarq $5, %rax + jle .L23 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + movaps -22 * SIZE(X), %xmm3 + movaps -18 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -10 * SIZE(X), %xmm6 + movaps -6 * SIZE(X), %xmm7 + + decq %rax + jle .L22 + ALIGN_4 + +.L21: + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -2 * SIZE(X), %xmm0 + + shufps $0x4e, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps 2 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + shufps $0x4e, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps 6 * SIZE(X), %xmm2 + + shufps $0x4e, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps 10 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + shufps $0x4e, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + movaps 14 * SIZE(X), %xmm4 + + shufps $0x4e, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + movaps 18 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + shufps $0x4e, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + movaps 22 * SIZE(X), %xmm6 + + shufps $0x4e, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + movaps 26 * SIZE(X), %xmm7 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L21 + ALIGN_3 + +.L22: + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -2 * SIZE(X), %xmm0 + + shufps $0x4e, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + shufps $0x4e, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + shufps $0x4e, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + shufps $0x4e, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + shufps $0x4e, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + shufps $0x4e, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + shufps $0x4e, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L23: + testq $16, M + jle .L24 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + movaps -22 * SIZE(X), %xmm3 + movaps -18 * SIZE(X), %xmm4 + + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + shufps $0x4e, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + shufps $0x4e, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + shufps $0x4e, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L24: + testq $8, M + jle .L25 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + + shufps $0x4e, %xmm1, %xmm0 + shufps $0x4e, %xmm2, %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L25: + testq $4, M + jle .L26 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L26: + testq $2, M + jle .L27 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + + movsd %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L27: + testq $1, M + jle .L29 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addq $SIZE, Y + ALIGN_3 + +.L29: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +.L30: + testq $2 * SIZE, X + jne .L40 + + movaps -33 * SIZE(X), %xmm0 + + movq M, %rax + sarq $5, %rax + jle .L33 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + movaps -21 * SIZE(X), %xmm3 + movaps -17 * SIZE(X), %xmm4 + movaps -13 * SIZE(X), %xmm5 + movaps -9 * SIZE(X), %xmm6 + movaps -5 * SIZE(X), %xmm7 + + decq %rax + jle .L32 + ALIGN_4 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -1 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps 3 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps 7 * SIZE(X), %xmm2 + + movss %xmm4, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps 11 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + movaps 15 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + movaps 19 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + shufps $0x39, %xmm6, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + movaps 23 * SIZE(X), %xmm6 + + movss %xmm0, %xmm7 + shufps $0x39, %xmm7, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + movaps 27 * SIZE(X), %xmm7 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L31 + ALIGN_3 + +.L32: + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -1 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movss %xmm7, %xmm6 + shufps $0x39, %xmm6, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + shufps $0x39, %xmm7, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L33: + testq $16, M + jle .L34 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + movaps -21 * SIZE(X), %xmm3 + movaps -17 * SIZE(X), %xmm4 + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L34: + testq $8, M + jle .L35 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L35: + testq $4, M + jle .L36 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L36: + testq $2, M + jle .L37 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L37: + testq $1, M + jle .L39 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addq $SIZE, Y + ALIGN_3 + +.L39: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +.L40: + movaps -35 * SIZE(X), %xmm0 + + movq M, %rax + sarq $5, %rax + jle .L43 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + movaps -23 * SIZE(X), %xmm3 + movaps -19 * SIZE(X), %xmm4 + movaps -15 * SIZE(X), %xmm5 + movaps -11 * SIZE(X), %xmm6 + movaps -7 * SIZE(X), %xmm7 + + decq %rax + jle .L42 + ALIGN_4 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -3 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps 1 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps 5 * SIZE(X), %xmm2 + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps 9 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + movaps 13 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + movaps 17 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + movaps 21 * SIZE(X), %xmm6 + + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + movaps 25 * SIZE(X), %xmm7 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L41 + ALIGN_3 + +.L42: + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -3 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L43: + testq $16, M + jle .L44 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + movaps -23 * SIZE(X), %xmm3 + movaps -19 * SIZE(X), %xmm4 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L44: + testq $8, M + jle .L45 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L45: + testq $4, M + jle .L46 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L46: + testq $2, M + jle .L47 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L47: + testq $1, M + jle .L49 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addq $SIZE, Y + ALIGN_3 + +.L49: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_4 + +.L100: + movq M, %rax + sarq $3, %rax + jle .L105 + ALIGN_3 + +.L102: + movsd (X), %xmm0 + addq INCX, X + movhps (X), %xmm0 + addq INCX, X + movsd (X), %xmm1 + addq INCX, X + movhps (X), %xmm1 + addq INCX, X + movsd (X), %xmm2 + addq INCX, X + movhps (X), %xmm2 + addq INCX, X + movsd (X), %xmm3 + addq INCX, X + movhps (X), %xmm3 + addq INCX, X + + movsd %xmm0, (Y) + addq INCY, Y + movhps %xmm0, (Y) + addq INCY, Y + movsd %xmm1, (Y) + addq INCY, Y + movhps %xmm1, (Y) + addq INCY, Y + movsd %xmm2, (Y) + addq INCY, Y + movhps %xmm2, (Y) + addq INCY, Y + movsd %xmm3, (Y) + addq INCY, Y + movhps %xmm3, (Y) + addq INCY, Y + + decq %rax + jg .L102 + ALIGN_3 + +.L105: + testq $4, M + jle .L106 + + movsd (X), %xmm0 + addq INCX, X + movhps (X), %xmm0 + addq INCX, X + movsd (X), %xmm1 + addq INCX, X + movhps (X), %xmm1 + addq INCX, X + + movsd %xmm0, (Y) + addq INCY, Y + movhps %xmm0, (Y) + addq INCY, Y + movsd %xmm1, (Y) + addq INCY, Y + movhps %xmm1, (Y) + addq INCY, Y + ALIGN_3 + +.L106: + testq $2, M + jle .L107 + + movsd (X), %xmm0 + addq INCX, X + movhps (X), %xmm0 + addq INCX, X + + movsd %xmm0, (Y) + addq INCY, Y + movhps %xmm0, (Y) + addq INCY, Y + ALIGN_3 + +.L107: + testq $1, M + jle .L999 + + movsd (X), %xmm0 + movsd %xmm0, (Y) + ALIGN_3 + +.L999: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE + diff --git a/kernel/x86_64/zcopy_sse2.S b/kernel/x86_64/zcopy_sse2.S new file mode 100644 index 0000000..c3a99a5 --- /dev/null +++ b/kernel/x86_64/zcopy_sse2.S @@ -0,0 +1,655 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#include "l1param.h" + +#ifdef OPTERON +#define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addpd OFFSET(ADDR), REG +#else +#define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG +#endif + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY +#endif + + SAVEREGISTERS + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + cmpq $2 * SIZE, INCX + jne .L50 + cmpq $2 * SIZE, INCY + jne .L50 + + addq M, M + +#ifdef ALIGNED_ACCESS + testq $SIZE, Y +#else + testq $SIZE, X +#endif + je .L10 + + movsd (X), %xmm0 + movsd %xmm0, (Y) + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq M + jle .L19 + ALIGN_4 + +.L10: + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + +#ifdef ALIGNED_ACCESS + testq $SIZE, X +#else + testq $SIZE, Y +#endif + jne .L20 + + movq M, %rax + sarq $4, %rax + jle .L13 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + movaps -8 * SIZE(X), %xmm4 + movaps -6 * SIZE(X), %xmm5 + movaps -4 * SIZE(X), %xmm6 + movaps -2 * SIZE(X), %xmm7 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps %xmm0, -16 * SIZE(Y) + LOAD( 0 * SIZE, X, %xmm0) + movaps %xmm1, -14 * SIZE(Y) + LOAD( 2 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm2, -12 * SIZE(Y) + LOAD( 4 * SIZE, X, %xmm2) + movaps %xmm3, -10 * SIZE(Y) + LOAD( 6 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps %xmm4, -8 * SIZE(Y) + LOAD( 8 * SIZE, X, %xmm4) + movaps %xmm5, -6 * SIZE(Y) + LOAD(10 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm6, -4 * SIZE(Y) + LOAD(12 * SIZE, X, %xmm6) + movaps %xmm7, -2 * SIZE(Y) + LOAD(14 * SIZE, X, %xmm7) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + decq %rax + jg .L11 + ALIGN_3 + +.L12: + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, -12 * SIZE(Y) + movaps %xmm3, -10 * SIZE(Y) + movaps %xmm4, -8 * SIZE(Y) + movaps %xmm5, -6 * SIZE(Y) + movaps %xmm6, -4 * SIZE(Y) + movaps %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + ALIGN_3 + +.L13: + testq $8, M + jle .L14 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, -12 * SIZE(Y) + movaps %xmm3, -10 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L14: + testq $4, M + jle .L15 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L15: + testq $2, M + jle .L16 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L16: + testq $1, M + jle .L19 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L19: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +.L20: +#ifdef ALIGNED_ACCESS + + movhps -16 * SIZE(X), %xmm0 + + movq M, %rax + sarq $4, %rax + jle .L23 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + movaps -11 * SIZE(X), %xmm3 + movaps -9 * SIZE(X), %xmm4 + movaps -7 * SIZE(X), %xmm5 + movaps -5 * SIZE(X), %xmm6 + movaps -3 * SIZE(X), %xmm7 + + decq %rax + jle .L22 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm1, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + LOAD(-1 * SIZE, X, %xmm0) + + SHUFPD_1 %xmm2, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + LOAD( 1 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm3, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + LOAD( 3 * SIZE, X, %xmm2) + + SHUFPD_1 %xmm4, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + LOAD( 5 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm5, %xmm4 + movaps %xmm4, -8 * SIZE(Y) + LOAD( 7 * SIZE, X, %xmm4) + + SHUFPD_1 %xmm6, %xmm5 + movaps %xmm5, -6 * SIZE(Y) + LOAD( 9 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm7, %xmm6 + movaps %xmm6, -4 * SIZE(Y) + LOAD(11 * SIZE, X, %xmm6) + + SHUFPD_1 %xmm0, %xmm7 + movaps %xmm7, -2 * SIZE(Y) + LOAD(13 * SIZE, X, %xmm7) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + decq %rax + jg .L21 + ALIGN_3 + +.L22: + SHUFPD_1 %xmm1, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + LOAD(-1 * SIZE, X, %xmm0) + + SHUFPD_1 %xmm2, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + SHUFPD_1 %xmm3, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + SHUFPD_1 %xmm4, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + SHUFPD_1 %xmm5, %xmm4 + movaps %xmm4, -8 * SIZE(Y) + SHUFPD_1 %xmm6, %xmm5 + movaps %xmm5, -6 * SIZE(Y) + + SHUFPD_1 %xmm7, %xmm6 + movaps %xmm6, -4 * SIZE(Y) + SHUFPD_1 %xmm0, %xmm7 + movaps %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + +.L23: + testq $8, M + jle .L24 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + movaps -11 * SIZE(X), %xmm3 + movaps -9 * SIZE(X), %xmm8 + + SHUFPD_1 %xmm1, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + SHUFPD_1 %xmm2, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + SHUFPD_1 %xmm3, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + SHUFPD_1 %xmm8, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + movaps %xmm8, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L24: + testq $4, M + jle .L25 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm1, %xmm0 + SHUFPD_1 %xmm2, %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, %xmm0 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L25: + testq $2, M + jle .L26 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L26: + testq $1, M + jle .L29 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L29: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +#else + + movq M, %rax + sarq $4, %rax + jle .L23 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + movaps -8 * SIZE(X), %xmm4 + movaps -6 * SIZE(X), %xmm5 + movaps -4 * SIZE(X), %xmm6 + movaps -2 * SIZE(X), %xmm7 + + decq %rax + jle .L22 + ALIGN_3 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + LOAD( 0 * SIZE, X, %xmm0) + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + LOAD( 2 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movlps %xmm2, -12 * SIZE(Y) + movhps %xmm2, -11 * SIZE(Y) + LOAD( 4 * SIZE, X, %xmm2) + movlps %xmm3, -10 * SIZE(Y) + movhps %xmm3, -9 * SIZE(Y) + LOAD( 6 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movlps %xmm4, -8 * SIZE(Y) + movhps %xmm4, -7 * SIZE(Y) + LOAD( 8 * SIZE, X, %xmm4) + movlps %xmm5, -6 * SIZE(Y) + movhps %xmm5, -5 * SIZE(Y) + LOAD(10 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movlps %xmm6, -4 * SIZE(Y) + movhps %xmm6, -3 * SIZE(Y) + LOAD(12 * SIZE, X, %xmm6) + movlps %xmm7, -2 * SIZE(Y) + movhps %xmm7, -1 * SIZE(Y) + LOAD(14 * SIZE, X, %xmm7) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + decq %rax + jg .L21 + ALIGN_3 + +.L22: + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + movlps %xmm2, -12 * SIZE(Y) + movhps %xmm2, -11 * SIZE(Y) + movlps %xmm3, -10 * SIZE(Y) + movhps %xmm3, -9 * SIZE(Y) + movlps %xmm4, -8 * SIZE(Y) + movhps %xmm4, -7 * SIZE(Y) + movlps %xmm5, -6 * SIZE(Y) + movhps %xmm5, -5 * SIZE(Y) + movlps %xmm6, -4 * SIZE(Y) + movhps %xmm6, -3 * SIZE(Y) + movlps %xmm7, -2 * SIZE(Y) + movhps %xmm7, -1 * SIZE(Y) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + ALIGN_3 + +.L23: + testq $8, M + jle .L24 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + movaps -14 * SIZE(X), %xmm1 + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + movaps -12 * SIZE(X), %xmm2 + movlps %xmm2, -12 * SIZE(Y) + movhps %xmm2, -11 * SIZE(Y) + movaps -10 * SIZE(X), %xmm3 + movlps %xmm3, -10 * SIZE(Y) + movhps %xmm3, -9 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L24: + testq $4, M + jle .L25 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + movaps -14 * SIZE(X), %xmm1 + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L25: + testq $2, M + jle .L26 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L26: + testq $1, M + jle .L29 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L29: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +#endif + +.L50: + movq M, %rax + sarq $2, %rax + jle .L55 + ALIGN_3 + +.L51: + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + addq INCX, X + + movsd 0 * SIZE(X), %xmm1 + movhps 1 * SIZE(X), %xmm1 + addq INCX, X + + movsd 0 * SIZE(X), %xmm2 + movhps 1 * SIZE(X), %xmm2 + addq INCX, X + + movsd 0 * SIZE(X), %xmm3 + movhps 1 * SIZE(X), %xmm3 + addq INCX, X + + + movlps %xmm0, 0 * SIZE(Y) + movhps %xmm0, 1 * SIZE(Y) + addq INCY, Y + + movlps %xmm1, 0 * SIZE(Y) + movhps %xmm1, 1 * SIZE(Y) + addq INCY, Y + + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 1 * SIZE(Y) + addq INCY, Y + + movlps %xmm3, 0 * SIZE(Y) + movhps %xmm3, 1 * SIZE(Y) + addq INCY, Y + + decq %rax + jg .L51 + ALIGN_3 + +.L55: + movq M, %rax + andq $3, %rax + jle .L57 + ALIGN_3 + +.L56: + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + addq INCX, X + + movlps %xmm0, 0 * SIZE(Y) + movhps %xmm0, 1 * SIZE(Y) + addq INCY, Y + + decq %rax + jg .L56 + ALIGN_3 + +.L57: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/zdot.S b/kernel/x86_64/zdot.S new file mode 100644 index 0000000..f968347 --- /dev/null +++ b/kernel/x86_64/zdot.S @@ -0,0 +1,259 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY +#endif + + testq N, N + jle .L88 + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + fldz + fldz + fldz + fldz + + cmpq $2 * SIZE, INCX + jne .L14 + cmpq $2 * SIZE, INCY + jne .L14 + + movq N, %rax + sarq $1, %rax + jle .L15 + ALIGN_3 + +.L16: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + FLD 2 * SIZE(X) + + FLD 2 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + FLD 3 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 3 * SIZE(X) + + FLD 2 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 3 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + decq %rax + jg .L16 + ALIGN_3 + +.L15: + movq N, %rax + andq $1, %rax + jle .L27 + ALIGN_3 + +.L22: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + jmp .L27 + ALIGN_3 + +.L14: + movq N, %rax + sarq $1, %rax + jle .L30 + ALIGN_3 + + +.L31: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + addq INCX, X + + FLD 0 * SIZE(X) + addq INCY, Y + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + addq INCX, X + addq INCY, Y + + decq %rax + jg .L31 + ALIGN_3 + +.L30: + movq N, %rax + andq $1, %rax + jle .L27 + ALIGN_3 + +.L37: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + ALIGN_3 + +.L27: +#ifndef CONJ + fsubp %st, %st(3) + faddp %st, %st(1) +#else + faddp %st, %st(3) + fsubp %st, %st(1) +#endif + ret + ALIGN_3 + +.L88: + fldz + fldz + + ret + EPILOGUE diff --git a/kernel/x86_64/zdot_atom.S b/kernel/x86_64/zdot_atom.S new file mode 100644 index 0000000..9a8239c --- /dev/null +++ b/kernel/x86_64/zdot_atom.S @@ -0,0 +1,461 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY +#endif + + SAVEREGISTERS + + salq $ZBASE_SHIFT, INCX + pxor %xmm0, %xmm0 + salq $ZBASE_SHIFT, INCY + pxor %xmm1, %xmm1 + + pxor %xmm2, %xmm2 + cmpq $0, N + pxor %xmm3, %xmm3 + jle .L999 + + cmpq $2 * SIZE, INCX + jne .L20 + cmpq $2 * SIZE, INCY + jne .L20 + + movq N, %rax + sarq $2, %rax + jle .L15 + + movsd 0 * SIZE(X), %xmm4 + movsd 0 * SIZE(Y), %xmm6 + movsd 1 * SIZE(X), %xmm5 + movsd 1 * SIZE(Y), %xmm7 + + movaps %xmm4, %xmm8 + mulsd %xmm6, %xmm4 + movsd 2 * SIZE(X), %xmm10 + mulsd %xmm7, %xmm8 + movsd 2 * SIZE(Y), %xmm11 + movaps %xmm5, %xmm9 + mulsd %xmm7, %xmm5 + movsd 3 * SIZE(X), %xmm12 + mulsd %xmm6, %xmm9 + movsd 3 * SIZE(Y), %xmm13 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + addsd %xmm4, %xmm0 + movaps %xmm10, %xmm14 + mulsd %xmm11, %xmm10 + movsd 4 * SIZE(X), %xmm4 + addsd %xmm8, %xmm1 + mulsd %xmm13, %xmm14 + movsd 4 * SIZE(Y), %xmm6 + addsd %xmm5, %xmm2 + movaps %xmm12, %xmm15 + mulsd %xmm13, %xmm12 + movsd 5 * SIZE(X), %xmm5 + addsd %xmm9, %xmm3 + mulsd %xmm11, %xmm15 + movsd 5 * SIZE(Y), %xmm7 + + addsd %xmm10, %xmm0 + movaps %xmm4, %xmm8 + mulsd %xmm6, %xmm4 + movsd 6 * SIZE(X), %xmm10 + addsd %xmm14, %xmm1 + mulsd %xmm7, %xmm8 + movsd 6 * SIZE(Y), %xmm11 + addsd %xmm12, %xmm2 + movaps %xmm5, %xmm9 + mulsd %xmm7, %xmm5 + movsd 7 * SIZE(X), %xmm12 + addsd %xmm15, %xmm3 + mulsd %xmm6, %xmm9 + movsd 7 * SIZE(Y), %xmm13 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + addsd %xmm4, %xmm0 + movaps %xmm10, %xmm14 + mulsd %xmm11, %xmm10 + movsd 8 * SIZE(X), %xmm4 + addsd %xmm8, %xmm1 + mulsd %xmm13, %xmm14 + movsd 8 * SIZE(Y), %xmm6 + addsd %xmm5, %xmm2 + movaps %xmm12, %xmm15 + mulsd %xmm13, %xmm12 + movsd 9 * SIZE(X), %xmm5 + addsd %xmm9, %xmm3 + mulsd %xmm11, %xmm15 + movsd 9 * SIZE(Y), %xmm7 + + addsd %xmm10, %xmm0 + movaps %xmm4, %xmm8 + mulsd %xmm6, %xmm4 + movsd 10 * SIZE(X), %xmm10 + addsd %xmm14, %xmm1 + mulsd %xmm7, %xmm8 + movsd 10 * SIZE(Y), %xmm11 + addsd %xmm12, %xmm2 + movaps %xmm5, %xmm9 + mulsd %xmm7, %xmm5 + movsd 11 * SIZE(X), %xmm12 + addsd %xmm15, %xmm3 + mulsd %xmm6, %xmm9 + movsd 11 * SIZE(Y), %xmm13 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + + decq %rax + jg .L11 + ALIGN_3 + +.L12: + addsd %xmm4, %xmm0 + movaps %xmm10, %xmm14 + mulsd %xmm11, %xmm10 + movsd 4 * SIZE(X), %xmm4 + addsd %xmm8, %xmm1 + mulsd %xmm13, %xmm14 + movsd 4 * SIZE(Y), %xmm6 + addsd %xmm5, %xmm2 + movaps %xmm12, %xmm15 + mulsd %xmm13, %xmm12 + movsd 5 * SIZE(X), %xmm5 + addsd %xmm9, %xmm3 + mulsd %xmm11, %xmm15 + movsd 5 * SIZE(Y), %xmm7 + + addsd %xmm10, %xmm0 + movaps %xmm4, %xmm8 + mulsd %xmm6, %xmm4 + movsd 6 * SIZE(X), %xmm10 + addsd %xmm14, %xmm1 + mulsd %xmm7, %xmm8 + movsd 6 * SIZE(Y), %xmm11 + addsd %xmm12, %xmm2 + movaps %xmm5, %xmm9 + mulsd %xmm7, %xmm5 + movsd 7 * SIZE(X), %xmm12 + addsd %xmm15, %xmm3 + mulsd %xmm6, %xmm9 + movsd 7 * SIZE(Y), %xmm13 + + addsd %xmm4, %xmm0 + movaps %xmm10, %xmm14 + mulsd %xmm11, %xmm10 + addsd %xmm8, %xmm1 + mulsd %xmm13, %xmm14 + addsd %xmm5, %xmm2 + movaps %xmm12, %xmm15 + mulsd %xmm13, %xmm12 + addsd %xmm9, %xmm3 + mulsd %xmm11, %xmm15 + + addsd %xmm10, %xmm0 + addsd %xmm14, %xmm1 + addsd %xmm12, %xmm2 + addsd %xmm15, %xmm3 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L15: + movq N, %rax + andq $2, %rax + jle .L17 + + movsd 0 * SIZE(X), %xmm4 + movsd 0 * SIZE(Y), %xmm6 + movsd 1 * SIZE(X), %xmm5 + movsd 1 * SIZE(Y), %xmm7 + + movaps %xmm4, %xmm8 + mulsd %xmm6, %xmm4 + movsd 2 * SIZE(X), %xmm10 + mulsd %xmm7, %xmm8 + movsd 2 * SIZE(Y), %xmm11 + movaps %xmm5, %xmm9 + mulsd %xmm7, %xmm5 + movsd 3 * SIZE(X), %xmm12 + mulsd %xmm6, %xmm9 + movsd 3 * SIZE(Y), %xmm13 + + addsd %xmm4, %xmm0 + movaps %xmm10, %xmm14 + mulsd %xmm11, %xmm10 + addsd %xmm8, %xmm1 + mulsd %xmm13, %xmm14 + addsd %xmm5, %xmm2 + movaps %xmm12, %xmm15 + mulsd %xmm13, %xmm12 + addsd %xmm9, %xmm3 + mulsd %xmm11, %xmm15 + + addsd %xmm10, %xmm0 + addsd %xmm14, %xmm1 + addsd %xmm12, %xmm2 + addsd %xmm15, %xmm3 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L17: + movq N, %rax + andq $1, %rax + jle .L999 + + movsd 0 * SIZE(X), %xmm4 + movsd 0 * SIZE(Y), %xmm6 + movsd 1 * SIZE(X), %xmm5 + movsd 1 * SIZE(Y), %xmm7 + + movaps %xmm4, %xmm8 + mulsd %xmm6, %xmm4 + mulsd %xmm7, %xmm8 + movaps %xmm5, %xmm9 + mulsd %xmm7, %xmm5 + mulsd %xmm6, %xmm9 + + addsd %xmm4, %xmm0 + addsd %xmm8, %xmm1 + addsd %xmm5, %xmm2 + addsd %xmm9, %xmm3 + jmp .L999 + ALIGN_3 + +.L20: + movq N, %rax + sarq $2, %rax + jle .L25 + ALIGN_3 + +.L23: + movsd 0 * SIZE(X), %xmm4 + movsd 0 * SIZE(Y), %xmm6 + movsd 1 * SIZE(X), %xmm5 + movsd 1 * SIZE(Y), %xmm7 + + movaps %xmm4, %xmm8 + mulsd %xmm6, %xmm4 + mulsd %xmm7, %xmm8 + movaps %xmm5, %xmm9 + mulsd %xmm7, %xmm5 + addq INCX, X + mulsd %xmm6, %xmm9 + addq INCY, Y + + addsd %xmm4, %xmm0 + movsd 0 * SIZE(X), %xmm4 + addsd %xmm8, %xmm1 + movsd 0 * SIZE(Y), %xmm6 + addsd %xmm5, %xmm2 + movsd 1 * SIZE(X), %xmm5 + addsd %xmm9, %xmm3 + movsd 1 * SIZE(Y), %xmm7 + + movaps %xmm4, %xmm8 + mulsd %xmm6, %xmm4 + mulsd %xmm7, %xmm8 + movaps %xmm5, %xmm9 + mulsd %xmm7, %xmm5 + addq INCX, X + mulsd %xmm6, %xmm9 + addq INCY, Y + + addsd %xmm4, %xmm0 + movsd 0 * SIZE(X), %xmm4 + addsd %xmm8, %xmm1 + movsd 0 * SIZE(Y), %xmm6 + addsd %xmm5, %xmm2 + movsd 1 * SIZE(X), %xmm5 + addsd %xmm9, %xmm3 + movsd 1 * SIZE(Y), %xmm7 + + movaps %xmm4, %xmm8 + mulsd %xmm6, %xmm4 + mulsd %xmm7, %xmm8 + movaps %xmm5, %xmm9 + mulsd %xmm7, %xmm5 + addq INCX, X + mulsd %xmm6, %xmm9 + addq INCY, Y + + addsd %xmm4, %xmm0 + movsd 0 * SIZE(X), %xmm4 + addsd %xmm8, %xmm1 + movsd 0 * SIZE(Y), %xmm6 + addsd %xmm5, %xmm2 + movsd 1 * SIZE(X), %xmm5 + addsd %xmm9, %xmm3 + movsd 1 * SIZE(Y), %xmm7 + + movaps %xmm4, %xmm8 + mulsd %xmm6, %xmm4 + mulsd %xmm7, %xmm8 + movaps %xmm5, %xmm9 + mulsd %xmm7, %xmm5 + addq INCX, X + mulsd %xmm6, %xmm9 + addq INCY, Y + + addsd %xmm4, %xmm0 + addsd %xmm8, %xmm1 + addsd %xmm5, %xmm2 + addsd %xmm9, %xmm3 + + decq %rax + jg .L23 + ALIGN_3 + +.L25: + testq $3, N + je .L999 + + movq N, %rax + andq $2, %rax + jle .L27 + + movsd 0 * SIZE(X), %xmm4 + movsd 0 * SIZE(Y), %xmm6 + movsd 1 * SIZE(X), %xmm5 + movsd 1 * SIZE(Y), %xmm7 + + movaps %xmm4, %xmm8 + mulsd %xmm6, %xmm4 + mulsd %xmm7, %xmm8 + movaps %xmm5, %xmm9 + mulsd %xmm7, %xmm5 + addq INCX, X + mulsd %xmm6, %xmm9 + addq INCY, Y + + addsd %xmm4, %xmm0 + movsd 0 * SIZE(X), %xmm4 + addsd %xmm8, %xmm1 + movsd 0 * SIZE(Y), %xmm6 + addsd %xmm5, %xmm2 + movsd 1 * SIZE(X), %xmm5 + addsd %xmm9, %xmm3 + movsd 1 * SIZE(Y), %xmm7 + + movaps %xmm4, %xmm8 + mulsd %xmm6, %xmm4 + mulsd %xmm7, %xmm8 + movaps %xmm5, %xmm9 + mulsd %xmm7, %xmm5 + addq INCX, X + mulsd %xmm6, %xmm9 + addq INCY, Y + + addsd %xmm4, %xmm0 + addsd %xmm8, %xmm1 + addsd %xmm5, %xmm2 + addsd %xmm9, %xmm3 + + ALIGN_3 + +.L27: + movq N, %rax + andq $1, %rax + jle .L999 + + movsd 0 * SIZE(X), %xmm4 + movsd 0 * SIZE(Y), %xmm6 + movsd 1 * SIZE(X), %xmm5 + movsd 1 * SIZE(Y), %xmm7 + + movaps %xmm4, %xmm8 + mulsd %xmm6, %xmm4 + mulsd %xmm7, %xmm8 + movaps %xmm5, %xmm9 + mulsd %xmm7, %xmm5 + mulsd %xmm6, %xmm9 + + addsd %xmm4, %xmm0 + addsd %xmm8, %xmm1 + addsd %xmm5, %xmm2 + addsd %xmm9, %xmm3 + ALIGN_3 + +.L999: +#ifndef CONJ + subsd %xmm2, %xmm0 + addsd %xmm3, %xmm1 +#else + addsd %xmm2, %xmm0 + subsd %xmm3, %xmm1 +#endif + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/zdot_sse.S b/kernel/x86_64/zdot_sse.S new file mode 100644 index 0000000..3302b90 --- /dev/null +++ b/kernel/x86_64/zdot_sse.S @@ -0,0 +1,3492 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY +#endif + + SAVEREGISTERS + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + testq N, N + jle .L999 + + cmpq $2 * SIZE, INCX + jne .L200 + cmpq $2 * SIZE, INCY + jne .L200 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + testq $SIZE, X + jne .L50 + +.L0x: + testq $2 * SIZE, X + je .L10 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 + movsd -32 * SIZE(Y), %xmm0 + + pshufd $0xb1, %xmm0, %xmm1 + mulps %xmm4, %xmm0 + mulps %xmm4, %xmm1 + addq $2 * SIZE, X + addq $2 * SIZE, Y + decq N + ALIGN_3 + +.L10: + testq $3 * SIZE, Y + jne .L20 + + movq N, %rax + sarq $4, %rax + jle .L15 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm8 + movaps -28 * SIZE(Y), %xmm9 + movaps -24 * SIZE(X), %xmm6 + movaps -20 * SIZE(X), %xmm7 + movaps -24 * SIZE(Y), %xmm10 + movaps -20 * SIZE(Y), %xmm11 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movaps -16 * SIZE(Y), %xmm8 + mulps %xmm4, %xmm12 + movaps -16 * SIZE(X), %xmm4 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movaps -12 * SIZE(Y), %xmm9 + mulps %xmm5, %xmm12 + movaps -12 * SIZE(X), %xmm5 + addps %xmm12, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movaps -8 * SIZE(Y), %xmm10 + mulps %xmm6, %xmm12 + movaps -8 * SIZE(X), %xmm6 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movaps -4 * SIZE(Y), %xmm11 + mulps %xmm7, %xmm12 + movaps -4 * SIZE(X), %xmm7 + addps %xmm12, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movaps 0 * SIZE(Y), %xmm8 + mulps %xmm4, %xmm12 + movaps 0 * SIZE(X), %xmm4 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movaps 4 * SIZE(Y), %xmm9 + mulps %xmm5, %xmm12 + movaps 4 * SIZE(X), %xmm5 + addps %xmm12, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movaps 8 * SIZE(Y), %xmm10 + mulps %xmm6, %xmm12 + movaps 8 * SIZE(X), %xmm6 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movaps 12 * SIZE(Y), %xmm11 + mulps %xmm7, %xmm12 + movaps 12 * SIZE(X), %xmm7 + addps %xmm12, %xmm3 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L11 + ALIGN_3 + +.L12: + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movaps -16 * SIZE(Y), %xmm8 + mulps %xmm4, %xmm12 + movaps -16 * SIZE(X), %xmm4 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movaps -12 * SIZE(Y), %xmm9 + mulps %xmm5, %xmm12 + movaps -12 * SIZE(X), %xmm5 + addps %xmm12, %xmm3 + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movaps -8 * SIZE(Y), %xmm10 + mulps %xmm6, %xmm12 + movaps -8 * SIZE(X), %xmm6 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movaps -4 * SIZE(Y), %xmm11 + mulps %xmm7, %xmm12 + movaps -4 * SIZE(X), %xmm7 + addps %xmm12, %xmm3 + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm6, %xmm12 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + mulps %xmm7, %xmm12 + addps %xmm12, %xmm3 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L15: + testq $8, N + jle .L16 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm8 + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + movaps -28 * SIZE(X), %xmm5 + movaps -28 * SIZE(Y), %xmm9 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + + movaps -24 * SIZE(X), %xmm6 + movaps -24 * SIZE(Y), %xmm10 + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm6, %xmm12 + addps %xmm12, %xmm1 + + movaps -20 * SIZE(X), %xmm7 + movaps -20 * SIZE(Y), %xmm11 + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + mulps %xmm7, %xmm12 + addps %xmm12, %xmm3 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L16: + testq $4, N + jle .L17 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm8 + movaps -28 * SIZE(X), %xmm5 + movaps -28 * SIZE(Y), %xmm9 + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L17: + testq $2, N + jle .L18 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm8 + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L18: + testq $1, N + jle .L98 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(Y), %xmm8 + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + jmp .L98 + ALIGN_3 + +.L20: +#ifdef ALIGNED_ACCESS + + testq $2 * SIZE, Y + jne .L30 + + movaps -33 * SIZE(Y), %xmm8 + addq $3 * SIZE, Y + + shufps $0xb1, %xmm1, %xmm1 + + movq N, %rax + sarq $4, %rax + jle .L25 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm9 + movaps -28 * SIZE(X), %xmm5 + movaps -28 * SIZE(Y), %xmm10 + movaps -24 * SIZE(X), %xmm6 + movaps -24 * SIZE(Y), %xmm11 + movaps -20 * SIZE(X), %xmm7 + + decq %rax + jle .L22 + ALIGN_3 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + mulps %xmm8, %xmm12 + movaps -20 * SIZE(Y), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x39, %xmm9, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(X), %xmm5 + mulps %xmm9, %xmm12 + movaps -16 * SIZE(Y), %xmm9 + addps %xmm12, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x39, %xmm10, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(X), %xmm6 + mulps %xmm10, %xmm12 + movaps -12 * SIZE(Y), %xmm10 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x39, %xmm11, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps -4 * SIZE(X), %xmm7 + mulps %xmm11, %xmm12 + movaps -8 * SIZE(Y), %xmm11 + addps %xmm12, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + mulps %xmm8, %xmm12 + movaps -4 * SIZE(Y), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x39, %xmm9, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps 4 * SIZE(X), %xmm5 + mulps %xmm9, %xmm12 + movaps 0 * SIZE(Y), %xmm9 + addps %xmm12, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x39, %xmm10, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps 8 * SIZE(X), %xmm6 + mulps %xmm10, %xmm12 + movaps 4 * SIZE(Y), %xmm10 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x39, %xmm11, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps 12 * SIZE(X), %xmm7 + mulps %xmm11, %xmm12 + movaps 8 * SIZE(Y), %xmm11 + addps %xmm12, %xmm1 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L21 + ALIGN_3 + +.L22: + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + mulps %xmm8, %xmm12 + movaps -20 * SIZE(Y), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x39, %xmm9, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(X), %xmm5 + mulps %xmm9, %xmm12 + movaps -16 * SIZE(Y), %xmm9 + addps %xmm12, %xmm1 + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x39, %xmm10, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(X), %xmm6 + mulps %xmm10, %xmm12 + movaps -12 * SIZE(Y), %xmm10 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x39, %xmm11, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps -4 * SIZE(X), %xmm7 + mulps %xmm11, %xmm12 + movaps -8 * SIZE(Y), %xmm11 + addps %xmm12, %xmm1 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + movaps -4 * SIZE(Y), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x39, %xmm9, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x39, %xmm10, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm10, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x39, %xmm11, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm11, %xmm12 + addps %xmm12, %xmm1 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L25: + testq $8, N + jle .L26 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm9 + movaps -28 * SIZE(X), %xmm5 + movaps -28 * SIZE(Y), %xmm10 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps -24 * SIZE(X), %xmm6 + movaps -24 * SIZE(Y), %xmm11 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x39, %xmm9, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + movaps -20 * SIZE(X), %xmm7 + movaps -20 * SIZE(Y), %xmm8 + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x39, %xmm10, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm10, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x39, %xmm11, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm11, %xmm12 + addps %xmm12, %xmm1 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L26: + testq $4, N + jle .L27 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps -28 * SIZE(X), %xmm5 + movaps -28 * SIZE(Y), %xmm10 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x39, %xmm9, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + movaps %xmm10, %xmm8 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L27: + testq $2, N + jle .L28 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps %xmm9, %xmm8 + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L28: + testq $1, N + jle .L29 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 + + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + ALIGN_3 + +.L29: + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + jmp .L98 + ALIGN_3 + +.L30: + + testq $SIZE, Y + jne .L40 +#endif + + movq N, %rax + sarq $4, %rax + jle .L35 + + movaps -32 * SIZE(X), %xmm4 + movsd -32 * SIZE(Y), %xmm8 + movhps -30 * SIZE(Y), %xmm8 + movaps -28 * SIZE(X), %xmm5 + movsd -28 * SIZE(Y), %xmm9 + movhps -26 * SIZE(Y), %xmm9 + + movaps -24 * SIZE(X), %xmm6 + movsd -24 * SIZE(Y), %xmm10 + movhps -22 * SIZE(Y), %xmm10 + movaps -20 * SIZE(X), %xmm7 + movsd -20 * SIZE(Y), %xmm11 + movhps -18 * SIZE(Y), %xmm11 + + decq %rax + jle .L32 + ALIGN_3 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movsd -16 * SIZE(Y), %xmm8 + movhps -14 * SIZE(Y), %xmm8 + mulps %xmm4, %xmm12 + movaps -16 * SIZE(X), %xmm4 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movsd -12 * SIZE(Y), %xmm9 + movhps -10 * SIZE(Y), %xmm9 + mulps %xmm5, %xmm12 + movaps -12 * SIZE(X), %xmm5 + addps %xmm12, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movsd -8 * SIZE(Y), %xmm10 + movhps -6 * SIZE(Y), %xmm10 + mulps %xmm6, %xmm12 + movaps -8 * SIZE(X), %xmm6 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movsd -4 * SIZE(Y), %xmm11 + movhps -2 * SIZE(Y), %xmm11 + mulps %xmm7, %xmm12 + movaps -4 * SIZE(X), %xmm7 + addps %xmm12, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movsd 0 * SIZE(Y), %xmm8 + movhps 2 * SIZE(Y), %xmm8 + mulps %xmm4, %xmm12 + movaps 0 * SIZE(X), %xmm4 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movsd 4 * SIZE(Y), %xmm9 + movhps 6 * SIZE(Y), %xmm9 + mulps %xmm5, %xmm12 + movaps 4 * SIZE(X), %xmm5 + addps %xmm12, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movsd 8 * SIZE(Y), %xmm10 + movhps 10 * SIZE(Y), %xmm10 + mulps %xmm6, %xmm12 + movaps 8 * SIZE(X), %xmm6 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movsd 12 * SIZE(Y), %xmm11 + movhps 14 * SIZE(Y), %xmm11 + mulps %xmm7, %xmm12 + movaps 12 * SIZE(X), %xmm7 + addps %xmm12, %xmm3 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L31 + ALIGN_3 + +.L32: + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movsd -16 * SIZE(Y), %xmm8 + movhps -14 * SIZE(Y), %xmm8 + mulps %xmm4, %xmm12 + movaps -16 * SIZE(X), %xmm4 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movsd -12 * SIZE(Y), %xmm9 + movhps -10 * SIZE(Y), %xmm9 + mulps %xmm5, %xmm12 + movaps -12 * SIZE(X), %xmm5 + addps %xmm12, %xmm3 + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movsd -8 * SIZE(Y), %xmm10 + movhps -6 * SIZE(Y), %xmm10 + mulps %xmm6, %xmm12 + movaps -8 * SIZE(X), %xmm6 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movsd -4 * SIZE(Y), %xmm11 + movhps -2 * SIZE(Y), %xmm11 + mulps %xmm7, %xmm12 + movaps -4 * SIZE(X), %xmm7 + addps %xmm12, %xmm3 + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm6, %xmm12 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + mulps %xmm7, %xmm12 + addps %xmm12, %xmm3 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L35: + testq $8, N + jle .L36 + + movaps -32 * SIZE(X), %xmm4 + movsd -32 * SIZE(Y), %xmm8 + movhps -30 * SIZE(Y), %xmm8 + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + movaps -28 * SIZE(X), %xmm5 + movsd -28 * SIZE(Y), %xmm9 + movhps -26 * SIZE(Y), %xmm9 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + + movaps -24 * SIZE(X), %xmm6 + movsd -24 * SIZE(Y), %xmm10 + movhps -22 * SIZE(Y), %xmm10 + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm6, %xmm12 + addps %xmm12, %xmm1 + + movaps -20 * SIZE(X), %xmm7 + movsd -20 * SIZE(Y), %xmm11 + movhps -18 * SIZE(Y), %xmm11 + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + mulps %xmm7, %xmm12 + addps %xmm12, %xmm3 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L36: + testq $4, N + jle .L37 + + movaps -32 * SIZE(X), %xmm4 + movsd -32 * SIZE(Y), %xmm8 + movhps -30 * SIZE(Y), %xmm8 + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + movaps -28 * SIZE(X), %xmm5 + movsd -28 * SIZE(Y), %xmm9 + movhps -26 * SIZE(Y), %xmm9 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L37: + testq $2, N + jle .L38 + + movaps -32 * SIZE(X), %xmm4 + movsd -32 * SIZE(Y), %xmm8 + movhps -30 * SIZE(Y), %xmm8 + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L38: + testq $1, N + jle .L98 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(Y), %xmm8 + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + jmp .L98 + ALIGN_3 + +#ifdef ALIGNED_ACCESS +.L40: + movaps -35 * SIZE(Y), %xmm8 + addq $1 * SIZE, Y + + shufps $0xb1, %xmm1, %xmm1 + + movq N, %rax + sarq $4, %rax + jle .L45 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm9 + movaps -28 * SIZE(X), %xmm5 + movaps -28 * SIZE(Y), %xmm10 + movaps -24 * SIZE(X), %xmm6 + movaps -24 * SIZE(Y), %xmm11 + movaps -20 * SIZE(X), %xmm7 + + decq %rax + jle .L42 + ALIGN_3 + +.L41: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm9, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + mulps %xmm8, %xmm12 + movaps -20 * SIZE(Y), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x93, %xmm10, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(X), %xmm5 + mulps %xmm9, %xmm12 + movaps -16 * SIZE(Y), %xmm9 + addps %xmm12, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x93, %xmm11, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(X), %xmm6 + mulps %xmm10, %xmm12 + movaps -12 * SIZE(Y), %xmm10 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x93, %xmm8, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps -4 * SIZE(X), %xmm7 + mulps %xmm11, %xmm12 + movaps -8 * SIZE(Y), %xmm11 + addps %xmm12, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm9, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + mulps %xmm8, %xmm12 + movaps -4 * SIZE(Y), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x93, %xmm10, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps 4 * SIZE(X), %xmm5 + mulps %xmm9, %xmm12 + movaps 0 * SIZE(Y), %xmm9 + addps %xmm12, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x93, %xmm11, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps 8 * SIZE(X), %xmm6 + mulps %xmm10, %xmm12 + movaps 4 * SIZE(Y), %xmm10 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x93, %xmm8, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps 12 * SIZE(X), %xmm7 + mulps %xmm11, %xmm12 + movaps 8 * SIZE(Y), %xmm11 + addps %xmm12, %xmm1 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L41 + ALIGN_3 + +.L42: + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm9, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + mulps %xmm8, %xmm12 + movaps -20 * SIZE(Y), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x93, %xmm10, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(X), %xmm5 + mulps %xmm9, %xmm12 + movaps -16 * SIZE(Y), %xmm9 + addps %xmm12, %xmm1 + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x93, %xmm11, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(X), %xmm6 + mulps %xmm10, %xmm12 + movaps -12 * SIZE(Y), %xmm10 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x93, %xmm8, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps -4 * SIZE(X), %xmm7 + mulps %xmm11, %xmm12 + movaps -8 * SIZE(Y), %xmm11 + addps %xmm12, %xmm1 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm9, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + movaps -4 * SIZE(Y), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x93, %xmm10, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x93, %xmm11, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm10, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x93, %xmm8, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm11, %xmm12 + addps %xmm12, %xmm1 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L45: + testq $8, N + jle .L46 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm9 + movaps -28 * SIZE(X), %xmm5 + movaps -28 * SIZE(Y), %xmm10 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm9, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps -24 * SIZE(X), %xmm6 + movaps -24 * SIZE(Y), %xmm11 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x93, %xmm10, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + movaps -20 * SIZE(X), %xmm7 + movaps -20 * SIZE(Y), %xmm8 + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x93, %xmm11, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm10, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x93, %xmm8, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm11, %xmm12 + addps %xmm12, %xmm1 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L46: + testq $4, N + jle .L47 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm9, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps -28 * SIZE(X), %xmm5 + movaps -28 * SIZE(Y), %xmm10 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x93, %xmm10, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + movaps %xmm10, %xmm8 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L47: + testq $2, N + jle .L48 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm9, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps %xmm9, %xmm8 + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L48: + testq $1, N + jle .L49 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 + movss -32 * SIZE(Y), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + ALIGN_3 + +.L49: + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + jmp .L98 + ALIGN_3 +#endif + +.L50: + testq $SIZE, Y + jne .L70 + +#ifdef ALIGNED_ACCESS + + testq $2 * SIZE, Y + je .L50x + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(X), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(Y), %xmm4 + + pshufd $0xb1, %xmm0, %xmm1 + mulps %xmm4, %xmm0 + mulps %xmm4, %xmm1 + addq $2 * SIZE, X + addq $2 * SIZE, Y + + decq N + ALIGN_3 + +.L50x: + testq $2 * SIZE, X + jne .L60 + + movaps -33 * SIZE(X), %xmm8 + addq $3 * SIZE, X + + shufps $0xb1, %xmm1, %xmm1 + + movq N, %rax + sarq $4, %rax + jle .L55 + + movaps -32 * SIZE(Y), %xmm4 + movaps -32 * SIZE(X), %xmm9 + movaps -28 * SIZE(Y), %xmm5 + movaps -28 * SIZE(X), %xmm10 + movaps -24 * SIZE(Y), %xmm6 + movaps -24 * SIZE(X), %xmm11 + movaps -20 * SIZE(Y), %xmm7 + + decq %rax + jle .L52 + ALIGN_3 + +.L51: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(Y), %xmm4 + mulps %xmm8, %xmm12 + movaps -20 * SIZE(X), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x39, %xmm9, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(Y), %xmm5 + mulps %xmm9, %xmm12 + movaps -16 * SIZE(X), %xmm9 + addps %xmm12, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x39, %xmm10, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(Y), %xmm6 + mulps %xmm10, %xmm12 + movaps -12 * SIZE(X), %xmm10 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x39, %xmm11, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps -4 * SIZE(Y), %xmm7 + mulps %xmm11, %xmm12 + movaps -8 * SIZE(X), %xmm11 + addps %xmm12, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(Y), %xmm4 + mulps %xmm8, %xmm12 + movaps -4 * SIZE(X), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x39, %xmm9, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps 4 * SIZE(Y), %xmm5 + mulps %xmm9, %xmm12 + movaps 0 * SIZE(X), %xmm9 + addps %xmm12, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x39, %xmm10, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps 8 * SIZE(Y), %xmm6 + mulps %xmm10, %xmm12 + movaps 4 * SIZE(X), %xmm10 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x39, %xmm11, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps 12 * SIZE(Y), %xmm7 + mulps %xmm11, %xmm12 + movaps 8 * SIZE(X), %xmm11 + addps %xmm12, %xmm1 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L51 + ALIGN_3 + +.L52: + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(Y), %xmm4 + mulps %xmm8, %xmm12 + movaps -20 * SIZE(X), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x39, %xmm9, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(Y), %xmm5 + mulps %xmm9, %xmm12 + movaps -16 * SIZE(X), %xmm9 + addps %xmm12, %xmm1 + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x39, %xmm10, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(Y), %xmm6 + mulps %xmm10, %xmm12 + movaps -12 * SIZE(X), %xmm10 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x39, %xmm11, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps -4 * SIZE(Y), %xmm7 + mulps %xmm11, %xmm12 + movaps -8 * SIZE(X), %xmm11 + addps %xmm12, %xmm1 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + movaps -4 * SIZE(X), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x39, %xmm9, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x39, %xmm10, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm10, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x39, %xmm11, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm11, %xmm12 + addps %xmm12, %xmm1 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L55: + testq $8, N + jle .L56 + + movaps -32 * SIZE(Y), %xmm4 + movaps -32 * SIZE(X), %xmm9 + movaps -28 * SIZE(Y), %xmm5 + movaps -28 * SIZE(X), %xmm10 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps -24 * SIZE(Y), %xmm6 + movaps -24 * SIZE(X), %xmm11 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x39, %xmm9, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + movaps -20 * SIZE(Y), %xmm7 + movaps -20 * SIZE(X), %xmm8 + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x39, %xmm10, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm10, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x39, %xmm11, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm11, %xmm12 + addps %xmm12, %xmm1 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L56: + testq $4, N + jle .L57 + + movaps -32 * SIZE(Y), %xmm4 + movaps -32 * SIZE(X), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps -28 * SIZE(Y), %xmm5 + movaps -28 * SIZE(X), %xmm10 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x39, %xmm9, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + movaps %xmm10, %xmm8 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L57: + testq $2, N + jle .L58 + + movaps -32 * SIZE(Y), %xmm4 + movaps -32 * SIZE(X), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps %xmm9, %xmm8 + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L58: + testq $1, N + jle .L98 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(Y), %xmm4 + + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + jmp .L98 + ALIGN_3 + +.L60: + movaps -35 * SIZE(X), %xmm8 + addq $1 * SIZE, X + + shufps $0xb1, %xmm1, %xmm1 + + movq N, %rax + sarq $4, %rax + jle .L65 + + movaps -32 * SIZE(Y), %xmm4 + movaps -32 * SIZE(X), %xmm9 + movaps -28 * SIZE(Y), %xmm5 + movaps -28 * SIZE(X), %xmm10 + movaps -24 * SIZE(Y), %xmm6 + movaps -24 * SIZE(X), %xmm11 + movaps -20 * SIZE(Y), %xmm7 + + decq %rax + jle .L62 + ALIGN_3 + +.L61: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm9, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(Y), %xmm4 + mulps %xmm8, %xmm12 + movaps -20 * SIZE(X), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x93, %xmm10, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(Y), %xmm5 + mulps %xmm9, %xmm12 + movaps -16 * SIZE(X), %xmm9 + addps %xmm12, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x93, %xmm11, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(Y), %xmm6 + mulps %xmm10, %xmm12 + movaps -12 * SIZE(X), %xmm10 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x93, %xmm8, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps -4 * SIZE(Y), %xmm7 + mulps %xmm11, %xmm12 + movaps -8 * SIZE(X), %xmm11 + addps %xmm12, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm9, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(Y), %xmm4 + mulps %xmm8, %xmm12 + movaps -4 * SIZE(X), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x93, %xmm10, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps 4 * SIZE(Y), %xmm5 + mulps %xmm9, %xmm12 + movaps 0 * SIZE(X), %xmm9 + addps %xmm12, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x93, %xmm11, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps 8 * SIZE(Y), %xmm6 + mulps %xmm10, %xmm12 + movaps 4 * SIZE(X), %xmm10 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x93, %xmm8, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps 12 * SIZE(Y), %xmm7 + mulps %xmm11, %xmm12 + movaps 8 * SIZE(X), %xmm11 + addps %xmm12, %xmm1 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L61 + ALIGN_3 + +.L62: + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm9, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(Y), %xmm4 + mulps %xmm8, %xmm12 + movaps -20 * SIZE(X), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x93, %xmm10, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(Y), %xmm5 + mulps %xmm9, %xmm12 + movaps -16 * SIZE(X), %xmm9 + addps %xmm12, %xmm1 + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x93, %xmm11, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(Y), %xmm6 + mulps %xmm10, %xmm12 + movaps -12 * SIZE(X), %xmm10 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x93, %xmm8, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps -4 * SIZE(Y), %xmm7 + mulps %xmm11, %xmm12 + movaps -8 * SIZE(X), %xmm11 + addps %xmm12, %xmm1 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm9, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + movaps -4 * SIZE(X), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x93, %xmm10, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x93, %xmm11, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm10, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x93, %xmm8, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm11, %xmm12 + addps %xmm12, %xmm1 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L65: + testq $8, N + jle .L66 + + movaps -32 * SIZE(Y), %xmm4 + movaps -32 * SIZE(X), %xmm9 + movaps -28 * SIZE(Y), %xmm5 + movaps -28 * SIZE(X), %xmm10 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm9, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps -24 * SIZE(Y), %xmm6 + movaps -24 * SIZE(X), %xmm11 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x93, %xmm10, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + movaps -20 * SIZE(Y), %xmm7 + movaps -20 * SIZE(X), %xmm8 + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x93, %xmm11, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm10, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x93, %xmm8, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm11, %xmm12 + addps %xmm12, %xmm1 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L66: + testq $4, N + jle .L67 + + movaps -32 * SIZE(Y), %xmm4 + movaps -32 * SIZE(X), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm9, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps -28 * SIZE(Y), %xmm5 + movaps -28 * SIZE(X), %xmm10 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x93, %xmm10, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + movaps %xmm10, %xmm8 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L67: + testq $2, N + jle .L68 + + movaps -32 * SIZE(Y), %xmm4 + movaps -32 * SIZE(X), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm9, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps %xmm9, %xmm8 + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L68: + testq $1, N + jle .L98 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(Y), %xmm4 + movss -32 * SIZE(X), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + jmp .L98 + ALIGN_3 + +#else + + testq $2 * SIZE, Y + je .L50x + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(Y), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 + + pshufd $0xb1, %xmm0, %xmm1 + mulps %xmm4, %xmm0 + mulps %xmm4, %xmm1 + addq $2 * SIZE, X + addq $2 * SIZE, Y + + decq N + ALIGN_3 + +.L50x: + movq N, %rax + sarq $4, %rax + jle .L55 + + movaps -32 * SIZE(Y), %xmm4 + movlps -32 * SIZE(X), %xmm8 + movhps -30 * SIZE(X), %xmm8 + movaps -28 * SIZE(Y), %xmm5 + movlps -28 * SIZE(X), %xmm9 + movhps -26 * SIZE(X), %xmm9 + + movaps -24 * SIZE(Y), %xmm6 + movlps -24 * SIZE(X), %xmm10 + movhps -22 * SIZE(X), %xmm10 + movaps -20 * SIZE(Y), %xmm7 + movlps -20 * SIZE(X), %xmm11 + movhps -18 * SIZE(X), %xmm11 + + decq %rax + jle .L52 + ALIGN_3 + +.L51: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0xb1, %xmm4, %xmm12 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(Y), %xmm4 + mulps %xmm8, %xmm12 + movlps -16 * SIZE(X), %xmm8 + movhps -14 * SIZE(X), %xmm8 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm5, %xmm12 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(Y), %xmm5 + mulps %xmm9, %xmm12 + movlps -12 * SIZE(X), %xmm9 + movhps -10 * SIZE(X), %xmm9 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm6, %xmm12 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(Y), %xmm6 + mulps %xmm10, %xmm12 + movlps -8 * SIZE(X), %xmm10 + movhps -6 * SIZE(X), %xmm10 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm7, %xmm12 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps -4 * SIZE(Y), %xmm7 + mulps %xmm11, %xmm12 + movlps -4 * SIZE(X), %xmm11 + movhps -2 * SIZE(X), %xmm11 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm4, %xmm12 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(Y), %xmm4 + mulps %xmm8, %xmm12 + movlps 0 * SIZE(X), %xmm8 + movhps 2 * SIZE(X), %xmm8 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm5, %xmm12 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps 4 * SIZE(Y), %xmm5 + mulps %xmm9, %xmm12 + movlps 4 * SIZE(X), %xmm9 + movhps 6 * SIZE(X), %xmm9 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm6, %xmm12 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps 8 * SIZE(Y), %xmm6 + mulps %xmm10, %xmm12 + movlps 8 * SIZE(X), %xmm10 + movhps 10 * SIZE(X), %xmm10 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm7, %xmm12 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps 12 * SIZE(Y), %xmm7 + mulps %xmm11, %xmm12 + movlps 12 * SIZE(X), %xmm11 + movhps 14 * SIZE(X), %xmm11 + addps %xmm12, %xmm1 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L51 + ALIGN_3 + +.L52: + pshufd $0xb1, %xmm4, %xmm12 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(Y), %xmm4 + mulps %xmm8, %xmm12 + movlps -16 * SIZE(X), %xmm8 + movhps -14 * SIZE(X), %xmm8 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm5, %xmm12 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(Y), %xmm5 + mulps %xmm9, %xmm12 + movlps -12 * SIZE(X), %xmm9 + movhps -10 * SIZE(X), %xmm9 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm6, %xmm12 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(Y), %xmm6 + mulps %xmm10, %xmm12 + movlps -8 * SIZE(X), %xmm10 + movhps -6 * SIZE(X), %xmm10 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm7, %xmm12 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps -4 * SIZE(Y), %xmm7 + mulps %xmm11, %xmm12 + movlps -4 * SIZE(X), %xmm11 + movhps -2 * SIZE(X), %xmm11 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm4, %xmm12 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm5, %xmm12 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm6, %xmm12 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm10, %xmm12 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm7, %xmm12 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm11, %xmm12 + addps %xmm12, %xmm1 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L55: + testq $8, N + jle .L56 + + movaps -32 * SIZE(Y), %xmm4 + movlps -32 * SIZE(X), %xmm8 + movhps -30 * SIZE(X), %xmm8 + + movaps -28 * SIZE(Y), %xmm5 + movlps -28 * SIZE(X), %xmm9 + movhps -26 * SIZE(X), %xmm9 + + pshufd $0xb1, %xmm4, %xmm12 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps -24 * SIZE(Y), %xmm6 + movlps -24 * SIZE(X), %xmm10 + movhps -22 * SIZE(X), %xmm10 + + pshufd $0xb1, %xmm5, %xmm12 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + movaps -20 * SIZE(Y), %xmm7 + movlps -20 * SIZE(X), %xmm11 + movhps -18 * SIZE(X), %xmm11 + + pshufd $0xb1, %xmm6, %xmm12 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm10, %xmm12 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm7, %xmm12 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm11, %xmm12 + addps %xmm12, %xmm1 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L56: + testq $4, N + jle .L57 + + movaps -32 * SIZE(Y), %xmm4 + movlps -32 * SIZE(X), %xmm8 + movhps -30 * SIZE(X), %xmm8 + + pshufd $0xb1, %xmm4, %xmm12 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps -28 * SIZE(Y), %xmm5 + movlps -28 * SIZE(X), %xmm9 + movhps -26 * SIZE(X), %xmm9 + + pshufd $0xb1, %xmm5, %xmm12 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L57: + testq $2, N + jle .L58 + + movaps -32 * SIZE(Y), %xmm4 + movlps -32 * SIZE(X), %xmm8 + movhps -30 * SIZE(X), %xmm8 + + pshufd $0xb1, %xmm4, %xmm12 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps %xmm9, %xmm8 + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L58: + testq $1, N + jle .L98 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(Y), %xmm4 +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(X), %xmm8 + + pshufd $0xb1, %xmm4, %xmm12 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + jmp .L98 + ALIGN_3 +#endif + +.L70: + testq $2 * SIZE, Y + je .L70x + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 + addq $2 * SIZE, X +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(Y), %xmm1 + addq $2 * SIZE, Y + + pshufd $0xb1, %xmm1, %xmm0 + shufps $0xb1, %xmm4, %xmm4 + + mulps %xmm4, %xmm0 + mulps %xmm4, %xmm1 + decq N + ALIGN_3 + +.L70x: + testq $2 * SIZE, X + jne .L80 + + movaps -33 * SIZE(X), %xmm4 + addq $3 * SIZE, X + movaps -33 * SIZE(Y), %xmm8 + addq $3 * SIZE, Y + + movq N, %rax + sarq $4, %rax + jle .L75 + + movaps -32 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm9 + movaps -28 * SIZE(X), %xmm6 + movaps -28 * SIZE(Y), %xmm10 + movaps -24 * SIZE(X), %xmm7 + movaps -24 * SIZE(Y), %xmm11 + + decq %rax + jle .L72 + ALIGN_3 + +.L71: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm9, %xmm8 + pshufd $0x1b, %xmm8, %xmm12 + movss %xmm5, %xmm4 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movaps -20 * SIZE(Y), %xmm8 + mulps %xmm4, %xmm12 + movaps -20 * SIZE(X), %xmm4 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0x1b, %xmm9, %xmm12 + movss %xmm6, %xmm5 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movaps -16 * SIZE(Y), %xmm9 + mulps %xmm5, %xmm12 + movaps -16 * SIZE(X), %xmm5 + addps %xmm12, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm11, %xmm10 + pshufd $0x1b, %xmm10, %xmm12 + movss %xmm7, %xmm6 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movaps -12 * SIZE(Y), %xmm10 + mulps %xmm6, %xmm12 + movaps -12 * SIZE(X), %xmm6 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0x1b, %xmm11, %xmm12 + movss %xmm4, %xmm7 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movaps -8 * SIZE(Y), %xmm11 + mulps %xmm7, %xmm12 + movaps -8 * SIZE(X), %xmm7 + addps %xmm12, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm9, %xmm8 + pshufd $0x1b, %xmm8, %xmm12 + movss %xmm5, %xmm4 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movaps -4 * SIZE(Y), %xmm8 + mulps %xmm4, %xmm12 + movaps -4 * SIZE(X), %xmm4 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0x1b, %xmm9, %xmm12 + movss %xmm6, %xmm5 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movaps 0 * SIZE(Y), %xmm9 + mulps %xmm5, %xmm12 + movaps 0 * SIZE(X), %xmm5 + addps %xmm12, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm11, %xmm10 + pshufd $0x1b, %xmm10, %xmm12 + movss %xmm7, %xmm6 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movaps 4 * SIZE(Y), %xmm10 + mulps %xmm6, %xmm12 + movaps 4 * SIZE(X), %xmm6 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0x1b, %xmm11, %xmm12 + movss %xmm4, %xmm7 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movaps 8 * SIZE(Y), %xmm11 + mulps %xmm7, %xmm12 + movaps 8 * SIZE(X), %xmm7 + addps %xmm12, %xmm3 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L71 + ALIGN_3 + +.L72: + movss %xmm9, %xmm8 + pshufd $0x1b, %xmm8, %xmm12 + movss %xmm5, %xmm4 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movaps -20 * SIZE(Y), %xmm8 + mulps %xmm4, %xmm12 + movaps -20 * SIZE(X), %xmm4 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0x1b, %xmm9, %xmm12 + movss %xmm6, %xmm5 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movaps -16 * SIZE(Y), %xmm9 + mulps %xmm5, %xmm12 + movaps -16 * SIZE(X), %xmm5 + addps %xmm12, %xmm3 + + movss %xmm11, %xmm10 + pshufd $0x1b, %xmm10, %xmm12 + movss %xmm7, %xmm6 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movaps -12 * SIZE(Y), %xmm10 + mulps %xmm6, %xmm12 + movaps -12 * SIZE(X), %xmm6 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0x1b, %xmm11, %xmm12 + movss %xmm4, %xmm7 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movaps -8 * SIZE(Y), %xmm11 + mulps %xmm7, %xmm12 + movaps -8 * SIZE(X), %xmm7 + addps %xmm12, %xmm3 + + movss %xmm9, %xmm8 + pshufd $0x1b, %xmm8, %xmm12 + movss %xmm5, %xmm4 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movaps -4 * SIZE(Y), %xmm8 + mulps %xmm4, %xmm12 + movaps -4 * SIZE(X), %xmm4 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0x1b, %xmm9, %xmm12 + movss %xmm6, %xmm5 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + + movss %xmm11, %xmm10 + pshufd $0x1b, %xmm10, %xmm12 + movss %xmm7, %xmm6 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm6, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0x1b, %xmm11, %xmm12 + movss %xmm4, %xmm7 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + mulps %xmm7, %xmm12 + addps %xmm12, %xmm3 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L75: + testq $8, N + jle .L76 + + movaps -32 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0x1b, %xmm8, %xmm12 + movss %xmm5, %xmm4 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + movaps -28 * SIZE(X), %xmm6 + movaps -28 * SIZE(Y), %xmm10 + + movss %xmm10, %xmm9 + pshufd $0x1b, %xmm9, %xmm12 + movss %xmm6, %xmm5 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + + movaps -24 * SIZE(X), %xmm7 + movaps -24 * SIZE(Y), %xmm11 + + movss %xmm11, %xmm10 + pshufd $0x1b, %xmm10, %xmm12 + movss %xmm7, %xmm6 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm6, %xmm12 + addps %xmm12, %xmm1 + + movaps -20 * SIZE(X), %xmm4 + movaps -20 * SIZE(Y), %xmm8 + + movss %xmm8, %xmm11 + pshufd $0x1b, %xmm11, %xmm12 + movss %xmm4, %xmm7 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + mulps %xmm7, %xmm12 + addps %xmm12, %xmm3 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L76: + testq $4, N + jle .L77 + + movaps -32 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm9 + movaps -28 * SIZE(X), %xmm6 + movaps -28 * SIZE(Y), %xmm10 + + movss %xmm9, %xmm8 + pshufd $0x1b, %xmm8, %xmm12 + movss %xmm5, %xmm4 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0x1b, %xmm9, %xmm12 + movss %xmm6, %xmm5 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + + movaps %xmm6, %xmm4 + movaps %xmm10, %xmm8 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L77: + testq $2, N + jle .L78 + + movaps -32 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0x1b, %xmm8, %xmm12 + movss %xmm5, %xmm4 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + movaps %xmm5, %xmm4 + movaps %xmm9, %xmm8 + ALIGN_3 + +.L78: + testq $1, N + jle .L79 + + xorps %xmm5, %xmm5 + movss %xmm5, %xmm4 + movss %xmm5, %xmm8 + + shufps $0x24, %xmm4, %xmm4 + pshufd $0x18, %xmm8, %xmm12 + shufps $0x24, %xmm8, %xmm8 + + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + ALIGN_3 + +.L79: + shufps $0x39, %xmm0, %xmm0 + shufps $0x39, %xmm1, %xmm1 + shufps $0x39, %xmm2, %xmm2 + shufps $0x39, %xmm3, %xmm3 + jmp .L98 + ALIGN_3 + +.L80: + movsd -33 * SIZE(X), %xmm4 + movhps -31 * SIZE(X), %xmm4 + addq $3 * SIZE, X + movaps -33 * SIZE(Y), %xmm8 + addq $3 * SIZE, Y + + movq N, %rax + sarq $4, %rax + jle .L85 + + movsd -32 * SIZE(X), %xmm5 + movhps -30 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm9 + + movsd -28 * SIZE(X), %xmm6 + movhps -26 * SIZE(X), %xmm6 + movaps -28 * SIZE(Y), %xmm10 + + movsd -24 * SIZE(X), %xmm7 + movhps -22 * SIZE(X), %xmm7 + movaps -24 * SIZE(Y), %xmm11 + + decq %rax + jle .L82 + ALIGN_3 + +.L81: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm9, %xmm8 + pshufd $0x1b, %xmm8, %xmm12 + movss %xmm5, %xmm4 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movaps -20 * SIZE(Y), %xmm8 + mulps %xmm4, %xmm12 + movsd -20 * SIZE(X), %xmm4 + movhps -18 * SIZE(X), %xmm4 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0x1b, %xmm9, %xmm12 + movss %xmm6, %xmm5 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movaps -16 * SIZE(Y), %xmm9 + mulps %xmm5, %xmm12 + movsd -16 * SIZE(X), %xmm5 + movhps -14 * SIZE(X), %xmm5 + addps %xmm12, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm11, %xmm10 + pshufd $0x1b, %xmm10, %xmm12 + movss %xmm7, %xmm6 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movaps -12 * SIZE(Y), %xmm10 + mulps %xmm6, %xmm12 + movsd -12 * SIZE(X), %xmm6 + movhps -10 * SIZE(X), %xmm6 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0x1b, %xmm11, %xmm12 + movss %xmm4, %xmm7 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movaps -8 * SIZE(Y), %xmm11 + mulps %xmm7, %xmm12 + movsd -8 * SIZE(X), %xmm7 + movhps -6 * SIZE(X), %xmm7 + addps %xmm12, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm9, %xmm8 + pshufd $0x1b, %xmm8, %xmm12 + movss %xmm5, %xmm4 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movaps -4 * SIZE(Y), %xmm8 + mulps %xmm4, %xmm12 + movsd -4 * SIZE(X), %xmm4 + movhps -2 * SIZE(X), %xmm4 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0x1b, %xmm9, %xmm12 + movss %xmm6, %xmm5 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movaps 0 * SIZE(Y), %xmm9 + mulps %xmm5, %xmm12 + movsd 0 * SIZE(X), %xmm5 + movhps 2 * SIZE(X), %xmm5 + addps %xmm12, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm11, %xmm10 + pshufd $0x1b, %xmm10, %xmm12 + movss %xmm7, %xmm6 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movaps 4 * SIZE(Y), %xmm10 + mulps %xmm6, %xmm12 + movsd 4 * SIZE(X), %xmm6 + movhps 6 * SIZE(X), %xmm6 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0x1b, %xmm11, %xmm12 + movss %xmm4, %xmm7 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movaps 8 * SIZE(Y), %xmm11 + mulps %xmm7, %xmm12 + movsd 8 * SIZE(X), %xmm7 + movhps 10 * SIZE(X), %xmm7 + addps %xmm12, %xmm3 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L81 + ALIGN_3 + +.L82: + movss %xmm9, %xmm8 + pshufd $0x1b, %xmm8, %xmm12 + movss %xmm5, %xmm4 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movaps -20 * SIZE(Y), %xmm8 + mulps %xmm4, %xmm12 + movsd -20 * SIZE(X), %xmm4 + movhps -18 * SIZE(X), %xmm4 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0x1b, %xmm9, %xmm12 + movss %xmm6, %xmm5 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movaps -16 * SIZE(Y), %xmm9 + mulps %xmm5, %xmm12 + movsd -16 * SIZE(X), %xmm5 + movhps -14 * SIZE(X), %xmm5 + addps %xmm12, %xmm3 + + movss %xmm11, %xmm10 + pshufd $0x1b, %xmm10, %xmm12 + movss %xmm7, %xmm6 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movaps -12 * SIZE(Y), %xmm10 + mulps %xmm6, %xmm12 + movsd -12 * SIZE(X), %xmm6 + movhps -10 * SIZE(X), %xmm6 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0x1b, %xmm11, %xmm12 + movss %xmm4, %xmm7 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movaps -8 * SIZE(Y), %xmm11 + mulps %xmm7, %xmm12 + movsd -8 * SIZE(X), %xmm7 + movhps -6 * SIZE(X), %xmm7 + addps %xmm12, %xmm3 + + movss %xmm9, %xmm8 + pshufd $0x1b, %xmm8, %xmm12 + movss %xmm5, %xmm4 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movaps -4 * SIZE(Y), %xmm8 + mulps %xmm4, %xmm12 + movsd -4 * SIZE(X), %xmm4 + movhps -2 * SIZE(X), %xmm4 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0x1b, %xmm9, %xmm12 + movss %xmm6, %xmm5 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + + movss %xmm11, %xmm10 + pshufd $0x1b, %xmm10, %xmm12 + movss %xmm7, %xmm6 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm6, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0x1b, %xmm11, %xmm12 + movss %xmm4, %xmm7 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + mulps %xmm7, %xmm12 + addps %xmm12, %xmm3 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L85: + testq $8, N + jle .L86 + + movsd -32 * SIZE(X), %xmm5 + movhps -30 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0x1b, %xmm8, %xmm12 + movss %xmm5, %xmm4 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + movsd -28 * SIZE(X), %xmm6 + movhps -26 * SIZE(X), %xmm6 + movaps -28 * SIZE(Y), %xmm10 + + movss %xmm10, %xmm9 + pshufd $0x1b, %xmm9, %xmm12 + movss %xmm6, %xmm5 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + + movsd -24 * SIZE(X), %xmm7 + movhps -22 * SIZE(X), %xmm7 + movaps -24 * SIZE(Y), %xmm11 + + movss %xmm11, %xmm10 + pshufd $0x1b, %xmm10, %xmm12 + movss %xmm7, %xmm6 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm6, %xmm12 + addps %xmm12, %xmm1 + + movsd -20 * SIZE(X), %xmm4 + movhps -18 * SIZE(X), %xmm4 + movaps -20 * SIZE(Y), %xmm8 + + movss %xmm8, %xmm11 + pshufd $0x1b, %xmm11, %xmm12 + movss %xmm4, %xmm7 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + mulps %xmm7, %xmm12 + addps %xmm12, %xmm3 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L86: + testq $4, N + jle .L87 + + movsd -32 * SIZE(X), %xmm5 + movhps -30 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0x1b, %xmm8, %xmm12 + movss %xmm5, %xmm4 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + movsd -28 * SIZE(X), %xmm6 + movhps -26 * SIZE(X), %xmm6 + movaps -28 * SIZE(Y), %xmm10 + + movss %xmm10, %xmm9 + pshufd $0x1b, %xmm9, %xmm12 + movss %xmm6, %xmm5 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + + movaps %xmm6, %xmm4 + movaps %xmm10, %xmm8 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L87: + testq $2, N + jle .L88 + + movsd -32 * SIZE(X), %xmm5 + movhps -30 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0x1b, %xmm8, %xmm12 + movss %xmm5, %xmm4 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + movaps %xmm5, %xmm4 + movaps %xmm9, %xmm8 + ALIGN_3 + +.L88: + testq $1, N + jle .L89 + + xorps %xmm5, %xmm5 + movss %xmm5, %xmm4 + movss %xmm5, %xmm8 + + shufps $0x24, %xmm4, %xmm4 + pshufd $0x18, %xmm8, %xmm12 + shufps $0x24, %xmm8, %xmm8 + + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + ALIGN_3 + +.L89: + shufps $0x39, %xmm0, %xmm0 + shufps $0x39, %xmm1, %xmm1 + shufps $0x39, %xmm2, %xmm2 + shufps $0x39, %xmm3, %xmm3 + jmp .L98 + ALIGN_3 + +.L200: + movq N, %rax + sarq $4, %rax + jle .L205 + + movsd (X), %xmm4 + addq INCX, X + movhps (X), %xmm4 + addq INCX, X + movsd (Y), %xmm8 + addq INCY, Y + movhps (Y), %xmm8 + addq INCY, Y + + movsd (X), %xmm5 + addq INCX, X + movhps (X), %xmm5 + addq INCX, X + movsd (Y), %xmm9 + addq INCY, Y + movhps (Y), %xmm9 + addq INCY, Y + + movsd (X), %xmm6 + addq INCX, X + movhps (X), %xmm6 + addq INCX, X + movsd (Y), %xmm10 + addq INCY, Y + movhps (Y), %xmm10 + addq INCY, Y + + movsd (X), %xmm7 + addq INCX, X + movhps (X), %xmm7 + addq INCX, X + movsd (Y), %xmm11 + addq INCY, Y + movhps (Y), %xmm11 + addq INCY, Y + + decq %rax + jle .L204 + ALIGN_3 + +.L203: + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movsd (Y), %xmm8 + addq INCY, Y + movhps (Y), %xmm8 + addq INCY, Y + mulps %xmm4, %xmm12 + movsd (X), %xmm4 + addq INCX, X + movhps (X), %xmm4 + addq INCX, X + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movsd (Y), %xmm9 + addq INCY, Y + movhps (Y), %xmm9 + addq INCY, Y + mulps %xmm5, %xmm12 + movsd (X), %xmm5 + addq INCX, X + movhps (X), %xmm5 + addq INCX, X + addps %xmm12, %xmm3 + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movsd (Y), %xmm10 + addq INCY, Y + movhps (Y), %xmm10 + addq INCY, Y + mulps %xmm6, %xmm12 + movsd (X), %xmm6 + addq INCX, X + movhps (X), %xmm6 + addq INCX, X + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movsd (Y), %xmm11 + addq INCY, Y + movhps (Y), %xmm11 + addq INCY, Y + mulps %xmm7, %xmm12 + movsd (X), %xmm7 + addq INCX, X + movhps (X), %xmm7 + addq INCX, X + addps %xmm12, %xmm3 + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movsd (Y), %xmm8 + addq INCY, Y + movhps (Y), %xmm8 + addq INCY, Y + mulps %xmm4, %xmm12 + movsd (X), %xmm4 + addq INCX, X + movhps (X), %xmm4 + addq INCX, X + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movsd (Y), %xmm9 + addq INCY, Y + movhps (Y), %xmm9 + addq INCY, Y + mulps %xmm5, %xmm12 + movsd (X), %xmm5 + addq INCX, X + movhps (X), %xmm5 + addq INCX, X + addps %xmm12, %xmm3 + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movsd (Y), %xmm10 + addq INCY, Y + movhps (Y), %xmm10 + addq INCY, Y + mulps %xmm6, %xmm12 + movsd (X), %xmm6 + addq INCX, X + movhps (X), %xmm6 + addq INCX, X + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movsd (Y), %xmm11 + addq INCY, Y + movhps (Y), %xmm11 + addq INCY, Y + + mulps %xmm7, %xmm12 + movsd (X), %xmm7 + addq INCX, X + movhps (X), %xmm7 + addq INCX, X + addps %xmm12, %xmm3 + + decq %rax + jg .L203 + ALIGN_3 + +.L204: + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movsd (Y), %xmm8 + addq INCY, Y + movhps (Y), %xmm8 + addq INCY, Y + mulps %xmm4, %xmm12 + movsd (X), %xmm4 + addq INCX, X + movhps (X), %xmm4 + addq INCX, X + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movsd (Y), %xmm9 + addq INCY, Y + movhps (Y), %xmm9 + addq INCY, Y + mulps %xmm5, %xmm12 + movsd (X), %xmm5 + addq INCX, X + movhps (X), %xmm5 + addq INCX, X + addps %xmm12, %xmm3 + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movsd (Y), %xmm10 + addq INCY, Y + movhps (Y), %xmm10 + addq INCY, Y + mulps %xmm6, %xmm12 + movsd (X), %xmm6 + addq INCX, X + movhps (X), %xmm6 + addq INCX, X + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movsd (Y), %xmm11 + addq INCY, Y + movhps (Y), %xmm11 + addq INCY, Y + mulps %xmm7, %xmm12 + movsd (X), %xmm7 + addq INCX, X + movhps (X), %xmm7 + addq INCX, X + addps %xmm12, %xmm3 + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm6, %xmm12 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + mulps %xmm7, %xmm12 + addps %xmm12, %xmm3 + ALIGN_3 + +.L205: + testq $8, N + jle .L206 + + movsd (X), %xmm4 + addq INCX, X + movhps (X), %xmm4 + addq INCX, X + movsd (Y), %xmm8 + addq INCY, Y + movhps (Y), %xmm8 + addq INCY, Y + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + movsd (X), %xmm5 + addq INCX, X + movhps (X), %xmm5 + addq INCX, X + movsd (Y), %xmm9 + addq INCY, Y + movhps (Y), %xmm9 + addq INCY, Y + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + + movsd (X), %xmm6 + addq INCX, X + movhps (X), %xmm6 + addq INCX, X + movsd (Y), %xmm10 + addq INCY, Y + movhps (Y), %xmm10 + addq INCY, Y + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm6, %xmm12 + addps %xmm12, %xmm1 + + movsd (X), %xmm7 + addq INCX, X + movhps (X), %xmm7 + addq INCX, X + movsd (Y), %xmm11 + addq INCY, Y + movhps (Y), %xmm11 + addq INCY, Y + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + mulps %xmm7, %xmm12 + addps %xmm12, %xmm3 + ALIGN_3 + +.L206: + testq $4, N + jle .L207 + + movsd (X), %xmm4 + addq INCX, X + movhps (X), %xmm4 + addq INCX, X + movsd (Y), %xmm8 + addq INCY, Y + movhps (Y), %xmm8 + addq INCY, Y + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + movsd (X), %xmm5 + addq INCX, X + movhps (X), %xmm5 + addq INCX, X + movsd (Y), %xmm9 + addq INCY, Y + movhps (Y), %xmm9 + addq INCY, Y + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + ALIGN_3 + +.L207: + testq $2, N + jle .L208 + + movsd (X), %xmm4 + addq INCX, X + movhps (X), %xmm4 + addq INCX, X + movsd (Y), %xmm8 + addq INCY, Y + movhps (Y), %xmm8 + addq INCY, Y + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + ALIGN_3 + +.L208: + testq $1, N + jle .L98 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd (X), %xmm4 +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd (Y), %xmm8 + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + ALIGN_3 + +.L98: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + movhlps %xmm0, %xmm2 + movhlps %xmm1, %xmm3 + + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + pshufd $1, %xmm0, %xmm2 + pshufd $1, %xmm1, %xmm3 + ALIGN_3 + +.L999: +#ifndef CONJ + subss %xmm2, %xmm0 + addss %xmm3, %xmm1 +#else + addss %xmm2, %xmm0 + subss %xmm3, %xmm1 +#endif + unpcklps %xmm1, %xmm0 + + RESTOREREGISTERS + + ret + ALIGN_3 + + EPILOGUE diff --git a/kernel/x86_64/zdot_sse2.S b/kernel/x86_64/zdot_sse2.S new file mode 100644 index 0000000..77fa8e3 --- /dev/null +++ b/kernel/x86_64/zdot_sse2.S @@ -0,0 +1,1550 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#include "l1param.h" + +#undef movsd + +#ifndef OPTERON +#define MOVLPS movsd +#else +#define MOVLPS movlps +#endif + + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY +#endif + + SAVEREGISTERS + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + cmpq $0, N + jle .L999 + + cmpq $2 * SIZE, INCX + jne .L50 + cmpq $2 * SIZE, INCY + jne .L50 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + testq $SIZE, Y + jne .L30 + + testq $SIZE, X + jne .L20 + + movq N, %rax + sarq $3, %rax + jle .L15 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -16 * SIZE(Y), %xmm8 + movaps -14 * SIZE(Y), %xmm9 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + movaps -12 * SIZE(Y), %xmm10 + movaps -10 * SIZE(Y), %xmm11 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movaps -8 * SIZE(Y), %xmm8 + mulpd %xmm4, %xmm12 + movaps -8 * SIZE(X), %xmm4 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + movaps -6 * SIZE(Y), %xmm9 + mulpd %xmm5, %xmm12 + movaps -6 * SIZE(X), %xmm5 + addpd %xmm12, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + movaps -4 * SIZE(Y), %xmm10 + mulpd %xmm6, %xmm12 + movaps -4 * SIZE(X), %xmm6 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + movaps -2 * SIZE(Y), %xmm11 + mulpd %xmm7, %xmm12 + movaps -2 * SIZE(X), %xmm7 + addpd %xmm12, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movaps 0 * SIZE(Y), %xmm8 + mulpd %xmm4, %xmm12 + movaps 0 * SIZE(X), %xmm4 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + movaps 2 * SIZE(Y), %xmm9 + mulpd %xmm5, %xmm12 + movaps 2 * SIZE(X), %xmm5 + addpd %xmm12, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + movaps 4 * SIZE(Y), %xmm10 + mulpd %xmm6, %xmm12 + movaps 4 * SIZE(X), %xmm6 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + movaps 6 * SIZE(Y), %xmm11 + mulpd %xmm7, %xmm12 + movaps 6 * SIZE(X), %xmm7 + addpd %xmm12, %xmm3 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + decq %rax + jg .L11 + ALIGN_3 + +.L12: + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movaps -8 * SIZE(Y), %xmm8 + mulpd %xmm4, %xmm12 + movaps -8 * SIZE(X), %xmm4 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + movaps -6 * SIZE(Y), %xmm9 + mulpd %xmm5, %xmm12 + movaps -6 * SIZE(X), %xmm5 + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + movaps -4 * SIZE(Y), %xmm10 + mulpd %xmm6, %xmm12 + movaps -4 * SIZE(X), %xmm6 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + movaps -2 * SIZE(Y), %xmm11 + mulpd %xmm7, %xmm12 + movaps -2 * SIZE(X), %xmm7 + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + mulpd %xmm6, %xmm12 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + mulpd %xmm7, %xmm12 + addpd %xmm12, %xmm3 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + +.L15: + testq $4, N + jle .L16 + + movaps -16 * SIZE(X), %xmm4 + movaps -16 * SIZE(Y), %xmm8 + movaps -14 * SIZE(X), %xmm5 + movaps -14 * SIZE(Y), %xmm9 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm3 + + movaps -12 * SIZE(X), %xmm6 + movaps -12 * SIZE(Y), %xmm10 + movaps -10 * SIZE(X), %xmm7 + movaps -10 * SIZE(Y), %xmm11 + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + mulpd %xmm6, %xmm12 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + mulpd %xmm7, %xmm12 + addpd %xmm12, %xmm3 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L16: + testq $2, N + jle .L17 + + movaps -16 * SIZE(X), %xmm4 + movaps -16 * SIZE(Y), %xmm8 + movaps -14 * SIZE(X), %xmm5 + movaps -14 * SIZE(Y), %xmm9 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm3 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L17: + testq $1, N + jle .L98 + + movaps -16 * SIZE(X), %xmm4 + movaps -16 * SIZE(Y), %xmm8 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + jmp .L98 + ALIGN_3 + +.L20: + movq N, %rax + sarq $3, %rax + jle .L25 + + MOVLPS -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + MOVLPS -14 * SIZE(X), %xmm5 + movhps -13 * SIZE(X), %xmm5 + movaps -16 * SIZE(Y), %xmm8 + movaps -14 * SIZE(Y), %xmm9 + MOVLPS -12 * SIZE(X), %xmm6 + movhps -11 * SIZE(X), %xmm6 + MOVLPS -10 * SIZE(X), %xmm7 + movhps -9 * SIZE(X), %xmm7 + movaps -12 * SIZE(Y), %xmm10 + movaps -10 * SIZE(Y), %xmm11 + + decq %rax + jle .L22 + ALIGN_3 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movaps -8 * SIZE(Y), %xmm8 + mulpd %xmm4, %xmm12 + MOVLPS -8 * SIZE(X), %xmm4 + movhps -7 * SIZE(X), %xmm4 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + movaps -6 * SIZE(Y), %xmm9 + mulpd %xmm5, %xmm12 + MOVLPS -6 * SIZE(X), %xmm5 + movhps -5 * SIZE(X), %xmm5 + addpd %xmm12, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + movaps -4 * SIZE(Y), %xmm10 + mulpd %xmm6, %xmm12 + MOVLPS -4 * SIZE(X), %xmm6 + movhps -3 * SIZE(X), %xmm6 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + movaps -2 * SIZE(Y), %xmm11 + mulpd %xmm7, %xmm12 + MOVLPS -2 * SIZE(X), %xmm7 + movhps -1 * SIZE(X), %xmm7 + addpd %xmm12, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movaps 0 * SIZE(Y), %xmm8 + mulpd %xmm4, %xmm12 + MOVLPS 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + movaps 2 * SIZE(Y), %xmm9 + mulpd %xmm5, %xmm12 + MOVLPS 2 * SIZE(X), %xmm5 + movhps 3 * SIZE(X), %xmm5 + addpd %xmm12, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + movaps 4 * SIZE(Y), %xmm10 + mulpd %xmm6, %xmm12 + MOVLPS 4 * SIZE(X), %xmm6 + movhps 5 * SIZE(X), %xmm6 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + movaps 6 * SIZE(Y), %xmm11 + mulpd %xmm7, %xmm12 + MOVLPS 6 * SIZE(X), %xmm7 + movhps 7 * SIZE(X), %xmm7 + addpd %xmm12, %xmm3 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + decq %rax + jg .L21 + ALIGN_3 + +.L22: + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movaps -8 * SIZE(Y), %xmm8 + mulpd %xmm4, %xmm12 + MOVLPS -8 * SIZE(X), %xmm4 + movhps -7 * SIZE(X), %xmm4 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + movaps -6 * SIZE(Y), %xmm9 + mulpd %xmm5, %xmm12 + MOVLPS -6 * SIZE(X), %xmm5 + movhps -5 * SIZE(X), %xmm5 + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + movaps -4 * SIZE(Y), %xmm10 + mulpd %xmm6, %xmm12 + MOVLPS -4 * SIZE(X), %xmm6 + movhps -3 * SIZE(X), %xmm6 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + movaps -2 * SIZE(Y), %xmm11 + mulpd %xmm7, %xmm12 + MOVLPS -2 * SIZE(X), %xmm7 + movhps -1 * SIZE(X), %xmm7 + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + mulpd %xmm6, %xmm12 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + mulpd %xmm7, %xmm12 + addpd %xmm12, %xmm3 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + +.L25: + testq $4, N + jle .L26 + + MOVLPS -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + movaps -16 * SIZE(Y), %xmm8 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + + MOVLPS -14 * SIZE(X), %xmm5 + movhps -13 * SIZE(X), %xmm5 + movaps -14 * SIZE(Y), %xmm9 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm3 + + MOVLPS -12 * SIZE(X), %xmm6 + movhps -11 * SIZE(X), %xmm6 + movaps -12 * SIZE(Y), %xmm10 + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + mulpd %xmm6, %xmm12 + addpd %xmm12, %xmm1 + + MOVLPS -10 * SIZE(X), %xmm7 + movhps -9 * SIZE(X), %xmm7 + movaps -10 * SIZE(Y), %xmm11 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + mulpd %xmm7, %xmm12 + addpd %xmm12, %xmm3 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L26: + testq $2, N + jle .L27 + + MOVLPS -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + movaps -16 * SIZE(Y), %xmm8 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + + MOVLPS -14 * SIZE(X), %xmm5 + movhps -13 * SIZE(X), %xmm5 + movaps -14 * SIZE(Y), %xmm9 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm3 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L27: + testq $1, N + jle .L98 + + MOVLPS -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + movaps -16 * SIZE(Y), %xmm8 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + jmp .L98 + ALIGN_3 + +.L30: + testq $SIZE, X + jne .L40 + + movq N, %rax + sarq $3, %rax + jle .L35 + + MOVLPS -16 * SIZE(Y), %xmm4 + movhps -15 * SIZE(Y), %xmm4 + MOVLPS -14 * SIZE(Y), %xmm5 + movhps -13 * SIZE(Y), %xmm5 + movaps -16 * SIZE(X), %xmm8 + movaps -14 * SIZE(X), %xmm9 + MOVLPS -12 * SIZE(Y), %xmm6 + movhps -11 * SIZE(Y), %xmm6 + MOVLPS -10 * SIZE(Y), %xmm7 + movhps -9 * SIZE(Y), %xmm7 + movaps -12 * SIZE(X), %xmm10 + movaps -10 * SIZE(X), %xmm11 + + decq %rax + jle .L32 + ALIGN_3 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movaps -8 * SIZE(X), %xmm8 + mulpd %xmm4, %xmm12 + MOVLPS -8 * SIZE(Y), %xmm4 + movhps -7 * SIZE(Y), %xmm4 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + movaps -6 * SIZE(X), %xmm9 + mulpd %xmm5, %xmm12 + MOVLPS -6 * SIZE(Y), %xmm5 + movhps -5 * SIZE(Y), %xmm5 + addpd %xmm12, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + movaps -4 * SIZE(X), %xmm10 + mulpd %xmm6, %xmm12 + MOVLPS -4 * SIZE(Y), %xmm6 + movhps -3 * SIZE(Y), %xmm6 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + movaps -2 * SIZE(X), %xmm11 + mulpd %xmm7, %xmm12 + MOVLPS -2 * SIZE(Y), %xmm7 + movhps -1 * SIZE(Y), %xmm7 + addpd %xmm12, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movaps 0 * SIZE(X), %xmm8 + mulpd %xmm4, %xmm12 + MOVLPS 0 * SIZE(Y), %xmm4 + movhps 1 * SIZE(Y), %xmm4 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + movaps 2 * SIZE(X), %xmm9 + mulpd %xmm5, %xmm12 + MOVLPS 2 * SIZE(Y), %xmm5 + movhps 3 * SIZE(Y), %xmm5 + addpd %xmm12, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + movaps 4 * SIZE(X), %xmm10 + mulpd %xmm6, %xmm12 + MOVLPS 4 * SIZE(Y), %xmm6 + movhps 5 * SIZE(Y), %xmm6 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + movaps 6 * SIZE(X), %xmm11 + mulpd %xmm7, %xmm12 + MOVLPS 6 * SIZE(Y), %xmm7 + movhps 7 * SIZE(Y), %xmm7 + addpd %xmm12, %xmm3 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + decq %rax + jg .L31 + ALIGN_3 + +.L32: + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movaps -8 * SIZE(X), %xmm8 + mulpd %xmm4, %xmm12 + MOVLPS -8 * SIZE(Y), %xmm4 + movhps -7 * SIZE(Y), %xmm4 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + movaps -6 * SIZE(X), %xmm9 + mulpd %xmm5, %xmm12 + MOVLPS -6 * SIZE(Y), %xmm5 + movhps -5 * SIZE(Y), %xmm5 + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + movaps -4 * SIZE(X), %xmm10 + mulpd %xmm6, %xmm12 + MOVLPS -4 * SIZE(Y), %xmm6 + movhps -3 * SIZE(Y), %xmm6 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + movaps -2 * SIZE(X), %xmm11 + mulpd %xmm7, %xmm12 + MOVLPS -2 * SIZE(Y), %xmm7 + movhps -1 * SIZE(Y), %xmm7 + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + mulpd %xmm6, %xmm12 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + mulpd %xmm7, %xmm12 + addpd %xmm12, %xmm3 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + +.L35: + testq $4, N + jle .L36 + + MOVLPS -16 * SIZE(Y), %xmm4 + movhps -15 * SIZE(Y), %xmm4 + movaps -16 * SIZE(X), %xmm8 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + + MOVLPS -14 * SIZE(Y), %xmm5 + movhps -13 * SIZE(Y), %xmm5 + movaps -14 * SIZE(X), %xmm9 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm3 + + MOVLPS -12 * SIZE(Y), %xmm6 + movhps -11 * SIZE(Y), %xmm6 + movaps -12 * SIZE(X), %xmm10 + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + mulpd %xmm6, %xmm12 + addpd %xmm12, %xmm1 + + MOVLPS -10 * SIZE(Y), %xmm7 + movhps -9 * SIZE(Y), %xmm7 + movaps -10 * SIZE(X), %xmm11 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + mulpd %xmm7, %xmm12 + addpd %xmm12, %xmm3 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L36: + testq $2, N + jle .L37 + + MOVLPS -16 * SIZE(Y), %xmm4 + movhps -15 * SIZE(Y), %xmm4 + movaps -16 * SIZE(X), %xmm8 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + + MOVLPS -14 * SIZE(Y), %xmm5 + movhps -13 * SIZE(Y), %xmm5 + movaps -14 * SIZE(X), %xmm9 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm3 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L37: + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + + testq $1, N + jle .L98 + + MOVLPS -16 * SIZE(Y), %xmm4 + movhps -15 * SIZE(Y), %xmm4 + movaps -16 * SIZE(X), %xmm8 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + addpd %xmm12, %xmm1 + jmp .L98 + ALIGN_3 + +.L40: + movhps -16 * SIZE(X), %xmm4 + addq $SIZE, X + movhps -16 * SIZE(Y), %xmm8 + addq $SIZE, Y + + movq N, %rax + sarq $3, %rax + jle .L45 + + movaps -16 * SIZE(X), %xmm5 + movaps -16 * SIZE(Y), %xmm9 + movaps -14 * SIZE(X), %xmm6 + movaps -14 * SIZE(Y), %xmm10 + movaps -12 * SIZE(X), %xmm7 + movaps -12 * SIZE(Y), %xmm11 + decq %rax + jle .L42 + ALIGN_3 + +.L41: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movsd %xmm9, %xmm8 + pshufd $0x4e, %xmm8, %xmm12 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movaps -10 * SIZE(Y), %xmm8 + mulpd %xmm4, %xmm12 + movaps -10 * SIZE(X), %xmm4 + addpd %xmm12, %xmm1 + + movsd %xmm10, %xmm9 + pshufd $0x4e, %xmm9, %xmm12 + movsd %xmm6, %xmm5 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm0 + movaps -8 * SIZE(Y), %xmm9 + mulpd %xmm5, %xmm12 + movaps -8 * SIZE(X), %xmm5 + addpd %xmm12, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd %xmm11, %xmm10 + pshufd $0x4e, %xmm10, %xmm12 + movsd %xmm7, %xmm6 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + movaps -6 * SIZE(Y), %xmm10 + mulpd %xmm6, %xmm12 + movaps -6 * SIZE(X), %xmm6 + addpd %xmm12, %xmm1 + + movsd %xmm8, %xmm11 + pshufd $0x4e, %xmm11, %xmm12 + movsd %xmm4, %xmm7 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm0 + movaps -4 * SIZE(Y), %xmm11 + mulpd %xmm7, %xmm12 + movaps -4 * SIZE(X), %xmm7 + addpd %xmm12, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movsd %xmm9, %xmm8 + pshufd $0x4e, %xmm8, %xmm12 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movaps -2 * SIZE(Y), %xmm8 + mulpd %xmm4, %xmm12 + movaps -2 * SIZE(X), %xmm4 + addpd %xmm12, %xmm1 + + movsd %xmm10, %xmm9 + pshufd $0x4e, %xmm9, %xmm12 + movsd %xmm6, %xmm5 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm0 + movaps 0 * SIZE(Y), %xmm9 + mulpd %xmm5, %xmm12 + movaps 0 * SIZE(X), %xmm5 + addpd %xmm12, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd %xmm11, %xmm10 + pshufd $0x4e, %xmm10, %xmm12 + movsd %xmm7, %xmm6 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + movaps 2 * SIZE(Y), %xmm10 + mulpd %xmm6, %xmm12 + movaps 2 * SIZE(X), %xmm6 + addpd %xmm12, %xmm1 + + movsd %xmm8, %xmm11 + pshufd $0x4e, %xmm11, %xmm12 + movsd %xmm4, %xmm7 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm0 + movaps 4 * SIZE(Y), %xmm11 + mulpd %xmm7, %xmm12 + movaps 4 * SIZE(X), %xmm7 + addpd %xmm12, %xmm1 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + decq %rax + jg .L41 + ALIGN_3 + +.L42: + movsd %xmm9, %xmm8 + pshufd $0x4e, %xmm8, %xmm12 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movaps -10 * SIZE(Y), %xmm8 + mulpd %xmm4, %xmm12 + movaps -10 * SIZE(X), %xmm4 + addpd %xmm12, %xmm1 + + movsd %xmm10, %xmm9 + pshufd $0x4e, %xmm9, %xmm12 + movsd %xmm6, %xmm5 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm0 + movaps -8 * SIZE(Y), %xmm9 + mulpd %xmm5, %xmm12 + movaps -8 * SIZE(X), %xmm5 + addpd %xmm12, %xmm1 + + movsd %xmm11, %xmm10 + pshufd $0x4e, %xmm10, %xmm12 + movsd %xmm7, %xmm6 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + movaps -6 * SIZE(Y), %xmm10 + mulpd %xmm6, %xmm12 + movaps -6 * SIZE(X), %xmm6 + addpd %xmm12, %xmm1 + + movsd %xmm8, %xmm11 + pshufd $0x4e, %xmm11, %xmm12 + movsd %xmm4, %xmm7 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm0 + movaps -4 * SIZE(Y), %xmm11 + mulpd %xmm7, %xmm12 + movaps -4 * SIZE(X), %xmm7 + addpd %xmm12, %xmm1 + + movsd %xmm9, %xmm8 + pshufd $0x4e, %xmm8, %xmm12 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movaps -2 * SIZE(Y), %xmm8 + mulpd %xmm4, %xmm12 + movaps -2 * SIZE(X), %xmm4 + addpd %xmm12, %xmm1 + + movsd %xmm10, %xmm9 + pshufd $0x4e, %xmm9, %xmm12 + movsd %xmm6, %xmm5 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm0 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm1 + + movsd %xmm11, %xmm10 + pshufd $0x4e, %xmm10, %xmm12 + movsd %xmm7, %xmm6 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + mulpd %xmm6, %xmm12 + addpd %xmm12, %xmm1 + + movsd %xmm8, %xmm11 + pshufd $0x4e, %xmm11, %xmm12 + movsd %xmm4, %xmm7 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm0 + mulpd %xmm7, %xmm12 + addpd %xmm12, %xmm1 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + +.L45: + testq $4, N + jle .L46 + + movaps -16 * SIZE(X), %xmm5 + movaps -16 * SIZE(Y), %xmm9 + movaps -14 * SIZE(X), %xmm6 + movaps -14 * SIZE(Y), %xmm10 + + movsd %xmm9, %xmm8 + pshufd $0x4e, %xmm8, %xmm12 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + + movaps -12 * SIZE(X), %xmm7 + movaps -12 * SIZE(Y), %xmm11 + + movsd %xmm10, %xmm9 + pshufd $0x4e, %xmm9, %xmm12 + movsd %xmm6, %xmm5 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm0 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm1 + + movaps -10 * SIZE(X), %xmm4 + movaps -10 * SIZE(Y), %xmm8 + + movsd %xmm11, %xmm10 + pshufd $0x4e, %xmm10, %xmm12 + movsd %xmm7, %xmm6 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + mulpd %xmm6, %xmm12 + addpd %xmm12, %xmm1 + + movsd %xmm8, %xmm11 + pshufd $0x4e, %xmm11, %xmm12 + movsd %xmm4, %xmm7 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm0 + mulpd %xmm7, %xmm12 + addpd %xmm12, %xmm1 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L46: + testq $2, N + jle .L47 + + movaps -16 * SIZE(X), %xmm5 + movaps -16 * SIZE(Y), %xmm9 + + movsd %xmm9, %xmm8 + pshufd $0x4e, %xmm8, %xmm12 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + + movaps -14 * SIZE(X), %xmm6 + movaps -14 * SIZE(Y), %xmm10 + + movsd %xmm10, %xmm9 + pshufd $0x4e, %xmm9, %xmm12 + movsd %xmm6, %xmm5 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm0 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm1 + + movaps %xmm6, %xmm4 + movaps %xmm10, %xmm8 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L47: + testq $1, N + jle .L48 + + movlps -16 * SIZE(X), %xmm4 + movlps -16 * SIZE(Y), %xmm8 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + ALIGN_3 + +.L48: + SHUFPD_1 %xmm0, %xmm0 + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm2, %xmm2 + SHUFPD_1 %xmm3, %xmm3 + jmp .L98 + ALIGN_3 + +.L50: + movq N, %rax + sarq $3, %rax + jle .L55 + + MOVLPS 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addq INCX, X + MOVLPS 0 * SIZE(Y), %xmm8 + movhps 1 * SIZE(Y), %xmm8 + addq INCY, Y + + MOVLPS 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addq INCX, X + MOVLPS 0 * SIZE(Y), %xmm9 + movhps 1 * SIZE(Y), %xmm9 + addq INCY, Y + + MOVLPS 0 * SIZE(X), %xmm6 + movhps 1 * SIZE(X), %xmm6 + addq INCX, X + MOVLPS 0 * SIZE(Y), %xmm10 + movhps 1 * SIZE(Y), %xmm10 + addq INCY, Y + + MOVLPS 0 * SIZE(X), %xmm7 + movhps 1 * SIZE(X), %xmm7 + addq INCX, X + MOVLPS 0 * SIZE(Y), %xmm11 + movhps 1 * SIZE(Y), %xmm11 + addq INCY, Y + + decq %rax + jle .L54 + ALIGN_3 + +.L53: + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + MOVLPS 0 * SIZE(Y), %xmm8 + movhps 1 * SIZE(Y), %xmm8 + addq INCY, Y + mulpd %xmm4, %xmm12 + MOVLPS 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addq INCX, X + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + MOVLPS 0 * SIZE(Y), %xmm9 + movhps 1 * SIZE(Y), %xmm9 + addq INCY, Y + mulpd %xmm5, %xmm12 + MOVLPS 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addq INCX, X + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + MOVLPS 0 * SIZE(Y), %xmm10 + movhps 1 * SIZE(Y), %xmm10 + addq INCY, Y + mulpd %xmm6, %xmm12 + MOVLPS 0 * SIZE(X), %xmm6 + movhps 1 * SIZE(X), %xmm6 + addq INCX, X + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + MOVLPS 0 * SIZE(Y), %xmm11 + movhps 1 * SIZE(Y), %xmm11 + addq INCY, Y + mulpd %xmm7, %xmm12 + MOVLPS 0 * SIZE(X), %xmm7 + movhps 1 * SIZE(X), %xmm7 + addq INCX, X + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + MOVLPS 0 * SIZE(Y), %xmm8 + movhps 1 * SIZE(Y), %xmm8 + addq INCY, Y + + mulpd %xmm4, %xmm12 + MOVLPS 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addq INCX, X + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + MOVLPS 0 * SIZE(Y), %xmm9 + movhps 1 * SIZE(Y), %xmm9 + addq INCY, Y + + mulpd %xmm5, %xmm12 + MOVLPS 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addq INCX, X + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + MOVLPS 0 * SIZE(Y), %xmm10 + movhps 1 * SIZE(Y), %xmm10 + addq INCY, Y + mulpd %xmm6, %xmm12 + MOVLPS 0 * SIZE(X), %xmm6 + movhps 1 * SIZE(X), %xmm6 + addq INCX, X + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + MOVLPS 0 * SIZE(Y), %xmm11 + movhps 1 * SIZE(Y), %xmm11 + addq INCY, Y + mulpd %xmm7, %xmm12 + MOVLPS 0 * SIZE(X), %xmm7 + movhps 1 * SIZE(X), %xmm7 + addq INCX, X + addpd %xmm12, %xmm3 + + decq %rax + jg .L53 + ALIGN_3 + +.L54: + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + MOVLPS 0 * SIZE(Y), %xmm8 + movhps 1 * SIZE(Y), %xmm8 + addq INCY, Y + mulpd %xmm4, %xmm12 + MOVLPS 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addq INCX, X + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + MOVLPS 0 * SIZE(Y), %xmm9 + movhps 1 * SIZE(Y), %xmm9 + addq INCY, Y + mulpd %xmm5, %xmm12 + MOVLPS 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addq INCX, X + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + MOVLPS 0 * SIZE(Y), %xmm10 + movhps 1 * SIZE(Y), %xmm10 + addq INCY, Y + mulpd %xmm6, %xmm12 + MOVLPS 0 * SIZE(X), %xmm6 + movhps 1 * SIZE(X), %xmm6 + addq INCX, X + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + MOVLPS 0 * SIZE(Y), %xmm11 + movhps 1 * SIZE(Y), %xmm11 + addq INCY, Y + mulpd %xmm7, %xmm12 + MOVLPS 0 * SIZE(X), %xmm7 + movhps 1 * SIZE(X), %xmm7 + addq INCX, X + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + mulpd %xmm6, %xmm12 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + mulpd %xmm7, %xmm12 + addpd %xmm12, %xmm3 + ALIGN_3 + +.L55: + testq $4, N + jle .L56 + + MOVLPS 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addq INCX, X + MOVLPS 0 * SIZE(Y), %xmm8 + movhps 1 * SIZE(Y), %xmm8 + addq INCY, Y + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + + MOVLPS 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addq INCX, X + MOVLPS 0 * SIZE(Y), %xmm9 + movhps 1 * SIZE(Y), %xmm9 + addq INCY, Y + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm3 + + MOVLPS 0 * SIZE(X), %xmm6 + movhps 1 * SIZE(X), %xmm6 + addq INCX, X + MOVLPS 0 * SIZE(Y), %xmm10 + movhps 1 * SIZE(Y), %xmm10 + addq INCY, Y + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + mulpd %xmm6, %xmm12 + addpd %xmm12, %xmm1 + + MOVLPS 0 * SIZE(X), %xmm7 + movhps 1 * SIZE(X), %xmm7 + addq INCX, X + MOVLPS 0 * SIZE(Y), %xmm11 + movhps 1 * SIZE(Y), %xmm11 + addq INCY, Y + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + mulpd %xmm7, %xmm12 + addpd %xmm12, %xmm3 + ALIGN_3 + +.L56: + testq $2, N + jle .L57 + + MOVLPS 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addq INCX, X + MOVLPS 0 * SIZE(Y), %xmm8 + movhps 1 * SIZE(Y), %xmm8 + addq INCY, Y + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + + MOVLPS 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addq INCX, X + MOVLPS 0 * SIZE(Y), %xmm9 + movhps 1 * SIZE(Y), %xmm9 + addq INCY, Y + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm3 + ALIGN_3 + +.L57: + testq $1, N + jle .L98 + + MOVLPS 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + MOVLPS 0 * SIZE(Y), %xmm8 + movhps 1 * SIZE(Y), %xmm8 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + ALIGN_3 + +.L98: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm0, %xmm2 + pshufd $0x4e, %xmm1, %xmm3 + +.L999: +#ifndef CONJ + subsd %xmm2, %xmm0 + addsd %xmm3, %xmm1 +#else + addsd %xmm2, %xmm0 + subsd %xmm3, %xmm1 +#endif + + RESTOREREGISTERS + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm3m_kernel_2x8_nehalem.S b/kernel/x86_64/zgemm3m_kernel_2x8_nehalem.S new file mode 100644 index 0000000..97eb1ec --- /dev/null +++ b/kernel/x86_64/zgemm3m_kernel_2x8_nehalem.S @@ -0,0 +1,1933 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#define BX %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define J 64(%rsp) +#define OFFSET 72(%rsp) +#define KK 80(%rsp) +#define KKK 88(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define ALPHA_R 224(%rsp) +#define ALPHA_I 232(%rsp) +#define J 240(%rsp) +#define OFFSET 248(%rsp) +#define KK 256(%rsp) +#define KKK 264(%rsp) + +#endif + +#define PREFETCHSIZE (8 * 1 - 4) +#define PREFETCH prefetcht0 + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif +#endif + + movlps %xmm0, ALPHA_R + movlps %xmm1, ALPHA_I + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + salq $ZBASE_SHIFT, LDC + +#ifdef TRMMKERNEL + movq %r11, OFFSET +#ifndef LEFT + negq %r11 +#endif + movq %r11, KK +#endif + + movq N, J + sarq $3, J + NOBRANCH + jle .L30 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 4), CO2 + movq A, AO + + movq K, %rax + salq $BASE_SHIFT + 3, %rax + leaq (B, %rax), BB + + movq M, I + sarq $1, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + prefetcht0 -16 * SIZE(BB) + subq $-8 * SIZE, BB + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + leaq (LDC, LDC, 2), %rax + + xorps %xmm8, %xmm8 + prefetcht0 3 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 7 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht0 3 * SIZE(CO1, LDC, 2) + xorps %xmm11, %xmm11 + prefetcht0 7 * SIZE(CO1, %rax, 1) + + xorps %xmm12, %xmm12 + prefetcht0 3 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht0 7 * SIZE(CO2, LDC, 1) + xorps %xmm14, %xmm14 + prefetcht0 3 * SIZE(CO2, LDC, 2) + xorps %xmm15, %xmm15 + prefetcht0 7 * SIZE(CO2, %rax, 1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $8, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm8 + movaps -12 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + movaps -14 * SIZE(AO), %xmm5 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm14 + movaps -6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm12 + movaps 0 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps 2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm8 + movaps 4 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps 6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm12 + movaps 8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + movaps -10 * SIZE(AO), %xmm5 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm14 + movaps 10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm8 + movaps 12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm10 + movaps 14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addq $32 * SIZE, BO + subq $-8 * SIZE, AO + decq %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addpd %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L18: + addpd %xmm1, %xmm12 + addpd %xmm2, %xmm13 + addpd %xmm3, %xmm14 + addpd %xmm4, %xmm15 + + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + shufpd $2, %xmm0, %xmm9 + movaps %xmm10, %xmm0 + shufpd $2, %xmm11, %xmm10 + shufpd $2, %xmm0, %xmm11 + + movaps %xmm12, %xmm0 + shufpd $2, %xmm13, %xmm12 + shufpd $2, %xmm0, %xmm13 + movaps %xmm14, %xmm0 + shufpd $2, %xmm15, %xmm14 + shufpd $2, %xmm0, %xmm15 + + leaq (LDC, LDC, 2), %rax + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhps 3 * SIZE(CO1), %xmm1 + + movsd 0 * SIZE(CO1, LDC), %xmm2 + movhps 1 * SIZE(CO1, LDC), %xmm2 + movsd 2 * SIZE(CO1, LDC), %xmm3 + movhps 3 * SIZE(CO1, LDC), %xmm3 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movddup %xmm9, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm9, %xmm9 + mulpd %xmm7, %xmm9 + addpd %xmm9, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 2 * SIZE(CO1) + movhps %xmm1, 3 * SIZE(CO1) + + movlps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm2, 1 * SIZE(CO1, LDC) + movlps %xmm3, 2 * SIZE(CO1, LDC) + movhps %xmm3, 3 * SIZE(CO1, LDC) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhps 1 * SIZE(CO1, LDC, 2), %xmm0 + movsd 2 * SIZE(CO1, LDC, 2), %xmm1 + movhps 3 * SIZE(CO1, LDC, 2), %xmm1 + + movsd 0 * SIZE(CO1, %rax), %xmm2 + movhps 1 * SIZE(CO1, %rax), %xmm2 + movsd 2 * SIZE(CO1, %rax), %xmm3 + movhps 3 * SIZE(CO1, %rax), %xmm3 + + movddup %xmm10, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm10, %xmm10 + mulpd %xmm7, %xmm10 + addpd %xmm10, %xmm1 + + movddup %xmm11, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm11, %xmm11 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 1 * SIZE(CO1, LDC, 2) + movlps %xmm1, 2 * SIZE(CO1, LDC, 2) + movhps %xmm1, 3 * SIZE(CO1, LDC, 2) + + movlps %xmm2, 0 * SIZE(CO1, %rax) + movhps %xmm2, 1 * SIZE(CO1, %rax) + movlps %xmm3, 2 * SIZE(CO1, %rax) + movhps %xmm3, 3 * SIZE(CO1, %rax) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhps 3 * SIZE(CO2), %xmm1 + + movsd 0 * SIZE(CO2, LDC), %xmm2 + movhps 1 * SIZE(CO2, LDC), %xmm2 + movsd 2 * SIZE(CO2, LDC), %xmm3 + movhps 3 * SIZE(CO2, LDC), %xmm3 + + movddup %xmm12, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm12, %xmm12 + mulpd %xmm7, %xmm12 + addpd %xmm12, %xmm1 + + movddup %xmm13, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm13, %xmm13 + mulpd %xmm7, %xmm13 + addpd %xmm13, %xmm3 + + movlps %xmm0, 0 * SIZE(CO2) + movhps %xmm0, 1 * SIZE(CO2) + movlps %xmm1, 2 * SIZE(CO2) + movhps %xmm1, 3 * SIZE(CO2) + + movlps %xmm2, 0 * SIZE(CO2, LDC) + movhps %xmm2, 1 * SIZE(CO2, LDC) + movlps %xmm3, 2 * SIZE(CO2, LDC) + movhps %xmm3, 3 * SIZE(CO2, LDC) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhps 1 * SIZE(CO2, LDC, 2), %xmm0 + movsd 2 * SIZE(CO2, LDC, 2), %xmm1 + movhps 3 * SIZE(CO2, LDC, 2), %xmm1 + + movsd 0 * SIZE(CO2, %rax), %xmm2 + movhps 1 * SIZE(CO2, %rax), %xmm2 + movsd 2 * SIZE(CO2, %rax), %xmm3 + movhps 3 * SIZE(CO2, %rax), %xmm3 + + movddup %xmm14, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm14, %xmm14 + mulpd %xmm7, %xmm14 + addpd %xmm14, %xmm1 + + movddup %xmm15, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm15, %xmm15 + mulpd %xmm7, %xmm15 + addpd %xmm15, %xmm3 + + movlps %xmm0, 0 * SIZE(CO2, LDC, 2) + movhps %xmm0, 1 * SIZE(CO2, LDC, 2) + movlps %xmm1, 2 * SIZE(CO2, LDC, 2) + movhps %xmm1, 3 * SIZE(CO2, LDC, 2) + + movlps %xmm2, 0 * SIZE(CO2, %rax) + movhps %xmm2, 1 * SIZE(CO2, %rax) + movlps %xmm3, 2 * SIZE(CO2, %rax) + movhps %xmm3, 3 * SIZE(CO2, %rax) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + + decq I + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $1, M + BRANCH + jle .L29 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $8, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 0 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps 2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps 4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps 6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps 10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps 12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps 14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 16 * SIZE(BO), %xmm1 + + subq $ -4 * SIZE, AO + subq $-32 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: + leaq (LDC, LDC, 2), %rax + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO1, LDC), %xmm1 + movhps 1 * SIZE(CO1, LDC), %xmm1 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm2 + movhps 1 * SIZE(CO1, LDC, 2), %xmm2 + movsd 0 * SIZE(CO1, %rax), %xmm3 + movhps 1 * SIZE(CO1, %rax), %xmm3 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movddup %xmm9, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm9, %xmm9 + mulpd %xmm7, %xmm9 + addpd %xmm9, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC) + movhps %xmm1, 1 * SIZE(CO1, LDC) + + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 1 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO1, %rax) + movhps %xmm3, 1 * SIZE(CO1, %rax) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 1 * SIZE(CO2), %xmm0 + movsd 0 * SIZE(CO2, LDC), %xmm1 + movhps 1 * SIZE(CO2, LDC), %xmm1 + + movsd 0 * SIZE(CO2, LDC, 2), %xmm2 + movhps 1 * SIZE(CO2, LDC, 2), %xmm2 + movsd 0 * SIZE(CO2, %rax), %xmm3 + movhps 1 * SIZE(CO2, %rax), %xmm3 + + movddup %xmm10, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm10, %xmm10 + mulpd %xmm7, %xmm10 + addpd %xmm10, %xmm1 + + movddup %xmm11, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm11, %xmm11 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm3 + + movlps %xmm0, 0 * SIZE(CO2) + movhps %xmm0, 1 * SIZE(CO2) + movlps %xmm1, 0 * SIZE(CO2, LDC) + movhps %xmm1, 1 * SIZE(CO2, LDC) + + movlps %xmm2, 0 * SIZE(CO2, LDC, 2) + movhps %xmm2, 1 * SIZE(CO2, LDC, 2) + movlps %xmm3, 0 * SIZE(CO2, %rax) + movhps %xmm3, 1 * SIZE(CO2, %rax) + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $8, KK +#endif + + movq BO, B + + leaq (C, LDC, 8), C + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L30: + testq $4, N + jle .L50 + ALIGN_4 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 + movq A, AO + + movq M, I + sarq $1, I + NOBRANCH + jle .L40 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht0 3 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 7 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht0 3 * SIZE(CO2) + xorps %xmm11, %xmm11 + prefetcht0 7 * SIZE(CO2, LDC, 1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: + addpd %xmm1, %xmm8 + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm10 + addpd %xmm4, %xmm11 + + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + shufpd $2, %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + shufpd $2, %xmm11, %xmm10 + shufpd $2, %xmm0, %xmm11 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhps 3 * SIZE(CO1), %xmm1 + + movsd 0 * SIZE(CO1, LDC), %xmm2 + movhps 1 * SIZE(CO1, LDC), %xmm2 + movsd 2 * SIZE(CO1, LDC), %xmm3 + movhps 3 * SIZE(CO1, LDC), %xmm3 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movddup %xmm9, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm9, %xmm9 + mulpd %xmm7, %xmm9 + addpd %xmm9, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 2 * SIZE(CO1) + movhps %xmm1, 3 * SIZE(CO1) + + movlps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm2, 1 * SIZE(CO1, LDC) + movlps %xmm3, 2 * SIZE(CO1, LDC) + movhps %xmm3, 3 * SIZE(CO1, LDC) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhps 3 * SIZE(CO2), %xmm1 + + movsd 0 * SIZE(CO2, LDC), %xmm2 + movhps 1 * SIZE(CO2, LDC), %xmm2 + movsd 2 * SIZE(CO2, LDC), %xmm3 + movhps 3 * SIZE(CO2, LDC), %xmm3 + + movddup %xmm10, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm10, %xmm10 + mulpd %xmm7, %xmm10 + addpd %xmm10, %xmm1 + + movddup %xmm11, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm11, %xmm11 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm3 + + movlps %xmm0, 0 * SIZE(CO2) + movhps %xmm0, 1 * SIZE(CO2) + movlps %xmm1, 2 * SIZE(CO2) + movhps %xmm1, 3 * SIZE(CO2) + + movlps %xmm2, 0 * SIZE(CO2, LDC) + movhps %xmm2, 1 * SIZE(CO2, LDC) + movlps %xmm3, 2 * SIZE(CO2, LDC) + movhps %xmm3, 3 * SIZE(CO2, LDC) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + decq I + BRANCH + jg .L31 + ALIGN_4 + +.L40: + testq $1, M + BRANCH + jle .L49 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -4 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 0 * SIZE(BO), %xmm1 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO1, LDC), %xmm1 + movhps 1 * SIZE(CO1, LDC), %xmm1 + + movsd 0 * SIZE(CO2), %xmm2 + movhps 1 * SIZE(CO2), %xmm2 + movsd 0 * SIZE(CO2, LDC), %xmm3 + movhps 1 * SIZE(CO2, LDC), %xmm3 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movddup %xmm9, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm9, %xmm9 + mulpd %xmm7, %xmm9 + addpd %xmm9, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC) + movhps %xmm1, 1 * SIZE(CO1, LDC) + + movlps %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 1 * SIZE(CO2) + movlps %xmm3, 0 * SIZE(CO2, LDC) + movhps %xmm3, 1 * SIZE(CO2, LDC) + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + movq BO, B + + leaq (C, LDC, 4), C + ALIGN_4 + +.L50: + testq $2, N + jle .L70 + ALIGN_4 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC), CO2 + movq A, AO + + movq M, I + sarq $1, I + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + + xorps %xmm8, %xmm8 + prefetcht0 3 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 7 * SIZE(CO2) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm10 + movaps -14 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L52 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + ALIGN_3 + +.L55: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm1, %xmm8 + addpd %xmm2, %xmm9 + + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + shufpd $2, %xmm0, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhps 3 * SIZE(CO1), %xmm1 + + movsd 0 * SIZE(CO2), %xmm2 + movhps 1 * SIZE(CO2), %xmm2 + movsd 2 * SIZE(CO2), %xmm3 + movhps 3 * SIZE(CO2), %xmm3 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movddup %xmm9, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm9, %xmm9 + mulpd %xmm7, %xmm9 + addpd %xmm9, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 2 * SIZE(CO1) + movhps %xmm1, 3 * SIZE(CO1) + + movlps %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 1 * SIZE(CO2) + movlps %xmm3, 2 * SIZE(CO2) + movhps %xmm3, 3 * SIZE(CO2) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L69 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -10 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -8 * SIZE(BO), %xmm1 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L62 + ALIGN_3 + +.L65: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: + addpd %xmm9, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm1 + movhps 1 * SIZE(CO2), %xmm1 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 1 * SIZE(CO2) + ALIGN_4 + +.L69: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + movq BO, B + + leaq (C, LDC, 2), C + ALIGN_4 + +.L70: + testq $1, N + jle .L999 + ALIGN_4 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + movq A, AO + + movq M, I + sarq $1, I + NOBRANCH + jle .L80 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + + xorps %xmm8, %xmm8 + prefetcht0 3 * SIZE(CO1) + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_3 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movddup -16 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm9 + movddup -15 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm9 + movddup -13 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L72 + + addpd %xmm9, %xmm8 + ALIGN_3 + +.L75: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + addpd %xmm1, %xmm8 + movddup -16 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_4 + +.L78: + addpd %xmm1, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhps 3 * SIZE(CO1), %xmm1 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 2 * SIZE(CO1) + movhps %xmm1, 3 * SIZE(CO1) + + addq $4 * SIZE, CO1 + decq I + BRANCH + jg .L71 + ALIGN_4 + +.L80: + testq $1, M + BRANCH + jle .L999 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifndef TRMMKERNEL + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 +#else + movsd -16 * SIZE(AO), %xmm0 + movhpd -15 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movsd -16 * SIZE(BO), %xmm1 + movhpd -15 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 +#endif + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L85 + ALIGN_3 + +.L82: + mulpd %xmm0, %xmm1 +#ifndef TRMMKERNEL + movapd -14 * SIZE(AO), %xmm0 +#else + movsd -14 * SIZE(AO), %xmm0 + movhpd -13 * SIZE(AO), %xmm0 +#endif + addpd %xmm1, %xmm8 +#ifndef TRMMKERNEL + movapd -14 * SIZE(BO), %xmm1 +#else + movsd -14 * SIZE(BO), %xmm1 + movhpd -13 * SIZE(BO), %xmm1 +#endif + + mulpd %xmm0, %xmm1 +#ifndef TRMMKERNEL + movapd -12 * SIZE(AO), %xmm0 +#else + movsd -12 * SIZE(AO), %xmm0 + movhpd -11 * SIZE(AO), %xmm0 +#endif + addpd %xmm1, %xmm9 +#ifndef TRMMKERNEL + movapd -12 * SIZE(BO), %xmm1 +#else + movsd -12 * SIZE(BO), %xmm1 + movhpd -11 * SIZE(BO), %xmm1 +#endif + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L82 + + addpd %xmm9, %xmm8 + ALIGN_3 + +.L85: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_3 + +.L86: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm8 + movsd -15 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L86 + ALIGN_4 + +.L88: + haddpd %xmm8, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm3m_kernel_4x2_atom.S b/kernel/x86_64/zgemm3m_kernel_4x2_atom.S new file mode 100644 index 0000000..189505d --- /dev/null +++ b/kernel/x86_64/zgemm3m_kernel_4x2_atom.S @@ -0,0 +1,1215 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbx +#define BB %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KKK 72(%rsp) +#define KK 80(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define ALPHA_R 224(%rsp) +#define ALPHA_I 232(%rsp) +#define OFFSET 240(%rsp) +#define KK 248(%rsp) +#define KKK 256(%rsp) + +#endif + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 8 + 3) + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 + +#else + movq OLD_LDC, LDC +#endif + + movsd %xmm0, ALPHA_R + movsd %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, J + sarq $1, J + jle .L40 + ALIGN_4 + +.L10: + movq C, CO1 + leaq (C, LDC, 1), CO2 + leaq (C, LDC, 2), C + + movq A, AO + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I + jle .L20 + ALIGN_4 + +.L11: + movq B, BO + + prefetcht0 0 * SIZE(BB) + subq $-8 * SIZE, BB + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + prefetcht0 3 * SIZE(CO1) + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + prefetcht0 3 * SIZE(CO2) + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + + movq K, %rax + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + addsd %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO) + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 12 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 13 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 6 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 14 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 15 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + subq $-16 * SIZE, AO + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + addq $ 8 * SIZE, BO + + addsd %xmm4, %xmm10 + movsd 1 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + decq %rax + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 0 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 2 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 1 * SIZE(BO), %xmm3 + + jne .L12 + ALIGN_4 + +.L15: + movq K, %rax + andq $3, %rax + BRANCH + BRANCH + je .L19 + ALIGN_4 + +.L16: + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L16 + ALIGN_4 + +.L19: + movsd ALPHA_R, %xmm4 + addsd %xmm2, %xmm13 + movsd ALPHA_I, %xmm5 + addsd %xmm7, %xmm14 + addsd %xmm6, %xmm15 + + movaps %xmm8, %xmm0 + movaps %xmm10, %xmm1 + movaps %xmm12, %xmm2 + movaps %xmm14, %xmm3 + + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm0 + mulsd %xmm4, %xmm10 + mulsd %xmm5, %xmm1 + mulsd %xmm4, %xmm12 + mulsd %xmm5, %xmm2 + mulsd %xmm4, %xmm14 + mulsd %xmm5, %xmm3 + + addsd 0 * SIZE(CO1), %xmm8 + addsd 1 * SIZE(CO1), %xmm0 + addsd 2 * SIZE(CO1), %xmm10 + addsd 3 * SIZE(CO1), %xmm1 + addsd 4 * SIZE(CO1), %xmm12 + addsd 5 * SIZE(CO1), %xmm2 + addsd 6 * SIZE(CO1), %xmm14 + addsd 7 * SIZE(CO1), %xmm3 + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm0, 1 * SIZE(CO1) + movsd %xmm10, 2 * SIZE(CO1) + movsd %xmm1, 3 * SIZE(CO1) + movsd %xmm12, 4 * SIZE(CO1) + movsd %xmm2, 5 * SIZE(CO1) + movsd %xmm14, 6 * SIZE(CO1) + movsd %xmm3, 7 * SIZE(CO1) + + movaps %xmm9, %xmm0 + movaps %xmm11, %xmm1 + movaps %xmm13, %xmm2 + movaps %xmm15, %xmm3 + + mulsd %xmm4, %xmm9 + mulsd %xmm5, %xmm0 + mulsd %xmm4, %xmm11 + mulsd %xmm5, %xmm1 + mulsd %xmm4, %xmm13 + mulsd %xmm5, %xmm2 + mulsd %xmm4, %xmm15 + mulsd %xmm5, %xmm3 + + addsd 0 * SIZE(CO2), %xmm9 + addsd 1 * SIZE(CO2), %xmm0 + addsd 2 * SIZE(CO2), %xmm11 + addsd 3 * SIZE(CO2), %xmm1 + addsd 4 * SIZE(CO2), %xmm13 + addsd 5 * SIZE(CO2), %xmm2 + addsd 6 * SIZE(CO2), %xmm15 + addsd 7 * SIZE(CO2), %xmm3 + + movsd %xmm9, 0 * SIZE(CO2) + movsd %xmm0, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movsd %xmm1, 3 * SIZE(CO2) + movsd %xmm13, 4 * SIZE(CO2) + movsd %xmm2, 5 * SIZE(CO2) + movsd %xmm15, 6 * SIZE(CO2) + movsd %xmm3, 7 * SIZE(CO2) + + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + jle .L30 + + movq B, BO + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + movsd 3 * SIZE(AO), %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + movq K, %rax + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm8 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + addsd %xmm7, %xmm10 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 6 * SIZE(BO), %xmm1 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + addsd %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 8 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm8 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + addsd %xmm7, %xmm10 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 9 * SIZE(BO), %xmm3 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: + movq K, %rax + movsd ALPHA_R, %xmm5 + movsd ALPHA_I, %xmm7 + + andq $3, %rax + BRANCH + BRANCH + je .L29 + ALIGN_4 + +.L26: + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + mulsd %xmm3, %xmm2 + addsd %xmm0, %xmm8 + movsd 2 * SIZE(AO), %xmm0 + + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + addsd %xmm4, %xmm10 + movsd 3 * SIZE(AO), %xmm4 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L26 + ALIGN_4 + +.L29: + addsd %xmm2, %xmm9 + addsd %xmm6, %xmm11 + + movaps %xmm8, %xmm12 + movaps %xmm10, %xmm13 + movaps %xmm9, %xmm14 + movaps %xmm11, %xmm15 + + mulsd %xmm5, %xmm8 + mulsd %xmm7, %xmm12 + mulsd %xmm5, %xmm10 + mulsd %xmm7, %xmm13 + mulsd %xmm5, %xmm9 + mulsd %xmm7, %xmm14 + mulsd %xmm5, %xmm11 + mulsd %xmm7, %xmm15 + + addsd 0 * SIZE(CO1), %xmm8 + addsd 1 * SIZE(CO1), %xmm12 + addsd 2 * SIZE(CO1), %xmm10 + addsd 3 * SIZE(CO1), %xmm13 + + addsd 0 * SIZE(CO2), %xmm9 + addsd 1 * SIZE(CO2), %xmm14 + addsd 2 * SIZE(CO2), %xmm11 + addsd 3 * SIZE(CO2), %xmm15 + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm12, 1 * SIZE(CO1) + movsd %xmm10, 2 * SIZE(CO1) + movsd %xmm13, 3 * SIZE(CO1) + + movsd %xmm9, 0 * SIZE(CO2) + movsd %xmm14, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movsd %xmm15, 3 * SIZE(CO2) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L30: + testq $1, M + je .L39 + ALIGN_4 + + movq B, BO + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm7, %xmm7 + movsd 1 * SIZE(AO), %xmm2 + xorps %xmm5, %xmm5 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + + movq K, %rax + sarq $2, %rax + je .L35 + ALIGN_4 + +.L32: + addsd %xmm5, %xmm8 + movsd 2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm7, %xmm9 + movsd 3 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm3 + movsd 2 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 4 * SIZE(BO), %xmm1 + mulsd %xmm2, %xmm5 + + addsd %xmm3, %xmm9 + movsd 5 * SIZE(BO), %xmm3 + mulsd %xmm2, %xmm7 + movsd 3 * SIZE(AO), %xmm2 + + addsd %xmm5, %xmm8 + movsd 6 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + + addsd %xmm7, %xmm9 + movsd 7 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm3 + movsd 4 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 8 * SIZE(BO), %xmm1 + mulsd %xmm2, %xmm5 + + addsd %xmm3, %xmm9 + movsd 9 * SIZE(BO), %xmm3 + mulsd %xmm2, %xmm7 + movsd 5 * SIZE(AO), %xmm2 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: + movq K, %rax + + addsd %xmm5, %xmm8 + addsd %xmm7, %xmm9 + + movsd ALPHA_R, %xmm6 + movsd ALPHA_I, %xmm7 + + andq $3, %rax + BRANCH + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulsd %xmm0, %xmm1 + addq $2 * SIZE, BO + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 0 * SIZE(BO), %xmm1 + addsd %xmm3, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + + addq $1 * SIZE, AO + decq %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: + movaps %xmm8, %xmm10 + movaps %xmm9, %xmm11 + + mulsd %xmm6, %xmm8 + mulsd %xmm7, %xmm10 + mulsd %xmm6, %xmm9 + mulsd %xmm7, %xmm11 + + addsd 0 * SIZE(CO1), %xmm8 + addsd 1 * SIZE(CO1), %xmm10 + addsd 0 * SIZE(CO2), %xmm9 + addsd 1 * SIZE(CO2), %xmm11 + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm10, 1 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movsd %xmm11, 1 * SIZE(CO2) + ALIGN_4 + +.L39: + movq BO, B + decq J # j -- + jg .L10 + ALIGN_4 + +.L40: + testq $1, N + je .L999 + + movq C, CO1 + addq LDC, C + + movq A, AO + + movq M, I + sarq $2, I + jle .L50 + ALIGN_4 + +.L41: + movq B, BO + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(AO), %xmm1 + xorps %xmm11, %xmm11 + movsd 2 * SIZE(AO), %xmm2 + xorps %xmm13, %xmm13 + movsd 3 * SIZE(AO), %xmm3 + xorps %xmm15, %xmm15 + + movsd 0 * SIZE(BO), %xmm4 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm5 + xorps %xmm10, %xmm10 + prefetcht0 7 * SIZE(CO1) + xorps %xmm12, %xmm12 + xorps %xmm14, %xmm14 + + movq K, %rax + sarq $2, %rax + je .L45 + ALIGN_4 + +.L42: + addsd %xmm9, %xmm8 + movsd 4 * SIZE(AO), %xmm9 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm11, %xmm10 + movsd 5 * SIZE(AO), %xmm11 + mulsd %xmm4, %xmm1 + + addsd %xmm13, %xmm12 + movsd 6 * SIZE(AO), %xmm13 + mulsd %xmm4, %xmm2 + + addsd %xmm15, %xmm14 + movsd 7 * SIZE(AO), %xmm15 + mulsd %xmm4, %xmm3 + movsd 2 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm9 + + addsd %xmm1, %xmm10 + movsd 9 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm11 + + addsd %xmm2, %xmm12 + movsd 10 * SIZE(AO), %xmm2 + mulsd %xmm5, %xmm13 + + addsd %xmm3, %xmm14 + movsd 11 * SIZE(AO), %xmm3 + mulsd %xmm5, %xmm15 + movsd 3 * SIZE(BO), %xmm5 + + addsd %xmm9, %xmm8 + movsd 12 * SIZE(AO), %xmm9 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + addsd %xmm11, %xmm10 + movsd 13 * SIZE(AO), %xmm11 + mulsd %xmm4, %xmm1 + + addsd %xmm13, %xmm12 + movsd 14 * SIZE(AO), %xmm13 + mulsd %xmm4, %xmm2 + + addsd %xmm15, %xmm14 + movsd 15 * SIZE(AO), %xmm15 + mulsd %xmm4, %xmm3 + movsd 4 * SIZE(BO), %xmm4 + subq $-16 * SIZE, AO + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm9 + + addsd %xmm1, %xmm10 + movsd 1 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm11 + addq $ 4 * SIZE, BO + + addsd %xmm2, %xmm12 + movsd 2 * SIZE(AO), %xmm2 + mulsd %xmm5, %xmm13 + decq %rax + + addsd %xmm3, %xmm14 + movsd 3 * SIZE(AO), %xmm3 + mulsd %xmm5, %xmm15 + movsd 1 * SIZE(BO), %xmm5 + + jne .L42 + ALIGN_4 + +.L45: + movq K, %rax + + movsd ALPHA_R, %xmm6 + movsd ALPHA_I, %xmm7 + + addsd %xmm9, %xmm8 + addsd %xmm11, %xmm10 + addsd %xmm13, %xmm12 + addsd %xmm15, %xmm14 + + andq $3, %rax + BRANCH + BRANCH + je .L49 + ALIGN_4 + +.L46: + mulsd %xmm4, %xmm0 + mulsd %xmm4, %xmm1 + mulsd %xmm4, %xmm2 + mulsd %xmm4, %xmm3 + movsd 1 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm10 + movsd 5 * SIZE(AO), %xmm1 + addsd %xmm2, %xmm12 + movsd 6 * SIZE(AO), %xmm2 + addsd %xmm3, %xmm14 + movsd 7 * SIZE(AO), %xmm3 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + decq %rax + BRANCH + jg .L46 + ALIGN_4 + +.L49: + movaps %xmm8, %xmm9 + movaps %xmm10, %xmm11 + movaps %xmm12, %xmm13 + movaps %xmm14, %xmm15 + + mulsd %xmm6, %xmm8 + mulsd %xmm7, %xmm9 + mulsd %xmm6, %xmm10 + mulsd %xmm7, %xmm11 + mulsd %xmm6, %xmm12 + mulsd %xmm7, %xmm13 + mulsd %xmm6, %xmm14 + mulsd %xmm7, %xmm15 + + addsd 0 * SIZE(CO1), %xmm8 + addsd 1 * SIZE(CO1), %xmm9 + addsd 2 * SIZE(CO1), %xmm10 + addsd 3 * SIZE(CO1), %xmm11 + addsd 4 * SIZE(CO1), %xmm12 + addsd 5 * SIZE(CO1), %xmm13 + addsd 6 * SIZE(CO1), %xmm14 + addsd 7 * SIZE(CO1), %xmm15 + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm9, 1 * SIZE(CO1) + movsd %xmm10, 2 * SIZE(CO1) + movsd %xmm11, 3 * SIZE(CO1) + movsd %xmm12, 4 * SIZE(CO1) + movsd %xmm13, 5 * SIZE(CO1) + movsd %xmm14, 6 * SIZE(CO1) + movsd %xmm15, 7 * SIZE(CO1) + + addq $8 * SIZE, CO1 + + decq I # i -- + jg .L41 + ALIGN_4 + +.L50: + testq $2, M + jle .L60 + + movq B, BO + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm1 + xorps %xmm3, %xmm3 + + movsd 0 * SIZE(BO), %xmm4 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm5 + xorps %xmm10, %xmm10 + + movq K, %rax + sarq $2, %rax + je .L55 + ALIGN_4 + +.L52: + addsd %xmm2, %xmm8 + movsd 2 * SIZE(AO), %xmm2 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm3, %xmm10 + movsd 3 * SIZE(AO), %xmm3 + mulsd %xmm4, %xmm1 + movsd 2 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm2 + addq $8 * SIZE, AO + + addsd %xmm1, %xmm10 + movsd -3 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm3 + movsd 3 * SIZE(BO), %xmm5 + + addsd %xmm2, %xmm8 + movsd -2 * SIZE(AO), %xmm2 + mulsd %xmm4, %xmm0 + addq $4 * SIZE, BO + + addsd %xmm3, %xmm10 + movsd -1 * SIZE(AO), %xmm3 + mulsd %xmm4, %xmm1 + movsd 0 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm2 + decq %rax + + addsd %xmm1, %xmm10 + movsd 1 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm3 + movsd 1 * SIZE(BO), %xmm5 + + jne .L52 + ALIGN_4 + +.L55: + movq K, %rax + movsd ALPHA_R, %xmm6 + movsd ALPHA_I, %xmm7 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm10 + + andq $3, %rax + BRANCH + BRANCH + je .L59 + ALIGN_4 + +.L56: + mulsd %xmm4, %xmm0 + mulsd %xmm4, %xmm1 + movsd 1 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 2 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm10 + movsd 3 * SIZE(AO), %xmm1 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + decq %rax + BRANCH + jg .L56 + ALIGN_4 + +.L59: + movaps %xmm8, %xmm9 + movaps %xmm10, %xmm11 + + mulsd %xmm6, %xmm8 + mulsd %xmm7, %xmm9 + mulsd %xmm6, %xmm10 + mulsd %xmm7, %xmm11 + + addsd 0 * SIZE(CO1), %xmm8 + addsd 1 * SIZE(CO1), %xmm9 + addsd 2 * SIZE(CO1), %xmm10 + addsd 3 * SIZE(CO1), %xmm11 + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm9, 1 * SIZE(CO1) + movsd %xmm10, 2 * SIZE(CO1) + movsd %xmm11, 3 * SIZE(CO1) + + addq $4 * SIZE, CO1 + ALIGN_4 + +.L60: + testq $1, M + je .L999 + ALIGN_4 + + movq B, BO + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm5, %xmm5 + movsd 1 * SIZE(AO), %xmm2 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm9, %xmm9 + movsd 2 * SIZE(AO), %xmm4 + movsd 3 * SIZE(AO), %xmm6 + + movq K, %rax + sarq $2, %rax + je .L65 + ALIGN_4 + +.L62: + addsd %xmm5, %xmm8 + movsd 2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + movsd 4 * SIZE(AO), %xmm0 + + addsd %xmm7, %xmm9 + movsd 3 * SIZE(BO), %xmm7 + mulsd %xmm2, %xmm3 + movsd 5 * SIZE(AO), %xmm2 + + addsd %xmm1, %xmm8 + movsd 4 * SIZE(BO), %xmm1 + mulsd %xmm4, %xmm5 + movsd 6 * SIZE(AO), %xmm4 + + addsd %xmm3, %xmm9 + movsd 5 * SIZE(BO), %xmm3 + mulsd %xmm6, %xmm7 + movsd 7 * SIZE(AO), %xmm6 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + decq %rax + jne .L62 + + addsd %xmm5, %xmm8 + addsd %xmm7, %xmm9 + ALIGN_4 + +.L65: + movq K, %rax + movsd ALPHA_R, %xmm6 + movsd ALPHA_I, %xmm7 + + andq $3, %rax + BRANCH + BRANCH + je .L68 + ALIGN_4 + +.L66: + movsd 0 * SIZE(AO), %xmm0 + movsd 0 * SIZE(BO), %xmm1 + + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm8 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + decq %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: + addsd %xmm9, %xmm8 + + movaps %xmm8, %xmm9 + mulsd %xmm6, %xmm8 + mulsd %xmm7, %xmm9 + + addsd 0 * SIZE(CO1), %xmm8 + addsd 1 * SIZE(CO1), %xmm9 + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm9, 1 * SIZE(CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm3m_kernel_4x4_barcelona.S b/kernel/x86_64/zgemm3m_kernel_4x4_barcelona.S new file mode 100644 index 0000000..4199bd9 --- /dev/null +++ b/kernel/x86_64/zgemm3m_kernel_4x4_barcelona.S @@ -0,0 +1,2467 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define BUFFERED + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %r12 +#define BB %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define BUFFER 512(%rsp) + +#define PREFETCH prefetch +#define PREFETCHSIZE (8 * 21 + 0) + +#define RPREFETCHSIZE (8 * 14 + 0) +#define WPREFETCHSIZE (8 * 6 + 0) + +#define movlpd movsd +#define movapd movups +#define movupd movups + +#define KERNEL1(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO, %rax, 4) ;\ + movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + +#define KERNEL2(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + addpd %xmm1, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL3(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movddup (BO, %rax, 4), %xmm1 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ + addpd %xmm5, %xmm14 ;\ + movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL4(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + movapd (AO, %rax, 4), %xmm6 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + addpd %xmm5, %xmm14 ;\ + movddup 8 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm6, %xmm2 + +#define KERNEL5(xx) \ + mulpd %xmm1, %xmm6 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm8 ;\ + movapd %xmm2, %xmm6 ;\ + addpd %xmm1, %xmm12 ;\ + movddup 2 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd 8 * SIZE(AO, %rax, 4), %xmm7 ;\ + movapd %xmm6, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 3 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm6 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm10 ;\ + movapd 4 * SIZE(AO, %rax, 4), %xmm6 ;\ + addpd %xmm1, %xmm14 ;\ + movddup 4 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm6, %xmm2 + +#define KERNEL6(xx) \ + mulpd %xmm1, %xmm6 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm8 ;\ + movapd %xmm2, %xmm6 ;\ + addpd %xmm1, %xmm12 ;\ + movddup 6 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm6, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 7 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm6 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm10 ;\ + movapd 16 * SIZE(AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 9 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm7, %xmm2 + +#define KERNEL7(xx) \ + mulpd %xmm5, %xmm7 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm8 ;\ + movddup 16 * SIZE(BO, %rax, 4), %xmm1 ;\ + movapd %xmm2, %xmm7 ;\ + addpd %xmm5, %xmm12 ;\ + movddup 10 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm7, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 11 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm7 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm10 ;\ + movapd 12 * SIZE(AO, %rax, 4), %xmm7 ;\ + addpd %xmm5, %xmm14 ;\ + movddup 12 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 13 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm7, %xmm2 + +#define KERNEL8(xx) \ + mulpd %xmm5, %xmm7 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm8 ;\ + movapd %xmm2, %xmm7 ;\ + addpd %xmm5, %xmm12 ;\ + movddup 14 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm7, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 15 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm7 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm10 ;\ + movapd 24 * SIZE(AO, %rax, 4), %xmm4 ;\ + addpd %xmm5, %xmm14 ;\ + movddup 24 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 ;\ + addq $8 * SIZE, %rax + +#define KERNEL_SUB1(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + +#define KERNEL_SUB2(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd (AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup (BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL_SUB3(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ + addpd %xmm5, %xmm14 ;\ + movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL_SUB4(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + addpd %xmm5, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + +#ifndef __APPLE__ + .align 512 +#endif +#if defined(OS_LINUX) && defined(CORE_BARCELONA) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif +#endif + + movq %rsp, %rbx # save old stack + subq $1024 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A +#ifndef BUFFERED + subq $-16 * SIZE, B +#endif + + movsd %xmm0, 0 + ALPHA + movsd %xmm1, 8 + ALPHA + + salq $ZBASE_SHIFT, LDC + +#ifdef TRMMKERNEL + movsd %xmm12, OFFSET + movsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + ALIGN_4 + +.L01: +#ifdef BUFFERED + leaq 16 * SIZE + BUFFER, BO +#endif + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#ifdef BUFFERED + movq K, %rax + sarq $2, %rax + jle .L03 + ALIGN_3 + +.L02: + prefetch (RPREFETCHSIZE + 0) * SIZE(B) + + movaps (B), %xmm0 + movaps 2 * SIZE(B), %xmm1 + + movaps %xmm0, -16 * SIZE(BO) + movaps %xmm1, -14 * SIZE(BO) + + prefetch (RPREFETCHSIZE + 8) * SIZE(B) + + movaps 4 * SIZE(B), %xmm2 + movaps 6 * SIZE(B), %xmm3 + + movaps %xmm2, -12 * SIZE(BO) + movaps %xmm3, -10 * SIZE(BO) + + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) + + movaps 8 * SIZE(B), %xmm4 + movaps 10 * SIZE(B), %xmm5 + + movaps %xmm4, -8 * SIZE(BO) + movaps %xmm5, -6 * SIZE(BO) + + prefetchw (WPREFETCHSIZE + 8) * SIZE(BO) + + movaps 12 * SIZE(B), %xmm6 + movaps 14 * SIZE(B), %xmm7 + + movaps %xmm6, -4 * SIZE(BO) + movaps %xmm7, -2 * SIZE(BO) + + subq $-16 * SIZE, BO + subq $-16 * SIZE, B + + subq $1, %rax + jne .L02 + ALIGN_3 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_3 + +.L04: + movaps (B), %xmm0 + movaps %xmm0, -16 * SIZE(BO) + + movaps 2 * SIZE(B), %xmm1 + movaps %xmm1, -14 * SIZE(BO) + + addq $4 * SIZE, B + addq $4 * SIZE, BO + subq $1, %rax + jne .L04 + ALIGN_4 + +.L10: +#endif + movq A, AO # aoffset = a + movq B, BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef BUFFERED + leaq 16 * SIZE + BUFFER, BO +#else + movq B, BO +#endif +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + prefetch (RPREFETCHSIZE + 0) * SIZE(BB) + prefetch (RPREFETCHSIZE + 8) * SIZE(BB) + prefetch (RPREFETCHSIZE + 16) * SIZE(BB) + subq $-16 * SIZE, BB + + movapd -16 * SIZE(AO), %xmm0 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm8, %xmm8 + movddup -15 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm4 + pxor %xmm10, %xmm10 + movddup -8 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + + prefetchw 7 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetchw 7 * SIZE(CO2) + pxor %xmm13, %xmm13 + prefetchw 7 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + prefetchw 7 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 + movapd %xmm0, %xmm2 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_4 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + BRANCH + jl .L12 + ALIGN_4 + +.L15: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + testq $4, %rax + je .L16 + xorq %rax, %rax + ALIGN_4 + + KERNEL_SUB1(16 * 0) + KERNEL_SUB2(16 * 0) + KERNEL_SUB3(16 * 0) + KERNEL_SUB4(16 * 0) + + subq $-16 * SIZE, BO + subq $-16 * SIZE, AO + ALIGN_4 + +.L16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L19 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L17: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd %xmm2, %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm3, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm9 + movapd %xmm0, %xmm2 + addpd %xmm3, %xmm13 + movddup -13 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm10 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm14 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm3, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm11 + addpd %xmm3, %xmm15 + movddup -11 * SIZE(BO, %rax, 4), %xmm3 + movapd %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L17 + ALIGN_4 + +.L19: + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movsd 4 * SIZE(CO1), %xmm2 + movhpd 5 * SIZE(CO1), %xmm2 + movsd 6 * SIZE(CO1), %xmm3 + movhpd 7 * SIZE(CO1), %xmm3 + + movddup %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + movddup %xmm12, %xmm5 + unpckhpd %xmm12, %xmm12 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm12 + + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm12, %xmm3 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 4 * SIZE(CO1) + movhpd %xmm2, 5 * SIZE(CO1) + movsd %xmm3, 6 * SIZE(CO1) + movhpd %xmm3, 7 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhpd 3 * SIZE(CO2), %xmm1 + + movsd 4 * SIZE(CO2), %xmm2 + movhpd 5 * SIZE(CO2), %xmm2 + movsd 6 * SIZE(CO2), %xmm3 + movhpd 7 * SIZE(CO2), %xmm3 + + movddup %xmm9, %xmm4 + unpckhpd %xmm9, %xmm9 + movddup %xmm13, %xmm5 + unpckhpd %xmm13, %xmm13 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm13 + + addpd %xmm4, %xmm0 + addpd %xmm9, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm13, %xmm3 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + movsd %xmm1, 2 * SIZE(CO2) + movhpd %xmm1, 3 * SIZE(CO2) + + movsd %xmm2, 4 * SIZE(CO2) + movhpd %xmm2, 5 * SIZE(CO2) + movsd %xmm3, 6 * SIZE(CO2) + movhpd %xmm3, 7 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 + movsd 2 * SIZE(CO1, LDC, 2), %xmm1 + movhpd 3 * SIZE(CO1, LDC, 2), %xmm1 + + movsd 4 * SIZE(CO1, LDC, 2), %xmm2 + movhpd 5 * SIZE(CO1, LDC, 2), %xmm2 + movsd 6 * SIZE(CO1, LDC, 2), %xmm3 + movhpd 7 * SIZE(CO1, LDC, 2), %xmm3 + + movddup %xmm10, %xmm4 + unpckhpd %xmm10, %xmm10 + movddup %xmm14, %xmm5 + unpckhpd %xmm14, %xmm14 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm10 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm14 + + addpd %xmm4, %xmm0 + addpd %xmm10, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm14, %xmm3 + + movsd %xmm0, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) + movsd %xmm1, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm1, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm2, 4 * SIZE(CO1, LDC, 2) + movhpd %xmm2, 5 * SIZE(CO1, LDC, 2) + movsd %xmm3, 6 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 7 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 + movsd 2 * SIZE(CO2, LDC, 2), %xmm1 + movhpd 3 * SIZE(CO2, LDC, 2), %xmm1 + + movsd 4 * SIZE(CO2, LDC, 2), %xmm2 + movhpd 5 * SIZE(CO2, LDC, 2), %xmm2 + movsd 6 * SIZE(CO2, LDC, 2), %xmm3 + movhpd 7 * SIZE(CO2, LDC, 2), %xmm3 + + movddup %xmm11, %xmm4 + unpckhpd %xmm11, %xmm11 + movddup %xmm15, %xmm5 + unpckhpd %xmm15, %xmm15 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm11 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm15 + + addpd %xmm4, %xmm0 + addpd %xmm11, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm15, %xmm3 + + movsd %xmm0, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) + movsd %xmm1, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm1, 3 * SIZE(CO2, LDC, 2) + + movsd %xmm2, 4 * SIZE(CO2, LDC, 2) + movhpd %xmm2, 5 * SIZE(CO2, LDC, 2) + movsd %xmm3, 6 * SIZE(CO2, LDC, 2) + movhpd %xmm3, 7 * SIZE(CO2, LDC, 2) + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $3, M + je .L39 + + testq $2, M + je .L30 + ALIGN_4 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef BUFFERED + leaq 16 * SIZE + BUFFER, BO +#else + movq B, BO +#endif +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -12 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + movddup -8 * SIZE(BO), %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L26 + ALIGN_4 + +.L22: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -13 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -10 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -9 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup (BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -8 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -7 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm8 + movddup -6 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm9 + movddup -5 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm10 + movddup -4 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + movapd -10 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm5, %xmm11 + movddup -3 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm8 + movddup -2 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm9 + movddup -1 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm10 + movddup 8 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + movapd -4 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm5, %xmm11 + movddup 1 * SIZE(BO, %rax, 4), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L22 + ALIGN_4 + +.L26: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L29 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L27: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -13 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 4), %xmm5 + + addq $SIZE, %rax + jl .L27 + ALIGN_4 + +.L29: + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movddup %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhpd 3 * SIZE(CO2), %xmm1 + + movddup %xmm9, %xmm4 + unpckhpd %xmm9, %xmm9 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm9 + addpd %xmm4, %xmm0 + addpd %xmm9, %xmm1 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + movsd %xmm1, 2 * SIZE(CO2) + movhpd %xmm1, 3 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 + movsd 2 * SIZE(CO1, LDC, 2), %xmm1 + movhpd 3 * SIZE(CO1, LDC, 2), %xmm1 + + movddup %xmm10, %xmm4 + unpckhpd %xmm10, %xmm10 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm10 + addpd %xmm4, %xmm0 + addpd %xmm10, %xmm1 + + movsd %xmm0, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) + movsd %xmm1, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm1, 3 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 + movsd 2 * SIZE(CO2, LDC, 2), %xmm1 + movhpd 3 * SIZE(CO2, LDC, 2), %xmm1 + + movddup %xmm11, %xmm4 + unpckhpd %xmm11, %xmm11 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm11 + addpd %xmm4, %xmm0 + addpd %xmm11, %xmm1 + + movsd %xmm0, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) + movsd %xmm1, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm1, 3 * SIZE(CO2, LDC, 2) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef BUFFERED + leaq 16 * SIZE + BUFFER, BO +#else + movq B, BO +#endif +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movddup -14 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -15 * SIZE(AO), %xmm4 + pxor %xmm10, %xmm10 + movapd -16 * SIZE(BO), %xmm1 + pxor %xmm11, %xmm11 + movapd -8 * SIZE(BO), %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L36 + ALIGN_4 + +.L32: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO, %rax, 4), %xmm0 + addpd %xmm1, %xmm8 + movapd -12 * SIZE(BO, %rax, 4), %xmm1 + addpd %xmm0, %xmm9 + movddup -12 * SIZE(AO, %rax, 1), %xmm0 + mulpd %xmm4, %xmm1 + mulpd -10 * SIZE(BO, %rax, 4), %xmm4 + addpd %xmm1, %xmm10 + movapd (BO, %rax, 4), %xmm1 + addpd %xmm4, %xmm11 + movddup -11 * SIZE(AO, %rax, 1), %xmm4 + mulpd %xmm2, %xmm3 + mulpd -6 * SIZE(BO, %rax, 4), %xmm2 + addpd %xmm3, %xmm8 + movapd -4 * SIZE(BO, %rax, 4), %xmm3 + addpd %xmm2, %xmm9 + movddup -13 * SIZE(AO, %rax, 1), %xmm2 + mulpd %xmm2, %xmm3 + mulpd -2 * SIZE(BO, %rax, 4), %xmm2 + addpd %xmm3, %xmm10 + movapd 8 * SIZE(BO, %rax, 4), %xmm3 + addpd %xmm2, %xmm11 + movddup -10 * SIZE(AO, %rax, 1), %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L32 + ALIGN_4 + +.L36: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L38 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L37: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO, %rax, 4), %xmm0 + addpd %xmm1, %xmm8 + movapd -12 * SIZE(BO, %rax, 4), %xmm1 + addpd %xmm0, %xmm9 + movddup -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L37 + ALIGN_4 + +.L38: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm0 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 + + movddup %xmm9, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 + + unpckhpd %xmm9, %xmm9 + mulpd %xmm7, %xmm9 + addpd %xmm9, %xmm0 + + movsd %xmm0, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) + + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + +#ifndef BUFFERED + movq BO, B +#endif + + leaq (C, LDC, 4), C # c += 4 * ldc + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $3, N + je .L999 + + testq $2, N + je .L80 + ALIGN_4 + +.L41: +#ifdef BUFFERED + leaq 16 * SIZE + BUFFER, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#ifdef BUFFERED + movq K, %rax + sarq $2, %rax + jle .L43 + ALIGN_4 + +.L42: + prefetchnta (RPREFETCHSIZE + 0) * SIZE(B) + + movaps (B), %xmm0 + movaps %xmm0, -16 * SIZE(BO) + + movaps 2 * SIZE(B), %xmm1 + movaps %xmm1, -14 * SIZE(BO) + + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) + + movaps 4 * SIZE(B), %xmm2 + movaps %xmm2, -12 * SIZE(BO) + + movaps 6 * SIZE(B), %xmm3 + movaps %xmm3, -10 * SIZE(BO) + + subq $-8 * SIZE, BO + subq $-8 * SIZE, B + + subq $1, %rax + jne .L42 + ALIGN_4 + +.L43: + movq K, %rax + andq $3, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L44: + movaps (B), %xmm0 + movaps %xmm0, -16 * SIZE(BO) + + addq $2 * SIZE, B + addq $2 * SIZE, BO + subq $1, %rax + jne .L44 + ALIGN_4 + +.L50: +#endif + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef BUFFERED + leaq 16 * SIZE + BUFFER, BO +#else + movq B, BO +#endif +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movddup -16 * SIZE(BO), %xmm1 + movddup -15 * SIZE(BO), %xmm5 + pxor %xmm8, %xmm8 + movddup -12 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm12, %xmm12 + movapd -8 * SIZE(AO), %xmm4 + pxor %xmm13, %xmm13 + prefetchw 7 * SIZE(CO1) + movapd %xmm0, %xmm2 + prefetchw 7 * SIZE(CO2) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L56 + ALIGN_4 + +.L52: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -13 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + mulpd %xmm1, %xmm0 + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd (AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -8 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -10 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -11 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm4, %xmm2 + mulpd %xmm3, %xmm4 + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm4, %xmm8 + movapd -4 * SIZE(AO, %rax, 4), %xmm4 + addpd %xmm3, %xmm12 + movddup -10 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -9 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm4, %xmm2 + mulpd %xmm3, %xmm4 + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm4, %xmm8 + movapd 8 * SIZE(AO, %rax, 4), %xmm4 + addpd %xmm3, %xmm12 + movddup -4 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -7 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L52 + ALIGN_4 + +.L56: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L59 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L57: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -13 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L57 + ALIGN_4 + +.L59: + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movsd 4 * SIZE(CO1), %xmm2 + movhpd 5 * SIZE(CO1), %xmm2 + movsd 6 * SIZE(CO1), %xmm3 + movhpd 7 * SIZE(CO1), %xmm3 + + movddup %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + movddup %xmm12, %xmm5 + unpckhpd %xmm12, %xmm12 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm12 + + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm12, %xmm3 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 4 * SIZE(CO1) + movhpd %xmm2, 5 * SIZE(CO1) + movsd %xmm3, 6 * SIZE(CO1) + movhpd %xmm3, 7 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhpd 3 * SIZE(CO2), %xmm1 + + movsd 4 * SIZE(CO2), %xmm2 + movhpd 5 * SIZE(CO2), %xmm2 + movsd 6 * SIZE(CO2), %xmm3 + movhpd 7 * SIZE(CO2), %xmm3 + + movddup %xmm9, %xmm4 + unpckhpd %xmm9, %xmm9 + movddup %xmm13, %xmm5 + unpckhpd %xmm13, %xmm13 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm13 + + addpd %xmm4, %xmm0 + addpd %xmm9, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm13, %xmm3 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + movsd %xmm1, 2 * SIZE(CO2) + movhpd %xmm1, 3 * SIZE(CO2) + + movsd %xmm2, 4 * SIZE(CO2) + movhpd %xmm2, 5 * SIZE(CO2) + movsd %xmm3, 6 * SIZE(CO2) + movhpd %xmm3, 7 * SIZE(CO2) + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + ALIGN_4 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef BUFFERED + leaq 16 * SIZE + BUFFER, BO +#else + movq B, BO +#endif +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -12 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L66 + ALIGN_4 + +.L62: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm9 + movddup -13 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -8 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm11 + movddup -11 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm8 + movddup -10 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm2, %xmm3 + movapd -10 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm3, %xmm9 + movddup -9 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm10 + movddup -8 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm2, %xmm3 + movapd -4 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm3, %xmm11 + movddup -7 * SIZE(BO, %rax, 2), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L62 + ALIGN_4 + +.L66: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L69 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L67: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm9 + movddup -13 * SIZE(BO, %rax, 2), %xmm3 + + addq $SIZE, %rax + jl .L67 + ALIGN_4 + +.L69: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movddup %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhpd 3 * SIZE(CO2), %xmm1 + + movddup %xmm9, %xmm4 + unpckhpd %xmm9, %xmm9 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm9 + addpd %xmm4, %xmm0 + addpd %xmm9, %xmm1 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + movsd %xmm1, 2 * SIZE(CO2) + movhpd %xmm1, 3 * SIZE(CO2) + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L70: + testq $1, M + je .L79 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef BUFFERED + leaq 16 * SIZE + BUFFER, BO +#else + movq B, BO +#endif +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movddup -15 * SIZE(AO), %xmm1 + pxor %xmm9, %xmm9 + movddup -14 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movddup -13 * SIZE(AO), %xmm3 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L76 + ALIGN_4 + +.L72: + mulpd -16 * SIZE(BO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -12 * SIZE(AO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(BO, %rax, 2), %xmm1 + addpd %xmm1, %xmm9 + movddup -11 * SIZE(AO, %rax, 1), %xmm1 + + mulpd -12 * SIZE(BO, %rax, 2), %xmm2 + addpd %xmm2, %xmm10 + movddup -10 * SIZE(AO, %rax, 1), %xmm2 + + mulpd -10 * SIZE(BO, %rax, 2), %xmm3 + addpd %xmm3, %xmm11 + movddup -9 * SIZE(AO, %rax, 1), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L72 + ALIGN_4 + +.L76: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L78 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L77: + mulpd -16 * SIZE(BO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L77 + ALIGN_4 + +.L78: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm0 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + +#ifndef BUFFERED + movq BO, B +#endif + + leaq (C, LDC, 2), C + ALIGN_4 + +.L80: + testq $1, N + je .L999 + ALIGN_4 + +.L81: +#ifdef BUFFERED + leaq 16 * SIZE + BUFFER, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#ifdef BUFFERED + movq K, %rax + sarq $3, %rax + jle .L83 + ALIGN_4 + +.L82: + prefetchnta (RPREFETCHSIZE + 0) * SIZE(B) + + movaps (B), %xmm0 + movaps %xmm0, -16 * SIZE(BO) + + movaps 2 * SIZE(B), %xmm1 + movaps %xmm1, -14 * SIZE(BO) + + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) + + movaps 4 * SIZE(B), %xmm2 + movaps %xmm2, -12 * SIZE(BO) + + movaps 6 * SIZE(B), %xmm3 + movaps %xmm3, -10 * SIZE(BO) + + subq $-8 * SIZE, BO + subq $-8 * SIZE, B + + subq $1, %rax + jne .L82 + ALIGN_4 + +.L83: + movq K, %rax + andq $7, %rax + BRANCH + jle .L90 + ALIGN_4 + +.L84: + movsd (B), %xmm0 + movlpd %xmm0, -16 * SIZE(BO) + + addq $1 * SIZE, B + addq $1 * SIZE, BO + decq %rax + jne .L84 + ALIGN_4 + +.L90: +#endif + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef BUFFERED + leaq 16 * SIZE + BUFFER, BO +#else + movq B, BO +#endif +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm8, %xmm8 + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm12, %xmm12 + movddup -14 * SIZE(BO), %xmm3 + pxor %xmm13, %xmm13 + movddup -15 * SIZE(BO), %xmm5 + + prefetchw 3 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L96 + ALIGN_4 + +.L92: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -12 * SIZE(BO, %rax, 1), %xmm1 + mulpd %xmm5, %xmm0 + mulpd -10 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm0, %xmm9 + movapd (AO, %rax, 4), %xmm0 + addpd %xmm5, %xmm13 + movddup -13 * SIZE(BO, %rax, 1), %xmm5 + mulpd %xmm3, %xmm2 + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm8 + movapd -4 * SIZE(AO, %rax, 4), %xmm2 + addpd %xmm3, %xmm12 + movddup -10 * SIZE(BO, %rax, 1), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + movapd 8 * SIZE(AO, %rax, 4), %xmm2 + addpd %xmm5, %xmm13 + movddup -11 * SIZE(BO, %rax, 1), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L92 + ALIGN_4 + +.L96: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L99 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L97: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -15 * SIZE(BO, %rax, 1), %xmm1 + + addq $SIZE, %rax + jl .L97 + ALIGN_4 + +.L99: + addpd %xmm9, %xmm8 + addpd %xmm13, %xmm12 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movsd 4 * SIZE(CO1), %xmm2 + movhpd 5 * SIZE(CO1), %xmm2 + movsd 6 * SIZE(CO1), %xmm3 + movhpd 7 * SIZE(CO1), %xmm3 + + movddup %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + movddup %xmm12, %xmm5 + unpckhpd %xmm12, %xmm12 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm12 + + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm12, %xmm3 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 4 * SIZE(CO1) + movhpd %xmm2, 5 * SIZE(CO1) + movsd %xmm3, 6 * SIZE(CO1) + movhpd %xmm3, 7 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 4 + + decq I # i -- + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + ALIGN_4 + +.L101: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef BUFFERED + leaq 16 * SIZE + BUFFER, BO +#else + movq B, BO +#endif +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + + movddup -16 * SIZE(BO), %xmm0 + pxor %xmm8, %xmm8 + movddup -15 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movddup -14 * SIZE(BO), %xmm2 + pxor %xmm10, %xmm10 + movddup -13 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L106 + ALIGN_4 + +.L102: + mulpd -16 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -12 * SIZE(BO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(AO, %rax, 2), %xmm1 + addpd %xmm1, %xmm9 + movddup -11 * SIZE(BO, %rax, 1), %xmm1 + + mulpd -12 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm2, %xmm10 + movddup -10 * SIZE(BO, %rax, 1), %xmm2 + + mulpd -10 * SIZE(AO, %rax, 2), %xmm3 + addpd %xmm3, %xmm11 + movddup -9 * SIZE(BO, %rax, 1), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L102 + ALIGN_4 + +.L106: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L109 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L107: + movddup -16 * SIZE(BO, %rax, 1), %xmm0 + mulpd -16 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + + addq $SIZE, %rax + jl .L107 + ALIGN_4 + +.L109: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movddup %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + addq $4 * SIZE, CO1 + ALIGN_4 + +.L110: + testq $1, M + je .L999 + ALIGN_4 + +.L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef BUFFERED + leaq 16 * SIZE + BUFFER, BO +#else + movq B, BO +#endif +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -14 * SIZE(AO), %xmm1 + pxor %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L116 + ALIGN_4 + +.L112: + mulpd -16 * SIZE(BO, %rax, 1), %xmm0 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(BO, %rax, 1), %xmm1 + addpd %xmm1, %xmm9 + movapd -10 * SIZE(AO, %rax, 1), %xmm1 + + addq $4 * SIZE, %rax + BRANCH + jl .L112 + ALIGN_4 + +.L116: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L118 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L117: + mulsd -16 * SIZE(BO, %rax, 1), %xmm0 + addsd %xmm0, %xmm8 + movsd -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L117 + ALIGN_4 + +.L118: + addpd %xmm9, %xmm8 + haddpd %xmm8, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + + movddup %xmm8, %xmm4 + + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + ALIGN_3 + +.L999: + movq %rbx, %rsp + + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm3m_kernel_4x4_core2.S b/kernel/x86_64/zgemm3m_kernel_4x4_core2.S new file mode 100644 index 0000000..1b466fb --- /dev/null +++ b/kernel/x86_64/zgemm3m_kernel_4x4_core2.S @@ -0,0 +1,2282 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCH_W (PREFETCH_R * 2) + +#define PREFETCHSIZE (8 * 13 + 5) +#define PREFETCH prefetcht0 + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif +#endif + + movq %rsp, %r15 # save old stack + subq $256 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movsd %xmm0, 0 + ALPHA + movsd %xmm1, 8 + ALPHA + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + + salq $ZBASE_SHIFT, LDC + +#ifdef TRMMKERNEL + movsd %xmm12, OFFSET + movsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq N, J + sarq $2, J + NOBRANCH + jle .L40 + ALIGN_4 + +.L01: +/* Copying to Sub Buffer */ + leaq 16 * SIZE + BUFFER, BO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $2, %rax + NOBRANCH + jle .L05 + ALIGN_4 + +.L02: + movapd -16 * SIZE(B), %xmm0 + prefetchnta (PREFETCH_R + 0) * SIZE(B) + movapd -14 * SIZE(B), %xmm1 + movapd -12 * SIZE(B), %xmm2 + movapd -10 * SIZE(B), %xmm3 + movapd -8 * SIZE(B), %xmm4 + movapd -6 * SIZE(B), %xmm5 + movapd -4 * SIZE(B), %xmm6 + movapd -2 * SIZE(B), %xmm7 + + movddup %xmm0, %xmm8 + unpckhpd %xmm0, %xmm0 + prefetchnta (PREFETCH_R + 8) * SIZE(B) + movddup %xmm1, %xmm9 + unpckhpd %xmm1, %xmm1 + movddup %xmm2, %xmm10 + unpckhpd %xmm2, %xmm2 + movddup %xmm3, %xmm11 + unpckhpd %xmm3, %xmm3 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) + movddup %xmm4, %xmm12 + unpckhpd %xmm4, %xmm4 + movddup %xmm5, %xmm13 + unpckhpd %xmm5, %xmm5 + movddup %xmm6, %xmm14 + unpckhpd %xmm6, %xmm6 + movddup %xmm7, %xmm15 + unpckhpd %xmm7, %xmm7 + + prefetcht0 (PREFETCH_W + 8) * SIZE(BO) + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm0, -14 * SIZE(BO) + movapd %xmm9, -12 * SIZE(BO) + movapd %xmm1, -10 * SIZE(BO) + movapd %xmm10, -8 * SIZE(BO) + movapd %xmm2, -6 * SIZE(BO) + movapd %xmm11, -4 * SIZE(BO) + movapd %xmm3, -2 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 16) * SIZE(BO) + movapd %xmm12, 0 * SIZE(BO) + movapd %xmm4, 2 * SIZE(BO) + movapd %xmm13, 4 * SIZE(BO) + movapd %xmm5, 6 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 24) * SIZE(BO) + movapd %xmm14, 8 * SIZE(BO) + movapd %xmm6, 10 * SIZE(BO) + movapd %xmm15, 12 * SIZE(BO) + movapd %xmm7, 14 * SIZE(BO) + + subq $-16 * SIZE, B + subq $-32 * SIZE, BO + decq %rax + BRANCH + jne .L02 + ALIGN_4 + +.L05: + movq K, %rax + andq $3, %rax + BRANCH + BRANCH + jle .L10 + ALIGN_4 + +.L06: + movapd -16 * SIZE(B), %xmm0 + movapd -14 * SIZE(B), %xmm1 + + movddup %xmm0, %xmm8 + unpckhpd %xmm0, %xmm0 + movddup %xmm1, %xmm9 + unpckhpd %xmm1, %xmm1 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm0, -14 * SIZE(BO) + movapd %xmm9, -12 * SIZE(BO) + movapd %xmm1, -10 * SIZE(BO) + + addq $4 * SIZE, B + addq $8 * SIZE, BO + decq %rax + BRANCH + jne .L06 + ALIGN_4 + +.L10: + leaq (PREFETCH_R + 0) * SIZE(B), BB + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 20 * SIZE + BUFFER, BO +#else + leaq 20 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -20 * SIZE(BO), %xmm6 + movaps -18 * SIZE(BO), %xmm7 + + prefetcht2 0 * SIZE(BB) + + pxor %xmm2, %xmm2 + prefetcht0 7 * SIZE(CO1) + pxor %xmm3, %xmm3 + pxor %xmm4, %xmm4 + prefetcht0 7 * SIZE(CO2) + pxor %xmm5, %xmm5 + + movapd %xmm2, %xmm8 + movapd %xmm2, %xmm9 + movapd %xmm2, %xmm10 + prefetcht0 7 * SIZE(CO1, LDC, 2) + movapd %xmm2, %xmm11 + + movapd %xmm2, %xmm12 + movapd %xmm2, %xmm13 + prefetcht0 7 * SIZE(CO2, LDC, 2) + movapd %xmm2, %xmm14 + movapd %xmm2, %xmm15 + + subq $-16 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_4 + +.L12: + PADDING; + addpd %xmm2, %xmm10 + movaps -16 * SIZE(BO), %xmm2 + PADDING; + addpd %xmm3, %xmm14 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + addpd %xmm4, %xmm11 + movaps -14 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm5 + + addpd %xmm6, %xmm8 + movaps -12 * SIZE(BO), %xmm6 + addpd %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm7, %xmm9 + movaps -10 * SIZE(BO), %xmm7 + addpd %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm10 + movaps -8 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm14 + movaps -10 * SIZE(AO), %xmm1 + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + addpd %xmm4, %xmm11 + movaps -6 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm5 + + addpd %xmm6, %xmm8 + movaps -4 * SIZE(BO), %xmm6 + addpd %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm7, %xmm9 + movaps -2 * SIZE(BO), %xmm7 + addpd %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm10 + movaps 0 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm14 + movaps -6 * SIZE(AO), %xmm1 + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + addpd %xmm4, %xmm11 + movaps 2 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + movaps %xmm7, %xmm5 + mulpd %xmm1, %xmm5 + mulpd %xmm0, %xmm7 + + addpd %xmm6, %xmm8 + movaps 4 * SIZE(BO), %xmm6 + addpd %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm7, %xmm9 + movaps 6 * SIZE(BO), %xmm7 + addpd %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm10 + movaps 8 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm14 + movaps -2 * SIZE(AO), %xmm1 + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + addpd %xmm4, %xmm11 + movaps 10 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm5 + + addpd %xmm6, %xmm8 + movaps 12 * SIZE(BO), %xmm6 + addpd %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + subq $-16 * SIZE, AO + + addpd %xmm7, %xmm9 + movaps 14 * SIZE(BO), %xmm7 + addpd %xmm5, %xmm13 + subq $-32 * SIZE, BO + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movaps -14 * SIZE(AO), %xmm1 + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_4 + +.L15: + prefetcht2 -8 * SIZE(BB) + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + addpd %xmm2, %xmm10 + movaps -16 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + addpd %xmm4, %xmm11 + movaps -14 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm5 + + addpd %xmm6, %xmm8 + movaps -12 * SIZE(BO), %xmm6 + addpd %xmm3, %xmm12 + addq $4 * SIZE, AO + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm7, %xmm9 + movaps -10 * SIZE(BO), %xmm7 + addpd %xmm5, %xmm13 + addq $8 * SIZE, BO + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movaps -14 * SIZE(AO), %xmm1 + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L18: + movapd ALPHA, %xmm7 + + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm14 + addpd %xmm4, %xmm11 + addpd %xmm5, %xmm15 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movsd 4 * SIZE(CO1), %xmm2 + movhpd 5 * SIZE(CO1), %xmm2 + movsd 6 * SIZE(CO1), %xmm3 + movhpd 7 * SIZE(CO1), %xmm3 + + movddup %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + movddup %xmm12, %xmm5 + unpckhpd %xmm12, %xmm12 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm12 + + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm12, %xmm3 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 4 * SIZE(CO1) + movhpd %xmm2, 5 * SIZE(CO1) + movsd %xmm3, 6 * SIZE(CO1) + movhpd %xmm3, 7 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhpd 3 * SIZE(CO2), %xmm1 + + movsd 4 * SIZE(CO2), %xmm2 + movhpd 5 * SIZE(CO2), %xmm2 + movsd 6 * SIZE(CO2), %xmm3 + movhpd 7 * SIZE(CO2), %xmm3 + + movddup %xmm9, %xmm4 + unpckhpd %xmm9, %xmm9 + movddup %xmm13, %xmm5 + unpckhpd %xmm13, %xmm13 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm13 + + addpd %xmm4, %xmm0 + addpd %xmm9, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm13, %xmm3 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + movsd %xmm1, 2 * SIZE(CO2) + movhpd %xmm1, 3 * SIZE(CO2) + + movsd %xmm2, 4 * SIZE(CO2) + movhpd %xmm2, 5 * SIZE(CO2) + movsd %xmm3, 6 * SIZE(CO2) + movhpd %xmm3, 7 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 + movsd 2 * SIZE(CO1, LDC, 2), %xmm1 + movhpd 3 * SIZE(CO1, LDC, 2), %xmm1 + + movsd 4 * SIZE(CO1, LDC, 2), %xmm2 + movhpd 5 * SIZE(CO1, LDC, 2), %xmm2 + movsd 6 * SIZE(CO1, LDC, 2), %xmm3 + movhpd 7 * SIZE(CO1, LDC, 2), %xmm3 + + movddup %xmm10, %xmm4 + unpckhpd %xmm10, %xmm10 + movddup %xmm14, %xmm5 + unpckhpd %xmm14, %xmm14 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm10 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm14 + + addpd %xmm4, %xmm0 + addpd %xmm10, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm14, %xmm3 + + movsd %xmm0, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) + movsd %xmm1, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm1, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm2, 4 * SIZE(CO1, LDC, 2) + movhpd %xmm2, 5 * SIZE(CO1, LDC, 2) + movsd %xmm3, 6 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 7 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 + movsd 2 * SIZE(CO2, LDC, 2), %xmm1 + movhpd 3 * SIZE(CO2, LDC, 2), %xmm1 + + movsd 4 * SIZE(CO2, LDC, 2), %xmm2 + movhpd 5 * SIZE(CO2, LDC, 2), %xmm2 + movsd 6 * SIZE(CO2, LDC, 2), %xmm3 + movhpd 7 * SIZE(CO2, LDC, 2), %xmm3 + + movddup %xmm11, %xmm4 + unpckhpd %xmm11, %xmm11 + movddup %xmm15, %xmm5 + unpckhpd %xmm15, %xmm15 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm11 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm15 + + addpd %xmm4, %xmm0 + addpd %xmm11, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm15, %xmm3 + + movsd %xmm0, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) + movsd %xmm1, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm1, 3 * SIZE(CO2, LDC, 2) + + movsd %xmm2, 4 * SIZE(CO2, LDC, 2) + movhpd %xmm2, 5 * SIZE(CO2, LDC, 2) + movsd %xmm3, 6 * SIZE(CO2, LDC, 2) + movhpd %xmm3, 7 * SIZE(CO2, LDC, 2) + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + jle .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + movapd %xmm8, %xmm2 + movapd %xmm9, %xmm3 + movapd %xmm10, %xmm4 + movapd %xmm11, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_4 + +.L21: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm2, %xmm8 + movapd -16 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd -14 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm4, %xmm10 + movapd -12 * SIZE(BO), %xmm4 + mulpd %xmm0, %xmm4 + addpd %xmm5, %xmm11 + movapd -10 * SIZE(BO), %xmm5 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movapd -8 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd -6 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm4, %xmm10 + movapd -4 * SIZE(BO), %xmm4 + mulpd %xmm0, %xmm4 + addpd %xmm5, %xmm11 + movapd -2 * SIZE(BO), %xmm5 + mulpd %xmm0, %xmm5 + movapd -12 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movapd 0 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd 2 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm4, %xmm10 + movapd 4 * SIZE(BO), %xmm4 + mulpd %xmm0, %xmm4 + addpd %xmm5, %xmm11 + movapd 6 * SIZE(BO), %xmm5 + mulpd %xmm0, %xmm5 + movapd -10 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movapd 8 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd 10 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm4, %xmm10 + movapd 12 * SIZE(BO), %xmm4 + mulpd %xmm0, %xmm4 + addpd %xmm5, %xmm11 + movapd 14 * SIZE(BO), %xmm5 + mulpd %xmm0, %xmm5 + movapd -8 * SIZE(AO), %xmm0 + + subq $ -8 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + BRANCH + jg .L21 + ALIGN_4 + +.L25: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + addpd %xmm2, %xmm8 + movapd -16 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd -14 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm4, %xmm10 + movapd -12 * SIZE(BO), %xmm4 + mulpd %xmm0, %xmm4 + addpd %xmm5, %xmm11 + movapd -10 * SIZE(BO), %xmm5 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movddup %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhpd 3 * SIZE(CO2), %xmm1 + + movddup %xmm9, %xmm4 + unpckhpd %xmm9, %xmm9 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm9 + addpd %xmm4, %xmm0 + addpd %xmm9, %xmm1 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + movsd %xmm1, 2 * SIZE(CO2) + movhpd %xmm1, 3 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 + movsd 2 * SIZE(CO1, LDC, 2), %xmm1 + movhpd 3 * SIZE(CO1, LDC, 2), %xmm1 + + movddup %xmm10, %xmm4 + unpckhpd %xmm10, %xmm10 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm10 + addpd %xmm4, %xmm0 + addpd %xmm10, %xmm1 + + movsd %xmm0, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) + movsd %xmm1, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm1, 3 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 + movsd 2 * SIZE(CO2, LDC, 2), %xmm1 + movhpd 3 * SIZE(CO2, LDC, 2), %xmm1 + + movddup %xmm11, %xmm4 + unpckhpd %xmm11, %xmm11 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm11 + addpd %xmm4, %xmm0 + addpd %xmm11, %xmm1 + + movsd %xmm0, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) + movsd %xmm1, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm1, 3 * SIZE(CO2, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $1, M + BRANCH + jle .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + movsd -16 * SIZE(AO), %xmm0 + + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + movapd %xmm8, %xmm2 + movapd %xmm9, %xmm3 + movapd %xmm10, %xmm4 + movapd %xmm11, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_4 + +.L31: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addsd %xmm2, %xmm8 + movsd -16 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd -14 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm4, %xmm10 + movsd -12 * SIZE(BO), %xmm4 + mulsd %xmm0, %xmm4 + addsd %xmm5, %xmm11 + movsd -10 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + movsd -15 * SIZE(AO), %xmm0 + + addsd %xmm2, %xmm8 + movsd -8 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd -6 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm4, %xmm10 + movsd -4 * SIZE(BO), %xmm4 + mulsd %xmm0, %xmm4 + addsd %xmm5, %xmm11 + movsd -2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + movsd -14 * SIZE(AO), %xmm0 + + addsd %xmm2, %xmm8 + movsd 0 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd 2 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm4, %xmm10 + movsd 4 * SIZE(BO), %xmm4 + mulsd %xmm0, %xmm4 + addsd %xmm5, %xmm11 + movsd 6 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + movsd -13 * SIZE(AO), %xmm0 + + addsd %xmm2, %xmm8 + movsd 8 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd 10 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm4, %xmm10 + movsd 12 * SIZE(BO), %xmm4 + mulsd %xmm0, %xmm4 + addsd %xmm5, %xmm11 + movsd 14 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + movsd -12 * SIZE(AO), %xmm0 + + subq $ -4 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + BRANCH + jg .L31 + ALIGN_4 + +.L35: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + addsd %xmm2, %xmm8 + movsd -16 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd -14 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm4, %xmm10 + movsd -12 * SIZE(BO), %xmm4 + mulsd %xmm0, %xmm4 + addsd %xmm5, %xmm11 + movsd -10 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + movsd -15 * SIZE(AO), %xmm0 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + + movddup %xmm9, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 + + movddup %xmm10, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 + + movddup %xmm11, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leaq (C, LDC, 4), C + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L40: + testq $2, N + BRANCH + jle .L80 + ALIGN_4 + +.L41: +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $3, %rax + jle .L43 + + addq %rax, %rax + ALIGN_4 + +.L42: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + movddup -14 * SIZE(B), %xmm10 + movddup -13 * SIZE(B), %xmm11 + movddup -12 * SIZE(B), %xmm12 + movddup -11 * SIZE(B), %xmm13 + movddup -10 * SIZE(B), %xmm14 + movddup -9 * SIZE(B), %xmm15 + + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + movapd %xmm10, 4 * SIZE(BO) + movapd %xmm11, 6 * SIZE(BO) + movapd %xmm12, 8 * SIZE(BO) + movapd %xmm13, 10 * SIZE(BO) + movapd %xmm14, 12 * SIZE(BO) + movapd %xmm15, 14 * SIZE(BO) + + addq $8 * SIZE, B + addq $16 * SIZE, BO + + subq $1, %rax + jne .L42 + ALIGN_4 + +.L43: + movq K, %rax + andq $7, %rax + BRANCH + jle .L45 + ALIGN_4 + +.L44: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + + addq $2 * SIZE, B + addq $4 * SIZE, BO + subq $1, %rax + jne .L44 + ALIGN_4 + +.L45: + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L50: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + + prefetcht0 3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht0 3 * SIZE(CO2) + pxor %xmm13, %xmm13 + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd %xmm8, %xmm2 + movapd %xmm8, %xmm3 + movapd %xmm8, %xmm4 + movapd %xmm8, %xmm5 + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L55 + ALIGN_4 + +.L51: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm2, %xmm8 + movapd -16 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + movapd -14 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movapd -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm8 + movapd -12 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + movapd -10 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movapd -6 * SIZE(AO), %xmm1 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + addpd %xmm2, %xmm8 + movapd -8 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + movapd -6 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movapd -2 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm8 + movapd -4 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + movapd -2 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movapd 2 * SIZE(AO), %xmm1 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jg .L51 + ALIGN_4 + +.L55: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L58 + ALIGN_4 + +.L56: + addpd %xmm2, %xmm8 + movapd -16 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm4, %xmm9 + movapd -14 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movapd -10 * SIZE(AO), %xmm1 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movsd 4 * SIZE(CO1), %xmm2 + movhpd 5 * SIZE(CO1), %xmm2 + movsd 6 * SIZE(CO1), %xmm3 + movhpd 7 * SIZE(CO1), %xmm3 + + movddup %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + movddup %xmm12, %xmm5 + unpckhpd %xmm12, %xmm12 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm12 + + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm12, %xmm3 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 4 * SIZE(CO1) + movhpd %xmm2, 5 * SIZE(CO1) + movsd %xmm3, 6 * SIZE(CO1) + movhpd %xmm3, 7 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhpd 3 * SIZE(CO2), %xmm1 + + movsd 4 * SIZE(CO2), %xmm2 + movhpd 5 * SIZE(CO2), %xmm2 + movsd 6 * SIZE(CO2), %xmm3 + movhpd 7 * SIZE(CO2), %xmm3 + + movddup %xmm9, %xmm4 + unpckhpd %xmm9, %xmm9 + movddup %xmm13, %xmm5 + unpckhpd %xmm13, %xmm13 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm13 + + addpd %xmm4, %xmm0 + addpd %xmm9, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm13, %xmm3 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + movsd %xmm1, 2 * SIZE(CO2) + movhpd %xmm1, 3 * SIZE(CO2) + + movsd %xmm2, 4 * SIZE(CO2) + movhpd %xmm2, 5 * SIZE(CO2) + movsd %xmm3, 6 * SIZE(CO2) + movhpd %xmm3, 7 * SIZE(CO2) + + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 + subq $1, I + jg .L50 + ALIGN_4 + +.L60: + testq $2, M + jle .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm9, %xmm9 + movapd -14 * SIZE(AO), %xmm1 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + movapd %xmm8, %xmm2 + movapd %xmm8, %xmm3 + movapd %xmm8, %xmm4 + movapd %xmm8, %xmm5 + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L65 + ALIGN_4 + +.L61: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm2, %xmm8 + movapd -16 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd -14 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + movapd -12 * SIZE(AO), %xmm0 + addpd %xmm4, %xmm10 + movapd -12 * SIZE(BO), %xmm4 + mulpd %xmm1, %xmm4 + addpd %xmm5, %xmm11 + movapd -10 * SIZE(BO), %xmm5 + mulpd %xmm1, %xmm5 + movapd -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm8 + movapd -8 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd -6 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + movapd -8 * SIZE(AO), %xmm0 + addpd %xmm4, %xmm10 + movapd -4 * SIZE(BO), %xmm4 + mulpd %xmm1, %xmm4 + addpd %xmm5, %xmm11 + movapd -2 * SIZE(BO), %xmm5 + mulpd %xmm1, %xmm5 + movapd -6 * SIZE(AO), %xmm1 + + subq $ -8 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jg .L61 + ALIGN_4 + +.L65: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L68 + ALIGN_4 + +.L66: + addpd %xmm2, %xmm8 + movapd -16 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd -14 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + movapd -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L66 + ALIGN_4 + +.L68: + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movddup %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhpd 3 * SIZE(CO2), %xmm1 + + movddup %xmm9, %xmm4 + unpckhpd %xmm9, %xmm9 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm9 + addpd %xmm4, %xmm0 + addpd %xmm9, %xmm1 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + movsd %xmm1, 2 * SIZE(CO2) + movhpd %xmm1, 3 * SIZE(CO2) + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L70: + testq $1, M + jle .L79 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movsd -15 * SIZE(AO), %xmm1 + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + movapd %xmm8, %xmm2 + movapd %xmm8, %xmm3 + movapd %xmm8, %xmm4 + movapd %xmm8, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L75 + ALIGN_4 + +.L71: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm2, %xmm8 + movsd -16 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd -14 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + movsd -14 * SIZE(AO), %xmm0 + addsd %xmm4, %xmm10 + movsd -12 * SIZE(BO), %xmm4 + mulsd %xmm1, %xmm4 + addsd %xmm5, %xmm11 + movsd -10 * SIZE(BO), %xmm5 + mulsd %xmm1, %xmm5 + movsd -13 * SIZE(AO), %xmm1 + + addsd %xmm2, %xmm8 + movsd -8 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd -6 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + movsd -12 * SIZE(AO), %xmm0 + addsd %xmm4, %xmm10 + movsd -4 * SIZE(BO), %xmm4 + mulsd %xmm1, %xmm4 + addsd %xmm5, %xmm11 + movsd -2 * SIZE(BO), %xmm5 + mulsd %xmm1, %xmm5 + movsd -11 * SIZE(AO), %xmm1 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jg .L71 + ALIGN_4 + +.L75: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L78 + ALIGN_4 + +.L76: + addsd %xmm2, %xmm8 + movsd -16 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd -14 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + movsd -15 * SIZE(AO), %xmm0 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L76 + ALIGN_4 + +.L78: + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + addsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + + movddup %xmm9, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leaq (C, LDC, 2), C + ALIGN_4 + +.L80: + testq $1, N + BRANCH + jle .L999 + ALIGN_4 + +.L81: +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $4, %rax + jle .L83 + + addq %rax, %rax + ALIGN_4 + +.L82: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + movddup -14 * SIZE(B), %xmm10 + movddup -13 * SIZE(B), %xmm11 + movddup -12 * SIZE(B), %xmm12 + movddup -11 * SIZE(B), %xmm13 + movddup -10 * SIZE(B), %xmm14 + movddup -9 * SIZE(B), %xmm15 + + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + movapd %xmm10, 4 * SIZE(BO) + movapd %xmm11, 6 * SIZE(BO) + movapd %xmm12, 8 * SIZE(BO) + movapd %xmm13, 10 * SIZE(BO) + movapd %xmm14, 12 * SIZE(BO) + movapd %xmm15, 14 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-16 * SIZE, BO + subq $1, %rax + jne .L82 + ALIGN_4 + +.L83: + movq K, %rax + andq $15, %rax + BRANCH + jle .L85 + ALIGN_4 + +.L84: + movddup -16 * SIZE(B), %xmm8 + + movapd %xmm8, 0 * SIZE(BO) + + addq $1 * SIZE, B + addq $2 * SIZE, BO + subq $1, %rax + jne .L84 + ALIGN_4 + +.L85: + movq C, CO1 + movq A, AO + + movq M, I + sarq $2, I + jle .L100 + ALIGN_4 + +.L90: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + movapd -16 * SIZE(BO), %xmm4 + pxor %xmm9, %xmm9 + movapd -14 * SIZE(BO), %xmm5 + pxor %xmm12, %xmm12 + movapd -12 * SIZE(BO), %xmm6 + pxor %xmm13, %xmm13 + movapd -10 * SIZE(BO), %xmm7 + + movapd %xmm8, %xmm0 + prefetcht0 3 * SIZE(CO1) + movapd %xmm8, %xmm1 + movapd %xmm8, %xmm2 + movapd %xmm8, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L95 + ALIGN_4 + +.L91: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm0, %xmm8 + movapd -16 * SIZE(AO), %xmm0 + mulpd %xmm4, %xmm0 + addpd %xmm1, %xmm12 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm4, %xmm1 + movapd -8 * SIZE(BO), %xmm4 + addpd %xmm2, %xmm9 + movapd -12 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + addpd %xmm3, %xmm13 + movapd -10 * SIZE(AO), %xmm3 + mulpd %xmm5, %xmm3 + movapd -6 * SIZE(BO), %xmm5 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + addpd %xmm0, %xmm8 + movapd -8 * SIZE(AO), %xmm0 + mulpd %xmm6, %xmm0 + addpd %xmm1, %xmm12 + movapd -6 * SIZE(AO), %xmm1 + mulpd %xmm6, %xmm1 + movapd -4 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm9 + movapd -4 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + addpd %xmm3, %xmm13 + movapd -2 * SIZE(AO), %xmm3 + mulpd %xmm7, %xmm3 + movapd -2 * SIZE(BO), %xmm7 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + jg .L91 + ALIGN_4 + +.L95: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L98 + ALIGN_4 + +.L96: + addpd %xmm0, %xmm8 + movapd -16 * SIZE(AO), %xmm0 + mulpd %xmm4, %xmm0 + addpd %xmm1, %xmm12 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm4, %xmm1 + movapd -14 * SIZE(BO), %xmm4 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L96 + ALIGN_4 + +.L98: + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm12 + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm13 + + addpd %xmm9, %xmm8 + addpd %xmm13, %xmm12 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movsd 4 * SIZE(CO1), %xmm2 + movhpd 5 * SIZE(CO1), %xmm2 + movsd 6 * SIZE(CO1), %xmm3 + movhpd 7 * SIZE(CO1), %xmm3 + + movddup %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + movddup %xmm12, %xmm5 + unpckhpd %xmm12, %xmm12 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm12 + + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm12, %xmm3 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 4 * SIZE(CO1) + movhpd %xmm2, 5 * SIZE(CO1) + movsd %xmm3, 6 * SIZE(CO1) + movhpd %xmm3, 7 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 4 + + subq $1, I + jg .L90 + ALIGN_4 + +.L100: + testq $2, M + jle .L110 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + movapd -16 * SIZE(BO), %xmm4 + pxor %xmm9, %xmm9 + movapd -14 * SIZE(BO), %xmm5 + pxor %xmm10, %xmm10 + movapd -12 * SIZE(BO), %xmm6 + pxor %xmm11, %xmm11 + movapd -10 * SIZE(BO), %xmm7 + + movapd %xmm8, %xmm0 + movapd %xmm8, %xmm1 + movapd %xmm8, %xmm2 + movapd %xmm8, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L105 + ALIGN_4 + +.L101: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm0, %xmm8 + movapd -16 * SIZE(AO), %xmm0 + mulpd %xmm4, %xmm0 + movapd -8 * SIZE(BO), %xmm4 + addpd %xmm1, %xmm9 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm5, %xmm1 + movapd -6 * SIZE(BO), %xmm5 + addpd %xmm2, %xmm10 + movapd -12 * SIZE(AO), %xmm2 + mulpd %xmm6, %xmm2 + movapd -4 * SIZE(BO), %xmm6 + addpd %xmm3, %xmm11 + movapd -10 * SIZE(AO), %xmm3 + mulpd %xmm7, %xmm3 + movapd -2 * SIZE(BO), %xmm7 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + jg .L101 + ALIGN_4 + +.L105: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L108 + ALIGN_4 + +.L106: + addpd %xmm0, %xmm8 + movapd -16 * SIZE(AO), %xmm0 + mulpd %xmm4, %xmm0 + movapd -14 * SIZE(BO), %xmm4 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L106 + ALIGN_4 + +.L108: + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm9 + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + addpd %xmm9, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movddup %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + addq $4 * SIZE, CO1 + ALIGN_4 + +.L110: + testq $1, M + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + movsd -16 * SIZE(BO), %xmm4 + pxor %xmm9, %xmm9 + movsd -14 * SIZE(BO), %xmm5 + pxor %xmm10, %xmm10 + movsd -12 * SIZE(BO), %xmm6 + pxor %xmm11, %xmm11 + movsd -10 * SIZE(BO), %xmm7 + + movapd %xmm8, %xmm0 + movapd %xmm8, %xmm1 + movapd %xmm8, %xmm2 + movapd %xmm8, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L115 + ALIGN_4 + +.L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm0, %xmm8 + movsd -16 * SIZE(AO), %xmm0 + mulpd %xmm4, %xmm0 + movsd -8 * SIZE(BO), %xmm4 + addpd %xmm1, %xmm9 + movsd -15 * SIZE(AO), %xmm1 + mulpd %xmm5, %xmm1 + movsd -6 * SIZE(BO), %xmm5 + addpd %xmm2, %xmm10 + movsd -14 * SIZE(AO), %xmm2 + mulpd %xmm6, %xmm2 + movsd -4 * SIZE(BO), %xmm6 + addpd %xmm3, %xmm11 + movsd -13 * SIZE(AO), %xmm3 + mulpd %xmm7, %xmm3 + movsd -2 * SIZE(BO), %xmm7 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + jg .L111 + ALIGN_4 + +.L115: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L118 + ALIGN_4 + +.L116: + addsd %xmm0, %xmm8 + movsd -16 * SIZE(AO), %xmm0 + mulsd %xmm4, %xmm0 + movsd -14 * SIZE(BO), %xmm4 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L116 + ALIGN_4 + +.L118: + addsd %xmm0, %xmm8 + addsd %xmm1, %xmm9 + addsd %xmm2, %xmm10 + addsd %xmm3, %xmm11 + + addsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + + addsd %xmm9, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %r15, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm3m_kernel_4x4_penryn.S b/kernel/x86_64/zgemm3m_kernel_4x4_penryn.S new file mode 100644 index 0000000..7dd2c91 --- /dev/null +++ b/kernel/x86_64/zgemm3m_kernel_4x4_penryn.S @@ -0,0 +1,2131 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#define PREA %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define J 64(%rsp) +#define OFFSET 72(%rsp) +#define KK 80(%rsp) +#define KKK 88(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define ALPHA_R 224(%rsp) +#define ALPHA_I 232(%rsp) +#define J 240(%rsp) +#define OFFSET 248(%rsp) +#define KK 256(%rsp) +#define KKK 264(%rsp) + +#endif + +#ifdef NANO +#define PREFETCHSIZE (8 * 2 + 4) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + +#ifndef PREFETCH +#define PREFETCH prefetcht0 +#endif + +#ifndef PREFETCHW +#define PREFETCHW prefetcht2 +#endif + +#ifndef PREFETCHB +#define PREFETCHB prefetcht0 +#endif + +#ifndef PREFETCHSIZE +#define PREFETCHSIZE (8 * 17 + 4) +#endif + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + +#endif + + movlps %xmm0, ALPHA_R + movlps %xmm1, ALPHA_I + + subq $-16 * SIZE, A + subq $-17 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + salq $ZBASE_SHIFT, LDC + + movq N, J + sarq $2, J + NOBRANCH + jle .L40 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + + movq K, %rax + salq $BASE_SHIFT + 2, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + xorpd %xmm3, %xmm3 + movaps -14 * SIZE(AO), %xmm1 + xorpd %xmm4, %xmm4 + movaps -17 * SIZE(BO), %xmm2 + + PREFETCHB -16 * SIZE(BB) + + xorpd %xmm5, %xmm5 + xorpd %xmm6, %xmm6 + + PREFETCHW 3 * SIZE(CO1) + movaps %xmm4, %xmm8 + movaps %xmm4, %xmm9 + PREFETCHW 7 * SIZE(CO2) + movaps %xmm4, %xmm10 + movaps %xmm4, %xmm11 + + PREFETCHW 3 * SIZE(CO1, LDC, 2) + movaps %xmm4, %xmm12 + movaps %xmm4, %xmm13 + PREFETCHW 7 * SIZE(CO2, LDC, 2) + movapd %xmm4, %xmm14 + movapd %xmm4, %xmm15 + + subq $-12 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm3, %xmm11 + movaps -15 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -13 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps -11 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -9 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps -7 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movapd %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movapd %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + PADDING + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + addpd %xmm2, %xmm9 + movaps -5 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + subq $-16 * SIZE, AO + movaps -3 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -1 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + subq $-16 * SIZE, BO + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -14 * SIZE(AO), %xmm1 + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: + PREFETCHB -8 * SIZE(BB) + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addpd %xmm3, %xmm11 + movaps -15 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -13 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L18: + movups ALPHA_R, %xmm7 + + addpd %xmm3, %xmm11 + addpd %xmm4, %xmm15 + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + + movaps %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + movsd %xmm11, %xmm10 + movsd %xmm0, %xmm11 + + movaps %xmm12, %xmm0 + movsd %xmm13, %xmm12 + movsd %xmm0, %xmm13 + + movaps %xmm14, %xmm0 + movsd %xmm15, %xmm14 + movsd %xmm0, %xmm15 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhps 3 * SIZE(CO1), %xmm1 + + movsd 4 * SIZE(CO1), %xmm2 + movhps 5 * SIZE(CO1), %xmm2 + movsd 6 * SIZE(CO1), %xmm3 + movhps 7 * SIZE(CO1), %xmm3 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movddup %xmm12, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm12, %xmm12 + mulpd %xmm7, %xmm12 + addpd %xmm12, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 2 * SIZE(CO1) + movhps %xmm1, 3 * SIZE(CO1) + + movlps %xmm2, 4 * SIZE(CO1) + movhps %xmm2, 5 * SIZE(CO1) + movlps %xmm3, 6 * SIZE(CO1) + movhps %xmm3, 7 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhps 3 * SIZE(CO2), %xmm1 + + movsd 4 * SIZE(CO2), %xmm2 + movhps 5 * SIZE(CO2), %xmm2 + movsd 6 * SIZE(CO2), %xmm3 + movhps 7 * SIZE(CO2), %xmm3 + + movddup %xmm9, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm9, %xmm9 + mulpd %xmm7, %xmm9 + addpd %xmm9, %xmm1 + + movddup %xmm13, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm13, %xmm13 + mulpd %xmm7, %xmm13 + addpd %xmm13, %xmm3 + + movlps %xmm0, 0 * SIZE(CO2) + movhps %xmm0, 1 * SIZE(CO2) + movlps %xmm1, 2 * SIZE(CO2) + movhps %xmm1, 3 * SIZE(CO2) + + movlps %xmm2, 4 * SIZE(CO2) + movhps %xmm2, 5 * SIZE(CO2) + movlps %xmm3, 6 * SIZE(CO2) + movhps %xmm3, 7 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhps 1 * SIZE(CO1, LDC, 2), %xmm0 + movsd 2 * SIZE(CO1, LDC, 2), %xmm1 + movhps 3 * SIZE(CO1, LDC, 2), %xmm1 + + movsd 4 * SIZE(CO1, LDC, 2), %xmm2 + movhps 5 * SIZE(CO1, LDC, 2), %xmm2 + movsd 6 * SIZE(CO1, LDC, 2), %xmm3 + movhps 7 * SIZE(CO1, LDC, 2), %xmm3 + + movddup %xmm10, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm10, %xmm10 + mulpd %xmm7, %xmm10 + addpd %xmm10, %xmm1 + + movddup %xmm14, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm14, %xmm14 + mulpd %xmm7, %xmm14 + addpd %xmm14, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 1 * SIZE(CO1, LDC, 2) + movlps %xmm1, 2 * SIZE(CO1, LDC, 2) + movhps %xmm1, 3 * SIZE(CO1, LDC, 2) + + movlps %xmm2, 4 * SIZE(CO1, LDC, 2) + movhps %xmm2, 5 * SIZE(CO1, LDC, 2) + movlps %xmm3, 6 * SIZE(CO1, LDC, 2) + movhps %xmm3, 7 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhps 1 * SIZE(CO2, LDC, 2), %xmm0 + movsd 2 * SIZE(CO2, LDC, 2), %xmm1 + movhps 3 * SIZE(CO2, LDC, 2), %xmm1 + + movsd 4 * SIZE(CO2, LDC, 2), %xmm2 + movhps 5 * SIZE(CO2, LDC, 2), %xmm2 + movsd 6 * SIZE(CO2, LDC, 2), %xmm3 + movhps 7 * SIZE(CO2, LDC, 2), %xmm3 + + movddup %xmm11, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm11, %xmm11 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm1 + + movddup %xmm15, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm15, %xmm15 + mulpd %xmm7, %xmm15 + addpd %xmm15, %xmm3 + + movlps %xmm0, 0 * SIZE(CO2, LDC, 2) + movhps %xmm0, 1 * SIZE(CO2, LDC, 2) + movlps %xmm1, 2 * SIZE(CO2, LDC, 2) + movhps %xmm1, 3 * SIZE(CO2, LDC, 2) + + movlps %xmm2, 4 * SIZE(CO2, LDC, 2) + movhps %xmm2, 5 * SIZE(CO2, LDC, 2) + movlps %xmm3, 6 * SIZE(CO2, LDC, 2) + movhps %xmm3, 7 * SIZE(CO2, LDC, 2) + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + jle .L30 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -17 * SIZE(BO), %xmm2 + movaps -15 * SIZE(BO), %xmm3 + + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + + movaps %xmm3, %xmm8 + movaps %xmm3, %xmm9 + movaps %xmm3, %xmm10 + movaps %xmm3, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_4 + +.L22: + addpd %xmm3, %xmm11 + movaps -15 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -13 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -11 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -9 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -7 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -5 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -3 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + subq $ -8 * SIZE, AO + + addpd %xmm2, %xmm9 + movaps -1 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + addpd %xmm3, %xmm11 + movaps -15 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -13 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: + movups ALPHA_R, %xmm7 + + addpd %xmm3, %xmm11 + addpd %xmm5, %xmm10 + + movaps %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + movsd %xmm11, %xmm10 + movsd %xmm0, %xmm11 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhps 3 * SIZE(CO1), %xmm1 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 2 * SIZE(CO1) + movhps %xmm1, 3 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhps 3 * SIZE(CO2), %xmm1 + + movddup %xmm9, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm9, %xmm9 + mulpd %xmm7, %xmm9 + addpd %xmm9, %xmm1 + + movlps %xmm0, 0 * SIZE(CO2) + movhps %xmm0, 1 * SIZE(CO2) + movlps %xmm1, 2 * SIZE(CO2) + movhps %xmm1, 3 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhps 1 * SIZE(CO1, LDC, 2), %xmm0 + movsd 2 * SIZE(CO1, LDC, 2), %xmm1 + movhps 3 * SIZE(CO1, LDC, 2), %xmm1 + + movddup %xmm10, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm10, %xmm10 + mulpd %xmm7, %xmm10 + addpd %xmm10, %xmm1 + + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 1 * SIZE(CO1, LDC, 2) + movlps %xmm1, 2 * SIZE(CO1, LDC, 2) + movhps %xmm1, 3 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhps 1 * SIZE(CO2, LDC, 2), %xmm0 + movsd 2 * SIZE(CO2, LDC, 2), %xmm1 + movhps 3 * SIZE(CO2, LDC, 2), %xmm1 + + movddup %xmm11, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm11, %xmm11 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm1 + + movlps %xmm0, 0 * SIZE(CO2, LDC, 2) + movhps %xmm0, 1 * SIZE(CO2, LDC, 2) + movlps %xmm1, 2 * SIZE(CO2, LDC, 2) + movhps %xmm1, 3 * SIZE(CO2, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $1, M + BRANCH + jle .L39 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + addq %rax, AO + leaq (BO, %rax, 4), BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movaps -17 * SIZE(BO), %xmm2 + movaps -15 * SIZE(BO), %xmm3 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -15 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -13 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -11 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm10 + movaps -9 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm11 + movaps -7 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -13 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -5 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -3 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -12 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm10 + movaps -1 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm11 + movaps 1 * SIZE(BO), %xmm3 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -15 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -13 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -11 * SIZE(BO), %xmm3 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: + movups ALPHA_R, %xmm7 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm1 + movhps 1 * SIZE(CO2), %xmm1 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 1 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhps 1 * SIZE(CO1, LDC, 2), %xmm0 + movsd 0 * SIZE(CO2, LDC, 2), %xmm1 + movhps 1 * SIZE(CO2, LDC, 2), %xmm1 + + movddup %xmm9, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm9, %xmm9 + mulpd %xmm7, %xmm9 + addpd %xmm9, %xmm1 + + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 1 * SIZE(CO1, LDC, 2) + movlps %xmm1, 0 * SIZE(CO2, LDC, 2) + movhps %xmm1, 1 * SIZE(CO2, LDC, 2) + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + movq BO, B + + leaq (C, LDC, 4), C + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L40: + testq $2, N + BRANCH + jle .L80 + + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + PREFETCHB -16 * SIZE(BB) + subq $-4 * SIZE, BB + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -17 * SIZE(BO), %xmm2 + + PREFETCHW 3 * SIZE(CO1) + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + PREFETCHW 3 * SIZE(CO2) + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_4 + +.L52: + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -15 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -13 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -11 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -9 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -15 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_4 + +.L58: + movups ALPHA_R, %xmm7 + + movaps %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + movaps %xmm12, %xmm0 + movsd %xmm13, %xmm12 + movsd %xmm0, %xmm13 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhps 3 * SIZE(CO1), %xmm1 + + movsd 4 * SIZE(CO1), %xmm2 + movhps 5 * SIZE(CO1), %xmm2 + movsd 6 * SIZE(CO1), %xmm3 + movhps 7 * SIZE(CO1), %xmm3 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movddup %xmm12, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm12, %xmm12 + mulpd %xmm7, %xmm12 + addpd %xmm12, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 2 * SIZE(CO1) + movhps %xmm1, 3 * SIZE(CO1) + + movlps %xmm2, 4 * SIZE(CO1) + movhps %xmm2, 5 * SIZE(CO1) + movlps %xmm3, 6 * SIZE(CO1) + movhps %xmm3, 7 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhps 3 * SIZE(CO2), %xmm1 + + movsd 4 * SIZE(CO2), %xmm2 + movhps 5 * SIZE(CO2), %xmm2 + movsd 6 * SIZE(CO2), %xmm3 + movhps 7 * SIZE(CO2), %xmm3 + + movddup %xmm9, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm9, %xmm9 + mulpd %xmm7, %xmm9 + addpd %xmm9, %xmm1 + + movddup %xmm13, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm13, %xmm13 + mulpd %xmm7, %xmm13 + addpd %xmm13, %xmm3 + + movlps %xmm0, 0 * SIZE(CO2) + movhps %xmm0, 1 * SIZE(CO2) + movlps %xmm1, 2 * SIZE(CO2) + movhps %xmm1, 3 * SIZE(CO2) + + movlps %xmm2, 4 * SIZE(CO2) + movhps %xmm2, 5 * SIZE(CO2) + movlps %xmm3, 6 * SIZE(CO2) + movhps %xmm3, 7 * SIZE(CO2) + + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + BRANCH + jle .L70 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movaps -17 * SIZE(BO), %xmm2 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -15 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm11 + addpd %xmm7, %xmm10 + movaps -13 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -11 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm11 + addpd %xmm7, %xmm10 + movaps -9 * SIZE(BO), %xmm2 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -15 * SIZE(BO), %xmm2 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: + movups ALPHA_R, %xmm7 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + movaps %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhps 3 * SIZE(CO1), %xmm1 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 2 * SIZE(CO1) + movhps %xmm1, 3 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhps 3 * SIZE(CO2), %xmm1 + + movddup %xmm9, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm9, %xmm9 + mulpd %xmm7, %xmm9 + addpd %xmm9, %xmm1 + + movlps %xmm0, 0 * SIZE(CO2) + movhps %xmm0, 1 * SIZE(CO2) + movlps %xmm1, 2 * SIZE(CO2) + movhps %xmm1, 3 * SIZE(CO2) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L70: + testq $1, M + BRANCH + jle .L79 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + addq %rax, AO + leaq (BO, %rax, 2), BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movaps -17 * SIZE(BO), %xmm2 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -15 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -15 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -14 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm9 + movaps -13 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -13 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -11 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -12 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm9 + movaps -9 * SIZE(BO), %xmm2 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -15 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -15 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_4 + +.L78: + movups ALPHA_R, %xmm7 + + addpd %xmm9, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm1 + movhps 1 * SIZE(CO2), %xmm1 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 1 * SIZE(CO2) + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + leaq (C, LDC, 2), C + movq BO, B + ALIGN_4 + +.L80: + testq $1, N + BRANCH + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + movq A, AO + + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + addq %rax, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movsd -17 * SIZE(BO), %xmm2 + + PREFETCHW 3 * SIZE(CO1) + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L95 + ALIGN_4 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -16 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -14 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -13 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps 2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + subq $-16 * SIZE, AO + subq $ -4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -16 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L96 + ALIGN_4 + +.L98: + movups ALPHA_R, %xmm7 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhps 3 * SIZE(CO1), %xmm1 + + movsd 4 * SIZE(CO1), %xmm2 + movhps 5 * SIZE(CO1), %xmm2 + movsd 6 * SIZE(CO1), %xmm3 + movhps 7 * SIZE(CO1), %xmm3 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movddup %xmm12, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm12, %xmm12 + mulpd %xmm7, %xmm12 + addpd %xmm12, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 2 * SIZE(CO1) + movhps %xmm1, 3 * SIZE(CO1) + + movlps %xmm2, 4 * SIZE(CO1) + movhps %xmm2, 5 * SIZE(CO1) + movlps %xmm3, 6 * SIZE(CO1) + movhps %xmm3, 7 * SIZE(CO1) + + addq $8 * SIZE, CO1 + decq I + BRANCH + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + BRANCH + jle .L110 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + addq %rax, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -17 * SIZE(BO), %xmm2 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L105 + ALIGN_4 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + movsd -16 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -14 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + pshufd $0x44, %xmm2, %xmm3 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm9 + + pshufd $0x44, %xmm2, %xmm3 + movsd -14 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -10 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + pshufd $0x44, %xmm2, %xmm3 + movsd -13 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -8 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm9 + + subq $-8 * SIZE, AO + subq $-4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L102 + ALIGN_4 + +.L105: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + pshufd $0x44, %xmm2, %xmm3 + movsd -16 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -14 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L106 + ALIGN_4 + +.L108: + movups ALPHA_R, %xmm7 + + addpd %xmm9, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhps 3 * SIZE(CO1), %xmm1 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 2 * SIZE(CO1) + movhps %xmm1, 3 * SIZE(CO1) + + addq $4 * SIZE, CO1 + ALIGN_4 + +.L110: + testq $1, M + BRANCH + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + addq %rax, AO + addq %rax, BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movsd -17 * SIZE(BO), %xmm2 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L115 + ALIGN_4 + +.L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -15 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -14 * SIZE(AO), %xmm0 + movsd -15 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -13 * SIZE(AO), %xmm0 + movsd -14 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -12 * SIZE(AO), %xmm0 + movsd -13 * SIZE(BO), %xmm2 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -15 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L116 + ALIGN_4 + +.L118: + movups ALPHA_R, %xmm7 + + addpd %xmm9, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm3m_kernel_4x4_sse2.S b/kernel/x86_64/zgemm3m_kernel_4x4_sse2.S new file mode 100644 index 0000000..3b313b3 --- /dev/null +++ b/kernel/x86_64/zgemm3m_kernel_4x4_sse2.S @@ -0,0 +1,2820 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define BUFFER 128(%rsp) + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (8 * 5 + 4) +#define movsd movlps +#define movapd movaps +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 5 + 4) +#define movapd movaps +#endif + +#ifndef GENERIC +#define KERNEL1(xx) \ + mulpd %xmm0, %xmm1 ;\ + addpd %xmm1, %xmm8 ;\ + movapd -16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulpd %xmm0, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd -14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm0, %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ + mulpd -10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ + addpd %xmm5, %xmm10 ;\ + movapd -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm0, %xmm11 ;\ + movapd -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 + +#define KERNEL2(xx) \ + mulpd %xmm2, %xmm1 ;\ + addpd %xmm1, %xmm12 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulpd %xmm2, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd -6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm2, %xmm5 ;\ + mulpd -10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ + addpd %xmm5, %xmm14 ;\ + movapd -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm2, %xmm15 ;\ + movapd -6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 + +#define KERNEL3(xx) \ + mulpd %xmm4, %xmm7 ;\ + addpd %xmm7, %xmm8 ;\ + movapd -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulpd %xmm4, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd -6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm4, %xmm5 ;\ + mulpd -2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ + addpd %xmm5, %xmm10 ;\ + movapd -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm4, %xmm11 ;\ + movapd -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 + +#define KERNEL4(xx) \ + mulpd %xmm6, %xmm7 ;\ + addpd %xmm7, %xmm12 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulpd %xmm6, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm6, %xmm5 ;\ + mulpd -2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ + addpd %xmm6, %xmm15 ;\ + movapd -2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 + +#define KERNEL5(xx) \ + mulpd %xmm0, %xmm1 ;\ + addpd %xmm1, %xmm8 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulpd %xmm0, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm0, %xmm5 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ + addpd %xmm5, %xmm10 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm0, %xmm11 ;\ + movapd 0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 + +#define KERNEL6(xx) \ + mulpd %xmm2, %xmm1 ;\ + addpd %xmm1, %xmm12 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulpd %xmm2, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm2, %xmm5 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm2, %xmm15 ;\ + movapd 2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 + +#define KERNEL7(xx) \ + mulpd %xmm4, %xmm7 ;\ + addpd %xmm7, %xmm8 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulpd %xmm4, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm4, %xmm5 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ + addpd %xmm5, %xmm10 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm4, %xmm11 ;\ + movapd 4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 + +#define KERNEL8(xx) \ + mulpd %xmm6, %xmm7 ;\ + addpd %xmm7, %xmm12 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulpd %xmm6, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm6, %xmm5 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm6, %xmm15 ;\ + movapd 6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 + +#else + +#define KERNEL1(xx) \ + mulpd %xmm0, %xmm1 ;\ + addpd %xmm1, %xmm8 ;\ + movapd -16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulpd %xmm0, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd -14 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm0, %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + mulpd -10 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ + addpd %xmm5, %xmm10 ;\ + movapd -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm0, %xmm11 ;\ + movapd -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 + +#define KERNEL2(xx) \ + mulpd %xmm2, %xmm1 ;\ + addpd %xmm1, %xmm12 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulpd %xmm2, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd -6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm2, %xmm5 ;\ + mulpd -10 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ + addpd %xmm5, %xmm14 ;\ + movapd -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm2, %xmm15 ;\ + movapd -6 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 + +#define KERNEL3(xx) \ + mulpd %xmm4, %xmm7 ;\ + addpd %xmm7, %xmm8 ;\ + movapd -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulpd %xmm4, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd -6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm4, %xmm5 ;\ + mulpd -2 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ + addpd %xmm5, %xmm10 ;\ + movapd -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm4, %xmm11 ;\ + movapd -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 + +#define KERNEL4(xx) \ + mulpd %xmm6, %xmm7 ;\ + addpd %xmm7, %xmm12 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulpd %xmm6, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm6, %xmm5 ;\ + mulpd -2 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm6, %xmm15 ;\ + movapd -2 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 + +#define KERNEL5(xx) \ + mulpd %xmm0, %xmm1 ;\ + addpd %xmm1, %xmm8 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulpd %xmm0, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm0, %xmm5 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ + addpd %xmm5, %xmm10 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm0, %xmm11 ;\ + movapd 0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 + +#define KERNEL6(xx) \ + mulpd %xmm2, %xmm1 ;\ + addpd %xmm1, %xmm12 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulpd %xmm2, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm2, %xmm5 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm2, %xmm15 ;\ + movapd 2 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 + +#define KERNEL7(xx) \ + mulpd %xmm4, %xmm7 ;\ + addpd %xmm7, %xmm8 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulpd %xmm4, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm4, %xmm5 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ + addpd %xmm5, %xmm10 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm4, %xmm11 ;\ + movapd 4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 + +#define KERNEL8(xx) \ + mulpd %xmm6, %xmm7 ;\ + addpd %xmm7, %xmm12 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulpd %xmm6, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm6, %xmm5 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm6, %xmm15 ;\ + movapd 6 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 +#endif + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + +#endif + + EMMS + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A + + movsd %xmm0, 0 + ALPHA + movsd %xmm1, 8 + ALPHA + + salq $ZBASE_SHIFT, LDC + +#ifdef TRMMKERNEL + movsd %xmm12, OFFSET + movsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + ALIGN_3 + +.L01: +/* Copying to Sub Buffer */ + leaq 16 * SIZE + BUFFER, BO + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $2, %rax + jle .L03 + ALIGN_3 + + +#define RPREFETCHSIZE (8 * 7 + 4) +#define WPREFETCHSIZE (8 * 8 + 4) + +.L02: + PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq %mm0, -16 * SIZE(BO) + movq %mm0, -15 * SIZE(BO) + movq %mm1, -14 * SIZE(BO) + movq %mm1, -13 * SIZE(BO) + + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + movq %mm2, -12 * SIZE(BO) + movq %mm2, -11 * SIZE(BO) + movq %mm3, -10 * SIZE(BO) + movq %mm3, -9 * SIZE(BO) + + PREFETCHW (WPREFETCHSIZE + 0) * SIZE(BO) + + movq 4 * SIZE(B), %mm4 + movq 5 * SIZE(B), %mm5 + movq %mm4, -8 * SIZE(BO) + movq %mm4, -7 * SIZE(BO) + movq %mm5, -6 * SIZE(BO) + movq %mm5, -5 * SIZE(BO) + + PREFETCHW (WPREFETCHSIZE + 8) * SIZE(BO) + + movq 6 * SIZE(B), %mm6 + movq 7 * SIZE(B), %mm7 + movq %mm6, -4 * SIZE(BO) + movq %mm6, -3 * SIZE(BO) + movq %mm7, -2 * SIZE(BO) + movq %mm7, -1 * SIZE(BO) + + PREFETCH (RPREFETCHSIZE + 8) * SIZE(B) + + movq 8 * SIZE(B), %mm0 + movq 9 * SIZE(B), %mm1 + + movq %mm0, 0 * SIZE(BO) + movq %mm0, 1 * SIZE(BO) + movq %mm1, 2 * SIZE(BO) + movq %mm1, 3 * SIZE(BO) + + movq 10 * SIZE(B), %mm2 + movq 11 * SIZE(B), %mm3 + movq %mm2, 4 * SIZE(BO) + movq %mm2, 5 * SIZE(BO) + movq %mm3, 6 * SIZE(BO) + movq %mm3, 7 * SIZE(BO) + + PREFETCHW (WPREFETCHSIZE + 16) * SIZE(BO) + + movq 12 * SIZE(B), %mm4 + movq 13 * SIZE(B), %mm5 + movq %mm4, 8 * SIZE(BO) + movq %mm4, 9 * SIZE(BO) + movq %mm5, 10 * SIZE(BO) + movq %mm5, 11 * SIZE(BO) + + PREFETCHW (WPREFETCHSIZE + 24) * SIZE(BO) + + movq 14 * SIZE(B), %mm6 + movq 15 * SIZE(B), %mm7 + movq %mm6, 12 * SIZE(BO) + movq %mm6, 13 * SIZE(BO) + movq %mm7, 14 * SIZE(BO) + movq %mm7, 15 * SIZE(BO) + + addq $ 32 * SIZE, BO + subq $-16 * SIZE, B + + subq $1, %rax + jne .L02 + ALIGN_3 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_3 + +.L04: + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + + movq %mm0, -16 * SIZE(BO) + movq %mm0, -15 * SIZE(BO) + movq %mm1, -14 * SIZE(BO) + movq %mm1, -13 * SIZE(BO) + movq %mm2, -12 * SIZE(BO) + movq %mm2, -11 * SIZE(BO) + movq %mm3, -10 * SIZE(BO) + movq %mm3, -9 * SIZE(BO) + + addq $4 * SIZE, B + addq $8 * SIZE, BO + subq $1, %rax + jne .L04 + ALIGN_3 + +.L10: + movq A, AO # aoffset = a + leaq (RPREFETCHSIZE + 0) * SIZE(B), BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_3 + +.L11: + PREFETCH 0 * SIZE(BB) + PREFETCH 8 * SIZE(BB) + subq $-16 * SIZE, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm1 + pxor %xmm8, %xmm8 + movapd -14 * SIZE(AO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movapd -12 * SIZE(AO), %xmm4 + movapd -12 * SIZE(BO), %xmm5 + pxor %xmm10, %xmm10 + movapd -10 * SIZE(AO), %xmm6 + movapd -8 * SIZE(BO), %xmm7 + pxor %xmm11, %xmm11 + + PREFETCHW 7 * SIZE(CO1) + pxor %xmm12, %xmm12 + PREFETCHW 7 * SIZE(CO2) + pxor %xmm13, %xmm13 + PREFETCHW 7 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + PREFETCHW 7 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif +#ifndef GENERIC + andq $-8, %rax + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + BRANCH + jl .L12 + ALIGN_3 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + testq $4, %rax + je .L16 + xorq %rax, %rax + ALIGN_3 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addq $32 * SIZE, BO + addq $16 * SIZE, AO + ALIGN_3 +#else + sarq $2, %rax + NOBRANCH + jle .L16 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addq $ 32 * SIZE, BO + subq $-16 * SIZE, AO + decq %rax + BRANCH + jg .L12 +#endif + +.L16: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L19 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + ALIGN_3 + +.L17: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movapd -14 * SIZE(BO, %rax, 8), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movapd -12 * SIZE(BO, %rax, 8), %xmm1 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BO, %rax, 8), %xmm0 + addpd %xmm1, %xmm10 + movapd -16 * SIZE(BO, %rax, 8), %xmm1 + addpd %xmm0, %xmm11 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm12 + movapd -14 * SIZE(BO, %rax, 8), %xmm1 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm13 + movapd -12 * SIZE(BO, %rax, 8), %xmm1 + mulpd %xmm2, %xmm1 + mulpd -10 * SIZE(BO, %rax, 8), %xmm2 + addpd %xmm1, %xmm14 + movapd -8 * SIZE(BO, %rax, 8), %xmm1 + addpd %xmm2, %xmm15 + movapd -10 * SIZE(AO, %rax, 4), %xmm2 + + addq $SIZE, %rax + jl .L17 + ALIGN_3 + +.L19: + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movsd 4 * SIZE(CO1), %xmm2 + movhpd 5 * SIZE(CO1), %xmm2 + movsd 6 * SIZE(CO1), %xmm3 + movhpd 7 * SIZE(CO1), %xmm3 + + pshufd $0x44, %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + pshufd $0x44, %xmm12, %xmm5 + unpckhpd %xmm12, %xmm12 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm12 + + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm12, %xmm3 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 4 * SIZE(CO1) + movhpd %xmm2, 5 * SIZE(CO1) + movsd %xmm3, 6 * SIZE(CO1) + movhpd %xmm3, 7 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhpd 3 * SIZE(CO2), %xmm1 + + movsd 4 * SIZE(CO2), %xmm2 + movhpd 5 * SIZE(CO2), %xmm2 + movsd 6 * SIZE(CO2), %xmm3 + movhpd 7 * SIZE(CO2), %xmm3 + + pshufd $0x44, %xmm9, %xmm4 + unpckhpd %xmm9, %xmm9 + pshufd $0x44, %xmm13, %xmm5 + unpckhpd %xmm13, %xmm13 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm13 + + addpd %xmm4, %xmm0 + addpd %xmm9, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm13, %xmm3 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + movsd %xmm1, 2 * SIZE(CO2) + movhpd %xmm1, 3 * SIZE(CO2) + + movsd %xmm2, 4 * SIZE(CO2) + movhpd %xmm2, 5 * SIZE(CO2) + movsd %xmm3, 6 * SIZE(CO2) + movhpd %xmm3, 7 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 + movsd 2 * SIZE(CO1, LDC, 2), %xmm1 + movhpd 3 * SIZE(CO1, LDC, 2), %xmm1 + + movsd 4 * SIZE(CO1, LDC, 2), %xmm2 + movhpd 5 * SIZE(CO1, LDC, 2), %xmm2 + movsd 6 * SIZE(CO1, LDC, 2), %xmm3 + movhpd 7 * SIZE(CO1, LDC, 2), %xmm3 + + pshufd $0x44, %xmm10, %xmm4 + unpckhpd %xmm10, %xmm10 + pshufd $0x44, %xmm14, %xmm5 + unpckhpd %xmm14, %xmm14 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm10 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm14 + + addpd %xmm4, %xmm0 + addpd %xmm10, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm14, %xmm3 + + movsd %xmm0, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) + movsd %xmm1, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm1, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm2, 4 * SIZE(CO1, LDC, 2) + movhpd %xmm2, 5 * SIZE(CO1, LDC, 2) + movsd %xmm3, 6 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 7 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 + movsd 2 * SIZE(CO2, LDC, 2), %xmm1 + movhpd 3 * SIZE(CO2, LDC, 2), %xmm1 + + movsd 4 * SIZE(CO2, LDC, 2), %xmm2 + movhpd 5 * SIZE(CO2, LDC, 2), %xmm2 + movsd 6 * SIZE(CO2, LDC, 2), %xmm3 + movhpd 7 * SIZE(CO2, LDC, 2), %xmm3 + + pshufd $0x44, %xmm11, %xmm4 + unpckhpd %xmm11, %xmm11 + pshufd $0x44, %xmm15, %xmm5 + unpckhpd %xmm15, %xmm15 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm11 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm15 + + addpd %xmm4, %xmm0 + addpd %xmm11, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm15, %xmm3 + + movsd %xmm0, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) + movsd %xmm1, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm1, 3 * SIZE(CO2, LDC, 2) + + movsd %xmm2, 4 * SIZE(CO2, LDC, 2) + movhpd %xmm2, 5 * SIZE(CO2, LDC, 2) + movsd %xmm3, 6 * SIZE(CO2, LDC, 2) + movhpd %xmm3, 7 * SIZE(CO2, LDC, 2) + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + decq I # i -- + BRANCH + jg .L11 + ALIGN_3 + +.L20: + testq $3, M + je .L39 + + testq $2, M + je .L30 + ALIGN_3 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movapd 8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + + movapd 16 * SIZE(BO), %xmm5 + movapd 24 * SIZE(BO), %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L25 + ALIGN_3 + +.L22: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movapd 2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movapd 4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm10 + movapd 32 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm11 + movapd -14 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm8 + movapd 10 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm9 + movapd 12 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BO), %xmm0 + addpd %xmm3, %xmm10 + movapd 40 * SIZE(BO), %xmm3 + addpd %xmm0, %xmm11 + movapd -12 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm8 + movapd 18 * SIZE(BO), %xmm5 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movapd 20 * SIZE(BO), %xmm5 + mulpd %xmm0, %xmm5 + mulpd 22 * SIZE(BO), %xmm0 + addpd %xmm5, %xmm10 + movapd 48 * SIZE(BO), %xmm5 + addpd %xmm0, %xmm11 + movapd -10 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm7 + addpd %xmm7, %xmm8 + movapd 26 * SIZE(BO), %xmm7 + mulpd %xmm0, %xmm7 + addpd %xmm7, %xmm9 + movapd 28 * SIZE(BO), %xmm7 + mulpd %xmm0, %xmm7 + mulpd 30 * SIZE(BO), %xmm0 + addpd %xmm7, %xmm10 + movapd 56 * SIZE(BO), %xmm7 + addpd %xmm0, %xmm11 + movapd 0 * SIZE(AO), %xmm0 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm8 + movapd 34 * SIZE(BO), %xmm1 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm9 + movapd 36 * SIZE(BO), %xmm1 + mulpd %xmm2, %xmm1 + mulpd 38 * SIZE(BO), %xmm2 + addpd %xmm1, %xmm10 + movapd 64 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + movapd -6 * SIZE(AO), %xmm2 + + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm8 + movapd 42 * SIZE(BO), %xmm3 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm9 + movapd 44 * SIZE(BO), %xmm3 + mulpd %xmm2, %xmm3 + mulpd 46 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm10 + movapd 72 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm11 + movapd -4 * SIZE(AO), %xmm2 + + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm8 + movapd 50 * SIZE(BO), %xmm5 + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm9 + movapd 52 * SIZE(BO), %xmm5 + mulpd %xmm2, %xmm5 + mulpd 54 * SIZE(BO), %xmm2 + addpd %xmm5, %xmm10 + movapd 80 * SIZE(BO), %xmm5 + addpd %xmm2, %xmm11 + movapd -2 * SIZE(AO), %xmm2 + + mulpd %xmm2, %xmm7 + addpd %xmm7, %xmm8 + movapd 58 * SIZE(BO), %xmm7 + mulpd %xmm2, %xmm7 + addpd %xmm7, %xmm9 + movapd 60 * SIZE(BO), %xmm7 + mulpd %xmm2, %xmm7 + mulpd 62 * SIZE(BO), %xmm2 + addpd %xmm7, %xmm10 + movapd 88 * SIZE(BO), %xmm7 + addpd %xmm2, %xmm11 + movapd 8 * SIZE(AO), %xmm2 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L22 + ALIGN_3 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L29 + ALIGN_3 + +.L26: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movapd 2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movapd 4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm10 + movapd 8 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm11 + movapd -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_3 + +.L29: + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + pshufd $0x44, %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhpd 3 * SIZE(CO2), %xmm1 + + pshufd $0x44, %xmm9, %xmm4 + unpckhpd %xmm9, %xmm9 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm9 + addpd %xmm4, %xmm0 + addpd %xmm9, %xmm1 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + movsd %xmm1, 2 * SIZE(CO2) + movhpd %xmm1, 3 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 + movsd 2 * SIZE(CO1, LDC, 2), %xmm1 + movhpd 3 * SIZE(CO1, LDC, 2), %xmm1 + + pshufd $0x44, %xmm10, %xmm4 + unpckhpd %xmm10, %xmm10 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm10 + addpd %xmm4, %xmm0 + addpd %xmm10, %xmm1 + + movsd %xmm0, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) + movsd %xmm1, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm1, 3 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 + movsd 2 * SIZE(CO2, LDC, 2), %xmm1 + movhpd 3 * SIZE(CO2, LDC, 2), %xmm1 + + pshufd $0x44, %xmm11, %xmm4 + unpckhpd %xmm11, %xmm11 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm11 + addpd %xmm4, %xmm0 + addpd %xmm11, %xmm1 + + movsd %xmm0, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) + movsd %xmm1, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm1, 3 * SIZE(CO2, LDC, 2) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_3 + +.L30: + testq $1, M + je .L39 + ALIGN_3 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movsd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movsd -8 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movsd 8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + + movsd 16 * SIZE(BO), %xmm5 + movsd 24 * SIZE(BO), %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L35 + ALIGN_3 + +.L32: + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movsd 2 * SIZE(BO), %xmm1 + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm9 + movsd 4 * SIZE(BO), %xmm1 + mulsd %xmm0, %xmm1 + mulsd 6 * SIZE(BO), %xmm0 + addsd %xmm1, %xmm10 + movsd 32 * SIZE(BO), %xmm1 + addsd %xmm0, %xmm11 + movsd -15 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm8 + movsd 10 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm9 + movsd 12 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BO), %xmm0 + addsd %xmm3, %xmm10 + movsd 40 * SIZE(BO), %xmm3 + addsd %xmm0, %xmm11 + movsd -14 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm5 + addsd %xmm5, %xmm8 + movsd 18 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + addsd %xmm5, %xmm9 + movsd 20 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + mulsd 22 * SIZE(BO), %xmm0 + addsd %xmm5, %xmm10 + movsd 48 * SIZE(BO), %xmm5 + addsd %xmm0, %xmm11 + movsd -13 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm7 + addsd %xmm7, %xmm8 + movsd 26 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm7 + addsd %xmm7, %xmm9 + movsd 28 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm7 + mulsd 30 * SIZE(BO), %xmm0 + addsd %xmm7, %xmm10 + movsd 56 * SIZE(BO), %xmm7 + addsd %xmm0, %xmm11 + movsd -12 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm8 + movsd 34 * SIZE(BO), %xmm1 + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm9 + movsd 36 * SIZE(BO), %xmm1 + mulsd %xmm0, %xmm1 + mulsd 38 * SIZE(BO), %xmm0 + addsd %xmm1, %xmm10 + movsd 64 * SIZE(BO), %xmm1 + addsd %xmm0, %xmm11 + movsd -11 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm8 + movsd 42 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm9 + movsd 44 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + mulsd 46 * SIZE(BO), %xmm0 + addsd %xmm3, %xmm10 + movsd 72 * SIZE(BO), %xmm3 + addsd %xmm0, %xmm11 + movsd -10 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm5 + addsd %xmm5, %xmm8 + movsd 50 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + addsd %xmm5, %xmm9 + movsd 52 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + mulsd 54 * SIZE(BO), %xmm0 + addsd %xmm5, %xmm10 + movsd 80 * SIZE(BO), %xmm5 + addsd %xmm0, %xmm11 + movsd -9 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm7 + addsd %xmm7, %xmm8 + movsd 58 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm7 + addsd %xmm7, %xmm9 + movsd 60 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm7 + mulsd 62 * SIZE(BO), %xmm0 + addsd %xmm7, %xmm10 + movsd 88 * SIZE(BO), %xmm7 + addsd %xmm0, %xmm11 + movsd -8 * SIZE(AO), %xmm0 + + addq $ 8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L32 + ALIGN_3 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm8 + movsd 2 * SIZE(BO), %xmm1 + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm9 + movsd 4 * SIZE(BO), %xmm1 + mulsd %xmm0, %xmm1 + mulsd 6 * SIZE(BO), %xmm0 + addsd %xmm1, %xmm10 + movsd 8 * SIZE(BO), %xmm1 + addsd %xmm0, %xmm11 + movsd -15 * SIZE(AO), %xmm0 + + addq $1 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_3 + +.L38: + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + + pshufd $0x44, %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + + pshufd $0x44, %xmm9, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 + + pshufd $0x44, %xmm10, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 + + pshufd $0x44, %xmm11, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) + ALIGN_3 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leaq (C, LDC, 4), C # c += 4 * ldc + decq J # j -- + jg .L01 + ALIGN_3 + +.L40: + testq $3, N + je .L999 + + testq $2, N + je .L80 + ALIGN_4 + +.L41: +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $2, %rax + jle .L43 + ALIGN_3 + +.L42: + PREFETCH 56 * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + movq 4 * SIZE(B), %mm4 + movq 5 * SIZE(B), %mm5 + movq 6 * SIZE(B), %mm6 + movq 7 * SIZE(B), %mm7 + + addq $ 8 * SIZE, B + addq $16 * SIZE, BO + + movq %mm0, -16 * SIZE(BO) + movq %mm0, -15 * SIZE(BO) + movq %mm1, -14 * SIZE(BO) + movq %mm1, -13 * SIZE(BO) + movq %mm2, -12 * SIZE(BO) + movq %mm2, -11 * SIZE(BO) + movq %mm3, -10 * SIZE(BO) + movq %mm3, -9 * SIZE(BO) + movq %mm4, -8 * SIZE(BO) + movq %mm4, -7 * SIZE(BO) + movq %mm5, -6 * SIZE(BO) + movq %mm5, -5 * SIZE(BO) + movq %mm6, -4 * SIZE(BO) + movq %mm6, -3 * SIZE(BO) + movq %mm7, -2 * SIZE(BO) + movq %mm7, -1 * SIZE(BO) + + decq %rax + jne .L42 + ALIGN_3 + +.L43: + movq K, %rax + andq $3, %rax + BRANCH + jle .L50 + ALIGN_3 + +.L44: + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + + movq %mm0, 0 * SIZE(BO) + movq %mm0, 1 * SIZE(BO) + movq %mm1, 2 * SIZE(BO) + movq %mm1, 3 * SIZE(BO) + + addq $2 * SIZE, B + addq $4 * SIZE, BO + decq %rax + jne .L44 + ALIGN_3 + +.L50: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_3 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm12, %xmm12 + movapd 8 * SIZE(BO), %xmm3 + pxor %xmm13, %xmm13 + + movapd 0 * SIZE(AO), %xmm4 + movapd 16 * SIZE(BO), %xmm5 + movapd 8 * SIZE(AO), %xmm6 + movapd 24 * SIZE(BO), %xmm7 + + PREFETCHW 7 * SIZE(CO1) + PREFETCHW 7 * SIZE(CO2) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L55 + ALIGN_3 + +.L52: + mulpd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd 0 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -14 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm12 + movapd 4 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm13 + movapd -12 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd 4 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm12 + movapd 32 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm13 + movapd 16 * SIZE(AO), %xmm0 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm2, %xmm3 + mulpd 10 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm8 + movapd 8 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm9 + movapd -6 * SIZE(AO), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 10 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd 12 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm13 + movapd -4 * SIZE(AO), %xmm2 + + mulpd %xmm2, %xmm3 + mulpd 14 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm8 + movapd 12 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm9 + movapd -2 * SIZE(AO), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 14 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd 40 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm13 + movapd 24 * SIZE(AO), %xmm2 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + mulpd %xmm4, %xmm5 + mulpd 18 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm8 + movapd 16 * SIZE(BO), %xmm5 + addpd %xmm4, %xmm9 + movapd 2 * SIZE(AO), %xmm4 + mulpd %xmm4, %xmm5 + mulpd 18 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm12 + movapd 20 * SIZE(BO), %xmm5 + addpd %xmm4, %xmm13 + movapd 4 * SIZE(AO), %xmm4 + + mulpd %xmm4, %xmm5 + mulpd 22 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm8 + movapd 20 * SIZE(BO), %xmm5 + addpd %xmm4, %xmm9 + movapd 6 * SIZE(AO), %xmm4 + mulpd %xmm4, %xmm5 + mulpd 22 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm12 + movapd 48 * SIZE(BO), %xmm5 + addpd %xmm4, %xmm13 + movapd 32 * SIZE(AO), %xmm4 + + PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) + mulpd %xmm6, %xmm7 + mulpd 26 * SIZE(BO), %xmm6 + addpd %xmm7, %xmm8 + movapd 24 * SIZE(BO), %xmm7 + addpd %xmm6, %xmm9 + movapd 10 * SIZE(AO), %xmm6 + mulpd %xmm6, %xmm7 + mulpd 26 * SIZE(BO), %xmm6 + addpd %xmm7, %xmm12 + movapd 28 * SIZE(BO), %xmm7 + addpd %xmm6, %xmm13 + movapd 12 * SIZE(AO), %xmm6 + + mulpd %xmm6, %xmm7 + mulpd 30 * SIZE(BO), %xmm6 + addpd %xmm7, %xmm8 + movapd 28 * SIZE(BO), %xmm7 + addpd %xmm6, %xmm9 + movapd 14 * SIZE(AO), %xmm6 + mulpd %xmm6, %xmm7 + mulpd 30 * SIZE(BO), %xmm6 + addpd %xmm7, %xmm12 + movapd 56 * SIZE(BO), %xmm7 + addpd %xmm6, %xmm13 + movapd 40 * SIZE(AO), %xmm6 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L52 + ALIGN_3 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L59 + ALIGN_3 + +.L56: + movapd 0 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm9 + movapd -14 * SIZE(AO), %xmm0 + movapd 0 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm12 + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm13 + movapd -12 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L56 + ALIGN_3 + +.L59: + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movsd 4 * SIZE(CO1), %xmm2 + movhpd 5 * SIZE(CO1), %xmm2 + movsd 6 * SIZE(CO1), %xmm3 + movhpd 7 * SIZE(CO1), %xmm3 + + pshufd $0x44, %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + pshufd $0x44, %xmm12, %xmm5 + unpckhpd %xmm12, %xmm12 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm12 + + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm12, %xmm3 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 4 * SIZE(CO1) + movhpd %xmm2, 5 * SIZE(CO1) + movsd %xmm3, 6 * SIZE(CO1) + movhpd %xmm3, 7 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhpd 3 * SIZE(CO2), %xmm1 + + movsd 4 * SIZE(CO2), %xmm2 + movhpd 5 * SIZE(CO2), %xmm2 + movsd 6 * SIZE(CO2), %xmm3 + movhpd 7 * SIZE(CO2), %xmm3 + + pshufd $0x44, %xmm9, %xmm4 + unpckhpd %xmm9, %xmm9 + pshufd $0x44, %xmm13, %xmm5 + unpckhpd %xmm13, %xmm13 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm13 + + addpd %xmm4, %xmm0 + addpd %xmm9, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm13, %xmm3 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + movsd %xmm1, 2 * SIZE(CO2) + movhpd %xmm1, 3 * SIZE(CO2) + + movsd %xmm2, 4 * SIZE(CO2) + movhpd %xmm2, 5 * SIZE(CO2) + movsd %xmm3, 6 * SIZE(CO2) + movhpd %xmm3, 7 * SIZE(CO2) + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L51 + ALIGN_3 + +.L60: + testq $2, M + je .L70 + ALIGN_3 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movapd 8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + + movapd 16 * SIZE(BO), %xmm5 + movapd 24 * SIZE(BO), %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L65 + ALIGN_3 + +.L62: + mulpd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd 4 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -14 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm10 + movapd 32 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm11 + movapd -12 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BO), %xmm0 + addpd %xmm3, %xmm8 + movapd 12 * SIZE(BO), %xmm3 + addpd %xmm0, %xmm9 + movapd -10 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BO), %xmm0 + addpd %xmm3, %xmm10 + movapd 40 * SIZE(BO), %xmm3 + addpd %xmm0, %xmm11 + movapd 0 * SIZE(AO), %xmm0 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm2, %xmm5 + mulpd 18 * SIZE(BO), %xmm2 + addpd %xmm5, %xmm8 + movapd 20 * SIZE(BO), %xmm5 + addpd %xmm2, %xmm9 + movapd -6 * SIZE(AO), %xmm2 + + mulpd %xmm2, %xmm5 + mulpd 22 * SIZE(BO), %xmm2 + addpd %xmm5, %xmm10 + movapd 48 * SIZE(BO), %xmm5 + addpd %xmm2, %xmm11 + movapd -4 * SIZE(AO), %xmm2 + + mulpd %xmm2, %xmm7 + mulpd 26 * SIZE(BO), %xmm2 + addpd %xmm7, %xmm8 + movapd 28 * SIZE(BO), %xmm7 + addpd %xmm2, %xmm9 + movapd -2 * SIZE(AO), %xmm2 + + mulpd %xmm2, %xmm7 + mulpd 30 * SIZE(BO), %xmm2 + addpd %xmm7, %xmm10 + movapd 56 * SIZE(BO), %xmm7 + addpd %xmm2, %xmm11 + movapd 8 * SIZE(AO), %xmm2 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L62 + ALIGN_3 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L69 + ALIGN_3 + +.L66: + mulpd %xmm0, %xmm1 + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd 4 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_3 + +.L69: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + pshufd $0x44, %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhpd 3 * SIZE(CO2), %xmm1 + + pshufd $0x44, %xmm9, %xmm4 + unpckhpd %xmm9, %xmm9 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm9 + addpd %xmm4, %xmm0 + addpd %xmm9, %xmm1 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + movsd %xmm1, 2 * SIZE(CO2) + movhpd %xmm1, 3 * SIZE(CO2) + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_3 + +.L70: + testq $1, M + je .L79 + ALIGN_3 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movsd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movsd -12 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movsd 8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + + movsd 16 * SIZE(BO), %xmm5 + movsd 24 * SIZE(BO), %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L75 + ALIGN_3 + +.L72: + mulsd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulsd 2 * SIZE(BO), %xmm0 + addsd %xmm1, %xmm8 + movsd 4 * SIZE(BO), %xmm1 + addsd %xmm0, %xmm9 + movsd -15 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm1 + mulsd 6 * SIZE(BO), %xmm0 + addsd %xmm1, %xmm10 + movsd 32 * SIZE(BO), %xmm1 + addsd %xmm0, %xmm11 + movsd -14 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm3 + mulsd 10 * SIZE(BO), %xmm0 + addsd %xmm3, %xmm8 + movsd 12 * SIZE(BO), %xmm3 + addsd %xmm0, %xmm9 + movsd -13 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BO), %xmm0 + addsd %xmm3, %xmm10 + movsd 40 * SIZE(BO), %xmm3 + addsd %xmm0, %xmm11 + movsd -8 * SIZE(AO), %xmm0 + + mulsd %xmm2, %xmm5 + mulsd 18 * SIZE(BO), %xmm2 + addsd %xmm5, %xmm8 + movsd 20 * SIZE(BO), %xmm5 + addsd %xmm2, %xmm9 + movsd -11 * SIZE(AO), %xmm2 + + mulsd %xmm2, %xmm5 + mulsd 22 * SIZE(BO), %xmm2 + addsd %xmm5, %xmm10 + movsd 48 * SIZE(BO), %xmm5 + addsd %xmm2, %xmm11 + movsd -10 * SIZE(AO), %xmm2 + + mulsd %xmm2, %xmm7 + mulsd 26 * SIZE(BO), %xmm2 + addsd %xmm7, %xmm8 + movsd 28 * SIZE(BO), %xmm7 + addsd %xmm2, %xmm9 + movsd -9 * SIZE(AO), %xmm2 + + mulsd %xmm2, %xmm7 + mulsd 30 * SIZE(BO), %xmm2 + addsd %xmm7, %xmm10 + movsd 56 * SIZE(BO), %xmm7 + addsd %xmm2, %xmm11 + movsd -4 * SIZE(AO), %xmm2 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L72 + ALIGN_3 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + mulsd %xmm0, %xmm1 + mulsd 2 * SIZE(BO), %xmm0 + addsd %xmm1, %xmm8 + addsd %xmm0, %xmm9 + movsd -15 * SIZE(AO), %xmm0 + movsd 4 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_3 + +.L78: + addsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + + pshufd $0x44, %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + + pshufd $0x44, %xmm9, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + ALIGN_3 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leaq (C, LDC, 2), C + ALIGN_3 + +.L80: + testq $1, N + je .L999 + ALIGN_4 + +.L81: +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $3, %rax + jle .L83 + ALIGN_3 + +.L82: + PREFETCH 56 * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + movq 4 * SIZE(B), %mm4 + movq 5 * SIZE(B), %mm5 + movq 6 * SIZE(B), %mm6 + movq 7 * SIZE(B), %mm7 + + addq $ 8 * SIZE, B + addq $16 * SIZE, BO + + movq %mm0, -16 * SIZE(BO) + movq %mm0, -15 * SIZE(BO) + movq %mm1, -14 * SIZE(BO) + movq %mm1, -13 * SIZE(BO) + movq %mm2, -12 * SIZE(BO) + movq %mm2, -11 * SIZE(BO) + movq %mm3, -10 * SIZE(BO) + movq %mm3, -9 * SIZE(BO) + movq %mm4, -8 * SIZE(BO) + movq %mm4, -7 * SIZE(BO) + movq %mm5, -6 * SIZE(BO) + movq %mm5, -5 * SIZE(BO) + movq %mm6, -4 * SIZE(BO) + movq %mm6, -3 * SIZE(BO) + movq %mm7, -2 * SIZE(BO) + movq %mm7, -1 * SIZE(BO) + + decq %rax + jne .L82 + ALIGN_3 + +.L83: + movq K, %rax + andq $7, %rax + BRANCH + jle .L90 + ALIGN_3 + +.L84: + movq 0 * SIZE(B), %mm0 + + movq %mm0, 0 * SIZE(BO) + movq %mm0, 1 * SIZE(BO) + + addq $1 * SIZE, B + addq $2 * SIZE, BO + decq %rax + jne .L84 + ALIGN_3 + +.L90: + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_3 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movapd 8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + + movapd 0 * SIZE(AO), %xmm4 + movapd 8 * SIZE(AO), %xmm6 + + PREFETCHW 7 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L95 + ALIGN_3 + +.L92: + mulpd %xmm1, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd -14 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movapd 2 * SIZE(BO), %xmm1 + mulpd %xmm1, %xmm0 + mulpd -10 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm10 + movapd 16 * SIZE(AO), %xmm0 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + addpd %xmm1, %xmm11 + movapd 4 * SIZE(BO), %xmm1 + mulpd %xmm1, %xmm2 + mulpd -6 * SIZE(AO), %xmm1 + addpd %xmm2, %xmm8 + movapd -4 * SIZE(AO), %xmm2 + addpd %xmm1, %xmm9 + movapd 6 * SIZE(BO), %xmm1 + mulpd %xmm1, %xmm2 + mulpd -2 * SIZE(AO), %xmm1 + addpd %xmm2, %xmm10 + movapd 24 * SIZE(AO), %xmm2 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm1, %xmm11 + movapd 16 * SIZE(BO), %xmm1 + mulpd %xmm3, %xmm4 + mulpd 2 * SIZE(AO), %xmm3 + addpd %xmm4, %xmm8 + movapd 4 * SIZE(AO), %xmm4 + addpd %xmm3, %xmm9 + movapd 10 * SIZE(BO), %xmm3 + mulpd %xmm3, %xmm4 + mulpd 6 * SIZE(AO), %xmm3 + addpd %xmm4, %xmm10 + movapd 32 * SIZE(AO), %xmm4 + PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) + addpd %xmm3, %xmm11 + movapd 12 * SIZE(BO), %xmm3 + mulpd %xmm3, %xmm6 + mulpd 10 * SIZE(AO), %xmm3 + addpd %xmm6, %xmm8 + movapd 12 * SIZE(AO), %xmm6 + addpd %xmm3, %xmm9 + movapd 14 * SIZE(BO), %xmm3 + mulpd %xmm3, %xmm6 + mulpd 14 * SIZE(AO), %xmm3 + addpd %xmm6, %xmm10 + movapd 40 * SIZE(AO), %xmm6 + addpd %xmm3, %xmm11 + movapd 24 * SIZE(BO), %xmm3 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L92 + ALIGN_3 + +.L95: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L99 + ALIGN_3 + +.L96: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movapd 2 * SIZE(BO), %xmm1 + + addq $4 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_3 + +.L99: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movsd 4 * SIZE(CO1), %xmm2 + movhpd 5 * SIZE(CO1), %xmm2 + movsd 6 * SIZE(CO1), %xmm3 + movhpd 7 * SIZE(CO1), %xmm3 + + pshufd $0x44, %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + pshufd $0x44, %xmm9, %xmm5 + unpckhpd %xmm9, %xmm9 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm9 + + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm9, %xmm3 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 4 * SIZE(CO1) + movhpd %xmm2, 5 * SIZE(CO1) + movsd %xmm3, 6 * SIZE(CO1) + movhpd %xmm3, 7 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L91 + ALIGN_3 + +.L100: + testq $2, M + je .L110 + ALIGN_3 + +.L101: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movapd 8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L105 + ALIGN_3 + +.L102: + mulpd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movapd -14 * SIZE(AO), %xmm0 + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd 16 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -12 * SIZE(AO), %xmm0 + mulpd 4 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm10 + movapd -10 * SIZE(AO), %xmm0 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm11 + movapd 0 * SIZE(AO), %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm2, %xmm3 + movapd -6 * SIZE(AO), %xmm2 + mulpd 10 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm8 + movapd 24 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm9 + movapd -4 * SIZE(AO), %xmm2 + mulpd 12 * SIZE(BO), %xmm2 + addpd %xmm2, %xmm10 + movapd -2 * SIZE(AO), %xmm2 + mulpd 14 * SIZE(BO), %xmm2 + addpd %xmm2, %xmm11 + movapd 8 * SIZE(AO), %xmm2 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L102 + ALIGN_3 + +.L105: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L109 + ALIGN_3 + +.L106: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movapd -14 * SIZE(AO), %xmm0 + movapd 2 * SIZE(BO), %xmm1 + + addq $2 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L106 + ALIGN_3 + +.L109: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + pshufd $0x44, %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + addq $4 * SIZE, CO1 + ALIGN_3 + +.L110: + testq $1, M + je .L999 + ALIGN_3 + +.L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movsd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movsd -12 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movsd 8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L115 + ALIGN_3 + +.L112: + mulsd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movsd -15 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm8 + movsd 16 * SIZE(BO), %xmm1 + mulsd 2 * SIZE(BO), %xmm0 + addsd %xmm0, %xmm9 + movsd -14 * SIZE(AO), %xmm0 + mulsd 4 * SIZE(BO), %xmm0 + addsd %xmm0, %xmm10 + movsd -13 * SIZE(AO), %xmm0 + mulsd 6 * SIZE(BO), %xmm0 + addsd %xmm0, %xmm11 + movsd -8 * SIZE(AO), %xmm0 + mulsd %xmm2, %xmm3 + movsd -11 * SIZE(AO), %xmm2 + addsd %xmm3, %xmm8 + movsd 24 * SIZE(BO), %xmm3 + mulsd 10 * SIZE(BO), %xmm2 + addsd %xmm2, %xmm9 + movsd -10 * SIZE(AO), %xmm2 + mulsd 12 * SIZE(BO), %xmm2 + addsd %xmm2, %xmm10 + movsd -9 * SIZE(AO), %xmm2 + mulsd 14 * SIZE(BO), %xmm2 + addsd %xmm2, %xmm11 + movsd -4 * SIZE(AO), %xmm2 + + addq $ 8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L112 + ALIGN_3 + +.L115: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_3 + +.L116: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm8 + movsd 2 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_3 + +.L118: + addsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + addsd %xmm9, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + + pshufd $0x44, %xmm8, %xmm4 + + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + ALIGN_3 + +.L999: + movq %rbx, %rsp + + EMMS + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm3m_kernel_4x4_sse3.S b/kernel/x86_64/zgemm3m_kernel_4x4_sse3.S new file mode 100644 index 0000000..73f5fce --- /dev/null +++ b/kernel/x86_64/zgemm3m_kernel_4x4_sse3.S @@ -0,0 +1,2622 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbx +#define BB %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KKK 72(%rsp) +#define KK 80(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define ALPHA_R 224(%rsp) +#define ALPHA_I 232(%rsp) +#define OFFSET 240(%rsp) +#define KK 248(%rsp) +#define KKK 256(%rsp) + +#endif + +#define PREFETCH prefetcht2 +#define PREFETCHSIZE (16 * 12 + 3) + +#define KERNEL1(address) \ + mulpd %xmm8, %xmm9 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ + addpd %xmm9, %xmm0;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm1;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm2;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm3;\ + movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL2(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm4;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm5;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm6;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm7;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL3(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm0;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm1;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm2;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm3;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL4(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm4;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm5;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm6;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm7;\ + movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL5(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm0;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm1;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm2;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm3;\ + movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL6(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm4;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm5;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm6;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm7;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL7(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm0;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm1;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm2;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm3;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL8(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm4;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm5;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm6;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm7;\ + movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL9(address) \ + mulpd %xmm12, %xmm13;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ + addpd %xmm13, %xmm0;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm1;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm2;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm3;\ + movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL10(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm4;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm5;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm6;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm7;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL11(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm0;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm1;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm2;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm3;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL12(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm4;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm5;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm6;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm7;\ + movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL13(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm0;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm1;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm2;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm3;\ + movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL14(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm4;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm5;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm6;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm7;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL15(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm0;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm1;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm2;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm3;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL16(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm4;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm5;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm6;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm7;\ + movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#endif + + movsd %xmm0, ALPHA_R + movsd %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + ALIGN_4 + +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + leaq (, K, 4), BB + leaq (B, BB, SIZE), BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: + prefetcht0 0 * SIZE(BB) + prefetcht0 8 * SIZE(BB) + subq $-8 * SIZE, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(AO), %xmm12 + pxor %xmm4, %xmm4 + movddup 16 * SIZE(BO), %xmm13 + pxor %xmm5, %xmm5 + movapd 24 * SIZE(AO), %xmm14 + pxor %xmm6, %xmm6 + movddup 24 * SIZE(BO), %xmm15 + pxor %xmm7, %xmm7 + + prefetchnta 7 * SIZE(CO1) + prefetchnta 7 * SIZE(CO2) + prefetchnta 7 * SIZE(CO1, LDC, 2) + prefetchnta 7 * SIZE(CO2, LDC, 2) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + +#if 1 + andq $-8, %rax + salq $4, %rax + NOBRANCH + je .L15 + +.L1X: + KERNEL1 (16 * 0) + KERNEL2 (16 * 0) + KERNEL3 (16 * 0) + KERNEL4 (16 * 0) + KERNEL5 (16 * 0) + KERNEL6 (16 * 0) + KERNEL7 (16 * 0) + KERNEL8 (16 * 0) + KERNEL9 (16 * 0) + KERNEL10(16 * 0) + KERNEL11(16 * 0) + KERNEL12(16 * 0) + KERNEL13(16 * 0) + KERNEL14(16 * 0) + KERNEL15(16 * 0) + KERNEL16(16 * 0) + cmpq $128 * 1, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 1) + KERNEL2 (16 * 1) + KERNEL3 (16 * 1) + KERNEL4 (16 * 1) + KERNEL5 (16 * 1) + KERNEL6 (16 * 1) + KERNEL7 (16 * 1) + KERNEL8 (16 * 1) + KERNEL9 (16 * 1) + KERNEL10(16 * 1) + KERNEL11(16 * 1) + KERNEL12(16 * 1) + KERNEL13(16 * 1) + KERNEL14(16 * 1) + KERNEL15(16 * 1) + KERNEL16(16 * 1) + cmpq $128 * 2, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 2) + KERNEL2 (16 * 2) + KERNEL3 (16 * 2) + KERNEL4 (16 * 2) + KERNEL5 (16 * 2) + KERNEL6 (16 * 2) + KERNEL7 (16 * 2) + KERNEL8 (16 * 2) + KERNEL9 (16 * 2) + KERNEL10(16 * 2) + KERNEL11(16 * 2) + KERNEL12(16 * 2) + KERNEL13(16 * 2) + KERNEL14(16 * 2) + KERNEL15(16 * 2) + KERNEL16(16 * 2) + cmpq $128 * 3, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 3) + KERNEL2 (16 * 3) + KERNEL3 (16 * 3) + KERNEL4 (16 * 3) + KERNEL5 (16 * 3) + KERNEL6 (16 * 3) + KERNEL7 (16 * 3) + KERNEL8 (16 * 3) + KERNEL9 (16 * 3) + KERNEL10(16 * 3) + KERNEL11(16 * 3) + KERNEL12(16 * 3) + KERNEL13(16 * 3) + KERNEL14(16 * 3) + KERNEL15(16 * 3) + KERNEL16(16 * 3) + cmpq $128 * 4, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 4) + KERNEL2 (16 * 4) + KERNEL3 (16 * 4) + KERNEL4 (16 * 4) + KERNEL5 (16 * 4) + KERNEL6 (16 * 4) + KERNEL7 (16 * 4) + KERNEL8 (16 * 4) + KERNEL9 (16 * 4) + KERNEL10(16 * 4) + KERNEL11(16 * 4) + KERNEL12(16 * 4) + KERNEL13(16 * 4) + KERNEL14(16 * 4) + KERNEL15(16 * 4) + KERNEL16(16 * 4) + cmpq $128 * 5, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 5) + KERNEL2 (16 * 5) + KERNEL3 (16 * 5) + KERNEL4 (16 * 5) + KERNEL5 (16 * 5) + KERNEL6 (16 * 5) + KERNEL7 (16 * 5) + KERNEL8 (16 * 5) + KERNEL9 (16 * 5) + KERNEL10(16 * 5) + KERNEL11(16 * 5) + KERNEL12(16 * 5) + KERNEL13(16 * 5) + KERNEL14(16 * 5) + KERNEL15(16 * 5) + KERNEL16(16 * 5) + cmpq $128 * 6, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 6) + KERNEL2 (16 * 6) + KERNEL3 (16 * 6) + KERNEL4 (16 * 6) + KERNEL5 (16 * 6) + KERNEL6 (16 * 6) + KERNEL7 (16 * 6) + KERNEL8 (16 * 6) + KERNEL9 (16 * 6) + KERNEL10(16 * 6) + KERNEL11(16 * 6) + KERNEL12(16 * 6) + KERNEL13(16 * 6) + KERNEL14(16 * 6) + KERNEL15(16 * 6) + KERNEL16(16 * 6) + cmpq $128 * 7, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 7) + KERNEL2 (16 * 7) + KERNEL3 (16 * 7) + KERNEL4 (16 * 7) + KERNEL5 (16 * 7) + KERNEL6 (16 * 7) + KERNEL7 (16 * 7) + KERNEL8 (16 * 7) + KERNEL9 (16 * 7) + KERNEL10(16 * 7) + KERNEL11(16 * 7) + KERNEL12(16 * 7) + KERNEL13(16 * 7) + KERNEL14(16 * 7) + KERNEL15(16 * 7) + KERNEL16(16 * 7) + + addq $32 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $128 * 8, %rax + BRANCH + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 2), BO # * 64 + +#else + sarq $3, %rax + je .L15 + ALIGN_4 + +.L12: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm5 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm6 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm7 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm5 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm6 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm7 + + movddup 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + + movddup 8 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm7 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm7 + movddup 40 * SIZE(BO), %xmm11 + + mulpd %xmm12, %xmm13 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm13, %xmm0 + movddup 17 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm1 + movddup 18 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm2 + movddup 19 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 18 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm3 + + movddup 16 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm4 + movddup 17 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm5 + movddup 18 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm6 + movddup 19 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 20 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm7 + + movddup 20 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm0 + movddup 21 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm1 + movddup 22 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm2 + movddup 23 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 22 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm3 + + movddup 20 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm4 + movddup 21 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm5 + movddup 22 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm6 + movddup 23 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 48 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm7 + movddup 48 * SIZE(BO), %xmm13 + + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm0 + movddup 25 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm1 + movddup 26 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm2 + movddup 27 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 26 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm3 + + movddup 24 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm4 + movddup 25 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm5 + movddup 26 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm6 + movddup 27 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 28 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm7 + + movddup 28 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm0 + movddup 29 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm1 + movddup 30 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm2 + movddup 31 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 30 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm3 + + movddup 28 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm4 + movddup 29 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm5 + movddup 30 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm6 + movddup 31 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 56 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm7 + movddup 56 * SIZE(BO), %xmm15 + + addq $32 * SIZE, BO + addq $32 * SIZE, AO + decq %rax + BRANCH + jne .L12 +#endif + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA_R, %xmm15 + movhpd ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + BRANCH + je .L19 + ALIGN_4 + +.L16: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 2 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 3 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm7 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + BRANCH + jg .L16 + ALIGN_4 + +.L19: + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + movsd 2 * SIZE(CO1), %xmm9 + movhpd 3 * SIZE(CO1), %xmm9 + + movsd 4 * SIZE(CO1), %xmm10 + movhpd 5 * SIZE(CO1), %xmm10 + movsd 6 * SIZE(CO1), %xmm11 + movhpd 7 * SIZE(CO1), %xmm11 + + movddup %xmm0, %xmm12 + unpckhpd %xmm0, %xmm0 + movddup %xmm4, %xmm13 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm0 + mulpd %xmm15, %xmm13 + mulpd %xmm15, %xmm4 + + addpd %xmm12, %xmm8 + addpd %xmm0, %xmm9 + addpd %xmm13, %xmm10 + addpd %xmm4, %xmm11 + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 4 * SIZE(CO1) + movhpd %xmm10, 5 * SIZE(CO1) + movsd %xmm11, 6 * SIZE(CO1) + movhpd %xmm11, 7 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhpd 1 * SIZE(CO2), %xmm8 + movsd 2 * SIZE(CO2), %xmm9 + movhpd 3 * SIZE(CO2), %xmm9 + + movsd 4 * SIZE(CO2), %xmm10 + movhpd 5 * SIZE(CO2), %xmm10 + movsd 6 * SIZE(CO2), %xmm11 + movhpd 7 * SIZE(CO2), %xmm11 + + movddup %xmm1, %xmm12 + unpckhpd %xmm1, %xmm1 + movddup %xmm5, %xmm13 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm1 + mulpd %xmm15, %xmm13 + mulpd %xmm15, %xmm5 + + addpd %xmm12, %xmm8 + addpd %xmm1, %xmm9 + addpd %xmm13, %xmm10 + addpd %xmm5, %xmm11 + + movsd %xmm8, 0 * SIZE(CO2) + movhpd %xmm8, 1 * SIZE(CO2) + movsd %xmm9, 2 * SIZE(CO2) + movhpd %xmm9, 3 * SIZE(CO2) + + movsd %xmm10, 4 * SIZE(CO2) + movhpd %xmm10, 5 * SIZE(CO2) + movsd %xmm11, 6 * SIZE(CO2) + movhpd %xmm11, 7 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm8 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm8 + movsd 2 * SIZE(CO1, LDC, 2), %xmm9 + movhpd 3 * SIZE(CO1, LDC, 2), %xmm9 + + movsd 4 * SIZE(CO1, LDC, 2), %xmm10 + movhpd 5 * SIZE(CO1, LDC, 2), %xmm10 + movsd 6 * SIZE(CO1, LDC, 2), %xmm11 + movhpd 7 * SIZE(CO1, LDC, 2), %xmm11 + + movddup %xmm2, %xmm12 + unpckhpd %xmm2, %xmm2 + movddup %xmm6, %xmm13 + unpckhpd %xmm6, %xmm6 + + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm2 + mulpd %xmm15, %xmm13 + mulpd %xmm15, %xmm6 + + addpd %xmm12, %xmm8 + addpd %xmm2, %xmm9 + addpd %xmm13, %xmm10 + addpd %xmm6, %xmm11 + + movsd %xmm8, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm8, 1 * SIZE(CO1, LDC, 2) + movsd %xmm9, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm9, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm10, 4 * SIZE(CO1, LDC, 2) + movhpd %xmm10, 5 * SIZE(CO1, LDC, 2) + movsd %xmm11, 6 * SIZE(CO1, LDC, 2) + movhpd %xmm11, 7 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm8 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm8 + movsd 2 * SIZE(CO2, LDC, 2), %xmm9 + movhpd 3 * SIZE(CO2, LDC, 2), %xmm9 + + movsd 4 * SIZE(CO2, LDC, 2), %xmm10 + movhpd 5 * SIZE(CO2, LDC, 2), %xmm10 + movsd 6 * SIZE(CO2, LDC, 2), %xmm11 + movhpd 7 * SIZE(CO2, LDC, 2), %xmm11 + + movddup %xmm3, %xmm12 + unpckhpd %xmm3, %xmm3 + movddup %xmm7, %xmm13 + unpckhpd %xmm7, %xmm7 + + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm3 + mulpd %xmm15, %xmm13 + mulpd %xmm15, %xmm7 + + addpd %xmm12, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm13, %xmm10 + addpd %xmm7, %xmm11 + + movsd %xmm8, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm8, 1 * SIZE(CO2, LDC, 2) + movsd %xmm9, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm9, 3 * SIZE(CO2, LDC, 2) + + movsd %xmm10, 4 * SIZE(CO2, LDC, 2) + movhpd %xmm10, 5 * SIZE(CO2, LDC, 2) + movsd %xmm11, 6 * SIZE(CO2, LDC, 2) + movhpd %xmm11, 7 * SIZE(CO2, LDC, 2) + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + + decq I # i -- + jg .L11 + jmp .L20 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + je .L30 + ALIGN_4 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 17 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm1 + movddup 18 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm2 + movddup 19 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm3 + movddup 20 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 21 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm1 + movddup 22 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm2 + movddup 23 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm3 + movddup 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 25 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 26 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 27 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 28 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 29 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 30 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 31 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 40 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA_R, %xmm15 + movhpd ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L29 + ALIGN_4 + +.L26: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L29: + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + movsd 2 * SIZE(CO1), %xmm9 + movhpd 3 * SIZE(CO1), %xmm9 + + movddup %xmm0, %xmm12 + unpckhpd %xmm0, %xmm0 + + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm0 + addpd %xmm12, %xmm8 + addpd %xmm0, %xmm9 + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhpd 1 * SIZE(CO2), %xmm8 + movsd 2 * SIZE(CO2), %xmm9 + movhpd 3 * SIZE(CO2), %xmm9 + + movddup %xmm1, %xmm12 + unpckhpd %xmm1, %xmm1 + + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm1 + addpd %xmm12, %xmm8 + addpd %xmm1, %xmm9 + + movsd %xmm8, 0 * SIZE(CO2) + movhpd %xmm8, 1 * SIZE(CO2) + movsd %xmm9, 2 * SIZE(CO2) + movhpd %xmm9, 3 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm8 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm8 + movsd 2 * SIZE(CO1, LDC, 2), %xmm9 + movhpd 3 * SIZE(CO1, LDC, 2), %xmm9 + + movddup %xmm2, %xmm12 + unpckhpd %xmm2, %xmm2 + + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm2 + addpd %xmm12, %xmm8 + addpd %xmm2, %xmm9 + + movsd %xmm8, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm8, 1 * SIZE(CO1, LDC, 2) + movsd %xmm9, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm9, 3 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm8 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm8 + movsd 2 * SIZE(CO2, LDC, 2), %xmm9 + movhpd 3 * SIZE(CO2, LDC, 2), %xmm9 + + movddup %xmm3, %xmm12 + unpckhpd %xmm3, %xmm3 + + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm3 + addpd %xmm12, %xmm8 + addpd %xmm3, %xmm9 + + movsd %xmm8, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm8, 1 * SIZE(CO2, LDC, 2) + movsd %xmm9, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm9, 3 * SIZE(CO2, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $1, M + je .L39 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movddup 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movapd 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movddup 3 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movapd 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movapd 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movddup 8 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movapd 24 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movapd 18 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movddup 5 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movapd 20 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movapd 22 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movddup 6 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movapd 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movapd 26 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movddup 7 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movapd 28 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movapd 30 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movddup 12 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movapd 40 * SIZE(BO), %xmm11 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA_R, %xmm15 + movhpd ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + + movddup %xmm0, %xmm12 + + mulpd %xmm15, %xmm12 + addpd %xmm12, %xmm8 + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhpd 1 * SIZE(CO2), %xmm8 + + unpckhpd %xmm0, %xmm0 + + mulpd %xmm15, %xmm0 + addpd %xmm0, %xmm8 + + movsd %xmm8, 0 * SIZE(CO2) + movhpd %xmm8, 1 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm8 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm8 + + movddup %xmm1, %xmm12 + + mulpd %xmm15, %xmm12 + addpd %xmm12, %xmm8 + + movsd %xmm8, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm8, 1 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm8 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm8 + + unpckhpd %xmm1, %xmm1 + + mulpd %xmm15, %xmm1 + addpd %xmm1, %xmm8 + + movsd %xmm8, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm8, 1 * SIZE(CO2, LDC, 2) + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leaq (C, LDC, 4), C # c += 4 * ldc + movq BO, B + decq J # j -- + jg .L10 + ALIGN_4 + +.L40: + testq $2, N + je .L80 + ALIGN_4 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + +#ifdef HAVE_3DNOW + prefetchw 4 * SIZE(CO1) + prefetchw 4 * SIZE(CO2) +#else + prefetchnta 4 * SIZE(CO1) + prefetchnta 4 * SIZE(CO2) +#endif + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L55 + ALIGN_4 + +.L52: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm5 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm5 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm5 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm5 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 18 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 8 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 20 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 22 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 24 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 26 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 28 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 30 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 24 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA_R, %xmm15 + movhpd ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L59 + ALIGN_4 + +.L56: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L56 + ALIGN_4 + +.L59: + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + movsd 2 * SIZE(CO1), %xmm9 + movhpd 3 * SIZE(CO1), %xmm9 + + movsd 4 * SIZE(CO1), %xmm10 + movhpd 5 * SIZE(CO1), %xmm10 + movsd 6 * SIZE(CO1), %xmm11 + movhpd 7 * SIZE(CO1), %xmm11 + + movddup %xmm0, %xmm12 + unpckhpd %xmm0, %xmm0 + movddup %xmm4, %xmm13 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm0 + mulpd %xmm15, %xmm13 + mulpd %xmm15, %xmm4 + + addpd %xmm12, %xmm8 + addpd %xmm0, %xmm9 + addpd %xmm13, %xmm10 + addpd %xmm4, %xmm11 + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 4 * SIZE(CO1) + movhpd %xmm10, 5 * SIZE(CO1) + movsd %xmm11, 6 * SIZE(CO1) + movhpd %xmm11, 7 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhpd 1 * SIZE(CO2), %xmm8 + movsd 2 * SIZE(CO2), %xmm9 + movhpd 3 * SIZE(CO2), %xmm9 + + movsd 4 * SIZE(CO2), %xmm10 + movhpd 5 * SIZE(CO2), %xmm10 + movsd 6 * SIZE(CO2), %xmm11 + movhpd 7 * SIZE(CO2), %xmm11 + + movddup %xmm1, %xmm12 + unpckhpd %xmm1, %xmm1 + movddup %xmm5, %xmm13 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm1 + mulpd %xmm15, %xmm13 + mulpd %xmm15, %xmm5 + + addpd %xmm12, %xmm8 + addpd %xmm1, %xmm9 + addpd %xmm13, %xmm10 + addpd %xmm5, %xmm11 + + movsd %xmm8, 0 * SIZE(CO2) + movhpd %xmm8, 1 * SIZE(CO2) + movsd %xmm9, 2 * SIZE(CO2) + movhpd %xmm9, 3 * SIZE(CO2) + + movsd %xmm10, 4 * SIZE(CO2) + movhpd %xmm10, 5 * SIZE(CO2) + movsd %xmm11, 6 * SIZE(CO2) + movhpd %xmm11, 7 * SIZE(CO2) + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + ALIGN_4 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA_R, %xmm15 + movhpd ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L69 + ALIGN_4 + +.L66: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L69: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + movsd 2 * SIZE(CO1), %xmm9 + movhpd 3 * SIZE(CO1), %xmm9 + + movddup %xmm0, %xmm12 + unpckhpd %xmm0, %xmm0 + + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm0 + addpd %xmm12, %xmm8 + addpd %xmm0, %xmm9 + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhpd 1 * SIZE(CO2), %xmm8 + movsd 2 * SIZE(CO2), %xmm9 + movhpd 3 * SIZE(CO2), %xmm9 + + movddup %xmm1, %xmm12 + unpckhpd %xmm1, %xmm1 + + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm1 + addpd %xmm12, %xmm8 + addpd %xmm1, %xmm9 + + movsd %xmm8, 0 * SIZE(CO2) + movhpd %xmm8, 1 * SIZE(CO2) + movsd %xmm9, 2 * SIZE(CO2) + movhpd %xmm9, 3 * SIZE(CO2) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L70: + testq $1, M + je .L79 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movddup 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + movapd 16 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movddup 2 * SIZE(AO), %xmm8 + mulpd 4 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm2 + movddup 3 * SIZE(AO), %xmm8 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm3 + movddup 8 * SIZE(AO), %xmm8 + mulpd %xmm10, %xmm11 + movddup 5 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm0 + mulpd 10 * SIZE(BO), %xmm10 + movapd 24 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm1 + movddup 6 * SIZE(AO), %xmm10 + mulpd 12 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm2 + movddup 7 * SIZE(AO), %xmm10 + mulpd 14 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm3 + movddup 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA_R, %xmm15 + movhpd ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + + movddup %xmm0, %xmm12 + mulpd %xmm15, %xmm12 + addpd %xmm12, %xmm8 + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhpd 1 * SIZE(CO2), %xmm8 + + unpckhpd %xmm0, %xmm0 + + mulpd %xmm15, %xmm0 + addpd %xmm0, %xmm8 + + movsd %xmm8, 0 * SIZE(CO2) + movhpd %xmm8, 1 * SIZE(CO2) + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leaq (C, LDC, 2), C + movq BO, B + ALIGN_4 + +.L80: + testq $1, N + je .L999 + ALIGN_4 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + movq A, AO + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 4 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifdef HAVE_3DNOW + prefetchw 4 * SIZE(CO1) +#else + prefetchnta 4 * SIZE(CO1) +#endif + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulpd %xmm9, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm8 + mulpd 6 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm2 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 10 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm0 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 14 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm2 + movapd 24 * SIZE(AO), %xmm10 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm9, %xmm3 + movddup 8 * SIZE(BO), %xmm9 + mulpd %xmm11, %xmm8 + mulpd 18 * SIZE(AO), %xmm11 + addpd %xmm8, %xmm0 + movapd 20 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 5 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm8 + mulpd 22 * SIZE(AO), %xmm11 + addpd %xmm8, %xmm2 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 6 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm10 + mulpd 26 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm0 + movapd 28 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 7 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm10 + mulpd 30 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm2 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA_R, %xmm15 + movhpd ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L99 + ALIGN_4 + +.L96: + mulpd %xmm9, %xmm8 + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 1 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L99: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + movsd 2 * SIZE(CO1), %xmm9 + movhpd 3 * SIZE(CO1), %xmm9 + + movsd 4 * SIZE(CO1), %xmm10 + movhpd 5 * SIZE(CO1), %xmm10 + movsd 6 * SIZE(CO1), %xmm11 + movhpd 7 * SIZE(CO1), %xmm11 + + movddup %xmm0, %xmm12 + unpckhpd %xmm0, %xmm0 + movddup %xmm1, %xmm13 + unpckhpd %xmm1, %xmm1 + + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm0 + mulpd %xmm15, %xmm13 + mulpd %xmm15, %xmm1 + + addpd %xmm12, %xmm8 + addpd %xmm0, %xmm9 + addpd %xmm13, %xmm10 + addpd %xmm1, %xmm11 + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 4 * SIZE(CO1) + movhpd %xmm10, 5 * SIZE(CO1) + movsd %xmm11, 6 * SIZE(CO1) + movhpd %xmm11, 7 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + ALIGN_4 + +.L101: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 4 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L105 + ALIGN_4 + +.L102: + mulpd %xmm9, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movddup 1 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm0 + mulpd 2 * SIZE(AO), %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd 4 * SIZE(AO), %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd 6 * SIZE(AO), %xmm9 + addpd %xmm9, %xmm3 + movddup 8 * SIZE(BO), %xmm9 + mulpd %xmm11, %xmm10 + movddup 5 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm0 + mulpd 10 * SIZE(AO), %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 6 * SIZE(BO), %xmm11 + mulpd 12 * SIZE(AO), %xmm11 + addpd %xmm11, %xmm2 + movddup 7 * SIZE(BO), %xmm11 + mulpd 14 * SIZE(AO), %xmm11 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $ 8 * SIZE, BO + decq %rax + jne .L102 + ALIGN_4 + +.L105: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA_R, %xmm15 + movhpd ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L109 + ALIGN_4 + +.L106: + mulpd %xmm9, %xmm8 + movddup 1 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm0 + movapd 2 * SIZE(AO), %xmm8 + + addq $2 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L106 + ALIGN_4 + +.L109: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + movsd 2 * SIZE(CO1), %xmm9 + movhpd 3 * SIZE(CO1), %xmm9 + + movddup %xmm0, %xmm12 + unpckhpd %xmm0, %xmm0 + + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm0 + addpd %xmm12, %xmm8 + addpd %xmm0, %xmm9 + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + addq $4 * SIZE, CO1 + ALIGN_4 + +.L110: + testq $1, M + je .L999 + ALIGN_4 + +.L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movsd 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movsd 4 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 0 * SIZE(AO), %xmm9 + movapd 0 * SIZE(BO), %xmm8 + movapd 4 * SIZE(AO), %xmm11 + movapd 4 * SIZE(BO), %xmm10 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulpd %xmm9, %xmm8 + movapd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + mulpd 2 * SIZE(BO), %xmm9 + movapd 8 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm1 + movapd 8 * SIZE(AO), %xmm9 + mulpd %xmm11, %xmm10 + movapd 6 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm0 + mulpd 6 * SIZE(BO), %xmm11 + movapd 12 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm1 + movapd 12 * SIZE(AO), %xmm11 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA_R, %xmm15 + movhpd ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulsd 0 * SIZE(BO), %xmm9 + addsd %xmm9, %xmm0 + movsd 1 * SIZE(AO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: + addpd %xmm1, %xmm0 + haddpd %xmm0, %xmm0 + + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + + movddup %xmm0, %xmm12 + + mulpd %xmm15, %xmm12 + addpd %xmm12, %xmm8 + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm3m_kernel_4x8_nehalem.S b/kernel/x86_64/zgemm3m_kernel_4x8_nehalem.S new file mode 100644 index 0000000..92be8fc --- /dev/null +++ b/kernel/x86_64/zgemm3m_kernel_4x8_nehalem.S @@ -0,0 +1,2472 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#define PREA %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define J 64(%rsp) +#define OFFSET 72(%rsp) +#define KK 80(%rsp) +#define KKK 88(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define ALPHA_R 224(%rsp) +#define ALPHA_I 232(%rsp) +#define J 240(%rsp) +#define OFFSET 248(%rsp) +#define KK 256(%rsp) +#define KKK 264(%rsp) + +#endif + +#define PREFETCHSIZE (16 * 1 - 8) +#define PREFETCH prefetcht0 + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + movaps %xmm3, %xmm0 + movss OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + +#endif + + unpcklps %xmm1, %xmm0 + + movlps %xmm0, ALPHA_R + movlps %xmm0, ALPHA_I + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + salq $ZBASE_SHIFT, LDC + + movq N, J + sarq $3, J + NOBRANCH + jle .L40 + ALIGN_4 + +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 4), CO2 + movq A, AO + + movq K, %rax + salq $BASE_SHIFT + 3, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + prefetcht0 -32 * SIZE(BB) + subq $-16 * SIZE, BB + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + leaq (LDC, LDC, 2), %rax + + xorps %xmm8, %xmm8 + prefetcht0 3 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 7 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht0 3 * SIZE(CO1, LDC, 2) + xorps %xmm11, %xmm11 + prefetcht0 7 * SIZE(CO1, %rax, 1) + + xorps %xmm12, %xmm12 + prefetcht0 3 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht0 7 * SIZE(CO2, LDC, 1) + xorps %xmm14, %xmm14 + prefetcht0 3 * SIZE(CO2, LDC, 2) + xorps %xmm15, %xmm15 + prefetcht0 7 * SIZE(CO2, %rax, 1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $8, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AO), %xmm7 + mulps %xmm0, %xmm4 + + addps %xmm1, %xmm12 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm7, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm7, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm6 + + addps %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm7, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm7, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm7, %xmm3 + mulps %xmm7, %xmm4 + + addps %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm7 + + addps %xmm1, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm7, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm7, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm6 + + addps %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + subq $-32 * SIZE, BO + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm7, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm7, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm7, %xmm3 + movaps -16 * SIZE(AO), %xmm0 + mulps %xmm7, %xmm4 + + subq $-16 * SIZE, AO + decq %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addps %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: + addps %xmm1, %xmm12 + addps %xmm2, %xmm13 + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + + movaps %xmm9, %xmm4 + shufps $0xd8, %xmm8, %xmm9 + shufps $0xd8, %xmm11, %xmm8 + shufps $0xd8, %xmm10, %xmm11 + shufps $0xd8, %xmm4, %xmm10 + + movaps %xmm8, %xmm4 + shufps $0xd8, %xmm10, %xmm8 + shufps $0xd8, %xmm4, %xmm10 + movaps %xmm9, %xmm5 + shufps $0xd8, %xmm11, %xmm9 + shufps $0xd8, %xmm5, %xmm11 + + movaps %xmm13, %xmm4 + shufps $0xd8, %xmm12, %xmm13 + shufps $0xd8, %xmm15, %xmm12 + shufps $0xd8, %xmm14, %xmm15 + shufps $0xd8, %xmm4, %xmm14 + + movaps %xmm12, %xmm4 + shufps $0xd8, %xmm14, %xmm12 + shufps $0xd8, %xmm4, %xmm14 + movaps %xmm13, %xmm5 + shufps $0xd8, %xmm15, %xmm13 + shufps $0xd8, %xmm5, %xmm15 + + leaq (LDC, LDC, 2), %rax + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + + movsd 0 * SIZE(CO1, LDC), %xmm2 + movhps 2 * SIZE(CO1, LDC), %xmm2 + movsd 4 * SIZE(CO1, LDC), %xmm3 + movhps 6 * SIZE(CO1, LDC), %xmm3 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + pshufd $0x50, %xmm9, %xmm5 + pshufd $0xfa, %xmm9, %xmm9 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm9 + + addps %xmm4, %xmm0 + addps %xmm8, %xmm1 + addps %xmm5, %xmm2 + addps %xmm9, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 4 * SIZE(CO1) + movhps %xmm1, 6 * SIZE(CO1) + + movlps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm2, 2 * SIZE(CO1, LDC) + movlps %xmm3, 4 * SIZE(CO1, LDC) + movhps %xmm3, 6 * SIZE(CO1, LDC) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhps 2 * SIZE(CO1, LDC, 2), %xmm0 + movsd 4 * SIZE(CO1, LDC, 2), %xmm1 + movhps 6 * SIZE(CO1, LDC, 2), %xmm1 + + movsd 0 * SIZE(CO1, %rax), %xmm2 + movhps 2 * SIZE(CO1, %rax), %xmm2 + movsd 4 * SIZE(CO1, %rax), %xmm3 + movhps 6 * SIZE(CO1, %rax), %xmm3 + + pshufd $0x50, %xmm10, %xmm4 + pshufd $0xfa, %xmm10, %xmm10 + pshufd $0x50, %xmm11, %xmm5 + pshufd $0xfa, %xmm11, %xmm11 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm11 + + addps %xmm4, %xmm0 + addps %xmm10, %xmm1 + addps %xmm5, %xmm2 + addps %xmm11, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 2 * SIZE(CO1, LDC, 2) + movlps %xmm1, 4 * SIZE(CO1, LDC, 2) + movhps %xmm1, 6 * SIZE(CO1, LDC, 2) + + movlps %xmm2, 0 * SIZE(CO1, %rax) + movhps %xmm2, 2 * SIZE(CO1, %rax) + movlps %xmm3, 4 * SIZE(CO1, %rax) + movhps %xmm3, 6 * SIZE(CO1, %rax) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + movsd 4 * SIZE(CO2), %xmm1 + movhps 6 * SIZE(CO2), %xmm1 + + movsd 0 * SIZE(CO2, LDC), %xmm2 + movhps 2 * SIZE(CO2, LDC), %xmm2 + movsd 4 * SIZE(CO2, LDC), %xmm3 + movhps 6 * SIZE(CO2, LDC), %xmm3 + + pshufd $0x50, %xmm12, %xmm4 + pshufd $0xfa, %xmm12, %xmm12 + pshufd $0x50, %xmm13, %xmm5 + pshufd $0xfa, %xmm13, %xmm13 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm12 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm13 + + addps %xmm4, %xmm0 + addps %xmm12, %xmm1 + addps %xmm5, %xmm2 + addps %xmm13, %xmm3 + + movlps %xmm0, 0 * SIZE(CO2) + movhps %xmm0, 2 * SIZE(CO2) + movlps %xmm1, 4 * SIZE(CO2) + movhps %xmm1, 6 * SIZE(CO2) + + movlps %xmm2, 0 * SIZE(CO2, LDC) + movhps %xmm2, 2 * SIZE(CO2, LDC) + movlps %xmm3, 4 * SIZE(CO2, LDC) + movhps %xmm3, 6 * SIZE(CO2, LDC) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhps 2 * SIZE(CO2, LDC, 2), %xmm0 + movsd 4 * SIZE(CO2, LDC, 2), %xmm1 + movhps 6 * SIZE(CO2, LDC, 2), %xmm1 + + movsd 0 * SIZE(CO2, %rax), %xmm2 + movhps 2 * SIZE(CO2, %rax), %xmm2 + movsd 4 * SIZE(CO2, %rax), %xmm3 + movhps 6 * SIZE(CO2, %rax), %xmm3 + + pshufd $0x50, %xmm14, %xmm4 + pshufd $0xfa, %xmm14, %xmm14 + pshufd $0x50, %xmm15, %xmm5 + pshufd $0xfa, %xmm15, %xmm15 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm14 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm15 + + addps %xmm4, %xmm0 + addps %xmm14, %xmm1 + addps %xmm5, %xmm2 + addps %xmm15, %xmm3 + + movlps %xmm0, 0 * SIZE(CO2, LDC, 2) + movhps %xmm0, 2 * SIZE(CO2, LDC, 2) + movlps %xmm1, 4 * SIZE(CO2, LDC, 2) + movhps %xmm1, 6 * SIZE(CO2, LDC, 2) + + movlps %xmm2, 0 * SIZE(CO2, %rax) + movhps %xmm2, 2 * SIZE(CO2, %rax) + movlps %xmm3, 4 * SIZE(CO2, %rax) + movhps %xmm3, 6 * SIZE(CO2, %rax) + + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 + decq I + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + jle .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $8, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(BO), %xmm5 + + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -8 * SIZE(BO), %xmm5 + + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps 0 * SIZE(BO), %xmm5 + + movddup -24 * SIZE(AO), %xmm0 + + subq $-32 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + addps %xmm3, %xmm10 + addps %xmm4, %xmm11 + + leaq (LDC, LDC, 2), %rax + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO1, LDC), %xmm1 + movhps 2 * SIZE(CO1, LDC), %xmm1 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm2 + movhps 2 * SIZE(CO1, LDC, 2), %xmm2 + movsd 0 * SIZE(CO1, %rax), %xmm3 + movhps 2 * SIZE(CO1, %rax), %xmm3 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + pshufd $0x50, %xmm9, %xmm5 + pshufd $0xfa, %xmm9, %xmm9 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm9 + + addps %xmm4, %xmm0 + addps %xmm8, %xmm1 + addps %xmm5, %xmm2 + addps %xmm9, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC) + movhps %xmm1, 2 * SIZE(CO1, LDC) + + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 2 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO1, %rax) + movhps %xmm3, 2 * SIZE(CO1, %rax) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + movsd 0 * SIZE(CO2, LDC), %xmm1 + movhps 2 * SIZE(CO2, LDC), %xmm1 + + movsd 0 * SIZE(CO2, LDC, 2), %xmm2 + movhps 2 * SIZE(CO2, LDC, 2), %xmm2 + movsd 0 * SIZE(CO2, %rax), %xmm3 + movhps 2 * SIZE(CO2, %rax), %xmm3 + + pshufd $0x50, %xmm10, %xmm4 + pshufd $0xfa, %xmm10, %xmm10 + pshufd $0x50, %xmm11, %xmm5 + pshufd $0xfa, %xmm11, %xmm11 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm11 + + addps %xmm4, %xmm0 + addps %xmm10, %xmm1 + addps %xmm5, %xmm2 + addps %xmm11, %xmm3 + + movlps %xmm0, 0 * SIZE(CO2) + movhps %xmm0, 2 * SIZE(CO2) + movlps %xmm1, 0 * SIZE(CO2, LDC) + movhps %xmm1, 2 * SIZE(CO2, LDC) + + movlps %xmm2, 0 * SIZE(CO2, LDC, 2) + movhps %xmm2, 2 * SIZE(CO2, LDC, 2) + movlps %xmm3, 0 * SIZE(CO2, %rax) + movhps %xmm3, 2 * SIZE(CO2, %rax) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L30: + testq $1, M + BRANCH + jle .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + xorps %xmm8, %xmm8 + xorps %xmm12, %xmm12 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $8, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -28 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -20 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -16 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -12 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -4 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + subq $-32 * SIZE, BO + subq $ -4 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -28 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + + leaq (LDC, LDC, 2), %rax + + movsd (CO1), %xmm0 + movhps (CO1, LDC), %xmm0 + movsd (CO1, LDC, 2), %xmm1 + movhps (CO1, %rax), %xmm1 + + movsd (CO2), %xmm2 + movhps (CO2, LDC), %xmm2 + movsd (CO2, LDC, 2), %xmm3 + movhps (CO2, %rax), %xmm3 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + pshufd $0x50, %xmm12, %xmm5 + pshufd $0xfa, %xmm12, %xmm12 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm12 + + addps %xmm4, %xmm0 + addps %xmm8, %xmm1 + addps %xmm5, %xmm2 + addps %xmm12, %xmm3 + + movlps %xmm0, (CO1) + movhps %xmm0, (CO1, LDC) + movlps %xmm1, (CO1, LDC, 2) + movhps %xmm1, (CO1, %rax) + + movlps %xmm2, (CO2) + movhps %xmm2, (CO2, LDC) + movlps %xmm3, (CO2, LDC, 2) + movhps %xmm3, (CO2, %rax) + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $8, KK +#endif + + movq BO, B + + leaq (C, LDC, 8), C + + subq $1, J + BRANCH + jg .L10 + ALIGN_4 + +.L40: + testq $4, N + jle .L70 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 + movq A, AO + + movq M, I + sarq $2, I + NOBRANCH + jle .L50 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht2 4 * SIZE(CO2) + xorps %xmm11, %xmm11 + prefetcht2 4 * SIZE(CO2, LDC, 1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + addps %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_3 + +.L48: + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + addps %xmm3, %xmm10 + addps %xmm4, %xmm11 + + movaps %xmm9, %xmm4 + shufps $0xd8, %xmm8, %xmm9 + shufps $0xd8, %xmm11, %xmm8 + shufps $0xd8, %xmm10, %xmm11 + shufps $0xd8, %xmm4, %xmm10 + + movaps %xmm8, %xmm4 + shufps $0xd8, %xmm10, %xmm8 + shufps $0xd8, %xmm4, %xmm10 + movaps %xmm9, %xmm5 + shufps $0xd8, %xmm11, %xmm9 + shufps $0xd8, %xmm5, %xmm11 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + + movsd 0 * SIZE(CO1, LDC), %xmm2 + movhps 2 * SIZE(CO1, LDC), %xmm2 + movsd 4 * SIZE(CO1, LDC), %xmm3 + movhps 6 * SIZE(CO1, LDC), %xmm3 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + pshufd $0x50, %xmm9, %xmm5 + pshufd $0xfa, %xmm9, %xmm9 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm9 + + addps %xmm4, %xmm0 + addps %xmm8, %xmm1 + addps %xmm5, %xmm2 + addps %xmm9, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 4 * SIZE(CO1) + movhps %xmm1, 6 * SIZE(CO1) + + movlps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm2, 2 * SIZE(CO1, LDC) + movlps %xmm3, 4 * SIZE(CO1, LDC) + movhps %xmm3, 6 * SIZE(CO1, LDC) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + movsd 4 * SIZE(CO2), %xmm1 + movhps 6 * SIZE(CO2), %xmm1 + + movsd 0 * SIZE(CO2, LDC), %xmm2 + movhps 2 * SIZE(CO2, LDC), %xmm2 + movsd 4 * SIZE(CO2, LDC), %xmm3 + movhps 6 * SIZE(CO2, LDC), %xmm3 + + pshufd $0x50, %xmm10, %xmm4 + pshufd $0xfa, %xmm10, %xmm10 + pshufd $0x50, %xmm11, %xmm5 + pshufd $0xfa, %xmm11, %xmm11 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm11 + + addps %xmm4, %xmm0 + addps %xmm10, %xmm1 + addps %xmm5, %xmm2 + addps %xmm11, %xmm3 + + movlps %xmm0, 0 * SIZE(CO2) + movhps %xmm0, 2 * SIZE(CO2) + movlps %xmm1, 4 * SIZE(CO2) + movhps %xmm1, 6 * SIZE(CO2) + + movlps %xmm2, 0 * SIZE(CO2, LDC) + movhps %xmm2, 2 * SIZE(CO2, LDC) + movlps %xmm3, 4 * SIZE(CO2, LDC) + movhps %xmm3, 6 * SIZE(CO2, LDC) + + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 + decq I + BRANCH + jg .L41 + ALIGN_4 + +.L50: + testq $2, M + BRANCH + jle .L60 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -16 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -24 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L52 + ALIGN_3 + +.L55: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_3 + +.L58: + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO1, LDC), %xmm1 + movhps 2 * SIZE(CO1, LDC), %xmm1 + + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 + movsd 0 * SIZE(CO2, LDC), %xmm3 + movhps 2 * SIZE(CO2, LDC), %xmm3 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + pshufd $0x50, %xmm9, %xmm5 + pshufd $0xfa, %xmm9, %xmm9 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm9 + + addps %xmm4, %xmm0 + addps %xmm8, %xmm1 + addps %xmm5, %xmm2 + addps %xmm9, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC) + movhps %xmm1, 2 * SIZE(CO1, LDC) + + movlps %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + movlps %xmm3, 0 * SIZE(CO2, LDC) + movhps %xmm3, 2 * SIZE(CO2, LDC) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L69 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movaps -20 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + subq $-16 * SIZE, BO + subq $ -4 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L62 + addps %xmm9, %xmm8 + ALIGN_3 + +.L65: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_3 + +.L68: + addps %xmm2, %xmm8 + + movsd (CO1), %xmm0 + movhps (CO1, LDC), %xmm0 + movsd (CO2), %xmm1 + movhps (CO2, LDC), %xmm1 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + + addps %xmm4, %xmm0 + addps %xmm8, %xmm1 + + movlps %xmm0, (CO1) + movhps %xmm0, (CO1, LDC) + movlps %xmm1, (CO2) + movhps %xmm1, (CO2, LDC) + ALIGN_4 + +.L69: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + movq BO, B + + leaq (C, LDC, 4), C + ALIGN_4 + +.L70: + testq $2, N + jle .L100 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC), CO2 + movq A, AO + + movq M, I + sarq $2, I + NOBRANCH + jle .L80 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd -32 * SIZE(BO), %xmm3 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO2) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_3 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -28 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -26 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -24 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L72 + ALIGN_3 + +.L75: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_3 + +.L78: + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 + movsd 4 * SIZE(CO2), %xmm3 + movhps 6 * SIZE(CO2), %xmm3 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + pshufd $0x50, %xmm9, %xmm5 + pshufd $0xfa, %xmm9, %xmm9 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm9 + + addps %xmm4, %xmm0 + addps %xmm8, %xmm1 + addps %xmm5, %xmm2 + addps %xmm9, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 4 * SIZE(CO1) + movhps %xmm1, 6 * SIZE(CO1) + + movlps %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + movlps %xmm3, 4 * SIZE(CO2) + movhps %xmm3, 6 * SIZE(CO2) + + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 + decq I + BRANCH + jg .L71 + ALIGN_4 + +.L80: + testq $2, M + BRANCH + jle .L90 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd -32 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L85 + ALIGN_3 + +.L82: + addps %xmm1, %xmm8 + movsd -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -30 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -28 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -26 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -24 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L82 + ALIGN_3 + +.L85: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_3 + +.L86: + addps %xmm1, %xmm8 + movsd -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L86 + ALIGN_3 + +.L88: + addps %xmm1, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm1 + movhps 2 * SIZE(CO2), %xmm1 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + + addps %xmm4, %xmm0 + addps %xmm8, %xmm1 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 2 * SIZE(CO2) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L90: + testq $1, M + BRANCH + jle .L99 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L95 + ALIGN_3 + +.L92: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movsd -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movsd -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movsd -26 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L92 + addps %xmm9, %xmm8 + ALIGN_3 + +.L95: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_3 + +.L96: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movsd -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L96 + ALIGN_3 + +.L98: + addps %xmm2, %xmm8 + + movsd (CO1), %xmm0 + movhps (CO2), %xmm0 + + pshufd $0x50, %xmm8, %xmm4 + + mulps %xmm7, %xmm4 + addps %xmm4, %xmm0 + + movlps %xmm0, (CO1) + movhps %xmm0, (CO2) + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + movq BO, B + + leaq (C, LDC, 2), C + ALIGN_4 + +.L100: + testq $1, N + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + movq A, AO + + movq M, I + sarq $2, I + NOBRANCH + jle .L110 + ALIGN_4 + +.L101: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm3 + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L105 + ALIGN_3 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -31 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -29 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -28 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $ -4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L102 + ALIGN_3 + +.L105: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L108 + ALIGN_3 + +.L106: + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -31 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L106 + ALIGN_3 + +.L108: + addps %xmm1, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + addps %xmm4, %xmm0 + addps %xmm8, %xmm1 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 4 * SIZE(CO1) + movhps %xmm1, 6 * SIZE(CO1) + + addq $8 * SIZE, CO1 + decq I + BRANCH + jg .L101 + ALIGN_4 + +.L110: + testq $2, M + BRANCH + jle .L120 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L115 + ALIGN_3 + +.L112: + addps %xmm1, %xmm8 + movss -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -31 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -30 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -29 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -24 * SIZE(AO), %xmm0 + + subq $-4 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L112 + ALIGN_3 + +.L115: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_3 + +.L116: + addps %xmm1, %xmm8 + movss -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L116 + ALIGN_3 + +.L118: + addps %xmm1, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + + pshufd $0x50, %xmm8, %xmm4 + + mulps %xmm7, %xmm4 + addps %xmm4, %xmm0 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + + addq $4 * SIZE, CO1 + ALIGN_4 + +.L120: + testq $1, M + BRANCH + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + + xorps %xmm2, %xmm2 + movss -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L125 + ALIGN_3 + +.L122: + addss %xmm2, %xmm8 + movss -32 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -31 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -31 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -30 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -30 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -29 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -29 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -28 * SIZE(AO), %xmm0 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L122 + ALIGN_3 + +.L125: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L128 + ALIGN_3 + +.L126: + addss %xmm2, %xmm8 + movss -32 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -31 * SIZE(AO), %xmm0 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L126 + ALIGN_3 + +.L128: + addps %xmm2, %xmm8 + + movsd (CO1), %xmm0 + + pshufd $0x50, %xmm8, %xmm4 + + mulps %xmm7, %xmm4 + addps %xmm4, %xmm0 + + movlps %xmm0, (CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm3m_kernel_8x4_barcelona.S b/kernel/x86_64/zgemm3m_kernel_8x4_barcelona.S new file mode 100644 index 0000000..80c8524 --- /dev/null +++ b/kernel/x86_64/zgemm3m_kernel_8x4_barcelona.S @@ -0,0 +1,3253 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %r12 +#define BB %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH prefetch +#define PREFETCHSIZE (16 * 17 + 0) + +#define RPREFETCHSIZE (16 * 9 + 0) +#define WPREFETCHSIZE (16 * 9 + 0) + +#define KERNEL1(xx) \ + mulps %xmm1, %xmm0 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm8 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO, %rax, 4) ;\ + movaps %xmm2, %xmm0 ;\ + addps %xmm1, %xmm12 ;\ + movaps -24 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm0, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps -20 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm0 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm10 ;\ + movaps -24 * SIZE(AO, %rax, 4), %xmm0 ;\ + addps %xmm1, %xmm14 ;\ + movaps -16 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps -12 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm0, %xmm2 + +#define KERNEL2(xx) \ + mulps %xmm1, %xmm0 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm8 ;\ + movaps %xmm2, %xmm0 ;\ + addps %xmm1, %xmm12 ;\ + movaps -8 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm0, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps -4 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm0 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm10 ;\ + addps %xmm1, %xmm14 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 4 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm4, %xmm2 + +#define KERNEL3(xx) \ + mulps %xmm5, %xmm4 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm8 ;\ + movaps 32 * SIZE(BO, %rax, 8), %xmm1 ;\ + movaps %xmm2, %xmm4 ;\ + addps %xmm5, %xmm12 ;\ + movaps 8 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm4, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 12 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm4 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm10 ;\ + movaps -8 * SIZE(AO, %rax, 4), %xmm4 ;\ + addps %xmm5, %xmm14 ;\ + movaps 16 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 20 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm4, %xmm2 + +#define KERNEL4(xx) \ + mulps %xmm5, %xmm4 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ + movaps (AO, %rax, 4), %xmm6 ;\ + addps %xmm4, %xmm8 ;\ + movaps %xmm2, %xmm4 ;\ + addps %xmm5, %xmm12 ;\ + movaps 24 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm4, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 28 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm4 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm10 ;\ + addps %xmm5, %xmm14 ;\ + movaps 64 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 36 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm6, %xmm2 + +#define KERNEL5(xx) \ + mulps %xmm1, %xmm6 ;\ + mulps 4 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm6, %xmm8 ;\ + movaps %xmm2, %xmm6 ;\ + addps %xmm1, %xmm12 ;\ + movaps 40 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps 16 * SIZE(AO, %rax, 4), %xmm7 ;\ + movaps %xmm6, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 44 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm6 ;\ + mulps 4 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm6, %xmm10 ;\ + movaps 8 * SIZE(AO, %rax, 4), %xmm6 ;\ + addps %xmm1, %xmm14 ;\ + movaps 48 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 52 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm6, %xmm2 + +#define KERNEL6(xx) \ + mulps %xmm1, %xmm6 ;\ + mulps 12 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm6, %xmm8 ;\ + movaps %xmm2, %xmm6 ;\ + addps %xmm1, %xmm12 ;\ + movaps 56 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm6, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 60 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm6 ;\ + mulps 12 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm6, %xmm10 ;\ + movaps 32 * SIZE(AO, %rax, 4), %xmm0 ;\ + addps %xmm1, %xmm14 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 68 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm7, %xmm2 + +#define KERNEL7(xx) \ + mulps %xmm5, %xmm7 ;\ + mulps 20 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm7, %xmm8 ;\ + movaps 96 * SIZE(BO, %rax, 8), %xmm1 ;\ + movaps %xmm2, %xmm7 ;\ + addps %xmm5, %xmm12 ;\ + movaps 72 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm7, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 76 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm7 ;\ + mulps 20 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm7, %xmm10 ;\ + movaps 24 * SIZE(AO, %rax, 4), %xmm7 ;\ + addps %xmm5, %xmm14 ;\ + movaps 80 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 84 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm7, %xmm2 + +#define KERNEL8(xx) \ + mulps %xmm5, %xmm7 ;\ + mulps 28 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm7, %xmm8 ;\ + movaps %xmm2, %xmm7 ;\ + addps %xmm5, %xmm12 ;\ + movaps 88 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm7, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 92 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm7 ;\ + mulps 28 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm7, %xmm10 ;\ + movaps 48 * SIZE(AO, %rax, 4), %xmm4 ;\ + addps %xmm5, %xmm14 ;\ + movaps 128 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 100 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm0, %xmm2 ;\ + addq $16 * SIZE, %rax + +#define KERNEL_SUB1(xx) \ + mulps %xmm1, %xmm0 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm8 ;\ + movaps %xmm2, %xmm0 ;\ + addps %xmm1, %xmm12 ;\ + movaps -24 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm0, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps -20 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm0 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm10 ;\ + movaps -24 * SIZE(AO, %rax, 4), %xmm0 ;\ + addps %xmm1, %xmm14 ;\ + movaps -16 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps -12 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm0, %xmm2 + +#define KERNEL_SUB2(xx) \ + mulps %xmm1, %xmm0 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm8 ;\ + movaps %xmm2, %xmm0 ;\ + addps %xmm1, %xmm12 ;\ + movaps -8 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm0, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps -4 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm0 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm10 ;\ + movaps (AO, %rax, 4), %xmm0 ;\ + addps %xmm1, %xmm14 ;\ + movaps 32 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 4 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm4, %xmm2 + +#define KERNEL_SUB3(xx) \ + mulps %xmm5, %xmm4 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm8 ;\ + movaps %xmm2, %xmm4 ;\ + addps %xmm5, %xmm12 ;\ + movaps 8 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm4, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 12 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm4 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm10 ;\ + movaps -8 * SIZE(AO, %rax, 4), %xmm4 ;\ + addps %xmm5, %xmm14 ;\ + movaps 16 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 20 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm4, %xmm2 + +#define KERNEL_SUB4(xx) \ + mulps %xmm5, %xmm4 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm8 ;\ + movaps %xmm2, %xmm4 ;\ + addps %xmm5, %xmm12 ;\ + movaps 24 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm4, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 28 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm4 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm10 ;\ + addps %xmm5, %xmm14 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 36 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm0, %xmm2 + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + movss OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-1024, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + movss %xmm0, 0 + ALPHA + movss %xmm1, 4 + ALPHA + movss %xmm0, 8 + ALPHA + movss %xmm1, 12 + ALPHA + +#ifdef TRMMKERNEL + movsd %xmm4, OFFSET + movsd %xmm4, KK +#ifndef LEFT + negq KK +#endif +#endif + + subq $-32 * SIZE, A + + salq $ZBASE_SHIFT, LDC + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L50 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + prefetch (RPREFETCHSIZE + 0) * SIZE(B) + + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + movaps 8 * SIZE(B), %xmm11 + movaps 12 * SIZE(B), %xmm15 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + prefetchw (WPREFETCHSIZE + 16) * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + prefetchw (WPREFETCHSIZE + 32) * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm0 + pshufd $0x55, %xmm11, %xmm1 + pshufd $0xaa, %xmm11, %xmm2 + pshufd $0xff, %xmm11, %xmm3 + + prefetchw (WPREFETCHSIZE + 48) * SIZE(BO) + + pshufd $0x00, %xmm15, %xmm4 + pshufd $0x55, %xmm15, %xmm5 + pshufd $0xaa, %xmm15, %xmm6 + pshufd $0xff, %xmm15, %xmm7 + + movaps %xmm0, 32 * SIZE(BO) + movaps %xmm1, 36 * SIZE(BO) + movaps %xmm2, 40 * SIZE(BO) + movaps %xmm3, 44 * SIZE(BO) + movaps %xmm4, 48 * SIZE(BO) + movaps %xmm5, 52 * SIZE(BO) + movaps %xmm6, 56 * SIZE(BO) + movaps %xmm7, 60 * SIZE(BO) + + addq $16 * SIZE, B + addq $64 * SIZE, BO + + decq %rax + jne .L02 + ALIGN_4 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + + leaq (RPREFETCHSIZE + 0) * SIZE(B), BB + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + prefetch 0 * SIZE(BB) + prefetch 16 * SIZE(BB) + subq $-32 * SIZE, BB + + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm1 + pxor %xmm8, %xmm8 + movaps -28 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movaps -16 * SIZE(AO), %xmm4 + pxor %xmm10, %xmm10 + movaps 0 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + + prefetchw 7 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetchw 7 * SIZE(CO2) + pxor %xmm13, %xmm13 + prefetchw 7 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + prefetchw 7 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 + + movaps %xmm0, %xmm2 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + andq $-8, %rax + + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_3 + +.L12: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + BRANCH + jl .L12 + ALIGN_4 + +.L15: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + testq $4, %rax + je .L16 + xorq %rax, %rax + ALIGN_3 + + KERNEL_SUB1(32 * 0) + KERNEL_SUB2(32 * 0) + KERNEL_SUB3(32 * 0) + KERNEL_SUB4(32 * 0) + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + ALIGN_3 + +.L16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L18 + + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + ALIGN_4 + +.L17: + mulps %xmm1, %xmm0 + mulps -28 * SIZE(AO, %rax, 4), %xmm1 + addps %xmm0, %xmm8 + movaps %xmm2, %xmm0 + addps %xmm1, %xmm12 + movaps -24 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm3, %xmm2 + mulps -28 * SIZE(AO, %rax, 4), %xmm3 + addps %xmm2, %xmm9 + movaps %xmm0, %xmm2 + addps %xmm3, %xmm13 + movaps -20 * SIZE(BO, %rax, 8), %xmm3 + mulps %xmm1, %xmm0 + mulps -28 * SIZE(AO, %rax, 4), %xmm1 + addps %xmm0, %xmm10 + movaps -24 * SIZE(AO, %rax, 4), %xmm0 + addps %xmm1, %xmm14 + movaps -16 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm3, %xmm2 + mulps -28 * SIZE(AO, %rax, 4), %xmm3 + addps %xmm2, %xmm11 + addps %xmm3, %xmm15 + movaps -12 * SIZE(BO, %rax, 8), %xmm3 + movaps %xmm0, %xmm2 + + addq $SIZE * 2, %rax + jl .L17 + ALIGN_4 + +.L18: + movups 0 * SIZE(CO1), %xmm0 + movups 4 * SIZE(CO1), %xmm1 + movups 8 * SIZE(CO1), %xmm2 + movups 12 * SIZE(CO1), %xmm3 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + pshufd $0x50, %xmm12, %xmm5 + pshufd $0xfa, %xmm12, %xmm12 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm12 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + addps %xmm2, %xmm5 + addps %xmm3, %xmm12 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 4 * SIZE(CO1) + movhps %xmm8, 6 * SIZE(CO1) + movlps %xmm5, 8 * SIZE(CO1) + movhps %xmm5, 10 * SIZE(CO1) + movlps %xmm12, 12 * SIZE(CO1) + movhps %xmm12, 14 * SIZE(CO1) + + movups 0 * SIZE(CO2), %xmm0 + movups 4 * SIZE(CO2), %xmm1 + movups 8 * SIZE(CO2), %xmm2 + movups 12 * SIZE(CO2), %xmm3 + + pshufd $0x50, %xmm9, %xmm4 + pshufd $0xfa, %xmm9, %xmm9 + pshufd $0x50, %xmm13, %xmm5 + pshufd $0xfa, %xmm13, %xmm13 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm13 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm9 + addps %xmm2, %xmm5 + addps %xmm3, %xmm13 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movlps %xmm9, 4 * SIZE(CO2) + movhps %xmm9, 6 * SIZE(CO2) + movlps %xmm5, 8 * SIZE(CO2) + movhps %xmm5, 10 * SIZE(CO2) + movlps %xmm13, 12 * SIZE(CO2) + movhps %xmm13, 14 * SIZE(CO2) + + movups 0 * SIZE(CO1, LDC, 2), %xmm0 + movups 4 * SIZE(CO1, LDC, 2), %xmm1 + movups 8 * SIZE(CO1, LDC, 2), %xmm2 + movups 12 * SIZE(CO1, LDC, 2), %xmm3 + + pshufd $0x50, %xmm10, %xmm4 + pshufd $0xfa, %xmm10, %xmm10 + pshufd $0x50, %xmm14, %xmm5 + pshufd $0xfa, %xmm14, %xmm14 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm14 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm10 + addps %xmm2, %xmm5 + addps %xmm3, %xmm14 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + movhps %xmm4, 2 * SIZE(CO1, LDC, 2) + movlps %xmm10, 4 * SIZE(CO1, LDC, 2) + movhps %xmm10, 6 * SIZE(CO1, LDC, 2) + movlps %xmm5, 8 * SIZE(CO1, LDC, 2) + movhps %xmm5, 10 * SIZE(CO1, LDC, 2) + movlps %xmm14, 12 * SIZE(CO1, LDC, 2) + movhps %xmm14, 14 * SIZE(CO1, LDC, 2) + + movups 0 * SIZE(CO2, LDC, 2), %xmm0 + movups 4 * SIZE(CO2, LDC, 2), %xmm1 + movups 8 * SIZE(CO2, LDC, 2), %xmm2 + movups 12 * SIZE(CO2, LDC, 2), %xmm3 + + pshufd $0x50, %xmm11, %xmm4 + pshufd $0xfa, %xmm11, %xmm11 + pshufd $0x50, %xmm15, %xmm5 + pshufd $0xfa, %xmm15, %xmm15 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm11 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm15 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm11 + addps %xmm2, %xmm5 + addps %xmm3, %xmm15 + + movlps %xmm4, 0 * SIZE(CO2, LDC, 2) + movhps %xmm4, 2 * SIZE(CO2, LDC, 2) + movlps %xmm11, 4 * SIZE(CO2, LDC, 2) + movhps %xmm11, 6 * SIZE(CO2, LDC, 2) + movlps %xmm5, 8 * SIZE(CO2, LDC, 2) + movhps %xmm5, 10 * SIZE(CO2, LDC, 2) + movlps %xmm15, 12 * SIZE(CO2, LDC, 2) + movhps %xmm15, 14 * SIZE(CO2, LDC, 2) + + addq $16 * SIZE, CO1 # coffset += 4 + addq $16 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $4, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps -28 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps -24 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + mulps 44 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps -20 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + mulps 60 * SIZE(BO), %xmm8 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm8, %xmm3 + movaps 0 * SIZE(AO), %xmm8 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 76 * SIZE(BO), %xmm10 + addps %xmm9, %xmm2 + movaps 128 * SIZE(BO), %xmm9 + addps %xmm10, %xmm3 + movaps -12 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + mulps 92 * SIZE(BO), %xmm10 + addps %xmm11, %xmm2 + movaps 144 * SIZE(BO), %xmm11 + addps %xmm10, %xmm3 + movaps -8 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + mulps 108 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 160 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps -4 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + mulps 124 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 176 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 16 * SIZE(AO), %xmm10 + + addq $ 32 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps -28 * SIZE(AO), %xmm8 + + addq $ 4 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L28: + movups 0 * SIZE(CO1), %xmm8 + movups 4 * SIZE(CO1), %xmm9 + + pshufd $0x50, %xmm0, %xmm4 + pshufd $0xfa, %xmm0, %xmm0 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm0 + + addps %xmm8, %xmm4 + addps %xmm9, %xmm0 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + + movups 0 * SIZE(CO2), %xmm8 + movups 4 * SIZE(CO2), %xmm9 + + pshufd $0x50, %xmm1, %xmm4 + pshufd $0xfa, %xmm1, %xmm1 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm1 + + addps %xmm8, %xmm4 + addps %xmm9, %xmm1 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movlps %xmm1, 4 * SIZE(CO2) + movhps %xmm1, 6 * SIZE(CO2) + + movups 0 * SIZE(CO1, LDC, 2), %xmm8 + movups 4 * SIZE(CO1, LDC, 2), %xmm9 + + pshufd $0x50, %xmm2, %xmm4 + pshufd $0xfa, %xmm2, %xmm2 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm2 + + addps %xmm8, %xmm4 + addps %xmm9, %xmm2 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + movhps %xmm4, 2 * SIZE(CO1, LDC, 2) + movlps %xmm2, 4 * SIZE(CO1, LDC, 2) + movhps %xmm2, 6 * SIZE(CO1, LDC, 2) + + movups 0 * SIZE(CO2, LDC, 2), %xmm8 + movups 4 * SIZE(CO2, LDC, 2), %xmm9 + + pshufd $0x50, %xmm3, %xmm4 + pshufd $0xfa, %xmm3, %xmm3 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm3 + + addps %xmm8, %xmm4 + addps %xmm9, %xmm3 + + movlps %xmm4, 0 * SIZE(CO2, LDC, 2) + movhps %xmm4, 2 * SIZE(CO2, LDC, 2) + movlps %xmm3, 4 * SIZE(CO2, LDC, 2) + movhps %xmm3, 6 * SIZE(CO2, LDC, 2) + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $2, M + je .L40 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -24 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movsd 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movsd 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsd 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movsd 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movsd 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movsd 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd -28 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movsd 80 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movsd 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movsd 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm2 + movsd 44 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + movsd -26 * SIZE(AO), %xmm8 + addps %xmm13, %xmm3 + movsd 96 * SIZE(BO), %xmm13 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movsd 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movsd 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm2 + movsd 60 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + movsd -16 * SIZE(AO), %xmm8 + addps %xmm15, %xmm3 + movsd 112 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movsd 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movsd 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm2 + movsd 76 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movsd -22 * SIZE(AO), %xmm10 + addps %xmm9, %xmm3 + movsd 128 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movsd 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movsd 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movsd 92 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movsd -20 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movsd 144 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movsd 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movsd 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movsd 108 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd -18 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movsd 160 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movsd 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movsd 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movsd 124 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd -8 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movsd 176 * SIZE(BO), %xmm15 + + addq $ 16 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movsd 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsd 16 * SIZE(BO), %xmm9 + + addq $ 2 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: + movups 0 * SIZE(CO1), %xmm8 + + pshufd $0x50, %xmm0, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + + movups 0 * SIZE(CO2), %xmm8 + + pshufd $0x50, %xmm1, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + + movups 0 * SIZE(CO1, LDC, 2), %xmm8 + + pshufd $0x50, %xmm2, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + movhps %xmm4, 2 * SIZE(CO1, LDC, 2) + + movups 0 * SIZE(CO2, LDC, 2), %xmm8 + + pshufd $0x50, %xmm3, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO2, LDC, 2) + movhps %xmm4, 2 * SIZE(CO2, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L40: + testq $1, M + je .L49 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO + leaq (BO, %rax, 8), BO +#endif + + movss -32 * SIZE(AO), %xmm8 + movss -28 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + movss 32 * SIZE(BO), %xmm13 + movss 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L45 + ALIGN_4 + +.L42: + mulss %xmm8, %xmm9 + addss %xmm9, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movss 4 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + addss %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + addss %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addss %xmm9, %xmm3 + movss 64 * SIZE(BO), %xmm9 + + mulss %xmm8, %xmm11 + addss %xmm11, %xmm0 + movss 20 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + addss %xmm11, %xmm1 + movss 24 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + addss %xmm11, %xmm2 + movss 28 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + movss -30 * SIZE(AO), %xmm8 + addss %xmm11, %xmm3 + movss 80 * SIZE(BO), %xmm11 + + mulss %xmm8, %xmm13 + addss %xmm13, %xmm0 + movss 36 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + addss %xmm13, %xmm1 + movss 40 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + addss %xmm13, %xmm2 + movss 44 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + movss -29 * SIZE(AO), %xmm8 + addss %xmm13, %xmm3 + movss 96 * SIZE(BO), %xmm13 + + mulss %xmm8, %xmm15 + addss %xmm15, %xmm0 + movss 52 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + addss %xmm15, %xmm1 + movss 56 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + addss %xmm15, %xmm2 + movss 60 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + movss -24 * SIZE(AO), %xmm8 + addss %xmm15, %xmm3 + movss 112 * SIZE(BO), %xmm15 + + mulss %xmm10, %xmm9 + addss %xmm9, %xmm0 + movss 68 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + addss %xmm9, %xmm1 + movss 72 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + addss %xmm9, %xmm2 + movss 76 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + movss -27 * SIZE(AO), %xmm10 + addss %xmm9, %xmm3 + movss 128 * SIZE(BO), %xmm9 + + mulss %xmm10, %xmm11 + addss %xmm11, %xmm0 + movss 84 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + addss %xmm11, %xmm1 + movss 88 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + addss %xmm11, %xmm2 + movss 92 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + movss -26 * SIZE(AO), %xmm10 + addss %xmm11, %xmm3 + movss 144 * SIZE(BO), %xmm11 + + mulss %xmm10, %xmm13 + addss %xmm13, %xmm0 + movss 100 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + addss %xmm13, %xmm1 + movss 104 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + addss %xmm13, %xmm2 + movss 108 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + movss -25 * SIZE(AO), %xmm10 + addss %xmm13, %xmm3 + movss 160 * SIZE(BO), %xmm13 + + mulss %xmm10, %xmm15 + addss %xmm15, %xmm0 + movss 116 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + addss %xmm15, %xmm1 + movss 120 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + addss %xmm15, %xmm2 + movss 124 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + movss -20 * SIZE(AO), %xmm10 + addss %xmm15, %xmm3 + movss 176 * SIZE(BO), %xmm15 + + addq $ 8 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_4 + +.L46: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movss 16 * SIZE(BO), %xmm9 + + addq $ 1 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L46 + ALIGN_4 + +.L48: + movsd 0 * SIZE(CO1), %xmm8 + + pshufd $0x50, %xmm0, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + + pshufd $0x50, %xmm1, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm8 + + pshufd $0x50, %xmm2, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm8 + + pshufd $0x50, %xmm3, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO2, LDC, 2) + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + leaq (C, LDC, 4), C # c += 4 * ldc + decq J # j -- + jg .L01 + +.L50: + testq $2, N + je .L100 + +.L51: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L53 + ALIGN_4 + +.L52: + prefetch (RPREFETCHSIZE + 0) * SIZE(B) + + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + prefetchw (WPREFETCHSIZE + 16) * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L52 + ALIGN_4 + +.L53: + movq K, %rax + andq $3, %rax + BRANCH + jle .L60 + ALIGN_4 + +.L54: + movsd 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + addq $ 2 * SIZE, B + addq $ 8 * SIZE, BO + decq %rax + jne .L54 + ALIGN_4 + +.L60: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L70 + ALIGN_4 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + movaps 0 * SIZE(AO), %xmm12 + movaps 16 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + prefetchw 15 * SIZE(CO1) + pxor %xmm4, %xmm4 + prefetchw 15 * SIZE(CO2) + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps -24 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -20 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 32 * SIZE(AO), %xmm8 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 16 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps -12 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps -8 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps -4 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 48 * SIZE(AO), %xmm10 + + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 32 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 4 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 8 * SIZE(AO), %xmm12 + + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 12 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 64 * SIZE(AO), %xmm12 + + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) + + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 48 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 20 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 24 * SIZE(AO), %xmm14 + + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 28 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 80 * SIZE(AO), %xmm14 + + addq $64 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps -24 * SIZE(AO), %xmm8 + + addq $8 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L68: + movups 0 * SIZE(CO1), %xmm8 + movups 4 * SIZE(CO1), %xmm9 + movups 8 * SIZE(CO1), %xmm10 + movups 12 * SIZE(CO1), %xmm11 + + pshufd $0x50, %xmm0, %xmm2 + pshufd $0xfa, %xmm0, %xmm0 + pshufd $0x50, %xmm4, %xmm3 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm7, %xmm2 + mulps %xmm7, %xmm0 + mulps %xmm7, %xmm3 + mulps %xmm7, %xmm4 + + addps %xmm8, %xmm2 + addps %xmm9, %xmm0 + addps %xmm10, %xmm3 + addps %xmm11, %xmm4 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + movlps %xmm3, 8 * SIZE(CO1) + movhps %xmm3, 10 * SIZE(CO1) + movlps %xmm4, 12 * SIZE(CO1) + movhps %xmm4, 14 * SIZE(CO1) + + movups 0 * SIZE(CO2), %xmm8 + movups 4 * SIZE(CO2), %xmm9 + movups 8 * SIZE(CO2), %xmm10 + movups 12 * SIZE(CO2), %xmm11 + + pshufd $0x50, %xmm1, %xmm2 + pshufd $0xfa, %xmm1, %xmm1 + pshufd $0x50, %xmm5, %xmm3 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm7, %xmm2 + mulps %xmm7, %xmm1 + mulps %xmm7, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm8, %xmm2 + addps %xmm9, %xmm1 + addps %xmm10, %xmm3 + addps %xmm11, %xmm5 + + movlps %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + movlps %xmm1, 4 * SIZE(CO2) + movhps %xmm1, 6 * SIZE(CO2) + movlps %xmm3, 8 * SIZE(CO2) + movhps %xmm3, 10 * SIZE(CO2) + movlps %xmm5, 12 * SIZE(CO2) + movhps %xmm5, 14 * SIZE(CO2) + + addq $16 * SIZE, CO1 # coffset += 4 + addq $16 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L61 + ALIGN_4 + +.L70: + testq $4, M + je .L80 + + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps -24 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 20 * SIZE(BO), %xmm8 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm8, %xmm1 + movaps -20 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 0 * SIZE(AO), %xmm8 + + mulps %xmm10, %xmm13 + mulps 36 * SIZE(BO), %xmm10 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm10, %xmm1 + movaps -12 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + mulps 44 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps -8 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 52 * SIZE(BO), %xmm10 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm10, %xmm1 + movaps -4 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 60 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 16 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + movups 0 * SIZE(CO1), %xmm8 + movups 4 * SIZE(CO1), %xmm9 + + pshufd $0x50, %xmm0, %xmm2 + pshufd $0xfa, %xmm0, %xmm0 + + mulps %xmm7, %xmm2 + mulps %xmm7, %xmm0 + addps %xmm8, %xmm2 + addps %xmm9, %xmm0 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + + movups 0 * SIZE(CO2), %xmm8 + movups 4 * SIZE(CO2), %xmm9 + + pshufd $0x50, %xmm1, %xmm2 + pshufd $0xfa, %xmm1, %xmm1 + + mulps %xmm7, %xmm2 + mulps %xmm7, %xmm1 + addps %xmm8, %xmm2 + addps %xmm9, %xmm1 + + movlps %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + movlps %xmm1, 4 * SIZE(CO2) + movhps %xmm1, 6 * SIZE(CO2) + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L80: + testq $2, M + je .L90 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -24 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L85 + ALIGN_4 + +.L82: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movsd 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movsd 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -28 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsd 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movsd 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd -26 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movsd 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movsd 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd -16 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movsd 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movsd 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd -22 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movsd 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movsd 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd -20 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movsd 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movsd 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd -18 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movsd 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movsd 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd -8 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movsd 112 * SIZE(BO), %xmm15 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L82 + ALIGN_4 + +.L85: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_4 + +.L86: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L86 + ALIGN_4 + +.L88: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + movups 0 * SIZE(CO1), %xmm8 + + pshufd $0x50, %xmm0, %xmm2 + + mulps %xmm7, %xmm2 + addps %xmm8, %xmm2 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + + movups 0 * SIZE(CO2), %xmm8 + + pshufd $0x50, %xmm1, %xmm2 + + mulps %xmm7, %xmm2 + addps %xmm8, %xmm2 + + movlps %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L90: + testq $1, M + je .L99 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movss -32 * SIZE(AO), %xmm8 + movss -28 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + movss 32 * SIZE(BO), %xmm13 + movss 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movss 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movss 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movss -29 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movss 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movss 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movss -24 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movss 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movss 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movss -27 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movss 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movss 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movss -26 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movss 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movss 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movss -25 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movss 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movss 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movss -20 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movss 112 * SIZE(BO), %xmm15 + + addq $ 8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L98: + addss %xmm2, %xmm0 + addss %xmm3, %xmm1 + + movsd 0 * SIZE(CO1), %xmm8 + + pshufd $0x50, %xmm0, %xmm2 + + mulps %xmm7, %xmm2 + addps %xmm8, %xmm2 + + movlps %xmm2, 0 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + + pshufd $0x50, %xmm1, %xmm2 + + mulps %xmm7, %xmm2 + addps %xmm8, %xmm2 + + movlps %xmm2, 0 * SIZE(CO2) + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leaq (C, LDC, 2), C # c += 4 * ldc + ALIGN_4 + + +.L100: + testq $1, N + je .L999 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $3, %rax + jle .L103 + ALIGN_4 + + +.L102: + prefetch (RPREFETCHSIZE + 0) * SIZE(B) + + movups 0 * SIZE(B), %xmm3 + movups 4 * SIZE(B), %xmm7 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + prefetchw (WPREFETCHSIZE + 16) * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L102 + ALIGN_4 + +.L103: + movq K, %rax + andq $7, %rax + BRANCH + jle .L110 + ALIGN_4 + +.L104: + movss 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + + movaps %xmm0, 0 * SIZE(BO) + + addq $ 1 * SIZE, B + addq $ 4 * SIZE, BO + decq %rax + jne .L104 + ALIGN_4 + +.L110: + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L120 + ALIGN_4 + +.L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + movaps 0 * SIZE(AO), %xmm12 + movaps 16 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + prefetchw 15 * SIZE(CO1) + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulps %xmm9, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps -28 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps -24 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + + mulps %xmm9, %xmm8 + mulps -20 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps 32 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + mulps %xmm9, %xmm10 + mulps -12 * SIZE(AO), %xmm9 + addps %xmm10, %xmm0 + movaps -8 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movaps 12 * SIZE(BO), %xmm9 + + mulps %xmm9, %xmm10 + mulps -4 * SIZE(AO), %xmm9 + addps %xmm10, %xmm0 + movaps 48 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movaps 32 * SIZE(BO), %xmm9 + + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) + mulps %xmm11, %xmm12 + mulps 4 * SIZE(AO), %xmm11 + addps %xmm12, %xmm0 + movaps 8 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movaps 20 * SIZE(BO), %xmm11 + + mulps %xmm11, %xmm12 + mulps 12 * SIZE(AO), %xmm11 + addps %xmm12, %xmm0 + movaps 64 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) + mulps %xmm11, %xmm14 + mulps 20 * SIZE(AO), %xmm11 + addps %xmm14, %xmm0 + movaps 24 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movaps 28 * SIZE(BO), %xmm11 + + mulps %xmm11, %xmm14 + mulps 28 * SIZE(AO), %xmm11 + addps %xmm14, %xmm0 + movaps 80 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movaps 48 * SIZE(BO), %xmm11 + + addq $64 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulps %xmm9, %xmm8 + mulps -28 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps -24 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + + addq $8 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: + movups 0 * SIZE(CO1), %xmm8 + movups 4 * SIZE(CO1), %xmm9 + movups 8 * SIZE(CO1), %xmm10 + movups 12 * SIZE(CO1), %xmm11 + + pshufd $0x50, %xmm0, %xmm2 + pshufd $0xfa, %xmm0, %xmm0 + pshufd $0x50, %xmm4, %xmm3 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm7, %xmm2 + mulps %xmm7, %xmm0 + mulps %xmm7, %xmm3 + mulps %xmm7, %xmm4 + + addps %xmm8, %xmm2 + addps %xmm9, %xmm0 + addps %xmm10, %xmm3 + addps %xmm11, %xmm4 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + movlps %xmm3, 8 * SIZE(CO1) + movhps %xmm3, 10 * SIZE(CO1) + movlps %xmm4, 12 * SIZE(CO1) + movhps %xmm4, 14 * SIZE(CO1) + + addq $16 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L111 + ALIGN_4 + +.L120: + testq $4, M + je .L130 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L125 + ALIGN_4 + +.L122: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps -28 * SIZE(AO), %xmm8 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 32 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -24 * SIZE(AO), %xmm8 + mulps 8 * SIZE(BO), %xmm8 + addps %xmm8, %xmm2 + movaps -20 * SIZE(AO), %xmm8 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm8, %xmm3 + movaps 0 * SIZE(AO), %xmm8 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + mulps %xmm10, %xmm11 + movaps -12 * SIZE(AO), %xmm10 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 48 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps -8 * SIZE(AO), %xmm10 + mulps 24 * SIZE(BO), %xmm10 + addps %xmm10, %xmm2 + movaps -4 * SIZE(AO), %xmm10 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm10, %xmm3 + movaps 16 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L122 + ALIGN_4 + +.L125: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L128 + ALIGN_4 + +.L126: + mulps %xmm8, %xmm9 + movaps -28 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L126 + ALIGN_4 + +.L128: + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movups 0 * SIZE(CO1), %xmm8 + movups 4 * SIZE(CO1), %xmm9 + + pshufd $0x50, %xmm0, %xmm2 + pshufd $0xfa, %xmm0, %xmm0 + + mulps %xmm7, %xmm2 + mulps %xmm7, %xmm0 + addps %xmm8, %xmm2 + addps %xmm9, %xmm0 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L130: + testq $2, M + je .L140 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -24 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L135 + ALIGN_4 + +.L132: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -28 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + movsd -26 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 12 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + movsd -16 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 32 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + movsd -22 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 20 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd -20 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 24 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd -18 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 28 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd -8 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 48 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L132 + ALIGN_4 + +.L135: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L138 + ALIGN_4 + +.L136: + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L136 + ALIGN_4 + +.L138: + addps %xmm1, %xmm0 + + movups 0 * SIZE(CO1), %xmm8 + + pshufd $0x50, %xmm0, %xmm2 + mulps %xmm7, %xmm2 + addps %xmm8, %xmm2 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L140: + testq $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movss -32 * SIZE(AO), %xmm8 + movss -28 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L145 + ALIGN_4 + +.L142: + mulss %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movss -31 * SIZE(AO), %xmm8 + mulss 4 * SIZE(BO), %xmm8 + addss %xmm9, %xmm0 + movss 32 * SIZE(BO), %xmm9 + addss %xmm8, %xmm1 + movss -30 * SIZE(AO), %xmm8 + mulss 8 * SIZE(BO), %xmm8 + addss %xmm8, %xmm2 + movss -29 * SIZE(AO), %xmm8 + mulss 12 * SIZE(BO), %xmm8 + addss %xmm8, %xmm3 + movss -24 * SIZE(AO), %xmm8 + mulss %xmm10, %xmm11 + movss -27 * SIZE(AO), %xmm10 + mulss 20 * SIZE(BO), %xmm10 + addss %xmm11, %xmm0 + movss 48 * SIZE(BO), %xmm11 + addss %xmm10, %xmm1 + movss -26 * SIZE(AO), %xmm10 + mulss 24 * SIZE(BO), %xmm10 + addss %xmm10, %xmm2 + movss -25 * SIZE(AO), %xmm10 + mulss 28 * SIZE(BO), %xmm10 + addss %xmm10, %xmm3 + movss -20 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L142 + ALIGN_4 + +.L145: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L148 + ALIGN_4 + +.L146: + mulss %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addss %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L146 + ALIGN_4 + +.L148: + addss %xmm1, %xmm0 + addss %xmm3, %xmm2 + addss %xmm2, %xmm0 + + movsd 0 * SIZE(CO1), %xmm8 + + pshufd $0x50, %xmm0, %xmm2 + mulps %xmm7, %xmm2 + addps %xmm8, %xmm2 + + movlps %xmm2, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %rbx, %rsp + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm3m_kernel_8x4_core2.S b/kernel/x86_64/zgemm3m_kernel_8x4_core2.S new file mode 100644 index 0000000..2ddbb5c --- /dev/null +++ b/kernel/x86_64/zgemm3m_kernel_8x4_core2.S @@ -0,0 +1,2675 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH_R (16 * 16 + 0) +#define PREFETCH_W (PREFETCH_R * 2) + +#define PREFETCHSIZE (16 * 21 + 8) +#define PREFETCH prefetcht0 + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + movss OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif +#endif + + movq %rsp, %r15 # save old stack + subq $256 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movss %xmm0, 0 + ALPHA + movss %xmm1, 4 + ALPHA + movss %xmm0, 8 + ALPHA + movss %xmm1, 12 + ALPHA + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + +#ifdef TRMMKERNEL + movsd %xmm12, OFFSET + movsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq OLD_M, M + movq OLD_N, N + + salq $ZBASE_SHIFT, LDC + + movq N, J + sarq $2, J + jle .L50 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq 32 * SIZE + BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L05 + ALIGN_4 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + movaps -32 * SIZE(B), %xmm3 + movaps -28 * SIZE(B), %xmm7 + movaps -24 * SIZE(B), %xmm11 + movaps -20 * SIZE(B), %xmm15 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + prefetcht0 (PREFETCH_W + 16) * SIZE(BO) + pshufd $0x00, %xmm11, %xmm8 + pshufd $0x55, %xmm11, %xmm9 + pshufd $0xaa, %xmm11, %xmm10 + pshufd $0xff, %xmm11, %xmm11 + pshufd $0x00, %xmm15, %xmm12 + pshufd $0x55, %xmm15, %xmm13 + pshufd $0xaa, %xmm15, %xmm14 + pshufd $0xff, %xmm15, %xmm15 + + prefetcht0 (PREFETCH_W + 32) * SIZE(BO) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm1, -28 * SIZE(BO) + movaps %xmm2, -24 * SIZE(BO) + movaps %xmm3, -20 * SIZE(BO) + movaps %xmm4, -16 * SIZE(BO) + movaps %xmm5, -12 * SIZE(BO) + movaps %xmm6, -8 * SIZE(BO) + movaps %xmm7, -4 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 48) * SIZE(BO) + movaps %xmm8, 0 * SIZE(BO) + movaps %xmm9, 4 * SIZE(BO) + movaps %xmm10, 8 * SIZE(BO) + movaps %xmm11, 12 * SIZE(BO) + movaps %xmm12, 16 * SIZE(BO) + movaps %xmm13, 20 * SIZE(BO) + movaps %xmm14, 24 * SIZE(BO) + movaps %xmm15, 28 * SIZE(BO) + + subq $-16 * SIZE, B + subq $-64 * SIZE, BO + subq $1, %rax + jne .L02 + ALIGN_4 + +.L05: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L06: + movaps -32 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm1, -28 * SIZE(BO) + movaps %xmm2, -24 * SIZE(BO) + movaps %xmm3, -20 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + subq $1, %rax + jne .L06 + ALIGN_4 + +.L10: + movq B, BB + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $3, I + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 40 * SIZE + BUFFER, BO +#else + leaq 40 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + movaps -32 * SIZE(AO), %xmm0 + pxor %xmm9, %xmm9 + movaps -28 * SIZE(AO), %xmm1 + pxor %xmm10, %xmm10 + movaps -40 * SIZE(BO), %xmm6 + pxor %xmm11, %xmm11 + movaps -36 * SIZE(BO), %xmm7 + + prefetcht0 (PREFETCH_R + 0) * SIZE(BB) + + prefetcht0 15 * SIZE(CO1) + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + prefetcht0 15 * SIZE(CO2) + pxor %xmm14, %xmm14 + pxor %xmm15, %xmm15 + + prefetcht0 15 * SIZE(CO1, LDC, 2) + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + prefetcht0 15 * SIZE(CO2, LDC, 2) + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + + subq $-8 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L15 + ALIGN_4 + +.L12: + addps %xmm2, %xmm10 + movaps -32 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + PADDING; + movaps %xmm6, %xmm3 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + movaps -28 * SIZE(BO), %xmm4 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps -24 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + movaps -20 * SIZE(BO), %xmm7 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -20 * SIZE(AO), %xmm1 + + addps %xmm2, %xmm10 + movaps -16 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + movaps -12 * SIZE(BO), %xmm4 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps -8 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + movaps -4 * SIZE(BO), %xmm7 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -12 * SIZE(AO), %xmm1 + + addps %xmm2, %xmm10 + movaps 0 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + PADDING; + movaps %xmm6, %xmm3 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + movaps 4 * SIZE(BO), %xmm4 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps 8 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + movaps 12 * SIZE(BO), %xmm7 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -4 * SIZE(AO), %xmm1 + + addps %xmm2, %xmm10 + movaps 16 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + movaps 20 * SIZE(BO), %xmm4 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps 24 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + subq $-32 * SIZE, AO + + addps %xmm7, %xmm9 + movaps 28 * SIZE(BO), %xmm7 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -32 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -28 * SIZE(AO), %xmm1 + + subq $-64 * SIZE, BO + subq $1, %rax + BRANCH + jg .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + addps %xmm2, %xmm10 + movaps -32 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + movaps -28 * SIZE(BO), %xmm4 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps -24 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + addq $8 * SIZE, AO + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + movaps -20 * SIZE(BO), %xmm7 + addps %xmm5, %xmm13 + addq $16 * SIZE, BO + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -32 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -28 * SIZE(AO), %xmm1 + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L18: + movaps ALPHA, %xmm7 + + addps %xmm2, %xmm10 + addps %xmm3, %xmm14 + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + movsd 8 * SIZE(CO1), %xmm2 + movhps 10 * SIZE(CO1), %xmm2 + movsd 12 * SIZE(CO1), %xmm3 + movhps 14 * SIZE(CO1), %xmm3 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + pshufd $0x50, %xmm12, %xmm5 + pshufd $0xfa, %xmm12, %xmm12 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm12 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + addps %xmm2, %xmm5 + addps %xmm3, %xmm12 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 4 * SIZE(CO1) + movhps %xmm8, 6 * SIZE(CO1) + movlps %xmm5, 8 * SIZE(CO1) + movhps %xmm5, 10 * SIZE(CO1) + movlps %xmm12, 12 * SIZE(CO1) + movhps %xmm12, 14 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + movsd 4 * SIZE(CO2), %xmm1 + movhps 6 * SIZE(CO2), %xmm1 + movsd 8 * SIZE(CO2), %xmm2 + movhps 10 * SIZE(CO2), %xmm2 + movsd 12 * SIZE(CO2), %xmm3 + movhps 14 * SIZE(CO2), %xmm3 + + pshufd $0x50, %xmm9, %xmm4 + pshufd $0xfa, %xmm9, %xmm9 + pshufd $0x50, %xmm13, %xmm5 + pshufd $0xfa, %xmm13, %xmm13 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm13 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm9 + addps %xmm2, %xmm5 + addps %xmm3, %xmm13 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movlps %xmm9, 4 * SIZE(CO2) + movhps %xmm9, 6 * SIZE(CO2) + movlps %xmm5, 8 * SIZE(CO2) + movhps %xmm5, 10 * SIZE(CO2) + movlps %xmm13, 12 * SIZE(CO2) + movhps %xmm13, 14 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhps 2 * SIZE(CO1, LDC, 2), %xmm0 + movsd 4 * SIZE(CO1, LDC, 2), %xmm1 + movhps 6 * SIZE(CO1, LDC, 2), %xmm1 + movsd 8 * SIZE(CO1, LDC, 2), %xmm2 + movhps 10 * SIZE(CO1, LDC, 2), %xmm2 + movsd 12 * SIZE(CO1, LDC, 2), %xmm3 + movhps 14 * SIZE(CO1, LDC, 2), %xmm3 + + pshufd $0x50, %xmm10, %xmm4 + pshufd $0xfa, %xmm10, %xmm10 + pshufd $0x50, %xmm14, %xmm5 + pshufd $0xfa, %xmm14, %xmm14 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm14 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm10 + addps %xmm2, %xmm5 + addps %xmm3, %xmm14 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + movhps %xmm4, 2 * SIZE(CO1, LDC, 2) + movlps %xmm10, 4 * SIZE(CO1, LDC, 2) + movhps %xmm10, 6 * SIZE(CO1, LDC, 2) + movlps %xmm5, 8 * SIZE(CO1, LDC, 2) + movhps %xmm5, 10 * SIZE(CO1, LDC, 2) + movlps %xmm14, 12 * SIZE(CO1, LDC, 2) + movhps %xmm14, 14 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhps 2 * SIZE(CO2, LDC, 2), %xmm0 + movsd 4 * SIZE(CO2, LDC, 2), %xmm1 + movhps 6 * SIZE(CO2, LDC, 2), %xmm1 + movsd 8 * SIZE(CO2, LDC, 2), %xmm2 + movhps 10 * SIZE(CO2, LDC, 2), %xmm2 + movsd 12 * SIZE(CO2, LDC, 2), %xmm3 + movhps 14 * SIZE(CO2, LDC, 2), %xmm3 + + pshufd $0x50, %xmm11, %xmm4 + pshufd $0xfa, %xmm11, %xmm11 + pshufd $0x50, %xmm15, %xmm5 + pshufd $0xfa, %xmm15, %xmm15 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm11 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm15 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm11 + addps %xmm2, %xmm5 + addps %xmm3, %xmm15 + + movlps %xmm4, 0 * SIZE(CO2, LDC, 2) + movhps %xmm4, 2 * SIZE(CO2, LDC, 2) + movlps %xmm11, 4 * SIZE(CO2, LDC, 2) + movhps %xmm11, 6 * SIZE(CO2, LDC, 2) + movlps %xmm5, 8 * SIZE(CO2, LDC, 2) + movhps %xmm5, 10 * SIZE(CO2, LDC, 2) + movlps %xmm15, 12 * SIZE(CO2, LDC, 2) + movhps %xmm15, 14 * SIZE(CO2, LDC, 2) + + addq $16 * SIZE, CO1 + addq $16 * SIZE, CO2 + subq $1, I + jg .L11 + ALIGN_4 + +.L20: + testq $4, M + jle .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L25 + ALIGN_4 + +.L21: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + movaps -24 * SIZE(BO), %xmm4 + movaps -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movaps -28 * SIZE(AO), %xmm0 + movaps -16 * SIZE(BO), %xmm2 + movaps -12 * SIZE(BO), %xmm3 + movaps -8 * SIZE(BO), %xmm4 + movaps -4 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movaps -24 * SIZE(AO), %xmm0 + movaps 0 * SIZE(BO), %xmm2 + movaps 4 * SIZE(BO), %xmm3 + movaps 8 * SIZE(BO), %xmm4 + movaps 12 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movaps -20 * SIZE(AO), %xmm0 + movaps 16 * SIZE(BO), %xmm2 + movaps 20 * SIZE(BO), %xmm3 + movaps 24 * SIZE(BO), %xmm4 + movaps 28 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + subq $-16 * SIZE, AO + subq $-64 * SIZE, BO + subq $1, %rax + jg .L21 + ALIGN_4 + +.L25: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L28 + ALIGN_4 + +.L26: + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + movaps -24 * SIZE(BO), %xmm4 + movaps -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + addq $ 4 * SIZE, AO + addq $16 * SIZE, BO + subq $1, %rax + jg .L26 + ALIGN_4 + +.L28: + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 4 * SIZE(CO1) + movhps %xmm8, 6 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + movsd 4 * SIZE(CO2), %xmm1 + movhps 6 * SIZE(CO2), %xmm1 + + pshufd $0x50, %xmm9, %xmm4 + pshufd $0xfa, %xmm9, %xmm9 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm9 + addps %xmm0, %xmm4 + addps %xmm1, %xmm9 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movlps %xmm9, 4 * SIZE(CO2) + movhps %xmm9, 6 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhps 2 * SIZE(CO1, LDC, 2), %xmm0 + movsd 4 * SIZE(CO1, LDC, 2), %xmm1 + movhps 6 * SIZE(CO1, LDC, 2), %xmm1 + + pshufd $0x50, %xmm10, %xmm4 + pshufd $0xfa, %xmm10, %xmm10 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm10 + addps %xmm0, %xmm4 + addps %xmm1, %xmm10 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + movhps %xmm4, 2 * SIZE(CO1, LDC, 2) + movlps %xmm10, 4 * SIZE(CO1, LDC, 2) + movhps %xmm10, 6 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhps 2 * SIZE(CO2, LDC, 2), %xmm0 + movsd 4 * SIZE(CO2, LDC, 2), %xmm1 + movhps 6 * SIZE(CO2, LDC, 2), %xmm1 + + pshufd $0x50, %xmm11, %xmm4 + pshufd $0xfa, %xmm11, %xmm11 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm11 + addps %xmm0, %xmm4 + addps %xmm1, %xmm11 + + movlps %xmm4, 0 * SIZE(CO2, LDC, 2) + movhps %xmm4, 2 * SIZE(CO2, LDC, 2) + movlps %xmm11, 4 * SIZE(CO2, LDC, 2) + movhps %xmm11, 6 * SIZE(CO2, LDC, 2) + + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 + ALIGN_4 + +.L30: + testq $2, M + jle .L40 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + pxor %xmm14, %xmm14 + pxor %xmm15, %xmm15 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L35 + ALIGN_4 + +.L31: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm2 + movsd -28 * SIZE(BO), %xmm3 + movsd -24 * SIZE(BO), %xmm4 + movsd -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movsd -30 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + movsd -12 * SIZE(BO), %xmm3 + movsd -8 * SIZE(BO), %xmm4 + movsd -4 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movsd -28 * SIZE(AO), %xmm0 + movsd 0 * SIZE(BO), %xmm2 + movsd 4 * SIZE(BO), %xmm3 + movsd 8 * SIZE(BO), %xmm4 + movsd 12 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movsd -26 * SIZE(AO), %xmm0 + movsd 16 * SIZE(BO), %xmm2 + movsd 20 * SIZE(BO), %xmm3 + movsd 24 * SIZE(BO), %xmm4 + movsd 28 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-64 * SIZE, BO + subq $1, %rax + jg .L31 + ALIGN_4 + +.L35: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L38 + ALIGN_4 + +.L36: + movsd -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm2 + movsd -28 * SIZE(BO), %xmm3 + movsd -24 * SIZE(BO), %xmm4 + movsd -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + addq $ 2 * SIZE, AO + addq $16 * SIZE, BO + subq $1, %rax + jg .L36 + ALIGN_4 + +.L38: + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + + pshufd $0x50, %xmm8, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + + pshufd $0x50, %xmm9, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhps 2 * SIZE(CO1, LDC, 2), %xmm0 + + pshufd $0x50, %xmm10, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + movhps %xmm4, 2 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhps 2 * SIZE(CO2, LDC, 2), %xmm0 + + pshufd $0x50, %xmm11, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO2, LDC, 2) + movhps %xmm4, 2 * SIZE(CO2, LDC, 2) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L40: + testq $1, M + jle .L49 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + pxor %xmm14, %xmm14 + pxor %xmm15, %xmm15 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L45 + ALIGN_4 + +.L41: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movss -32 * SIZE(AO), %xmm0 + movss -32 * SIZE(BO), %xmm2 + movss -28 * SIZE(BO), %xmm3 + movss -24 * SIZE(BO), %xmm4 + movss -20 * SIZE(BO), %xmm5 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + mulss %xmm0, %xmm5 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + addss %xmm4, %xmm10 + addss %xmm5, %xmm11 + + movss -31 * SIZE(AO), %xmm0 + movss -16 * SIZE(BO), %xmm2 + movss -12 * SIZE(BO), %xmm3 + movss -8 * SIZE(BO), %xmm4 + movss -4 * SIZE(BO), %xmm5 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + mulss %xmm0, %xmm5 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + addss %xmm4, %xmm10 + addss %xmm5, %xmm11 + + movss -30 * SIZE(AO), %xmm0 + movss 0 * SIZE(BO), %xmm2 + movss 4 * SIZE(BO), %xmm3 + movss 8 * SIZE(BO), %xmm4 + movss 12 * SIZE(BO), %xmm5 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + mulss %xmm0, %xmm5 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + addss %xmm4, %xmm10 + addss %xmm5, %xmm11 + + movss -29 * SIZE(AO), %xmm0 + movss 16 * SIZE(BO), %xmm2 + movss 20 * SIZE(BO), %xmm3 + movss 24 * SIZE(BO), %xmm4 + movss 28 * SIZE(BO), %xmm5 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + mulss %xmm0, %xmm5 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + addss %xmm4, %xmm10 + addss %xmm5, %xmm11 + + subq $ -4 * SIZE, AO + subq $-64 * SIZE, BO + subq $1, %rax + jg .L41 + ALIGN_4 + +.L45: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L48 + ALIGN_4 + +.L46: + movss -32 * SIZE(AO), %xmm0 + movss -32 * SIZE(BO), %xmm2 + movss -28 * SIZE(BO), %xmm3 + movss -24 * SIZE(BO), %xmm4 + movss -20 * SIZE(BO), %xmm5 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + mulss %xmm0, %xmm5 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + addss %xmm4, %xmm10 + addss %xmm5, %xmm11 + + addq $ 1 * SIZE, AO + addq $16 * SIZE, BO + subq $1, %rax + jg .L46 + ALIGN_4 + +.L48: + movsd 0 * SIZE(CO1), %xmm0 + + pshufd $0x50, %xmm8, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + + pshufd $0x50, %xmm9, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + + pshufd $0x50, %xmm10, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + + pshufd $0x50, %xmm11, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO2, LDC, 2) + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leaq (C, LDC, 4), C + subq $1, J + jg .L01 + ALIGN_4 + +.L50: + testq $2, N + jle .L100 + ALIGN_4 + +.L51: +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $3, %rax + jle .L53 + + addq %rax, %rax + ALIGN_4 + +.L52: + movaps -32 * SIZE(B), %xmm3 + movaps -28 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + prefetcht0 (PREFETCH_W + 16) * SIZE(BO) + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + subq $1, %rax + jne .L52 + ALIGN_4 + +.L53: + movq K, %rax + andq $7, %rax + BRANCH + jle .L55 + ALIGN_4 + +.L54: + movss -32 * SIZE(B), %xmm8 + movss -31 * SIZE(B), %xmm9 + + shufps $0, %xmm8, %xmm8 + shufps $0, %xmm9, %xmm9 + + movaps %xmm8, 0 * SIZE(BO) + movaps %xmm9, 4 * SIZE(BO) + + addq $2 * SIZE, B + addq $8 * SIZE, BO + subq $1, %rax + jne .L54 + ALIGN_4 + +.L55: + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO # aoffset = a + + movq M, I + sarq $3, I + jle .L70 + ALIGN_4 + +.L60: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + + prefetcht0 15 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht0 15 * SIZE(CO2) + pxor %xmm13, %xmm13 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L65 + ALIGN_4 + +.L61: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + + movaps -32 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -28 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + addps %xmm4, %xmm9 + addps %xmm5, %xmm13 + + movaps -24 * SIZE(AO), %xmm0 + movaps -20 * SIZE(AO), %xmm1 + + movaps -24 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -20 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + addps %xmm4, %xmm9 + addps %xmm5, %xmm13 + + movaps -16 * SIZE(AO), %xmm0 + movaps -12 * SIZE(AO), %xmm1 + + movaps -16 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -12 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + addps %xmm4, %xmm9 + addps %xmm5, %xmm13 + + movaps -8 * SIZE(AO), %xmm0 + movaps -4 * SIZE(AO), %xmm1 + + movaps -8 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -4 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + addps %xmm4, %xmm9 + addps %xmm5, %xmm13 + + subq $-32 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jg .L61 + ALIGN_4 + +.L65: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L68 + ALIGN_4 + +.L66: + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + + movaps -32 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -28 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + addps %xmm4, %xmm9 + addps %xmm5, %xmm13 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L66 + ALIGN_4 + +.L68: + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + movsd 8 * SIZE(CO1), %xmm2 + movhps 10 * SIZE(CO1), %xmm2 + movsd 12 * SIZE(CO1), %xmm3 + movhps 14 * SIZE(CO1), %xmm3 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + pshufd $0x50, %xmm12, %xmm5 + pshufd $0xfa, %xmm12, %xmm12 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm12 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + addps %xmm2, %xmm5 + addps %xmm3, %xmm12 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 4 * SIZE(CO1) + movhps %xmm8, 6 * SIZE(CO1) + movlps %xmm5, 8 * SIZE(CO1) + movhps %xmm5, 10 * SIZE(CO1) + movlps %xmm12, 12 * SIZE(CO1) + movhps %xmm12, 14 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + movsd 4 * SIZE(CO2), %xmm1 + movhps 6 * SIZE(CO2), %xmm1 + movsd 8 * SIZE(CO2), %xmm2 + movhps 10 * SIZE(CO2), %xmm2 + movsd 12 * SIZE(CO2), %xmm3 + movhps 14 * SIZE(CO2), %xmm3 + + pshufd $0x50, %xmm9, %xmm4 + pshufd $0xfa, %xmm9, %xmm9 + pshufd $0x50, %xmm13, %xmm5 + pshufd $0xfa, %xmm13, %xmm13 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm13 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm9 + addps %xmm2, %xmm5 + addps %xmm3, %xmm13 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movlps %xmm9, 4 * SIZE(CO2) + movhps %xmm9, 6 * SIZE(CO2) + movlps %xmm5, 8 * SIZE(CO2) + movhps %xmm5, 10 * SIZE(CO2) + movlps %xmm13, 12 * SIZE(CO2) + movhps %xmm13, 14 * SIZE(CO2) + + addq $16 * SIZE, CO1 + addq $16 * SIZE, CO2 + subq $1, I + jg .L60 + ALIGN_4 + +.L70: + testq $4, M + jle .L80 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L75 + ALIGN_4 + +.L71: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + movaps -32 * SIZE(BO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + movaps -24 * SIZE(BO), %xmm4 + movaps -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movaps -24 * SIZE(AO), %xmm0 + movaps -20 * SIZE(AO), %xmm1 + movaps -16 * SIZE(BO), %xmm2 + movaps -12 * SIZE(BO), %xmm3 + movaps -8 * SIZE(BO), %xmm4 + movaps -4 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + subq $-16 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jg .L71 + ALIGN_4 + +.L75: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L78 + ALIGN_4 + +.L76: + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 4 * SIZE(CO1) + movhps %xmm8, 6 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + movsd 4 * SIZE(CO2), %xmm1 + movhps 6 * SIZE(CO2), %xmm1 + + pshufd $0x50, %xmm9, %xmm4 + pshufd $0xfa, %xmm9, %xmm9 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm9 + addps %xmm0, %xmm4 + addps %xmm1, %xmm9 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movlps %xmm9, 4 * SIZE(CO2) + movhps %xmm9, 6 * SIZE(CO2) + + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 + ALIGN_4 + +.L80: + testq $2, M + jle .L90 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L85 + ALIGN_4 + +.L81: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -32 * SIZE(AO), %xmm0 + movsd -30 * SIZE(AO), %xmm1 + movsd -32 * SIZE(BO), %xmm2 + movsd -28 * SIZE(BO), %xmm3 + movsd -24 * SIZE(BO), %xmm4 + movsd -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movsd -28 * SIZE(AO), %xmm0 + movsd -26 * SIZE(AO), %xmm1 + movsd -16 * SIZE(BO), %xmm2 + movsd -12 * SIZE(BO), %xmm3 + movsd -8 * SIZE(BO), %xmm4 + movsd -4 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jg .L81 + ALIGN_4 + +.L85: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L88 + ALIGN_4 + +.L86: + movsd -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm2 + movsd -28 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L86 + ALIGN_4 + +.L88: + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + + pshufd $0x50, %xmm8, %xmm4 + + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + + pshufd $0x50, %xmm9, %xmm4 + + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L90: + testq $1, M + jle .L99 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L95 + ALIGN_4 + +.L91: + + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movss -32 * SIZE(AO), %xmm0 + movss -31 * SIZE(AO), %xmm1 + movss -32 * SIZE(BO), %xmm2 + movss -28 * SIZE(BO), %xmm3 + movss -24 * SIZE(BO), %xmm4 + movss -20 * SIZE(BO), %xmm5 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm1, %xmm4 + mulss %xmm1, %xmm5 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + addss %xmm4, %xmm10 + addss %xmm5, %xmm11 + + movss -30 * SIZE(AO), %xmm0 + movss -29 * SIZE(AO), %xmm1 + movss -16 * SIZE(BO), %xmm2 + movss -12 * SIZE(BO), %xmm3 + movss -8 * SIZE(BO), %xmm4 + movss -4 * SIZE(BO), %xmm5 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm1, %xmm4 + mulss %xmm1, %xmm5 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + addss %xmm4, %xmm10 + addss %xmm5, %xmm11 + + subq $ -4 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jg .L91 + ALIGN_4 + +.L95: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L98 + ALIGN_4 + +.L96: + movss -32 * SIZE(AO), %xmm0 + movss -32 * SIZE(BO), %xmm2 + movss -28 * SIZE(BO), %xmm3 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L96 + ALIGN_4 + +.L98: + addss %xmm10, %xmm8 + addss %xmm11, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + + pshufd $0x50, %xmm8, %xmm4 + + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + + pshufd $0x50, %xmm9, %xmm4 + + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO2) + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leaq (C, LDC, 2), C + ALIGN_4 + + + +.L100: + testq $1, N + jle .L999 + ALIGN_4 + +.L101: +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $4, %rax + jle .L103 + + addq %rax, %rax + ALIGN_4 + +.L102: + movss -32 * SIZE(B), %xmm0 + movss -31 * SIZE(B), %xmm1 + movss -30 * SIZE(B), %xmm2 + movss -29 * SIZE(B), %xmm3 + movss -28 * SIZE(B), %xmm4 + movss -27 * SIZE(B), %xmm5 + movss -26 * SIZE(B), %xmm6 + movss -25 * SIZE(B), %xmm7 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-32 * SIZE, BO + subq $1, %rax + jne .L102 + ALIGN_4 + +.L103: + movq K, %rax + andq $15, %rax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movss -32 * SIZE(B), %xmm8 + + shufps $0, %xmm8, %xmm8 + + movaps %xmm8, 0 * SIZE(BO) + + addq $1 * SIZE, B + addq $4 * SIZE, BO + subq $1, %rax + jne .L104 + ALIGN_4 + +.L105: + movq C, CO1 + movq A, AO + + movq M, I + sarq $3, I + jle .L120 + ALIGN_4 + +.L110: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + + prefetcht0 15 * SIZE(CO1) + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L115 + ALIGN_4 + +.L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + movaps -32 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + + movaps -24 * SIZE(AO), %xmm0 + movaps -20 * SIZE(AO), %xmm1 + movaps -28 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + addps %xmm2, %xmm9 + addps %xmm3, %xmm13 + + movaps -16 * SIZE(AO), %xmm0 + movaps -12 * SIZE(AO), %xmm1 + movaps -24 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + + movaps -8 * SIZE(AO), %xmm0 + movaps -4 * SIZE(AO), %xmm1 + movaps -20 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + addps %xmm2, %xmm9 + addps %xmm3, %xmm13 + + subq $-32 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jg .L111 + ALIGN_4 + +.L115: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L118 + ALIGN_4 + +.L116: + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + + movaps -32 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + + addq $8 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L116 + ALIGN_4 + +.L118: + addps %xmm9, %xmm8 + addps %xmm13, %xmm12 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + movsd 8 * SIZE(CO1), %xmm2 + movhps 10 * SIZE(CO1), %xmm2 + movsd 12 * SIZE(CO1), %xmm3 + movhps 14 * SIZE(CO1), %xmm3 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + pshufd $0x50, %xmm12, %xmm5 + pshufd $0xfa, %xmm12, %xmm12 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm12 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + addps %xmm2, %xmm5 + addps %xmm3, %xmm12 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 4 * SIZE(CO1) + movhps %xmm8, 6 * SIZE(CO1) + movlps %xmm5, 8 * SIZE(CO1) + movhps %xmm5, 10 * SIZE(CO1) + movlps %xmm12, 12 * SIZE(CO1) + movhps %xmm12, 14 * SIZE(CO1) + + addq $16 * SIZE, CO1 + subq $1, I + jg .L110 + ALIGN_4 + +.L120: + testq $4, M + jle .L130 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L125 + ALIGN_4 + +.L121: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + movaps -32 * SIZE(BO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + + movaps -24 * SIZE(AO), %xmm0 + movaps -20 * SIZE(AO), %xmm1 + movaps -24 * SIZE(BO), %xmm2 + movaps -20 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm2, %xmm10 + addps %xmm3, %xmm11 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jg .L121 + ALIGN_4 + +.L125: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L128 + ALIGN_4 + +.L126: + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm2 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm8 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L126 + ALIGN_4 + +.L128: + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + addps %xmm9, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 4 * SIZE(CO1) + movhps %xmm8, 6 * SIZE(CO1) + + addq $8 * SIZE, CO1 + ALIGN_4 + +.L130: + testq $2, M + jle .L140 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L135 + ALIGN_4 + +.L131: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -32 * SIZE(AO), %xmm0 + movsd -30 * SIZE(AO), %xmm1 + movsd -32 * SIZE(BO), %xmm2 + movsd -28 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + + movsd -28 * SIZE(AO), %xmm0 + movsd -26 * SIZE(AO), %xmm1 + movsd -24 * SIZE(BO), %xmm2 + movsd -20 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + addps %xmm2, %xmm10 + addps %xmm3, %xmm11 + + subq $ -8 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jg .L131 + ALIGN_4 + +.L135: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L138 + ALIGN_4 + +.L136: + movsd -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm2 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm8 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L136 + ALIGN_4 + +.L138: + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + addps %xmm9, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + + pshufd $0x50, %xmm8, %xmm4 + + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + + addq $4 * SIZE, CO1 + ALIGN_4 + +.L140: + testq $1, M + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L145 + ALIGN_4 + +.L141: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movss -32 * SIZE(AO), %xmm0 + movss -31 * SIZE(AO), %xmm1 + movss -32 * SIZE(BO), %xmm2 + movss -28 * SIZE(BO), %xmm3 + + mulss %xmm0, %xmm2 + mulss %xmm1, %xmm3 + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + + movss -30 * SIZE(AO), %xmm0 + movss -29 * SIZE(AO), %xmm1 + movss -24 * SIZE(BO), %xmm2 + movss -20 * SIZE(BO), %xmm3 + + mulss %xmm0, %xmm2 + mulss %xmm1, %xmm3 + addss %xmm2, %xmm10 + addss %xmm3, %xmm11 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jg .L141 + ALIGN_4 + +.L145: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L148 + ALIGN_4 + +.L146: + movss -32 * SIZE(AO), %xmm0 + movss -32 * SIZE(BO), %xmm2 + + mulss %xmm0, %xmm2 + addss %xmm2, %xmm8 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L146 + ALIGN_4 + +.L148: + addss %xmm10, %xmm8 + addss %xmm11, %xmm9 + addss %xmm9, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + + pshufd $0x50, %xmm8, %xmm4 + + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %r15, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm3m_kernel_8x4_penryn.S b/kernel/x86_64/zgemm3m_kernel_8x4_penryn.S new file mode 100644 index 0000000..bf2d96e --- /dev/null +++ b/kernel/x86_64/zgemm3m_kernel_8x4_penryn.S @@ -0,0 +1,2593 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#define PREA %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define J 64(%rsp) +#define OFFSET 72(%rsp) +#define KK 80(%rsp) +#define KKK 88(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define ALPHA_R 224(%rsp) +#define ALPHA_I 232(%rsp) +#define J 240(%rsp) +#define OFFSET 248(%rsp) +#define KK 256(%rsp) +#define KKK 264(%rsp) + +#endif + +#define PREFETCHSIZE (8 * 17 + 4) +#define PREFETCH prefetcht0 + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + movaps %xmm3, %xmm0 + movss OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif +#endif + + unpcklps %xmm1, %xmm0 + + movlps %xmm0, ALPHA_R + movlps %xmm0, ALPHA_I + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + salq $ZBASE_SHIFT, LDC + + movq N, J + sarq $2, J + NOBRANCH + jle .L50 + ALIGN_4 + +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + + movq K, %rax + salq $BASE_SHIFT + 2, %rax + leaq (B, %rax), BB + + movq M, I + sarq $3, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorpd %xmm3, %xmm3 + movaps -28 * SIZE(AO), %xmm1 + xorpd %xmm4, %xmm4 + movaps -32 * SIZE(BO), %xmm2 + + xorpd %xmm5, %xmm5 + prefetcht0 -32 * SIZE(BB) + xorpd %xmm6, %xmm6 + + prefetcht2 7 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + prefetcht2 7 * SIZE(CO2) + movapd %xmm4, %xmm10 + movapd %xmm4, %xmm11 + + prefetcht2 7 * SIZE(CO1, LDC, 2) + movapd %xmm4, %xmm12 + movaps %xmm4, %xmm13 + prefetcht2 7 * SIZE(CO2, LDC, 2) + movaps %xmm4, %xmm14 + movaps %xmm4, %xmm15 + + subq $-24 * SIZE, BB + + leaq (PREFETCHSIZE + 0) * SIZE(AO), PREA + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH -32 * SIZE(PREA) + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -20 * SIZE(AO), %xmm1 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -12 * SIZE(AO), %xmm1 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + PREFETCH -16 * SIZE(PREA) + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -20 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -4 * SIZE(AO), %xmm1 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -16 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps 0 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps 4 * SIZE(AO), %xmm1 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + PREFETCH 0 * SIZE(PREA) + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps 8 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps 12 * SIZE(AO), %xmm1 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps 16 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps 20 * SIZE(AO), %xmm1 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + PREFETCH 16 * SIZE(PREA) + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -4 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps 24 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps 28 * SIZE(AO), %xmm1 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + subq $-64 * SIZE, AO + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps 0 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + subq $-32 * SIZE, BO + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -32 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -28 * SIZE(AO), %xmm1 + + subq $-64 * SIZE, PREA + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: + prefetcht0 -16 * SIZE(BB) + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -20 * SIZE(AO), %xmm1 + + addq $8 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: + movups ALPHA_R, %xmm7 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + + movaps %xmm9, %xmm4 + shufps $0xd8, %xmm8, %xmm9 + shufps $0xd8, %xmm11, %xmm8 + shufps $0xd8, %xmm10, %xmm11 + shufps $0xd8, %xmm4, %xmm10 + + movaps %xmm8, %xmm4 + shufps $0xd8, %xmm10, %xmm8 + shufps $0xd8, %xmm4, %xmm10 + movaps %xmm9, %xmm5 + shufps $0xd8, %xmm11, %xmm9 + shufps $0xd8, %xmm5, %xmm11 + + movaps %xmm13, %xmm4 + shufps $0xd8, %xmm12, %xmm13 + shufps $0xd8, %xmm15, %xmm12 + shufps $0xd8, %xmm14, %xmm15 + shufps $0xd8, %xmm4, %xmm14 + + movaps %xmm12, %xmm4 + shufps $0xd8, %xmm14, %xmm12 + shufps $0xd8, %xmm4, %xmm14 + movaps %xmm13, %xmm5 + shufps $0xd8, %xmm15, %xmm13 + shufps $0xd8, %xmm5, %xmm15 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + movsd 8 * SIZE(CO1), %xmm2 + movhps 10 * SIZE(CO1), %xmm2 + movsd 12 * SIZE(CO1), %xmm3 + movhps 14 * SIZE(CO1), %xmm3 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + pshufd $0x50, %xmm12, %xmm5 + pshufd $0xfa, %xmm12, %xmm12 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm12 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + addps %xmm2, %xmm5 + addps %xmm3, %xmm12 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 4 * SIZE(CO1) + movhps %xmm8, 6 * SIZE(CO1) + movlps %xmm5, 8 * SIZE(CO1) + movhps %xmm5, 10 * SIZE(CO1) + movlps %xmm12, 12 * SIZE(CO1) + movhps %xmm12, 14 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + movsd 4 * SIZE(CO2), %xmm1 + movhps 6 * SIZE(CO2), %xmm1 + movsd 8 * SIZE(CO2), %xmm2 + movhps 10 * SIZE(CO2), %xmm2 + movsd 12 * SIZE(CO2), %xmm3 + movhps 14 * SIZE(CO2), %xmm3 + + pshufd $0x50, %xmm9, %xmm4 + pshufd $0xfa, %xmm9, %xmm9 + pshufd $0x50, %xmm13, %xmm5 + pshufd $0xfa, %xmm13, %xmm13 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm13 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm9 + addps %xmm2, %xmm5 + addps %xmm3, %xmm13 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movlps %xmm9, 4 * SIZE(CO2) + movhps %xmm9, 6 * SIZE(CO2) + movlps %xmm5, 8 * SIZE(CO2) + movhps %xmm5, 10 * SIZE(CO2) + movlps %xmm13, 12 * SIZE(CO2) + movhps %xmm13, 14 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhps 2 * SIZE(CO1, LDC, 2), %xmm0 + movsd 4 * SIZE(CO1, LDC, 2), %xmm1 + movhps 6 * SIZE(CO1, LDC, 2), %xmm1 + movsd 8 * SIZE(CO1, LDC, 2), %xmm2 + movhps 10 * SIZE(CO1, LDC, 2), %xmm2 + movsd 12 * SIZE(CO1, LDC, 2), %xmm3 + movhps 14 * SIZE(CO1, LDC, 2), %xmm3 + + pshufd $0x50, %xmm10, %xmm4 + pshufd $0xfa, %xmm10, %xmm10 + pshufd $0x50, %xmm14, %xmm5 + pshufd $0xfa, %xmm14, %xmm14 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm14 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm10 + addps %xmm2, %xmm5 + addps %xmm3, %xmm14 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + movhps %xmm4, 2 * SIZE(CO1, LDC, 2) + movlps %xmm10, 4 * SIZE(CO1, LDC, 2) + movhps %xmm10, 6 * SIZE(CO1, LDC, 2) + movlps %xmm5, 8 * SIZE(CO1, LDC, 2) + movhps %xmm5, 10 * SIZE(CO1, LDC, 2) + movlps %xmm14, 12 * SIZE(CO1, LDC, 2) + movhps %xmm14, 14 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhps 2 * SIZE(CO2, LDC, 2), %xmm0 + movsd 4 * SIZE(CO2, LDC, 2), %xmm1 + movhps 6 * SIZE(CO2, LDC, 2), %xmm1 + movsd 8 * SIZE(CO2, LDC, 2), %xmm2 + movhps 10 * SIZE(CO2, LDC, 2), %xmm2 + movsd 12 * SIZE(CO2, LDC, 2), %xmm3 + movhps 14 * SIZE(CO2, LDC, 2), %xmm3 + + pshufd $0x50, %xmm11, %xmm4 + pshufd $0xfa, %xmm11, %xmm11 + pshufd $0x50, %xmm15, %xmm5 + pshufd $0xfa, %xmm15, %xmm15 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm11 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm15 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm11 + addps %xmm2, %xmm5 + addps %xmm3, %xmm15 + + movlps %xmm4, 0 * SIZE(CO2, LDC, 2) + movhps %xmm4, 2 * SIZE(CO2, LDC, 2) + movlps %xmm11, 4 * SIZE(CO2, LDC, 2) + movhps %xmm11, 6 * SIZE(CO2, LDC, 2) + movlps %xmm5, 8 * SIZE(CO2, LDC, 2) + movhps %xmm5, 10 * SIZE(CO2, LDC, 2) + movlps %xmm15, 12 * SIZE(CO2, LDC, 2) + movhps %xmm15, 14 * SIZE(CO2, LDC, 2) + + addq $16 * SIZE, CO1 + addq $16 * SIZE, CO2 + decq I + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $4, M + BRANCH + jle .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + movaps -32 * SIZE(BO), %xmm2 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + + movaps %xmm4, %xmm8 + movaps %xmm4, %xmm9 + movaps %xmm4, %xmm10 + movaps %xmm4, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + addps %xmm6, %xmm10 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + addps %xmm4, %xmm11 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + addps %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + addps %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm6, %xmm10 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + addps %xmm4, %xmm11 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + addps %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + addps %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm6, %xmm10 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + addps %xmm4, %xmm11 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + addps %xmm2, %xmm8 + movaps -20 * SIZE(BO), %xmm2 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + addps %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm6, %xmm10 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + addps %xmm4, %xmm11 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + addps %xmm2, %xmm8 + movaps -16 * SIZE(BO), %xmm2 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + subq $-16 * SIZE, AO + addps %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -32 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L28 + ALIGN_3 + +.L26: + addps %xmm6, %xmm10 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + addps %xmm4, %xmm11 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + addps %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + addps %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: + movups ALPHA_R, %xmm7 + + addps %xmm6, %xmm10 + addps %xmm4, %xmm11 + + movaps %xmm9, %xmm4 + shufps $0xd8, %xmm8, %xmm9 + shufps $0xd8, %xmm11, %xmm8 + shufps $0xd8, %xmm10, %xmm11 + shufps $0xd8, %xmm4, %xmm10 + + movaps %xmm8, %xmm4 + shufps $0xd8, %xmm10, %xmm8 + shufps $0xd8, %xmm4, %xmm10 + movaps %xmm9, %xmm5 + shufps $0xd8, %xmm11, %xmm9 + shufps $0xd8, %xmm5, %xmm11 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 4 * SIZE(CO1) + movhps %xmm8, 6 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + movsd 4 * SIZE(CO2), %xmm1 + movhps 6 * SIZE(CO2), %xmm1 + + pshufd $0x50, %xmm9, %xmm4 + pshufd $0xfa, %xmm9, %xmm9 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm9 + addps %xmm0, %xmm4 + addps %xmm1, %xmm9 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movlps %xmm9, 4 * SIZE(CO2) + movhps %xmm9, 6 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhps 2 * SIZE(CO1, LDC, 2), %xmm0 + movsd 4 * SIZE(CO1, LDC, 2), %xmm1 + movhps 6 * SIZE(CO1, LDC, 2), %xmm1 + + pshufd $0x50, %xmm10, %xmm4 + pshufd $0xfa, %xmm10, %xmm10 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm10 + addps %xmm0, %xmm4 + addps %xmm1, %xmm10 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + movhps %xmm4, 2 * SIZE(CO1, LDC, 2) + movlps %xmm10, 4 * SIZE(CO1, LDC, 2) + movhps %xmm10, 6 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhps 2 * SIZE(CO2, LDC, 2), %xmm0 + movsd 4 * SIZE(CO2, LDC, 2), %xmm1 + movhps 6 * SIZE(CO2, LDC, 2), %xmm1 + + pshufd $0x50, %xmm11, %xmm4 + pshufd $0xfa, %xmm11, %xmm11 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm11 + addps %xmm0, %xmm4 + addps %xmm1, %xmm11 + + movlps %xmm4, 0 * SIZE(CO2, LDC, 2) + movhps %xmm4, 2 * SIZE(CO2, LDC, 2) + movlps %xmm11, 4 * SIZE(CO2, LDC, 2) + movhps %xmm11, 6 * SIZE(CO2, LDC, 2) + + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 + ALIGN_4 + +.L30: + testq $2, M + BRANCH + jle .L40 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + movaps -32 * SIZE(BO), %xmm2 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + + movaps %xmm4, %xmm8 + movaps %xmm4, %xmm9 + movaps %xmm4, %xmm10 + movaps %xmm4, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x44, %xmm0, %xmm1 + addps %xmm3, %xmm8 + pshufd $0x50, %xmm2, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm4, %xmm9 + pshufd $0xfa, %xmm2, %xmm4 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm4 + + pshufd $0xee, %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + addps %xmm3, %xmm10 + pshufd $0x50, %xmm2, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm2, %xmm4 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm4 + + pshufd $0x44, %xmm0, %xmm1 + addps %xmm3, %xmm8 + pshufd $0x50, %xmm2, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm4, %xmm9 + pshufd $0xfa, %xmm2, %xmm4 + movaps -20 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm4 + + pshufd $0xee, %xmm0, %xmm1 + movaps -24 * SIZE(AO), %xmm0 + addps %xmm3, %xmm10 + pshufd $0x50, %xmm2, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm2, %xmm4 + movaps -16 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm4 + + subq $-8 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L38 + ALIGN_3 + +.L36: + pshufd $0x44, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + pshufd $0x50, %xmm2, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm4, %xmm9 + pshufd $0xfa, %xmm2, %xmm4 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm4 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: + movups ALPHA_R, %xmm7 + + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + addps %xmm3, %xmm8 + addps %xmm4, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm1 + movhps 2 * SIZE(CO2), %xmm1 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 0 * SIZE(CO2) + movhps %xmm8, 2 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhps 2 * SIZE(CO1, LDC, 2), %xmm0 + movsd 0 * SIZE(CO2, LDC, 2), %xmm1 + movhps 2 * SIZE(CO2, LDC, 2), %xmm1 + + pshufd $0x50, %xmm9, %xmm4 + pshufd $0xfa, %xmm9, %xmm9 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm9 + addps %xmm0, %xmm4 + addps %xmm1, %xmm9 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + movhps %xmm4, 2 * SIZE(CO1, LDC, 2) + movlps %xmm9, 0 * SIZE(CO2, LDC, 2) + movhps %xmm9, 2 * SIZE(CO2, LDC, 2) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L40: + testq $1, M + BRANCH + jle .L49 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + movss -30 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm9 + movaps -24 * SIZE(BO), %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + movss -29 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm8 + movaps -20 * SIZE(BO), %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + movss -28 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm9 + movaps -16 * SIZE(BO), %xmm2 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L48 + ALIGN_3 + +.L46: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_3 + +.L48: + movups ALPHA_R, %xmm7 + + addps %xmm9, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 0 * SIZE(CO2), %xmm0 + movsd 0 * SIZE(CO1, LDC, 2), %xmm1 + movhps 0 * SIZE(CO2, LDC, 2), %xmm1 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 0 * SIZE(CO2) + movlps %xmm8, 0 * SIZE(CO1, LDC, 2) + movhps %xmm8, 0 * SIZE(CO2, LDC, 2) + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + movq BO, B + + leaq (C, LDC, 4), C + + subq $1, J + BRANCH + jg .L10 + ALIGN_4 + +.L50: + testq $2, N + jle .L90 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + + movq M, I + sarq $3, I + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO + leaq (BO, %rax, 2), BO +#endif + + prefetcht2 -32 * SIZE(BB) + subq $-8 * SIZE, BB + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + movaps -28 * SIZE(AO), %xmm1 + xorps %xmm4, %xmm4 + movaps -32 * SIZE(BO), %xmm2 + + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + + prefetcht0 7 * SIZE(CO1) + movaps %xmm4, %xmm8 + movaps %xmm4, %xmm9 + prefetcht0 7 * SIZE(CO2) + movaps %xmm4, %xmm10 + movaps %xmm4, %xmm11 + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm5, %xmm10 + pshufd $0x00, %xmm2, %xmm5 + mulps %xmm1, %xmm5 + addps %xmm6, %xmm11 + pshufd $0x55, %xmm2, %xmm6 + mulps %xmm1, %xmm6 + movaps -20 * SIZE(AO), %xmm1 + + addps %xmm3, %xmm8 + pshufd $0xaa, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0xff, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + + addps %xmm5, %xmm10 + pshufd $0xaa, %xmm2, %xmm5 + mulps %xmm1, %xmm5 + addps %xmm6, %xmm11 + pshufd $0xff, %xmm2, %xmm6 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm6 + movaps -12 * SIZE(AO), %xmm1 + + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + + addps %xmm5, %xmm10 + pshufd $0x00, %xmm2, %xmm5 + mulps %xmm1, %xmm5 + addps %xmm6, %xmm11 + pshufd $0x55, %xmm2, %xmm6 + mulps %xmm1, %xmm6 + movaps -4 * SIZE(AO), %xmm1 + + addps %xmm3, %xmm8 + pshufd $0xaa, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0xff, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps 0 * SIZE(AO), %xmm0 + + addps %xmm5, %xmm10 + pshufd $0xaa, %xmm2, %xmm5 + mulps %xmm1, %xmm5 + addps %xmm6, %xmm11 + pshufd $0xff, %xmm2, %xmm6 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm6 + movaps 4 * SIZE(AO), %xmm1 + + subq $-32 * SIZE, AO + subq $ -8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L52 + ALIGN_3 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm5, %xmm10 + pshufd $0x00, %xmm2, %xmm5 + mulps %xmm1, %xmm5 + addps %xmm6, %xmm11 + pshufd $0x55, %xmm2, %xmm6 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm6 + movaps -20 * SIZE(AO), %xmm1 + + addq $8 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_3 + +.L58: + movups ALPHA_R, %xmm7 + + addps %xmm3, %xmm8 + addps %xmm4, %xmm9 + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + movsd 8 * SIZE(CO1), %xmm2 + movhps 10 * SIZE(CO1), %xmm2 + movsd 12 * SIZE(CO1), %xmm3 + movhps 14 * SIZE(CO1), %xmm3 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + pshufd $0x50, %xmm10, %xmm5 + pshufd $0xfa, %xmm10, %xmm10 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm10 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + addps %xmm2, %xmm5 + addps %xmm3, %xmm10 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 4 * SIZE(CO1) + movhps %xmm8, 6 * SIZE(CO1) + movlps %xmm5, 8 * SIZE(CO1) + movhps %xmm5, 10 * SIZE(CO1) + movlps %xmm10, 12 * SIZE(CO1) + movhps %xmm10, 14 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + movsd 4 * SIZE(CO2), %xmm1 + movhps 6 * SIZE(CO2), %xmm1 + movsd 8 * SIZE(CO2), %xmm2 + movhps 10 * SIZE(CO2), %xmm2 + movsd 12 * SIZE(CO2), %xmm3 + movhps 14 * SIZE(CO2), %xmm3 + + pshufd $0x50, %xmm9, %xmm4 + pshufd $0xfa, %xmm9, %xmm9 + pshufd $0x50, %xmm11, %xmm5 + pshufd $0xfa, %xmm11, %xmm11 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm11 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm9 + addps %xmm2, %xmm5 + addps %xmm3, %xmm11 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movlps %xmm9, 4 * SIZE(CO2) + movhps %xmm9, 6 * SIZE(CO2) + movlps %xmm5, 8 * SIZE(CO2) + movhps %xmm5, 10 * SIZE(CO2) + movlps %xmm11, 12 * SIZE(CO2) + movhps %xmm11, 14 * SIZE(CO2) + + addq $16 * SIZE, CO1 + addq $16 * SIZE, CO2 + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $4, M + BRANCH + jle .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + movaps -32 * SIZE(BO), %xmm2 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm3, %xmm10 + pshufd $0xaa, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xff, %xmm2, %xmm4 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm3, %xmm10 + pshufd $0xaa, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xff, %xmm2, %xmm4 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L62 + ALIGN_3 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L68 + ALIGN_3 + +.L66: + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_3 + +.L68: + movups ALPHA_R, %xmm7 + + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + addps %xmm3, %xmm8 + addps %xmm4, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 4 * SIZE(CO1) + movhps %xmm8, 6 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + movsd 4 * SIZE(CO2), %xmm1 + movhps 6 * SIZE(CO2), %xmm1 + + pshufd $0x50, %xmm9, %xmm4 + pshufd $0xfa, %xmm9, %xmm9 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm9 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm9 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movlps %xmm9, 4 * SIZE(CO2) + movhps %xmm9, 6 * SIZE(CO2) + + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 + ALIGN_4 + +.L70: + testq $2, M + BRANCH + jle .L80 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + movaps -32 * SIZE(BO), %xmm2 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_3 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm3, %xmm8 + pshufd $0x44, %xmm0, %xmm1 + pshufd $0x50, %xmm2, %xmm3 + mulps %xmm1, %xmm3 + + addps %xmm3, %xmm9 + pshufd $0xee, %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + pshufd $0xfa, %xmm2, %xmm3 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm3, %xmm8 + pshufd $0x44, %xmm0, %xmm1 + pshufd $0x50, %xmm2, %xmm3 + mulps %xmm1, %xmm3 + + addps %xmm3, %xmm9 + pshufd $0xee, %xmm0, %xmm1 + movaps -24 * SIZE(AO), %xmm0 + pshufd $0xfa, %xmm2, %xmm3 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm3 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L72 + ALIGN_3 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L78 + ALIGN_3 + +.L76: + addps %xmm3, %xmm8 + pshufd $0x44, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + pshufd $0x50, %xmm2, %xmm3 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm3 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_3 + +.L78: + movups ALPHA_R, %xmm7 + + addps %xmm9, %xmm8 + addps %xmm3, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm1 + movhps 2 * SIZE(CO2), %xmm1 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 0 * SIZE(CO2) + movhps %xmm8, 2 * SIZE(CO2) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L80: + testq $1, M + BRANCH + jle .L89 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movsd -32 * SIZE(BO), %xmm2 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L85 + ALIGN_3 + +.L82: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm8 + movsd -30 * SIZE(BO), %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + movss -30 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm9 + movsd -28 * SIZE(BO), %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + movss -29 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm8 + movsd -26 * SIZE(BO), %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + movss -28 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm9 + movsd -24 * SIZE(BO), %xmm2 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L82 + ALIGN_3 + +.L85: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L88 + ALIGN_3 + +.L86: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm8 + movsd -30 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L86 + ALIGN_3 + +.L88: + movups ALPHA_R, %xmm7 + + addps %xmm9, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 0 * SIZE(CO2), %xmm0 + + pshufd $0x50, %xmm8, %xmm4 + + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 0 * SIZE(CO2) + ALIGN_4 + +.L89: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + movq BO, B + + leaq (C, LDC, 2), C + ALIGN_4 + +.L90: + testq $1, N + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + movq A, AO + + movq M, I + sarq $3, I + NOBRANCH + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO + leaq (BO, %rax, 1), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -28 * SIZE(AO), %xmm1 + xorps %xmm9, %xmm9 + movsd -32 * SIZE(BO), %xmm2 + xorps %xmm10, %xmm10 + prefetcht0 7 * SIZE(CO1) + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L95 + ALIGN_3 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm3, %xmm0 + addps %xmm0, %xmm8 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm9 + movaps -20 * SIZE(AO), %xmm1 + + pshufd $0x55, %xmm2, %xmm3 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm3, %xmm0 + addps %xmm0, %xmm10 + movaps -16 * SIZE(AO), %xmm0 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm11 + movaps -12 * SIZE(AO), %xmm1 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm3, %xmm0 + addps %xmm0, %xmm8 + movaps -8 * SIZE(AO), %xmm0 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm9 + movaps -4 * SIZE(AO), %xmm1 + + pshufd $0x55, %xmm2, %xmm3 + movsd -28 * SIZE(BO), %xmm2 + mulps %xmm3, %xmm0 + addps %xmm0, %xmm10 + movaps 0 * SIZE(AO), %xmm0 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm11 + movaps 4 * SIZE(AO), %xmm1 + + subq $-32 * SIZE, AO + subq $ -4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L92 + ALIGN_3 + +.L95: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_3 + +.L96: + pshufd $0x00, %xmm2, %xmm3 + movss -31 * SIZE(BO), %xmm2 + mulps %xmm3, %xmm0 + addps %xmm0, %xmm8 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm9 + movaps -20 * SIZE(AO), %xmm1 + + addq $8 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L96 + ALIGN_3 + +.L98: + movups ALPHA_R, %xmm7 + + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + movsd 8 * SIZE(CO1), %xmm2 + movhps 10 * SIZE(CO1), %xmm2 + movsd 12 * SIZE(CO1), %xmm3 + movhps 14 * SIZE(CO1), %xmm3 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + pshufd $0x50, %xmm9, %xmm5 + pshufd $0xfa, %xmm9, %xmm9 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm9 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + addps %xmm2, %xmm5 + addps %xmm3, %xmm9 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 4 * SIZE(CO1) + movhps %xmm8, 6 * SIZE(CO1) + movlps %xmm5, 8 * SIZE(CO1) + movhps %xmm5, 10 * SIZE(CO1) + movlps %xmm9, 12 * SIZE(CO1) + movhps %xmm9, 14 * SIZE(CO1) + + addq $16 * SIZE, CO1 + decq I + BRANCH + jg .L91 + ALIGN_4 + +.L100: + testq $4, M + BRANCH + jle .L110 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movsd -32 * SIZE(BO), %xmm2 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L105 + ALIGN_3 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + pshufd $0x55, %xmm2, %xmm3 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm3 + movaps -24 * SIZE(AO), %xmm0 + addps %xmm3, %xmm9 + + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + movaps -20 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + pshufd $0x55, %xmm2, %xmm3 + movsd -28 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm3 + movaps -16 * SIZE(AO), %xmm0 + addps %xmm3, %xmm9 + + subq $-16 * SIZE, AO + subq $ -4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L102 + ALIGN_3 + +.L105: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L108 + ALIGN_3 + +.L106: + pshufd $0x00, %xmm2, %xmm3 + movss -31 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L106 + ALIGN_3 + +.L108: + movups ALPHA_R, %xmm7 + + addps %xmm9, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 4 * SIZE(CO1) + movhps %xmm8, 6 * SIZE(CO1) + + addq $8 * SIZE, CO1 + ALIGN_4 + +.L110: + testq $2, M + BRANCH + jle .L120 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + movsd -32 * SIZE(BO), %xmm2 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L115 + ALIGN_3 + +.L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + pshufd $0x55, %xmm2, %xmm3 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm3 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + movsd -26 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + pshufd $0x55, %xmm2, %xmm3 + movsd -28 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm3 + movsd -24 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + subq $-8 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L112 + ALIGN_3 + +.L115: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L118 + ALIGN_3 + +.L116: + pshufd $0x00, %xmm2, %xmm3 + movss -31 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L116 + ALIGN_3 + +.L118: + movups ALPHA_R, %xmm7 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + + pshufd $0x50, %xmm8, %xmm4 + + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + + addq $4 * SIZE, CO1 + ALIGN_4 + +.L120: + testq $1, M + BRANCH + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + + movss -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movss -32 * SIZE(BO), %xmm2 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L125 + ALIGN_3 + +.L122: + mulss %xmm0, %xmm2 + movss -31 * SIZE(AO), %xmm0 + addss %xmm2, %xmm8 + movss -31 * SIZE(BO), %xmm2 + + mulss %xmm0, %xmm2 + movss -30 * SIZE(AO), %xmm0 + addss %xmm2, %xmm9 + movss -30 * SIZE(BO), %xmm2 + + mulss %xmm0, %xmm2 + movss -29 * SIZE(AO), %xmm0 + addss %xmm2, %xmm8 + movss -29 * SIZE(BO), %xmm2 + + mulss %xmm0, %xmm2 + movss -28 * SIZE(AO), %xmm0 + addss %xmm2, %xmm9 + movss -28 * SIZE(BO), %xmm2 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L122 + ALIGN_3 + +.L125: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L128 + ALIGN_3 + +.L126: + mulss %xmm0, %xmm2 + movss -31 * SIZE(AO), %xmm0 + addss %xmm2, %xmm8 + movss -31 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L126 + ALIGN_3 + +.L128: + movups ALPHA_R, %xmm7 + + addss %xmm9, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + + pshufd $0x50, %xmm8, %xmm4 + + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm3m_kernel_8x4_sse.S b/kernel/x86_64/zgemm3m_kernel_8x4_sse.S new file mode 100644 index 0000000..6bd9148 --- /dev/null +++ b/kernel/x86_64/zgemm3m_kernel_8x4_sse.S @@ -0,0 +1,3498 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi + +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define BUFFER 128(%rsp) + +#ifdef OPTERON +#define movsd movlps +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (16 * 5 + 8) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (16 * 5 + 8) +#endif + +#define RPREFETCHSIZE (8 * 7 + 4) +#define WPREFETCHSIZE (8 * 8 + 4) + +#ifndef GENERIC +#define KERNEL1(xx) \ + mulps %xmm0, %xmm1 ;\ + addps %xmm1, %xmm8 ;\ + movaps -32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm0, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps -28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm0, %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ + mulps -20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ + addps %xmm5, %xmm10 ;\ + movaps -24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm0, %xmm11 ;\ + movaps -16 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 + +#define KERNEL2(xx) \ + mulps %xmm2, %xmm1 ;\ + addps %xmm1, %xmm12 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm2, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm2, %xmm5 ;\ + mulps -20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ + addps %xmm5, %xmm14 ;\ + movaps -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm2, %xmm15 ;\ + movaps -12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 + +#define KERNEL3(xx) \ + mulps %xmm4, %xmm7 ;\ + addps %xmm7, %xmm8 ;\ + movaps -16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulps %xmm4, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm4, %xmm5 ;\ + mulps -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ + addps %xmm5, %xmm10 ;\ + movaps -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm4, %xmm11 ;\ + movaps -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 + +#define KERNEL4(xx) \ + mulps %xmm6, %xmm7 ;\ + addps %xmm7, %xmm12 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulps %xmm6, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm6, %xmm5 ;\ + mulps -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ + addps %xmm5, %xmm14 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ + addps %xmm6, %xmm15 ;\ + movaps -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 + +#define KERNEL5(xx) \ + mulps %xmm0, %xmm1 ;\ + addps %xmm1, %xmm8 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm0, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm0, %xmm5 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ + addps %xmm5, %xmm10 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm0, %xmm11 ;\ + movaps 0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 + +#define KERNEL6(xx) \ + mulps %xmm2, %xmm1 ;\ + addps %xmm1, %xmm12 ;\ + movaps 32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm2, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm2, %xmm5 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ + addps %xmm5, %xmm14 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm2, %xmm15 ;\ + movaps 4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 + +#define KERNEL7(xx) \ + mulps %xmm4, %xmm7 ;\ + addps %xmm7, %xmm8 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulps %xmm4, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm4, %xmm5 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ + addps %xmm5, %xmm10 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm4, %xmm11 ;\ + movaps 8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 + +#define KERNEL8(xx) \ + mulps %xmm6, %xmm7 ;\ + addps %xmm7, %xmm12 ;\ + movaps 48 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulps %xmm6, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 36 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm6, %xmm5 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ + addps %xmm5, %xmm14 ;\ + movaps 40 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm6, %xmm15 ;\ + movaps 12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 + +#else +#define KERNEL1(xx) \ + mulps %xmm0, %xmm1 ;\ + addps %xmm1, %xmm8 ;\ + movaps -32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulps %xmm0, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps -28 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm0, %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + mulps -20 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ + addps %xmm5, %xmm10 ;\ + movaps -24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm0, %xmm11 ;\ + movaps -16 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 + +#define KERNEL2(xx) \ + mulps %xmm2, %xmm1 ;\ + addps %xmm1, %xmm12 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulps %xmm2, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm2, %xmm5 ;\ + mulps -20 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ + addps %xmm5, %xmm14 ;\ + movaps -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm2, %xmm15 ;\ + movaps -12 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 ;\ + +#define KERNEL3(xx) \ + mulps %xmm4, %xmm7 ;\ + addps %xmm7, %xmm8 ;\ + movaps -16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulps %xmm4, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm4, %xmm5 ;\ + mulps -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ + addps %xmm5, %xmm10 ;\ + movaps -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm4, %xmm11 ;\ + movaps -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 + +#define KERNEL4(xx) \ + mulps %xmm6, %xmm7 ;\ + addps %xmm7, %xmm12 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulps %xmm6, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm6, %xmm5 ;\ + mulps -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ + addps %xmm5, %xmm14 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm6, %xmm15 ;\ + movaps -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 + +#define KERNEL5(xx) \ + mulps %xmm0, %xmm1 ;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addps %xmm1, %xmm8 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulps %xmm0, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm0, %xmm5 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ + addps %xmm5, %xmm10 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm0, %xmm11 ;\ + movaps 0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 + +#define KERNEL6(xx) \ + mulps %xmm2, %xmm1 ;\ + addps %xmm1, %xmm12 ;\ + movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulps %xmm2, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm2, %xmm5 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ + addps %xmm5, %xmm14 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm2, %xmm15 ;\ + movaps 4 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 + +#define KERNEL7(xx) \ + mulps %xmm4, %xmm7 ;\ + addps %xmm7, %xmm8 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulps %xmm4, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm4, %xmm5 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ + addps %xmm5, %xmm10 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm4, %xmm11 ;\ + movaps 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 + +#define KERNEL8(xx) \ + mulps %xmm6, %xmm7 ;\ + addps %xmm7, %xmm12 ;\ + movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulps %xmm6, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm6, %xmm5 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ + addps %xmm5, %xmm14 ;\ + movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm6, %xmm15 ;\ + movaps 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 + +#endif + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + movaps %xmm3, %xmm0 + movss OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + +#endif + + EMMS + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-1024, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + movss %xmm0, 0 + ALPHA + movss %xmm1, 4 + ALPHA + movss %xmm0, 8 + ALPHA + movss %xmm1, 12 + ALPHA + +#ifdef TRMMKERNEL + movsd %xmm4, OFFSET + movsd %xmm4, KK +#ifndef LEFT + negq KK +#endif +#endif + + subq $-32 * SIZE, A + + salq $ZBASE_SHIFT, LDC + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L50 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L03 + + addq %rax, %rax + ALIGN_4 + +.L02: + PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) + + movd 0 * SIZE(B), %mm0 + movd 1 * SIZE(B), %mm1 + movd 2 * SIZE(B), %mm2 + movd 3 * SIZE(B), %mm3 + movd 4 * SIZE(B), %mm4 + movd 5 * SIZE(B), %mm5 + movd 6 * SIZE(B), %mm6 + movd 7 * SIZE(B), %mm7 + + PREFETCHW (WPREFETCHSIZE + 0) * SIZE(BO) + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + punpckldq %mm4, %mm4 + punpckldq %mm5, %mm5 + punpckldq %mm6, %mm6 + punpckldq %mm7, %mm7 + + movq %mm0, 0 * SIZE(BO) + movq %mm0, 2 * SIZE(BO) + movq %mm1, 4 * SIZE(BO) + movq %mm1, 6 * SIZE(BO) + movq %mm2, 8 * SIZE(BO) + movq %mm2, 10 * SIZE(BO) + movq %mm3, 12 * SIZE(BO) + movq %mm3, 14 * SIZE(BO) + + PREFETCHW (WPREFETCHSIZE + 16) * SIZE(BO) + + movq %mm4, 16 * SIZE(BO) + movq %mm4, 18 * SIZE(BO) + movq %mm5, 20 * SIZE(BO) + movq %mm5, 22 * SIZE(BO) + movq %mm6, 24 * SIZE(BO) + movq %mm6, 26 * SIZE(BO) + movq %mm7, 28 * SIZE(BO) + movq %mm7, 30 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L02 + ALIGN_4 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movd 0 * SIZE(B), %mm0 + movd 1 * SIZE(B), %mm1 + movd 2 * SIZE(B), %mm2 + movd 3 * SIZE(B), %mm3 + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + + movq %mm0, 0 * SIZE(BO) + movq %mm0, 2 * SIZE(BO) + movq %mm1, 4 * SIZE(BO) + movq %mm1, 6 * SIZE(BO) + movq %mm2, 8 * SIZE(BO) + movq %mm2, 10 * SIZE(BO) + movq %mm3, 12 * SIZE(BO) + movq %mm3, 14 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + leaq (RPREFETCHSIZE + 0) * SIZE(B), BB + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L20 + ALIGN_4 + +.L11: + PREFETCH 0 * SIZE(BB) + subq $-16 * SIZE, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm1 + pxor %xmm8, %xmm8 + movaps -28 * SIZE(AO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movaps -24 * SIZE(AO), %xmm4 + movaps -24 * SIZE(BO), %xmm5 + pxor %xmm10, %xmm10 + movaps -20 * SIZE(AO), %xmm6 + movaps -16 * SIZE(BO), %xmm7 + pxor %xmm11, %xmm11 + + PREFETCHW 15 * SIZE(CO1) + pxor %xmm12, %xmm12 + PREFETCHW 15 * SIZE(CO2) + pxor %xmm13, %xmm13 + PREFETCHW 15 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + PREFETCHW 15 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif +#ifndef GENERIC + andq $-8, %rax + + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + BRANCH + jl .L12 + ALIGN_3 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + testq $4, %rax + je .L16 + xorq %rax, %rax + ALIGN_3 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addq $64 * SIZE, BO + addq $32 * SIZE, AO + ALIGN_3 +#else + sarq $2, %rax + NOBRANCH + jle .L16 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addq $ 64 * SIZE, BO + subq $-32 * SIZE, AO + decq %rax + BRANCH + jg .L12 +#endif + +.L16: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L18 + + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + ALIGN_4 + +.L17: + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO, %rax, 8), %xmm0 + addps %xmm1, %xmm10 + movaps -32 * SIZE(BO, %rax, 8), %xmm1 + addps %xmm0, %xmm11 + movaps -24 * SIZE(AO, %rax, 4), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm12 + movaps -28 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm13 + movaps -24 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm2, %xmm1 + mulps -20 * SIZE(BO, %rax, 8), %xmm2 + addps %xmm1, %xmm14 + movaps -16 * SIZE(BO, %rax, 8), %xmm1 + addps %xmm2, %xmm15 + movaps -20 * SIZE(AO, %rax, 4), %xmm2 + + addq $SIZE * 2, %rax + jl .L17 + ALIGN_4 + +.L18: + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + movsd 8 * SIZE(CO1), %xmm2 + movhps 10 * SIZE(CO1), %xmm2 + movsd 12 * SIZE(CO1), %xmm3 + movhps 14 * SIZE(CO1), %xmm3 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + pshufd $0x50, %xmm12, %xmm5 + pshufd $0xfa, %xmm12, %xmm12 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm12 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + addps %xmm2, %xmm5 + addps %xmm3, %xmm12 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 4 * SIZE(CO1) + movhps %xmm8, 6 * SIZE(CO1) + movlps %xmm5, 8 * SIZE(CO1) + movhps %xmm5, 10 * SIZE(CO1) + movlps %xmm12, 12 * SIZE(CO1) + movhps %xmm12, 14 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + movsd 4 * SIZE(CO2), %xmm1 + movhps 6 * SIZE(CO2), %xmm1 + movsd 8 * SIZE(CO2), %xmm2 + movhps 10 * SIZE(CO2), %xmm2 + movsd 12 * SIZE(CO2), %xmm3 + movhps 14 * SIZE(CO2), %xmm3 + + pshufd $0x50, %xmm9, %xmm4 + pshufd $0xfa, %xmm9, %xmm9 + pshufd $0x50, %xmm13, %xmm5 + pshufd $0xfa, %xmm13, %xmm13 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm13 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm9 + addps %xmm2, %xmm5 + addps %xmm3, %xmm13 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movlps %xmm9, 4 * SIZE(CO2) + movhps %xmm9, 6 * SIZE(CO2) + movlps %xmm5, 8 * SIZE(CO2) + movhps %xmm5, 10 * SIZE(CO2) + movlps %xmm13, 12 * SIZE(CO2) + movhps %xmm13, 14 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhps 2 * SIZE(CO1, LDC, 2), %xmm0 + movsd 4 * SIZE(CO1, LDC, 2), %xmm1 + movhps 6 * SIZE(CO1, LDC, 2), %xmm1 + movsd 8 * SIZE(CO1, LDC, 2), %xmm2 + movhps 10 * SIZE(CO1, LDC, 2), %xmm2 + movsd 12 * SIZE(CO1, LDC, 2), %xmm3 + movhps 14 * SIZE(CO1, LDC, 2), %xmm3 + + pshufd $0x50, %xmm10, %xmm4 + pshufd $0xfa, %xmm10, %xmm10 + pshufd $0x50, %xmm14, %xmm5 + pshufd $0xfa, %xmm14, %xmm14 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm14 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm10 + addps %xmm2, %xmm5 + addps %xmm3, %xmm14 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + movhps %xmm4, 2 * SIZE(CO1, LDC, 2) + movlps %xmm10, 4 * SIZE(CO1, LDC, 2) + movhps %xmm10, 6 * SIZE(CO1, LDC, 2) + movlps %xmm5, 8 * SIZE(CO1, LDC, 2) + movhps %xmm5, 10 * SIZE(CO1, LDC, 2) + movlps %xmm14, 12 * SIZE(CO1, LDC, 2) + movhps %xmm14, 14 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhps 2 * SIZE(CO2, LDC, 2), %xmm0 + movsd 4 * SIZE(CO2, LDC, 2), %xmm1 + movhps 6 * SIZE(CO2, LDC, 2), %xmm1 + movsd 8 * SIZE(CO2, LDC, 2), %xmm2 + movhps 10 * SIZE(CO2, LDC, 2), %xmm2 + movsd 12 * SIZE(CO2, LDC, 2), %xmm3 + movhps 14 * SIZE(CO2, LDC, 2), %xmm3 + + pshufd $0x50, %xmm11, %xmm4 + pshufd $0xfa, %xmm11, %xmm11 + pshufd $0x50, %xmm15, %xmm5 + pshufd $0xfa, %xmm15, %xmm15 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm11 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm15 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm11 + addps %xmm2, %xmm5 + addps %xmm3, %xmm15 + + movlps %xmm4, 0 * SIZE(CO2, LDC, 2) + movhps %xmm4, 2 * SIZE(CO2, LDC, 2) + movlps %xmm11, 4 * SIZE(CO2, LDC, 2) + movhps %xmm11, 6 * SIZE(CO2, LDC, 2) + movlps %xmm5, 8 * SIZE(CO2, LDC, 2) + movhps %xmm5, 10 * SIZE(CO2, LDC, 2) + movlps %xmm15, 12 * SIZE(CO2, LDC, 2) + movhps %xmm15, 14 * SIZE(CO2, LDC, 2) + + addq $16 * SIZE, CO1 # coffset += 4 + addq $16 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $4, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps -28 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps -24 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + mulps 44 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps -20 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + mulps 60 * SIZE(BO), %xmm8 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm8, %xmm3 + movaps 0 * SIZE(AO), %xmm8 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 76 * SIZE(BO), %xmm10 + addps %xmm9, %xmm2 + movaps 128 * SIZE(BO), %xmm9 + addps %xmm10, %xmm3 + movaps -12 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + mulps 92 * SIZE(BO), %xmm10 + addps %xmm11, %xmm2 + movaps 144 * SIZE(BO), %xmm11 + addps %xmm10, %xmm3 + movaps -8 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + mulps 108 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 160 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps -4 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + mulps 124 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 176 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 16 * SIZE(AO), %xmm10 + + addq $ 32 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps -28 * SIZE(AO), %xmm8 + + addq $ 4 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L28: + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + + pshufd $0x50, %xmm0, %xmm4 + pshufd $0xfa, %xmm0, %xmm0 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm0 + + addps %xmm8, %xmm4 + addps %xmm9, %xmm0 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhps 2 * SIZE(CO2), %xmm8 + movsd 4 * SIZE(CO2), %xmm9 + movhps 6 * SIZE(CO2), %xmm9 + + pshufd $0x50, %xmm1, %xmm4 + pshufd $0xfa, %xmm1, %xmm1 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm1 + + addps %xmm8, %xmm4 + addps %xmm9, %xmm1 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movlps %xmm1, 4 * SIZE(CO2) + movhps %xmm1, 6 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm8 + movhps 2 * SIZE(CO1, LDC, 2), %xmm8 + movsd 4 * SIZE(CO1, LDC, 2), %xmm9 + movhps 6 * SIZE(CO1, LDC, 2), %xmm9 + + pshufd $0x50, %xmm2, %xmm4 + pshufd $0xfa, %xmm2, %xmm2 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm2 + + addps %xmm8, %xmm4 + addps %xmm9, %xmm2 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + movhps %xmm4, 2 * SIZE(CO1, LDC, 2) + movlps %xmm2, 4 * SIZE(CO1, LDC, 2) + movhps %xmm2, 6 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm8 + movhps 2 * SIZE(CO2, LDC, 2), %xmm8 + movsd 4 * SIZE(CO2, LDC, 2), %xmm9 + movhps 6 * SIZE(CO2, LDC, 2), %xmm9 + + pshufd $0x50, %xmm3, %xmm4 + pshufd $0xfa, %xmm3, %xmm3 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm3 + + addps %xmm8, %xmm4 + addps %xmm9, %xmm3 + + movlps %xmm4, 0 * SIZE(CO2, LDC, 2) + movhps %xmm4, 2 * SIZE(CO2, LDC, 2) + movlps %xmm3, 4 * SIZE(CO2, LDC, 2) + movhps %xmm3, 6 * SIZE(CO2, LDC, 2) + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $2, M + je .L40 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -24 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd -28 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + movsd -26 * SIZE(AO), %xmm8 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + movsd -16 * SIZE(AO), %xmm8 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm2 + movaps 76 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movsd -22 * SIZE(AO), %xmm10 + addps %xmm9, %xmm3 + movaps 128 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movaps 92 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movsd -20 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movaps 144 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 108 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd -18 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 160 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 124 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd -8 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 176 * SIZE(BO), %xmm15 + + addq $ 16 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 16 * SIZE(BO), %xmm9 + + addq $ 2 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + + pshufd $0x50, %xmm0, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhps 2 * SIZE(CO2), %xmm8 + + pshufd $0x50, %xmm1, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm8 + movhps 2 * SIZE(CO1, LDC, 2), %xmm8 + + pshufd $0x50, %xmm2, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + movhps %xmm4, 2 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm8 + movhps 2 * SIZE(CO2, LDC, 2), %xmm8 + + pshufd $0x50, %xmm3, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO2, LDC, 2) + movhps %xmm4, 2 * SIZE(CO2, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L40: + testq $1, M + je .L49 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO + leaq (BO, %rax, 8), BO +#endif + + movss -32 * SIZE(AO), %xmm8 + movss -28 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + movss 32 * SIZE(BO), %xmm13 + movss 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L45 + ALIGN_4 + +.L42: + mulss %xmm8, %xmm9 + addss %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 4 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + addss %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + addss %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addss %xmm9, %xmm3 + movss 64 * SIZE(BO), %xmm9 + + mulss %xmm8, %xmm11 + addss %xmm11, %xmm0 + movss 20 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + addss %xmm11, %xmm1 + movss 24 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + addss %xmm11, %xmm2 + movss 28 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + movss -30 * SIZE(AO), %xmm8 + addss %xmm11, %xmm3 + movss 80 * SIZE(BO), %xmm11 + + mulss %xmm8, %xmm13 + addss %xmm13, %xmm0 + movss 36 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + addss %xmm13, %xmm1 + movss 40 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + addss %xmm13, %xmm2 + movss 44 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + movss -29 * SIZE(AO), %xmm8 + addss %xmm13, %xmm3 + movss 96 * SIZE(BO), %xmm13 + + mulss %xmm8, %xmm15 + addss %xmm15, %xmm0 + movss 52 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + addss %xmm15, %xmm1 + movss 56 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + addss %xmm15, %xmm2 + movss 60 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + movss -24 * SIZE(AO), %xmm8 + addss %xmm15, %xmm3 + movss 112 * SIZE(BO), %xmm15 + + mulss %xmm10, %xmm9 + addss %xmm9, %xmm0 + movss 68 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + addss %xmm9, %xmm1 + movss 72 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + addss %xmm9, %xmm2 + movss 76 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + movss -27 * SIZE(AO), %xmm10 + addss %xmm9, %xmm3 + movss 128 * SIZE(BO), %xmm9 + + mulss %xmm10, %xmm11 + addss %xmm11, %xmm0 + movss 84 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + addss %xmm11, %xmm1 + movss 88 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + addss %xmm11, %xmm2 + movss 92 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + movss -26 * SIZE(AO), %xmm10 + addss %xmm11, %xmm3 + movss 144 * SIZE(BO), %xmm11 + + mulss %xmm10, %xmm13 + addss %xmm13, %xmm0 + movss 100 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + addss %xmm13, %xmm1 + movss 104 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + addss %xmm13, %xmm2 + movss 108 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + movss -25 * SIZE(AO), %xmm10 + addss %xmm13, %xmm3 + movss 160 * SIZE(BO), %xmm13 + + mulss %xmm10, %xmm15 + addss %xmm15, %xmm0 + movss 116 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + addss %xmm15, %xmm1 + movss 120 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + addss %xmm15, %xmm2 + movss 124 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + movss -20 * SIZE(AO), %xmm10 + addss %xmm15, %xmm3 + movss 176 * SIZE(BO), %xmm15 + + addq $ 8 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_4 + +.L46: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movss 16 * SIZE(BO), %xmm9 + + addq $ 1 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L46 + ALIGN_4 + +.L48: + movsd 0 * SIZE(CO1), %xmm8 + + pshufd $0x50, %xmm0, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + + pshufd $0x50, %xmm1, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm8 + + pshufd $0x50, %xmm2, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm8 + + pshufd $0x50, %xmm3, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO2, LDC, 2) + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + leaq (C, LDC, 4), C # c += 4 * ldc + decq J # j -- + jg .L01 + +.L50: + testq $2, N + je .L100 + +.L51: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L53 + ALIGN_4 + +.L52: +#if defined(PENTIUM4) || defined(GENERIC) + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + movss 4 * SIZE(B), %xmm4 + movss 5 * SIZE(B), %xmm5 + movss 6 * SIZE(B), %xmm6 + movss 7 * SIZE(B), %xmm7 + + PREFETCHNTA 32 * SIZE(B) + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCHNTA 32 * SIZE(B) + + movd 0 * SIZE(B), %mm0 + movd 1 * SIZE(B), %mm1 + movd 2 * SIZE(B), %mm2 + movd 3 * SIZE(B), %mm3 + movd 4 * SIZE(B), %mm4 + movd 5 * SIZE(B), %mm5 + movd 6 * SIZE(B), %mm6 + movd 7 * SIZE(B), %mm7 + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + punpckldq %mm4, %mm4 + punpckldq %mm5, %mm5 + punpckldq %mm6, %mm6 + punpckldq %mm7, %mm7 + + movq %mm0, 0 * SIZE(BO) + movq %mm0, 2 * SIZE(BO) + movq %mm1, 4 * SIZE(BO) + movq %mm1, 6 * SIZE(BO) + movq %mm2, 8 * SIZE(BO) + movq %mm2, 10 * SIZE(BO) + movq %mm3, 12 * SIZE(BO) + movq %mm3, 14 * SIZE(BO) + movq %mm4, 16 * SIZE(BO) + movq %mm4, 18 * SIZE(BO) + movq %mm5, 20 * SIZE(BO) + movq %mm5, 22 * SIZE(BO) + movq %mm6, 24 * SIZE(BO) + movq %mm6, 26 * SIZE(BO) + movq %mm7, 28 * SIZE(BO) + movq %mm7, 30 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO +#endif + + decq %rax + jne .L52 + ALIGN_4 + +.L53: + movq K, %rax + andq $3, %rax + BRANCH + jle .L60 + ALIGN_4 + +.L54: +#if defined(PENTIUM4) || defined(GENERIC) + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + movd 0 * SIZE(B), %mm0 + movd 1 * SIZE(B), %mm1 + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + + movq %mm0, 0 * SIZE(BO) + movq %mm0, 2 * SIZE(BO) + movq %mm1, 4 * SIZE(BO) + movq %mm1, 6 * SIZE(BO) +#endif + + addq $ 2 * SIZE, B + addq $ 8 * SIZE, BO + decq %rax + jne .L54 + ALIGN_4 + +.L60: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L70 + ALIGN_4 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + movaps 0 * SIZE(AO), %xmm12 + movaps 16 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + PREFETCHW 15 * SIZE(CO1) + pxor %xmm4, %xmm4 + PREFETCHW 15 * SIZE(CO2) + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps -24 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -20 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 16 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps -12 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps -8 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps -4 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 48 * SIZE(AO), %xmm10 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 32 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 4 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 8 * SIZE(AO), %xmm12 + + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 12 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 64 * SIZE(AO), %xmm12 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 48 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 20 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 24 * SIZE(AO), %xmm14 + + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 28 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 80 * SIZE(AO), %xmm14 + + addq $64 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps -24 * SIZE(AO), %xmm8 + + addq $8 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L68: + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + movsd 8 * SIZE(CO1), %xmm10 + movhps 10 * SIZE(CO1), %xmm10 + movsd 12 * SIZE(CO1), %xmm11 + movhps 14 * SIZE(CO1), %xmm11 + + pshufd $0x50, %xmm0, %xmm2 + pshufd $0xfa, %xmm0, %xmm0 + pshufd $0x50, %xmm4, %xmm3 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm7, %xmm2 + mulps %xmm7, %xmm0 + mulps %xmm7, %xmm3 + mulps %xmm7, %xmm4 + + addps %xmm8, %xmm2 + addps %xmm9, %xmm0 + addps %xmm10, %xmm3 + addps %xmm11, %xmm4 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + movlps %xmm3, 8 * SIZE(CO1) + movhps %xmm3, 10 * SIZE(CO1) + movlps %xmm4, 12 * SIZE(CO1) + movhps %xmm4, 14 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhps 2 * SIZE(CO2), %xmm8 + movsd 4 * SIZE(CO2), %xmm9 + movhps 6 * SIZE(CO2), %xmm9 + movsd 8 * SIZE(CO2), %xmm10 + movhps 10 * SIZE(CO2), %xmm10 + movsd 12 * SIZE(CO2), %xmm11 + movhps 14 * SIZE(CO2), %xmm11 + + pshufd $0x50, %xmm1, %xmm2 + pshufd $0xfa, %xmm1, %xmm1 + pshufd $0x50, %xmm5, %xmm3 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm7, %xmm2 + mulps %xmm7, %xmm1 + mulps %xmm7, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm8, %xmm2 + addps %xmm9, %xmm1 + addps %xmm10, %xmm3 + addps %xmm11, %xmm5 + + movlps %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + movlps %xmm1, 4 * SIZE(CO2) + movhps %xmm1, 6 * SIZE(CO2) + movlps %xmm3, 8 * SIZE(CO2) + movhps %xmm3, 10 * SIZE(CO2) + movlps %xmm5, 12 * SIZE(CO2) + movhps %xmm5, 14 * SIZE(CO2) + + addq $16 * SIZE, CO1 # coffset += 4 + addq $16 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L61 + ALIGN_4 + +.L70: + testq $4, M + je .L80 + + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps -24 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 20 * SIZE(BO), %xmm8 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm8, %xmm1 + movaps -20 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 0 * SIZE(AO), %xmm8 + + mulps %xmm10, %xmm13 + mulps 36 * SIZE(BO), %xmm10 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm10, %xmm1 + movaps -12 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + mulps 44 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps -8 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 52 * SIZE(BO), %xmm10 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm10, %xmm1 + movaps -4 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 60 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 16 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + + pshufd $0x50, %xmm0, %xmm2 + pshufd $0xfa, %xmm0, %xmm0 + + mulps %xmm7, %xmm2 + mulps %xmm7, %xmm0 + addps %xmm8, %xmm2 + addps %xmm9, %xmm0 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhps 2 * SIZE(CO2), %xmm8 + movsd 4 * SIZE(CO2), %xmm9 + movhps 6 * SIZE(CO2), %xmm9 + + pshufd $0x50, %xmm1, %xmm2 + pshufd $0xfa, %xmm1, %xmm1 + + mulps %xmm7, %xmm2 + mulps %xmm7, %xmm1 + addps %xmm8, %xmm2 + addps %xmm9, %xmm1 + + movlps %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + movlps %xmm1, 4 * SIZE(CO2) + movhps %xmm1, 6 * SIZE(CO2) + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L80: + testq $2, M + je .L90 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -24 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L85 + ALIGN_4 + +.L82: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -28 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd -26 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd -16 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd -22 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd -20 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd -18 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd -8 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L82 + ALIGN_4 + +.L85: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_4 + +.L86: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L86 + ALIGN_4 + +.L88: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + + pshufd $0x50, %xmm0, %xmm2 + mulps %xmm7, %xmm2 + addps %xmm8, %xmm2 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhps 2 * SIZE(CO2), %xmm8 + + pshufd $0x50, %xmm1, %xmm2 + mulps %xmm7, %xmm2 + addps %xmm8, %xmm2 + + movlps %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L90: + testq $1, M + je .L99 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movss -32 * SIZE(AO), %xmm8 + movss -28 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + movss 32 * SIZE(BO), %xmm13 + movss 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movss 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movss 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movss -29 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movss 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movss 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movss -24 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movss 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movss 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movss -27 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movss 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movss 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movss -26 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movss 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movss 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movss -25 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movss 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movss 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movss -20 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movss 112 * SIZE(BO), %xmm15 + + addq $ 8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L98: + addss %xmm2, %xmm0 + addss %xmm3, %xmm1 + + movsd 0 * SIZE(CO1), %xmm8 + + pshufd $0x50, %xmm0, %xmm2 + mulps %xmm7, %xmm2 + addps %xmm8, %xmm2 + + movlps %xmm2, 0 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + + pshufd $0x50, %xmm1, %xmm2 + mulps %xmm7, %xmm2 + addps %xmm8, %xmm2 + + movlps %xmm2, 0 * SIZE(CO2) + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leaq (C, LDC, 2), C # c += 4 * ldc + ALIGN_4 + + +.L100: + testq $1, N + je .L999 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $3, %rax + jle .L103 + ALIGN_4 + + +.L102: +#if defined(PENTIUM4) || defined(GENERIC) + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + movss 4 * SIZE(B), %xmm4 + movss 5 * SIZE(B), %xmm5 + movss 6 * SIZE(B), %xmm6 + movss 7 * SIZE(B), %xmm7 + + PREFETCHNTA 32 * SIZE(B) + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCHNTA 32 * SIZE(B) + + movd 0 * SIZE(B), %mm0 + movd 1 * SIZE(B), %mm1 + movd 2 * SIZE(B), %mm2 + movd 3 * SIZE(B), %mm3 + movd 4 * SIZE(B), %mm4 + movd 5 * SIZE(B), %mm5 + movd 6 * SIZE(B), %mm6 + movd 7 * SIZE(B), %mm7 + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + punpckldq %mm4, %mm4 + punpckldq %mm5, %mm5 + punpckldq %mm6, %mm6 + punpckldq %mm7, %mm7 + + movq %mm0, 0 * SIZE(BO) + movq %mm0, 2 * SIZE(BO) + movq %mm1, 4 * SIZE(BO) + movq %mm1, 6 * SIZE(BO) + movq %mm2, 8 * SIZE(BO) + movq %mm2, 10 * SIZE(BO) + movq %mm3, 12 * SIZE(BO) + movq %mm3, 14 * SIZE(BO) + movq %mm4, 16 * SIZE(BO) + movq %mm4, 18 * SIZE(BO) + movq %mm5, 20 * SIZE(BO) + movq %mm5, 22 * SIZE(BO) + movq %mm6, 24 * SIZE(BO) + movq %mm6, 26 * SIZE(BO) + movq %mm7, 28 * SIZE(BO) + movq %mm7, 30 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO +#endif + + decq %rax + jne .L102 + ALIGN_4 + +.L103: + movq K, %rax + andq $7, %rax + BRANCH + jle .L110 + ALIGN_4 + +.L104: +#if defined(PENTIUM4) || defined(GENERIC) + movss 0 * SIZE(B), %xmm0 + shufps $0, %xmm0, %xmm0 + movaps %xmm0, 0 * SIZE(BO) +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + movd 0 * SIZE(B), %mm0 + punpckldq %mm0, %mm0 + movq %mm0, 0 * SIZE(BO) + movq %mm0, 2 * SIZE(BO) +#endif + + addq $ 1 * SIZE, B + addq $ 4 * SIZE, BO + decq %rax + jne .L104 + ALIGN_4 + +.L110: + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L120 + ALIGN_4 + +.L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + movaps 0 * SIZE(AO), %xmm12 + movaps 16 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + PREFETCHW 15 * SIZE(CO1) + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulps %xmm9, %xmm8 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps -28 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps -24 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + + mulps %xmm9, %xmm8 + mulps -20 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps 32 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm9, %xmm10 + mulps -12 * SIZE(AO), %xmm9 + addps %xmm10, %xmm0 + movaps -8 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movaps 12 * SIZE(BO), %xmm9 + + mulps %xmm9, %xmm10 + mulps -4 * SIZE(AO), %xmm9 + addps %xmm10, %xmm0 + movaps 48 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movaps 32 * SIZE(BO), %xmm9 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm11, %xmm12 + mulps 4 * SIZE(AO), %xmm11 + addps %xmm12, %xmm0 + movaps 8 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movaps 20 * SIZE(BO), %xmm11 + + mulps %xmm11, %xmm12 + mulps 12 * SIZE(AO), %xmm11 + addps %xmm12, %xmm0 + movaps 64 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm11, %xmm14 + mulps 20 * SIZE(AO), %xmm11 + addps %xmm14, %xmm0 + movaps 24 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movaps 28 * SIZE(BO), %xmm11 + + mulps %xmm11, %xmm14 + mulps 28 * SIZE(AO), %xmm11 + addps %xmm14, %xmm0 + movaps 80 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movaps 48 * SIZE(BO), %xmm11 + + addq $64 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulps %xmm9, %xmm8 + mulps -28 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps -24 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + + addq $8 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + movsd 8 * SIZE(CO1), %xmm10 + movhps 10 * SIZE(CO1), %xmm10 + movsd 12 * SIZE(CO1), %xmm11 + movhps 14 * SIZE(CO1), %xmm11 + + pshufd $0x50, %xmm0, %xmm2 + pshufd $0xfa, %xmm0, %xmm0 + pshufd $0x50, %xmm4, %xmm3 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm7, %xmm2 + mulps %xmm7, %xmm0 + mulps %xmm7, %xmm3 + mulps %xmm7, %xmm4 + + addps %xmm8, %xmm2 + addps %xmm9, %xmm0 + addps %xmm10, %xmm3 + addps %xmm11, %xmm4 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + movlps %xmm3, 8 * SIZE(CO1) + movhps %xmm3, 10 * SIZE(CO1) + movlps %xmm4, 12 * SIZE(CO1) + movhps %xmm4, 14 * SIZE(CO1) + + addq $16 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L111 + ALIGN_4 + +.L120: + testq $4, M + je .L130 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L125 + ALIGN_4 + +.L122: + mulps %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps -28 * SIZE(AO), %xmm8 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 32 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -24 * SIZE(AO), %xmm8 + mulps 8 * SIZE(BO), %xmm8 + addps %xmm8, %xmm2 + movaps -20 * SIZE(AO), %xmm8 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm8, %xmm3 + movaps 0 * SIZE(AO), %xmm8 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + movaps -12 * SIZE(AO), %xmm10 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 48 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps -8 * SIZE(AO), %xmm10 + mulps 24 * SIZE(BO), %xmm10 + addps %xmm10, %xmm2 + movaps -4 * SIZE(AO), %xmm10 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm10, %xmm3 + movaps 16 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L122 + ALIGN_4 + +.L125: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L128 + ALIGN_4 + +.L126: + mulps %xmm8, %xmm9 + movaps -28 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L126 + ALIGN_4 + +.L128: + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + + pshufd $0x50, %xmm0, %xmm2 + pshufd $0xfa, %xmm0, %xmm0 + + mulps %xmm7, %xmm2 + mulps %xmm7, %xmm0 + + addps %xmm8, %xmm2 + addps %xmm9, %xmm0 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L130: + testq $2, M + je .L140 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -24 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L135 + ALIGN_4 + +.L132: + mulps %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -28 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + movsd -26 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 12 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + movsd -16 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 32 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + movsd -22 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd -20 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd -18 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movaps 28 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd -8 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movaps 48 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L132 + ALIGN_4 + +.L135: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L138 + ALIGN_4 + +.L136: + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L136 + ALIGN_4 + +.L138: + addps %xmm1, %xmm0 + + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + + pshufd $0x50, %xmm0, %xmm2 + mulps %xmm7, %xmm2 + addps %xmm8, %xmm2 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L140: + testq $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movss -32 * SIZE(AO), %xmm8 + movss -28 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L145 + ALIGN_4 + +.L142: + mulss %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss -31 * SIZE(AO), %xmm8 + mulss 4 * SIZE(BO), %xmm8 + addss %xmm9, %xmm0 + movss 32 * SIZE(BO), %xmm9 + addss %xmm8, %xmm1 + movss -30 * SIZE(AO), %xmm8 + mulss 8 * SIZE(BO), %xmm8 + addss %xmm8, %xmm2 + movss -29 * SIZE(AO), %xmm8 + mulss 12 * SIZE(BO), %xmm8 + addss %xmm8, %xmm3 + movss -24 * SIZE(AO), %xmm8 + mulss %xmm10, %xmm11 + movss -27 * SIZE(AO), %xmm10 + mulss 20 * SIZE(BO), %xmm10 + addss %xmm11, %xmm0 + movss 48 * SIZE(BO), %xmm11 + addss %xmm10, %xmm1 + movss -26 * SIZE(AO), %xmm10 + mulss 24 * SIZE(BO), %xmm10 + addss %xmm10, %xmm2 + movss -25 * SIZE(AO), %xmm10 + mulss 28 * SIZE(BO), %xmm10 + addss %xmm10, %xmm3 + movss -20 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L142 + ALIGN_4 + +.L145: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L148 + ALIGN_4 + +.L146: + mulss %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addss %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L146 + ALIGN_4 + +.L148: + addss %xmm1, %xmm0 + addss %xmm3, %xmm2 + addss %xmm2, %xmm0 + + movsd 0 * SIZE(CO1), %xmm8 + + pshufd $0x50, %xmm0, %xmm2 + mulps %xmm7, %xmm2 + addps %xmm8, %xmm2 + + movlps %xmm2, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %rbx, %rsp + EMMS + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm3m_kernel_8x4_sse3.S b/kernel/x86_64/zgemm3m_kernel_8x4_sse3.S new file mode 100644 index 0000000..67537a7 --- /dev/null +++ b/kernel/x86_64/zgemm3m_kernel_8x4_sse3.S @@ -0,0 +1,3075 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %r12 +#define BO %r13 +#define CO1 %r14 +#define CO2 %r15 +#define BB %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 320 + +#define KERNEL1(address) \ + mulps %xmm8, %xmm9; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AO); \ + addps %xmm9, %xmm0; \ + movshdup 0 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm1; \ + movsldup 4 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm2; \ + movshdup 4 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + movaps 4 * SIZE + (address) * SIZE(AO), %xmm8; \ + addps %xmm9, %xmm3; \ + movsldup 0 * SIZE + (address) * SIZE(BO), %xmm9 + +#define KERNEL2(address) \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm4; \ + movshdup 0 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm5; \ + movsldup 4 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm6; \ + movshdup 4 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + movaps 8 * SIZE + (address) * SIZE(AO), %xmm8; \ + addps %xmm9, %xmm7; \ + movsldup 8 * SIZE + (address) * SIZE(BO), %xmm9 + +#define KERNEL3(address) \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm0; \ + movshdup 8 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm1; \ + movsldup 12 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm2; \ + movshdup 12 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + movaps 12 * SIZE + (address) * SIZE(AO), %xmm8; \ + addps %xmm9, %xmm3; \ + movsldup 8 * SIZE + (address) * SIZE(BO), %xmm9 + +#define KERNEL4(address) \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm4; \ + movshdup 8 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm5; \ + movsldup 12 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm6; \ + movshdup 12 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + movaps 64 * SIZE + (address) * SIZE(AO), %xmm8; \ + addps %xmm9, %xmm7; \ + movsldup 64 * SIZE + (address) * SIZE(BO), %xmm9 + +#define KERNEL5(address) \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm0; \ + movshdup 16 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm1; \ + movsldup 20 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm2; \ + movshdup 20 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + movaps 20 * SIZE + (address) * SIZE(AO), %xmm10; \ + addps %xmm11, %xmm3; \ + movsldup 16 * SIZE + (address) * SIZE(BO), %xmm11 + +#define KERNEL6(address) \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm4; \ + movshdup 16 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm5; \ + movsldup 20 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm6; \ + movshdup 20 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + movaps 24 * SIZE + (address) * SIZE(AO), %xmm10; \ + addps %xmm11, %xmm7; \ + movsldup 24 * SIZE + (address) * SIZE(BO), %xmm11 + +#define KERNEL7(address) \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm0; \ + movshdup 24 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm1; \ + movsldup 28 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm2; \ + movshdup 28 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + movaps 28 * SIZE + (address) * SIZE(AO), %xmm10; \ + addps %xmm11, %xmm3; \ + movsldup 24 * SIZE + (address) * SIZE(BO), %xmm11 + +#define KERNEL8(address) \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm4; \ + movshdup 24 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm5; \ + movsldup 28 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm6; \ + movshdup 28 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + movaps 80 * SIZE + (address) * SIZE(AO), %xmm10; \ + addps %xmm11, %xmm7; \ + movsldup 80 * SIZE + (address) * SIZE(BO), %xmm11 + +#define KERNEL9(address) \ + mulps %xmm12, %xmm13; \ + PREFETCH (PREFETCHSIZE + 32) * SIZE + (address) * SIZE(AO); \ + addps %xmm13, %xmm0; \ + movshdup 32 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm1; \ + movsldup 36 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm2; \ + movshdup 36 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + movaps 36 * SIZE + (address) * SIZE(AO), %xmm12; \ + addps %xmm13, %xmm3; \ + movsldup 32 * SIZE + (address) * SIZE(BO), %xmm13 + +#define KERNEL10(address) \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm4; \ + movshdup 32 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm5; \ + movsldup 36 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm6; \ + movshdup 36 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + movaps 40 * SIZE + (address) * SIZE(AO), %xmm12; \ + addps %xmm13, %xmm7; \ + movsldup 40 * SIZE + (address) * SIZE(BO), %xmm13 + +#define KERNEL11(address) \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm0; \ + movshdup 40 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm1; \ + movsldup 44 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm2; \ + movshdup 44 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + movaps 44 * SIZE + (address) * SIZE(AO), %xmm12; \ + addps %xmm13, %xmm3; \ + movsldup 40 * SIZE + (address) * SIZE(BO), %xmm13 + +#define KERNEL12(address) \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm4; \ + movshdup 40 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm5; \ + movsldup 44 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm6; \ + movshdup 44 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + movaps 96 * SIZE + (address) * SIZE(AO), %xmm12; \ + addps %xmm13, %xmm7; \ + movsldup 96 * SIZE + (address) * SIZE(BO), %xmm13 + +#define KERNEL13(address) \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm0; \ + movshdup 48 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm1; \ + movsldup 52 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm2; \ + movshdup 52 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + movaps 52 * SIZE + (address) * SIZE(AO), %xmm14; \ + addps %xmm15, %xmm3; \ + movsldup 48 * SIZE + (address) * SIZE(BO), %xmm15 + +#define KERNEL14(address) \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm4; \ + movshdup 48 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm5; \ + movsldup 52 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm6; \ + movshdup 52 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + movaps 56 * SIZE + (address) * SIZE(AO), %xmm14; \ + addps %xmm15, %xmm7; \ + movsldup 56 * SIZE + (address) * SIZE(BO), %xmm15 + +#define KERNEL15(address) \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm0; \ + movshdup 56 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm1; \ + movsldup 60 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm2; \ + movshdup 60 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + movaps 60 * SIZE + (address) * SIZE(AO), %xmm14; \ + addps %xmm15, %xmm3; \ + movsldup 56 * SIZE + (address) * SIZE(BO), %xmm15 + +#define KERNEL16(address) \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm4; \ + movshdup 56 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm5; \ + movsldup 60 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm6; \ + movshdup 60 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + movaps 112 * SIZE + (address) * SIZE(AO), %xmm14; \ + addps %xmm15, %xmm7; \ + movsldup 112 * SIZE + (address) * SIZE(BO), %xmm15 + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + movaps %xmm3, %xmm0 + movss OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-1024, %rsp # align stack + + STACK_TOUCHING + + movss %xmm0, 0 + ALPHA + movss %xmm1, 4 + ALPHA + movss %xmm0, 8 + ALPHA + movss %xmm1, 12 + ALPHA + +#ifdef TRMMKERNEL + movsd %xmm4, OFFSET + movsd %xmm4, KK +#ifndef LEFT + negq KK +#endif +#endif + + salq $ZBASE_SHIFT, LDC + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L50 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + movddup 0 * SIZE(B), %xmm0 + movddup 2 * SIZE(B), %xmm1 + movddup 4 * SIZE(B), %xmm2 + movddup 6 * SIZE(B), %xmm3 + movddup 8 * SIZE(B), %xmm4 + movddup 10 * SIZE(B), %xmm5 + movddup 12 * SIZE(B), %xmm6 + movddup 14 * SIZE(B), %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + prefetcht1 128 * SIZE(BO) + prefetcht0 112 * SIZE(B) + + addq $16 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L02 + ALIGN_4 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movddup 0 * SIZE(B), %xmm0 + movddup 2 * SIZE(B), %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + addq $4 * SIZE, B + addq $8 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + leaq 112 * SIZE(B), BB + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L20 + ALIGN_4 + +.L11: + prefetcht0 0 * SIZE(BB) + prefetcht0 8 * SIZE(BB) + subq $-16 * SIZE, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movsldup 0 * SIZE(BO), %xmm9 + movsldup 16 * SIZE(BO), %xmm11 + movsldup 32 * SIZE(BO), %xmm13 + movsldup 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + + prefetchnta 15 * SIZE(CO1) + pxor %xmm4, %xmm4 + prefetchnta 15 * SIZE(CO2) + pxor %xmm5, %xmm5 + prefetchnta 15 * SIZE(CO1, LDC, 2) + pxor %xmm6, %xmm6 + prefetchnta 15 * SIZE(CO2, LDC, 2) + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + +#if 1 + andq $-8, %rax + salq $4, %rax + je .L15 + +.L1X: + KERNEL1 (64 * 0) + KERNEL2 (64 * 0) + KERNEL3 (64 * 0) + KERNEL4 (64 * 0) + KERNEL5 (64 * 0) + KERNEL6 (64 * 0) + KERNEL7 (64 * 0) + KERNEL8 (64 * 0) + KERNEL9 (64 * 0) + KERNEL10(64 * 0) + KERNEL11(64 * 0) + KERNEL12(64 * 0) + KERNEL13(64 * 0) + KERNEL14(64 * 0) + KERNEL15(64 * 0) + KERNEL16(64 * 0) + cmpq $128 * 1, %rax + NOBRANCH + jle .L12 + KERNEL1 (64 * 1) + KERNEL2 (64 * 1) + KERNEL3 (64 * 1) + KERNEL4 (64 * 1) + KERNEL5 (64 * 1) + KERNEL6 (64 * 1) + KERNEL7 (64 * 1) + KERNEL8 (64 * 1) + KERNEL9 (64 * 1) + KERNEL10(64 * 1) + KERNEL11(64 * 1) + KERNEL12(64 * 1) + KERNEL13(64 * 1) + KERNEL14(64 * 1) + KERNEL15(64 * 1) + KERNEL16(64 * 1) + cmpq $128 * 2, %rax + NOBRANCH + jle .L12 + KERNEL1 (64 * 2) + KERNEL2 (64 * 2) + KERNEL3 (64 * 2) + KERNEL4 (64 * 2) + KERNEL5 (64 * 2) + KERNEL6 (64 * 2) + KERNEL7 (64 * 2) + KERNEL8 (64 * 2) + KERNEL9 (64 * 2) + KERNEL10(64 * 2) + KERNEL11(64 * 2) + KERNEL12(64 * 2) + KERNEL13(64 * 2) + KERNEL14(64 * 2) + KERNEL15(64 * 2) + KERNEL16(64 * 2) + cmpq $128 * 3, %rax + NOBRANCH + jle .L12 + KERNEL1 (64 * 3) + KERNEL2 (64 * 3) + KERNEL3 (64 * 3) + KERNEL4 (64 * 3) + KERNEL5 (64 * 3) + KERNEL6 (64 * 3) + KERNEL7 (64 * 3) + KERNEL8 (64 * 3) + KERNEL9 (64 * 3) + KERNEL10(64 * 3) + KERNEL11(64 * 3) + KERNEL12(64 * 3) + KERNEL13(64 * 3) + KERNEL14(64 * 3) + KERNEL15(64 * 3) + KERNEL16(64 * 3) + cmpq $128 * 4, %rax + NOBRANCH + jle .L12 + KERNEL1 (64 * 4) + KERNEL2 (64 * 4) + KERNEL3 (64 * 4) + KERNEL4 (64 * 4) + KERNEL5 (64 * 4) + KERNEL6 (64 * 4) + KERNEL7 (64 * 4) + KERNEL8 (64 * 4) + KERNEL9 (64 * 4) + KERNEL10(64 * 4) + KERNEL11(64 * 4) + KERNEL12(64 * 4) + KERNEL13(64 * 4) + KERNEL14(64 * 4) + KERNEL15(64 * 4) + KERNEL16(64 * 4) + cmpq $128 * 5, %rax + NOBRANCH + jle .L12 + KERNEL1 (64 * 5) + KERNEL2 (64 * 5) + KERNEL3 (64 * 5) + KERNEL4 (64 * 5) + KERNEL5 (64 * 5) + KERNEL6 (64 * 5) + KERNEL7 (64 * 5) + KERNEL8 (64 * 5) + KERNEL9 (64 * 5) + KERNEL10(64 * 5) + KERNEL11(64 * 5) + KERNEL12(64 * 5) + KERNEL13(64 * 5) + KERNEL14(64 * 5) + KERNEL15(64 * 5) + KERNEL16(64 * 5) + cmpq $128 * 6, %rax + NOBRANCH + jle .L12 + KERNEL1 (64 * 6) + KERNEL2 (64 * 6) + KERNEL3 (64 * 6) + KERNEL4 (64 * 6) + KERNEL5 (64 * 6) + KERNEL6 (64 * 6) + KERNEL7 (64 * 6) + KERNEL8 (64 * 6) + KERNEL9 (64 * 6) + KERNEL10(64 * 6) + KERNEL11(64 * 6) + KERNEL12(64 * 6) + KERNEL13(64 * 6) + KERNEL14(64 * 6) + KERNEL15(64 * 6) + KERNEL16(64 * 6) + cmpq $128 * 7, %rax + NOBRANCH + jle .L12 + KERNEL1 (64 * 7) + KERNEL2 (64 * 7) + KERNEL3 (64 * 7) + KERNEL4 (64 * 7) + KERNEL5 (64 * 7) + KERNEL6 (64 * 7) + KERNEL7 (64 * 7) + KERNEL8 (64 * 7) + KERNEL9 (64 * 7) + KERNEL10(64 * 7) + KERNEL11(64 * 7) + KERNEL12(64 * 7) + KERNEL13(64 * 7) + KERNEL14(64 * 7) + KERNEL15(64 * 7) + KERNEL16(64 * 7) + + addq $64 * 8 * SIZE, AO + addq $64 * 8 * SIZE, BO + subq $128 * 8, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#else + sarq $3, %rax + je .L15 + ALIGN_4 + +.L12: + KERNEL1 (64 * 0) + KERNEL2 (64 * 0) + KERNEL3 (64 * 0) + KERNEL4 (64 * 0) + KERNEL5 (64 * 0) + KERNEL6 (64 * 0) + KERNEL7 (64 * 0) + KERNEL8 (64 * 0) + KERNEL9 (64 * 0) + KERNEL10(64 * 0) + KERNEL11(64 * 0) + KERNEL12(64 * 0) + KERNEL13(64 * 0) + KERNEL14(64 * 0) + KERNEL15(64 * 0) + KERNEL16(64 * 0) + + addq $64 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L12 +#endif + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsldup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm4 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm5 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm6 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm7 + movsldup 8 * SIZE(BO), %xmm9 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jg .L16 + ALIGN_4 + +.L18: + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + movsd 8 * SIZE(CO1), %xmm10 + movhps 10 * SIZE(CO1), %xmm10 + movsd 12 * SIZE(CO1), %xmm11 + movhps 14 * SIZE(CO1), %xmm11 + + pshufd $0x50, %xmm0, %xmm12 + pshufd $0xfa, %xmm0, %xmm0 + pshufd $0x50, %xmm4, %xmm13 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm13 + mulps %xmm15, %xmm4 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm0 + addps %xmm10, %xmm13 + addps %xmm11, %xmm4 + + movlps %xmm12, 0 * SIZE(CO1) + movhps %xmm12, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + movlps %xmm13, 8 * SIZE(CO1) + movhps %xmm13, 10 * SIZE(CO1) + movlps %xmm4, 12 * SIZE(CO1) + movhps %xmm4, 14 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhps 2 * SIZE(CO2), %xmm8 + movsd 4 * SIZE(CO2), %xmm9 + movhps 6 * SIZE(CO2), %xmm9 + movsd 8 * SIZE(CO2), %xmm10 + movhps 10 * SIZE(CO2), %xmm10 + movsd 12 * SIZE(CO2), %xmm11 + movhps 14 * SIZE(CO2), %xmm11 + + pshufd $0x50, %xmm1, %xmm12 + pshufd $0xfa, %xmm1, %xmm1 + pshufd $0x50, %xmm5, %xmm13 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm13 + mulps %xmm15, %xmm5 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm1 + addps %xmm10, %xmm13 + addps %xmm11, %xmm5 + + movlps %xmm12, 0 * SIZE(CO2) + movhps %xmm12, 2 * SIZE(CO2) + movlps %xmm1, 4 * SIZE(CO2) + movhps %xmm1, 6 * SIZE(CO2) + movlps %xmm13, 8 * SIZE(CO2) + movhps %xmm13, 10 * SIZE(CO2) + movlps %xmm5, 12 * SIZE(CO2) + movhps %xmm5, 14 * SIZE(CO2) + + + movsd 0 * SIZE(CO1, LDC, 2), %xmm8 + movhps 2 * SIZE(CO1, LDC, 2), %xmm8 + movsd 4 * SIZE(CO1, LDC, 2), %xmm9 + movhps 6 * SIZE(CO1, LDC, 2), %xmm9 + movsd 8 * SIZE(CO1, LDC, 2), %xmm10 + movhps 10 * SIZE(CO1, LDC, 2), %xmm10 + movsd 12 * SIZE(CO1, LDC, 2), %xmm11 + movhps 14 * SIZE(CO1, LDC, 2), %xmm11 + + pshufd $0x50, %xmm2, %xmm12 + pshufd $0xfa, %xmm2, %xmm2 + pshufd $0x50, %xmm6, %xmm13 + pshufd $0xfa, %xmm6, %xmm6 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm2 + mulps %xmm15, %xmm13 + mulps %xmm15, %xmm6 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm2 + addps %xmm10, %xmm13 + addps %xmm11, %xmm6 + + movlps %xmm12, 0 * SIZE(CO1, LDC, 2) + movhps %xmm12, 2 * SIZE(CO1, LDC, 2) + movlps %xmm2, 4 * SIZE(CO1, LDC, 2) + movhps %xmm2, 6 * SIZE(CO1, LDC, 2) + movlps %xmm13, 8 * SIZE(CO1, LDC, 2) + movhps %xmm13, 10 * SIZE(CO1, LDC, 2) + movlps %xmm6, 12 * SIZE(CO1, LDC, 2) + movhps %xmm6, 14 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm8 + movhps 2 * SIZE(CO2, LDC, 2), %xmm8 + movsd 4 * SIZE(CO2, LDC, 2), %xmm9 + movhps 6 * SIZE(CO2, LDC, 2), %xmm9 + movsd 8 * SIZE(CO2, LDC, 2), %xmm10 + movhps 10 * SIZE(CO2, LDC, 2), %xmm10 + movsd 12 * SIZE(CO2, LDC, 2), %xmm11 + movhps 14 * SIZE(CO2, LDC, 2), %xmm11 + + pshufd $0x50, %xmm3, %xmm12 + pshufd $0xfa, %xmm3, %xmm3 + pshufd $0x50, %xmm7, %xmm13 + pshufd $0xfa, %xmm7, %xmm7 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm3 + mulps %xmm15, %xmm13 + mulps %xmm15, %xmm7 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm3 + addps %xmm10, %xmm13 + addps %xmm11, %xmm7 + + movlps %xmm12, 0 * SIZE(CO2, LDC, 2) + movhps %xmm12, 2 * SIZE(CO2, LDC, 2) + movlps %xmm3, 4 * SIZE(CO2, LDC, 2) + movhps %xmm3, 6 * SIZE(CO2, LDC, 2) + movlps %xmm13, 8 * SIZE(CO2, LDC, 2) + movhps %xmm13, 10 * SIZE(CO2, LDC, 2) + movlps %xmm7, 12 * SIZE(CO2, LDC, 2) + movhps %xmm7, 14 * SIZE(CO2, LDC, 2) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 4 + addq $16 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $4, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movsldup 0 * SIZE(BO), %xmm9 + movsldup 16 * SIZE(BO), %xmm11 + movsldup 32 * SIZE(BO), %xmm13 + movsldup 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsldup 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movsldup 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsldup 64 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movshdup 16 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movsldup 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movshdup 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movaps 12 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movsldup 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movshdup 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movsldup 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movshdup 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movaps 32 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movsldup 80 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movshdup 32 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movsldup 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movshdup 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movaps 20 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movsldup 40 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movshdup 40 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movsldup 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movshdup 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movaps 24 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movsldup 96 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movshdup 48 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movsldup 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movshdup 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movaps 28 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movsldup 56 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movshdup 56 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movsldup 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movshdup 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movaps 48 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movsldup 112 * SIZE(BO), %xmm15 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsldup 8 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jg .L26 + ALIGN_4 + +.L28: + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + + pshufd $0x50, %xmm0, %xmm12 + pshufd $0xfa, %xmm0, %xmm0 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm0 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm0 + + movlps %xmm12, 0 * SIZE(CO1) + movhps %xmm12, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhps 2 * SIZE(CO2), %xmm8 + movsd 4 * SIZE(CO2), %xmm9 + movhps 6 * SIZE(CO2), %xmm9 + + pshufd $0x50, %xmm1, %xmm12 + pshufd $0xfa, %xmm1, %xmm1 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm1 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm1 + + movlps %xmm12, 0 * SIZE(CO2) + movhps %xmm12, 2 * SIZE(CO2) + movlps %xmm1, 4 * SIZE(CO2) + movhps %xmm1, 6 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm8 + movhps 2 * SIZE(CO1, LDC, 2), %xmm8 + movsd 4 * SIZE(CO1, LDC, 2), %xmm9 + movhps 6 * SIZE(CO1, LDC, 2), %xmm9 + + pshufd $0x50, %xmm2, %xmm12 + pshufd $0xfa, %xmm2, %xmm2 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm2 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm2 + + movlps %xmm12, 0 * SIZE(CO1, LDC, 2) + movhps %xmm12, 2 * SIZE(CO1, LDC, 2) + movlps %xmm2, 4 * SIZE(CO1, LDC, 2) + movhps %xmm2, 6 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm8 + movhps 2 * SIZE(CO2, LDC, 2), %xmm8 + movsd 4 * SIZE(CO2, LDC, 2), %xmm9 + movhps 6 * SIZE(CO2, LDC, 2), %xmm9 + + pshufd $0x50, %xmm3, %xmm12 + pshufd $0xfa, %xmm3, %xmm3 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm3 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm3 + + movlps %xmm12, 0 * SIZE(CO2, LDC, 2) + movhps %xmm12, 2 * SIZE(CO2, LDC, 2) + movlps %xmm3, 4 * SIZE(CO2, LDC, 2) + movhps %xmm3, 6 * SIZE(CO2, LDC, 2) + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $2, M + je .L40 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + movddup 8 * SIZE(AO), %xmm10 + movsd 0 * SIZE(BO), %xmm9 + movsd 32 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movsd 12 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsd 16 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movsd 20 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 6 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 24 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movsd 28 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 16 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movsd 64 * SIZE(BO), %xmm9 + addps %xmm11, %xmm0 + movsd 36 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 10 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 40 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movsd 44 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movsd 48 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movsd 52 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 14 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 56 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movsd 60 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 24 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movsd 96 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jg .L36 + ALIGN_4 + +.L38: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 0 * SIZE(CO2), %xmm9 + movhps 2 * SIZE(CO2), %xmm9 + + pshufd $0x50, %xmm0, %xmm12 + pshufd $0xfa, %xmm0, %xmm0 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm0 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm0 + + movlps %xmm12, 0 * SIZE(CO1) + movhps %xmm12, 2 * SIZE(CO1) + movlps %xmm0, 0 * SIZE(CO2) + movhps %xmm0, 2 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm8 + movhps 2 * SIZE(CO1, LDC, 2), %xmm8 + movsd 0 * SIZE(CO2, LDC, 2), %xmm9 + movhps 2 * SIZE(CO2, LDC, 2), %xmm9 + + pshufd $0x50, %xmm1, %xmm12 + pshufd $0xfa, %xmm1, %xmm1 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm1 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm1 + + movlps %xmm12, 0 * SIZE(CO1, LDC, 2) + movhps %xmm12, 2 * SIZE(CO1, LDC, 2) + movlps %xmm1, 0 * SIZE(CO2, LDC, 2) + movhps %xmm1, 2 * SIZE(CO2, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L40: + testq $1, M + je .L49 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + movsd 0 * SIZE(BO), %xmm9 + movsd 32 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L45 + ALIGN_4 + +.L42: + shufps $0, %xmm8, %xmm8 + movhps 4 * SIZE(BO), %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 8 * SIZE(BO), %xmm9 + shufps $0, %xmm8, %xmm8 + movhps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 16 * SIZE(BO), %xmm9 + shufps $0, %xmm8, %xmm8 + movhps 20 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 3 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 24 * SIZE(BO), %xmm9 + shufps $0, %xmm8, %xmm8 + movhps 28 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 64 * SIZE(BO), %xmm9 + shufps $0, %xmm10, %xmm10 + movhps 36 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movss 5 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 40 * SIZE(BO), %xmm11 + shufps $0, %xmm10, %xmm10 + movhps 44 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movss 6 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 48 * SIZE(BO), %xmm11 + shufps $0, %xmm10, %xmm10 + movhps 52 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movss 7 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 56 * SIZE(BO), %xmm11 + shufps $0, %xmm10, %xmm10 + movhps 60 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movss 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 96 * SIZE(BO), %xmm11 + + addq $ 8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_4 + +.L46: + shufps $0, %xmm8, %xmm8 + movhps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 8 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L46 + ALIGN_4 + +.L48: + addps %xmm1, %xmm0 + + movsd 0 * SIZE(CO1), %xmm8 + movhps 0 * SIZE(CO2), %xmm8 + movsd 0 * SIZE(CO1, LDC, 2), %xmm9 + movhps 0 * SIZE(CO2, LDC, 2), %xmm9 + + pshufd $0x50, %xmm0, %xmm12 + pshufd $0xfa, %xmm0, %xmm0 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm0 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm0 + + movlps %xmm12, 0 * SIZE(CO1) + movhps %xmm12, 0 * SIZE(CO2) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 0 * SIZE(CO2, LDC, 2) + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + leaq (C, LDC, 4), C # c += 4 * ldc + decq J # j -- + jg .L01 + +.L50: + testq $2, N + je .L100 + +.L51: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $3, %rax + jle .L53 + ALIGN_4 + +.L52: + movddup 0 * SIZE(B), %xmm0 + movddup 2 * SIZE(B), %xmm1 + movddup 4 * SIZE(B), %xmm2 + movddup 6 * SIZE(B), %xmm3 + movddup 8 * SIZE(B), %xmm4 + movddup 10 * SIZE(B), %xmm5 + movddup 12 * SIZE(B), %xmm6 + movddup 14 * SIZE(B), %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + prefetcht1 128 * SIZE(BO) + prefetcht0 112 * SIZE(B) + + addq $16 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L52 + ALIGN_4 + +.L53: + movq K, %rax + andq $7, %rax + BRANCH + jle .L60 + ALIGN_4 + +.L54: + movddup 0 * SIZE(B), %xmm0 + movaps %xmm0, 0 * SIZE(BO) + + addq $ 2 * SIZE, B + addq $ 4 * SIZE, BO + decq %rax + jne .L54 + ALIGN_4 + +.L60: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L70 + ALIGN_4 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movsldup 0 * SIZE(BO), %xmm9 + movsldup 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + prefetcht2 4 * SIZE(CO1) + pxor %xmm4, %xmm4 + prefetcht2 4 * SIZE(CO2) + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsldup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm4 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm5 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 12 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm4 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 64 * SIZE(AO), %xmm8 + addps %xmm9, %xmm5 + movsldup 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movshdup 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 20 * SIZE(AO), %xmm10 + addps %xmm9, %xmm1 + movsldup 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movshdup 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 24 * SIZE(AO), %xmm10 + addps %xmm9, %xmm5 + movsldup 12 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movshdup 12 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 28 * SIZE(AO), %xmm10 + addps %xmm9, %xmm1 + movsldup 12 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movshdup 12 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 80 * SIZE(AO), %xmm10 + addps %xmm9, %xmm5 + movsldup 32 * SIZE(BO), %xmm9 + mulps %xmm12, %xmm11 + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) + addps %xmm11, %xmm0 + movshdup 16 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 36 * SIZE(AO), %xmm12 + addps %xmm11, %xmm1 + movsldup 16 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm4 + movshdup 16 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 40 * SIZE(AO), %xmm12 + addps %xmm11, %xmm5 + movsldup 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm0 + movshdup 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 44 * SIZE(AO), %xmm12 + addps %xmm11, %xmm1 + movsldup 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm4 + movshdup 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 96 * SIZE(AO), %xmm12 + addps %xmm11, %xmm5 + movsldup 24 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm0 + movshdup 24 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 52 * SIZE(AO), %xmm14 + addps %xmm11, %xmm1 + movsldup 24 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm4 + movshdup 24 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 56 * SIZE(AO), %xmm14 + addps %xmm11, %xmm5 + movsldup 28 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm0 + movshdup 28 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 60 * SIZE(AO), %xmm14 + addps %xmm11, %xmm1 + movsldup 28 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm4 + movshdup 28 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 112 * SIZE(AO), %xmm14 + addps %xmm11, %xmm5 + movsldup 48 * SIZE(BO), %xmm11 + + addq $64 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsldup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm4 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm5 + movsldup 4 * SIZE(BO), %xmm9 + + addq $8 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L66 + ALIGN_4 + +.L68: + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + movsd 8 * SIZE(CO1), %xmm10 + movhps 10 * SIZE(CO1), %xmm10 + movsd 12 * SIZE(CO1), %xmm11 + movhps 14 * SIZE(CO1), %xmm11 + + pshufd $0x50, %xmm0, %xmm12 + pshufd $0xfa, %xmm0, %xmm0 + pshufd $0x50, %xmm4, %xmm13 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm13 + mulps %xmm15, %xmm4 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm0 + addps %xmm10, %xmm13 + addps %xmm11, %xmm4 + + movlps %xmm12, 0 * SIZE(CO1) + movhps %xmm12, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + movlps %xmm13, 8 * SIZE(CO1) + movhps %xmm13, 10 * SIZE(CO1) + movlps %xmm4, 12 * SIZE(CO1) + movhps %xmm4, 14 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhps 2 * SIZE(CO2), %xmm8 + movsd 4 * SIZE(CO2), %xmm9 + movhps 6 * SIZE(CO2), %xmm9 + movsd 8 * SIZE(CO2), %xmm10 + movhps 10 * SIZE(CO2), %xmm10 + movsd 12 * SIZE(CO2), %xmm11 + movhps 14 * SIZE(CO2), %xmm11 + + pshufd $0x50, %xmm1, %xmm12 + pshufd $0xfa, %xmm1, %xmm1 + pshufd $0x50, %xmm5, %xmm13 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm13 + mulps %xmm15, %xmm5 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm1 + addps %xmm10, %xmm13 + addps %xmm11, %xmm5 + + movlps %xmm12, 0 * SIZE(CO2) + movhps %xmm12, 2 * SIZE(CO2) + movlps %xmm1, 4 * SIZE(CO2) + movhps %xmm1, 6 * SIZE(CO2) + movlps %xmm13, 8 * SIZE(CO2) + movhps %xmm13, 10 * SIZE(CO2) + movlps %xmm5, 12 * SIZE(CO2) + movhps %xmm5, 14 * SIZE(CO2) + + addq $16 * SIZE, CO1 # coffset += 4 + addq $16 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L61 + ALIGN_4 + +.L70: + testq $4, M + je .L80 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movsldup 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(AO), %xmm10 + movsldup 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsldup 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 12 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsldup 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 32 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsldup 32 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movshdup 16 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 20 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsldup 20 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movshdup 20 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 24 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movsldup 24 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movshdup 24 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 28 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsldup 28 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movshdup 28 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 48 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movsldup 48 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + + pshufd $0x50, %xmm0, %xmm12 + pshufd $0xfa, %xmm0, %xmm0 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm0 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm0 + + movlps %xmm12, 0 * SIZE(CO1) + movhps %xmm12, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhps 2 * SIZE(CO2), %xmm8 + movsd 4 * SIZE(CO2), %xmm9 + movhps 6 * SIZE(CO2), %xmm9 + + pshufd $0x50, %xmm1, %xmm12 + pshufd $0xfa, %xmm1, %xmm1 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm1 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm1 + + movlps %xmm12, 0 * SIZE(CO2) + movhps %xmm12, 2 * SIZE(CO2) + movlps %xmm1, 4 * SIZE(CO2) + movhps %xmm1, 6 * SIZE(CO2) + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L80: + testq $2, M + je .L90 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + movddup 8 * SIZE(AO), %xmm10 + movsd 0 * SIZE(BO), %xmm9 + movsd 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L85 + ALIGN_4 + +.L82: + shufps $0x50, %xmm9, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 6 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 12 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 16 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 32 * SIZE(BO), %xmm9 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 10 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 20 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 24 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 14 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 28 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 24 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 48 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L82 + ALIGN_4 + +.L85: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_4 + +.L86: + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L86 + ALIGN_4 + +.L88: + addps %xmm1, %xmm0 + + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 0 * SIZE(CO2), %xmm9 + movhps 2 * SIZE(CO2), %xmm9 + + pshufd $0x50, %xmm0, %xmm12 + pshufd $0xfa, %xmm0, %xmm0 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm0 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm0 + + movlps %xmm12, 0 * SIZE(CO1) + movhps %xmm12, 2 * SIZE(CO1) + movlps %xmm0, 0 * SIZE(CO2) + movhps %xmm0, 2 * SIZE(CO2) + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L90: + testq $1, M + je .L99 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + movsd 0 * SIZE(BO), %xmm9 + movsd 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + shufps $0, %xmm8, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + shufps $0, %xmm8, %xmm8 + mulps %xmm8, %xmm9 + movss 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + shufps $0, %xmm8, %xmm8 + mulps %xmm8, %xmm9 + movss 3 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 12 * SIZE(BO), %xmm9 + shufps $0, %xmm8, %xmm8 + mulps %xmm8, %xmm9 + movss 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 32 * SIZE(BO), %xmm9 + shufps $0, %xmm10, %xmm10 + mulps %xmm10, %xmm11 + movss 5 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 20 * SIZE(BO), %xmm11 + shufps $0, %xmm10, %xmm10 + mulps %xmm10, %xmm11 + movss 6 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 24 * SIZE(BO), %xmm11 + shufps $0, %xmm10, %xmm10 + mulps %xmm10, %xmm11 + movss 7 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 28 * SIZE(BO), %xmm11 + shufps $0, %xmm10, %xmm10 + mulps %xmm10, %xmm11 + movss 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 48 * SIZE(BO), %xmm11 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + shufps $0, %xmm8, %xmm8 + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L96 + ALIGN_4 + +.L98: + addps %xmm1, %xmm0 + + movsd 0 * SIZE(CO1), %xmm8 + movhps 0 * SIZE(CO2), %xmm8 + + pshufd $0x50, %xmm0, %xmm12 + + mulps %xmm15, %xmm12 + + addps %xmm8, %xmm12 + + movlps %xmm12, 0 * SIZE(CO1) + movhps %xmm12, 0 * SIZE(CO2) + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leaq (C, LDC, 2), C # c += 4 * ldc + ALIGN_4 + +.L100: + testq $1, N + je .L999 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $3, %rax + jle .L103 + ALIGN_4 + + +.L102: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + movss 4 * SIZE(B), %xmm4 + movss 5 * SIZE(B), %xmm5 + movss 6 * SIZE(B), %xmm6 + movss 7 * SIZE(B), %xmm7 + + movss %xmm0, 0 * SIZE(BO) + movss %xmm0, 1 * SIZE(BO) + movss %xmm1, 2 * SIZE(BO) + movss %xmm1, 3 * SIZE(BO) + movss %xmm2, 4 * SIZE(BO) + movss %xmm2, 5 * SIZE(BO) + movss %xmm3, 6 * SIZE(BO) + movss %xmm3, 7 * SIZE(BO) + movss %xmm4, 8 * SIZE(BO) + movss %xmm4, 9 * SIZE(BO) + movss %xmm5, 10 * SIZE(BO) + movss %xmm5, 11 * SIZE(BO) + movss %xmm6, 12 * SIZE(BO) + movss %xmm6, 13 * SIZE(BO) + movss %xmm7, 14 * SIZE(BO) + movss %xmm7, 15 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $16 * SIZE, BO + + decq %rax + jne .L102 + ALIGN_4 + +.L103: + movq K, %rax + andq $7, %rax + BRANCH + jle .L110 + ALIGN_4 + +.L104: + movss 0 * SIZE(B), %xmm0 + movss %xmm0, 0 * SIZE(BO) + movss %xmm0, 1 * SIZE(BO) + + addq $ 1 * SIZE, B + addq $ 2 * SIZE, BO + decq %rax + jne .L104 + ALIGN_4 + +.L110: + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L120 + ALIGN_4 + +.L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movddup 0 * SIZE(BO), %xmm9 + movddup 8 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + prefetchnta 8 * SIZE(CO1) + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movddup 2 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 12 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 64 * SIZE(AO), %xmm8 + addps %xmm9, %xmm5 + movddup 4 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 20 * SIZE(AO), %xmm10 + addps %xmm9, %xmm0 + movddup 4 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 24 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movddup 6 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 28 * SIZE(AO), %xmm10 + addps %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 80 * SIZE(AO), %xmm10 + addps %xmm9, %xmm5 + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) + movddup 8 * SIZE(BO), %xmm9 + mulps %xmm12, %xmm9 + movaps 36 * SIZE(AO), %xmm12 + addps %xmm9, %xmm0 + movddup 16 * SIZE(BO), %xmm9 + mulps %xmm12, %xmm11 + movaps 40 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movddup 10 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 44 * SIZE(AO), %xmm12 + addps %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 96 * SIZE(AO), %xmm12 + addps %xmm11, %xmm5 + movddup 12 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 52 * SIZE(AO), %xmm14 + addps %xmm11, %xmm0 + movddup 12 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 56 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movddup 14 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 60 * SIZE(AO), %xmm14 + addps %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 112 * SIZE(AO), %xmm14 + addps %xmm11, %xmm5 + movddup 24 * SIZE(BO), %xmm11 + + addq $64 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movddup 2 * SIZE(BO), %xmm9 + + addq $8 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + jg .L116 + ALIGN_4 + +.L118: + addps %xmm1, %xmm0 + addps %xmm5, %xmm4 + + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + movsd 8 * SIZE(CO1), %xmm10 + movhps 10 * SIZE(CO1), %xmm10 + movsd 12 * SIZE(CO1), %xmm11 + movhps 14 * SIZE(CO1), %xmm11 + + pshufd $0x50, %xmm0, %xmm12 + pshufd $0xfa, %xmm0, %xmm0 + pshufd $0x50, %xmm4, %xmm13 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm13 + mulps %xmm15, %xmm4 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm0 + addps %xmm10, %xmm13 + addps %xmm11, %xmm4 + + movlps %xmm12, 0 * SIZE(CO1) + movhps %xmm12, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + movlps %xmm13, 8 * SIZE(CO1) + movhps %xmm13, 10 * SIZE(CO1) + movlps %xmm4, 12 * SIZE(CO1) + movhps %xmm4, 14 * SIZE(CO1) + + addq $16 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L111 + ALIGN_4 + +.L120: + testq $4, M + je .L130 + + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + movaps 0 * SIZE(AO), %xmm8 + movddup 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(AO), %xmm10 + movddup 8 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L125 + ALIGN_4 + +.L122: + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movddup 2 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movddup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 12 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movddup 6 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 32 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movddup 16 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm11 + movaps 20 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movddup 10 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 24 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movddup 12 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 28 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movddup 14 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 48 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movddup 24 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L122 + ALIGN_4 + +.L125: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L128 + ALIGN_4 + +.L126: + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movddup 2 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + jg .L126 + ALIGN_4 + +.L128: + addps %xmm1, %xmm0 + + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + + pshufd $0x50, %xmm0, %xmm12 + pshufd $0xfa, %xmm0, %xmm0 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm0 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm0 + + movlps %xmm12, 0 * SIZE(CO1) + movhps %xmm12, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L130: + testq $2, M + je .L140 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(AO), %xmm10 + movaps 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $4, %rax + je .L135 + ALIGN_4 + +.L132: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm8, %xmm1 + movaps 8 * SIZE(AO), %xmm8 + mulps 8 * SIZE(BO), %xmm8 + addps %xmm8, %xmm2 + movaps 12 * SIZE(AO), %xmm8 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + movaps 32 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm11 + movaps 20 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movaps 48 * SIZE(BO), %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm10, %xmm1 + movaps 24 * SIZE(AO), %xmm10 + mulps 24 * SIZE(BO), %xmm10 + addps %xmm10, %xmm2 + movaps 28 * SIZE(AO), %xmm10 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L132 + ALIGN_4 + +.L135: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $15, %rax # if (k & 1) + BRANCH + je .L138 + ALIGN_4 + +.L136: + movsd 0 * SIZE(AO), %xmm8 + movsd 0 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + jg .L136 + ALIGN_4 + +.L138: + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movhlps %xmm0, %xmm1 + addps %xmm1, %xmm0 + + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + + pshufd $0x50, %xmm0, %xmm12 + + mulps %xmm15, %xmm12 + addps %xmm8, %xmm12 + + movlps %xmm12, 0 * SIZE(CO1) + movhps %xmm12, 2 * SIZE(CO1) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L140: + testq $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + movss 0 * SIZE(BO), %xmm9 + movss 8 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L145 + ALIGN_4 + +.L142: + mulss %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movss 1 * SIZE(AO), %xmm8 + mulss 2 * SIZE(BO), %xmm8 + addss %xmm9, %xmm0 + movss 16 * SIZE(BO), %xmm9 + addss %xmm8, %xmm1 + movss 2 * SIZE(AO), %xmm8 + mulss 4 * SIZE(BO), %xmm8 + addss %xmm8, %xmm2 + movss 3 * SIZE(AO), %xmm8 + mulss 6 * SIZE(BO), %xmm8 + addss %xmm8, %xmm3 + movss 8 * SIZE(AO), %xmm8 + mulss %xmm10, %xmm11 + movss 5 * SIZE(AO), %xmm10 + mulss 10 * SIZE(BO), %xmm10 + addss %xmm11, %xmm0 + movss 24 * SIZE(BO), %xmm11 + addss %xmm10, %xmm1 + movss 6 * SIZE(AO), %xmm10 + mulss 12 * SIZE(BO), %xmm10 + addss %xmm10, %xmm2 + movss 7 * SIZE(AO), %xmm10 + mulss 14 * SIZE(BO), %xmm10 + addss %xmm10, %xmm3 + movss 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L142 + ALIGN_4 + +.L145: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L148 + ALIGN_4 + +.L146: + movss 0 * SIZE(AO), %xmm8 + movss 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + jg .L146 + ALIGN_4 + +.L148: + addss %xmm1, %xmm0 + addss %xmm3, %xmm2 + addss %xmm2, %xmm0 + + movsd 0 * SIZE(CO1), %xmm8 + + pshufd $0x50, %xmm0, %xmm12 + + mulps %xmm15, %xmm12 + addps %xmm8, %xmm12 + + movlps %xmm12, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %rbx, %rsp + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_beta.S b/kernel/x86_64/zgemm_beta.S new file mode 100644 index 0000000..ffc775b --- /dev/null +++ b/kernel/x86_64/zgemm_beta.S @@ -0,0 +1,260 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI + +#define M ARG1 +#define N ARG2 +#define C ARG3 +#define LDC ARG4 +#define C1 ARG5 + +#define STACK_C 16(%rsp) +#define STACK_LDC 24(%rsp) + +#else + +#define STACKSIZE 256 + +#define M ARG1 +#define N ARG2 +#define C ARG3 +#define LDC ARG4 +#define C1 %r10 + +#define STACK_ALPHA_I 40 + STACKSIZE(%rsp) +#define STACK_C 80 + STACKSIZE(%rsp) +#define STACK_LDC 88 + STACKSIZE(%rsp) + +#endif + +#define I %rax + + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + movups %xmm7, 16(%rsp) + movups %xmm8, 32(%rsp) + movups %xmm9, 48(%rsp) + movups %xmm10, 64(%rsp) + movups %xmm11, 80(%rsp) + movups %xmm12, 96(%rsp) + movups %xmm13, 112(%rsp) + movups %xmm14, 128(%rsp) + movups %xmm15, 144(%rsp) + + movaps %xmm3, %xmm0 + movsd STACK_ALPHA_I, %xmm1 +#endif + + pxor %xmm15, %xmm15 + + movq STACK_C, C + movq STACK_LDC, LDC + + testq M, M + jle .L999 + testq N, N + jle .L999 + + salq $ZBASE_SHIFT, LDC + +#ifdef DOUBLE + ucomisd %xmm15, %xmm0 + jne .L71 + ucomisd %xmm15, %xmm1 + jne .L71 +#else + ucomiss %xmm15, %xmm0 + jne .L71 + ucomiss %xmm15, %xmm1 + jne .L71 +#endif + ALIGN_2 + +.L53: + movq C, C1 # c_offset1 = c_offset + addq LDC, C # c_offset += ldc + + movq M, I + sarq $2, I + jle .L56 + ALIGN_2 + +.L57: +#ifdef OPTERON + prefetchw 64 * SIZE(C1) +#endif + + MOVSD %xmm0, 0 * SIZE(C1) # c_offset1 + MOVSD %xmm0, 1 * SIZE(C1) + MOVSD %xmm0, 2 * SIZE(C1) + MOVSD %xmm0, 3 * SIZE(C1) + MOVSD %xmm0, 4 * SIZE(C1) + MOVSD %xmm0, 5 * SIZE(C1) + MOVSD %xmm0, 6 * SIZE(C1) + MOVSD %xmm0, 7 * SIZE(C1) + addq $8 * SIZE, C1 # c_offset1 += 8 + decq I # i-- + jg .L57 + ALIGN_2 + +.L56: + movq M, I + andq $3, I + jle .L62 + ALIGN_2 + +.L63: + MOVSD %xmm0, 0 * SIZE(C1) + MOVSD %xmm0, 1 * SIZE(C1) + addq $2 * SIZE,C1 + decq I + jg .L63 + ALIGN_2 + +.L62: + decq N # j -- + jg .L53 + jmp .L999 + ALIGN_3 + +.L71: + movq C, C1 + addq LDC, C # c_offset += ldc + + movq M, I + sarq $1, I + jle .L84 + ALIGN_3 + +.L85: +#ifdef OPTERON + prefetchw 16 * SIZE(C1) +#endif + + MOVSD 0 * SIZE(C1), %xmm2 + MOVSD 1 * SIZE(C1), %xmm3 + MOVSD 0 * SIZE(C1), %xmm4 + MOVSD 1 * SIZE(C1), %xmm5 + + MOVSD 2 * SIZE(C1), %xmm6 + MOVSD 3 * SIZE(C1), %xmm7 + MOVSD 2 * SIZE(C1), %xmm8 + MOVSD 3 * SIZE(C1), %xmm9 + + MULSD %xmm0, %xmm2 + MULSD %xmm1, %xmm3 + MULSD %xmm1, %xmm4 + MULSD %xmm0, %xmm5 + + MULSD %xmm0, %xmm6 + MULSD %xmm1, %xmm7 + MULSD %xmm1, %xmm8 + MULSD %xmm0, %xmm9 + + SUBSD %xmm3, %xmm2 + ADDPD %xmm5, %xmm4 + SUBSD %xmm7, %xmm6 + ADDPD %xmm9, %xmm8 + + MOVSD %xmm2, 0 * SIZE(C1) + MOVSD %xmm4, 1 * SIZE(C1) + MOVSD %xmm6, 2 * SIZE(C1) + MOVSD %xmm8, 3 * SIZE(C1) + addq $4 * SIZE, C1 + decq I + jg .L85 + ALIGN_3 + +.L84: + testq $1, M + jle .L74 + ALIGN_3 + +.L75: + prefetchnta 80 * SIZE(C1) + + MOVSD 0 * SIZE(C1), %xmm2 + MULSD %xmm0, %xmm2 + MOVSD 1 * SIZE(C1), %xmm3 + MULSD %xmm1, %xmm3 + MOVSD 0 * SIZE(C1), %xmm4 + MULSD %xmm1, %xmm4 + MOVSD 1 * SIZE(C1), %xmm5 + MULSD %xmm0, %xmm5 + + SUBSD %xmm3, %xmm2 + ADDPD %xmm5, %xmm4 + + MOVSD %xmm2, 0 * SIZE(C1) + MOVSD %xmm4, 1 * SIZE(C1) + ALIGN_2 + +.L74: + decq N + jg .L71 + ALIGN_2 + +.L999: +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + movups 16(%rsp), %xmm7 + movups 32(%rsp), %xmm8 + movups 48(%rsp), %xmm9 + movups 64(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + movups 96(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups 128(%rsp), %xmm14 + movups 144(%rsp), %xmm15 + + addq $STACKSIZE, %rsp +#endif + + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_1x4_nehalem.S b/kernel/x86_64/zgemm_kernel_1x4_nehalem.S new file mode 100644 index 0000000..e72a19c --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_1x4_nehalem.S @@ -0,0 +1,1093 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %rbp + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rdx +#define BB %r12 + +#define PREA %r10 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define J 64(%rsp) +#define OFFSET 72(%rsp) +#define KK 80(%rsp) +#define KKK 88(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define ALPHA_R 224(%rsp) +#define ALPHA_I 232(%rsp) +#define J 240(%rsp) +#define OFFSET 248(%rsp) +#define KK 256(%rsp) +#define KKK 264(%rsp) + +#endif + +#define PREFETCHSIZE 4 +#define PREFETCH prefetcht0 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 addpd +#define ADD2 addpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 addpd +#define ADD2 addpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 addpd +#define ADD2 addpd +#else +#define ADD1 addpd +#define ADD2 subpd +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + +#endif + + movlps %xmm0, ALPHA_R + movlps %xmm1, ALPHA_I + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + salq $ZBASE_SHIFT, LDC + +#ifdef TRMMKERNEL + movq %r11, OFFSET +#ifndef LEFT + negq %r11 +#endif + movq %r11, KK +#endif + testq M, M + jle .L999 + + movq N, J + sarq $2, J + NOBRANCH + jle .L20 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 + movq A, AO + + movq K, %rax + salq $ZBASE_SHIFT + 2, %rax + leaq (B, %rax), BB + + movq M, I + ALIGN_4 + +.L11: + prefetcht2 -16 * SIZE(BB) + subq $-8 * SIZE, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + PADDING + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht0 1 * SIZE(CO1) + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + prefetcht0 3 * SIZE(CO1, LDC) + xorps %xmm11, %xmm11 + + movaps -16 * SIZE(AO), %xmm0 + + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + prefetcht0 1 * SIZE(CO2) + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + prefetcht0 3 * SIZE(CO2, LDC) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + movaps -14 * SIZE(AO), %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps -6 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + ADD1 %xmm1, %xmm12 + movaps 0 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps 2 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps 4 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps 6 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + movaps -10 * SIZE(AO), %xmm5 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm12 + movaps 8 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps 10 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm5, %xmm3 + PADDING; + mulpd %xmm5, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps 12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + PADDING; + mulpd %xmm5, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps 14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm5, %xmm3 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm5, %xmm4 + + subq $-32 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: + movddup ALPHA_R, %xmm6 + movddup ALPHA_I, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + ADD1 %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: + ADD1 %xmm1, %xmm12 + ADD2 %xmm2, %xmm13 + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + shufps $0x40, %xmm0, %xmm0 + + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + xorps %xmm0, %xmm12 + xorps %xmm0, %xmm14 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0x04, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + xorps %xmm0, %xmm13 + xorps %xmm0, %xmm15 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + shufps $0x40, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + xorps %xmm0, %xmm13 + xorps %xmm0, %xmm15 +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm11, %xmm10 + haddpd %xmm13, %xmm12 + haddpd %xmm15, %xmm14 + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm10, %xmm11 + pshufd $0x4e, %xmm12, %xmm13 + pshufd $0x4e, %xmm14, %xmm15 + + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm6, %xmm10 + mulpd %xmm7, %xmm11 + + mulpd %xmm6, %xmm12 + mulpd %xmm7, %xmm13 + mulpd %xmm6, %xmm14 + mulpd %xmm7, %xmm15 + + addsubpd %xmm9, %xmm8 + addsubpd %xmm11, %xmm10 + addsubpd %xmm13, %xmm12 + addsubpd %xmm15, %xmm14 + + testq $15, CO1 + NOBRANCH + jne .L18x + +#ifndef TRMMKERNEL + movaps (CO1), %xmm0 + movaps (CO1, LDC), %xmm1 + movaps (CO2), %xmm2 + movaps (CO2, LDC), %xmm3 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm10 + addpd %xmm2, %xmm12 + addpd %xmm3, %xmm14 +#endif + + movaps %xmm8, (CO1) + movaps %xmm10, (CO1, LDC) + movaps %xmm12, (CO2) + movaps %xmm14, (CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + decq I + BRANCH + jg .L11 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $1, KK +#endif + + leaq (C, LDC, 4), C + movq BO, B + + subq $1, J + BRANCH + jg .L01 + jmp .L20 + ALIGN_4 + +.L18x: +#ifndef TRMMKERNEL + movups (CO1), %xmm0 + movups (CO1, LDC), %xmm1 + movups (CO2), %xmm2 + movups (CO2, LDC), %xmm3 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm10 + addpd %xmm2, %xmm12 + addpd %xmm3, %xmm14 +#endif + + movups %xmm8, (CO1) + movups %xmm10, (CO1, LDC) + movups %xmm12, (CO2) + movups %xmm14, (CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + decq I + BRANCH + jg .L11 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $1, KK +#endif + + leaq (C, LDC, 4), C + movq BO, B + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L20: + testq $2, N + BRANCH + jle .L30 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + + movq M, I + ALIGN_4 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht0 1 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO2) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -12 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -8 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -6 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -10 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + ADD1 %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + ADD1 %xmm3, %xmm10 + ADD2 %xmm4, %xmm11 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + shufps $0x40, %xmm0, %xmm0 + + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0x04, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + shufps $0x40, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm11, %xmm10 + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm10, %xmm11 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm9 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm11 + + addsubpd %xmm9, %xmm8 + addsubpd %xmm11, %xmm10 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm1 + movhpd 1 * SIZE(CO2), %xmm1 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm10 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + decq I + BRANCH + jg .L21 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + leaq (C, LDC, 2), C + movq BO, B + ALIGN_4 + +.L30: + testq $1, N + BRANCH + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + movq A, AO + + movq M, I + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm10 + movaps -14 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L32 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + ALIGN_3 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L38 + ALIGN_3 + +.L36: + ADD1 %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + shufps $0x40, %xmm0, %xmm0 + + xorps %xmm0, %xmm8 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0x04, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + shufps $0x40, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 +#endif + + haddpd %xmm9, %xmm8 + pshufd $0x4e, %xmm8, %xmm9 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm9 + + addsubpd %xmm9, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + + addpd %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 + decq I + BRANCH + jg .L31 + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_2x1_atom.S b/kernel/x86_64/zgemm_kernel_2x1_atom.S new file mode 100644 index 0000000..be42e03 --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_2x1_atom.S @@ -0,0 +1,769 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define BB %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KKK 72(%rsp) +#define KK 80(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define ALPHA_R 224(%rsp) +#define ALPHA_I 232(%rsp) +#define OFFSET 240(%rsp) +#define KKK 248(%rsp) +#define KK 256(%rsp) +#endif + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 8 + 3) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADDSD1 addsd +#define ADDSD2 addsd +#define ADDSD3 addsd +#define ADDSD4 subsd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADDSD1 addsd +#define ADDSD2 subsd +#define ADDSD3 addsd +#define ADDSD4 addsd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADDSD1 addsd +#define ADDSD2 addsd +#define ADDSD3 subsd +#define ADDSD4 addsd +#else +#define ADDSD1 addsd +#define ADDSD2 subsd +#define ADDSD3 subsd +#define ADDSD4 subsd +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + +#endif + + movsd %xmm0, ALPHA_R + movsd %xmm1, ALPHA_I + +#ifdef TRMMKERNEL + movsd %xmm4, OFFSET + movsd %xmm4, KK +#ifndef LEFT + negq KK +#endif +#endif + + salq $ZBASE_SHIFT, LDC + + movq N, J + testq N, N + jle .L999 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + addq LDC, C + + movq A, AO + + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax), BB + + movq M, I + sarq $1, I + jle .L20 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + prefetcht0 0 * SIZE(BB) + subq $-8 * SIZE, BB + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + prefetcht0 3 * SIZE(CO1) + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + ADDSD2 %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm15 + PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO) + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD3 %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD3 %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 12 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD3 %xmm4, %xmm10 + movsd 13 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 6 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 14 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 15 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + subq $-16 * SIZE, AO + + ADDSD4 %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + addq $ 8 * SIZE, BO + + ADDSD3 %xmm4, %xmm10 + movsd 1 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + decq %rax + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 0 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 2 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 1 * SIZE(BO), %xmm3 + + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + BRANCH + je .L18 + ALIGN_4 + +.L16: + ADDSD2 %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD3 %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L16 + ALIGN_4 + +.L18: + movsd ALPHA_R, %xmm0 + movsd ALPHA_I, %xmm1 + + ADDSD2 %xmm2, %xmm13 + ADDSD3 %xmm7, %xmm14 + ADDSD4 %xmm6, %xmm15 + + addsd %xmm11, %xmm8 + addsd %xmm9, %xmm10 + addsd %xmm15, %xmm12 + addsd %xmm13, %xmm14 + + movaps %xmm8, %xmm9 + movaps %xmm10, %xmm11 + movaps %xmm12, %xmm13 + movaps %xmm14, %xmm15 + + mulsd %xmm0, %xmm8 + mulsd %xmm1, %xmm9 + mulsd %xmm1, %xmm10 + mulsd %xmm0, %xmm11 + + subsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + + mulsd %xmm0, %xmm12 + mulsd %xmm1, %xmm13 + mulsd %xmm1, %xmm14 + mulsd %xmm0, %xmm15 + + subsd %xmm14, %xmm12 + addsd %xmm15, %xmm13 + +#if !defined(TRMMKERNEL) && !defined(BETAZERO) + addsd 0 * SIZE(CO1), %xmm8 + addsd 1 * SIZE(CO1), %xmm9 + addsd 2 * SIZE(CO1), %xmm12 + addsd 3 * SIZE(CO1), %xmm13 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm9, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movsd %xmm13, 3 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 + decq I + jg .L10 + ALIGN_4 + +.L20: + testq $1, M + jle .L99 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + movsd 3 * SIZE(AO), %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADDSD2 %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + ADDSD1 %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + ADDSD3 %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm8 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + ADDSD3 %xmm7, %xmm10 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 6 * SIZE(BO), %xmm1 + + ADDSD1 %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + ADDSD3 %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 8 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm8 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + ADDSD3 %xmm7, %xmm10 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 9 * SIZE(BO), %xmm3 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + BRANCH + je .L29 + ALIGN_4 + +.L26: + ADDSD2 %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + mulsd %xmm3, %xmm2 + ADDSD1 %xmm0, %xmm8 + movsd 2 * SIZE(AO), %xmm0 + + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + ADDSD3 %xmm4, %xmm10 + movsd 3 * SIZE(AO), %xmm4 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L26 + ALIGN_4 + +.L29: + movsd ALPHA_R, %xmm0 + movsd ALPHA_I, %xmm1 + + ADDSD2 %xmm2, %xmm9 + ADDSD4 %xmm6, %xmm11 + + addsd %xmm11, %xmm8 + addsd %xmm9, %xmm10 + + movaps %xmm8, %xmm9 + movaps %xmm10, %xmm11 + + mulsd %xmm0, %xmm8 + mulsd %xmm1, %xmm9 + mulsd %xmm1, %xmm10 + mulsd %xmm0, %xmm11 + + subsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addsd 0 * SIZE(CO1), %xmm8 + addsd 1 * SIZE(CO1), %xmm9 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm9, 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $1, KK +#endif + + movq BO, B + decq J # j -- + jg .L01 + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_2x2_barcelona.S b/kernel/x86_64/zgemm_kernel_2x2_barcelona.S new file mode 100644 index 0000000..31fad2b --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_2x2_barcelona.S @@ -0,0 +1,1423 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbp +#define CO2 %rbx +#define BB %r12 +#define J %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define ALPHA_R 224(%rsp) +#define ALPHA_I 232(%rsp) +#define OFFSET 240(%rsp) +#define KK 248(%rsp) +#define KKK 256(%rsp) + +#endif + +#define movlpd movsd +#define movapd movups +#define movupd movups + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 addpd +#define ADD2 addpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 subpd +#define ADD2 addpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 addpd +#define ADD2 subpd +#else +#define ADD1 subpd +#define ADD2 subpd +#endif + +#define KERNEL1(xx) \ + mulpd %xmm1, %xmm0 ;\ + ADD1 %xmm0, %xmm8 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + movapd %xmm2, %xmm0 ;\ + ADD1 %xmm1, %xmm12 ;\ + movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + ADD2 %xmm3, %xmm13 ;\ + movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + ADD1 %xmm0, %xmm10 ;\ + movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ + ADD1 %xmm1, %xmm14 ;\ + movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm11 ;\ + ADD2 %xmm3, %xmm15 ;\ + movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + +#define KERNEL2(xx) \ + mulpd %xmm1, %xmm0 ;\ + ADD1 %xmm0, %xmm8 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + movapd %xmm2, %xmm0 ;\ + ADD1 %xmm1, %xmm12 ;\ +/*A*/ movapd (AO, %rax, 4), %xmm6 ;\ + movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + ADD2 %xmm3, %xmm13 ;\ + movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + ADD1 %xmm0, %xmm10 ;\ + ADD1 %xmm1, %xmm14 ;\ +/**/ movddup (BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm11 ;\ + ADD2 %xmm3, %xmm15 ;\ + movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL3(xx) \ + mulpd %xmm5, %xmm4 ;\ + ADD1 %xmm4, %xmm8 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + movapd %xmm2, %xmm4 ;\ + ADD1 %xmm5, %xmm12 ;\ + movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + ADD2 %xmm3, %xmm13 ;\ + movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + ADD1 %xmm4, %xmm10 ;\ + movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ + ADD1 %xmm5, %xmm14 ;\ + movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm11 ;\ + ADD2 %xmm3, %xmm15 ;\ + movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL4(xx) \ + mulpd %xmm5, %xmm4 ;\ + ADD1 %xmm4, %xmm8 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + movapd %xmm2, %xmm4 ;\ + ADD1 %xmm5, %xmm12 ;\ +/*A*/ movapd 8 * SIZE(AO, %rax, 4), %xmm7 ;\ + movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + ADD2 %xmm3, %xmm13 ;\ + movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + ADD1 %xmm4, %xmm10 ;\ + ADD1 %xmm5, %xmm14 ;\ +/**/ movddup 8 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm11 ;\ + ADD2 %xmm3, %xmm15 ;\ + movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm6, %xmm2 + +#define KERNEL5(xx) \ + mulpd %xmm1, %xmm6 ;\ + ADD1 %xmm6, %xmm8 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ + movapd %xmm2, %xmm6 ;\ + ADD1 %xmm1, %xmm12 ;\ + movddup 2 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm9 ;\ + movapd %xmm6, %xmm2 ;\ + ADD2 %xmm3, %xmm13 ;\ + movddup 3 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm6 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ + ADD1 %xmm6, %xmm10 ;\ + movapd 4 * SIZE(AO, %rax, 4), %xmm6 ;\ + ADD1 %xmm1, %xmm14 ;\ + movddup 4 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm11 ;\ + ADD2 %xmm3, %xmm15 ;\ + movddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm6, %xmm2 + +#define KERNEL6(xx) \ + mulpd %xmm1, %xmm6 ;\ + ADD1 %xmm6, %xmm8 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ + movapd %xmm2, %xmm6 ;\ + ADD1 %xmm1, %xmm12 ;\ +/*A*/ movapd 16 * SIZE(AO, %rax, 4), %xmm0 ;\ + movddup 6 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm9 ;\ + movapd %xmm6, %xmm2 ;\ + ADD2 %xmm3, %xmm13 ;\ + movddup 7 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm6 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ + ADD1 %xmm6, %xmm10 ;\ + ADD1 %xmm1, %xmm14 ;\ +/**/ movddup 16 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm11 ;\ + ADD2 %xmm3, %xmm15 ;\ + movddup 9 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm7, %xmm2 + +#define KERNEL7(xx) \ + mulpd %xmm5, %xmm7 ;\ + ADD1 %xmm7, %xmm8 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ + movapd %xmm2, %xmm7 ;\ + ADD1 %xmm5, %xmm12 ;\ + movddup 10 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm9 ;\ + movapd %xmm7, %xmm2 ;\ + ADD2 %xmm3, %xmm13 ;\ + movddup 11 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm7 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ + ADD1 %xmm7, %xmm10 ;\ + movapd 12 * SIZE(AO, %rax, 4), %xmm7 ;\ + ADD1 %xmm5, %xmm14 ;\ + movddup 12 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm11 ;\ + ADD2 %xmm3, %xmm15 ;\ + movddup 13 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm7, %xmm2 + +#define KERNEL8(xx) \ + mulpd %xmm5, %xmm7 ;\ + ADD1 %xmm7, %xmm8 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ + movapd %xmm2, %xmm7 ;\ + ADD1 %xmm5, %xmm12 ;\ +/*A*/ movapd 24 * SIZE(AO, %rax, 4), %xmm4 ;\ + movddup 14 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm9 ;\ + movapd %xmm7, %xmm2 ;\ + ADD2 %xmm3, %xmm13 ;\ + movddup 15 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm7 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ + ADD1 %xmm7, %xmm10 ;\ + ADD1 %xmm5, %xmm14 ;\ +/**/ movddup 24 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm11 ;\ + ADD2 %xmm3, %xmm15 ;\ + movddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 ;\ + addq $8 * SIZE, %rax ;\ + +#define KERNEL_SUB1(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + ADD1 %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + ADD1 %xmm1, %xmm12 ;\ + movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + ADD2 %xmm3, %xmm13 ;\ + movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + ADD1 %xmm0, %xmm10 ;\ + movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ + ADD1 %xmm1, %xmm14 ;\ + movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm11 ;\ + ADD2 %xmm3, %xmm15 ;\ + movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + +#define KERNEL_SUB2(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + ADD1 %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + ADD1 %xmm1, %xmm12 ;\ + movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + ADD2 %xmm3, %xmm13 ;\ + movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + ADD1 %xmm0, %xmm10 ;\ + movapd (AO, %rax, 4), %xmm0 ;\ + ADD1 %xmm1, %xmm14 ;\ + movddup (BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm11 ;\ + ADD2 %xmm3, %xmm15 ;\ + movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL_SUB3(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + ADD1 %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + ADD1 %xmm5, %xmm12 ;\ + movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + ADD2 %xmm3, %xmm13 ;\ + movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + ADD1 %xmm4, %xmm10 ;\ + movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ + ADD1 %xmm5, %xmm14 ;\ + movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm11 ;\ + ADD2 %xmm3, %xmm15 ;\ + movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL_SUB4(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + ADD1 %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + ADD1 %xmm5, %xmm12 ;\ + movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + ADD2 %xmm3, %xmm13 ;\ + movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + ADD1 %xmm4, %xmm10 ;\ + ADD1 %xmm5, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm11 ;\ + ADD2 %xmm3, %xmm15 ;\ + movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) && !defined(TRMMKERNEL) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq OLD_M, M + movq OLD_N, N + + movlpd %xmm0, ALPHA_R + movlpd %xmm1, ALPHA_I + +#ifdef TRMMKERNEL + movlpd %xmm12, OFFSET + movlpd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + salq $ZBASE_SHIFT, LDC + + movq N, J + sarq $1, J + jle .L100 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + + movq A, AO # aoffset = a + + movq K, %rax + salq $ZBASE_SHIFT + 1, %rax + leaq (B, %rax), BB + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq B, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm8, %xmm8 + movddup -15 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm4 + pxor %xmm10, %xmm10 + movddup -8 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + + prefetchw 3 * SIZE(CO1) + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + prefetchw 7 * SIZE(CO2) + pxor %xmm14, %xmm14 + pxor %xmm15, %xmm15 + movapd %xmm0, %xmm2 + + prefetch -16 * SIZE(BB) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + andq $-8, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_4 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + jl .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + testq $4, %rax + je .L16 + xorq %rax, %rax + ALIGN_4 + + KERNEL_SUB1(16 * 0) + KERNEL_SUB2(16 * 0) + KERNEL_SUB3(16 * 0) + KERNEL_SUB4(16 * 0) + + subq $-16 * SIZE, BO + subq $-16 * SIZE, AO + ALIGN_4 + +.L16: + movddup ALPHA_R, %xmm6 + movddup ALPHA_I, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + je .L19 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L17: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + ADD1 %xmm0, %xmm8 + movapd %xmm2, %xmm0 + ADD1 %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm3, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 + ADD2 %xmm2, %xmm9 + movapd %xmm0, %xmm2 + ADD2 %xmm3, %xmm13 + movddup -13 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + ADD1 %xmm0, %xmm10 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + ADD1 %xmm1, %xmm14 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm3, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 + ADD2 %xmm2, %xmm11 + ADD2 %xmm3, %xmm15 + movddup -11 * SIZE(BO, %rax, 4), %xmm3 + movapd %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L17 + ALIGN_4 + +.L19: + prefetch -8 * SIZE(BB) + subq $-16 * SIZE, BB + +#ifndef TRMMKERNEL + movupd (CO1), %xmm0 + movupd 2 * SIZE(CO1), %xmm2 + movupd (CO2), %xmm1 + movupd 2 * SIZE(CO2), %xmm3 +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm11, %xmm11 + SHUFPD_1 %xmm13, %xmm13 + SHUFPD_1 %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + addsubpd %xmm9, %xmm8 + addsubpd %xmm11, %xmm10 + addsubpd %xmm13, %xmm12 + addsubpd %xmm15, %xmm14 + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm10, %xmm11 + pshufd $0x4e, %xmm12, %xmm13 + pshufd $0x4e, %xmm14, %xmm15 + +#else + addsubpd %xmm8, %xmm9 + addsubpd %xmm10, %xmm11 + addsubpd %xmm12, %xmm13 + addsubpd %xmm14, %xmm15 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm9 + movapd %xmm11, %xmm10 + pshufd $0x4e, %xmm11, %xmm11 + movapd %xmm13, %xmm12 + pshufd $0x4e, %xmm13, %xmm13 + movapd %xmm15, %xmm14 + pshufd $0x4e, %xmm15, %xmm15 +#endif + + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm6, %xmm10 + mulpd %xmm7, %xmm11 + + mulpd %xmm6, %xmm12 + mulpd %xmm7, %xmm13 + mulpd %xmm6, %xmm14 + mulpd %xmm7, %xmm15 + + addsubpd %xmm9, %xmm8 + addsubpd %xmm11, %xmm10 + addsubpd %xmm13, %xmm12 + addsubpd %xmm15, %xmm14 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm12 + addpd %xmm1, %xmm10 + addpd %xmm3, %xmm14 +#endif + + movlpd %xmm8, (CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movlpd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + + movlpd %xmm10, (CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movlpd %xmm14, 2 * SIZE(CO2) + movhpd %xmm14, 3 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L10 + ALIGN_4 + +.L30: + testq $1, M + jle .L99 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq B, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -12 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + movddup -8 * SIZE(BO), %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L46 + ALIGN_4 + +.L42: + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + ADD2 %xmm5, %xmm9 + movddup -13 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + ADD2 %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm8 + movddup -10 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + ADD2 %xmm5, %xmm9 + movddup -9 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm10 + movddup (BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -8 * SIZE(AO, %rax, 2), %xmm0 + ADD2 %xmm5, %xmm11 + movddup -7 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + ADD1 %xmm3, %xmm8 + movddup -6 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + ADD2 %xmm5, %xmm9 + movddup -5 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + ADD1 %xmm3, %xmm10 + movddup -4 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + movapd -10 * SIZE(AO, %rax, 2), %xmm2 + ADD2 %xmm5, %xmm11 + movddup -3 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + ADD1 %xmm3, %xmm8 + movddup -2 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + ADD2 %xmm5, %xmm9 + movddup -1 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + ADD1 %xmm3, %xmm10 + movddup 8 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + movapd -4 * SIZE(AO, %rax, 2), %xmm2 + ADD2 %xmm5, %xmm11 + movddup 1 * SIZE(BO, %rax, 4), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L42 + ALIGN_4 + +.L46: + movddup ALPHA_R, %xmm6 + movddup ALPHA_I, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L49 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L47: + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + ADD2 %xmm5, %xmm9 + movddup -13 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + ADD2 %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 4), %xmm5 + + addq $SIZE, %rax + jl .L47 + ALIGN_4 + +.L49: +#ifndef TRMMKERNEL + movupd (CO1), %xmm0 + movupd (CO2), %xmm1 +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + addsubpd %xmm9, %xmm8 + addsubpd %xmm11, %xmm10 + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm10, %xmm11 +#else + addsubpd %xmm8, %xmm9 + addsubpd %xmm10, %xmm11 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm9 + movapd %xmm11, %xmm10 + pshufd $0x4e, %xmm11, %xmm11 +#endif + + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm6, %xmm10 + mulpd %xmm7, %xmm11 + + addsubpd %xmm9, %xmm8 + addsubpd %xmm11, %xmm10 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm10 +#endif + + movlpd %xmm8, (CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movlpd %xmm10, (CO2) + movhpd %xmm10, 1 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movq BO, B + + leaq (C, LDC, 2), C # c += 2 * ldc + decq J # j -- + jg .L01 + +.L100: + testq $1, N + jle .L999 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq B, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movddup -16 * SIZE(BO), %xmm1 + movddup -15 * SIZE(BO), %xmm5 + pxor %xmm8, %xmm8 + movddup -12 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm12, %xmm12 + movapd -8 * SIZE(AO), %xmm4 + pxor %xmm13, %xmm13 + prefetchw 3 * SIZE(CO1) + movapd %xmm0, %xmm2 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L116 + ALIGN_4 + +.L112: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + ADD1 %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + ADD1 %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm5 + ADD2 %xmm2, %xmm9 + ADD2 %xmm5, %xmm13 + movddup -13 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + mulpd %xmm1, %xmm0 + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 + ADD1 %xmm0, %xmm8 + movapd (AO, %rax, 4), %xmm0 + ADD1 %xmm1, %xmm12 + movddup -8 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -10 * SIZE(AO, %rax, 4), %xmm5 + ADD2 %xmm2, %xmm9 + ADD2 %xmm5, %xmm13 + movddup -11 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm4, %xmm2 + mulpd %xmm3, %xmm4 + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 + ADD1 %xmm4, %xmm8 + movapd -4 * SIZE(AO, %rax, 4), %xmm4 + ADD1 %xmm3, %xmm12 + movddup -10 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 + ADD2 %xmm2, %xmm9 + ADD2 %xmm5, %xmm13 + movddup -9 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm4, %xmm2 + mulpd %xmm3, %xmm4 + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 + ADD1 %xmm4, %xmm8 + movapd 8 * SIZE(AO, %rax, 4), %xmm4 + ADD1 %xmm3, %xmm12 + movddup -4 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 + ADD2 %xmm2, %xmm9 + ADD2 %xmm5, %xmm13 + movddup -7 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L112 + ALIGN_4 + +.L116: + movddup ALPHA_R, %xmm6 + movddup ALPHA_I, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L119 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L117: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + ADD1 %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + ADD1 %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm5 + ADD2 %xmm2, %xmm9 + ADD2 %xmm5, %xmm13 + movddup -13 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L117 + ALIGN_4 + +.L119: +#ifndef TRMMKERNEL + movupd (CO1), %xmm0 + movupd 2 * SIZE(CO1), %xmm2 +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + addsubpd %xmm9, %xmm8 + addsubpd %xmm13, %xmm12 + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm12, %xmm13 + +#else + addsubpd %xmm8, %xmm9 + addsubpd %xmm12, %xmm13 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm9 + movapd %xmm13, %xmm12 + pshufd $0x4e, %xmm13, %xmm13 +#endif + + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm6, %xmm12 + mulpd %xmm7, %xmm13 + + addsubpd %xmm9, %xmm8 + addsubpd %xmm13, %xmm12 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm12 +#endif + + movlpd %xmm8, (CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movlpd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L110 + ALIGN_4 + +.L130: + testq $1, M + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq B, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -12 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L146 + ALIGN_4 + +.L142: + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + ADD2 %xmm3, %xmm9 + movddup -13 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -8 * SIZE(AO, %rax, 2), %xmm0 + ADD2 %xmm3, %xmm11 + movddup -11 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm2, %xmm1 + ADD1 %xmm1, %xmm8 + movddup -10 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm2, %xmm3 + movapd -10 * SIZE(AO, %rax, 2), %xmm2 + ADD2 %xmm3, %xmm9 + movddup -9 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm2, %xmm1 + ADD1 %xmm1, %xmm10 + movddup -8 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm2, %xmm3 + movapd -4 * SIZE(AO, %rax, 2), %xmm2 + ADD2 %xmm3, %xmm11 + movddup -7 * SIZE(BO, %rax, 2), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L142 + ALIGN_4 + +.L146: + movddup ALPHA_R, %xmm6 + movddup ALPHA_I, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L148 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L147: + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + ADD2 %xmm3, %xmm9 + movddup -13 * SIZE(BO, %rax, 2), %xmm3 + + addq $SIZE, %rax + jl .L147 + ALIGN_4 + +.L148: +#ifndef TRMMKERNEL + movupd (CO1), %xmm0 +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + SHUFPD_1 %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + addsubpd %xmm9, %xmm8 + pshufd $0x4e, %xmm8, %xmm9 +#else + addsubpd %xmm8, %xmm9 + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm9 +#endif + + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + + addsubpd %xmm9, %xmm8 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 +#endif + + movlpd %xmm8, (CO1) + movhpd %xmm8, 1 * SIZE(CO1) + ALIGN_4 + +.L999: + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_2x2_core2.S b/kernel/x86_64/zgemm_kernel_2x2_core2.S new file mode 100644 index 0000000..799c151 --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_2x2_core2.S @@ -0,0 +1,1353 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define ALPHA_R 0(%rsp) +#define ALPHA_I 16(%rsp) +#define J 32(%rsp) +#define OFFSET 40(%rsp) +#define KK 48(%rsp) +#define KKK 56(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCH_W (PREFETCH_R * 2) + +#define PREFETCHSIZE (8 * 13 + 5) +#define PREFETCH prefetcht0 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 addpd +#define ADD2 addpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 addpd +#define ADD2 subpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 subpd +#define ADD2 addpd +#else +#define ADD1 subpd +#define ADD2 subpd +#endif + +#define ADDSUB subpd + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + +#endif + + movq %rsp, %r15 # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movddup %xmm0, %xmm0 + movddup %xmm1, %xmm1 + + movapd %xmm0, ALPHA_R + movapd %xmm1, ALPHA_I + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + +#ifdef TRMMKERNEL + movsd %xmm12, OFFSET + movsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + salq $ZBASE_SHIFT, LDC + + movq N, J + sarq $1, J # j = (n >> 2) + NOBRANCH + jle .L100 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq 16 * SIZE + BUFFER, BO + + movapd -16 * SIZE(B), %xmm0 + movapd -8 * SIZE(B), %xmm4 + + movq K, %rax + sarq $2, %rax + jle .L03 + ALIGN_3 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + prefetcht0 (PREFETCH_R + 8) * SIZE(B) + + movapd -14 * SIZE(B), %xmm1 + movapd -12 * SIZE(B), %xmm2 + movapd -10 * SIZE(B), %xmm3 + movapd -6 * SIZE(B), %xmm5 + movapd -4 * SIZE(B), %xmm6 + movapd -2 * SIZE(B), %xmm7 + + movddup %xmm0, %xmm8 + movapd %xmm8, -16 * SIZE(BO) + unpckhpd %xmm0, %xmm0 + movapd %xmm0, -14 * SIZE(BO) + movapd 0 * SIZE(B), %xmm0 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) + movddup %xmm1, %xmm9 + movapd %xmm9, -12 * SIZE(BO) + unpckhpd %xmm1, %xmm1 + movapd %xmm1, -10 * SIZE(BO) + movddup %xmm2, %xmm10 + movapd %xmm10, -8 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 8) * SIZE(BO) + unpckhpd %xmm2, %xmm2 + movapd %xmm2, -6 * SIZE(BO) + movddup %xmm3, %xmm11 + movapd %xmm11, -4 * SIZE(BO) + unpckhpd %xmm3, %xmm3 + movapd %xmm3, -2 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 16) * SIZE(BO) + + movddup %xmm4, %xmm12 + movapd %xmm12, 0 * SIZE(BO) + unpckhpd %xmm4, %xmm4 + movapd %xmm4, 2 * SIZE(BO) + movapd 8 * SIZE(B), %xmm4 + movddup %xmm5, %xmm13 + movapd %xmm13, 4 * SIZE(BO) + unpckhpd %xmm5, %xmm5 + movapd %xmm5, 6 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 24) * SIZE(BO) + movddup %xmm6, %xmm14 + movapd %xmm14, 8 * SIZE(BO) + unpckhpd %xmm6, %xmm6 + movapd %xmm6, 10 * SIZE(BO) + movddup %xmm7, %xmm15 + movapd %xmm15, 12 * SIZE(BO) + unpckhpd %xmm7, %xmm7 + movapd %xmm7, 14 * SIZE(BO) + + subq $-32 * SIZE, BO + subq $-16 * SIZE, B + decq %rax + jne .L02 + ALIGN_3 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L05 + ALIGN_3 + +.L04: + movapd -14 * SIZE(B), %xmm1 + + movddup %xmm0, %xmm8 + unpckhpd %xmm0, %xmm0 + movddup %xmm1, %xmm9 + unpckhpd %xmm1, %xmm1 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm0, -14 * SIZE(BO) + movapd -12 * SIZE(B), %xmm0 + + movapd %xmm9, -12 * SIZE(BO) + movapd %xmm1, -10 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $ 8 * SIZE, BO + + decq %rax + jne .L04 + ALIGN_3 + +.L05: + leaq (PREFETCH_R + 0) * SIZE(B), BB + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 19 * SIZE + BUFFER, BO +#else + leaq 19 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -19 * SIZE(BO), %xmm6 + movaps -17 * SIZE(BO), %xmm7 + + prefetcht2 0 * SIZE(BB) + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + prefetcht2 8 * SIZE(BB) + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + pxor %xmm12, %xmm12 + prefetcht0 3 * SIZE(CO1) + pxor %xmm13, %xmm13 + pxor %xmm14, %xmm14 + pxor %xmm15, %xmm15 + + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + prefetcht0 3 * SIZE(CO2) + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + + subq $-16 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_4 + +.L12: + PADDING; + ADD1 %xmm2, %xmm10 + movaps -15 * SIZE(BO), %xmm2 + PADDING; + ADD1 %xmm3, %xmm14 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + ADD2 %xmm4, %xmm11 + movaps -13 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm5 + + ADD1 %xmm6, %xmm8 + movaps -11 * SIZE(BO), %xmm6 + ADD1 %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + ADD2 %xmm7, %xmm9 + movaps -9 * SIZE(BO), %xmm7 + ADD2 %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movaps -10 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm10 + movaps -7 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + ADD2 %xmm4, %xmm11 + movaps -5 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm5 + + ADD1 %xmm6, %xmm8 + movaps -3 * SIZE(BO), %xmm6 + ADD1 %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + ADD2 %xmm7, %xmm9 + movaps -1 * SIZE(BO), %xmm7 + ADD2 %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movaps -6 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm10 + movaps 1 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + ADD2 %xmm4, %xmm11 + movaps 3 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm15 + PADDING + movaps %xmm7, %xmm5 + mulpd %xmm1, %xmm5 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm0, %xmm7 + + ADD1 %xmm6, %xmm8 + movaps 5 * SIZE(BO), %xmm6 + ADD1 %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + ADD2 %xmm7, %xmm9 + movaps 7 * SIZE(BO), %xmm7 + ADD2 %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movaps -2 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm10 + movaps 9 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + ADD2 %xmm4, %xmm11 + subq $-16 * SIZE, AO + movaps 11 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm5 + + ADD1 %xmm6, %xmm8 + movaps 13 * SIZE(BO), %xmm6 + ADD1 %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + ADD2 %xmm7, %xmm9 + movaps 15 * SIZE(BO), %xmm7 + ADD2 %xmm5, %xmm13 + subq $-32 * SIZE, BO + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movaps -14 * SIZE(AO), %xmm1 + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: + prefetcht2 -8 * SIZE(BB) + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + BRANCH + je .L19 + ALIGN_4 + +.L16: + ADD1 %xmm2, %xmm10 + movaps -15 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + ADD2 %xmm4, %xmm11 + movaps -13 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm5 + + ADD1 %xmm6, %xmm8 + movaps -11 * SIZE(BO), %xmm6 + ADD1 %xmm3, %xmm12 + addq $4 * SIZE, AO + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + ADD2 %xmm7, %xmm9 + movaps -9 * SIZE(BO), %xmm7 + ADD2 %xmm5, %xmm13 + addq $8 * SIZE, BO + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movaps -14 * SIZE(AO), %xmm1 + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L19: + movapd ALPHA_R, %xmm6 + ADD1 %xmm2, %xmm10 + ADD1 %xmm3, %xmm14 + movapd ALPHA_I, %xmm7 + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm11, %xmm11 + SHUFPD_1 %xmm13, %xmm13 + SHUFPD_1 %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + addsubpd %xmm9, %xmm8 + addsubpd %xmm11, %xmm10 + addsubpd %xmm13, %xmm12 + addsubpd %xmm15, %xmm14 + + movapd %xmm8, %xmm9 + movapd %xmm10, %xmm11 + movapd %xmm12, %xmm13 + movapd %xmm14, %xmm15 +#else + addsubpd %xmm8, %xmm9 + addsubpd %xmm10, %xmm11 + addsubpd %xmm12, %xmm13 + addsubpd %xmm14, %xmm15 + + movapd %xmm9, %xmm8 + movapd %xmm11, %xmm10 + movapd %xmm13, %xmm12 + movapd %xmm15, %xmm14 +#endif + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm2 + movhpd 3 * SIZE(CO1), %xmm2 + + movsd 0 * SIZE(CO2), %xmm1 + movhpd 1 * SIZE(CO2), %xmm1 + movsd 2 * SIZE(CO2), %xmm3 + movhpd 3 * SIZE(CO2), %xmm3 +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm11, %xmm11 + SHUFPD_1 %xmm13, %xmm13 + SHUFPD_1 %xmm15, %xmm15 + + mulpd %xmm6, %xmm8 + mulpd %xmm6, %xmm10 + mulpd %xmm6, %xmm12 + mulpd %xmm6, %xmm14 + + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm11 + mulpd %xmm7, %xmm13 + mulpd %xmm7, %xmm15 + + addsubpd %xmm9, %xmm8 + addsubpd %xmm11, %xmm10 + addsubpd %xmm13, %xmm12 + addsubpd %xmm15, %xmm14 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm10 + addpd %xmm2, %xmm12 + addpd %xmm3, %xmm14 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm14, 2 * SIZE(CO2) + movhpd %xmm14, 3 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L10 + ALIGN_4 + +.L30: + testq $1, M + jle .L99 + +.L40: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L42 + +.L41: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -14 * SIZE(AO), %xmm0 + movapd -8 * SIZE(BO), %xmm2 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm4 + movapd -2 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -12 * SIZE(AO), %xmm0 + movapd 0 * SIZE(BO), %xmm2 + movapd 2 * SIZE(BO), %xmm3 + movapd 4 * SIZE(BO), %xmm4 + movapd 6 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -10 * SIZE(AO), %xmm0 + movapd 8 * SIZE(BO), %xmm2 + movapd 10 * SIZE(BO), %xmm3 + movapd 12 * SIZE(BO), %xmm4 + movapd 14 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jne .L41 + +.L42: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + jle .L44 + +.L43: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L43 + ALIGN_4 + +.L44: + movapd ALPHA_R, %xmm6 + movapd ALPHA_I, %xmm7 + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + addsubpd %xmm9, %xmm8 + addsubpd %xmm11, %xmm10 + + movapd %xmm8, %xmm9 + movapd %xmm10, %xmm11 +#else + addsubpd %xmm8, %xmm9 + addsubpd %xmm10, %xmm11 + + movapd %xmm9, %xmm8 + movapd %xmm11, %xmm10 +#endif + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + + movsd 0 * SIZE(CO2), %xmm1 + movhpd 1 * SIZE(CO2), %xmm1 +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm11, %xmm11 + + mulpd %xmm6, %xmm8 + mulpd %xmm6, %xmm10 + + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm11 + + addsubpd %xmm9, %xmm8 + addsubpd %xmm11, %xmm10 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm10 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leaq (C, LDC, 2), C # c += 2 * ldc + decq J # j -- + jg .L01 + +.L100: + testq $1, N + jle .L999 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L103 + ALIGN_4 + +.L102: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + movddup -14 * SIZE(B), %xmm10 + movddup -13 * SIZE(B), %xmm11 + movddup -12 * SIZE(B), %xmm12 + movddup -11 * SIZE(B), %xmm13 + movddup -10 * SIZE(B), %xmm14 + movddup -9 * SIZE(B), %xmm15 + + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + movapd %xmm10, 4 * SIZE(BO) + movapd %xmm11, 6 * SIZE(BO) + movapd %xmm12, 8 * SIZE(BO) + movapd %xmm13, 10 * SIZE(BO) + movapd %xmm14, 12 * SIZE(BO) + movapd %xmm15, 14 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-16 * SIZE, BO + decq %rax + jne .L102 + ALIGN_4 + +.L103: + movq K, %rax + andq $3, %rax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + + addq $4 * SIZE, BO + addq $2 * SIZE, B + decq %rax + jne .L104 + ALIGN_4 + +.L105: + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + prefetcht0 3 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L112 + +.L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -8 * SIZE(AO), %xmm0 + movapd -6 * SIZE(AO), %xmm1 + + movapd -8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -4 * SIZE(AO), %xmm0 + movapd -2 * SIZE(AO), %xmm1 + + movapd -4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L111 + ALIGN_4 + +.L112: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + jle .L114 + +.L113: + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L113 + ALIGN_4 + +.L114: + movapd ALPHA_R, %xmm6 + movapd ALPHA_I, %xmm7 + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + addsubpd %xmm9, %xmm8 + addsubpd %xmm13, %xmm12 + + movapd %xmm8, %xmm9 + movapd %xmm12, %xmm13 +#else + addsubpd %xmm8, %xmm9 + addsubpd %xmm12, %xmm13 + + movapd %xmm9, %xmm8 + movapd %xmm13, %xmm12 +#endif + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm2 + movhpd 3 * SIZE(CO1), %xmm2 +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm6, %xmm8 + mulpd %xmm6, %xmm12 + + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm13 + + addsubpd %xmm9, %xmm8 + addsubpd %xmm13, %xmm12 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm12 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L110 + ALIGN_4 + +.L130: + testq $1, M + jle .L999 + ALIGN_4 + +.L140: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L142 + +.L141: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + movapd -8 * SIZE(BO), %xmm2 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm4 + movapd -2 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L141 + +.L142: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + jle .L144 + +.L143: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L143 + ALIGN_4 + +.L144: + movapd ALPHA_R, %xmm6 + movapd ALPHA_I, %xmm7 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + SHUFPD_1 %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + addsubpd %xmm9, %xmm8 + movapd %xmm8, %xmm9 +#else + addsubpd %xmm8, %xmm9 + movapd %xmm9, %xmm8 +#endif + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 +#endif + + SHUFPD_1 %xmm9, %xmm9 + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + addsubpd %xmm9, %xmm8 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %r15, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_2x2_penryn.S b/kernel/x86_64/zgemm_kernel_2x2_penryn.S new file mode 100644 index 0000000..751110f --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_2x2_penryn.S @@ -0,0 +1,1297 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#define PREA %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define J 64(%rsp) +#define OFFSET 72(%rsp) +#define KK 80(%rsp) +#define KKK 88(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define ALPHA_R 224(%rsp) +#define ALPHA_I 232(%rsp) +#define J 240(%rsp) +#define OFFSET 248(%rsp) +#define KK 256(%rsp) +#define KKK 264(%rsp) + +#endif + +#ifdef NANO +#define PREFETCHSIZE (8 * 2 + 4) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + +#ifdef DUNNINGTON +#define PREFETCHSIZE (8 * 81 + 4) +#endif + +#ifndef PREFETCH +#define PREFETCH prefetcht0 +#endif + +#ifndef PREFETCHW +#define PREFETCHW prefetcht2 +#endif + +#ifndef PREFETCHB +#define PREFETCHB prefetcht0 +#endif + +#ifndef PREFETCHSIZE +#define PREFETCHSIZE (8 * 17 + 4) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 addpd +#define ADD2 addpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 addpd +#define ADD2 addpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 addpd +#define ADD2 addpd +#else +#define ADD1 addpd +#define ADD2 subpd +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + +#endif + + movlps %xmm0, ALPHA_R + movlps %xmm1, ALPHA_I + + subq $-16 * SIZE, A + subq $-17 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + salq $ZBASE_SHIFT, LDC + +#ifdef TRMMKERNEL + movq %r11, OFFSET +#ifndef LEFT + negq %r11 +#endif + movq %r11, KK +#endif + + movq N, J + sarq $1, J + NOBRANCH + jle .L40 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + + movq K, %rax + salq $ZBASE_SHIFT + 1, %rax + leaq (B, %rax), BB + + movq M, I + sarq $1, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + xorpd %xmm3, %xmm3 + movaps -14 * SIZE(AO), %xmm1 + xorpd %xmm4, %xmm4 + movaps -17 * SIZE(BO), %xmm2 + + PREFETCHB -16 * SIZE(BB) + + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + + movaps %xmm4, %xmm8 + movaps %xmm4, %xmm9 + PREFETCHW 3 * SIZE(CO1) + movaps %xmm4, %xmm10 + movaps %xmm4, %xmm11 + + movaps %xmm4, %xmm12 + movaps %xmm4, %xmm13 + PREFETCHW 3 * SIZE(CO2) + movaps %xmm4, %xmm14 + movaps %xmm4, %xmm15 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADD1 %xmm3, %xmm12 + movaps -15 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -13 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps -11 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -9 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps -7 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + PADDING + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + ADD1 %xmm2, %xmm8 + movaps -5 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + subq $-16 * SIZE, AO + movaps -3 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -1 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + subq $-16 * SIZE, BO + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -14 * SIZE(AO), %xmm1 + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: + PREFETCHB -8 * SIZE(BB) +#ifdef DUNNINGTON + PREFETCHB 0 * SIZE(BB) + PREFETCHB 8 * SIZE(BB) +#endif + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + ADD1 %xmm3, %xmm12 + movaps -15 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -13 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#ifndef DUNNINGTON + subq $-16 * SIZE, BB +#else + subq $-32 * SIZE, BB +#endif + + ADD1 %xmm3, %xmm12 + pcmpeqb %xmm0, %xmm0 + ADD1 %xmm4, %xmm14 + psllq $63, %xmm0 + ADD2 %xmm5, %xmm13 + movddup ALPHA_R, %xmm2 + ADD2 %xmm6, %xmm15 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + shufps $0x40, %xmm0, %xmm0 + + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + xorps %xmm0, %xmm12 + xorps %xmm0, %xmm14 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0x04, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + xorps %xmm0, %xmm13 + xorps %xmm0, %xmm15 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + shufps $0x40, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + xorps %xmm0, %xmm13 + xorps %xmm0, %xmm15 +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm11, %xmm10 + haddpd %xmm13, %xmm12 + haddpd %xmm15, %xmm14 + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm10, %xmm11 + pshufd $0x4e, %xmm12, %xmm13 + pshufd $0x4e, %xmm14, %xmm15 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm9 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm11 + + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + mulpd %xmm2, %xmm14 + mulpd %xmm3, %xmm15 + + addsubpd %xmm9, %xmm8 + addsubpd %xmm11, %xmm10 + addsubpd %xmm13, %xmm12 + addsubpd %xmm15, %xmm14 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + movsd 0 * SIZE(CO2), %xmm2 + movhpd 1 * SIZE(CO2), %xmm2 + movsd 2 * SIZE(CO2), %xmm3 + movhpd 3 * SIZE(CO2), %xmm3 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm10 + addpd %xmm2, %xmm12 + addpd %xmm3, %xmm14 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm10, 2 * SIZE(CO1) + movhpd %xmm10, 3 * SIZE(CO1) + movsd %xmm12, 0 * SIZE(CO2) + movhpd %xmm12, 1 * SIZE(CO2) + movsd %xmm14, 2 * SIZE(CO2) + movhpd %xmm14, 3 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $1, M + BRANCH + jle .L39 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -17 * SIZE(BO), %xmm2 + movaps -15 * SIZE(BO), %xmm3 + + xorps %xmm3, %xmm3 + xorps %xmm5, %xmm5 + + movaps %xmm3, %xmm8 + movaps %xmm3, %xmm9 + movaps %xmm3, %xmm12 + movaps %xmm3, %xmm13 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_4 + +.L22: + ADD1 %xmm3, %xmm12 + movaps -15 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -13 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm3, %xmm12 + movaps -11 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -9 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + + ADD1 %xmm3, %xmm12 + movaps -7 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -5 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -10 * SIZE(AO), %xmm0 + + ADD1 %xmm3, %xmm12 + movaps -3 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + subq $ -8 * SIZE, AO + + ADD1 %xmm2, %xmm8 + movaps -1 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + ADD1 %xmm3, %xmm12 + movaps -15 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -13 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: + ADD1 %xmm3, %xmm12 + pcmpeqb %xmm0, %xmm0 + ADD2 %xmm5, %xmm13 + psllq $63, %xmm0 + + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + shufps $0x40, %xmm0, %xmm0 + + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm12 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0x04, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm13 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + shufps $0x40, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm13 +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm13, %xmm12 + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm12, %xmm13 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm9 + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + + addsubpd %xmm9, %xmm8 + addsubpd %xmm13, %xmm12 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movhpd 1 * SIZE(CO2), %xmm2 + + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm12 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm12, 0 * SIZE(CO2) + movhpd %xmm12, 1 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + leaq (C, LDC, 2), C + movq BO, B + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L40: + testq $1, N + BRANCH + jle .L999 + + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -17 * SIZE(BO), %xmm2 + + PREFETCHW 3 * SIZE(CO1) + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_4 + +.L52: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -15 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -13 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -11 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -9 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -15 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_4 + +.L58: + pcmpeqb %xmm0, %xmm0 + movddup ALPHA_R, %xmm2 + psllq $63, %xmm0 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + shufps $0x40, %xmm0, %xmm0 + + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm12 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0x04, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm13 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + shufps $0x40, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm13 +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm13, %xmm12 + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm12, %xmm13 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm9 + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + + addsubpd %xmm9, %xmm8 + addsubpd %xmm13, %xmm12 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm12 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L79 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movaps -17 * SIZE(BO), %xmm2 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm7, %xmm9 + movaps -15 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm10 + ADD2 %xmm7, %xmm11 + movaps -13 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -10 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm7, %xmm9 + movaps -11 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm10 + ADD2 %xmm7, %xmm11 + movaps -9 * SIZE(BO), %xmm2 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm7, %xmm9 + movaps -15 * SIZE(BO), %xmm2 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + pcmpeqb %xmm0, %xmm0 + movddup ALPHA_R, %xmm2 + psllq $63, %xmm0 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + shufps $0x40, %xmm0, %xmm0 + + xorps %xmm0, %xmm8 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0x04, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + shufps $0x40, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 +#endif + + haddpd %xmm9, %xmm8 + + pshufd $0x4e, %xmm8, %xmm9 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm9 + + addsubpd %xmm9, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + + addpd %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $1, KK +#endif + + addq LDC, C + movq BO, B + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_2x2_sse2.S b/kernel/x86_64/zgemm_kernel_2x2_sse2.S new file mode 100644 index 0000000..4b83eee --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_2x2_sse2.S @@ -0,0 +1,1829 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define POSINV 0(%rsp) +#define ALPHA_R 16(%rsp) +#define ALPHA_I 32(%rsp) +#define J 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER 256(%rsp) + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (8 * 9 + 4) + +#define RPREFETCHSIZE (8 * 7 + 4) +#define WPREFETCHSIZE (8 * 8 + 4) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 5 + 4) + +#define RPREFETCHSIZE (8 * 7 + 4) +#define WPREFETCHSIZE (8 * 8 + 4) +#endif + +#ifndef GENERIC +#define KERNEL1(xx) \ + mulpd %xmm0, %xmm1 ;\ + addpd %xmm1, %xmm8 ;\ + movapd -16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulpd %xmm0, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd -14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm0, %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ + mulpd -10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ + addpd %xmm5, %xmm10 ;\ + movapd -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm0, %xmm11 ;\ + movapd -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 + +#define KERNEL2(xx) \ + mulpd %xmm2, %xmm1 ;\ + addpd %xmm1, %xmm12 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulpd %xmm2, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd -6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm2, %xmm5 ;\ + mulpd -10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ + addpd %xmm5, %xmm14 ;\ + movapd -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm2, %xmm15 ;\ + movapd -6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 + +#define KERNEL3(xx) \ + mulpd %xmm4, %xmm7 ;\ + addpd %xmm7, %xmm8 ;\ + movapd -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulpd %xmm4, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd -6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm4, %xmm5 ;\ + mulpd -2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ + addpd %xmm5, %xmm10 ;\ + movapd -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm4, %xmm11 ;\ + movapd -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 + +#define KERNEL4(xx) \ + mulpd %xmm6, %xmm7 ;\ + addpd %xmm7, %xmm12 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulpd %xmm6, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm6, %xmm5 ;\ + mulpd -2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ + addpd %xmm6, %xmm15 ;\ + movapd -2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 + +#define KERNEL5(xx) \ + mulpd %xmm0, %xmm1 ;\ + addpd %xmm1, %xmm8 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulpd %xmm0, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm0, %xmm5 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ + addpd %xmm5, %xmm10 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm0, %xmm11 ;\ + movapd 0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 + +#define KERNEL6(xx) \ + mulpd %xmm2, %xmm1 ;\ + addpd %xmm1, %xmm12 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulpd %xmm2, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm2, %xmm5 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm2, %xmm15 ;\ + movapd 2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 + +#define KERNEL7(xx) \ + mulpd %xmm4, %xmm7 ;\ + addpd %xmm7, %xmm8 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulpd %xmm4, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm4, %xmm5 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ + addpd %xmm5, %xmm10 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm4, %xmm11 ;\ + movapd 4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 + +#define KERNEL8(xx) \ + mulpd %xmm6, %xmm7 ;\ + addpd %xmm7, %xmm12 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulpd %xmm6, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm6, %xmm5 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm6, %xmm15 ;\ + movapd 6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 + +#else +#define KERNEL1(xx) \ + mulpd %xmm0, %xmm1 ;\ + addpd %xmm1, %xmm8 ;\ + movapd -16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulpd %xmm0, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd -14 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm0, %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + mulpd -10 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ + addpd %xmm5, %xmm10 ;\ + movapd -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm0, %xmm11 ;\ + movapd -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 + +#define KERNEL2(xx) \ + mulpd %xmm2, %xmm1 ;\ + addpd %xmm1, %xmm12 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulpd %xmm2, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd -6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm2, %xmm5 ;\ + mulpd -10 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ + addpd %xmm5, %xmm14 ;\ + movapd -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm2, %xmm15 ;\ + movapd -6 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 + +#define KERNEL3(xx) \ + mulpd %xmm4, %xmm7 ;\ + addpd %xmm7, %xmm8 ;\ + movapd -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulpd %xmm4, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd -6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm4, %xmm5 ;\ + mulpd -2 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ + addpd %xmm5, %xmm10 ;\ + movapd -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm4, %xmm11 ;\ + movapd -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 + +#define KERNEL4(xx) \ + mulpd %xmm6, %xmm7 ;\ + addpd %xmm7, %xmm12 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulpd %xmm6, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm6, %xmm5 ;\ + mulpd -2 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm6, %xmm15 ;\ + movapd -2 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 + +#define KERNEL5(xx) \ + mulpd %xmm0, %xmm1 ;\ + addpd %xmm1, %xmm8 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulpd %xmm0, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm0, %xmm5 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ + addpd %xmm5, %xmm10 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm0, %xmm11 ;\ + movapd 0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 + +#define KERNEL6(xx) \ + mulpd %xmm2, %xmm1 ;\ + addpd %xmm1, %xmm12 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulpd %xmm2, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm2, %xmm5 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm2, %xmm15 ;\ + movapd 2 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 + +#define KERNEL7(xx) \ + mulpd %xmm4, %xmm7 ;\ + addpd %xmm7, %xmm8 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulpd %xmm4, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm4, %xmm5 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ + addpd %xmm5, %xmm10 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm4, %xmm11 ;\ + movapd 4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 + +#define KERNEL8(xx) \ + mulpd %xmm6, %xmm7 ;\ + addpd %xmm7, %xmm12 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulpd %xmm6, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm6, %xmm5 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm6, %xmm15 ;\ + movapd 6 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 + +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq 72(%rsp), LDC +#ifdef TRMMKERNEL + movsd 80(%rsp), %xmm12 +#endif + +#endif + + EMMS + + movq %rsp, %rbx # save old stack + subq $256 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 # Generate mask + pxor %xmm10, %xmm10 + + movlpd %xmm0, 0 + ALPHA_R + movlpd %xmm0, 8 + ALPHA_R + + movlpd %xmm1, 8 + ALPHA_I + xorpd %xmm7, %xmm1 + movlpd %xmm1, 0 + ALPHA_I + + movlpd %xmm10, 0 + POSINV + movlpd %xmm7, 8 + POSINV + +#ifdef TRMMKERNEL + movlpd %xmm12, OFFSET + movlpd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + subq $-16 * SIZE, A + + salq $ZBASE_SHIFT, LDC + + movq N, J + sarq $1, J # j = (n >> 2) + jle .L100 + ALIGN_4 + +.L01: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + leaq 16 * SIZE + BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq %mm0, -16 * SIZE(BO) + movq %mm0, -15 * SIZE(BO) + movq 1 * SIZE(B), %mm1 + movq %mm1, -14 * SIZE(BO) + movq %mm1, -13 * SIZE(BO) + + movq 2 * SIZE(B), %mm2 + movq %mm2, -12 * SIZE(BO) + movq %mm2, -11 * SIZE(BO) + movq 3 * SIZE(B), %mm3 + movq %mm3, -10 * SIZE(BO) + movq %mm3, -9 * SIZE(BO) + + PREFETCHW (WPREFETCHSIZE + 0) * SIZE(BO) + + movq 4 * SIZE(B), %mm4 + movq %mm4, -8 * SIZE(BO) + movq %mm4, -7 * SIZE(BO) + movq 5 * SIZE(B), %mm5 + movq %mm5, -6 * SIZE(BO) + movq %mm5, -5 * SIZE(BO) + + PREFETCHW (WPREFETCHSIZE + 8) * SIZE(BO) + + movq 6 * SIZE(B), %mm6 + movq %mm6, -4 * SIZE(BO) + movq %mm6, -3 * SIZE(BO) + movq 7 * SIZE(B), %mm7 + movq %mm7, -2 * SIZE(BO) + movq %mm7, -1 * SIZE(BO) + + PREFETCH (RPREFETCHSIZE + 8) * SIZE(B) + + movq 8 * SIZE(B), %mm0 + movq %mm0, 0 * SIZE(BO) + movq %mm0, 1 * SIZE(BO) + movq 9 * SIZE(B), %mm1 + movq %mm1, 2 * SIZE(BO) + movq %mm1, 3 * SIZE(BO) + + movq 10 * SIZE(B), %mm2 + movq %mm2, 4 * SIZE(BO) + movq %mm2, 5 * SIZE(BO) + movq 11 * SIZE(B), %mm3 + movq %mm3, 6 * SIZE(BO) + movq %mm3, 7 * SIZE(BO) + + PREFETCHW (WPREFETCHSIZE + 16) * SIZE(BO) + + movq 12 * SIZE(B), %mm4 + movq %mm4, 8 * SIZE(BO) + movq %mm4, 9 * SIZE(BO) + movq 13 * SIZE(B), %mm5 + movq %mm5, 10 * SIZE(BO) + movq %mm5, 11 * SIZE(BO) + + PREFETCHW (WPREFETCHSIZE + 24) * SIZE(BO) + + movq 14 * SIZE(B), %mm6 + movq %mm6, 12 * SIZE(BO) + movq %mm6, 13 * SIZE(BO) + movq 15 * SIZE(B), %mm7 + movq %mm7, 14 * SIZE(BO) + movq %mm7, 15 * SIZE(BO) + + addq $ 32 * SIZE, BO + subq $-16 * SIZE, B + decq %rax + jne .L02 + ALIGN_4 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movq 0 * SIZE(B), %mm0 + movq %mm0, -16 * SIZE(BO) + movq %mm0, -15 * SIZE(BO) + movq 1 * SIZE(B), %mm1 + movq %mm1, -14 * SIZE(BO) + movq %mm1, -13 * SIZE(BO) + + movq 2 * SIZE(B), %mm2 + movq %mm2, -12 * SIZE(BO) + movq %mm2, -11 * SIZE(BO) + movq 3 * SIZE(B), %mm3 + movq %mm3, -10 * SIZE(BO) + movq %mm3, -9 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $ 8 * SIZE, BO + + decq %rax + jne .L04 + ALIGN_4 + +.L05: + movq A, AO # aoffset = a + + leaq (RPREFETCHSIZE + 0) * SIZE(B), BB + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm1 + pxor %xmm8, %xmm8 + PREFETCH 0 * SIZE(BB) + movapd -14 * SIZE(AO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movapd -12 * SIZE(AO), %xmm4 + movapd -12 * SIZE(BO), %xmm5 + pxor %xmm10, %xmm10 + movapd -10 * SIZE(AO), %xmm6 + movapd -8 * SIZE(BO), %xmm7 + pxor %xmm11, %xmm11 + + pxor %xmm12, %xmm12 + PREFETCHW 3 * SIZE(CO1) + pxor %xmm13, %xmm13 + PREFETCHW 3 * SIZE(CO2) + pxor %xmm14, %xmm14 + pxor %xmm15, %xmm15 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif +#ifndef GENERIC + andq $-8, %rax + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + BRANCH + jl .L12 + ALIGN_3 + +.L15: + PREFETCH 8 * SIZE(BB) + subq $-16 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + testq $4, %rax + je .L16 + xorq %rax, %rax + ALIGN_3 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addq $32 * SIZE, BO + addq $16 * SIZE, AO + ALIGN_3 +#else + sarq $2, %rax + NOBRANCH + jle .L16 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addq $ 32 * SIZE, BO + subq $-16 * SIZE, AO + decq %rax + BRANCH + jg .L12 +#endif + +.L16: + movapd POSINV, %xmm5 + movapd ALPHA_R, %xmm6 + movapd ALPHA_I, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L19 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + ALIGN_3 + +.L17: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movapd -14 * SIZE(BO, %rax, 8), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movapd -12 * SIZE(BO, %rax, 8), %xmm1 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BO, %rax, 8), %xmm0 + addpd %xmm1, %xmm10 + movapd -16 * SIZE(BO, %rax, 8), %xmm1 + addpd %xmm0, %xmm11 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm12 + movapd -14 * SIZE(BO, %rax, 8), %xmm1 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm13 + movapd -12 * SIZE(BO, %rax, 8), %xmm1 + mulpd %xmm2, %xmm1 + mulpd -10 * SIZE(BO, %rax, 8), %xmm2 + addpd %xmm1, %xmm14 + movapd -8 * SIZE(BO, %rax, 8), %xmm1 + addpd %xmm2, %xmm15 + movapd -10 * SIZE(AO, %rax, 4), %xmm2 + + addq $SIZE, %rax + jl .L17 + ALIGN_3 + +.L19: +#ifndef TRMMKERNEL + movlpd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movlpd 2 * SIZE(CO1), %xmm2 + movhpd 3 * SIZE(CO1), %xmm2 + + movlpd 0 * SIZE(CO2), %xmm1 + movhpd 1 * SIZE(CO2), %xmm1 + movlpd 2 * SIZE(CO2), %xmm3 + movhpd 3 * SIZE(CO2), %xmm3 +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm11, %xmm11 + SHUFPD_1 %xmm13, %xmm13 + SHUFPD_1 %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm5, %xmm9 + xorpd %xmm5, %xmm11 + xorpd %xmm5, %xmm13 + xorpd %xmm5, %xmm15 +#else + xorpd %xmm5, %xmm8 + xorpd %xmm5, %xmm10 + xorpd %xmm5, %xmm12 + xorpd %xmm5, %xmm14 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 + subpd %xmm11, %xmm10 + subpd %xmm13, %xmm12 + subpd %xmm15, %xmm14 +#else + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm13, %xmm12 + addpd %xmm15, %xmm14 +#endif + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm10, %xmm11 + pshufd $0x4e, %xmm12, %xmm13 + pshufd $0x4e, %xmm14, %xmm15 + + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm6, %xmm10 + mulpd %xmm7, %xmm11 + + mulpd %xmm6, %xmm12 + mulpd %xmm7, %xmm13 + mulpd %xmm6, %xmm14 + mulpd %xmm7, %xmm15 + + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm13, %xmm12 + addpd %xmm15, %xmm14 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm12 + addpd %xmm1, %xmm10 + addpd %xmm3, %xmm14 +#endif + + movlpd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movlpd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + + movlpd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movlpd %xmm14, 2 * SIZE(CO2) + movhpd %xmm14, 3 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L10 + ALIGN_4 + +.L30: + testq $1, M + jle .L99 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movapd -16 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movapd -8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L44 + ALIGN_4 + +.L41: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movapd -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm1, %xmm9 + movapd -12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm10 + movapd 0 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm11 + movapd -14 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm8 + movapd -6 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm9 + movapd -4 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + mulpd -2 * SIZE(BO), %xmm0 + addpd %xmm3, %xmm10 + movapd 8 * SIZE(BO), %xmm3 + addpd %xmm0, %xmm11 + movapd -12 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movapd 2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + addpd %xmm1, %xmm9 + movapd 4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm10 + movapd 16 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm11 + movapd -10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm8 + movapd 10 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm9 + movapd 12 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BO), %xmm0 + addpd %xmm3, %xmm10 + movapd 24 * SIZE(BO), %xmm3 + addpd %xmm0, %xmm11 + movapd 0 * SIZE(AO), %xmm0 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm8 + movapd 18 * SIZE(BO), %xmm1 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm9 + movapd 20 * SIZE(BO), %xmm1 + mulpd %xmm2, %xmm1 + mulpd 22 * SIZE(BO), %xmm2 + addpd %xmm1, %xmm10 + movapd 32 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + movapd -6 * SIZE(AO), %xmm2 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm8 + movapd 26 * SIZE(BO), %xmm3 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm9 + movapd 28 * SIZE(BO), %xmm3 + mulpd %xmm2, %xmm3 + mulpd 30 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm10 + movapd 40 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm11 + movapd -4 * SIZE(AO), %xmm2 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm8 + movapd 34 * SIZE(BO), %xmm1 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm9 + movapd 36 * SIZE(BO), %xmm1 + mulpd %xmm2, %xmm1 + mulpd 38 * SIZE(BO), %xmm2 + addpd %xmm1, %xmm10 + movapd 48 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + movapd -2 * SIZE(AO), %xmm2 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm8 + movapd 42 * SIZE(BO), %xmm3 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm9 + movapd 44 * SIZE(BO), %xmm3 + mulpd %xmm2, %xmm3 + mulpd 46 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm10 + movapd 56 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm11 + movapd 8 * SIZE(AO), %xmm2 + + subq $-16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L41 + ALIGN_4 + +.L44: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $4, %rax + BRANCH + jle .L45 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movapd -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movapd -12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm10 + movapd 0 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm11 + movapd -14 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm8 + movapd -6 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm9 + movapd -4 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + mulpd -2 * SIZE(BO), %xmm0 + addpd %xmm3, %xmm10 + movapd 8 * SIZE(BO), %xmm3 + addpd %xmm0, %xmm11 + movapd -12 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movapd 2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movapd 4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm10 + movapd 16 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm11 + movapd -10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm8 + movapd 10 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm9 + movapd 12 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BO), %xmm0 + addpd %xmm3, %xmm10 + movapd 24 * SIZE(BO), %xmm3 + addpd %xmm0, %xmm11 + movapd -8 * SIZE(AO), %xmm0 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd POSINV, %xmm5 + movapd ALPHA_R, %xmm6 + movapd ALPHA_I, %xmm7 + andq $3, %rax # if (k & 1) + BRANCH + jle .L47 + ALIGN_4 + +.L46: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movapd -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movapd -12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm10 + movapd -8 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm11 + movapd -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + decq %rax + jg .L46 + ALIGN_4 + +.L47: +#ifndef TRMMKERNEL + movlpd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movlpd 0 * SIZE(CO2), %xmm1 + movhpd 1 * SIZE(CO2), %xmm1 +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm5, %xmm9 + xorpd %xmm5, %xmm11 +#else + xorpd %xmm5, %xmm8 + xorpd %xmm5, %xmm10 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 + subpd %xmm11, %xmm10 +#else + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 +#endif + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm10, %xmm11 + + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm6, %xmm10 + mulpd %xmm7, %xmm11 + + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm10 +#endif + + movlpd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movlpd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leaq (C, LDC, 2), C # c += 2 * ldc + decq J # j -- + jg .L01 + +.L100: + testq $1, N + jle .L999 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L103 + ALIGN_4 + +.L102: + movlpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movlpd 2 * SIZE(B), %xmm10 + movlpd 3 * SIZE(B), %xmm11 + movlpd 4 * SIZE(B), %xmm12 + movlpd 5 * SIZE(B), %xmm13 + movlpd 6 * SIZE(B), %xmm14 + movlpd 7 * SIZE(B), %xmm15 + + movlpd %xmm8, 0 * SIZE(BO) + movlpd %xmm8, 1 * SIZE(BO) + movlpd %xmm9, 2 * SIZE(BO) + movlpd %xmm9, 3 * SIZE(BO) + movlpd %xmm10, 4 * SIZE(BO) + movlpd %xmm10, 5 * SIZE(BO) + movlpd %xmm11, 6 * SIZE(BO) + movlpd %xmm11, 7 * SIZE(BO) + movlpd %xmm12, 8 * SIZE(BO) + movlpd %xmm12, 9 * SIZE(BO) + movlpd %xmm13, 10 * SIZE(BO) + movlpd %xmm13, 11 * SIZE(BO) + movlpd %xmm14, 12 * SIZE(BO) + movlpd %xmm14, 13 * SIZE(BO) + movlpd %xmm15, 14 * SIZE(BO) + movlpd %xmm15, 15 * SIZE(BO) + + subq $-16 * SIZE, BO + addq $ 8 * SIZE, B + decq %rax + jne .L102 + ALIGN_4 + +.L103: + movq K, %rax + andq $3, %rax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movlpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + + movlpd %xmm8, 0 * SIZE(BO) + movlpd %xmm8, 1 * SIZE(BO) + movlpd %xmm9, 2 * SIZE(BO) + movlpd %xmm9, 3 * SIZE(BO) + + addq $4 * SIZE, BO + addq $2 * SIZE, B + decq %rax + jne .L104 + ALIGN_4 + +.L105: + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -16 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm12, %xmm12 + movapd -8 * SIZE(BO), %xmm3 + pxor %xmm13, %xmm13 + PREFETCHW 3 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L112 + +.L111: + mulpd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd -16 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -14 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm12 + movapd -12 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm13 + movapd -12 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd -12 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm12 + movapd 0 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm13 + movapd 0 * SIZE(AO), %xmm0 + mulpd %xmm2, %xmm3 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd -6 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm8 + movapd -8 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm9 + movapd -6 * SIZE(AO), %xmm2 + mulpd %xmm2, %xmm3 + mulpd -6 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd -4 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm13 + movapd -4 * SIZE(AO), %xmm2 + mulpd %xmm2, %xmm3 + mulpd -2 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm8 + movapd -4 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm9 + movapd -2 * SIZE(AO), %xmm2 + mulpd %xmm2, %xmm3 + mulpd -2 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd 8 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm13 + movapd 8 * SIZE(AO), %xmm2 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + decq %rax + jne .L111 + ALIGN_4 + +.L112: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd POSINV, %xmm5 + movapd ALPHA_R, %xmm6 + movapd ALPHA_I, %xmm7 + andq $3, %rax # if (k & 1) + BRANCH + jle .L114 + +.L113: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd -16 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -14 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm12 + movapd -12 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm13 + movapd -12 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L113 + ALIGN_4 + +.L114: +#ifndef TRMMKERNEL + movlpd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movlpd 2 * SIZE(CO1), %xmm2 + movhpd 3 * SIZE(CO1), %xmm2 +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm5, %xmm9 + xorpd %xmm5, %xmm13 +#else + xorpd %xmm5, %xmm8 + xorpd %xmm5, %xmm12 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 + subpd %xmm13, %xmm12 +#else + addpd %xmm9, %xmm8 + addpd %xmm13, %xmm12 +#endif + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm12, %xmm13 + + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm6, %xmm12 + mulpd %xmm7, %xmm13 + + addpd %xmm9, %xmm8 + addpd %xmm13, %xmm12 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm12 +#endif + + movlpd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movlpd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L110 + ALIGN_4 + +.L130: + testq $1, M + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm1 + movapd -8 * SIZE(AO), %xmm2 + movapd -8 * SIZE(BO), %xmm3 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L144 + ALIGN_4 + +.L141: + mulpd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd -12 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -14 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm10 + movapd 0 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm11 + movapd -12 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -6 * SIZE(BO), %xmm0 + addpd %xmm3, %xmm8 + movapd -4 * SIZE(BO), %xmm3 + addpd %xmm0, %xmm9 + movapd -10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -2 * SIZE(BO), %xmm0 + addpd %xmm3, %xmm10 + movapd 8 * SIZE(BO), %xmm3 + addpd %xmm0, %xmm11 + movapd 0 * SIZE(AO), %xmm0 + mulpd %xmm2, %xmm1 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd 2 * SIZE(BO), %xmm2 + addpd %xmm1, %xmm8 + movapd 4 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + movapd -6 * SIZE(AO), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 6 * SIZE(BO), %xmm2 + addpd %xmm1, %xmm10 + movapd 16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + movapd -4 * SIZE(AO), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 10 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm8 + movapd 12 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm9 + movapd -2 * SIZE(AO), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 14 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm10 + movapd 24 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm11 + movapd 8 * SIZE(AO), %xmm2 + + subq $-16 * SIZE, AO + subq $-32 * SIZE, BO + decq %rax + jne .L141 + ALIGN_4 + + +.L144: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $4, %rax # if (k & 1) + BRANCH + jle .L145 + + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd -12 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -14 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm10 + movapd 0 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm11 + movapd -12 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -6 * SIZE(BO), %xmm0 + addpd %xmm3, %xmm8 + movapd -4 * SIZE(BO), %xmm3 + addpd %xmm0, %xmm9 + movapd -10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -2 * SIZE(BO), %xmm0 + addpd %xmm3, %xmm10 + addpd %xmm0, %xmm11 + movapd -8 * SIZE(AO), %xmm0 + + addq $8 * SIZE, AO + subq $-16 * SIZE, BO + ALIGN_4 + +.L145: + movapd POSINV, %xmm5 + movapd ALPHA_R, %xmm6 + movapd ALPHA_I, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + jle .L148 + ALIGN_4 + +.L146: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd -12 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L146 + ALIGN_4 + +.L148: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#ifndef TRMMKERNEL + movlpd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 +#endif + + SHUFPD_1 %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm5, %xmm9 +#else + xorpd %xmm5, %xmm8 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 +#else + addpd %xmm9, %xmm8 +#endif + + pshufd $0x4e, %xmm8, %xmm9 + + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + + addpd %xmm9, %xmm8 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 +#endif + + movlpd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %rbx, %rsp + EMMS + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_2x2_sse3.S b/kernel/x86_64/zgemm_kernel_2x2_sse3.S new file mode 100644 index 0000000..afb0924 --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_2x2_sse3.S @@ -0,0 +1,1539 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbx +#define BB %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KKK 72(%rsp) +#define KK 80(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define ALPHA_R 224(%rsp) +#define ALPHA_I 232(%rsp) +#define OFFSET 240(%rsp) +#define KKK 248(%rsp) +#define KK 256(%rsp) +#endif + +#define PREFETCH prefetcht1 +#define PREFETCHSIZE (16 * 12 + 3) +#define PREFETCH_R (4 * 4 + 0) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 addpd +#define ADD2 addpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 addpd +#define ADD2 subpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 subpd +#define ADD2 addpd +#else +#define ADD1 subpd +#define ADD2 subpd +#endif + +#define ADDSUB subpd + +#define KERNEL1(address) \ + mulpd %xmm8, %xmm9;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ + ADD1 %xmm9, %xmm0;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm1;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm2;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm3;\ + movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL2(address) \ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm4;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm5;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm6;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm7;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL3(address) \ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm0;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm1;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm2;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm3;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL4(address) \ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm4;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm5;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm6;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm7;\ + movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL5(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm0;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm1;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm2;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm3;\ + movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL6(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm4;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm5;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm6;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm7;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL7(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm0;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm1;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm2;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm3;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL8(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm4;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm5;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm6;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm7;\ + movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL9(address) \ + mulpd %xmm12, %xmm13;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ + ADD1 %xmm13, %xmm0;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm1;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm2;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm3;\ + movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL10(address) \ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm4;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm5;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm6;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm7;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL11(address) \ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm0;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm1;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm2;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm3;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL12(address) \ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm4;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm5;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm6;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm7;\ + movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL13(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm0;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm1;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm2;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm3;\ + movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL14(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm4;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm5;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm6;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm7;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL15(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm0;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm1;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm2;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm3;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL16(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm4;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm5;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm6;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm7;\ + movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + +#endif + + movsd %xmm0, ALPHA_R + movsd %xmm1, ALPHA_I + +#ifdef TRMMKERNEL + movsd %xmm4, OFFSET + movsd %xmm4, KK +#ifndef LEFT + negq KK +#endif +#endif + + salq $ZBASE_SHIFT, LDC + movq N, J + sarq $1, J # j = (n >> 2) + jle .L100 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq K, %rax + salq $ZBASE_SHIFT + 1, %rax + leaq (B, %rax), BB + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + prefetcht0 0 * SIZE(BB) + subq $-8 * SIZE, BB + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(AO), %xmm12 + pxor %xmm4, %xmm4 + movddup 16 * SIZE(BO), %xmm13 + pxor %xmm5, %xmm5 + movapd 24 * SIZE(AO), %xmm14 + pxor %xmm6, %xmm6 + movddup 24 * SIZE(BO), %xmm15 + pxor %xmm7, %xmm7 + + prefetchnta 3 * SIZE(CO1) + prefetchnta 3 * SIZE(CO2) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + salq $4, %rax + je .L12 + +.L1X: + KERNEL1 (16 * 0) + KERNEL2 (16 * 0) + KERNEL3 (16 * 0) + KERNEL4 (16 * 0) + KERNEL5 (16 * 0) + KERNEL6 (16 * 0) + KERNEL7 (16 * 0) + KERNEL8 (16 * 0) + KERNEL9 (16 * 0) + KERNEL10(16 * 0) + KERNEL11(16 * 0) + KERNEL12(16 * 0) + KERNEL13(16 * 0) + KERNEL14(16 * 0) + KERNEL15(16 * 0) + KERNEL16(16 * 0) + cmpq $128 * 1, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 1) + KERNEL2 (16 * 1) + KERNEL3 (16 * 1) + KERNEL4 (16 * 1) + KERNEL5 (16 * 1) + KERNEL6 (16 * 1) + KERNEL7 (16 * 1) + KERNEL8 (16 * 1) + KERNEL9 (16 * 1) + KERNEL10(16 * 1) + KERNEL11(16 * 1) + KERNEL12(16 * 1) + KERNEL13(16 * 1) + KERNEL14(16 * 1) + KERNEL15(16 * 1) + KERNEL16(16 * 1) + cmpq $128 * 2, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 2) + KERNEL2 (16 * 2) + KERNEL3 (16 * 2) + KERNEL4 (16 * 2) + KERNEL5 (16 * 2) + KERNEL6 (16 * 2) + KERNEL7 (16 * 2) + KERNEL8 (16 * 2) + KERNEL9 (16 * 2) + KERNEL10(16 * 2) + KERNEL11(16 * 2) + KERNEL12(16 * 2) + KERNEL13(16 * 2) + KERNEL14(16 * 2) + KERNEL15(16 * 2) + KERNEL16(16 * 2) + cmpq $128 * 3, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 3) + KERNEL2 (16 * 3) + KERNEL3 (16 * 3) + KERNEL4 (16 * 3) + KERNEL5 (16 * 3) + KERNEL6 (16 * 3) + KERNEL7 (16 * 3) + KERNEL8 (16 * 3) + KERNEL9 (16 * 3) + KERNEL10(16 * 3) + KERNEL11(16 * 3) + KERNEL12(16 * 3) + KERNEL13(16 * 3) + KERNEL14(16 * 3) + KERNEL15(16 * 3) + KERNEL16(16 * 3) + cmpq $128 * 4, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 4) + KERNEL2 (16 * 4) + KERNEL3 (16 * 4) + KERNEL4 (16 * 4) + KERNEL5 (16 * 4) + KERNEL6 (16 * 4) + KERNEL7 (16 * 4) + KERNEL8 (16 * 4) + KERNEL9 (16 * 4) + KERNEL10(16 * 4) + KERNEL11(16 * 4) + KERNEL12(16 * 4) + KERNEL13(16 * 4) + KERNEL14(16 * 4) + KERNEL15(16 * 4) + KERNEL16(16 * 4) + cmpq $128 * 5, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 5) + KERNEL2 (16 * 5) + KERNEL3 (16 * 5) + KERNEL4 (16 * 5) + KERNEL5 (16 * 5) + KERNEL6 (16 * 5) + KERNEL7 (16 * 5) + KERNEL8 (16 * 5) + KERNEL9 (16 * 5) + KERNEL10(16 * 5) + KERNEL11(16 * 5) + KERNEL12(16 * 5) + KERNEL13(16 * 5) + KERNEL14(16 * 5) + KERNEL15(16 * 5) + KERNEL16(16 * 5) + cmpq $128 * 6, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 6) + KERNEL2 (16 * 6) + KERNEL3 (16 * 6) + KERNEL4 (16 * 6) + KERNEL5 (16 * 6) + KERNEL6 (16 * 6) + KERNEL7 (16 * 6) + KERNEL8 (16 * 6) + KERNEL9 (16 * 6) + KERNEL10(16 * 6) + KERNEL11(16 * 6) + KERNEL12(16 * 6) + KERNEL13(16 * 6) + KERNEL14(16 * 6) + KERNEL15(16 * 6) + KERNEL16(16 * 6) + cmpq $128 * 7, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 7) + KERNEL2 (16 * 7) + KERNEL3 (16 * 7) + KERNEL4 (16 * 7) + KERNEL5 (16 * 7) + KERNEL6 (16 * 7) + KERNEL7 (16 * 7) + KERNEL8 (16 * 7) + KERNEL9 (16 * 7) + KERNEL10(16 * 7) + KERNEL11(16 * 7) + KERNEL12(16 * 7) + KERNEL13(16 * 7) + KERNEL14(16 * 7) + KERNEL15(16 * 7) + KERNEL16(16 * 7) + + addq $32 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $128 * 8, %rax + jg .L1X + +.L11: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 2), BO # * 64 + ALIGN_4 + +.L12: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movddup ALPHA_R, %xmm14 + movddup ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm5 + movddup 2 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm6 + movddup 3 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm7 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L13 + ALIGN_4 + +.L14: + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + addsubpd %xmm1, %xmm0 + addsubpd %xmm3, %xmm2 + addsubpd %xmm5, %xmm4 + addsubpd %xmm7, %xmm6 + + movapd %xmm0, %xmm1 + movapd %xmm2, %xmm3 + movapd %xmm4, %xmm5 + movapd %xmm6, %xmm7 +#else + addsubpd %xmm0, %xmm1 + addsubpd %xmm2, %xmm3 + addsubpd %xmm4, %xmm5 + addsubpd %xmm6, %xmm7 + + movapd %xmm1, %xmm0 + movapd %xmm3, %xmm2 + movapd %xmm5, %xmm4 + movapd %xmm7, %xmm6 +#endif + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + movsd 2 * SIZE(CO1), %xmm10 + movhpd 3 * SIZE(CO1), %xmm10 + + movsd 0 * SIZE(CO2), %xmm9 + movhpd 1 * SIZE(CO2), %xmm9 + movsd 2 * SIZE(CO2), %xmm11 + movhpd 3 * SIZE(CO2), %xmm11 +#endif + + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + + mulpd %xmm14, %xmm0 + mulpd %xmm14, %xmm2 + mulpd %xmm14, %xmm4 + mulpd %xmm14, %xmm6 + + mulpd %xmm15, %xmm1 + mulpd %xmm15, %xmm3 + mulpd %xmm15, %xmm5 + mulpd %xmm15, %xmm7 + + addsubpd %xmm1, %xmm0 + addsubpd %xmm3, %xmm2 + addsubpd %xmm5, %xmm4 + addsubpd %xmm7, %xmm6 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 + addpd %xmm10, %xmm4 + addpd %xmm11, %xmm6 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm4, 2 * SIZE(CO1) + movhpd %xmm4, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movsd %xmm6, 2 * SIZE(CO2) + movhpd %xmm6, 3 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L10 + ALIGN_4 + +.L30: + testq $1, M + jle .L99 + +.L40: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L42 + +.L41: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 6 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 16 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 17 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 18 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 19 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm3 + movddup 20 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 21 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 22 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 23 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm3 + movddup 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 25 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 26 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 27 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 28 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 29 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 30 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 31 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 40 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L41 + +.L42: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movddup ALPHA_R, %xmm14 + movddup ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + jle .L44 + +.L43: + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L43 + ALIGN_4 + +.L44: + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + addsubpd %xmm1, %xmm0 + addsubpd %xmm3, %xmm2 + + movapd %xmm0, %xmm1 + movapd %xmm2, %xmm3 +#else + addsubpd %xmm0, %xmm1 + addsubpd %xmm2, %xmm3 + + movapd %xmm1, %xmm0 + movapd %xmm3, %xmm2 +#endif + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + + movsd 0 * SIZE(CO2), %xmm9 + movhpd 1 * SIZE(CO2), %xmm9 +#endif + + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + + mulpd %xmm14, %xmm0 + mulpd %xmm14, %xmm2 + + mulpd %xmm15, %xmm1 + mulpd %xmm15, %xmm3 + + addsubpd %xmm1, %xmm0 + addsubpd %xmm3, %xmm2 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + leaq (C, LDC, 2), C # c += 2 * ldc + movq BO, B + decq J # j -- + jg .L01 + +.L100: + testq $1, N + jle .L999 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + + prefetchnta 4 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L112 + +.L111: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm5 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm5 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm1 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm5 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 14 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 40 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm5 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + ADD1 %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 18 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 8 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 20 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 22 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 24 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 26 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 28 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 30 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 32 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 24 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L111 + ALIGN_4 + +.L112: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movddup ALPHA_R, %xmm14 + movddup ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + jle .L114 + +.L113: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + movapd 4 * SIZE(AO), %xmm8 + ADD1 %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L113 + ALIGN_4 + +.L114: + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + addsubpd %xmm1, %xmm0 + addsubpd %xmm5, %xmm4 + + movapd %xmm0, %xmm1 + movapd %xmm4, %xmm5 +#else + addsubpd %xmm0, %xmm1 + addsubpd %xmm4, %xmm5 + + movapd %xmm1, %xmm0 + movapd %xmm5, %xmm4 +#endif + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + movsd 2 * SIZE(CO1), %xmm10 + movhpd 3 * SIZE(CO1), %xmm10 +#endif + + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm5, %xmm5 + + mulpd %xmm14, %xmm0 + mulpd %xmm14, %xmm4 + + mulpd %xmm15, %xmm1 + mulpd %xmm15, %xmm5 + + addsubpd %xmm1, %xmm0 + addsubpd %xmm5, %xmm4 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm8, %xmm0 + addpd %xmm10, %xmm4 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm4, 2 * SIZE(CO1) + movhpd %xmm4, 3 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L110 + ALIGN_4 + +.L130: + testq $1, M + jle .L999 + ALIGN_4 + +.L140: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L142 + +.L141: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 12 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L141 + +.L142: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movddup ALPHA_R, %xmm14 + movddup ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + jle .L144 + +.L143: + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L143 + ALIGN_4 + +.L144: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + SHUFPD_1 %xmm1, %xmm1 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + addsubpd %xmm1, %xmm0 + movapd %xmm0, %xmm1 +#else + addsubpd %xmm0, %xmm1 + movapd %xmm1, %xmm0 +#endif + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 +#endif + + SHUFPD_1 %xmm1, %xmm1 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm1 + addsubpd %xmm1, %xmm0 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm8, %xmm0 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_2x4_nehalem.S b/kernel/x86_64/zgemm_kernel_2x4_nehalem.S new file mode 100644 index 0000000..6a16b7e --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_2x4_nehalem.S @@ -0,0 +1,1628 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %rbp + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rdx +#define BB %r12 + +#define PREA %r10 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define J 64(%rsp) +#define OFFSET 72(%rsp) +#define KK 80(%rsp) +#define KKK 88(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define ALPHA_R 224(%rsp) +#define ALPHA_I 232(%rsp) +#define J 240(%rsp) +#define OFFSET 248(%rsp) +#define KK 256(%rsp) +#define KKK 264(%rsp) + +#endif + +#define PREFETCHSIZE 8 +#define PREFETCH prefetcht0 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 addps +#define ADD2 addps +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 addps +#define ADD2 addps +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 addps +#define ADD2 addps +#else +#define ADD1 addps +#define ADD2 subps +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + movaps %xmm3, %xmm0 + movss OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + +#endif + + unpcklps %xmm0, %xmm0 + unpcklps %xmm1, %xmm1 + + movlps %xmm0, ALPHA_R + movlps %xmm1, ALPHA_I + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + salq $ZBASE_SHIFT, LDC + +#ifdef TRMMKERNEL + movq %r11, OFFSET +#ifndef LEFT + negq %r11 +#endif + movq %r11, KK +#endif + + movq N, J + sarq $2, J + NOBRANCH + jle .L30 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 + movq A, AO + + movq K, %rax + salq $ZBASE_SHIFT + 2, %rax + leaq (B, %rax), BB + + movq M, I + sarq $1, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: + prefetcht2 -32 * SIZE(BB) + subq $-16 * SIZE, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht0 1 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 3 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + movaps -32 * SIZE(AO), %xmm0 + + xorps %xmm12, %xmm12 + prefetcht0 1 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht0 3 * SIZE(CO2, LDC, 1) + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + movaps -28 * SIZE(AO), %xmm7 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + ADD1 %xmm1, %xmm12 + movaps -24 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm7, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm7, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm7, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm7, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm7, %xmm3 + mulps %xmm7, %xmm4 + + ADD1 %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + movaps -20 * SIZE(AO), %xmm7 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + ADD1 %xmm1, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm7, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm7, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + subq $-32 * SIZE, BO + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm7, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm7, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm7, %xmm3 + movaps -16 * SIZE(AO), %xmm0 + mulps %xmm7, %xmm4 + + subq $-16 * SIZE, AO + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + ADD1 %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: + ADD1 %xmm1, %xmm12 + ADD2 %xmm2, %xmm13 + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm10 + pxor %xmm0, %xmm12 + pxor %xmm0, %xmm14 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + pshufd $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 + pxor %xmm0, %xmm13 + pxor %xmm0, %xmm15 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 + pxor %xmm0, %xmm13 + pxor %xmm0, %xmm15 +#endif + + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + haddps %xmm13, %xmm12 + haddps %xmm15, %xmm14 + + shufps $0xd8, %xmm8, %xmm8 + shufps $0xd8, %xmm10, %xmm10 + shufps $0xd8, %xmm12, %xmm12 + shufps $0xd8, %xmm14, %xmm14 + + movaps %xmm8, %xmm9 + shufps $0xe4, %xmm10, %xmm8 + shufps $0xe4, %xmm9, %xmm10 + + movaps %xmm12, %xmm13 + shufps $0xe4, %xmm14, %xmm12 + shufps $0xe4, %xmm13, %xmm14 + + pshufd $0xb1, %xmm8, %xmm9 + pshufd $0xb1, %xmm10, %xmm11 + pshufd $0xb1, %xmm12, %xmm13 + pshufd $0xb1, %xmm14, %xmm15 + + mulps %xmm2, %xmm8 + mulps %xmm3, %xmm9 + mulps %xmm2, %xmm12 + mulps %xmm3, %xmm13 + + mulps %xmm2, %xmm10 + mulps %xmm3, %xmm11 + mulps %xmm2, %xmm14 + mulps %xmm3, %xmm15 + + addsubps %xmm9, %xmm8 + addsubps %xmm11, %xmm10 + addsubps %xmm13, %xmm12 + addsubps %xmm15, %xmm14 + +#ifndef TRMMKERNEL + movups 0 * SIZE(CO1), %xmm0 + movups 0 * SIZE(CO1, LDC), %xmm1 + movups 0 * SIZE(CO2), %xmm2 + movups 0 * SIZE(CO2, LDC), %xmm3 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm10 + addps %xmm2, %xmm12 + addps %xmm3, %xmm14 +#endif + + movups %xmm8, 0 * SIZE(CO1) + movups %xmm10, 0 * SIZE(CO1, LDC) + movups %xmm12, 0 * SIZE(CO2) + movups %xmm14, 0 * SIZE(CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $1, M + BRANCH + jle .L29 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -30 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -16 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -12 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -8 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -26 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -4 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps 0 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -24 * SIZE(AO), %xmm0 + + subq $-32 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + ADD1 %xmm3, %xmm10 + ADD2 %xmm4, %xmm11 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 +#else + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm10 + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 +#endif + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + + pshufd $0xb1, %xmm8, %xmm9 + pshufd $0xb1, %xmm10, %xmm11 + + mulps %xmm2, %xmm8 + mulps %xmm3, %xmm9 + mulps %xmm2, %xmm10 + mulps %xmm3, %xmm11 + + addsubps %xmm9, %xmm8 + addsubps %xmm11, %xmm10 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + movhps (CO1, LDC), %xmm0 + movsd (CO2), %xmm1 + movhps (CO2, LDC), %xmm1 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm10 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, (CO1, LDC) + movsd %xmm10, (CO2) + movhps %xmm10, (CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + leaq (C, LDC, 4), C + movq BO, B + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L30: + testq $2, N + BRANCH + jle .L50 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC), CO2 + movq A, AO + + movq M, I + sarq $1, I + NOBRANCH + jle .L40 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO2) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $-16 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + ADD1 %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + ADD1 %xmm3, %xmm10 + ADD2 %xmm4, %xmm11 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm10 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + pshufd $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 +#endif + + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + + shufps $0xd8, %xmm8, %xmm8 + shufps $0xd8, %xmm10, %xmm10 + + movaps %xmm8, %xmm9 + shufps $0xe4, %xmm10, %xmm8 + shufps $0xe4, %xmm9, %xmm10 + + pshufd $0xb1, %xmm8, %xmm9 + pshufd $0xb1, %xmm10, %xmm11 + + mulps %xmm2, %xmm8 + mulps %xmm3, %xmm9 + + mulps %xmm2, %xmm10 + mulps %xmm3, %xmm11 + + addsubps %xmm9, %xmm8 + addsubps %xmm11, %xmm10 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm1 + movhps 2 * SIZE(CO2), %xmm1 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm10 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + decq I # i -- + BRANCH + jg .L31 + ALIGN_4 + +.L40: + testq $1, M + BRANCH + jle .L49 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -26 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -16 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -24 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_3 + +.L48: + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + pxor %xmm0, %xmm9 + + shufps $0xb1, %xmm9, %xmm9 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0xb1, %xmm9, %xmm9 + + pxor %xmm0, %xmm9 +#else + pxor %xmm0, %xmm8 + + shufps $0xb1, %xmm9, %xmm9 +#endif + + addps %xmm9, %xmm8 + + pshufd $0xb1, %xmm8, %xmm9 + + mulps %xmm2, %xmm8 + mulps %xmm3, %xmm9 + + addsubps %xmm9, %xmm8 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + movhps (CO2), %xmm0 + + addps %xmm0, %xmm8 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, (CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + leaq (C, LDC, 2), C + movq BO, B + ALIGN_4 + +.L50: + testq $1, N + BRANCH + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + movq A, AO + + movq M, I + sarq $1, I + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm8 + movddup -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movddup -30 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movddup -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movddup -26 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AO), %xmm0 + + subq $ -8 * SIZE, BO + subq $-16 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L52 + ALIGN_3 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + ADD1 %xmm1, %xmm8 + movddup -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_3 + +.L58: + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + pxor %xmm0, %xmm8 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + pshufd $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm9 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + pxor %xmm0, %xmm9 +#endif + + haddps %xmm9, %xmm8 + + shufps $0xd8, %xmm8, %xmm8 + pshufd $0xb1, %xmm8, %xmm9 + + mulps %xmm2, %xmm8 + mulps %xmm3, %xmm9 + + addsubps %xmm9, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + + addps %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 + decq I # i -- + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd -32 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -30 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -26 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -26 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -24 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L62 + ALIGN_3 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -30 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_3 + +.L68: + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + pxor %xmm0, %xmm9 + + shufps $0xb1, %xmm9, %xmm9 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0xb1, %xmm9, %xmm9 + + pxor %xmm0, %xmm9 +#else + pxor %xmm0, %xmm8 + + shufps $0xb1, %xmm9, %xmm9 +#endif + + addps %xmm9, %xmm8 + + pshufd $0xb1, %xmm8, %xmm9 + + mulps %xmm2, %xmm8 + mulps %xmm3, %xmm9 + + addsubps %xmm9, %xmm8 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + addps %xmm0, %xmm8 +#endif + + movsd %xmm8, (CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_4x2_barcelona.S b/kernel/x86_64/zgemm_kernel_4x2_barcelona.S new file mode 100644 index 0000000..c59a50d --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_4x2_barcelona.S @@ -0,0 +1,2226 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define POSINV 0(%rsp) +#define ALPHA_R 16(%rsp) +#define ALPHA_I 32(%rsp) +#define J 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER 128(%rsp) + +#ifdef OPTERON +#define movsd movlps +#endif + +#define PREFETCH prefetch +#define PREFETCHSIZE (16 * 17 + 0) + +#define RPREFETCHSIZE (16 * 9 + 0) +#define WPREFETCHSIZE (16 * 9 + 0) + +#define KERNEL1(xx) \ + mulps %xmm1, %xmm0 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm8 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO, %rax, 4) ;\ + movaps %xmm2, %xmm0 ;\ + addps %xmm1, %xmm12 ;\ + movaps -24 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm0, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps -20 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm0 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm10 ;\ + movaps -24 * SIZE(AO, %rax, 4), %xmm0 ;\ + addps %xmm1, %xmm14 ;\ + movaps -16 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps -12 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm0, %xmm2 + +#define KERNEL2(xx) \ + mulps %xmm1, %xmm0 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm8 ;\ + movaps %xmm2, %xmm0 ;\ + addps %xmm1, %xmm12 ;\ + movaps -8 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm0, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps -4 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm0 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm10 ;\ + addps %xmm1, %xmm14 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 4 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm4, %xmm2 + +#define KERNEL3(xx) \ + mulps %xmm5, %xmm4 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm8 ;\ + movaps 32 * SIZE(BO, %rax, 8), %xmm1 ;\ + movaps %xmm2, %xmm4 ;\ + addps %xmm5, %xmm12 ;\ + movaps 8 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm4, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 12 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm4 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm10 ;\ + movaps -8 * SIZE(AO, %rax, 4), %xmm4 ;\ + addps %xmm5, %xmm14 ;\ + movaps 16 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 20 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm4, %xmm2 + +#define KERNEL4(xx) \ + mulps %xmm5, %xmm4 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ + movaps (AO, %rax, 4), %xmm6 ;\ + addps %xmm4, %xmm8 ;\ + movaps %xmm2, %xmm4 ;\ + addps %xmm5, %xmm12 ;\ + movaps 24 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm4, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 28 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm4 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm10 ;\ + addps %xmm5, %xmm14 ;\ + movaps 64 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 36 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm6, %xmm2 + +#define KERNEL5(xx) \ + mulps %xmm1, %xmm6 ;\ + mulps 4 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm6, %xmm8 ;\ + movaps %xmm2, %xmm6 ;\ + addps %xmm1, %xmm12 ;\ + movaps 40 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps 16 * SIZE(AO, %rax, 4), %xmm7 ;\ + movaps %xmm6, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 44 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm6 ;\ + mulps 4 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm6, %xmm10 ;\ + movaps 8 * SIZE(AO, %rax, 4), %xmm6 ;\ + addps %xmm1, %xmm14 ;\ + movaps 48 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 52 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm6, %xmm2 + +#define KERNEL6(xx) \ + mulps %xmm1, %xmm6 ;\ + mulps 12 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm6, %xmm8 ;\ + movaps %xmm2, %xmm6 ;\ + addps %xmm1, %xmm12 ;\ + movaps 56 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm6, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 60 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm6 ;\ + mulps 12 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm6, %xmm10 ;\ + movaps 32 * SIZE(AO, %rax, 4), %xmm0 ;\ + addps %xmm1, %xmm14 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 68 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm7, %xmm2 + +#define KERNEL7(xx) \ + mulps %xmm5, %xmm7 ;\ + mulps 20 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm7, %xmm8 ;\ + movaps 96 * SIZE(BO, %rax, 8), %xmm1 ;\ + movaps %xmm2, %xmm7 ;\ + addps %xmm5, %xmm12 ;\ + movaps 72 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm7, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 76 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm7 ;\ + mulps 20 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm7, %xmm10 ;\ + movaps 24 * SIZE(AO, %rax, 4), %xmm7 ;\ + addps %xmm5, %xmm14 ;\ + movaps 80 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 84 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm7, %xmm2 + +#define KERNEL8(xx) \ + mulps %xmm5, %xmm7 ;\ + mulps 28 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm7, %xmm8 ;\ + movaps %xmm2, %xmm7 ;\ + addps %xmm5, %xmm12 ;\ + movaps 88 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm7, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 92 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm7 ;\ + mulps 28 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm7, %xmm10 ;\ + movaps 48 * SIZE(AO, %rax, 4), %xmm4 ;\ + addps %xmm5, %xmm14 ;\ + movaps 128 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 100 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm0, %xmm2 ;\ + addq $16 * SIZE, %rax + +#define KERNEL_SUB1(xx) \ + mulps %xmm1, %xmm0 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm8 ;\ + movaps %xmm2, %xmm0 ;\ + addps %xmm1, %xmm12 ;\ + movaps -24 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm0, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps -20 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm0 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm10 ;\ + movaps -24 * SIZE(AO, %rax, 4), %xmm0 ;\ + addps %xmm1, %xmm14 ;\ + movaps -16 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps -12 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm0, %xmm2 + +#define KERNEL_SUB2(xx) \ + mulps %xmm1, %xmm0 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm8 ;\ + movaps %xmm2, %xmm0 ;\ + addps %xmm1, %xmm12 ;\ + movaps -8 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm0, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps -4 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm0 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm10 ;\ + movaps (AO, %rax, 4), %xmm0 ;\ + addps %xmm1, %xmm14 ;\ + movaps 32 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 4 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm4, %xmm2 + +#define KERNEL_SUB3(xx) \ + mulps %xmm5, %xmm4 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm8 ;\ + movaps %xmm2, %xmm4 ;\ + addps %xmm5, %xmm12 ;\ + movaps 8 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm4, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 12 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm4 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm10 ;\ + movaps -8 * SIZE(AO, %rax, 4), %xmm4 ;\ + addps %xmm5, %xmm14 ;\ + movaps 16 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 20 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm4, %xmm2 + +#define KERNEL_SUB4(xx) \ + mulps %xmm5, %xmm4 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm8 ;\ + movaps %xmm2, %xmm4 ;\ + addps %xmm5, %xmm12 ;\ + movaps 24 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm4, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 28 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm4 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm10 ;\ + addps %xmm5, %xmm14 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 36 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm0, %xmm2 + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) && !defined(TRMMKERNEL) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq 72(%rsp), LDC +#ifdef TRMMKERNEL + movsd 80(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + pxor %xmm7, %xmm7 + cmpeqps %xmm7, %xmm7 + pslld $31, %xmm7 # Generate mask + pxor %xmm10, %xmm10 + + shufps $0, %xmm0, %xmm0 + movaps %xmm0, 0 + ALPHA_R + + movss %xmm1, 4 + ALPHA_I + movss %xmm1, 12 + ALPHA_I + xorps %xmm7, %xmm1 + movss %xmm1, 0 + ALPHA_I + movss %xmm1, 8 + ALPHA_I + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + movss %xmm7, 0 + POSINV + movss %xmm10, 4 + POSINV + movss %xmm7, 8 + POSINV + movss %xmm10,12 + POSINV +#else + movss %xmm10, 0 + POSINV + movss %xmm7, 4 + POSINV + movss %xmm10, 8 + POSINV + movss %xmm7, 12 + POSINV +#endif + + addq $32 * SIZE, A + +#ifdef TRMMKERNEL + movsd %xmm12, OFFSET + movsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + salq $ZBASE_SHIFT, LDC + movq N, J + sarq $1, J # j = (n >> 2) + jle .L40 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + movaps POSINV, %xmm15 + + movq K, %rax + sarq $2, %rax + jle .L03 + + addq %rax, %rax + ALIGN_4 + +.L02: + prefetch (RPREFETCHSIZE + 0) * SIZE(B) + + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + prefetchw (WPREFETCHSIZE + 16) * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 + xorps %xmm15, %xmm5 + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 + xorps %xmm15, %xmm4 + xorps %xmm15, %xmm6 +#endif + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L02 + ALIGN_4 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 +#endif + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + leaq (RPREFETCHSIZE + 0) * SIZE(B), BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + prefetch -20 * SIZE(BB) + prefetch 28 * SIZE(BB) + subq $-32 * SIZE, BB + + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm1 + pxor %xmm8, %xmm8 + movaps -28 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movaps -16 * SIZE(AO), %xmm4 + pxor %xmm10, %xmm10 + movaps 0 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + + prefetchw 7 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetchw 7 * SIZE(CO2) + pxor %xmm13, %xmm13 + pxor %xmm14, %xmm14 + pxor %xmm15, %xmm15 + + movaps %xmm0, %xmm2 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + andq $-8, %rax + + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_3 + +.L12: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + BRANCH + jl .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + testq $4, %rax + je .L16 + xorq %rax, %rax + ALIGN_3 + + KERNEL_SUB1(32 * 0) + KERNEL_SUB2(32 * 0) + KERNEL_SUB3(32 * 0) + KERNEL_SUB4(32 * 0) + + addq $64 * SIZE, BO + addq $32 * SIZE, AO + ALIGN_3 + +.L16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + ALIGN_4 + +.L17: + mulps %xmm1, %xmm0 + mulps -28 * SIZE(AO, %rax, 4), %xmm1 + addps %xmm0, %xmm8 + movaps %xmm2, %xmm0 + addps %xmm1, %xmm12 + movaps -24 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm3, %xmm2 + mulps -28 * SIZE(AO, %rax, 4), %xmm3 + addps %xmm2, %xmm9 + movaps %xmm0, %xmm2 + addps %xmm3, %xmm13 + movaps -20 * SIZE(BO, %rax, 8), %xmm3 + mulps %xmm1, %xmm0 + mulps -28 * SIZE(AO, %rax, 4), %xmm1 + addps %xmm0, %xmm10 + movaps -24 * SIZE(AO, %rax, 4), %xmm0 + addps %xmm1, %xmm14 + movaps -16 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm3, %xmm2 + mulps -28 * SIZE(AO, %rax, 4), %xmm3 + addps %xmm2, %xmm11 + addps %xmm3, %xmm15 + movaps -12 * SIZE(BO, %rax, 8), %xmm3 + movaps %xmm0, %xmm2 + + addq $SIZE * 2, %rax + jl .L17 + ALIGN_4 + +.L18: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm2 + movhps 6 * SIZE(CO1), %xmm2 + + movsd 0 * SIZE(CO2), %xmm1 + movhps 2 * SIZE(CO2), %xmm1 + movsd 4 * SIZE(CO2), %xmm3 + movhps 6 * SIZE(CO2), %xmm3 +#endif + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + shufps $0xb1, %xmm13, %xmm13 + shufps $0xb1, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm9, %xmm8 + subps %xmm11, %xmm10 + subps %xmm13, %xmm12 + subps %xmm15, %xmm14 +#else + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + addps %xmm13, %xmm12 + addps %xmm15, %xmm14 +#endif + + movaps %xmm8, %xmm9 + movaps %xmm10, %xmm11 + movaps %xmm12, %xmm13 + movaps %xmm14, %xmm15 + + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm10, %xmm10 + shufps $0xb1, %xmm12, %xmm12 + shufps $0xb1, %xmm14, %xmm14 + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + mulps %xmm6, %xmm11 + mulps %xmm7, %xmm10 + + mulps %xmm6, %xmm13 + mulps %xmm7, %xmm12 + mulps %xmm6, %xmm15 + mulps %xmm7, %xmm14 + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + addps %xmm13, %xmm12 + addps %xmm15, %xmm14 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 + addps %xmm1, %xmm10 + addps %xmm2, %xmm12 + addps %xmm3, %xmm14 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + movsd %xmm14, 4 * SIZE(CO2) + movhps %xmm14, 6 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + movaps -16 * SIZE(AO), %xmm2 + movaps 0 * SIZE(AO), %xmm4 + movaps 16 * SIZE(AO), %xmm6 + + movaps -32 * SIZE(BO), %xmm1 + movaps -16 * SIZE(BO), %xmm3 + movaps 0 * SIZE(BO), %xmm5 + movaps 16 * SIZE(BO), %xmm7 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps 32 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movaps -28 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm8 + movaps -12 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + movaps -8 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm3 + mulps -4 * SIZE(BO), %xmm0 + addps %xmm3, %xmm10 + movaps 48 * SIZE(BO), %xmm3 + addps %xmm0, %xmm11 + movaps -24 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm5 + addps %xmm5, %xmm8 + movaps 4 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm5 + addps %xmm5, %xmm9 + movaps 8 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm5 + mulps 12 * SIZE(BO), %xmm0 + addps %xmm5, %xmm10 + movaps 64 * SIZE(BO), %xmm5 + addps %xmm0, %xmm11 + movaps -20 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm7 + addps %xmm7, %xmm8 + movaps 20 * SIZE(BO), %xmm7 + mulps %xmm0, %xmm7 + addps %xmm7, %xmm9 + movaps 24 * SIZE(BO), %xmm7 + mulps %xmm0, %xmm7 + mulps 28 * SIZE(BO), %xmm0 + addps %xmm7, %xmm10 + movaps 80 * SIZE(BO), %xmm7 + addps %xmm0, %xmm11 + movaps 0 * SIZE(AO), %xmm0 + + mulps %xmm2, %xmm1 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addps %xmm1, %xmm8 + movaps 36 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm9 + movaps 40 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm1 + mulps 44 * SIZE(BO), %xmm2 + addps %xmm1, %xmm10 + movaps 96 * SIZE(BO), %xmm1 + addps %xmm2, %xmm11 + movaps -12 * SIZE(AO), %xmm2 + + mulps %xmm2, %xmm3 + addps %xmm3, %xmm8 + movaps 52 * SIZE(BO), %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm9 + movaps 56 * SIZE(BO), %xmm3 + mulps %xmm2, %xmm3 + mulps 60 * SIZE(BO), %xmm2 + addps %xmm3, %xmm10 + movaps 112 * SIZE(BO), %xmm3 + addps %xmm2, %xmm11 + movaps -8 * SIZE(AO), %xmm2 + + mulps %xmm2, %xmm5 + addps %xmm5, %xmm8 + movaps 68 * SIZE(BO), %xmm5 + mulps %xmm2, %xmm5 + addps %xmm5, %xmm9 + movaps 72 * SIZE(BO), %xmm5 + mulps %xmm2, %xmm5 + mulps 76 * SIZE(BO), %xmm2 + addps %xmm5, %xmm10 + movaps 128 * SIZE(BO), %xmm5 + addps %xmm2, %xmm11 + movaps -4 * SIZE(AO), %xmm2 + + mulps %xmm2, %xmm7 + addps %xmm7, %xmm8 + movaps 84 * SIZE(BO), %xmm7 + mulps %xmm2, %xmm7 + addps %xmm7, %xmm9 + movaps 88 * SIZE(BO), %xmm7 + mulps %xmm2, %xmm7 + mulps 92 * SIZE(BO), %xmm2 + addps %xmm7, %xmm10 + movaps 144 * SIZE(BO), %xmm7 + addps %xmm2, %xmm11 + movaps 16 * SIZE(AO), %xmm2 + + subq $ -32 * SIZE, AO + subq $-128 * SIZE, BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps -16 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movaps -28 * SIZE(AO), %xmm0 + + subq $- 4 * SIZE, AO + subq $-16 * SIZE, BO + decq %rax + jg .L26 + ALIGN_4 + +.L28: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + + movsd 0 * SIZE(CO2), %xmm1 + movhps 2 * SIZE(CO2), %xmm1 +#endif + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm9, %xmm8 + subps %xmm11, %xmm10 +#else + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 +#endif + + movaps %xmm8, %xmm9 + movaps %xmm10, %xmm11 + + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm10, %xmm10 + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + mulps %xmm6, %xmm11 + mulps %xmm7, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 + addps %xmm1, %xmm10 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + movaps -24 * SIZE(AO), %xmm2 + + movaps -32 * SIZE(BO), %xmm1 + movaps -16 * SIZE(BO), %xmm3 + movaps 0 * SIZE(BO), %xmm5 + movaps 16 * SIZE(BO), %xmm7 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm1, %xmm8 + movsd -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movsd -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm10 + movsd -20 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm1, %xmm11 + movsd 32 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm8 + movsd -12 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + movsd -8 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm10 + movsd -4 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm3 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm3, %xmm11 + movsd 48 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm5 + addps %xmm5, %xmm8 + movsd 4 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm5 + addps %xmm5, %xmm9 + movsd 8 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm5 + addps %xmm5, %xmm10 + movsd 12 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm5 + movsd -26 * SIZE(AO), %xmm0 + addps %xmm5, %xmm11 + movsd 64 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm7 + addps %xmm7, %xmm8 + movsd 20 * SIZE(BO), %xmm7 + mulps %xmm0, %xmm7 + addps %xmm7, %xmm9 + movsd 24 * SIZE(BO), %xmm7 + mulps %xmm0, %xmm7 + addps %xmm7, %xmm10 + movsd 28 * SIZE(BO), %xmm7 + mulps %xmm0, %xmm7 + movsd -16 * SIZE(AO), %xmm0 + addps %xmm7, %xmm11 + movsd 80 * SIZE(BO), %xmm7 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm8 + movsd 36 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm9 + movsd 40 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm10 + movsd 44 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm1 + movsd -22 * SIZE(AO), %xmm2 + addps %xmm1, %xmm11 + movsd 96 * SIZE(BO), %xmm1 + + mulps %xmm2, %xmm3 + addps %xmm3, %xmm8 + movsd 52 * SIZE(BO), %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm9 + movsd 56 * SIZE(BO), %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm10 + movsd 60 * SIZE(BO), %xmm3 + mulps %xmm2, %xmm3 + movsd -20 * SIZE(AO), %xmm2 + addps %xmm3, %xmm11 + movsd 112 * SIZE(BO), %xmm3 + + mulps %xmm2, %xmm5 + addps %xmm5, %xmm8 + movsd 68 * SIZE(BO), %xmm5 + mulps %xmm2, %xmm5 + addps %xmm5, %xmm9 + movsd 72 * SIZE(BO), %xmm5 + mulps %xmm2, %xmm5 + addps %xmm5, %xmm10 + movsd 76 * SIZE(BO), %xmm5 + mulps %xmm2, %xmm5 + movsd -18 * SIZE(AO), %xmm2 + addps %xmm5, %xmm11 + movsd 128 * SIZE(BO), %xmm5 + + mulps %xmm2, %xmm7 + addps %xmm7, %xmm8 + movsd 84 * SIZE(BO), %xmm7 + mulps %xmm2, %xmm7 + addps %xmm7, %xmm9 + movsd 88 * SIZE(BO), %xmm7 + mulps %xmm2, %xmm7 + addps %xmm7, %xmm10 + movsd 92 * SIZE(BO), %xmm7 + mulps %xmm2, %xmm7 + movsd -8 * SIZE(AO), %xmm2 + addps %xmm7, %xmm11 + movsd 144 * SIZE(BO), %xmm7 + + subq $ -16 * SIZE, AO + subq $-128 * SIZE, BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movsd -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movsd -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm10 + movsd -20 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm1, %xmm11 + movsd -16 * SIZE(BO), %xmm1 + + subq $ -2 * SIZE, AO + subq $-16 * SIZE, BO + decq %rax + jg .L36 + ALIGN_4 + +.L38: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm1 +#endif + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm9, %xmm8 + subps %xmm11, %xmm10 +#else + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 +#endif + + movaps %xmm8, %xmm9 + movaps %xmm10, %xmm11 + + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm10, %xmm10 + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + mulps %xmm6, %xmm11 + mulps %xmm7, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 + addps %xmm1, %xmm10 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leaq (C, LDC, 2), C # c += 2 * ldc + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $1, N + je .L999 + ALIGN_4 + +.L41: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + movaps POSINV, %xmm15 + + movq K, %rax + sarq $2, %rax + jle .L43 + ALIGN_4 + +.L42: + prefetch (RPREFETCHSIZE + 0) * SIZE(B) + + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + prefetchw (WPREFETCHSIZE + 16) * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 + xorps %xmm15, %xmm5 + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 + xorps %xmm15, %xmm4 + xorps %xmm15, %xmm6 +#endif + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L42 + ALIGN_4 + +.L43: + movq K, %rax + andq $3, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L44: + movsd 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + addq $2 * SIZE, B + addq $8 * SIZE, BO + decq %rax + jne .L44 + ALIGN_4 + +.L50: + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movaps -16 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movaps 0 * SIZE(AO), %xmm4 + pxor %xmm10, %xmm10 + movaps 16 * SIZE(AO), %xmm6 + pxor %xmm11, %xmm11 + + movaps -32 * SIZE(BO), %xmm1 + pxor %xmm12, %xmm12 + movaps -16 * SIZE(BO), %xmm3 + pxor %xmm13, %xmm13 + movaps 0 * SIZE(BO), %xmm5 + pxor %xmm14, %xmm14 + movaps 16 * SIZE(BO), %xmm7 + pxor %xmm15, %xmm15 + + prefetchw 7 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movaps -28 * SIZE(AO), %xmm0 + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm12 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm0, %xmm13 + movaps -24 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movaps -20 * SIZE(AO), %xmm0 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm12 + movaps 32 * SIZE(BO), %xmm1 + addps %xmm0, %xmm13 + movaps 32 * SIZE(AO), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + + mulps %xmm2, %xmm3 + mulps -12 * SIZE(BO), %xmm2 + addps %xmm3, %xmm8 + movaps -16 * SIZE(BO), %xmm3 + addps %xmm2, %xmm9 + movaps -12 * SIZE(AO), %xmm2 + mulps %xmm2, %xmm3 + mulps -12 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps -8 * SIZE(BO), %xmm3 + addps %xmm2, %xmm13 + movaps -8 * SIZE(AO), %xmm2 + + mulps %xmm2, %xmm3 + mulps -4 * SIZE(BO), %xmm2 + addps %xmm3, %xmm8 + movaps -8 * SIZE(BO), %xmm3 + addps %xmm2, %xmm9 + movaps -4 * SIZE(AO), %xmm2 + mulps %xmm2, %xmm3 + mulps -4 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps 48 * SIZE(BO), %xmm3 + addps %xmm2, %xmm13 + movaps 48 * SIZE(AO), %xmm2 + + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) + + mulps %xmm4, %xmm5 + mulps 4 * SIZE(BO), %xmm4 + addps %xmm5, %xmm8 + movaps 0 * SIZE(BO), %xmm5 + addps %xmm4, %xmm9 + movaps 4 * SIZE(AO), %xmm4 + mulps %xmm4, %xmm5 + mulps 4 * SIZE(BO), %xmm4 + addps %xmm5, %xmm12 + movaps 8 * SIZE(BO), %xmm5 + addps %xmm4, %xmm13 + movaps 8 * SIZE(AO), %xmm4 + + mulps %xmm4, %xmm5 + mulps 12 * SIZE(BO), %xmm4 + addps %xmm5, %xmm8 + movaps 8 * SIZE(BO), %xmm5 + addps %xmm4, %xmm9 + movaps 12 * SIZE(AO), %xmm4 + mulps %xmm4, %xmm5 + mulps 12 * SIZE(BO), %xmm4 + addps %xmm5, %xmm12 + movaps 64 * SIZE(BO), %xmm5 + addps %xmm4, %xmm13 + movaps 64 * SIZE(AO), %xmm4 + + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) + + mulps %xmm6, %xmm7 + mulps 20 * SIZE(BO), %xmm6 + addps %xmm7, %xmm8 + movaps 16 * SIZE(BO), %xmm7 + addps %xmm6, %xmm9 + movaps 20 * SIZE(AO), %xmm6 + mulps %xmm6, %xmm7 + mulps 20 * SIZE(BO), %xmm6 + addps %xmm7, %xmm12 + movaps 24 * SIZE(BO), %xmm7 + addps %xmm6, %xmm13 + movaps 24 * SIZE(AO), %xmm6 + + mulps %xmm6, %xmm7 + mulps 28 * SIZE(BO), %xmm6 + addps %xmm7, %xmm8 + movaps 24 * SIZE(BO), %xmm7 + addps %xmm6, %xmm9 + movaps 28 * SIZE(AO), %xmm6 + mulps %xmm6, %xmm7 + mulps 28 * SIZE(BO), %xmm6 + addps %xmm7, %xmm12 + movaps 80 * SIZE(BO), %xmm7 + addps %xmm6, %xmm13 + movaps 80 * SIZE(AO), %xmm6 + + subq $-64 * SIZE, AO + subq $-64 * SIZE, BO + + decq %rax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movaps -28 * SIZE(AO), %xmm0 + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm12 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm0, %xmm13 + movaps -24 * SIZE(AO), %xmm0 + + addq $ 8 * SIZE, AO + addq $ 8 * SIZE, BO + decq %rax + jg .L56 + ALIGN_4 + +.L58: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm2 + movhps 6 * SIZE(CO1), %xmm2 +#endif + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm9, %xmm8 + subps %xmm13, %xmm12 +#else + addps %xmm9, %xmm8 + addps %xmm13, %xmm12 +#endif + + movaps %xmm8, %xmm9 + movaps %xmm12, %xmm13 + + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm12, %xmm12 + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + mulps %xmm6, %xmm13 + mulps %xmm7, %xmm12 + + addps %xmm9, %xmm8 + addps %xmm13, %xmm12 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 + addps %xmm2, %xmm12 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movaps -16 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + + movaps -32 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movaps -16 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + movaps 0 * SIZE(BO), %xmm5 + movaps 16 * SIZE(BO), %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movaps -28 * SIZE(AO), %xmm0 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps 32 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movaps -24 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm3 + mulps -12 * SIZE(BO), %xmm0 + addps %xmm3, %xmm8 + movaps -8 * SIZE(BO), %xmm3 + addps %xmm0, %xmm9 + movaps -20 * SIZE(AO), %xmm0 + mulps %xmm0, %xmm3 + mulps -4 * SIZE(BO), %xmm0 + addps %xmm3, %xmm10 + movaps 48 * SIZE(BO), %xmm3 + addps %xmm0, %xmm11 + movaps 0 * SIZE(AO), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + + mulps %xmm2, %xmm5 + mulps 4 * SIZE(BO), %xmm2 + addps %xmm5, %xmm8 + movaps 8 * SIZE(BO), %xmm5 + addps %xmm2, %xmm9 + movaps -12 * SIZE(AO), %xmm2 + mulps %xmm2, %xmm5 + mulps 12 * SIZE(BO), %xmm2 + addps %xmm5, %xmm10 + movaps 64 * SIZE(BO), %xmm5 + addps %xmm2, %xmm11 + movaps -8 * SIZE(AO), %xmm2 + + mulps %xmm2, %xmm7 + mulps 20 * SIZE(BO), %xmm2 + addps %xmm7, %xmm8 + movaps 24 * SIZE(BO), %xmm7 + addps %xmm2, %xmm9 + movaps -4 * SIZE(AO), %xmm2 + mulps %xmm2, %xmm7 + mulps 28 * SIZE(BO), %xmm2 + addps %xmm7, %xmm10 + movaps 80 * SIZE(BO), %xmm7 + addps %xmm2, %xmm11 + movaps 16 * SIZE(AO), %xmm2 + + subq $-32 * SIZE, AO + subq $-64 * SIZE, BO + + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L68: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 +#endif + + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + shufps $0xb1, %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm9, %xmm8 +#else + addps %xmm9, %xmm8 +#endif + + movaps %xmm8, %xmm9 + + shufps $0xb1, %xmm8, %xmm8 + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + + addps %xmm9, %xmm8 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L70: + testq $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movaps -24 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + + movaps -32 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movaps -16 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + movaps 0 * SIZE(BO), %xmm5 + movaps 16 * SIZE(BO), %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm1, %xmm8 + movsd -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm1, %xmm9 + movsd -24 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm1 + addps %xmm1, %xmm10 + movsd -20 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm1, %xmm11 + movsd 32 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm8 + movsd -12 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm3 + movsd -26 * SIZE(AO), %xmm0 + addps %xmm3, %xmm9 + movsd -8 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm10 + movsd -4 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm3 + movsd -16 * SIZE(AO), %xmm0 + addps %xmm3, %xmm11 + movsd 48 * SIZE(BO), %xmm3 + + mulps %xmm2, %xmm5 + addps %xmm5, %xmm8 + movsd 4 * SIZE(BO), %xmm5 + mulps %xmm2, %xmm5 + movsd -22 * SIZE(AO), %xmm2 + addps %xmm5, %xmm9 + movsd 8 * SIZE(BO), %xmm5 + + mulps %xmm2, %xmm5 + addps %xmm5, %xmm10 + movsd 12 * SIZE(BO), %xmm5 + mulps %xmm2, %xmm5 + movsd -20 * SIZE(AO), %xmm2 + addps %xmm5, %xmm11 + movsd 64 * SIZE(BO), %xmm5 + + mulps %xmm2, %xmm7 + addps %xmm7, %xmm8 + movsd 20 * SIZE(BO), %xmm7 + mulps %xmm2, %xmm7 + movsd -18 * SIZE(AO), %xmm2 + addps %xmm7, %xmm9 + movsd 24 * SIZE(BO), %xmm7 + + mulps %xmm2, %xmm7 + addps %xmm7, %xmm10 + movsd 28 * SIZE(BO), %xmm7 + mulps %xmm2, %xmm7 + movsd -8 * SIZE(AO), %xmm2 + addps %xmm7, %xmm11 + movsd 80 * SIZE(BO), %xmm7 + + subq $-16 * SIZE, AO + subq $-64 * SIZE, BO + + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movsd -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm1, %xmm9 + movsd -24 * SIZE(BO), %xmm1 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jg .L76 + ALIGN_4 + +.L78: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 +#endif + + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + shufps $0xb1, %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm9, %xmm8 +#else + addps %xmm9, %xmm8 +#endif + + movaps %xmm8, %xmm9 + + shufps $0xb1, %xmm8, %xmm8 + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + + addps %xmm9, %xmm8 +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 +#endif + movsd %xmm8, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %rbx, %rsp + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_4x2_core2.S b/kernel/x86_64/zgemm_kernel_4x2_core2.S new file mode 100644 index 0000000..1b5d9a0 --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_4x2_core2.S @@ -0,0 +1,1744 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define ALPHA_R 0(%rsp) +#define ALPHA_I 16(%rsp) +#define J 32(%rsp) +#define OFFSET 40(%rsp) +#define KK 48(%rsp) +#define KKK 56(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH_R (16 * 4 + 0) +#define PREFETCH_W (PREFETCH_R * 4) + +#define PREFETCHSIZE (16 * 13 + 10) +#define PREFETCH prefetcht0 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADDSUB addps +#else +#define ADDSUB subps +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + +#endif + + movq %rsp, %r15 # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + pxor %xmm7, %xmm7 + cmpeqps %xmm7, %xmm7 + pslld $31, %xmm7 # Generate mask + + shufps $0, %xmm0, %xmm0 + movaps %xmm0, 0 + ALPHA_R + + movss %xmm1, 4 + ALPHA_I + movss %xmm1, 12 + ALPHA_I + xorps %xmm7, %xmm1 + movss %xmm1, 0 + ALPHA_I + movss %xmm1, 8 + ALPHA_I + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + +#ifdef TRMMKERNEL + movsd %xmm12, OFFSET + movsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq OLD_M, M + movq OLD_N, N + + salq $ZBASE_SHIFT, LDC + movq N, J + sarq $1, J # j = (n >> 2) + jle .L40 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq 32 * SIZE + BUFFER, BO + + movaps -32 * SIZE(B), %xmm3 + + movq K, %rax + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + movaps -28 * SIZE(B), %xmm7 + movaps -24 * SIZE(B), %xmm11 + movaps -20 * SIZE(B), %xmm15 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) + pshufd $0x00, %xmm3, %xmm0 + movaps %xmm0, -32 * SIZE(BO) + pshufd $0x55, %xmm3, %xmm1 + movaps %xmm1, -28 * SIZE(BO) + pshufd $0xaa, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(BO) + pshufd $0xff, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(BO) + + movaps -16 * SIZE(B), %xmm3 + + prefetcht0 (PREFETCH_W + 16) * SIZE(BO) + pshufd $0x00, %xmm7, %xmm4 + movaps %xmm4, -16 * SIZE(BO) + pshufd $0x55, %xmm7, %xmm5 + movaps %xmm5, -12 * SIZE(BO) + pshufd $0xaa, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(BO) + pshufd $0xff, %xmm7, %xmm7 + movaps %xmm7, -4 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 32) * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm8 + movaps %xmm8, 0 * SIZE(BO) + pshufd $0x55, %xmm11, %xmm9 + movaps %xmm9, 4 * SIZE(BO) + pshufd $0xaa, %xmm11, %xmm10 + movaps %xmm10, 8 * SIZE(BO) + pshufd $0xff, %xmm11, %xmm11 + movaps %xmm11, 12 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 48) * SIZE(BO) + + pshufd $0x00, %xmm15, %xmm12 + movaps %xmm12, 16 * SIZE(BO) + pshufd $0x55, %xmm15, %xmm13 + movaps %xmm13, 20 * SIZE(BO) + pshufd $0xaa, %xmm15, %xmm14 + movaps %xmm14, 24 * SIZE(BO) + pshufd $0xff, %xmm15, %xmm15 + movaps %xmm15, 28 * SIZE(BO) + + subq $-16 * SIZE, B + subq $-64 * SIZE, BO + subq $1, %rax + jne .L02 + ALIGN_4 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + pshufd $0x00, %xmm3, %xmm0 + movaps %xmm0, -32 * SIZE(BO) + pshufd $0x55, %xmm3, %xmm1 + movaps %xmm1, -28 * SIZE(BO) + pshufd $0xaa, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(BO) + pshufd $0xff, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(BO) + + movaps -28 * SIZE(B), %xmm3 + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + subq $1, %rax + jne .L04 + ALIGN_4 + +.L10: + leaq (PREFETCH_R + 0) * SIZE(B), BB + + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 40 * SIZE + BUFFER, BO +#else + leaq 40 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + movaps -40 * SIZE(BO), %xmm6 + movaps -36 * SIZE(BO), %xmm7 + + prefetcht2 -32 * SIZE(BB) + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + pxor %xmm12, %xmm12 + prefetcht0 7 * SIZE(CO1) + pxor %xmm13, %xmm13 + pxor %xmm14, %xmm14 + pxor %xmm15, %xmm15 + + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + prefetcht0 7 * SIZE(CO2) + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + + subq $-32 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L15 + ALIGN_4 + +.L12: + addps %xmm2, %xmm10 + movaps -32 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + PADDING; + movaps %xmm6, %xmm3 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + ADDSUB %xmm4, %xmm11 + movaps -28 * SIZE(BO), %xmm4 + ADDSUB %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps -24 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + ADDSUB %xmm7, %xmm9 + movaps -20 * SIZE(BO), %xmm7 + ADDSUB %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -20 * SIZE(AO), %xmm1 + + addps %xmm2, %xmm10 + movaps -16 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + ADDSUB %xmm4, %xmm11 + movaps -12 * SIZE(BO), %xmm4 + ADDSUB %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps -8 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + ADDSUB %xmm7, %xmm9 + movaps -4 * SIZE(BO), %xmm7 + ADDSUB %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -12 * SIZE(AO), %xmm1 + + addps %xmm2, %xmm10 + movaps 0 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + PADDING; + movaps %xmm6, %xmm3 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + ADDSUB %xmm4, %xmm11 + movaps 4 * SIZE(BO), %xmm4 + ADDSUB %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps 8 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + ADDSUB %xmm7, %xmm9 + movaps 12 * SIZE(BO), %xmm7 + ADDSUB %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -4 * SIZE(AO), %xmm1 + + addps %xmm2, %xmm10 + movaps 16 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + ADDSUB %xmm4, %xmm11 + movaps 20 * SIZE(BO), %xmm4 + ADDSUB %xmm5, %xmm15 + movaps %xmm7, %xmm5 + subq $-32 * SIZE, AO + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps 24 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + ADDSUB %xmm7, %xmm9 + movaps 28 * SIZE(BO), %xmm7 + ADDSUB %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -32 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -28 * SIZE(AO), %xmm1 + + subq $-64 * SIZE, BO + subq $1, %rax + BRANCH + jg .L12 + ALIGN_4 + +.L15: + prefetcht2 -16 * SIZE(BB) + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + addps %xmm2, %xmm10 + movaps -32 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + ADDSUB %xmm4, %xmm11 + movaps -28 * SIZE(BO), %xmm4 + ADDSUB %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps -24 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + addq $8 * SIZE, AO + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + ADDSUB %xmm7, %xmm9 + movaps -20 * SIZE(BO), %xmm7 + ADDSUB %xmm5, %xmm13 + addq $16 * SIZE, BO + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -32 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -28 * SIZE(AO), %xmm1 + + subq $1, %rax + jg .L16 + ALIGN_4 + +.L18: + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + + addps %xmm2, %xmm10 + addps %xmm3, %xmm14 + ADDSUB %xmm4, %xmm11 + ADDSUB %xmm5, %xmm15 + +#if !defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm2 + movhps 6 * SIZE(CO1), %xmm2 + + movsd 0 * SIZE(CO2), %xmm1 + movhps 2 * SIZE(CO2), %xmm1 + movsd 4 * SIZE(CO2), %xmm3 + movhps 6 * SIZE(CO2), %xmm3 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + shufps $0xb1, %xmm13, %xmm13 + shufps $0xb1, %xmm15, %xmm15 + + addsubps %xmm9, %xmm8 + addsubps %xmm11, %xmm10 + addsubps %xmm13, %xmm12 + addsubps %xmm15, %xmm14 + + movaps %xmm8, %xmm9 + movaps %xmm10, %xmm11 + movaps %xmm12, %xmm13 + movaps %xmm14, %xmm15 + + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm10, %xmm10 + shufps $0xb1, %xmm12, %xmm12 + shufps $0xb1, %xmm14, %xmm14 +#else + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm10, %xmm10 + shufps $0xb1, %xmm12, %xmm12 + shufps $0xb1, %xmm14, %xmm14 + + addsubps %xmm8, %xmm9 + addsubps %xmm10, %xmm11 + addsubps %xmm12, %xmm13 + addsubps %xmm14, %xmm15 + + movaps %xmm9, %xmm8 + movaps %xmm11, %xmm10 + movaps %xmm13, %xmm12 + movaps %xmm15, %xmm14 + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + shufps $0xb1, %xmm13, %xmm13 + shufps $0xb1, %xmm15, %xmm15 +#endif + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + mulps %xmm6, %xmm11 + mulps %xmm7, %xmm10 + + mulps %xmm6, %xmm13 + mulps %xmm7, %xmm12 + mulps %xmm6, %xmm15 + mulps %xmm7, %xmm14 + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + addps %xmm13, %xmm12 + addps %xmm15, %xmm14 + +#if !defined(TRMMKERNEL) && !defined(BETAZERO) + addps %xmm0, %xmm8 + addps %xmm1, %xmm10 + addps %xmm2, %xmm12 + addps %xmm3, %xmm14 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + movsd %xmm14, 4 * SIZE(CO2) + movhps %xmm14, 6 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + movaps -24 * SIZE(BO), %xmm4 + movaps -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + addps %xmm4, %xmm10 + ADDSUB %xmm5, %xmm11 + + movaps -28 * SIZE(AO), %xmm0 + movaps -16 * SIZE(BO), %xmm2 + movaps -12 * SIZE(BO), %xmm3 + movaps -8 * SIZE(BO), %xmm4 + movaps -4 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + addps %xmm4, %xmm10 + ADDSUB %xmm5, %xmm11 + + movaps -24 * SIZE(AO), %xmm0 + movaps 0 * SIZE(BO), %xmm2 + movaps 4 * SIZE(BO), %xmm3 + movaps 8 * SIZE(BO), %xmm4 + movaps 12 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + addps %xmm4, %xmm10 + ADDSUB %xmm5, %xmm11 + + movaps -20 * SIZE(AO), %xmm0 + movaps 16 * SIZE(BO), %xmm2 + movaps 20 * SIZE(BO), %xmm3 + movaps 24 * SIZE(BO), %xmm4 + movaps 28 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + addps %xmm4, %xmm10 + ADDSUB %xmm5, %xmm11 + + subq $-16 * SIZE, AO + subq $-64 * SIZE, BO + subq $1, %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + movaps -24 * SIZE(BO), %xmm4 + movaps -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + addps %xmm4, %xmm10 + ADDSUB %xmm5, %xmm11 + + addq $ 4 * SIZE, AO + addq $16 * SIZE, BO + subq $1, %rax + jg .L26 + ALIGN_4 + +.L28: + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + + addsubps %xmm9, %xmm8 + addsubps %xmm11, %xmm10 + + movaps %xmm8, %xmm9 + movaps %xmm10, %xmm11 + + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm10, %xmm10 +#else + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm10, %xmm10 + + addsubps %xmm8, %xmm9 + addsubps %xmm10, %xmm11 + + movaps %xmm9, %xmm8 + movaps %xmm11, %xmm10 + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 +#endif + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + mulps %xmm6, %xmm11 + mulps %xmm7, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 + + addps %xmm0, %xmm8 + addps %xmm2, %xmm10 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm2 + movsd -28 * SIZE(BO), %xmm3 + movsd -24 * SIZE(BO), %xmm4 + movsd -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + addps %xmm4, %xmm10 + ADDSUB %xmm5, %xmm11 + + movsd -30 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + movsd -12 * SIZE(BO), %xmm3 + movsd -8 * SIZE(BO), %xmm4 + movsd -4 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + addps %xmm4, %xmm10 + ADDSUB %xmm5, %xmm11 + + movsd -28 * SIZE(AO), %xmm0 + movsd 0 * SIZE(BO), %xmm2 + movsd 4 * SIZE(BO), %xmm3 + movsd 8 * SIZE(BO), %xmm4 + movsd 12 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + addps %xmm4, %xmm10 + ADDSUB %xmm5, %xmm11 + + movsd -26 * SIZE(AO), %xmm0 + movsd 16 * SIZE(BO), %xmm2 + movsd 20 * SIZE(BO), %xmm3 + movsd 24 * SIZE(BO), %xmm4 + movsd 28 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + addps %xmm4, %xmm10 + ADDSUB %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-64 * SIZE, BO + subq $1, %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + movsd -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm2 + movsd -28 * SIZE(BO), %xmm3 + movsd -24 * SIZE(BO), %xmm4 + movsd -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + addps %xmm4, %xmm10 + ADDSUB %xmm5, %xmm11 + + addq $ 2 * SIZE, AO + addq $16 * SIZE, BO + subq $1, %rax + jg .L36 + ALIGN_4 + +.L38: + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + + addsubps %xmm9, %xmm8 + addsubps %xmm11, %xmm10 + + movaps %xmm8, %xmm9 + movaps %xmm10, %xmm11 + + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm10, %xmm10 +#else + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm10, %xmm10 + + addsubps %xmm8, %xmm9 + addsubps %xmm10, %xmm11 + + movaps %xmm9, %xmm8 + movaps %xmm11, %xmm10 + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 +#endif + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + mulps %xmm6, %xmm11 + mulps %xmm7, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + + addps %xmm0, %xmm8 + addps %xmm2, %xmm10 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leaq (C, LDC, 2), C # c += 2 * ldc + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $1, N + je .L999 + ALIGN_4 + +.L41: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L43 + ALIGN_4 + +.L42: + movss -32 * SIZE(B), %xmm8 + movss -31 * SIZE(B), %xmm9 + movss -30 * SIZE(B), %xmm10 + movss -29 * SIZE(B), %xmm11 + movss -28 * SIZE(B), %xmm12 + movss -27 * SIZE(B), %xmm13 + movss -26 * SIZE(B), %xmm14 + movss -25 * SIZE(B), %xmm15 + + shufps $0, %xmm8, %xmm8 + shufps $0, %xmm9, %xmm9 + shufps $0, %xmm10, %xmm10 + shufps $0, %xmm11, %xmm11 + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + shufps $0, %xmm14, %xmm14 + shufps $0, %xmm15, %xmm15 + + movaps %xmm8, 0 * SIZE(BO) + movaps %xmm9, 4 * SIZE(BO) + movaps %xmm10, 8 * SIZE(BO) + movaps %xmm11, 12 * SIZE(BO) + movaps %xmm12, 16 * SIZE(BO) + movaps %xmm13, 20 * SIZE(BO) + movaps %xmm14, 24 * SIZE(BO) + movaps %xmm15, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + subq $1, %rax + jne .L42 + ALIGN_4 + +.L43: + movq K, %rax + andq $3, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L44: + movss -32 * SIZE(B), %xmm8 + movss -31 * SIZE(B), %xmm9 + + shufps $0, %xmm8, %xmm8 + shufps $0, %xmm9, %xmm9 + + movaps %xmm8, 0 * SIZE(BO) + movaps %xmm9, 4 * SIZE(BO) + + addq $2 * SIZE, B + addq $8 * SIZE, BO + subq $1, %rax + jne .L44 + ALIGN_4 + +.L50: + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + prefetcht0 3 * SIZE(CO1) + pxor %xmm9, %xmm9 + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + + movaps -32 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -28 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + ADDSUB %xmm4, %xmm9 + ADDSUB %xmm5, %xmm13 + + movaps -24 * SIZE(AO), %xmm0 + movaps -20 * SIZE(AO), %xmm1 + + movaps -24 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -20 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + ADDSUB %xmm4, %xmm9 + ADDSUB %xmm5, %xmm13 + + movaps -16 * SIZE(AO), %xmm0 + movaps -12 * SIZE(AO), %xmm1 + + movaps -16 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -12 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + ADDSUB %xmm4, %xmm9 + ADDSUB %xmm5, %xmm13 + + movaps -8 * SIZE(AO), %xmm0 + movaps -4 * SIZE(AO), %xmm1 + + movaps -8 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -4 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + ADDSUB %xmm4, %xmm9 + ADDSUB %xmm5, %xmm13 + + subq $-32 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + + movaps -32 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -28 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + ADDSUB %xmm4, %xmm9 + ADDSUB %xmm5, %xmm13 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L56 + ALIGN_4 + +.L58: + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm13, %xmm13 + + addsubps %xmm9, %xmm8 + addsubps %xmm13, %xmm12 + + movaps %xmm8, %xmm9 + movaps %xmm12, %xmm13 + + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm12, %xmm12 +#else + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm12, %xmm12 + + addsubps %xmm8, %xmm9 + addsubps %xmm12, %xmm13 + + movaps %xmm9, %xmm8 + movaps %xmm13, %xmm12 + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm13, %xmm13 +#endif + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + mulps %xmm6, %xmm13 + mulps %xmm7, %xmm12 + + addps %xmm9, %xmm8 + addps %xmm13, %xmm12 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm12 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + movaps -32 * SIZE(BO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + movaps -24 * SIZE(BO), %xmm4 + movaps -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + addps %xmm4, %xmm10 + ADDSUB %xmm5, %xmm11 + + movaps -24 * SIZE(AO), %xmm0 + movaps -20 * SIZE(AO), %xmm1 + movaps -16 * SIZE(BO), %xmm2 + movaps -12 * SIZE(BO), %xmm3 + movaps -8 * SIZE(BO), %xmm4 + movaps -4 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + addps %xmm4, %xmm10 + ADDSUB %xmm5, %xmm11 + + subq $-16 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L66 + ALIGN_4 + +.L68: + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0xb1, %xmm9, %xmm9 + addsubps %xmm9, %xmm8 + movaps %xmm8, %xmm9 + shufps $0xb1, %xmm8, %xmm8 +#else + shufps $0xb1, %xmm8, %xmm8 + addsubps %xmm8, %xmm9 + movaps %xmm9, %xmm8 + shufps $0xb1, %xmm9, %xmm9 +#endif + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + addps %xmm9, %xmm8 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + + addps %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L70: + testq $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -32 * SIZE(AO), %xmm0 + movsd -30 * SIZE(AO), %xmm1 + movsd -32 * SIZE(BO), %xmm2 + movsd -28 * SIZE(BO), %xmm3 + movsd -24 * SIZE(BO), %xmm4 + movsd -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + addps %xmm4, %xmm10 + ADDSUB %xmm5, %xmm11 + + movsd -28 * SIZE(AO), %xmm0 + movsd -26 * SIZE(AO), %xmm1 + movsd -16 * SIZE(BO), %xmm2 + movsd -12 * SIZE(BO), %xmm3 + movsd -8 * SIZE(BO), %xmm4 + movsd -4 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + addps %xmm4, %xmm10 + ADDSUB %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L78 + ALIGN_4 + +.L76: + movsd -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm2 + movsd -28 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L76 + ALIGN_4 + +.L78: + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0xb1, %xmm9, %xmm9 + addsubps %xmm9, %xmm8 + movaps %xmm8, %xmm9 + shufps $0xb1, %xmm8, %xmm8 +#else + shufps $0xb1, %xmm8, %xmm8 + addsubps %xmm8, %xmm9 + movaps %xmm9, %xmm8 + shufps $0xb1, %xmm9, %xmm9 +#endif + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + addps %xmm9, %xmm8 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm0 + + addps %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %r15, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_4x2_penryn.S b/kernel/x86_64/zgemm_kernel_4x2_penryn.S new file mode 100644 index 0000000..241148d --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_4x2_penryn.S @@ -0,0 +1,1794 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#define PREA %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define J 64(%rsp) +#define OFFSET 72(%rsp) +#define KK 80(%rsp) +#define KKK 88(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define ALPHA_R 224(%rsp) +#define ALPHA_I 232(%rsp) +#define J 240(%rsp) +#define OFFSET 248(%rsp) +#define KK 256(%rsp) +#define KKK 264(%rsp) + +#endif + +#define PREFETCHSIZE (8 * 17 + 4) +#define PREFETCH prefetcht0 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 addps +#define ADD2 addps +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 addps +#define ADD2 addps +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 addps +#define ADD2 addps +#else +#define ADD1 addps +#define ADD2 subps +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + movaps %xmm3, %xmm0 + movss OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + +#endif + + unpcklps %xmm0, %xmm0 + unpcklps %xmm1, %xmm1 + + movlps %xmm0, ALPHA_R + movlps %xmm1, ALPHA_I + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + salq $ZBASE_SHIFT, LDC + +#ifdef TRMMKERNEL + movq %r11, OFFSET +#ifndef LEFT + negq %r11 +#endif + movq %r11, KK +#endif + + movq N, J + sarq $1, J + NOBRANCH + jle .L40 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + + movq K, %rax + salq $ZBASE_SHIFT + 1, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + pxor %xmm3, %xmm3 + movaps -28 * SIZE(AO), %xmm1 + pxor %xmm4, %xmm4 + movaps -32 * SIZE(BO), %xmm2 + + pxor %xmm5, %xmm5 + prefetcht0 -32 * SIZE(BB) + pxor %xmm6, %xmm6 + + prefetcht2 7 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + movapd %xmm4, %xmm10 + movapd %xmm4, %xmm11 + + prefetcht2 7 * SIZE(CO2) + movapd %xmm4, %xmm12 + movapd %xmm4, %xmm13 + movapd %xmm4, %xmm14 + movapd %xmm4, %xmm15 + + subq $-24 * SIZE, BB + + leaq (PREFETCHSIZE + 0) * SIZE(AO), PREA + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH -32 * SIZE(PREA) + ADD1 %xmm6, %xmm10 + ADD1 %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -20 * SIZE(AO), %xmm1 + + ADD1 %xmm6, %xmm10 + ADD1 %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -12 * SIZE(AO), %xmm1 + + ADD1 %xmm6, %xmm10 + ADD1 %xmm3, %xmm14 + PREFETCH -16 * SIZE(PREA) + movaps %xmm2, %xmm3 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + movaps -20 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -4 * SIZE(AO), %xmm1 + + ADD1 %xmm6, %xmm10 + ADD1 %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + movaps -16 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps 0 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps 4 * SIZE(AO), %xmm1 + + ADD1 %xmm6, %xmm10 + ADD1 %xmm3, %xmm14 + PREFETCH 0 * SIZE(PREA) + movaps %xmm2, %xmm3 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps 8 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps 12 * SIZE(AO), %xmm1 + + ADD1 %xmm6, %xmm10 + ADD1 %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps 16 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps 20 * SIZE(AO), %xmm1 + + ADD1 %xmm6, %xmm10 + ADD1 %xmm3, %xmm14 + PREFETCH 16 * SIZE(PREA) + movaps %xmm2, %xmm3 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + movaps -4 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps 24 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps 28 * SIZE(AO), %xmm1 + + ADD1 %xmm6, %xmm10 + ADD1 %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + subq $-64 * SIZE, AO + movaps 0 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movaps %xmm6, %xmm3 + subq $-32 * SIZE, BO + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -32 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -28 * SIZE(AO), %xmm1 + + subq $-64 * SIZE, PREA + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: + prefetcht0 -16 * SIZE(BB) + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + ADD1 %xmm6, %xmm10 + ADD1 %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -20 * SIZE(AO), %xmm1 + + addq $8 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: + ADD1 %xmm6, %xmm10 + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm10 + pxor %xmm0, %xmm12 + pxor %xmm0, %xmm14 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + pshufd $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 + pxor %xmm0, %xmm13 + pxor %xmm0, %xmm15 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 + pxor %xmm0, %xmm13 + pxor %xmm0, %xmm15 +#endif + + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + haddps %xmm13, %xmm12 + haddps %xmm15, %xmm14 + + shufps $0xd8, %xmm8, %xmm8 + shufps $0xd8, %xmm10, %xmm10 + shufps $0xd8, %xmm12, %xmm12 + shufps $0xd8, %xmm14, %xmm14 + + movaps %xmm8, %xmm9 + shufps $0xe4, %xmm10, %xmm8 + shufps $0xe4, %xmm9, %xmm10 + + movaps %xmm12, %xmm13 + shufps $0xe4, %xmm14, %xmm12 + shufps $0xe4, %xmm13, %xmm14 + + pshufd $0xb1, %xmm8, %xmm9 + pshufd $0xb1, %xmm10, %xmm11 + pshufd $0xb1, %xmm12, %xmm13 + pshufd $0xb1, %xmm14, %xmm15 + + mulps %xmm2, %xmm8 + mulps %xmm3, %xmm9 + mulps %xmm2, %xmm12 + mulps %xmm3, %xmm13 + + mulps %xmm2, %xmm10 + mulps %xmm3, %xmm11 + mulps %xmm2, %xmm14 + mulps %xmm3, %xmm15 + + addsubps %xmm9, %xmm8 + addsubps %xmm11, %xmm10 + addsubps %xmm13, %xmm12 + addsubps %xmm15, %xmm14 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 + movsd 4 * SIZE(CO2), %xmm3 + movhps 6 * SIZE(CO2), %xmm3 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm12 + addps %xmm2, %xmm10 + addps %xmm3, %xmm14 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + movsd %xmm14, 4 * SIZE(CO2) + movhps %xmm14, 6 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + jle .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + pxor %xmm4, %xmm4 + pxor %xmm6, %xmm6 + movaps -32 * SIZE(BO), %xmm2 + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + ADD1 %xmm6, %xmm10 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + ADD2 %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + ADD1 %xmm6, %xmm10 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + ADD2 %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + ADD1 %xmm6, %xmm10 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -20 * SIZE(BO), %xmm2 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + ADD2 %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + ADD1 %xmm6, %xmm10 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -16 * SIZE(BO), %xmm2 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + subq $-16 * SIZE, AO + ADD2 %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -32 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + ADD1 %xmm6, %xmm10 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + ADD2 %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: + ADD1 %xmm6, %xmm10 + ADD2 %xmm4, %xmm11 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm10 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + pshufd $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 +#endif + + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + + shufps $0xd8, %xmm8, %xmm8 + shufps $0xd8, %xmm10, %xmm10 + + movaps %xmm8, %xmm9 + shufps $0xe4, %xmm10, %xmm8 + shufps $0xe4, %xmm9, %xmm10 + + pshufd $0xb1, %xmm8, %xmm9 + pshufd $0xb1, %xmm10, %xmm11 + + mulps %xmm2, %xmm8 + mulps %xmm3, %xmm9 + + mulps %xmm2, %xmm10 + mulps %xmm3, %xmm11 + + addsubps %xmm9, %xmm8 + addsubps %xmm11, %xmm10 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 + + addps %xmm0, %xmm8 + addps %xmm2, %xmm10 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L30: + testq $1, M + BRANCH + jle .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movsd -32 * SIZE(AO), %xmm0 + pxor %xmm4, %xmm4 + pxor %xmm6, %xmm6 + movaps -32 * SIZE(BO), %xmm2 + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + ADD1 %xmm6, %xmm10 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + ADD2 %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movsd -30 * SIZE(AO), %xmm0 + + ADD1 %xmm6, %xmm10 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + ADD2 %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movsd -28 * SIZE(AO), %xmm0 + + ADD1 %xmm6, %xmm10 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -20 * SIZE(BO), %xmm2 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + ADD2 %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movsd -26 * SIZE(AO), %xmm0 + + ADD1 %xmm6, %xmm10 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -16 * SIZE(BO), %xmm2 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + subq $-8 * SIZE, AO + ADD2 %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movsd -32 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + ADD1 %xmm6, %xmm10 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + ADD2 %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movsd -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: + ADD1 %xmm6, %xmm10 + ADD2 %xmm4, %xmm11 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm10 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + pshufd $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 +#endif + + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + + shufps $0xd8, %xmm8, %xmm8 + shufps $0xd8, %xmm10, %xmm10 + + movaps %xmm8, %xmm9 + shufps $0xe4, %xmm10, %xmm8 + shufps $0xe4, %xmm9, %xmm10 + + pshufd $0xb1, %xmm8, %xmm9 + pshufd $0xb1, %xmm10, %xmm11 + + mulps %xmm2, %xmm8 + mulps %xmm3, %xmm9 + + mulps %xmm2, %xmm10 + mulps %xmm3, %xmm11 + + addsubps %xmm9, %xmm8 + addsubps %xmm11, %xmm10 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + + addps %xmm0, %xmm8 + addps %xmm2, %xmm10 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + leaq (C, LDC, 2), C + movq BO, B + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L40: + testq $1, N + BRANCH + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + movq A, AO + + movq M, I + sarq $2, I + NOBRANCH + jle .L50 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + pxor %xmm3, %xmm3 + movaps -28 * SIZE(AO), %xmm1 + pxor %xmm4, %xmm4 + movaps -32 * SIZE(BO), %xmm2 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + + prefetcht0 7 * SIZE(CO1) + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm12 + pshufd $0x00, %xmm2, %xmm4 + mulps %xmm1, %xmm4 + + addps %xmm5, %xmm9 + pshufd $0x55, %xmm2, %xmm5 + mulps %xmm0, %xmm5 + movaps -24 * SIZE(AO), %xmm0 + addps %xmm6, %xmm13 + pshufd $0x55, %xmm2, %xmm6 + mulps %xmm1, %xmm6 + movaps -20 * SIZE(AO), %xmm1 + + addps %xmm3, %xmm8 + pshufd $0xaa, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm12 + pshufd $0xaa, %xmm2, %xmm4 + mulps %xmm1, %xmm4 + + addps %xmm5, %xmm9 + pshufd $0xff, %xmm2, %xmm5 + mulps %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + addps %xmm6, %xmm13 + pshufd $0xff, %xmm2, %xmm6 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm6 + movaps -12 * SIZE(AO), %xmm1 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm12 + pshufd $0x00, %xmm2, %xmm4 + mulps %xmm1, %xmm4 + + addps %xmm5, %xmm9 + pshufd $0x55, %xmm2, %xmm5 + mulps %xmm0, %xmm5 + movaps -8 * SIZE(AO), %xmm0 + addps %xmm6, %xmm13 + pshufd $0x55, %xmm2, %xmm6 + mulps %xmm1, %xmm6 + movaps -4 * SIZE(AO), %xmm1 + + addps %xmm3, %xmm8 + pshufd $0xaa, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm12 + pshufd $0xaa, %xmm2, %xmm4 + mulps %xmm1, %xmm4 + + addps %xmm5, %xmm9 + pshufd $0xff, %xmm2, %xmm5 + mulps %xmm0, %xmm5 + movaps 0 * SIZE(AO), %xmm0 + addps %xmm6, %xmm13 + pshufd $0xff, %xmm2, %xmm6 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm6 + movaps 4 * SIZE(AO), %xmm1 + + subq $-32 * SIZE, AO + subq $ -8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm12 + pshufd $0x00, %xmm2, %xmm4 + mulps %xmm1, %xmm4 + + addps %xmm5, %xmm9 + pshufd $0x55, %xmm2, %xmm5 + mulps %xmm0, %xmm5 + movaps -24 * SIZE(AO), %xmm0 + addps %xmm6, %xmm13 + pshufd $0x55, %xmm2, %xmm6 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm6 + movaps -20 * SIZE(AO), %xmm1 + + addq $8 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_3 + +.L48: + addps %xmm3, %xmm8 + addps %xmm4, %xmm12 + addps %xmm5, %xmm9 + addps %xmm6, %xmm13 + + pshufd $0xb1, %xmm9, %xmm9 + movddup ALPHA_R, %xmm2 + pshufd $0xb1, %xmm13, %xmm13 + movddup ALPHA_I, %xmm3 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm13 + + subps %xmm9, %xmm8 + subps %xmm13, %xmm12 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm13 + + addps %xmm9, %xmm8 + addps %xmm13, %xmm12 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm12 + + addps %xmm9, %xmm8 + addps %xmm13, %xmm12 +#else + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm12 + + subps %xmm9, %xmm8 + subps %xmm13, %xmm12 +#endif + + pshufd $0xb1, %xmm8, %xmm9 + pshufd $0xb1, %xmm12, %xmm13 + + mulps %xmm2, %xmm8 + mulps %xmm3, %xmm9 + mulps %xmm2, %xmm12 + mulps %xmm3, %xmm13 + + addsubps %xmm9, %xmm8 + addsubps %xmm13, %xmm12 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm12 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + decq I # i -- + BRANCH + jg .L41 + ALIGN_4 + +.L50: + testq $2, M + BRANCH + jle .L60 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + pxor %xmm3, %xmm3 + pxor %xmm4, %xmm4 + movaps -32 * SIZE(BO), %xmm2 + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm3, %xmm8 + pshufd $0xaa, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0xff, %xmm2, %xmm4 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm3, %xmm8 + pshufd $0xaa, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0xff, %xmm2, %xmm4 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L52 + ALIGN_3 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_3 + +.L58: + addps %xmm3, %xmm8 + movddup ALPHA_R, %xmm2 + addps %xmm4, %xmm9 + movddup ALPHA_I, %xmm3 + + pshufd $0xb1, %xmm9, %xmm9 + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + pxor %xmm0, %xmm9 + + subps %xmm9, %xmm8 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + pxor %xmm0, %xmm9 + + addps %xmm9, %xmm8 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + pxor %xmm0, %xmm8 + + addps %xmm9, %xmm8 +#else + pxor %xmm0, %xmm8 + + subps %xmm9, %xmm8 +#endif + + pshufd $0xb1, %xmm8, %xmm9 + + mulps %xmm2, %xmm8 + mulps %xmm3, %xmm9 + + addsubps %xmm9, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + + addps %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + + movsd -32 * SIZE(AO), %xmm0 + pxor %xmm3, %xmm3 + pxor %xmm4, %xmm4 + movaps -32 * SIZE(BO), %xmm2 + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movsd -30 * SIZE(AO), %xmm0 + + addps %xmm3, %xmm8 + pshufd $0xaa, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0xff, %xmm2, %xmm4 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm4 + movsd -28 * SIZE(AO), %xmm0 + + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movsd -26 * SIZE(AO), %xmm0 + + addps %xmm3, %xmm8 + pshufd $0xaa, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0xff, %xmm2, %xmm4 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm4 + movsd -24 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L62 + ALIGN_3 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm4 + movsd -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_3 + +.L68: + addps %xmm3, %xmm8 + movddup ALPHA_R, %xmm2 + addps %xmm4, %xmm9 + movddup ALPHA_I, %xmm3 + + pshufd $0xb1, %xmm9, %xmm9 + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + pxor %xmm0, %xmm9 + + subps %xmm9, %xmm8 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + pxor %xmm0, %xmm9 + + addps %xmm9, %xmm8 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + pxor %xmm0, %xmm8 + + addps %xmm9, %xmm8 +#else + pxor %xmm0, %xmm8 + + subps %xmm9, %xmm8 +#endif + + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm2, %xmm8 + mulps %xmm3, %xmm9 + addsubps %xmm9, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + addps %xmm0, %xmm8 +#endif + movsd %xmm8, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_4x2_sse.S b/kernel/x86_64/zgemm_kernel_4x2_sse.S new file mode 100644 index 0000000..04dbf1a --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_4x2_sse.S @@ -0,0 +1,2293 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi + +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define POSINV 0(%rsp) +#define ALPHA_R 16(%rsp) +#define ALPHA_I 32(%rsp) +#define J 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER 256(%rsp) + +#ifdef OPTERON +#define movsd movlps +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 5 + 8) +#endif + +#if defined(PENTIUM4) || defined(GENERIC) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE 160 +#endif + +#define RPREFETCHSIZE (8 * 7 + 4) +#define WPREFETCHSIZE (8 * 8 + 4) + +#ifndef GENERIC +#define KERNEL1(xx) \ + mulps %xmm0, %xmm1 ;\ + addps %xmm1, %xmm8 ;\ + movaps -32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm0, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps -28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm0, %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ + mulps -20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ + addps %xmm5, %xmm10 ;\ + movaps -24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm0, %xmm11 ;\ + movaps -16 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 + +#define KERNEL2(xx) \ + mulps %xmm2, %xmm1 ;\ + addps %xmm1, %xmm12 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm2, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm2, %xmm5 ;\ + mulps -20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ + addps %xmm5, %xmm14 ;\ + movaps -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm2, %xmm15 ;\ + movaps -12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 + +#define KERNEL3(xx) \ + mulps %xmm4, %xmm7 ;\ + addps %xmm7, %xmm8 ;\ + movaps -16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulps %xmm4, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm4, %xmm5 ;\ + mulps -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ + addps %xmm5, %xmm10 ;\ + movaps -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm4, %xmm11 ;\ + movaps -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 + +#define KERNEL4(xx) \ + mulps %xmm6, %xmm7 ;\ + addps %xmm7, %xmm12 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulps %xmm6, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm6, %xmm5 ;\ + mulps -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ + addps %xmm5, %xmm14 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ + addps %xmm6, %xmm15 ;\ + movaps -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 + +#define KERNEL5(xx) \ + mulps %xmm0, %xmm1 ;\ + addps %xmm1, %xmm8 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm0, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm0, %xmm5 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ + addps %xmm5, %xmm10 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm0, %xmm11 ;\ + movaps 0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 + +#define KERNEL6(xx) \ + mulps %xmm2, %xmm1 ;\ + addps %xmm1, %xmm12 ;\ + movaps 32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm2, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm2, %xmm5 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ + addps %xmm5, %xmm14 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm2, %xmm15 ;\ + movaps 4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 + +#define KERNEL7(xx) \ + mulps %xmm4, %xmm7 ;\ + addps %xmm7, %xmm8 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulps %xmm4, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm4, %xmm5 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ + addps %xmm5, %xmm10 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm4, %xmm11 ;\ + movaps 8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 + +#define KERNEL8(xx) \ + mulps %xmm6, %xmm7 ;\ + addps %xmm7, %xmm12 ;\ + movaps 48 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulps %xmm6, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 36 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm6, %xmm5 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ + addps %xmm5, %xmm14 ;\ + movaps 40 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm6, %xmm15 ;\ + movaps 12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 + +#else + +#define KERNEL1(xx) \ + mulps %xmm0, %xmm1 ;\ + addps %xmm1, %xmm8 ;\ + movaps -32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulps %xmm0, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps -28 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm0, %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + mulps -20 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ + addps %xmm5, %xmm10 ;\ + movaps -24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm0, %xmm11 ;\ + movaps -16 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 + +#define KERNEL2(xx) \ + mulps %xmm2, %xmm1 ;\ + addps %xmm1, %xmm12 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulps %xmm2, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm2, %xmm5 ;\ + mulps -20 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ + addps %xmm5, %xmm14 ;\ + movaps -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm2, %xmm15 ;\ + movaps -12 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 ;\ + +#define KERNEL3(xx) \ + mulps %xmm4, %xmm7 ;\ + addps %xmm7, %xmm8 ;\ + movaps -16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulps %xmm4, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm4, %xmm5 ;\ + mulps -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ + addps %xmm5, %xmm10 ;\ + movaps -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm4, %xmm11 ;\ + movaps -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 + +#define KERNEL4(xx) \ + mulps %xmm6, %xmm7 ;\ + addps %xmm7, %xmm12 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulps %xmm6, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm6, %xmm5 ;\ + mulps -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ + addps %xmm5, %xmm14 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm6, %xmm15 ;\ + movaps -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 + +#define KERNEL5(xx) \ + mulps %xmm0, %xmm1 ;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addps %xmm1, %xmm8 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulps %xmm0, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm0, %xmm5 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ + addps %xmm5, %xmm10 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm0, %xmm11 ;\ + movaps 0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 + +#define KERNEL6(xx) \ + mulps %xmm2, %xmm1 ;\ + addps %xmm1, %xmm12 ;\ + movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulps %xmm2, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm2, %xmm5 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ + addps %xmm5, %xmm14 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm2, %xmm15 ;\ + movaps 4 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 + +#define KERNEL7(xx) \ + mulps %xmm4, %xmm7 ;\ + addps %xmm7, %xmm8 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulps %xmm4, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm4, %xmm5 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ + addps %xmm5, %xmm10 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm4, %xmm11 ;\ + movaps 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 + +#define KERNEL8(xx) \ + mulps %xmm6, %xmm7 ;\ + addps %xmm7, %xmm12 ;\ + movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulps %xmm6, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm6, %xmm5 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ + addps %xmm5, %xmm14 ;\ + movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm6, %xmm15 ;\ + movaps 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq 72(%rsp), LDC +#ifdef TRMMKERNEL + movsd 80(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, %rbx # save old stack + subq $256 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + pxor %xmm7, %xmm7 + cmpeqps %xmm7, %xmm7 + pslld $31, %xmm7 # Generate mask + pxor %xmm10, %xmm10 + + shufps $0, %xmm0, %xmm0 + movaps %xmm0, 0 + ALPHA_R + + movss %xmm1, 4 + ALPHA_I + movss %xmm1, 12 + ALPHA_I + xorps %xmm7, %xmm1 + movss %xmm1, 0 + ALPHA_I + movss %xmm1, 8 + ALPHA_I + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + movss %xmm7, 0 + POSINV + movss %xmm10, 4 + POSINV + movss %xmm7, 8 + POSINV + movss %xmm10,12 + POSINV +#else + movss %xmm10, 0 + POSINV + movss %xmm7, 4 + POSINV + movss %xmm10, 8 + POSINV + movss %xmm7, 12 + POSINV +#endif + + addq $32 * SIZE, A + +#ifdef TRMMKERNEL + movsd %xmm12, OFFSET + movsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + salq $ZBASE_SHIFT, LDC + movq N, J + sarq $1, J # j = (n >> 2) + jle .L40 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + movaps POSINV, %xmm7 + + movq K, %rax + sarq $2, %rax + jle .L03 + + addq %rax, %rax + ALIGN_4 + +.L02: + PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) + + movss 0 * SIZE(B), %xmm8 + movss 1 * SIZE(B), %xmm9 + movss 2 * SIZE(B), %xmm10 + movss 3 * SIZE(B), %xmm11 + movss 4 * SIZE(B), %xmm12 + movss 5 * SIZE(B), %xmm13 + movss 6 * SIZE(B), %xmm14 + movss 7 * SIZE(B), %xmm15 + + PREFETCHW (WPREFETCHSIZE + 0) * SIZE(BO) + + shufps $0, %xmm8, %xmm8 + shufps $0, %xmm9, %xmm9 + shufps $0, %xmm10, %xmm10 + shufps $0, %xmm11, %xmm11 + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + shufps $0, %xmm14, %xmm14 + shufps $0, %xmm15, %xmm15 + + PREFETCHW (WPREFETCHSIZE + 16) * SIZE(BO) + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm9 + xorps %xmm7, %xmm11 + xorps %xmm7, %xmm13 + xorps %xmm7, %xmm15 +#else + xorps %xmm7, %xmm8 + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm12 + xorps %xmm7, %xmm14 +#endif + + movaps %xmm8, 0 * SIZE(BO) + movaps %xmm9, 4 * SIZE(BO) + movaps %xmm10, 8 * SIZE(BO) + movaps %xmm11, 12 * SIZE(BO) + movaps %xmm12, 16 * SIZE(BO) + movaps %xmm13, 20 * SIZE(BO) + movaps %xmm14, 24 * SIZE(BO) + movaps %xmm15, 28 * SIZE(BO) + + addq $32 * SIZE, BO + addq $ 8 * SIZE, B + decq %rax + jne .L02 + ALIGN_4 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movss 0 * SIZE(B), %xmm8 + movss 1 * SIZE(B), %xmm9 + movss 2 * SIZE(B), %xmm10 + movss 3 * SIZE(B), %xmm11 + + shufps $0, %xmm8, %xmm8 + shufps $0, %xmm9, %xmm9 + shufps $0, %xmm10, %xmm10 + shufps $0, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm9 + xorps %xmm7, %xmm11 +#else + xorps %xmm7, %xmm8 + xorps %xmm7, %xmm10 +#endif + + movaps %xmm8, 0 * SIZE(BO) + movaps %xmm9, 4 * SIZE(BO) + movaps %xmm10, 8 * SIZE(BO) + movaps %xmm11, 12 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + leaq (RPREFETCHSIZE + 0) * SIZE(B), BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm1 + pxor %xmm8, %xmm8 + movaps -28 * SIZE(AO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movaps -24 * SIZE(AO), %xmm4 + movaps -24 * SIZE(BO), %xmm5 + pxor %xmm10, %xmm10 + movaps -20 * SIZE(AO), %xmm6 + movaps -16 * SIZE(BO), %xmm7 + pxor %xmm11, %xmm11 + + PREFETCHW 7 * SIZE(CO1) + pxor %xmm12, %xmm12 + PREFETCHW 7 * SIZE(CO2) + pxor %xmm13, %xmm13 + PREFETCH -32 * SIZE(BB) + pxor %xmm14, %xmm14 + PREFETCH -16 * SIZE(BB) + pxor %xmm15, %xmm15 + subq $-16 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif +#ifndef GENERIC + andq $-8, %rax + + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + BRANCH + jl .L12 + ALIGN_3 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + testq $4, %rax + je .L16 + xorq %rax, %rax + ALIGN_3 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addq $64 * SIZE, BO + addq $32 * SIZE, AO + ALIGN_3 +#else + sarq $2, %rax + NOBRANCH + jle .L16 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addq $ 64 * SIZE, BO + subq $-32 * SIZE, AO + decq %rax + BRANCH + jg .L12 +#endif + + +.L16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + ALIGN_4 + +.L17: + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO, %rax, 8), %xmm0 + addps %xmm1, %xmm10 + movaps -32 * SIZE(BO, %rax, 8), %xmm1 + addps %xmm0, %xmm11 + movaps -24 * SIZE(AO, %rax, 4), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm12 + movaps -28 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm13 + movaps -24 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm2, %xmm1 + mulps -20 * SIZE(BO, %rax, 8), %xmm2 + addps %xmm1, %xmm14 + movaps -16 * SIZE(BO, %rax, 8), %xmm1 + addps %xmm2, %xmm15 + movaps -20 * SIZE(AO, %rax, 4), %xmm2 + + addq $SIZE * 2, %rax + jl .L17 + ALIGN_4 + +.L18: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm2 + movhps 6 * SIZE(CO1), %xmm2 + + movsd 0 * SIZE(CO2), %xmm1 + movhps 2 * SIZE(CO2), %xmm1 + movsd 4 * SIZE(CO2), %xmm3 + movhps 6 * SIZE(CO2), %xmm3 +#endif + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + shufps $0xb1, %xmm13, %xmm13 + shufps $0xb1, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm9, %xmm8 + subps %xmm11, %xmm10 + subps %xmm13, %xmm12 + subps %xmm15, %xmm14 +#else + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + addps %xmm13, %xmm12 + addps %xmm15, %xmm14 +#endif + + movaps %xmm8, %xmm9 + movaps %xmm10, %xmm11 + movaps %xmm12, %xmm13 + movaps %xmm14, %xmm15 + + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm10, %xmm10 + shufps $0xb1, %xmm12, %xmm12 + shufps $0xb1, %xmm14, %xmm14 + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + mulps %xmm6, %xmm11 + mulps %xmm7, %xmm10 + + mulps %xmm6, %xmm13 + mulps %xmm7, %xmm12 + mulps %xmm6, %xmm15 + mulps %xmm7, %xmm14 + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + addps %xmm13, %xmm12 + addps %xmm15, %xmm14 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 + addps %xmm1, %xmm10 + addps %xmm2, %xmm12 + addps %xmm3, %xmm14 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + movsd %xmm14, 4 * SIZE(CO2) + movhps %xmm14, 6 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + movaps -16 * SIZE(AO), %xmm2 + movaps 0 * SIZE(AO), %xmm4 + movaps 16 * SIZE(AO), %xmm6 + + movaps -32 * SIZE(BO), %xmm1 + movaps -16 * SIZE(BO), %xmm3 + movaps 0 * SIZE(BO), %xmm5 + movaps 16 * SIZE(BO), %xmm7 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps 32 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movaps -28 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm8 + movaps -12 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + movaps -8 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm3 + mulps -4 * SIZE(BO), %xmm0 + addps %xmm3, %xmm10 + movaps 48 * SIZE(BO), %xmm3 + addps %xmm0, %xmm11 + movaps -24 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm5 + addps %xmm5, %xmm8 + movaps 4 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm5 + addps %xmm5, %xmm9 + movaps 8 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm5 + mulps 12 * SIZE(BO), %xmm0 + addps %xmm5, %xmm10 + movaps 64 * SIZE(BO), %xmm5 + addps %xmm0, %xmm11 + movaps -20 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm7 + addps %xmm7, %xmm8 + movaps 20 * SIZE(BO), %xmm7 + mulps %xmm0, %xmm7 + addps %xmm7, %xmm9 + movaps 24 * SIZE(BO), %xmm7 + mulps %xmm0, %xmm7 + mulps 28 * SIZE(BO), %xmm0 + addps %xmm7, %xmm10 + movaps 80 * SIZE(BO), %xmm7 + addps %xmm0, %xmm11 + movaps 0 * SIZE(AO), %xmm0 + + mulps %xmm2, %xmm1 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addps %xmm1, %xmm8 + movaps 36 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm9 + movaps 40 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm1 + mulps 44 * SIZE(BO), %xmm2 + addps %xmm1, %xmm10 + movaps 96 * SIZE(BO), %xmm1 + addps %xmm2, %xmm11 + movaps -12 * SIZE(AO), %xmm2 + + mulps %xmm2, %xmm3 + addps %xmm3, %xmm8 + movaps 52 * SIZE(BO), %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm9 + movaps 56 * SIZE(BO), %xmm3 + mulps %xmm2, %xmm3 + mulps 60 * SIZE(BO), %xmm2 + addps %xmm3, %xmm10 + movaps 112 * SIZE(BO), %xmm3 + addps %xmm2, %xmm11 + movaps -8 * SIZE(AO), %xmm2 + + mulps %xmm2, %xmm5 + addps %xmm5, %xmm8 + movaps 68 * SIZE(BO), %xmm5 + mulps %xmm2, %xmm5 + addps %xmm5, %xmm9 + movaps 72 * SIZE(BO), %xmm5 + mulps %xmm2, %xmm5 + mulps 76 * SIZE(BO), %xmm2 + addps %xmm5, %xmm10 + movaps 128 * SIZE(BO), %xmm5 + addps %xmm2, %xmm11 + movaps -4 * SIZE(AO), %xmm2 + + mulps %xmm2, %xmm7 + addps %xmm7, %xmm8 + movaps 84 * SIZE(BO), %xmm7 + mulps %xmm2, %xmm7 + addps %xmm7, %xmm9 + movaps 88 * SIZE(BO), %xmm7 + mulps %xmm2, %xmm7 + mulps 92 * SIZE(BO), %xmm2 + addps %xmm7, %xmm10 + movaps 144 * SIZE(BO), %xmm7 + addps %xmm2, %xmm11 + movaps 16 * SIZE(AO), %xmm2 + + subq $ -32 * SIZE, AO + subq $-128 * SIZE, BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps -16 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movaps -28 * SIZE(AO), %xmm0 + + subq $- 4 * SIZE, AO + subq $-16 * SIZE, BO + decq %rax + jg .L26 + ALIGN_4 + +.L28: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + + movsd 0 * SIZE(CO2), %xmm1 + movhps 2 * SIZE(CO2), %xmm1 +#endif + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm9, %xmm8 + subps %xmm11, %xmm10 +#else + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 +#endif + + movaps %xmm8, %xmm9 + movaps %xmm10, %xmm11 + + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm10, %xmm10 + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + mulps %xmm6, %xmm11 + mulps %xmm7, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 + addps %xmm1, %xmm10 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + movaps -24 * SIZE(AO), %xmm2 + + movaps -32 * SIZE(BO), %xmm1 + movaps -16 * SIZE(BO), %xmm3 + movaps 0 * SIZE(BO), %xmm5 + movaps 16 * SIZE(BO), %xmm7 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm10 + movaps -20 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm1, %xmm11 + movaps 32 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm8 + movaps -12 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + movaps -8 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm10 + movaps -4 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm3 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm3, %xmm11 + movaps 48 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm5 + addps %xmm5, %xmm8 + movaps 4 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm5 + addps %xmm5, %xmm9 + movaps 8 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm5 + addps %xmm5, %xmm10 + movaps 12 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm5 + movsd -26 * SIZE(AO), %xmm0 + addps %xmm5, %xmm11 + movaps 64 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm7 + addps %xmm7, %xmm8 + movaps 20 * SIZE(BO), %xmm7 + mulps %xmm0, %xmm7 + addps %xmm7, %xmm9 + movaps 24 * SIZE(BO), %xmm7 + mulps %xmm0, %xmm7 + addps %xmm7, %xmm10 + movaps 28 * SIZE(BO), %xmm7 + mulps %xmm0, %xmm7 + movsd -16 * SIZE(AO), %xmm0 + addps %xmm7, %xmm11 + movaps 80 * SIZE(BO), %xmm7 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm8 + movaps 36 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm9 + movaps 40 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm10 + movaps 44 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm1 + movsd -22 * SIZE(AO), %xmm2 + addps %xmm1, %xmm11 + movaps 96 * SIZE(BO), %xmm1 + + mulps %xmm2, %xmm3 + addps %xmm3, %xmm8 + movaps 52 * SIZE(BO), %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm9 + movaps 56 * SIZE(BO), %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm10 + movaps 60 * SIZE(BO), %xmm3 + mulps %xmm2, %xmm3 + movsd -20 * SIZE(AO), %xmm2 + addps %xmm3, %xmm11 + movaps 112 * SIZE(BO), %xmm3 + + mulps %xmm2, %xmm5 + addps %xmm5, %xmm8 + movaps 68 * SIZE(BO), %xmm5 + mulps %xmm2, %xmm5 + addps %xmm5, %xmm9 + movaps 72 * SIZE(BO), %xmm5 + mulps %xmm2, %xmm5 + addps %xmm5, %xmm10 + movaps 76 * SIZE(BO), %xmm5 + mulps %xmm2, %xmm5 + movsd -18 * SIZE(AO), %xmm2 + addps %xmm5, %xmm11 + movaps 128 * SIZE(BO), %xmm5 + + mulps %xmm2, %xmm7 + addps %xmm7, %xmm8 + movaps 84 * SIZE(BO), %xmm7 + mulps %xmm2, %xmm7 + addps %xmm7, %xmm9 + movaps 88 * SIZE(BO), %xmm7 + mulps %xmm2, %xmm7 + addps %xmm7, %xmm10 + movaps 92 * SIZE(BO), %xmm7 + mulps %xmm2, %xmm7 + movsd -8 * SIZE(AO), %xmm2 + addps %xmm7, %xmm11 + movaps 144 * SIZE(BO), %xmm7 + + subq $ -16 * SIZE, AO + subq $-128 * SIZE, BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm10 + movaps -20 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm1, %xmm11 + movaps -16 * SIZE(BO), %xmm1 + + subq $ -2 * SIZE, AO + subq $-16 * SIZE, BO + decq %rax + jg .L36 + ALIGN_4 + +.L38: +#ifndef TRMMKERNEL +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(CO1), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(CO2), %xmm1 +#endif + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm9, %xmm8 + subps %xmm11, %xmm10 +#else + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 +#endif + + movaps %xmm8, %xmm9 + movaps %xmm10, %xmm11 + + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm10, %xmm10 + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + mulps %xmm6, %xmm11 + mulps %xmm7, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 + addps %xmm1, %xmm10 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leaq (C, LDC, 2), C # c += 2 * ldc + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $1, N + je .L999 + ALIGN_4 + +.L41: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + movaps POSINV, %xmm7 + + movq K, %rax + sarq $2, %rax + jle .L43 + ALIGN_4 + +.L42: + movss 0 * SIZE(B), %xmm8 + movss 1 * SIZE(B), %xmm9 + movss 2 * SIZE(B), %xmm10 + movss 3 * SIZE(B), %xmm11 + movss 4 * SIZE(B), %xmm12 + movss 5 * SIZE(B), %xmm13 + movss 6 * SIZE(B), %xmm14 + movss 7 * SIZE(B), %xmm15 + + shufps $0, %xmm8, %xmm8 + shufps $0, %xmm9, %xmm9 + shufps $0, %xmm10, %xmm10 + shufps $0, %xmm11, %xmm11 + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + shufps $0, %xmm14, %xmm14 + shufps $0, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm9 + xorps %xmm7, %xmm11 + xorps %xmm7, %xmm13 + xorps %xmm7, %xmm15 +#else + xorps %xmm7, %xmm8 + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm12 + xorps %xmm7, %xmm14 +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + prefetchnta 56 * SIZE(B) +#endif + + movaps %xmm8, 0 * SIZE(BO) + movaps %xmm9, 4 * SIZE(BO) + movaps %xmm10, 8 * SIZE(BO) + movaps %xmm11, 12 * SIZE(BO) + movaps %xmm12, 16 * SIZE(BO) + movaps %xmm13, 20 * SIZE(BO) + movaps %xmm14, 24 * SIZE(BO) + movaps %xmm15, 28 * SIZE(BO) + +#if defined(PENTIUM4) || defined(GENERIC) + PREFETCHW 128 * SIZE(BO) + PREFETCH 112 * SIZE(B) +#endif + + addq $32 * SIZE, BO + addq $ 8 * SIZE, B + decq %rax + jne .L42 + ALIGN_4 + +.L43: + movq K, %rax + andq $3, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L44: + movss 0 * SIZE(B), %xmm8 + movss 1 * SIZE(B), %xmm9 + + shufps $0, %xmm8, %xmm8 + shufps $0, %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm9 +#else + xorps %xmm7, %xmm8 +#endif + + movaps %xmm8, 0 * SIZE(BO) + movaps %xmm9, 4 * SIZE(BO) + + addq $2 * SIZE, B + addq $8 * SIZE, BO + decq %rax + jne .L44 + ALIGN_4 + +.L50: + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movaps -16 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movaps 0 * SIZE(AO), %xmm4 + pxor %xmm10, %xmm10 + movaps 16 * SIZE(AO), %xmm6 + pxor %xmm11, %xmm11 + + movaps -32 * SIZE(BO), %xmm1 + pxor %xmm12, %xmm12 + movaps -16 * SIZE(BO), %xmm3 + pxor %xmm13, %xmm13 + movaps 0 * SIZE(BO), %xmm5 + pxor %xmm14, %xmm14 + movaps 16 * SIZE(BO), %xmm7 + pxor %xmm15, %xmm15 + + PREFETCHW 7 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movaps -28 * SIZE(AO), %xmm0 + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm12 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm0, %xmm13 + movaps -24 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movaps -20 * SIZE(AO), %xmm0 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm12 + movaps 32 * SIZE(BO), %xmm1 + addps %xmm0, %xmm13 + movaps 32 * SIZE(AO), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + + mulps %xmm2, %xmm3 + mulps -12 * SIZE(BO), %xmm2 + addps %xmm3, %xmm8 + movaps -16 * SIZE(BO), %xmm3 + addps %xmm2, %xmm9 + movaps -12 * SIZE(AO), %xmm2 + mulps %xmm2, %xmm3 + mulps -12 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps -8 * SIZE(BO), %xmm3 + addps %xmm2, %xmm13 + movaps -8 * SIZE(AO), %xmm2 + + mulps %xmm2, %xmm3 + mulps -4 * SIZE(BO), %xmm2 + addps %xmm3, %xmm8 + movaps -8 * SIZE(BO), %xmm3 + addps %xmm2, %xmm9 + movaps -4 * SIZE(AO), %xmm2 + mulps %xmm2, %xmm3 + mulps -4 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps 48 * SIZE(BO), %xmm3 + addps %xmm2, %xmm13 + movaps 48 * SIZE(AO), %xmm2 + + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) + + mulps %xmm4, %xmm5 + mulps 4 * SIZE(BO), %xmm4 + addps %xmm5, %xmm8 + movaps 0 * SIZE(BO), %xmm5 + addps %xmm4, %xmm9 + movaps 4 * SIZE(AO), %xmm4 + mulps %xmm4, %xmm5 + mulps 4 * SIZE(BO), %xmm4 + addps %xmm5, %xmm12 + movaps 8 * SIZE(BO), %xmm5 + addps %xmm4, %xmm13 + movaps 8 * SIZE(AO), %xmm4 + + mulps %xmm4, %xmm5 + mulps 12 * SIZE(BO), %xmm4 + addps %xmm5, %xmm8 + movaps 8 * SIZE(BO), %xmm5 + addps %xmm4, %xmm9 + movaps 12 * SIZE(AO), %xmm4 + mulps %xmm4, %xmm5 + mulps 12 * SIZE(BO), %xmm4 + addps %xmm5, %xmm12 + movaps 64 * SIZE(BO), %xmm5 + addps %xmm4, %xmm13 + movaps 64 * SIZE(AO), %xmm4 + + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) + + mulps %xmm6, %xmm7 + mulps 20 * SIZE(BO), %xmm6 + addps %xmm7, %xmm8 + movaps 16 * SIZE(BO), %xmm7 + addps %xmm6, %xmm9 + movaps 20 * SIZE(AO), %xmm6 + mulps %xmm6, %xmm7 + mulps 20 * SIZE(BO), %xmm6 + addps %xmm7, %xmm12 + movaps 24 * SIZE(BO), %xmm7 + addps %xmm6, %xmm13 + movaps 24 * SIZE(AO), %xmm6 + + mulps %xmm6, %xmm7 + mulps 28 * SIZE(BO), %xmm6 + addps %xmm7, %xmm8 + movaps 24 * SIZE(BO), %xmm7 + addps %xmm6, %xmm9 + movaps 28 * SIZE(AO), %xmm6 + mulps %xmm6, %xmm7 + mulps 28 * SIZE(BO), %xmm6 + addps %xmm7, %xmm12 + movaps 80 * SIZE(BO), %xmm7 + addps %xmm6, %xmm13 + movaps 80 * SIZE(AO), %xmm6 + + subq $-64 * SIZE, AO + subq $-64 * SIZE, BO + + decq %rax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movaps -28 * SIZE(AO), %xmm0 + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm12 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm0, %xmm13 + movaps -24 * SIZE(AO), %xmm0 + + addq $ 8 * SIZE, AO + addq $ 8 * SIZE, BO + decq %rax + jg .L56 + ALIGN_4 + +.L58: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm2 + movhps 6 * SIZE(CO1), %xmm2 +#endif + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm9, %xmm8 + subps %xmm13, %xmm12 +#else + addps %xmm9, %xmm8 + addps %xmm13, %xmm12 +#endif + + movaps %xmm8, %xmm9 + movaps %xmm12, %xmm13 + + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm12, %xmm12 + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + mulps %xmm6, %xmm13 + mulps %xmm7, %xmm12 + + addps %xmm9, %xmm8 + addps %xmm13, %xmm12 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 + addps %xmm2, %xmm12 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movaps -16 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + + movaps -32 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movaps -16 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + movaps 0 * SIZE(BO), %xmm5 + movaps 16 * SIZE(BO), %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movaps -28 * SIZE(AO), %xmm0 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps 32 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movaps -24 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm3 + mulps -12 * SIZE(BO), %xmm0 + addps %xmm3, %xmm8 + movaps -8 * SIZE(BO), %xmm3 + addps %xmm0, %xmm9 + movaps -20 * SIZE(AO), %xmm0 + mulps %xmm0, %xmm3 + mulps -4 * SIZE(BO), %xmm0 + addps %xmm3, %xmm10 + movaps 48 * SIZE(BO), %xmm3 + addps %xmm0, %xmm11 + movaps 0 * SIZE(AO), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + + mulps %xmm2, %xmm5 + mulps 4 * SIZE(BO), %xmm2 + addps %xmm5, %xmm8 + movaps 8 * SIZE(BO), %xmm5 + addps %xmm2, %xmm9 + movaps -12 * SIZE(AO), %xmm2 + mulps %xmm2, %xmm5 + mulps 12 * SIZE(BO), %xmm2 + addps %xmm5, %xmm10 + movaps 64 * SIZE(BO), %xmm5 + addps %xmm2, %xmm11 + movaps -8 * SIZE(AO), %xmm2 + + mulps %xmm2, %xmm7 + mulps 20 * SIZE(BO), %xmm2 + addps %xmm7, %xmm8 + movaps 24 * SIZE(BO), %xmm7 + addps %xmm2, %xmm9 + movaps -4 * SIZE(AO), %xmm2 + mulps %xmm2, %xmm7 + mulps 28 * SIZE(BO), %xmm2 + addps %xmm7, %xmm10 + movaps 80 * SIZE(BO), %xmm7 + addps %xmm2, %xmm11 + movaps 16 * SIZE(AO), %xmm2 + + subq $-32 * SIZE, AO + subq $-64 * SIZE, BO + + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L68: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 +#endif + + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + shufps $0xb1, %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm9, %xmm8 +#else + addps %xmm9, %xmm8 +#endif + + movaps %xmm8, %xmm9 + + shufps $0xb1, %xmm8, %xmm8 + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + + addps %xmm9, %xmm8 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L70: + testq $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movaps -24 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + + movaps -32 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movaps -16 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + movaps 0 * SIZE(BO), %xmm5 + movaps 16 * SIZE(BO), %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm1 + addps %xmm1, %xmm10 + movaps -20 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm1, %xmm11 + movaps 32 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm8 + movaps -12 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm3 + movsd -26 * SIZE(AO), %xmm0 + addps %xmm3, %xmm9 + movaps -8 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm10 + movaps -4 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm3 + movsd -16 * SIZE(AO), %xmm0 + addps %xmm3, %xmm11 + movaps 48 * SIZE(BO), %xmm3 + + mulps %xmm2, %xmm5 + addps %xmm5, %xmm8 + movaps 4 * SIZE(BO), %xmm5 + mulps %xmm2, %xmm5 + movsd -22 * SIZE(AO), %xmm2 + addps %xmm5, %xmm9 + movaps 8 * SIZE(BO), %xmm5 + + mulps %xmm2, %xmm5 + addps %xmm5, %xmm10 + movaps 12 * SIZE(BO), %xmm5 + mulps %xmm2, %xmm5 + movsd -20 * SIZE(AO), %xmm2 + addps %xmm5, %xmm11 + movaps 64 * SIZE(BO), %xmm5 + + mulps %xmm2, %xmm7 + addps %xmm7, %xmm8 + movaps 20 * SIZE(BO), %xmm7 + mulps %xmm2, %xmm7 + movsd -18 * SIZE(AO), %xmm2 + addps %xmm7, %xmm9 + movaps 24 * SIZE(BO), %xmm7 + + mulps %xmm2, %xmm7 + addps %xmm7, %xmm10 + movaps 28 * SIZE(BO), %xmm7 + mulps %xmm2, %xmm7 + movsd -8 * SIZE(AO), %xmm2 + addps %xmm7, %xmm11 + movaps 80 * SIZE(BO), %xmm7 + + subq $-16 * SIZE, AO + subq $-64 * SIZE, BO + + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO), %xmm1 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jg .L76 + ALIGN_4 + +.L78: +#ifndef TRMMKERNEL +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(CO1), %xmm0 +#endif + + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + shufps $0xb1, %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm9, %xmm8 +#else + addps %xmm9, %xmm8 +#endif + + movaps %xmm8, %xmm9 + + shufps $0xb1, %xmm8, %xmm8 + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + + addps %xmm9, %xmm8 +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 +#endif + movlps %xmm8, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %rbx, %rsp + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_4x2_sse3.S b/kernel/x86_64/zgemm_kernel_4x2_sse3.S new file mode 100644 index 0000000..ecc3a6f --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_4x2_sse3.S @@ -0,0 +1,2101 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %r12 +#define BO %r13 +#define CO1 %r14 +#define CO2 %r15 +#define BB %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define ALPHA_R 0(%rsp) +#define ALPHA_I 16(%rsp) +#define J 32(%rsp) +#define OFFSET 40(%rsp) +#define KK 48(%rsp) +#define KKK 56(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 320 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADDSUB addps +#else +#define ADDSUB subps +#endif + +#define KERNEL1(address) \ + mulps %xmm8, %xmm9; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO); \ + addps %xmm9, %xmm0; \ + movshdup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + ADDSUB %xmm9, %xmm1; \ + movsldup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm2; \ + movshdup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + movaps 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8; \ + ADDSUB %xmm9, %xmm3; \ + movsldup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL2(address) \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm4; \ + movshdup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + ADDSUB %xmm9, %xmm5; \ + movsldup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm6; \ + movshdup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + movaps 8 * SIZE + (address) * 2 * SIZE(AO), %xmm8; \ + ADDSUB %xmm9, %xmm7; \ + movsldup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL3(address) \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm0; \ + movshdup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + ADDSUB %xmm9, %xmm1; \ + movsldup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm2; \ + movshdup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + movaps 12 * SIZE + (address) * 2 * SIZE(AO), %xmm8; \ + ADDSUB %xmm9, %xmm3; \ + movsldup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL4(address) \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm4; \ + movshdup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + ADDSUB %xmm9, %xmm5; \ + movsldup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm6; \ + movshdup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + movaps 64 * SIZE + (address) * 2 * SIZE(AO), %xmm8; \ + ADDSUB %xmm9, %xmm7; \ + movsldup 64 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL5(address) \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm0; \ + movshdup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + ADDSUB %xmm11, %xmm1; \ + movsldup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm2; \ + movshdup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + movaps 20 * SIZE + (address) * 2 * SIZE(AO), %xmm10; \ + ADDSUB %xmm11, %xmm3; \ + movsldup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL6(address) \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm4; \ + movshdup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + ADDSUB %xmm11, %xmm5; \ + movsldup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm6; \ + movshdup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + movaps 24 * SIZE + (address) * 2 * SIZE(AO), %xmm10; \ + ADDSUB %xmm11, %xmm7; \ + movsldup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL7(address) \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm0; \ + movshdup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + ADDSUB %xmm11, %xmm1; \ + movsldup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm2; \ + movshdup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + movaps 28 * SIZE + (address) * 2 * SIZE(AO), %xmm10; \ + ADDSUB %xmm11, %xmm3; \ + movsldup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL8(address) \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm4; \ + movshdup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + ADDSUB %xmm11, %xmm5; \ + movsldup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm6; \ + movshdup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + movaps 80 * SIZE + (address) * 2 * SIZE(AO), %xmm10; \ + ADDSUB %xmm11, %xmm7; \ + movsldup 80 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL9(address) \ + mulps %xmm12, %xmm13; \ + PREFETCH (PREFETCHSIZE + 32) * SIZE + (address) * 2 * SIZE(AO); \ + addps %xmm13, %xmm0; \ + movshdup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + ADDSUB %xmm13, %xmm1; \ + movsldup 36 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm2; \ + movshdup 36 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + movaps 36 * SIZE + (address) * 2 * SIZE(AO), %xmm12; \ + ADDSUB %xmm13, %xmm3; \ + movsldup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL10(address) \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm4; \ + movshdup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + ADDSUB %xmm13, %xmm5; \ + movsldup 36 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm6; \ + movshdup 36 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + movaps 40 * SIZE + (address) * 2 * SIZE(AO), %xmm12; \ + ADDSUB %xmm13, %xmm7; \ + movsldup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL11(address) \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm0; \ + movshdup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + ADDSUB %xmm13, %xmm1; \ + movsldup 44 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm2; \ + movshdup 44 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + movaps 44 * SIZE + (address) * 2 * SIZE(AO), %xmm12; \ + ADDSUB %xmm13, %xmm3; \ + movsldup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL12(address) \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm4; \ + movshdup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + ADDSUB %xmm13, %xmm5; \ + movsldup 44 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm6; \ + movshdup 44 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + movaps 96 * SIZE + (address) * 2 * SIZE(AO), %xmm12; \ + ADDSUB %xmm13, %xmm7; \ + movsldup 96 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL13(address) \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm0; \ + movshdup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + ADDSUB %xmm15, %xmm1; \ + movsldup 52 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm2; \ + movshdup 52 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + movaps 52 * SIZE + (address) * 2 * SIZE(AO), %xmm14; \ + ADDSUB %xmm15, %xmm3; \ + movsldup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL14(address) \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm4; \ + movshdup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + ADDSUB %xmm15, %xmm5; \ + movsldup 52 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm6; \ + movshdup 52 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + movaps 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14; \ + ADDSUB %xmm15, %xmm7; \ + movsldup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL15(address) \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm0; \ + movshdup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + ADDSUB %xmm15, %xmm1; \ + movsldup 60 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm2; \ + movshdup 60 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + movaps 60 * SIZE + (address) * 2 * SIZE(AO), %xmm14; \ + ADDSUB %xmm15, %xmm3; \ + movsldup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL16(address) \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm4; \ + movshdup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + ADDSUB %xmm15, %xmm5; \ + movsldup 60 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm6; \ + movshdup 60 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + movaps 112 * SIZE + (address) * 2 * SIZE(AO), %xmm14; \ + ADDSUB %xmm15, %xmm7; \ + movsldup 112 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq 72(%rsp), LDC +#ifdef TRMMKERNEL + movsd 80(%rsp), %xmm4 +#endif + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + pxor %xmm15, %xmm15 + cmpeqps %xmm15, %xmm15 + pslld $31, %xmm15 # Generate mask + pxor %xmm2, %xmm2 + + shufps $0, %xmm0, %xmm0 + movaps %xmm0, 0 + ALPHA_R + + movss %xmm1, 4 + ALPHA_I + movss %xmm1, 12 + ALPHA_I + xorps %xmm15, %xmm1 + movss %xmm1, 0 + ALPHA_I + movss %xmm1, 8 + ALPHA_I + +#ifdef TRMMKERNEL + movsd %xmm4, OFFSET + movsd %xmm4, KK +#ifndef LEFT + negq KK +#endif +#endif + + salq $ZBASE_SHIFT, LDC + movq N, J + sarq $1, J # j = (n >> 2) + jle .L40 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + movddup 0 * SIZE(B), %xmm0 + movddup 2 * SIZE(B), %xmm1 + movddup 4 * SIZE(B), %xmm2 + movddup 6 * SIZE(B), %xmm3 + movddup 8 * SIZE(B), %xmm4 + movddup 10 * SIZE(B), %xmm5 + movddup 12 * SIZE(B), %xmm6 + movddup 14 * SIZE(B), %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + prefetcht1 128 * SIZE(BO) + prefetcht0 112 * SIZE(B) + + addq $16 * SIZE, B + addq $32 * SIZE, BO + decq %rax + jne .L02 + ALIGN_4 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movddup 0 * SIZE(B), %xmm0 + movddup 2 * SIZE(B), %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + addq $4 * SIZE, B + addq $8 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + leaq 112 * SIZE(B), BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: + prefetcht0 0 * SIZE(BB) + subq $-8 * SIZE, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movaps 16 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movaps 32 * SIZE(AO), %xmm12 + pxor %xmm2, %xmm2 + movaps 48 * SIZE(AO), %xmm14 + pxor %xmm3, %xmm3 + + movsldup 0 * SIZE(BO), %xmm9 + pxor %xmm4, %xmm4 + movsldup 16 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + movsldup 32 * SIZE(BO), %xmm13 + pxor %xmm6, %xmm6 + movsldup 48 * SIZE(BO), %xmm15 + pxor %xmm7, %xmm7 + + prefetchnta 8 * SIZE(CO1) + prefetchnta 8 * SIZE(CO2) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif +#if 1 + andq $-8, %rax + salq $4, %rax + je .L15 + +.L1X: + KERNEL1 (32 * 0) + KERNEL2 (32 * 0) + KERNEL3 (32 * 0) + KERNEL4 (32 * 0) + KERNEL5 (32 * 0) + KERNEL6 (32 * 0) + KERNEL7 (32 * 0) + KERNEL8 (32 * 0) + KERNEL9 (32 * 0) + KERNEL10(32 * 0) + KERNEL11(32 * 0) + KERNEL12(32 * 0) + KERNEL13(32 * 0) + KERNEL14(32 * 0) + KERNEL15(32 * 0) + KERNEL16(32 * 0) + cmpq $128 * 1, %rax + jle .L12 + KERNEL1 (32 * 1) + KERNEL2 (32 * 1) + KERNEL3 (32 * 1) + KERNEL4 (32 * 1) + KERNEL5 (32 * 1) + KERNEL6 (32 * 1) + KERNEL7 (32 * 1) + KERNEL8 (32 * 1) + KERNEL9 (32 * 1) + KERNEL10(32 * 1) + KERNEL11(32 * 1) + KERNEL12(32 * 1) + KERNEL13(32 * 1) + KERNEL14(32 * 1) + KERNEL15(32 * 1) + KERNEL16(32 * 1) + cmpq $128 * 2, %rax + jle .L12 + KERNEL1 (32 * 2) + KERNEL2 (32 * 2) + KERNEL3 (32 * 2) + KERNEL4 (32 * 2) + KERNEL5 (32 * 2) + KERNEL6 (32 * 2) + KERNEL7 (32 * 2) + KERNEL8 (32 * 2) + KERNEL9 (32 * 2) + KERNEL10(32 * 2) + KERNEL11(32 * 2) + KERNEL12(32 * 2) + KERNEL13(32 * 2) + KERNEL14(32 * 2) + KERNEL15(32 * 2) + KERNEL16(32 * 2) + cmpq $128 * 3, %rax + jle .L12 + KERNEL1 (32 * 3) + KERNEL2 (32 * 3) + KERNEL3 (32 * 3) + KERNEL4 (32 * 3) + KERNEL5 (32 * 3) + KERNEL6 (32 * 3) + KERNEL7 (32 * 3) + KERNEL8 (32 * 3) + KERNEL9 (32 * 3) + KERNEL10(32 * 3) + KERNEL11(32 * 3) + KERNEL12(32 * 3) + KERNEL13(32 * 3) + KERNEL14(32 * 3) + KERNEL15(32 * 3) + KERNEL16(32 * 3) + cmpq $128 * 4, %rax + jle .L12 + KERNEL1 (32 * 4) + KERNEL2 (32 * 4) + KERNEL3 (32 * 4) + KERNEL4 (32 * 4) + KERNEL5 (32 * 4) + KERNEL6 (32 * 4) + KERNEL7 (32 * 4) + KERNEL8 (32 * 4) + KERNEL9 (32 * 4) + KERNEL10(32 * 4) + KERNEL11(32 * 4) + KERNEL12(32 * 4) + KERNEL13(32 * 4) + KERNEL14(32 * 4) + KERNEL15(32 * 4) + KERNEL16(32 * 4) + cmpq $128 * 5, %rax + jle .L12 + KERNEL1 (32 * 5) + KERNEL2 (32 * 5) + KERNEL3 (32 * 5) + KERNEL4 (32 * 5) + KERNEL5 (32 * 5) + KERNEL6 (32 * 5) + KERNEL7 (32 * 5) + KERNEL8 (32 * 5) + KERNEL9 (32 * 5) + KERNEL10(32 * 5) + KERNEL11(32 * 5) + KERNEL12(32 * 5) + KERNEL13(32 * 5) + KERNEL14(32 * 5) + KERNEL15(32 * 5) + KERNEL16(32 * 5) + cmpq $128 * 6, %rax + jle .L12 + KERNEL1 (32 * 6) + KERNEL2 (32 * 6) + KERNEL3 (32 * 6) + KERNEL4 (32 * 6) + KERNEL5 (32 * 6) + KERNEL6 (32 * 6) + KERNEL7 (32 * 6) + KERNEL8 (32 * 6) + KERNEL9 (32 * 6) + KERNEL10(32 * 6) + KERNEL11(32 * 6) + KERNEL12(32 * 6) + KERNEL13(32 * 6) + KERNEL14(32 * 6) + KERNEL15(32 * 6) + KERNEL16(32 * 6) + cmpq $128 * 7, %rax + jle .L12 + KERNEL1 (32 * 7) + KERNEL2 (32 * 7) + KERNEL3 (32 * 7) + KERNEL4 (32 * 7) + KERNEL5 (32 * 7) + KERNEL6 (32 * 7) + KERNEL7 (32 * 7) + KERNEL8 (32 * 7) + KERNEL9 (32 * 7) + KERNEL10(32 * 7) + KERNEL11(32 * 7) + KERNEL12(32 * 7) + KERNEL13(32 * 7) + KERNEL14(32 * 7) + KERNEL15(32 * 7) + KERNEL16(32 * 7) + + addq $64 * 8 * SIZE, AO + addq $64 * 8 * SIZE, BO + subq $128 * 8, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 2), BO # * 64 +#else + sarq $3, %rax + je .L15 + ALIGN_4 + +.L12: + KERNEL1 (32 * 0) + KERNEL2 (32 * 0) + KERNEL3 (32 * 0) + KERNEL4 (32 * 0) + KERNEL5 (32 * 0) + KERNEL6 (32 * 0) + KERNEL7 (32 * 0) + KERNEL8 (32 * 0) + KERNEL9 (32 * 0) + KERNEL10(32 * 0) + KERNEL11(32 * 0) + KERNEL12(32 * 0) + KERNEL13(32 * 0) + KERNEL14(32 * 0) + KERNEL15(32 * 0) + KERNEL16(32 * 0) + + addq $64 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L12 +#endif + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm14 + movaps ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + ADDSUB %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm3 + movsldup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm4 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + ADDSUB %xmm9, %xmm5 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm6 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm7 + movsldup 8 * SIZE(BO), %xmm9 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jg .L16 + ALIGN_4 + +.L18: +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + + addsubps %xmm1, %xmm0 + addsubps %xmm3, %xmm2 + addsubps %xmm5, %xmm4 + addsubps %xmm7, %xmm6 + + movaps %xmm0, %xmm1 + movaps %xmm2, %xmm3 + movaps %xmm4, %xmm5 + movaps %xmm6, %xmm7 + + shufps $0xb1, %xmm0, %xmm0 + shufps $0xb1, %xmm2, %xmm2 + shufps $0xb1, %xmm4, %xmm4 + shufps $0xb1, %xmm6, %xmm6 +#else + shufps $0xb1, %xmm0, %xmm0 + shufps $0xb1, %xmm2, %xmm2 + shufps $0xb1, %xmm4, %xmm4 + shufps $0xb1, %xmm6, %xmm6 + + addsubps %xmm0, %xmm1 + addsubps %xmm2, %xmm3 + addsubps %xmm4, %xmm5 + addsubps %xmm6, %xmm7 + + movaps %xmm1, %xmm0 + movaps %xmm3, %xmm2 + movaps %xmm5, %xmm4 + movaps %xmm7, %xmm6 + + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 +#endif + + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm0 + mulps %xmm14, %xmm3 + mulps %xmm15, %xmm2 + + mulps %xmm14, %xmm5 + mulps %xmm15, %xmm4 + mulps %xmm14, %xmm7 + mulps %xmm15, %xmm6 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + shufps $0xe4, %xmm8, %xmm8 + shufps $0xe4, %xmm9, %xmm9 + shufps $0xe4, %xmm10, %xmm10 + shufps $0xe4, %xmm11, %xmm11 + + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm10 + movhps 6 * SIZE(CO1), %xmm10 + + movsd 0 * SIZE(CO2), %xmm9 + movhps 2 * SIZE(CO2), %xmm9 + movsd 4 * SIZE(CO2), %xmm11 + movhps 6 * SIZE(CO2), %xmm11 + + addps %xmm8, %xmm0 + addps %xmm9, %xmm2 + addps %xmm10, %xmm4 + addps %xmm11, %xmm6 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm4, 4 * SIZE(CO1) + movhps %xmm4, 6 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + movsd %xmm6, 4 * SIZE(CO2) + movhps %xmm6, 6 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movaps 16 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + + movsldup 0 * SIZE(BO), %xmm9 + pxor %xmm2, %xmm2 + movsldup 16 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + movsldup 32 * SIZE(BO), %xmm13 + movsldup 48 * SIZE(BO), %xmm15 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + ADDSUB %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm3 + movsldup 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + ADDSUB %xmm9, %xmm1 + movsldup 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm3 + movsldup 64 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movshdup 16 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + ADDSUB %xmm11, %xmm1 + movsldup 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movshdup 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movaps 12 * SIZE(AO), %xmm8 + ADDSUB %xmm11, %xmm3 + movsldup 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movshdup 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + ADDSUB %xmm11, %xmm1 + movsldup 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movshdup 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movaps 32 * SIZE(AO), %xmm8 + ADDSUB %xmm11, %xmm3 + movsldup 80 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movshdup 32 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + ADDSUB %xmm13, %xmm1 + movsldup 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movshdup 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movaps 20 * SIZE(AO), %xmm10 + ADDSUB %xmm13, %xmm3 + movsldup 40 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movshdup 40 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + ADDSUB %xmm13, %xmm1 + movsldup 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movshdup 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movaps 24 * SIZE(AO), %xmm10 + ADDSUB %xmm13, %xmm3 + movsldup 96 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movshdup 48 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + ADDSUB %xmm15, %xmm1 + movsldup 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movshdup 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movaps 28 * SIZE(AO), %xmm10 + ADDSUB %xmm15, %xmm3 + movsldup 56 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movshdup 56 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + ADDSUB %xmm15, %xmm1 + movsldup 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movshdup 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movaps 48 * SIZE(AO), %xmm10 + ADDSUB %xmm15, %xmm3 + movsldup 112 * SIZE(BO), %xmm15 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm14 + movaps ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + ADDSUB %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm3 + movsldup 8 * SIZE(BO), %xmm9 + + addq $ 4 * SIZE, AO + addq $ 8 * SIZE, BO + decq %rax + jg .L26 + ALIGN_4 + +.L28: +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + + addsubps %xmm1, %xmm0 + addsubps %xmm3, %xmm2 + + movaps %xmm0, %xmm1 + movaps %xmm2, %xmm3 + + shufps $0xb1, %xmm0, %xmm0 + shufps $0xb1, %xmm2, %xmm2 +#else + shufps $0xb1, %xmm0, %xmm0 + shufps $0xb1, %xmm2, %xmm2 + + addsubps %xmm0, %xmm1 + addsubps %xmm2, %xmm3 + + movaps %xmm1, %xmm0 + movaps %xmm3, %xmm2 + + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 +#endif + + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm0 + mulps %xmm14, %xmm3 + mulps %xmm15, %xmm2 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + shufps $0xe4, %xmm8, %xmm8 + shufps $0xe4, %xmm10, %xmm10 + + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 0 * SIZE(CO2), %xmm10 + movhps 2 * SIZE(CO2), %xmm10 + + addps %xmm8, %xmm0 + addps %xmm10, %xmm2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 8 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm2, %xmm2 + movsd 16 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + movsd 32 * SIZE(BO), %xmm13 + movsd 48 * SIZE(BO), %xmm15 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + shufps $0x50, %xmm9, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movsd 12 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 64 * SIZE(BO), %xmm9 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movsd 20 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm8, %xmm11 + movddup 6 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movsd 24 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movsd 28 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm8, %xmm11 + movddup 16 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movsd 80 * SIZE(BO), %xmm11 + shufps $0x50, %xmm13, %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movsd 36 * SIZE(BO), %xmm13 + shufps $0x50, %xmm13, %xmm13 + mulps %xmm10, %xmm13 + movddup 10 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movsd 40 * SIZE(BO), %xmm13 + shufps $0x50, %xmm13, %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movsd 44 * SIZE(BO), %xmm13 + shufps $0x50, %xmm13, %xmm13 + mulps %xmm10, %xmm13 + movddup 12 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movsd 96 * SIZE(BO), %xmm13 + shufps $0x50, %xmm15, %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movsd 52 * SIZE(BO), %xmm15 + shufps $0x50, %xmm15, %xmm15 + mulps %xmm10, %xmm15 + movddup 14 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movsd 56 * SIZE(BO), %xmm15 + shufps $0x50, %xmm15, %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movsd 60 * SIZE(BO), %xmm15 + shufps $0x50, %xmm15, %xmm15 + mulps %xmm10, %xmm15 + movddup 24 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movsd 112 * SIZE(BO), %xmm15 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm14 + movaps ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jg .L36 + ALIGN_4 + +.L38: + movaps %xmm0, %xmm6 + movlhps %xmm1, %xmm0 + movhlps %xmm6, %xmm1 + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + cmpeqps %xmm7, %xmm7 + pslld $31, %xmm7 + xorps %xmm7, %xmm1 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0xb1, %xmm1, %xmm1 + + addsubps %xmm1, %xmm0 + + movaps %xmm0, %xmm1 + + shufps $0xb1, %xmm0, %xmm0 +#else + shufps $0xb1, %xmm0, %xmm0 + + addsubps %xmm0, %xmm1 + + movaps %xmm1, %xmm0 + + shufps $0xb1, %xmm1, %xmm1 +#endif + + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm0 + + addps %xmm1, %xmm0 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhps 0 * SIZE(CO2), %xmm8 + + addps %xmm8, %xmm0 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leaq (C, LDC, 2), C # c += 2 * ldc + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $1, N + je .L999 + ALIGN_4 + +.L41: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $3, %rax + jle .L43 + ALIGN_4 + +.L42: + movddup 0 * SIZE(B), %xmm0 + movddup 2 * SIZE(B), %xmm1 + movddup 4 * SIZE(B), %xmm2 + movddup 6 * SIZE(B), %xmm3 + movddup 8 * SIZE(B), %xmm4 + movddup 10 * SIZE(B), %xmm5 + movddup 12 * SIZE(B), %xmm6 + movddup 14 * SIZE(B), %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + prefetcht1 128 * SIZE(BO) + prefetcht0 112 * SIZE(B) + + addq $16 * SIZE, B + addq $32 * SIZE, BO + decq %rax + jne .L42 + ALIGN_4 + +.L43: + movq K, %rax + andq $7, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L44: + movddup 0 * SIZE(B), %xmm0 + + movaps %xmm0, 0 * SIZE(BO) + + addq $2 * SIZE, B + addq $4 * SIZE, BO + decq %rax + jne .L44 + ALIGN_4 + +.L50: + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movaps 16 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movaps 32 * SIZE(AO), %xmm12 + pxor %xmm4, %xmm4 + movaps 48 * SIZE(AO), %xmm14 + pxor %xmm5, %xmm5 + + movsldup 0 * SIZE(BO), %xmm9 + movsldup 16 * SIZE(BO), %xmm11 + + prefetchnta 4 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm1 + movsldup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm4 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm5 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 12 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm4 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 64 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm5 + movsldup 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movshdup 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 20 * SIZE(AO), %xmm10 + ADDSUB %xmm9, %xmm1 + movsldup 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movshdup 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 24 * SIZE(AO), %xmm10 + ADDSUB %xmm9, %xmm5 + movsldup 12 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movshdup 12 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 28 * SIZE(AO), %xmm10 + ADDSUB %xmm9, %xmm1 + movsldup 12 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movshdup 12 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 80 * SIZE(AO), %xmm10 + ADDSUB %xmm9, %xmm5 + movsldup 32 * SIZE(BO), %xmm9 + mulps %xmm12, %xmm11 + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) + addps %xmm11, %xmm0 + movshdup 16 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 36 * SIZE(AO), %xmm12 + ADDSUB %xmm11, %xmm1 + movsldup 16 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm4 + movshdup 16 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 40 * SIZE(AO), %xmm12 + ADDSUB %xmm11, %xmm5 + movsldup 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm0 + movshdup 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 44 * SIZE(AO), %xmm12 + ADDSUB %xmm11, %xmm1 + movsldup 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm4 + movshdup 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 96 * SIZE(AO), %xmm12 + ADDSUB %xmm11, %xmm5 + movsldup 24 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm0 + movshdup 24 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 52 * SIZE(AO), %xmm14 + ADDSUB %xmm11, %xmm1 + movsldup 24 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm4 + movshdup 24 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 56 * SIZE(AO), %xmm14 + ADDSUB %xmm11, %xmm5 + movsldup 28 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm0 + movshdup 28 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 60 * SIZE(AO), %xmm14 + ADDSUB %xmm11, %xmm1 + movsldup 28 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm4 + movshdup 28 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 112 * SIZE(AO), %xmm14 + ADDSUB %xmm11, %xmm5 + movsldup 48 * SIZE(BO), %xmm11 + + addq $64 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm14 + movaps ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm1 + movsldup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm4 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm5 + movsldup 4 * SIZE(BO), %xmm9 + + addq $ 8 * SIZE, AO + addq $ 4 * SIZE, BO + decq %rax + jg .L56 + ALIGN_4 + +.L58: +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm5, %xmm5 + + addsubps %xmm1, %xmm0 + addsubps %xmm5, %xmm4 + + movaps %xmm0, %xmm1 + movaps %xmm4, %xmm5 + + shufps $0xb1, %xmm0, %xmm0 + shufps $0xb1, %xmm4, %xmm4 +#else + shufps $0xb1, %xmm0, %xmm0 + shufps $0xb1, %xmm4, %xmm4 + + addsubps %xmm0, %xmm1 + addsubps %xmm4, %xmm5 + + movaps %xmm1, %xmm0 + movaps %xmm5, %xmm4 + + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm5, %xmm5 +#endif + + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm0 + mulps %xmm14, %xmm5 + mulps %xmm15, %xmm4 + + addps %xmm1, %xmm0 + addps %xmm5, %xmm4 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + + addps %xmm8, %xmm0 + addps %xmm9, %xmm4 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm4, 4 * SIZE(CO1) + movhps %xmm4, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsldup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movaps 16 * SIZE(AO), %xmm10 + movsldup 16 * SIZE(BO), %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm1 + movsldup 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 12 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm1 + movsldup 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 32 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm1 + movsldup 32 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movshdup 16 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 20 * SIZE(AO), %xmm10 + ADDSUB %xmm11, %xmm1 + movsldup 20 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movshdup 20 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 24 * SIZE(AO), %xmm10 + ADDSUB %xmm11, %xmm1 + movsldup 24 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movshdup 24 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 28 * SIZE(AO), %xmm10 + ADDSUB %xmm11, %xmm1 + movsldup 28 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movshdup 28 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 48 * SIZE(AO), %xmm10 + ADDSUB %xmm11, %xmm1 + movsldup 48 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm14 + movaps ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L68: +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0xb1, %xmm1, %xmm1 + addsubps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + shufps $0xb1, %xmm0, %xmm0 +#else + shufps $0xb1, %xmm0, %xmm0 + addsubps %xmm0, %xmm1 + movaps %xmm1, %xmm0 + shufps $0xb1, %xmm1, %xmm1 +#endif + + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm0 + addps %xmm1, %xmm0 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + + addps %xmm8, %xmm0 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L70: + testq $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movddup 8 * SIZE(AO), %xmm10 + movsd 16 * SIZE(BO), %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + shufps $0x50, %xmm9, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 6 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 12 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 16 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 32 * SIZE(BO), %xmm9 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 10 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 20 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 24 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 14 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 28 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 24 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 48 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm14 + movaps ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm1, %xmm0 + + movhlps %xmm0, %xmm1 + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + cmpeqps %xmm7, %xmm7 + pslld $31, %xmm7 + xorps %xmm7, %xmm1 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0xb1, %xmm1, %xmm1 + + addsubps %xmm1, %xmm0 + + movaps %xmm0, %xmm1 + + shufps $0xb1, %xmm0, %xmm0 +#else + shufps $0xb1, %xmm0, %xmm0 + + addsubps %xmm0, %xmm1 + + movaps %xmm1, %xmm0 + + shufps $0xb1, %xmm1, %xmm1 +#endif + + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm0 + + addps %xmm1, %xmm0 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + addps %xmm8, %xmm0 +#endif + movsd %xmm0, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %rbx, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_ncopy_1.S b/kernel/x86_64/zgemm_ncopy_1.S new file mode 100644 index 0000000..9f9ae73 --- /dev/null +++ b/kernel/x86_64/zgemm_ncopy_1.S @@ -0,0 +1,203 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define I %r9 +#define J %r10 +#define AO1 %r11 +#define AO2 %r12 +#else + +#define STACKSIZE 256 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 24 + STACKSIZE(%rsp) + +#define B %r10 +#define I %r11 +#define J %r12 +#define AO1 %r13 +#define AO2 %r14 + +#endif + +#define RPREFETCHSIZE 32 +#define WPREFETCHSIZE 48 + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %r14 + pushq %r13 +#endif + pushq %r12 + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + movups %xmm7, 16(%rsp) + movups %xmm8, 32(%rsp) + movups %xmm9, 48(%rsp) + movups %xmm10, 64(%rsp) + movups %xmm11, 80(%rsp) + movups %xmm12, 96(%rsp) + movups %xmm13, 112(%rsp) + movups %xmm14, 128(%rsp) + movups %xmm15, 144(%rsp) + + movq OLD_B, B +#endif + + salq $ZBASE_SHIFT, LDA + + testq N, N + movq N, J + jle .L999 + ALIGN_4 + +.L12: + movq A, AO1 + addq LDA, A + + movq M, I + sarq $2, I + jle .L14 + ALIGN_4 + +.L13: +#ifndef DOUBLE + movsd 0 * SIZE(AO1), %xmm0 + movhps 2 * SIZE(AO1), %xmm0 + movsd 4 * SIZE(AO1), %xmm1 + movhps 6 * SIZE(AO1), %xmm1 + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 4 * SIZE(B) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movsd 2 * SIZE(AO1), %xmm1 + movhpd 3 * SIZE(AO1), %xmm1 + movsd 4 * SIZE(AO1), %xmm2 + movhpd 5 * SIZE(AO1), %xmm2 + movsd 6 * SIZE(AO1), %xmm3 + movhpd 7 * SIZE(AO1), %xmm3 + + prefetcht2 RPREFETCHSIZE * SIZE(AO1) + + movapd %xmm0, 0 * SIZE(B) + movapd %xmm1, 2 * SIZE(B) + movapd %xmm2, 4 * SIZE(B) + movapd %xmm3, 6 * SIZE(B) + + prefetcht2 WPREFETCHSIZE * SIZE(B) +#endif + + addq $8 * SIZE, AO1 + addq $8 * SIZE, B + decq I + jg .L13 + ALIGN_4 + +.L14: + movq M, I + andq $3, I + jle .L16 + ALIGN_4 + +.L15: +#ifndef DOUBLE + movsd 0 * SIZE(AO1), %xmm0 + movsd %xmm0, 0 * SIZE(B) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + + movapd %xmm0, 0 * SIZE(B) +#endif + + addq $2 * SIZE, AO1 + addq $2 * SIZE, B + decq I + jg .L15 + ALIGN_4 + +.L16: + decq J + jg .L12 + ALIGN_4 + +.L999: +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + movups 16(%rsp), %xmm7 + movups 32(%rsp), %xmm8 + movups 48(%rsp), %xmm9 + movups 64(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + movups 96(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups 128(%rsp), %xmm14 + movups 144(%rsp), %xmm15 + + addq $STACKSIZE, %rsp +#endif + + popq %r12 +#ifdef WINDOWS_ABI + popq %r13 + popq %r14 +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_ncopy_2.S b/kernel/x86_64/zgemm_ncopy_2.S new file mode 100644 index 0000000..bf318b7 --- /dev/null +++ b/kernel/x86_64/zgemm_ncopy_2.S @@ -0,0 +1,359 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define I %r9 +#define J %r10 +#define AO1 %r11 +#define AO2 %r12 +#else + +#define STACKSIZE 256 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 24 + STACKSIZE(%rsp) + +#define B %r10 +#define I %r11 +#define J %r12 +#define AO1 %r13 +#define AO2 %r14 + +#endif + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) +#define RPREFETCHSIZE 16 +#define WPREFETCHSIZE 48 +#endif + +#if defined(PENTIUM4) || defined(GENERIC) || defined(NANO) +#define RPREFETCHSIZE 32 +#define WPREFETCHSIZE 80 +#endif + +#ifdef OPTERON +#define RPREFETCHSIZE 32 +#define WPREFETCHSIZE 48 +#define movsd movlpd +#endif + +#if defined(BARCELONA) || defined(SHANGHAI) +#define RPREFETCHSIZE 32 +#define WPREFETCHSIZE 48 +#endif + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %r14 + pushq %r13 +#endif + pushq %r12 + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + movups %xmm7, 16(%rsp) + movups %xmm8, 32(%rsp) + movups %xmm9, 48(%rsp) + movups %xmm10, 64(%rsp) + movups %xmm11, 80(%rsp) + movups %xmm12, 96(%rsp) + movups %xmm13, 112(%rsp) + movups %xmm14, 128(%rsp) + movups %xmm15, 144(%rsp) + + movq OLD_B, B +#endif + + salq $ZBASE_SHIFT, LDA + + movq N, J + sarq $1, J + jle .L20 + ALIGN_4 + +.L12: + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + movq M, I + sarq $2, I + jle .L14 + ALIGN_4 + +.L13: +#ifdef HAVE_3DNOW + prefetchw (WPREFETCHSIZE + 0) * SIZE(B) + prefetchw (WPREFETCHSIZE + 8) * SIZE(B) +#endif + +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + movhps 0 * SIZE(AO2), %xmm0 + movlps 2 * SIZE(AO1), %xmm1 + movhps 2 * SIZE(AO2), %xmm1 + + movlps 4 * SIZE(AO1), %xmm2 + movhps 4 * SIZE(AO2), %xmm2 + movlps 6 * SIZE(AO1), %xmm3 + movhps 6 * SIZE(AO2), %xmm3 + +#if defined(PENTIUM4) || defined(GENERIC) || defined(NANO) + prefetcht0 RPREFETCHSIZE * SIZE(AO1) + prefetcht0 RPREFETCHSIZE * SIZE(AO2) + + prefetcht0 WPREFETCHSIZE * SIZE(B) +#endif + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 4 * SIZE(B) + movaps %xmm2, 8 * SIZE(B) + movaps %xmm3, 12 * SIZE(B) + +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + movhpd 1 * SIZE(AO2), %xmm1 + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) + prefetcht2 RPREFETCHSIZE * SIZE(AO1) +#endif + + movsd 2 * SIZE(AO1), %xmm2 + movhpd 3 * SIZE(AO1), %xmm2 + movsd 2 * SIZE(AO2), %xmm3 + movhpd 3 * SIZE(AO2), %xmm3 + + movsd 4 * SIZE(AO1), %xmm4 + movhpd 5 * SIZE(AO1), %xmm4 + movsd 4 * SIZE(AO2), %xmm5 + movhpd 5 * SIZE(AO2), %xmm5 + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) + prefetcht2 RPREFETCHSIZE * SIZE(AO2) +#endif + + movsd 6 * SIZE(AO1), %xmm6 + movhpd 7 * SIZE(AO1), %xmm6 + movsd 6 * SIZE(AO2), %xmm7 + movhpd 7 * SIZE(AO2), %xmm7 + +#if defined(PENTIUM4) || defined(GENERIC) || defined(NANO) + prefetcht0 RPREFETCHSIZE * SIZE(AO1) + prefetcht0 RPREFETCHSIZE * SIZE(AO2) + + prefetcht0 WPREFETCHSIZE * SIZE(B) +#endif + + movapd %xmm0, 0 * SIZE(B) + movapd %xmm1, 2 * SIZE(B) + movapd %xmm2, 4 * SIZE(B) + movapd %xmm3, 6 * SIZE(B) + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) + prefetcht2 WPREFETCHSIZE * SIZE(B) +#endif + + movapd %xmm4, 8 * SIZE(B) + movapd %xmm5, 10 * SIZE(B) + movapd %xmm6, 12 * SIZE(B) + movapd %xmm7, 14 * SIZE(B) +#endif + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-16 * SIZE, B + decq I + jg .L13 + ALIGN_4 + +.L14: + movq M, I + andq $3, I + jle .L16 + ALIGN_4 + +.L15: +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + movhps 0 * SIZE(AO2), %xmm0 + + movaps %xmm0, 0 * SIZE(B) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + movhpd 1 * SIZE(AO2), %xmm1 + + movapd %xmm0, 0 * SIZE(B) + movapd %xmm1, 2 * SIZE(B) +#endif + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + addq $4 * SIZE, B + decq I + jg .L15 + ALIGN_4 + +.L16: + decq J + jg .L12 + ALIGN_4 + +.L20: + testq $1, N + jle .L999 + + movq A, AO1 + + movq M, I + sarq $2, I + jle .L24 + ALIGN_4 + +.L23: +#ifdef HAVE_3DNOW + prefetchw (WPREFETCHSIZE + 0) * SIZE(B) + prefetchw (WPREFETCHSIZE + 8) * SIZE(B) +#endif + +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + movhps 2 * SIZE(AO1), %xmm0 + movlps 4 * SIZE(AO1), %xmm1 + movhps 6 * SIZE(AO1), %xmm1 + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 4 * SIZE(B) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movsd 2 * SIZE(AO1), %xmm1 + movhpd 3 * SIZE(AO1), %xmm1 + movsd 4 * SIZE(AO1), %xmm2 + movhpd 5 * SIZE(AO1), %xmm2 + movsd 6 * SIZE(AO1), %xmm3 + movhpd 7 * SIZE(AO1), %xmm3 + + movapd %xmm0, 0 * SIZE(B) + movapd %xmm1, 2 * SIZE(B) + movapd %xmm2, 4 * SIZE(B) + movapd %xmm3, 6 * SIZE(B) +#endif + +#if defined(PENTIUM4) || defined(GENERIC) || defined(NANO) + prefetcht0 RPREFETCHSIZE * SIZE(AO1) + prefetcht0 RPREFETCHSIZE * SIZE(AO2) + + prefetcht0 WPREFETCHSIZE * SIZE(B) +#endif + + addq $8 * SIZE, AO1 + addq $8 * SIZE, B + decq I + jg .L23 + ALIGN_4 + +.L24: + movq M, I + andq $3, I + jle .L999 + ALIGN_4 + +.L25: +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + + movlps %xmm0, 0 * SIZE(B) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + + movapd %xmm0, 0 * SIZE(B) +#endif + + addq $2 * SIZE, AO1 + addq $2 * SIZE, B + decq I + jg .L25 + ALIGN_4 + + +.L999: +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + movups 16(%rsp), %xmm7 + movups 32(%rsp), %xmm8 + movups 48(%rsp), %xmm9 + movups 64(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + movups 96(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups 128(%rsp), %xmm14 + movups 144(%rsp), %xmm15 + + addq $STACKSIZE, %rsp +#endif + + popq %r12 +#ifdef WINDOWS_ABI + popq %r13 + popq %r14 +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_tcopy_1.S b/kernel/x86_64/zgemm_tcopy_1.S new file mode 100644 index 0000000..b4348e6 --- /dev/null +++ b/kernel/x86_64/zgemm_tcopy_1.S @@ -0,0 +1,190 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define I %r9 +#define J %r10 +#define AO1 %r11 +#define AO2 %r12 +#else + +#define STACKSIZE 256 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 24 + STACKSIZE(%rsp) + +#define B %r10 +#define I %r11 +#define J %r12 +#define AO1 %r13 +#define AO2 %r14 + +#endif + +#define RPREFETCHSIZE 4 +#define WPREFETCHSIZE 4 + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %r14 + pushq %r13 +#endif + pushq %r12 + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + movups %xmm7, 16(%rsp) + movups %xmm8, 32(%rsp) + movups %xmm9, 48(%rsp) + movups %xmm10, 64(%rsp) + movups %xmm11, 80(%rsp) + movups %xmm12, 96(%rsp) + movups %xmm13, 112(%rsp) + movups %xmm14, 128(%rsp) + movups %xmm15, 144(%rsp) + + movq OLD_B, B +#endif + + salq $ZBASE_SHIFT, LDA + + testq N, N + movq N, J + jle .L999 + ALIGN_4 + +.L12: + movq A, AO1 + addq $2 * SIZE, A + + movq M, I + sarq $1, I + jle .L14 + ALIGN_4 + +.L13: +#ifndef DOUBLE + movsd 0 * SIZE(AO1), %xmm0 + movhps 0 * SIZE(AO1, LDA, 1), %xmm0 + + movaps %xmm0, 0 * SIZE(B) +#else + prefetcht0 RPREFETCHSIZE * SIZE(AO1) + + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + + prefetcht0 RPREFETCHSIZE * SIZE(AO1, LDA) + + movsd 0 * SIZE(AO1, LDA), %xmm1 + movhpd 1 * SIZE(AO1, LDA), %xmm1 + + movapd %xmm0, 0 * SIZE(B) + movapd %xmm1, 2 * SIZE(B) + + prefetcht0 WPREFETCHSIZE * SIZE(B) +#endif + + leaq (AO1, LDA, 2), AO1 + addq $4 * SIZE, B + decq I + jg .L13 + ALIGN_4 + +.L14: + testq $1, M + jle .L16 + +#ifndef DOUBLE + movsd 0 * SIZE(AO1), %xmm0 + movsd %xmm0, 0 * SIZE(B) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + + movapd %xmm0, 0 * SIZE(B) +#endif + addq $2 * SIZE, B + ALIGN_4 + +.L16: + decq J + jg .L12 + ALIGN_4 + +.L999: +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + movups 16(%rsp), %xmm7 + movups 32(%rsp), %xmm8 + movups 48(%rsp), %xmm9 + movups 64(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + movups 96(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups 128(%rsp), %xmm14 + movups 144(%rsp), %xmm15 + + addq $STACKSIZE, %rsp +#endif + + popq %r12 +#ifdef WINDOWS_ABI + popq %r13 + popq %r14 +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_tcopy_2.S b/kernel/x86_64/zgemm_tcopy_2.S new file mode 100644 index 0000000..f83022d --- /dev/null +++ b/kernel/x86_64/zgemm_tcopy_2.S @@ -0,0 +1,432 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RPREFETCHSIZE 32 +#define WPREFETCHSIZE 16 + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define I %r9 +#define J %r10 + +#define AO1 %r11 +#define AO2 %r12 +#define BO1 %r13 +#define M8 %r14 +#define BO %rax + +#else + +#define STACKSIZE 256 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 48 + STACKSIZE(%rsp) + +#define B %r10 + +#define I %r11 +#define J %r12 + +#define AO1 %r13 +#define AO2 %r14 + +#define BO1 %rdi +#define M8 %rsi +#define BO %rax + +#endif + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %rdi + pushq %rsi +#endif + pushq %r14 + pushq %r13 + pushq %r12 + pushq %r11 + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + movups %xmm7, 16(%rsp) + movups %xmm8, 32(%rsp) + movups %xmm9, 48(%rsp) + movups %xmm10, 64(%rsp) + movups %xmm11, 80(%rsp) + movups %xmm12, 96(%rsp) + movups %xmm13, 112(%rsp) + movups %xmm14, 128(%rsp) + movups %xmm15, 144(%rsp) + + movq OLD_B, B +#endif + + movq N, %rax + andq $-2, %rax + imulq M, %rax + + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), BO1 + + salq $ZBASE_SHIFT, LDA + + leaq (, M, SIZE), M8 + + movq M, J + sarq $1, J + jle .L20 + ALIGN_4 + +.L11: + movq A, AO1 + leaq (A, LDA ), AO2 + leaq (A, LDA, 2), A + + movq B, BO + addq $8 * SIZE, B + + movq N, I + sarq $2, I + jle .L13 + ALIGN_4 + +.L12: +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + movhps 2 * SIZE(AO1), %xmm0 + movlps 4 * SIZE(AO1), %xmm1 + movhps 6 * SIZE(AO1), %xmm1 + + movlps 0 * SIZE(AO2), %xmm2 + movhps 2 * SIZE(AO2), %xmm2 + movlps 4 * SIZE(AO2), %xmm3 + movhps 6 * SIZE(AO2), %xmm3 + +#if defined(PENTIUM4) || defined(GENERIC) || defined(NANO) + prefetcht0 RPREFETCHSIZE * SIZE(AO1) + prefetcht0 RPREFETCHSIZE * SIZE(AO2) + prefetcht0 WPREFETCHSIZE * SIZE(BO) +#endif + +#ifdef HAVE_3DNOW + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) +#endif + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm2, 4 * SIZE(BO) + leaq (BO, M8, 4), BO + movaps %xmm1, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movsd 2 * SIZE(AO1), %xmm1 + movhpd 3 * SIZE(AO1), %xmm1 + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) + prefetcht2 RPREFETCHSIZE * SIZE(AO1) +#endif + + movsd 4 * SIZE(AO1), %xmm2 + movhpd 5 * SIZE(AO1), %xmm2 + movsd 6 * SIZE(AO1), %xmm3 + movhpd 7 * SIZE(AO1), %xmm3 + + movsd 0 * SIZE(AO2), %xmm4 + movhpd 1 * SIZE(AO2), %xmm4 + movsd 2 * SIZE(AO2), %xmm5 + movhpd 3 * SIZE(AO2), %xmm5 + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) + prefetcht2 RPREFETCHSIZE * SIZE(AO2) +#endif + + movsd 4 * SIZE(AO2), %xmm6 + movhpd 5 * SIZE(AO2), %xmm6 + movsd 6 * SIZE(AO2), %xmm7 + movhpd 7 * SIZE(AO2), %xmm7 + +#if defined(PENTIUM4) || defined(GENERIC) || defined(NANO) + prefetcht0 RPREFETCHSIZE * SIZE(AO1) + prefetcht0 RPREFETCHSIZE * SIZE(AO2) + prefetcht0 WPREFETCHSIZE * SIZE(BO) +#endif + +#ifdef HAVE_3DNOW + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) + prefetchw (WPREFETCHSIZE + 8) * SIZE(BO) +#endif + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + movapd %xmm4, 4 * SIZE(BO) + movapd %xmm5, 6 * SIZE(BO) + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) + prefetcht2 WPREFETCHSIZE * SIZE(BO) +#endif + leaq (BO, M8, 4), BO + + movapd %xmm2, 0 * SIZE(BO) + movapd %xmm3, 2 * SIZE(BO) + movapd %xmm6, 4 * SIZE(BO) + movapd %xmm7, 6 * SIZE(BO) +#endif + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + leaq (BO, M8, 4), BO + decq I + jg .L12 + ALIGN_4 + +.L13: + testq $2, N + jle .L14 + +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + movhps 2 * SIZE(AO1), %xmm0 + + movlps 0 * SIZE(AO2), %xmm1 + movhps 2 * SIZE(AO2), %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movsd 2 * SIZE(AO1), %xmm1 + movhpd 3 * SIZE(AO1), %xmm1 + + movsd 0 * SIZE(AO2), %xmm2 + movhpd 1 * SIZE(AO2), %xmm2 + movsd 2 * SIZE(AO2), %xmm3 + movhpd 3 * SIZE(AO2), %xmm3 + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + movapd %xmm2, 4 * SIZE(BO) + movapd %xmm3, 6 * SIZE(BO) +#endif + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + leaq (BO, M8, 4), BO + ALIGN_4 + +.L14: + testq $1, N + jle .L19 + +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + movhps 0 * SIZE(AO2), %xmm0 + + movaps %xmm0, 0 * SIZE(BO1) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + movhpd 1 * SIZE(AO2), %xmm1 + + movapd %xmm0, 0 * SIZE(BO1) + movapd %xmm1, 2 * SIZE(BO1) +#endif + + addq $4 * SIZE, BO1 + ALIGN_4 + +.L19: + decq J + jg .L11 + ALIGN_4 + +.L20: + testq $1, M + jle .L999 + ALIGN_4 + +.L21: + movq A, AO1 + + movq B, BO + + movq N, I + sarq $2, I + jle .L23 + ALIGN_4 + +.L22: +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + movhps 2 * SIZE(AO1), %xmm0 + + movlps 4 * SIZE(AO1), %xmm1 + movhps 6 * SIZE(AO1), %xmm1 + +#if defined(PENTIUM4) || defined(GENERIC) || defined(NANO) + prefetcht0 RPREFETCHSIZE * SIZE(AO1) + prefetcht0 WPREFETCHSIZE * SIZE(BO) +#endif + +#ifdef HAVE_3DNOW + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) +#endif + + movaps %xmm0, 0 * SIZE(BO) + leaq (BO, M8, 4), BO + movaps %xmm1, 0 * SIZE(BO) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movsd 2 * SIZE(AO1), %xmm1 + movhpd 3 * SIZE(AO1), %xmm1 + + movsd 4 * SIZE(AO1), %xmm2 + movhpd 5 * SIZE(AO1), %xmm2 + movsd 6 * SIZE(AO1), %xmm3 + movhpd 7 * SIZE(AO1), %xmm3 + +#if defined(PENTIUM4) || defined(GENERIC) || defined(NANO) + prefetcht0 RPREFETCHSIZE * SIZE(AO1) + prefetcht0 WPREFETCHSIZE * SIZE(BO) +#endif + +#ifdef HAVE_3DNOW + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) +#endif + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + + leaq (BO, M8, 4), BO + + movapd %xmm2, 0 * SIZE(BO) + movapd %xmm3, 2 * SIZE(BO) +#endif + + addq $8 * SIZE, AO1 + leaq (BO, M8, 4), BO + decq I + jg .L22 + ALIGN_4 + +.L23: + testq $2, N + jle .L24 + +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + movhps 2 * SIZE(AO1), %xmm0 + + movaps %xmm0, 0 * SIZE(BO) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movsd 2 * SIZE(AO1), %xmm1 + movhpd 3 * SIZE(AO1), %xmm1 + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) +#endif + + addq $4 * SIZE, AO1 + leaq (BO, M8, 4), BO + ALIGN_4 + +.L24: + testq $1, N + jle .L999 + +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + + movlps %xmm0, 0 * SIZE(BO1) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + + movapd %xmm0, 0 * SIZE(BO1) +#endif + ALIGN_4 + + +.L999: +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + movups 16(%rsp), %xmm7 + movups 32(%rsp), %xmm8 + movups 48(%rsp), %xmm9 + movups 64(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + movups 96(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups 128(%rsp), %xmm14 + movups 144(%rsp), %xmm15 + + addq $STACKSIZE, %rsp +#endif + + popq %r11 + popq %r12 + popq %r13 + popq %r14 +#ifdef WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemv_n.S b/kernel/x86_64/zgemv_n.S new file mode 100644 index 0000000..b584a53 --- /dev/null +++ b/kernel/x86_64/zgemv_n.S @@ -0,0 +1,2701 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_INCX 8 + STACKSIZE(%rsp) +#define OLD_Y 16 + STACKSIZE(%rsp) +#define OLD_INCY 24 + STACKSIZE(%rsp) +#define OLD_BUFFER 32 + STACKSIZE(%rsp) +#define ALPHA_R 48 (%rsp) +#define ALPHA_I 56 (%rsp) + +#define M %rdi +#define N %rsi +#define A %rcx +#define LDA %r8 +#define X %r9 +#define INCX %rdx +#define Y %rbp +#define INCY %r10 + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_LDA 56 + STACKSIZE(%rsp) +#define OLD_X 64 + STACKSIZE(%rsp) +#define OLD_INCX 72 + STACKSIZE(%rsp) +#define OLD_Y 80 + STACKSIZE(%rsp) +#define OLD_INCY 88 + STACKSIZE(%rsp) +#define OLD_BUFFER 96 + STACKSIZE(%rsp) +#define ALPHA_R 224 (%rsp) +#define ALPHA_I 232 (%rsp) + +#define M %rcx +#define N %rdx +#define A %r8 +#define LDA %r9 +#define X %rdi +#define INCX %rsi +#define Y %rbp +#define INCY %r10 + +#endif + +#define I %rax +#define A1 %r12 +#define A2 %r13 + +#define Y1 %r14 +#define BUFFER %r15 + +#define J %r11 + +#undef SUBPD + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) +#define SUBPD subpd +#else +#define SUBPD addpd +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X + + movapd %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#endif + + movq OLD_INCX, INCX + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + salq $ZBASE_SHIFT, LDA + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + movlpd %xmm0, ALPHA_R + movlpd %xmm1, ALPHA_I + + subq $-16 * SIZE, A + + testq M, M + jle .L999 + testq N, N + jle .L999 + ALIGN_3 + + movq BUFFER, Y1 + + pxor %xmm4, %xmm4 + + movq M, %rax + addq $8, %rax + sarq $3, %rax + ALIGN_3 + +.L01: + movapd %xmm4, 0 * SIZE(Y1) + movapd %xmm4, 2 * SIZE(Y1) + movapd %xmm4, 4 * SIZE(Y1) + movapd %xmm4, 6 * SIZE(Y1) + movapd %xmm4, 8 * SIZE(Y1) + movapd %xmm4, 10 * SIZE(Y1) + movapd %xmm4, 12 * SIZE(Y1) + movapd %xmm4, 14 * SIZE(Y1) + + subq $-16 * SIZE, Y1 + decq %rax + jg .L01 + ALIGN_3 + +.L10: +#ifdef ALIGNED_ACCESS + testq $SIZE, A + jne .L100 +#endif + +#if GEMV_UNROLL >= 4 + + cmpq $4, N + jl .L20 + ALIGN_3 + +.L11: + subq $4, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + movsd 0 * SIZE(X), %xmm8 + movhpd 1 * SIZE(X), %xmm8 + addq INCX, X + movsd 0 * SIZE(X), %xmm10 + movhpd 1 * SIZE(X), %xmm10 + addq INCX, X + movsd 0 * SIZE(X), %xmm12 + movhpd 1 * SIZE(X), %xmm12 + addq INCX, X + movsd 0 * SIZE(X), %xmm14 + movhpd 1 * SIZE(X), %xmm14 + addq INCX, X + + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + shufps $0xc0, %xmm5, %xmm5 + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm10, %xmm11 + pshufd $0x4e, %xmm12, %xmm13 + pshufd $0x4e, %xmm14, %xmm15 + +#ifdef HAVE_SSE3 + movddup ALPHA_R, %xmm6 + movddup ALPHA_I, %xmm7 +#else + movsd ALPHA_R, %xmm6 + unpcklpd %xmm6, %xmm6 + movsd ALPHA_I, %xmm7 + unpcklpd %xmm7, %xmm7 +#endif + + xorpd %xmm5, %xmm9 + xorpd %xmm5, %xmm11 + xorpd %xmm5, %xmm13 + xorpd %xmm5, %xmm15 + + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm6, %xmm10 + mulpd %xmm7, %xmm11 + + mulpd %xmm6, %xmm12 + mulpd %xmm7, %xmm13 + mulpd %xmm6, %xmm14 + mulpd %xmm7, %xmm15 + +#ifndef XCONJ + subpd %xmm9, %xmm8 + subpd %xmm11, %xmm10 + subpd %xmm13, %xmm12 + subpd %xmm15, %xmm14 +#else + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm13, %xmm12 + addpd %xmm15, %xmm14 +#endif + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0x44, %xmm8, %xmm8 + + pshufd $0xee, %xmm10, %xmm11 + pshufd $0x44, %xmm10, %xmm10 + + pshufd $0xee, %xmm12, %xmm13 + pshufd $0x44, %xmm12, %xmm12 + + pshufd $0xee, %xmm14, %xmm15 + pshufd $0x44, %xmm14, %xmm14 + +#ifndef CONJ + xorpd %xmm5, %xmm9 + xorpd %xmm5, %xmm11 + xorpd %xmm5, %xmm13 + xorpd %xmm5, %xmm15 +#else + xorpd %xmm5, %xmm8 + xorpd %xmm5, %xmm10 + xorpd %xmm5, %xmm12 + xorpd %xmm5, %xmm14 +#endif + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + ALIGN_3 + + movq M, I + sarq $2, I + jle .L15 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-14 * SIZE, A1, %xmm6) + + decq I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A1(-10 * SIZE, A1, %xmm6) + + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm2 + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm3 + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm6) + + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm6) + + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm2 + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm3 + MOVUPS_A1(-14 * SIZE, A2, %xmm6) + + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-12 * SIZE, A2, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A1(-10 * SIZE, A2, %xmm6) + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm2 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm3 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm6) + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm6) + + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm2 + MOVUPS_A1( -8 * SIZE, A1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm3 + MOVUPS_A1( -6 * SIZE, A1, %xmm6) + + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A1(-10 * SIZE, A1, %xmm6) + + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm2 + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm3 + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm6) + + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm3 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm6) + + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm2 + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm3 + MOVUPS_A1(-14 * SIZE, A2, %xmm6) + + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm3 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-12 * SIZE, A2, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A1(-10 * SIZE, A2, %xmm6) + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm2 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm3 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm6) + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm3 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm6) + + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm2 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm3 + + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm3 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L15: + testq $2, M + je .L17 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-14 * SIZE, A1, %xmm6) + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm6) + + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A1(-14 * SIZE, A2, %xmm6) + + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm6) + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm0 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm1 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm1 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + movapd %xmm2, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L17: + testq $1, M + je .L19 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm6) + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm0 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm0 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm6) + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm0 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm0 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + ALIGN_3 + +.L19: + cmpq $4, N + jge .L11 + ALIGN_3 + +.L20: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L30 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L21: +#endif + + subq $2, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + movsd 0 * SIZE(X), %xmm12 + movhpd 1 * SIZE(X), %xmm12 + addq INCX, X + movsd 0 * SIZE(X), %xmm14 + movhpd 1 * SIZE(X), %xmm14 + addq INCX, X + + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + shufps $0xc0, %xmm11, %xmm11 + + pshufd $0x4e, %xmm12, %xmm13 + pshufd $0x4e, %xmm14, %xmm15 + +#ifdef HAVE_SSE3 + movddup ALPHA_R, %xmm8 + movddup ALPHA_I, %xmm9 +#else + movsd ALPHA_R, %xmm8 + unpcklpd %xmm8, %xmm8 + movsd ALPHA_I, %xmm9 + unpcklpd %xmm9, %xmm9 +#endif + + xorpd %xmm11, %xmm13 + xorpd %xmm11, %xmm15 + + mulpd %xmm8, %xmm12 + mulpd %xmm9, %xmm13 + mulpd %xmm8, %xmm14 + mulpd %xmm9, %xmm15 + +#ifndef XCONJ + subpd %xmm13, %xmm12 + subpd %xmm15, %xmm14 +#else + addpd %xmm13, %xmm12 + addpd %xmm15, %xmm14 +#endif + + pshufd $0xee, %xmm12, %xmm13 + pshufd $0x44, %xmm12, %xmm12 + + pshufd $0xee, %xmm14, %xmm15 + pshufd $0x44, %xmm14, %xmm14 + +#ifndef CONJ + xorpd %xmm11, %xmm13 + xorpd %xmm11, %xmm15 +#else + xorpd %xmm11, %xmm12 + xorpd %xmm11, %xmm14 +#endif + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + ALIGN_3 + + movq M, I + sarq $2, I + jle .L25 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-14 * SIZE, A1, %xmm6) + MOVUPS_A1(-12 * SIZE, A1, %xmm8) + MOVUPS_A1(-10 * SIZE, A1, %xmm10) + + decq I + jle .L24 + ALIGN_3 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A1(-14 * SIZE, A2, %xmm6) + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm2 + MOVUPS_A1(-12 * SIZE, A2, %xmm8) + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm3 + MOVUPS_A1(-10 * SIZE, A2, %xmm10) + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm2 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1( -8 * SIZE, A1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A1( -6 * SIZE, A1, %xmm6) + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm14, %xmm8 + addpd %xmm8, %xmm2 + MOVUPS_A1( -4 * SIZE, A1, %xmm8) + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm14, %xmm10 + addpd %xmm10, %xmm3 + MOVUPS_A1( -2 * SIZE, A1, %xmm10) + + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm1 + + mulpd %xmm15, %xmm9 + SUBPD %xmm9, %xmm2 + mulpd %xmm15, %xmm11 + SUBPD %xmm11, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L23 + ALIGN_3 + +.L24: + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A1(-14 * SIZE, A2, %xmm6) + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm2 + MOVUPS_A1(-12 * SIZE, A2, %xmm8) + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm3 + MOVUPS_A1(-10 * SIZE, A2, %xmm10) + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm2 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm1 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm14, %xmm8 + addpd %xmm8, %xmm2 + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm14, %xmm10 + addpd %xmm10, %xmm3 + + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm1 + + mulpd %xmm15, %xmm9 + SUBPD %xmm9, %xmm2 + mulpd %xmm15, %xmm11 + SUBPD %xmm11, %xmm3 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L25: + testq $2, M + je .L27 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-14 * SIZE, A1, %xmm6) + MOVUPS_A1(-16 * SIZE, A2, %xmm8) + MOVUPS_A1(-14 * SIZE, A2, %xmm10) + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm14, %xmm8 + addpd %xmm8, %xmm0 + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm14, %xmm10 + addpd %xmm10, %xmm1 + + mulpd %xmm15, %xmm9 + SUBPD %xmm9, %xmm0 + mulpd %xmm15, %xmm11 + SUBPD %xmm11, %xmm1 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + movapd %xmm2, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L27: + testq $1, M +#if GEMV_UNROLL == 2 + je .L29 +#else + je .L30 +#endif + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-16 * SIZE, A2, %xmm6) + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm0 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L29: + cmpq $2, N + jge .L21 +#endif + ALIGN_3 + +.L30: +#endif + + cmpq $1, N + jl .L980 + +#if GEMV_UNROLL == 1 +.L31: + decq N +#endif + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 +#if GEMV_UNROLL == 1 + addq LDA, A +#endif + + movsd 0 * SIZE(X), %xmm12 + movhpd 1 * SIZE(X), %xmm12 + addq INCX, X + + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + shufps $0xc0, %xmm11, %xmm11 + + pshufd $0x4e, %xmm12, %xmm13 + +#ifdef HAVE_SSE3 + movddup ALPHA_R, %xmm8 + movddup ALPHA_I, %xmm9 +#else + movsd ALPHA_R, %xmm8 + unpcklpd %xmm8, %xmm8 + movsd ALPHA_I, %xmm9 + unpcklpd %xmm9, %xmm9 +#endif + + xorpd %xmm11, %xmm13 + + mulpd %xmm8, %xmm12 + mulpd %xmm9, %xmm13 + +#ifndef XCONJ + subpd %xmm13, %xmm12 +#else + addpd %xmm13, %xmm12 +#endif + + pshufd $0xee, %xmm12, %xmm13 + pshufd $0x44, %xmm12, %xmm12 + +#ifndef CONJ + xorpd %xmm11, %xmm13 +#else + xorpd %xmm11, %xmm12 +#endif + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + + movq M, I + sarq $2, I + jle .L35 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-14 * SIZE, A1, %xmm6) + MOVUPS_A1(-12 * SIZE, A1, %xmm8) + MOVUPS_A1(-10 * SIZE, A1, %xmm10) + + decq I + jle .L34 + ALIGN_3 + +.L33: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1( -8 * SIZE, A1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A1( -6 * SIZE, A1, %xmm6) + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm2 + MOVUPS_A1( -4 * SIZE, A1, %xmm8) + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm3 + MOVUPS_A1( -2 * SIZE, A1, %xmm10) + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm2 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L33 + ALIGN_3 + +.L34: + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm2 + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm3 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm2 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L35: + testq $2, M + je .L37 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-14 * SIZE, A1, %xmm6) + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + movapd %xmm2, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L37: + testq $1, M +#if GEMV_UNROLL == 1 + je .L39 +#else + je .L980 +#endif + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + +#if GEMV_UNROLL == 1 + ALIGN_3 +.L39: + cmpq $1, N + jge .L31 +#endif + +#ifdef ALIGNED_ACCESS + + jmp .L980 + ALIGN_3 + +.L100: +#if GEMV_UNROLL >= 4 + + cmpq $4, N + jl .L110 + ALIGN_3 + +.L101: + subq $4, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + movsd 0 * SIZE(X), %xmm8 + movhpd 1 * SIZE(X), %xmm8 + addq INCX, X + movsd 0 * SIZE(X), %xmm10 + movhpd 1 * SIZE(X), %xmm10 + addq INCX, X + movsd 0 * SIZE(X), %xmm12 + movhpd 1 * SIZE(X), %xmm12 + addq INCX, X + movsd 0 * SIZE(X), %xmm14 + movhpd 1 * SIZE(X), %xmm14 + addq INCX, X + + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + shufps $0xc0, %xmm5, %xmm5 + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm10, %xmm11 + pshufd $0x4e, %xmm12, %xmm13 + pshufd $0x4e, %xmm14, %xmm15 + +#ifdef HAVE_SSE3 + movddup ALPHA_R, %xmm6 + movddup ALPHA_I, %xmm7 +#else + movsd ALPHA_R, %xmm6 + unpcklpd %xmm6, %xmm6 + movsd ALPHA_I, %xmm7 + unpcklpd %xmm7, %xmm7 +#endif + + xorpd %xmm5, %xmm9 + xorpd %xmm5, %xmm11 + xorpd %xmm5, %xmm13 + xorpd %xmm5, %xmm15 + + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm6, %xmm10 + mulpd %xmm7, %xmm11 + + mulpd %xmm6, %xmm12 + mulpd %xmm7, %xmm13 + mulpd %xmm6, %xmm14 + mulpd %xmm7, %xmm15 + +#ifndef XCONJ + subpd %xmm9, %xmm8 + subpd %xmm11, %xmm10 + subpd %xmm13, %xmm12 + subpd %xmm15, %xmm14 +#else + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm13, %xmm12 + addpd %xmm15, %xmm14 +#endif + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0x44, %xmm8, %xmm8 + + pshufd $0xee, %xmm10, %xmm11 + pshufd $0x44, %xmm10, %xmm10 + + pshufd $0xee, %xmm12, %xmm13 + pshufd $0x44, %xmm12, %xmm12 + + pshufd $0xee, %xmm14, %xmm15 + pshufd $0x44, %xmm14, %xmm14 + +#ifndef CONJ + xorpd %xmm5, %xmm9 + xorpd %xmm5, %xmm11 + xorpd %xmm5, %xmm13 + xorpd %xmm5, %xmm15 +#else + xorpd %xmm5, %xmm8 + xorpd %xmm5, %xmm10 + xorpd %xmm5, %xmm12 + xorpd %xmm5, %xmm14 +#endif + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + ALIGN_3 + + movq M, I + sarq $2, I + jle .L105 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + movsd -14 * SIZE(A1), %xmm6 + movhpd -13 * SIZE(A1), %xmm6 + + decq I + jle .L104 + ALIGN_3 + +.L103: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A1), %xmm4 + movhpd -11 * SIZE(A1), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm1 + movsd -10 * SIZE(A1), %xmm6 + movhpd -9 * SIZE(A1), %xmm6 + + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm2 + movsd -16 * SIZE(A1, LDA), %xmm4 + movhpd -15 * SIZE(A1, LDA), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm3 + movsd -14 * SIZE(A1, LDA), %xmm6 + movhpd -13 * SIZE(A1, LDA), %xmm6 + + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A1, LDA), %xmm4 + movhpd -11 * SIZE(A1, LDA), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm1 + movsd -10 * SIZE(A1, LDA), %xmm6 + movhpd -9 * SIZE(A1, LDA), %xmm6 + + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm2 + movsd -16 * SIZE(A2), %xmm4 + movhpd -15 * SIZE(A2), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm3 + movsd -14 * SIZE(A2), %xmm6 + movhpd -13 * SIZE(A2), %xmm6 + + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A2), %xmm4 + movhpd -11 * SIZE(A2), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + movsd -10 * SIZE(A2), %xmm6 + movhpd -9 * SIZE(A2), %xmm6 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm2 + movsd -16 * SIZE(A2, LDA), %xmm4 + movhpd -15 * SIZE(A2, LDA), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm3 + movsd -14 * SIZE(A2, LDA), %xmm6 + movhpd -13 * SIZE(A2, LDA), %xmm6 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A2, LDA), %xmm4 + movhpd -11 * SIZE(A2, LDA), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm1 + movsd -10 * SIZE(A2, LDA), %xmm6 + movhpd -9 * SIZE(A2, LDA), %xmm6 + + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm2 + movsd -8 * SIZE(A1), %xmm4 + movhpd -7 * SIZE(A1), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm3 + movsd -6 * SIZE(A1), %xmm6 + movhpd -5 * SIZE(A1), %xmm6 + + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L103 + ALIGN_3 + +.L104: + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A1), %xmm4 + movhpd -11 * SIZE(A1), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm1 + movsd -10 * SIZE(A1), %xmm6 + movhpd -9 * SIZE(A1), %xmm6 + + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm2 + movsd -16 * SIZE(A1, LDA), %xmm4 + movhpd -15 * SIZE(A1, LDA), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm3 + movsd -14 * SIZE(A1, LDA), %xmm6 + movhpd -13 * SIZE(A1, LDA), %xmm6 + + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm3 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A1, LDA), %xmm4 + movhpd -11 * SIZE(A1, LDA), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm1 + movsd -10 * SIZE(A1, LDA), %xmm6 + movhpd -9 * SIZE(A1, LDA), %xmm6 + + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm2 + movsd -16 * SIZE(A2), %xmm4 + movhpd -15 * SIZE(A2), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm3 + movsd -14 * SIZE(A2), %xmm6 + movhpd -13 * SIZE(A2), %xmm6 + + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm3 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A2), %xmm4 + movhpd -11 * SIZE(A2), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + movsd -10 * SIZE(A2), %xmm6 + movhpd -9 * SIZE(A2), %xmm6 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm2 + movsd -16 * SIZE(A2, LDA), %xmm4 + movhpd -15 * SIZE(A2, LDA), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm3 + movsd -14 * SIZE(A2, LDA), %xmm6 + movhpd -13 * SIZE(A2, LDA), %xmm6 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm3 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A2, LDA), %xmm4 + movhpd -11 * SIZE(A2, LDA), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm1 + movsd -10 * SIZE(A2, LDA), %xmm6 + movhpd -9 * SIZE(A2, LDA), %xmm6 + + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm2 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm3 + + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm3 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L105: + testq $2, M + je .L107 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + movsd -14 * SIZE(A1), %xmm6 + movhpd -13 * SIZE(A1), %xmm6 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + movsd -16 * SIZE(A1, LDA), %xmm4 + movhpd -15 * SIZE(A1, LDA), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm1 + movsd -14 * SIZE(A1, LDA), %xmm6 + movhpd -13 * SIZE(A1, LDA), %xmm6 + + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + movsd -16 * SIZE(A2), %xmm4 + movhpd -15 * SIZE(A2), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm1 + movsd -14 * SIZE(A2), %xmm6 + movhpd -13 * SIZE(A2), %xmm6 + + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + movsd -16 * SIZE(A2, LDA), %xmm4 + movhpd -15 * SIZE(A2, LDA), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + movsd -14 * SIZE(A2, LDA), %xmm6 + movhpd -13 * SIZE(A2, LDA), %xmm6 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm0 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm1 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm1 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + movapd %xmm2, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L107: + testq $1, M + je .L109 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + movsd -16 * SIZE(A1, LDA), %xmm6 + movhpd -15 * SIZE(A1, LDA), %xmm6 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + movsd -16 * SIZE(A2), %xmm4 + movhpd -15 * SIZE(A2), %xmm4 + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm0 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm0 + movsd -16 * SIZE(A2, LDA), %xmm6 + movhpd -15 * SIZE(A2, LDA), %xmm6 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm0 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm0 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + ALIGN_3 + +.L109: + cmpq $4, N + jge .L101 + ALIGN_3 + +.L110: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L120 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L111: +#endif + + subq $2, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + movsd 0 * SIZE(X), %xmm12 + movhpd 1 * SIZE(X), %xmm12 + addq INCX, X + movsd 0 * SIZE(X), %xmm14 + movhpd 1 * SIZE(X), %xmm14 + addq INCX, X + + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + shufps $0xc0, %xmm11, %xmm11 + + pshufd $0x4e, %xmm12, %xmm13 + pshufd $0x4e, %xmm14, %xmm15 + +#ifdef HAVE_SSE3 + movddup ALPHA_R, %xmm8 + movddup ALPHA_I, %xmm9 +#else + movsd ALPHA_R, %xmm8 + unpcklpd %xmm8, %xmm8 + movsd ALPHA_I, %xmm9 + unpcklpd %xmm9, %xmm9 +#endif + + xorpd %xmm11, %xmm13 + xorpd %xmm11, %xmm15 + + mulpd %xmm8, %xmm12 + mulpd %xmm9, %xmm13 + mulpd %xmm8, %xmm14 + mulpd %xmm9, %xmm15 + +#ifndef XCONJ + subpd %xmm13, %xmm12 + subpd %xmm15, %xmm14 +#else + addpd %xmm13, %xmm12 + addpd %xmm15, %xmm14 +#endif + + pshufd $0xee, %xmm12, %xmm13 + pshufd $0x44, %xmm12, %xmm12 + + pshufd $0xee, %xmm14, %xmm15 + pshufd $0x44, %xmm14, %xmm14 + +#ifndef CONJ + xorpd %xmm11, %xmm13 + xorpd %xmm11, %xmm15 +#else + xorpd %xmm11, %xmm12 + xorpd %xmm11, %xmm14 +#endif + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + ALIGN_3 + + movq M, I + sarq $2, I + jle .L115 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + movsd -14 * SIZE(A1), %xmm6 + movhpd -13 * SIZE(A1), %xmm6 + movsd -12 * SIZE(A1), %xmm8 + movhpd -11 * SIZE(A1), %xmm8 + movsd -10 * SIZE(A1), %xmm10 + movhpd -9 * SIZE(A1), %xmm10 + + decq I + jle .L114 + ALIGN_3 + +.L113: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + movsd -16 * SIZE(A2), %xmm4 + movhpd -15 * SIZE(A2), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + movsd -14 * SIZE(A2), %xmm6 + movhpd -13 * SIZE(A2), %xmm6 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm2 + movsd -12 * SIZE(A2), %xmm8 + movhpd -11 * SIZE(A2), %xmm8 + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm3 + movsd -10 * SIZE(A2), %xmm10 + movhpd -9 * SIZE(A2), %xmm10 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm2 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + movsd -8 * SIZE(A1), %xmm4 + movhpd -7 * SIZE(A1), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm1 + movsd -6 * SIZE(A1), %xmm6 + movhpd -5 * SIZE(A1), %xmm6 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm14, %xmm8 + addpd %xmm8, %xmm2 + movsd -4 * SIZE(A1), %xmm8 + movhpd -3 * SIZE(A1), %xmm8 + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm14, %xmm10 + addpd %xmm10, %xmm3 + movsd -2 * SIZE(A1), %xmm10 + movhpd -1 * SIZE(A1), %xmm10 + + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm1 + + mulpd %xmm15, %xmm9 + SUBPD %xmm9, %xmm2 + mulpd %xmm15, %xmm11 + SUBPD %xmm11, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L113 + ALIGN_3 + +.L114: + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + movsd -16 * SIZE(A2), %xmm4 + movhpd -15 * SIZE(A2), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + movsd -14 * SIZE(A2), %xmm6 + movhpd -13 * SIZE(A2), %xmm6 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm2 + movsd -12 * SIZE(A2), %xmm8 + movhpd -11 * SIZE(A2), %xmm8 + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm3 + movsd -10 * SIZE(A2), %xmm10 + movhpd -9 * SIZE(A2), %xmm10 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm2 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm1 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm14, %xmm8 + addpd %xmm8, %xmm2 + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm14, %xmm10 + addpd %xmm10, %xmm3 + + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm1 + + mulpd %xmm15, %xmm9 + SUBPD %xmm9, %xmm2 + mulpd %xmm15, %xmm11 + SUBPD %xmm11, %xmm3 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L115: + testq $2, M + je .L117 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + movsd -14 * SIZE(A1), %xmm6 + movhpd -13 * SIZE(A1), %xmm6 + + movsd -16 * SIZE(A2), %xmm8 + movhpd -15 * SIZE(A2), %xmm8 + movsd -14 * SIZE(A2), %xmm10 + movhpd -13 * SIZE(A2), %xmm10 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm14, %xmm8 + addpd %xmm8, %xmm0 + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm14, %xmm10 + addpd %xmm10, %xmm1 + + mulpd %xmm15, %xmm9 + SUBPD %xmm9, %xmm0 + mulpd %xmm15, %xmm11 + SUBPD %xmm11, %xmm1 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + movapd %xmm2, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L117: + testq $1, M +#if GEMV_UNROLL == 2 + je .L119 +#else + je .L120 +#endif + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + movsd -16 * SIZE(A2), %xmm6 + movhpd -15 * SIZE(A2), %xmm6 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm0 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L119: + cmpq $2, N + jge .L111 +#endif + ALIGN_3 + +.L120: +#endif + + cmpq $1, N + jl .L980 + +#if GEMV_UNROLL == 1 +.L121: + decq N +#endif + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 +#if GEMV_UNROLL == 1 + addq LDA, A +#endif + + movsd 0 * SIZE(X), %xmm12 + movhpd 1 * SIZE(X), %xmm12 + addq INCX, X + + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + shufps $0xc0, %xmm11, %xmm11 + + pshufd $0x4e, %xmm12, %xmm13 + +#ifdef HAVE_SSE3 + movddup ALPHA_R, %xmm8 + movddup ALPHA_I, %xmm9 +#else + movsd ALPHA_R, %xmm8 + unpcklpd %xmm8, %xmm8 + movsd ALPHA_I, %xmm9 + unpcklpd %xmm9, %xmm9 +#endif + + xorpd %xmm11, %xmm13 + + mulpd %xmm8, %xmm12 + mulpd %xmm9, %xmm13 + +#ifndef XCONJ + subpd %xmm13, %xmm12 +#else + addpd %xmm13, %xmm12 +#endif + + pshufd $0xee, %xmm12, %xmm13 + pshufd $0x44, %xmm12, %xmm12 + +#ifndef CONJ + xorpd %xmm11, %xmm13 +#else + xorpd %xmm11, %xmm12 +#endif + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + + movq M, I + sarq $2, I + jle .L125 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + movsd -14 * SIZE(A1), %xmm6 + movhpd -13 * SIZE(A1), %xmm6 + movsd -12 * SIZE(A1), %xmm8 + movhpd -11 * SIZE(A1), %xmm8 + movsd -10 * SIZE(A1), %xmm10 + movhpd -9 * SIZE(A1), %xmm10 + + decq I + jle .L124 + ALIGN_3 + +.L123: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + movsd -8 * SIZE(A1), %xmm4 + movhpd -7 * SIZE(A1), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + movsd -6 * SIZE(A1), %xmm6 + movhpd -5 * SIZE(A1), %xmm6 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm2 + movsd -4 * SIZE(A1), %xmm8 + movhpd -3 * SIZE(A1), %xmm8 + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm3 + movsd -2 * SIZE(A1), %xmm10 + movhpd -1 * SIZE(A1), %xmm10 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm2 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L123 + ALIGN_3 + +.L124: + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm2 + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm3 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm2 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L125: + testq $2, M + je .L127 + + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + movsd -14 * SIZE(A1), %xmm6 + movhpd -13 * SIZE(A1), %xmm6 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + movapd %xmm2, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L127: + testq $1, M +#if GEMV_UNROLL == 1 + je .L129 +#else + je .L980 +#endif + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + +#if GEMV_UNROLL == 1 + ALIGN_3 +.L129: + cmpq $1, N + jge .L121 +#endif + + +#endif + ALIGN_3 + +.L980: + testq $SIZE, Y + jne .L990 + + movq Y, Y1 + + movq M, %rax + sarq $3, %rax + jle .L184 + ALIGN_3 + +.L182: + movapd (Y), %xmm0 + addq INCY, Y + movapd (Y), %xmm1 + addq INCY, Y + movapd (Y), %xmm2 + addq INCY, Y + movapd (Y), %xmm3 + addq INCY, Y + movapd (Y), %xmm4 + addq INCY, Y + movapd (Y), %xmm5 + addq INCY, Y + movapd (Y), %xmm6 + addq INCY, Y + movapd (Y), %xmm7 + addq INCY, Y + + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + addpd 4 * SIZE(BUFFER), %xmm2 + addpd 6 * SIZE(BUFFER), %xmm3 + addpd 8 * SIZE(BUFFER), %xmm4 + addpd 10 * SIZE(BUFFER), %xmm5 + addpd 12 * SIZE(BUFFER), %xmm6 + addpd 14 * SIZE(BUFFER), %xmm7 + + movapd %xmm0, (Y1) + addq INCY, Y1 + movapd %xmm1, (Y1) + addq INCY, Y1 + movapd %xmm2, (Y1) + addq INCY, Y1 + movapd %xmm3, (Y1) + addq INCY, Y1 + movapd %xmm4, (Y1) + addq INCY, Y1 + movapd %xmm5, (Y1) + addq INCY, Y1 + movapd %xmm6, (Y1) + addq INCY, Y1 + movapd %xmm7, (Y1) + addq INCY, Y1 + + subq $-16 * SIZE, BUFFER + decq %rax + jg .L182 + ALIGN_3 + +.L184: + testq $7, M + jle .L999 + + testq $4, M + jle .L185 + + movapd (Y), %xmm0 + addq INCY, Y + movapd (Y), %xmm1 + addq INCY, Y + movapd (Y), %xmm2 + addq INCY, Y + movapd (Y), %xmm3 + addq INCY, Y + + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + addpd 4 * SIZE(BUFFER), %xmm2 + addpd 6 * SIZE(BUFFER), %xmm3 + + movapd %xmm0, (Y1) + addq INCY, Y1 + movapd %xmm1, (Y1) + addq INCY, Y1 + movapd %xmm2, (Y1) + addq INCY, Y1 + movapd %xmm3, (Y1) + addq INCY, Y1 + + addq $8 * SIZE, BUFFER + ALIGN_3 + +.L185: + testq $2, M + jle .L186 + + movapd (Y), %xmm0 + addq INCY, Y + movapd (Y), %xmm1 + addq INCY, Y + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + + movapd %xmm0, (Y1) + addq INCY, Y1 + movapd %xmm1, (Y1) + addq INCY, Y1 + + addq $4 * SIZE, BUFFER + ALIGN_3 + +.L186: + testq $1, M + jle .L999 + + movapd (Y), %xmm0 + + addpd (BUFFER), %xmm0 + + movapd %xmm0, (Y1) + jmp .L999 + ALIGN_3 + +.L990: + movq Y, Y1 + + movq M, %rax + sarq $3, %rax + jle .L994 + ALIGN_3 + +.L992: + movsd 0 * SIZE(Y), %xmm0 + movhpd 1 * SIZE(Y), %xmm0 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm2 + movhpd 1 * SIZE(Y), %xmm2 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm3 + movhpd 1 * SIZE(Y), %xmm3 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm5 + movhpd 1 * SIZE(Y), %xmm5 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm6 + movhpd 1 * SIZE(Y), %xmm6 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm7 + movhpd 1 * SIZE(Y), %xmm7 + addq INCY, Y + + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + addpd 4 * SIZE(BUFFER), %xmm2 + addpd 6 * SIZE(BUFFER), %xmm3 + addpd 8 * SIZE(BUFFER), %xmm4 + addpd 10 * SIZE(BUFFER), %xmm5 + addpd 12 * SIZE(BUFFER), %xmm6 + addpd 14 * SIZE(BUFFER), %xmm7 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm1, 0 * SIZE(Y1) + movhpd %xmm1, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm2, 0 * SIZE(Y1) + movhpd %xmm2, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm3, 0 * SIZE(Y1) + movhpd %xmm3, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm4, 0 * SIZE(Y1) + movhpd %xmm4, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm5, 0 * SIZE(Y1) + movhpd %xmm5, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm6, 0 * SIZE(Y1) + movhpd %xmm6, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm7, 0 * SIZE(Y1) + movhpd %xmm7, 1 * SIZE(Y1) + addq INCY, Y1 + + subq $-16 * SIZE, BUFFER + decq %rax + jg .L992 + ALIGN_3 + +.L994: + testq $7, M + jle .L999 + + testq $4, M + jle .L995 + + movsd 0 * SIZE(Y), %xmm0 + movhpd 1 * SIZE(Y), %xmm0 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm2 + movhpd 1 * SIZE(Y), %xmm2 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm3 + movhpd 1 * SIZE(Y), %xmm3 + addq INCY, Y + + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + addpd 4 * SIZE(BUFFER), %xmm2 + addpd 6 * SIZE(BUFFER), %xmm3 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm1, 0 * SIZE(Y1) + movhpd %xmm1, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm2, 0 * SIZE(Y1) + movhpd %xmm2, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm3, 0 * SIZE(Y1) + movhpd %xmm3, 1 * SIZE(Y1) + addq INCY, Y1 + + addq $8 * SIZE, BUFFER + ALIGN_3 + +.L995: + testq $2, M + jle .L996 + + movsd 0 * SIZE(Y), %xmm0 + movhpd 1 * SIZE(Y), %xmm0 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + addq INCY, Y + + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm1, 0 * SIZE(Y1) + movhpd %xmm1, 1 * SIZE(Y1) + addq INCY, Y1 + + addq $4 * SIZE, BUFFER + ALIGN_3 + +.L996: + testq $1, M + jle .L999 + + movsd 0 * SIZE(Y), %xmm0 + movhpd 1 * SIZE(Y), %xmm0 + + addpd 0 * SIZE(BUFFER), %xmm0 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemv_n_atom.S b/kernel/x86_64/zgemv_n_atom.S new file mode 100644 index 0000000..289c076 --- /dev/null +++ b/kernel/x86_64/zgemv_n_atom.S @@ -0,0 +1,1142 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#ifdef ATOM +#define PREFETCH prefetchnta +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 6) +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_INCX 8 + STACKSIZE(%rsp) +#define OLD_Y 16 + STACKSIZE(%rsp) +#define OLD_INCY 24 + STACKSIZE(%rsp) +#define OLD_BUFFER 32 + STACKSIZE(%rsp) + +#define M %rdi +#define N %rsi +#define A %rcx +#define LDA %r8 +#define X %r9 +#define INCX %rdx +#define Y %rbp +#define INCY %r10 + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_LDA 56 + STACKSIZE(%rsp) +#define OLD_X 64 + STACKSIZE(%rsp) +#define OLD_INCX 72 + STACKSIZE(%rsp) +#define OLD_Y 80 + STACKSIZE(%rsp) +#define OLD_INCY 88 + STACKSIZE(%rsp) +#define OLD_BUFFER 96 + STACKSIZE(%rsp) + +#define M %rcx +#define N %rdx +#define A %r8 +#define LDA %r9 +#define X %rdi +#define INCX %rsi +#define Y %rbp +#define INCY %r10 + +#endif + +#define I %rax +#define J %r11 +#define A1 %r12 +#define A2 %r13 + +#define Y1 %r14 +#define BUFFER %r15 + +#define ALPHA_R %xmm14 +#define ALPHA_I %xmm15 + +#if !defined(CONJ) && !defined(XCONJ) +#define ADD1 addsd +#define ADD2 addsd +#define ADD3 subsd +#define ADD4 addsd +#endif + +#if defined(CONJ) && !defined(XCONJ) +#define ADD1 addsd +#define ADD2 addsd +#define ADD3 addsd +#define ADD4 subsd +#endif + +#if !defined(CONJ) && defined(XCONJ) +#define ADD1 addsd +#define ADD2 subsd +#define ADD3 addsd +#define ADD4 addsd +#endif + +#if defined(CONJ) && defined(XCONJ) +#define ADD1 addsd +#define ADD2 subsd +#define ADD3 subsd +#define ADD4 subsd +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X + + movapd %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#endif + + movq OLD_INCX, INCX + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + salq $ZBASE_SHIFT, LDA + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + movaps %xmm0, ALPHA_R + movaps %xmm1, ALPHA_I + + subq $-16 * SIZE, A + + testq M, M + jle .L999 + testq N, N + jle .L999 + ALIGN_3 + + movq BUFFER, Y1 + + pxor %xmm4, %xmm4 + + movq M, %rax + addq $8, %rax + sarq $3, %rax + ALIGN_3 + +.L01: + movapd %xmm4, 0 * SIZE(Y1) + movapd %xmm4, 2 * SIZE(Y1) + movapd %xmm4, 4 * SIZE(Y1) + movapd %xmm4, 6 * SIZE(Y1) + movapd %xmm4, 8 * SIZE(Y1) + movapd %xmm4, 10 * SIZE(Y1) + movapd %xmm4, 12 * SIZE(Y1) + movapd %xmm4, 14 * SIZE(Y1) + + subq $-16 * SIZE, Y1 + decq %rax + jg .L01 + ALIGN_3 + +.L10: + movq N, J + sarq $1, J + jle .L20 + ALIGN_3 + +.L11: + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + movsd 0 * SIZE(X), %xmm6 + movsd 1 * SIZE(X), %xmm7 + addq INCX, X + + movapd %xmm4, %xmm8 + mulsd ALPHA_R, %xmm4 + mulsd ALPHA_I, %xmm8 + movapd %xmm6, %xmm10 + mulsd ALPHA_R, %xmm6 + mulsd ALPHA_I, %xmm10 + + movapd %xmm5, %xmm9 + mulsd ALPHA_I, %xmm9 + mulsd ALPHA_R, %xmm5 + movapd %xmm7, %xmm11 + mulsd ALPHA_I, %xmm11 + mulsd ALPHA_R, %xmm7 + +#ifndef XCONJ + subsd %xmm9, %xmm4 + addsd %xmm8, %xmm5 + subsd %xmm11, %xmm6 + addsd %xmm10, %xmm7 +#else + addsd %xmm9, %xmm4 + subsd %xmm8, %xmm5 + addsd %xmm11, %xmm6 + subsd %xmm10, %xmm7 +#endif + + movsd -16 * SIZE(Y1), %xmm0 + movsd -15 * SIZE(Y1), %xmm1 + movsd -14 * SIZE(Y1), %xmm2 + movsd -13 * SIZE(Y1), %xmm3 + ALIGN_3 + + movq M, I + sarq $2, I + jle .L15 + + movsd -16 * SIZE(A1), %xmm8 + movsd -15 * SIZE(A1), %xmm9 + movsd -14 * SIZE(A1), %xmm10 + movsd -13 * SIZE(A1), %xmm11 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm12 + + movapd %xmm10, %xmm13 + mulsd %xmm4, %xmm10 + ADD1 %xmm8, %xmm0 + movsd -16 * SIZE(A2), %xmm8 + mulsd %xmm5, %xmm13 + ADD2 %xmm12, %xmm1 + + decq I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A2) +#endif + + movapd %xmm9, %xmm12 + mulsd %xmm5, %xmm9 + ADD1 %xmm10, %xmm2 + movsd -14 * SIZE(A2), %xmm10 + mulsd %xmm4, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + ADD3 %xmm9, %xmm0 + movsd -15 * SIZE(A2), %xmm9 + mulsd %xmm4, %xmm13 + ADD4 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm11, %xmm2 + movsd -13 * SIZE(A2), %xmm11 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm3 + + movapd %xmm10, %xmm13 + mulsd %xmm6, %xmm10 + ADD1 %xmm8, %xmm0 + movsd -12 * SIZE(A1), %xmm8 + mulsd %xmm7, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm9, %xmm12 + mulsd %xmm7, %xmm9 + ADD1 %xmm10, %xmm2 + movsd -10 * SIZE(A1), %xmm10 + mulsd %xmm6, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm7, %xmm11 + ADD3 %xmm9, %xmm0 + movsd -11 * SIZE(A1), %xmm9 + mulsd %xmm6, %xmm13 + ADD4 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + movlpd %xmm0, -16 * SIZE(Y1) + mulsd %xmm4, %xmm8 + movsd -12 * SIZE(Y1), %xmm0 + ADD3 %xmm11, %xmm2 + movsd -9 * SIZE(A1), %xmm11 + mulsd %xmm5, %xmm12 + movlpd %xmm1, -15 * SIZE(Y1) + ADD4 %xmm13, %xmm3 + movsd -11 * SIZE(Y1), %xmm1 + + movapd %xmm10, %xmm13 + movlpd %xmm2, -14 * SIZE(Y1) + mulsd %xmm4, %xmm10 + movlpd %xmm3, -13 * SIZE(Y1) + ADD1 %xmm8, %xmm0 + movsd -12 * SIZE(A2), %xmm8 + mulsd %xmm5, %xmm13 + movsd -10 * SIZE(Y1), %xmm2 + ADD2 %xmm12, %xmm1 + movsd -9 * SIZE(Y1), %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) +#endif + + movapd %xmm9, %xmm12 + mulsd %xmm5, %xmm9 + ADD1 %xmm10, %xmm2 + movsd -10 * SIZE(A2), %xmm10 + mulsd %xmm4, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + ADD3 %xmm9, %xmm0 + movsd -11 * SIZE(A2), %xmm9 + mulsd %xmm4, %xmm13 + ADD4 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm11, %xmm2 + movsd -9 * SIZE(A2), %xmm11 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm3 + + movapd %xmm10, %xmm13 + mulsd %xmm6, %xmm10 + ADD1 %xmm8, %xmm0 + movsd -8 * SIZE(A1), %xmm8 + mulsd %xmm7, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm9, %xmm12 + mulsd %xmm7, %xmm9 + ADD1 %xmm10, %xmm2 + movsd -6 * SIZE(A1), %xmm10 + mulsd %xmm6, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm7, %xmm11 + ADD3 %xmm9, %xmm0 + movsd -7 * SIZE(A1), %xmm9 + mulsd %xmm6, %xmm13 + ADD4 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + movlpd %xmm0, -12 * SIZE(Y1) + mulsd %xmm4, %xmm8 + movsd -8 * SIZE(Y1), %xmm0 + ADD3 %xmm11, %xmm2 + movsd -5 * SIZE(A1), %xmm11 + mulsd %xmm5, %xmm12 + movlpd %xmm1, -11 * SIZE(Y1) + ADD4 %xmm13, %xmm3 + movsd -7 * SIZE(Y1), %xmm1 + + movapd %xmm10, %xmm13 + movlpd %xmm2, -10 * SIZE(Y1) + mulsd %xmm4, %xmm10 + movsd -6 * SIZE(Y1), %xmm2 + ADD1 %xmm8, %xmm0 + movsd -8 * SIZE(A2), %xmm8 + mulsd %xmm5, %xmm13 + movlpd %xmm3, -9 * SIZE(Y1) + ADD2 %xmm12, %xmm1 + movsd -5 * SIZE(Y1), %xmm3 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + movapd %xmm9, %xmm12 + mulsd %xmm5, %xmm9 + ADD1 %xmm10, %xmm2 + movsd -14 * SIZE(A2), %xmm10 + mulsd %xmm4, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + ADD3 %xmm9, %xmm0 + movsd -15 * SIZE(A2), %xmm9 + mulsd %xmm4, %xmm13 + ADD4 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm11, %xmm2 + movsd -13 * SIZE(A2), %xmm11 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm3 + + movapd %xmm10, %xmm13 + mulsd %xmm6, %xmm10 + ADD1 %xmm8, %xmm0 + movsd -12 * SIZE(A1), %xmm8 + mulsd %xmm7, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm9, %xmm12 + mulsd %xmm7, %xmm9 + ADD1 %xmm10, %xmm2 + movsd -10 * SIZE(A1), %xmm10 + mulsd %xmm6, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm7, %xmm11 + ADD3 %xmm9, %xmm0 + movsd -11 * SIZE(A1), %xmm9 + mulsd %xmm6, %xmm13 + ADD4 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + movlpd %xmm0, -16 * SIZE(Y1) + mulsd %xmm4, %xmm8 + movsd -12 * SIZE(Y1), %xmm0 + ADD3 %xmm11, %xmm2 + movsd -9 * SIZE(A1), %xmm11 + mulsd %xmm5, %xmm12 + movlpd %xmm1, -15 * SIZE(Y1) + ADD4 %xmm13, %xmm3 + movsd -11 * SIZE(Y1), %xmm1 + + movapd %xmm10, %xmm13 + movlpd %xmm2, -14 * SIZE(Y1) + mulsd %xmm4, %xmm10 + movlpd %xmm3, -13 * SIZE(Y1) + ADD1 %xmm8, %xmm0 + movsd -12 * SIZE(A2), %xmm8 + mulsd %xmm5, %xmm13 + movsd -10 * SIZE(Y1), %xmm2 + ADD2 %xmm12, %xmm1 + movsd -9 * SIZE(Y1), %xmm3 + + movapd %xmm9, %xmm12 + mulsd %xmm5, %xmm9 + ADD1 %xmm10, %xmm2 + movsd -10 * SIZE(A2), %xmm10 + mulsd %xmm4, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + ADD3 %xmm9, %xmm0 + movsd -11 * SIZE(A2), %xmm9 + mulsd %xmm4, %xmm13 + ADD4 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm11, %xmm2 + movsd -9 * SIZE(A2), %xmm11 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm3 + + movapd %xmm10, %xmm13 + mulsd %xmm6, %xmm10 + ADD1 %xmm8, %xmm0 + mulsd %xmm7, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm9, %xmm12 + mulsd %xmm7, %xmm9 + ADD1 %xmm10, %xmm2 + mulsd %xmm6, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm7, %xmm11 + ADD3 %xmm9, %xmm0 + mulsd %xmm6, %xmm13 + ADD4 %xmm12, %xmm1 + + ADD3 %xmm11, %xmm2 + movlpd %xmm0, -12 * SIZE(Y1) + movsd -8 * SIZE(Y1), %xmm0 + ADD4 %xmm13, %xmm3 + movlpd %xmm1, -11 * SIZE(Y1) + movsd -7 * SIZE(Y1), %xmm1 + + movlpd %xmm2, -10 * SIZE(Y1) + movsd -6 * SIZE(Y1), %xmm2 + movlpd %xmm3, -9 * SIZE(Y1) + movsd -5 * SIZE(Y1), %xmm3 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L15: + testq $2, M + je .L17 + + movsd -16 * SIZE(A1), %xmm8 + movsd -15 * SIZE(A1), %xmm9 + movsd -14 * SIZE(A1), %xmm10 + movsd -13 * SIZE(A1), %xmm11 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm12 + + movapd %xmm10, %xmm13 + mulsd %xmm4, %xmm10 + ADD1 %xmm8, %xmm0 + movsd -16 * SIZE(A2), %xmm8 + mulsd %xmm5, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm9, %xmm12 + mulsd %xmm5, %xmm9 + ADD1 %xmm10, %xmm2 + movsd -14 * SIZE(A2), %xmm10 + mulsd %xmm4, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + ADD3 %xmm9, %xmm0 + movsd -15 * SIZE(A2), %xmm9 + mulsd %xmm4, %xmm13 + ADD4 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm11, %xmm2 + movsd -13 * SIZE(A2), %xmm11 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm3 + + movapd %xmm10, %xmm13 + mulsd %xmm6, %xmm10 + ADD1 %xmm8, %xmm0 + mulsd %xmm7, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm9, %xmm12 + mulsd %xmm7, %xmm9 + ADD1 %xmm10, %xmm2 + mulsd %xmm6, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm7, %xmm11 + ADD3 %xmm9, %xmm0 + mulsd %xmm6, %xmm13 + ADD4 %xmm12, %xmm1 + + ADD3 %xmm11, %xmm2 + ADD4 %xmm13, %xmm3 + + movlpd %xmm0, -16 * SIZE(Y1) + movlpd %xmm1, -15 * SIZE(Y1) + movsd -12 * SIZE(Y1), %xmm0 + movsd -11 * SIZE(Y1), %xmm1 + + movlpd %xmm2, -14 * SIZE(Y1) + movlpd %xmm3, -13 * SIZE(Y1) + + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L17: + testq $1, M + je .L19 + + movsd -16 * SIZE(A1), %xmm8 + movsd -15 * SIZE(A1), %xmm9 + movsd -16 * SIZE(A2), %xmm10 + movsd -15 * SIZE(A2), %xmm11 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm12 + + movapd %xmm9, %xmm13 + mulsd %xmm5, %xmm9 + ADD1 %xmm8, %xmm0 + mulsd %xmm4, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm10, %xmm12 + mulsd %xmm6, %xmm10 + ADD3 %xmm9, %xmm0 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm11, %xmm13 + mulsd %xmm7, %xmm11 + ADD1 %xmm10, %xmm0 + mulsd %xmm6, %xmm13 + ADD2 %xmm12, %xmm1 + + ADD3 %xmm11, %xmm0 + ADD4 %xmm13, %xmm1 + + movlpd %xmm0, -16 * SIZE(Y1) + movlpd %xmm1, -15 * SIZE(Y1) + ALIGN_3 + +.L19: + decq J + jg .L11 + ALIGN_3 + +.L20: + testq $1, N + jle .L90 + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + + movapd %xmm4, %xmm8 + mulsd ALPHA_R, %xmm4 + mulsd ALPHA_I, %xmm8 + movapd %xmm5, %xmm9 + mulsd ALPHA_I, %xmm9 + mulsd ALPHA_R, %xmm5 + +#ifndef XCONJ + subsd %xmm9, %xmm4 + addsd %xmm8, %xmm5 +#else + addsd %xmm9, %xmm4 + subsd %xmm8, %xmm5 +#endif + + movsd -16 * SIZE(Y1), %xmm0 + movsd -15 * SIZE(Y1), %xmm1 + movsd -14 * SIZE(Y1), %xmm2 + movsd -13 * SIZE(Y1), %xmm3 + ALIGN_3 + + movq M, I + sarq $2, I + jle .L25 + + movsd -16 * SIZE(A1), %xmm8 + movsd -15 * SIZE(A1), %xmm9 + movsd -14 * SIZE(A1), %xmm10 + movsd -13 * SIZE(A1), %xmm11 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm12 + + movapd %xmm10, %xmm13 + mulsd %xmm4, %xmm10 + ADD1 %xmm8, %xmm0 + movsd -12 * SIZE(A1), %xmm8 + mulsd %xmm5, %xmm13 + ADD2 %xmm12, %xmm1 + + decq I + jle .L24 + ALIGN_3 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A2) +#endif + + movapd %xmm9, %xmm12 + mulsd %xmm5, %xmm9 + ADD1 %xmm10, %xmm2 + movsd -10 * SIZE(A1), %xmm10 + mulsd %xmm4, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + ADD3 %xmm9, %xmm0 + movsd -11 * SIZE(A1), %xmm9 + mulsd %xmm4, %xmm13 + ADD4 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + movlpd %xmm0, -16 * SIZE(Y1) + mulsd %xmm4, %xmm8 + movsd -12 * SIZE(Y1), %xmm0 + ADD3 %xmm11, %xmm2 + movsd -9 * SIZE(A1), %xmm11 + + mulsd %xmm5, %xmm12 + movlpd %xmm1, -15 * SIZE(Y1) + ADD4 %xmm13, %xmm3 + movsd -11 * SIZE(Y1), %xmm1 + + movapd %xmm10, %xmm13 + movlpd %xmm2, -14 * SIZE(Y1) + mulsd %xmm4, %xmm10 + movsd -10 * SIZE(Y1), %xmm2 + ADD1 %xmm8, %xmm0 + movsd -8 * SIZE(A1), %xmm8 + mulsd %xmm5, %xmm13 + movlpd %xmm3, -13 * SIZE(Y1) + ADD2 %xmm12, %xmm1 + movsd -9 * SIZE(Y1), %xmm3 + + movapd %xmm9, %xmm12 + mulsd %xmm5, %xmm9 + ADD1 %xmm10, %xmm2 + movsd -6 * SIZE(A1), %xmm10 + mulsd %xmm4, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + ADD3 %xmm9, %xmm0 + movsd -7 * SIZE(A1), %xmm9 + mulsd %xmm4, %xmm13 + subq $-8 * SIZE, A1 + ADD4 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + movlpd %xmm0, -12 * SIZE(Y1) + mulsd %xmm4, %xmm8 + movsd -8 * SIZE(Y1), %xmm0 + ADD3 %xmm11, %xmm2 + movsd -13 * SIZE(A1), %xmm11 + mulsd %xmm5, %xmm12 + movlpd %xmm1, -11 * SIZE(Y1) + ADD4 %xmm13, %xmm3 + movsd -7 * SIZE(Y1), %xmm1 + + movapd %xmm10, %xmm13 + movlpd %xmm2, -10 * SIZE(Y1) + mulsd %xmm4, %xmm10 + movsd -6 * SIZE(Y1), %xmm2 + ADD1 %xmm8, %xmm0 + movsd -12 * SIZE(A1), %xmm8 + mulsd %xmm5, %xmm13 + movlpd %xmm3, -9 * SIZE(Y1) + ADD2 %xmm12, %xmm1 + movsd -5 * SIZE(Y1), %xmm3 + + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L23 + ALIGN_3 + +.L24: + movapd %xmm9, %xmm12 + mulsd %xmm5, %xmm9 + ADD1 %xmm10, %xmm2 + movsd -10 * SIZE(A1), %xmm10 + mulsd %xmm4, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + ADD3 %xmm9, %xmm0 + movsd -11 * SIZE(A1), %xmm9 + mulsd %xmm4, %xmm13 + ADD4 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + movlpd %xmm0, -16 * SIZE(Y1) + mulsd %xmm4, %xmm8 + movsd -12 * SIZE(Y1), %xmm0 + ADD3 %xmm11, %xmm2 + movsd -9 * SIZE(A1), %xmm11 + + mulsd %xmm5, %xmm12 + movlpd %xmm1, -15 * SIZE(Y1) + ADD4 %xmm13, %xmm3 + movsd -11 * SIZE(Y1), %xmm1 + + movapd %xmm10, %xmm13 + mulsd %xmm4, %xmm10 + movlpd %xmm2, -14 * SIZE(Y1) + ADD1 %xmm8, %xmm0 + movsd -10 * SIZE(Y1), %xmm2 + mulsd %xmm5, %xmm13 + movlpd %xmm3, -13 * SIZE(Y1) + ADD2 %xmm12, %xmm1 + movsd -9 * SIZE(Y1), %xmm3 + + movapd %xmm9, %xmm12 + mulsd %xmm5, %xmm9 + ADD1 %xmm10, %xmm2 + mulsd %xmm4, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + ADD3 %xmm9, %xmm0 + mulsd %xmm4, %xmm13 + ADD4 %xmm12, %xmm1 + + ADD3 %xmm11, %xmm2 + movlpd %xmm0, -12 * SIZE(Y1) + movsd -8 * SIZE(Y1), %xmm0 + ADD4 %xmm13, %xmm3 + movlpd %xmm1, -11 * SIZE(Y1) + movsd -7 * SIZE(Y1), %xmm1 + + movlpd %xmm2, -10 * SIZE(Y1) + movlpd %xmm3, -9 * SIZE(Y1) + movsd -6 * SIZE(Y1), %xmm2 + movsd -5 * SIZE(Y1), %xmm3 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L25: + testq $2, M + je .L27 + + movsd -16 * SIZE(A1), %xmm8 + movsd -15 * SIZE(A1), %xmm9 + movsd -14 * SIZE(A1), %xmm10 + movsd -13 * SIZE(A1), %xmm11 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm12 + + movapd %xmm10, %xmm13 + mulsd %xmm4, %xmm10 + ADD1 %xmm8, %xmm0 + mulsd %xmm5, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm9, %xmm12 + mulsd %xmm5, %xmm9 + ADD1 %xmm10, %xmm2 + mulsd %xmm4, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + ADD3 %xmm9, %xmm0 + mulsd %xmm4, %xmm13 + ADD4 %xmm12, %xmm1 + + ADD3 %xmm11, %xmm2 + movlpd %xmm0, -16 * SIZE(Y1) + movsd -12 * SIZE(Y1), %xmm0 + ADD4 %xmm13, %xmm3 + movlpd %xmm1, -15 * SIZE(Y1) + movsd -11 * SIZE(Y1), %xmm1 + + movlpd %xmm2, -14 * SIZE(Y1) + movlpd %xmm3, -13 * SIZE(Y1) + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L27: + testq $1, M + je .L90 + + movsd -16 * SIZE(A1), %xmm8 + movsd -15 * SIZE(A1), %xmm9 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm12 + + movapd %xmm9, %xmm13 + mulsd %xmm5, %xmm9 + ADD1 %xmm8, %xmm0 + mulsd %xmm4, %xmm13 + ADD2 %xmm12, %xmm1 + + ADD3 %xmm9, %xmm0 + ADD4 %xmm13, %xmm1 + + movlpd %xmm0, -16 * SIZE(Y1) + movlpd %xmm1, -15 * SIZE(Y1) + ALIGN_3 + +.L90: + movq Y, Y1 + + movq M, %rax + sarq $3, %rax + jle .L94 + ALIGN_3 + +.L92: + movsd 0 * SIZE(Y), %xmm0 + movhpd 1 * SIZE(Y), %xmm0 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm2 + movhpd 1 * SIZE(Y), %xmm2 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm3 + movhpd 1 * SIZE(Y), %xmm3 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm5 + movhpd 1 * SIZE(Y), %xmm5 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm6 + movhpd 1 * SIZE(Y), %xmm6 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm7 + movhpd 1 * SIZE(Y), %xmm7 + addq INCY, Y + + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + addpd 4 * SIZE(BUFFER), %xmm2 + addpd 6 * SIZE(BUFFER), %xmm3 + addpd 8 * SIZE(BUFFER), %xmm4 + addpd 10 * SIZE(BUFFER), %xmm5 + addpd 12 * SIZE(BUFFER), %xmm6 + addpd 14 * SIZE(BUFFER), %xmm7 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm1, 0 * SIZE(Y1) + movhpd %xmm1, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm2, 0 * SIZE(Y1) + movhpd %xmm2, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm3, 0 * SIZE(Y1) + movhpd %xmm3, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm4, 0 * SIZE(Y1) + movhpd %xmm4, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm5, 0 * SIZE(Y1) + movhpd %xmm5, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm6, 0 * SIZE(Y1) + movhpd %xmm6, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm7, 0 * SIZE(Y1) + movhpd %xmm7, 1 * SIZE(Y1) + addq INCY, Y1 + + subq $-16 * SIZE, BUFFER + decq %rax + jg .L92 + ALIGN_3 + +.L94: + testq $7, M + jle .L999 + + testq $4, M + jle .L95 + + movsd 0 * SIZE(Y), %xmm0 + movhpd 1 * SIZE(Y), %xmm0 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm2 + movhpd 1 * SIZE(Y), %xmm2 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm3 + movhpd 1 * SIZE(Y), %xmm3 + addq INCY, Y + + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + addpd 4 * SIZE(BUFFER), %xmm2 + addpd 6 * SIZE(BUFFER), %xmm3 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm1, 0 * SIZE(Y1) + movhpd %xmm1, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm2, 0 * SIZE(Y1) + movhpd %xmm2, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm3, 0 * SIZE(Y1) + movhpd %xmm3, 1 * SIZE(Y1) + addq INCY, Y1 + + addq $8 * SIZE, BUFFER + ALIGN_3 + +.L95: + testq $2, M + jle .L96 + + movsd 0 * SIZE(Y), %xmm0 + movhpd 1 * SIZE(Y), %xmm0 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + addq INCY, Y + + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm1, 0 * SIZE(Y1) + movhpd %xmm1, 1 * SIZE(Y1) + addq INCY, Y1 + + addq $4 * SIZE, BUFFER + ALIGN_3 + +.L96: + testq $1, M + jle .L999 + + movsd 0 * SIZE(Y), %xmm0 + movhpd 1 * SIZE(Y), %xmm0 + + addpd 0 * SIZE(BUFFER), %xmm0 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemv_n_dup.S b/kernel/x86_64/zgemv_n_dup.S new file mode 100644 index 0000000..8a49fc9 --- /dev/null +++ b/kernel/x86_64/zgemv_n_dup.S @@ -0,0 +1,1500 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_INCX 8 + STACKSIZE(%rsp) +#define OLD_Y 16 + STACKSIZE(%rsp) +#define OLD_INCY 24 + STACKSIZE(%rsp) +#define OLD_BUFFER 32 + STACKSIZE(%rsp) +#define ALPHA_R 48 (%rsp) +#define ALPHA_I 56 (%rsp) + +#define M %rdi +#define N %rsi +#define A %rcx +#define LDA %r8 +#define X %r9 +#define INCX %rdx +#define Y %rbp +#define INCY %r10 + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_LDA 56 + STACKSIZE(%rsp) +#define OLD_X 64 + STACKSIZE(%rsp) +#define OLD_INCX 72 + STACKSIZE(%rsp) +#define OLD_Y 80 + STACKSIZE(%rsp) +#define OLD_INCY 88 + STACKSIZE(%rsp) +#define OLD_BUFFER 96 + STACKSIZE(%rsp) +#define ALPHA_R 224 (%rsp) +#define ALPHA_I 232 (%rsp) + +#define M %rcx +#define N %rdx +#define A %r8 +#define LDA %r9 +#define X %rdi +#define INCX %rsi +#define Y %rbp +#define INCY %r10 + +#endif + +#define I %rax +#define A1 %r12 +#define A2 %r13 + +#define Y1 %r14 +#define BUFFER %r15 + +#define J %r11 + +#undef SUBPD + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) +#define SUBPD subpd +#else +#define SUBPD addpd +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X + + movapd %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#endif + + movq OLD_INCX, INCX + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + salq $ZBASE_SHIFT, LDA + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + movlps %xmm0, ALPHA_R + movlps %xmm1, ALPHA_I + + subq $-16 * SIZE, A + + testq M, M + jle .L999 + testq N, N + jle .L999 + ALIGN_3 + + movq BUFFER, Y1 + + xorps %xmm4, %xmm4 + + movq M, %rax + addq $8, %rax + sarq $3, %rax + ALIGN_3 + +.L01: + movaps %xmm4, 0 * SIZE(Y1) + movaps %xmm4, 2 * SIZE(Y1) + movaps %xmm4, 4 * SIZE(Y1) + movaps %xmm4, 6 * SIZE(Y1) + movaps %xmm4, 8 * SIZE(Y1) + movaps %xmm4, 10 * SIZE(Y1) + movaps %xmm4, 12 * SIZE(Y1) + movaps %xmm4, 14 * SIZE(Y1) + + subq $-16 * SIZE, Y1 + decq %rax + jg .L01 + ALIGN_3 + +.L10: +#if GEMV_UNROLL >= 4 + + cmpq $4, N + jl .L20 + ALIGN_3 + +.L11: + subq $4, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + movddup 0 * SIZE(X), %xmm8 + movddup 1 * SIZE(X), %xmm9 + addq INCX, X + movddup 0 * SIZE(X), %xmm10 + movddup 1 * SIZE(X), %xmm11 + addq INCX, X + movddup 0 * SIZE(X), %xmm12 + movddup 1 * SIZE(X), %xmm13 + addq INCX, X + movddup 0 * SIZE(X), %xmm14 + movddup 1 * SIZE(X), %xmm15 + addq INCX, X + + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + shufps $0x40, %xmm5, %xmm5 + + movsd ALPHA_R, %xmm6 + movhps ALPHA_I, %xmm6 + + pshufd $0x4e, %xmm6, %xmm7 + +#ifndef XCONJ + xorps %xmm5, %xmm7 +#else + xorps %xmm5, %xmm6 +#endif + + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm6, %xmm10 + mulpd %xmm7, %xmm11 + + mulpd %xmm6, %xmm12 + mulpd %xmm7, %xmm13 + mulpd %xmm6, %xmm14 + mulpd %xmm7, %xmm15 + +#ifndef XCONJ + subpd %xmm9, %xmm8 + subpd %xmm11, %xmm10 + subpd %xmm13, %xmm12 + subpd %xmm15, %xmm14 +#else + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm13, %xmm12 + addpd %xmm15, %xmm14 +#endif + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm10, %xmm11 + pshufd $0x4e, %xmm12, %xmm13 + pshufd $0x4e, %xmm14, %xmm15 + +#ifndef XCONJ + xorps %xmm5, %xmm9 + xorps %xmm5, %xmm11 + xorps %xmm5, %xmm13 + xorps %xmm5, %xmm15 +#else + xorps %xmm5, %xmm8 + xorps %xmm5, %xmm10 + xorps %xmm5, %xmm12 + xorps %xmm5, %xmm14 +#endif + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + ALIGN_3 + + movq M, I + sarq $2, I + jle .L15 + + movddup -16 * SIZE(A1), %xmm4 + movddup -14 * SIZE(A1), %xmm5 + movddup -12 * SIZE(A1), %xmm6 + movddup -10 * SIZE(A1), %xmm7 + + decq I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + movddup -15 * SIZE(A1), %xmm4 + mulpd %xmm8, %xmm5 + addpd %xmm5, %xmm1 + movddup -13 * SIZE(A1), %xmm5 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm2 + movddup -11 * SIZE(A1), %xmm6 + mulpd %xmm8, %xmm7 + addpd %xmm7, %xmm3 + movddup -9 * SIZE(A1), %xmm7 + + mulpd %xmm9, %xmm4 + SUBPD %xmm4, %xmm0 + movddup -16 * SIZE(A1, LDA), %xmm4 + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm1 + movddup -14 * SIZE(A1, LDA), %xmm5 + mulpd %xmm9, %xmm6 + SUBPD %xmm6, %xmm2 + movddup -12 * SIZE(A1, LDA), %xmm6 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm3 + movddup -10 * SIZE(A1, LDA), %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) +#endif + + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + movddup -15 * SIZE(A1, LDA), %xmm4 + mulpd %xmm10, %xmm5 + addpd %xmm5, %xmm1 + movddup -13 * SIZE(A1, LDA), %xmm5 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm2 + movddup -11 * SIZE(A1, LDA), %xmm6 + mulpd %xmm10, %xmm7 + addpd %xmm7, %xmm3 + movddup -9 * SIZE(A1, LDA), %xmm7 + + mulpd %xmm11, %xmm4 + SUBPD %xmm4, %xmm0 + movddup -16 * SIZE(A2), %xmm4 + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm1 + movddup -14 * SIZE(A2), %xmm5 + mulpd %xmm11, %xmm6 + SUBPD %xmm6, %xmm2 + movddup -12 * SIZE(A2), %xmm6 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm3 + movddup -10 * SIZE(A2), %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) +#endif + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + movddup -15 * SIZE(A2), %xmm4 + mulpd %xmm12, %xmm5 + addpd %xmm5, %xmm1 + movddup -13 * SIZE(A2), %xmm5 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm2 + movddup -11 * SIZE(A2), %xmm6 + mulpd %xmm12, %xmm7 + addpd %xmm7, %xmm3 + movddup -9 * SIZE(A2), %xmm7 + + mulpd %xmm13, %xmm4 + SUBPD %xmm4, %xmm0 + movddup -16 * SIZE(A2, LDA), %xmm4 + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm1 + movddup -14 * SIZE(A2, LDA), %xmm5 + mulpd %xmm13, %xmm6 + SUBPD %xmm6, %xmm2 + movddup -12 * SIZE(A2, LDA), %xmm6 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm3 + movddup -10 * SIZE(A2, LDA), %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) +#endif + + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + movddup -15 * SIZE(A2, LDA), %xmm4 + mulpd %xmm14, %xmm5 + addpd %xmm5, %xmm1 + movddup -13 * SIZE(A2, LDA), %xmm5 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm2 + movddup -11 * SIZE(A2, LDA), %xmm6 + mulpd %xmm14, %xmm7 + addpd %xmm7, %xmm3 + movddup -9 * SIZE(A2, LDA), %xmm7 + + mulpd %xmm15, %xmm4 + SUBPD %xmm4, %xmm0 + movddup -8 * SIZE(A1), %xmm4 + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm1 + movddup -6 * SIZE(A1), %xmm5 + mulpd %xmm15, %xmm6 + SUBPD %xmm6, %xmm2 + movddup -4 * SIZE(A1), %xmm6 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm3 + movddup -2 * SIZE(A1), %xmm7 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + movddup -15 * SIZE(A1), %xmm4 + mulpd %xmm8, %xmm5 + addpd %xmm5, %xmm1 + movddup -13 * SIZE(A1), %xmm5 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm2 + movddup -11 * SIZE(A1), %xmm6 + mulpd %xmm8, %xmm7 + addpd %xmm7, %xmm3 + movddup -9 * SIZE(A1), %xmm7 + + mulpd %xmm9, %xmm4 + SUBPD %xmm4, %xmm0 + movddup -16 * SIZE(A1, LDA), %xmm4 + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm1 + movddup -14 * SIZE(A1, LDA), %xmm5 + mulpd %xmm9, %xmm6 + SUBPD %xmm6, %xmm2 + movddup -12 * SIZE(A1, LDA), %xmm6 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm3 + movddup -10 * SIZE(A1, LDA), %xmm7 + + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + movddup -15 * SIZE(A1, LDA), %xmm4 + mulpd %xmm10, %xmm5 + addpd %xmm5, %xmm1 + movddup -13 * SIZE(A1, LDA), %xmm5 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm2 + movddup -11 * SIZE(A1, LDA), %xmm6 + mulpd %xmm10, %xmm7 + addpd %xmm7, %xmm3 + movddup -9 * SIZE(A1, LDA), %xmm7 + + mulpd %xmm11, %xmm4 + SUBPD %xmm4, %xmm0 + movddup -16 * SIZE(A2), %xmm4 + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm1 + movddup -14 * SIZE(A2), %xmm5 + mulpd %xmm11, %xmm6 + SUBPD %xmm6, %xmm2 + movddup -12 * SIZE(A2), %xmm6 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm3 + movddup -10 * SIZE(A2), %xmm7 + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + movddup -15 * SIZE(A2), %xmm4 + mulpd %xmm12, %xmm5 + addpd %xmm5, %xmm1 + movddup -13 * SIZE(A2), %xmm5 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm2 + movddup -11 * SIZE(A2), %xmm6 + mulpd %xmm12, %xmm7 + addpd %xmm7, %xmm3 + movddup -9 * SIZE(A2), %xmm7 + + mulpd %xmm13, %xmm4 + SUBPD %xmm4, %xmm0 + movddup -16 * SIZE(A2, LDA), %xmm4 + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm1 + movddup -14 * SIZE(A2, LDA), %xmm5 + mulpd %xmm13, %xmm6 + SUBPD %xmm6, %xmm2 + movddup -12 * SIZE(A2, LDA), %xmm6 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm3 + movddup -10 * SIZE(A2, LDA), %xmm7 + + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + movddup -15 * SIZE(A2, LDA), %xmm4 + mulpd %xmm14, %xmm5 + addpd %xmm5, %xmm1 + movddup -13 * SIZE(A2, LDA), %xmm5 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm2 + movddup -11 * SIZE(A2, LDA), %xmm6 + mulpd %xmm14, %xmm7 + addpd %xmm7, %xmm3 + movddup -9 * SIZE(A2, LDA), %xmm7 + + mulpd %xmm15, %xmm4 + SUBPD %xmm4, %xmm0 + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm1 + mulpd %xmm15, %xmm6 + SUBPD %xmm6, %xmm2 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm3 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L15: + testq $2, M + je .L17 + + movddup -16 * SIZE(A1), %xmm4 + movddup -15 * SIZE(A1), %xmm5 + movddup -14 * SIZE(A1), %xmm6 + movddup -13 * SIZE(A1), %xmm7 + + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + movddup -16 * SIZE(A1, LDA, 1), %xmm4 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm1 + movddup -14 * SIZE(A1, LDA, 1), %xmm6 + + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm0 + movddup -15 * SIZE(A1, LDA, 1), %xmm5 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm1 + movddup -13 * SIZE(A1, LDA, 1), %xmm7 + + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + movddup -16 * SIZE(A2), %xmm4 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm1 + movddup -14 * SIZE(A2), %xmm6 + + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm0 + movddup -15 * SIZE(A2), %xmm5 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm1 + movddup -13 * SIZE(A2), %xmm7 + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + movddup -16 * SIZE(A2, LDA, 1), %xmm4 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + movddup -14 * SIZE(A2, LDA, 1), %xmm6 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + movddup -15 * SIZE(A2, LDA, 1), %xmm5 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + movddup -13 * SIZE(A2, LDA, 1), %xmm7 + + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm1 + + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm1 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + movaps %xmm2, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L17: + testq $1, M + je .L19 + + movddup -16 * SIZE(A1), %xmm4 + movddup -15 * SIZE(A1), %xmm5 + movddup -16 * SIZE(A1, LDA, 1), %xmm6 + movddup -15 * SIZE(A1, LDA, 1), %xmm7 + + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + movddup -16 * SIZE(A2), %xmm4 + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm0 + movddup -15 * SIZE(A2), %xmm5 + + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm0 + movddup -16 * SIZE(A2, LDA, 1), %xmm6 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm0 + movddup -15 * SIZE(A2, LDA, 1), %xmm7 + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm0 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + ALIGN_3 + +.L19: + cmpq $4, N + jge .L11 + ALIGN_3 + +.L20: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L30 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L21: +#endif + + subq $2, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + movddup 0 * SIZE(X), %xmm8 + movddup 1 * SIZE(X), %xmm9 + addq INCX, X + movddup 0 * SIZE(X), %xmm10 + movddup 1 * SIZE(X), %xmm11 + addq INCX, X + + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + shufps $0x40, %xmm5, %xmm5 + + movsd ALPHA_R, %xmm6 + movhps ALPHA_I, %xmm6 + + pshufd $0x4e, %xmm6, %xmm7 + +#ifndef XCONJ + xorps %xmm5, %xmm7 +#else + xorps %xmm5, %xmm6 +#endif + + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm6, %xmm10 + mulpd %xmm7, %xmm11 + +#ifndef XCONJ + subpd %xmm9, %xmm8 + subpd %xmm11, %xmm10 +#else + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 +#endif + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm10, %xmm11 + +#ifndef XCONJ + xorps %xmm5, %xmm9 + xorps %xmm5, %xmm11 +#else + xorps %xmm5, %xmm8 + xorps %xmm5, %xmm10 +#endif + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + + movq M, I + sarq $2, I + jle .L25 + + movddup -16 * SIZE(A1), %xmm4 + movddup -14 * SIZE(A1), %xmm5 + movddup -12 * SIZE(A1), %xmm6 + movddup -10 * SIZE(A1), %xmm7 + + decq I + jle .L24 + ALIGN_3 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + movddup -15 * SIZE(A1), %xmm4 + mulpd %xmm8, %xmm5 + addpd %xmm5, %xmm1 + movddup -13 * SIZE(A1), %xmm5 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm2 + movddup -11 * SIZE(A1), %xmm6 + mulpd %xmm8, %xmm7 + addpd %xmm7, %xmm3 + movddup -9 * SIZE(A1), %xmm7 + + mulpd %xmm9, %xmm4 + SUBPD %xmm4, %xmm0 + movddup -16 * SIZE(A2), %xmm4 + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm1 + movddup -14 * SIZE(A2), %xmm5 + mulpd %xmm9, %xmm6 + SUBPD %xmm6, %xmm2 + movddup -12 * SIZE(A2), %xmm6 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm3 + movddup -10 * SIZE(A2), %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + movddup -15 * SIZE(A2), %xmm4 + mulpd %xmm10, %xmm5 + addpd %xmm5, %xmm1 + movddup -13 * SIZE(A2), %xmm5 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm2 + movddup -11 * SIZE(A2), %xmm6 + mulpd %xmm10, %xmm7 + addpd %xmm7, %xmm3 + movddup -9 * SIZE(A2), %xmm7 + + mulpd %xmm11, %xmm4 + SUBPD %xmm4, %xmm0 + movddup -8 * SIZE(A1), %xmm4 + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm1 + movddup -6 * SIZE(A1), %xmm5 + mulpd %xmm11, %xmm6 + SUBPD %xmm6, %xmm2 + movddup -4 * SIZE(A1), %xmm6 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm3 + movddup -2 * SIZE(A1), %xmm7 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L23 + ALIGN_3 + +.L24: + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + movddup -15 * SIZE(A1), %xmm4 + mulpd %xmm8, %xmm5 + addpd %xmm5, %xmm1 + movddup -13 * SIZE(A1), %xmm5 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm2 + movddup -11 * SIZE(A1), %xmm6 + mulpd %xmm8, %xmm7 + addpd %xmm7, %xmm3 + movddup -9 * SIZE(A1), %xmm7 + + mulpd %xmm9, %xmm4 + SUBPD %xmm4, %xmm0 + movddup -16 * SIZE(A2), %xmm4 + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm1 + movddup -14 * SIZE(A2), %xmm5 + mulpd %xmm9, %xmm6 + SUBPD %xmm6, %xmm2 + movddup -12 * SIZE(A2), %xmm6 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm3 + movddup -10 * SIZE(A2), %xmm7 + + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + movddup -15 * SIZE(A2), %xmm4 + mulpd %xmm10, %xmm5 + addpd %xmm5, %xmm1 + movddup -13 * SIZE(A2), %xmm5 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm2 + movddup -11 * SIZE(A2), %xmm6 + mulpd %xmm10, %xmm7 + addpd %xmm7, %xmm3 + movddup -9 * SIZE(A2), %xmm7 + + mulpd %xmm11, %xmm4 + SUBPD %xmm4, %xmm0 + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm1 + mulpd %xmm11, %xmm6 + SUBPD %xmm6, %xmm2 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm3 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L25: + testq $2, M + je .L27 + + movddup -16 * SIZE(A1), %xmm4 + movddup -15 * SIZE(A1), %xmm5 + movddup -14 * SIZE(A1), %xmm6 + movddup -13 * SIZE(A1), %xmm7 + + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + movddup -16 * SIZE(A2), %xmm4 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm1 + movddup -14 * SIZE(A2), %xmm6 + + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm0 + movddup -15 * SIZE(A2), %xmm5 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm1 + movddup -13 * SIZE(A2), %xmm7 + + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm1 + + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm1 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + movaps %xmm2, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L27: + testq $1, M +#if GEMV_UNROLL == 2 + je .L29 +#else + je .L30 +#endif + + movddup -16 * SIZE(A1), %xmm4 + movddup -15 * SIZE(A1), %xmm5 + movddup -16 * SIZE(A2), %xmm6 + movddup -15 * SIZE(A2), %xmm7 + + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm0 + + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm0 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L29: + cmpq $2, N + jge .L21 +#endif + ALIGN_3 + +.L30: +#endif + + cmpq $1, N + jl .L980 + +#if GEMV_UNROLL == 1 +.L31: + decq N +#endif + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 +#if GEMV_UNROLL == 1 + addq LDA, A +#endif + + movddup 0 * SIZE(X), %xmm8 + movddup 1 * SIZE(X), %xmm9 + addq INCX, X + + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + shufps $0x40, %xmm5, %xmm5 + + movsd ALPHA_R, %xmm6 + movhps ALPHA_I, %xmm6 + + pshufd $0x4e, %xmm6, %xmm7 + +#ifndef XCONJ + xorps %xmm5, %xmm7 +#else + xorps %xmm5, %xmm6 +#endif + + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + +#ifndef XCONJ + subpd %xmm9, %xmm8 +#else + addpd %xmm9, %xmm8 +#endif + + pshufd $0x4e, %xmm8, %xmm9 + +#ifndef XCONJ + xorps %xmm5, %xmm9 +#else + xorps %xmm5, %xmm8 +#endif + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + + movq M, I + sarq $2, I + jle .L35 + + movddup -16 * SIZE(A1), %xmm4 + movddup -14 * SIZE(A1), %xmm5 + movddup -12 * SIZE(A1), %xmm6 + movddup -10 * SIZE(A1), %xmm7 + + decq I + jle .L34 + ALIGN_3 + +.L33: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + movddup -15 * SIZE(A1), %xmm4 + mulpd %xmm8, %xmm5 + addpd %xmm5, %xmm1 + movddup -13 * SIZE(A1), %xmm5 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm2 + movddup -11 * SIZE(A1), %xmm6 + mulpd %xmm8, %xmm7 + addpd %xmm7, %xmm3 + movddup -9 * SIZE(A1), %xmm7 + + mulpd %xmm9, %xmm4 + SUBPD %xmm4, %xmm0 + movddup -8 * SIZE(A1), %xmm4 + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm1 + movddup -6 * SIZE(A1), %xmm5 + mulpd %xmm9, %xmm6 + SUBPD %xmm6, %xmm2 + movddup -4 * SIZE(A1), %xmm6 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm3 + movddup -2 * SIZE(A1), %xmm7 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L33 + ALIGN_3 + +.L34: + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + movddup -15 * SIZE(A1), %xmm4 + mulpd %xmm8, %xmm5 + addpd %xmm5, %xmm1 + movddup -13 * SIZE(A1), %xmm5 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm2 + movddup -11 * SIZE(A1), %xmm6 + mulpd %xmm8, %xmm7 + addpd %xmm7, %xmm3 + movddup -9 * SIZE(A1), %xmm7 + + mulpd %xmm9, %xmm4 + SUBPD %xmm4, %xmm0 + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm1 + mulpd %xmm9, %xmm6 + SUBPD %xmm6, %xmm2 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm3 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L35: + testq $2, M + je .L37 + + movddup -16 * SIZE(A1), %xmm4 + movddup -15 * SIZE(A1), %xmm5 + movddup -14 * SIZE(A1), %xmm6 + movddup -13 * SIZE(A1), %xmm7 + + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm1 + + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm1 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + movaps %xmm2, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L37: + testq $1, M +#if GEMV_UNROLL == 1 + je .L39 +#else + je .L980 +#endif + + movddup -16 * SIZE(A1), %xmm4 + movddup -15 * SIZE(A1), %xmm5 + + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm0 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + +#if GEMV_UNROLL == 1 + ALIGN_3 +.L39: + cmpq $1, N + jge .L31 +#endif + +.L980: + testq $SIZE, Y + jne .L990 + + movq Y, Y1 + + movq M, %rax + sarq $3, %rax + jle .L184 + ALIGN_3 + +.L182: + movaps (Y), %xmm0 + addq INCY, Y + movaps (Y), %xmm1 + addq INCY, Y + movaps (Y), %xmm2 + addq INCY, Y + movaps (Y), %xmm3 + addq INCY, Y + movaps (Y), %xmm4 + addq INCY, Y + movaps (Y), %xmm5 + addq INCY, Y + movaps (Y), %xmm6 + addq INCY, Y + movaps (Y), %xmm7 + addq INCY, Y + + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + addpd 4 * SIZE(BUFFER), %xmm2 + addpd 6 * SIZE(BUFFER), %xmm3 + addpd 8 * SIZE(BUFFER), %xmm4 + addpd 10 * SIZE(BUFFER), %xmm5 + addpd 12 * SIZE(BUFFER), %xmm6 + addpd 14 * SIZE(BUFFER), %xmm7 + + movaps %xmm0, (Y1) + addq INCY, Y1 + movaps %xmm1, (Y1) + addq INCY, Y1 + movaps %xmm2, (Y1) + addq INCY, Y1 + movaps %xmm3, (Y1) + addq INCY, Y1 + movaps %xmm4, (Y1) + addq INCY, Y1 + movaps %xmm5, (Y1) + addq INCY, Y1 + movaps %xmm6, (Y1) + addq INCY, Y1 + movaps %xmm7, (Y1) + addq INCY, Y1 + + subq $-16 * SIZE, BUFFER + decq %rax + jg .L182 + ALIGN_3 + +.L184: + testq $7, M + jle .L999 + + testq $4, M + jle .L185 + + movaps (Y), %xmm0 + addq INCY, Y + movaps (Y), %xmm1 + addq INCY, Y + movaps (Y), %xmm2 + addq INCY, Y + movaps (Y), %xmm3 + addq INCY, Y + + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + addpd 4 * SIZE(BUFFER), %xmm2 + addpd 6 * SIZE(BUFFER), %xmm3 + + movaps %xmm0, (Y1) + addq INCY, Y1 + movaps %xmm1, (Y1) + addq INCY, Y1 + movaps %xmm2, (Y1) + addq INCY, Y1 + movaps %xmm3, (Y1) + addq INCY, Y1 + + addq $8 * SIZE, BUFFER + ALIGN_3 + +.L185: + testq $2, M + jle .L186 + + movaps (Y), %xmm0 + addq INCY, Y + movaps (Y), %xmm1 + addq INCY, Y + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + + movaps %xmm0, (Y1) + addq INCY, Y1 + movaps %xmm1, (Y1) + addq INCY, Y1 + + addq $4 * SIZE, BUFFER + ALIGN_3 + +.L186: + testq $1, M + jle .L999 + + movaps (Y), %xmm0 + + addpd (BUFFER), %xmm0 + + movaps %xmm0, (Y1) + jmp .L999 + ALIGN_3 + +.L990: + movq Y, Y1 + + movq M, %rax + sarq $3, %rax + jle .L994 + ALIGN_3 + +.L992: + movsd 0 * SIZE(Y), %xmm0 + movhpd 1 * SIZE(Y), %xmm0 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm2 + movhpd 1 * SIZE(Y), %xmm2 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm3 + movhpd 1 * SIZE(Y), %xmm3 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm5 + movhpd 1 * SIZE(Y), %xmm5 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm6 + movhpd 1 * SIZE(Y), %xmm6 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm7 + movhpd 1 * SIZE(Y), %xmm7 + addq INCY, Y + + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + addpd 4 * SIZE(BUFFER), %xmm2 + addpd 6 * SIZE(BUFFER), %xmm3 + addpd 8 * SIZE(BUFFER), %xmm4 + addpd 10 * SIZE(BUFFER), %xmm5 + addpd 12 * SIZE(BUFFER), %xmm6 + addpd 14 * SIZE(BUFFER), %xmm7 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm1, 0 * SIZE(Y1) + movhpd %xmm1, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm2, 0 * SIZE(Y1) + movhpd %xmm2, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm3, 0 * SIZE(Y1) + movhpd %xmm3, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm4, 0 * SIZE(Y1) + movhpd %xmm4, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm5, 0 * SIZE(Y1) + movhpd %xmm5, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm6, 0 * SIZE(Y1) + movhpd %xmm6, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm7, 0 * SIZE(Y1) + movhpd %xmm7, 1 * SIZE(Y1) + addq INCY, Y1 + + subq $-16 * SIZE, BUFFER + decq %rax + jg .L992 + ALIGN_3 + +.L994: + testq $7, M + jle .L999 + + testq $4, M + jle .L995 + + movsd 0 * SIZE(Y), %xmm0 + movhpd 1 * SIZE(Y), %xmm0 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm2 + movhpd 1 * SIZE(Y), %xmm2 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm3 + movhpd 1 * SIZE(Y), %xmm3 + addq INCY, Y + + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + addpd 4 * SIZE(BUFFER), %xmm2 + addpd 6 * SIZE(BUFFER), %xmm3 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm1, 0 * SIZE(Y1) + movhpd %xmm1, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm2, 0 * SIZE(Y1) + movhpd %xmm2, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm3, 0 * SIZE(Y1) + movhpd %xmm3, 1 * SIZE(Y1) + addq INCY, Y1 + + addq $8 * SIZE, BUFFER + ALIGN_3 + +.L995: + testq $2, M + jle .L996 + + movsd 0 * SIZE(Y), %xmm0 + movhpd 1 * SIZE(Y), %xmm0 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + addq INCY, Y + + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm1, 0 * SIZE(Y1) + movhpd %xmm1, 1 * SIZE(Y1) + addq INCY, Y1 + + addq $4 * SIZE, BUFFER + ALIGN_3 + +.L996: + testq $1, M + jle .L999 + + movsd 0 * SIZE(Y), %xmm0 + movhpd 1 * SIZE(Y), %xmm0 + + addpd 0 * SIZE(BUFFER), %xmm0 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemv_t.S b/kernel/x86_64/zgemv_t.S new file mode 100644 index 0000000..d7f9d49 --- /dev/null +++ b/kernel/x86_64/zgemv_t.S @@ -0,0 +1,2433 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_INCX 8 + STACKSIZE(%rsp) +#define OLD_Y 16 + STACKSIZE(%rsp) +#define OLD_INCY 24 + STACKSIZE(%rsp) +#define OLD_BUFFER 32 + STACKSIZE(%rsp) + +#define M %rdi +#define N %rsi +#define A %rcx +#define LDA %r8 +#define X %r9 +#define INCX %rdx +#define Y %rbp +#define INCY %r10 + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_LDA 56 + STACKSIZE(%rsp) +#define OLD_X 64 + STACKSIZE(%rsp) +#define OLD_INCX 72 + STACKSIZE(%rsp) +#define OLD_Y 80 + STACKSIZE(%rsp) +#define OLD_INCY 88 + STACKSIZE(%rsp) +#define OLD_BUFFER 96 + STACKSIZE(%rsp) + +#define M %rcx +#define N %rdx +#define A %r8 +#define LDA %r9 +#define X %rdi +#define INCX %rsi +#define Y %rbp +#define INCY %r10 + +#endif + +#define I %rax +#define J %rbx +#define A1 %r11 +#define A2 %r12 + +#define X1 %r13 +#define Y1 %r14 +#define BUFFER %r15 + +#define ALPHA_R %xmm14 +#define ALPHA_I %xmm15 + +#undef SUBPD + +#ifndef CONJ +#define SUBPD addpd +#else +#define SUBPD subpd +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X + + movaps %xmm3, %xmm0 + movss OLD_ALPHA_I, %xmm1 +#endif + + movq OLD_INCX, INCX + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + salq $ZBASE_SHIFT, LDA + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + +#ifdef HAVE_SSE3 + movddup %xmm0, ALPHA_R + movddup %xmm1, ALPHA_I +#else + pshufd $0x44, %xmm0, ALPHA_R + pshufd $0x44, %xmm1, ALPHA_I +#endif + + subq $-16 * SIZE, A + + testq M, M + jle .L999 + testq N, N + jle .L999 + ALIGN_3 + + movq BUFFER, X1 + + movq Y, Y1 + + movq M, I + sarq $2, I + jle .L05 + ALIGN_4 + +.L02: + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + addq INCX, X + + movsd 0 * SIZE(X), %xmm1 + movhpd 1 * SIZE(X), %xmm1 + addq INCX, X + + movsd 0 * SIZE(X), %xmm2 + movhpd 1 * SIZE(X), %xmm2 + addq INCX, X + + movsd 0 * SIZE(X), %xmm3 + movhpd 1 * SIZE(X), %xmm3 + addq INCX, X + + movapd %xmm0, 0 * SIZE(X1) + movapd %xmm1, 2 * SIZE(X1) + movapd %xmm2, 4 * SIZE(X1) + movapd %xmm3, 6 * SIZE(X1) + + addq $8 * SIZE, X1 + decq I + jg .L02 + ALIGN_4 + +.L05: + movq M, I + andq $3, I + jle .L10 + ALIGN_2 + +.L06: + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + addq INCX, X + movapd %xmm0, 0 * SIZE(X1) + addq $2 * SIZE, X1 + decq I + jg .L06 + ALIGN_4 + +.L10: +#ifdef ALIGNED_ACCESS + testq $SIZE, A + jne .L100 +#endif + +#if GEMV_UNROLL >= 4 + + cmpq $4, N + jl .L20 + ALIGN_3 + +.L11: + subq $4, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + xorpd %xmm0, %xmm0 + xorpd %xmm1, %xmm1 + xorpd %xmm2, %xmm2 + xorpd %xmm3, %xmm3 + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + xorpd %xmm4, %xmm4 + xorpd %xmm5, %xmm5 + xorpd %xmm6, %xmm6 + xorpd %xmm7, %xmm7 + +#ifdef PREFETCHW + PREFETCHW 3 * SIZE(Y1) +#endif + + movq M, I + sarq $2, I + jle .L15 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm10) + + decq I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm8) + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm10) + mulpd %xmm12, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-14 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm10) + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + SUBPD %xmm11, %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-14 * SIZE, A2, %xmm8) + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm10) + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-12 * SIZE, A1, %xmm8) + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm10) + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + SUBPD %xmm11, %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-12 * SIZE, A2, %xmm8) + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm10) + mulpd %xmm12, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-10 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm10) + mulpd %xmm12, %xmm11 + MOVUPS_XL1( -8 * SIZE, X1, %xmm12) + SUBPD %xmm11, %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-10 * SIZE, A2, %xmm8) + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm10) + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(X1) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1( -8 * SIZE, A1, %xmm8) + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2( -8 * SIZE, A1, LDA, 1, %xmm10) + mulpd %xmm13, %xmm11 + MOVUPS_XL1( -6 * SIZE, X1, %xmm13) + SUBPD %xmm11, %xmm7 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, X1 + + subq $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm8) + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm10) + mulpd %xmm12, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-14 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm10) + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + SUBPD %xmm11, %xmm7 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-14 * SIZE, A2, %xmm8) + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm10) + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-12 * SIZE, A1, %xmm8) + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm10) + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + SUBPD %xmm11, %xmm7 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-12 * SIZE, A2, %xmm8) + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm10) + mulpd %xmm12, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-10 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm10) + mulpd %xmm12, %xmm11 + MOVUPS_XL1( -8 * SIZE, X1, %xmm12) + SUBPD %xmm11, %xmm7 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-10 * SIZE, A2, %xmm8) + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm10) + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + mulpd %xmm13, %xmm11 + MOVUPS_XL1( -6 * SIZE, X1, %xmm13) + SUBPD %xmm11, %xmm7 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, X1 + ALIGN_3 + +.L15: + testq $2, M + je .L17 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm10) + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm8) + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm10) + mulpd %xmm12, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-14 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm10) + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + SUBPD %xmm11, %xmm7 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-14 * SIZE, A2, %xmm8) + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm10) + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm7 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L17: + testq $1, M + je .L19 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm10) + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm8) + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm10) + mulpd %xmm12, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + mulpd %xmm12, %xmm11 + SUBPD %xmm11, %xmm7 + ALIGN_3 + +.L19: + pcmpeqb %xmm13, %xmm13 + psllq $63, %xmm13 + shufps $0xc0, %xmm13, %xmm13 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorpd %xmm13, %xmm0 + xorpd %xmm13, %xmm2 + xorpd %xmm13, %xmm4 + xorpd %xmm13, %xmm6 +#else + xorpd %xmm13, %xmm1 + xorpd %xmm13, %xmm3 + xorpd %xmm13, %xmm5 + xorpd %xmm13, %xmm7 +#endif + +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 + haddpd %xmm3, %xmm2 + + haddpd %xmm5, %xmm4 + haddpd %xmm7, %xmm6 +#else + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm9 + + movapd %xmm4, %xmm10 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm10 + + movapd %xmm6, %xmm11 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm11 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 + addpd %xmm10, %xmm4 + addpd %xmm11, %xmm6 +#endif + + pshufd $0x4e, %xmm0, %xmm1 + pshufd $0x4e, %xmm2, %xmm3 + pshufd $0x4e, %xmm4, %xmm5 + pshufd $0x4e, %xmm6, %xmm7 + + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm1 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm3 + + mulpd ALPHA_R, %xmm4 + mulpd ALPHA_I, %xmm5 + mulpd ALPHA_R, %xmm6 + mulpd ALPHA_I, %xmm7 + + xorpd %xmm13, %xmm1 + xorpd %xmm13, %xmm3 + xorpd %xmm13, %xmm5 + xorpd %xmm13, %xmm7 + + subpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm3 + movhpd 1 * SIZE(Y), %xmm3 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm5 + movhpd 1 * SIZE(Y), %xmm5 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm7 + movhpd 1 * SIZE(Y), %xmm7 + addq INCY, Y + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + movlpd %xmm2, 0 * SIZE(Y1) + movhpd %xmm2, 1 * SIZE(Y1) + addq INCY, Y1 + movlpd %xmm4, 0 * SIZE(Y1) + movhpd %xmm4, 1 * SIZE(Y1) + addq INCY, Y1 + movlpd %xmm6, 0 * SIZE(Y1) + movhpd %xmm6, 1 * SIZE(Y1) + addq INCY, Y1 + + cmpq $4, N + jge .L11 + ALIGN_3 + +.L20: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L30 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L21: +#endif + + subq $2, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + xorpd %xmm0, %xmm0 + xorpd %xmm1, %xmm1 + xorpd %xmm2, %xmm2 + xorpd %xmm3, %xmm3 + + MOVUPS_XL1(-16 * SIZE, X1, %xmm4) + MOVUPS_XL1(-14 * SIZE, X1, %xmm5) + +#ifdef PREFETCHW + PREFETCHW 3 * SIZE(Y1) +#endif + + movq M, I + sarq $2, I + jle .L25 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A1(-16 * SIZE, A2, %xmm10) + MOVUPS_A1(-14 * SIZE, A1, %xmm12) + MOVUPS_A1(-14 * SIZE, A2, %xmm6) + + decq I + jle .L24 + ALIGN_3 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm8) + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm4, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A1(-12 * SIZE, A2, %xmm10) + mulpd %xmm4, %xmm11 + SUBPD %xmm11, %xmm3 + + MOVUPS_XL1(-12 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + MOVUPS_A1(-10 * SIZE, A1, %xmm12) + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm5, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1(-10 * SIZE, A2, %xmm6) + mulpd %xmm5, %xmm7 + SUBPD %xmm7, %xmm3 + + MOVUPS_XL1(-10 * SIZE, X1, %xmm5) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1( -8 * SIZE, A1, %xmm8) + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm4, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A1( -8 * SIZE, A2, %xmm10) + mulpd %xmm4, %xmm11 + SUBPD %xmm11, %xmm3 + + MOVUPS_XL1( -8 * SIZE, X1, %xmm4) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) +#endif + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + MOVUPS_A1( -6 * SIZE, A1, %xmm12) + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm5, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1( -6 * SIZE, A2, %xmm6) + mulpd %xmm5, %xmm7 + SUBPD %xmm7, %xmm3 + + MOVUPS_XL1( -6 * SIZE, X1, %xmm5) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, X1 + + subq $1, I + BRANCH + jg .L23 + ALIGN_3 + +.L24: + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm8) + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm4, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A1(-12 * SIZE, A2, %xmm10) + mulpd %xmm4, %xmm11 + SUBPD %xmm11, %xmm3 + + MOVUPS_XL1(-12 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + MOVUPS_A1(-10 * SIZE, A1, %xmm12) + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm5, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1(-10 * SIZE, A2, %xmm6) + mulpd %xmm5, %xmm7 + SUBPD %xmm7, %xmm3 + + MOVUPS_XL1(-10 * SIZE, X1, %xmm5) + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm4, %xmm10 + addpd %xmm10, %xmm2 + mulpd %xmm4, %xmm11 + SUBPD %xmm11, %xmm3 + + MOVUPS_XL1( -8 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm5, %xmm6 + addpd %xmm6, %xmm2 + mulpd %xmm5, %xmm7 + SUBPD %xmm7, %xmm3 + + MOVUPS_XL1( -6 * SIZE, X1, %xmm5) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, X1 + ALIGN_3 + +.L25: + testq $2, M + je .L27 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A1(-16 * SIZE, A2, %xmm10) + + MOVUPS_A1(-14 * SIZE, A1, %xmm12) + MOVUPS_A1(-14 * SIZE, A2, %xmm6) + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm4, %xmm10 + addpd %xmm10, %xmm2 + mulpd %xmm4, %xmm11 + SUBPD %xmm11, %xmm3 + + MOVUPS_XL1(-12 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm5, %xmm6 + addpd %xmm6, %xmm2 + mulpd %xmm5, %xmm7 + SUBPD %xmm7, %xmm3 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L27: + testq $1, M + je .L29 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A1(-16 * SIZE, A2, %xmm10) + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm4, %xmm10 + addpd %xmm10, %xmm2 + mulpd %xmm4, %xmm11 + SUBPD %xmm11, %xmm3 + ALIGN_3 + +.L29: + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + shufps $0xc0, %xmm11, %xmm11 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorpd %xmm11, %xmm0 + xorpd %xmm11, %xmm2 +#else + xorpd %xmm11, %xmm1 + xorpd %xmm11, %xmm3 +#endif + +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 + haddpd %xmm3, %xmm2 +#else + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm9 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 +#endif + + pshufd $0x4e, %xmm0, %xmm1 + pshufd $0x4e, %xmm2, %xmm3 + + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm1 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm3 + + xorpd %xmm11, %xmm1 + xorpd %xmm11, %xmm3 + + subpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm5 + movhpd 1 * SIZE(Y), %xmm5 + addq INCY, Y + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm2 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + movlpd %xmm2, 0 * SIZE(Y1) + movhpd %xmm2, 1 * SIZE(Y1) + addq INCY, Y1 + +#if GEMV_UNROLL == 2 + cmpq $2, N + jge .L21 +#endif + ALIGN_3 + +.L30: +#endif + + cmpq $1, N + jl .L999 + +#if GEMV_UNROLL == 1 +.L31: + decq N +#endif + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 +#if GEMV_UNROLL == 1 + addq LDA, A +#endif + + xorpd %xmm0, %xmm0 + xorpd %xmm1, %xmm1 + + MOVUPS_XL1(-16 * SIZE, X1, %xmm4) + MOVUPS_XL1(-14 * SIZE, X1, %xmm5) + + movq M, I + sarq $2, I + jle .L35 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A1(-14 * SIZE, A1, %xmm12) + + decq I + jle .L34 + ALIGN_3 + +.L33: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm8) + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + MOVUPS_XL1(-12 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + MOVUPS_A1(-10 * SIZE, A1, %xmm12) + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + MOVUPS_XL1(-10 * SIZE, X1, %xmm5) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1( -8 * SIZE, A1, %xmm8) + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + MOVUPS_XL1( -8 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + MOVUPS_A1( -6 * SIZE, A1, %xmm12) + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + MOVUPS_XL1(-6 * SIZE, X1, %xmm5) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, X1 + + subq $1, I + BRANCH + jg .L33 + ALIGN_3 + +.L34: + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm8) + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + MOVUPS_XL1(-12 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + MOVUPS_A1(-10 * SIZE, A1, %xmm12) + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + MOVUPS_XL1(-10 * SIZE, X1, %xmm5) + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + MOVUPS_XL1( -8 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + MOVUPS_XL1(-6 * SIZE, X1, %xmm5) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, X1 + ALIGN_3 + +.L35: + testq $2, M + je .L37 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A1(-14 * SIZE, A1, %xmm12) + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + MOVUPS_XL1(-12 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + addq $4 * SIZE, A1 + ALIGN_3 + +.L37: + testq $1, M + je .L39 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + ALIGN_3 + +.L39: + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + shufps $0xc0, %xmm11, %xmm11 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorpd %xmm11, %xmm0 +#else + xorpd %xmm11, %xmm1 +#endif + +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 +#else + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + addpd %xmm8, %xmm0 +#endif + + pshufd $0x4e, %xmm0, %xmm1 + + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm1 + + xorpd %xmm11, %xmm1 + + subpd %xmm1, %xmm0 + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + + addpd %xmm4, %xmm0 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + +#if GEMV_UNROLL == 1 + addq INCY, Y + addq INCY, Y1 + + cmpq $1, N + jge .L31 +#endif + +#ifdef ALIGNED_ACCESS + jmp .L999 + ALIGN_3 + +.L100: +#if GEMV_UNROLL >= 4 + + cmpq $4, N + jl .L110 + ALIGN_3 + +.L101: + subq $4, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + xorpd %xmm0, %xmm0 + xorpd %xmm1, %xmm1 + xorpd %xmm2, %xmm2 + xorpd %xmm3, %xmm3 + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + xorpd %xmm4, %xmm4 + xorpd %xmm5, %xmm5 + xorpd %xmm6, %xmm6 + xorpd %xmm7, %xmm7 + +#ifdef PREFETCHW + PREFETCHW 3 * SIZE(Y1) +#endif + + movq M, I + sarq $2, I + jle .L105 + + movsd -16 * SIZE(A1), %xmm8 + movhpd -15 * SIZE(A1), %xmm8 + + movsd -16 * SIZE(A1, LDA), %xmm10 + movhpd -15 * SIZE(A1, LDA), %xmm10 + + decq I + jle .L104 + ALIGN_3 + +.L103: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movsd -16 * SIZE(A2), %xmm8 + movhpd -15 * SIZE(A2), %xmm8 + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movsd -16 * SIZE(A2, LDA), %xmm10 + movhpd -15 * SIZE(A2, LDA), %xmm10 + mulpd %xmm12, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + movsd -14 * SIZE(A1), %xmm8 + movhpd -13 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + movsd -14 * SIZE(A1, LDA), %xmm10 + movhpd -13 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + SUBPD %xmm11, %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movsd -14 * SIZE(A2), %xmm8 + movhpd -13 * SIZE(A2), %xmm8 + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movsd -14 * SIZE(A2, LDA), %xmm10 + movhpd -13 * SIZE(A2, LDA), %xmm10 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + movsd -12 * SIZE(A1), %xmm8 + movhpd -11 * SIZE(A1), %xmm8 + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + movsd -12 * SIZE(A1, LDA), %xmm10 + movhpd -11 * SIZE(A1, LDA), %xmm10 + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + SUBPD %xmm11, %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movsd -12 * SIZE(A2), %xmm8 + movhpd -11 * SIZE(A2), %xmm8 + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movsd -12 * SIZE(A2, LDA), %xmm10 + movhpd -11 * SIZE(A2, LDA), %xmm10 + mulpd %xmm12, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + movsd -10 * SIZE(A1), %xmm8 + movhpd -9 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + movsd -10 * SIZE(A1, LDA), %xmm10 + movhpd -9 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + MOVUPS_XL1( -8 * SIZE, X1, %xmm12) + SUBPD %xmm11, %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movsd -10 * SIZE(A2), %xmm8 + movhpd -9 * SIZE(A2), %xmm8 + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movsd -10 * SIZE(A2, LDA), %xmm10 + movhpd -9 * SIZE(A2, LDA), %xmm10 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(X1) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + movsd -8 * SIZE(A1), %xmm8 + movhpd -7 * SIZE(A1), %xmm8 + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + movsd -8 * SIZE(A1, LDA), %xmm10 + movhpd -7 * SIZE(A1, LDA), %xmm10 + mulpd %xmm13, %xmm11 + MOVUPS_XL1( -6 * SIZE, X1, %xmm13) + SUBPD %xmm11, %xmm7 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, X1 + + subq $1, I + BRANCH + jg .L103 + ALIGN_3 + +.L104: + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movsd -16 * SIZE(A2), %xmm8 + movhpd -15 * SIZE(A2), %xmm8 + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movsd -16 * SIZE(A2, LDA), %xmm10 + movhpd -15 * SIZE(A2, LDA), %xmm10 + mulpd %xmm12, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + movsd -14 * SIZE(A1), %xmm8 + movhpd -13 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + movsd -14 * SIZE(A1, LDA), %xmm10 + movhpd -13 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + SUBPD %xmm11, %xmm7 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movsd -14 * SIZE(A2), %xmm8 + movhpd -13 * SIZE(A2), %xmm8 + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movsd -14 * SIZE(A2, LDA), %xmm10 + movhpd -13 * SIZE(A2, LDA), %xmm10 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + movsd -12 * SIZE(A1), %xmm8 + movhpd -11 * SIZE(A1), %xmm8 + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + movsd -12 * SIZE(A1, LDA), %xmm10 + movhpd -11 * SIZE(A1, LDA), %xmm10 + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + SUBPD %xmm11, %xmm7 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movsd -12 * SIZE(A2), %xmm8 + movhpd -11 * SIZE(A2), %xmm8 + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movsd -12 * SIZE(A2, LDA), %xmm10 + movhpd -11 * SIZE(A2, LDA), %xmm10 + mulpd %xmm12, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + movsd -10 * SIZE(A1), %xmm8 + movhpd -9 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + movsd -10 * SIZE(A1, LDA), %xmm10 + movhpd -9 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + MOVUPS_XL1( -8 * SIZE, X1, %xmm12) + SUBPD %xmm11, %xmm7 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movsd -10 * SIZE(A2), %xmm8 + movhpd -9 * SIZE(A2), %xmm8 + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movsd -10 * SIZE(A2, LDA), %xmm10 + movhpd -9 * SIZE(A2, LDA), %xmm10 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + mulpd %xmm13, %xmm11 + MOVUPS_XL1( -6 * SIZE, X1, %xmm13) + SUBPD %xmm11, %xmm7 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, X1 + ALIGN_3 + +.L105: + testq $2, M + je .L107 + + movsd -16 * SIZE(A1), %xmm8 + movhpd -15 * SIZE(A1), %xmm8 + + movsd -16 * SIZE(A1, LDA), %xmm10 + movhpd -15 * SIZE(A1, LDA), %xmm10 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movsd -16 * SIZE(A2), %xmm8 + movhpd -15 * SIZE(A2), %xmm8 + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movsd -16 * SIZE(A2, LDA), %xmm10 + movhpd -15 * SIZE(A2, LDA), %xmm10 + mulpd %xmm12, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + movsd -14 * SIZE(A1), %xmm8 + movhpd -13 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + movsd -14 * SIZE(A1, LDA), %xmm10 + movhpd -13 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + SUBPD %xmm11, %xmm7 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movsd -14 * SIZE(A2), %xmm8 + movhpd -13 * SIZE(A2), %xmm8 + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movsd -14 * SIZE(A2, LDA), %xmm10 + movhpd -13 * SIZE(A2, LDA), %xmm10 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm7 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L107: + testq $1, M + je .L109 + + movsd -16 * SIZE(A1), %xmm8 + movhpd -15 * SIZE(A1), %xmm8 + + movsd -16 * SIZE(A1, LDA), %xmm10 + movhpd -15 * SIZE(A1, LDA), %xmm10 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movsd -16 * SIZE(A2), %xmm8 + movhpd -15 * SIZE(A2), %xmm8 + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movsd -16 * SIZE(A2, LDA), %xmm10 + movhpd -15 * SIZE(A2, LDA), %xmm10 + mulpd %xmm12, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + mulpd %xmm12, %xmm11 + SUBPD %xmm11, %xmm7 + ALIGN_3 + +.L109: + pcmpeqb %xmm13, %xmm13 + psllq $63, %xmm13 + shufps $0xc0, %xmm13, %xmm13 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorpd %xmm13, %xmm0 + xorpd %xmm13, %xmm2 + xorpd %xmm13, %xmm4 + xorpd %xmm13, %xmm6 +#else + xorpd %xmm13, %xmm1 + xorpd %xmm13, %xmm3 + xorpd %xmm13, %xmm5 + xorpd %xmm13, %xmm7 +#endif + +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 + haddpd %xmm3, %xmm2 + + haddpd %xmm5, %xmm4 + haddpd %xmm7, %xmm6 +#else + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm9 + + movapd %xmm4, %xmm10 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm10 + + movapd %xmm6, %xmm11 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm11 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 + addpd %xmm10, %xmm4 + addpd %xmm11, %xmm6 +#endif + + pshufd $0x4e, %xmm0, %xmm1 + pshufd $0x4e, %xmm2, %xmm3 + pshufd $0x4e, %xmm4, %xmm5 + pshufd $0x4e, %xmm6, %xmm7 + + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm1 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm3 + + mulpd ALPHA_R, %xmm4 + mulpd ALPHA_I, %xmm5 + mulpd ALPHA_R, %xmm6 + mulpd ALPHA_I, %xmm7 + + xorpd %xmm13, %xmm1 + xorpd %xmm13, %xmm3 + xorpd %xmm13, %xmm5 + xorpd %xmm13, %xmm7 + + subpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm3 + movhpd 1 * SIZE(Y), %xmm3 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm5 + movhpd 1 * SIZE(Y), %xmm5 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm7 + movhpd 1 * SIZE(Y), %xmm7 + addq INCY, Y + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + movlpd %xmm2, 0 * SIZE(Y1) + movhpd %xmm2, 1 * SIZE(Y1) + addq INCY, Y1 + movlpd %xmm4, 0 * SIZE(Y1) + movhpd %xmm4, 1 * SIZE(Y1) + addq INCY, Y1 + movlpd %xmm6, 0 * SIZE(Y1) + movhpd %xmm6, 1 * SIZE(Y1) + addq INCY, Y1 + + cmpq $4, N + jge .L101 + ALIGN_3 + +.L110: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L120 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L111: +#endif + + subq $2, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + xorpd %xmm0, %xmm0 + xorpd %xmm1, %xmm1 + xorpd %xmm2, %xmm2 + xorpd %xmm3, %xmm3 + + MOVUPS_XL1(-16 * SIZE, X1, %xmm4) + MOVUPS_XL1(-14 * SIZE, X1, %xmm5) + +#ifdef PREFETCHW + PREFETCHW 3 * SIZE(Y1) +#endif + + movq M, I + sarq $2, I + jle .L115 + + movsd -16 * SIZE(A1), %xmm8 + movhpd -15 * SIZE(A1), %xmm8 + movsd -16 * SIZE(A2), %xmm10 + movhpd -15 * SIZE(A2), %xmm10 + + movsd -14 * SIZE(A1), %xmm12 + movhpd -13 * SIZE(A1), %xmm12 + movsd -14 * SIZE(A2), %xmm6 + movhpd -13 * SIZE(A2), %xmm6 + + decq I + jle .L114 + ALIGN_3 + +.L113: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movsd -12 * SIZE(A1), %xmm8 + movhpd -11 * SIZE(A1), %xmm8 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm4, %xmm10 + addpd %xmm10, %xmm2 + movsd -12 * SIZE(A2), %xmm10 + movhpd -11 * SIZE(A2), %xmm10 + mulpd %xmm4, %xmm11 + SUBPD %xmm11, %xmm3 + + MOVUPS_XL1(-12 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + movsd -10 * SIZE(A1), %xmm12 + movhpd -9 * SIZE(A1), %xmm12 + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm5, %xmm6 + addpd %xmm6, %xmm2 + movsd -10 * SIZE(A2), %xmm6 + movhpd -9 * SIZE(A2), %xmm6 + mulpd %xmm5, %xmm7 + SUBPD %xmm7, %xmm3 + + MOVUPS_XL1(-10 * SIZE, X1, %xmm5) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movsd -8 * SIZE(A1), %xmm8 + movhpd -7 * SIZE(A1), %xmm8 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm4, %xmm10 + addpd %xmm10, %xmm2 + movsd -8 * SIZE(A2), %xmm10 + movhpd -7 * SIZE(A2), %xmm10 + mulpd %xmm4, %xmm11 + SUBPD %xmm11, %xmm3 + + MOVUPS_XL1( -8 * SIZE, X1, %xmm4) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) +#endif + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + movsd -6 * SIZE(A1), %xmm12 + movhpd -5 * SIZE(A1), %xmm12 + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm5, %xmm6 + addpd %xmm6, %xmm2 + movsd -6 * SIZE(A2), %xmm6 + movhpd -5 * SIZE(A2), %xmm6 + mulpd %xmm5, %xmm7 + SUBPD %xmm7, %xmm3 + + MOVUPS_XL1( -6 * SIZE, X1, %xmm5) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, X1 + + subq $1, I + BRANCH + jg .L113 + ALIGN_3 + +.L114: + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movsd -12 * SIZE(A1), %xmm8 + movhpd -11 * SIZE(A1), %xmm8 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm4, %xmm10 + addpd %xmm10, %xmm2 + movsd -12 * SIZE(A2), %xmm10 + movhpd -11 * SIZE(A2), %xmm10 + mulpd %xmm4, %xmm11 + SUBPD %xmm11, %xmm3 + + MOVUPS_XL1(-12 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + movsd -10 * SIZE(A1), %xmm12 + movhpd -9 * SIZE(A1), %xmm12 + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm5, %xmm6 + addpd %xmm6, %xmm2 + movsd -10 * SIZE(A2), %xmm6 + movhpd -9 * SIZE(A2), %xmm6 + mulpd %xmm5, %xmm7 + SUBPD %xmm7, %xmm3 + + MOVUPS_XL1(-10 * SIZE, X1, %xmm5) + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm4, %xmm10 + addpd %xmm10, %xmm2 + mulpd %xmm4, %xmm11 + SUBPD %xmm11, %xmm3 + + MOVUPS_XL1( -8 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm5, %xmm6 + addpd %xmm6, %xmm2 + mulpd %xmm5, %xmm7 + SUBPD %xmm7, %xmm3 + + MOVUPS_XL1( -6 * SIZE, X1, %xmm5) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, X1 + ALIGN_3 + +.L115: + testq $2, M + je .L117 + + movsd -16 * SIZE(A1), %xmm8 + movhpd -15 * SIZE(A1), %xmm8 + movsd -16 * SIZE(A2), %xmm10 + movhpd -15 * SIZE(A2), %xmm10 + + movsd -14 * SIZE(A1), %xmm12 + movhpd -13 * SIZE(A1), %xmm12 + movsd -14 * SIZE(A2), %xmm6 + movhpd -13 * SIZE(A2), %xmm6 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm4, %xmm10 + addpd %xmm10, %xmm2 + mulpd %xmm4, %xmm11 + SUBPD %xmm11, %xmm3 + + MOVUPS_XL1(-12 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm5, %xmm6 + addpd %xmm6, %xmm2 + mulpd %xmm5, %xmm7 + SUBPD %xmm7, %xmm3 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L117: + testq $1, M + je .L119 + + movsd -16 * SIZE(A1), %xmm8 + movhpd -15 * SIZE(A1), %xmm8 + movsd -16 * SIZE(A2), %xmm10 + movhpd -15 * SIZE(A2), %xmm10 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm4, %xmm10 + addpd %xmm10, %xmm2 + mulpd %xmm4, %xmm11 + SUBPD %xmm11, %xmm3 + ALIGN_3 + +.L119: + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + shufps $0xc0, %xmm11, %xmm11 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorpd %xmm11, %xmm0 + xorpd %xmm11, %xmm2 +#else + xorpd %xmm11, %xmm1 + xorpd %xmm11, %xmm3 +#endif + +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 + haddpd %xmm3, %xmm2 +#else + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm9 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 +#endif + + pshufd $0x4e, %xmm0, %xmm1 + pshufd $0x4e, %xmm2, %xmm3 + + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm1 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm3 + + xorpd %xmm11, %xmm1 + xorpd %xmm11, %xmm3 + + subpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm5 + movhpd 1 * SIZE(Y), %xmm5 + addq INCY, Y + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm2 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + movlpd %xmm2, 0 * SIZE(Y1) + movhpd %xmm2, 1 * SIZE(Y1) + addq INCY, Y1 + +#if GEMV_UNROLL == 2 + cmpq $2, N + jge .L111 +#endif + ALIGN_3 + +.L120: +#endif + + cmpq $1, N + jl .L999 + +#if GEMV_UNROLL == 1 +.L121: + decq N +#endif + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 +#if GEMV_UNROLL == 1 + addq LDA, A +#endif + + xorpd %xmm0, %xmm0 + xorpd %xmm1, %xmm1 + + MOVUPS_XL1(-16 * SIZE, X1, %xmm4) + MOVUPS_XL1(-14 * SIZE, X1, %xmm5) + + movq M, I + sarq $2, I + jle .L125 + + movsd -16 * SIZE(A1), %xmm8 + movhpd -15 * SIZE(A1), %xmm8 + movsd -14 * SIZE(A1), %xmm12 + movhpd -13 * SIZE(A1), %xmm12 + + decq I + jle .L124 + ALIGN_3 + +.L123: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movsd -12 * SIZE(A1), %xmm8 + movhpd -11 * SIZE(A1), %xmm8 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + MOVUPS_XL1(-12 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + movsd -10 * SIZE(A1), %xmm12 + movhpd -9 * SIZE(A1), %xmm12 + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + MOVUPS_XL1(-10 * SIZE, X1, %xmm5) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movsd -8 * SIZE(A1), %xmm8 + movhpd -7 * SIZE(A1), %xmm8 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + MOVUPS_XL1( -8 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + movsd -6 * SIZE(A1), %xmm12 + movhpd -5 * SIZE(A1), %xmm12 + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + MOVUPS_XL1(-6 * SIZE, X1, %xmm5) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, X1 + + subq $1, I + BRANCH + jg .L123 + ALIGN_3 + +.L124: + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movsd -12 * SIZE(A1), %xmm8 + movhpd -11 * SIZE(A1), %xmm8 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + MOVUPS_XL1(-12 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + movsd -10 * SIZE(A1), %xmm12 + movhpd -9 * SIZE(A1), %xmm12 + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + MOVUPS_XL1(-10 * SIZE, X1, %xmm5) + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + MOVUPS_XL1( -8 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + MOVUPS_XL1(-6 * SIZE, X1, %xmm5) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, X1 + ALIGN_3 + +.L125: + testq $2, M + je .L127 + + movsd -16 * SIZE(A1), %xmm8 + movhpd -15 * SIZE(A1), %xmm8 + movsd -14 * SIZE(A1), %xmm12 + movhpd -13 * SIZE(A1), %xmm12 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + MOVUPS_XL1(-12 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + addq $4 * SIZE, A1 + ALIGN_3 + +.L127: + testq $1, M + je .L129 + + movsd -16 * SIZE(A1), %xmm8 + movhpd -15 * SIZE(A1), %xmm8 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + ALIGN_3 + +.L129: + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + shufps $0xc0, %xmm11, %xmm11 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorpd %xmm11, %xmm0 +#else + xorpd %xmm11, %xmm1 +#endif + +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 +#else + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + addpd %xmm8, %xmm0 +#endif + + pshufd $0x4e, %xmm0, %xmm1 + + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm1 + + xorpd %xmm11, %xmm1 + + subpd %xmm1, %xmm0 + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + + addpd %xmm4, %xmm0 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + +#if GEMV_UNROLL == 1 + addq INCY, Y + addq INCY, Y1 + + cmpq $1, N + jge .L121 +#endif + + +#endif + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemv_t_atom.S b/kernel/x86_64/zgemv_t_atom.S new file mode 100644 index 0000000..5d3ecdd --- /dev/null +++ b/kernel/x86_64/zgemv_t_atom.S @@ -0,0 +1,968 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#ifdef ATOM +#define PREFETCH prefetchnta +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 6) +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_INCX 8 + STACKSIZE(%rsp) +#define OLD_Y 16 + STACKSIZE(%rsp) +#define OLD_INCY 24 + STACKSIZE(%rsp) +#define OLD_BUFFER 32 + STACKSIZE(%rsp) + +#define M %rdi +#define N %rsi +#define A %rcx +#define LDA %r8 +#define X %r9 +#define INCX %rdx +#define Y %rbp +#define INCY %r10 + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_LDA 56 + STACKSIZE(%rsp) +#define OLD_X 64 + STACKSIZE(%rsp) +#define OLD_INCX 72 + STACKSIZE(%rsp) +#define OLD_Y 80 + STACKSIZE(%rsp) +#define OLD_INCY 88 + STACKSIZE(%rsp) +#define OLD_BUFFER 96 + STACKSIZE(%rsp) + +#define M %rcx +#define N %rdx +#define A %r8 +#define LDA %r9 +#define X %rdi +#define INCX %rsi +#define Y %rbp +#define INCY %r10 + +#endif + +#define I %rax +#define J %rbx +#define A1 %r11 +#define A2 %r12 + +#define X1 %r13 +#define Y1 %r14 +#define BUFFER %r15 + +#define ALPHA_R %xmm14 +#define ALPHA_I %xmm15 + +#if !defined(CONJ) && !defined(XCONJ) +#define ADD1 addsd +#define ADD2 addsd +#define ADD3 subsd +#define ADD4 addsd +#endif + +#if defined(CONJ) && !defined(XCONJ) +#define ADD1 addsd +#define ADD2 addsd +#define ADD3 addsd +#define ADD4 subsd +#endif + +#if !defined(CONJ) && defined(XCONJ) +#define ADD1 addsd +#define ADD2 subsd +#define ADD3 addsd +#define ADD4 addsd +#endif + +#if defined(CONJ) && defined(XCONJ) +#define ADD1 addsd +#define ADD2 subsd +#define ADD3 subsd +#define ADD4 subsd +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X + + movaps %xmm3, %xmm0 + movss OLD_ALPHA_I, %xmm1 +#endif + + movq OLD_INCX, INCX + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + salq $ZBASE_SHIFT, LDA + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + movaps %xmm0, ALPHA_R + movaps %xmm1, ALPHA_I + + subq $-16 * SIZE, A + + testq M, M + jle .L999 + testq N, N + jle .L999 + ALIGN_3 + + movq BUFFER, X1 + + movq Y, Y1 + + movq M, I + sarq $2, I + jle .L05 + ALIGN_4 + +.L02: + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + addq INCX, X + + movsd 0 * SIZE(X), %xmm1 + movhpd 1 * SIZE(X), %xmm1 + addq INCX, X + + movsd 0 * SIZE(X), %xmm2 + movhpd 1 * SIZE(X), %xmm2 + addq INCX, X + + movsd 0 * SIZE(X), %xmm3 + movhpd 1 * SIZE(X), %xmm3 + addq INCX, X + + movapd %xmm0, 0 * SIZE(X1) + movapd %xmm1, 2 * SIZE(X1) + movapd %xmm2, 4 * SIZE(X1) + movapd %xmm3, 6 * SIZE(X1) + + addq $8 * SIZE, X1 + decq I + jg .L02 + ALIGN_4 + +.L05: + movq M, I + andq $3, I + jle .L10 + ALIGN_2 + +.L06: + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + addq INCX, X + movapd %xmm0, 0 * SIZE(X1) + addq $2 * SIZE, X1 + decq I + jg .L06 + ALIGN_4 + +.L10: + movq N, J + sarq $1, J + jle .L20 + ALIGN_3 + +.L11: + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + xorpd %xmm0, %xmm0 + xorpd %xmm1, %xmm1 + xorpd %xmm2, %xmm2 + xorpd %xmm3, %xmm3 + + movsd -16 * SIZE(X1), %xmm4 + movsd -15 * SIZE(X1), %xmm5 + movsd -14 * SIZE(X1), %xmm6 + movsd -13 * SIZE(X1), %xmm7 + +#ifdef PREFETCHW + PREFETCHW 3 * SIZE(Y1) +#endif + + movq M, I + sarq $2, I + jle .L15 + + movsd -16 * SIZE(A1), %xmm8 + movsd -15 * SIZE(A1), %xmm9 + movsd -16 * SIZE(A2), %xmm10 + movsd -15 * SIZE(A2), %xmm11 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm12 + + decq I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) +#endif + + movapd %xmm9, %xmm13 + mulsd %xmm5, %xmm9 + ADD1 %xmm8, %xmm0 + movsd -14 * SIZE(A1), %xmm8 + mulsd %xmm4, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm10, %xmm12 + mulsd %xmm4, %xmm10 + ADD3 %xmm9, %xmm0 + movsd -13 * SIZE(A1), %xmm9 + mulsd %xmm5, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + movsd -11 * SIZE(X1), %xmm5 + ADD1 %xmm10, %xmm2 + movsd -14 * SIZE(A2), %xmm10 + mulsd %xmm4, %xmm13 + movsd -12 * SIZE(X1), %xmm4 + ADD2 %xmm12, %xmm3 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm11, %xmm2 + movsd -13 * SIZE(A2), %xmm11 + + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm3 + + movapd %xmm9, %xmm13 + mulsd %xmm7, %xmm9 + ADD1 %xmm8, %xmm0 + movsd -12 * SIZE(A1), %xmm8 + mulsd %xmm6, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm10, %xmm12 + mulsd %xmm6, %xmm10 + ADD3 %xmm9, %xmm0 + movsd -11 * SIZE(A1), %xmm9 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm11, %xmm13 + mulsd %xmm7, %xmm11 + movsd -9 * SIZE(X1), %xmm7 + ADD1 %xmm10, %xmm2 + movsd -12 * SIZE(A2), %xmm10 + mulsd %xmm6, %xmm13 + movsd -10 * SIZE(X1), %xmm6 + ADD2 %xmm12, %xmm3 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + ADD3 %xmm11, %xmm2 + movsd -11 * SIZE(A2), %xmm11 + mulsd %xmm5, %xmm12 + ADD4 %xmm13, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A2) +#endif + + movapd %xmm9, %xmm13 + mulsd %xmm5, %xmm9 + ADD1 %xmm8, %xmm0 + movsd -10 * SIZE(A1), %xmm8 + mulsd %xmm4, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm10, %xmm12 + mulsd %xmm4, %xmm10 + ADD3 %xmm9, %xmm0 + movsd -9 * SIZE(A1), %xmm9 + mulsd %xmm5, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + movsd -7 * SIZE(X1), %xmm5 + ADD1 %xmm10, %xmm2 + movsd -10 * SIZE(A2), %xmm10 + mulsd %xmm4, %xmm13 + movsd -8 * SIZE(X1), %xmm4 + ADD2 %xmm12, %xmm3 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm11, %xmm2 + movsd -9 * SIZE(A2), %xmm11 + + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm3 + + movapd %xmm9, %xmm13 + mulsd %xmm7, %xmm9 + ADD1 %xmm8, %xmm0 + movsd -8 * SIZE(A1), %xmm8 + mulsd %xmm6, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm10, %xmm12 + mulsd %xmm6, %xmm10 + ADD3 %xmm9, %xmm0 + movsd -7 * SIZE(A1), %xmm9 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm11, %xmm13 + mulsd %xmm7, %xmm11 + movsd -5 * SIZE(X1), %xmm7 + ADD1 %xmm10, %xmm2 + movsd -8 * SIZE(A2), %xmm10 + mulsd %xmm6, %xmm13 + movsd -6 * SIZE(X1), %xmm6 + ADD2 %xmm12, %xmm3 + + movapd %xmm8, %xmm12 + subq $-8 * SIZE, A1 + mulsd %xmm4, %xmm8 + subq $-8 * SIZE, X1 + ADD3 %xmm11, %xmm2 + movsd -7 * SIZE(A2), %xmm11 + mulsd %xmm5, %xmm12 + subq $-8 * SIZE, A2 + ADD4 %xmm13, %xmm3 + + subq $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + movapd %xmm9, %xmm13 + mulsd %xmm5, %xmm9 + ADD1 %xmm8, %xmm0 + movsd -14 * SIZE(A1), %xmm8 + mulsd %xmm4, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm10, %xmm12 + mulsd %xmm4, %xmm10 + ADD3 %xmm9, %xmm0 + movsd -13 * SIZE(A1), %xmm9 + mulsd %xmm5, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + movsd -11 * SIZE(X1), %xmm5 + ADD1 %xmm10, %xmm2 + movsd -14 * SIZE(A2), %xmm10 + mulsd %xmm4, %xmm13 + movsd -12 * SIZE(X1), %xmm4 + ADD2 %xmm12, %xmm3 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm11, %xmm2 + movsd -13 * SIZE(A2), %xmm11 + + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm3 + + movapd %xmm9, %xmm13 + mulsd %xmm7, %xmm9 + ADD1 %xmm8, %xmm0 + movsd -12 * SIZE(A1), %xmm8 + mulsd %xmm6, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm10, %xmm12 + mulsd %xmm6, %xmm10 + ADD3 %xmm9, %xmm0 + movsd -11 * SIZE(A1), %xmm9 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm11, %xmm13 + mulsd %xmm7, %xmm11 + movsd -9 * SIZE(X1), %xmm7 + ADD1 %xmm10, %xmm2 + movsd -12 * SIZE(A2), %xmm10 + mulsd %xmm6, %xmm13 + movsd -10 * SIZE(X1), %xmm6 + ADD2 %xmm12, %xmm3 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + ADD3 %xmm11, %xmm2 + movsd -11 * SIZE(A2), %xmm11 + mulsd %xmm5, %xmm12 + ADD4 %xmm13, %xmm3 + + movapd %xmm9, %xmm13 + mulsd %xmm5, %xmm9 + ADD1 %xmm8, %xmm0 + movsd -10 * SIZE(A1), %xmm8 + mulsd %xmm4, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm10, %xmm12 + mulsd %xmm4, %xmm10 + ADD3 %xmm9, %xmm0 + movsd -9 * SIZE(A1), %xmm9 + mulsd %xmm5, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + movsd -7 * SIZE(X1), %xmm5 + ADD1 %xmm10, %xmm2 + movsd -10 * SIZE(A2), %xmm10 + mulsd %xmm4, %xmm13 + movsd -8 * SIZE(X1), %xmm4 + ADD2 %xmm12, %xmm3 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm11, %xmm2 + movsd -9 * SIZE(A2), %xmm11 + + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm3 + + movapd %xmm9, %xmm13 + mulsd %xmm7, %xmm9 + ADD1 %xmm8, %xmm0 + mulsd %xmm6, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm10, %xmm12 + mulsd %xmm6, %xmm10 + ADD3 %xmm9, %xmm0 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm11, %xmm13 + mulsd %xmm7, %xmm11 + movsd -5 * SIZE(X1), %xmm7 + ADD1 %xmm10, %xmm2 + mulsd %xmm6, %xmm13 + movsd -6 * SIZE(X1), %xmm6 + ADD2 %xmm12, %xmm3 + + ADD3 %xmm11, %xmm2 + ADD4 %xmm13, %xmm3 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, X1 + ALIGN_3 + +.L15: + testq $2, M + je .L17 + + movsd -16 * SIZE(A1), %xmm8 + movsd -15 * SIZE(A1), %xmm9 + movsd -16 * SIZE(A2), %xmm10 + movsd -15 * SIZE(A2), %xmm11 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm12 + + movapd %xmm9, %xmm13 + mulsd %xmm5, %xmm9 + ADD1 %xmm8, %xmm0 + movsd -14 * SIZE(A1), %xmm8 + mulsd %xmm4, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm10, %xmm12 + mulsd %xmm4, %xmm10 + ADD3 %xmm9, %xmm0 + movsd -13 * SIZE(A1), %xmm9 + mulsd %xmm5, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + movsd -11 * SIZE(X1), %xmm5 + ADD1 %xmm10, %xmm2 + movsd -14 * SIZE(A2), %xmm10 + mulsd %xmm4, %xmm13 + movsd -12 * SIZE(X1), %xmm4 + ADD2 %xmm12, %xmm3 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm11, %xmm2 + movsd -13 * SIZE(A2), %xmm11 + + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm3 + + movapd %xmm9, %xmm13 + mulsd %xmm7, %xmm9 + ADD1 %xmm8, %xmm0 + mulsd %xmm6, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm10, %xmm12 + mulsd %xmm6, %xmm10 + ADD3 %xmm9, %xmm0 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm11, %xmm13 + mulsd %xmm7, %xmm11 + ADD1 %xmm10, %xmm2 + mulsd %xmm6, %xmm13 + ADD2 %xmm12, %xmm3 + + ADD3 %xmm11, %xmm2 + ADD4 %xmm13, %xmm3 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L17: + testq $1, M + je .L19 + + movsd -16 * SIZE(A1), %xmm8 + movsd -15 * SIZE(A1), %xmm9 + movsd -16 * SIZE(A2), %xmm10 + movsd -15 * SIZE(A2), %xmm11 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm12 + + movapd %xmm9, %xmm13 + mulsd %xmm5, %xmm9 + ADD1 %xmm8, %xmm0 + mulsd %xmm4, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm10, %xmm12 + mulsd %xmm4, %xmm10 + ADD3 %xmm9, %xmm0 + mulsd %xmm5, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + ADD1 %xmm10, %xmm2 + mulsd %xmm4, %xmm13 + ADD2 %xmm12, %xmm3 + + ADD3 %xmm11, %xmm2 + ADD4 %xmm13, %xmm3 + ALIGN_3 + +.L19: + movsd 0 * SIZE(Y), %xmm4 + movapd %xmm0, %xmm10 + mulsd ALPHA_R, %xmm0 + movsd 1 * SIZE(Y), %xmm5 + movapd %xmm1, %xmm11 + mulsd ALPHA_R, %xmm1 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm6 + movapd %xmm2, %xmm12 + mulsd ALPHA_R, %xmm2 + movsd 1 * SIZE(Y), %xmm7 + movapd %xmm3, %xmm13 + mulsd ALPHA_R, %xmm3 + addq INCY, Y + + mulsd ALPHA_I, %xmm10 + mulsd ALPHA_I, %xmm11 + mulsd ALPHA_I, %xmm12 + mulsd ALPHA_I, %xmm13 + + addsd %xmm10, %xmm1 + subsd %xmm11, %xmm0 + addsd %xmm12, %xmm3 + subsd %xmm13, %xmm2 + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + addsd %xmm6, %xmm2 + addsd %xmm7, %xmm3 + + movlpd %xmm0, 0 * SIZE(Y1) + movlpd %xmm1, 1 * SIZE(Y1) + addq INCY, Y1 + movlpd %xmm2, 0 * SIZE(Y1) + movlpd %xmm3, 1 * SIZE(Y1) + addq INCY, Y1 + + decq J + jg .L11 + ALIGN_3 + +.L20: + testq $1, N + jle .L999 + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + + xorpd %xmm0, %xmm0 + xorpd %xmm1, %xmm1 + + movsd -16 * SIZE(X1), %xmm4 + movsd -15 * SIZE(X1), %xmm5 + movsd -14 * SIZE(X1), %xmm6 + movsd -13 * SIZE(X1), %xmm7 + + movq M, I + sarq $2, I + jle .L25 + + movsd -16 * SIZE(A1), %xmm8 + movsd -15 * SIZE(A1), %xmm9 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm12 + + decq I + jle .L24 + ALIGN_3 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) +#endif + + movapd %xmm9, %xmm13 + mulsd %xmm5, %xmm9 + movsd -11 * SIZE(X1), %xmm5 + ADD1 %xmm8, %xmm0 + movsd -14 * SIZE(A1), %xmm8 + mulsd %xmm4, %xmm13 + movsd -12 * SIZE(X1), %xmm4 + ADD2 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm9, %xmm0 + movsd -13 * SIZE(A1), %xmm9 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm9, %xmm13 + mulsd %xmm7, %xmm9 + movsd -9 * SIZE(X1), %xmm7 + ADD1 %xmm8, %xmm0 + movsd -12 * SIZE(A1), %xmm8 + mulsd %xmm6, %xmm13 + movsd -10 * SIZE(X1), %xmm6 + ADD2 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + ADD3 %xmm9, %xmm0 + movsd -11 * SIZE(A1), %xmm9 + mulsd %xmm5, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm9, %xmm13 + mulsd %xmm5, %xmm9 + movsd -7 * SIZE(X1), %xmm5 + ADD1 %xmm8, %xmm0 + movsd -10 * SIZE(A1), %xmm8 + mulsd %xmm4, %xmm13 + movsd -8 * SIZE(X1), %xmm4 + ADD2 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm9, %xmm0 + movsd -9 * SIZE(A1), %xmm9 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm9, %xmm13 + mulsd %xmm7, %xmm9 + movsd -5 * SIZE(X1), %xmm7 + ADD1 %xmm8, %xmm0 + movsd -8 * SIZE(A1), %xmm8 + mulsd %xmm6, %xmm13 + movsd -6 * SIZE(X1), %xmm6 + ADD2 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + ADD3 %xmm9, %xmm0 + mulsd %xmm5, %xmm12 + movsd -7 * SIZE(A1), %xmm9 + ADD4 %xmm13, %xmm1 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, X1 + subq $-8 * SIZE, A2 + + subq $1, I + BRANCH + jg .L23 + ALIGN_3 + +.L24: + movapd %xmm9, %xmm13 + mulsd %xmm5, %xmm9 + movsd -11 * SIZE(X1), %xmm5 + ADD1 %xmm8, %xmm0 + movsd -14 * SIZE(A1), %xmm8 + mulsd %xmm4, %xmm13 + movsd -12 * SIZE(X1), %xmm4 + ADD2 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm9, %xmm0 + movsd -13 * SIZE(A1), %xmm9 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm9, %xmm13 + mulsd %xmm7, %xmm9 + movsd -9 * SIZE(X1), %xmm7 + ADD1 %xmm8, %xmm0 + movsd -12 * SIZE(A1), %xmm8 + mulsd %xmm6, %xmm13 + movsd -10 * SIZE(X1), %xmm6 + ADD2 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + ADD3 %xmm9, %xmm0 + movsd -11 * SIZE(A1), %xmm9 + mulsd %xmm5, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm9, %xmm13 + mulsd %xmm5, %xmm9 + movsd -7 * SIZE(X1), %xmm5 + ADD1 %xmm8, %xmm0 + movsd -10 * SIZE(A1), %xmm8 + mulsd %xmm4, %xmm13 + movsd -8 * SIZE(X1), %xmm4 + ADD2 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm9, %xmm0 + movsd -9 * SIZE(A1), %xmm9 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm9, %xmm13 + mulsd %xmm7, %xmm9 + movsd -5 * SIZE(X1), %xmm7 + ADD1 %xmm8, %xmm0 + mulsd %xmm6, %xmm13 + movsd -6 * SIZE(X1), %xmm6 + ADD2 %xmm12, %xmm1 + + ADD3 %xmm9, %xmm0 + ADD4 %xmm13, %xmm1 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, X1 + ALIGN_3 + +.L25: + testq $2, M + je .L27 + + movsd -16 * SIZE(A1), %xmm8 + movsd -15 * SIZE(A1), %xmm9 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm12 + + movapd %xmm9, %xmm13 + mulsd %xmm5, %xmm9 + movsd -11 * SIZE(X1), %xmm5 + ADD1 %xmm8, %xmm0 + movsd -14 * SIZE(A1), %xmm8 + mulsd %xmm4, %xmm13 + movsd -12 * SIZE(X1), %xmm4 + ADD2 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm9, %xmm0 + movsd -13 * SIZE(A1), %xmm9 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm9, %xmm13 + mulsd %xmm7, %xmm9 + ADD1 %xmm8, %xmm0 + mulsd %xmm6, %xmm13 + ADD2 %xmm12, %xmm1 + + ADD3 %xmm9, %xmm0 + ADD4 %xmm13, %xmm1 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L27: + testq $1, M + je .L29 + + movsd -16 * SIZE(A1), %xmm8 + movsd -15 * SIZE(A1), %xmm9 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm12 + + movapd %xmm9, %xmm13 + mulsd %xmm5, %xmm9 + ADD1 %xmm8, %xmm0 + mulsd %xmm4, %xmm13 + ADD2 %xmm12, %xmm1 + + ADD3 %xmm9, %xmm0 + ADD4 %xmm13, %xmm1 + ALIGN_3 + +.L29: + movsd 0 * SIZE(Y), %xmm4 + movapd %xmm0, %xmm10 + mulsd ALPHA_R, %xmm0 + movsd 1 * SIZE(Y), %xmm5 + movapd %xmm1, %xmm11 + mulsd ALPHA_R, %xmm1 + + mulsd ALPHA_I, %xmm10 + mulsd ALPHA_I, %xmm11 + + addsd %xmm10, %xmm1 + subsd %xmm11, %xmm0 + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + + movlpd %xmm0, 0 * SIZE(Y1) + movlpd %xmm1, 1 * SIZE(Y1) + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemv_t_dup.S b/kernel/x86_64/zgemv_t_dup.S new file mode 100644 index 0000000..2db99b6 --- /dev/null +++ b/kernel/x86_64/zgemv_t_dup.S @@ -0,0 +1,1223 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_INCX 8 + STACKSIZE(%rsp) +#define OLD_Y 16 + STACKSIZE(%rsp) +#define OLD_INCY 24 + STACKSIZE(%rsp) +#define OLD_BUFFER 32 + STACKSIZE(%rsp) + +#define M %rdi +#define N %rsi +#define A %rcx +#define LDA %r8 +#define X %r9 +#define INCX %rdx +#define Y %rbp +#define INCY %r10 + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_LDA 56 + STACKSIZE(%rsp) +#define OLD_X 64 + STACKSIZE(%rsp) +#define OLD_INCX 72 + STACKSIZE(%rsp) +#define OLD_Y 80 + STACKSIZE(%rsp) +#define OLD_INCY 88 + STACKSIZE(%rsp) +#define OLD_BUFFER 96 + STACKSIZE(%rsp) + +#define M %rcx +#define N %rdx +#define A %r8 +#define LDA %r9 +#define X %rdi +#define INCX %rsi +#define Y %rbp +#define INCY %r10 + +#endif + +#define I %rax +#define J %rbx +#define A1 %r11 +#define A2 %r12 + +#define X1 %r13 +#define Y1 %r14 +#define BUFFER %r15 + +#define ALPHA_R %xmm14 +#define ALPHA_I %xmm15 + +#undef SUBPD + +#ifndef CONJ +#define SUBPD addpd +#else +#define SUBPD subpd +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X + + movaps %xmm3, %xmm0 + movss OLD_ALPHA_I, %xmm1 +#endif + + movq OLD_INCX, INCX + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + salq $ZBASE_SHIFT, LDA + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + shufps $0x04, %xmm5, %xmm5 + + unpcklpd %xmm1, %xmm0 + + movaps %xmm0, ALPHA_R + pshufd $0x4e, %xmm0, ALPHA_I + + xorps %xmm5, ALPHA_I + + subq $-16 * SIZE, A + + testq M, M + jle .L999 + testq N, N + jle .L999 + ALIGN_3 + + movq BUFFER, X1 + + movq Y, Y1 + + movq M, I + sarq $2, I + jle .L05 + ALIGN_4 + +.L02: + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + addq INCX, X + + movsd 0 * SIZE(X), %xmm1 + movhpd 1 * SIZE(X), %xmm1 + addq INCX, X + + movsd 0 * SIZE(X), %xmm2 + movhpd 1 * SIZE(X), %xmm2 + addq INCX, X + + movsd 0 * SIZE(X), %xmm3 + movhpd 1 * SIZE(X), %xmm3 + addq INCX, X + + movapd %xmm0, 0 * SIZE(X1) + movapd %xmm1, 2 * SIZE(X1) + movapd %xmm2, 4 * SIZE(X1) + movapd %xmm3, 6 * SIZE(X1) + + addq $8 * SIZE, X1 + decq I + jg .L02 + ALIGN_4 + +.L05: + movq M, I + andq $3, I + jle .L10 + ALIGN_2 + +.L06: + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + addq INCX, X + movapd %xmm0, 0 * SIZE(X1) + addq $2 * SIZE, X1 + decq I + jg .L06 + ALIGN_4 + +.L10: +#if GEMV_UNROLL >= 4 + + cmpq $4, N + jl .L20 + ALIGN_3 + +.L11: + subq $4, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifdef PREFETCHW + PREFETCHW 3 * SIZE(Y1) +#endif + + movq M, I + sarq $2, I + jle .L15 + + movddup -16 * SIZE(A1), %xmm8 + movddup -15 * SIZE(A1), %xmm9 + movddup -16 * SIZE(A1, LDA), %xmm10 + movddup -15 * SIZE(A1, LDA), %xmm11 + + decq I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movddup -16 * SIZE(A2), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + movddup -15 * SIZE(A2), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movddup -16 * SIZE(A2, LDA), %xmm10 + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + movddup -15 * SIZE(A2, LDA), %xmm11 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + movddup -14 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm5 + movddup -13 * SIZE(A1), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + movddup -14 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm7 + movddup -13 * SIZE(A1, LDA), %xmm11 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) +#endif + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movddup -14 * SIZE(A2), %xmm8 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + movddup -13 * SIZE(A2), %xmm9 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movddup -14 * SIZE(A2, LDA), %xmm10 + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + movddup -13 * SIZE(A2, LDA), %xmm11 + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + movddup -12 * SIZE(A1), %xmm8 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm5 + movddup -11 * SIZE(A1), %xmm9 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + movddup -12 * SIZE(A1, LDA), %xmm10 + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm7 + movddup -11 * SIZE(A1, LDA), %xmm11 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) +#endif + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movddup -12 * SIZE(A2), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + movddup -11 * SIZE(A2), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movddup -12 * SIZE(A2, LDA), %xmm10 + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + movddup -11 * SIZE(A2, LDA), %xmm11 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + movddup -10 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm5 + movddup -9 * SIZE(A1), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + movddup -10 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + MOVUPS_XL1( -8 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm7 + movddup -9 * SIZE(A1, LDA), %xmm11 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) +#endif + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movddup -10 * SIZE(A2), %xmm8 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + movddup -9 * SIZE(A2), %xmm9 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movddup -10 * SIZE(A2, LDA), %xmm10 + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + movddup -9 * SIZE(A2, LDA), %xmm11 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(X1) +#endif + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + movddup -8 * SIZE(A1), %xmm8 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm5 + movddup -7 * SIZE(A1), %xmm9 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + movddup -8 * SIZE(A1, LDA), %xmm10 + mulpd %xmm13, %xmm11 + MOVUPS_XL1( -6 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm7 + movddup -7 * SIZE(A1, LDA), %xmm11 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, X1 + + subq $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movddup -16 * SIZE(A2), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + movddup -15 * SIZE(A2), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movddup -16 * SIZE(A2, LDA), %xmm10 + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + movddup -15 * SIZE(A2, LDA), %xmm11 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + movddup -14 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm5 + movddup -13 * SIZE(A1), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + movddup -14 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm7 + movddup -13 * SIZE(A1, LDA), %xmm11 + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movddup -14 * SIZE(A2), %xmm8 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + movddup -13 * SIZE(A2), %xmm9 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movddup -14 * SIZE(A2, LDA), %xmm10 + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + movddup -13 * SIZE(A2, LDA), %xmm11 + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + movddup -12 * SIZE(A1), %xmm8 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm5 + movddup -11 * SIZE(A1), %xmm9 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + movddup -12 * SIZE(A1, LDA), %xmm10 + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm7 + movddup -11 * SIZE(A1, LDA), %xmm11 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movddup -12 * SIZE(A2), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + movddup -11 * SIZE(A2), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movddup -12 * SIZE(A2, LDA), %xmm10 + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + movddup -11 * SIZE(A2, LDA), %xmm11 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + movddup -10 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm5 + movddup -9 * SIZE(A1), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + movddup -10 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + MOVUPS_XL1( -8 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm7 + movddup -9 * SIZE(A1, LDA), %xmm11 + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movddup -10 * SIZE(A2), %xmm8 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + movddup -9 * SIZE(A2), %xmm9 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movddup -10 * SIZE(A2, LDA), %xmm10 + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + movddup -9 * SIZE(A2, LDA), %xmm11 + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm5 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + mulpd %xmm13, %xmm11 + MOVUPS_XL1( -6 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm7 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, X1 + ALIGN_3 + +.L15: + testq $2, M + je .L17 + + movddup -16 * SIZE(A1), %xmm8 + movddup -15 * SIZE(A1), %xmm9 + movddup -16 * SIZE(A1, LDA), %xmm10 + movddup -15 * SIZE(A1, LDA), %xmm11 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movddup -16 * SIZE(A2), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + movddup -15 * SIZE(A2), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movddup -16 * SIZE(A2, LDA), %xmm10 + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + movddup -15 * SIZE(A2, LDA), %xmm11 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + movddup -14 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm5 + movddup -13 * SIZE(A1), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + movddup -14 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm7 + movddup -13 * SIZE(A1, LDA), %xmm11 + + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movddup -14 * SIZE(A2), %xmm8 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + movddup -13 * SIZE(A2), %xmm9 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movddup -14 * SIZE(A2, LDA), %xmm10 + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + movddup -13 * SIZE(A2, LDA), %xmm11 + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm5 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm7 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L17: + testq $1, M + je .L19 + + movddup -16 * SIZE(A1), %xmm8 + movddup -15 * SIZE(A1), %xmm9 + movddup -16 * SIZE(A1, LDA), %xmm10 + movddup -15 * SIZE(A1, LDA), %xmm11 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movddup -16 * SIZE(A2), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + movddup -15 * SIZE(A2), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movddup -16 * SIZE(A2, LDA), %xmm10 + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + movddup -15 * SIZE(A2, LDA), %xmm11 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm5 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm7 + ALIGN_3 + +.L19: + pcmpeqb %xmm13, %xmm13 + psllq $63, %xmm13 + shufps $0x40, %xmm13, %xmm13 + +#ifndef XCONJ + xorps %xmm13, %xmm1 + xorps %xmm13, %xmm3 + xorps %xmm13, %xmm5 + xorps %xmm13, %xmm7 +#else + xorps %xmm13, %xmm0 + xorps %xmm13, %xmm2 + xorps %xmm13, %xmm4 + xorps %xmm13, %xmm6 +#endif + + pshufd $0x4e, %xmm1, %xmm1 + pshufd $0x4e, %xmm3, %xmm3 + pshufd $0x4e, %xmm5, %xmm5 + pshufd $0x4e, %xmm7, %xmm7 + +#ifndef CONJ + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 +#else + subpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 +#endif + + pshufd $0xee, %xmm0, %xmm1 + movddup %xmm0, %xmm0 + pshufd $0xee, %xmm2, %xmm3 + movddup %xmm2, %xmm2 + pshufd $0xee, %xmm4, %xmm5 + movddup %xmm4, %xmm4 + pshufd $0xee, %xmm6, %xmm7 + movddup %xmm6, %xmm6 + + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm1 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm3 + mulpd ALPHA_R, %xmm4 + mulpd ALPHA_I, %xmm5 + mulpd ALPHA_R, %xmm6 + mulpd ALPHA_I, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm3 + movhpd 1 * SIZE(Y), %xmm3 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm5 + movhpd 1 * SIZE(Y), %xmm5 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm7 + movhpd 1 * SIZE(Y), %xmm7 + addq INCY, Y + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + movlpd %xmm2, 0 * SIZE(Y1) + movhpd %xmm2, 1 * SIZE(Y1) + addq INCY, Y1 + movlpd %xmm4, 0 * SIZE(Y1) + movhpd %xmm4, 1 * SIZE(Y1) + addq INCY, Y1 + movlpd %xmm6, 0 * SIZE(Y1) + movhpd %xmm6, 1 * SIZE(Y1) + addq INCY, Y1 + + cmpq $4, N + jge .L11 + ALIGN_3 + +.L20: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L30 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L21: +#endif + + subq $2, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifdef PREFETCHW + PREFETCHW 3 * SIZE(Y1) +#endif + + movq M, I + sarq $2, I + jle .L25 + + movddup -16 * SIZE(A1), %xmm8 + movddup -15 * SIZE(A1), %xmm9 + movddup -16 * SIZE(A1, LDA), %xmm10 + movddup -15 * SIZE(A1, LDA), %xmm11 + + decq I + jle .L24 + ALIGN_3 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movddup -14 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + movddup -13 * SIZE(A1), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movddup -14 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm3 + movddup -13 * SIZE(A1, LDA), %xmm11 + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movddup -12 * SIZE(A1), %xmm8 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + movddup -11 * SIZE(A1), %xmm9 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movddup -12 * SIZE(A1, LDA), %xmm10 + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm3 + movddup -11 * SIZE(A1, LDA), %xmm11 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movddup -10 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + movddup -9 * SIZE(A1), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movddup -10 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + MOVUPS_XL1( -8 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm3 + movddup -9 * SIZE(A1, LDA), %xmm11 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) +#endif + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movddup -8 * SIZE(A1), %xmm8 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + movddup -7 * SIZE(A1), %xmm9 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movddup -8 * SIZE(A1, LDA), %xmm10 + mulpd %xmm13, %xmm11 + MOVUPS_XL1( -6 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm3 + movddup -7 * SIZE(A1, LDA), %xmm11 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, X1 + + subq $1, I + BRANCH + jg .L23 + ALIGN_3 + +.L24: + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movddup -14 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + movddup -13 * SIZE(A1), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movddup -14 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm3 + movddup -13 * SIZE(A1, LDA), %xmm11 + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movddup -12 * SIZE(A1), %xmm8 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + movddup -11 * SIZE(A1), %xmm9 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movddup -12 * SIZE(A1, LDA), %xmm10 + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm3 + movddup -11 * SIZE(A1, LDA), %xmm11 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movddup -10 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + movddup -9 * SIZE(A1), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movddup -10 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + MOVUPS_XL1( -8 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm3 + movddup -9 * SIZE(A1, LDA), %xmm11 + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + mulpd %xmm13, %xmm11 + MOVUPS_XL1( -6 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm3 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, X1 + ALIGN_3 + +.L25: + testq $2, M + je .L27 + + movddup -16 * SIZE(A1), %xmm8 + movddup -15 * SIZE(A1), %xmm9 + movddup -16 * SIZE(A1, LDA), %xmm10 + movddup -15 * SIZE(A1, LDA), %xmm11 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movddup -14 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + movddup -13 * SIZE(A1), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movddup -14 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm3 + movddup -13 * SIZE(A1, LDA), %xmm11 + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L27: + testq $1, M + je .L29 + + movddup -16 * SIZE(A1), %xmm8 + movddup -15 * SIZE(A1), %xmm9 + movddup -16 * SIZE(A1, LDA), %xmm10 + movddup -15 * SIZE(A1, LDA), %xmm11 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + ALIGN_3 + +.L29: + pcmpeqb %xmm13, %xmm13 + psllq $63, %xmm13 + shufps $0x40, %xmm13, %xmm13 + +#ifndef XCONJ + xorps %xmm13, %xmm1 + xorps %xmm13, %xmm3 +#else + xorps %xmm13, %xmm0 + xorps %xmm13, %xmm2 +#endif + + pshufd $0x4e, %xmm1, %xmm1 + pshufd $0x4e, %xmm3, %xmm3 + +#ifndef CONJ + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 +#else + subpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 +#endif + + pshufd $0xee, %xmm0, %xmm1 + movddup %xmm0, %xmm0 + pshufd $0xee, %xmm2, %xmm3 + movddup %xmm2, %xmm2 + + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm1 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm3 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm3 + movhpd 1 * SIZE(Y), %xmm3 + addq INCY, Y + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + movlpd %xmm2, 0 * SIZE(Y1) + movhpd %xmm2, 1 * SIZE(Y1) + addq INCY, Y1 + +#if GEMV_UNROLL == 2 + cmpq $2, N + jge .L21 +#endif + ALIGN_3 + +.L30: +#endif + + cmpq $1, N + jl .L999 + +#if GEMV_UNROLL == 1 +.L31: + decq N +#endif + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 +#if GEMV_UNROLL == 1 + addq LDA, A +#endif + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + movq M, I + sarq $2, I + jle .L35 + + movddup -16 * SIZE(A1), %xmm8 + movddup -15 * SIZE(A1), %xmm9 + movddup -14 * SIZE(A1), %xmm10 + movddup -13 * SIZE(A1), %xmm11 + + decq I + jle .L34 + ALIGN_3 + +.L33: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movddup -12 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm9, %xmm1 + movddup -11 * SIZE(A1), %xmm9 + + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movddup -10 * SIZE(A1), %xmm10 + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm3 + movddup -9 * SIZE(A1), %xmm11 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) +#endif + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movddup -8 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + MOVUPS_XL1( -8 * SIZE, X1, %xmm12) + addpd %xmm9, %xmm1 + movddup -7 * SIZE(A1), %xmm9 + + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movddup -6 * SIZE(A1), %xmm10 + mulpd %xmm13, %xmm11 + MOVUPS_XL1( -6 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm3 + movddup -5 * SIZE(A1), %xmm11 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, X1 + + subq $1, I + BRANCH + jg .L33 + ALIGN_3 + +.L34: + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movddup -12 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm9, %xmm1 + movddup -11 * SIZE(A1), %xmm9 + + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movddup -10 * SIZE(A1), %xmm10 + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm3 + movddup -9 * SIZE(A1), %xmm11 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm12, %xmm9 + MOVUPS_XL1( -8 * SIZE, X1, %xmm12) + addpd %xmm9, %xmm1 + + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + mulpd %xmm13, %xmm11 + MOVUPS_XL1( -6 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm3 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, X1 + ALIGN_3 + +.L35: + testq $2, M + je .L37 + + movddup -16 * SIZE(A1), %xmm8 + movddup -15 * SIZE(A1), %xmm9 + movddup -14 * SIZE(A1), %xmm10 + movddup -13 * SIZE(A1), %xmm11 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + + addq $4 * SIZE, A1 + ALIGN_3 + +.L37: + testq $1, M + je .L39 + + movddup -16 * SIZE(A1), %xmm8 + movddup -15 * SIZE(A1), %xmm9 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + ALIGN_3 + +.L39: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + pcmpeqb %xmm13, %xmm13 + psllq $63, %xmm13 + shufps $0x40, %xmm13, %xmm13 + +#ifndef XCONJ + xorps %xmm13, %xmm1 +#else + xorps %xmm13, %xmm0 +#endif + + pshufd $0x4e, %xmm1, %xmm1 + +#ifndef CONJ + addpd %xmm1, %xmm0 +#else + subpd %xmm1, %xmm0 +#endif + + pshufd $0xee, %xmm0, %xmm1 + movddup %xmm0, %xmm0 + + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm1 + + addpd %xmm1, %xmm0 + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + + addpd %xmm1, %xmm0 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + +#if GEMV_UNROLL == 1 + addq INCY, Y + addq INCY, Y1 + + cmpq $1, N + jge .L31 +#endif + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/znrm2.S b/kernel/x86_64/znrm2.S new file mode 100644 index 0000000..9502626 --- /dev/null +++ b/kernel/x86_64/znrm2.S @@ -0,0 +1,208 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + fldz + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + salq $ZBASE_SHIFT, INCX + + fldz + fldz + fldz + cmpq $SIZE * 2, INCX + jne .L40 + + movq M, I + sarq $2, I + jle .L20 + ALIGN_4 + +.L10: +#if defined(PREFETCH) + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + fmul %st(0), %st + FLD 2 * SIZE(X) + fmul %st(0), %st + FLD 3 * SIZE(X) + fmul %st(0), %st + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 4 * SIZE(X) + fmul %st(0), %st + FLD 5 * SIZE(X) + fmul %st(0), %st + FLD 6 * SIZE(X) + fmul %st(0), %st + FLD 7 * SIZE(X) + fmul %st(0), %st + + addq $8 * SIZE, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decq I + jg .L10 + ALIGN_4 + +.L20: + andq $3, M + jle .L998 + ALIGN_4 + + +.L21: + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + fmul %st(0), %st + faddp %st,%st(3) + faddp %st,%st(1) + addq $2 * SIZE, X + decq M + jg .L21 + jmp .L998 + ALIGN_4 + +.L40: + movq M, I + sarq $2, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + addq INCX, X + fmul %st(0), %st + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + addq INCX, X + fmul %st(0), %st + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + addq INCX, X + fmul %st(0), %st + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + addq INCX, X + fmul %st(0), %st + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decq I + jg .L50 + ALIGN_4 + +.L60: + andq $3, M + jle .L998 + ALIGN_4 + + +.L61: + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + addq INCX, X + fmul %st(0), %st + faddp %st,%st(3) + faddp %st,%st(1) + decq M + jg .L61 + ALIGN_4 + +.L998: + faddp %st,%st(2) + faddp %st,%st(1) + faddp %st,%st(1) + ALIGN_4 + +.L999: + fsqrt +#ifndef XDOUBLE + subq $2 * SIZE, %rsp + FST (%rsp) + MOVSD (%rsp), %xmm0 + add $2 * SIZE, %rsp +#endif + ret + + EPILOGUE + diff --git a/kernel/x86_64/znrm2_sse.S b/kernel/x86_64/znrm2_sse.S new file mode 100644 index 0000000..005536a --- /dev/null +++ b/kernel/x86_64/znrm2_sse.S @@ -0,0 +1,387 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax +#define FLAG %r10 + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + pxor %xmm0, %xmm0 + testq M, M + jle .L999 + pxor %xmm1, %xmm1 + testq INCX, INCX + jle .L999 + + xorq FLAG, FLAG + + pxor %xmm2, %xmm2 + leaq (, INCX, 2 * SIZE), INCX + pxor %xmm3, %xmm3 + cmpq $2 * SIZE, INCX + jne .L40 + + testq $SIZE, X + je .L05 + + movss (X), %xmm4 + cvtss2sd %xmm4, %xmm6 + mulsd %xmm6, %xmm6 + addsd %xmm6, %xmm3 + addq $SIZE, X + movq $1, FLAG + decq M + jle .L19 + ALIGN_3 + +.L05: + movq M, I + sarq $3, I + jle .L14 + + movsd 0 * SIZE(X), %xmm4 + movsd 2 * SIZE(X), %xmm5 + movsd 4 * SIZE(X), %xmm6 + movsd 6 * SIZE(X), %xmm7 + movsd 8 * SIZE(X), %xmm8 + movsd 10 * SIZE(X), %xmm9 + movsd 12 * SIZE(X), %xmm10 + movsd 14 * SIZE(X), %xmm11 + + addq $16 * SIZE, X + decq I + jle .L12 + ALIGN_3 + +.L10: +#if defined(PREFETCH) + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + cvtps2pd %xmm4, %xmm12 + cvtps2pd %xmm5, %xmm13 + cvtps2pd %xmm6, %xmm14 + cvtps2pd %xmm7, %xmm15 + + movsd 0 * SIZE(X), %xmm4 + movsd 2 * SIZE(X), %xmm5 + movsd 4 * SIZE(X), %xmm6 + movsd 6 * SIZE(X), %xmm7 + + mulpd %xmm12, %xmm12 + mulpd %xmm13, %xmm13 + mulpd %xmm14, %xmm14 + mulpd %xmm15, %xmm15 + + addpd %xmm12, %xmm0 + addpd %xmm13, %xmm1 + addpd %xmm14, %xmm2 + addpd %xmm15, %xmm3 + + cvtps2pd %xmm8, %xmm12 + cvtps2pd %xmm9, %xmm13 + cvtps2pd %xmm10, %xmm14 + cvtps2pd %xmm11, %xmm15 + + movsd 8 * SIZE(X), %xmm8 + movsd 10 * SIZE(X), %xmm9 + movsd 12 * SIZE(X), %xmm10 + movsd 14 * SIZE(X), %xmm11 + + mulpd %xmm12, %xmm12 + mulpd %xmm13, %xmm13 + mulpd %xmm14, %xmm14 + mulpd %xmm15, %xmm15 + + addpd %xmm12, %xmm0 + addpd %xmm13, %xmm1 + addpd %xmm14, %xmm2 + addpd %xmm15, %xmm3 + + subq $-16 * SIZE, X + decq I + jg .L10 + ALIGN_3 + +.L12: + cvtps2pd %xmm4, %xmm12 + cvtps2pd %xmm5, %xmm13 + cvtps2pd %xmm6, %xmm14 + cvtps2pd %xmm7, %xmm15 + + mulpd %xmm12, %xmm12 + mulpd %xmm13, %xmm13 + mulpd %xmm14, %xmm14 + mulpd %xmm15, %xmm15 + + addpd %xmm12, %xmm0 + addpd %xmm13, %xmm1 + addpd %xmm14, %xmm2 + addpd %xmm15, %xmm3 + + cvtps2pd %xmm8, %xmm12 + cvtps2pd %xmm9, %xmm13 + cvtps2pd %xmm10, %xmm14 + cvtps2pd %xmm11, %xmm15 + + mulpd %xmm12, %xmm12 + mulpd %xmm13, %xmm13 + mulpd %xmm14, %xmm14 + mulpd %xmm15, %xmm15 + + addpd %xmm12, %xmm0 + addpd %xmm13, %xmm1 + addpd %xmm14, %xmm2 + addpd %xmm15, %xmm3 + ALIGN_3 + + +.L14: + testq $4, M + je .L15 + + movsd 0 * SIZE(X), %xmm4 + movsd 2 * SIZE(X), %xmm5 + movsd 4 * SIZE(X), %xmm6 + movsd 6 * SIZE(X), %xmm7 + + cvtps2pd %xmm4, %xmm8 + cvtps2pd %xmm5, %xmm9 + cvtps2pd %xmm6, %xmm10 + cvtps2pd %xmm7, %xmm11 + + mulpd %xmm8, %xmm8 + mulpd %xmm9, %xmm9 + mulpd %xmm10, %xmm10 + mulpd %xmm11, %xmm11 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm1 + addpd %xmm10, %xmm2 + addpd %xmm11, %xmm3 + + addq $8 * SIZE, X + ALIGN_3 + +.L15: + testq $2, M + je .L16 + + movsd 0 * SIZE(X), %xmm4 + movsd 2 * SIZE(X), %xmm5 + cvtps2pd %xmm4, %xmm8 + cvtps2pd %xmm5, %xmm9 + mulpd %xmm8, %xmm8 + mulpd %xmm9, %xmm9 + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm1 + addq $4 * SIZE, X + ALIGN_3 + +.L16: + testq $1, M + je .L19 + + movsd (X), %xmm4 + cvtps2pd %xmm4, %xmm6 + mulpd %xmm6, %xmm6 + addpd %xmm6, %xmm2 + addq $2 * SIZE, X + ALIGN_3 + +.L19: + testq FLAG, FLAG + je .L998 + + movss (X), %xmm4 + cvtss2sd %xmm4, %xmm6 + mulsd %xmm6, %xmm6 + addsd %xmm6, %xmm3 + jmp .L998 + ALIGN_4 + +.L40: + movq M, I + sarq $3, I + jle .L44 + ALIGN_4 + +.L41: + movsd (X), %xmm4 + addq INCX, X + movsd (X), %xmm5 + addq INCX, X + movsd (X), %xmm6 + addq INCX, X + movsd (X), %xmm7 + addq INCX, X + + movsd (X), %xmm8 + addq INCX, X + movsd (X), %xmm9 + addq INCX, X + movsd (X), %xmm10 + addq INCX, X + movsd (X), %xmm11 + addq INCX, X + + cvtps2pd %xmm4, %xmm4 + cvtps2pd %xmm5, %xmm5 + cvtps2pd %xmm6, %xmm6 + cvtps2pd %xmm7, %xmm7 + cvtps2pd %xmm8, %xmm8 + cvtps2pd %xmm9, %xmm9 + cvtps2pd %xmm10, %xmm10 + cvtps2pd %xmm11, %xmm11 + + mulpd %xmm4, %xmm4 + mulpd %xmm5, %xmm5 + mulpd %xmm6, %xmm6 + mulpd %xmm7, %xmm7 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + addpd %xmm6, %xmm2 + addpd %xmm7, %xmm3 + + mulpd %xmm8, %xmm8 + mulpd %xmm9, %xmm9 + mulpd %xmm10, %xmm10 + mulpd %xmm11, %xmm11 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm1 + addpd %xmm10, %xmm2 + addpd %xmm11, %xmm3 + + decq I + jg .L41 + ALIGN_3 + +.L44: + testq $4, M + je .L45 + + movsd (X), %xmm4 + addq INCX, X + movsd (X), %xmm5 + addq INCX, X + movsd (X), %xmm6 + addq INCX, X + movsd (X), %xmm7 + addq INCX, X + + cvtps2pd %xmm4, %xmm8 + cvtps2pd %xmm5, %xmm9 + cvtps2pd %xmm6, %xmm10 + cvtps2pd %xmm7, %xmm11 + + mulpd %xmm8, %xmm8 + mulpd %xmm9, %xmm9 + mulpd %xmm10, %xmm10 + mulpd %xmm11, %xmm11 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm1 + addpd %xmm10, %xmm2 + addpd %xmm11, %xmm3 + ALIGN_3 + +.L45: + testq $2, M + je .L46 + + movsd (X), %xmm4 + addq INCX, X + movsd (X), %xmm5 + addq INCX, X + + cvtps2pd %xmm4, %xmm6 + cvtps2pd %xmm5, %xmm7 + mulpd %xmm6, %xmm6 + mulpd %xmm7, %xmm7 + addpd %xmm6, %xmm0 + addpd %xmm7, %xmm1 + ALIGN_3 + +.L46: + testq $1, M + je .L998 + + movsd (X), %xmm4 + cvtps2pd %xmm4, %xmm6 + mulpd %xmm6, %xmm6 + addpd %xmm6, %xmm3 + ALIGN_4 + +.L998: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#ifndef HAVE_SSE3 + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + addsd %xmm1, %xmm0 +#else + haddpd %xmm0, %xmm0 +#endif + ALIGN_4 + +.L999: + sqrtsd %xmm0, %xmm0 + cvtsd2ss %xmm0, %xmm0 + + RESTOREREGISTERS + + ret + + EPILOGUE + diff --git a/kernel/x86_64/zrot.S b/kernel/x86_64/zrot.S new file mode 100644 index 0000000..d645d6f --- /dev/null +++ b/kernel/x86_64/zrot.S @@ -0,0 +1,367 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 48(%rsp), INCY + FLD 72(%rsp) + FLD 56(%rsp) +#else + FLD 24(%rsp) + FLD 8(%rsp) +#endif + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + testq N, N + jle .L999 + + cmpq $2 * SIZE, INCX + jne .L50 + cmpq $2 * SIZE, INCY + jne .L50 + + movq N, I + sarq $1, I + jle .L15 + ALIGN_4 + +.L10: +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 1 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 1 * SIZE(Y) + +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + FLD 2 * SIZE(X) + FLD 2 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 2 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 2 * SIZE(Y) + + FLD 3 * SIZE(X) + FLD 3 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 3 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 3 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + + decq I + jg .L10 + ALIGN_4 + +.L15: + movq N, I + andq $1, I + jle .L999 + ALIGN_4 + +.L16: + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 1 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 1 * SIZE(Y) + jmp .L999 + ALIGN_4 + +.L50: + movq N, I + sarq $1, I + jle .L55 + ALIGN_4 + +.L51: + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 1 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 1 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 1 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 1 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + decq I + jg .L51 + ALIGN_4 + +.L55: + movq N, I + andq $1, I + jle .L999 + ALIGN_4 + +.L56: + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 1 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 1 * SIZE(Y) + ALIGN_4 + +.L999: + ffreep %st + ffreep %st + ret + + EPILOGUE diff --git a/kernel/x86_64/zrot_sse.S b/kernel/x86_64/zrot_sse.S new file mode 100644 index 0000000..4aa0e72 --- /dev/null +++ b/kernel/x86_64/zrot_sse.S @@ -0,0 +1,1622 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#define C %xmm14 +#define S %xmm15 + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY + movss 48(%rsp), %xmm0 + movss 56(%rsp), %xmm1 +#endif + + SAVEREGISTERS + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + pshufd $0x0, %xmm0, C + pshufd $0x0, %xmm1, S + + cmpq $0, N + jle .L999 + + cmpq $2 * SIZE, INCX + jne .L50 + cmpq $2 * SIZE, INCY + jne .L50 + + testq $2 * SIZE, X + je .L10 + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + addq $2 * SIZE, X + addq $2 * SIZE, Y + decq N + jle .L999 + +.L10: + testq $1 * SIZE, X + jne .L30 + + testq $3 * SIZE, Y + jne .L20 + + movq N, %rax + sarq $4, %rax + jle .L14 + + movaps 0 * SIZE(Y), %xmm1 + movaps 4 * SIZE(Y), %xmm3 + movaps 8 * SIZE(Y), %xmm9 + movaps 12 * SIZE(Y), %xmm11 + + movaps 0 * SIZE(X), %xmm0 + movaps 4 * SIZE(X), %xmm2 + movaps 8 * SIZE(X), %xmm8 + movaps 12 * SIZE(X), %xmm10 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + movaps 16 * SIZE(Y), %xmm1 + addps %xmm3, %xmm2 + movaps 20 * SIZE(Y), %xmm3 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps %xmm0, 0 * SIZE(X) + movaps 16 * SIZE(X), %xmm0 + movaps %xmm2, 4 * SIZE(X) + movaps 20 * SIZE(X), %xmm2 + movaps %xmm4, 0 * SIZE(Y) + movaps %xmm6, 4 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + movaps 24 * SIZE(Y), %xmm9 + addps %xmm11, %xmm10 + movaps 28 * SIZE(Y), %xmm11 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm8, 8 * SIZE(X) + movaps 24 * SIZE(X), %xmm8 + movaps %xmm10,12 * SIZE(X) + movaps 28 * SIZE(X), %xmm10 + movaps %xmm4, 8 * SIZE(Y) + movaps %xmm6, 12 * SIZE(Y) + +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + movaps 32 * SIZE(Y), %xmm1 + addps %xmm3, %xmm2 + movaps 36 * SIZE(Y), %xmm3 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 16 * SIZE(X) + movaps 32 * SIZE(X), %xmm0 + movaps %xmm2, 20 * SIZE(X) + movaps 36 * SIZE(X), %xmm2 + movaps %xmm4, 16 * SIZE(Y) + movaps %xmm6, 20 * SIZE(Y) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + movaps 40 * SIZE(Y), %xmm9 + addps %xmm11, %xmm10 + movaps 44 * SIZE(Y), %xmm11 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm8, 24 * SIZE(X) + movaps 40 * SIZE(X), %xmm8 + movaps %xmm10, 28 * SIZE(X) + movaps 44 * SIZE(X), %xmm10 + movaps %xmm4, 24 * SIZE(Y) + movaps %xmm6, 28 * SIZE(Y) + + addq $32 * SIZE, X + addq $32 * SIZE, Y + + decq %rax + jg .L11 + ALIGN_3 + +.L12: + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + movaps 16 * SIZE(Y), %xmm1 + addps %xmm3, %xmm2 + movaps 20 * SIZE(Y), %xmm3 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps 16 * SIZE(X), %xmm0 + movaps %xmm2, 4 * SIZE(X) + movaps 20 * SIZE(X), %xmm2 + + movaps %xmm4, 0 * SIZE(Y) + movaps %xmm6, 4 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + movaps 24 * SIZE(Y), %xmm9 + addps %xmm11, %xmm10 + movaps 28 * SIZE(Y), %xmm11 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm8, 8 * SIZE(X) + movaps 24 * SIZE(X), %xmm8 + movaps %xmm10,12 * SIZE(X) + movaps 28 * SIZE(X), %xmm10 + movaps %xmm4, 8 * SIZE(Y) + movaps %xmm6, 12 * SIZE(Y) + + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 16 * SIZE(X) + movaps %xmm2, 20 * SIZE(X) + movaps %xmm4, 16 * SIZE(Y) + movaps %xmm6, 20 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm8, 24 * SIZE(X) + movaps %xmm10, 28 * SIZE(X) + movaps %xmm4, 24 * SIZE(Y) + movaps %xmm6, 28 * SIZE(Y) + + addq $32 * SIZE, X + addq $32 * SIZE, Y + ALIGN_3 + +.L14: + testq $15, N + jle .L999 + + testq $8, N + jle .L15 + + movaps 0 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + movaps 4 * SIZE(Y), %xmm3 + movaps 4 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 4 * SIZE(X) + movaps %xmm4, 0 * SIZE(Y) + movaps %xmm6, 4 * SIZE(Y) + + movaps 8 * SIZE(Y), %xmm1 + movaps 8 * SIZE(X), %xmm0 + movaps 12 * SIZE(Y), %xmm3 + movaps 12 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 8 * SIZE(X) + movaps %xmm2, 12 * SIZE(X) + movaps %xmm4, 8 * SIZE(Y) + movaps %xmm6, 12 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L15: + testq $4, N + jle .L16 + + movaps 0 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + movaps 4 * SIZE(Y), %xmm3 + movaps 4 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 4 * SIZE(X) + movaps %xmm4, 0 * SIZE(Y) + movaps %xmm6, 4 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L16: + testq $2, N + jle .L17 + + movaps 0 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 0 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L17: + testq $1, N + jle .L999 + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L20: + movq N, %rax + sarq $4, %rax + jle .L24 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movsd 4 * SIZE(Y), %xmm3 + movhps 6 * SIZE(Y), %xmm3 + movsd 8 * SIZE(Y), %xmm9 + movhps 10 * SIZE(Y), %xmm9 + movsd 12 * SIZE(Y), %xmm11 + movhps 14 * SIZE(Y), %xmm11 + + movaps 0 * SIZE(X), %xmm0 + movaps 4 * SIZE(X), %xmm2 + movaps 8 * SIZE(X), %xmm8 + movaps 12 * SIZE(X), %xmm10 + + decq %rax + jle .L22 + ALIGN_3 + +.L21: +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + movsd 16 * SIZE(Y), %xmm1 + movhps 18 * SIZE(Y), %xmm1 + addps %xmm3, %xmm2 + movsd 20 * SIZE(Y), %xmm3 + movhps 22 * SIZE(Y), %xmm3 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps %xmm0, 0 * SIZE(X) + movaps 16 * SIZE(X), %xmm0 + movaps %xmm2, 4 * SIZE(X) + movaps 20 * SIZE(X), %xmm2 + movlps %xmm4, 0 * SIZE(Y) + movhps %xmm4, 2 * SIZE(Y) + movlps %xmm6, 4 * SIZE(Y) + movhps %xmm6, 6 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + movsd 24 * SIZE(Y), %xmm9 + movhps 26 * SIZE(Y), %xmm9 + addps %xmm11, %xmm10 + movsd 28 * SIZE(Y), %xmm11 + movhps 30 * SIZE(Y), %xmm11 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm8, 8 * SIZE(X) + movaps 24 * SIZE(X), %xmm8 + movaps %xmm10,12 * SIZE(X) + movaps 28 * SIZE(X), %xmm10 + movlps %xmm4, 8 * SIZE(Y) + movhps %xmm4, 10 * SIZE(Y) + movlps %xmm6, 12 * SIZE(Y) + movhps %xmm6, 14 * SIZE(Y) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + movsd 32 * SIZE(Y), %xmm1 + movhps 34 * SIZE(Y), %xmm1 + addps %xmm3, %xmm2 + movsd 36 * SIZE(Y), %xmm3 + movhps 38 * SIZE(Y), %xmm3 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 16 * SIZE(X) + movaps 32 * SIZE(X), %xmm0 + movaps %xmm2, 20 * SIZE(X) + movaps 36 * SIZE(X), %xmm2 + movlps %xmm4, 16 * SIZE(Y) + movhps %xmm4, 18 * SIZE(Y) + movlps %xmm6, 20 * SIZE(Y) + movhps %xmm6, 22 * SIZE(Y) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + movsd 40 * SIZE(Y), %xmm9 + movhps 42 * SIZE(Y), %xmm9 + addps %xmm11, %xmm10 + movsd 44 * SIZE(Y), %xmm11 + movhps 46 * SIZE(Y), %xmm11 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm8, 24 * SIZE(X) + movaps 40 * SIZE(X), %xmm8 + movaps %xmm10, 28 * SIZE(X) + movaps 44 * SIZE(X), %xmm10 + movlps %xmm4, 24 * SIZE(Y) + movhps %xmm4, 26 * SIZE(Y) + movlps %xmm6, 28 * SIZE(Y) + movhps %xmm6, 30 * SIZE(Y) + + addq $32 * SIZE, X + addq $32 * SIZE, Y + + decq %rax + jg .L21 + ALIGN_3 + +.L22: + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + movsd 16 * SIZE(Y), %xmm1 + movhps 18 * SIZE(Y), %xmm1 + addps %xmm3, %xmm2 + movsd 20 * SIZE(Y), %xmm3 + movhps 22 * SIZE(Y), %xmm3 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps 16 * SIZE(X), %xmm0 + movaps %xmm2, 4 * SIZE(X) + movaps 20 * SIZE(X), %xmm2 + + movsd %xmm4, 0 * SIZE(Y) + movhps %xmm4, 2 * SIZE(Y) + movsd %xmm6, 4 * SIZE(Y) + movhps %xmm6, 6 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + movsd 24 * SIZE(Y), %xmm9 + movhps 26 * SIZE(Y), %xmm9 + addps %xmm11, %xmm10 + movsd 28 * SIZE(Y), %xmm11 + movhps 30 * SIZE(Y), %xmm11 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm8, 8 * SIZE(X) + movaps 24 * SIZE(X), %xmm8 + movaps %xmm10,12 * SIZE(X) + movaps 28 * SIZE(X), %xmm10 + movlps %xmm4, 8 * SIZE(Y) + movhps %xmm4, 10 * SIZE(Y) + movlps %xmm6, 12 * SIZE(Y) + movhps %xmm6, 14 * SIZE(Y) + + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 16 * SIZE(X) + movaps %xmm2, 20 * SIZE(X) + movlps %xmm4, 16 * SIZE(Y) + movhps %xmm4, 18 * SIZE(Y) + movlps %xmm6, 20 * SIZE(Y) + movhps %xmm6, 22 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm8, 24 * SIZE(X) + movaps %xmm10, 28 * SIZE(X) + movlps %xmm4, 24 * SIZE(Y) + movhps %xmm4, 26 * SIZE(Y) + movlps %xmm6, 28 * SIZE(Y) + movhps %xmm6, 30 * SIZE(Y) + + addq $32 * SIZE, X + addq $32 * SIZE, Y + ALIGN_3 + +.L24: + testq $15, N + jle .L999 + + testq $8, N + jle .L25 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + movsd 4 * SIZE(Y), %xmm3 + movhps 6 * SIZE(Y), %xmm3 + movaps 4 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 4 * SIZE(X) + movlps %xmm4, 0 * SIZE(Y) + movhps %xmm4, 2 * SIZE(Y) + movlps %xmm6, 4 * SIZE(Y) + movhps %xmm6, 6 * SIZE(Y) + + movsd 8 * SIZE(Y), %xmm1 + movhps 10 * SIZE(Y), %xmm1 + movaps 8 * SIZE(X), %xmm0 + movsd 12 * SIZE(Y), %xmm3 + movhps 14 * SIZE(Y), %xmm3 + movaps 12 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 8 * SIZE(X) + movaps %xmm2, 12 * SIZE(X) + movlps %xmm4, 8 * SIZE(Y) + movhps %xmm4, 10 * SIZE(Y) + movlps %xmm6, 12 * SIZE(Y) + movhps %xmm6, 14 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L25: + testq $4, N + jle .L26 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + movsd 4 * SIZE(Y), %xmm3 + movhps 6 * SIZE(Y), %xmm3 + movaps 4 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 4 * SIZE(X) + movlps %xmm4, 0 * SIZE(Y) + movhps %xmm4, 2 * SIZE(Y) + movlps %xmm6, 4 * SIZE(Y) + movhps %xmm6, 6 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L26: + testq $2, N + jle .L27 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L27: + testq $1, N + jle .L999 + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L30: + movq N, %rax + sarq $4, %rax + jle .L34 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movsd 4 * SIZE(Y), %xmm3 + movhps 6 * SIZE(Y), %xmm3 + movsd 8 * SIZE(Y), %xmm9 + movhps 10 * SIZE(Y), %xmm9 + movsd 12 * SIZE(Y), %xmm11 + movhps 14 * SIZE(Y), %xmm11 + + movsd 0 * SIZE(X), %xmm0 + movhps 2 * SIZE(X), %xmm0 + movsd 4 * SIZE(X), %xmm2 + movhps 6 * SIZE(X), %xmm2 + movsd 8 * SIZE(X), %xmm8 + movhps 10 * SIZE(X), %xmm8 + movsd 12 * SIZE(X), %xmm10 + movhps 14 * SIZE(X), %xmm10 + + decq %rax + jle .L32 + ALIGN_3 + +.L31: +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + movsd 16 * SIZE(Y), %xmm1 + movhps 18 * SIZE(Y), %xmm1 + addps %xmm3, %xmm2 + movsd 20 * SIZE(Y), %xmm3 + movhps 22 * SIZE(Y), %xmm3 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 2 * SIZE(X) + movsd 16 * SIZE(X), %xmm0 + movhps 18 * SIZE(X), %xmm0 + movlps %xmm2, 4 * SIZE(X) + movhps %xmm2, 6 * SIZE(X) + movsd 20 * SIZE(X), %xmm2 + movhps 22 * SIZE(X), %xmm2 + movlps %xmm4, 0 * SIZE(Y) + movhps %xmm4, 2 * SIZE(Y) + movlps %xmm6, 4 * SIZE(Y) + movhps %xmm6, 6 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + movsd 24 * SIZE(Y), %xmm9 + movhps 26 * SIZE(Y), %xmm9 + addps %xmm11, %xmm10 + movsd 28 * SIZE(Y), %xmm11 + movhps 30 * SIZE(Y), %xmm11 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movlps %xmm8, 8 * SIZE(X) + movhps %xmm8, 10 * SIZE(X) + movsd 24 * SIZE(X), %xmm8 + movhps 26 * SIZE(X), %xmm8 + movlps %xmm10, 12 * SIZE(X) + movhps %xmm10, 14 * SIZE(X) + movsd 28 * SIZE(X), %xmm10 + movhps 30 * SIZE(X), %xmm10 + movlps %xmm4, 8 * SIZE(Y) + movhps %xmm4, 10 * SIZE(Y) + movlps %xmm6, 12 * SIZE(Y) + movhps %xmm6, 14 * SIZE(Y) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + movsd 32 * SIZE(Y), %xmm1 + movhps 34 * SIZE(Y), %xmm1 + addps %xmm3, %xmm2 + movsd 36 * SIZE(Y), %xmm3 + movhps 38 * SIZE(Y), %xmm3 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movlps %xmm0, 16 * SIZE(X) + movhps %xmm0, 18 * SIZE(X) + movsd 32 * SIZE(X), %xmm0 + movhps 34 * SIZE(X), %xmm0 + movlps %xmm2, 20 * SIZE(X) + movhps %xmm2, 22 * SIZE(X) + movsd 36 * SIZE(X), %xmm2 + movhps 38 * SIZE(X), %xmm2 + movlps %xmm4, 16 * SIZE(Y) + movhps %xmm4, 18 * SIZE(Y) + movlps %xmm6, 20 * SIZE(Y) + movhps %xmm6, 22 * SIZE(Y) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + movsd 40 * SIZE(Y), %xmm9 + movhps 42 * SIZE(Y), %xmm9 + addps %xmm11, %xmm10 + movsd 44 * SIZE(Y), %xmm11 + movhps 46 * SIZE(Y), %xmm11 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movlps %xmm8, 24 * SIZE(X) + movhps %xmm8, 26 * SIZE(X) + movsd 40 * SIZE(X), %xmm8 + movhps 42 * SIZE(X), %xmm8 + movlps %xmm10, 28 * SIZE(X) + movhps %xmm10, 30 * SIZE(X) + movsd 44 * SIZE(X), %xmm10 + movhps 46 * SIZE(X), %xmm10 + movlps %xmm4, 24 * SIZE(Y) + movhps %xmm4, 26 * SIZE(Y) + movlps %xmm6, 28 * SIZE(Y) + movhps %xmm6, 30 * SIZE(Y) + + addq $32 * SIZE, X + addq $32 * SIZE, Y + + decq %rax + jg .L31 + ALIGN_3 + +.L32: + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + movsd 16 * SIZE(Y), %xmm1 + movhps 18 * SIZE(Y), %xmm1 + addps %xmm3, %xmm2 + movsd 20 * SIZE(Y), %xmm3 + movhps 22 * SIZE(Y), %xmm3 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 2 * SIZE(X) + movsd 16 * SIZE(X), %xmm0 + movhps 18 * SIZE(X), %xmm0 + movlps %xmm2, 4 * SIZE(X) + movhps %xmm2, 6 * SIZE(X) + movsd 20 * SIZE(X), %xmm2 + movhps 22 * SIZE(X), %xmm2 + + movsd %xmm4, 0 * SIZE(Y) + movhps %xmm4, 2 * SIZE(Y) + movsd %xmm6, 4 * SIZE(Y) + movhps %xmm6, 6 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + movsd 24 * SIZE(Y), %xmm9 + movhps 26 * SIZE(Y), %xmm9 + addps %xmm11, %xmm10 + movsd 28 * SIZE(Y), %xmm11 + movhps 30 * SIZE(Y), %xmm11 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movlps %xmm8, 8 * SIZE(X) + movhps %xmm8, 10 * SIZE(X) + movsd 24 * SIZE(X), %xmm8 + movhps 26 * SIZE(X), %xmm8 + movlps %xmm10, 12 * SIZE(X) + movhps %xmm10, 14 * SIZE(X) + movsd 28 * SIZE(X), %xmm10 + movhps 30 * SIZE(X), %xmm10 + movlps %xmm4, 8 * SIZE(Y) + movhps %xmm4, 10 * SIZE(Y) + movlps %xmm6, 12 * SIZE(Y) + movhps %xmm6, 14 * SIZE(Y) + + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movlps %xmm0, 16 * SIZE(X) + movhps %xmm0, 18 * SIZE(X) + movlps %xmm2, 20 * SIZE(X) + movhps %xmm2, 22 * SIZE(X) + movlps %xmm4, 16 * SIZE(Y) + movhps %xmm4, 18 * SIZE(Y) + movlps %xmm6, 20 * SIZE(Y) + movhps %xmm6, 22 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movlps %xmm8, 24 * SIZE(X) + movhps %xmm8, 26 * SIZE(X) + movlps %xmm10, 28 * SIZE(X) + movhps %xmm10, 30 * SIZE(X) + movlps %xmm4, 24 * SIZE(Y) + movhps %xmm4, 26 * SIZE(Y) + movlps %xmm6, 28 * SIZE(Y) + movhps %xmm6, 30 * SIZE(Y) + + addq $32 * SIZE, X + addq $32 * SIZE, Y + ALIGN_3 + +.L34: + testq $15, N + jle .L999 + + testq $8, N + jle .L35 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhps 2 * SIZE(X), %xmm0 + movsd 4 * SIZE(Y), %xmm3 + movhps 6 * SIZE(Y), %xmm3 + movsd 4 * SIZE(X), %xmm2 + movhps 6 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 2 * SIZE(X) + movlps %xmm2, 4 * SIZE(X) + movhps %xmm2, 6 * SIZE(X) + movlps %xmm4, 0 * SIZE(Y) + movhps %xmm4, 2 * SIZE(Y) + movlps %xmm6, 4 * SIZE(Y) + movhps %xmm6, 6 * SIZE(Y) + + movsd 8 * SIZE(Y), %xmm1 + movhps 10 * SIZE(Y), %xmm1 + movsd 8 * SIZE(X), %xmm0 + movhps 10 * SIZE(X), %xmm0 + movsd 12 * SIZE(Y), %xmm3 + movhps 14 * SIZE(Y), %xmm3 + movsd 12 * SIZE(X), %xmm2 + movhps 14 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movlps %xmm0, 8 * SIZE(X) + movhps %xmm0, 10 * SIZE(X) + movlps %xmm2, 12 * SIZE(X) + movhps %xmm2, 14 * SIZE(X) + movlps %xmm4, 8 * SIZE(Y) + movhps %xmm4, 10 * SIZE(Y) + movlps %xmm6, 12 * SIZE(Y) + movhps %xmm6, 14 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L35: + testq $4, N + jle .L36 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhps 2 * SIZE(X), %xmm0 + movsd 4 * SIZE(Y), %xmm3 + movhps 6 * SIZE(Y), %xmm3 + movsd 4 * SIZE(X), %xmm2 + movhps 6 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 2 * SIZE(X) + movlps %xmm2, 4 * SIZE(X) + movhps %xmm2, 6 * SIZE(X) + movlps %xmm4, 0 * SIZE(Y) + movhps %xmm4, 2 * SIZE(Y) + movlps %xmm6, 4 * SIZE(Y) + movhps %xmm6, 6 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L36: + testq $2, N + jle .L37 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhps 2 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 2 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L37: + testq $1, N + jle .L999 + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + ALIGN_3 + +.L50: + movq N, %rax + sarq $2, %rax + jle .L55 + ALIGN_3 + +.L53: + movsd (Y), %xmm1 + movhps (Y, INCY), %xmm1 + movsd (X), %xmm0 + movhps (X, INCX), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, (X) + movhps %xmm0, (X, INCX) + movlps %xmm2, (Y) + movhps %xmm2, (Y, INCY) + + leaq (X, INCX, 2), X + leaq (Y, INCY, 2), Y + + movsd (Y), %xmm1 + movhps (Y, INCY), %xmm1 + movsd (X), %xmm0 + movhps (X, INCX), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, (X) + movhps %xmm0, (X, INCX) + movlps %xmm2, (Y) + movhps %xmm2, (Y, INCY) + + leaq (X, INCX, 2), X + leaq (Y, INCY, 2), Y + + decq %rax + jg .L53 + ALIGN_3 + +.L55: + movq N, %rax + andq $3, %rax + jle .L999 + ALIGN_3 + +.L56: + movsd (Y), %xmm1 + movsd (X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, (X) + movlps %xmm2, (Y) + + addq INCX, X + addq INCY, Y + + decq %rax + jg .L56 + ALIGN_3 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/zrot_sse2.S b/kernel/x86_64/zrot_sse2.S new file mode 100644 index 0000000..3681018 --- /dev/null +++ b/kernel/x86_64/zrot_sse2.S @@ -0,0 +1,1727 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#define C %xmm14 +#define S %xmm15 + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY + movsd 48(%rsp), %xmm0 + movsd 56(%rsp), %xmm1 +#endif + + SAVEREGISTERS + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + pshufd $0x44, %xmm0, C + pshufd $0x44, %xmm1, S + + cmpq $0, N + jle .L999 + + cmpq $2 * SIZE, INCX + jne .L50 + cmpq $2 * SIZE, INCY + jne .L50 + +.L10: + testq $SIZE, X + jne .L30 + + testq $SIZE, Y + jne .L20 + + movq N, %rax + sarq $3, %rax + jle .L14 + + movapd 0 * SIZE(Y), %xmm1 + movapd 2 * SIZE(Y), %xmm3 + movapd 4 * SIZE(Y), %xmm9 + movapd 6 * SIZE(Y), %xmm11 + + movapd 0 * SIZE(X), %xmm0 + movapd 2 * SIZE(X), %xmm2 + movapd 4 * SIZE(X), %xmm8 + movapd 6 * SIZE(X), %xmm10 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movapd %xmm1, %xmm4 + mulpd S, %xmm1 + movapd %xmm3, %xmm6 + mulpd S, %xmm3 + movapd %xmm0, %xmm5 + mulpd C, %xmm0 + movapd %xmm2, %xmm7 + mulpd C, %xmm2 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + movapd 8 * SIZE(Y), %xmm1 + addpd %xmm3, %xmm2 + movapd 10 * SIZE(Y), %xmm3 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movapd %xmm0, 0 * SIZE(X) + movapd 8 * SIZE(X), %xmm0 + movapd %xmm2, 2 * SIZE(X) + movapd 10 * SIZE(X), %xmm2 + movapd %xmm4, 0 * SIZE(Y) + movapd %xmm6, 2 * SIZE(Y) + + movapd %xmm9, %xmm4 + mulpd S, %xmm9 + movapd %xmm8, %xmm5 + mulpd C, %xmm8 + movapd %xmm11, %xmm6 + mulpd S, %xmm11 + movapd %xmm10, %xmm7 + mulpd C, %xmm10 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm9, %xmm8 + movapd 12 * SIZE(Y), %xmm9 + addpd %xmm11, %xmm10 + movapd 14 * SIZE(Y), %xmm11 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm8, 4 * SIZE(X) + movapd 12 * SIZE(X), %xmm8 + movapd %xmm10,6 * SIZE(X) + movapd 14 * SIZE(X), %xmm10 + movapd %xmm4, 4 * SIZE(Y) + movapd %xmm6, 6 * SIZE(Y) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movapd %xmm1, %xmm4 + mulpd S, %xmm1 + movapd %xmm3, %xmm6 + mulpd S, %xmm3 + movapd %xmm0, %xmm5 + mulpd C, %xmm0 + movapd %xmm2, %xmm7 + mulpd C, %xmm2 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + movapd 16 * SIZE(Y), %xmm1 + addpd %xmm3, %xmm2 + movapd 18 * SIZE(Y), %xmm3 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 8 * SIZE(X) + movapd 16 * SIZE(X), %xmm0 + movapd %xmm2, 10 * SIZE(X) + movapd 18 * SIZE(X), %xmm2 + movapd %xmm4, 8 * SIZE(Y) + movapd %xmm6, 10 * SIZE(Y) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movapd %xmm9, %xmm4 + mulpd S, %xmm9 + movapd %xmm8, %xmm5 + mulpd C, %xmm8 + movapd %xmm11, %xmm6 + mulpd S, %xmm11 + movapd %xmm10, %xmm7 + mulpd C, %xmm10 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm9, %xmm8 + movapd 20 * SIZE(Y), %xmm9 + addpd %xmm11, %xmm10 + movapd 22 * SIZE(Y), %xmm11 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm8, 12 * SIZE(X) + movapd 20 * SIZE(X), %xmm8 + movapd %xmm10, 14 * SIZE(X) + movapd 22 * SIZE(X), %xmm10 + movapd %xmm4, 12 * SIZE(Y) + movapd %xmm6, 14 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + + decq %rax + jg .L11 + ALIGN_3 + +.L12: + movapd %xmm1, %xmm4 + mulpd S, %xmm1 + movapd %xmm3, %xmm6 + mulpd S, %xmm3 + movapd %xmm0, %xmm5 + mulpd C, %xmm0 + movapd %xmm2, %xmm7 + mulpd C, %xmm2 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + movapd 8 * SIZE(Y), %xmm1 + addpd %xmm3, %xmm2 + movapd 10 * SIZE(Y), %xmm3 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 0 * SIZE(X) + movapd 8 * SIZE(X), %xmm0 + movapd %xmm2, 2 * SIZE(X) + movapd 10 * SIZE(X), %xmm2 + + movapd %xmm4, 0 * SIZE(Y) + movapd %xmm6, 2 * SIZE(Y) + + movapd %xmm9, %xmm4 + mulpd S, %xmm9 + movapd %xmm8, %xmm5 + mulpd C, %xmm8 + movapd %xmm11, %xmm6 + mulpd S, %xmm11 + movapd %xmm10, %xmm7 + mulpd C, %xmm10 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm9, %xmm8 + movapd 12 * SIZE(Y), %xmm9 + addpd %xmm11, %xmm10 + movapd 14 * SIZE(Y), %xmm11 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm8, 4 * SIZE(X) + movapd 12 * SIZE(X), %xmm8 + movapd %xmm10,6 * SIZE(X) + movapd 14 * SIZE(X), %xmm10 + movapd %xmm4, 4 * SIZE(Y) + movapd %xmm6, 6 * SIZE(Y) + + movapd %xmm1, %xmm4 + mulpd S, %xmm1 + movapd %xmm3, %xmm6 + mulpd S, %xmm3 + movapd %xmm0, %xmm5 + mulpd C, %xmm0 + movapd %xmm2, %xmm7 + mulpd C, %xmm2 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 8 * SIZE(X) + movapd %xmm2, 10 * SIZE(X) + movapd %xmm4, 8 * SIZE(Y) + movapd %xmm6, 10 * SIZE(Y) + + movapd %xmm9, %xmm4 + mulpd S, %xmm9 + movapd %xmm8, %xmm5 + mulpd C, %xmm8 + movapd %xmm11, %xmm6 + mulpd S, %xmm11 + movapd %xmm10, %xmm7 + mulpd C, %xmm10 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm8, 12 * SIZE(X) + movapd %xmm10, 14 * SIZE(X) + movapd %xmm4, 12 * SIZE(Y) + movapd %xmm6, 14 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L14: + testq $7, N + jle .L999 + + testq $4, N + jle .L15 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + movapd 2 * SIZE(Y), %xmm3 + movapd 2 * SIZE(X), %xmm2 + + movapd %xmm1, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 2 * SIZE(X) + movapd %xmm4, 0 * SIZE(Y) + movapd %xmm6, 2 * SIZE(Y) + + movapd 4 * SIZE(Y), %xmm1 + movapd 4 * SIZE(X), %xmm0 + movapd 6 * SIZE(Y), %xmm3 + movapd 6 * SIZE(X), %xmm2 + + movapd %xmm1, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 4 * SIZE(X) + movapd %xmm2, 6 * SIZE(X) + movapd %xmm4, 4 * SIZE(Y) + movapd %xmm6, 6 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L15: + testq $2, N + jle .L16 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + movapd 2 * SIZE(Y), %xmm3 + movapd 2 * SIZE(X), %xmm2 + + movapd %xmm1, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 2 * SIZE(X) + movapd %xmm4, 0 * SIZE(Y) + movapd %xmm6, 2 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L16: + testq $1, N + jle .L999 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L20: + movapd -1 * SIZE(Y), %xmm1 + + movq N, %rax + sarq $3, %rax + jle .L24 + ALIGN_3 + +.L21: +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movapd 1 * SIZE(Y), %xmm3 + movapd 3 * SIZE(Y), %xmm8 + movapd 0 * SIZE(X), %xmm0 + movapd 2 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm3, %xmm1 + SHUFPD_1 %xmm8, %xmm3 + + movapd %xmm1, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 2 * SIZE(X) + + movlps %xmm4, 0 * SIZE(Y) + movhps %xmm4, 1 * SIZE(Y) + movlps %xmm6, 2 * SIZE(Y) + movhps %xmm6, 3 * SIZE(Y) + +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movapd 5 * SIZE(Y), %xmm9 + movapd 7 * SIZE(Y), %xmm1 + movapd 4 * SIZE(X), %xmm0 + movapd 6 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm9, %xmm8 + SHUFPD_1 %xmm1, %xmm9 + + movapd %xmm8, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm9, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm8 + mulpd C, %xmm2 + mulpd S, %xmm9 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 4 * SIZE(X) + movapd %xmm2, 6 * SIZE(X) + movlps %xmm4, 4 * SIZE(Y) + movhps %xmm4, 5 * SIZE(Y) + movlps %xmm6, 6 * SIZE(Y) + movhps %xmm6, 7 * SIZE(Y) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movapd 9 * SIZE(Y), %xmm3 + movapd 11 * SIZE(Y), %xmm8 + movapd 8 * SIZE(X), %xmm0 + movapd 10 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm3, %xmm1 + SHUFPD_1 %xmm8, %xmm3 + + movapd %xmm1, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 8 * SIZE(X) + movapd %xmm2, 10 * SIZE(X) + movlps %xmm4, 8 * SIZE(Y) + movhps %xmm4, 9 * SIZE(Y) + movlps %xmm6, 10 * SIZE(Y) + movhps %xmm6, 11 * SIZE(Y) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movapd 13 * SIZE(Y), %xmm9 + movapd 15 * SIZE(Y), %xmm1 + movapd 12 * SIZE(X), %xmm0 + movapd 14 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm9, %xmm8 + SHUFPD_1 %xmm1, %xmm9 + + movapd %xmm8, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm9, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm8 + mulpd C, %xmm2 + mulpd S, %xmm9 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 12 * SIZE(X) + movapd %xmm2, 14 * SIZE(X) + movlps %xmm4, 12 * SIZE(Y) + movhps %xmm4, 13 * SIZE(Y) + movlps %xmm6, 14 * SIZE(Y) + movhps %xmm6, 15 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + decq %rax + jg .L21 + ALIGN_3 + +.L24: + testq $7, N + jle .L999 + + testq $4, N + jle .L25 + + movapd 1 * SIZE(Y), %xmm3 + movapd 3 * SIZE(Y), %xmm8 + movapd 0 * SIZE(X), %xmm0 + movapd 2 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm3, %xmm1 + SHUFPD_1 %xmm8, %xmm3 + + movapd %xmm1, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 2 * SIZE(X) + movlps %xmm4, 0 * SIZE(Y) + movhps %xmm4, 1 * SIZE(Y) + movlps %xmm6, 2 * SIZE(Y) + movhps %xmm6, 3 * SIZE(Y) + + movapd 5 * SIZE(Y), %xmm9 + movapd 7 * SIZE(Y), %xmm1 + movapd 4 * SIZE(X), %xmm0 + movapd 6 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm9, %xmm8 + SHUFPD_1 %xmm1, %xmm9 + + movapd %xmm8, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm9, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm8 + mulpd C, %xmm2 + mulpd S, %xmm9 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 4 * SIZE(X) + movapd %xmm2, 6 * SIZE(X) + movlps %xmm4, 4 * SIZE(Y) + movhps %xmm4, 5 * SIZE(Y) + movlps %xmm6, 6 * SIZE(Y) + movhps %xmm6, 7 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L25: + testq $2, N + jle .L26 + + movapd 1 * SIZE(Y), %xmm3 + movapd 3 * SIZE(Y), %xmm8 + movapd 0 * SIZE(X), %xmm0 + movapd 2 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm3, %xmm1 + SHUFPD_1 %xmm8, %xmm3 + + movapd %xmm1, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 2 * SIZE(X) + movlps %xmm4, 0 * SIZE(Y) + movhps %xmm4, 1 * SIZE(Y) + movlps %xmm6, 2 * SIZE(Y) + movhps %xmm6, 3 * SIZE(Y) + movapd %xmm8, %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L26: + testq $1, N + jle .L999 + + movapd 1 * SIZE(Y), %xmm4 + movapd 0 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 1 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L30: + testq $SIZE, Y + jne .L40 + + movapd -1 * SIZE(X), %xmm0 + + movq N, %rax + sarq $3, %rax + jle .L34 + ALIGN_3 + +.L31: +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movapd 1 * SIZE(X), %xmm2 + movapd 3 * SIZE(X), %xmm8 + movapd 0 * SIZE(Y), %xmm1 + movapd 2 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + SHUFPD_1 %xmm8, %xmm2 + + movapd %xmm1, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 1 * SIZE(X) + movlps %xmm2, 2 * SIZE(X) + movhps %xmm2, 3 * SIZE(X) + movapd %xmm4, 0 * SIZE(Y) + movapd %xmm6, 2 * SIZE(Y) + +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movapd 5 * SIZE(X), %xmm2 + movapd 7 * SIZE(X), %xmm0 + movapd 4 * SIZE(Y), %xmm1 + movapd 6 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm8 + SHUFPD_1 %xmm0, %xmm2 + + movapd %xmm1, %xmm4 + movapd %xmm8, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm8 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm8 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movlps %xmm8, 4 * SIZE(X) + movhps %xmm8, 5 * SIZE(X) + movlps %xmm2, 6 * SIZE(X) + movhps %xmm2, 7 * SIZE(X) + movapd %xmm4, 4 * SIZE(Y) + movapd %xmm6, 6 * SIZE(Y) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movapd 9 * SIZE(X), %xmm2 + movapd 11 * SIZE(X), %xmm8 + movapd 8 * SIZE(Y), %xmm1 + movapd 10 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + SHUFPD_1 %xmm8, %xmm2 + + movapd %xmm1, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movlps %xmm0, 8 * SIZE(X) + movhps %xmm0, 9 * SIZE(X) + movlps %xmm2, 10 * SIZE(X) + movhps %xmm2, 11 * SIZE(X) + movapd %xmm4, 8 * SIZE(Y) + movapd %xmm6, 10 * SIZE(Y) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movapd 13 * SIZE(X), %xmm2 + movapd 15 * SIZE(X), %xmm0 + movapd 12 * SIZE(Y), %xmm1 + movapd 14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm8 + SHUFPD_1 %xmm0, %xmm2 + + movapd %xmm1, %xmm4 + movapd %xmm8, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm8 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm8 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movlps %xmm8, 12 * SIZE(X) + movhps %xmm8, 13 * SIZE(X) + movlps %xmm2, 14 * SIZE(X) + movhps %xmm2, 15 * SIZE(X) + movapd %xmm4, 12 * SIZE(Y) + movapd %xmm6, 14 * SIZE(Y) + + addq $16 * SIZE, Y + addq $16 * SIZE, X + decq %rax + jg .L31 + ALIGN_3 + +.L34: + testq $7, N + jle .L999 + + testq $4, N + jle .L35 + + movapd 1 * SIZE(X), %xmm2 + movapd 3 * SIZE(X), %xmm8 + movapd 0 * SIZE(Y), %xmm1 + movapd 2 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + SHUFPD_1 %xmm8, %xmm2 + + movapd %xmm1, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 1 * SIZE(X) + movlps %xmm2, 2 * SIZE(X) + movhps %xmm2, 3 * SIZE(X) + movapd %xmm4, 0 * SIZE(Y) + movapd %xmm6, 2 * SIZE(Y) + + movapd 5 * SIZE(X), %xmm2 + movapd 7 * SIZE(X), %xmm0 + movapd 4 * SIZE(Y), %xmm1 + movapd 6 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm8 + SHUFPD_1 %xmm0, %xmm2 + + movapd %xmm1, %xmm4 + movapd %xmm8, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm8 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm8 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movlps %xmm8, 4 * SIZE(X) + movhps %xmm8, 5 * SIZE(X) + movlps %xmm2, 6 * SIZE(X) + movhps %xmm2, 7 * SIZE(X) + movapd %xmm4, 4 * SIZE(Y) + movapd %xmm6, 6 * SIZE(Y) + + addq $8 * SIZE, Y + addq $8 * SIZE, X + ALIGN_3 + +.L35: + testq $2, N + jle .L36 + + movapd 1 * SIZE(X), %xmm2 + movapd 3 * SIZE(X), %xmm8 + + movapd 0 * SIZE(Y), %xmm1 + movapd 2 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + SHUFPD_1 %xmm8, %xmm2 + + movapd %xmm1, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 1 * SIZE(X) + movlps %xmm2, 2 * SIZE(X) + movhps %xmm2, 3 * SIZE(X) + movapd %xmm4, 0 * SIZE(Y) + movapd %xmm6, 2 * SIZE(Y) + movapd %xmm8, %xmm0 + + addq $4 * SIZE, Y + addq $4 * SIZE, X + ALIGN_3 + +.L36: + testq $1, N + jle .L999 + + movapd 1 * SIZE(X), %xmm4 + movapd 0 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm4, %xmm0 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 1 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L40: + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulsd C, %xmm0 + mulsd S, %xmm1 + + mulsd C, %xmm2 + mulsd S, %xmm3 + + addsd %xmm1, %xmm0 + subsd %xmm3, %xmm2 + + movsd %xmm0, 0 * SIZE(X) + movsd %xmm2, 0 * SIZE(Y) + addq $1 * SIZE, Y + addq $1 * SIZE, X + + decq N + jle .L47 + + movq N, %rax + sarq $3, %rax + jle .L44 + + movapd 0 * SIZE(Y), %xmm1 + movapd 2 * SIZE(Y), %xmm3 + movapd 4 * SIZE(Y), %xmm9 + movapd 6 * SIZE(Y), %xmm11 + + movapd 0 * SIZE(X), %xmm0 + movapd 2 * SIZE(X), %xmm2 + movapd 4 * SIZE(X), %xmm8 + movapd 6 * SIZE(X), %xmm10 + + decq %rax + jle .L42 + ALIGN_3 + +.L41: +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movapd %xmm1, %xmm4 + mulpd S, %xmm1 + movapd %xmm3, %xmm6 + mulpd S, %xmm3 + movapd %xmm0, %xmm5 + mulpd C, %xmm0 + movapd %xmm2, %xmm7 + mulpd C, %xmm2 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + movapd 8 * SIZE(Y), %xmm1 + addpd %xmm3, %xmm2 + movapd 10 * SIZE(Y), %xmm3 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movapd %xmm0, 0 * SIZE(X) + movapd 8 * SIZE(X), %xmm0 + movapd %xmm2, 2 * SIZE(X) + movapd 10 * SIZE(X), %xmm2 + movapd %xmm4, 0 * SIZE(Y) + movapd %xmm6, 2 * SIZE(Y) + + movapd %xmm9, %xmm4 + mulpd S, %xmm9 + movapd %xmm8, %xmm5 + mulpd C, %xmm8 + movapd %xmm11, %xmm6 + mulpd S, %xmm11 + movapd %xmm10, %xmm7 + mulpd C, %xmm10 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm9, %xmm8 + movapd 12 * SIZE(Y), %xmm9 + addpd %xmm11, %xmm10 + movapd 14 * SIZE(Y), %xmm11 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm8, 4 * SIZE(X) + movapd 12 * SIZE(X), %xmm8 + movapd %xmm10,6 * SIZE(X) + movapd 14 * SIZE(X), %xmm10 + movapd %xmm4, 4 * SIZE(Y) + movapd %xmm6, 6 * SIZE(Y) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movapd %xmm1, %xmm4 + mulpd S, %xmm1 + movapd %xmm3, %xmm6 + mulpd S, %xmm3 + movapd %xmm0, %xmm5 + mulpd C, %xmm0 + movapd %xmm2, %xmm7 + mulpd C, %xmm2 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + movapd 16 * SIZE(Y), %xmm1 + addpd %xmm3, %xmm2 + movapd 18 * SIZE(Y), %xmm3 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 8 * SIZE(X) + movapd 16 * SIZE(X), %xmm0 + movapd %xmm2, 10 * SIZE(X) + movapd 18 * SIZE(X), %xmm2 + movapd %xmm4, 8 * SIZE(Y) + movapd %xmm6, 10 * SIZE(Y) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movapd %xmm9, %xmm4 + mulpd S, %xmm9 + movapd %xmm8, %xmm5 + mulpd C, %xmm8 + movapd %xmm11, %xmm6 + mulpd S, %xmm11 + movapd %xmm10, %xmm7 + mulpd C, %xmm10 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm9, %xmm8 + movapd 20 * SIZE(Y), %xmm9 + addpd %xmm11, %xmm10 + movapd 22 * SIZE(Y), %xmm11 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm8, 12 * SIZE(X) + movapd 20 * SIZE(X), %xmm8 + movapd %xmm10, 14 * SIZE(X) + movapd 22 * SIZE(X), %xmm10 + movapd %xmm4, 12 * SIZE(Y) + movapd %xmm6, 14 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + + decq %rax + jg .L41 + ALIGN_3 + +.L42: + movapd %xmm1, %xmm4 + mulpd S, %xmm1 + movapd %xmm3, %xmm6 + mulpd S, %xmm3 + movapd %xmm0, %xmm5 + mulpd C, %xmm0 + movapd %xmm2, %xmm7 + mulpd C, %xmm2 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + movapd 8 * SIZE(Y), %xmm1 + addpd %xmm3, %xmm2 + movapd 10 * SIZE(Y), %xmm3 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 0 * SIZE(X) + movapd 8 * SIZE(X), %xmm0 + movapd %xmm2, 2 * SIZE(X) + movapd 10 * SIZE(X), %xmm2 + + movapd %xmm4, 0 * SIZE(Y) + movapd %xmm6, 2 * SIZE(Y) + + movapd %xmm9, %xmm4 + mulpd S, %xmm9 + movapd %xmm8, %xmm5 + mulpd C, %xmm8 + movapd %xmm11, %xmm6 + mulpd S, %xmm11 + movapd %xmm10, %xmm7 + mulpd C, %xmm10 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm9, %xmm8 + movapd 12 * SIZE(Y), %xmm9 + addpd %xmm11, %xmm10 + movapd 14 * SIZE(Y), %xmm11 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm8, 4 * SIZE(X) + movapd 12 * SIZE(X), %xmm8 + movapd %xmm10,6 * SIZE(X) + movapd 14 * SIZE(X), %xmm10 + movapd %xmm4, 4 * SIZE(Y) + movapd %xmm6, 6 * SIZE(Y) + + movapd %xmm1, %xmm4 + mulpd S, %xmm1 + movapd %xmm3, %xmm6 + mulpd S, %xmm3 + movapd %xmm0, %xmm5 + mulpd C, %xmm0 + movapd %xmm2, %xmm7 + mulpd C, %xmm2 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 8 * SIZE(X) + movapd %xmm2, 10 * SIZE(X) + movapd %xmm4, 8 * SIZE(Y) + movapd %xmm6, 10 * SIZE(Y) + + movapd %xmm9, %xmm4 + mulpd S, %xmm9 + movapd %xmm8, %xmm5 + mulpd C, %xmm8 + movapd %xmm11, %xmm6 + mulpd S, %xmm11 + movapd %xmm10, %xmm7 + mulpd C, %xmm10 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm8, 12 * SIZE(X) + movapd %xmm10, 14 * SIZE(X) + movapd %xmm4, 12 * SIZE(Y) + movapd %xmm6, 14 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L44: + testq $4, N + jle .L45 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + movapd 2 * SIZE(Y), %xmm3 + movapd 2 * SIZE(X), %xmm2 + + movapd %xmm1, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 2 * SIZE(X) + movapd %xmm4, 0 * SIZE(Y) + movapd %xmm6, 2 * SIZE(Y) + + movapd 4 * SIZE(Y), %xmm1 + movapd 4 * SIZE(X), %xmm0 + movapd 6 * SIZE(Y), %xmm3 + movapd 6 * SIZE(X), %xmm2 + + movapd %xmm1, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 4 * SIZE(X) + movapd %xmm2, 6 * SIZE(X) + movapd %xmm4, 4 * SIZE(Y) + movapd %xmm6, 6 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L45: + testq $2, N + jle .L46 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + movapd 2 * SIZE(Y), %xmm3 + movapd 2 * SIZE(X), %xmm2 + + movapd %xmm1, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 2 * SIZE(X) + movapd %xmm4, 0 * SIZE(Y) + movapd %xmm6, 2 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L46: + testq $1, N + jle .L47 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + addq $2 * SIZE, Y + addq $2 * SIZE, X + ALIGN_3 + +.L47: + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulsd C, %xmm0 + mulsd S, %xmm1 + + mulsd C, %xmm2 + mulsd S, %xmm3 + + addsd %xmm1, %xmm0 + subsd %xmm3, %xmm2 + + movsd %xmm0, 0 * SIZE(X) + movsd %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L50: + movq N, %rax + sarq $2, %rax + jle .L55 + ALIGN_3 + +.L53: + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 1 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 1 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 1 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 1 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 1 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 1 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 1 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 1 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + decq %rax + jg .L53 + ALIGN_3 + +.L55: + movq N, %rax + andq $3, %rax + jle .L999 + ALIGN_3 + +.L56: + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 1 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 1 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + decq %rax + jg .L56 + ALIGN_3 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/zscal.S b/kernel/x86_64/zscal.S new file mode 100644 index 0000000..5282e0f --- /dev/null +++ b/kernel/x86_64/zscal.S @@ -0,0 +1,223 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 +#define X ARG4 +#define INCX ARG5 + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + salq $ZBASE_SHIFT, INCX + + FLD 8(%rsp) + FLD 24(%rsp) + + testq N, N + jle .L999 + + fld %st(1) + fabs + fld %st(1) + fabs + faddp %st, %st(1) + + fldz + fcomip %st(1), %st + ffreep %st + jne .L30 + + EMMS + + pxor %mm0, %mm0 + + cmpq $2 * SIZE, INCX + jne .L20 + + movq N, I + sarq $2, I + jle .L15 + ALIGN_4 + +.L12: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) + movq %mm0, 32(X) + movq %mm0, 40(X) + movq %mm0, 48(X) + movq %mm0, 56(X) + movq %mm0, 64(X) + movq %mm0, 72(X) + movq %mm0, 80(X) + movq %mm0, 88(X) + movq %mm0, 96(X) + movq %mm0, 104(X) + movq %mm0, 112(X) + movq %mm0, 120(X) + addq $8 * SIZE, X + decq I + jg .L12 + ALIGN_3 + +.L15: + movq N, I + andq $3, I + jle .L18 + ALIGN_2 + +.L16: + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) + + addq $2 * SIZE, X + decq I + jg .L16 + +.L18: + EMMS + + ret + ALIGN_2 + +.L20: + movq N, I + sarq $2, I + jle .L25 + ALIGN_3 + +.L22: + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) + addq INCX, X + + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) + addq INCX, X + + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) + addq INCX, X + + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) + addq INCX, X + + decq I + jg .L22 + ALIGN_3 + +.L25: + movq N, I + andq $3, I + jle .L28 + ALIGN_3 + +.L26: + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) + addq INCX, X + + decq I + jg .L26 + +.L28: + EMMS + + ret + ALIGN_3 + +.L30: + movq N, I + ALIGN_2 + +.L32: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fmul %st(1),%st + FLD 1 * SIZE(X) + fmul %st(3),%st + faddp %st,%st(1) + + FLD 0 * SIZE(X) + fmul %st(3),%st + FLD 1 * SIZE(X) + fmul %st(3),%st + fsubrp %st,%st(1) + + FST 0 * SIZE(X) + FST 1 * SIZE(X) + addq INCX, X + decq I + jg .L32 + ALIGN_2 + +.L999: + ffreep %st + ffreep %st + + ret + + EPILOGUE diff --git a/kernel/x86_64/zscal_atom.S b/kernel/x86_64/zscal_atom.S new file mode 100644 index 0000000..c01d5c1 --- /dev/null +++ b/kernel/x86_64/zscal_atom.S @@ -0,0 +1,394 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#endif + +#define XX %r10 +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movaps %xmm3, %xmm0 + movsd 40(%rsp), %xmm1 + movq 48(%rsp), X + movq 56(%rsp), INCX +#endif + + SAVEREGISTERS + + salq $ZBASE_SHIFT, INCX + + testq M, M + jle .L999 + + pxor %xmm15, %xmm15 + comisd %xmm0, %xmm15 + jne .L30 # Alpha_r != ZERO + + comisd %xmm1, %xmm15 + jne .L30 # Alpha_i != ZERO + + +/* Alpha == ZERO */ + cmpq $2 * SIZE, INCX + jne .L20 + + movq M, I + sarq $2, I + jle .L12 + ALIGN_4 + +.L11: + movsd %xmm1, 0 * SIZE(X) + movsd %xmm1, 1 * SIZE(X) + movsd %xmm1, 2 * SIZE(X) + movsd %xmm1, 3 * SIZE(X) + + movsd %xmm1, 4 * SIZE(X) + movsd %xmm1, 5 * SIZE(X) + movsd %xmm1, 6 * SIZE(X) + movsd %xmm1, 7 * SIZE(X) + + addq $8 * SIZE, X + decq I + jg .L11 + ALIGN_4 + +.L12: + testq $2, M + je .L14 + + movsd %xmm1, 0 * SIZE(X) + movsd %xmm1, 1 * SIZE(X) + movsd %xmm1, 2 * SIZE(X) + movsd %xmm1, 3 * SIZE(X) + + addq $4 * SIZE, X + ALIGN_3 + +.L14: + testq $1, M + je .L999 + + movsd %xmm1, 0 * SIZE(X) + movsd %xmm1, 1 * SIZE(X) + addq $2 * SIZE, X + jmp .L999 + ALIGN_4 + +.L20: + movq M, I # rcx = n + sarq $2, I + jle .L22 + ALIGN_4 + +.L21: + movsd %xmm1, 0 * SIZE(X) + movsd %xmm1, 1 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + movsd %xmm1, 1 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + movsd %xmm1, 1 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + movsd %xmm1, 1 * SIZE(X) + addq INCX, X + decq I + jg .L21 + ALIGN_4 + +.L22: + testq $2, M + je .L23 + + movsd %xmm1, 0 * SIZE(X) + movsd %xmm1, 1 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + movsd %xmm1, 1 * SIZE(X) + addq INCX, X + ALIGN_3 + +.L23: + testq $1, M + je .L999 + + movsd %xmm1, 0 * SIZE(X) + movsd %xmm1, 1 * SIZE(X) + jmp .L999 + ALIGN_4 + +/* Alpha != ZERO */ +.L30: + movq X, XX + + movq M, I + sarq $2, I + jle .L35 + + movsd 0 * SIZE(X), %xmm2 + movsd 1 * SIZE(X), %xmm3 + addq INCX, X + movsd 0 * SIZE(X), %xmm6 + movsd 1 * SIZE(X), %xmm7 + addq INCX, X + + movaps %xmm2, %xmm4 + movsd 0 * SIZE(X), %xmm8 + mulsd %xmm0, %xmm2 + movaps %xmm3, %xmm5 + movsd 1 * SIZE(X), %xmm9 + mulsd %xmm1, %xmm5 + addq INCX, X + mulsd %xmm0, %xmm3 + mulsd %xmm1, %xmm4 + + subsd %xmm5, %xmm2 + movsd 0 * SIZE(X), %xmm10 + addsd %xmm4, %xmm3 + movsd 1 * SIZE(X), %xmm11 + + movaps %xmm6, %xmm4 + mulsd %xmm0, %xmm6 + addq INCX, X + movaps %xmm7, %xmm5 + mulsd %xmm1, %xmm5 + mulsd %xmm0, %xmm7 + mulsd %xmm1, %xmm4 + + decq I + jle .L32 + ALIGN_4 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + subsd %xmm5, %xmm6 + movsd %xmm2, 0 * SIZE(XX) + addsd %xmm4, %xmm7 + movsd %xmm3, 1 * SIZE(XX) + + movaps %xmm8, %xmm4 + movsd 0 * SIZE(X), %xmm2 + mulsd %xmm0, %xmm8 + addq INCX, XX + movaps %xmm9, %xmm5 + movsd 1 * SIZE(X), %xmm3 + mulsd %xmm1, %xmm5 + addq INCX, X + mulsd %xmm0, %xmm9 + mulsd %xmm1, %xmm4 + + subsd %xmm5, %xmm8 + movsd %xmm6, 0 * SIZE(XX) + addsd %xmm4, %xmm9 + movsd %xmm7, 1 * SIZE(XX) + + movaps %xmm10, %xmm4 + movsd 0 * SIZE(X), %xmm6 + mulsd %xmm0, %xmm10 + addq INCX, XX + movaps %xmm11, %xmm5 + movsd 1 * SIZE(X), %xmm7 + mulsd %xmm1, %xmm5 + addq INCX, X + mulsd %xmm0, %xmm11 + mulsd %xmm1, %xmm4 + + subsd %xmm5, %xmm10 + movsd %xmm8, 0 * SIZE(XX) + addsd %xmm4, %xmm11 + movsd %xmm9, 1 * SIZE(XX) + + movaps %xmm2, %xmm4 + movsd 0 * SIZE(X), %xmm8 + mulsd %xmm0, %xmm2 + addq INCX, XX + movaps %xmm3, %xmm5 + movsd 1 * SIZE(X), %xmm9 + mulsd %xmm1, %xmm5 + addq INCX, X + mulsd %xmm0, %xmm3 + mulsd %xmm1, %xmm4 + + subsd %xmm5, %xmm2 + movsd %xmm10, 0 * SIZE(XX) + addsd %xmm4, %xmm3 + movsd %xmm11, 1 * SIZE(XX) + + movaps %xmm6, %xmm4 + movsd 0 * SIZE(X), %xmm10 + mulsd %xmm0, %xmm6 + addq INCX, XX + movaps %xmm7, %xmm5 + movsd 1 * SIZE(X), %xmm11 + mulsd %xmm1, %xmm5 + addq INCX, X + mulsd %xmm0, %xmm7 + mulsd %xmm1, %xmm4 + + decq I + jg .L31 + ALIGN_4 + +.L32: + subsd %xmm5, %xmm6 + movsd %xmm2, 0 * SIZE(XX) + addsd %xmm4, %xmm7 + movsd %xmm3, 1 * SIZE(XX) + + movaps %xmm8, %xmm4 + mulsd %xmm0, %xmm8 + addq INCX, XX + movaps %xmm9, %xmm5 + mulsd %xmm1, %xmm5 + mulsd %xmm0, %xmm9 + mulsd %xmm1, %xmm4 + + subsd %xmm5, %xmm8 + movsd %xmm6, 0 * SIZE(XX) + addsd %xmm4, %xmm9 + movsd %xmm7, 1 * SIZE(XX) + + movaps %xmm10, %xmm4 + mulsd %xmm0, %xmm10 + addq INCX, XX + movaps %xmm11, %xmm5 + mulsd %xmm1, %xmm5 + mulsd %xmm0, %xmm11 + mulsd %xmm1, %xmm4 + + subsd %xmm5, %xmm10 + movsd %xmm8, 0 * SIZE(XX) + addsd %xmm4, %xmm11 + movsd %xmm9, 1 * SIZE(XX) + addq INCX, XX + + movsd %xmm10, 0 * SIZE(XX) + movsd %xmm11, 1 * SIZE(XX) + addq INCX, XX + ALIGN_3 + +.L35: + testq $2, M + je .L37 + + movsd 0 * SIZE(X), %xmm2 + movsd 1 * SIZE(X), %xmm3 + addq INCX, X + + movaps %xmm2, %xmm4 + movsd 0 * SIZE(X), %xmm6 + mulsd %xmm0, %xmm2 + movaps %xmm3, %xmm5 + movsd 1 * SIZE(X), %xmm7 + mulsd %xmm1, %xmm5 + addq INCX, X + mulsd %xmm0, %xmm3 + mulsd %xmm1, %xmm4 + + subsd %xmm5, %xmm2 + addsd %xmm4, %xmm3 + + movaps %xmm6, %xmm4 + mulsd %xmm0, %xmm6 + movaps %xmm7, %xmm5 + mulsd %xmm1, %xmm5 + mulsd %xmm0, %xmm7 + mulsd %xmm1, %xmm4 + + subsd %xmm5, %xmm6 + movsd %xmm2, 0 * SIZE(XX) + addsd %xmm4, %xmm7 + movsd %xmm3, 1 * SIZE(XX) + addq INCX, XX + + movsd %xmm6, 0 * SIZE(XX) + movsd %xmm7, 1 * SIZE(XX) + addq INCX, XX + ALIGN_3 + +.L37: + testq $1, M + je .L999 + + movsd 0 * SIZE(X), %xmm2 + movsd 1 * SIZE(X), %xmm3 + + movaps %xmm2, %xmm4 + mulsd %xmm0, %xmm2 + movaps %xmm3, %xmm5 + mulsd %xmm1, %xmm5 + mulsd %xmm0, %xmm3 + mulsd %xmm1, %xmm4 + + subsd %xmm5, %xmm2 + addsd %xmm4, %xmm3 + + movsd %xmm2, 0 * SIZE(XX) + movsd %xmm3, 1 * SIZE(XX) + ALIGN_3 + +.L999: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE + diff --git a/kernel/x86_64/zscal_sse.S b/kernel/x86_64/zscal_sse.S new file mode 100644 index 0000000..eb2092d --- /dev/null +++ b/kernel/x86_64/zscal_sse.S @@ -0,0 +1,1359 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#endif + +#define XX %r10 +#define FLAG %r11 +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movaps %xmm3, %xmm0 + movsd 40(%rsp), %xmm1 + movq 48(%rsp), X + movq 56(%rsp), INCX +#endif + + SAVEREGISTERS + + salq $ZBASE_SHIFT, INCX + xor FLAG, FLAG + + testq M, M + jle .L999 + + pxor %xmm15, %xmm15 + comiss %xmm0, %xmm15 + jne .L100 # Alpha_r != ZERO + + comiss %xmm1, %xmm15 + jne .L100 # Alpha_i != ZERO + +/* Alpha == ZERO */ + cmpq $2 * SIZE, INCX + jne .L50 + +/* INCX == 1 */ + cmpq $3, M + jle .L13 + + testq $4, X + je .L05 + movss %xmm15, 0 * SIZE(X) + addq $SIZE, X + movq $1, FLAG + decq M + ALIGN_3 + +.L05: + testq $8, X + je .L06 + + movlps %xmm15, 0 * SIZE(X) + addq $2 * SIZE, X + subq $1, M + ALIGN_3 +.L06: + + movq M, I # rcx = n + sarq $3, I + jle .L12 + ALIGN_4 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm15, 0 * SIZE(X) + movaps %xmm15, 4 * SIZE(X) + movaps %xmm15, 8 * SIZE(X) + movaps %xmm15, 12 * SIZE(X) + addq $16 * SIZE, X + decq I + jg .L11 + ALIGN_4 + +.L12: + testq $7, M + je .L19 + testq $4, M + je .L13 + + movaps %xmm15, 0 * SIZE(X) + movaps %xmm15, 4 * SIZE(X) + addq $8 * SIZE, X + ALIGN_3 + +.L13: + testq $2, M + je .L14 + + movlps %xmm15, 0 * SIZE(X) + movhps %xmm15, 2 * SIZE(X) + addq $4 * SIZE, X + ALIGN_3 + +.L14: + testq $1, M + je .L19 + + movlps %xmm15, 0 * SIZE(X) + addq $2 * SIZE, X + ALIGN_3 + +.L19: + testq $1, FLAG + je .L999 + + movss %xmm15, 0 * SIZE(X) + jmp .L999 + ALIGN_4 + +/* incx != 1 */ +.L50: + movq M, I # rcx = n + sarq $2, I + jle .L52 + ALIGN_4 + +.L51: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd %xmm15, 0 * SIZE(X) + addq INCX, X + movsd %xmm15, 0 * SIZE(X) + addq INCX, X + movsd %xmm15, 0 * SIZE(X) + addq INCX, X + movsd %xmm15, 0 * SIZE(X) + addq INCX, X + decq I + jg .L51 + ALIGN_4 + +.L52: + testq $2, M + je .L53 + + movsd %xmm15, 0 * SIZE(X) + addq INCX, X + movsd %xmm15, 0 * SIZE(X) + addq INCX, X + ALIGN_3 + +.L53: + testq $1, M + je .L999 + + movsd %xmm15, 0 * SIZE(X) + jmp .L999 + ALIGN_4 + +/* Alpha != ZERO */ + +.L100: + testq $SIZE, X + jne .L130 + + cmpq $2 * SIZE, INCX + jne .L120 + + pshufd $0, %xmm0, %xmm14 + pshufd $0, %xmm1, %xmm1 + subps %xmm1, %xmm15 + unpcklps %xmm1, %xmm15 + + subq $-32 * SIZE, X + + testq $2 * SIZE, X + je .L105 + + movsd -32 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + + movlps %xmm0, -32 * SIZE(X) + addq $2 * SIZE, X + decq M + jle .L999 + ALIGN_3 + +.L105: + movq M, I + sarq $4, I + jle .L115 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + movaps -16 * SIZE(X), %xmm4 + movaps -12 * SIZE(X), %xmm5 + movaps -8 * SIZE(X), %xmm6 + movaps -4 * SIZE(X), %xmm7 + + decq I + jle .L112 + ALIGN_4 + +.L111: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(X) + movaps 0 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(X) + movaps 4 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm2, %xmm8 + mulps %xmm14, %xmm2 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(X) + movaps 8 * SIZE(X), %xmm2 + + pshufd $0xb1, %xmm3, %xmm8 + mulps %xmm14, %xmm3 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(X) + movaps 12 * SIZE(X), %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm4, %xmm8 + mulps %xmm14, %xmm4 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm4 + movaps %xmm4, -16 * SIZE(X) + movaps 16 * SIZE(X), %xmm4 + + pshufd $0xb1, %xmm5, %xmm8 + mulps %xmm14, %xmm5 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm5 + movaps %xmm5, -12 * SIZE(X) + movaps 20 * SIZE(X), %xmm5 + + pshufd $0xb1, %xmm6, %xmm8 + mulps %xmm14, %xmm6 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm6 + movaps %xmm6, -8 * SIZE(X) + movaps 24 * SIZE(X), %xmm6 + + pshufd $0xb1, %xmm7, %xmm8 + mulps %xmm14, %xmm7 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm7 + movaps %xmm7, -4 * SIZE(X) + movaps 28 * SIZE(X), %xmm7 + + subq $-32 * SIZE, X + decq I + jg .L111 + ALIGN_4 + +.L112: + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + pshufd $0xb1, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(X) + + pshufd $0xb1, %xmm2, %xmm8 + mulps %xmm14, %xmm2 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(X) + + pshufd $0xb1, %xmm3, %xmm8 + mulps %xmm14, %xmm3 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(X) + + pshufd $0xb1, %xmm4, %xmm8 + mulps %xmm14, %xmm4 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm4 + movaps %xmm4, -16 * SIZE(X) + + pshufd $0xb1, %xmm5, %xmm8 + mulps %xmm14, %xmm5 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm5 + movaps %xmm5, -12 * SIZE(X) + + pshufd $0xb1, %xmm6, %xmm8 + mulps %xmm14, %xmm6 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm6 + movaps %xmm6, -8 * SIZE(X) + + pshufd $0xb1, %xmm7, %xmm8 + mulps %xmm14, %xmm7 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm7 + movaps %xmm7, -4 * SIZE(X) + + subq $-32 * SIZE, X + ALIGN_4 + +.L115: + testq $8, M + je .L116 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + pshufd $0xb1, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(X) + + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + pshufd $0xb1, %xmm2, %xmm8 + mulps %xmm14, %xmm2 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(X) + + pshufd $0xb1, %xmm3, %xmm8 + mulps %xmm14, %xmm3 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(X) + + addq $16 * SIZE, X + ALIGN_3 + +.L116: + testq $4, M + je .L117 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + pshufd $0xb1, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(X) + + addq $8 * SIZE, X + ALIGN_3 + +.L117: + testq $2, M + je .L118 + + movaps -32 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + addq $4 * SIZE, X + ALIGN_3 + +.L118: + testq $1, M + je .L999 + + movsd -32 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + + movlps %xmm0, -32 * SIZE(X) + jmp .L999 + ALIGN_3 + +.L120: + pshufd $0, %xmm0, %xmm14 + pshufd $0, %xmm1, %xmm1 + subps %xmm1, %xmm15 + unpcklps %xmm1, %xmm15 + + movq X, XX + + movq M, I + sarq $3, I + jle .L125 + + movsd (X), %xmm0 + addq INCX, X + movhps (X), %xmm0 + addq INCX, X + + movsd (X), %xmm1 + addq INCX, X + movhps (X), %xmm1 + addq INCX, X + + movsd (X), %xmm2 + addq INCX, X + movhps (X), %xmm2 + addq INCX, X + + movsd (X), %xmm3 + addq INCX, X + movhps (X), %xmm3 + addq INCX, X + + decq I + jle .L122 + ALIGN_4 + +.L121: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + + movlps %xmm0, (XX) + addq INCX, XX + movhps %xmm0, (XX) + addq INCX, XX + + movsd (X), %xmm0 + addq INCX, X + movhps (X), %xmm0 + addq INCX, X + + pshufd $0xb1, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + + movlps %xmm1, (XX) + addq INCX, XX + movhps %xmm1, (XX) + addq INCX, XX + + movsd (X), %xmm1 + addq INCX, X + movhps (X), %xmm1 + addq INCX, X + + pshufd $0xb1, %xmm2, %xmm8 + mulps %xmm14, %xmm2 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm2 + + movlps %xmm2, (XX) + addq INCX, XX + movhps %xmm2, (XX) + addq INCX, XX + + movsd (X), %xmm2 + addq INCX, X + movhps (X), %xmm2 + addq INCX, X + + pshufd $0xb1, %xmm3, %xmm8 + mulps %xmm14, %xmm3 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm3 + + movlps %xmm3, (XX) + addq INCX, XX + movhps %xmm3, (XX) + addq INCX, XX + + movsd (X), %xmm3 + addq INCX, X + movhps (X), %xmm3 + addq INCX, X + + decq I + jg .L121 + ALIGN_4 + +.L122: + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + + movlps %xmm0, (XX) + addq INCX, XX + movhps %xmm0, (XX) + addq INCX, XX + + pshufd $0xb1, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + + movlps %xmm1, (XX) + addq INCX, XX + movhps %xmm1, (XX) + addq INCX, XX + + pshufd $0xb1, %xmm2, %xmm8 + mulps %xmm14, %xmm2 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm2 + + movlps %xmm2, (XX) + addq INCX, XX + movhps %xmm2, (XX) + addq INCX, XX + + pshufd $0xb1, %xmm3, %xmm8 + mulps %xmm14, %xmm3 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm3 + + movlps %xmm3, (XX) + addq INCX, XX + movhps %xmm3, (XX) + addq INCX, XX + ALIGN_4 + +.L125: + testq $4, M + je .L127 + + movsd (X), %xmm0 + addq INCX, X + movhps (X), %xmm0 + addq INCX, X + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + + movlps %xmm0, (XX) + addq INCX, XX + movhps %xmm0, (XX) + addq INCX, XX + + movsd (X), %xmm1 + addq INCX, X + movhps (X), %xmm1 + addq INCX, X + + pshufd $0xb1, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + + movlps %xmm1, (XX) + addq INCX, XX + movhps %xmm1, (XX) + addq INCX, XX + ALIGN_3 + +.L127: + testq $2, M + je .L128 + + movsd (X), %xmm0 + addq INCX, X + movhps (X), %xmm0 + addq INCX, X + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + + movlps %xmm0, (XX) + addq INCX, XX + movhps %xmm0, (XX) + addq INCX, XX + ALIGN_3 + +.L128: + testq $1, M + je .L999 + + movsd (X), %xmm0 + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + + movlps %xmm0, (XX) + jmp .L999 + ALIGN_3 + +.L130: + cmpq $2 * SIZE, INCX + jne .L120 + +#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) + + pshufd $0, %xmm0, %xmm14 + pshufd $0, %xmm1, %xmm1 + subps %xmm1, %xmm15 + unpcklps %xmm1, %xmm15 + + subq $-31 * SIZE, X + + testq $2 * SIZE, X + je .L130x + + movsd -31 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + + movlps %xmm0, -31 * SIZE(X) + addq $2 * SIZE, X + decq M + jle .L999 + ALIGN_3 + +.L130x: + shufps $0xb1, %xmm15, %xmm15 + + movaps -32 * SIZE(X), %xmm0 + movaps %xmm0, %xmm9 + + movq M, I + sarq $4, I + jle .L135 + + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + movaps -16 * SIZE(X), %xmm4 + movaps -12 * SIZE(X), %xmm5 + movaps -8 * SIZE(X), %xmm6 + movaps -4 * SIZE(X), %xmm7 + + decq I + jle .L132 + ALIGN_4 + +.L131: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm1, %xmm0 + pshufd $0x1b, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + movaps %xmm0, %xmm10 + movss %xmm9, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + movaps 0 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + pshufd $0x1b, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + movaps %xmm1, %xmm9 + movss %xmm10, %xmm1 + movaps %xmm1, -28 * SIZE(X) + + movaps 4 * SIZE(X), %xmm1 + + movss %xmm3, %xmm2 + pshufd $0x1b, %xmm2, %xmm8 + mulps %xmm14, %xmm2 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm2 + movaps %xmm2, %xmm10 + movss %xmm9, %xmm2 + movaps %xmm2, -24 * SIZE(X) + + movaps 8 * SIZE(X), %xmm2 + + movss %xmm4, %xmm3 + pshufd $0x1b, %xmm3, %xmm8 + mulps %xmm14, %xmm3 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm3 + movaps %xmm3, %xmm9 + movss %xmm10, %xmm3 + movaps %xmm3, -20 * SIZE(X) + + movaps 12 * SIZE(X), %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm5, %xmm4 + pshufd $0x1b, %xmm4, %xmm8 + mulps %xmm14, %xmm4 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm4 + movaps %xmm4, %xmm10 + movss %xmm9, %xmm4 + movaps %xmm4, -16 * SIZE(X) + + movaps 16 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + pshufd $0x1b, %xmm5, %xmm8 + mulps %xmm14, %xmm5 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm5 + movaps %xmm5, %xmm9 + movss %xmm10, %xmm5 + movaps %xmm5, -12 * SIZE(X) + + movaps 20 * SIZE(X), %xmm5 + + movss %xmm7, %xmm6 + pshufd $0x1b, %xmm6, %xmm8 + mulps %xmm14, %xmm6 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm6 + movaps %xmm6, %xmm10 + movss %xmm9, %xmm6 + movaps %xmm6, -8 * SIZE(X) + + movaps 24 * SIZE(X), %xmm6 + + movss %xmm0, %xmm7 + pshufd $0x1b, %xmm7, %xmm8 + mulps %xmm14, %xmm7 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm7 + movaps %xmm7, %xmm9 + movss %xmm10, %xmm7 + movaps %xmm7, -4 * SIZE(X) + + movaps 28 * SIZE(X), %xmm7 + + subq $-32 * SIZE, X + decq I + jg .L131 + ALIGN_4 + +.L132: + movss %xmm1, %xmm0 + pshufd $0x1b, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + movaps %xmm0, %xmm10 + movss %xmm9, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + movaps 0 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + pshufd $0x1b, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + movaps %xmm1, %xmm9 + movss %xmm10, %xmm1 + movaps %xmm1, -28 * SIZE(X) + + movss %xmm3, %xmm2 + pshufd $0x1b, %xmm2, %xmm8 + mulps %xmm14, %xmm2 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm2 + movaps %xmm2, %xmm10 + movss %xmm9, %xmm2 + movaps %xmm2, -24 * SIZE(X) + + movss %xmm4, %xmm3 + pshufd $0x1b, %xmm3, %xmm8 + mulps %xmm14, %xmm3 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm3 + movaps %xmm3, %xmm9 + movss %xmm10, %xmm3 + movaps %xmm3, -20 * SIZE(X) + + movss %xmm5, %xmm4 + pshufd $0x1b, %xmm4, %xmm8 + mulps %xmm14, %xmm4 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm4 + movaps %xmm4, %xmm10 + movss %xmm9, %xmm4 + movaps %xmm4, -16 * SIZE(X) + + movss %xmm6, %xmm5 + pshufd $0x1b, %xmm5, %xmm8 + mulps %xmm14, %xmm5 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm5 + movaps %xmm5, %xmm9 + movss %xmm10, %xmm5 + movaps %xmm5, -12 * SIZE(X) + + movss %xmm7, %xmm6 + pshufd $0x1b, %xmm6, %xmm8 + mulps %xmm14, %xmm6 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm6 + movaps %xmm6, %xmm10 + movss %xmm9, %xmm6 + movaps %xmm6, -8 * SIZE(X) + + movss %xmm0, %xmm7 + pshufd $0x1b, %xmm7, %xmm8 + mulps %xmm14, %xmm7 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm7 + movaps %xmm7, %xmm9 + movss %xmm10, %xmm7 + movaps %xmm7, -4 * SIZE(X) + + subq $-32 * SIZE, X + ALIGN_4 + +.L135: + testq $8, M + je .L136 + + movaps -28 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + pshufd $0x1b, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + + movaps %xmm0, %xmm10 + movss %xmm9, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + movaps -24 * SIZE(X), %xmm2 + + movss %xmm2, %xmm1 + pshufd $0x1b, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + + movaps %xmm1, %xmm9 + movss %xmm10, %xmm1 + movaps %xmm1, -28 * SIZE(X) + + movaps -20 * SIZE(X), %xmm3 + + movss %xmm3, %xmm2 + pshufd $0x1b, %xmm2, %xmm8 + mulps %xmm14, %xmm2 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm2 + + movaps %xmm2, %xmm10 + movss %xmm9, %xmm2 + movaps %xmm2, -24 * SIZE(X) + + movaps -16 * SIZE(X), %xmm0 + + movss %xmm0, %xmm3 + pshufd $0x1b, %xmm3, %xmm8 + mulps %xmm14, %xmm3 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm3 + + movaps %xmm3, %xmm9 + movss %xmm10, %xmm3 + movaps %xmm3, -20 * SIZE(X) + + addq $16 * SIZE, X + ALIGN_3 + +.L136: + testq $4, M + je .L137 + + movaps -28 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + pshufd $0x1b, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + + movaps %xmm0, %xmm10 + movss %xmm9, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + movaps -24 * SIZE(X), %xmm2 + + movss %xmm2, %xmm1 + pshufd $0x1b, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + + movaps %xmm1, %xmm9 + movss %xmm10, %xmm1 + movaps %xmm1, -28 * SIZE(X) + + movaps %xmm2, %xmm0 + + addq $8 * SIZE, X + ALIGN_3 + +.L137: + testq $2, M + je .L138 + + movaps -28 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + pshufd $0x1b, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + + movaps %xmm0, %xmm10 + movss %xmm9, %xmm0 + movaps %xmm0, -32 * SIZE(X) + movaps %xmm10, %xmm9 + movaps %xmm1, %xmm0 + + addq $4 * SIZE, X + ALIGN_3 + +.L138: + movss %xmm9, -32 * SIZE(X) + + testq $1, M + je .L999 + + pshufd $0x1b, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + + pshufd $0x39, %xmm0, %xmm0 + + movlps %xmm0, -31 * SIZE(X) + jmp .L999 + ALIGN_3 + + +#else + + pshufd $0, %xmm0, %xmm14 + pshufd $0, %xmm1, %xmm1 + subps %xmm1, %xmm15 + unpcklps %xmm1, %xmm15 + + subq $-32 * SIZE, X + + testq $2 * SIZE, X + je .L130x + + movsd -32 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + + movlps %xmm0, -32 * SIZE(X) + addq $2 * SIZE, X + decq M + jle .L999 + ALIGN_3 + +.L130x: + movq M, I + sarq $4, I + jle .L135 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + movsd -24 * SIZE(X), %xmm2 + movhps -22 * SIZE(X), %xmm2 + movsd -20 * SIZE(X), %xmm3 + movhps -18 * SIZE(X), %xmm3 + movsd -16 * SIZE(X), %xmm4 + movhps -14 * SIZE(X), %xmm4 + movsd -12 * SIZE(X), %xmm5 + movhps -10 * SIZE(X), %xmm5 + movsd -8 * SIZE(X), %xmm6 + movhps -6 * SIZE(X), %xmm6 + movsd -4 * SIZE(X), %xmm7 + movhps -2 * SIZE(X), %xmm7 + + decq I + jle .L132 + ALIGN_4 + +.L131: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + movlps %xmm0, -32 * SIZE(X) + movhps %xmm0, -30 * SIZE(X) + + movsd 0 * SIZE(X), %xmm0 + movhps 2 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + movlps %xmm1, -28 * SIZE(X) + movhps %xmm1, -26 * SIZE(X) + + movsd 4 * SIZE(X), %xmm1 + movhps 6 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm2, %xmm8 + mulps %xmm14, %xmm2 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm2 + movlps %xmm2, -24 * SIZE(X) + movhps %xmm2, -22 * SIZE(X) + + movsd 8 * SIZE(X), %xmm2 + movhps 10 * SIZE(X), %xmm2 + + pshufd $0xb1, %xmm3, %xmm8 + mulps %xmm14, %xmm3 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm3 + movlps %xmm3, -20 * SIZE(X) + movhps %xmm3, -18 * SIZE(X) + + movsd 12 * SIZE(X), %xmm3 + movhps 14 * SIZE(X), %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm4, %xmm8 + mulps %xmm14, %xmm4 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm4 + movlps %xmm4, -16 * SIZE(X) + movhps %xmm4, -14 * SIZE(X) + + movsd 16 * SIZE(X), %xmm4 + movhps 18 * SIZE(X), %xmm4 + + pshufd $0xb1, %xmm5, %xmm8 + mulps %xmm14, %xmm5 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm5 + movlps %xmm5, -12 * SIZE(X) + movhps %xmm5, -10 * SIZE(X) + + movsd 20 * SIZE(X), %xmm5 + movhps 22 * SIZE(X), %xmm5 + + pshufd $0xb1, %xmm6, %xmm8 + mulps %xmm14, %xmm6 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm6 + movlps %xmm6, -8 * SIZE(X) + movhps %xmm6, -6 * SIZE(X) + + movsd 24 * SIZE(X), %xmm6 + movhps 26 * SIZE(X), %xmm6 + + pshufd $0xb1, %xmm7, %xmm8 + mulps %xmm14, %xmm7 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm7 + movlps %xmm7, -4 * SIZE(X) + movhps %xmm7, -2 * SIZE(X) + + movsd 28 * SIZE(X), %xmm7 + movhps 30 * SIZE(X), %xmm7 + + subq $-32 * SIZE, X + decq I + jg .L131 + ALIGN_4 + +.L132: + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + movlps %xmm0, -32 * SIZE(X) + movhps %xmm0, -30 * SIZE(X) + + pshufd $0xb1, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + movlps %xmm1, -28 * SIZE(X) + movhps %xmm1, -26 * SIZE(X) + + pshufd $0xb1, %xmm2, %xmm8 + mulps %xmm14, %xmm2 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm2 + movlps %xmm2, -24 * SIZE(X) + movhps %xmm2, -22 * SIZE(X) + + pshufd $0xb1, %xmm3, %xmm8 + mulps %xmm14, %xmm3 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm3 + movlps %xmm3, -20 * SIZE(X) + movhps %xmm3, -18 * SIZE(X) + + pshufd $0xb1, %xmm4, %xmm8 + mulps %xmm14, %xmm4 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm4 + movlps %xmm4, -16 * SIZE(X) + movhps %xmm4, -14 * SIZE(X) + + pshufd $0xb1, %xmm5, %xmm8 + mulps %xmm14, %xmm5 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm5 + movlps %xmm5, -12 * SIZE(X) + movhps %xmm5, -10 * SIZE(X) + + pshufd $0xb1, %xmm6, %xmm8 + mulps %xmm14, %xmm6 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm6 + movlps %xmm6, -8 * SIZE(X) + movhps %xmm6, -6 * SIZE(X) + + pshufd $0xb1, %xmm7, %xmm8 + mulps %xmm14, %xmm7 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm7 + movlps %xmm7, -4 * SIZE(X) + movhps %xmm7, -2 * SIZE(X) + + subq $-32 * SIZE, X + ALIGN_4 + +.L135: + testq $8, M + je .L136 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + movlps %xmm0, -32 * SIZE(X) + movhps %xmm0, -30 * SIZE(X) + + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + movlps %xmm1, -28 * SIZE(X) + movhps %xmm1, -26 * SIZE(X) + + movsd -24 * SIZE(X), %xmm2 + movhps -22 * SIZE(X), %xmm2 + + pshufd $0xb1, %xmm2, %xmm8 + mulps %xmm14, %xmm2 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm2 + movlps %xmm2, -24 * SIZE(X) + movhps %xmm2, -22 * SIZE(X) + + movsd -20 * SIZE(X), %xmm3 + movhps -18 * SIZE(X), %xmm3 + + pshufd $0xb1, %xmm3, %xmm8 + mulps %xmm14, %xmm3 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm3 + movlps %xmm3, -20 * SIZE(X) + movhps %xmm3, -18 * SIZE(X) + + addq $16 * SIZE, X + ALIGN_3 + +.L136: + testq $4, M + je .L137 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + movlps %xmm0, -32 * SIZE(X) + movhps %xmm0, -30 * SIZE(X) + + pshufd $0xb1, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + movlps %xmm1, -28 * SIZE(X) + movhps %xmm1, -26 * SIZE(X) + + addq $8 * SIZE, X + ALIGN_3 + +.L137: + testq $2, M + je .L138 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + movlps %xmm0, -32 * SIZE(X) + movhps %xmm0, -30 * SIZE(X) + + addq $4 * SIZE, X + ALIGN_3 + +.L138: + testq $1, M + je .L999 + + movsd -32 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + + movlps %xmm0, -32 * SIZE(X) + ALIGN_3 +#endif + +.L999: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/zscal_sse2.S b/kernel/x86_64/zscal_sse2.S new file mode 100644 index 0000000..23d2da7 --- /dev/null +++ b/kernel/x86_64/zscal_sse2.S @@ -0,0 +1,1724 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#endif + +#define XX %r10 +#define FLAG %r11 +#define I %rax + +#include "l1param.h" + +#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(BARCELONA) || defined(NANO) +#define USE_PSHUFD +#else +#define USE_PSHUFD_HALF +#endif + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movaps %xmm3, %xmm0 + movsd 40(%rsp), %xmm1 + movq 48(%rsp), X + movq 56(%rsp), INCX +#endif + + SAVEREGISTERS + + salq $ZBASE_SHIFT, INCX + xor FLAG, FLAG + + testq M, M + jle .L999 + + pxor %xmm15, %xmm15 + comisd %xmm0, %xmm15 + jne .L100 + + comisd %xmm1, %xmm15 + jne .L100 + +/* Alpha == ZERO */ + cmpq $2 * SIZE, INCX + jne .L20 + +/* INCX == 1 */ + testq $SIZE, X + je .L05 + + movsd %xmm15, 0 * SIZE(X) + addq $SIZE, X + movq $1, FLAG + decq M + jle .L19 + ALIGN_3 +.L05: + + movq M, I # rcx = n + sarq $3, I + jle .L12 + ALIGN_4 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm15, 0 * SIZE(X) + movaps %xmm15, 2 * SIZE(X) + movaps %xmm15, 4 * SIZE(X) + movaps %xmm15, 6 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm15, 8 * SIZE(X) + movaps %xmm15, 10 * SIZE(X) + movaps %xmm15, 12 * SIZE(X) + movaps %xmm15, 14 * SIZE(X) + + addq $16 * SIZE, X + decq I + jg .L11 + ALIGN_4 + +.L12: + testq $4, M + je .L13 + + movaps %xmm15, 0 * SIZE(X) + movaps %xmm15, 2 * SIZE(X) + movaps %xmm15, 4 * SIZE(X) + movaps %xmm15, 6 * SIZE(X) + addq $8 * SIZE, X + ALIGN_3 + +.L13: + testq $2, M + je .L14 + + movaps %xmm15, 0 * SIZE(X) + movaps %xmm15, 2 * SIZE(X) + addq $4 * SIZE, X + ALIGN_3 + +.L14: + testq $1, M + je .L19 + movaps %xmm15, 0 * SIZE(X) + addq $2 * SIZE, X + ALIGN_3 + +.L19: + testq $1, FLAG + je .L999 + + movsd %xmm15, 0 * SIZE(X) + jmp .L999 + ALIGN_4 + +/* incx != 1 */ +.L20: + testq $SIZE, X + jne .L30 + +/* Aligned Mode */ + movq M, I # rcx = n + sarq $2, I + jle .L22 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm15, (X) + addq INCX, X + movaps %xmm15, (X) + addq INCX, X + movaps %xmm15, (X) + addq INCX, X + movaps %xmm15, (X) + addq INCX, X + decq I + jg .L21 + ALIGN_4 + +.L22: + testq $3, M + je .L999 + + testq $2, M + je .L23 + + movaps %xmm15, (X) + addq INCX, X + movaps %xmm15, (X) + addq INCX, X + ALIGN_3 + +.L23: + testq $1, M + je .L999 + + movaps %xmm15, (X) + jmp .L999 + ALIGN_4 + + +/* Unaligned Mode */ +.L30: + movq M, I # rcx = n + sarq $2, I + jle .L32 + ALIGN_4 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movlps %xmm15, 0 * SIZE(X) + movlps %xmm15, 1 * SIZE(X) + addq INCX, X + movlps %xmm15, 0 * SIZE(X) + movlps %xmm15, 1 * SIZE(X) + addq INCX, X + movlps %xmm15, 0 * SIZE(X) + movlps %xmm15, 1 * SIZE(X) + addq INCX, X + movlps %xmm15, 0 * SIZE(X) + movlps %xmm15, 1 * SIZE(X) + addq INCX, X + decq I + jg .L31 + ALIGN_4 + +.L32: + testq $3, M + je .L999 + + testq $2, M + je .L33 + + movlps %xmm15, 0 * SIZE(X) + movlps %xmm15, 1 * SIZE(X) + addq INCX, X + movlps %xmm15, 0 * SIZE(X) + movlps %xmm15, 1 * SIZE(X) + addq INCX, X + ALIGN_3 + +.L33: + testq $1, M + je .L999 + + movlps %xmm15, 0 * SIZE(X) + movlps %xmm15, 1 * SIZE(X) + jmp .L999 + ALIGN_4 + +/* Alpha != ZERO */ +.L100: + testq $SIZE, X + jne .L200 + +#ifdef HAVE_SSE3 + movddup %xmm0, %xmm14 +#else + pshufd $0x44, %xmm0, %xmm14 +#endif + pxor %xmm15, %xmm15 + subsd %xmm1, %xmm15 + movlhps %xmm1, %xmm15 + + cmpq $2 * SIZE, INCX + jne .L120 + + subq $-16 * SIZE, X + + movq M, I + sarq $3, I + jle .L115 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + movaps -8 * SIZE(X), %xmm4 + movaps -6 * SIZE(X), %xmm5 + movaps -4 * SIZE(X), %xmm6 + movaps -2 * SIZE(X), %xmm7 + + decq I + jle .L112 + ALIGN_4 + +.L111: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm0, %xmm8 +#else + movsd -15 * SIZE(X), %xmm8 + movhps -16 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movaps %xmm0, -16 * SIZE(X) + movaps 0 * SIZE(X), %xmm0 + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm1, %xmm8 +#else + movsd -13 * SIZE(X), %xmm8 + movhps -14 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movaps %xmm1, -14 * SIZE(X) + movaps 2 * SIZE(X), %xmm1 + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm2, %xmm8 +#else + movsd -11 * SIZE(X), %xmm8 + movhps -12 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm2 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm2 + movaps %xmm2, -12 * SIZE(X) + movaps 4 * SIZE(X), %xmm2 + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm3, %xmm8 +#else + movsd -9 * SIZE(X), %xmm8 + movhps -10 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm3 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm3 + movaps %xmm3, -10 * SIZE(X) + movaps 6 * SIZE(X), %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm4, %xmm8 +#else + movsd -7 * SIZE(X), %xmm8 + movhps -8 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm4 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm4 + movaps %xmm4, -8 * SIZE(X) + movaps 8 * SIZE(X), %xmm4 + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm5, %xmm8 +#else + movsd -5 * SIZE(X), %xmm8 + movhps -6 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm5 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm5 + movaps %xmm5, -6 * SIZE(X) + movaps 10 * SIZE(X), %xmm5 + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm6, %xmm8 +#else + movsd -3 * SIZE(X), %xmm8 + movhps -4 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm6 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm6 + movaps %xmm6, -4 * SIZE(X) + movaps 12 * SIZE(X), %xmm6 + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm7, %xmm8 +#else + movsd -1 * SIZE(X), %xmm8 + movhps -2 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm7 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm7 + movaps %xmm7, -2 * SIZE(X) + movaps 14 * SIZE(X), %xmm7 + + subq $-16 * SIZE, X + decq I + jg .L111 + ALIGN_4 + +.L112: + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movaps %xmm0, -16 * SIZE(X) + + pshufd $0x4e, %xmm1, %xmm8 + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movaps %xmm1, -14 * SIZE(X) + + pshufd $0x4e, %xmm2, %xmm8 + mulpd %xmm14, %xmm2 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm2 + movaps %xmm2, -12 * SIZE(X) + + pshufd $0x4e, %xmm3, %xmm8 + mulpd %xmm14, %xmm3 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm3 + movaps %xmm3, -10 * SIZE(X) + + pshufd $0x4e, %xmm4, %xmm8 + mulpd %xmm14, %xmm4 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm4 + movaps %xmm4, -8 * SIZE(X) + + pshufd $0x4e, %xmm5, %xmm8 + mulpd %xmm14, %xmm5 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm5 + movaps %xmm5, -6 * SIZE(X) + + pshufd $0x4e, %xmm6, %xmm8 + mulpd %xmm14, %xmm6 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm6 + movaps %xmm6, -4 * SIZE(X) + + pshufd $0x4e, %xmm7, %xmm8 + mulpd %xmm14, %xmm7 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm7 + movaps %xmm7, -2 * SIZE(X) + + subq $-16 * SIZE, X + ALIGN_3 + +.L115: + testq $7, M + je .L999 + + testq $4, M + je .L116 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movaps %xmm0, -16 * SIZE(X) + + pshufd $0x4e, %xmm1, %xmm8 + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movaps %xmm1, -14 * SIZE(X) + + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm2, %xmm8 + mulpd %xmm14, %xmm2 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm2 + movaps %xmm2, -12 * SIZE(X) + + pshufd $0x4e, %xmm3, %xmm8 + mulpd %xmm14, %xmm3 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm3 + movaps %xmm3, -10 * SIZE(X) + + addq $8 * SIZE, X + ALIGN_3 + +.L116: + testq $2, M + je .L117 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movaps %xmm0, -16 * SIZE(X) + + pshufd $0x4e, %xmm1, %xmm8 + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movaps %xmm1, -14 * SIZE(X) + + addq $4 * SIZE, X + ALIGN_3 + +.L117: + testq $1, M + je .L999 + + movaps -16 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + + movaps %xmm0, -16 * SIZE(X) + jmp .L999 + ALIGN_3 + +.L120: + movq X, XX + + movq M, I + sarq $3, I + jle .L125 + + movaps (X), %xmm0 + addq INCX, X + movaps (X), %xmm1 + addq INCX, X + movaps (X), %xmm2 + addq INCX, X + movaps (X), %xmm3 + addq INCX, X + movaps (X), %xmm4 + addq INCX, X + movaps (X), %xmm5 + addq INCX, X + movaps (X), %xmm6 + addq INCX, X + movaps (X), %xmm7 + addq INCX, X + + decq I + jle .L122 + ALIGN_4 + +.L121: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movaps %xmm0, (XX) + addq INCX, XX + movaps (X), %xmm0 + addq INCX, X + + pshufd $0x4e, %xmm1, %xmm8 + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movaps %xmm1, (XX) + addq INCX, XX + movaps (X), %xmm1 + addq INCX, X + + pshufd $0x4e, %xmm2, %xmm8 + mulpd %xmm14, %xmm2 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm2 + movaps %xmm2, (XX) + addq INCX, XX + movaps (X), %xmm2 + addq INCX, X + + pshufd $0x4e, %xmm3, %xmm8 + mulpd %xmm14, %xmm3 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm3 + movaps %xmm3, (XX) + addq INCX, XX + movaps (X), %xmm3 + addq INCX, X + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm4, %xmm8 + mulpd %xmm14, %xmm4 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm4 + movaps %xmm4, (XX) + addq INCX, XX + movaps (X), %xmm4 + addq INCX, X + + pshufd $0x4e, %xmm5, %xmm8 + mulpd %xmm14, %xmm5 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm5 + movaps %xmm5, (XX) + addq INCX, XX + movaps (X), %xmm5 + addq INCX, X + + pshufd $0x4e, %xmm6, %xmm8 + mulpd %xmm14, %xmm6 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm6 + movaps %xmm6, (XX) + addq INCX, XX + movaps (X), %xmm6 + addq INCX, X + + pshufd $0x4e, %xmm7, %xmm8 + mulpd %xmm14, %xmm7 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm7 + movaps %xmm7, (XX) + addq INCX, XX + movaps (X), %xmm7 + addq INCX, X + + decq I + jg .L121 + ALIGN_4 + +.L122: + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movaps %xmm0, (XX) + addq INCX, XX + + pshufd $0x4e, %xmm1, %xmm8 + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movaps %xmm1, (XX) + addq INCX, XX + + pshufd $0x4e, %xmm2, %xmm8 + mulpd %xmm14, %xmm2 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm2 + movaps %xmm2, (XX) + addq INCX, XX + + pshufd $0x4e, %xmm3, %xmm8 + mulpd %xmm14, %xmm3 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm3 + movaps %xmm3, (XX) + addq INCX, XX + + pshufd $0x4e, %xmm4, %xmm8 + mulpd %xmm14, %xmm4 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm4 + movaps %xmm4, (XX) + addq INCX, XX + + pshufd $0x4e, %xmm5, %xmm8 + mulpd %xmm14, %xmm5 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm5 + movaps %xmm5, (XX) + addq INCX, XX + + pshufd $0x4e, %xmm6, %xmm8 + mulpd %xmm14, %xmm6 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm6 + movaps %xmm6, (XX) + addq INCX, XX + + pshufd $0x4e, %xmm7, %xmm8 + mulpd %xmm14, %xmm7 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm7 + movaps %xmm7, (XX) + addq INCX, XX + ALIGN_3 + +.L125: + testq $7, M + je .L999 + + testq $4, M + je .L126 + + movaps (X), %xmm0 + addq INCX, X + movaps (X), %xmm1 + addq INCX, X + + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movaps %xmm0, (XX) + addq INCX, XX + + pshufd $0x4e, %xmm1, %xmm8 + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movaps %xmm1, (XX) + addq INCX, XX + + movaps (X), %xmm2 + addq INCX, X + movaps (X), %xmm3 + addq INCX, X + + pshufd $0x4e, %xmm2, %xmm8 + mulpd %xmm14, %xmm2 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm2 + movaps %xmm2, (XX) + addq INCX, XX + + pshufd $0x4e, %xmm3, %xmm8 + mulpd %xmm14, %xmm3 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm3 + movaps %xmm3, (XX) + addq INCX, XX + ALIGN_3 + +.L126: + testq $2, M + je .L127 + + movaps (X), %xmm0 + addq INCX, X + movaps (X), %xmm1 + addq INCX, X + + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movaps %xmm0, (XX) + addq INCX, XX + + pshufd $0x4e, %xmm1, %xmm8 + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movaps %xmm1, (XX) + addq INCX, XX + ALIGN_3 + +.L127: + testq $1, M + je .L999 + + movaps (X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + + movaps %xmm0, (XX) + jmp .L999 + ALIGN_3 + +.L200: + cmpq $2 * SIZE, INCX + jne .L220 + +#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) + + movddup %xmm0, %xmm14 + pxor %xmm15, %xmm15 + subsd %xmm1, %xmm15 + movlhps %xmm1, %xmm15 + shufpd $1, %xmm15, %xmm15 + + movhps 0 * SIZE(X), %xmm0 + movaps 1 * SIZE(X), %xmm1 + subq $-16 * SIZE, X + + unpckhpd %xmm0, %xmm0 + mulsd %xmm14, %xmm0 + movaps %xmm1, %xmm8 + mulsd %xmm15, %xmm8 + subsd %xmm8, %xmm0 + movlps %xmm0, -16 * SIZE(X) + + decq M + + movq M, I + sarq $3, I + jle .L205 + + movaps -13 * SIZE(X), %xmm2 + movaps -11 * SIZE(X), %xmm3 + movaps -9 * SIZE(X), %xmm4 + + decq I + jle .L202 + ALIGN_4 + +.L201: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, %xmm8 + SHUFPD_1 %xmm2, %xmm0 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -15 * SIZE(X) + movaps -7 * SIZE(X), %xmm5 + + movaps %xmm2, %xmm8 + SHUFPD_1 %xmm3, %xmm1 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm1 + addpd %xmm8, %xmm1 + movaps %xmm1, -13 * SIZE(X) + movaps -5 * SIZE(X), %xmm6 + + movaps %xmm3, %xmm8 + SHUFPD_1 %xmm4, %xmm2 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm2 + addpd %xmm8, %xmm2 + movaps %xmm2, -11 * SIZE(X) + movaps -3 * SIZE(X), %xmm7 + + movaps %xmm4, %xmm8 + SHUFPD_1 %xmm5, %xmm3 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm3 + addpd %xmm8, %xmm3 + movaps %xmm3, -9 * SIZE(X) + movaps -1 * SIZE(X), %xmm0 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm5, %xmm8 + SHUFPD_1 %xmm6, %xmm4 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm4 + addpd %xmm8, %xmm4 + movaps %xmm4, -7 * SIZE(X) + movaps 1 * SIZE(X), %xmm1 + + movaps %xmm6, %xmm8 + SHUFPD_1 %xmm7, %xmm5 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm5 + addpd %xmm8, %xmm5 + movaps %xmm5, -5 * SIZE(X) + movaps 3 * SIZE(X), %xmm2 + + movaps %xmm7, %xmm8 + SHUFPD_1 %xmm0, %xmm6 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm6 + addpd %xmm8, %xmm6 + movaps %xmm6, -3 * SIZE(X) + movaps 5 * SIZE(X), %xmm3 + + movaps %xmm0, %xmm8 + SHUFPD_1 %xmm1, %xmm7 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm7 + addpd %xmm8, %xmm7 + movaps %xmm7, -1 * SIZE(X) + movaps 7 * SIZE(X), %xmm4 + + subq $-16 * SIZE, X + decq I + jg .L201 + ALIGN_4 + +.L202: + movaps %xmm1, %xmm8 + SHUFPD_1 %xmm2, %xmm0 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -15 * SIZE(X) + movaps -7 * SIZE(X), %xmm5 + + movaps %xmm2, %xmm8 + SHUFPD_1 %xmm3, %xmm1 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm1 + addpd %xmm8, %xmm1 + movaps %xmm1, -13 * SIZE(X) + movaps -5 * SIZE(X), %xmm6 + + movaps %xmm3, %xmm8 + SHUFPD_1 %xmm4, %xmm2 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm2 + addpd %xmm8, %xmm2 + movaps %xmm2, -11 * SIZE(X) + movaps -3 * SIZE(X), %xmm7 + + movaps %xmm4, %xmm8 + SHUFPD_1 %xmm5, %xmm3 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm3 + addpd %xmm8, %xmm3 + movaps %xmm3, -9 * SIZE(X) + movaps -1 * SIZE(X), %xmm0 + + movaps %xmm5, %xmm8 + SHUFPD_1 %xmm6, %xmm4 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm4 + addpd %xmm8, %xmm4 + movaps %xmm4, -7 * SIZE(X) + movaps 1 * SIZE(X), %xmm1 + + movaps %xmm6, %xmm8 + SHUFPD_1 %xmm7, %xmm5 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm5 + addpd %xmm8, %xmm5 + movaps %xmm5, -5 * SIZE(X) + + movaps %xmm7, %xmm8 + SHUFPD_1 %xmm0, %xmm6 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm6 + addpd %xmm8, %xmm6 + movaps %xmm6, -3 * SIZE(X) + + movaps %xmm0, %xmm8 + SHUFPD_1 %xmm1, %xmm7 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm7 + addpd %xmm8, %xmm7 + movaps %xmm7, -1 * SIZE(X) + + subq $-16 * SIZE, X + ALIGN_3 + +.L205: + testq $4, M + je .L206 + + movaps -13 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm8 + SHUFPD_1 %xmm2, %xmm0 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -15 * SIZE(X) + + movaps -11 * SIZE(X), %xmm3 + + movaps %xmm2, %xmm8 + SHUFPD_1 %xmm3, %xmm1 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm1 + addpd %xmm8, %xmm1 + movaps %xmm1, -13 * SIZE(X) + + movaps -9 * SIZE(X), %xmm0 + + movaps %xmm3, %xmm8 + SHUFPD_1 %xmm0, %xmm2 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm2 + addpd %xmm8, %xmm2 + movaps %xmm2, -11 * SIZE(X) + + movaps -7 * SIZE(X), %xmm1 + + movaps %xmm0, %xmm8 + SHUFPD_1 %xmm1, %xmm3 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm3 + addpd %xmm8, %xmm3 + movaps %xmm3, -9 * SIZE(X) + + addq $8 * SIZE, X + ALIGN_3 + +.L206: + testq $2, M + je .L207 + + movaps -13 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm8 + SHUFPD_1 %xmm2, %xmm0 + + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -15 * SIZE(X) + + movaps -11 * SIZE(X), %xmm3 + + movaps %xmm2, %xmm8 + SHUFPD_1 %xmm3, %xmm1 + + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm1 + addpd %xmm8, %xmm1 + movaps %xmm1, -13 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $4 * SIZE, X + ALIGN_3 + +.L207: + testq $1, M + je .L208 + + movaps -13 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm8 + SHUFPD_1 %xmm2, %xmm0 + + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -15 * SIZE(X) + + movaps %xmm1, %xmm0 + movaps %xmm2, %xmm1 + addq $2 * SIZE, X + ALIGN_3 + +.L208: + unpckhpd %xmm0, %xmm0 + mulsd %xmm14, %xmm1 + mulsd %xmm15, %xmm0 + addsd %xmm1, %xmm0 + movlps %xmm0, -15 * SIZE(X) + jmp .L999 + ALIGN_3 + +#else + + movddup %xmm0, %xmm14 + pxor %xmm15, %xmm15 + subsd %xmm1, %xmm15 + movlhps %xmm1, %xmm15 + + subq $-16 * SIZE, X + + movq M, I + sarq $3, I + jle .L205 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + movsd -12 * SIZE(X), %xmm2 + movhps -11 * SIZE(X), %xmm2 + movsd -10 * SIZE(X), %xmm3 + movhps -9 * SIZE(X), %xmm3 + + movsd -8 * SIZE(X), %xmm4 + movhps -7 * SIZE(X), %xmm4 + movsd -6 * SIZE(X), %xmm5 + movhps -5 * SIZE(X), %xmm5 + movsd -4 * SIZE(X), %xmm6 + movhps -3 * SIZE(X), %xmm6 + movsd -2 * SIZE(X), %xmm7 + movhps -1 * SIZE(X), %xmm7 + + decq I + jle .L202 + ALIGN_4 + +.L201: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm0, %xmm8 +#else + movsd -15 * SIZE(X), %xmm8 + movhps -16 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movlps %xmm0, -16 * SIZE(X) + movhps %xmm0, -15 * SIZE(X) + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm1, %xmm8 +#else + movsd -13 * SIZE(X), %xmm8 + movhps -14 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movlps %xmm1, -14 * SIZE(X) + movhps %xmm1, -13 * SIZE(X) + movsd 2 * SIZE(X), %xmm1 + movhps 3 * SIZE(X), %xmm1 + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm2, %xmm8 +#else + movsd -11 * SIZE(X), %xmm8 + movhps -12 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm2 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm2 + movlps %xmm2, -12 * SIZE(X) + movhps %xmm2, -11 * SIZE(X) + movsd 4 * SIZE(X), %xmm2 + movhps 5 * SIZE(X), %xmm2 + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm3, %xmm8 +#else + movsd -9 * SIZE(X), %xmm8 + movhps -10 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm3 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm3 + movlps %xmm3, -10 * SIZE(X) + movhps %xmm3, -9 * SIZE(X) + movsd 6 * SIZE(X), %xmm3 + movhps 7 * SIZE(X), %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm4, %xmm8 +#else + movsd -7 * SIZE(X), %xmm8 + movhps -8 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm4 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm4 + movlps %xmm4, -8 * SIZE(X) + movhps %xmm4, -7 * SIZE(X) + movsd 8 * SIZE(X), %xmm4 + movhps 9 * SIZE(X), %xmm4 + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm5, %xmm8 +#else + movsd -5 * SIZE(X), %xmm8 + movhps -6 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm5 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm5 + movlps %xmm5, -6 * SIZE(X) + movhps %xmm5, -5 * SIZE(X) + movsd 10 * SIZE(X), %xmm5 + movhps 11 * SIZE(X), %xmm5 + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm6, %xmm8 +#else + movsd -3 * SIZE(X), %xmm8 + movhps -4 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm6 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm6 + movlps %xmm6, -4 * SIZE(X) + movhps %xmm6, -3 * SIZE(X) + movsd 12 * SIZE(X), %xmm6 + movhps 13 * SIZE(X), %xmm6 + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm7, %xmm8 +#else + movsd -1 * SIZE(X), %xmm8 + movhps -2 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm7 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm7 + movlps %xmm7, -2 * SIZE(X) + movhps %xmm7, -1 * SIZE(X) + movsd 14 * SIZE(X), %xmm7 + movhps 15 * SIZE(X), %xmm7 + + subq $-16 * SIZE, X + decq I + jg .L201 + ALIGN_4 + +.L202: + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movlps %xmm0, -16 * SIZE(X) + movhps %xmm0, -15 * SIZE(X) + + pshufd $0x4e, %xmm1, %xmm8 + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movlps %xmm1, -14 * SIZE(X) + movhps %xmm1, -13 * SIZE(X) + + pshufd $0x4e, %xmm2, %xmm8 + mulpd %xmm14, %xmm2 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm2 + movlps %xmm2, -12 * SIZE(X) + movhps %xmm2, -11 * SIZE(X) + + pshufd $0x4e, %xmm3, %xmm8 + mulpd %xmm14, %xmm3 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm3 + movlps %xmm3, -10 * SIZE(X) + movhps %xmm3, -9 * SIZE(X) + + pshufd $0x4e, %xmm4, %xmm8 + mulpd %xmm14, %xmm4 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm4 + movlps %xmm4, -8 * SIZE(X) + movhps %xmm4, -7 * SIZE(X) + + pshufd $0x4e, %xmm5, %xmm8 + mulpd %xmm14, %xmm5 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm5 + movlps %xmm5, -6 * SIZE(X) + movhps %xmm5, -5 * SIZE(X) + + pshufd $0x4e, %xmm6, %xmm8 + mulpd %xmm14, %xmm6 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm6 + movlps %xmm6, -4 * SIZE(X) + movhps %xmm6, -3 * SIZE(X) + + pshufd $0x4e, %xmm7, %xmm8 + mulpd %xmm14, %xmm7 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm7 + movlps %xmm7, -2 * SIZE(X) + movhps %xmm7, -1 * SIZE(X) + + subq $-16 * SIZE, X + ALIGN_3 + +.L205: + testq $7, M + je .L999 + + testq $4, M + je .L206 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movlps %xmm0, -16 * SIZE(X) + movhps %xmm0, -15 * SIZE(X) + + pshufd $0x4e, %xmm1, %xmm8 + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movlps %xmm1, -14 * SIZE(X) + movhps %xmm1, -13 * SIZE(X) + + movsd -12 * SIZE(X), %xmm2 + movhps -11 * SIZE(X), %xmm2 + movsd -10 * SIZE(X), %xmm3 + movhps -9 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm2, %xmm8 + mulpd %xmm14, %xmm2 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm2 + movlps %xmm2, -12 * SIZE(X) + movhps %xmm2, -11 * SIZE(X) + + pshufd $0x4e, %xmm3, %xmm8 + mulpd %xmm14, %xmm3 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm3 + movlps %xmm3, -10 * SIZE(X) + movhps %xmm3, -9 * SIZE(X) + + addq $8 * SIZE, X + ALIGN_3 + +.L206: + testq $2, M + je .L207 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movlps %xmm0, -16 * SIZE(X) + movhps %xmm0, -15 * SIZE(X) + + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm1, %xmm8 + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movlps %xmm1, -14 * SIZE(X) + movhps %xmm1, -13 * SIZE(X) + + addq $4 * SIZE, X + ALIGN_3 + +.L207: + testq $1, M + je .L999 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + + movlps %xmm0, -16 * SIZE(X) + movhps %xmm0, -15 * SIZE(X) + jmp .L999 + ALIGN_3 + +#endif + +.L220: + movddup %xmm0, %xmm14 + pxor %xmm15, %xmm15 + subsd %xmm1, %xmm15 + movlhps %xmm1, %xmm15 + + movq X, XX + + movq M, I + sarq $3, I + jle .L225 + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + addq INCX, X + movsd 0 * SIZE(X), %xmm1 + movhps 1 * SIZE(X), %xmm1 + addq INCX, X + movsd 0 * SIZE(X), %xmm2 + movhps 1 * SIZE(X), %xmm2 + addq INCX, X + movsd 0 * SIZE(X), %xmm3 + movhps 1 * SIZE(X), %xmm3 + addq INCX, X + movsd 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addq INCX, X + movsd 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addq INCX, X + movsd 0 * SIZE(X), %xmm6 + movhps 1 * SIZE(X), %xmm6 + addq INCX, X + movsd 0 * SIZE(X), %xmm7 + movhps 1 * SIZE(X), %xmm7 + addq INCX, X + + decq I + jle .L222 + ALIGN_4 + +.L221: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movlps %xmm0, 0 * SIZE(XX) + movhps %xmm0, 1 * SIZE(XX) + addq INCX, XX + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + addq INCX, X + + pshufd $0x4e, %xmm1, %xmm8 + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movlps %xmm1, 0 * SIZE(XX) + movhps %xmm1, 1 * SIZE(XX) + addq INCX, XX + + movsd 0 * SIZE(X), %xmm1 + movhps 1 * SIZE(X), %xmm1 + addq INCX, X + + pshufd $0x4e, %xmm2, %xmm8 + mulpd %xmm14, %xmm2 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm2 + movlps %xmm2, 0 * SIZE(XX) + movhps %xmm2, 1 * SIZE(XX) + addq INCX, XX + + movsd 0 * SIZE(X), %xmm2 + movhps 1 * SIZE(X), %xmm2 + addq INCX, X + + pshufd $0x4e, %xmm3, %xmm8 + mulpd %xmm14, %xmm3 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm3 + movlps %xmm3, 0 * SIZE(XX) + movhps %xmm3, 1 * SIZE(XX) + addq INCX, XX + + movsd 0 * SIZE(X), %xmm3 + movhps 1 * SIZE(X), %xmm3 + addq INCX, X + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm4, %xmm8 + mulpd %xmm14, %xmm4 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm4 + movlps %xmm4, 0 * SIZE(XX) + movhps %xmm4, 1 * SIZE(XX) + addq INCX, XX + + movsd 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addq INCX, X + + pshufd $0x4e, %xmm5, %xmm8 + mulpd %xmm14, %xmm5 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm5 + movlps %xmm5, 0 * SIZE(XX) + movhps %xmm5, 1 * SIZE(XX) + addq INCX, XX + + movsd 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addq INCX, X + + pshufd $0x4e, %xmm6, %xmm8 + mulpd %xmm14, %xmm6 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm6 + movlps %xmm6, 0 * SIZE(XX) + movhps %xmm6, 1 * SIZE(XX) + addq INCX, XX + + movsd 0 * SIZE(X), %xmm6 + movhps 1 * SIZE(X), %xmm6 + addq INCX, X + + pshufd $0x4e, %xmm7, %xmm8 + mulpd %xmm14, %xmm7 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm7 + movlps %xmm7, 0 * SIZE(XX) + movhps %xmm7, 1 * SIZE(XX) + addq INCX, XX + + movsd 0 * SIZE(X), %xmm7 + movhps 1 * SIZE(X), %xmm7 + addq INCX, X + + + decq I + jg .L221 + ALIGN_4 + +.L222: + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movlps %xmm0, 0 * SIZE(XX) + movhps %xmm0, 1 * SIZE(XX) + addq INCX, XX + + pshufd $0x4e, %xmm1, %xmm8 + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movlps %xmm1, 0 * SIZE(XX) + movhps %xmm1, 1 * SIZE(XX) + addq INCX, XX + + pshufd $0x4e, %xmm2, %xmm8 + mulpd %xmm14, %xmm2 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm2 + movlps %xmm2, 0 * SIZE(XX) + movhps %xmm2, 1 * SIZE(XX) + addq INCX, XX + + pshufd $0x4e, %xmm3, %xmm8 + mulpd %xmm14, %xmm3 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm3 + movlps %xmm3, 0 * SIZE(XX) + movhps %xmm3, 1 * SIZE(XX) + addq INCX, XX + + pshufd $0x4e, %xmm4, %xmm8 + mulpd %xmm14, %xmm4 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm4 + movlps %xmm4, 0 * SIZE(XX) + movhps %xmm4, 1 * SIZE(XX) + addq INCX, XX + + pshufd $0x4e, %xmm5, %xmm8 + mulpd %xmm14, %xmm5 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm5 + movlps %xmm5, 0 * SIZE(XX) + movhps %xmm5, 1 * SIZE(XX) + addq INCX, XX + + pshufd $0x4e, %xmm6, %xmm8 + mulpd %xmm14, %xmm6 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm6 + movlps %xmm6, 0 * SIZE(XX) + movhps %xmm6, 1 * SIZE(XX) + addq INCX, XX + + pshufd $0x4e, %xmm7, %xmm8 + mulpd %xmm14, %xmm7 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm7 + movlps %xmm7, 0 * SIZE(XX) + movhps %xmm7, 1 * SIZE(XX) + addq INCX, XX + ALIGN_3 + +.L225: + testq $7, M + je .L999 + + testq $4, M + je .L226 + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + addq INCX, X + + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movlps %xmm0, 0 * SIZE(XX) + movhps %xmm0, 1 * SIZE(XX) + addq INCX, XX + + movsd 0 * SIZE(X), %xmm1 + movhps 1 * SIZE(X), %xmm1 + addq INCX, X + + pshufd $0x4e, %xmm1, %xmm8 + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movlps %xmm1, 0 * SIZE(XX) + movhps %xmm1, 1 * SIZE(XX) + addq INCX, XX + + movsd 0 * SIZE(X), %xmm2 + movhps 1 * SIZE(X), %xmm2 + addq INCX, X + + pshufd $0x4e, %xmm2, %xmm8 + mulpd %xmm14, %xmm2 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm2 + movlps %xmm2, 0 * SIZE(XX) + movhps %xmm2, 1 * SIZE(XX) + addq INCX, XX + + movsd 0 * SIZE(X), %xmm3 + movhps 1 * SIZE(X), %xmm3 + addq INCX, X + + pshufd $0x4e, %xmm3, %xmm8 + mulpd %xmm14, %xmm3 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm3 + movlps %xmm3, 0 * SIZE(XX) + movhps %xmm3, 1 * SIZE(XX) + addq INCX, XX + ALIGN_3 + +.L226: + testq $2, M + je .L227 + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + addq INCX, X + + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movlps %xmm0, 0 * SIZE(XX) + movhps %xmm0, 1 * SIZE(XX) + addq INCX, XX + + movsd 0 * SIZE(X), %xmm1 + movhps 1 * SIZE(X), %xmm1 + addq INCX, X + + pshufd $0x4e, %xmm1, %xmm8 + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movlps %xmm1, 0 * SIZE(XX) + movhps %xmm1, 1 * SIZE(XX) + addq INCX, XX + ALIGN_3 + +.L227: + testq $1, M + je .L999 + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + + movlps %xmm0, 0 * SIZE(XX) + movhps %xmm0, 1 * SIZE(XX) + ALIGN_3 + +.L999: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE + diff --git a/kernel/x86_64/zswap.S b/kernel/x86_64/zswap.S new file mode 100644 index 0000000..8f96875 --- /dev/null +++ b/kernel/x86_64/zswap.S @@ -0,0 +1,452 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define N ARG1 /* rdi */ +#define X ARG4 +#define INCX ARG5 +#define Y ARG6 +#define INCY ARG2 +#else +#define N ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#define INCY %rbx +#endif + +#define XX %r10 +#define YY %r11 + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifndef WINDOWS_ABI +#ifndef XDOUBLE + movq 8(%rsp), INCY +#else + movq 40(%rsp), INCY +#endif +#else + pushq %rbx + + movq 56(%rsp), X + movq 64(%rsp), INCX + movq 72(%rsp), Y + movq 80(%rsp), INCY +#endif + + EMMS + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + cmpq $2 * SIZE, INCX + jne .L14 + cmpq $2 * SIZE, INCY + jne .L14 + + movq N, %rax + sarq $2, %rax + jle .L15 + ALIGN_3 + +.L16: +#ifdef XDOUBLE + movq 0(X), %mm0 + movq 8(X), %mm1 + movq 16(X), %mm2 + movq 24(X), %mm3 + movq 0(Y), %mm4 + movq 8(Y), %mm5 + movq 16(Y), %mm6 + movq 24(Y), %mm7 + + movq %mm4, 0(X) + movq %mm5, 8(X) + movq %mm6, 16(X) + movq %mm7, 24(X) + movq %mm0, 0(Y) + movq %mm1, 8(Y) + movq %mm2, 16(Y) + movq %mm3, 24(Y) + + movq 32(X), %mm0 + movq 40(X), %mm1 + movq 48(X), %mm2 + movq 56(X), %mm3 + movq 32(Y), %mm4 + movq 40(Y), %mm5 + movq 48(Y), %mm6 + movq 56(Y), %mm7 + + movq %mm4, 32(X) + movq %mm5, 40(X) + movq %mm6, 48(X) + movq %mm7, 56(X) + movq %mm0, 32(Y) + movq %mm1, 40(Y) + movq %mm2, 48(Y) + movq %mm3, 56(Y) + + movq 64(X), %mm0 + movq 72(X), %mm1 + movq 80(X), %mm2 + movq 88(X), %mm3 + movq 64(Y), %mm4 + movq 72(Y), %mm5 + movq 80(Y), %mm6 + movq 88(Y), %mm7 + + movq %mm4, 64(X) + movq %mm5, 72(X) + movq %mm6, 80(X) + movq %mm7, 88(X) + movq %mm0, 64(Y) + movq %mm1, 72(Y) + movq %mm2, 80(Y) + movq %mm3, 88(Y) + + movq 96(X), %mm0 + movq 104(X), %mm1 + movq 112(X), %mm2 + movq 120(X), %mm3 + movq 96(Y), %mm4 + movq 104(Y), %mm5 + movq 112(Y), %mm6 + movq 120(Y), %mm7 + + movq %mm4, 96(X) + movq %mm5, 104(X) + movq %mm6, 112(X) + movq %mm7, 120(X) + movq %mm0, 96(Y) + movq %mm1, 104(Y) + movq %mm2, 112(Y) + movq %mm3, 120(Y) +#elif defined(DOUBLE) + prefetchw PREFETCHSIZE * SIZE(X) + MOVQ 0 * SIZE(X), %mm0 + MOVQ 1 * SIZE(X), %mm1 + MOVQ 2 * SIZE(X), %mm2 + MOVQ 3 * SIZE(X), %mm3 + prefetchw PREFETCHSIZE * SIZE(Y) + MOVQ 0 * SIZE(Y), %mm4 + MOVQ 1 * SIZE(Y), %mm5 + MOVQ 2 * SIZE(Y), %mm6 + MOVQ 3 * SIZE(Y), %mm7 + + MOVQ %mm4, 0 * SIZE(X) + MOVQ %mm5, 1 * SIZE(X) + MOVQ %mm6, 2 * SIZE(X) + MOVQ %mm7, 3 * SIZE(X) + MOVQ %mm0, 0 * SIZE(Y) + MOVQ %mm1, 1 * SIZE(Y) + MOVQ %mm2, 2 * SIZE(Y) + MOVQ %mm3, 3 * SIZE(Y) + + MOVQ 4 * SIZE(X), %mm0 + MOVQ 5 * SIZE(X), %mm1 + MOVQ 6 * SIZE(X), %mm2 + MOVQ 7 * SIZE(X), %mm3 + MOVQ 4 * SIZE(Y), %mm4 + MOVQ 5 * SIZE(Y), %mm5 + MOVQ 6 * SIZE(Y), %mm6 + MOVQ 7 * SIZE(Y), %mm7 + + MOVQ %mm4, 4 * SIZE(X) + MOVQ %mm5, 5 * SIZE(X) + MOVQ %mm6, 6 * SIZE(X) + MOVQ %mm7, 7 * SIZE(X) + MOVQ %mm0, 4 * SIZE(Y) + MOVQ %mm1, 5 * SIZE(Y) + MOVQ %mm2, 6 * SIZE(Y) + MOVQ %mm3, 7 * SIZE(Y) + +#else +#ifdef OPTERON + prefetchw PREFETCHSIZE * SIZE(X) +#endif + movq 0 * SIZE(X), %mm0 + movq 2 * SIZE(X), %mm1 + movq 4 * SIZE(X), %mm2 + movq 6 * SIZE(X), %mm3 + movq 0 * SIZE(Y), %mm4 + movq 2 * SIZE(Y), %mm5 + movq 4 * SIZE(Y), %mm6 + movq 6 * SIZE(Y), %mm7 + +#ifdef OPTERON + prefetchw PREFETCHSIZE * SIZE(Y) +#endif + movq %mm4, 0 * SIZE(X) + movq %mm5, 2 * SIZE(X) + movq %mm6, 4 * SIZE(X) + movq %mm7, 6 * SIZE(X) + + movq %mm0, 0 * SIZE(Y) + movq %mm1, 2 * SIZE(Y) + movq %mm2, 4 * SIZE(Y) + movq %mm3, 6 * SIZE(Y) +#endif + + addq $8 * SIZE, X + addq $8 * SIZE, Y + decq %rax + jg .L16 + ALIGN_3 + +.L15: + movq N, %rax + andq $3, %rax + jle .L27 + ALIGN_3 + +.L22: +#ifdef XDOUBLE + movq 0(X), %mm0 + movq 8(X), %mm1 + movq 16(X), %mm2 + movq 24(X), %mm3 + movq 0(Y), %mm4 + movq 8(Y), %mm5 + movq 16(Y), %mm6 + movq 24(Y), %mm7 + + movq %mm4, 0(X) + movq %mm5, 8(X) + movq %mm6, 16(X) + movq %mm7, 24(X) + movq %mm0, 0(Y) + movq %mm1, 8(Y) + movq %mm2, 16(Y) + movq %mm3, 24(Y) +#elif defined(DOUBLE) + movq 0 * SIZE(X), %mm0 + movq 1 * SIZE(X), %mm1 + movq 0 * SIZE(Y), %mm4 + movq 1 * SIZE(Y), %mm5 + movq %mm4, 0 * SIZE(X) + movq %mm5, 1 * SIZE(X) + movq %mm0, 0 * SIZE(Y) + movq %mm1, 1 * SIZE(Y) +#else + movq 0 * SIZE(X), %mm0 + movq 0 * SIZE(Y), %mm4 + movq %mm4, 0 * SIZE(X) + movq %mm0, 0 * SIZE(Y) +#endif + addq $2 * SIZE, X + addq $2 * SIZE, Y + decq %rax + jg .L22 + jmp .L27 + ALIGN_3 + +/* INCX != 1 or INCY != 1 */ + +.L14: + movq N, %rax + movq X, XX + movq Y, YY + sarq $1, %rax + jle .L28 + ALIGN_2 + +.L29: +#ifdef XDOUBLE + movq 0(X), %mm0 + movq 8(X), %mm1 + movq 16(X), %mm2 + movq 24(X), %mm3 + addq INCX, X + movq 0(Y), %mm4 + movq 8(Y), %mm5 + movq 16(Y), %mm6 + movq 24(Y), %mm7 + addq INCY, Y + + movq %mm4, 0(XX) + movq %mm5, 8(XX) + movq %mm6, 16(XX) + movq %mm7, 24(XX) + addq INCX, XX + movq %mm0, 0(YY) + movq %mm1, 8(YY) + movq %mm2, 16(YY) + movq %mm3, 24(YY) + addq INCY, YY + + movq 0(X), %mm0 + movq 8(X), %mm1 + movq 16(X), %mm2 + movq 24(X), %mm3 + addq INCX, X + movq 0(Y), %mm4 + movq 8(Y), %mm5 + movq 16(Y), %mm6 + movq 24(Y), %mm7 + addq INCY, Y + + movq %mm4, 0(XX) + movq %mm5, 8(XX) + movq %mm6, 16(XX) + movq %mm7, 24(XX) + addq INCX, XX + movq %mm0, 0(YY) + movq %mm1, 8(YY) + movq %mm2, 16(YY) + movq %mm3, 24(YY) + addq INCY, YY +#elif defined(DOUBLE) + movq 0 * SIZE(X), %mm0 + movq 1 * SIZE(X), %mm1 + addq INCX, X + movq 0 * SIZE(X), %mm2 + movq 1 * SIZE(X), %mm3 + addq INCX, X + + movq 0 * SIZE(Y), %mm4 + movq 1 * SIZE(Y), %mm5 + addq INCY, Y + movq 0 * SIZE(Y), %mm6 + movq 1 * SIZE(Y), %mm7 + addq INCY, Y + + movq %mm4, 0 * SIZE(XX) + movq %mm5, 1 * SIZE(XX) + addq INCX, XX + movq %mm6, 0 * SIZE(XX) + movq %mm7, 1 * SIZE(XX) + addq INCX, XX + + movq %mm0, 0 * SIZE(YY) + movq %mm1, 1 * SIZE(YY) + addq INCY, YY + movq %mm2, 0 * SIZE(YY) + movq %mm3, 1 * SIZE(YY) + addq INCY, YY +#else + movq 0 * SIZE(X), %mm0 + addq INCX, X + movq 0 * SIZE(X), %mm2 + addq INCX, X + + movq 0 * SIZE(Y), %mm4 + addq INCY, Y + movq 0 * SIZE(Y), %mm6 + addq INCY, Y + + movq %mm4, 0 * SIZE(XX) + addq INCX, XX + movq %mm6, 0 * SIZE(XX) + addq INCX, XX + + movq %mm0, 0 * SIZE(YY) + addq INCY, YY + movq %mm2, 0 * SIZE(YY) + addq INCY, YY +#endif + decq %rax + jg .L29 + ALIGN_3 + +.L28: + movq N, %rax + andq $1, %rax + jle .L27 + ALIGN_3 + +.L35: +#ifdef XDOUBLE + movq 0(X), %mm0 + movq 8(X), %mm1 + movq 16(X), %mm2 + movq 24(X), %mm3 + movq 0(Y), %mm4 + movq 8(Y), %mm5 + movq 16(Y), %mm6 + movq 24(Y), %mm7 + + movq %mm4, 0(X) + movq %mm5, 8(X) + movq %mm6, 16(X) + movq %mm7, 24(X) + movq %mm0, 0(Y) + movq %mm1, 8(Y) + movq %mm2, 16(Y) + movq %mm3, 24(Y) +#elif defined(DOUBLE) + movq 0 * SIZE(X), %mm0 + movq 1 * SIZE(X), %mm1 + movq 0 * SIZE(Y), %mm4 + movq 1 * SIZE(Y), %mm5 + + movq %mm4, 0 * SIZE(X) + movq %mm5, 1 * SIZE(X) + movq %mm0, 0 * SIZE(Y) + movq %mm1, 1 * SIZE(Y) +#else + movq 0 * SIZE(X), %mm0 + movq 0 * SIZE(Y), %mm4 + movq %mm4, 0 * SIZE(X) + movq %mm0, 0 * SIZE(Y) +#endif + addq INCX, X + addq INCY, Y + + decq %rax + jg .L35 + ALIGN_3 + +.L27: + EMMS + xorq %rax,%rax + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + + EPILOGUE diff --git a/kernel/x86_64/zswap_sse.S b/kernel/x86_64/zswap_sse.S new file mode 100644 index 0000000..2f21759 --- /dev/null +++ b/kernel/x86_64/zswap_sse.S @@ -0,0 +1,1134 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 /* rdi */ +#define X ARG4 +#define INCX ARG5 +#define Y ARG6 +#define INCY ARG2 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#define INCY %rbx +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifndef WINDOWS_ABI + movq 8(%rsp), INCY +#else + pushq %rbx + + movq 56(%rsp), X + movq 64(%rsp), INCX + movq 72(%rsp), Y + movq 80(%rsp), INCY +#endif + + SAVEREGISTERS + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + testq M, M + jle .L19 + + cmpq $2 * SIZE, INCX + jne .L50 + cmpq $2 * SIZE, INCY + jne .L50 + + addq M, M + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + cmpq $3, M + jle .L16 + + testq $SIZE, Y + je .L05 + + movss -32 * SIZE(X), %xmm0 + movss -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + movss %xmm0, -32 * SIZE(Y) + + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq M + ALIGN_3 + +.L05: + testq $2 * SIZE, Y + je .L10 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm1 + + movlps %xmm1, -32 * SIZE(X) + movlps %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + subq $2, M + jle .L19 + ALIGN_3 + +.L10: + cmpq $3, M + jle .L16 + + testq $2 * SIZE, X + jne .L30 + + testq $1 * SIZE, X + jne .L20 + + movq M, %rax + sarq $5, %rax + jle .L13 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + movaps -28 * SIZE(X), %xmm0 + movaps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -28 * SIZE(Y) + movaps %xmm1, -28 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -24 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movaps %xmm0, -24 * SIZE(Y) + movaps %xmm1, -24 * SIZE(X) + + movaps -20 * SIZE(X), %xmm0 + movaps -20 * SIZE(Y), %xmm1 + + movaps %xmm0, -20 * SIZE(Y) + movaps %xmm1, -20 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -8 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movaps %xmm0, -8 * SIZE(Y) + movaps %xmm1, -8 * SIZE(X) + + movaps -4 * SIZE(X), %xmm0 + movaps -4 * SIZE(Y), %xmm1 + + movaps %xmm0, -4 * SIZE(Y) + movaps %xmm1, -4 * SIZE(X) + + subq $-32 * SIZE, Y + subq $-32 * SIZE, X + + decq %rax + jg .L11 + ALIGN_3 + +.L13: + testq $16, M + jle .L14 + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + movaps -28 * SIZE(X), %xmm0 + movaps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -28 * SIZE(Y) + movaps %xmm1, -28 * SIZE(X) + + movaps -24 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movaps %xmm0, -24 * SIZE(Y) + movaps %xmm1, -24 * SIZE(X) + + movaps -20 * SIZE(X), %xmm0 + movaps -20 * SIZE(Y), %xmm1 + + movaps %xmm0, -20 * SIZE(Y) + movaps %xmm1, -20 * SIZE(X) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L14: + testq $8, M + jle .L15 + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + movaps -28 * SIZE(X), %xmm0 + movaps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -28 * SIZE(Y) + movaps %xmm1, -28 * SIZE(X) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L15: + testq $4, M + jle .L16 + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L16: + testq $2, M + jle .L17 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm1 + + movlps %xmm1, -32 * SIZE(X) + addq $2 * SIZE, X + movlps %xmm0, -32 * SIZE(Y) + addq $2 * SIZE, Y + ALIGN_3 + +.L17: + testq $1, M + jle .L19 + + movss -32 * SIZE(X), %xmm0 + movss -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + movss %xmm0, -32 * SIZE(Y) + ALIGN_3 + +.L19: + xorq %rax,%rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + ALIGN_3 + +.L20: + movaps -33 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + pshufd $0x39, %xmm1, %xmm3 + movlps %xmm3, -31 * SIZE(X) + + subq $3, M + + movq M, %rax + sarq $5, %rax + jle .L23 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps -25 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -25 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -21 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -21 * SIZE(X) + + movaps -17 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -17 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -13 * SIZE(X), %xmm2 + movaps -12 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -13 * SIZE(X) + + movaps -9 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -5 * SIZE(X), %xmm2 + movaps -4 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -5 * SIZE(X) + + movaps -1 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -1 * SIZE(X) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L21 + ALIGN_3 + +.L23: + testq $16, M + jle .L24 + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps -25 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -25 * SIZE(X) + + movaps -21 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -21 * SIZE(X) + + movaps -17 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -17 * SIZE(X) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L24: + testq $8, M + jle .L25 + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps -25 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -25 * SIZE(X) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L25: + testq $4, M + jle .L26 + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L26: + pshufd $0x39, %xmm0, %xmm2 + pshufd $0xff, %xmm0, %xmm0 + + movlps %xmm2, -32 * SIZE(Y) + movss %xmm0, -30 * SIZE(Y) + + testq $2, M + jle .L27 + + movsd -29 * SIZE(X), %xmm0 + movsd -29 * SIZE(Y), %xmm1 + + movlps %xmm0, -29 * SIZE(Y) + movlps %xmm1, -29 * SIZE(X) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L27: + testq $1, M + jle .L29 + + movss -29 * SIZE(X), %xmm0 + movss -29 * SIZE(Y), %xmm1 + + movss %xmm0, -29 * SIZE(Y) + movss %xmm1, -29 * SIZE(X) + ALIGN_3 + +.L29: + xorq %rax,%rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + ALIGN_3 + +.L30: + testq $1 * SIZE, X + jne .L40 + + movhps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movlps %xmm1, -32 * SIZE(X) + subq $2, M + + movq M, %rax + sarq $5, %rax + jle .L33 + ALIGN_4 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps -26 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -26 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -22 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -22 * SIZE(X) + + movaps -18 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -18 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -14 * SIZE(X), %xmm2 + movaps -12 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -14 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -10 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -6 * SIZE(X), %xmm2 + movaps -4 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -6 * SIZE(X) + + movaps -2 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -2 * SIZE(X) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L31 + ALIGN_3 + +.L33: + testq $16, M + jle .L34 + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps -26 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -26 * SIZE(X) + + movaps -22 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -22 * SIZE(X) + + movaps -18 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -18 * SIZE(X) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L34: + testq $8, M + jle .L35 + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps -26 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -26 * SIZE(X) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L35: + testq $4, M + jle .L36 + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L36: + movhps %xmm0, -32 * SIZE(Y) + + testq $2, M + jle .L37 + + movsd -30 * SIZE(X), %xmm0 + movsd -30 * SIZE(Y), %xmm1 + + movlps %xmm0, -30 * SIZE(Y) + movlps %xmm1, -30 * SIZE(X) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L37: + testq $1, M + jle .L39 + + movss -30 * SIZE(X), %xmm0 + movss -30 * SIZE(Y), %xmm1 + + movss %xmm0, -30 * SIZE(Y) + movss %xmm1, -30 * SIZE(X) + ALIGN_3 + +.L39: + xorq %rax,%rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + ALIGN_3 + +.L40: + movaps -35 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + + subq $3, M + + movq M, %rax + sarq $5, %rax + jle .L43 + ALIGN_4 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps -27 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -27 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -23 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -23 * SIZE(X) + + movaps -19 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -19 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -15 * SIZE(X), %xmm2 + movaps -12 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -11 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -11 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -7 * SIZE(X), %xmm2 + movaps -4 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -7 * SIZE(X) + + movaps -3 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -3 * SIZE(X) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L41 + ALIGN_3 + +.L43: + testq $16, M + jle .L44 + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps -27 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -27 * SIZE(X) + + movaps -23 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -23 * SIZE(X) + + movaps -19 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -19 * SIZE(X) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L44: + testq $8, M + jle .L45 + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps -27 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -27 * SIZE(X) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L45: + testq $4, M + jle .L46 + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L46: + movsd -31 * SIZE(X), %xmm2 + + pshufd $0x39, %xmm1, %xmm1 + movlps %xmm1, -31 * SIZE(X) + + pshufd $0xff, %xmm0, %xmm0 + + movss %xmm0, -32 * SIZE(Y) + movlps %xmm2, -31 * SIZE(Y) + + addq $3 * SIZE, X + addq $3 * SIZE, Y + + testq $2, M + jle .L47 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm1 + + movlps %xmm0, -32 * SIZE(Y) + movlps %xmm1, -32 * SIZE(X) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L47: + testq $1, M + jle .L49 + + movss -32 * SIZE(X), %xmm0 + movss -32 * SIZE(Y), %xmm1 + + movss %xmm0, -32 * SIZE(Y) + movss %xmm1, -32 * SIZE(X) + ALIGN_3 + +.L49: + xorq %rax,%rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + ALIGN_3 + +.L50: + movq M, %rax + sarq $2, %rax + jle .L55 + ALIGN_3 + +.L51: + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movlps %xmm1, (X) + addq INCX, X + movlps %xmm0, (Y) + addq INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movlps %xmm1, (X) + addq INCX, X + movlps %xmm0, (Y) + addq INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movlps %xmm1, (X) + addq INCX, X + movlps %xmm0, (Y) + addq INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movlps %xmm1, (X) + addq INCX, X + movlps %xmm0, (Y) + addq INCY, Y + + decq %rax + jg .L51 + ALIGN_3 + +.L55: + movq M, %rax + andq $3, %rax + jle .L57 + ALIGN_3 + +.L56: + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movlps %xmm1, (X) + addq INCX, X + movlps %xmm0, (Y) + addq INCY, Y + + decq %rax + jg .L56 + ALIGN_3 + +.L57: + xorq %rax, %rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + ALIGN_3 + + EPILOGUE diff --git a/kernel/x86_64/zswap_sse2.S b/kernel/x86_64/zswap_sse2.S new file mode 100644 index 0000000..c505014 --- /dev/null +++ b/kernel/x86_64/zswap_sse2.S @@ -0,0 +1,999 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 /* rdi */ +#define X ARG4 +#define INCX ARG5 +#define Y ARG6 +#define INCY ARG2 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#define INCY %rbx +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifndef WINDOWS_ABI + movq 8(%rsp), INCY +#else + pushq %rbx + + movq 56(%rsp), X + movq 64(%rsp), INCX + movq 72(%rsp), Y + movq 80(%rsp), INCY +#endif + + SAVEREGISTERS + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + testq M, M + jle .L19 + + cmpq $2 * SIZE, INCX + jne .L50 + cmpq $2 * SIZE, INCY + jne .L50 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + testq $SIZE, Y + jne .L30 + + testq $SIZE, X + jne .L20 + + movq M, %rax + sarq $3, %rax + jle .L13 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -10 * SIZE(Y), %xmm1 + + movaps %xmm0, -10 * SIZE(Y) + movaps %xmm1, -10 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -8 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movaps %xmm0, -8 * SIZE(Y) + movaps %xmm1, -8 * SIZE(X) + + movaps -6 * SIZE(X), %xmm0 + movaps -6 * SIZE(Y), %xmm1 + + movaps %xmm0, -6 * SIZE(Y) + movaps %xmm1, -6 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -4 * SIZE(X), %xmm0 + movaps -4 * SIZE(Y), %xmm1 + + movaps %xmm0, -4 * SIZE(Y) + movaps %xmm1, -4 * SIZE(X) + + movaps -2 * SIZE(X), %xmm0 + movaps -2 * SIZE(Y), %xmm1 + + movaps %xmm0, -2 * SIZE(Y) + movaps %xmm1, -2 * SIZE(X) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + + decq %rax + jg .L11 + ALIGN_3 + +.L13: + testq $4, M + jle .L14 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -10 * SIZE(Y), %xmm1 + + movaps %xmm0, -10 * SIZE(Y) + movaps %xmm1, -10 * SIZE(X) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L14: + testq $2, M + jle .L15 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L15: + testq $1, M + jle .L19 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L19: + xorq %rax,%rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + ret + ALIGN_3 + +.L20: + movhps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movlps %xmm1, -16 * SIZE(X) + decq M + jle .L29 + + movq M, %rax + sarq $3, %rax + jle .L23 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -13 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -11 * SIZE(X), %xmm2 + movaps -10 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -12 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -11 * SIZE(X) + + movaps -9 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -10 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -7 * SIZE(X), %xmm2 + movaps -6 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -7 * SIZE(X) + + movaps -5 * SIZE(X), %xmm0 + movaps -4 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -6 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -5 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -3 * SIZE(X), %xmm2 + movaps -2 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -4 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -3 * SIZE(X) + + movaps -1 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -2 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -1 * SIZE(X) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + decq %rax + jg .L21 + ALIGN_3 + +.L23: + testq $4, M + jle .L24 + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -13 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(X) + + movaps -11 * SIZE(X), %xmm2 + movaps -10 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -12 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -11 * SIZE(X) + + movaps -9 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -10 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(X) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L24: + testq $2, M + jle .L25 + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -13 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(X) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L25: + testq $1, M + jle .L29 + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L29: + movaps -15 * SIZE(X), %xmm2 + + movhps %xmm1, -15 * SIZE(X) + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + xorq %rax,%rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + ALIGN_3 + +.L30: + testq $SIZE, X + jne .L40 + + movhps -16 * SIZE(Y), %xmm0 + movaps -16 * SIZE(X), %xmm1 + + movlps %xmm1, -16 * SIZE(Y) + decq M + jle .L39 + + movq M, %rax + sarq $3, %rax + jle .L33 + ALIGN_4 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -15 * SIZE(Y), %xmm2 + movaps -14 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(X) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(Y) + + movaps -13 * SIZE(Y), %xmm0 + movaps -12 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(X) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -11 * SIZE(Y), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -12 * SIZE(X) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -11 * SIZE(Y) + + movaps -9 * SIZE(Y), %xmm0 + movaps -8 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -10 * SIZE(X) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -7 * SIZE(Y), %xmm2 + movaps -6 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -8 * SIZE(X) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -7 * SIZE(Y) + + movaps -5 * SIZE(Y), %xmm0 + movaps -4 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -6 * SIZE(X) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -5 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -3 * SIZE(Y), %xmm2 + movaps -2 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -4 * SIZE(X) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -3 * SIZE(Y) + + movaps -1 * SIZE(Y), %xmm0 + movaps 0 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -2 * SIZE(X) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -1 * SIZE(Y) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + decq %rax + jg .L31 + ALIGN_3 + +.L33: + testq $4, M + jle .L34 + + movaps -15 * SIZE(Y), %xmm2 + movaps -14 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(X) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(Y) + + movaps -13 * SIZE(Y), %xmm0 + movaps -12 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(X) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(Y) + + movaps -11 * SIZE(Y), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -12 * SIZE(X) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -11 * SIZE(Y) + + movaps -9 * SIZE(Y), %xmm0 + movaps -8 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -10 * SIZE(X) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L34: + testq $2, M + jle .L35 + + movaps -15 * SIZE(Y), %xmm2 + movaps -14 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(X) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(Y) + + movaps -13 * SIZE(Y), %xmm0 + movaps -12 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(X) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L35: + testq $1, M + jle .L39 + + movaps -15 * SIZE(Y), %xmm2 + movaps -14 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(Y) + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L39: + movaps -15 * SIZE(Y), %xmm2 + + movhps %xmm1, -15 * SIZE(Y) + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(X) + + xorq %rax,%rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + ALIGN_3 + +.L40: + movsd -16 * SIZE(X), %xmm0 + movsd -16 * SIZE(Y), %xmm1 + + movlps %xmm0, -16 * SIZE(Y) + movlps %xmm1, -16 * SIZE(X) + + addq $SIZE, X + addq $SIZE, Y + decq M + jle .L49 + + movq M, %rax + sarq $3, %rax + jle .L43 + ALIGN_3 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -10 * SIZE(Y), %xmm1 + + movaps %xmm0, -10 * SIZE(Y) + movaps %xmm1, -10 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -8 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movaps %xmm0, -8 * SIZE(Y) + movaps %xmm1, -8 * SIZE(X) + + movaps -6 * SIZE(X), %xmm0 + movaps -6 * SIZE(Y), %xmm1 + + movaps %xmm0, -6 * SIZE(Y) + movaps %xmm1, -6 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -4 * SIZE(X), %xmm0 + movaps -4 * SIZE(Y), %xmm1 + + movaps %xmm0, -4 * SIZE(Y) + movaps %xmm1, -4 * SIZE(X) + + movaps -2 * SIZE(X), %xmm0 + movaps -2 * SIZE(Y), %xmm1 + + movaps %xmm0, -2 * SIZE(Y) + movaps %xmm1, -2 * SIZE(X) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + + decq %rax + jg .L41 + ALIGN_3 + +.L43: + testq $4, M + jle .L44 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -10 * SIZE(Y), %xmm1 + + movaps %xmm0, -10 * SIZE(Y) + movaps %xmm1, -10 * SIZE(X) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L44: + testq $2, M + jle .L45 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L45: + testq $1, M + jle .L49 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L49: + movsd -16 * SIZE(X), %xmm0 + movsd -16 * SIZE(Y), %xmm1 + + movlps %xmm0, -16 * SIZE(Y) + movlps %xmm1, -16 * SIZE(X) + + xorq %rax,%rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + ret + ALIGN_3 + +.L50: + testq $SIZE, X + jne .L60 + testq $SIZE, Y + jne .L60 + + movq M, %rax + sarq $2, %rax + jle .L55 + ALIGN_3 + +.L51: + movaps (X), %xmm0 + movaps (Y), %xmm1 + + movaps %xmm1, (X) + addq INCX, X + movaps %xmm0, (Y) + addq INCY, Y + + movaps (X), %xmm0 + movaps (Y), %xmm1 + + movaps %xmm1, (X) + addq INCX, X + movaps %xmm0, (Y) + addq INCY, Y + + movaps (X), %xmm0 + movaps (Y), %xmm1 + + movaps %xmm1, (X) + addq INCX, X + movaps %xmm0, (Y) + addq INCY, Y + + movaps (X), %xmm0 + movaps (Y), %xmm1 + + movaps %xmm1, (X) + addq INCX, X + movaps %xmm0, (Y) + addq INCY, Y + + decq %rax + jg .L51 + ALIGN_3 + +.L55: + movq M, %rax + andq $3, %rax + jle .L57 + ALIGN_3 + +.L56: + movaps (X), %xmm0 + movaps (Y), %xmm1 + + movaps %xmm1, (X) + addq INCX, X + movaps %xmm0, (Y) + addq INCY, Y + + decq %rax + jg .L56 + ALIGN_3 + +.L57: + xorq %rax, %rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + ALIGN_3 + +.L60: + movq M, %rax + sarq $2, %rax + jle .L65 + ALIGN_3 + +.L61: + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + + movlps %xmm1, 0 * SIZE(X) + movhps %xmm1, 1 * SIZE(X) + addq INCX, X + movlps %xmm0, 0 * SIZE(Y) + movhps %xmm0, 1 * SIZE(Y) + addq INCY, Y + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + + movlps %xmm1, 0 * SIZE(X) + movhps %xmm1, 1 * SIZE(X) + addq INCX, X + movlps %xmm0, 0 * SIZE(Y) + movhps %xmm0, 1 * SIZE(Y) + addq INCY, Y + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + + movlps %xmm1, 0 * SIZE(X) + movhps %xmm1, 1 * SIZE(X) + addq INCX, X + movlps %xmm0, 0 * SIZE(Y) + movhps %xmm0, 1 * SIZE(Y) + addq INCY, Y + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + + movlps %xmm1, 0 * SIZE(X) + movhps %xmm1, 1 * SIZE(X) + addq INCX, X + movlps %xmm0, 0 * SIZE(Y) + movhps %xmm0, 1 * SIZE(Y) + addq INCY, Y + + decq %rax + jg .L61 + ALIGN_3 + +.L65: + movq M, %rax + andq $3, %rax + jle .L67 + ALIGN_3 + +.L66: + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + + movlps %xmm1, 0 * SIZE(X) + movhps %xmm1, 1 * SIZE(X) + addq INCX, X + movlps %xmm0, 0 * SIZE(Y) + movhps %xmm0, 1 * SIZE(Y) + addq INCY, Y + + decq %rax + jg .L66 + ALIGN_3 + +.L67: + xorq %rax, %rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + + EPILOGUE diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S new file mode 100644 index 0000000..39f0ff4 --- /dev/null +++ b/kernel/x86_64/zsymv_L_sse.S @@ -0,0 +1,814 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ATOM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef CORE2 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 28) +#endif + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 12) +#define movsd movlpd +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 80 + +#define OLD_Y 8 + STACKSIZE(%rsp) +#define OLD_INCY 16 + STACKSIZE(%rsp) +#define OLD_BUFFER 24 + STACKSIZE(%rsp) + +#define M ARG1 +#define N ARG2 +#define A ARG3 +#define LDA ARG4 +#define X ARG5 +#define INCX ARG6 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_LDA 48 + STACKSIZE(%rsp) +#define OLD_X 56 + STACKSIZE(%rsp) +#define OLD_INCX 64 + STACKSIZE(%rsp) +#define OLD_Y 72 + STACKSIZE(%rsp) +#define OLD_INCY 80 + STACKSIZE(%rsp) +#define OLD_BUFFER 88 + STACKSIZE(%rsp) + +#define M ARG1 +#define N ARG2 +#define A ARG4 +#define LDA ARG3 +#define X %rdi +#define INCX %rsi +#endif + +#define Y %r10 +#define INCY %r11 +#define BUFFER %r12 + +#define TEMP %rax +#define I %rax +#define A1 %rbx +#define A2 %rbp +#define XX %r13 +#define YY %r14 +#define IS %r15 +#define NEW_X BUFFER +#define NEW_Y X + +#define ALPHA_R %xmm0 +#define ALPHA_I %xmm1 + +#define xtemp1 %xmm0 +#define xtemp2 %xmm1 +#define xtemp3 %xmm2 +#define xtemp4 %xmm3 + +#define atemp1 %xmm4 +#define atemp2 %xmm5 +#define atemp3 %xmm6 +#define atemp4 %xmm7 + +#define xsum1 %xmm8 +#define xsum2 %xmm9 +#define yy1 %xmm10 +#define yy2 %xmm11 + +#define a1 %xmm12 +#define a2 %xmm13 +#define a3 %xmm14 +#define xt1 %xmm15 + +#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) +#define MOVDDUP(a, b, c) movddup a(b), c +#define MOVDDUP2(a, b, c) movddup a##b, c +#else +#define MOVDDUP(a, b, c) movlpd a(b), c;movhpd a(b), c +#define MOVDDUP2(a, b, c) movlpd a##b, c;movhpd a##b, c +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + salq $ZBASE_SHIFT, LDA + + testq M, M + jle .L999 + + pcmpeqb %xmm2, %xmm2 + xorpd %xmm3, %xmm3 + psllq $63, %xmm2 + unpcklpd %xmm3, %xmm2 + + unpcklpd ALPHA_I, ALPHA_R + unpcklpd ALPHA_R, ALPHA_I + xorpd %xmm2, ALPHA_I + + movq BUFFER, XX + + movq M, %rax + sarq $2, %rax + jle .L02 + ALIGN_3 + +.L01: + MOVDDUP(0 * SIZE, X, %xmm3) + MOVDDUP(1 * SIZE, X, %xmm4) + addq INCX, X + MOVDDUP(0 * SIZE, X, %xmm5) + MOVDDUP(1 * SIZE, X, %xmm6) + addq INCX, X + + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm4 + mulpd ALPHA_R, %xmm5 + mulpd ALPHA_I, %xmm6 + + addpd %xmm4, %xmm3 + addpd %xmm6, %xmm5 + + movapd %xmm3, 0 * SIZE(XX) + SHUFPD_1 %xmm3, %xmm3 + pxor %xmm2, %xmm3 + movapd %xmm3, 2 * SIZE(XX) + + movapd %xmm5, 4 * SIZE(XX) + SHUFPD_1 %xmm5, %xmm5 + pxor %xmm2, %xmm5 + movapd %xmm5, 6 * SIZE(XX) + + MOVDDUP(0 * SIZE, X, %xmm3) + MOVDDUP(1 * SIZE, X, %xmm4) + addq INCX, X + MOVDDUP(0 * SIZE, X, %xmm5) + MOVDDUP(1 * SIZE, X, %xmm6) + addq INCX, X + + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm4 + mulpd ALPHA_R, %xmm5 + mulpd ALPHA_I, %xmm6 + + addpd %xmm4, %xmm3 + addpd %xmm6, %xmm5 + + movapd %xmm3, 8 * SIZE(XX) + SHUFPD_1 %xmm3, %xmm3 + pxor %xmm2, %xmm3 + movapd %xmm3, 10 * SIZE(XX) + + movapd %xmm5, 12 * SIZE(XX) + SHUFPD_1 %xmm5, %xmm5 + pxor %xmm2, %xmm5 + movapd %xmm5, 14 * SIZE(XX) + + subq $-16 * SIZE, XX + decq %rax + jg .L01 + ALIGN_3 + +.L02: + movq M, %rax + andq $3, %rax + jle .L05 + ALIGN_3 + +.L03: + MOVDDUP(0 * SIZE, X, %xmm3) + MOVDDUP(1 * SIZE, X, %xmm4) + addq INCX, X + + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm4 + + addpd %xmm4, %xmm3 + + movapd %xmm3, 0 * SIZE(XX) + SHUFPD_1 %xmm3, %xmm3 + pxor %xmm2, %xmm3 + movapd %xmm3, 2 * SIZE(XX) + + addq $4 * SIZE, XX + decq %rax + jg .L03 + ALIGN_3 + +.L05: + /* now we don't need original X */ + movq Y, NEW_Y + + addq $512, XX + andq $-512, XX + + cmpq $2 * SIZE, INCY + je .L10 + + movq Y, YY + movq XX, NEW_Y + + movq M, %rax + sarq $2, %rax + jle .L07 + ALIGN_3 + +.L06: + movsd 0 * SIZE(YY), %xmm0 + movhpd 1 * SIZE(YY), %xmm0 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm1 + movhpd 1 * SIZE(YY), %xmm1 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm2 + movhpd 1 * SIZE(YY), %xmm2 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm3 + movhpd 1 * SIZE(YY), %xmm3 + addq INCY, YY + + movapd %xmm0, 0 * SIZE(XX) + movapd %xmm1, 2 * SIZE(XX) + movapd %xmm2, 4 * SIZE(XX) + movapd %xmm3, 6 * SIZE(XX) + + addq $8 * SIZE, XX + decq %rax + jg .L06 + ALIGN_3 + +.L07: + movq M, %rax + andq $3, %rax + jle .L10 + ALIGN_3 + +.L08: + movsd 0 * SIZE(YY), %xmm0 + movhpd 1 * SIZE(YY), %xmm0 + addq INCY, YY + + movapd %xmm0, 0 * SIZE(XX) + + addq $2 * SIZE, XX + decq %rax + jg .L08 + ALIGN_3 + +.L10: + xorq IS, IS # is = 0 + + cmpq $2, N + jl .L20 + ALIGN_3 + +.L11: + movq A, A1 + leaq (A, LDA, 1), A2 + leaq 4 * SIZE(A, LDA, 2), A + + leaq (, IS, SIZE), I + + leaq 0 * SIZE(NEW_X, I, 4), XX + leaq 4 * SIZE(NEW_Y, I, 2), YY + + movapd 0 * SIZE(XX), atemp1 + movapd 2 * SIZE(XX), atemp2 + movapd 4 * SIZE(XX), atemp3 + movapd 6 * SIZE(XX), atemp4 + + MOVDDUP(0 * SIZE, A1, xsum1) + MOVDDUP(2 * SIZE, A1, xsum2) + + mulpd atemp1, xsum1 + mulpd atemp1, xsum2 + + MOVDDUP(1 * SIZE, A1, a1) + MOVDDUP(3 * SIZE, A1, a2) + + mulpd atemp2, a1 + mulpd atemp2, a2 + addpd a1, xsum1 + addpd a2, xsum2 + + MOVDDUP(2 * SIZE, A1, a1) + MOVDDUP(2 * SIZE, A2, a2) + + mulpd atemp3, a1 + mulpd atemp3, a2 + addpd a1, xsum1 + addpd a2, xsum2 + + MOVDDUP(3 * SIZE, A1, a1) + MOVDDUP(3 * SIZE, A2, a2) + + mulpd atemp4, a1 + mulpd atemp4, a2 + addpd a1, xsum1 + addpd a2, xsum2 + + MOVDDUP(4 * SIZE, A1, a1) + MOVDDUP(6 * SIZE, A2, a2) + + movsd 0 * SIZE(YY), yy1 + movhpd 1 * SIZE(YY), yy1 + movsd 2 * SIZE(YY), yy2 + movhpd 3 * SIZE(YY), yy2 + + movapd 8 * SIZE(XX), xtemp1 + movapd 10 * SIZE(XX), xtemp2 + movapd 12 * SIZE(XX), xtemp3 + movapd 14 * SIZE(XX), xtemp4 + + addq $8 * SIZE, XX + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + + movq M, I + subq IS, I + subq $2, I + sarq $2, I + jle .L15 + ALIGN_3 + +.L12: + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + MOVDDUP(1 * SIZE, A1, a1) + + PREFETCH PREFETCHSIZE(A1) + + movapd xtemp3, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy2 + MOVDDUP(3 * SIZE, A2, a2) + + movapd xtemp2, xt1 + mulpd a1, xt1 + mulpd atemp2, a1 + addpd xt1, xsum1 + addpd a1, yy1 + MOVDDUP(2 * SIZE, A1, a1) + + movapd xtemp4, xt1 + mulpd a2, xt1 + mulpd atemp4, a2 + addpd xt1, xsum2 + addpd a2, yy2 + MOVDDUP(0 * SIZE, A2, a2) + + PREFETCH PREFETCHSIZE(XX) + + movapd xtemp3, xt1 + movapd 12 * SIZE(XX), xtemp3 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy2 + MOVDDUP(3 * SIZE, A1, a1) + + movapd xtemp1, xt1 + movapd 8 * SIZE(XX), xtemp1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy1 + MOVDDUP(1 * SIZE, A2, a2) + + movapd xtemp4, xt1 + movapd 14 * SIZE(XX), xtemp4 + mulpd a1, xt1 + mulpd atemp2, a1 + addpd xt1, xsum1 + addpd a1, yy2 + MOVDDUP(4 * SIZE, A1, a1) + + movlpd yy2, 2 * SIZE(YY) + movhpd yy2, 3 * SIZE(YY) + movsd 6 * SIZE(YY), yy2 + movhpd 7 * SIZE(YY), yy2 + + movapd xtemp2, xt1 + movapd 10 * SIZE(XX), xtemp2 + mulpd a2, xt1 + mulpd atemp4, a2 + addpd xt1, xsum2 + addpd a2, yy1 + MOVDDUP(6 * SIZE, A2, a2) + + PREFETCH PREFETCHSIZE(A2) + + movlpd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhpd 5 * SIZE(YY), yy1 + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + MOVDDUP(5 * SIZE, A1, a1) + + movapd xtemp3, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy2 + MOVDDUP(7 * SIZE, A2, a2) + + movapd xtemp2, xt1 + mulpd a1, xt1 + mulpd atemp2, a1 + addpd xt1, xsum1 + addpd a1, yy1 + MOVDDUP(6 * SIZE, A1, a1) + + PREFETCHW PREFETCHSIZE(YY) + + movapd xtemp4, xt1 + mulpd a2, xt1 + mulpd atemp4, a2 + addpd xt1, xsum2 + addpd a2, yy2 + MOVDDUP(4 * SIZE, A2, a2) + + movapd xtemp3, xt1 + movapd 20 * SIZE(XX), xtemp3 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy2 + MOVDDUP(7 * SIZE, A1, a1) + + movapd xtemp1, xt1 + movapd 16 * SIZE(XX), xtemp1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy1 + MOVDDUP(5 * SIZE, A2, a2) + + movapd xtemp4, xt1 + movapd 22 * SIZE(XX), xtemp4 + mulpd a1, xt1 + mulpd atemp2, a1 + addpd xt1, xsum1 + addpd a1, yy2 + MOVDDUP( 8 * SIZE, A1, a1) + + movlpd yy2, 6 * SIZE(YY) + movhpd yy2, 7 * SIZE(YY) + movsd 10 * SIZE(YY), yy2 + movhpd 11 * SIZE(YY), yy2 + + movapd xtemp2, xt1 + movapd 18 * SIZE(XX), xtemp2 + mulpd a2, xt1 + mulpd atemp4, a2 + addpd xt1, xsum2 + addpd a2, yy1 + MOVDDUP(10 * SIZE, A2, a2) + + movlpd yy1, 4 * SIZE(YY) + movhpd yy1, 5 * SIZE(YY) + movsd 8 * SIZE(YY), yy1 + movhpd 9 * SIZE(YY), yy1 + + subq $-16 * SIZE, XX + addq $ 8 * SIZE, YY + addq $ 8 * SIZE, A1 + addq $ 8 * SIZE, A2 + + decq I + jg .L12 + ALIGN_3 + +.L15: + movq M, I + subq IS, I + subq $2, I + testq $2, I + jle .L16 + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + MOVDDUP(1 * SIZE, A1, a1) + + movapd xtemp3, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy2 + MOVDDUP(3 * SIZE, A2, a2) + + movapd xtemp2, xt1 + mulpd a1, xt1 + mulpd atemp2, a1 + addpd xt1, xsum1 + addpd a1, yy1 + MOVDDUP(2 * SIZE, A1, a1) + + movapd xtemp4, xt1 + mulpd a2, xt1 + mulpd atemp4, a2 + addpd xt1, xsum2 + addpd a2, yy2 + MOVDDUP(0 * SIZE, A2, a2) + + movapd xtemp3, xt1 + movapd 12 * SIZE(XX), xtemp3 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy2 + MOVDDUP(3 * SIZE, A1, a1) + + movapd xtemp1, xt1 + movapd 8 * SIZE(XX), xtemp1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy1 + MOVDDUP(1 * SIZE, A2, a2) + + movapd xtemp4, xt1 + movapd 14 * SIZE(XX), xtemp4 + mulpd a1, xt1 + mulpd atemp2, a1 + addpd xt1, xsum1 + addpd a1, yy2 + MOVDDUP(4 * SIZE, A1, a1) + + movlpd yy2, 2 * SIZE(YY) + movhpd yy2, 3 * SIZE(YY) + movsd 6 * SIZE(YY), yy2 + movhpd 7 * SIZE(YY), yy2 + + movapd xtemp2, xt1 + movapd 10 * SIZE(XX), xtemp2 + mulpd a2, xt1 + mulpd atemp4, a2 + addpd xt1, xsum2 + addpd a2, yy1 + + movlpd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhpd 5 * SIZE(YY), yy1 + + addq $4 * SIZE, YY + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L16: + testq $1, M + jle .L18 + + MOVDDUP(1 * SIZE, A1, a2) + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + + MOVDDUP(0 * SIZE, A2, a1) + + movapd xtemp2, xt1 + mulpd a2, xt1 + mulpd atemp2, a2 + addpd xt1, xsum1 + addpd a2, yy1 + + MOVDDUP(1 * SIZE, A2, a2) + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp3, a1 + addpd xt1, xsum2 + addpd a1, yy1 + + movapd xtemp2, xt1 + mulpd a2, xt1 + mulpd atemp4, a2 + addpd xt1, xsum2 + addpd a2, yy1 + + movlpd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + ALIGN_3 + +.L18: + leaq (, IS, SIZE), I + + movsd 0 * SIZE(NEW_Y, I, 2), yy1 + movhpd 1 * SIZE(NEW_Y, I, 2), yy1 + movsd 2 * SIZE(NEW_Y, I, 2), yy2 + movhpd 3 * SIZE(NEW_Y, I, 2), yy2 + + addpd xsum1, yy1 + addpd xsum2, yy2 + + movlpd yy1, 0 * SIZE(NEW_Y, I, 2) + movhpd yy1, 1 * SIZE(NEW_Y, I, 2) + movlpd yy2, 2 * SIZE(NEW_Y, I, 2) + movhpd yy2, 3 * SIZE(NEW_Y, I, 2) + + addq $2, IS + + movq IS, I + addq $2, I + cmpq M, I + jle .L11 + ALIGN_3 + +.L20: + HALT + testq $1, N + jle .L990 + + leaq (, IS, SIZE), I + + movapd 0 * SIZE(NEW_X, I, 4), atemp1 + movapd 2 * SIZE(NEW_X, I, 4), atemp2 + + movsd 0 * SIZE(NEW_Y, I, 2), yy1 + movhpd 1 * SIZE(NEW_Y, I, 2), yy1 + + MOVDDUP(0 * SIZE, A, a1) + MOVDDUP(1 * SIZE, A, a2) + + mulpd atemp1, a1 + mulpd atemp2, a2 + addpd a1, yy1 + addpd a2, yy1 + + movlpd yy1, 0 * SIZE(NEW_Y, I, 2) + movhpd yy1, 1 * SIZE(NEW_Y, I, 2) + ALIGN_3 + +.L990: + cmpq $2 * SIZE, INCY + je .L999 + + movq M, %rax + sarq $2, %rax + jle .L997 + ALIGN_3 + +.L996: + movapd 0 * SIZE(NEW_Y), %xmm0 + movapd 2 * SIZE(NEW_Y), %xmm1 + movapd 4 * SIZE(NEW_Y), %xmm2 + movapd 6 * SIZE(NEW_Y), %xmm3 + + movsd %xmm0, 0 * SIZE(Y) + movhpd %xmm0, 1 * SIZE(Y) + addq INCY, Y + movsd %xmm1, 0 * SIZE(Y) + movhpd %xmm1, 1 * SIZE(Y) + addq INCY, Y + movsd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + addq INCY, Y + movsd %xmm3, 0 * SIZE(Y) + movhpd %xmm3, 1 * SIZE(Y) + addq INCY, Y + + addq $8 * SIZE, NEW_Y + decq %rax + jg .L996 + ALIGN_3 + +.L997: + movq M, %rax + andq $3, %rax + jle .L999 + ALIGN_3 + +.L998: + movapd 0 * SIZE(NEW_Y), %xmm0 + + movsd %xmm0, 0 * SIZE(Y) + movhpd %xmm0, 1 * SIZE(Y) + addq INCY, Y + + addq $2 * SIZE, NEW_Y + + decq %rax + jg .L998 + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + addq $STACKSIZE, %rsp + ret + EPILOGUE diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S new file mode 100644 index 0000000..7119077 --- /dev/null +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -0,0 +1,886 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ATOM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef CORE2 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 28) +#endif + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 12) +#define movsd movlpd +#endif + +#if defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 16) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 24) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 80 + +#define OLD_Y 8 + STACKSIZE(%rsp) +#define OLD_INCY 16 + STACKSIZE(%rsp) +#define OLD_BUFFER 24 + STACKSIZE(%rsp) + +#define M ARG1 +#define N ARG2 +#define A ARG3 +#define LDA ARG4 +#define X ARG5 +#define INCX ARG6 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_LDA 48 + STACKSIZE(%rsp) +#define OLD_X 56 + STACKSIZE(%rsp) +#define OLD_INCX 64 + STACKSIZE(%rsp) +#define OLD_Y 72 + STACKSIZE(%rsp) +#define OLD_INCY 80 + STACKSIZE(%rsp) +#define OLD_BUFFER 88 + STACKSIZE(%rsp) + +#define M ARG1 +#define N ARG2 +#define A ARG4 +#define LDA ARG3 +#define X %rdi +#define INCX %rsi + +#endif + +#define Y %r10 +#define INCY %r11 +#define BUFFER %r12 + +#define TEMP %rax +#define I %rax +#define A1 %rbx +#define A2 %rbp +#define XX %r13 +#define YY %r14 +#define IS %r15 +#define NEW_X BUFFER +#define NEW_Y X + +#define ALPHA_R %xmm0 +#define ALPHA_I %xmm1 + +#define xtemp1 %xmm0 +#define xtemp2 %xmm1 +#define xtemp3 %xmm2 +#define xtemp4 %xmm3 + +#define atemp1 %xmm4 +#define atemp2 %xmm5 +#define atemp3 %xmm6 +#define atemp4 %xmm7 + +#define xsum1 %xmm8 +#define xsum2 %xmm9 +#define yy1 %xmm10 +#define yy2 %xmm11 + +#define a1 %xmm12 +#define a2 %xmm13 +#define a3 %xmm14 +#define xt1 %xmm15 + +#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) +#define MOVDDUP(a, b, c) movddup a(b), c +#define MOVDDUP2(a, b, c) movddup a##b, c +#else +#define MOVDDUP(a, b, c) movlpd a(b), c;movhpd a(b), c +#define MOVDDUP2(a, b, c) movlpd a##b, c;movhpd a##b, c +#endif + +#ifndef HEMV +#define ADD addpd +#else +#define ADD subpd +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X + movq OLD_INCX, INCX + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 +#endif + + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + salq $ZBASE_SHIFT, LDA + + testq M, M + jle .L999 + + pcmpeqb %xmm2, %xmm2 + xorpd %xmm3, %xmm3 + psllq $63, %xmm2 + unpcklpd %xmm3, %xmm2 + + unpcklpd ALPHA_I, ALPHA_R + unpcklpd ALPHA_R, ALPHA_I + xorpd %xmm2, ALPHA_I + + movq BUFFER, XX + + movq M, %rax + sarq $2, %rax + jle .L02 + ALIGN_3 + +.L01: + MOVDDUP(0 * SIZE, X, %xmm3) + MOVDDUP(1 * SIZE, X, %xmm4) + addq INCX, X + MOVDDUP(0 * SIZE, X, %xmm5) + MOVDDUP(1 * SIZE, X, %xmm6) + addq INCX, X + + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm4 + mulpd ALPHA_R, %xmm5 + mulpd ALPHA_I, %xmm6 + + addpd %xmm4, %xmm3 + addpd %xmm6, %xmm5 + + movapd %xmm3, 0 * SIZE(XX) + SHUFPD_1 %xmm3, %xmm3 + pxor %xmm2, %xmm3 + movapd %xmm3, 2 * SIZE(XX) + + movapd %xmm5, 4 * SIZE(XX) + SHUFPD_1 %xmm5, %xmm5 + pxor %xmm2, %xmm5 + movapd %xmm5, 6 * SIZE(XX) + + MOVDDUP(0 * SIZE, X, %xmm3) + MOVDDUP(1 * SIZE, X, %xmm4) + addq INCX, X + MOVDDUP(0 * SIZE, X, %xmm5) + MOVDDUP(1 * SIZE, X, %xmm6) + addq INCX, X + + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm4 + mulpd ALPHA_R, %xmm5 + mulpd ALPHA_I, %xmm6 + + addpd %xmm4, %xmm3 + addpd %xmm6, %xmm5 + + movapd %xmm3, 8 * SIZE(XX) + SHUFPD_1 %xmm3, %xmm3 + pxor %xmm2, %xmm3 + movapd %xmm3, 10 * SIZE(XX) + + movapd %xmm5, 12 * SIZE(XX) + SHUFPD_1 %xmm5, %xmm5 + pxor %xmm2, %xmm5 + movapd %xmm5, 14 * SIZE(XX) + + subq $-16 * SIZE, XX + decq %rax + jg .L01 + ALIGN_3 + +.L02: + movq M, %rax + andq $3, %rax + jle .L05 + ALIGN_3 + +.L03: + MOVDDUP(0 * SIZE, X, %xmm3) + MOVDDUP(1 * SIZE, X, %xmm4) + addq INCX, X + + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm4 + + addpd %xmm4, %xmm3 + + movapd %xmm3, 0 * SIZE(XX) + SHUFPD_1 %xmm3, %xmm3 + pxor %xmm2, %xmm3 + movapd %xmm3, 2 * SIZE(XX) + + addq $4 * SIZE, XX + decq %rax + jg .L03 + ALIGN_3 + +.L05: + /* now we don't need original X */ + movq Y, NEW_Y + + addq $512, XX + andq $-512, XX + + cmpq $2 * SIZE, INCY + je .L10 + + movq Y, YY + movq XX, NEW_Y + + movq M, %rax + sarq $2, %rax + jle .L07 + ALIGN_3 + +.L06: + movsd 0 * SIZE(YY), %xmm0 + movhpd 1 * SIZE(YY), %xmm0 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm1 + movhpd 1 * SIZE(YY), %xmm1 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm2 + movhpd 1 * SIZE(YY), %xmm2 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm3 + movhpd 1 * SIZE(YY), %xmm3 + addq INCY, YY + + movapd %xmm0, 0 * SIZE(XX) + movapd %xmm1, 2 * SIZE(XX) + movapd %xmm2, 4 * SIZE(XX) + movapd %xmm3, 6 * SIZE(XX) + + addq $8 * SIZE, XX + decq %rax + jg .L06 + ALIGN_3 + +.L07: + movq M, %rax + andq $3, %rax + jle .L10 + ALIGN_3 + +.L08: + movsd 0 * SIZE(YY), %xmm0 + movhpd 1 * SIZE(YY), %xmm0 + addq INCY, YY + + movapd %xmm0, 0 * SIZE(XX) + + addq $2 * SIZE, XX + decq %rax + jg .L08 + ALIGN_3 + +.L10: + xorq IS, IS # is = 0 + + cmpq $2, N + jl .L20 + ALIGN_3 + +.L11: + movq A, A1 + leaq (A, LDA, 1), A2 + leaq 4 * SIZE(A, LDA, 2), A + + leaq (, IS, SIZE), I + + leaq 0 * SIZE(NEW_X, I, 4), XX + leaq 4 * SIZE(NEW_Y, I, 2), YY + + movapd 0 * SIZE(XX), atemp1 + movapd 2 * SIZE(XX), atemp2 + movapd 4 * SIZE(XX), atemp3 + movapd 6 * SIZE(XX), atemp4 + + MOVDDUP(0 * SIZE, A1, xsum1) + MOVDDUP(2 * SIZE, A1, xsum2) + + mulpd atemp1, xsum1 + mulpd atemp1, xsum2 + +#ifndef HEMV + MOVDDUP(1 * SIZE, A1, a1) + MOVDDUP(3 * SIZE, A1, a2) + + mulpd atemp2, a1 + mulpd atemp2, a2 + addpd a1, xsum1 + addpd a2, xsum2 +#else + MOVDDUP(3 * SIZE, A1, a2) + + mulpd atemp2, a2 + addpd a2, xsum2 +#endif + + MOVDDUP(2 * SIZE, A1, a1) + MOVDDUP(2 * SIZE, A2, a2) + + mulpd atemp3, a1 + mulpd atemp3, a2 + addpd a1, xsum1 + addpd a2, xsum2 + +#ifndef HEMV + MOVDDUP(3 * SIZE, A1, a1) + MOVDDUP(3 * SIZE, A2, a2) + + mulpd atemp4, a1 + mulpd atemp4, a2 + addpd a1, xsum1 + addpd a2, xsum2 +#else + MOVDDUP(3 * SIZE, A1, a1) + + mulpd atemp4, a1 + subpd a1, xsum1 +#endif + + MOVDDUP(4 * SIZE, A1, a1) + MOVDDUP(6 * SIZE, A2, a2) + + movsd 0 * SIZE(YY), yy1 + movhpd 1 * SIZE(YY), yy1 + movsd 2 * SIZE(YY), yy2 + movhpd 3 * SIZE(YY), yy2 + + movapd 8 * SIZE(XX), xtemp1 + movapd 10 * SIZE(XX), xtemp2 + movapd 12 * SIZE(XX), xtemp3 + movapd 14 * SIZE(XX), xtemp4 + + addq $8 * SIZE, XX + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + + movq M, I + subq IS, I + subq $2, I + sarq $2, I + jle .L15 + ALIGN_3 + +.L12: + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + MOVDDUP(1 * SIZE, A1, a1) + + PREFETCH PREFETCHSIZE(A1) + + movapd xtemp3, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy2 + MOVDDUP(3 * SIZE, A2, a2) + + movapd xtemp2, xt1 + mulpd a1, xt1 + mulpd atemp2, a1 + ADD xt1, xsum1 + addpd a1, yy1 + MOVDDUP(2 * SIZE, A1, a1) + + movapd xtemp4, xt1 + mulpd a2, xt1 + mulpd atemp4, a2 + ADD xt1, xsum2 + addpd a2, yy2 + MOVDDUP(0 * SIZE, A2, a2) + + PREFETCH PREFETCHSIZE(XX) + + movapd xtemp3, xt1 + movapd 12 * SIZE(XX), xtemp3 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy2 + MOVDDUP(3 * SIZE, A1, a1) + + movapd xtemp1, xt1 + movapd 8 * SIZE(XX), xtemp1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy1 + MOVDDUP(1 * SIZE, A2, a2) + + movapd xtemp4, xt1 + movapd 14 * SIZE(XX), xtemp4 + mulpd a1, xt1 + mulpd atemp2, a1 + ADD xt1, xsum1 + addpd a1, yy2 + MOVDDUP(4 * SIZE, A1, a1) + + movlpd yy2, 2 * SIZE(YY) + movhpd yy2, 3 * SIZE(YY) + movsd 6 * SIZE(YY), yy2 + movhpd 7 * SIZE(YY), yy2 + + movapd xtemp2, xt1 + movapd 10 * SIZE(XX), xtemp2 + mulpd a2, xt1 + mulpd atemp4, a2 + ADD xt1, xsum2 + addpd a2, yy1 + MOVDDUP(6 * SIZE, A2, a2) + + PREFETCH PREFETCHSIZE(A2) + + movlpd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhpd 5 * SIZE(YY), yy1 + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + MOVDDUP(5 * SIZE, A1, a1) + + movapd xtemp3, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy2 + MOVDDUP(7 * SIZE, A2, a2) + + movapd xtemp2, xt1 + mulpd a1, xt1 + mulpd atemp2, a1 + ADD xt1, xsum1 + addpd a1, yy1 + MOVDDUP(6 * SIZE, A1, a1) + + PREFETCHW PREFETCHSIZE(YY) + + movapd xtemp4, xt1 + mulpd a2, xt1 + mulpd atemp4, a2 + ADD xt1, xsum2 + addpd a2, yy2 + MOVDDUP(4 * SIZE, A2, a2) + + movapd xtemp3, xt1 + movapd 20 * SIZE(XX), xtemp3 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy2 + MOVDDUP(7 * SIZE, A1, a1) + + movapd xtemp1, xt1 + movapd 16 * SIZE(XX), xtemp1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy1 + MOVDDUP(5 * SIZE, A2, a2) + + movapd xtemp4, xt1 + movapd 22 * SIZE(XX), xtemp4 + mulpd a1, xt1 + mulpd atemp2, a1 + ADD xt1, xsum1 + addpd a1, yy2 + MOVDDUP( 8 * SIZE, A1, a1) + + movlpd yy2, 6 * SIZE(YY) + movhpd yy2, 7 * SIZE(YY) + movsd 10 * SIZE(YY), yy2 + movhpd 11 * SIZE(YY), yy2 + + movapd xtemp2, xt1 + movapd 18 * SIZE(XX), xtemp2 + mulpd a2, xt1 + mulpd atemp4, a2 + ADD xt1, xsum2 + addpd a2, yy1 + MOVDDUP(10 * SIZE, A2, a2) + + movlpd yy1, 4 * SIZE(YY) + movhpd yy1, 5 * SIZE(YY) + movsd 8 * SIZE(YY), yy1 + movhpd 9 * SIZE(YY), yy1 + + subq $-16 * SIZE, XX + addq $ 8 * SIZE, YY + addq $ 8 * SIZE, A1 + addq $ 8 * SIZE, A2 + + decq I + jg .L12 + ALIGN_3 + +.L15: + movq M, I + subq IS, I + subq $2, I + testq $2, I + jle .L16 + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + MOVDDUP(1 * SIZE, A1, a1) + + movapd xtemp3, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy2 + MOVDDUP(3 * SIZE, A2, a2) + + movapd xtemp2, xt1 + mulpd a1, xt1 + mulpd atemp2, a1 + ADD xt1, xsum1 + addpd a1, yy1 + MOVDDUP(2 * SIZE, A1, a1) + + movapd xtemp4, xt1 + mulpd a2, xt1 + mulpd atemp4, a2 + ADD xt1, xsum2 + addpd a2, yy2 + MOVDDUP(0 * SIZE, A2, a2) + + movapd xtemp3, xt1 + movapd 12 * SIZE(XX), xtemp3 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy2 + MOVDDUP(3 * SIZE, A1, a1) + + movapd xtemp1, xt1 + movapd 8 * SIZE(XX), xtemp1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy1 + MOVDDUP(1 * SIZE, A2, a2) + + movapd xtemp4, xt1 + movapd 14 * SIZE(XX), xtemp4 + mulpd a1, xt1 + mulpd atemp2, a1 + ADD xt1, xsum1 + addpd a1, yy2 + MOVDDUP(4 * SIZE, A1, a1) + + movlpd yy2, 2 * SIZE(YY) + movhpd yy2, 3 * SIZE(YY) + movsd 6 * SIZE(YY), yy2 + movhpd 7 * SIZE(YY), yy2 + + movapd xtemp2, xt1 + movapd 10 * SIZE(XX), xtemp2 + mulpd a2, xt1 + mulpd atemp4, a2 + ADD xt1, xsum2 + addpd a2, yy1 + + movlpd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhpd 5 * SIZE(YY), yy1 + + addq $4 * SIZE, YY + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L16: + testq $1, M + jle .L18 + + MOVDDUP(1 * SIZE, A1, a2) + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + + MOVDDUP(0 * SIZE, A2, a1) + + movapd xtemp2, xt1 + mulpd a2, xt1 + mulpd atemp2, a2 + ADD xt1, xsum1 + addpd a2, yy1 + + MOVDDUP(1 * SIZE, A2, a2) + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp3, a1 + addpd xt1, xsum2 + addpd a1, yy1 + + movapd xtemp2, xt1 + mulpd a2, xt1 + mulpd atemp4, a2 + ADD xt1, xsum2 + addpd a2, yy1 + + movlpd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + ALIGN_3 + +.L18: + leaq (, IS, SIZE), I + + movsd 0 * SIZE(NEW_Y, I, 2), yy1 + movhpd 1 * SIZE(NEW_Y, I, 2), yy1 + movsd 2 * SIZE(NEW_Y, I, 2), yy2 + movhpd 3 * SIZE(NEW_Y, I, 2), yy2 + + addpd xsum1, yy1 + addpd xsum2, yy2 + + movlpd yy1, 0 * SIZE(NEW_Y, I, 2) + movhpd yy1, 1 * SIZE(NEW_Y, I, 2) + movlpd yy2, 2 * SIZE(NEW_Y, I, 2) + movhpd yy2, 3 * SIZE(NEW_Y, I, 2) + + addq $2, IS + + movq IS, I + addq $2, I + cmpq N, I + jle .L11 + ALIGN_3 + +.L20: + testq $1, N + jle .L990 + + leaq (, IS, SIZE), I + + movapd 0 * SIZE(NEW_X, I, 4), atemp1 + movapd 2 * SIZE(NEW_X, I, 4), atemp2 + + movsd 0 * SIZE(NEW_Y, I, 2), yy1 + movhpd 1 * SIZE(NEW_Y, I, 2), yy1 + +#ifndef HEMV + MOVDDUP(0 * SIZE, A, a1) + MOVDDUP(1 * SIZE, A, a2) + + mulpd atemp1, a1 + mulpd atemp2, a2 + addpd a1, yy1 + addpd a2, yy1 +#else + MOVDDUP(0 * SIZE, A, a1) + + mulpd atemp1, a1 + addpd a1, yy1 +#endif + + movlpd yy1, 0 * SIZE(NEW_Y, I, 2) + movhpd yy1, 1 * SIZE(NEW_Y, I, 2) + ALIGN_3 + +.L990: + cmpq $2 * SIZE, INCY + je .L999 + + movq M, %rax + sarq $2, %rax + jle .L997 + ALIGN_3 + +.L996: + movapd 0 * SIZE(NEW_Y), %xmm0 + movapd 2 * SIZE(NEW_Y), %xmm1 + movapd 4 * SIZE(NEW_Y), %xmm2 + movapd 6 * SIZE(NEW_Y), %xmm3 + + movsd %xmm0, 0 * SIZE(Y) + movhpd %xmm0, 1 * SIZE(Y) + addq INCY, Y + movsd %xmm1, 0 * SIZE(Y) + movhpd %xmm1, 1 * SIZE(Y) + addq INCY, Y + movsd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + addq INCY, Y + movsd %xmm3, 0 * SIZE(Y) + movhpd %xmm3, 1 * SIZE(Y) + addq INCY, Y + + addq $8 * SIZE, NEW_Y + decq %rax + jg .L996 + ALIGN_3 + +.L997: + movq M, %rax + andq $3, %rax + jle .L999 + ALIGN_3 + +.L998: + movapd 0 * SIZE(NEW_Y), %xmm0 + + movsd %xmm0, 0 * SIZE(Y) + movhpd %xmm0, 1 * SIZE(Y) + addq INCY, Y + + addq $2 * SIZE, NEW_Y + + decq %rax + jg .L998 + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + EPILOGUE diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S new file mode 100644 index 0000000..175912c --- /dev/null +++ b/kernel/x86_64/zsymv_U_sse.S @@ -0,0 +1,594 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ATOM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef CORE2 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 28) +#endif + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 12) +#define movsd movlpd +#endif + +#if defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 16) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 14) +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 80 + +#define OLD_Y 8 + STACKSIZE(%rsp) +#define OLD_INCY 16 + STACKSIZE(%rsp) +#define OLD_BUFFER 24 + STACKSIZE(%rsp) + +#define M ARG1 +#define N ARG2 +#define A ARG3 +#define LDA ARG4 +#define X ARG5 +#define INCX ARG6 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_LDA 48 + STACKSIZE(%rsp) +#define OLD_X 56 + STACKSIZE(%rsp) +#define OLD_INCX 64 + STACKSIZE(%rsp) +#define OLD_Y 72 + STACKSIZE(%rsp) +#define OLD_INCY 80 + STACKSIZE(%rsp) +#define OLD_BUFFER 88 + STACKSIZE(%rsp) + +#define M ARG1 +#define N ARG2 +#define A ARG4 +#define LDA ARG3 +#define X %rdi +#define INCX %rsi +#endif + +#define Y %r10 +#define INCY %r11 +#define BUFFER %r12 + +#define TEMP %rax +#define I %rax +#define A1 %rbx +#define A2 %rbp +#define XX %r13 +#define YY %r14 +#define IS %r15 +#define NEW_X BUFFER +#define NEW_Y X + +#define ALPHA_R %xmm0 +#define ALPHA_I %xmm1 + +#define xsum1 %xmm0 +#define xsum2 %xmm1 +#define xsum3 %xmm2 +#define xsum4 %xmm3 + +#define atemp1 %xmm4 +#define atemp2 %xmm5 +#define atemp3 %xmm6 +#define atemp4 %xmm7 + +#define xtemp1 %xmm8 +#define xtemp2 %xmm9 +#define a1 %xmm10 +#define a2 %xmm11 + +#define a3 %xmm12 +#define yy1 %xmm13 +#define xt1 %xmm14 +#define xt2 %xmm15 + +#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) +#define MOVDDUP(a, b, c) movddup a(b), c +#define MOVDDUP2(a, b, c) movddup a##b, c +#else +#define MOVDDUP(a, b, c) movlpd a(b), c;movhpd a(b), c +#define MOVDDUP2(a, b, c) movlpd a##b, c;movhpd a##b, c +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X + movq OLD_INCX, INCX + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 +#endif + + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + salq $ZBASE_SHIFT, LDA + + testq M, M + jle .L999 + + negq IS + addq M, IS + + movq IS, TEMP + imulq LDA, TEMP + addq TEMP, A + + pcmpeqb %xmm3, %xmm3 + xorpd %xmm2, %xmm2 + pslld $31, %xmm3 + unpckhps %xmm3, %xmm2 + + shufps $0, ALPHA_R, ALPHA_R + shufps $0, ALPHA_I, ALPHA_I + movaps ALPHA_I, %xmm3 + + unpcklps ALPHA_R, ALPHA_I + unpcklps %xmm3, ALPHA_R + pxor %xmm2, ALPHA_R + + movq BUFFER, XX + + movq M, %rax + sarq $2, %rax + jle .L02 + ALIGN_3 + +.L01: + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + movhps 0 * SIZE(X), %xmm4 + addq INCX, X + movsd 0 * SIZE(X), %xmm6 + addq INCX, X + movhps 0 * SIZE(X), %xmm6 + addq INCX, X + + movsldup %xmm4, %xmm3 + movshdup %xmm4, %xmm4 + movsldup %xmm6, %xmm5 + movshdup %xmm6, %xmm6 + + mulps ALPHA_I, %xmm3 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm5 + mulps ALPHA_R, %xmm6 + + addps %xmm4, %xmm3 + addps %xmm6, %xmm5 + + movaps %xmm3, 4 * SIZE(XX) + movaps %xmm5, 12 * SIZE(XX) + + shufps $0xb1, %xmm3, %xmm3 + shufps $0xb1, %xmm5, %xmm5 + + pxor %xmm2, %xmm3 + pxor %xmm2, %xmm5 + + movaps %xmm3, 0 * SIZE(XX) + movaps %xmm5, 8 * SIZE(XX) + + subq $-16 * SIZE, XX + decq %rax + jg .L01 + ALIGN_3 + +.L02: + testq $2, M + jle .L03 + + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + movhps 0 * SIZE(X), %xmm4 + addq INCX, X + + movsldup %xmm4, %xmm3 + movshdup %xmm4, %xmm4 + + mulps ALPHA_I, %xmm3 + mulps ALPHA_R, %xmm4 + + addps %xmm4, %xmm3 + + movaps %xmm3, 4 * SIZE(XX) + + shufps $0xb1, %xmm3, %xmm3 + pxor %xmm2, %xmm3 + movaps %xmm3, 0 * SIZE(XX) + + subq $-8 * SIZE, XX + ALIGN_3 + +.L03: + testq $1, M + jle .L05 + + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + + movsldup %xmm4, %xmm3 + movshdup %xmm4, %xmm4 + + mulps ALPHA_I, %xmm3 + mulps ALPHA_R, %xmm4 + + addps %xmm4, %xmm3 + + movlps %xmm3, 2 * SIZE(XX) + + shufps $0xb1, %xmm3, %xmm3 + pxor %xmm2, %xmm3 + movlps %xmm3, 0 * SIZE(XX) + + subq $-4 * SIZE, XX + ALIGN_3 + +.L05: + /* now we don't need original X */ + movq Y, NEW_Y + + addq $512, XX + andq $-512, XX + + cmpq $2 * SIZE, INCY + je .L10 + + movq Y, YY + movq XX, NEW_Y + + movq M, %rax + sarq $2, %rax + jle .L07 + ALIGN_3 + +.L06: + movsd 0 * SIZE(YY), %xmm0 + addq INCY, YY + movhps 0 * SIZE(YY), %xmm0 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm1 + addq INCY, YY + movhps 0 * SIZE(YY), %xmm1 + addq INCY, YY + + movaps %xmm0, 0 * SIZE(XX) + movaps %xmm1, 8 * SIZE(XX) + + addq $8 * SIZE, XX + decq %rax + jg .L06 + ALIGN_3 + +.L07: + movq M, %rax + andq $3, %rax + jle .L10 + ALIGN_3 + +.L08: + movsd 0 * SIZE(YY), %xmm0 + addq INCY, YY + + movlps %xmm0, 0 * SIZE(XX) + + addq $2 * SIZE, XX + decq %rax + jg .L08 + ALIGN_3 + +.L10: + movq IS, I + addq $2, I + cmpq M, I + jg .L20 + ALIGN_3 + +.L11: + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + leaq (, IS, 4), I + + movsd 0 * SIZE(NEW_X, I, SIZE), atemp2 + movhps 4 * SIZE(NEW_X, I, SIZE), atemp2 + movsd 2 * SIZE(NEW_X, I, SIZE), atemp4 + movhps 6 * SIZE(NEW_X, I, SIZE), atemp4 + + pshufd $0xcc, atemp2, atemp1 + pshufd $0x99, atemp2, atemp2 + pshufd $0xcc, atemp4, atemp3 + pshufd $0x99, atemp4, atemp4 + + pxor xsum1, xsum1 + pxor xsum2, xsum2 + pxor xsum3, xsum3 + pxor xsum4, xsum4 + + movq NEW_X, XX + movq NEW_Y, YY + + movq IS, I + sarq $2, I + jle .L15 + ALIGN_3 + +.L12: + HALT + + subq $-16 * SIZE, XX + addq $ 8 * SIZE, YY + addq $ 8 * SIZE, A1 + addq $ 8 * SIZE, A2 + + decq I + jg .L12 + ALIGN_3 + +.L15: + testq $2, IS + jle .L18 + + movsd 0 * SIZE(YY), yy1 + movhps 2 * SIZE(YY), yy1 + + movaps 0 * SIZE(XX), xtemp1 + movaps 4 * SIZE(XX), xtemp2 + + movsd 0 * SIZE(A1), a1 + movhps 2 * SIZE(A1), a1 + + movaps xtemp1, xt1 + movaps xtemp2, xt2 + mulps a1, xt1 + mulps a1, xt2 + addps xt1, xsum1 + addps xt2, xsum2 + + pshufd $0xb1, a1, xt2 + mulps atemp1, a1 + mulps atemp2, xt2 + addps a1, yy1 + addps xt2, yy1 + + movsd 0 * SIZE(A2), a1 + movhps 2 * SIZE(A2), a1 + + movaps xtemp1, xt1 + movaps xtemp2, xt2 + mulps a1, xt1 + mulps a1, xt2 + addps xt1, xsum3 + addps xt2, xsum4 + + pshufd $0xb1, a1, xt2 + mulps atemp1, a1 + mulps atemp2, xt2 + addps a1, yy1 + addps xt2, yy1 + + movlps yy1, 0 * SIZE(YY) + movhps yy1, 2 * SIZE(YY) + + addq $8 * SIZE, XX + addq $4 * SIZE, YY + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L18: + leaq (, IS, 4), I + + movaps 0 * SIZE(NEW_X, I, SIZE), atemp1 + movaps 4 * SIZE(NEW_X, I, SIZE), atemp2 + + movlps 0 * SIZE(YY), yy1 + movhps 2 * SIZE(YY), yy1 + + movsd 0 * SIZE(A1), a1 + movhps 0 * SIZE(A2), a1 + + movaps a1, a2 + mulps atemp1, a1 + mulps atemp2, a2 + addps a1, xsum1 + addps a2, xsum2 + + movsd 0 * SIZE(A2), a1 + movhps 2 * SIZE(A2), a1 + + movaps a1, a2 + mulps atemp1, a1 + mulps atemp2, a2 + addps a1, xsum3 + addps a2, xsum4 + + haddps xsum2, xsum1 + haddps xsum4, xsum3 + + haddps xsum3, xsum1 + addps xsum1, yy1 + + movlps yy1, 0 * SIZE(YY) + movhps yy1, 2 * SIZE(YY) + + addq $2, IS + + movq IS, I + addq $2, I + cmpq M, I + jle .L11 + ALIGN_3 + +.L20: + testq $1, M + jle .L990 + + +.L990: + cmpq $2 * SIZE, INCY + je .L999 + + movq M, %rax + sarq $2, %rax + jle .L997 + ALIGN_3 + +.L996: + movaps 0 * SIZE(NEW_Y), %xmm0 + movaps 4 * SIZE(NEW_Y), %xmm1 + + movlps %xmm0, 0 * SIZE(Y) + addq INCY, Y + movhps %xmm0, 0 * SIZE(Y) + addq INCY, Y + movlps %xmm1, 0 * SIZE(Y) + addq INCY, Y + movhps %xmm1, 0 * SIZE(Y) + addq INCY, Y + + addq $8 * SIZE, NEW_Y + decq %rax + jg .L996 + ALIGN_3 + +.L997: + movq M, %rax + andq $3, %rax + jle .L999 + ALIGN_3 + +.L998: + movlps 0 * SIZE(NEW_Y), %xmm0 + addq $2 * SIZE, NEW_Y + + movlps %xmm0, 0 * SIZE(Y) + addq INCY, Y + + decq %rax + jg .L998 + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + addq $STACKSIZE, %rsp + ret + EPILOGUE diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S new file mode 100644 index 0000000..3e4b170 --- /dev/null +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -0,0 +1,916 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ATOM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef CORE2 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 28) +#endif + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 12) +#define movsd movlpd +#endif + +#if defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 16) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 24) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 28) +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 80 + +#define OLD_Y 8 + STACKSIZE(%rsp) +#define OLD_INCY 16 + STACKSIZE(%rsp) +#define OLD_BUFFER 24 + STACKSIZE(%rsp) + +#define M ARG1 +#define IS ARG2 +#define A ARG3 +#define LDA ARG4 +#define X ARG5 +#define INCX ARG6 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_LDA 48 + STACKSIZE(%rsp) +#define OLD_X 56 + STACKSIZE(%rsp) +#define OLD_INCX 64 + STACKSIZE(%rsp) +#define OLD_Y 72 + STACKSIZE(%rsp) +#define OLD_INCY 80 + STACKSIZE(%rsp) +#define OLD_BUFFER 88 + STACKSIZE(%rsp) + +#define M ARG1 +#define IS ARG2 +#define A ARG4 +#define LDA ARG3 +#define X %rdi +#define INCX %rsi + +#endif + +#define Y %r10 +#define INCY %r11 +#define BUFFER %r12 + +#define TEMP %rax +#define I %rax +#define A1 %rbx +#define A2 %rbp +#define XX %r13 +#define YY %r14 +#define NEW_X BUFFER +#define NEW_Y X + +#define ALPHA_R %xmm0 +#define ALPHA_I %xmm1 + +#define xtemp1 %xmm0 +#define xtemp2 %xmm1 +#define xtemp3 %xmm2 +#define xtemp4 %xmm3 + +#define atemp1 %xmm4 +#define atemp2 %xmm5 +#define atemp3 %xmm6 +#define atemp4 %xmm7 + +#define xsum1 %xmm8 +#define xsum2 %xmm9 +#define yy1 %xmm10 +#define yy2 %xmm11 + +#define a1 %xmm12 +#define a2 %xmm13 +#define a3 %xmm14 +#define xt1 %xmm15 + +#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) +#define MOVDDUP(a, b, c) movddup a(b), c +#define MOVDDUP2(a, b, c) movddup a##b, c +#else +#define MOVDDUP(a, b, c) movlpd a(b), c;movhpd a(b), c +#define MOVDDUP2(a, b, c) movlpd a##b, c;movhpd a##b, c +#endif + +#ifndef HEMV +#define ADD addpd +#else +#define ADD subpd +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X + movq OLD_INCX, INCX + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 +#endif + + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + salq $ZBASE_SHIFT, LDA + + testq M, M + jle .L999 + + negq IS + addq M, IS + + movq IS, TEMP + imulq LDA, TEMP + addq TEMP, A + + pcmpeqb %xmm2, %xmm2 + xorpd %xmm3, %xmm3 + psllq $63, %xmm2 + unpcklpd %xmm3, %xmm2 + + unpcklpd ALPHA_I, ALPHA_R + unpcklpd ALPHA_R, ALPHA_I + xorpd %xmm2, ALPHA_I + + movq BUFFER, XX + + movq M, %rax + sarq $2, %rax + jle .L02 + ALIGN_3 + +.L01: + MOVDDUP(0 * SIZE, X, %xmm3) + MOVDDUP(1 * SIZE, X, %xmm4) + addq INCX, X + MOVDDUP(0 * SIZE, X, %xmm5) + MOVDDUP(1 * SIZE, X, %xmm6) + addq INCX, X + + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm4 + mulpd ALPHA_R, %xmm5 + mulpd ALPHA_I, %xmm6 + + addpd %xmm4, %xmm3 + addpd %xmm6, %xmm5 + + movapd %xmm3, 0 * SIZE(XX) + SHUFPD_1 %xmm3, %xmm3 + pxor %xmm2, %xmm3 + movapd %xmm3, 2 * SIZE(XX) + + movapd %xmm5, 4 * SIZE(XX) + SHUFPD_1 %xmm5, %xmm5 + pxor %xmm2, %xmm5 + movapd %xmm5, 6 * SIZE(XX) + + MOVDDUP(0 * SIZE, X, %xmm3) + MOVDDUP(1 * SIZE, X, %xmm4) + addq INCX, X + MOVDDUP(0 * SIZE, X, %xmm5) + MOVDDUP(1 * SIZE, X, %xmm6) + addq INCX, X + + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm4 + mulpd ALPHA_R, %xmm5 + mulpd ALPHA_I, %xmm6 + + addpd %xmm4, %xmm3 + addpd %xmm6, %xmm5 + + movapd %xmm3, 8 * SIZE(XX) + SHUFPD_1 %xmm3, %xmm3 + pxor %xmm2, %xmm3 + movapd %xmm3, 10 * SIZE(XX) + + movapd %xmm5, 12 * SIZE(XX) + SHUFPD_1 %xmm5, %xmm5 + pxor %xmm2, %xmm5 + movapd %xmm5, 14 * SIZE(XX) + + subq $-16 * SIZE, XX + decq %rax + jg .L01 + ALIGN_3 + +.L02: + movq M, %rax + andq $3, %rax + jle .L05 + ALIGN_3 + +.L03: + MOVDDUP(0 * SIZE, X, %xmm3) + MOVDDUP(1 * SIZE, X, %xmm4) + addq INCX, X + + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm4 + + addpd %xmm4, %xmm3 + + movapd %xmm3, 0 * SIZE(XX) + SHUFPD_1 %xmm3, %xmm3 + pxor %xmm2, %xmm3 + movapd %xmm3, 2 * SIZE(XX) + + addq $4 * SIZE, XX + decq %rax + jg .L03 + ALIGN_3 + +.L05: + /* now we don't need original X */ + movq Y, NEW_Y + + addq $512, XX + andq $-512, XX + + cmpq $2 * SIZE, INCY + je .L10 + + movq Y, YY + movq XX, NEW_Y + + movq M, %rax + sarq $2, %rax + jle .L07 + ALIGN_3 + +.L06: + movsd 0 * SIZE(YY), %xmm0 + movhpd 1 * SIZE(YY), %xmm0 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm1 + movhpd 1 * SIZE(YY), %xmm1 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm2 + movhpd 1 * SIZE(YY), %xmm2 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm3 + movhpd 1 * SIZE(YY), %xmm3 + addq INCY, YY + + movapd %xmm0, 0 * SIZE(XX) + movapd %xmm1, 2 * SIZE(XX) + movapd %xmm2, 4 * SIZE(XX) + movapd %xmm3, 6 * SIZE(XX) + + addq $8 * SIZE, XX + decq %rax + jg .L06 + ALIGN_3 + +.L07: + movq M, %rax + andq $3, %rax + jle .L10 + ALIGN_3 + +.L08: + movsd 0 * SIZE(YY), %xmm0 + movhpd 1 * SIZE(YY), %xmm0 + addq INCY, YY + + movapd %xmm0, 0 * SIZE(XX) + + addq $2 * SIZE, XX + decq %rax + jg .L08 + ALIGN_3 + +.L10: + movq IS, I + addq $2, I + cmpq M, I + jg .L20 + ALIGN_3 + +.L11: + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + leaq (, IS, 4), I + + movapd 0 * SIZE(NEW_X, I, SIZE), atemp1 + movapd 2 * SIZE(NEW_X, I, SIZE), atemp2 + movapd 4 * SIZE(NEW_X, I, SIZE), atemp3 + movapd 6 * SIZE(NEW_X, I, SIZE), atemp4 + + pxor xsum1, xsum1 + pxor xsum2, xsum2 + + movsd 0 * SIZE(NEW_Y), yy1 + movhpd 1 * SIZE(NEW_Y), yy1 + movsd 2 * SIZE(NEW_Y), yy2 + movhpd 3 * SIZE(NEW_Y), yy2 + + movapd 0 * SIZE(NEW_X), xtemp1 + movapd 2 * SIZE(NEW_X), xtemp2 + movapd 4 * SIZE(NEW_X), xtemp3 + movapd 6 * SIZE(NEW_X), xtemp4 + + MOVDDUP(0 * SIZE, A1, a1) + MOVDDUP(2 * SIZE, A2, a2) + MOVDDUP(1 * SIZE, A1, a3) + + movq NEW_X, XX + movq NEW_Y, YY + + movq IS, I + sarq $2, I + jle .L15 + ALIGN_3 + +.L12: + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + MOVDDUP(3 * SIZE, A2, a1) + + PREFETCH PREFETCHSIZE(A1) + + movapd xtemp3, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy2 + MOVDDUP(2 * SIZE, A1, a2) + + movapd xtemp2, xt1 + mulpd a3, xt1 + mulpd atemp2, a3 + ADD xt1, xsum1 + addpd a3, yy1 + MOVDDUP(0 * SIZE, A2, a3) + + movapd xtemp4, xt1 + mulpd a1, xt1 + mulpd atemp4, a1 + ADD xt1, xsum2 + addpd a1, yy2 + MOVDDUP(3 * SIZE, A1, a1) + + PREFETCH PREFETCHSIZE(XX) + + movapd xtemp3, xt1 + movapd 12 * SIZE(XX), xtemp3 + mulpd a2, xt1 + mulpd atemp1, a2 + addpd xt1, xsum1 + addpd a2, yy2 + MOVDDUP(1 * SIZE, A2, a2) + + movapd xtemp1, xt1 + movapd 8 * SIZE(XX), xtemp1 + mulpd a3, xt1 + mulpd atemp3, a3 + addpd xt1, xsum2 + addpd a3, yy1 + MOVDDUP(4 * SIZE, A1, a3) + + movapd xtemp4, xt1 + movapd 14 * SIZE(XX), xtemp4 + mulpd a1, xt1 + mulpd atemp2, a1 + ADD xt1, xsum1 + addpd a1, yy2 + MOVDDUP(6 * SIZE, A2, a1) + + movlpd yy2, 2 * SIZE(YY) + movhpd yy2, 3 * SIZE(YY) + movsd 6 * SIZE(YY), yy2 + movhpd 7 * SIZE(YY), yy2 + + movapd xtemp2, xt1 + movapd 10 * SIZE(XX), xtemp2 + mulpd a2, xt1 + mulpd atemp4, a2 + ADD xt1, xsum2 + addpd a2, yy1 + MOVDDUP(5 * SIZE, A1, a2) + + PREFETCH PREFETCHSIZE(A2) + + movlpd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhpd 5 * SIZE(YY), yy1 + + movapd xtemp1, xt1 + mulpd a3, xt1 + mulpd atemp1, a3 + addpd xt1, xsum1 + addpd a3, yy1 + MOVDDUP(7 * SIZE, A2, a3) + + movapd xtemp3, xt1 + mulpd a1, xt1 + mulpd atemp3, a1 + addpd xt1, xsum2 + addpd a1, yy2 + MOVDDUP(6 * SIZE, A1, a1) + + movapd xtemp2, xt1 + mulpd a2, xt1 + mulpd atemp2, a2 + ADD xt1, xsum1 + addpd a2, yy1 + MOVDDUP(4 * SIZE, A2, a2) + + PREFETCHW PREFETCHSIZE(YY) + + movapd xtemp4, xt1 + mulpd a3, xt1 + mulpd atemp4, a3 + ADD xt1, xsum2 + addpd a3, yy2 + MOVDDUP(7 * SIZE, A1, a3) + + movapd xtemp3, xt1 + movapd 20 * SIZE(XX), xtemp3 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy2 + MOVDDUP(5 * SIZE, A2, a1) + + movapd xtemp1, xt1 + movapd 16 * SIZE(XX), xtemp1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy1 + MOVDDUP(10 * SIZE, A2, a2) + + movapd xtemp4, xt1 + movapd 22 * SIZE(XX), xtemp4 + mulpd a3, xt1 + mulpd atemp2, a3 + ADD xt1, xsum1 + addpd a3, yy2 + MOVDDUP( 9 * SIZE, A1, a3) + + movlpd yy2, 6 * SIZE(YY) + movhpd yy2, 7 * SIZE(YY) + movsd 10 * SIZE(YY), yy2 + movhpd 11 * SIZE(YY), yy2 + + movapd xtemp2, xt1 + movapd 18 * SIZE(XX), xtemp2 + mulpd a1, xt1 + mulpd atemp4, a1 + ADD xt1, xsum2 + addpd a1, yy1 + MOVDDUP( 8 * SIZE, A1, a1) + + movlpd yy1, 4 * SIZE(YY) + movhpd yy1, 5 * SIZE(YY) + movsd 8 * SIZE(YY), yy1 + movhpd 9 * SIZE(YY), yy1 + + subq $-16 * SIZE, XX + addq $ 8 * SIZE, YY + addq $ 8 * SIZE, A1 + addq $ 8 * SIZE, A2 + + decq I + jg .L12 + ALIGN_3 + +.L15: + testq $2, IS + jle .L18 + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + MOVDDUP(1 * SIZE, A1, a1) + + movapd xtemp3, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy2 + MOVDDUP(3 * SIZE, A2, a2) + + movapd xtemp2, xt1 + mulpd a1, xt1 + mulpd atemp2, a1 + ADD xt1, xsum1 + addpd a1, yy1 + MOVDDUP(2 * SIZE, A1, a1) + + movapd xtemp4, xt1 + mulpd a2, xt1 + mulpd atemp4, a2 + ADD xt1, xsum2 + addpd a2, yy2 + MOVDDUP(0 * SIZE, A2, a2) + + movapd xtemp3, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy2 + MOVDDUP(3 * SIZE, A1, a1) + + movapd xtemp1, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy1 + MOVDDUP(1 * SIZE, A2, a2) + + movapd xtemp4, xt1 + mulpd a1, xt1 + mulpd atemp2, a1 + ADD xt1, xsum1 + addpd a1, yy2 + + movlpd yy2, 2 * SIZE(YY) + movhpd yy2, 3 * SIZE(YY) + movsd 6 * SIZE(YY), yy2 + movhpd 7 * SIZE(YY), yy2 + + movapd xtemp2, xt1 + mulpd a2, xt1 + mulpd atemp4, a2 + ADD xt1, xsum2 + addpd a2, yy1 + + movlpd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhpd 5 * SIZE(YY), yy1 + + addq $4 * SIZE, YY + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L18: + MOVDDUP(0 * SIZE, A1, a1) + MOVDDUP(0 * SIZE, A2, a2) + + mulpd atemp1, a1 + mulpd atemp1, a2 + addpd a1, xsum1 + addpd a2, xsum2 + +#ifndef HEMV + MOVDDUP(1 * SIZE, A1, a1) + MOVDDUP(1 * SIZE, A2, a2) + + mulpd atemp2, a1 + mulpd atemp2, a2 + addpd a1, xsum1 + addpd a2, xsum2 +#else + MOVDDUP(1 * SIZE, A2, a2) + + mulpd atemp2, a2 + subpd a2, xsum2 +#endif + + MOVDDUP(0 * SIZE, A2, a1) + MOVDDUP(2 * SIZE, A2, a2) + + mulpd atemp3, a1 + mulpd atemp3, a2 + addpd a1, xsum1 + addpd a2, xsum2 + +#ifndef HEMV + MOVDDUP(1 * SIZE, A2, a1) + MOVDDUP(3 * SIZE, A2, a2) + + mulpd atemp4, a1 + mulpd atemp4, a2 + addpd a1, xsum1 + addpd a2, xsum2 +#else + MOVDDUP(1 * SIZE, A2, a1) + + mulpd atemp4, a1 + addpd a1, xsum1 +#endif + + addpd xsum1, yy1 + addpd xsum2, yy2 + + movlpd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movlpd yy2, 2 * SIZE(YY) + movhpd yy2, 3 * SIZE(YY) + + addq $2, IS + + movq IS, I + addq $2, I + cmpq M, I + jle .L11 + ALIGN_3 + +.L20: + testq $1, M + jle .L990 + + movq A, A1 + leaq (, IS, 4), I + + movapd 0 * SIZE(NEW_X, I, SIZE), atemp1 + movapd 2 * SIZE(NEW_X, I, SIZE), atemp2 + + pxor xsum1, xsum1 + pxor xsum2, xsum2 + + MOVDDUP(0 * SIZE, A1, a1) + MOVDDUP(1 * SIZE, A1, a2) + + movapd 0 * SIZE(NEW_X), xtemp1 + movapd 2 * SIZE(NEW_X), xtemp2 + movapd 4 * SIZE(NEW_X), xtemp3 + movapd 6 * SIZE(NEW_X), xtemp4 + + movsd 0 * SIZE(NEW_Y), yy1 + movhpd 1 * SIZE(NEW_Y), yy1 + movsd 2 * SIZE(NEW_Y), yy2 + movhpd 3 * SIZE(NEW_Y), yy2 + + movq NEW_X, XX + movq NEW_Y, YY + + movq IS, I + sarq $1, I + jle .L28 + ALIGN_3 + +.L22: + movapd xtemp1, xt1 + movapd 8 * SIZE(XX), xtemp1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + MOVDDUP(2 * SIZE, A1, a1) + + movapd xtemp2, xt1 + movapd 10 * SIZE(XX), xtemp2 + mulpd a2, xt1 + mulpd atemp2, a2 + ADD xt1, xsum2 + addpd a2, yy1 + MOVDDUP(3 * SIZE, A1, a2) + + movlpd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhpd 5 * SIZE(YY), yy1 + + movapd xtemp3, xt1 + movapd 12 * SIZE(XX), xtemp3 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy2 + MOVDDUP(4 * SIZE, A1, a1) + + movapd xtemp4, xt1 + movapd 14 * SIZE(XX), xtemp4 + mulpd a2, xt1 + mulpd atemp2, a2 + ADD xt1, xsum2 + addpd a2, yy2 + MOVDDUP(5 * SIZE, A1, a2) + + movlpd yy2, 2 * SIZE(YY) + movhpd yy2, 3 * SIZE(YY) + movsd 6 * SIZE(YY), yy2 + movhpd 7 * SIZE(YY), yy2 + + addq $8 * SIZE, XX + addq $4 * SIZE, YY + addq $4 * SIZE, A1 + + decq I + jg .L22 + ALIGN_3 + +.L28: + MOVDDUP(0 * SIZE, A1, a1) + +#ifndef HEMV + MOVDDUP(1 * SIZE, A1, a2) + + mulpd atemp1, a1 + mulpd atemp2, a2 + addpd a1, xsum1 + addpd a2, xsum2 + +#else + mulpd atemp1, a1 + addpd a1, xsum1 +#endif + + addpd xsum2, xsum1 + addpd xsum1, yy1 + + movlpd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + ALIGN_3 + +.L990: + cmpq $2 * SIZE, INCY + je .L999 + + movq M, %rax + sarq $2, %rax + jle .L997 + ALIGN_3 + +.L996: + movapd 0 * SIZE(NEW_Y), %xmm0 + movapd 2 * SIZE(NEW_Y), %xmm1 + movapd 4 * SIZE(NEW_Y), %xmm2 + movapd 6 * SIZE(NEW_Y), %xmm3 + + movsd %xmm0, 0 * SIZE(Y) + movhpd %xmm0, 1 * SIZE(Y) + addq INCY, Y + movsd %xmm1, 0 * SIZE(Y) + movhpd %xmm1, 1 * SIZE(Y) + addq INCY, Y + movsd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + addq INCY, Y + movsd %xmm3, 0 * SIZE(Y) + movhpd %xmm3, 1 * SIZE(Y) + addq INCY, Y + + addq $8 * SIZE, NEW_Y + decq %rax + jg .L996 + ALIGN_3 + +.L997: + movq M, %rax + andq $3, %rax + jle .L999 + ALIGN_3 + +.L998: + movapd 0 * SIZE(NEW_Y), %xmm0 + + movsd %xmm0, 0 * SIZE(Y) + movhpd %xmm0, 1 * SIZE(Y) + addq INCY, Y + + addq $2 * SIZE, NEW_Y + + decq %rax + jg .L998 + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x1_atom.S b/kernel/x86_64/ztrsm_kernel_LN_2x1_atom.S new file mode 100644 index 0000000..31bd57b --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LN_2x1_atom.S @@ -0,0 +1,995 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define BB %rbx +#define KK %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define KKK 56(%rsp) +#define AORIG 64(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define KKK 232(%rsp) +#define AORIG 240(%rsp) +#endif + + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 8 + 3) + +#ifndef CONJ +#define ADDSD1 addsd +#define ADDSD2 addsd +#define ADDSD3 addsd +#define ADDSD4 subsd + +#elif defined(LN) || defined(LT) +#define ADDSD1 addsd +#define ADDSD2 addsd +#define ADDSD3 subsd +#define ADDSD4 addsd +#else +#define ADDSD1 addsd +#define ADDSD2 subsd +#define ADDSD3 addsd +#define ADDSD4 addsd +#endif + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + movq KK, OFFSET + + salq $ZBASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, KK + subq OFFSET, KK +#endif + + movq N, J + testq N, N + jle .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, B + subq LDC, C +#endif + + movq C, CO1 + +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif +#ifdef LT + movq OFFSET, KK +#endif + + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax), BB + + testq $1, M + jle .L20 + +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + movsd 3 * SIZE(AO), %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADDSD2 %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + ADDSD1 %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + ADDSD3 %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm8 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + ADDSD3 %xmm7, %xmm10 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 6 * SIZE(BO), %xmm1 + + ADDSD1 %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + ADDSD3 %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 8 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm8 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + ADDSD3 %xmm7, %xmm10 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 9 * SIZE(BO), %xmm3 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + BRANCH + je .L29 + ALIGN_4 + +.L26: + ADDSD2 %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + mulsd %xmm3, %xmm2 + ADDSD1 %xmm0, %xmm8 + movsd 2 * SIZE(AO), %xmm0 + + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + ADDSD3 %xmm4, %xmm10 + movsd 3 * SIZE(AO), %xmm4 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L26 + ALIGN_4 + +.L29: + ADDSD2 %xmm2, %xmm9 + ADDSD4 %xmm6, %xmm11 + + addsd %xmm11, %xmm8 + addsd %xmm9, %xmm10 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm1 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm1 +#endif + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm1 + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AO), %xmm6 + movaps %xmm0, %xmm5 + movsd 1 * SIZE(AO), %xmm7 + movaps %xmm1, %xmm4 + + mulsd %xmm6, %xmm0 + mulsd %xmm6, %xmm1 + mulsd %xmm7, %xmm5 + mulsd %xmm7, %xmm4 + + ADDSD4 %xmm4, %xmm0 + ADDSD3 %xmm5, %xmm1 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BO), %xmm8 + movaps %xmm0, %xmm5 + movsd 1 * SIZE(BO), %xmm9 + movaps %xmm1, %xmm4 + + mulsd %xmm8, %xmm0 + mulsd %xmm8, %xmm1 + mulsd %xmm9, %xmm5 + mulsd %xmm9, %xmm4 + + ADDSD4 %xmm4, %xmm0 + ADDSD2 %xmm5, %xmm1 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm1, 1 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm1, 1 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L20: + movq M, I + sarq $1, I + jle .L99 + ALIGN_4 + +.L10: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + prefetcht0 0 * SIZE(BB) + subq $-8 * SIZE, BB + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + prefetcht0 3 * SIZE(CO1) + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + ADDSD2 %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm15 + PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO) + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD3 %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD3 %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 12 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD3 %xmm4, %xmm10 + movsd 13 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 6 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 14 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 15 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + subq $-16 * SIZE, AO + + ADDSD4 %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + addq $ 8 * SIZE, BO + + ADDSD3 %xmm4, %xmm10 + movsd 1 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + decq %rax + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 0 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 2 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 1 * SIZE(BO), %xmm3 + + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + BRANCH + je .L18 + ALIGN_4 + +.L16: + ADDSD2 %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD3 %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L16 + ALIGN_4 + +.L18: + ADDSD2 %xmm2, %xmm13 + ADDSD3 %xmm7, %xmm14 + ADDSD4 %xmm6, %xmm15 + + addsd %xmm11, %xmm8 + addsd %xmm9, %xmm10 + addsd %xmm15, %xmm12 + addsd %xmm13, %xmm14 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm1 + movsd 2 * SIZE(BO), %xmm2 + movsd 3 * SIZE(BO), %xmm3 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm1 + movsd 2 * SIZE(AO), %xmm2 + movsd 3 * SIZE(AO), %xmm3 +#endif + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm1 + subsd %xmm12, %xmm2 + subsd %xmm14, %xmm3 + +#ifdef LN + movsd 6 * SIZE(AO), %xmm6 + movsd 7 * SIZE(AO), %xmm7 + + movaps %xmm2, %xmm5 + movaps %xmm3, %xmm4 + + mulsd %xmm6, %xmm2 + mulsd %xmm6, %xmm3 + movsd 4 * SIZE(AO), %xmm6 + mulsd %xmm7, %xmm5 + mulsd %xmm7, %xmm4 + movsd 5 * SIZE(AO), %xmm7 + + ADDSD4 %xmm4, %xmm2 + ADDSD3 %xmm5, %xmm3 + + movaps %xmm2, %xmm4 + movaps %xmm3, %xmm5 + + mulsd %xmm6, %xmm4 + mulsd %xmm7, %xmm5 + mulsd %xmm3, %xmm6 + mulsd %xmm2, %xmm7 + + subsd %xmm4, %xmm0 + subsd %xmm6, %xmm1 + movsd 0 * SIZE(AO), %xmm6 + + ADDSD3 %xmm5, %xmm0 + ADDSD4 %xmm7, %xmm1 + movsd 1 * SIZE(AO), %xmm7 + + movaps %xmm0, %xmm5 + movaps %xmm1, %xmm4 + + mulsd %xmm6, %xmm0 + mulsd %xmm6, %xmm1 + mulsd %xmm7, %xmm5 + mulsd %xmm7, %xmm4 + + ADDSD4 %xmm4, %xmm0 + ADDSD3 %xmm5, %xmm1 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm6 + movsd 1 * SIZE(AO), %xmm7 + + movaps %xmm0, %xmm5 + movaps %xmm1, %xmm4 + + mulsd %xmm6, %xmm0 + mulsd %xmm6, %xmm1 + movsd 2 * SIZE(AO), %xmm6 + mulsd %xmm7, %xmm5 + mulsd %xmm7, %xmm4 + movsd 3 * SIZE(AO), %xmm7 + + ADDSD4 %xmm4, %xmm0 + ADDSD3 %xmm5, %xmm1 + + movaps %xmm0, %xmm4 + movaps %xmm1, %xmm5 + + mulsd %xmm6, %xmm4 + mulsd %xmm7, %xmm5 + mulsd %xmm1, %xmm6 + mulsd %xmm0, %xmm7 + + subsd %xmm4, %xmm2 + subsd %xmm6, %xmm3 + movsd 6 * SIZE(AO), %xmm6 + + ADDSD3 %xmm5, %xmm2 + ADDSD4 %xmm7, %xmm3 + movsd 7 * SIZE(AO), %xmm7 + + movaps %xmm2, %xmm5 + movaps %xmm3, %xmm4 + + mulsd %xmm6, %xmm2 + mulsd %xmm6, %xmm3 + mulsd %xmm7, %xmm5 + mulsd %xmm7, %xmm4 + + ADDSD4 %xmm4, %xmm2 + ADDSD3 %xmm5, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BO), %xmm8 + movaps %xmm0, %xmm5 + movsd 1 * SIZE(BO), %xmm9 + movaps %xmm1, %xmm4 + movaps %xmm2, %xmm7 + movaps %xmm3, %xmm6 + + mulsd %xmm8, %xmm0 + mulsd %xmm8, %xmm1 + mulsd %xmm9, %xmm5 + mulsd %xmm9, %xmm4 + + ADDSD4 %xmm4, %xmm0 + mulsd %xmm8, %xmm2 + ADDSD2 %xmm5, %xmm1 + mulsd %xmm8, %xmm3 + mulsd %xmm9, %xmm7 + mulsd %xmm9, %xmm6 + + ADDSD4 %xmm6, %xmm2 + ADDSD2 %xmm7, %xmm3 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) + movsd %xmm2, 2 * SIZE(CO1) + movsd %xmm3, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm1, 1 * SIZE(BO) + movsd %xmm2, 2 * SIZE(BO) + movsd %xmm3, 3 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm1, 1 * SIZE(AO) + movsd %xmm2, 2 * SIZE(AO) + movsd %xmm3, 3 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + decq I # i -- + jg .L10 + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + decq J # j -- + jg .L01 + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x2_core2.S b/kernel/x86_64/ztrsm_kernel_LN_2x2_core2.S new file mode 100644 index 0000000..065abe0 --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LN_2x2_core2.S @@ -0,0 +1,2162 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define POSINV 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define AORIG 48(%rsp) +#define BORIG 56(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCH_W (PREFETCH_R) + +#define PREFETCHSIZE (8 * 17 + 2) +#define PREFETCH prefetcht0 + +#ifndef CONJ +#define NN +#else +#if defined(LN) || defined(LT) +#define CN +#else +#define NC +#endif +#endif + +#define ADD1 addpd +#define ADD2 addpd + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, %rax + + movq %rsp, %r15 # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq %rax, KK + movq %rax, OFFSET + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + pcmpeqb %xmm15, %xmm15 + psllq $63, %xmm15 # Generate mask + pxor %xmm2, %xmm2 + + movlpd %xmm2, 0 + POSINV + movlpd %xmm15, 8 + POSINV + + salq $ZBASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $1, J # j = (n >> 2) + jle .L100 + ALIGN_4 + +.L01: +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq 16 * SIZE + BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + + addq %rax, %rax + ALIGN_4 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + movddup -14 * SIZE(B), %xmm10 + movddup -13 * SIZE(B), %xmm11 + movddup -12 * SIZE(B), %xmm12 + movddup -11 * SIZE(B), %xmm13 + movddup -10 * SIZE(B), %xmm14 + movddup -9 * SIZE(B), %xmm15 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 8) * SIZE(BO) + + movapd %xmm12, -8 * SIZE(BO) + movapd %xmm13, -6 * SIZE(BO) + movapd %xmm14, -4 * SIZE(BO) + movapd %xmm15, -2 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-16 * SIZE, BO + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + movddup -14 * SIZE(B), %xmm10 + movddup -13 * SIZE(B), %xmm11 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $ 8 * SIZE, BO + + decq %rax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + +#ifndef RT + leaq (C, LDC, 2), C +#endif + + testq $1, M + jle .L30 + +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + addq %rax, AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L42 + +.L41: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -14 * SIZE(AO), %xmm0 + movapd -8 * SIZE(BO), %xmm2 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm4 + movapd -2 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -12 * SIZE(AO), %xmm0 + movapd 0 * SIZE(BO), %xmm2 + movapd 2 * SIZE(BO), %xmm3 + movapd 4 * SIZE(BO), %xmm4 + movapd 6 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -10 * SIZE(AO), %xmm0 + movapd 8 * SIZE(BO), %xmm2 + movapd 10 * SIZE(BO), %xmm3 + movapd 12 * SIZE(BO), %xmm4 + movapd 14 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jne .L41 + +.L42: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm7 + + andq $3, %rax # if (k & 1) + BRANCH + jle .L44 + +.L43: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L43 + ALIGN_4 + +.L44: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm9 + xorpd %xmm7, %xmm11 +#else + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 + subpd %xmm11, %xmm10 +#else + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -10 * SIZE(B), %xmm4 + movddup -9 * SIZE(B), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm11 + subpd %xmm12, %xmm11 + + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm10 + + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + + addpd %xmm10, %xmm11 +#endif + +#ifdef RT + movddup -10 * SIZE(B), %xmm0 + movddup -9 * SIZE(B), %xmm1 + movddup -12 * SIZE(B), %xmm2 + movddup -11 * SIZE(B), %xmm3 + movddup -16 * SIZE(B), %xmm4 + movddup -15 * SIZE(B), %xmm5 + + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm10, %xmm11 + + movapd %xmm11, %xmm8 + pshufd $0x4e, %xmm11, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm9 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + + movsd %xmm11, 0 * SIZE(CO2) + movhpd %xmm11, 1 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + + movddup %xmm9, %xmm8 + unpckhpd %xmm9, %xmm9 + movddup %xmm11, %xmm10 + unpckhpd %xmm11, %xmm11 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) + +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + movq M, I + sarq $1, I # i = (m >> 2) + jle .L99 + ALIGN_4 + +.L10: + leaq (PREFETCH_R + 0) * SIZE(B), BB + +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + prefetcht2 0 * SIZE(BB) + +#ifdef LN + pxor %xmm8, %xmm8 + prefetcht1 -3 * SIZE(CO1) + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + prefetcht1 -3 * SIZE(CO2) + pxor %xmm11, %xmm11 +#else + pxor %xmm8, %xmm8 + prefetcht1 3 * SIZE(CO1) + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + prefetcht1 3 * SIZE(CO2) + pxor %xmm11, %xmm11 +#endif + + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + pxor %xmm14, %xmm14 + pxor %xmm15, %xmm15 + + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + + subq $-8 * SIZE, BB + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + ADD1 %xmm2, %xmm10 + movapd -16 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm14 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm11 + movapd -14 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm15 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + movapd -12 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm9 + movapd -10 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + movapd -12 * SIZE(AO), %xmm0 + ADD1 %xmm2, %xmm10 + movapd -8 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm14 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + movapd -6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + movapd -4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + movapd -2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + movapd -8 * SIZE(AO), %xmm0 + ADD1 %xmm2, %xmm10 + movapd 0 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm14 + movapd %xmm2, %xmm3 + movapd -6 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm11 + movapd 2 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm15 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + movapd 4 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm9 + movapd 6 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + movapd -4 * SIZE(AO), %xmm0 + ADD1 %xmm2, %xmm10 + ADD1 %xmm3, %xmm14 + movapd 8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + movapd -2 * SIZE(AO), %xmm1 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm11 + movapd 10 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm15 + subq $-32 * SIZE, BO + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + movapd -20 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + subq $-16 * SIZE, AO + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + movapd -18 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + subq $1, %rax + BRANCH + BRANCH + jg .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm7 + + andq $3, %rax + BRANCH + BRANCH + je .L19 + ALIGN_4 + +.L16: + ADD1 %xmm2, %xmm10 + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + movapd -16 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L19: + ADD1 %xmm2, %xmm10 + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm11, %xmm11 + SHUFPD_1 %xmm13, %xmm13 + SHUFPD_1 %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm9 + xorpd %xmm7, %xmm11 + xorpd %xmm7, %xmm13 + xorpd %xmm7, %xmm15 +#else + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 + subpd %xmm11, %xmm10 + subpd %xmm13, %xmm12 + subpd %xmm15, %xmm14 +#else + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm13, %xmm12 + addpd %xmm15, %xmm14 +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm11 + movapd -12 * SIZE(B), %xmm13 + movapd -10 * SIZE(B), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm13 + movapd -12 * SIZE(AO), %xmm11 + movapd -10 * SIZE(AO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm7, %xmm7 +#endif + +#ifdef LN + movddup -10 * SIZE(AO), %xmm0 + movddup -9 * SIZE(AO), %xmm1 + movddup -12 * SIZE(AO), %xmm2 + movddup -11 * SIZE(AO), %xmm3 + movddup -16 * SIZE(AO), %xmm4 + movddup -15 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + + addpd %xmm12, %xmm13 + addpd %xmm14, %xmm15 + + movapd %xmm13, %xmm8 + movapd %xmm15, %xmm10 + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm9 + subpd %xmm14, %xmm11 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + movddup -14 * SIZE(AO), %xmm2 + movddup -13 * SIZE(AO), %xmm3 + movddup -10 * SIZE(AO), %xmm4 + movddup -9 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 + + movapd %xmm9, %xmm8 + movapd %xmm11, %xmm10 + pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm11, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm13 + subpd %xmm10, %xmm15 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 + + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + mulpd %xmm4, %xmm15 + mulpd %xmm5, %xmm14 + + addpd %xmm12, %xmm13 + addpd %xmm14, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -10 * SIZE(B), %xmm4 + movddup -9 * SIZE(B), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 + + movapd %xmm9, %xmm8 + movapd %xmm13, %xmm10 + pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm13, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm11 + subpd %xmm10, %xmm15 + subpd %xmm12, %xmm11 + subpd %xmm14, %xmm15 + + pshufd $0x4e, %xmm11, %xmm10 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm14 + + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + mulpd %xmm4, %xmm15 + mulpd %xmm5, %xmm14 + + addpd %xmm10, %xmm11 + addpd %xmm14, %xmm15 +#endif + +#ifdef RT + movddup -10 * SIZE(B), %xmm0 + movddup -9 * SIZE(B), %xmm1 + movddup -12 * SIZE(B), %xmm2 + movddup -11 * SIZE(B), %xmm3 + movddup -16 * SIZE(B), %xmm4 + movddup -15 * SIZE(B), %xmm5 + + pshufd $0x4e, %xmm11, %xmm10 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm14 + + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + + addpd %xmm10, %xmm11 + addpd %xmm14, %xmm15 + + movapd %xmm11, %xmm8 + movapd %xmm15, %xmm10 + pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm13 + subpd %xmm12, %xmm9 + subpd %xmm14, %xmm13 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm13, 2 * SIZE(CO1) + movhpd %xmm13, 3 * SIZE(CO1) + + movsd %xmm11, 0 * SIZE(CO2) + movhpd %xmm11, 1 * SIZE(CO2) + movsd %xmm15, 2 * SIZE(CO2) + movhpd %xmm15, 3 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + movapd %xmm13, -12 * SIZE(B) + movapd %xmm15, -10 * SIZE(B) + + movddup %xmm9, %xmm8 + unpckhpd %xmm9, %xmm9 + movddup %xmm11, %xmm10 + unpckhpd %xmm11, %xmm11 + movddup %xmm13, %xmm12 + unpckhpd %xmm13, %xmm13 + movddup %xmm15, %xmm14 + unpckhpd %xmm15, %xmm15 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + movapd %xmm12, -8 * SIZE(BO) + movapd %xmm13, -6 * SIZE(BO) + movapd %xmm14, -4 * SIZE(BO) + movapd %xmm15, -2 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm13, -14 * SIZE(AO) + movapd %xmm11, -12 * SIZE(AO) + movapd %xmm15, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L10 + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2 * COMPSIZE), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J # j -- + jg .L01 + +.L100: + testq $1, N + jle .L999 + +.L101: +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L103 + ALIGN_4 + +.L102: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + movddup -14 * SIZE(B), %xmm10 + movddup -13 * SIZE(B), %xmm11 + movddup -12 * SIZE(B), %xmm12 + movddup -11 * SIZE(B), %xmm13 + movddup -10 * SIZE(B), %xmm14 + movddup -9 * SIZE(B), %xmm15 + + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + movapd %xmm10, 4 * SIZE(BO) + movapd %xmm11, 6 * SIZE(BO) + movapd %xmm12, 8 * SIZE(BO) + movapd %xmm13, 10 * SIZE(BO) + movapd %xmm14, 12 * SIZE(BO) + movapd %xmm15, 14 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-16 * SIZE, BO + decq %rax + jne .L102 + ALIGN_4 + +.L103: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + + addq $4 * SIZE, BO + addq $2 * SIZE, B + decq %rax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + + testq $1, M + jle .L130 + ALIGN_4 + +.L140: +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L142 + +.L141: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + movapd -8 * SIZE(BO), %xmm2 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm4 + movapd -2 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L141 + +.L142: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm7 + + andq $3, %rax # if (k & 1) + BRANCH + jle .L144 + +.L143: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L143 + ALIGN_4 + +.L144: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + + SHUFPD_1 %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm9 +#else + xorpd %xmm7, %xmm8 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 +#else + addpd %xmm9, %xmm8 +#endif + + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm9 + + subpd %xmm8, %xmm9 +#else + movapd -16 * SIZE(AO), %xmm9 + + subpd %xmm8, %xmm9 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm7, %xmm7 +#endif + +#ifdef LN + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + + movddup %xmm9, %xmm8 + unpckhpd %xmm9, %xmm9 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L130: + movq M, I + sarq $1, I # i = (m >> 2) + jle .L199 + ALIGN_4 + +.L110: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + prefetcht0 -3 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L112 + +.L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -8 * SIZE(AO), %xmm0 + movapd -6 * SIZE(AO), %xmm1 + + movapd -8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -4 * SIZE(AO), %xmm0 + movapd -2 * SIZE(AO), %xmm1 + + movapd -4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm7 + andq $3, %rax # if (k & 1) + BRANCH + jle .L114 + +.L113: + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L113 + ALIGN_4 + +.L114: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm9 + xorpd %xmm7, %xmm13 +#else + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 + subpd %xmm13, %xmm12 +#else + addpd %xmm9, %xmm8 + addpd %xmm13, %xmm12 +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm13 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm7, %xmm7 +#endif + +#ifdef LN + movddup -10 * SIZE(AO), %xmm0 + movddup -9 * SIZE(AO), %xmm1 + movddup -12 * SIZE(AO), %xmm2 + movddup -11 * SIZE(AO), %xmm3 + movddup -16 * SIZE(AO), %xmm4 + movddup -15 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm12, %xmm13 + + movapd %xmm13, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm9 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + movddup -14 * SIZE(AO), %xmm2 + movddup -13 * SIZE(AO), %xmm3 + movddup -10 * SIZE(AO), %xmm4 + movddup -9 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm13 + subpd %xmm12, %xmm13 + + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + + addpd %xmm12, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm13, 2 * SIZE(CO1) + movhpd %xmm13, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm13, -14 * SIZE(B) + + movddup %xmm9, %xmm8 + unpckhpd %xmm9, %xmm9 + movddup %xmm13, %xmm12 + unpckhpd %xmm13, %xmm13 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm12, -12 * SIZE(BO) + movapd %xmm13, -10 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm13, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L110 + ALIGN_4 + +.L199: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1 * COMPSIZE), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + + +.L999: + movq %r15, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86_64/ztrsm_kernel_LN_2x2_penryn.S new file mode 100644 index 0000000..093a580 --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LN_2x2_penryn.S @@ -0,0 +1,2016 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCHSIZE (8 * 21 + 6) +#define PREFETCH prefetcht0 + +#define ADD1 addpd +#define ADD2 addpd + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + salq $ZBASE_SHIFT, LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, KK + subq OFFSET, KK +#endif + + movq N, J + sarq $1, J + NOBRANCH + jle .L40 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + + movq K, %rax + salq $ZBASE_SHIFT + 1, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, KK +#endif + + testq $1, M + BRANCH + jle .L20 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -16 * SIZE(BO), %xmm2 + movaps -14 * SIZE(BO), %xmm3 + + pxor %xmm3, %xmm3 + pxor %xmm5, %xmm5 + + movapd %xmm3, %xmm8 + movapd %xmm3, %xmm9 + movapd %xmm3, %xmm12 + movapd %xmm3, %xmm13 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_4 + +.L22: + ADD1 %xmm3, %xmm12 + movaps -14 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm3, %xmm12 + movaps -10 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + + ADD1 %xmm3, %xmm12 + movaps -6 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -4 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -10 * SIZE(AO), %xmm0 + + ADD1 %xmm3, %xmm12 + movaps -2 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + subq $ -8 * SIZE, AO + + ADD1 %xmm2, %xmm8 + movaps 0 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + ADD1 %xmm3, %xmm12 + movaps -14 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + ADD1 %xmm3, %xmm12 + pcmpeqb %xmm7, %xmm7 + ADD2 %xmm5, %xmm13 + psllq $63, %xmm7 + +#ifndef CONJ + pshufd $0x40, %xmm7, %xmm0 + shufps $0x04, %xmm7, %xmm7 + + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm12 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm7, %xmm0 +#else + pshufd $0x04, %xmm7, %xmm0 +#endif + shufps $0x40, %xmm7, %xmm7 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm13 +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm13, %xmm12 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm11 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + movddup -14 * SIZE(BO), %xmm2 + movddup -13 * SIZE(BO), %xmm3 + movddup -10 * SIZE(BO), %xmm4 + movddup -9 * SIZE(BO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm11 + subpd %xmm12, %xmm11 + + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm10 + + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + + addpd %xmm10, %xmm11 +#endif + +#ifdef RT + movddup -10 * SIZE(BO), %xmm0 + movddup -9 * SIZE(BO), %xmm1 + movddup -12 * SIZE(BO), %xmm2 + movddup -11 * SIZE(BO), %xmm3 + movddup -16 * SIZE(BO), %xmm4 + movddup -15 * SIZE(BO), %xmm5 + + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm10, %xmm11 + + movapd %xmm11, %xmm8 + pshufd $0x4e, %xmm11, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm9 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + + movsd %xmm11, 0 * SIZE(CO2) + movhpd %xmm11, 1 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L20: + movq M, I + sarq $1, I + NOBRANCH + jle .L39 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + prefetcht2 -16 * SIZE(BB) + subq $-8 * SIZE, BB + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm3, %xmm3 + movaps -14 * SIZE(AO), %xmm1 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BO), %xmm2 + + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + +#ifdef LN + prefetcht0 -4 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + prefetcht0 -4 * SIZE(CO2) +#else + prefetcht0 3 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + prefetcht0 3 * SIZE(CO2) +#endif + movapd %xmm4, %xmm10 + movapd %xmm4, %xmm11 + + movapd %xmm4, %xmm12 + movapd %xmm4, %xmm13 + movapd %xmm4, %xmm14 + movapd %xmm4, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + ADD1 %xmm3, %xmm12 + movaps -14 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps -10 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps -6 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -4 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps -2 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 0 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps 2 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 4 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 6 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps 6 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 8 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 10 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps 10 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 12 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 14 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps 14 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 16 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + subq $-32 * SIZE, AO + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -14 * SIZE(AO), %xmm1 + + subq $-32 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + ADD1 %xmm3, %xmm12 + movaps -14 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $2, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + ADD1 %xmm3, %xmm12 + pcmpeqb %xmm7, %xmm7 + ADD1 %xmm4, %xmm14 + psllq $63, %xmm7 + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + +#ifndef CONJ + pshufd $0x40, %xmm7, %xmm0 + shufps $0x04, %xmm7, %xmm7 + + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm10 + pxor %xmm0, %xmm12 + pxor %xmm0, %xmm14 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm7, %xmm0 +#else + pshufd $0x04, %xmm7, %xmm0 +#endif + shufps $0x40, %xmm7, %xmm7 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 + pxor %xmm0, %xmm13 + pxor %xmm0, %xmm15 +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm11, %xmm10 + haddpd %xmm13, %xmm12 + haddpd %xmm15, %xmm14 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm11 + subpd %xmm10, %xmm13 + subpd %xmm14, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm13 + movapd -12 * SIZE(AO), %xmm11 + movapd -10 * SIZE(AO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm11 + subpd %xmm10, %xmm13 + subpd %xmm14, %xmm15 +#endif + +#ifdef LN + movddup -10 * SIZE(AO), %xmm0 + movddup -9 * SIZE(AO), %xmm1 + movddup -12 * SIZE(AO), %xmm2 + movddup -11 * SIZE(AO), %xmm3 + movddup -16 * SIZE(AO), %xmm4 + movddup -15 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + + addpd %xmm12, %xmm13 + addpd %xmm14, %xmm15 + + movapd %xmm13, %xmm8 + movapd %xmm15, %xmm10 + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm9 + subpd %xmm14, %xmm11 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + movddup -14 * SIZE(AO), %xmm2 + movddup -13 * SIZE(AO), %xmm3 + movddup -10 * SIZE(AO), %xmm4 + movddup -9 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 + + movapd %xmm9, %xmm8 + movapd %xmm11, %xmm10 + pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm11, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm13 + subpd %xmm10, %xmm15 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 + + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + mulpd %xmm4, %xmm15 + mulpd %xmm5, %xmm14 + + addpd %xmm12, %xmm13 + addpd %xmm14, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + movddup -14 * SIZE(BO), %xmm2 + movddup -13 * SIZE(BO), %xmm3 + movddup -10 * SIZE(BO), %xmm4 + movddup -9 * SIZE(BO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 + + movapd %xmm9, %xmm8 + movapd %xmm13, %xmm10 + pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm13, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm11 + subpd %xmm10, %xmm15 + subpd %xmm12, %xmm11 + subpd %xmm14, %xmm15 + + pshufd $0x4e, %xmm11, %xmm10 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm14 + + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + mulpd %xmm4, %xmm15 + mulpd %xmm5, %xmm14 + + addpd %xmm10, %xmm11 + addpd %xmm14, %xmm15 +#endif + +#ifdef RT + movddup -10 * SIZE(BO), %xmm0 + movddup -9 * SIZE(BO), %xmm1 + movddup -12 * SIZE(BO), %xmm2 + movddup -11 * SIZE(BO), %xmm3 + movddup -16 * SIZE(BO), %xmm4 + movddup -15 * SIZE(BO), %xmm5 + + pshufd $0x4e, %xmm11, %xmm10 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm14 + + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + + addpd %xmm10, %xmm11 + addpd %xmm14, %xmm15 + + movapd %xmm11, %xmm8 + movapd %xmm15, %xmm10 + pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm13 + subpd %xmm12, %xmm9 + subpd %xmm14, %xmm13 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm13, 2 * SIZE(CO1) + movhpd %xmm13, 3 * SIZE(CO1) + + movsd %xmm11, 0 * SIZE(CO2) + movhpd %xmm11, 1 * SIZE(CO2) + movsd %xmm15, 2 * SIZE(CO2) + movhpd %xmm15, 3 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) + movapd %xmm13, -12 * SIZE(BO) + movapd %xmm15, -10 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm13, -14 * SIZE(AO) + movapd %xmm11, -12 * SIZE(AO) + movapd %xmm15, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L40: + testq $1, N + BRANCH + jle .L999 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, KK +#endif + + testq $1, M + BRANCH + jle .L60 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + movaps -16 * SIZE(BO), %xmm2 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm7, %xmm9 + movaps -14 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm10 + ADD2 %xmm7, %xmm11 + movaps -12 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -10 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm7, %xmm9 + movaps -10 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm10 + ADD2 %xmm7, %xmm11 + movaps -8 * SIZE(BO), %xmm2 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm7, %xmm9 + movaps -14 * SIZE(BO), %xmm2 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + pshufd $0x40, %xmm7, %xmm0 + shufps $0x04, %xmm7, %xmm7 + + pxor %xmm0, %xmm8 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm7, %xmm0 +#else + pshufd $0x04, %xmm7, %xmm0 +#endif + shufps $0x40, %xmm7, %xmm7 + + pxor %xmm0, %xmm9 +#endif + + haddpd %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + + subpd %xmm8, %xmm9 +#else + movapd -16 * SIZE(AO), %xmm9 + + subpd %xmm8, %xmm9 +#endif + +#ifdef LN + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) +#endif + + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + movq M, I + sarq $1, I # i = (m >> 2) + NOBRANCH + jle .L79 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + prefetcht2 -16 * SIZE(BB) + subq $-4 * SIZE, BB + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -16 * SIZE(BO), %xmm2 + + prefetcht0 3 * SIZE(CO1) + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_4 + +.L52: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -10 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + pshufd $0x40, %xmm7, %xmm0 + shufps $0x04, %xmm7, %xmm7 + + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm12 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm7, %xmm0 +#else + pshufd $0x04, %xmm7, %xmm0 +#endif + shufps $0x40, %xmm7, %xmm7 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm13 +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm13, %xmm12 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm13 +#endif + +#ifdef LN + movddup -10 * SIZE(AO), %xmm0 + movddup -9 * SIZE(AO), %xmm1 + movddup -12 * SIZE(AO), %xmm2 + movddup -11 * SIZE(AO), %xmm3 + movddup -16 * SIZE(AO), %xmm4 + movddup -15 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm12, %xmm13 + + movapd %xmm13, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm9 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + movddup -14 * SIZE(AO), %xmm2 + movddup -13 * SIZE(AO), %xmm3 + movddup -10 * SIZE(AO), %xmm4 + movddup -9 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm13 + subpd %xmm12, %xmm13 + + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + + addpd %xmm12, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm13, 2 * SIZE(CO1) + movhpd %xmm13, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm13, -14 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm13, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S new file mode 100644 index 0000000..fb428cb --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S @@ -0,0 +1,2278 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define POSINV 0(%rsp) +#define ALPHA_R 16(%rsp) +#define ALPHA_I 32(%rsp) +#define OFFSET 40(%rsp) +#define KK 48(%rsp) +#define KKK 56(%rsp) +#define AORIG 64(%rsp) +#define BORIG 72(%rsp) +#define BUFFER 128(%rsp) + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#define KERNEL1(xx) \ + mulpd %xmm8, %xmm9 ;\ + addpd %xmm9, %xmm0 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm11, %xmm1 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm8, %xmm13 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm8, %xmm3 ;\ + movapd 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL2(xx) \ + mulpd %xmm10, %xmm9 ;\ + addpd %xmm9, %xmm4 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm10, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm10, %xmm13 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm10, %xmm7 ;\ + movapd 10 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL3(xx) \ + mulpd %xmm12, %xmm15 ;\ + addpd %xmm15, %xmm0 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm12, %xmm11 ;\ + addpd %xmm11, %xmm1 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm12, %xmm13 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm12, %xmm3 ;\ + movapd 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL4(xx) \ + mulpd %xmm14, %xmm15 ;\ + addpd %xmm15, %xmm4 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm14, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm14, %xmm13 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm14, %xmm7 ;\ + movapd 14 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + +#define KERNEL5(xx) \ + mulpd %xmm8, %xmm9 ;\ + addpd %xmm9, %xmm0 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm11, %xmm1 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm8, %xmm13 ;\ + mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm8, %xmm3 ;\ + movapd 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL6(xx) \ + mulpd %xmm10, %xmm9 ;\ + addpd %xmm9, %xmm4 ;\ + movapd 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm10, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm10, %xmm13 ;\ + mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm10, %xmm7 ;\ + movapd 18 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL7(xx) \ + mulpd %xmm12, %xmm15 ;\ + addpd %xmm15, %xmm0 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm12, %xmm11 ;\ + addpd %xmm11, %xmm1 ;\ + movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm12, %xmm13 ;\ + mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm12, %xmm3 ;\ + movapd 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL8(xx) \ + mulpd %xmm14, %xmm15 ;\ + addpd %xmm15, %xmm4 ;\ + movapd 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm14, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 34 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm14, %xmm13 ;\ + mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm14, %xmm7 ;\ + movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + +#ifndef CONJ +#define NN +#else +#if defined(LN) || defined(LT) +#define CN +#else +#define NC +#endif +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + pcmpeqb %xmm15, %xmm15 + psllq $63, %xmm15 # Generate mask + pxor %xmm2, %xmm2 + + movlpd %xmm2, 0 + POSINV + movlpd %xmm15, 8 + POSINV + + movlpd %xmm4, OFFSET + movlpd %xmm4, KK + + salq $ZBASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $1, J # j = (n >> 2) + jle .L100 + ALIGN_4 + +.L01: +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + + addq %rax, %rax + ALIGN_4 + +.L02: + PREFETCHNTA 56 * SIZE(B) + + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + movlpd 4 * SIZE(B), %xmm4 + movlpd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm6 + movlpd 7 * SIZE(B), %xmm7 + + movlpd %xmm0, 0 * SIZE(BO) + movlpd %xmm0, 1 * SIZE(BO) + movlpd %xmm1, 2 * SIZE(BO) + movlpd %xmm1, 3 * SIZE(BO) + movlpd %xmm2, 4 * SIZE(BO) + movlpd %xmm2, 5 * SIZE(BO) + movlpd %xmm3, 6 * SIZE(BO) + movlpd %xmm3, 7 * SIZE(BO) + movlpd %xmm4, 8 * SIZE(BO) + movlpd %xmm4, 9 * SIZE(BO) + movlpd %xmm5, 10 * SIZE(BO) + movlpd %xmm5, 11 * SIZE(BO) + movlpd %xmm6, 12 * SIZE(BO) + movlpd %xmm6, 13 * SIZE(BO) + movlpd %xmm7, 14 * SIZE(BO) + movlpd %xmm7, 15 * SIZE(BO) + + subq $-16 * SIZE, BO + addq $ 8 * SIZE, B + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + + movlpd %xmm0, 0 * SIZE(BO) + movlpd %xmm0, 1 * SIZE(BO) + movlpd %xmm1, 2 * SIZE(BO) + movlpd %xmm1, 3 * SIZE(BO) + movlpd %xmm2, 4 * SIZE(BO) + movlpd %xmm2, 5 * SIZE(BO) + movlpd %xmm3, 6 * SIZE(BO) + movlpd %xmm3, 7 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $ 8 * SIZE, BO + + decq %rax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + +#ifndef RT + leaq (C, LDC, 2), C +#endif + + testq $1, M + jle .L30 + +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + addq %rax, AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L42 + +.L41: + movapd 0 * SIZE(AO), %xmm8 + + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + movapd 2 * SIZE(AO), %xmm8 + + movapd 8 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 10 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 12 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 14 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + movapd 4 * SIZE(AO), %xmm8 + + movapd 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 18 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 20 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 22 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + movapd 6 * SIZE(AO), %xmm8 + + movapd 24 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 26 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 28 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 30 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L41 + +.L42: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm15 + andq $3, %rax # if (k & 1) + BRANCH + jle .L44 + +.L43: + movapd 0 * SIZE(AO), %xmm8 + + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + + decq %rax + jg .L43 + ALIGN_4 + +.L44: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm15, %xmm1 + xorpd %xmm15, %xmm3 +#else + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 +#else + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm3 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 +#else + movapd 0 * SIZE(AO), %xmm1 + movapd 2 * SIZE(AO), %xmm3 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm15, %xmm15 +#endif + +#if defined(LN) || defined(LT) + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm3 + mulpd %xmm9, %xmm2 + + addpd %xmm0, %xmm1 + addpd %xmm2, %xmm3 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + movlpd 2 * SIZE(B), %xmm10 + movhpd 2 * SIZE(B), %xmm10 + movlpd 3 * SIZE(B), %xmm11 + movhpd 3 * SIZE(B), %xmm11 + movlpd 6 * SIZE(B), %xmm12 + movhpd 6 * SIZE(B), %xmm12 + movlpd 7 * SIZE(B), %xmm13 + movhpd 7 * SIZE(B), %xmm13 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 + + movapd %xmm1, %xmm0 + pshufd $0x4e, %xmm1, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm10, %xmm0 + mulpd %xmm11, %xmm4 + + subpd %xmm0, %xmm3 + subpd %xmm4, %xmm3 + + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm2 + + mulpd %xmm12, %xmm3 + mulpd %xmm13, %xmm2 + + addpd %xmm2, %xmm3 +#endif + +#ifdef RT + movlpd 6 * SIZE(B), %xmm8 + movhpd 6 * SIZE(B), %xmm8 + movlpd 7 * SIZE(B), %xmm9 + movhpd 7 * SIZE(B), %xmm9 + movlpd 4 * SIZE(B), %xmm10 + movhpd 4 * SIZE(B), %xmm10 + movlpd 5 * SIZE(B), %xmm11 + movhpd 5 * SIZE(B), %xmm11 + movlpd 0 * SIZE(B), %xmm12 + movhpd 0 * SIZE(B), %xmm12 + movlpd 1 * SIZE(B), %xmm13 + movhpd 1 * SIZE(B), %xmm13 + + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm2 + + mulpd %xmm8, %xmm3 + mulpd %xmm9, %xmm2 + + addpd %xmm2, %xmm3 + + movapd %xmm3, %xmm0 + pshufd $0x4e, %xmm3, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm10, %xmm0 + mulpd %xmm11, %xmm4 + + subpd %xmm0, %xmm1 + subpd %xmm4, %xmm1 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm12, %xmm1 + mulpd %xmm13, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm1, 0 * SIZE(CO1) + movhpd %xmm1, 1 * SIZE(CO1) + + movsd %xmm3, 0 * SIZE(CO2) + movhpd %xmm3, 1 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) +#else + movapd %xmm1, 0 * SIZE(AO) + movapd %xmm3, 2 * SIZE(AO) + +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + movq M, I + sarq $1, I # i = (m >> 2) + jle .L99 + ALIGN_4 + +.L10: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 2 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movapd 4 * SIZE(AO), %xmm12 + pxor %xmm2, %xmm2 + movapd 6 * SIZE(AO), %xmm14 + pxor %xmm3, %xmm3 + + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm4, %xmm4 + movapd 2 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + movapd 4 * SIZE(BO), %xmm13 + movapd 8 * SIZE(BO), %xmm15 + +#ifdef LN + PREFETCHW -4 * SIZE(CO1) + pxor %xmm6, %xmm6 + PREFETCHW -4 * SIZE(CO2) + pxor %xmm7, %xmm7 +#else + PREFETCHW 4 * SIZE(CO1) + pxor %xmm6, %xmm6 + PREFETCHW 4 * SIZE(CO2) + pxor %xmm7, %xmm7 +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-8, %rax + salq $4, %rax + je .L15 +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpq $64 * 2, %rax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpq $64 * 4, %rax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpq $64 * 6, %rax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) + + addq $16 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $64 * 8, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 4), BO # * 64 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L19 + ALIGN_4 + +.L16: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 0 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 4 * SIZE(AO), %xmm8 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm5 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + mulpd 6 * SIZE(BO), %xmm10 + addpd %xmm9, %xmm6 + movapd 8 * SIZE(BO), %xmm9 + addpd %xmm10, %xmm7 + movapd 6 * SIZE(AO), %xmm10 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L16 + ALIGN_4 + +.L19: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm15, %xmm1 + xorpd %xmm15, %xmm3 + xorpd %xmm15, %xmm5 + xorpd %xmm15, %xmm7 +#else + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 +#else + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm3 + movapd 4 * SIZE(B), %xmm5 + movapd 6 * SIZE(B), %xmm7 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd 0 * SIZE(AO), %xmm1 + movapd 2 * SIZE(AO), %xmm5 + movapd 4 * SIZE(AO), %xmm3 + movapd 6 * SIZE(AO), %xmm7 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm15, %xmm15 +#endif + +#ifdef LN + movlpd 6 * SIZE(AO), %xmm8 + movhpd 6 * SIZE(AO), %xmm8 + movlpd 7 * SIZE(AO), %xmm9 + movhpd 7 * SIZE(AO), %xmm9 + movlpd 4 * SIZE(AO), %xmm10 + movhpd 4 * SIZE(AO), %xmm10 + movlpd 5 * SIZE(AO), %xmm11 + movhpd 5 * SIZE(AO), %xmm11 + movlpd 0 * SIZE(AO), %xmm12 + movhpd 0 * SIZE(AO), %xmm12 + movlpd 1 * SIZE(AO), %xmm13 + movhpd 1 * SIZE(AO), %xmm13 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + mulpd %xmm8, %xmm7 + mulpd %xmm9, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 + + movapd %xmm5, %xmm0 + movapd %xmm7, %xmm2 + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm10, %xmm0 + mulpd %xmm10, %xmm2 + mulpd %xmm11, %xmm4 + mulpd %xmm11, %xmm6 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm4, %xmm1 + subpd %xmm6, %xmm3 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 + + mulpd %xmm12, %xmm1 + mulpd %xmm13, %xmm0 + mulpd %xmm12, %xmm3 + mulpd %xmm13, %xmm2 + + addpd %xmm0, %xmm1 + addpd %xmm2, %xmm3 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + movlpd 2 * SIZE(AO), %xmm10 + movhpd 2 * SIZE(AO), %xmm10 + movlpd 3 * SIZE(AO), %xmm11 + movhpd 3 * SIZE(AO), %xmm11 + movlpd 6 * SIZE(AO), %xmm12 + movhpd 6 * SIZE(AO), %xmm12 + movlpd 7 * SIZE(AO), %xmm13 + movhpd 7 * SIZE(AO), %xmm13 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm3 + mulpd %xmm9, %xmm2 + + addpd %xmm0, %xmm1 + addpd %xmm2, %xmm3 + + movapd %xmm1, %xmm0 + movapd %xmm3, %xmm2 + pshufd $0x4e, %xmm1, %xmm4 + pshufd $0x4e, %xmm3, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm10, %xmm0 + mulpd %xmm10, %xmm2 + mulpd %xmm11, %xmm4 + mulpd %xmm11, %xmm6 + + subpd %xmm0, %xmm5 + subpd %xmm2, %xmm7 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm12, %xmm5 + mulpd %xmm13, %xmm4 + mulpd %xmm12, %xmm7 + mulpd %xmm13, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + movlpd 2 * SIZE(B), %xmm10 + movhpd 2 * SIZE(B), %xmm10 + movlpd 3 * SIZE(B), %xmm11 + movhpd 3 * SIZE(B), %xmm11 + movlpd 6 * SIZE(B), %xmm12 + movhpd 6 * SIZE(B), %xmm12 + movlpd 7 * SIZE(B), %xmm13 + movhpd 7 * SIZE(B), %xmm13 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + + addpd %xmm0, %xmm1 + addpd %xmm4, %xmm5 + + movapd %xmm1, %xmm0 + movapd %xmm5, %xmm2 + pshufd $0x4e, %xmm1, %xmm4 + pshufd $0x4e, %xmm5, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm10, %xmm0 + mulpd %xmm10, %xmm2 + mulpd %xmm11, %xmm4 + mulpd %xmm11, %xmm6 + + subpd %xmm0, %xmm3 + subpd %xmm2, %xmm7 + subpd %xmm4, %xmm3 + subpd %xmm6, %xmm7 + + pshufd $0x4e, %xmm3, %xmm2 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm2 + xorpd %xmm15, %xmm6 + + mulpd %xmm12, %xmm3 + mulpd %xmm13, %xmm2 + mulpd %xmm12, %xmm7 + mulpd %xmm13, %xmm6 + + addpd %xmm2, %xmm3 + addpd %xmm6, %xmm7 +#endif + +#ifdef RT + movlpd 6 * SIZE(B), %xmm8 + movhpd 6 * SIZE(B), %xmm8 + movlpd 7 * SIZE(B), %xmm9 + movhpd 7 * SIZE(B), %xmm9 + movlpd 4 * SIZE(B), %xmm10 + movhpd 4 * SIZE(B), %xmm10 + movlpd 5 * SIZE(B), %xmm11 + movhpd 5 * SIZE(B), %xmm11 + movlpd 0 * SIZE(B), %xmm12 + movhpd 0 * SIZE(B), %xmm12 + movlpd 1 * SIZE(B), %xmm13 + movhpd 1 * SIZE(B), %xmm13 + + pshufd $0x4e, %xmm3, %xmm2 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm2 + xorpd %xmm15, %xmm6 + + mulpd %xmm8, %xmm3 + mulpd %xmm9, %xmm2 + mulpd %xmm8, %xmm7 + mulpd %xmm9, %xmm6 + + addpd %xmm2, %xmm3 + addpd %xmm6, %xmm7 + + movapd %xmm3, %xmm0 + movapd %xmm7, %xmm2 + pshufd $0x4e, %xmm3, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm10, %xmm0 + mulpd %xmm10, %xmm2 + mulpd %xmm11, %xmm4 + mulpd %xmm11, %xmm6 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm5 + subpd %xmm4, %xmm1 + subpd %xmm6, %xmm5 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 + + mulpd %xmm12, %xmm1 + mulpd %xmm13, %xmm0 + mulpd %xmm12, %xmm5 + mulpd %xmm13, %xmm4 + + addpd %xmm0, %xmm1 + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + + movsd %xmm1, 0 * SIZE(CO1) + movhpd %xmm1, 1 * SIZE(CO1) + movsd %xmm5, 2 * SIZE(CO1) + movhpd %xmm5, 3 * SIZE(CO1) + + movsd %xmm3, 0 * SIZE(CO2) + movhpd %xmm3, 1 * SIZE(CO2) + movsd %xmm7, 2 * SIZE(CO2) + movhpd %xmm7, 3 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + movapd %xmm5, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) + movlpd %xmm5, 8 * SIZE(BO) + movlpd %xmm5, 9 * SIZE(BO) + movhpd %xmm5, 10 * SIZE(BO) + movhpd %xmm5, 11 * SIZE(BO) + movlpd %xmm7, 12 * SIZE(BO) + movlpd %xmm7, 13 * SIZE(BO) + movhpd %xmm7, 14 * SIZE(BO) + movhpd %xmm7, 15 * SIZE(BO) +#else + movapd %xmm1, 0 * SIZE(AO) + movapd %xmm5, 2 * SIZE(AO) + movapd %xmm3, 4 * SIZE(AO) + movapd %xmm7, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L10 + ALIGN_4 + + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2 * COMPSIZE), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J # j -- + jg .L01 + +.L100: + testq $1, N + jle .L999 + +.L101: +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L103 + ALIGN_4 + +.L102: + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + movlpd 4 * SIZE(B), %xmm4 + movlpd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm6 + movlpd 7 * SIZE(B), %xmm7 + + movlpd %xmm0, 0 * SIZE(BO) + movlpd %xmm0, 1 * SIZE(BO) + movlpd %xmm1, 2 * SIZE(BO) + movlpd %xmm1, 3 * SIZE(BO) + movlpd %xmm2, 4 * SIZE(BO) + movlpd %xmm2, 5 * SIZE(BO) + movlpd %xmm3, 6 * SIZE(BO) + movlpd %xmm3, 7 * SIZE(BO) + movlpd %xmm4, 8 * SIZE(BO) + movlpd %xmm4, 9 * SIZE(BO) + movlpd %xmm5, 10 * SIZE(BO) + movlpd %xmm5, 11 * SIZE(BO) + movlpd %xmm6, 12 * SIZE(BO) + movlpd %xmm6, 13 * SIZE(BO) + movlpd %xmm7, 14 * SIZE(BO) + movlpd %xmm7, 15 * SIZE(BO) + + subq $-16 * SIZE, BO + addq $ 8 * SIZE, B + decq %rax + jne .L102 + ALIGN_4 + +.L103: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + + movlpd %xmm0, 0 * SIZE(BO) + movlpd %xmm0, 1 * SIZE(BO) + movlpd %xmm1, 2 * SIZE(BO) + movlpd %xmm1, 3 * SIZE(BO) + + addq $4 * SIZE, BO + addq $2 * SIZE, B + decq %rax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + + testq $1, M + jle .L130 + ALIGN_4 + +.L140: +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L142 + +.L141: + movapd 0 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 2 * SIZE(AO), %xmm8 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm3 + + movapd 4 * SIZE(AO), %xmm8 + movapd 8 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 10 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 6 * SIZE(AO), %xmm8 + movapd 12 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm3 + + addq $8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L141 + +.L142: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + movapd POSINV, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + jle .L144 + +.L143: + movapd 0 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L143 + ALIGN_4 + +.L144: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + + SHUFPD_1 %xmm1, %xmm1 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm15, %xmm1 +#else + xorpd %xmm15, %xmm0 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm1, %xmm0 +#else + addpd %xmm1, %xmm0 +#endif + + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm1 + + subpd %xmm0, %xmm1 +#else + movapd 0 * SIZE(AO), %xmm1 + + subpd %xmm0, %xmm1 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm15, %xmm15 +#endif + +#ifdef LN + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm1, 0 * SIZE(CO1) + movhpd %xmm1, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) +#else + movapd %xmm1, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L130: + movq M, I + sarq $1, I # i = (m >> 2) + jle .L199 + ALIGN_4 + +.L110: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 +#ifdef LN + PREFETCHW -4 * SIZE(CO1) +#else + PREFETCHW 4 * SIZE(CO1) +#endif + + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L112 + +.L111: + movapd 0 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 2 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + movapd 4 * SIZE(AO), %xmm8 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 6 * SIZE(AO), %xmm8 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + movapd 8 * SIZE(AO), %xmm8 + movapd 8 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 10 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 10 * SIZE(AO), %xmm8 + movapd 8 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 10 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + movapd 12 * SIZE(AO), %xmm8 + movapd 12 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 14 * SIZE(AO), %xmm8 + movapd 12 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm15 + andq $3, %rax # if (k & 1) + BRANCH + jle .L114 + +.L113: + movapd 0 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 2 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L113 + ALIGN_4 + +.L114: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm15, %xmm1 + xorpd %xmm15, %xmm5 +#else + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm1, %xmm0 + subpd %xmm5, %xmm4 +#else + addpd %xmm1, %xmm0 + addpd %xmm5, %xmm4 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm5 + + subpd %xmm0, %xmm1 + subpd %xmm4, %xmm5 +#else + movapd 0 * SIZE(AO), %xmm1 + movapd 2 * SIZE(AO), %xmm5 + + subpd %xmm0, %xmm1 + subpd %xmm4, %xmm5 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm15, %xmm15 +#endif + +#ifdef LN + movlpd 6 * SIZE(AO), %xmm8 + movhpd 6 * SIZE(AO), %xmm8 + movlpd 7 * SIZE(AO), %xmm9 + movhpd 7 * SIZE(AO), %xmm9 + movlpd 4 * SIZE(AO), %xmm10 + movhpd 4 * SIZE(AO), %xmm10 + movlpd 5 * SIZE(AO), %xmm11 + movhpd 5 * SIZE(AO), %xmm11 + movlpd 0 * SIZE(AO), %xmm12 + movhpd 0 * SIZE(AO), %xmm12 + movlpd 1 * SIZE(AO), %xmm13 + movhpd 1 * SIZE(AO), %xmm13 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + + addpd %xmm4, %xmm5 + + movapd %xmm5, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm10, %xmm0 + mulpd %xmm11, %xmm4 + + subpd %xmm0, %xmm1 + subpd %xmm4, %xmm1 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm12, %xmm1 + mulpd %xmm13, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + movlpd 2 * SIZE(AO), %xmm10 + movhpd 2 * SIZE(AO), %xmm10 + movlpd 3 * SIZE(AO), %xmm11 + movhpd 3 * SIZE(AO), %xmm11 + movlpd 6 * SIZE(AO), %xmm12 + movhpd 6 * SIZE(AO), %xmm12 + movlpd 7 * SIZE(AO), %xmm13 + movhpd 7 * SIZE(AO), %xmm13 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 + + movapd %xmm1, %xmm0 + pshufd $0x4e, %xmm1, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm10, %xmm0 + mulpd %xmm11, %xmm4 + + subpd %xmm0, %xmm5 + subpd %xmm4, %xmm5 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm12, %xmm5 + mulpd %xmm13, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + + addpd %xmm0, %xmm1 + addpd %xmm4, %xmm5 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + + addpd %xmm0, %xmm1 + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movsd %xmm1, 0 * SIZE(CO1) + movhpd %xmm1, 1 * SIZE(CO1) + movsd %xmm5, 2 * SIZE(CO1) + movhpd %xmm5, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm5, 4 * SIZE(BO) + movlpd %xmm5, 5 * SIZE(BO) + movhpd %xmm5, 6 * SIZE(BO) + movhpd %xmm5, 7 * SIZE(BO) +#else + movapd %xmm1, 0 * SIZE(AO) + movapd %xmm5, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L110 + ALIGN_4 + +.L199: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1 * COMPSIZE), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L999: + movq %rbx, %rsp + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x2_sse3.S b/kernel/x86_64/ztrsm_kernel_LN_2x2_sse3.S new file mode 100644 index 0000000..74a799a --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LN_2x2_sse3.S @@ -0,0 +1,2203 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbx +#define KK %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define KKK 56(%rsp) +#define AORIG 64(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define KKK 232(%rsp) +#define AORIG 240(%rsp) +#endif + +#define PREFETCH prefetcht1 +#define PREFETCHSIZE (16 * 12 + 3) +#define PREFETCH_R (4 * 4 + 0) + +#ifndef CONJ +#define ADD1 addpd +#define ADD2 addpd +#else +#define ADD1 subpd +#define ADD2 addpd +#endif + +#define KERNEL1(address) \ + mulpd %xmm8, %xmm9;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ + ADD1 %xmm9, %xmm0;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm1;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm2;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm3;\ + movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL2(address) \ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm4;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm5;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm6;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm7;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL3(address) \ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm0;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm1;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm2;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm3;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL4(address) \ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm4;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm5;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm6;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm7;\ + movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL5(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm0;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm1;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm2;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm3;\ + movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL6(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm4;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm5;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm6;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm7;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL7(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm0;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm1;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm2;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm3;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL8(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm4;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm5;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm6;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm7;\ + movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL9(address) \ + mulpd %xmm12, %xmm13;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ + ADD1 %xmm13, %xmm0;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm1;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm2;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm3;\ + movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL10(address) \ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm4;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm5;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm6;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm7;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL11(address) \ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm0;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm1;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm2;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm3;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL12(address) \ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm4;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm5;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm6;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm7;\ + movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL13(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm0;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm1;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm2;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm3;\ + movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL14(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm4;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm5;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm6;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm7;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL15(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm0;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm1;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm2;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm3;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL16(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm4;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm5;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm6;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm7;\ + movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + movq KK, OFFSET + + salq $ZBASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, KK + subq OFFSET, KK +#endif + + movq N, J + sarq $1, J # j = (n >> 2) + jle .L100 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif +#ifdef LT + movq OFFSET, KK +#endif + + testq $1, M + jle .L30 + +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L42 + +.L41: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 6 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 16 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 17 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 18 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 19 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm3 + movddup 20 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 21 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 22 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 23 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm3 + movddup 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 25 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 26 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 27 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 28 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 29 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 30 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 31 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 40 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L41 + +.L42: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + jle .L44 + +.L43: + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L43 + ALIGN_4 + +.L44: + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + +#ifndef CONJ + addsubpd %xmm1, %xmm0 + addsubpd %xmm3, %xmm2 +#else + addsubpd %xmm0, %xmm1 + addsubpd %xmm2, %xmm3 +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm8 + movapd 2 * SIZE(BO), %xmm9 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 +#endif + +#if (defined(LN) || defined(LT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm2, %xmm9 +#elif (defined(LN) || defined(LT)) && defined(CONJ) + subpd %xmm1, %xmm8 + subpd %xmm3, %xmm9 +#elif (defined(RN) || defined(RT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm2, %xmm9 +#else + addsubpd %xmm1, %xmm8 + addsubpd %xmm3, %xmm9 +#endif + +#ifdef CONJ + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 +#endif + +#ifdef LN + movddup 0 * SIZE(AO), %xmm4 + movddup 1 * SIZE(AO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm4, %xmm8 + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + mulpd %xmm5, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + movddup 1 * SIZE(AO), %xmm1 + +#ifdef CONJ + xorpd %xmm7, %xmm1 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + mulpd %xmm1, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + movddup 1 * SIZE(BO), %xmm1 + movddup 2 * SIZE(BO), %xmm2 + movddup 3 * SIZE(BO), %xmm3 + movddup 6 * SIZE(BO), %xmm4 + movddup 7 * SIZE(BO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + + mulpd %xmm0, %xmm8 + mulpd %xmm1, %xmm12 + + addsubpd %xmm12, %xmm8 + + movapd %xmm8, %xmm12 + movapd %xmm8, %xmm13 + + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + + addsubpd %xmm13, %xmm12 + + subpd %xmm12, %xmm9 + + movapd %xmm9, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + + addsubpd %xmm12, %xmm9 +#endif + + +#ifdef RT + movddup 6 * SIZE(BO), %xmm0 + movddup 7 * SIZE(BO), %xmm1 + movddup 4 * SIZE(BO), %xmm2 + movddup 5 * SIZE(BO), %xmm3 + movddup 0 * SIZE(BO), %xmm4 + movddup 1 * SIZE(BO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm9, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + + addsubpd %xmm12, %xmm9 + + movapd %xmm9, %xmm12 + movapd %xmm9, %xmm13 + + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + + addsubpd %xmm13, %xmm12 + + subpd %xmm12, %xmm8 + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + + mulpd %xmm4, %xmm8 + mulpd %xmm5, %xmm12 + + addsubpd %xmm12, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + movq M, I + sarq $1, I # i = (m >> 2) + jle .L99 + ALIGN_4 + +.L10: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(AO), %xmm12 + movddup 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(AO), %xmm14 + movddup 24 * SIZE(BO), %xmm15 + +#ifdef LN + prefetchnta -4 * SIZE(CO1) + prefetchnta -4 * SIZE(CO2) +#else + prefetchnta 4 * SIZE(CO1) + prefetchnta 4 * SIZE(CO2) +#endif + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-8, %rax + salq $4, %rax + je .L12 + +.L1X: + KERNEL1 (16 * 0) + KERNEL2 (16 * 0) + KERNEL3 (16 * 0) + KERNEL4 (16 * 0) + KERNEL5 (16 * 0) + KERNEL6 (16 * 0) + KERNEL7 (16 * 0) + KERNEL8 (16 * 0) + KERNEL9 (16 * 0) + KERNEL10(16 * 0) + KERNEL11(16 * 0) + KERNEL12(16 * 0) + KERNEL13(16 * 0) + KERNEL14(16 * 0) + KERNEL15(16 * 0) + KERNEL16(16 * 0) + cmpq $128 * 1, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 1) + KERNEL2 (16 * 1) + KERNEL3 (16 * 1) + KERNEL4 (16 * 1) + KERNEL5 (16 * 1) + KERNEL6 (16 * 1) + KERNEL7 (16 * 1) + KERNEL8 (16 * 1) + KERNEL9 (16 * 1) + KERNEL10(16 * 1) + KERNEL11(16 * 1) + KERNEL12(16 * 1) + KERNEL13(16 * 1) + KERNEL14(16 * 1) + KERNEL15(16 * 1) + KERNEL16(16 * 1) + cmpq $128 * 2, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 2) + KERNEL2 (16 * 2) + KERNEL3 (16 * 2) + KERNEL4 (16 * 2) + KERNEL5 (16 * 2) + KERNEL6 (16 * 2) + KERNEL7 (16 * 2) + KERNEL8 (16 * 2) + KERNEL9 (16 * 2) + KERNEL10(16 * 2) + KERNEL11(16 * 2) + KERNEL12(16 * 2) + KERNEL13(16 * 2) + KERNEL14(16 * 2) + KERNEL15(16 * 2) + KERNEL16(16 * 2) + cmpq $128 * 3, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 3) + KERNEL2 (16 * 3) + KERNEL3 (16 * 3) + KERNEL4 (16 * 3) + KERNEL5 (16 * 3) + KERNEL6 (16 * 3) + KERNEL7 (16 * 3) + KERNEL8 (16 * 3) + KERNEL9 (16 * 3) + KERNEL10(16 * 3) + KERNEL11(16 * 3) + KERNEL12(16 * 3) + KERNEL13(16 * 3) + KERNEL14(16 * 3) + KERNEL15(16 * 3) + KERNEL16(16 * 3) + cmpq $128 * 4, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 4) + KERNEL2 (16 * 4) + KERNEL3 (16 * 4) + KERNEL4 (16 * 4) + KERNEL5 (16 * 4) + KERNEL6 (16 * 4) + KERNEL7 (16 * 4) + KERNEL8 (16 * 4) + KERNEL9 (16 * 4) + KERNEL10(16 * 4) + KERNEL11(16 * 4) + KERNEL12(16 * 4) + KERNEL13(16 * 4) + KERNEL14(16 * 4) + KERNEL15(16 * 4) + KERNEL16(16 * 4) + cmpq $128 * 5, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 5) + KERNEL2 (16 * 5) + KERNEL3 (16 * 5) + KERNEL4 (16 * 5) + KERNEL5 (16 * 5) + KERNEL6 (16 * 5) + KERNEL7 (16 * 5) + KERNEL8 (16 * 5) + KERNEL9 (16 * 5) + KERNEL10(16 * 5) + KERNEL11(16 * 5) + KERNEL12(16 * 5) + KERNEL13(16 * 5) + KERNEL14(16 * 5) + KERNEL15(16 * 5) + KERNEL16(16 * 5) + cmpq $128 * 6, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 6) + KERNEL2 (16 * 6) + KERNEL3 (16 * 6) + KERNEL4 (16 * 6) + KERNEL5 (16 * 6) + KERNEL6 (16 * 6) + KERNEL7 (16 * 6) + KERNEL8 (16 * 6) + KERNEL9 (16 * 6) + KERNEL10(16 * 6) + KERNEL11(16 * 6) + KERNEL12(16 * 6) + KERNEL13(16 * 6) + KERNEL14(16 * 6) + KERNEL15(16 * 6) + KERNEL16(16 * 6) + cmpq $128 * 7, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 7) + KERNEL2 (16 * 7) + KERNEL3 (16 * 7) + KERNEL4 (16 * 7) + KERNEL5 (16 * 7) + KERNEL6 (16 * 7) + KERNEL7 (16 * 7) + KERNEL8 (16 * 7) + KERNEL9 (16 * 7) + KERNEL10(16 * 7) + KERNEL11(16 * 7) + KERNEL12(16 * 7) + KERNEL13(16 * 7) + KERNEL14(16 * 7) + KERNEL15(16 * 7) + KERNEL16(16 * 7) + + addq $32 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $128 * 8, %rax + jg .L1X + +.L11: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 2), BO # * 64 + ALIGN_4 + +.L12: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm5 + movddup 2 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm6 + movddup 3 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm7 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L13 + ALIGN_4 + +.L14: + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#ifndef CONJ + addsubpd %xmm1, %xmm0 + addsubpd %xmm3, %xmm2 + addsubpd %xmm5, %xmm4 + addsubpd %xmm7, %xmm6 +#else + addsubpd %xmm0, %xmm1 + addsubpd %xmm2, %xmm3 + addsubpd %xmm4, %xmm5 + addsubpd %xmm6, %xmm7 +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + subq $2, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm8 + movapd 2 * SIZE(BO), %xmm9 + movapd 4 * SIZE(BO), %xmm10 + movapd 6 * SIZE(BO), %xmm11 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 +#endif + +#if (defined(LN) || defined(LT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm2, %xmm9 + subpd %xmm4, %xmm10 + subpd %xmm6, %xmm11 +#elif (defined(LN) || defined(LT)) && defined(CONJ) + subpd %xmm1, %xmm8 + subpd %xmm3, %xmm9 + subpd %xmm5, %xmm10 + subpd %xmm7, %xmm11 +#elif (defined(RN) || defined(RT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm2, %xmm10 + subpd %xmm6, %xmm11 +#else + addsubpd %xmm1, %xmm8 + addsubpd %xmm5, %xmm9 + addsubpd %xmm3, %xmm10 + addsubpd %xmm7, %xmm11 +#endif + +#ifdef CONJ + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + movddup 6 * SIZE(AO), %xmm0 + movddup 7 * SIZE(AO), %xmm1 + movddup 4 * SIZE(AO), %xmm2 + movddup 5 * SIZE(AO), %xmm3 + movddup 0 * SIZE(AO), %xmm4 + movddup 1 * SIZE(AO), %xmm5 +#else + movddup 6 * SIZE(BO), %xmm0 + movddup 7 * SIZE(BO), %xmm1 + movddup 4 * SIZE(BO), %xmm2 + movddup 5 * SIZE(BO), %xmm3 + movddup 0 * SIZE(BO), %xmm4 + movddup 1 * SIZE(BO), %xmm5 +#endif + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm10, %xmm12 + movapd %xmm11, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm12 + mulpd %xmm1, %xmm13 + + addsubpd %xmm12, %xmm10 + addsubpd %xmm13, %xmm11 + + movapd %xmm10, %xmm12 + movapd %xmm10, %xmm13 + movapd %xmm11, %xmm14 + movapd %xmm11, %xmm15 + + SHUFPD_1 %xmm13, %xmm13 + SHUFPD_1 %xmm15, %xmm15 + + mulpd %xmm2, %xmm12 + mulpd %xmm2, %xmm14 + mulpd %xmm3, %xmm13 + mulpd %xmm3, %xmm15 + + addsubpd %xmm13, %xmm12 + addsubpd %xmm15, %xmm14 + + subpd %xmm12, %xmm8 + subpd %xmm14, %xmm9 + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm4, %xmm8 + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + mulpd %xmm5, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#if defined(LT) || defined(RN) + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + movddup 1 * SIZE(AO), %xmm1 + movddup 2 * SIZE(AO), %xmm2 + movddup 3 * SIZE(AO), %xmm3 + movddup 6 * SIZE(AO), %xmm4 + movddup 7 * SIZE(AO), %xmm5 +#else + movddup 0 * SIZE(BO), %xmm0 + movddup 1 * SIZE(BO), %xmm1 + movddup 2 * SIZE(BO), %xmm2 + movddup 3 * SIZE(BO), %xmm3 + movddup 6 * SIZE(BO), %xmm4 + movddup 7 * SIZE(BO), %xmm5 +#endif + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + mulpd %xmm1, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 + + movapd %xmm8, %xmm12 + movapd %xmm8, %xmm13 + movapd %xmm9, %xmm14 + movapd %xmm9, %xmm15 + + SHUFPD_1 %xmm13, %xmm13 + SHUFPD_1 %xmm15, %xmm15 + + mulpd %xmm2, %xmm12 + mulpd %xmm2, %xmm14 + mulpd %xmm3, %xmm13 + mulpd %xmm3, %xmm15 + + addsubpd %xmm13, %xmm12 + addsubpd %xmm15, %xmm14 + + subpd %xmm12, %xmm10 + subpd %xmm14, %xmm11 + + movapd %xmm10, %xmm12 + movapd %xmm11, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm4, %xmm10 + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm12 + mulpd %xmm5, %xmm13 + + addsubpd %xmm12, %xmm10 + addsubpd %xmm13, %xmm11 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm10, 2 * SIZE(CO1) + movhpd %xmm10, 3 * SIZE(CO1) + + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + movapd %xmm10, 4 * SIZE(BO) + movapd %xmm11, 6 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + decq I # i -- + jg .L10 + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + decq J # j -- + jg .L01 + +.L100: + testq $1, N + jle .L999 + +.L101: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif +#ifdef LT + movq OFFSET, KK +#endif + + testq $1, M + jle .L130 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L142 + +.L141: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 12 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L141 + +.L142: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + jle .L144 + +.L143: + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L143 + ALIGN_4 + +.L144: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + SHUFPD_1 %xmm1, %xmm1 + +#ifndef CONJ + addsubpd %xmm1, %xmm0 +#else + addsubpd %xmm0, %xmm1 +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm8 +#else + movapd 0 * SIZE(AO), %xmm8 +#endif + +#if (defined(LN) || defined(LT)) && !defined(CONJ) + subpd %xmm0, %xmm8 +#elif (defined(LN) || defined(LT)) && defined(CONJ) + subpd %xmm1, %xmm8 +#elif (defined(RN) || defined(RT)) && !defined(CONJ) + subpd %xmm0, %xmm8 +#else + addsubpd %xmm1, %xmm8 +#endif + +#ifdef CONJ + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 +#endif + +#ifdef LN + movddup 0 * SIZE(AO), %xmm4 + movddup 1 * SIZE(AO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm4, %xmm8 + mulpd %xmm5, %xmm12 + addsubpd %xmm12, %xmm8 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + movddup 1 * SIZE(AO), %xmm1 + +#ifdef CONJ + xorpd %xmm7, %xmm1 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm0, %xmm8 + mulpd %xmm1, %xmm12 + addsubpd %xmm12, %xmm8 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + movddup 1 * SIZE(BO), %xmm1 + +#ifdef CONJ + xorpd %xmm7, %xmm1 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm0, %xmm8 + mulpd %xmm1, %xmm12 + + addsubpd %xmm12, %xmm8 +#endif + +#ifdef RT + movddup 0 * SIZE(BO), %xmm4 + movddup 1 * SIZE(BO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm4, %xmm8 + mulpd %xmm5, %xmm12 + + addsubpd %xmm12, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, 0 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L130: + movq M, I + sarq $1, I # i = (m >> 2) + jle .L149 + ALIGN_4 + +.L110: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + +#ifdef LN + prefetchnta -4 * SIZE(CO1) +#else + prefetchnta 4 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L112 + +.L111: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm5 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm5 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm1 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm5 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 14 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 40 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm5 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + ADD1 %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 18 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 8 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 20 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 22 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 24 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 26 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 28 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 30 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 32 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 24 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + jle .L114 + +.L113: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + movapd 4 * SIZE(AO), %xmm8 + ADD1 %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L113 + ALIGN_4 + +.L114: + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm5, %xmm5 + +#ifndef CONJ + addsubpd %xmm1, %xmm0 + addsubpd %xmm5, %xmm4 +#else + addsubpd %xmm0, %xmm1 + addsubpd %xmm4, %xmm5 +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm8 + movapd 2 * SIZE(BO), %xmm9 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 +#endif + +#if (defined(LN) || defined(LT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 +#elif (defined(LN) || defined(LT)) && defined(CONJ) + subpd %xmm1, %xmm8 + subpd %xmm5, %xmm9 +#elif (defined(RN) || defined(RT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 +#else + addsubpd %xmm1, %xmm8 + addsubpd %xmm5, %xmm9 +#endif + +#ifdef CONJ + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 +#endif + +#ifdef LN + movddup 6 * SIZE(AO), %xmm0 + movddup 7 * SIZE(AO), %xmm1 + movddup 4 * SIZE(AO), %xmm2 + movddup 5 * SIZE(AO), %xmm3 + movddup 0 * SIZE(AO), %xmm4 + movddup 1 * SIZE(AO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm9, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + addsubpd %xmm12, %xmm9 + movapd %xmm9, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm13, %xmm13 + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + addsubpd %xmm13, %xmm12 + subpd %xmm12, %xmm8 + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm4, %xmm8 + mulpd %xmm5, %xmm12 + addsubpd %xmm12, %xmm8 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + movddup 1 * SIZE(AO), %xmm1 + movddup 2 * SIZE(AO), %xmm2 + movddup 3 * SIZE(AO), %xmm3 + movddup 6 * SIZE(AO), %xmm4 + movddup 7 * SIZE(AO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm0, %xmm8 + mulpd %xmm1, %xmm12 + addsubpd %xmm12, %xmm8 + movapd %xmm8, %xmm12 + movapd %xmm8, %xmm13 + SHUFPD_1 %xmm13, %xmm13 + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + addsubpd %xmm13, %xmm12 + subpd %xmm12, %xmm9 + movapd %xmm9, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + addsubpd %xmm12, %xmm9 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + movddup 1 * SIZE(BO), %xmm1 + +#ifdef CONJ + xorpd %xmm7, %xmm1 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + mulpd %xmm1, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#ifdef RT + movddup 0 * SIZE(BO), %xmm4 + movddup 1 * SIZE(BO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm4, %xmm8 + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + mulpd %xmm5, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L110 + ALIGN_4 + +.L149: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x4_nehalem.S b/kernel/x86_64/ztrsm_kernel_LN_2x4_nehalem.S new file mode 100644 index 0000000..fc5a4a3 --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LN_2x4_nehalem.S @@ -0,0 +1,3116 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCHSIZE (16 * 1 + 4) +#define PREFETCH prefetcht0 + +#define ADD1 addps +#define ADD2 addps + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + salq $ZBASE_SHIFT, LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, KK + subq OFFSET, KK +#endif + + movq N, J + sarq $2, J + NOBRANCH + jle .L30 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + + movq K, %rax + salq $ZBASE_SHIFT + 2, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, KK +#endif + + testq $1, M + BRANCH + jle .L20 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -30 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -16 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -12 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -8 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -26 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -4 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps 0 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -24 * SIZE(AO), %xmm0 + + subq $-32 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + ADD1 %xmm3, %xmm10 + ADD2 %xmm4, %xmm11 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 +#else + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 +#else + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#endif + +#endif + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm9 + movaps -28 * SIZE(BO), %xmm11 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 +#else + movaps -32 * SIZE(AO), %xmm9 + movaps -28 * SIZE(AO), %xmm13 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm13 + + movhlps %xmm9, %xmm11 + movhlps %xmm13, %xmm15 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + pshufd $0xb1, %xmm11, %xmm12 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm12 + + addps %xmm10, %xmm9 + addps %xmm12, %xmm11 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -24 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + movaps -20 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -12 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm13, %xmm12 + + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm12 + + addps %xmm12, %xmm13 + + movaps %xmm13, %xmm3 + pshufd $0xb1, %xmm13, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -4 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm14, %xmm15 +#endif + +#ifdef RT + movaps -4 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm14, %xmm15 + + movaps %xmm15, %xmm3 + pshufd $0xb1, %xmm15, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + movaps -8 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -12 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm13, %xmm12 + + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm12 + + addps %xmm12, %xmm13 + + movaps %xmm13, %xmm3 + pshufd $0xb1, %xmm13, %xmm2 + + xorps %xmm7, %xmm2 + + movaps -16 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -24 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -32 * SIZE(BO) + movaps %xmm11, -28 * SIZE(BO) + + movsd %xmm9, (CO1) + movhps %xmm9, (CO1, LDC) + movsd %xmm11, (CO2) + movhps %xmm11, (CO2, LDC) +#else + movlhps %xmm11, %xmm9 + movlhps %xmm15, %xmm13 + + movaps %xmm9, -32 * SIZE(AO) + movaps %xmm13, -28 * SIZE(AO) + + movlps %xmm9, (CO1) + movlps %xmm11, (CO1, LDC) + movlps %xmm13, (CO2) + movlps %xmm15, (CO2, LDC) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L20: + movq M, I + sarq $1, I + NOBRANCH + jle .L29 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + prefetchnta -32 * SIZE(BB) + subq $-16 * SIZE, BB + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + xorps %xmm12, %xmm12 + prefetcht2 4 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht2 4 * SIZE(CO2, LDC, 1) + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm12 + movaps -24 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -24 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -20 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + subq $-32 * SIZE, BO + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + ADD1 %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + ADD1 %xmm1, %xmm12 + ADD2 %xmm2, %xmm13 + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + xorps %xmm0, %xmm12 + xorps %xmm0, %xmm14 +#else + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + xorps %xmm0, %xmm13 + xorps %xmm0, %xmm15 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + xorps %xmm0, %xmm12 + xorps %xmm0, %xmm14 +#else + shufps $0xb1, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + xorps %xmm0, %xmm13 + xorps %xmm0, %xmm15 +#endif + +#endif + + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + haddps %xmm13, %xmm12 + haddps %xmm15, %xmm14 + + shufps $0xd8, %xmm8, %xmm8 + shufps $0xd8, %xmm10, %xmm10 + shufps $0xd8, %xmm12, %xmm12 + shufps $0xd8, %xmm14, %xmm14 + + movaps %xmm8, %xmm9 + shufps $0xe4, %xmm10, %xmm8 + shufps $0xe4, %xmm9, %xmm10 + + movaps %xmm12, %xmm13 + shufps $0xe4, %xmm14, %xmm12 + shufps $0xe4, %xmm13, %xmm14 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm9 + movlhps %xmm10, %xmm8 + movhlps %xmm9, %xmm10 + + movaps %xmm12, %xmm11 + movlhps %xmm14, %xmm12 + movhlps %xmm11, %xmm14 + + movaps -32 * SIZE(BO), %xmm9 + movaps -28 * SIZE(BO), %xmm13 + movaps -24 * SIZE(BO), %xmm11 + movaps -20 * SIZE(BO), %xmm15 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 + subps %xmm12, %xmm13 + subps %xmm14, %xmm15 +#else + movaps -32 * SIZE(AO), %xmm9 + movaps -28 * SIZE(AO), %xmm11 + movaps -24 * SIZE(AO), %xmm13 + movaps -20 * SIZE(AO), %xmm15 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 + subps %xmm12, %xmm13 + subps %xmm14, %xmm15 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#ifdef LN + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm10, %xmm11 + addps %xmm14, %xmm15 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + movaps %xmm15, %xmm5 + pshufd $0xb1, %xmm15, %xmm4 + + xorps %xmm7, %xmm2 + xorps %xmm7, %xmm4 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + mulps %xmm0, %xmm5 + mulps %xmm1, %xmm4 + + subps %xmm3, %xmm9 + subps %xmm2, %xmm9 + subps %xmm5, %xmm13 + subps %xmm4, %xmm13 + + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + pshufd $0xb1, %xmm13, %xmm14 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm14 + + addps %xmm10, %xmm9 + addps %xmm14, %xmm13 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + pshufd $0xb1, %xmm13, %xmm14 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm14 + + addps %xmm10, %xmm9 + addps %xmm14, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + movaps %xmm13, %xmm5 + pshufd $0xb1, %xmm13, %xmm4 + + xorps %xmm7, %xmm2 + xorps %xmm7, %xmm4 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + mulps %xmm0, %xmm5 + mulps %xmm1, %xmm4 + + subps %xmm3, %xmm11 + subps %xmm2, %xmm11 + subps %xmm5, %xmm15 + subps %xmm4, %xmm15 + + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm10, %xmm11 + addps %xmm14, %xmm15 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -24 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + movaps -20 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -12 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm13, %xmm12 + + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm12 + + addps %xmm12, %xmm13 + + movaps %xmm13, %xmm3 + pshufd $0xb1, %xmm13, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -4 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm14, %xmm15 +#endif + +#ifdef RT + movaps -4 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm14, %xmm15 + + movaps %xmm15, %xmm3 + pshufd $0xb1, %xmm15, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + movaps -8 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -12 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm13, %xmm12 + + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm12 + + addps %xmm12, %xmm13 + + movaps %xmm13, %xmm3 + pshufd $0xb1, %xmm13, %xmm2 + + xorps %xmm7, %xmm2 + + movaps -16 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + + movaps -24 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -32 * SIZE(BO) + movaps %xmm13, -28 * SIZE(BO) + movaps %xmm11, -24 * SIZE(BO) + movaps %xmm15, -20 * SIZE(BO) + + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhps %xmm9, 0 * SIZE(CO1, LDC) + movhps %xmm11, 2 * SIZE(CO1, LDC) + + movsd %xmm13, 0 * SIZE(CO2) + movsd %xmm15, 2 * SIZE(CO2) + movhps %xmm13, 0 * SIZE(CO2, LDC) + movhps %xmm15, 2 * SIZE(CO2, LDC) +#else + movaps %xmm9, -32 * SIZE(AO) + movaps %xmm11, -28 * SIZE(AO) + movaps %xmm13, -24 * SIZE(AO) + movaps %xmm15, -20 * SIZE(AO) + + movsd %xmm9, 0 * SIZE(CO1) + movhps %xmm9, 2 * SIZE(CO1) + movsd %xmm11, 0 * SIZE(CO1, LDC) + movhps %xmm11, 2 * SIZE(CO1, LDC) + movsd %xmm13, 0 * SIZE(CO2) + movhps %xmm13, 2 * SIZE(CO2) + movsd %xmm15, 0 * SIZE(CO2, LDC) + movhps %xmm15, 2 * SIZE(CO2, LDC) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L29: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L30: + testq $2, N + BRANCH + jle .L50 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + +#ifdef LT + movq OFFSET, KK +#endif + + testq $1, M + BRANCH + jle .L40 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -26 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -16 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -24 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_3 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm9 + shufps $0xb1, %xmm9, %xmm9 +#else + xorps %xmm0, %xmm8 + shufps $0xb1, %xmm9, %xmm9 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm9 + shufps $0xb1, %xmm9, %xmm9 +#else + shufps $0xb1, %xmm9, %xmm9 + xorps %xmm0, %xmm9 +#endif + +#endif + + addps %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm9 + + subps %xmm8, %xmm9 +#else + movaps -32 * SIZE(AO), %xmm9 + + subps %xmm8, %xmm9 + movhlps %xmm9, %xmm11 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 +#endif + +#ifdef RT + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -32 * SIZE(BO) + + movlps %xmm9, (CO1) + movhps %xmm9, (CO2) +#else + movlps %xmm9, -32 * SIZE(AO) + movlps %xmm11, -30 * SIZE(AO) + + movlps %xmm9, (CO1) + movlps %xmm11, (CO2) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L40: + movq M, I + sarq $1, I + NOBRANCH + jle .L49 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO2) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $-16 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + ADD1 %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + ADD1 %xmm3, %xmm10 + ADD2 %xmm4, %xmm11 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 +#else + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 +#else + shufps $0xb1, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#endif + +#endif + + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + + shufps $0xd8, %xmm8, %xmm8 + shufps $0xd8, %xmm10, %xmm10 + + movaps %xmm8, %xmm9 + shufps $0xe4, %xmm10, %xmm8 + shufps $0xe4, %xmm9, %xmm10 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm9 + movlhps %xmm10, %xmm8 + movhlps %xmm9, %xmm10 + + movaps -32 * SIZE(BO), %xmm9 + movaps -28 * SIZE(BO), %xmm11 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 +#else + movaps -32 * SIZE(AO), %xmm9 + movaps -28 * SIZE(AO), %xmm11 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#ifdef LN + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + + subps %xmm3, %xmm9 + subps %xmm2, %xmm9 + + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + + subps %xmm3, %xmm11 + subps %xmm2, %xmm11 + + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 +#endif + +#ifdef RT + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -32 * SIZE(BO) + movaps %xmm11, -28 * SIZE(BO) + + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhps %xmm9, 0 * SIZE(CO2) + movhps %xmm11, 2 * SIZE(CO2) +#else + movaps %xmm9, -32 * SIZE(AO) + movaps %xmm11, -28 * SIZE(AO) + + movsd %xmm9, 0 * SIZE(CO1) + movhps %xmm9, 2 * SIZE(CO1) + movsd %xmm11, 0 * SIZE(CO2) + movhps %xmm11, 2 * SIZE(CO2) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L31 + ALIGN_4 + +.L49: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L50: + testq $1, N + BRANCH + jle .L999 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + +#ifdef LT + movq OFFSET, KK +#endif + + testq $1, M + BRANCH + jle .L60 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd -32 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -30 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -26 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -26 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -24 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L62 + ALIGN_3 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -30 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_3 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm9 + shufps $0xb1, %xmm9, %xmm9 +#else + xorps %xmm0, %xmm8 + shufps $0xb1, %xmm9, %xmm9 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm9 + shufps $0xb1, %xmm9, %xmm9 +#else + shufps $0xb1, %xmm9, %xmm9 + xorps %xmm0, %xmm9 +#endif + +#endif + + addps %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BO), %xmm9 +#else + movsd -32 * SIZE(AO), %xmm9 +#endif + + subps %xmm8, %xmm9 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(AO), %xmm5 +#endif + +#if defined(RN) || defined(RT) + movsd -32 * SIZE(BO), %xmm5 +#endif + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm9, -32 * SIZE(BO) +#else + movlps %xmm9, -32 * SIZE(AO) +#endif + + movlps %xmm9, (CO1) + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + movq M, I + sarq $1, I + NOBRANCH + jle .L69 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm8 + movddup -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movddup -30 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movddup -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movddup -26 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AO), %xmm0 + + subq $ -8 * SIZE, BO + subq $-16 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L52 + ALIGN_3 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + ADD1 %xmm1, %xmm8 + movddup -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_3 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm8 +#else + xorps %xmm0, %xmm9 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm8 +#else + shufps $0xb1, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 +#endif + +#endif + + haddps %xmm9, %xmm8 + + shufps $0xd8, %xmm8, %xmm8 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm9 + + subps %xmm8, %xmm9 + movhlps %xmm9, %xmm11 +#else + movaps -32 * SIZE(AO), %xmm9 + + subps %xmm8, %xmm9 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#ifdef LN + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + + subps %xmm3, %xmm9 + subps %xmm2, %xmm9 + + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + + subps %xmm3, %xmm11 + subps %xmm2, %xmm11 + + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm9, -32 * SIZE(BO) + movlps %xmm11, -30 * SIZE(BO) + + movlps %xmm9, 0 * SIZE(CO1) + movlps %xmm11, 2 * SIZE(CO1) +#else + movaps %xmm9, -32 * SIZE(AO) + + movlps %xmm9, 0 * SIZE(CO1) + movhps %xmm9, 2 * SIZE(CO1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L51 + ALIGN_4 + +.L69: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S new file mode 100644 index 0000000..e9edc29 --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S @@ -0,0 +1,4004 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define POSINV 0(%rsp) +#define OFFSET 16(%rsp) +#define KK 24(%rsp) +#define KKK 32(%rsp) +#define AORIG 40(%rsp) +#define BORIG 48(%rsp) +#define BUFFER 128(%rsp) + +#ifdef OPTERON +#define movsd movlps +#endif + +#if defined(PENTIUM4) || defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(ATOM) || defined(NANO) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#define KERNEL1(xx) \ + mulps %xmm8, %xmm9 ;\ + addps %xmm9, %xmm0 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulps %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addps %xmm11, %xmm1 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm8, %xmm13 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addps %xmm13, %xmm2 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm8, %xmm3 ;\ + movaps 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL2(xx) \ + mulps %xmm10, %xmm9 ;\ + addps %xmm9, %xmm4 ;\ + movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulps %xmm10, %xmm11 ;\ + addps %xmm11, %xmm5 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm10, %xmm13 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addps %xmm13, %xmm6 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm10, %xmm7 ;\ + movaps 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL3(xx) \ + mulps %xmm12, %xmm15 ;\ + addps %xmm15, %xmm0 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulps %xmm12, %xmm11 ;\ + addps %xmm11, %xmm1 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm12, %xmm13 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addps %xmm13, %xmm2 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm12, %xmm3 ;\ + movaps 24 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL4(xx) \ + mulps %xmm14, %xmm15 ;\ + addps %xmm15, %xmm4 ;\ + movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulps %xmm14, %xmm11 ;\ + addps %xmm11, %xmm5 ;\ + movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm14, %xmm13 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addps %xmm13, %xmm6 ;\ + movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm14, %xmm7 ;\ + movaps 28 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + +#define KERNEL5(xx) \ + mulps %xmm8, %xmm9 ;\ + addps %xmm9, %xmm0 ;\ + movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulps %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addps %xmm11, %xmm1 ;\ + movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm8, %xmm13 ;\ + mulps 44 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addps %xmm13, %xmm2 ;\ + movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm8, %xmm3 ;\ + movaps 32 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL6(xx) \ + mulps %xmm10, %xmm9 ;\ + addps %xmm9, %xmm4 ;\ + movaps 64 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulps %xmm10, %xmm11 ;\ + addps %xmm11, %xmm5 ;\ + movaps 52 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm10, %xmm13 ;\ + mulps 44 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addps %xmm13, %xmm6 ;\ + movaps 56 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm10, %xmm7 ;\ + movaps 36 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL7(xx) \ + mulps %xmm12, %xmm15 ;\ + addps %xmm15, %xmm0 ;\ + movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulps %xmm12, %xmm11 ;\ + addps %xmm11, %xmm1 ;\ + movaps 52 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm12, %xmm13 ;\ + mulps 60 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addps %xmm13, %xmm2 ;\ + movaps 56 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm12, %xmm3 ;\ + movaps 40 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL8(xx) \ + mulps %xmm14, %xmm15 ;\ + addps %xmm15, %xmm4 ;\ + movaps 80 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulps %xmm14, %xmm11 ;\ + addps %xmm11, %xmm5 ;\ + movaps 68 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm14, %xmm13 ;\ + mulps 60 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addps %xmm13, %xmm6 ;\ + movaps 72 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm14, %xmm7 ;\ + movaps 44 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 +#else + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + pxor %xmm15, %xmm15 + cmpeqps %xmm15, %xmm15 + pslld $31, %xmm15 # Generate mask + pxor %xmm2, %xmm2 + +#ifndef CONJ + movss %xmm15, 0 + POSINV + movss %xmm2, 4 + POSINV + movss %xmm15, 8 + POSINV + movss %xmm2, 12 + POSINV +#else + movss %xmm2, 0 + POSINV + movss %xmm15, 4 + POSINV + movss %xmm2, 8 + POSINV + movss %xmm15, 12 + POSINV +#endif + + movlpd %xmm4, OFFSET + movlpd %xmm4, KK + + salq $ZBASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $1, J # j = (n >> 2) + jle .L40 + ALIGN_4 + +.L01: +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + movaps 8 * SIZE(B), %xmm3 + movaps 12 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 32 * SIZE(BO) + movaps %xmm1, 36 * SIZE(BO) + movaps %xmm2, 40 * SIZE(BO) + movaps %xmm3, 44 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 48 * SIZE(BO) + movaps %xmm5, 52 * SIZE(BO) + movaps %xmm6, 56 * SIZE(BO) + movaps %xmm7, 60 * SIZE(BO) + + addq $16 * SIZE, B + addq $64 * SIZE, BO + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 + +#ifndef RT + leaq (C, LDC, 2), C +#endif + + testq $1, M + je .L20 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movlps 0 * SIZE(AO), %xmm8 + movhps 2 * SIZE(AO), %xmm8 + movlps 8 * SIZE(AO), %xmm10 + movhps 10 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm2 + movaps 76 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm9, %xmm3 + movaps 128 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movaps 92 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movaps 144 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 108 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 160 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 124 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 176 * SIZE(BO), %xmm15 + + addq $16 * SIZE, AO + addq $128 * SIZE, BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 16 * SIZE(BO), %xmm9 + + + addq $ 2 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 +#endif +#else + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#endif + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm2, %xmm0 + + movaps 0 * SIZE(B), %xmm2 + + subps %xmm0, %xmm2 +#else +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AO), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(AO), %xmm5 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 +#endif + + +#ifdef LN + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + + addps %xmm0, %xmm1 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + + subps %xmm0, %xmm5 + subps %xmm2, %xmm5 + + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm4 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm15, %xmm5 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm5 + + addps %xmm4, %xmm5 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm15, %xmm5 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm5 + + addps %xmm0, %xmm5 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm1 + + addps %xmm4, %xmm1 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm5, 12 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO2) +#else + movlps %xmm1, 0 * SIZE(AO) + movlps %xmm5, 2 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm5, 0 * SIZE(CO2) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L20: + testq $2, M + je .L30 + +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 4 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + mulps 44 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps 12 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + mulps 60 * SIZE(BO), %xmm8 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 76 * SIZE(BO), %xmm10 + addps %xmm9, %xmm2 + movaps 128 * SIZE(BO), %xmm9 + addps %xmm10, %xmm3 + movaps 20 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + mulps 92 * SIZE(BO), %xmm10 + addps %xmm11, %xmm2 + movaps 144 * SIZE(BO), %xmm11 + addps %xmm10, %xmm3 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + mulps 108 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 160 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps 28 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + mulps 124 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 176 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $128 * SIZE, BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 4 * SIZE(AO), %xmm8 + + addq $ 4 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L28: + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 +#endif +#else + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#endif + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm1 + unpcklpd %xmm2, %xmm0 + unpckhpd %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm2 + movaps 4 * SIZE(B), %xmm3 + + subps %xmm0, %xmm2 + subps %xmm1, %xmm3 +#else + movaps 0 * SIZE(AO), %xmm1 + movaps 4 * SIZE(AO), %xmm5 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 +#endif + + +#ifdef LN + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + + addps %xmm0, %xmm1 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + + subps %xmm0, %xmm5 + subps %xmm2, %xmm5 + + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm4 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm15, %xmm5 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm5 + + addps %xmm4, %xmm5 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm15, %xmm5 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm5 + + addps %xmm0, %xmm5 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm1 + + addps %xmm4, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm5, 12 * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm5 + + movaps %xmm0, 16 * SIZE(BO) + movaps %xmm1, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm5, 28 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO2) + movhps %xmm3, 2 * SIZE(CO2) +#else + movaps %xmm1, 0 * SIZE(AO) + movaps %xmm5, 4 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + + movlps %xmm5, 0 * SIZE(CO2) + movhps %xmm5, 2 * SIZE(CO2) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + movq M, I + sarq $2, I # i = (m >> 2) + jle .L39 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(BO), %xmm9 + movaps 4 * SIZE(BO), %xmm11 + movaps 8 * SIZE(BO), %xmm13 + movaps 16 * SIZE(BO), %xmm15 + + movaps 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movaps 4 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movaps 8 * SIZE(AO), %xmm12 + pxor %xmm2, %xmm2 + movaps 12 * SIZE(AO), %xmm14 + pxor %xmm3, %xmm3 + + PREFETCHW -8 * SIZE(CO1) + pxor %xmm4, %xmm4 + PREFETCHW -8 * SIZE(CO2) + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-8, %rax + salq $4, %rax + je .L15 +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + + addq $32 * 2 * SIZE, AO + addq $64 * 2 * SIZE, BO + subq $64 * 2, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm5 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 12 * SIZE(BO), %xmm10 + addps %xmm9, %xmm6 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm10, %xmm7 + movaps 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L16 + ALIGN_4 + +.L18: + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 + xorps %xmm15, %xmm5 + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 + xorps %xmm15, %xmm4 + xorps %xmm15, %xmm6 +#endif +#else + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 + xorps %xmm15, %xmm5 + xorps %xmm15, %xmm7 +#endif + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm1 + unpcklpd %xmm2, %xmm0 + unpckhpd %xmm2, %xmm1 + + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + + movaps 0 * SIZE(B), %xmm2 + movaps 4 * SIZE(B), %xmm3 + movaps 8 * SIZE(B), %xmm6 + movaps 12 * SIZE(B), %xmm7 + + subps %xmm0, %xmm2 + subps %xmm1, %xmm3 + subps %xmm4, %xmm6 + subps %xmm5, %xmm7 +#else + movaps 0 * SIZE(AO), %xmm1 + movaps 4 * SIZE(AO), %xmm3 + movaps 8 * SIZE(AO), %xmm5 + movaps 12 * SIZE(AO), %xmm7 + + subps %xmm0, %xmm1 + subps %xmm4, %xmm3 + subps %xmm2, %xmm5 + subps %xmm6, %xmm7 +#endif + +#ifdef LN + movaps 28 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm7 + addps %xmm0, %xmm7 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + movaps 24 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 20 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm6 + addps %xmm0, %xmm6 + + movaps 16 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 8 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 8 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + movaps 12 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 20 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm6 + addps %xmm0, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 28 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm7 + addps %xmm0, %xmm7 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm1 + + pshufd $0xa0, %xmm3, %xmm2 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm9, %xmm2 + mulps %xmm10, %xmm1 + mulps %xmm10, %xmm3 + + addps %xmm0, %xmm1 + addps %xmm2, %xmm3 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm2 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm2 + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm9, %xmm4 + + mulps %xmm10, %xmm2 + mulps %xmm10, %xmm6 + + subps %xmm0, %xmm5 + subps %xmm4, %xmm7 + + subps %xmm2, %xmm5 + subps %xmm6, %xmm7 + + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm4 + pshufd $0xf5, %xmm5, %xmm5 + + pshufd $0xa0, %xmm7, %xmm6 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm5 + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm4 + xorps %xmm15, %xmm6 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm9, %xmm6 + mulps %xmm10, %xmm5 + mulps %xmm10, %xmm7 + + addps %xmm4, %xmm5 + addps %xmm6, %xmm7 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm5 + + pshufd $0xa0, %xmm7, %xmm2 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm5 + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm9, %xmm2 + mulps %xmm10, %xmm5 + mulps %xmm10, %xmm7 + + addps %xmm0, %xmm5 + addps %xmm2, %xmm7 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm2 + + pshufd $0xa0, %xmm7, %xmm4 + pshufd $0xf5, %xmm7, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm2 + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm9, %xmm4 + + mulps %xmm10, %xmm2 + mulps %xmm10, %xmm6 + + subps %xmm0, %xmm1 + subps %xmm4, %xmm3 + + subps %xmm2, %xmm1 + subps %xmm6, %xmm3 + + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + + pshufd $0xa0, %xmm3, %xmm6 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm4 + xorps %xmm15, %xmm6 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm9, %xmm6 + mulps %xmm10, %xmm1 + mulps %xmm10, %xmm3 + + addps %xmm4, %xmm1 + addps %xmm6, %xmm3 + +#endif + +#ifdef LN + subq $8 * SIZE, CO1 + subq $8 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + movaps %xmm6, 8 * SIZE(B) + movaps %xmm7, 12 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm5, 12 * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm5 + + movaps %xmm0, 16 * SIZE(BO) + movaps %xmm1, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm5, 28 * SIZE(BO) + + pshufd $0x00, %xmm6, %xmm0 + pshufd $0x55, %xmm6, %xmm1 + pshufd $0xaa, %xmm6, %xmm4 + pshufd $0xff, %xmm6, %xmm5 + + movaps %xmm0, 32 * SIZE(BO) + movaps %xmm1, 36 * SIZE(BO) + movaps %xmm4, 40 * SIZE(BO) + movaps %xmm5, 44 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm1 + pshufd $0xaa, %xmm7, %xmm4 + pshufd $0xff, %xmm7, %xmm5 + + movaps %xmm0, 48 * SIZE(BO) + movaps %xmm1, 52 * SIZE(BO) + movaps %xmm4, 56 * SIZE(BO) + movaps %xmm5, 60 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movlps %xmm6, 4 * SIZE(CO1) + movlps %xmm7, 6 * SIZE(CO1) + + movhps %xmm2, 0 * SIZE(CO2) + movhps %xmm3, 2 * SIZE(CO2) + movhps %xmm6, 4 * SIZE(CO2) + movhps %xmm7, 6 * SIZE(CO2) +#else + movaps %xmm1, 0 * SIZE(AO) + movaps %xmm3, 4 * SIZE(AO) + movaps %xmm5, 8 * SIZE(AO) + movaps %xmm7, 12 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm3, 4 * SIZE(CO1) + movhps %xmm3, 6 * SIZE(CO1) + + movlps %xmm5, 0 * SIZE(CO2) + movhps %xmm5, 2 * SIZE(CO2) + movlps %xmm7, 4 * SIZE(CO2) + movhps %xmm7, 6 * SIZE(CO2) +#endif + + +#ifndef LN + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2 * COMPSIZE), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2 * COMPSIZE), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $1, N + je .L999 + ALIGN_4 + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L43 + ALIGN_4 + +.L42: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + decq %rax + jne .L42 + ALIGN_4 + +.L43: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L44: + movsd 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + addq $2 * SIZE, B + addq $8 * SIZE, BO + decq %rax + jne .L44 + ALIGN_4 + +.L50: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c + +#ifndef RT + addq LDC, C +#endif + + testq $1, M + je .L60 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + movhps 2 * SIZE(AO), %xmm8 + movsd 8 * SIZE(AO), %xmm10 + movhps 10 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + shufps $0xb1, %xmm1, %xmm1 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif +#else + xorps %xmm15, %xmm1 +#endif + + addps %xmm1, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 + + subps %xmm0, %xmm2 +#else +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AO), %xmm1 + + subps %xmm0, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm1 + + addps %xmm4, %xmm1 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) +#else + movlps %xmm1, 0 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + testq $2, M + je .L70 + +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 20 * SIZE(BO), %xmm8 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm8, %xmm1 + movaps 12 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm13 + mulps 36 * SIZE(BO), %xmm10 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm10, %xmm1 + movaps 20 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm13 + mulps 44 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 52 * SIZE(BO), %xmm10 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm10, %xmm1 + movaps 28 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm15 + mulps 60 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L68: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + shufps $0xb1, %xmm1, %xmm1 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif +#else + xorps %xmm15, %xmm1 +#endif + + addps %xmm1, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm1 + unpcklpd %xmm2, %xmm0 + unpckhpd %xmm2, %xmm1 + +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 + + subps %xmm0, %xmm2 + subps %xmm1, %xmm3 +#else + movaps 0 * SIZE(AO), %xmm1 + subps %xmm0, %xmm1 +#endif + +#ifdef LN + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm1 + + addps %xmm4, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 8 * SIZE(BO) + movaps %xmm1, 12 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) +#else + movaps %xmm1, 0 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + movq M, I + sarq $2, I # i = (m >> 2) + jle .L79 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + PREFETCHW -8 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 12 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 64 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 16 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps 20 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps 28 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 80 * SIZE(AO), %xmm10 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 32 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 36 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 40 * SIZE(AO), %xmm12 + + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 44 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 96 * SIZE(AO), %xmm12 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 48 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 52 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 56 * SIZE(AO), %xmm14 + + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 60 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 112 * SIZE(AO), %xmm14 + + addq $64 * SIZE, AO + addq $64 * SIZE, BO + + + decq %rax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 8 * SIZE(AO), %xmm8 + + addq $ 8 * SIZE, AO # aoffset += 4 + addq $ 8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L56 + ALIGN_4 + +.L58: + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm5, %xmm5 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm5 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm4 +#endif +#else + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm5 +#endif + + addps %xmm1, %xmm0 + addps %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm1 + unpcklpd %xmm2, %xmm0 + unpckhpd %xmm2, %xmm1 + + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 +#ifdef movsd + xorps %xmm6, %xmm6 +#endif + movsd 4 * SIZE(B), %xmm6 +#ifdef movsd + xorps %xmm7, %xmm7 +#endif + movsd 6 * SIZE(B), %xmm7 + + subps %xmm0, %xmm2 + subps %xmm1, %xmm3 + subps %xmm4, %xmm6 + subps %xmm5, %xmm7 +#else + movaps 0 * SIZE(AO), %xmm1 + movaps 4 * SIZE(AO), %xmm3 + + subps %xmm0, %xmm1 + subps %xmm4, %xmm3 +#endif + +#ifdef LN + movaps 28 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm7 + addps %xmm0, %xmm7 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + movaps 24 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 20 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm6 + addps %xmm0, %xmm6 + + movaps 16 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 8 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 8 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + movaps 12 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 20 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm6 + addps %xmm0, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 28 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm7 + addps %xmm0, %xmm7 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + + pshufd $0xa0, %xmm3, %xmm6 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm4 + xorps %xmm15, %xmm6 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm9, %xmm6 + mulps %xmm10, %xmm1 + mulps %xmm10, %xmm3 + + addps %xmm4, %xmm1 + addps %xmm6, %xmm3 +#endif + +#ifdef LN + subq $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + movlps %xmm6, 4 * SIZE(B) + movlps %xmm7, 6 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 8 * SIZE(BO) + movaps %xmm1, 12 * SIZE(BO) + + pshufd $0x00, %xmm6, %xmm0 + pshufd $0x55, %xmm6, %xmm1 + + movaps %xmm0, 16 * SIZE(BO) + movaps %xmm1, 20 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm1 + + movaps %xmm0, 24 * SIZE(BO) + movaps %xmm1, 28 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movlps %xmm6, 4 * SIZE(CO1) + movlps %xmm7, 6 * SIZE(CO1) +#else + movaps %xmm1, 0 * SIZE(AO) + movaps %xmm3, 4 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm3, 4 * SIZE(CO1) + movhps %xmm3, 6 * SIZE(CO1) +#endif + +#ifndef LN + addq $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, COMPSIZE), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, COMPSIZE), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq %rbx, %rsp + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LT_1x4_nehalem.S b/kernel/x86_64/ztrsm_kernel_LT_1x4_nehalem.S new file mode 100644 index 0000000..e53e297 --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LT_1x4_nehalem.S @@ -0,0 +1,1586 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCHSIZE (8 * 1 + 2) +#define PREFETCH prefetcht0 + +#define ADD1 addpd +#define ADD2 addpd + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + salq $ZBASE_SHIFT, LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, KK + subq OFFSET, KK +#endif + + testq M, M + jle .L999 + + movq N, J + sarq $2, J + NOBRANCH + jle .L20 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + + movq K, %rax + salq $ZBASE_SHIFT + 2, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + prefetchnta -16 * SIZE(BB) + subq $-8 * SIZE, BB + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO1, LDC) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + xorps %xmm12, %xmm12 + prefetcht0 2 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht0 2 * SIZE(CO2, LDC) + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + movaps -14 * SIZE(AO), %xmm0 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps -6 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm12 + movaps 0 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + movaps -12 * SIZE(AO), %xmm0 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps 2 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps 4 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps 6 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -10 * SIZE(AO), %xmm0 + ADD1 %xmm1, %xmm12 + movaps 8 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps 10 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps 12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps 14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-32 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + ADD1 %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + ADD1 %xmm1, %xmm12 + ADD2 %xmm2, %xmm13 + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + xorps %xmm0, %xmm12 + xorps %xmm0, %xmm14 +#else + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + xorps %xmm0, %xmm13 + xorps %xmm0, %xmm15 +#endif + +#else + +#ifndef CONJ + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + xorps %xmm0, %xmm12 + xorps %xmm0, %xmm14 +#else + shufps $0x04, %xmm0, %xmm0 + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + xorps %xmm0, %xmm13 + xorps %xmm0, %xmm15 +#endif + +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm11, %xmm10 + haddpd %xmm13, %xmm12 + haddpd %xmm15, %xmm14 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm11 + movapd -12 * SIZE(AO), %xmm13 + movapd -10 * SIZE(AO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0x04, %xmm7, %xmm7 +#else + shufps $0x40, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 + addpd %xmm12, %xmm13 + addpd %xmm14, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + addpd %xmm8, %xmm9 + + movddup -14 * SIZE(BO), %xmm2 + movddup -13 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm9, %xmm2 + mulpd %xmm8, %xmm3 + subpd %xmm2, %xmm11 + subpd %xmm3, %xmm11 + + movddup -12 * SIZE(BO), %xmm0 + movddup -11 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm1 + subpd %xmm0, %xmm13 + subpd %xmm1, %xmm13 + + movddup -10 * SIZE(BO), %xmm2 + movddup -9 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm9, %xmm2 + mulpd %xmm8, %xmm3 + subpd %xmm2, %xmm15 + subpd %xmm3, %xmm15 + + movddup -6 * SIZE(BO), %xmm0 + movddup -5 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + addpd %xmm10, %xmm11 + + movddup -4 * SIZE(BO), %xmm0 + movddup -3 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm11, %xmm0 + mulpd %xmm10, %xmm1 + subpd %xmm0, %xmm13 + subpd %xmm1, %xmm13 + + movddup -2 * SIZE(BO), %xmm2 + movddup -1 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm11, %xmm2 + mulpd %xmm10, %xmm3 + subpd %xmm2, %xmm15 + subpd %xmm3, %xmm15 + + movddup 4 * SIZE(BO), %xmm0 + movddup 5 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm13, %xmm12 + xorpd %xmm7, %xmm12 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + addpd %xmm12, %xmm13 + + movddup 6 * SIZE(BO), %xmm2 + movddup 7 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm13, %xmm12 + xorpd %xmm7, %xmm12 + mulpd %xmm13, %xmm2 + mulpd %xmm12, %xmm3 + subpd %xmm2, %xmm15 + subpd %xmm3, %xmm15 + + movddup 14 * SIZE(BO), %xmm0 + movddup 15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm15, %xmm14 + xorpd %xmm7, %xmm14 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + addpd %xmm14, %xmm15 +#endif + +#ifdef RT + movddup 14 * SIZE(BO), %xmm0 + movddup 15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm15, %xmm14 + xorpd %xmm7, %xmm14 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + addpd %xmm14, %xmm15 + + movddup 12 * SIZE(BO), %xmm2 + movddup 13 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm15, %xmm14 + xorpd %xmm7, %xmm14 + mulpd %xmm15, %xmm2 + mulpd %xmm14, %xmm3 + subpd %xmm2, %xmm13 + subpd %xmm3, %xmm13 + + movddup 10 * SIZE(BO), %xmm0 + movddup 11 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm15, %xmm14 + xorpd %xmm7, %xmm14 + mulpd %xmm15, %xmm0 + mulpd %xmm14, %xmm1 + subpd %xmm0, %xmm11 + subpd %xmm1, %xmm11 + + movddup 8 * SIZE(BO), %xmm2 + movddup 9 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm15, %xmm14 + xorpd %xmm7, %xmm14 + mulpd %xmm15, %xmm2 + mulpd %xmm14, %xmm3 + subpd %xmm2, %xmm9 + subpd %xmm3, %xmm9 + + movddup 4 * SIZE(BO), %xmm0 + movddup 5 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm13, %xmm12 + xorpd %xmm7, %xmm12 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + addpd %xmm12, %xmm13 + + movddup 2 * SIZE(BO), %xmm0 + movddup 3 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm13, %xmm12 + xorpd %xmm7, %xmm12 + mulpd %xmm13, %xmm0 + mulpd %xmm12, %xmm1 + subpd %xmm0, %xmm11 + subpd %xmm1, %xmm11 + + movddup 0 * SIZE(BO), %xmm2 + movddup 1 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm13, %xmm12 + xorpd %xmm7, %xmm12 + mulpd %xmm13, %xmm2 + mulpd %xmm12, %xmm3 + subpd %xmm2, %xmm9 + subpd %xmm3, %xmm9 + + movddup -6 * SIZE(BO), %xmm0 + movddup -5 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + addpd %xmm10, %xmm11 + + movddup -8 * SIZE(BO), %xmm2 + movddup -7 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm11, %xmm2 + mulpd %xmm10, %xmm3 + subpd %xmm2, %xmm9 + subpd %xmm3, %xmm9 + + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm11, 0 * SIZE(CO1, LDC) + movhpd %xmm11, 1 * SIZE(CO1, LDC) + movsd %xmm13, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movsd %xmm15, 0 * SIZE(CO2, LDC) + movhpd %xmm15, 1 * SIZE(CO2, LDC) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) + movapd %xmm13, -12 * SIZE(BO) + movapd %xmm15, -10 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) + movapd %xmm13, -12 * SIZE(AO) + movapd %xmm15, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L11 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L20: + testq $2, N + BRANCH + jle .L30 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + ALIGN_4 + +.L21: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO2) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -12 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -8 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -6 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -10 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + ADD1 %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + ADD1 %xmm3, %xmm10 + ADD2 %xmm4, %xmm11 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 +#else + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#endif + +#else + +#ifndef CONJ + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 +#else + shufps $0x04, %xmm0, %xmm0 + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#endif + +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm11, %xmm10 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0x04, %xmm7, %xmm7 +#else + shufps $0x40, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + addpd %xmm8, %xmm9 + + movddup -14 * SIZE(BO), %xmm2 + movddup -13 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm9, %xmm2 + mulpd %xmm8, %xmm3 + subpd %xmm2, %xmm11 + subpd %xmm3, %xmm11 + + movddup -10 * SIZE(BO), %xmm0 + movddup -9 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + addpd %xmm10, %xmm11 +#endif + +#ifdef RT + movddup -10 * SIZE(BO), %xmm0 + movddup -9 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + addpd %xmm10, %xmm11 + + movddup -12 * SIZE(BO), %xmm2 + movddup -11 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm11, %xmm2 + mulpd %xmm10, %xmm3 + subpd %xmm2, %xmm9 + subpd %xmm3, %xmm9 + + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm11, 0 * SIZE(CO2) + movhpd %xmm11, 1 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L21 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L30: + testq $1, N + BRANCH + jle .L999 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm10 + movaps -14 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L32 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + ALIGN_3 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + je .L38 + ALIGN_3 + +.L36: + ADD1 %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm8 +#else + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm9 +#endif + +#else + +#ifndef CONJ + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm8 +#else + shufps $0x04, %xmm0, %xmm0 + xorps %xmm0, %xmm9 +#endif + +#endif + + haddpd %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + + subpd %xmm8, %xmm9 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm9 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0x04, %xmm7, %xmm7 +#else + shufps $0x40, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + addpd %xmm8, %xmm9 +#endif + +#if defined(RN) || defined(RT) + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + decq I + BRANCH + jg .L31 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x1_atom.S b/kernel/x86_64/ztrsm_kernel_LT_2x1_atom.S new file mode 100644 index 0000000..a1760ad --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LT_2x1_atom.S @@ -0,0 +1,995 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define BB %rbx +#define KK %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define KKK 56(%rsp) +#define AORIG 64(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define KKK 232(%rsp) +#define AORIG 240(%rsp) +#endif + + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 8 + 3) + +#ifndef CONJ +#define ADDSD1 addsd +#define ADDSD2 addsd +#define ADDSD3 addsd +#define ADDSD4 subsd + +#elif defined(LN) || defined(LT) +#define ADDSD1 addsd +#define ADDSD2 addsd +#define ADDSD3 subsd +#define ADDSD4 addsd +#else +#define ADDSD1 addsd +#define ADDSD2 subsd +#define ADDSD3 addsd +#define ADDSD4 addsd +#endif + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + movq KK, OFFSET + + salq $ZBASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, KK + subq OFFSET, KK +#endif + + movq N, J + testq N, N + jle .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, B + subq LDC, C +#endif + + movq C, CO1 + +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif +#ifdef LT + movq OFFSET, KK +#endif + + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax), BB + + movq M, I + sarq $1, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + prefetcht0 0 * SIZE(BB) + subq $-8 * SIZE, BB + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + prefetcht0 3 * SIZE(CO1) + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + ADDSD2 %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm15 + PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO) + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD3 %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD3 %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 12 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD3 %xmm4, %xmm10 + movsd 13 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 6 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 14 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 15 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + subq $-16 * SIZE, AO + + ADDSD4 %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + addq $ 8 * SIZE, BO + + ADDSD3 %xmm4, %xmm10 + movsd 1 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + decq %rax + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 0 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 2 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 1 * SIZE(BO), %xmm3 + + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + BRANCH + je .L18 + ALIGN_4 + +.L16: + ADDSD2 %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD3 %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L16 + ALIGN_4 + +.L18: + ADDSD2 %xmm2, %xmm13 + ADDSD3 %xmm7, %xmm14 + ADDSD4 %xmm6, %xmm15 + + addsd %xmm11, %xmm8 + addsd %xmm9, %xmm10 + addsd %xmm15, %xmm12 + addsd %xmm13, %xmm14 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm1 + movsd 2 * SIZE(BO), %xmm2 + movsd 3 * SIZE(BO), %xmm3 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm1 + movsd 2 * SIZE(AO), %xmm2 + movsd 3 * SIZE(AO), %xmm3 +#endif + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm1 + subsd %xmm12, %xmm2 + subsd %xmm14, %xmm3 + +#ifdef LN + movsd 6 * SIZE(AO), %xmm6 + movsd 7 * SIZE(AO), %xmm7 + + movaps %xmm2, %xmm5 + movaps %xmm3, %xmm4 + + mulsd %xmm6, %xmm2 + mulsd %xmm6, %xmm3 + movsd 4 * SIZE(AO), %xmm6 + mulsd %xmm7, %xmm5 + mulsd %xmm7, %xmm4 + movsd 5 * SIZE(AO), %xmm7 + + ADDSD4 %xmm4, %xmm2 + ADDSD3 %xmm5, %xmm3 + + movaps %xmm2, %xmm4 + movaps %xmm3, %xmm5 + + mulsd %xmm6, %xmm4 + mulsd %xmm7, %xmm5 + mulsd %xmm3, %xmm6 + mulsd %xmm2, %xmm7 + + subsd %xmm4, %xmm0 + subsd %xmm6, %xmm1 + movsd 0 * SIZE(AO), %xmm6 + + ADDSD3 %xmm5, %xmm0 + ADDSD4 %xmm7, %xmm1 + movsd 1 * SIZE(AO), %xmm7 + + movaps %xmm0, %xmm5 + movaps %xmm1, %xmm4 + + mulsd %xmm6, %xmm0 + mulsd %xmm6, %xmm1 + mulsd %xmm7, %xmm5 + mulsd %xmm7, %xmm4 + + ADDSD4 %xmm4, %xmm0 + ADDSD3 %xmm5, %xmm1 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm6 + movsd 1 * SIZE(AO), %xmm7 + + movaps %xmm0, %xmm5 + movaps %xmm1, %xmm4 + + mulsd %xmm6, %xmm0 + mulsd %xmm6, %xmm1 + movsd 2 * SIZE(AO), %xmm6 + mulsd %xmm7, %xmm5 + mulsd %xmm7, %xmm4 + movsd 3 * SIZE(AO), %xmm7 + + ADDSD4 %xmm4, %xmm0 + ADDSD3 %xmm5, %xmm1 + + movaps %xmm0, %xmm4 + movaps %xmm1, %xmm5 + + mulsd %xmm6, %xmm4 + mulsd %xmm7, %xmm5 + mulsd %xmm1, %xmm6 + mulsd %xmm0, %xmm7 + + subsd %xmm4, %xmm2 + subsd %xmm6, %xmm3 + movsd 6 * SIZE(AO), %xmm6 + + ADDSD3 %xmm5, %xmm2 + ADDSD4 %xmm7, %xmm3 + movsd 7 * SIZE(AO), %xmm7 + + movaps %xmm2, %xmm5 + movaps %xmm3, %xmm4 + + mulsd %xmm6, %xmm2 + mulsd %xmm6, %xmm3 + mulsd %xmm7, %xmm5 + mulsd %xmm7, %xmm4 + + ADDSD4 %xmm4, %xmm2 + ADDSD3 %xmm5, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BO), %xmm8 + movaps %xmm0, %xmm5 + movsd 1 * SIZE(BO), %xmm9 + movaps %xmm1, %xmm4 + movaps %xmm2, %xmm7 + movaps %xmm3, %xmm6 + + mulsd %xmm8, %xmm0 + mulsd %xmm8, %xmm1 + mulsd %xmm9, %xmm5 + mulsd %xmm9, %xmm4 + + ADDSD4 %xmm4, %xmm0 + mulsd %xmm8, %xmm2 + ADDSD2 %xmm5, %xmm1 + mulsd %xmm8, %xmm3 + mulsd %xmm9, %xmm7 + mulsd %xmm9, %xmm6 + + ADDSD4 %xmm6, %xmm2 + ADDSD2 %xmm7, %xmm3 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) + movsd %xmm2, 2 * SIZE(CO1) + movsd %xmm3, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm1, 1 * SIZE(BO) + movsd %xmm2, 2 * SIZE(BO) + movsd %xmm3, 3 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm1, 1 * SIZE(AO) + movsd %xmm2, 2 * SIZE(AO) + movsd %xmm3, 3 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + decq I # i -- + jg .L10 + ALIGN_4 + +.L20: + testq $1, M + jle .L99 + +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + movsd 3 * SIZE(AO), %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADDSD2 %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + ADDSD1 %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + ADDSD3 %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm8 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + ADDSD3 %xmm7, %xmm10 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 6 * SIZE(BO), %xmm1 + + ADDSD1 %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + ADDSD3 %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 8 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm8 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + ADDSD3 %xmm7, %xmm10 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 9 * SIZE(BO), %xmm3 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + BRANCH + je .L29 + ALIGN_4 + +.L26: + ADDSD2 %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + mulsd %xmm3, %xmm2 + ADDSD1 %xmm0, %xmm8 + movsd 2 * SIZE(AO), %xmm0 + + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + ADDSD3 %xmm4, %xmm10 + movsd 3 * SIZE(AO), %xmm4 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L26 + ALIGN_4 + +.L29: + ADDSD2 %xmm2, %xmm9 + ADDSD4 %xmm6, %xmm11 + + addsd %xmm11, %xmm8 + addsd %xmm9, %xmm10 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm1 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm1 +#endif + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm1 + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AO), %xmm6 + movaps %xmm0, %xmm5 + movsd 1 * SIZE(AO), %xmm7 + movaps %xmm1, %xmm4 + + mulsd %xmm6, %xmm0 + mulsd %xmm6, %xmm1 + mulsd %xmm7, %xmm5 + mulsd %xmm7, %xmm4 + + ADDSD4 %xmm4, %xmm0 + ADDSD3 %xmm5, %xmm1 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BO), %xmm8 + movaps %xmm0, %xmm5 + movsd 1 * SIZE(BO), %xmm9 + movaps %xmm1, %xmm4 + + mulsd %xmm8, %xmm0 + mulsd %xmm8, %xmm1 + mulsd %xmm9, %xmm5 + mulsd %xmm9, %xmm4 + + ADDSD4 %xmm4, %xmm0 + ADDSD2 %xmm5, %xmm1 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm1, 1 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm1, 1 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + decq J # j -- + jg .L01 + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x2_core2.S b/kernel/x86_64/ztrsm_kernel_LT_2x2_core2.S new file mode 100644 index 0000000..93cbcad --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LT_2x2_core2.S @@ -0,0 +1,2162 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define POSINV 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define AORIG 48(%rsp) +#define BORIG 56(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCH_W (PREFETCH_R) + +#define PREFETCHSIZE (8 * 17 + 2) +#define PREFETCH prefetcht0 + +#ifndef CONJ +#define NN +#else +#if defined(LN) || defined(LT) +#define CN +#else +#define NC +#endif +#endif + +#define ADD1 addpd +#define ADD2 addpd + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, %rax + + movq %rsp, %r15 # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq %rax, KK + movq %rax, OFFSET + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + pcmpeqb %xmm15, %xmm15 + psllq $63, %xmm15 # Generate mask + pxor %xmm2, %xmm2 + + movlpd %xmm2, 0 + POSINV + movlpd %xmm15, 8 + POSINV + + salq $ZBASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $1, J # j = (n >> 2) + jle .L100 + ALIGN_4 + +.L01: +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq 16 * SIZE + BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + + addq %rax, %rax + ALIGN_4 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + movddup -14 * SIZE(B), %xmm10 + movddup -13 * SIZE(B), %xmm11 + movddup -12 * SIZE(B), %xmm12 + movddup -11 * SIZE(B), %xmm13 + movddup -10 * SIZE(B), %xmm14 + movddup -9 * SIZE(B), %xmm15 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 8) * SIZE(BO) + + movapd %xmm12, -8 * SIZE(BO) + movapd %xmm13, -6 * SIZE(BO) + movapd %xmm14, -4 * SIZE(BO) + movapd %xmm15, -2 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-16 * SIZE, BO + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + movddup -14 * SIZE(B), %xmm10 + movddup -13 * SIZE(B), %xmm11 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $ 8 * SIZE, BO + + decq %rax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + +#ifndef RT + leaq (C, LDC, 2), C +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: + leaq (PREFETCH_R + 0) * SIZE(B), BB + +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + prefetcht2 0 * SIZE(BB) + +#ifdef LN + pxor %xmm8, %xmm8 + prefetcht1 -3 * SIZE(CO1) + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + prefetcht1 -3 * SIZE(CO2) + pxor %xmm11, %xmm11 +#else + pxor %xmm8, %xmm8 + prefetcht1 3 * SIZE(CO1) + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + prefetcht1 3 * SIZE(CO2) + pxor %xmm11, %xmm11 +#endif + + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + pxor %xmm14, %xmm14 + pxor %xmm15, %xmm15 + + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + + subq $-8 * SIZE, BB + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + ADD1 %xmm2, %xmm10 + movapd -16 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm14 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm11 + movapd -14 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm15 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + movapd -12 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm9 + movapd -10 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + movapd -12 * SIZE(AO), %xmm0 + ADD1 %xmm2, %xmm10 + movapd -8 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm14 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + movapd -6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + movapd -4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + movapd -2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + movapd -8 * SIZE(AO), %xmm0 + ADD1 %xmm2, %xmm10 + movapd 0 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm14 + movapd %xmm2, %xmm3 + movapd -6 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm11 + movapd 2 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm15 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + movapd 4 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm9 + movapd 6 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + movapd -4 * SIZE(AO), %xmm0 + ADD1 %xmm2, %xmm10 + ADD1 %xmm3, %xmm14 + movapd 8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + movapd -2 * SIZE(AO), %xmm1 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm11 + movapd 10 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm15 + subq $-32 * SIZE, BO + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + movapd -20 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + subq $-16 * SIZE, AO + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + movapd -18 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + subq $1, %rax + BRANCH + BRANCH + jg .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm7 + + andq $3, %rax + BRANCH + BRANCH + je .L19 + ALIGN_4 + +.L16: + ADD1 %xmm2, %xmm10 + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + movapd -16 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L19: + ADD1 %xmm2, %xmm10 + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm11, %xmm11 + SHUFPD_1 %xmm13, %xmm13 + SHUFPD_1 %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm9 + xorpd %xmm7, %xmm11 + xorpd %xmm7, %xmm13 + xorpd %xmm7, %xmm15 +#else + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 + subpd %xmm11, %xmm10 + subpd %xmm13, %xmm12 + subpd %xmm15, %xmm14 +#else + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm13, %xmm12 + addpd %xmm15, %xmm14 +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm11 + movapd -12 * SIZE(B), %xmm13 + movapd -10 * SIZE(B), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm13 + movapd -12 * SIZE(AO), %xmm11 + movapd -10 * SIZE(AO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm7, %xmm7 +#endif + +#ifdef LN + movddup -10 * SIZE(AO), %xmm0 + movddup -9 * SIZE(AO), %xmm1 + movddup -12 * SIZE(AO), %xmm2 + movddup -11 * SIZE(AO), %xmm3 + movddup -16 * SIZE(AO), %xmm4 + movddup -15 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + + addpd %xmm12, %xmm13 + addpd %xmm14, %xmm15 + + movapd %xmm13, %xmm8 + movapd %xmm15, %xmm10 + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm9 + subpd %xmm14, %xmm11 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + movddup -14 * SIZE(AO), %xmm2 + movddup -13 * SIZE(AO), %xmm3 + movddup -10 * SIZE(AO), %xmm4 + movddup -9 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 + + movapd %xmm9, %xmm8 + movapd %xmm11, %xmm10 + pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm11, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm13 + subpd %xmm10, %xmm15 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 + + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + mulpd %xmm4, %xmm15 + mulpd %xmm5, %xmm14 + + addpd %xmm12, %xmm13 + addpd %xmm14, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -10 * SIZE(B), %xmm4 + movddup -9 * SIZE(B), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 + + movapd %xmm9, %xmm8 + movapd %xmm13, %xmm10 + pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm13, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm11 + subpd %xmm10, %xmm15 + subpd %xmm12, %xmm11 + subpd %xmm14, %xmm15 + + pshufd $0x4e, %xmm11, %xmm10 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm14 + + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + mulpd %xmm4, %xmm15 + mulpd %xmm5, %xmm14 + + addpd %xmm10, %xmm11 + addpd %xmm14, %xmm15 +#endif + +#ifdef RT + movddup -10 * SIZE(B), %xmm0 + movddup -9 * SIZE(B), %xmm1 + movddup -12 * SIZE(B), %xmm2 + movddup -11 * SIZE(B), %xmm3 + movddup -16 * SIZE(B), %xmm4 + movddup -15 * SIZE(B), %xmm5 + + pshufd $0x4e, %xmm11, %xmm10 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm14 + + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + + addpd %xmm10, %xmm11 + addpd %xmm14, %xmm15 + + movapd %xmm11, %xmm8 + movapd %xmm15, %xmm10 + pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm13 + subpd %xmm12, %xmm9 + subpd %xmm14, %xmm13 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm13, 2 * SIZE(CO1) + movhpd %xmm13, 3 * SIZE(CO1) + + movsd %xmm11, 0 * SIZE(CO2) + movhpd %xmm11, 1 * SIZE(CO2) + movsd %xmm15, 2 * SIZE(CO2) + movhpd %xmm15, 3 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + movapd %xmm13, -12 * SIZE(B) + movapd %xmm15, -10 * SIZE(B) + + movddup %xmm9, %xmm8 + unpckhpd %xmm9, %xmm9 + movddup %xmm11, %xmm10 + unpckhpd %xmm11, %xmm11 + movddup %xmm13, %xmm12 + unpckhpd %xmm13, %xmm13 + movddup %xmm15, %xmm14 + unpckhpd %xmm15, %xmm15 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + movapd %xmm12, -8 * SIZE(BO) + movapd %xmm13, -6 * SIZE(BO) + movapd %xmm14, -4 * SIZE(BO) + movapd %xmm15, -2 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm13, -14 * SIZE(AO) + movapd %xmm11, -12 * SIZE(AO) + movapd %xmm15, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L10 + ALIGN_4 + +.L30: + testq $1, M + jle .L99 + +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + addq %rax, AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L42 + +.L41: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -14 * SIZE(AO), %xmm0 + movapd -8 * SIZE(BO), %xmm2 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm4 + movapd -2 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -12 * SIZE(AO), %xmm0 + movapd 0 * SIZE(BO), %xmm2 + movapd 2 * SIZE(BO), %xmm3 + movapd 4 * SIZE(BO), %xmm4 + movapd 6 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -10 * SIZE(AO), %xmm0 + movapd 8 * SIZE(BO), %xmm2 + movapd 10 * SIZE(BO), %xmm3 + movapd 12 * SIZE(BO), %xmm4 + movapd 14 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jne .L41 + +.L42: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm7 + + andq $3, %rax # if (k & 1) + BRANCH + jle .L44 + +.L43: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L43 + ALIGN_4 + +.L44: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm9 + xorpd %xmm7, %xmm11 +#else + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 + subpd %xmm11, %xmm10 +#else + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -10 * SIZE(B), %xmm4 + movddup -9 * SIZE(B), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm11 + subpd %xmm12, %xmm11 + + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm10 + + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + + addpd %xmm10, %xmm11 +#endif + +#ifdef RT + movddup -10 * SIZE(B), %xmm0 + movddup -9 * SIZE(B), %xmm1 + movddup -12 * SIZE(B), %xmm2 + movddup -11 * SIZE(B), %xmm3 + movddup -16 * SIZE(B), %xmm4 + movddup -15 * SIZE(B), %xmm5 + + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm10, %xmm11 + + movapd %xmm11, %xmm8 + pshufd $0x4e, %xmm11, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm9 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + + movsd %xmm11, 0 * SIZE(CO2) + movhpd %xmm11, 1 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + + movddup %xmm9, %xmm8 + unpckhpd %xmm9, %xmm9 + movddup %xmm11, %xmm10 + unpckhpd %xmm11, %xmm11 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) + +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2 * COMPSIZE), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J # j -- + jg .L01 + +.L100: + testq $1, N + jle .L999 + +.L101: +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L103 + ALIGN_4 + +.L102: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + movddup -14 * SIZE(B), %xmm10 + movddup -13 * SIZE(B), %xmm11 + movddup -12 * SIZE(B), %xmm12 + movddup -11 * SIZE(B), %xmm13 + movddup -10 * SIZE(B), %xmm14 + movddup -9 * SIZE(B), %xmm15 + + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + movapd %xmm10, 4 * SIZE(BO) + movapd %xmm11, 6 * SIZE(BO) + movapd %xmm12, 8 * SIZE(BO) + movapd %xmm13, 10 * SIZE(BO) + movapd %xmm14, 12 * SIZE(BO) + movapd %xmm15, 14 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-16 * SIZE, BO + decq %rax + jne .L102 + ALIGN_4 + +.L103: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + + addq $4 * SIZE, BO + addq $2 * SIZE, B + decq %rax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + prefetcht0 3 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L112 + +.L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -8 * SIZE(AO), %xmm0 + movapd -6 * SIZE(AO), %xmm1 + + movapd -8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -4 * SIZE(AO), %xmm0 + movapd -2 * SIZE(AO), %xmm1 + + movapd -4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm7 + andq $3, %rax # if (k & 1) + BRANCH + jle .L114 + +.L113: + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L113 + ALIGN_4 + +.L114: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm9 + xorpd %xmm7, %xmm13 +#else + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 + subpd %xmm13, %xmm12 +#else + addpd %xmm9, %xmm8 + addpd %xmm13, %xmm12 +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm13 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm7, %xmm7 +#endif + +#ifdef LN + movddup -10 * SIZE(AO), %xmm0 + movddup -9 * SIZE(AO), %xmm1 + movddup -12 * SIZE(AO), %xmm2 + movddup -11 * SIZE(AO), %xmm3 + movddup -16 * SIZE(AO), %xmm4 + movddup -15 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm12, %xmm13 + + movapd %xmm13, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm9 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + movddup -14 * SIZE(AO), %xmm2 + movddup -13 * SIZE(AO), %xmm3 + movddup -10 * SIZE(AO), %xmm4 + movddup -9 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm13 + subpd %xmm12, %xmm13 + + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + + addpd %xmm12, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm13, 2 * SIZE(CO1) + movhpd %xmm13, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm13, -14 * SIZE(B) + + movddup %xmm9, %xmm8 + unpckhpd %xmm9, %xmm9 + movddup %xmm13, %xmm12 + unpckhpd %xmm13, %xmm13 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm12, -12 * SIZE(BO) + movapd %xmm13, -10 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm13, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L110 + ALIGN_4 + +.L130: + testq $1, M + jle .L199 + ALIGN_4 + +.L140: +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L142 + +.L141: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + movapd -8 * SIZE(BO), %xmm2 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm4 + movapd -2 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L141 + +.L142: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm7 + + andq $3, %rax # if (k & 1) + BRANCH + jle .L144 + +.L143: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L143 + ALIGN_4 + +.L144: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + + SHUFPD_1 %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm9 +#else + xorpd %xmm7, %xmm8 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 +#else + addpd %xmm9, %xmm8 +#endif + + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm9 + + subpd %xmm8, %xmm9 +#else + movapd -16 * SIZE(AO), %xmm9 + + subpd %xmm8, %xmm9 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm7, %xmm7 +#endif + +#ifdef LN + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + + movddup %xmm9, %xmm8 + unpckhpd %xmm9, %xmm9 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L199: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1 * COMPSIZE), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + + +.L999: + movq %r15, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86_64/ztrsm_kernel_LT_2x2_penryn.S new file mode 100644 index 0000000..e38e87e --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LT_2x2_penryn.S @@ -0,0 +1,2016 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCHSIZE (8 * 21 + 6) +#define PREFETCH prefetcht0 + +#define ADD1 addpd +#define ADD2 addpd + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + salq $ZBASE_SHIFT, LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, KK + subq OFFSET, KK +#endif + + movq N, J + sarq $1, J + NOBRANCH + jle .L40 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + + movq K, %rax + salq $ZBASE_SHIFT + 1, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + sarq $1, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + prefetcht2 -16 * SIZE(BB) + subq $-8 * SIZE, BB + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm3, %xmm3 + movaps -14 * SIZE(AO), %xmm1 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BO), %xmm2 + + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + +#ifdef LN + prefetcht0 -4 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + prefetcht0 -4 * SIZE(CO2) +#else + prefetcht0 3 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + prefetcht0 3 * SIZE(CO2) +#endif + movapd %xmm4, %xmm10 + movapd %xmm4, %xmm11 + + movapd %xmm4, %xmm12 + movapd %xmm4, %xmm13 + movapd %xmm4, %xmm14 + movapd %xmm4, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + ADD1 %xmm3, %xmm12 + movaps -14 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps -10 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps -6 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -4 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps -2 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 0 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps 2 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 4 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 6 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps 6 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 8 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 10 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps 10 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 12 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 14 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps 14 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 16 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + subq $-32 * SIZE, AO + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -14 * SIZE(AO), %xmm1 + + subq $-32 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + ADD1 %xmm3, %xmm12 + movaps -14 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $2, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + ADD1 %xmm3, %xmm12 + pcmpeqb %xmm7, %xmm7 + ADD1 %xmm4, %xmm14 + psllq $63, %xmm7 + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + +#ifndef CONJ + pshufd $0x40, %xmm7, %xmm0 + shufps $0x04, %xmm7, %xmm7 + + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm10 + pxor %xmm0, %xmm12 + pxor %xmm0, %xmm14 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm7, %xmm0 +#else + pshufd $0x04, %xmm7, %xmm0 +#endif + shufps $0x40, %xmm7, %xmm7 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 + pxor %xmm0, %xmm13 + pxor %xmm0, %xmm15 +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm11, %xmm10 + haddpd %xmm13, %xmm12 + haddpd %xmm15, %xmm14 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm11 + subpd %xmm10, %xmm13 + subpd %xmm14, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm13 + movapd -12 * SIZE(AO), %xmm11 + movapd -10 * SIZE(AO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm11 + subpd %xmm10, %xmm13 + subpd %xmm14, %xmm15 +#endif + +#ifdef LN + movddup -10 * SIZE(AO), %xmm0 + movddup -9 * SIZE(AO), %xmm1 + movddup -12 * SIZE(AO), %xmm2 + movddup -11 * SIZE(AO), %xmm3 + movddup -16 * SIZE(AO), %xmm4 + movddup -15 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + + addpd %xmm12, %xmm13 + addpd %xmm14, %xmm15 + + movapd %xmm13, %xmm8 + movapd %xmm15, %xmm10 + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm9 + subpd %xmm14, %xmm11 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + movddup -14 * SIZE(AO), %xmm2 + movddup -13 * SIZE(AO), %xmm3 + movddup -10 * SIZE(AO), %xmm4 + movddup -9 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 + + movapd %xmm9, %xmm8 + movapd %xmm11, %xmm10 + pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm11, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm13 + subpd %xmm10, %xmm15 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 + + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + mulpd %xmm4, %xmm15 + mulpd %xmm5, %xmm14 + + addpd %xmm12, %xmm13 + addpd %xmm14, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + movddup -14 * SIZE(BO), %xmm2 + movddup -13 * SIZE(BO), %xmm3 + movddup -10 * SIZE(BO), %xmm4 + movddup -9 * SIZE(BO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 + + movapd %xmm9, %xmm8 + movapd %xmm13, %xmm10 + pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm13, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm11 + subpd %xmm10, %xmm15 + subpd %xmm12, %xmm11 + subpd %xmm14, %xmm15 + + pshufd $0x4e, %xmm11, %xmm10 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm14 + + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + mulpd %xmm4, %xmm15 + mulpd %xmm5, %xmm14 + + addpd %xmm10, %xmm11 + addpd %xmm14, %xmm15 +#endif + +#ifdef RT + movddup -10 * SIZE(BO), %xmm0 + movddup -9 * SIZE(BO), %xmm1 + movddup -12 * SIZE(BO), %xmm2 + movddup -11 * SIZE(BO), %xmm3 + movddup -16 * SIZE(BO), %xmm4 + movddup -15 * SIZE(BO), %xmm5 + + pshufd $0x4e, %xmm11, %xmm10 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm14 + + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + + addpd %xmm10, %xmm11 + addpd %xmm14, %xmm15 + + movapd %xmm11, %xmm8 + movapd %xmm15, %xmm10 + pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm13 + subpd %xmm12, %xmm9 + subpd %xmm14, %xmm13 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm13, 2 * SIZE(CO1) + movhpd %xmm13, 3 * SIZE(CO1) + + movsd %xmm11, 0 * SIZE(CO2) + movhpd %xmm11, 1 * SIZE(CO2) + movsd %xmm15, 2 * SIZE(CO2) + movhpd %xmm15, 3 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) + movapd %xmm13, -12 * SIZE(BO) + movapd %xmm15, -10 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm13, -14 * SIZE(AO) + movapd %xmm11, -12 * SIZE(AO) + movapd %xmm15, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $1, M + BRANCH + jle .L39 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -16 * SIZE(BO), %xmm2 + movaps -14 * SIZE(BO), %xmm3 + + pxor %xmm3, %xmm3 + pxor %xmm5, %xmm5 + + movapd %xmm3, %xmm8 + movapd %xmm3, %xmm9 + movapd %xmm3, %xmm12 + movapd %xmm3, %xmm13 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_4 + +.L22: + ADD1 %xmm3, %xmm12 + movaps -14 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm3, %xmm12 + movaps -10 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + + ADD1 %xmm3, %xmm12 + movaps -6 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -4 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -10 * SIZE(AO), %xmm0 + + ADD1 %xmm3, %xmm12 + movaps -2 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + subq $ -8 * SIZE, AO + + ADD1 %xmm2, %xmm8 + movaps 0 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + ADD1 %xmm3, %xmm12 + movaps -14 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + ADD1 %xmm3, %xmm12 + pcmpeqb %xmm7, %xmm7 + ADD2 %xmm5, %xmm13 + psllq $63, %xmm7 + +#ifndef CONJ + pshufd $0x40, %xmm7, %xmm0 + shufps $0x04, %xmm7, %xmm7 + + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm12 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm7, %xmm0 +#else + pshufd $0x04, %xmm7, %xmm0 +#endif + shufps $0x40, %xmm7, %xmm7 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm13 +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm13, %xmm12 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm11 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + movddup -14 * SIZE(BO), %xmm2 + movddup -13 * SIZE(BO), %xmm3 + movddup -10 * SIZE(BO), %xmm4 + movddup -9 * SIZE(BO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm11 + subpd %xmm12, %xmm11 + + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm10 + + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + + addpd %xmm10, %xmm11 +#endif + +#ifdef RT + movddup -10 * SIZE(BO), %xmm0 + movddup -9 * SIZE(BO), %xmm1 + movddup -12 * SIZE(BO), %xmm2 + movddup -11 * SIZE(BO), %xmm3 + movddup -16 * SIZE(BO), %xmm4 + movddup -15 * SIZE(BO), %xmm5 + + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm10, %xmm11 + + movapd %xmm11, %xmm8 + pshufd $0x4e, %xmm11, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm9 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + + movsd %xmm11, 0 * SIZE(CO2) + movhpd %xmm11, 1 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L40: + testq $1, N + BRANCH + jle .L999 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + + movq K, %rax + salq $ZBASE_SHIFT + 1, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + prefetcht2 -16 * SIZE(BB) + subq $-4 * SIZE, BB + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -16 * SIZE(BO), %xmm2 + + prefetcht0 3 * SIZE(CO1) + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_4 + +.L52: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -10 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + pshufd $0x40, %xmm7, %xmm0 + shufps $0x04, %xmm7, %xmm7 + + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm12 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm7, %xmm0 +#else + pshufd $0x04, %xmm7, %xmm0 +#endif + shufps $0x40, %xmm7, %xmm7 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm13 +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm13, %xmm12 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm13 +#endif + +#ifdef LN + movddup -10 * SIZE(AO), %xmm0 + movddup -9 * SIZE(AO), %xmm1 + movddup -12 * SIZE(AO), %xmm2 + movddup -11 * SIZE(AO), %xmm3 + movddup -16 * SIZE(AO), %xmm4 + movddup -15 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm12, %xmm13 + + movapd %xmm13, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm9 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + movddup -14 * SIZE(AO), %xmm2 + movddup -13 * SIZE(AO), %xmm3 + movddup -10 * SIZE(AO), %xmm4 + movddup -9 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm13 + subpd %xmm12, %xmm13 + + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + + addpd %xmm12, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm13, 2 * SIZE(CO1) + movhpd %xmm13, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm13, -14 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm13, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L79 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + movaps -16 * SIZE(BO), %xmm2 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm7, %xmm9 + movaps -14 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm10 + ADD2 %xmm7, %xmm11 + movaps -12 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -10 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm7, %xmm9 + movaps -10 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm10 + ADD2 %xmm7, %xmm11 + movaps -8 * SIZE(BO), %xmm2 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm7, %xmm9 + movaps -14 * SIZE(BO), %xmm2 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + pshufd $0x40, %xmm7, %xmm0 + shufps $0x04, %xmm7, %xmm7 + + pxor %xmm0, %xmm8 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm7, %xmm0 +#else + pshufd $0x04, %xmm7, %xmm0 +#endif + shufps $0x40, %xmm7, %xmm7 + + pxor %xmm0, %xmm9 +#endif + + haddpd %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + + subpd %xmm8, %xmm9 +#else + movapd -16 * SIZE(AO), %xmm9 + + subpd %xmm8, %xmm9 +#endif + +#ifdef LN + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) +#endif + + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S new file mode 100644 index 0000000..dabc97c --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S @@ -0,0 +1,2266 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define POSINV 0(%rsp) +#define ALPHA_R 16(%rsp) +#define ALPHA_I 32(%rsp) +#define OFFSET 40(%rsp) +#define KK 48(%rsp) +#define KKK 56(%rsp) +#define AORIG 64(%rsp) +#define BORIG 72(%rsp) +#define BUFFER 128(%rsp) + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#define KERNEL1(xx) \ + mulpd %xmm8, %xmm9 ;\ + addpd %xmm9, %xmm0 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm11, %xmm1 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm8, %xmm13 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm8, %xmm3 ;\ + movapd 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL2(xx) \ + mulpd %xmm10, %xmm9 ;\ + addpd %xmm9, %xmm4 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm10, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm10, %xmm13 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm10, %xmm7 ;\ + movapd 10 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL3(xx) \ + mulpd %xmm12, %xmm15 ;\ + addpd %xmm15, %xmm0 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm12, %xmm11 ;\ + addpd %xmm11, %xmm1 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm12, %xmm13 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm12, %xmm3 ;\ + movapd 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL4(xx) \ + mulpd %xmm14, %xmm15 ;\ + addpd %xmm15, %xmm4 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm14, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm14, %xmm13 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm14, %xmm7 ;\ + movapd 14 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + +#define KERNEL5(xx) \ + mulpd %xmm8, %xmm9 ;\ + addpd %xmm9, %xmm0 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm11, %xmm1 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm8, %xmm13 ;\ + mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm8, %xmm3 ;\ + movapd 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL6(xx) \ + mulpd %xmm10, %xmm9 ;\ + addpd %xmm9, %xmm4 ;\ + movapd 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm10, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm10, %xmm13 ;\ + mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm10, %xmm7 ;\ + movapd 18 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL7(xx) \ + mulpd %xmm12, %xmm15 ;\ + addpd %xmm15, %xmm0 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm12, %xmm11 ;\ + addpd %xmm11, %xmm1 ;\ + movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm12, %xmm13 ;\ + mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm12, %xmm3 ;\ + movapd 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL8(xx) \ + mulpd %xmm14, %xmm15 ;\ + addpd %xmm15, %xmm4 ;\ + movapd 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm14, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 34 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm14, %xmm13 ;\ + mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm14, %xmm7 ;\ + movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + + +#ifndef CONJ +#define NN +#else +#if defined(LN) || defined(LT) +#define CN +#else +#define NC +#endif +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + pcmpeqb %xmm15, %xmm15 + psllq $63, %xmm15 # Generate mask + pxor %xmm2, %xmm2 + + movlpd %xmm2, 0 + POSINV + movlpd %xmm15, 8 + POSINV + + movlpd %xmm4, OFFSET + movlpd %xmm4, KK + + salq $ZBASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $1, J # j = (n >> 2) + jle .L100 + ALIGN_4 + +.L01: +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + + addq %rax, %rax + ALIGN_4 + +.L02: + PREFETCHNTA 56 * SIZE(B) + + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + movlpd 4 * SIZE(B), %xmm4 + movlpd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm6 + movlpd 7 * SIZE(B), %xmm7 + + movlpd %xmm0, 0 * SIZE(BO) + movlpd %xmm0, 1 * SIZE(BO) + movlpd %xmm1, 2 * SIZE(BO) + movlpd %xmm1, 3 * SIZE(BO) + movlpd %xmm2, 4 * SIZE(BO) + movlpd %xmm2, 5 * SIZE(BO) + movlpd %xmm3, 6 * SIZE(BO) + movlpd %xmm3, 7 * SIZE(BO) + movlpd %xmm4, 8 * SIZE(BO) + movlpd %xmm4, 9 * SIZE(BO) + movlpd %xmm5, 10 * SIZE(BO) + movlpd %xmm5, 11 * SIZE(BO) + movlpd %xmm6, 12 * SIZE(BO) + movlpd %xmm6, 13 * SIZE(BO) + movlpd %xmm7, 14 * SIZE(BO) + movlpd %xmm7, 15 * SIZE(BO) + + subq $-16 * SIZE, BO + addq $ 8 * SIZE, B + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + + movlpd %xmm0, 0 * SIZE(BO) + movlpd %xmm0, 1 * SIZE(BO) + movlpd %xmm1, 2 * SIZE(BO) + movlpd %xmm1, 3 * SIZE(BO) + movlpd %xmm2, 4 * SIZE(BO) + movlpd %xmm2, 5 * SIZE(BO) + movlpd %xmm3, 6 * SIZE(BO) + movlpd %xmm3, 7 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $ 8 * SIZE, BO + + decq %rax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + +#ifndef RT + leaq (C, LDC, 2), C +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 2 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movapd 4 * SIZE(AO), %xmm12 + pxor %xmm2, %xmm2 + movapd 6 * SIZE(AO), %xmm14 + pxor %xmm3, %xmm3 + + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm4, %xmm4 + movapd 2 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + movapd 4 * SIZE(BO), %xmm13 + movapd 8 * SIZE(BO), %xmm15 + + PREFETCHW 4 * SIZE(CO1) + pxor %xmm6, %xmm6 + PREFETCHW 4 * SIZE(CO2) + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-8, %rax + salq $4, %rax + je .L15 +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpq $64 * 2, %rax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpq $64 * 4, %rax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpq $64 * 6, %rax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) + + addq $16 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $64 * 8, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 4), BO # * 64 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L19 + ALIGN_4 + +.L16: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 0 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 4 * SIZE(AO), %xmm8 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm5 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + mulpd 6 * SIZE(BO), %xmm10 + addpd %xmm9, %xmm6 + movapd 8 * SIZE(BO), %xmm9 + addpd %xmm10, %xmm7 + movapd 6 * SIZE(AO), %xmm10 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L16 + ALIGN_4 + +.L19: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm15, %xmm1 + xorpd %xmm15, %xmm3 + xorpd %xmm15, %xmm5 + xorpd %xmm15, %xmm7 +#else + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 +#else + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm3 + movapd 4 * SIZE(B), %xmm5 + movapd 6 * SIZE(B), %xmm7 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd 0 * SIZE(AO), %xmm1 + movapd 2 * SIZE(AO), %xmm5 + movapd 4 * SIZE(AO), %xmm3 + movapd 6 * SIZE(AO), %xmm7 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm15, %xmm15 +#endif + +#ifdef LN + movlpd 6 * SIZE(AO), %xmm8 + movhpd 6 * SIZE(AO), %xmm8 + movlpd 7 * SIZE(AO), %xmm9 + movhpd 7 * SIZE(AO), %xmm9 + movlpd 4 * SIZE(AO), %xmm10 + movhpd 4 * SIZE(AO), %xmm10 + movlpd 5 * SIZE(AO), %xmm11 + movhpd 5 * SIZE(AO), %xmm11 + movlpd 0 * SIZE(AO), %xmm12 + movhpd 0 * SIZE(AO), %xmm12 + movlpd 1 * SIZE(AO), %xmm13 + movhpd 1 * SIZE(AO), %xmm13 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + mulpd %xmm8, %xmm7 + mulpd %xmm9, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 + + movapd %xmm5, %xmm0 + movapd %xmm7, %xmm2 + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm10, %xmm0 + mulpd %xmm10, %xmm2 + mulpd %xmm11, %xmm4 + mulpd %xmm11, %xmm6 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm4, %xmm1 + subpd %xmm6, %xmm3 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 + + mulpd %xmm12, %xmm1 + mulpd %xmm13, %xmm0 + mulpd %xmm12, %xmm3 + mulpd %xmm13, %xmm2 + + addpd %xmm0, %xmm1 + addpd %xmm2, %xmm3 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + movlpd 2 * SIZE(AO), %xmm10 + movhpd 2 * SIZE(AO), %xmm10 + movlpd 3 * SIZE(AO), %xmm11 + movhpd 3 * SIZE(AO), %xmm11 + movlpd 6 * SIZE(AO), %xmm12 + movhpd 6 * SIZE(AO), %xmm12 + movlpd 7 * SIZE(AO), %xmm13 + movhpd 7 * SIZE(AO), %xmm13 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm3 + mulpd %xmm9, %xmm2 + + addpd %xmm0, %xmm1 + addpd %xmm2, %xmm3 + + movapd %xmm1, %xmm0 + movapd %xmm3, %xmm2 + pshufd $0x4e, %xmm1, %xmm4 + pshufd $0x4e, %xmm3, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm10, %xmm0 + mulpd %xmm10, %xmm2 + mulpd %xmm11, %xmm4 + mulpd %xmm11, %xmm6 + + subpd %xmm0, %xmm5 + subpd %xmm2, %xmm7 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm12, %xmm5 + mulpd %xmm13, %xmm4 + mulpd %xmm12, %xmm7 + mulpd %xmm13, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + movlpd 2 * SIZE(B), %xmm10 + movhpd 2 * SIZE(B), %xmm10 + movlpd 3 * SIZE(B), %xmm11 + movhpd 3 * SIZE(B), %xmm11 + movlpd 6 * SIZE(B), %xmm12 + movhpd 6 * SIZE(B), %xmm12 + movlpd 7 * SIZE(B), %xmm13 + movhpd 7 * SIZE(B), %xmm13 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + + addpd %xmm0, %xmm1 + addpd %xmm4, %xmm5 + + movapd %xmm1, %xmm0 + movapd %xmm5, %xmm2 + pshufd $0x4e, %xmm1, %xmm4 + pshufd $0x4e, %xmm5, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm10, %xmm0 + mulpd %xmm10, %xmm2 + mulpd %xmm11, %xmm4 + mulpd %xmm11, %xmm6 + + subpd %xmm0, %xmm3 + subpd %xmm2, %xmm7 + subpd %xmm4, %xmm3 + subpd %xmm6, %xmm7 + + pshufd $0x4e, %xmm3, %xmm2 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm2 + xorpd %xmm15, %xmm6 + + mulpd %xmm12, %xmm3 + mulpd %xmm13, %xmm2 + mulpd %xmm12, %xmm7 + mulpd %xmm13, %xmm6 + + addpd %xmm2, %xmm3 + addpd %xmm6, %xmm7 +#endif + +#ifdef RT + movlpd 6 * SIZE(B), %xmm8 + movhpd 6 * SIZE(B), %xmm8 + movlpd 7 * SIZE(B), %xmm9 + movhpd 7 * SIZE(B), %xmm9 + movlpd 4 * SIZE(B), %xmm10 + movhpd 4 * SIZE(B), %xmm10 + movlpd 5 * SIZE(B), %xmm11 + movhpd 5 * SIZE(B), %xmm11 + movlpd 0 * SIZE(B), %xmm12 + movhpd 0 * SIZE(B), %xmm12 + movlpd 1 * SIZE(B), %xmm13 + movhpd 1 * SIZE(B), %xmm13 + + pshufd $0x4e, %xmm3, %xmm2 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm2 + xorpd %xmm15, %xmm6 + + mulpd %xmm8, %xmm3 + mulpd %xmm9, %xmm2 + mulpd %xmm8, %xmm7 + mulpd %xmm9, %xmm6 + + addpd %xmm2, %xmm3 + addpd %xmm6, %xmm7 + + movapd %xmm3, %xmm0 + movapd %xmm7, %xmm2 + pshufd $0x4e, %xmm3, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm10, %xmm0 + mulpd %xmm10, %xmm2 + mulpd %xmm11, %xmm4 + mulpd %xmm11, %xmm6 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm5 + subpd %xmm4, %xmm1 + subpd %xmm6, %xmm5 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 + + mulpd %xmm12, %xmm1 + mulpd %xmm13, %xmm0 + mulpd %xmm12, %xmm5 + mulpd %xmm13, %xmm4 + + addpd %xmm0, %xmm1 + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + + movsd %xmm1, 0 * SIZE(CO1) + movhpd %xmm1, 1 * SIZE(CO1) + movsd %xmm5, 2 * SIZE(CO1) + movhpd %xmm5, 3 * SIZE(CO1) + + movsd %xmm3, 0 * SIZE(CO2) + movhpd %xmm3, 1 * SIZE(CO2) + movsd %xmm7, 2 * SIZE(CO2) + movhpd %xmm7, 3 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + movapd %xmm5, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) + movlpd %xmm5, 8 * SIZE(BO) + movlpd %xmm5, 9 * SIZE(BO) + movhpd %xmm5, 10 * SIZE(BO) + movhpd %xmm5, 11 * SIZE(BO) + movlpd %xmm7, 12 * SIZE(BO) + movlpd %xmm7, 13 * SIZE(BO) + movhpd %xmm7, 14 * SIZE(BO) + movhpd %xmm7, 15 * SIZE(BO) +#else + movapd %xmm1, 0 * SIZE(AO) + movapd %xmm5, 2 * SIZE(AO) + movapd %xmm3, 4 * SIZE(AO) + movapd %xmm7, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L10 + ALIGN_4 + +.L30: + testq $1, M + jle .L99 + +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + addq %rax, AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L42 + +.L41: + movapd 0 * SIZE(AO), %xmm8 + + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + movapd 2 * SIZE(AO), %xmm8 + + movapd 8 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 10 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 12 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 14 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + movapd 4 * SIZE(AO), %xmm8 + + movapd 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 18 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 20 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 22 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + movapd 6 * SIZE(AO), %xmm8 + + movapd 24 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 26 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 28 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 30 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L41 + +.L42: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm15 + andq $3, %rax # if (k & 1) + BRANCH + jle .L44 + +.L43: + movapd 0 * SIZE(AO), %xmm8 + + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + + decq %rax + jg .L43 + ALIGN_4 + +.L44: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm15, %xmm1 + xorpd %xmm15, %xmm3 +#else + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 +#else + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm3 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 +#else + movapd 0 * SIZE(AO), %xmm1 + movapd 2 * SIZE(AO), %xmm3 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm15, %xmm15 +#endif + +#if defined(LN) || defined(LT) + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm3 + mulpd %xmm9, %xmm2 + + addpd %xmm0, %xmm1 + addpd %xmm2, %xmm3 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + movlpd 2 * SIZE(B), %xmm10 + movhpd 2 * SIZE(B), %xmm10 + movlpd 3 * SIZE(B), %xmm11 + movhpd 3 * SIZE(B), %xmm11 + movlpd 6 * SIZE(B), %xmm12 + movhpd 6 * SIZE(B), %xmm12 + movlpd 7 * SIZE(B), %xmm13 + movhpd 7 * SIZE(B), %xmm13 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 + + movapd %xmm1, %xmm0 + pshufd $0x4e, %xmm1, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm10, %xmm0 + mulpd %xmm11, %xmm4 + + subpd %xmm0, %xmm3 + subpd %xmm4, %xmm3 + + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm2 + + mulpd %xmm12, %xmm3 + mulpd %xmm13, %xmm2 + + addpd %xmm2, %xmm3 +#endif + +#ifdef RT + movlpd 6 * SIZE(B), %xmm8 + movhpd 6 * SIZE(B), %xmm8 + movlpd 7 * SIZE(B), %xmm9 + movhpd 7 * SIZE(B), %xmm9 + movlpd 4 * SIZE(B), %xmm10 + movhpd 4 * SIZE(B), %xmm10 + movlpd 5 * SIZE(B), %xmm11 + movhpd 5 * SIZE(B), %xmm11 + movlpd 0 * SIZE(B), %xmm12 + movhpd 0 * SIZE(B), %xmm12 + movlpd 1 * SIZE(B), %xmm13 + movhpd 1 * SIZE(B), %xmm13 + + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm2 + + mulpd %xmm8, %xmm3 + mulpd %xmm9, %xmm2 + + addpd %xmm2, %xmm3 + + movapd %xmm3, %xmm0 + pshufd $0x4e, %xmm3, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm10, %xmm0 + mulpd %xmm11, %xmm4 + + subpd %xmm0, %xmm1 + subpd %xmm4, %xmm1 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm12, %xmm1 + mulpd %xmm13, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm1, 0 * SIZE(CO1) + movhpd %xmm1, 1 * SIZE(CO1) + + movsd %xmm3, 0 * SIZE(CO2) + movhpd %xmm3, 1 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) +#else + movapd %xmm1, 0 * SIZE(AO) + movapd %xmm3, 2 * SIZE(AO) + +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2 * COMPSIZE), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J # j -- + jg .L01 + +.L100: + testq $1, N + jle .L999 + +.L101: +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L103 + ALIGN_4 + +.L102: + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + movlpd 4 * SIZE(B), %xmm4 + movlpd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm6 + movlpd 7 * SIZE(B), %xmm7 + + movlpd %xmm0, 0 * SIZE(BO) + movlpd %xmm0, 1 * SIZE(BO) + movlpd %xmm1, 2 * SIZE(BO) + movlpd %xmm1, 3 * SIZE(BO) + movlpd %xmm2, 4 * SIZE(BO) + movlpd %xmm2, 5 * SIZE(BO) + movlpd %xmm3, 6 * SIZE(BO) + movlpd %xmm3, 7 * SIZE(BO) + movlpd %xmm4, 8 * SIZE(BO) + movlpd %xmm4, 9 * SIZE(BO) + movlpd %xmm5, 10 * SIZE(BO) + movlpd %xmm5, 11 * SIZE(BO) + movlpd %xmm6, 12 * SIZE(BO) + movlpd %xmm6, 13 * SIZE(BO) + movlpd %xmm7, 14 * SIZE(BO) + movlpd %xmm7, 15 * SIZE(BO) + + subq $-16 * SIZE, BO + addq $ 8 * SIZE, B + decq %rax + jne .L102 + ALIGN_4 + +.L103: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + + movlpd %xmm0, 0 * SIZE(BO) + movlpd %xmm0, 1 * SIZE(BO) + movlpd %xmm1, 2 * SIZE(BO) + movlpd %xmm1, 3 * SIZE(BO) + + addq $4 * SIZE, BO + addq $2 * SIZE, B + decq %rax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + PREFETCHW 4 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L112 + +.L111: + movapd 0 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 2 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + movapd 4 * SIZE(AO), %xmm8 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 6 * SIZE(AO), %xmm8 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + movapd 8 * SIZE(AO), %xmm8 + movapd 8 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 10 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 10 * SIZE(AO), %xmm8 + movapd 8 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 10 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + movapd 12 * SIZE(AO), %xmm8 + movapd 12 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 14 * SIZE(AO), %xmm8 + movapd 12 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm15 + andq $3, %rax # if (k & 1) + BRANCH + jle .L114 + +.L113: + movapd 0 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 2 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L113 + ALIGN_4 + +.L114: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm15, %xmm1 + xorpd %xmm15, %xmm5 +#else + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm1, %xmm0 + subpd %xmm5, %xmm4 +#else + addpd %xmm1, %xmm0 + addpd %xmm5, %xmm4 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm5 + + subpd %xmm0, %xmm1 + subpd %xmm4, %xmm5 +#else + movapd 0 * SIZE(AO), %xmm1 + movapd 2 * SIZE(AO), %xmm5 + + subpd %xmm0, %xmm1 + subpd %xmm4, %xmm5 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm15, %xmm15 +#endif + +#ifdef LN + movlpd 6 * SIZE(AO), %xmm8 + movhpd 6 * SIZE(AO), %xmm8 + movlpd 7 * SIZE(AO), %xmm9 + movhpd 7 * SIZE(AO), %xmm9 + movlpd 4 * SIZE(AO), %xmm10 + movhpd 4 * SIZE(AO), %xmm10 + movlpd 5 * SIZE(AO), %xmm11 + movhpd 5 * SIZE(AO), %xmm11 + movlpd 0 * SIZE(AO), %xmm12 + movhpd 0 * SIZE(AO), %xmm12 + movlpd 1 * SIZE(AO), %xmm13 + movhpd 1 * SIZE(AO), %xmm13 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + + addpd %xmm4, %xmm5 + + movapd %xmm5, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm10, %xmm0 + mulpd %xmm11, %xmm4 + + subpd %xmm0, %xmm1 + subpd %xmm4, %xmm1 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm12, %xmm1 + mulpd %xmm13, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + movlpd 2 * SIZE(AO), %xmm10 + movhpd 2 * SIZE(AO), %xmm10 + movlpd 3 * SIZE(AO), %xmm11 + movhpd 3 * SIZE(AO), %xmm11 + movlpd 6 * SIZE(AO), %xmm12 + movhpd 6 * SIZE(AO), %xmm12 + movlpd 7 * SIZE(AO), %xmm13 + movhpd 7 * SIZE(AO), %xmm13 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 + + movapd %xmm1, %xmm0 + pshufd $0x4e, %xmm1, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm10, %xmm0 + mulpd %xmm11, %xmm4 + + subpd %xmm0, %xmm5 + subpd %xmm4, %xmm5 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm12, %xmm5 + mulpd %xmm13, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + + addpd %xmm0, %xmm1 + addpd %xmm4, %xmm5 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + + addpd %xmm0, %xmm1 + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movsd %xmm1, 0 * SIZE(CO1) + movhpd %xmm1, 1 * SIZE(CO1) + movsd %xmm5, 2 * SIZE(CO1) + movhpd %xmm5, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm5, 4 * SIZE(BO) + movlpd %xmm5, 5 * SIZE(BO) + movhpd %xmm5, 6 * SIZE(BO) + movhpd %xmm5, 7 * SIZE(BO) +#else + movapd %xmm1, 0 * SIZE(AO) + movapd %xmm5, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L110 + ALIGN_4 + +.L130: + testq $1, M + jle .L199 + ALIGN_4 + +.L140: +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L142 + +.L141: + movapd 0 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 2 * SIZE(AO), %xmm8 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm3 + + movapd 4 * SIZE(AO), %xmm8 + movapd 8 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 10 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 6 * SIZE(AO), %xmm8 + movapd 12 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm3 + + addq $8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L141 + +.L142: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + movapd POSINV, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + jle .L144 + +.L143: + movapd 0 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L143 + ALIGN_4 + +.L144: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + + SHUFPD_1 %xmm1, %xmm1 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm15, %xmm1 +#else + xorpd %xmm15, %xmm0 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm1, %xmm0 +#else + addpd %xmm1, %xmm0 +#endif + + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm1 + + subpd %xmm0, %xmm1 +#else + movapd 0 * SIZE(AO), %xmm1 + + subpd %xmm0, %xmm1 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm15, %xmm15 +#endif + +#ifdef LN + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm1, 0 * SIZE(CO1) + movhpd %xmm1, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) +#else + movapd %xmm1, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L199: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1 * COMPSIZE), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + + +.L999: + movq %rbx, %rsp + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x2_sse3.S b/kernel/x86_64/ztrsm_kernel_LT_2x2_sse3.S new file mode 100644 index 0000000..708a984 --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LT_2x2_sse3.S @@ -0,0 +1,2194 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbx +#define KK %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define KKK 56(%rsp) +#define AORIG 64(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define KKK 232(%rsp) +#define AORIG 240(%rsp) +#endif + +#define PREFETCH prefetcht1 +#define PREFETCHSIZE (16 * 12 + 3) +#define PREFETCH_R (4 * 4 + 0) + +#ifndef CONJ +#define ADD1 addpd +#define ADD2 addpd +#else +#define ADD1 subpd +#define ADD2 addpd +#endif + +#define KERNEL1(address) \ + mulpd %xmm8, %xmm9;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ + ADD1 %xmm9, %xmm0;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm1;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm2;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm3;\ + movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL2(address) \ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm4;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm5;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm6;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm7;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL3(address) \ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm0;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm1;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm2;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm3;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL4(address) \ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm4;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm5;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm6;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm7;\ + movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL5(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm0;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm1;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm2;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm3;\ + movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL6(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm4;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm5;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm6;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm7;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL7(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm0;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm1;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm2;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm3;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL8(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm4;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm5;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm6;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm7;\ + movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL9(address) \ + mulpd %xmm12, %xmm13;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ + ADD1 %xmm13, %xmm0;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm1;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm2;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm3;\ + movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL10(address) \ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm4;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm5;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm6;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm7;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL11(address) \ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm0;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm1;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm2;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm3;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL12(address) \ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm4;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm5;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm6;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm7;\ + movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL13(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm0;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm1;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm2;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm3;\ + movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL14(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm4;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm5;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm6;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm7;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL15(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm0;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm1;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm2;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm3;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL16(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm4;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm5;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm6;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm7;\ + movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + movq KK, OFFSET + + salq $ZBASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, KK + subq OFFSET, KK +#endif + + movq N, J + sarq $1, J # j = (n >> 2) + jle .L100 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(AO), %xmm12 + movddup 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(AO), %xmm14 + movddup 24 * SIZE(BO), %xmm15 + + prefetchnta 4 * SIZE(CO1) + pxor %xmm4, %xmm4 + prefetchnta 4 * SIZE(CO2) + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-8, %rax + salq $4, %rax + je .L12 + +.L1X: + KERNEL1 (16 * 0) + KERNEL2 (16 * 0) + KERNEL3 (16 * 0) + KERNEL4 (16 * 0) + KERNEL5 (16 * 0) + KERNEL6 (16 * 0) + KERNEL7 (16 * 0) + KERNEL8 (16 * 0) + KERNEL9 (16 * 0) + KERNEL10(16 * 0) + KERNEL11(16 * 0) + KERNEL12(16 * 0) + KERNEL13(16 * 0) + KERNEL14(16 * 0) + KERNEL15(16 * 0) + KERNEL16(16 * 0) + cmpq $128 * 1, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 1) + KERNEL2 (16 * 1) + KERNEL3 (16 * 1) + KERNEL4 (16 * 1) + KERNEL5 (16 * 1) + KERNEL6 (16 * 1) + KERNEL7 (16 * 1) + KERNEL8 (16 * 1) + KERNEL9 (16 * 1) + KERNEL10(16 * 1) + KERNEL11(16 * 1) + KERNEL12(16 * 1) + KERNEL13(16 * 1) + KERNEL14(16 * 1) + KERNEL15(16 * 1) + KERNEL16(16 * 1) + cmpq $128 * 2, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 2) + KERNEL2 (16 * 2) + KERNEL3 (16 * 2) + KERNEL4 (16 * 2) + KERNEL5 (16 * 2) + KERNEL6 (16 * 2) + KERNEL7 (16 * 2) + KERNEL8 (16 * 2) + KERNEL9 (16 * 2) + KERNEL10(16 * 2) + KERNEL11(16 * 2) + KERNEL12(16 * 2) + KERNEL13(16 * 2) + KERNEL14(16 * 2) + KERNEL15(16 * 2) + KERNEL16(16 * 2) + cmpq $128 * 3, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 3) + KERNEL2 (16 * 3) + KERNEL3 (16 * 3) + KERNEL4 (16 * 3) + KERNEL5 (16 * 3) + KERNEL6 (16 * 3) + KERNEL7 (16 * 3) + KERNEL8 (16 * 3) + KERNEL9 (16 * 3) + KERNEL10(16 * 3) + KERNEL11(16 * 3) + KERNEL12(16 * 3) + KERNEL13(16 * 3) + KERNEL14(16 * 3) + KERNEL15(16 * 3) + KERNEL16(16 * 3) + cmpq $128 * 4, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 4) + KERNEL2 (16 * 4) + KERNEL3 (16 * 4) + KERNEL4 (16 * 4) + KERNEL5 (16 * 4) + KERNEL6 (16 * 4) + KERNEL7 (16 * 4) + KERNEL8 (16 * 4) + KERNEL9 (16 * 4) + KERNEL10(16 * 4) + KERNEL11(16 * 4) + KERNEL12(16 * 4) + KERNEL13(16 * 4) + KERNEL14(16 * 4) + KERNEL15(16 * 4) + KERNEL16(16 * 4) + cmpq $128 * 5, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 5) + KERNEL2 (16 * 5) + KERNEL3 (16 * 5) + KERNEL4 (16 * 5) + KERNEL5 (16 * 5) + KERNEL6 (16 * 5) + KERNEL7 (16 * 5) + KERNEL8 (16 * 5) + KERNEL9 (16 * 5) + KERNEL10(16 * 5) + KERNEL11(16 * 5) + KERNEL12(16 * 5) + KERNEL13(16 * 5) + KERNEL14(16 * 5) + KERNEL15(16 * 5) + KERNEL16(16 * 5) + cmpq $128 * 6, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 6) + KERNEL2 (16 * 6) + KERNEL3 (16 * 6) + KERNEL4 (16 * 6) + KERNEL5 (16 * 6) + KERNEL6 (16 * 6) + KERNEL7 (16 * 6) + KERNEL8 (16 * 6) + KERNEL9 (16 * 6) + KERNEL10(16 * 6) + KERNEL11(16 * 6) + KERNEL12(16 * 6) + KERNEL13(16 * 6) + KERNEL14(16 * 6) + KERNEL15(16 * 6) + KERNEL16(16 * 6) + cmpq $128 * 7, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 7) + KERNEL2 (16 * 7) + KERNEL3 (16 * 7) + KERNEL4 (16 * 7) + KERNEL5 (16 * 7) + KERNEL6 (16 * 7) + KERNEL7 (16 * 7) + KERNEL8 (16 * 7) + KERNEL9 (16 * 7) + KERNEL10(16 * 7) + KERNEL11(16 * 7) + KERNEL12(16 * 7) + KERNEL13(16 * 7) + KERNEL14(16 * 7) + KERNEL15(16 * 7) + KERNEL16(16 * 7) + + addq $32 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $128 * 8, %rax + jg .L1X + +.L11: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 2), BO # * 64 + ALIGN_4 + +.L12: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm5 + movddup 2 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm6 + movddup 3 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm7 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L13 + ALIGN_4 + +.L14: + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#ifndef CONJ + addsubpd %xmm1, %xmm0 + addsubpd %xmm3, %xmm2 + addsubpd %xmm5, %xmm4 + addsubpd %xmm7, %xmm6 +#else + addsubpd %xmm0, %xmm1 + addsubpd %xmm2, %xmm3 + addsubpd %xmm4, %xmm5 + addsubpd %xmm6, %xmm7 +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + subq $2, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm8 + movapd 2 * SIZE(BO), %xmm9 + movapd 4 * SIZE(BO), %xmm10 + movapd 6 * SIZE(BO), %xmm11 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 +#endif + +#if (defined(LN) || defined(LT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm2, %xmm9 + subpd %xmm4, %xmm10 + subpd %xmm6, %xmm11 +#elif (defined(LN) || defined(LT)) && defined(CONJ) + subpd %xmm1, %xmm8 + subpd %xmm3, %xmm9 + subpd %xmm5, %xmm10 + subpd %xmm7, %xmm11 +#elif (defined(RN) || defined(RT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm2, %xmm10 + subpd %xmm6, %xmm11 +#else + addsubpd %xmm1, %xmm8 + addsubpd %xmm5, %xmm9 + addsubpd %xmm3, %xmm10 + addsubpd %xmm7, %xmm11 +#endif + +#ifdef CONJ + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + movddup 6 * SIZE(AO), %xmm0 + movddup 7 * SIZE(AO), %xmm1 + movddup 4 * SIZE(AO), %xmm2 + movddup 5 * SIZE(AO), %xmm3 + movddup 0 * SIZE(AO), %xmm4 + movddup 1 * SIZE(AO), %xmm5 +#else + movddup 6 * SIZE(BO), %xmm0 + movddup 7 * SIZE(BO), %xmm1 + movddup 4 * SIZE(BO), %xmm2 + movddup 5 * SIZE(BO), %xmm3 + movddup 0 * SIZE(BO), %xmm4 + movddup 1 * SIZE(BO), %xmm5 +#endif + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm10, %xmm12 + movapd %xmm11, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm12 + mulpd %xmm1, %xmm13 + + addsubpd %xmm12, %xmm10 + addsubpd %xmm13, %xmm11 + + movapd %xmm10, %xmm12 + movapd %xmm10, %xmm13 + movapd %xmm11, %xmm14 + movapd %xmm11, %xmm15 + + SHUFPD_1 %xmm13, %xmm13 + SHUFPD_1 %xmm15, %xmm15 + + mulpd %xmm2, %xmm12 + mulpd %xmm2, %xmm14 + mulpd %xmm3, %xmm13 + mulpd %xmm3, %xmm15 + + addsubpd %xmm13, %xmm12 + addsubpd %xmm15, %xmm14 + + subpd %xmm12, %xmm8 + subpd %xmm14, %xmm9 + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm4, %xmm8 + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + mulpd %xmm5, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#if defined(LT) || defined(RN) + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + movddup 1 * SIZE(AO), %xmm1 + movddup 2 * SIZE(AO), %xmm2 + movddup 3 * SIZE(AO), %xmm3 + movddup 6 * SIZE(AO), %xmm4 + movddup 7 * SIZE(AO), %xmm5 +#else + movddup 0 * SIZE(BO), %xmm0 + movddup 1 * SIZE(BO), %xmm1 + movddup 2 * SIZE(BO), %xmm2 + movddup 3 * SIZE(BO), %xmm3 + movddup 6 * SIZE(BO), %xmm4 + movddup 7 * SIZE(BO), %xmm5 +#endif + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + mulpd %xmm1, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 + + movapd %xmm8, %xmm12 + movapd %xmm8, %xmm13 + movapd %xmm9, %xmm14 + movapd %xmm9, %xmm15 + + SHUFPD_1 %xmm13, %xmm13 + SHUFPD_1 %xmm15, %xmm15 + + mulpd %xmm2, %xmm12 + mulpd %xmm2, %xmm14 + mulpd %xmm3, %xmm13 + mulpd %xmm3, %xmm15 + + addsubpd %xmm13, %xmm12 + addsubpd %xmm15, %xmm14 + + subpd %xmm12, %xmm10 + subpd %xmm14, %xmm11 + + movapd %xmm10, %xmm12 + movapd %xmm11, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm4, %xmm10 + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm12 + mulpd %xmm5, %xmm13 + + addsubpd %xmm12, %xmm10 + addsubpd %xmm13, %xmm11 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm10, 2 * SIZE(CO1) + movhpd %xmm10, 3 * SIZE(CO1) + + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + movapd %xmm10, 4 * SIZE(BO) + movapd %xmm11, 6 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + decq I # i -- + jg .L10 + ALIGN_4 + +.L30: + testq $1, M + jle .L99 + +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L42 + +.L41: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 6 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 16 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 17 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 18 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 19 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm3 + movddup 20 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 21 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 22 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 23 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm3 + movddup 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 25 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 26 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 27 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 28 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 29 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 30 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 31 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 40 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L41 + +.L42: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + jle .L44 + +.L43: + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L43 + ALIGN_4 + +.L44: + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + +#ifndef CONJ + addsubpd %xmm1, %xmm0 + addsubpd %xmm3, %xmm2 +#else + addsubpd %xmm0, %xmm1 + addsubpd %xmm2, %xmm3 +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm8 + movapd 2 * SIZE(BO), %xmm9 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 +#endif + +#if (defined(LN) || defined(LT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm2, %xmm9 +#elif (defined(LN) || defined(LT)) && defined(CONJ) + subpd %xmm1, %xmm8 + subpd %xmm3, %xmm9 +#elif (defined(RN) || defined(RT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm2, %xmm9 +#else + addsubpd %xmm1, %xmm8 + addsubpd %xmm3, %xmm9 +#endif + +#ifdef CONJ + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 +#endif + +#ifdef LN + movddup 0 * SIZE(AO), %xmm4 + movddup 1 * SIZE(AO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm4, %xmm8 + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + mulpd %xmm5, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + movddup 1 * SIZE(AO), %xmm1 + +#ifdef CONJ + xorpd %xmm7, %xmm1 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + mulpd %xmm1, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + movddup 1 * SIZE(BO), %xmm1 + movddup 2 * SIZE(BO), %xmm2 + movddup 3 * SIZE(BO), %xmm3 + movddup 6 * SIZE(BO), %xmm4 + movddup 7 * SIZE(BO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + + mulpd %xmm0, %xmm8 + mulpd %xmm1, %xmm12 + + addsubpd %xmm12, %xmm8 + + movapd %xmm8, %xmm12 + movapd %xmm8, %xmm13 + + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + + addsubpd %xmm13, %xmm12 + + subpd %xmm12, %xmm9 + + movapd %xmm9, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + + addsubpd %xmm12, %xmm9 +#endif + + +#ifdef RT + movddup 6 * SIZE(BO), %xmm0 + movddup 7 * SIZE(BO), %xmm1 + movddup 4 * SIZE(BO), %xmm2 + movddup 5 * SIZE(BO), %xmm3 + movddup 0 * SIZE(BO), %xmm4 + movddup 1 * SIZE(BO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm9, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + + addsubpd %xmm12, %xmm9 + + movapd %xmm9, %xmm12 + movapd %xmm9, %xmm13 + + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + + addsubpd %xmm13, %xmm12 + + subpd %xmm12, %xmm8 + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + + mulpd %xmm4, %xmm8 + mulpd %xmm5, %xmm12 + + addsubpd %xmm12, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + decq J # j -- + jg .L01 + +.L100: + testq $1, N + jle .L999 + +.L101: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + + prefetchnta 4 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L112 + +.L111: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm5 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm5 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm1 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm5 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 14 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 40 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm5 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + ADD1 %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 18 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 8 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 20 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 22 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 24 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 26 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 28 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 30 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 32 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 24 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + jle .L114 + +.L113: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + movapd 4 * SIZE(AO), %xmm8 + ADD1 %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L113 + ALIGN_4 + +.L114: + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm5, %xmm5 + +#ifndef CONJ + addsubpd %xmm1, %xmm0 + addsubpd %xmm5, %xmm4 +#else + addsubpd %xmm0, %xmm1 + addsubpd %xmm4, %xmm5 +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm8 + movapd 2 * SIZE(BO), %xmm9 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 +#endif + +#if (defined(LN) || defined(LT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 +#elif (defined(LN) || defined(LT)) && defined(CONJ) + subpd %xmm1, %xmm8 + subpd %xmm5, %xmm9 +#elif (defined(RN) || defined(RT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 +#else + addsubpd %xmm1, %xmm8 + addsubpd %xmm5, %xmm9 +#endif + +#ifdef CONJ + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 +#endif + +#ifdef LN + movddup 6 * SIZE(AO), %xmm0 + movddup 7 * SIZE(AO), %xmm1 + movddup 4 * SIZE(AO), %xmm2 + movddup 5 * SIZE(AO), %xmm3 + movddup 0 * SIZE(AO), %xmm4 + movddup 1 * SIZE(AO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm9, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + addsubpd %xmm12, %xmm9 + movapd %xmm9, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm13, %xmm13 + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + addsubpd %xmm13, %xmm12 + subpd %xmm12, %xmm8 + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm4, %xmm8 + mulpd %xmm5, %xmm12 + addsubpd %xmm12, %xmm8 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + movddup 1 * SIZE(AO), %xmm1 + movddup 2 * SIZE(AO), %xmm2 + movddup 3 * SIZE(AO), %xmm3 + movddup 6 * SIZE(AO), %xmm4 + movddup 7 * SIZE(AO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm0, %xmm8 + mulpd %xmm1, %xmm12 + addsubpd %xmm12, %xmm8 + movapd %xmm8, %xmm12 + movapd %xmm8, %xmm13 + SHUFPD_1 %xmm13, %xmm13 + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + addsubpd %xmm13, %xmm12 + subpd %xmm12, %xmm9 + movapd %xmm9, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + addsubpd %xmm12, %xmm9 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + movddup 1 * SIZE(BO), %xmm1 + +#ifdef CONJ + xorpd %xmm7, %xmm1 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + mulpd %xmm1, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#ifdef RT + movddup 0 * SIZE(BO), %xmm4 + movddup 1 * SIZE(BO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm4, %xmm8 + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + mulpd %xmm5, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L110 + ALIGN_4 + +.L130: + testq $1, M + jle .L149 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L142 + +.L141: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 12 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L141 + +.L142: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + jle .L144 + +.L143: + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L143 + ALIGN_4 + +.L144: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + SHUFPD_1 %xmm1, %xmm1 + +#ifndef CONJ + addsubpd %xmm1, %xmm0 +#else + addsubpd %xmm0, %xmm1 +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm8 +#else + movapd 0 * SIZE(AO), %xmm8 +#endif + +#if (defined(LN) || defined(LT)) && !defined(CONJ) + subpd %xmm0, %xmm8 +#elif (defined(LN) || defined(LT)) && defined(CONJ) + subpd %xmm1, %xmm8 +#elif (defined(RN) || defined(RT)) && !defined(CONJ) + subpd %xmm0, %xmm8 +#else + addsubpd %xmm1, %xmm8 +#endif + +#ifdef CONJ + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 +#endif + +#ifdef LN + movddup 0 * SIZE(AO), %xmm4 + movddup 1 * SIZE(AO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm4, %xmm8 + mulpd %xmm5, %xmm12 + addsubpd %xmm12, %xmm8 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + movddup 1 * SIZE(AO), %xmm1 + +#ifdef CONJ + xorpd %xmm7, %xmm1 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm0, %xmm8 + mulpd %xmm1, %xmm12 + addsubpd %xmm12, %xmm8 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + movddup 1 * SIZE(BO), %xmm1 + +#ifdef CONJ + xorpd %xmm7, %xmm1 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm0, %xmm8 + mulpd %xmm1, %xmm12 + + addsubpd %xmm12, %xmm8 +#endif + +#ifdef RT + movddup 0 * SIZE(BO), %xmm4 + movddup 1 * SIZE(BO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm4, %xmm8 + mulpd %xmm5, %xmm12 + + addsubpd %xmm12, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, 0 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L149: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x4_nehalem.S b/kernel/x86_64/ztrsm_kernel_LT_2x4_nehalem.S new file mode 100644 index 0000000..d07930d --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LT_2x4_nehalem.S @@ -0,0 +1,3116 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCHSIZE (16 * 1 + 4) +#define PREFETCH prefetcht0 + +#define ADD1 addps +#define ADD2 addps + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + salq $ZBASE_SHIFT, LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, KK + subq OFFSET, KK +#endif + + movq N, J + sarq $2, J + NOBRANCH + jle .L30 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + + movq K, %rax + salq $ZBASE_SHIFT + 2, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + sarq $1, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + prefetchnta -32 * SIZE(BB) + subq $-16 * SIZE, BB + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + xorps %xmm12, %xmm12 + prefetcht2 4 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht2 4 * SIZE(CO2, LDC, 1) + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm12 + movaps -24 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -24 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -20 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + subq $-32 * SIZE, BO + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + ADD1 %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + ADD1 %xmm1, %xmm12 + ADD2 %xmm2, %xmm13 + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + xorps %xmm0, %xmm12 + xorps %xmm0, %xmm14 +#else + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + xorps %xmm0, %xmm13 + xorps %xmm0, %xmm15 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + xorps %xmm0, %xmm12 + xorps %xmm0, %xmm14 +#else + shufps $0xb1, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + xorps %xmm0, %xmm13 + xorps %xmm0, %xmm15 +#endif + +#endif + + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + haddps %xmm13, %xmm12 + haddps %xmm15, %xmm14 + + shufps $0xd8, %xmm8, %xmm8 + shufps $0xd8, %xmm10, %xmm10 + shufps $0xd8, %xmm12, %xmm12 + shufps $0xd8, %xmm14, %xmm14 + + movaps %xmm8, %xmm9 + shufps $0xe4, %xmm10, %xmm8 + shufps $0xe4, %xmm9, %xmm10 + + movaps %xmm12, %xmm13 + shufps $0xe4, %xmm14, %xmm12 + shufps $0xe4, %xmm13, %xmm14 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm9 + movlhps %xmm10, %xmm8 + movhlps %xmm9, %xmm10 + + movaps %xmm12, %xmm11 + movlhps %xmm14, %xmm12 + movhlps %xmm11, %xmm14 + + movaps -32 * SIZE(BO), %xmm9 + movaps -28 * SIZE(BO), %xmm13 + movaps -24 * SIZE(BO), %xmm11 + movaps -20 * SIZE(BO), %xmm15 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 + subps %xmm12, %xmm13 + subps %xmm14, %xmm15 +#else + movaps -32 * SIZE(AO), %xmm9 + movaps -28 * SIZE(AO), %xmm11 + movaps -24 * SIZE(AO), %xmm13 + movaps -20 * SIZE(AO), %xmm15 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 + subps %xmm12, %xmm13 + subps %xmm14, %xmm15 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#ifdef LN + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm10, %xmm11 + addps %xmm14, %xmm15 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + movaps %xmm15, %xmm5 + pshufd $0xb1, %xmm15, %xmm4 + + xorps %xmm7, %xmm2 + xorps %xmm7, %xmm4 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + mulps %xmm0, %xmm5 + mulps %xmm1, %xmm4 + + subps %xmm3, %xmm9 + subps %xmm2, %xmm9 + subps %xmm5, %xmm13 + subps %xmm4, %xmm13 + + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + pshufd $0xb1, %xmm13, %xmm14 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm14 + + addps %xmm10, %xmm9 + addps %xmm14, %xmm13 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + pshufd $0xb1, %xmm13, %xmm14 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm14 + + addps %xmm10, %xmm9 + addps %xmm14, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + movaps %xmm13, %xmm5 + pshufd $0xb1, %xmm13, %xmm4 + + xorps %xmm7, %xmm2 + xorps %xmm7, %xmm4 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + mulps %xmm0, %xmm5 + mulps %xmm1, %xmm4 + + subps %xmm3, %xmm11 + subps %xmm2, %xmm11 + subps %xmm5, %xmm15 + subps %xmm4, %xmm15 + + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm10, %xmm11 + addps %xmm14, %xmm15 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -24 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + movaps -20 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -12 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm13, %xmm12 + + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm12 + + addps %xmm12, %xmm13 + + movaps %xmm13, %xmm3 + pshufd $0xb1, %xmm13, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -4 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm14, %xmm15 +#endif + +#ifdef RT + movaps -4 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm14, %xmm15 + + movaps %xmm15, %xmm3 + pshufd $0xb1, %xmm15, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + movaps -8 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -12 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm13, %xmm12 + + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm12 + + addps %xmm12, %xmm13 + + movaps %xmm13, %xmm3 + pshufd $0xb1, %xmm13, %xmm2 + + xorps %xmm7, %xmm2 + + movaps -16 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + + movaps -24 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -32 * SIZE(BO) + movaps %xmm13, -28 * SIZE(BO) + movaps %xmm11, -24 * SIZE(BO) + movaps %xmm15, -20 * SIZE(BO) + + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhps %xmm9, 0 * SIZE(CO1, LDC) + movhps %xmm11, 2 * SIZE(CO1, LDC) + + movsd %xmm13, 0 * SIZE(CO2) + movsd %xmm15, 2 * SIZE(CO2) + movhps %xmm13, 0 * SIZE(CO2, LDC) + movhps %xmm15, 2 * SIZE(CO2, LDC) +#else + movaps %xmm9, -32 * SIZE(AO) + movaps %xmm11, -28 * SIZE(AO) + movaps %xmm13, -24 * SIZE(AO) + movaps %xmm15, -20 * SIZE(AO) + + movsd %xmm9, 0 * SIZE(CO1) + movhps %xmm9, 2 * SIZE(CO1) + movsd %xmm11, 0 * SIZE(CO1, LDC) + movhps %xmm11, 2 * SIZE(CO1, LDC) + movsd %xmm13, 0 * SIZE(CO2) + movhps %xmm13, 2 * SIZE(CO2) + movsd %xmm15, 0 * SIZE(CO2, LDC) + movhps %xmm15, 2 * SIZE(CO2, LDC) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $1, M + BRANCH + jle .L29 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -30 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -16 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -12 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -8 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -26 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -4 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps 0 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -24 * SIZE(AO), %xmm0 + + subq $-32 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + ADD1 %xmm3, %xmm10 + ADD2 %xmm4, %xmm11 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 +#else + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 +#else + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#endif + +#endif + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm9 + movaps -28 * SIZE(BO), %xmm11 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 +#else + movaps -32 * SIZE(AO), %xmm9 + movaps -28 * SIZE(AO), %xmm13 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm13 + + movhlps %xmm9, %xmm11 + movhlps %xmm13, %xmm15 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + pshufd $0xb1, %xmm11, %xmm12 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm12 + + addps %xmm10, %xmm9 + addps %xmm12, %xmm11 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -24 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + movaps -20 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -12 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm13, %xmm12 + + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm12 + + addps %xmm12, %xmm13 + + movaps %xmm13, %xmm3 + pshufd $0xb1, %xmm13, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -4 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm14, %xmm15 +#endif + +#ifdef RT + movaps -4 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm14, %xmm15 + + movaps %xmm15, %xmm3 + pshufd $0xb1, %xmm15, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + movaps -8 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -12 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm13, %xmm12 + + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm12 + + addps %xmm12, %xmm13 + + movaps %xmm13, %xmm3 + pshufd $0xb1, %xmm13, %xmm2 + + xorps %xmm7, %xmm2 + + movaps -16 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -24 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -32 * SIZE(BO) + movaps %xmm11, -28 * SIZE(BO) + + movsd %xmm9, (CO1) + movhps %xmm9, (CO1, LDC) + movsd %xmm11, (CO2) + movhps %xmm11, (CO2, LDC) +#else + movlhps %xmm11, %xmm9 + movlhps %xmm15, %xmm13 + + movaps %xmm9, -32 * SIZE(AO) + movaps %xmm13, -28 * SIZE(AO) + + movlps %xmm9, (CO1) + movlps %xmm11, (CO1, LDC) + movlps %xmm13, (CO2) + movlps %xmm15, (CO2, LDC) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L30: + testq $2, N + BRANCH + jle .L50 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + sarq $1, I + NOBRANCH + jle .L40 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO2) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $-16 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + ADD1 %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + ADD1 %xmm3, %xmm10 + ADD2 %xmm4, %xmm11 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 +#else + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 +#else + shufps $0xb1, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#endif + +#endif + + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + + shufps $0xd8, %xmm8, %xmm8 + shufps $0xd8, %xmm10, %xmm10 + + movaps %xmm8, %xmm9 + shufps $0xe4, %xmm10, %xmm8 + shufps $0xe4, %xmm9, %xmm10 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm9 + movlhps %xmm10, %xmm8 + movhlps %xmm9, %xmm10 + + movaps -32 * SIZE(BO), %xmm9 + movaps -28 * SIZE(BO), %xmm11 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 +#else + movaps -32 * SIZE(AO), %xmm9 + movaps -28 * SIZE(AO), %xmm11 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#ifdef LN + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + + subps %xmm3, %xmm9 + subps %xmm2, %xmm9 + + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + + subps %xmm3, %xmm11 + subps %xmm2, %xmm11 + + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 +#endif + +#ifdef RT + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -32 * SIZE(BO) + movaps %xmm11, -28 * SIZE(BO) + + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhps %xmm9, 0 * SIZE(CO2) + movhps %xmm11, 2 * SIZE(CO2) +#else + movaps %xmm9, -32 * SIZE(AO) + movaps %xmm11, -28 * SIZE(AO) + + movsd %xmm9, 0 * SIZE(CO1) + movhps %xmm9, 2 * SIZE(CO1) + movsd %xmm11, 0 * SIZE(CO2) + movhps %xmm11, 2 * SIZE(CO2) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L31 + ALIGN_4 + +.L40: + testq $1, M + BRANCH + jle .L49 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -26 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -16 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -24 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_3 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm9 + shufps $0xb1, %xmm9, %xmm9 +#else + xorps %xmm0, %xmm8 + shufps $0xb1, %xmm9, %xmm9 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm9 + shufps $0xb1, %xmm9, %xmm9 +#else + shufps $0xb1, %xmm9, %xmm9 + xorps %xmm0, %xmm9 +#endif + +#endif + + addps %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm9 + + subps %xmm8, %xmm9 +#else + movaps -32 * SIZE(AO), %xmm9 + + subps %xmm8, %xmm9 + movhlps %xmm9, %xmm11 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 +#endif + +#ifdef RT + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -32 * SIZE(BO) + + movlps %xmm9, (CO1) + movhps %xmm9, (CO2) +#else + movlps %xmm9, -32 * SIZE(AO) + movlps %xmm11, -30 * SIZE(AO) + + movlps %xmm9, (CO1) + movlps %xmm11, (CO2) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L49: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L50: + testq $1, N + BRANCH + jle .L999 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + sarq $1, I + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm8 + movddup -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movddup -30 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movddup -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movddup -26 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AO), %xmm0 + + subq $ -8 * SIZE, BO + subq $-16 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L52 + ALIGN_3 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + ADD1 %xmm1, %xmm8 + movddup -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_3 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm8 +#else + xorps %xmm0, %xmm9 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm8 +#else + shufps $0xb1, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 +#endif + +#endif + + haddps %xmm9, %xmm8 + + shufps $0xd8, %xmm8, %xmm8 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm9 + + subps %xmm8, %xmm9 + movhlps %xmm9, %xmm11 +#else + movaps -32 * SIZE(AO), %xmm9 + + subps %xmm8, %xmm9 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#ifdef LN + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + + subps %xmm3, %xmm9 + subps %xmm2, %xmm9 + + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + + subps %xmm3, %xmm11 + subps %xmm2, %xmm11 + + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm9, -32 * SIZE(BO) + movlps %xmm11, -30 * SIZE(BO) + + movlps %xmm9, 0 * SIZE(CO1) + movlps %xmm11, 2 * SIZE(CO1) +#else + movaps %xmm9, -32 * SIZE(AO) + + movlps %xmm9, 0 * SIZE(CO1) + movhps %xmm9, 2 * SIZE(CO1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L69 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd -32 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -30 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -26 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -26 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -24 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L62 + ALIGN_3 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -30 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_3 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm9 + shufps $0xb1, %xmm9, %xmm9 +#else + xorps %xmm0, %xmm8 + shufps $0xb1, %xmm9, %xmm9 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm9 + shufps $0xb1, %xmm9, %xmm9 +#else + shufps $0xb1, %xmm9, %xmm9 + xorps %xmm0, %xmm9 +#endif + +#endif + + addps %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BO), %xmm9 +#else + movsd -32 * SIZE(AO), %xmm9 +#endif + + subps %xmm8, %xmm9 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(AO), %xmm5 +#endif + +#if defined(RN) || defined(RT) + movsd -32 * SIZE(BO), %xmm5 +#endif + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm9, -32 * SIZE(BO) +#else + movlps %xmm9, -32 * SIZE(AO) +#endif + + movlps %xmm9, (CO1) + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L69: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S new file mode 100644 index 0000000..7375c34 --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S @@ -0,0 +1,4004 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define POSINV 0(%rsp) +#define OFFSET 16(%rsp) +#define KK 24(%rsp) +#define KKK 32(%rsp) +#define AORIG 40(%rsp) +#define BORIG 48(%rsp) +#define BUFFER 128(%rsp) + +#ifdef OPTERON +#define movsd movlps +#endif + +#if defined(PENTIUM4) || defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(ATOM) || defined(NANO) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#define KERNEL1(xx) \ + mulps %xmm8, %xmm9 ;\ + addps %xmm9, %xmm0 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulps %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addps %xmm11, %xmm1 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm8, %xmm13 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addps %xmm13, %xmm2 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm8, %xmm3 ;\ + movaps 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL2(xx) \ + mulps %xmm10, %xmm9 ;\ + addps %xmm9, %xmm4 ;\ + movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulps %xmm10, %xmm11 ;\ + addps %xmm11, %xmm5 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm10, %xmm13 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addps %xmm13, %xmm6 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm10, %xmm7 ;\ + movaps 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL3(xx) \ + mulps %xmm12, %xmm15 ;\ + addps %xmm15, %xmm0 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulps %xmm12, %xmm11 ;\ + addps %xmm11, %xmm1 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm12, %xmm13 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addps %xmm13, %xmm2 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm12, %xmm3 ;\ + movaps 24 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL4(xx) \ + mulps %xmm14, %xmm15 ;\ + addps %xmm15, %xmm4 ;\ + movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulps %xmm14, %xmm11 ;\ + addps %xmm11, %xmm5 ;\ + movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm14, %xmm13 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addps %xmm13, %xmm6 ;\ + movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm14, %xmm7 ;\ + movaps 28 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + +#define KERNEL5(xx) \ + mulps %xmm8, %xmm9 ;\ + addps %xmm9, %xmm0 ;\ + movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulps %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addps %xmm11, %xmm1 ;\ + movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm8, %xmm13 ;\ + mulps 44 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addps %xmm13, %xmm2 ;\ + movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm8, %xmm3 ;\ + movaps 32 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL6(xx) \ + mulps %xmm10, %xmm9 ;\ + addps %xmm9, %xmm4 ;\ + movaps 64 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulps %xmm10, %xmm11 ;\ + addps %xmm11, %xmm5 ;\ + movaps 52 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm10, %xmm13 ;\ + mulps 44 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addps %xmm13, %xmm6 ;\ + movaps 56 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm10, %xmm7 ;\ + movaps 36 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL7(xx) \ + mulps %xmm12, %xmm15 ;\ + addps %xmm15, %xmm0 ;\ + movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulps %xmm12, %xmm11 ;\ + addps %xmm11, %xmm1 ;\ + movaps 52 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm12, %xmm13 ;\ + mulps 60 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addps %xmm13, %xmm2 ;\ + movaps 56 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm12, %xmm3 ;\ + movaps 40 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL8(xx) \ + mulps %xmm14, %xmm15 ;\ + addps %xmm15, %xmm4 ;\ + movaps 80 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulps %xmm14, %xmm11 ;\ + addps %xmm11, %xmm5 ;\ + movaps 68 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm14, %xmm13 ;\ + mulps 60 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addps %xmm13, %xmm6 ;\ + movaps 72 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm14, %xmm7 ;\ + movaps 44 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 +#else + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + pxor %xmm15, %xmm15 + cmpeqps %xmm15, %xmm15 + pslld $31, %xmm15 # Generate mask + pxor %xmm2, %xmm2 + +#ifndef CONJ + movss %xmm15, 0 + POSINV + movss %xmm2, 4 + POSINV + movss %xmm15, 8 + POSINV + movss %xmm2, 12 + POSINV +#else + movss %xmm2, 0 + POSINV + movss %xmm15, 4 + POSINV + movss %xmm2, 8 + POSINV + movss %xmm15, 12 + POSINV +#endif + + movlpd %xmm4, OFFSET + movlpd %xmm4, KK + + salq $ZBASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $1, J # j = (n >> 2) + jle .L40 + ALIGN_4 + +.L01: +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + movaps 8 * SIZE(B), %xmm3 + movaps 12 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 32 * SIZE(BO) + movaps %xmm1, 36 * SIZE(BO) + movaps %xmm2, 40 * SIZE(BO) + movaps %xmm3, 44 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 48 * SIZE(BO) + movaps %xmm5, 52 * SIZE(BO) + movaps %xmm6, 56 * SIZE(BO) + movaps %xmm7, 60 * SIZE(BO) + + addq $16 * SIZE, B + addq $64 * SIZE, BO + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 + +#ifndef RT + leaq (C, LDC, 2), C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(BO), %xmm9 + movaps 4 * SIZE(BO), %xmm11 + movaps 8 * SIZE(BO), %xmm13 + movaps 16 * SIZE(BO), %xmm15 + + movaps 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movaps 4 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movaps 8 * SIZE(AO), %xmm12 + pxor %xmm2, %xmm2 + movaps 12 * SIZE(AO), %xmm14 + pxor %xmm3, %xmm3 + + PREFETCHW 7 * SIZE(CO1) + pxor %xmm4, %xmm4 + PREFETCHW 7 * SIZE(CO2) + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-8, %rax + salq $4, %rax + je .L15 +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + + addq $32 * 2 * SIZE, AO + addq $64 * 2 * SIZE, BO + subq $64 * 2, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm5 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 12 * SIZE(BO), %xmm10 + addps %xmm9, %xmm6 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm10, %xmm7 + movaps 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L16 + ALIGN_4 + +.L18: + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 + xorps %xmm15, %xmm5 + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 + xorps %xmm15, %xmm4 + xorps %xmm15, %xmm6 +#endif +#else + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 + xorps %xmm15, %xmm5 + xorps %xmm15, %xmm7 +#endif + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm1 + unpcklpd %xmm2, %xmm0 + unpckhpd %xmm2, %xmm1 + + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + + movaps 0 * SIZE(B), %xmm2 + movaps 4 * SIZE(B), %xmm3 + movaps 8 * SIZE(B), %xmm6 + movaps 12 * SIZE(B), %xmm7 + + subps %xmm0, %xmm2 + subps %xmm1, %xmm3 + subps %xmm4, %xmm6 + subps %xmm5, %xmm7 +#else + movaps 0 * SIZE(AO), %xmm1 + movaps 4 * SIZE(AO), %xmm3 + movaps 8 * SIZE(AO), %xmm5 + movaps 12 * SIZE(AO), %xmm7 + + subps %xmm0, %xmm1 + subps %xmm4, %xmm3 + subps %xmm2, %xmm5 + subps %xmm6, %xmm7 +#endif + +#ifdef LN + movaps 28 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm7 + addps %xmm0, %xmm7 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + movaps 24 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 20 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm6 + addps %xmm0, %xmm6 + + movaps 16 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 8 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 8 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + movaps 12 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 20 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm6 + addps %xmm0, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 28 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm7 + addps %xmm0, %xmm7 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm1 + + pshufd $0xa0, %xmm3, %xmm2 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm9, %xmm2 + mulps %xmm10, %xmm1 + mulps %xmm10, %xmm3 + + addps %xmm0, %xmm1 + addps %xmm2, %xmm3 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm2 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm2 + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm9, %xmm4 + + mulps %xmm10, %xmm2 + mulps %xmm10, %xmm6 + + subps %xmm0, %xmm5 + subps %xmm4, %xmm7 + + subps %xmm2, %xmm5 + subps %xmm6, %xmm7 + + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm4 + pshufd $0xf5, %xmm5, %xmm5 + + pshufd $0xa0, %xmm7, %xmm6 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm5 + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm4 + xorps %xmm15, %xmm6 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm9, %xmm6 + mulps %xmm10, %xmm5 + mulps %xmm10, %xmm7 + + addps %xmm4, %xmm5 + addps %xmm6, %xmm7 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm5 + + pshufd $0xa0, %xmm7, %xmm2 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm5 + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm9, %xmm2 + mulps %xmm10, %xmm5 + mulps %xmm10, %xmm7 + + addps %xmm0, %xmm5 + addps %xmm2, %xmm7 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm2 + + pshufd $0xa0, %xmm7, %xmm4 + pshufd $0xf5, %xmm7, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm2 + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm9, %xmm4 + + mulps %xmm10, %xmm2 + mulps %xmm10, %xmm6 + + subps %xmm0, %xmm1 + subps %xmm4, %xmm3 + + subps %xmm2, %xmm1 + subps %xmm6, %xmm3 + + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + + pshufd $0xa0, %xmm3, %xmm6 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm4 + xorps %xmm15, %xmm6 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm9, %xmm6 + mulps %xmm10, %xmm1 + mulps %xmm10, %xmm3 + + addps %xmm4, %xmm1 + addps %xmm6, %xmm3 + +#endif + +#ifdef LN + subq $8 * SIZE, CO1 + subq $8 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + movaps %xmm6, 8 * SIZE(B) + movaps %xmm7, 12 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm5, 12 * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm5 + + movaps %xmm0, 16 * SIZE(BO) + movaps %xmm1, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm5, 28 * SIZE(BO) + + pshufd $0x00, %xmm6, %xmm0 + pshufd $0x55, %xmm6, %xmm1 + pshufd $0xaa, %xmm6, %xmm4 + pshufd $0xff, %xmm6, %xmm5 + + movaps %xmm0, 32 * SIZE(BO) + movaps %xmm1, 36 * SIZE(BO) + movaps %xmm4, 40 * SIZE(BO) + movaps %xmm5, 44 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm1 + pshufd $0xaa, %xmm7, %xmm4 + pshufd $0xff, %xmm7, %xmm5 + + movaps %xmm0, 48 * SIZE(BO) + movaps %xmm1, 52 * SIZE(BO) + movaps %xmm4, 56 * SIZE(BO) + movaps %xmm5, 60 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movlps %xmm6, 4 * SIZE(CO1) + movlps %xmm7, 6 * SIZE(CO1) + + movhps %xmm2, 0 * SIZE(CO2) + movhps %xmm3, 2 * SIZE(CO2) + movhps %xmm6, 4 * SIZE(CO2) + movhps %xmm7, 6 * SIZE(CO2) +#else + movaps %xmm1, 0 * SIZE(AO) + movaps %xmm3, 4 * SIZE(AO) + movaps %xmm5, 8 * SIZE(AO) + movaps %xmm7, 12 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm3, 4 * SIZE(CO1) + movhps %xmm3, 6 * SIZE(CO1) + + movlps %xmm5, 0 * SIZE(CO2) + movhps %xmm5, 2 * SIZE(CO2) + movlps %xmm7, 4 * SIZE(CO2) + movhps %xmm7, 6 * SIZE(CO2) +#endif + + +#ifndef LN + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + je .L30 + +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 4 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + mulps 44 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps 12 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + mulps 60 * SIZE(BO), %xmm8 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 76 * SIZE(BO), %xmm10 + addps %xmm9, %xmm2 + movaps 128 * SIZE(BO), %xmm9 + addps %xmm10, %xmm3 + movaps 20 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + mulps 92 * SIZE(BO), %xmm10 + addps %xmm11, %xmm2 + movaps 144 * SIZE(BO), %xmm11 + addps %xmm10, %xmm3 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + mulps 108 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 160 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps 28 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + mulps 124 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 176 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $128 * SIZE, BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 4 * SIZE(AO), %xmm8 + + addq $ 4 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L28: + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 +#endif +#else + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#endif + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm1 + unpcklpd %xmm2, %xmm0 + unpckhpd %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm2 + movaps 4 * SIZE(B), %xmm3 + + subps %xmm0, %xmm2 + subps %xmm1, %xmm3 +#else + movaps 0 * SIZE(AO), %xmm1 + movaps 4 * SIZE(AO), %xmm5 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 +#endif + + +#ifdef LN + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + + addps %xmm0, %xmm1 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + + subps %xmm0, %xmm5 + subps %xmm2, %xmm5 + + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm4 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm15, %xmm5 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm5 + + addps %xmm4, %xmm5 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm15, %xmm5 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm5 + + addps %xmm0, %xmm5 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm1 + + addps %xmm4, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm5, 12 * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm5 + + movaps %xmm0, 16 * SIZE(BO) + movaps %xmm1, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm5, 28 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO2) + movhps %xmm3, 2 * SIZE(CO2) +#else + movaps %xmm1, 0 * SIZE(AO) + movaps %xmm5, 4 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + + movlps %xmm5, 0 * SIZE(CO2) + movhps %xmm5, 2 * SIZE(CO2) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movlps 0 * SIZE(AO), %xmm8 + movhps 2 * SIZE(AO), %xmm8 + movlps 8 * SIZE(AO), %xmm10 + movhps 10 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm2 + movaps 76 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm9, %xmm3 + movaps 128 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movaps 92 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movaps 144 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 108 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 160 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 124 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 176 * SIZE(BO), %xmm15 + + addq $16 * SIZE, AO + addq $128 * SIZE, BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 16 * SIZE(BO), %xmm9 + + + addq $ 2 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 +#endif +#else + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#endif + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm2, %xmm0 + + movaps 0 * SIZE(B), %xmm2 + + subps %xmm0, %xmm2 +#else +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AO), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(AO), %xmm5 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 +#endif + + +#ifdef LN + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + + addps %xmm0, %xmm1 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + + subps %xmm0, %xmm5 + subps %xmm2, %xmm5 + + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm4 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm15, %xmm5 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm5 + + addps %xmm4, %xmm5 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm15, %xmm5 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm5 + + addps %xmm0, %xmm5 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm1 + + addps %xmm4, %xmm1 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm5, 12 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO2) +#else + movlps %xmm1, 0 * SIZE(AO) + movlps %xmm5, 2 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm5, 0 * SIZE(CO2) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2 * COMPSIZE), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2 * COMPSIZE), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $1, N + je .L999 + ALIGN_4 + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L43 + ALIGN_4 + +.L42: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + decq %rax + jne .L42 + ALIGN_4 + +.L43: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L44: + movsd 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + addq $2 * SIZE, B + addq $8 * SIZE, BO + decq %rax + jne .L44 + ALIGN_4 + +.L50: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c + +#ifndef RT + addq LDC, C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + PREFETCHW 4 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 12 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 64 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 16 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps 20 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps 28 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 80 * SIZE(AO), %xmm10 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 32 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 36 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 40 * SIZE(AO), %xmm12 + + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 44 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 96 * SIZE(AO), %xmm12 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 48 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 52 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 56 * SIZE(AO), %xmm14 + + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 60 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 112 * SIZE(AO), %xmm14 + + addq $64 * SIZE, AO + addq $64 * SIZE, BO + + + decq %rax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 8 * SIZE(AO), %xmm8 + + addq $ 8 * SIZE, AO # aoffset += 4 + addq $ 8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L56 + ALIGN_4 + +.L58: + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm5, %xmm5 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm5 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm4 +#endif +#else + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm5 +#endif + + addps %xmm1, %xmm0 + addps %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm1 + unpcklpd %xmm2, %xmm0 + unpckhpd %xmm2, %xmm1 + + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 +#ifdef movsd + xorps %xmm6, %xmm6 +#endif + movsd 4 * SIZE(B), %xmm6 +#ifdef movsd + xorps %xmm7, %xmm7 +#endif + movsd 6 * SIZE(B), %xmm7 + + subps %xmm0, %xmm2 + subps %xmm1, %xmm3 + subps %xmm4, %xmm6 + subps %xmm5, %xmm7 +#else + movaps 0 * SIZE(AO), %xmm1 + movaps 4 * SIZE(AO), %xmm3 + + subps %xmm0, %xmm1 + subps %xmm4, %xmm3 +#endif + +#ifdef LN + movaps 28 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm7 + addps %xmm0, %xmm7 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + movaps 24 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 20 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm6 + addps %xmm0, %xmm6 + + movaps 16 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 8 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 8 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + movaps 12 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 20 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm6 + addps %xmm0, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 28 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm7 + addps %xmm0, %xmm7 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + + pshufd $0xa0, %xmm3, %xmm6 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm4 + xorps %xmm15, %xmm6 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm9, %xmm6 + mulps %xmm10, %xmm1 + mulps %xmm10, %xmm3 + + addps %xmm4, %xmm1 + addps %xmm6, %xmm3 +#endif + +#ifdef LN + subq $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + movlps %xmm6, 4 * SIZE(B) + movlps %xmm7, 6 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 8 * SIZE(BO) + movaps %xmm1, 12 * SIZE(BO) + + pshufd $0x00, %xmm6, %xmm0 + pshufd $0x55, %xmm6, %xmm1 + + movaps %xmm0, 16 * SIZE(BO) + movaps %xmm1, 20 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm1 + + movaps %xmm0, 24 * SIZE(BO) + movaps %xmm1, 28 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movlps %xmm6, 4 * SIZE(CO1) + movlps %xmm7, 6 * SIZE(CO1) +#else + movaps %xmm1, 0 * SIZE(AO) + movaps %xmm3, 4 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm3, 4 * SIZE(CO1) + movhps %xmm3, 6 * SIZE(CO1) +#endif + +#ifndef LN + addq $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 20 * SIZE(BO), %xmm8 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm8, %xmm1 + movaps 12 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm13 + mulps 36 * SIZE(BO), %xmm10 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm10, %xmm1 + movaps 20 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm13 + mulps 44 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 52 * SIZE(BO), %xmm10 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm10, %xmm1 + movaps 28 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm15 + mulps 60 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L68: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + shufps $0xb1, %xmm1, %xmm1 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif +#else + xorps %xmm15, %xmm1 +#endif + + addps %xmm1, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm1 + unpcklpd %xmm2, %xmm0 + unpckhpd %xmm2, %xmm1 + +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 + + subps %xmm0, %xmm2 + subps %xmm1, %xmm3 +#else + movaps 0 * SIZE(AO), %xmm1 + subps %xmm0, %xmm1 +#endif + +#ifdef LN + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm1 + + addps %xmm4, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 8 * SIZE(BO) + movaps %xmm1, 12 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) +#else + movaps %xmm1, 0 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + testq $1, M + je .L79 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + movhps 2 * SIZE(AO), %xmm8 + movsd 8 * SIZE(AO), %xmm10 + movhps 10 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + shufps $0xb1, %xmm1, %xmm1 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif +#else + xorps %xmm15, %xmm1 +#endif + + addps %xmm1, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 + + subps %xmm0, %xmm2 +#else +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AO), %xmm1 + + subps %xmm0, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm1 + + addps %xmm4, %xmm1 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) +#else + movlps %xmm1, 0 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, COMPSIZE), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, COMPSIZE), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq %rbx, %rsp + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_RT_1x4_nehalem.S b/kernel/x86_64/ztrsm_kernel_RT_1x4_nehalem.S new file mode 100644 index 0000000..451aafa --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_RT_1x4_nehalem.S @@ -0,0 +1,1586 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCHSIZE (8 * 1 + 2) +#define PREFETCH prefetcht0 + +#define ADD1 addpd +#define ADD2 addpd + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + salq $ZBASE_SHIFT, LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, KK + subq OFFSET, KK +#endif + + testq M, M + jle .L999 + + testq $1, N + BRANCH + jle .L20 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm10 + movaps -14 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L32 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + ALIGN_3 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + je .L38 + ALIGN_3 + +.L36: + ADD1 %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm8 +#else + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm9 +#endif + +#else + +#ifndef CONJ + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm8 +#else + shufps $0x04, %xmm0, %xmm0 + xorps %xmm0, %xmm9 +#endif + +#endif + + haddpd %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + + subpd %xmm8, %xmm9 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm9 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0x04, %xmm7, %xmm7 +#else + shufps $0x40, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + addpd %xmm8, %xmm9 +#endif + +#if defined(RN) || defined(RT) + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + decq I + BRANCH + jg .L31 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L20: + testq $2, N + BRANCH + jle .L30 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + ALIGN_4 + +.L21: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO2) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -12 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -8 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -6 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -10 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + ADD1 %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + ADD1 %xmm3, %xmm10 + ADD2 %xmm4, %xmm11 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 +#else + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#endif + +#else + +#ifndef CONJ + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 +#else + shufps $0x04, %xmm0, %xmm0 + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#endif + +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm11, %xmm10 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0x04, %xmm7, %xmm7 +#else + shufps $0x40, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + addpd %xmm8, %xmm9 + + movddup -14 * SIZE(BO), %xmm2 + movddup -13 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm9, %xmm2 + mulpd %xmm8, %xmm3 + subpd %xmm2, %xmm11 + subpd %xmm3, %xmm11 + + movddup -10 * SIZE(BO), %xmm0 + movddup -9 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + addpd %xmm10, %xmm11 +#endif + +#ifdef RT + movddup -10 * SIZE(BO), %xmm0 + movddup -9 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + addpd %xmm10, %xmm11 + + movddup -12 * SIZE(BO), %xmm2 + movddup -11 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm11, %xmm2 + mulpd %xmm10, %xmm3 + subpd %xmm2, %xmm9 + subpd %xmm3, %xmm9 + + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm11, 0 * SIZE(CO2) + movhpd %xmm11, 1 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L21 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L30: + movq N, J + sarq $2, J + NOBRANCH + jle .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + + movq K, %rax + salq $ZBASE_SHIFT + 2, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + prefetchnta -16 * SIZE(BB) + subq $-8 * SIZE, BB + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO1, LDC) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + xorps %xmm12, %xmm12 + prefetcht0 2 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht0 2 * SIZE(CO2, LDC) + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + movaps -14 * SIZE(AO), %xmm0 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps -6 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm12 + movaps 0 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + movaps -12 * SIZE(AO), %xmm0 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps 2 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps 4 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps 6 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -10 * SIZE(AO), %xmm0 + ADD1 %xmm1, %xmm12 + movaps 8 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps 10 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps 12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps 14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-32 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + ADD1 %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + ADD1 %xmm1, %xmm12 + ADD2 %xmm2, %xmm13 + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + xorps %xmm0, %xmm12 + xorps %xmm0, %xmm14 +#else + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + xorps %xmm0, %xmm13 + xorps %xmm0, %xmm15 +#endif + +#else + +#ifndef CONJ + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + xorps %xmm0, %xmm12 + xorps %xmm0, %xmm14 +#else + shufps $0x04, %xmm0, %xmm0 + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + xorps %xmm0, %xmm13 + xorps %xmm0, %xmm15 +#endif + +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm11, %xmm10 + haddpd %xmm13, %xmm12 + haddpd %xmm15, %xmm14 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm11 + movapd -12 * SIZE(AO), %xmm13 + movapd -10 * SIZE(AO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0x04, %xmm7, %xmm7 +#else + shufps $0x40, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 + addpd %xmm12, %xmm13 + addpd %xmm14, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + addpd %xmm8, %xmm9 + + movddup -14 * SIZE(BO), %xmm2 + movddup -13 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm9, %xmm2 + mulpd %xmm8, %xmm3 + subpd %xmm2, %xmm11 + subpd %xmm3, %xmm11 + + movddup -12 * SIZE(BO), %xmm0 + movddup -11 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm1 + subpd %xmm0, %xmm13 + subpd %xmm1, %xmm13 + + movddup -10 * SIZE(BO), %xmm2 + movddup -9 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm9, %xmm2 + mulpd %xmm8, %xmm3 + subpd %xmm2, %xmm15 + subpd %xmm3, %xmm15 + + movddup -6 * SIZE(BO), %xmm0 + movddup -5 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + addpd %xmm10, %xmm11 + + movddup -4 * SIZE(BO), %xmm0 + movddup -3 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm11, %xmm0 + mulpd %xmm10, %xmm1 + subpd %xmm0, %xmm13 + subpd %xmm1, %xmm13 + + movddup -2 * SIZE(BO), %xmm2 + movddup -1 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm11, %xmm2 + mulpd %xmm10, %xmm3 + subpd %xmm2, %xmm15 + subpd %xmm3, %xmm15 + + movddup 4 * SIZE(BO), %xmm0 + movddup 5 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm13, %xmm12 + xorpd %xmm7, %xmm12 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + addpd %xmm12, %xmm13 + + movddup 6 * SIZE(BO), %xmm2 + movddup 7 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm13, %xmm12 + xorpd %xmm7, %xmm12 + mulpd %xmm13, %xmm2 + mulpd %xmm12, %xmm3 + subpd %xmm2, %xmm15 + subpd %xmm3, %xmm15 + + movddup 14 * SIZE(BO), %xmm0 + movddup 15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm15, %xmm14 + xorpd %xmm7, %xmm14 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + addpd %xmm14, %xmm15 +#endif + +#ifdef RT + movddup 14 * SIZE(BO), %xmm0 + movddup 15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm15, %xmm14 + xorpd %xmm7, %xmm14 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + addpd %xmm14, %xmm15 + + movddup 12 * SIZE(BO), %xmm2 + movddup 13 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm15, %xmm14 + xorpd %xmm7, %xmm14 + mulpd %xmm15, %xmm2 + mulpd %xmm14, %xmm3 + subpd %xmm2, %xmm13 + subpd %xmm3, %xmm13 + + movddup 10 * SIZE(BO), %xmm0 + movddup 11 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm15, %xmm14 + xorpd %xmm7, %xmm14 + mulpd %xmm15, %xmm0 + mulpd %xmm14, %xmm1 + subpd %xmm0, %xmm11 + subpd %xmm1, %xmm11 + + movddup 8 * SIZE(BO), %xmm2 + movddup 9 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm15, %xmm14 + xorpd %xmm7, %xmm14 + mulpd %xmm15, %xmm2 + mulpd %xmm14, %xmm3 + subpd %xmm2, %xmm9 + subpd %xmm3, %xmm9 + + movddup 4 * SIZE(BO), %xmm0 + movddup 5 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm13, %xmm12 + xorpd %xmm7, %xmm12 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + addpd %xmm12, %xmm13 + + movddup 2 * SIZE(BO), %xmm0 + movddup 3 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm13, %xmm12 + xorpd %xmm7, %xmm12 + mulpd %xmm13, %xmm0 + mulpd %xmm12, %xmm1 + subpd %xmm0, %xmm11 + subpd %xmm1, %xmm11 + + movddup 0 * SIZE(BO), %xmm2 + movddup 1 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm13, %xmm12 + xorpd %xmm7, %xmm12 + mulpd %xmm13, %xmm2 + mulpd %xmm12, %xmm3 + subpd %xmm2, %xmm9 + subpd %xmm3, %xmm9 + + movddup -6 * SIZE(BO), %xmm0 + movddup -5 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + addpd %xmm10, %xmm11 + + movddup -8 * SIZE(BO), %xmm2 + movddup -7 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm11, %xmm2 + mulpd %xmm10, %xmm3 + subpd %xmm2, %xmm9 + subpd %xmm3, %xmm9 + + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm11, 0 * SIZE(CO1, LDC) + movhpd %xmm11, 1 * SIZE(CO1, LDC) + movsd %xmm13, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movsd %xmm15, 0 * SIZE(CO2, LDC) + movhpd %xmm15, 1 * SIZE(CO2, LDC) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) + movapd %xmm13, -12 * SIZE(BO) + movapd %xmm15, -10 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) + movapd %xmm13, -12 * SIZE(AO) + movapd %xmm15, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L11 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_RT_2x2_core2.S b/kernel/x86_64/ztrsm_kernel_RT_2x2_core2.S new file mode 100644 index 0000000..005b65e --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_RT_2x2_core2.S @@ -0,0 +1,2162 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define POSINV 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define AORIG 48(%rsp) +#define BORIG 56(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCH_W (PREFETCH_R) + +#define PREFETCHSIZE (8 * 17 + 2) +#define PREFETCH prefetcht0 + +#ifndef CONJ +#define NN +#else +#if defined(LN) || defined(LT) +#define CN +#else +#define NC +#endif +#endif + +#define ADD1 addpd +#define ADD2 addpd + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, %rax + + movq %rsp, %r15 # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq %rax, KK + movq %rax, OFFSET + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + pcmpeqb %xmm15, %xmm15 + psllq $63, %xmm15 # Generate mask + pxor %xmm2, %xmm2 + + movlpd %xmm2, 0 + POSINV + movlpd %xmm15, 8 + POSINV + + salq $ZBASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, N + jle .L100 + +.L101: +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L103 + ALIGN_4 + +.L102: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + movddup -14 * SIZE(B), %xmm10 + movddup -13 * SIZE(B), %xmm11 + movddup -12 * SIZE(B), %xmm12 + movddup -11 * SIZE(B), %xmm13 + movddup -10 * SIZE(B), %xmm14 + movddup -9 * SIZE(B), %xmm15 + + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + movapd %xmm10, 4 * SIZE(BO) + movapd %xmm11, 6 * SIZE(BO) + movapd %xmm12, 8 * SIZE(BO) + movapd %xmm13, 10 * SIZE(BO) + movapd %xmm14, 12 * SIZE(BO) + movapd %xmm15, 14 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-16 * SIZE, BO + decq %rax + jne .L102 + ALIGN_4 + +.L103: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + + addq $4 * SIZE, BO + addq $2 * SIZE, B + decq %rax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + prefetcht0 3 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L112 + +.L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -8 * SIZE(AO), %xmm0 + movapd -6 * SIZE(AO), %xmm1 + + movapd -8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -4 * SIZE(AO), %xmm0 + movapd -2 * SIZE(AO), %xmm1 + + movapd -4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm7 + andq $3, %rax # if (k & 1) + BRANCH + jle .L114 + +.L113: + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L113 + ALIGN_4 + +.L114: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm9 + xorpd %xmm7, %xmm13 +#else + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 + subpd %xmm13, %xmm12 +#else + addpd %xmm9, %xmm8 + addpd %xmm13, %xmm12 +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm13 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm7, %xmm7 +#endif + +#ifdef LN + movddup -10 * SIZE(AO), %xmm0 + movddup -9 * SIZE(AO), %xmm1 + movddup -12 * SIZE(AO), %xmm2 + movddup -11 * SIZE(AO), %xmm3 + movddup -16 * SIZE(AO), %xmm4 + movddup -15 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm12, %xmm13 + + movapd %xmm13, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm9 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + movddup -14 * SIZE(AO), %xmm2 + movddup -13 * SIZE(AO), %xmm3 + movddup -10 * SIZE(AO), %xmm4 + movddup -9 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm13 + subpd %xmm12, %xmm13 + + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + + addpd %xmm12, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm13, 2 * SIZE(CO1) + movhpd %xmm13, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm13, -14 * SIZE(B) + + movddup %xmm9, %xmm8 + unpckhpd %xmm9, %xmm9 + movddup %xmm13, %xmm12 + unpckhpd %xmm13, %xmm13 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm12, -12 * SIZE(BO) + movapd %xmm13, -10 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm13, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L110 + ALIGN_4 + +.L130: + testq $1, M + jle .L199 + ALIGN_4 + +.L140: +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L142 + +.L141: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + movapd -8 * SIZE(BO), %xmm2 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm4 + movapd -2 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L141 + +.L142: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm7 + + andq $3, %rax # if (k & 1) + BRANCH + jle .L144 + +.L143: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L143 + ALIGN_4 + +.L144: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + + SHUFPD_1 %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm9 +#else + xorpd %xmm7, %xmm8 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 +#else + addpd %xmm9, %xmm8 +#endif + + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm9 + + subpd %xmm8, %xmm9 +#else + movapd -16 * SIZE(AO), %xmm9 + + subpd %xmm8, %xmm9 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm7, %xmm7 +#endif + +#ifdef LN + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + + movddup %xmm9, %xmm8 + unpckhpd %xmm9, %xmm9 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L199: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1 * COMPSIZE), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L100: + movq N, J + sarq $1, J # j = (n >> 2) + jle .L999 + ALIGN_4 + +.L01: +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq 16 * SIZE + BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + + addq %rax, %rax + ALIGN_4 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + movddup -14 * SIZE(B), %xmm10 + movddup -13 * SIZE(B), %xmm11 + movddup -12 * SIZE(B), %xmm12 + movddup -11 * SIZE(B), %xmm13 + movddup -10 * SIZE(B), %xmm14 + movddup -9 * SIZE(B), %xmm15 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 8) * SIZE(BO) + + movapd %xmm12, -8 * SIZE(BO) + movapd %xmm13, -6 * SIZE(BO) + movapd %xmm14, -4 * SIZE(BO) + movapd %xmm15, -2 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-16 * SIZE, BO + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + movddup -14 * SIZE(B), %xmm10 + movddup -13 * SIZE(B), %xmm11 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $ 8 * SIZE, BO + + decq %rax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + +#ifndef RT + leaq (C, LDC, 2), C +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: + leaq (PREFETCH_R + 0) * SIZE(B), BB + +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + prefetcht2 0 * SIZE(BB) + +#ifdef LN + pxor %xmm8, %xmm8 + prefetcht1 -3 * SIZE(CO1) + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + prefetcht1 -3 * SIZE(CO2) + pxor %xmm11, %xmm11 +#else + pxor %xmm8, %xmm8 + prefetcht1 3 * SIZE(CO1) + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + prefetcht1 3 * SIZE(CO2) + pxor %xmm11, %xmm11 +#endif + + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + pxor %xmm14, %xmm14 + pxor %xmm15, %xmm15 + + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + + subq $-8 * SIZE, BB + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + ADD1 %xmm2, %xmm10 + movapd -16 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm14 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm11 + movapd -14 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm15 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + movapd -12 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm9 + movapd -10 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + movapd -12 * SIZE(AO), %xmm0 + ADD1 %xmm2, %xmm10 + movapd -8 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm14 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + movapd -6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + movapd -4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + movapd -2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + movapd -8 * SIZE(AO), %xmm0 + ADD1 %xmm2, %xmm10 + movapd 0 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm14 + movapd %xmm2, %xmm3 + movapd -6 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm11 + movapd 2 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm15 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + movapd 4 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm9 + movapd 6 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + movapd -4 * SIZE(AO), %xmm0 + ADD1 %xmm2, %xmm10 + ADD1 %xmm3, %xmm14 + movapd 8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + movapd -2 * SIZE(AO), %xmm1 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm11 + movapd 10 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm15 + subq $-32 * SIZE, BO + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + movapd -20 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + subq $-16 * SIZE, AO + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + movapd -18 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + subq $1, %rax + BRANCH + BRANCH + jg .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm7 + + andq $3, %rax + BRANCH + BRANCH + je .L19 + ALIGN_4 + +.L16: + ADD1 %xmm2, %xmm10 + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + movapd -16 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L19: + ADD1 %xmm2, %xmm10 + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm11, %xmm11 + SHUFPD_1 %xmm13, %xmm13 + SHUFPD_1 %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm9 + xorpd %xmm7, %xmm11 + xorpd %xmm7, %xmm13 + xorpd %xmm7, %xmm15 +#else + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 + subpd %xmm11, %xmm10 + subpd %xmm13, %xmm12 + subpd %xmm15, %xmm14 +#else + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm13, %xmm12 + addpd %xmm15, %xmm14 +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm11 + movapd -12 * SIZE(B), %xmm13 + movapd -10 * SIZE(B), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm13 + movapd -12 * SIZE(AO), %xmm11 + movapd -10 * SIZE(AO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm7, %xmm7 +#endif + +#ifdef LN + movddup -10 * SIZE(AO), %xmm0 + movddup -9 * SIZE(AO), %xmm1 + movddup -12 * SIZE(AO), %xmm2 + movddup -11 * SIZE(AO), %xmm3 + movddup -16 * SIZE(AO), %xmm4 + movddup -15 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + + addpd %xmm12, %xmm13 + addpd %xmm14, %xmm15 + + movapd %xmm13, %xmm8 + movapd %xmm15, %xmm10 + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm9 + subpd %xmm14, %xmm11 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + movddup -14 * SIZE(AO), %xmm2 + movddup -13 * SIZE(AO), %xmm3 + movddup -10 * SIZE(AO), %xmm4 + movddup -9 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 + + movapd %xmm9, %xmm8 + movapd %xmm11, %xmm10 + pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm11, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm13 + subpd %xmm10, %xmm15 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 + + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + mulpd %xmm4, %xmm15 + mulpd %xmm5, %xmm14 + + addpd %xmm12, %xmm13 + addpd %xmm14, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -10 * SIZE(B), %xmm4 + movddup -9 * SIZE(B), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 + + movapd %xmm9, %xmm8 + movapd %xmm13, %xmm10 + pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm13, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm11 + subpd %xmm10, %xmm15 + subpd %xmm12, %xmm11 + subpd %xmm14, %xmm15 + + pshufd $0x4e, %xmm11, %xmm10 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm14 + + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + mulpd %xmm4, %xmm15 + mulpd %xmm5, %xmm14 + + addpd %xmm10, %xmm11 + addpd %xmm14, %xmm15 +#endif + +#ifdef RT + movddup -10 * SIZE(B), %xmm0 + movddup -9 * SIZE(B), %xmm1 + movddup -12 * SIZE(B), %xmm2 + movddup -11 * SIZE(B), %xmm3 + movddup -16 * SIZE(B), %xmm4 + movddup -15 * SIZE(B), %xmm5 + + pshufd $0x4e, %xmm11, %xmm10 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm14 + + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + + addpd %xmm10, %xmm11 + addpd %xmm14, %xmm15 + + movapd %xmm11, %xmm8 + movapd %xmm15, %xmm10 + pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm13 + subpd %xmm12, %xmm9 + subpd %xmm14, %xmm13 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm13, 2 * SIZE(CO1) + movhpd %xmm13, 3 * SIZE(CO1) + + movsd %xmm11, 0 * SIZE(CO2) + movhpd %xmm11, 1 * SIZE(CO2) + movsd %xmm15, 2 * SIZE(CO2) + movhpd %xmm15, 3 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + movapd %xmm13, -12 * SIZE(B) + movapd %xmm15, -10 * SIZE(B) + + movddup %xmm9, %xmm8 + unpckhpd %xmm9, %xmm9 + movddup %xmm11, %xmm10 + unpckhpd %xmm11, %xmm11 + movddup %xmm13, %xmm12 + unpckhpd %xmm13, %xmm13 + movddup %xmm15, %xmm14 + unpckhpd %xmm15, %xmm15 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + movapd %xmm12, -8 * SIZE(BO) + movapd %xmm13, -6 * SIZE(BO) + movapd %xmm14, -4 * SIZE(BO) + movapd %xmm15, -2 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm13, -14 * SIZE(AO) + movapd %xmm11, -12 * SIZE(AO) + movapd %xmm15, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L10 + ALIGN_4 + +.L30: + testq $1, M + jle .L99 + +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + addq %rax, AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L42 + +.L41: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -14 * SIZE(AO), %xmm0 + movapd -8 * SIZE(BO), %xmm2 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm4 + movapd -2 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -12 * SIZE(AO), %xmm0 + movapd 0 * SIZE(BO), %xmm2 + movapd 2 * SIZE(BO), %xmm3 + movapd 4 * SIZE(BO), %xmm4 + movapd 6 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -10 * SIZE(AO), %xmm0 + movapd 8 * SIZE(BO), %xmm2 + movapd 10 * SIZE(BO), %xmm3 + movapd 12 * SIZE(BO), %xmm4 + movapd 14 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jne .L41 + +.L42: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm7 + + andq $3, %rax # if (k & 1) + BRANCH + jle .L44 + +.L43: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L43 + ALIGN_4 + +.L44: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm9 + xorpd %xmm7, %xmm11 +#else + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 + subpd %xmm11, %xmm10 +#else + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -10 * SIZE(B), %xmm4 + movddup -9 * SIZE(B), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm11 + subpd %xmm12, %xmm11 + + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm10 + + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + + addpd %xmm10, %xmm11 +#endif + +#ifdef RT + movddup -10 * SIZE(B), %xmm0 + movddup -9 * SIZE(B), %xmm1 + movddup -12 * SIZE(B), %xmm2 + movddup -11 * SIZE(B), %xmm3 + movddup -16 * SIZE(B), %xmm4 + movddup -15 * SIZE(B), %xmm5 + + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm10, %xmm11 + + movapd %xmm11, %xmm8 + pshufd $0x4e, %xmm11, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm9 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + + movsd %xmm11, 0 * SIZE(CO2) + movhpd %xmm11, 1 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + + movddup %xmm9, %xmm8 + unpckhpd %xmm9, %xmm9 + movddup %xmm11, %xmm10 + unpckhpd %xmm11, %xmm11 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) + +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2 * COMPSIZE), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J # j -- + jg .L01 + ALIGN_4 + +.L999: + movq %r15, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86_64/ztrsm_kernel_RT_2x2_penryn.S new file mode 100644 index 0000000..4ed789a --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_RT_2x2_penryn.S @@ -0,0 +1,2010 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCHSIZE (8 * 21 + 6) +#define PREFETCH prefetcht0 + +#define ADD1 addpd +#define ADD2 addpd + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + salq $ZBASE_SHIFT, LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, KK + subq OFFSET, KK +#endif + + testq $1, N + BRANCH + jle .L40 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -16 * SIZE(BO), %xmm2 + + prefetcht0 3 * SIZE(CO1) + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_4 + +.L52: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -10 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + pshufd $0x40, %xmm7, %xmm0 + shufps $0x04, %xmm7, %xmm7 + + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm12 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm7, %xmm0 +#else + pshufd $0x04, %xmm7, %xmm0 +#endif + shufps $0x40, %xmm7, %xmm7 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm13 +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm13, %xmm12 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm13 +#endif + +#ifdef LN + movddup -10 * SIZE(AO), %xmm0 + movddup -9 * SIZE(AO), %xmm1 + movddup -12 * SIZE(AO), %xmm2 + movddup -11 * SIZE(AO), %xmm3 + movddup -16 * SIZE(AO), %xmm4 + movddup -15 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm12, %xmm13 + + movapd %xmm13, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm9 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + movddup -14 * SIZE(AO), %xmm2 + movddup -13 * SIZE(AO), %xmm3 + movddup -10 * SIZE(AO), %xmm4 + movddup -9 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm13 + subpd %xmm12, %xmm13 + + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + + addpd %xmm12, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm13, 2 * SIZE(CO1) + movhpd %xmm13, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm13, -14 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm13, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L79 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + movaps -16 * SIZE(BO), %xmm2 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm7, %xmm9 + movaps -14 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm10 + ADD2 %xmm7, %xmm11 + movaps -12 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -10 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm7, %xmm9 + movaps -10 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm10 + ADD2 %xmm7, %xmm11 + movaps -8 * SIZE(BO), %xmm2 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm7, %xmm9 + movaps -14 * SIZE(BO), %xmm2 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + pshufd $0x40, %xmm7, %xmm0 + shufps $0x04, %xmm7, %xmm7 + + pxor %xmm0, %xmm8 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm7, %xmm0 +#else + pshufd $0x04, %xmm7, %xmm0 +#endif + shufps $0x40, %xmm7, %xmm7 + + pxor %xmm0, %xmm9 +#endif + + haddpd %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + + subpd %xmm8, %xmm9 +#else + movapd -16 * SIZE(AO), %xmm9 + + subpd %xmm8, %xmm9 +#endif + +#ifdef LN + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) +#endif + + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L40: + movq N, J + sarq $1, J + NOBRANCH + jle .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + + movq K, %rax + salq $ZBASE_SHIFT + 1, %rax + movq B, BB + subq %rax, BB + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + sarq $1, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + prefetcht2 -16 * SIZE(BB) + subq $-8 * SIZE, BB + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm3, %xmm3 + movaps -14 * SIZE(AO), %xmm1 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BO), %xmm2 + + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + +#ifdef LN + prefetcht0 -4 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + prefetcht0 -4 * SIZE(CO2) +#else + prefetcht0 3 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + prefetcht0 3 * SIZE(CO2) +#endif + movapd %xmm4, %xmm10 + movapd %xmm4, %xmm11 + + movapd %xmm4, %xmm12 + movapd %xmm4, %xmm13 + movapd %xmm4, %xmm14 + movapd %xmm4, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + ADD1 %xmm3, %xmm12 + movaps -14 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps -10 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps -6 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -4 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps -2 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 0 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps 2 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 4 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 6 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps 6 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 8 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 10 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps 10 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 12 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 14 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps 14 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 16 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + subq $-32 * SIZE, AO + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -14 * SIZE(AO), %xmm1 + + subq $-32 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + ADD1 %xmm3, %xmm12 + movaps -14 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $2, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + ADD1 %xmm3, %xmm12 + pcmpeqb %xmm7, %xmm7 + ADD1 %xmm4, %xmm14 + psllq $63, %xmm7 + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + +#ifndef CONJ + pshufd $0x40, %xmm7, %xmm0 + shufps $0x04, %xmm7, %xmm7 + + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm10 + pxor %xmm0, %xmm12 + pxor %xmm0, %xmm14 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm7, %xmm0 +#else + pshufd $0x04, %xmm7, %xmm0 +#endif + shufps $0x40, %xmm7, %xmm7 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 + pxor %xmm0, %xmm13 + pxor %xmm0, %xmm15 +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm11, %xmm10 + haddpd %xmm13, %xmm12 + haddpd %xmm15, %xmm14 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm11 + subpd %xmm10, %xmm13 + subpd %xmm14, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm13 + movapd -12 * SIZE(AO), %xmm11 + movapd -10 * SIZE(AO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm11 + subpd %xmm10, %xmm13 + subpd %xmm14, %xmm15 +#endif + +#ifdef LN + movddup -10 * SIZE(AO), %xmm0 + movddup -9 * SIZE(AO), %xmm1 + movddup -12 * SIZE(AO), %xmm2 + movddup -11 * SIZE(AO), %xmm3 + movddup -16 * SIZE(AO), %xmm4 + movddup -15 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + + addpd %xmm12, %xmm13 + addpd %xmm14, %xmm15 + + movapd %xmm13, %xmm8 + movapd %xmm15, %xmm10 + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm9 + subpd %xmm14, %xmm11 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + movddup -14 * SIZE(AO), %xmm2 + movddup -13 * SIZE(AO), %xmm3 + movddup -10 * SIZE(AO), %xmm4 + movddup -9 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 + + movapd %xmm9, %xmm8 + movapd %xmm11, %xmm10 + pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm11, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm13 + subpd %xmm10, %xmm15 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 + + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + mulpd %xmm4, %xmm15 + mulpd %xmm5, %xmm14 + + addpd %xmm12, %xmm13 + addpd %xmm14, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + movddup -14 * SIZE(BO), %xmm2 + movddup -13 * SIZE(BO), %xmm3 + movddup -10 * SIZE(BO), %xmm4 + movddup -9 * SIZE(BO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 + + movapd %xmm9, %xmm8 + movapd %xmm13, %xmm10 + pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm13, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm11 + subpd %xmm10, %xmm15 + subpd %xmm12, %xmm11 + subpd %xmm14, %xmm15 + + pshufd $0x4e, %xmm11, %xmm10 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm14 + + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + mulpd %xmm4, %xmm15 + mulpd %xmm5, %xmm14 + + addpd %xmm10, %xmm11 + addpd %xmm14, %xmm15 +#endif + +#ifdef RT + movddup -10 * SIZE(BO), %xmm0 + movddup -9 * SIZE(BO), %xmm1 + movddup -12 * SIZE(BO), %xmm2 + movddup -11 * SIZE(BO), %xmm3 + movddup -16 * SIZE(BO), %xmm4 + movddup -15 * SIZE(BO), %xmm5 + + pshufd $0x4e, %xmm11, %xmm10 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm14 + + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + + addpd %xmm10, %xmm11 + addpd %xmm14, %xmm15 + + movapd %xmm11, %xmm8 + movapd %xmm15, %xmm10 + pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm13 + subpd %xmm12, %xmm9 + subpd %xmm14, %xmm13 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm13, 2 * SIZE(CO1) + movhpd %xmm13, 3 * SIZE(CO1) + + movsd %xmm11, 0 * SIZE(CO2) + movhpd %xmm11, 1 * SIZE(CO2) + movsd %xmm15, 2 * SIZE(CO2) + movhpd %xmm15, 3 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) + movapd %xmm13, -12 * SIZE(BO) + movapd %xmm15, -10 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm13, -14 * SIZE(AO) + movapd %xmm11, -12 * SIZE(AO) + movapd %xmm15, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $1, M + BRANCH + jle .L39 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -16 * SIZE(BO), %xmm2 + movaps -14 * SIZE(BO), %xmm3 + + pxor %xmm3, %xmm3 + pxor %xmm5, %xmm5 + + movapd %xmm3, %xmm8 + movapd %xmm3, %xmm9 + movapd %xmm3, %xmm12 + movapd %xmm3, %xmm13 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_4 + +.L22: + ADD1 %xmm3, %xmm12 + movaps -14 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm3, %xmm12 + movaps -10 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + + ADD1 %xmm3, %xmm12 + movaps -6 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -4 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -10 * SIZE(AO), %xmm0 + + ADD1 %xmm3, %xmm12 + movaps -2 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + subq $ -8 * SIZE, AO + + ADD1 %xmm2, %xmm8 + movaps 0 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + ADD1 %xmm3, %xmm12 + movaps -14 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + ADD1 %xmm3, %xmm12 + pcmpeqb %xmm7, %xmm7 + ADD2 %xmm5, %xmm13 + psllq $63, %xmm7 + +#ifndef CONJ + pshufd $0x40, %xmm7, %xmm0 + shufps $0x04, %xmm7, %xmm7 + + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm12 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm7, %xmm0 +#else + pshufd $0x04, %xmm7, %xmm0 +#endif + shufps $0x40, %xmm7, %xmm7 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm13 +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm13, %xmm12 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm11 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + movddup -14 * SIZE(BO), %xmm2 + movddup -13 * SIZE(BO), %xmm3 + movddup -10 * SIZE(BO), %xmm4 + movddup -9 * SIZE(BO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm11 + subpd %xmm12, %xmm11 + + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm10 + + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + + addpd %xmm10, %xmm11 +#endif + +#ifdef RT + movddup -10 * SIZE(BO), %xmm0 + movddup -9 * SIZE(BO), %xmm1 + movddup -12 * SIZE(BO), %xmm2 + movddup -11 * SIZE(BO), %xmm3 + movddup -16 * SIZE(BO), %xmm4 + movddup -15 * SIZE(BO), %xmm5 + + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm10, %xmm11 + + movapd %xmm11, %xmm8 + pshufd $0x4e, %xmm11, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm9 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + + movsd %xmm11, 0 * SIZE(CO2) + movhpd %xmm11, 1 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S new file mode 100644 index 0000000..3ab9e5b --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S @@ -0,0 +1,2266 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define POSINV 0(%rsp) +#define ALPHA_R 16(%rsp) +#define ALPHA_I 32(%rsp) +#define OFFSET 40(%rsp) +#define KK 48(%rsp) +#define KKK 56(%rsp) +#define AORIG 64(%rsp) +#define BORIG 72(%rsp) +#define BUFFER 128(%rsp) + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#define KERNEL1(xx) \ + mulpd %xmm8, %xmm9 ;\ + addpd %xmm9, %xmm0 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm11, %xmm1 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm8, %xmm13 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm8, %xmm3 ;\ + movapd 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL2(xx) \ + mulpd %xmm10, %xmm9 ;\ + addpd %xmm9, %xmm4 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm10, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm10, %xmm13 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm10, %xmm7 ;\ + movapd 10 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL3(xx) \ + mulpd %xmm12, %xmm15 ;\ + addpd %xmm15, %xmm0 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm12, %xmm11 ;\ + addpd %xmm11, %xmm1 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm12, %xmm13 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm12, %xmm3 ;\ + movapd 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL4(xx) \ + mulpd %xmm14, %xmm15 ;\ + addpd %xmm15, %xmm4 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm14, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm14, %xmm13 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm14, %xmm7 ;\ + movapd 14 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + +#define KERNEL5(xx) \ + mulpd %xmm8, %xmm9 ;\ + addpd %xmm9, %xmm0 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm11, %xmm1 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm8, %xmm13 ;\ + mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm8, %xmm3 ;\ + movapd 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL6(xx) \ + mulpd %xmm10, %xmm9 ;\ + addpd %xmm9, %xmm4 ;\ + movapd 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm10, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm10, %xmm13 ;\ + mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm10, %xmm7 ;\ + movapd 18 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL7(xx) \ + mulpd %xmm12, %xmm15 ;\ + addpd %xmm15, %xmm0 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm12, %xmm11 ;\ + addpd %xmm11, %xmm1 ;\ + movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm12, %xmm13 ;\ + mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm12, %xmm3 ;\ + movapd 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL8(xx) \ + mulpd %xmm14, %xmm15 ;\ + addpd %xmm15, %xmm4 ;\ + movapd 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm14, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 34 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm14, %xmm13 ;\ + mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm14, %xmm7 ;\ + movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + +#ifndef CONJ +#define NN +#else +#if defined(LN) || defined(LT) +#define CN +#else +#define NC +#endif +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + pcmpeqb %xmm15, %xmm15 + psllq $63, %xmm15 # Generate mask + pxor %xmm2, %xmm2 + + movlpd %xmm2, 0 + POSINV + movlpd %xmm15, 8 + POSINV + + movlpd %xmm4, OFFSET + movlpd %xmm4, KK + + salq $ZBASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, N + jle .L100 + +.L101: +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L103 + ALIGN_4 + +.L102: + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + movlpd 4 * SIZE(B), %xmm4 + movlpd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm6 + movlpd 7 * SIZE(B), %xmm7 + + movlpd %xmm0, 0 * SIZE(BO) + movlpd %xmm0, 1 * SIZE(BO) + movlpd %xmm1, 2 * SIZE(BO) + movlpd %xmm1, 3 * SIZE(BO) + movlpd %xmm2, 4 * SIZE(BO) + movlpd %xmm2, 5 * SIZE(BO) + movlpd %xmm3, 6 * SIZE(BO) + movlpd %xmm3, 7 * SIZE(BO) + movlpd %xmm4, 8 * SIZE(BO) + movlpd %xmm4, 9 * SIZE(BO) + movlpd %xmm5, 10 * SIZE(BO) + movlpd %xmm5, 11 * SIZE(BO) + movlpd %xmm6, 12 * SIZE(BO) + movlpd %xmm6, 13 * SIZE(BO) + movlpd %xmm7, 14 * SIZE(BO) + movlpd %xmm7, 15 * SIZE(BO) + + subq $-16 * SIZE, BO + addq $ 8 * SIZE, B + decq %rax + jne .L102 + ALIGN_4 + +.L103: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + + movlpd %xmm0, 0 * SIZE(BO) + movlpd %xmm0, 1 * SIZE(BO) + movlpd %xmm1, 2 * SIZE(BO) + movlpd %xmm1, 3 * SIZE(BO) + + addq $4 * SIZE, BO + addq $2 * SIZE, B + decq %rax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + PREFETCHW 4 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L112 + +.L111: + movapd 0 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 2 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + movapd 4 * SIZE(AO), %xmm8 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 6 * SIZE(AO), %xmm8 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + movapd 8 * SIZE(AO), %xmm8 + movapd 8 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 10 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 10 * SIZE(AO), %xmm8 + movapd 8 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 10 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + movapd 12 * SIZE(AO), %xmm8 + movapd 12 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 14 * SIZE(AO), %xmm8 + movapd 12 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm15 + andq $3, %rax # if (k & 1) + BRANCH + jle .L114 + +.L113: + movapd 0 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 2 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L113 + ALIGN_4 + +.L114: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm15, %xmm1 + xorpd %xmm15, %xmm5 +#else + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm1, %xmm0 + subpd %xmm5, %xmm4 +#else + addpd %xmm1, %xmm0 + addpd %xmm5, %xmm4 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm5 + + subpd %xmm0, %xmm1 + subpd %xmm4, %xmm5 +#else + movapd 0 * SIZE(AO), %xmm1 + movapd 2 * SIZE(AO), %xmm5 + + subpd %xmm0, %xmm1 + subpd %xmm4, %xmm5 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm15, %xmm15 +#endif + +#ifdef LN + movlpd 6 * SIZE(AO), %xmm8 + movhpd 6 * SIZE(AO), %xmm8 + movlpd 7 * SIZE(AO), %xmm9 + movhpd 7 * SIZE(AO), %xmm9 + movlpd 4 * SIZE(AO), %xmm10 + movhpd 4 * SIZE(AO), %xmm10 + movlpd 5 * SIZE(AO), %xmm11 + movhpd 5 * SIZE(AO), %xmm11 + movlpd 0 * SIZE(AO), %xmm12 + movhpd 0 * SIZE(AO), %xmm12 + movlpd 1 * SIZE(AO), %xmm13 + movhpd 1 * SIZE(AO), %xmm13 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + + addpd %xmm4, %xmm5 + + movapd %xmm5, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm10, %xmm0 + mulpd %xmm11, %xmm4 + + subpd %xmm0, %xmm1 + subpd %xmm4, %xmm1 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm12, %xmm1 + mulpd %xmm13, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + movlpd 2 * SIZE(AO), %xmm10 + movhpd 2 * SIZE(AO), %xmm10 + movlpd 3 * SIZE(AO), %xmm11 + movhpd 3 * SIZE(AO), %xmm11 + movlpd 6 * SIZE(AO), %xmm12 + movhpd 6 * SIZE(AO), %xmm12 + movlpd 7 * SIZE(AO), %xmm13 + movhpd 7 * SIZE(AO), %xmm13 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 + + movapd %xmm1, %xmm0 + pshufd $0x4e, %xmm1, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm10, %xmm0 + mulpd %xmm11, %xmm4 + + subpd %xmm0, %xmm5 + subpd %xmm4, %xmm5 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm12, %xmm5 + mulpd %xmm13, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + + addpd %xmm0, %xmm1 + addpd %xmm4, %xmm5 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + + addpd %xmm0, %xmm1 + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movsd %xmm1, 0 * SIZE(CO1) + movhpd %xmm1, 1 * SIZE(CO1) + movsd %xmm5, 2 * SIZE(CO1) + movhpd %xmm5, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm5, 4 * SIZE(BO) + movlpd %xmm5, 5 * SIZE(BO) + movhpd %xmm5, 6 * SIZE(BO) + movhpd %xmm5, 7 * SIZE(BO) +#else + movapd %xmm1, 0 * SIZE(AO) + movapd %xmm5, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L110 + ALIGN_4 + +.L130: + testq $1, M + jle .L199 + ALIGN_4 + +.L140: +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L142 + +.L141: + movapd 0 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 2 * SIZE(AO), %xmm8 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm3 + + movapd 4 * SIZE(AO), %xmm8 + movapd 8 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 10 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 6 * SIZE(AO), %xmm8 + movapd 12 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm3 + + addq $8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L141 + +.L142: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + movapd POSINV, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + jle .L144 + +.L143: + movapd 0 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L143 + ALIGN_4 + +.L144: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + + SHUFPD_1 %xmm1, %xmm1 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm15, %xmm1 +#else + xorpd %xmm15, %xmm0 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm1, %xmm0 +#else + addpd %xmm1, %xmm0 +#endif + + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm1 + + subpd %xmm0, %xmm1 +#else + movapd 0 * SIZE(AO), %xmm1 + + subpd %xmm0, %xmm1 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm15, %xmm15 +#endif + +#ifdef LN + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm1, 0 * SIZE(CO1) + movhpd %xmm1, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) +#else + movapd %xmm1, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L199: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1 * COMPSIZE), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L100: + movq N, J + sarq $1, J # j = (n >> 2) + jle .L999 + ALIGN_4 + +.L01: +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + + addq %rax, %rax + ALIGN_4 + +.L02: + PREFETCHNTA 56 * SIZE(B) + + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + movlpd 4 * SIZE(B), %xmm4 + movlpd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm6 + movlpd 7 * SIZE(B), %xmm7 + + movlpd %xmm0, 0 * SIZE(BO) + movlpd %xmm0, 1 * SIZE(BO) + movlpd %xmm1, 2 * SIZE(BO) + movlpd %xmm1, 3 * SIZE(BO) + movlpd %xmm2, 4 * SIZE(BO) + movlpd %xmm2, 5 * SIZE(BO) + movlpd %xmm3, 6 * SIZE(BO) + movlpd %xmm3, 7 * SIZE(BO) + movlpd %xmm4, 8 * SIZE(BO) + movlpd %xmm4, 9 * SIZE(BO) + movlpd %xmm5, 10 * SIZE(BO) + movlpd %xmm5, 11 * SIZE(BO) + movlpd %xmm6, 12 * SIZE(BO) + movlpd %xmm6, 13 * SIZE(BO) + movlpd %xmm7, 14 * SIZE(BO) + movlpd %xmm7, 15 * SIZE(BO) + + subq $-16 * SIZE, BO + addq $ 8 * SIZE, B + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + + movlpd %xmm0, 0 * SIZE(BO) + movlpd %xmm0, 1 * SIZE(BO) + movlpd %xmm1, 2 * SIZE(BO) + movlpd %xmm1, 3 * SIZE(BO) + movlpd %xmm2, 4 * SIZE(BO) + movlpd %xmm2, 5 * SIZE(BO) + movlpd %xmm3, 6 * SIZE(BO) + movlpd %xmm3, 7 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $ 8 * SIZE, BO + + decq %rax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + +#ifndef RT + leaq (C, LDC, 2), C +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 2 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movapd 4 * SIZE(AO), %xmm12 + pxor %xmm2, %xmm2 + movapd 6 * SIZE(AO), %xmm14 + pxor %xmm3, %xmm3 + + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm4, %xmm4 + movapd 2 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + movapd 4 * SIZE(BO), %xmm13 + movapd 8 * SIZE(BO), %xmm15 + + PREFETCHW 4 * SIZE(CO1) + pxor %xmm6, %xmm6 + PREFETCHW 4 * SIZE(CO2) + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-8, %rax + salq $4, %rax + je .L15 +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpq $64 * 2, %rax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpq $64 * 4, %rax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpq $64 * 6, %rax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) + + addq $16 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $64 * 8, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 4), BO # * 64 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L19 + ALIGN_4 + +.L16: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 0 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 4 * SIZE(AO), %xmm8 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm5 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + mulpd 6 * SIZE(BO), %xmm10 + addpd %xmm9, %xmm6 + movapd 8 * SIZE(BO), %xmm9 + addpd %xmm10, %xmm7 + movapd 6 * SIZE(AO), %xmm10 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L16 + ALIGN_4 + +.L19: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm15, %xmm1 + xorpd %xmm15, %xmm3 + xorpd %xmm15, %xmm5 + xorpd %xmm15, %xmm7 +#else + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 +#else + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm3 + movapd 4 * SIZE(B), %xmm5 + movapd 6 * SIZE(B), %xmm7 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd 0 * SIZE(AO), %xmm1 + movapd 2 * SIZE(AO), %xmm5 + movapd 4 * SIZE(AO), %xmm3 + movapd 6 * SIZE(AO), %xmm7 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm15, %xmm15 +#endif + +#ifdef LN + movlpd 6 * SIZE(AO), %xmm8 + movhpd 6 * SIZE(AO), %xmm8 + movlpd 7 * SIZE(AO), %xmm9 + movhpd 7 * SIZE(AO), %xmm9 + movlpd 4 * SIZE(AO), %xmm10 + movhpd 4 * SIZE(AO), %xmm10 + movlpd 5 * SIZE(AO), %xmm11 + movhpd 5 * SIZE(AO), %xmm11 + movlpd 0 * SIZE(AO), %xmm12 + movhpd 0 * SIZE(AO), %xmm12 + movlpd 1 * SIZE(AO), %xmm13 + movhpd 1 * SIZE(AO), %xmm13 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + mulpd %xmm8, %xmm7 + mulpd %xmm9, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 + + movapd %xmm5, %xmm0 + movapd %xmm7, %xmm2 + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm10, %xmm0 + mulpd %xmm10, %xmm2 + mulpd %xmm11, %xmm4 + mulpd %xmm11, %xmm6 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm4, %xmm1 + subpd %xmm6, %xmm3 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 + + mulpd %xmm12, %xmm1 + mulpd %xmm13, %xmm0 + mulpd %xmm12, %xmm3 + mulpd %xmm13, %xmm2 + + addpd %xmm0, %xmm1 + addpd %xmm2, %xmm3 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + movlpd 2 * SIZE(AO), %xmm10 + movhpd 2 * SIZE(AO), %xmm10 + movlpd 3 * SIZE(AO), %xmm11 + movhpd 3 * SIZE(AO), %xmm11 + movlpd 6 * SIZE(AO), %xmm12 + movhpd 6 * SIZE(AO), %xmm12 + movlpd 7 * SIZE(AO), %xmm13 + movhpd 7 * SIZE(AO), %xmm13 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm3 + mulpd %xmm9, %xmm2 + + addpd %xmm0, %xmm1 + addpd %xmm2, %xmm3 + + movapd %xmm1, %xmm0 + movapd %xmm3, %xmm2 + pshufd $0x4e, %xmm1, %xmm4 + pshufd $0x4e, %xmm3, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm10, %xmm0 + mulpd %xmm10, %xmm2 + mulpd %xmm11, %xmm4 + mulpd %xmm11, %xmm6 + + subpd %xmm0, %xmm5 + subpd %xmm2, %xmm7 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm12, %xmm5 + mulpd %xmm13, %xmm4 + mulpd %xmm12, %xmm7 + mulpd %xmm13, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + movlpd 2 * SIZE(B), %xmm10 + movhpd 2 * SIZE(B), %xmm10 + movlpd 3 * SIZE(B), %xmm11 + movhpd 3 * SIZE(B), %xmm11 + movlpd 6 * SIZE(B), %xmm12 + movhpd 6 * SIZE(B), %xmm12 + movlpd 7 * SIZE(B), %xmm13 + movhpd 7 * SIZE(B), %xmm13 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + + addpd %xmm0, %xmm1 + addpd %xmm4, %xmm5 + + movapd %xmm1, %xmm0 + movapd %xmm5, %xmm2 + pshufd $0x4e, %xmm1, %xmm4 + pshufd $0x4e, %xmm5, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm10, %xmm0 + mulpd %xmm10, %xmm2 + mulpd %xmm11, %xmm4 + mulpd %xmm11, %xmm6 + + subpd %xmm0, %xmm3 + subpd %xmm2, %xmm7 + subpd %xmm4, %xmm3 + subpd %xmm6, %xmm7 + + pshufd $0x4e, %xmm3, %xmm2 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm2 + xorpd %xmm15, %xmm6 + + mulpd %xmm12, %xmm3 + mulpd %xmm13, %xmm2 + mulpd %xmm12, %xmm7 + mulpd %xmm13, %xmm6 + + addpd %xmm2, %xmm3 + addpd %xmm6, %xmm7 +#endif + +#ifdef RT + movlpd 6 * SIZE(B), %xmm8 + movhpd 6 * SIZE(B), %xmm8 + movlpd 7 * SIZE(B), %xmm9 + movhpd 7 * SIZE(B), %xmm9 + movlpd 4 * SIZE(B), %xmm10 + movhpd 4 * SIZE(B), %xmm10 + movlpd 5 * SIZE(B), %xmm11 + movhpd 5 * SIZE(B), %xmm11 + movlpd 0 * SIZE(B), %xmm12 + movhpd 0 * SIZE(B), %xmm12 + movlpd 1 * SIZE(B), %xmm13 + movhpd 1 * SIZE(B), %xmm13 + + pshufd $0x4e, %xmm3, %xmm2 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm2 + xorpd %xmm15, %xmm6 + + mulpd %xmm8, %xmm3 + mulpd %xmm9, %xmm2 + mulpd %xmm8, %xmm7 + mulpd %xmm9, %xmm6 + + addpd %xmm2, %xmm3 + addpd %xmm6, %xmm7 + + movapd %xmm3, %xmm0 + movapd %xmm7, %xmm2 + pshufd $0x4e, %xmm3, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm10, %xmm0 + mulpd %xmm10, %xmm2 + mulpd %xmm11, %xmm4 + mulpd %xmm11, %xmm6 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm5 + subpd %xmm4, %xmm1 + subpd %xmm6, %xmm5 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 + + mulpd %xmm12, %xmm1 + mulpd %xmm13, %xmm0 + mulpd %xmm12, %xmm5 + mulpd %xmm13, %xmm4 + + addpd %xmm0, %xmm1 + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + + movsd %xmm1, 0 * SIZE(CO1) + movhpd %xmm1, 1 * SIZE(CO1) + movsd %xmm5, 2 * SIZE(CO1) + movhpd %xmm5, 3 * SIZE(CO1) + + movsd %xmm3, 0 * SIZE(CO2) + movhpd %xmm3, 1 * SIZE(CO2) + movsd %xmm7, 2 * SIZE(CO2) + movhpd %xmm7, 3 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + movapd %xmm5, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) + movlpd %xmm5, 8 * SIZE(BO) + movlpd %xmm5, 9 * SIZE(BO) + movhpd %xmm5, 10 * SIZE(BO) + movhpd %xmm5, 11 * SIZE(BO) + movlpd %xmm7, 12 * SIZE(BO) + movlpd %xmm7, 13 * SIZE(BO) + movhpd %xmm7, 14 * SIZE(BO) + movhpd %xmm7, 15 * SIZE(BO) +#else + movapd %xmm1, 0 * SIZE(AO) + movapd %xmm5, 2 * SIZE(AO) + movapd %xmm3, 4 * SIZE(AO) + movapd %xmm7, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L10 + ALIGN_4 + +.L30: + testq $1, M + jle .L99 + +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + addq %rax, AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L42 + +.L41: + movapd 0 * SIZE(AO), %xmm8 + + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + movapd 2 * SIZE(AO), %xmm8 + + movapd 8 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 10 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 12 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 14 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + movapd 4 * SIZE(AO), %xmm8 + + movapd 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 18 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 20 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 22 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + movapd 6 * SIZE(AO), %xmm8 + + movapd 24 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 26 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 28 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 30 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L41 + +.L42: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm15 + andq $3, %rax # if (k & 1) + BRANCH + jle .L44 + +.L43: + movapd 0 * SIZE(AO), %xmm8 + + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + + decq %rax + jg .L43 + ALIGN_4 + +.L44: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm15, %xmm1 + xorpd %xmm15, %xmm3 +#else + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 +#else + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm3 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 +#else + movapd 0 * SIZE(AO), %xmm1 + movapd 2 * SIZE(AO), %xmm3 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm15, %xmm15 +#endif + +#if defined(LN) || defined(LT) + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm3 + mulpd %xmm9, %xmm2 + + addpd %xmm0, %xmm1 + addpd %xmm2, %xmm3 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + movlpd 2 * SIZE(B), %xmm10 + movhpd 2 * SIZE(B), %xmm10 + movlpd 3 * SIZE(B), %xmm11 + movhpd 3 * SIZE(B), %xmm11 + movlpd 6 * SIZE(B), %xmm12 + movhpd 6 * SIZE(B), %xmm12 + movlpd 7 * SIZE(B), %xmm13 + movhpd 7 * SIZE(B), %xmm13 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 + + movapd %xmm1, %xmm0 + pshufd $0x4e, %xmm1, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm10, %xmm0 + mulpd %xmm11, %xmm4 + + subpd %xmm0, %xmm3 + subpd %xmm4, %xmm3 + + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm2 + + mulpd %xmm12, %xmm3 + mulpd %xmm13, %xmm2 + + addpd %xmm2, %xmm3 +#endif + +#ifdef RT + movlpd 6 * SIZE(B), %xmm8 + movhpd 6 * SIZE(B), %xmm8 + movlpd 7 * SIZE(B), %xmm9 + movhpd 7 * SIZE(B), %xmm9 + movlpd 4 * SIZE(B), %xmm10 + movhpd 4 * SIZE(B), %xmm10 + movlpd 5 * SIZE(B), %xmm11 + movhpd 5 * SIZE(B), %xmm11 + movlpd 0 * SIZE(B), %xmm12 + movhpd 0 * SIZE(B), %xmm12 + movlpd 1 * SIZE(B), %xmm13 + movhpd 1 * SIZE(B), %xmm13 + + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm2 + + mulpd %xmm8, %xmm3 + mulpd %xmm9, %xmm2 + + addpd %xmm2, %xmm3 + + movapd %xmm3, %xmm0 + pshufd $0x4e, %xmm3, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm10, %xmm0 + mulpd %xmm11, %xmm4 + + subpd %xmm0, %xmm1 + subpd %xmm4, %xmm1 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm12, %xmm1 + mulpd %xmm13, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm1, 0 * SIZE(CO1) + movhpd %xmm1, 1 * SIZE(CO1) + + movsd %xmm3, 0 * SIZE(CO2) + movhpd %xmm3, 1 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) +#else + movapd %xmm1, 0 * SIZE(AO) + movapd %xmm3, 2 * SIZE(AO) + +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2 * COMPSIZE), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J # j -- + jg .L01 + ALIGN_3 + +.L999: + movq %rbx, %rsp + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_RT_2x2_sse3.S b/kernel/x86_64/ztrsm_kernel_RT_2x2_sse3.S new file mode 100644 index 0000000..ca700eb --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_RT_2x2_sse3.S @@ -0,0 +1,2196 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbx +#define KK %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define KKK 56(%rsp) +#define AORIG 64(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define KKK 232(%rsp) +#define AORIG 240(%rsp) +#endif + +#define PREFETCH prefetcht1 +#define PREFETCHSIZE (16 * 12 + 3) +#define PREFETCH_R (4 * 4 + 0) + +#ifndef CONJ +#define ADD1 addpd +#define ADD2 addpd +#else +#define ADD1 subpd +#define ADD2 addpd +#endif + +#define KERNEL1(address) \ + mulpd %xmm8, %xmm9;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ + ADD1 %xmm9, %xmm0;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm1;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm2;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm3;\ + movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL2(address) \ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm4;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm5;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm6;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm7;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL3(address) \ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm0;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm1;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm2;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm3;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL4(address) \ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm4;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm5;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm6;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm7;\ + movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL5(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm0;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm1;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm2;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm3;\ + movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL6(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm4;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm5;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm6;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm7;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL7(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm0;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm1;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm2;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm3;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL8(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm4;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm5;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm6;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm7;\ + movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL9(address) \ + mulpd %xmm12, %xmm13;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ + ADD1 %xmm13, %xmm0;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm1;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm2;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm3;\ + movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL10(address) \ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm4;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm5;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm6;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm7;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL11(address) \ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm0;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm1;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm2;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm3;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL12(address) \ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm4;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm5;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm6;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm7;\ + movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL13(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm0;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm1;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm2;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm3;\ + movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL14(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm4;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm5;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm6;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm7;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL15(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm0;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm1;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm2;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm3;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL16(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm4;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm5;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm6;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm7;\ + movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + movq KK, OFFSET + + salq $ZBASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, KK + subq OFFSET, KK +#endif + + testq $1, N + jle .L100 + +.L101: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + + prefetchnta 4 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L112 + +.L111: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm5 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm5 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm1 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm5 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 14 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 40 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm5 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + ADD1 %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 18 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 8 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 20 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 22 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 24 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 26 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 28 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 30 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 32 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 24 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + jle .L114 + +.L113: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + movapd 4 * SIZE(AO), %xmm8 + ADD1 %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L113 + ALIGN_4 + +.L114: + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm5, %xmm5 + +#ifndef CONJ + addsubpd %xmm1, %xmm0 + addsubpd %xmm5, %xmm4 +#else + addsubpd %xmm0, %xmm1 + addsubpd %xmm4, %xmm5 +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm8 + movapd 2 * SIZE(BO), %xmm9 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 +#endif + +#if (defined(LN) || defined(LT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 +#elif (defined(LN) || defined(LT)) && defined(CONJ) + subpd %xmm1, %xmm8 + subpd %xmm5, %xmm9 +#elif (defined(RN) || defined(RT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 +#else + addsubpd %xmm1, %xmm8 + addsubpd %xmm5, %xmm9 +#endif + +#ifdef CONJ + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 +#endif + +#ifdef LN + movddup 6 * SIZE(AO), %xmm0 + movddup 7 * SIZE(AO), %xmm1 + movddup 4 * SIZE(AO), %xmm2 + movddup 5 * SIZE(AO), %xmm3 + movddup 0 * SIZE(AO), %xmm4 + movddup 1 * SIZE(AO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm9, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + addsubpd %xmm12, %xmm9 + movapd %xmm9, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm13, %xmm13 + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + addsubpd %xmm13, %xmm12 + subpd %xmm12, %xmm8 + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm4, %xmm8 + mulpd %xmm5, %xmm12 + addsubpd %xmm12, %xmm8 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + movddup 1 * SIZE(AO), %xmm1 + movddup 2 * SIZE(AO), %xmm2 + movddup 3 * SIZE(AO), %xmm3 + movddup 6 * SIZE(AO), %xmm4 + movddup 7 * SIZE(AO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm0, %xmm8 + mulpd %xmm1, %xmm12 + addsubpd %xmm12, %xmm8 + movapd %xmm8, %xmm12 + movapd %xmm8, %xmm13 + SHUFPD_1 %xmm13, %xmm13 + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + addsubpd %xmm13, %xmm12 + subpd %xmm12, %xmm9 + movapd %xmm9, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + addsubpd %xmm12, %xmm9 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + movddup 1 * SIZE(BO), %xmm1 + +#ifdef CONJ + xorpd %xmm7, %xmm1 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + mulpd %xmm1, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#ifdef RT + movddup 0 * SIZE(BO), %xmm4 + movddup 1 * SIZE(BO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm4, %xmm8 + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + mulpd %xmm5, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L110 + ALIGN_4 + +.L130: + testq $1, M + jle .L149 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L142 + +.L141: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 12 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L141 + +.L142: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + jle .L144 + +.L143: + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L143 + ALIGN_4 + +.L144: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + SHUFPD_1 %xmm1, %xmm1 + +#ifndef CONJ + addsubpd %xmm1, %xmm0 +#else + addsubpd %xmm0, %xmm1 +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm8 +#else + movapd 0 * SIZE(AO), %xmm8 +#endif + +#if (defined(LN) || defined(LT)) && !defined(CONJ) + subpd %xmm0, %xmm8 +#elif (defined(LN) || defined(LT)) && defined(CONJ) + subpd %xmm1, %xmm8 +#elif (defined(RN) || defined(RT)) && !defined(CONJ) + subpd %xmm0, %xmm8 +#else + addsubpd %xmm1, %xmm8 +#endif + +#ifdef CONJ + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 +#endif + +#ifdef LN + movddup 0 * SIZE(AO), %xmm4 + movddup 1 * SIZE(AO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm4, %xmm8 + mulpd %xmm5, %xmm12 + addsubpd %xmm12, %xmm8 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + movddup 1 * SIZE(AO), %xmm1 + +#ifdef CONJ + xorpd %xmm7, %xmm1 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm0, %xmm8 + mulpd %xmm1, %xmm12 + addsubpd %xmm12, %xmm8 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + movddup 1 * SIZE(BO), %xmm1 + +#ifdef CONJ + xorpd %xmm7, %xmm1 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm0, %xmm8 + mulpd %xmm1, %xmm12 + + addsubpd %xmm12, %xmm8 +#endif + +#ifdef RT + movddup 0 * SIZE(BO), %xmm4 + movddup 1 * SIZE(BO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm4, %xmm8 + mulpd %xmm5, %xmm12 + + addsubpd %xmm12, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, 0 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L149: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_3 + + +.L100: + movq N, J + sarq $1, J # j = (n >> 2) + jle .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(AO), %xmm12 + movddup 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(AO), %xmm14 + movddup 24 * SIZE(BO), %xmm15 + + prefetchnta 4 * SIZE(CO1) + pxor %xmm4, %xmm4 + prefetchnta 4 * SIZE(CO2) + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-8, %rax + salq $4, %rax + je .L12 + +.L1X: + KERNEL1 (16 * 0) + KERNEL2 (16 * 0) + KERNEL3 (16 * 0) + KERNEL4 (16 * 0) + KERNEL5 (16 * 0) + KERNEL6 (16 * 0) + KERNEL7 (16 * 0) + KERNEL8 (16 * 0) + KERNEL9 (16 * 0) + KERNEL10(16 * 0) + KERNEL11(16 * 0) + KERNEL12(16 * 0) + KERNEL13(16 * 0) + KERNEL14(16 * 0) + KERNEL15(16 * 0) + KERNEL16(16 * 0) + cmpq $128 * 1, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 1) + KERNEL2 (16 * 1) + KERNEL3 (16 * 1) + KERNEL4 (16 * 1) + KERNEL5 (16 * 1) + KERNEL6 (16 * 1) + KERNEL7 (16 * 1) + KERNEL8 (16 * 1) + KERNEL9 (16 * 1) + KERNEL10(16 * 1) + KERNEL11(16 * 1) + KERNEL12(16 * 1) + KERNEL13(16 * 1) + KERNEL14(16 * 1) + KERNEL15(16 * 1) + KERNEL16(16 * 1) + cmpq $128 * 2, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 2) + KERNEL2 (16 * 2) + KERNEL3 (16 * 2) + KERNEL4 (16 * 2) + KERNEL5 (16 * 2) + KERNEL6 (16 * 2) + KERNEL7 (16 * 2) + KERNEL8 (16 * 2) + KERNEL9 (16 * 2) + KERNEL10(16 * 2) + KERNEL11(16 * 2) + KERNEL12(16 * 2) + KERNEL13(16 * 2) + KERNEL14(16 * 2) + KERNEL15(16 * 2) + KERNEL16(16 * 2) + cmpq $128 * 3, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 3) + KERNEL2 (16 * 3) + KERNEL3 (16 * 3) + KERNEL4 (16 * 3) + KERNEL5 (16 * 3) + KERNEL6 (16 * 3) + KERNEL7 (16 * 3) + KERNEL8 (16 * 3) + KERNEL9 (16 * 3) + KERNEL10(16 * 3) + KERNEL11(16 * 3) + KERNEL12(16 * 3) + KERNEL13(16 * 3) + KERNEL14(16 * 3) + KERNEL15(16 * 3) + KERNEL16(16 * 3) + cmpq $128 * 4, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 4) + KERNEL2 (16 * 4) + KERNEL3 (16 * 4) + KERNEL4 (16 * 4) + KERNEL5 (16 * 4) + KERNEL6 (16 * 4) + KERNEL7 (16 * 4) + KERNEL8 (16 * 4) + KERNEL9 (16 * 4) + KERNEL10(16 * 4) + KERNEL11(16 * 4) + KERNEL12(16 * 4) + KERNEL13(16 * 4) + KERNEL14(16 * 4) + KERNEL15(16 * 4) + KERNEL16(16 * 4) + cmpq $128 * 5, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 5) + KERNEL2 (16 * 5) + KERNEL3 (16 * 5) + KERNEL4 (16 * 5) + KERNEL5 (16 * 5) + KERNEL6 (16 * 5) + KERNEL7 (16 * 5) + KERNEL8 (16 * 5) + KERNEL9 (16 * 5) + KERNEL10(16 * 5) + KERNEL11(16 * 5) + KERNEL12(16 * 5) + KERNEL13(16 * 5) + KERNEL14(16 * 5) + KERNEL15(16 * 5) + KERNEL16(16 * 5) + cmpq $128 * 6, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 6) + KERNEL2 (16 * 6) + KERNEL3 (16 * 6) + KERNEL4 (16 * 6) + KERNEL5 (16 * 6) + KERNEL6 (16 * 6) + KERNEL7 (16 * 6) + KERNEL8 (16 * 6) + KERNEL9 (16 * 6) + KERNEL10(16 * 6) + KERNEL11(16 * 6) + KERNEL12(16 * 6) + KERNEL13(16 * 6) + KERNEL14(16 * 6) + KERNEL15(16 * 6) + KERNEL16(16 * 6) + cmpq $128 * 7, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 7) + KERNEL2 (16 * 7) + KERNEL3 (16 * 7) + KERNEL4 (16 * 7) + KERNEL5 (16 * 7) + KERNEL6 (16 * 7) + KERNEL7 (16 * 7) + KERNEL8 (16 * 7) + KERNEL9 (16 * 7) + KERNEL10(16 * 7) + KERNEL11(16 * 7) + KERNEL12(16 * 7) + KERNEL13(16 * 7) + KERNEL14(16 * 7) + KERNEL15(16 * 7) + KERNEL16(16 * 7) + + addq $32 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $128 * 8, %rax + jg .L1X + +.L11: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 2), BO # * 64 + ALIGN_4 + +.L12: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm5 + movddup 2 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm6 + movddup 3 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm7 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L13 + ALIGN_4 + +.L14: + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#ifndef CONJ + addsubpd %xmm1, %xmm0 + addsubpd %xmm3, %xmm2 + addsubpd %xmm5, %xmm4 + addsubpd %xmm7, %xmm6 +#else + addsubpd %xmm0, %xmm1 + addsubpd %xmm2, %xmm3 + addsubpd %xmm4, %xmm5 + addsubpd %xmm6, %xmm7 +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + subq $2, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm8 + movapd 2 * SIZE(BO), %xmm9 + movapd 4 * SIZE(BO), %xmm10 + movapd 6 * SIZE(BO), %xmm11 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 +#endif + +#if (defined(LN) || defined(LT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm2, %xmm9 + subpd %xmm4, %xmm10 + subpd %xmm6, %xmm11 +#elif (defined(LN) || defined(LT)) && defined(CONJ) + subpd %xmm1, %xmm8 + subpd %xmm3, %xmm9 + subpd %xmm5, %xmm10 + subpd %xmm7, %xmm11 +#elif (defined(RN) || defined(RT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm2, %xmm10 + subpd %xmm6, %xmm11 +#else + addsubpd %xmm1, %xmm8 + addsubpd %xmm5, %xmm9 + addsubpd %xmm3, %xmm10 + addsubpd %xmm7, %xmm11 +#endif + +#ifdef CONJ + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + movddup 6 * SIZE(AO), %xmm0 + movddup 7 * SIZE(AO), %xmm1 + movddup 4 * SIZE(AO), %xmm2 + movddup 5 * SIZE(AO), %xmm3 + movddup 0 * SIZE(AO), %xmm4 + movddup 1 * SIZE(AO), %xmm5 +#else + movddup 6 * SIZE(BO), %xmm0 + movddup 7 * SIZE(BO), %xmm1 + movddup 4 * SIZE(BO), %xmm2 + movddup 5 * SIZE(BO), %xmm3 + movddup 0 * SIZE(BO), %xmm4 + movddup 1 * SIZE(BO), %xmm5 +#endif + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm10, %xmm12 + movapd %xmm11, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm12 + mulpd %xmm1, %xmm13 + + addsubpd %xmm12, %xmm10 + addsubpd %xmm13, %xmm11 + + movapd %xmm10, %xmm12 + movapd %xmm10, %xmm13 + movapd %xmm11, %xmm14 + movapd %xmm11, %xmm15 + + SHUFPD_1 %xmm13, %xmm13 + SHUFPD_1 %xmm15, %xmm15 + + mulpd %xmm2, %xmm12 + mulpd %xmm2, %xmm14 + mulpd %xmm3, %xmm13 + mulpd %xmm3, %xmm15 + + addsubpd %xmm13, %xmm12 + addsubpd %xmm15, %xmm14 + + subpd %xmm12, %xmm8 + subpd %xmm14, %xmm9 + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm4, %xmm8 + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + mulpd %xmm5, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#if defined(LT) || defined(RN) + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + movddup 1 * SIZE(AO), %xmm1 + movddup 2 * SIZE(AO), %xmm2 + movddup 3 * SIZE(AO), %xmm3 + movddup 6 * SIZE(AO), %xmm4 + movddup 7 * SIZE(AO), %xmm5 +#else + movddup 0 * SIZE(BO), %xmm0 + movddup 1 * SIZE(BO), %xmm1 + movddup 2 * SIZE(BO), %xmm2 + movddup 3 * SIZE(BO), %xmm3 + movddup 6 * SIZE(BO), %xmm4 + movddup 7 * SIZE(BO), %xmm5 +#endif + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + mulpd %xmm1, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 + + movapd %xmm8, %xmm12 + movapd %xmm8, %xmm13 + movapd %xmm9, %xmm14 + movapd %xmm9, %xmm15 + + SHUFPD_1 %xmm13, %xmm13 + SHUFPD_1 %xmm15, %xmm15 + + mulpd %xmm2, %xmm12 + mulpd %xmm2, %xmm14 + mulpd %xmm3, %xmm13 + mulpd %xmm3, %xmm15 + + addsubpd %xmm13, %xmm12 + addsubpd %xmm15, %xmm14 + + subpd %xmm12, %xmm10 + subpd %xmm14, %xmm11 + + movapd %xmm10, %xmm12 + movapd %xmm11, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm4, %xmm10 + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm12 + mulpd %xmm5, %xmm13 + + addsubpd %xmm12, %xmm10 + addsubpd %xmm13, %xmm11 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm10, 2 * SIZE(CO1) + movhpd %xmm10, 3 * SIZE(CO1) + + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + movapd %xmm10, 4 * SIZE(BO) + movapd %xmm11, 6 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + decq I # i -- + jg .L10 + ALIGN_4 + +.L30: + testq $1, M + jle .L99 + +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L42 + +.L41: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 6 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 16 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 17 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 18 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 19 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm3 + movddup 20 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 21 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 22 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 23 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm3 + movddup 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 25 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 26 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 27 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 28 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 29 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 30 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 31 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 40 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L41 + +.L42: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + jle .L44 + +.L43: + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L43 + ALIGN_4 + +.L44: + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + +#ifndef CONJ + addsubpd %xmm1, %xmm0 + addsubpd %xmm3, %xmm2 +#else + addsubpd %xmm0, %xmm1 + addsubpd %xmm2, %xmm3 +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm8 + movapd 2 * SIZE(BO), %xmm9 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 +#endif + +#if (defined(LN) || defined(LT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm2, %xmm9 +#elif (defined(LN) || defined(LT)) && defined(CONJ) + subpd %xmm1, %xmm8 + subpd %xmm3, %xmm9 +#elif (defined(RN) || defined(RT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm2, %xmm9 +#else + addsubpd %xmm1, %xmm8 + addsubpd %xmm3, %xmm9 +#endif + +#ifdef CONJ + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 +#endif + +#ifdef LN + movddup 0 * SIZE(AO), %xmm4 + movddup 1 * SIZE(AO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm4, %xmm8 + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + mulpd %xmm5, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + movddup 1 * SIZE(AO), %xmm1 + +#ifdef CONJ + xorpd %xmm7, %xmm1 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + mulpd %xmm1, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + movddup 1 * SIZE(BO), %xmm1 + movddup 2 * SIZE(BO), %xmm2 + movddup 3 * SIZE(BO), %xmm3 + movddup 6 * SIZE(BO), %xmm4 + movddup 7 * SIZE(BO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + + mulpd %xmm0, %xmm8 + mulpd %xmm1, %xmm12 + + addsubpd %xmm12, %xmm8 + + movapd %xmm8, %xmm12 + movapd %xmm8, %xmm13 + + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + + addsubpd %xmm13, %xmm12 + + subpd %xmm12, %xmm9 + + movapd %xmm9, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + + addsubpd %xmm12, %xmm9 +#endif + + +#ifdef RT + movddup 6 * SIZE(BO), %xmm0 + movddup 7 * SIZE(BO), %xmm1 + movddup 4 * SIZE(BO), %xmm2 + movddup 5 * SIZE(BO), %xmm3 + movddup 0 * SIZE(BO), %xmm4 + movddup 1 * SIZE(BO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm9, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + + addsubpd %xmm12, %xmm9 + + movapd %xmm9, %xmm12 + movapd %xmm9, %xmm13 + + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + + addsubpd %xmm13, %xmm12 + + subpd %xmm12, %xmm8 + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + + mulpd %xmm4, %xmm8 + mulpd %xmm5, %xmm12 + + addsubpd %xmm12, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + decq J # j -- + jg .L01 + + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_RT_2x4_nehalem.S b/kernel/x86_64/ztrsm_kernel_RT_2x4_nehalem.S new file mode 100644 index 0000000..a5f0134 --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_RT_2x4_nehalem.S @@ -0,0 +1,3116 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCHSIZE (16 * 1 + 4) +#define PREFETCH prefetcht0 + +#define ADD1 addps +#define ADD2 addps + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + salq $ZBASE_SHIFT, LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, KK + subq OFFSET, KK +#endif + + testq $1, N + BRANCH + jle .L30 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + sarq $1, I + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm8 + movddup -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movddup -30 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movddup -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movddup -26 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AO), %xmm0 + + subq $ -8 * SIZE, BO + subq $-16 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L52 + ALIGN_3 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + ADD1 %xmm1, %xmm8 + movddup -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_3 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm8 +#else + xorps %xmm0, %xmm9 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm8 +#else + shufps $0xb1, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 +#endif + +#endif + + haddps %xmm9, %xmm8 + + shufps $0xd8, %xmm8, %xmm8 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm9 + + subps %xmm8, %xmm9 + movhlps %xmm9, %xmm11 +#else + movaps -32 * SIZE(AO), %xmm9 + + subps %xmm8, %xmm9 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#ifdef LN + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + + subps %xmm3, %xmm9 + subps %xmm2, %xmm9 + + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + + subps %xmm3, %xmm11 + subps %xmm2, %xmm11 + + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm9, -32 * SIZE(BO) + movlps %xmm11, -30 * SIZE(BO) + + movlps %xmm9, 0 * SIZE(CO1) + movlps %xmm11, 2 * SIZE(CO1) +#else + movaps %xmm9, -32 * SIZE(AO) + + movlps %xmm9, 0 * SIZE(CO1) + movhps %xmm9, 2 * SIZE(CO1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L69 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd -32 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -30 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -26 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -26 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -24 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L62 + ALIGN_3 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -30 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_3 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm9 + shufps $0xb1, %xmm9, %xmm9 +#else + xorps %xmm0, %xmm8 + shufps $0xb1, %xmm9, %xmm9 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm9 + shufps $0xb1, %xmm9, %xmm9 +#else + shufps $0xb1, %xmm9, %xmm9 + xorps %xmm0, %xmm9 +#endif + +#endif + + addps %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BO), %xmm9 +#else + movsd -32 * SIZE(AO), %xmm9 +#endif + + subps %xmm8, %xmm9 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(AO), %xmm5 +#endif + +#if defined(RN) || defined(RT) + movsd -32 * SIZE(BO), %xmm5 +#endif + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm9, -32 * SIZE(BO) +#else + movlps %xmm9, -32 * SIZE(AO) +#endif + + movlps %xmm9, (CO1) + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L69: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L30: + testq $2, N + BRANCH + jle .L50 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + sarq $1, I + NOBRANCH + jle .L40 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO2) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $-16 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + ADD1 %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + ADD1 %xmm3, %xmm10 + ADD2 %xmm4, %xmm11 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 +#else + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 +#else + shufps $0xb1, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#endif + +#endif + + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + + shufps $0xd8, %xmm8, %xmm8 + shufps $0xd8, %xmm10, %xmm10 + + movaps %xmm8, %xmm9 + shufps $0xe4, %xmm10, %xmm8 + shufps $0xe4, %xmm9, %xmm10 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm9 + movlhps %xmm10, %xmm8 + movhlps %xmm9, %xmm10 + + movaps -32 * SIZE(BO), %xmm9 + movaps -28 * SIZE(BO), %xmm11 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 +#else + movaps -32 * SIZE(AO), %xmm9 + movaps -28 * SIZE(AO), %xmm11 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#ifdef LN + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + + subps %xmm3, %xmm9 + subps %xmm2, %xmm9 + + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + + subps %xmm3, %xmm11 + subps %xmm2, %xmm11 + + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 +#endif + +#ifdef RT + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -32 * SIZE(BO) + movaps %xmm11, -28 * SIZE(BO) + + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhps %xmm9, 0 * SIZE(CO2) + movhps %xmm11, 2 * SIZE(CO2) +#else + movaps %xmm9, -32 * SIZE(AO) + movaps %xmm11, -28 * SIZE(AO) + + movsd %xmm9, 0 * SIZE(CO1) + movhps %xmm9, 2 * SIZE(CO1) + movsd %xmm11, 0 * SIZE(CO2) + movhps %xmm11, 2 * SIZE(CO2) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L31 + ALIGN_4 + +.L40: + testq $1, M + BRANCH + jle .L49 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -26 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -16 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -24 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_3 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm9 + shufps $0xb1, %xmm9, %xmm9 +#else + xorps %xmm0, %xmm8 + shufps $0xb1, %xmm9, %xmm9 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm9 + shufps $0xb1, %xmm9, %xmm9 +#else + shufps $0xb1, %xmm9, %xmm9 + xorps %xmm0, %xmm9 +#endif + +#endif + + addps %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm9 + + subps %xmm8, %xmm9 +#else + movaps -32 * SIZE(AO), %xmm9 + + subps %xmm8, %xmm9 + movhlps %xmm9, %xmm11 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 +#endif + +#ifdef RT + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -32 * SIZE(BO) + + movlps %xmm9, (CO1) + movhps %xmm9, (CO2) +#else + movlps %xmm9, -32 * SIZE(AO) + movlps %xmm11, -30 * SIZE(AO) + + movlps %xmm9, (CO1) + movlps %xmm11, (CO2) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L49: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L50: + movq N, J + sarq $2, J + NOBRANCH + jle .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + + movq K, %rax + salq $ZBASE_SHIFT + 2, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + sarq $1, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + prefetchnta -32 * SIZE(BB) + subq $-16 * SIZE, BB + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + xorps %xmm12, %xmm12 + prefetcht2 4 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht2 4 * SIZE(CO2, LDC, 1) + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm12 + movaps -24 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -24 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -20 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + subq $-32 * SIZE, BO + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + ADD1 %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + ADD1 %xmm1, %xmm12 + ADD2 %xmm2, %xmm13 + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + xorps %xmm0, %xmm12 + xorps %xmm0, %xmm14 +#else + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + xorps %xmm0, %xmm13 + xorps %xmm0, %xmm15 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + xorps %xmm0, %xmm12 + xorps %xmm0, %xmm14 +#else + shufps $0xb1, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + xorps %xmm0, %xmm13 + xorps %xmm0, %xmm15 +#endif + +#endif + + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + haddps %xmm13, %xmm12 + haddps %xmm15, %xmm14 + + shufps $0xd8, %xmm8, %xmm8 + shufps $0xd8, %xmm10, %xmm10 + shufps $0xd8, %xmm12, %xmm12 + shufps $0xd8, %xmm14, %xmm14 + + movaps %xmm8, %xmm9 + shufps $0xe4, %xmm10, %xmm8 + shufps $0xe4, %xmm9, %xmm10 + + movaps %xmm12, %xmm13 + shufps $0xe4, %xmm14, %xmm12 + shufps $0xe4, %xmm13, %xmm14 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm9 + movlhps %xmm10, %xmm8 + movhlps %xmm9, %xmm10 + + movaps %xmm12, %xmm11 + movlhps %xmm14, %xmm12 + movhlps %xmm11, %xmm14 + + movaps -32 * SIZE(BO), %xmm9 + movaps -28 * SIZE(BO), %xmm13 + movaps -24 * SIZE(BO), %xmm11 + movaps -20 * SIZE(BO), %xmm15 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 + subps %xmm12, %xmm13 + subps %xmm14, %xmm15 +#else + movaps -32 * SIZE(AO), %xmm9 + movaps -28 * SIZE(AO), %xmm11 + movaps -24 * SIZE(AO), %xmm13 + movaps -20 * SIZE(AO), %xmm15 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 + subps %xmm12, %xmm13 + subps %xmm14, %xmm15 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#ifdef LN + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm10, %xmm11 + addps %xmm14, %xmm15 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + movaps %xmm15, %xmm5 + pshufd $0xb1, %xmm15, %xmm4 + + xorps %xmm7, %xmm2 + xorps %xmm7, %xmm4 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + mulps %xmm0, %xmm5 + mulps %xmm1, %xmm4 + + subps %xmm3, %xmm9 + subps %xmm2, %xmm9 + subps %xmm5, %xmm13 + subps %xmm4, %xmm13 + + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + pshufd $0xb1, %xmm13, %xmm14 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm14 + + addps %xmm10, %xmm9 + addps %xmm14, %xmm13 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + pshufd $0xb1, %xmm13, %xmm14 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm14 + + addps %xmm10, %xmm9 + addps %xmm14, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + movaps %xmm13, %xmm5 + pshufd $0xb1, %xmm13, %xmm4 + + xorps %xmm7, %xmm2 + xorps %xmm7, %xmm4 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + mulps %xmm0, %xmm5 + mulps %xmm1, %xmm4 + + subps %xmm3, %xmm11 + subps %xmm2, %xmm11 + subps %xmm5, %xmm15 + subps %xmm4, %xmm15 + + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm10, %xmm11 + addps %xmm14, %xmm15 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -24 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + movaps -20 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -12 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm13, %xmm12 + + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm12 + + addps %xmm12, %xmm13 + + movaps %xmm13, %xmm3 + pshufd $0xb1, %xmm13, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -4 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm14, %xmm15 +#endif + +#ifdef RT + movaps -4 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm14, %xmm15 + + movaps %xmm15, %xmm3 + pshufd $0xb1, %xmm15, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + movaps -8 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -12 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm13, %xmm12 + + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm12 + + addps %xmm12, %xmm13 + + movaps %xmm13, %xmm3 + pshufd $0xb1, %xmm13, %xmm2 + + xorps %xmm7, %xmm2 + + movaps -16 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + + movaps -24 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -32 * SIZE(BO) + movaps %xmm13, -28 * SIZE(BO) + movaps %xmm11, -24 * SIZE(BO) + movaps %xmm15, -20 * SIZE(BO) + + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhps %xmm9, 0 * SIZE(CO1, LDC) + movhps %xmm11, 2 * SIZE(CO1, LDC) + + movsd %xmm13, 0 * SIZE(CO2) + movsd %xmm15, 2 * SIZE(CO2) + movhps %xmm13, 0 * SIZE(CO2, LDC) + movhps %xmm15, 2 * SIZE(CO2, LDC) +#else + movaps %xmm9, -32 * SIZE(AO) + movaps %xmm11, -28 * SIZE(AO) + movaps %xmm13, -24 * SIZE(AO) + movaps %xmm15, -20 * SIZE(AO) + + movsd %xmm9, 0 * SIZE(CO1) + movhps %xmm9, 2 * SIZE(CO1) + movsd %xmm11, 0 * SIZE(CO1, LDC) + movhps %xmm11, 2 * SIZE(CO1, LDC) + movsd %xmm13, 0 * SIZE(CO2) + movhps %xmm13, 2 * SIZE(CO2) + movsd %xmm15, 0 * SIZE(CO2, LDC) + movhps %xmm15, 2 * SIZE(CO2, LDC) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $1, M + BRANCH + jle .L29 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -30 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -16 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -12 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -8 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -26 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -4 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps 0 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -24 * SIZE(AO), %xmm0 + + subq $-32 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + ADD1 %xmm3, %xmm10 + ADD2 %xmm4, %xmm11 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 +#else + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 +#else + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#endif + +#endif + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm9 + movaps -28 * SIZE(BO), %xmm11 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 +#else + movaps -32 * SIZE(AO), %xmm9 + movaps -28 * SIZE(AO), %xmm13 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm13 + + movhlps %xmm9, %xmm11 + movhlps %xmm13, %xmm15 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + pshufd $0xb1, %xmm11, %xmm12 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm12 + + addps %xmm10, %xmm9 + addps %xmm12, %xmm11 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -24 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + movaps -20 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -12 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm13, %xmm12 + + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm12 + + addps %xmm12, %xmm13 + + movaps %xmm13, %xmm3 + pshufd $0xb1, %xmm13, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -4 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm14, %xmm15 +#endif + +#ifdef RT + movaps -4 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm14, %xmm15 + + movaps %xmm15, %xmm3 + pshufd $0xb1, %xmm15, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + movaps -8 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -12 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm13, %xmm12 + + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm12 + + addps %xmm12, %xmm13 + + movaps %xmm13, %xmm3 + pshufd $0xb1, %xmm13, %xmm2 + + xorps %xmm7, %xmm2 + + movaps -16 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -24 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -32 * SIZE(BO) + movaps %xmm11, -28 * SIZE(BO) + + movsd %xmm9, (CO1) + movhps %xmm9, (CO1, LDC) + movsd %xmm11, (CO2) + movhps %xmm11, (CO2, LDC) +#else + movlhps %xmm11, %xmm9 + movlhps %xmm15, %xmm13 + + movaps %xmm9, -32 * SIZE(AO) + movaps %xmm13, -28 * SIZE(AO) + + movlps %xmm9, (CO1) + movlps %xmm11, (CO1, LDC) + movlps %xmm13, (CO2) + movlps %xmm15, (CO2, LDC) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S new file mode 100644 index 0000000..85c0ac2 --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S @@ -0,0 +1,4005 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define POSINV 0(%rsp) +#define OFFSET 16(%rsp) +#define KK 24(%rsp) +#define KKK 32(%rsp) +#define AORIG 40(%rsp) +#define BORIG 48(%rsp) +#define BUFFER 128(%rsp) + +#ifdef OPTERON +#define movsd movlps +#endif + +#if defined(PENTIUM4) || defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(ATOM) || defined(NANO) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#define KERNEL1(xx) \ + mulps %xmm8, %xmm9 ;\ + addps %xmm9, %xmm0 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulps %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addps %xmm11, %xmm1 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm8, %xmm13 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addps %xmm13, %xmm2 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm8, %xmm3 ;\ + movaps 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL2(xx) \ + mulps %xmm10, %xmm9 ;\ + addps %xmm9, %xmm4 ;\ + movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulps %xmm10, %xmm11 ;\ + addps %xmm11, %xmm5 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm10, %xmm13 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addps %xmm13, %xmm6 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm10, %xmm7 ;\ + movaps 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL3(xx) \ + mulps %xmm12, %xmm15 ;\ + addps %xmm15, %xmm0 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulps %xmm12, %xmm11 ;\ + addps %xmm11, %xmm1 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm12, %xmm13 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addps %xmm13, %xmm2 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm12, %xmm3 ;\ + movaps 24 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL4(xx) \ + mulps %xmm14, %xmm15 ;\ + addps %xmm15, %xmm4 ;\ + movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulps %xmm14, %xmm11 ;\ + addps %xmm11, %xmm5 ;\ + movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm14, %xmm13 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addps %xmm13, %xmm6 ;\ + movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm14, %xmm7 ;\ + movaps 28 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + +#define KERNEL5(xx) \ + mulps %xmm8, %xmm9 ;\ + addps %xmm9, %xmm0 ;\ + movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulps %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addps %xmm11, %xmm1 ;\ + movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm8, %xmm13 ;\ + mulps 44 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addps %xmm13, %xmm2 ;\ + movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm8, %xmm3 ;\ + movaps 32 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL6(xx) \ + mulps %xmm10, %xmm9 ;\ + addps %xmm9, %xmm4 ;\ + movaps 64 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulps %xmm10, %xmm11 ;\ + addps %xmm11, %xmm5 ;\ + movaps 52 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm10, %xmm13 ;\ + mulps 44 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addps %xmm13, %xmm6 ;\ + movaps 56 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm10, %xmm7 ;\ + movaps 36 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL7(xx) \ + mulps %xmm12, %xmm15 ;\ + addps %xmm15, %xmm0 ;\ + movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulps %xmm12, %xmm11 ;\ + addps %xmm11, %xmm1 ;\ + movaps 52 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm12, %xmm13 ;\ + mulps 60 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addps %xmm13, %xmm2 ;\ + movaps 56 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm12, %xmm3 ;\ + movaps 40 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL8(xx) \ + mulps %xmm14, %xmm15 ;\ + addps %xmm15, %xmm4 ;\ + movaps 80 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulps %xmm14, %xmm11 ;\ + addps %xmm11, %xmm5 ;\ + movaps 68 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm14, %xmm13 ;\ + mulps 60 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addps %xmm13, %xmm6 ;\ + movaps 72 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm14, %xmm7 ;\ + movaps 44 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 +#else + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + pxor %xmm15, %xmm15 + cmpeqps %xmm15, %xmm15 + pslld $31, %xmm15 # Generate mask + pxor %xmm2, %xmm2 + +#ifndef CONJ + movss %xmm15, 0 + POSINV + movss %xmm2, 4 + POSINV + movss %xmm15, 8 + POSINV + movss %xmm2, 12 + POSINV +#else + movss %xmm2, 0 + POSINV + movss %xmm15, 4 + POSINV + movss %xmm2, 8 + POSINV + movss %xmm15, 12 + POSINV +#endif + + movlpd %xmm4, OFFSET + movlpd %xmm4, KK + + salq $ZBASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, N + je .L40 + ALIGN_4 + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L43 + ALIGN_4 + +.L42: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + decq %rax + jne .L42 + ALIGN_4 + +.L43: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L44: + movlps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + addq $2 * SIZE, B + addq $8 * SIZE, BO + decq %rax + jne .L44 + ALIGN_4 + +.L50: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c + +#ifndef RT + addq LDC, C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + PREFETCHW 4 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 12 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 64 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 16 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps 20 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps 28 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 80 * SIZE(AO), %xmm10 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 32 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 36 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 40 * SIZE(AO), %xmm12 + + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 44 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 96 * SIZE(AO), %xmm12 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 48 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 52 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 56 * SIZE(AO), %xmm14 + + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 60 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 112 * SIZE(AO), %xmm14 + + addq $64 * SIZE, AO + addq $64 * SIZE, BO + + + decq %rax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 8 * SIZE(AO), %xmm8 + + addq $ 8 * SIZE, AO # aoffset += 4 + addq $ 8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L56 + ALIGN_4 + +.L58: + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm5, %xmm5 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm5 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm4 +#endif +#else + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm5 +#endif + + addps %xmm1, %xmm0 + addps %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm1 + unpcklpd %xmm2, %xmm0 + unpckhpd %xmm2, %xmm1 + + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 +#ifdef movsd + xorps %xmm6, %xmm6 +#endif + movsd 4 * SIZE(B), %xmm6 +#ifdef movsd + xorps %xmm7, %xmm7 +#endif + movsd 6 * SIZE(B), %xmm7 + + subps %xmm0, %xmm2 + subps %xmm1, %xmm3 + subps %xmm4, %xmm6 + subps %xmm5, %xmm7 +#else + movaps 0 * SIZE(AO), %xmm1 + movaps 4 * SIZE(AO), %xmm3 + + subps %xmm0, %xmm1 + subps %xmm4, %xmm3 +#endif + +#ifdef LN + movaps 28 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm7 + addps %xmm0, %xmm7 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + movaps 24 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 20 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm6 + addps %xmm0, %xmm6 + + movaps 16 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 8 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 8 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + movaps 12 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 20 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm6 + addps %xmm0, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 28 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm7 + addps %xmm0, %xmm7 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + + pshufd $0xa0, %xmm3, %xmm6 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm4 + xorps %xmm15, %xmm6 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm9, %xmm6 + mulps %xmm10, %xmm1 + mulps %xmm10, %xmm3 + + addps %xmm4, %xmm1 + addps %xmm6, %xmm3 +#endif + +#ifdef LN + subq $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + movlps %xmm6, 4 * SIZE(B) + movlps %xmm7, 6 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 8 * SIZE(BO) + movaps %xmm1, 12 * SIZE(BO) + + pshufd $0x00, %xmm6, %xmm0 + pshufd $0x55, %xmm6, %xmm1 + + movaps %xmm0, 16 * SIZE(BO) + movaps %xmm1, 20 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm1 + + movaps %xmm0, 24 * SIZE(BO) + movaps %xmm1, 28 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movlps %xmm6, 4 * SIZE(CO1) + movlps %xmm7, 6 * SIZE(CO1) +#else + movaps %xmm1, 0 * SIZE(AO) + movaps %xmm3, 4 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm3, 4 * SIZE(CO1) + movhps %xmm3, 6 * SIZE(CO1) +#endif + +#ifndef LN + addq $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 20 * SIZE(BO), %xmm8 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm8, %xmm1 + movaps 12 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm13 + mulps 36 * SIZE(BO), %xmm10 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm10, %xmm1 + movaps 20 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm13 + mulps 44 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 52 * SIZE(BO), %xmm10 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm10, %xmm1 + movaps 28 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm15 + mulps 60 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L68: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + shufps $0xb1, %xmm1, %xmm1 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif +#else + xorps %xmm15, %xmm1 +#endif + + addps %xmm1, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm1 + unpcklpd %xmm2, %xmm0 + unpckhpd %xmm2, %xmm1 + +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 + + subps %xmm0, %xmm2 + subps %xmm1, %xmm3 +#else + movaps 0 * SIZE(AO), %xmm1 + subps %xmm0, %xmm1 +#endif + +#ifdef LN + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm1 + + addps %xmm4, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 8 * SIZE(BO) + movaps %xmm1, 12 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) +#else + movaps %xmm1, 0 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + testq $1, M + je .L79 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + movhps 2 * SIZE(AO), %xmm8 + movsd 8 * SIZE(AO), %xmm10 + movhps 10 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + shufps $0xb1, %xmm1, %xmm1 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif +#else + xorps %xmm15, %xmm1 +#endif + + addps %xmm1, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 + + subps %xmm0, %xmm2 +#else +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AO), %xmm1 + + subps %xmm0, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm1 + + addps %xmm4, %xmm1 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) +#else + movlps %xmm1, 0 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, COMPSIZE), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, COMPSIZE), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L40: + movq N, J + sarq $1, J # j = (n >> 2) + jle .L999 + ALIGN_4 + +.L01: +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + movaps 8 * SIZE(B), %xmm3 + movaps 12 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 32 * SIZE(BO) + movaps %xmm1, 36 * SIZE(BO) + movaps %xmm2, 40 * SIZE(BO) + movaps %xmm3, 44 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 48 * SIZE(BO) + movaps %xmm5, 52 * SIZE(BO) + movaps %xmm6, 56 * SIZE(BO) + movaps %xmm7, 60 * SIZE(BO) + + addq $16 * SIZE, B + addq $64 * SIZE, BO + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 + +#ifndef RT + leaq (C, LDC, 2), C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(BO), %xmm9 + movaps 4 * SIZE(BO), %xmm11 + movaps 8 * SIZE(BO), %xmm13 + movaps 16 * SIZE(BO), %xmm15 + + movaps 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movaps 4 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movaps 8 * SIZE(AO), %xmm12 + pxor %xmm2, %xmm2 + movaps 12 * SIZE(AO), %xmm14 + pxor %xmm3, %xmm3 + + PREFETCHW 7 * SIZE(CO1) + pxor %xmm4, %xmm4 + PREFETCHW 7 * SIZE(CO2) + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-8, %rax + salq $4, %rax + je .L15 +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + + addq $32 * 2 * SIZE, AO + addq $64 * 2 * SIZE, BO + subq $64 * 2, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm5 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 12 * SIZE(BO), %xmm10 + addps %xmm9, %xmm6 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm10, %xmm7 + movaps 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L16 + ALIGN_4 + +.L18: + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 + xorps %xmm15, %xmm5 + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 + xorps %xmm15, %xmm4 + xorps %xmm15, %xmm6 +#endif +#else + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 + xorps %xmm15, %xmm5 + xorps %xmm15, %xmm7 +#endif + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm1 + unpcklpd %xmm2, %xmm0 + unpckhpd %xmm2, %xmm1 + + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + + movaps 0 * SIZE(B), %xmm2 + movaps 4 * SIZE(B), %xmm3 + movaps 8 * SIZE(B), %xmm6 + movaps 12 * SIZE(B), %xmm7 + + subps %xmm0, %xmm2 + subps %xmm1, %xmm3 + subps %xmm4, %xmm6 + subps %xmm5, %xmm7 +#else + movaps 0 * SIZE(AO), %xmm1 + movaps 4 * SIZE(AO), %xmm3 + movaps 8 * SIZE(AO), %xmm5 + movaps 12 * SIZE(AO), %xmm7 + + subps %xmm0, %xmm1 + subps %xmm4, %xmm3 + subps %xmm2, %xmm5 + subps %xmm6, %xmm7 +#endif + +#ifdef LN + movaps 28 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm7 + addps %xmm0, %xmm7 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + movaps 24 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 20 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm6 + addps %xmm0, %xmm6 + + movaps 16 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 8 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 8 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + movaps 12 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 20 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm6 + addps %xmm0, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 28 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm7 + addps %xmm0, %xmm7 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm1 + + pshufd $0xa0, %xmm3, %xmm2 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm9, %xmm2 + mulps %xmm10, %xmm1 + mulps %xmm10, %xmm3 + + addps %xmm0, %xmm1 + addps %xmm2, %xmm3 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm2 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm2 + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm9, %xmm4 + + mulps %xmm10, %xmm2 + mulps %xmm10, %xmm6 + + subps %xmm0, %xmm5 + subps %xmm4, %xmm7 + + subps %xmm2, %xmm5 + subps %xmm6, %xmm7 + + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm4 + pshufd $0xf5, %xmm5, %xmm5 + + pshufd $0xa0, %xmm7, %xmm6 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm5 + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm4 + xorps %xmm15, %xmm6 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm9, %xmm6 + mulps %xmm10, %xmm5 + mulps %xmm10, %xmm7 + + addps %xmm4, %xmm5 + addps %xmm6, %xmm7 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm5 + + pshufd $0xa0, %xmm7, %xmm2 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm5 + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm9, %xmm2 + mulps %xmm10, %xmm5 + mulps %xmm10, %xmm7 + + addps %xmm0, %xmm5 + addps %xmm2, %xmm7 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm2 + + pshufd $0xa0, %xmm7, %xmm4 + pshufd $0xf5, %xmm7, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm2 + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm9, %xmm4 + + mulps %xmm10, %xmm2 + mulps %xmm10, %xmm6 + + subps %xmm0, %xmm1 + subps %xmm4, %xmm3 + + subps %xmm2, %xmm1 + subps %xmm6, %xmm3 + + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + + pshufd $0xa0, %xmm3, %xmm6 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm4 + xorps %xmm15, %xmm6 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm9, %xmm6 + mulps %xmm10, %xmm1 + mulps %xmm10, %xmm3 + + addps %xmm4, %xmm1 + addps %xmm6, %xmm3 + +#endif + +#ifdef LN + subq $8 * SIZE, CO1 + subq $8 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + movaps %xmm6, 8 * SIZE(B) + movaps %xmm7, 12 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm5, 12 * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm5 + + movaps %xmm0, 16 * SIZE(BO) + movaps %xmm1, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm5, 28 * SIZE(BO) + + pshufd $0x00, %xmm6, %xmm0 + pshufd $0x55, %xmm6, %xmm1 + pshufd $0xaa, %xmm6, %xmm4 + pshufd $0xff, %xmm6, %xmm5 + + movaps %xmm0, 32 * SIZE(BO) + movaps %xmm1, 36 * SIZE(BO) + movaps %xmm4, 40 * SIZE(BO) + movaps %xmm5, 44 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm1 + pshufd $0xaa, %xmm7, %xmm4 + pshufd $0xff, %xmm7, %xmm5 + + movaps %xmm0, 48 * SIZE(BO) + movaps %xmm1, 52 * SIZE(BO) + movaps %xmm4, 56 * SIZE(BO) + movaps %xmm5, 60 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movlps %xmm6, 4 * SIZE(CO1) + movlps %xmm7, 6 * SIZE(CO1) + + movhps %xmm2, 0 * SIZE(CO2) + movhps %xmm3, 2 * SIZE(CO2) + movhps %xmm6, 4 * SIZE(CO2) + movhps %xmm7, 6 * SIZE(CO2) +#else + movaps %xmm1, 0 * SIZE(AO) + movaps %xmm3, 4 * SIZE(AO) + movaps %xmm5, 8 * SIZE(AO) + movaps %xmm7, 12 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm3, 4 * SIZE(CO1) + movhps %xmm3, 6 * SIZE(CO1) + + movlps %xmm5, 0 * SIZE(CO2) + movhps %xmm5, 2 * SIZE(CO2) + movlps %xmm7, 4 * SIZE(CO2) + movhps %xmm7, 6 * SIZE(CO2) +#endif + + +#ifndef LN + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + je .L30 + +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 4 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + mulps 44 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps 12 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + mulps 60 * SIZE(BO), %xmm8 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 76 * SIZE(BO), %xmm10 + addps %xmm9, %xmm2 + movaps 128 * SIZE(BO), %xmm9 + addps %xmm10, %xmm3 + movaps 20 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + mulps 92 * SIZE(BO), %xmm10 + addps %xmm11, %xmm2 + movaps 144 * SIZE(BO), %xmm11 + addps %xmm10, %xmm3 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + mulps 108 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 160 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps 28 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + mulps 124 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 176 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $128 * SIZE, BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 4 * SIZE(AO), %xmm8 + + addq $ 4 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L28: + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 +#endif +#else + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#endif + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm1 + unpcklpd %xmm2, %xmm0 + unpckhpd %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm2 + movaps 4 * SIZE(B), %xmm3 + + subps %xmm0, %xmm2 + subps %xmm1, %xmm3 +#else + movaps 0 * SIZE(AO), %xmm1 + movaps 4 * SIZE(AO), %xmm5 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 +#endif + + +#ifdef LN + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + + addps %xmm0, %xmm1 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + + subps %xmm0, %xmm5 + subps %xmm2, %xmm5 + + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm4 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm15, %xmm5 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm5 + + addps %xmm4, %xmm5 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm15, %xmm5 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm5 + + addps %xmm0, %xmm5 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm1 + + addps %xmm4, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm5, 12 * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm5 + + movaps %xmm0, 16 * SIZE(BO) + movaps %xmm1, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm5, 28 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO2) + movhps %xmm3, 2 * SIZE(CO2) +#else + movaps %xmm1, 0 * SIZE(AO) + movaps %xmm5, 4 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + + movlps %xmm5, 0 * SIZE(CO2) + movhps %xmm5, 2 * SIZE(CO2) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + movhps 2 * SIZE(AO), %xmm8 + movsd 8 * SIZE(AO), %xmm10 + movhps 10 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm2 + movaps 76 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm9, %xmm3 + movaps 128 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movaps 92 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movaps 144 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 108 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 160 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 124 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 176 * SIZE(BO), %xmm15 + + addq $16 * SIZE, AO + addq $128 * SIZE, BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 16 * SIZE(BO), %xmm9 + + + addq $ 2 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 +#endif +#else + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#endif + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm2, %xmm0 + + movaps 0 * SIZE(B), %xmm2 + + subps %xmm0, %xmm2 +#else +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AO), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(AO), %xmm5 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 +#endif + + +#ifdef LN + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + + addps %xmm0, %xmm1 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + + subps %xmm0, %xmm5 + subps %xmm2, %xmm5 + + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm4 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm15, %xmm5 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm5 + + addps %xmm4, %xmm5 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm15, %xmm5 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm5 + + addps %xmm0, %xmm5 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm1 + + addps %xmm4, %xmm1 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm5, 12 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO2) +#else + movlps %xmm1, 0 * SIZE(AO) + movlps %xmm5, 2 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm5, 0 * SIZE(CO2) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2 * COMPSIZE), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2 * COMPSIZE), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J # j -- + jg .L01 + ALIGN_4 + +.L999: + movq %rbx, %rsp + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/l1param.h b/l1param.h new file mode 100644 index 0000000..f1d223e --- /dev/null +++ b/l1param.h @@ -0,0 +1,84 @@ +#if defined(CORE2) || defined(PENRYN) +#define ALIGNED_ACCESS +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (128 * 12) +#define ALIGNED_ACCESS +#endif + +#ifdef ATHLON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (128 * 10) +#define ALIGNED_ACCESS +#define movsd movlps +#endif + +#ifdef PENTIUM3 +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (128 * 10) +#define ALIGNED_ACCESS +#define movsd movlps +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (128 * 10) +#define FETCH128 +#define ALIGNED_ACCESS +#define xorps pxor +#define xorpd pxor +#endif + +#ifdef ATOM +#define ALIGNED_ACCESS +#define PREFETCH prefetcht0 +#define PREFETCHSIZE ( 64 * 12 + 32) +#endif + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (128 * 3) +#define movsd movlps +#endif + +#ifdef BARCELONA +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (128 * 5) +#define ALIGNED_ACCESS +#endif + +#ifdef SHANGHAI +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (128 * 5) +#define ALIGNED_ACCESS +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (128 * 4) +#define ALIGNED_ACCESS +#endif + +#define PREOFFSET 128 + + +#ifdef HAVE_SSE2 +#define PSHUFD1(A, B) pshufd A, B, B +#define PSHUFD2(A, B, C) pshufd A, B, C +#else +#define PSHUFD1(A, B) shufps A, B, B +#define PSHUFD2(A, B, C) movaps B, C; shufps A, C, C +#endif + +#define MOVDDUP1(OFFSET, BASE, REGS) movddup OFFSET(BASE), REGS + +#define MOVAPS(OFFSET, BASE, REGS) movlps REGS, OFFSET(BASE); movhps REGS, OFFSET + SIZE(BASE) + diff --git a/l2param.h b/l2param.h new file mode 100644 index 0000000..af9d171 --- /dev/null +++ b/l2param.h @@ -0,0 +1,165 @@ +#ifndef GEMV_PARAM_H +#define GEMV_PARAM_H + +#ifdef movsd +#undef movsd +#endif + +#undef movapd +#define movapd movaps + +#ifdef ATHLON +#define ALIGNED_ACCESS +#define MOVUPS_A movaps +#define MOVUPS_XL movaps +#define MOVUPS_XS movaps +#define MOVUPS_YL movaps +#define MOVUPS_YS movaps +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 64 * 3 +#endif + +#ifdef PENTIUM4 +#define ALIGNED_ACCESS +#define MOVUPS_A movaps +#define MOVUPS_XL movaps +#define MOVUPS_XS movaps +#define MOVUPS_YL movaps +#define MOVUPS_YS movaps +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 64 * 2 +#endif + +#ifdef CORE2 +#define ALIGNED_ACCESS +#define MOVUPS_A movaps +#define MOVUPS_XL movaps +#define MOVUPS_XS movaps +#define MOVUPS_YL movaps +#define MOVUPS_YS movaps +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 64 * 4 +#endif + +#ifdef PENRYN +#define ALIGNED_ACCESS +#define MOVUPS_A movaps +#define MOVUPS_XL movaps +#define MOVUPS_XS movaps +#define MOVUPS_YL movaps +#define MOVUPS_YS movaps +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 64 * 4 +#endif + +#ifdef NEHALEM +#define MOVUPS_A movups +#define MOVUPS_XL movups +#define MOVUPS_XS movups +#define MOVUPS_YL movups +#define MOVUPS_YS movups +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE 64 * 3 +#endif + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#ifndef COMPLEX +#define PREFETCHSIZE 64 * 1 +#else +#define PREFETCHSIZE 64 * 1 +#endif +#define movsd movlps +#endif + +#if defined(BARCELONA) || defined(SHANGHAI) +#define ALIGNED_ACCESS +#define MOVUPS_A movaps +#define MOVUPS_XL movaps +#define MOVUPS_XS movaps +#define MOVUPS_YL movaps +#define MOVUPS_YS movaps + +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#ifndef COMPLEX +#define PREFETCHSIZE 64 * 2 +#else +#define PREFETCHSIZE 64 * 4 +#endif +#endif + +#ifdef NANO +#define ALIGNED_ACCESS +#define MOVUPS_A movaps +#define MOVUPS_XL movaps +#define MOVUPS_XS movaps +#define MOVUPS_YL movaps +#define MOVUPS_YS movaps +#define PREFETCH prefetcht0 +#ifndef COMPLEX +#define PREFETCHSIZE 64 * 1 +#else +#define PREFETCHSIZE 64 * 2 +#endif +#endif + +#ifndef PREOFFSET +#ifdef L1_DATA_LINESIZE +#define PREOFFSET (L1_DATA_LINESIZE >> 1) +#else +#define PREOFFSET 32 +#endif +#endif + +#ifndef GEMV_UNROLL +#define GEMV_UNROLL 4 +#endif + +#ifndef ZGEMV_UNROLL +#define ZGEMV_UNROLL 4 +#endif + +/* #define COPY_FORCE */ /* Always copy X or Y to the buffer */ +/* #define NOCOPY_UNALIGNED */ /* Not copy if X or Y is not aligned */ + +#ifdef MOVUPS_A +#define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS +#define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS +#else +#define MOVUPS_A1(OFF, ADDR, REGS) movsd OFF(ADDR), REGS; movhps OFF + 8(ADDR), REGS +#define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) movsd OFF(ADDR, BASE, SCALE), REGS; movhps OFF + 8(ADDR, BASE, SCALE), REGS +#endif + +#define MOVRPS_A1(OFF, ADDR, REGS) movsd OFF + 8(ADDR), REGS; movhps OFF(ADDR), REGS +#define MOVRPS_A2(OFF, ADDR, BASE, SCALE, REGS) movsd OFF + 8(ADDR, BASE, SCALE), REGS; movhps OFF(ADDR, BASE, SCALE), REGS + +#ifdef MOVUPS_XL +#define MOVUPS_XL1(OFF, ADDR, REGS) MOVUPS_XL OFF(ADDR), REGS +#else +#define MOVUPS_XL1(OFF, ADDR, REGS) movsd OFF(ADDR), REGS; movhps OFF + 8(ADDR), REGS +#endif + +#ifdef MOVUPS_XS +#define MOVUPS_XS1(OFF, ADDR, REGS) MOVUPS_XS REGS, OFF(ADDR) +#else +#define MOVUPS_XS1(OFF, ADDR, REGS) movsd REGS, OFF(ADDR); movhps REGS, OFF + 8(ADDR) +#endif + +#ifdef MOVUPS_YL +#define MOVUPS_YL1(OFF, ADDR, REGS) MOVUPS_YL OFF(ADDR), REGS +#else +#define MOVUPS_YL1(OFF, ADDR, REGS) movsd OFF(ADDR), REGS; movhps OFF + 8(ADDR), REGS +#endif + +#ifdef MOVUPS_YS +#define MOVUPS_YS1(OFF, ADDR, REGS) MOVUPS_YS REGS, OFF(ADDR) +#else +#define MOVUPS_YS1(OFF, ADDR, REGS) movsd REGS, OFF(ADDR); movhps REGS, OFF + 8(ADDR) +#endif + + + +#endif diff --git a/lapack/._Makefile b/lapack/._Makefile new file mode 100644 index 0000000..da6890e Binary files /dev/null and b/lapack/._Makefile differ diff --git a/lapack/._getf2 b/lapack/._getf2 new file mode 100755 index 0000000..a8bb6a4 Binary files /dev/null and b/lapack/._getf2 differ diff --git a/lapack/._getrf b/lapack/._getrf new file mode 100755 index 0000000..2372d8a Binary files /dev/null and b/lapack/._getrf differ diff --git a/lapack/._getri b/lapack/._getri new file mode 100755 index 0000000..eca142d Binary files /dev/null and b/lapack/._getri differ diff --git a/lapack/._getrs b/lapack/._getrs new file mode 100755 index 0000000..e5ac5ad Binary files /dev/null and b/lapack/._getrs differ diff --git a/lapack/._laswp b/lapack/._laswp new file mode 100755 index 0000000..5bc17a6 Binary files /dev/null and b/lapack/._laswp differ diff --git a/lapack/._lauu2 b/lapack/._lauu2 new file mode 100755 index 0000000..41bb0fb Binary files /dev/null and b/lapack/._lauu2 differ diff --git a/lapack/._lauum b/lapack/._lauum new file mode 100755 index 0000000..8517bd4 Binary files /dev/null and b/lapack/._lauum differ diff --git a/lapack/._potf2 b/lapack/._potf2 new file mode 100755 index 0000000..e4aa865 Binary files /dev/null and b/lapack/._potf2 differ diff --git a/lapack/._potrf b/lapack/._potrf new file mode 100755 index 0000000..e322771 Binary files /dev/null and b/lapack/._potrf differ diff --git a/lapack/._trti2 b/lapack/._trti2 new file mode 100755 index 0000000..0d72358 Binary files /dev/null and b/lapack/._trti2 differ diff --git a/lapack/._trtri b/lapack/._trtri new file mode 100755 index 0000000..66a9262 Binary files /dev/null and b/lapack/._trtri differ diff --git a/lapack/Makefile b/lapack/Makefile new file mode 100644 index 0000000..215badb --- /dev/null +++ b/lapack/Makefile @@ -0,0 +1,40 @@ +TOPDIR = .. +include ../Makefile.system + +SUBDIRS = laswp getf2 getrf potf2 potrf lauu2 lauum trti2 trtri getrs + +FLAMEDIRS = laswp getf2 potf2 lauu2 trti2 + +libs: + @for d in $(SUBDIRS) ; \ + do if test -d $$d; then \ + $(MAKE) -C $$d $(@F) || exit 1 ; \ + fi; \ + done + +prof: + @for d in $(SUBDIRS) ; \ + do if test -d $$d; then \ + $(MAKE) -C $$d $(@F) || exit 1 ; \ + (cd $$d; $(MAKE) prof) ; \ + fi; \ + done + +flame: + @for d in $(FLAMEDIRS) ; \ + do if test -d $$d; then \ + $(MAKE) -C $$d libs || exit 1 ; \ + fi; \ + done + +hpl: + +hpl_p: + +clean :: + @for d in $(SUBDIRS) tpp ; \ + do if test -d $$d; then \ + $(MAKE) -C $$d $(@F) || exit 1 ; \ + fi; \ + done + diff --git a/lapack/getf2/._Makefile b/lapack/getf2/._Makefile new file mode 100644 index 0000000..aa5453b Binary files /dev/null and b/lapack/getf2/._Makefile differ diff --git a/lapack/getf2/._getf2_k.c b/lapack/getf2/._getf2_k.c new file mode 100644 index 0000000..3895cee Binary files /dev/null and b/lapack/getf2/._getf2_k.c differ diff --git a/lapack/getf2/._zgetf2_k.c b/lapack/getf2/._zgetf2_k.c new file mode 100644 index 0000000..30f6542 Binary files /dev/null and b/lapack/getf2/._zgetf2_k.c differ diff --git a/lapack/getf2/Makefile b/lapack/getf2/Makefile new file mode 100644 index 0000000..612c6f9 --- /dev/null +++ b/lapack/getf2/Makefile @@ -0,0 +1,49 @@ +TOPDIR = ../.. +include ../../Makefile.system + +SBLASOBJS = sgetf2_k.$(SUFFIX) +DBLASOBJS = dgetf2_k.$(SUFFIX) +QBLASOBJS = qgetf2_k.$(SUFFIX) +CBLASOBJS = cgetf2_k.$(SUFFIX) +ZBLASOBJS = zgetf2_k.$(SUFFIX) +XBLASOBJS = xgetf2_k.$(SUFFIX) + +sgetf2_k.$(SUFFIX) : getf2_k.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +dgetf2_k.$(SUFFIX) : getf2_k.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +qgetf2_k.$(SUFFIX) : getf2_k.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +cgetf2_k.$(SUFFIX) : zgetf2_k.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +zgetf2_k.$(SUFFIX) : zgetf2_k.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +xgetf2_k.$(SUFFIX) : zgetf2_k.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +sgetf2_k.$(PSUFFIX) : getf2_k.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +dgetf2_k.$(PSUFFIX) : getf2_k.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +qgetf2_k.$(PSUFFIX) : getf2_k.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +cgetf2_k.$(PSUFFIX) : zgetf2_k.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +zgetf2_k.$(PSUFFIX) : zgetf2_k.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +xgetf2_k.$(PSUFFIX) : zgetf2_k.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +include ../../Makefile.tail + + diff --git a/lapack/getf2/getf2_k.c b/lapack/getf2/getf2_k.c new file mode 100644 index 0000000..fdc4eae --- /dev/null +++ b/lapack/getf2/getf2_k.c @@ -0,0 +1,117 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +static FLOAT dp1 = 1.; +static FLOAT dm1 = -1.; + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG m, n, lda; + blasint *ipiv, offset; + FLOAT *a; + + FLOAT temp1, temp2; + blasint i, j; + blasint ip, jp; + blasint info; + BLASLONG len; + FLOAT *b; + + m = args -> m; + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + ipiv = (blasint *)args -> c; + offset = 0; + + if (range_n) { + m -= range_n[0]; + n = range_n[1] - range_n[0]; + offset = range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + info = 0; + b = a; + + for (j = 0; j < n; j++) { + + len = MIN(j, m); + + for (i = 0; i < len; i++) { + ip = ipiv[i + offset] - 1 - offset; + if (ip != i) { + temp1 = *(b + i); + temp2 = *(b + ip); + *(b + i) = temp2; + *(b + ip) = temp1; + } + } + + for (i = 1; i < len; i++) { + b[i] -= DOTU_K(i, a + i, lda, b, 1); + } + + if (j < m) { + GEMV_N(m - j, j, 0, dm1, a + j, lda, b, 1, b + j, 1, sb); + + jp = j + IAMAX_K(m - j, b + j, 1); + ipiv[j + offset] = jp + offset; + jp--; + temp1 = *(b + jp); + + if (temp1 != ZERO) { + temp1 = dp1 / temp1; + + if (jp != j) { + SWAP_K(j + 1, 0, 0, ZERO, a + j, lda, a + jp, lda, NULL, 0); + } + if (j + 1 < m) { + SCAL_K(m - j - 1, 0, 0, temp1, b + j + 1, 1, NULL, 0, NULL, 0); + } + } else { + if (!info) info = j + 1; + } + } + b += lda; + } + return info; +} diff --git a/lapack/getf2/zgetf2_k.c b/lapack/getf2/zgetf2_k.c new file mode 100644 index 0000000..ae8c6fd --- /dev/null +++ b/lapack/getf2/zgetf2_k.c @@ -0,0 +1,139 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +double fabs(double); + +static FLOAT dp1 = 1.; +static FLOAT dm1 = -1.; + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG m, n, lda, offset; + blasint *ipiv; + FLOAT *a; + + FLOAT temp1, temp2, temp3, temp4, ratio, den; + blasint i, j; + blasint ip, jp; + blasint info; + BLASLONG len; + FLOAT *b; + + m = args -> m; + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + ipiv = (blasint *)args -> c; + offset = 0; + + if (range_n) { + m -= range_n[0]; + n = range_n[1] - range_n[0]; + offset = range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + info = 0; + b = a; + + for (j = 0; j < n; j++) { + + len = MIN(j, m); + + for (i = 0; i < len; i++) { + ip = ipiv[i + offset] - 1 - offset; + if (ip != i) { + temp1 = *(b + i * 2 + 0); + temp2 = *(b + i * 2 + 1); + temp3 = *(b + ip * 2 + 0); + temp4 = *(b + ip * 2 + 1); + *(b + i * 2 + 0) = temp3; + *(b + i * 2 + 1) = temp4; + *(b + ip * 2 + 0) = temp1; + *(b + ip * 2 + 1) = temp2; + } + } + + ZTRSV_NLU(len, a, lda, b, 1, sb); + + if (j < m) { + GEMV_N(m - j, j, 0, dm1, ZERO, a + j * 2, lda, b, 1, b + j * 2, 1, sb); + + jp = j + IAMAX_K(m - j, b + j * 2, 1); + ipiv[j + offset] = jp + offset; + jp--; + + temp1 = *(b + jp * 2 + 0); + temp2 = *(b + jp * 2 + 1); + + if ((temp1 != ZERO) || (temp2 != ZERO)) { + + if (jp != j) { + SWAP_K(j + 1, 0, 0, ZERO, ZERO, a + j * 2, lda, + a + jp * 2, lda, NULL, 0); + } + + if (fabs(temp1) >= fabs(temp2)){ + ratio = temp2 / temp1; + den = dp1 /(temp1 * ( 1 + ratio * ratio)); + temp3 = den; + temp4 = -ratio * den; + } else { + ratio = temp1 / temp2; + den = dp1 /(temp2 * ( 1 + ratio * ratio)); + temp3 = ratio * den; + temp4 = -den; + } + + if (j + 1 < m) { + SCAL_K(m - j - 1, 0, 0, temp3, temp4, + b + (j + 1) * 2, 1, NULL, 0, NULL, 0); + } + } else { + if (!info) info = j + 1; + } + } + b += lda * 2; + } + return info; + +} + diff --git a/lapack/getrf/._Makefile b/lapack/getrf/._Makefile new file mode 100644 index 0000000..76ae6fe Binary files /dev/null and b/lapack/getrf/._Makefile differ diff --git a/lapack/getrf/._getrf_parallel.c b/lapack/getrf/._getrf_parallel.c new file mode 100644 index 0000000..64c9560 Binary files /dev/null and b/lapack/getrf/._getrf_parallel.c differ diff --git a/lapack/getrf/._getrf_parallel_omp.c b/lapack/getrf/._getrf_parallel_omp.c new file mode 100644 index 0000000..ad1b422 Binary files /dev/null and b/lapack/getrf/._getrf_parallel_omp.c differ diff --git a/lapack/getrf/._getrf_single.c b/lapack/getrf/._getrf_single.c new file mode 100644 index 0000000..207c713 Binary files /dev/null and b/lapack/getrf/._getrf_single.c differ diff --git a/lapack/getrf/Makefile b/lapack/getrf/Makefile new file mode 100644 index 0000000..a559dfb --- /dev/null +++ b/lapack/getrf/Makefile @@ -0,0 +1,98 @@ +TOPDIR = ../.. +include ../../Makefile.system + +SBLASOBJS = sgetrf_single.$(SUFFIX) +DBLASOBJS = dgetrf_single.$(SUFFIX) +QBLASOBJS = qgetrf_single.$(SUFFIX) +CBLASOBJS = cgetrf_single.$(SUFFIX) +ZBLASOBJS = zgetrf_single.$(SUFFIX) +XBLASOBJS = xgetrf_single.$(SUFFIX) + +ifdef SMP +SBLASOBJS += sgetrf_parallel.$(SUFFIX) +DBLASOBJS += dgetrf_parallel.$(SUFFIX) +QBLASOBJS += qgetrf_parallel.$(SUFFIX) +CBLASOBJS += cgetrf_parallel.$(SUFFIX) +ZBLASOBJS += zgetrf_parallel.$(SUFFIX) +XBLASOBJS += xgetrf_parallel.$(SUFFIX) +endif + +ifeq ($(USE_OPENMP), 1) +GETRF_SRC = getrf_parallel_omp.c +else +GETRF_SRC = getrf_parallel.c +endif + +sgetrf_single.$(SUFFIX) : getrf_single.c + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) + +dgetrf_single.$(SUFFIX) : getrf_single.c + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) + +qgetrf_single.$(SUFFIX) : getrf_single.c + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) + +cgetrf_single.$(SUFFIX) : getrf_single.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) + +zgetrf_single.$(SUFFIX) : getrf_single.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) + +xgetrf_single.$(SUFFIX) : getrf_single.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) + +sgetrf_parallel.$(SUFFIX) : $(GETRF_SRC) ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) + +dgetrf_parallel.$(SUFFIX) : $(GETRF_SRC) ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) + +qgetrf_parallel.$(SUFFIX) : $(GETRF_SRC) ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) + +cgetrf_parallel.$(SUFFIX) : $(GETRF_SRC) ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) + +zgetrf_parallel.$(SUFFIX) : $(GETRF_SRC) ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) + +xgetrf_parallel.$(SUFFIX) : $(GETRF_SRC) ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) + +sgetrf_single.$(PSUFFIX) : getrf_single.c + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) + +dgetrf_single.$(PSUFFIX) : getrf_single.c + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) + +qgetrf_single.$(PSUFFIX) : getrf_single.c + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) + +cgetrf_single.$(PSUFFIX) : getrf_single.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) + +zgetrf_single.$(PSUFFIX) : getrf_single.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) + +xgetrf_single.$(PSUFFIX) : getrf_single.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) + +sgetrf_parallel.$(PSUFFIX) : $(GETRF_SRC) + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) + +dgetrf_parallel.$(PSUFFIX) : $(GETRF_SRC) + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) + +qgetrf_parallel.$(PSUFFIX) : $(GETRF_SRC) + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) + +cgetrf_parallel.$(PSUFFIX) : $(GETRF_SRC) + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) + +zgetrf_parallel.$(PSUFFIX) : $(GETRF_SRC) + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) + +xgetrf_parallel.$(PSUFFIX) : $(GETRF_SRC) + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) + +include ../../Makefile.tail diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c new file mode 100644 index 0000000..0db93da --- /dev/null +++ b/lapack/getrf/getrf_parallel.c @@ -0,0 +1,857 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +static FLOAT dm1 = -1.; + +double sqrt(double); + +#ifndef CACHE_LINE_SIZE +#define CACHE_LINE_SIZE 8 +#endif + +#ifndef DIVIDE_RATE +#define DIVIDE_RATE 2 +#endif + +#define GEMM_PQ MAX(GEMM_P, GEMM_Q) +#define REAL_GEMM_R (GEMM_R - GEMM_PQ) + +#ifndef GETRF_FACTOR +#define GETRF_FACTOR 0.75 +#endif + +#undef GETRF_FACTOR +#define GETRF_FACTOR 1.00 + +static inline long FORMULA1(long M, long N, long IS, long BK, long T) { + + double m = (double)(M - IS - BK); + double n = (double)(N - IS - BK); + double b = (double)BK; + double a = (double)T; + + return (long)((n + GETRF_FACTOR * m * b * (1. - a) / (b + m)) / a); + +} + +#define FORMULA2(M, N, IS, BK, T) (BLASLONG)((double)(N - IS + BK) * (1. - sqrt(1. - 1. / (double)(T)))) + + +static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + + BLASLONG is, min_i; + BLASLONG js, min_j; + BLASLONG jjs, min_jj; + + BLASLONG m = args -> m; + BLASLONG n = args -> n; + BLASLONG k = args -> k; + + BLASLONG lda = args -> lda; + BLASLONG off = args -> ldb; + + FLOAT *b = (FLOAT *)args -> b + (k ) * COMPSIZE; + FLOAT *c = (FLOAT *)args -> b + ( k * lda) * COMPSIZE; + FLOAT *d = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE; + FLOAT *sbb = sb; + + volatile BLASLONG *flag = (volatile BLASLONG *)args -> d; + + blasint *ipiv = (blasint *)args -> c; + + if (range_n) { + n = range_n[1] - range_n[0]; + c += range_n[0] * lda * COMPSIZE; + d += range_n[0] * lda * COMPSIZE; + } + + if (args -> a == NULL) { + TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb); + sbb = (FLOAT *)((((long)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + } else { + sb = (FLOAT *)args -> a; + } + + for (js = 0; js < n; js += REAL_GEMM_R) { + min_j = n - js; + if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R; + + for (jjs = js; jjs < js + min_j; jjs += GEMM_UNROLL_N){ + min_jj = js + min_j - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + if (GEMM_UNROLL_N <= 8) { + + LASWP_NCOPY(min_jj, off + 1, off + k, + c + (- off + jjs * lda) * COMPSIZE, lda, + ipiv, sbb + k * (jjs - js) * COMPSIZE); + + } else { + + LASWP_PLUS(min_jj, off + 1, off + k, ZERO, +#ifdef COMPLEX + ZERO, +#endif + c + (- off + jjs * lda) * COMPSIZE, lda, NULL, 0, ipiv, 1); + + GEMM_ONCOPY (k, min_jj, c + jjs * lda * COMPSIZE, lda, sbb + (jjs - js) * k * COMPSIZE); + + } + + for (is = 0; is < k; is += GEMM_P) { + min_i = k - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + TRSM_KERNEL_LT(min_i, min_jj, k, dm1, +#ifdef COMPLEX + ZERO, +#endif + sb + k * is * COMPSIZE, + sbb + (jjs - js) * k * COMPSIZE, + c + (is + jjs * lda) * COMPSIZE, lda, is); + } + } + + if ((js + REAL_GEMM_R >= n) && (mypos >= 0)) flag[mypos * CACHE_LINE_SIZE] = 0; + + for (is = 0; is < m; is += GEMM_P){ + min_i = m - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY (k, min_i, b + is * COMPSIZE, lda, sa); + + GEMM_KERNEL_N(min_i, min_j, k, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sbb, d + (is + js * lda) * COMPSIZE, lda); + } + } +} + + +/* Non blocking implementation */ + +typedef struct { + volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; +} job_t; + +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); + +#ifndef COMPLEX +#define KERNEL_OPERATION(M, N, K, SA, SB, C, LDC, X, Y) \ + GEMM_KERNEL_N(M, N, K, dm1, SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) +#else +#define KERNEL_OPERATION(M, N, K, SA, SB, C, LDC, X, Y) \ + GEMM_KERNEL_N(M, N, K, dm1, ZERO, SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) +#endif + +static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + + job_t *job = (job_t *)args -> common; + + BLASLONG xxx, bufferside; + + FLOAT *buffer[DIVIDE_RATE]; + + BLASLONG jjs, min_jj, div_n; + + BLASLONG i, current; + BLASLONG is, min_i; + + BLASLONG m, n_from, n_to; + BLASLONG k = args -> k; + + BLASLONG lda = args -> lda; + BLASLONG off = args -> ldb; + + FLOAT *a = (FLOAT *)args -> b + (k ) * COMPSIZE; + FLOAT *b = (FLOAT *)args -> b + ( k * lda) * COMPSIZE; + FLOAT *c = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE; + FLOAT *sbb= sb; + + blasint *ipiv = (blasint *)args -> c; + + volatile BLASLONG *flag = (volatile BLASLONG *)args -> d; + + if (args -> a == NULL) { + TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb); + sbb = (FLOAT *)((((long)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + } else { + sb = (FLOAT *)args -> a; + } + + m = range_m[1] - range_m[0]; + n_from = range_n[mypos + 0]; + n_to = range_n[mypos + 1]; + + a += range_m[0] * COMPSIZE; + c += range_m[0] * COMPSIZE; + + div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; + + buffer[0] = sbb; + + + for (i = 1; i < DIVIDE_RATE; i++) { + buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1)) * COMPSIZE; + } + + for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) { + + for (i = 0; i < args -> nthreads; i++) + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {}; + + for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ + min_jj = MIN(n_to, xxx + div_n) - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + if (GEMM_UNROLL_N <= 8) { + + LASWP_NCOPY(min_jj, off + 1, off + k, + b + (- off + jjs * lda) * COMPSIZE, lda, + ipiv, buffer[bufferside] + (jjs - xxx) * k * COMPSIZE); + + } else { + + LASWP_PLUS(min_jj, off + 1, off + k, ZERO, +#ifdef COMPLEX + ZERO, +#endif + b + (- off + jjs * lda) * COMPSIZE, lda, NULL, 0, ipiv, 1); + + GEMM_ONCOPY (k, min_jj, b + jjs * lda * COMPSIZE, lda, + buffer[bufferside] + (jjs - xxx) * k * COMPSIZE); + } + + for (is = 0; is < k; is += GEMM_P) { + min_i = k - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + TRSM_KERNEL_LT(min_i, min_jj, k, dm1, +#ifdef COMPLEX + ZERO, +#endif + sb + k * is * COMPSIZE, + buffer[bufferside] + (jjs - xxx) * k * COMPSIZE, + b + (is + jjs * lda) * COMPSIZE, lda, is); + } + } + + for (i = 0; i < args -> nthreads; i++) + job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; + + } + + flag[mypos * CACHE_LINE_SIZE] = 0; + + if (m == 0) { + for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { + job[mypos].working[mypos][CACHE_LINE_SIZE * xxx] = 0; + } + } + + for(is = 0; is < m; is += min_i){ + min_i = m - is; + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = ((min_i + 1) / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1); + } + + ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa); + + current = mypos; + + do { + + div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + if ((current != mypos) && (!is)) { + while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {}; + } + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, lda, is, xxx); + + if (is + min_i >= m) { + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + } + } + + current ++; + if (current >= args -> nthreads) current = 0; + + } while (current != mypos); + } + + for (i = 0; i < args -> nthreads; i++) { + for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { + while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {}; + } + } + + return 0; +} + +#if 1 + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG m, n, mn, lda, offset; + BLASLONG init_bk, next_bk, range_n_mine[2], range_n_new[2]; + blasint *ipiv, iinfo, info; + int mode; + blas_arg_t newarg; + + FLOAT *a, *sbb; + FLOAT dummyalpha[2] = {ZERO, ZERO}; + + blas_queue_t queue[MAX_CPU_NUMBER]; + + BLASLONG range_M[MAX_CPU_NUMBER + 1]; + BLASLONG range_N[MAX_CPU_NUMBER + 1]; + + job_t job[MAX_CPU_NUMBER]; + + BLASLONG width, nn, mm; + BLASLONG i, j, k, is, bk; + + BLASLONG num_cpu; + + volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif + + m = args -> m; + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + ipiv = (blasint *)args -> c; + offset = 0; + + if (range_n) { + m -= range_n[0]; + n = range_n[1] - range_n[0]; + offset = range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + if (m <= 0 || n <= 0) return 0; + + newarg.c = ipiv; + newarg.lda = lda; + newarg.common = (void *)job; + + info = 0; + + mn = MIN(m, n); + + init_bk = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + if (init_bk > GEMM_Q) init_bk = GEMM_Q; + + if (init_bk <= GEMM_UNROLL_N) { + info = GETF2(args, NULL, range_n, sa, sb, 0); + return info; + } + + next_bk = init_bk; + + bk = mn; + if (bk > next_bk) bk = next_bk; + + range_n_new[0] = offset; + range_n_new[1] = offset + bk; + + iinfo = CNAME(args, NULL, range_n_new, sa, sb, 0); + + if (iinfo && !info) info = iinfo; + + TRSM_ILTCOPY(bk, bk, a, lda, 0, sb); + + sbb = (FLOAT *)((((long)(sb + bk * bk * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + + is = 0; + num_cpu = 0; + + while (is < mn) { + + width = (FORMULA1(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + if (width > mn - is - bk) width = mn - is - bk; + + if (width < bk) { + next_bk = (FORMULA2(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N) & ~(GEMM_UNROLL_N - 1); + + if (next_bk > bk) next_bk = bk; + + width = next_bk; + if (width > mn - is - bk) width = mn - is - bk; + } + + if (num_cpu > 0) exec_blas_async_wait(num_cpu, &queue[0]); + + mm = m - bk - is; + nn = n - bk - is; + + newarg.a = sb; + newarg.b = a + (is + is * lda) * COMPSIZE; + newarg.d = (void *)flag; + newarg.m = mm; + newarg.n = nn; + newarg.k = bk; + newarg.ldb = is + offset; + + nn -= width; + + range_n_mine[0] = 0; + range_n_mine[1] = width; + + range_N[0] = width; + range_M[0] = 0; + + num_cpu = 0; + + while (nn > 0){ + + if (mm >= nn) { + + width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); + if (nn < width) width = nn; + nn -= width; + range_N[num_cpu + 1] = range_N[num_cpu] + width; + + width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); + if (mm < width) width = mm; + if (nn <= 0) width = mm; + mm -= width; + range_M[num_cpu + 1] = range_M[num_cpu] + width; + + } else { + + width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); + if (mm < width) width = mm; + mm -= width; + range_M[num_cpu + 1] = range_M[num_cpu] + width; + + width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); + if (nn < width) width = nn; + if (mm <= 0) width = nn; + nn -= width; + range_N[num_cpu + 1] = range_N[num_cpu] + width; + + } + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = inner_advanced_thread; + queue[num_cpu].args = &newarg; + queue[num_cpu].range_m = &range_M[num_cpu]; + queue[num_cpu].range_n = &range_N[0]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + flag[num_cpu * CACHE_LINE_SIZE] = 1; + + num_cpu ++; + + } + + newarg.nthreads = num_cpu; + + if (num_cpu > 0) { + for (j = 0; j < num_cpu; j++) { + for (i = 0; i < num_cpu; i++) { + for (k = 0; k < DIVIDE_RATE; k++) { + job[j].working[i][CACHE_LINE_SIZE * k] = 0; + } + } + } + } + + is += bk; + + bk = mn - is; + if (bk > next_bk) bk = next_bk; + + range_n_new[0] = offset + is; + range_n_new[1] = offset + is + bk; + + if (num_cpu > 0) { + + queue[num_cpu - 1].next = NULL; + + exec_blas_async(0, &queue[0]); + + inner_basic_thread(&newarg, NULL, range_n_mine, sa, sbb, -1); + + iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0); + + if (iinfo && !info) info = iinfo + is; + + for (i = 0; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {}; + + TRSM_ILTCOPY(bk, bk, a + (is + is * lda) * COMPSIZE, lda, 0, sb); + + } else { + + inner_basic_thread(&newarg, NULL, range_n_mine, sa, sbb, -1); + + iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0); + + if (iinfo && !info) info = iinfo + is; + + } + + } + + next_bk = init_bk; + is = 0; + + while (is < mn) { + + bk = mn - is; + if (bk > next_bk) bk = next_bk; + + width = (FORMULA1(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + if (width > mn - is - bk) width = mn - is - bk; + + if (width < bk) { + next_bk = (FORMULA2(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N) & ~(GEMM_UNROLL_N - 1); + if (next_bk > bk) next_bk = bk; + } + + blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha, + a + (- offset + is * lda) * COMPSIZE, lda, NULL, 0, + ipiv, 1, (void *)LASWP_PLUS, args -> nthreads); + + is += bk; + } + + return info; +} + +#else + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG m, n, mn, lda, offset; + BLASLONG i, is, bk, init_bk, next_bk, range_n_new[2]; + blasint *ipiv, iinfo, info; + int mode; + blas_arg_t newarg; + FLOAT *a, *sbb; + FLOAT dummyalpha[2] = {ZERO, ZERO}; + + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range[MAX_CPU_NUMBER + 1]; + + BLASLONG width, nn, num_cpu; + + volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif + + m = args -> m; + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + ipiv = (blasint *)args -> c; + offset = 0; + + if (range_n) { + m -= range_n[0]; + n = range_n[1] - range_n[0]; + offset = range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + if (m <= 0 || n <= 0) return 0; + + newarg.c = ipiv; + newarg.lda = lda; + newarg.common = NULL; + newarg.nthreads = args -> nthreads; + + mn = MIN(m, n); + + init_bk = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + if (init_bk > GEMM_Q) init_bk = GEMM_Q; + + if (init_bk <= GEMM_UNROLL_N) { + info = GETF2(args, NULL, range_n, sa, sb, 0); + return info; + } + + width = FORMULA1(m, n, 0, init_bk, args -> nthreads); + width = (width + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + if (width > n - init_bk) width = n - init_bk; + + if (width < init_bk) { + long temp; + + temp = FORMULA2(m, n, 0, init_bk, args -> nthreads); + temp = (temp + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + + if (temp < GEMM_UNROLL_N) temp = GEMM_UNROLL_N; + if (temp < init_bk) init_bk = temp; + + } + + next_bk = init_bk; + bk = init_bk; + + range_n_new[0] = offset; + range_n_new[1] = offset + bk; + + info = CNAME(args, NULL, range_n_new, sa, sb, 0); + + TRSM_ILTCOPY(bk, bk, a, lda, 0, sb); + + is = 0; + num_cpu = 0; + + sbb = (FLOAT *)((((long)(sb + GEMM_PQ * GEMM_PQ * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + + while (is < mn) { + + width = FORMULA1(m, n, is, bk, args -> nthreads); + width = (width + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + + if (width < bk) { + + next_bk = FORMULA2(m, n, is, bk, args -> nthreads); + next_bk = (next_bk + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + + if (next_bk > bk) next_bk = bk; +#if 0 + if (next_bk < GEMM_UNROLL_N) next_bk = MIN(GEMM_UNROLL_N, mn - bk - is); +#else + if (next_bk < GEMM_UNROLL_N) next_bk = MAX(GEMM_UNROLL_N, mn - bk - is); +#endif + + width = next_bk; + } + + if (width > mn - is - bk) { + next_bk = mn - is - bk; + width = next_bk; + } + + nn = n - bk - is; + if (width > nn) width = nn; + + if (num_cpu > 1) exec_blas_async_wait(num_cpu - 1, &queue[1]); + + range[0] = 0; + range[1] = width; + + num_cpu = 1; + nn -= width; + + newarg.a = sb; + newarg.b = a + (is + is * lda) * COMPSIZE; + newarg.d = (void *)flag; + newarg.m = m - bk - is; + newarg.n = n - bk - is; + newarg.k = bk; + newarg.ldb = is + offset; + + while (nn > 0){ + + width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu); + + nn -= width; + if (nn < 0) width = width + nn; + + range[num_cpu + 1] = range[num_cpu] + width; + + queue[num_cpu].mode = mode; + //queue[num_cpu].routine = inner_advanced_thread; + queue[num_cpu].routine = (void *)inner_basic_thread; + queue[num_cpu].args = &newarg; + queue[num_cpu].range_m = NULL; + queue[num_cpu].range_n = &range[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + flag[num_cpu * CACHE_LINE_SIZE] = 1; + + num_cpu ++; + } + + queue[num_cpu - 1].next = NULL; + + is += bk; + + bk = n - is; + if (bk > next_bk) bk = next_bk; + + range_n_new[0] = offset + is; + range_n_new[1] = offset + is + bk; + + if (num_cpu > 1) { + + exec_blas_async(1, &queue[1]); + +#if 0 + inner_basic_thread(&newarg, NULL, &range[0], sa, sbb, 0); + + iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0); +#else + + if (range[1] >= bk * 4) { + + BLASLONG myrange[2]; + + myrange[0] = 0; + myrange[1] = bk; + + inner_basic_thread(&newarg, NULL, &myrange[0], sa, sbb, -1); + + iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0); + + myrange[0] = bk; + myrange[1] = range[1]; + + inner_basic_thread(&newarg, NULL, &myrange[0], sa, sbb, -1); + + } else { + + inner_basic_thread(&newarg, NULL, &range[0], sa, sbb, -1); + + iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0); + } + +#endif + + for (i = 1; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {}; + + TRSM_ILTCOPY(bk, bk, a + (is + is * lda) * COMPSIZE, lda, 0, sb); + + } else { + + inner_basic_thread(&newarg, NULL, &range[0], sa, sbb, -1); + + iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0); + } + + if (iinfo && !info) info = iinfo + is; + + } + + next_bk = init_bk; + bk = init_bk; + + is = 0; + + while (is < mn) { + + bk = mn - is; + if (bk > next_bk) bk = next_bk; + + width = FORMULA1(m, n, is, bk, args -> nthreads); + width = (width + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + + if (width < bk) { + next_bk = FORMULA2(m, n, is, bk, args -> nthreads); + next_bk = (next_bk + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + + if (next_bk > bk) next_bk = bk; +#if 0 + if (next_bk < GEMM_UNROLL_N) next_bk = MIN(GEMM_UNROLL_N, mn - bk - is); +#else + if (next_bk < GEMM_UNROLL_N) next_bk = MAX(GEMM_UNROLL_N, mn - bk - is); +#endif + } + + if (width > mn - is - bk) { + next_bk = mn - is - bk; + width = next_bk; + } + + blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha, + a + (- offset + is * lda) * COMPSIZE, lda, NULL, 0, + ipiv, 1, (void *)LASWP_PLUS, args -> nthreads); + + is += bk; + } + + return info; +} + +#endif + diff --git a/lapack/getrf/getrf_parallel_omp.c b/lapack/getrf/getrf_parallel_omp.c new file mode 100644 index 0000000..b637e6d --- /dev/null +++ b/lapack/getrf/getrf_parallel_omp.c @@ -0,0 +1,222 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#define GEMM_PQ MAX(GEMM_P, GEMM_Q) +#define REAL_GEMM_R (GEMM_R - GEMM_PQ) + +static FLOAT dm1 = -1.; + +static void inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + + BLASLONG is, min_i; + BLASLONG js, min_j; + BLASLONG jjs, min_jj; + + BLASLONG m = args -> m; + BLASLONG n = args -> n; + BLASLONG k = args -> k; + + BLASLONG lda = args -> lda; + BLASLONG off = args -> ldb; + + FLOAT *b = (FLOAT *)args -> b + (k ) * COMPSIZE; + FLOAT *c = (FLOAT *)args -> b + ( k * lda) * COMPSIZE; + FLOAT *d = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE; + + blasint *ipiv = (blasint *)args -> c; + + if (range_n) { + n = range_n[1] - range_n[0]; + c += range_n[0] * lda * COMPSIZE; + d += range_n[0] * lda * COMPSIZE; + } + + for (js = 0; js < n; js += REAL_GEMM_R) { + min_j = n - js; + if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R; + + for (jjs = js; jjs < js + min_j; jjs += GEMM_UNROLL_N){ + min_jj = js + min_j - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + LASWP_NCOPY(min_jj, off + 1, off + k, + c + (- off + jjs * lda) * COMPSIZE, lda, + ipiv, sb + k * (jjs - js) * COMPSIZE); + + for (is = 0; is < k; is += GEMM_P) { + min_i = k - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + TRSM_KERNEL_LT(min_i, min_jj, k, dm1, +#ifdef COMPLEX + ZERO, +#endif + (FLOAT *)args -> a + k * is * COMPSIZE, + sb + (jjs - js) * k * COMPSIZE, + c + (is + jjs * lda) * COMPSIZE, lda, is); + } + } + + for (is = 0; is < m; is += GEMM_P){ + min_i = m - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY (k, min_i, b + is * COMPSIZE, lda, sa); + + GEMM_KERNEL_N(min_i, min_j, k, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, d + (is + js * lda) * COMPSIZE, lda); + } + } +} + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG m, n, lda, offset; + blasint *ipiv, iinfo, info; + BLASLONG j, jb, mn, blocking; + FLOAT *a, *offsetA, *offsetB; + BLASLONG range_N[2]; + blas_arg_t newarg; + + int mode; + + FLOAT *sbb; + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif + + m = args -> m; + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + ipiv = (blasint *)args -> c; + offset = 0; + + if (range_n) { + m -= range_n[0]; + n = range_n[1] - range_n[0]; + offset = range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + if (m <= 0 || n <= 0) return 0; + + mn = MIN(m, n); + + blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + if (blocking > GEMM_Q) blocking = GEMM_Q; + + if (blocking <= GEMM_UNROLL_N * 2) { + info = GETF2(args, NULL, range_n, sa, sb, 0); + return info; + } + + sbb = (FLOAT *)((((long)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + + info = 0; + + for (j = 0; j < mn; j += blocking) { + + jb = mn - j; + if (jb > blocking) jb = blocking; + + offsetA = a + j * lda * COMPSIZE; + offsetB = a + (j + jb) * lda * COMPSIZE; + + range_N[0] = offset + j; + range_N[1] = offset + j + jb; + + iinfo = CNAME(args, NULL, range_N, sa, sb, 0); + + if (iinfo && !info) info = iinfo + j; + + if (j + jb < n) { + + TRSM_ILTCOPY(jb, jb, offsetA + j * COMPSIZE, lda, 0, sb); + + newarg.m = m - jb - j; + newarg.n = n - jb - j; + newarg.k = jb; + + newarg.a = sb; + newarg.lda = lda; + newarg.b = a + (j + j * lda) * COMPSIZE; + newarg.ldb = j + offset; + newarg.c = ipiv; + + newarg.common = NULL; + newarg.nthreads = args -> nthreads; + + gemm_thread_n(mode, &newarg, NULL, NULL, (void *)inner_thread, sa, sbb, args -> nthreads); + + } + } + + for (j = 0; j < mn; j += jb) { + jb = MIN(mn - j, blocking); + LASWP_PLUS(jb, j + jb + offset + 1, mn + offset, ZERO, +#ifdef COMPLEX + ZERO, +#endif + a - (offset - j * lda) * COMPSIZE, lda, NULL, 0 , ipiv, 1); + + } + + return info; +} diff --git a/lapack/getrf/getrf_single.c b/lapack/getrf/getrf_single.c new file mode 100644 index 0000000..a761dee --- /dev/null +++ b/lapack/getrf/getrf_single.c @@ -0,0 +1,173 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#define GEMM_PQ MAX(GEMM_P, GEMM_Q) +#define REAL_GEMM_R (GEMM_R - GEMM_PQ) + +static FLOAT dm1 = -1.; + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG m, n, lda, offset; + BLASLONG j, js, jmin, is, imin, jc, jcmin; + BLASLONG jjs, min_jj; + blasint *ipiv, iinfo, info; + BLASLONG jb, mn, blocking; + FLOAT *a, *offsetA, *offsetB; + BLASLONG range_N[2]; + + FLOAT *sbb; + + m = args -> m; + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + ipiv = (blasint *)args -> c; + offset = 0; + + if (range_n) { + m -= range_n[0]; + n = range_n[1] - range_n[0]; + offset = range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + if (m <= 0 || n <= 0) return 0; + + mn = MIN(m, n); + + blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + if (blocking > GEMM_Q) blocking = GEMM_Q; + + if (blocking <= GEMM_UNROLL_N * 2) { + info = GETF2(args, NULL, range_n, sa, sb, 0); + return info; + } + + sbb = (FLOAT *)((((long)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + + info = 0; + + for (j = 0; j < mn; j += blocking) { + + jb = mn - j; + if (jb > blocking) jb = blocking; + + offsetA = a + j * lda * COMPSIZE; + offsetB = a + (j + jb) * lda * COMPSIZE; + + range_N[0] = offset + j; + range_N[1] = offset + j + jb; + + iinfo = CNAME(args, NULL, range_N, sa, sb, 0); + + if (iinfo && !info) info = iinfo + j; + + if (j + jb < n) { + + TRSM_ILTCOPY(jb, jb, offsetA + j * COMPSIZE, lda, 0, sb); + + for (js = j + jb; js < n; js += REAL_GEMM_R){ + jmin = n - js; + if (jmin > REAL_GEMM_R) jmin = REAL_GEMM_R; + + for (jjs = js; jjs < js + jmin; jjs += GEMM_UNROLL_N){ + min_jj = js + jmin - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + +#if 0 + LASWP_PLUS(min_jj, j + offset + 1, j + jb + offset, ZERO, +#ifdef COMPLEX + ZERO, +#endif + a + (- offset + jjs * lda) * COMPSIZE, lda, NULL, 0 , ipiv, 1); + + GEMM_ONCOPY (jb, min_jj, a + (j + jjs * lda) * COMPSIZE, lda, sbb + jb * (jjs - js) * COMPSIZE); +#else + LASWP_NCOPY(min_jj, j + offset + 1, j + jb + offset, + a + (- offset + jjs * lda) * COMPSIZE, lda, ipiv, sbb + jb * (jjs - js) * COMPSIZE); +#endif + + + for (jc = 0; jc < jb; jc += GEMM_P) { + jcmin = jb - jc; + if (jcmin > GEMM_P) jcmin = GEMM_P; + + TRSM_KERNEL_LT(jcmin, min_jj, jb, dm1, +#ifdef COMPLEX + ZERO, +#endif + sb + jb * jc * COMPSIZE, + sbb + jb * (jjs - js) * COMPSIZE, + a + (j + jc + jjs * lda) * COMPSIZE, lda, jc); + } + } + + + for (is = j + jb; is < m; is += GEMM_P){ + + imin = m - is; + if (imin > GEMM_P) imin = GEMM_P; + + GEMM_ITCOPY (jb, imin, offsetA + is * COMPSIZE, lda, sa); + + GEMM_KERNEL_N(imin, jmin, jb, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sbb, a + (is + js * lda) * COMPSIZE, lda); + } + } + } + } + + for (j = 0; j < mn; j += jb) { + jb = MIN(mn - j, blocking); + LASWP_PLUS(jb, j + jb + offset + 1, mn + offset, ZERO, +#ifdef COMPLEX + ZERO, +#endif + a - (offset - j * lda) * COMPSIZE, lda, NULL, 0 , ipiv, 1); + + } + + return info; +} diff --git a/lapack/getri/._cgetri.f b/lapack/getri/._cgetri.f new file mode 100644 index 0000000..671cf46 Binary files /dev/null and b/lapack/getri/._cgetri.f differ diff --git a/lapack/getri/._dgetri.f b/lapack/getri/._dgetri.f new file mode 100644 index 0000000..553093b Binary files /dev/null and b/lapack/getri/._dgetri.f differ diff --git a/lapack/getri/._sgetri.f b/lapack/getri/._sgetri.f new file mode 100644 index 0000000..57aa4d5 Binary files /dev/null and b/lapack/getri/._sgetri.f differ diff --git a/lapack/getri/._zgetri.f b/lapack/getri/._zgetri.f new file mode 100644 index 0000000..334b597 Binary files /dev/null and b/lapack/getri/._zgetri.f differ diff --git a/lapack/getri/cgetri.f b/lapack/getri/cgetri.f new file mode 100644 index 0000000..6840f53 --- /dev/null +++ b/lapack/getri/cgetri.f @@ -0,0 +1,194 @@ + SUBROUTINE CGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* June 30, 1999 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LWORK, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX A( LDA, * ), WORK( * ) +* .. +* +* Purpose +* ======= +* +* CGETRI computes the inverse of a matrix using the LU factorization +* computed by CGETRF. +* +* This method inverts U and then computes inv(A) by solving the system +* inv(A)*L = inv(U) for inv(A). +* +* Arguments +* ========= +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the factors L and U from the factorization +* A = P*L*U as computed by CGETRF. +* On exit, if INFO = 0, the inverse of the original matrix A. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* IPIV (input) INTEGER array, dimension (N) +* The pivot indices from CGETRF; for 1<=i<=N, row i of the +* matrix was interchanged with row IPIV(i). +* +* WORK (workspace/output) COMPLEX array, dimension (LWORK) +* On exit, if INFO=0, then WORK(1) returns the optimal LWORK. +* +* LWORK (input) INTEGER +* The dimension of the array WORK. LWORK >= max(1,N). +* For optimal performance LWORK >= N*NB, where NB is +* the optimal blocksize returned by ILAENV. +* +* If LWORK = -1, then a workspace query is assumed; the routine +* only calculates the optimal size of the WORK array, returns +* this value as the first entry of the WORK array, and no error +* message related to LWORK is issued by XERBLA. +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero; the matrix is +* singular and its inverse could not be computed. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ), + $ ONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER I, IWS, J, JB, JJ, JP, LDWORK, LWKOPT, NB, + $ NBMIN, NN +* .. +* .. External Functions .. + INTEGER ILAENV + EXTERNAL ILAENV +* .. +* .. External Subroutines .. + EXTERNAL CGEMM, CGEMV, CSWAP, CTRSM, CTRTRI, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + NB = ILAENV( 1, 'CGETRI', ' ', N, -1, -1, -1 ) + LWKOPT = N*NB + WORK( 1 ) = LWKOPT + LQUERY = ( LWORK.EQ.-1 ) + IF( N.LT.0 ) THEN + INFO = -1 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -3 + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + INFO = -6 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGETRI', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Form inv(U). If INFO > 0 from CTRTRI, then U is singular, +* and the inverse is not computed. +* + CALL CTRTRI( 'Upper', 'Non-unit', N, A, LDA, INFO ) + IF( INFO.GT.0 ) + $ RETURN +* + NBMIN = 2 + LDWORK = N + IF( NB.GT.1 .AND. NB.LT.N ) THEN + IWS = MAX( LDWORK*NB, 1 ) + IF( LWORK.LT.IWS ) THEN + NB = LWORK / LDWORK + NBMIN = MAX( 2, ILAENV( 2, 'CGETRI', ' ', N, -1, -1, -1 ) ) + END IF + ELSE + IWS = N + END IF +* +* Solve the equation inv(A)*L = inv(U) for inv(A). +* + IF( NB.LT.NBMIN .OR. NB.GE.N ) THEN +* +* Use unblocked code. +* + DO 20 J = N, 1, -1 +* +* Copy current column of L to WORK and replace with zeros. +* + DO 10 I = J + 1, N + WORK( I ) = A( I, J ) + A( I, J ) = ZERO + 10 CONTINUE +* +* Compute current column of inv(A). +* + IF( J.LT.N ) + $ CALL CGEMV( 'No transpose', N, N-J, -ONE, A( 1, J+1 ), + $ LDA, WORK( J+1 ), 1, ONE, A( 1, J ), 1 ) + 20 CONTINUE + ELSE +* +* Use blocked code. +* + NN = ( ( N-1 ) / NB )*NB + 1 + DO 50 J = NN, 1, -NB + JB = MIN( NB, N-J+1 ) +* +* Copy current block column of L to WORK and replace with +* zeros. +* + DO 40 JJ = J, J + JB - 1 + DO 30 I = JJ + 1, N + WORK( I+( JJ-J )*LDWORK ) = A( I, JJ ) + A( I, JJ ) = ZERO + 30 CONTINUE + 40 CONTINUE +* +* Compute current block column of inv(A). +* + IF( J+JB.LE.N ) + $ CALL CGEMM( 'No transpose', 'No transpose', N, JB, + $ N-J-JB+1, -ONE, A( 1, J+JB ), LDA, + $ WORK( J+JB ), LDWORK, ONE, A( 1, J ), LDA ) + CALL CTRSM( 'Right', 'Lower', 'No transpose', 'Unit', N, JB, + $ ONE, WORK( J ), LDWORK, A( 1, J ), LDA ) + 50 CONTINUE + END IF +* +* Apply column interchanges. +* + DO 60 J = N - 1, 1, -1 + JP = IPIV( J ) + IF( JP.NE.J ) + $ CALL CSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 ) + 60 CONTINUE +* + WORK( 1 ) = IWS + RETURN +* +* End of CGETRI +* + END diff --git a/lapack/getri/dgetri.f b/lapack/getri/dgetri.f new file mode 100644 index 0000000..c67a348 --- /dev/null +++ b/lapack/getri/dgetri.f @@ -0,0 +1,193 @@ + SUBROUTINE DGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* June 30, 1999 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LWORK, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + DOUBLE PRECISION A( LDA, * ), WORK( * ) +* .. +* +* Purpose +* ======= +* +* DGETRI computes the inverse of a matrix using the LU factorization +* computed by DGETRF. +* +* This method inverts U and then computes inv(A) by solving the system +* inv(A)*L = inv(U) for inv(A). +* +* Arguments +* ========= +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the factors L and U from the factorization +* A = P*L*U as computed by DGETRF. +* On exit, if INFO = 0, the inverse of the original matrix A. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* IPIV (input) INTEGER array, dimension (N) +* The pivot indices from DGETRF; for 1<=i<=N, row i of the +* matrix was interchanged with row IPIV(i). +* +* WORK (workspace/output) DOUBLE PRECISION array, dimension (LWORK) +* On exit, if INFO=0, then WORK(1) returns the optimal LWORK. +* +* LWORK (input) INTEGER +* The dimension of the array WORK. LWORK >= max(1,N). +* For optimal performance LWORK >= N*NB, where NB is +* the optimal blocksize returned by ILAENV. +* +* If LWORK = -1, then a workspace query is assumed; the routine +* only calculates the optimal size of the WORK array, returns +* this value as the first entry of the WORK array, and no error +* message related to LWORK is issued by XERBLA. +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero; the matrix is +* singular and its inverse could not be computed. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER I, IWS, J, JB, JJ, JP, LDWORK, LWKOPT, NB, + $ NBMIN, NN +* .. +* .. External Functions .. + INTEGER ILAENV + EXTERNAL ILAENV +* .. +* .. External Subroutines .. + EXTERNAL DGEMM, DGEMV, DSWAP, DTRSM, DTRTRI, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + NB = ILAENV( 1, 'DGETRI', ' ', N, -1, -1, -1 ) + LWKOPT = N*NB + WORK( 1 ) = LWKOPT + LQUERY = ( LWORK.EQ.-1 ) + IF( N.LT.0 ) THEN + INFO = -1 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -3 + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + INFO = -6 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGETRI', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Form inv(U). If INFO > 0 from DTRTRI, then U is singular, +* and the inverse is not computed. +* + CALL DTRTRI( 'Upper', 'Non-unit', N, A, LDA, INFO ) + IF( INFO.GT.0 ) + $ RETURN +* + NBMIN = 2 + LDWORK = N + IF( NB.GT.1 .AND. NB.LT.N ) THEN + IWS = MAX( LDWORK*NB, 1 ) + IF( LWORK.LT.IWS ) THEN + NB = LWORK / LDWORK + NBMIN = MAX( 2, ILAENV( 2, 'DGETRI', ' ', N, -1, -1, -1 ) ) + END IF + ELSE + IWS = N + END IF +* +* Solve the equation inv(A)*L = inv(U) for inv(A). +* + IF( NB.LT.NBMIN .OR. NB.GE.N ) THEN +* +* Use unblocked code. +* + DO 20 J = N, 1, -1 +* +* Copy current column of L to WORK and replace with zeros. +* + DO 10 I = J + 1, N + WORK( I ) = A( I, J ) + A( I, J ) = ZERO + 10 CONTINUE +* +* Compute current column of inv(A). +* + IF( J.LT.N ) + $ CALL DGEMV( 'No transpose', N, N-J, -ONE, A( 1, J+1 ), + $ LDA, WORK( J+1 ), 1, ONE, A( 1, J ), 1 ) + 20 CONTINUE + ELSE +* +* Use blocked code. +* + NN = ( ( N-1 ) / NB )*NB + 1 + DO 50 J = NN, 1, -NB + JB = MIN( NB, N-J+1 ) +* +* Copy current block column of L to WORK and replace with +* zeros. +* + DO 40 JJ = J, J + JB - 1 + DO 30 I = JJ + 1, N + WORK( I+( JJ-J )*LDWORK ) = A( I, JJ ) + A( I, JJ ) = ZERO + 30 CONTINUE + 40 CONTINUE +* +* Compute current block column of inv(A). +* + IF( J+JB.LE.N ) + $ CALL DGEMM( 'No transpose', 'No transpose', N, JB, + $ N-J-JB+1, -ONE, A( 1, J+JB ), LDA, + $ WORK( J+JB ), LDWORK, ONE, A( 1, J ), LDA ) + CALL DTRSM( 'Right', 'Lower', 'No transpose', 'Unit', N, JB, + $ ONE, WORK( J ), LDWORK, A( 1, J ), LDA ) + 50 CONTINUE + END IF +* +* Apply column interchanges. +* + DO 60 J = N - 1, 1, -1 + JP = IPIV( J ) + IF( JP.NE.J ) + $ CALL DSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 ) + 60 CONTINUE +* + WORK( 1 ) = IWS + RETURN +* +* End of DGETRI +* + END diff --git a/lapack/getri/sgetri.f b/lapack/getri/sgetri.f new file mode 100644 index 0000000..ec5932f --- /dev/null +++ b/lapack/getri/sgetri.f @@ -0,0 +1,193 @@ + SUBROUTINE SGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* June 30, 1999 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LWORK, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + REAL A( LDA, * ), WORK( * ) +* .. +* +* Purpose +* ======= +* +* SGETRI computes the inverse of a matrix using the LU factorization +* computed by SGETRF. +* +* This method inverts U and then computes inv(A) by solving the system +* inv(A)*L = inv(U) for inv(A). +* +* Arguments +* ========= +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the factors L and U from the factorization +* A = P*L*U as computed by SGETRF. +* On exit, if INFO = 0, the inverse of the original matrix A. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* IPIV (input) INTEGER array, dimension (N) +* The pivot indices from SGETRF; for 1<=i<=N, row i of the +* matrix was interchanged with row IPIV(i). +* +* WORK (workspace/output) REAL array, dimension (LWORK) +* On exit, if INFO=0, then WORK(1) returns the optimal LWORK. +* +* LWORK (input) INTEGER +* The dimension of the array WORK. LWORK >= max(1,N). +* For optimal performance LWORK >= N*NB, where NB is +* the optimal blocksize returned by ILAENV. +* +* If LWORK = -1, then a workspace query is assumed; the routine +* only calculates the optimal size of the WORK array, returns +* this value as the first entry of the WORK array, and no error +* message related to LWORK is issued by XERBLA. +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero; the matrix is +* singular and its inverse could not be computed. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER I, IWS, J, JB, JJ, JP, LDWORK, LWKOPT, NB, + $ NBMIN, NN +* .. +* .. External Functions .. + INTEGER ILAENV + EXTERNAL ILAENV +* .. +* .. External Subroutines .. + EXTERNAL SGEMM, SGEMV, SSWAP, STRSM, STRTRI, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + NB = ILAENV( 1, 'SGETRI', ' ', N, -1, -1, -1 ) + LWKOPT = N*NB + WORK( 1 ) = LWKOPT + LQUERY = ( LWORK.EQ.-1 ) + IF( N.LT.0 ) THEN + INFO = -1 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -3 + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + INFO = -6 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGETRI', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Form inv(U). If INFO > 0 from STRTRI, then U is singular, +* and the inverse is not computed. +* + CALL STRTRI( 'Upper', 'Non-unit', N, A, LDA, INFO ) + IF( INFO.GT.0 ) + $ RETURN +* + NBMIN = 2 + LDWORK = N + IF( NB.GT.1 .AND. NB.LT.N ) THEN + IWS = MAX( LDWORK*NB, 1 ) + IF( LWORK.LT.IWS ) THEN + NB = LWORK / LDWORK + NBMIN = MAX( 2, ILAENV( 2, 'SGETRI', ' ', N, -1, -1, -1 ) ) + END IF + ELSE + IWS = N + END IF +* +* Solve the equation inv(A)*L = inv(U) for inv(A). +* + IF( NB.LT.NBMIN .OR. NB.GE.N ) THEN +* +* Use unblocked code. +* + DO 20 J = N, 1, -1 +* +* Copy current column of L to WORK and replace with zeros. +* + DO 10 I = J + 1, N + WORK( I ) = A( I, J ) + A( I, J ) = ZERO + 10 CONTINUE +* +* Compute current column of inv(A). +* + IF( J.LT.N ) + $ CALL SGEMV( 'No transpose', N, N-J, -ONE, A( 1, J+1 ), + $ LDA, WORK( J+1 ), 1, ONE, A( 1, J ), 1 ) + 20 CONTINUE + ELSE +* +* Use blocked code. +* + NN = ( ( N-1 ) / NB )*NB + 1 + DO 50 J = NN, 1, -NB + JB = MIN( NB, N-J+1 ) +* +* Copy current block column of L to WORK and replace with +* zeros. +* + DO 40 JJ = J, J + JB - 1 + DO 30 I = JJ + 1, N + WORK( I+( JJ-J )*LDWORK ) = A( I, JJ ) + A( I, JJ ) = ZERO + 30 CONTINUE + 40 CONTINUE +* +* Compute current block column of inv(A). +* + IF( J+JB.LE.N ) + $ CALL SGEMM( 'No transpose', 'No transpose', N, JB, + $ N-J-JB+1, -ONE, A( 1, J+JB ), LDA, + $ WORK( J+JB ), LDWORK, ONE, A( 1, J ), LDA ) + CALL STRSM( 'Right', 'Lower', 'No transpose', 'Unit', N, JB, + $ ONE, WORK( J ), LDWORK, A( 1, J ), LDA ) + 50 CONTINUE + END IF +* +* Apply column interchanges. +* + DO 60 J = N - 1, 1, -1 + JP = IPIV( J ) + IF( JP.NE.J ) + $ CALL SSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 ) + 60 CONTINUE +* + WORK( 1 ) = IWS + RETURN +* +* End of SGETRI +* + END diff --git a/lapack/getri/zgetri.f b/lapack/getri/zgetri.f new file mode 100644 index 0000000..1eb4eb7 --- /dev/null +++ b/lapack/getri/zgetri.f @@ -0,0 +1,194 @@ + SUBROUTINE ZGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* June 30, 1999 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LWORK, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX*16 A( LDA, * ), WORK( * ) +* .. +* +* Purpose +* ======= +* +* ZGETRI computes the inverse of a matrix using the LU factorization +* computed by ZGETRF. +* +* This method inverts U and then computes inv(A) by solving the system +* inv(A)*L = inv(U) for inv(A). +* +* Arguments +* ========= +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the factors L and U from the factorization +* A = P*L*U as computed by ZGETRF. +* On exit, if INFO = 0, the inverse of the original matrix A. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* IPIV (input) INTEGER array, dimension (N) +* The pivot indices from ZGETRF; for 1<=i<=N, row i of the +* matrix was interchanged with row IPIV(i). +* +* WORK (workspace/output) COMPLEX*16 array, dimension (LWORK) +* On exit, if INFO=0, then WORK(1) returns the optimal LWORK. +* +* LWORK (input) INTEGER +* The dimension of the array WORK. LWORK >= max(1,N). +* For optimal performance LWORK >= N*NB, where NB is +* the optimal blocksize returned by ILAENV. +* +* If LWORK = -1, then a workspace query is assumed; the routine +* only calculates the optimal size of the WORK array, returns +* this value as the first entry of the WORK array, and no error +* message related to LWORK is issued by XERBLA. +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero; the matrix is +* singular and its inverse could not be computed. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ), + $ ONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER I, IWS, J, JB, JJ, JP, LDWORK, LWKOPT, NB, + $ NBMIN, NN +* .. +* .. External Functions .. + INTEGER ILAENV + EXTERNAL ILAENV +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZGEMM, ZGEMV, ZSWAP, ZTRSM, ZTRTRI +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + NB = ILAENV( 1, 'ZGETRI', ' ', N, -1, -1, -1 ) + LWKOPT = N*NB + WORK( 1 ) = LWKOPT + LQUERY = ( LWORK.EQ.-1 ) + IF( N.LT.0 ) THEN + INFO = -1 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -3 + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + INFO = -6 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGETRI', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Form inv(U). If INFO > 0 from ZTRTRI, then U is singular, +* and the inverse is not computed. +* + CALL ZTRTRI( 'Upper', 'Non-unit', N, A, LDA, INFO ) + IF( INFO.GT.0 ) + $ RETURN +* + NBMIN = 2 + LDWORK = N + IF( NB.GT.1 .AND. NB.LT.N ) THEN + IWS = MAX( LDWORK*NB, 1 ) + IF( LWORK.LT.IWS ) THEN + NB = LWORK / LDWORK + NBMIN = MAX( 2, ILAENV( 2, 'ZGETRI', ' ', N, -1, -1, -1 ) ) + END IF + ELSE + IWS = N + END IF +* +* Solve the equation inv(A)*L = inv(U) for inv(A). +* + IF( NB.LT.NBMIN .OR. NB.GE.N ) THEN +* +* Use unblocked code. +* + DO 20 J = N, 1, -1 +* +* Copy current column of L to WORK and replace with zeros. +* + DO 10 I = J + 1, N + WORK( I ) = A( I, J ) + A( I, J ) = ZERO + 10 CONTINUE +* +* Compute current column of inv(A). +* + IF( J.LT.N ) + $ CALL ZGEMV( 'No transpose', N, N-J, -ONE, A( 1, J+1 ), + $ LDA, WORK( J+1 ), 1, ONE, A( 1, J ), 1 ) + 20 CONTINUE + ELSE +* +* Use blocked code. +* + NN = ( ( N-1 ) / NB )*NB + 1 + DO 50 J = NN, 1, -NB + JB = MIN( NB, N-J+1 ) +* +* Copy current block column of L to WORK and replace with +* zeros. +* + DO 40 JJ = J, J + JB - 1 + DO 30 I = JJ + 1, N + WORK( I+( JJ-J )*LDWORK ) = A( I, JJ ) + A( I, JJ ) = ZERO + 30 CONTINUE + 40 CONTINUE +* +* Compute current block column of inv(A). +* + IF( J+JB.LE.N ) + $ CALL ZGEMM( 'No transpose', 'No transpose', N, JB, + $ N-J-JB+1, -ONE, A( 1, J+JB ), LDA, + $ WORK( J+JB ), LDWORK, ONE, A( 1, J ), LDA ) + CALL ZTRSM( 'Right', 'Lower', 'No transpose', 'Unit', N, JB, + $ ONE, WORK( J ), LDWORK, A( 1, J ), LDA ) + 50 CONTINUE + END IF +* +* Apply column interchanges. +* + DO 60 J = N - 1, 1, -1 + JP = IPIV( J ) + IF( JP.NE.J ) + $ CALL ZSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 ) + 60 CONTINUE +* + WORK( 1 ) = IWS + RETURN +* +* End of ZGETRI +* + END diff --git a/lapack/getrs/._Makefile b/lapack/getrs/._Makefile new file mode 100644 index 0000000..83d99c7 Binary files /dev/null and b/lapack/getrs/._Makefile differ diff --git a/lapack/getrs/._getrs_parallel.c b/lapack/getrs/._getrs_parallel.c new file mode 100644 index 0000000..d914d32 Binary files /dev/null and b/lapack/getrs/._getrs_parallel.c differ diff --git a/lapack/getrs/._getrs_single.c b/lapack/getrs/._getrs_single.c new file mode 100644 index 0000000..28e2410 Binary files /dev/null and b/lapack/getrs/._getrs_single.c differ diff --git a/lapack/getrs/._zgetrs_parallel.c b/lapack/getrs/._zgetrs_parallel.c new file mode 100644 index 0000000..a8738a1 Binary files /dev/null and b/lapack/getrs/._zgetrs_parallel.c differ diff --git a/lapack/getrs/._zgetrs_single.c b/lapack/getrs/._zgetrs_single.c new file mode 100644 index 0000000..84c66ea Binary files /dev/null and b/lapack/getrs/._zgetrs_single.c differ diff --git a/lapack/getrs/Makefile b/lapack/getrs/Makefile new file mode 100644 index 0000000..2640ef0 --- /dev/null +++ b/lapack/getrs/Makefile @@ -0,0 +1,236 @@ +TOPDIR = ../.. +include ../../Makefile.system + +SBLASOBJS = sgetrs_N_single.$(SUFFIX) sgetrs_T_single.$(SUFFIX) +DBLASOBJS = dgetrs_N_single.$(SUFFIX) dgetrs_T_single.$(SUFFIX) +QBLASOBJS = qgetrs_N_single.$(SUFFIX) qgetrs_T_single.$(SUFFIX) +CBLASOBJS = cgetrs_N_single.$(SUFFIX) cgetrs_T_single.$(SUFFIX) cgetrs_R_single.$(SUFFIX) cgetrs_C_single.$(SUFFIX) +ZBLASOBJS = zgetrs_N_single.$(SUFFIX) zgetrs_T_single.$(SUFFIX) zgetrs_R_single.$(SUFFIX) zgetrs_C_single.$(SUFFIX) +XBLASOBJS = xgetrs_N_single.$(SUFFIX) xgetrs_T_single.$(SUFFIX) xgetrs_R_single.$(SUFFIX) xgetrs_C_single.$(SUFFIX) + +ifdef SMP +SBLASOBJS += sgetrs_N_parallel.$(SUFFIX) sgetrs_T_parallel.$(SUFFIX) +DBLASOBJS += dgetrs_N_parallel.$(SUFFIX) dgetrs_T_parallel.$(SUFFIX) +QBLASOBJS += qgetrs_N_parallel.$(SUFFIX) qgetrs_T_parallel.$(SUFFIX) +CBLASOBJS += cgetrs_N_parallel.$(SUFFIX) cgetrs_T_parallel.$(SUFFIX) cgetrs_R_parallel.$(SUFFIX) cgetrs_C_parallel.$(SUFFIX) +ZBLASOBJS += zgetrs_N_parallel.$(SUFFIX) zgetrs_T_parallel.$(SUFFIX) zgetrs_R_parallel.$(SUFFIX) zgetrs_C_parallel.$(SUFFIX) +XBLASOBJS += xgetrs_N_parallel.$(SUFFIX) xgetrs_T_parallel.$(SUFFIX) xgetrs_R_parallel.$(SUFFIX) xgetrs_C_parallel.$(SUFFIX) +endif + +sgetrs_N_single.$(SUFFIX) : getrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANS $< -o $(@F) + +sgetrs_T_single.$(SUFFIX) : getrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANS $< -o $(@F) + +sgetrs_N_parallel.$(SUFFIX) : getrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANS $< -o $(@F) + +sgetrs_T_parallel.$(SUFFIX) : getrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANS $< -o $(@F) + +dgetrs_N_single.$(SUFFIX) : getrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANS $< -o $(@F) + +dgetrs_T_single.$(SUFFIX) : getrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANS $< -o $(@F) + +dgetrs_N_parallel.$(SUFFIX) : getrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANS $< -o $(@F) + +dgetrs_T_parallel.$(SUFFIX) : getrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANS $< -o $(@F) + +qgetrs_N_single.$(SUFFIX) : getrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANS $< -o $(@F) + +qgetrs_T_single.$(SUFFIX) : getrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANS $< -o $(@F) + +qgetrs_N_parallel.$(SUFFIX) : getrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANS $< -o $(@F) + +qgetrs_T_parallel.$(SUFFIX) : getrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANS $< -o $(@F) + +cgetrs_N_single.$(SUFFIX) : zgetrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=1 $< -o $(@F) + +cgetrs_T_single.$(SUFFIX) : zgetrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=2 $< -o $(@F) + +cgetrs_R_single.$(SUFFIX) : zgetrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=3 $< -o $(@F) + +cgetrs_C_single.$(SUFFIX) : zgetrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=4 $< -o $(@F) + +cgetrs_N_parallel.$(SUFFIX) : zgetrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=1 $< -o $(@F) + +cgetrs_T_parallel.$(SUFFIX) : zgetrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=2 $< -o $(@F) + +cgetrs_R_parallel.$(SUFFIX) : zgetrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=3 $< -o $(@F) + +cgetrs_C_parallel.$(SUFFIX) : zgetrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=4 $< -o $(@F) + +zgetrs_N_single.$(SUFFIX) : zgetrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=1 $< -o $(@F) + +zgetrs_T_single.$(SUFFIX) : zgetrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=2 $< -o $(@F) + +zgetrs_R_single.$(SUFFIX) : zgetrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=3 $< -o $(@F) + +zgetrs_C_single.$(SUFFIX) : zgetrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=4 $< -o $(@F) + +zgetrs_N_parallel.$(SUFFIX) : zgetrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=1 $< -o $(@F) + +zgetrs_T_parallel.$(SUFFIX) : zgetrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=2 $< -o $(@F) + +zgetrs_R_parallel.$(SUFFIX) : zgetrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=3 $< -o $(@F) + +zgetrs_C_parallel.$(SUFFIX) : zgetrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=4 $< -o $(@F) + +xgetrs_N_single.$(SUFFIX) : zgetrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=1 $< -o $(@F) + +xgetrs_T_single.$(SUFFIX) : zgetrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=2 $< -o $(@F) + +xgetrs_R_single.$(SUFFIX) : zgetrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=3 $< -o $(@F) + +xgetrs_C_single.$(SUFFIX) : zgetrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=4 $< -o $(@F) + +xgetrs_N_parallel.$(SUFFIX) : zgetrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=1 $< -o $(@F) + +xgetrs_T_parallel.$(SUFFIX) : zgetrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=2 $< -o $(@F) + +xgetrs_R_parallel.$(SUFFIX) : zgetrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=3 $< -o $(@F) + +xgetrs_C_parallel.$(SUFFIX) : zgetrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=4 $< -o $(@F) + +sgetrs_N_single.$(PSUFFIX) : getrs_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANS $< -o $(@F) + +sgetrs_T_single.$(PSUFFIX) : getrs_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANS $< -o $(@F) + +sgetrs_N_parallel.$(PSUFFIX) : getrs_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANS $< -o $(@F) + +sgetrs_T_parallel.$(PSUFFIX) : getrs_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANS $< -o $(@F) + +dgetrs_N_single.$(PSUFFIX) : getrs_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANS $< -o $(@F) + +dgetrs_T_single.$(PSUFFIX) : getrs_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANS $< -o $(@F) + +dgetrs_N_parallel.$(PSUFFIX) : getrs_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANS $< -o $(@F) + +dgetrs_T_parallel.$(PSUFFIX) : getrs_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANS $< -o $(@F) + +qgetrs_N_single.$(PSUFFIX) : getrs_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANS $< -o $(@F) + +qgetrs_T_single.$(PSUFFIX) : getrs_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANS $< -o $(@F) + +qgetrs_N_parallel.$(PSUFFIX) : getrs_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANS $< -o $(@F) + +qgetrs_T_parallel.$(PSUFFIX) : getrs_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANS $< -o $(@F) + +cgetrs_N_single.$(PSUFFIX) : zgetrs_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=1 $< -o $(@F) + +cgetrs_T_single.$(PSUFFIX) : zgetrs_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=2 $< -o $(@F) + +cgetrs_R_single.$(PSUFFIX) : zgetrs_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=3 $< -o $(@F) + +cgetrs_C_single.$(PSUFFIX) : zgetrs_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=4 $< -o $(@F) + +cgetrs_N_parallel.$(PSUFFIX) : zgetrs_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=1 $< -o $(@F) + +cgetrs_T_parallel.$(PSUFFIX) : zgetrs_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=2 $< -o $(@F) + +cgetrs_R_parallel.$(PSUFFIX) : zgetrs_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=3 $< -o $(@F) + +cgetrs_C_parallel.$(PSUFFIX) : zgetrs_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=4 $< -o $(@F) + +zgetrs_N_single.$(PSUFFIX) : zgetrs_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=1 $< -o $(@F) + +zgetrs_T_single.$(PSUFFIX) : zgetrs_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=2 $< -o $(@F) + +zgetrs_R_single.$(PSUFFIX) : zgetrs_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=3 $< -o $(@F) + +zgetrs_C_single.$(PSUFFIX) : zgetrs_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=4 $< -o $(@F) + +zgetrs_N_parallel.$(PSUFFIX) : zgetrs_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=1 $< -o $(@F) + +zgetrs_T_parallel.$(PSUFFIX) : zgetrs_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=2 $< -o $(@F) + +zgetrs_R_parallel.$(PSUFFIX) : zgetrs_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=3 $< -o $(@F) + +zgetrs_C_parallel.$(PSUFFIX) : zgetrs_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=4 $< -o $(@F) + +xgetrs_N_single.$(PSUFFIX) : zgetrs_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=1 $< -o $(@F) + +xgetrs_T_single.$(PSUFFIX) : zgetrs_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=2 $< -o $(@F) + +xgetrs_R_single.$(PSUFFIX) : zgetrs_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=3 $< -o $(@F) + +xgetrs_C_single.$(PSUFFIX) : zgetrs_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=4 $< -o $(@F) + +xgetrs_N_parallel.$(PSUFFIX) : zgetrs_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=1 $< -o $(@F) + +xgetrs_T_parallel.$(PSUFFIX) : zgetrs_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=2 $< -o $(@F) + +xgetrs_R_parallel.$(PSUFFIX) : zgetrs_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=3 $< -o $(@F) + +xgetrs_C_parallel.$(PSUFFIX) : zgetrs_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=4 $< -o $(@F) + +include ../../Makefile.tail diff --git a/lapack/getrs/getrs_parallel.c b/lapack/getrs/getrs_parallel.c new file mode 100644 index 0000000..3a7e426 --- /dev/null +++ b/lapack/getrs/getrs_parallel.c @@ -0,0 +1,107 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, + FLOAT *sa, FLOAT *sb, BLASLONG mypos) { + + BLASLONG n = args -> n; + BLASLONG off = 0; + + if (range_n) { + n = range_n[1] - range_n[0]; + off = range_n[0]; + } + +#ifndef TRANS + LASWP_PLUS(n, 1, args -> m, ZERO, + (FLOAT *)args -> b + off * args -> ldb * COMPSIZE, args -> ldb, NULL, 0, args -> c, 1); + TRSM_LNLU (args, range_m, range_n, sa, sb, 0); + TRSM_LNUN (args, range_m, range_n, sa, sb, 0); +#else + TRSM_LTUN (args, range_m, range_n, sa, sb, 0); + TRSM_LTLU (args, range_m, range_n, sa, sb, 0); + LASWP_MINUS(n, 1, args -> m, ZERO, + (FLOAT *)args -> b + off * args -> ldb * COMPSIZE, args -> ldb, NULL, 0, args -> c, -1); +#endif + + return 0; +} + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) { + + int mode; + +#ifndef TRANS + if (args -> n == 1){ + LASWP_PLUS(1, 1, args -> m, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, 1); + TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); + TRSV_NUN (args -> m, args -> a, args -> lda, args -> b, 1, sb); + } else { +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif + + gemm_thread_n(mode, args, NULL, NULL, inner_thread, sa, sb, args -> nthreads); + } +#else + if (args -> n == 1){ + TRSV_TUN (args -> m, args -> a, args -> lda, args -> b, 1, sb); + TRSV_TLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); + LASWP_MINUS(1, 1, args -> m, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, -1); + } else { +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL | (1 << BLAS_TRANSA_SHIFT); +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL | (1 << BLAS_TRANSA_SHIFT); +#else + mode = BLAS_SINGLE | BLAS_REAL | (1 << BLAS_TRANSA_SHIFT); +#endif + + gemm_thread_n(mode, args, NULL, NULL, inner_thread, sa, sb, args -> nthreads); + } +#endif + + return 0; + } diff --git a/lapack/getrs/getrs_single.c b/lapack/getrs/getrs_single.c new file mode 100644 index 0000000..0dbb038 --- /dev/null +++ b/lapack/getrs/getrs_single.c @@ -0,0 +1,68 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) { + +#ifndef TRANS + LASWP_PLUS(args -> n, 1, args -> m, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, 1); + + if (args -> n == 1){ + TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); + TRSV_NUN (args -> m, args -> a, args -> lda, args -> b, 1, sb); + } else { + TRSM_LNLU (args, range_m, range_n, sa, sb, 0); + TRSM_LNUN (args, range_m, range_n, sa, sb, 0); + } + +#else + + if (args -> n == 1){ + TRSV_TUN (args -> m, args -> a, args -> lda, args -> b, 1, sb); + TRSV_TLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); + } else { + TRSM_LTUN (args, range_m, range_n, sa, sb, 0); + TRSM_LTLU (args, range_m, range_n, sa, sb, 0); + } + + LASWP_MINUS(args -> n, 1, args -> m, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, -1); +#endif + + return 0; } diff --git a/lapack/getrs/zgetrs_parallel.c b/lapack/getrs/zgetrs_parallel.c new file mode 100644 index 0000000..b0d3fb0 --- /dev/null +++ b/lapack/getrs/zgetrs_parallel.c @@ -0,0 +1,113 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, + FLOAT *sa, FLOAT *sb, BLASLONG mypos) { + + BLASLONG n = args -> n; + BLASLONG off = 0; + + if (range_n) { + n = range_n[1] - range_n[0]; + off = range_n[0]; + } + +#if TRANS == 1 + LASWP_PLUS(n, 1, args -> m, ZERO, ZERO, + (FLOAT *)args -> b + off * args -> ldb * COMPSIZE, args -> ldb, NULL, 0, args -> c, 1); + TRSM_LNLU (args, range_m, range_n, sa, sb, 0); + TRSM_LNUN (args, range_m, range_n, sa, sb, 0); +#elif TRANS == 2 + TRSM_LTUN (args, range_m, range_n, sa, sb, 0); + TRSM_LTLU (args, range_m, range_n, sa, sb, 0); + LASWP_MINUS(n, 1, args -> m, ZERO, ZERO, + (FLOAT *)args -> b + off * args -> ldb * COMPSIZE, args -> ldb, NULL, 0, args -> c, -1); +#elif TRANS == 3 + LASWP_PLUS(n, 1, args -> m, ZERO, ZERO, + (FLOAT *)args -> b + off * args -> ldb * COMPSIZE, args -> ldb, NULL, 0, args -> c, 1); + TRSM_LRLU (args, range_m, range_n, sa, sb, 0); + TRSM_LRUN (args, range_m, range_n, sa, sb, 0); +#else + TRSM_LCUN (args, range_m, range_n, sa, sb, 0); + TRSM_LCLU (args, range_m, range_n, sa, sb, 0); + LASWP_MINUS(n, 1, args -> m, ZERO, ZERO, + (FLOAT *)args -> b + off * args -> ldb * COMPSIZE, args -> ldb, NULL, 0, args -> c, -1); +#endif + + return 0; +} + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) { + + int mode; + + if (args -> n == 1){ +#if TRANS == 1 + LASWP_PLUS(1, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, 1); + ZTRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); + ZTRSV_NUN (args -> m, args -> a, args -> lda, args -> b, 1, sb); +#elif TRANS == 2 + ZTRSV_TUN (args -> m, args -> a, args -> lda, args -> b, 1, sb); + ZTRSV_TLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); + LASWP_MINUS(1, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, -1); +#elif TRANS == 3 + LASWP_PLUS(1, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, 1); + ZTRSV_RLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); + ZTRSV_RUN (args -> m, args -> a, args -> lda, args -> b, 1, sb); +#else + ZTRSV_CUN (args -> m, args -> a, args -> lda, args -> b, 1, sb); + ZTRSV_CLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); + LASWP_MINUS(1, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, -1); +#endif + } else { +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif + + gemm_thread_n(mode, args, NULL, NULL, inner_thread, sa, sb, args -> nthreads); + } + + return 0; + } diff --git a/lapack/getrs/zgetrs_single.c b/lapack/getrs/zgetrs_single.c new file mode 100644 index 0000000..3910d0e --- /dev/null +++ b/lapack/getrs/zgetrs_single.c @@ -0,0 +1,66 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) { + +#if TRANS == 1 + LASWP_PLUS (args -> n, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, 1); + + TRSM_LNLU(args, range_m, range_n, sa, sb, 0); + TRSM_LNUN(args, range_m, range_n, sa, sb, 0); +#elif TRANS == 2 + TRSM_LTUN(args, range_m, range_n, sa, sb, 0); + TRSM_LTLU(args, range_m, range_n, sa, sb, 0); + + LASWP_MINUS(args -> n, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, -1); +#elif TRANS == 3 + LASWP_PLUS (args -> n, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, 1); + + TRSM_LRLU(args, range_m, range_n, sa, sb, 0); + TRSM_LRUN(args, range_m, range_n, sa, sb, 0); +#else + TRSM_LCUN(args, range_m, range_n, sa, sb, 0); + TRSM_LCLU(args, range_m, range_n, sa, sb, 0); + + LASWP_MINUS(args -> n, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, -1); +#endif + return 0; + } diff --git a/lapack/laswp/._Makefile b/lapack/laswp/._Makefile new file mode 100644 index 0000000..1852f1f Binary files /dev/null and b/lapack/laswp/._Makefile differ diff --git a/lapack/laswp/._alpha b/lapack/laswp/._alpha new file mode 100755 index 0000000..86f97f8 Binary files /dev/null and b/lapack/laswp/._alpha differ diff --git a/lapack/laswp/._generic b/lapack/laswp/._generic new file mode 100755 index 0000000..bdd39e6 Binary files /dev/null and b/lapack/laswp/._generic differ diff --git a/lapack/laswp/._ia64 b/lapack/laswp/._ia64 new file mode 100755 index 0000000..454ae12 Binary files /dev/null and b/lapack/laswp/._ia64 differ diff --git a/lapack/laswp/._mips64 b/lapack/laswp/._mips64 new file mode 100755 index 0000000..f84c3c7 Binary files /dev/null and b/lapack/laswp/._mips64 differ diff --git a/lapack/laswp/._power b/lapack/laswp/._power new file mode 100755 index 0000000..71a2f10 Binary files /dev/null and b/lapack/laswp/._power differ diff --git a/lapack/laswp/._sparc b/lapack/laswp/._sparc new file mode 100755 index 0000000..31d9894 Binary files /dev/null and b/lapack/laswp/._sparc differ diff --git a/lapack/laswp/._x86 b/lapack/laswp/._x86 new file mode 100755 index 0000000..686c2f4 Binary files /dev/null and b/lapack/laswp/._x86 differ diff --git a/lapack/laswp/._x86_64 b/lapack/laswp/._x86_64 new file mode 100755 index 0000000..969b925 Binary files /dev/null and b/lapack/laswp/._x86_64 differ diff --git a/lapack/laswp/Makefile b/lapack/laswp/Makefile new file mode 100644 index 0000000..3898006 --- /dev/null +++ b/lapack/laswp/Makefile @@ -0,0 +1,22 @@ +TOPDIR = ../.. +include ../../Makefile.system + +SBLASOBJS = slaswp_plus.$(SUFFIX) slaswp_minus.$(SUFFIX) +DBLASOBJS = dlaswp_plus.$(SUFFIX) dlaswp_minus.$(SUFFIX) +QBLASOBJS = qlaswp_plus.$(SUFFIX) qlaswp_minus.$(SUFFIX) +CBLASOBJS = claswp_plus.$(SUFFIX) claswp_minus.$(SUFFIX) +ZBLASOBJS = zlaswp_plus.$(SUFFIX) zlaswp_minus.$(SUFFIX) +XBLASOBJS = xlaswp_plus.$(SUFFIX) xlaswp_minus.$(SUFFIX) + +slaswp_plus.$(SUFFIX) slaswp_minus.$(SUFFIX) dlaswp_plus.$(SUFFIX) dlaswp_minus.$(SUFFIX) \ +qlaswp_plus.$(SUFFIX) qlaswp_minus.$(SUFFIX) \ +claswp_plus.$(SUFFIX) claswp_minus.$(SUFFIX) zlaswp_plus.$(SUFFIX) zlaswp_minus.$(SUFFIX) \ +xlaswp_plus.$(SUFFIX) xlaswp_minus.$(SUFFIX) \ +slaswp_plus.$(PSUFFIX) slaswp_minus.$(PSUFFIX) dlaswp_plus.$(PSUFFIX) dlaswp_minus.$(PSUFFIX) \ +qlaswp_plus.$(PSUFFIX) qlaswp_minus.$(PSUFFIX) \ +claswp_plus.$(PSUFFIX) claswp_minus.$(PSUFFIX) zlaswp_plus.$(PSUFFIX) zlaswp_minus.$(PSUFFIX) \ +xlaswp_plus.$(PSUFFIX) xlaswp_minus.$(PSUFFIX) : dummy + cd $(ARCH) && $(MAKE) ../$(@F) + +include ../../Makefile.tail + diff --git a/lapack/laswp/alpha/._Makefile b/lapack/laswp/alpha/._Makefile new file mode 100644 index 0000000..f5c572a Binary files /dev/null and b/lapack/laswp/alpha/._Makefile differ diff --git a/lapack/laswp/alpha/Makefile b/lapack/laswp/alpha/Makefile new file mode 100644 index 0000000..af1f019 --- /dev/null +++ b/lapack/laswp/alpha/Makefile @@ -0,0 +1,8 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c + +include ../generic/Makefile + diff --git a/lapack/laswp/generic/._Makefile b/lapack/laswp/generic/._Makefile new file mode 100644 index 0000000..413d4cb Binary files /dev/null and b/lapack/laswp/generic/._Makefile differ diff --git a/lapack/laswp/generic/._laswp_k.c b/lapack/laswp/generic/._laswp_k.c new file mode 100644 index 0000000..e5ccbb6 Binary files /dev/null and b/lapack/laswp/generic/._laswp_k.c differ diff --git a/lapack/laswp/generic/._laswp_k_1.c b/lapack/laswp/generic/._laswp_k_1.c new file mode 100644 index 0000000..95a79b0 Binary files /dev/null and b/lapack/laswp/generic/._laswp_k_1.c differ diff --git a/lapack/laswp/generic/._laswp_k_2.c b/lapack/laswp/generic/._laswp_k_2.c new file mode 100644 index 0000000..b5bd673 Binary files /dev/null and b/lapack/laswp/generic/._laswp_k_2.c differ diff --git a/lapack/laswp/generic/._laswp_k_4.c b/lapack/laswp/generic/._laswp_k_4.c new file mode 100644 index 0000000..c4b88b4 Binary files /dev/null and b/lapack/laswp/generic/._laswp_k_4.c differ diff --git a/lapack/laswp/generic/._laswp_k_8.c b/lapack/laswp/generic/._laswp_k_8.c new file mode 100644 index 0000000..b1ac8ca Binary files /dev/null and b/lapack/laswp/generic/._laswp_k_8.c differ diff --git a/lapack/laswp/generic/._zlaswp_k.c b/lapack/laswp/generic/._zlaswp_k.c new file mode 100644 index 0000000..6ee06e0 Binary files /dev/null and b/lapack/laswp/generic/._zlaswp_k.c differ diff --git a/lapack/laswp/generic/._zlaswp_k_1.c b/lapack/laswp/generic/._zlaswp_k_1.c new file mode 100644 index 0000000..2861999 Binary files /dev/null and b/lapack/laswp/generic/._zlaswp_k_1.c differ diff --git a/lapack/laswp/generic/._zlaswp_k_2.c b/lapack/laswp/generic/._zlaswp_k_2.c new file mode 100644 index 0000000..e28e3c3 Binary files /dev/null and b/lapack/laswp/generic/._zlaswp_k_2.c differ diff --git a/lapack/laswp/generic/._zlaswp_k_4.c b/lapack/laswp/generic/._zlaswp_k_4.c new file mode 100644 index 0000000..722d75d Binary files /dev/null and b/lapack/laswp/generic/._zlaswp_k_4.c differ diff --git a/lapack/laswp/generic/Makefile b/lapack/laswp/generic/Makefile new file mode 100644 index 0000000..bc9ab80 --- /dev/null +++ b/lapack/laswp/generic/Makefile @@ -0,0 +1,95 @@ +ifndef INCLUDED +TOPDIR = ../../.. +include $(TOPDIR)/Makefile.system +endif + +ifndef LASWP +LASWP = ../generic/laswp_k.c +endif + +ifndef ZLASWP +ZLASWP = ../generic/zlaswp_k.c +endif + +LASWP_DEPS = ../generic/laswp_k_1.c ../generic/laswp_k_2.c \ + ../generic/laswp_k_4.c ../generic/laswp_k_8.c + +ZLASWP_DEPS = ../generic/zlaswp_k_1.c ../generic/zlaswp_k_2.c \ + ../generic/zlaswp_k_4.c + +include ../../../Makefile.tail + +all: + +../slaswp_plus.$(SUFFIX) : $(LASWP) $(LASWP_DEPS) + $(CC) -c $(CFLAGS) -UDOUBLE -UMINUS $< -o ../$(@F) + +../slaswp_minus.$(SUFFIX) : $(LASWP) $(LASWP_DEPS) + $(CC) -c $(CFLAGS) -UDOUBLE -DMINUS $< -o ../$(@F) + +../dlaswp_plus.$(SUFFIX) : $(LASWP) $(LASWP_DEPS) + $(CC) -c $(CFLAGS) -DDOUBLE -UMINUS $< -o ../$(@F) + +../dlaswp_minus.$(SUFFIX) : $(LASWP) $(LASWP_DEPS) + $(CC) -c $(CFLAGS) -DDOUBLE -DMINUS $< -o ../$(@F) + +../qlaswp_plus.$(SUFFIX) : $(LASWP) $(LASWP_DEPS) + $(CC) -c $(CFLAGS) -DXDOUBLE -UMINUS $< -o ../$(@F) + +../qlaswp_minus.$(SUFFIX) : $(LASWP) $(LASWP_DEPS) + $(CC) -c $(CFLAGS) -DXDOUBLE -DMINUS $< -o ../$(@F) + +../claswp_plus.$(SUFFIX) : $(ZLASWP) $(ZLASWP_DEPS) + $(CC) -c $(CFLAGS) -UDOUBLE -UMINUS $< -o ../$(@F) + +../claswp_minus.$(SUFFIX) : $(ZLASWP) $(ZLASWP_DEPS) + $(CC) -c $(CFLAGS) -UDOUBLE -DMINUS $< -o ../$(@F) + +../zlaswp_plus.$(SUFFIX) : $(ZLASWP) $(ZLASWP_DEPS) + $(CC) -c $(CFLAGS) -DDOUBLE -UMINUS $< -o ../$(@F) + +../zlaswp_minus.$(SUFFIX) : $(ZLASWP) $(ZLASWP_DEPS) + $(CC) -c $(CFLAGS) -DDOUBLE -DMINUS $< -o ../$(@F) + +../xlaswp_plus.$(SUFFIX) : $(ZLASWP) $(ZLASWP_DEPS) + $(CC) -c $(CFLAGS) -DXDOUBLE -UMINUS $< -o ../$(@F) + +../xlaswp_minus.$(SUFFIX) : $(ZLASWP) $(ZLASWP_DEPS) + $(CC) -c $(CFLAGS) -DXDOUBLE -DMINUS $< -o ../$(@F) + +../slaswp_plus.$(PSUFFIX) : $(LASWP) + $(CC) -c $(PFLAGS) -UDOUBLE -UMINUS $< -o ../$(@F) + +../slaswp_minus.$(PSUFFIX) : $(LASWP) + $(CC) -c $(PFLAGS) -UDOUBLE -DMINUS $< -o ../$(@F) + +../dlaswp_plus.$(PSUFFIX) : $(LASWP) + $(CC) -c $(PFLAGS) -DDOUBLE -UMINUS $< -o ../$(@F) + +../dlaswp_minus.$(PSUFFIX) : $(LASWP) + $(CC) -c $(PFLAGS) -DDOUBLE -DMINUS $< -o ../$(@F) + +../qlaswp_plus.$(PSUFFIX) : $(LASWP) + $(CC) -c $(PFLAGS) -DXDOUBLE -UMINUS $< -o ../$(@F) + +../qlaswp_minus.$(PSUFFIX) : $(LASWP) + $(CC) -c $(PFLAGS) -DXDOUBLE -DMINUS $< -o ../$(@F) + +../claswp_plus.$(PSUFFIX) : $(ZLASWP) + $(CC) -c $(PFLAGS) -UDOUBLE -UMINUS $< -o ../$(@F) + +../claswp_minus.$(PSUFFIX) : $(ZLASWP) + $(CC) -c $(PFLAGS) -UDOUBLE -DMINUS $< -o ../$(@F) + +../zlaswp_plus.$(PSUFFIX) : $(ZLASWP) + $(CC) -c $(PFLAGS) -DDOUBLE -UMINUS $< -o ../$(@F) + +../zlaswp_minus.$(PSUFFIX) : $(ZLASWP) + $(CC) -c $(PFLAGS) -DDOUBLE -DMINUS $< -o ../$(@F) + +../xlaswp_plus.$(PSUFFIX) : $(ZLASWP) + $(CC) -c $(PFLAGS) -DXDOUBLE -UMINUS $< -o ../$(@F) + +../xlaswp_minus.$(PSUFFIX) : $(ZLASWP) + $(CC) -c $(PFLAGS) -DXDOUBLE -DMINUS $< -o ../$(@F) + diff --git a/lapack/laswp/generic/laswp_k.c b/lapack/laswp/generic/laswp_k.c new file mode 100644 index 0000000..b4ee019 --- /dev/null +++ b/lapack/laswp/generic/laswp_k.c @@ -0,0 +1,49 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +#if GEMM_UNROLL_N >= 8 +#include "laswp_k_8.c" +#elif GEMM_UNROLL_N >= 4 +#include "laswp_k_4.c" +#elif GEMM_UNROLL_N >= 2 +#include "laswp_k_2.c" +#else +#include "laswp_k_1.c" +#endif diff --git a/lapack/laswp/generic/laswp_k_1.c b/lapack/laswp/generic/laswp_k_1.c new file mode 100644 index 0000000..c190176 --- /dev/null +++ b/lapack/laswp/generic/laswp_k_1.c @@ -0,0 +1,195 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef MINUS +#define a2 (a1 + 1) +#else +#define a2 (a1 - 1) +#endif + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, + FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1; + FLOAT *b1, *b2; + FLOAT A1, A2, B1, B2; + + a--; + k1 --; + +#ifndef MINUS + ipiv += k1 +; +#else + ipiv -= (k2 - 1) * incx; +#endif + + if (n <= 0) return 0; + + j = n; + if (j > 0) { + do { + piv = ipiv; + +#ifndef MINUS + a1 = a + k1 + 1; +#else + a1 = a + k2; +#endif + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { +#ifdef OPTERON +#ifndef MINUS + asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(a1)); + asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(b1)); +#else + asm volatile("prefetchw -2 * 128(%0)\n" : : "r"(a1)); + asm volatile("prefetchw -2 * 128(%0)\n" : : "r"(b1)); +#endif +#endif + +#ifdef CORE2 +#ifndef MINUS + asm volatile("prefetcht1 2 * 128(%0)\n" : : "r"(a1)); + asm volatile("prefetcht1 2 * 128(%0)\n" : : "r"(b1)); + asm volatile("prefetcht1 2 * 128(%0)\n" : : "r"(b2)); +#else + asm volatile("prefetcht1 -2 * 128(%0)\n" : : "r"(a1)); + asm volatile("prefetcht1 -2 * 128(%0)\n" : : "r"(b1)); + asm volatile("prefetcht1 -2 * 128(%0)\n" : : "r"(b2)); +#endif +#endif + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + } + } + + b1 = a + ip1; + b2 = a + ip2; + +#ifndef MINUS + a1 += 2; +#else + a1 -= 2; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + *a1 = B1; + *b1 = A1; + } + + a += lda; + + j --; + } while (j > 0); + } + + return 0; +} + diff --git a/lapack/laswp/generic/laswp_k_2.c b/lapack/laswp/generic/laswp_k_2.c new file mode 100644 index 0000000..1105aee --- /dev/null +++ b/lapack/laswp/generic/laswp_k_2.c @@ -0,0 +1,324 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef MINUS +#define a2 (a1 + 1) +#define a4 (a3 + 1) +#else +#define a2 (a1 - 1) +#define a4 (a3 - 1) +#endif + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, + FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1, *a3; + FLOAT *b1, *b2, *b3, *b4; + FLOAT A1, A2, B1, B2, A3, A4, B3, B4; + + a--; + k1 --; + +#ifndef MINUS + ipiv += k1 +; +#else + ipiv -= (k2 - 1) * incx; +#endif + + if (n <= 0) return 0; + + j = (n >> 1); + if (j > 0) { + do { + piv = ipiv; + +#ifndef MINUS + a1 = a + k1 + 1; +#else + a1 = a + k2; +#endif + + a3 = a1 + 1 * lda; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { +#ifdef CORE2 +#ifndef MINUS + asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1)); + asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b3)); + asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(a1)); + asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(a3)); +#else + asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(b1)); + asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(b3)); + asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(a1)); + asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(a3)); +#endif +#endif + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + } + } + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + +#ifndef MINUS + a1 += 2; + a3 += 2; +#else + a1 -= 2; + a3 -= 2; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + A3 = *a3; + B3 = *b3; + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + } + + a += 2 * lda; + j --; + } while (j > 0); + } + + if (n & 1) { + piv = ipiv; + +#ifndef MINUS + a1 = a + k1 + 1; +#else + a1 = a + k2; +#endif + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + } + } + + b1 = a + ip1; + b2 = a + ip2; + +#ifndef MINUS + a1 += 2; +#else + a1 -= 2; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + *a1 = B1; + *b1 = A1; + } + } + + return 0; +} + diff --git a/lapack/laswp/generic/laswp_k_4.c b/lapack/laswp/generic/laswp_k_4.c new file mode 100644 index 0000000..e08d496 --- /dev/null +++ b/lapack/laswp/generic/laswp_k_4.c @@ -0,0 +1,529 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef MINUS +#define a2 (a1 + 1) +#define a4 (a3 + 1) +#define a6 (a5 + 1) +#define a8 (a7 + 1) +#else +#define a2 (a1 - 1) +#define a4 (a3 - 1) +#define a6 (a5 - 1) +#define a8 (a7 - 1) +#endif + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, + FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1, *a3, *a5, *a7; + FLOAT *b1, *b2, *b3, *b4; + FLOAT *b5, *b6, *b7, *b8; + FLOAT A1, A2, B1, B2, A3, A4, B3, B4; + FLOAT A5, A6, B5, B6, A7, A8, B7, B8; + + a--; + k1 --; + +#ifndef MINUS + ipiv += k1 +; +#else + ipiv -= (k2 - 1) * incx; +#endif + + if (n <= 0) return 0; + + j = (n >> 2); + if (j > 0) { + do { + piv = ipiv; + +#ifndef MINUS + a1 = a + k1 + 1; +#else + a1 = a + k2; +#endif + + a3 = a1 + 1 * lda; + a5 = a1 + 2 * lda; + a7 = a1 + 3 * lda; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + A5 = *a5; + A6 = *a6; + A7 = *a7; + A8 = *a8; + + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + B5 = *b5; + B6 = *b6; + B7 = *b7; + B8 = *b8; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + *a6 = B6; + *b6 = A6; + *a8 = B8; + *b8 = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + *a5 = A6; + *a6 = B6; + *b6 = A5; + *a7 = A8; + *a8 = B8; + *b8 = A7; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + *a5 = A6; + *a6 = B5; + *b5 = A5; + *a7 = A8; + *a8 = B7; + *b7 = A7; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + *a5 = B5; + *a6 = A5; + *b5 = A6; + *a7 = B7; + *a8 = A7; + *b7 = A8; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + *a5 = B5; + *a6 = B6; + *b5 = A5; + *b6 = A6; + *a7 = B7; + *a8 = B8; + *b7 = A7; + *b8 = A8; + } + } + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + +#ifndef MINUS + a1 += 2; + a3 += 2; + a5 += 2; + a7 += 2; +#else + a1 -= 2; + a3 -= 2; + a5 -= 2; + a7 -= 2; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + A3 = *a3; + B3 = *b3; + A5 = *a5; + B5 = *b5; + A7 = *a7; + B7 = *b7; + + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + } + + a += 4 * lda; + + j --; + } while (j > 0); + } + + if (n & 2) { + piv = ipiv; + +#ifndef MINUS + a1 = a + k1 + 1; +#else + a1 = a + k2; +#endif + + a3 = a1 + 1 * lda; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + } + } + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + +#ifndef MINUS + a1 += 2; + a3 += 2; +#else + a1 -= 2; + a3 -= 2; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + A3 = *a3; + B3 = *b3; + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + } + + a += 2 * lda; + } + + if (n & 1) { + piv = ipiv; + +#ifndef MINUS + a1 = a + k1 + 1; +#else + a1 = a + k2; +#endif + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + } + } + + b1 = a + ip1; + b2 = a + ip2; + +#ifndef MINUS + a1 += 2; +#else + a1 -= 2; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + *a1 = B1; + *b1 = A1; + } + } + + return 0; +} + diff --git a/lapack/laswp/generic/laswp_k_8.c b/lapack/laswp/generic/laswp_k_8.c new file mode 100644 index 0000000..a4d4bce --- /dev/null +++ b/lapack/laswp/generic/laswp_k_8.c @@ -0,0 +1,909 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef MINUS +#define a2 (a1 + 1) +#define a4 (a3 + 1) +#define a6 (a5 + 1) +#define a8 (a7 + 1) +#define a10 (a9 + 1) +#define a12 (a11 + 1) +#define a14 (a13 + 1) +#define a16 (a15 + 1) +#else +#define a2 (a1 - 1) +#define a4 (a3 - 1) +#define a6 (a5 - 1) +#define a8 (a7 - 1) +#define a10 (a9 - 1) +#define a12 (a11 - 1) +#define a14 (a13 - 1) +#define a16 (a15 - 1) +#endif + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, + FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1, *a3, *a5, *a7; + FLOAT *a9, *a11, *a13, *a15; + FLOAT *b1, *b2, *b3, *b4; + FLOAT *b5, *b6, *b7, *b8; + FLOAT *b9, *b10, *b11, *b12; + FLOAT *b13, *b14, *b15, *b16; + FLOAT A1, A2, B1, B2, A3, A4, B3, B4; + FLOAT A5, A6, B5, B6, A7, A8, B7, B8; + FLOAT A9, A10, B9, B10, A11, A12, B11, B12; + FLOAT A13, A14, B13, B14, A15, A16, B15, B16; + + a--; + k1 --; + +#ifndef MINUS + ipiv += k1; +#else + ipiv -= (k2 - 1) * incx; +#endif + + if (n <= 0) return 0; + + j = (n >> 3); + if (j > 0) { + do { + piv = ipiv; + +#ifndef MINUS + a1 = a + k1 + 1; +#else + a1 = a + k2; +#endif + + a3 = a1 + 1 * lda; + a5 = a1 + 2 * lda; + a7 = a1 + 3 * lda; + a9 = a1 + 4 * lda; + a11 = a1 + 5 * lda; + a13 = a1 + 6 * lda; + a15 = a1 + 7 * lda; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + b9 = b1 + 4 * lda; + b10 = b2 + 4 * lda; + b11 = b1 + 5 * lda; + b12 = b2 + 5 * lda; + b13 = b1 + 6 * lda; + b14 = b2 + 6 * lda; + b15 = b1 + 7 * lda; + b16 = b2 + 7 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + B5 = *b5; + B6 = *b6; + B7 = *b7; + B8 = *b8; + + B9 = *b9; + B10 = *b10; + B11 = *b11; + B12 = *b12; + B13 = *b13; + B14 = *b14; + B15 = *b15; + B16 = *b16; + + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + A5 = *a5; + A6 = *a6; + A7 = *a7; + A8 = *a8; + + A9 = *a9; + A10 = *a10; + A11 = *a11; + A12 = *a12; + A13 = *a13; + A14 = *a14; + A15 = *a15; + A16 = *a16; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + *a9 = A10; + *a10 = A9; + *a11 = A12; + *a12 = A11; + *a13 = A14; + *a14 = A13; + *a15 = A16; + *a16 = A15; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + *a6 = B6; + *b6 = A6; + *a8 = B8; + *b8 = A8; + + *a10 = B10; + *b10 = A10; + *a12 = B12; + *b12 = A12; + *a14 = B14; + *b14 = A14; + *a16 = B16; + *b16 = A16; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + + *a9 = A10; + *a10 = A9; + *a11 = A12; + *a12 = A11; + *a13 = A14; + *a14 = A13; + *a15 = A16; + *a16 = A15; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + *a5 = A6; + *a6 = B6; + *b6 = A5; + *a7 = A8; + *a8 = B8; + *b8 = A7; + + *a9 = A10; + *a10 = B10; + *b10 = A9; + *a11 = A12; + *a12 = B12; + *b12 = A11; + *a13 = A14; + *a14 = B14; + *b14 = A13; + *a15 = A16; + *a16 = B16; + *b16 = A15; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + *a5 = A6; + *a6 = B5; + *b5 = A5; + *a7 = A8; + *a8 = B7; + *b7 = A7; + + *a9 = A10; + *a10 = B9; + *b9 = A9; + *a11 = A12; + *a12 = B11; + *b11 = A11; + *a13 = A14; + *a14 = B13; + *b13 = A13; + *a15 = A16; + *a16 = B15; + *b15 = A15; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + + *a9 = B9; + *b9 = A9; + *a11 = B11; + *b11 = A11; + *a13 = B13; + *b13 = A13; + *a15 = B15; + *b15 = A15; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + *a5 = B5; + *a6 = A5; + *b5 = A6; + *a7 = B7; + *a8 = A7; + *b7 = A8; + + *a9 = B9; + *a10 = A9; + *b9 = A10; + *a11 = B11; + *a12 = A11; + *b11 = A12; + *a13 = B13; + *a14 = A13; + *b13 = A14; + *a15 = B15; + *a16 = A15; + *b15 = A16; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + *a5 = B5; + *a6 = B6; + *b5 = A5; + *b6 = A6; + *a7 = B7; + *a8 = B8; + *b7 = A7; + *b8 = A8; + + *a9 = B9; + *a10 = B10; + *b9 = A9; + *b10 = A10; + *a11 = B11; + *a12 = B12; + *b11 = A11; + *b12 = A12; + *a13 = B13; + *a14 = B14; + *b13 = A13; + *b14 = A14; + *a15 = B15; + *a16 = B16; + *b15 = A15; + *b16 = A16; + } + } + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + b9 = b1 + 4 * lda; + b10 = b2 + 4 * lda; + b11 = b1 + 5 * lda; + b12 = b2 + 5 * lda; + b13 = b1 + 6 * lda; + b14 = b2 + 6 * lda; + b15 = b1 + 7 * lda; + b16 = b2 + 7 * lda; + +#ifndef MINUS + a1 += 2; + a3 += 2; + a5 += 2; + a7 += 2; + a9 += 2; + a11 += 2; + a13 += 2; + a15 += 2; +#else + a1 -= 2; + a3 -= 2; + a5 -= 2; + a7 -= 2; + a9 -= 2; + a11 -= 2; + a13 -= 2; + a15 -= 2; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + A3 = *a3; + B3 = *b3; + A5 = *a5; + B5 = *b5; + A7 = *a7; + B7 = *b7; + + A9 = *a9; + B9 = *b9; + A11 = *a11; + B11 = *b11; + A13 = *a13; + B13 = *b13; + A15 = *a15; + B15 = *b15; + + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + + *a9 = B9; + *b9 = A9; + *a11 = B11; + *b11 = A11; + *a13 = B13; + *b13 = A13; + *a15 = B15; + *b15 = A15; + } + + a += 8 * lda; + + j --; + } while (j > 0); + } + + if (n & 4) { + piv = ipiv; + +#ifndef MINUS + a1 = a + k1 + 1; +#else + a1 = a + k2; +#endif + + a3 = a1 + 1 * lda; + a5 = a1 + 2 * lda; + a7 = a1 + 3 * lda; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + A5 = *a5; + A6 = *a6; + A7 = *a7; + A8 = *a8; + + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + B5 = *b5; + B6 = *b6; + B7 = *b7; + B8 = *b8; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + *a6 = B6; + *b6 = A6; + *a8 = B8; + *b8 = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + *a5 = A6; + *a6 = B6; + *b6 = A5; + *a7 = A8; + *a8 = B8; + *b8 = A7; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + *a5 = A6; + *a6 = B5; + *b5 = A5; + *a7 = A8; + *a8 = B7; + *b7 = A7; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + *a5 = B5; + *a6 = A5; + *b5 = A6; + *a7 = B7; + *a8 = A7; + *b7 = A8; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + *a5 = B5; + *a6 = B6; + *b5 = A5; + *b6 = A6; + *a7 = B7; + *a8 = B8; + *b7 = A7; + *b8 = A8; + } + } + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + +#ifndef MINUS + a1 += 2; + a3 += 2; + a5 += 2; + a7 += 2; +#else + a1 -= 2; + a3 -= 2; + a5 -= 2; + a7 -= 2; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + A3 = *a3; + B3 = *b3; + A5 = *a5; + B5 = *b5; + A7 = *a7; + B7 = *b7; + + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + } + + a += 4 * lda; + } + + if (n & 2) { + piv = ipiv; + +#ifndef MINUS + a1 = a + k1 + 1; +#else + a1 = a + k2; +#endif + + a3 = a1 + 1 * lda; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + } + } + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + +#ifndef MINUS + a1 += 2; + a3 += 2; +#else + a1 -= 2; + a3 -= 2; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + A3 = *a3; + B3 = *b3; + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + } + + a += 2 * lda; + } + + if (n & 1) { + piv = ipiv; + +#ifndef MINUS + a1 = a + k1 + 1; +#else + a1 = a + k2; +#endif + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + } + } + + b1 = a + ip1; + b2 = a + ip2; + +#ifndef MINUS + a1 += 2; +#else + a1 -= 2; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + *a1 = B1; + *b1 = A1; + } + } + + return 0; +} + diff --git a/lapack/laswp/generic/zlaswp_k.c b/lapack/laswp/generic/zlaswp_k.c new file mode 100644 index 0000000..c793837 --- /dev/null +++ b/lapack/laswp/generic/zlaswp_k.c @@ -0,0 +1,47 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +#if GEMM_UNROLL_N >= 4 +#include "zlaswp_k_4.c" +#elif GEMM_UNROLL_N >= 2 +#include "zlaswp_k_2.c" +#else +#include "zlaswp_k_1.c" +#endif diff --git a/lapack/laswp/generic/zlaswp_k_1.c b/lapack/laswp/generic/zlaswp_k_1.c new file mode 100644 index 0000000..3dd653b --- /dev/null +++ b/lapack/laswp/generic/zlaswp_k_1.c @@ -0,0 +1,225 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef MINUS +#define a2 (a1 + 2) +#else +#define a2 (a1 - 2) +#endif + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, + FLOAT *a, BLASLONG lda, + FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1; + FLOAT *b1, *b2; + FLOAT A1, A2, B1, B2, A3, A4, B3, B4; + + a -= 2; + lda *= 2; + k1 --; + +#ifndef MINUS + ipiv += k1; +#else + ipiv -= (k2 - 1) * incx; +#endif + + if (n <= 0) return 0; + + j = n; + if (j > 0) { + do { + piv = ipiv; + +#ifndef MINUS + a1 = a + (k1 + 1) * 2; +#else + a1 = a + k2 * 2; +#endif + + ip1 = *piv * 2; + piv += incx; + ip2 = *piv * 2; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { +#ifdef OPTERON +#ifndef MINUS + asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(a1)); + asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(b1)); +#else + asm volatile("prefetchw -2 * 128(%0)\n" : : "r"(a1)); + asm volatile("prefetchw -2 * 128(%0)\n" : : "r"(b1)); +#endif +#endif + +#ifdef CORE2 +#ifndef MINUS + asm volatile("prefetcht1 2 * 128(%0)\n" : : "r"(a1)); + asm volatile("prefetcht1 2 * 128(%0)\n" : : "r"(b1)); + asm volatile("prefetcht1 2 * 128(%0)\n" : : "r"(b2)); +#else + asm volatile("prefetcht1 -2 * 128(%0)\n" : : "r"(a1)); + asm volatile("prefetcht1 -2 * 128(%0)\n" : : "r"(b1)); + asm volatile("prefetcht1 -2 * 128(%0)\n" : : "r"(b2)); +#endif +#endif + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + ip1 = *piv * 2; + piv += incx; + ip2 = *piv * 2; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + + b1 = a + ip1; + b2 = a + ip2; + +#ifndef MINUS + a1 += 4; +#else + a1 -= 4; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } + + a += lda; + + j --; + } while (j > 0); + } + + return 0; +} + diff --git a/lapack/laswp/generic/zlaswp_k_2.c b/lapack/laswp/generic/zlaswp_k_2.c new file mode 100644 index 0000000..a877ef6 --- /dev/null +++ b/lapack/laswp/generic/zlaswp_k_2.c @@ -0,0 +1,406 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef MINUS +#define a2 (a1 + 2) +#else +#define a2 (a1 - 2) +#endif + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, + FLOAT *a, BLASLONG lda, + FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1; + FLOAT *b1, *b2; + FLOAT A1, A2, B1, B2, A3, A4, B3, B4; + FLOAT A5, A6, B5, B6, A7, A8, B7, B8; + + a -= 2; + lda *= 2; + k1 --; + +#ifndef MINUS + ipiv += k1; +#else + ipiv -= (k2 - 1) * incx; +#endif + + if (n <= 0) return 0; + + + j = (n >> 1); + if (j > 0) { + do { + piv = ipiv; + +#ifndef MINUS + a1 = a + (k1 + 1) * 2; +#else + a1 = a + k2 * 2; +#endif + + ip1 = *piv * 2; + piv += incx; + ip2 = *piv * 2; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { +#ifdef CORE2 +#ifndef MINUS + asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1)); + asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1 + lda)); + asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(a1)); + asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(a1 + lda)); +#else + asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(b1)); + asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(b1 + lda)); + asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(a1)); + asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(a1 + lda)); +#endif +#endif + + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + + A5 = *(a1 + 0 + lda); + A6 = *(a1 + 1 + lda); + A7 = *(a2 + 0 + lda); + A8 = *(a2 + 1 + lda); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + B5 = *(b1 + 0 + lda); + B6 = *(b1 + 1 + lda); + B7 = *(b2 + 0 + lda); + B8 = *(b2 + 1 + lda); + + ip1 = *piv * 2; + piv += incx; + ip2 = *piv * 2; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a1 + 0 + lda) = A7; + *(a1 + 1 + lda) = A8; + *(a2 + 0 + lda) = A5; + *(a2 + 1 + lda) = A6; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a2 + 0 + lda) = B7; + *(a2 + 1 + lda) = B8; + *(b2 + 0 + lda) = A7; + *(b2 + 1 + lda) = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a1 + 0 + lda) = A7; + *(a1 + 1 + lda) = A8; + *(a2 + 0 + lda) = A5; + *(a2 + 1 + lda) = A6; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(a1 + 0 + lda) = A7; + *(a1 + 1 + lda) = A8; + *(a2 + 0 + lda) = B7; + *(a2 + 1 + lda) = B8; + *(b2 + 0 + lda) = A5; + *(b2 + 1 + lda) = A6; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a1 + 0 + lda) = A7; + *(a1 + 1 + lda) = A8; + *(a2 + 0 + lda) = B5; + *(a2 + 1 + lda) = B6; + *(b1 + 0 + lda) = A5; + *(b1 + 1 + lda) = A6; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a1 + 0 + lda) = B5; + *(a1 + 1 + lda) = B6; + *(b1 + 0 + lda) = A5; + *(b1 + 1 + lda) = A6; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(a1 + 0 + lda) = B5; + *(a1 + 1 + lda) = B6; + *(a2 + 0 + lda) = A5; + *(a2 + 1 + lda) = A6; + *(b1 + 0 + lda) = A7; + *(b1 + 1 + lda) = A8; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a1 + 0 + lda) = B5; + *(a1 + 1 + lda) = B6; + *(a2 + 0 + lda) = B7; + *(a2 + 1 + lda) = B8; + *(b1 + 0 + lda) = A5; + *(b1 + 1 + lda) = A6; + *(b2 + 0 + lda) = A7; + *(b2 + 1 + lda) = A8; + } + } + + b1 = a + ip1; + b2 = a + ip2; + +#ifndef MINUS + a1 += 4; +#else + a1 -= 4; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a1 + 0 + lda); + A4 = *(a1 + 1 + lda); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b1 + 0 + lda); + B4 = *(b1 + 1 + lda); + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a1 + 0 + lda) = B3; + *(a1 + 1 + lda) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b1 + 0 + lda) = A3; + *(b1 + 1 + lda) = A4; + } + + a += 2 * lda; + + j --; + } while (j > 0); + } + + if (n & 1) { + piv = ipiv; + +#ifndef MINUS + a1 = a + (k1 + 1) * 2; +#else + a1 = a + k2 * 2; +#endif + + ip1 = *piv * 2; + piv += incx; + ip2 = *piv * 2; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + ip1 = *piv * 2; + piv += incx; + ip2 = *piv * 2; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + + b1 = a + ip1; + b2 = a + ip2; + +#ifndef MINUS + a1 += 4; +#else + a1 -= 4; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } + } + + return 0; +} + diff --git a/lapack/laswp/generic/zlaswp_k_4.c b/lapack/laswp/generic/zlaswp_k_4.c new file mode 100644 index 0000000..4dc5598 --- /dev/null +++ b/lapack/laswp/generic/zlaswp_k_4.c @@ -0,0 +1,742 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef MINUS +#define a2 (a1 + 2) +#define a4 (a3 + 2) +#define a6 (a5 + 2) +#define a8 (a7 + 2) +#else +#define a2 (a1 - 2) +#define a4 (a3 - 2) +#define a6 (a5 - 2) +#define a8 (a7 - 2) +#endif + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, + FLOAT *a, BLASLONG lda, + FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1, *a3, *a5, *a7; + FLOAT *b1, *b2, *b3, *b4; + FLOAT *b5, *b6, *b7, *b8; + FLOAT A1, A2, B1, B2, A3, A4, B3, B4; + FLOAT A5, A6, B5, B6, A7, A8, B7, B8; + FLOAT A9, A10, B9, B10, A11, A12, B11, B12; + FLOAT A13, A14, B13, B14, A15, A16, B15, B16; + + a -= 2; + lda *= 2; + k1 --; + +#ifndef MINUS + ipiv += k1; +#else + ipiv -= (k2 - 1) * incx; +#endif + + if (n <= 0) return 0; + + j = (n >> 2); + if (j > 0) { + do { + piv = ipiv; + +#ifndef MINUS + a1 = a + (k1 + 1) * 2; +#else + a1 = a + k2 * 2; +#endif + + a3 = a1 + 1 * lda; + a5 = a1 + 2 * lda; + a7 = a1 + 3 * lda; + + ip1 = *piv * 2; + piv += incx; + ip2 = *piv * 2; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + A5 = *(a3 + 0); + A6 = *(a3 + 1); + A7 = *(a4 + 0); + A8 = *(a4 + 1); + + A9 = *(a5 + 0); + A10 = *(a5 + 1); + A11 = *(a6 + 0); + A12 = *(a6 + 1); + A13 = *(a7 + 0); + A14 = *(a7 + 1); + A15 = *(a8 + 0); + A16 = *(a8 + 1); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + B5 = *(b3 + 0); + B6 = *(b3 + 1); + B7 = *(b4 + 0); + B8 = *(b4 + 1); + + B9 = *(b5 + 0); + B10 = *(b5 + 1); + B11 = *(b6 + 0); + B12 = *(b6 + 1); + B13 = *(b7 + 0); + B14 = *(b7 + 1); + B15 = *(b8 + 0); + B16 = *(b8 + 1); + + ip1 = *piv * 2; + piv += incx; + ip2 = *piv * 2; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + *(a5 + 0) = A11; + *(a5 + 1) = A12; + *(a6 + 0) = A9; + *(a6 + 1) = A10; + *(a7 + 0) = A15; + *(a7 + 1) = A16; + *(a8 + 0) = A13; + *(a8 + 1) = A14; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + *(a6 + 0) = B11; + *(a6 + 1) = B12; + *(b6 + 0) = A11; + *(b6 + 1) = A12; + *(a8 + 0) = B15; + *(a8 + 1) = B16; + *(b8 + 0) = A15; + *(b8 + 1) = A16; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + *(a5 + 0) = A11; + *(a5 + 1) = A12; + *(a6 + 0) = A9; + *(a6 + 1) = A10; + *(a7 + 0) = A15; + *(a7 + 1) = A16; + *(a8 + 0) = A13; + *(a8 + 1) = A14; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A5; + *(b4 + 1) = A6; + + *(a5 + 0) = A11; + *(a5 + 1) = A12; + *(a6 + 0) = B11; + *(a6 + 1) = B12; + *(b6 + 0) = A9; + *(b6 + 1) = A10; + *(a7 + 0) = A15; + *(a7 + 1) = A16; + *(a8 + 0) = B15; + *(a8 + 1) = B16; + *(b8 + 0) = A13; + *(b8 + 1) = A14; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B5; + *(a4 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(a5 + 0) = A11; + *(a5 + 1) = A12; + *(a6 + 0) = B9; + *(a6 + 1) = B10; + *(b5 + 0) = A9; + *(b5 + 1) = A10; + *(a7 + 0) = A15; + *(a7 + 1) = A16; + *(a8 + 0) = B13; + *(a8 + 1) = B14; + *(b7 + 0) = A13; + *(b7 + 1) = A14; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(a5 + 0) = B9; + *(a5 + 1) = B10; + *(b5 + 0) = A9; + *(b5 + 1) = A10; + *(a7 + 0) = B13; + *(a7 + 1) = B14; + *(b7 + 0) = A13; + *(b7 + 1) = A14; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + *(b3 + 0) = A7; + *(b3 + 1) = A8; + + *(a5 + 0) = B9; + *(a5 + 1) = B10; + *(a6 + 0) = A9; + *(a6 + 1) = A10; + *(b5 + 0) = A11; + *(b5 + 1) = A12; + *(a7 + 0) = B13; + *(a7 + 1) = B14; + *(a8 + 0) = A13; + *(a8 + 1) = A14; + *(b7 + 0) = A15; + *(b7 + 1) = A16; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + *(a5 + 0) = B9; + *(a5 + 1) = B10; + *(a6 + 0) = B11; + *(a6 + 1) = B12; + *(b5 + 0) = A9; + *(b5 + 1) = A10; + *(b6 + 0) = A11; + *(b6 + 1) = A12; + *(a7 + 0) = B13; + *(a7 + 1) = B14; + *(a8 + 0) = B15; + *(a8 + 1) = B16; + *(b7 + 0) = A13; + *(b7 + 1) = A14; + *(b8 + 0) = A15; + *(b8 + 1) = A16; + } + } + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + +#ifndef MINUS + a1 += 4; + a3 += 4; + a5 += 4; + a7 += 4; +#else + a1 -= 4; + a3 -= 4; + a5 -= 4; + a7 -= 4; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a3 + 0); + A4 = *(a3 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b3 + 0); + B4 = *(b3 + 1); + A5 = *(a5 + 0); + A6 = *(a5 + 1); + A7 = *(a7 + 0); + A8 = *(a7 + 1); + B5 = *(b5 + 0); + B6 = *(b5 + 1); + B7 = *(b7 + 0); + B8 = *(b7 + 1); + + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a3 + 0) = B3; + *(a3 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b3 + 0) = A3; + *(b3 + 1) = A4; + *(a5 + 0) = B5; + *(a5 + 1) = B6; + *(a7 + 0) = B7; + *(a7 + 1) = B8; + *(b5 + 0) = A5; + *(b5 + 1) = A6; + *(b7 + 0) = A7; + *(b7 + 1) = A8; + } + + a += 4 * lda; + + j --; + } while (j > 0); + } + + if (n & 2) { + piv = ipiv; + +#ifndef MINUS + a1 = a + (k1 + 1) * 2; +#else + a1 = a + k2 * 2; +#endif + + a3 = a1 + lda; + + ip1 = *piv * 2; + piv += incx; + ip2 = *piv * 2; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + lda; + b4 = b2 + lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + + A5 = *(a3 + 0); + A6 = *(a3 + 1); + A7 = *(a4 + 0); + A8 = *(a4 + 1); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + B5 = *(b3 + 0); + B6 = *(b3 + 1); + B7 = *(b4 + 0); + B8 = *(b4 + 1); + + ip1 = *piv * 2; + piv += incx; + ip2 = *piv * 2; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A5; + *(b4 + 1) = A6; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B5; + *(a4 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + *(b3 + 0) = A7; + *(b3 + 1) = A8; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + lda; + b4 = b2 + lda; + +#ifndef MINUS + a1 += 4; + a3 += 4; +#else + a1 -= 4; + a3 -= 4; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a3 + 0); + A4 = *(a3 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b3 + 0); + B4 = *(b3 + 1); + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a3 + 0) = B3; + *(a3 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b3 + 0) = A3; + *(b3 + 1) = A4; + } + + a += 2 * lda; + + } + + if (n & 1) { + piv = ipiv; + +#ifndef MINUS + a1 = a + (k1 + 1) * 2; +#else + a1 = a + k2 * 2; +#endif + + ip1 = *piv * 2; + piv += incx; + ip2 = *piv * 2; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + ip1 = *piv * 2; + piv += incx; + ip2 = *piv * 2; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + + b1 = a + ip1; + b2 = a + ip2; + +#ifndef MINUS + a1 += 4; +#else + a1 -= 4; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } + } + + return 0; +} + diff --git a/lapack/laswp/ia64/._Makefile b/lapack/laswp/ia64/._Makefile new file mode 100644 index 0000000..36450d3 Binary files /dev/null and b/lapack/laswp/ia64/._Makefile differ diff --git a/lapack/laswp/ia64/Makefile b/lapack/laswp/ia64/Makefile new file mode 100644 index 0000000..42245c6 --- /dev/null +++ b/lapack/laswp/ia64/Makefile @@ -0,0 +1,5 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +include ../generic/Makefile + diff --git a/lapack/laswp/mips64/._Makefile b/lapack/laswp/mips64/._Makefile new file mode 100644 index 0000000..73e3b61 Binary files /dev/null and b/lapack/laswp/mips64/._Makefile differ diff --git a/lapack/laswp/mips64/Makefile b/lapack/laswp/mips64/Makefile new file mode 100644 index 0000000..af1f019 --- /dev/null +++ b/lapack/laswp/mips64/Makefile @@ -0,0 +1,8 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c + +include ../generic/Makefile + diff --git a/lapack/laswp/power/._Makefile b/lapack/laswp/power/._Makefile new file mode 100644 index 0000000..b5247d6 Binary files /dev/null and b/lapack/laswp/power/._Makefile differ diff --git a/lapack/laswp/power/Makefile b/lapack/laswp/power/Makefile new file mode 100644 index 0000000..af1f019 --- /dev/null +++ b/lapack/laswp/power/Makefile @@ -0,0 +1,8 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c + +include ../generic/Makefile + diff --git a/lapack/laswp/sparc/._Makefile b/lapack/laswp/sparc/._Makefile new file mode 100644 index 0000000..4c47d22 Binary files /dev/null and b/lapack/laswp/sparc/._Makefile differ diff --git a/lapack/laswp/sparc/Makefile b/lapack/laswp/sparc/Makefile new file mode 100644 index 0000000..af1f019 --- /dev/null +++ b/lapack/laswp/sparc/Makefile @@ -0,0 +1,8 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c + +include ../generic/Makefile + diff --git a/lapack/laswp/x86/._Makefile b/lapack/laswp/x86/._Makefile new file mode 100644 index 0000000..96712e9 Binary files /dev/null and b/lapack/laswp/x86/._Makefile differ diff --git a/lapack/laswp/x86/Makefile b/lapack/laswp/x86/Makefile new file mode 100644 index 0000000..105ec40 --- /dev/null +++ b/lapack/laswp/x86/Makefile @@ -0,0 +1,28 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +ifeq ($(CORE), CORE2) +LASWP = ../generic/laswp_k_2.c +ZLASWP = ../generic/zlaswp_k_2.c +endif + +ifeq ($(CORE), OPTERON) +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c +endif + +ifeq ($(CORE), PRESCOTT) +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c +endif + +ifndef LASWP +LASWP = ../generic/laswp_k_1.c +endif + +ifndef ZLASWP +ZLASWP = ../generic/zlaswp_k_1.c +endif + +include ../generic/Makefile + diff --git a/lapack/laswp/x86_64/._Makefile b/lapack/laswp/x86_64/._Makefile new file mode 100644 index 0000000..7793bdd Binary files /dev/null and b/lapack/laswp/x86_64/._Makefile differ diff --git a/lapack/laswp/x86_64/Makefile b/lapack/laswp/x86_64/Makefile new file mode 100644 index 0000000..ba07dcf --- /dev/null +++ b/lapack/laswp/x86_64/Makefile @@ -0,0 +1,33 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +ifeq ($(CORE), PENRYN) +LASWP = ../generic/laswp_k_4.c +ZLASWP = ../generic/zlaswp_k_4.c +endif + +ifeq ($(CORE), CORE2) +LASWP = ../generic/laswp_k_4.c +ZLASWP = ../generic/zlaswp_k_4.c +endif + +ifeq ($(CORE), OPTERON) +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c +endif + +ifeq ($(CORE), PRESCOTT) +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c +endif + +ifndef LASWP +LASWP = ../generic/laswp_k_1.c +endif + +ifndef ZLASWP +ZLASWP = ../generic/zlaswp_k_1.c +endif + +include ../generic/Makefile + diff --git a/lapack/lauu2/._Makefile b/lapack/lauu2/._Makefile new file mode 100644 index 0000000..15dbcfc Binary files /dev/null and b/lapack/lauu2/._Makefile differ diff --git a/lapack/lauu2/._lauu2_L.c b/lapack/lauu2/._lauu2_L.c new file mode 100644 index 0000000..79abb4f Binary files /dev/null and b/lapack/lauu2/._lauu2_L.c differ diff --git a/lapack/lauu2/._lauu2_U.c b/lapack/lauu2/._lauu2_U.c new file mode 100644 index 0000000..d41030d Binary files /dev/null and b/lapack/lauu2/._lauu2_U.c differ diff --git a/lapack/lauu2/._zlauu2_L.c b/lapack/lauu2/._zlauu2_L.c new file mode 100644 index 0000000..3ceea19 Binary files /dev/null and b/lapack/lauu2/._zlauu2_L.c differ diff --git a/lapack/lauu2/._zlauu2_U.c b/lapack/lauu2/._zlauu2_U.c new file mode 100644 index 0000000..e54bd4b Binary files /dev/null and b/lapack/lauu2/._zlauu2_U.c differ diff --git a/lapack/lauu2/Makefile b/lapack/lauu2/Makefile new file mode 100644 index 0000000..dc6a640 --- /dev/null +++ b/lapack/lauu2/Makefile @@ -0,0 +1,83 @@ +TOPDIR = ../.. +include ../../Makefile.system + +SBLASOBJS = slauu2_U.$(SUFFIX) slauu2_L.$(SUFFIX) +DBLASOBJS = dlauu2_U.$(SUFFIX) dlauu2_L.$(SUFFIX) +QBLASOBJS = qlauu2_U.$(SUFFIX) qlauu2_L.$(SUFFIX) +CBLASOBJS = clauu2_U.$(SUFFIX) clauu2_L.$(SUFFIX) +ZBLASOBJS = zlauu2_U.$(SUFFIX) zlauu2_L.$(SUFFIX) +XBLASOBJS = xlauu2_U.$(SUFFIX) xlauu2_L.$(SUFFIX) + +slauu2_U.$(SUFFIX) : lauu2_U.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +slauu2_L.$(SUFFIX) : lauu2_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +dlauu2_U.$(SUFFIX) : lauu2_U.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dlauu2_L.$(SUFFIX) : lauu2_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +qlauu2_U.$(SUFFIX) : lauu2_U.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qlauu2_L.$(SUFFIX) : lauu2_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +clauu2_U.$(SUFFIX) : zlauu2_U.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +clauu2_L.$(SUFFIX) : zlauu2_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +zlauu2_U.$(SUFFIX) : zlauu2_U.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zlauu2_L.$(SUFFIX) : zlauu2_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +xlauu2_U.$(SUFFIX) : zlauu2_U.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xlauu2_L.$(SUFFIX) : zlauu2_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +slauu2_U.$(PSUFFIX) : lauu2_U.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +slauu2_L.$(PSUFFIX) : lauu2_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +dlauu2_U.$(PSUFFIX) : lauu2_U.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dlauu2_L.$(PSUFFIX) : lauu2_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +qlauu2_U.$(PSUFFIX) : lauu2_U.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qlauu2_L.$(PSUFFIX) : lauu2_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +clauu2_U.$(PSUFFIX) : zlauu2_U.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +clauu2_L.$(PSUFFIX) : zlauu2_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +zlauu2_U.$(PSUFFIX) : zlauu2_U.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zlauu2_L.$(PSUFFIX) : zlauu2_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +xlauu2_U.$(PSUFFIX) : zlauu2_U.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xlauu2_L.$(PSUFFIX) : zlauu2_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +include ../../Makefile.tail diff --git a/lapack/lauu2/lauu2_L.c b/lapack/lauu2/lauu2_L.c new file mode 100644 index 0000000..aedb966 --- /dev/null +++ b/lapack/lauu2/lauu2_L.c @@ -0,0 +1,78 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +static FLOAT dp1 = 1.; + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + FLOAT aii; + BLASLONG i; + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + for (i = 0; i < n; i++) { + + SCAL_K(i + 1, 0, 0, *(a + i + i * lda), a + i, lda, NULL, 0, NULL, 0); + + if (i < n - 1) { + aii = DOTU_K(n - i - 1, a + i + 1 + i * lda, 1, a + i + 1 + i * lda, 1); + + *(a + i + i * lda) += aii; + + GEMV_T(n - i - 1, i, 0, dp1, + a + (i + 1) , lda, + a + (i + 1) + i * lda, 1, + a + i , lda, sb); + } + } + + return 0; +} diff --git a/lapack/lauu2/lauu2_U.c b/lapack/lauu2/lauu2_U.c new file mode 100644 index 0000000..f9a7186 --- /dev/null +++ b/lapack/lauu2/lauu2_U.c @@ -0,0 +1,78 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +static FLOAT dp1 = 1.; + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + FLOAT aii; + BLASLONG i; + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + for (i = 0; i < n; i++) { + + SCAL_K(i + 1, 0, 0, *(a + i + i * lda), a + i * lda, 1, NULL, 0, NULL, 0); + + if (i < n - 1) { + aii = DOTU_K(n - i - 1, a + i + (i + 1)* lda, lda, a + i + (i + 1) * lda, lda); + + *(a + i + i * lda) += aii; + + GEMV_N(i, n - i - 1, 0, dp1, + a + (i + 1) * lda, lda, + a + i + (i + 1) * lda, lda, + a + i * lda, 1, sb); + } + } + + return 0; +} diff --git a/lapack/lauu2/zlauu2_L.c b/lapack/lauu2/zlauu2_L.c new file mode 100644 index 0000000..8a892d9 --- /dev/null +++ b/lapack/lauu2/zlauu2_L.c @@ -0,0 +1,83 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +static FLOAT dp1 = 1.; + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + FLOAT temp[2]; + BLASLONG i; + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + for (i = 0; i < n; i++) { + + SCAL_K(i + 1, 0, 0, *(a + (i + i * lda) * COMPSIZE + 0), ZERO, + a + i * COMPSIZE, lda, NULL, 0, NULL, 0); + + if (i < n - 1) { + temp[0] = DOTC_K(n - i - 1, + a + (i + 1 + i * lda) * COMPSIZE, 1, + a + (i + 1 + i * lda) * COMPSIZE, 1); + GET_IMAGE(temp[1]); + + *(a + (i + i * lda) * COMPSIZE + 0) += temp[0]; + *(a + (i + i * lda) * COMPSIZE + 1) = ZERO; + + GEMV_U(n - i - 1, i, 0, dp1, ZERO, + a + ((i + 1) ) * COMPSIZE, lda, + a + ((i + 1) + i * lda) * COMPSIZE, 1, + a + ( i ) * COMPSIZE , lda, sb); + } + } + + return 0; +} diff --git a/lapack/lauu2/zlauu2_U.c b/lapack/lauu2/zlauu2_U.c new file mode 100644 index 0000000..b20ea99 --- /dev/null +++ b/lapack/lauu2/zlauu2_U.c @@ -0,0 +1,81 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +static FLOAT dp1 = 1.; + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + FLOAT temp[2]; + BLASLONG i; + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + for (i = 0; i < n; i++) { + + SCAL_K(i + 1, 0, 0, + *(a + (i + i * lda) * COMPSIZE + 0), ZERO, + a + i * lda * COMPSIZE, 1, NULL, 0, NULL, 0); + + if (i < n - 1) { + temp[0] = DOTC_K(n - i - 1, a + (i + (i + 1) * lda) * COMPSIZE, lda, a + (i + (i + 1) * lda) * COMPSIZE, lda); + GET_IMAGE(temp[1]); + + *(a + (i + i * lda) * COMPSIZE + 0) += temp[0]; + *(a + (i + i * lda) * COMPSIZE + 1) = ZERO; + + GEMV_O(i, n - i - 1, 0, dp1, ZERO, + a + ( (i + 1) * lda) * COMPSIZE, lda, + a + (i + (i + 1) * lda) * COMPSIZE, lda, + a + ( i * lda) * COMPSIZE, 1, sb); + } + } + + return 0; +} diff --git a/lapack/lauum/._Makefile b/lapack/lauum/._Makefile new file mode 100644 index 0000000..9a103ed Binary files /dev/null and b/lapack/lauum/._Makefile differ diff --git a/lapack/lauum/._lauum_L_parallel.c b/lapack/lauum/._lauum_L_parallel.c new file mode 100644 index 0000000..c0bdcfa Binary files /dev/null and b/lapack/lauum/._lauum_L_parallel.c differ diff --git a/lapack/lauum/._lauum_L_single.c b/lapack/lauum/._lauum_L_single.c new file mode 100644 index 0000000..0482a13 Binary files /dev/null and b/lapack/lauum/._lauum_L_single.c differ diff --git a/lapack/lauum/._lauum_U_parallel.c b/lapack/lauum/._lauum_U_parallel.c new file mode 100644 index 0000000..571c5f0 Binary files /dev/null and b/lapack/lauum/._lauum_U_parallel.c differ diff --git a/lapack/lauum/._lauum_U_single.c b/lapack/lauum/._lauum_U_single.c new file mode 100644 index 0000000..ccc18ab Binary files /dev/null and b/lapack/lauum/._lauum_U_single.c differ diff --git a/lapack/lauum/Makefile b/lapack/lauum/Makefile new file mode 100644 index 0000000..f163479 --- /dev/null +++ b/lapack/lauum/Makefile @@ -0,0 +1,164 @@ +TOPDIR = ../.. +include ../../Makefile.system + +SBLASOBJS = slauum_U_single.$(SUFFIX) slauum_L_single.$(SUFFIX) +DBLASOBJS = dlauum_U_single.$(SUFFIX) dlauum_L_single.$(SUFFIX) +QBLASOBJS = qlauum_U_single.$(SUFFIX) qlauum_L_single.$(SUFFIX) +CBLASOBJS = clauum_U_single.$(SUFFIX) clauum_L_single.$(SUFFIX) +ZBLASOBJS = zlauum_U_single.$(SUFFIX) zlauum_L_single.$(SUFFIX) +XBLASOBJS = xlauum_U_single.$(SUFFIX) xlauum_L_single.$(SUFFIX) + +ifdef SMP +SBLASOBJS += slauum_U_parallel.$(SUFFIX) slauum_L_parallel.$(SUFFIX) +DBLASOBJS += dlauum_U_parallel.$(SUFFIX) dlauum_L_parallel.$(SUFFIX) +QBLASOBJS += qlauum_U_parallel.$(SUFFIX) qlauum_L_parallel.$(SUFFIX) +CBLASOBJS += clauum_U_parallel.$(SUFFIX) clauum_L_parallel.$(SUFFIX) +ZBLASOBJS += zlauum_U_parallel.$(SUFFIX) zlauum_L_parallel.$(SUFFIX) +XBLASOBJS += xlauum_U_parallel.$(SUFFIX) xlauum_L_parallel.$(SUFFIX) +endif + +slauum_U_single.$(SUFFIX) : lauum_U_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +slauum_L_single.$(SUFFIX) : lauum_L_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +slauum_U_parallel.$(SUFFIX) : lauum_U_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +slauum_L_parallel.$(SUFFIX) : lauum_L_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +dlauum_U_single.$(SUFFIX) : lauum_U_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dlauum_L_single.$(SUFFIX) : lauum_L_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dlauum_U_parallel.$(SUFFIX) : lauum_U_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dlauum_L_parallel.$(SUFFIX) : lauum_L_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +qlauum_U_single.$(SUFFIX) : lauum_U_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qlauum_L_single.$(SUFFIX) : lauum_L_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qlauum_U_parallel.$(SUFFIX) : lauum_U_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qlauum_L_parallel.$(SUFFIX) : lauum_L_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +clauum_U_single.$(SUFFIX) : lauum_U_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +clauum_L_single.$(SUFFIX) : lauum_L_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +clauum_U_parallel.$(SUFFIX) : lauum_U_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +clauum_L_parallel.$(SUFFIX) : lauum_L_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +zlauum_U_single.$(SUFFIX) : lauum_U_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zlauum_L_single.$(SUFFIX) : lauum_L_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zlauum_U_parallel.$(SUFFIX) : lauum_U_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zlauum_L_parallel.$(SUFFIX) : lauum_L_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +xlauum_U_single.$(SUFFIX) : lauum_U_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xlauum_L_single.$(SUFFIX) : lauum_L_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xlauum_U_parallel.$(SUFFIX) : lauum_U_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xlauum_L_parallel.$(SUFFIX) : lauum_L_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +slauum_U_single.$(PSUFFIX) : lauum_U_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +slauum_L_single.$(PSUFFIX) : lauum_L_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +slauum_U_parallel.$(PSUFFIX) : lauum_U_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +slauum_L_parallel.$(PSUFFIX) : lauum_L_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +dlauum_U_single.$(PSUFFIX) : lauum_U_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dlauum_L_single.$(PSUFFIX) : lauum_L_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dlauum_U_parallel.$(PSUFFIX) : lauum_U_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dlauum_L_parallel.$(PSUFFIX) : lauum_L_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +qlauum_U_single.$(PSUFFIX) : lauum_U_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qlauum_L_single.$(PSUFFIX) : lauum_L_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qlauum_U_parallel.$(PSUFFIX) : lauum_U_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qlauum_L_parallel.$(PSUFFIX) : lauum_L_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +clauum_U_single.$(PSUFFIX) : lauum_U_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +clauum_L_single.$(PSUFFIX) : lauum_L_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +clauum_U_parallel.$(PSUFFIX) : lauum_U_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +clauum_L_parallel.$(PSUFFIX) : lauum_L_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +zlauum_U_single.$(PSUFFIX) : lauum_U_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zlauum_L_single.$(PSUFFIX) : lauum_L_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zlauum_U_parallel.$(PSUFFIX) : lauum_U_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zlauum_L_parallel.$(PSUFFIX) : lauum_L_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +xlauum_U_single.$(PSUFFIX) : lauum_U_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xlauum_L_single.$(PSUFFIX) : lauum_L_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xlauum_U_parallel.$(PSUFFIX) : lauum_U_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xlauum_L_parallel.$(PSUFFIX) : lauum_L_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +include ../../Makefile.tail diff --git a/lapack/lauum/lauum_L_parallel.c b/lapack/lauum/lauum_L_parallel.c new file mode 100644 index 0000000..8d9cde9 --- /dev/null +++ b/lapack/lauum/lauum_L_parallel.c @@ -0,0 +1,123 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, bk, i, blocking, lda; + int mode; + blas_arg_t newarg; + FLOAT *a; + FLOAT alpha[2] = { ONE, ZERO}; + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif + + if (args -> nthreads == 1) { + LAUUM_L_SINGLE(args, NULL, NULL, sa, sb, 0); + return 0; + } + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) n = range_n[1] - range_n[0]; + + if (n <= GEMM_UNROLL_N * 2) { + LAUUM_L_SINGLE(args, NULL, range_n, sa, sb, 0); + return 0; + } + + newarg.lda = lda; + newarg.ldb = lda; + newarg.ldc = lda; + newarg.alpha = alpha; + newarg.beta = NULL; + newarg.nthreads = args -> nthreads; + + blocking = (n / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + if (blocking > GEMM_Q) blocking = GEMM_Q; + + for (i = 0; i < n; i += blocking) { + + bk = n - i; + if (bk > blocking) bk = blocking; + + newarg.n = i; + newarg.k = bk; + newarg.a = a + i * COMPSIZE; + newarg.c = a; + + syrk_thread(mode | BLAS_TRANSA_T | BLAS_TRANSB_N | BLAS_UPLO, + &newarg, NULL, NULL, (void *)HERK_LC, sa, sb, args -> nthreads); + + newarg.m = bk; + newarg.n = i; + newarg.a = a + (i + i * lda) * COMPSIZE; + newarg.b = a + (i ) * COMPSIZE; + + gemm_thread_n(mode | BLAS_TRANSA_T, + &newarg, NULL, NULL, (void *)TRMM_LCLN, sa, sb, args -> nthreads); + + newarg.m = bk; + newarg.n = bk; + newarg.a = a + (i + i * lda) * COMPSIZE; + + CNAME(&newarg, NULL, NULL, sa, sb, 0); + } + + return 0; +} diff --git a/lapack/lauum/lauum_L_single.c b/lapack/lauum/lauum_L_single.c new file mode 100644 index 0000000..65e8f04 --- /dev/null +++ b/lapack/lauum/lauum_L_single.c @@ -0,0 +1,234 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +static FLOAT dp1 = 1.; + +#ifndef COMPLEX +#define TRMM_KERNEL TRMM_KERNEL_LN +#define SYRK_KERNEL SYRK_KERNEL_L +#else +#define TRMM_KERNEL TRMM_KERNEL_LR +#ifdef XDOUBLE +#define SYRK_KERNEL xherk_kernel_LC +#elif defined(DOUBLE) +#define SYRK_KERNEL zherk_kernel_LC +#else +#define SYRK_KERNEL cherk_kernel_LC +#endif +#endif + +#if 0 +#undef GEMM_P +#undef GEMM_Q +#undef GEMM_R + +#define GEMM_P 8 +#define GEMM_Q 20 +#define GEMM_R 64 +#endif + +#define GEMM_PQ MAX(GEMM_P, GEMM_Q) +#define REAL_GEMM_R (GEMM_R - GEMM_PQ) + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + BLASLONG j, bk, blocking; + BLASLONG jjs, min_jj; + + BLASLONG is, ls, ks; + BLASLONG min_i, min_l, min_k; + BLASLONG range_N[2]; + + FLOAT *sb2 = (FLOAT *)((((BLASLONG)sb + + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) + + GEMM_OFFSET_B); + +#if 0 + FLOAT *aa; +#endif + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + if (n <= DTB_ENTRIES) { + LAUU2_L(args, NULL, range_n, sa, sb, 0); + return 0; + } + + blocking = GEMM_Q; + if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4; + + for (j = 0; j < n; j += blocking) { + bk = MIN(blocking, n - j); + + if (j > 0 ){ + + TRMM_ILNCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, 0, sb); + + for (ls = 0; ls < j; ls += REAL_GEMM_R) { + min_l = j - ls; + if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R; + +#if 0 + + min_i = j - ls; + if (min_i > GEMM_P) min_i = GEMM_P; + + if (ls + min_i >= ls + min_l) { + GEMM_INCOPY(bk, min_i, a + (j + ls * lda)* COMPSIZE, lda, sa); + aa = sa; + } else { + aa = sb2; + } + + for (jjs = ls; jjs < ls + min_l; jjs += GEMM_P){ + min_jj = ls + min_l - jjs; + if (min_jj > GEMM_P) min_jj = GEMM_P; + + GEMM_ONCOPY(bk, min_jj, a + (j + jjs * lda) * COMPSIZE, lda, sb2 + (jjs - ls) * bk * COMPSIZE); + + SYRK_KERNEL(min_i, min_jj, bk, dp1, + aa, + sb2 + (jjs - ls) * bk * COMPSIZE, + a + (ls + jjs * lda) * COMPSIZE, lda, + ls - jjs); + } + + + for(is = ls + min_i; is < j ; is += GEMM_P){ + min_i = j - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_INCOPY(bk, min_i, a + (j + is * lda)* COMPSIZE, lda, sa); + + SYRK_KERNEL(min_i, min_l, bk, dp1, + sa, + sb2, + a + (is + ls * lda) * COMPSIZE, lda, + is - ls); + } + + for (ks = 0; ks < bk; ks += GEMM_P) { + min_k = bk - ks; + if (min_k > GEMM_P) min_k = GEMM_P; + + TRMM_KERNEL(min_k, min_l, bk, dp1, +#ifdef COMPLEX + ZERO, +#endif + sb + ks * bk * COMPSIZE, + sb2, + a + (ks + j + ls * lda) * COMPSIZE, lda, ks); + } +#else + + min_i = j - ls; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_INCOPY(bk, min_i, a + (j + ls * lda)* COMPSIZE, lda, sa); + + for (jjs = ls; jjs < ls + min_l; jjs += GEMM_P){ + min_jj = ls + min_l - jjs; + if (min_jj > GEMM_P) min_jj = GEMM_P; + + GEMM_ONCOPY(bk, min_jj, a + (j + jjs * lda) * COMPSIZE, lda, sb2 + (jjs - ls) * bk * COMPSIZE); + + SYRK_KERNEL(min_i, min_jj, bk, dp1, + sa, + sb2 + (jjs - ls) * bk * COMPSIZE, + a + (ls + jjs * lda) * COMPSIZE, lda, + ls - jjs); + } + + for(is = ls + min_i; is < j ; is += GEMM_P){ + min_i = j - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_INCOPY(bk, min_i, a + (j + is * lda)* COMPSIZE, lda, sa); + + SYRK_KERNEL(min_i, min_l, bk, dp1, + sa, + sb2, + a + (is + ls * lda) * COMPSIZE, lda, + is - ls); + } + + for (ks = 0; ks < bk; ks += GEMM_P) { + min_k = bk - ks; + if (min_k > GEMM_P) min_k = GEMM_P; + + TRMM_KERNEL(min_k, min_l, bk, dp1, +#ifdef COMPLEX + ZERO, +#endif + sb + ks * bk * COMPSIZE, + sb2, + a + (ks + j + ls * lda) * COMPSIZE, lda, ks); + } + +#endif + + } + } + + if (!range_n) { + range_N[0] = j; + range_N[1] = j + bk; + } else { + range_N[0] = range_n[0] + j; + range_N[1] = range_n[0] + j + bk; + } + + CNAME(args, NULL, range_N, sa, sb, 0); + + } + + return 0; +} diff --git a/lapack/lauum/lauum_U_parallel.c b/lapack/lauum/lauum_U_parallel.c new file mode 100644 index 0000000..d68d12b --- /dev/null +++ b/lapack/lauum/lauum_U_parallel.c @@ -0,0 +1,123 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, bk, i, blocking, lda; + int mode; + blas_arg_t newarg; + FLOAT *a; + FLOAT alpha[2] = { ONE, ZERO}; + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif + + if (args -> nthreads == 1) { + LAUUM_U_SINGLE(args, NULL, NULL, sa, sb, 0); + return 0; + } + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) n = range_n[1] - range_n[0]; + + if (n <= GEMM_UNROLL_N * 2) { + LAUUM_U_SINGLE(args, NULL, range_n, sa, sb, 0); + return 0; + } + + newarg.lda = lda; + newarg.ldb = lda; + newarg.ldc = lda; + newarg.alpha = alpha; + newarg.beta = NULL; + newarg.nthreads = args -> nthreads; + + blocking = (n / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + if (blocking > GEMM_Q) blocking = GEMM_Q; + + for (i = 0; i < n; i += blocking) { + + bk = n - i; + if (bk > blocking) bk = blocking; + + newarg.n = i; + newarg.k = bk; + newarg.a = a + ( i * lda) * COMPSIZE; + newarg.c = a; + + syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, + &newarg, NULL, NULL, (void *)HERK_UN, sa, sb, args -> nthreads); + + newarg.m = i; + newarg.n = bk; + newarg.a = a + (i + i * lda) * COMPSIZE; + newarg.b = a + ( i * lda) * COMPSIZE; + + gemm_thread_m(mode | BLAS_TRANSA_T | BLAS_RSIDE, + &newarg, NULL, NULL, (void *)TRMM_RCUN, sa, sb, args -> nthreads); + + newarg.m = bk; + newarg.n = bk; + newarg.a = a + (i + i * lda) * COMPSIZE; + + CNAME(&newarg, NULL, NULL, sa, sb, 0); + } + + return 0; +} diff --git a/lapack/lauum/lauum_U_single.c b/lapack/lauum/lauum_U_single.c new file mode 100644 index 0000000..14cf0ad --- /dev/null +++ b/lapack/lauum/lauum_U_single.c @@ -0,0 +1,268 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +static FLOAT dp1 = 1.; + +#ifndef COMPLEX +#define TRMM_KERNEL TRMM_KERNEL_RT +#define SYRK_KERNEL SYRK_KERNEL_U +#else +#define TRMM_KERNEL TRMM_KERNEL_RC +#ifdef XDOUBLE +#define SYRK_KERNEL xherk_kernel_UN +#elif defined(DOUBLE) +#define SYRK_KERNEL zherk_kernel_UN +#else +#define SYRK_KERNEL cherk_kernel_UN +#endif +#endif + +#if 0 +#undef GEMM_P +#undef GEMM_Q +#undef GEMM_R + +#define GEMM_P 8 +#define GEMM_Q 20 +#define GEMM_R 24 +#endif + +#define GEMM_PQ MAX(GEMM_P, GEMM_Q) +#define REAL_GEMM_R (GEMM_R - GEMM_PQ) + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + BLASLONG j, bk, blocking; + BLASLONG is, ls, ks; + BLASLONG jjs, min_jj; + + BLASLONG min_i, min_l, min_k; + BLASLONG range_N[2]; + + FLOAT *sb2 = (FLOAT *)((((BLASLONG)sb + + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) + + GEMM_OFFSET_B); + +#if 0 + FLOAT *aa; +#endif + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + if (n <= DTB_ENTRIES) { + LAUU2_U(args, NULL, range_n, sa, sb, 0); + return 0; + } + + blocking = GEMM_Q; + if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4; + + for (j = 0; j < n; j += blocking) { + bk = n - j; + if (bk > blocking) bk = blocking; + + if (j > 0) { + + TRMM_OUTCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, 0, sb); + + for (ls = 0; ls < j; ls += REAL_GEMM_R) { + min_l = j - ls; + +#if 0 + + + if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R; + min_i = ls + min_l; + if (min_i > GEMM_P) min_i = GEMM_P; + + if (ls > 0) { + GEMM_ITCOPY(bk, min_i, a + (j * lda) * COMPSIZE, lda, sa); + aa = sa; + } else { + aa = sb2; + } + + for (jjs = ls; jjs < ls + min_l; jjs += GEMM_P){ + min_jj = ls + min_l - jjs; + if (min_jj > GEMM_P) min_jj = GEMM_P; + + GEMM_OTCOPY(bk, min_jj, a + (jjs + j * lda) * COMPSIZE, lda, sb2 + (jjs - ls) * bk * COMPSIZE); + + SYRK_KERNEL(min_i, min_jj, bk, dp1, + aa, + sb2 + (jjs - ls) * bk * COMPSIZE, + a + (jjs * lda) * COMPSIZE, lda, - jjs); + } + + if (ls + REAL_GEMM_R >= j ) { + for (ks = 0; ks < bk; ks += GEMM_P) { + min_k = bk - ks; + if (min_k > GEMM_P) min_k = GEMM_P; + + TRMM_KERNEL(min_i, min_k, bk, dp1, +#ifdef COMPLEX + ZERO, +#endif + aa, + sb + ks * bk * COMPSIZE, + a + ((ks + j) * lda) * COMPSIZE, lda, -ks); + } + } + + for(is = min_i; is < ls + min_l ; is += GEMM_P){ + min_i = ls + min_l - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + if (is < ls) { + GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa); + aa = sa; + } else { + aa = sb2 + (is - ls) * bk * COMPSIZE; + } + + SYRK_KERNEL(min_i, min_l, bk, dp1, + aa, + sb2, + a + (is + ls * lda) * COMPSIZE, lda, is - ls); + + if (ls + REAL_GEMM_R >= j ) { + for (ks = 0; ks < bk; ks += GEMM_P) { + min_k = bk - ks; + if (min_k > GEMM_P) min_k = GEMM_P; + + TRMM_KERNEL(min_i, min_k, bk, dp1, +#ifdef COMPLEX + ZERO, +#endif + aa, + sb + ks * bk * COMPSIZE, + a + (is + (ks + j) * lda) * COMPSIZE, lda, -ks); + } + } + } +#else + if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R; + min_i = ls + min_l; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(bk, min_i, a + (j * lda) * COMPSIZE, lda, sa); + + for (jjs = ls; jjs < ls + min_l; jjs += GEMM_P){ + min_jj = ls + min_l - jjs; + if (min_jj > GEMM_P) min_jj = GEMM_P; + + GEMM_OTCOPY(bk, min_jj, a + (jjs + j * lda) * COMPSIZE, lda, sb2 + (jjs - ls) * bk * COMPSIZE); + + SYRK_KERNEL(min_i, min_jj, bk, dp1, + sa, + sb2 + (jjs - ls) * bk * COMPSIZE, + a + (jjs * lda) * COMPSIZE, lda, - jjs); + } + + if (ls + REAL_GEMM_R >= j ) { + for (ks = 0; ks < bk; ks += GEMM_P) { + min_k = bk - ks; + if (min_k > GEMM_P) min_k = GEMM_P; + + TRMM_KERNEL(min_i, min_k, bk, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb + ks * bk * COMPSIZE, + a + ((ks + j) * lda) * COMPSIZE, lda, -ks); + } + } + + for(is = min_i; is < ls + min_l ; is += GEMM_P){ + min_i = ls + min_l - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa); + + SYRK_KERNEL(min_i, min_l, bk, dp1, + sa, + sb2, + a + (is + ls * lda) * COMPSIZE, lda, is - ls); + + if (ls + REAL_GEMM_R >= j ) { + for (ks = 0; ks < bk; ks += GEMM_P) { + min_k = bk - ks; + if (min_k > GEMM_P) min_k = GEMM_P; + + TRMM_KERNEL(min_i, min_k, bk, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb + ks * bk * COMPSIZE, + a + (is + (ks + j) * lda) * COMPSIZE, lda, -ks); + } + } + } +#endif + } /* end of ls */ + } + + if (!range_n) { + range_N[0] = j; + range_N[1] = j + bk; + } else { + range_N[0] = range_n[0] + j; + range_N[1] = range_n[0] + j + bk; + } + + CNAME(args, NULL, range_N, sa, sb, 0); + + } + + return 0; +} diff --git a/lapack/potf2/._Makefile b/lapack/potf2/._Makefile new file mode 100644 index 0000000..f147580 Binary files /dev/null and b/lapack/potf2/._Makefile differ diff --git a/lapack/potf2/._potf2_L.c b/lapack/potf2/._potf2_L.c new file mode 100644 index 0000000..53fa528 Binary files /dev/null and b/lapack/potf2/._potf2_L.c differ diff --git a/lapack/potf2/._potf2_U.c b/lapack/potf2/._potf2_U.c new file mode 100644 index 0000000..4151140 Binary files /dev/null and b/lapack/potf2/._potf2_U.c differ diff --git a/lapack/potf2/._zpotf2_L.c b/lapack/potf2/._zpotf2_L.c new file mode 100644 index 0000000..6f0708e Binary files /dev/null and b/lapack/potf2/._zpotf2_L.c differ diff --git a/lapack/potf2/._zpotf2_U.c b/lapack/potf2/._zpotf2_U.c new file mode 100644 index 0000000..ff95829 Binary files /dev/null and b/lapack/potf2/._zpotf2_U.c differ diff --git a/lapack/potf2/Makefile b/lapack/potf2/Makefile new file mode 100644 index 0000000..5946ad9 --- /dev/null +++ b/lapack/potf2/Makefile @@ -0,0 +1,83 @@ +TOPDIR = ../.. +include ../../Makefile.system + +SBLASOBJS = spotf2_U.$(SUFFIX) spotf2_L.$(SUFFIX) +DBLASOBJS = dpotf2_U.$(SUFFIX) dpotf2_L.$(SUFFIX) +QBLASOBJS = qpotf2_U.$(SUFFIX) qpotf2_L.$(SUFFIX) +CBLASOBJS = cpotf2_U.$(SUFFIX) cpotf2_L.$(SUFFIX) +ZBLASOBJS = zpotf2_U.$(SUFFIX) zpotf2_L.$(SUFFIX) +XBLASOBJS = xpotf2_U.$(SUFFIX) xpotf2_L.$(SUFFIX) + +spotf2_U.$(SUFFIX) : potf2_U.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +spotf2_L.$(SUFFIX) : potf2_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +dpotf2_U.$(SUFFIX) : potf2_U.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dpotf2_L.$(SUFFIX) : potf2_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +qpotf2_U.$(SUFFIX) : potf2_U.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qpotf2_L.$(SUFFIX) : potf2_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +cpotf2_U.$(SUFFIX) : zpotf2_U.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +cpotf2_L.$(SUFFIX) : zpotf2_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +zpotf2_U.$(SUFFIX) : zpotf2_U.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zpotf2_L.$(SUFFIX) : zpotf2_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +xpotf2_U.$(SUFFIX) : zpotf2_U.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xpotf2_L.$(SUFFIX) : zpotf2_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +spotf2_U.$(PSUFFIX) : potf2_U.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +spotf2_L.$(PSUFFIX) : potf2_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +dpotf2_U.$(PSUFFIX) : potf2_U.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dpotf2_L.$(PSUFFIX) : potf2_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +qpotf2_U.$(PSUFFIX) : potf2_U.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qpotf2_L.$(PSUFFIX) : potf2_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +cpotf2_U.$(PSUFFIX) : zpotf2_U.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +cpotf2_L.$(PSUFFIX) : zpotf2_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +zpotf2_U.$(PSUFFIX) : zpotf2_U.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zpotf2_L.$(PSUFFIX) : zpotf2_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +xpotf2_U.$(PSUFFIX) : zpotf2_U.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xpotf2_L.$(PSUFFIX) : zpotf2_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +include ../../Makefile.tail diff --git a/lapack/potf2/potf2_L.c b/lapack/potf2/potf2_L.c new file mode 100644 index 0000000..23aa97c --- /dev/null +++ b/lapack/potf2/potf2_L.c @@ -0,0 +1,97 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +static FLOAT dm1 = -1.; +static FLOAT dp1 = 1.; + +#ifndef SQRT +#define SQRT(x) sqrt(x) +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + FLOAT ajj; + FLOAT *aoffset; + BLASLONG i, j; + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + aoffset = a; + + for (j = 0; j < n; j++) { + + ajj = *(aoffset + j) - DOTU_K(j, a + j, lda, a + j, lda); + + if (ajj <= 0){ + *(aoffset + j) = ajj; + return j + 1; + } + ajj = SQRT(ajj); + *(aoffset + j) = ajj; + + i = n - j - 1; + + if (i > 0) { + GEMV_N(i, j, 0, dm1, + a + j + 1, lda, + a + j, lda, + aoffset + j + 1, 1, sb); + + SCAL_K(i, 0, 0, dp1 / ajj, + aoffset + j + 1, 1, NULL, 0, NULL, 0); + } + + aoffset += lda; + } + + return 0; +} diff --git a/lapack/potf2/potf2_U.c b/lapack/potf2/potf2_U.c new file mode 100644 index 0000000..755bf8d --- /dev/null +++ b/lapack/potf2/potf2_U.c @@ -0,0 +1,94 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +static FLOAT dm1 = -1.; +static FLOAT dp1 = 1.; + +#ifndef SQRT +#define SQRT(x) sqrt(x) +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + FLOAT ajj; + BLASLONG i, j; + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + for (j = 0; j < n; j++) { + + ajj = *(a + j) - DOTU_K(j, a, 1, a, 1); + + if (ajj <= 0){ + *(a + j) = ajj; + return j + 1; + } + ajj = SQRT(ajj); + *(a + j) = ajj; + + i = n - j - 1; + + if (i > 0) { + GEMV_T(j, i, 0, dm1, + a + lda, lda, + a, 1, + a + j + lda, lda, sb); + + SCAL_K(i, 0, 0, dp1 / ajj, + a + j + lda, lda, NULL, 0, NULL, 0); + } + + a += lda; + } + + return 0; +} diff --git a/lapack/potf2/zpotf2_L.c b/lapack/potf2/zpotf2_L.c new file mode 100644 index 0000000..8ce0d4e --- /dev/null +++ b/lapack/potf2/zpotf2_L.c @@ -0,0 +1,101 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifndef SQRT +#define SQRT(x) sqrt(x) +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + FLOAT ajj[2]; + FLOAT *aoffset; + BLASLONG i, j; + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + aoffset = a; + + for (j = 0; j < n; j++) { + + ajj[0] = DOTC_K(j, a + j * 2, lda, a + j * 2, lda); + GET_IMAGE(ajj[1]); + + ajj[0] = *(aoffset + j * 2) - ajj[0]; + + if (ajj[0] <= 0){ + *(aoffset + j * 2 + 0) = ajj[0]; + *(aoffset + j * 2 + 1) = ZERO; + return j + 1; + } + ajj[0] = SQRT(ajj[0]); + *(aoffset + j * 2 + 0) = ajj[0]; + *(aoffset + j * 2 + 1) = ZERO; + + i = n - j - 1; + + if (i > 0) { + GEMV_O(i, j, 0, dm1, ZERO, + a + (j + 1) * 2, lda, + a + j * 2, lda, + aoffset + (j + 1) * 2, 1, sb); + + SCAL_K(i, 0, 0, ONE / ajj[0], ZERO, + aoffset + (j + 1) * 2, 1, NULL, 0, NULL, 0); + } + + aoffset += lda * 2; + } + + return 0; +} diff --git a/lapack/potf2/zpotf2_U.c b/lapack/potf2/zpotf2_U.c new file mode 100644 index 0000000..c1f5156 --- /dev/null +++ b/lapack/potf2/zpotf2_U.c @@ -0,0 +1,99 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifndef SQRT +#define SQRT(x) sqrt(x) +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + FLOAT ajj[2]; + BLASLONG i, j; + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + for (j = 0; j < n; j++) { + + ajj[0] = DOTC_K(j, a, 1, a, 1); + GET_IMAGE(ajj[1]); + + ajj[0] = *(a + j * 2) - ajj[0]; + + if (ajj[0] <= 0){ + *(a + j * 2 + 0) = ajj[0]; + *(a + j * 2 + 1) = ZERO; + return j + 1; + } + + ajj[0] = SQRT(ajj[0]); + *(a + j * 2 + 0) = ajj[0]; + *(a + j * 2 + 1) = ZERO; + + i = n - j - 1; + + if (i > 0){ + GEMV_U(j, i, 0, dm1, ZERO, + a + lda * 2, lda, + a, 1, + a + (j + lda) * 2, lda, sb); + + SCAL_K(i, 0, 0, ONE / ajj[0], ZERO, + a + (j + lda) * 2, lda, NULL, 0, NULL, 0); + } + + a += 2 * lda; + } + + return 0; +} diff --git a/lapack/potrf/._Makefile b/lapack/potrf/._Makefile new file mode 100644 index 0000000..fab66fd Binary files /dev/null and b/lapack/potrf/._Makefile differ diff --git a/lapack/potrf/._potrf_L_parallel.c b/lapack/potrf/._potrf_L_parallel.c new file mode 100644 index 0000000..b3837cf Binary files /dev/null and b/lapack/potrf/._potrf_L_parallel.c differ diff --git a/lapack/potrf/._potrf_L_single.c b/lapack/potrf/._potrf_L_single.c new file mode 100644 index 0000000..c06fb56 Binary files /dev/null and b/lapack/potrf/._potrf_L_single.c differ diff --git a/lapack/potrf/._potrf_U_parallel.c b/lapack/potrf/._potrf_U_parallel.c new file mode 100644 index 0000000..20eba6b Binary files /dev/null and b/lapack/potrf/._potrf_U_parallel.c differ diff --git a/lapack/potrf/._potrf_U_single.c b/lapack/potrf/._potrf_U_single.c new file mode 100644 index 0000000..d52a8d0 Binary files /dev/null and b/lapack/potrf/._potrf_U_single.c differ diff --git a/lapack/potrf/._potrf_parallel.c b/lapack/potrf/._potrf_parallel.c new file mode 100644 index 0000000..1d655df Binary files /dev/null and b/lapack/potrf/._potrf_parallel.c differ diff --git a/lapack/potrf/Makefile b/lapack/potrf/Makefile new file mode 100644 index 0000000..21efa55 --- /dev/null +++ b/lapack/potrf/Makefile @@ -0,0 +1,164 @@ +TOPDIR = ../.. +include ../../Makefile.system + +SBLASOBJS = spotrf_U_single.$(SUFFIX) spotrf_L_single.$(SUFFIX) +DBLASOBJS = dpotrf_U_single.$(SUFFIX) dpotrf_L_single.$(SUFFIX) +QBLASOBJS = qpotrf_U_single.$(SUFFIX) qpotrf_L_single.$(SUFFIX) +CBLASOBJS = cpotrf_U_single.$(SUFFIX) cpotrf_L_single.$(SUFFIX) +ZBLASOBJS = zpotrf_U_single.$(SUFFIX) zpotrf_L_single.$(SUFFIX) +XBLASOBJS = xpotrf_U_single.$(SUFFIX) xpotrf_L_single.$(SUFFIX) + +ifdef SMP +SBLASOBJS += spotrf_U_parallel.$(SUFFIX) spotrf_L_parallel.$(SUFFIX) +DBLASOBJS += dpotrf_U_parallel.$(SUFFIX) dpotrf_L_parallel.$(SUFFIX) +QBLASOBJS += qpotrf_U_parallel.$(SUFFIX) qpotrf_L_parallel.$(SUFFIX) +CBLASOBJS += cpotrf_U_parallel.$(SUFFIX) cpotrf_L_parallel.$(SUFFIX) +ZBLASOBJS += zpotrf_U_parallel.$(SUFFIX) zpotrf_L_parallel.$(SUFFIX) +XBLASOBJS += xpotrf_U_parallel.$(SUFFIX) xpotrf_L_parallel.$(SUFFIX) +endif + +spotrf_U_single.$(SUFFIX) : potrf_U_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +spotrf_L_single.$(SUFFIX) : potrf_L_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +spotrf_U_parallel.$(SUFFIX) : potrf_U_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +spotrf_L_parallel.$(SUFFIX) : potrf_L_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +dpotrf_U_single.$(SUFFIX) : potrf_U_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dpotrf_L_single.$(SUFFIX) : potrf_L_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dpotrf_U_parallel.$(SUFFIX) : potrf_U_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dpotrf_L_parallel.$(SUFFIX) : potrf_L_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +qpotrf_U_single.$(SUFFIX) : potrf_U_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qpotrf_L_single.$(SUFFIX) : potrf_L_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qpotrf_U_parallel.$(SUFFIX) : potrf_U_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qpotrf_L_parallel.$(SUFFIX) : potrf_L_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +cpotrf_U_single.$(SUFFIX) : potrf_U_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +cpotrf_L_single.$(SUFFIX) : potrf_L_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +cpotrf_U_parallel.$(SUFFIX) : potrf_U_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +cpotrf_L_parallel.$(SUFFIX) : potrf_L_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +zpotrf_U_single.$(SUFFIX) : potrf_U_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zpotrf_L_single.$(SUFFIX) : potrf_L_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zpotrf_U_parallel.$(SUFFIX) : potrf_U_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zpotrf_L_parallel.$(SUFFIX) : potrf_L_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +xpotrf_U_single.$(SUFFIX) : potrf_U_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xpotrf_L_single.$(SUFFIX) : potrf_L_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xpotrf_U_parallel.$(SUFFIX) : potrf_U_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xpotrf_L_parallel.$(SUFFIX) : potrf_L_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +spotrf_U_single.$(PSUFFIX) : potrf_U_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +spotrf_L_single.$(PSUFFIX) : potrf_L_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +spotrf_U_parallel.$(PSUFFIX) : potrf_U_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +spotrf_L_parallel.$(PSUFFIX) : potrf_L_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +dpotrf_U_single.$(PSUFFIX) : potrf_U_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dpotrf_L_single.$(PSUFFIX) : potrf_L_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dpotrf_U_parallel.$(PSUFFIX) : potrf_U_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dpotrf_L_parallel.$(PSUFFIX) : potrf_L_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +qpotrf_U_single.$(PSUFFIX) : potrf_U_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qpotrf_L_single.$(PSUFFIX) : potrf_L_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qpotrf_U_parallel.$(PSUFFIX) : potrf_U_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qpotrf_L_parallel.$(PSUFFIX) : potrf_L_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +cpotrf_U_single.$(PSUFFIX) : potrf_U_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +cpotrf_L_single.$(PSUFFIX) : potrf_L_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +cpotrf_U_parallel.$(PSUFFIX) : potrf_U_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +cpotrf_L_parallel.$(PSUFFIX) : potrf_L_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +zpotrf_U_single.$(PSUFFIX) : potrf_U_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zpotrf_L_single.$(PSUFFIX) : potrf_L_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zpotrf_U_parallel.$(PSUFFIX) : potrf_U_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zpotrf_L_parallel.$(PSUFFIX) : potrf_L_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +xpotrf_U_single.$(PSUFFIX) : potrf_U_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xpotrf_L_single.$(PSUFFIX) : potrf_L_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xpotrf_U_parallel.$(PSUFFIX) : potrf_U_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xpotrf_L_parallel.$(PSUFFIX) : potrf_L_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +include ../../Makefile.tail diff --git a/lapack/potrf/potrf_L_parallel.c b/lapack/potrf/potrf_L_parallel.c new file mode 100644 index 0000000..1ebcad8 --- /dev/null +++ b/lapack/potrf/potrf_L_parallel.c @@ -0,0 +1,130 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, bk, i, blocking, lda; + BLASLONG info; + int mode; + blas_arg_t newarg; + FLOAT *a; + FLOAT alpha[2] = { -ONE, ZERO}; + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif + + if (args -> nthreads == 1) { + info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0); + return info; + } + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) n = range_n[1] - range_n[0]; + + if (n <= GEMM_UNROLL_N * 4) { + info = POTRF_L_SINGLE(args, NULL, range_n, sa, sb, 0); + return info; + } + + newarg.lda = lda; + newarg.ldb = lda; + newarg.ldc = lda; + newarg.alpha = alpha; + newarg.beta = NULL; + newarg.nthreads = args -> nthreads; + + blocking = (n / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + if (blocking > GEMM_Q) blocking = GEMM_Q; + + for (i = 0; i < n; i += blocking) { + bk = n - i; + if (bk > blocking) bk = blocking; + + newarg.m = bk; + newarg.n = bk; + newarg.a = a + (i + i * lda) * COMPSIZE; + + info = CNAME(&newarg, NULL, NULL, sa, sb, 0); + if (info) return info + i; + + if (n - i - bk > 0) { + newarg.m = n - i - bk; + newarg.n = bk; + newarg.a = a + (i + i * lda) * COMPSIZE; + newarg.b = a + (i + bk + i * lda) * COMPSIZE; + + gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO, + &newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads); + + newarg.n = n - i - bk; + newarg.k = bk; + newarg.a = a + (i + bk + i * lda) * COMPSIZE; + newarg.c = a + (i + bk + (i + bk) * lda) * COMPSIZE; + +#ifndef USE_SIMPLE_THREADED_LEVEL3 + HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); +#else + syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO, + &newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads); +#endif + } + } + + return 0; +} diff --git a/lapack/potrf/potrf_L_single.c b/lapack/potrf/potrf_L_single.c new file mode 100644 index 0000000..b88f8fc --- /dev/null +++ b/lapack/potrf/potrf_L_single.c @@ -0,0 +1,234 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifndef COMPLEX +#define TRSM_KERNEL TRSM_KERNEL_RN +#else +#define TRSM_KERNEL TRSM_KERNEL_RR +#undef SYRK_KERNEL_L +#ifdef XDOUBLE +#define SYRK_KERNEL_L xherk_kernel_LN +#elif defined(DOUBLE) +#define SYRK_KERNEL_L zherk_kernel_LN +#else +#define SYRK_KERNEL_L cherk_kernel_LN +#endif +#endif + +#if 0 +#undef GEMM_P +#undef GEMM_Q +#undef GEMM_R + +#define GEMM_P 128 +#define GEMM_Q 128 +#define GEMM_R 4000 +#endif + +#define GEMM_PQ MAX(GEMM_P, GEMM_Q) +#define REAL_GEMM_R (GEMM_R - GEMM_PQ) + +#if 0 +#define SHARED_ARRAY +#define SA aa +#else +#undef SHARED_ARRAY +#define SA sa +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + BLASLONG info; + BLASLONG bk, j, blocking; + BLASLONG is, min_i; + BLASLONG js, min_j; + BLASLONG range_N[2]; + + FLOAT *sb2 = (FLOAT *)((((BLASLONG)sb + + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) + + GEMM_OFFSET_B); + +#ifdef SHARED_ARRAY + FLOAT *aa; +#endif + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + if (n <= DTB_ENTRIES / 2) { + info = POTF2_L(args, NULL, range_n, sa, sb, 0); + return info; + } + + blocking = GEMM_Q; + if (n <= 4 * GEMM_Q) blocking = n / 4; + + for (j = 0; j < n; j += blocking) { + bk = n - j; + if (bk > blocking) bk = blocking; + + if (!range_n) { + range_N[0] = j; + range_N[1] = j + bk; + } else { + range_N[0] = range_n[0] + j; + range_N[1] = range_n[0] + j + bk; + } + info = CNAME(args, NULL, range_N, sa, sb, 0); + if (info) return info + j; + + if (n - j - bk > 0) { + + TRSM_OLTCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, sb); + + /* First tile */ + min_j = n - j - bk; + if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R; + + for (is = j + bk; is < n; is += GEMM_P) { + min_i = n - is; + if (min_i > GEMM_P) min_i = GEMM_P; + +#ifdef SHARED_ARRAY + + if (is < j + bk + min_j) { + aa = sb2 + bk * (is - j - bk) * COMPSIZE; + } else { + aa = sa; + } + + GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, aa); + + TRSM_KERNEL(min_i, bk, bk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + sb, + a + (is + j * lda) * COMPSIZE, lda, 0); + + SYRK_KERNEL_L(min_i, min_j, bk, dm1, + aa, + sb2, + a + (is + (j + bk) * lda) * COMPSIZE, lda, + is - j - bk); + +#else + + GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa); + + TRSM_KERNEL(min_i, bk, bk, dm1, +#ifdef COMPLEX + ZERO, +#endif + + sa, + sb, + a + (is + j * lda) * COMPSIZE, lda, 0); + + if (is < j + bk + min_j) { + GEMM_OTCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sb2 + bk * (is - j - bk) * COMPSIZE); + } + + SYRK_KERNEL_L(min_i, min_j, bk, dm1, + sa, + sb2, + a + (is + (j + bk) * lda) * COMPSIZE, lda, + is - j - bk); +#endif + } + + for(js = j + bk + min_j; js < n; js += REAL_GEMM_R){ + min_j = n - js; + if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R; + + GEMM_OTCOPY(bk, min_j, a + (js + j * lda) * COMPSIZE, lda, sb2); + + for (is = js; is < n; is += GEMM_P) { + min_i = n - is; + if (min_i > GEMM_P) min_i = GEMM_P; + +#ifdef SHARED_ARRAY + + if (is + min_i < js + min_j) { + aa = sb2 + bk * (is - js) * COMPSIZE; + } else { + GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa); + aa = sa; + } + + SYRK_KERNEL_L(min_i, min_j, bk, dm1, + aa, + sb2, + a + (is + js * lda) * COMPSIZE, lda, + is - js); + +#else + + GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa); + + SYRK_KERNEL_L(min_i, min_j, bk, dm1, + sa, + sb2, + a + (is + js * lda) * COMPSIZE, lda, + - is + js); +#endif + + } + } + + } + + } + + return 0; +} diff --git a/lapack/potrf/potrf_U_parallel.c b/lapack/potrf/potrf_U_parallel.c new file mode 100644 index 0000000..31da141 --- /dev/null +++ b/lapack/potrf/potrf_U_parallel.c @@ -0,0 +1,130 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, bk, i, blocking, lda; + BLASLONG info; + int mode; + blas_arg_t newarg; + FLOAT *a; + FLOAT alpha[2] = { -ONE, ZERO}; + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif + + if (args -> nthreads == 1) { + info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0); + return info; + } + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) n = range_n[1] - range_n[0]; + + if (n <= GEMM_UNROLL_N * 4) { + info = POTRF_U_SINGLE(args, NULL, range_n, sa, sb, 0); + return info; + } + + newarg.lda = lda; + newarg.ldb = lda; + newarg.ldc = lda; + newarg.alpha = alpha; + newarg.beta = NULL; + newarg.nthreads = args -> nthreads; + + blocking = (n / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + if (blocking > GEMM_Q) blocking = GEMM_Q; + + for (i = 0; i < n; i += blocking) { + bk = n - i; + if (bk > blocking) bk = blocking; + + newarg.m = bk; + newarg.n = bk; + newarg.a = a + (i + i * lda) * COMPSIZE; + + info = CNAME(&newarg, NULL, NULL, sa, sb, 0); + if (info) return info + i; + + if (n - i - bk > 0) { + newarg.m = bk; + newarg.n = n - i - bk; + newarg.a = a + (i + i * lda) * COMPSIZE; + newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; + + gemm_thread_n(mode | BLAS_TRANSA_T, + &newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads); + + newarg.n = n - i - bk; + newarg.k = bk; + newarg.a = a + ( i + (i + bk) * lda) * COMPSIZE; + newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; + +#ifndef USE_SIMPLE_THREADED_LEVEL3 + HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0); +#else + syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, + &newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads); +#endif + } + } + + return 0; +} diff --git a/lapack/potrf/potrf_U_single.c b/lapack/potrf/potrf_U_single.c new file mode 100644 index 0000000..aa445c5 --- /dev/null +++ b/lapack/potrf/potrf_U_single.c @@ -0,0 +1,193 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifndef COMPLEX +#define TRSM_KERNEL TRSM_KERNEL_LT +#else +#define TRSM_KERNEL TRSM_KERNEL_LC +#undef SYRK_KERNEL_U +#ifdef XDOUBLE +#define SYRK_KERNEL_U xherk_kernel_UC +#elif defined(DOUBLE) +#define SYRK_KERNEL_U zherk_kernel_UC +#else +#define SYRK_KERNEL_U cherk_kernel_UC +#endif +#endif + +#if 0 +#undef GEMM_P +#undef GEMM_Q +#undef GEMM_R + +#define GEMM_P 8 +#define GEMM_Q 20 +#define GEMM_R 64 +#endif + +#define GEMM_PQ MAX(GEMM_P, GEMM_Q) +#define REAL_GEMM_R (GEMM_R - GEMM_PQ) + +#if 0 +#define SHARED_ARRAY +#define SA aa +#else +#undef SHARED_ARRAY +#define SA sa +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + BLASLONG info; + BLASLONG bk, blocking; + BLASLONG is, min_i; + BLASLONG jjs, min_jj; + BLASLONG range_N[2]; + BLASLONG j, js, min_j; + +#ifdef SHARED_ARRAY + FLOAT *aa; +#endif + + FLOAT *sb2 = (FLOAT *)((((BLASLONG)sb + + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) + + GEMM_OFFSET_B); + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + if (n <= DTB_ENTRIES / 2) { + info = POTF2_U(args, NULL, range_n, sa, sb, 0); + return info; + } + + blocking = GEMM_Q; + if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4; + + for (j = 0; j < n; j += blocking) { + bk = n - j; + if (bk > blocking) bk = blocking; + + if (!range_n) { + range_N[0] = j; + range_N[1] = j + bk; + } else { + range_N[0] = range_n[0] + j; + range_N[1] = range_n[0] + j + bk; + } + + info = CNAME(args, NULL, range_N, sa, sb, 0); + if (info) return info + j; + + if (n - j - bk > 0) { + + TRSM_IUNCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, sb); + + for(js = j + bk; js < n; js += REAL_GEMM_R) { + min_j = n - js; + if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R; + + for(jjs = js; jjs < js + min_j; jjs += GEMM_UNROLL_N){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + GEMM_ONCOPY(bk, min_jj, a + (j + jjs * lda) * COMPSIZE, lda, sb2 + bk * (jjs - js) * COMPSIZE); + + for (is = 0; is < bk; is += GEMM_P) { + min_i = bk - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + TRSM_KERNEL (min_i, min_jj, bk, dm1, +#ifdef COMPLEX + ZERO, +#endif + sb + bk * is * COMPSIZE, + sb2 + bk * (jjs - js) * COMPSIZE, + a + (j + is + jjs * lda) * COMPSIZE, lda, is); + } + } + + for (is = j + bk; is < js + min_j; is += min_i) { + min_i = js + min_j - is; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + +#ifdef SHARED_ARRAY + if ((is >= js) && (is + min_i <= js + min_j)) { + aa = sb2 + bk * (is - js) * COMPSIZE; + } else { + GEMM_INCOPY(bk, min_i, a + (j + is * lda) * COMPSIZE, lda, sa); + aa = sa; + } +#else + GEMM_INCOPY(bk, min_i, a + (j + is * lda) * COMPSIZE, lda, sa); +#endif + + SYRK_KERNEL_U(min_i, min_j, bk, + dm1, + SA, sb2, + a + (is + js * lda) * COMPSIZE, lda, + is - js); + + } + } + } + + } + + return 0; +} diff --git a/lapack/potrf/potrf_parallel.c b/lapack/potrf/potrf_parallel.c new file mode 100644 index 0000000..f270c3d --- /dev/null +++ b/lapack/potrf/potrf_parallel.c @@ -0,0 +1,634 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_SIMPLE_THREADED_LEVEL3 + +static FLOAT dm1 = -1.; + +#ifndef KERNEL_FUNC +#ifndef LOWER +#define KERNEL_FUNC SYRK_KERNEL_U +#else +#define KERNEL_FUNC SYRK_KERNEL_L +#endif +#endif + +#ifndef LOWER +#ifndef COMPLEX +#define TRSM_KERNEL TRSM_KERNEL_LT +#else +#define TRSM_KERNEL TRSM_KERNEL_LC +#endif +#else +#ifndef COMPLEX +#define TRSM_KERNEL TRSM_KERNEL_RN +#else +#define TRSM_KERNEL TRSM_KERNEL_RR +#endif +#endif + +#ifndef CACHE_LINE_SIZE +#define CACHE_LINE_SIZE 8 +#endif + +#ifndef DIVIDE_RATE +#define DIVIDE_RATE 2 +#endif + +#ifndef SWITCH_RATIO +#define SWITCH_RATIO 2 +#endif + +#ifndef LOWER +#define TRANS +#endif + +#ifndef SYRK_LOCAL +#if !defined(LOWER) && !defined(TRANS) +#define SYRK_LOCAL SYRK_UN +#elif !defined(LOWER) && defined(TRANS) +#define SYRK_LOCAL SYRK_UT +#elif defined(LOWER) && !defined(TRANS) +#define SYRK_LOCAL SYRK_LN +#else +#define SYRK_LOCAL SYRK_LT +#endif +#endif + +typedef struct { + volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; +} job_t; + + +#ifndef KERNEL_OPERATION +#ifndef COMPLEX +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) +#else +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) +#endif +#endif + +#ifndef ICOPY_OPERATION +#ifndef TRANS +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef OCOPY_OPERATION +#ifdef TRANS +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef S +#define S args -> a +#endif +#ifndef A +#define A args -> b +#endif +#ifndef C +#define C args -> c +#endif +#ifndef LDA +#define LDA args -> lda +#endif +#ifndef N +#define N args -> m +#endif +#ifndef K +#define K args -> k +#endif + +static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + + FLOAT *buffer[DIVIDE_RATE]; + + BLASLONG k, lda; + BLASLONG m_from, m_to; + + FLOAT *alpha; + FLOAT *a, *c; + job_t *job = (job_t *)args -> common; + BLASLONG xxx, bufferside; + + BLASLONG jjs, min_jj; + BLASLONG is, min_i, div_n; + + BLASLONG i, current; + + k = K; + + a = (FLOAT *)A; + c = (FLOAT *)C; + + lda = LDA; + + alpha = (FLOAT *)args -> alpha; + + m_from = range_n[mypos + 0]; + m_to = range_n[mypos + 1]; + +#if 0 + fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld\n", mypos, m_from, m_to); +#endif + + div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + + buffer[0] = (FLOAT *)((((long)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + for (i = 1; i < DIVIDE_RATE; i++) { + buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE; + } + +#ifndef LOWER + TRSM_IUNCOPY(k, k, (FLOAT *)S, lda, 0, sb); +#else + TRSM_OLTCOPY(k, k, (FLOAT *)S, lda, 0, sb); +#endif + + for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) { + + for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){ + + min_jj = MIN(m_to, xxx + div_n) - jjs; + +#ifndef LOWER + if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; +#else + if (min_jj > GEMM_P) min_jj = GEMM_P; +#endif + +#ifndef LOWER + OCOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); + + TRSM_KERNEL (k, min_jj, k, dm1, +#ifdef COMPLEX + ZERO, +#endif + sb, + buffer[bufferside] + k * (jjs - xxx) * COMPSIZE, + a + jjs * lda * COMPSIZE, lda, 0); +#else + ICOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); + + TRSM_KERNEL (min_jj, k, k, dm1, +#ifdef COMPLEX + ZERO, +#endif + buffer[bufferside] + k * (jjs - xxx) * COMPSIZE, + sb, + a + jjs * COMPSIZE, lda, 0); +#endif + } + +#ifndef LOWER + for (i = 0; i <= mypos; i++) + job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; +#else + for (i = mypos; i < args -> nthreads; i++) + job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; +#endif + + WMB; + } + + min_i = m_to - m_from; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = ((min_i + 1) / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + +#ifndef LOWER + ICOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); +#else + OCOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); +#endif + + current = mypos; + +#ifndef LOWER + while (current < args -> nthreads) +#else + while (current >= 0) +#endif + { + div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + /* thread has to wait */ + if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, lda, m_from, xxx); + + if (m_from + min_i >= m_to) { + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + WMB; + } + } + +#ifndef LOWER + current ++; +#else + current --; +#endif + } + + for(is = m_from + min_i; is < m_to; is += min_i){ + min_i = m_to - is; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = ((min_i + 1) / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + +#ifndef LOWER + ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa); +#else + OCOPY_OPERATION(k, min_i, a, lda, 0, is, sa); +#endif + + current = mypos; + +#ifndef LOWER + while (current < args -> nthreads) +#else + while (current >= 0) +#endif + { + div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, lda, is, xxx); + + if (is + min_i >= m_to) { + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + WMB; + } + } +#ifndef LOWER + current ++; +#else + current --; +#endif + } + } + + for (i = 0; i < args -> nthreads; i++) { + if (i != mypos) { + for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { + while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; + } + } + } + + return 0; + } + +static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ + + blas_arg_t newarg; + + job_t job[MAX_CPU_NUMBER]; + blas_queue_t queue[MAX_CPU_NUMBER]; + + BLASLONG range[MAX_CPU_NUMBER + 100]; + + BLASLONG num_cpu; + + BLASLONG nthreads = args -> nthreads; + + BLASLONG width, i, j, k; + BLASLONG n, n_from, n_to; + int mode, mask; + double dnum; + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; + mask = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; + mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; +#else + mode = BLAS_SINGLE | BLAS_REAL; + mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; + mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; + mask = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; + mask = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1; +#endif +#endif + + newarg.m = args -> m; + newarg.k = args -> k; + newarg.a = args -> a; + newarg.b = args -> b; + newarg.c = args -> c; + newarg.lda = args -> lda; + newarg.alpha = args -> alpha; + newarg.common = (void *)job; + + n_from = 0; + n_to = args -> m; + +#ifndef LOWER + + range[MAX_CPU_NUMBER] = n_to - n_from; + range[0] = 0; + num_cpu = 0; + i = 0; + n = n_to - n_from; + + dnum = (double)n * (double)n /(double)nthreads; + + while (i < n){ + + if (nthreads - num_cpu > 1) { + + double di = (double)i; + + width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask); + + if (num_cpu == 0) width = n - ((n - width) & ~mask); + + if ((width > n - i) || (width < mask)) width = n - i; + + } else { + width = n - i; + } + + range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = inner_thread; + queue[num_cpu].args = &newarg; + queue[num_cpu].range_m = NULL; + + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + + for (i = 0; i < num_cpu; i ++) queue[i].range_n = &range[MAX_CPU_NUMBER - num_cpu]; + +#else + + range[0] = 0; + num_cpu = 0; + i = 0; + n = n_to - n_from; + + dnum = (double)n * (double)n /(double)nthreads; + + while (i < n){ + + if (nthreads - num_cpu > 1) { + + double di = (double)i; + + width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask); + + if ((width > n - i) || (width < mask)) width = n - i; + + } else { + width = n - i; + } + + range[num_cpu + 1] = range[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = inner_thread; + queue[num_cpu].args = &newarg; + queue[num_cpu].range_m = NULL; + queue[num_cpu].range_n = range; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#endif + + newarg.nthreads = num_cpu; + + if (num_cpu) { + + for (j = 0; j < num_cpu; j++) { + for (i = 0; i < num_cpu; i++) { + for (k = 0; k < DIVIDE_RATE; k++) { + job[j].working[i][CACHE_LINE_SIZE * k] = 0; + } + } + } + + queue[0].sa = sa; + queue[0].sb = sb; + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + return 0; +} + +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, bk, i, blocking, lda; + BLASLONG info; + int mode; + blas_arg_t newarg; + FLOAT *a; + FLOAT alpha[2] = { -ONE, ZERO}; + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif + + if (args -> nthreads == 1) { +#ifndef LOWER + info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0); +#else + info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0); +#endif + return info; + } + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) n = range_n[1] - range_n[0]; + + if (n <= GEMM_UNROLL_N * 2) { +#ifndef LOWER + info = POTRF_U_SINGLE(args, NULL, range_n, sa, sb, 0); +#else + info = POTRF_L_SINGLE(args, NULL, range_n, sa, sb, 0); +#endif + return info; + } + + newarg.lda = lda; + newarg.ldb = lda; + newarg.ldc = lda; + newarg.alpha = alpha; + newarg.beta = NULL; + newarg.nthreads = args -> nthreads; + + blocking = (n / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + if (blocking > GEMM_Q) blocking = GEMM_Q; + + for (i = 0; i < n; i += blocking) { + bk = n - i; + if (bk > blocking) bk = blocking; + + newarg.m = bk; + newarg.n = bk; + newarg.a = a + (i + i * lda) * COMPSIZE; + + info = CNAME(&newarg, NULL, NULL, sa, sb, 0); + if (info) return info + i; + + if (n - i - bk > 0) { +#ifndef USE_SIMPLE_THREADED_LEVEL3 + newarg.m = n - i - bk; + newarg.k = bk; +#ifndef LOWER + newarg.b = a + ( i + (i + bk) * lda) * COMPSIZE; +#else + newarg.b = a + ((i + bk) + i * lda) * COMPSIZE; +#endif + newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; + + thread_driver(&newarg, sa, sb); +#else + +#ifndef LOWER + newarg.m = bk; + newarg.n = n - i - bk; + newarg.a = a + (i + i * lda) * COMPSIZE; + newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; + + gemm_thread_n(mode | BLAS_TRANSA_T, + &newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads); + + newarg.n = n - i - bk; + newarg.k = bk; + newarg.a = a + ( i + (i + bk) * lda) * COMPSIZE; + newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; + +#if 0 + HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0); +#else + syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, + &newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads); +#endif +#else + newarg.m = n - i - bk; + newarg.n = bk; + newarg.a = a + (i + i * lda) * COMPSIZE; + newarg.b = a + (i + bk + i * lda) * COMPSIZE; + + gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO, + &newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads); + + newarg.n = n - i - bk; + newarg.k = bk; + newarg.a = a + (i + bk + i * lda) * COMPSIZE; + newarg.c = a + (i + bk + (i + bk) * lda) * COMPSIZE; + +#if 0 + HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); +#else + syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO, + &newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads); +#endif +#endif + +#endif + } + } + return 0; +} diff --git a/lapack/trti2/._Makefile b/lapack/trti2/._Makefile new file mode 100644 index 0000000..5b9c080 Binary files /dev/null and b/lapack/trti2/._Makefile differ diff --git a/lapack/trti2/._trti2_L.c b/lapack/trti2/._trti2_L.c new file mode 100644 index 0000000..f283e16 Binary files /dev/null and b/lapack/trti2/._trti2_L.c differ diff --git a/lapack/trti2/._trti2_U.c b/lapack/trti2/._trti2_U.c new file mode 100644 index 0000000..5d2f5f4 Binary files /dev/null and b/lapack/trti2/._trti2_U.c differ diff --git a/lapack/trti2/._ztrti2_L.c b/lapack/trti2/._ztrti2_L.c new file mode 100644 index 0000000..2125d47 Binary files /dev/null and b/lapack/trti2/._ztrti2_L.c differ diff --git a/lapack/trti2/._ztrti2_U.c b/lapack/trti2/._ztrti2_U.c new file mode 100644 index 0000000..7834473 Binary files /dev/null and b/lapack/trti2/._ztrti2_U.c differ diff --git a/lapack/trti2/Makefile b/lapack/trti2/Makefile new file mode 100644 index 0000000..45251fb --- /dev/null +++ b/lapack/trti2/Makefile @@ -0,0 +1,155 @@ +TOPDIR = ../.. +include ../../Makefile.system + +SBLASOBJS = strti2_UU.$(SUFFIX) strti2_UN.$(SUFFIX) strti2_LU.$(SUFFIX) strti2_LN.$(SUFFIX) +DBLASOBJS = dtrti2_UU.$(SUFFIX) dtrti2_UN.$(SUFFIX) dtrti2_LU.$(SUFFIX) dtrti2_LN.$(SUFFIX) +QBLASOBJS = qtrti2_UU.$(SUFFIX) qtrti2_UN.$(SUFFIX) qtrti2_LU.$(SUFFIX) qtrti2_LN.$(SUFFIX) +CBLASOBJS = ctrti2_UU.$(SUFFIX) ctrti2_UN.$(SUFFIX) ctrti2_LU.$(SUFFIX) ctrti2_LN.$(SUFFIX) +ZBLASOBJS = ztrti2_UU.$(SUFFIX) ztrti2_UN.$(SUFFIX) ztrti2_LU.$(SUFFIX) ztrti2_LN.$(SUFFIX) +XBLASOBJS = xtrti2_UU.$(SUFFIX) xtrti2_UN.$(SUFFIX) xtrti2_LU.$(SUFFIX) xtrti2_LN.$(SUFFIX) + +strti2_UU.$(SUFFIX) : trti2_U.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +strti2_UN.$(SUFFIX) : trti2_U.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +strti2_LU.$(SUFFIX) : trti2_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +strti2_LN.$(SUFFIX) : trti2_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +dtrti2_UU.$(SUFFIX) : trti2_U.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +dtrti2_UN.$(SUFFIX) : trti2_U.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +dtrti2_LU.$(SUFFIX) : trti2_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +dtrti2_LN.$(SUFFIX) : trti2_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +qtrti2_UU.$(SUFFIX) : trti2_U.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +qtrti2_UN.$(SUFFIX) : trti2_U.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +qtrti2_LU.$(SUFFIX) : trti2_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +qtrti2_LN.$(SUFFIX) : trti2_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +ctrti2_UU.$(SUFFIX) : ztrti2_U.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +ctrti2_UN.$(SUFFIX) : ztrti2_U.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +ctrti2_LU.$(SUFFIX) : ztrti2_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +ctrti2_LN.$(SUFFIX) : ztrti2_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +ztrti2_UU.$(SUFFIX) : ztrti2_U.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +ztrti2_UN.$(SUFFIX) : ztrti2_U.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +ztrti2_LU.$(SUFFIX) : ztrti2_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +ztrti2_LN.$(SUFFIX) : ztrti2_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +xtrti2_UU.$(SUFFIX) : ztrti2_U.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +xtrti2_UN.$(SUFFIX) : ztrti2_U.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +xtrti2_LU.$(SUFFIX) : ztrti2_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +xtrti2_LN.$(SUFFIX) : ztrti2_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +strti2_UU.$(PSUFFIX) : trti2_U.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +strti2_UN.$(PSUFFIX) : trti2_U.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +strti2_LU.$(PSUFFIX) : trti2_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +strti2_LN.$(PSUFFIX) : trti2_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +dtrti2_UU.$(PSUFFIX) : trti2_U.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +dtrti2_UN.$(PSUFFIX) : trti2_U.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +dtrti2_LU.$(PSUFFIX) : trti2_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +dtrti2_LN.$(PSUFFIX) : trti2_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +qtrti2_UU.$(PSUFFIX) : trti2_U.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +qtrti2_UN.$(PSUFFIX) : trti2_U.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +qtrti2_LU.$(PSUFFIX) : trti2_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +qtrti2_LN.$(PSUFFIX) : trti2_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +ctrti2_UU.$(PSUFFIX) : ztrti2_U.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +ctrti2_UN.$(PSUFFIX) : ztrti2_U.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +ctrti2_LU.$(PSUFFIX) : ztrti2_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +ctrti2_LN.$(PSUFFIX) : ztrti2_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +ztrti2_UU.$(PSUFFIX) : ztrti2_U.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +ztrti2_UN.$(PSUFFIX) : ztrti2_U.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +ztrti2_LU.$(PSUFFIX) : ztrti2_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +ztrti2_LN.$(PSUFFIX) : ztrti2_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +xtrti2_UU.$(PSUFFIX) : ztrti2_U.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +xtrti2_UN.$(PSUFFIX) : ztrti2_U.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +xtrti2_LU.$(PSUFFIX) : ztrti2_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +xtrti2_LN.$(PSUFFIX) : ztrti2_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +include ../../Makefile.tail diff --git a/lapack/trti2/trti2_L.c b/lapack/trti2/trti2_L.c new file mode 100644 index 0000000..47fb53d --- /dev/null +++ b/lapack/trti2/trti2_L.c @@ -0,0 +1,86 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef UNIT +#define TRMV TRMV_NLU +#else +#define TRMV TRMV_NLN +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + FLOAT ajj; + BLASLONG j; + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + for (j = n - 1; j >= 0; j--) { + + ajj = ONE; + +#ifndef UNIT + ajj /= *(a + j + j * lda); + *(a + j + j * lda) = ajj; +#endif + + TRMV (n - j - 1, + a + (j + 1) + (j + 1) * lda, lda, + a + (j + 1) + j * lda, + 1, sb); + + SCAL_K(n - j - 1, 0, 0, + -ajj, + a + (j + 1) + j * lda, 1, + NULL, 0, NULL, 0); + } + + return 0; +} diff --git a/lapack/trti2/trti2_U.c b/lapack/trti2/trti2_U.c new file mode 100644 index 0000000..f43cecd --- /dev/null +++ b/lapack/trti2/trti2_U.c @@ -0,0 +1,87 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef UNIT +#define TRMV TRMV_NUU +#else +#define TRMV TRMV_NUN +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + FLOAT ajj; + BLASLONG j; + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + for (j = 0; j < n; j++) { + + ajj = ONE; + +#ifndef UNIT + ajj /= *(a + j + j * lda); + *(a + j + j * lda) = ajj; +#endif + + TRMV (j, + a , lda, + a + j * lda, 1, + sb); + + SCAL_K(j, 0, 0, + -ajj, + a + j * lda, 1, + NULL, 0, NULL, 0); + + } + + return 0; +} diff --git a/lapack/trti2/ztrti2_L.c b/lapack/trti2/ztrti2_L.c new file mode 100644 index 0000000..fd19be2 --- /dev/null +++ b/lapack/trti2/ztrti2_L.c @@ -0,0 +1,105 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef UNIT +#define ZTRMV ZTRMV_NLU +#else +#define ZTRMV ZTRMV_NLN +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + FLOAT ajj_r, ajj_i; +#ifndef UNIT + FLOAT ratio, den; +#endif + BLASLONG j; + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + for (j = n - 1; j >= 0; j--) { + + ajj_r = ONE; + ajj_i = ZERO; + +#ifndef UNIT + ajj_r = *(a + (j + j * lda) * COMPSIZE + 0); + ajj_i = *(a + (j + j * lda) * COMPSIZE + 1); + + if (fabs(ajj_r) >= fabs(ajj_i)){ + ratio = ajj_i / ajj_r; + den = 1. / (ajj_r * ( 1 + ratio * ratio)); + ajj_r = den; + ajj_i = -ratio * den; + } else { + ratio = ajj_r / ajj_i; + den = 1. /(ajj_i * ( 1 + ratio * ratio)); + ajj_r = ratio * den; + ajj_i = -den; + } + + *(a + (j + j * lda) * COMPSIZE + 0) = ajj_r; + *(a + (j + j * lda) * COMPSIZE + 1) = ajj_i; +#endif + + ZTRMV (n - j - 1, + a + ((j + 1) + (j + 1) * lda) * COMPSIZE, lda, + a + ((j + 1) + j * lda) * COMPSIZE, 1, + sb); + + SCAL_K(n - j - 1, 0, 0, + -ajj_r, -ajj_i, + a + ((j + 1) + j * lda) * COMPSIZE, 1, + NULL, 0, NULL, 0); + } + + return 0; +} diff --git a/lapack/trti2/ztrti2_U.c b/lapack/trti2/ztrti2_U.c new file mode 100644 index 0000000..d85b327 --- /dev/null +++ b/lapack/trti2/ztrti2_U.c @@ -0,0 +1,107 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef UNIT +#define ZTRMV ZTRMV_NUU +#else +#define ZTRMV ZTRMV_NUN +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + FLOAT ajj_r, ajj_i; +#ifndef UNIT + FLOAT ratio, den; +#endif + BLASLONG j; + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + for (j = 0; j < n; j++) { + + ajj_r = ONE; + ajj_i = ZERO; + +#ifndef UNIT + ajj_r = *(a + (j + j * lda) * COMPSIZE + 0); + ajj_i = *(a + (j + j * lda) * COMPSIZE + 1); + + + if (fabs(ajj_r) >= fabs(ajj_i)){ + ratio = ajj_i / ajj_r; + den = 1. / (ajj_r * ( 1 + ratio * ratio)); + ajj_r = den; + ajj_i = -ratio * den; + } else { + ratio = ajj_r / ajj_i; + den = 1. /(ajj_i * ( 1 + ratio * ratio)); + ajj_r = ratio * den; + ajj_i = -den; + } + + *(a + (j + j * lda) * COMPSIZE + 0) = ajj_r; + *(a + (j + j * lda) * COMPSIZE + 1) = ajj_i; +#endif + + ZTRMV (j, + a , lda, + a + j * lda * COMPSIZE, 1, + sb); + + SCAL_K(j, 0, 0, + -ajj_r, -ajj_i, + a + j * lda * COMPSIZE, 1, + NULL, 0, NULL, 0); + + } + + return 0; +} diff --git a/lapack/trtri/._Makefile b/lapack/trtri/._Makefile new file mode 100644 index 0000000..d77a3c4 Binary files /dev/null and b/lapack/trtri/._Makefile differ diff --git a/lapack/trtri/._trtri_L_parallel.c b/lapack/trtri/._trtri_L_parallel.c new file mode 100644 index 0000000..519ab11 Binary files /dev/null and b/lapack/trtri/._trtri_L_parallel.c differ diff --git a/lapack/trtri/._trtri_L_single.c b/lapack/trtri/._trtri_L_single.c new file mode 100644 index 0000000..f9d3c64 Binary files /dev/null and b/lapack/trtri/._trtri_L_single.c differ diff --git a/lapack/trtri/._trtri_U_parallel.c b/lapack/trtri/._trtri_U_parallel.c new file mode 100644 index 0000000..bb5358c Binary files /dev/null and b/lapack/trtri/._trtri_U_parallel.c differ diff --git a/lapack/trtri/._trtri_U_single.c b/lapack/trtri/._trtri_U_single.c new file mode 100644 index 0000000..1f09ce9 Binary files /dev/null and b/lapack/trtri/._trtri_U_single.c differ diff --git a/lapack/trtri/Makefile b/lapack/trtri/Makefile new file mode 100644 index 0000000..722f112 --- /dev/null +++ b/lapack/trtri/Makefile @@ -0,0 +1,313 @@ +TOPDIR = ../.. +include ../../Makefile.system + +SBLASOBJS = strtri_UU_single.$(SUFFIX) strtri_UN_single.$(SUFFIX) strtri_LU_single.$(SUFFIX) strtri_LN_single.$(SUFFIX) + +DBLASOBJS = dtrtri_UU_single.$(SUFFIX) dtrtri_UN_single.$(SUFFIX) dtrtri_LU_single.$(SUFFIX) dtrtri_LN_single.$(SUFFIX) + +QBLASOBJS = qtrtri_UU_single.$(SUFFIX) qtrtri_UN_single.$(SUFFIX) qtrtri_LU_single.$(SUFFIX) qtrtri_LN_single.$(SUFFIX) + +CBLASOBJS = ctrtri_UU_single.$(SUFFIX) ctrtri_UN_single.$(SUFFIX) ctrtri_LU_single.$(SUFFIX) ctrtri_LN_single.$(SUFFIX) + +ZBLASOBJS = ztrtri_UU_single.$(SUFFIX) ztrtri_UN_single.$(SUFFIX) ztrtri_LU_single.$(SUFFIX) ztrtri_LN_single.$(SUFFIX) + +XBLASOBJS = xtrtri_UU_single.$(SUFFIX) xtrtri_UN_single.$(SUFFIX) xtrtri_LU_single.$(SUFFIX) xtrtri_LN_single.$(SUFFIX) + +ifdef SMP +SBLASOBJS += strtri_UU_parallel.$(SUFFIX) strtri_UN_parallel.$(SUFFIX) strtri_LU_parallel.$(SUFFIX) strtri_LN_parallel.$(SUFFIX) +DBLASOBJS += dtrtri_UU_parallel.$(SUFFIX) dtrtri_UN_parallel.$(SUFFIX) dtrtri_LU_parallel.$(SUFFIX) dtrtri_LN_parallel.$(SUFFIX) +QBLASOBJS += qtrtri_UU_parallel.$(SUFFIX) qtrtri_UN_parallel.$(SUFFIX) qtrtri_LU_parallel.$(SUFFIX) qtrtri_LN_parallel.$(SUFFIX) +CBLASOBJS += ctrtri_UU_parallel.$(SUFFIX) ctrtri_UN_parallel.$(SUFFIX) ctrtri_LU_parallel.$(SUFFIX) ctrtri_LN_parallel.$(SUFFIX) +ZBLASOBJS += ztrtri_UU_parallel.$(SUFFIX) ztrtri_UN_parallel.$(SUFFIX) ztrtri_LU_parallel.$(SUFFIX) ztrtri_LN_parallel.$(SUFFIX) +XBLASOBJS += xtrtri_UU_parallel.$(SUFFIX) xtrtri_UN_parallel.$(SUFFIX) xtrtri_LU_parallel.$(SUFFIX) xtrtri_LN_parallel.$(SUFFIX) +endif + +strtri_UU_single.$(SUFFIX) : trtri_U_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +strtri_UN_single.$(SUFFIX) : trtri_U_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +strtri_LU_single.$(SUFFIX) : trtri_L_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +strtri_LN_single.$(SUFFIX) : trtri_L_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +strtri_UU_parallel.$(SUFFIX) : trtri_U_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +strtri_UN_parallel.$(SUFFIX) : trtri_U_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +strtri_LU_parallel.$(SUFFIX) : trtri_L_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +strtri_LN_parallel.$(SUFFIX) : trtri_L_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +dtrtri_UU_single.$(SUFFIX) : trtri_U_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +dtrtri_UN_single.$(SUFFIX) : trtri_U_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +dtrtri_LU_single.$(SUFFIX) : trtri_L_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +dtrtri_LN_single.$(SUFFIX) : trtri_L_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +dtrtri_UU_parallel.$(SUFFIX) : trtri_U_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +dtrtri_UN_parallel.$(SUFFIX) : trtri_U_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +dtrtri_LU_parallel.$(SUFFIX) : trtri_L_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +dtrtri_LN_parallel.$(SUFFIX) : trtri_L_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +qtrtri_UU_single.$(SUFFIX) : trtri_U_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +qtrtri_UN_single.$(SUFFIX) : trtri_U_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +qtrtri_LU_single.$(SUFFIX) : trtri_L_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +qtrtri_LN_single.$(SUFFIX) : trtri_L_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +qtrtri_UU_parallel.$(SUFFIX) : trtri_U_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +qtrtri_UN_parallel.$(SUFFIX) : trtri_U_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +qtrtri_LU_parallel.$(SUFFIX) : trtri_L_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +qtrtri_LN_parallel.$(SUFFIX) : trtri_L_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +ctrtri_UU_single.$(SUFFIX) : trtri_U_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +ctrtri_UN_single.$(SUFFIX) : trtri_U_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +ctrtri_LU_single.$(SUFFIX) : trtri_L_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +ctrtri_LN_single.$(SUFFIX) : trtri_L_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +ctrtri_UU_parallel.$(SUFFIX) : trtri_U_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +ctrtri_UN_parallel.$(SUFFIX) : trtri_U_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +ctrtri_LU_parallel.$(SUFFIX) : trtri_L_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +ctrtri_LN_parallel.$(SUFFIX) : trtri_L_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +ztrtri_UU_single.$(SUFFIX) : trtri_U_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +ztrtri_UN_single.$(SUFFIX) : trtri_U_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +ztrtri_LU_single.$(SUFFIX) : trtri_L_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +ztrtri_LN_single.$(SUFFIX) : trtri_L_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +ztrtri_UU_parallel.$(SUFFIX) : trtri_U_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +ztrtri_UN_parallel.$(SUFFIX) : trtri_U_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +ztrtri_LU_parallel.$(SUFFIX) : trtri_L_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +ztrtri_LN_parallel.$(SUFFIX) : trtri_L_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +xtrtri_UU_single.$(SUFFIX) : trtri_U_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +xtrtri_UN_single.$(SUFFIX) : trtri_U_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +xtrtri_LU_single.$(SUFFIX) : trtri_L_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +xtrtri_LN_single.$(SUFFIX) : trtri_L_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +xtrtri_UU_parallel.$(SUFFIX) : trtri_U_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +xtrtri_UN_parallel.$(SUFFIX) : trtri_U_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +xtrtri_LU_parallel.$(SUFFIX) : trtri_L_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +xtrtri_LN_parallel.$(SUFFIX) : trtri_L_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +strtri_UU_single.$(PSUFFIX) : trtri_U_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +strtri_UN_single.$(PSUFFIX) : trtri_U_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +strtri_LU_single.$(PSUFFIX) : trtri_L_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +strtri_LN_single.$(PSUFFIX) : trtri_L_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +strtri_UU_parallel.$(PSUFFIX) : trtri_U_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +strtri_UN_parallel.$(PSUFFIX) : trtri_U_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +strtri_LU_parallel.$(PSUFFIX) : trtri_L_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +strtri_LN_parallel.$(PSUFFIX) : trtri_L_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +dtrtri_UU_single.$(PSUFFIX) : trtri_U_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +dtrtri_UN_single.$(PSUFFIX) : trtri_U_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +dtrtri_LU_single.$(PSUFFIX) : trtri_L_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +dtrtri_LN_single.$(PSUFFIX) : trtri_L_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +dtrtri_UU_parallel.$(PSUFFIX) : trtri_U_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +dtrtri_UN_parallel.$(PSUFFIX) : trtri_U_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +dtrtri_LU_parallel.$(PSUFFIX) : trtri_L_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +dtrtri_LN_parallel.$(PSUFFIX) : trtri_L_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +qtrtri_UU_single.$(PSUFFIX) : trtri_U_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +qtrtri_UN_single.$(PSUFFIX) : trtri_U_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +qtrtri_LU_single.$(PSUFFIX) : trtri_L_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +qtrtri_LN_single.$(PSUFFIX) : trtri_L_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +qtrtri_UU_parallel.$(PSUFFIX) : trtri_U_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +qtrtri_UN_parallel.$(PSUFFIX) : trtri_U_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +qtrtri_LU_parallel.$(PSUFFIX) : trtri_L_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +qtrtri_LN_parallel.$(PSUFFIX) : trtri_L_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +ctrtri_UU_single.$(PSUFFIX) : trtri_U_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +ctrtri_UN_single.$(PSUFFIX) : trtri_U_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +ctrtri_LU_single.$(PSUFFIX) : trtri_L_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +ctrtri_LN_single.$(PSUFFIX) : trtri_L_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +ctrtri_UU_parallel.$(PSUFFIX) : trtri_U_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +ctrtri_UN_parallel.$(PSUFFIX) : trtri_U_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +ctrtri_LU_parallel.$(PSUFFIX) : trtri_L_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +ctrtri_LN_parallel.$(PSUFFIX) : trtri_L_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +ztrtri_UU_single.$(PSUFFIX) : trtri_U_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +ztrtri_UN_single.$(PSUFFIX) : trtri_U_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +ztrtri_LU_single.$(PSUFFIX) : trtri_L_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +ztrtri_LN_single.$(PSUFFIX) : trtri_L_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +ztrtri_UU_parallel.$(PSUFFIX) : trtri_U_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +ztrtri_UN_parallel.$(PSUFFIX) : trtri_U_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +ztrtri_LU_parallel.$(PSUFFIX) : trtri_L_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +ztrtri_LN_parallel.$(PSUFFIX) : trtri_L_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +xtrtri_UU_single.$(PSUFFIX) : trtri_U_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +xtrtri_UN_single.$(PSUFFIX) : trtri_U_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +xtrtri_LU_single.$(PSUFFIX) : trtri_L_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +xtrtri_LN_single.$(PSUFFIX) : trtri_L_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +xtrtri_UU_parallel.$(PSUFFIX) : trtri_U_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +xtrtri_UN_parallel.$(PSUFFIX) : trtri_U_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +xtrtri_LU_parallel.$(PSUFFIX) : trtri_L_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +xtrtri_LN_parallel.$(PSUFFIX) : trtri_L_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +include ../../Makefile.tail diff --git a/lapack/trtri/trtri_L_parallel.c b/lapack/trtri/trtri_L_parallel.c new file mode 100644 index 0000000..5969eb6 --- /dev/null +++ b/lapack/trtri/trtri_L_parallel.c @@ -0,0 +1,151 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef UNIT +#define TRTI2 TRTI2_LU +#define TRMM TRMM_LNLU +#define TRSM TRSM_RNLU +#else +#define TRTI2 TRTI2_LN +#define TRMM TRMM_LNLN +#define TRSM TRSM_RNLN +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) { + + BLASLONG n, info; + BLASLONG bk, i, blocking, start_i; + int mode; + BLASLONG lda, range_N[2]; + blas_arg_t newarg; + FLOAT *a; + FLOAT alpha[2] = { ONE, ZERO}; + FLOAT beta [2] = {-ONE, ZERO}; + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) n = range_n[1] - range_n[0]; + + if (n <= DTB_ENTRIES) { + info = TRTI2(args, NULL, range_n, sa, sb, 0); + return info; + } + + blocking = GEMM_Q; + if (n < 4 * GEMM_Q) blocking = (n + 3) / 4; + + start_i = 0; + while (start_i < n) start_i += blocking; + start_i -= blocking; + + for (i = start_i; i >= 0; i -= blocking) { + bk = n - i; + if (bk > blocking) bk = blocking; + + range_N[0] = i; + range_N[1] = i + bk; + + newarg.lda = lda; + newarg.ldb = lda; + newarg.ldc = lda; + newarg.alpha = alpha; + + newarg.m = n - bk - i; + newarg.n = bk; + newarg.a = a + ( i + i * lda) * COMPSIZE; + newarg.b = a + ((i + bk) + i * lda) * COMPSIZE; + + newarg.beta = beta; + newarg.nthreads = args -> nthreads; + + gemm_thread_m(mode, &newarg, NULL, NULL, TRSM, sa, sb, args -> nthreads); + + newarg.m = bk; + newarg.n = bk; + + newarg.a = a + (i + i * lda) * COMPSIZE; + + CNAME (&newarg, NULL, NULL, sa, sb, 0); + + newarg.m = n - bk - i; + newarg.n = i; + newarg.k = bk; + + newarg.a = a + (i + bk + i * lda) * COMPSIZE; + newarg.b = a + (i ) * COMPSIZE; + newarg.c = a + (i + bk ) * COMPSIZE; + + newarg.beta = NULL; + + gemm_thread_n(mode, &newarg, NULL, NULL, GEMM_NN, sa, sb, args -> nthreads); + + newarg.a = a + (i + i * lda) * COMPSIZE; + newarg.b = a + (i ) * COMPSIZE; + + newarg.m = bk; + newarg.n = i; + + gemm_thread_n(mode, &newarg, NULL, NULL, TRMM, sa, sb, args -> nthreads); + } + + + return 0; +} diff --git a/lapack/trtri/trtri_L_single.c b/lapack/trtri/trtri_L_single.c new file mode 100644 index 0000000..a940ce2 --- /dev/null +++ b/lapack/trtri/trtri_L_single.c @@ -0,0 +1,190 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +static FLOAT dp1 = 1.; +static FLOAT dm1 = -1.; + +#ifdef UNIT +#define TRTI2 TRTI2_LU +#else +#define TRTI2 TRTI2_LN +#endif + +#if 0 +#undef GEMM_P +#undef GEMM_Q +#undef GEMM_R + +#define GEMM_P 8 +#define GEMM_Q 20 +#define GEMM_R 64 +#endif + +#define GEMM_PQ MAX(GEMM_P, GEMM_Q) +#define REAL_GEMM_R (GEMM_R - 2 * GEMM_PQ) + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + BLASLONG i, is, min_i, start_i; + BLASLONG ls, min_l; + BLASLONG bk; + BLASLONG blocking; + BLASLONG range_N[2]; + + FLOAT *sa_trsm = (FLOAT *)((BLASLONG)sb); + FLOAT *sa_trmm = (FLOAT *)((((BLASLONG)sb + + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) + + GEMM_OFFSET_A); + FLOAT *sb_gemm = (FLOAT *)((((BLASLONG)sa_trmm + + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) + + GEMM_OFFSET_B); + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + if (n <= DTB_ENTRIES) { + TRTI2(args, NULL, range_n, sa, sb, 0); + return 0; + } + + blocking = GEMM_Q; + if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4; + + start_i = 0; + while (start_i < n) start_i += blocking; + start_i -= blocking; + + for (i = start_i; i >= 0; i -= blocking) { + bk = MIN(blocking, n - i); + + if (n - bk - i > 0) TRSM_OLNCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, sa_trsm); + + if (!range_n) { + range_N[0] = i; + range_N[1] = i + bk; + } else { + range_N[0] = range_n[0] + i; + range_N[1] = range_n[0] + i + bk; + } + + CNAME(args, NULL, range_N, sa, sa_trmm, 0); + + if (i > 0) { + TRMM_ILTCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, 0, sa_trmm); + + for (ls = 0; ls < i; ls += REAL_GEMM_R) { + min_l = i - ls; + if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R; + + GEMM_ONCOPY (bk, min_l, a + (i + ls * lda) * COMPSIZE, lda, sb_gemm); + + if (n - bk - i > 0) { + for (is = i + bk; is < n; is += GEMM_P) { + min_i = n - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + if (ls == 0) { + NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); + + TRSM_KERNEL_RT(min_i, bk, bk, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sa_trsm, + a + (is + i * lda) * COMPSIZE, lda, 0); + } else { + GEMM_ITCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); + } + + GEMM_KERNEL_N(min_i, min_l, bk, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb_gemm, + a + (is + ls * lda) * COMPSIZE, lda); + } + } + + for (is = 0; is < bk; is += GEMM_P) { + min_i = bk - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + TRMM_KERNEL_LT(min_i, min_l, bk, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa_trmm + is * bk * COMPSIZE, sb_gemm, + a + (i + is + ls * lda) * COMPSIZE, lda, is); + } + } + + } else { + + if (n - bk - i > 0) { + for (is = 0; is < n - bk - i; is += GEMM_P) { + min_i = n - bk - i - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + NEG_TCOPY (bk, min_i, a + (i + bk + is + i * lda) * COMPSIZE, lda, sa); + + TRSM_KERNEL_RT(min_i, bk, bk, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sa_trsm, + a + (i + bk + is + i * lda) * COMPSIZE, lda, 0); + } + } + + } + } + + return 0; +} diff --git a/lapack/trtri/trtri_U_parallel.c b/lapack/trtri/trtri_U_parallel.c new file mode 100644 index 0000000..8761a40 --- /dev/null +++ b/lapack/trtri/trtri_U_parallel.c @@ -0,0 +1,147 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef UNIT +#define TRTI2 TRTI2_UU +#define TRMM TRMM_LNUU +#define TRSM TRSM_RNUU +#else +#define TRTI2 TRTI2_UN +#define TRMM TRMM_LNUN +#define TRSM TRSM_RNUN +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) { + + BLASLONG n, info; + BLASLONG bk, i, blocking; + int mode; + BLASLONG lda, range_N[2]; + blas_arg_t newarg; + FLOAT *a; + FLOAT alpha[2] = { ONE, ZERO}; + FLOAT beta [2] = {-ONE, ZERO}; + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) n = range_n[1] - range_n[0]; + + if (n <= DTB_ENTRIES) { + info = TRTI2(args, NULL, range_n, sa, sb, 0); + return info; + } + + blocking = GEMM_Q; + if (n < 4 * GEMM_Q) blocking = (n + 3) / 4; + + for (i = 0; i < n; i += blocking) { + bk = n - i; + if (bk > blocking) bk = blocking; + + range_N[0] = i; + range_N[1] = i + bk; + + newarg.lda = lda; + newarg.ldb = lda; + newarg.ldc = lda; + newarg.alpha = alpha; + + newarg.m = i; + newarg.n = bk; + newarg.a = a + (i + i * lda) * COMPSIZE; + newarg.b = a + ( i * lda) * COMPSIZE; + + newarg.beta = beta; + newarg.nthreads = args -> nthreads; + + gemm_thread_m(mode, &newarg, NULL, NULL, TRSM, sa, sb, args -> nthreads); + + newarg.m = bk; + newarg.n = bk; + + newarg.a = a + (i + i * lda) * COMPSIZE; + + CNAME (&newarg, NULL, NULL, sa, sb, 0); + + newarg.m = i; + newarg.n = n - i - bk; + newarg.k = bk; + + newarg.a = a + ( i * lda) * COMPSIZE; + newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; + newarg.c = a + ( (i + bk) * lda) * COMPSIZE; + + newarg.beta = NULL; + + gemm_thread_n(mode, &newarg, NULL, NULL, GEMM_NN, sa, sb, args -> nthreads); + + newarg.a = a + (i + i * lda) * COMPSIZE; + newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; + + newarg.m = bk; + newarg.n = n - i - bk; + + gemm_thread_n(mode, &newarg, NULL, NULL, TRMM, sa, sb, args -> nthreads); + + } + + return 0; +} diff --git a/lapack/trtri/trtri_U_single.c b/lapack/trtri/trtri_U_single.c new file mode 100644 index 0000000..72133d8 --- /dev/null +++ b/lapack/trtri/trtri_U_single.c @@ -0,0 +1,188 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +static FLOAT dp1 = 1.; +static FLOAT dm1 = -1.; + +#ifdef UNIT +#define TRTI2 TRTI2_UU +#else +#define TRTI2 TRTI2_UN +#endif + +#if 0 +#undef GEMM_P +#undef GEMM_Q +#undef GEMM_R + +#define GEMM_P 8 +#define GEMM_Q 20 +#define GEMM_R 64 +#endif + +#define GEMM_PQ MAX(GEMM_P, GEMM_Q) +#define REAL_GEMM_R (GEMM_R - 2 * GEMM_PQ) + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + BLASLONG i, is, min_i, start_is; + BLASLONG ls, min_l; + BLASLONG bk; + BLASLONG blocking; + BLASLONG range_N[2]; + + FLOAT *sa_trsm = (FLOAT *)((BLASLONG)sb); + FLOAT *sa_trmm = (FLOAT *)((((BLASLONG)sb + + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) + + GEMM_OFFSET_A); + FLOAT *sb_gemm = (FLOAT *)((((BLASLONG)sa_trmm + + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) + + GEMM_OFFSET_B); + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + if (n <= DTB_ENTRIES) { + TRTI2(args, NULL, range_n, sa, sb, 0); + return 0; + } + + blocking = GEMM_Q; + if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4; + + for (i = 0; i < n; i += blocking) { + bk = MIN(blocking, n - i); + + if (i > 0) TRSM_OUNCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, sa_trsm); + + if (!range_n) { + range_N[0] = i; + range_N[1] = i + bk; + } else { + range_N[0] = range_n[0] + i; + range_N[1] = range_n[0] + i + bk; + } + + CNAME(args, NULL, range_N, sa, sa_trmm, 0); + + if (n -bk - i > 0) { + TRMM_IUTCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, 0, sa_trmm); + + for (ls = i + bk; ls < n; ls += REAL_GEMM_R) { + min_l = n - ls; + if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R; + + GEMM_ONCOPY (bk, min_l, a + (i + ls * lda) * COMPSIZE, lda, sb_gemm); + + if (i > 0) { + for (is = 0; is < i; is += GEMM_P) { + min_i = i - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + if (ls == i + bk) { + NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); + + TRSM_KERNEL_RN(min_i, bk, bk, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sa_trsm, + a + (is + i * lda) * COMPSIZE, lda, 0); + } else { + GEMM_ITCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); + } + + GEMM_KERNEL_N(min_i, min_l, bk, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb_gemm, + a + (is + ls * lda) * COMPSIZE, lda); + } + } + + start_is = 0; + while (start_is < bk) start_is += GEMM_P; + start_is -= GEMM_P; + + for (is = 0; is < bk; is += GEMM_P) { + min_i = bk - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + TRMM_KERNEL_LN(min_i, min_l, bk, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa_trmm + is * bk * COMPSIZE, sb_gemm, + a + (i + is + ls * lda) * COMPSIZE, lda, is); + } + } + + } else { + if (i > 0) { + for (is = 0; is < i; is += GEMM_P) { + min_i = i - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); + + TRSM_KERNEL_RN(min_i, bk, bk, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sa_trsm, + a + (is + i * lda) * COMPSIZE, lda, 0); + } + } + } + } + + return 0; +} diff --git a/make.inc b/make.inc new file mode 100644 index 0000000..3000423 --- /dev/null +++ b/make.inc @@ -0,0 +1,11 @@ +SHELL = /bin/sh +PLAT = _LINUX +DRVOPTS = $(OPTS) +LOADER = $(FORTRAN) +TIMER = NONE +ARCHFLAGS= -ru +RANLIB = ranlib +BLASLIB = +TMGLIB = tmglib.a +EIGSRCLIB = eigsrc.a +LINSRCLIB = linsrc.a diff --git a/param.h b/param.h new file mode 100644 index 0000000..d8dbcfa --- /dev/null +++ b/param.h @@ -0,0 +1,1543 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef PARAM_H +#define PARAM_H + +#ifdef OPTERON + +#define SNUMOPT 4 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 64 +#define GEMM_DEFAULT_OFFSET_B 256 +#define GEMM_DEFAULT_ALIGN 0x01ffffUL + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 +#endif + +#define SGEMM_DEFAULT_P sgemm_p +#define DGEMM_DEFAULT_P dgemm_p +#define QGEMM_DEFAULT_P qgemm_p +#define CGEMM_DEFAULT_P cgemm_p +#define ZGEMM_DEFAULT_P zgemm_p +#define XGEMM_DEFAULT_P xgemm_p + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_R xgemm_r + +#ifdef ALLOC_HUGETLB + +#define SGEMM_DEFAULT_Q 248 +#define DGEMM_DEFAULT_Q 248 +#define QGEMM_DEFAULT_Q 248 +#define CGEMM_DEFAULT_Q 248 +#define ZGEMM_DEFAULT_Q 248 +#define XGEMM_DEFAULT_Q 248 + +#else + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 240 +#define QGEMM_DEFAULT_Q 240 +#define CGEMM_DEFAULT_Q 240 +#define ZGEMM_DEFAULT_Q 240 +#define XGEMM_DEFAULT_Q 240 + +#endif + + +#define SYMV_P 16 +#define HAVE_EXCLUSIVE_CACHE + +#endif + +#if defined(BARCELONA) || defined(SHANGHAI) + +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 64 +#define GEMM_DEFAULT_OFFSET_B 832 +#define GEMM_DEFAULT_ALIGN 0x0fffUL + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 +#endif + +#if 0 +#define SGEMM_DEFAULT_P 496 +#define DGEMM_DEFAULT_P 248 +#define QGEMM_DEFAULT_P 124 +#define CGEMM_DEFAULT_P 248 +#define ZGEMM_DEFAULT_P 124 +#define XGEMM_DEFAULT_P 62 + +#define SGEMM_DEFAULT_Q 248 +#define DGEMM_DEFAULT_Q 248 +#define QGEMM_DEFAULT_Q 248 +#define CGEMM_DEFAULT_Q 248 +#define ZGEMM_DEFAULT_Q 248 +#define XGEMM_DEFAULT_Q 248 + +#else + +#define SGEMM_DEFAULT_P 448 +#define DGEMM_DEFAULT_P 224 +#define QGEMM_DEFAULT_P 112 +#define CGEMM_DEFAULT_P 224 +#define ZGEMM_DEFAULT_P 112 +#define XGEMM_DEFAULT_P 56 + +#define SGEMM_DEFAULT_Q 224 +#define DGEMM_DEFAULT_Q 224 +#define QGEMM_DEFAULT_Q 224 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 224 +#define XGEMM_DEFAULT_Q 224 + +#endif + +#define SGEMM_DEFAULT_R sgemm_r +#define QGEMM_DEFAULT_R qgemm_r +#define DGEMM_DEFAULT_R dgemm_r +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_R xgemm_r + +#define SYMV_P 16 +#define HAVE_EXCLUSIVE_CACHE + +#define GEMM_THREAD gemm_thread_mn + +#endif + +#ifdef ATHLON + +#define SNUMOPT 4 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 384 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_M 1 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_P 208 +#define DGEMM_DEFAULT_P 104 +#define QGEMM_DEFAULT_P 56 +#define CGEMM_DEFAULT_P 104 +#define ZGEMM_DEFAULT_P 56 +#define XGEMM_DEFAULT_P 28 + +#define SGEMM_DEFAULT_Q 208 +#define DGEMM_DEFAULT_Q 208 +#define QGEMM_DEFAULT_Q 208 +#define CGEMM_DEFAULT_Q 208 +#define ZGEMM_DEFAULT_Q 208 +#define XGEMM_DEFAULT_Q 208 + +#define SYMV_P 16 +#define HAVE_EXCLUSIVE_CACHE +#endif + +#ifdef VIAC3 + +#define SNUMOPT 2 +#define DNUMOPT 1 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 256 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_M 1 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define QGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 +#define XGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 512 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 128 +#define XGEMM_DEFAULT_Q 128 + +#define SYMV_P 16 +#endif + +#ifdef NANO + +#define SNUMOPT 4 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 64 +#define GEMM_DEFAULT_OFFSET_B 256 +#define GEMM_DEFAULT_ALIGN 0x01ffffUL + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 +#else +#define SGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 +#endif + +#define SGEMM_DEFAULT_P 288 +#define DGEMM_DEFAULT_P 288 +#define QGEMM_DEFAULT_P 288 +#define CGEMM_DEFAULT_P 288 +#define ZGEMM_DEFAULT_P 288 +#define XGEMM_DEFAULT_P 288 + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_Q 64 +#define CGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q 64 +#define XGEMM_DEFAULT_Q 32 + +#define SYMV_P 16 +#define HAVE_EXCLUSIVE_CACHE + +#endif + +#if defined(PENTIUM) || defined(PENTIUM2) || defined(PENTIUM3) + +#ifdef HAVE_SSE +#define SNUMOPT 2 +#else +#define SNUMOPT 1 +#endif +#define DNUMOPT 1 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#ifdef HAVE_SSE +#define SGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_M 4 +#else +#define SGEMM_DEFAULT_UNROLL_M 4 +#define CGEMM_DEFAULT_UNROLL_M 2 +#endif +#define DGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_N 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 1 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_N 1 +#define XGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_P sgemm_p +#define SGEMM_DEFAULT_Q 256 +#define SGEMM_DEFAULT_R sgemm_r + +#define DGEMM_DEFAULT_P dgemm_p +#define DGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_R dgemm_r + +#define QGEMM_DEFAULT_P qgemm_p +#define QGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_R qgemm_r + +#define CGEMM_DEFAULT_P cgemm_p +#define CGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_R cgemm_r + +#define ZGEMM_DEFAULT_P zgemm_p +#define ZGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_R zgemm_r + +#define XGEMM_DEFAULT_P xgemm_p +#define XGEMM_DEFAULT_Q 256 +#define XGEMM_DEFAULT_R xgemm_r + +#define SYMV_P 4 + +#endif + +#ifdef PENTIUMM + +#define SNUMOPT 2 +#define DNUMOPT 1 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#ifdef CORE_YONAH +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_N 1 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define CGEMM_DEFAULT_UNROLL_N 1 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_N 1 +#define XGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#endif + +#define SGEMM_DEFAULT_P sgemm_p +#define SGEMM_DEFAULT_Q 256 +#define SGEMM_DEFAULT_R sgemm_r + +#define DGEMM_DEFAULT_P dgemm_p +#define DGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_R dgemm_r + +#define QGEMM_DEFAULT_P qgemm_p +#define QGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_R qgemm_r + +#define CGEMM_DEFAULT_P cgemm_p +#define CGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_R cgemm_r + +#define ZGEMM_DEFAULT_P zgemm_p +#define ZGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_R zgemm_r + +#define XGEMM_DEFAULT_P xgemm_p +#define XGEMM_DEFAULT_Q 256 +#define XGEMM_DEFAULT_R xgemm_r + +#define SYMV_P 4 +#endif + +#ifdef CORE_NORTHWOOD + +#define SNUMOPT 4 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 32 + +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SYMV_P 8 + +#define SGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_N 2 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 1 +#define ZGEMM_DEFAULT_UNROLL_N 1 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_P sgemm_p +#define SGEMM_DEFAULT_R sgemm_r + +#define DGEMM_DEFAULT_P dgemm_p +#define DGEMM_DEFAULT_R dgemm_r + +#define QGEMM_DEFAULT_P qgemm_p +#define QGEMM_DEFAULT_R qgemm_r + +#define CGEMM_DEFAULT_P cgemm_p +#define CGEMM_DEFAULT_R cgemm_r + +#define ZGEMM_DEFAULT_P zgemm_p +#define ZGEMM_DEFAULT_R zgemm_r + +#define XGEMM_DEFAULT_P xgemm_p +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q 128 +#define XGEMM_DEFAULT_Q 128 +#endif + +#ifdef CORE_PRESCOTT + +#define SNUMOPT 4 +#define DNUMOPT 2 + +#ifndef __64BIT__ +#define GEMM_DEFAULT_OFFSET_A 128 +#define GEMM_DEFAULT_OFFSET_B 192 +#else +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 256 +#endif + +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SYMV_P 8 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 +#endif + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_P sgemm_p +#define SGEMM_DEFAULT_R sgemm_r + +#define DGEMM_DEFAULT_P dgemm_p +#define DGEMM_DEFAULT_R dgemm_r + +#define QGEMM_DEFAULT_P qgemm_p +#define QGEMM_DEFAULT_R qgemm_r + +#define CGEMM_DEFAULT_P cgemm_p +#define CGEMM_DEFAULT_R cgemm_r + +#define ZGEMM_DEFAULT_P zgemm_p +#define ZGEMM_DEFAULT_R zgemm_r + +#define XGEMM_DEFAULT_P xgemm_p +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q 128 +#define XGEMM_DEFAULT_Q 128 +#endif + +#ifdef CORE2 + +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 448 +#define GEMM_DEFAULT_OFFSET_B 128 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#define SWITCH_RATIO 4 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_N 2 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 1 +#define ZGEMM_DEFAULT_UNROLL_N 1 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define MASK(a, b) ((((a) + (b) - 1) / (b)) * (b)) + +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 +#endif + +#define SGEMM_DEFAULT_P sgemm_p +#define SGEMM_DEFAULT_R sgemm_r + +#define DGEMM_DEFAULT_P dgemm_p +#define DGEMM_DEFAULT_R dgemm_r + +#define QGEMM_DEFAULT_P qgemm_p +#define QGEMM_DEFAULT_R qgemm_r + +#define CGEMM_DEFAULT_P cgemm_p +#define CGEMM_DEFAULT_R cgemm_r + +#define ZGEMM_DEFAULT_P zgemm_p +#define ZGEMM_DEFAULT_R zgemm_r + +#define XGEMM_DEFAULT_P xgemm_p +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 256 +#define XGEMM_DEFAULT_Q 256 + +#endif + +#ifdef PENRYN + +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 128 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#define SWITCH_RATIO 4 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 +#endif + +#define SGEMM_DEFAULT_P sgemm_p +#define SGEMM_DEFAULT_R sgemm_r + +#define DGEMM_DEFAULT_P dgemm_p +#define DGEMM_DEFAULT_R dgemm_r + +#define QGEMM_DEFAULT_P qgemm_p +#define QGEMM_DEFAULT_R qgemm_r + +#define CGEMM_DEFAULT_P cgemm_p +#define CGEMM_DEFAULT_R cgemm_r + +#define ZGEMM_DEFAULT_P zgemm_p +#define ZGEMM_DEFAULT_R zgemm_r + +#define XGEMM_DEFAULT_P xgemm_p +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_Q 512 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 512 +#define ZGEMM_DEFAULT_Q 256 +#define XGEMM_DEFAULT_Q 128 + +#define GETRF_FACTOR 0.75 +#endif + +#ifdef DUNNINGTON + +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 128 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#define SWITCH_RATIO 4 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 +#endif + +#define SGEMM_DEFAULT_P sgemm_p +#define SGEMM_DEFAULT_R sgemm_r + +#define DGEMM_DEFAULT_P dgemm_p +#define DGEMM_DEFAULT_R dgemm_r + +#define QGEMM_DEFAULT_P qgemm_p +#define QGEMM_DEFAULT_R qgemm_r + +#define CGEMM_DEFAULT_P cgemm_p +#define CGEMM_DEFAULT_R cgemm_r + +#define ZGEMM_DEFAULT_P zgemm_p +#define ZGEMM_DEFAULT_R zgemm_r + +#define XGEMM_DEFAULT_P xgemm_p +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_Q 768 +#define DGEMM_DEFAULT_Q 384 +#define QGEMM_DEFAULT_Q 192 +#define CGEMM_DEFAULT_Q 768 +#define ZGEMM_DEFAULT_Q 384 +#define XGEMM_DEFAULT_Q 192 + +#define GETRF_FACTOR 0.75 +#define GEMM_THREAD gemm_thread_mn +#endif + +#ifdef NEHALEM + +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 32 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#define SWITCH_RATIO 4 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 +#else +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_N 8 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define XGEMM_DEFAULT_UNROLL_N 1 +#endif + +#define SGEMM_DEFAULT_P 504 +#define SGEMM_DEFAULT_R sgemm_r + +#define DGEMM_DEFAULT_P 504 +#define DGEMM_DEFAULT_R dgemm_r + +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r + +#define CGEMM_DEFAULT_P 252 +#define CGEMM_DEFAULT_R cgemm_r + +#define ZGEMM_DEFAULT_P 252 +#define ZGEMM_DEFAULT_R zgemm_r + +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_Q 512 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 512 +#define ZGEMM_DEFAULT_Q 256 +#define XGEMM_DEFAULT_Q 128 + +#define GETRF_FACTOR 0.72 + +#endif + + +#ifdef ATOM + +#define SNUMOPT 2 +#define DNUMOPT 1 + +#define GEMM_DEFAULT_OFFSET_A 64 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SYMV_P 8 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 +#endif + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 2 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 1 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_P sgemm_p +#define SGEMM_DEFAULT_R sgemm_r + +#define DGEMM_DEFAULT_P dgemm_p +#define DGEMM_DEFAULT_R dgemm_r + +#define QGEMM_DEFAULT_P qgemm_p +#define QGEMM_DEFAULT_R qgemm_r + +#define CGEMM_DEFAULT_P cgemm_p +#define CGEMM_DEFAULT_R cgemm_r + +#define ZGEMM_DEFAULT_P zgemm_p +#define ZGEMM_DEFAULT_R zgemm_r + +#define XGEMM_DEFAULT_P xgemm_p +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 256 +#define XGEMM_DEFAULT_Q 256 + +#endif + + +#ifdef ITANIUM2 + +#define SNUMOPT 4 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 128 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 8 +#define QGEMM_DEFAULT_UNROLL_M 8 +#define QGEMM_DEFAULT_UNROLL_N 8 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define XGEMM_DEFAULT_UNROLL_M 4 +#define XGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P sgemm_p +#define DGEMM_DEFAULT_P dgemm_p +#define QGEMM_DEFAULT_P qgemm_p +#define CGEMM_DEFAULT_P cgemm_p +#define ZGEMM_DEFAULT_P zgemm_p +#define XGEMM_DEFAULT_P xgemm_p + +#define SGEMM_DEFAULT_Q 1024 +#define DGEMM_DEFAULT_Q 1024 +#define QGEMM_DEFAULT_Q 1024 +#define CGEMM_DEFAULT_Q 1024 +#define ZGEMM_DEFAULT_Q 1024 +#define XGEMM_DEFAULT_Q 1024 + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_R xgemm_r + +#define SYMV_P 16 + +#define GETRF_FACTOR 0.65 + +#endif + +#if defined(EV4) || defined(EV5) || defined(EV6) + +#ifdef EV4 +#define SNUMOPT 1 +#define DNUMOPT 1 +#else +#define SNUMOPT 2 +#define DNUMOPT 2 +#endif + +#define GEMM_DEFAULT_OFFSET_A 512 +#define GEMM_DEFAULT_OFFSET_B 512 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SYMV_P 8 + +#ifdef EV4 +#define SGEMM_DEFAULT_P 32 +#define SGEMM_DEFAULT_Q 112 +#define SGEMM_DEFAULT_R 256 + +#define DGEMM_DEFAULT_P 32 +#define DGEMM_DEFAULT_Q 56 +#define DGEMM_DEFAULT_R 256 + +#define CGEMM_DEFAULT_P 32 +#define CGEMM_DEFAULT_Q 64 +#define CGEMM_DEFAULT_R 240 + +#define ZGEMM_DEFAULT_P 32 +#define ZGEMM_DEFAULT_Q 32 +#define ZGEMM_DEFAULT_R 240 +#endif + +#ifdef EV5 +#define SGEMM_DEFAULT_P 64 +#define SGEMM_DEFAULT_Q 256 + +#define DGEMM_DEFAULT_P 64 +#define DGEMM_DEFAULT_Q 128 + +#define CGEMM_DEFAULT_P 64 +#define CGEMM_DEFAULT_Q 128 + +#define ZGEMM_DEFAULT_P 64 +#define ZGEMM_DEFAULT_Q 64 +#endif + +#ifdef EV6 +#define SGEMM_DEFAULT_P 256 +#define SGEMM_DEFAULT_Q 512 + +#define DGEMM_DEFAULT_P 256 +#define DGEMM_DEFAULT_Q 256 + +#define CGEMM_DEFAULT_P 256 +#define CGEMM_DEFAULT_Q 256 + +#define ZGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_Q 256 +#endif + +#endif + +#ifdef CELL + +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 8192 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 512 +#define DGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 128 + +#define SYMV_P 4 +#endif + +#ifdef PPCG4 +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 1024 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 256 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 256 + +#define SYMV_P 4 +#endif + +#ifdef PPC970 + +#define SNUMOPT 4 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 2688 +#define GEMM_DEFAULT_OFFSET_B 3072 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#ifdef OS_LINUX +#if L2_SIZE == 1024976 +#define SGEMM_DEFAULT_P 320 +#define DGEMM_DEFAULT_P 256 +#define CGEMM_DEFAULT_P 256 +#define ZGEMM_DEFAULT_P 256 +#else +#define SGEMM_DEFAULT_P 176 +#define DGEMM_DEFAULT_P 176 +#define CGEMM_DEFAULT_P 176 +#define ZGEMM_DEFAULT_P 176 +#endif +#endif + +#define SGEMM_DEFAULT_Q 512 +#define DGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 128 + +#define SYMV_P 4 + +#endif + +#ifdef PPC440 + +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A (32 * 0) +#define GEMM_DEFAULT_OFFSET_B (32 * 0) +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_P 512 +#define CGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_P 512 + +#define SGEMM_DEFAULT_Q 1024 +#define DGEMM_DEFAULT_Q 512 +#define CGEMM_DEFAULT_Q 512 +#define ZGEMM_DEFAULT_Q 256 + +#define SGEMM_DEFAULT_R SGEMM_DEFAULT_P +#define DGEMM_DEFAULT_R DGEMM_DEFAULT_P +#define CGEMM_DEFAULT_R CGEMM_DEFAULT_P +#define ZGEMM_DEFAULT_R ZGEMM_DEFAULT_P + +#define SYMV_P 4 +#endif + +#ifdef PPC440FP2 + +#define SNUMOPT 4 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A (32 * 0) +#define GEMM_DEFAULT_OFFSET_B (32 * 0) +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 +#if 1 +#define SGEMM_DEFAULT_Q 4096 +#define DGEMM_DEFAULT_Q 3072 +#define CGEMM_DEFAULT_Q 2048 +#define ZGEMM_DEFAULT_Q 1024 +#else +#define SGEMM_DEFAULT_Q 512 +#define DGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 128 +#endif + +#define SYMV_P 4 +#endif + + + +#if defined(POWER3) || defined(POWER4) || defined(POWER5) +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 2048 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#ifdef POWER3 + +#define SNUMOPT 4 +#define DNUMOPT 4 + +#define SGEMM_DEFAULT_P 256 +#define SGEMM_DEFAULT_Q 432 +#define SGEMM_DEFAULT_R 1012 + +#define DGEMM_DEFAULT_P 256 +#define DGEMM_DEFAULT_Q 216 +#define DGEMM_DEFAULT_R 1012 + +#define ZGEMM_DEFAULT_P 256 +#define ZGEMM_DEFAULT_Q 104 +#define ZGEMM_DEFAULT_R 1012 +#endif + +#if defined(POWER4) +#ifdef ALLOC_HUGETLB +#define SGEMM_DEFAULT_P 184 +#define DGEMM_DEFAULT_P 184 +#define CGEMM_DEFAULT_P 184 +#define ZGEMM_DEFAULT_P 184 +#else +#define SGEMM_DEFAULT_P 144 +#define DGEMM_DEFAULT_P 144 +#define CGEMM_DEFAULT_P 144 +#define ZGEMM_DEFAULT_P 144 +#endif +#endif + +#if defined(POWER5) +#ifdef ALLOC_HUGETLB +#define SGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_P 256 +#define CGEMM_DEFAULT_P 256 +#define ZGEMM_DEFAULT_P 128 +#else +#define SGEMM_DEFAULT_P 320 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 160 +#define ZGEMM_DEFAULT_P 80 +#endif + +#define SGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 256 +#endif + +#define SYMV_P 8 + +#endif + +#if defined(POWER6) + +#define SNUMOPT 4 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 384 +#define GEMM_DEFAULT_OFFSET_B 1024 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 992 +#define DGEMM_DEFAULT_P 480 +#define CGEMM_DEFAULT_P 488 +#define ZGEMM_DEFAULT_P 248 + +#define SGEMM_DEFAULT_Q 504 +#define DGEMM_DEFAULT_Q 504 +#define CGEMM_DEFAULT_Q 400 +#define ZGEMM_DEFAULT_Q 400 + +#define SYMV_P 8 + +#endif + +#if defined(SPARC) && defined(V7) + +#define SNUMOPT 4 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 2048 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 8 +#define CGEMM_DEFAULT_UNROLL_M 1 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 256 +#define DGEMM_DEFAULT_P 256 +#define CGEMM_DEFAULT_P 256 +#define ZGEMM_DEFAULT_P 256 + +#define SGEMM_DEFAULT_Q 512 +#define DGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 128 + +#define SYMV_P 8 +#define GEMM_THREAD gemm_thread_mn +#endif + +#if defined(SPARC) && defined(V9) + +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 2048 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_P 512 +#define CGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_P 512 + +#define SGEMM_DEFAULT_Q 1024 +#define DGEMM_DEFAULT_Q 512 +#define CGEMM_DEFAULT_Q 512 +#define ZGEMM_DEFAULT_Q 256 + +#define SYMV_P 8 +#endif + +#ifdef SICORTEX + +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 8 +#define CGEMM_DEFAULT_UNROLL_M 1 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 108 +#define DGEMM_DEFAULT_P 112 +#define CGEMM_DEFAULT_P 108 +#define ZGEMM_DEFAULT_P 112 + +#define SGEMM_DEFAULT_Q 288 +#define DGEMM_DEFAULT_Q 144 +#define CGEMM_DEFAULT_Q 144 +#define ZGEMM_DEFAULT_Q 72 + +#define SGEMM_DEFAULT_R 2000 +#define DGEMM_DEFAULT_R 2000 +#define CGEMM_DEFAULT_R 2000 +#define ZGEMM_DEFAULT_R 2000 + +#define SYMV_P 16 +#endif + +#ifdef GENERIC + +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 +#endif + +#define SGEMM_P sgemm_p +#define DGEMM_P dgemm_p +#define QGEMM_P qgemm_p +#define CGEMM_P cgemm_p +#define ZGEMM_P zgemm_p +#define XGEMM_P xgemm_p + +#define SGEMM_R sgemm_r +#define DGEMM_R dgemm_r +#define QGEMM_R qgemm_r +#define CGEMM_R cgemm_r +#define ZGEMM_R zgemm_r +#define XGEMM_R xgemm_r + +#define SGEMM_Q 128 +#define DGEMM_Q 128 +#define QGEMM_Q 128 +#define CGEMM_Q 128 +#define ZGEMM_Q 128 +#define XGEMM_Q 128 + +#define SYMV_P 16 + +#endif + +#ifndef QGEMM_DEFAULT_UNROLL_M +#define QGEMM_DEFAULT_UNROLL_M 2 +#endif + +#ifndef QGEMM_DEFAULT_UNROLL_N +#define QGEMM_DEFAULT_UNROLL_N 2 +#endif + +#ifndef XGEMM_DEFAULT_UNROLL_M +#define XGEMM_DEFAULT_UNROLL_M 2 +#endif + +#ifndef XGEMM_DEFAULT_UNROLL_N +#define XGEMM_DEFAULT_UNROLL_N 2 +#endif + +#ifndef HAVE_SSE2 +#define SHUFPD_0 shufps $0x44, +#define SHUFPD_1 shufps $0x4e, +#define SHUFPD_2 shufps $0xe4, +#define SHUFPD_3 shufps $0xee, +#endif + +#ifndef SHUFPD_0 +#define SHUFPD_0 shufpd $0, +#endif + +#ifndef SHUFPD_1 +#define SHUFPD_1 shufpd $1, +#endif + +#ifndef SHUFPD_2 +#define SHUFPD_2 shufpd $2, +#endif + +#ifndef SHUFPD_3 +#define SHUFPD_3 shufpd $3, +#endif + +#ifndef SHUFPS_39 +#define SHUFPS_39 shufps $0x39, +#endif + + +#endif diff --git a/patch.for_lapack-3.1.1 b/patch.for_lapack-3.1.1 new file mode 100644 index 0000000..9f10f26 --- /dev/null +++ b/patch.for_lapack-3.1.1 @@ -0,0 +1,684 @@ +diff -ruN lapack-3.1.1.old/INSTALL/Makefile lapack-3.1.1/INSTALL/Makefile +--- lapack-3.1.1.old/INSTALL/Makefile 2007-02-23 14:07:35.000000000 -0600 ++++ lapack-3.1.1/INSTALL/Makefile 2009-12-16 14:40:35.000000000 -0600 +@@ -27,7 +27,7 @@ + $(LOADER) $(LOADOPTS) -o testversion ilaver.o LAPACK_version.o + + clean: +- rm -f *.o ++ rm -f *.o test* + + slamch.o: slamch.f ; $(FORTRAN) $(NOOPT) -c $< -o $@ + dlamch.o: dlamch.f ; $(FORTRAN) $(NOOPT) -c $< -o $@ +diff -ruN lapack-3.1.1.old/Makefile lapack-3.1.1/Makefile +--- lapack-3.1.1.old/Makefile 2007-02-22 15:55:00.000000000 -0600 ++++ lapack-3.1.1/Makefile 2009-12-16 14:40:35.000000000 -0600 +@@ -20,9 +20,12 @@ + blaslib: + ( cd BLAS/SRC; $(MAKE) ) + +-lapacklib: lapack_install ++lapacklib: + ( cd SRC; $(MAKE) ) + ++lapack_prof: ++ ( cd SRC; $(MAKE) lapack_prof) ++ + tmglib: + ( cd TESTING/MATGEN; $(MAKE) ) + +diff -ruN lapack-3.1.1.old/SRC/Makefile lapack-3.1.1/SRC/Makefile +--- lapack-3.1.1.old/SRC/Makefile 2007-02-23 15:33:05.000000000 -0600 ++++ lapack-3.1.1/SRC/Makefile 2009-12-16 14:41:09.000000000 -0600 +@@ -38,265 +38,273 @@ + # + ####################################################################### + +-ALLAUX = ilaenv.o ieeeck.o lsamen.o xerbla.o iparmq.o \ +- ../INSTALL/ilaver.o ../INSTALL/lsame.o ++ALLAUX = ilaenv.$(SUFFIX) ieeeck.$(SUFFIX) lsamen.$(SUFFIX) iparmq.$(SUFFIX) \ ++ ../INSTALL/ilaver.$(SUFFIX) + + SCLAUX = \ +- sbdsdc.o \ +- sbdsqr.o sdisna.o slabad.o slacpy.o sladiv.o slae2.o slaebz.o \ +- slaed0.o slaed1.o slaed2.o slaed3.o slaed4.o slaed5.o slaed6.o \ +- slaed7.o slaed8.o slaed9.o slaeda.o slaev2.o slagtf.o \ +- slagts.o slamrg.o slanst.o \ +- slapy2.o slapy3.o slarnv.o \ +- slarra.o slarrb.o slarrc.o slarrd.o slarre.o slarrf.o slarrj.o \ +- slarrk.o slarrr.o slaneg.o \ +- slartg.o slaruv.o slas2.o slascl.o \ +- slasd0.o slasd1.o slasd2.o slasd3.o slasd4.o slasd5.o slasd6.o \ +- slasd7.o slasd8.o slasda.o slasdq.o slasdt.o \ +- slaset.o slasq1.o slasq2.o slasq3.o slazq3.o slasq4.o slazq4.o slasq5.o slasq6.o \ +- slasr.o slasrt.o slassq.o slasv2.o spttrf.o sstebz.o sstedc.o \ +- ssteqr.o ssterf.o slaisnan.o sisnan.o \ +- ../INSTALL/slamch.o ../INSTALL/second_$(TIMER).o ++ sbdsdc.$(SUFFIX) \ ++ sbdsqr.$(SUFFIX) sdisna.$(SUFFIX) slabad.$(SUFFIX) slacpy.$(SUFFIX) sladiv.$(SUFFIX) slae2.$(SUFFIX) slaebz.$(SUFFIX) \ ++ slaed0.$(SUFFIX) slaed1.$(SUFFIX) slaed2.$(SUFFIX) slaed3.$(SUFFIX) slaed4.$(SUFFIX) slaed5.$(SUFFIX) slaed6.$(SUFFIX) \ ++ slaed7.$(SUFFIX) slaed8.$(SUFFIX) slaed9.$(SUFFIX) slaeda.$(SUFFIX) slaev2.$(SUFFIX) slagtf.$(SUFFIX) \ ++ slagts.$(SUFFIX) slamrg.$(SUFFIX) slanst.$(SUFFIX) \ ++ slapy2.$(SUFFIX) slapy3.$(SUFFIX) slarnv.$(SUFFIX) \ ++ slarra.$(SUFFIX) slarrb.$(SUFFIX) slarrc.$(SUFFIX) slarrd.$(SUFFIX) slarre.$(SUFFIX) slarrf.$(SUFFIX) slarrj.$(SUFFIX) \ ++ slarrk.$(SUFFIX) slarrr.$(SUFFIX) slaneg.$(SUFFIX) \ ++ slartg.$(SUFFIX) slaruv.$(SUFFIX) slas2.$(SUFFIX) slascl.$(SUFFIX) \ ++ slasd0.$(SUFFIX) slasd1.$(SUFFIX) slasd2.$(SUFFIX) slasd3.$(SUFFIX) slasd4.$(SUFFIX) slasd5.$(SUFFIX) slasd6.$(SUFFIX) \ ++ slasd7.$(SUFFIX) slasd8.$(SUFFIX) slasda.$(SUFFIX) slasdq.$(SUFFIX) slasdt.$(SUFFIX) \ ++ slaset.$(SUFFIX) slasq1.$(SUFFIX) slasq2.$(SUFFIX) slasq3.$(SUFFIX) slazq3.$(SUFFIX) slasq4.$(SUFFIX) slazq4.$(SUFFIX) slasq5.$(SUFFIX) slasq6.$(SUFFIX) \ ++ slasr.$(SUFFIX) slasrt.$(SUFFIX) slassq.$(SUFFIX) slasv2.$(SUFFIX) spttrf.$(SUFFIX) sstebz.$(SUFFIX) sstedc.$(SUFFIX) \ ++ ssteqr.$(SUFFIX) ssterf.$(SUFFIX) slaisnan.$(SUFFIX) sisnan.$(SUFFIX) \ ++ ../INSTALL/second_$(TIMER).$(SUFFIX) + + DZLAUX = \ +- dbdsdc.o \ +- dbdsqr.o ddisna.o dlabad.o dlacpy.o dladiv.o dlae2.o dlaebz.o \ +- dlaed0.o dlaed1.o dlaed2.o dlaed3.o dlaed4.o dlaed5.o dlaed6.o \ +- dlaed7.o dlaed8.o dlaed9.o dlaeda.o dlaev2.o dlagtf.o \ +- dlagts.o dlamrg.o dlanst.o \ +- dlapy2.o dlapy3.o dlarnv.o \ +- dlarra.o dlarrb.o dlarrc.o dlarrd.o dlarre.o dlarrf.o dlarrj.o \ +- dlarrk.o dlarrr.o dlaneg.o \ +- dlartg.o dlaruv.o dlas2.o dlascl.o \ +- dlasd0.o dlasd1.o dlasd2.o dlasd3.o dlasd4.o dlasd5.o dlasd6.o \ +- dlasd7.o dlasd8.o dlasda.o dlasdq.o dlasdt.o \ +- dlaset.o dlasq1.o dlasq2.o dlasq3.o dlazq3.o dlasq4.o dlazq4.o dlasq5.o dlasq6.o \ +- dlasr.o dlasrt.o dlassq.o dlasv2.o dpttrf.o dstebz.o dstedc.o \ +- dsteqr.o dsterf.o dlaisnan.o disnan.o \ +- ../INSTALL/dlamch.o ../INSTALL/dsecnd_$(TIMER).o ++ dbdsdc.$(SUFFIX) \ ++ dbdsqr.$(SUFFIX) ddisna.$(SUFFIX) dlabad.$(SUFFIX) dlacpy.$(SUFFIX) dladiv.$(SUFFIX) dlae2.$(SUFFIX) dlaebz.$(SUFFIX) \ ++ dlaed0.$(SUFFIX) dlaed1.$(SUFFIX) dlaed2.$(SUFFIX) dlaed3.$(SUFFIX) dlaed4.$(SUFFIX) dlaed5.$(SUFFIX) dlaed6.$(SUFFIX) \ ++ dlaed7.$(SUFFIX) dlaed8.$(SUFFIX) dlaed9.$(SUFFIX) dlaeda.$(SUFFIX) dlaev2.$(SUFFIX) dlagtf.$(SUFFIX) \ ++ dlagts.$(SUFFIX) dlamrg.$(SUFFIX) dlanst.$(SUFFIX) \ ++ dlapy2.$(SUFFIX) dlapy3.$(SUFFIX) dlarnv.$(SUFFIX) \ ++ dlarra.$(SUFFIX) dlarrb.$(SUFFIX) dlarrc.$(SUFFIX) dlarrd.$(SUFFIX) dlarre.$(SUFFIX) dlarrf.$(SUFFIX) dlarrj.$(SUFFIX) \ ++ dlarrk.$(SUFFIX) dlarrr.$(SUFFIX) dlaneg.$(SUFFIX) \ ++ dlartg.$(SUFFIX) dlaruv.$(SUFFIX) dlas2.$(SUFFIX) dlascl.$(SUFFIX) \ ++ dlasd0.$(SUFFIX) dlasd1.$(SUFFIX) dlasd2.$(SUFFIX) dlasd3.$(SUFFIX) dlasd4.$(SUFFIX) dlasd5.$(SUFFIX) dlasd6.$(SUFFIX) \ ++ dlasd7.$(SUFFIX) dlasd8.$(SUFFIX) dlasda.$(SUFFIX) dlasdq.$(SUFFIX) dlasdt.$(SUFFIX) \ ++ dlaset.$(SUFFIX) dlasq1.$(SUFFIX) dlasq2.$(SUFFIX) dlasq3.$(SUFFIX) dlazq3.$(SUFFIX) dlasq4.$(SUFFIX) dlazq4.$(SUFFIX) dlasq5.$(SUFFIX) dlasq6.$(SUFFIX) \ ++ dlasr.$(SUFFIX) dlasrt.$(SUFFIX) dlassq.$(SUFFIX) dlasv2.$(SUFFIX) dpttrf.$(SUFFIX) dstebz.$(SUFFIX) dstedc.$(SUFFIX) \ ++ dsteqr.$(SUFFIX) dsterf.$(SUFFIX) dlaisnan.$(SUFFIX) disnan.$(SUFFIX) \ ++ ../INSTALL/dsecnd_$(TIMER).$(SUFFIX) + + SLASRC = \ +- sgbbrd.o sgbcon.o sgbequ.o sgbrfs.o sgbsv.o \ +- sgbsvx.o sgbtf2.o sgbtrf.o sgbtrs.o sgebak.o sgebal.o sgebd2.o \ +- sgebrd.o sgecon.o sgeequ.o sgees.o sgeesx.o sgeev.o sgeevx.o \ +- sgegs.o sgegv.o sgehd2.o sgehrd.o sgelq2.o sgelqf.o \ +- sgels.o sgelsd.o sgelss.o sgelsx.o sgelsy.o sgeql2.o sgeqlf.o \ +- sgeqp3.o sgeqpf.o sgeqr2.o sgeqrf.o sgerfs.o sgerq2.o sgerqf.o \ +- sgesc2.o sgesdd.o sgesv.o sgesvd.o sgesvx.o sgetc2.o sgetf2.o \ +- sgetrf.o sgetri.o \ +- sgetrs.o sggbak.o sggbal.o sgges.o sggesx.o sggev.o sggevx.o \ +- sggglm.o sgghrd.o sgglse.o sggqrf.o \ +- sggrqf.o sggsvd.o sggsvp.o sgtcon.o sgtrfs.o sgtsv.o \ +- sgtsvx.o sgttrf.o sgttrs.o sgtts2.o shgeqz.o \ +- shsein.o shseqr.o slabrd.o slacon.o slacn2.o \ +- slaein.o slaexc.o slag2.o slags2.o slagtm.o slagv2.o slahqr.o \ +- slahrd.o slahr2.o slaic1.o slaln2.o slals0.o slalsa.o slalsd.o \ +- slangb.o slange.o slangt.o slanhs.o slansb.o slansp.o \ +- slansy.o slantb.o slantp.o slantr.o slanv2.o \ +- slapll.o slapmt.o \ +- slaqgb.o slaqge.o slaqp2.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \ +- slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \ +- slaqtr.o slar1v.o slar2v.o \ +- slarf.o slarfb.o slarfg.o slarft.o slarfx.o slargv.o \ +- slarrv.o slartv.o \ +- slarz.o slarzb.o slarzt.o slaswp.o slasy2.o slasyf.o \ +- slatbs.o slatdf.o slatps.o slatrd.o slatrs.o slatrz.o slatzm.o \ +- slauu2.o slauum.o sopgtr.o sopmtr.o sorg2l.o sorg2r.o \ +- sorgbr.o sorghr.o sorgl2.o sorglq.o sorgql.o sorgqr.o sorgr2.o \ +- sorgrq.o sorgtr.o sorm2l.o sorm2r.o \ +- sormbr.o sormhr.o sorml2.o sormlq.o sormql.o sormqr.o sormr2.o \ +- sormr3.o sormrq.o sormrz.o sormtr.o spbcon.o spbequ.o spbrfs.o \ +- spbstf.o spbsv.o spbsvx.o \ +- spbtf2.o spbtrf.o spbtrs.o spocon.o spoequ.o sporfs.o sposv.o \ +- sposvx.o spotf2.o spotrf.o spotri.o spotrs.o sppcon.o sppequ.o \ +- spprfs.o sppsv.o sppsvx.o spptrf.o spptri.o spptrs.o sptcon.o \ +- spteqr.o sptrfs.o sptsv.o sptsvx.o spttrs.o sptts2.o srscl.o \ +- ssbev.o ssbevd.o ssbevx.o ssbgst.o ssbgv.o ssbgvd.o ssbgvx.o \ +- ssbtrd.o sspcon.o sspev.o sspevd.o sspevx.o sspgst.o \ +- sspgv.o sspgvd.o sspgvx.o ssprfs.o sspsv.o sspsvx.o ssptrd.o \ +- ssptrf.o ssptri.o ssptrs.o sstegr.o sstein.o sstev.o sstevd.o sstevr.o \ +- sstevx.o ssycon.o ssyev.o ssyevd.o ssyevr.o ssyevx.o ssygs2.o \ +- ssygst.o ssygv.o ssygvd.o ssygvx.o ssyrfs.o ssysv.o ssysvx.o \ +- ssytd2.o ssytf2.o ssytrd.o ssytrf.o ssytri.o ssytrs.o stbcon.o \ +- stbrfs.o stbtrs.o stgevc.o stgex2.o stgexc.o stgsen.o \ +- stgsja.o stgsna.o stgsy2.o stgsyl.o stpcon.o stprfs.o stptri.o \ +- stptrs.o \ +- strcon.o strevc.o strexc.o strrfs.o strsen.o strsna.o strsyl.o \ +- strti2.o strtri.o strtrs.o stzrqf.o stzrzf.o sstemr.o ++ sgbbrd.$(SUFFIX) sgbcon.$(SUFFIX) sgbequ.$(SUFFIX) sgbrfs.$(SUFFIX) sgbsv.$(SUFFIX) \ ++ sgbsvx.$(SUFFIX) sgbtf2.$(SUFFIX) sgbtrf.$(SUFFIX) sgbtrs.$(SUFFIX) sgebak.$(SUFFIX) sgebal.$(SUFFIX) sgebd2.$(SUFFIX) \ ++ sgebrd.$(SUFFIX) sgecon.$(SUFFIX) sgeequ.$(SUFFIX) sgees.$(SUFFIX) sgeesx.$(SUFFIX) sgeev.$(SUFFIX) sgeevx.$(SUFFIX) \ ++ sgegs.$(SUFFIX) sgegv.$(SUFFIX) sgehd2.$(SUFFIX) sgehrd.$(SUFFIX) sgelq2.$(SUFFIX) sgelqf.$(SUFFIX) \ ++ sgels.$(SUFFIX) sgelsd.$(SUFFIX) sgelss.$(SUFFIX) sgelsx.$(SUFFIX) sgelsy.$(SUFFIX) sgeql2.$(SUFFIX) sgeqlf.$(SUFFIX) \ ++ sgeqp3.$(SUFFIX) sgeqpf.$(SUFFIX) sgeqr2.$(SUFFIX) sgeqrf.$(SUFFIX) sgerfs.$(SUFFIX) sgerq2.$(SUFFIX) sgerqf.$(SUFFIX) \ ++ sgesc2.$(SUFFIX) sgesdd.$(SUFFIX) sgesvd.$(SUFFIX) sgesvx.$(SUFFIX) sgetc2.$(SUFFIX) \ ++ sgetri.$(SUFFIX) \ ++ sggbak.$(SUFFIX) sggbal.$(SUFFIX) sgges.$(SUFFIX) sggesx.$(SUFFIX) sggev.$(SUFFIX) sggevx.$(SUFFIX) \ ++ sggglm.$(SUFFIX) sgghrd.$(SUFFIX) sgglse.$(SUFFIX) sggqrf.$(SUFFIX) \ ++ sggrqf.$(SUFFIX) sggsvd.$(SUFFIX) sggsvp.$(SUFFIX) sgtcon.$(SUFFIX) sgtrfs.$(SUFFIX) sgtsv.$(SUFFIX) \ ++ sgtsvx.$(SUFFIX) sgttrf.$(SUFFIX) sgttrs.$(SUFFIX) sgtts2.$(SUFFIX) shgeqz.$(SUFFIX) \ ++ shsein.$(SUFFIX) shseqr.$(SUFFIX) slabrd.$(SUFFIX) slacon.$(SUFFIX) slacn2.$(SUFFIX) \ ++ slaein.$(SUFFIX) slaexc.$(SUFFIX) slag2.$(SUFFIX) slags2.$(SUFFIX) slagtm.$(SUFFIX) slagv2.$(SUFFIX) slahqr.$(SUFFIX) \ ++ slahrd.$(SUFFIX) slahr2.$(SUFFIX) slaic1.$(SUFFIX) slaln2.$(SUFFIX) slals0.$(SUFFIX) slalsa.$(SUFFIX) slalsd.$(SUFFIX) \ ++ slangb.$(SUFFIX) slange.$(SUFFIX) slangt.$(SUFFIX) slanhs.$(SUFFIX) slansb.$(SUFFIX) slansp.$(SUFFIX) \ ++ slansy.$(SUFFIX) slantb.$(SUFFIX) slantp.$(SUFFIX) slantr.$(SUFFIX) slanv2.$(SUFFIX) \ ++ slapll.$(SUFFIX) slapmt.$(SUFFIX) \ ++ slaqgb.$(SUFFIX) slaqge.$(SUFFIX) slaqp2.$(SUFFIX) slaqps.$(SUFFIX) slaqsb.$(SUFFIX) slaqsp.$(SUFFIX) slaqsy.$(SUFFIX) \ ++ slaqr0.$(SUFFIX) slaqr1.$(SUFFIX) slaqr2.$(SUFFIX) slaqr3.$(SUFFIX) slaqr4.$(SUFFIX) slaqr5.$(SUFFIX) \ ++ slaqtr.$(SUFFIX) slar1v.$(SUFFIX) slar2v.$(SUFFIX) \ ++ slarf.$(SUFFIX) slarfb.$(SUFFIX) slarfg.$(SUFFIX) slarft.$(SUFFIX) slarfx.$(SUFFIX) slargv.$(SUFFIX) \ ++ slarrv.$(SUFFIX) slartv.$(SUFFIX) \ ++ slarz.$(SUFFIX) slarzb.$(SUFFIX) slarzt.$(SUFFIX) slasy2.$(SUFFIX) slasyf.$(SUFFIX) \ ++ slatbs.$(SUFFIX) slatdf.$(SUFFIX) slatps.$(SUFFIX) slatrd.$(SUFFIX) slatrs.$(SUFFIX) slatrz.$(SUFFIX) slatzm.$(SUFFIX) \ ++ sopgtr.$(SUFFIX) sopmtr.$(SUFFIX) sorg2l.$(SUFFIX) sorg2r.$(SUFFIX) \ ++ sorgbr.$(SUFFIX) sorghr.$(SUFFIX) sorgl2.$(SUFFIX) sorglq.$(SUFFIX) sorgql.$(SUFFIX) sorgqr.$(SUFFIX) sorgr2.$(SUFFIX) \ ++ sorgrq.$(SUFFIX) sorgtr.$(SUFFIX) sorm2l.$(SUFFIX) sorm2r.$(SUFFIX) \ ++ sormbr.$(SUFFIX) sormhr.$(SUFFIX) sorml2.$(SUFFIX) sormlq.$(SUFFIX) sormql.$(SUFFIX) sormqr.$(SUFFIX) sormr2.$(SUFFIX) \ ++ sormr3.$(SUFFIX) sormrq.$(SUFFIX) sormrz.$(SUFFIX) sormtr.$(SUFFIX) spbcon.$(SUFFIX) spbequ.$(SUFFIX) spbrfs.$(SUFFIX) \ ++ spbstf.$(SUFFIX) spbsv.$(SUFFIX) spbsvx.$(SUFFIX) \ ++ spbtf2.$(SUFFIX) spbtrf.$(SUFFIX) spbtrs.$(SUFFIX) spocon.$(SUFFIX) spoequ.$(SUFFIX) sporfs.$(SUFFIX) sposv.$(SUFFIX) \ ++ sposvx.$(SUFFIX) spotrs.$(SUFFIX) sppcon.$(SUFFIX) sppequ.$(SUFFIX) \ ++ spprfs.$(SUFFIX) sppsv.$(SUFFIX) sppsvx.$(SUFFIX) spptrf.$(SUFFIX) spptri.$(SUFFIX) spptrs.$(SUFFIX) sptcon.$(SUFFIX) \ ++ spteqr.$(SUFFIX) sptrfs.$(SUFFIX) sptsv.$(SUFFIX) sptsvx.$(SUFFIX) spttrs.$(SUFFIX) sptts2.$(SUFFIX) srscl.$(SUFFIX) \ ++ ssbev.$(SUFFIX) ssbevd.$(SUFFIX) ssbevx.$(SUFFIX) ssbgst.$(SUFFIX) ssbgv.$(SUFFIX) ssbgvd.$(SUFFIX) ssbgvx.$(SUFFIX) \ ++ ssbtrd.$(SUFFIX) sspcon.$(SUFFIX) sspev.$(SUFFIX) sspevd.$(SUFFIX) sspevx.$(SUFFIX) sspgst.$(SUFFIX) \ ++ sspgv.$(SUFFIX) sspgvd.$(SUFFIX) sspgvx.$(SUFFIX) ssprfs.$(SUFFIX) sspsv.$(SUFFIX) sspsvx.$(SUFFIX) ssptrd.$(SUFFIX) \ ++ ssptrf.$(SUFFIX) ssptri.$(SUFFIX) ssptrs.$(SUFFIX) sstegr.$(SUFFIX) sstein.$(SUFFIX) sstev.$(SUFFIX) sstevd.$(SUFFIX) sstevr.$(SUFFIX) \ ++ sstevx.$(SUFFIX) ssycon.$(SUFFIX) ssyev.$(SUFFIX) ssyevd.$(SUFFIX) ssyevr.$(SUFFIX) ssyevx.$(SUFFIX) ssygs2.$(SUFFIX) \ ++ ssygst.$(SUFFIX) ssygv.$(SUFFIX) ssygvd.$(SUFFIX) ssygvx.$(SUFFIX) ssyrfs.$(SUFFIX) ssysv.$(SUFFIX) ssysvx.$(SUFFIX) \ ++ ssytd2.$(SUFFIX) ssytf2.$(SUFFIX) ssytrd.$(SUFFIX) ssytrf.$(SUFFIX) ssytri.$(SUFFIX) ssytrs.$(SUFFIX) stbcon.$(SUFFIX) \ ++ stbrfs.$(SUFFIX) stbtrs.$(SUFFIX) stgevc.$(SUFFIX) stgex2.$(SUFFIX) stgexc.$(SUFFIX) stgsen.$(SUFFIX) \ ++ stgsja.$(SUFFIX) stgsna.$(SUFFIX) stgsy2.$(SUFFIX) stgsyl.$(SUFFIX) stpcon.$(SUFFIX) stprfs.$(SUFFIX) stptri.$(SUFFIX) \ ++ stptrs.$(SUFFIX) \ ++ strcon.$(SUFFIX) strevc.$(SUFFIX) strexc.$(SUFFIX) strrfs.$(SUFFIX) strsen.$(SUFFIX) strsna.$(SUFFIX) strsyl.$(SUFFIX) \ ++ strtrs.$(SUFFIX) stzrqf.$(SUFFIX) stzrzf.$(SUFFIX) sstemr.$(SUFFIX) + + CLASRC = \ +- cbdsqr.o cgbbrd.o cgbcon.o cgbequ.o cgbrfs.o cgbsv.o cgbsvx.o \ +- cgbtf2.o cgbtrf.o cgbtrs.o cgebak.o cgebal.o cgebd2.o cgebrd.o \ +- cgecon.o cgeequ.o cgees.o cgeesx.o cgeev.o cgeevx.o \ +- cgegs.o cgegv.o cgehd2.o cgehrd.o cgelq2.o cgelqf.o \ +- cgels.o cgelsd.o cgelss.o cgelsx.o cgelsy.o cgeql2.o cgeqlf.o cgeqp3.o \ +- cgeqpf.o cgeqr2.o cgeqrf.o cgerfs.o cgerq2.o cgerqf.o \ +- cgesc2.o cgesdd.o cgesv.o cgesvd.o cgesvx.o cgetc2.o cgetf2.o cgetrf.o \ +- cgetri.o cgetrs.o \ +- cggbak.o cggbal.o cgges.o cggesx.o cggev.o cggevx.o cggglm.o \ +- cgghrd.o cgglse.o cggqrf.o cggrqf.o \ +- cggsvd.o cggsvp.o \ +- cgtcon.o cgtrfs.o cgtsv.o cgtsvx.o cgttrf.o cgttrs.o cgtts2.o chbev.o \ +- chbevd.o chbevx.o chbgst.o chbgv.o chbgvd.o chbgvx.o chbtrd.o \ +- checon.o cheev.o cheevd.o cheevr.o cheevx.o chegs2.o chegst.o \ +- chegv.o chegvd.o chegvx.o cherfs.o chesv.o chesvx.o chetd2.o \ +- chetf2.o chetrd.o \ +- chetrf.o chetri.o chetrs.o chgeqz.o chpcon.o chpev.o chpevd.o \ +- chpevx.o chpgst.o chpgv.o chpgvd.o chpgvx.o chprfs.o chpsv.o \ +- chpsvx.o \ +- chptrd.o chptrf.o chptri.o chptrs.o chsein.o chseqr.o clabrd.o \ +- clacgv.o clacon.o clacn2.o clacp2.o clacpy.o clacrm.o clacrt.o cladiv.o \ +- claed0.o claed7.o claed8.o \ +- claein.o claesy.o claev2.o clags2.o clagtm.o \ +- clahef.o clahqr.o \ +- clahrd.o clahr2.o claic1.o clals0.o clalsa.o clalsd.o clangb.o clange.o clangt.o \ +- clanhb.o clanhe.o \ +- clanhp.o clanhs.o clanht.o clansb.o clansp.o clansy.o clantb.o \ +- clantp.o clantr.o clapll.o clapmt.o clarcm.o claqgb.o claqge.o \ +- claqhb.o claqhe.o claqhp.o claqp2.o claqps.o claqsb.o \ +- claqr0.o claqr1.o claqr2.o claqr3.o claqr4.o claqr5.o \ +- claqsp.o claqsy.o clar1v.o clar2v.o clarf.o clarfb.o clarfg.o clarft.o \ +- clarfx.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \ +- clarz.o clarzb.o clarzt.o clascl.o claset.o clasr.o classq.o \ +- claswp.o clasyf.o clatbs.o clatdf.o clatps.o clatrd.o clatrs.o clatrz.o \ +- clatzm.o clauu2.o clauum.o cpbcon.o cpbequ.o cpbrfs.o cpbstf.o cpbsv.o \ +- cpbsvx.o cpbtf2.o cpbtrf.o cpbtrs.o cpocon.o cpoequ.o cporfs.o \ +- cposv.o cposvx.o cpotf2.o cpotrf.o cpotri.o cpotrs.o cppcon.o \ +- cppequ.o cpprfs.o cppsv.o cppsvx.o cpptrf.o cpptri.o cpptrs.o \ +- cptcon.o cpteqr.o cptrfs.o cptsv.o cptsvx.o cpttrf.o cpttrs.o cptts2.o \ +- crot.o cspcon.o cspmv.o cspr.o csprfs.o cspsv.o \ +- cspsvx.o csptrf.o csptri.o csptrs.o csrscl.o cstedc.o \ +- cstegr.o cstein.o csteqr.o csycon.o csymv.o \ +- csyr.o csyrfs.o csysv.o csysvx.o csytf2.o csytrf.o csytri.o \ +- csytrs.o ctbcon.o ctbrfs.o ctbtrs.o ctgevc.o ctgex2.o \ +- ctgexc.o ctgsen.o ctgsja.o ctgsna.o ctgsy2.o ctgsyl.o ctpcon.o \ +- ctprfs.o ctptri.o \ +- ctptrs.o ctrcon.o ctrevc.o ctrexc.o ctrrfs.o ctrsen.o ctrsna.o \ +- ctrsyl.o ctrti2.o ctrtri.o ctrtrs.o ctzrqf.o ctzrzf.o cung2l.o cung2r.o \ +- cungbr.o cunghr.o cungl2.o cunglq.o cungql.o cungqr.o cungr2.o \ +- cungrq.o cungtr.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o \ +- cunmlq.o cunmql.o cunmqr.o cunmr2.o cunmr3.o cunmrq.o cunmrz.o \ +- cunmtr.o cupgtr.o cupmtr.o icmax1.o scsum1.o cstemr.o ++ cbdsqr.$(SUFFIX) cgbbrd.$(SUFFIX) cgbcon.$(SUFFIX) cgbequ.$(SUFFIX) cgbrfs.$(SUFFIX) cgbsv.$(SUFFIX) cgbsvx.$(SUFFIX) \ ++ cgbtf2.$(SUFFIX) cgbtrf.$(SUFFIX) cgbtrs.$(SUFFIX) cgebak.$(SUFFIX) cgebal.$(SUFFIX) cgebd2.$(SUFFIX) cgebrd.$(SUFFIX) \ ++ cgecon.$(SUFFIX) cgeequ.$(SUFFIX) cgees.$(SUFFIX) cgeesx.$(SUFFIX) cgeev.$(SUFFIX) cgeevx.$(SUFFIX) \ ++ cgegs.$(SUFFIX) cgegv.$(SUFFIX) cgehd2.$(SUFFIX) cgehrd.$(SUFFIX) cgelq2.$(SUFFIX) cgelqf.$(SUFFIX) \ ++ cgels.$(SUFFIX) cgelsd.$(SUFFIX) cgelss.$(SUFFIX) cgelsx.$(SUFFIX) cgelsy.$(SUFFIX) cgeql2.$(SUFFIX) cgeqlf.$(SUFFIX) cgeqp3.$(SUFFIX) \ ++ cgeqpf.$(SUFFIX) cgeqr2.$(SUFFIX) cgeqrf.$(SUFFIX) cgerfs.$(SUFFIX) cgerq2.$(SUFFIX) cgerqf.$(SUFFIX) \ ++ cgesc2.$(SUFFIX) cgesdd.$(SUFFIX) cgesvd.$(SUFFIX) cgesvx.$(SUFFIX) cgetc2.$(SUFFIX) \ ++ cgetri.$(SUFFIX) \ ++ cggbak.$(SUFFIX) cggbal.$(SUFFIX) cgges.$(SUFFIX) cggesx.$(SUFFIX) cggev.$(SUFFIX) cggevx.$(SUFFIX) cggglm.$(SUFFIX) \ ++ cgghrd.$(SUFFIX) cgglse.$(SUFFIX) cggqrf.$(SUFFIX) cggrqf.$(SUFFIX) \ ++ cggsvd.$(SUFFIX) cggsvp.$(SUFFIX) \ ++ cgtcon.$(SUFFIX) cgtrfs.$(SUFFIX) cgtsv.$(SUFFIX) cgtsvx.$(SUFFIX) cgttrf.$(SUFFIX) cgttrs.$(SUFFIX) cgtts2.$(SUFFIX) chbev.$(SUFFIX) \ ++ chbevd.$(SUFFIX) chbevx.$(SUFFIX) chbgst.$(SUFFIX) chbgv.$(SUFFIX) chbgvd.$(SUFFIX) chbgvx.$(SUFFIX) chbtrd.$(SUFFIX) \ ++ checon.$(SUFFIX) cheev.$(SUFFIX) cheevd.$(SUFFIX) cheevr.$(SUFFIX) cheevx.$(SUFFIX) chegs2.$(SUFFIX) chegst.$(SUFFIX) \ ++ chegv.$(SUFFIX) chegvd.$(SUFFIX) chegvx.$(SUFFIX) cherfs.$(SUFFIX) chesv.$(SUFFIX) chesvx.$(SUFFIX) chetd2.$(SUFFIX) \ ++ chetf2.$(SUFFIX) chetrd.$(SUFFIX) \ ++ chetrf.$(SUFFIX) chetri.$(SUFFIX) chetrs.$(SUFFIX) chgeqz.$(SUFFIX) chpcon.$(SUFFIX) chpev.$(SUFFIX) chpevd.$(SUFFIX) \ ++ chpevx.$(SUFFIX) chpgst.$(SUFFIX) chpgv.$(SUFFIX) chpgvd.$(SUFFIX) chpgvx.$(SUFFIX) chprfs.$(SUFFIX) chpsv.$(SUFFIX) \ ++ chpsvx.$(SUFFIX) \ ++ chptrd.$(SUFFIX) chptrf.$(SUFFIX) chptri.$(SUFFIX) chptrs.$(SUFFIX) chsein.$(SUFFIX) chseqr.$(SUFFIX) clabrd.$(SUFFIX) \ ++ clacgv.$(SUFFIX) clacon.$(SUFFIX) clacn2.$(SUFFIX) clacp2.$(SUFFIX) clacpy.$(SUFFIX) clacrm.$(SUFFIX) clacrt.$(SUFFIX) cladiv.$(SUFFIX) \ ++ claed0.$(SUFFIX) claed7.$(SUFFIX) claed8.$(SUFFIX) \ ++ claein.$(SUFFIX) claesy.$(SUFFIX) claev2.$(SUFFIX) clags2.$(SUFFIX) clagtm.$(SUFFIX) \ ++ clahef.$(SUFFIX) clahqr.$(SUFFIX) \ ++ clahrd.$(SUFFIX) clahr2.$(SUFFIX) claic1.$(SUFFIX) clals0.$(SUFFIX) clalsa.$(SUFFIX) clalsd.$(SUFFIX) clangb.$(SUFFIX) clange.$(SUFFIX) clangt.$(SUFFIX) \ ++ clanhb.$(SUFFIX) clanhe.$(SUFFIX) \ ++ clanhp.$(SUFFIX) clanhs.$(SUFFIX) clanht.$(SUFFIX) clansb.$(SUFFIX) clansp.$(SUFFIX) clansy.$(SUFFIX) clantb.$(SUFFIX) \ ++ clantp.$(SUFFIX) clantr.$(SUFFIX) clapll.$(SUFFIX) clapmt.$(SUFFIX) clarcm.$(SUFFIX) claqgb.$(SUFFIX) claqge.$(SUFFIX) \ ++ claqhb.$(SUFFIX) claqhe.$(SUFFIX) claqhp.$(SUFFIX) claqp2.$(SUFFIX) claqps.$(SUFFIX) claqsb.$(SUFFIX) \ ++ claqr0.$(SUFFIX) claqr1.$(SUFFIX) claqr2.$(SUFFIX) claqr3.$(SUFFIX) claqr4.$(SUFFIX) claqr5.$(SUFFIX) \ ++ claqsp.$(SUFFIX) claqsy.$(SUFFIX) clar1v.$(SUFFIX) clar2v.$(SUFFIX) clarf.$(SUFFIX) clarfb.$(SUFFIX) clarfg.$(SUFFIX) clarft.$(SUFFIX) \ ++ clarfx.$(SUFFIX) clargv.$(SUFFIX) clarnv.$(SUFFIX) clarrv.$(SUFFIX) clartg.$(SUFFIX) clartv.$(SUFFIX) \ ++ clarz.$(SUFFIX) clarzb.$(SUFFIX) clarzt.$(SUFFIX) clascl.$(SUFFIX) claset.$(SUFFIX) clasr.$(SUFFIX) classq.$(SUFFIX) \ ++ clasyf.$(SUFFIX) clatbs.$(SUFFIX) clatdf.$(SUFFIX) clatps.$(SUFFIX) clatrd.$(SUFFIX) clatrs.$(SUFFIX) clatrz.$(SUFFIX) \ ++ clatzm.$(SUFFIX) cpbcon.$(SUFFIX) cpbequ.$(SUFFIX) cpbrfs.$(SUFFIX) cpbstf.$(SUFFIX) cpbsv.$(SUFFIX) \ ++ cpbsvx.$(SUFFIX) cpbtf2.$(SUFFIX) cpbtrf.$(SUFFIX) cpbtrs.$(SUFFIX) cpocon.$(SUFFIX) cpoequ.$(SUFFIX) cporfs.$(SUFFIX) \ ++ cposv.$(SUFFIX) cposvx.$(SUFFIX) cpotrs.$(SUFFIX) cppcon.$(SUFFIX) \ ++ cppequ.$(SUFFIX) cpprfs.$(SUFFIX) cppsv.$(SUFFIX) cppsvx.$(SUFFIX) cpptrf.$(SUFFIX) cpptri.$(SUFFIX) cpptrs.$(SUFFIX) \ ++ cptcon.$(SUFFIX) cpteqr.$(SUFFIX) cptrfs.$(SUFFIX) cptsv.$(SUFFIX) cptsvx.$(SUFFIX) cpttrf.$(SUFFIX) cpttrs.$(SUFFIX) cptts2.$(SUFFIX) \ ++ crot.$(SUFFIX) cspcon.$(SUFFIX) csprfs.$(SUFFIX) cspsv.$(SUFFIX) \ ++ cspsvx.$(SUFFIX) csptrf.$(SUFFIX) csptri.$(SUFFIX) csptrs.$(SUFFIX) csrscl.$(SUFFIX) cstedc.$(SUFFIX) \ ++ cstegr.$(SUFFIX) cstein.$(SUFFIX) csteqr.$(SUFFIX) csycon.$(SUFFIX) \ ++ csyrfs.$(SUFFIX) csysv.$(SUFFIX) csysvx.$(SUFFIX) csytf2.$(SUFFIX) csytrf.$(SUFFIX) csytri.$(SUFFIX) \ ++ csytrs.$(SUFFIX) ctbcon.$(SUFFIX) ctbrfs.$(SUFFIX) ctbtrs.$(SUFFIX) ctgevc.$(SUFFIX) ctgex2.$(SUFFIX) \ ++ ctgexc.$(SUFFIX) ctgsen.$(SUFFIX) ctgsja.$(SUFFIX) ctgsna.$(SUFFIX) ctgsy2.$(SUFFIX) ctgsyl.$(SUFFIX) ctpcon.$(SUFFIX) \ ++ ctprfs.$(SUFFIX) ctptri.$(SUFFIX) \ ++ ctptrs.$(SUFFIX) ctrcon.$(SUFFIX) ctrevc.$(SUFFIX) ctrexc.$(SUFFIX) ctrrfs.$(SUFFIX) ctrsen.$(SUFFIX) ctrsna.$(SUFFIX) \ ++ ctrsyl.$(SUFFIX) ctrtrs.$(SUFFIX) ctzrqf.$(SUFFIX) ctzrzf.$(SUFFIX) cung2l.$(SUFFIX) cung2r.$(SUFFIX) \ ++ cungbr.$(SUFFIX) cunghr.$(SUFFIX) cungl2.$(SUFFIX) cunglq.$(SUFFIX) cungql.$(SUFFIX) cungqr.$(SUFFIX) cungr2.$(SUFFIX) \ ++ cungrq.$(SUFFIX) cungtr.$(SUFFIX) cunm2l.$(SUFFIX) cunm2r.$(SUFFIX) cunmbr.$(SUFFIX) cunmhr.$(SUFFIX) cunml2.$(SUFFIX) \ ++ cunmlq.$(SUFFIX) cunmql.$(SUFFIX) cunmqr.$(SUFFIX) cunmr2.$(SUFFIX) cunmr3.$(SUFFIX) cunmrq.$(SUFFIX) cunmrz.$(SUFFIX) \ ++ cunmtr.$(SUFFIX) cupgtr.$(SUFFIX) cupmtr.$(SUFFIX) icmax1.$(SUFFIX) scsum1.$(SUFFIX) cstemr.$(SUFFIX) + + DLASRC = \ +- dgbbrd.o dgbcon.o dgbequ.o dgbrfs.o dgbsv.o \ +- dgbsvx.o dgbtf2.o dgbtrf.o dgbtrs.o dgebak.o dgebal.o dgebd2.o \ +- dgebrd.o dgecon.o dgeequ.o dgees.o dgeesx.o dgeev.o dgeevx.o \ +- dgegs.o dgegv.o dgehd2.o dgehrd.o dgelq2.o dgelqf.o \ +- dgels.o dgelsd.o dgelss.o dgelsx.o dgelsy.o dgeql2.o dgeqlf.o \ +- dgeqp3.o dgeqpf.o dgeqr2.o dgeqrf.o dgerfs.o dgerq2.o dgerqf.o \ +- dgesc2.o dgesdd.o dgesv.o dgesvd.o dgesvx.o dgetc2.o dgetf2.o \ +- dgetrf.o dgetri.o \ +- dgetrs.o dggbak.o dggbal.o dgges.o dggesx.o dggev.o dggevx.o \ +- dggglm.o dgghrd.o dgglse.o dggqrf.o \ +- dggrqf.o dggsvd.o dggsvp.o dgtcon.o dgtrfs.o dgtsv.o \ +- dgtsvx.o dgttrf.o dgttrs.o dgtts2.o dhgeqz.o \ +- dhsein.o dhseqr.o dlabrd.o dlacon.o dlacn2.o \ +- dlaein.o dlaexc.o dlag2.o dlags2.o dlagtm.o dlagv2.o dlahqr.o \ +- dlahrd.o dlahr2.o dlaic1.o dlaln2.o dlals0.o dlalsa.o dlalsd.o \ +- dlangb.o dlange.o dlangt.o dlanhs.o dlansb.o dlansp.o \ +- dlansy.o dlantb.o dlantp.o dlantr.o dlanv2.o \ +- dlapll.o dlapmt.o \ +- dlaqgb.o dlaqge.o dlaqp2.o dlaqps.o dlaqsb.o dlaqsp.o dlaqsy.o \ +- dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \ +- dlaqtr.o dlar1v.o dlar2v.o \ +- dlarf.o dlarfb.o dlarfg.o dlarft.o dlarfx.o dlargv.o \ +- dlarrv.o dlartv.o \ +- dlarz.o dlarzb.o dlarzt.o dlaswp.o dlasy2.o dlasyf.o \ +- dlatbs.o dlatdf.o dlatps.o dlatrd.o dlatrs.o dlatrz.o dlatzm.o dlauu2.o \ +- dlauum.o dopgtr.o dopmtr.o dorg2l.o dorg2r.o \ +- dorgbr.o dorghr.o dorgl2.o dorglq.o dorgql.o dorgqr.o dorgr2.o \ +- dorgrq.o dorgtr.o dorm2l.o dorm2r.o \ +- dormbr.o dormhr.o dorml2.o dormlq.o dormql.o dormqr.o dormr2.o \ +- dormr3.o dormrq.o dormrz.o dormtr.o dpbcon.o dpbequ.o dpbrfs.o \ +- dpbstf.o dpbsv.o dpbsvx.o \ +- dpbtf2.o dpbtrf.o dpbtrs.o dpocon.o dpoequ.o dporfs.o dposv.o \ +- dposvx.o dpotf2.o dpotrf.o dpotri.o dpotrs.o dppcon.o dppequ.o \ +- dpprfs.o dppsv.o dppsvx.o dpptrf.o dpptri.o dpptrs.o dptcon.o \ +- dpteqr.o dptrfs.o dptsv.o dptsvx.o dpttrs.o dptts2.o drscl.o \ +- dsbev.o dsbevd.o dsbevx.o dsbgst.o dsbgv.o dsbgvd.o dsbgvx.o \ +- dsbtrd.o dspcon.o dspev.o dspevd.o dspevx.o dspgst.o \ +- dspgv.o dspgvd.o dspgvx.o dsprfs.o dspsv.o dspsvx.o dsptrd.o \ +- dsptrf.o dsptri.o dsptrs.o dstegr.o dstein.o dstev.o dstevd.o dstevr.o \ +- dstevx.o dsycon.o dsyev.o dsyevd.o dsyevr.o \ +- dsyevx.o dsygs2.o dsygst.o dsygv.o dsygvd.o dsygvx.o dsyrfs.o \ +- dsysv.o dsysvx.o \ +- dsytd2.o dsytf2.o dsytrd.o dsytrf.o dsytri.o dsytrs.o dtbcon.o \ +- dtbrfs.o dtbtrs.o dtgevc.o dtgex2.o dtgexc.o dtgsen.o \ +- dtgsja.o dtgsna.o dtgsy2.o dtgsyl.o dtpcon.o dtprfs.o dtptri.o \ +- dtptrs.o \ +- dtrcon.o dtrevc.o dtrexc.o dtrrfs.o dtrsen.o dtrsna.o dtrsyl.o \ +- dtrti2.o dtrtri.o dtrtrs.o dtzrqf.o dtzrzf.o dstemr.o \ +- dsgesv.o dlag2s.o slag2d.o ++ dgbbrd.$(SUFFIX) dgbcon.$(SUFFIX) dgbequ.$(SUFFIX) dgbrfs.$(SUFFIX) dgbsv.$(SUFFIX) \ ++ dgbsvx.$(SUFFIX) dgbtf2.$(SUFFIX) dgbtrf.$(SUFFIX) dgbtrs.$(SUFFIX) dgebak.$(SUFFIX) dgebal.$(SUFFIX) dgebd2.$(SUFFIX) \ ++ dgebrd.$(SUFFIX) dgecon.$(SUFFIX) dgeequ.$(SUFFIX) dgees.$(SUFFIX) dgeesx.$(SUFFIX) dgeev.$(SUFFIX) dgeevx.$(SUFFIX) \ ++ dgegs.$(SUFFIX) dgegv.$(SUFFIX) dgehd2.$(SUFFIX) dgehrd.$(SUFFIX) dgelq2.$(SUFFIX) dgelqf.$(SUFFIX) \ ++ dgels.$(SUFFIX) dgelsd.$(SUFFIX) dgelss.$(SUFFIX) dgelsx.$(SUFFIX) dgelsy.$(SUFFIX) dgeql2.$(SUFFIX) dgeqlf.$(SUFFIX) \ ++ dgeqp3.$(SUFFIX) dgeqpf.$(SUFFIX) dgeqr2.$(SUFFIX) dgeqrf.$(SUFFIX) dgerfs.$(SUFFIX) dgerq2.$(SUFFIX) dgerqf.$(SUFFIX) \ ++ dgesc2.$(SUFFIX) dgesdd.$(SUFFIX) dgesvd.$(SUFFIX) dgesvx.$(SUFFIX) dgetc2.$(SUFFIX) \ ++ dgetri.$(SUFFIX) \ ++ dggbak.$(SUFFIX) dggbal.$(SUFFIX) dgges.$(SUFFIX) dggesx.$(SUFFIX) dggev.$(SUFFIX) dggevx.$(SUFFIX) \ ++ dggglm.$(SUFFIX) dgghrd.$(SUFFIX) dgglse.$(SUFFIX) dggqrf.$(SUFFIX) \ ++ dggrqf.$(SUFFIX) dggsvd.$(SUFFIX) dggsvp.$(SUFFIX) dgtcon.$(SUFFIX) dgtrfs.$(SUFFIX) dgtsv.$(SUFFIX) \ ++ dgtsvx.$(SUFFIX) dgttrf.$(SUFFIX) dgttrs.$(SUFFIX) dgtts2.$(SUFFIX) dhgeqz.$(SUFFIX) \ ++ dhsein.$(SUFFIX) dhseqr.$(SUFFIX) dlabrd.$(SUFFIX) dlacon.$(SUFFIX) dlacn2.$(SUFFIX) \ ++ dlaein.$(SUFFIX) dlaexc.$(SUFFIX) dlag2.$(SUFFIX) dlags2.$(SUFFIX) dlagtm.$(SUFFIX) dlagv2.$(SUFFIX) dlahqr.$(SUFFIX) \ ++ dlahrd.$(SUFFIX) dlahr2.$(SUFFIX) dlaic1.$(SUFFIX) dlaln2.$(SUFFIX) dlals0.$(SUFFIX) dlalsa.$(SUFFIX) dlalsd.$(SUFFIX) \ ++ dlangb.$(SUFFIX) dlange.$(SUFFIX) dlangt.$(SUFFIX) dlanhs.$(SUFFIX) dlansb.$(SUFFIX) dlansp.$(SUFFIX) \ ++ dlansy.$(SUFFIX) dlantb.$(SUFFIX) dlantp.$(SUFFIX) dlantr.$(SUFFIX) dlanv2.$(SUFFIX) \ ++ dlapll.$(SUFFIX) dlapmt.$(SUFFIX) \ ++ dlaqgb.$(SUFFIX) dlaqge.$(SUFFIX) dlaqp2.$(SUFFIX) dlaqps.$(SUFFIX) dlaqsb.$(SUFFIX) dlaqsp.$(SUFFIX) dlaqsy.$(SUFFIX) \ ++ dlaqr0.$(SUFFIX) dlaqr1.$(SUFFIX) dlaqr2.$(SUFFIX) dlaqr3.$(SUFFIX) dlaqr4.$(SUFFIX) dlaqr5.$(SUFFIX) \ ++ dlaqtr.$(SUFFIX) dlar1v.$(SUFFIX) dlar2v.$(SUFFIX) \ ++ dlarf.$(SUFFIX) dlarfb.$(SUFFIX) dlarfg.$(SUFFIX) dlarft.$(SUFFIX) dlarfx.$(SUFFIX) dlargv.$(SUFFIX) \ ++ dlarrv.$(SUFFIX) dlartv.$(SUFFIX) \ ++ dlarz.$(SUFFIX) dlarzb.$(SUFFIX) dlarzt.$(SUFFIX) dlasy2.$(SUFFIX) dlasyf.$(SUFFIX) \ ++ dlatbs.$(SUFFIX) dlatdf.$(SUFFIX) dlatps.$(SUFFIX) dlatrd.$(SUFFIX) dlatrs.$(SUFFIX) dlatrz.$(SUFFIX) dlatzm.$(SUFFIX) \ ++ dopgtr.$(SUFFIX) dopmtr.$(SUFFIX) dorg2l.$(SUFFIX) dorg2r.$(SUFFIX) \ ++ dorgbr.$(SUFFIX) dorghr.$(SUFFIX) dorgl2.$(SUFFIX) dorglq.$(SUFFIX) dorgql.$(SUFFIX) dorgqr.$(SUFFIX) dorgr2.$(SUFFIX) \ ++ dorgrq.$(SUFFIX) dorgtr.$(SUFFIX) dorm2l.$(SUFFIX) dorm2r.$(SUFFIX) \ ++ dormbr.$(SUFFIX) dormhr.$(SUFFIX) dorml2.$(SUFFIX) dormlq.$(SUFFIX) dormql.$(SUFFIX) dormqr.$(SUFFIX) dormr2.$(SUFFIX) \ ++ dormr3.$(SUFFIX) dormrq.$(SUFFIX) dormrz.$(SUFFIX) dormtr.$(SUFFIX) dpbcon.$(SUFFIX) dpbequ.$(SUFFIX) dpbrfs.$(SUFFIX) \ ++ dpbstf.$(SUFFIX) dpbsv.$(SUFFIX) dpbsvx.$(SUFFIX) \ ++ dpbtf2.$(SUFFIX) dpbtrf.$(SUFFIX) dpbtrs.$(SUFFIX) dpocon.$(SUFFIX) dpoequ.$(SUFFIX) dporfs.$(SUFFIX) dposv.$(SUFFIX) \ ++ dposvx.$(SUFFIX) dpotrs.$(SUFFIX) dppcon.$(SUFFIX) dppequ.$(SUFFIX) \ ++ dpprfs.$(SUFFIX) dppsv.$(SUFFIX) dppsvx.$(SUFFIX) dpptrf.$(SUFFIX) dpptri.$(SUFFIX) dpptrs.$(SUFFIX) dptcon.$(SUFFIX) \ ++ dpteqr.$(SUFFIX) dptrfs.$(SUFFIX) dptsv.$(SUFFIX) dptsvx.$(SUFFIX) dpttrs.$(SUFFIX) dptts2.$(SUFFIX) drscl.$(SUFFIX) \ ++ dsbev.$(SUFFIX) dsbevd.$(SUFFIX) dsbevx.$(SUFFIX) dsbgst.$(SUFFIX) dsbgv.$(SUFFIX) dsbgvd.$(SUFFIX) dsbgvx.$(SUFFIX) \ ++ dsbtrd.$(SUFFIX) dspcon.$(SUFFIX) dspev.$(SUFFIX) dspevd.$(SUFFIX) dspevx.$(SUFFIX) dspgst.$(SUFFIX) \ ++ dspgv.$(SUFFIX) dspgvd.$(SUFFIX) dspgvx.$(SUFFIX) dsprfs.$(SUFFIX) dspsv.$(SUFFIX) dspsvx.$(SUFFIX) dsptrd.$(SUFFIX) \ ++ dsptrf.$(SUFFIX) dsptri.$(SUFFIX) dsptrs.$(SUFFIX) dstegr.$(SUFFIX) dstein.$(SUFFIX) dstev.$(SUFFIX) dstevd.$(SUFFIX) dstevr.$(SUFFIX) \ ++ dstevx.$(SUFFIX) dsycon.$(SUFFIX) dsyev.$(SUFFIX) dsyevd.$(SUFFIX) dsyevr.$(SUFFIX) \ ++ dsyevx.$(SUFFIX) dsygs2.$(SUFFIX) dsygst.$(SUFFIX) dsygv.$(SUFFIX) dsygvd.$(SUFFIX) dsygvx.$(SUFFIX) dsyrfs.$(SUFFIX) \ ++ dsysv.$(SUFFIX) dsysvx.$(SUFFIX) \ ++ dsytd2.$(SUFFIX) dsytf2.$(SUFFIX) dsytrd.$(SUFFIX) dsytrf.$(SUFFIX) dsytri.$(SUFFIX) dsytrs.$(SUFFIX) dtbcon.$(SUFFIX) \ ++ dtbrfs.$(SUFFIX) dtbtrs.$(SUFFIX) dtgevc.$(SUFFIX) dtgex2.$(SUFFIX) dtgexc.$(SUFFIX) dtgsen.$(SUFFIX) \ ++ dtgsja.$(SUFFIX) dtgsna.$(SUFFIX) dtgsy2.$(SUFFIX) dtgsyl.$(SUFFIX) dtpcon.$(SUFFIX) dtprfs.$(SUFFIX) dtptri.$(SUFFIX) \ ++ dtptrs.$(SUFFIX) \ ++ dtrcon.$(SUFFIX) dtrevc.$(SUFFIX) dtrexc.$(SUFFIX) dtrrfs.$(SUFFIX) dtrsen.$(SUFFIX) dtrsna.$(SUFFIX) dtrsyl.$(SUFFIX) \ ++ dtrtrs.$(SUFFIX) dtzrqf.$(SUFFIX) dtzrzf.$(SUFFIX) dstemr.$(SUFFIX) \ ++ dsgesv.$(SUFFIX) dlag2s.$(SUFFIX) slag2d.$(SUFFIX) + + ZLASRC = \ +- zbdsqr.o zgbbrd.o zgbcon.o zgbequ.o zgbrfs.o zgbsv.o zgbsvx.o \ +- zgbtf2.o zgbtrf.o zgbtrs.o zgebak.o zgebal.o zgebd2.o zgebrd.o \ +- zgecon.o zgeequ.o zgees.o zgeesx.o zgeev.o zgeevx.o \ +- zgegs.o zgegv.o zgehd2.o zgehrd.o zgelq2.o zgelqf.o \ +- zgels.o zgelsd.o zgelss.o zgelsx.o zgelsy.o zgeql2.o zgeqlf.o zgeqp3.o \ +- zgeqpf.o zgeqr2.o zgeqrf.o zgerfs.o zgerq2.o zgerqf.o \ +- zgesc2.o zgesdd.o zgesv.o zgesvd.o zgesvx.o zgetc2.o zgetf2.o zgetrf.o \ +- zgetri.o zgetrs.o \ +- zggbak.o zggbal.o zgges.o zggesx.o zggev.o zggevx.o zggglm.o \ +- zgghrd.o zgglse.o zggqrf.o zggrqf.o \ +- zggsvd.o zggsvp.o \ +- zgtcon.o zgtrfs.o zgtsv.o zgtsvx.o zgttrf.o zgttrs.o zgtts2.o zhbev.o \ +- zhbevd.o zhbevx.o zhbgst.o zhbgv.o zhbgvd.o zhbgvx.o zhbtrd.o \ +- zhecon.o zheev.o zheevd.o zheevr.o zheevx.o zhegs2.o zhegst.o \ +- zhegv.o zhegvd.o zhegvx.o zherfs.o zhesv.o zhesvx.o zhetd2.o \ +- zhetf2.o zhetrd.o \ +- zhetrf.o zhetri.o zhetrs.o zhgeqz.o zhpcon.o zhpev.o zhpevd.o \ +- zhpevx.o zhpgst.o zhpgv.o zhpgvd.o zhpgvx.o zhprfs.o zhpsv.o \ +- zhpsvx.o \ +- zhptrd.o zhptrf.o zhptri.o zhptrs.o zhsein.o zhseqr.o zlabrd.o \ +- zlacgv.o zlacon.o zlacn2.o zlacp2.o zlacpy.o zlacrm.o zlacrt.o zladiv.o \ +- zlaed0.o zlaed7.o zlaed8.o \ +- zlaein.o zlaesy.o zlaev2.o zlags2.o zlagtm.o \ +- zlahef.o zlahqr.o \ +- zlahrd.o zlahr2.o zlaic1.o zlals0.o zlalsa.o zlalsd.o zlangb.o zlange.o \ +- zlangt.o zlanhb.o \ +- zlanhe.o \ +- zlanhp.o zlanhs.o zlanht.o zlansb.o zlansp.o zlansy.o zlantb.o \ +- zlantp.o zlantr.o zlapll.o zlapmt.o zlaqgb.o zlaqge.o \ +- zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqps.o zlaqsb.o \ +- zlaqr0.o zlaqr1.o zlaqr2.o zlaqr3.o zlaqr4.o zlaqr5.o \ +- zlaqsp.o zlaqsy.o zlar1v.o zlar2v.o zlarcm.o zlarf.o zlarfb.o \ +- zlarfg.o zlarft.o \ +- zlarfx.o zlargv.o zlarnv.o zlarrv.o zlartg.o zlartv.o \ +- zlarz.o zlarzb.o zlarzt.o zlascl.o zlaset.o zlasr.o \ +- zlassq.o zlaswp.o zlasyf.o \ +- zlatbs.o zlatdf.o zlatps.o zlatrd.o zlatrs.o zlatrz.o zlatzm.o zlauu2.o \ +- zlauum.o zpbcon.o zpbequ.o zpbrfs.o zpbstf.o zpbsv.o \ +- zpbsvx.o zpbtf2.o zpbtrf.o zpbtrs.o zpocon.o zpoequ.o zporfs.o \ +- zposv.o zposvx.o zpotf2.o zpotrf.o zpotri.o zpotrs.o zppcon.o \ +- zppequ.o zpprfs.o zppsv.o zppsvx.o zpptrf.o zpptri.o zpptrs.o \ +- zptcon.o zpteqr.o zptrfs.o zptsv.o zptsvx.o zpttrf.o zpttrs.o zptts2.o \ +- zrot.o zspcon.o zspmv.o zspr.o zsprfs.o zspsv.o \ +- zspsvx.o zsptrf.o zsptri.o zsptrs.o zdrscl.o zstedc.o \ +- zstegr.o zstein.o zsteqr.o zsycon.o zsymv.o \ +- zsyr.o zsyrfs.o zsysv.o zsysvx.o zsytf2.o zsytrf.o zsytri.o \ +- zsytrs.o ztbcon.o ztbrfs.o ztbtrs.o ztgevc.o ztgex2.o \ +- ztgexc.o ztgsen.o ztgsja.o ztgsna.o ztgsy2.o ztgsyl.o ztpcon.o \ +- ztprfs.o ztptri.o \ +- ztptrs.o ztrcon.o ztrevc.o ztrexc.o ztrrfs.o ztrsen.o ztrsna.o \ +- ztrsyl.o ztrti2.o ztrtri.o ztrtrs.o ztzrqf.o ztzrzf.o zung2l.o \ +- zung2r.o zungbr.o zunghr.o zungl2.o zunglq.o zungql.o zungqr.o zungr2.o \ +- zungrq.o zungtr.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o \ +- zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \ +- zunmtr.o zupgtr.o \ +- zupmtr.o izmax1.o dzsum1.o zstemr.o \ +- zcgesv.o zlag2c.o clag2z.o ++ zbdsqr.$(SUFFIX) zgbbrd.$(SUFFIX) zgbcon.$(SUFFIX) zgbequ.$(SUFFIX) zgbrfs.$(SUFFIX) zgbsv.$(SUFFIX) zgbsvx.$(SUFFIX) \ ++ zgbtf2.$(SUFFIX) zgbtrf.$(SUFFIX) zgbtrs.$(SUFFIX) zgebak.$(SUFFIX) zgebal.$(SUFFIX) zgebd2.$(SUFFIX) zgebrd.$(SUFFIX) \ ++ zgecon.$(SUFFIX) zgeequ.$(SUFFIX) zgees.$(SUFFIX) zgeesx.$(SUFFIX) zgeev.$(SUFFIX) zgeevx.$(SUFFIX) \ ++ zgegs.$(SUFFIX) zgegv.$(SUFFIX) zgehd2.$(SUFFIX) zgehrd.$(SUFFIX) zgelq2.$(SUFFIX) zgelqf.$(SUFFIX) \ ++ zgels.$(SUFFIX) zgelsd.$(SUFFIX) zgelss.$(SUFFIX) zgelsx.$(SUFFIX) zgelsy.$(SUFFIX) zgeql2.$(SUFFIX) zgeqlf.$(SUFFIX) zgeqp3.$(SUFFIX) \ ++ zgeqpf.$(SUFFIX) zgeqr2.$(SUFFIX) zgeqrf.$(SUFFIX) zgerfs.$(SUFFIX) zgerq2.$(SUFFIX) zgerqf.$(SUFFIX) \ ++ zgesc2.$(SUFFIX) zgesdd.$(SUFFIX) zgesvd.$(SUFFIX) zgesvx.$(SUFFIX) zgetc2.$(SUFFIX) \ ++ zgetri.$(SUFFIX) \ ++ zggbak.$(SUFFIX) zggbal.$(SUFFIX) zgges.$(SUFFIX) zggesx.$(SUFFIX) zggev.$(SUFFIX) zggevx.$(SUFFIX) zggglm.$(SUFFIX) \ ++ zgghrd.$(SUFFIX) zgglse.$(SUFFIX) zggqrf.$(SUFFIX) zggrqf.$(SUFFIX) \ ++ zggsvd.$(SUFFIX) zggsvp.$(SUFFIX) \ ++ zgtcon.$(SUFFIX) zgtrfs.$(SUFFIX) zgtsv.$(SUFFIX) zgtsvx.$(SUFFIX) zgttrf.$(SUFFIX) zgttrs.$(SUFFIX) zgtts2.$(SUFFIX) zhbev.$(SUFFIX) \ ++ zhbevd.$(SUFFIX) zhbevx.$(SUFFIX) zhbgst.$(SUFFIX) zhbgv.$(SUFFIX) zhbgvd.$(SUFFIX) zhbgvx.$(SUFFIX) zhbtrd.$(SUFFIX) \ ++ zhecon.$(SUFFIX) zheev.$(SUFFIX) zheevd.$(SUFFIX) zheevr.$(SUFFIX) zheevx.$(SUFFIX) zhegs2.$(SUFFIX) zhegst.$(SUFFIX) \ ++ zhegv.$(SUFFIX) zhegvd.$(SUFFIX) zhegvx.$(SUFFIX) zherfs.$(SUFFIX) zhesv.$(SUFFIX) zhesvx.$(SUFFIX) zhetd2.$(SUFFIX) \ ++ zhetf2.$(SUFFIX) zhetrd.$(SUFFIX) \ ++ zhetrf.$(SUFFIX) zhetri.$(SUFFIX) zhetrs.$(SUFFIX) zhgeqz.$(SUFFIX) zhpcon.$(SUFFIX) zhpev.$(SUFFIX) zhpevd.$(SUFFIX) \ ++ zhpevx.$(SUFFIX) zhpgst.$(SUFFIX) zhpgv.$(SUFFIX) zhpgvd.$(SUFFIX) zhpgvx.$(SUFFIX) zhprfs.$(SUFFIX) zhpsv.$(SUFFIX) \ ++ zhpsvx.$(SUFFIX) \ ++ zhptrd.$(SUFFIX) zhptrf.$(SUFFIX) zhptri.$(SUFFIX) zhptrs.$(SUFFIX) zhsein.$(SUFFIX) zhseqr.$(SUFFIX) zlabrd.$(SUFFIX) \ ++ zlacgv.$(SUFFIX) zlacon.$(SUFFIX) zlacn2.$(SUFFIX) zlacp2.$(SUFFIX) zlacpy.$(SUFFIX) zlacrm.$(SUFFIX) zlacrt.$(SUFFIX) zladiv.$(SUFFIX) \ ++ zlaed0.$(SUFFIX) zlaed7.$(SUFFIX) zlaed8.$(SUFFIX) \ ++ zlaein.$(SUFFIX) zlaesy.$(SUFFIX) zlaev2.$(SUFFIX) zlags2.$(SUFFIX) zlagtm.$(SUFFIX) \ ++ zlahef.$(SUFFIX) zlahqr.$(SUFFIX) \ ++ zlahrd.$(SUFFIX) zlahr2.$(SUFFIX) zlaic1.$(SUFFIX) zlals0.$(SUFFIX) zlalsa.$(SUFFIX) zlalsd.$(SUFFIX) zlangb.$(SUFFIX) zlange.$(SUFFIX) \ ++ zlangt.$(SUFFIX) zlanhb.$(SUFFIX) \ ++ zlanhe.$(SUFFIX) \ ++ zlanhp.$(SUFFIX) zlanhs.$(SUFFIX) zlanht.$(SUFFIX) zlansb.$(SUFFIX) zlansp.$(SUFFIX) zlansy.$(SUFFIX) zlantb.$(SUFFIX) \ ++ zlantp.$(SUFFIX) zlantr.$(SUFFIX) zlapll.$(SUFFIX) zlapmt.$(SUFFIX) zlaqgb.$(SUFFIX) zlaqge.$(SUFFIX) \ ++ zlaqhb.$(SUFFIX) zlaqhe.$(SUFFIX) zlaqhp.$(SUFFIX) zlaqp2.$(SUFFIX) zlaqps.$(SUFFIX) zlaqsb.$(SUFFIX) \ ++ zlaqr0.$(SUFFIX) zlaqr1.$(SUFFIX) zlaqr2.$(SUFFIX) zlaqr3.$(SUFFIX) zlaqr4.$(SUFFIX) zlaqr5.$(SUFFIX) \ ++ zlaqsp.$(SUFFIX) zlaqsy.$(SUFFIX) zlar1v.$(SUFFIX) zlar2v.$(SUFFIX) zlarcm.$(SUFFIX) zlarf.$(SUFFIX) zlarfb.$(SUFFIX) \ ++ zlarfg.$(SUFFIX) zlarft.$(SUFFIX) \ ++ zlarfx.$(SUFFIX) zlargv.$(SUFFIX) zlarnv.$(SUFFIX) zlarrv.$(SUFFIX) zlartg.$(SUFFIX) zlartv.$(SUFFIX) \ ++ zlarz.$(SUFFIX) zlarzb.$(SUFFIX) zlarzt.$(SUFFIX) zlascl.$(SUFFIX) zlaset.$(SUFFIX) zlasr.$(SUFFIX) \ ++ zlassq.$(SUFFIX) zlasyf.$(SUFFIX) \ ++ zlatbs.$(SUFFIX) zlatdf.$(SUFFIX) zlatps.$(SUFFIX) zlatrd.$(SUFFIX) zlatrs.$(SUFFIX) zlatrz.$(SUFFIX) zlatzm.$(SUFFIX) \ ++ zpbcon.$(SUFFIX) zpbequ.$(SUFFIX) zpbrfs.$(SUFFIX) zpbstf.$(SUFFIX) zpbsv.$(SUFFIX) \ ++ zpbsvx.$(SUFFIX) zpbtf2.$(SUFFIX) zpbtrf.$(SUFFIX) zpbtrs.$(SUFFIX) zpocon.$(SUFFIX) zpoequ.$(SUFFIX) zporfs.$(SUFFIX) \ ++ zposv.$(SUFFIX) zposvx.$(SUFFIX) zpotrs.$(SUFFIX) zppcon.$(SUFFIX) \ ++ zppequ.$(SUFFIX) zpprfs.$(SUFFIX) zppsv.$(SUFFIX) zppsvx.$(SUFFIX) zpptrf.$(SUFFIX) zpptri.$(SUFFIX) zpptrs.$(SUFFIX) \ ++ zptcon.$(SUFFIX) zpteqr.$(SUFFIX) zptrfs.$(SUFFIX) zptsv.$(SUFFIX) zptsvx.$(SUFFIX) zpttrf.$(SUFFIX) zpttrs.$(SUFFIX) zptts2.$(SUFFIX) \ ++ zrot.$(SUFFIX) zspcon.$(SUFFIX) zsprfs.$(SUFFIX) zspsv.$(SUFFIX) \ ++ zspsvx.$(SUFFIX) zsptrf.$(SUFFIX) zsptri.$(SUFFIX) zsptrs.$(SUFFIX) zdrscl.$(SUFFIX) zstedc.$(SUFFIX) \ ++ zstegr.$(SUFFIX) zstein.$(SUFFIX) zsteqr.$(SUFFIX) zsycon.$(SUFFIX) \ ++ zsyrfs.$(SUFFIX) zsysv.$(SUFFIX) zsysvx.$(SUFFIX) zsytf2.$(SUFFIX) zsytrf.$(SUFFIX) zsytri.$(SUFFIX) \ ++ zsytrs.$(SUFFIX) ztbcon.$(SUFFIX) ztbrfs.$(SUFFIX) ztbtrs.$(SUFFIX) ztgevc.$(SUFFIX) ztgex2.$(SUFFIX) \ ++ ztgexc.$(SUFFIX) ztgsen.$(SUFFIX) ztgsja.$(SUFFIX) ztgsna.$(SUFFIX) ztgsy2.$(SUFFIX) ztgsyl.$(SUFFIX) ztpcon.$(SUFFIX) \ ++ ztprfs.$(SUFFIX) ztptri.$(SUFFIX) \ ++ ztptrs.$(SUFFIX) ztrcon.$(SUFFIX) ztrevc.$(SUFFIX) ztrexc.$(SUFFIX) ztrrfs.$(SUFFIX) ztrsen.$(SUFFIX) ztrsna.$(SUFFIX) \ ++ ztrsyl.$(SUFFIX) ztrtrs.$(SUFFIX) ztzrqf.$(SUFFIX) ztzrzf.$(SUFFIX) zung2l.$(SUFFIX) \ ++ zung2r.$(SUFFIX) zungbr.$(SUFFIX) zunghr.$(SUFFIX) zungl2.$(SUFFIX) zunglq.$(SUFFIX) zungql.$(SUFFIX) zungqr.$(SUFFIX) zungr2.$(SUFFIX) \ ++ zungrq.$(SUFFIX) zungtr.$(SUFFIX) zunm2l.$(SUFFIX) zunm2r.$(SUFFIX) zunmbr.$(SUFFIX) zunmhr.$(SUFFIX) zunml2.$(SUFFIX) \ ++ zunmlq.$(SUFFIX) zunmql.$(SUFFIX) zunmqr.$(SUFFIX) zunmr2.$(SUFFIX) zunmr3.$(SUFFIX) zunmrq.$(SUFFIX) zunmrz.$(SUFFIX) \ ++ zunmtr.$(SUFFIX) zupgtr.$(SUFFIX) \ ++ zupmtr.$(SUFFIX) izmax1.$(SUFFIX) dzsum1.$(SUFFIX) zstemr.$(SUFFIX) \ ++ zcgesv.$(SUFFIX) zlag2c.$(SUFFIX) clag2z.$(SUFFIX) + + all: ../$(LAPACKLIB) + ++lapack_prof: ../$(LAPACKLIB_P) ++ + ALLOBJ=$(SLASRC) $(DLASRC) $(CLASRC) $(ZLASRC) $(SCLAUX) $(DZLAUX) \ + $(ALLAUX) + ++ALLOBJ_P = $(ALLOBJ:.$(SUFFIX)=.$(PSUFFIX)) ++ + ../$(LAPACKLIB): $(ALLOBJ) + $(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ) + $(RANLIB) $@ + ++../$(LAPACKLIB_P): $(ALLOBJ_P) ++ $(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ_P) ++ $(RANLIB) $@ ++ + single: $(SLASRC) $(ALLAUX) $(SCLAUX) + $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(SLASRC) $(ALLAUX) \ + $(SCLAUX) +@@ -317,6 +325,7 @@ + $(DZLAUX) + $(RANLIB) ../$(LAPACKLIB) + ++ + $(ALLAUX): $(FRC) + $(SCLAUX): $(FRC) + $(DZLAUX): $(FRC) +@@ -329,11 +338,16 @@ + @FRC=$(FRC) + + clean: +- rm -f *.o ++ rm -f *.$(SUFFIX) *.$(PSUFFIX) + +-.f.o: ++%.$(SUFFIX): %.f + $(FORTRAN) $(OPTS) -c $< -o $@ + +-slaruv.o: slaruv.f ; $(FORTRAN) $(NOOPT) -c $< -o $@ +-dlaruv.o: dlaruv.f ; $(FORTRAN) $(NOOPT) -c $< -o $@ ++%.$(PSUFFIX): %.f ++ $(FORTRAN) $(POPTS) -c $< -o $@ ++ ++slaruv.$(SUFFIX): slaruv.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@ ++dlaruv.$(SUFFIX): dlaruv.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@ + ++slaruv.$(PSUFFIX): slaruv.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@ ++dlaruv.$(PSUFFIX): dlaruv.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@ +diff -ruN lapack-3.1.1.old/TESTING/EIG/Makefile lapack-3.1.1/TESTING/EIG/Makefile +--- lapack-3.1.1.old/TESTING/EIG/Makefile 2007-02-20 15:33:03.000000000 -0600 ++++ lapack-3.1.1/TESTING/EIG/Makefile 2009-12-16 14:40:35.000000000 -0600 +@@ -78,7 +78,7 @@ + cget35.o cget36.o cget37.o cget38.o cget51.o cget52.o \ + cget54.o cglmts.o cgqrts.o cgrqts.o cgsvts.o \ + chbt21.o chet21.o chet22.o chpt21.o chst01.o \ +- clarfy.o clarhs.o clatm4.o clctes.o clctsx.o clsets.o csbmv.o \ ++ clarfy.o clarhs.o clatm4.o clctes.o clctsx.o clsets.o \ + csgt01.o cslect.o \ + cstt21.o cstt22.o cunt01.o cunt03.o + +@@ -115,7 +115,7 @@ + zget35.o zget36.o zget37.o zget38.o zget51.o zget52.o \ + zget54.o zglmts.o zgqrts.o zgrqts.o zgsvts.o \ + zhbt21.o zhet21.o zhet22.o zhpt21.o zhst01.o \ +- zlarfy.o zlarhs.o zlatm4.o zlctes.o zlctsx.o zlsets.o zsbmv.o \ ++ zlarfy.o zlarhs.o zlatm4.o zlctes.o zlctsx.o zlsets.o \ + zsgt01.o zslect.o \ + zstt21.o zstt22.o zunt01.o zunt03.o + +@@ -129,22 +129,22 @@ + ../xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) ; \ + $(LOADER) $(LOADOPTS) -o $@ \ + $(SEIGTST) $(SCIGTST) $(AEIGTST) ../../$(TMGLIB) \ +- ../../$(LAPACKLIB) $(BLASLIB) ++ ../../$(LAPACKLIB) $(BLASLIB) $(CEXTRALIB) + + ../xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) ; \ + $(LOADER) $(LOADOPTS) -o $@ \ + $(CEIGTST) $(SCIGTST) $(AEIGTST) ../../$(TMGLIB) \ +- ../../$(LAPACKLIB) $(BLASLIB) ++ ../../$(LAPACKLIB) $(BLASLIB) $(CEXTRALIB) + + ../xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) ; \ + $(LOADER) $(LOADOPTS) -o $@ \ + $(DEIGTST) $(DZIGTST) $(AEIGTST) ../../$(TMGLIB) \ +- ../../$(LAPACKLIB) $(BLASLIB) ++ ../../$(LAPACKLIB) $(BLASLIB) $(CEXTRALIB) + + ../xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) ; \ + $(LOADER) $(LOADOPTS) -o $@ \ + $(ZEIGTST) $(DZIGTST) $(AEIGTST) ../../$(TMGLIB) \ +- ../../$(LAPACKLIB) $(BLASLIB) ++ ../../$(LAPACKLIB) $(BLASLIB) $(CEXTRALIB) + + $(AEIGTST): $(FRC) + $(SCIGTST): $(FRC) +diff -ruN lapack-3.1.1.old/TESTING/LIN/Makefile lapack-3.1.1/TESTING/LIN/Makefile +--- lapack-3.1.1.old/TESTING/LIN/Makefile 2007-02-20 15:33:03.000000000 -0600 ++++ lapack-3.1.1/TESTING/LIN/Makefile 2009-12-16 14:40:35.000000000 -0600 +@@ -97,7 +97,7 @@ + cqpt01.o cqrt01.o cqrt02.o cqrt03.o cqrt11.o \ + cqrt12.o cqrt13.o cqrt14.o cqrt15.o cqrt16.o \ + cqrt17.o crqt01.o crqt02.o crqt03.o crzt01.o crzt02.o \ +- csbmv.o cspt01.o \ ++ cspt01.o \ + cspt02.o cspt03.o csyt01.o csyt02.o csyt03.o \ + ctbt02.o ctbt03.o ctbt05.o ctbt06.o ctpt01.o \ + ctpt02.o ctpt03.o ctpt05.o ctpt06.o ctrt01.o \ +@@ -159,7 +159,7 @@ + zqpt01.o zqrt01.o zqrt02.o zqrt03.o zqrt11.o \ + zqrt12.o zqrt13.o zqrt14.o zqrt15.o zqrt16.o \ + zqrt17.o zrqt01.o zrqt02.o zrqt03.o zrzt01.o zrzt02.o \ +- zsbmv.o zspt01.o \ ++ zspt01.o \ + zspt02.o zspt03.o zsyt01.o zsyt02.o zsyt03.o \ + ztbt02.o ztbt03.o ztbt05.o ztbt06.o ztpt01.o \ + ztpt02.o ztpt03.o ztpt05.o ztpt06.o ztrt01.o \ +@@ -176,7 +176,7 @@ + zdrvab.o zerrab.o zget08.o \ + alaerh.o alahd.o aladhd.o alareq.o \ + chkxer.o zget02.o zlarhs.o zlatb4.o \ +- zsbmv.o xerbla.o ++ xerbla.o + + all: single double complex complex16 proto-double proto-complex16 + +@@ -190,27 +190,27 @@ + + ../xlintsts : $(ALINTST) $(SLINTST) $(SCLNTST) + $(LOADER) $(LOADOPTS) $(ALINTST) $(SCLNTST) $(SLINTST) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + ../xlintstc : $(ALINTST) $(CLINTST) $(SCLNTST) + $(LOADER) $(LOADOPTS) $(ALINTST) $(SCLNTST) $(CLINTST) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + ../xlintstd : $(ALINTST) $(DLINTST) $(DZLNTST) + $(LOADER) $(LOADOPTS) $(ALINTST) $(DZLNTST) $(DLINTST) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + ../xlintstz : $(ALINTST) $(ZLINTST) $(DZLNTST) + $(LOADER) $(LOADOPTS) $(ALINTST) $(DZLNTST) $(ZLINTST) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + ../xlintstds : $(DSLINTST) + $(LOADER) $(LOADOPTS) $(DSLINTST) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + ../xlintstzc : $(ZCLINTST) + $(LOADER) $(LOADOPTS) $(ZCLINTST) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + $(ALINTST): $(FRC) + $(SCLNTST): $(FRC) diff --git a/quickbuild.32bit b/quickbuild.32bit new file mode 100755 index 0000000..b1b548a --- /dev/null +++ b/quickbuild.32bit @@ -0,0 +1,3 @@ +#!/bin/bash + +make -j 2 BINARY=32 diff --git a/quickbuild.64bit b/quickbuild.64bit new file mode 100755 index 0000000..fd313df --- /dev/null +++ b/quickbuild.64bit @@ -0,0 +1,3 @@ +#!/bin/bash + +make BINARY=64 diff --git a/quickbuild.win32 b/quickbuild.win32 new file mode 100755 index 0000000..29949c1 --- /dev/null +++ b/quickbuild.win32 @@ -0,0 +1,3 @@ +#!/bin/bash + +make BINARY=32 CC=gcc FC=gfortran diff --git a/quickbuild.win64 b/quickbuild.win64 new file mode 100755 index 0000000..88f748a --- /dev/null +++ b/quickbuild.win64 @@ -0,0 +1,3 @@ +#!/bin/bash + +make BINARY=64 CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran diff --git a/reference/._Makefile b/reference/._Makefile new file mode 100644 index 0000000..683cfa8 Binary files /dev/null and b/reference/._Makefile differ diff --git a/reference/._caxpycf.f b/reference/._caxpycf.f new file mode 100644 index 0000000..79aeab3 Binary files /dev/null and b/reference/._caxpycf.f differ diff --git a/reference/._caxpyf.f b/reference/._caxpyf.f new file mode 100644 index 0000000..c3a2bdf Binary files /dev/null and b/reference/._caxpyf.f differ diff --git a/reference/._ccopyf.f b/reference/._ccopyf.f new file mode 100644 index 0000000..b7d7657 Binary files /dev/null and b/reference/._ccopyf.f differ diff --git a/reference/._cdotcf.f b/reference/._cdotcf.f new file mode 100644 index 0000000..92e8d01 Binary files /dev/null and b/reference/._cdotcf.f differ diff --git a/reference/._cdotuf.f b/reference/._cdotuf.f new file mode 100644 index 0000000..c75382b Binary files /dev/null and b/reference/._cdotuf.f differ diff --git a/reference/._cgbmvf.f b/reference/._cgbmvf.f new file mode 100644 index 0000000..8b8297d Binary files /dev/null and b/reference/._cgbmvf.f differ diff --git a/reference/._cgemm3mf.f b/reference/._cgemm3mf.f new file mode 100644 index 0000000..1f8b2b4 Binary files /dev/null and b/reference/._cgemm3mf.f differ diff --git a/reference/._cgemmf.f b/reference/._cgemmf.f new file mode 100644 index 0000000..f9d7aa3 Binary files /dev/null and b/reference/._cgemmf.f differ diff --git a/reference/._cgemvf.f b/reference/._cgemvf.f new file mode 100644 index 0000000..dd272fd Binary files /dev/null and b/reference/._cgemvf.f differ diff --git a/reference/._cgercf.f b/reference/._cgercf.f new file mode 100644 index 0000000..6cb7eca Binary files /dev/null and b/reference/._cgercf.f differ diff --git a/reference/._cgeruf.f b/reference/._cgeruf.f new file mode 100644 index 0000000..a611fb2 Binary files /dev/null and b/reference/._cgeruf.f differ diff --git a/reference/._cgesvf.f b/reference/._cgesvf.f new file mode 100644 index 0000000..45bec5d Binary files /dev/null and b/reference/._cgesvf.f differ diff --git a/reference/._cgetf2f.f b/reference/._cgetf2f.f new file mode 100644 index 0000000..cba3a5b Binary files /dev/null and b/reference/._cgetf2f.f differ diff --git a/reference/._cgetrff.f b/reference/._cgetrff.f new file mode 100644 index 0000000..bec7bd6 Binary files /dev/null and b/reference/._cgetrff.f differ diff --git a/reference/._cgetrsf.f b/reference/._cgetrsf.f new file mode 100644 index 0000000..146964f Binary files /dev/null and b/reference/._cgetrsf.f differ diff --git a/reference/._chbmvf.f b/reference/._chbmvf.f new file mode 100644 index 0000000..dfddaa1 Binary files /dev/null and b/reference/._chbmvf.f differ diff --git a/reference/._chemm3mf.f b/reference/._chemm3mf.f new file mode 100644 index 0000000..289bd15 Binary files /dev/null and b/reference/._chemm3mf.f differ diff --git a/reference/._chemmf.f b/reference/._chemmf.f new file mode 100644 index 0000000..5d8c2b8 Binary files /dev/null and b/reference/._chemmf.f differ diff --git a/reference/._chemvf.f b/reference/._chemvf.f new file mode 100644 index 0000000..166d848 Binary files /dev/null and b/reference/._chemvf.f differ diff --git a/reference/._cher2f.f b/reference/._cher2f.f new file mode 100644 index 0000000..684501c Binary files /dev/null and b/reference/._cher2f.f differ diff --git a/reference/._cher2kf.f b/reference/._cher2kf.f new file mode 100644 index 0000000..c77e7a0 Binary files /dev/null and b/reference/._cher2kf.f differ diff --git a/reference/._cherf.f b/reference/._cherf.f new file mode 100644 index 0000000..3d45471 Binary files /dev/null and b/reference/._cherf.f differ diff --git a/reference/._cherkf.f b/reference/._cherkf.f new file mode 100644 index 0000000..509fbc0 Binary files /dev/null and b/reference/._cherkf.f differ diff --git a/reference/._chpmvf.f b/reference/._chpmvf.f new file mode 100644 index 0000000..4067452 Binary files /dev/null and b/reference/._chpmvf.f differ diff --git a/reference/._chpr2f.f b/reference/._chpr2f.f new file mode 100644 index 0000000..cd90639 Binary files /dev/null and b/reference/._chpr2f.f differ diff --git a/reference/._chprf.f b/reference/._chprf.f new file mode 100644 index 0000000..7217d57 Binary files /dev/null and b/reference/._chprf.f differ diff --git a/reference/._claswpf.f b/reference/._claswpf.f new file mode 100644 index 0000000..dbedbd0 Binary files /dev/null and b/reference/._claswpf.f differ diff --git a/reference/._clauu2f.f b/reference/._clauu2f.f new file mode 100644 index 0000000..d9ed38f Binary files /dev/null and b/reference/._clauu2f.f differ diff --git a/reference/._clauumf.f b/reference/._clauumf.f new file mode 100644 index 0000000..4609824 Binary files /dev/null and b/reference/._clauumf.f differ diff --git a/reference/._cpotf2f.f b/reference/._cpotf2f.f new file mode 100644 index 0000000..dea5dde Binary files /dev/null and b/reference/._cpotf2f.f differ diff --git a/reference/._cpotrff.f b/reference/._cpotrff.f new file mode 100644 index 0000000..c72625e Binary files /dev/null and b/reference/._cpotrff.f differ diff --git a/reference/._cpotrif.f b/reference/._cpotrif.f new file mode 100644 index 0000000..88672ee Binary files /dev/null and b/reference/._cpotrif.f differ diff --git a/reference/._crotgf.f b/reference/._crotgf.f new file mode 100644 index 0000000..37651a0 Binary files /dev/null and b/reference/._crotgf.f differ diff --git a/reference/._csbmvf.f b/reference/._csbmvf.f new file mode 100644 index 0000000..8606b78 Binary files /dev/null and b/reference/._csbmvf.f differ diff --git a/reference/._cscalf.f b/reference/._cscalf.f new file mode 100644 index 0000000..101607f Binary files /dev/null and b/reference/._cscalf.f differ diff --git a/reference/._cspmvf.f b/reference/._cspmvf.f new file mode 100644 index 0000000..5d19ce6 Binary files /dev/null and b/reference/._cspmvf.f differ diff --git a/reference/._cspr2f.f b/reference/._cspr2f.f new file mode 100644 index 0000000..9a8adfd Binary files /dev/null and b/reference/._cspr2f.f differ diff --git a/reference/._csprf.f b/reference/._csprf.f new file mode 100644 index 0000000..4c67c96 Binary files /dev/null and b/reference/._csprf.f differ diff --git a/reference/._csrotf.f b/reference/._csrotf.f new file mode 100644 index 0000000..d14d321 Binary files /dev/null and b/reference/._csrotf.f differ diff --git a/reference/._csscalf.f b/reference/._csscalf.f new file mode 100644 index 0000000..25357a1 Binary files /dev/null and b/reference/._csscalf.f differ diff --git a/reference/._cswapf.f b/reference/._cswapf.f new file mode 100644 index 0000000..5002d5f Binary files /dev/null and b/reference/._cswapf.f differ diff --git a/reference/._csymm3mf.f b/reference/._csymm3mf.f new file mode 100644 index 0000000..b054078 Binary files /dev/null and b/reference/._csymm3mf.f differ diff --git a/reference/._csymmf.f b/reference/._csymmf.f new file mode 100644 index 0000000..6ed778b Binary files /dev/null and b/reference/._csymmf.f differ diff --git a/reference/._csymvf.f b/reference/._csymvf.f new file mode 100644 index 0000000..9928e9a Binary files /dev/null and b/reference/._csymvf.f differ diff --git a/reference/._csyr2f.f b/reference/._csyr2f.f new file mode 100644 index 0000000..4f5bc98 Binary files /dev/null and b/reference/._csyr2f.f differ diff --git a/reference/._csyr2kf.f b/reference/._csyr2kf.f new file mode 100644 index 0000000..5aebe21 Binary files /dev/null and b/reference/._csyr2kf.f differ diff --git a/reference/._csyrf.f b/reference/._csyrf.f new file mode 100644 index 0000000..f74f890 Binary files /dev/null and b/reference/._csyrf.f differ diff --git a/reference/._csyrkf.f b/reference/._csyrkf.f new file mode 100644 index 0000000..73a91e8 Binary files /dev/null and b/reference/._csyrkf.f differ diff --git a/reference/._ctbmvf.f b/reference/._ctbmvf.f new file mode 100644 index 0000000..274317f Binary files /dev/null and b/reference/._ctbmvf.f differ diff --git a/reference/._ctbsvf.f b/reference/._ctbsvf.f new file mode 100644 index 0000000..763215b Binary files /dev/null and b/reference/._ctbsvf.f differ diff --git a/reference/._ctpmvf.f b/reference/._ctpmvf.f new file mode 100644 index 0000000..659da17 Binary files /dev/null and b/reference/._ctpmvf.f differ diff --git a/reference/._ctpsvf.f b/reference/._ctpsvf.f new file mode 100644 index 0000000..b55e769 Binary files /dev/null and b/reference/._ctpsvf.f differ diff --git a/reference/._ctrmmf.f b/reference/._ctrmmf.f new file mode 100644 index 0000000..0572a8c Binary files /dev/null and b/reference/._ctrmmf.f differ diff --git a/reference/._ctrmvf.f b/reference/._ctrmvf.f new file mode 100644 index 0000000..8fa0754 Binary files /dev/null and b/reference/._ctrmvf.f differ diff --git a/reference/._ctrsmf.f b/reference/._ctrsmf.f new file mode 100644 index 0000000..56d873b Binary files /dev/null and b/reference/._ctrsmf.f differ diff --git a/reference/._ctrsvf.f b/reference/._ctrsvf.f new file mode 100644 index 0000000..1eca8a5 Binary files /dev/null and b/reference/._ctrsvf.f differ diff --git a/reference/._ctrti2f.f b/reference/._ctrti2f.f new file mode 100644 index 0000000..3bba354 Binary files /dev/null and b/reference/._ctrti2f.f differ diff --git a/reference/._ctrtrif.f b/reference/._ctrtrif.f new file mode 100644 index 0000000..0f1719b Binary files /dev/null and b/reference/._ctrtrif.f differ diff --git a/reference/._damaxf.f b/reference/._damaxf.f new file mode 100644 index 0000000..288ffb9 Binary files /dev/null and b/reference/._damaxf.f differ diff --git a/reference/._daminf.f b/reference/._daminf.f new file mode 100644 index 0000000..91475a1 Binary files /dev/null and b/reference/._daminf.f differ diff --git a/reference/._dasumf.f b/reference/._dasumf.f new file mode 100644 index 0000000..aabd51c Binary files /dev/null and b/reference/._dasumf.f differ diff --git a/reference/._daxpyf.f b/reference/._daxpyf.f new file mode 100644 index 0000000..7e8bc76 Binary files /dev/null and b/reference/._daxpyf.f differ diff --git a/reference/._dcopyf.f b/reference/._dcopyf.f new file mode 100644 index 0000000..b0c9744 Binary files /dev/null and b/reference/._dcopyf.f differ diff --git a/reference/._ddotf.f b/reference/._ddotf.f new file mode 100644 index 0000000..7a211b7 Binary files /dev/null and b/reference/._ddotf.f differ diff --git a/reference/._dgbmvf.f b/reference/._dgbmvf.f new file mode 100644 index 0000000..e8bb4f6 Binary files /dev/null and b/reference/._dgbmvf.f differ diff --git a/reference/._dgemmf.f b/reference/._dgemmf.f new file mode 100644 index 0000000..0677523 Binary files /dev/null and b/reference/._dgemmf.f differ diff --git a/reference/._dgemvf.f b/reference/._dgemvf.f new file mode 100644 index 0000000..b420c89 Binary files /dev/null and b/reference/._dgemvf.f differ diff --git a/reference/._dgerf.f b/reference/._dgerf.f new file mode 100644 index 0000000..15d74c4 Binary files /dev/null and b/reference/._dgerf.f differ diff --git a/reference/._dgesvf.f b/reference/._dgesvf.f new file mode 100644 index 0000000..aec6524 Binary files /dev/null and b/reference/._dgesvf.f differ diff --git a/reference/._dgetf2f.f b/reference/._dgetf2f.f new file mode 100644 index 0000000..842b7e8 Binary files /dev/null and b/reference/._dgetf2f.f differ diff --git a/reference/._dgetrff.f b/reference/._dgetrff.f new file mode 100644 index 0000000..0aa9375 Binary files /dev/null and b/reference/._dgetrff.f differ diff --git a/reference/._dgetrsf.f b/reference/._dgetrsf.f new file mode 100644 index 0000000..6da0c14 Binary files /dev/null and b/reference/._dgetrsf.f differ diff --git a/reference/._dlaswpf.f b/reference/._dlaswpf.f new file mode 100644 index 0000000..75b2525 Binary files /dev/null and b/reference/._dlaswpf.f differ diff --git a/reference/._dlauu2f.f b/reference/._dlauu2f.f new file mode 100644 index 0000000..7edf126 Binary files /dev/null and b/reference/._dlauu2f.f differ diff --git a/reference/._dlauumf.f b/reference/._dlauumf.f new file mode 100644 index 0000000..249ed8b Binary files /dev/null and b/reference/._dlauumf.f differ diff --git a/reference/._dmaxf.f b/reference/._dmaxf.f new file mode 100644 index 0000000..d586e1d Binary files /dev/null and b/reference/._dmaxf.f differ diff --git a/reference/._dminf.f b/reference/._dminf.f new file mode 100644 index 0000000..0c95979 Binary files /dev/null and b/reference/._dminf.f differ diff --git a/reference/._dnrm2f.f b/reference/._dnrm2f.f new file mode 100644 index 0000000..5848969 Binary files /dev/null and b/reference/._dnrm2f.f differ diff --git a/reference/._dpotf2f.f b/reference/._dpotf2f.f new file mode 100644 index 0000000..d253596 Binary files /dev/null and b/reference/._dpotf2f.f differ diff --git a/reference/._dpotrff.f b/reference/._dpotrff.f new file mode 100644 index 0000000..697564d Binary files /dev/null and b/reference/._dpotrff.f differ diff --git a/reference/._dpotrif.f b/reference/._dpotrif.f new file mode 100644 index 0000000..ab1e12c Binary files /dev/null and b/reference/._dpotrif.f differ diff --git a/reference/._drotf.f b/reference/._drotf.f new file mode 100644 index 0000000..ec448a8 Binary files /dev/null and b/reference/._drotf.f differ diff --git a/reference/._drotgf.f b/reference/._drotgf.f new file mode 100644 index 0000000..21e9cb7 Binary files /dev/null and b/reference/._drotgf.f differ diff --git a/reference/._drotmf.f b/reference/._drotmf.f new file mode 100644 index 0000000..beada83 Binary files /dev/null and b/reference/._drotmf.f differ diff --git a/reference/._drotmgf.f b/reference/._drotmgf.f new file mode 100644 index 0000000..fc152ef Binary files /dev/null and b/reference/._drotmgf.f differ diff --git a/reference/._dsbmvf.f b/reference/._dsbmvf.f new file mode 100644 index 0000000..0a82046 Binary files /dev/null and b/reference/._dsbmvf.f differ diff --git a/reference/._dscalf.f b/reference/._dscalf.f new file mode 100644 index 0000000..1d0b635 Binary files /dev/null and b/reference/._dscalf.f differ diff --git a/reference/._dsdotf.f b/reference/._dsdotf.f new file mode 100644 index 0000000..d5c0299 Binary files /dev/null and b/reference/._dsdotf.f differ diff --git a/reference/._dspmvf.f b/reference/._dspmvf.f new file mode 100644 index 0000000..b2e8730 Binary files /dev/null and b/reference/._dspmvf.f differ diff --git a/reference/._dspr2f.f b/reference/._dspr2f.f new file mode 100644 index 0000000..f193118 Binary files /dev/null and b/reference/._dspr2f.f differ diff --git a/reference/._dsprf.f b/reference/._dsprf.f new file mode 100644 index 0000000..c42473a Binary files /dev/null and b/reference/._dsprf.f differ diff --git a/reference/._dswapf.f b/reference/._dswapf.f new file mode 100644 index 0000000..d2e11d0 Binary files /dev/null and b/reference/._dswapf.f differ diff --git a/reference/._dsymmf.f b/reference/._dsymmf.f new file mode 100644 index 0000000..35a821a Binary files /dev/null and b/reference/._dsymmf.f differ diff --git a/reference/._dsymvf.f b/reference/._dsymvf.f new file mode 100644 index 0000000..1cb27c2 Binary files /dev/null and b/reference/._dsymvf.f differ diff --git a/reference/._dsyr2f.f b/reference/._dsyr2f.f new file mode 100644 index 0000000..a176bf8 Binary files /dev/null and b/reference/._dsyr2f.f differ diff --git a/reference/._dsyr2kf.f b/reference/._dsyr2kf.f new file mode 100644 index 0000000..8518c12 Binary files /dev/null and b/reference/._dsyr2kf.f differ diff --git a/reference/._dsyrf.f b/reference/._dsyrf.f new file mode 100644 index 0000000..337d97b Binary files /dev/null and b/reference/._dsyrf.f differ diff --git a/reference/._dsyrkf.f b/reference/._dsyrkf.f new file mode 100644 index 0000000..1c58871 Binary files /dev/null and b/reference/._dsyrkf.f differ diff --git a/reference/._dtbmvf.f b/reference/._dtbmvf.f new file mode 100644 index 0000000..546d0dd Binary files /dev/null and b/reference/._dtbmvf.f differ diff --git a/reference/._dtbsvf.f b/reference/._dtbsvf.f new file mode 100644 index 0000000..e886883 Binary files /dev/null and b/reference/._dtbsvf.f differ diff --git a/reference/._dtpmvf.f b/reference/._dtpmvf.f new file mode 100644 index 0000000..e928fc6 Binary files /dev/null and b/reference/._dtpmvf.f differ diff --git a/reference/._dtpsvf.f b/reference/._dtpsvf.f new file mode 100644 index 0000000..83e46aa Binary files /dev/null and b/reference/._dtpsvf.f differ diff --git a/reference/._dtrmmf.f b/reference/._dtrmmf.f new file mode 100644 index 0000000..e77f7fa Binary files /dev/null and b/reference/._dtrmmf.f differ diff --git a/reference/._dtrmvf.f b/reference/._dtrmvf.f new file mode 100644 index 0000000..01a171e Binary files /dev/null and b/reference/._dtrmvf.f differ diff --git a/reference/._dtrsmf.f b/reference/._dtrsmf.f new file mode 100644 index 0000000..342b841 Binary files /dev/null and b/reference/._dtrsmf.f differ diff --git a/reference/._dtrsvf.f b/reference/._dtrsvf.f new file mode 100644 index 0000000..1aea233 Binary files /dev/null and b/reference/._dtrsvf.f differ diff --git a/reference/._dtrti2f.f b/reference/._dtrti2f.f new file mode 100644 index 0000000..40d3d3c Binary files /dev/null and b/reference/._dtrti2f.f differ diff --git a/reference/._dtrtrif.f b/reference/._dtrtrif.f new file mode 100644 index 0000000..bcc0657 Binary files /dev/null and b/reference/._dtrtrif.f differ diff --git a/reference/._dzamaxf.f b/reference/._dzamaxf.f new file mode 100644 index 0000000..6207aea Binary files /dev/null and b/reference/._dzamaxf.f differ diff --git a/reference/._dzaminf.f b/reference/._dzaminf.f new file mode 100644 index 0000000..7295e39 Binary files /dev/null and b/reference/._dzaminf.f differ diff --git a/reference/._dzasumf.f b/reference/._dzasumf.f new file mode 100644 index 0000000..050a288 Binary files /dev/null and b/reference/._dzasumf.f differ diff --git a/reference/._dznrm2f.f b/reference/._dznrm2f.f new file mode 100644 index 0000000..678d817 Binary files /dev/null and b/reference/._dznrm2f.f differ diff --git a/reference/._icamaxf.f b/reference/._icamaxf.f new file mode 100644 index 0000000..ad4f8db Binary files /dev/null and b/reference/._icamaxf.f differ diff --git a/reference/._icaminf.f b/reference/._icaminf.f new file mode 100644 index 0000000..df5b056 Binary files /dev/null and b/reference/._icaminf.f differ diff --git a/reference/._idamaxf.f b/reference/._idamaxf.f new file mode 100644 index 0000000..9d378a1 Binary files /dev/null and b/reference/._idamaxf.f differ diff --git a/reference/._idaminf.f b/reference/._idaminf.f new file mode 100644 index 0000000..f0ccdb8 Binary files /dev/null and b/reference/._idaminf.f differ diff --git a/reference/._idmaxf.f b/reference/._idmaxf.f new file mode 100644 index 0000000..389559f Binary files /dev/null and b/reference/._idmaxf.f differ diff --git a/reference/._idminf.f b/reference/._idminf.f new file mode 100644 index 0000000..7aa0ae3 Binary files /dev/null and b/reference/._idminf.f differ diff --git a/reference/._iqamaxf.f b/reference/._iqamaxf.f new file mode 100644 index 0000000..6a161ab Binary files /dev/null and b/reference/._iqamaxf.f differ diff --git a/reference/._iqaminf.f b/reference/._iqaminf.f new file mode 100644 index 0000000..4567643 Binary files /dev/null and b/reference/._iqaminf.f differ diff --git a/reference/._iqmaxf.f b/reference/._iqmaxf.f new file mode 100644 index 0000000..074aec3 Binary files /dev/null and b/reference/._iqmaxf.f differ diff --git a/reference/._iqminf.f b/reference/._iqminf.f new file mode 100644 index 0000000..668b021 Binary files /dev/null and b/reference/._iqminf.f differ diff --git a/reference/._isamaxf.f b/reference/._isamaxf.f new file mode 100644 index 0000000..a2fb25a Binary files /dev/null and b/reference/._isamaxf.f differ diff --git a/reference/._isaminf.f b/reference/._isaminf.f new file mode 100644 index 0000000..d9710a3 Binary files /dev/null and b/reference/._isaminf.f differ diff --git a/reference/._ismaxf.f b/reference/._ismaxf.f new file mode 100644 index 0000000..d4e8ec3 Binary files /dev/null and b/reference/._ismaxf.f differ diff --git a/reference/._isminf.f b/reference/._isminf.f new file mode 100644 index 0000000..a9bd2fa Binary files /dev/null and b/reference/._isminf.f differ diff --git a/reference/._ixamaxf.f b/reference/._ixamaxf.f new file mode 100644 index 0000000..6d98cfa Binary files /dev/null and b/reference/._ixamaxf.f differ diff --git a/reference/._ixaminf.f b/reference/._ixaminf.f new file mode 100644 index 0000000..7d884b1 Binary files /dev/null and b/reference/._ixaminf.f differ diff --git a/reference/._izamaxf.f b/reference/._izamaxf.f new file mode 100644 index 0000000..79bcea4 Binary files /dev/null and b/reference/._izamaxf.f differ diff --git a/reference/._izaminf.f b/reference/._izaminf.f new file mode 100644 index 0000000..1e90126 Binary files /dev/null and b/reference/._izaminf.f differ diff --git a/reference/._lsamef.f b/reference/._lsamef.f new file mode 100644 index 0000000..d27469d Binary files /dev/null and b/reference/._lsamef.f differ diff --git a/reference/._samaxf.f b/reference/._samaxf.f new file mode 100644 index 0000000..9322348 Binary files /dev/null and b/reference/._samaxf.f differ diff --git a/reference/._saminf.f b/reference/._saminf.f new file mode 100644 index 0000000..500411d Binary files /dev/null and b/reference/._saminf.f differ diff --git a/reference/._sasumf.f b/reference/._sasumf.f new file mode 100644 index 0000000..92ffc5a Binary files /dev/null and b/reference/._sasumf.f differ diff --git a/reference/._saxpyf.f b/reference/._saxpyf.f new file mode 100644 index 0000000..6bf1e78 Binary files /dev/null and b/reference/._saxpyf.f differ diff --git a/reference/._scamaxf.f b/reference/._scamaxf.f new file mode 100644 index 0000000..4d12024 Binary files /dev/null and b/reference/._scamaxf.f differ diff --git a/reference/._scaminf.f b/reference/._scaminf.f new file mode 100644 index 0000000..aa19931 Binary files /dev/null and b/reference/._scaminf.f differ diff --git a/reference/._scasumf.f b/reference/._scasumf.f new file mode 100644 index 0000000..47d58cb Binary files /dev/null and b/reference/._scasumf.f differ diff --git a/reference/._scnrm2f.f b/reference/._scnrm2f.f new file mode 100644 index 0000000..d9a0865 Binary files /dev/null and b/reference/._scnrm2f.f differ diff --git a/reference/._scopyf.f b/reference/._scopyf.f new file mode 100644 index 0000000..ca56a75 Binary files /dev/null and b/reference/._scopyf.f differ diff --git a/reference/._sdotf.f b/reference/._sdotf.f new file mode 100644 index 0000000..c074726 Binary files /dev/null and b/reference/._sdotf.f differ diff --git a/reference/._sdsdotf.f b/reference/._sdsdotf.f new file mode 100644 index 0000000..4a0e45c Binary files /dev/null and b/reference/._sdsdotf.f differ diff --git a/reference/._sgbmvf.f b/reference/._sgbmvf.f new file mode 100644 index 0000000..a0abf6c Binary files /dev/null and b/reference/._sgbmvf.f differ diff --git a/reference/._sgemmf.f b/reference/._sgemmf.f new file mode 100644 index 0000000..c00403d Binary files /dev/null and b/reference/._sgemmf.f differ diff --git a/reference/._sgemvf.f b/reference/._sgemvf.f new file mode 100644 index 0000000..66962f9 Binary files /dev/null and b/reference/._sgemvf.f differ diff --git a/reference/._sgerf.f b/reference/._sgerf.f new file mode 100644 index 0000000..c36c5a5 Binary files /dev/null and b/reference/._sgerf.f differ diff --git a/reference/._sgesvf.f b/reference/._sgesvf.f new file mode 100644 index 0000000..029457e Binary files /dev/null and b/reference/._sgesvf.f differ diff --git a/reference/._sgetf2f.f b/reference/._sgetf2f.f new file mode 100644 index 0000000..a29befd Binary files /dev/null and b/reference/._sgetf2f.f differ diff --git a/reference/._sgetrff.f b/reference/._sgetrff.f new file mode 100644 index 0000000..55a2dc9 Binary files /dev/null and b/reference/._sgetrff.f differ diff --git a/reference/._sgetrsf.f b/reference/._sgetrsf.f new file mode 100644 index 0000000..de992b5 Binary files /dev/null and b/reference/._sgetrsf.f differ diff --git a/reference/._slaswpf.f b/reference/._slaswpf.f new file mode 100644 index 0000000..3b73baa Binary files /dev/null and b/reference/._slaswpf.f differ diff --git a/reference/._slauu2f.f b/reference/._slauu2f.f new file mode 100644 index 0000000..649dac2 Binary files /dev/null and b/reference/._slauu2f.f differ diff --git a/reference/._slauumf.f b/reference/._slauumf.f new file mode 100644 index 0000000..b1d49b4 Binary files /dev/null and b/reference/._slauumf.f differ diff --git a/reference/._smaxf.f b/reference/._smaxf.f new file mode 100644 index 0000000..8baebba Binary files /dev/null and b/reference/._smaxf.f differ diff --git a/reference/._sminf.f b/reference/._sminf.f new file mode 100644 index 0000000..eaac1eb Binary files /dev/null and b/reference/._sminf.f differ diff --git a/reference/._snrm2f.f b/reference/._snrm2f.f new file mode 100644 index 0000000..3eec47d Binary files /dev/null and b/reference/._snrm2f.f differ diff --git a/reference/._spotf2f.f b/reference/._spotf2f.f new file mode 100644 index 0000000..a775090 Binary files /dev/null and b/reference/._spotf2f.f differ diff --git a/reference/._spotrff.f b/reference/._spotrff.f new file mode 100644 index 0000000..ec57b0d Binary files /dev/null and b/reference/._spotrff.f differ diff --git a/reference/._spotrif.f b/reference/._spotrif.f new file mode 100644 index 0000000..e0e779c Binary files /dev/null and b/reference/._spotrif.f differ diff --git a/reference/._srotf.f b/reference/._srotf.f new file mode 100644 index 0000000..b62efda Binary files /dev/null and b/reference/._srotf.f differ diff --git a/reference/._srotgf.f b/reference/._srotgf.f new file mode 100644 index 0000000..3088d7e Binary files /dev/null and b/reference/._srotgf.f differ diff --git a/reference/._srotmf.f b/reference/._srotmf.f new file mode 100644 index 0000000..ef6d512 Binary files /dev/null and b/reference/._srotmf.f differ diff --git a/reference/._srotmgf.f b/reference/._srotmgf.f new file mode 100644 index 0000000..c4f2bf1 Binary files /dev/null and b/reference/._srotmgf.f differ diff --git a/reference/._ssbmvf.f b/reference/._ssbmvf.f new file mode 100644 index 0000000..2caed2f Binary files /dev/null and b/reference/._ssbmvf.f differ diff --git a/reference/._sscalf.f b/reference/._sscalf.f new file mode 100644 index 0000000..70f89db Binary files /dev/null and b/reference/._sscalf.f differ diff --git a/reference/._sspmvf.f b/reference/._sspmvf.f new file mode 100644 index 0000000..5f90310 Binary files /dev/null and b/reference/._sspmvf.f differ diff --git a/reference/._sspr2f.f b/reference/._sspr2f.f new file mode 100644 index 0000000..7840723 Binary files /dev/null and b/reference/._sspr2f.f differ diff --git a/reference/._ssprf.f b/reference/._ssprf.f new file mode 100644 index 0000000..8af1a04 Binary files /dev/null and b/reference/._ssprf.f differ diff --git a/reference/._sswapf.f b/reference/._sswapf.f new file mode 100644 index 0000000..b3f098d Binary files /dev/null and b/reference/._sswapf.f differ diff --git a/reference/._ssymmf.f b/reference/._ssymmf.f new file mode 100644 index 0000000..37e2b17 Binary files /dev/null and b/reference/._ssymmf.f differ diff --git a/reference/._ssymvf.f b/reference/._ssymvf.f new file mode 100644 index 0000000..1c4ab03 Binary files /dev/null and b/reference/._ssymvf.f differ diff --git a/reference/._ssyr2f.f b/reference/._ssyr2f.f new file mode 100644 index 0000000..997ba42 Binary files /dev/null and b/reference/._ssyr2f.f differ diff --git a/reference/._ssyr2kf.f b/reference/._ssyr2kf.f new file mode 100644 index 0000000..1b11be9 Binary files /dev/null and b/reference/._ssyr2kf.f differ diff --git a/reference/._ssyrf.f b/reference/._ssyrf.f new file mode 100644 index 0000000..e4574b4 Binary files /dev/null and b/reference/._ssyrf.f differ diff --git a/reference/._ssyrkf.f b/reference/._ssyrkf.f new file mode 100644 index 0000000..9e9ec4f Binary files /dev/null and b/reference/._ssyrkf.f differ diff --git a/reference/._stbmvf.f b/reference/._stbmvf.f new file mode 100644 index 0000000..15280ec Binary files /dev/null and b/reference/._stbmvf.f differ diff --git a/reference/._stbsvf.f b/reference/._stbsvf.f new file mode 100644 index 0000000..dd0e9fb Binary files /dev/null and b/reference/._stbsvf.f differ diff --git a/reference/._stpmvf.f b/reference/._stpmvf.f new file mode 100644 index 0000000..ada36a0 Binary files /dev/null and b/reference/._stpmvf.f differ diff --git a/reference/._stpsvf.f b/reference/._stpsvf.f new file mode 100644 index 0000000..b038d3c Binary files /dev/null and b/reference/._stpsvf.f differ diff --git a/reference/._strmmf.f b/reference/._strmmf.f new file mode 100644 index 0000000..8f49076 Binary files /dev/null and b/reference/._strmmf.f differ diff --git a/reference/._strmvf.f b/reference/._strmvf.f new file mode 100644 index 0000000..cd23535 Binary files /dev/null and b/reference/._strmvf.f differ diff --git a/reference/._strsmf.f b/reference/._strsmf.f new file mode 100644 index 0000000..2bc8cec Binary files /dev/null and b/reference/._strsmf.f differ diff --git a/reference/._strsvf.f b/reference/._strsvf.f new file mode 100644 index 0000000..48d1c44 Binary files /dev/null and b/reference/._strsvf.f differ diff --git a/reference/._strti2f.f b/reference/._strti2f.f new file mode 100644 index 0000000..b9f88e6 Binary files /dev/null and b/reference/._strti2f.f differ diff --git a/reference/._strtrif.f b/reference/._strtrif.f new file mode 100644 index 0000000..fe7cb56 Binary files /dev/null and b/reference/._strtrif.f differ diff --git a/reference/._zaxpycf.f b/reference/._zaxpycf.f new file mode 100644 index 0000000..0454f7a Binary files /dev/null and b/reference/._zaxpycf.f differ diff --git a/reference/._zaxpyf.f b/reference/._zaxpyf.f new file mode 100644 index 0000000..5e3dac8 Binary files /dev/null and b/reference/._zaxpyf.f differ diff --git a/reference/._zcopyf.f b/reference/._zcopyf.f new file mode 100644 index 0000000..cea5c93 Binary files /dev/null and b/reference/._zcopyf.f differ diff --git a/reference/._zdotcf.f b/reference/._zdotcf.f new file mode 100644 index 0000000..de2f077 Binary files /dev/null and b/reference/._zdotcf.f differ diff --git a/reference/._zdotuf.f b/reference/._zdotuf.f new file mode 100644 index 0000000..021ba6f Binary files /dev/null and b/reference/._zdotuf.f differ diff --git a/reference/._zdrotf.f b/reference/._zdrotf.f new file mode 100644 index 0000000..05013ee Binary files /dev/null and b/reference/._zdrotf.f differ diff --git a/reference/._zdscalf.f b/reference/._zdscalf.f new file mode 100644 index 0000000..1e3aa42 Binary files /dev/null and b/reference/._zdscalf.f differ diff --git a/reference/._zgbmvf.f b/reference/._zgbmvf.f new file mode 100644 index 0000000..f674276 Binary files /dev/null and b/reference/._zgbmvf.f differ diff --git a/reference/._zgemm3mf.f b/reference/._zgemm3mf.f new file mode 100644 index 0000000..b6c1173 Binary files /dev/null and b/reference/._zgemm3mf.f differ diff --git a/reference/._zgemmf.f b/reference/._zgemmf.f new file mode 100644 index 0000000..8fc5a43 Binary files /dev/null and b/reference/._zgemmf.f differ diff --git a/reference/._zgemvf.f b/reference/._zgemvf.f new file mode 100644 index 0000000..9561078 Binary files /dev/null and b/reference/._zgemvf.f differ diff --git a/reference/._zgercf.f b/reference/._zgercf.f new file mode 100644 index 0000000..c03de5b Binary files /dev/null and b/reference/._zgercf.f differ diff --git a/reference/._zgeruf.f b/reference/._zgeruf.f new file mode 100644 index 0000000..64d1100 Binary files /dev/null and b/reference/._zgeruf.f differ diff --git a/reference/._zgesvf.f b/reference/._zgesvf.f new file mode 100644 index 0000000..3bb01d5 Binary files /dev/null and b/reference/._zgesvf.f differ diff --git a/reference/._zgetf2f.f b/reference/._zgetf2f.f new file mode 100644 index 0000000..302554f Binary files /dev/null and b/reference/._zgetf2f.f differ diff --git a/reference/._zgetrff.f b/reference/._zgetrff.f new file mode 100644 index 0000000..5a2342f Binary files /dev/null and b/reference/._zgetrff.f differ diff --git a/reference/._zgetrsf.f b/reference/._zgetrsf.f new file mode 100644 index 0000000..3f6cb5c Binary files /dev/null and b/reference/._zgetrsf.f differ diff --git a/reference/._zhbmvf.f b/reference/._zhbmvf.f new file mode 100644 index 0000000..5f4e163 Binary files /dev/null and b/reference/._zhbmvf.f differ diff --git a/reference/._zhemm3mf.f b/reference/._zhemm3mf.f new file mode 100644 index 0000000..0591600 Binary files /dev/null and b/reference/._zhemm3mf.f differ diff --git a/reference/._zhemmf.f b/reference/._zhemmf.f new file mode 100644 index 0000000..808e812 Binary files /dev/null and b/reference/._zhemmf.f differ diff --git a/reference/._zhemvf.f b/reference/._zhemvf.f new file mode 100644 index 0000000..af63faf Binary files /dev/null and b/reference/._zhemvf.f differ diff --git a/reference/._zher2f.f b/reference/._zher2f.f new file mode 100644 index 0000000..f139002 Binary files /dev/null and b/reference/._zher2f.f differ diff --git a/reference/._zher2kf.f b/reference/._zher2kf.f new file mode 100644 index 0000000..fd97384 Binary files /dev/null and b/reference/._zher2kf.f differ diff --git a/reference/._zherf.f b/reference/._zherf.f new file mode 100644 index 0000000..1b67156 Binary files /dev/null and b/reference/._zherf.f differ diff --git a/reference/._zherkf.f b/reference/._zherkf.f new file mode 100644 index 0000000..c9a5c4f Binary files /dev/null and b/reference/._zherkf.f differ diff --git a/reference/._zhpmvf.f b/reference/._zhpmvf.f new file mode 100644 index 0000000..33e8a49 Binary files /dev/null and b/reference/._zhpmvf.f differ diff --git a/reference/._zhpr2f.f b/reference/._zhpr2f.f new file mode 100644 index 0000000..f7f16c4 Binary files /dev/null and b/reference/._zhpr2f.f differ diff --git a/reference/._zhprf.f b/reference/._zhprf.f new file mode 100644 index 0000000..fc3efdd Binary files /dev/null and b/reference/._zhprf.f differ diff --git a/reference/._zlaswpf.f b/reference/._zlaswpf.f new file mode 100644 index 0000000..48cafe9 Binary files /dev/null and b/reference/._zlaswpf.f differ diff --git a/reference/._zlauu2f.f b/reference/._zlauu2f.f new file mode 100644 index 0000000..b964b76 Binary files /dev/null and b/reference/._zlauu2f.f differ diff --git a/reference/._zlauumf.f b/reference/._zlauumf.f new file mode 100644 index 0000000..5cc4444 Binary files /dev/null and b/reference/._zlauumf.f differ diff --git a/reference/._zpotf2f.f b/reference/._zpotf2f.f new file mode 100644 index 0000000..22bd20f Binary files /dev/null and b/reference/._zpotf2f.f differ diff --git a/reference/._zpotrff.f b/reference/._zpotrff.f new file mode 100644 index 0000000..5034d90 Binary files /dev/null and b/reference/._zpotrff.f differ diff --git a/reference/._zpotrif.f b/reference/._zpotrif.f new file mode 100644 index 0000000..00f846b Binary files /dev/null and b/reference/._zpotrif.f differ diff --git a/reference/._zrotgf.f b/reference/._zrotgf.f new file mode 100644 index 0000000..795af09 Binary files /dev/null and b/reference/._zrotgf.f differ diff --git a/reference/._zsbmvf.f b/reference/._zsbmvf.f new file mode 100644 index 0000000..291a0f8 Binary files /dev/null and b/reference/._zsbmvf.f differ diff --git a/reference/._zscalf.f b/reference/._zscalf.f new file mode 100644 index 0000000..a4fb34a Binary files /dev/null and b/reference/._zscalf.f differ diff --git a/reference/._zspmvf.f b/reference/._zspmvf.f new file mode 100644 index 0000000..fcfd014 Binary files /dev/null and b/reference/._zspmvf.f differ diff --git a/reference/._zspr2f.f b/reference/._zspr2f.f new file mode 100644 index 0000000..feecf0f Binary files /dev/null and b/reference/._zspr2f.f differ diff --git a/reference/._zsprf.f b/reference/._zsprf.f new file mode 100644 index 0000000..edd90fa Binary files /dev/null and b/reference/._zsprf.f differ diff --git a/reference/._zswapf.f b/reference/._zswapf.f new file mode 100644 index 0000000..41f9932 Binary files /dev/null and b/reference/._zswapf.f differ diff --git a/reference/._zsymm3mf.f b/reference/._zsymm3mf.f new file mode 100644 index 0000000..549971f Binary files /dev/null and b/reference/._zsymm3mf.f differ diff --git a/reference/._zsymmf.f b/reference/._zsymmf.f new file mode 100644 index 0000000..8d08f08 Binary files /dev/null and b/reference/._zsymmf.f differ diff --git a/reference/._zsymvf.f b/reference/._zsymvf.f new file mode 100644 index 0000000..37a83ce Binary files /dev/null and b/reference/._zsymvf.f differ diff --git a/reference/._zsyr2f.f b/reference/._zsyr2f.f new file mode 100644 index 0000000..8d92c77 Binary files /dev/null and b/reference/._zsyr2f.f differ diff --git a/reference/._zsyr2kf.f b/reference/._zsyr2kf.f new file mode 100644 index 0000000..a16c1e4 Binary files /dev/null and b/reference/._zsyr2kf.f differ diff --git a/reference/._zsyrf.f b/reference/._zsyrf.f new file mode 100644 index 0000000..5b23764 Binary files /dev/null and b/reference/._zsyrf.f differ diff --git a/reference/._zsyrkf.f b/reference/._zsyrkf.f new file mode 100644 index 0000000..0cfee91 Binary files /dev/null and b/reference/._zsyrkf.f differ diff --git a/reference/._ztbmvf.f b/reference/._ztbmvf.f new file mode 100644 index 0000000..768d548 Binary files /dev/null and b/reference/._ztbmvf.f differ diff --git a/reference/._ztbsvf.f b/reference/._ztbsvf.f new file mode 100644 index 0000000..2321543 Binary files /dev/null and b/reference/._ztbsvf.f differ diff --git a/reference/._ztpmvf.f b/reference/._ztpmvf.f new file mode 100644 index 0000000..95b4cff Binary files /dev/null and b/reference/._ztpmvf.f differ diff --git a/reference/._ztpsvf.f b/reference/._ztpsvf.f new file mode 100644 index 0000000..f3ce635 Binary files /dev/null and b/reference/._ztpsvf.f differ diff --git a/reference/._ztrmmf.f b/reference/._ztrmmf.f new file mode 100644 index 0000000..2be68b5 Binary files /dev/null and b/reference/._ztrmmf.f differ diff --git a/reference/._ztrmvf.f b/reference/._ztrmvf.f new file mode 100644 index 0000000..faa1fa8 Binary files /dev/null and b/reference/._ztrmvf.f differ diff --git a/reference/._ztrsmf.f b/reference/._ztrsmf.f new file mode 100644 index 0000000..6b41df9 Binary files /dev/null and b/reference/._ztrsmf.f differ diff --git a/reference/._ztrsvf.f b/reference/._ztrsvf.f new file mode 100644 index 0000000..4a4dd91 Binary files /dev/null and b/reference/._ztrsvf.f differ diff --git a/reference/._ztrti2f.f b/reference/._ztrti2f.f new file mode 100644 index 0000000..3fb64ca Binary files /dev/null and b/reference/._ztrti2f.f differ diff --git a/reference/._ztrtrif.f b/reference/._ztrtrif.f new file mode 100644 index 0000000..bc5b548 Binary files /dev/null and b/reference/._ztrtrif.f differ diff --git a/reference/LICENSE b/reference/LICENSE new file mode 100644 index 0000000..85061f2 --- /dev/null +++ b/reference/LICENSE @@ -0,0 +1,23 @@ +This directory contains the reference implementation of BLAS +which is obtainable at: http://netlib.org/blas/ + +The license, obtained from http://netlib.org/blas/faq.html#2 on November 3, +2010, is as follows: + +2) Are there legal restrictions on the use of BLAS reference implementation +software? + +The reference BLAS is a freely-available software package. It is available from +netlib via anonymous ftp and the World Wide Web. Thus, it can be included in +commercial software packages (and has been). We only ask that proper credit be +given to the authors. + +Like all software, it is copyrighted. It is not trademarked, but we do ask the +following: + +If you modify the source for these routines we ask that you change the name of +the routine and comment the changes made to the original. + +We will gladly answer any questions regarding the software. If a modification +is done, however, it is the responsibility of the person who modified the +routine to provide support. diff --git a/reference/Makefile b/reference/Makefile new file mode 100644 index 0000000..6cbde28 --- /dev/null +++ b/reference/Makefile @@ -0,0 +1,176 @@ +TOPDIR = .. +include $(TOPDIR)/Makefile.system + +ifeq ($(ARCH), x86) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), x86_64) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), ia64) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), MIPS) +SUPPORT_GEMM3M = 1 +endif + +SBLAS1OBJS = \ + saxpyf.$(SUFFIX) sswapf.$(SUFFIX) \ + scopyf.$(SUFFIX) sscalf.$(SUFFIX) \ + sdotf.$(SUFFIX) sdsdotf.$(SUFFIX) dsdotf.$(SUFFIX) \ + sasumf.$(SUFFIX) snrm2f.$(SUFFIX) \ + smaxf.$(SUFFIX) samaxf.$(SUFFIX) ismaxf.$(SUFFIX) isamaxf.$(SUFFIX) \ + sminf.$(SUFFIX) saminf.$(SUFFIX) isminf.$(SUFFIX) isaminf.$(SUFFIX) \ + srotf.$(SUFFIX) srotgf.$(SUFFIX) srotmf.$(SUFFIX) srotmgf.$(SUFFIX) \ + +SBLAS2OBJS = \ + sgemvf.$(SUFFIX) sgerf.$(SUFFIX) \ + strsvf.$(SUFFIX) strmvf.$(SUFFIX) ssymvf.$(SUFFIX) \ + ssyrf.$(SUFFIX) ssyr2f.$(SUFFIX) sgbmvf.$(SUFFIX) \ + ssbmvf.$(SUFFIX) sspmvf.$(SUFFIX) \ + ssprf.$(SUFFIX) sspr2f.$(SUFFIX) \ + stbsvf.$(SUFFIX) stbmvf.$(SUFFIX) \ + stpsvf.$(SUFFIX) stpmvf.$(SUFFIX) + +SBLAS3OBJS = \ + sgemmf.$(SUFFIX) ssymmf.$(SUFFIX) strmmf.$(SUFFIX) \ + strsmf.$(SUFFIX) ssyrkf.$(SUFFIX) ssyr2kf.$(SUFFIX) + +DBLAS1OBJS = \ + daxpyf.$(SUFFIX) dswapf.$(SUFFIX) \ + dcopyf.$(SUFFIX) dscalf.$(SUFFIX) \ + ddotf.$(SUFFIX) \ + dasumf.$(SUFFIX) dnrm2f.$(SUFFIX) \ + dmaxf.$(SUFFIX) damaxf.$(SUFFIX) idmaxf.$(SUFFIX) idamaxf.$(SUFFIX) \ + dminf.$(SUFFIX) daminf.$(SUFFIX) idminf.$(SUFFIX) idaminf.$(SUFFIX) \ + drotf.$(SUFFIX) drotgf.$(SUFFIX) drotmf.$(SUFFIX) drotmgf.$(SUFFIX) \ + +DBLAS2OBJS = \ + dgemvf.$(SUFFIX) dgerf.$(SUFFIX) \ + dtrsvf.$(SUFFIX) dtrmvf.$(SUFFIX) dsymvf.$(SUFFIX) \ + dsyrf.$(SUFFIX) dsyr2f.$(SUFFIX) dgbmvf.$(SUFFIX) \ + dsbmvf.$(SUFFIX) dspmvf.$(SUFFIX) \ + dsprf.$(SUFFIX) dspr2f.$(SUFFIX) \ + dtbsvf.$(SUFFIX) dtbmvf.$(SUFFIX) \ + dtpsvf.$(SUFFIX) dtpmvf.$(SUFFIX) + +DBLAS3OBJS = \ + dgemmf.$(SUFFIX) dsymmf.$(SUFFIX) dtrmmf.$(SUFFIX) \ + dtrsmf.$(SUFFIX) dsyrkf.$(SUFFIX) dsyr2kf.$(SUFFIX) + +CBLAS1OBJS = \ + caxpyf.$(SUFFIX) caxpycf.$(SUFFIX) cswapf.$(SUFFIX) \ + ccopyf.$(SUFFIX) cscalf.$(SUFFIX) csscalf.$(SUFFIX) \ + cdotcf.$(SUFFIX) cdotuf.$(SUFFIX) \ + scasumf.$(SUFFIX) scnrm2f.$(SUFFIX) \ + scamaxf.$(SUFFIX) icamaxf.$(SUFFIX) \ + scaminf.$(SUFFIX) icaminf.$(SUFFIX) \ + csrotf.$(SUFFIX) crotgf.$(SUFFIX) \ + +CBLAS2OBJS = \ + cgemvf.$(SUFFIX) cgeruf.$(SUFFIX) cgercf.$(SUFFIX) \ + ctrsvf.$(SUFFIX) ctrmvf.$(SUFFIX) csymvf.$(SUFFIX) \ + csyrf.$(SUFFIX) csyr2f.$(SUFFIX) cgbmvf.$(SUFFIX) \ + csbmvf.$(SUFFIX) cspmvf.$(SUFFIX) \ + csprf.$(SUFFIX) cspr2f.$(SUFFIX) \ + ctbsvf.$(SUFFIX) ctbmvf.$(SUFFIX) \ + ctpsvf.$(SUFFIX) ctpmvf.$(SUFFIX) \ + chemvf.$(SUFFIX) chbmvf.$(SUFFIX) \ + cherf.$(SUFFIX) cher2f.$(SUFFIX) \ + chpmvf.$(SUFFIX) chprf.$(SUFFIX) chpr2f.$(SUFFIX) + +CBLAS3OBJS = \ + cgemmf.$(SUFFIX) csymmf.$(SUFFIX) ctrmmf.$(SUFFIX) \ + ctrsmf.$(SUFFIX) csyrkf.$(SUFFIX) csyr2kf.$(SUFFIX) \ + chemmf.$(SUFFIX) cherkf.$(SUFFIX) cher2kf.$(SUFFIX) + +ZBLAS1OBJS = \ + zaxpyf.$(SUFFIX) zaxpycf.$(SUFFIX) zswapf.$(SUFFIX) \ + zcopyf.$(SUFFIX) zscalf.$(SUFFIX) zdscalf.$(SUFFIX) \ + zdotcf.$(SUFFIX) zdotuf.$(SUFFIX) \ + dzasumf.$(SUFFIX) dznrm2f.$(SUFFIX) \ + dzamaxf.$(SUFFIX) izamaxf.$(SUFFIX) \ + dzaminf.$(SUFFIX) izaminf.$(SUFFIX) \ + zdrotf.$(SUFFIX) zrotgf.$(SUFFIX) \ + +ZBLAS2OBJS = \ + zgemvf.$(SUFFIX) zgeruf.$(SUFFIX) zgercf.$(SUFFIX) \ + ztrsvf.$(SUFFIX) ztrmvf.$(SUFFIX) zsymvf.$(SUFFIX) \ + zsyrf.$(SUFFIX) zsyr2f.$(SUFFIX) zgbmvf.$(SUFFIX) \ + zsbmvf.$(SUFFIX) zspmvf.$(SUFFIX) \ + zsprf.$(SUFFIX) zspr2f.$(SUFFIX) \ + ztbsvf.$(SUFFIX) ztbmvf.$(SUFFIX) \ + ztpsvf.$(SUFFIX) ztpmvf.$(SUFFIX) \ + zhemvf.$(SUFFIX) zhbmvf.$(SUFFIX) \ + zherf.$(SUFFIX) zher2f.$(SUFFIX) \ + zhpmvf.$(SUFFIX) zhprf.$(SUFFIX) zhpr2f.$(SUFFIX) + +ZBLAS3OBJS = \ + zgemmf.$(SUFFIX) zsymmf.$(SUFFIX) ztrmmf.$(SUFFIX) \ + ztrsmf.$(SUFFIX) zsyrkf.$(SUFFIX) zsyr2kf.$(SUFFIX) \ + zhemmf.$(SUFFIX) zherkf.$(SUFFIX) zher2kf.$(SUFFIX) + +ifdef SUPPORT_GEMM3M + +CBLAS3OBJS += cgemm3mf.$(SUFFIX) csymm3mf.$(SUFFIX) chemm3mf.$(SUFFIX) + +ZBLAS3OBJS += zgemm3mf.$(SUFFIX) zsymm3mf.$(SUFFIX) zhemm3mf.$(SUFFIX) + +endif + +SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) +DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS) +QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS) +CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) +ZBLASOBJS = $(ZBLAS1OBJS) $(ZBLAS2OBJS) $(ZBLAS3OBJS) +XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS) + +SBLASOBJS += \ + sgetf2f.$(SUFFIX) sgetrff.$(SUFFIX) slauu2f.$(SUFFIX) slauumf.$(SUFFIX) \ + spotf2f.$(SUFFIX) spotrff.$(SUFFIX) strti2f.$(SUFFIX) strtrif.$(SUFFIX) \ + slaswpf.$(SUFFIX) sgetrsf.$(SUFFIX) sgesvf.$(SUFFIX) spotrif.$(SUFFIX) \ + +DBLASOBJS += \ + dgetf2f.$(SUFFIX) dgetrff.$(SUFFIX) dlauu2f.$(SUFFIX) dlauumf.$(SUFFIX) \ + dpotf2f.$(SUFFIX) dpotrff.$(SUFFIX) dtrti2f.$(SUFFIX) dtrtrif.$(SUFFIX) \ + dlaswpf.$(SUFFIX) dgetrsf.$(SUFFIX) dgesvf.$(SUFFIX) dpotrif.$(SUFFIX) \ + +QBLASOBJS += \ + qgetf2f.$(SUFFIX) qgetrff.$(SUFFIX) qlauu2f.$(SUFFIX) qlauumf.$(SUFFIX) \ + qpotf2f.$(SUFFIX) qpotrff.$(SUFFIX) qtrti2f.$(SUFFIX) qtrtrif.$(SUFFIX) \ + qlaswpf.$(SUFFIX) qgetrsf.$(SUFFIX) qgesvf.$(SUFFIX) qpotrif.$(SUFFIX) \ + +CBLASOBJS += \ + cgetf2f.$(SUFFIX) cgetrff.$(SUFFIX) clauu2f.$(SUFFIX) clauumf.$(SUFFIX) \ + cpotf2f.$(SUFFIX) cpotrff.$(SUFFIX) ctrti2f.$(SUFFIX) ctrtrif.$(SUFFIX) \ + claswpf.$(SUFFIX) cgetrsf.$(SUFFIX) cgesvf.$(SUFFIX) cpotrif.$(SUFFIX) \ + +ZBLASOBJS += \ + zgetf2f.$(SUFFIX) zgetrff.$(SUFFIX) zlauu2f.$(SUFFIX) zlauumf.$(SUFFIX) \ + zpotf2f.$(SUFFIX) zpotrff.$(SUFFIX) ztrti2f.$(SUFFIX) ztrtrif.$(SUFFIX) \ + zlaswpf.$(SUFFIX) zgetrsf.$(SUFFIX) zgesvf.$(SUFFIX) zpotrif.$(SUFFIX) \ + +XBLASOBJS += \ + xgetf2f.$(SUFFIX) xgetrff.$(SUFFIX) xlauu2f.$(SUFFIX) xlauumf.$(SUFFIX) \ + xpotf2f.$(SUFFIX) xpotrff.$(SUFFIX) xtrti2f.$(SUFFIX) xtrtrif.$(SUFFIX) \ + xlaswpf.$(SUFFIX) xgetrsf.$(SUFFIX) xgesvf.$(SUFFIX) xpotrif.$(SUFFIX) \ + + +include $(TOPDIR)/Makefile.tail + +all :: libs + +clean :: + +level1 : $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ + +level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ + +level3 : $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ + diff --git a/reference/caxpycf.f b/reference/caxpycf.f new file mode 100644 index 0000000..092c8c1 --- /dev/null +++ b/reference/caxpycf.f @@ -0,0 +1,35 @@ + subroutine caxpycf(n,ca,cx,incx,cy,incy) +c +c constant times a vector plus a vector. +c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + complex cx(*),cy(*),ca + integer i,incx,incy,ix,iy,n + INTRINSIC conjg +c + if(n.le.0)return + if (abs(real(ca)) + abs(aimag(ca)) .eq. 0.0 ) return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + cy(iy) = cy(iy) + ca*conjg(cx(ix)) + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c + 20 do 30 i = 1,n + cy(i) = cy(i) + ca*conjg(cx(i)) + 30 continue + return + end diff --git a/reference/caxpyf.f b/reference/caxpyf.f new file mode 100644 index 0000000..554f71d --- /dev/null +++ b/reference/caxpyf.f @@ -0,0 +1,34 @@ + subroutine caxpyf(n,ca,cx,incx,cy,incy) +c +c constant times a vector plus a vector. +c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + complex cx(*),cy(*),ca + integer i,incx,incy,ix,iy,n +c + if(n.le.0)return + if (abs(real(ca)) + abs(aimag(ca)) .eq. 0.0 ) return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + cy(iy) = cy(iy) + ca*cx(ix) + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c + 20 do 30 i = 1,n + cy(i) = cy(i) + ca*cx(i) + 30 continue + return + end diff --git a/reference/ccopyf.f b/reference/ccopyf.f new file mode 100644 index 0000000..2a33255 --- /dev/null +++ b/reference/ccopyf.f @@ -0,0 +1,33 @@ + subroutine ccopyf(n,cx,incx,cy,incy) +c +c copies a vector, x, to a vector, y. +c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + complex cx(*),cy(*) + integer i,incx,incy,ix,iy,n +c + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + cy(iy) = cx(ix) + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c + 20 do 30 i = 1,n + cy(i) = cx(i) + 30 continue + return + end diff --git a/reference/cdotcf.f b/reference/cdotcf.f new file mode 100644 index 0000000..79aa39c --- /dev/null +++ b/reference/cdotcf.f @@ -0,0 +1,38 @@ + complex function cdotcf(n,cx,incx,cy,incy) +c +c forms the dot product of two vectors, conjugating the first +c vector. +c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + complex cx(*),cy(*),ctemp + integer i,incx,incy,ix,iy,n +c + ctemp = (0.0,0.0) + cdotcf = (0.0,0.0) + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + ctemp = ctemp + conjg(cx(ix))*cy(iy) + ix = ix + incx + iy = iy + incy + 10 continue + cdotcf = ctemp + return +c +c code for both increments equal to 1 +c + 20 do 30 i = 1,n + ctemp = ctemp + conjg(cx(i))*cy(i) + 30 continue + cdotcf = ctemp + return + end diff --git a/reference/cdotuf.f b/reference/cdotuf.f new file mode 100644 index 0000000..bf93390 --- /dev/null +++ b/reference/cdotuf.f @@ -0,0 +1,37 @@ + complex function cdotuf(n,cx,incx,cy,incy) +c +c forms the dot product of two vectors. +c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + complex cx(*),cy(*),ctemp + integer i,incx,incy,ix,iy,n +c + ctemp = (0.0,0.0) + cdotuf = (0.0,0.0) + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + ctemp = ctemp + cx(ix)*cy(iy) + ix = ix + incx + iy = iy + incy + 10 continue + cdotuf = ctemp + return +c +c code for both increments equal to 1 +c + 20 do 30 i = 1,n + ctemp = ctemp + cx(i)*cy(i) + 30 continue + cdotuf = ctemp + return + end diff --git a/reference/cgbmvf.f b/reference/cgbmvf.f new file mode 100644 index 0000000..27ce62c --- /dev/null +++ b/reference/cgbmvf.f @@ -0,0 +1,450 @@ + SUBROUTINE CGBMVF( TRANS, M, N, KL, KU, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + COMPLEX ALPHA, BETA + INTEGER INCX, INCY, KL, KU, LDA, M, N + CHARACTER*1 TRANS +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* ZGBMV performs one of the matrix-vector operations +* +* y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, or +* +* y := alpha*conjg( A' )*x + beta*y, +* +* where alpha and beta are scalars, x and y are vectors and A is an +* m by n band matrix, with kl sub-diagonals and ku super-diagonals. +* +* Parameters +* ========== +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' y := alpha*A*x + beta*y. +* +* TRANS = 'T' or 't' y := alpha*A'*x + beta*y. +* +* TRANS = 'C' or 'c' y := alpha*conjg( A' )*x + beta*y. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* KL - INTEGER. +* On entry, KL specifies the number of sub-diagonals of the +* matrix A. KL must satisfy 0 .le. KL. +* Unchanged on exit. +* +* KU - INTEGER. +* On entry, KU specifies the number of super-diagonals of the +* matrix A. KU must satisfy 0 .le. KU. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, n ). +* Before entry, the leading ( kl + ku + 1 ) by n part of the +* array A must contain the matrix of coefficients, supplied +* column by column, with the leading diagonal of the matrix in +* row ( ku + 1 ) of the array, the first super-diagonal +* starting at position 2 in row ku, the first sub-diagonal +* starting at position 1 in row ( ku + 2 ), and so on. +* Elements in the array A that do not correspond to elements +* in the band matrix (such as the top left ku by ku triangle) +* are not referenced. +* The following program segment will transfer a band matrix +* from conventional full matrix storage to band storage: +* +* DO 20, J = 1, N +* K = KU + 1 - J +* DO 10, I = MAX( 1, J - KU ), MIN( M, J + KL ) +* A( K + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( kl + ku + 1 ). +* Unchanged on exit. +* +* X - COMPLEX*16 array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - COMPLEX*16 . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - COMPLEX*16 array of DIMENSION at least +* ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. +* Before entry, the incremented array Y must contain the +* vector y. On exit, Y is overwritten by the updated vector y. +* +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP + INTEGER I, INFO, IX, IY, J, JX, JY, K, KUP1, KX, KY, + $ LENX, LENY + LOGICAL NOCONJ, NOTRANS, XCONJ +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'R' ).AND. + $ .NOT.LSAME( TRANS, 'C' ).AND. + $ .NOT.LSAME( TRANS, 'O' ).AND. + $ .NOT.LSAME( TRANS, 'U' ).AND. + $ .NOT.LSAME( TRANS, 'S' ).AND. + $ .NOT.LSAME( TRANS, 'D' ) )THEN + INFO = 1 + ELSE IF( M.LT.0 )THEN + INFO = 2 + ELSE IF( N.LT.0 )THEN + INFO = 3 + ELSE IF( KL.LT.0 )THEN + INFO = 4 + ELSE IF( KU.LT.0 )THEN + INFO = 5 + ELSE IF( LDA.LT.( KL + KU + 1 ) )THEN + INFO = 8 + ELSE IF( INCX.EQ.0 )THEN + INFO = 10 + ELSE IF( INCY.EQ.0 )THEN + INFO = 13 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZGBMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* + NOCONJ = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + $ .OR. LSAME( TRANS, 'O' ) .OR. LSAME( TRANS, 'U' )) + + NOTRANS = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'R' ) + $ .OR. LSAME( TRANS, 'O' ) .OR. LSAME( TRANS, 'S' )) + + XCONJ = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + $ .OR. LSAME( TRANS, 'R' ) .OR. LSAME( TRANS, 'C' )) +* +* Set LENX and LENY, the lengths of the vectors x and y, and set +* up the start points in X and Y. +* + IF(NOTRANS)THEN + LENX = N + LENY = M + ELSE + LENX = M + LENY = N + END IF + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( LENX - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( LENY - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the band part of A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, LENY + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, LENY + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, LENY + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, LENY + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + + KUP1 = KU + 1 + + IF(XCONJ)THEN + + IF(NOTRANS)THEN +* +* Form y := alpha*A*x + y. +* + JX = KX + IF( INCY.EQ.1 )THEN + DO 60, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + K = KUP1 - J + IF( NOCONJ )THEN + DO 50, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( I ) = Y( I ) + TEMP*A( K + I, J ) + 50 CONTINUE + ELSE + DO 55, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( I ) = Y( I ) + TEMP*CONJG(A( K + I, J )) + 55 CONTINUE + END IF + + END IF + JX = JX + INCX + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + IY = KY + K = KUP1 - J + IF( NOCONJ )THEN + DO 70, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( IY ) = Y( IY ) + TEMP*A( K + I, J ) + IY = IY + INCY + 70 CONTINUE + ELSE + DO 75, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( IY ) = Y( IY ) + TEMP*CONJG(A( K + I, J )) + IY = IY + INCY + 75 CONTINUE + END IF + + END IF + JX = JX + INCX + IF( J.GT.KU ) + $ KY = KY + INCY + 80 CONTINUE + END IF + ELSE +* +* Form y := alpha*A'*x + y or y := alpha*conjg( A' )*x + y. +* + JY = KY + IF( INCX.EQ.1 )THEN + DO 110, J = 1, N + TEMP = ZERO + K = KUP1 - J + IF( NOCONJ )THEN + DO 90, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + A( K + I, J )*X( I ) + 90 CONTINUE + ELSE + DO 100, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + CONJG( A( K + I, J ) )*X( I ) + 100 CONTINUE + END IF + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + 110 CONTINUE + ELSE + DO 140, J = 1, N + TEMP = ZERO + IX = KX + K = KUP1 - J + IF( NOCONJ )THEN + DO 120, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + A( K + I, J )*X( IX ) + IX = IX + INCX + 120 CONTINUE + ELSE + DO 130, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + CONJG( A( K + I, J ) )*X( IX ) + IX = IX + INCX + 130 CONTINUE + END IF + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + IF( J.GT.KU ) + $ KX = KX + INCX + 140 CONTINUE + END IF + END IF + + ELSE + + IF(NOTRANS)THEN +* +* Form y := alpha*A*x + y. +* + JX = KX + IF( INCY.EQ.1 )THEN + DO 160, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*CONJG(X( JX )) + K = KUP1 - J + IF( NOCONJ )THEN + DO 150, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( I ) = Y( I ) + TEMP*A( K + I, J ) + 150 CONTINUE + ELSE + DO 155, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( I ) = Y( I ) + TEMP*CONJG(A( K + I, J )) + 155 CONTINUE + END IF + + END IF + JX = JX + INCX + 160 CONTINUE + ELSE + DO 180, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*CONJG(X( JX )) + IY = KY + K = KUP1 - J + IF( NOCONJ )THEN + DO 170, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( IY ) = Y( IY ) + TEMP*A( K + I, J ) + IY = IY + INCY + 170 CONTINUE + ELSE + DO 175, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( IY ) = Y( IY ) + TEMP*CONJG(A( K + I, J )) + IY = IY + INCY + 175 CONTINUE + END IF + + END IF + JX = JX + INCX + IF( J.GT.KU ) + $ KY = KY + INCY + 180 CONTINUE + END IF + ELSE +* +* Form y := alpha*A'*x + y or y := alpha*conjg( A' )*x + y. +* + JY = KY + IF( INCX.EQ.1 )THEN + DO 210, J = 1, N + TEMP = ZERO + K = KUP1 - J + IF( NOCONJ )THEN + DO 190, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + A( K + I, J )*CONJG(X( I )) + 190 CONTINUE + ELSE + DO 200, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + CONJG( A( K + I, J ) )*CONJG(X( I )) + 200 CONTINUE + END IF + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + 210 CONTINUE + ELSE + DO 240, J = 1, N + TEMP = ZERO + IX = KX + K = KUP1 - J + IF( NOCONJ )THEN + DO 220, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + A( K + I, J )*CONJG(X( IX )) + IX = IX + INCX + 220 CONTINUE + ELSE + DO 230, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + CONJG( A( K + I, J ) )*CONJG(X(IX )) + IX = IX + INCX + 230 CONTINUE + END IF + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + IF( J.GT.KU ) + $ KX = KX + INCX + 240 CONTINUE + END IF + END IF + + END IF + +* + RETURN +* +* End of ZGBMV . +* + END diff --git a/reference/cgemm3mf.f b/reference/cgemm3mf.f new file mode 100644 index 0000000..a144aa2 --- /dev/null +++ b/reference/cgemm3mf.f @@ -0,0 +1,414 @@ + SUBROUTINE CGEMM3MF(TRA,TRB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) +* .. Scalar Arguments .. + COMPLEX ALPHA,BETA + INTEGER K,LDA,LDB,LDC,M,N + CHARACTER TRA,TRB +* .. +* .. Array Arguments .. + COMPLEX A(LDA,*),B(LDB,*),C(LDC,*) +* .. +* +* Purpose +* ======= +* +* CGEMM performs one of the matrix-matrix operations +* +* C := alpha*op( A )*op( B ) + beta*C, +* +* where op( X ) is one of +* +* op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ), +* +* alpha and beta are scalars, and A, B and C are matrices, with op( A ) +* an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. +* +* Arguments +* ========== +* +* TRA - CHARACTER*1. +* On entry, TRA specifies the form of op( A ) to be used in +* the matrix multiplication as follows: +* +* TRA = 'N' or 'n', op( A ) = A. +* +* TRA = 'T' or 't', op( A ) = A'. +* +* TRA = 'C' or 'c', op( A ) = conjg( A' ). +* +* Unchanged on exit. +* +* TRB - CHARACTER*1. +* On entry, TRB specifies the form of op( B ) to be used in +* the matrix multiplication as follows: +* +* TRB = 'N' or 'n', op( B ) = B. +* +* TRB = 'T' or 't', op( B ) = B'. +* +* TRB = 'C' or 'c', op( B ) = conjg( B' ). +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix +* op( A ) and of the matrix C. M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix +* op( B ) and the number of columns of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry, K specifies the number of columns of the matrix +* op( A ) and the number of rows of the matrix op( B ). K must +* be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is +* k when TRA = 'N' or 'n', and is m otherwise. +* Before entry with TRA = 'N' or 'n', the leading m by k +* part of the array A must contain the matrix A, otherwise +* the leading k by m part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRA = 'N' or 'n' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, k ). +* Unchanged on exit. +* +* B - COMPLEX array of DIMENSION ( LDB, kb ), where kb is +* n when TRB = 'N' or 'n', and is k otherwise. +* Before entry with TRB = 'N' or 'n', the leading k by n +* part of the array B must contain the matrix B, otherwise +* the leading n by k part of the array B must contain the +* matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. When TRB = 'N' or 'n' then +* LDB must be at least max( 1, k ), otherwise LDB must be at +* least max( 1, n ). +* Unchanged on exit. +* +* BETA - COMPLEX . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - COMPLEX array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n matrix +* ( alpha*op( A )*op( B ) + beta*C ). +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CONJG,MAX +* .. +* .. Local Scalars .. + COMPLEX TEMP + INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB + LOGICAL CONJA,CONJB,NOTA,NOTB +* .. +* .. Parameters .. + COMPLEX ONE + PARAMETER (ONE= (1.0E+0,0.0E+0)) + COMPLEX ZERO + PARAMETER (ZERO= (0.0E+0,0.0E+0)) +* .. +* +* Set NOTA and NOTB as true if A and B respectively are not +* conjugated or transposed, set CONJA and CONJB as true if A and +* B respectively are to be transposed but not conjugated and set +* NROWA, NCOLA and NROWB as the number of rows and columns of A +* and the number of rows of B respectively. +* + NOTA = LSAME(TRA,'N') + NOTB = LSAME(TRB,'N') + CONJA = LSAME(TRA,'C') + CONJB = LSAME(TRB,'C') + IF (NOTA) THEN + NROWA = M + NCOLA = K + ELSE + NROWA = K + NCOLA = M + END IF + IF (NOTB) THEN + NROWB = K + ELSE + NROWB = N + END IF +* +* Test the input parameters. +* + INFO = 0 + IF ((.NOT.NOTA) .AND. (.NOT.CONJA) .AND. + + (.NOT.LSAME(TRA,'T'))) THEN + INFO = 1 + ELSE IF ((.NOT.NOTB) .AND. (.NOT.CONJB) .AND. + + (.NOT.LSAME(TRB,'T'))) THEN + INFO = 2 + ELSE IF (M.LT.0) THEN + INFO = 3 + ELSE IF (N.LT.0) THEN + INFO = 4 + ELSE IF (K.LT.0) THEN + INFO = 5 + ELSE IF (LDA.LT.MAX(1,NROWA)) THEN + INFO = 8 + ELSE IF (LDB.LT.MAX(1,NROWB)) THEN + INFO = 10 + ELSE IF (LDC.LT.MAX(1,M)) THEN + INFO = 13 + END IF + IF (INFO.NE.0) THEN + CALL XERBLA('CGEMM ',INFO) + RETURN + END IF +* +* Quick return if possible. +* + IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + + (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN +* +* And when alpha.eq.zero. +* + IF (ALPHA.EQ.ZERO) THEN + IF (BETA.EQ.ZERO) THEN + DO 20 J = 1,N + DO 10 I = 1,M + C(I,J) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40 J = 1,N + DO 30 I = 1,M + C(I,J) = BETA*C(I,J) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF (NOTB) THEN + IF (NOTA) THEN +* +* Form C := alpha*A*B + beta*C. +* + DO 90 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 50 I = 1,M + C(I,J) = ZERO + 50 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 60 I = 1,M + C(I,J) = BETA*C(I,J) + 60 CONTINUE + END IF + DO 80 L = 1,K + IF (B(L,J).NE.ZERO) THEN + TEMP = ALPHA*B(L,J) + DO 70 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 70 CONTINUE + END IF + 80 CONTINUE + 90 CONTINUE + ELSE IF (CONJA) THEN +* +* Form C := alpha*conjg( A' )*B + beta*C. +* + DO 120 J = 1,N + DO 110 I = 1,M + TEMP = ZERO + DO 100 L = 1,K + TEMP = TEMP + CONJG(A(L,I))*B(L,J) + 100 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 110 CONTINUE + 120 CONTINUE + ELSE +* +* Form C := alpha*A'*B + beta*C +* + DO 150 J = 1,N + DO 140 I = 1,M + TEMP = ZERO + DO 130 L = 1,K + TEMP = TEMP + A(L,I)*B(L,J) + 130 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 140 CONTINUE + 150 CONTINUE + END IF + ELSE IF (NOTA) THEN + IF (CONJB) THEN +* +* Form C := alpha*A*conjg( B' ) + beta*C. +* + DO 200 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 160 I = 1,M + C(I,J) = ZERO + 160 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 170 I = 1,M + C(I,J) = BETA*C(I,J) + 170 CONTINUE + END IF + DO 190 L = 1,K + IF (B(J,L).NE.ZERO) THEN + TEMP = ALPHA*CONJG(B(J,L)) + DO 180 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 180 CONTINUE + END IF + 190 CONTINUE + 200 CONTINUE + ELSE +* +* Form C := alpha*A*B' + beta*C +* + DO 250 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 210 I = 1,M + C(I,J) = ZERO + 210 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 220 I = 1,M + C(I,J) = BETA*C(I,J) + 220 CONTINUE + END IF + DO 240 L = 1,K + IF (B(J,L).NE.ZERO) THEN + TEMP = ALPHA*B(J,L) + DO 230 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 230 CONTINUE + END IF + 240 CONTINUE + 250 CONTINUE + END IF + ELSE IF (CONJA) THEN + IF (CONJB) THEN +* +* Form C := alpha*conjg( A' )*conjg( B' ) + beta*C. +* + DO 280 J = 1,N + DO 270 I = 1,M + TEMP = ZERO + DO 260 L = 1,K + TEMP = TEMP + CONJG(A(L,I))*CONJG(B(J,L)) + 260 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 270 CONTINUE + 280 CONTINUE + ELSE +* +* Form C := alpha*conjg( A' )*B' + beta*C +* + DO 310 J = 1,N + DO 300 I = 1,M + TEMP = ZERO + DO 290 L = 1,K + TEMP = TEMP + CONJG(A(L,I))*B(J,L) + 290 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 300 CONTINUE + 310 CONTINUE + END IF + ELSE + IF (CONJB) THEN +* +* Form C := alpha*A'*conjg( B' ) + beta*C +* + DO 340 J = 1,N + DO 330 I = 1,M + TEMP = ZERO + DO 320 L = 1,K + TEMP = TEMP + A(L,I)*CONJG(B(J,L)) + 320 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 330 CONTINUE + 340 CONTINUE + ELSE +* +* Form C := alpha*A'*B' + beta*C +* + DO 370 J = 1,N + DO 360 I = 1,M + TEMP = ZERO + DO 350 L = 1,K + TEMP = TEMP + A(L,I)*B(J,L) + 350 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 360 CONTINUE + 370 CONTINUE + END IF + END IF +* + RETURN +* +* End of CGEMM . +* + END diff --git a/reference/cgemmf.f b/reference/cgemmf.f new file mode 100644 index 0000000..d554fd3 --- /dev/null +++ b/reference/cgemmf.f @@ -0,0 +1,414 @@ + SUBROUTINE CGEMMF(TRANA,TRANB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) +* .. Scalar Arguments .. + COMPLEX ALPHA,BETA + INTEGER K,LDA,LDB,LDC,M,N + CHARACTER TRANA,TRANB +* .. +* .. Array Arguments .. + COMPLEX A(LDA,*),B(LDB,*),C(LDC,*) +* .. +* +* Purpose +* ======= +* +* CGEMM performs one of the matrix-matrix operations +* +* C := alpha*op( A )*op( B ) + beta*C, +* +* where op( X ) is one of +* +* op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ), +* +* alpha and beta are scalars, and A, B and C are matrices, with op( A ) +* an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. +* +* Arguments +* ========== +* +* TRANA - CHARACTER*1. +* On entry, TRANA specifies the form of op( A ) to be used in +* the matrix multiplication as follows: +* +* TRANA = 'N' or 'n', op( A ) = A. +* +* TRANA = 'T' or 't', op( A ) = A'. +* +* TRANA = 'C' or 'c', op( A ) = conjg( A' ). +* +* Unchanged on exit. +* +* TRANB - CHARACTER*1. +* On entry, TRANB specifies the form of op( B ) to be used in +* the matrix multiplication as follows: +* +* TRANB = 'N' or 'n', op( B ) = B. +* +* TRANB = 'T' or 't', op( B ) = B'. +* +* TRANB = 'C' or 'c', op( B ) = conjg( B' ). +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix +* op( A ) and of the matrix C. M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix +* op( B ) and the number of columns of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry, K specifies the number of columns of the matrix +* op( A ) and the number of rows of the matrix op( B ). K must +* be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is +* k when TRANA = 'N' or 'n', and is m otherwise. +* Before entry with TRANA = 'N' or 'n', the leading m by k +* part of the array A must contain the matrix A, otherwise +* the leading k by m part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANA = 'N' or 'n' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, k ). +* Unchanged on exit. +* +* B - COMPLEX array of DIMENSION ( LDB, kb ), where kb is +* n when TRANB = 'N' or 'n', and is k otherwise. +* Before entry with TRANB = 'N' or 'n', the leading k by n +* part of the array B must contain the matrix B, otherwise +* the leading n by k part of the array B must contain the +* matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. When TRANB = 'N' or 'n' then +* LDB must be at least max( 1, k ), otherwise LDB must be at +* least max( 1, n ). +* Unchanged on exit. +* +* BETA - COMPLEX . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - COMPLEX array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n matrix +* ( alpha*op( A )*op( B ) + beta*C ). +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CONJG,MAX +* .. +* .. Local Scalars .. + COMPLEX TEMP + INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB + LOGICAL CONJA,CONJB,NOTA,NOTB +* .. +* .. Parameters .. + COMPLEX ONE + PARAMETER (ONE= (1.0E+0,0.0E+0)) + COMPLEX ZERO + PARAMETER (ZERO= (0.0E+0,0.0E+0)) +* .. +* +* Set NOTA and NOTB as true if A and B respectively are not +* conjugated or transposed, set CONJA and CONJB as true if A and +* B respectively are to be transposed but not conjugated and set +* NROWA, NCOLA and NROWB as the number of rows and columns of A +* and the number of rows of B respectively. +* + NOTA = LSAME(TRANA,'N') + NOTB = LSAME(TRANB,'N') + CONJA = LSAME(TRANA,'C') + CONJB = LSAME(TRANB,'C') + IF (NOTA) THEN + NROWA = M + NCOLA = K + ELSE + NROWA = K + NCOLA = M + END IF + IF (NOTB) THEN + NROWB = K + ELSE + NROWB = N + END IF +* +* Test the input parameters. +* + INFO = 0 + IF ((.NOT.NOTA) .AND. (.NOT.CONJA) .AND. + + (.NOT.LSAME(TRANA,'T'))) THEN + INFO = 1 + ELSE IF ((.NOT.NOTB) .AND. (.NOT.CONJB) .AND. + + (.NOT.LSAME(TRANB,'T'))) THEN + INFO = 2 + ELSE IF (M.LT.0) THEN + INFO = 3 + ELSE IF (N.LT.0) THEN + INFO = 4 + ELSE IF (K.LT.0) THEN + INFO = 5 + ELSE IF (LDA.LT.MAX(1,NROWA)) THEN + INFO = 8 + ELSE IF (LDB.LT.MAX(1,NROWB)) THEN + INFO = 10 + ELSE IF (LDC.LT.MAX(1,M)) THEN + INFO = 13 + END IF + IF (INFO.NE.0) THEN + CALL XERBLA('CGEMM ',INFO) + RETURN + END IF +* +* Quick return if possible. +* + IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + + (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN +* +* And when alpha.eq.zero. +* + IF (ALPHA.EQ.ZERO) THEN + IF (BETA.EQ.ZERO) THEN + DO 20 J = 1,N + DO 10 I = 1,M + C(I,J) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40 J = 1,N + DO 30 I = 1,M + C(I,J) = BETA*C(I,J) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF (NOTB) THEN + IF (NOTA) THEN +* +* Form C := alpha*A*B + beta*C. +* + DO 90 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 50 I = 1,M + C(I,J) = ZERO + 50 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 60 I = 1,M + C(I,J) = BETA*C(I,J) + 60 CONTINUE + END IF + DO 80 L = 1,K + IF (B(L,J).NE.ZERO) THEN + TEMP = ALPHA*B(L,J) + DO 70 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 70 CONTINUE + END IF + 80 CONTINUE + 90 CONTINUE + ELSE IF (CONJA) THEN +* +* Form C := alpha*conjg( A' )*B + beta*C. +* + DO 120 J = 1,N + DO 110 I = 1,M + TEMP = ZERO + DO 100 L = 1,K + TEMP = TEMP + CONJG(A(L,I))*B(L,J) + 100 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 110 CONTINUE + 120 CONTINUE + ELSE +* +* Form C := alpha*A'*B + beta*C +* + DO 150 J = 1,N + DO 140 I = 1,M + TEMP = ZERO + DO 130 L = 1,K + TEMP = TEMP + A(L,I)*B(L,J) + 130 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 140 CONTINUE + 150 CONTINUE + END IF + ELSE IF (NOTA) THEN + IF (CONJB) THEN +* +* Form C := alpha*A*conjg( B' ) + beta*C. +* + DO 200 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 160 I = 1,M + C(I,J) = ZERO + 160 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 170 I = 1,M + C(I,J) = BETA*C(I,J) + 170 CONTINUE + END IF + DO 190 L = 1,K + IF (B(J,L).NE.ZERO) THEN + TEMP = ALPHA*CONJG(B(J,L)) + DO 180 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 180 CONTINUE + END IF + 190 CONTINUE + 200 CONTINUE + ELSE +* +* Form C := alpha*A*B' + beta*C +* + DO 250 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 210 I = 1,M + C(I,J) = ZERO + 210 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 220 I = 1,M + C(I,J) = BETA*C(I,J) + 220 CONTINUE + END IF + DO 240 L = 1,K + IF (B(J,L).NE.ZERO) THEN + TEMP = ALPHA*B(J,L) + DO 230 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 230 CONTINUE + END IF + 240 CONTINUE + 250 CONTINUE + END IF + ELSE IF (CONJA) THEN + IF (CONJB) THEN +* +* Form C := alpha*conjg( A' )*conjg( B' ) + beta*C. +* + DO 280 J = 1,N + DO 270 I = 1,M + TEMP = ZERO + DO 260 L = 1,K + TEMP = TEMP + CONJG(A(L,I))*CONJG(B(J,L)) + 260 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 270 CONTINUE + 280 CONTINUE + ELSE +* +* Form C := alpha*conjg( A' )*B' + beta*C +* + DO 310 J = 1,N + DO 300 I = 1,M + TEMP = ZERO + DO 290 L = 1,K + TEMP = TEMP + CONJG(A(L,I))*B(J,L) + 290 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 300 CONTINUE + 310 CONTINUE + END IF + ELSE + IF (CONJB) THEN +* +* Form C := alpha*A'*conjg( B' ) + beta*C +* + DO 340 J = 1,N + DO 330 I = 1,M + TEMP = ZERO + DO 320 L = 1,K + TEMP = TEMP + A(L,I)*CONJG(B(J,L)) + 320 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 330 CONTINUE + 340 CONTINUE + ELSE +* +* Form C := alpha*A'*B' + beta*C +* + DO 370 J = 1,N + DO 360 I = 1,M + TEMP = ZERO + DO 350 L = 1,K + TEMP = TEMP + A(L,I)*B(J,L) + 350 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 360 CONTINUE + 370 CONTINUE + END IF + END IF +* + RETURN +* +* End of CGEMM . +* + END diff --git a/reference/cgemvf.f b/reference/cgemvf.f new file mode 100644 index 0000000..d3a1d9e --- /dev/null +++ b/reference/cgemvf.f @@ -0,0 +1,332 @@ + SUBROUTINE CGEMVF ( TRANS, M, N, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + COMPLEX ALPHA, BETA + INTEGER INCX, INCY, LDA, M, N + CHARACTER*1 TRANS +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* CGEMV performs one of the matrix-vector operations +* +* y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, or +* +* y := alpha*conjg( A' )*x + beta*y, +* +* where alpha and beta are scalars, x and y are vectors and A is an +* m by n matrix. +* +* Parameters +* ========== +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' y := alpha*A*x + beta*y. +* +* TRANS = 'T' or 't' y := alpha*A'*x + beta*y. +* +* TRANS = 'C' or 'c' y := alpha*conjg( A' )*x + beta*y. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, n ). +* Before entry, the leading m by n part of the array A must +* contain the matrix of coefficients. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, m ). +* Unchanged on exit. +* +* X - COMPLEX array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - COMPLEX . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - COMPLEX array of DIMENSION at least +* ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. +* Before entry with BETA non-zero, the incremented array Y +* must contain the vector y. On exit, Y is overwritten by the +* updated vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY, LENX, LENY + LOGICAL NOCONJ, NOTRANS, XCONJ +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'R' ).AND. + $ .NOT.LSAME( TRANS, 'C' ).AND. + $ .NOT.LSAME( TRANS, 'O' ).AND. + $ .NOT.LSAME( TRANS, 'U' ).AND. + $ .NOT.LSAME( TRANS, 'S' ).AND. + $ .NOT.LSAME( TRANS, 'D' ) )THEN + INFO = 1 + ELSE IF( M.LT.0 )THEN + INFO = 2 + ELSE IF( N.LT.0 )THEN + INFO = 3 + ELSE IF( LDA.LT.MAX( 1, M ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + ELSE IF( INCY.EQ.0 )THEN + INFO = 11 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CGEMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* + NOCONJ = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + $ .OR. LSAME( TRANS, 'O' ) .OR. LSAME( TRANS, 'U' )) + + NOTRANS = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'R' ) + $ .OR. LSAME( TRANS, 'O' ) .OR. LSAME( TRANS, 'S' )) + + XCONJ = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + $ .OR. LSAME( TRANS, 'R' ) .OR. LSAME( TRANS, 'C' )) +* +* Set LENX and LENY, the lengths of the vectors x and y, and set +* up the start points in X and Y. +* + IF(NOTRANS)THEN + LENX = N + LENY = M + ELSE + LENX = M + LENY = N + END IF + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( LENX - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( LENY - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, LENY + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, LENY + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, LENY + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, LENY + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + + IF(NOTRANS)THEN +* +* Form y := alpha*A*x + y. +* + JX = KX + IF( INCY.EQ.1 )THEN + DO 60, J = 1, N + IF( X( JX ).NE.ZERO )THEN + IF (XCONJ) THEN + TEMP = ALPHA*X( JX ) + ELSE + TEMP = ALPHA*CONJG(X( JX )) + ENDIF + IF (NOCONJ) THEN + DO 50, I = 1, M + Y( I ) = Y( I ) + TEMP*A( I, J ) + 50 CONTINUE + ELSE + DO 55, I = 1, M + Y( I ) = Y( I ) + TEMP*CONJG(A( I, J )) + 55 CONTINUE + ENDIF + END IF + JX = JX + INCX + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + IF (XCONJ) THEN + TEMP = ALPHA*X( JX ) + ELSE + TEMP = ALPHA*CONJG(X( JX )) + ENDIF + IY = KY + IF (NOCONJ) THEN + DO 70, I = 1, M + Y( IY ) = Y( IY ) + TEMP*A( I, J ) + IY = IY + INCY + 70 CONTINUE + ELSE + DO 75, I = 1, M + Y( IY ) = Y( IY ) + TEMP* CONJG(A( I, J )) + IY = IY + INCY + 75 CONTINUE + ENDIF + END IF + JX = JX + INCX + 80 CONTINUE + END IF + ELSE +* +* Form y := alpha*A'*x + y or y := alpha*conjg( A' )*x + y. +* + JY = KY + IF( INCX.EQ.1 )THEN + DO 110, J = 1, N + TEMP = ZERO + IF( NOCONJ )THEN + DO 90, I = 1, M + IF (XCONJ) THEN + TEMP = TEMP + A( I, J )*X( I ) + ELSE + TEMP = TEMP + A( I, J )*CONJG(X( I )) + ENDIF + 90 CONTINUE + ELSE + DO 100, I = 1, M + IF (XCONJ) THEN + TEMP = TEMP + CONJG( A( I, J ) )*X( I ) + ELSE + TEMP = TEMP + CONJG( A( I, J ) )*CONJG(X( I )) + ENDIF + 100 CONTINUE + END IF + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + 110 CONTINUE + ELSE + DO 140, J = 1, N + TEMP = ZERO + IX = KX + IF( NOCONJ )THEN + DO 120, I = 1, M + IF (XCONJ) THEN + TEMP = TEMP + A( I, J )*X( IX ) + ELSE + TEMP = TEMP + A( I, J )*CONJG(X( IX )) + ENDIF + IX = IX + INCX + 120 CONTINUE + ELSE + DO 130, I = 1, M + IF (XCONJ) THEN + TEMP = TEMP + CONJG( A( I, J ) )*X( IX ) + ELSE + TEMP = TEMP + CONJG( A( I, J ) )*CONJG(X( IX )) + ENDIF + IX = IX + INCX + 130 CONTINUE + END IF + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + 140 CONTINUE + END IF + END IF +* + RETURN +* +* End of CGEMV . +* + END + diff --git a/reference/cgercf.f b/reference/cgercf.f new file mode 100644 index 0000000..9b4b41b --- /dev/null +++ b/reference/cgercf.f @@ -0,0 +1,157 @@ + SUBROUTINE CGERCF ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA ) +* .. Scalar Arguments .. + COMPLEX ALPHA + INTEGER INCX, INCY, LDA, M, N +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* CGERC performs the rank 1 operation +* +* A := alpha*x*conjg( y' ) + A, +* +* where alpha is a scalar, x is an m element vector, y is an n element +* vector and A is an m by n matrix. +* +* Parameters +* ========== +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - COMPLEX array of dimension at least +* ( 1 + ( m - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the m +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, n ). +* Before entry, the leading m by n part of the array A must +* contain the matrix of coefficients. On exit, A is +* overwritten by the updated matrix. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP + INTEGER I, INFO, IX, J, JY, KX +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( M.LT.0 )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + ELSE IF( LDA.LT.MAX( 1, M ) )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CGERC ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( INCY.GT.0 )THEN + JY = 1 + ELSE + JY = 1 - ( N - 1 )*INCY + END IF + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( Y( JY ).NE.ZERO )THEN + TEMP = ALPHA*CONJG( Y( JY ) ) + DO 10, I = 1, M + A( I, J ) = A( I, J ) + X( I )*TEMP + 10 CONTINUE + END IF + JY = JY + INCY + 20 CONTINUE + ELSE + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( M - 1 )*INCX + END IF + DO 40, J = 1, N + IF( Y( JY ).NE.ZERO )THEN + TEMP = ALPHA*CONJG( Y( JY ) ) + IX = KX + DO 30, I = 1, M + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + END IF + JY = JY + INCY + 40 CONTINUE + END IF +* + RETURN +* +* End of CGERC . +* + END diff --git a/reference/cgeruf.f b/reference/cgeruf.f new file mode 100644 index 0000000..72e6969 --- /dev/null +++ b/reference/cgeruf.f @@ -0,0 +1,157 @@ + SUBROUTINE CGERUF ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA ) +* .. Scalar Arguments .. + COMPLEX ALPHA + INTEGER INCX, INCY, LDA, M, N +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* CGERU performs the rank 1 operation +* +* A := alpha*x*y' + A, +* +* where alpha is a scalar, x is an m element vector, y is an n element +* vector and A is an m by n matrix. +* +* Parameters +* ========== +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - COMPLEX array of dimension at least +* ( 1 + ( m - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the m +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, n ). +* Before entry, the leading m by n part of the array A must +* contain the matrix of coefficients. On exit, A is +* overwritten by the updated matrix. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP + INTEGER I, INFO, IX, J, JY, KX +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( M.LT.0 )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + ELSE IF( LDA.LT.MAX( 1, M ) )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CGERU ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( INCY.GT.0 )THEN + JY = 1 + ELSE + JY = 1 - ( N - 1 )*INCY + END IF + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( Y( JY ).NE.ZERO )THEN + TEMP = ALPHA*Y( JY ) + DO 10, I = 1, M + A( I, J ) = A( I, J ) + X( I )*TEMP + 10 CONTINUE + END IF + JY = JY + INCY + 20 CONTINUE + ELSE + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( M - 1 )*INCX + END IF + DO 40, J = 1, N + IF( Y( JY ).NE.ZERO )THEN + TEMP = ALPHA*Y( JY ) + IX = KX + DO 30, I = 1, M + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + END IF + JY = JY + INCY + 40 CONTINUE + END IF +* + RETURN +* +* End of CGERU . +* + END diff --git a/reference/cgesvf.f b/reference/cgesvf.f new file mode 100644 index 0000000..6544059 --- /dev/null +++ b/reference/cgesvf.f @@ -0,0 +1,107 @@ + SUBROUTINE CGESVF( N, NRHS, A, LDA, IPIV, B, LDB, INFO ) +* +* -- LAPACK driver routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDB, N, NRHS +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* CGESV computes the solution to a complex system of linear equations +* A * X = B, +* where A is an N-by-N matrix and X and B are N-by-NRHS matrices. +* +* The LU decomposition with partial pivoting and row interchanges is +* used to factor A as +* A = P * L * U, +* where P is a permutation matrix, L is unit lower triangular, and U is +* upper triangular. The factored form of A is then used to solve the +* system of equations A * X = B. +* +* Arguments +* ========= +* +* N (input) INTEGER +* The number of linear equations, i.e., the order of the +* matrix A. N >= 0. +* +* NRHS (input) INTEGER +* The number of right hand sides, i.e., the number of columns +* of the matrix B. NRHS >= 0. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the N-by-N coefficient matrix A. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* IPIV (output) INTEGER array, dimension (N) +* The pivot indices that define the permutation matrix P; +* row i of the matrix was interchanged with row IPIV(i). +* +* B (input/output) COMPLEX array, dimension (LDB,NRHS) +* On entry, the N-by-NRHS matrix of right hand side matrix B. +* On exit, if INFO = 0, the N-by-NRHS solution matrix X. +* +* LDB (input) INTEGER +* The leading dimension of the array B. LDB >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, so the solution could not be computed. +* +* ===================================================================== +* +* .. External Subroutines .. + EXTERNAL CGETRF, CGETRS, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( N.LT.0 ) THEN + INFO = -1 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -7 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGESV ', -INFO ) + RETURN + END IF +* +* Compute the LU factorization of A. +* + CALL CGETRF( N, N, A, LDA, IPIV, INFO ) + IF( INFO.EQ.0 ) THEN +* +* Solve the system A*X = B, overwriting B with X. +* + CALL CGETRS( 'No transpose', N, NRHS, A, LDA, IPIV, B, LDB, + $ INFO ) + END IF + RETURN +* +* End of CGESV +* + END diff --git a/reference/cgetf2f.f b/reference/cgetf2f.f new file mode 100644 index 0000000..f406750 --- /dev/null +++ b/reference/cgetf2f.f @@ -0,0 +1,136 @@ + SUBROUTINE CGETF2F( M, N, A, LDA, IPIV, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* September 30, 1994 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* CGETF2 computes an LU factorization of a general m-by-n matrix A +* using partial pivoting with row interchanges. +* +* The factorization has the form +* A = P * L * U +* where P is a permutation matrix, L is lower triangular with unit +* diagonal elements (lower trapezoidal if m > n), and U is upper +* triangular (upper trapezoidal if m < n). +* +* This is the right-looking Level 2 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the m by n matrix to be factored. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* IPIV (output) INTEGER array, dimension (min(M,N)) +* The pivot indices; for 1 <= i <= min(M,N), row i of the +* matrix was interchanged with row IPIV(i). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* > 0: if INFO = k, U(k,k) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, and division by zero will occur if it is used +* to solve a system of equations. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ONE, ZERO + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), + $ ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + INTEGER J, JP +* .. +* .. External Functions .. + INTEGER ICAMAX + EXTERNAL ICAMAX +* .. +* .. External Subroutines .. + EXTERNAL CGERU, CSCAL, CSWAP, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGETF2', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* + DO 10 J = 1, MIN( M, N ) +* +* Find pivot and test for singularity. +* + JP = J - 1 + ICAMAX( M-J+1, A( J, J ), 1 ) + IPIV( J ) = JP + IF( A( JP, J ).NE.ZERO ) THEN +* +* Apply the interchange to columns 1:N. +* + IF( JP.NE.J ) + $ CALL CSWAP( N, A( J, 1 ), LDA, A( JP, 1 ), LDA ) +* +* Compute elements J+1:M of J-th column. +* + IF( J.LT.M ) + $ CALL CSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 ) +* + ELSE IF( INFO.EQ.0 ) THEN +* + INFO = J + END IF +* + IF( J.LT.MIN( M, N ) ) THEN +* +* Update trailing submatrix. +* + CALL CGERU( M-J, N-J, -ONE, A( J+1, J ), 1, A( J, J+1 ), + $ LDA, A( J+1, J+1 ), LDA ) + END IF + 10 CONTINUE + RETURN +* +* End of CGETF2 +* + END diff --git a/reference/cgetrff.f b/reference/cgetrff.f new file mode 100644 index 0000000..2935c5d --- /dev/null +++ b/reference/cgetrff.f @@ -0,0 +1,156 @@ + SUBROUTINE CGETRFF( M, N, A, LDA, IPIV, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* September 30, 1994 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* CGETRF computes an LU factorization of a general M-by-N matrix A +* using partial pivoting with row interchanges. +* +* The factorization has the form +* A = P * L * U +* where P is a permutation matrix, L is lower triangular with unit +* diagonal elements (lower trapezoidal if m > n), and U is upper +* triangular (upper trapezoidal if m < n). +* +* This is the right-looking Level 3 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the M-by-N matrix to be factored. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* IPIV (output) INTEGER array, dimension (min(M,N)) +* The pivot indices; for 1 <= i <= min(M,N), row i of the +* matrix was interchanged with row IPIV(i). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, and division by zero will occur if it is used +* to solve a system of equations. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, IINFO, J, JB, NB +* .. +* .. External Subroutines .. + EXTERNAL CGEMM, CGETF2, CLASWP, CTRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGETRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = 64 + IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN +* +* Use unblocked code. +* + CALL CGETF2( M, N, A, LDA, IPIV, INFO ) + ELSE +* +* Use blocked code. +* + DO 20 J = 1, MIN( M, N ), NB + JB = MIN( MIN( M, N )-J+1, NB ) +* +* Factor diagonal and subdiagonal blocks and test for exact +* singularity. +* + CALL CGETF2( M-J+1, JB, A( J, J ), LDA, IPIV( J ), IINFO ) +* +* Adjust INFO and the pivot indices. +* + IF( INFO.EQ.0 .AND. IINFO.GT.0 ) + $ INFO = IINFO + J - 1 + DO 10 I = J, MIN( M, J+JB-1 ) + IPIV( I ) = J - 1 + IPIV( I ) + 10 CONTINUE +* +* Apply interchanges to columns 1:J-1. +* + CALL CLASWP( J-1, A, LDA, J, J+JB-1, IPIV, 1 ) +* + IF( J+JB.LE.N ) THEN +* +* Apply interchanges to columns J+JB:N. +* + CALL CLASWP( N-J-JB+1, A( 1, J+JB ), LDA, J, J+JB-1, + $ IPIV, 1 ) +* +* Compute block row of U. +* + CALL CTRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB, + $ N-J-JB+1, ONE, A( J, J ), LDA, A( J, J+JB ), + $ LDA ) + IF( J+JB.LE.M ) THEN +* +* Update trailing submatrix. +* + CALL CGEMM( 'No transpose', 'No transpose', M-J-JB+1, + $ N-J-JB+1, JB, -ONE, A( J+JB, J ), LDA, + $ A( J, J+JB ), LDA, ONE, A( J+JB, J+JB ), + $ LDA ) + END IF + END IF + 20 CONTINUE + END IF + RETURN +* +* End of CGETRF +* + END diff --git a/reference/cgetrsf.f b/reference/cgetrsf.f new file mode 100644 index 0000000..c4f0079 --- /dev/null +++ b/reference/cgetrsf.f @@ -0,0 +1,150 @@ + SUBROUTINE CGETRSF( TRANS, N, NRHS, A, LDA, IPIV, B, LDB, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* September 30, 1994 +* +* .. Scalar Arguments .. + CHARACTER TRANS + INTEGER INFO, LDA, LDB, N, NRHS +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* CGETRS solves a system of linear equations +* A * X = B, A**T * X = B, or A**H * X = B +* with a general N-by-N matrix A using the LU factorization computed +* by CGETRF. +* +* Arguments +* ========= +* +* TRANS (input) CHARACTER*1 +* Specifies the form of the system of equations: +* = 'N': A * X = B (No transpose) +* = 'T': A**T * X = B (Transpose) +* = 'C': A**H * X = B (Conjugate transpose) +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* NRHS (input) INTEGER +* The number of right hand sides, i.e., the number of columns +* of the matrix B. NRHS >= 0. +* +* A (input) COMPLEX array, dimension (LDA,N) +* The factors L and U from the factorization A = P*L*U +* as computed by CGETRF. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* IPIV (input) INTEGER array, dimension (N) +* The pivot indices from CGETRF; for 1<=i<=N, row i of the +* matrix was interchanged with row IPIV(i). +* +* B (input/output) COMPLEX array, dimension (LDB,NRHS) +* On entry, the right hand side matrix B. +* On exit, the solution matrix X. +* +* LDB (input) INTEGER +* The leading dimension of the array B. LDB >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL NOTRAN +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL CLASWP, CTRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + NOTRAN = LSAME( TRANS, 'N' ) .OR. LSAME(TRANS, 'R') + IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -8 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGETRS', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 .OR. NRHS.EQ.0 ) + $ RETURN +* + IF( NOTRAN ) THEN +* +* Solve A * X = B. +* +* Apply row interchanges to the right hand sides. +* + CALL CLASWP( NRHS, B, LDB, 1, N, IPIV, 1 ) +* +* Solve L*X = B, overwriting B with X. +* + CALL CTRSM( 'Left', 'Lower', TRANS, 'Unit', N, NRHS, + $ ONE, A, LDA, B, LDB ) +* +* Solve U*X = B, overwriting B with X. +* + CALL CTRSM( 'Left', 'Upper', TRANS, 'Non-unit', N, + $ NRHS, ONE, A, LDA, B, LDB ) + ELSE +* +* Solve A**T * X = B or A**H * X = B. +* +* Solve U'*X = B, overwriting B with X. +* + CALL CTRSM( 'Left', 'Upper', TRANS, 'Non-unit', N, NRHS, ONE, + $ A, LDA, B, LDB ) +* +* Solve L'*X = B, overwriting B with X. +* + CALL CTRSM( 'Left', 'Lower', TRANS, 'Unit', N, NRHS, ONE, A, + $ LDA, B, LDB ) +* +* Apply row interchanges to the solution vectors. +* + CALL CLASWP( NRHS, B, LDB, 1, N, IPIV, -1 ) + END IF +* + RETURN +* +* End of CGETRS +* + END diff --git a/reference/chbmvf.f b/reference/chbmvf.f new file mode 100644 index 0000000..85285c4 --- /dev/null +++ b/reference/chbmvf.f @@ -0,0 +1,309 @@ + SUBROUTINE CHBMVF( UPLO, N, K, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + COMPLEX ALPHA, BETA + INTEGER INCX, INCY, K, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* CHBMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n hermitian band matrix, with k super-diagonals. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the band matrix A is being supplied as +* follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* being supplied. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* being supplied. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry, K specifies the number of super-diagonals of the +* matrix A. K must satisfy 0 .le. K. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) +* by n part of the array A must contain the upper triangular +* band part of the hermitian matrix, supplied column by +* column, with the leading diagonal of the matrix in row +* ( k + 1 ) of the array, the first super-diagonal starting at +* position 2 in row k, and so on. The top left k by k triangle +* of the array A is not referenced. +* The following program segment will transfer the upper +* triangular part of a hermitian band matrix from conventional +* full matrix storage to band storage: +* +* DO 20, J = 1, N +* M = K + 1 - J +* DO 10, I = MAX( 1, J - K ), J +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) +* by n part of the array A must contain the lower triangular +* band part of the hermitian matrix, supplied column by +* column, with the leading diagonal of the matrix in row 1 of +* the array, the first sub-diagonal starting at position 1 in +* row 2, and so on. The bottom right k by k triangle of the +* array A is not referenced. +* The following program segment will transfer the lower +* triangular part of a hermitian band matrix from conventional +* full matrix storage to band storage: +* +* DO 20, J = 1, N +* M = 1 - J +* DO 10, I = J, MIN( N, J + K ) +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Note that the imaginary parts of the diagonal elements need +* not be set and are assumed to be zero. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( k + 1 ). +* Unchanged on exit. +* +* X - COMPLEX array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - COMPLEX . +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* Y - COMPLEX array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the +* vector y. On exit, Y is overwritten by the updated vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, KPLUS1, KX, KY, L +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX, MIN, REAL +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( K.LT.0 )THEN + INFO = 3 + ELSE IF( LDA.LT.( K + 1 ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + ELSE IF( INCY.EQ.0 )THEN + INFO = 11 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CHBMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of the array A +* are accessed sequentially with one pass through A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form y when upper triangle of A is stored. +* + KPLUS1 = K + 1 + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + L = KPLUS1 - J + DO 50, I = MAX( 1, J - K ), J - 1 + Y( I ) = Y( I ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + CONJG( A( L + I, J ) )*X( I ) + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*REAL( A( KPLUS1, J ) ) + $ + ALPHA*TEMP2 + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + L = KPLUS1 - J + DO 70, I = MAX( 1, J - K ), J - 1 + Y( IY ) = Y( IY ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + CONJG( A( L + I, J ) )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*REAL( A( KPLUS1, J ) ) + $ + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + IF( J.GT.K )THEN + KX = KX + INCX + KY = KY + INCY + END IF + 80 CONTINUE + END IF + ELSE +* +* Form y when lower triangle of A is stored. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 100, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*REAL( A( 1, J ) ) + L = 1 - J + DO 90, I = J + 1, MIN( N, J + K ) + Y( I ) = Y( I ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + CONJG( A( L + I, J ) )*X( I ) + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*REAL( A( 1, J ) ) + L = 1 - J + IX = JX + IY = JY + DO 110, I = J + 1, MIN( N, J + K ) + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + CONJG( A( L + I, J ) )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of CHBMV . +* + END diff --git a/reference/chemm3mf.f b/reference/chemm3mf.f new file mode 100644 index 0000000..7fd2e6e --- /dev/null +++ b/reference/chemm3mf.f @@ -0,0 +1,304 @@ + SUBROUTINE CHEMM3MF ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO + INTEGER M, N, LDA, LDB, LDC + COMPLEX ALPHA, BETA +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* CHEMM performs one of the matrix-matrix operations +* +* C := alpha*A*B + beta*C, +* +* or +* +* C := alpha*B*A + beta*C, +* +* where alpha and beta are scalars, A is an hermitian matrix and B and +* C are m by n matrices. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether the hermitian matrix A +* appears on the left or right in the operation as follows: +* +* SIDE = 'L' or 'l' C := alpha*A*B + beta*C, +* +* SIDE = 'R' or 'r' C := alpha*B*A + beta*C, +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the hermitian matrix A is to be +* referenced as follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of the +* hermitian matrix is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of the +* hermitian matrix is to be referenced. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix C. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix C. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is +* m when SIDE = 'L' or 'l' and is n otherwise. +* Before entry with SIDE = 'L' or 'l', the m by m part of +* the array A must contain the hermitian matrix, such that +* when UPLO = 'U' or 'u', the leading m by m upper triangular +* part of the array A must contain the upper triangular part +* of the hermitian matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading m by m lower triangular part of the array A +* must contain the lower triangular part of the hermitian +* matrix and the strictly upper triangular part of A is not +* referenced. +* Before entry with SIDE = 'R' or 'r', the n by n part of +* the array A must contain the hermitian matrix, such that +* when UPLO = 'U' or 'u', the leading n by n upper triangular +* part of the array A must contain the upper triangular part +* of the hermitian matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading n by n lower triangular part of the array A +* must contain the lower triangular part of the hermitian +* matrix and the strictly upper triangular part of A is not +* referenced. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, n ). +* Unchanged on exit. +* +* B - COMPLEX array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* BETA - COMPLEX . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - COMPLEX array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n updated +* matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX, REAL +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, K, NROWA + COMPLEX TEMP1, TEMP2 +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Executable Statements .. +* +* Set NROWA as the number of rows of A. +* + IF( LSAME( SIDE, 'L' ) )THEN + NROWA = M + ELSE + NROWA = N + END IF + UPPER = LSAME( UPLO, 'U' ) +* +* Test the input parameters. +* + INFO = 0 + IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. + $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN + INFO = 2 + ELSE IF( M .LT.0 )THEN + INFO = 3 + ELSE IF( N .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, M ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CHEMM3M', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, M + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( SIDE, 'L' ) )THEN +* +* Form C := alpha*A*B + beta*C. +* + IF( UPPER )THEN + DO 70, J = 1, N + DO 60, I = 1, M + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 50, K = 1, I - 1 + C( K, J ) = C( K, J ) + TEMP1*A( K, I ) + TEMP2 = TEMP2 + + $ B( K, J )*CONJG( A( K, I ) ) + 50 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*REAL( A( I, I ) ) + + $ ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*REAL( A( I, I ) ) + + $ ALPHA*TEMP2 + END IF + 60 CONTINUE + 70 CONTINUE + ELSE + DO 100, J = 1, N + DO 90, I = M, 1, -1 + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 80, K = I + 1, M + C( K, J ) = C( K, J ) + TEMP1*A( K, I ) + TEMP2 = TEMP2 + + $ B( K, J )*CONJG( A( K, I ) ) + 80 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*REAL( A( I, I ) ) + + $ ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*REAL( A( I, I ) ) + + $ ALPHA*TEMP2 + END IF + 90 CONTINUE + 100 CONTINUE + END IF + ELSE +* +* Form C := alpha*B*A + beta*C. +* + DO 170, J = 1, N + TEMP1 = ALPHA*REAL( A( J, J ) ) + IF( BETA.EQ.ZERO )THEN + DO 110, I = 1, M + C( I, J ) = TEMP1*B( I, J ) + 110 CONTINUE + ELSE + DO 120, I = 1, M + C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) + 120 CONTINUE + END IF + DO 140, K = 1, J - 1 + IF( UPPER )THEN + TEMP1 = ALPHA*A( K, J ) + ELSE + TEMP1 = ALPHA*CONJG( A( J, K ) ) + END IF + DO 130, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 130 CONTINUE + 140 CONTINUE + DO 160, K = J + 1, N + IF( UPPER )THEN + TEMP1 = ALPHA*CONJG( A( J, K ) ) + ELSE + TEMP1 = ALPHA*A( K, J ) + END IF + DO 150, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 150 CONTINUE + 160 CONTINUE + 170 CONTINUE + END IF +* + RETURN +* +* End of CHEMM . +* + END diff --git a/reference/chemmf.f b/reference/chemmf.f new file mode 100644 index 0000000..ccb9b0a --- /dev/null +++ b/reference/chemmf.f @@ -0,0 +1,304 @@ + SUBROUTINE CHEMMF ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO + INTEGER M, N, LDA, LDB, LDC + COMPLEX ALPHA, BETA +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* CHEMM performs one of the matrix-matrix operations +* +* C := alpha*A*B + beta*C, +* +* or +* +* C := alpha*B*A + beta*C, +* +* where alpha and beta are scalars, A is an hermitian matrix and B and +* C are m by n matrices. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether the hermitian matrix A +* appears on the left or right in the operation as follows: +* +* SIDE = 'L' or 'l' C := alpha*A*B + beta*C, +* +* SIDE = 'R' or 'r' C := alpha*B*A + beta*C, +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the hermitian matrix A is to be +* referenced as follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of the +* hermitian matrix is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of the +* hermitian matrix is to be referenced. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix C. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix C. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is +* m when SIDE = 'L' or 'l' and is n otherwise. +* Before entry with SIDE = 'L' or 'l', the m by m part of +* the array A must contain the hermitian matrix, such that +* when UPLO = 'U' or 'u', the leading m by m upper triangular +* part of the array A must contain the upper triangular part +* of the hermitian matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading m by m lower triangular part of the array A +* must contain the lower triangular part of the hermitian +* matrix and the strictly upper triangular part of A is not +* referenced. +* Before entry with SIDE = 'R' or 'r', the n by n part of +* the array A must contain the hermitian matrix, such that +* when UPLO = 'U' or 'u', the leading n by n upper triangular +* part of the array A must contain the upper triangular part +* of the hermitian matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading n by n lower triangular part of the array A +* must contain the lower triangular part of the hermitian +* matrix and the strictly upper triangular part of A is not +* referenced. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, n ). +* Unchanged on exit. +* +* B - COMPLEX array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* BETA - COMPLEX . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - COMPLEX array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n updated +* matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX, REAL +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, K, NROWA + COMPLEX TEMP1, TEMP2 +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Executable Statements .. +* +* Set NROWA as the number of rows of A. +* + IF( LSAME( SIDE, 'L' ) )THEN + NROWA = M + ELSE + NROWA = N + END IF + UPPER = LSAME( UPLO, 'U' ) +* +* Test the input parameters. +* + INFO = 0 + IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. + $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN + INFO = 2 + ELSE IF( M .LT.0 )THEN + INFO = 3 + ELSE IF( N .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, M ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CHEMM3M', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, M + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( SIDE, 'L' ) )THEN +* +* Form C := alpha*A*B + beta*C. +* + IF( UPPER )THEN + DO 70, J = 1, N + DO 60, I = 1, M + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 50, K = 1, I - 1 + C( K, J ) = C( K, J ) + TEMP1*A( K, I ) + TEMP2 = TEMP2 + + $ B( K, J )*CONJG( A( K, I ) ) + 50 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*REAL( A( I, I ) ) + + $ ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*REAL( A( I, I ) ) + + $ ALPHA*TEMP2 + END IF + 60 CONTINUE + 70 CONTINUE + ELSE + DO 100, J = 1, N + DO 90, I = M, 1, -1 + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 80, K = I + 1, M + C( K, J ) = C( K, J ) + TEMP1*A( K, I ) + TEMP2 = TEMP2 + + $ B( K, J )*CONJG( A( K, I ) ) + 80 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*REAL( A( I, I ) ) + + $ ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*REAL( A( I, I ) ) + + $ ALPHA*TEMP2 + END IF + 90 CONTINUE + 100 CONTINUE + END IF + ELSE +* +* Form C := alpha*B*A + beta*C. +* + DO 170, J = 1, N + TEMP1 = ALPHA*REAL( A( J, J ) ) + IF( BETA.EQ.ZERO )THEN + DO 110, I = 1, M + C( I, J ) = TEMP1*B( I, J ) + 110 CONTINUE + ELSE + DO 120, I = 1, M + C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) + 120 CONTINUE + END IF + DO 140, K = 1, J - 1 + IF( UPPER )THEN + TEMP1 = ALPHA*A( K, J ) + ELSE + TEMP1 = ALPHA*CONJG( A( J, K ) ) + END IF + DO 130, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 130 CONTINUE + 140 CONTINUE + DO 160, K = J + 1, N + IF( UPPER )THEN + TEMP1 = ALPHA*CONJG( A( J, K ) ) + ELSE + TEMP1 = ALPHA*A( K, J ) + END IF + DO 150, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 150 CONTINUE + 160 CONTINUE + 170 CONTINUE + END IF +* + RETURN +* +* End of CHEMM . +* + END diff --git a/reference/chemvf.f b/reference/chemvf.f new file mode 100644 index 0000000..6ce567d --- /dev/null +++ b/reference/chemvf.f @@ -0,0 +1,349 @@ + SUBROUTINE CHEMVF ( UPLO, N, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + COMPLEX ALPHA, BETA + INTEGER INCX, INCY, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* CHEMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n hermitian matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the hermitian matrix and the strictly +* lower triangular part of A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the hermitian matrix and the strictly +* upper triangular part of A is not referenced. +* Note that the imaginary parts of the diagonal elements need +* not be set and are assumed to be zero. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* X - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - COMPLEX . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. On exit, Y is overwritten by the updated +* vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX, REAL +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ).AND. + $ .NOT.LSAME( UPLO, 'V' ).AND. + $ .NOT.LSAME( UPLO, 'M' ))THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 5 + ELSE IF( INCX.EQ.0 )THEN + INFO = 7 + ELSE IF( INCY.EQ.0 )THEN + INFO = 10 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CHEMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + + + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form y when A is stored in upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + DO 50, I = 1, J - 1 + Y( I ) = Y( I ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + CONJG( A( I, J ) )*X( I ) + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*REAL( A( J, J ) ) + ALPHA*TEMP2 + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + DO 70, I = 1, J - 1 + Y( IY ) = Y( IY ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + CONJG( A( I, J ) )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*REAL( A( J, J ) ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 80 CONTINUE + END IF + RETURN + ENDIF + + IF( LSAME( UPLO, 'L' ) )THEN +* +* Form y when A is stored in lower triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 100, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*REAL( A( J, J ) ) + DO 90, I = J + 1, N + Y( I ) = Y( I ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + CONJG( A( I, J ) )*X( I ) + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*REAL( A( J, J ) ) + IX = JX + IY = JY + DO 110, I = J + 1, N + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + CONJG( A( I, J ) )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 120 CONTINUE + END IF + RETURN + END IF + + IF( LSAME( UPLO, 'V' ) )THEN +* +* Form y when A is stored in upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 160, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + DO 150, I = 1, J - 1 + Y( I ) = Y( I ) + TEMP1* CONJG(A( I, J )) + TEMP2 = TEMP2 + A( I, J )*X( I ) + 150 CONTINUE + Y( J ) = Y( J ) + TEMP1*REAL( A( J, J ) ) + ALPHA*TEMP2 + 160 CONTINUE + ELSE + JX = KX + JY = KY + DO 180, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + DO 170, I = 1, J - 1 + Y( IY ) = Y( IY ) + TEMP1* CONJG(A( I, J )) + TEMP2 = TEMP2 + A( I, J )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 170 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*REAL( A( J, J ) ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 180 CONTINUE + END IF + RETURN + ENDIF + + + IF( LSAME( UPLO, 'M' ) )THEN +* +* Form y when A is stored in lower triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 200, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*REAL( A( J, J ) ) + DO 190, I = J + 1, N + Y( I ) = Y( I ) + TEMP1*CONJG(A( I, J )) + TEMP2 = TEMP2 + A( I, J )*X( I ) + 190 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 200 CONTINUE + ELSE + JX = KX + JY = KY + DO 220, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*REAL( A( J, J ) ) + IX = JX + IY = JY + DO 210, I = J + 1, N + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*CONJG(A( I, J )) + TEMP2 = TEMP2 + A( I, J )*X( IX ) + 210 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 220 CONTINUE + END IF + RETURN + END IF + +* +* +* End of CHEMV . +* + END diff --git a/reference/cher2f.f b/reference/cher2f.f new file mode 100644 index 0000000..096709a --- /dev/null +++ b/reference/cher2f.f @@ -0,0 +1,249 @@ + SUBROUTINE CHER2F ( UPLO, N, ALPHA, X, INCX, Y, INCY, A, LDA ) +* .. Scalar Arguments .. + COMPLEX ALPHA + INTEGER INCX, INCY, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* CHER2 performs the hermitian rank 2 operation +* +* A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A, +* +* where alpha is a scalar, x and y are n element vectors and A is an n +* by n hermitian matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the hermitian matrix and the strictly +* lower triangular part of A is not referenced. On exit, the +* upper triangular part of the array A is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the hermitian matrix and the strictly +* upper triangular part of A is not referenced. On exit, the +* lower triangular part of the array A is overwritten by the +* lower triangular part of the updated matrix. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero, and on exit they +* are set to zero. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX, REAL +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CHER2 ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set up the start points in X and Y if the increments are not both +* unity. +* + IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF + JX = KX + JY = KY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when A is stored in the upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 20, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*CONJG( Y( J ) ) + TEMP2 = CONJG( ALPHA*X( J ) ) + DO 10, I = 1, J - 1 + A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 + 10 CONTINUE + A( J, J ) = REAL( A( J, J ) ) + + $ REAL( X( J )*TEMP1 + Y( J )*TEMP2 ) + ELSE + A( J, J ) = REAL( A( J, J ) ) + END IF + 20 CONTINUE + ELSE + DO 40, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*CONJG( Y( JY ) ) + TEMP2 = CONJG( ALPHA*X( JX ) ) + IX = KX + IY = KY + DO 30, I = 1, J - 1 + A( I, J ) = A( I, J ) + X( IX )*TEMP1 + $ + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 30 CONTINUE + A( J, J ) = REAL( A( J, J ) ) + + $ REAL( X( JX )*TEMP1 + Y( JY )*TEMP2 ) + ELSE + A( J, J ) = REAL( A( J, J ) ) + END IF + JX = JX + INCX + JY = JY + INCY + 40 CONTINUE + END IF + ELSE +* +* Form A when A is stored in the lower triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*CONJG( Y( J ) ) + TEMP2 = CONJG( ALPHA*X( J ) ) + A( J, J ) = REAL( A( J, J ) ) + + $ REAL( X( J )*TEMP1 + Y( J )*TEMP2 ) + DO 50, I = J + 1, N + A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 + 50 CONTINUE + ELSE + A( J, J ) = REAL( A( J, J ) ) + END IF + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*CONJG( Y( JY ) ) + TEMP2 = CONJG( ALPHA*X( JX ) ) + A( J, J ) = REAL( A( J, J ) ) + + $ REAL( X( JX )*TEMP1 + Y( JY )*TEMP2 ) + IX = JX + IY = JY + DO 70, I = J + 1, N + IX = IX + INCX + IY = IY + INCY + A( I, J ) = A( I, J ) + X( IX )*TEMP1 + $ + Y( IY )*TEMP2 + 70 CONTINUE + ELSE + A( J, J ) = REAL( A( J, J ) ) + END IF + JX = JX + INCX + JY = JY + INCY + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of CHER2 . +* + END diff --git a/reference/cher2kf.f b/reference/cher2kf.f new file mode 100644 index 0000000..935c92d --- /dev/null +++ b/reference/cher2kf.f @@ -0,0 +1,371 @@ + SUBROUTINE CHER2KF( UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 UPLO, TRANS + INTEGER N, K, LDA, LDB, LDC + REAL BETA + COMPLEX ALPHA +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* CHER2K performs one of the hermitian rank 2k operations +* +* C := alpha*A*conjg( B' ) + conjg( alpha )*B*conjg( A' ) + beta*C, +* +* or +* +* C := alpha*conjg( A' )*B + conjg( alpha )*conjg( B' )*A + beta*C, +* +* where alpha and beta are scalars with beta real, C is an n by n +* hermitian matrix and A and B are n by k matrices in the first case +* and k by n matrices in the second case. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array C is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of C +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of C +* is to be referenced. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' C := alpha*A*conjg( B' ) + +* conjg( alpha )*B*conjg( A' ) + +* beta*C. +* +* TRANS = 'C' or 'c' C := alpha*conjg( A' )*B + +* conjg( alpha )*conjg( B' )*A + +* beta*C. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with TRANS = 'N' or 'n', K specifies the number +* of columns of the matrices A and B, and on entry with +* TRANS = 'C' or 'c', K specifies the number of rows of the +* matrices A and B. K must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array A must contain the matrix A, otherwise +* the leading k by n part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDA must be at least max( 1, n ), otherwise LDA must +* be at least max( 1, k ). +* Unchanged on exit. +* +* B - COMPLEX array of DIMENSION ( LDB, kb ), where kb is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array B must contain the matrix B, otherwise +* the leading k by n part of the array B must contain the +* matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDB must be at least max( 1, n ), otherwise LDB must +* be at least max( 1, k ). +* Unchanged on exit. +* +* BETA - REAL . +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* C - COMPLEX array of DIMENSION ( LDC, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array C must contain the upper +* triangular part of the hermitian matrix and the strictly +* lower triangular part of C is not referenced. On exit, the +* upper triangular part of the array C is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array C must contain the lower +* triangular part of the hermitian matrix and the strictly +* upper triangular part of C is not referenced. On exit, the +* lower triangular part of the array C is overwritten by the +* lower triangular part of the updated matrix. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero, and on exit they +* are set to zero. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* -- Modified 8-Nov-93 to set C(J,J) to REAL( C(J,J) ) when BETA = 1. +* Ed Anderson, Cray Research Inc. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX, REAL +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, L, NROWA + COMPLEX TEMP1, TEMP2 +* .. Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + IF( LSAME( TRANS, 'N' ) )THEN + NROWA = N + ELSE + NROWA = K + END IF + UPPER = LSAME( UPLO, 'U' ) +* + INFO = 0 + IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANS, 'C' ) ) )THEN + INFO = 2 + ELSE IF( N .LT.0 )THEN + INFO = 3 + ELSE IF( K .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, NROWA ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, N ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CHER2K', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR. + $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( UPPER )THEN + IF( BETA.EQ.REAL( ZERO ) )THEN + DO 20, J = 1, N + DO 10, I = 1, J + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, J - 1 + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + C( J, J ) = BETA*REAL( C( J, J ) ) + 40 CONTINUE + END IF + ELSE + IF( BETA.EQ.REAL( ZERO ) )THEN + DO 60, J = 1, N + DO 50, I = J, N + C( I, J ) = ZERO + 50 CONTINUE + 60 CONTINUE + ELSE + DO 80, J = 1, N + C( J, J ) = BETA*REAL( C( J, J ) ) + DO 70, I = J + 1, N + C( I, J ) = BETA*C( I, J ) + 70 CONTINUE + 80 CONTINUE + END IF + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form C := alpha*A*conjg( B' ) + conjg( alpha )*B*conjg( A' ) + +* C. +* + IF( UPPER )THEN + DO 130, J = 1, N + IF( BETA.EQ.REAL( ZERO ) )THEN + DO 90, I = 1, J + C( I, J ) = ZERO + 90 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 100, I = 1, J - 1 + C( I, J ) = BETA*C( I, J ) + 100 CONTINUE + C( J, J ) = BETA*REAL( C( J, J ) ) + ELSE + C( J, J ) = REAL( C( J, J ) ) + END IF + DO 120, L = 1, K + IF( ( A( J, L ).NE.ZERO ).OR. + $ ( B( J, L ).NE.ZERO ) )THEN + TEMP1 = ALPHA*CONJG( B( J, L ) ) + TEMP2 = CONJG( ALPHA*A( J, L ) ) + DO 110, I = 1, J - 1 + C( I, J ) = C( I, J ) + A( I, L )*TEMP1 + + $ B( I, L )*TEMP2 + 110 CONTINUE + C( J, J ) = REAL( C( J, J ) ) + + $ REAL( A( J, L )*TEMP1 + + $ B( J, L )*TEMP2 ) + END IF + 120 CONTINUE + 130 CONTINUE + ELSE + DO 180, J = 1, N + IF( BETA.EQ.REAL( ZERO ) )THEN + DO 140, I = J, N + C( I, J ) = ZERO + 140 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 150, I = J + 1, N + C( I, J ) = BETA*C( I, J ) + 150 CONTINUE + C( J, J ) = BETA*REAL( C( J, J ) ) + ELSE + C( J, J ) = REAL( C( J, J ) ) + END IF + DO 170, L = 1, K + IF( ( A( J, L ).NE.ZERO ).OR. + $ ( B( J, L ).NE.ZERO ) )THEN + TEMP1 = ALPHA*CONJG( B( J, L ) ) + TEMP2 = CONJG( ALPHA*A( J, L ) ) + DO 160, I = J + 1, N + C( I, J ) = C( I, J ) + A( I, L )*TEMP1 + + $ B( I, L )*TEMP2 + 160 CONTINUE + C( J, J ) = REAL( C( J, J ) ) + + $ REAL( A( J, L )*TEMP1 + + $ B( J, L )*TEMP2 ) + END IF + 170 CONTINUE + 180 CONTINUE + END IF + ELSE +* +* Form C := alpha*conjg( A' )*B + conjg( alpha )*conjg( B' )*A + +* C. +* + IF( UPPER )THEN + DO 210, J = 1, N + DO 200, I = 1, J + TEMP1 = ZERO + TEMP2 = ZERO + DO 190, L = 1, K + TEMP1 = TEMP1 + CONJG( A( L, I ) )*B( L, J ) + TEMP2 = TEMP2 + CONJG( B( L, I ) )*A( L, J ) + 190 CONTINUE + IF( I.EQ.J )THEN + IF( BETA.EQ.REAL( ZERO ) )THEN + C( J, J ) = REAL( ALPHA *TEMP1 + + $ CONJG( ALPHA )*TEMP2 ) + ELSE + C( J, J ) = BETA*REAL( C( J, J ) ) + + $ REAL( ALPHA *TEMP1 + + $ CONJG( ALPHA )*TEMP2 ) + END IF + ELSE + IF( BETA.EQ.REAL( ZERO ) )THEN + C( I, J ) = ALPHA*TEMP1 + CONJG( ALPHA )*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ ALPHA*TEMP1 + CONJG( ALPHA )*TEMP2 + END IF + END IF + 200 CONTINUE + 210 CONTINUE + ELSE + DO 240, J = 1, N + DO 230, I = J, N + TEMP1 = ZERO + TEMP2 = ZERO + DO 220, L = 1, K + TEMP1 = TEMP1 + CONJG( A( L, I ) )*B( L, J ) + TEMP2 = TEMP2 + CONJG( B( L, I ) )*A( L, J ) + 220 CONTINUE + IF( I.EQ.J )THEN + IF( BETA.EQ.REAL( ZERO ) )THEN + C( J, J ) = REAL( ALPHA *TEMP1 + + $ CONJG( ALPHA )*TEMP2 ) + ELSE + C( J, J ) = BETA*REAL( C( J, J ) ) + + $ REAL( ALPHA *TEMP1 + + $ CONJG( ALPHA )*TEMP2 ) + END IF + ELSE + IF( BETA.EQ.REAL( ZERO ) )THEN + C( I, J ) = ALPHA*TEMP1 + CONJG( ALPHA )*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ ALPHA*TEMP1 + CONJG( ALPHA )*TEMP2 + END IF + END IF + 230 CONTINUE + 240 CONTINUE + END IF + END IF +* + RETURN +* +* End of CHER2K. +* + END diff --git a/reference/cherf.f b/reference/cherf.f new file mode 100644 index 0000000..748ae56 --- /dev/null +++ b/reference/cherf.f @@ -0,0 +1,212 @@ + SUBROUTINE CHERF ( UPLO, N, ALPHA, X, INCX, A, LDA ) +* .. Scalar Arguments .. + REAL ALPHA + INTEGER INCX, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* CHER performs the hermitian rank 1 operation +* +* A := alpha*x*conjg( x' ) + A, +* +* where alpha is a real scalar, x is an n element vector and A is an +* n by n hermitian matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the hermitian matrix and the strictly +* lower triangular part of A is not referenced. On exit, the +* upper triangular part of the array A is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the hermitian matrix and the strictly +* upper triangular part of A is not referenced. On exit, the +* lower triangular part of the array A is overwritten by the +* lower triangular part of the updated matrix. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero, and on exit they +* are set to zero. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP + INTEGER I, INFO, IX, J, JX, KX +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX, REAL +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CHER ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.REAL( ZERO ) ) ) + $ RETURN +* +* Set the start point in X if the increment is not unity. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when A is stored in upper triangle. +* + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*CONJG( X( J ) ) + DO 10, I = 1, J - 1 + A( I, J ) = A( I, J ) + X( I )*TEMP + 10 CONTINUE + A( J, J ) = REAL( A( J, J ) ) + REAL( X( J )*TEMP ) + ELSE + A( J, J ) = REAL( A( J, J ) ) + END IF + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*CONJG( X( JX ) ) + IX = KX + DO 30, I = 1, J - 1 + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + A( J, J ) = REAL( A( J, J ) ) + REAL( X( JX )*TEMP ) + ELSE + A( J, J ) = REAL( A( J, J ) ) + END IF + JX = JX + INCX + 40 CONTINUE + END IF + ELSE +* +* Form A when A is stored in lower triangle. +* + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*CONJG( X( J ) ) + A( J, J ) = REAL( A( J, J ) ) + REAL( TEMP*X( J ) ) + DO 50, I = J + 1, N + A( I, J ) = A( I, J ) + X( I )*TEMP + 50 CONTINUE + ELSE + A( J, J ) = REAL( A( J, J ) ) + END IF + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*CONJG( X( JX ) ) + A( J, J ) = REAL( A( J, J ) ) + REAL( TEMP*X( JX ) ) + IX = JX + DO 70, I = J + 1, N + IX = IX + INCX + A( I, J ) = A( I, J ) + X( IX )*TEMP + 70 CONTINUE + ELSE + A( J, J ) = REAL( A( J, J ) ) + END IF + JX = JX + INCX + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of CHER . +* + END diff --git a/reference/cherkf.f b/reference/cherkf.f new file mode 100644 index 0000000..e3d0157 --- /dev/null +++ b/reference/cherkf.f @@ -0,0 +1,328 @@ + SUBROUTINE CHERKF ( UPLO, TRANS, N, K, ALPHA, A, LDA, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 UPLO, TRANS + INTEGER N, K, LDA, LDC + REAL ALPHA, BETA +* .. Array Arguments .. + COMPLEX A( LDA, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* CHERK performs one of the hermitian rank k operations +* +* C := alpha*A*conjg( A' ) + beta*C, +* +* or +* +* C := alpha*conjg( A' )*A + beta*C, +* +* where alpha and beta are real scalars, C is an n by n hermitian +* matrix and A is an n by k matrix in the first case and a k by n +* matrix in the second case. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array C is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of C +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of C +* is to be referenced. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' C := alpha*A*conjg( A' ) + beta*C. +* +* TRANS = 'C' or 'c' C := alpha*conjg( A' )*A + beta*C. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with TRANS = 'N' or 'n', K specifies the number +* of columns of the matrix A, and on entry with +* TRANS = 'C' or 'c', K specifies the number of rows of the +* matrix A. K must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array A must contain the matrix A, otherwise +* the leading k by n part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDA must be at least max( 1, n ), otherwise LDA must +* be at least max( 1, k ). +* Unchanged on exit. +* +* BETA - REAL . +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* C - COMPLEX array of DIMENSION ( LDC, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array C must contain the upper +* triangular part of the hermitian matrix and the strictly +* lower triangular part of C is not referenced. On exit, the +* upper triangular part of the array C is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array C must contain the lower +* triangular part of the hermitian matrix and the strictly +* upper triangular part of C is not referenced. On exit, the +* lower triangular part of the array C is overwritten by the +* lower triangular part of the updated matrix. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero, and on exit they +* are set to zero. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* -- Modified 8-Nov-93 to set C(J,J) to REAL( C(J,J) ) when BETA = 1. +* Ed Anderson, Cray Research Inc. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CMPLX, CONJG, MAX, REAL +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, L, NROWA + REAL RTEMP + COMPLEX TEMP +* .. Parameters .. + REAL ONE , ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + IF( LSAME( TRANS, 'N' ) )THEN + NROWA = N + ELSE + NROWA = K + END IF + UPPER = LSAME( UPLO, 'U' ) +* + INFO = 0 + IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANS, 'C' ) ) )THEN + INFO = 2 + ELSE IF( N .LT.0 )THEN + INFO = 3 + ELSE IF( K .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDC.LT.MAX( 1, N ) )THEN + INFO = 10 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CHERK ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR. + $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( UPPER )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, J + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, J - 1 + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + C( J, J ) = BETA*REAL( C( J, J ) ) + 40 CONTINUE + END IF + ELSE + IF( BETA.EQ.ZERO )THEN + DO 60, J = 1, N + DO 50, I = J, N + C( I, J ) = ZERO + 50 CONTINUE + 60 CONTINUE + ELSE + DO 80, J = 1, N + C( J, J ) = BETA*REAL( C( J, J ) ) + DO 70, I = J + 1, N + C( I, J ) = BETA*C( I, J ) + 70 CONTINUE + 80 CONTINUE + END IF + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form C := alpha*A*conjg( A' ) + beta*C. +* + IF( UPPER )THEN + DO 130, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 90, I = 1, J + C( I, J ) = ZERO + 90 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 100, I = 1, J - 1 + C( I, J ) = BETA*C( I, J ) + 100 CONTINUE + C( J, J ) = BETA*REAL( C( J, J ) ) + ELSE + C( J, J ) = REAL( C( J, J ) ) + END IF + DO 120, L = 1, K + IF( A( J, L ).NE.CMPLX( ZERO ) )THEN + TEMP = ALPHA*CONJG( A( J, L ) ) + DO 110, I = 1, J - 1 + C( I, J ) = C( I, J ) + TEMP*A( I, L ) + 110 CONTINUE + C( J, J ) = REAL( C( J, J ) ) + + $ REAL( TEMP*A( I, L ) ) + END IF + 120 CONTINUE + 130 CONTINUE + ELSE + DO 180, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 140, I = J, N + C( I, J ) = ZERO + 140 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + C( J, J ) = BETA*REAL( C( J, J ) ) + DO 150, I = J + 1, N + C( I, J ) = BETA*C( I, J ) + 150 CONTINUE + ELSE + C( J, J ) = REAL( C( J, J ) ) + END IF + DO 170, L = 1, K + IF( A( J, L ).NE.CMPLX( ZERO ) )THEN + TEMP = ALPHA*CONJG( A( J, L ) ) + C( J, J ) = REAL( C( J, J ) ) + + $ REAL( TEMP*A( J, L ) ) + DO 160, I = J + 1, N + C( I, J ) = C( I, J ) + TEMP*A( I, L ) + 160 CONTINUE + END IF + 170 CONTINUE + 180 CONTINUE + END IF + ELSE +* +* Form C := alpha*conjg( A' )*A + beta*C. +* + IF( UPPER )THEN + DO 220, J = 1, N + DO 200, I = 1, J - 1 + TEMP = ZERO + DO 190, L = 1, K + TEMP = TEMP + CONJG( A( L, I ) )*A( L, J ) + 190 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP + ELSE + C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) + END IF + 200 CONTINUE + RTEMP = ZERO + DO 210, L = 1, K + RTEMP = RTEMP + CONJG( A( L, J ) )*A( L, J ) + 210 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( J, J ) = ALPHA*RTEMP + ELSE + C( J, J ) = ALPHA*RTEMP + BETA*REAL( C( J, J ) ) + END IF + 220 CONTINUE + ELSE + DO 260, J = 1, N + RTEMP = ZERO + DO 230, L = 1, K + RTEMP = RTEMP + CONJG( A( L, J ) )*A( L, J ) + 230 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( J, J ) = ALPHA*RTEMP + ELSE + C( J, J ) = ALPHA*RTEMP + BETA*REAL( C( J, J ) ) + END IF + DO 250, I = J + 1, N + TEMP = ZERO + DO 240, L = 1, K + TEMP = TEMP + CONJG( A( L, I ) )*A( L, J ) + 240 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP + ELSE + C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) + END IF + 250 CONTINUE + 260 CONTINUE + END IF + END IF +* + RETURN +* +* End of CHERK . +* + END diff --git a/reference/chpmvf.f b/reference/chpmvf.f new file mode 100644 index 0000000..9f65105 --- /dev/null +++ b/reference/chpmvf.f @@ -0,0 +1,270 @@ + SUBROUTINE CHPMVF( UPLO, N, ALPHA, AP, X, INCX, BETA, Y, INCY ) +* .. Scalar Arguments .. + COMPLEX ALPHA, BETA + INTEGER INCX, INCY, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX AP( * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* CHPMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n hermitian matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* AP - COMPLEX array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the hermitian matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the hermitian matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. +* Note that the imaginary parts of the diagonal elements need +* not be set and are assumed to be zero. +* Unchanged on exit. +* +* X - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - COMPLEX . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. On exit, Y is overwritten by the updated +* vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, REAL +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 6 + ELSE IF( INCY.EQ.0 )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CHPMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + KK = 1 + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form y when AP contains the upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + K = KK + DO 50, I = 1, J - 1 + Y( I ) = Y( I ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + CONJG( AP( K ) )*X( I ) + K = K + 1 + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*REAL( AP( KK + J - 1 ) ) + $ + ALPHA*TEMP2 + KK = KK + J + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + DO 70, K = KK, KK + J - 2 + Y( IY ) = Y( IY ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + CONJG( AP( K ) )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*REAL( AP( KK + J - 1 ) ) + $ + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + KK = KK + J + 80 CONTINUE + END IF + ELSE +* +* Form y when AP contains the lower triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 100, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*REAL( AP( KK ) ) + K = KK + 1 + DO 90, I = J + 1, N + Y( I ) = Y( I ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + CONJG( AP( K ) )*X( I ) + K = K + 1 + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + KK = KK + ( N - J + 1 ) + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*REAL( AP( KK ) ) + IX = JX + IY = JY + DO 110, K = KK + 1, KK + N - J + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + CONJG( AP( K ) )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + KK = KK + ( N - J + 1 ) + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of CHPMV . +* + END diff --git a/reference/chpr2f.f b/reference/chpr2f.f new file mode 100644 index 0000000..64f8fe9 --- /dev/null +++ b/reference/chpr2f.f @@ -0,0 +1,251 @@ + SUBROUTINE CHPR2F( UPLO, N, ALPHA, X, INCX, Y, INCY, AP ) +* .. Scalar Arguments .. + COMPLEX ALPHA + INTEGER INCX, INCY, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX AP( * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* CHPR2 performs the hermitian rank 2 operation +* +* A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A, +* +* where alpha is a scalar, x and y are n element vectors and A is an +* n by n hermitian matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* AP - COMPLEX array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the hermitian matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. On exit, the array +* AP is overwritten by the upper triangular part of the +* updated matrix. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the hermitian matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. On exit, the array +* AP is overwritten by the lower triangular part of the +* updated matrix. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero, and on exit they +* are set to zero. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, REAL +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CHPR2 ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set up the start points in X and Y if the increments are not both +* unity. +* + IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF + JX = KX + JY = KY + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* + KK = 1 + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when upper triangle is stored in AP. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 20, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*CONJG( Y( J ) ) + TEMP2 = CONJG( ALPHA*X( J ) ) + K = KK + DO 10, I = 1, J - 1 + AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 + K = K + 1 + 10 CONTINUE + AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) ) + + $ REAL( X( J )*TEMP1 + Y( J )*TEMP2 ) + ELSE + AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) ) + END IF + KK = KK + J + 20 CONTINUE + ELSE + DO 40, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*CONJG( Y( JY ) ) + TEMP2 = CONJG( ALPHA*X( JX ) ) + IX = KX + IY = KY + DO 30, K = KK, KK + J - 2 + AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 30 CONTINUE + AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) ) + + $ REAL( X( JX )*TEMP1 + + $ Y( JY )*TEMP2 ) + ELSE + AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) ) + END IF + JX = JX + INCX + JY = JY + INCY + KK = KK + J + 40 CONTINUE + END IF + ELSE +* +* Form A when lower triangle is stored in AP. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*CONJG( Y( J ) ) + TEMP2 = CONJG( ALPHA*X( J ) ) + AP( KK ) = REAL( AP( KK ) ) + + $ REAL( X( J )*TEMP1 + Y( J )*TEMP2 ) + K = KK + 1 + DO 50, I = J + 1, N + AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 + K = K + 1 + 50 CONTINUE + ELSE + AP( KK ) = REAL( AP( KK ) ) + END IF + KK = KK + N - J + 1 + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*CONJG( Y( JY ) ) + TEMP2 = CONJG( ALPHA*X( JX ) ) + AP( KK ) = REAL( AP( KK ) ) + + $ REAL( X( JX )*TEMP1 + Y( JY )*TEMP2 ) + IX = JX + IY = JY + DO 70, K = KK + 1, KK + N - J + IX = IX + INCX + IY = IY + INCY + AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 + 70 CONTINUE + ELSE + AP( KK ) = REAL( AP( KK ) ) + END IF + JX = JX + INCX + JY = JY + INCY + KK = KK + N - J + 1 + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of CHPR2 . +* + END diff --git a/reference/chprf.f b/reference/chprf.f new file mode 100644 index 0000000..6d1d380 --- /dev/null +++ b/reference/chprf.f @@ -0,0 +1,217 @@ + SUBROUTINE CHPRF ( UPLO, N, ALPHA, X, INCX, AP ) +* .. Scalar Arguments .. + REAL ALPHA + INTEGER INCX, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX AP( * ), X( * ) +* .. +* +* Purpose +* ======= +* +* CHPR performs the hermitian rank 1 operation +* +* A := alpha*x*conjg( x' ) + A, +* +* where alpha is a real scalar, x is an n element vector and A is an +* n by n hermitian matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* AP - COMPLEX array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the hermitian matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. On exit, the array +* AP is overwritten by the upper triangular part of the +* updated matrix. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the hermitian matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. On exit, the array +* AP is overwritten by the lower triangular part of the +* updated matrix. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero, and on exit they +* are set to zero. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP + INTEGER I, INFO, IX, J, JX, K, KK, KX +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, REAL +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CHPR ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.REAL( ZERO ) ) ) + $ RETURN +* +* Set the start point in X if the increment is not unity. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* + KK = 1 + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when upper triangle is stored in AP. +* + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*CONJG( X( J ) ) + K = KK + DO 10, I = 1, J - 1 + AP( K ) = AP( K ) + X( I )*TEMP + K = K + 1 + 10 CONTINUE + AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) ) + $ + REAL( X( J )*TEMP ) + ELSE + AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) ) + END IF + KK = KK + J + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*CONJG( X( JX ) ) + IX = KX + DO 30, K = KK, KK + J - 2 + AP( K ) = AP( K ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) ) + $ + REAL( X( JX )*TEMP ) + ELSE + AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) ) + END IF + JX = JX + INCX + KK = KK + J + 40 CONTINUE + END IF + ELSE +* +* Form A when lower triangle is stored in AP. +* + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*CONJG( X( J ) ) + AP( KK ) = REAL( AP( KK ) ) + REAL( TEMP*X( J ) ) + K = KK + 1 + DO 50, I = J + 1, N + AP( K ) = AP( K ) + X( I )*TEMP + K = K + 1 + 50 CONTINUE + ELSE + AP( KK ) = REAL( AP( KK ) ) + END IF + KK = KK + N - J + 1 + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*CONJG( X( JX ) ) + AP( KK ) = REAL( AP( KK ) ) + REAL( TEMP*X( JX ) ) + IX = JX + DO 70, K = KK + 1, KK + N - J + IX = IX + INCX + AP( K ) = AP( K ) + X( IX )*TEMP + 70 CONTINUE + ELSE + AP( KK ) = REAL( AP( KK ) ) + END IF + JX = JX + INCX + KK = KK + N - J + 1 + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of CHPR . +* + END diff --git a/reference/claswpf.f b/reference/claswpf.f new file mode 100644 index 0000000..4d47e4f --- /dev/null +++ b/reference/claswpf.f @@ -0,0 +1,120 @@ + SUBROUTINE CLASWPF( N, A, LDA, K1, K2, IPIV, INCX ) +* +* -- LAPACK auxiliary routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* June 30, 1999 +* +* .. Scalar Arguments .. + INTEGER INCX, K1, K2, LDA, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* CLASWP performs a series of row interchanges on the matrix A. +* One row interchange is initiated for each of rows K1 through K2 of A. +* +* Arguments +* ========= +* +* N (input) INTEGER +* The number of columns of the matrix A. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the matrix of column dimension N to which the row +* interchanges will be applied. +* On exit, the permuted matrix. +* +* LDA (input) INTEGER +* The leading dimension of the array A. +* +* K1 (input) INTEGER +* The first element of IPIV for which a row interchange will +* be done. +* +* K2 (input) INTEGER +* The last element of IPIV for which a row interchange will +* be done. +* +* IPIV (input) INTEGER array, dimension (M*abs(INCX)) +* The vector of pivot indices. Only the elements in positions +* K1 through K2 of IPIV are accessed. +* IPIV(K) = L implies rows K and L are to be interchanged. +* +* INCX (input) INTEGER +* The increment between successive values of IPIV. If IPIV +* is negative, the pivots are applied in reverse order. +* +* Further Details +* =============== +* +* Modified by +* R. C. Whaley, Computer Science Dept., Univ. of Tenn., Knoxville, USA +* +* ===================================================================== +* +* .. Local Scalars .. + INTEGER I, I1, I2, INC, IP, IX, IX0, J, K, N32 + COMPLEX TEMP +* .. +* .. Executable Statements .. +* +* Interchange row I with row IPIV(I) for each of rows K1 through K2. +* + IF( INCX.GT.0 ) THEN + IX0 = K1 + I1 = K1 + I2 = K2 + INC = 1 + ELSE IF( INCX.LT.0 ) THEN + IX0 = 1 + ( 1-K2 )*INCX + I1 = K2 + I2 = K1 + INC = -1 + ELSE + RETURN + END IF +* + N32 = ( N / 32 )*32 + IF( N32.NE.0 ) THEN + DO 30 J = 1, N32, 32 + IX = IX0 + DO 20 I = I1, I2, INC + IP = IPIV( IX ) + IF( IP.NE.I ) THEN + DO 10 K = J, J + 31 + TEMP = A( I, K ) + A( I, K ) = A( IP, K ) + A( IP, K ) = TEMP + 10 CONTINUE + END IF + IX = IX + INCX + 20 CONTINUE + 30 CONTINUE + END IF + IF( N32.NE.N ) THEN + N32 = N32 + 1 + IX = IX0 + DO 50 I = I1, I2, INC + IP = IPIV( IX ) + IF( IP.NE.I ) THEN + DO 40 K = N32, N + TEMP = A( I, K ) + A( I, K ) = A( IP, K ) + A( IP, K ) = TEMP + 40 CONTINUE + END IF + IX = IX + INCX + 50 CONTINUE + END IF +* + RETURN +* +* End of CLASWP +* + END diff --git a/reference/clauu2f.f b/reference/clauu2f.f new file mode 100644 index 0000000..4bb8725 --- /dev/null +++ b/reference/clauu2f.f @@ -0,0 +1,143 @@ + SUBROUTINE CLAUU2F( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK auxiliary routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* CLAUU2 computes the product U * U' or L' * L, where the triangular +* factor U or L is stored in the upper or lower triangular part of +* the array A. +* +* If UPLO = 'U' or 'u' then the upper triangle of the result is stored, +* overwriting the factor U in A. +* If UPLO = 'L' or 'l' then the lower triangle of the result is stored, +* overwriting the factor L in A. +* +* This is the unblocked form of the algorithm, calling Level 2 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the triangular factor stored in the array A +* is upper or lower triangular: +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* N (input) INTEGER +* The order of the triangular factor U or L. N >= 0. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the triangular factor U or L. +* On exit, if UPLO = 'U', the upper triangle of A is +* overwritten with the upper triangle of the product U * U'; +* if UPLO = 'L', the lower triangle of A is overwritten with +* the lower triangle of the product L' * L. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I + REAL AII +* .. +* .. External Functions .. + LOGICAL LSAME + COMPLEX CDOTC + EXTERNAL LSAME, CDOTC +* .. +* .. External Subroutines .. + EXTERNAL CGEMV, CLACGV, CSSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CMPLX, MAX, REAL +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CLAUU2', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( UPPER ) THEN +* +* Compute the product U * U'. +* + DO 10 I = 1, N + AII = A( I, I ) + IF( I.LT.N ) THEN + A( I, I ) = AII*AII + REAL( CDOTC( N-I, A( I, I+1 ), LDA, + $ A( I, I+1 ), LDA ) ) + CALL CLACGV( N-I, A( I, I+1 ), LDA ) + CALL CGEMV( 'No transpose', I-1, N-I, ONE, A( 1, I+1 ), + $ LDA, A( I, I+1 ), LDA, CMPLX( AII ), + $ A( 1, I ), 1 ) + CALL CLACGV( N-I, A( I, I+1 ), LDA ) + ELSE + CALL CSSCAL( I, AII, A( 1, I ), 1 ) + END IF + 10 CONTINUE +* + ELSE +* +* Compute the product L' * L. +* + DO 20 I = 1, N + AII = A( I, I ) + IF( I.LT.N ) THEN + A( I, I ) = AII*AII + REAL( CDOTC( N-I, A( I+1, I ), 1, + $ A( I+1, I ), 1 ) ) + CALL CLACGV( I-1, A( I, 1 ), LDA ) + CALL CGEMV( 'Conjugate transpose', N-I, I-1, ONE, + $ A( I+1, 1 ), LDA, A( I+1, I ), 1, + $ CMPLX( AII ), A( I, 1 ), LDA ) + CALL CLACGV( I-1, A( I, 1 ), LDA ) + ELSE + CALL CSSCAL( I, AII, A( I, 1 ), LDA ) + END IF + 20 CONTINUE + END IF +* + RETURN +* +* End of CLAUU2 +* + END diff --git a/reference/clauumf.f b/reference/clauumf.f new file mode 100644 index 0000000..9b57fe1 --- /dev/null +++ b/reference/clauumf.f @@ -0,0 +1,161 @@ + SUBROUTINE CLAUUMF( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK auxiliary routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* September 30, 1994 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* CLAUUM computes the product U * U' or L' * L, where the triangular +* factor U or L is stored in the upper or lower triangular part of +* the array A. +* +* If UPLO = 'U' or 'u' then the upper triangle of the result is stored, +* overwriting the factor U in A. +* If UPLO = 'L' or 'l' then the lower triangle of the result is stored, +* overwriting the factor L in A. +* +* This is the blocked form of the algorithm, calling Level 3 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the triangular factor stored in the array A +* is upper or lower triangular: +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* N (input) INTEGER +* The order of the triangular factor U or L. N >= 0. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the triangular factor U or L. +* On exit, if UPLO = 'U', the upper triangle of A is +* overwritten with the upper triangle of the product U * U'; +* if UPLO = 'L', the lower triangle of A is overwritten with +* the lower triangle of the product L' * L. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) + COMPLEX CONE + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, IB, NB +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + EXTERNAL LSAME, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL CGEMM, CHERK, CLAUU2, CTRMM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CLAUUM', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = 128 +* + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code +* + CALL CLAUU2( UPLO, N, A, LDA, INFO ) + ELSE +* +* Use blocked code +* + IF( UPPER ) THEN +* +* Compute the product U * U'. +* + DO 10 I = 1, N, NB + IB = MIN( NB, N-I+1 ) + CALL CTRMM( 'Right', 'Upper', 'Conjugate transpose', + $ 'Non-unit', I-1, IB, CONE, A( I, I ), LDA, + $ A( 1, I ), LDA ) + CALL CLAUU2( 'Upper', IB, A( I, I ), LDA, INFO ) + IF( I+IB.LE.N ) THEN + CALL CGEMM( 'No transpose', 'Conjugate transpose', + $ I-1, IB, N-I-IB+1, CONE, A( 1, I+IB ), + $ LDA, A( I, I+IB ), LDA, CONE, A( 1, I ), + $ LDA ) + CALL CHERK( 'Upper', 'No transpose', IB, N-I-IB+1, + $ ONE, A( I, I+IB ), LDA, ONE, A( I, I ), + $ LDA ) + END IF + 10 CONTINUE + ELSE +* +* Compute the product L' * L. +* + DO 20 I = 1, N, NB + IB = MIN( NB, N-I+1 ) + CALL CTRMM( 'Left', 'Lower', 'Conjugate transpose', + $ 'Non-unit', IB, I-1, CONE, A( I, I ), LDA, + $ A( I, 1 ), LDA ) + CALL CLAUU2( 'Lower', IB, A( I, I ), LDA, INFO ) + IF( I+IB.LE.N ) THEN + CALL CGEMM( 'Conjugate transpose', 'No transpose', IB, + $ I-1, N-I-IB+1, CONE, A( I+IB, I ), LDA, + $ A( I+IB, 1 ), LDA, CONE, A( I, 1 ), LDA ) + CALL CHERK( 'Lower', 'Conjugate transpose', IB, + $ N-I-IB+1, ONE, A( I+IB, I ), LDA, ONE, + $ A( I, I ), LDA ) + END IF + 20 CONTINUE + END IF + END IF +* + RETURN +* +* End of CLAUUM +* + END diff --git a/reference/cpotf2f.f b/reference/cpotf2f.f new file mode 100644 index 0000000..2b451cc --- /dev/null +++ b/reference/cpotf2f.f @@ -0,0 +1,175 @@ + SUBROUTINE CPOTF2F( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* September 30, 1994 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* CPOTF2 computes the Cholesky factorization of a complex Hermitian +* positive definite matrix A. +* +* The factorization has the form +* A = U' * U , if UPLO = 'U', or +* A = L * L', if UPLO = 'L', +* where U is an upper triangular matrix and L is lower triangular. +* +* This is the unblocked version of the algorithm, calling Level 2 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the upper or lower triangular part of the +* Hermitian matrix A is stored. +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the Hermitian matrix A. If UPLO = 'U', the leading +* n by n upper triangular part of A contains the upper +* triangular part of the matrix A, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading n by n lower triangular part of A contains the lower +* triangular part of the matrix A, and the strictly upper +* triangular part of A is not referenced. +* +* On exit, if INFO = 0, the factor U or L from the Cholesky +* factorization A = U'*U or A = L*L'. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* > 0: if INFO = k, the leading minor of order k is not +* positive definite, and the factorization could not be +* completed. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE, ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) + COMPLEX CONE + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER J + REAL AJJ +* .. +* .. External Functions .. + LOGICAL LSAME + COMPLEX CDOTC + EXTERNAL LSAME, CDOTC +* .. +* .. External Subroutines .. + EXTERNAL CGEMV, CLACGV, CSSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, REAL, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CPOTF2', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( UPPER ) THEN +* +* Compute the Cholesky factorization A = U'*U. +* + DO 10 J = 1, N +* +* Compute U(J,J) and test for non-positive-definiteness. +* + AJJ = REAL( A( J, J ) ) - CDOTC( J-1, A( 1, J ), 1, + $ A( 1, J ), 1 ) + IF( AJJ.LE.ZERO ) THEN + A( J, J ) = AJJ + GO TO 30 + END IF + AJJ = SQRT( AJJ ) + A( J, J ) = AJJ +* +* Compute elements J+1:N of row J. +* + IF( J.LT.N ) THEN + CALL CLACGV( J-1, A( 1, J ), 1 ) + CALL CGEMV( 'Transpose', J-1, N-J, -CONE, A( 1, J+1 ), + $ LDA, A( 1, J ), 1, CONE, A( J, J+1 ), LDA ) + CALL CLACGV( J-1, A( 1, J ), 1 ) + CALL CSSCAL( N-J, ONE / AJJ, A( J, J+1 ), LDA ) + END IF + 10 CONTINUE + ELSE +* +* Compute the Cholesky factorization A = L*L'. +* + DO 20 J = 1, N +* +* Compute L(J,J) and test for non-positive-definiteness. +* + AJJ = REAL( A( J, J ) ) - CDOTC( J-1, A( J, 1 ), LDA, + $ A( J, 1 ), LDA ) + IF( AJJ.LE.ZERO ) THEN + A( J, J ) = AJJ + GO TO 30 + END IF + AJJ = SQRT( AJJ ) + A( J, J ) = AJJ +* +* Compute elements J+1:N of column J. +* + IF( J.LT.N ) THEN + CALL CLACGV( J-1, A( J, 1 ), LDA ) + CALL CGEMV( 'No transpose', N-J, J-1, -CONE, A( J+1, 1 ), + $ LDA, A( J, 1 ), LDA, CONE, A( J+1, J ), 1 ) + CALL CLACGV( J-1, A( J, 1 ), LDA ) + CALL CSSCAL( N-J, ONE / AJJ, A( J+1, J ), 1 ) + END IF + 20 CONTINUE + END IF + GO TO 40 +* + 30 CONTINUE + INFO = J +* + 40 CONTINUE + RETURN +* +* End of CPOTF2 +* + END diff --git a/reference/cpotrff.f b/reference/cpotrff.f new file mode 100644 index 0000000..696de86 --- /dev/null +++ b/reference/cpotrff.f @@ -0,0 +1,187 @@ + SUBROUTINE CPOTRFF( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* September 30, 1994 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* CPOTRF computes the Cholesky factorization of a complex Hermitian +* positive definite matrix A. +* +* The factorization has the form +* A = U**H * U, if UPLO = 'U', or +* A = L * L**H, if UPLO = 'L', +* where U is an upper triangular matrix and L is lower triangular. +* +* This is the block version of the algorithm, calling Level 3 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': Upper triangle of A is stored; +* = 'L': Lower triangle of A is stored. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the Hermitian matrix A. If UPLO = 'U', the leading +* N-by-N upper triangular part of A contains the upper +* triangular part of the matrix A, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading N-by-N lower triangular part of A contains the lower +* triangular part of the matrix A, and the strictly upper +* triangular part of A is not referenced. +* +* On exit, if INFO = 0, the factor U or L from the Cholesky +* factorization A = U**H*U or A = L*L**H. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, the leading minor of order i is not +* positive definite, and the factorization could not be +* completed. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + COMPLEX CONE + PARAMETER ( ONE = 1.0E+0, CONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER J, JB, NB +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL CGEMM, CHERK, CPOTF2, CTRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CPOTRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = 56 + + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code. +* + CALL CPOTF2( UPLO, N, A, LDA, INFO ) + ELSE +* +* Use blocked code. +* + IF( UPPER ) THEN +* +* Compute the Cholesky factorization A = U'*U. +* + DO 10 J = 1, N, NB +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + JB = MIN( NB, N-J+1 ) + CALL CHERK( 'Upper', 'Conjugate transpose', JB, J-1, + $ -ONE, A( 1, J ), LDA, ONE, A( J, J ), LDA ) + CALL CPOTF2( 'Upper', JB, A( J, J ), LDA, INFO ) + IF( INFO.NE.0 ) + $ GO TO 30 + IF( J+JB.LE.N ) THEN +* +* Compute the current block row. +* + CALL CGEMM( 'Conjugate transpose', 'No transpose', JB, + $ N-J-JB+1, J-1, -CONE, A( 1, J ), LDA, + $ A( 1, J+JB ), LDA, CONE, A( J, J+JB ), + $ LDA ) + CALL CTRSM( 'Left', 'Upper', 'Conjugate transpose', + $ 'Non-unit', JB, N-J-JB+1, CONE, A( J, J ), + $ LDA, A( J, J+JB ), LDA ) + END IF + 10 CONTINUE +* + ELSE +* +* Compute the Cholesky factorization A = L*L'. +* + DO 20 J = 1, N, NB +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + JB = MIN( NB, N-J+1 ) + CALL CHERK( 'Lower', 'No transpose', JB, J-1, -ONE, + $ A( J, 1 ), LDA, ONE, A( J, J ), LDA ) + CALL CPOTF2( 'Lower', JB, A( J, J ), LDA, INFO ) + IF( INFO.NE.0 ) + $ GO TO 30 + IF( J+JB.LE.N ) THEN +* +* Compute the current block column. +* + CALL CGEMM( 'No transpose', 'Conjugate transpose', + $ N-J-JB+1, JB, J-1, -CONE, A( J+JB, 1 ), + $ LDA, A( J, 1 ), LDA, CONE, A( J+JB, J ), + $ LDA ) + CALL CTRSM( 'Right', 'Lower', 'Conjugate transpose', + $ 'Non-unit', N-J-JB+1, JB, CONE, A( J, J ), + $ LDA, A( J+JB, J ), LDA ) + END IF + 20 CONTINUE + END IF + END IF + GO TO 40 +* + 30 CONTINUE + INFO = INFO + J - 1 +* + 40 CONTINUE + RETURN +* +* End of CPOTRF +* + END diff --git a/reference/cpotrif.f b/reference/cpotrif.f new file mode 100644 index 0000000..e14b287 --- /dev/null +++ b/reference/cpotrif.f @@ -0,0 +1,96 @@ + SUBROUTINE CPOTRIF( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* CPOTRI computes the inverse of a complex Hermitian positive definite +* matrix A using the Cholesky factorization A = U**H*U or A = L*L**H +* computed by CPOTRF. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': Upper triangle of A is stored; +* = 'L': Lower triangle of A is stored. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the triangular factor U or L from the Cholesky +* factorization A = U**H*U or A = L*L**H, as computed by +* CPOTRF. +* On exit, the upper or lower triangle of the (Hermitian) +* inverse of A, overwriting the input factor U or L. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, the (i,i) element of the factor U or L is +* zero, and the inverse could not be computed. +* +* ===================================================================== +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL CLAUUM, CTRTRI, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CPOTRI', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Invert the triangular Cholesky factor U or L. +* + CALL CTRTRI( UPLO, 'Non-unit', N, A, LDA, INFO ) + IF( INFO.GT.0 ) + $ RETURN +* +* Form inv(U)*inv(U)' or inv(L)'*inv(L). +* + CALL CLAUUM( UPLO, N, A, LDA, INFO ) +* + RETURN +* +* End of CPOTRI +* + END diff --git a/reference/crotgf.f b/reference/crotgf.f new file mode 100644 index 0000000..6195133 --- /dev/null +++ b/reference/crotgf.f @@ -0,0 +1,20 @@ + subroutine crotgf(ca,cb,c,s) + complex ca,cb,s + real c + real norm,scale + complex alpha + if (cabs(ca) .ne. 0.) go to 10 + c = 0. + s = (1.,0.) + ca = cb + go to 20 + 10 continue + scale = cabs(ca) + cabs(cb) + norm = scale * sqrt((cabs(ca/scale))**2 + (cabs(cb/scale))**2) + alpha = ca /cabs(ca) + c = cabs(ca) / norm + s = alpha * conjg(cb) / norm + ca = alpha * norm + 20 continue + return + end diff --git a/reference/csbmvf.f b/reference/csbmvf.f new file mode 100644 index 0000000..e635af8 --- /dev/null +++ b/reference/csbmvf.f @@ -0,0 +1,306 @@ + SUBROUTINE CSBMVF(UPLO, N, K, ALPHA, A, LDA, X, INCX, BETA, Y, + $ INCY ) +* +* -- LAPACK auxiliary routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INCX, INCY, K, LDA, N + COMPLEX ALPHA, BETA +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* CSBMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n symmetric band matrix, with k super-diagonals. +* +* Arguments +* ========== +* +* UPLO - CHARACTER*1 +* On entry, UPLO specifies whether the upper or lower +* triangular part of the band matrix A is being supplied as +* follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* being supplied. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* being supplied. +* +* Unchanged on exit. +* +* N - INTEGER +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* K - INTEGER +* On entry, K specifies the number of super-diagonals of the +* matrix A. K must satisfy 0 .le. K. +* Unchanged on exit. +* +* ALPHA - COMPLEX +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array, dimension( LDA, N ) +* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) +* by n part of the array A must contain the upper triangular +* band part of the symmetric matrix, supplied column by +* column, with the leading diagonal of the matrix in row +* ( k + 1 ) of the array, the first super-diagonal starting at +* position 2 in row k, and so on. The top left k by k triangle +* of the array A is not referenced. +* The following program segment will transfer the upper +* triangular part of a symmetric band matrix from conventional +* full matrix storage to band storage: +* +* DO 20, J = 1, N +* M = K + 1 - J +* DO 10, I = MAX( 1, J - K ), J +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) +* by n part of the array A must contain the lower triangular +* band part of the symmetric matrix, supplied column by +* column, with the leading diagonal of the matrix in row 1 of +* the array, the first sub-diagonal starting at position 1 in +* row 2, and so on. The bottom right k by k triangle of the +* array A is not referenced. +* The following program segment will transfer the lower +* triangular part of a symmetric band matrix from conventional +* full matrix storage to band storage: +* +* DO 20, J = 1, N +* M = 1 - J +* DO 10, I = J, MIN( N, J + K ) +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Unchanged on exit. +* +* LDA - INTEGER +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( k + 1 ). +* Unchanged on exit. +* +* X - COMPLEX array, dimension at least +* ( 1 + ( N - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - COMPLEX +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* Y - COMPLEX array, dimension at least +* ( 1 + ( N - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the +* vector y. On exit, Y is overwritten by the updated vector y. +* +* INCY - INTEGER +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, INFO, IX, IY, J, JX, JY, KPLUS1, KX, KY, L + COMPLEX TEMP1, TEMP2 +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = 1 + ELSE IF( N.LT.0 ) THEN + INFO = 2 + ELSE IF( K.LT.0 ) THEN + INFO = 3 + ELSE IF( LDA.LT.( K+1 ) ) THEN + INFO = 6 + ELSE IF( INCX.EQ.0 ) THEN + INFO = 8 + ELSE IF( INCY.EQ.0 ) THEN + INFO = 11 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CSBMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ) .OR. ( ( ALPHA.EQ.ZERO ) .AND. ( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 ) THEN + KX = 1 + ELSE + KX = 1 - ( N-1 )*INCX + END IF + IF( INCY.GT.0 ) THEN + KY = 1 + ELSE + KY = 1 - ( N-1 )*INCY + END IF +* +* Start the operations. In this version the elements of the array A +* are accessed sequentially with one pass through A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE ) THEN + IF( INCY.EQ.1 ) THEN + IF( BETA.EQ.ZERO ) THEN + DO 10 I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20 I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO ) THEN + DO 30 I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40 I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + IF( LSAME( UPLO, 'U' ) ) THEN +* +* Form y when upper triangle of A is stored. +* + KPLUS1 = K + 1 + IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN + DO 60 J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + L = KPLUS1 - J + DO 50 I = MAX( 1, J-K ), J - 1 + Y( I ) = Y( I ) + TEMP1*A( L+I, J ) + TEMP2 = TEMP2 + A( L+I, J )*X( I ) + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*A( KPLUS1, J ) + ALPHA*TEMP2 + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80 J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + L = KPLUS1 - J + DO 70 I = MAX( 1, J-K ), J - 1 + Y( IY ) = Y( IY ) + TEMP1*A( L+I, J ) + TEMP2 = TEMP2 + A( L+I, J )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*A( KPLUS1, J ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + IF( J.GT.K ) THEN + KX = KX + INCX + KY = KY + INCY + END IF + 80 CONTINUE + END IF + ELSE +* +* Form y when lower triangle of A is stored. +* + IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN + DO 100 J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*A( 1, J ) + L = 1 - J + DO 90 I = J + 1, MIN( N, J+K ) + Y( I ) = Y( I ) + TEMP1*A( L+I, J ) + TEMP2 = TEMP2 + A( L+I, J )*X( I ) + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120 J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*A( 1, J ) + L = 1 - J + IX = JX + IY = JY + DO 110 I = J + 1, MIN( N, J+K ) + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*A( L+I, J ) + TEMP2 = TEMP2 + A( L+I, J )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of CSBMV +* + END diff --git a/reference/cscalf.f b/reference/cscalf.f new file mode 100644 index 0000000..714dc42 --- /dev/null +++ b/reference/cscalf.f @@ -0,0 +1,28 @@ + subroutine cscalf(n,ca,cx,incx) +c +c scales a vector by a constant. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + complex ca,cx(*) + integer i,incx,n,nincx +c + if( n.le.0 .or. incx.le.0 )return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + nincx = n*incx + do 10 i = 1,nincx,incx + cx(i) = ca*cx(i) + 10 continue + return +c +c code for increment equal to 1 +c + 20 do 30 i = 1,n + cx(i) = ca*cx(i) + 30 continue + return + end diff --git a/reference/cspmvf.f b/reference/cspmvf.f new file mode 100644 index 0000000..7f357c6 --- /dev/null +++ b/reference/cspmvf.f @@ -0,0 +1,264 @@ + SUBROUTINE CSPMVF(UPLO, N, ALPHA, AP, X, INCX, BETA, Y, INCY ) +* +* -- LAPACK auxiliary routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INCX, INCY, N + COMPLEX ALPHA, BETA +* .. +* .. Array Arguments .. + COMPLEX AP( * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* CSPMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n symmetric matrix, supplied in packed form. +* +* Arguments +* ========== +* +* UPLO (input) CHARACTER*1 +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N (input) INTEGER +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA (input) COMPLEX +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* AP (input) COMPLEX array, dimension at least +* ( ( N*( N + 1 ) )/2 ). +* Before entry, with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. +* Before entry, with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. +* Unchanged on exit. +* +* X (input) COMPLEX array, dimension at least +* ( 1 + ( N - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the N- +* element vector x. +* Unchanged on exit. +* +* INCX (input) INTEGER +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA (input) COMPLEX +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y (input/output) COMPLEX array, dimension at least +* ( 1 + ( N - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. On exit, Y is overwritten by the updated +* vector y. +* +* INCY (input) INTEGER +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY + COMPLEX TEMP1, TEMP2 +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = 1 + ELSE IF( N.LT.0 ) THEN + INFO = 2 + ELSE IF( INCX.EQ.0 ) THEN + INFO = 6 + ELSE IF( INCY.EQ.0 ) THEN + INFO = 9 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CSPMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ) .OR. ( ( ALPHA.EQ.ZERO ) .AND. ( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 ) THEN + KX = 1 + ELSE + KX = 1 - ( N-1 )*INCX + END IF + IF( INCY.GT.0 ) THEN + KY = 1 + ELSE + KY = 1 - ( N-1 )*INCY + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE ) THEN + IF( INCY.EQ.1 ) THEN + IF( BETA.EQ.ZERO ) THEN + DO 10 I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20 I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO ) THEN + DO 30 I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40 I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + KK = 1 + IF( LSAME( UPLO, 'U' ) ) THEN +* +* Form y when AP contains the upper triangle. +* + IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN + DO 60 J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + K = KK + DO 50 I = 1, J - 1 + Y( I ) = Y( I ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( I ) + K = K + 1 + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*AP( KK+J-1 ) + ALPHA*TEMP2 + KK = KK + J + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80 J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + DO 70 K = KK, KK + J - 2 + Y( IY ) = Y( IY ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*AP( KK+J-1 ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + KK = KK + J + 80 CONTINUE + END IF + ELSE +* +* Form y when AP contains the lower triangle. +* + IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN + DO 100 J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*AP( KK ) + K = KK + 1 + DO 90 I = J + 1, N + Y( I ) = Y( I ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( I ) + K = K + 1 + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + KK = KK + ( N-J+1 ) + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120 J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*AP( KK ) + IX = JX + IY = JY + DO 110 K = KK + 1, KK + N - J + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + KK = KK + ( N-J+1 ) + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of CSPMV +* + END diff --git a/reference/cspr2f.f b/reference/cspr2f.f new file mode 100644 index 0000000..8ba35f5 --- /dev/null +++ b/reference/cspr2f.f @@ -0,0 +1,229 @@ + SUBROUTINE CSPR2F( UPLO, N, ALPHA, X, INCX, Y, INCY, AP ) +* .. Scalar Arguments .. + COMPLEX*8 ALPHA + INTEGER INCX, INCY, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX*8 AP( * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* SSPR2 performs the symmetric rank 2 operation +* +* A := alpha*x*y' + alpha*y*x' + A, +* +* where alpha is a scalar, x and y are n element vectors and A is an +* n by n symmetric matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*8 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - COMPLEX*8 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - COMPLEX*8 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* AP - COMPLEX*8 array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. On exit, the array +* AP is overwritten by the upper triangular part of the +* updated matrix. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. On exit, the array +* AP is overwritten by the lower triangular part of the +* updated matrix. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*8 ZERO + PARAMETER ( ZERO = 0.0E+0 ) +* .. Local Scalars .. + COMPLEX*8 TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'SSPR2 ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set up the start points in X and Y if the increments are not both +* unity. +* + IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF + JX = KX + JY = KY + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* + KK = 1 + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when upper triangle is stored in AP. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 20, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + K = KK + DO 10, I = 1, J + AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 + K = K + 1 + 10 CONTINUE + END IF + KK = KK + J + 20 CONTINUE + ELSE + DO 40, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = KX + IY = KY + DO 30, K = KK, KK + J - 1 + AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 30 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + KK = KK + J + 40 CONTINUE + END IF + ELSE +* +* Form A when lower triangle is stored in AP. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + K = KK + DO 50, I = J, N + AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 + K = K + 1 + 50 CONTINUE + END IF + KK = KK + N - J + 1 + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = JX + IY = JY + DO 70, K = KK, KK + N - J + AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + KK = KK + N - J + 1 + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of SSPR2 . +* + END diff --git a/reference/csprf.f b/reference/csprf.f new file mode 100644 index 0000000..9010f0c --- /dev/null +++ b/reference/csprf.f @@ -0,0 +1,213 @@ + SUBROUTINE CSPRF( UPLO, N, ALPHA, X, INCX, AP ) +* +* -- LAPACK auxiliary routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INCX, N + COMPLEX ALPHA +* .. +* .. Array Arguments .. + COMPLEX AP( * ), X( * ) +* .. +* +* Purpose +* ======= +* +* CSPR performs the symmetric rank 1 operation +* +* A := alpha*x*conjg( x' ) + A, +* +* where alpha is a complex scalar, x is an n element vector and A is an +* n by n symmetric matrix, supplied in packed form. +* +* Arguments +* ========== +* +* UPLO (input) CHARACTER*1 +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N (input) INTEGER +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA (input) COMPLEX +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X (input) COMPLEX array, dimension at least +* ( 1 + ( N - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the N- +* element vector x. +* Unchanged on exit. +* +* INCX (input) INTEGER +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* AP (input/output) COMPLEX array, dimension at least +* ( ( N*( N + 1 ) )/2 ). +* Before entry, with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. On exit, the array +* AP is overwritten by the upper triangular part of the +* updated matrix. +* Before entry, with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. On exit, the array +* AP is overwritten by the lower triangular part of the +* updated matrix. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero, and on exit they +* are set to zero. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, INFO, IX, J, JX, K, KK, KX + COMPLEX TEMP +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = 1 + ELSE IF( N.LT.0 ) THEN + INFO = 2 + ELSE IF( INCX.EQ.0 ) THEN + INFO = 5 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CSPR ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ) .OR. ( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set the start point in X if the increment is not unity. +* + IF( INCX.LE.0 ) THEN + KX = 1 - ( N-1 )*INCX + ELSE IF( INCX.NE.1 ) THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* + KK = 1 + IF( LSAME( UPLO, 'U' ) ) THEN +* +* Form A when upper triangle is stored in AP. +* + IF( INCX.EQ.1 ) THEN + DO 20 J = 1, N + IF( X( J ).NE.ZERO ) THEN + TEMP = ALPHA*X( J ) + K = KK + DO 10 I = 1, J - 1 + AP( K ) = AP( K ) + X( I )*TEMP + K = K + 1 + 10 CONTINUE + AP( KK+J-1 ) = AP( KK+J-1 ) + X( J )*TEMP + ELSE + AP( KK+J-1 ) = AP( KK+J-1 ) + END IF + KK = KK + J + 20 CONTINUE + ELSE + JX = KX + DO 40 J = 1, N + IF( X( JX ).NE.ZERO ) THEN + TEMP = ALPHA*X( JX ) + IX = KX + DO 30 K = KK, KK + J - 2 + AP( K ) = AP( K ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + AP( KK+J-1 ) = AP( KK+J-1 ) + X( JX )*TEMP + ELSE + AP( KK+J-1 ) = AP( KK+J-1 ) + END IF + JX = JX + INCX + KK = KK + J + 40 CONTINUE + END IF + ELSE +* +* Form A when lower triangle is stored in AP. +* + IF( INCX.EQ.1 ) THEN + DO 60 J = 1, N + IF( X( J ).NE.ZERO ) THEN + TEMP = ALPHA*X( J ) + AP( KK ) = AP( KK ) + TEMP*X( J ) + K = KK + 1 + DO 50 I = J + 1, N + AP( K ) = AP( K ) + X( I )*TEMP + K = K + 1 + 50 CONTINUE + ELSE + AP( KK ) = AP( KK ) + END IF + KK = KK + N - J + 1 + 60 CONTINUE + ELSE + JX = KX + DO 80 J = 1, N + IF( X( JX ).NE.ZERO ) THEN + TEMP = ALPHA*X( JX ) + AP( KK ) = AP( KK ) + TEMP*X( JX ) + IX = JX + DO 70 K = KK + 1, KK + N - J + IX = IX + INCX + AP( K ) = AP( K ) + X( IX )*TEMP + 70 CONTINUE + ELSE + AP( KK ) = AP( KK ) + END IF + JX = JX + INCX + KK = KK + N - J + 1 + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of CSPR +* + END diff --git a/reference/csrotf.f b/reference/csrotf.f new file mode 100644 index 0000000..1ecdb0a --- /dev/null +++ b/reference/csrotf.f @@ -0,0 +1,38 @@ + subroutine csrotf (n,cx,incx,cy,incy,c,s) +c +c applies a plane rotation, where the cos and sin (c and s) are real +c and the vectors cx and cy are complex. +c jack dongarra, linpack, 3/11/78. +c + complex cx(1),cy(1),ctemp + real c,s + integer i,incx,incy,ix,iy,n +c + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments not equal +c to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + ctemp = c*cx(ix) + s*cy(iy) + cy(iy) = c*cy(iy) - s*cx(ix) + cx(ix) = ctemp + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c + 20 do 30 i = 1,n + ctemp = c*cx(i) + s*cy(i) + cy(i) = c*cy(i) - s*cx(i) + cx(i) = ctemp + 30 continue + return + end diff --git a/reference/csscalf.f b/reference/csscalf.f new file mode 100644 index 0000000..099d519 --- /dev/null +++ b/reference/csscalf.f @@ -0,0 +1,29 @@ + subroutine csscalf(n,sa,cx,incx) +c +c scales a complex vector by a real constant. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + complex cx(*) + real sa + integer i,incx,n,nincx +c + if( n.le.0 .or. incx.le.0 )return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + nincx = n*incx + do 10 i = 1,nincx,incx + cx(i) = cmplx(sa*real(cx(i)),sa*aimag(cx(i))) + 10 continue + return +c +c code for increment equal to 1 +c + 20 do 30 i = 1,n + cx(i) = cmplx(sa*real(cx(i)),sa*aimag(cx(i))) + 30 continue + return + end diff --git a/reference/cswapf.f b/reference/cswapf.f new file mode 100644 index 0000000..39683b6 --- /dev/null +++ b/reference/cswapf.f @@ -0,0 +1,36 @@ + subroutine cswapf (n,cx,incx,cy,incy) +c +c interchanges two vectors. +c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + complex cx(*),cy(*),ctemp + integer i,incx,incy,ix,iy,n +c + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments not equal +c to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + ctemp = cx(ix) + cx(ix) = cy(iy) + cy(iy) = ctemp + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 + 20 do 30 i = 1,n + ctemp = cx(i) + cx(i) = cy(i) + cy(i) = ctemp + 30 continue + return + end diff --git a/reference/csymm3mf.f b/reference/csymm3mf.f new file mode 100644 index 0000000..2640a18 --- /dev/null +++ b/reference/csymm3mf.f @@ -0,0 +1,296 @@ + SUBROUTINE CSYMM3MF( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO + INTEGER M, N, LDA, LDB, LDC + COMPLEX ALPHA, BETA +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* CSYMM performs one of the matrix-matrix operations +* +* C := alpha*A*B + beta*C, +* +* or +* +* C := alpha*B*A + beta*C, +* +* where alpha and beta are scalars, A is a symmetric matrix and B and +* C are m by n matrices. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether the symmetric matrix A +* appears on the left or right in the operation as follows: +* +* SIDE = 'L' or 'l' C := alpha*A*B + beta*C, +* +* SIDE = 'R' or 'r' C := alpha*B*A + beta*C, +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the symmetric matrix A is to be +* referenced as follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of the +* symmetric matrix is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of the +* symmetric matrix is to be referenced. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix C. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix C. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is +* m when SIDE = 'L' or 'l' and is n otherwise. +* Before entry with SIDE = 'L' or 'l', the m by m part of +* the array A must contain the symmetric matrix, such that +* when UPLO = 'U' or 'u', the leading m by m upper triangular +* part of the array A must contain the upper triangular part +* of the symmetric matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading m by m lower triangular part of the array A +* must contain the lower triangular part of the symmetric +* matrix and the strictly upper triangular part of A is not +* referenced. +* Before entry with SIDE = 'R' or 'r', the n by n part of +* the array A must contain the symmetric matrix, such that +* when UPLO = 'U' or 'u', the leading n by n upper triangular +* part of the array A must contain the upper triangular part +* of the symmetric matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading n by n lower triangular part of the array A +* must contain the lower triangular part of the symmetric +* matrix and the strictly upper triangular part of A is not +* referenced. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, n ). +* Unchanged on exit. +* +* B - COMPLEX array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* BETA - COMPLEX . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - COMPLEX array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n updated +* matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, K, NROWA + COMPLEX TEMP1, TEMP2 +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Executable Statements .. +* +* Set NROWA as the number of rows of A. +* + IF( LSAME( SIDE, 'L' ) )THEN + NROWA = M + ELSE + NROWA = N + END IF + UPPER = LSAME( UPLO, 'U' ) +* +* Test the input parameters. +* + INFO = 0 + IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. + $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN + INFO = 2 + ELSE IF( M .LT.0 )THEN + INFO = 3 + ELSE IF( N .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, M ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CSYMM ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, M + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( SIDE, 'L' ) )THEN +* +* Form C := alpha*A*B + beta*C. +* + IF( UPPER )THEN + DO 70, J = 1, N + DO 60, I = 1, M + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 50, K = 1, I - 1 + C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) + TEMP2 = TEMP2 + B( K, J )*A( K, I ) + 50 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*A( I, I ) + ALPHA*TEMP2 + END IF + 60 CONTINUE + 70 CONTINUE + ELSE + DO 100, J = 1, N + DO 90, I = M, 1, -1 + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 80, K = I + 1, M + C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) + TEMP2 = TEMP2 + B( K, J )*A( K, I ) + 80 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*A( I, I ) + ALPHA*TEMP2 + END IF + 90 CONTINUE + 100 CONTINUE + END IF + ELSE +* +* Form C := alpha*B*A + beta*C. +* + DO 170, J = 1, N + TEMP1 = ALPHA*A( J, J ) + IF( BETA.EQ.ZERO )THEN + DO 110, I = 1, M + C( I, J ) = TEMP1*B( I, J ) + 110 CONTINUE + ELSE + DO 120, I = 1, M + C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) + 120 CONTINUE + END IF + DO 140, K = 1, J - 1 + IF( UPPER )THEN + TEMP1 = ALPHA*A( K, J ) + ELSE + TEMP1 = ALPHA*A( J, K ) + END IF + DO 130, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 130 CONTINUE + 140 CONTINUE + DO 160, K = J + 1, N + IF( UPPER )THEN + TEMP1 = ALPHA*A( J, K ) + ELSE + TEMP1 = ALPHA*A( K, J ) + END IF + DO 150, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 150 CONTINUE + 160 CONTINUE + 170 CONTINUE + END IF +* + RETURN +* +* End of CSYMM . +* + END diff --git a/reference/csymmf.f b/reference/csymmf.f new file mode 100644 index 0000000..d5480e4 --- /dev/null +++ b/reference/csymmf.f @@ -0,0 +1,296 @@ + SUBROUTINE CSYMMF ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO + INTEGER M, N, LDA, LDB, LDC + COMPLEX ALPHA, BETA +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* CSYMM performs one of the matrix-matrix operations +* +* C := alpha*A*B + beta*C, +* +* or +* +* C := alpha*B*A + beta*C, +* +* where alpha and beta are scalars, A is a symmetric matrix and B and +* C are m by n matrices. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether the symmetric matrix A +* appears on the left or right in the operation as follows: +* +* SIDE = 'L' or 'l' C := alpha*A*B + beta*C, +* +* SIDE = 'R' or 'r' C := alpha*B*A + beta*C, +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the symmetric matrix A is to be +* referenced as follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of the +* symmetric matrix is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of the +* symmetric matrix is to be referenced. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix C. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix C. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is +* m when SIDE = 'L' or 'l' and is n otherwise. +* Before entry with SIDE = 'L' or 'l', the m by m part of +* the array A must contain the symmetric matrix, such that +* when UPLO = 'U' or 'u', the leading m by m upper triangular +* part of the array A must contain the upper triangular part +* of the symmetric matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading m by m lower triangular part of the array A +* must contain the lower triangular part of the symmetric +* matrix and the strictly upper triangular part of A is not +* referenced. +* Before entry with SIDE = 'R' or 'r', the n by n part of +* the array A must contain the symmetric matrix, such that +* when UPLO = 'U' or 'u', the leading n by n upper triangular +* part of the array A must contain the upper triangular part +* of the symmetric matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading n by n lower triangular part of the array A +* must contain the lower triangular part of the symmetric +* matrix and the strictly upper triangular part of A is not +* referenced. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, n ). +* Unchanged on exit. +* +* B - COMPLEX array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* BETA - COMPLEX . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - COMPLEX array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n updated +* matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, K, NROWA + COMPLEX TEMP1, TEMP2 +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Executable Statements .. +* +* Set NROWA as the number of rows of A. +* + IF( LSAME( SIDE, 'L' ) )THEN + NROWA = M + ELSE + NROWA = N + END IF + UPPER = LSAME( UPLO, 'U' ) +* +* Test the input parameters. +* + INFO = 0 + IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. + $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN + INFO = 2 + ELSE IF( M .LT.0 )THEN + INFO = 3 + ELSE IF( N .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, M ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CSYMM ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, M + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( SIDE, 'L' ) )THEN +* +* Form C := alpha*A*B + beta*C. +* + IF( UPPER )THEN + DO 70, J = 1, N + DO 60, I = 1, M + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 50, K = 1, I - 1 + C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) + TEMP2 = TEMP2 + B( K, J )*A( K, I ) + 50 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*A( I, I ) + ALPHA*TEMP2 + END IF + 60 CONTINUE + 70 CONTINUE + ELSE + DO 100, J = 1, N + DO 90, I = M, 1, -1 + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 80, K = I + 1, M + C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) + TEMP2 = TEMP2 + B( K, J )*A( K, I ) + 80 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*A( I, I ) + ALPHA*TEMP2 + END IF + 90 CONTINUE + 100 CONTINUE + END IF + ELSE +* +* Form C := alpha*B*A + beta*C. +* + DO 170, J = 1, N + TEMP1 = ALPHA*A( J, J ) + IF( BETA.EQ.ZERO )THEN + DO 110, I = 1, M + C( I, J ) = TEMP1*B( I, J ) + 110 CONTINUE + ELSE + DO 120, I = 1, M + C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) + 120 CONTINUE + END IF + DO 140, K = 1, J - 1 + IF( UPPER )THEN + TEMP1 = ALPHA*A( K, J ) + ELSE + TEMP1 = ALPHA*A( J, K ) + END IF + DO 130, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 130 CONTINUE + 140 CONTINUE + DO 160, K = J + 1, N + IF( UPPER )THEN + TEMP1 = ALPHA*A( J, K ) + ELSE + TEMP1 = ALPHA*A( K, J ) + END IF + DO 150, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 150 CONTINUE + 160 CONTINUE + 170 CONTINUE + END IF +* + RETURN +* +* End of CSYMM . +* + END diff --git a/reference/csymvf.f b/reference/csymvf.f new file mode 100644 index 0000000..09d247a --- /dev/null +++ b/reference/csymvf.f @@ -0,0 +1,264 @@ + SUBROUTINE CSYMVF(UPLO, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY ) +* +* -- LAPACK auxiliary routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INCX, INCY, LDA, N + COMPLEX ALPHA, BETA +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* CSYMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n symmetric matrix. +* +* Arguments +* ========== +* +* UPLO (input) CHARACTER*1 +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N (input) INTEGER +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA (input) COMPLEX +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A (input) COMPLEX array, dimension ( LDA, N ) +* Before entry, with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of A is not referenced. +* Before entry, with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of A is not referenced. +* Unchanged on exit. +* +* LDA (input) INTEGER +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, N ). +* Unchanged on exit. +* +* X (input) COMPLEX array, dimension at least +* ( 1 + ( N - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the N- +* element vector x. +* Unchanged on exit. +* +* INCX (input) INTEGER +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA (input) COMPLEX +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y (input/output) COMPLEX array, dimension at least +* ( 1 + ( N - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. On exit, Y is overwritten by the updated +* vector y. +* +* INCY (input) INTEGER +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY + COMPLEX TEMP1, TEMP2 +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = 1 + ELSE IF( N.LT.0 ) THEN + INFO = 2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = 5 + ELSE IF( INCX.EQ.0 ) THEN + INFO = 7 + ELSE IF( INCY.EQ.0 ) THEN + INFO = 10 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CSYMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ) .OR. ( ( ALPHA.EQ.ZERO ) .AND. ( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 ) THEN + KX = 1 + ELSE + KX = 1 - ( N-1 )*INCX + END IF + IF( INCY.GT.0 ) THEN + KY = 1 + ELSE + KY = 1 - ( N-1 )*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE ) THEN + IF( INCY.EQ.1 ) THEN + IF( BETA.EQ.ZERO ) THEN + DO 10 I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20 I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO ) THEN + DO 30 I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40 I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + IF( LSAME( UPLO, 'U' ) ) THEN +* +* Form y when A is stored in upper triangle. +* + IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN + DO 60 J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + DO 50 I = 1, J - 1 + Y( I ) = Y( I ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( I ) + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*A( J, J ) + ALPHA*TEMP2 + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80 J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + DO 70 I = 1, J - 1 + Y( IY ) = Y( IY ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*A( J, J ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 80 CONTINUE + END IF + ELSE +* +* Form y when A is stored in lower triangle. +* + IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN + DO 100 J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*A( J, J ) + DO 90 I = J + 1, N + Y( I ) = Y( I ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( I ) + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120 J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*A( J, J ) + IX = JX + IY = JY + DO 110 I = J + 1, N + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of CSYMV +* + END diff --git a/reference/csyr2f.f b/reference/csyr2f.f new file mode 100644 index 0000000..1fde4c0 --- /dev/null +++ b/reference/csyr2f.f @@ -0,0 +1,230 @@ + SUBROUTINE CSYR2F ( UPLO, N, ALPHA, X, INCX, Y, INCY, A, LDA ) +* .. Scalar Arguments .. + COMPLEX*8 ALPHA + INTEGER INCX, INCY, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX*8 A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* SSYR2 performs the symmetric rank 2 operation +* +* A := alpha*x*y' + alpha*y*x' + A, +* +* where alpha is a scalar, x and y are n element vectors and A is an n +* by n symmetric matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of A is not referenced. On exit, the +* upper triangular part of the array A is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of A is not referenced. On exit, the +* lower triangular part of the array A is overwritten by the +* lower triangular part of the updated matrix. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*8 ZERO + PARAMETER ( ZERO = 0.0E+0 ) +* .. Local Scalars .. + COMPLEX*8 TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'SSYR2 ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set up the start points in X and Y if the increments are not both +* unity. +* + IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF + JX = KX + JY = KY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when A is stored in the upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 20, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + DO 10, I = 1, J + A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 + 10 CONTINUE + END IF + 20 CONTINUE + ELSE + DO 40, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = KX + IY = KY + DO 30, I = 1, J + A( I, J ) = A( I, J ) + X( IX )*TEMP1 + $ + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 30 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + 40 CONTINUE + END IF + ELSE +* +* Form A when A is stored in the lower triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + DO 50, I = J, N + A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 + 50 CONTINUE + END IF + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = JX + IY = JY + DO 70, I = J, N + A( I, J ) = A( I, J ) + X( IX )*TEMP1 + $ + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of SSYR2 . +* + END diff --git a/reference/csyr2kf.f b/reference/csyr2kf.f new file mode 100644 index 0000000..f9468dd --- /dev/null +++ b/reference/csyr2kf.f @@ -0,0 +1,324 @@ + SUBROUTINE CSYR2KF( UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 UPLO, TRANS + INTEGER N, K, LDA, LDB, LDC + COMPLEX ALPHA, BETA +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* CSYR2K performs one of the symmetric rank 2k operations +* +* C := alpha*A*B' + alpha*B*A' + beta*C, +* +* or +* +* C := alpha*A'*B + alpha*B'*A + beta*C, +* +* where alpha and beta are scalars, C is an n by n symmetric matrix +* and A and B are n by k matrices in the first case and k by n +* matrices in the second case. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array C is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of C +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of C +* is to be referenced. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' C := alpha*A*B' + alpha*B*A' + +* beta*C. +* +* TRANS = 'T' or 't' C := alpha*A'*B + alpha*B'*A + +* beta*C. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with TRANS = 'N' or 'n', K specifies the number +* of columns of the matrices A and B, and on entry with +* TRANS = 'T' or 't', K specifies the number of rows of the +* matrices A and B. K must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array A must contain the matrix A, otherwise +* the leading k by n part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDA must be at least max( 1, n ), otherwise LDA must +* be at least max( 1, k ). +* Unchanged on exit. +* +* B - COMPLEX array of DIMENSION ( LDB, kb ), where kb is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array B must contain the matrix B, otherwise +* the leading k by n part of the array B must contain the +* matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDB must be at least max( 1, n ), otherwise LDB must +* be at least max( 1, k ). +* Unchanged on exit. +* +* BETA - COMPLEX . +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* C - COMPLEX array of DIMENSION ( LDC, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array C must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of C is not referenced. On exit, the +* upper triangular part of the array C is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array C must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of C is not referenced. On exit, the +* lower triangular part of the array C is overwritten by the +* lower triangular part of the updated matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, L, NROWA + COMPLEX TEMP1, TEMP2 +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + IF( LSAME( TRANS, 'N' ) )THEN + NROWA = N + ELSE + NROWA = K + END IF + UPPER = LSAME( UPLO, 'U' ) +* + INFO = 0 + IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANS, 'T' ) ) )THEN + INFO = 2 + ELSE IF( N .LT.0 )THEN + INFO = 3 + ELSE IF( K .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, NROWA ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, N ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CSYR2K', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR. + $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( UPPER )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, J + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, J + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE + IF( BETA.EQ.ZERO )THEN + DO 60, J = 1, N + DO 50, I = J, N + C( I, J ) = ZERO + 50 CONTINUE + 60 CONTINUE + ELSE + DO 80, J = 1, N + DO 70, I = J, N + C( I, J ) = BETA*C( I, J ) + 70 CONTINUE + 80 CONTINUE + END IF + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form C := alpha*A*B' + alpha*B*A' + C. +* + IF( UPPER )THEN + DO 130, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 90, I = 1, J + C( I, J ) = ZERO + 90 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 100, I = 1, J + C( I, J ) = BETA*C( I, J ) + 100 CONTINUE + END IF + DO 120, L = 1, K + IF( ( A( J, L ).NE.ZERO ).OR. + $ ( B( J, L ).NE.ZERO ) )THEN + TEMP1 = ALPHA*B( J, L ) + TEMP2 = ALPHA*A( J, L ) + DO 110, I = 1, J + C( I, J ) = C( I, J ) + A( I, L )*TEMP1 + + $ B( I, L )*TEMP2 + 110 CONTINUE + END IF + 120 CONTINUE + 130 CONTINUE + ELSE + DO 180, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 140, I = J, N + C( I, J ) = ZERO + 140 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 150, I = J, N + C( I, J ) = BETA*C( I, J ) + 150 CONTINUE + END IF + DO 170, L = 1, K + IF( ( A( J, L ).NE.ZERO ).OR. + $ ( B( J, L ).NE.ZERO ) )THEN + TEMP1 = ALPHA*B( J, L ) + TEMP2 = ALPHA*A( J, L ) + DO 160, I = J, N + C( I, J ) = C( I, J ) + A( I, L )*TEMP1 + + $ B( I, L )*TEMP2 + 160 CONTINUE + END IF + 170 CONTINUE + 180 CONTINUE + END IF + ELSE +* +* Form C := alpha*A'*B + alpha*B'*A + C. +* + IF( UPPER )THEN + DO 210, J = 1, N + DO 200, I = 1, J + TEMP1 = ZERO + TEMP2 = ZERO + DO 190, L = 1, K + TEMP1 = TEMP1 + A( L, I )*B( L, J ) + TEMP2 = TEMP2 + B( L, I )*A( L, J ) + 190 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ ALPHA*TEMP1 + ALPHA*TEMP2 + END IF + 200 CONTINUE + 210 CONTINUE + ELSE + DO 240, J = 1, N + DO 230, I = J, N + TEMP1 = ZERO + TEMP2 = ZERO + DO 220, L = 1, K + TEMP1 = TEMP1 + A( L, I )*B( L, J ) + TEMP2 = TEMP2 + B( L, I )*A( L, J ) + 220 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ ALPHA*TEMP1 + ALPHA*TEMP2 + END IF + 230 CONTINUE + 240 CONTINUE + END IF + END IF +* + RETURN +* +* End of CSYR2K. +* + END diff --git a/reference/csyrf.f b/reference/csyrf.f new file mode 100644 index 0000000..f1a2d59 --- /dev/null +++ b/reference/csyrf.f @@ -0,0 +1,198 @@ + SUBROUTINE CSYRF( UPLO, N, ALPHA, X, INCX, A, LDA ) +* +* -- LAPACK auxiliary routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INCX, LDA, N + COMPLEX ALPHA +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* CSYR performs the symmetric rank 1 operation +* +* A := alpha*x*( x' ) + A, +* +* where alpha is a complex scalar, x is an n element vector and A is an +* n by n symmetric matrix. +* +* Arguments +* ========== +* +* UPLO (input) CHARACTER*1 +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N (input) INTEGER +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA (input) COMPLEX +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X (input) COMPLEX array, dimension at least +* ( 1 + ( N - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the N- +* element vector x. +* Unchanged on exit. +* +* INCX (input) INTEGER +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* A (input/output) COMPLEX array, dimension ( LDA, N ) +* Before entry, with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of A is not referenced. On exit, the +* upper triangular part of the array A is overwritten by the +* upper triangular part of the updated matrix. +* Before entry, with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of A is not referenced. On exit, the +* lower triangular part of the array A is overwritten by the +* lower triangular part of the updated matrix. +* +* LDA (input) INTEGER +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, N ). +* Unchanged on exit. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, INFO, IX, J, JX, KX + COMPLEX TEMP +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = 1 + ELSE IF( N.LT.0 ) THEN + INFO = 2 + ELSE IF( INCX.EQ.0 ) THEN + INFO = 5 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = 7 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CSYR ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ) .OR. ( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set the start point in X if the increment is not unity. +* + IF( INCX.LE.0 ) THEN + KX = 1 - ( N-1 )*INCX + ELSE IF( INCX.NE.1 ) THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* + IF( LSAME( UPLO, 'U' ) ) THEN +* +* Form A when A is stored in upper triangle. +* + IF( INCX.EQ.1 ) THEN + DO 20 J = 1, N + IF( X( J ).NE.ZERO ) THEN + TEMP = ALPHA*X( J ) + DO 10 I = 1, J + A( I, J ) = A( I, J ) + X( I )*TEMP + 10 CONTINUE + END IF + 20 CONTINUE + ELSE + JX = KX + DO 40 J = 1, N + IF( X( JX ).NE.ZERO ) THEN + TEMP = ALPHA*X( JX ) + IX = KX + DO 30 I = 1, J + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + END IF + JX = JX + INCX + 40 CONTINUE + END IF + ELSE +* +* Form A when A is stored in lower triangle. +* + IF( INCX.EQ.1 ) THEN + DO 60 J = 1, N + IF( X( J ).NE.ZERO ) THEN + TEMP = ALPHA*X( J ) + DO 50 I = J, N + A( I, J ) = A( I, J ) + X( I )*TEMP + 50 CONTINUE + END IF + 60 CONTINUE + ELSE + JX = KX + DO 80 J = 1, N + IF( X( JX ).NE.ZERO ) THEN + TEMP = ALPHA*X( JX ) + IX = JX + DO 70 I = J, N + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of CSYR +* + END diff --git a/reference/csyrkf.f b/reference/csyrkf.f new file mode 100644 index 0000000..7dbaefa --- /dev/null +++ b/reference/csyrkf.f @@ -0,0 +1,293 @@ + SUBROUTINE CSYRKF ( UPLO, TRANS, N, K, ALPHA, A, LDA, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 UPLO, TRANS + INTEGER N, K, LDA, LDC + COMPLEX ALPHA, BETA +* .. Array Arguments .. + COMPLEX A( LDA, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* CSYRK performs one of the symmetric rank k operations +* +* C := alpha*A*A' + beta*C, +* +* or +* +* C := alpha*A'*A + beta*C, +* +* where alpha and beta are scalars, C is an n by n symmetric matrix +* and A is an n by k matrix in the first case and a k by n matrix +* in the second case. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array C is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of C +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of C +* is to be referenced. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' C := alpha*A*A' + beta*C. +* +* TRANS = 'T' or 't' C := alpha*A'*A + beta*C. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with TRANS = 'N' or 'n', K specifies the number +* of columns of the matrix A, and on entry with +* TRANS = 'T' or 't', K specifies the number of rows of the +* matrix A. K must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array A must contain the matrix A, otherwise +* the leading k by n part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDA must be at least max( 1, n ), otherwise LDA must +* be at least max( 1, k ). +* Unchanged on exit. +* +* BETA - COMPLEX . +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* C - COMPLEX array of DIMENSION ( LDC, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array C must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of C is not referenced. On exit, the +* upper triangular part of the array C is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array C must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of C is not referenced. On exit, the +* lower triangular part of the array C is overwritten by the +* lower triangular part of the updated matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, L, NROWA + COMPLEX TEMP +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + IF( LSAME( TRANS, 'N' ) )THEN + NROWA = N + ELSE + NROWA = K + END IF + UPPER = LSAME( UPLO, 'U' ) +* + INFO = 0 + IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANS, 'T' ) ) )THEN + INFO = 2 + ELSE IF( N .LT.0 )THEN + INFO = 3 + ELSE IF( K .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDC.LT.MAX( 1, N ) )THEN + INFO = 10 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CSYRK ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR. + $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( UPPER )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, J + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, J + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE + IF( BETA.EQ.ZERO )THEN + DO 60, J = 1, N + DO 50, I = J, N + C( I, J ) = ZERO + 50 CONTINUE + 60 CONTINUE + ELSE + DO 80, J = 1, N + DO 70, I = J, N + C( I, J ) = BETA*C( I, J ) + 70 CONTINUE + 80 CONTINUE + END IF + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form C := alpha*A*A' + beta*C. +* + IF( UPPER )THEN + DO 130, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 90, I = 1, J + C( I, J ) = ZERO + 90 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 100, I = 1, J + C( I, J ) = BETA*C( I, J ) + 100 CONTINUE + END IF + DO 120, L = 1, K + IF( A( J, L ).NE.ZERO )THEN + TEMP = ALPHA*A( J, L ) + DO 110, I = 1, J + C( I, J ) = C( I, J ) + TEMP*A( I, L ) + 110 CONTINUE + END IF + 120 CONTINUE + 130 CONTINUE + ELSE + DO 180, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 140, I = J, N + C( I, J ) = ZERO + 140 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 150, I = J, N + C( I, J ) = BETA*C( I, J ) + 150 CONTINUE + END IF + DO 170, L = 1, K + IF( A( J, L ).NE.ZERO )THEN + TEMP = ALPHA*A( J, L ) + DO 160, I = J, N + C( I, J ) = C( I, J ) + TEMP*A( I, L ) + 160 CONTINUE + END IF + 170 CONTINUE + 180 CONTINUE + END IF + ELSE +* +* Form C := alpha*A'*A + beta*C. +* + IF( UPPER )THEN + DO 210, J = 1, N + DO 200, I = 1, J + TEMP = ZERO + DO 190, L = 1, K + TEMP = TEMP + A( L, I )*A( L, J ) + 190 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP + ELSE + C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) + END IF + 200 CONTINUE + 210 CONTINUE + ELSE + DO 240, J = 1, N + DO 230, I = J, N + TEMP = ZERO + DO 220, L = 1, K + TEMP = TEMP + A( L, I )*A( L, J ) + 220 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP + ELSE + C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) + END IF + 230 CONTINUE + 240 CONTINUE + END IF + END IF +* + RETURN +* +* End of CSYRK . +* + END diff --git a/reference/ctbmvf.f b/reference/ctbmvf.f new file mode 100644 index 0000000..ff3c526 --- /dev/null +++ b/reference/ctbmvf.f @@ -0,0 +1,377 @@ + SUBROUTINE CTBMVF( UPLO, TRANS, DIAG, N, K, A, LDA, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, K, LDA, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* CTBMV performs one of the matrix-vector operations +* +* x := A*x, or x := A'*x, or x := conjg( A' )*x, +* +* where x is an n element vector and A is an n by n unit, or non-unit, +* upper or lower triangular band matrix, with ( k + 1 ) diagonals. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' x := A*x. +* +* TRANS = 'T' or 't' x := A'*x. +* +* TRANS = 'C' or 'c' x := conjg( A' )*x. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with UPLO = 'U' or 'u', K specifies the number of +* super-diagonals of the matrix A. +* On entry with UPLO = 'L' or 'l', K specifies the number of +* sub-diagonals of the matrix A. +* K must satisfy 0 .le. K. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) +* by n part of the array A must contain the upper triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row +* ( k + 1 ) of the array, the first super-diagonal starting at +* position 2 in row k, and so on. The top left k by k triangle +* of the array A is not referenced. +* The following program segment will transfer an upper +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = K + 1 - J +* DO 10, I = MAX( 1, J - K ), J +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) +* by n part of the array A must contain the lower triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row 1 of +* the array, the first sub-diagonal starting at position 1 in +* row 2, and so on. The bottom right k by k triangle of the +* array A is not referenced. +* The following program segment will transfer a lower +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = 1 - J +* DO 10, I = J, MIN( N, J + K ) +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Note that when DIAG = 'U' or 'u' the elements of the array A +* corresponding to the diagonal elements of the matrix are not +* referenced, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( k + 1 ). +* Unchanged on exit. +* +* X - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. On exit, X is overwritten with the +* tranformed vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP + INTEGER I, INFO, IX, J, JX, KPLUS1, KX, L + LOGICAL NOCONJ, NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( K.LT.0 )THEN + INFO = 5 + ELSE IF( LDA.LT.( K + 1 ) )THEN + INFO = 7 + ELSE IF( INCX.EQ.0 )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CTBMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOCONJ = LSAME( TRANS, 'T' ) + NOUNIT = LSAME( DIAG , 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form x := A*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KPLUS1 = K + 1 + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + L = KPLUS1 - J + DO 10, I = MAX( 1, J - K ), J - 1 + X( I ) = X( I ) + TEMP*A( L + I, J ) + 10 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*A( KPLUS1, J ) + END IF + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + L = KPLUS1 - J + DO 30, I = MAX( 1, J - K ), J - 1 + X( IX ) = X( IX ) + TEMP*A( L + I, J ) + IX = IX + INCX + 30 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( KPLUS1, J ) + END IF + JX = JX + INCX + IF( J.GT.K ) + $ KX = KX + INCX + 40 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 60, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + L = 1 - J + DO 50, I = MIN( N, J + K ), J + 1, -1 + X( I ) = X( I ) + TEMP*A( L + I, J ) + 50 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*A( 1, J ) + END IF + 60 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 80, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + L = 1 - J + DO 70, I = MIN( N, J + K ), J + 1, -1 + X( IX ) = X( IX ) + TEMP*A( L + I, J ) + IX = IX - INCX + 70 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( 1, J ) + END IF + JX = JX - INCX + IF( ( N - J ).GE.K ) + $ KX = KX - INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := A'*x or x := conjg( A' )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KPLUS1 = K + 1 + IF( INCX.EQ.1 )THEN + DO 110, J = N, 1, -1 + TEMP = X( J ) + L = KPLUS1 - J + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( KPLUS1, J ) + DO 90, I = J - 1, MAX( 1, J - K ), -1 + TEMP = TEMP + A( L + I, J )*X( I ) + 90 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG( A( KPLUS1, J ) ) + DO 100, I = J - 1, MAX( 1, J - K ), -1 + TEMP = TEMP + CONJG( A( L + I, J ) )*X( I ) + 100 CONTINUE + END IF + X( J ) = TEMP + 110 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 140, J = N, 1, -1 + TEMP = X( JX ) + KX = KX - INCX + IX = KX + L = KPLUS1 - J + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( KPLUS1, J ) + DO 120, I = J - 1, MAX( 1, J - K ), -1 + TEMP = TEMP + A( L + I, J )*X( IX ) + IX = IX - INCX + 120 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG( A( KPLUS1, J ) ) + DO 130, I = J - 1, MAX( 1, J - K ), -1 + TEMP = TEMP + CONJG( A( L + I, J ) )*X( IX ) + IX = IX - INCX + 130 CONTINUE + END IF + X( JX ) = TEMP + JX = JX - INCX + 140 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 170, J = 1, N + TEMP = X( J ) + L = 1 - J + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( 1, J ) + DO 150, I = J + 1, MIN( N, J + K ) + TEMP = TEMP + A( L + I, J )*X( I ) + 150 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG( A( 1, J ) ) + DO 160, I = J + 1, MIN( N, J + K ) + TEMP = TEMP + CONJG( A( L + I, J ) )*X( I ) + 160 CONTINUE + END IF + X( J ) = TEMP + 170 CONTINUE + ELSE + JX = KX + DO 200, J = 1, N + TEMP = X( JX ) + KX = KX + INCX + IX = KX + L = 1 - J + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( 1, J ) + DO 180, I = J + 1, MIN( N, J + K ) + TEMP = TEMP + A( L + I, J )*X( IX ) + IX = IX + INCX + 180 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG( A( 1, J ) ) + DO 190, I = J + 1, MIN( N, J + K ) + TEMP = TEMP + CONJG( A( L + I, J ) )*X( IX ) + IX = IX + INCX + 190 CONTINUE + END IF + X( JX ) = TEMP + JX = JX + INCX + 200 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of CTBMV . +* + END diff --git a/reference/ctbsvf.f b/reference/ctbsvf.f new file mode 100644 index 0000000..9358433 --- /dev/null +++ b/reference/ctbsvf.f @@ -0,0 +1,367 @@ + SUBROUTINE CTBSVF(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX) +* .. Scalar Arguments .. + INTEGER INCX,K,LDA,N + CHARACTER DIAG,TRANS,UPLO +* .. +* .. Array Arguments .. + COMPLEX A(LDA,*),X(*) +* .. +* +* Purpose +* ======= +* +* CTBSV solves one of the systems of equations +* +* A*x = b, or A'*x = b, or conjg( A' )*x = b, +* +* where b and x are n element vectors and A is an n by n unit, or +* non-unit, upper or lower triangular band matrix, with ( k + 1 ) +* diagonals. +* +* No test for singularity or near-singularity is included in this +* routine. Such tests must be performed before calling this routine. +* +* Arguments +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the equations to be solved as +* follows: +* +* TRANS = 'N' or 'n' A*x = b. +* +* TRANS = 'T' or 't' A'*x = b. +* +* TRANS = 'C' or 'c' conjg( A' )*x = b. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with UPLO = 'U' or 'u', K specifies the number of +* super-diagonals of the matrix A. +* On entry with UPLO = 'L' or 'l', K specifies the number of +* sub-diagonals of the matrix A. +* K must satisfy 0 .le. K. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) +* by n part of the array A must contain the upper triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row +* ( k + 1 ) of the array, the first super-diagonal starting at +* position 2 in row k, and so on. The top left k by k triangle +* of the array A is not referenced. +* The following program segment will transfer an upper +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = K + 1 - J +* DO 10, I = MAX( 1, J - K ), J +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) +* by n part of the array A must contain the lower triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row 1 of +* the array, the first sub-diagonal starting at position 1 in +* row 2, and so on. The bottom right k by k triangle of the +* array A is not referenced. +* The following program segment will transfer a lower +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = 1 - J +* DO 10, I = J, MIN( N, J + K ) +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Note that when DIAG = 'U' or 'u' the elements of the array A +* corresponding to the diagonal elements of the matrix are not +* referenced, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( k + 1 ). +* Unchanged on exit. +* +* X - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element right-hand side vector b. On exit, X is overwritten +* with the solution vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER (ZERO= (0.0E+0,0.0E+0)) +* .. +* .. Local Scalars .. + COMPLEX TEMP + INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L + LOGICAL NOCONJ,NOUNIT +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CONJG,MAX,MIN +* .. +* +* Test the input parameters. +* + INFO = 0 + IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN + INFO = 1 + ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. + + .NOT.LSAME(TRANS,'C')) THEN + INFO = 2 + ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN + INFO = 3 + ELSE IF (N.LT.0) THEN + INFO = 4 + ELSE IF (K.LT.0) THEN + INFO = 5 + ELSE IF (LDA.LT. (K+1)) THEN + INFO = 7 + ELSE IF (INCX.EQ.0) THEN + INFO = 9 + END IF + IF (INFO.NE.0) THEN + CALL XERBLA('CTBSV ',INFO) + RETURN + END IF +* +* Quick return if possible. +* + IF (N.EQ.0) RETURN +* + NOCONJ = LSAME(TRANS,'T') + NOUNIT = LSAME(DIAG,'N') +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF (INCX.LE.0) THEN + KX = 1 - (N-1)*INCX + ELSE IF (INCX.NE.1) THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed by sequentially with one pass through A. +* + IF (LSAME(TRANS,'N')) THEN +* +* Form x := inv( A )*x. +* + IF (LSAME(UPLO,'U')) THEN + KPLUS1 = K + 1 + IF (INCX.EQ.1) THEN + DO 20 J = N,1,-1 + IF (X(J).NE.ZERO) THEN + L = KPLUS1 - J + IF (NOUNIT) X(J) = X(J)/A(KPLUS1,J) + TEMP = X(J) + DO 10 I = J - 1,MAX(1,J-K),-1 + X(I) = X(I) - TEMP*A(L+I,J) + 10 CONTINUE + END IF + 20 CONTINUE + ELSE + KX = KX + (N-1)*INCX + JX = KX + DO 40 J = N,1,-1 + KX = KX - INCX + IF (X(JX).NE.ZERO) THEN + IX = KX + L = KPLUS1 - J + IF (NOUNIT) X(JX) = X(JX)/A(KPLUS1,J) + TEMP = X(JX) + DO 30 I = J - 1,MAX(1,J-K),-1 + X(IX) = X(IX) - TEMP*A(L+I,J) + IX = IX - INCX + 30 CONTINUE + END IF + JX = JX - INCX + 40 CONTINUE + END IF + ELSE + IF (INCX.EQ.1) THEN + DO 60 J = 1,N + IF (X(J).NE.ZERO) THEN + L = 1 - J + IF (NOUNIT) X(J) = X(J)/A(1,J) + TEMP = X(J) + DO 50 I = J + 1,MIN(N,J+K) + X(I) = X(I) - TEMP*A(L+I,J) + 50 CONTINUE + END IF + 60 CONTINUE + ELSE + JX = KX + DO 80 J = 1,N + KX = KX + INCX + IF (X(JX).NE.ZERO) THEN + IX = KX + L = 1 - J + IF (NOUNIT) X(JX) = X(JX)/A(1,J) + TEMP = X(JX) + DO 70 I = J + 1,MIN(N,J+K) + X(IX) = X(IX) - TEMP*A(L+I,J) + IX = IX + INCX + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := inv( A' )*x or x := inv( conjg( A') )*x. +* + IF (LSAME(UPLO,'U')) THEN + KPLUS1 = K + 1 + IF (INCX.EQ.1) THEN + DO 110 J = 1,N + TEMP = X(J) + L = KPLUS1 - J + IF (NOCONJ) THEN + DO 90 I = MAX(1,J-K),J - 1 + TEMP = TEMP - A(L+I,J)*X(I) + 90 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(KPLUS1,J) + ELSE + DO 100 I = MAX(1,J-K),J - 1 + TEMP = TEMP - CONJG(A(L+I,J))*X(I) + 100 CONTINUE + IF (NOUNIT) TEMP = TEMP/CONJG(A(KPLUS1,J)) + END IF + X(J) = TEMP + 110 CONTINUE + ELSE + JX = KX + DO 140 J = 1,N + TEMP = X(JX) + IX = KX + L = KPLUS1 - J + IF (NOCONJ) THEN + DO 120 I = MAX(1,J-K),J - 1 + TEMP = TEMP - A(L+I,J)*X(IX) + IX = IX + INCX + 120 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(KPLUS1,J) + ELSE + DO 130 I = MAX(1,J-K),J - 1 + TEMP = TEMP - CONJG(A(L+I,J))*X(IX) + IX = IX + INCX + 130 CONTINUE + IF (NOUNIT) TEMP = TEMP/CONJG(A(KPLUS1,J)) + END IF + X(JX) = TEMP + JX = JX + INCX + IF (J.GT.K) KX = KX + INCX + 140 CONTINUE + END IF + ELSE + IF (INCX.EQ.1) THEN + DO 170 J = N,1,-1 + TEMP = X(J) + L = 1 - J + IF (NOCONJ) THEN + DO 150 I = MIN(N,J+K),J + 1,-1 + TEMP = TEMP - A(L+I,J)*X(I) + 150 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(1,J) + ELSE + DO 160 I = MIN(N,J+K),J + 1,-1 + TEMP = TEMP - CONJG(A(L+I,J))*X(I) + 160 CONTINUE + IF (NOUNIT) TEMP = TEMP/CONJG(A(1,J)) + END IF + X(J) = TEMP + 170 CONTINUE + ELSE + KX = KX + (N-1)*INCX + JX = KX + DO 200 J = N,1,-1 + TEMP = X(JX) + IX = KX + L = 1 - J + IF (NOCONJ) THEN + DO 180 I = MIN(N,J+K),J + 1,-1 + TEMP = TEMP - A(L+I,J)*X(IX) + IX = IX - INCX + 180 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(1,J) + ELSE + DO 190 I = MIN(N,J+K),J + 1,-1 + TEMP = TEMP - CONJG(A(L+I,J))*X(IX) + IX = IX - INCX + 190 CONTINUE + IF (NOUNIT) TEMP = TEMP/CONJG(A(1,J)) + END IF + X(JX) = TEMP + JX = JX - INCX + IF ((N-J).GE.K) KX = KX - INCX + 200 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of CTBSV . +* + END diff --git a/reference/ctpmvf.f b/reference/ctpmvf.f new file mode 100644 index 0000000..cd29ec5 --- /dev/null +++ b/reference/ctpmvf.f @@ -0,0 +1,376 @@ + SUBROUTINE CTPMVF( UPLO, TRANS, DIAG, N, AP, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + COMPLEX AP( * ), X( * ) +* .. +* +* Purpose +* ======= +* +* CTPMV performs one of the matrix-vector operations +* +* x := A*x, or x := A'*x, or x := conjg( A' )*x, +* +* where x is an n element vector and A is an n by n unit, or non-unit, +* upper or lower triangular matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' x := A*x. +* +* TRANS = 'T' or 't' x := A'*x. +* +* TRANS = 'C' or 'c' x := conjg( A' )*x. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* AP - COMPLEX array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) +* respectively, and so on. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) +* respectively, and so on. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced, but are assumed to be unity. +* Unchanged on exit. +* +* X - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. On exit, X is overwritten with the +* tranformed vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP + INTEGER I, INFO, IX, J, JX, K, KK, KX + LOGICAL NOCONJ, NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'R' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( INCX.EQ.0 )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CTPMVF', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + NOUNIT = LSAME( DIAG , 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of AP are +* accessed sequentially with one pass through AP. +* + IF( LSAME( TRANS, 'N' ).OR.LSAME( TRANS, 'R' ))THEN +* +* Form x:= A*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KK = 1 + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + K = KK + DO 10, I = 1, J - 1 + IF( NOCONJ )THEN + X( I ) = X( I ) + TEMP*AP( K ) + ELSE + X( I ) = X( I ) + TEMP*CONJG(AP( K )) + END IF + K = K + 1 + 10 CONTINUE + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( J ) = X( J )*AP( KK + J - 1 ) + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )*CONJG(AP( KK + J-1)) + END IF + END IF + + KK = KK + J + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 30, K = KK, KK + J - 2 + IF( NOCONJ )THEN + X( IX ) = X( IX ) + TEMP*AP( K ) + ELSE + X( IX ) = X( IX ) + TEMP*CONJG(AP(K)) + END IF + IX = IX + INCX + 30 CONTINUE + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )*AP( KK + J - 1 ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )*CONJG(AP( KK + J-1)) + END IF + END IF + JX = JX + INCX + KK = KK + J + 40 CONTINUE + END IF + ELSE + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 60, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + K = KK + DO 50, I = N, J + 1, -1 + IF( NOCONJ )THEN + X( I ) = X( I ) + TEMP*AP( K ) + ELSE + X( I ) = X( I ) + TEMP*CONJG(AP( K )) + END IF + K = K - 1 + 50 CONTINUE + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( J ) = X( J )*AP( KK - N + J ) + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )*CONJG(AP(KK - N+J)) + END IF + END IF + KK = KK - ( N - J + 1 ) + 60 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 80, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 70, K = KK, KK - ( N - ( J + 1 ) ), -1 + IF( NOCONJ )THEN + X( IX ) = X( IX ) + TEMP*AP( K ) + ELSE + X( IX ) = X( IX ) + TEMP*CONJG(AP(K)) + ENDIF + IX = IX - INCX + 70 CONTINUE + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )*AP( KK - N + J ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )*CONJG(AP(KK-N+J)) + ENDIF + END IF + JX = JX - INCX + KK = KK - ( N - J + 1 ) + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := A'*x or x := conjg( A' )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 110, J = N, 1, -1 + TEMP = X( J ) + K = KK - 1 + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + DO 90, I = J - 1, 1, -1 + TEMP = TEMP + AP( K )*X( I ) + K = K - 1 + 90 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG( AP( KK ) ) + DO 100, I = J - 1, 1, -1 + TEMP = TEMP + CONJG( AP( K ) )*X( I ) + K = K - 1 + 100 CONTINUE + END IF + X( J ) = TEMP + KK = KK - J + 110 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 140, J = N, 1, -1 + TEMP = X( JX ) + IX = JX + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + DO 120, K = KK - 1, KK - J + 1, -1 + IX = IX - INCX + TEMP = TEMP + AP( K )*X( IX ) + 120 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG( AP( KK ) ) + DO 130, K = KK - 1, KK - J + 1, -1 + IX = IX - INCX + TEMP = TEMP + CONJG( AP( K ) )*X( IX ) + 130 CONTINUE + END IF + X( JX ) = TEMP + JX = JX - INCX + KK = KK - J + 140 CONTINUE + END IF + ELSE + KK = 1 + IF( INCX.EQ.1 )THEN + DO 170, J = 1, N + TEMP = X( J ) + K = KK + 1 + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + DO 150, I = J + 1, N + TEMP = TEMP + AP( K )*X( I ) + K = K + 1 + 150 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG( AP( KK ) ) + DO 160, I = J + 1, N + TEMP = TEMP + CONJG( AP( K ) )*X( I ) + K = K + 1 + 160 CONTINUE + END IF + X( J ) = TEMP + KK = KK + ( N - J + 1 ) + 170 CONTINUE + ELSE + JX = KX + DO 200, J = 1, N + TEMP = X( JX ) + IX = JX + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + DO 180, K = KK + 1, KK + N - J + IX = IX + INCX + TEMP = TEMP + AP( K )*X( IX ) + 180 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG( AP( KK ) ) + DO 190, K = KK + 1, KK + N - J + IX = IX + INCX + TEMP = TEMP + CONJG( AP( K ) )*X( IX ) + 190 CONTINUE + END IF + X( JX ) = TEMP + JX = JX + INCX + KK = KK + ( N - J + 1 ) + 200 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of CTPMV . +* + END diff --git a/reference/ctpsvf.f b/reference/ctpsvf.f new file mode 100644 index 0000000..2da9215 --- /dev/null +++ b/reference/ctpsvf.f @@ -0,0 +1,379 @@ + SUBROUTINE CTPSVF( UPLO, TRANS, DIAG, N, AP, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + COMPLEX AP( * ), X( * ) +* .. +* +* Purpose +* ======= +* +* CTPSV solves one of the systems of equations +* +* A*x = b, or A'*x = b, or conjg( A' )*x = b, +* +* where b and x are n element vectors and A is an n by n unit, or +* non-unit, upper or lower triangular matrix, supplied in packed form. +* +* No test for singularity or near-singularity is included in this +* routine. Such tests must be performed before calling this routine. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the equations to be solved as +* follows: +* +* TRANS = 'N' or 'n' A*x = b. +* +* TRANS = 'T' or 't' A'*x = b. +* +* TRANS = 'C' or 'c' conjg( A' )*x = b. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* AP - COMPLEX array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) +* respectively, and so on. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) +* respectively, and so on. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced, but are assumed to be unity. +* Unchanged on exit. +* +* X - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element right-hand side vector b. On exit, X is overwritten +* with the solution vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP + INTEGER I, INFO, IX, J, JX, K, KK, KX + LOGICAL NOCONJ, NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'R' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( INCX.EQ.0 )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CTPSV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + NOUNIT = LSAME( DIAG , 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of AP are +* accessed sequentially with one pass through AP. +* + IF( LSAME( TRANS, 'N' ) .OR.LSAME( TRANS, 'R' ))THEN +* +* Form x := inv( A )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 20, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( J ) = X( J )/AP( KK ) + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )/CONJG(AP( KK )) + END IF + + TEMP = X( J ) + K = KK - 1 + DO 10, I = J - 1, 1, -1 + IF( NOCONJ )THEN + X( I ) = X( I ) - TEMP*AP( K ) + ELSE + X( I ) = X( I ) - TEMP*CONJG(AP( K )) + END IF + K = K - 1 + 10 CONTINUE + END IF + KK = KK - J + 20 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 40, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/AP( KK ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )/CONJG(AP( KK )) + END IF + TEMP = X( JX ) + IX = JX + DO 30, K = KK - 1, KK - J + 1, -1 + IX = IX - INCX + IF( NOCONJ )THEN + X( IX ) = X( IX ) - TEMP*AP( K ) + ELSE + X( IX ) = X( IX ) - TEMP*CONJG(AP( K )) + END IF + 30 CONTINUE + END IF + JX = JX - INCX + KK = KK - J + 40 CONTINUE + END IF + ELSE + KK = 1 + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( J ) = X( J )/AP( KK ) + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )/CONJG(AP( KK )) + END IF + TEMP = X( J ) + K = KK + 1 + DO 50, I = J + 1, N + IF( NOCONJ )THEN + X( I ) = X( I ) - TEMP*AP( K ) + ELSE + X( I ) = X( I ) - TEMP*CONJG(AP( K )) + END IF + K = K + 1 + 50 CONTINUE + END IF + KK = KK + ( N - J + 1 ) + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/AP( KK ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )/CONJG(AP( KK )) + END IF + TEMP = X( JX ) + IX = JX + DO 70, K = KK + 1, KK + N - J + IX = IX + INCX + IF( NOCONJ )THEN + X( IX ) = X( IX ) - TEMP*AP( K ) + ELSE + X( IX ) = X( IX ) - TEMP*CONJG(AP( K )) + END IF + 70 CONTINUE + END IF + JX = JX + INCX + KK = KK + ( N - J + 1 ) + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := inv( A' )*x or x := inv( conjg( A' ) )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KK = 1 + IF( INCX.EQ.1 )THEN + DO 110, J = 1, N + TEMP = X( J ) + K = KK + IF( NOCONJ )THEN + DO 90, I = 1, J - 1 + TEMP = TEMP - AP( K )*X( I ) + K = K + 1 + 90 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK + J - 1 ) + ELSE + DO 100, I = 1, J - 1 + TEMP = TEMP - CONJG( AP( K ) )*X( I ) + K = K + 1 + 100 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/CONJG( AP( KK + J - 1 ) ) + END IF + X( J ) = TEMP + KK = KK + J + 110 CONTINUE + ELSE + JX = KX + DO 140, J = 1, N + TEMP = X( JX ) + IX = KX + IF( NOCONJ )THEN + DO 120, K = KK, KK + J - 2 + TEMP = TEMP - AP( K )*X( IX ) + IX = IX + INCX + 120 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK + J - 1 ) + ELSE + DO 130, K = KK, KK + J - 2 + TEMP = TEMP - CONJG( AP( K ) )*X( IX ) + IX = IX + INCX + 130 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/CONJG( AP( KK + J - 1 ) ) + END IF + X( JX ) = TEMP + JX = JX + INCX + KK = KK + J + 140 CONTINUE + END IF + ELSE + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 170, J = N, 1, -1 + TEMP = X( J ) + K = KK + IF( NOCONJ )THEN + DO 150, I = N, J + 1, -1 + TEMP = TEMP - AP( K )*X( I ) + K = K - 1 + 150 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK - N + J ) + ELSE + DO 160, I = N, J + 1, -1 + TEMP = TEMP - CONJG( AP( K ) )*X( I ) + K = K - 1 + 160 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/CONJG( AP( KK - N + J ) ) + END IF + X( J ) = TEMP + KK = KK - ( N - J + 1 ) + 170 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 200, J = N, 1, -1 + TEMP = X( JX ) + IX = KX + IF( NOCONJ )THEN + DO 180, K = KK, KK - ( N - ( J + 1 ) ), -1 + TEMP = TEMP - AP( K )*X( IX ) + IX = IX - INCX + 180 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK - N + J ) + ELSE + DO 190, K = KK, KK - ( N - ( J + 1 ) ), -1 + TEMP = TEMP - CONJG( AP( K ) )*X( IX ) + IX = IX - INCX + 190 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/CONJG( AP( KK - N + J ) ) + END IF + X( JX ) = TEMP + JX = JX - INCX + KK = KK - ( N - J + 1 ) + 200 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of CTPSV . +* + END diff --git a/reference/ctrmmf.f b/reference/ctrmmf.f new file mode 100644 index 0000000..d65bf44 --- /dev/null +++ b/reference/ctrmmf.f @@ -0,0 +1,428 @@ + SUBROUTINE CTRMMF ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, + $ B, LDB ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + INTEGER M, N, LDA, LDB + COMPLEX ALPHA +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* CTRMM performs one of the matrix-matrix operations +* +* B := alpha*op( A )*B, or B := alpha*B*op( A ) +* +* where alpha is a scalar, B is an m by n matrix, A is a unit, or +* non-unit, upper or lower triangular matrix and op( A ) is one of +* +* op( A ) = A or op( A ) = A' or op( A ) = conjg( A' ). +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether op( A ) multiplies B from +* the left or right as follows: +* +* SIDE = 'L' or 'l' B := alpha*op( A )*B. +* +* SIDE = 'R' or 'r' B := alpha*B*op( A ). +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix A is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANSA - CHARACTER*1. +* On entry, TRANSA specifies the form of op( A ) to be used in +* the matrix multiplication as follows: +* +* TRANSA = 'N' or 'n' op( A ) = A. +* +* TRANSA = 'T' or 't' op( A ) = A'. +* +* TRANSA = 'C' or 'c' op( A ) = conjg( A' ). +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit triangular +* as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of B. M must be at +* least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of B. N must be +* at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. When alpha is +* zero then A is not referenced and B need not be set before +* entry. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, k ), where k is m +* when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. +* Before entry with UPLO = 'U' or 'u', the leading k by k +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading k by k +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' +* then LDA must be at least max( 1, n ). +* Unchanged on exit. +* +* B - COMPLEX array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the matrix B, and on exit is overwritten by the +* transformed matrix. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX +* .. Local Scalars .. + LOGICAL LSIDE, NOCONJ, NOUNIT, UPPER + INTEGER I, INFO, J, K, NROWA + COMPLEX TEMP +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + LSIDE = LSAME( SIDE , 'L' ) + IF( LSIDE )THEN + NROWA = M + ELSE + NROWA = N + END IF + NOCONJ = LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'T' ) + NOUNIT = LSAME( DIAG , 'N' ) + UPPER = LSAME( UPLO , 'U' ) +* + INFO = 0 + IF( ( .NOT.LSIDE ).AND. + $ ( .NOT.LSAME( SIDE , 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 2 + ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'T' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'R' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'C' ) ) )THEN + INFO = 3 + ELSE IF( ( .NOT.LSAME( DIAG , 'U' ) ).AND. + $ ( .NOT.LSAME( DIAG , 'N' ) ) )THEN + INFO = 4 + ELSE IF( M .LT.0 )THEN + INFO = 5 + ELSE IF( N .LT.0 )THEN + INFO = 6 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 9 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 11 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CTRMM ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + B( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + RETURN + END IF +* +* Start the operations. +* + IF( LSIDE )THEN + IF( LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'R' ))THEN +* +* Form B := alpha*A*B. +* + IF( UPPER )THEN + DO 50, J = 1, N + DO 40, K = 1, M + IF( B( K, J ).NE.ZERO )THEN + TEMP = ALPHA*B( K, J ) + IF (NOCONJ) THEN + DO 30, I = 1, K - 1 + B( I, J ) = B( I, J ) + TEMP*A( I, K ) + 30 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP*A( K, K ) + B( K, J ) = TEMP + ELSE + DO 35, I = 1, K - 1 + B( I, J ) = B( I, J ) + TEMP*CONJG(A( I, K )) + 35 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG(A( K, K )) + B( K, J ) = TEMP + ENDIF + END IF + 40 CONTINUE + 50 CONTINUE + ELSE + DO 80, J = 1, N + DO 70 K = M, 1, -1 + IF( B( K, J ).NE.ZERO )THEN + TEMP = ALPHA*B( K, J ) + B( K, J ) = TEMP + IF (NOCONJ) THEN + IF( NOUNIT ) + $ B( K, J ) = B( K, J )*A( K, K ) + DO 60, I = K + 1, M + B( I, J ) = B( I, J ) + TEMP*A( I, K ) + 60 CONTINUE + ELSE + IF( NOUNIT ) + $ B( K, J ) = B( K, J )*CONJG(A( K, K )) + DO 65, I = K + 1, M + B( I, J ) = B( I, J ) + TEMP*CONJG(A( I, K )) + 65 CONTINUE + ENDIF + END IF + 70 CONTINUE + 80 CONTINUE + END IF + ELSE +* +* Form B := alpha*A'*B or B := alpha*conjg( A' )*B. +* + IF( UPPER )THEN + DO 120, J = 1, N + DO 110, I = M, 1, -1 + TEMP = B( I, J ) + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( I, I ) + DO 90, K = 1, I - 1 + TEMP = TEMP + A( K, I )*B( K, J ) + 90 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG( A( I, I ) ) + DO 100, K = 1, I - 1 + TEMP = TEMP + CONJG( A( K, I ) )*B( K, J ) + 100 CONTINUE + END IF + B( I, J ) = ALPHA*TEMP + 110 CONTINUE + 120 CONTINUE + ELSE + DO 160, J = 1, N + DO 150, I = 1, M + TEMP = B( I, J ) + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( I, I ) + DO 130, K = I + 1, M + TEMP = TEMP + A( K, I )*B( K, J ) + 130 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG( A( I, I ) ) + DO 140, K = I + 1, M + TEMP = TEMP + CONJG( A( K, I ) )*B( K, J ) + 140 CONTINUE + END IF + B( I, J ) = ALPHA*TEMP + 150 CONTINUE + 160 CONTINUE + END IF + END IF + ELSE + IF( LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'R' ))THEN +* +* Form B := alpha*B*A. +* + IF( UPPER )THEN + DO 200, J = N, 1, -1 + TEMP = ALPHA + IF (NOCONJ) THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG(A( J, J )) + ENDIF + DO 170, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 170 CONTINUE + DO 190, K = 1, J - 1 + IF( A( K, J ).NE.ZERO )THEN + IF (NOCONJ) THEN + TEMP = ALPHA*A( K, J ) + ELSE + TEMP = ALPHA*CONJG(A( K, J )) + ENDIF + DO 180, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 180 CONTINUE + END IF + 190 CONTINUE + 200 CONTINUE + ELSE + DO 240, J = 1, N + TEMP = ALPHA + IF (NOCONJ) THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG(A( J, J )) + ENDIF + DO 210, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 210 CONTINUE + DO 230, K = J + 1, N + IF( A( K, J ).NE.ZERO )THEN + IF (NOCONJ) THEN + TEMP = ALPHA*A( K, J ) + ELSE + TEMP = ALPHA*CONJG(A( K, J )) + ENDIF + DO 220, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 220 CONTINUE + END IF + 230 CONTINUE + 240 CONTINUE + END IF + ELSE +* +* Form B := alpha*B*A' or B := alpha*B*conjg( A' ). +* + IF( UPPER )THEN + DO 280, K = 1, N + DO 260, J = 1, K - 1 + IF( A( J, K ).NE.ZERO )THEN + IF( NOCONJ )THEN + TEMP = ALPHA*A( J, K ) + ELSE + TEMP = ALPHA*CONJG( A( J, K ) ) + END IF + DO 250, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 250 CONTINUE + END IF + 260 CONTINUE + TEMP = ALPHA + IF( NOUNIT )THEN + IF( NOCONJ )THEN + TEMP = TEMP*A( K, K ) + ELSE + TEMP = TEMP*CONJG( A( K, K ) ) + END IF + END IF + IF( TEMP.NE.ONE )THEN + DO 270, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 270 CONTINUE + END IF + 280 CONTINUE + ELSE + DO 320, K = N, 1, -1 + DO 300, J = K + 1, N + IF( A( J, K ).NE.ZERO )THEN + IF( NOCONJ )THEN + TEMP = ALPHA*A( J, K ) + ELSE + TEMP = ALPHA*CONJG( A( J, K ) ) + END IF + DO 290, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 290 CONTINUE + END IF + 300 CONTINUE + TEMP = ALPHA + IF( NOUNIT )THEN + IF( NOCONJ )THEN + TEMP = TEMP*A( K, K ) + ELSE + TEMP = TEMP*CONJG( A( K, K ) ) + END IF + END IF + IF( TEMP.NE.ONE )THEN + DO 310, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 310 CONTINUE + END IF + 320 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of CTRMM . +* + END diff --git a/reference/ctrmvf.f b/reference/ctrmvf.f new file mode 100644 index 0000000..f9d3b44 --- /dev/null +++ b/reference/ctrmvf.f @@ -0,0 +1,358 @@ + SUBROUTINE CTRMVF ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, LDA, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* CTRMV performs one of the matrix-vector operations +* +* x := A*x, or x := A'*x, or x := conjg( A' )*x, +* +* where x is an n element vector and A is an n by n unit, or non-unit, +* upper or lower triangular matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' x := A*x. +* +* TRANS = 'T' or 't' x := A'*x. +* +* TRANS = 'C' or 'c' x := conjg( A' )*x. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* X - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. On exit, X is overwritten with the +* tranformed vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP + INTEGER I, INFO, IX, J, JX, KX + LOGICAL NOCONJ, NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'R' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CTRMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + NOUNIT = LSAME( DIAG , 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'R' ))THEN +* +* Form x := A*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + DO 10, I = 1, J - 1 + IF (NOCONJ) THEN + X( I ) = X( I ) + TEMP*A( I, J ) + ELSE + X( I ) = X( I ) + TEMP*CONJG(A( I, J )) + ENDIF + 10 CONTINUE + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( J ) = X( J )*A( J, J ) + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )*CONJG(A( J, J )) + ENDIF + END IF + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 30, I = 1, J - 1 + IF (NOCONJ) THEN + X( IX ) = X( IX ) + TEMP*A( I, J ) + ELSE + X( IX ) = X( IX ) + TEMP*CONJG(A( I, J )) + ENDIF + IX = IX + INCX + 30 CONTINUE + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( J, J ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )*CONJG(A( J, J )) + ENDIF + END IF + JX = JX + INCX + 40 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 60, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + DO 50, I = N, J + 1, -1 + IF (NOCONJ) THEN + X( I ) = X( I ) + TEMP*A( I, J ) + ELSE + X( I ) = X( I ) + TEMP*CONJG(A( I, J )) + ENDIF + 50 CONTINUE + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( J ) = X( J )*A( J, J ) + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )*CONJG(A( J, J )) + ENDIF + END IF + 60 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 80, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 70, I = N, J + 1, -1 + IF (NOCONJ) THEN + X( IX ) = X( IX ) + TEMP*A( I, J ) + ELSE + X( IX ) = X( IX ) + TEMP*CONJG(A( I, J )) + ENDIF + IX = IX - INCX + 70 CONTINUE + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( J, J ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )*CONJG(A( J, J )) + ENDIF + END IF + JX = JX - INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := A'*x or x := conjg( A' )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 110, J = N, 1, -1 + TEMP = X( J ) + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 90, I = J - 1, 1, -1 + TEMP = TEMP + A( I, J )*X( I ) + 90 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG( A( J, J ) ) + DO 100, I = J - 1, 1, -1 + TEMP = TEMP + CONJG( A( I, J ) )*X( I ) + 100 CONTINUE + END IF + X( J ) = TEMP + 110 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 140, J = N, 1, -1 + TEMP = X( JX ) + IX = JX + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 120, I = J - 1, 1, -1 + IX = IX - INCX + TEMP = TEMP + A( I, J )*X( IX ) + 120 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG( A( J, J ) ) + DO 130, I = J - 1, 1, -1 + IX = IX - INCX + TEMP = TEMP + CONJG( A( I, J ) )*X( IX ) + 130 CONTINUE + END IF + X( JX ) = TEMP + JX = JX - INCX + 140 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 170, J = 1, N + TEMP = X( J ) + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 150, I = J + 1, N + TEMP = TEMP + A( I, J )*X( I ) + 150 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG( A( J, J ) ) + DO 160, I = J + 1, N + TEMP = TEMP + CONJG( A( I, J ) )*X( I ) + 160 CONTINUE + END IF + X( J ) = TEMP + 170 CONTINUE + ELSE + JX = KX + DO 200, J = 1, N + TEMP = X( JX ) + IX = JX + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 180, I = J + 1, N + IX = IX + INCX + TEMP = TEMP + A( I, J )*X( IX ) + 180 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG( A( J, J ) ) + DO 190, I = J + 1, N + IX = IX + INCX + TEMP = TEMP + CONJG( A( I, J ) )*X( IX ) + 190 CONTINUE + END IF + X( JX ) = TEMP + JX = JX + INCX + 200 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of CTRMV . +* + END diff --git a/reference/ctrsmf.f b/reference/ctrsmf.f new file mode 100644 index 0000000..3d27822 --- /dev/null +++ b/reference/ctrsmf.f @@ -0,0 +1,459 @@ + SUBROUTINE CTRSMF ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, + $ B, LDB ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + INTEGER M, N, LDA, LDB + COMPLEX ALPHA +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* CTRSM solves one of the matrix equations +* +* op( A )*X = alpha*B, or X*op( A ) = alpha*B, +* +* where alpha is a scalar, X and B are m by n matrices, A is a unit, or +* non-unit, upper or lower triangular matrix and op( A ) is one of +* +* op( A ) = A or op( A ) = A' or op( A ) = conjg( A' ). +* +* The matrix X is overwritten on B. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether op( A ) appears on the left +* or right of X as follows: +* +* SIDE = 'L' or 'l' op( A )*X = alpha*B. +* +* SIDE = 'R' or 'r' X*op( A ) = alpha*B. +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix A is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANSA - CHARACTER*1. +* On entry, TRANSA specifies the form of op( A ) to be used in +* the matrix multiplication as follows: +* +* TRANSA = 'N' or 'n' op( A ) = A. +* +* TRANSA = 'T' or 't' op( A ) = A'. +* +* TRANSA = 'C' or 'c' op( A ) = conjg( A' ). +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit triangular +* as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of B. M must be at +* least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of B. N must be +* at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. When alpha is +* zero then A is not referenced and B need not be set before +* entry. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, k ), where k is m +* when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. +* Before entry with UPLO = 'U' or 'u', the leading k by k +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading k by k +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' +* then LDA must be at least max( 1, n ). +* Unchanged on exit. +* +* B - COMPLEX array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the right-hand side matrix B, and on exit is +* overwritten by the solution matrix X. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX +* .. Local Scalars .. + LOGICAL LSIDE, NOCONJ, NOUNIT, UPPER + INTEGER I, INFO, J, K, NROWA + COMPLEX TEMP +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + LSIDE = LSAME( SIDE , 'L' ) + IF( LSIDE )THEN + NROWA = M + ELSE + NROWA = N + END IF + NOCONJ = (LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'T' )) + NOUNIT = LSAME( DIAG , 'N' ) + UPPER = LSAME( UPLO , 'U' ) +* + INFO = 0 + IF( ( .NOT.LSIDE ).AND. + $ ( .NOT.LSAME( SIDE , 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 2 + ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'T' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'R' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'C' ) ) )THEN + INFO = 3 + ELSE IF( ( .NOT.LSAME( DIAG , 'U' ) ).AND. + $ ( .NOT.LSAME( DIAG , 'N' ) ) )THEN + INFO = 4 + ELSE IF( M .LT.0 )THEN + INFO = 5 + ELSE IF( N .LT.0 )THEN + INFO = 6 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 9 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 11 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CTRSM ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + B( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + RETURN + END IF +* +* Start the operations. +* + IF( LSIDE )THEN + IF( LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'R' ))THEN +* +* Form B := alpha*inv( A )*B. +* + IF( UPPER )THEN + DO 60, J = 1, N + IF( ALPHA.NE.ONE )THEN + DO 30, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 30 CONTINUE + END IF + + DO 50, K = M, 1, -1 + IF( B( K, J ).NE.ZERO )THEN + + IF( NOUNIT ) THEN + IF (NOCONJ) THEN + B( K, J ) = B( K, J )/A( K, K ) + ELSE + B( K, J ) = B( K, J )/CONJG(A( K, K )) + ENDIF + ENDIF + + IF (NOCONJ) THEN + DO 40, I = 1, K - 1 + B( I, J ) = B( I, J ) - B( K, J )*A( I, K ) + 40 CONTINUE + ELSE + DO 45, I = 1, K - 1 + B( I, J ) = B( I, J ) - B( K, J )*CONJG(A( I, K )) + 45 CONTINUE + ENDIF + ENDIF + 50 CONTINUE + 60 CONTINUE + ELSE + DO 100, J = 1, N + IF( ALPHA.NE.ONE )THEN + DO 70, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 70 CONTINUE + END IF + DO 90 K = 1, M + IF (NOCONJ) THEN + IF( B( K, J ).NE.ZERO )THEN + IF( NOUNIT ) + $ B( K, J ) = B( K, J )/A( K, K ) + DO 80, I = K + 1, M + B( I, J ) = B( I, J ) - B( K, J )*A( I, K ) + 80 CONTINUE + END IF + ELSE + IF( B( K, J ).NE.ZERO )THEN + IF( NOUNIT ) + $ B( K, J ) = B( K, J )/CONJG(A( K, K )) + DO 85, I = K + 1, M + B( I, J ) = B( I, J ) - B( K, J )*CONJG(A( I, K )) + 85 CONTINUE + END IF + ENDIF + 90 CONTINUE + 100 CONTINUE + END IF + ELSE +* +* Form B := alpha*inv( A' )*B +* or B := alpha*inv( conjg( A' ) )*B. +* + IF( UPPER )THEN + DO 140, J = 1, N + DO 130, I = 1, M + TEMP = ALPHA*B( I, J ) + IF( NOCONJ )THEN + DO 110, K = 1, I - 1 + TEMP = TEMP - A( K, I )*B( K, J ) + 110 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( I, I ) + ELSE + DO 120, K = 1, I - 1 + TEMP = TEMP - CONJG( A( K, I ) )*B( K, J ) + 120 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/CONJG( A( I, I ) ) + END IF + B( I, J ) = TEMP + 130 CONTINUE + 140 CONTINUE + ELSE + DO 180, J = 1, N + DO 170, I = M, 1, -1 + TEMP = ALPHA*B( I, J ) + IF( NOCONJ )THEN + DO 150, K = I + 1, M + TEMP = TEMP - A( K, I )*B( K, J ) + 150 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( I, I ) + ELSE + DO 160, K = I + 1, M + TEMP = TEMP - CONJG( A( K, I ) )*B( K, J ) + 160 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/CONJG( A( I, I ) ) + END IF + B( I, J ) = TEMP + 170 CONTINUE + 180 CONTINUE + END IF + END IF + ELSE + IF( LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'R' ))THEN +* +* Form B := alpha*B*inv( A ). +* + IF( UPPER )THEN + DO 230, J = 1, N + IF( ALPHA.NE.ONE )THEN + DO 190, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 190 CONTINUE + END IF + DO 210, K = 1, J - 1 + IF( A( K, J ).NE.ZERO )THEN + IF (NOCONJ) THEN + DO 200, I = 1, M + B( I, J ) = B( I, J ) - A( K, J )*B( I, K ) + 200 CONTINUE + ELSE + DO 205, I = 1, M + B( I, J ) = B( I, J ) - CONJG(A( K, J ))*B( I, K ) + 205 CONTINUE + ENDIF + END IF + 210 CONTINUE + IF( NOUNIT )THEN + IF (NOCONJ) THEN + TEMP = ONE/A( J, J ) + ELSE + TEMP = ONE/CONJG(A( J, J )) + ENDIF + DO 220, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 220 CONTINUE + END IF + 230 CONTINUE + ELSE + DO 280, J = N, 1, -1 + IF( ALPHA.NE.ONE )THEN + DO 240, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 240 CONTINUE + END IF + DO 260, K = J + 1, N + IF( A( K, J ).NE.ZERO )THEN + IF (NOCONJ) THEN + DO 250, I = 1, M + B( I, J ) = B( I, J ) - A( K, J )*B( I, K ) + 250 CONTINUE + ELSE + DO 255, I = 1, M + B( I, J ) = B( I, J ) - CONJG(A( K, J ))*B( I, K ) + 255 CONTINUE + ENDIF + END IF + 260 CONTINUE + IF( NOUNIT )THEN + IF (NOCONJ) THEN + TEMP = ONE/A( J, J ) + ELSE + TEMP = ONE/CONJG(A( J, J )) + ENDIF + DO 270, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 270 CONTINUE + END IF + 280 CONTINUE + END IF + ELSE +* +* Form B := alpha*B*inv( A' ) +* or B := alpha*B*inv( conjg( A' ) ). +* + IF( UPPER )THEN + DO 330, K = N, 1, -1 + IF( NOUNIT )THEN + IF( NOCONJ )THEN + TEMP = ONE/A( K, K ) + ELSE + TEMP = ONE/CONJG( A( K, K ) ) + END IF + DO 290, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 290 CONTINUE + END IF + DO 310, J = 1, K - 1 + IF( A( J, K ).NE.ZERO )THEN + IF( NOCONJ )THEN + TEMP = A( J, K ) + ELSE + TEMP = CONJG( A( J, K ) ) + END IF + DO 300, I = 1, M + B( I, J ) = B( I, J ) - TEMP*B( I, K ) + 300 CONTINUE + END IF + 310 CONTINUE + IF( ALPHA.NE.ONE )THEN + DO 320, I = 1, M + B( I, K ) = ALPHA*B( I, K ) + 320 CONTINUE + END IF + 330 CONTINUE + ELSE + DO 380, K = 1, N + IF( NOUNIT )THEN + IF( NOCONJ )THEN + TEMP = ONE/A( K, K ) + ELSE + TEMP = ONE/CONJG( A( K, K ) ) + END IF + DO 340, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 340 CONTINUE + END IF + DO 360, J = K + 1, N + IF( A( J, K ).NE.ZERO )THEN + IF( NOCONJ )THEN + TEMP = A( J, K ) + ELSE + TEMP = CONJG( A( J, K ) ) + END IF + DO 350, I = 1, M + B( I, J ) = B( I, J ) - TEMP*B( I, K ) + 350 CONTINUE + END IF + 360 CONTINUE + IF( ALPHA.NE.ONE )THEN + DO 370, I = 1, M + B( I, K ) = ALPHA*B( I, K ) + 370 CONTINUE + END IF + 380 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of CTRSM . +* + END diff --git a/reference/ctrsvf.f b/reference/ctrsvf.f new file mode 100644 index 0000000..86061b4 --- /dev/null +++ b/reference/ctrsvf.f @@ -0,0 +1,361 @@ + SUBROUTINE CTRSVF ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, LDA, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* CTRSV solves one of the systems of equations +* +* A*x = b, or A'*x = b, or conjg( A' )*x = b, +* +* where b and x are n element vectors and A is an n by n unit, or +* non-unit, upper or lower triangular matrix. +* +* No test for singularity or near-singularity is included in this +* routine. Such tests must be performed before calling this routine. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the equations to be solved as +* follows: +* +* TRANS = 'N' or 'n' A*x = b. +* +* TRANS = 'T' or 't' A'*x = b. +* +* TRANS = 'C' or 'c' conjg( A' )*x = b. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* X - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element right-hand side vector b. On exit, X is overwritten +* with the solution vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP + INTEGER I, INFO, IX, J, JX, KX + LOGICAL NOCONJ, NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'R' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CTRSV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + NOUNIT = LSAME( DIAG , 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'R' ))THEN +* +* Form x := inv( A )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 20, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( J ) = X( J )/A( J, J ) + TEMP = X( J ) + DO 10, I = J - 1, 1, -1 + X( I ) = X( I ) - TEMP*A( I, J ) + 10 CONTINUE + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )/CONJG(A( J, J )) + TEMP = X( J ) + DO 15, I = J - 1, 1, -1 + X( I ) = X( I ) - TEMP*CONJG(A( I, J )) + 15 CONTINUE + ENDIF + END IF + 20 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 40, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/A( J, J ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )/CONJG(A( J, J )) + ENDIF + TEMP = X( JX ) + IX = JX + DO 30, I = J - 1, 1, -1 + IX = IX - INCX + IF (NOCONJ) THEN + X( IX ) = X( IX ) - TEMP*A( I, J ) + ELSE + X( IX ) = X( IX ) - TEMP*CONJG(A( I, J )) + ENDIF + 30 CONTINUE + END IF + JX = JX - INCX + 40 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( J ) = X( J )/A( J, J ) + TEMP = X( J ) + DO 50, I = J + 1, N + X( I ) = X( I ) - TEMP*A( I, J ) + 50 CONTINUE + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )/CONJG(A( J, J )) + TEMP = X( J ) + DO 55, I = J + 1, N + X( I ) = X( I ) - TEMP*CONJG(A( I, J )) + 55 CONTINUE + ENDIF + END IF + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/A( J, J ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )/CONJG(A( J, J )) + ENDIF + TEMP = X( JX ) + IX = JX + DO 70, I = J + 1, N + IX = IX + INCX + IF (NOCONJ) THEN + X( IX ) = X( IX ) - TEMP*A( I, J ) + ELSE + X( IX ) = X( IX ) - TEMP*CONJG(A( I, J )) + ENDIF + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := inv( A' )*x or x := inv( conjg( A' ) )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 110, J = 1, N + TEMP = X( J ) + IF( NOCONJ )THEN + DO 90, I = 1, J - 1 + TEMP = TEMP - A( I, J )*X( I ) + 90 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + ELSE + DO 100, I = 1, J - 1 + TEMP = TEMP - CONJG( A( I, J ) )*X( I ) + 100 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/CONJG( A( J, J ) ) + END IF + X( J ) = TEMP + 110 CONTINUE + ELSE + JX = KX + DO 140, J = 1, N + IX = KX + TEMP = X( JX ) + IF( NOCONJ )THEN + DO 120, I = 1, J - 1 + TEMP = TEMP - A( I, J )*X( IX ) + IX = IX + INCX + 120 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + ELSE + DO 130, I = 1, J - 1 + TEMP = TEMP - CONJG( A( I, J ) )*X( IX ) + IX = IX + INCX + 130 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/CONJG( A( J, J ) ) + END IF + X( JX ) = TEMP + JX = JX + INCX + 140 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 170, J = N, 1, -1 + TEMP = X( J ) + IF( NOCONJ )THEN + DO 150, I = N, J + 1, -1 + TEMP = TEMP - A( I, J )*X( I ) + 150 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + ELSE + DO 160, I = N, J + 1, -1 + TEMP = TEMP - CONJG( A( I, J ) )*X( I ) + 160 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/CONJG( A( J, J ) ) + END IF + X( J ) = TEMP + 170 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 200, J = N, 1, -1 + IX = KX + TEMP = X( JX ) + IF( NOCONJ )THEN + DO 180, I = N, J + 1, -1 + TEMP = TEMP - A( I, J )*X( IX ) + IX = IX - INCX + 180 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + ELSE + DO 190, I = N, J + 1, -1 + TEMP = TEMP - CONJG( A( I, J ) )*X( IX ) + IX = IX - INCX + 190 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/CONJG( A( J, J ) ) + END IF + X( JX ) = TEMP + JX = JX - INCX + 200 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of CTRSV . +* + END diff --git a/reference/ctrti2f.f b/reference/ctrti2f.f new file mode 100644 index 0000000..24604b4 --- /dev/null +++ b/reference/ctrti2f.f @@ -0,0 +1,146 @@ + SUBROUTINE CTRTI2F( UPLO, DIAG, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER DIAG, UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* CTRTI2 computes the inverse of a complex upper or lower triangular +* matrix. +* +* This is the Level 2 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the matrix A is upper or lower triangular. +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* DIAG (input) CHARACTER*1 +* Specifies whether or not the matrix A is unit triangular. +* = 'N': Non-unit triangular +* = 'U': Unit triangular +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the triangular matrix A. If UPLO = 'U', the +* leading n by n upper triangular part of the array A contains +* the upper triangular matrix, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading n by n lower triangular part of the array A contains +* the lower triangular matrix, and the strictly upper +* triangular part of A is not referenced. If DIAG = 'U', the +* diagonal elements of A are also not referenced and are +* assumed to be 1. +* +* On exit, the (triangular) inverse of the original matrix, in +* the same storage format. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL NOUNIT, UPPER + INTEGER J + COMPLEX AJJ +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL CSCAL, CTRMV, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOUNIT = LSAME( DIAG, 'N' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CTRTI2', -INFO ) + RETURN + END IF +* + IF( UPPER ) THEN +* +* Compute inverse of upper triangular matrix. +* + DO 10 J = 1, N + IF( NOUNIT ) THEN + A( J, J ) = ONE / A( J, J ) + AJJ = -A( J, J ) + ELSE + AJJ = -ONE + END IF +* +* Compute elements 1:j-1 of j-th column. +* + CALL CTRMV( 'Upper', 'No transpose', DIAG, J-1, A, LDA, + $ A( 1, J ), 1 ) + CALL CSCAL( J-1, AJJ, A( 1, J ), 1 ) + 10 CONTINUE + ELSE +* +* Compute inverse of lower triangular matrix. +* + DO 20 J = N, 1, -1 + IF( NOUNIT ) THEN + A( J, J ) = ONE / A( J, J ) + AJJ = -A( J, J ) + ELSE + AJJ = -ONE + END IF + IF( J.LT.N ) THEN +* +* Compute elements j+1:n of j-th column. +* + CALL CTRMV( 'Lower', 'No transpose', DIAG, N-J, + $ A( J+1, J+1 ), LDA, A( J+1, J ), 1 ) + CALL CSCAL( N-J, AJJ, A( J+1, J ), 1 ) + END IF + 20 CONTINUE + END IF +* + RETURN +* +* End of CTRTI2 +* + END diff --git a/reference/ctrtrif.f b/reference/ctrtrif.f new file mode 100644 index 0000000..cb1ec98 --- /dev/null +++ b/reference/ctrtrif.f @@ -0,0 +1,177 @@ + SUBROUTINE CTRTRIF( UPLO, DIAG, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* September 30, 1994 +* +* .. Scalar Arguments .. + CHARACTER DIAG, UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* CTRTRI computes the inverse of a complex upper or lower triangular +* matrix A. +* +* This is the Level 3 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': A is upper triangular; +* = 'L': A is lower triangular. +* +* DIAG (input) CHARACTER*1 +* = 'N': A is non-unit triangular; +* = 'U': A is unit triangular. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the triangular matrix A. If UPLO = 'U', the +* leading N-by-N upper triangular part of the array A contains +* the upper triangular matrix, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading N-by-N lower triangular part of the array A contains +* the lower triangular matrix, and the strictly upper +* triangular part of A is not referenced. If DIAG = 'U', the +* diagonal elements of A are also not referenced and are +* assumed to be 1. +* On exit, the (triangular) inverse of the original matrix, in +* the same storage format. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, A(i,i) is exactly zero. The triangular +* matrix is singular and its inverse can not be computed. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ONE, ZERO + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), + $ ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL NOUNIT, UPPER + INTEGER J, JB, NB, NN +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL CTRMM, CTRSM, CTRTI2, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOUNIT = LSAME( DIAG, 'N' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CTRTRI', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Check for singularity if non-unit. +* + IF( NOUNIT ) THEN + DO 10 INFO = 1, N + IF( A( INFO, INFO ).EQ.ZERO ) + $ RETURN + 10 CONTINUE + INFO = 0 + END IF +* +* Determine the block size for this environment. +* + NB = 128 + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code +* + CALL CTRTI2( UPLO, DIAG, N, A, LDA, INFO ) + ELSE +* +* Use blocked code +* + IF( UPPER ) THEN +* +* Compute inverse of upper triangular matrix +* + DO 20 J = 1, N, NB + JB = MIN( NB, N-J+1 ) +* +* Compute rows 1:j-1 of current block column +* + CALL CTRMM( 'Left', 'Upper', 'No transpose', DIAG, J-1, + $ JB, ONE, A, LDA, A( 1, J ), LDA ) + CALL CTRSM( 'Right', 'Upper', 'No transpose', DIAG, J-1, + $ JB, -ONE, A( J, J ), LDA, A( 1, J ), LDA ) +* +* Compute inverse of current diagonal block +* + CALL CTRTI2( 'Upper', DIAG, JB, A( J, J ), LDA, INFO ) + 20 CONTINUE + ELSE +* +* Compute inverse of lower triangular matrix +* + NN = ( ( N-1 ) / NB )*NB + 1 + DO 30 J = NN, 1, -NB + JB = MIN( NB, N-J+1 ) + IF( J+JB.LE.N ) THEN +* +* Compute rows j+jb:n of current block column +* + CALL CTRMM( 'Left', 'Lower', 'No transpose', DIAG, + $ N-J-JB+1, JB, ONE, A( J+JB, J+JB ), LDA, + $ A( J+JB, J ), LDA ) + CALL CTRSM( 'Right', 'Lower', 'No transpose', DIAG, + $ N-J-JB+1, JB, -ONE, A( J, J ), LDA, + $ A( J+JB, J ), LDA ) + END IF +* +* Compute inverse of current diagonal block +* + CALL CTRTI2( 'Lower', DIAG, JB, A( J, J ), LDA, INFO ) + 30 CONTINUE + END IF + END IF +* + RETURN +* +* End of CTRTRI +* + END diff --git a/reference/damaxf.f b/reference/damaxf.f new file mode 100644 index 0000000..a0c0b01 --- /dev/null +++ b/reference/damaxf.f @@ -0,0 +1,36 @@ + REAL*8 function damaxf(n,dx,incx) +c +c finds the index of element having max. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision dx(*) + integer i,incx,ix,n +c + damaxf = 0 + if( n.lt.1 .or. incx.le.0 ) return + + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + damaxf = dabs(dx(1)) + ix = ix + incx + do 10 i = 2,n + if(dabs(dx(ix)).le.damaxf) go to 5 + damaxf = dabs(dx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 damaxf = dabs(dx(1)) + do 30 i = 2,n + if(dabs(dx(i)).le.damaxf) go to 30 + damaxf = dabs(dx(i)) + 30 continue + return + end diff --git a/reference/daminf.f b/reference/daminf.f new file mode 100644 index 0000000..21ce9d7 --- /dev/null +++ b/reference/daminf.f @@ -0,0 +1,36 @@ + REAL*8 function daminf(n,dx,incx) +c +c finds the index of element having min. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision dx(*) + integer i,incx,ix,n +c + daminf = 0 + if( n.lt.1 .or. incx.le.0 ) return + + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + daminf = dabs(dx(1)) + ix = ix + incx + do 10 i = 2,n + if(dabs(dx(ix)).ge.daminf) go to 5 + daminf = dabs(dx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 daminf = dabs(dx(1)) + do 30 i = 2,n + if(dabs(dx(i)).ge.daminf) go to 30 + daminf = dabs(dx(i)) + 30 continue + return + end diff --git a/reference/dasumf.f b/reference/dasumf.f new file mode 100644 index 0000000..0713694 --- /dev/null +++ b/reference/dasumf.f @@ -0,0 +1,43 @@ + double precision function dasumf(n,dx,incx) +c +c takes the sum of the absolute values. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision dx(*),dtemp + integer i,incx,m,mp1,n,nincx +c + dasumf = 0.0d0 + dtemp = 0.0d0 + if( n.le.0 .or. incx.le.0 )return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + nincx = n*incx + do 10 i = 1,nincx,incx + dtemp = dtemp + dabs(dx(i)) + 10 continue + dasumf = dtemp + return +c +c code for increment equal to 1 +c +c +c clean-up loop +c + 20 m = mod(n,6) + if( m .eq. 0 ) go to 40 + do 30 i = 1,m + dtemp = dtemp + dabs(dx(i)) + 30 continue + if( n .lt. 6 ) go to 60 + 40 mp1 = m + 1 + do 50 i = mp1,n,6 + dtemp = dtemp + dabs(dx(i)) + dabs(dx(i + 1)) + dabs(dx(i + 2)) + * + dabs(dx(i + 3)) + dabs(dx(i + 4)) + dabs(dx(i + 5)) + 50 continue + 60 dasumf = dtemp + return + end diff --git a/reference/daxpyf.f b/reference/daxpyf.f new file mode 100644 index 0000000..259217c --- /dev/null +++ b/reference/daxpyf.f @@ -0,0 +1,48 @@ + subroutine daxpyf(n,da,dx,incx,dy,incy) +c +c constant times a vector plus a vector. +c uses unrolled loops for increments equal to one. +c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision dx(*),dy(*),da + integer i,incx,incy,ix,iy,m,mp1,n +c + if(n.le.0)return + if (da .eq. 0.0d0) return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + dy(iy) = dy(iy) + da*dx(ix) + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c +c +c clean-up loop +c + 20 m = mod(n,4) + if( m .eq. 0 ) go to 40 + do 30 i = 1,m + dy(i) = dy(i) + da*dx(i) + 30 continue + if( n .lt. 4 ) return + 40 mp1 = m + 1 + do 50 i = mp1,n,4 + dy(i) = dy(i) + da*dx(i) + dy(i + 1) = dy(i + 1) + da*dx(i + 1) + dy(i + 2) = dy(i + 2) + da*dx(i + 2) + dy(i + 3) = dy(i + 3) + da*dx(i + 3) + 50 continue + return + end diff --git a/reference/dcopyf.f b/reference/dcopyf.f new file mode 100644 index 0000000..e930303 --- /dev/null +++ b/reference/dcopyf.f @@ -0,0 +1,50 @@ + subroutine dcopyf(n,dx,incx,dy,incy) +c +c copies a vector, x, to a vector, y. +c uses unrolled loops for increments equal to one. +c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision dx(*),dy(*) + integer i,incx,incy,ix,iy,m,mp1,n +c + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + dy(iy) = dx(ix) + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c +c +c clean-up loop +c + 20 m = mod(n,7) + if( m .eq. 0 ) go to 40 + do 30 i = 1,m + dy(i) = dx(i) + 30 continue + if( n .lt. 7 ) return + 40 mp1 = m + 1 + do 50 i = mp1,n,7 + dy(i) = dx(i) + dy(i + 1) = dx(i + 1) + dy(i + 2) = dx(i + 2) + dy(i + 3) = dx(i + 3) + dy(i + 4) = dx(i + 4) + dy(i + 5) = dx(i + 5) + dy(i + 6) = dx(i + 6) + 50 continue + return + end diff --git a/reference/ddotf.f b/reference/ddotf.f new file mode 100644 index 0000000..ed8defc --- /dev/null +++ b/reference/ddotf.f @@ -0,0 +1,49 @@ + double precision function ddotf(n,dx,incx,dy,incy) +c +c forms the dot product of two vectors. +c uses unrolled loops for increments equal to one. +c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision dx(*),dy(*),dtemp + integer i,incx,incy,ix,iy,m,mp1,n +c + ddotf = 0.0d0 + dtemp = 0.0d0 + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + dtemp = dtemp + dx(ix)*dy(iy) + ix = ix + incx + iy = iy + incy + 10 continue + ddotf = dtemp + return +c +c code for both increments equal to 1 +c +c +c clean-up loop +c + 20 m = mod(n,5) + if( m .eq. 0 ) go to 40 + do 30 i = 1,m + dtemp = dtemp + dx(i)*dy(i) + 30 continue + if( n .lt. 5 ) go to 60 + 40 mp1 = m + 1 + do 50 i = mp1,n,5 + dtemp = dtemp + dx(i)*dy(i) + dx(i + 1)*dy(i + 1) + + * dx(i + 2)*dy(i + 2) + dx(i + 3)*dy(i + 3) + dx(i + 4)*dy(i + 4) + 50 continue + 60 ddotf = dtemp + return + end diff --git a/reference/dgbmvf.f b/reference/dgbmvf.f new file mode 100644 index 0000000..0033ac1 --- /dev/null +++ b/reference/dgbmvf.f @@ -0,0 +1,300 @@ + SUBROUTINE DGBMVF( TRANS, M, N, KL, KU, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA, BETA + INTEGER INCX, INCY, KL, KU, LDA, M, N + CHARACTER*1 TRANS +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* DGBMV performs one of the matrix-vector operations +* +* y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, +* +* where alpha and beta are scalars, x and y are vectors and A is an +* m by n band matrix, with kl sub-diagonals and ku super-diagonals. +* +* Parameters +* ========== +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' y := alpha*A*x + beta*y. +* +* TRANS = 'T' or 't' y := alpha*A'*x + beta*y. +* +* TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* KL - INTEGER. +* On entry, KL specifies the number of sub-diagonals of the +* matrix A. KL must satisfy 0 .le. KL. +* Unchanged on exit. +* +* KU - INTEGER. +* On entry, KU specifies the number of super-diagonals of the +* matrix A. KU must satisfy 0 .le. KU. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). +* Before entry, the leading ( kl + ku + 1 ) by n part of the +* array A must contain the matrix of coefficients, supplied +* column by column, with the leading diagonal of the matrix in +* row ( ku + 1 ) of the array, the first super-diagonal +* starting at position 2 in row ku, the first sub-diagonal +* starting at position 1 in row ( ku + 2 ), and so on. +* Elements in the array A that do not correspond to elements +* in the band matrix (such as the top left ku by ku triangle) +* are not referenced. +* The following program segment will transfer a band matrix +* from conventional full matrix storage to band storage: +* +* DO 20, J = 1, N +* K = KU + 1 - J +* DO 10, I = MAX( 1, J - KU ), MIN( M, J + KL ) +* A( K + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( kl + ku + 1 ). +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - DOUBLE PRECISION. +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - DOUBLE PRECISION array of DIMENSION at least +* ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. +* Before entry, the incremented array Y must contain the +* vector y. On exit, Y is overwritten by the updated vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* .. Parameters .. + DOUBLE PRECISION ONE , ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP + INTEGER I, INFO, IX, IY, J, JX, JY, K, KUP1, KX, KY, + $ LENX, LENY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 1 + ELSE IF( M.LT.0 )THEN + INFO = 2 + ELSE IF( N.LT.0 )THEN + INFO = 3 + ELSE IF( KL.LT.0 )THEN + INFO = 4 + ELSE IF( KU.LT.0 )THEN + INFO = 5 + ELSE IF( LDA.LT.( KL + KU + 1 ) )THEN + INFO = 8 + ELSE IF( INCX.EQ.0 )THEN + INFO = 10 + ELSE IF( INCY.EQ.0 )THEN + INFO = 13 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DGBMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set LENX and LENY, the lengths of the vectors x and y, and set +* up the start points in X and Y. +* + IF( LSAME( TRANS, 'N' ) )THEN + LENX = N + LENY = M + ELSE + LENX = M + LENY = N + END IF + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( LENX - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( LENY - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the band part of A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, LENY + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, LENY + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, LENY + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, LENY + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + KUP1 = KU + 1 + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form y := alpha*A*x + y. +* + JX = KX + IF( INCY.EQ.1 )THEN + DO 60, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + K = KUP1 - J + DO 50, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( I ) = Y( I ) + TEMP*A( K + I, J ) + 50 CONTINUE + END IF + JX = JX + INCX + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + IY = KY + K = KUP1 - J + DO 70, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( IY ) = Y( IY ) + TEMP*A( K + I, J ) + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + IF( J.GT.KU ) + $ KY = KY + INCY + 80 CONTINUE + END IF + ELSE +* +* Form y := alpha*A'*x + y. +* + JY = KY + IF( INCX.EQ.1 )THEN + DO 100, J = 1, N + TEMP = ZERO + K = KUP1 - J + DO 90, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + A( K + I, J )*X( I ) + 90 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + 100 CONTINUE + ELSE + DO 120, J = 1, N + TEMP = ZERO + IX = KX + K = KUP1 - J + DO 110, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + A( K + I, J )*X( IX ) + IX = IX + INCX + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + IF( J.GT.KU ) + $ KX = KX + INCX + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of DGBMV . +* + END diff --git a/reference/dgemmf.f b/reference/dgemmf.f new file mode 100644 index 0000000..0af8120 --- /dev/null +++ b/reference/dgemmf.f @@ -0,0 +1,313 @@ + SUBROUTINE DGEMMF(TRANA,TRANB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA,BETA + INTEGER K,LDA,LDB,LDC,M,N + CHARACTER TRANA,TRANB +* .. +* .. Array Arguments .. + DOUBLE PRECISION A(LDA,*),B(LDB,*),C(LDC,*) +* .. +* +* Purpose +* ======= +* +* DGEMM performs one of the matrix-matrix operations +* +* C := alpha*op( A )*op( B ) + beta*C, +* +* where op( X ) is one of +* +* op( X ) = X or op( X ) = X', +* +* alpha and beta are scalars, and A, B and C are matrices, with op( A ) +* an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. +* +* Arguments +* ========== +* +* TRANA - CHARACTER*1. +* On entry, TRANA specifies the form of op( A ) to be used in +* the matrix multiplication as follows: +* +* TRANA = 'N' or 'n', op( A ) = A. +* +* TRANA = 'T' or 't', op( A ) = A'. +* +* TRANA = 'C' or 'c', op( A ) = A'. +* +* Unchanged on exit. +* +* TRANB - CHARACTER*1. +* On entry, TRANB specifies the form of op( B ) to be used in +* the matrix multiplication as follows: +* +* TRANB = 'N' or 'n', op( B ) = B. +* +* TRANB = 'T' or 't', op( B ) = B'. +* +* TRANB = 'C' or 'c', op( B ) = B'. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix +* op( A ) and of the matrix C. M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix +* op( B ) and the number of columns of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry, K specifies the number of columns of the matrix +* op( A ) and the number of rows of the matrix op( B ). K must +* be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is +* k when TRANA = 'N' or 'n', and is m otherwise. +* Before entry with TRANA = 'N' or 'n', the leading m by k +* part of the array A must contain the matrix A, otherwise +* the leading k by m part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANA = 'N' or 'n' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, k ). +* Unchanged on exit. +* +* B - DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is +* n when TRANB = 'N' or 'n', and is k otherwise. +* Before entry with TRANB = 'N' or 'n', the leading k by n +* part of the array B must contain the matrix B, otherwise +* the leading n by k part of the array B must contain the +* matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. When TRANB = 'N' or 'n' then +* LDB must be at least max( 1, k ), otherwise LDB must be at +* least max( 1, n ). +* Unchanged on exit. +* +* BETA - DOUBLE PRECISION. +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - DOUBLE PRECISION array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n matrix +* ( alpha*op( A )*op( B ) + beta*C ). +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Local Scalars .. + DOUBLE PRECISION TEMP + INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB + LOGICAL NOTA,NOTB +* .. +* .. Parameters .. + DOUBLE PRECISION ONE,ZERO + PARAMETER (ONE=1.0D+0,ZERO=0.0D+0) +* .. +* +* Set NOTA and NOTB as true if A and B respectively are not +* transposed and set NROWA, NCOLA and NROWB as the number of rows +* and columns of A and the number of rows of B respectively. +* + NOTA = LSAME(TRANA,'N') + NOTB = LSAME(TRANB,'N') + IF (NOTA) THEN + NROWA = M + NCOLA = K + ELSE + NROWA = K + NCOLA = M + END IF + IF (NOTB) THEN + NROWB = K + ELSE + NROWB = N + END IF +* +* Test the input parameters. +* + INFO = 0 + IF ((.NOT.NOTA) .AND. (.NOT.LSAME(TRANA,'C')) .AND. + + (.NOT.LSAME(TRANA,'T'))) THEN + INFO = 1 + ELSE IF ((.NOT.NOTB) .AND. (.NOT.LSAME(TRANB,'C')) .AND. + + (.NOT.LSAME(TRANB,'T'))) THEN + INFO = 2 + ELSE IF (M.LT.0) THEN + INFO = 3 + ELSE IF (N.LT.0) THEN + INFO = 4 + ELSE IF (K.LT.0) THEN + INFO = 5 + ELSE IF (LDA.LT.MAX(1,NROWA)) THEN + INFO = 8 + ELSE IF (LDB.LT.MAX(1,NROWB)) THEN + INFO = 10 + ELSE IF (LDC.LT.MAX(1,M)) THEN + INFO = 13 + END IF + IF (INFO.NE.0) THEN + CALL XERBLA('DGEMM ',INFO) + RETURN + END IF +* +* Quick return if possible. +* + IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + + (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN +* +* And if alpha.eq.zero. +* + IF (ALPHA.EQ.ZERO) THEN + IF (BETA.EQ.ZERO) THEN + DO 20 J = 1,N + DO 10 I = 1,M + C(I,J) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40 J = 1,N + DO 30 I = 1,M + C(I,J) = BETA*C(I,J) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF (NOTB) THEN + IF (NOTA) THEN +* +* Form C := alpha*A*B + beta*C. +* + DO 90 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 50 I = 1,M + C(I,J) = ZERO + 50 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 60 I = 1,M + C(I,J) = BETA*C(I,J) + 60 CONTINUE + END IF + DO 80 L = 1,K + IF (B(L,J).NE.ZERO) THEN + TEMP = ALPHA*B(L,J) + DO 70 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 70 CONTINUE + END IF + 80 CONTINUE + 90 CONTINUE + ELSE +* +* Form C := alpha*A'*B + beta*C +* + DO 120 J = 1,N + DO 110 I = 1,M + TEMP = ZERO + DO 100 L = 1,K + TEMP = TEMP + A(L,I)*B(L,J) + 100 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 110 CONTINUE + 120 CONTINUE + END IF + ELSE + IF (NOTA) THEN +* +* Form C := alpha*A*B' + beta*C +* + DO 170 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 130 I = 1,M + C(I,J) = ZERO + 130 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 140 I = 1,M + C(I,J) = BETA*C(I,J) + 140 CONTINUE + END IF + DO 160 L = 1,K + IF (B(J,L).NE.ZERO) THEN + TEMP = ALPHA*B(J,L) + DO 150 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 150 CONTINUE + END IF + 160 CONTINUE + 170 CONTINUE + ELSE +* +* Form C := alpha*A'*B' + beta*C +* + DO 200 J = 1,N + DO 190 I = 1,M + TEMP = ZERO + DO 180 L = 1,K + TEMP = TEMP + A(L,I)*B(J,L) + 180 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 190 CONTINUE + 200 CONTINUE + END IF + END IF +* + RETURN +* +* End of DGEMM . +* + END diff --git a/reference/dgemvf.f b/reference/dgemvf.f new file mode 100644 index 0000000..ae50c3c --- /dev/null +++ b/reference/dgemvf.f @@ -0,0 +1,256 @@ + SUBROUTINE DGEMVF ( TRANS, M, N, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA, BETA + INTEGER INCX, INCY, LDA, M, N + CHARACTER*1 TRANS +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* DGEMV performs one of the matrix-vector operations +* +* y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, +* +* where alpha and beta are scalars, x and y are vectors and A is an +* m by n matrix. +* +* Parameters +* ========== +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' y := alpha*A*x + beta*y. +* +* TRANS = 'T' or 't' y := alpha*A'*x + beta*y. +* +* TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). +* Before entry, the leading m by n part of the array A must +* contain the matrix of coefficients. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, m ). +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - DOUBLE PRECISION. +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - DOUBLE PRECISION array of DIMENSION at least +* ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. +* Before entry with BETA non-zero, the incremented array Y +* must contain the vector y. On exit, Y is overwritten by the +* updated vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ONE , ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY, LENX, LENY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 1 + ELSE IF( M.LT.0 )THEN + INFO = 2 + ELSE IF( N.LT.0 )THEN + INFO = 3 + ELSE IF( LDA.LT.MAX( 1, M ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + ELSE IF( INCY.EQ.0 )THEN + INFO = 11 + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set LENX and LENY, the lengths of the vectors x and y, and set +* up the start points in X and Y. +* + IF( LSAME( TRANS, 'N' ) )THEN + LENX = N + LENY = M + ELSE + LENX = M + LENY = N + END IF + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( LENX - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( LENY - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, LENY + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, LENY + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, LENY + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, LENY + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form y := alpha*A*x + y. +* + JX = KX + IF( INCY.EQ.1 )THEN + DO 60, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + DO 50, I = 1, M + Y( I ) = Y( I ) + TEMP*A( I, J ) + 50 CONTINUE + END IF + JX = JX + INCX + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + IY = KY + DO 70, I = 1, M + Y( IY ) = Y( IY ) + TEMP*A( I, J ) + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + ELSE +* +* Form y := alpha*A'*x + y. +* + JY = KY + IF( INCX.EQ.1 )THEN + DO 100, J = 1, N + TEMP = ZERO + DO 90, I = 1, M + TEMP = TEMP + A( I, J )*X( I ) + 90 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + 100 CONTINUE + ELSE + DO 120, J = 1, N + TEMP = ZERO + IX = KX + DO 110, I = 1, M + TEMP = TEMP + A( I, J )*X( IX ) + IX = IX + INCX + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of DGEMV . +* + END diff --git a/reference/dgerf.f b/reference/dgerf.f new file mode 100644 index 0000000..f340ceb --- /dev/null +++ b/reference/dgerf.f @@ -0,0 +1,158 @@ + SUBROUTINE DGERF ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA ) + +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA + INTEGER INCX, INCY, LDA, M, N +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* DGER performs the rank 1 operation +* +* A := alpha*x*y' + A, +* +* where alpha is a scalar, x is an m element vector, y is an n element +* vector and A is an m by n matrix. +* +* Parameters +* ========== +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( m - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the m +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). +* Before entry, the leading m by n part of the array A must +* contain the matrix of coefficients. On exit, A is +* overwritten by the updated matrix. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP + INTEGER I, INFO, IX, J, JY, KX +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( M.LT.0 )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + ELSE IF( LDA.LT.MAX( 1, M ) )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DGER ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( INCY.GT.0 )THEN + JY = 1 + ELSE + JY = 1 - ( N - 1 )*INCY + END IF + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( Y( JY ).NE.ZERO )THEN + TEMP = ALPHA*Y( JY ) + DO 10, I = 1, M + A( I, J ) = A( I, J ) + X( I )*TEMP + 10 CONTINUE + END IF + JY = JY + INCY + 20 CONTINUE + ELSE + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( M - 1 )*INCX + END IF + DO 40, J = 1, N + IF( Y( JY ).NE.ZERO )THEN + TEMP = ALPHA*Y( JY ) + IX = KX + DO 30, I = 1, M + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + END IF + JY = JY + INCY + 40 CONTINUE + END IF +* + RETURN +* +* End of DGER . +* + END diff --git a/reference/dgesvf.f b/reference/dgesvf.f new file mode 100644 index 0000000..751acf3 --- /dev/null +++ b/reference/dgesvf.f @@ -0,0 +1,107 @@ + SUBROUTINE DGESVF( N, NRHS, A, LDA, IPIV, B, LDB, INFO ) +* +* -- LAPACK driver routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDB, N, NRHS +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + DOUBLE PRECISION A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* DGESV computes the solution to a real system of linear equations +* A * X = B, +* where A is an N-by-N matrix and X and B are N-by-NRHS matrices. +* +* The LU decomposition with partial pivoting and row interchanges is +* used to factor A as +* A = P * L * U, +* where P is a permutation matrix, L is unit lower triangular, and U is +* upper triangular. The factored form of A is then used to solve the +* system of equations A * X = B. +* +* Arguments +* ========= +* +* N (input) INTEGER +* The number of linear equations, i.e., the order of the +* matrix A. N >= 0. +* +* NRHS (input) INTEGER +* The number of right hand sides, i.e., the number of columns +* of the matrix B. NRHS >= 0. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the N-by-N coefficient matrix A. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* IPIV (output) INTEGER array, dimension (N) +* The pivot indices that define the permutation matrix P; +* row i of the matrix was interchanged with row IPIV(i). +* +* B (input/output) DOUBLE PRECISION array, dimension (LDB,NRHS) +* On entry, the N-by-NRHS matrix of right hand side matrix B. +* On exit, if INFO = 0, the N-by-NRHS solution matrix X. +* +* LDB (input) INTEGER +* The leading dimension of the array B. LDB >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, so the solution could not be computed. +* +* ===================================================================== +* +* .. External Subroutines .. + EXTERNAL DGETRF, DGETRS, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( N.LT.0 ) THEN + INFO = -1 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -7 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGESV ', -INFO ) + RETURN + END IF +* +* Compute the LU factorization of A. +* + CALL DGETRF( N, N, A, LDA, IPIV, INFO ) + IF( INFO.EQ.0 ) THEN +* +* Solve the system A*X = B, overwriting B with X. +* + CALL DGETRS( 'No transpose', N, NRHS, A, LDA, IPIV, B, LDB, + $ INFO ) + END IF + RETURN +* +* End of DGESV +* + END diff --git a/reference/dgetf2f.f b/reference/dgetf2f.f new file mode 100644 index 0000000..f977a7c --- /dev/null +++ b/reference/dgetf2f.f @@ -0,0 +1,135 @@ + SUBROUTINE DGETF2F( M, N, A, LDA, IPIV, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* June 30, 1992 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + DOUBLE PRECISION A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* DGETF2 computes an LU factorization of a general m-by-n matrix A +* using partial pivoting with row interchanges. +* +* The factorization has the form +* A = P * L * U +* where P is a permutation matrix, L is lower triangular with unit +* diagonal elements (lower trapezoidal if m > n), and U is upper +* triangular (upper trapezoidal if m < n). +* +* This is the right-looking Level 2 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the m by n matrix to be factored. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* IPIV (output) INTEGER array, dimension (min(M,N)) +* The pivot indices; for 1 <= i <= min(M,N), row i of the +* matrix was interchanged with row IPIV(i). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* > 0: if INFO = k, U(k,k) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, and division by zero will occur if it is used +* to solve a system of equations. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. +* .. Local Scalars .. + INTEGER J, JP +* .. +* .. External Functions .. + INTEGER IDAMAX + EXTERNAL IDAMAX +* .. +* .. External Subroutines .. + EXTERNAL DGER, DSCAL, DSWAP, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGETF2', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* + DO 10 J = 1, MIN( M, N ) +* +* Find pivot and test for singularity. +* + JP = J - 1 + IDAMAX( M-J+1, A( J, J ), 1 ) + IPIV( J ) = JP + IF( A( JP, J ).NE.ZERO ) THEN +* +* Apply the interchange to columns 1:N. +* + IF( JP.NE.J ) + $ CALL DSWAP( N, A( J, 1 ), LDA, A( JP, 1 ), LDA ) +* +* Compute elements J+1:M of J-th column. +* + IF( J.LT.M ) + $ CALL DSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 ) +* + ELSE IF( INFO.EQ.0 ) THEN +* + INFO = J + END IF +* + IF( J.LT.MIN( M, N ) ) THEN +* +* Update trailing submatrix. +* + CALL DGER( M-J, N-J, -ONE, A( J+1, J ), 1, A( J, J+1 ), LDA, + $ A( J+1, J+1 ), LDA ) + END IF + 10 CONTINUE + RETURN +* +* End of DGETF2 +* + END diff --git a/reference/dgetrff.f b/reference/dgetrff.f new file mode 100644 index 0000000..1425596 --- /dev/null +++ b/reference/dgetrff.f @@ -0,0 +1,156 @@ + SUBROUTINE DGETRFF( M, N, A, LDA, IPIV, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* March 31, 1993 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + DOUBLE PRECISION A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* DGETRF computes an LU factorization of a general M-by-N matrix A +* using partial pivoting with row interchanges. +* +* The factorization has the form +* A = P * L * U +* where P is a permutation matrix, L is lower triangular with unit +* diagonal elements (lower trapezoidal if m > n), and U is upper +* triangular (upper trapezoidal if m < n). +* +* This is the right-looking Level 3 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the M-by-N matrix to be factored. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* IPIV (output) INTEGER array, dimension (min(M,N)) +* The pivot indices; for 1 <= i <= min(M,N), row i of the +* matrix was interchanged with row IPIV(i). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, and division by zero will occur if it is used +* to solve a system of equations. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE + PARAMETER ( ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + INTEGER I, IINFO, J, JB, NB +* .. +* .. External Subroutines .. + EXTERNAL DGEMM, DGETF2, DLASWP, DTRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGETRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = 64 + IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN +* +* Use unblocked code. +* + CALL DGETF2( M, N, A, LDA, IPIV, INFO ) + ELSE +* +* Use blocked code. +* + DO 20 J = 1, MIN( M, N ), NB + JB = MIN( MIN( M, N )-J+1, NB ) +* +* Factor diagonal and subdiagonal blocks and test for exact +* singularity. +* + CALL DGETF2( M-J+1, JB, A( J, J ), LDA, IPIV( J ), IINFO ) +* +* Adjust INFO and the pivot indices. +* + IF( INFO.EQ.0 .AND. IINFO.GT.0 ) + $ INFO = IINFO + J - 1 + DO 10 I = J, MIN( M, J+JB-1 ) + IPIV( I ) = J - 1 + IPIV( I ) + 10 CONTINUE +* +* Apply interchanges to columns 1:J-1. +* + CALL DLASWP( J-1, A, LDA, J, J+JB-1, IPIV, 1 ) +* + IF( J+JB.LE.N ) THEN +* +* Apply interchanges to columns J+JB:N. +* + CALL DLASWP( N-J-JB+1, A( 1, J+JB ), LDA, J, J+JB-1, + $ IPIV, 1 ) +* +* Compute block row of U. +* + CALL DTRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB, + $ N-J-JB+1, ONE, A( J, J ), LDA, A( J, J+JB ), + $ LDA ) + IF( J+JB.LE.M ) THEN +* +* Update trailing submatrix. +* + CALL DGEMM( 'No transpose', 'No transpose', M-J-JB+1, + $ N-J-JB+1, JB, -ONE, A( J+JB, J ), LDA, + $ A( J, J+JB ), LDA, ONE, A( J+JB, J+JB ), + $ LDA ) + END IF + END IF + 20 CONTINUE + END IF + RETURN +* +* End of DGETRF +* + END diff --git a/reference/dgetrsf.f b/reference/dgetrsf.f new file mode 100644 index 0000000..86624cb --- /dev/null +++ b/reference/dgetrsf.f @@ -0,0 +1,150 @@ + SUBROUTINE DGETRSF( TRANS, N, NRHS, A, LDA, IPIV, B, LDB, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* March 31, 1993 +* +* .. Scalar Arguments .. + CHARACTER TRANS + INTEGER INFO, LDA, LDB, N, NRHS +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + DOUBLE PRECISION A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* DGETRS solves a system of linear equations +* A * X = B or A' * X = B +* with a general N-by-N matrix A using the LU factorization computed +* by DGETRF. +* +* Arguments +* ========= +* +* TRANS (input) CHARACTER*1 +* Specifies the form of the system of equations: +* = 'N': A * X = B (No transpose) +* = 'T': A'* X = B (Transpose) +* = 'C': A'* X = B (Conjugate transpose = Transpose) +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* NRHS (input) INTEGER +* The number of right hand sides, i.e., the number of columns +* of the matrix B. NRHS >= 0. +* +* A (input) DOUBLE PRECISION array, dimension (LDA,N) +* The factors L and U from the factorization A = P*L*U +* as computed by DGETRF. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* IPIV (input) INTEGER array, dimension (N) +* The pivot indices from DGETRF; for 1<=i<=N, row i of the +* matrix was interchanged with row IPIV(i). +* +* B (input/output) DOUBLE PRECISION array, dimension (LDB,NRHS) +* On entry, the right hand side matrix B. +* On exit, the solution matrix X. +* +* LDB (input) INTEGER +* The leading dimension of the array B. LDB >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE + PARAMETER ( ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL NOTRAN +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL DLASWP, DTRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + NOTRAN = LSAME( TRANS, 'N' ) + IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -8 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGETRS', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 .OR. NRHS.EQ.0 ) + $ RETURN +* + IF( NOTRAN ) THEN +* +* Solve A * X = B. +* +* Apply row interchanges to the right hand sides. +* + CALL DLASWP( NRHS, B, LDB, 1, N, IPIV, 1 ) +* +* Solve L*X = B, overwriting B with X. +* + CALL DTRSM( 'Left', 'Lower', 'No transpose', 'Unit', N, NRHS, + $ ONE, A, LDA, B, LDB ) +* +* Solve U*X = B, overwriting B with X. +* + CALL DTRSM( 'Left', 'Upper', 'No transpose', 'Non-unit', N, + $ NRHS, ONE, A, LDA, B, LDB ) + ELSE +* +* Solve A' * X = B. +* +* Solve U'*X = B, overwriting B with X. +* + CALL DTRSM( 'Left', 'Upper', 'Transpose', 'Non-unit', N, NRHS, + $ ONE, A, LDA, B, LDB ) +* +* Solve L'*X = B, overwriting B with X. +* + CALL DTRSM( 'Left', 'Lower', 'Transpose', 'Unit', N, NRHS, ONE, + $ A, LDA, B, LDB ) +* +* Apply row interchanges to the solution vectors. +* + CALL DLASWP( NRHS, B, LDB, 1, N, IPIV, -1 ) + END IF +* + RETURN +* +* End of DGETRS +* + END diff --git a/reference/dlaswpf.f b/reference/dlaswpf.f new file mode 100644 index 0000000..1e83dbe --- /dev/null +++ b/reference/dlaswpf.f @@ -0,0 +1,120 @@ + SUBROUTINE DLASWPF( N, A, LDA, K1, K2, IPIV, INCX ) +* +* -- LAPACK auxiliary routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* June 30, 1999 +* +* .. Scalar Arguments .. + INTEGER INCX, K1, K2, LDA, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + DOUBLE PRECISION A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* DLASWP performs a series of row interchanges on the matrix A. +* One row interchange is initiated for each of rows K1 through K2 of A. +* +* Arguments +* ========= +* +* N (input) INTEGER +* The number of columns of the matrix A. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the matrix of column dimension N to which the row +* interchanges will be applied. +* On exit, the permuted matrix. +* +* LDA (input) INTEGER +* The leading dimension of the array A. +* +* K1 (input) INTEGER +* The first element of IPIV for which a row interchange will +* be done. +* +* K2 (input) INTEGER +* The last element of IPIV for which a row interchange will +* be done. +* +* IPIV (input) INTEGER array, dimension (M*abs(INCX)) +* The vector of pivot indices. Only the elements in positions +* K1 through K2 of IPIV are accessed. +* IPIV(K) = L implies rows K and L are to be interchanged. +* +* INCX (input) INTEGER +* The increment between successive values of IPIV. If IPIV +* is negative, the pivots are applied in reverse order. +* +* Further Details +* =============== +* +* Modified by +* R. C. Whaley, Computer Science Dept., Univ. of Tenn., Knoxville, USA +* +* ===================================================================== +* +* .. Local Scalars .. + INTEGER I, I1, I2, INC, IP, IX, IX0, J, K, N32 + DOUBLE PRECISION TEMP +* .. +* .. Executable Statements .. +* +* Interchange row I with row IPIV(I) for each of rows K1 through K2. +* + IF( INCX.GT.0 ) THEN + IX0 = K1 + I1 = K1 + I2 = K2 + INC = 1 + ELSE IF( INCX.LT.0 ) THEN + IX0 = 1 + ( 1-K2 )*INCX + I1 = K2 + I2 = K1 + INC = -1 + ELSE + RETURN + END IF +* + N32 = ( N / 32 )*32 + IF( N32.NE.0 ) THEN + DO 30 J = 1, N32, 32 + IX = IX0 + DO 20 I = I1, I2, INC + IP = IPIV( IX ) + IF( IP.NE.I ) THEN + DO 10 K = J, J + 31 + TEMP = A( I, K ) + A( I, K ) = A( IP, K ) + A( IP, K ) = TEMP + 10 CONTINUE + END IF + IX = IX + INCX + 20 CONTINUE + 30 CONTINUE + END IF + IF( N32.NE.N ) THEN + N32 = N32 + 1 + IX = IX0 + DO 50 I = I1, I2, INC + IP = IPIV( IX ) + IF( IP.NE.I ) THEN + DO 40 K = N32, N + TEMP = A( I, K ) + A( I, K ) = A( IP, K ) + A( IP, K ) = TEMP + 40 CONTINUE + END IF + IX = IX + INCX + 50 CONTINUE + END IF +* + RETURN +* +* End of DLASWP +* + END diff --git a/reference/dlauu2f.f b/reference/dlauu2f.f new file mode 100644 index 0000000..0f957b4 --- /dev/null +++ b/reference/dlauu2f.f @@ -0,0 +1,135 @@ + SUBROUTINE DLAUU2F( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK auxiliary routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* DLAUU2 computes the product U * U' or L' * L, where the triangular +* factor U or L is stored in the upper or lower triangular part of +* the array A. +* +* If UPLO = 'U' or 'u' then the upper triangle of the result is stored, +* overwriting the factor U in A. +* If UPLO = 'L' or 'l' then the lower triangle of the result is stored, +* overwriting the factor L in A. +* +* This is the unblocked form of the algorithm, calling Level 2 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the triangular factor stored in the array A +* is upper or lower triangular: +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* N (input) INTEGER +* The order of the triangular factor U or L. N >= 0. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the triangular factor U or L. +* On exit, if UPLO = 'U', the upper triangle of A is +* overwritten with the upper triangle of the product U * U'; +* if UPLO = 'L', the lower triangle of A is overwritten with +* the lower triangle of the product L' * L. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE + PARAMETER ( ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I + DOUBLE PRECISION AII +* .. +* .. External Functions .. + LOGICAL LSAME + DOUBLE PRECISION DDOT + EXTERNAL LSAME, DDOT +* .. +* .. External Subroutines .. + EXTERNAL DGEMV, DSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DLAUU2', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( UPPER ) THEN +* +* Compute the product U * U'. +* + DO 10 I = 1, N + AII = A( I, I ) + IF( I.LT.N ) THEN + A( I, I ) = DDOT( N-I+1, A( I, I ), LDA, A( I, I ), LDA ) + CALL DGEMV( 'No transpose', I-1, N-I, ONE, A( 1, I+1 ), + $ LDA, A( I, I+1 ), LDA, AII, A( 1, I ), 1 ) + ELSE + CALL DSCAL( I, AII, A( 1, I ), 1 ) + END IF + 10 CONTINUE +* + ELSE +* +* Compute the product L' * L. +* + DO 20 I = 1, N + AII = A( I, I ) + IF( I.LT.N ) THEN + A( I, I ) = DDOT( N-I+1, A( I, I ), 1, A( I, I ), 1 ) + CALL DGEMV( 'Transpose', N-I, I-1, ONE, A( I+1, 1 ), LDA, + $ A( I+1, I ), 1, AII, A( I, 1 ), LDA ) + ELSE + CALL DSCAL( I, AII, A( I, 1 ), LDA ) + END IF + 20 CONTINUE + END IF +* + RETURN +* +* End of DLAUU2 +* + END diff --git a/reference/dlauumf.f b/reference/dlauumf.f new file mode 100644 index 0000000..c0584cc --- /dev/null +++ b/reference/dlauumf.f @@ -0,0 +1,155 @@ + SUBROUTINE DLAUUMF( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK auxiliary routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* February 29, 1992 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* DLAUUM computes the product U * U' or L' * L, where the triangular +* factor U or L is stored in the upper or lower triangular part of +* the array A. +* +* If UPLO = 'U' or 'u' then the upper triangle of the result is stored, +* overwriting the factor U in A. +* If UPLO = 'L' or 'l' then the lower triangle of the result is stored, +* overwriting the factor L in A. +* +* This is the blocked form of the algorithm, calling Level 3 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the triangular factor stored in the array A +* is upper or lower triangular: +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* N (input) INTEGER +* The order of the triangular factor U or L. N >= 0. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the triangular factor U or L. +* On exit, if UPLO = 'U', the upper triangle of A is +* overwritten with the upper triangle of the product U * U'; +* if UPLO = 'L', the lower triangle of A is overwritten with +* the lower triangle of the product L' * L. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE + PARAMETER ( ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, IB, NB +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL DGEMM, DLAUU2, DSYRK, DTRMM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DLAUUM', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = 128 +* + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code +* + CALL DLAUU2( UPLO, N, A, LDA, INFO ) + ELSE +* +* Use blocked code +* + IF( UPPER ) THEN +* +* Compute the product U * U'. +* + DO 10 I = 1, N, NB + IB = MIN( NB, N-I+1 ) + CALL DTRMM( 'Right', 'Upper', 'Transpose', 'Non-unit', + $ I-1, IB, ONE, A( I, I ), LDA, A( 1, I ), + $ LDA ) + CALL DLAUU2( 'Upper', IB, A( I, I ), LDA, INFO ) + IF( I+IB.LE.N ) THEN + CALL DGEMM( 'No transpose', 'Transpose', I-1, IB, + $ N-I-IB+1, ONE, A( 1, I+IB ), LDA, + $ A( I, I+IB ), LDA, ONE, A( 1, I ), LDA ) + CALL DSYRK( 'Upper', 'No transpose', IB, N-I-IB+1, + $ ONE, A( I, I+IB ), LDA, ONE, A( I, I ), + $ LDA ) + END IF + 10 CONTINUE + ELSE +* +* Compute the product L' * L. +* + DO 20 I = 1, N, NB + IB = MIN( NB, N-I+1 ) + CALL DTRMM( 'Left', 'Lower', 'Transpose', 'Non-unit', IB, + $ I-1, ONE, A( I, I ), LDA, A( I, 1 ), LDA ) + CALL DLAUU2( 'Lower', IB, A( I, I ), LDA, INFO ) + IF( I+IB.LE.N ) THEN + CALL DGEMM( 'Transpose', 'No transpose', IB, I-1, + $ N-I-IB+1, ONE, A( I+IB, I ), LDA, + $ A( I+IB, 1 ), LDA, ONE, A( I, 1 ), LDA ) + CALL DSYRK( 'Lower', 'Transpose', IB, N-I-IB+1, ONE, + $ A( I+IB, I ), LDA, ONE, A( I, I ), LDA ) + END IF + 20 CONTINUE + END IF + END IF +* + RETURN +* +* End of DLAUUM +* + END diff --git a/reference/dmaxf.f b/reference/dmaxf.f new file mode 100644 index 0000000..11a7322 --- /dev/null +++ b/reference/dmaxf.f @@ -0,0 +1,36 @@ + REAL*8 function dmaxf(n,dx,incx) +c +c finds the index of element having max. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision dx(*) + integer i,incx,ix,n +c + dmaxf = 0 + if( n.lt.1 .or. incx.le.0 ) return + + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + dmaxf = dx(1) + ix = ix + incx + do 10 i = 2,n + if(dx(ix).le.dmaxf) go to 5 + dmaxf = dx(ix) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 dmaxf = dx(1) + do 30 i = 2,n + if(dx(i).le.dmaxf) go to 30 + dmaxf = dx(i) + 30 continue + return + end diff --git a/reference/dminf.f b/reference/dminf.f new file mode 100644 index 0000000..497fb53 --- /dev/null +++ b/reference/dminf.f @@ -0,0 +1,36 @@ + REAL*8 function dminf(n,dx,incx) +c +c finds the index of element having min. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision dx(*) + integer i,incx,ix,n +c + dminf = 0 + if( n.lt.1 .or. incx.le.0 ) return + + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + dminf = dx(1) + ix = ix + incx + do 10 i = 2,n + if(dx(ix).ge.dminf) go to 5 + dminf = dx(ix) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 dminf = dx(1) + do 30 i = 2,n + if(dx(i).ge.dminf) go to 30 + dminf = dx(i) + 30 continue + return + end diff --git a/reference/dnrm2f.f b/reference/dnrm2f.f new file mode 100644 index 0000000..2a4b6f2 --- /dev/null +++ b/reference/dnrm2f.f @@ -0,0 +1,61 @@ + DOUBLE PRECISION FUNCTION DNRM2F ( N, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, N +* .. Array Arguments .. + DOUBLE PRECISION X( * ) +* .. +* +* DNRM2 returns the euclidean norm of a vector via the function +* name, so that +* +* DNRM2 := sqrt( x'*x ) +* +* +* +* -- This version written on 25-October-1982. +* Modified on 14-October-1993 to inline the call to DLASSQ. +* Sven Hammarling, Nag Ltd. +* +* +* .. Parameters .. + DOUBLE PRECISION ONE , ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. Local Scalars .. + INTEGER IX + DOUBLE PRECISION ABSXI, NORM, SCALE, SSQ +* .. Intrinsic Functions .. + INTRINSIC ABS, SQRT +* .. +* .. Executable Statements .. + + IF( N.LT.1 .OR. INCX.LT.1 )THEN + NORM = ZERO + ELSE IF( N.EQ.1 )THEN + NORM = ABS( X( 1 ) ) + ELSE + SCALE = ZERO + SSQ = ONE +* The following loop is equivalent to this call to the LAPACK +* auxiliary routine: +* CALL DLASSQ( N, X, INCX, SCALE, SSQ ) +* + DO 10, IX = 1, 1 + ( N - 1 )*INCX, INCX + IF( X( IX ).NE.ZERO )THEN + ABSXI = ABS( X( IX ) ) + IF( SCALE.LT.ABSXI )THEN + SSQ = ONE + SSQ*( SCALE/ABSXI )**2 + SCALE = ABSXI + ELSE + SSQ = SSQ + ( ABSXI/SCALE )**2 + END IF + END IF + 10 CONTINUE + NORM = SCALE * SQRT( SSQ ) + END IF +* + DNRM2F = NORM + RETURN +* +* End of DNRM2. +* + END diff --git a/reference/dpotf2f.f b/reference/dpotf2f.f new file mode 100644 index 0000000..9327263 --- /dev/null +++ b/reference/dpotf2f.f @@ -0,0 +1,168 @@ + SUBROUTINE DPOTF2F( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* February 29, 1992 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* DPOTF2 computes the Cholesky factorization of a real symmetric +* positive definite matrix A. +* +* The factorization has the form +* A = U' * U , if UPLO = 'U', or +* A = L * L', if UPLO = 'L', +* where U is an upper triangular matrix and L is lower triangular. +* +* This is the unblocked version of the algorithm, calling Level 2 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the upper or lower triangular part of the +* symmetric matrix A is stored. +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the symmetric matrix A. If UPLO = 'U', the leading +* n by n upper triangular part of A contains the upper +* triangular part of the matrix A, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading n by n lower triangular part of A contains the lower +* triangular part of the matrix A, and the strictly upper +* triangular part of A is not referenced. +* +* On exit, if INFO = 0, the factor U or L from the Cholesky +* factorization A = U'*U or A = L*L'. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* > 0: if INFO = k, the leading minor of order k is not +* positive definite, and the factorization could not be +* completed. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER J + DOUBLE PRECISION AJJ +* .. +* .. External Functions .. + LOGICAL LSAME + DOUBLE PRECISION DDOT + EXTERNAL LSAME, DDOT +* .. +* .. External Subroutines .. + EXTERNAL DGEMV, DSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DPOTF2', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( UPPER ) THEN +* +* Compute the Cholesky factorization A = U'*U. +* + DO 10 J = 1, N +* +* Compute U(J,J) and test for non-positive-definiteness. +* + AJJ = A( J, J ) - DDOT( J-1, A( 1, J ), 1, A( 1, J ), 1 ) + IF( AJJ.LE.ZERO ) THEN + A( J, J ) = AJJ + GO TO 30 + END IF + AJJ = SQRT( AJJ ) + A( J, J ) = AJJ +* +* Compute elements J+1:N of row J. +* + IF( J.LT.N ) THEN + CALL DGEMV( 'Transpose', J-1, N-J, -ONE, A( 1, J+1 ), + $ LDA, A( 1, J ), 1, ONE, A( J, J+1 ), LDA ) + CALL DSCAL( N-J, ONE / AJJ, A( J, J+1 ), LDA ) + END IF + 10 CONTINUE + ELSE +* +* Compute the Cholesky factorization A = L*L'. +* + DO 20 J = 1, N +* +* Compute L(J,J) and test for non-positive-definiteness. +* + AJJ = A( J, J ) - DDOT( J-1, A( J, 1 ), LDA, A( J, 1 ), + $ LDA ) + IF( AJJ.LE.ZERO ) THEN + A( J, J ) = AJJ + GO TO 30 + END IF + AJJ = SQRT( AJJ ) + A( J, J ) = AJJ +* +* Compute elements J+1:N of column J. +* + IF( J.LT.N ) THEN + CALL DGEMV( 'No transpose', N-J, J-1, -ONE, A( J+1, 1 ), + $ LDA, A( J, 1 ), LDA, ONE, A( J+1, J ), 1 ) + CALL DSCAL( N-J, ONE / AJJ, A( J+1, J ), 1 ) + END IF + 20 CONTINUE + END IF + GO TO 40 +* + 30 CONTINUE + INFO = J +* + 40 CONTINUE + RETURN +* +* End of DPOTF2 +* + END diff --git a/reference/dpotrff.f b/reference/dpotrff.f new file mode 100644 index 0000000..10faf05 --- /dev/null +++ b/reference/dpotrff.f @@ -0,0 +1,184 @@ + SUBROUTINE DPOTRFF( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* March 31, 1993 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* DPOTRF computes the Cholesky factorization of a real symmetric +* positive definite matrix A. +* +* The factorization has the form +* A = U**T * U, if UPLO = 'U', or +* A = L * L**T, if UPLO = 'L', +* where U is an upper triangular matrix and L is lower triangular. +* +* This is the block version of the algorithm, calling Level 3 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': Upper triangle of A is stored; +* = 'L': Lower triangle of A is stored. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the symmetric matrix A. If UPLO = 'U', the leading +* N-by-N upper triangular part of A contains the upper +* triangular part of the matrix A, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading N-by-N lower triangular part of A contains the lower +* triangular part of the matrix A, and the strictly upper +* triangular part of A is not referenced. +* +* On exit, if INFO = 0, the factor U or L from the Cholesky +* factorization A = U**T*U or A = L*L**T. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, the leading minor of order i is not +* positive definite, and the factorization could not be +* completed. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE + PARAMETER ( ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER J, JB, NB +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL DGEMM, DPOTF2, DSYRK, DTRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DPOTRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = 224 + + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code. +* + CALL DPOTF2( UPLO, N, A, LDA, INFO ) + ELSE +* +* Use blocked code. +* + IF( UPPER ) THEN +* +* Compute the Cholesky factorization A = U'*U. +* + DO 10 J = 1, N, NB +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + JB = MIN( NB, N-J+1 ) + CALL DSYRK( 'Upper', 'Transpose', JB, J-1, -ONE, + $ A( 1, J ), LDA, ONE, A( J, J ), LDA ) + CALL DPOTF2( 'Upper', JB, A( J, J ), LDA, INFO ) + IF( INFO.NE.0 ) + $ GO TO 30 + IF( J+JB.LE.N ) THEN +* +* Compute the current block row. +* + CALL DGEMM( 'Transpose', 'No transpose', JB, N-J-JB+1, + $ J-1, -ONE, A( 1, J ), LDA, A( 1, J+JB ), + $ LDA, ONE, A( J, J+JB ), LDA ) + CALL DTRSM( 'Left', 'Upper', 'Transpose', 'Non-unit', + $ JB, N-J-JB+1, ONE, A( J, J ), LDA, + $ A( J, J+JB ), LDA ) + END IF + 10 CONTINUE +* + ELSE +* +* Compute the Cholesky factorization A = L*L'. +* + DO 20 J = 1, N, NB +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + JB = MIN( NB, N-J+1 ) + CALL DSYRK( 'Lower', 'No transpose', JB, J-1, -ONE, + $ A( J, 1 ), LDA, ONE, A( J, J ), LDA ) + CALL DPOTF2( 'Lower', JB, A( J, J ), LDA, INFO ) + IF( INFO.NE.0 ) + $ GO TO 30 + IF( J+JB.LE.N ) THEN +* +* Compute the current block column. +* + CALL DGEMM( 'No transpose', 'Transpose', N-J-JB+1, JB, + $ J-1, -ONE, A( J+JB, 1 ), LDA, A( J, 1 ), + $ LDA, ONE, A( J+JB, J ), LDA ) + CALL DTRSM( 'Right', 'Lower', 'Transpose', 'Non-unit', + $ N-J-JB+1, JB, ONE, A( J, J ), LDA, + $ A( J+JB, J ), LDA ) + END IF + 20 CONTINUE + END IF + END IF + GO TO 40 +* + 30 CONTINUE + INFO = INFO + J - 1 +* + 40 CONTINUE + RETURN +* +* End of DPOTRF +* + END diff --git a/reference/dpotrif.f b/reference/dpotrif.f new file mode 100644 index 0000000..2027042 --- /dev/null +++ b/reference/dpotrif.f @@ -0,0 +1,96 @@ + SUBROUTINE DPOTRIF( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* DPOTRI computes the inverse of a real symmetric positive definite +* matrix A using the Cholesky factorization A = U**T*U or A = L*L**T +* computed by DPOTRF. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': Upper triangle of A is stored; +* = 'L': Lower triangle of A is stored. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the triangular factor U or L from the Cholesky +* factorization A = U**T*U or A = L*L**T, as computed by +* DPOTRF. +* On exit, the upper or lower triangle of the (symmetric) +* inverse of A, overwriting the input factor U or L. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, the (i,i) element of the factor U or L is +* zero, and the inverse could not be computed. +* +* ===================================================================== +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL DLAUUM, DTRTRI, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DPOTRI', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Invert the triangular Cholesky factor U or L. +* + CALL DTRTRI( UPLO, 'Non-unit', N, A, LDA, INFO ) + IF( INFO.GT.0 ) + $ RETURN +* +* Form inv(U)*inv(U)' or inv(L)'*inv(L). +* + CALL DLAUUM( UPLO, N, A, LDA, INFO ) +* + RETURN +* +* End of DPOTRI +* + END diff --git a/reference/drotf.f b/reference/drotf.f new file mode 100644 index 0000000..70525ad --- /dev/null +++ b/reference/drotf.f @@ -0,0 +1,37 @@ + subroutine drotf (n,dx,incx,dy,incy,c,s) +c +c applies a plane rotation. +c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision dx(*),dy(*),dtemp,c,s + integer i,incx,incy,ix,iy,n +c + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments not equal +c to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + dtemp = c*dx(ix) + s*dy(iy) + dy(iy) = c*dy(iy) - s*dx(ix) + dx(ix) = dtemp + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c + 20 do 30 i = 1,n + dtemp = c*dx(i) + s*dy(i) + dy(i) = c*dy(i) - s*dx(i) + dx(i) = dtemp + 30 continue + return + end diff --git a/reference/drotgf.f b/reference/drotgf.f new file mode 100644 index 0000000..265a7cd --- /dev/null +++ b/reference/drotgf.f @@ -0,0 +1,27 @@ + subroutine drotgf(da,db,c,s) +c +c construct givens plane rotation. +c jack dongarra, linpack, 3/11/78. +c + double precision da,db,c,s,roe,scale,r,z +c + roe = db + if( dabs(da) .gt. dabs(db) ) roe = da + scale = dabs(da) + dabs(db) + if( scale .ne. 0.0d0 ) go to 10 + c = 1.0d0 + s = 0.0d0 + r = 0.0d0 + z = 0.0d0 + go to 20 + 10 r = scale*dsqrt((da/scale)**2 + (db/scale)**2) + r = dsign(1.0d0,roe)*r + c = da/r + s = db/r + z = 1.0d0 + if( dabs(da) .gt. dabs(db) ) z = s + if( dabs(db) .ge. dabs(da) .and. c .ne. 0.0d0 ) z = 1.0d0/c + 20 da = r + db = z + return + end diff --git a/reference/drotmf.f b/reference/drotmf.f new file mode 100644 index 0000000..7447680 --- /dev/null +++ b/reference/drotmf.f @@ -0,0 +1,108 @@ + SUBROUTINE DROTMF (N,DX,INCX,DY,INCY,DPARAM) +C +C APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX +C +C (DX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF DX ARE IN +C (DY**T) +C +C DX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE +C LX = (-INCX)*N, AND SIMILARLY FOR SY USING LY AND INCY. +C WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. +C +C DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0 +C +C (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0) +C H=( ) ( ) ( ) ( ) +C (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0). +C SEE DROTMG FOR A DESCRIPTION OF DATA STORAGE IN DPARAM. +C + DOUBLE PRECISION DFLAG,DH12,DH22,DX,TWO,Z,DH11,DH21, + 1 DPARAM,DY,W,ZERO + DIMENSION DX(1),DY(1),DPARAM(5) + DATA ZERO,TWO/0.D0,2.D0/ +C + DFLAG=DPARAM(1) + IF(N .LE. 0 .OR.(DFLAG+TWO.EQ.ZERO)) GO TO 140 + IF(.NOT.(INCX.EQ.INCY.AND. INCX .GT.0)) GO TO 70 +C + NSTEPS=N*INCX + IF(DFLAG) 50,10,30 + 10 CONTINUE + DH12=DPARAM(4) + DH21=DPARAM(3) + DO 20 I=1,NSTEPS,INCX + W=DX(I) + Z=DY(I) + DX(I)=W+Z*DH12 + DY(I)=W*DH21+Z + 20 CONTINUE + GO TO 140 + 30 CONTINUE + DH11=DPARAM(2) + DH22=DPARAM(5) + DO 40 I=1,NSTEPS,INCX + W=DX(I) + Z=DY(I) + DX(I)=W*DH11+Z + DY(I)=-W+DH22*Z + 40 CONTINUE + GO TO 140 + 50 CONTINUE + DH11=DPARAM(2) + DH12=DPARAM(4) + DH21=DPARAM(3) + DH22=DPARAM(5) + DO 60 I=1,NSTEPS,INCX + W=DX(I) + Z=DY(I) + DX(I)=W*DH11+Z*DH12 + DY(I)=W*DH21+Z*DH22 + 60 CONTINUE + GO TO 140 + 70 CONTINUE + KX=1 + KY=1 + IF(INCX .LT. 0) KX=1+(1-N)*INCX + IF(INCY .LT. 0) KY=1+(1-N)*INCY +C + IF(DFLAG)120,80,100 + 80 CONTINUE + DH12=DPARAM(4) + DH21=DPARAM(3) + DO 90 I=1,N + W=DX(KX) + Z=DY(KY) + DX(KX)=W+Z*DH12 + DY(KY)=W*DH21+Z + KX=KX+INCX + KY=KY+INCY + 90 CONTINUE + GO TO 140 + 100 CONTINUE + DH11=DPARAM(2) + DH22=DPARAM(5) + DO 110 I=1,N + W=DX(KX) + Z=DY(KY) + DX(KX)=W*DH11+Z + DY(KY)=-W+DH22*Z + KX=KX+INCX + KY=KY+INCY + 110 CONTINUE + GO TO 140 + 120 CONTINUE + DH11=DPARAM(2) + DH12=DPARAM(4) + DH21=DPARAM(3) + DH22=DPARAM(5) + DO 130 I=1,N + W=DX(KX) + Z=DY(KY) + DX(KX)=W*DH11+Z*DH12 + DY(KY)=W*DH21+Z*DH22 + KX=KX+INCX + KY=KY+INCY + 130 CONTINUE + 140 CONTINUE + RETURN + END diff --git a/reference/drotmgf.f b/reference/drotmgf.f new file mode 100644 index 0000000..bc9c03e --- /dev/null +++ b/reference/drotmgf.f @@ -0,0 +1,169 @@ + SUBROUTINE DROTMGF (DD1,DD2,DX1,DY1,DPARAM) +C +C CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS +C THE SECOND COMPONENT OF THE 2-VECTOR (DSQRT(DD1)*DX1,DSQRT(DD2)* +C DY2)**T. +C WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. +C +C DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0 +C +C (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0) +C H=( ) ( ) ( ) ( ) +C (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0). +C LOCATIONS 2-4 OF DPARAM CONTAIN DH11, DH21, DH12, AND DH22 +C RESPECTIVELY. (VALUES OF 1.D0, -1.D0, OR 0.D0 IMPLIED BY THE +C VALUE OF DPARAM(1) ARE NOT STORED IN DPARAM.) +C +C THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE +C INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE +C OF DD1 AND DD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. +C + DOUBLE PRECISION GAM,ONE,RGAMSQ,DD2,DH11,DH21,DPARAM,DP2, + 1 DQ2,DU,DY1,ZERO,GAMSQ,DD1,DFLAG,DH12,DH22,DP1,DQ1, + 2 DTEMP,DX1,TWO + DIMENSION DPARAM(5) +C + DATA ZERO,ONE,TWO /0.D0,1.D0,2.D0/ + DATA GAM,GAMSQ,RGAMSQ/4096.D0,16777216.D0,5.9604645D-8/ + IF(.NOT. DD1 .LT. ZERO) GO TO 10 +C GO ZERO-H-D-AND-DX1.. + GO TO 60 + 10 CONTINUE +C CASE-DD1-NONNEGATIVE + DP2=DD2*DY1 + IF(.NOT. DP2 .EQ. ZERO) GO TO 20 + DFLAG=-TWO + GO TO 260 +C REGULAR-CASE.. + 20 CONTINUE + DP1=DD1*DX1 + DQ2=DP2*DY1 + DQ1=DP1*DX1 +C + IF(.NOT. DABS(DQ1) .GT. DABS(DQ2)) GO TO 40 + DH21=-DY1/DX1 + DH12=DP2/DP1 +C + DU=ONE-DH12*DH21 +C + IF(.NOT. DU .LE. ZERO) GO TO 30 +C GO ZERO-H-D-AND-DX1.. + GO TO 60 + 30 CONTINUE + DFLAG=ZERO + DD1=DD1/DU + DD2=DD2/DU + DX1=DX1*DU +C GO SCALE-CHECK.. + GO TO 100 + 40 CONTINUE + IF(.NOT. DQ2 .LT. ZERO) GO TO 50 +C GO ZERO-H-D-AND-DX1.. + GO TO 60 + 50 CONTINUE + DFLAG=ONE + DH11=DP1/DP2 + DH22=DX1/DY1 + DU=ONE+DH11*DH22 + DTEMP=DD2/DU + DD2=DD1/DU + DD1=DTEMP + DX1=DY1*DU +C GO SCALE-CHECK + GO TO 100 +C PROCEDURE..ZERO-H-D-AND-DX1.. + 60 CONTINUE + DFLAG=-ONE + DH11=ZERO + DH12=ZERO + DH21=ZERO + DH22=ZERO +C + DD1=ZERO + DD2=ZERO + DX1=ZERO +C RETURN.. + GO TO 220 +C PROCEDURE..FIX-H.. + 70 CONTINUE + IF(.NOT. DFLAG .GE. ZERO) GO TO 90 +C + IF(.NOT. DFLAG .EQ. ZERO) GO TO 80 + DH11=ONE + DH22=ONE + DFLAG=-ONE + GO TO 90 + 80 CONTINUE + DH21=-ONE + DH12=ONE + DFLAG=-ONE + 90 CONTINUE + GO TO IGO,(120,150,180,210) +C PROCEDURE..SCALE-CHECK + 100 CONTINUE + 110 CONTINUE + IF(.NOT. DD1 .LE. RGAMSQ) GO TO 130 + IF(DD1 .EQ. ZERO) GO TO 160 + ASSIGN 120 TO IGO +C FIX-H.. + GO TO 70 + 120 CONTINUE + DD1=DD1*GAM**2 + DX1=DX1/GAM + DH11=DH11/GAM + DH12=DH12/GAM + GO TO 110 + 130 CONTINUE + 140 CONTINUE + IF(.NOT. DD1 .GE. GAMSQ) GO TO 160 + ASSIGN 150 TO IGO +C FIX-H.. + GO TO 70 + 150 CONTINUE + DD1=DD1/GAM**2 + DX1=DX1*GAM + DH11=DH11*GAM + DH12=DH12*GAM + GO TO 140 + 160 CONTINUE + 170 CONTINUE + IF(.NOT. DABS(DD2) .LE. RGAMSQ) GO TO 190 + IF(DD2 .EQ. ZERO) GO TO 220 + ASSIGN 180 TO IGO +C FIX-H.. + GO TO 70 + 180 CONTINUE + DD2=DD2*GAM**2 + DH21=DH21/GAM + DH22=DH22/GAM + GO TO 170 + 190 CONTINUE + 200 CONTINUE + IF(.NOT. DABS(DD2) .GE. GAMSQ) GO TO 220 + ASSIGN 210 TO IGO +C FIX-H.. + GO TO 70 + 210 CONTINUE + DD2=DD2/GAM**2 + DH21=DH21*GAM + DH22=DH22*GAM + GO TO 200 + 220 CONTINUE + IF(DFLAG)250,230,240 + 230 CONTINUE + DPARAM(3)=DH21 + DPARAM(4)=DH12 + GO TO 260 + 240 CONTINUE + DPARAM(2)=DH11 + DPARAM(5)=DH22 + GO TO 260 + 250 CONTINUE + DPARAM(2)=DH11 + DPARAM(3)=DH21 + DPARAM(4)=DH12 + DPARAM(5)=DH22 + 260 CONTINUE + DPARAM(1)=DFLAG + RETURN + END diff --git a/reference/dsbmvf.f b/reference/dsbmvf.f new file mode 100644 index 0000000..7a882a3 --- /dev/null +++ b/reference/dsbmvf.f @@ -0,0 +1,303 @@ + SUBROUTINE DSBMVF( UPLO, N, K, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA, BETA + INTEGER INCX, INCY, K, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* DSBMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n symmetric band matrix, with k super-diagonals. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the band matrix A is being supplied as +* follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* being supplied. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* being supplied. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry, K specifies the number of super-diagonals of the +* matrix A. K must satisfy 0 .le. K. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) +* by n part of the array A must contain the upper triangular +* band part of the symmetric matrix, supplied column by +* column, with the leading diagonal of the matrix in row +* ( k + 1 ) of the array, the first super-diagonal starting at +* position 2 in row k, and so on. The top left k by k triangle +* of the array A is not referenced. +* The following program segment will transfer the upper +* triangular part of a symmetric band matrix from conventional +* full matrix storage to band storage: +* +* DO 20, J = 1, N +* M = K + 1 - J +* DO 10, I = MAX( 1, J - K ), J +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) +* by n part of the array A must contain the lower triangular +* band part of the symmetric matrix, supplied column by +* column, with the leading diagonal of the matrix in row 1 of +* the array, the first sub-diagonal starting at position 1 in +* row 2, and so on. The bottom right k by k triangle of the +* array A is not referenced. +* The following program segment will transfer the lower +* triangular part of a symmetric band matrix from conventional +* full matrix storage to band storage: +* +* DO 20, J = 1, N +* M = 1 - J +* DO 10, I = J, MIN( N, J + K ) +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( k + 1 ). +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - DOUBLE PRECISION. +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* Y - DOUBLE PRECISION array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the +* vector y. On exit, Y is overwritten by the updated vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ONE , ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, KPLUS1, KX, KY, L +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( K.LT.0 )THEN + INFO = 3 + ELSE IF( LDA.LT.( K + 1 ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + ELSE IF( INCY.EQ.0 )THEN + INFO = 11 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DSBMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of the array A +* are accessed sequentially with one pass through A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form y when upper triangle of A is stored. +* + KPLUS1 = K + 1 + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + L = KPLUS1 - J + DO 50, I = MAX( 1, J - K ), J - 1 + Y( I ) = Y( I ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + A( L + I, J )*X( I ) + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*A( KPLUS1, J ) + ALPHA*TEMP2 + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + L = KPLUS1 - J + DO 70, I = MAX( 1, J - K ), J - 1 + Y( IY ) = Y( IY ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + A( L + I, J )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*A( KPLUS1, J ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + IF( J.GT.K )THEN + KX = KX + INCX + KY = KY + INCY + END IF + 80 CONTINUE + END IF + ELSE +* +* Form y when lower triangle of A is stored. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 100, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*A( 1, J ) + L = 1 - J + DO 90, I = J + 1, MIN( N, J + K ) + Y( I ) = Y( I ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + A( L + I, J )*X( I ) + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*A( 1, J ) + L = 1 - J + IX = JX + IY = JY + DO 110, I = J + 1, MIN( N, J + K ) + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + A( L + I, J )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of DSBMV . +* + END diff --git a/reference/dscalf.f b/reference/dscalf.f new file mode 100644 index 0000000..84d8898 --- /dev/null +++ b/reference/dscalf.f @@ -0,0 +1,43 @@ + subroutine dscalf(n,da,dx,incx) +c +c scales a vector by a constant. +c uses unrolled loops for increment equal to one. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision da,dx(*) + integer i,incx,m,mp1,n,nincx +c + if( n.le.0 .or. incx.le.0 )return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + nincx = n*incx + do 10 i = 1,nincx,incx + dx(i) = da*dx(i) + 10 continue + return +c +c code for increment equal to 1 +c +c +c clean-up loop +c + 20 m = mod(n,5) + if( m .eq. 0 ) go to 40 + do 30 i = 1,m + dx(i) = da*dx(i) + 30 continue + if( n .lt. 5 ) return + 40 mp1 = m + 1 + do 50 i = mp1,n,5 + dx(i) = da*dx(i) + dx(i + 1) = da*dx(i + 1) + dx(i + 2) = da*dx(i + 2) + dx(i + 3) = da*dx(i + 3) + dx(i + 4) = da*dx(i + 4) + 50 continue + return + end diff --git a/reference/dsdotf.f b/reference/dsdotf.f new file mode 100644 index 0000000..d4e183e --- /dev/null +++ b/reference/dsdotf.f @@ -0,0 +1,74 @@ +*DECK DSDOT + DOUBLE PRECISION FUNCTION DSDOTF (N, SX, INCX, SY, INCY) +C***BEGIN PROLOGUE DSDOT +C***PURPOSE Compute the inner product of two vectors with extended +C precision accumulation and result. +C***LIBRARY SLATEC (BLAS) +C***CATEGORY D1A4 +C***TYPE DOUBLE PRECISION (DSDOT-D, DCDOT-C) +C***KEYWORDS BLAS, COMPLEX VECTORS, DOT PRODUCT, INNER PRODUCT, +C LINEAR ALGEBRA, VECTOR +C***AUTHOR Lawson, C. L., (JPL) +C Hanson, R. J., (SNLA) +C Kincaid, D. R., (U. of Texas) +C Krogh, F. T., (JPL) +C***DESCRIPTION +C +C B L A S Subprogram +C Description of Parameters +C +C --Input-- +C N number of elements in input vector(s) +C SX single precision vector with N elements +C INCX storage spacing between elements of SX +C SY single precision vector with N elements +C INCY storage spacing between elements of SY +C +C --Output-- +C DSDOT double precision dot product (zero if N.LE.0) +C +C Returns D.P. dot product accumulated in D.P., for S.P. SX and SY +C DSDOT = sum for I = 0 to N-1 of SX(LX+I*INCX) * SY(LY+I*INCY), +C where LX = 1 if INCX .GE. 0, else LX = 1+(1-N)*INCX, and LY is +C defined in a similar way using INCY. +C +C***REFERENCES C. L. Lawson, R. J. Hanson, D. R. Kincaid and F. T. +C Krogh, Basic linear algebra subprograms for Fortran +C usage, Algorithm No. 539, Transactions on Mathematical +C Software 5, 3 (September 1979), pp. 308-323. +C***ROUTINES CALLED (NONE) +C***REVISION HISTORY (YYMMDD) +C 791001 DATE WRITTEN +C 890831 Modified array declarations. (WRB) +C 890831 REVISION DATE from Version 3.2 +C 891214 Prologue converted to Version 4.0 format. (BAB) +C 920310 Corrected definition of LX in DESCRIPTION. (WRB) +C 920501 Reformatted the REFERENCES section. (WRB) +C***END PROLOGUE DSDOT + REAL SX(*),SY(*) +C***FIRST EXECUTABLE STATEMENT DSDOT + DSDOTF = 0.0D0 + IF (N .LE. 0) RETURN + IF (INCX.EQ.INCY .AND. INCX.GT.0) GO TO 20 +C +C Code for unequal or nonpositive increments. +C + KX = 1 + KY = 1 + IF (INCX .LT. 0) KX = 1+(1-N)*INCX + IF (INCY .LT. 0) KY = 1+(1-N)*INCY + DO 10 I = 1,N + DSDOTF = DSDOTF + DBLE(SX(KX))*DBLE(SY(KY)) + KX = KX + INCX + KY = KY + INCY + 10 CONTINUE + RETURN +C +C Code for equal, positive, non-unit increments. +C + 20 NS = N*INCX + DO 30 I = 1,NS,INCX + DSDOTF = DSDOTF + DBLE(SX(I))*DBLE(SY(I)) + 30 CONTINUE + RETURN + END diff --git a/reference/dspmvf.f b/reference/dspmvf.f new file mode 100644 index 0000000..a83a609 --- /dev/null +++ b/reference/dspmvf.f @@ -0,0 +1,262 @@ + SUBROUTINE DSPMVF( UPLO, N, ALPHA, AP, X, INCX, BETA, Y, INCY ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA, BETA + INTEGER INCX, INCY, N + CHARACTER*1 UPLO +* .. Array Arguments .. + DOUBLE PRECISION AP( * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* DSPMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n symmetric matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* AP - DOUBLE PRECISION array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - DOUBLE PRECISION. +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. On exit, Y is overwritten by the updated +* vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ONE , ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 6 + ELSE IF( INCY.EQ.0 )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DSPMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + KK = 1 + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form y when AP contains the upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + K = KK + DO 50, I = 1, J - 1 + Y( I ) = Y( I ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( I ) + K = K + 1 + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*AP( KK + J - 1 ) + ALPHA*TEMP2 + KK = KK + J + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + DO 70, K = KK, KK + J - 2 + Y( IY ) = Y( IY ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*AP( KK + J - 1 ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + KK = KK + J + 80 CONTINUE + END IF + ELSE +* +* Form y when AP contains the lower triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 100, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*AP( KK ) + K = KK + 1 + DO 90, I = J + 1, N + Y( I ) = Y( I ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( I ) + K = K + 1 + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + KK = KK + ( N - J + 1 ) + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*AP( KK ) + IX = JX + IY = JY + DO 110, K = KK + 1, KK + N - J + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + KK = KK + ( N - J + 1 ) + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of DSPMV . +* + END diff --git a/reference/dspr2f.f b/reference/dspr2f.f new file mode 100644 index 0000000..9eabacf --- /dev/null +++ b/reference/dspr2f.f @@ -0,0 +1,229 @@ + SUBROUTINE DSPR2F( UPLO, N, ALPHA, X, INCX, Y, INCY, AP ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA + INTEGER INCX, INCY, N + CHARACTER*1 UPLO +* .. Array Arguments .. + DOUBLE PRECISION AP( * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* DSPR2 performs the symmetric rank 2 operation +* +* A := alpha*x*y' + alpha*y*x' + A, +* +* where alpha is a scalar, x and y are n element vectors and A is an +* n by n symmetric matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* AP - DOUBLE PRECISION array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. On exit, the array +* AP is overwritten by the upper triangular part of the +* updated matrix. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. On exit, the array +* AP is overwritten by the lower triangular part of the +* updated matrix. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DSPR2 ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set up the start points in X and Y if the increments are not both +* unity. +* + IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF + JX = KX + JY = KY + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* + KK = 1 + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when upper triangle is stored in AP. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 20, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + K = KK + DO 10, I = 1, J + AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 + K = K + 1 + 10 CONTINUE + END IF + KK = KK + J + 20 CONTINUE + ELSE + DO 40, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = KX + IY = KY + DO 30, K = KK, KK + J - 1 + AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 30 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + KK = KK + J + 40 CONTINUE + END IF + ELSE +* +* Form A when lower triangle is stored in AP. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + K = KK + DO 50, I = J, N + AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 + K = K + 1 + 50 CONTINUE + END IF + KK = KK + N - J + 1 + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = JX + IY = JY + DO 70, K = KK, KK + N - J + AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + KK = KK + N - J + 1 + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of DSPR2 . +* + END diff --git a/reference/dsprf.f b/reference/dsprf.f new file mode 100644 index 0000000..69b7400 --- /dev/null +++ b/reference/dsprf.f @@ -0,0 +1,198 @@ + SUBROUTINE DSPRF ( UPLO, N, ALPHA, X, INCX, AP ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA + INTEGER INCX, N + CHARACTER*1 UPLO +* .. Array Arguments .. + DOUBLE PRECISION AP( * ), X( * ) +* .. +* +* Purpose +* ======= +* +* DSPR performs the symmetric rank 1 operation +* +* A := alpha*x*x' + A, +* +* where alpha is a real scalar, x is an n element vector and A is an +* n by n symmetric matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* AP - DOUBLE PRECISION array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. On exit, the array +* AP is overwritten by the upper triangular part of the +* updated matrix. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. On exit, the array +* AP is overwritten by the lower triangular part of the +* updated matrix. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP + INTEGER I, INFO, IX, J, JX, K, KK, KX +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DSPR ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set the start point in X if the increment is not unity. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* + KK = 1 + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when upper triangle is stored in AP. +* + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*X( J ) + K = KK + DO 10, I = 1, J + AP( K ) = AP( K ) + X( I )*TEMP + K = K + 1 + 10 CONTINUE + END IF + KK = KK + J + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + IX = KX + DO 30, K = KK, KK + J - 1 + AP( K ) = AP( K ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + END IF + JX = JX + INCX + KK = KK + J + 40 CONTINUE + END IF + ELSE +* +* Form A when lower triangle is stored in AP. +* + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*X( J ) + K = KK + DO 50, I = J, N + AP( K ) = AP( K ) + X( I )*TEMP + K = K + 1 + 50 CONTINUE + END IF + KK = KK + N - J + 1 + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + IX = JX + DO 70, K = KK, KK + N - J + AP( K ) = AP( K ) + X( IX )*TEMP + IX = IX + INCX + 70 CONTINUE + END IF + JX = JX + INCX + KK = KK + N - J + 1 + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of DSPR . +* + END diff --git a/reference/dswapf.f b/reference/dswapf.f new file mode 100644 index 0000000..597ee83 --- /dev/null +++ b/reference/dswapf.f @@ -0,0 +1,56 @@ + subroutine dswapf (n,dx,incx,dy,incy) +c +c interchanges two vectors. +c uses unrolled loops for increments equal one. +c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision dx(*),dy(*),dtemp + integer i,incx,incy,ix,iy,m,mp1,n +c + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments not equal +c to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + dtemp = dx(ix) + dx(ix) = dy(iy) + dy(iy) = dtemp + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c +c +c clean-up loop +c + 20 m = mod(n,3) + if( m .eq. 0 ) go to 40 + do 30 i = 1,m + dtemp = dx(i) + dx(i) = dy(i) + dy(i) = dtemp + 30 continue + if( n .lt. 3 ) return + 40 mp1 = m + 1 + do 50 i = mp1,n,3 + dtemp = dx(i) + dx(i) = dy(i) + dy(i) = dtemp + dtemp = dx(i + 1) + dx(i + 1) = dy(i + 1) + dy(i + 1) = dtemp + dtemp = dx(i + 2) + dx(i + 2) = dy(i + 2) + dy(i + 2) = dtemp + 50 continue + return + end diff --git a/reference/dsymmf.f b/reference/dsymmf.f new file mode 100644 index 0000000..d0053f3 --- /dev/null +++ b/reference/dsymmf.f @@ -0,0 +1,294 @@ + SUBROUTINE DSYMMF ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO + INTEGER M, N, LDA, LDB, LDC + DOUBLE PRECISION ALPHA, BETA +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* DSYMM performs one of the matrix-matrix operations +* +* C := alpha*A*B + beta*C, +* +* or +* +* C := alpha*B*A + beta*C, +* +* where alpha and beta are scalars, A is a symmetric matrix and B and +* C are m by n matrices. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether the symmetric matrix A +* appears on the left or right in the operation as follows: +* +* SIDE = 'L' or 'l' C := alpha*A*B + beta*C, +* +* SIDE = 'R' or 'r' C := alpha*B*A + beta*C, +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the symmetric matrix A is to be +* referenced as follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of the +* symmetric matrix is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of the +* symmetric matrix is to be referenced. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix C. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix C. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is +* m when SIDE = 'L' or 'l' and is n otherwise. +* Before entry with SIDE = 'L' or 'l', the m by m part of +* the array A must contain the symmetric matrix, such that +* when UPLO = 'U' or 'u', the leading m by m upper triangular +* part of the array A must contain the upper triangular part +* of the symmetric matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading m by m lower triangular part of the array A +* must contain the lower triangular part of the symmetric +* matrix and the strictly upper triangular part of A is not +* referenced. +* Before entry with SIDE = 'R' or 'r', the n by n part of +* the array A must contain the symmetric matrix, such that +* when UPLO = 'U' or 'u', the leading n by n upper triangular +* part of the array A must contain the upper triangular part +* of the symmetric matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading n by n lower triangular part of the array A +* must contain the lower triangular part of the symmetric +* matrix and the strictly upper triangular part of A is not +* referenced. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, n ). +* Unchanged on exit. +* +* B - DOUBLE PRECISION array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* BETA - DOUBLE PRECISION. +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - DOUBLE PRECISION array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n updated +* matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, K, NROWA + DOUBLE PRECISION TEMP1, TEMP2 +* .. Parameters .. + DOUBLE PRECISION ONE , ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. +* .. Executable Statements .. +* +* Set NROWA as the number of rows of A. +* + IF( LSAME( SIDE, 'L' ) )THEN + NROWA = M + ELSE + NROWA = N + END IF + UPPER = LSAME( UPLO, 'U' ) +* +* Test the input parameters. +* + INFO = 0 + IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. + $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN + INFO = 2 + ELSE IF( M .LT.0 )THEN + INFO = 3 + ELSE IF( N .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, M ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DSYMM ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, M + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( SIDE, 'L' ) )THEN +* +* Form C := alpha*A*B + beta*C. +* + IF( UPPER )THEN + DO 70, J = 1, N + DO 60, I = 1, M + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 50, K = 1, I - 1 + C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) + TEMP2 = TEMP2 + B( K, J )*A( K, I ) + 50 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*A( I, I ) + ALPHA*TEMP2 + END IF + 60 CONTINUE + 70 CONTINUE + ELSE + DO 100, J = 1, N + DO 90, I = M, 1, -1 + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 80, K = I + 1, M + C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) + TEMP2 = TEMP2 + B( K, J )*A( K, I ) + 80 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*A( I, I ) + ALPHA*TEMP2 + END IF + 90 CONTINUE + 100 CONTINUE + END IF + ELSE +* +* Form C := alpha*B*A + beta*C. +* + DO 170, J = 1, N + TEMP1 = ALPHA*A( J, J ) + IF( BETA.EQ.ZERO )THEN + DO 110, I = 1, M + C( I, J ) = TEMP1*B( I, J ) + 110 CONTINUE + ELSE + DO 120, I = 1, M + C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) + 120 CONTINUE + END IF + DO 140, K = 1, J - 1 + IF( UPPER )THEN + TEMP1 = ALPHA*A( K, J ) + ELSE + TEMP1 = ALPHA*A( J, K ) + END IF + DO 130, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 130 CONTINUE + 140 CONTINUE + DO 160, K = J + 1, N + IF( UPPER )THEN + TEMP1 = ALPHA*A( J, K ) + ELSE + TEMP1 = ALPHA*A( K, J ) + END IF + DO 150, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 150 CONTINUE + 160 CONTINUE + 170 CONTINUE + END IF +* + RETURN +* +* End of DSYMM . +* + END diff --git a/reference/dsymvf.f b/reference/dsymvf.f new file mode 100644 index 0000000..1b38747 --- /dev/null +++ b/reference/dsymvf.f @@ -0,0 +1,262 @@ + SUBROUTINE DSYMVF ( UPLO, N, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA, BETA + INTEGER INCX, INCY, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* DSYMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n symmetric matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of A is not referenced. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - DOUBLE PRECISION. +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. On exit, Y is overwritten by the updated +* vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ONE , ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 5 + ELSE IF( INCX.EQ.0 )THEN + INFO = 7 + ELSE IF( INCY.EQ.0 )THEN + INFO = 10 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DSYMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form y when A is stored in upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + DO 50, I = 1, J - 1 + Y( I ) = Y( I ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( I ) + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*A( J, J ) + ALPHA*TEMP2 + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + DO 70, I = 1, J - 1 + Y( IY ) = Y( IY ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*A( J, J ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 80 CONTINUE + END IF + ELSE +* +* Form y when A is stored in lower triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 100, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*A( J, J ) + DO 90, I = J + 1, N + Y( I ) = Y( I ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( I ) + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*A( J, J ) + IX = JX + IY = JY + DO 110, I = J + 1, N + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of DSYMV . +* + END diff --git a/reference/dsyr2f.f b/reference/dsyr2f.f new file mode 100644 index 0000000..826bdb0 --- /dev/null +++ b/reference/dsyr2f.f @@ -0,0 +1,230 @@ + SUBROUTINE DSYR2F ( UPLO, N, ALPHA, X, INCX, Y, INCY, A, LDA ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA + INTEGER INCX, INCY, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* DSYR2 performs the symmetric rank 2 operation +* +* A := alpha*x*y' + alpha*y*x' + A, +* +* where alpha is a scalar, x and y are n element vectors and A is an n +* by n symmetric matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of A is not referenced. On exit, the +* upper triangular part of the array A is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of A is not referenced. On exit, the +* lower triangular part of the array A is overwritten by the +* lower triangular part of the updated matrix. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DSYR2 ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set up the start points in X and Y if the increments are not both +* unity. +* + IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF + JX = KX + JY = KY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when A is stored in the upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 20, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + DO 10, I = 1, J + A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 + 10 CONTINUE + END IF + 20 CONTINUE + ELSE + DO 40, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = KX + IY = KY + DO 30, I = 1, J + A( I, J ) = A( I, J ) + X( IX )*TEMP1 + $ + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 30 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + 40 CONTINUE + END IF + ELSE +* +* Form A when A is stored in the lower triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + DO 50, I = J, N + A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 + 50 CONTINUE + END IF + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = JX + IY = JY + DO 70, I = J, N + A( I, J ) = A( I, J ) + X( IX )*TEMP1 + $ + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of DSYR2 . +* + END diff --git a/reference/dsyr2kf.f b/reference/dsyr2kf.f new file mode 100644 index 0000000..81e73da --- /dev/null +++ b/reference/dsyr2kf.f @@ -0,0 +1,327 @@ + SUBROUTINE DSYR2KF( UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 UPLO, TRANS + INTEGER N, K, LDA, LDB, LDC + DOUBLE PRECISION ALPHA, BETA +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* DSYR2K performs one of the symmetric rank 2k operations +* +* C := alpha*A*B' + alpha*B*A' + beta*C, +* +* or +* +* C := alpha*A'*B + alpha*B'*A + beta*C, +* +* where alpha and beta are scalars, C is an n by n symmetric matrix +* and A and B are n by k matrices in the first case and k by n +* matrices in the second case. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array C is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of C +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of C +* is to be referenced. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' C := alpha*A*B' + alpha*B*A' + +* beta*C. +* +* TRANS = 'T' or 't' C := alpha*A'*B + alpha*B'*A + +* beta*C. +* +* TRANS = 'C' or 'c' C := alpha*A'*B + alpha*B'*A + +* beta*C. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with TRANS = 'N' or 'n', K specifies the number +* of columns of the matrices A and B, and on entry with +* TRANS = 'T' or 't' or 'C' or 'c', K specifies the number +* of rows of the matrices A and B. K must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array A must contain the matrix A, otherwise +* the leading k by n part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDA must be at least max( 1, n ), otherwise LDA must +* be at least max( 1, k ). +* Unchanged on exit. +* +* B - DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array B must contain the matrix B, otherwise +* the leading k by n part of the array B must contain the +* matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDB must be at least max( 1, n ), otherwise LDB must +* be at least max( 1, k ). +* Unchanged on exit. +* +* BETA - DOUBLE PRECISION. +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* C - DOUBLE PRECISION array of DIMENSION ( LDC, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array C must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of C is not referenced. On exit, the +* upper triangular part of the array C is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array C must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of C is not referenced. On exit, the +* lower triangular part of the array C is overwritten by the +* lower triangular part of the updated matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, L, NROWA + DOUBLE PRECISION TEMP1, TEMP2 +* .. Parameters .. + DOUBLE PRECISION ONE , ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + IF( LSAME( TRANS, 'N' ) )THEN + NROWA = N + ELSE + NROWA = K + END IF + UPPER = LSAME( UPLO, 'U' ) +* + INFO = 0 + IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANS, 'T' ) ).AND. + $ ( .NOT.LSAME( TRANS, 'C' ) ) )THEN + INFO = 2 + ELSE IF( N .LT.0 )THEN + INFO = 3 + ELSE IF( K .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, NROWA ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, N ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DSYR2K', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR. + $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( UPPER )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, J + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, J + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE + IF( BETA.EQ.ZERO )THEN + DO 60, J = 1, N + DO 50, I = J, N + C( I, J ) = ZERO + 50 CONTINUE + 60 CONTINUE + ELSE + DO 80, J = 1, N + DO 70, I = J, N + C( I, J ) = BETA*C( I, J ) + 70 CONTINUE + 80 CONTINUE + END IF + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form C := alpha*A*B' + alpha*B*A' + C. +* + IF( UPPER )THEN + DO 130, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 90, I = 1, J + C( I, J ) = ZERO + 90 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 100, I = 1, J + C( I, J ) = BETA*C( I, J ) + 100 CONTINUE + END IF + DO 120, L = 1, K + IF( ( A( J, L ).NE.ZERO ).OR. + $ ( B( J, L ).NE.ZERO ) )THEN + TEMP1 = ALPHA*B( J, L ) + TEMP2 = ALPHA*A( J, L ) + DO 110, I = 1, J + C( I, J ) = C( I, J ) + + $ A( I, L )*TEMP1 + B( I, L )*TEMP2 + 110 CONTINUE + END IF + 120 CONTINUE + 130 CONTINUE + ELSE + DO 180, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 140, I = J, N + C( I, J ) = ZERO + 140 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 150, I = J, N + C( I, J ) = BETA*C( I, J ) + 150 CONTINUE + END IF + DO 170, L = 1, K + IF( ( A( J, L ).NE.ZERO ).OR. + $ ( B( J, L ).NE.ZERO ) )THEN + TEMP1 = ALPHA*B( J, L ) + TEMP2 = ALPHA*A( J, L ) + DO 160, I = J, N + C( I, J ) = C( I, J ) + + $ A( I, L )*TEMP1 + B( I, L )*TEMP2 + 160 CONTINUE + END IF + 170 CONTINUE + 180 CONTINUE + END IF + ELSE +* +* Form C := alpha*A'*B + alpha*B'*A + C. +* + IF( UPPER )THEN + DO 210, J = 1, N + DO 200, I = 1, J + TEMP1 = ZERO + TEMP2 = ZERO + DO 190, L = 1, K + TEMP1 = TEMP1 + A( L, I )*B( L, J ) + TEMP2 = TEMP2 + B( L, I )*A( L, J ) + 190 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ ALPHA*TEMP1 + ALPHA*TEMP2 + END IF + 200 CONTINUE + 210 CONTINUE + ELSE + DO 240, J = 1, N + DO 230, I = J, N + TEMP1 = ZERO + TEMP2 = ZERO + DO 220, L = 1, K + TEMP1 = TEMP1 + A( L, I )*B( L, J ) + TEMP2 = TEMP2 + B( L, I )*A( L, J ) + 220 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ ALPHA*TEMP1 + ALPHA*TEMP2 + END IF + 230 CONTINUE + 240 CONTINUE + END IF + END IF +* + RETURN +* +* End of DSYR2K. +* + END diff --git a/reference/dsyrf.f b/reference/dsyrf.f new file mode 100644 index 0000000..b5bcd00 --- /dev/null +++ b/reference/dsyrf.f @@ -0,0 +1,197 @@ + SUBROUTINE DSYRF ( UPLO, N, ALPHA, X, INCX, A, LDA ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA + INTEGER INCX, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* DSYR performs the symmetric rank 1 operation +* +* A := alpha*x*x' + A, +* +* where alpha is a real scalar, x is an n element vector and A is an +* n by n symmetric matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of A is not referenced. On exit, the +* upper triangular part of the array A is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of A is not referenced. On exit, the +* lower triangular part of the array A is overwritten by the +* lower triangular part of the updated matrix. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP + INTEGER I, INFO, IX, J, JX, KX +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DSYR ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set the start point in X if the increment is not unity. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when A is stored in upper triangle. +* + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*X( J ) + DO 10, I = 1, J + A( I, J ) = A( I, J ) + X( I )*TEMP + 10 CONTINUE + END IF + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + IX = KX + DO 30, I = 1, J + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + END IF + JX = JX + INCX + 40 CONTINUE + END IF + ELSE +* +* Form A when A is stored in lower triangle. +* + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*X( J ) + DO 50, I = J, N + A( I, J ) = A( I, J ) + X( I )*TEMP + 50 CONTINUE + END IF + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + IX = JX + DO 70, I = J, N + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of DSYR . +* + END diff --git a/reference/dsyrkf.f b/reference/dsyrkf.f new file mode 100644 index 0000000..6376b09 --- /dev/null +++ b/reference/dsyrkf.f @@ -0,0 +1,294 @@ + SUBROUTINE DSYRKF ( UPLO, TRANS, N, K, ALPHA, A, LDA, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 UPLO, TRANS + INTEGER N, K, LDA, LDC + DOUBLE PRECISION ALPHA, BETA +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* DSYRK performs one of the symmetric rank k operations +* +* C := alpha*A*A' + beta*C, +* +* or +* +* C := alpha*A'*A + beta*C, +* +* where alpha and beta are scalars, C is an n by n symmetric matrix +* and A is an n by k matrix in the first case and a k by n matrix +* in the second case. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array C is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of C +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of C +* is to be referenced. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' C := alpha*A*A' + beta*C. +* +* TRANS = 'T' or 't' C := alpha*A'*A + beta*C. +* +* TRANS = 'C' or 'c' C := alpha*A'*A + beta*C. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with TRANS = 'N' or 'n', K specifies the number +* of columns of the matrix A, and on entry with +* TRANS = 'T' or 't' or 'C' or 'c', K specifies the number +* of rows of the matrix A. K must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array A must contain the matrix A, otherwise +* the leading k by n part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDA must be at least max( 1, n ), otherwise LDA must +* be at least max( 1, k ). +* Unchanged on exit. +* +* BETA - DOUBLE PRECISION. +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* C - DOUBLE PRECISION array of DIMENSION ( LDC, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array C must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of C is not referenced. On exit, the +* upper triangular part of the array C is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array C must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of C is not referenced. On exit, the +* lower triangular part of the array C is overwritten by the +* lower triangular part of the updated matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, L, NROWA + DOUBLE PRECISION TEMP +* .. Parameters .. + DOUBLE PRECISION ONE , ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + IF( LSAME( TRANS, 'N' ) )THEN + NROWA = N + ELSE + NROWA = K + END IF + UPPER = LSAME( UPLO, 'U' ) +* + INFO = 0 + IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANS, 'T' ) ).AND. + $ ( .NOT.LSAME( TRANS, 'C' ) ) )THEN + INFO = 2 + ELSE IF( N .LT.0 )THEN + INFO = 3 + ELSE IF( K .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDC.LT.MAX( 1, N ) )THEN + INFO = 10 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DSYRK ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR. + $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( UPPER )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, J + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, J + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE + IF( BETA.EQ.ZERO )THEN + DO 60, J = 1, N + DO 50, I = J, N + C( I, J ) = ZERO + 50 CONTINUE + 60 CONTINUE + ELSE + DO 80, J = 1, N + DO 70, I = J, N + C( I, J ) = BETA*C( I, J ) + 70 CONTINUE + 80 CONTINUE + END IF + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form C := alpha*A*A' + beta*C. +* + IF( UPPER )THEN + DO 130, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 90, I = 1, J + C( I, J ) = ZERO + 90 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 100, I = 1, J + C( I, J ) = BETA*C( I, J ) + 100 CONTINUE + END IF + DO 120, L = 1, K + IF( A( J, L ).NE.ZERO )THEN + TEMP = ALPHA*A( J, L ) + DO 110, I = 1, J + C( I, J ) = C( I, J ) + TEMP*A( I, L ) + 110 CONTINUE + END IF + 120 CONTINUE + 130 CONTINUE + ELSE + DO 180, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 140, I = J, N + C( I, J ) = ZERO + 140 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 150, I = J, N + C( I, J ) = BETA*C( I, J ) + 150 CONTINUE + END IF + DO 170, L = 1, K + IF( A( J, L ).NE.ZERO )THEN + TEMP = ALPHA*A( J, L ) + DO 160, I = J, N + C( I, J ) = C( I, J ) + TEMP*A( I, L ) + 160 CONTINUE + END IF + 170 CONTINUE + 180 CONTINUE + END IF + ELSE +* +* Form C := alpha*A'*A + beta*C. +* + IF( UPPER )THEN + DO 210, J = 1, N + DO 200, I = 1, J + TEMP = ZERO + DO 190, L = 1, K + TEMP = TEMP + A( L, I )*A( L, J ) + 190 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP + ELSE + C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) + END IF + 200 CONTINUE + 210 CONTINUE + ELSE + DO 240, J = 1, N + DO 230, I = J, N + TEMP = ZERO + DO 220, L = 1, K + TEMP = TEMP + A( L, I )*A( L, J ) + 220 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP + ELSE + C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) + END IF + 230 CONTINUE + 240 CONTINUE + END IF + END IF +* + RETURN +* +* End of DSYRK . +* + END diff --git a/reference/dtbmvf.f b/reference/dtbmvf.f new file mode 100644 index 0000000..da34077 --- /dev/null +++ b/reference/dtbmvf.f @@ -0,0 +1,342 @@ + SUBROUTINE DTBMVF( UPLO, TRANS, DIAG, N, K, A, LDA, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, K, LDA, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* DTBMV performs one of the matrix-vector operations +* +* x := A*x, or x := A'*x, +* +* where x is an n element vector and A is an n by n unit, or non-unit, +* upper or lower triangular band matrix, with ( k + 1 ) diagonals. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' x := A*x. +* +* TRANS = 'T' or 't' x := A'*x. +* +* TRANS = 'C' or 'c' x := A'*x. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with UPLO = 'U' or 'u', K specifies the number of +* super-diagonals of the matrix A. +* On entry with UPLO = 'L' or 'l', K specifies the number of +* sub-diagonals of the matrix A. +* K must satisfy 0 .le. K. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) +* by n part of the array A must contain the upper triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row +* ( k + 1 ) of the array, the first super-diagonal starting at +* position 2 in row k, and so on. The top left k by k triangle +* of the array A is not referenced. +* The following program segment will transfer an upper +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = K + 1 - J +* DO 10, I = MAX( 1, J - K ), J +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) +* by n part of the array A must contain the lower triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row 1 of +* the array, the first sub-diagonal starting at position 1 in +* row 2, and so on. The bottom right k by k triangle of the +* array A is not referenced. +* The following program segment will transfer a lower +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = 1 - J +* DO 10, I = J, MIN( N, J + K ) +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Note that when DIAG = 'U' or 'u' the elements of the array A +* corresponding to the diagonal elements of the matrix are not +* referenced, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( k + 1 ). +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. On exit, X is overwritten with the +* tranformed vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP + INTEGER I, INFO, IX, J, JX, KPLUS1, KX, L + LOGICAL NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( K.LT.0 )THEN + INFO = 5 + ELSE IF( LDA.LT.( K + 1 ) )THEN + INFO = 7 + ELSE IF( INCX.EQ.0 )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DTBMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOUNIT = LSAME( DIAG, 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form x := A*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KPLUS1 = K + 1 + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + L = KPLUS1 - J + DO 10, I = MAX( 1, J - K ), J - 1 + X( I ) = X( I ) + TEMP*A( L + I, J ) + 10 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*A( KPLUS1, J ) + END IF + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + L = KPLUS1 - J + DO 30, I = MAX( 1, J - K ), J - 1 + X( IX ) = X( IX ) + TEMP*A( L + I, J ) + IX = IX + INCX + 30 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( KPLUS1, J ) + END IF + JX = JX + INCX + IF( J.GT.K ) + $ KX = KX + INCX + 40 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 60, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + L = 1 - J + DO 50, I = MIN( N, J + K ), J + 1, -1 + X( I ) = X( I ) + TEMP*A( L + I, J ) + 50 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*A( 1, J ) + END IF + 60 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 80, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + L = 1 - J + DO 70, I = MIN( N, J + K ), J + 1, -1 + X( IX ) = X( IX ) + TEMP*A( L + I, J ) + IX = IX - INCX + 70 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( 1, J ) + END IF + JX = JX - INCX + IF( ( N - J ).GE.K ) + $ KX = KX - INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := A'*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KPLUS1 = K + 1 + IF( INCX.EQ.1 )THEN + DO 100, J = N, 1, -1 + TEMP = X( J ) + L = KPLUS1 - J + IF( NOUNIT ) + $ TEMP = TEMP*A( KPLUS1, J ) + DO 90, I = J - 1, MAX( 1, J - K ), -1 + TEMP = TEMP + A( L + I, J )*X( I ) + 90 CONTINUE + X( J ) = TEMP + 100 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 120, J = N, 1, -1 + TEMP = X( JX ) + KX = KX - INCX + IX = KX + L = KPLUS1 - J + IF( NOUNIT ) + $ TEMP = TEMP*A( KPLUS1, J ) + DO 110, I = J - 1, MAX( 1, J - K ), -1 + TEMP = TEMP + A( L + I, J )*X( IX ) + IX = IX - INCX + 110 CONTINUE + X( JX ) = TEMP + JX = JX - INCX + 120 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 140, J = 1, N + TEMP = X( J ) + L = 1 - J + IF( NOUNIT ) + $ TEMP = TEMP*A( 1, J ) + DO 130, I = J + 1, MIN( N, J + K ) + TEMP = TEMP + A( L + I, J )*X( I ) + 130 CONTINUE + X( J ) = TEMP + 140 CONTINUE + ELSE + JX = KX + DO 160, J = 1, N + TEMP = X( JX ) + KX = KX + INCX + IX = KX + L = 1 - J + IF( NOUNIT ) + $ TEMP = TEMP*A( 1, J ) + DO 150, I = J + 1, MIN( N, J + K ) + TEMP = TEMP + A( L + I, J )*X( IX ) + IX = IX + INCX + 150 CONTINUE + X( JX ) = TEMP + JX = JX + INCX + 160 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of DTBMV . +* + END diff --git a/reference/dtbsvf.f b/reference/dtbsvf.f new file mode 100644 index 0000000..4dd16d5 --- /dev/null +++ b/reference/dtbsvf.f @@ -0,0 +1,336 @@ + SUBROUTINE DTBSVF(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX) +* .. Scalar Arguments .. + INTEGER INCX,K,LDA,N + CHARACTER DIAG,TRANS,UPLO +* .. +* .. Array Arguments .. + DOUBLE PRECISION A(LDA,*),X(*) +* .. +* +* Purpose +* ======= +* +* DTBSV solves one of the systems of equations +* +* A*x = b, or A'*x = b, +* +* where b and x are n element vectors and A is an n by n unit, or +* non-unit, upper or lower triangular band matrix, with ( k + 1 ) +* diagonals. +* +* No test for singularity or near-singularity is included in this +* routine. Such tests must be performed before calling this routine. +* +* Arguments +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the equations to be solved as +* follows: +* +* TRANS = 'N' or 'n' A*x = b. +* +* TRANS = 'T' or 't' A'*x = b. +* +* TRANS = 'C' or 'c' A'*x = b. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with UPLO = 'U' or 'u', K specifies the number of +* super-diagonals of the matrix A. +* On entry with UPLO = 'L' or 'l', K specifies the number of +* sub-diagonals of the matrix A. +* K must satisfy 0 .le. K. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) +* by n part of the array A must contain the upper triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row +* ( k + 1 ) of the array, the first super-diagonal starting at +* position 2 in row k, and so on. The top left k by k triangle +* of the array A is not referenced. +* The following program segment will transfer an upper +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = K + 1 - J +* DO 10, I = MAX( 1, J - K ), J +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) +* by n part of the array A must contain the lower triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row 1 of +* the array, the first sub-diagonal starting at position 1 in +* row 2, and so on. The bottom right k by k triangle of the +* array A is not referenced. +* The following program segment will transfer a lower +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = 1 - J +* DO 10, I = J, MIN( N, J + K ) +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Note that when DIAG = 'U' or 'u' the elements of the array A +* corresponding to the diagonal elements of the matrix are not +* referenced, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( k + 1 ). +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element right-hand side vector b. On exit, X is overwritten +* with the solution vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER (ZERO=0.0D+0) +* .. +* .. Local Scalars .. + DOUBLE PRECISION TEMP + INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L + LOGICAL NOUNIT +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX,MIN +* .. +* +* Test the input parameters. +* + INFO = 0 + IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN + INFO = 1 + ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. + + .NOT.LSAME(TRANS,'C')) THEN + INFO = 2 + ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN + INFO = 3 + ELSE IF (N.LT.0) THEN + INFO = 4 + ELSE IF (K.LT.0) THEN + INFO = 5 + ELSE IF (LDA.LT. (K+1)) THEN + INFO = 7 + ELSE IF (INCX.EQ.0) THEN + INFO = 9 + END IF + IF (INFO.NE.0) THEN + CALL XERBLA('DTBSV ',INFO) + RETURN + END IF +* +* Quick return if possible. +* + IF (N.EQ.0) RETURN +* + NOUNIT = LSAME(DIAG,'N') +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF (INCX.LE.0) THEN + KX = 1 - (N-1)*INCX + ELSE IF (INCX.NE.1) THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed by sequentially with one pass through A. +* + IF (LSAME(TRANS,'N')) THEN +* +* Form x := inv( A )*x. +* + IF (LSAME(UPLO,'U')) THEN + KPLUS1 = K + 1 + IF (INCX.EQ.1) THEN + DO 20 J = N,1,-1 + IF (X(J).NE.ZERO) THEN + L = KPLUS1 - J + IF (NOUNIT) X(J) = X(J)/A(KPLUS1,J) + TEMP = X(J) + DO 10 I = J - 1,MAX(1,J-K),-1 + X(I) = X(I) - TEMP*A(L+I,J) + 10 CONTINUE + END IF + 20 CONTINUE + ELSE + KX = KX + (N-1)*INCX + JX = KX + DO 40 J = N,1,-1 + KX = KX - INCX + IF (X(JX).NE.ZERO) THEN + IX = KX + L = KPLUS1 - J + IF (NOUNIT) X(JX) = X(JX)/A(KPLUS1,J) + TEMP = X(JX) + DO 30 I = J - 1,MAX(1,J-K),-1 + X(IX) = X(IX) - TEMP*A(L+I,J) + IX = IX - INCX + 30 CONTINUE + END IF + JX = JX - INCX + 40 CONTINUE + END IF + ELSE + IF (INCX.EQ.1) THEN + DO 60 J = 1,N + IF (X(J).NE.ZERO) THEN + L = 1 - J + IF (NOUNIT) X(J) = X(J)/A(1,J) + TEMP = X(J) + DO 50 I = J + 1,MIN(N,J+K) + X(I) = X(I) - TEMP*A(L+I,J) + 50 CONTINUE + END IF + 60 CONTINUE + ELSE + JX = KX + DO 80 J = 1,N + KX = KX + INCX + IF (X(JX).NE.ZERO) THEN + IX = KX + L = 1 - J + IF (NOUNIT) X(JX) = X(JX)/A(1,J) + TEMP = X(JX) + DO 70 I = J + 1,MIN(N,J+K) + X(IX) = X(IX) - TEMP*A(L+I,J) + IX = IX + INCX + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := inv( A')*x. +* + IF (LSAME(UPLO,'U')) THEN + KPLUS1 = K + 1 + IF (INCX.EQ.1) THEN + DO 100 J = 1,N + TEMP = X(J) + L = KPLUS1 - J + DO 90 I = MAX(1,J-K),J - 1 + TEMP = TEMP - A(L+I,J)*X(I) + 90 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(KPLUS1,J) + X(J) = TEMP + 100 CONTINUE + ELSE + JX = KX + DO 120 J = 1,N + TEMP = X(JX) + IX = KX + L = KPLUS1 - J + DO 110 I = MAX(1,J-K),J - 1 + TEMP = TEMP - A(L+I,J)*X(IX) + IX = IX + INCX + 110 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(KPLUS1,J) + X(JX) = TEMP + JX = JX + INCX + IF (J.GT.K) KX = KX + INCX + 120 CONTINUE + END IF + ELSE + IF (INCX.EQ.1) THEN + DO 140 J = N,1,-1 + TEMP = X(J) + L = 1 - J + DO 130 I = MIN(N,J+K),J + 1,-1 + TEMP = TEMP - A(L+I,J)*X(I) + 130 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(1,J) + X(J) = TEMP + 140 CONTINUE + ELSE + KX = KX + (N-1)*INCX + JX = KX + DO 160 J = N,1,-1 + TEMP = X(JX) + IX = KX + L = 1 - J + DO 150 I = MIN(N,J+K),J + 1,-1 + TEMP = TEMP - A(L+I,J)*X(IX) + IX = IX - INCX + 150 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(1,J) + X(JX) = TEMP + JX = JX - INCX + IF ((N-J).GE.K) KX = KX - INCX + 160 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of DTBSV . +* + END diff --git a/reference/dtpmvf.f b/reference/dtpmvf.f new file mode 100644 index 0000000..e8f6eb4 --- /dev/null +++ b/reference/dtpmvf.f @@ -0,0 +1,299 @@ + SUBROUTINE DTPMVF( UPLO, TRANS, DIAG, N, AP, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + DOUBLE PRECISION AP( * ), X( * ) +* .. +* +* Purpose +* ======= +* +* DTPMV performs one of the matrix-vector operations +* +* x := A*x, or x := A'*x, +* +* where x is an n element vector and A is an n by n unit, or non-unit, +* upper or lower triangular matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' x := A*x. +* +* TRANS = 'T' or 't' x := A'*x. +* +* TRANS = 'C' or 'c' x := A'*x. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* AP - DOUBLE PRECISION array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) +* respectively, and so on. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) +* respectively, and so on. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced, but are assumed to be unity. +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. On exit, X is overwritten with the +* tranformed vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP + INTEGER I, INFO, IX, J, JX, K, KK, KX + LOGICAL NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( INCX.EQ.0 )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DTPMVF', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOUNIT = LSAME( DIAG, 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of AP are +* accessed sequentially with one pass through AP. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form x:= A*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KK =1 + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + K = KK + DO 10, I = 1, J - 1 + X( I ) = X( I ) + TEMP*AP( K ) + K = K + 1 + 10 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*AP( KK + J - 1 ) + END IF + KK = KK + J + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 30, K = KK, KK + J - 2 + X( IX ) = X( IX ) + TEMP*AP( K ) + IX = IX + INCX + 30 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*AP( KK + J - 1 ) + END IF + JX = JX + INCX + KK = KK + J + 40 CONTINUE + END IF + ELSE + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 60, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + K = KK + DO 50, I = N, J + 1, -1 + X( I ) = X( I ) + TEMP*AP( K ) + K = K - 1 + 50 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*AP( KK - N + J ) + END IF + KK = KK - ( N - J + 1 ) + 60 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 80, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 70, K = KK, KK - ( N - ( J + 1 ) ), -1 + X( IX ) = X( IX ) + TEMP*AP( K ) + IX = IX - INCX + 70 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*AP( KK - N + J ) + END IF + JX = JX - INCX + KK = KK - ( N - J + 1 ) + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := A'*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 100, J = N, 1, -1 + TEMP = X( J ) + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + K = KK - 1 + DO 90, I = J - 1, 1, -1 + TEMP = TEMP + AP( K )*X( I ) + K = K - 1 + 90 CONTINUE + X( J ) = TEMP + KK = KK - J + 100 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 120, J = N, 1, -1 + TEMP = X( JX ) + IX = JX + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + DO 110, K = KK - 1, KK - J + 1, -1 + IX = IX - INCX + TEMP = TEMP + AP( K )*X( IX ) + 110 CONTINUE + X( JX ) = TEMP + JX = JX - INCX + KK = KK - J + 120 CONTINUE + END IF + ELSE + KK = 1 + IF( INCX.EQ.1 )THEN + DO 140, J = 1, N + TEMP = X( J ) + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + K = KK + 1 + DO 130, I = J + 1, N + TEMP = TEMP + AP( K )*X( I ) + K = K + 1 + 130 CONTINUE + X( J ) = TEMP + KK = KK + ( N - J + 1 ) + 140 CONTINUE + ELSE + JX = KX + DO 160, J = 1, N + TEMP = X( JX ) + IX = JX + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + DO 150, K = KK + 1, KK + N - J + IX = IX + INCX + TEMP = TEMP + AP( K )*X( IX ) + 150 CONTINUE + X( JX ) = TEMP + JX = JX + INCX + KK = KK + ( N - J + 1 ) + 160 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of DTPMV . +* + END diff --git a/reference/dtpsvf.f b/reference/dtpsvf.f new file mode 100644 index 0000000..3639ba2 --- /dev/null +++ b/reference/dtpsvf.f @@ -0,0 +1,302 @@ + SUBROUTINE DTPSVF( UPLO, TRANS, DIAG, N, AP, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + DOUBLE PRECISION AP( * ), X( * ) +* .. +* +* Purpose +* ======= +* +* DTPSV solves one of the systems of equations +* +* A*x = b, or A'*x = b, +* +* where b and x are n element vectors and A is an n by n unit, or +* non-unit, upper or lower triangular matrix, supplied in packed form. +* +* No test for singularity or near-singularity is included in this +* routine. Such tests must be performed before calling this routine. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the equations to be solved as +* follows: +* +* TRANS = 'N' or 'n' A*x = b. +* +* TRANS = 'T' or 't' A'*x = b. +* +* TRANS = 'C' or 'c' A'*x = b. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* AP - DOUBLE PRECISION array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) +* respectively, and so on. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) +* respectively, and so on. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced, but are assumed to be unity. +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element right-hand side vector b. On exit, X is overwritten +* with the solution vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP + INTEGER I, INFO, IX, J, JX, K, KK, KX + LOGICAL NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( INCX.EQ.0 )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DTPSV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOUNIT = LSAME( DIAG, 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of AP are +* accessed sequentially with one pass through AP. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form x := inv( A )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 20, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( J ) = X( J )/AP( KK ) + TEMP = X( J ) + K = KK - 1 + DO 10, I = J - 1, 1, -1 + X( I ) = X( I ) - TEMP*AP( K ) + K = K - 1 + 10 CONTINUE + END IF + KK = KK - J + 20 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 40, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/AP( KK ) + TEMP = X( JX ) + IX = JX + DO 30, K = KK - 1, KK - J + 1, -1 + IX = IX - INCX + X( IX ) = X( IX ) - TEMP*AP( K ) + 30 CONTINUE + END IF + JX = JX - INCX + KK = KK - J + 40 CONTINUE + END IF + ELSE + KK = 1 + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( J ) = X( J )/AP( KK ) + TEMP = X( J ) + K = KK + 1 + DO 50, I = J + 1, N + X( I ) = X( I ) - TEMP*AP( K ) + K = K + 1 + 50 CONTINUE + END IF + KK = KK + ( N - J + 1 ) + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/AP( KK ) + TEMP = X( JX ) + IX = JX + DO 70, K = KK + 1, KK + N - J + IX = IX + INCX + X( IX ) = X( IX ) - TEMP*AP( K ) + 70 CONTINUE + END IF + JX = JX + INCX + KK = KK + ( N - J + 1 ) + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := inv( A' )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KK = 1 + IF( INCX.EQ.1 )THEN + DO 100, J = 1, N + TEMP = X( J ) + K = KK + DO 90, I = 1, J - 1 + TEMP = TEMP - AP( K )*X( I ) + K = K + 1 + 90 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK + J - 1 ) + X( J ) = TEMP + KK = KK + J + 100 CONTINUE + ELSE + JX = KX + DO 120, J = 1, N + TEMP = X( JX ) + IX = KX + DO 110, K = KK, KK + J - 2 + TEMP = TEMP - AP( K )*X( IX ) + IX = IX + INCX + 110 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK + J - 1 ) + X( JX ) = TEMP + JX = JX + INCX + KK = KK + J + 120 CONTINUE + END IF + ELSE + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 140, J = N, 1, -1 + TEMP = X( J ) + K = KK + DO 130, I = N, J + 1, -1 + TEMP = TEMP - AP( K )*X( I ) + K = K - 1 + 130 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK - N + J ) + X( J ) = TEMP + KK = KK - ( N - J + 1 ) + 140 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 160, J = N, 1, -1 + TEMP = X( JX ) + IX = KX + DO 150, K = KK, KK - ( N - ( J + 1 ) ), -1 + TEMP = TEMP - AP( K )*X( IX ) + IX = IX - INCX + 150 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK - N + J ) + X( JX ) = TEMP + JX = JX - INCX + KK = KK - (N - J + 1 ) + 160 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of DTPSV . +* + END diff --git a/reference/dtrmmf.f b/reference/dtrmmf.f new file mode 100644 index 0000000..399d45b --- /dev/null +++ b/reference/dtrmmf.f @@ -0,0 +1,355 @@ + SUBROUTINE DTRMMF ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, + $ B, LDB ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + INTEGER M, N, LDA, LDB + DOUBLE PRECISION ALPHA +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* DTRMM performs one of the matrix-matrix operations +* +* B := alpha*op( A )*B, or B := alpha*B*op( A ), +* +* where alpha is a scalar, B is an m by n matrix, A is a unit, or +* non-unit, upper or lower triangular matrix and op( A ) is one of +* +* op( A ) = A or op( A ) = A'. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether op( A ) multiplies B from +* the left or right as follows: +* +* SIDE = 'L' or 'l' B := alpha*op( A )*B. +* +* SIDE = 'R' or 'r' B := alpha*B*op( A ). +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix A is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANSA - CHARACTER*1. +* On entry, TRANSA specifies the form of op( A ) to be used in +* the matrix multiplication as follows: +* +* TRANSA = 'N' or 'n' op( A ) = A. +* +* TRANSA = 'T' or 't' op( A ) = A'. +* +* TRANSA = 'C' or 'c' op( A ) = A'. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit triangular +* as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of B. M must be at +* least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of B. N must be +* at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. When alpha is +* zero then A is not referenced and B need not be set before +* entry. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, k ), where k is m +* when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. +* Before entry with UPLO = 'U' or 'u', the leading k by k +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading k by k +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' +* then LDA must be at least max( 1, n ). +* Unchanged on exit. +* +* B - DOUBLE PRECISION array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the matrix B, and on exit is overwritten by the +* transformed matrix. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL LSIDE, NOUNIT, UPPER + INTEGER I, INFO, J, K, NROWA + DOUBLE PRECISION TEMP +* .. Parameters .. + DOUBLE PRECISION ONE , ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + LSIDE = LSAME( SIDE , 'L' ) + IF( LSIDE )THEN + NROWA = M + ELSE + NROWA = N + END IF + NOUNIT = LSAME( DIAG , 'N' ) + UPPER = LSAME( UPLO , 'U' ) +* + INFO = 0 + IF( ( .NOT.LSIDE ).AND. + $ ( .NOT.LSAME( SIDE , 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 2 + ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'T' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'C' ) ) )THEN + INFO = 3 + ELSE IF( ( .NOT.LSAME( DIAG , 'U' ) ).AND. + $ ( .NOT.LSAME( DIAG , 'N' ) ) )THEN + INFO = 4 + ELSE IF( M .LT.0 )THEN + INFO = 5 + ELSE IF( N .LT.0 )THEN + INFO = 6 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 9 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 11 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DTRMM ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + B( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + RETURN + END IF +* +* Start the operations. +* + IF( LSIDE )THEN + IF( LSAME( TRANSA, 'N' ) )THEN +* +* Form B := alpha*A*B. +* + IF( UPPER )THEN + DO 50, J = 1, N + DO 40, K = 1, M + IF( B( K, J ).NE.ZERO )THEN + TEMP = ALPHA*B( K, J ) + DO 30, I = 1, K - 1 + B( I, J ) = B( I, J ) + TEMP*A( I, K ) + 30 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP*A( K, K ) + B( K, J ) = TEMP + END IF + 40 CONTINUE + 50 CONTINUE + ELSE + DO 80, J = 1, N + DO 70 K = M, 1, -1 + IF( B( K, J ).NE.ZERO )THEN + TEMP = ALPHA*B( K, J ) + B( K, J ) = TEMP + IF( NOUNIT ) + $ B( K, J ) = B( K, J )*A( K, K ) + DO 60, I = K + 1, M + B( I, J ) = B( I, J ) + TEMP*A( I, K ) + 60 CONTINUE + END IF + 70 CONTINUE + 80 CONTINUE + END IF + ELSE +* +* Form B := alpha*A'*B. +* + IF( UPPER )THEN + DO 110, J = 1, N + DO 100, I = M, 1, -1 + TEMP = B( I, J ) + IF( NOUNIT ) + $ TEMP = TEMP*A( I, I ) + DO 90, K = 1, I - 1 + TEMP = TEMP + A( K, I )*B( K, J ) + 90 CONTINUE + B( I, J ) = ALPHA*TEMP + 100 CONTINUE + 110 CONTINUE + ELSE + DO 140, J = 1, N + DO 130, I = 1, M + TEMP = B( I, J ) + IF( NOUNIT ) + $ TEMP = TEMP*A( I, I ) + DO 120, K = I + 1, M + TEMP = TEMP + A( K, I )*B( K, J ) + 120 CONTINUE + B( I, J ) = ALPHA*TEMP + 130 CONTINUE + 140 CONTINUE + END IF + END IF + ELSE + IF( LSAME( TRANSA, 'N' ) )THEN +* +* Form B := alpha*B*A. +* + IF( UPPER )THEN + DO 180, J = N, 1, -1 + TEMP = ALPHA + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 150, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 150 CONTINUE + DO 170, K = 1, J - 1 + IF( A( K, J ).NE.ZERO )THEN + TEMP = ALPHA*A( K, J ) + DO 160, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 160 CONTINUE + END IF + 170 CONTINUE + 180 CONTINUE + ELSE + DO 220, J = 1, N + TEMP = ALPHA + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 190, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 190 CONTINUE + DO 210, K = J + 1, N + IF( A( K, J ).NE.ZERO )THEN + TEMP = ALPHA*A( K, J ) + DO 200, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 200 CONTINUE + END IF + 210 CONTINUE + 220 CONTINUE + END IF + ELSE +* +* Form B := alpha*B*A'. +* + IF( UPPER )THEN + DO 260, K = 1, N + DO 240, J = 1, K - 1 + IF( A( J, K ).NE.ZERO )THEN + TEMP = ALPHA*A( J, K ) + DO 230, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 230 CONTINUE + END IF + 240 CONTINUE + TEMP = ALPHA + IF( NOUNIT ) + $ TEMP = TEMP*A( K, K ) + IF( TEMP.NE.ONE )THEN + DO 250, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 250 CONTINUE + END IF + 260 CONTINUE + ELSE + DO 300, K = N, 1, -1 + DO 280, J = K + 1, N + IF( A( J, K ).NE.ZERO )THEN + TEMP = ALPHA*A( J, K ) + DO 270, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 270 CONTINUE + END IF + 280 CONTINUE + TEMP = ALPHA + IF( NOUNIT ) + $ TEMP = TEMP*A( K, K ) + IF( TEMP.NE.ONE )THEN + DO 290, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 290 CONTINUE + END IF + 300 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of DTRMM . +* + END diff --git a/reference/dtrmvf.f b/reference/dtrmvf.f new file mode 100644 index 0000000..0619d3e --- /dev/null +++ b/reference/dtrmvf.f @@ -0,0 +1,286 @@ + SUBROUTINE DTRMVF ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, LDA, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* DTRMV performs one of the matrix-vector operations +* +* x := A*x, or x := A'*x, +* +* where x is an n element vector and A is an n by n unit, or non-unit, +* upper or lower triangular matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' x := A*x. +* +* TRANS = 'T' or 't' x := A'*x. +* +* TRANS = 'C' or 'c' x := A'*x. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. On exit, X is overwritten with the +* tranformed vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP + INTEGER I, INFO, IX, J, JX, KX + LOGICAL NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DTRMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOUNIT = LSAME( DIAG, 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form x := A*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + DO 10, I = 1, J - 1 + X( I ) = X( I ) + TEMP*A( I, J ) + 10 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*A( J, J ) + END IF + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 30, I = 1, J - 1 + X( IX ) = X( IX ) + TEMP*A( I, J ) + IX = IX + INCX + 30 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( J, J ) + END IF + JX = JX + INCX + 40 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 60, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + DO 50, I = N, J + 1, -1 + X( I ) = X( I ) + TEMP*A( I, J ) + 50 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*A( J, J ) + END IF + 60 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 80, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 70, I = N, J + 1, -1 + X( IX ) = X( IX ) + TEMP*A( I, J ) + IX = IX - INCX + 70 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( J, J ) + END IF + JX = JX - INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := A'*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 100, J = N, 1, -1 + TEMP = X( J ) + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 90, I = J - 1, 1, -1 + TEMP = TEMP + A( I, J )*X( I ) + 90 CONTINUE + X( J ) = TEMP + 100 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 120, J = N, 1, -1 + TEMP = X( JX ) + IX = JX + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 110, I = J - 1, 1, -1 + IX = IX - INCX + TEMP = TEMP + A( I, J )*X( IX ) + 110 CONTINUE + X( JX ) = TEMP + JX = JX - INCX + 120 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 140, J = 1, N + TEMP = X( J ) + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 130, I = J + 1, N + TEMP = TEMP + A( I, J )*X( I ) + 130 CONTINUE + X( J ) = TEMP + 140 CONTINUE + ELSE + JX = KX + DO 160, J = 1, N + TEMP = X( JX ) + IX = JX + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 150, I = J + 1, N + IX = IX + INCX + TEMP = TEMP + A( I, J )*X( IX ) + 150 CONTINUE + X( JX ) = TEMP + JX = JX + INCX + 160 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of DTRMV . +* + END diff --git a/reference/dtrsmf.f b/reference/dtrsmf.f new file mode 100644 index 0000000..be3b407 --- /dev/null +++ b/reference/dtrsmf.f @@ -0,0 +1,378 @@ + SUBROUTINE DTRSMF ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, + $ B, LDB ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + INTEGER M, N, LDA, LDB + DOUBLE PRECISION ALPHA +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* DTRSM solves one of the matrix equations +* +* op( A )*X = alpha*B, or X*op( A ) = alpha*B, +* +* where alpha is a scalar, X and B are m by n matrices, A is a unit, or +* non-unit, upper or lower triangular matrix and op( A ) is one of +* +* op( A ) = A or op( A ) = A'. +* +* The matrix X is overwritten on B. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether op( A ) appears on the left +* or right of X as follows: +* +* SIDE = 'L' or 'l' op( A )*X = alpha*B. +* +* SIDE = 'R' or 'r' X*op( A ) = alpha*B. +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix A is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANSA - CHARACTER*1. +* On entry, TRANSA specifies the form of op( A ) to be used in +* the matrix multiplication as follows: +* +* TRANSA = 'N' or 'n' op( A ) = A. +* +* TRANSA = 'T' or 't' op( A ) = A'. +* +* TRANSA = 'C' or 'c' op( A ) = A'. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit triangular +* as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of B. M must be at +* least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of B. N must be +* at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. When alpha is +* zero then A is not referenced and B need not be set before +* entry. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, k ), where k is m +* when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. +* Before entry with UPLO = 'U' or 'u', the leading k by k +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading k by k +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' +* then LDA must be at least max( 1, n ). +* Unchanged on exit. +* +* B - DOUBLE PRECISION array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the right-hand side matrix B, and on exit is +* overwritten by the solution matrix X. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL LSIDE, NOUNIT, UPPER + INTEGER I, INFO, J, K, NROWA + DOUBLE PRECISION TEMP +* .. Parameters .. + DOUBLE PRECISION ONE , ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + LSIDE = LSAME( SIDE , 'L' ) + IF( LSIDE )THEN + NROWA = M + ELSE + NROWA = N + END IF + NOUNIT = LSAME( DIAG , 'N' ) + UPPER = LSAME( UPLO , 'U' ) +* + INFO = 0 + IF( ( .NOT.LSIDE ).AND. + $ ( .NOT.LSAME( SIDE , 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 2 + ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'T' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'C' ) ) )THEN + INFO = 3 + ELSE IF( ( .NOT.LSAME( DIAG , 'U' ) ).AND. + $ ( .NOT.LSAME( DIAG , 'N' ) ) )THEN + INFO = 4 + ELSE IF( M .LT.0 )THEN + INFO = 5 + ELSE IF( N .LT.0 )THEN + INFO = 6 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 9 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 11 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DTRSM ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + B( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + RETURN + END IF +* +* Start the operations. +* + IF( LSIDE )THEN + IF( LSAME( TRANSA, 'N' ) )THEN +* +* Form B := alpha*inv( A )*B. +* + IF( UPPER )THEN + DO 60, J = 1, N + IF( ALPHA.NE.ONE )THEN + DO 30, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 30 CONTINUE + END IF + DO 50, K = M, 1, -1 + IF( B( K, J ).NE.ZERO )THEN + IF( NOUNIT ) + $ B( K, J ) = B( K, J )/A( K, K ) + DO 40, I = 1, K - 1 + B( I, J ) = B( I, J ) - B( K, J )*A( I, K ) + 40 CONTINUE + END IF + 50 CONTINUE + 60 CONTINUE + ELSE + DO 100, J = 1, N + IF( ALPHA.NE.ONE )THEN + DO 70, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 70 CONTINUE + END IF + DO 90 K = 1, M + IF( B( K, J ).NE.ZERO )THEN + IF( NOUNIT ) + $ B( K, J ) = B( K, J )/A( K, K ) + DO 80, I = K + 1, M + B( I, J ) = B( I, J ) - B( K, J )*A( I, K ) + 80 CONTINUE + END IF + 90 CONTINUE + 100 CONTINUE + END IF + ELSE +* +* Form B := alpha*inv( A' )*B. +* + IF( UPPER )THEN + DO 130, J = 1, N + DO 120, I = 1, M + TEMP = ALPHA*B( I, J ) + DO 110, K = 1, I - 1 + TEMP = TEMP - A( K, I )*B( K, J ) + 110 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( I, I ) + B( I, J ) = TEMP + 120 CONTINUE + 130 CONTINUE + ELSE + DO 160, J = 1, N + DO 150, I = M, 1, -1 + TEMP = ALPHA*B( I, J ) + DO 140, K = I + 1, M + TEMP = TEMP - A( K, I )*B( K, J ) + 140 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( I, I ) + B( I, J ) = TEMP + 150 CONTINUE + 160 CONTINUE + END IF + END IF + ELSE + IF( LSAME( TRANSA, 'N' ) )THEN +* +* Form B := alpha*B*inv( A ). +* + IF( UPPER )THEN + DO 210, J = 1, N + IF( ALPHA.NE.ONE )THEN + DO 170, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 170 CONTINUE + END IF + DO 190, K = 1, J - 1 + IF( A( K, J ).NE.ZERO )THEN + DO 180, I = 1, M + B( I, J ) = B( I, J ) - A( K, J )*B( I, K ) + 180 CONTINUE + END IF + 190 CONTINUE + IF( NOUNIT )THEN + TEMP = ONE/A( J, J ) + DO 200, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 200 CONTINUE + END IF + 210 CONTINUE + ELSE + DO 260, J = N, 1, -1 + IF( ALPHA.NE.ONE )THEN + DO 220, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 220 CONTINUE + END IF + DO 240, K = J + 1, N + IF( A( K, J ).NE.ZERO )THEN + DO 230, I = 1, M + B( I, J ) = B( I, J ) - A( K, J )*B( I, K ) + 230 CONTINUE + END IF + 240 CONTINUE + IF( NOUNIT )THEN + TEMP = ONE/A( J, J ) + DO 250, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 250 CONTINUE + END IF + 260 CONTINUE + END IF + ELSE +* +* Form B := alpha*B*inv( A' ). +* + IF( UPPER )THEN + DO 310, K = N, 1, -1 + IF( NOUNIT )THEN + TEMP = ONE/A( K, K ) + DO 270, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 270 CONTINUE + END IF + DO 290, J = 1, K - 1 + IF( A( J, K ).NE.ZERO )THEN + TEMP = A( J, K ) + DO 280, I = 1, M + B( I, J ) = B( I, J ) - TEMP*B( I, K ) + 280 CONTINUE + END IF + 290 CONTINUE + IF( ALPHA.NE.ONE )THEN + DO 300, I = 1, M + B( I, K ) = ALPHA*B( I, K ) + 300 CONTINUE + END IF + 310 CONTINUE + ELSE + DO 360, K = 1, N + IF( NOUNIT )THEN + TEMP = ONE/A( K, K ) + DO 320, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 320 CONTINUE + END IF + DO 340, J = K + 1, N + IF( A( J, K ).NE.ZERO )THEN + TEMP = A( J, K ) + DO 330, I = 1, M + B( I, J ) = B( I, J ) - TEMP*B( I, K ) + 330 CONTINUE + END IF + 340 CONTINUE + IF( ALPHA.NE.ONE )THEN + DO 350, I = 1, M + B( I, K ) = ALPHA*B( I, K ) + 350 CONTINUE + END IF + 360 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of DTRSM . +* + END diff --git a/reference/dtrsvf.f b/reference/dtrsvf.f new file mode 100644 index 0000000..2f4a702 --- /dev/null +++ b/reference/dtrsvf.f @@ -0,0 +1,289 @@ + SUBROUTINE DTRSVF ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, LDA, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* DTRSV solves one of the systems of equations +* +* A*x = b, or A'*x = b, +* +* where b and x are n element vectors and A is an n by n unit, or +* non-unit, upper or lower triangular matrix. +* +* No test for singularity or near-singularity is included in this +* routine. Such tests must be performed before calling this routine. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the equations to be solved as +* follows: +* +* TRANS = 'N' or 'n' A*x = b. +* +* TRANS = 'T' or 't' A'*x = b. +* +* TRANS = 'C' or 'c' A'*x = b. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element right-hand side vector b. On exit, X is overwritten +* with the solution vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP + INTEGER I, INFO, IX, J, JX, KX + LOGICAL NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DTRSV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOUNIT = LSAME( DIAG, 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form x := inv( A )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 20, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( J ) = X( J )/A( J, J ) + TEMP = X( J ) + DO 10, I = J - 1, 1, -1 + X( I ) = X( I ) - TEMP*A( I, J ) + 10 CONTINUE + END IF + 20 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 40, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/A( J, J ) + TEMP = X( JX ) + IX = JX + DO 30, I = J - 1, 1, -1 + IX = IX - INCX + X( IX ) = X( IX ) - TEMP*A( I, J ) + 30 CONTINUE + END IF + JX = JX - INCX + 40 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( J ) = X( J )/A( J, J ) + TEMP = X( J ) + DO 50, I = J + 1, N + X( I ) = X( I ) - TEMP*A( I, J ) + 50 CONTINUE + END IF + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/A( J, J ) + TEMP = X( JX ) + IX = JX + DO 70, I = J + 1, N + IX = IX + INCX + X( IX ) = X( IX ) - TEMP*A( I, J ) + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := inv( A' )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 100, J = 1, N + TEMP = X( J ) + DO 90, I = 1, J - 1 + TEMP = TEMP - A( I, J )*X( I ) + 90 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + X( J ) = TEMP + 100 CONTINUE + ELSE + JX = KX + DO 120, J = 1, N + TEMP = X( JX ) + IX = KX + DO 110, I = 1, J - 1 + TEMP = TEMP - A( I, J )*X( IX ) + IX = IX + INCX + 110 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + X( JX ) = TEMP + JX = JX + INCX + 120 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 140, J = N, 1, -1 + TEMP = X( J ) + DO 130, I = N, J + 1, -1 + TEMP = TEMP - A( I, J )*X( I ) + 130 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + X( J ) = TEMP + 140 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 160, J = N, 1, -1 + TEMP = X( JX ) + IX = KX + DO 150, I = N, J + 1, -1 + TEMP = TEMP - A( I, J )*X( IX ) + IX = IX - INCX + 150 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + X( JX ) = TEMP + JX = JX - INCX + 160 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of DTRSV . +* + END diff --git a/reference/dtrti2f.f b/reference/dtrti2f.f new file mode 100644 index 0000000..214d4f5 --- /dev/null +++ b/reference/dtrti2f.f @@ -0,0 +1,146 @@ + SUBROUTINE DTRTI2F( UPLO, DIAG, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER DIAG, UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* DTRTI2 computes the inverse of a real upper or lower triangular +* matrix. +* +* This is the Level 2 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the matrix A is upper or lower triangular. +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* DIAG (input) CHARACTER*1 +* Specifies whether or not the matrix A is unit triangular. +* = 'N': Non-unit triangular +* = 'U': Unit triangular +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the triangular matrix A. If UPLO = 'U', the +* leading n by n upper triangular part of the array A contains +* the upper triangular matrix, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading n by n lower triangular part of the array A contains +* the lower triangular matrix, and the strictly upper +* triangular part of A is not referenced. If DIAG = 'U', the +* diagonal elements of A are also not referenced and are +* assumed to be 1. +* +* On exit, the (triangular) inverse of the original matrix, in +* the same storage format. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE + PARAMETER ( ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL NOUNIT, UPPER + INTEGER J + DOUBLE PRECISION AJJ +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL DSCAL, DTRMV, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOUNIT = LSAME( DIAG, 'N' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DTRTI2', -INFO ) + RETURN + END IF +* + IF( UPPER ) THEN +* +* Compute inverse of upper triangular matrix. +* + DO 10 J = 1, N + IF( NOUNIT ) THEN + A( J, J ) = ONE / A( J, J ) + AJJ = -A( J, J ) + ELSE + AJJ = -ONE + END IF +* +* Compute elements 1:j-1 of j-th column. +* + CALL DTRMV( 'Upper', 'No transpose', DIAG, J-1, A, LDA, + $ A( 1, J ), 1 ) + CALL DSCAL( J-1, AJJ, A( 1, J ), 1 ) + 10 CONTINUE + ELSE +* +* Compute inverse of lower triangular matrix. +* + DO 20 J = N, 1, -1 + IF( NOUNIT ) THEN + A( J, J ) = ONE / A( J, J ) + AJJ = -A( J, J ) + ELSE + AJJ = -ONE + END IF + IF( J.LT.N ) THEN +* +* Compute elements j+1:n of j-th column. +* + CALL DTRMV( 'Lower', 'No transpose', DIAG, N-J, + $ A( J+1, J+1 ), LDA, A( J+1, J ), 1 ) + CALL DSCAL( N-J, AJJ, A( J+1, J ), 1 ) + END IF + 20 CONTINUE + END IF +* + RETURN +* +* End of DTRTI2 +* + END diff --git a/reference/dtrtrif.f b/reference/dtrtrif.f new file mode 100644 index 0000000..e2af835 --- /dev/null +++ b/reference/dtrtrif.f @@ -0,0 +1,176 @@ + SUBROUTINE DTRTRIF( UPLO, DIAG, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* March 31, 1993 +* +* .. Scalar Arguments .. + CHARACTER DIAG, UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* DTRTRI computes the inverse of a real upper or lower triangular +* matrix A. +* +* This is the Level 3 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': A is upper triangular; +* = 'L': A is lower triangular. +* +* DIAG (input) CHARACTER*1 +* = 'N': A is non-unit triangular; +* = 'U': A is unit triangular. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the triangular matrix A. If UPLO = 'U', the +* leading N-by-N upper triangular part of the array A contains +* the upper triangular matrix, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading N-by-N lower triangular part of the array A contains +* the lower triangular matrix, and the strictly upper +* triangular part of A is not referenced. If DIAG = 'U', the +* diagonal elements of A are also not referenced and are +* assumed to be 1. +* On exit, the (triangular) inverse of the original matrix, in +* the same storage format. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, A(i,i) is exactly zero. The triangular +* matrix is singular and its inverse can not be computed. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL NOUNIT, UPPER + INTEGER J, JB, NB, NN +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL DTRMM, DTRSM, DTRTI2, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOUNIT = LSAME( DIAG, 'N' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DTRTRI', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Check for singularity if non-unit. +* + IF( NOUNIT ) THEN + DO 10 INFO = 1, N + IF( A( INFO, INFO ).EQ.ZERO ) + $ RETURN + 10 CONTINUE + INFO = 0 + END IF +* +* Determine the block size for this environment. +* + NB = 128 + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code +* + CALL DTRTI2( UPLO, DIAG, N, A, LDA, INFO ) + ELSE +* +* Use blocked code +* + IF( UPPER ) THEN +* +* Compute inverse of upper triangular matrix +* + DO 20 J = 1, N, NB + JB = MIN( NB, N-J+1 ) +* +* Compute rows 1:j-1 of current block column +* + CALL DTRMM( 'Left', 'Upper', 'No transpose', DIAG, J-1, + $ JB, ONE, A, LDA, A( 1, J ), LDA ) + CALL DTRSM( 'Right', 'Upper', 'No transpose', DIAG, J-1, + $ JB, -ONE, A( J, J ), LDA, A( 1, J ), LDA ) +* +* Compute inverse of current diagonal block +* + CALL DTRTI2( 'Upper', DIAG, JB, A( J, J ), LDA, INFO ) + 20 CONTINUE + ELSE +* +* Compute inverse of lower triangular matrix +* + NN = ( ( N-1 ) / NB )*NB + 1 + DO 30 J = NN, 1, -NB + JB = MIN( NB, N-J+1 ) + IF( J+JB.LE.N ) THEN +* +* Compute rows j+jb:n of current block column +* + CALL DTRMM( 'Left', 'Lower', 'No transpose', DIAG, + $ N-J-JB+1, JB, ONE, A( J+JB, J+JB ), LDA, + $ A( J+JB, J ), LDA ) + CALL DTRSM( 'Right', 'Lower', 'No transpose', DIAG, + $ N-J-JB+1, JB, -ONE, A( J, J ), LDA, + $ A( J+JB, J ), LDA ) + END IF +* +* Compute inverse of current diagonal block +* + CALL DTRTI2( 'Lower', DIAG, JB, A( J, J ), LDA, INFO ) + 30 CONTINUE + END IF + END IF +* + RETURN +* +* End of DTRTRI +* + END diff --git a/reference/dzamaxf.f b/reference/dzamaxf.f new file mode 100644 index 0000000..e75cbc6 --- /dev/null +++ b/reference/dzamaxf.f @@ -0,0 +1,40 @@ + REAL*8 function dzamaxf(n,zx,incx) +c +c finds the index of element having max. absolute value. +c jack dongarra, 1/15/85. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + COMPLEX*16 zx(*) + integer i,incx,ix,n + double precision dcabs1 +c + dzamaxf = 0. + if( n.lt.1 .or. incx.le.0 )return + dzamaxf = dcabs1(zx(1)) + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + dzamaxf = dcabs1(zx(1)) + ix = ix + incx + do 10 i = 2,n + if(dcabs1(zx(ix)).le.dzamaxf) go to 5 + dzamaxf = i + dzamaxf = dcabs1(zx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 dzamaxf = dcabs1(zx(1)) + do 30 i = 2,n + if(dcabs1(zx(i)).le.dzamaxf) go to 30 + dzamaxf = i + dzamaxf = dcabs1(zx(i)) + 30 continue + return + end diff --git a/reference/dzaminf.f b/reference/dzaminf.f new file mode 100644 index 0000000..61f59e3 --- /dev/null +++ b/reference/dzaminf.f @@ -0,0 +1,38 @@ + REAL*8 function dzaminf(n,zx,incx) +c +c finds the index of element having min. absolute value. +c jack dongarra, 1/15/85. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + COMPLEX*16 zx(*) + integer i,incx,ix,n + double precision dcabs1 +c + dzaminf = 0. + if( n.lt.1 .or. incx.le.0 )return + dzaminf = dcabs1(zx(1)) + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + dzaminf = dcabs1(zx(1)) + ix = ix + incx + do 10 i = 2,n + if(dcabs1(zx(ix)).ge.dzaminf) go to 5 + dzaminf = dcabs1(zx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 dzaminf = dcabs1(zx(1)) + do 30 i = 2,n + if(dcabs1(zx(i)).ge.dzaminf) go to 30 + dzaminf = dcabs1(zx(i)) + 30 continue + return + end diff --git a/reference/dzasumf.f b/reference/dzasumf.f new file mode 100644 index 0000000..1b4dbdb --- /dev/null +++ b/reference/dzasumf.f @@ -0,0 +1,34 @@ + double precision function dzasumf(n,zx,incx) +c +c takes the sum of the absolute values. +c jack dongarra, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double complex zx(*) + double precision stemp,dcabs1 + integer i,incx,ix,n +c + dzasumf = 0.0d0 + stemp = 0.0d0 + if( n.le.0 .or. incx.le.0 )return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + do 10 i = 1,n + stemp = stemp + dcabs1(zx(ix)) + ix = ix + incx + 10 continue + dzasumf = stemp + return +c +c code for increment equal to 1 +c + 20 do 30 i = 1,n + stemp = stemp + dcabs1(zx(i)) + 30 continue + dzasumf = stemp + return + end diff --git a/reference/dznrm2f.f b/reference/dznrm2f.f new file mode 100644 index 0000000..1e9cba6 --- /dev/null +++ b/reference/dznrm2f.f @@ -0,0 +1,67 @@ + DOUBLE PRECISION FUNCTION DZNRM2F( N, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, N +* .. Array Arguments .. + COMPLEX*16 X( * ) +* .. +* +* DZNRM2 returns the euclidean norm of a vector via the function +* name, so that +* +* DZNRM2 := sqrt( conjg( x' )*x ) +* +* +* +* -- This version written on 25-October-1982. +* Modified on 14-October-1993 to inline the call to ZLASSQ. +* Sven Hammarling, Nag Ltd. +* +* +* .. Parameters .. + DOUBLE PRECISION ONE , ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. Local Scalars .. + INTEGER IX + DOUBLE PRECISION NORM, SCALE, SSQ, TEMP +* .. Intrinsic Functions .. + INTRINSIC ABS, DIMAG, DBLE, SQRT +* .. +* .. Executable Statements .. + IF( N.LT.1 .OR. INCX.LT.1 )THEN + NORM = ZERO + ELSE + SCALE = ZERO + SSQ = ONE +* The following loop is equivalent to this call to the LAPACK +* auxiliary routine: +* CALL ZLASSQ( N, X, INCX, SCALE, SSQ ) +* + DO 10, IX = 1, 1 + ( N - 1 )*INCX, INCX + IF( DBLE( X( IX ) ).NE.ZERO )THEN + TEMP = ABS( DBLE( X( IX ) ) ) + IF( SCALE.LT.TEMP )THEN + SSQ = ONE + SSQ*( SCALE/TEMP )**2 + SCALE = TEMP + ELSE + SSQ = SSQ + ( TEMP/SCALE )**2 + END IF + END IF + IF( DIMAG( X( IX ) ).NE.ZERO )THEN + TEMP = ABS( DIMAG( X( IX ) ) ) + IF( SCALE.LT.TEMP )THEN + SSQ = ONE + SSQ*( SCALE/TEMP )**2 + SCALE = TEMP + ELSE + SSQ = SSQ + ( TEMP/SCALE )**2 + END IF + END IF + 10 CONTINUE + NORM = SCALE * SQRT( SSQ ) + END IF +* + DZNRM2F = NORM + RETURN +* +* End of DZNRM2. +* + END diff --git a/reference/icamaxf.f b/reference/icamaxf.f new file mode 100644 index 0000000..928ad32 --- /dev/null +++ b/reference/icamaxf.f @@ -0,0 +1,41 @@ + integer function icamaxf(n,cx,incx) +c +c finds the index of element having max. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + complex cx(*) + real smax + integer i,incx,ix,n + real scabs1 +c + icamaxf = 0 + if( n.lt.1 .or. incx.le.0 ) return + icamaxf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + smax = scabs1(cx(1)) + ix = ix + incx + do 10 i = 2,n + if(scabs1(cx(ix)).le.smax) go to 5 + icamaxf = i + smax = scabs1(cx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 smax = scabs1(cx(1)) + do 30 i = 2,n + if(scabs1(cx(i)).le.smax) go to 30 + icamaxf = i + smax = scabs1(cx(i)) + 30 continue + return + end diff --git a/reference/icaminf.f b/reference/icaminf.f new file mode 100644 index 0000000..3535450 --- /dev/null +++ b/reference/icaminf.f @@ -0,0 +1,41 @@ + integer function icaminf(n,cx,incx) +c +c finds the index of element having min. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + complex cx(*) + real smin + integer i,incx,ix,n + real scabs1 +c + icaminf = 0 + if( n.lt.1 .or. incx.le.0 ) return + icaminf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + smin = scabs1(cx(1)) + ix = ix + incx + do 10 i = 2,n + if(scabs1(cx(ix)).ge.smin) go to 5 + icaminf = i + smin = scabs1(cx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 smin = scabs1(cx(1)) + do 30 i = 2,n + if(scabs1(cx(i)).ge.smin) go to 30 + icaminf = i + smin = scabs1(cx(i)) + 30 continue + return + end diff --git a/reference/idamaxf.f b/reference/idamaxf.f new file mode 100644 index 0000000..e1359e5 --- /dev/null +++ b/reference/idamaxf.f @@ -0,0 +1,39 @@ + integer function idamaxf(n,dx,incx) +c +c finds the index of element having max. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision dx(*),dmax + integer i,incx,ix,n +c + idamaxf = 0 + if( n.lt.1 .or. incx.le.0 ) return + idamaxf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + dmax = dabs(dx(1)) + ix = ix + incx + do 10 i = 2,n + if(dabs(dx(ix)).le.dmax) go to 5 + idamaxf = i + dmax = dabs(dx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 dmax = dabs(dx(1)) + do 30 i = 2,n + if(dabs(dx(i)).le.dmax) go to 30 + idamaxf = i + dmax = dabs(dx(i)) + 30 continue + return + end diff --git a/reference/idaminf.f b/reference/idaminf.f new file mode 100644 index 0000000..86e18cb --- /dev/null +++ b/reference/idaminf.f @@ -0,0 +1,39 @@ + integer function idaminf(n,dx,incx) +c +c finds the index of element having min. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision dx(*),dmin + integer i,incx,ix,n +c + idaminf = 0 + if( n.lt.1 .or. incx.le.0 ) return + idaminf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + dmin = dabs(dx(1)) + ix = ix + incx + do 10 i = 2,n + if(dabs(dx(ix)).ge.dmin) go to 5 + idaminf = i + dmin = dabs(dx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 dmin = dabs(dx(1)) + do 30 i = 2,n + if(dabs(dx(i)).ge.dmin) go to 30 + idaminf = i + dmin = dabs(dx(i)) + 30 continue + return + end diff --git a/reference/idmaxf.f b/reference/idmaxf.f new file mode 100644 index 0000000..9b0d25c --- /dev/null +++ b/reference/idmaxf.f @@ -0,0 +1,39 @@ + integer function idmaxf(n,dx,incx) +c +c finds the index of element having max. value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision dx(*),dmax + integer i,incx,ix,n +c + idmaxf = 0 + if( n.lt.1 .or. incx.le.0 ) return + idmaxf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + dmax = dx(1) + ix = ix + incx + do 10 i = 2,n + if(dx(ix).le.dmax) go to 5 + idmaxf = i + dmax = dx(ix) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 dmax = dx(1) + do 30 i = 2,n + if(dx(i).le.dmax) go to 30 + idmaxf = i + dmax = dx(i) + 30 continue + return + end diff --git a/reference/idminf.f b/reference/idminf.f new file mode 100644 index 0000000..4ba0b5e --- /dev/null +++ b/reference/idminf.f @@ -0,0 +1,39 @@ + integer function idminf(n,dx,incx) +c +c finds the index of element having min. value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision dx(*),dmin + integer i,incx,ix,n +c + idminf = 0 + if( n.lt.1 .or. incx.le.0 ) return + idminf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + dmin = dx(1) + ix = ix + incx + do 10 i = 2,n + if(dx(ix).ge.dmin) go to 5 + idminf = i + dmin = dx(ix) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 dmin = dx(1) + do 30 i = 2,n + if(dx(i).ge.dmin) go to 30 + idminf = i + dmin = dx(i) + 30 continue + return + end diff --git a/reference/iqamaxf.f b/reference/iqamaxf.f new file mode 100644 index 0000000..13e9fc7 --- /dev/null +++ b/reference/iqamaxf.f @@ -0,0 +1,48 @@ + REAL*10 function qabs(dx) + REAL*10 dx + + qabs = dx + if (dx >= 0) return + qabs = -dx + return + end + + integer function iqamaxf(n,dx,incx) +c +c finds the index of element having max. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + real*10 dx(*),dmax + integer i,incx,ix,n +c + iqamaxf = 0 + if( n.lt.1 .or. incx.le.0 ) return + iqamaxf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + dmax = qabs(dx(1)) + ix = ix + incx + do 10 i = 2,n + if(qabs(dx(ix)).le.dmax) go to 5 + iqamaxf = i + dmax = qabs(dx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 dmax = qabs(dx(1)) + do 30 i = 2,n + if(qabs(dx(i)).le.dmax) go to 30 + iqamaxf = i + dmax = qabs(dx(i)) + 30 continue + return + end diff --git a/reference/iqaminf.f b/reference/iqaminf.f new file mode 100644 index 0000000..1429be7 --- /dev/null +++ b/reference/iqaminf.f @@ -0,0 +1,49 @@ + REAL*10 function qabs(dx) + REAL*10 dx + + qabs = dx + if (dx >= 0) return + qabs = -dx + return + end + + + integer function iqaminf(n,dx,incx) +c +c finds the index of element having min. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + real*10 dx(*),dmin + integer i,incx,ix,n +c + iqaminf = 0 + if( n.lt.1 .or. incx.le.0 ) return + iqaminf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + dmin = qabs(dx(1)) + ix = ix + incx + do 10 i = 2,n + if(qabs(dx(ix)).ge.dmin) go to 5 + iqaminf = i + dmin = qabs(dx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 dmin = qabs(dx(1)) + do 30 i = 2,n + if(qabs(dx(i)).ge.dmin) go to 30 + iqaminf = i + dmin = qabs(dx(i)) + 30 continue + return + end diff --git a/reference/iqmaxf.f b/reference/iqmaxf.f new file mode 100644 index 0000000..782e4f2 --- /dev/null +++ b/reference/iqmaxf.f @@ -0,0 +1,39 @@ + integer function iqmaxf(n,dx,incx) +c +c finds the index of element having max. value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + real*10 dx(*),dmax + integer i,incx,ix,n +c + iqmaxf = 0 + if( n.lt.1 .or. incx.le.0 ) return + iqmaxf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + dmax = dx(1) + ix = ix + incx + do 10 i = 2,n + if(dx(ix).le.dmax) go to 5 + iqmaxf = i + dmax = dx(ix) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 dmax = dx(1) + do 30 i = 2,n + if(dx(i).le.dmax) go to 30 + iqmaxf = i + dmax = dx(i) + 30 continue + return + end diff --git a/reference/iqminf.f b/reference/iqminf.f new file mode 100644 index 0000000..bc75c2b --- /dev/null +++ b/reference/iqminf.f @@ -0,0 +1,39 @@ + integer function iqminf(n,dx,incx) +c +c finds the index of element having min. value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + real*10 dx(*),dmin + integer i,incx,ix,n +c + iqminf = 0 + if( n.lt.1 .or. incx.le.0 ) return + iqminf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + dmin = dx(1) + ix = ix + incx + do 10 i = 2,n + if(dx(ix).ge.dmin) go to 5 + iqminf = i + dmin = dx(ix) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 dmin = dx(1) + do 30 i = 2,n + if(dx(i).ge.dmin) go to 30 + iqminf = i + dmin = dx(i) + 30 continue + return + end diff --git a/reference/isamaxf.f b/reference/isamaxf.f new file mode 100644 index 0000000..95be5a5 --- /dev/null +++ b/reference/isamaxf.f @@ -0,0 +1,39 @@ + integer function isamaxf(n,sx,incx) +c +c finds the index of element having max. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + real sx(*),smax + integer i,incx,ix,n +c + isamaxf = 0 + if( n.lt.1 .or. incx.le.0 ) return + isamaxf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + smax = abs(sx(1)) + ix = ix + incx + do 10 i = 2,n + if(abs(sx(ix)).le.smax) go to 5 + isamaxf = i + smax = abs(sx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 smax = abs(sx(1)) + do 30 i = 2,n + if(abs(sx(i)).le.smax) go to 30 + isamaxf = i + smax = abs(sx(i)) + 30 continue + return + end diff --git a/reference/isaminf.f b/reference/isaminf.f new file mode 100644 index 0000000..83eb129 --- /dev/null +++ b/reference/isaminf.f @@ -0,0 +1,39 @@ + integer function isaminf(n,sx,incx) +c +c finds the index of element having min. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + real sx(*),smin + integer i,incx,ix,n +c + isaminf = 0 + if( n.lt.1 .or. incx.le.0 ) return + isaminf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + smin = abs(sx(1)) + ix = ix + incx + do 10 i = 2,n + if(abs(sx(ix)).ge.smin) go to 5 + isaminf = i + smin = abs(sx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 smin = abs(sx(1)) + do 30 i = 2,n + if(abs(sx(i)).ge.smin) go to 30 + isaminf = i + smin = abs(sx(i)) + 30 continue + return + end diff --git a/reference/ismaxf.f b/reference/ismaxf.f new file mode 100644 index 0000000..63cab5f --- /dev/null +++ b/reference/ismaxf.f @@ -0,0 +1,39 @@ + integer function ismaxf(n,sx,incx) +c +c finds the index of element having max. value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + real sx(*),smax + integer i,incx,ix,n +c + ismaxf = 0 + if( n.lt.1 .or. incx.le.0 ) return + ismaxf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + smax = sx(1) + ix = ix + incx + do 10 i = 2,n + if(sx(ix).le.smax) go to 5 + ismaxf = i + smax = sx(ix) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 smax = sx(1) + do 30 i = 2,n + if(sx(i).le.smax) go to 30 + ismaxf = i + smax = sx(i) + 30 continue + return + end diff --git a/reference/isminf.f b/reference/isminf.f new file mode 100644 index 0000000..dc59801 --- /dev/null +++ b/reference/isminf.f @@ -0,0 +1,39 @@ + integer function isminf(n,sx,incx) +c +c finds the index of element having min. value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + real sx(*),smin + integer i,incx,ix,n +c + isminf = 0 + if( n.lt.1 .or. incx.le.0 ) return + isminf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + smin = sx(1) + ix = ix + incx + do 10 i = 2,n + if(sx(ix).ge.smin) go to 5 + isminf = i + smin = sx(ix) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 smin = sx(1) + do 30 i = 2,n + if(sx(i).ge.smin) go to 30 + isminf = i + smin = sx(i) + 30 continue + return + end diff --git a/reference/ixamaxf.f b/reference/ixamaxf.f new file mode 100644 index 0000000..536602f --- /dev/null +++ b/reference/ixamaxf.f @@ -0,0 +1,41 @@ + integer function ixamaxf(n,zx,incx) +c +c finds the index of element having max. absolute value. +c jack dongarra, 1/15/85. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + complex*20 zx(*) + real*10 smax + integer i,incx,ix,n + real*10 qcabs1 +c + ixamaxf = 0 + if( n.lt.1 .or. incx.le.0 )return + ixamaxf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + smax = qcabs1(zx(1)) + ix = ix + incx + do 10 i = 2,n + if(qcabs1(zx(ix)).le.smax) go to 5 + ixamaxf = i + smax = qcabs1(zx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 smax = qcabs1(zx(1)) + do 30 i = 2,n + if(qcabs1(zx(i)).le.smax) go to 30 + ixamaxf = i + smax = qcabs1(zx(i)) + 30 continue + return + end diff --git a/reference/ixaminf.f b/reference/ixaminf.f new file mode 100644 index 0000000..8112e8b --- /dev/null +++ b/reference/ixaminf.f @@ -0,0 +1,41 @@ + integer function ixaminf(n,zx,incx) +c +c finds the index of element having min. absolute value. +c jack dongarra, 1/15/85. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + complex*20 zx(*) + real*10 smin + integer i,incx,ix,n + real*10 qcabs1 +c + ixaminf = 0 + if( n.lt.1 .or. incx.le.0 )return + ixaminf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + smin = qcabs1(zx(1)) + ix = ix + incx + do 10 i = 2,n + if(qcabs1(zx(ix)).ge.smin) go to 5 + ixaminf = i + smin = qcabs1(zx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 smin = qcabs1(zx(1)) + do 30 i = 2,n + if(qcabs1(zx(i)).ge.smin) go to 30 + ixaminf = i + smin = qcabs1(zx(i)) + 30 continue + return + end diff --git a/reference/izamaxf.f b/reference/izamaxf.f new file mode 100644 index 0000000..902c014 --- /dev/null +++ b/reference/izamaxf.f @@ -0,0 +1,41 @@ + integer function izamaxf(n,zx,incx) +c +c finds the index of element having max. absolute value. +c jack dongarra, 1/15/85. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double complex zx(*) + double precision smax + integer i,incx,ix,n + double precision dcabs1 +c + izamaxf = 0 + if( n.lt.1 .or. incx.le.0 )return + izamaxf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + smax = dcabs1(zx(1)) + ix = ix + incx + do 10 i = 2,n + if(dcabs1(zx(ix)).le.smax) go to 5 + izamaxf = i + smax = dcabs1(zx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 smax = dcabs1(zx(1)) + do 30 i = 2,n + if(dcabs1(zx(i)).le.smax) go to 30 + izamaxf = i + smax = dcabs1(zx(i)) + 30 continue + return + end diff --git a/reference/izaminf.f b/reference/izaminf.f new file mode 100644 index 0000000..8779849 --- /dev/null +++ b/reference/izaminf.f @@ -0,0 +1,41 @@ + integer function izaminf(n,zx,incx) +c +c finds the index of element having min. absolute value. +c jack dongarra, 1/15/85. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double complex zx(*) + double precision smin + integer i,incx,ix,n + double precision dcabs1 +c + izaminf = 0 + if( n.lt.1 .or. incx.le.0 )return + izaminf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + smin = dcabs1(zx(1)) + ix = ix + incx + do 10 i = 2,n + if(dcabs1(zx(ix)).ge.smin) go to 5 + izaminf = i + smin = dcabs1(zx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 smin = dcabs1(zx(1)) + do 30 i = 2,n + if(dcabs1(zx(i)).ge.smin) go to 30 + izaminf = i + smin = dcabs1(zx(i)) + 30 continue + return + end diff --git a/reference/lsamef.f b/reference/lsamef.f new file mode 100644 index 0000000..f895174 --- /dev/null +++ b/reference/lsamef.f @@ -0,0 +1,87 @@ + LOGICAL FUNCTION LSAME( CA, CB ) +* +* -- LAPACK auxiliary routine (version 2.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* January 31, 1994 +* +* .. Scalar Arguments .. + CHARACTER CA, CB +* .. +* +* Purpose +* ======= +* +* LSAME returns .TRUE. if CA is the same letter as CB regardless of +* case. +* +* Arguments +* ========= +* +* CA (input) CHARACTER*1 +* CB (input) CHARACTER*1 +* CA and CB specify the single characters to be compared. +* +* ===================================================================== +* +* .. Intrinsic Functions .. + INTRINSIC ICHAR +* .. +* .. Local Scalars .. + INTEGER INTA, INTB, ZCODE +* .. +* .. Executable Statements .. +* +* Test if the characters are equal +* + LSAME = CA.EQ.CB + IF( LSAME ) + $ RETURN +* +* Now test for equivalence if both characters are alphabetic. +* + ZCODE = ICHAR( 'Z' ) +* +* Use 'Z' rather than 'A' so that ASCII can be detected on Prime +* machines, on which ICHAR returns a value with bit 8 set. +* ICHAR('A') on Prime machines returns 193 which is the same as +* ICHAR('A') on an EBCDIC machine. +* + INTA = ICHAR( CA ) + INTB = ICHAR( CB ) +* + IF( ZCODE.EQ.90 .OR. ZCODE.EQ.122 ) THEN +* +* ASCII is assumed - ZCODE is the ASCII code of either lower or +* upper case 'Z'. +* + IF( INTA.GE.97 .AND. INTA.LE.122 ) INTA = INTA - 32 + IF( INTB.GE.97 .AND. INTB.LE.122 ) INTB = INTB - 32 +* + ELSE IF( ZCODE.EQ.233 .OR. ZCODE.EQ.169 ) THEN +* +* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or +* upper case 'Z'. +* + IF( INTA.GE.129 .AND. INTA.LE.137 .OR. + $ INTA.GE.145 .AND. INTA.LE.153 .OR. + $ INTA.GE.162 .AND. INTA.LE.169 ) INTA = INTA + 64 + IF( INTB.GE.129 .AND. INTB.LE.137 .OR. + $ INTB.GE.145 .AND. INTB.LE.153 .OR. + $ INTB.GE.162 .AND. INTB.LE.169 ) INTB = INTB + 64 +* + ELSE IF( ZCODE.EQ.218 .OR. ZCODE.EQ.250 ) THEN +* +* ASCII is assumed, on Prime machines - ZCODE is the ASCII code +* plus 128 of either lower or upper case 'Z'. +* + IF( INTA.GE.225 .AND. INTA.LE.250 ) INTA = INTA - 32 + IF( INTB.GE.225 .AND. INTB.LE.250 ) INTB = INTB - 32 + END IF + LSAME = INTA.EQ.INTB +* +* RETURN +* +* End of LSAME +* + END diff --git a/reference/samaxf.f b/reference/samaxf.f new file mode 100644 index 0000000..ef0b80f --- /dev/null +++ b/reference/samaxf.f @@ -0,0 +1,36 @@ + REAL*4 function samaxf(n,dx,incx) +c +c finds the index of element having max. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + REAL*4 dx(*) + integer i,incx,ix,n +c + samaxf = 0. + if( n.lt.1 .or. incx.le.0 ) return + + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + samaxf = abs(dx(1)) + ix = ix + incx + do 10 i = 2,n + if(abs(dx(ix)).le.samaxf) go to 5 + samaxf = abs(dx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 samaxf = abs(dx(1)) + do 30 i = 2,n + if(abs(dx(i)).le.samaxf) go to 30 + samaxf = abs(dx(i)) + 30 continue + return + end diff --git a/reference/saminf.f b/reference/saminf.f new file mode 100644 index 0000000..455436b --- /dev/null +++ b/reference/saminf.f @@ -0,0 +1,36 @@ + REAL*4 function saminf(n,dx,incx) +c +c finds the index of element having min. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + REAL*4 dx(*) + integer i,incx,ix,n +c + saminf = 0 + if( n.lt.1 .or. incx.le.0 ) return + + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + saminf = abs(dx(1)) + ix = ix + incx + do 10 i = 2,n + if(abs(dx(ix)).ge.saminf) go to 5 + saminf = abs(dx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 saminf = abs(dx(1)) + do 30 i = 2,n + if(abs(dx(i)).ge.saminf) go to 30 + saminf = abs(dx(i)) + 30 continue + return + end diff --git a/reference/sasumf.f b/reference/sasumf.f new file mode 100644 index 0000000..bf3805b --- /dev/null +++ b/reference/sasumf.f @@ -0,0 +1,44 @@ + real function sasumf(n,sx,incx) +c +c takes the sum of the absolute values. +c uses unrolled loops for increment equal to one. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + real sx(*),stemp + integer i,incx,m,mp1,n,nincx +c + sasumf = 0.0e0 + stemp = 0.0e0 + if( n.le.0 .or. incx.le.0 )return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + nincx = n*incx + do 10 i = 1,nincx,incx + stemp = stemp + abs(sx(i)) + 10 continue + sasumf = stemp + return +c +c code for increment equal to 1 +c +c +c clean-up loop +c + 20 m = mod(n,6) + if( m .eq. 0 ) go to 40 + do 30 i = 1,m + stemp = stemp + abs(sx(i)) + 30 continue + if( n .lt. 6 ) go to 60 + 40 mp1 = m + 1 + do 50 i = mp1,n,6 + stemp = stemp + abs(sx(i)) + abs(sx(i + 1)) + abs(sx(i + 2)) + * + abs(sx(i + 3)) + abs(sx(i + 4)) + abs(sx(i + 5)) + 50 continue + 60 sasumf = stemp + return + end diff --git a/reference/saxpyf.f b/reference/saxpyf.f new file mode 100644 index 0000000..95f1e01 --- /dev/null +++ b/reference/saxpyf.f @@ -0,0 +1,48 @@ + subroutine saxpyf(n,sa,sx,incx,sy,incy) +c +c constant times a vector plus a vector. +c uses unrolled loop for increments equal to one. +c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + real sx(*),sy(*),sa + integer i,incx,incy,ix,iy,m,mp1,n +c + if(n.le.0)return + if (sa .eq. 0.0) return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + sy(iy) = sy(iy) + sa*sx(ix) + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c +c +c clean-up loop +c + 20 m = mod(n,4) + if( m .eq. 0 ) go to 40 + do 30 i = 1,m + sy(i) = sy(i) + sa*sx(i) + 30 continue + if( n .lt. 4 ) return + 40 mp1 = m + 1 + do 50 i = mp1,n,4 + sy(i) = sy(i) + sa*sx(i) + sy(i + 1) = sy(i + 1) + sa*sx(i + 1) + sy(i + 2) = sy(i + 2) + sa*sx(i + 2) + sy(i + 3) = sy(i + 3) + sa*sx(i + 3) + 50 continue + return + end diff --git a/reference/scamaxf.f b/reference/scamaxf.f new file mode 100644 index 0000000..f3d0a51 --- /dev/null +++ b/reference/scamaxf.f @@ -0,0 +1,40 @@ + REAL*4 function scamaxf(n,zx,incx) +c +c finds the index of element having max. absolute value. +c jack dongarra, 1/15/85. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + COMPLEX*8 zx(*) + integer i,incx,ix,n + REAL*4 scabs1 +c + scamaxf = 0. + if( n.lt.1 .or. incx.le.0 )return + scamaxf = scabs1(zx(1)) + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + scamaxf = scabs1(zx(1)) + ix = ix + incx + do 10 i = 2,n + if(scabs1(zx(ix)).le.scamaxf) go to 5 + scamaxf = i + scamaxf = scabs1(zx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 scamaxf = scabs1(zx(1)) + do 30 i = 2,n + if(scabs1(zx(i)).le.scamaxf) go to 30 + scamaxf = i + scamaxf = scabs1(zx(i)) + 30 continue + return + end diff --git a/reference/scaminf.f b/reference/scaminf.f new file mode 100644 index 0000000..e6a6e91 --- /dev/null +++ b/reference/scaminf.f @@ -0,0 +1,38 @@ + REAL*4 function scaminf(n,zx,incx) +c +c finds the index of element having min. absolute value. +c jack dongarra, 1/15/85. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + COMPLEX*8 zx(*) + integer i,incx,ix,n + REAL*4 scabs1 +c + scaminf = 0. + if( n.lt.1 .or. incx.le.0 )return + scaminf = scabs1(zx(1)) + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + scaminf = scabs1(zx(1)) + ix = ix + incx + do 10 i = 2,n + if(scabs1(zx(ix)).ge.scaminf) go to 5 + scaminf = scabs1(zx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 scaminf = scabs1(zx(1)) + do 30 i = 2,n + if(scabs1(zx(i)).ge.scaminf) go to 30 + scaminf = scabs1(zx(i)) + 30 continue + return + end diff --git a/reference/scasumf.f b/reference/scasumf.f new file mode 100644 index 0000000..6cc139f --- /dev/null +++ b/reference/scasumf.f @@ -0,0 +1,34 @@ + real function scasumf(n,cx,incx) +c +c takes the sum of the absolute values of a complex vector and +c returns a single precision result. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + complex cx(*) + real stemp + integer i,incx,n,nincx +c + scasumf = 0.0e0 + stemp = 0.0e0 + if( n.le.0 .or. incx.le.0 )return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + nincx = n*incx + do 10 i = 1,nincx,incx + stemp = stemp + abs(real(cx(i))) + abs(aimag(cx(i))) + 10 continue + scasumf = stemp + return +c +c code for increment equal to 1 +c + 20 do 30 i = 1,n + stemp = stemp + abs(real(cx(i))) + abs(aimag(cx(i))) + 30 continue + scasumf = stemp + return + end diff --git a/reference/scnrm2f.f b/reference/scnrm2f.f new file mode 100644 index 0000000..d7e0b37 --- /dev/null +++ b/reference/scnrm2f.f @@ -0,0 +1,67 @@ + REAL FUNCTION SCNRM2F( N, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, N +* .. Array Arguments .. + COMPLEX X( * ) +* .. +* +* SCNRM2 returns the euclidean norm of a vector via the function +* name, so that +* +* SCNRM2 := sqrt( conjg( x' )*x ) +* +* +* +* -- This version written on 25-October-1982. +* Modified on 14-October-1993 to inline the call to CLASSQ. +* Sven Hammarling, Nag Ltd. +* +* +* .. Parameters .. + REAL ONE , ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. Local Scalars .. + INTEGER IX + REAL NORM, SCALE, SSQ, TEMP +* .. Intrinsic Functions .. + INTRINSIC ABS, AIMAG, REAL, SQRT +* .. +* .. Executable Statements .. + IF( N.LT.1 .OR. INCX.LT.1 )THEN + NORM = ZERO + ELSE + SCALE = ZERO + SSQ = ONE +* The following loop is equivalent to this call to the LAPACK +* auxiliary routine: +* CALL CLASSQ( N, X, INCX, SCALE, SSQ ) +* + DO 10, IX = 1, 1 + ( N - 1 )*INCX, INCX + IF( REAL( X( IX ) ).NE.ZERO )THEN + TEMP = ABS( REAL( X( IX ) ) ) + IF( SCALE.LT.TEMP )THEN + SSQ = ONE + SSQ*( SCALE/TEMP )**2 + SCALE = TEMP + ELSE + SSQ = SSQ + ( TEMP/SCALE )**2 + END IF + END IF + IF( AIMAG( X( IX ) ).NE.ZERO )THEN + TEMP = ABS( AIMAG( X( IX ) ) ) + IF( SCALE.LT.TEMP )THEN + SSQ = ONE + SSQ*( SCALE/TEMP )**2 + SCALE = TEMP + ELSE + SSQ = SSQ + ( TEMP/SCALE )**2 + END IF + END IF + 10 CONTINUE + NORM = SCALE * SQRT( SSQ ) + END IF +* + SCNRM2F = NORM + RETURN +* +* End of SCNRM2. +* + END diff --git a/reference/scopyf.f b/reference/scopyf.f new file mode 100644 index 0000000..bec1584 --- /dev/null +++ b/reference/scopyf.f @@ -0,0 +1,50 @@ + subroutine scopyf(n,sx,incx,sy,incy) +c +c copies a vector, x, to a vector, y. +c uses unrolled loops for increments equal to 1. +c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + real sx(*),sy(*) + integer i,incx,incy,ix,iy,m,mp1,n +c + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + sy(iy) = sx(ix) + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c +c +c clean-up loop +c + 20 m = mod(n,7) + if( m .eq. 0 ) go to 40 + do 30 i = 1,m + sy(i) = sx(i) + 30 continue + if( n .lt. 7 ) return + 40 mp1 = m + 1 + do 50 i = mp1,n,7 + sy(i) = sx(i) + sy(i + 1) = sx(i + 1) + sy(i + 2) = sx(i + 2) + sy(i + 3) = sx(i + 3) + sy(i + 4) = sx(i + 4) + sy(i + 5) = sx(i + 5) + sy(i + 6) = sx(i + 6) + 50 continue + return + end diff --git a/reference/sdotf.f b/reference/sdotf.f new file mode 100644 index 0000000..dabda7c --- /dev/null +++ b/reference/sdotf.f @@ -0,0 +1,49 @@ + real function sdotf(n,sx,incx,sy,incy) +c +c forms the dot product of two vectors. +c uses unrolled loops for increments equal to one. +c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + real sx(*),sy(*),stemp + integer i,incx,incy,ix,iy,m,mp1,n +c + stemp = 0.0e0 + sdotf = 0.0e0 + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + stemp = stemp + sx(ix)*sy(iy) + ix = ix + incx + iy = iy + incy + 10 continue + sdotf = stemp + return +c +c code for both increments equal to 1 +c +c +c clean-up loop +c + 20 m = mod(n,5) + if( m .eq. 0 ) go to 40 + do 30 i = 1,m + stemp = stemp + sx(i)*sy(i) + 30 continue + if( n .lt. 5 ) go to 60 + 40 mp1 = m + 1 + do 50 i = mp1,n,5 + stemp = stemp + sx(i)*sy(i) + sx(i + 1)*sy(i + 1) + + * sx(i + 2)*sy(i + 2) + sx(i + 3)*sy(i + 3) + sx(i + 4)*sy(i + 4) + 50 continue + 60 sdotf = stemp + return + end diff --git a/reference/sdsdotf.f b/reference/sdsdotf.f new file mode 100644 index 0000000..c3aa6a5 --- /dev/null +++ b/reference/sdsdotf.f @@ -0,0 +1,78 @@ +*DECK SDSDOTF + REAL FUNCTION SDSDOTF (N, SB, SX, INCX, SY, INCY) +C***BEGIN PROLOGUE SDSDOT +C***PURPOSE Compute the inner product of two vectors with extended +C precision accumulation. +C***LIBRARY SLATEC (BLAS) +C***CATEGORY D1A4 +C***TYPE SINGLE PRECISION (SDSDOT-S, CDCDOT-C) +C***KEYWORDS BLAS, DOT PRODUCT, INNER PRODUCT, LINEAR ALGEBRA, VECTOR +C***AUTHOR Lawson, C. L., (JPL) +C Hanson, R. J., (SNLA) +C Kincaid, D. R., (U. of Texas) +C Krogh, F. T., (JPL) +C***DESCRIPTION +C +C B L A S Subprogram +C Description of Parameters +C +C --Input-- +C N number of elements in input vector(s) +C SB single precision scalar to be added to inner product +C SX single precision vector with N elements +C INCX storage spacing between elements of SX +C SY single precision vector with N elements +C INCY storage spacing between elements of SY +C +C --Output-- +C SDSDOT single precision dot product (SB if N .LE. 0) +C +C Returns S.P. result with dot product accumulated in D.P. +C SDSDOT = SB + sum for I = 0 to N-1 of SX(LX+I*INCX)*SY(LY+I*INCY), +C where LX = 1 if INCX .GE. 0, else LX = 1+(1-N)*INCX, and LY is +C defined in a similar way using INCY. +C +C***REFERENCES C. L. Lawson, R. J. Hanson, D. R. Kincaid and F. T. +C Krogh, Basic linear algebra subprograms for Fortran +C usage, Algorithm No. 539, Transactions on Mathematical +C Software 5, 3 (September 1979), pp. 308-323. +C***ROUTINES CALLED (NONE) +C***REVISION HISTORY (YYMMDD) +C 791001 DATE WRITTEN +C 890531 Changed all specific intrinsics to generic. (WRB) +C 890831 Modified array declarations. (WRB) +C 890831 REVISION DATE from Version 3.2 +C 891214 Prologue converted to Version 4.0 format. (BAB) +C 920310 Corrected definition of LX in DESCRIPTION. (WRB) +C 920501 Reformatted the REFERENCES section. (WRB) +C***END PROLOGUE SDSDOT + REAL SX(*), SY(*), SB + DOUBLE PRECISION DSDOT +C***FIRST EXECUTABLE STATEMENT SDSDOT + DSDOT = SB + IF (N .LE. 0) GO TO 30 + IF (INCX.EQ.INCY .AND. INCX.GT.0) GO TO 40 +C +C Code for unequal or nonpositive increments. +C + KX = 1 + KY = 1 + IF (INCX .LT. 0) KX = 1+(1-N)*INCX + IF (INCY .LT. 0) KY = 1+(1-N)*INCY + DO 10 I = 1,N + DSDOT = DSDOT + DBLE(SX(KX))*DBLE(SY(KY)) + KX = KX + INCX + KY = KY + INCY + 10 CONTINUE + 30 SDSDOTF = DSDOT + RETURN +C +C Code for equal and positive increments. +C + 40 NS = N*INCX + DO 50 I = 1,NS,INCX + DSDOT = DSDOT + DBLE(SX(I))*DBLE(SY(I)) + 50 CONTINUE + SDSDOTF = DSDOT + RETURN + END diff --git a/reference/sgbmvf.f b/reference/sgbmvf.f new file mode 100644 index 0000000..c8bc9ff --- /dev/null +++ b/reference/sgbmvf.f @@ -0,0 +1,300 @@ + SUBROUTINE SGBMVF( TRANS, M, N, KL, KU, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + REAL ALPHA, BETA + INTEGER INCX, INCY, KL, KU, LDA, M, N + CHARACTER*1 TRANS +* .. Array Arguments .. + REAL A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* SGBMV performs one of the matrix-vector operations +* +* y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, +* +* where alpha and beta are scalars, x and y are vectors and A is an +* m by n band matrix, with kl sub-diagonals and ku super-diagonals. +* +* Parameters +* ========== +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' y := alpha*A*x + beta*y. +* +* TRANS = 'T' or 't' y := alpha*A'*x + beta*y. +* +* TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* KL - INTEGER. +* On entry, KL specifies the number of sub-diagonals of the +* matrix A. KL must satisfy 0 .le. KL. +* Unchanged on exit. +* +* KU - INTEGER. +* On entry, KU specifies the number of super-diagonals of the +* matrix A. KU must satisfy 0 .le. KU. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, n ). +* Before entry, the leading ( kl + ku + 1 ) by n part of the +* array A must contain the matrix of coefficients, supplied +* column by column, with the leading diagonal of the matrix in +* row ( ku + 1 ) of the array, the first super-diagonal +* starting at position 2 in row ku, the first sub-diagonal +* starting at position 1 in row ( ku + 2 ), and so on. +* Elements in the array A that do not correspond to elements +* in the band matrix (such as the top left ku by ku triangle) +* are not referenced. +* The following program segment will transfer a band matrix +* from conventional full matrix storage to band storage: +* +* DO 20, J = 1, N +* K = KU + 1 - J +* DO 10, I = MAX( 1, J - KU ), MIN( M, J + KL ) +* A( K + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( kl + ku + 1 ). +* Unchanged on exit. +* +* X - REAL array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - REAL . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - REAL array of DIMENSION at least +* ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. +* Before entry, the incremented array Y must contain the +* vector y. On exit, Y is overwritten by the updated vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* .. Parameters .. + REAL ONE , ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP + INTEGER I, INFO, IX, IY, J, JX, JY, K, KUP1, KX, KY, + $ LENX, LENY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 1 + ELSE IF( M.LT.0 )THEN + INFO = 2 + ELSE IF( N.LT.0 )THEN + INFO = 3 + ELSE IF( KL.LT.0 )THEN + INFO = 4 + ELSE IF( KU.LT.0 )THEN + INFO = 5 + ELSE IF( LDA.LT.( KL + KU + 1 ) )THEN + INFO = 8 + ELSE IF( INCX.EQ.0 )THEN + INFO = 10 + ELSE IF( INCY.EQ.0 )THEN + INFO = 13 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'SGBMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set LENX and LENY, the lengths of the vectors x and y, and set +* up the start points in X and Y. +* + IF( LSAME( TRANS, 'N' ) )THEN + LENX = N + LENY = M + ELSE + LENX = M + LENY = N + END IF + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( LENX - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( LENY - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the band part of A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, LENY + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, LENY + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, LENY + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, LENY + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + KUP1 = KU + 1 + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form y := alpha*A*x + y. +* + JX = KX + IF( INCY.EQ.1 )THEN + DO 60, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + K = KUP1 - J + DO 50, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( I ) = Y( I ) + TEMP*A( K + I, J ) + 50 CONTINUE + END IF + JX = JX + INCX + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + IY = KY + K = KUP1 - J + DO 70, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( IY ) = Y( IY ) + TEMP*A( K + I, J ) + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + IF( J.GT.KU ) + $ KY = KY + INCY + 80 CONTINUE + END IF + ELSE +* +* Form y := alpha*A'*x + y. +* + JY = KY + IF( INCX.EQ.1 )THEN + DO 100, J = 1, N + TEMP = ZERO + K = KUP1 - J + DO 90, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + A( K + I, J )*X( I ) + 90 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + 100 CONTINUE + ELSE + DO 120, J = 1, N + TEMP = ZERO + IX = KX + K = KUP1 - J + DO 110, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + A( K + I, J )*X( IX ) + IX = IX + INCX + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + IF( J.GT.KU ) + $ KX = KX + INCX + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of SGBMV . +* + END diff --git a/reference/sgemmf.f b/reference/sgemmf.f new file mode 100644 index 0000000..ebb50c3 --- /dev/null +++ b/reference/sgemmf.f @@ -0,0 +1,313 @@ + SUBROUTINE SGEMMF(TRANA,TRANB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) +* .. Scalar Arguments .. + REAL ALPHA,BETA + INTEGER K,LDA,LDB,LDC,M,N + CHARACTER TRANA,TRANB +* .. +* .. Array Arguments .. + REAL A(LDA,*),B(LDB,*),C(LDC,*) +* .. +* +* Purpose +* ======= +* +* SGEMM performs one of the matrix-matrix operations +* +* C := alpha*op( A )*op( B ) + beta*C, +* +* where op( X ) is one of +* +* op( X ) = X or op( X ) = X', +* +* alpha and beta are scalars, and A, B and C are matrices, with op( A ) +* an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. +* +* Arguments +* ========== +* +* TRANA - CHARACTER*1. +* On entry, TRANA specifies the form of op( A ) to be used in +* the matrix multiplication as follows: +* +* TRANA = 'N' or 'n', op( A ) = A. +* +* TRANA = 'T' or 't', op( A ) = A'. +* +* TRANA = 'C' or 'c', op( A ) = A'. +* +* Unchanged on exit. +* +* TRANB - CHARACTER*1. +* On entry, TRANB specifies the form of op( B ) to be used in +* the matrix multiplication as follows: +* +* TRANB = 'N' or 'n', op( B ) = B. +* +* TRANB = 'T' or 't', op( B ) = B'. +* +* TRANB = 'C' or 'c', op( B ) = B'. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix +* op( A ) and of the matrix C. M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix +* op( B ) and the number of columns of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry, K specifies the number of columns of the matrix +* op( A ) and the number of rows of the matrix op( B ). K must +* be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, ka ), where ka is +* k when TRANA = 'N' or 'n', and is m otherwise. +* Before entry with TRANA = 'N' or 'n', the leading m by k +* part of the array A must contain the matrix A, otherwise +* the leading k by m part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANA = 'N' or 'n' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, k ). +* Unchanged on exit. +* +* B - REAL array of DIMENSION ( LDB, kb ), where kb is +* n when TRANB = 'N' or 'n', and is k otherwise. +* Before entry with TRANB = 'N' or 'n', the leading k by n +* part of the array B must contain the matrix B, otherwise +* the leading n by k part of the array B must contain the +* matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. When TRANB = 'N' or 'n' then +* LDB must be at least max( 1, k ), otherwise LDB must be at +* least max( 1, n ). +* Unchanged on exit. +* +* BETA - REAL . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - REAL array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n matrix +* ( alpha*op( A )*op( B ) + beta*C ). +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Local Scalars .. + REAL TEMP + INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB + LOGICAL NOTA,NOTB +* .. +* .. Parameters .. + REAL ONE,ZERO + PARAMETER (ONE=1.0E+0,ZERO=0.0E+0) +* .. +* +* Set NOTA and NOTB as true if A and B respectively are not +* transposed and set NROWA, NCOLA and NROWB as the number of rows +* and columns of A and the number of rows of B respectively. +* + NOTA = LSAME(TRANA,'N') + NOTB = LSAME(TRANB,'N') + IF (NOTA) THEN + NROWA = M + NCOLA = K + ELSE + NROWA = K + NCOLA = M + END IF + IF (NOTB) THEN + NROWB = K + ELSE + NROWB = N + END IF +* +* Test the input parameters. +* + INFO = 0 + IF ((.NOT.NOTA) .AND. (.NOT.LSAME(TRANA,'C')) .AND. + + (.NOT.LSAME(TRANA,'T'))) THEN + INFO = 1 + ELSE IF ((.NOT.NOTB) .AND. (.NOT.LSAME(TRANB,'C')) .AND. + + (.NOT.LSAME(TRANB,'T'))) THEN + INFO = 2 + ELSE IF (M.LT.0) THEN + INFO = 3 + ELSE IF (N.LT.0) THEN + INFO = 4 + ELSE IF (K.LT.0) THEN + INFO = 5 + ELSE IF (LDA.LT.MAX(1,NROWA)) THEN + INFO = 8 + ELSE IF (LDB.LT.MAX(1,NROWB)) THEN + INFO = 10 + ELSE IF (LDC.LT.MAX(1,M)) THEN + INFO = 13 + END IF + IF (INFO.NE.0) THEN + CALL XERBLA('SGEMM ',INFO) + RETURN + END IF +* +* Quick return if possible. +* + IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + + (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN +* +* And if alpha.eq.zero. +* + IF (ALPHA.EQ.ZERO) THEN + IF (BETA.EQ.ZERO) THEN + DO 20 J = 1,N + DO 10 I = 1,M + C(I,J) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40 J = 1,N + DO 30 I = 1,M + C(I,J) = BETA*C(I,J) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF (NOTB) THEN + IF (NOTA) THEN +* +* Form C := alpha*A*B + beta*C. +* + DO 90 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 50 I = 1,M + C(I,J) = ZERO + 50 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 60 I = 1,M + C(I,J) = BETA*C(I,J) + 60 CONTINUE + END IF + DO 80 L = 1,K + IF (B(L,J).NE.ZERO) THEN + TEMP = ALPHA*B(L,J) + DO 70 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 70 CONTINUE + END IF + 80 CONTINUE + 90 CONTINUE + ELSE +* +* Form C := alpha*A'*B + beta*C +* + DO 120 J = 1,N + DO 110 I = 1,M + TEMP = ZERO + DO 100 L = 1,K + TEMP = TEMP + A(L,I)*B(L,J) + 100 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 110 CONTINUE + 120 CONTINUE + END IF + ELSE + IF (NOTA) THEN +* +* Form C := alpha*A*B' + beta*C +* + DO 170 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 130 I = 1,M + C(I,J) = ZERO + 130 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 140 I = 1,M + C(I,J) = BETA*C(I,J) + 140 CONTINUE + END IF + DO 160 L = 1,K + IF (B(J,L).NE.ZERO) THEN + TEMP = ALPHA*B(J,L) + DO 150 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 150 CONTINUE + END IF + 160 CONTINUE + 170 CONTINUE + ELSE +* +* Form C := alpha*A'*B' + beta*C +* + DO 200 J = 1,N + DO 190 I = 1,M + TEMP = ZERO + DO 180 L = 1,K + TEMP = TEMP + A(L,I)*B(J,L) + 180 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 190 CONTINUE + 200 CONTINUE + END IF + END IF +* + RETURN +* +* End of SGEMM . +* + END diff --git a/reference/sgemvf.f b/reference/sgemvf.f new file mode 100644 index 0000000..351da45 --- /dev/null +++ b/reference/sgemvf.f @@ -0,0 +1,257 @@ + SUBROUTINE SGEMVF ( TRANS, M, N, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + REAL ALPHA, BETA + INTEGER INCX, INCY, LDA, M, N + CHARACTER*1 TRANS +* .. Array Arguments .. + REAL A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* SGEMV performs one of the matrix-vector operations +* +* y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, +* +* where alpha and beta are scalars, x and y are vectors and A is an +* m by n matrix. +* +* Parameters +* ========== +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' y := alpha*A*x + beta*y. +* +* TRANS = 'T' or 't' y := alpha*A'*x + beta*y. +* +* TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, n ). +* Before entry, the leading m by n part of the array A must +* contain the matrix of coefficients. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, m ). +* Unchanged on exit. +* +* X - REAL array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - REAL . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - REAL array of DIMENSION at least +* ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. +* Before entry with BETA non-zero, the incremented array Y +* must contain the vector y. On exit, Y is overwritten by the +* updated vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ONE , ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY, LENX, LENY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 1 + ELSE IF( M.LT.0 )THEN + INFO = 2 + ELSE IF( N.LT.0 )THEN + INFO = 3 + ELSE IF( LDA.LT.MAX( 1, M ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + ELSE IF( INCY.EQ.0 )THEN + INFO = 11 + END IF + +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set LENX and LENY, the lengths of the vectors x and y, and set +* up the start points in X and Y. +* + IF( LSAME( TRANS, 'N' ) )THEN + LENX = N + LENY = M + ELSE + LENX = M + LENY = N + END IF + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( LENX - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( LENY - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, LENY + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, LENY + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, LENY + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, LENY + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form y := alpha*A*x + y. +* + JX = KX + IF( INCY.EQ.1 )THEN + DO 60, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + DO 50, I = 1, M + Y( I ) = Y( I ) + TEMP*A( I, J ) + 50 CONTINUE + END IF + JX = JX + INCX + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + IY = KY + DO 70, I = 1, M + Y( IY ) = Y( IY ) + TEMP*A( I, J ) + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + ELSE +* +* Form y := alpha*A'*x + y. +* + JY = KY + IF( INCX.EQ.1 )THEN + DO 100, J = 1, N + TEMP = ZERO + DO 90, I = 1, M + TEMP = TEMP + A( I, J )*X( I ) + 90 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + 100 CONTINUE + ELSE + DO 120, J = 1, N + TEMP = ZERO + IX = KX + DO 110, I = 1, M + TEMP = TEMP + A( I, J )*X( IX ) + IX = IX + INCX + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of SGEMV . +* + END diff --git a/reference/sgerf.f b/reference/sgerf.f new file mode 100644 index 0000000..f84c933 --- /dev/null +++ b/reference/sgerf.f @@ -0,0 +1,157 @@ + SUBROUTINE SGERF ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA ) +* .. Scalar Arguments .. + REAL ALPHA + INTEGER INCX, INCY, LDA, M, N +* .. Array Arguments .. + REAL A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* SGER performs the rank 1 operation +* +* A := alpha*x*y' + A, +* +* where alpha is a scalar, x is an m element vector, y is an n element +* vector and A is an m by n matrix. +* +* Parameters +* ========== +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - REAL array of dimension at least +* ( 1 + ( m - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the m +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, n ). +* Before entry, the leading m by n part of the array A must +* contain the matrix of coefficients. On exit, A is +* overwritten by the updated matrix. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP + INTEGER I, INFO, IX, J, JY, KX +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( M.LT.0 )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + ELSE IF( LDA.LT.MAX( 1, M ) )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'SGER ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( INCY.GT.0 )THEN + JY = 1 + ELSE + JY = 1 - ( N - 1 )*INCY + END IF + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( Y( JY ).NE.ZERO )THEN + TEMP = ALPHA*Y( JY ) + DO 10, I = 1, M + A( I, J ) = A( I, J ) + X( I )*TEMP + 10 CONTINUE + END IF + JY = JY + INCY + 20 CONTINUE + ELSE + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( M - 1 )*INCX + END IF + DO 40, J = 1, N + IF( Y( JY ).NE.ZERO )THEN + TEMP = ALPHA*Y( JY ) + IX = KX + DO 30, I = 1, M + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + END IF + JY = JY + INCY + 40 CONTINUE + END IF +* + RETURN +* +* End of SGER . +* + END diff --git a/reference/sgesvf.f b/reference/sgesvf.f new file mode 100644 index 0000000..8d313ab --- /dev/null +++ b/reference/sgesvf.f @@ -0,0 +1,107 @@ + SUBROUTINE SGESVF( N, NRHS, A, LDA, IPIV, B, LDB, INFO ) +* +* -- LAPACK driver routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDB, N, NRHS +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + REAL A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* SGESV computes the solution to a real system of linear equations +* A * X = B, +* where A is an N-by-N matrix and X and B are N-by-NRHS matrices. +* +* The LU decomposition with partial pivoting and row interchanges is +* used to factor A as +* A = P * L * U, +* where P is a permutation matrix, L is unit lower triangular, and U is +* upper triangular. The factored form of A is then used to solve the +* system of equations A * X = B. +* +* Arguments +* ========= +* +* N (input) INTEGER +* The number of linear equations, i.e., the order of the +* matrix A. N >= 0. +* +* NRHS (input) INTEGER +* The number of right hand sides, i.e., the number of columns +* of the matrix B. NRHS >= 0. +* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the N-by-N coefficient matrix A. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* IPIV (output) INTEGER array, dimension (N) +* The pivot indices that define the permutation matrix P; +* row i of the matrix was interchanged with row IPIV(i). +* +* B (input/output) REAL array, dimension (LDB,NRHS) +* On entry, the N-by-NRHS matrix of right hand side matrix B. +* On exit, if INFO = 0, the N-by-NRHS solution matrix X. +* +* LDB (input) INTEGER +* The leading dimension of the array B. LDB >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, so the solution could not be computed. +* +* ===================================================================== +* +* .. External Subroutines .. + EXTERNAL SGETRF, SGETRS, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( N.LT.0 ) THEN + INFO = -1 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -7 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGESV ', -INFO ) + RETURN + END IF +* +* Compute the LU factorization of A. +* + CALL SGETRF( N, N, A, LDA, IPIV, INFO ) + IF( INFO.EQ.0 ) THEN +* +* Solve the system A*X = B, overwriting B with X. +* + CALL SGETRS( 'No transpose', N, NRHS, A, LDA, IPIV, B, LDB, + $ INFO ) + END IF + RETURN +* +* End of SGESV +* + END diff --git a/reference/sgetf2f.f b/reference/sgetf2f.f new file mode 100644 index 0000000..15861b1 --- /dev/null +++ b/reference/sgetf2f.f @@ -0,0 +1,135 @@ + SUBROUTINE SGETF2F( M, N, A, LDA, IPIV, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* June 30, 1992 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + REAL A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* SGETF2 computes an LU factorization of a general m-by-n matrix A +* using partial pivoting with row interchanges. +* +* The factorization has the form +* A = P * L * U +* where P is a permutation matrix, L is lower triangular with unit +* diagonal elements (lower trapezoidal if m > n), and U is upper +* triangular (upper trapezoidal if m < n). +* +* This is the right-looking Level 2 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the m by n matrix to be factored. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* IPIV (output) INTEGER array, dimension (min(M,N)) +* The pivot indices; for 1 <= i <= min(M,N), row i of the +* matrix was interchanged with row IPIV(i). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* > 0: if INFO = k, U(k,k) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, and division by zero will occur if it is used +* to solve a system of equations. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE, ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. +* .. Local Scalars .. + INTEGER J, JP +* .. +* .. External Functions .. + INTEGER ISAMAX + EXTERNAL ISAMAX +* .. +* .. External Subroutines .. + EXTERNAL SGER, SSCAL, SSWAP, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGETF2', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* + DO 10 J = 1, MIN( M, N ) +* +* Find pivot and test for singularity. +* + JP = J - 1 + ISAMAX( M-J+1, A( J, J ), 1 ) + IPIV( J ) = JP + IF( A( JP, J ).NE.ZERO ) THEN +* +* Apply the interchange to columns 1:N. +* + IF( JP.NE.J ) + $ CALL SSWAP( N, A( J, 1 ), LDA, A( JP, 1 ), LDA ) +* +* Compute elements J+1:M of J-th column. +* + IF( J.LT.M ) + $ CALL SSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 ) +* + ELSE IF( INFO.EQ.0 ) THEN +* + INFO = J + END IF +* + IF( J.LT.MIN( M, N ) ) THEN +* +* Update trailing submatrix. +* + CALL SGER( M-J, N-J, -ONE, A( J+1, J ), 1, A( J, J+1 ), LDA, + $ A( J+1, J+1 ), LDA ) + END IF + 10 CONTINUE + RETURN +* +* End of SGETF2 +* + END diff --git a/reference/sgetrff.f b/reference/sgetrff.f new file mode 100644 index 0000000..139e7de --- /dev/null +++ b/reference/sgetrff.f @@ -0,0 +1,156 @@ + SUBROUTINE SGETRFF( M, N, A, LDA, IPIV, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* March 31, 1993 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + REAL A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* SGETRF computes an LU factorization of a general M-by-N matrix A +* using partial pivoting with row interchanges. +* +* The factorization has the form +* A = P * L * U +* where P is a permutation matrix, L is lower triangular with unit +* diagonal elements (lower trapezoidal if m > n), and U is upper +* triangular (upper trapezoidal if m < n). +* +* This is the right-looking Level 3 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the M-by-N matrix to be factored. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* IPIV (output) INTEGER array, dimension (min(M,N)) +* The pivot indices; for 1 <= i <= min(M,N), row i of the +* matrix was interchanged with row IPIV(i). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, and division by zero will occur if it is used +* to solve a system of equations. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + INTEGER I, IINFO, J, JB, NB +* .. +* .. External Subroutines .. + EXTERNAL SGEMM, SGETF2, SLASWP, STRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGETRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = 64 + IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN +* +* Use unblocked code. +* + CALL SGETF2( M, N, A, LDA, IPIV, INFO ) + ELSE +* +* Use blocked code. +* + DO 20 J = 1, MIN( M, N ), NB + JB = MIN( MIN( M, N )-J+1, NB ) +* +* Factor diagonal and subdiagonal blocks and test for exact +* singularity. +* + CALL SGETF2( M-J+1, JB, A( J, J ), LDA, IPIV( J ), IINFO ) +* +* Adjust INFO and the pivot indices. +* + IF( INFO.EQ.0 .AND. IINFO.GT.0 ) + $ INFO = IINFO + J - 1 + DO 10 I = J, MIN( M, J+JB-1 ) + IPIV( I ) = J - 1 + IPIV( I ) + 10 CONTINUE +* +* Apply interchanges to columns 1:J-1. +* + CALL SLASWP( J-1, A, LDA, J, J+JB-1, IPIV, 1 ) +* + IF( J+JB.LE.N ) THEN +* +* Apply interchanges to columns J+JB:N. +* + CALL SLASWP( N-J-JB+1, A( 1, J+JB ), LDA, J, J+JB-1, + $ IPIV, 1 ) +* +* Compute block row of U. +* + CALL STRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB, + $ N-J-JB+1, ONE, A( J, J ), LDA, A( J, J+JB ), + $ LDA ) + IF( J+JB.LE.M ) THEN +* +* Update trailing submatrix. +* + CALL SGEMM( 'No transpose', 'No transpose', M-J-JB+1, + $ N-J-JB+1, JB, -ONE, A( J+JB, J ), LDA, + $ A( J, J+JB ), LDA, ONE, A( J+JB, J+JB ), + $ LDA ) + END IF + END IF + 20 CONTINUE + END IF + RETURN +* +* End of SGETRF +* + END diff --git a/reference/sgetrsf.f b/reference/sgetrsf.f new file mode 100644 index 0000000..f009218 --- /dev/null +++ b/reference/sgetrsf.f @@ -0,0 +1,150 @@ + SUBROUTINE SGETRSF( TRANS, N, NRHS, A, LDA, IPIV, B, LDB, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* March 31, 1993 +* +* .. Scalar Arguments .. + CHARACTER TRANS + INTEGER INFO, LDA, LDB, N, NRHS +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + REAL A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* SGETRS solves a system of linear equations +* A * X = B or A' * X = B +* with a general N-by-N matrix A using the LU factorization computed +* by SGETRF. +* +* Arguments +* ========= +* +* TRANS (input) CHARACTER*1 +* Specifies the form of the system of equations: +* = 'N': A * X = B (No transpose) +* = 'T': A'* X = B (Transpose) +* = 'C': A'* X = B (Conjugate transpose = Transpose) +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* NRHS (input) INTEGER +* The number of right hand sides, i.e., the number of columns +* of the matrix B. NRHS >= 0. +* +* A (input) REAL array, dimension (LDA,N) +* The factors L and U from the factorization A = P*L*U +* as computed by SGETRF. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* IPIV (input) INTEGER array, dimension (N) +* The pivot indices from SGETRF; for 1<=i<=N, row i of the +* matrix was interchanged with row IPIV(i). +* +* B (input/output) REAL array, dimension (LDB,NRHS) +* On entry, the right hand side matrix B. +* On exit, the solution matrix X. +* +* LDB (input) INTEGER +* The leading dimension of the array B. LDB >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL NOTRAN +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL SLASWP, STRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + NOTRAN = LSAME( TRANS, 'N' ) + IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -8 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGETRS', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 .OR. NRHS.EQ.0 ) + $ RETURN +* + IF( NOTRAN ) THEN +* +* Solve A * X = B. +* +* Apply row interchanges to the right hand sides. +* + CALL SLASWP( NRHS, B, LDB, 1, N, IPIV, 1 ) +* +* Solve L*X = B, overwriting B with X. +* + CALL STRSM( 'Left', 'Lower', 'No transpose', 'Unit', N, NRHS, + $ ONE, A, LDA, B, LDB ) +* +* Solve U*X = B, overwriting B with X. +* + CALL STRSM( 'Left', 'Upper', 'No transpose', 'Non-unit', N, + $ NRHS, ONE, A, LDA, B, LDB ) + ELSE +* +* Solve A' * X = B. +* +* Solve U'*X = B, overwriting B with X. +* + CALL STRSM( 'Left', 'Upper', 'Transpose', 'Non-unit', N, NRHS, + $ ONE, A, LDA, B, LDB ) +* +* Solve L'*X = B, overwriting B with X. +* + CALL STRSM( 'Left', 'Lower', 'Transpose', 'Unit', N, NRHS, ONE, + $ A, LDA, B, LDB ) +* +* Apply row interchanges to the solution vectors. +* + CALL SLASWP( NRHS, B, LDB, 1, N, IPIV, -1 ) + END IF +* + RETURN +* +* End of SGETRS +* + END diff --git a/reference/slaswpf.f b/reference/slaswpf.f new file mode 100644 index 0000000..ab300e2 --- /dev/null +++ b/reference/slaswpf.f @@ -0,0 +1,120 @@ + SUBROUTINE SLASWPF( N, A, LDA, K1, K2, IPIV, INCX ) +* +* -- LAPACK auxiliary routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* June 30, 1999 +* +* .. Scalar Arguments .. + INTEGER INCX, K1, K2, LDA, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + REAL A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* SLASWP performs a series of row interchanges on the matrix A. +* One row interchange is initiated for each of rows K1 through K2 of A. +* +* Arguments +* ========= +* +* N (input) INTEGER +* The number of columns of the matrix A. +* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the matrix of column dimension N to which the row +* interchanges will be applied. +* On exit, the permuted matrix. +* +* LDA (input) INTEGER +* The leading dimension of the array A. +* +* K1 (input) INTEGER +* The first element of IPIV for which a row interchange will +* be done. +* +* K2 (input) INTEGER +* The last element of IPIV for which a row interchange will +* be done. +* +* IPIV (input) INTEGER array, dimension (M*abs(INCX)) +* The vector of pivot indices. Only the elements in positions +* K1 through K2 of IPIV are accessed. +* IPIV(K) = L implies rows K and L are to be interchanged. +* +* INCX (input) INTEGER +* The increment between successive values of IPIV. If IPIV +* is negative, the pivots are applied in reverse order. +* +* Further Details +* =============== +* +* Modified by +* R. C. Whaley, Computer Science Dept., Univ. of Tenn., Knoxville, USA +* +* ===================================================================== +* +* .. Local Scalars .. + INTEGER I, I1, I2, INC, IP, IX, IX0, J, K, N32 + REAL TEMP +* .. +* .. Executable Statements .. +* +* Interchange row I with row IPIV(I) for each of rows K1 through K2. +* + IF( INCX.GT.0 ) THEN + IX0 = K1 + I1 = K1 + I2 = K2 + INC = 1 + ELSE IF( INCX.LT.0 ) THEN + IX0 = 1 + ( 1-K2 )*INCX + I1 = K2 + I2 = K1 + INC = -1 + ELSE + RETURN + END IF +* + N32 = ( N / 32 )*32 + IF( N32.NE.0 ) THEN + DO 30 J = 1, N32, 32 + IX = IX0 + DO 20 I = I1, I2, INC + IP = IPIV( IX ) + IF( IP.NE.I ) THEN + DO 10 K = J, J + 31 + TEMP = A( I, K ) + A( I, K ) = A( IP, K ) + A( IP, K ) = TEMP + 10 CONTINUE + END IF + IX = IX + INCX + 20 CONTINUE + 30 CONTINUE + END IF + IF( N32.NE.N ) THEN + N32 = N32 + 1 + IX = IX0 + DO 50 I = I1, I2, INC + IP = IPIV( IX ) + IF( IP.NE.I ) THEN + DO 40 K = N32, N + TEMP = A( I, K ) + A( I, K ) = A( IP, K ) + A( IP, K ) = TEMP + 40 CONTINUE + END IF + IX = IX + INCX + 50 CONTINUE + END IF +* + RETURN +* +* End of SLASWP +* + END diff --git a/reference/slauu2f.f b/reference/slauu2f.f new file mode 100644 index 0000000..5d48e12 --- /dev/null +++ b/reference/slauu2f.f @@ -0,0 +1,135 @@ + SUBROUTINE SLAUU2F( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK auxiliary routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + REAL A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* SLAUU2 computes the product U * U' or L' * L, where the triangular +* factor U or L is stored in the upper or lower triangular part of +* the array A. +* +* If UPLO = 'U' or 'u' then the upper triangle of the result is stored, +* overwriting the factor U in A. +* If UPLO = 'L' or 'l' then the lower triangle of the result is stored, +* overwriting the factor L in A. +* +* This is the unblocked form of the algorithm, calling Level 2 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the triangular factor stored in the array A +* is upper or lower triangular: +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* N (input) INTEGER +* The order of the triangular factor U or L. N >= 0. +* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the triangular factor U or L. +* On exit, if UPLO = 'U', the upper triangle of A is +* overwritten with the upper triangle of the product U * U'; +* if UPLO = 'L', the lower triangle of A is overwritten with +* the lower triangle of the product L' * L. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I + REAL AII +* .. +* .. External Functions .. + LOGICAL LSAME + REAL SDOT + EXTERNAL LSAME, SDOT +* .. +* .. External Subroutines .. + EXTERNAL SGEMV, SSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SLAUU2', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( UPPER ) THEN +* +* Compute the product U * U'. +* + DO 10 I = 1, N + AII = A( I, I ) + IF( I.LT.N ) THEN + A( I, I ) = SDOT( N-I+1, A( I, I ), LDA, A( I, I ), LDA ) + CALL SGEMV( 'No transpose', I-1, N-I, ONE, A( 1, I+1 ), + $ LDA, A( I, I+1 ), LDA, AII, A( 1, I ), 1 ) + ELSE + CALL SSCAL( I, AII, A( 1, I ), 1 ) + END IF + 10 CONTINUE +* + ELSE +* +* Compute the product L' * L. +* + DO 20 I = 1, N + AII = A( I, I ) + IF( I.LT.N ) THEN + A( I, I ) = SDOT( N-I+1, A( I, I ), 1, A( I, I ), 1 ) + CALL SGEMV( 'Transpose', N-I, I-1, ONE, A( I+1, 1 ), LDA, + $ A( I+1, I ), 1, AII, A( I, 1 ), LDA ) + ELSE + CALL SSCAL( I, AII, A( I, 1 ), LDA ) + END IF + 20 CONTINUE + END IF +* + RETURN +* +* End of SLAUU2 +* + END diff --git a/reference/slauumf.f b/reference/slauumf.f new file mode 100644 index 0000000..a4b25b9 --- /dev/null +++ b/reference/slauumf.f @@ -0,0 +1,156 @@ + SUBROUTINE SLAUUMF( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK auxiliary routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* February 29, 1992 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + REAL A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* SLAUUM computes the product U * U' or L' * L, where the triangular +* factor U or L is stored in the upper or lower triangular part of +* the array A. +* +* If UPLO = 'U' or 'u' then the upper triangle of the result is stored, +* overwriting the factor U in A. +* If UPLO = 'L' or 'l' then the lower triangle of the result is stored, +* overwriting the factor L in A. +* +* This is the blocked form of the algorithm, calling Level 3 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the triangular factor stored in the array A +* is upper or lower triangular: +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* N (input) INTEGER +* The order of the triangular factor U or L. N >= 0. +* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the triangular factor U or L. +* On exit, if UPLO = 'U', the upper triangle of A is +* overwritten with the upper triangle of the product U * U'; +* if UPLO = 'L', the lower triangle of A is overwritten with +* the lower triangle of the product L' * L. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, IB, NB +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + EXTERNAL LSAME, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL SGEMM, SLAUU2, SSYRK, STRMM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SLAUUM', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = 128 +* + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code +* + CALL SLAUU2( UPLO, N, A, LDA, INFO ) + ELSE +* +* Use blocked code +* + IF( UPPER ) THEN +* +* Compute the product U * U'. +* + DO 10 I = 1, N, NB + IB = MIN( NB, N-I+1 ) + CALL STRMM( 'Right', 'Upper', 'Transpose', 'Non-unit', + $ I-1, IB, ONE, A( I, I ), LDA, A( 1, I ), + $ LDA ) + CALL SLAUU2( 'Upper', IB, A( I, I ), LDA, INFO ) + IF( I+IB.LE.N ) THEN + CALL SGEMM( 'No transpose', 'Transpose', I-1, IB, + $ N-I-IB+1, ONE, A( 1, I+IB ), LDA, + $ A( I, I+IB ), LDA, ONE, A( 1, I ), LDA ) + CALL SSYRK( 'Upper', 'No transpose', IB, N-I-IB+1, + $ ONE, A( I, I+IB ), LDA, ONE, A( I, I ), + $ LDA ) + END IF + 10 CONTINUE + ELSE +* +* Compute the product L' * L. +* + DO 20 I = 1, N, NB + IB = MIN( NB, N-I+1 ) + CALL STRMM( 'Left', 'Lower', 'Transpose', 'Non-unit', IB, + $ I-1, ONE, A( I, I ), LDA, A( I, 1 ), LDA ) + CALL SLAUU2( 'Lower', IB, A( I, I ), LDA, INFO ) + IF( I+IB.LE.N ) THEN + CALL SGEMM( 'Transpose', 'No transpose', IB, I-1, + $ N-I-IB+1, ONE, A( I+IB, I ), LDA, + $ A( I+IB, 1 ), LDA, ONE, A( I, 1 ), LDA ) + CALL SSYRK( 'Lower', 'Transpose', IB, N-I-IB+1, ONE, + $ A( I+IB, I ), LDA, ONE, A( I, I ), LDA ) + END IF + 20 CONTINUE + END IF + END IF +* + RETURN +* +* End of SLAUUM +* + END diff --git a/reference/smaxf.f b/reference/smaxf.f new file mode 100644 index 0000000..69d4738 --- /dev/null +++ b/reference/smaxf.f @@ -0,0 +1,36 @@ + REAL*4 function smaxf(n,dx,incx) +c +c finds the index of element having max. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + REAL*4 dx(*) + integer i,incx,ix,n +c + smaxf = 0 + if( n.lt.1 .or. incx.le.0 ) return + + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + smaxf = dx(1) + ix = ix + incx + do 10 i = 2,n + if(dx(ix).le.smaxf) go to 5 + smaxf = dx(ix) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 smaxf = dx(1) + do 30 i = 2,n + if(dx(i).le.smaxf) go to 30 + smaxf = dx(i) + 30 continue + return + end diff --git a/reference/sminf.f b/reference/sminf.f new file mode 100644 index 0000000..de59c2e --- /dev/null +++ b/reference/sminf.f @@ -0,0 +1,36 @@ + REAL*4 function sminf(n,dx,incx) +c +c finds the index of element having min. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + REAL*4 dx(*) + integer i,incx,ix,n +c + sminf = 0 + if( n.lt.1 .or. incx.le.0 ) return + + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + sminf = dx(1) + ix = ix + incx + do 10 i = 2,n + if(dx(ix).ge.sminf) go to 5 + sminf = dx(ix) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 sminf = dx(1) + do 30 i = 2,n + if(dx(i).ge.sminf) go to 30 + sminf = dx(i) + 30 continue + return + end diff --git a/reference/snrm2f.f b/reference/snrm2f.f new file mode 100644 index 0000000..cff495d --- /dev/null +++ b/reference/snrm2f.f @@ -0,0 +1,60 @@ + REAL FUNCTION SNRM2F ( N, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, N +* .. Array Arguments .. + REAL X( * ) +* .. +* +* SNRM2 returns the euclidean norm of a vector via the function +* name, so that +* +* SNRM2 := sqrt( x'*x ) +* +* +* +* -- This version written on 25-October-1982. +* Modified on 14-October-1993 to inline the call to SLASSQ. +* Sven Hammarling, Nag Ltd. +* +* +* .. Parameters .. + REAL ONE , ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. Local Scalars .. + INTEGER IX + REAL ABSXI, NORM, SCALE, SSQ +* .. Intrinsic Functions .. + INTRINSIC ABS, SQRT +* .. +* .. Executable Statements .. + IF( N.LT.1 .OR. INCX.LT.1 )THEN + NORM = ZERO + ELSE IF( N.EQ.1 )THEN + NORM = ABS( X( 1 ) ) + ELSE + SCALE = ZERO + SSQ = ONE +* The following loop is equivalent to this call to the LAPACK +* auxiliary routine: +* CALL SLASSQ( N, X, INCX, SCALE, SSQ ) +* + DO 10, IX = 1, 1 + ( N - 1 )*INCX, INCX + IF( X( IX ).NE.ZERO )THEN + ABSXI = ABS( X( IX ) ) + IF( SCALE.LT.ABSXI )THEN + SSQ = ONE + SSQ*( SCALE/ABSXI )**2 + SCALE = ABSXI + ELSE + SSQ = SSQ + ( ABSXI/SCALE )**2 + END IF + END IF + 10 CONTINUE + NORM = SCALE * SQRT( SSQ ) + END IF +* + SNRM2F = NORM + RETURN +* +* End of SNRM2. +* + END diff --git a/reference/spotf2f.f b/reference/spotf2f.f new file mode 100644 index 0000000..5662b80 --- /dev/null +++ b/reference/spotf2f.f @@ -0,0 +1,168 @@ + SUBROUTINE SPOTF2F( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* February 29, 1992 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + REAL A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* SPOTF2 computes the Cholesky factorization of a real symmetric +* positive definite matrix A. +* +* The factorization has the form +* A = U' * U , if UPLO = 'U', or +* A = L * L', if UPLO = 'L', +* where U is an upper triangular matrix and L is lower triangular. +* +* This is the unblocked version of the algorithm, calling Level 2 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the upper or lower triangular part of the +* symmetric matrix A is stored. +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the symmetric matrix A. If UPLO = 'U', the leading +* n by n upper triangular part of A contains the upper +* triangular part of the matrix A, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading n by n lower triangular part of A contains the lower +* triangular part of the matrix A, and the strictly upper +* triangular part of A is not referenced. +* +* On exit, if INFO = 0, the factor U or L from the Cholesky +* factorization A = U'*U or A = L*L'. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* > 0: if INFO = k, the leading minor of order k is not +* positive definite, and the factorization could not be +* completed. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE, ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER J + REAL AJJ +* .. +* .. External Functions .. + LOGICAL LSAME + REAL SDOT + EXTERNAL LSAME, SDOT +* .. +* .. External Subroutines .. + EXTERNAL SGEMV, SSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SPOTF2', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( UPPER ) THEN +* +* Compute the Cholesky factorization A = U'*U. +* + DO 10 J = 1, N +* +* Compute U(J,J) and test for non-positive-definiteness. +* + AJJ = A( J, J ) - SDOT( J-1, A( 1, J ), 1, A( 1, J ), 1 ) + IF( AJJ.LE.ZERO ) THEN + A( J, J ) = AJJ + GO TO 30 + END IF + AJJ = SQRT( AJJ ) + A( J, J ) = AJJ +* +* Compute elements J+1:N of row J. +* + IF( J.LT.N ) THEN + CALL SGEMV( 'Transpose', J-1, N-J, -ONE, A( 1, J+1 ), + $ LDA, A( 1, J ), 1, ONE, A( J, J+1 ), LDA ) + CALL SSCAL( N-J, ONE / AJJ, A( J, J+1 ), LDA ) + END IF + 10 CONTINUE + ELSE +* +* Compute the Cholesky factorization A = L*L'. +* + DO 20 J = 1, N +* +* Compute L(J,J) and test for non-positive-definiteness. +* + AJJ = A( J, J ) - SDOT( J-1, A( J, 1 ), LDA, A( J, 1 ), + $ LDA ) + IF( AJJ.LE.ZERO ) THEN + A( J, J ) = AJJ + GO TO 30 + END IF + AJJ = SQRT( AJJ ) + A( J, J ) = AJJ +* +* Compute elements J+1:N of column J. +* + IF( J.LT.N ) THEN + CALL SGEMV( 'No transpose', N-J, J-1, -ONE, A( J+1, 1 ), + $ LDA, A( J, 1 ), LDA, ONE, A( J+1, J ), 1 ) + CALL SSCAL( N-J, ONE / AJJ, A( J+1, J ), 1 ) + END IF + 20 CONTINUE + END IF + GO TO 40 +* + 30 CONTINUE + INFO = J +* + 40 CONTINUE + RETURN +* +* End of SPOTF2 +* + END diff --git a/reference/spotrff.f b/reference/spotrff.f new file mode 100644 index 0000000..0a49251 --- /dev/null +++ b/reference/spotrff.f @@ -0,0 +1,184 @@ + SUBROUTINE SPOTRFF( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* March 31, 1993 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + REAL A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* SPOTRF computes the Cholesky factorization of a real symmetric +* positive definite matrix A. +* +* The factorization has the form +* A = U**T * U, if UPLO = 'U', or +* A = L * L**T, if UPLO = 'L', +* where U is an upper triangular matrix and L is lower triangular. +* +* This is the block version of the algorithm, calling Level 3 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': Upper triangle of A is stored; +* = 'L': Lower triangle of A is stored. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the symmetric matrix A. If UPLO = 'U', the leading +* N-by-N upper triangular part of A contains the upper +* triangular part of the matrix A, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading N-by-N lower triangular part of A contains the lower +* triangular part of the matrix A, and the strictly upper +* triangular part of A is not referenced. +* +* On exit, if INFO = 0, the factor U or L from the Cholesky +* factorization A = U**T*U or A = L*L**T. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, the leading minor of order i is not +* positive definite, and the factorization could not be +* completed. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER J, JB, NB +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL SGEMM, SPOTF2, SSYRK, STRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SPOTRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = 56 + + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code. +* + CALL SPOTF2( UPLO, N, A, LDA, INFO ) + ELSE +* +* Use blocked code. +* + IF( UPPER ) THEN +* +* Compute the Cholesky factorization A = U'*U. +* + DO 10 J = 1, N, NB +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + JB = MIN( NB, N-J+1 ) + CALL SSYRK( 'Upper', 'Transpose', JB, J-1, -ONE, + $ A( 1, J ), LDA, ONE, A( J, J ), LDA ) + CALL SPOTF2( 'Upper', JB, A( J, J ), LDA, INFO ) + IF( INFO.NE.0 ) + $ GO TO 30 + IF( J+JB.LE.N ) THEN +* +* Compute the current block row. +* + CALL SGEMM( 'Transpose', 'No transpose', JB, N-J-JB+1, + $ J-1, -ONE, A( 1, J ), LDA, A( 1, J+JB ), + $ LDA, ONE, A( J, J+JB ), LDA ) + CALL STRSM( 'Left', 'Upper', 'Transpose', 'Non-unit', + $ JB, N-J-JB+1, ONE, A( J, J ), LDA, + $ A( J, J+JB ), LDA ) + END IF + 10 CONTINUE +* + ELSE +* +* Compute the Cholesky factorization A = L*L'. +* + DO 20 J = 1, N, NB +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + JB = MIN( NB, N-J+1 ) + CALL SSYRK( 'Lower', 'No transpose', JB, J-1, -ONE, + $ A( J, 1 ), LDA, ONE, A( J, J ), LDA ) + CALL SPOTF2( 'Lower', JB, A( J, J ), LDA, INFO ) + IF( INFO.NE.0 ) + $ GO TO 30 + IF( J+JB.LE.N ) THEN +* +* Compute the current block column. +* + CALL SGEMM( 'No transpose', 'Transpose', N-J-JB+1, JB, + $ J-1, -ONE, A( J+JB, 1 ), LDA, A( J, 1 ), + $ LDA, ONE, A( J+JB, J ), LDA ) + CALL STRSM( 'Right', 'Lower', 'Transpose', 'Non-unit', + $ N-J-JB+1, JB, ONE, A( J, J ), LDA, + $ A( J+JB, J ), LDA ) + END IF + 20 CONTINUE + END IF + END IF + GO TO 40 +* + 30 CONTINUE + INFO = INFO + J - 1 +* + 40 CONTINUE + RETURN +* +* End of SPOTRF +* + END diff --git a/reference/spotrif.f b/reference/spotrif.f new file mode 100644 index 0000000..ad24e23 --- /dev/null +++ b/reference/spotrif.f @@ -0,0 +1,96 @@ + SUBROUTINE SPOTRIF( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + REAL A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* SPOTRI computes the inverse of a real symmetric positive definite +* matrix A using the Cholesky factorization A = U**T*U or A = L*L**T +* computed by SPOTRF. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': Upper triangle of A is stored; +* = 'L': Lower triangle of A is stored. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the triangular factor U or L from the Cholesky +* factorization A = U**T*U or A = L*L**T, as computed by +* SPOTRF. +* On exit, the upper or lower triangle of the (symmetric) +* inverse of A, overwriting the input factor U or L. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, the (i,i) element of the factor U or L is +* zero, and the inverse could not be computed. +* +* ===================================================================== +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL SLAUUM, STRTRI, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SPOTRI', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Invert the triangular Cholesky factor U or L. +* + CALL STRTRI( UPLO, 'Non-unit', N, A, LDA, INFO ) + IF( INFO.GT.0 ) + $ RETURN +* +* Form inv(U)*inv(U)' or inv(L)'*inv(L). +* + CALL SLAUUM( UPLO, N, A, LDA, INFO ) +* + RETURN +* +* End of SPOTRI +* + END diff --git a/reference/srotf.f b/reference/srotf.f new file mode 100644 index 0000000..0223080 --- /dev/null +++ b/reference/srotf.f @@ -0,0 +1,37 @@ + subroutine srotf (n,sx,incx,sy,incy,c,s) +c +c applies a plane rotation. +c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + real sx(*),sy(*),stemp,c,s + integer i,incx,incy,ix,iy,n +c + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments not equal +c to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + stemp = c*sx(ix) + s*sy(iy) + sy(iy) = c*sy(iy) - s*sx(ix) + sx(ix) = stemp + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c + 20 do 30 i = 1,n + stemp = c*sx(i) + s*sy(i) + sy(i) = c*sy(i) - s*sx(i) + sx(i) = stemp + 30 continue + return + end diff --git a/reference/srotgf.f b/reference/srotgf.f new file mode 100644 index 0000000..4f22298 --- /dev/null +++ b/reference/srotgf.f @@ -0,0 +1,27 @@ + subroutine srotgf(sa,sb,c,s) +c +c construct givens plane rotation. +c jack dongarra, linpack, 3/11/78. +c + real sa,sb,c,s,roe,scale,r,z +c + roe = sb + if( abs(sa) .gt. abs(sb) ) roe = sa + scale = abs(sa) + abs(sb) + if( scale .ne. 0.0 ) go to 10 + c = 1.0 + s = 0.0 + r = 0.0 + z = 0.0 + go to 20 + 10 r = scale*sqrt((sa/scale)**2 + (sb/scale)**2) + r = sign(1.0,roe)*r + c = sa/r + s = sb/r + z = 1.0 + if( abs(sa) .gt. abs(sb) ) z = s + if( abs(sb) .ge. abs(sa) .and. c .ne. 0.0 ) z = 1.0/c + 20 sa = r + sb = z + return + end diff --git a/reference/srotmf.f b/reference/srotmf.f new file mode 100644 index 0000000..3924edb --- /dev/null +++ b/reference/srotmf.f @@ -0,0 +1,106 @@ + SUBROUTINE SROTMF (N,SX,INCX,SY,INCY,SPARAM) +C +C APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX +C +C (SX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF SX ARE IN +C (DX**T) +C +C SX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE +C LX = (-INCX)*N, AND SIMILARLY FOR SY USING USING LY AND INCY. +C WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. +C +C SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 +C +C (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) +C H=( ) ( ) ( ) ( ) +C (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). +C SEE SROTMG FOR A DESCRIPTION OF DATA STORAGE IN SPARAM. +C + DIMENSION SX(1),SY(1),SPARAM(5) + DATA ZERO,TWO/0.E0,2.E0/ +C + SFLAG=SPARAM(1) + IF(N .LE. 0 .OR.(SFLAG+TWO.EQ.ZERO)) GO TO 140 + IF(.NOT.(INCX.EQ.INCY.AND. INCX .GT.0)) GO TO 70 +C + NSTEPS=N*INCX + IF(SFLAG) 50,10,30 + 10 CONTINUE + SH12=SPARAM(4) + SH21=SPARAM(3) + DO 20 I=1,NSTEPS,INCX + W=SX(I) + Z=SY(I) + SX(I)=W+Z*SH12 + SY(I)=W*SH21+Z + 20 CONTINUE + GO TO 140 + 30 CONTINUE + SH11=SPARAM(2) + SH22=SPARAM(5) + DO 40 I=1,NSTEPS,INCX + W=SX(I) + Z=SY(I) + SX(I)=W*SH11+Z + SY(I)=-W+SH22*Z + 40 CONTINUE + GO TO 140 + 50 CONTINUE + SH11=SPARAM(2) + SH12=SPARAM(4) + SH21=SPARAM(3) + SH22=SPARAM(5) + DO 60 I=1,NSTEPS,INCX + W=SX(I) + Z=SY(I) + SX(I)=W*SH11+Z*SH12 + SY(I)=W*SH21+Z*SH22 + 60 CONTINUE + GO TO 140 + 70 CONTINUE + KX=1 + KY=1 + IF(INCX .LT. 0) KX=1+(1-N)*INCX + IF(INCY .LT. 0) KY=1+(1-N)*INCY +C + IF(SFLAG)120,80,100 + 80 CONTINUE + SH12=SPARAM(4) + SH21=SPARAM(3) + DO 90 I=1,N + W=SX(KX) + Z=SY(KY) + SX(KX)=W+Z*SH12 + SY(KY)=W*SH21+Z + KX=KX+INCX + KY=KY+INCY + 90 CONTINUE + GO TO 140 + 100 CONTINUE + SH11=SPARAM(2) + SH22=SPARAM(5) + DO 110 I=1,N + W=SX(KX) + Z=SY(KY) + SX(KX)=W*SH11+Z + SY(KY)=-W+SH22*Z + KX=KX+INCX + KY=KY+INCY + 110 CONTINUE + GO TO 140 + 120 CONTINUE + SH11=SPARAM(2) + SH12=SPARAM(4) + SH21=SPARAM(3) + SH22=SPARAM(5) + DO 130 I=1,N + W=SX(KX) + Z=SY(KY) + SX(KX)=W*SH11+Z*SH12 + SY(KY)=W*SH21+Z*SH22 + KX=KX+INCX + KY=KY+INCY + 130 CONTINUE + 140 CONTINUE + RETURN + END diff --git a/reference/srotmgf.f b/reference/srotmgf.f new file mode 100644 index 0000000..e9998ff --- /dev/null +++ b/reference/srotmgf.f @@ -0,0 +1,166 @@ + SUBROUTINE SROTMGF (SD1,SD2,SX1,SY1,SPARAM) +C +C CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS +C THE SECOND COMPONENT OF THE 2-VECTOR (SQRT(SD1)*SX1,SQRT(SD2)* +C SY2)**T. +C WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. +C +C SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 +C +C (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) +C H=( ) ( ) ( ) ( ) +C (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). +C LOCATIONS 2-4 OF SPARAM CONTAIN SH11,SH21,SH12, AND SH22 +C RESPECTIVELY. (VALUES OF 1.E0, -1.E0, OR 0.E0 IMPLIED BY THE +C VALUE OF SPARAM(1) ARE NOT STORED IN SPARAM.) +C +C THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE +C INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE +C OF SD1 AND SD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. +C + DIMENSION SPARAM(5) +C + DATA ZERO,ONE,TWO /0.E0,1.E0,2.E0/ + DATA GAM,GAMSQ,RGAMSQ/4096.E0,1.67772E7,5.96046E-8/ + IF(.NOT. SD1 .LT. ZERO) GO TO 10 +C GO ZERO-H-D-AND-SX1.. + GO TO 60 + 10 CONTINUE +C CASE-SD1-NONNEGATIVE + SP2=SD2*SY1 + IF(.NOT. SP2 .EQ. ZERO) GO TO 20 + SFLAG=-TWO + GO TO 260 +C REGULAR-CASE.. + 20 CONTINUE + SP1=SD1*SX1 + SQ2=SP2*SY1 + SQ1=SP1*SX1 +C + IF(.NOT. ABS(SQ1) .GT. ABS(SQ2)) GO TO 40 + SH21=-SY1/SX1 + SH12=SP2/SP1 +C + SU=ONE-SH12*SH21 +C + IF(.NOT. SU .LE. ZERO) GO TO 30 +C GO ZERO-H-D-AND-SX1.. + GO TO 60 + 30 CONTINUE + SFLAG=ZERO + SD1=SD1/SU + SD2=SD2/SU + SX1=SX1*SU +C GO SCALE-CHECK.. + GO TO 100 + 40 CONTINUE + IF(.NOT. SQ2 .LT. ZERO) GO TO 50 +C GO ZERO-H-D-AND-SX1.. + GO TO 60 + 50 CONTINUE + SFLAG=ONE + SH11=SP1/SP2 + SH22=SX1/SY1 + SU=ONE+SH11*SH22 + STEMP=SD2/SU + SD2=SD1/SU + SD1=STEMP + SX1=SY1*SU +C GO SCALE-CHECK + GO TO 100 +C PROCEDURE..ZERO-H-D-AND-SX1.. + 60 CONTINUE + SFLAG=-ONE + SH11=ZERO + SH12=ZERO + SH21=ZERO + SH22=ZERO +C + SD1=ZERO + SD2=ZERO + SX1=ZERO +C RETURN.. + GO TO 220 +C PROCEDURE..FIX-H.. + 70 CONTINUE + IF(.NOT. SFLAG .GE. ZERO) GO TO 90 +C + IF(.NOT. SFLAG .EQ. ZERO) GO TO 80 + SH11=ONE + SH22=ONE + SFLAG=-ONE + GO TO 90 + 80 CONTINUE + SH21=-ONE + SH12=ONE + SFLAG=-ONE + 90 CONTINUE + GO TO IGO,(120,150,180,210) +C PROCEDURE..SCALE-CHECK + 100 CONTINUE + 110 CONTINUE + IF(.NOT. SD1 .LE. RGAMSQ) GO TO 130 + IF(SD1 .EQ. ZERO) GO TO 160 + ASSIGN 120 TO IGO +C FIX-H.. + GO TO 70 + 120 CONTINUE + SD1=SD1*GAM**2 + SX1=SX1/GAM + SH11=SH11/GAM + SH12=SH12/GAM + GO TO 110 + 130 CONTINUE + 140 CONTINUE + IF(.NOT. SD1 .GE. GAMSQ) GO TO 160 + ASSIGN 150 TO IGO +C FIX-H.. + GO TO 70 + 150 CONTINUE + SD1=SD1/GAM**2 + SX1=SX1*GAM + SH11=SH11*GAM + SH12=SH12*GAM + GO TO 140 + 160 CONTINUE + 170 CONTINUE + IF(.NOT. ABS(SD2) .LE. RGAMSQ) GO TO 190 + IF(SD2 .EQ. ZERO) GO TO 220 + ASSIGN 180 TO IGO +C FIX-H.. + GO TO 70 + 180 CONTINUE + SD2=SD2*GAM**2 + SH21=SH21/GAM + SH22=SH22/GAM + GO TO 170 + 190 CONTINUE + 200 CONTINUE + IF(.NOT. ABS(SD2) .GE. GAMSQ) GO TO 220 + ASSIGN 210 TO IGO +C FIX-H.. + GO TO 70 + 210 CONTINUE + SD2=SD2/GAM**2 + SH21=SH21*GAM + SH22=SH22*GAM + GO TO 200 + 220 CONTINUE + IF(SFLAG)250,230,240 + 230 CONTINUE + SPARAM(3)=SH21 + SPARAM(4)=SH12 + GO TO 260 + 240 CONTINUE + SPARAM(2)=SH11 + SPARAM(5)=SH22 + GO TO 260 + 250 CONTINUE + SPARAM(2)=SH11 + SPARAM(3)=SH21 + SPARAM(4)=SH12 + SPARAM(5)=SH22 + 260 CONTINUE + SPARAM(1)=SFLAG + RETURN + END diff --git a/reference/ssbmvf.f b/reference/ssbmvf.f new file mode 100644 index 0000000..d1d7a67 --- /dev/null +++ b/reference/ssbmvf.f @@ -0,0 +1,303 @@ + SUBROUTINE SSBMVF( UPLO, N, K, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + REAL ALPHA, BETA + INTEGER INCX, INCY, K, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + REAL A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* SSBMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n symmetric band matrix, with k super-diagonals. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the band matrix A is being supplied as +* follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* being supplied. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* being supplied. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry, K specifies the number of super-diagonals of the +* matrix A. K must satisfy 0 .le. K. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) +* by n part of the array A must contain the upper triangular +* band part of the symmetric matrix, supplied column by +* column, with the leading diagonal of the matrix in row +* ( k + 1 ) of the array, the first super-diagonal starting at +* position 2 in row k, and so on. The top left k by k triangle +* of the array A is not referenced. +* The following program segment will transfer the upper +* triangular part of a symmetric band matrix from conventional +* full matrix storage to band storage: +* +* DO 20, J = 1, N +* M = K + 1 - J +* DO 10, I = MAX( 1, J - K ), J +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) +* by n part of the array A must contain the lower triangular +* band part of the symmetric matrix, supplied column by +* column, with the leading diagonal of the matrix in row 1 of +* the array, the first sub-diagonal starting at position 1 in +* row 2, and so on. The bottom right k by k triangle of the +* array A is not referenced. +* The following program segment will transfer the lower +* triangular part of a symmetric band matrix from conventional +* full matrix storage to band storage: +* +* DO 20, J = 1, N +* M = 1 - J +* DO 10, I = J, MIN( N, J + K ) +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( k + 1 ). +* Unchanged on exit. +* +* X - REAL array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - REAL . +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* Y - REAL array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the +* vector y. On exit, Y is overwritten by the updated vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ONE , ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, KPLUS1, KX, KY, L +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( K.LT.0 )THEN + INFO = 3 + ELSE IF( LDA.LT.( K + 1 ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + ELSE IF( INCY.EQ.0 )THEN + INFO = 11 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'SSBMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of the array A +* are accessed sequentially with one pass through A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form y when upper triangle of A is stored. +* + KPLUS1 = K + 1 + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + L = KPLUS1 - J + DO 50, I = MAX( 1, J - K ), J - 1 + Y( I ) = Y( I ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + A( L + I, J )*X( I ) + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*A( KPLUS1, J ) + ALPHA*TEMP2 + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + L = KPLUS1 - J + DO 70, I = MAX( 1, J - K ), J - 1 + Y( IY ) = Y( IY ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + A( L + I, J )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*A( KPLUS1, J ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + IF( J.GT.K )THEN + KX = KX + INCX + KY = KY + INCY + END IF + 80 CONTINUE + END IF + ELSE +* +* Form y when lower triangle of A is stored. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 100, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*A( 1, J ) + L = 1 - J + DO 90, I = J + 1, MIN( N, J + K ) + Y( I ) = Y( I ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + A( L + I, J )*X( I ) + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*A( 1, J ) + L = 1 - J + IX = JX + IY = JY + DO 110, I = J + 1, MIN( N, J + K ) + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + A( L + I, J )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of SSBMV . +* + END diff --git a/reference/sscalf.f b/reference/sscalf.f new file mode 100644 index 0000000..73571bc --- /dev/null +++ b/reference/sscalf.f @@ -0,0 +1,43 @@ + subroutine sscalf(n,sa,sx,incx) +c +c scales a vector by a constant. +c uses unrolled loops for increment equal to 1. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + real sa,sx(*) + integer i,incx,m,mp1,n,nincx +c + if( n.le.0 .or. incx.le.0 )return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + nincx = n*incx + do 10 i = 1,nincx,incx + sx(i) = sa*sx(i) + 10 continue + return +c +c code for increment equal to 1 +c +c +c clean-up loop +c + 20 m = mod(n,5) + if( m .eq. 0 ) go to 40 + do 30 i = 1,m + sx(i) = sa*sx(i) + 30 continue + if( n .lt. 5 ) return + 40 mp1 = m + 1 + do 50 i = mp1,n,5 + sx(i) = sa*sx(i) + sx(i + 1) = sa*sx(i + 1) + sx(i + 2) = sa*sx(i + 2) + sx(i + 3) = sa*sx(i + 3) + sx(i + 4) = sa*sx(i + 4) + 50 continue + return + end diff --git a/reference/sspmvf.f b/reference/sspmvf.f new file mode 100644 index 0000000..70740ae --- /dev/null +++ b/reference/sspmvf.f @@ -0,0 +1,262 @@ + SUBROUTINE SSPMVF( UPLO, N, ALPHA, AP, X, INCX, BETA, Y, INCY ) +* .. Scalar Arguments .. + REAL ALPHA, BETA + INTEGER INCX, INCY, N + CHARACTER*1 UPLO +* .. Array Arguments .. + REAL AP( * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* SSPMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n symmetric matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* AP - REAL array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. +* Unchanged on exit. +* +* X - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - REAL . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. On exit, Y is overwritten by the updated +* vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ONE , ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 6 + ELSE IF( INCY.EQ.0 )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'SSPMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + KK = 1 + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form y when AP contains the upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + K = KK + DO 50, I = 1, J - 1 + Y( I ) = Y( I ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( I ) + K = K + 1 + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*AP( KK + J - 1 ) + ALPHA*TEMP2 + KK = KK + J + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + DO 70, K = KK, KK + J - 2 + Y( IY ) = Y( IY ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*AP( KK + J - 1 ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + KK = KK + J + 80 CONTINUE + END IF + ELSE +* +* Form y when AP contains the lower triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 100, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*AP( KK ) + K = KK + 1 + DO 90, I = J + 1, N + Y( I ) = Y( I ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( I ) + K = K + 1 + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + KK = KK + ( N - J + 1 ) + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*AP( KK ) + IX = JX + IY = JY + DO 110, K = KK + 1, KK + N - J + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + KK = KK + ( N - J + 1 ) + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of SSPMV . +* + END diff --git a/reference/sspr2f.f b/reference/sspr2f.f new file mode 100644 index 0000000..fd9b0e4 --- /dev/null +++ b/reference/sspr2f.f @@ -0,0 +1,229 @@ + SUBROUTINE SSPR2F( UPLO, N, ALPHA, X, INCX, Y, INCY, AP ) +* .. Scalar Arguments .. + REAL ALPHA + INTEGER INCX, INCY, N + CHARACTER*1 UPLO +* .. Array Arguments .. + REAL AP( * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* SSPR2 performs the symmetric rank 2 operation +* +* A := alpha*x*y' + alpha*y*x' + A, +* +* where alpha is a scalar, x and y are n element vectors and A is an +* n by n symmetric matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* AP - REAL array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. On exit, the array +* AP is overwritten by the upper triangular part of the +* updated matrix. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. On exit, the array +* AP is overwritten by the lower triangular part of the +* updated matrix. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'SSPR2 ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set up the start points in X and Y if the increments are not both +* unity. +* + IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF + JX = KX + JY = KY + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* + KK = 1 + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when upper triangle is stored in AP. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 20, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + K = KK + DO 10, I = 1, J + AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 + K = K + 1 + 10 CONTINUE + END IF + KK = KK + J + 20 CONTINUE + ELSE + DO 40, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = KX + IY = KY + DO 30, K = KK, KK + J - 1 + AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 30 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + KK = KK + J + 40 CONTINUE + END IF + ELSE +* +* Form A when lower triangle is stored in AP. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + K = KK + DO 50, I = J, N + AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 + K = K + 1 + 50 CONTINUE + END IF + KK = KK + N - J + 1 + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = JX + IY = JY + DO 70, K = KK, KK + N - J + AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + KK = KK + N - J + 1 + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of SSPR2 . +* + END diff --git a/reference/ssprf.f b/reference/ssprf.f new file mode 100644 index 0000000..cdf352b --- /dev/null +++ b/reference/ssprf.f @@ -0,0 +1,198 @@ + SUBROUTINE SSPRF ( UPLO, N, ALPHA, X, INCX, AP ) +* .. Scalar Arguments .. + REAL ALPHA + INTEGER INCX, N + CHARACTER*1 UPLO +* .. Array Arguments .. + REAL AP( * ), X( * ) +* .. +* +* Purpose +* ======= +* +* SSPR performs the symmetric rank 1 operation +* +* A := alpha*x*x' + A, +* +* where alpha is a real scalar, x is an n element vector and A is an +* n by n symmetric matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* AP - REAL array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. On exit, the array +* AP is overwritten by the upper triangular part of the +* updated matrix. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. On exit, the array +* AP is overwritten by the lower triangular part of the +* updated matrix. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP + INTEGER I, INFO, IX, J, JX, K, KK, KX +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'SSPR ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set the start point in X if the increment is not unity. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* + KK = 1 + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when upper triangle is stored in AP. +* + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*X( J ) + K = KK + DO 10, I = 1, J + AP( K ) = AP( K ) + X( I )*TEMP + K = K + 1 + 10 CONTINUE + END IF + KK = KK + J + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + IX = KX + DO 30, K = KK, KK + J - 1 + AP( K ) = AP( K ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + END IF + JX = JX + INCX + KK = KK + J + 40 CONTINUE + END IF + ELSE +* +* Form A when lower triangle is stored in AP. +* + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*X( J ) + K = KK + DO 50, I = J, N + AP( K ) = AP( K ) + X( I )*TEMP + K = K + 1 + 50 CONTINUE + END IF + KK = KK + N - J + 1 + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + IX = JX + DO 70, K = KK, KK + N - J + AP( K ) = AP( K ) + X( IX )*TEMP + IX = IX + INCX + 70 CONTINUE + END IF + JX = JX + INCX + KK = KK + N - J + 1 + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of SSPR . +* + END diff --git a/reference/sswapf.f b/reference/sswapf.f new file mode 100644 index 0000000..d736896 --- /dev/null +++ b/reference/sswapf.f @@ -0,0 +1,56 @@ + subroutine sswapf (n,sx,incx,sy,incy) +c +c interchanges two vectors. +c uses unrolled loops for increments equal to 1. +c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + real sx(*),sy(*),stemp + integer i,incx,incy,ix,iy,m,mp1,n +c + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments not equal +c to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + stemp = sx(ix) + sx(ix) = sy(iy) + sy(iy) = stemp + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c +c +c clean-up loop +c + 20 m = mod(n,3) + if( m .eq. 0 ) go to 40 + do 30 i = 1,m + stemp = sx(i) + sx(i) = sy(i) + sy(i) = stemp + 30 continue + if( n .lt. 3 ) return + 40 mp1 = m + 1 + do 50 i = mp1,n,3 + stemp = sx(i) + sx(i) = sy(i) + sy(i) = stemp + stemp = sx(i + 1) + sx(i + 1) = sy(i + 1) + sy(i + 1) = stemp + stemp = sx(i + 2) + sx(i + 2) = sy(i + 2) + sy(i + 2) = stemp + 50 continue + return + end diff --git a/reference/ssymmf.f b/reference/ssymmf.f new file mode 100644 index 0000000..5b08824 --- /dev/null +++ b/reference/ssymmf.f @@ -0,0 +1,294 @@ + SUBROUTINE SSYMMF ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO + INTEGER M, N, LDA, LDB, LDC + REAL ALPHA, BETA +* .. Array Arguments .. + REAL A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* SSYMM performs one of the matrix-matrix operations +* +* C := alpha*A*B + beta*C, +* +* or +* +* C := alpha*B*A + beta*C, +* +* where alpha and beta are scalars, A is a symmetric matrix and B and +* C are m by n matrices. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether the symmetric matrix A +* appears on the left or right in the operation as follows: +* +* SIDE = 'L' or 'l' C := alpha*A*B + beta*C, +* +* SIDE = 'R' or 'r' C := alpha*B*A + beta*C, +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the symmetric matrix A is to be +* referenced as follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of the +* symmetric matrix is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of the +* symmetric matrix is to be referenced. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix C. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix C. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, ka ), where ka is +* m when SIDE = 'L' or 'l' and is n otherwise. +* Before entry with SIDE = 'L' or 'l', the m by m part of +* the array A must contain the symmetric matrix, such that +* when UPLO = 'U' or 'u', the leading m by m upper triangular +* part of the array A must contain the upper triangular part +* of the symmetric matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading m by m lower triangular part of the array A +* must contain the lower triangular part of the symmetric +* matrix and the strictly upper triangular part of A is not +* referenced. +* Before entry with SIDE = 'R' or 'r', the n by n part of +* the array A must contain the symmetric matrix, such that +* when UPLO = 'U' or 'u', the leading n by n upper triangular +* part of the array A must contain the upper triangular part +* of the symmetric matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading n by n lower triangular part of the array A +* must contain the lower triangular part of the symmetric +* matrix and the strictly upper triangular part of A is not +* referenced. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, n ). +* Unchanged on exit. +* +* B - REAL array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* BETA - REAL . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - REAL array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n updated +* matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, K, NROWA + REAL TEMP1, TEMP2 +* .. Parameters .. + REAL ONE , ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. +* .. Executable Statements .. +* +* Set NROWA as the number of rows of A. +* + IF( LSAME( SIDE, 'L' ) )THEN + NROWA = M + ELSE + NROWA = N + END IF + UPPER = LSAME( UPLO, 'U' ) +* +* Test the input parameters. +* + INFO = 0 + IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. + $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN + INFO = 2 + ELSE IF( M .LT.0 )THEN + INFO = 3 + ELSE IF( N .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, M ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'SSYMM ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, M + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( SIDE, 'L' ) )THEN +* +* Form C := alpha*A*B + beta*C. +* + IF( UPPER )THEN + DO 70, J = 1, N + DO 60, I = 1, M + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 50, K = 1, I - 1 + C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) + TEMP2 = TEMP2 + B( K, J )*A( K, I ) + 50 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*A( I, I ) + ALPHA*TEMP2 + END IF + 60 CONTINUE + 70 CONTINUE + ELSE + DO 100, J = 1, N + DO 90, I = M, 1, -1 + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 80, K = I + 1, M + C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) + TEMP2 = TEMP2 + B( K, J )*A( K, I ) + 80 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*A( I, I ) + ALPHA*TEMP2 + END IF + 90 CONTINUE + 100 CONTINUE + END IF + ELSE +* +* Form C := alpha*B*A + beta*C. +* + DO 170, J = 1, N + TEMP1 = ALPHA*A( J, J ) + IF( BETA.EQ.ZERO )THEN + DO 110, I = 1, M + C( I, J ) = TEMP1*B( I, J ) + 110 CONTINUE + ELSE + DO 120, I = 1, M + C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) + 120 CONTINUE + END IF + DO 140, K = 1, J - 1 + IF( UPPER )THEN + TEMP1 = ALPHA*A( K, J ) + ELSE + TEMP1 = ALPHA*A( J, K ) + END IF + DO 130, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 130 CONTINUE + 140 CONTINUE + DO 160, K = J + 1, N + IF( UPPER )THEN + TEMP1 = ALPHA*A( J, K ) + ELSE + TEMP1 = ALPHA*A( K, J ) + END IF + DO 150, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 150 CONTINUE + 160 CONTINUE + 170 CONTINUE + END IF +* + RETURN +* +* End of SSYMM . +* + END diff --git a/reference/ssymvf.f b/reference/ssymvf.f new file mode 100644 index 0000000..c1ebc35 --- /dev/null +++ b/reference/ssymvf.f @@ -0,0 +1,262 @@ + SUBROUTINE SSYMVF ( UPLO, N, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + REAL ALPHA, BETA + INTEGER INCX, INCY, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + REAL A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* SSYMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n symmetric matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of A is not referenced. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* X - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - REAL . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. On exit, Y is overwritten by the updated +* vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ONE , ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 5 + ELSE IF( INCX.EQ.0 )THEN + INFO = 7 + ELSE IF( INCY.EQ.0 )THEN + INFO = 10 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'SSYMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form y when A is stored in upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + DO 50, I = 1, J - 1 + Y( I ) = Y( I ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( I ) + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*A( J, J ) + ALPHA*TEMP2 + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + DO 70, I = 1, J - 1 + Y( IY ) = Y( IY ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*A( J, J ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 80 CONTINUE + END IF + ELSE +* +* Form y when A is stored in lower triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 100, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*A( J, J ) + DO 90, I = J + 1, N + Y( I ) = Y( I ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( I ) + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*A( J, J ) + IX = JX + IY = JY + DO 110, I = J + 1, N + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of SSYMV . +* + END diff --git a/reference/ssyr2f.f b/reference/ssyr2f.f new file mode 100644 index 0000000..bd962e1 --- /dev/null +++ b/reference/ssyr2f.f @@ -0,0 +1,230 @@ + SUBROUTINE SSYR2F ( UPLO, N, ALPHA, X, INCX, Y, INCY, A, LDA ) +* .. Scalar Arguments .. + REAL ALPHA + INTEGER INCX, INCY, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + REAL A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* SSYR2 performs the symmetric rank 2 operation +* +* A := alpha*x*y' + alpha*y*x' + A, +* +* where alpha is a scalar, x and y are n element vectors and A is an n +* by n symmetric matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of A is not referenced. On exit, the +* upper triangular part of the array A is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of A is not referenced. On exit, the +* lower triangular part of the array A is overwritten by the +* lower triangular part of the updated matrix. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'SSYR2 ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set up the start points in X and Y if the increments are not both +* unity. +* + IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF + JX = KX + JY = KY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when A is stored in the upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 20, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + DO 10, I = 1, J + A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 + 10 CONTINUE + END IF + 20 CONTINUE + ELSE + DO 40, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = KX + IY = KY + DO 30, I = 1, J + A( I, J ) = A( I, J ) + X( IX )*TEMP1 + $ + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 30 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + 40 CONTINUE + END IF + ELSE +* +* Form A when A is stored in the lower triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + DO 50, I = J, N + A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 + 50 CONTINUE + END IF + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = JX + IY = JY + DO 70, I = J, N + A( I, J ) = A( I, J ) + X( IX )*TEMP1 + $ + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of SSYR2 . +* + END diff --git a/reference/ssyr2kf.f b/reference/ssyr2kf.f new file mode 100644 index 0000000..bc214ca --- /dev/null +++ b/reference/ssyr2kf.f @@ -0,0 +1,327 @@ + SUBROUTINE SSYR2KF( UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 UPLO, TRANS + INTEGER N, K, LDA, LDB, LDC + REAL ALPHA, BETA +* .. Array Arguments .. + REAL A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* SSYR2K performs one of the symmetric rank 2k operations +* +* C := alpha*A*B' + alpha*B*A' + beta*C, +* +* or +* +* C := alpha*A'*B + alpha*B'*A + beta*C, +* +* where alpha and beta are scalars, C is an n by n symmetric matrix +* and A and B are n by k matrices in the first case and k by n +* matrices in the second case. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array C is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of C +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of C +* is to be referenced. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' C := alpha*A*B' + alpha*B*A' + +* beta*C. +* +* TRANS = 'T' or 't' C := alpha*A'*B + alpha*B'*A + +* beta*C. +* +* TRANS = 'C' or 'c' C := alpha*A'*B + alpha*B'*A + +* beta*C. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with TRANS = 'N' or 'n', K specifies the number +* of columns of the matrices A and B, and on entry with +* TRANS = 'T' or 't' or 'C' or 'c', K specifies the number +* of rows of the matrices A and B. K must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, ka ), where ka is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array A must contain the matrix A, otherwise +* the leading k by n part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDA must be at least max( 1, n ), otherwise LDA must +* be at least max( 1, k ). +* Unchanged on exit. +* +* B - REAL array of DIMENSION ( LDB, kb ), where kb is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array B must contain the matrix B, otherwise +* the leading k by n part of the array B must contain the +* matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDB must be at least max( 1, n ), otherwise LDB must +* be at least max( 1, k ). +* Unchanged on exit. +* +* BETA - REAL . +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* C - REAL array of DIMENSION ( LDC, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array C must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of C is not referenced. On exit, the +* upper triangular part of the array C is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array C must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of C is not referenced. On exit, the +* lower triangular part of the array C is overwritten by the +* lower triangular part of the updated matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, L, NROWA + REAL TEMP1, TEMP2 +* .. Parameters .. + REAL ONE , ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + IF( LSAME( TRANS, 'N' ) )THEN + NROWA = N + ELSE + NROWA = K + END IF + UPPER = LSAME( UPLO, 'U' ) +* + INFO = 0 + IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANS, 'T' ) ).AND. + $ ( .NOT.LSAME( TRANS, 'C' ) ) )THEN + INFO = 2 + ELSE IF( N .LT.0 )THEN + INFO = 3 + ELSE IF( K .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, NROWA ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, N ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'SSYR2K', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR. + $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( UPPER )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, J + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, J + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE + IF( BETA.EQ.ZERO )THEN + DO 60, J = 1, N + DO 50, I = J, N + C( I, J ) = ZERO + 50 CONTINUE + 60 CONTINUE + ELSE + DO 80, J = 1, N + DO 70, I = J, N + C( I, J ) = BETA*C( I, J ) + 70 CONTINUE + 80 CONTINUE + END IF + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form C := alpha*A*B' + alpha*B*A' + C. +* + IF( UPPER )THEN + DO 130, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 90, I = 1, J + C( I, J ) = ZERO + 90 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 100, I = 1, J + C( I, J ) = BETA*C( I, J ) + 100 CONTINUE + END IF + DO 120, L = 1, K + IF( ( A( J, L ).NE.ZERO ).OR. + $ ( B( J, L ).NE.ZERO ) )THEN + TEMP1 = ALPHA*B( J, L ) + TEMP2 = ALPHA*A( J, L ) + DO 110, I = 1, J + C( I, J ) = C( I, J ) + + $ A( I, L )*TEMP1 + B( I, L )*TEMP2 + 110 CONTINUE + END IF + 120 CONTINUE + 130 CONTINUE + ELSE + DO 180, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 140, I = J, N + C( I, J ) = ZERO + 140 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 150, I = J, N + C( I, J ) = BETA*C( I, J ) + 150 CONTINUE + END IF + DO 170, L = 1, K + IF( ( A( J, L ).NE.ZERO ).OR. + $ ( B( J, L ).NE.ZERO ) )THEN + TEMP1 = ALPHA*B( J, L ) + TEMP2 = ALPHA*A( J, L ) + DO 160, I = J, N + C( I, J ) = C( I, J ) + + $ A( I, L )*TEMP1 + B( I, L )*TEMP2 + 160 CONTINUE + END IF + 170 CONTINUE + 180 CONTINUE + END IF + ELSE +* +* Form C := alpha*A'*B + alpha*B'*A + C. +* + IF( UPPER )THEN + DO 210, J = 1, N + DO 200, I = 1, J + TEMP1 = ZERO + TEMP2 = ZERO + DO 190, L = 1, K + TEMP1 = TEMP1 + A( L, I )*B( L, J ) + TEMP2 = TEMP2 + B( L, I )*A( L, J ) + 190 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ ALPHA*TEMP1 + ALPHA*TEMP2 + END IF + 200 CONTINUE + 210 CONTINUE + ELSE + DO 240, J = 1, N + DO 230, I = J, N + TEMP1 = ZERO + TEMP2 = ZERO + DO 220, L = 1, K + TEMP1 = TEMP1 + A( L, I )*B( L, J ) + TEMP2 = TEMP2 + B( L, I )*A( L, J ) + 220 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ ALPHA*TEMP1 + ALPHA*TEMP2 + END IF + 230 CONTINUE + 240 CONTINUE + END IF + END IF +* + RETURN +* +* End of SSYR2K. +* + END diff --git a/reference/ssyrf.f b/reference/ssyrf.f new file mode 100644 index 0000000..9877f56 --- /dev/null +++ b/reference/ssyrf.f @@ -0,0 +1,197 @@ + SUBROUTINE SSYRF ( UPLO, N, ALPHA, X, INCX, A, LDA ) +* .. Scalar Arguments .. + REAL ALPHA + INTEGER INCX, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + REAL A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* SSYR performs the symmetric rank 1 operation +* +* A := alpha*x*x' + A, +* +* where alpha is a real scalar, x is an n element vector and A is an +* n by n symmetric matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of A is not referenced. On exit, the +* upper triangular part of the array A is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of A is not referenced. On exit, the +* lower triangular part of the array A is overwritten by the +* lower triangular part of the updated matrix. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP + INTEGER I, INFO, IX, J, JX, KX +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'SSYR ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set the start point in X if the increment is not unity. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when A is stored in upper triangle. +* + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*X( J ) + DO 10, I = 1, J + A( I, J ) = A( I, J ) + X( I )*TEMP + 10 CONTINUE + END IF + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + IX = KX + DO 30, I = 1, J + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + END IF + JX = JX + INCX + 40 CONTINUE + END IF + ELSE +* +* Form A when A is stored in lower triangle. +* + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*X( J ) + DO 50, I = J, N + A( I, J ) = A( I, J ) + X( I )*TEMP + 50 CONTINUE + END IF + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + IX = JX + DO 70, I = J, N + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of SSYR . +* + END diff --git a/reference/ssyrkf.f b/reference/ssyrkf.f new file mode 100644 index 0000000..26b2509 --- /dev/null +++ b/reference/ssyrkf.f @@ -0,0 +1,294 @@ + SUBROUTINE SSYRKF ( UPLO, TRANS, N, K, ALPHA, A, LDA, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 UPLO, TRANS + INTEGER N, K, LDA, LDC + REAL ALPHA, BETA +* .. Array Arguments .. + REAL A( LDA, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* SSYRK performs one of the symmetric rank k operations +* +* C := alpha*A*A' + beta*C, +* +* or +* +* C := alpha*A'*A + beta*C, +* +* where alpha and beta are scalars, C is an n by n symmetric matrix +* and A is an n by k matrix in the first case and a k by n matrix +* in the second case. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array C is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of C +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of C +* is to be referenced. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' C := alpha*A*A' + beta*C. +* +* TRANS = 'T' or 't' C := alpha*A'*A + beta*C. +* +* TRANS = 'C' or 'c' C := alpha*A'*A + beta*C. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with TRANS = 'N' or 'n', K specifies the number +* of columns of the matrix A, and on entry with +* TRANS = 'T' or 't' or 'C' or 'c', K specifies the number +* of rows of the matrix A. K must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, ka ), where ka is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array A must contain the matrix A, otherwise +* the leading k by n part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDA must be at least max( 1, n ), otherwise LDA must +* be at least max( 1, k ). +* Unchanged on exit. +* +* BETA - REAL . +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* C - REAL array of DIMENSION ( LDC, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array C must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of C is not referenced. On exit, the +* upper triangular part of the array C is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array C must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of C is not referenced. On exit, the +* lower triangular part of the array C is overwritten by the +* lower triangular part of the updated matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, L, NROWA + REAL TEMP +* .. Parameters .. + REAL ONE , ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + IF( LSAME( TRANS, 'N' ) )THEN + NROWA = N + ELSE + NROWA = K + END IF + UPPER = LSAME( UPLO, 'U' ) +* + INFO = 0 + IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANS, 'T' ) ).AND. + $ ( .NOT.LSAME( TRANS, 'C' ) ) )THEN + INFO = 2 + ELSE IF( N .LT.0 )THEN + INFO = 3 + ELSE IF( K .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDC.LT.MAX( 1, N ) )THEN + INFO = 10 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'SSYRK ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR. + $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( UPPER )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, J + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, J + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE + IF( BETA.EQ.ZERO )THEN + DO 60, J = 1, N + DO 50, I = J, N + C( I, J ) = ZERO + 50 CONTINUE + 60 CONTINUE + ELSE + DO 80, J = 1, N + DO 70, I = J, N + C( I, J ) = BETA*C( I, J ) + 70 CONTINUE + 80 CONTINUE + END IF + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form C := alpha*A*A' + beta*C. +* + IF( UPPER )THEN + DO 130, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 90, I = 1, J + C( I, J ) = ZERO + 90 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 100, I = 1, J + C( I, J ) = BETA*C( I, J ) + 100 CONTINUE + END IF + DO 120, L = 1, K + IF( A( J, L ).NE.ZERO )THEN + TEMP = ALPHA*A( J, L ) + DO 110, I = 1, J + C( I, J ) = C( I, J ) + TEMP*A( I, L ) + 110 CONTINUE + END IF + 120 CONTINUE + 130 CONTINUE + ELSE + DO 180, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 140, I = J, N + C( I, J ) = ZERO + 140 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 150, I = J, N + C( I, J ) = BETA*C( I, J ) + 150 CONTINUE + END IF + DO 170, L = 1, K + IF( A( J, L ).NE.ZERO )THEN + TEMP = ALPHA*A( J, L ) + DO 160, I = J, N + C( I, J ) = C( I, J ) + TEMP*A( I, L ) + 160 CONTINUE + END IF + 170 CONTINUE + 180 CONTINUE + END IF + ELSE +* +* Form C := alpha*A'*A + beta*C. +* + IF( UPPER )THEN + DO 210, J = 1, N + DO 200, I = 1, J + TEMP = ZERO + DO 190, L = 1, K + TEMP = TEMP + A( L, I )*A( L, J ) + 190 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP + ELSE + C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) + END IF + 200 CONTINUE + 210 CONTINUE + ELSE + DO 240, J = 1, N + DO 230, I = J, N + TEMP = ZERO + DO 220, L = 1, K + TEMP = TEMP + A( L, I )*A( L, J ) + 220 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP + ELSE + C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) + END IF + 230 CONTINUE + 240 CONTINUE + END IF + END IF +* + RETURN +* +* End of SSYRK . +* + END diff --git a/reference/stbmvf.f b/reference/stbmvf.f new file mode 100644 index 0000000..353e63e --- /dev/null +++ b/reference/stbmvf.f @@ -0,0 +1,342 @@ + SUBROUTINE STBMVF( UPLO, TRANS, DIAG, N, K, A, LDA, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, K, LDA, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + REAL A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* STBMV performs one of the matrix-vector operations +* +* x := A*x, or x := A'*x, +* +* where x is an n element vector and A is an n by n unit, or non-unit, +* upper or lower triangular band matrix, with ( k + 1 ) diagonals. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' x := A*x. +* +* TRANS = 'T' or 't' x := A'*x. +* +* TRANS = 'C' or 'c' x := A'*x. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with UPLO = 'U' or 'u', K specifies the number of +* super-diagonals of the matrix A. +* On entry with UPLO = 'L' or 'l', K specifies the number of +* sub-diagonals of the matrix A. +* K must satisfy 0 .le. K. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) +* by n part of the array A must contain the upper triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row +* ( k + 1 ) of the array, the first super-diagonal starting at +* position 2 in row k, and so on. The top left k by k triangle +* of the array A is not referenced. +* The following program segment will transfer an upper +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = K + 1 - J +* DO 10, I = MAX( 1, J - K ), J +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) +* by n part of the array A must contain the lower triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row 1 of +* the array, the first sub-diagonal starting at position 1 in +* row 2, and so on. The bottom right k by k triangle of the +* array A is not referenced. +* The following program segment will transfer a lower +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = 1 - J +* DO 10, I = J, MIN( N, J + K ) +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Note that when DIAG = 'U' or 'u' the elements of the array A +* corresponding to the diagonal elements of the matrix are not +* referenced, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( k + 1 ). +* Unchanged on exit. +* +* X - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. On exit, X is overwritten with the +* tranformed vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP + INTEGER I, INFO, IX, J, JX, KPLUS1, KX, L + LOGICAL NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( K.LT.0 )THEN + INFO = 5 + ELSE IF( LDA.LT.( K + 1 ) )THEN + INFO = 7 + ELSE IF( INCX.EQ.0 )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'STBMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOUNIT = LSAME( DIAG, 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form x := A*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KPLUS1 = K + 1 + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + L = KPLUS1 - J + DO 10, I = MAX( 1, J - K ), J - 1 + X( I ) = X( I ) + TEMP*A( L + I, J ) + 10 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*A( KPLUS1, J ) + END IF + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + L = KPLUS1 - J + DO 30, I = MAX( 1, J - K ), J - 1 + X( IX ) = X( IX ) + TEMP*A( L + I, J ) + IX = IX + INCX + 30 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( KPLUS1, J ) + END IF + JX = JX + INCX + IF( J.GT.K ) + $ KX = KX + INCX + 40 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 60, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + L = 1 - J + DO 50, I = MIN( N, J + K ), J + 1, -1 + X( I ) = X( I ) + TEMP*A( L + I, J ) + 50 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*A( 1, J ) + END IF + 60 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 80, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + L = 1 - J + DO 70, I = MIN( N, J + K ), J + 1, -1 + X( IX ) = X( IX ) + TEMP*A( L + I, J ) + IX = IX - INCX + 70 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( 1, J ) + END IF + JX = JX - INCX + IF( ( N - J ).GE.K ) + $ KX = KX - INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := A'*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KPLUS1 = K + 1 + IF( INCX.EQ.1 )THEN + DO 100, J = N, 1, -1 + TEMP = X( J ) + L = KPLUS1 - J + IF( NOUNIT ) + $ TEMP = TEMP*A( KPLUS1, J ) + DO 90, I = J - 1, MAX( 1, J - K ), -1 + TEMP = TEMP + A( L + I, J )*X( I ) + 90 CONTINUE + X( J ) = TEMP + 100 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 120, J = N, 1, -1 + TEMP = X( JX ) + KX = KX - INCX + IX = KX + L = KPLUS1 - J + IF( NOUNIT ) + $ TEMP = TEMP*A( KPLUS1, J ) + DO 110, I = J - 1, MAX( 1, J - K ), -1 + TEMP = TEMP + A( L + I, J )*X( IX ) + IX = IX - INCX + 110 CONTINUE + X( JX ) = TEMP + JX = JX - INCX + 120 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 140, J = 1, N + TEMP = X( J ) + L = 1 - J + IF( NOUNIT ) + $ TEMP = TEMP*A( 1, J ) + DO 130, I = J + 1, MIN( N, J + K ) + TEMP = TEMP + A( L + I, J )*X( I ) + 130 CONTINUE + X( J ) = TEMP + 140 CONTINUE + ELSE + JX = KX + DO 160, J = 1, N + TEMP = X( JX ) + KX = KX + INCX + IX = KX + L = 1 - J + IF( NOUNIT ) + $ TEMP = TEMP*A( 1, J ) + DO 150, I = J + 1, MIN( N, J + K ) + TEMP = TEMP + A( L + I, J )*X( IX ) + IX = IX + INCX + 150 CONTINUE + X( JX ) = TEMP + JX = JX + INCX + 160 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of STBMV . +* + END diff --git a/reference/stbsvf.f b/reference/stbsvf.f new file mode 100644 index 0000000..b0f7e46 --- /dev/null +++ b/reference/stbsvf.f @@ -0,0 +1,336 @@ + SUBROUTINE STBSVF(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX) +* .. Scalar Arguments .. + INTEGER INCX,K,LDA,N + CHARACTER DIAG,TRANS,UPLO +* .. +* .. Array Arguments .. + REAL A(LDA,*),X(*) +* .. +* +* Purpose +* ======= +* +* STBSV solves one of the systems of equations +* +* A*x = b, or A'*x = b, +* +* where b and x are n element vectors and A is an n by n unit, or +* non-unit, upper or lower triangular band matrix, with ( k + 1 ) +* diagonals. +* +* No test for singularity or near-singularity is included in this +* routine. Such tests must be performed before calling this routine. +* +* Arguments +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the equations to be solved as +* follows: +* +* TRANS = 'N' or 'n' A*x = b. +* +* TRANS = 'T' or 't' A'*x = b. +* +* TRANS = 'C' or 'c' A'*x = b. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with UPLO = 'U' or 'u', K specifies the number of +* super-diagonals of the matrix A. +* On entry with UPLO = 'L' or 'l', K specifies the number of +* sub-diagonals of the matrix A. +* K must satisfy 0 .le. K. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) +* by n part of the array A must contain the upper triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row +* ( k + 1 ) of the array, the first super-diagonal starting at +* position 2 in row k, and so on. The top left k by k triangle +* of the array A is not referenced. +* The following program segment will transfer an upper +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = K + 1 - J +* DO 10, I = MAX( 1, J - K ), J +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) +* by n part of the array A must contain the lower triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row 1 of +* the array, the first sub-diagonal starting at position 1 in +* row 2, and so on. The bottom right k by k triangle of the +* array A is not referenced. +* The following program segment will transfer a lower +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = 1 - J +* DO 10, I = J, MIN( N, J + K ) +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Note that when DIAG = 'U' or 'u' the elements of the array A +* corresponding to the diagonal elements of the matrix are not +* referenced, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( k + 1 ). +* Unchanged on exit. +* +* X - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element right-hand side vector b. On exit, X is overwritten +* with the solution vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ZERO + PARAMETER (ZERO=0.0E+0) +* .. +* .. Local Scalars .. + REAL TEMP + INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L + LOGICAL NOUNIT +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX,MIN +* .. +* +* Test the input parameters. +* + INFO = 0 + IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN + INFO = 1 + ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. + + .NOT.LSAME(TRANS,'C')) THEN + INFO = 2 + ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN + INFO = 3 + ELSE IF (N.LT.0) THEN + INFO = 4 + ELSE IF (K.LT.0) THEN + INFO = 5 + ELSE IF (LDA.LT. (K+1)) THEN + INFO = 7 + ELSE IF (INCX.EQ.0) THEN + INFO = 9 + END IF + IF (INFO.NE.0) THEN + CALL XERBLA('STBSV ',INFO) + RETURN + END IF +* +* Quick return if possible. +* + IF (N.EQ.0) RETURN +* + NOUNIT = LSAME(DIAG,'N') +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF (INCX.LE.0) THEN + KX = 1 - (N-1)*INCX + ELSE IF (INCX.NE.1) THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed by sequentially with one pass through A. +* + IF (LSAME(TRANS,'N')) THEN +* +* Form x := inv( A )*x. +* + IF (LSAME(UPLO,'U')) THEN + KPLUS1 = K + 1 + IF (INCX.EQ.1) THEN + DO 20 J = N,1,-1 + IF (X(J).NE.ZERO) THEN + L = KPLUS1 - J + IF (NOUNIT) X(J) = X(J)/A(KPLUS1,J) + TEMP = X(J) + DO 10 I = J - 1,MAX(1,J-K),-1 + X(I) = X(I) - TEMP*A(L+I,J) + 10 CONTINUE + END IF + 20 CONTINUE + ELSE + KX = KX + (N-1)*INCX + JX = KX + DO 40 J = N,1,-1 + KX = KX - INCX + IF (X(JX).NE.ZERO) THEN + IX = KX + L = KPLUS1 - J + IF (NOUNIT) X(JX) = X(JX)/A(KPLUS1,J) + TEMP = X(JX) + DO 30 I = J - 1,MAX(1,J-K),-1 + X(IX) = X(IX) - TEMP*A(L+I,J) + IX = IX - INCX + 30 CONTINUE + END IF + JX = JX - INCX + 40 CONTINUE + END IF + ELSE + IF (INCX.EQ.1) THEN + DO 60 J = 1,N + IF (X(J).NE.ZERO) THEN + L = 1 - J + IF (NOUNIT) X(J) = X(J)/A(1,J) + TEMP = X(J) + DO 50 I = J + 1,MIN(N,J+K) + X(I) = X(I) - TEMP*A(L+I,J) + 50 CONTINUE + END IF + 60 CONTINUE + ELSE + JX = KX + DO 80 J = 1,N + KX = KX + INCX + IF (X(JX).NE.ZERO) THEN + IX = KX + L = 1 - J + IF (NOUNIT) X(JX) = X(JX)/A(1,J) + TEMP = X(JX) + DO 70 I = J + 1,MIN(N,J+K) + X(IX) = X(IX) - TEMP*A(L+I,J) + IX = IX + INCX + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := inv( A')*x. +* + IF (LSAME(UPLO,'U')) THEN + KPLUS1 = K + 1 + IF (INCX.EQ.1) THEN + DO 100 J = 1,N + TEMP = X(J) + L = KPLUS1 - J + DO 90 I = MAX(1,J-K),J - 1 + TEMP = TEMP - A(L+I,J)*X(I) + 90 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(KPLUS1,J) + X(J) = TEMP + 100 CONTINUE + ELSE + JX = KX + DO 120 J = 1,N + TEMP = X(JX) + IX = KX + L = KPLUS1 - J + DO 110 I = MAX(1,J-K),J - 1 + TEMP = TEMP - A(L+I,J)*X(IX) + IX = IX + INCX + 110 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(KPLUS1,J) + X(JX) = TEMP + JX = JX + INCX + IF (J.GT.K) KX = KX + INCX + 120 CONTINUE + END IF + ELSE + IF (INCX.EQ.1) THEN + DO 140 J = N,1,-1 + TEMP = X(J) + L = 1 - J + DO 130 I = MIN(N,J+K),J + 1,-1 + TEMP = TEMP - A(L+I,J)*X(I) + 130 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(1,J) + X(J) = TEMP + 140 CONTINUE + ELSE + KX = KX + (N-1)*INCX + JX = KX + DO 160 J = N,1,-1 + TEMP = X(JX) + IX = KX + L = 1 - J + DO 150 I = MIN(N,J+K),J + 1,-1 + TEMP = TEMP - A(L+I,J)*X(IX) + IX = IX - INCX + 150 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(1,J) + X(JX) = TEMP + JX = JX - INCX + IF ((N-J).GE.K) KX = KX - INCX + 160 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of STBSV . +* + END diff --git a/reference/stpmvf.f b/reference/stpmvf.f new file mode 100644 index 0000000..1e93b84 --- /dev/null +++ b/reference/stpmvf.f @@ -0,0 +1,299 @@ + SUBROUTINE STPMVF( UPLO, TRANS, DIAG, N, AP, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + REAL AP( * ), X( * ) +* .. +* +* Purpose +* ======= +* +* STPMV performs one of the matrix-vector operations +* +* x := A*x, or x := A'*x, +* +* where x is an n element vector and A is an n by n unit, or non-unit, +* upper or lower triangular matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' x := A*x. +* +* TRANS = 'T' or 't' x := A'*x. +* +* TRANS = 'C' or 'c' x := A'*x. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* AP - REAL array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) +* respectively, and so on. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) +* respectively, and so on. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced, but are assumed to be unity. +* Unchanged on exit. +* +* X - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. On exit, X is overwritten with the +* tranformed vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP + INTEGER I, INFO, IX, J, JX, K, KK, KX + LOGICAL NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( INCX.EQ.0 )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'STPMVF', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOUNIT = LSAME( DIAG, 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of AP are +* accessed sequentially with one pass through AP. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form x:= A*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KK =1 + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + K = KK + DO 10, I = 1, J - 1 + X( I ) = X( I ) + TEMP*AP( K ) + K = K + 1 + 10 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*AP( KK + J - 1 ) + END IF + KK = KK + J + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 30, K = KK, KK + J - 2 + X( IX ) = X( IX ) + TEMP*AP( K ) + IX = IX + INCX + 30 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*AP( KK + J - 1 ) + END IF + JX = JX + INCX + KK = KK + J + 40 CONTINUE + END IF + ELSE + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 60, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + K = KK + DO 50, I = N, J + 1, -1 + X( I ) = X( I ) + TEMP*AP( K ) + K = K - 1 + 50 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*AP( KK - N + J ) + END IF + KK = KK - ( N - J + 1 ) + 60 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 80, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 70, K = KK, KK - ( N - ( J + 1 ) ), -1 + X( IX ) = X( IX ) + TEMP*AP( K ) + IX = IX - INCX + 70 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*AP( KK - N + J ) + END IF + JX = JX - INCX + KK = KK - ( N - J + 1 ) + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := A'*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 100, J = N, 1, -1 + TEMP = X( J ) + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + K = KK - 1 + DO 90, I = J - 1, 1, -1 + TEMP = TEMP + AP( K )*X( I ) + K = K - 1 + 90 CONTINUE + X( J ) = TEMP + KK = KK - J + 100 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 120, J = N, 1, -1 + TEMP = X( JX ) + IX = JX + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + DO 110, K = KK - 1, KK - J + 1, -1 + IX = IX - INCX + TEMP = TEMP + AP( K )*X( IX ) + 110 CONTINUE + X( JX ) = TEMP + JX = JX - INCX + KK = KK - J + 120 CONTINUE + END IF + ELSE + KK = 1 + IF( INCX.EQ.1 )THEN + DO 140, J = 1, N + TEMP = X( J ) + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + K = KK + 1 + DO 130, I = J + 1, N + TEMP = TEMP + AP( K )*X( I ) + K = K + 1 + 130 CONTINUE + X( J ) = TEMP + KK = KK + ( N - J + 1 ) + 140 CONTINUE + ELSE + JX = KX + DO 160, J = 1, N + TEMP = X( JX ) + IX = JX + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + DO 150, K = KK + 1, KK + N - J + IX = IX + INCX + TEMP = TEMP + AP( K )*X( IX ) + 150 CONTINUE + X( JX ) = TEMP + JX = JX + INCX + KK = KK + ( N - J + 1 ) + 160 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of STPMV . +* + END diff --git a/reference/stpsvf.f b/reference/stpsvf.f new file mode 100644 index 0000000..9fa2f59 --- /dev/null +++ b/reference/stpsvf.f @@ -0,0 +1,302 @@ + SUBROUTINE STPSVF( UPLO, TRANS, DIAG, N, AP, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + REAL AP( * ), X( * ) +* .. +* +* Purpose +* ======= +* +* STPSV solves one of the systems of equations +* +* A*x = b, or A'*x = b, +* +* where b and x are n element vectors and A is an n by n unit, or +* non-unit, upper or lower triangular matrix, supplied in packed form. +* +* No test for singularity or near-singularity is included in this +* routine. Such tests must be performed before calling this routine. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the equations to be solved as +* follows: +* +* TRANS = 'N' or 'n' A*x = b. +* +* TRANS = 'T' or 't' A'*x = b. +* +* TRANS = 'C' or 'c' A'*x = b. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* AP - REAL array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) +* respectively, and so on. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) +* respectively, and so on. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced, but are assumed to be unity. +* Unchanged on exit. +* +* X - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element right-hand side vector b. On exit, X is overwritten +* with the solution vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP + INTEGER I, INFO, IX, J, JX, K, KK, KX + LOGICAL NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( INCX.EQ.0 )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'STPSV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOUNIT = LSAME( DIAG, 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of AP are +* accessed sequentially with one pass through AP. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form x := inv( A )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 20, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( J ) = X( J )/AP( KK ) + TEMP = X( J ) + K = KK - 1 + DO 10, I = J - 1, 1, -1 + X( I ) = X( I ) - TEMP*AP( K ) + K = K - 1 + 10 CONTINUE + END IF + KK = KK - J + 20 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 40, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/AP( KK ) + TEMP = X( JX ) + IX = JX + DO 30, K = KK - 1, KK - J + 1, -1 + IX = IX - INCX + X( IX ) = X( IX ) - TEMP*AP( K ) + 30 CONTINUE + END IF + JX = JX - INCX + KK = KK - J + 40 CONTINUE + END IF + ELSE + KK = 1 + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( J ) = X( J )/AP( KK ) + TEMP = X( J ) + K = KK + 1 + DO 50, I = J + 1, N + X( I ) = X( I ) - TEMP*AP( K ) + K = K + 1 + 50 CONTINUE + END IF + KK = KK + ( N - J + 1 ) + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/AP( KK ) + TEMP = X( JX ) + IX = JX + DO 70, K = KK + 1, KK + N - J + IX = IX + INCX + X( IX ) = X( IX ) - TEMP*AP( K ) + 70 CONTINUE + END IF + JX = JX + INCX + KK = KK + ( N - J + 1 ) + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := inv( A' )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KK = 1 + IF( INCX.EQ.1 )THEN + DO 100, J = 1, N + TEMP = X( J ) + K = KK + DO 90, I = 1, J - 1 + TEMP = TEMP - AP( K )*X( I ) + K = K + 1 + 90 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK + J - 1 ) + X( J ) = TEMP + KK = KK + J + 100 CONTINUE + ELSE + JX = KX + DO 120, J = 1, N + TEMP = X( JX ) + IX = KX + DO 110, K = KK, KK + J - 2 + TEMP = TEMP - AP( K )*X( IX ) + IX = IX + INCX + 110 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK + J - 1 ) + X( JX ) = TEMP + JX = JX + INCX + KK = KK + J + 120 CONTINUE + END IF + ELSE + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 140, J = N, 1, -1 + TEMP = X( J ) + K = KK + DO 130, I = N, J + 1, -1 + TEMP = TEMP - AP( K )*X( I ) + K = K - 1 + 130 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK - N + J ) + X( J ) = TEMP + KK = KK - ( N - J + 1 ) + 140 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 160, J = N, 1, -1 + TEMP = X( JX ) + IX = KX + DO 150, K = KK, KK - ( N - ( J + 1 ) ), -1 + TEMP = TEMP - AP( K )*X( IX ) + IX = IX - INCX + 150 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK - N + J ) + X( JX ) = TEMP + JX = JX - INCX + KK = KK - (N - J + 1 ) + 160 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of STPSV . +* + END diff --git a/reference/strmmf.f b/reference/strmmf.f new file mode 100644 index 0000000..04ea865 --- /dev/null +++ b/reference/strmmf.f @@ -0,0 +1,355 @@ + SUBROUTINE STRMMF ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, + $ B, LDB ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + INTEGER M, N, LDA, LDB + REAL ALPHA +* .. Array Arguments .. + REAL A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* STRMM performs one of the matrix-matrix operations +* +* B := alpha*op( A )*B, or B := alpha*B*op( A ), +* +* where alpha is a scalar, B is an m by n matrix, A is a unit, or +* non-unit, upper or lower triangular matrix and op( A ) is one of +* +* op( A ) = A or op( A ) = A'. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether op( A ) multiplies B from +* the left or right as follows: +* +* SIDE = 'L' or 'l' B := alpha*op( A )*B. +* +* SIDE = 'R' or 'r' B := alpha*B*op( A ). +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix A is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANSA - CHARACTER*1. +* On entry, TRANSA specifies the form of op( A ) to be used in +* the matrix multiplication as follows: +* +* TRANSA = 'N' or 'n' op( A ) = A. +* +* TRANSA = 'T' or 't' op( A ) = A'. +* +* TRANSA = 'C' or 'c' op( A ) = A'. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit triangular +* as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of B. M must be at +* least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of B. N must be +* at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. When alpha is +* zero then A is not referenced and B need not be set before +* entry. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, k ), where k is m +* when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. +* Before entry with UPLO = 'U' or 'u', the leading k by k +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading k by k +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' +* then LDA must be at least max( 1, n ). +* Unchanged on exit. +* +* B - REAL array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the matrix B, and on exit is overwritten by the +* transformed matrix. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL LSIDE, NOUNIT, UPPER + INTEGER I, INFO, J, K, NROWA + REAL TEMP +* .. Parameters .. + REAL ONE , ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + LSIDE = LSAME( SIDE , 'L' ) + IF( LSIDE )THEN + NROWA = M + ELSE + NROWA = N + END IF + NOUNIT = LSAME( DIAG , 'N' ) + UPPER = LSAME( UPLO , 'U' ) +* + INFO = 0 + IF( ( .NOT.LSIDE ).AND. + $ ( .NOT.LSAME( SIDE , 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 2 + ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'T' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'C' ) ) )THEN + INFO = 3 + ELSE IF( ( .NOT.LSAME( DIAG , 'U' ) ).AND. + $ ( .NOT.LSAME( DIAG , 'N' ) ) )THEN + INFO = 4 + ELSE IF( M .LT.0 )THEN + INFO = 5 + ELSE IF( N .LT.0 )THEN + INFO = 6 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 9 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 11 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'STRMM ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + B( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + RETURN + END IF +* +* Start the operations. +* + IF( LSIDE )THEN + IF( LSAME( TRANSA, 'N' ) )THEN +* +* Form B := alpha*A*B. +* + IF( UPPER )THEN + DO 50, J = 1, N + DO 40, K = 1, M + IF( B( K, J ).NE.ZERO )THEN + TEMP = ALPHA*B( K, J ) + DO 30, I = 1, K - 1 + B( I, J ) = B( I, J ) + TEMP*A( I, K ) + 30 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP*A( K, K ) + B( K, J ) = TEMP + END IF + 40 CONTINUE + 50 CONTINUE + ELSE + DO 80, J = 1, N + DO 70 K = M, 1, -1 + IF( B( K, J ).NE.ZERO )THEN + TEMP = ALPHA*B( K, J ) + B( K, J ) = TEMP + IF( NOUNIT ) + $ B( K, J ) = B( K, J )*A( K, K ) + DO 60, I = K + 1, M + B( I, J ) = B( I, J ) + TEMP*A( I, K ) + 60 CONTINUE + END IF + 70 CONTINUE + 80 CONTINUE + END IF + ELSE +* +* Form B := alpha*A'*B. +* + IF( UPPER )THEN + DO 110, J = 1, N + DO 100, I = M, 1, -1 + TEMP = B( I, J ) + IF( NOUNIT ) + $ TEMP = TEMP*A( I, I ) + DO 90, K = 1, I - 1 + TEMP = TEMP + A( K, I )*B( K, J ) + 90 CONTINUE + B( I, J ) = ALPHA*TEMP + 100 CONTINUE + 110 CONTINUE + ELSE + DO 140, J = 1, N + DO 130, I = 1, M + TEMP = B( I, J ) + IF( NOUNIT ) + $ TEMP = TEMP*A( I, I ) + DO 120, K = I + 1, M + TEMP = TEMP + A( K, I )*B( K, J ) + 120 CONTINUE + B( I, J ) = ALPHA*TEMP + 130 CONTINUE + 140 CONTINUE + END IF + END IF + ELSE + IF( LSAME( TRANSA, 'N' ) )THEN +* +* Form B := alpha*B*A. +* + IF( UPPER )THEN + DO 180, J = N, 1, -1 + TEMP = ALPHA + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 150, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 150 CONTINUE + DO 170, K = 1, J - 1 + IF( A( K, J ).NE.ZERO )THEN + TEMP = ALPHA*A( K, J ) + DO 160, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 160 CONTINUE + END IF + 170 CONTINUE + 180 CONTINUE + ELSE + DO 220, J = 1, N + TEMP = ALPHA + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 190, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 190 CONTINUE + DO 210, K = J + 1, N + IF( A( K, J ).NE.ZERO )THEN + TEMP = ALPHA*A( K, J ) + DO 200, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 200 CONTINUE + END IF + 210 CONTINUE + 220 CONTINUE + END IF + ELSE +* +* Form B := alpha*B*A'. +* + IF( UPPER )THEN + DO 260, K = 1, N + DO 240, J = 1, K - 1 + IF( A( J, K ).NE.ZERO )THEN + TEMP = ALPHA*A( J, K ) + DO 230, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 230 CONTINUE + END IF + 240 CONTINUE + TEMP = ALPHA + IF( NOUNIT ) + $ TEMP = TEMP*A( K, K ) + IF( TEMP.NE.ONE )THEN + DO 250, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 250 CONTINUE + END IF + 260 CONTINUE + ELSE + DO 300, K = N, 1, -1 + DO 280, J = K + 1, N + IF( A( J, K ).NE.ZERO )THEN + TEMP = ALPHA*A( J, K ) + DO 270, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 270 CONTINUE + END IF + 280 CONTINUE + TEMP = ALPHA + IF( NOUNIT ) + $ TEMP = TEMP*A( K, K ) + IF( TEMP.NE.ONE )THEN + DO 290, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 290 CONTINUE + END IF + 300 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of STRMM . +* + END diff --git a/reference/strmvf.f b/reference/strmvf.f new file mode 100644 index 0000000..249aff2 --- /dev/null +++ b/reference/strmvf.f @@ -0,0 +1,286 @@ + SUBROUTINE STRMVF ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, LDA, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + REAL A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* STRMV performs one of the matrix-vector operations +* +* x := A*x, or x := A'*x, +* +* where x is an n element vector and A is an n by n unit, or non-unit, +* upper or lower triangular matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' x := A*x. +* +* TRANS = 'T' or 't' x := A'*x. +* +* TRANS = 'C' or 'c' x := A'*x. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* X - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. On exit, X is overwritten with the +* tranformed vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP + INTEGER I, INFO, IX, J, JX, KX + LOGICAL NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'STRMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOUNIT = LSAME( DIAG, 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form x := A*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + DO 10, I = 1, J - 1 + X( I ) = X( I ) + TEMP*A( I, J ) + 10 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*A( J, J ) + END IF + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 30, I = 1, J - 1 + X( IX ) = X( IX ) + TEMP*A( I, J ) + IX = IX + INCX + 30 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( J, J ) + END IF + JX = JX + INCX + 40 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 60, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + DO 50, I = N, J + 1, -1 + X( I ) = X( I ) + TEMP*A( I, J ) + 50 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*A( J, J ) + END IF + 60 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 80, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 70, I = N, J + 1, -1 + X( IX ) = X( IX ) + TEMP*A( I, J ) + IX = IX - INCX + 70 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( J, J ) + END IF + JX = JX - INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := A'*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 100, J = N, 1, -1 + TEMP = X( J ) + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 90, I = J - 1, 1, -1 + TEMP = TEMP + A( I, J )*X( I ) + 90 CONTINUE + X( J ) = TEMP + 100 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 120, J = N, 1, -1 + TEMP = X( JX ) + IX = JX + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 110, I = J - 1, 1, -1 + IX = IX - INCX + TEMP = TEMP + A( I, J )*X( IX ) + 110 CONTINUE + X( JX ) = TEMP + JX = JX - INCX + 120 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 140, J = 1, N + TEMP = X( J ) + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 130, I = J + 1, N + TEMP = TEMP + A( I, J )*X( I ) + 130 CONTINUE + X( J ) = TEMP + 140 CONTINUE + ELSE + JX = KX + DO 160, J = 1, N + TEMP = X( JX ) + IX = JX + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 150, I = J + 1, N + IX = IX + INCX + TEMP = TEMP + A( I, J )*X( IX ) + 150 CONTINUE + X( JX ) = TEMP + JX = JX + INCX + 160 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of STRMV . +* + END diff --git a/reference/strsmf.f b/reference/strsmf.f new file mode 100644 index 0000000..31d71a7 --- /dev/null +++ b/reference/strsmf.f @@ -0,0 +1,378 @@ + SUBROUTINE STRSMF ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, + $ B, LDB ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + INTEGER M, N, LDA, LDB + REAL ALPHA +* .. Array Arguments .. + REAL A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* STRSM solves one of the matrix equations +* +* op( A )*X = alpha*B, or X*op( A ) = alpha*B, +* +* where alpha is a scalar, X and B are m by n matrices, A is a unit, or +* non-unit, upper or lower triangular matrix and op( A ) is one of +* +* op( A ) = A or op( A ) = A'. +* +* The matrix X is overwritten on B. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether op( A ) appears on the left +* or right of X as follows: +* +* SIDE = 'L' or 'l' op( A )*X = alpha*B. +* +* SIDE = 'R' or 'r' X*op( A ) = alpha*B. +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix A is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANSA - CHARACTER*1. +* On entry, TRANSA specifies the form of op( A ) to be used in +* the matrix multiplication as follows: +* +* TRANSA = 'N' or 'n' op( A ) = A. +* +* TRANSA = 'T' or 't' op( A ) = A'. +* +* TRANSA = 'C' or 'c' op( A ) = A'. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit triangular +* as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of B. M must be at +* least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of B. N must be +* at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. When alpha is +* zero then A is not referenced and B need not be set before +* entry. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, k ), where k is m +* when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. +* Before entry with UPLO = 'U' or 'u', the leading k by k +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading k by k +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' +* then LDA must be at least max( 1, n ). +* Unchanged on exit. +* +* B - REAL array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the right-hand side matrix B, and on exit is +* overwritten by the solution matrix X. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL LSIDE, NOUNIT, UPPER + INTEGER I, INFO, J, K, NROWA + REAL TEMP +* .. Parameters .. + REAL ONE , ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + LSIDE = LSAME( SIDE , 'L' ) + IF( LSIDE )THEN + NROWA = M + ELSE + NROWA = N + END IF + NOUNIT = LSAME( DIAG , 'N' ) + UPPER = LSAME( UPLO , 'U' ) +* + INFO = 0 + IF( ( .NOT.LSIDE ).AND. + $ ( .NOT.LSAME( SIDE , 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 2 + ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'T' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'C' ) ) )THEN + INFO = 3 + ELSE IF( ( .NOT.LSAME( DIAG , 'U' ) ).AND. + $ ( .NOT.LSAME( DIAG , 'N' ) ) )THEN + INFO = 4 + ELSE IF( M .LT.0 )THEN + INFO = 5 + ELSE IF( N .LT.0 )THEN + INFO = 6 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 9 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 11 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'STRSM ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + B( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + RETURN + END IF +* +* Start the operations. +* + IF( LSIDE )THEN + IF( LSAME( TRANSA, 'N' ) )THEN +* +* Form B := alpha*inv( A )*B. +* + IF( UPPER )THEN + DO 60, J = 1, N + IF( ALPHA.NE.ONE )THEN + DO 30, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 30 CONTINUE + END IF + DO 50, K = M, 1, -1 + IF( B( K, J ).NE.ZERO )THEN + IF( NOUNIT ) + $ B( K, J ) = B( K, J )/A( K, K ) + DO 40, I = 1, K - 1 + B( I, J ) = B( I, J ) - B( K, J )*A( I, K ) + 40 CONTINUE + END IF + 50 CONTINUE + 60 CONTINUE + ELSE + DO 100, J = 1, N + IF( ALPHA.NE.ONE )THEN + DO 70, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 70 CONTINUE + END IF + DO 90 K = 1, M + IF( B( K, J ).NE.ZERO )THEN + IF( NOUNIT ) + $ B( K, J ) = B( K, J )/A( K, K ) + DO 80, I = K + 1, M + B( I, J ) = B( I, J ) - B( K, J )*A( I, K ) + 80 CONTINUE + END IF + 90 CONTINUE + 100 CONTINUE + END IF + ELSE +* +* Form B := alpha*inv( A' )*B. +* + IF( UPPER )THEN + DO 130, J = 1, N + DO 120, I = 1, M + TEMP = ALPHA*B( I, J ) + DO 110, K = 1, I - 1 + TEMP = TEMP - A( K, I )*B( K, J ) + 110 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( I, I ) + B( I, J ) = TEMP + 120 CONTINUE + 130 CONTINUE + ELSE + DO 160, J = 1, N + DO 150, I = M, 1, -1 + TEMP = ALPHA*B( I, J ) + DO 140, K = I + 1, M + TEMP = TEMP - A( K, I )*B( K, J ) + 140 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( I, I ) + B( I, J ) = TEMP + 150 CONTINUE + 160 CONTINUE + END IF + END IF + ELSE + IF( LSAME( TRANSA, 'N' ) )THEN +* +* Form B := alpha*B*inv( A ). +* + IF( UPPER )THEN + DO 210, J = 1, N + IF( ALPHA.NE.ONE )THEN + DO 170, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 170 CONTINUE + END IF + DO 190, K = 1, J - 1 + IF( A( K, J ).NE.ZERO )THEN + DO 180, I = 1, M + B( I, J ) = B( I, J ) - A( K, J )*B( I, K ) + 180 CONTINUE + END IF + 190 CONTINUE + IF( NOUNIT )THEN + TEMP = ONE/A( J, J ) + DO 200, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 200 CONTINUE + END IF + 210 CONTINUE + ELSE + DO 260, J = N, 1, -1 + IF( ALPHA.NE.ONE )THEN + DO 220, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 220 CONTINUE + END IF + DO 240, K = J + 1, N + IF( A( K, J ).NE.ZERO )THEN + DO 230, I = 1, M + B( I, J ) = B( I, J ) - A( K, J )*B( I, K ) + 230 CONTINUE + END IF + 240 CONTINUE + IF( NOUNIT )THEN + TEMP = ONE/A( J, J ) + DO 250, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 250 CONTINUE + END IF + 260 CONTINUE + END IF + ELSE +* +* Form B := alpha*B*inv( A' ). +* + IF( UPPER )THEN + DO 310, K = N, 1, -1 + IF( NOUNIT )THEN + TEMP = ONE/A( K, K ) + DO 270, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 270 CONTINUE + END IF + DO 290, J = 1, K - 1 + IF( A( J, K ).NE.ZERO )THEN + TEMP = A( J, K ) + DO 280, I = 1, M + B( I, J ) = B( I, J ) - TEMP*B( I, K ) + 280 CONTINUE + END IF + 290 CONTINUE + IF( ALPHA.NE.ONE )THEN + DO 300, I = 1, M + B( I, K ) = ALPHA*B( I, K ) + 300 CONTINUE + END IF + 310 CONTINUE + ELSE + DO 360, K = 1, N + IF( NOUNIT )THEN + TEMP = ONE/A( K, K ) + DO 320, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 320 CONTINUE + END IF + DO 340, J = K + 1, N + IF( A( J, K ).NE.ZERO )THEN + TEMP = A( J, K ) + DO 330, I = 1, M + B( I, J ) = B( I, J ) - TEMP*B( I, K ) + 330 CONTINUE + END IF + 340 CONTINUE + IF( ALPHA.NE.ONE )THEN + DO 350, I = 1, M + B( I, K ) = ALPHA*B( I, K ) + 350 CONTINUE + END IF + 360 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of STRSM . +* + END diff --git a/reference/strsvf.f b/reference/strsvf.f new file mode 100644 index 0000000..dcf020f --- /dev/null +++ b/reference/strsvf.f @@ -0,0 +1,289 @@ + SUBROUTINE STRSVF ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, LDA, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + REAL A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* STRSV solves one of the systems of equations +* +* A*x = b, or A'*x = b, +* +* where b and x are n element vectors and A is an n by n unit, or +* non-unit, upper or lower triangular matrix. +* +* No test for singularity or near-singularity is included in this +* routine. Such tests must be performed before calling this routine. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the equations to be solved as +* follows: +* +* TRANS = 'N' or 'n' A*x = b. +* +* TRANS = 'T' or 't' A'*x = b. +* +* TRANS = 'C' or 'c' A'*x = b. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* X - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element right-hand side vector b. On exit, X is overwritten +* with the solution vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP + INTEGER I, INFO, IX, J, JX, KX + LOGICAL NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'STRSV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOUNIT = LSAME( DIAG, 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form x := inv( A )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 20, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( J ) = X( J )/A( J, J ) + TEMP = X( J ) + DO 10, I = J - 1, 1, -1 + X( I ) = X( I ) - TEMP*A( I, J ) + 10 CONTINUE + END IF + 20 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 40, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/A( J, J ) + TEMP = X( JX ) + IX = JX + DO 30, I = J - 1, 1, -1 + IX = IX - INCX + X( IX ) = X( IX ) - TEMP*A( I, J ) + 30 CONTINUE + END IF + JX = JX - INCX + 40 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( J ) = X( J )/A( J, J ) + TEMP = X( J ) + DO 50, I = J + 1, N + X( I ) = X( I ) - TEMP*A( I, J ) + 50 CONTINUE + END IF + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/A( J, J ) + TEMP = X( JX ) + IX = JX + DO 70, I = J + 1, N + IX = IX + INCX + X( IX ) = X( IX ) - TEMP*A( I, J ) + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := inv( A' )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 100, J = 1, N + TEMP = X( J ) + DO 90, I = 1, J - 1 + TEMP = TEMP - A( I, J )*X( I ) + 90 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + X( J ) = TEMP + 100 CONTINUE + ELSE + JX = KX + DO 120, J = 1, N + TEMP = X( JX ) + IX = KX + DO 110, I = 1, J - 1 + TEMP = TEMP - A( I, J )*X( IX ) + IX = IX + INCX + 110 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + X( JX ) = TEMP + JX = JX + INCX + 120 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 140, J = N, 1, -1 + TEMP = X( J ) + DO 130, I = N, J + 1, -1 + TEMP = TEMP - A( I, J )*X( I ) + 130 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + X( J ) = TEMP + 140 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 160, J = N, 1, -1 + TEMP = X( JX ) + IX = KX + DO 150, I = N, J + 1, -1 + TEMP = TEMP - A( I, J )*X( IX ) + IX = IX - INCX + 150 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + X( JX ) = TEMP + JX = JX - INCX + 160 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of STRSV . +* + END diff --git a/reference/strti2f.f b/reference/strti2f.f new file mode 100644 index 0000000..b859cff --- /dev/null +++ b/reference/strti2f.f @@ -0,0 +1,146 @@ + SUBROUTINE STRTI2F( UPLO, DIAG, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER DIAG, UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + REAL A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* STRTI2 computes the inverse of a real upper or lower triangular +* matrix. +* +* This is the Level 2 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the matrix A is upper or lower triangular. +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* DIAG (input) CHARACTER*1 +* Specifies whether or not the matrix A is unit triangular. +* = 'N': Non-unit triangular +* = 'U': Unit triangular +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the triangular matrix A. If UPLO = 'U', the +* leading n by n upper triangular part of the array A contains +* the upper triangular matrix, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading n by n lower triangular part of the array A contains +* the lower triangular matrix, and the strictly upper +* triangular part of A is not referenced. If DIAG = 'U', the +* diagonal elements of A are also not referenced and are +* assumed to be 1. +* +* On exit, the (triangular) inverse of the original matrix, in +* the same storage format. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL NOUNIT, UPPER + INTEGER J + REAL AJJ +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL SSCAL, STRMV, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOUNIT = LSAME( DIAG, 'N' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'STRTI2', -INFO ) + RETURN + END IF +* + IF( UPPER ) THEN +* +* Compute inverse of upper triangular matrix. +* + DO 10 J = 1, N + IF( NOUNIT ) THEN + A( J, J ) = ONE / A( J, J ) + AJJ = -A( J, J ) + ELSE + AJJ = -ONE + END IF +* +* Compute elements 1:j-1 of j-th column. +* + CALL STRMV( 'Upper', 'No transpose', DIAG, J-1, A, LDA, + $ A( 1, J ), 1 ) + CALL SSCAL( J-1, AJJ, A( 1, J ), 1 ) + 10 CONTINUE + ELSE +* +* Compute inverse of lower triangular matrix. +* + DO 20 J = N, 1, -1 + IF( NOUNIT ) THEN + A( J, J ) = ONE / A( J, J ) + AJJ = -A( J, J ) + ELSE + AJJ = -ONE + END IF + IF( J.LT.N ) THEN +* +* Compute elements j+1:n of j-th column. +* + CALL STRMV( 'Lower', 'No transpose', DIAG, N-J, + $ A( J+1, J+1 ), LDA, A( J+1, J ), 1 ) + CALL SSCAL( N-J, AJJ, A( J+1, J ), 1 ) + END IF + 20 CONTINUE + END IF +* + RETURN +* +* End of STRTI2 +* + END diff --git a/reference/strtrif.f b/reference/strtrif.f new file mode 100644 index 0000000..27e3234 --- /dev/null +++ b/reference/strtrif.f @@ -0,0 +1,176 @@ + SUBROUTINE STRTRIF( UPLO, DIAG, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* March 31, 1993 +* +* .. Scalar Arguments .. + CHARACTER DIAG, UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + REAL A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* STRTRI computes the inverse of a real upper or lower triangular +* matrix A. +* +* This is the Level 3 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': A is upper triangular; +* = 'L': A is lower triangular. +* +* DIAG (input) CHARACTER*1 +* = 'N': A is non-unit triangular; +* = 'U': A is unit triangular. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the triangular matrix A. If UPLO = 'U', the +* leading N-by-N upper triangular part of the array A contains +* the upper triangular matrix, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading N-by-N lower triangular part of the array A contains +* the lower triangular matrix, and the strictly upper +* triangular part of A is not referenced. If DIAG = 'U', the +* diagonal elements of A are also not referenced and are +* assumed to be 1. +* On exit, the (triangular) inverse of the original matrix, in +* the same storage format. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, A(i,i) is exactly zero. The triangular +* matrix is singular and its inverse can not be computed. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE, ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL NOUNIT, UPPER + INTEGER J, JB, NB, NN +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL STRMM, STRSM, STRTI2, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOUNIT = LSAME( DIAG, 'N' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'STRTRI', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Check for singularity if non-unit. +* + IF( NOUNIT ) THEN + DO 10 INFO = 1, N + IF( A( INFO, INFO ).EQ.ZERO ) + $ RETURN + 10 CONTINUE + INFO = 0 + END IF +* +* Determine the block size for this environment. +* + NB = 128 + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code +* + CALL STRTI2( UPLO, DIAG, N, A, LDA, INFO ) + ELSE +* +* Use blocked code +* + IF( UPPER ) THEN +* +* Compute inverse of upper triangular matrix +* + DO 20 J = 1, N, NB + JB = MIN( NB, N-J+1 ) +* +* Compute rows 1:j-1 of current block column +* + CALL STRMM( 'Left', 'Upper', 'No transpose', DIAG, J-1, + $ JB, ONE, A, LDA, A( 1, J ), LDA ) + CALL STRSM( 'Right', 'Upper', 'No transpose', DIAG, J-1, + $ JB, -ONE, A( J, J ), LDA, A( 1, J ), LDA ) +* +* Compute inverse of current diagonal block +* + CALL STRTI2( 'Upper', DIAG, JB, A( J, J ), LDA, INFO ) + 20 CONTINUE + ELSE +* +* Compute inverse of lower triangular matrix +* + NN = ( ( N-1 ) / NB )*NB + 1 + DO 30 J = NN, 1, -NB + JB = MIN( NB, N-J+1 ) + IF( J+JB.LE.N ) THEN +* +* Compute rows j+jb:n of current block column +* + CALL STRMM( 'Left', 'Lower', 'No transpose', DIAG, + $ N-J-JB+1, JB, ONE, A( J+JB, J+JB ), LDA, + $ A( J+JB, J ), LDA ) + CALL STRSM( 'Right', 'Lower', 'No transpose', DIAG, + $ N-J-JB+1, JB, -ONE, A( J, J ), LDA, + $ A( J+JB, J ), LDA ) + END IF +* +* Compute inverse of current diagonal block +* + CALL STRTI2( 'Lower', DIAG, JB, A( J, J ), LDA, INFO ) + 30 CONTINUE + END IF + END IF +* + RETURN +* +* End of STRTRI +* + END diff --git a/reference/zaxpycf.f b/reference/zaxpycf.f new file mode 100644 index 0000000..aaf21da --- /dev/null +++ b/reference/zaxpycf.f @@ -0,0 +1,36 @@ + subroutine zaxpycf(n,za,zx,incx,zy,incy) +c +c constant times a vector plus a vector. +c jack dongarra, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double complex zx(*),zy(*),za + integer i,incx,incy,ix,iy,n + double precision dcabs1 + INTRINSIC dconjg + + if(n.le.0)return + if (dcabs1(za) .eq. 0.0d0) return + if (incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + zy(iy) = zy(iy) + za*dconjg(zx(ix)) + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c + 20 do 30 i = 1,n + zy(i) = zy(i) + za*dconjg(zx(i)) + 30 continue + return + end diff --git a/reference/zaxpyf.f b/reference/zaxpyf.f new file mode 100644 index 0000000..2f0f6a0 --- /dev/null +++ b/reference/zaxpyf.f @@ -0,0 +1,34 @@ + subroutine zaxpyf(n,za,zx,incx,zy,incy) +c +c constant times a vector plus a vector. +c jack dongarra, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double complex zx(*),zy(*),za + integer i,incx,incy,ix,iy,n + double precision dcabs1 + if(n.le.0)return + if (dcabs1(za) .eq. 0.0d0) return + if (incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + zy(iy) = zy(iy) + za*zx(ix) + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c + 20 do 30 i = 1,n + zy(i) = zy(i) + za*zx(i) + 30 continue + return + end diff --git a/reference/zcopyf.f b/reference/zcopyf.f new file mode 100644 index 0000000..a3bfdfc --- /dev/null +++ b/reference/zcopyf.f @@ -0,0 +1,33 @@ + subroutine zcopyf(n,zx,incx,zy,incy) +c +c copies a vector, x, to a vector, y. +c jack dongarra, linpack, 4/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double complex zx(*),zy(*) + integer i,incx,incy,ix,iy,n +c + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + zy(iy) = zx(ix) + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c + 20 do 30 i = 1,n + zy(i) = zx(i) + 30 continue + return + end diff --git a/reference/zdotcf.f b/reference/zdotcf.f new file mode 100644 index 0000000..1611aee --- /dev/null +++ b/reference/zdotcf.f @@ -0,0 +1,36 @@ + double complex function zdotcf(n,zx,incx,zy,incy) +c +c forms the dot product of a vector. +c jack dongarra, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double complex zx(*),zy(*),ztemp + integer i,incx,incy,ix,iy,n + ztemp = (0.0d0,0.0d0) + zdotcf = (0.0d0,0.0d0) + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + ztemp = ztemp + dconjg(zx(ix))*zy(iy) + ix = ix + incx + iy = iy + incy + 10 continue + zdotcf = ztemp + return +c +c code for both increments equal to 1 +c + 20 do 30 i = 1,n + ztemp = ztemp + dconjg(zx(i))*zy(i) + 30 continue + zdotcf = ztemp + return + end diff --git a/reference/zdotuf.f b/reference/zdotuf.f new file mode 100644 index 0000000..cc2ea93 --- /dev/null +++ b/reference/zdotuf.f @@ -0,0 +1,36 @@ + double complex function zdotuf(n,zx,incx,zy,incy) +c +c forms the dot product of two vectors. +c jack dongarra, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double complex zx(*),zy(*),ztemp + integer i,incx,incy,ix,iy,n + ztemp = (0.0d0,0.0d0) + zdotuf = (0.0d0,0.0d0) + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + ztemp = ztemp + zx(ix)*zy(iy) + ix = ix + incx + iy = iy + incy + 10 continue + zdotuf = ztemp + return +c +c code for both increments equal to 1 +c + 20 do 30 i = 1,n + ztemp = ztemp + zx(i)*zy(i) + 30 continue + zdotuf = ztemp + return + end diff --git a/reference/zdrotf.f b/reference/zdrotf.f new file mode 100644 index 0000000..fe11288 --- /dev/null +++ b/reference/zdrotf.f @@ -0,0 +1,38 @@ + subroutine zdrotf (n,zx,incx,zy,incy,c,s) +c +c applies a plane rotation, where the cos and sin (c and s) are +c double precision and the vectors zx and zy are double complex. +c jack dongarra, linpack, 3/11/78. +c + double complex zx(1),zy(1),ztemp + double precision c,s + integer i,incx,incy,ix,iy,n +c + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments not equal +c to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + ztemp = c*zx(ix) + s*zy(iy) + zy(iy) = c*zy(iy) - s*zx(ix) + zx(ix) = ztemp + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c + 20 do 30 i = 1,n + ztemp = c*zx(i) + s*zy(i) + zy(i) = c*zy(i) - s*zx(i) + zx(i) = ztemp + 30 continue + return + end diff --git a/reference/zdscalf.f b/reference/zdscalf.f new file mode 100644 index 0000000..0ac1534 --- /dev/null +++ b/reference/zdscalf.f @@ -0,0 +1,30 @@ + subroutine zdscalf(n,da,zx,incx) +c +c scales a vector by a constant. +c jack dongarra, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double complex zx(*) + double precision da + integer i,incx,ix,n +c + if( n.le.0 .or. incx.le.0 )return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + do 10 i = 1,n + zx(ix) = dcmplx(da,0.0d0)*zx(ix) + ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 do 30 i = 1,n + zx(i) = dcmplx(da,0.0d0)*zx(i) + 30 continue + return + end diff --git a/reference/zgbmvf.f b/reference/zgbmvf.f new file mode 100644 index 0000000..bd888b1 --- /dev/null +++ b/reference/zgbmvf.f @@ -0,0 +1,450 @@ + SUBROUTINE ZGBMVF( TRANS, M, N, KL, KU, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA, BETA + INTEGER INCX, INCY, KL, KU, LDA, M, N + CHARACTER*1 TRANS +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* ZGBMV performs one of the matrix-vector operations +* +* y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, or +* +* y := alpha*conjg( A' )*x + beta*y, +* +* where alpha and beta are scalars, x and y are vectors and A is an +* m by n band matrix, with kl sub-diagonals and ku super-diagonals. +* +* Parameters +* ========== +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' y := alpha*A*x + beta*y. +* +* TRANS = 'T' or 't' y := alpha*A'*x + beta*y. +* +* TRANS = 'C' or 'c' y := alpha*conjg( A' )*x + beta*y. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* KL - INTEGER. +* On entry, KL specifies the number of sub-diagonals of the +* matrix A. KL must satisfy 0 .le. KL. +* Unchanged on exit. +* +* KU - INTEGER. +* On entry, KU specifies the number of super-diagonals of the +* matrix A. KU must satisfy 0 .le. KU. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, n ). +* Before entry, the leading ( kl + ku + 1 ) by n part of the +* array A must contain the matrix of coefficients, supplied +* column by column, with the leading diagonal of the matrix in +* row ( ku + 1 ) of the array, the first super-diagonal +* starting at position 2 in row ku, the first sub-diagonal +* starting at position 1 in row ( ku + 2 ), and so on. +* Elements in the array A that do not correspond to elements +* in the band matrix (such as the top left ku by ku triangle) +* are not referenced. +* The following program segment will transfer a band matrix +* from conventional full matrix storage to band storage: +* +* DO 20, J = 1, N +* K = KU + 1 - J +* DO 10, I = MAX( 1, J - KU ), MIN( M, J + KL ) +* A( K + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( kl + ku + 1 ). +* Unchanged on exit. +* +* X - COMPLEX*16 array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - COMPLEX*16 . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - COMPLEX*16 array of DIMENSION at least +* ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. +* Before entry, the incremented array Y must contain the +* vector y. On exit, Y is overwritten by the updated vector y. +* +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP + INTEGER I, INFO, IX, IY, J, JX, JY, K, KUP1, KX, KY, + $ LENX, LENY + LOGICAL NOCONJ, NOTRANS, XCONJ +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'R' ).AND. + $ .NOT.LSAME( TRANS, 'C' ).AND. + $ .NOT.LSAME( TRANS, 'O' ).AND. + $ .NOT.LSAME( TRANS, 'U' ).AND. + $ .NOT.LSAME( TRANS, 'S' ).AND. + $ .NOT.LSAME( TRANS, 'D' ) )THEN + INFO = 1 + ELSE IF( M.LT.0 )THEN + INFO = 2 + ELSE IF( N.LT.0 )THEN + INFO = 3 + ELSE IF( KL.LT.0 )THEN + INFO = 4 + ELSE IF( KU.LT.0 )THEN + INFO = 5 + ELSE IF( LDA.LT.( KL + KU + 1 ) )THEN + INFO = 8 + ELSE IF( INCX.EQ.0 )THEN + INFO = 10 + ELSE IF( INCY.EQ.0 )THEN + INFO = 13 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZGBMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* + NOCONJ = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + $ .OR. LSAME( TRANS, 'O' ) .OR. LSAME( TRANS, 'U' )) + + NOTRANS = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'R' ) + $ .OR. LSAME( TRANS, 'O' ) .OR. LSAME( TRANS, 'S' )) + + XCONJ = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + $ .OR. LSAME( TRANS, 'R' ) .OR. LSAME( TRANS, 'C' )) +* +* Set LENX and LENY, the lengths of the vectors x and y, and set +* up the start points in X and Y. +* + IF(NOTRANS)THEN + LENX = N + LENY = M + ELSE + LENX = M + LENY = N + END IF + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( LENX - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( LENY - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the band part of A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, LENY + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, LENY + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, LENY + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, LENY + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + + KUP1 = KU + 1 + + IF(XCONJ)THEN + + IF(NOTRANS)THEN +* +* Form y := alpha*A*x + y. +* + JX = KX + IF( INCY.EQ.1 )THEN + DO 60, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + K = KUP1 - J + IF( NOCONJ )THEN + DO 50, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( I ) = Y( I ) + TEMP*A( K + I, J ) + 50 CONTINUE + ELSE + DO 55, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( I ) = Y( I ) + TEMP*DCONJG(A( K + I, J )) + 55 CONTINUE + END IF + + END IF + JX = JX + INCX + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + IY = KY + K = KUP1 - J + IF( NOCONJ )THEN + DO 70, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( IY ) = Y( IY ) + TEMP*A( K + I, J ) + IY = IY + INCY + 70 CONTINUE + ELSE + DO 75, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( IY ) = Y( IY ) + TEMP*DCONJG(A( K + I, J )) + IY = IY + INCY + 75 CONTINUE + END IF + + END IF + JX = JX + INCX + IF( J.GT.KU ) + $ KY = KY + INCY + 80 CONTINUE + END IF + ELSE +* +* Form y := alpha*A'*x + y or y := alpha*conjg( A' )*x + y. +* + JY = KY + IF( INCX.EQ.1 )THEN + DO 110, J = 1, N + TEMP = ZERO + K = KUP1 - J + IF( NOCONJ )THEN + DO 90, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + A( K + I, J )*X( I ) + 90 CONTINUE + ELSE + DO 100, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + DCONJG( A( K + I, J ) )*X( I ) + 100 CONTINUE + END IF + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + 110 CONTINUE + ELSE + DO 140, J = 1, N + TEMP = ZERO + IX = KX + K = KUP1 - J + IF( NOCONJ )THEN + DO 120, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + A( K + I, J )*X( IX ) + IX = IX + INCX + 120 CONTINUE + ELSE + DO 130, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + DCONJG( A( K + I, J ) )*X( IX ) + IX = IX + INCX + 130 CONTINUE + END IF + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + IF( J.GT.KU ) + $ KX = KX + INCX + 140 CONTINUE + END IF + END IF + + ELSE + + IF(NOTRANS)THEN +* +* Form y := alpha*A*x + y. +* + JX = KX + IF( INCY.EQ.1 )THEN + DO 160, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*DCONJG(X( JX )) + K = KUP1 - J + IF( NOCONJ )THEN + DO 150, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( I ) = Y( I ) + TEMP*A( K + I, J ) + 150 CONTINUE + ELSE + DO 155, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( I ) = Y( I ) + TEMP*DCONJG(A( K + I, J )) + 155 CONTINUE + END IF + + END IF + JX = JX + INCX + 160 CONTINUE + ELSE + DO 180, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*DCONJG(X( JX )) + IY = KY + K = KUP1 - J + IF( NOCONJ )THEN + DO 170, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( IY ) = Y( IY ) + TEMP*A( K + I, J ) + IY = IY + INCY + 170 CONTINUE + ELSE + DO 175, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( IY ) = Y( IY ) + TEMP*DCONJG(A( K + I, J )) + IY = IY + INCY + 175 CONTINUE + END IF + + END IF + JX = JX + INCX + IF( J.GT.KU ) + $ KY = KY + INCY + 180 CONTINUE + END IF + ELSE +* +* Form y := alpha*A'*x + y or y := alpha*conjg( A' )*x + y. +* + JY = KY + IF( INCX.EQ.1 )THEN + DO 210, J = 1, N + TEMP = ZERO + K = KUP1 - J + IF( NOCONJ )THEN + DO 190, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + A( K + I, J )*DCONJG(X( I )) + 190 CONTINUE + ELSE + DO 200, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + DCONJG( A( K + I, J ) )*DCONJG(X( I )) + 200 CONTINUE + END IF + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + 210 CONTINUE + ELSE + DO 240, J = 1, N + TEMP = ZERO + IX = KX + K = KUP1 - J + IF( NOCONJ )THEN + DO 220, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + A( K + I, J )*DCONJG(X( IX )) + IX = IX + INCX + 220 CONTINUE + ELSE + DO 230, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + DCONJG( A( K + I, J ) )*DCONJG(X(IX )) + IX = IX + INCX + 230 CONTINUE + END IF + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + IF( J.GT.KU ) + $ KX = KX + INCX + 240 CONTINUE + END IF + END IF + + END IF + +* + RETURN +* +* End of ZGBMV . +* + END diff --git a/reference/zgemm3mf.f b/reference/zgemm3mf.f new file mode 100644 index 0000000..3bfc88b --- /dev/null +++ b/reference/zgemm3mf.f @@ -0,0 +1,414 @@ + SUBROUTINE ZGEMM3MF(TRA,TRB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) +* .. Scalar Arguments .. + DOUBLE COMPLEX ALPHA,BETA + INTEGER K,LDA,LDB,LDC,M,N + CHARACTER TRA,TRB +* .. +* .. Array Arguments .. + DOUBLE COMPLEX A(LDA,*),B(LDB,*),C(LDC,*) +* .. +* +* Purpose +* ======= +* +* ZGEMM performs one of the matrix-matrix operations +* +* C := alpha*op( A )*op( B ) + beta*C, +* +* where op( X ) is one of +* +* op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ), +* +* alpha and beta are scalars, and A, B and C are matrices, with op( A ) +* an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. +* +* Arguments +* ========== +* +* TRA - CHARACTER*1. +* On entry, TRA specifies the form of op( A ) to be used in +* the matrix multiplication as follows: +* +* TRA = 'N' or 'n', op( A ) = A. +* +* TRA = 'T' or 't', op( A ) = A'. +* +* TRA = 'C' or 'c', op( A ) = conjg( A' ). +* +* Unchanged on exit. +* +* TRB - CHARACTER*1. +* On entry, TRB specifies the form of op( B ) to be used in +* the matrix multiplication as follows: +* +* TRB = 'N' or 'n', op( B ) = B. +* +* TRB = 'T' or 't', op( B ) = B'. +* +* TRB = 'C' or 'c', op( B ) = conjg( B' ). +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix +* op( A ) and of the matrix C. M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix +* op( B ) and the number of columns of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry, K specifies the number of columns of the matrix +* op( A ) and the number of rows of the matrix op( B ). K must +* be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is +* k when TRA = 'N' or 'n', and is m otherwise. +* Before entry with TRA = 'N' or 'n', the leading m by k +* part of the array A must contain the matrix A, otherwise +* the leading k by m part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRA = 'N' or 'n' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, k ). +* Unchanged on exit. +* +* B - COMPLEX*16 array of DIMENSION ( LDB, kb ), where kb is +* n when TRB = 'N' or 'n', and is k otherwise. +* Before entry with TRB = 'N' or 'n', the leading k by n +* part of the array B must contain the matrix B, otherwise +* the leading n by k part of the array B must contain the +* matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. When TRB = 'N' or 'n' then +* LDB must be at least max( 1, k ), otherwise LDB must be at +* least max( 1, n ). +* Unchanged on exit. +* +* BETA - COMPLEX*16 . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - COMPLEX*16 array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n matrix +* ( alpha*op( A )*op( B ) + beta*C ). +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DCONJG,MAX +* .. +* .. Local Scalars .. + DOUBLE COMPLEX TEMP + INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB + LOGICAL CONJA,CONJB,NOTA,NOTB +* .. +* .. Parameters .. + DOUBLE COMPLEX ONE + PARAMETER (ONE= (1.0D+0,0.0D+0)) + DOUBLE COMPLEX ZERO + PARAMETER (ZERO= (0.0D+0,0.0D+0)) +* .. +* +* Set NOTA and NOTB as true if A and B respectively are not +* conjugated or transposed, set CONJA and CONJB as true if A and +* B respectively are to be transposed but not conjugated and set +* NROWA, NCOLA and NROWB as the number of rows and columns of A +* and the number of rows of B respectively. +* + NOTA = LSAME(TRA,'N') + NOTB = LSAME(TRB,'N') + CONJA = LSAME(TRA,'C') + CONJB = LSAME(TRB,'C') + IF (NOTA) THEN + NROWA = M + NCOLA = K + ELSE + NROWA = K + NCOLA = M + END IF + IF (NOTB) THEN + NROWB = K + ELSE + NROWB = N + END IF +* +* Test the input parameters. +* + INFO = 0 + IF ((.NOT.NOTA) .AND. (.NOT.CONJA) .AND. + + (.NOT.LSAME(TRA,'T'))) THEN + INFO = 1 + ELSE IF ((.NOT.NOTB) .AND. (.NOT.CONJB) .AND. + + (.NOT.LSAME(TRB,'T'))) THEN + INFO = 2 + ELSE IF (M.LT.0) THEN + INFO = 3 + ELSE IF (N.LT.0) THEN + INFO = 4 + ELSE IF (K.LT.0) THEN + INFO = 5 + ELSE IF (LDA.LT.MAX(1,NROWA)) THEN + INFO = 8 + ELSE IF (LDB.LT.MAX(1,NROWB)) THEN + INFO = 10 + ELSE IF (LDC.LT.MAX(1,M)) THEN + INFO = 13 + END IF + IF (INFO.NE.0) THEN + CALL XERBLA('ZGEMM ',INFO) + RETURN + END IF +* +* Quick return if possible. +* + IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + + (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN +* +* And when alpha.eq.zero. +* + IF (ALPHA.EQ.ZERO) THEN + IF (BETA.EQ.ZERO) THEN + DO 20 J = 1,N + DO 10 I = 1,M + C(I,J) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40 J = 1,N + DO 30 I = 1,M + C(I,J) = BETA*C(I,J) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF (NOTB) THEN + IF (NOTA) THEN +* +* Form C := alpha*A*B + beta*C. +* + DO 90 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 50 I = 1,M + C(I,J) = ZERO + 50 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 60 I = 1,M + C(I,J) = BETA*C(I,J) + 60 CONTINUE + END IF + DO 80 L = 1,K + IF (B(L,J).NE.ZERO) THEN + TEMP = ALPHA*B(L,J) + DO 70 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 70 CONTINUE + END IF + 80 CONTINUE + 90 CONTINUE + ELSE IF (CONJA) THEN +* +* Form C := alpha*conjg( A' )*B + beta*C. +* + DO 120 J = 1,N + DO 110 I = 1,M + TEMP = ZERO + DO 100 L = 1,K + TEMP = TEMP + DCONJG(A(L,I))*B(L,J) + 100 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 110 CONTINUE + 120 CONTINUE + ELSE +* +* Form C := alpha*A'*B + beta*C +* + DO 150 J = 1,N + DO 140 I = 1,M + TEMP = ZERO + DO 130 L = 1,K + TEMP = TEMP + A(L,I)*B(L,J) + 130 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 140 CONTINUE + 150 CONTINUE + END IF + ELSE IF (NOTA) THEN + IF (CONJB) THEN +* +* Form C := alpha*A*conjg( B' ) + beta*C. +* + DO 200 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 160 I = 1,M + C(I,J) = ZERO + 160 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 170 I = 1,M + C(I,J) = BETA*C(I,J) + 170 CONTINUE + END IF + DO 190 L = 1,K + IF (B(J,L).NE.ZERO) THEN + TEMP = ALPHA*DCONJG(B(J,L)) + DO 180 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 180 CONTINUE + END IF + 190 CONTINUE + 200 CONTINUE + ELSE +* +* Form C := alpha*A*B' + beta*C +* + DO 250 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 210 I = 1,M + C(I,J) = ZERO + 210 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 220 I = 1,M + C(I,J) = BETA*C(I,J) + 220 CONTINUE + END IF + DO 240 L = 1,K + IF (B(J,L).NE.ZERO) THEN + TEMP = ALPHA*B(J,L) + DO 230 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 230 CONTINUE + END IF + 240 CONTINUE + 250 CONTINUE + END IF + ELSE IF (CONJA) THEN + IF (CONJB) THEN +* +* Form C := alpha*conjg( A' )*conjg( B' ) + beta*C. +* + DO 280 J = 1,N + DO 270 I = 1,M + TEMP = ZERO + DO 260 L = 1,K + TEMP = TEMP + DCONJG(A(L,I))*DCONJG(B(J,L)) + 260 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 270 CONTINUE + 280 CONTINUE + ELSE +* +* Form C := alpha*conjg( A' )*B' + beta*C +* + DO 310 J = 1,N + DO 300 I = 1,M + TEMP = ZERO + DO 290 L = 1,K + TEMP = TEMP + DCONJG(A(L,I))*B(J,L) + 290 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 300 CONTINUE + 310 CONTINUE + END IF + ELSE + IF (CONJB) THEN +* +* Form C := alpha*A'*conjg( B' ) + beta*C +* + DO 340 J = 1,N + DO 330 I = 1,M + TEMP = ZERO + DO 320 L = 1,K + TEMP = TEMP + A(L,I)*DCONJG(B(J,L)) + 320 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 330 CONTINUE + 340 CONTINUE + ELSE +* +* Form C := alpha*A'*B' + beta*C +* + DO 370 J = 1,N + DO 360 I = 1,M + TEMP = ZERO + DO 350 L = 1,K + TEMP = TEMP + A(L,I)*B(J,L) + 350 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 360 CONTINUE + 370 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZGEMM . +* + END diff --git a/reference/zgemmf.f b/reference/zgemmf.f new file mode 100644 index 0000000..65cd317 --- /dev/null +++ b/reference/zgemmf.f @@ -0,0 +1,414 @@ + SUBROUTINE ZGEMMF(TRANA,TRANB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) +* .. Scalar Arguments .. + DOUBLE COMPLEX ALPHA,BETA + INTEGER K,LDA,LDB,LDC,M,N + CHARACTER TRANA,TRANB +* .. +* .. Array Arguments .. + DOUBLE COMPLEX A(LDA,*),B(LDB,*),C(LDC,*) +* .. +* +* Purpose +* ======= +* +* ZGEMM performs one of the matrix-matrix operations +* +* C := alpha*op( A )*op( B ) + beta*C, +* +* where op( X ) is one of +* +* op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ), +* +* alpha and beta are scalars, and A, B and C are matrices, with op( A ) +* an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. +* +* Arguments +* ========== +* +* TRANA - CHARACTER*1. +* On entry, TRANA specifies the form of op( A ) to be used in +* the matrix multiplication as follows: +* +* TRANA = 'N' or 'n', op( A ) = A. +* +* TRANA = 'T' or 't', op( A ) = A'. +* +* TRANA = 'C' or 'c', op( A ) = conjg( A' ). +* +* Unchanged on exit. +* +* TRANB - CHARACTER*1. +* On entry, TRANB specifies the form of op( B ) to be used in +* the matrix multiplication as follows: +* +* TRANB = 'N' or 'n', op( B ) = B. +* +* TRANB = 'T' or 't', op( B ) = B'. +* +* TRANB = 'C' or 'c', op( B ) = conjg( B' ). +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix +* op( A ) and of the matrix C. M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix +* op( B ) and the number of columns of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry, K specifies the number of columns of the matrix +* op( A ) and the number of rows of the matrix op( B ). K must +* be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is +* k when TRANA = 'N' or 'n', and is m otherwise. +* Before entry with TRANA = 'N' or 'n', the leading m by k +* part of the array A must contain the matrix A, otherwise +* the leading k by m part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANA = 'N' or 'n' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, k ). +* Unchanged on exit. +* +* B - COMPLEX*16 array of DIMENSION ( LDB, kb ), where kb is +* n when TRANB = 'N' or 'n', and is k otherwise. +* Before entry with TRANB = 'N' or 'n', the leading k by n +* part of the array B must contain the matrix B, otherwise +* the leading n by k part of the array B must contain the +* matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. When TRANB = 'N' or 'n' then +* LDB must be at least max( 1, k ), otherwise LDB must be at +* least max( 1, n ). +* Unchanged on exit. +* +* BETA - COMPLEX*16 . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - COMPLEX*16 array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n matrix +* ( alpha*op( A )*op( B ) + beta*C ). +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DCONJG,MAX +* .. +* .. Local Scalars .. + DOUBLE COMPLEX TEMP + INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB + LOGICAL CONJA,CONJB,NOTA,NOTB +* .. +* .. Parameters .. + DOUBLE COMPLEX ONE + PARAMETER (ONE= (1.0D+0,0.0D+0)) + DOUBLE COMPLEX ZERO + PARAMETER (ZERO= (0.0D+0,0.0D+0)) +* .. +* +* Set NOTA and NOTB as true if A and B respectively are not +* conjugated or transposed, set CONJA and CONJB as true if A and +* B respectively are to be transposed but not conjugated and set +* NROWA, NCOLA and NROWB as the number of rows and columns of A +* and the number of rows of B respectively. +* + NOTA = LSAME(TRANA,'N') + NOTB = LSAME(TRANB,'N') + CONJA = LSAME(TRANA,'C') + CONJB = LSAME(TRANB,'C') + IF (NOTA) THEN + NROWA = M + NCOLA = K + ELSE + NROWA = K + NCOLA = M + END IF + IF (NOTB) THEN + NROWB = K + ELSE + NROWB = N + END IF +* +* Test the input parameters. +* + INFO = 0 + IF ((.NOT.NOTA) .AND. (.NOT.CONJA) .AND. + + (.NOT.LSAME(TRANA,'T'))) THEN + INFO = 1 + ELSE IF ((.NOT.NOTB) .AND. (.NOT.CONJB) .AND. + + (.NOT.LSAME(TRANB,'T'))) THEN + INFO = 2 + ELSE IF (M.LT.0) THEN + INFO = 3 + ELSE IF (N.LT.0) THEN + INFO = 4 + ELSE IF (K.LT.0) THEN + INFO = 5 + ELSE IF (LDA.LT.MAX(1,NROWA)) THEN + INFO = 8 + ELSE IF (LDB.LT.MAX(1,NROWB)) THEN + INFO = 10 + ELSE IF (LDC.LT.MAX(1,M)) THEN + INFO = 13 + END IF + IF (INFO.NE.0) THEN + CALL XERBLA('ZGEMM ',INFO) + RETURN + END IF +* +* Quick return if possible. +* + IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + + (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN +* +* And when alpha.eq.zero. +* + IF (ALPHA.EQ.ZERO) THEN + IF (BETA.EQ.ZERO) THEN + DO 20 J = 1,N + DO 10 I = 1,M + C(I,J) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40 J = 1,N + DO 30 I = 1,M + C(I,J) = BETA*C(I,J) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF (NOTB) THEN + IF (NOTA) THEN +* +* Form C := alpha*A*B + beta*C. +* + DO 90 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 50 I = 1,M + C(I,J) = ZERO + 50 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 60 I = 1,M + C(I,J) = BETA*C(I,J) + 60 CONTINUE + END IF + DO 80 L = 1,K + IF (B(L,J).NE.ZERO) THEN + TEMP = ALPHA*B(L,J) + DO 70 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 70 CONTINUE + END IF + 80 CONTINUE + 90 CONTINUE + ELSE IF (CONJA) THEN +* +* Form C := alpha*conjg( A' )*B + beta*C. +* + DO 120 J = 1,N + DO 110 I = 1,M + TEMP = ZERO + DO 100 L = 1,K + TEMP = TEMP + DCONJG(A(L,I))*B(L,J) + 100 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 110 CONTINUE + 120 CONTINUE + ELSE +* +* Form C := alpha*A'*B + beta*C +* + DO 150 J = 1,N + DO 140 I = 1,M + TEMP = ZERO + DO 130 L = 1,K + TEMP = TEMP + A(L,I)*B(L,J) + 130 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 140 CONTINUE + 150 CONTINUE + END IF + ELSE IF (NOTA) THEN + IF (CONJB) THEN +* +* Form C := alpha*A*conjg( B' ) + beta*C. +* + DO 200 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 160 I = 1,M + C(I,J) = ZERO + 160 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 170 I = 1,M + C(I,J) = BETA*C(I,J) + 170 CONTINUE + END IF + DO 190 L = 1,K + IF (B(J,L).NE.ZERO) THEN + TEMP = ALPHA*DCONJG(B(J,L)) + DO 180 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 180 CONTINUE + END IF + 190 CONTINUE + 200 CONTINUE + ELSE +* +* Form C := alpha*A*B' + beta*C +* + DO 250 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 210 I = 1,M + C(I,J) = ZERO + 210 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 220 I = 1,M + C(I,J) = BETA*C(I,J) + 220 CONTINUE + END IF + DO 240 L = 1,K + IF (B(J,L).NE.ZERO) THEN + TEMP = ALPHA*B(J,L) + DO 230 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 230 CONTINUE + END IF + 240 CONTINUE + 250 CONTINUE + END IF + ELSE IF (CONJA) THEN + IF (CONJB) THEN +* +* Form C := alpha*conjg( A' )*conjg( B' ) + beta*C. +* + DO 280 J = 1,N + DO 270 I = 1,M + TEMP = ZERO + DO 260 L = 1,K + TEMP = TEMP + DCONJG(A(L,I))*DCONJG(B(J,L)) + 260 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 270 CONTINUE + 280 CONTINUE + ELSE +* +* Form C := alpha*conjg( A' )*B' + beta*C +* + DO 310 J = 1,N + DO 300 I = 1,M + TEMP = ZERO + DO 290 L = 1,K + TEMP = TEMP + DCONJG(A(L,I))*B(J,L) + 290 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 300 CONTINUE + 310 CONTINUE + END IF + ELSE + IF (CONJB) THEN +* +* Form C := alpha*A'*conjg( B' ) + beta*C +* + DO 340 J = 1,N + DO 330 I = 1,M + TEMP = ZERO + DO 320 L = 1,K + TEMP = TEMP + A(L,I)*DCONJG(B(J,L)) + 320 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 330 CONTINUE + 340 CONTINUE + ELSE +* +* Form C := alpha*A'*B' + beta*C +* + DO 370 J = 1,N + DO 360 I = 1,M + TEMP = ZERO + DO 350 L = 1,K + TEMP = TEMP + A(L,I)*B(J,L) + 350 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 360 CONTINUE + 370 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZGEMM . +* + END diff --git a/reference/zgemvf.f b/reference/zgemvf.f new file mode 100644 index 0000000..10d2d74 --- /dev/null +++ b/reference/zgemvf.f @@ -0,0 +1,332 @@ + SUBROUTINE ZGEMVF ( TRANS, M, N, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + DOUBLE COMPLEX ALPHA, BETA + INTEGER INCX, INCY, LDA, M, N + CHARACTER*1 TRANS +* .. Array Arguments .. + DOUBLE COMPLEX A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* CGEMV performs one of the matrix-vector operations +* +* y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, or +* +* y := alpha*conjg( A' )*x + beta*y, +* +* where alpha and beta are scalars, x and y are vectors and A is an +* m by n matrix. +* +* Parameters +* ========== +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' y := alpha*A*x + beta*y. +* +* TRANS = 'T' or 't' y := alpha*A'*x + beta*y. +* +* TRANS = 'C' or 'c' y := alpha*conjg( A' )*x + beta*y. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, n ). +* Before entry, the leading m by n part of the array A must +* contain the matrix of coefficients. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, m ). +* Unchanged on exit. +* +* X - COMPLEX array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - COMPLEX . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - COMPLEX array of DIMENSION at least +* ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. +* Before entry with BETA non-zero, the incremented array Y +* must contain the vector y. On exit, Y is overwritten by the +* updated vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + DOUBLE COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + DOUBLE COMPLEX TEMP + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY, LENX, LENY + LOGICAL NOCONJ, NOTRANS, XCONJ +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'R' ).AND. + $ .NOT.LSAME( TRANS, 'C' ).AND. + $ .NOT.LSAME( TRANS, 'O' ).AND. + $ .NOT.LSAME( TRANS, 'U' ).AND. + $ .NOT.LSAME( TRANS, 'S' ).AND. + $ .NOT.LSAME( TRANS, 'D' ) )THEN + INFO = 1 + ELSE IF( M.LT.0 )THEN + INFO = 2 + ELSE IF( N.LT.0 )THEN + INFO = 3 + ELSE IF( LDA.LT.MAX( 1, M ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + ELSE IF( INCY.EQ.0 )THEN + INFO = 11 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CGEMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* + NOCONJ = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + $ .OR. LSAME( TRANS, 'O' ) .OR. LSAME( TRANS, 'U' )) + + NOTRANS = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'R' ) + $ .OR. LSAME( TRANS, 'O' ) .OR. LSAME( TRANS, 'S' )) + + XCONJ = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + $ .OR. LSAME( TRANS, 'R' ) .OR. LSAME( TRANS, 'C' )) +* +* Set LENX and LENY, the lengths of the vectors x and y, and set +* up the start points in X and Y. +* + IF(NOTRANS)THEN + LENX = N + LENY = M + ELSE + LENX = M + LENY = N + END IF + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( LENX - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( LENY - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, LENY + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, LENY + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, LENY + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, LENY + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + + IF(NOTRANS)THEN +* +* Form y := alpha*A*x + y. +* + JX = KX + IF( INCY.EQ.1 )THEN + DO 60, J = 1, N + IF( X( JX ).NE.ZERO )THEN + IF (XCONJ) THEN + TEMP = ALPHA*X( JX ) + ELSE + TEMP = ALPHA*DCONJG(X( JX )) + ENDIF + IF (NOCONJ) THEN + DO 50, I = 1, M + Y( I ) = Y( I ) + TEMP*A( I, J ) + 50 CONTINUE + ELSE + DO 55, I = 1, M + Y( I ) = Y( I ) + TEMP*DCONJG(A( I, J )) + 55 CONTINUE + ENDIF + END IF + JX = JX + INCX + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + IF (XCONJ) THEN + TEMP = ALPHA*X( JX ) + ELSE + TEMP = ALPHA*DCONJG(X( JX )) + ENDIF + IY = KY + IF (NOCONJ) THEN + DO 70, I = 1, M + Y( IY ) = Y( IY ) + TEMP*A( I, J ) + IY = IY + INCY + 70 CONTINUE + ELSE + DO 75, I = 1, M + Y( IY ) = Y( IY ) + TEMP* DCONJG(A( I, J )) + IY = IY + INCY + 75 CONTINUE + ENDIF + END IF + JX = JX + INCX + 80 CONTINUE + END IF + ELSE +* +* Form y := alpha*A'*x + y or y := alpha*conjg( A' )*x + y. +* + JY = KY + IF( INCX.EQ.1 )THEN + DO 110, J = 1, N + TEMP = ZERO + IF( NOCONJ )THEN + DO 90, I = 1, M + IF (XCONJ) THEN + TEMP = TEMP + A( I, J )*X( I ) + ELSE + TEMP = TEMP + A( I, J )*DCONJG(X( I )) + ENDIF + 90 CONTINUE + ELSE + DO 100, I = 1, M + IF (XCONJ) THEN + TEMP = TEMP + DCONJG( A( I, J ) )*X( I ) + ELSE + TEMP = TEMP + DCONJG( A( I, J ) )*DCONJG(X( I )) + ENDIF + 100 CONTINUE + END IF + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + 110 CONTINUE + ELSE + DO 140, J = 1, N + TEMP = ZERO + IX = KX + IF( NOCONJ )THEN + DO 120, I = 1, M + IF (XCONJ) THEN + TEMP = TEMP + A( I, J )*X( IX ) + ELSE + TEMP = TEMP + A( I, J )*DCONJG(X( IX )) + ENDIF + IX = IX + INCX + 120 CONTINUE + ELSE + DO 130, I = 1, M + IF (XCONJ) THEN + TEMP = TEMP + DCONJG( A( I, J ) )*X( IX ) + ELSE + TEMP = TEMP + DCONJG( A( I, J ) )*DCONJG(X( IX )) + ENDIF + IX = IX + INCX + 130 CONTINUE + END IF + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + 140 CONTINUE + END IF + END IF +* + RETURN +* +* End of CGEMV . +* + END + diff --git a/reference/zgercf.f b/reference/zgercf.f new file mode 100644 index 0000000..47f8a93 --- /dev/null +++ b/reference/zgercf.f @@ -0,0 +1,157 @@ + SUBROUTINE ZGERCF ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA + INTEGER INCX, INCY, LDA, M, N +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* ZGERC performs the rank 1 operation +* +* A := alpha*x*conjg( y' ) + A, +* +* where alpha is a scalar, x is an m element vector, y is an n element +* vector and A is an m by n matrix. +* +* Parameters +* ========== +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - COMPLEX*16 array of dimension at least +* ( 1 + ( m - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the m +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, n ). +* Before entry, the leading m by n part of the array A must +* contain the matrix of coefficients. On exit, A is +* overwritten by the updated matrix. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP + INTEGER I, INFO, IX, J, JY, KX +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( M.LT.0 )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + ELSE IF( LDA.LT.MAX( 1, M ) )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZGERC ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( INCY.GT.0 )THEN + JY = 1 + ELSE + JY = 1 - ( N - 1 )*INCY + END IF + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( Y( JY ).NE.ZERO )THEN + TEMP = ALPHA*DCONJG( Y( JY ) ) + DO 10, I = 1, M + A( I, J ) = A( I, J ) + X( I )*TEMP + 10 CONTINUE + END IF + JY = JY + INCY + 20 CONTINUE + ELSE + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( M - 1 )*INCX + END IF + DO 40, J = 1, N + IF( Y( JY ).NE.ZERO )THEN + TEMP = ALPHA*DCONJG( Y( JY ) ) + IX = KX + DO 30, I = 1, M + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + END IF + JY = JY + INCY + 40 CONTINUE + END IF +* + RETURN +* +* End of ZGERC . +* + END diff --git a/reference/zgeruf.f b/reference/zgeruf.f new file mode 100644 index 0000000..619f778 --- /dev/null +++ b/reference/zgeruf.f @@ -0,0 +1,157 @@ + SUBROUTINE ZGERUF ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA + INTEGER INCX, INCY, LDA, M, N +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* ZGERU performs the rank 1 operation +* +* A := alpha*x*y' + A, +* +* where alpha is a scalar, x is an m element vector, y is an n element +* vector and A is an m by n matrix. +* +* Parameters +* ========== +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - COMPLEX*16 array of dimension at least +* ( 1 + ( m - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the m +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, n ). +* Before entry, the leading m by n part of the array A must +* contain the matrix of coefficients. On exit, A is +* overwritten by the updated matrix. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP + INTEGER I, INFO, IX, J, JY, KX +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( M.LT.0 )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + ELSE IF( LDA.LT.MAX( 1, M ) )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZGERU ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( INCY.GT.0 )THEN + JY = 1 + ELSE + JY = 1 - ( N - 1 )*INCY + END IF + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( Y( JY ).NE.ZERO )THEN + TEMP = ALPHA*Y( JY ) + DO 10, I = 1, M + A( I, J ) = A( I, J ) + X( I )*TEMP + 10 CONTINUE + END IF + JY = JY + INCY + 20 CONTINUE + ELSE + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( M - 1 )*INCX + END IF + DO 40, J = 1, N + IF( Y( JY ).NE.ZERO )THEN + TEMP = ALPHA*Y( JY ) + IX = KX + DO 30, I = 1, M + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + END IF + JY = JY + INCY + 40 CONTINUE + END IF +* + RETURN +* +* End of ZGERU . +* + END diff --git a/reference/zgesvf.f b/reference/zgesvf.f new file mode 100644 index 0000000..d341dd7 --- /dev/null +++ b/reference/zgesvf.f @@ -0,0 +1,107 @@ + SUBROUTINE ZGESVF( N, NRHS, A, LDA, IPIV, B, LDB, INFO ) +* +* -- LAPACK driver routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDB, N, NRHS +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX*16 A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* ZGESV computes the solution to a complex system of linear equations +* A * X = B, +* where A is an N-by-N matrix and X and B are N-by-NRHS matrices. +* +* The LU decomposition with partial pivoting and row interchanges is +* used to factor A as +* A = P * L * U, +* where P is a permutation matrix, L is unit lower triangular, and U is +* upper triangular. The factored form of A is then used to solve the +* system of equations A * X = B. +* +* Arguments +* ========= +* +* N (input) INTEGER +* The number of linear equations, i.e., the order of the +* matrix A. N >= 0. +* +* NRHS (input) INTEGER +* The number of right hand sides, i.e., the number of columns +* of the matrix B. NRHS >= 0. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the N-by-N coefficient matrix A. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* IPIV (output) INTEGER array, dimension (N) +* The pivot indices that define the permutation matrix P; +* row i of the matrix was interchanged with row IPIV(i). +* +* B (input/output) COMPLEX*16 array, dimension (LDB,NRHS) +* On entry, the N-by-NRHS matrix of right hand side matrix B. +* On exit, if INFO = 0, the N-by-NRHS solution matrix X. +* +* LDB (input) INTEGER +* The leading dimension of the array B. LDB >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, so the solution could not be computed. +* +* ===================================================================== +* +* .. External Subroutines .. + EXTERNAL XERBLA, ZGETRF, ZGETRS +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( N.LT.0 ) THEN + INFO = -1 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -7 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGESV ', -INFO ) + RETURN + END IF +* +* Compute the LU factorization of A. +* + CALL ZGETRF( N, N, A, LDA, IPIV, INFO ) + IF( INFO.EQ.0 ) THEN +* +* Solve the system A*X = B, overwriting B with X. +* + CALL ZGETRS( 'No transpose', N, NRHS, A, LDA, IPIV, B, LDB, + $ INFO ) + END IF + RETURN +* +* End of ZGESV +* + END diff --git a/reference/zgetf2f.f b/reference/zgetf2f.f new file mode 100644 index 0000000..6b8bc39 --- /dev/null +++ b/reference/zgetf2f.f @@ -0,0 +1,136 @@ + SUBROUTINE ZGETF2F( M, N, A, LDA, IPIV, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* September 30, 1994 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX*16 A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* ZGETF2 computes an LU factorization of a general m-by-n matrix A +* using partial pivoting with row interchanges. +* +* The factorization has the form +* A = P * L * U +* where P is a permutation matrix, L is lower triangular with unit +* diagonal elements (lower trapezoidal if m > n), and U is upper +* triangular (upper trapezoidal if m < n). +* +* This is the right-looking Level 2 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the m by n matrix to be factored. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* IPIV (output) INTEGER array, dimension (min(M,N)) +* The pivot indices; for 1 <= i <= min(M,N), row i of the +* matrix was interchanged with row IPIV(i). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* > 0: if INFO = k, U(k,k) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, and division by zero will occur if it is used +* to solve a system of equations. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ONE, ZERO + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), + $ ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + INTEGER J, JP +* .. +* .. External Functions .. + INTEGER IZAMAX + EXTERNAL IZAMAX +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZGERU, ZSCAL, ZSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGETF2', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* + DO 10 J = 1, MIN( M, N ) +* +* Find pivot and test for singularity. +* + JP = J - 1 + IZAMAX( M-J+1, A( J, J ), 1 ) + IPIV( J ) = JP + IF( A( JP, J ).NE.ZERO ) THEN +* +* Apply the interchange to columns 1:N. +* + IF( JP.NE.J ) + $ CALL ZSWAP( N, A( J, 1 ), LDA, A( JP, 1 ), LDA ) +* +* Compute elements J+1:M of J-th column. +* + IF( J.LT.M ) + $ CALL ZSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 ) +* + ELSE IF( INFO.EQ.0 ) THEN +* + INFO = J + END IF +* + IF( J.LT.MIN( M, N ) ) THEN +* +* Update trailing submatrix. +* + CALL ZGERU( M-J, N-J, -ONE, A( J+1, J ), 1, A( J, J+1 ), + $ LDA, A( J+1, J+1 ), LDA ) + END IF + 10 CONTINUE + RETURN +* +* End of ZGETF2 +* + END diff --git a/reference/zgetrff.f b/reference/zgetrff.f new file mode 100644 index 0000000..bfb438d --- /dev/null +++ b/reference/zgetrff.f @@ -0,0 +1,156 @@ + SUBROUTINE ZGETRFF( M, N, A, LDA, IPIV, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* September 30, 1994 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX*16 A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* ZGETRF computes an LU factorization of a general M-by-N matrix A +* using partial pivoting with row interchanges. +* +* The factorization has the form +* A = P * L * U +* where P is a permutation matrix, L is lower triangular with unit +* diagonal elements (lower trapezoidal if m > n), and U is upper +* triangular (upper trapezoidal if m < n). +* +* This is the right-looking Level 3 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the M-by-N matrix to be factored. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* IPIV (output) INTEGER array, dimension (min(M,N)) +* The pivot indices; for 1 <= i <= min(M,N), row i of the +* matrix was interchanged with row IPIV(i). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, and division by zero will occur if it is used +* to solve a system of equations. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, IINFO, J, JB, NB +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZGEMM, ZGETF2, ZLASWP, ZTRSM +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGETRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = 64 + IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN +* +* Use unblocked code. +* + CALL ZGETF2( M, N, A, LDA, IPIV, INFO ) + ELSE +* +* Use blocked code. +* + DO 20 J = 1, MIN( M, N ), NB + JB = MIN( MIN( M, N )-J+1, NB ) +* +* Factor diagonal and subdiagonal blocks and test for exact +* singularity. +* + CALL ZGETF2( M-J+1, JB, A( J, J ), LDA, IPIV( J ), IINFO ) +* +* Adjust INFO and the pivot indices. +* + IF( INFO.EQ.0 .AND. IINFO.GT.0 ) + $ INFO = IINFO + J - 1 + DO 10 I = J, MIN( M, J+JB-1 ) + IPIV( I ) = J - 1 + IPIV( I ) + 10 CONTINUE +* +* Apply interchanges to columns 1:J-1. +* + CALL ZLASWP( J-1, A, LDA, J, J+JB-1, IPIV, 1 ) +* + IF( J+JB.LE.N ) THEN +* +* Apply interchanges to columns J+JB:N. +* + CALL ZLASWP( N-J-JB+1, A( 1, J+JB ), LDA, J, J+JB-1, + $ IPIV, 1 ) +* +* Compute block row of U. +* + CALL ZTRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB, + $ N-J-JB+1, ONE, A( J, J ), LDA, A( J, J+JB ), + $ LDA ) + IF( J+JB.LE.M ) THEN +* +* Update trailing submatrix. +* + CALL ZGEMM( 'No transpose', 'No transpose', M-J-JB+1, + $ N-J-JB+1, JB, -ONE, A( J+JB, J ), LDA, + $ A( J, J+JB ), LDA, ONE, A( J+JB, J+JB ), + $ LDA ) + END IF + END IF + 20 CONTINUE + END IF + RETURN +* +* End of ZGETRF +* + END diff --git a/reference/zgetrsf.f b/reference/zgetrsf.f new file mode 100644 index 0000000..823798b --- /dev/null +++ b/reference/zgetrsf.f @@ -0,0 +1,150 @@ + SUBROUTINE ZGETRSF( TRANS, N, NRHS, A, LDA, IPIV, B, LDB, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* September 30, 1994 +* +* .. Scalar Arguments .. + CHARACTER TRANS + INTEGER INFO, LDA, LDB, N, NRHS +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX*16 A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* ZGETRS solves a system of linear equations +* A * X = B, A**T * X = B, or A**H * X = B +* with a general N-by-N matrix A using the LU factorization computed +* by ZGETRF. +* +* Arguments +* ========= +* +* TRANS (input) CHARACTER*1 +* Specifies the form of the system of equations: +* = 'N': A * X = B (No transpose) +* = 'T': A**T * X = B (Transpose) +* = 'C': A**H * X = B (Conjugate transpose) +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* NRHS (input) INTEGER +* The number of right hand sides, i.e., the number of columns +* of the matrix B. NRHS >= 0. +* +* A (input) COMPLEX*16 array, dimension (LDA,N) +* The factors L and U from the factorization A = P*L*U +* as computed by ZGETRF. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* IPIV (input) INTEGER array, dimension (N) +* The pivot indices from ZGETRF; for 1<=i<=N, row i of the +* matrix was interchanged with row IPIV(i). +* +* B (input/output) COMPLEX*16 array, dimension (LDB,NRHS) +* On entry, the right hand side matrix B. +* On exit, the solution matrix X. +* +* LDB (input) INTEGER +* The leading dimension of the array B. LDB >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL NOTRAN +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZLASWP, ZTRSM +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + NOTRAN = LSAME( TRANS, 'N' ) .OR. LSAME(TRANS, 'R') + IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -8 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGETRS', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 .OR. NRHS.EQ.0 ) + $ RETURN +* + IF( NOTRAN ) THEN +* +* Solve A * X = B. +* +* Apply row interchanges to the right hand sides. +* + CALL ZLASWP( NRHS, B, LDB, 1, N, IPIV, 1 ) +* +* Solve L*X = B, overwriting B with X. +* + CALL ZTRSM( 'Left', 'Lower', TRANS, 'Unit', N, NRHS, + $ ONE, A, LDA, B, LDB ) +* +* Solve U*X = B, overwriting B with X. +* + CALL ZTRSM( 'Left', 'Upper', TRANS, 'Non-unit', N, + $ NRHS, ONE, A, LDA, B, LDB ) + ELSE +* +* Solve A**T * X = B or A**H * X = B. +* +* Solve U'*X = B, overwriting B with X. +* + CALL ZTRSM( 'Left', 'Upper', TRANS, 'Non-unit', N, NRHS, ONE, + $ A, LDA, B, LDB ) +* +* Solve L'*X = B, overwriting B with X. +* + CALL ZTRSM( 'Left', 'Lower', TRANS, 'Unit', N, NRHS, ONE, A, + $ LDA, B, LDB ) +* +* Apply row interchanges to the solution vectors. +* + CALL ZLASWP( NRHS, B, LDB, 1, N, IPIV, -1 ) + END IF +* + RETURN +* +* End of ZGETRS +* + END diff --git a/reference/zhbmvf.f b/reference/zhbmvf.f new file mode 100644 index 0000000..875c3e0 --- /dev/null +++ b/reference/zhbmvf.f @@ -0,0 +1,406 @@ + SUBROUTINE ZHBMVF( UPLO, N, K, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA, BETA + INTEGER INCX, INCY, K, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* ZHBMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n hermitian band matrix, with k super-diagonals. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the band matrix A is being supplied as +* follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* being supplied. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* being supplied. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry, K specifies the number of super-diagonals of the +* matrix A. K must satisfy 0 .le. K. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) +* by n part of the array A must contain the upper triangular +* band part of the hermitian matrix, supplied column by +* column, with the leading diagonal of the matrix in row +* ( k + 1 ) of the array, the first super-diagonal starting at +* position 2 in row k, and so on. The top left k by k triangle +* of the array A is not referenced. +* The following program segment will transfer the upper +* triangular part of a hermitian band matrix from conventional +* full matrix storage to band storage: +* +* DO 20, J = 1, N +* M = K + 1 - J +* DO 10, I = MAX( 1, J - K ), J +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) +* by n part of the array A must contain the lower triangular +* band part of the hermitian matrix, supplied column by +* column, with the leading diagonal of the matrix in row 1 of +* the array, the first sub-diagonal starting at position 1 in +* row 2, and so on. The bottom right k by k triangle of the +* array A is not referenced. +* The following program segment will transfer the lower +* triangular part of a hermitian band matrix from conventional +* full matrix storage to band storage: +* +* DO 20, J = 1, N +* M = 1 - J +* DO 10, I = J, MIN( N, J + K ) +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Note that the imaginary parts of the diagonal elements need +* not be set and are assumed to be zero. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( k + 1 ). +* Unchanged on exit. +* +* X - COMPLEX*16 array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - COMPLEX*16 . +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* Y - COMPLEX*16 array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the +* vector y. On exit, Y is overwritten by the updated vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, KPLUS1, KX, KY, L +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MAX, MIN, DBLE +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ).AND. + $ .NOT.LSAME( UPLO, 'V' ).AND. + $ .NOT.LSAME( UPLO, 'M' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( K.LT.0 )THEN + INFO = 3 + ELSE IF( LDA.LT.( K + 1 ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + ELSE IF( INCY.EQ.0 )THEN + INFO = 11 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZHBMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of the array A +* are accessed sequentially with one pass through A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + + +* +* Form y when upper triangle of A is stored. +* + IF( LSAME( UPLO, 'U' ) )THEN + KPLUS1 = K + 1 + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + L = KPLUS1 - J + DO 50, I = MAX( 1, J - K ), J - 1 + Y( I ) = Y( I ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + DCONJG( A( L + I, J ) )*X( I ) + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*DBLE( A( KPLUS1, J ) ) + $ + ALPHA*TEMP2 + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + L = KPLUS1 - J + DO 70, I = MAX( 1, J - K ), J - 1 + Y( IY ) = Y( IY ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + DCONJG( A( L + I, J ) )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*DBLE( A( KPLUS1, J ) ) + $ + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + IF( J.GT.K )THEN + KX = KX + INCX + KY = KY + INCY + END IF + 80 CONTINUE + END IF + RETURN + ENDIF + +* +* Form y when lower triangle of A is stored. +* + IF( LSAME( UPLO, 'L' ) )THEN + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 100, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*DBLE( A( 1, J ) ) + L = 1 - J + DO 90, I = J + 1, MIN( N, J + K ) + Y( I ) = Y( I ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + DCONJG( A( L + I, J ) )*X( I ) + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*DBLE( A( 1, J ) ) + L = 1 - J + IX = JX + IY = JY + DO 110, I = J + 1, MIN( N, J + K ) + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + DCONJG( A( L + I, J ) )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 120 CONTINUE + END IF + RETURN + END IF + + +* +* Form y when upper triangle of A is stored. +* + IF( LSAME( UPLO, 'V' ) )THEN + KPLUS1 = K + 1 + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 160, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + L = KPLUS1 - J + DO 150, I = MAX( 1, J - K ), J - 1 + Y( I ) = Y( I ) + TEMP1*DCONJG(A( L + I, J )) + TEMP2 = TEMP2 + A( L + I, J )*X( I ) + 150 CONTINUE + Y( J ) = Y( J ) + TEMP1*DBLE( A( KPLUS1, J ) ) + $ + ALPHA*TEMP2 + 160 CONTINUE + ELSE + JX = KX + JY = KY + DO 180, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + L = KPLUS1 - J + DO 170, I = MAX( 1, J - K ), J - 1 + Y( IY ) = Y( IY ) + TEMP1*DCONJG(A( L + I, J )) + TEMP2 = TEMP2 + A( L + I, J )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 170 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*DBLE( A( KPLUS1, J ) ) + $ + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + IF( J.GT.K )THEN + KX = KX + INCX + KY = KY + INCY + END IF + 180 CONTINUE + END IF + RETURN + ENDIF + +* +* Form y when lower triangle of A is stored. +* + IF( LSAME( UPLO, 'M' ) )THEN + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 200, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*DBLE( A( 1, J ) ) + L = 1 - J + DO 190, I = J + 1, MIN( N, J + K ) + Y( I ) = Y( I ) + TEMP1*DCONJG(A( L + I, J )) + TEMP2 = TEMP2 + A( L + I, J )*X( I ) + 190 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 200 CONTINUE + ELSE + JX = KX + JY = KY + DO 220, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*DBLE( A( 1, J ) ) + L = 1 - J + IX = JX + IY = JY + DO 210, I = J + 1, MIN( N, J + K ) + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*DCONJG(A( L + I, J )) + TEMP2 = TEMP2 + A( L + I, J )*X( IX ) + 210 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 220 CONTINUE + END IF + RETURN + END IF + + + +* + RETURN +* +* End of ZHBMV . +* + END diff --git a/reference/zhemm3mf.f b/reference/zhemm3mf.f new file mode 100644 index 0000000..2247e2c --- /dev/null +++ b/reference/zhemm3mf.f @@ -0,0 +1,304 @@ + SUBROUTINE ZHEMM3MF ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO + INTEGER M, N, LDA, LDB, LDC + COMPLEX*16 ALPHA, BETA +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* ZHEMM performs one of the matrix-matrix operations +* +* C := alpha*A*B + beta*C, +* +* or +* +* C := alpha*B*A + beta*C, +* +* where alpha and beta are scalars, A is an hermitian matrix and B and +* C are m by n matrices. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether the hermitian matrix A +* appears on the left or right in the operation as follows: +* +* SIDE = 'L' or 'l' C := alpha*A*B + beta*C, +* +* SIDE = 'R' or 'r' C := alpha*B*A + beta*C, +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the hermitian matrix A is to be +* referenced as follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of the +* hermitian matrix is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of the +* hermitian matrix is to be referenced. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix C. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix C. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is +* m when SIDE = 'L' or 'l' and is n otherwise. +* Before entry with SIDE = 'L' or 'l', the m by m part of +* the array A must contain the hermitian matrix, such that +* when UPLO = 'U' or 'u', the leading m by m upper triangular +* part of the array A must contain the upper triangular part +* of the hermitian matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading m by m lower triangular part of the array A +* must contain the lower triangular part of the hermitian +* matrix and the strictly upper triangular part of A is not +* referenced. +* Before entry with SIDE = 'R' or 'r', the n by n part of +* the array A must contain the hermitian matrix, such that +* when UPLO = 'U' or 'u', the leading n by n upper triangular +* part of the array A must contain the upper triangular part +* of the hermitian matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading n by n lower triangular part of the array A +* must contain the lower triangular part of the hermitian +* matrix and the strictly upper triangular part of A is not +* referenced. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, n ). +* Unchanged on exit. +* +* B - COMPLEX*16 array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* BETA - COMPLEX*16 . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - COMPLEX*16 array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n updated +* matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MAX, DBLE +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, K, NROWA + COMPLEX*16 TEMP1, TEMP2 +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Executable Statements .. +* +* Set NROWA as the number of rows of A. +* + IF( LSAME( SIDE, 'L' ) )THEN + NROWA = M + ELSE + NROWA = N + END IF + UPPER = LSAME( UPLO, 'U' ) +* +* Test the input parameters. +* + INFO = 0 + IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. + $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN + INFO = 2 + ELSE IF( M .LT.0 )THEN + INFO = 3 + ELSE IF( N .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, M ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZHEMM3M', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, M + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( SIDE, 'L' ) )THEN +* +* Form C := alpha*A*B + beta*C. +* + IF( UPPER )THEN + DO 70, J = 1, N + DO 60, I = 1, M + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 50, K = 1, I - 1 + C( K, J ) = C( K, J ) + TEMP1*A( K, I ) + TEMP2 = TEMP2 + + $ B( K, J )*DCONJG( A( K, I ) ) + 50 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*DBLE( A( I, I ) ) + + $ ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*DBLE( A( I, I ) ) + + $ ALPHA*TEMP2 + END IF + 60 CONTINUE + 70 CONTINUE + ELSE + DO 100, J = 1, N + DO 90, I = M, 1, -1 + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 80, K = I + 1, M + C( K, J ) = C( K, J ) + TEMP1*A( K, I ) + TEMP2 = TEMP2 + + $ B( K, J )*DCONJG( A( K, I ) ) + 80 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*DBLE( A( I, I ) ) + + $ ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*DBLE( A( I, I ) ) + + $ ALPHA*TEMP2 + END IF + 90 CONTINUE + 100 CONTINUE + END IF + ELSE +* +* Form C := alpha*B*A + beta*C. +* + DO 170, J = 1, N + TEMP1 = ALPHA*DBLE( A( J, J ) ) + IF( BETA.EQ.ZERO )THEN + DO 110, I = 1, M + C( I, J ) = TEMP1*B( I, J ) + 110 CONTINUE + ELSE + DO 120, I = 1, M + C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) + 120 CONTINUE + END IF + DO 140, K = 1, J - 1 + IF( UPPER )THEN + TEMP1 = ALPHA*A( K, J ) + ELSE + TEMP1 = ALPHA*DCONJG( A( J, K ) ) + END IF + DO 130, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 130 CONTINUE + 140 CONTINUE + DO 160, K = J + 1, N + IF( UPPER )THEN + TEMP1 = ALPHA*DCONJG( A( J, K ) ) + ELSE + TEMP1 = ALPHA*A( K, J ) + END IF + DO 150, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 150 CONTINUE + 160 CONTINUE + 170 CONTINUE + END IF +* + RETURN +* +* End of ZHEMM . +* + END diff --git a/reference/zhemmf.f b/reference/zhemmf.f new file mode 100644 index 0000000..dbe8fb1 --- /dev/null +++ b/reference/zhemmf.f @@ -0,0 +1,304 @@ + SUBROUTINE ZHEMMF ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO + INTEGER M, N, LDA, LDB, LDC + COMPLEX*16 ALPHA, BETA +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* ZHEMM performs one of the matrix-matrix operations +* +* C := alpha*A*B + beta*C, +* +* or +* +* C := alpha*B*A + beta*C, +* +* where alpha and beta are scalars, A is an hermitian matrix and B and +* C are m by n matrices. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether the hermitian matrix A +* appears on the left or right in the operation as follows: +* +* SIDE = 'L' or 'l' C := alpha*A*B + beta*C, +* +* SIDE = 'R' or 'r' C := alpha*B*A + beta*C, +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the hermitian matrix A is to be +* referenced as follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of the +* hermitian matrix is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of the +* hermitian matrix is to be referenced. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix C. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix C. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is +* m when SIDE = 'L' or 'l' and is n otherwise. +* Before entry with SIDE = 'L' or 'l', the m by m part of +* the array A must contain the hermitian matrix, such that +* when UPLO = 'U' or 'u', the leading m by m upper triangular +* part of the array A must contain the upper triangular part +* of the hermitian matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading m by m lower triangular part of the array A +* must contain the lower triangular part of the hermitian +* matrix and the strictly upper triangular part of A is not +* referenced. +* Before entry with SIDE = 'R' or 'r', the n by n part of +* the array A must contain the hermitian matrix, such that +* when UPLO = 'U' or 'u', the leading n by n upper triangular +* part of the array A must contain the upper triangular part +* of the hermitian matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading n by n lower triangular part of the array A +* must contain the lower triangular part of the hermitian +* matrix and the strictly upper triangular part of A is not +* referenced. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, n ). +* Unchanged on exit. +* +* B - COMPLEX*16 array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* BETA - COMPLEX*16 . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - COMPLEX*16 array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n updated +* matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MAX, DBLE +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, K, NROWA + COMPLEX*16 TEMP1, TEMP2 +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Executable Statements .. +* +* Set NROWA as the number of rows of A. +* + IF( LSAME( SIDE, 'L' ) )THEN + NROWA = M + ELSE + NROWA = N + END IF + UPPER = LSAME( UPLO, 'U' ) +* +* Test the input parameters. +* + INFO = 0 + IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. + $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN + INFO = 2 + ELSE IF( M .LT.0 )THEN + INFO = 3 + ELSE IF( N .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, M ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZHEMM3M', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, M + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( SIDE, 'L' ) )THEN +* +* Form C := alpha*A*B + beta*C. +* + IF( UPPER )THEN + DO 70, J = 1, N + DO 60, I = 1, M + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 50, K = 1, I - 1 + C( K, J ) = C( K, J ) + TEMP1*A( K, I ) + TEMP2 = TEMP2 + + $ B( K, J )*DCONJG( A( K, I ) ) + 50 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*DBLE( A( I, I ) ) + + $ ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*DBLE( A( I, I ) ) + + $ ALPHA*TEMP2 + END IF + 60 CONTINUE + 70 CONTINUE + ELSE + DO 100, J = 1, N + DO 90, I = M, 1, -1 + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 80, K = I + 1, M + C( K, J ) = C( K, J ) + TEMP1*A( K, I ) + TEMP2 = TEMP2 + + $ B( K, J )*DCONJG( A( K, I ) ) + 80 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*DBLE( A( I, I ) ) + + $ ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*DBLE( A( I, I ) ) + + $ ALPHA*TEMP2 + END IF + 90 CONTINUE + 100 CONTINUE + END IF + ELSE +* +* Form C := alpha*B*A + beta*C. +* + DO 170, J = 1, N + TEMP1 = ALPHA*DBLE( A( J, J ) ) + IF( BETA.EQ.ZERO )THEN + DO 110, I = 1, M + C( I, J ) = TEMP1*B( I, J ) + 110 CONTINUE + ELSE + DO 120, I = 1, M + C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) + 120 CONTINUE + END IF + DO 140, K = 1, J - 1 + IF( UPPER )THEN + TEMP1 = ALPHA*A( K, J ) + ELSE + TEMP1 = ALPHA*DCONJG( A( J, K ) ) + END IF + DO 130, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 130 CONTINUE + 140 CONTINUE + DO 160, K = J + 1, N + IF( UPPER )THEN + TEMP1 = ALPHA*DCONJG( A( J, K ) ) + ELSE + TEMP1 = ALPHA*A( K, J ) + END IF + DO 150, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 150 CONTINUE + 160 CONTINUE + 170 CONTINUE + END IF +* + RETURN +* +* End of ZHEMM . +* + END diff --git a/reference/zhemvf.f b/reference/zhemvf.f new file mode 100644 index 0000000..ac8a04f --- /dev/null +++ b/reference/zhemvf.f @@ -0,0 +1,351 @@ + SUBROUTINE ZHEMVF ( UPLO, N, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA, BETA + INTEGER INCX, INCY, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* ZHEMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n hermitian matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the hermitian matrix and the strictly +* lower triangular part of A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the hermitian matrix and the strictly +* upper triangular part of A is not referenced. +* Note that the imaginary parts of the diagonal elements need +* not be set and are assumed to be zero. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* X - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - COMPLEX*16 . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. On exit, Y is overwritten by the updated +* vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MAX, DBLE +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ).AND. + $ .NOT.LSAME( UPLO, 'V' ).AND. + $ .NOT.LSAME( UPLO, 'M' ))THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 5 + ELSE IF( INCX.EQ.0 )THEN + INFO = 7 + ELSE IF( INCY.EQ.0 )THEN + INFO = 10 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZHEMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + + + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form y when A is stored in upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + DO 50, I = 1, J - 1 + Y( I ) = Y( I ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + DCONJG( A( I, J ) )*X( I ) + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*DBLE( A( J, J ) ) + ALPHA*TEMP2 + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + DO 70, I = 1, J - 1 + Y( IY ) = Y( IY ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + DCONJG( A( I, J ) )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*DBLE( A( J, J ) ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 80 CONTINUE + END IF + RETURN + ENDIF + + + IF( LSAME( UPLO, 'L' ) )THEN +* +* Form y when A is stored in lower triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 100, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*DBLE( A( J, J ) ) + DO 90, I = J + 1, N + Y( I ) = Y( I ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + DCONJG( A( I, J ) )*X( I ) + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*DBLE( A( J, J ) ) + IX = JX + IY = JY + DO 110, I = J + 1, N + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + DCONJG( A( I, J ) )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 120 CONTINUE + END IF + RETURN + END IF + + + IF( LSAME( UPLO, 'V' ) )THEN +* +* Form y when A is stored in upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 160, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + DO 150, I = 1, J - 1 + Y( I ) = Y( I ) + TEMP1* DCONJG(A( I, J )) + TEMP2 = TEMP2 + A( I, J )*X( I ) + 150 CONTINUE + Y( J ) = Y( J ) + TEMP1*DBLE( A( J, J ) ) + ALPHA*TEMP2 + 160 CONTINUE + ELSE + JX = KX + JY = KY + DO 180, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + DO 170, I = 1, J - 1 + Y( IY ) = Y( IY ) + TEMP1* DCONJG(A( I, J )) + TEMP2 = TEMP2 + A( I, J )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 170 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*DBLE( A( J, J ) ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 180 CONTINUE + END IF + RETURN + ENDIF + + + IF( LSAME( UPLO, 'M' ) )THEN +* +* Form y when A is stored in lower triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 200, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*DBLE( A( J, J ) ) + DO 190, I = J + 1, N + Y( I ) = Y( I ) + TEMP1*DCONJG(A( I, J )) + TEMP2 = TEMP2 + A( I, J )*X( I ) + 190 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 200 CONTINUE + ELSE + JX = KX + JY = KY + DO 220, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*DBLE( A( J, J ) ) + IX = JX + IY = JY + DO 210, I = J + 1, N + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*DCONJG(A( I, J )) + TEMP2 = TEMP2 + A( I, J )*X( IX ) + 210 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 220 CONTINUE + END IF + RETURN + END IF +* + RETURN +* +* End of ZHEMV . +* + END diff --git a/reference/zher2f.f b/reference/zher2f.f new file mode 100644 index 0000000..4ae3e4c --- /dev/null +++ b/reference/zher2f.f @@ -0,0 +1,249 @@ + SUBROUTINE ZHER2F ( UPLO, N, ALPHA, X, INCX, Y, INCY, A, LDA ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA + INTEGER INCX, INCY, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* ZHER2 performs the hermitian rank 2 operation +* +* A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A, +* +* where alpha is a scalar, x and y are n element vectors and A is an n +* by n hermitian matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the hermitian matrix and the strictly +* lower triangular part of A is not referenced. On exit, the +* upper triangular part of the array A is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the hermitian matrix and the strictly +* upper triangular part of A is not referenced. On exit, the +* lower triangular part of the array A is overwritten by the +* lower triangular part of the updated matrix. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero, and on exit they +* are set to zero. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MAX, DBLE +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZHER2 ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set up the start points in X and Y if the increments are not both +* unity. +* + IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF + JX = KX + JY = KY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when A is stored in the upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 20, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*DCONJG( Y( J ) ) + TEMP2 = DCONJG( ALPHA*X( J ) ) + DO 10, I = 1, J - 1 + A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 + 10 CONTINUE + A( J, J ) = DBLE( A( J, J ) ) + + $ DBLE( X( J )*TEMP1 + Y( J )*TEMP2 ) + ELSE + A( J, J ) = DBLE( A( J, J ) ) + END IF + 20 CONTINUE + ELSE + DO 40, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*DCONJG( Y( JY ) ) + TEMP2 = DCONJG( ALPHA*X( JX ) ) + IX = KX + IY = KY + DO 30, I = 1, J - 1 + A( I, J ) = A( I, J ) + X( IX )*TEMP1 + $ + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 30 CONTINUE + A( J, J ) = DBLE( A( J, J ) ) + + $ DBLE( X( JX )*TEMP1 + Y( JY )*TEMP2 ) + ELSE + A( J, J ) = DBLE( A( J, J ) ) + END IF + JX = JX + INCX + JY = JY + INCY + 40 CONTINUE + END IF + ELSE +* +* Form A when A is stored in the lower triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*DCONJG( Y( J ) ) + TEMP2 = DCONJG( ALPHA*X( J ) ) + A( J, J ) = DBLE( A( J, J ) ) + + $ DBLE( X( J )*TEMP1 + Y( J )*TEMP2 ) + DO 50, I = J + 1, N + A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 + 50 CONTINUE + ELSE + A( J, J ) = DBLE( A( J, J ) ) + END IF + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*DCONJG( Y( JY ) ) + TEMP2 = DCONJG( ALPHA*X( JX ) ) + A( J, J ) = DBLE( A( J, J ) ) + + $ DBLE( X( JX )*TEMP1 + Y( JY )*TEMP2 ) + IX = JX + IY = JY + DO 70, I = J + 1, N + IX = IX + INCX + IY = IY + INCY + A( I, J ) = A( I, J ) + X( IX )*TEMP1 + $ + Y( IY )*TEMP2 + 70 CONTINUE + ELSE + A( J, J ) = DBLE( A( J, J ) ) + END IF + JX = JX + INCX + JY = JY + INCY + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZHER2 . +* + END diff --git a/reference/zher2kf.f b/reference/zher2kf.f new file mode 100644 index 0000000..43b75d9 --- /dev/null +++ b/reference/zher2kf.f @@ -0,0 +1,372 @@ + SUBROUTINE ZHER2KF( UPLO, TRANS, N, K, ALPHA, A, LDA, B,LDB, BETA, + $ C, LDC ) +* .. Scalar Arguments .. + CHARACTER TRANS, UPLO + INTEGER K, LDA, LDB, LDC, N + DOUBLE PRECISION BETA + COMPLEX*16 ALPHA +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* ZHER2K performs one of the hermitian rank 2k operations +* +* C := alpha*A*conjg( B' ) + conjg( alpha )*B*conjg( A' ) + beta*C, +* +* or +* +* C := alpha*conjg( A' )*B + conjg( alpha )*conjg( B' )*A + beta*C, +* +* where alpha and beta are scalars with beta real, C is an n by n +* hermitian matrix and A and B are n by k matrices in the first case +* and k by n matrices in the second case. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array C is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of C +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of C +* is to be referenced. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' C := alpha*A*conjg( B' ) + +* conjg( alpha )*B*conjg( A' ) + +* beta*C. +* +* TRANS = 'C' or 'c' C := alpha*conjg( A' )*B + +* conjg( alpha )*conjg( B' )*A + +* beta*C. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with TRANS = 'N' or 'n', K specifies the number +* of columns of the matrices A and B, and on entry with +* TRANS = 'C' or 'c', K specifies the number of rows of the +* matrices A and B. K must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array A must contain the matrix A, otherwise +* the leading k by n part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDA must be at least max( 1, n ), otherwise LDA must +* be at least max( 1, k ). +* Unchanged on exit. +* +* B - COMPLEX*16 array of DIMENSION ( LDB, kb ), where kb is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array B must contain the matrix B, otherwise +* the leading k by n part of the array B must contain the +* matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDB must be at least max( 1, n ), otherwise LDB must +* be at least max( 1, k ). +* Unchanged on exit. +* +* BETA - DOUBLE PRECISION . +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* C - COMPLEX*16 array of DIMENSION ( LDC, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array C must contain the upper +* triangular part of the hermitian matrix and the strictly +* lower triangular part of C is not referenced. On exit, the +* upper triangular part of the array C is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array C must contain the lower +* triangular part of the hermitian matrix and the strictly +* upper triangular part of C is not referenced. On exit, the +* lower triangular part of the array C is overwritten by the +* lower triangular part of the updated matrix. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero, and on exit they +* are set to zero. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* -- Modified 8-Nov-93 to set C(J,J) to DBLE( C(J,J) ) when BETA = 1. +* Ed Anderson, Cray Research Inc. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, DCONJG, MAX +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, L, NROWA + COMPLEX*16 TEMP1, TEMP2 +* .. +* .. Parameters .. + DOUBLE PRECISION ONE + PARAMETER ( ONE = 1.0D+0 ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + IF( LSAME( TRANS, 'N' ) ) THEN + NROWA = N + ELSE + NROWA = K + END IF + UPPER = LSAME( UPLO, 'U' ) +* + INFO = 0 + IF( ( .NOT.UPPER ) .AND. ( .NOT.LSAME( UPLO, 'L' ) ) ) THEN + INFO = 1 + ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ) .AND. + $ ( .NOT.LSAME( TRANS, 'C' ) ) ) THEN + INFO = 2 + ELSE IF( N.LT.0 ) THEN + INFO = 3 + ELSE IF( K.LT.0 ) THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) ) THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, NROWA ) ) THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, N ) ) THEN + INFO = 12 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZHER2K', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ) .OR. ( ( ( ALPHA.EQ.ZERO ) .OR. ( K.EQ.0 ) ) .AND. + $ ( BETA.EQ.ONE ) ) )RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO ) THEN + IF( UPPER ) THEN + IF( BETA.EQ.DBLE( ZERO ) ) THEN + DO 20 J = 1, N + DO 10 I = 1, J + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40 J = 1, N + DO 30 I = 1, J - 1 + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + C( J, J ) = BETA*DBLE( C( J, J ) ) + 40 CONTINUE + END IF + ELSE + IF( BETA.EQ.DBLE( ZERO ) ) THEN + DO 60 J = 1, N + DO 50 I = J, N + C( I, J ) = ZERO + 50 CONTINUE + 60 CONTINUE + ELSE + DO 80 J = 1, N + C( J, J ) = BETA*DBLE( C( J, J ) ) + DO 70 I = J + 1, N + C( I, J ) = BETA*C( I, J ) + 70 CONTINUE + 80 CONTINUE + END IF + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( TRANS, 'N' ) ) THEN +* +* Form C := alpha*A*conjg( B' ) + conjg( alpha )*B*conjg( A' ) + +* C. +* + IF( UPPER ) THEN + DO 130 J = 1, N + IF( BETA.EQ.DBLE( ZERO ) ) THEN + DO 90 I = 1, J + C( I, J ) = ZERO + 90 CONTINUE + ELSE IF( BETA.NE.ONE ) THEN + DO 100 I = 1, J - 1 + C( I, J ) = BETA*C( I, J ) + 100 CONTINUE + C( J, J ) = BETA*DBLE( C( J, J ) ) + ELSE + C( J, J ) = DBLE( C( J, J ) ) + END IF + DO 120 L = 1, K + IF( ( A( J, L ).NE.ZERO ) .OR. ( B( J, L ).NE.ZERO ) ) + $ THEN + TEMP1 = ALPHA*DCONJG( B( J, L ) ) + TEMP2 = DCONJG( ALPHA*A( J, L ) ) + DO 110 I = 1, J - 1 + C( I, J ) = C( I, J ) + A( I, L )*TEMP1 + + $ B( I, L )*TEMP2 + 110 CONTINUE + C( J, J ) = DBLE( C( J, J ) ) + + $ DBLE( A( J, L )*TEMP1+B( J, L )*TEMP2 ) + END IF + 120 CONTINUE + 130 CONTINUE + ELSE + DO 180 J = 1, N + IF( BETA.EQ.DBLE( ZERO ) ) THEN + DO 140 I = J, N + C( I, J ) = ZERO + 140 CONTINUE + ELSE IF( BETA.NE.ONE ) THEN + DO 150 I = J + 1, N + C( I, J ) = BETA*C( I, J ) + 150 CONTINUE + C( J, J ) = BETA*DBLE( C( J, J ) ) + ELSE + C( J, J ) = DBLE( C( J, J ) ) + END IF + DO 170 L = 1, K + IF( ( A( J, L ).NE.ZERO ) .OR. ( B( J, L ).NE.ZERO ) ) + $ THEN + TEMP1 = ALPHA*DCONJG( B( J, L ) ) + TEMP2 = DCONJG( ALPHA*A( J, L ) ) + DO 160 I = J + 1, N + C( I, J ) = C( I, J ) + A( I, L )*TEMP1 + + $ B( I, L )*TEMP2 + 160 CONTINUE + C( J, J ) = DBLE( C( J, J ) ) + + $ DBLE( A( J, L )*TEMP1+B( J, L )*TEMP2 ) + END IF + 170 CONTINUE + 180 CONTINUE + END IF + ELSE +* +* Form C := alpha*conjg( A' )*B + conjg( alpha )*conjg( B' )*A + +* C. +* + IF( UPPER ) THEN + DO 210 J = 1, N + DO 200 I = 1, J + TEMP1 = ZERO + TEMP2 = ZERO + DO 190 L = 1, K + TEMP1 = TEMP1 + DCONJG( A( L, I ) )*B( L, J ) + TEMP2 = TEMP2 + DCONJG( B( L, I ) )*A( L, J ) + 190 CONTINUE + IF( I.EQ.J ) THEN + IF( BETA.EQ.DBLE( ZERO ) ) THEN + C( J, J ) = DBLE( ALPHA*TEMP1+DCONJG( ALPHA )* + $ TEMP2 ) + ELSE + C( J, J ) = BETA*DBLE( C( J, J ) ) + + $ DBLE( ALPHA*TEMP1+DCONJG( ALPHA )* + $ TEMP2 ) + END IF + ELSE + IF( BETA.EQ.DBLE( ZERO ) ) THEN + C( I, J ) = ALPHA*TEMP1 + DCONJG( ALPHA )*TEMP2 + ELSE + C( I, J ) = BETA*C( I, J ) + ALPHA*TEMP1 + + $ DCONJG( ALPHA )*TEMP2 + END IF + END IF + 200 CONTINUE + 210 CONTINUE + ELSE + DO 240 J = 1, N + DO 230 I = J, N + TEMP1 = ZERO + TEMP2 = ZERO + DO 220 L = 1, K + TEMP1 = TEMP1 + DCONJG( A( L, I ) )*B( L, J ) + TEMP2 = TEMP2 + DCONJG( B( L, I ) )*A( L, J ) + 220 CONTINUE + IF( I.EQ.J ) THEN + IF( BETA.EQ.DBLE( ZERO ) ) THEN + C( J, J ) = DBLE( ALPHA*TEMP1+DCONJG( ALPHA )* + $ TEMP2 ) + ELSE + C( J, J ) = BETA*DBLE( C( J, J ) ) + + $ DBLE( ALPHA*TEMP1+DCONJG( ALPHA )* + $ TEMP2 ) + END IF + ELSE + IF( BETA.EQ.DBLE( ZERO ) ) THEN + C( I, J ) = ALPHA*TEMP1 + DCONJG( ALPHA )*TEMP2 + ELSE + C( I, J ) = BETA*C( I, J ) + ALPHA*TEMP1 + + $ DCONJG( ALPHA )*TEMP2 + END IF + END IF + 230 CONTINUE + 240 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZHER2K. +* + END diff --git a/reference/zherf.f b/reference/zherf.f new file mode 100644 index 0000000..ebde22c --- /dev/null +++ b/reference/zherf.f @@ -0,0 +1,212 @@ + SUBROUTINE ZHERF ( UPLO, N, ALPHA, X, INCX, A, LDA ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA + INTEGER INCX, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* ZHER performs the hermitian rank 1 operation +* +* A := alpha*x*conjg( x' ) + A, +* +* where alpha is a real scalar, x is an n element vector and A is an +* n by n hermitian matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the hermitian matrix and the strictly +* lower triangular part of A is not referenced. On exit, the +* upper triangular part of the array A is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the hermitian matrix and the strictly +* upper triangular part of A is not referenced. On exit, the +* lower triangular part of the array A is overwritten by the +* lower triangular part of the updated matrix. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero, and on exit they +* are set to zero. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP + INTEGER I, INFO, IX, J, JX, KX +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MAX, DBLE +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZHER ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.DBLE( ZERO ) ) ) + $ RETURN +* +* Set the start point in X if the increment is not unity. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when A is stored in upper triangle. +* + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*DCONJG( X( J ) ) + DO 10, I = 1, J - 1 + A( I, J ) = A( I, J ) + X( I )*TEMP + 10 CONTINUE + A( J, J ) = DBLE( A( J, J ) ) + DBLE( X( J )*TEMP ) + ELSE + A( J, J ) = DBLE( A( J, J ) ) + END IF + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*DCONJG( X( JX ) ) + IX = KX + DO 30, I = 1, J - 1 + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + A( J, J ) = DBLE( A( J, J ) ) + DBLE( X( JX )*TEMP ) + ELSE + A( J, J ) = DBLE( A( J, J ) ) + END IF + JX = JX + INCX + 40 CONTINUE + END IF + ELSE +* +* Form A when A is stored in lower triangle. +* + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*DCONJG( X( J ) ) + A( J, J ) = DBLE( A( J, J ) ) + DBLE( TEMP*X( J ) ) + DO 50, I = J + 1, N + A( I, J ) = A( I, J ) + X( I )*TEMP + 50 CONTINUE + ELSE + A( J, J ) = DBLE( A( J, J ) ) + END IF + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*DCONJG( X( JX ) ) + A( J, J ) = DBLE( A( J, J ) ) + DBLE( TEMP*X( JX ) ) + IX = JX + DO 70, I = J + 1, N + IX = IX + INCX + A( I, J ) = A( I, J ) + X( IX )*TEMP + 70 CONTINUE + ELSE + A( J, J ) = DBLE( A( J, J ) ) + END IF + JX = JX + INCX + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZHER . +* + END diff --git a/reference/zherkf.f b/reference/zherkf.f new file mode 100644 index 0000000..5a7e082 --- /dev/null +++ b/reference/zherkf.f @@ -0,0 +1,330 @@ + SUBROUTINE ZHERKF( UPLO,TRANS, N, K, ALPHA, A, LDA, BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER TRANS, UPLO + INTEGER K, LDA, LDC, N + DOUBLE PRECISION ALPHA, BETA +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* ZHERK performs one of the hermitian rank k operations +* +* C := alpha*A*conjg( A' ) + beta*C, +* +* or +* +* C := alpha*conjg( A' )*A + beta*C, +* +* where alpha and beta are real scalars, C is an n by n hermitian +* matrix and A is an n by k matrix in the first case and a k by n +* matrix in the second case. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array C is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of C +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of C +* is to be referenced. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' C := alpha*A*conjg( A' ) + beta*C. +* +* TRANS = 'C' or 'c' C := alpha*conjg( A' )*A + beta*C. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with TRANS = 'N' or 'n', K specifies the number +* of columns of the matrix A, and on entry with +* TRANS = 'C' or 'c', K specifies the number of rows of the +* matrix A. K must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array A must contain the matrix A, otherwise +* the leading k by n part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDA must be at least max( 1, n ), otherwise LDA must +* be at least max( 1, k ). +* Unchanged on exit. +* +* BETA - DOUBLE PRECISION. +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* C - COMPLEX*16 array of DIMENSION ( LDC, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array C must contain the upper +* triangular part of the hermitian matrix and the strictly +* lower triangular part of C is not referenced. On exit, the +* upper triangular part of the array C is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array C must contain the lower +* triangular part of the hermitian matrix and the strictly +* upper triangular part of C is not referenced. On exit, the +* lower triangular part of the array C is overwritten by the +* lower triangular part of the updated matrix. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero, and on exit they +* are set to zero. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* -- Modified 8-Nov-93 to set C(J,J) to DBLE( C(J,J) ) when BETA = 1. +* Ed Anderson, Cray Research Inc. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, DCMPLX, DCONJG, MAX +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, L, NROWA + DOUBLE PRECISION RTEMP + COMPLEX*16 TEMP +* .. +* .. Parameters .. + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + IF( LSAME( TRANS, 'N' ) ) THEN + NROWA = N + ELSE + NROWA = K + END IF + UPPER = LSAME( UPLO, 'U' ) +* + INFO = 0 + IF( ( .NOT.UPPER ) .AND. ( .NOT.LSAME( UPLO, 'L' ) ) ) THEN + INFO = 1 + ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ) .AND. + $ ( .NOT.LSAME( TRANS, 'C' ) ) ) THEN + INFO = 2 + ELSE IF( N.LT.0 ) THEN + INFO = 3 + ELSE IF( K.LT.0 ) THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) ) THEN + INFO = 7 + ELSE IF( LDC.LT.MAX( 1, N ) ) THEN + INFO = 10 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZHERK ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ) .OR. ( ( ( ALPHA.EQ.ZERO ) .OR. ( K.EQ.0 ) ) .AND. + $ ( BETA.EQ.ONE ) ) )RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO ) THEN + IF( UPPER ) THEN + IF( BETA.EQ.ZERO ) THEN + DO 20 J = 1, N + DO 10 I = 1, J + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40 J = 1, N + DO 30 I = 1, J - 1 + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + C( J, J ) = BETA*DBLE( C( J, J ) ) + 40 CONTINUE + END IF + ELSE + IF( BETA.EQ.ZERO ) THEN + DO 60 J = 1, N + DO 50 I = J, N + C( I, J ) = ZERO + 50 CONTINUE + 60 CONTINUE + ELSE + DO 80 J = 1, N + C( J, J ) = BETA*DBLE( C( J, J ) ) + DO 70 I = J + 1, N + C( I, J ) = BETA*C( I, J ) + 70 CONTINUE + 80 CONTINUE + END IF + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( TRANS, 'N' ) ) THEN +* +* Form C := alpha*A*conjg( A' ) + beta*C. +* + IF( UPPER ) THEN + DO 130 J = 1, N + IF( BETA.EQ.ZERO ) THEN + DO 90 I = 1, J + C( I, J ) = ZERO + 90 CONTINUE + ELSE IF( BETA.NE.ONE ) THEN + DO 100 I = 1, J - 1 + C( I, J ) = BETA*C( I, J ) + 100 CONTINUE + C( J, J ) = BETA*DBLE( C( J, J ) ) + ELSE + C( J, J ) = DBLE( C( J, J ) ) + END IF + DO 120 L = 1, K + IF( A( J, L ).NE.DCMPLX( ZERO ) ) THEN + TEMP = ALPHA*DCONJG( A( J, L ) ) + DO 110 I = 1, J - 1 + C( I, J ) = C( I, J ) + TEMP*A( I, L ) + 110 CONTINUE + C( J, J ) = DBLE( C( J, J ) ) + + $ DBLE( TEMP*A( I, L ) ) + END IF + 120 CONTINUE + 130 CONTINUE + ELSE + DO 180 J = 1, N + IF( BETA.EQ.ZERO ) THEN + DO 140 I = J, N + C( I, J ) = ZERO + 140 CONTINUE + ELSE IF( BETA.NE.ONE ) THEN + C( J, J ) = BETA*DBLE( C( J, J ) ) + DO 150 I = J + 1, N + C( I, J ) = BETA*C( I, J ) + 150 CONTINUE + ELSE + C( J, J ) = DBLE( C( J, J ) ) + END IF + DO 170 L = 1, K + IF( A( J, L ).NE.DCMPLX( ZERO ) ) THEN + TEMP = ALPHA*DCONJG( A( J, L ) ) + C( J, J ) = DBLE( C( J, J ) ) + + $ DBLE( TEMP*A( J, L ) ) + DO 160 I = J + 1, N + C( I, J ) = C( I, J ) + TEMP*A( I, L ) + 160 CONTINUE + END IF + 170 CONTINUE + 180 CONTINUE + END IF + ELSE +* +* Form C := alpha*conjg( A' )*A + beta*C. +* + IF( UPPER ) THEN + DO 220 J = 1, N + DO 200 I = 1, J - 1 + TEMP = ZERO + DO 190 L = 1, K + TEMP = TEMP + DCONJG( A( L, I ) )*A( L, J ) + 190 CONTINUE + IF( BETA.EQ.ZERO ) THEN + C( I, J ) = ALPHA*TEMP + ELSE + C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) + END IF + 200 CONTINUE + RTEMP = ZERO + DO 210 L = 1, K + RTEMP = RTEMP + DCONJG( A( L, J ) )*A( L, J ) + 210 CONTINUE + IF( BETA.EQ.ZERO ) THEN + C( J, J ) = ALPHA*RTEMP + ELSE + C( J, J ) = ALPHA*RTEMP + BETA*DBLE( C( J, J ) ) + END IF + 220 CONTINUE + ELSE + DO 260 J = 1, N + RTEMP = ZERO + DO 230 L = 1, K + RTEMP = RTEMP + DCONJG( A( L, J ) )*A( L, J ) + 230 CONTINUE + IF( BETA.EQ.ZERO ) THEN + C( J, J ) = ALPHA*RTEMP + ELSE + C( J, J ) = ALPHA*RTEMP + BETA*DBLE( C( J, J ) ) + END IF + DO 250 I = J + 1, N + TEMP = ZERO + DO 240 L = 1, K + TEMP = TEMP + DCONJG( A( L, I ) )*A( L, J ) + 240 CONTINUE + IF( BETA.EQ.ZERO ) THEN + C( I, J ) = ALPHA*TEMP + ELSE + C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) + END IF + 250 CONTINUE + 260 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZHERK . +* + END diff --git a/reference/zhpmvf.f b/reference/zhpmvf.f new file mode 100644 index 0000000..8631861 --- /dev/null +++ b/reference/zhpmvf.f @@ -0,0 +1,270 @@ + SUBROUTINE ZHPMVF( UPLO, N, ALPHA, AP, X, INCX, BETA, Y, INCY ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA, BETA + INTEGER INCX, INCY, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX*16 AP( * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* ZHPMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n hermitian matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* AP - COMPLEX*16 array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the hermitian matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the hermitian matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. +* Note that the imaginary parts of the diagonal elements need +* not be set and are assumed to be zero. +* Unchanged on exit. +* +* X - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - COMPLEX*16 . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. On exit, Y is overwritten by the updated +* vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, DBLE +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 6 + ELSE IF( INCY.EQ.0 )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZHPMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + KK = 1 + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form y when AP contains the upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + K = KK + DO 50, I = 1, J - 1 + Y( I ) = Y( I ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + DCONJG( AP( K ) )*X( I ) + K = K + 1 + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*DBLE( AP( KK + J - 1 ) ) + $ + ALPHA*TEMP2 + KK = KK + J + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + DO 70, K = KK, KK + J - 2 + Y( IY ) = Y( IY ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + DCONJG( AP( K ) )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*DBLE( AP( KK + J - 1 ) ) + $ + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + KK = KK + J + 80 CONTINUE + END IF + ELSE +* +* Form y when AP contains the lower triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 100, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*DBLE( AP( KK ) ) + K = KK + 1 + DO 90, I = J + 1, N + Y( I ) = Y( I ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + DCONJG( AP( K ) )*X( I ) + K = K + 1 + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + KK = KK + ( N - J + 1 ) + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*DBLE( AP( KK ) ) + IX = JX + IY = JY + DO 110, K = KK + 1, KK + N - J + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + DCONJG( AP( K ) )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + KK = KK + ( N - J + 1 ) + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZHPMV . +* + END diff --git a/reference/zhpr2f.f b/reference/zhpr2f.f new file mode 100644 index 0000000..462913d --- /dev/null +++ b/reference/zhpr2f.f @@ -0,0 +1,251 @@ + SUBROUTINE ZHPR2F( UPLO, N, ALPHA, X, INCX, Y, INCY, AP ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA + INTEGER INCX, INCY, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX*16 AP( * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* ZHPR2 performs the hermitian rank 2 operation +* +* A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A, +* +* where alpha is a scalar, x and y are n element vectors and A is an +* n by n hermitian matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* AP - COMPLEX*16 array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the hermitian matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. On exit, the array +* AP is overwritten by the upper triangular part of the +* updated matrix. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the hermitian matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. On exit, the array +* AP is overwritten by the lower triangular part of the +* updated matrix. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero, and on exit they +* are set to zero. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, DBLE +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZHPR2 ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set up the start points in X and Y if the increments are not both +* unity. +* + IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF + JX = KX + JY = KY + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* + KK = 1 + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when upper triangle is stored in AP. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 20, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*DCONJG( Y( J ) ) + TEMP2 = DCONJG( ALPHA*X( J ) ) + K = KK + DO 10, I = 1, J - 1 + AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 + K = K + 1 + 10 CONTINUE + AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) ) + + $ DBLE( X( J )*TEMP1 + Y( J )*TEMP2 ) + ELSE + AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) ) + END IF + KK = KK + J + 20 CONTINUE + ELSE + DO 40, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*DCONJG( Y( JY ) ) + TEMP2 = DCONJG( ALPHA*X( JX ) ) + IX = KX + IY = KY + DO 30, K = KK, KK + J - 2 + AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 30 CONTINUE + AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) ) + + $ DBLE( X( JX )*TEMP1 + + $ Y( JY )*TEMP2 ) + ELSE + AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) ) + END IF + JX = JX + INCX + JY = JY + INCY + KK = KK + J + 40 CONTINUE + END IF + ELSE +* +* Form A when lower triangle is stored in AP. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*DCONJG( Y( J ) ) + TEMP2 = DCONJG( ALPHA*X( J ) ) + AP( KK ) = DBLE( AP( KK ) ) + + $ DBLE( X( J )*TEMP1 + Y( J )*TEMP2 ) + K = KK + 1 + DO 50, I = J + 1, N + AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 + K = K + 1 + 50 CONTINUE + ELSE + AP( KK ) = DBLE( AP( KK ) ) + END IF + KK = KK + N - J + 1 + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*DCONJG( Y( JY ) ) + TEMP2 = DCONJG( ALPHA*X( JX ) ) + AP( KK ) = DBLE( AP( KK ) ) + + $ DBLE( X( JX )*TEMP1 + Y( JY )*TEMP2 ) + IX = JX + IY = JY + DO 70, K = KK + 1, KK + N - J + IX = IX + INCX + IY = IY + INCY + AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 + 70 CONTINUE + ELSE + AP( KK ) = DBLE( AP( KK ) ) + END IF + JX = JX + INCX + JY = JY + INCY + KK = KK + N - J + 1 + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZHPR2 . +* + END diff --git a/reference/zhprf.f b/reference/zhprf.f new file mode 100644 index 0000000..2c93f1e --- /dev/null +++ b/reference/zhprf.f @@ -0,0 +1,217 @@ + SUBROUTINE ZHPRF ( UPLO, N, ALPHA, X, INCX, AP ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA + INTEGER INCX, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX*16 AP( * ), X( * ) +* .. +* +* Purpose +* ======= +* +* ZHPR performs the hermitian rank 1 operation +* +* A := alpha*x*conjg( x' ) + A, +* +* where alpha is a real scalar, x is an n element vector and A is an +* n by n hermitian matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* AP - COMPLEX*16 array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the hermitian matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. On exit, the array +* AP is overwritten by the upper triangular part of the +* updated matrix. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the hermitian matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. On exit, the array +* AP is overwritten by the lower triangular part of the +* updated matrix. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero, and on exit they +* are set to zero. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP + INTEGER I, INFO, IX, J, JX, K, KK, KX +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, DBLE +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZHPR ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.DBLE( ZERO ) ) ) + $ RETURN +* +* Set the start point in X if the increment is not unity. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* + KK = 1 + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when upper triangle is stored in AP. +* + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*DCONJG( X( J ) ) + K = KK + DO 10, I = 1, J - 1 + AP( K ) = AP( K ) + X( I )*TEMP + K = K + 1 + 10 CONTINUE + AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) ) + $ + DBLE( X( J )*TEMP ) + ELSE + AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) ) + END IF + KK = KK + J + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*DCONJG( X( JX ) ) + IX = KX + DO 30, K = KK, KK + J - 2 + AP( K ) = AP( K ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) ) + $ + DBLE( X( JX )*TEMP ) + ELSE + AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) ) + END IF + JX = JX + INCX + KK = KK + J + 40 CONTINUE + END IF + ELSE +* +* Form A when lower triangle is stored in AP. +* + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*DCONJG( X( J ) ) + AP( KK ) = DBLE( AP( KK ) ) + DBLE( TEMP*X( J ) ) + K = KK + 1 + DO 50, I = J + 1, N + AP( K ) = AP( K ) + X( I )*TEMP + K = K + 1 + 50 CONTINUE + ELSE + AP( KK ) = DBLE( AP( KK ) ) + END IF + KK = KK + N - J + 1 + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*DCONJG( X( JX ) ) + AP( KK ) = DBLE( AP( KK ) ) + DBLE( TEMP*X( JX ) ) + IX = JX + DO 70, K = KK + 1, KK + N - J + IX = IX + INCX + AP( K ) = AP( K ) + X( IX )*TEMP + 70 CONTINUE + ELSE + AP( KK ) = DBLE( AP( KK ) ) + END IF + JX = JX + INCX + KK = KK + N - J + 1 + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZHPR . +* + END diff --git a/reference/zlaswpf.f b/reference/zlaswpf.f new file mode 100644 index 0000000..582f15b --- /dev/null +++ b/reference/zlaswpf.f @@ -0,0 +1,120 @@ + SUBROUTINE ZLASWPF( N, A, LDA, K1, K2, IPIV, INCX ) +* +* -- LAPACK auxiliary routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* June 30, 1999 +* +* .. Scalar Arguments .. + INTEGER INCX, K1, K2, LDA, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX*16 A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* ZLASWP performs a series of row interchanges on the matrix A. +* One row interchange is initiated for each of rows K1 through K2 of A. +* +* Arguments +* ========= +* +* N (input) INTEGER +* The number of columns of the matrix A. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the matrix of column dimension N to which the row +* interchanges will be applied. +* On exit, the permuted matrix. +* +* LDA (input) INTEGER +* The leading dimension of the array A. +* +* K1 (input) INTEGER +* The first element of IPIV for which a row interchange will +* be done. +* +* K2 (input) INTEGER +* The last element of IPIV for which a row interchange will +* be done. +* +* IPIV (input) INTEGER array, dimension (M*abs(INCX)) +* The vector of pivot indices. Only the elements in positions +* K1 through K2 of IPIV are accessed. +* IPIV(K) = L implies rows K and L are to be interchanged. +* +* INCX (input) INTEGER +* The increment between successive values of IPIV. If IPIV +* is negative, the pivots are applied in reverse order. +* +* Further Details +* =============== +* +* Modified by +* R. C. Whaley, Computer Science Dept., Univ. of Tenn., Knoxville, USA +* +* ===================================================================== +* +* .. Local Scalars .. + INTEGER I, I1, I2, INC, IP, IX, IX0, J, K, N32 + COMPLEX*16 TEMP +* .. +* .. Executable Statements .. +* +* Interchange row I with row IPIV(I) for each of rows K1 through K2. +* + IF( INCX.GT.0 ) THEN + IX0 = K1 + I1 = K1 + I2 = K2 + INC = 1 + ELSE IF( INCX.LT.0 ) THEN + IX0 = 1 + ( 1-K2 )*INCX + I1 = K2 + I2 = K1 + INC = -1 + ELSE + RETURN + END IF +* + N32 = ( N / 32 )*32 + IF( N32.NE.0 ) THEN + DO 30 J = 1, N32, 32 + IX = IX0 + DO 20 I = I1, I2, INC + IP = IPIV( IX ) + IF( IP.NE.I ) THEN + DO 10 K = J, J + 31 + TEMP = A( I, K ) + A( I, K ) = A( IP, K ) + A( IP, K ) = TEMP + 10 CONTINUE + END IF + IX = IX + INCX + 20 CONTINUE + 30 CONTINUE + END IF + IF( N32.NE.N ) THEN + N32 = N32 + 1 + IX = IX0 + DO 50 I = I1, I2, INC + IP = IPIV( IX ) + IF( IP.NE.I ) THEN + DO 40 K = N32, N + TEMP = A( I, K ) + A( I, K ) = A( IP, K ) + A( IP, K ) = TEMP + 40 CONTINUE + END IF + IX = IX + INCX + 50 CONTINUE + END IF +* + RETURN +* +* End of ZLASWP +* + END diff --git a/reference/zlauu2f.f b/reference/zlauu2f.f new file mode 100644 index 0000000..f53f99d --- /dev/null +++ b/reference/zlauu2f.f @@ -0,0 +1,143 @@ + SUBROUTINE ZLAUU2F( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK auxiliary routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* ZLAUU2 computes the product U * U' or L' * L, where the triangular +* factor U or L is stored in the upper or lower triangular part of +* the array A. +* +* If UPLO = 'U' or 'u' then the upper triangle of the result is stored, +* overwriting the factor U in A. +* If UPLO = 'L' or 'l' then the lower triangle of the result is stored, +* overwriting the factor L in A. +* +* This is the unblocked form of the algorithm, calling Level 2 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the triangular factor stored in the array A +* is upper or lower triangular: +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* N (input) INTEGER +* The order of the triangular factor U or L. N >= 0. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the triangular factor U or L. +* On exit, if UPLO = 'U', the upper triangle of A is +* overwritten with the upper triangle of the product U * U'; +* if UPLO = 'L', the lower triangle of A is overwritten with +* the lower triangle of the product L' * L. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I + DOUBLE PRECISION AII +* .. +* .. External Functions .. + LOGICAL LSAME + COMPLEX*16 ZDOTC + EXTERNAL LSAME, ZDOTC +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZDSCAL, ZGEMV, ZLACGV +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, DCMPLX, MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZLAUU2', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( UPPER ) THEN +* +* Compute the product U * U'. +* + DO 10 I = 1, N + AII = A( I, I ) + IF( I.LT.N ) THEN + A( I, I ) = AII*AII + DBLE( ZDOTC( N-I, A( I, I+1 ), LDA, + $ A( I, I+1 ), LDA ) ) + CALL ZLACGV( N-I, A( I, I+1 ), LDA ) + CALL ZGEMV( 'No transpose', I-1, N-I, ONE, A( 1, I+1 ), + $ LDA, A( I, I+1 ), LDA, DCMPLX( AII ), + $ A( 1, I ), 1 ) + CALL ZLACGV( N-I, A( I, I+1 ), LDA ) + ELSE + CALL ZDSCAL( I, AII, A( 1, I ), 1 ) + END IF + 10 CONTINUE +* + ELSE +* +* Compute the product L' * L. +* + DO 20 I = 1, N + AII = A( I, I ) + IF( I.LT.N ) THEN + A( I, I ) = AII*AII + DBLE( ZDOTC( N-I, A( I+1, I ), 1, + $ A( I+1, I ), 1 ) ) + CALL ZLACGV( I-1, A( I, 1 ), LDA ) + CALL ZGEMV( 'Conjugate transpose', N-I, I-1, ONE, + $ A( I+1, 1 ), LDA, A( I+1, I ), 1, + $ DCMPLX( AII ), A( I, 1 ), LDA ) + CALL ZLACGV( I-1, A( I, 1 ), LDA ) + ELSE + CALL ZDSCAL( I, AII, A( I, 1 ), LDA ) + END IF + 20 CONTINUE + END IF +* + RETURN +* +* End of ZLAUU2 +* + END diff --git a/reference/zlauumf.f b/reference/zlauumf.f new file mode 100644 index 0000000..3a84646 --- /dev/null +++ b/reference/zlauumf.f @@ -0,0 +1,160 @@ + SUBROUTINE ZLAUUMF( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK auxiliary routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* September 30, 1994 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* ZLAUUM computes the product U * U' or L' * L, where the triangular +* factor U or L is stored in the upper or lower triangular part of +* the array A. +* +* If UPLO = 'U' or 'u' then the upper triangle of the result is stored, +* overwriting the factor U in A. +* If UPLO = 'L' or 'l' then the lower triangle of the result is stored, +* overwriting the factor L in A. +* +* This is the blocked form of the algorithm, calling Level 3 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the triangular factor stored in the array A +* is upper or lower triangular: +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* N (input) INTEGER +* The order of the triangular factor U or L. N >= 0. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the triangular factor U or L. +* On exit, if UPLO = 'U', the upper triangle of A is +* overwritten with the upper triangle of the product U * U'; +* if UPLO = 'L', the lower triangle of A is overwritten with +* the lower triangle of the product L' * L. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE + PARAMETER ( ONE = 1.0D+0 ) + COMPLEX*16 CONE + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, IB, NB +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZGEMM, ZHERK, ZLAUU2, ZTRMM +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZLAUUM', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = 128 +* + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code +* + CALL ZLAUU2( UPLO, N, A, LDA, INFO ) + ELSE +* +* Use blocked code +* + IF( UPPER ) THEN +* +* Compute the product U * U'. +* + DO 10 I = 1, N, NB + IB = MIN( NB, N-I+1 ) + CALL ZTRMM( 'Right', 'Upper', 'Conjugate transpose', + $ 'Non-unit', I-1, IB, CONE, A( I, I ), LDA, + $ A( 1, I ), LDA ) + CALL ZLAUU2( 'Upper', IB, A( I, I ), LDA, INFO ) + IF( I+IB.LE.N ) THEN + CALL ZGEMM( 'No transpose', 'Conjugate transpose', + $ I-1, IB, N-I-IB+1, CONE, A( 1, I+IB ), + $ LDA, A( I, I+IB ), LDA, CONE, A( 1, I ), + $ LDA ) + CALL ZHERK( 'Upper', 'No transpose', IB, N-I-IB+1, + $ ONE, A( I, I+IB ), LDA, ONE, A( I, I ), + $ LDA ) + END IF + 10 CONTINUE + ELSE +* +* Compute the product L' * L. +* + DO 20 I = 1, N, NB + IB = MIN( NB, N-I+1 ) + CALL ZTRMM( 'Left', 'Lower', 'Conjugate transpose', + $ 'Non-unit', IB, I-1, CONE, A( I, I ), LDA, + $ A( I, 1 ), LDA ) + CALL ZLAUU2( 'Lower', IB, A( I, I ), LDA, INFO ) + IF( I+IB.LE.N ) THEN + CALL ZGEMM( 'Conjugate transpose', 'No transpose', IB, + $ I-1, N-I-IB+1, CONE, A( I+IB, I ), LDA, + $ A( I+IB, 1 ), LDA, CONE, A( I, 1 ), LDA ) + CALL ZHERK( 'Lower', 'Conjugate transpose', IB, + $ N-I-IB+1, ONE, A( I+IB, I ), LDA, ONE, + $ A( I, I ), LDA ) + END IF + 20 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZLAUUM +* + END diff --git a/reference/zpotf2f.f b/reference/zpotf2f.f new file mode 100644 index 0000000..bfb6f11 --- /dev/null +++ b/reference/zpotf2f.f @@ -0,0 +1,175 @@ + SUBROUTINE ZPOTF2F( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* September 30, 1994 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* ZPOTF2 computes the Cholesky factorization of a complex Hermitian +* positive definite matrix A. +* +* The factorization has the form +* A = U' * U , if UPLO = 'U', or +* A = L * L', if UPLO = 'L', +* where U is an upper triangular matrix and L is lower triangular. +* +* This is the unblocked version of the algorithm, calling Level 2 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the upper or lower triangular part of the +* Hermitian matrix A is stored. +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the Hermitian matrix A. If UPLO = 'U', the leading +* n by n upper triangular part of A contains the upper +* triangular part of the matrix A, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading n by n lower triangular part of A contains the lower +* triangular part of the matrix A, and the strictly upper +* triangular part of A is not referenced. +* +* On exit, if INFO = 0, the factor U or L from the Cholesky +* factorization A = U'*U or A = L*L'. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* > 0: if INFO = k, the leading minor of order k is not +* positive definite, and the factorization could not be +* completed. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) + COMPLEX*16 CONE + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER J + DOUBLE PRECISION AJJ +* .. +* .. External Functions .. + LOGICAL LSAME + COMPLEX*16 ZDOTC + EXTERNAL LSAME, ZDOTC +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZDSCAL, ZGEMV, ZLACGV +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, MAX, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZPOTF2', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( UPPER ) THEN +* +* Compute the Cholesky factorization A = U'*U. +* + DO 10 J = 1, N +* +* Compute U(J,J) and test for non-positive-definiteness. +* + AJJ = DBLE( A( J, J ) ) - ZDOTC( J-1, A( 1, J ), 1, + $ A( 1, J ), 1 ) + IF( AJJ.LE.ZERO ) THEN + A( J, J ) = AJJ + GO TO 30 + END IF + AJJ = SQRT( AJJ ) + A( J, J ) = AJJ +* +* Compute elements J+1:N of row J. +* + IF( J.LT.N ) THEN + CALL ZLACGV( J-1, A( 1, J ), 1 ) + CALL ZGEMV( 'Transpose', J-1, N-J, -CONE, A( 1, J+1 ), + $ LDA, A( 1, J ), 1, CONE, A( J, J+1 ), LDA ) + CALL ZLACGV( J-1, A( 1, J ), 1 ) + CALL ZDSCAL( N-J, ONE / AJJ, A( J, J+1 ), LDA ) + END IF + 10 CONTINUE + ELSE +* +* Compute the Cholesky factorization A = L*L'. +* + DO 20 J = 1, N +* +* Compute L(J,J) and test for non-positive-definiteness. +* + AJJ = DBLE( A( J, J ) ) - ZDOTC( J-1, A( J, 1 ), LDA, + $ A( J, 1 ), LDA ) + IF( AJJ.LE.ZERO ) THEN + A( J, J ) = AJJ + GO TO 30 + END IF + AJJ = SQRT( AJJ ) + A( J, J ) = AJJ +* +* Compute elements J+1:N of column J. +* + IF( J.LT.N ) THEN + CALL ZLACGV( J-1, A( J, 1 ), LDA ) + CALL ZGEMV( 'No transpose', N-J, J-1, -CONE, A( J+1, 1 ), + $ LDA, A( J, 1 ), LDA, CONE, A( J+1, J ), 1 ) + CALL ZLACGV( J-1, A( J, 1 ), LDA ) + CALL ZDSCAL( N-J, ONE / AJJ, A( J+1, J ), 1 ) + END IF + 20 CONTINUE + END IF + GO TO 40 +* + 30 CONTINUE + INFO = J +* + 40 CONTINUE + RETURN +* +* End of ZPOTF2 +* + END diff --git a/reference/zpotrff.f b/reference/zpotrff.f new file mode 100644 index 0000000..7cef580 --- /dev/null +++ b/reference/zpotrff.f @@ -0,0 +1,187 @@ + SUBROUTINE ZPOTRFF( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* September 30, 1994 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* ZPOTRF computes the Cholesky factorization of a complex Hermitian +* positive definite matrix A. +* +* The factorization has the form +* A = U**H * U, if UPLO = 'U', or +* A = L * L**H, if UPLO = 'L', +* where U is an upper triangular matrix and L is lower triangular. +* +* This is the block version of the algorithm, calling Level 3 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': Upper triangle of A is stored; +* = 'L': Lower triangle of A is stored. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the Hermitian matrix A. If UPLO = 'U', the leading +* N-by-N upper triangular part of A contains the upper +* triangular part of the matrix A, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading N-by-N lower triangular part of A contains the lower +* triangular part of the matrix A, and the strictly upper +* triangular part of A is not referenced. +* +* On exit, if INFO = 0, the factor U or L from the Cholesky +* factorization A = U**H*U or A = L*L**H. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, the leading minor of order i is not +* positive definite, and the factorization could not be +* completed. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE + COMPLEX*16 CONE + PARAMETER ( ONE = 1.0D+0, CONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER J, JB, NB +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZGEMM, ZHERK, ZPOTF2, ZTRSM +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZPOTRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = 56 + + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code. +* + CALL ZPOTF2( UPLO, N, A, LDA, INFO ) + ELSE +* +* Use blocked code. +* + IF( UPPER ) THEN +* +* Compute the Cholesky factorization A = U'*U. +* + DO 10 J = 1, N, NB +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + JB = MIN( NB, N-J+1 ) + CALL ZHERK( 'Upper', 'Conjugate transpose', JB, J-1, + $ -ONE, A( 1, J ), LDA, ONE, A( J, J ), LDA ) + CALL ZPOTF2( 'Upper', JB, A( J, J ), LDA, INFO ) + IF( INFO.NE.0 ) + $ GO TO 30 + IF( J+JB.LE.N ) THEN +* +* Compute the current block row. +* + CALL ZGEMM( 'Conjugate transpose', 'No transpose', JB, + $ N-J-JB+1, J-1, -CONE, A( 1, J ), LDA, + $ A( 1, J+JB ), LDA, CONE, A( J, J+JB ), + $ LDA ) + CALL ZTRSM( 'Left', 'Upper', 'Conjugate transpose', + $ 'Non-unit', JB, N-J-JB+1, CONE, A( J, J ), + $ LDA, A( J, J+JB ), LDA ) + END IF + 10 CONTINUE +* + ELSE +* +* Compute the Cholesky factorization A = L*L'. +* + DO 20 J = 1, N, NB +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + JB = MIN( NB, N-J+1 ) + CALL ZHERK( 'Lower', 'No transpose', JB, J-1, -ONE, + $ A( J, 1 ), LDA, ONE, A( J, J ), LDA ) + CALL ZPOTF2( 'Lower', JB, A( J, J ), LDA, INFO ) + IF( INFO.NE.0 ) + $ GO TO 30 + IF( J+JB.LE.N ) THEN +* +* Compute the current block column. +* + CALL ZGEMM( 'No transpose', 'Conjugate transpose', + $ N-J-JB+1, JB, J-1, -CONE, A( J+JB, 1 ), + $ LDA, A( J, 1 ), LDA, CONE, A( J+JB, J ), + $ LDA ) + CALL ZTRSM( 'Right', 'Lower', 'Conjugate transpose', + $ 'Non-unit', N-J-JB+1, JB, CONE, A( J, J ), + $ LDA, A( J+JB, J ), LDA ) + END IF + 20 CONTINUE + END IF + END IF + GO TO 40 +* + 30 CONTINUE + INFO = INFO + J - 1 +* + 40 CONTINUE + RETURN +* +* End of ZPOTRF +* + END diff --git a/reference/zpotrif.f b/reference/zpotrif.f new file mode 100644 index 0000000..5a11880 --- /dev/null +++ b/reference/zpotrif.f @@ -0,0 +1,96 @@ + SUBROUTINE ZPOTRIF( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* ZPOTRI computes the inverse of a complex Hermitian positive definite +* matrix A using the Cholesky factorization A = U**H*U or A = L*L**H +* computed by ZPOTRF. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': Upper triangle of A is stored; +* = 'L': Lower triangle of A is stored. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the triangular factor U or L from the Cholesky +* factorization A = U**H*U or A = L*L**H, as computed by +* ZPOTRF. +* On exit, the upper or lower triangle of the (Hermitian) +* inverse of A, overwriting the input factor U or L. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, the (i,i) element of the factor U or L is +* zero, and the inverse could not be computed. +* +* ===================================================================== +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZLAUUM, ZTRTRI +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZPOTRI', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Invert the triangular Cholesky factor U or L. +* + CALL ZTRTRI( UPLO, 'Non-unit', N, A, LDA, INFO ) + IF( INFO.GT.0 ) + $ RETURN +* +* Form inv(U)*inv(U)' or inv(L)'*inv(L). +* + CALL ZLAUUM( UPLO, N, A, LDA, INFO ) +* + RETURN +* +* End of ZPOTRI +* + END diff --git a/reference/zrotgf.f b/reference/zrotgf.f new file mode 100644 index 0000000..d6f1e0d --- /dev/null +++ b/reference/zrotgf.f @@ -0,0 +1,23 @@ + subroutine zrotgf(ca,cb,c,s) + double complex ca,cb,s + double precision c + double precision norm,scale + double complex alpha + if (cdabs(ca) .ne. 0.0d0) go to 10 + c = 0.0d0 + s = (1.0d0,0.0d0) + ca = cb + go to 20 + 10 continue + scale = cdabs(ca) + cdabs(cb) + + norm = scale*dsqrt((cdabs(ca/dcmplx(scale,0.0d0)))**2 + + * (cdabs(cb/dcmplx(scale,0.0d0)))**2) + + alpha = ca /cdabs(ca) + c = cdabs(ca) / norm + s = alpha * dconjg(cb) / norm + ca = alpha * norm + 20 continue + return + end diff --git a/reference/zsbmvf.f b/reference/zsbmvf.f new file mode 100644 index 0000000..2b7787c --- /dev/null +++ b/reference/zsbmvf.f @@ -0,0 +1,306 @@ + SUBROUTINE ZSBMVF(UPLO, N, K, ALPHA, A, LDA, X, INCX, BETA, Y, + $ INCY ) +* +* -- LAPACK auxiliary routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INCX, INCY, K, LDA, N + COMPLEX*16 ALPHA, BETA +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* ZSBMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n symmetric band matrix, with k super-diagonals. +* +* Arguments +* ========== +* +* UPLO - CHARACTER*1 +* On entry, UPLO specifies whether the upper or lower +* triangular part of the band matrix A is being supplied as +* follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* being supplied. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* being supplied. +* +* Unchanged on exit. +* +* N - INTEGER +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* K - INTEGER +* On entry, K specifies the number of super-diagonals of the +* matrix A. K must satisfy 0 .le. K. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array, dimension( LDA, N ) +* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) +* by n part of the array A must contain the upper triangular +* band part of the symmetric matrix, supplied column by +* column, with the leading diagonal of the matrix in row +* ( k + 1 ) of the array, the first super-diagonal starting at +* position 2 in row k, and so on. The top left k by k triangle +* of the array A is not referenced. +* The following program segment will transfer the upper +* triangular part of a symmetric band matrix from conventional +* full matrix storage to band storage: +* +* DO 20, J = 1, N +* M = K + 1 - J +* DO 10, I = MAX( 1, J - K ), J +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) +* by n part of the array A must contain the lower triangular +* band part of the symmetric matrix, supplied column by +* column, with the leading diagonal of the matrix in row 1 of +* the array, the first sub-diagonal starting at position 1 in +* row 2, and so on. The bottom right k by k triangle of the +* array A is not referenced. +* The following program segment will transfer the lower +* triangular part of a symmetric band matrix from conventional +* full matrix storage to band storage: +* +* DO 20, J = 1, N +* M = 1 - J +* DO 10, I = J, MIN( N, J + K ) +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Unchanged on exit. +* +* LDA - INTEGER +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( k + 1 ). +* Unchanged on exit. +* +* X - COMPLEX*16 array, dimension at least +* ( 1 + ( N - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - COMPLEX*16 +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* Y - COMPLEX*16 array, dimension at least +* ( 1 + ( N - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the +* vector y. On exit, Y is overwritten by the updated vector y. +* +* INCY - INTEGER +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, INFO, IX, IY, J, JX, JY, KPLUS1, KX, KY, L + COMPLEX*16 TEMP1, TEMP2 +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = 1 + ELSE IF( N.LT.0 ) THEN + INFO = 2 + ELSE IF( K.LT.0 ) THEN + INFO = 3 + ELSE IF( LDA.LT.( K+1 ) ) THEN + INFO = 6 + ELSE IF( INCX.EQ.0 ) THEN + INFO = 8 + ELSE IF( INCY.EQ.0 ) THEN + INFO = 11 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZSBMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ) .OR. ( ( ALPHA.EQ.ZERO ) .AND. ( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 ) THEN + KX = 1 + ELSE + KX = 1 - ( N-1 )*INCX + END IF + IF( INCY.GT.0 ) THEN + KY = 1 + ELSE + KY = 1 - ( N-1 )*INCY + END IF +* +* Start the operations. In this version the elements of the array A +* are accessed sequentially with one pass through A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE ) THEN + IF( INCY.EQ.1 ) THEN + IF( BETA.EQ.ZERO ) THEN + DO 10 I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20 I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO ) THEN + DO 30 I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40 I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + IF( LSAME( UPLO, 'U' ) ) THEN +* +* Form y when upper triangle of A is stored. +* + KPLUS1 = K + 1 + IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN + DO 60 J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + L = KPLUS1 - J + DO 50 I = MAX( 1, J-K ), J - 1 + Y( I ) = Y( I ) + TEMP1*A( L+I, J ) + TEMP2 = TEMP2 + A( L+I, J )*X( I ) + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*A( KPLUS1, J ) + ALPHA*TEMP2 + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80 J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + L = KPLUS1 - J + DO 70 I = MAX( 1, J-K ), J - 1 + Y( IY ) = Y( IY ) + TEMP1*A( L+I, J ) + TEMP2 = TEMP2 + A( L+I, J )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*A( KPLUS1, J ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + IF( J.GT.K ) THEN + KX = KX + INCX + KY = KY + INCY + END IF + 80 CONTINUE + END IF + ELSE +* +* Form y when lower triangle of A is stored. +* + IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN + DO 100 J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*A( 1, J ) + L = 1 - J + DO 90 I = J + 1, MIN( N, J+K ) + Y( I ) = Y( I ) + TEMP1*A( L+I, J ) + TEMP2 = TEMP2 + A( L+I, J )*X( I ) + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120 J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*A( 1, J ) + L = 1 - J + IX = JX + IY = JY + DO 110 I = J + 1, MIN( N, J+K ) + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*A( L+I, J ) + TEMP2 = TEMP2 + A( L+I, J )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZSBMV +* + END diff --git a/reference/zscalf.f b/reference/zscalf.f new file mode 100644 index 0000000..f9c2c53 --- /dev/null +++ b/reference/zscalf.f @@ -0,0 +1,29 @@ + subroutine zscalf(n,za,zx,incx) +c +c scales a vector by a constant. +c jack dongarra, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double complex za,zx(*) + integer i,incx,ix,n +c + if( n.le.0 .or. incx.le.0 )return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + do 10 i = 1,n + zx(ix) = za*zx(ix) + ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 do 30 i = 1,n + zx(i) = za*zx(i) + 30 continue + return + end diff --git a/reference/zspmvf.f b/reference/zspmvf.f new file mode 100644 index 0000000..8c6057e --- /dev/null +++ b/reference/zspmvf.f @@ -0,0 +1,264 @@ + SUBROUTINE ZSPMVF(UPLO, N, ALPHA, AP, X, INCX, BETA, Y, INCY ) +* +* -- LAPACK auxiliary routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INCX, INCY, N + COMPLEX*16 ALPHA, BETA +* .. +* .. Array Arguments .. + COMPLEX*16 AP( * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* ZSPMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n symmetric matrix, supplied in packed form. +* +* Arguments +* ========== +* +* UPLO (input) CHARACTER*1 +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N (input) INTEGER +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA (input) COMPLEX*16 +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* AP (input) COMPLEX*16 array, dimension at least +* ( ( N*( N + 1 ) )/2 ). +* Before entry, with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. +* Before entry, with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. +* Unchanged on exit. +* +* X (input) COMPLEX*16 array, dimension at least +* ( 1 + ( N - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the N- +* element vector x. +* Unchanged on exit. +* +* INCX (input) INTEGER +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA (input) COMPLEX*16 +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y (input/output) COMPLEX*16 array, dimension at least +* ( 1 + ( N - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. On exit, Y is overwritten by the updated +* vector y. +* +* INCY (input) INTEGER +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY + COMPLEX*16 TEMP1, TEMP2 +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = 1 + ELSE IF( N.LT.0 ) THEN + INFO = 2 + ELSE IF( INCX.EQ.0 ) THEN + INFO = 6 + ELSE IF( INCY.EQ.0 ) THEN + INFO = 9 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZSPMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ) .OR. ( ( ALPHA.EQ.ZERO ) .AND. ( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 ) THEN + KX = 1 + ELSE + KX = 1 - ( N-1 )*INCX + END IF + IF( INCY.GT.0 ) THEN + KY = 1 + ELSE + KY = 1 - ( N-1 )*INCY + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE ) THEN + IF( INCY.EQ.1 ) THEN + IF( BETA.EQ.ZERO ) THEN + DO 10 I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20 I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO ) THEN + DO 30 I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40 I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + KK = 1 + IF( LSAME( UPLO, 'U' ) ) THEN +* +* Form y when AP contains the upper triangle. +* + IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN + DO 60 J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + K = KK + DO 50 I = 1, J - 1 + Y( I ) = Y( I ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( I ) + K = K + 1 + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*AP( KK+J-1 ) + ALPHA*TEMP2 + KK = KK + J + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80 J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + DO 70 K = KK, KK + J - 2 + Y( IY ) = Y( IY ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*AP( KK+J-1 ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + KK = KK + J + 80 CONTINUE + END IF + ELSE +* +* Form y when AP contains the lower triangle. +* + IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN + DO 100 J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*AP( KK ) + K = KK + 1 + DO 90 I = J + 1, N + Y( I ) = Y( I ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( I ) + K = K + 1 + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + KK = KK + ( N-J+1 ) + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120 J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*AP( KK ) + IX = JX + IY = JY + DO 110 K = KK + 1, KK + N - J + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + KK = KK + ( N-J+1 ) + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZSPMV +* + END diff --git a/reference/zspr2f.f b/reference/zspr2f.f new file mode 100644 index 0000000..aad5f71 --- /dev/null +++ b/reference/zspr2f.f @@ -0,0 +1,229 @@ + SUBROUTINE ZSPR2F( UPLO, N, ALPHA, X, INCX, Y, INCY, AP ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA + INTEGER INCX, INCY, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX*16 AP( * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* DSPR2 performs the symmetric rank 2 operation +* +* A := alpha*x*y' + alpha*y*x' + A, +* +* where alpha is a scalar, x and y are n element vectors and A is an +* n by n symmetric matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* AP - DOUBLE PRECISION array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. On exit, the array +* AP is overwritten by the upper triangular part of the +* updated matrix. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. On exit, the array +* AP is overwritten by the lower triangular part of the +* updated matrix. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = 0.0D+0 ) +* .. Local Scalars .. + COMPLEX*16 TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DSPR2 ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set up the start points in X and Y if the increments are not both +* unity. +* + IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF + JX = KX + JY = KY + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* + KK = 1 + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when upper triangle is stored in AP. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 20, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + K = KK + DO 10, I = 1, J + AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 + K = K + 1 + 10 CONTINUE + END IF + KK = KK + J + 20 CONTINUE + ELSE + DO 40, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = KX + IY = KY + DO 30, K = KK, KK + J - 1 + AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 30 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + KK = KK + J + 40 CONTINUE + END IF + ELSE +* +* Form A when lower triangle is stored in AP. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + K = KK + DO 50, I = J, N + AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 + K = K + 1 + 50 CONTINUE + END IF + KK = KK + N - J + 1 + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = JX + IY = JY + DO 70, K = KK, KK + N - J + AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + KK = KK + N - J + 1 + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of DSPR2 . +* + END diff --git a/reference/zsprf.f b/reference/zsprf.f new file mode 100644 index 0000000..c21f602 --- /dev/null +++ b/reference/zsprf.f @@ -0,0 +1,213 @@ + SUBROUTINE ZSPRF( UPLO, N, ALPHA, X, INCX, AP ) +* +* -- LAPACK auxiliary routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INCX, N + COMPLEX*16 ALPHA +* .. +* .. Array Arguments .. + COMPLEX*16 AP( * ), X( * ) +* .. +* +* Purpose +* ======= +* +* ZSPR performs the symmetric rank 1 operation +* +* A := alpha*x*conjg( x' ) + A, +* +* where alpha is a complex scalar, x is an n element vector and A is an +* n by n symmetric matrix, supplied in packed form. +* +* Arguments +* ========== +* +* UPLO (input) CHARACTER*1 +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N (input) INTEGER +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA (input) COMPLEX*16 +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X (input) COMPLEX*16 array, dimension at least +* ( 1 + ( N - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the N- +* element vector x. +* Unchanged on exit. +* +* INCX (input) INTEGER +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* AP (input/output) COMPLEX*16 array, dimension at least +* ( ( N*( N + 1 ) )/2 ). +* Before entry, with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. On exit, the array +* AP is overwritten by the upper triangular part of the +* updated matrix. +* Before entry, with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. On exit, the array +* AP is overwritten by the lower triangular part of the +* updated matrix. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero, and on exit they +* are set to zero. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, INFO, IX, J, JX, K, KK, KX + COMPLEX*16 TEMP +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = 1 + ELSE IF( N.LT.0 ) THEN + INFO = 2 + ELSE IF( INCX.EQ.0 ) THEN + INFO = 5 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZSPR ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ) .OR. ( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set the start point in X if the increment is not unity. +* + IF( INCX.LE.0 ) THEN + KX = 1 - ( N-1 )*INCX + ELSE IF( INCX.NE.1 ) THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* + KK = 1 + IF( LSAME( UPLO, 'U' ) ) THEN +* +* Form A when upper triangle is stored in AP. +* + IF( INCX.EQ.1 ) THEN + DO 20 J = 1, N + IF( X( J ).NE.ZERO ) THEN + TEMP = ALPHA*X( J ) + K = KK + DO 10 I = 1, J - 1 + AP( K ) = AP( K ) + X( I )*TEMP + K = K + 1 + 10 CONTINUE + AP( KK+J-1 ) = AP( KK+J-1 ) + X( J )*TEMP + ELSE + AP( KK+J-1 ) = AP( KK+J-1 ) + END IF + KK = KK + J + 20 CONTINUE + ELSE + JX = KX + DO 40 J = 1, N + IF( X( JX ).NE.ZERO ) THEN + TEMP = ALPHA*X( JX ) + IX = KX + DO 30 K = KK, KK + J - 2 + AP( K ) = AP( K ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + AP( KK+J-1 ) = AP( KK+J-1 ) + X( JX )*TEMP + ELSE + AP( KK+J-1 ) = AP( KK+J-1 ) + END IF + JX = JX + INCX + KK = KK + J + 40 CONTINUE + END IF + ELSE +* +* Form A when lower triangle is stored in AP. +* + IF( INCX.EQ.1 ) THEN + DO 60 J = 1, N + IF( X( J ).NE.ZERO ) THEN + TEMP = ALPHA*X( J ) + AP( KK ) = AP( KK ) + TEMP*X( J ) + K = KK + 1 + DO 50 I = J + 1, N + AP( K ) = AP( K ) + X( I )*TEMP + K = K + 1 + 50 CONTINUE + ELSE + AP( KK ) = AP( KK ) + END IF + KK = KK + N - J + 1 + 60 CONTINUE + ELSE + JX = KX + DO 80 J = 1, N + IF( X( JX ).NE.ZERO ) THEN + TEMP = ALPHA*X( JX ) + AP( KK ) = AP( KK ) + TEMP*X( JX ) + IX = JX + DO 70 K = KK + 1, KK + N - J + IX = IX + INCX + AP( K ) = AP( K ) + X( IX )*TEMP + 70 CONTINUE + ELSE + AP( KK ) = AP( KK ) + END IF + JX = JX + INCX + KK = KK + N - J + 1 + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZSPR +* + END diff --git a/reference/zswapf.f b/reference/zswapf.f new file mode 100644 index 0000000..f42d7ec --- /dev/null +++ b/reference/zswapf.f @@ -0,0 +1,36 @@ + subroutine zswapf (n,zx,incx,zy,incy) +c +c interchanges two vectors. +c jack dongarra, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double complex zx(*),zy(*),ztemp + integer i,incx,incy,ix,iy,n +c + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments not equal +c to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + ztemp = zx(ix) + zx(ix) = zy(iy) + zy(iy) = ztemp + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 + 20 do 30 i = 1,n + ztemp = zx(i) + zx(i) = zy(i) + zy(i) = ztemp + 30 continue + return + end diff --git a/reference/zsymm3mf.f b/reference/zsymm3mf.f new file mode 100644 index 0000000..82423ba --- /dev/null +++ b/reference/zsymm3mf.f @@ -0,0 +1,296 @@ + SUBROUTINE ZSYMM3MF( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO + INTEGER M, N, LDA, LDB, LDC + COMPLEX*16 ALPHA, BETA +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* ZSYMM performs one of the matrix-matrix operations +* +* C := alpha*A*B + beta*C, +* +* or +* +* C := alpha*B*A + beta*C, +* +* where alpha and beta are scalars, A is a symmetric matrix and B and +* C are m by n matrices. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether the symmetric matrix A +* appears on the left or right in the operation as follows: +* +* SIDE = 'L' or 'l' C := alpha*A*B + beta*C, +* +* SIDE = 'R' or 'r' C := alpha*B*A + beta*C, +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the symmetric matrix A is to be +* referenced as follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of the +* symmetric matrix is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of the +* symmetric matrix is to be referenced. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix C. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix C. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is +* m when SIDE = 'L' or 'l' and is n otherwise. +* Before entry with SIDE = 'L' or 'l', the m by m part of +* the array A must contain the symmetric matrix, such that +* when UPLO = 'U' or 'u', the leading m by m upper triangular +* part of the array A must contain the upper triangular part +* of the symmetric matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading m by m lower triangular part of the array A +* must contain the lower triangular part of the symmetric +* matrix and the strictly upper triangular part of A is not +* referenced. +* Before entry with SIDE = 'R' or 'r', the n by n part of +* the array A must contain the symmetric matrix, such that +* when UPLO = 'U' or 'u', the leading n by n upper triangular +* part of the array A must contain the upper triangular part +* of the symmetric matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading n by n lower triangular part of the array A +* must contain the lower triangular part of the symmetric +* matrix and the strictly upper triangular part of A is not +* referenced. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, n ). +* Unchanged on exit. +* +* B - COMPLEX*16 array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* BETA - COMPLEX*16 . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - COMPLEX*16 array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n updated +* matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, K, NROWA + COMPLEX*16 TEMP1, TEMP2 +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Executable Statements .. +* +* Set NROWA as the number of rows of A. +* + IF( LSAME( SIDE, 'L' ) )THEN + NROWA = M + ELSE + NROWA = N + END IF + UPPER = LSAME( UPLO, 'U' ) +* +* Test the input parameters. +* + INFO = 0 + IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. + $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN + INFO = 2 + ELSE IF( M .LT.0 )THEN + INFO = 3 + ELSE IF( N .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, M ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZSYMM ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, M + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( SIDE, 'L' ) )THEN +* +* Form C := alpha*A*B + beta*C. +* + IF( UPPER )THEN + DO 70, J = 1, N + DO 60, I = 1, M + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 50, K = 1, I - 1 + C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) + TEMP2 = TEMP2 + B( K, J )*A( K, I ) + 50 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*A( I, I ) + ALPHA*TEMP2 + END IF + 60 CONTINUE + 70 CONTINUE + ELSE + DO 100, J = 1, N + DO 90, I = M, 1, -1 + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 80, K = I + 1, M + C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) + TEMP2 = TEMP2 + B( K, J )*A( K, I ) + 80 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*A( I, I ) + ALPHA*TEMP2 + END IF + 90 CONTINUE + 100 CONTINUE + END IF + ELSE +* +* Form C := alpha*B*A + beta*C. +* + DO 170, J = 1, N + TEMP1 = ALPHA*A( J, J ) + IF( BETA.EQ.ZERO )THEN + DO 110, I = 1, M + C( I, J ) = TEMP1*B( I, J ) + 110 CONTINUE + ELSE + DO 120, I = 1, M + C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) + 120 CONTINUE + END IF + DO 140, K = 1, J - 1 + IF( UPPER )THEN + TEMP1 = ALPHA*A( K, J ) + ELSE + TEMP1 = ALPHA*A( J, K ) + END IF + DO 130, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 130 CONTINUE + 140 CONTINUE + DO 160, K = J + 1, N + IF( UPPER )THEN + TEMP1 = ALPHA*A( J, K ) + ELSE + TEMP1 = ALPHA*A( K, J ) + END IF + DO 150, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 150 CONTINUE + 160 CONTINUE + 170 CONTINUE + END IF +* + RETURN +* +* End of ZSYMM . +* + END diff --git a/reference/zsymmf.f b/reference/zsymmf.f new file mode 100644 index 0000000..ce24be4 --- /dev/null +++ b/reference/zsymmf.f @@ -0,0 +1,296 @@ + SUBROUTINE ZSYMMF ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO + INTEGER M, N, LDA, LDB, LDC + COMPLEX*16 ALPHA, BETA +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* ZSYMM performs one of the matrix-matrix operations +* +* C := alpha*A*B + beta*C, +* +* or +* +* C := alpha*B*A + beta*C, +* +* where alpha and beta are scalars, A is a symmetric matrix and B and +* C are m by n matrices. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether the symmetric matrix A +* appears on the left or right in the operation as follows: +* +* SIDE = 'L' or 'l' C := alpha*A*B + beta*C, +* +* SIDE = 'R' or 'r' C := alpha*B*A + beta*C, +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the symmetric matrix A is to be +* referenced as follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of the +* symmetric matrix is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of the +* symmetric matrix is to be referenced. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix C. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix C. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is +* m when SIDE = 'L' or 'l' and is n otherwise. +* Before entry with SIDE = 'L' or 'l', the m by m part of +* the array A must contain the symmetric matrix, such that +* when UPLO = 'U' or 'u', the leading m by m upper triangular +* part of the array A must contain the upper triangular part +* of the symmetric matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading m by m lower triangular part of the array A +* must contain the lower triangular part of the symmetric +* matrix and the strictly upper triangular part of A is not +* referenced. +* Before entry with SIDE = 'R' or 'r', the n by n part of +* the array A must contain the symmetric matrix, such that +* when UPLO = 'U' or 'u', the leading n by n upper triangular +* part of the array A must contain the upper triangular part +* of the symmetric matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading n by n lower triangular part of the array A +* must contain the lower triangular part of the symmetric +* matrix and the strictly upper triangular part of A is not +* referenced. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, n ). +* Unchanged on exit. +* +* B - COMPLEX*16 array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* BETA - COMPLEX*16 . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - COMPLEX*16 array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n updated +* matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, K, NROWA + COMPLEX*16 TEMP1, TEMP2 +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Executable Statements .. +* +* Set NROWA as the number of rows of A. +* + IF( LSAME( SIDE, 'L' ) )THEN + NROWA = M + ELSE + NROWA = N + END IF + UPPER = LSAME( UPLO, 'U' ) +* +* Test the input parameters. +* + INFO = 0 + IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. + $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN + INFO = 2 + ELSE IF( M .LT.0 )THEN + INFO = 3 + ELSE IF( N .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, M ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZSYMM ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, M + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( SIDE, 'L' ) )THEN +* +* Form C := alpha*A*B + beta*C. +* + IF( UPPER )THEN + DO 70, J = 1, N + DO 60, I = 1, M + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 50, K = 1, I - 1 + C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) + TEMP2 = TEMP2 + B( K, J )*A( K, I ) + 50 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*A( I, I ) + ALPHA*TEMP2 + END IF + 60 CONTINUE + 70 CONTINUE + ELSE + DO 100, J = 1, N + DO 90, I = M, 1, -1 + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 80, K = I + 1, M + C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) + TEMP2 = TEMP2 + B( K, J )*A( K, I ) + 80 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*A( I, I ) + ALPHA*TEMP2 + END IF + 90 CONTINUE + 100 CONTINUE + END IF + ELSE +* +* Form C := alpha*B*A + beta*C. +* + DO 170, J = 1, N + TEMP1 = ALPHA*A( J, J ) + IF( BETA.EQ.ZERO )THEN + DO 110, I = 1, M + C( I, J ) = TEMP1*B( I, J ) + 110 CONTINUE + ELSE + DO 120, I = 1, M + C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) + 120 CONTINUE + END IF + DO 140, K = 1, J - 1 + IF( UPPER )THEN + TEMP1 = ALPHA*A( K, J ) + ELSE + TEMP1 = ALPHA*A( J, K ) + END IF + DO 130, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 130 CONTINUE + 140 CONTINUE + DO 160, K = J + 1, N + IF( UPPER )THEN + TEMP1 = ALPHA*A( J, K ) + ELSE + TEMP1 = ALPHA*A( K, J ) + END IF + DO 150, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 150 CONTINUE + 160 CONTINUE + 170 CONTINUE + END IF +* + RETURN +* +* End of ZSYMM . +* + END diff --git a/reference/zsymvf.f b/reference/zsymvf.f new file mode 100644 index 0000000..7161f1a --- /dev/null +++ b/reference/zsymvf.f @@ -0,0 +1,264 @@ + SUBROUTINE ZSYMVF(UPLO, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY ) +* +* -- LAPACK auxiliary routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INCX, INCY, LDA, N + COMPLEX*16 ALPHA, BETA +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* ZSYMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n symmetric matrix. +* +* Arguments +* ========== +* +* UPLO (input) CHARACTER*1 +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N (input) INTEGER +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA (input) COMPLEX*16 +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A (input) COMPLEX*16 array, dimension ( LDA, N ) +* Before entry, with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of A is not referenced. +* Before entry, with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of A is not referenced. +* Unchanged on exit. +* +* LDA (input) INTEGER +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, N ). +* Unchanged on exit. +* +* X (input) COMPLEX*16 array, dimension at least +* ( 1 + ( N - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the N- +* element vector x. +* Unchanged on exit. +* +* INCX (input) INTEGER +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA (input) COMPLEX*16 +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y (input/output) COMPLEX*16 array, dimension at least +* ( 1 + ( N - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. On exit, Y is overwritten by the updated +* vector y. +* +* INCY (input) INTEGER +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY + COMPLEX*16 TEMP1, TEMP2 +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = 1 + ELSE IF( N.LT.0 ) THEN + INFO = 2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = 5 + ELSE IF( INCX.EQ.0 ) THEN + INFO = 7 + ELSE IF( INCY.EQ.0 ) THEN + INFO = 10 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZSYMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ) .OR. ( ( ALPHA.EQ.ZERO ) .AND. ( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 ) THEN + KX = 1 + ELSE + KX = 1 - ( N-1 )*INCX + END IF + IF( INCY.GT.0 ) THEN + KY = 1 + ELSE + KY = 1 - ( N-1 )*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE ) THEN + IF( INCY.EQ.1 ) THEN + IF( BETA.EQ.ZERO ) THEN + DO 10 I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20 I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO ) THEN + DO 30 I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40 I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + IF( LSAME( UPLO, 'U' ) ) THEN +* +* Form y when A is stored in upper triangle. +* + IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN + DO 60 J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + DO 50 I = 1, J - 1 + Y( I ) = Y( I ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( I ) + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*A( J, J ) + ALPHA*TEMP2 + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80 J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + DO 70 I = 1, J - 1 + Y( IY ) = Y( IY ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*A( J, J ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 80 CONTINUE + END IF + ELSE +* +* Form y when A is stored in lower triangle. +* + IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN + DO 100 J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*A( J, J ) + DO 90 I = J + 1, N + Y( I ) = Y( I ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( I ) + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120 J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*A( J, J ) + IX = JX + IY = JY + DO 110 I = J + 1, N + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZSYMV +* + END diff --git a/reference/zsyr2f.f b/reference/zsyr2f.f new file mode 100644 index 0000000..d77e4d2 --- /dev/null +++ b/reference/zsyr2f.f @@ -0,0 +1,230 @@ + SUBROUTINE ZSYR2F ( UPLO, N, ALPHA, X, INCX, Y, INCY, A, LDA ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA + INTEGER INCX, INCY, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* DSYR2 performs the symmetric rank 2 operation +* +* A := alpha*x*y' + alpha*y*x' + A, +* +* where alpha is a scalar, x and y are n element vectors and A is an n +* by n symmetric matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of A is not referenced. On exit, the +* upper triangular part of the array A is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of A is not referenced. On exit, the +* lower triangular part of the array A is overwritten by the +* lower triangular part of the updated matrix. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = 0.0D+0 ) +* .. Local Scalars .. + COMPLEX*16 TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DSYR2 ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set up the start points in X and Y if the increments are not both +* unity. +* + IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF + JX = KX + JY = KY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when A is stored in the upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 20, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + DO 10, I = 1, J + A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 + 10 CONTINUE + END IF + 20 CONTINUE + ELSE + DO 40, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = KX + IY = KY + DO 30, I = 1, J + A( I, J ) = A( I, J ) + X( IX )*TEMP1 + $ + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 30 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + 40 CONTINUE + END IF + ELSE +* +* Form A when A is stored in the lower triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + DO 50, I = J, N + A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 + 50 CONTINUE + END IF + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = JX + IY = JY + DO 70, I = J, N + A( I, J ) = A( I, J ) + X( IX )*TEMP1 + $ + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of DSYR2 . +* + END diff --git a/reference/zsyr2kf.f b/reference/zsyr2kf.f new file mode 100644 index 0000000..f6f0992 --- /dev/null +++ b/reference/zsyr2kf.f @@ -0,0 +1,324 @@ + SUBROUTINE ZSYR2KF( UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 UPLO, TRANS + INTEGER N, K, LDA, LDB, LDC + COMPLEX*16 ALPHA, BETA +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* ZSYR2K performs one of the symmetric rank 2k operations +* +* C := alpha*A*B' + alpha*B*A' + beta*C, +* +* or +* +* C := alpha*A'*B + alpha*B'*A + beta*C, +* +* where alpha and beta are scalars, C is an n by n symmetric matrix +* and A and B are n by k matrices in the first case and k by n +* matrices in the second case. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array C is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of C +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of C +* is to be referenced. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' C := alpha*A*B' + alpha*B*A' + +* beta*C. +* +* TRANS = 'T' or 't' C := alpha*A'*B + alpha*B'*A + +* beta*C. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with TRANS = 'N' or 'n', K specifies the number +* of columns of the matrices A and B, and on entry with +* TRANS = 'T' or 't', K specifies the number of rows of the +* matrices A and B. K must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array A must contain the matrix A, otherwise +* the leading k by n part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDA must be at least max( 1, n ), otherwise LDA must +* be at least max( 1, k ). +* Unchanged on exit. +* +* B - COMPLEX*16 array of DIMENSION ( LDB, kb ), where kb is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array B must contain the matrix B, otherwise +* the leading k by n part of the array B must contain the +* matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDB must be at least max( 1, n ), otherwise LDB must +* be at least max( 1, k ). +* Unchanged on exit. +* +* BETA - COMPLEX*16 . +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* C - COMPLEX*16 array of DIMENSION ( LDC, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array C must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of C is not referenced. On exit, the +* upper triangular part of the array C is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array C must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of C is not referenced. On exit, the +* lower triangular part of the array C is overwritten by the +* lower triangular part of the updated matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, L, NROWA + COMPLEX*16 TEMP1, TEMP2 +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + IF( LSAME( TRANS, 'N' ) )THEN + NROWA = N + ELSE + NROWA = K + END IF + UPPER = LSAME( UPLO, 'U' ) +* + INFO = 0 + IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANS, 'T' ) ) )THEN + INFO = 2 + ELSE IF( N .LT.0 )THEN + INFO = 3 + ELSE IF( K .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, NROWA ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, N ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZSYR2K', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR. + $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( UPPER )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, J + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, J + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE + IF( BETA.EQ.ZERO )THEN + DO 60, J = 1, N + DO 50, I = J, N + C( I, J ) = ZERO + 50 CONTINUE + 60 CONTINUE + ELSE + DO 80, J = 1, N + DO 70, I = J, N + C( I, J ) = BETA*C( I, J ) + 70 CONTINUE + 80 CONTINUE + END IF + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form C := alpha*A*B' + alpha*B*A' + C. +* + IF( UPPER )THEN + DO 130, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 90, I = 1, J + C( I, J ) = ZERO + 90 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 100, I = 1, J + C( I, J ) = BETA*C( I, J ) + 100 CONTINUE + END IF + DO 120, L = 1, K + IF( ( A( J, L ).NE.ZERO ).OR. + $ ( B( J, L ).NE.ZERO ) )THEN + TEMP1 = ALPHA*B( J, L ) + TEMP2 = ALPHA*A( J, L ) + DO 110, I = 1, J + C( I, J ) = C( I, J ) + A( I, L )*TEMP1 + + $ B( I, L )*TEMP2 + 110 CONTINUE + END IF + 120 CONTINUE + 130 CONTINUE + ELSE + DO 180, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 140, I = J, N + C( I, J ) = ZERO + 140 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 150, I = J, N + C( I, J ) = BETA*C( I, J ) + 150 CONTINUE + END IF + DO 170, L = 1, K + IF( ( A( J, L ).NE.ZERO ).OR. + $ ( B( J, L ).NE.ZERO ) )THEN + TEMP1 = ALPHA*B( J, L ) + TEMP2 = ALPHA*A( J, L ) + DO 160, I = J, N + C( I, J ) = C( I, J ) + A( I, L )*TEMP1 + + $ B( I, L )*TEMP2 + 160 CONTINUE + END IF + 170 CONTINUE + 180 CONTINUE + END IF + ELSE +* +* Form C := alpha*A'*B + alpha*B'*A + C. +* + IF( UPPER )THEN + DO 210, J = 1, N + DO 200, I = 1, J + TEMP1 = ZERO + TEMP2 = ZERO + DO 190, L = 1, K + TEMP1 = TEMP1 + A( L, I )*B( L, J ) + TEMP2 = TEMP2 + B( L, I )*A( L, J ) + 190 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ ALPHA*TEMP1 + ALPHA*TEMP2 + END IF + 200 CONTINUE + 210 CONTINUE + ELSE + DO 240, J = 1, N + DO 230, I = J, N + TEMP1 = ZERO + TEMP2 = ZERO + DO 220, L = 1, K + TEMP1 = TEMP1 + A( L, I )*B( L, J ) + TEMP2 = TEMP2 + B( L, I )*A( L, J ) + 220 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ ALPHA*TEMP1 + ALPHA*TEMP2 + END IF + 230 CONTINUE + 240 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZSYR2K. +* + END diff --git a/reference/zsyrf.f b/reference/zsyrf.f new file mode 100644 index 0000000..4262ed9 --- /dev/null +++ b/reference/zsyrf.f @@ -0,0 +1,198 @@ + SUBROUTINE ZSYRF( UPLO, N, ALPHA, X, INCX, A, LDA ) +* +* -- LAPACK auxiliary routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INCX, LDA, N + COMPLEX*16 ALPHA +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* ZSYR performs the symmetric rank 1 operation +* +* A := alpha*x*( x' ) + A, +* +* where alpha is a complex scalar, x is an n element vector and A is an +* n by n symmetric matrix. +* +* Arguments +* ========== +* +* UPLO (input) CHARACTER*1 +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N (input) INTEGER +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA (input) COMPLEX*16 +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X (input) COMPLEX*16 array, dimension at least +* ( 1 + ( N - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the N- +* element vector x. +* Unchanged on exit. +* +* INCX (input) INTEGER +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* A (input/output) COMPLEX*16 array, dimension ( LDA, N ) +* Before entry, with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of A is not referenced. On exit, the +* upper triangular part of the array A is overwritten by the +* upper triangular part of the updated matrix. +* Before entry, with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of A is not referenced. On exit, the +* lower triangular part of the array A is overwritten by the +* lower triangular part of the updated matrix. +* +* LDA (input) INTEGER +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, N ). +* Unchanged on exit. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, INFO, IX, J, JX, KX + COMPLEX*16 TEMP +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = 1 + ELSE IF( N.LT.0 ) THEN + INFO = 2 + ELSE IF( INCX.EQ.0 ) THEN + INFO = 5 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = 7 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZSYR ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ) .OR. ( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set the start point in X if the increment is not unity. +* + IF( INCX.LE.0 ) THEN + KX = 1 - ( N-1 )*INCX + ELSE IF( INCX.NE.1 ) THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* + IF( LSAME( UPLO, 'U' ) ) THEN +* +* Form A when A is stored in upper triangle. +* + IF( INCX.EQ.1 ) THEN + DO 20 J = 1, N + IF( X( J ).NE.ZERO ) THEN + TEMP = ALPHA*X( J ) + DO 10 I = 1, J + A( I, J ) = A( I, J ) + X( I )*TEMP + 10 CONTINUE + END IF + 20 CONTINUE + ELSE + JX = KX + DO 40 J = 1, N + IF( X( JX ).NE.ZERO ) THEN + TEMP = ALPHA*X( JX ) + IX = KX + DO 30 I = 1, J + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + END IF + JX = JX + INCX + 40 CONTINUE + END IF + ELSE +* +* Form A when A is stored in lower triangle. +* + IF( INCX.EQ.1 ) THEN + DO 60 J = 1, N + IF( X( J ).NE.ZERO ) THEN + TEMP = ALPHA*X( J ) + DO 50 I = J, N + A( I, J ) = A( I, J ) + X( I )*TEMP + 50 CONTINUE + END IF + 60 CONTINUE + ELSE + JX = KX + DO 80 J = 1, N + IF( X( JX ).NE.ZERO ) THEN + TEMP = ALPHA*X( JX ) + IX = JX + DO 70 I = J, N + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZSYR +* + END diff --git a/reference/zsyrkf.f b/reference/zsyrkf.f new file mode 100644 index 0000000..99bfa82 --- /dev/null +++ b/reference/zsyrkf.f @@ -0,0 +1,293 @@ + SUBROUTINE ZSYRKF ( UPLO, TRANS, N, K, ALPHA, A, LDA, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 UPLO, TRANS + INTEGER N, K, LDA, LDC + COMPLEX*16 ALPHA, BETA +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* ZSYRK performs one of the symmetric rank k operations +* +* C := alpha*A*A' + beta*C, +* +* or +* +* C := alpha*A'*A + beta*C, +* +* where alpha and beta are scalars, C is an n by n symmetric matrix +* and A is an n by k matrix in the first case and a k by n matrix +* in the second case. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array C is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of C +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of C +* is to be referenced. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' C := alpha*A*A' + beta*C. +* +* TRANS = 'T' or 't' C := alpha*A'*A + beta*C. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with TRANS = 'N' or 'n', K specifies the number +* of columns of the matrix A, and on entry with +* TRANS = 'T' or 't', K specifies the number of rows of the +* matrix A. K must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array A must contain the matrix A, otherwise +* the leading k by n part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDA must be at least max( 1, n ), otherwise LDA must +* be at least max( 1, k ). +* Unchanged on exit. +* +* BETA - COMPLEX*16 . +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* C - COMPLEX*16 array of DIMENSION ( LDC, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array C must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of C is not referenced. On exit, the +* upper triangular part of the array C is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array C must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of C is not referenced. On exit, the +* lower triangular part of the array C is overwritten by the +* lower triangular part of the updated matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, L, NROWA + COMPLEX*16 TEMP +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + IF( LSAME( TRANS, 'N' ) )THEN + NROWA = N + ELSE + NROWA = K + END IF + UPPER = LSAME( UPLO, 'U' ) +* + INFO = 0 + IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANS, 'T' ) ) )THEN + INFO = 2 + ELSE IF( N .LT.0 )THEN + INFO = 3 + ELSE IF( K .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDC.LT.MAX( 1, N ) )THEN + INFO = 10 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZSYRK ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR. + $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( UPPER )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, J + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, J + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE + IF( BETA.EQ.ZERO )THEN + DO 60, J = 1, N + DO 50, I = J, N + C( I, J ) = ZERO + 50 CONTINUE + 60 CONTINUE + ELSE + DO 80, J = 1, N + DO 70, I = J, N + C( I, J ) = BETA*C( I, J ) + 70 CONTINUE + 80 CONTINUE + END IF + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form C := alpha*A*A' + beta*C. +* + IF( UPPER )THEN + DO 130, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 90, I = 1, J + C( I, J ) = ZERO + 90 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 100, I = 1, J + C( I, J ) = BETA*C( I, J ) + 100 CONTINUE + END IF + DO 120, L = 1, K + IF( A( J, L ).NE.ZERO )THEN + TEMP = ALPHA*A( J, L ) + DO 110, I = 1, J + C( I, J ) = C( I, J ) + TEMP*A( I, L ) + 110 CONTINUE + END IF + 120 CONTINUE + 130 CONTINUE + ELSE + DO 180, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 140, I = J, N + C( I, J ) = ZERO + 140 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 150, I = J, N + C( I, J ) = BETA*C( I, J ) + 150 CONTINUE + END IF + DO 170, L = 1, K + IF( A( J, L ).NE.ZERO )THEN + TEMP = ALPHA*A( J, L ) + DO 160, I = J, N + C( I, J ) = C( I, J ) + TEMP*A( I, L ) + 160 CONTINUE + END IF + 170 CONTINUE + 180 CONTINUE + END IF + ELSE +* +* Form C := alpha*A'*A + beta*C. +* + IF( UPPER )THEN + DO 210, J = 1, N + DO 200, I = 1, J + TEMP = ZERO + DO 190, L = 1, K + TEMP = TEMP + A( L, I )*A( L, J ) + 190 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP + ELSE + C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) + END IF + 200 CONTINUE + 210 CONTINUE + ELSE + DO 240, J = 1, N + DO 230, I = J, N + TEMP = ZERO + DO 220, L = 1, K + TEMP = TEMP + A( L, I )*A( L, J ) + 220 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP + ELSE + C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) + END IF + 230 CONTINUE + 240 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZSYRK . +* + END diff --git a/reference/ztbmvf.f b/reference/ztbmvf.f new file mode 100644 index 0000000..8df5609 --- /dev/null +++ b/reference/ztbmvf.f @@ -0,0 +1,378 @@ + SUBROUTINE ZTBMVF( UPLO, TRANS, DIAG, N, K, A, LDA, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, K, LDA, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* ZTBMV performs one of the matrix-vector operations +* +* x := A*x, or x := A'*x, or x := conjg( A' )*x, +* +* where x is an n element vector and A is an n by n unit, or non-unit, +* upper or lower triangular band matrix, with ( k + 1 ) diagonals. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' x := A*x. +* +* TRANS = 'T' or 't' x := A'*x. +* +* TRANS = 'C' or 'c' x := conjg( A' )*x. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with UPLO = 'U' or 'u', K specifies the number of +* super-diagonals of the matrix A. +* On entry with UPLO = 'L' or 'l', K specifies the number of +* sub-diagonals of the matrix A. +* K must satisfy 0 .le. K. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) +* by n part of the array A must contain the upper triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row +* ( k + 1 ) of the array, the first super-diagonal starting at +* position 2 in row k, and so on. The top left k by k triangle +* of the array A is not referenced. +* The following program segment will transfer an upper +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = K + 1 - J +* DO 10, I = MAX( 1, J - K ), J +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) +* by n part of the array A must contain the lower triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row 1 of +* the array, the first sub-diagonal starting at position 1 in +* row 2, and so on. The bottom right k by k triangle of the +* array A is not referenced. +* The following program segment will transfer a lower +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = 1 - J +* DO 10, I = J, MIN( N, J + K ) +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Note that when DIAG = 'U' or 'u' the elements of the array A +* corresponding to the diagonal elements of the matrix are not +* referenced, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( k + 1 ). +* Unchanged on exit. +* +* X - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. On exit, X is overwritten with the +* tranformed vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP + INTEGER I, INFO, IX, J, JX, KPLUS1, KX, L + LOGICAL NOCONJ, NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'R' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( K.LT.0 )THEN + INFO = 5 + ELSE IF( LDA.LT.( K + 1 ) )THEN + INFO = 7 + ELSE IF( INCX.EQ.0 )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZTBMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + NOUNIT = LSAME( DIAG , 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( LSAME( TRANS, 'N' ).OR.LSAME( TRANS, 'R' ) )THEN +* +* Form x := A*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KPLUS1 = K + 1 + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + L = KPLUS1 - J + DO 10, I = MAX( 1, J - K ), J - 1 + X( I ) = X( I ) + TEMP*A( L + I, J ) + 10 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*A( KPLUS1, J ) + END IF + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + L = KPLUS1 - J + DO 30, I = MAX( 1, J - K ), J - 1 + X( IX ) = X( IX ) + TEMP*A( L + I, J ) + IX = IX + INCX + 30 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( KPLUS1, J ) + END IF + JX = JX + INCX + IF( J.GT.K ) + $ KX = KX + INCX + 40 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 60, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + L = 1 - J + DO 50, I = MIN( N, J + K ), J + 1, -1 + X( I ) = X( I ) + TEMP*A( L + I, J ) + 50 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*A( 1, J ) + END IF + 60 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 80, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + L = 1 - J + DO 70, I = MIN( N, J + K ), J + 1, -1 + X( IX ) = X( IX ) + TEMP*A( L + I, J ) + IX = IX - INCX + 70 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( 1, J ) + END IF + JX = JX - INCX + IF( ( N - J ).GE.K ) + $ KX = KX - INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := A'*x or x := conjg( A' )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KPLUS1 = K + 1 + IF( INCX.EQ.1 )THEN + DO 110, J = N, 1, -1 + TEMP = X( J ) + L = KPLUS1 - J + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( KPLUS1, J ) + DO 90, I = J - 1, MAX( 1, J - K ), -1 + TEMP = TEMP + A( L + I, J )*X( I ) + 90 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG( A( KPLUS1, J ) ) + DO 100, I = J - 1, MAX( 1, J - K ), -1 + TEMP = TEMP + DCONJG( A( L + I, J ) )*X( I ) + 100 CONTINUE + END IF + X( J ) = TEMP + 110 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 140, J = N, 1, -1 + TEMP = X( JX ) + KX = KX - INCX + IX = KX + L = KPLUS1 - J + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( KPLUS1, J ) + DO 120, I = J - 1, MAX( 1, J - K ), -1 + TEMP = TEMP + A( L + I, J )*X( IX ) + IX = IX - INCX + 120 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG( A( KPLUS1, J ) ) + DO 130, I = J - 1, MAX( 1, J - K ), -1 + TEMP = TEMP + DCONJG( A( L + I, J ) )*X( IX ) + IX = IX - INCX + 130 CONTINUE + END IF + X( JX ) = TEMP + JX = JX - INCX + 140 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 170, J = 1, N + TEMP = X( J ) + L = 1 - J + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( 1, J ) + DO 150, I = J + 1, MIN( N, J + K ) + TEMP = TEMP + A( L + I, J )*X( I ) + 150 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG( A( 1, J ) ) + DO 160, I = J + 1, MIN( N, J + K ) + TEMP = TEMP + DCONJG( A( L + I, J ) )*X( I ) + 160 CONTINUE + END IF + X( J ) = TEMP + 170 CONTINUE + ELSE + JX = KX + DO 200, J = 1, N + TEMP = X( JX ) + KX = KX + INCX + IX = KX + L = 1 - J + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( 1, J ) + DO 180, I = J + 1, MIN( N, J + K ) + TEMP = TEMP + A( L + I, J )*X( IX ) + IX = IX + INCX + 180 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG( A( 1, J ) ) + DO 190, I = J + 1, MIN( N, J + K ) + TEMP = TEMP + DCONJG( A( L + I, J ) )*X( IX ) + IX = IX + INCX + 190 CONTINUE + END IF + X( JX ) = TEMP + JX = JX + INCX + 200 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of ZTBMV . +* + END diff --git a/reference/ztbsvf.f b/reference/ztbsvf.f new file mode 100644 index 0000000..78c37e3 --- /dev/null +++ b/reference/ztbsvf.f @@ -0,0 +1,367 @@ + SUBROUTINE ZTBSVF(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX) +* .. Scalar Arguments .. + INTEGER INCX,K,LDA,N + CHARACTER DIAG,TRANS,UPLO +* .. +* .. Array Arguments .. + DOUBLE COMPLEX A(LDA,*),X(*) +* .. +* +* Purpose +* ======= +* +* ZTBSV solves one of the systems of equations +* +* A*x = b, or A'*x = b, or conjg( A' )*x = b, +* +* where b and x are n element vectors and A is an n by n unit, or +* non-unit, upper or lower triangular band matrix, with ( k + 1 ) +* diagonals. +* +* No test for singularity or near-singularity is included in this +* routine. Such tests must be performed before calling this routine. +* +* Arguments +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the equations to be solved as +* follows: +* +* TRANS = 'N' or 'n' A*x = b. +* +* TRANS = 'T' or 't' A'*x = b. +* +* TRANS = 'C' or 'c' conjg( A' )*x = b. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with UPLO = 'U' or 'u', K specifies the number of +* super-diagonals of the matrix A. +* On entry with UPLO = 'L' or 'l', K specifies the number of +* sub-diagonals of the matrix A. +* K must satisfy 0 .le. K. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) +* by n part of the array A must contain the upper triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row +* ( k + 1 ) of the array, the first super-diagonal starting at +* position 2 in row k, and so on. The top left k by k triangle +* of the array A is not referenced. +* The following program segment will transfer an upper +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = K + 1 - J +* DO 10, I = MAX( 1, J - K ), J +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) +* by n part of the array A must contain the lower triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row 1 of +* the array, the first sub-diagonal starting at position 1 in +* row 2, and so on. The bottom right k by k triangle of the +* array A is not referenced. +* The following program segment will transfer a lower +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = 1 - J +* DO 10, I = J, MIN( N, J + K ) +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Note that when DIAG = 'U' or 'u' the elements of the array A +* corresponding to the diagonal elements of the matrix are not +* referenced, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( k + 1 ). +* Unchanged on exit. +* +* X - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element right-hand side vector b. On exit, X is overwritten +* with the solution vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE COMPLEX ZERO + PARAMETER (ZERO= (0.0D+0,0.0D+0)) +* .. +* .. Local Scalars .. + DOUBLE COMPLEX TEMP + INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L + LOGICAL NOCONJ,NOUNIT +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DCONJG,MAX,MIN +* .. +* +* Test the input parameters. +* + INFO = 0 + IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN + INFO = 1 + ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. + + .NOT.LSAME(TRANS,'C')) THEN + INFO = 2 + ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN + INFO = 3 + ELSE IF (N.LT.0) THEN + INFO = 4 + ELSE IF (K.LT.0) THEN + INFO = 5 + ELSE IF (LDA.LT. (K+1)) THEN + INFO = 7 + ELSE IF (INCX.EQ.0) THEN + INFO = 9 + END IF + IF (INFO.NE.0) THEN + CALL XERBLA('ZTBSV ',INFO) + RETURN + END IF +* +* Quick return if possible. +* + IF (N.EQ.0) RETURN +* + NOCONJ = LSAME(TRANS,'T') + NOUNIT = LSAME(DIAG,'N') +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF (INCX.LE.0) THEN + KX = 1 - (N-1)*INCX + ELSE IF (INCX.NE.1) THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed by sequentially with one pass through A. +* + IF (LSAME(TRANS,'N')) THEN +* +* Form x := inv( A )*x. +* + IF (LSAME(UPLO,'U')) THEN + KPLUS1 = K + 1 + IF (INCX.EQ.1) THEN + DO 20 J = N,1,-1 + IF (X(J).NE.ZERO) THEN + L = KPLUS1 - J + IF (NOUNIT) X(J) = X(J)/A(KPLUS1,J) + TEMP = X(J) + DO 10 I = J - 1,MAX(1,J-K),-1 + X(I) = X(I) - TEMP*A(L+I,J) + 10 CONTINUE + END IF + 20 CONTINUE + ELSE + KX = KX + (N-1)*INCX + JX = KX + DO 40 J = N,1,-1 + KX = KX - INCX + IF (X(JX).NE.ZERO) THEN + IX = KX + L = KPLUS1 - J + IF (NOUNIT) X(JX) = X(JX)/A(KPLUS1,J) + TEMP = X(JX) + DO 30 I = J - 1,MAX(1,J-K),-1 + X(IX) = X(IX) - TEMP*A(L+I,J) + IX = IX - INCX + 30 CONTINUE + END IF + JX = JX - INCX + 40 CONTINUE + END IF + ELSE + IF (INCX.EQ.1) THEN + DO 60 J = 1,N + IF (X(J).NE.ZERO) THEN + L = 1 - J + IF (NOUNIT) X(J) = X(J)/A(1,J) + TEMP = X(J) + DO 50 I = J + 1,MIN(N,J+K) + X(I) = X(I) - TEMP*A(L+I,J) + 50 CONTINUE + END IF + 60 CONTINUE + ELSE + JX = KX + DO 80 J = 1,N + KX = KX + INCX + IF (X(JX).NE.ZERO) THEN + IX = KX + L = 1 - J + IF (NOUNIT) X(JX) = X(JX)/A(1,J) + TEMP = X(JX) + DO 70 I = J + 1,MIN(N,J+K) + X(IX) = X(IX) - TEMP*A(L+I,J) + IX = IX + INCX + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := inv( A' )*x or x := inv( conjg( A') )*x. +* + IF (LSAME(UPLO,'U')) THEN + KPLUS1 = K + 1 + IF (INCX.EQ.1) THEN + DO 110 J = 1,N + TEMP = X(J) + L = KPLUS1 - J + IF (NOCONJ) THEN + DO 90 I = MAX(1,J-K),J - 1 + TEMP = TEMP - A(L+I,J)*X(I) + 90 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(KPLUS1,J) + ELSE + DO 100 I = MAX(1,J-K),J - 1 + TEMP = TEMP - DCONJG(A(L+I,J))*X(I) + 100 CONTINUE + IF (NOUNIT) TEMP = TEMP/DCONJG(A(KPLUS1,J)) + END IF + X(J) = TEMP + 110 CONTINUE + ELSE + JX = KX + DO 140 J = 1,N + TEMP = X(JX) + IX = KX + L = KPLUS1 - J + IF (NOCONJ) THEN + DO 120 I = MAX(1,J-K),J - 1 + TEMP = TEMP - A(L+I,J)*X(IX) + IX = IX + INCX + 120 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(KPLUS1,J) + ELSE + DO 130 I = MAX(1,J-K),J - 1 + TEMP = TEMP - DCONJG(A(L+I,J))*X(IX) + IX = IX + INCX + 130 CONTINUE + IF (NOUNIT) TEMP = TEMP/DCONJG(A(KPLUS1,J)) + END IF + X(JX) = TEMP + JX = JX + INCX + IF (J.GT.K) KX = KX + INCX + 140 CONTINUE + END IF + ELSE + IF (INCX.EQ.1) THEN + DO 170 J = N,1,-1 + TEMP = X(J) + L = 1 - J + IF (NOCONJ) THEN + DO 150 I = MIN(N,J+K),J + 1,-1 + TEMP = TEMP - A(L+I,J)*X(I) + 150 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(1,J) + ELSE + DO 160 I = MIN(N,J+K),J + 1,-1 + TEMP = TEMP - DCONJG(A(L+I,J))*X(I) + 160 CONTINUE + IF (NOUNIT) TEMP = TEMP/DCONJG(A(1,J)) + END IF + X(J) = TEMP + 170 CONTINUE + ELSE + KX = KX + (N-1)*INCX + JX = KX + DO 200 J = N,1,-1 + TEMP = X(JX) + IX = KX + L = 1 - J + IF (NOCONJ) THEN + DO 180 I = MIN(N,J+K),J + 1,-1 + TEMP = TEMP - A(L+I,J)*X(IX) + IX = IX - INCX + 180 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(1,J) + ELSE + DO 190 I = MIN(N,J+K),J + 1,-1 + TEMP = TEMP - DCONJG(A(L+I,J))*X(IX) + IX = IX - INCX + 190 CONTINUE + IF (NOUNIT) TEMP = TEMP/DCONJG(A(1,J)) + END IF + X(JX) = TEMP + JX = JX - INCX + IF ((N-J).GE.K) KX = KX - INCX + 200 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of ZTBSV . +* + END diff --git a/reference/ztpmvf.f b/reference/ztpmvf.f new file mode 100644 index 0000000..d050272 --- /dev/null +++ b/reference/ztpmvf.f @@ -0,0 +1,377 @@ + SUBROUTINE ZTPMVF( UPLO, TRANS, DIAG, N, AP, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + COMPLEX*16 AP( * ), X( * ) +* .. +* +* Purpose +* ======= +* +* ZTPMV performs one of the matrix-vector operations +* +* x := A*x, or x := A'*x, or x := conjg( A' )*x, +* +* where x is an n element vector and A is an n by n unit, or non-unit, +* upper or lower triangular matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' x := A*x. +* +* TRANS = 'T' or 't' x := A'*x. +* +* TRANS = 'C' or 'c' x := conjg( A' )*x. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* AP - COMPLEX*16 array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) +* respectively, and so on. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) +* respectively, and so on. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced, but are assumed to be unity. +* Unchanged on exit. +* +* X - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. On exit, X is overwritten with the +* tranformed vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP + INTEGER I, INFO, IX, J, JX, K, KK, KX + LOGICAL NOCONJ, NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'R' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( INCX.EQ.0 )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZTPMVF', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + NOUNIT = LSAME( DIAG , 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of AP are +* accessed sequentially with one pass through AP. +* + IF( LSAME( TRANS, 'N' ).OR.LSAME( TRANS, 'R' ))THEN +* +* Form x:= A*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KK = 1 + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + K = KK + DO 10, I = 1, J - 1 + IF( NOCONJ )THEN + X( I ) = X( I ) + TEMP*AP( K ) + ELSE + X( I ) = X( I ) + TEMP*DCONJG(AP( K )) + END IF + K = K + 1 + 10 CONTINUE + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( J ) = X( J )*AP( KK + J - 1 ) + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )*DCONJG(AP( KK + J-1)) + END IF + END IF + + KK = KK + J + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 30, K = KK, KK + J - 2 + IF( NOCONJ )THEN + X( IX ) = X( IX ) + TEMP*AP( K ) + ELSE + X( IX ) = X( IX ) + TEMP*DCONJG(AP(K)) + END IF + IX = IX + INCX + 30 CONTINUE + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )*AP( KK + J - 1 ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )*DCONJG(AP( KK + J-1)) + END IF + END IF + JX = JX + INCX + KK = KK + J + 40 CONTINUE + END IF + ELSE + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 60, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + K = KK + DO 50, I = N, J + 1, -1 + IF( NOCONJ )THEN + X( I ) = X( I ) + TEMP*AP( K ) + ELSE + X( I ) = X( I ) + TEMP*DCONJG(AP( K )) + END IF + K = K - 1 + 50 CONTINUE + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( J ) = X( J )*AP( KK - N + J ) + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )*DCONJG(AP(KK - N+J)) + END IF + + END IF + KK = KK - ( N - J + 1 ) + 60 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 80, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 70, K = KK, KK - ( N - ( J + 1 ) ), -1 + IF( NOCONJ )THEN + X( IX ) = X( IX ) + TEMP*AP( K ) + ELSE + X( IX ) = X( IX ) + TEMP*DCONJG(AP(K)) + ENDIF + IX = IX - INCX + 70 CONTINUE + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )*AP( KK - N + J ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )*DCONJG(AP(KK-N+J)) + ENDIF + END IF + JX = JX - INCX + KK = KK - ( N - J + 1 ) + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := A'*x or x := conjg( A' )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 110, J = N, 1, -1 + TEMP = X( J ) + K = KK - 1 + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + DO 90, I = J - 1, 1, -1 + TEMP = TEMP + AP( K )*X( I ) + K = K - 1 + 90 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG( AP( KK ) ) + DO 100, I = J - 1, 1, -1 + TEMP = TEMP + DCONJG( AP( K ) )*X( I ) + K = K - 1 + 100 CONTINUE + END IF + X( J ) = TEMP + KK = KK - J + 110 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 140, J = N, 1, -1 + TEMP = X( JX ) + IX = JX + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + DO 120, K = KK - 1, KK - J + 1, -1 + IX = IX - INCX + TEMP = TEMP + AP( K )*X( IX ) + 120 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG( AP( KK ) ) + DO 130, K = KK - 1, KK - J + 1, -1 + IX = IX - INCX + TEMP = TEMP + DCONJG( AP( K ) )*X( IX ) + 130 CONTINUE + END IF + X( JX ) = TEMP + JX = JX - INCX + KK = KK - J + 140 CONTINUE + END IF + ELSE + KK = 1 + IF( INCX.EQ.1 )THEN + DO 170, J = 1, N + TEMP = X( J ) + K = KK + 1 + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + DO 150, I = J + 1, N + TEMP = TEMP + AP( K )*X( I ) + K = K + 1 + 150 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG( AP( KK ) ) + DO 160, I = J + 1, N + TEMP = TEMP + DCONJG( AP( K ) )*X( I ) + K = K + 1 + 160 CONTINUE + END IF + X( J ) = TEMP + KK = KK + ( N - J + 1 ) + 170 CONTINUE + ELSE + JX = KX + DO 200, J = 1, N + TEMP = X( JX ) + IX = JX + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + DO 180, K = KK + 1, KK + N - J + IX = IX + INCX + TEMP = TEMP + AP( K )*X( IX ) + 180 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG( AP( KK ) ) + DO 190, K = KK + 1, KK + N - J + IX = IX + INCX + TEMP = TEMP + DCONJG( AP( K ) )*X( IX ) + 190 CONTINUE + END IF + X( JX ) = TEMP + JX = JX + INCX + KK = KK + ( N - J + 1 ) + 200 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of ZTPMV . +* + END diff --git a/reference/ztpsvf.f b/reference/ztpsvf.f new file mode 100644 index 0000000..d5a981e --- /dev/null +++ b/reference/ztpsvf.f @@ -0,0 +1,379 @@ + SUBROUTINE ZTPSVF( UPLO, TRANS, DIAG, N, AP, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + COMPLEX*16 AP( * ), X( * ) +* .. +* +* Purpose +* ======= +* +* ZTPSV solves one of the systems of equations +* +* A*x = b, or A'*x = b, or conjg( A' )*x = b, +* +* where b and x are n element vectors and A is an n by n unit, or +* non-unit, upper or lower triangular matrix, supplied in packed form. +* +* No test for singularity or near-singularity is included in this +* routine. Such tests must be performed before calling this routine. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the equations to be solved as +* follows: +* +* TRANS = 'N' or 'n' A*x = b. +* +* TRANS = 'T' or 't' A'*x = b. +* +* TRANS = 'C' or 'c' conjg( A' )*x = b. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* AP - COMPLEX*16 array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) +* respectively, and so on. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) +* respectively, and so on. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced, but are assumed to be unity. +* Unchanged on exit. +* +* X - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element right-hand side vector b. On exit, X is overwritten +* with the solution vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP + INTEGER I, INFO, IX, J, JX, K, KK, KX + LOGICAL NOCONJ, NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'R' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( INCX.EQ.0 )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZTPSV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + NOUNIT = LSAME( DIAG , 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of AP are +* accessed sequentially with one pass through AP. +* + IF( LSAME( TRANS, 'N' ) .OR.LSAME( TRANS, 'R' ))THEN +* +* Form x := inv( A )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 20, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( J ) = X( J )/AP( KK ) + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )/DCONJG(AP( KK )) + END IF + + TEMP = X( J ) + K = KK - 1 + DO 10, I = J - 1, 1, -1 + IF( NOCONJ )THEN + X( I ) = X( I ) - TEMP*AP( K ) + ELSE + X( I ) = X( I ) - TEMP*DCONJG(AP( K )) + END IF + K = K - 1 + 10 CONTINUE + END IF + KK = KK - J + 20 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 40, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/AP( KK ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )/DCONJG(AP( KK )) + END IF + TEMP = X( JX ) + IX = JX + DO 30, K = KK - 1, KK - J + 1, -1 + IX = IX - INCX + IF( NOCONJ )THEN + X( IX ) = X( IX ) - TEMP*AP( K ) + ELSE + X( IX ) = X( IX ) - TEMP*DCONJG(AP( K )) + END IF + 30 CONTINUE + END IF + JX = JX - INCX + KK = KK - J + 40 CONTINUE + END IF + ELSE + KK = 1 + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( J ) = X( J )/AP( KK ) + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )/DCONJG(AP( KK )) + END IF + TEMP = X( J ) + K = KK + 1 + DO 50, I = J + 1, N + IF( NOCONJ )THEN + X( I ) = X( I ) - TEMP*AP( K ) + ELSE + X( I ) = X( I ) - TEMP*DCONJG(AP( K )) + END IF + K = K + 1 + 50 CONTINUE + END IF + KK = KK + ( N - J + 1 ) + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/AP( KK ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )/DCONJG(AP( KK )) + END IF + TEMP = X( JX ) + IX = JX + DO 70, K = KK + 1, KK + N - J + IX = IX + INCX + IF( NOCONJ )THEN + X( IX ) = X( IX ) - TEMP*AP( K ) + ELSE + X( IX ) = X( IX ) - TEMP*DCONJG(AP( K )) + END IF + 70 CONTINUE + END IF + JX = JX + INCX + KK = KK + ( N - J + 1 ) + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := inv( A' )*x or x := inv( conjg( A' ) )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KK = 1 + IF( INCX.EQ.1 )THEN + DO 110, J = 1, N + TEMP = X( J ) + K = KK + IF( NOCONJ )THEN + DO 90, I = 1, J - 1 + TEMP = TEMP - AP( K )*X( I ) + K = K + 1 + 90 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK + J - 1 ) + ELSE + DO 100, I = 1, J - 1 + TEMP = TEMP - DCONJG( AP( K ) )*X( I ) + K = K + 1 + 100 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/DCONJG( AP( KK + J - 1 ) ) + END IF + X( J ) = TEMP + KK = KK + J + 110 CONTINUE + ELSE + JX = KX + DO 140, J = 1, N + TEMP = X( JX ) + IX = KX + IF( NOCONJ )THEN + DO 120, K = KK, KK + J - 2 + TEMP = TEMP - AP( K )*X( IX ) + IX = IX + INCX + 120 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK + J - 1 ) + ELSE + DO 130, K = KK, KK + J - 2 + TEMP = TEMP - DCONJG( AP( K ) )*X( IX ) + IX = IX + INCX + 130 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/DCONJG( AP( KK + J - 1 ) ) + END IF + X( JX ) = TEMP + JX = JX + INCX + KK = KK + J + 140 CONTINUE + END IF + ELSE + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 170, J = N, 1, -1 + TEMP = X( J ) + K = KK + IF( NOCONJ )THEN + DO 150, I = N, J + 1, -1 + TEMP = TEMP - AP( K )*X( I ) + K = K - 1 + 150 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK - N + J ) + ELSE + DO 160, I = N, J + 1, -1 + TEMP = TEMP - DCONJG( AP( K ) )*X( I ) + K = K - 1 + 160 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/DCONJG( AP( KK - N + J ) ) + END IF + X( J ) = TEMP + KK = KK - ( N - J + 1 ) + 170 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 200, J = N, 1, -1 + TEMP = X( JX ) + IX = KX + IF( NOCONJ )THEN + DO 180, K = KK, KK - ( N - ( J + 1 ) ), -1 + TEMP = TEMP - AP( K )*X( IX ) + IX = IX - INCX + 180 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK - N + J ) + ELSE + DO 190, K = KK, KK - ( N - ( J + 1 ) ), -1 + TEMP = TEMP - DCONJG( AP( K ) )*X( IX ) + IX = IX - INCX + 190 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/DCONJG( AP( KK - N + J ) ) + END IF + X( JX ) = TEMP + JX = JX - INCX + KK = KK - ( N - J + 1 ) + 200 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of ZTPSV . +* + END diff --git a/reference/ztrmmf.f b/reference/ztrmmf.f new file mode 100644 index 0000000..d286f96 --- /dev/null +++ b/reference/ztrmmf.f @@ -0,0 +1,428 @@ + SUBROUTINE ZTRMMF ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, + $ B, LDB ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + INTEGER M, N, LDA, LDB + COMPLEX*16 ALPHA +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* ZTRMM performs one of the matrix-matrix operations +* +* B := alpha*op( A )*B, or B := alpha*B*op( A ) +* +* where alpha is a scalar, B is an m by n matrix, A is a unit, or +* non-unit, upper or lower triangular matrix and op( A ) is one of +* +* op( A ) = A or op( A ) = A' or op( A ) = conjg( A' ). +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether op( A ) multiplies B from +* the left or right as follows: +* +* SIDE = 'L' or 'l' B := alpha*op( A )*B. +* +* SIDE = 'R' or 'r' B := alpha*B*op( A ). +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix A is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANSA - CHARACTER*1. +* On entry, TRANSA specifies the form of op( A ) to be used in +* the matrix multiplication as follows: +* +* TRANSA = 'N' or 'n' op( A ) = A. +* +* TRANSA = 'T' or 't' op( A ) = A'. +* +* TRANSA = 'C' or 'c' op( A ) = conjg( A' ). +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit triangular +* as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of B. M must be at +* least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of B. N must be +* at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. When alpha is +* zero then A is not referenced and B need not be set before +* entry. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, k ), where k is m +* when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. +* Before entry with UPLO = 'U' or 'u', the leading k by k +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading k by k +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' +* then LDA must be at least max( 1, n ). +* Unchanged on exit. +* +* B - COMPLEX*16 array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the matrix B, and on exit is overwritten by the +* transformed matrix. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MAX +* .. Local Scalars .. + LOGICAL LSIDE, NOCONJ, NOUNIT, UPPER + INTEGER I, INFO, J, K, NROWA + COMPLEX*16 TEMP +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + LSIDE = LSAME( SIDE , 'L' ) + IF( LSIDE )THEN + NROWA = M + ELSE + NROWA = N + END IF + NOCONJ = LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'T' ) + NOUNIT = LSAME( DIAG , 'N' ) + UPPER = LSAME( UPLO , 'U' ) +* + INFO = 0 + IF( ( .NOT.LSIDE ).AND. + $ ( .NOT.LSAME( SIDE , 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 2 + ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'T' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'R' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'C' ) ) )THEN + INFO = 3 + ELSE IF( ( .NOT.LSAME( DIAG , 'U' ) ).AND. + $ ( .NOT.LSAME( DIAG , 'N' ) ) )THEN + INFO = 4 + ELSE IF( M .LT.0 )THEN + INFO = 5 + ELSE IF( N .LT.0 )THEN + INFO = 6 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 9 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 11 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZTRMM ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + B( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + RETURN + END IF +* +* Start the operations. +* + IF( LSIDE )THEN + IF( LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'R' ))THEN +* +* Form B := alpha*A*B. +* + IF( UPPER )THEN + DO 50, J = 1, N + DO 40, K = 1, M + IF( B( K, J ).NE.ZERO )THEN + TEMP = ALPHA*B( K, J ) + IF (NOCONJ) THEN + DO 30, I = 1, K - 1 + B( I, J ) = B( I, J ) + TEMP*A( I, K ) + 30 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP*A( K, K ) + B( K, J ) = TEMP + ELSE + DO 35, I = 1, K - 1 + B( I, J ) = B( I, J ) + TEMP*DCONJG(A( I, K )) + 35 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG(A( K, K )) + B( K, J ) = TEMP + ENDIF + END IF + 40 CONTINUE + 50 CONTINUE + ELSE + DO 80, J = 1, N + DO 70 K = M, 1, -1 + IF( B( K, J ).NE.ZERO )THEN + TEMP = ALPHA*B( K, J ) + B( K, J ) = TEMP + IF (NOCONJ) THEN + IF( NOUNIT ) + $ B( K, J ) = B( K, J )*A( K, K ) + DO 60, I = K + 1, M + B( I, J ) = B( I, J ) + TEMP*A( I, K ) + 60 CONTINUE + ELSE + IF( NOUNIT ) + $ B( K, J ) = B( K, J )*DCONJG(A( K, K )) + DO 65, I = K + 1, M + B( I, J ) = B( I, J ) + TEMP*DCONJG(A( I, K )) + 65 CONTINUE + ENDIF + END IF + 70 CONTINUE + 80 CONTINUE + END IF + ELSE +* +* Form B := alpha*A'*B or B := alpha*conjg( A' )*B. +* + IF( UPPER )THEN + DO 120, J = 1, N + DO 110, I = M, 1, -1 + TEMP = B( I, J ) + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( I, I ) + DO 90, K = 1, I - 1 + TEMP = TEMP + A( K, I )*B( K, J ) + 90 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG( A( I, I ) ) + DO 100, K = 1, I - 1 + TEMP = TEMP + DCONJG( A( K, I ) )*B( K, J ) + 100 CONTINUE + END IF + B( I, J ) = ALPHA*TEMP + 110 CONTINUE + 120 CONTINUE + ELSE + DO 160, J = 1, N + DO 150, I = 1, M + TEMP = B( I, J ) + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( I, I ) + DO 130, K = I + 1, M + TEMP = TEMP + A( K, I )*B( K, J ) + 130 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG( A( I, I ) ) + DO 140, K = I + 1, M + TEMP = TEMP + DCONJG( A( K, I ) )*B( K, J ) + 140 CONTINUE + END IF + B( I, J ) = ALPHA*TEMP + 150 CONTINUE + 160 CONTINUE + END IF + END IF + ELSE + IF( LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'R' ))THEN +* +* Form B := alpha*B*A. +* + IF( UPPER )THEN + DO 200, J = N, 1, -1 + TEMP = ALPHA + IF (NOCONJ) THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG(A( J, J )) + ENDIF + DO 170, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 170 CONTINUE + DO 190, K = 1, J - 1 + IF( A( K, J ).NE.ZERO )THEN + IF (NOCONJ) THEN + TEMP = ALPHA*A( K, J ) + ELSE + TEMP = ALPHA*DCONJG(A( K, J )) + ENDIF + DO 180, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 180 CONTINUE + END IF + 190 CONTINUE + 200 CONTINUE + ELSE + DO 240, J = 1, N + TEMP = ALPHA + IF (NOCONJ) THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG(A( J, J )) + ENDIF + DO 210, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 210 CONTINUE + DO 230, K = J + 1, N + IF( A( K, J ).NE.ZERO )THEN + IF (NOCONJ) THEN + TEMP = ALPHA*A( K, J ) + ELSE + TEMP = ALPHA*DCONJG(A( K, J )) + ENDIF + DO 220, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 220 CONTINUE + END IF + 230 CONTINUE + 240 CONTINUE + END IF + ELSE +* +* Form B := alpha*B*A' or B := alpha*B*conjg( A' ). +* + IF( UPPER )THEN + DO 280, K = 1, N + DO 260, J = 1, K - 1 + IF( A( J, K ).NE.ZERO )THEN + IF( NOCONJ )THEN + TEMP = ALPHA*A( J, K ) + ELSE + TEMP = ALPHA*DCONJG( A( J, K ) ) + END IF + DO 250, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 250 CONTINUE + END IF + 260 CONTINUE + TEMP = ALPHA + IF( NOUNIT )THEN + IF( NOCONJ )THEN + TEMP = TEMP*A( K, K ) + ELSE + TEMP = TEMP*DCONJG( A( K, K ) ) + END IF + END IF + IF( TEMP.NE.ONE )THEN + DO 270, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 270 CONTINUE + END IF + 280 CONTINUE + ELSE + DO 320, K = N, 1, -1 + DO 300, J = K + 1, N + IF( A( J, K ).NE.ZERO )THEN + IF( NOCONJ )THEN + TEMP = ALPHA*A( J, K ) + ELSE + TEMP = ALPHA*DCONJG( A( J, K ) ) + END IF + DO 290, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 290 CONTINUE + END IF + 300 CONTINUE + TEMP = ALPHA + IF( NOUNIT )THEN + IF( NOCONJ )THEN + TEMP = TEMP*A( K, K ) + ELSE + TEMP = TEMP*DCONJG( A( K, K ) ) + END IF + END IF + IF( TEMP.NE.ONE )THEN + DO 310, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 310 CONTINUE + END IF + 320 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of ZTRMM . +* + END diff --git a/reference/ztrmvf.f b/reference/ztrmvf.f new file mode 100644 index 0000000..db0f9ca --- /dev/null +++ b/reference/ztrmvf.f @@ -0,0 +1,358 @@ + SUBROUTINE ZTRMVF ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, LDA, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* ZTRMV performs one of the matrix-vector operations +* +* x := A*x, or x := A'*x, or x := conjg( A' )*x, +* +* where x is an n element vector and A is an n by n unit, or non-unit, +* upper or lower triangular matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' x := A*x. +* +* TRANS = 'T' or 't' x := A'*x. +* +* TRANS = 'C' or 'c' x := conjg( A' )*x. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* X - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. On exit, X is overwritten with the +* tranformed vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP + INTEGER I, INFO, IX, J, JX, KX + LOGICAL NOCONJ, NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'R' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZTRMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + NOUNIT = LSAME( DIAG , 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'R' ))THEN +* +* Form x := A*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + DO 10, I = 1, J - 1 + IF (NOCONJ) THEN + X( I ) = X( I ) + TEMP*A( I, J ) + ELSE + X( I ) = X( I ) + TEMP*DCONJG(A( I, J )) + ENDIF + 10 CONTINUE + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( J ) = X( J )*A( J, J ) + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )*DCONJG(A( J, J )) + ENDIF + END IF + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 30, I = 1, J - 1 + IF (NOCONJ) THEN + X( IX ) = X( IX ) + TEMP*A( I, J ) + ELSE + X( IX ) = X( IX ) + TEMP*DCONJG(A( I, J )) + ENDIF + IX = IX + INCX + 30 CONTINUE + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( J, J ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )*DCONJG(A( J, J )) + ENDIF + END IF + JX = JX + INCX + 40 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 60, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + DO 50, I = N, J + 1, -1 + IF (NOCONJ) THEN + X( I ) = X( I ) + TEMP*A( I, J ) + ELSE + X( I ) = X( I ) + TEMP*DCONJG(A( I, J )) + ENDIF + 50 CONTINUE + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( J ) = X( J )*A( J, J ) + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )*DCONJG(A( J, J )) + ENDIF + END IF + 60 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 80, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 70, I = N, J + 1, -1 + IF (NOCONJ) THEN + X( IX ) = X( IX ) + TEMP*A( I, J ) + ELSE + X( IX ) = X( IX ) + TEMP*DCONJG(A( I, J )) + ENDIF + IX = IX - INCX + 70 CONTINUE + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( J, J ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )*DCONJG(A( J, J )) + ENDIF + END IF + JX = JX - INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := A'*x or x := conjg( A' )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 110, J = N, 1, -1 + TEMP = X( J ) + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 90, I = J - 1, 1, -1 + TEMP = TEMP + A( I, J )*X( I ) + 90 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG( A( J, J ) ) + DO 100, I = J - 1, 1, -1 + TEMP = TEMP + DCONJG( A( I, J ) )*X( I ) + 100 CONTINUE + END IF + X( J ) = TEMP + 110 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 140, J = N, 1, -1 + TEMP = X( JX ) + IX = JX + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 120, I = J - 1, 1, -1 + IX = IX - INCX + TEMP = TEMP + A( I, J )*X( IX ) + 120 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG( A( J, J ) ) + DO 130, I = J - 1, 1, -1 + IX = IX - INCX + TEMP = TEMP + DCONJG( A( I, J ) )*X( IX ) + 130 CONTINUE + END IF + X( JX ) = TEMP + JX = JX - INCX + 140 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 170, J = 1, N + TEMP = X( J ) + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 150, I = J + 1, N + TEMP = TEMP + A( I, J )*X( I ) + 150 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG( A( J, J ) ) + DO 160, I = J + 1, N + TEMP = TEMP + DCONJG( A( I, J ) )*X( I ) + 160 CONTINUE + END IF + X( J ) = TEMP + 170 CONTINUE + ELSE + JX = KX + DO 200, J = 1, N + TEMP = X( JX ) + IX = JX + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 180, I = J + 1, N + IX = IX + INCX + TEMP = TEMP + A( I, J )*X( IX ) + 180 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG( A( J, J ) ) + DO 190, I = J + 1, N + IX = IX + INCX + TEMP = TEMP + DCONJG( A( I, J ) )*X( IX ) + 190 CONTINUE + END IF + X( JX ) = TEMP + JX = JX + INCX + 200 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of ZTRMV . +* + END diff --git a/reference/ztrsmf.f b/reference/ztrsmf.f new file mode 100644 index 0000000..ed7d227 --- /dev/null +++ b/reference/ztrsmf.f @@ -0,0 +1,457 @@ + SUBROUTINE ZTRSMF ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, + $ B, LDB ) +* .. Scalar Arguments .. + IMPLICIT NONE + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + INTEGER M, N, LDA, LDB + COMPLEX*16 ALPHA +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* ZTRSM solves one of the matrix equations +* +* op( A )*X = alpha*B, or X*op( A ) = alpha*B, +* +* where alpha is a scalar, X and B are m by n matrices, A is a unit, or +* non-unit, upper or lower triangular matrix and op( A ) is one of +* +* op( A ) = A or op( A ) = A' or op( A ) = conjg( A' ). +* +* The matrix X is overwritten on B. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether op( A ) appears on the left +* or right of X as follows: +* +* SIDE = 'L' or 'l' op( A )*X = alpha*B. +* +* SIDE = 'R' or 'r' X*op( A ) = alpha*B. +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix A is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANSA - CHARACTER*1. +* On entry, TRANSA specifies the form of op( A ) to be used in +* the matrix multiplication as follows: +* +* TRANSA = 'N' or 'n' op( A ) = A. +* +* TRANSA = 'T' or 't' op( A ) = A'. +* +* TRANSA = 'C' or 'c' op( A ) = conjg( A' ). +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit triangular +* as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of B. M must be at +* least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of B. N must be +* at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. When alpha is +* zero then A is not referenced and B need not be set before +* entry. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, k ), where k is m +* when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. +* Before entry with UPLO = 'U' or 'u', the leading k by k +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading k by k +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' +* then LDA must be at least max( 1, n ). +* Unchanged on exit. +* +* B - COMPLEX*16 array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the right-hand side matrix B, and on exit is +* overwritten by the solution matrix X. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MAX +* .. Local Scalars .. + LOGICAL LSIDE, NOCONJ, NOUNIT, UPPER + INTEGER I, INFO, J, K, NROWA + COMPLEX*16 TEMP +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + LSIDE = LSAME( SIDE , 'L' ) + IF( LSIDE )THEN + NROWA = M + ELSE + NROWA = N + END IF + NOCONJ = (LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'T' )) + NOUNIT = LSAME( DIAG , 'N' ) + UPPER = LSAME( UPLO , 'U' ) +* + INFO = 0 + IF( ( .NOT.LSIDE ).AND. + $ ( .NOT.LSAME( SIDE , 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 2 + ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'T' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'R' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'C' ) ) )THEN + INFO = 3 + ELSE IF( ( .NOT.LSAME( DIAG , 'U' ) ).AND. + $ ( .NOT.LSAME( DIAG , 'N' ) ) )THEN + INFO = 4 + ELSE IF( M .LT.0 )THEN + INFO = 5 + ELSE IF( N .LT.0 )THEN + INFO = 6 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 9 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 11 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZTRSM ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + B( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + RETURN + END IF +* +* Start the operations. +* + IF( LSIDE )THEN + IF( LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'R' ) )THEN +* +* Form B := alpha*inv( A )*B. +* + IF( UPPER )THEN + DO 60, J = 1, N + IF( ALPHA.NE.ONE )THEN + DO 30, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 30 CONTINUE + END IF + DO 50, K = M, 1, -1 + IF( B( K, J ).NE.ZERO )THEN + IF( NOUNIT ) THEN + IF (NOCONJ) THEN + B( K, J ) = B( K, J )/A( K, K ) + ELSE + B( K, J ) = B( K, J )/DCONJG(A( K, K )) + ENDIF + ENDIF + IF (NOCONJ) THEN + DO 40, I = 1, K - 1 + B( I, J ) = B( I, J ) - B( K, J )*A( I, K ) + 40 CONTINUE + ELSE + DO 45, I = 1, K - 1 + B( I, J ) = B( I, J ) - B( K, J )*DCONJG(A( I, K )) + 45 CONTINUE + ENDIF + END IF + 50 CONTINUE + 60 CONTINUE + ELSE + DO 100, J = 1, N + IF( ALPHA.NE.ONE )THEN + DO 70, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 70 CONTINUE + END IF + DO 90 K = 1, M + IF (NOCONJ) THEN + IF( B( K, J ).NE.ZERO )THEN + IF( NOUNIT ) + $ B( K, J ) = B( K, J )/A( K, K ) + DO 80, I = K + 1, M + B( I, J ) = B( I, J ) - B( K, J )*A( I, K ) + 80 CONTINUE + END IF + ELSE + IF( B( K, J ).NE.ZERO )THEN + IF( NOUNIT ) + $ B( K, J ) = B( K, J )/DCONJG(A( K, K )) + DO 85, I = K + 1, M + B( I, J ) = B( I, J ) - B( K, J )*DCONJG(A( I, K )) + 85 CONTINUE + END IF + ENDIF + 90 CONTINUE + 100 CONTINUE + END IF + ELSE +* +* Form B := alpha*inv( A' )*B +* or B := alpha*inv( conjg( A' ) )*B. +* + IF( UPPER )THEN + DO 140, J = 1, N + DO 130, I = 1, M + TEMP = ALPHA*B( I, J ) + IF( NOCONJ )THEN + DO 110, K = 1, I - 1 + TEMP = TEMP - A( K, I )*B( K, J ) + 110 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( I, I ) + ELSE + DO 120, K = 1, I - 1 + TEMP = TEMP - DCONJG( A( K, I ) )*B( K, J ) + 120 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/DCONJG( A( I, I ) ) + END IF + B( I, J ) = TEMP + 130 CONTINUE + 140 CONTINUE + ELSE + DO 180, J = 1, N + DO 170, I = M, 1, -1 + TEMP = ALPHA*B( I, J ) + IF( NOCONJ )THEN + DO 150, K = I + 1, M + TEMP = TEMP - A( K, I )*B( K, J ) + 150 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( I, I ) + ELSE + DO 160, K = I + 1, M + TEMP = TEMP - DCONJG( A( K, I ) )*B( K, J ) + 160 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/DCONJG( A( I, I ) ) + END IF + B( I, J ) = TEMP + 170 CONTINUE + 180 CONTINUE + END IF + END IF + ELSE + IF( LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'R' ) )THEN +* +* Form B := alpha*B*inv( A ). +* + IF( UPPER )THEN + DO 230, J = 1, N + IF( ALPHA.NE.ONE )THEN + DO 190, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 190 CONTINUE + END IF + DO 210, K = 1, J - 1 + IF( A( K, J ).NE.ZERO )THEN + IF (NOCONJ) THEN + DO 200, I = 1, M + B( I, J ) = B( I, J ) - A( K, J )*B( I, K ) + 200 CONTINUE + ELSE + DO 205, I = 1, M + B( I, J ) = B( I, J ) - DCONJG(A( K, J ))*B( I, K ) + 205 CONTINUE + ENDIF + END IF + 210 CONTINUE + IF( NOUNIT )THEN + IF (NOCONJ) THEN + TEMP = ONE/A( J, J ) + ELSE + TEMP = ONE/DCONJG(A( J, J )) + ENDIF + DO 220, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 220 CONTINUE + END IF + 230 CONTINUE + ELSE + DO 280, J = N, 1, -1 + IF( ALPHA.NE.ONE )THEN + DO 240, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 240 CONTINUE + END IF + DO 260, K = J + 1, N + IF( A( K, J ).NE.ZERO )THEN + IF (NOCONJ) THEN + DO 250, I = 1, M + B( I, J ) = B( I, J ) - A( K, J )*B( I, K ) + 250 CONTINUE + ELSE + DO 255, I = 1, M + B( I, J ) = B( I, J ) - DCONJG(A( K, J ))*B( I, K ) + 255 CONTINUE + ENDIF + END IF + 260 CONTINUE + IF( NOUNIT )THEN + IF (NOCONJ) THEN + TEMP = ONE/A( J, J ) + ELSE + TEMP = ONE/DCONJG(A( J, J )) + ENDIF + DO 270, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 270 CONTINUE + END IF + 280 CONTINUE + END IF + ELSE +* +* Form B := alpha*B*inv( A' ) +* or B := alpha*B*inv( conjg( A' ) ). +* + IF( UPPER )THEN + DO 330, K = N, 1, -1 + IF( NOUNIT )THEN + IF( NOCONJ )THEN + TEMP = ONE/A( K, K ) + ELSE + TEMP = ONE/DCONJG( A( K, K ) ) + END IF + DO 290, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 290 CONTINUE + END IF + DO 310, J = 1, K - 1 + IF( A( J, K ).NE.ZERO )THEN + IF( NOCONJ )THEN + TEMP = A( J, K ) + ELSE + TEMP = DCONJG( A( J, K ) ) + END IF + DO 300, I = 1, M + B( I, J ) = B( I, J ) - TEMP*B( I, K ) + 300 CONTINUE + END IF + 310 CONTINUE + IF( ALPHA.NE.ONE )THEN + DO 320, I = 1, M + B( I, K ) = ALPHA*B( I, K ) + 320 CONTINUE + END IF + 330 CONTINUE + ELSE + DO 380, K = 1, N + IF( NOUNIT )THEN + IF( NOCONJ )THEN + TEMP = ONE/A( K, K ) + ELSE + TEMP = ONE/DCONJG( A( K, K ) ) + END IF + DO 340, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 340 CONTINUE + END IF + DO 360, J = K + 1, N + IF( A( J, K ).NE.ZERO )THEN + IF( NOCONJ )THEN + TEMP = A( J, K ) + ELSE + TEMP = DCONJG( A( J, K ) ) + END IF + DO 350, I = 1, M + B( I, J ) = B( I, J ) - TEMP*B( I, K ) + 350 CONTINUE + END IF + 360 CONTINUE + IF( ALPHA.NE.ONE )THEN + DO 370, I = 1, M + B( I, K ) = ALPHA*B( I, K ) + 370 CONTINUE + END IF + 380 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of ZTRSM . +* + END diff --git a/reference/ztrsvf.f b/reference/ztrsvf.f new file mode 100644 index 0000000..c8b3d54 --- /dev/null +++ b/reference/ztrsvf.f @@ -0,0 +1,361 @@ + SUBROUTINE ZTRSVF ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, LDA, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* ZTRSV solves one of the systems of equations +* +* A*x = b, or A'*x = b, or conjg( A' )*x = b, +* +* where b and x are n element vectors and A is an n by n unit, or +* non-unit, upper or lower triangular matrix. +* +* No test for singularity or near-singularity is included in this +* routine. Such tests must be performed before calling this routine. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the equations to be solved as +* follows: +* +* TRANS = 'N' or 'n' A*x = b. +* +* TRANS = 'T' or 't' A'*x = b. +* +* TRANS = 'C' or 'c' conjg( A' )*x = b. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* X - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element right-hand side vector b. On exit, X is overwritten +* with the solution vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP + INTEGER I, INFO, IX, J, JX, KX + LOGICAL NOCONJ, NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'R' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZTRSV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + NOUNIT = LSAME( DIAG , 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'R' ) ) THEN +* +* Form x := inv( A )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 20, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( J ) = X( J )/A( J, J ) + TEMP = X( J ) + DO 10, I = J - 1, 1, -1 + X( I ) = X( I ) - TEMP*A( I, J ) + 10 CONTINUE + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )/DCONJG(A( J, J )) + TEMP = X( J ) + DO 15, I = J - 1, 1, -1 + X( I ) = X( I ) - TEMP*DCONJG(A( I, J )) + 15 CONTINUE + ENDIF + END IF + 20 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 40, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/A( J, J ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )/DCONJG(A( J, J )) + ENDIF + TEMP = X( JX ) + IX = JX + DO 30, I = J - 1, 1, -1 + IX = IX - INCX + IF (NOCONJ) THEN + X( IX ) = X( IX ) - TEMP*A( I, J ) + ELSE + X( IX ) = X( IX ) - TEMP*DCONJG(A( I, J )) + ENDIF + 30 CONTINUE + END IF + JX = JX - INCX + 40 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( J ) = X( J )/A( J, J ) + TEMP = X( J ) + DO 50, I = J + 1, N + X( I ) = X( I ) - TEMP*A( I, J ) + 50 CONTINUE + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )/DCONJG(A( J, J )) + TEMP = X( J ) + DO 55, I = J + 1, N + X( I ) = X( I ) - TEMP*DCONJG(A( I, J )) + 55 CONTINUE + ENDIF + END IF + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/A( J, J ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )/DCONJG(A( J, J )) + ENDIF + TEMP = X( JX ) + IX = JX + DO 70, I = J + 1, N + IX = IX + INCX + IF (NOCONJ) THEN + X( IX ) = X( IX ) - TEMP*A( I, J ) + ELSE + X( IX ) = X( IX ) - TEMP*DCONJG(A( I, J )) + ENDIF + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := inv( A' )*x or x := inv( conjg( A' ) )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 110, J = 1, N + TEMP = X( J ) + IF( NOCONJ )THEN + DO 90, I = 1, J - 1 + TEMP = TEMP - A( I, J )*X( I ) + 90 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + ELSE + DO 100, I = 1, J - 1 + TEMP = TEMP - DCONJG( A( I, J ) )*X( I ) + 100 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/DCONJG( A( J, J ) ) + END IF + X( J ) = TEMP + 110 CONTINUE + ELSE + JX = KX + DO 140, J = 1, N + IX = KX + TEMP = X( JX ) + IF( NOCONJ )THEN + DO 120, I = 1, J - 1 + TEMP = TEMP - A( I, J )*X( IX ) + IX = IX + INCX + 120 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + ELSE + DO 130, I = 1, J - 1 + TEMP = TEMP - DCONJG( A( I, J ) )*X( IX ) + IX = IX + INCX + 130 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/DCONJG( A( J, J ) ) + END IF + X( JX ) = TEMP + JX = JX + INCX + 140 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 170, J = N, 1, -1 + TEMP = X( J ) + IF( NOCONJ )THEN + DO 150, I = N, J + 1, -1 + TEMP = TEMP - A( I, J )*X( I ) + 150 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + ELSE + DO 160, I = N, J + 1, -1 + TEMP = TEMP - DCONJG( A( I, J ) )*X( I ) + 160 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/DCONJG( A( J, J ) ) + END IF + X( J ) = TEMP + 170 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 200, J = N, 1, -1 + IX = KX + TEMP = X( JX ) + IF( NOCONJ )THEN + DO 180, I = N, J + 1, -1 + TEMP = TEMP - A( I, J )*X( IX ) + IX = IX - INCX + 180 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + ELSE + DO 190, I = N, J + 1, -1 + TEMP = TEMP - DCONJG( A( I, J ) )*X( IX ) + IX = IX - INCX + 190 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/DCONJG( A( J, J ) ) + END IF + X( JX ) = TEMP + JX = JX - INCX + 200 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of ZTRSV . +* + END diff --git a/reference/ztrti2f.f b/reference/ztrti2f.f new file mode 100644 index 0000000..a40d2ed --- /dev/null +++ b/reference/ztrti2f.f @@ -0,0 +1,146 @@ + SUBROUTINE ZTRTI2F( UPLO, DIAG, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER DIAG, UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* ZTRTI2 computes the inverse of a complex upper or lower triangular +* matrix. +* +* This is the Level 2 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the matrix A is upper or lower triangular. +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* DIAG (input) CHARACTER*1 +* Specifies whether or not the matrix A is unit triangular. +* = 'N': Non-unit triangular +* = 'U': Unit triangular +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the triangular matrix A. If UPLO = 'U', the +* leading n by n upper triangular part of the array A contains +* the upper triangular matrix, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading n by n lower triangular part of the array A contains +* the lower triangular matrix, and the strictly upper +* triangular part of A is not referenced. If DIAG = 'U', the +* diagonal elements of A are also not referenced and are +* assumed to be 1. +* +* On exit, the (triangular) inverse of the original matrix, in +* the same storage format. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL NOUNIT, UPPER + INTEGER J + COMPLEX*16 AJJ +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZSCAL, ZTRMV +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOUNIT = LSAME( DIAG, 'N' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZTRTI2', -INFO ) + RETURN + END IF +* + IF( UPPER ) THEN +* +* Compute inverse of upper triangular matrix. +* + DO 10 J = 1, N + IF( NOUNIT ) THEN + A( J, J ) = ONE / A( J, J ) + AJJ = -A( J, J ) + ELSE + AJJ = -ONE + END IF +* +* Compute elements 1:j-1 of j-th column. +* + CALL ZTRMV( 'Upper', 'No transpose', DIAG, J-1, A, LDA, + $ A( 1, J ), 1 ) + CALL ZSCAL( J-1, AJJ, A( 1, J ), 1 ) + 10 CONTINUE + ELSE +* +* Compute inverse of lower triangular matrix. +* + DO 20 J = N, 1, -1 + IF( NOUNIT ) THEN + A( J, J ) = ONE / A( J, J ) + AJJ = -A( J, J ) + ELSE + AJJ = -ONE + END IF + IF( J.LT.N ) THEN +* +* Compute elements j+1:n of j-th column. +* + CALL ZTRMV( 'Lower', 'No transpose', DIAG, N-J, + $ A( J+1, J+1 ), LDA, A( J+1, J ), 1 ) + CALL ZSCAL( N-J, AJJ, A( J+1, J ), 1 ) + END IF + 20 CONTINUE + END IF +* + RETURN +* +* End of ZTRTI2 +* + END diff --git a/reference/ztrtrif.f b/reference/ztrtrif.f new file mode 100644 index 0000000..f68caf4 --- /dev/null +++ b/reference/ztrtrif.f @@ -0,0 +1,177 @@ + SUBROUTINE ZTRTRIF( UPLO, DIAG, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* September 30, 1994 +* +* .. Scalar Arguments .. + CHARACTER DIAG, UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* ZTRTRI computes the inverse of a complex upper or lower triangular +* matrix A. +* +* This is the Level 3 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': A is upper triangular; +* = 'L': A is lower triangular. +* +* DIAG (input) CHARACTER*1 +* = 'N': A is non-unit triangular; +* = 'U': A is unit triangular. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the triangular matrix A. If UPLO = 'U', the +* leading N-by-N upper triangular part of the array A contains +* the upper triangular matrix, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading N-by-N lower triangular part of the array A contains +* the lower triangular matrix, and the strictly upper +* triangular part of A is not referenced. If DIAG = 'U', the +* diagonal elements of A are also not referenced and are +* assumed to be 1. +* On exit, the (triangular) inverse of the original matrix, in +* the same storage format. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, A(i,i) is exactly zero. The triangular +* matrix is singular and its inverse can not be computed. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ONE, ZERO + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), + $ ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL NOUNIT, UPPER + INTEGER J, JB, NB, NN +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZTRMM, ZTRSM, ZTRTI2 +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOUNIT = LSAME( DIAG, 'N' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZTRTRI', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Check for singularity if non-unit. +* + IF( NOUNIT ) THEN + DO 10 INFO = 1, N + IF( A( INFO, INFO ).EQ.ZERO ) + $ RETURN + 10 CONTINUE + INFO = 0 + END IF +* +* Determine the block size for this environment. +* + NB = 128 + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code +* + CALL ZTRTI2( UPLO, DIAG, N, A, LDA, INFO ) + ELSE +* +* Use blocked code +* + IF( UPPER ) THEN +* +* Compute inverse of upper triangular matrix +* + DO 20 J = 1, N, NB + JB = MIN( NB, N-J+1 ) +* +* Compute rows 1:j-1 of current block column +* + CALL ZTRMM( 'Left', 'Upper', 'No transpose', DIAG, J-1, + $ JB, ONE, A, LDA, A( 1, J ), LDA ) + CALL ZTRSM( 'Right', 'Upper', 'No transpose', DIAG, J-1, + $ JB, -ONE, A( J, J ), LDA, A( 1, J ), LDA ) +* +* Compute inverse of current diagonal block +* + CALL ZTRTI2( 'Upper', DIAG, JB, A( J, J ), LDA, INFO ) + 20 CONTINUE + ELSE +* +* Compute inverse of lower triangular matrix +* + NN = ( ( N-1 ) / NB )*NB + 1 + DO 30 J = NN, 1, -NB + JB = MIN( NB, N-J+1 ) + IF( J+JB.LE.N ) THEN +* +* Compute rows j+jb:n of current block column +* + CALL ZTRMM( 'Left', 'Lower', 'No transpose', DIAG, + $ N-J-JB+1, JB, ONE, A( J+JB, J+JB ), LDA, + $ A( J+JB, J ), LDA ) + CALL ZTRSM( 'Right', 'Lower', 'No transpose', DIAG, + $ N-J-JB+1, JB, -ONE, A( J, J ), LDA, + $ A( J+JB, J ), LDA ) + END IF +* +* Compute inverse of current diagonal block +* + CALL ZTRTI2( 'Lower', DIAG, JB, A( J, J ), LDA, INFO ) + 30 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZTRTRI +* + END diff --git a/symcopy.h b/symcopy.h new file mode 100644 index 0000000..ed6e5b4 --- /dev/null +++ b/symcopy.h @@ -0,0 +1,1873 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +/* This implementation is completely wrong. I'll rewrite this */ + +#ifndef SYMCOPY_H +#define SYMCOPY_H + +#if !defined(XDOUBLE) || !defined(QUAD_PRECISION) + +static inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a12; + FLOAT a21, a22; + + b1 = b; + b2 = b; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda + 2; + + bb1 = b1 + 0 * m; + bb2 = b1 + 1 * m; + b1 += 2 * m + 2; + + cc1 = b2 + 0 * m; + cc2 = b2 + 1 * m; + b2 += 2 * m + 2; + + if (m - js >= 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a21; + *(bb2 + 1) = a22; + aa1 += 2; + aa2 += 2; + bb1 += 2; + bb2 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + + is = ((m - js - 2) >> 1); + + while (is > 0){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + aa1 += 2; + aa2 += 2; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a12; + *(cc2 + 0) = a21; + *(cc2 + 1) = a22; + + bb1 += 2; + bb2 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + + is --; + } + + is = ((m - js - 2) & 1); + + if (is == 1){ + a11 = *(aa1 + 0); + a12 = *(aa2 + 0); + + *(bb1 + 0) = a11; + *(bb2 + 0) = a12; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a12; + } + } + + if (m - js == 1){ + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + } + + } +} + +static inline void SYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a12; + FLOAT a21, a22; + + b1 = b; + b2 = b; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda; + + bb1 = b1 + 0 * m; + bb2 = b1 + 1 * m; + b1 += 2 * m; + + cc1 = b2 + 0 * m; + cc2 = b2 + 1 * m; + b2 += 2; + + if (m - js >= 2){ + + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + aa1 += 2; + aa2 += 2; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a12; + *(cc2 + 0) = a21; + *(cc2 + 1) = a22; + + bb1 += 2; + bb2 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + } + + a11 = *(aa1 + 0); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a12; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + } + + if (m - js == 1){ + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + aa1 += 2; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(cc1 + 0) = a11; + *(cc2 + 0) = a21; + bb1 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + } + + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + } + } +} + + +static inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda + 4; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m + 4; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4 * m + 4; + + if (m - js >= 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 2); + a22 = *(aa2 + 3); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a31; + *(bb2 + 1) = a41; + *(bb2 + 2) = a12; + *(bb2 + 3) = a22; + + aa1 += 4; + aa2 += 4; + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is = ((m - js - 2) >> 1); + + while (is > 0){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is --; + } + + if (m & 1){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + } + } + + if (m - js == 1){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + } + + } +} + +static inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4; + + if (m - js >= 2){ + + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a12; + *(bb1 + 3) = a22; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + } + + if (m - js == 1){ + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + aa1 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + bb1 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + } + } +} + +static inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda + 4; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m + 4; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4 * m + 4; + + if (m - js >= 2){ + + a11 = *(aa1 + 0); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 2); + + *(bb1 + 0) = a11; + *(bb1 + 1) = 0.; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a31; + *(bb2 + 1) = -a41; + *(bb2 + 2) = a12; + *(bb2 + 3) = 0.; + + aa1 += 4; + aa2 += 4; + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is = ((m - js - 2) >> 1); + + while (is > 0){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = -a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = -a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = -a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = -a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is --; + } + + if (m & 1){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = -a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = -a22; + } + } + + if (m - js == 1){ + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + *(bb1 + 1) = 0.; + } + + } +} + +static inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4; + + if (m - js >= 2){ + + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = -a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = -a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = -a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = -a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + + *(bb1 + 0) = a11; + *(bb1 + 1) = 0.; + *(bb1 + 2) = a12; + *(bb1 + 3) = -a22; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = 0.; + } + + if (m - js == 1){ + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + aa1 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(cc1 + 0) = a11; + *(cc1 + 1) = -a21; + *(cc2 + 0) = a31; + *(cc2 + 1) = -a41; + bb1 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + *(bb1 + 1) = 0.; + } + } +} + + +static inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda + 4; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m + 4; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4 * m + 4; + + if (m - js >= 2){ + + a11 = *(aa1 + 0); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 2); + + *(bb1 + 0) = a11; + *(bb1 + 1) = 0.; + *(bb1 + 2) = a31; + *(bb1 + 3) = -a41; + + *(bb2 + 0) = a31; + *(bb2 + 1) = a41; + *(bb2 + 2) = a12; + *(bb2 + 3) = 0.; + + aa1 += 4; + aa2 += 4; + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is = ((m - js - 2) >> 1); + + while (is > 0){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = -a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = -a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = -a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = -a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is --; + } + + if (m & 1){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = -a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = -a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + } + } + + if (m - js == 1){ + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + *(bb1 + 1) = 0.; + } + + } +} + +static inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4; + + if (m - js >= 2){ + + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = -a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = -a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = -a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = -a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + + *(bb1 + 0) = a11; + *(bb1 + 1) = 0.; + *(bb1 + 2) = a12; + *(bb1 + 3) = a22; + + *(bb2 + 0) = a12; + *(bb2 + 1) = -a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = 0.; + } + + if (m - js == 1){ + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + aa1 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = -a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = -a41; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + bb1 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + *(bb1 + 1) = 0.; + } + } +} + + +static inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a12; + FLOAT a21, a22; + + b1 = b; + b2 = b; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda + 2; + + bb1 = b1 + 0 * m; + bb2 = b1 + 1 * m; + b1 += 2 * m + 2; + + cc1 = b2 + 0 * m; + cc2 = b2 + 1 * m; + b2 += 2 * m + 2; + + if (m - js >= 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a21; + *(bb2 + 1) = a22; + aa1 += 2; + aa2 += 2; + bb1 += 2; + bb2 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + + is = ((m - js - 2) >> 1); + + while (is > 0){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + aa1 += 2; + aa2 += 2; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a12; + *(cc2 + 0) = a21; + *(cc2 + 1) = a22; + + bb1 += 2; + bb2 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + + is --; + } + + is = ((m - js - 2) & 1); + + if (is == 1){ + a11 = *(aa1 + 0); + a12 = *(aa2 + 0); + + *(bb1 + 0) = a11; + *(bb2 + 0) = a12; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a12; + } + } + + if (m - js == 1){ + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + } + + } +} + +static inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a12; + FLOAT a21, a22; + + b1 = b; + b2 = b; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda + 2; + + bb1 = b1 + 0 * m; + bb2 = b1 + 1 * m; + b1 += 2 * m + 2; + + cc1 = b2 + 0 * m; + cc2 = b2 + 1 * m; + b2 += 2 * m + 2; + + if (m - js >= 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a21; + *(bb2 + 1) = a22; + aa1 += 2; + aa2 += 2; + bb1 += 2; + bb2 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + + is = ((m - js - 2) >> 1); + + while (is > 0){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + aa1 += 2; + aa2 += 2; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a12; + *(cc2 + 0) = a21; + *(cc2 + 1) = a22; + + bb1 += 2; + bb2 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + + is --; + } + + is = ((m - js - 2) & 1); + + if (is == 1){ + a11 = *(aa1 + 0); + a12 = *(aa2 + 0); + + *(bb1 + 0) = a11; + *(bb2 + 0) = a12; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a12; + } + } + + if (m - js == 1){ + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + } + + } +} + +static inline void TRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a12; + FLOAT a21, a22; + + b1 = b; + b2 = b; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda; + + bb1 = b1 + 0 * m; + bb2 = b1 + 1 * m; + b1 += 2 * m; + + cc1 = b2 + 0 * m; + cc2 = b2 + 1 * m; + b2 += 2; + + if (m - js >= 2){ + + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + aa1 += 2; + aa2 += 2; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a12; + *(cc2 + 0) = a21; + *(cc2 + 1) = a22; + + bb1 += 2; + bb2 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + } + + a11 = *(aa1 + 0); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a12; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + } + + if (m - js == 1){ + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + aa1 += 2; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(cc1 + 0) = a11; + *(cc2 + 0) = a21; + bb1 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + } + + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + } + } +} + +static inline void TRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a12; + FLOAT a21, a22; + + b1 = b; + b2 = b; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda; + + bb1 = b1 + 0 * m; + bb2 = b1 + 1 * m; + b1 += 2 * m; + + cc1 = b2 + 0 * m; + cc2 = b2 + 1 * m; + b2 += 2; + + if (m - js >= 2){ + + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + aa1 += 2; + aa2 += 2; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a12; + *(cc2 + 0) = a21; + *(cc2 + 1) = a22; + + bb1 += 2; + bb2 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + } + + a11 = *(aa1 + 0); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a12; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + } + + if (m - js == 1){ + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + aa1 += 2; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(cc1 + 0) = a11; + *(cc2 + 0) = a21; + bb1 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + } + + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + } + } +} + +static inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda + 4; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m + 4; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4 * m + 4; + + if (m - js >= 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 2); + a22 = *(aa2 + 3); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a31; + *(bb2 + 1) = a41; + *(bb2 + 2) = a12; + *(bb2 + 3) = a22; + + aa1 += 4; + aa2 += 4; + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is = ((m - js - 2) >> 1); + + while (is > 0){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is --; + } + + if (m & 1){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + } + } + + if (m - js == 1){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + } + + } +} + +static inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda + 4; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m + 4; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4 * m + 4; + + if (m - js >= 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 2); + a22 = *(aa2 + 3); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a31; + *(bb2 + 1) = a41; + *(bb2 + 2) = a12; + *(bb2 + 3) = a22; + + aa1 += 4; + aa2 += 4; + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is = ((m - js - 2) >> 1); + + while (is > 0){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is --; + } + + if (m & 1){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + } + } + + if (m - js == 1){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + } + + } +} + +static inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4; + + if (m - js >= 2){ + + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a12; + *(bb1 + 3) = a22; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + } + + if (m - js == 1){ + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + aa1 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + bb1 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + } + } +} + +static inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4; + + if (m - js >= 2){ + + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a12; + *(bb1 + 3) = a22; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + } + + if (m - js == 1){ + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + aa1 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + bb1 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + } + } +} + +#endif +#endif + diff --git a/test/._Makefile b/test/._Makefile new file mode 100644 index 0000000..f3cd5cf Binary files /dev/null and b/test/._Makefile differ diff --git a/test/._cblat1.f b/test/._cblat1.f new file mode 100644 index 0000000..f29529f Binary files /dev/null and b/test/._cblat1.f differ diff --git a/test/._cblat2.dat b/test/._cblat2.dat new file mode 100644 index 0000000..91a3a82 Binary files /dev/null and b/test/._cblat2.dat differ diff --git a/test/._cblat2.f b/test/._cblat2.f new file mode 100644 index 0000000..ab90d52 Binary files /dev/null and b/test/._cblat2.f differ diff --git a/test/._cblat3.dat b/test/._cblat3.dat new file mode 100644 index 0000000..6a11154 Binary files /dev/null and b/test/._cblat3.dat differ diff --git a/test/._cblat3.f b/test/._cblat3.f new file mode 100644 index 0000000..f668014 Binary files /dev/null and b/test/._cblat3.f differ diff --git a/test/._dblat1.f b/test/._dblat1.f new file mode 100644 index 0000000..78b2699 Binary files /dev/null and b/test/._dblat1.f differ diff --git a/test/._dblat2.dat b/test/._dblat2.dat new file mode 100644 index 0000000..4281a32 Binary files /dev/null and b/test/._dblat2.dat differ diff --git a/test/._dblat2.f b/test/._dblat2.f new file mode 100644 index 0000000..6a4144e Binary files /dev/null and b/test/._dblat2.f differ diff --git a/test/._dblat3.dat b/test/._dblat3.dat new file mode 100644 index 0000000..41ddaf1 Binary files /dev/null and b/test/._dblat3.dat differ diff --git a/test/._dblat3.f b/test/._dblat3.f new file mode 100644 index 0000000..0c8a42d Binary files /dev/null and b/test/._dblat3.f differ diff --git a/test/._sblat1.f b/test/._sblat1.f new file mode 100644 index 0000000..37c8189 Binary files /dev/null and b/test/._sblat1.f differ diff --git a/test/._sblat2.dat b/test/._sblat2.dat new file mode 100644 index 0000000..f89d6db Binary files /dev/null and b/test/._sblat2.dat differ diff --git a/test/._sblat2.f b/test/._sblat2.f new file mode 100644 index 0000000..a29bbe7 Binary files /dev/null and b/test/._sblat2.f differ diff --git a/test/._sblat3.dat b/test/._sblat3.dat new file mode 100644 index 0000000..e1daf47 Binary files /dev/null and b/test/._sblat3.dat differ diff --git a/test/._sblat3.f b/test/._sblat3.f new file mode 100644 index 0000000..4432c61 Binary files /dev/null and b/test/._sblat3.f differ diff --git a/test/._zblat1.f b/test/._zblat1.f new file mode 100644 index 0000000..9cecc13 Binary files /dev/null and b/test/._zblat1.f differ diff --git a/test/._zblat2.dat b/test/._zblat2.dat new file mode 100644 index 0000000..b479c2d Binary files /dev/null and b/test/._zblat2.dat differ diff --git a/test/._zblat2.f b/test/._zblat2.f new file mode 100644 index 0000000..8fdbb4d Binary files /dev/null and b/test/._zblat2.f differ diff --git a/test/._zblat3.dat b/test/._zblat3.dat new file mode 100644 index 0000000..0559d11 Binary files /dev/null and b/test/._zblat3.dat differ diff --git a/test/._zblat3.f b/test/._zblat3.f new file mode 100644 index 0000000..7b34b3c Binary files /dev/null and b/test/._zblat3.f differ diff --git a/test/LICENSE b/test/LICENSE new file mode 100644 index 0000000..85061f2 --- /dev/null +++ b/test/LICENSE @@ -0,0 +1,23 @@ +This directory contains the reference implementation of BLAS +which is obtainable at: http://netlib.org/blas/ + +The license, obtained from http://netlib.org/blas/faq.html#2 on November 3, +2010, is as follows: + +2) Are there legal restrictions on the use of BLAS reference implementation +software? + +The reference BLAS is a freely-available software package. It is available from +netlib via anonymous ftp and the World Wide Web. Thus, it can be included in +commercial software packages (and has been). We only ask that proper credit be +given to the authors. + +Like all software, it is copyrighted. It is not trademarked, but we do ask the +following: + +If you modify the source for these routines we ask that you change the name of +the routine and comment the changes made to the original. + +We will gladly answer any questions regarding the software. If a modification +is done, however, it is the responsibility of the person who modified the +routine to provide support. diff --git a/test/Makefile b/test/Makefile new file mode 100644 index 0000000..4f6ca91 --- /dev/null +++ b/test/Makefile @@ -0,0 +1,122 @@ +TOPDIR = .. +include ../Makefile.system + +all :: level1 level2 level3 + +level1 : sblat1 dblat1 cblat1 zblat1 + GOTO_NUM_THREADS=1 ./sblat1 + GOTO_NUM_THREADS=1 ./dblat1 + GOTO_NUM_THREADS=1 ./cblat1 + GOTO_NUM_THREADS=1 ./zblat1 +ifdef SMP + GOTO_NUM_THREADS=2 ./sblat1 + GOTO_NUM_THREADS=2 ./dblat1 + GOTO_NUM_THREADS=2 ./cblat1 + GOTO_NUM_THREADS=2 ./zblat1 +endif + +level2 : sblat2 dblat2 cblat2 zblat2 + rm -f ?BLAT2.SUMM + GOTO_NUM_THREADS=1 ./sblat2 < ./sblat2.dat + @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 + GOTO_NUM_THREADS=1 ./dblat2 < ./dblat2.dat + @$(GREP) -q FATAL DBLAT2.SUMM && cat DBLAT2.SUMM || exit 0 + GOTO_NUM_THREADS=1 ./cblat2 < ./cblat2.dat + @$(GREP) -q FATAL CBLAT2.SUMM && cat CBLAT2.SUMM || exit 0 + GOTO_NUM_THREADS=1 ./zblat2 < ./zblat2.dat + @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 +ifdef SMP + rm -f ?BLAT2.SUMM + GOTO_NUM_THREADS=2 ./sblat2 < ./sblat2.dat + @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 + GOTO_NUM_THREADS=2 ./dblat2 < ./dblat2.dat + @$(GREP) -q FATAL DBLAT2.SUMM && cat DBLAT2.SUMM || exit 0 + GOTO_NUM_THREADS=2 ./cblat2 < ./cblat2.dat + @$(GREP) -q FATAL CBLAT2.SUMM && cat CBLAT2.SUMM || exit 0 + GOTO_NUM_THREADS=2 ./zblat2 < ./zblat2.dat + @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 +endif + +level3 : sblat3 dblat3 cblat3 zblat3 + rm -f ?BLAT3.SUMM + GOTO_NUM_THREADS=1 ./sblat3 < ./sblat3.dat + @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 + GOTO_NUM_THREADS=1 ./dblat3 < ./dblat3.dat + @$(GREP) -q FATAL DBLAT3.SUMM && cat DBLAT3.SUMM || exit 0 + GOTO_NUM_THREADS=1 ./cblat3 < ./cblat3.dat + @$(GREP) -q FATAL CBLAT3.SUMM && cat CBLAT3.SUMM || exit 0 + GOTO_NUM_THREADS=1 ./zblat3 < ./zblat3.dat + @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 +ifdef SMP + rm -f ?BLAT3.SUMM + GOTO_NUM_THREADS=2 ./sblat3 < ./sblat3.dat + @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 + GOTO_NUM_THREADS=2 ./dblat3 < ./dblat3.dat + @$(GREP) -q FATAL DBLAT3.SUMM && cat DBLAT3.SUMM || exit 0 + GOTO_NUM_THREADS=2 ./cblat3 < ./cblat3.dat + @$(GREP) -q FATAL CBLAT3.SUMM && cat CBLAT3.SUMM || exit 0 + GOTO_NUM_THREADS=2 ./zblat3 < ./zblat3.dat + @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 +endif + +FLDFLAGS = $(FFLAGS:-fPIC=) +CEXTRALIB = + + +sblat1 : sblat1.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o sblat1 sblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +dblat1 : dblat1.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o dblat1 dblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +qblat1 : qblat1.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o qblat1 qblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +cblat1 : cblat1.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o cblat1 cblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +zblat1 : zblat1.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o zblat1 zblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +sblat2 : sblat2.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o sblat2 sblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +dblat2 : dblat2.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o dblat2 dblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +cblat2 : cblat2.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o cblat2 cblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +zblat2 : zblat2.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o zblat2 zblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +sblat3 : sblat3.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o sblat3 sblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +dblat3 : dblat3.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o dblat3 dblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +cblat3 : cblat3.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o cblat3 cblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +zblat3 : zblat3.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o zblat3 zblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +clean: + @rm -f *.$(SUFFIX) *.$(PSUFFIX) gmon.$(SUFFIX)ut *.SUMM *.cxml *.exe *.pdb *.dwf \ + sblat1 dblat1 cblat1 zblat1 \ + sblat2 dblat2 cblat2 zblat2 \ + sblat3 dblat3 cblat3 zblat3 \ + sblat1p dblat1p cblat1p zblat1p \ + sblat2p dblat2p cblat2p zblat2p \ + sblat3p dblat3p cblat3p zblat3p \ + *.stackdump *.dll + +libs: + +prof: + +quick : + $(MAKE) -C $(TOPDIR) libs + +# include ../Makefile.tail diff --git a/test/cblat1.f b/test/cblat1.f new file mode 100644 index 0000000..a4c996f --- /dev/null +++ b/test/cblat1.f @@ -0,0 +1,681 @@ + PROGRAM CBLAT1 +* Test program for the COMPLEX Level 1 BLAS. +* Based upon the original BLAS test routine together with: +* F06GAF Example Program Text +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SFAC + INTEGER IC +* .. External Subroutines .. + EXTERNAL CHECK1, CHECK2, HEADER +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SFAC/9.765625E-4/ +* .. Executable Statements .. + WRITE (NOUT,99999) + DO 20 IC = 1, 10 + ICASE = IC + CALL HEADER +* +* Initialize PASS, INCX, INCY, and MODE for a new case. +* The value 9999 for INCX, INCY or MODE will appear in the +* detailed output, if any, for cases that do not involve +* these parameters. +* + PASS = .TRUE. + INCX = 9999 + INCY = 9999 + MODE = 9999 + IF (ICASE.LE.5) THEN + CALL CHECK2(SFAC) + ELSE IF (ICASE.GE.6) THEN + CALL CHECK1(SFAC) + END IF +* -- Print + IF (PASS) WRITE (NOUT,99998) + 20 CONTINUE + STOP +* +99999 FORMAT (' Complex BLAS Test Program Results',/1X) +99998 FORMAT (' ----- PASS -----') + END + SUBROUTINE HEADER +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Arrays .. + CHARACTER*6 L(10) +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA L(1)/'CDOTC '/ + DATA L(2)/'CDOTU '/ + DATA L(3)/'CAXPY '/ + DATA L(4)/'CCOPY '/ + DATA L(5)/'CSWAP '/ + DATA L(6)/'SCNRM2'/ + DATA L(7)/'SCASUM'/ + DATA L(8)/'CSCAL '/ + DATA L(9)/'CSSCAL'/ + DATA L(10)/'ICAMAX'/ +* .. Executable Statements .. + WRITE (NOUT,99999) ICASE, L(ICASE) + RETURN +* +99999 FORMAT (/' Test of subprogram number',I3,12X,A6) + END + SUBROUTINE CHECK1(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + COMPLEX CA + REAL SA + INTEGER I, J, LEN, NP1 +* .. Local Arrays .. + COMPLEX CTRUE5(8,5,2), CTRUE6(8,5,2), CV(8,5,2), CX(8), + + MWPCS(5), MWPCT(5) + REAL STRUE2(5), STRUE4(5) + INTEGER ITRUE3(5) +* .. External Functions .. + REAL SCASUM, SCNRM2 + INTEGER ICAMAX + EXTERNAL SCASUM, SCNRM2, ICAMAX +* .. External Subroutines .. + EXTERNAL CSCAL, CSSCAL, CTEST, ITEST1, STEST1 +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA, CA/0.3E0, (0.4E0,-0.7E0)/ + DATA ((CV(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (0.3E0,-0.4E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (0.1E0,-0.3E0), (0.5E0,-0.1E0), (5.0E0,6.0E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (0.1E0,0.1E0), + + (-0.6E0,0.1E0), (0.1E0,-0.3E0), (7.0E0,8.0E0), + + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + + (7.0E0,8.0E0), (0.3E0,0.1E0), (0.1E0,0.4E0), + + (0.4E0,0.1E0), (0.1E0,0.2E0), (2.0E0,3.0E0), + + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0)/ + DATA ((CV(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (0.3E0,-0.4E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (0.1E0,-0.3E0), (8.0E0,9.0E0), (0.5E0,-0.1E0), + + (2.0E0,5.0E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + + (2.0E0,5.0E0), (2.0E0,5.0E0), (0.1E0,0.1E0), + + (3.0E0,6.0E0), (-0.6E0,0.1E0), (4.0E0,7.0E0), + + (0.1E0,-0.3E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + + (7.0E0,2.0E0), (0.3E0,0.1E0), (5.0E0,8.0E0), + + (0.1E0,0.4E0), (6.0E0,9.0E0), (0.4E0,0.1E0), + + (8.0E0,3.0E0), (0.1E0,0.2E0), (9.0E0,4.0E0)/ + DATA STRUE2/0.0E0, 0.5E0, 0.6E0, 0.7E0, 0.7E0/ + DATA STRUE4/0.0E0, 0.7E0, 1.0E0, 1.3E0, 1.7E0/ + DATA ((CTRUE5(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (-0.16E0,-0.37E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (-0.17E0,-0.19E0), (0.13E0,-0.39E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + + (0.11E0,-0.03E0), (-0.17E0,0.46E0), + + (-0.17E0,-0.19E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + + (0.19E0,-0.17E0), (0.32E0,0.09E0), + + (0.23E0,-0.24E0), (0.18E0,0.01E0), + + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0), + + (2.0E0,3.0E0)/ + DATA ((CTRUE5(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (-0.16E0,-0.37E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (-0.17E0,-0.19E0), (8.0E0,9.0E0), + + (0.13E0,-0.39E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + + (2.0E0,5.0E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + + (0.11E0,-0.03E0), (3.0E0,6.0E0), + + (-0.17E0,0.46E0), (4.0E0,7.0E0), + + (-0.17E0,-0.19E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + + (7.0E0,2.0E0), (0.19E0,-0.17E0), (5.0E0,8.0E0), + + (0.32E0,0.09E0), (6.0E0,9.0E0), + + (0.23E0,-0.24E0), (8.0E0,3.0E0), + + (0.18E0,0.01E0), (9.0E0,4.0E0)/ + DATA ((CTRUE6(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (0.09E0,-0.12E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (0.03E0,-0.09E0), (0.15E0,-0.03E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + + (0.03E0,0.03E0), (-0.18E0,0.03E0), + + (0.03E0,-0.09E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + + (0.09E0,0.03E0), (0.03E0,0.12E0), + + (0.12E0,0.03E0), (0.03E0,0.06E0), (2.0E0,3.0E0), + + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0)/ + DATA ((CTRUE6(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (0.09E0,-0.12E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (0.03E0,-0.09E0), (8.0E0,9.0E0), + + (0.15E0,-0.03E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + + (2.0E0,5.0E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + + (0.03E0,0.03E0), (3.0E0,6.0E0), + + (-0.18E0,0.03E0), (4.0E0,7.0E0), + + (0.03E0,-0.09E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + + (7.0E0,2.0E0), (0.09E0,0.03E0), (5.0E0,8.0E0), + + (0.03E0,0.12E0), (6.0E0,9.0E0), (0.12E0,0.03E0), + + (8.0E0,3.0E0), (0.03E0,0.06E0), (9.0E0,4.0E0)/ + DATA ITRUE3/0, 1, 2, 2, 2/ +* .. Executable Statements .. + DO 60 INCX = 1, 2 + DO 40 NP1 = 1, 5 + N = NP1 - 1 + LEN = 2*MAX(N,1) +* .. Set vector arguments .. + DO 20 I = 1, LEN + CX(I) = CV(I,NP1,INCX) + 20 CONTINUE + IF (ICASE.EQ.6) THEN +* .. SCNRM2 .. + CALL STEST1(SCNRM2(N,CX,INCX),STRUE2(NP1),STRUE2(NP1), + + SFAC) + ELSE IF (ICASE.EQ.7) THEN +* .. SCASUM .. + CALL STEST1(SCASUM(N,CX,INCX),STRUE4(NP1),STRUE4(NP1), + + SFAC) + ELSE IF (ICASE.EQ.8) THEN +* .. CSCAL .. + CALL CSCAL(N,CA,CX,INCX) + CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX), + + SFAC) + ELSE IF (ICASE.EQ.9) THEN +* .. CSSCAL .. + CALL CSSCAL(N,SA,CX,INCX) + CALL CTEST(LEN,CX,CTRUE6(1,NP1,INCX),CTRUE6(1,NP1,INCX), + + SFAC) + ELSE IF (ICASE.EQ.10) THEN +* .. ICAMAX .. + CALL ITEST1(ICAMAX(N,CX,INCX),ITRUE3(NP1)) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' + STOP + END IF +* + 40 CONTINUE + 60 CONTINUE +* + INCX = 1 + IF (ICASE.EQ.8) THEN +* CSCAL +* Add a test for alpha equal to zero. + CA = (0.0E0,0.0E0) + DO 80 I = 1, 5 + MWPCT(I) = (0.0E0,0.0E0) + MWPCS(I) = (1.0E0,1.0E0) + 80 CONTINUE + CALL CSCAL(5,CA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) + ELSE IF (ICASE.EQ.9) THEN +* CSSCAL +* Add a test for alpha equal to zero. + SA = 0.0E0 + DO 100 I = 1, 5 + MWPCT(I) = (0.0E0,0.0E0) + MWPCS(I) = (1.0E0,1.0E0) + 100 CONTINUE + CALL CSSCAL(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) +* Add a test for alpha equal to one. + SA = 1.0E0 + DO 120 I = 1, 5 + MWPCT(I) = CX(I) + MWPCS(I) = CX(I) + 120 CONTINUE + CALL CSSCAL(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) +* Add a test for alpha equal to minus one. + SA = -1.0E0 + DO 140 I = 1, 5 + MWPCT(I) = -CX(I) + MWPCS(I) = -CX(I) + 140 CONTINUE + CALL CSSCAL(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) + END IF + RETURN + END + SUBROUTINE CHECK2(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + COMPLEX CA + INTEGER I, J, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + COMPLEX CDOT(1), CSIZE1(4), CSIZE2(7,2), CSIZE3(14), + + CT10X(7,4,4), CT10Y(7,4,4), CT6(4,4), CT7(4,4), + + CT8(7,4,4), CX(7), CX1(7), CY(7), CY1(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), NS(4) +* .. External Functions .. + COMPLEX CDOTC, CDOTU + EXTERNAL CDOTC, CDOTU +* .. External Subroutines .. + EXTERNAL CAXPY, CCOPY, CSWAP, CTEST +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA CA/(0.4E0,-0.7E0)/ + DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA CX1/(0.7E0,-0.8E0), (-0.4E0,-0.7E0), + + (-0.1E0,-0.9E0), (0.2E0,-0.8E0), + + (-0.9E0,-0.4E0), (0.1E0,0.4E0), (-0.6E0,0.6E0)/ + DATA CY1/(0.6E0,-0.6E0), (-0.9E0,0.5E0), + + (0.7E0,-0.6E0), (0.1E0,-0.5E0), (-0.1E0,-0.2E0), + + (-0.5E0,-0.3E0), (0.8E0,-0.7E0)/ + DATA ((CT8(I,J,1),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.32E0,-1.41E0), + + (-1.55E0,0.5E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.32E0,-1.41E0), (-1.55E0,0.5E0), + + (0.03E0,-0.89E0), (-0.38E0,-0.96E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/ + DATA ((CT8(I,J,2),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (-0.07E0,-0.89E0), + + (-0.9E0,0.5E0), (0.42E0,-1.41E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.78E0,0.06E0), (-0.9E0,0.5E0), + + (0.06E0,-0.13E0), (0.1E0,-0.5E0), + + (-0.77E0,-0.49E0), (-0.5E0,-0.3E0), + + (0.52E0,-1.51E0)/ + DATA ((CT8(I,J,3),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (-0.07E0,-0.89E0), + + (-1.18E0,-0.31E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.78E0,0.06E0), (-1.54E0,0.97E0), + + (0.03E0,-0.89E0), (-0.18E0,-1.31E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/ + DATA ((CT8(I,J,4),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.32E0,-1.41E0), (-0.9E0,0.5E0), + + (0.05E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.32E0,-1.41E0), + + (-0.9E0,0.5E0), (0.05E0,-0.6E0), (0.1E0,-0.5E0), + + (-0.77E0,-0.49E0), (-0.5E0,-0.3E0), + + (0.32E0,-1.16E0)/ + DATA CT7/(0.0E0,0.0E0), (-0.06E0,-0.90E0), + + (0.65E0,-0.47E0), (-0.34E0,-1.22E0), + + (0.0E0,0.0E0), (-0.06E0,-0.90E0), + + (-0.59E0,-1.46E0), (-1.04E0,-0.04E0), + + (0.0E0,0.0E0), (-0.06E0,-0.90E0), + + (-0.83E0,0.59E0), (0.07E0,-0.37E0), + + (0.0E0,0.0E0), (-0.06E0,-0.90E0), + + (-0.76E0,-1.15E0), (-1.33E0,-1.82E0)/ + DATA CT6/(0.0E0,0.0E0), (0.90E0,0.06E0), + + (0.91E0,-0.77E0), (1.80E0,-0.10E0), + + (0.0E0,0.0E0), (0.90E0,0.06E0), (1.45E0,0.74E0), + + (0.20E0,0.90E0), (0.0E0,0.0E0), (0.90E0,0.06E0), + + (-0.55E0,0.23E0), (0.83E0,-0.39E0), + + (0.0E0,0.0E0), (0.90E0,0.06E0), (1.04E0,0.79E0), + + (1.95E0,1.22E0)/ + DATA ((CT10X(I,J,1),I=1,7),J=1,4)/(0.7E0,-0.8E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.6E0,-0.6E0), (-0.9E0,0.5E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.6E0,-0.6E0), + + (-0.9E0,0.5E0), (0.7E0,-0.6E0), (0.1E0,-0.5E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/ + DATA ((CT10X(I,J,2),I=1,7),J=1,4)/(0.7E0,-0.8E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.7E0,-0.6E0), (-0.4E0,-0.7E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.8E0,-0.7E0), + + (-0.4E0,-0.7E0), (-0.1E0,-0.2E0), + + (0.2E0,-0.8E0), (0.7E0,-0.6E0), (0.1E0,0.4E0), + + (0.6E0,-0.6E0)/ + DATA ((CT10X(I,J,3),I=1,7),J=1,4)/(0.7E0,-0.8E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (-0.9E0,0.5E0), (-0.4E0,-0.7E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.1E0,-0.5E0), + + (-0.4E0,-0.7E0), (0.7E0,-0.6E0), (0.2E0,-0.8E0), + + (-0.9E0,0.5E0), (0.1E0,0.4E0), (0.6E0,-0.6E0)/ + DATA ((CT10X(I,J,4),I=1,7),J=1,4)/(0.7E0,-0.8E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.6E0,-0.6E0), (0.7E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.6E0,-0.6E0), + + (0.7E0,-0.6E0), (-0.1E0,-0.2E0), (0.8E0,-0.7E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/ + DATA ((CT10Y(I,J,1),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.7E0,-0.8E0), (-0.4E0,-0.7E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.7E0,-0.8E0), + + (-0.4E0,-0.7E0), (-0.1E0,-0.9E0), + + (0.2E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0)/ + DATA ((CT10Y(I,J,2),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (-0.1E0,-0.9E0), (-0.9E0,0.5E0), + + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (-0.6E0,0.6E0), + + (-0.9E0,0.5E0), (-0.9E0,-0.4E0), (0.1E0,-0.5E0), + + (-0.1E0,-0.9E0), (-0.5E0,-0.3E0), + + (0.7E0,-0.8E0)/ + DATA ((CT10Y(I,J,3),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (-0.1E0,-0.9E0), (0.7E0,-0.8E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (-0.6E0,0.6E0), + + (-0.9E0,-0.4E0), (-0.1E0,-0.9E0), + + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0)/ + DATA ((CT10Y(I,J,4),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.7E0,-0.8E0), (-0.9E0,0.5E0), + + (-0.4E0,-0.7E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.7E0,-0.8E0), + + (-0.9E0,0.5E0), (-0.4E0,-0.7E0), (0.1E0,-0.5E0), + + (-0.1E0,-0.9E0), (-0.5E0,-0.3E0), + + (0.2E0,-0.8E0)/ + DATA CSIZE1/(0.0E0,0.0E0), (0.9E0,0.9E0), + + (1.63E0,1.73E0), (2.90E0,2.78E0)/ + DATA CSIZE3/(0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (1.17E0,1.17E0), + + (1.17E0,1.17E0), (1.17E0,1.17E0), + + (1.17E0,1.17E0), (1.17E0,1.17E0), + + (1.17E0,1.17E0), (1.17E0,1.17E0)/ + DATA CSIZE2/(0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (1.54E0,1.54E0), + + (1.54E0,1.54E0), (1.54E0,1.54E0), + + (1.54E0,1.54E0), (1.54E0,1.54E0), + + (1.54E0,1.54E0), (1.54E0,1.54E0)/ +* .. Executable Statements .. + DO 60 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 40 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* .. initialize all argument arrays .. + DO 20 I = 1, 7 + CX(I) = CX1(I) + CY(I) = CY1(I) + 20 CONTINUE + IF (ICASE.EQ.1) THEN +* .. CDOTC .. + CDOT(1) = CDOTC(N,CX,INCX,CY,INCY) + CALL CTEST(1,CDOT,CT6(KN,KI),CSIZE1(KN),SFAC) + ELSE IF (ICASE.EQ.2) THEN +* .. CDOTU .. + CDOT(1) = CDOTU(N,CX,INCX,CY,INCY) + CALL CTEST(1,CDOT,CT7(KN,KI),CSIZE1(KN),SFAC) + ELSE IF (ICASE.EQ.3) THEN +* .. CAXPY .. + CALL CAXPY(N,CA,CX,INCX,CY,INCY) + CALL CTEST(LENY,CY,CT8(1,KN,KI),CSIZE2(1,KSIZE),SFAC) + ELSE IF (ICASE.EQ.4) THEN +* .. CCOPY .. + CALL CCOPY(N,CX,INCX,CY,INCY) + CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0E0) + ELSE IF (ICASE.EQ.5) THEN +* .. CSWAP .. + CALL CSWAP(N,CX,INCX,CY,INCY) + CALL CTEST(LENX,CX,CT10X(1,KN,KI),CSIZE3,1.0E0) + CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0E0) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' + STOP + END IF +* + 40 CONTINUE + 60 CONTINUE + RETURN + END + SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) +* ********************************* STEST ************************** +* +* THIS SUBR COMPARES ARRAYS SCOMP() AND STRUE() OF LENGTH LEN TO +* SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE +* NEGLIGIBLE. +* +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC + INTEGER LEN +* .. Array Arguments .. + REAL SCOMP(LEN), SSIZE(LEN), STRUE(LEN) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SD + INTEGER I +* .. External Functions .. + REAL SDIFF + EXTERNAL SDIFF +* .. Intrinsic Functions .. + INTRINSIC ABS +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + DO 40 I = 1, LEN + SD = SCOMP(I) - STRUE(I) + IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0E0) + + GO TO 40 +* +* HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I), + + STRUE(I), SD, SSIZE(I) + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE I ', + + ' COMP(I) TRUE(I) DIFFERENCE', + + ' SIZE(I)',/1X) +99997 FORMAT (1X,I4,I3,3I5,I3,2E36.8,2E12.4) + END + SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) +* ************************* STEST1 ***************************** +* +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE +* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + REAL SCOMP1, SFAC, STRUE1 +* .. Array Arguments .. + REAL SSIZE(*) +* .. Local Arrays .. + REAL SCOMP(1), STRUE(1) +* .. External Subroutines .. + EXTERNAL STEST +* .. Executable Statements .. +* + SCOMP(1) = SCOMP1 + STRUE(1) = STRUE1 + CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC) +* + RETURN + END + REAL FUNCTION SDIFF(SA,SB) +* ********************************* SDIFF ************************** +* COMPUTES DIFFERENCE OF TWO NUMBERS. C. L. LAWSON, JPL 1974 FEB 15 +* +* .. Scalar Arguments .. + REAL SA, SB +* .. Executable Statements .. + SDIFF = SA - SB + RETURN + END + SUBROUTINE CTEST(LEN,CCOMP,CTRUE,CSIZE,SFAC) +* **************************** CTEST ***************************** +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + REAL SFAC + INTEGER LEN +* .. Array Arguments .. + COMPLEX CCOMP(LEN), CSIZE(LEN), CTRUE(LEN) +* .. Local Scalars .. + INTEGER I +* .. Local Arrays .. + REAL SCOMP(20), SSIZE(20), STRUE(20) +* .. External Subroutines .. + EXTERNAL STEST +* .. Intrinsic Functions .. + INTRINSIC AIMAG, REAL +* .. Executable Statements .. + DO 20 I = 1, LEN + SCOMP(2*I-1) = REAL(CCOMP(I)) + SCOMP(2*I) = AIMAG(CCOMP(I)) + STRUE(2*I-1) = REAL(CTRUE(I)) + STRUE(2*I) = AIMAG(CTRUE(I)) + SSIZE(2*I-1) = REAL(CSIZE(I)) + SSIZE(2*I) = AIMAG(CSIZE(I)) + 20 CONTINUE +* + CALL STEST(2*LEN,SCOMP,STRUE,SSIZE,SFAC) + RETURN + END + SUBROUTINE ITEST1(ICOMP,ITRUE) +* ********************************* ITEST1 ************************* +* +* THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR +* EQUALITY. +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + INTEGER ICOMP, ITRUE +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER ID +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. + IF (ICOMP.EQ.ITRUE) GO TO 40 +* +* HERE ICOMP IS NOT EQUAL TO ITRUE. +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 ID = ICOMP - ITRUE + WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE ', + + ' COMP TRUE DIFFERENCE', + + /1X) +99997 FORMAT (1X,I4,I3,3I5,2I36,I12) + END diff --git a/test/cblat2.dat b/test/cblat2.dat new file mode 100644 index 0000000..1c6e315 --- /dev/null +++ b/test/cblat2.dat @@ -0,0 +1,35 @@ +'CBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE +6 UNIT NUMBER OF SUMMARY FILE +'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +16.0 THRESHOLD VALUE OF TEST RATIO +7 NUMBER OF VALUES OF N +0 1 2 3 7 31 63 VALUES OF N +4 NUMBER OF VALUES OF K +0 1 2 4 VALUES OF K +4 NUMBER OF VALUES OF INCX AND INCY +1 2 -1 -2 VALUES OF INCX AND INCY +3 NUMBER OF VALUES OF ALPHA +(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +CGEMV T PUT F FOR NO TEST. SAME COLUMNS. +CGBMV T PUT F FOR NO TEST. SAME COLUMNS. +CHEMV T PUT F FOR NO TEST. SAME COLUMNS. +CHBMV T PUT F FOR NO TEST. SAME COLUMNS. +CHPMV T PUT F FOR NO TEST. SAME COLUMNS. +CTRMV T PUT F FOR NO TEST. SAME COLUMNS. +CTBMV T PUT F FOR NO TEST. SAME COLUMNS. +CTPMV T PUT F FOR NO TEST. SAME COLUMNS. +CTRSV T PUT F FOR NO TEST. SAME COLUMNS. +CTBSV T PUT F FOR NO TEST. SAME COLUMNS. +CTPSV T PUT F FOR NO TEST. SAME COLUMNS. +CGERC T PUT F FOR NO TEST. SAME COLUMNS. +CGERU T PUT F FOR NO TEST. SAME COLUMNS. +CHER T PUT F FOR NO TEST. SAME COLUMNS. +CHPR T PUT F FOR NO TEST. SAME COLUMNS. +CHER2 T PUT F FOR NO TEST. SAME COLUMNS. +CHPR2 T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/test/cblat2.f b/test/cblat2.f new file mode 100644 index 0000000..20f1881 --- /dev/null +++ b/test/cblat2.f @@ -0,0 +1,3241 @@ + PROGRAM CBLAT2 +* +* Test program for the COMPLEX Level 2 Blas. +* +* The program must be driven by a short data file. The first 18 records +* of the file are read using list-directed input, the last 17 records +* are read using the format ( A6, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 35 lines: +* 'CBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE +* 6 UNIT NUMBER OF SUMMARY FILE +* 'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 4 NUMBER OF VALUES OF K +* 0 1 2 4 VALUES OF K +* 4 NUMBER OF VALUES OF INCX AND INCY +* 1 2 -1 -2 VALUES OF INCX AND INCY +* 3 NUMBER OF VALUES OF ALPHA +* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +* CGEMV T PUT F FOR NO TEST. SAME COLUMNS. +* CGBMV T PUT F FOR NO TEST. SAME COLUMNS. +* CHEMV T PUT F FOR NO TEST. SAME COLUMNS. +* CHBMV T PUT F FOR NO TEST. SAME COLUMNS. +* CHPMV T PUT F FOR NO TEST. SAME COLUMNS. +* CTRMV T PUT F FOR NO TEST. SAME COLUMNS. +* CTBMV T PUT F FOR NO TEST. SAME COLUMNS. +* CTPMV T PUT F FOR NO TEST. SAME COLUMNS. +* CTRSV T PUT F FOR NO TEST. SAME COLUMNS. +* CTBSV T PUT F FOR NO TEST. SAME COLUMNS. +* CTPSV T PUT F FOR NO TEST. SAME COLUMNS. +* CGERC T PUT F FOR NO TEST. SAME COLUMNS. +* CGERU T PUT F FOR NO TEST. SAME COLUMNS. +* CHER T PUT F FOR NO TEST. SAME COLUMNS. +* CHPR T PUT F FOR NO TEST. SAME COLUMNS. +* CHER2 T PUT F FOR NO TEST. SAME COLUMNS. +* CHPR2 T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +* An extended set of Fortran Basic Linear Algebra Subprograms. +* +* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +* and Computer Science Division, Argonne National Laboratory, +* 9700 South Cass Avenue, Argonne, Illinois 60439, US. +* +* Or +* +* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +* +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + INTEGER NIN + PARAMETER ( NIN = 5 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 17 ) + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + REAL RZERO, RHALF, RONE + PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 ) + INTEGER NMAX, INCMAX + PARAMETER ( NMAX = 65, INCMAX = 2 ) + INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX + PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, + $ NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + REAL EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB, + $ NOUT, NTRA + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR + CHARACTER*1 TRANS + CHARACTER*6 SNAMET + CHARACTER*32 SNAPS, SUMMRY +* .. Local Arrays .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( 2*NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*6 SNAMES( NSUBS ) +* .. External Functions .. + REAL SDIFF + LOGICAL LCE + EXTERNAL SDIFF, LCE +* .. External Subroutines .. + EXTERNAL CCHK1, CCHK2, CCHK3, CCHK4, CCHK5, CCHK6, + $ CCHKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'CGEMV ', 'CGBMV ', 'CHEMV ', 'CHBMV ', + $ 'CHPMV ', 'CTRMV ', 'CTBMV ', 'CTPMV ', + $ 'CTRSV ', 'CTBSV ', 'CTPSV ', 'CGERC ', + $ 'CGERU ', 'CHER ', 'CHPR ', 'CHER2 ', + $ 'CHPR2 '/ +* .. Executable Statements .. +* +* Read name and unit number for summary output file and open file. +* + READ( NIN, FMT = * )SUMMRY + READ( NIN, FMT = * )NOUT + OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 230 + END IF + 10 CONTINUE +* Values of K + READ( NIN, FMT = * )NKB + IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN + WRITE( NOUT, FMT = 9997 )'K', NKBMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( KB( I ), I = 1, NKB ) + DO 20 I = 1, NKB + IF( KB( I ).LT.0 )THEN + WRITE( NOUT, FMT = 9995 ) + GO TO 230 + END IF + 20 CONTINUE +* Values of INCX and INCY + READ( NIN, FMT = * )NINC + IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN + WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( INC( I ), I = 1, NINC ) + DO 30 I = 1, NINC + IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN + WRITE( NOUT, FMT = 9994 )INCMAX + GO TO 230 + END IF + 30 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9993 ) + WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB ) + WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC ) + WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9980 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 40 I = 1, NSUBS + LTEST( I ) = .FALSE. + 40 CONTINUE + 50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT + DO 60 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 70 + 60 CONTINUE + WRITE( NOUT, FMT = 9986 )SNAMET + STOP + 70 LTEST( I ) = LTESTT + GO TO 50 +* + 80 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = RONE + 90 CONTINUE + IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO ) + $ GO TO 100 + EPS = RHALF*EPS + GO TO 90 + 100 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of CMVCH using exact data. +* + N = MIN( 32, NMAX ) + DO 120 J = 1, N + DO 110 I = 1, N + A( I, J ) = MAX( I - J + 1, 0 ) + 110 CONTINUE + X( J ) = J + Y( J ) = ZERO + 120 CONTINUE + DO 130 J = 1, N + YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE +* YY holds the exact result. On exit from CMVCH YT holds +* the result computed by CMVCH. + TRANS = 'N' + CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF + TRANS = 'T' + CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 210 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL CCHKE( ISNUM, SNAMES( ISNUM ), NOUT ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 140, 150, 150, 150, 160, 160, + $ 160, 160, 160, 160, 170, 170, 180, + $ 180, 190, 190 )ISNUM +* Test CGEMV, 01, and CGBMV, 02. + 140 CALL CCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G ) + GO TO 200 +* Test CHEMV, 03, CHBMV, 04, and CHPMV, 05. + 150 CALL CCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G ) + GO TO 200 +* Test CTRMV, 06, CTBMV, 07, CTPMV, 08, +* CTRSV, 09, CTBSV, 10, and CTPSV, 11. + 160 CALL CCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z ) + GO TO 200 +* Test CGERC, 12, CGERU, 13. + 170 CALL CCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z ) + GO TO 200 +* Test CHER, 14, and CHPR, 15. + 180 CALL CCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z ) + GO TO 200 +* Test CHER2, 16, and CHPR2, 17. + 190 CALL CCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z ) +* + 200 IF( FATAL.AND.SFATAL ) + $ GO TO 220 + END IF + 210 CONTINUE + WRITE( NOUT, FMT = 9982 ) + GO TO 240 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9981 ) + GO TO 240 +* + 230 CONTINUE + WRITE( NOUT, FMT = 9987 ) +* + 240 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' ) + 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ', + $ I2 ) + 9993 FORMAT( ' TESTS OF THE COMPLEX LEVEL 2 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9992 FORMAT( ' FOR N ', 9I6 ) + 9991 FORMAT( ' FOR K ', 7I6 ) + 9990 FORMAT( ' FOR INCX AND INCY ', 7I6 ) + 9989 FORMAT( ' FOR ALPHA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9988 FORMAT( ' FOR BETA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9986 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9985 FORMAT( ' ERROR IN CMVCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' CMVCH WAS CALLED WITH TRANS = ', A1, + $ ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', / + $ ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.' + $ , /' ******* TESTS ABANDONED *******' ) + 9984 FORMAT( A6, L2 ) + 9983 FORMAT( 1X, A6, ' WAS NOT TESTED' ) + 9982 FORMAT( /' END OF TESTS' ) + 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of CBLAT2. +* + END + SUBROUTINE CCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G ) +* +* Tests CGEMV and CGBMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BLS, TRANSL + REAL ERR, ERRMAX + INTEGER I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA, + $ LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK, + $ NL, NS + LOGICAL BANDED, FULL, NULL, RESET, SAME, TRAN + CHARACTER*1 TRANS, TRANSS + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CGBMV, CGEMV, CMAKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'E' + BANDED = SNAME( 3: 3 ).EQ.'B' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 11 + ELSE IF( BANDED )THEN + NARGS = 13 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IKU = 1, NK + IF( BANDED )THEN + KU = KB( IKU ) + KL = MAX( KU - 1, 0 ) + ELSE + KU = N - 1 + KL = M - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = KL + KU + 1 + ELSE + LDA = M + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX, AA, + $ LDA, KL, KU, RESET, TRANSL ) +* + DO 90 IC = 1, 3 + TRANS = ICH( IC: IC ) + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' +* + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*NL +* +* Generate the vector X. +* + TRANSL = HALF + CALL CMAKE( 'GE', ' ', ' ', 1, NL, X, 1, XX, + $ ABS( INCX ), 0, NL - 1, RESET, TRANSL ) + IF( NL.GT.1 )THEN + X( NL/2 ) = ZERO + XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*ML +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL CMAKE( 'GE', ' ', ' ', 1, ML, Y, 1, + $ YY, ABS( INCY ), 0, ML - 1, + $ RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANSS = TRANS + MS = M + NS = N + KLS = KL + KUS = KU + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ TRANS, M, N, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL CGEMV( TRANS, M, N, ALPHA, AA, + $ LDA, XX, INCX, BETA, YY, + $ INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ TRANS, M, N, KL, KU, ALPHA, LDA, + $ INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CGBMV( TRANS, M, N, KL, KU, ALPHA, + $ AA, LDA, XX, INCX, BETA, + $ YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 130 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = TRANS.EQ.TRANSS + ISAME( 2 ) = MS.EQ.M + ISAME( 3 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LCE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LCE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LCE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LCERES( 'GE', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 4 ) = KLS.EQ.KL + ISAME( 5 ) = KUS.EQ.KU + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LCE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LCE( XS, XX, LX ) + ISAME( 10 ) = INCXS.EQ.INCX + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LCE( YS, YY, LY ) + ELSE + ISAME( 12 ) = LCERES( 'GE', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 13 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 130 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL CMVCH( TRANS, M, N, ALPHA, A, + $ NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 130 + ELSE +* Avoid repeating tests with M.le.0 or +* N.le.0. + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 140 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, TRANS, M, N, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANS, M, N, KL, KU, + $ ALPHA, LDA, INCX, BETA, INCY + END IF +* + 140 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 4( I3, ',' ), '(', + $ F4.1, ',', F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',', + $ F4.1, '), Y,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), '(', + $ F4.1, ',', F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',', + $ F4.1, '), Y,', I2, ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK1. +* + END + SUBROUTINE CCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G ) +* +* Tests CHEMV, CHBMV and CHPMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BLS, TRANSL + REAL ERR, ERRMAX + INTEGER I, IA, IB, IC, IK, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY, + $ N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 UPLO, UPLOS + CHARACTER*2 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CHBMV, CHEMV, CHPMV, CMAKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'E' + BANDED = SNAME( 3: 3 ).EQ.'B' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 10 + ELSE IF( BANDED )THEN + NARGS = 11 + ELSE IF( PACKED )THEN + NARGS = 9 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX, AA, + $ LDA, K, K, RESET, TRANSL ) +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL CMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL CMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + UPLOS = UPLO + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ UPLO, N, ALPHA, LDA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CHEMV( UPLO, N, ALPHA, AA, LDA, XX, + $ INCX, BETA, YY, INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ UPLO, N, K, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL CHBMV( UPLO, N, K, ALPHA, AA, LDA, + $ XX, INCX, BETA, YY, INCY ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ UPLO, N, ALPHA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CHPMV( UPLO, N, ALPHA, AA, XX, INCX, + $ BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LCE( AS, AA, LAA ) + ISAME( 5 ) = LDAS.EQ.LDA + ISAME( 6 ) = LCE( XS, XX, LX ) + ISAME( 7 ) = INCXS.EQ.INCX + ISAME( 8 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LCE( YS, YY, LY ) + ELSE + ISAME( 9 ) = LCERES( 'GE', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 10 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 3 ) = KS.EQ.K + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LCE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LCE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LCE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LCERES( 'GE', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( PACKED )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LCE( AS, AA, LAA ) + ISAME( 5 ) = LCE( XS, XX, LX ) + ISAME( 6 ) = INCXS.EQ.INCX + ISAME( 7 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 8 ) = LCE( YS, YY, LY ) + ELSE + ISAME( 8 ) = LCERES( 'GE', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 9 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL CMVCH( 'N', N, N, ALPHA, A, NMAX, X, + $ INCX, BETA, Y, INCY, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0 + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, LDA, INCX, + $ BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, K, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, N, ALPHA, INCX, + $ BETA, INCY + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',', + $ F4.1, '), AP, X,', I2, ',(', F4.1, ',', F4.1, '), Y,', I2, + $ ') .' ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), '(', + $ F4.1, ',', F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',', + $ F4.1, '), Y,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',', + $ F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',', F4.1, '), ', + $ 'Y,', I2, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK2. +* + END + SUBROUTINE CCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, XT, G, Z ) +* +* Tests CTRMV, CTBMV, CTPMV, CTRSV, CTBSV and CTPSV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ), + $ ONE = ( 1.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XT( NMAX ), XX( NMAX*INCMAX ), Z( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + COMPLEX TRANSL + REAL ERR, ERRMAX + INTEGER I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K, + $ KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS + CHARACTER*2 ICHD, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CMAKE, CMVCH, CTBMV, CTBSV, CTPMV, CTPSV, + $ CTRMV, CTRSV +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'R' + BANDED = SNAME( 3: 3 ).EQ.'B' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 8 + ELSE IF( BANDED )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 7 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* Set up zero vector for CMVCH. + DO 10 I = 1, NMAX + Z( I ) = ZERO + 10 CONTINUE +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) +* + DO 70 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE( SNAME( 2: 3 ), UPLO, DIAG, N, N, A, + $ NMAX, AA, LDA, K, K, RESET, TRANSL ) +* + DO 60 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL CMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, + $ TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + DIAGS = DIAG + NS = N + KS = K + DO 20 I = 1, LAA + AS( I ) = AA( I ) + 20 CONTINUE + LDAS = LDA + DO 30 I = 1, LX + XS( I ) = XX( I ) + 30 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( SNAME( 4: 5 ).EQ.'MV' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CTRMV( UPLO, TRANS, DIAG, N, AA, LDA, + $ XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CTBMV( UPLO, TRANS, DIAG, N, K, AA, + $ LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL CTPMV( UPLO, TRANS, DIAG, N, AA, XX, + $ INCX ) + END IF + ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CTRSV( UPLO, TRANS, DIAG, N, AA, LDA, + $ XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CTBSV( UPLO, TRANS, DIAG, N, K, AA, + $ LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL CTPSV( UPLO, TRANS, DIAG, N, AA, XX, + $ INCX ) + END IF + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = TRANS.EQ.TRANSS + ISAME( 3 ) = DIAG.EQ.DIAGS + ISAME( 4 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 5 ) = LCE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 7 ) = LCE( XS, XX, LX ) + ELSE + ISAME( 7 ) = LCERES( 'GE', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 8 ) = INCXS.EQ.INCX + ELSE IF( BANDED )THEN + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = LCE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 8 ) = LCE( XS, XX, LX ) + ELSE + ISAME( 8 ) = LCERES( 'GE', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 9 ) = INCXS.EQ.INCX + ELSE IF( PACKED )THEN + ISAME( 5 ) = LCE( AS, AA, LAA ) + IF( NULL )THEN + ISAME( 6 ) = LCE( XS, XX, LX ) + ELSE + ISAME( 6 ) = LCERES( 'GE', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 7 ) = INCXS.EQ.INCX + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 4: 5 ).EQ.'MV' )THEN +* +* Check the result. +* + CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, + $ INCX, ZERO, Z, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN +* +* Compute approximation to original vector. +* + DO 50 I = 1, N + Z( I ) = XX( 1 + ( I - 1 )* + $ ABS( INCX ) ) + XX( 1 + ( I - 1 )*ABS( INCX ) ) + $ = X( I ) + 50 CONTINUE + CALL CMVCH( TRANS, N, N, ONE, A, NMAX, Z, + $ INCX, ZERO, X, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .FALSE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0. + GO TO 110 + END IF +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, DIAG, N, LDA, + $ INCX + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, DIAG, N, K, + $ LDA, INCX + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, TRANS, DIAG, N, INCX + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', AP, ', + $ 'X,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), 2( I3, ',' ), + $ ' A,', I3, ', X,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', A,', + $ I3, ', X,', I2, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK3. +* + END + SUBROUTINE CCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z ) +* +* Tests CGERC and CGERU. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ), + $ ONE = ( 1.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, TRANSL + REAL ERR, ERRMAX + INTEGER I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS, + $ NC, ND, NS + LOGICAL CONJ, NULL, RESET, SAME +* .. Local Arrays .. + COMPLEX W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CGERC, CGERU, CMAKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, CONJG, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Executable Statements .. + CONJ = SNAME( 5: 5 ).EQ.'C' +* Define the number of arguments. + NARGS = 9 +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* +* Set LDA to 1 more than minimum value if room. + LDA = M + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 100 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*M +* +* Generate the vector X. +* + TRANSL = HALF + CALL CMAKE( 'GE', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ), + $ 0, M - 1, RESET, TRANSL ) + IF( M.GT.1 )THEN + X( M/2 ) = ZERO + XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO + END IF +* + DO 90 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL CMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX, + $ AA, LDA, M - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N, + $ ALPHA, INCX, INCY, LDA + IF( CONJ )THEN + IF( REWI ) + $ REWIND NTRA + CALL CGERC( M, N, ALPHA, XX, INCX, YY, INCY, AA, + $ LDA ) + ELSE + IF( REWI ) + $ REWIND NTRA + CALL CGERU( M, N, ALPHA, XX, INCX, YY, INCY, AA, + $ LDA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 140 + END IF +* +* See what data changed inside subroutine. +* + ISAME( 1 ) = MS.EQ.M + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LCE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LCE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LCE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LCERES( 'GE', ' ', M, N, AS, AA, + $ LDA ) + END IF + ISAME( 9 ) = LDAS.EQ.LDA +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 140 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, M + Z( I ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, M + Z( I ) = X( M - I + 1 ) + 60 CONTINUE + END IF + DO 70 J = 1, N + IF( INCY.GT.0 )THEN + W( 1 ) = Y( J ) + ELSE + W( 1 ) = Y( N - J + 1 ) + END IF + IF( CONJ ) + $ W( 1 ) = CONJG( W( 1 ) ) + CALL CMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1, + $ ONE, A( 1, J ), 1, YT, G, + $ AA( 1 + ( J - 1 )*LDA ), EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 130 + 70 CONTINUE + ELSE +* Avoid repeating tests with M.le.0 or N.le.0. + GO TO 110 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 150 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 140 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA +* + 150 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( I3, ',' ), '(', F4.1, ',', F4.1, + $ '), X,', I2, ', Y,', I2, ', A,', I3, ') ', + $ ' .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK4. +* + END + SUBROUTINE CCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z ) +* +* Tests CHER and CHPR. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ), + $ ONE = ( 1.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX ALPHA, TRANSL + REAL ERR, ERRMAX, RALPHA, RALS + INTEGER I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA, + $ LDA, LDAS, LJ, LX, N, NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*2 ICH +* .. Local Arrays .. + COMPLEX W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CHER, CHPR, CMAKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, CMPLX, CONJG, MAX, REAL +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'E' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 7 + ELSE IF( PACKED )THEN + NARGS = 6 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + UPPER = UPLO.EQ.'U' +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL CMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IA = 1, NALF + RALPHA = REAL( ALF( IA ) ) + ALPHA = CMPLX( RALPHA, RZERO ) + NULL = N.LE.0.OR.RALPHA.EQ.RZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX, + $ AA, LDA, N - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + RALS = RALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N, + $ RALPHA, INCX, LDA + IF( REWI ) + $ REWIND NTRA + CALL CHER( UPLO, N, RALPHA, XX, INCX, AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N, + $ RALPHA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CHPR( UPLO, N, RALPHA, XX, INCX, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = RALS.EQ.RALPHA + ISAME( 4 ) = LCE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + IF( NULL )THEN + ISAME( 6 ) = LCE( AS, AA, LAA ) + ELSE + ISAME( 6 ) = LCERES( SNAME( 2: 3 ), UPLO, N, N, AS, + $ AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 7 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 40 I = 1, N + Z( I ) = X( I ) + 40 CONTINUE + ELSE + DO 50 I = 1, N + Z( I ) = X( N - I + 1 ) + 50 CONTINUE + END IF + JA = 1 + DO 60 J = 1, N + W( 1 ) = CONJG( Z( J ) ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL CMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W, + $ 1, ONE, A( JJ, J ), 1, YT, G, + $ AA( JA ), EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 110 + 60 CONTINUE + ELSE +* Avoid repeating tests if N.le.0. + IF( N.LE.0 ) + $ GO TO 100 + END IF +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, RALPHA, INCX, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, RALPHA, INCX + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', + $ I2, ', AP) .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', + $ I2, ', A,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK5. +* + END + SUBROUTINE CCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z ) +* +* Tests CHER2 and CHPR2. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ), + $ ONE = ( 1.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX, 2 ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, TRANSL + REAL ERR, ERRMAX + INTEGER I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N, + $ NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*2 ICH +* .. Local Arrays .. + COMPLEX W( 2 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CHER2, CHPR2, CMAKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, CONJG, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'E' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 8 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 140 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 140 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 130 IC = 1, 2 + UPLO = ICH( IC: IC ) + UPPER = UPLO.EQ.'U' +* + DO 120 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL CMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 110 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL CMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 100 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, + $ NMAX, AA, LDA, N - 1, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL CHER2( UPLO, N, ALPHA, XX, INCX, YY, INCY, + $ AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N, + $ ALPHA, INCX, INCY + IF( REWI ) + $ REWIND NTRA + CALL CHPR2( UPLO, N, ALPHA, XX, INCX, YY, INCY, + $ AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 160 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LCE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LCE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LCE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LCERES( SNAME( 2: 3 ), UPLO, N, N, + $ AS, AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 9 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 160 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, N + Z( I, 1 ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, N + Z( I, 1 ) = X( N - I + 1 ) + 60 CONTINUE + END IF + IF( INCY.GT.0 )THEN + DO 70 I = 1, N + Z( I, 2 ) = Y( I ) + 70 CONTINUE + ELSE + DO 80 I = 1, N + Z( I, 2 ) = Y( N - I + 1 ) + 80 CONTINUE + END IF + JA = 1 + DO 90 J = 1, N + W( 1 ) = ALPHA*CONJG( Z( J, 2 ) ) + W( 2 ) = CONJG( ALPHA )*CONJG( Z( J, 1 ) ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL CMVCH( 'N', LJ, 2, ONE, Z( JJ, 1 ), + $ NMAX, W, 1, ONE, A( JJ, J ), 1, + $ YT, G, AA( JA ), EPS, ERR, FATAL, + $ NOUT, .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 150 + 90 CONTINUE + ELSE +* Avoid repeating tests with N.le.0. + IF( N.LE.0 ) + $ GO TO 140 + END IF +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 170 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 160 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, INCX, + $ INCY, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, ALPHA, INCX, INCY + END IF +* + 170 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',', + $ F4.1, '), X,', I2, ', Y,', I2, ', AP) ', + $ ' .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',', + $ F4.1, '), X,', I2, ', Y,', I2, ', A,', I3, ') ', + $ ' .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK6. +* + END + SUBROUTINE CCHKE( ISNUM, SRNAMT, NOUT ) +* +* Tests the error exits from the Level 2 Blas. +* Requires a special version of the error-handling routine XERBLA. +* ALPHA, RALPHA, BETA, A, X and Y should not need to be defined. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER ISNUM, NOUT + CHARACTER*6 SRNAMT +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Local Scalars .. + COMPLEX ALPHA, BETA + REAL RALPHA +* .. Local Arrays .. + COMPLEX A( 1, 1 ), X( 1 ), Y( 1 ) +* .. External Subroutines .. + EXTERNAL CGBMV, CGEMV, CGERC, CGERU, CHBMV, CHEMV, CHER, + $ CHER2, CHKXER, CHPMV, CHPR, CHPR2, CTBMV, + $ CTBSV, CTPMV, CTPSV, CTRMV, CTRSV +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Executable Statements .. +* OK is set to .FALSE. by the special version of XERBLA or by CHKXER +* if anything is wrong. + OK = .TRUE. +* LERR is set to .TRUE. by the special version of XERBLA each time +* it is called, and is then tested and re-set by CHKXER. + LERR = .FALSE. + GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, + $ 90, 100, 110, 120, 130, 140, 150, 160, + $ 170 )ISNUM + 10 INFOT = 1 + CALL CGEMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGEMV( 'N', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMV( 'N', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CGEMV( 'N', 2, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMV( 'N', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CGEMV( 'N', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 20 INFOT = 1 + CALL CGBMV( '/', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGBMV( 'N', -1, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGBMV( 'N', 0, -1, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGBMV( 'N', 0, 0, -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGBMV( 'N', 2, 0, 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGBMV( 'N', 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL CGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 30 INFOT = 1 + CALL CHEMV( '/', 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CHEMV( 'U', -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CHEMV( 'U', 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHEMV( 'U', 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CHEMV( 'U', 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 40 INFOT = 1 + CALL CHBMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CHBMV( 'U', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHBMV( 'U', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CHBMV( 'U', 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CHBMV( 'U', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CHBMV( 'U', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 50 INFOT = 1 + CALL CHPMV( '/', 0, ALPHA, A, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CHPMV( 'U', -1, ALPHA, A, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CHPMV( 'U', 0, ALPHA, A, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CHPMV( 'U', 0, ALPHA, A, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 60 INFOT = 1 + CALL CTRMV( '/', 'N', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CTRMV( 'U', '/', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CTRMV( 'U', 'N', '/', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CTRMV( 'U', 'N', 'N', -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMV( 'U', 'N', 'N', 2, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CTRMV( 'U', 'N', 'N', 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 70 INFOT = 1 + CALL CTBMV( '/', 'N', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CTBMV( 'U', '/', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CTBMV( 'U', 'N', '/', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CTBMV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTBMV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CTBMV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTBMV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 80 INFOT = 1 + CALL CTPMV( '/', 'N', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CTPMV( 'U', '/', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CTPMV( 'U', 'N', '/', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CTPMV( 'U', 'N', 'N', -1, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CTPMV( 'U', 'N', 'N', 0, A, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 90 INFOT = 1 + CALL CTRSV( '/', 'N', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CTRSV( 'U', '/', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CTRSV( 'U', 'N', '/', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CTRSV( 'U', 'N', 'N', -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSV( 'U', 'N', 'N', 2, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CTRSV( 'U', 'N', 'N', 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 100 INFOT = 1 + CALL CTBSV( '/', 'N', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CTBSV( 'U', '/', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CTBSV( 'U', 'N', '/', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CTBSV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTBSV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CTBSV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTBSV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 110 INFOT = 1 + CALL CTPSV( '/', 'N', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CTPSV( 'U', '/', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CTPSV( 'U', 'N', '/', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CTPSV( 'U', 'N', 'N', -1, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CTPSV( 'U', 'N', 'N', 0, A, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 120 INFOT = 1 + CALL CGERC( -1, 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGERC( 0, -1, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGERC( 0, 0, ALPHA, X, 0, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CGERC( 0, 0, ALPHA, X, 1, Y, 0, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CGERC( 2, 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 130 INFOT = 1 + CALL CGERU( -1, 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGERU( 0, -1, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGERU( 0, 0, ALPHA, X, 0, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CGERU( 0, 0, ALPHA, X, 1, Y, 0, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CGERU( 2, 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 140 INFOT = 1 + CALL CHER( '/', 0, RALPHA, X, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CHER( 'U', -1, RALPHA, X, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CHER( 'U', 0, RALPHA, X, 0, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHER( 'U', 2, RALPHA, X, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 150 INFOT = 1 + CALL CHPR( '/', 0, RALPHA, X, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CHPR( 'U', -1, RALPHA, X, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CHPR( 'U', 0, RALPHA, X, 0, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 160 INFOT = 1 + CALL CHER2( '/', 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CHER2( 'U', -1, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CHER2( 'U', 0, ALPHA, X, 0, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHER2( 'U', 0, ALPHA, X, 1, Y, 0, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CHER2( 'U', 2, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 170 INFOT = 1 + CALL CHPR2( '/', 0, ALPHA, X, 1, Y, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CHPR2( 'U', -1, ALPHA, X, 1, Y, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CHPR2( 'U', 0, ALPHA, X, 0, Y, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHPR2( 'U', 0, ALPHA, X, 1, Y, 0, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* + 180 IF( OK )THEN + WRITE( NOUT, FMT = 9999 )SRNAMT + ELSE + WRITE( NOUT, FMT = 9998 )SRNAMT + END IF + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' ) + 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****', + $ '**' ) +* +* End of CCHKE. +* + END + SUBROUTINE CMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL, + $ KU, RESET, TRANSL ) +* +* Generates values for an M by N matrix A within the bandwidth +* defined by KL and KU. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'GE', 'GB', 'HE', 'HB', 'HP', 'TR', 'TB' OR 'TP'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + COMPLEX ROGUE + PARAMETER ( ROGUE = ( -1.0E10, 1.0E10 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) + REAL RROGUE + PARAMETER ( RROGUE = -1.0E10 ) +* .. Scalar Arguments .. + COMPLEX TRANSL + INTEGER KL, KU, LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, I1, I2, I3, IBEG, IEND, IOFF, J, JJ, KK + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + COMPLEX CBEG + EXTERNAL CBEG +* .. Intrinsic Functions .. + INTRINSIC CMPLX, CONJG, MAX, MIN, REAL +* .. Executable Statements .. + GEN = TYPE( 1: 1 ).EQ.'G' + SYM = TYPE( 1: 1 ).EQ.'H' + TRI = TYPE( 1: 1 ).EQ.'T' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + IF( ( I.LE.J.AND.J - I.LE.KU ).OR. + $ ( I.GE.J.AND.I - J.LE.KL ) )THEN + A( I, J ) = CBEG( RESET ) + TRANSL + ELSE + A( I, J ) = ZERO + END IF + IF( I.NE.J )THEN + IF( SYM )THEN + A( J, I ) = CONJG( A( I, J ) ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( SYM ) + $ A( J, J ) = CMPLX( REAL( A( J, J ) ), RZERO ) + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'GE' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'GB' )THEN + DO 90 J = 1, N + DO 60 I1 = 1, KU + 1 - J + AA( I1 + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J ) + AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J ) + 70 CONTINUE + DO 80 I3 = I2, LDA + AA( I3 + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'TR' )THEN + DO 130 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 100 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 100 CONTINUE + DO 110 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 110 CONTINUE + DO 120 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 120 CONTINUE + IF( SYM )THEN + JJ = J + ( J - 1 )*LDA + AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE ) + END IF + 130 CONTINUE + ELSE IF( TYPE.EQ.'HB'.OR.TYPE.EQ.'TB' )THEN + DO 170 J = 1, N + IF( UPPER )THEN + KK = KL + 1 + IBEG = MAX( 1, KL + 2 - J ) + IF( UNIT )THEN + IEND = KL + ELSE + IEND = KL + 1 + END IF + ELSE + KK = 1 + IF( UNIT )THEN + IBEG = 2 + ELSE + IBEG = 1 + END IF + IEND = MIN( KL + 1, 1 + M - J ) + END IF + DO 140 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 140 CONTINUE + DO 150 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J ) + 150 CONTINUE + DO 160 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 160 CONTINUE + IF( SYM )THEN + JJ = KK + ( J - 1 )*LDA + AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE ) + END IF + 170 CONTINUE + ELSE IF( TYPE.EQ.'HP'.OR.TYPE.EQ.'TP' )THEN + IOFF = 0 + DO 190 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 180 I = IBEG, IEND + IOFF = IOFF + 1 + AA( IOFF ) = A( I, J ) + IF( I.EQ.J )THEN + IF( UNIT ) + $ AA( IOFF ) = ROGUE + IF( SYM ) + $ AA( IOFF ) = CMPLX( REAL( AA( IOFF ) ), RROGUE ) + END IF + 180 CONTINUE + 190 CONTINUE + END IF + RETURN +* +* End of CMAKE. +* + END + SUBROUTINE CMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RZERO, RONE + PARAMETER ( RZERO = 0.0, RONE = 1.0 ) +* .. Scalar Arguments .. + COMPLEX ALPHA, BETA + REAL EPS, ERR + INTEGER INCX, INCY, M, N, NMAX, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANS +* .. Array Arguments .. + COMPLEX A( NMAX, * ), X( * ), Y( * ), YT( * ), YY( * ) + REAL G( * ) +* .. Local Scalars .. + COMPLEX C + REAL ERRI + INTEGER I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL + LOGICAL CTRAN, TRAN +* .. Intrinsic Functions .. + INTRINSIC ABS, AIMAG, CONJG, MAX, REAL, SQRT +* .. Statement Functions .. + REAL ABS1 +* .. Statement Function definitions .. + ABS1( C ) = ABS( REAL( C ) ) + ABS( AIMAG( C ) ) +* .. Executable Statements .. + TRAN = TRANS.EQ.'T' + CTRAN = TRANS.EQ.'C' + IF( TRAN.OR.CTRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF + IF( INCX.LT.0 )THEN + KX = NL + INCXL = -1 + ELSE + KX = 1 + INCXL = 1 + END IF + IF( INCY.LT.0 )THEN + KY = ML + INCYL = -1 + ELSE + KY = 1 + INCYL = 1 + END IF +* +* Compute expected result in YT using data in A, X and Y. +* Compute gauges in G. +* + IY = KY + DO 40 I = 1, ML + YT( IY ) = ZERO + G( IY ) = RZERO + JX = KX + IF( TRAN )THEN + DO 10 J = 1, NL + YT( IY ) = YT( IY ) + A( J, I )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 10 CONTINUE + ELSE IF( CTRAN )THEN + DO 20 J = 1, NL + YT( IY ) = YT( IY ) + CONJG( A( J, I ) )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 20 CONTINUE + ELSE + DO 30 J = 1, NL + YT( IY ) = YT( IY ) + A( I, J )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( I, J ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 30 CONTINUE + END IF + YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY ) + G( IY ) = ABS1( ALPHA )*G( IY ) + ABS1( BETA )*ABS1( Y( IY ) ) + IY = IY + INCYL + 40 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 50 I = 1, ML + ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS + IF( G( I ).NE.RZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.RONE ) + $ GO TO 60 + 50 CONTINUE +* If the loop completes, all results are at least half accurate. + GO TO 80 +* +* Report fatal error. +* + 60 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 70 I = 1, ML + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, YT( I ), + $ YY( 1 + ( I - 1 )*ABS( INCY ) ) + ELSE + WRITE( NOUT, FMT = 9998 )I, + $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT( I ) + END IF + 70 CONTINUE +* + 80 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RE', + $ 'SULT COMPUTED RESULT' ) + 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) +* +* End of CMVCH. +* + END + LOGICAL FUNCTION LCE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + COMPLEX RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LCE = .TRUE. + GO TO 30 + 20 CONTINUE + LCE = .FALSE. + 30 RETURN +* +* End of LCE. +* + END + LOGICAL FUNCTION LCERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'GE', 'HE' or 'HP'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'GE' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'HE' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LCERES = .TRUE. + GO TO 80 + 70 CONTINUE + LCERES = .FALSE. + 80 RETURN +* +* End of LCERES. +* + END + COMPLEX FUNCTION CBEG( RESET ) +* +* Generates complex numbers as pairs of random numbers uniformly +* distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, J, MI, MJ +* .. Save statement .. + SAVE I, IC, J, MI, MJ +* .. Intrinsic Functions .. + INTRINSIC CMPLX +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + MJ = 457 + I = 7 + J = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I or J is bounded between 1 and 999. +* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. +* If initial I or J = 4 or 8, the period will be 25. +* If initial I or J = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I or J +* in 6. +* + IC = IC + 1 + 10 I = I*MI + J = J*MJ + I = I - 1000*( I/1000 ) + J = J - 1000*( J/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + CBEG = CMPLX( ( I - 500 )/1001.0, ( J - 500 )/1001.0 ) + RETURN +* +* End of CBEG. +* + END + REAL FUNCTION SDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* +* .. Scalar Arguments .. + REAL X, Y +* .. Executable Statements .. + SDIFF = X - Y + RETURN +* +* End of SDIFF. +* + END + SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* +* Tests whether XERBLA has detected an error when it should. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Executable Statements .. + IF( .NOT.LERR )THEN + WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT + OK = .FALSE. + END IF + LERR = .FALSE. + RETURN +* + 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', + $ 'ETECTED BY ', A6, ' *****' ) +* +* End of CHKXER. +* + END + SUBROUTINE XERBLA( SRNAME, INFO ) +* +* This is a special version of XERBLA to be used only as part of +* the test program for testing error exits from the Level 2 BLAS +* routines. +* +* XERBLA is an error handler for the Level 2 BLAS routines. +* +* It is called by the Level 2 BLAS routines if an input parameter is +* invalid. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER INFO + CHARACTER*6 SRNAME +* .. Scalars in Common .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUT, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Executable Statements .. + LERR = .TRUE. + IF( INFO.NE.INFOT )THEN + IF( INFOT.NE.0 )THEN + WRITE( NOUT, FMT = 9999 )INFO, INFOT + ELSE + WRITE( NOUT, FMT = 9997 )INFO + END IF + OK = .FALSE. + END IF + IF( SRNAME.NE.SRNAMT )THEN + WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT + OK = .FALSE. + END IF + RETURN +* + 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD', + $ ' OF ', I2, ' *******' ) + 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE', + $ 'AD OF ', A6, ' *******' ) + 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, + $ ' *******' ) +* +* End of XERBLA +* + END + diff --git a/test/cblat3.dat b/test/cblat3.dat new file mode 100644 index 0000000..72c00b9 --- /dev/null +++ b/test/cblat3.dat @@ -0,0 +1,23 @@ +'CBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE +6 UNIT NUMBER OF SUMMARY FILE +'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +F LOGICAL FLAG, T TO TEST ERROR EXITS. +16.0 THRESHOLD VALUE OF TEST RATIO +6 NUMBER OF VALUES OF N +0 1 2 3 7 31 63 VALUES OF N +3 NUMBER OF VALUES OF ALPHA +(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +CGEMM T PUT F FOR NO TEST. SAME COLUMNS. +CHEMM T PUT F FOR NO TEST. SAME COLUMNS. +CSYMM T PUT F FOR NO TEST. SAME COLUMNS. +CTRMM T PUT F FOR NO TEST. SAME COLUMNS. +CTRSM T PUT F FOR NO TEST. SAME COLUMNS. +CHERK T PUT F FOR NO TEST. SAME COLUMNS. +CSYRK T PUT F FOR NO TEST. SAME COLUMNS. +CHER2K T PUT F FOR NO TEST. SAME COLUMNS. +CSYR2K T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/test/cblat3.f b/test/cblat3.f new file mode 100644 index 0000000..b26be91 --- /dev/null +++ b/test/cblat3.f @@ -0,0 +1,3439 @@ + PROGRAM CBLAT3 +* +* Test program for the COMPLEX Level 3 Blas. +* +* The program must be driven by a short data file. The first 14 records +* of the file are read using list-directed input, the last 9 records +* are read using the format ( A6, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 23 lines: +* 'CBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE +* 6 UNIT NUMBER OF SUMMARY FILE +* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 3 NUMBER OF VALUES OF ALPHA +* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +* CGEMM T PUT F FOR NO TEST. SAME COLUMNS. +* CHEMM T PUT F FOR NO TEST. SAME COLUMNS. +* CSYMM T PUT F FOR NO TEST. SAME COLUMNS. +* CTRMM T PUT F FOR NO TEST. SAME COLUMNS. +* CTRSM T PUT F FOR NO TEST. SAME COLUMNS. +* CHERK T PUT F FOR NO TEST. SAME COLUMNS. +* CSYRK T PUT F FOR NO TEST. SAME COLUMNS. +* CHER2K T PUT F FOR NO TEST. SAME COLUMNS. +* CSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +* A Set of Level 3 Basic Linear Algebra Subprograms. +* +* Technical Memorandum No.88 (Revision 1), Mathematics and +* Computer Science Division, Argonne National Laboratory, 9700 +* South Cass Avenue, Argonne, Illinois 60439, US. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + INTEGER NIN + PARAMETER ( NIN = 5 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 9 ) + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + REAL RZERO, RHALF, RONE + PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 ) + INTEGER NMAX + PARAMETER ( NMAX = 65 ) + INTEGER NIDMAX, NALMAX, NBEMAX + PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + REAL EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NOUT, NTRA + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR + CHARACTER*1 TRANSA, TRANSB + CHARACTER*6 SNAMET + CHARACTER*32 SNAPS, SUMMRY +* .. Local Arrays .. + COMPLEX AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), + $ BB( NMAX*NMAX ), BET( NBEMAX ), + $ BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ W( 2*NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*6 SNAMES( NSUBS ) +* .. External Functions .. + REAL SDIFF + LOGICAL LCE + EXTERNAL SDIFF, LCE +* .. External Subroutines .. + EXTERNAL CCHK1, CCHK2, CCHK3, CCHK4, CCHK5, CCHKE, CMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'CGEMM ', 'CHEMM ', 'CSYMM ', 'CTRMM ', + $ 'CTRSM ', 'CHERK ', 'CSYRK ', 'CHER2K', + $ 'CSYR2K'/ +* .. Executable Statements .. +* +* Read name and unit number for summary output file and open file. +* + READ( NIN, FMT = * )SUMMRY + READ( NIN, FMT = * )NOUT + OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 220 + END IF + 10 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9995 ) + WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9984 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 20 I = 1, NSUBS + LTEST( I ) = .FALSE. + 20 CONTINUE + 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT + DO 40 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 50 + 40 CONTINUE + WRITE( NOUT, FMT = 9990 )SNAMET + STOP + 50 LTEST( I ) = LTESTT + GO TO 30 +* + 60 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = RONE + 70 CONTINUE + IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO ) + $ GO TO 80 + EPS = RHALF*EPS + GO TO 70 + 80 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of CMMCH using exact data. +* + N = MIN( 32, NMAX ) + DO 100 J = 1, N + DO 90 I = 1, N + AB( I, J ) = MAX( I - J + 1, 0 ) + 90 CONTINUE + AB( J, NMAX + 1 ) = J + AB( 1, NMAX + J ) = J + C( J, 1 ) = ZERO + 100 CONTINUE + DO 110 J = 1, N + CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 110 CONTINUE +* CC holds the exact result. On exit from CMMCH CT holds +* the result computed by CMMCH. + TRANSA = 'N' + TRANSB = 'N' + CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'C' + CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + DO 120 J = 1, N + AB( J, NMAX + 1 ) = N - J + 1 + AB( 1, NMAX + J ) = N - J + 1 + 120 CONTINUE + DO 130 J = 1, N + CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - + $ ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE + TRANSA = 'C' + TRANSB = 'N' + CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'C' + CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 200 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL CCHKE( ISNUM, SNAMES( ISNUM ), NOUT ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 150, 150, 160, 160, 170, 170, + $ 180, 180 )ISNUM +* Test CGEMM, 01. + 140 CALL CCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test CHEMM, 02, CSYMM, 03. + 150 CALL CCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test CTRMM, 04, CTRSM, 05. + 160 CALL CCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C ) + GO TO 190 +* Test CHERK, 06, CSYRK, 07. + 170 CALL CCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test CHER2K, 08, CSYR2K, 09. + 180 CALL CCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) + GO TO 190 +* + 190 IF( FATAL.AND.SFATAL ) + $ GO TO 210 + END IF + 200 CONTINUE + WRITE( NOUT, FMT = 9986 ) + GO TO 230 +* + 210 CONTINUE + WRITE( NOUT, FMT = 9985 ) + GO TO 230 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9991 ) +* + 230 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' TESTS OF THE COMPLEX LEVEL 3 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9994 FORMAT( ' FOR N ', 9I6 ) + 9993 FORMAT( ' FOR ALPHA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9992 FORMAT( ' FOR BETA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9990 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9989 FORMAT( ' ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' CMMCH WAS CALLED WITH TRANSA = ', A1, + $ ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', + $ 'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', + $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', + $ '*******' ) + 9988 FORMAT( A6, L2 ) + 9987 FORMAT( 1X, A6, ' WAS NOT TESTED' ) + 9986 FORMAT( /' END OF TESTS' ) + 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of CBLAT3. +* + END + SUBROUTINE CCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests CGEMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BLS + REAL ERR, ERRMAX + INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, + $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, + $ MA, MB, MS, N, NA, NARGS, NB, NC, NS + LOGICAL NULL, RESET, SAME, TRANA, TRANB + CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CGEMM, CMAKE, CMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. +* + NARGS = 13 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 110 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICA = 1, 3 + TRANSA = ICH( ICA: ICA ) + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' +* + IF( TRANA )THEN + MA = K + NA = M + ELSE + MA = M + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL CMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICB = 1, 3 + TRANSB = ICH( ICB: ICB ) + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* + IF( TRANB )THEN + MB = N + NB = K + ELSE + MB = K + NB = N + END IF +* Set LDB to 1 more than minimum value if room. + LDB = MB + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 70 + LBB = LDB*NB +* +* Generate the matrix B. +* + CALL CMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB, + $ LDB, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL CMAKE( 'GE', ' ', ' ', M, N, C, NMAX, + $ CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANAS = TRANSA + TRANBS = TRANSB + MS = M + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, LDB, + $ BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL CGEMM( TRANSA, TRANSB, M, N, K, ALPHA, + $ AA, LDA, BB, LDB, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = TRANSA.EQ.TRANAS + ISAME( 2 ) = TRANSB.EQ.TRANBS + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LCE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LCE( BS, BB, LBB ) + ISAME( 10 ) = LDBS.EQ.LDB + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LCE( CS, CC, LCC ) + ELSE + ISAME( 12 ) = LCERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 13 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL CMMCH( TRANSA, TRANSB, M, N, K, + $ ALPHA, A, NMAX, B, NMAX, BETA, + $ C, NMAX, CT, G, CC, LDC, EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANSA, TRANSB, M, N, K, + $ ALPHA, LDA, LDB, BETA, LDC +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',''', A1, ''',', + $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, + $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK1. +* + END + SUBROUTINE CCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests CHEMM and CSYMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BLS + REAL ERR, ERRMAX + INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, + $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, + $ NARGS, NC, NS + LOGICAL CONJ, LEFT, NULL, RESET, SAME + CHARACTER*1 SIDE, SIDES, UPLO, UPLOS + CHARACTER*2 ICHS, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CHEMM, CMAKE, CMMCH, CSYMM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHS/'LR'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 2: 3 ).EQ.'HE' +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 90 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 90 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 90 + LBB = LDB*N +* +* Generate the matrix B. +* + CALL CMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, + $ ZERO ) +* + DO 80 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' +* + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* +* Generate the hermitian or symmetric matrix A. +* + CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', NA, NA, A, NMAX, + $ AA, LDA, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL CMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, SIDE, + $ UPLO, M, N, ALPHA, LDA, LDB, BETA, LDC + IF( REWI ) + $ REWIND NTRA + IF( CONJ )THEN + CALL CHEMM( SIDE, UPLO, M, N, ALPHA, AA, LDA, + $ BB, LDB, BETA, CC, LDC ) + ELSE + CALL CSYMM( SIDE, UPLO, M, N, ALPHA, AA, LDA, + $ BB, LDB, BETA, CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 110 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LCE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LCE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LCE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LCERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 110 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL CMMCH( 'N', 'N', M, N, M, ALPHA, A, + $ NMAX, B, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL CMMCH( 'N', 'N', M, N, N, ALPHA, B, + $ NMAX, A, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 120 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, M, N, ALPHA, LDA, + $ LDB, BETA, LDC +* + 120 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, + $ ',', F4.1, '), C,', I3, ') .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK2. +* + END + SUBROUTINE CCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, + $ B, BB, BS, CT, G, C ) +* +* Tests CTRMM and CTRSM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CT( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS + REAL ERR, ERRMAX + INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, + $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, + $ NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, + $ UPLOS + CHARACTER*2 ICHD, ICHS, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CMAKE, CMMCH, CTRMM, CTRSM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ +* .. Executable Statements .. +* + NARGS = 11 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* Set up zero matrix for CMMCH. + DO 20 J = 1, NMAX + DO 10 I = 1, NMAX + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE +* + DO 140 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 130 + LBB = LDB*N + NULL = M.LE.0.OR.N.LE.0 +* + DO 120 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 130 + LAA = LDA*NA +* + DO 110 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 100 ICT = 1, 3 + TRANSA = ICHT( ICT: ICT ) +* + DO 90 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + CALL CMAKE( 'TR', UPLO, DIAG, NA, NA, A, + $ NMAX, AA, LDA, RESET, ZERO ) +* +* Generate the matrix B. +* + CALL CMAKE( 'GE', ' ', ' ', M, N, B, NMAX, + $ BB, LDB, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + TRANAS = TRANSA + DIAGS = DIAG + MS = M + NS = N + ALS = ALPHA + DO 30 I = 1, LAA + AS( I ) = AA( I ) + 30 CONTINUE + LDAS = LDA + DO 40 I = 1, LBB + BS( I ) = BB( I ) + 40 CONTINUE + LDBS = LDB +* +* Call the subroutine. +* + IF( SNAME( 4: 5 ).EQ.'MM' )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB + IF( REWI ) + $ REWIND NTRA + CALL CTRMM( SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, AA, LDA, BB, LDB ) + ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB + IF( REWI ) + $ REWIND NTRA + CALL CTRSM( SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, AA, LDA, BB, LDB ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = TRANAS.EQ.TRANSA + ISAME( 4 ) = DIAGS.EQ.DIAG + ISAME( 5 ) = MS.EQ.M + ISAME( 6 ) = NS.EQ.N + ISAME( 7 ) = ALS.EQ.ALPHA + ISAME( 8 ) = LCE( AS, AA, LAA ) + ISAME( 9 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 10 ) = LCE( BS, BB, LBB ) + ELSE + ISAME( 10 ) = LCERES( 'GE', ' ', M, N, BS, + $ BB, LDB ) + END IF + ISAME( 11 ) = LDBS.EQ.LDB +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 50 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 50 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 4: 5 ).EQ.'MM' )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL CMMCH( TRANSA, 'N', M, N, M, + $ ALPHA, A, NMAX, B, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL CMMCH( 'N', TRANSA, M, N, N, + $ ALPHA, B, NMAX, A, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN +* +* Compute approximation to original +* matrix. +* + DO 70 J = 1, N + DO 60 I = 1, M + C( I, J ) = BB( I + ( J - 1 )* + $ LDB ) + BB( I + ( J - 1 )*LDB ) = ALPHA* + $ B( I, J ) + 60 CONTINUE + 70 CONTINUE +* + IF( LEFT )THEN + CALL CMMCH( TRANSA, 'N', M, N, M, + $ ONE, A, NMAX, C, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + ELSE + CALL CMMCH( 'N', TRANSA, M, N, N, + $ ONE, C, NMAX, A, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + END IF + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 150 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, LDA, LDB +* + 160 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(', 4( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', + $ ' .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK3. +* + END + SUBROUTINE CCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests CHERK and CSYRK. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RONE, RZERO + PARAMETER ( RONE = 1.0, RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BETS + REAL ERR, ERRMAX, RALPHA, RALS, RBETA, RBETS + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, + $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, + $ NARGS, NC, NS + LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS + CHARACTER*2 ICHT, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CHERK, CMAKE, CMMCH, CSYRK +* .. Intrinsic Functions .. + INTRINSIC CMPLX, MAX, REAL +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NC'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 2: 3 ).EQ.'HE' +* + NARGS = 10 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICT = 1, 2 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'C' + IF( TRAN.AND..NOT.CONJ ) + $ TRANS = 'T' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL CMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) + IF( CONJ )THEN + RALPHA = REAL( ALPHA ) + ALPHA = CMPLX( RALPHA, RZERO ) + END IF +* + DO 50 IB = 1, NBET + BETA = BET( IB ) + IF( CONJ )THEN + RBETA = REAL( BETA ) + BETA = CMPLX( RBETA, RZERO ) + END IF + NULL = N.LE.0 + IF( CONJ ) + $ NULL = NULL.OR.( ( K.LE.0.OR.RALPHA.EQ. + $ RZERO ).AND.RBETA.EQ.RONE ) +* +* Generate the matrix C. +* + CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, C, + $ NMAX, CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + IF( CONJ )THEN + RALS = RALPHA + ELSE + ALS = ALPHA + END IF + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + IF( CONJ )THEN + RBETS = RBETA + ELSE + BETS = BETA + END IF + DO 20 I = 1, LCC + CS( I ) = CC( I ) + 20 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( CONJ )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, + $ TRANS, N, K, RALPHA, LDA, RBETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL CHERK( UPLO, TRANS, N, K, RALPHA, AA, + $ LDA, RBETA, CC, LDC ) + ELSE + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, + $ TRANS, N, K, ALPHA, LDA, BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL CSYRK( UPLO, TRANS, N, K, ALPHA, AA, + $ LDA, BETA, CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + IF( CONJ )THEN + ISAME( 5 ) = RALS.EQ.RALPHA + ELSE + ISAME( 5 ) = ALS.EQ.ALPHA + END IF + ISAME( 6 ) = LCE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( CONJ )THEN + ISAME( 8 ) = RBETS.EQ.RBETA + ELSE + ISAME( 8 ) = BETS.EQ.BETA + END IF + IF( NULL )THEN + ISAME( 9 ) = LCE( CS, CC, LCC ) + ELSE + ISAME( 9 ) = LCERES( SNAME( 2: 3 ), UPLO, N, + $ N, CS, CC, LDC ) + END IF + ISAME( 10 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( CONJ )THEN + TRANST = 'C' + ELSE + TRANST = 'T' + END IF + JC = 1 + DO 40 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + CALL CMMCH( TRANST, 'N', LJ, 1, K, + $ ALPHA, A( 1, JJ ), NMAX, + $ A( 1, J ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL CMMCH( 'N', TRANST, LJ, 1, K, + $ ALPHA, A( JJ, 1 ), NMAX, + $ A( J, 1 ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + 40 CONTINUE + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( CONJ )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, RALPHA, + $ LDA, RBETA, LDC + ELSE + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, + $ LDA, BETA, LDC + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, + $ '), C,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK4. +* + END + SUBROUTINE CCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) +* +* Tests CHER2K and CSYR2K. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + REAL RONE, RZERO + PARAMETER ( RONE = 1.0, RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), + $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), + $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ W( 2*NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BETS + REAL ERR, ERRMAX, RBETA, RBETS + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, + $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, + $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS + LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS + CHARACTER*2 ICHT, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CHER2K, CMAKE, CMMCH, CSYR2K +* .. Intrinsic Functions .. + INTRINSIC CMPLX, CONJG, MAX, REAL +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NC'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 2: 3 ).EQ.'HE' +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 130 + LCC = LDC*N +* + DO 120 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 110 ICT = 1, 2 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'C' + IF( TRAN.AND..NOT.CONJ ) + $ TRANS = 'T' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*NA +* +* Generate the matrix A. +* + IF( TRAN )THEN + CALL CMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA, + $ LDA, RESET, ZERO ) + ELSE + CALL CMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, + $ RESET, ZERO ) + END IF +* +* Generate the matrix B. +* + LDB = LDA + LBB = LAA + IF( TRAN )THEN + CALL CMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ), + $ 2*NMAX, BB, LDB, RESET, ZERO ) + ELSE + CALL CMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), + $ NMAX, BB, LDB, RESET, ZERO ) + END IF +* + DO 100 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 90 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 80 IB = 1, NBET + BETA = BET( IB ) + IF( CONJ )THEN + RBETA = REAL( BETA ) + BETA = CMPLX( RBETA, RZERO ) + END IF + NULL = N.LE.0 + IF( CONJ ) + $ NULL = NULL.OR.( ( K.LE.0.OR.ALPHA.EQ. + $ ZERO ).AND.RBETA.EQ.RONE ) +* +* Generate the matrix C. +* + CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, C, + $ NMAX, CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + IF( CONJ )THEN + RBETS = RBETA + ELSE + BETS = BETA + END IF + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( CONJ )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, + $ TRANS, N, K, ALPHA, LDA, LDB, RBETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL CHER2K( UPLO, TRANS, N, K, ALPHA, AA, + $ LDA, BB, LDB, RBETA, CC, LDC ) + ELSE + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, + $ TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL CSYR2K( UPLO, TRANS, N, K, ALPHA, AA, + $ LDA, BB, LDB, BETA, CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LCE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LCE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + IF( CONJ )THEN + ISAME( 10 ) = RBETS.EQ.RBETA + ELSE + ISAME( 10 ) = BETS.EQ.BETA + END IF + IF( NULL )THEN + ISAME( 11 ) = LCE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LCERES( 'HE', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( CONJ )THEN + TRANST = 'C' + ELSE + TRANST = 'T' + END IF + JJAB = 1 + JC = 1 + DO 70 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + DO 50 I = 1, K + W( I ) = ALPHA*AB( ( J - 1 )*2* + $ NMAX + K + I ) + IF( CONJ )THEN + W( K + I ) = CONJG( ALPHA )* + $ AB( ( J - 1 )*2* + $ NMAX + I ) + ELSE + W( K + I ) = ALPHA* + $ AB( ( J - 1 )*2* + $ NMAX + I ) + END IF + 50 CONTINUE + CALL CMMCH( TRANST, 'N', LJ, 1, 2*K, + $ ONE, AB( JJAB ), 2*NMAX, W, + $ 2*NMAX, BETA, C( JJ, J ), + $ NMAX, CT, G, CC( JC ), LDC, + $ EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE + DO 60 I = 1, K + IF( CONJ )THEN + W( I ) = ALPHA*CONJG( AB( ( K + + $ I - 1 )*NMAX + J ) ) + W( K + I ) = CONJG( ALPHA* + $ AB( ( I - 1 )*NMAX + + $ J ) ) + ELSE + W( I ) = ALPHA*AB( ( K + I - 1 )* + $ NMAX + J ) + W( K + I ) = ALPHA* + $ AB( ( I - 1 )*NMAX + + $ J ) + END IF + 60 CONTINUE + CALL CMMCH( 'N', 'N', LJ, 1, 2*K, ONE, + $ AB( JJ ), NMAX, W, 2*NMAX, + $ BETA, C( JJ, J ), NMAX, CT, + $ G, CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + IF( TRAN ) + $ JJAB = JJAB + 2*NMAX + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 140 + 70 CONTINUE + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( CONJ )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, + $ LDA, LDB, RBETA, LDC + ELSE + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, + $ LDA, LDB, BETA, LDC + END IF +* + 160 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, + $ ', C,', I3, ') .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, + $ ',', F4.1, '), C,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK5. +* + END + SUBROUTINE CCHKE( ISNUM, SRNAMT, NOUT ) +* +* Tests the error exits from the Level 3 Blas. +* Requires a special version of the error-handling routine XERBLA. +* ALPHA, RALPHA, BETA, RBETA, A, B and C should not need to be defined. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER ISNUM, NOUT + CHARACTER*6 SRNAMT +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Local Scalars .. + COMPLEX ALPHA, BETA + REAL RALPHA, RBETA +* .. Local Arrays .. + COMPLEX A( 2, 1 ), B( 2, 1 ), C( 2, 1 ) +* .. External Subroutines .. + EXTERNAL CGEMM, CHEMM, CHER2K, CHERK, CHKXER, CSYMM, + $ CSYR2K, CSYRK, CTRMM, CTRSM +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Executable Statements .. +* OK is set to .FALSE. by the special version of XERBLA or by CHKXER +* if anything is wrong. + OK = .TRUE. +* LERR is set to .TRUE. by the special version of XERBLA each time +* it is called, and is then tested and re-set by CHKXER. + LERR = .FALSE. + GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, + $ 90 )ISNUM + 10 INFOT = 1 + CALL CGEMM( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 1 + CALL CGEMM( '/', 'C', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 1 + CALL CGEMM( '/', 'T', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGEMM( 'N', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGEMM( 'C', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGEMM( 'T', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMM( 'N', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMM( 'N', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMM( 'N', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMM( 'C', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMM( 'C', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMM( 'C', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMM( 'T', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMM( 'T', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMM( 'T', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGEMM( 'N', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGEMM( 'N', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGEMM( 'N', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGEMM( 'C', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGEMM( 'C', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGEMM( 'C', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGEMM( 'T', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGEMM( 'T', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGEMM( 'T', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEMM( 'N', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEMM( 'N', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEMM( 'N', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEMM( 'C', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEMM( 'C', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEMM( 'C', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEMM( 'T', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEMM( 'T', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEMM( 'T', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMM( 'N', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMM( 'C', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMM( 'C', 'C', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMM( 'C', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMM( 'T', 'C', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMM( 'T', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGEMM( 'N', 'N', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGEMM( 'C', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGEMM( 'N', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGEMM( 'C', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGEMM( 'T', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGEMM( 'N', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGEMM( 'C', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGEMM( 'T', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL CGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL CGEMM( 'N', 'C', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL CGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL CGEMM( 'C', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL CGEMM( 'C', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL CGEMM( 'C', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL CGEMM( 'T', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL CGEMM( 'T', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL CGEMM( 'T', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 20 INFOT = 1 + CALL CHEMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CHEMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHEMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHEMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHEMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHEMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHEMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHEMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHEMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHEMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHEMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHEMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 30 INFOT = 1 + CALL CSYMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CSYMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 40 INFOT = 1 + CALL CTRMM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CTRMM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CTRMM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CTRMM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'L', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'R', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'L', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'R', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'L', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'R', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'L', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'R', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'R', 'U', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'R', 'L', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'R', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'R', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 50 INFOT = 1 + CALL CTRSM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CTRSM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CTRSM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CTRSM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'L', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'R', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'L', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'R', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'L', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'R', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'L', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'R', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'R', 'U', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'R', 'L', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'R', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'R', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 60 INFOT = 1 + CALL CHERK( '/', 'N', 0, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CHERK( 'U', 'T', 0, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHERK( 'U', 'N', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHERK( 'U', 'C', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHERK( 'L', 'N', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHERK( 'L', 'C', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHERK( 'U', 'N', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHERK( 'U', 'C', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHERK( 'L', 'N', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHERK( 'L', 'C', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHERK( 'U', 'N', 2, 0, RALPHA, A, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHERK( 'U', 'C', 0, 2, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHERK( 'L', 'N', 2, 0, RALPHA, A, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHERK( 'L', 'C', 0, 2, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CHERK( 'U', 'N', 2, 0, RALPHA, A, 2, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CHERK( 'U', 'C', 2, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CHERK( 'L', 'N', 2, 0, RALPHA, A, 2, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CHERK( 'L', 'C', 2, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 70 INFOT = 1 + CALL CSYRK( '/', 'N', 0, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CSYRK( 'U', 'C', 0, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYRK( 'U', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYRK( 'U', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYRK( 'L', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYRK( 'L', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYRK( 'U', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYRK( 'U', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYRK( 'L', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYRK( 'L', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYRK( 'U', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYRK( 'U', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYRK( 'L', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYRK( 'L', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CSYRK( 'U', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CSYRK( 'U', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CSYRK( 'L', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CSYRK( 'L', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 80 INFOT = 1 + CALL CHER2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CHER2K( 'U', 'T', 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHER2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHER2K( 'U', 'C', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHER2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHER2K( 'L', 'C', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHER2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHER2K( 'U', 'C', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHER2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHER2K( 'L', 'C', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHER2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHER2K( 'U', 'C', 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHER2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHER2K( 'L', 'C', 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CHER2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CHER2K( 'U', 'C', 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CHER2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CHER2K( 'L', 'C', 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CHER2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CHER2K( 'U', 'C', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CHER2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CHER2K( 'L', 'C', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 90 INFOT = 1 + CALL CSYR2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CSYR2K( 'U', 'C', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYR2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYR2K( 'U', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYR2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYR2K( 'L', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYR2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYR2K( 'U', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYR2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYR2K( 'L', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYR2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYR2K( 'U', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYR2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYR2K( 'L', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CSYR2K( 'U', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CSYR2K( 'L', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CSYR2K( 'U', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CSYR2K( 'L', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* + 100 IF( OK )THEN + WRITE( NOUT, FMT = 9999 )SRNAMT + ELSE + WRITE( NOUT, FMT = 9998 )SRNAMT + END IF + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' ) + 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****', + $ '**' ) +* +* End of CCHKE. +* + END + SUBROUTINE CMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, + $ TRANSL ) +* +* Generates values for an M by N matrix A. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'GE', 'HE', 'SY' or 'TR'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + COMPLEX ROGUE + PARAMETER ( ROGUE = ( -1.0E10, 1.0E10 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) + REAL RROGUE + PARAMETER ( RROGUE = -1.0E10 ) +* .. Scalar Arguments .. + COMPLEX TRANSL + INTEGER LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J, JJ + LOGICAL GEN, HER, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + COMPLEX CBEG + EXTERNAL CBEG +* .. Intrinsic Functions .. + INTRINSIC CMPLX, CONJG, REAL +* .. Executable Statements .. + GEN = TYPE.EQ.'GE' + HER = TYPE.EQ.'HE' + SYM = TYPE.EQ.'SY' + TRI = TYPE.EQ.'TR' + UPPER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + A( I, J ) = CBEG( RESET ) + TRANSL + IF( I.NE.J )THEN +* Set some elements to zero + IF( N.GT.3.AND.J.EQ.N/2 ) + $ A( I, J ) = ZERO + IF( HER )THEN + A( J, I ) = CONJG( A( I, J ) ) + ELSE IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( HER ) + $ A( J, J ) = CMPLX( REAL( A( J, J ) ), RZERO ) + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'GE' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN + DO 90 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 60 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 70 CONTINUE + DO 80 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + IF( HER )THEN + JJ = J + ( J - 1 )*LDA + AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE ) + END IF + 90 CONTINUE + END IF + RETURN +* +* End of CMAKE. +* + END + SUBROUTINE CMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, + $ NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RZERO, RONE + PARAMETER ( RZERO = 0.0, RONE = 1.0 ) +* .. Scalar Arguments .. + COMPLEX ALPHA, BETA + REAL EPS, ERR + INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANSA, TRANSB +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ CC( LDCC, * ), CT( * ) + REAL G( * ) +* .. Local Scalars .. + COMPLEX CL + REAL ERRI + INTEGER I, J, K + LOGICAL CTRANA, CTRANB, TRANA, TRANB +* .. Intrinsic Functions .. + INTRINSIC ABS, AIMAG, CONJG, MAX, REAL, SQRT +* .. Statement Functions .. + REAL ABS1 +* .. Statement Function definitions .. + ABS1( CL ) = ABS( REAL( CL ) ) + ABS( AIMAG( CL ) ) +* .. Executable Statements .. + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' + CTRANA = TRANSA.EQ.'C' + CTRANB = TRANSB.EQ.'C' +* +* Compute expected result, one column at a time, in CT using data +* in A, B and C. +* Compute gauges in G. +* + DO 220 J = 1, N +* + DO 10 I = 1, M + CT( I ) = ZERO + G( I ) = RZERO + 10 CONTINUE + IF( .NOT.TRANA.AND..NOT.TRANB )THEN + DO 30 K = 1, KK + DO 20 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( K, J ) + G( I ) = G( I ) + ABS1( A( I, K ) )*ABS1( B( K, J ) ) + 20 CONTINUE + 30 CONTINUE + ELSE IF( TRANA.AND..NOT.TRANB )THEN + IF( CTRANA )THEN + DO 50 K = 1, KK + DO 40 I = 1, M + CT( I ) = CT( I ) + CONJG( A( K, I ) )*B( K, J ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( K, J ) ) + 40 CONTINUE + 50 CONTINUE + ELSE + DO 70 K = 1, KK + DO 60 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( K, J ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( K, J ) ) + 60 CONTINUE + 70 CONTINUE + END IF + ELSE IF( .NOT.TRANA.AND.TRANB )THEN + IF( CTRANB )THEN + DO 90 K = 1, KK + DO 80 I = 1, M + CT( I ) = CT( I ) + A( I, K )*CONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( I, K ) )* + $ ABS1( B( J, K ) ) + 80 CONTINUE + 90 CONTINUE + ELSE + DO 110 K = 1, KK + DO 100 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( J, K ) + G( I ) = G( I ) + ABS1( A( I, K ) )* + $ ABS1( B( J, K ) ) + 100 CONTINUE + 110 CONTINUE + END IF + ELSE IF( TRANA.AND.TRANB )THEN + IF( CTRANA )THEN + IF( CTRANB )THEN + DO 130 K = 1, KK + DO 120 I = 1, M + CT( I ) = CT( I ) + CONJG( A( K, I ) )* + $ CONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 120 CONTINUE + 130 CONTINUE + ELSE + DO 150 K = 1, KK + DO 140 I = 1, M + CT( I ) = CT( I ) + CONJG( A( K, I ) )*B( J, K ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 140 CONTINUE + 150 CONTINUE + END IF + ELSE + IF( CTRANB )THEN + DO 170 K = 1, KK + DO 160 I = 1, M + CT( I ) = CT( I ) + A( K, I )*CONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 160 CONTINUE + 170 CONTINUE + ELSE + DO 190 K = 1, KK + DO 180 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( J, K ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 180 CONTINUE + 190 CONTINUE + END IF + END IF + END IF + DO 200 I = 1, M + CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) + G( I ) = ABS1( ALPHA )*G( I ) + + $ ABS1( BETA )*ABS1( C( I, J ) ) + 200 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 210 I = 1, M + ERRI = ABS1( CT( I ) - CC( I, J ) )/EPS + IF( G( I ).NE.RZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.RONE ) + $ GO TO 230 + 210 CONTINUE +* + 220 CONTINUE +* +* If the loop completes, all results are at least half accurate. + GO TO 250 +* +* Report fatal error. +* + 230 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 240 I = 1, M + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) + ELSE + WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) + END IF + 240 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9997 )J +* + 250 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RE', + $ 'SULT COMPUTED RESULT' ) + 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) + 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) +* +* End of CMMCH. +* + END + LOGICAL FUNCTION LCE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + COMPLEX RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LCE = .TRUE. + GO TO 30 + 20 CONTINUE + LCE = .FALSE. + 30 RETURN +* +* End of LCE. +* + END + LOGICAL FUNCTION LCERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'GE' or 'HE' or 'SY'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'GE' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'SY' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LCERES = .TRUE. + GO TO 80 + 70 CONTINUE + LCERES = .FALSE. + 80 RETURN +* +* End of LCERES. +* + END + COMPLEX FUNCTION CBEG( RESET ) +* +* Generates complex numbers as pairs of random numbers uniformly +* distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, J, MI, MJ +* .. Save statement .. + SAVE I, IC, J, MI, MJ +* .. Intrinsic Functions .. + INTRINSIC CMPLX +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + MJ = 457 + I = 7 + J = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I or J is bounded between 1 and 999. +* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. +* If initial I or J = 4 or 8, the period will be 25. +* If initial I or J = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I or J +* in 6. +* + IC = IC + 1 + 10 I = I*MI + J = J*MJ + I = I - 1000*( I/1000 ) + J = J - 1000*( J/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + CBEG = CMPLX( ( I - 500 )/1001.0, ( J - 500 )/1001.0 ) + RETURN +* +* End of CBEG. +* + END + REAL FUNCTION SDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + REAL X, Y +* .. Executable Statements .. + SDIFF = X - Y + RETURN +* +* End of SDIFF. +* + END + SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* +* Tests whether XERBLA has detected an error when it should. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Executable Statements .. + IF( .NOT.LERR )THEN + WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT + OK = .FALSE. + END IF + LERR = .FALSE. + RETURN +* + 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', + $ 'ETECTED BY ', A6, ' *****' ) +* +* End of CHKXER. +* + END + SUBROUTINE XERBLA( SRNAME, INFO ) +* +* This is a special version of XERBLA to be used only as part of +* the test program for testing error exits from the Level 3 BLAS +* routines. +* +* XERBLA is an error handler for the Level 3 BLAS routines. +* +* It is called by the Level 3 BLAS routines if an input parameter is +* invalid. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER INFO + CHARACTER*6 SRNAME +* .. Scalars in Common .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUT, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Executable Statements .. + LERR = .TRUE. + IF( INFO.NE.INFOT )THEN + IF( INFOT.NE.0 )THEN + WRITE( NOUT, FMT = 9999 )INFO, INFOT + ELSE + WRITE( NOUT, FMT = 9997 )INFO + END IF + OK = .FALSE. + END IF + IF( SRNAME.NE.SRNAMT )THEN + WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT + OK = .FALSE. + END IF + RETURN +* + 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD', + $ ' OF ', I2, ' *******' ) + 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE', + $ 'AD OF ', A6, ' *******' ) + 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, + $ ' *******' ) +* +* End of XERBLA +* + END + diff --git a/test/dblat1.f b/test/dblat1.f new file mode 100644 index 0000000..5a45d69 --- /dev/null +++ b/test/dblat1.f @@ -0,0 +1,769 @@ + PROGRAM DBLAT1 +* Test program for the DOUBLE PRECISION Level 1 BLAS. +* Based upon the original BLAS test routine together with: +* F06EAF Example Program Text +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SFAC + INTEGER IC +* .. External Subroutines .. + EXTERNAL CHECK0, CHECK1, CHECK2, CHECK3, HEADER +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SFAC/9.765625D-4/ +* .. Executable Statements .. + WRITE (NOUT,99999) + DO 20 IC = 1, 10 + ICASE = IC + CALL HEADER +* +* .. Initialize PASS, INCX, INCY, and MODE for a new case. .. +* .. the value 9999 for INCX, INCY or MODE will appear in the .. +* .. detailed output, if any, for cases that do not involve .. +* .. these parameters .. +* + PASS = .TRUE. + INCX = 9999 + INCY = 9999 + MODE = 9999 + IF (ICASE.EQ.3) THEN + CALL CHECK0(SFAC) + ELSE IF (ICASE.EQ.7 .OR. ICASE.EQ.8 .OR. ICASE.EQ.9 .OR. + + ICASE.EQ.10) THEN + CALL CHECK1(SFAC) + ELSE IF (ICASE.EQ.1 .OR. ICASE.EQ.2 .OR. ICASE.EQ.5 .OR. + + ICASE.EQ.6) THEN + CALL CHECK2(SFAC) + ELSE IF (ICASE.EQ.4) THEN + CALL CHECK3(SFAC) + END IF +* -- Print + IF (PASS) WRITE (NOUT,99998) + 20 CONTINUE + STOP +* +99999 FORMAT (' Real BLAS Test Program Results',/1X) +99998 FORMAT (' ----- PASS -----') + END + SUBROUTINE HEADER +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Arrays .. + CHARACTER*6 L(10) +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA L(1)/' DDOT '/ + DATA L(2)/'DAXPY '/ + DATA L(3)/'DROTG '/ + DATA L(4)/' DROT '/ + DATA L(5)/'DCOPY '/ + DATA L(6)/'DSWAP '/ + DATA L(7)/'DNRM2 '/ + DATA L(8)/'DASUM '/ + DATA L(9)/'DSCAL '/ + DATA L(10)/'IDAMAX'/ +* .. Executable Statements .. + WRITE (NOUT,99999) ICASE, L(ICASE) + RETURN +* +99999 FORMAT (/' Test of subprogram number',I3,12X,A6) + END + SUBROUTINE CHECK0(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION D12, SA, SB, SC, SS + INTEGER K +* .. Local Arrays .. + DOUBLE PRECISION DA1(8), DATRUE(8), DB1(8), DBTRUE(8), DC1(8), + + DS1(8) +* .. External Subroutines .. + EXTERNAL DROTG, STEST1 +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA DA1/0.3D0, 0.4D0, -0.3D0, -0.4D0, -0.3D0, 0.0D0, + + 0.0D0, 1.0D0/ + DATA DB1/0.4D0, 0.3D0, 0.4D0, 0.3D0, -0.4D0, 0.0D0, + + 1.0D0, 0.0D0/ + DATA DC1/0.6D0, 0.8D0, -0.6D0, 0.8D0, 0.6D0, 1.0D0, + + 0.0D0, 1.0D0/ + DATA DS1/0.8D0, 0.6D0, 0.8D0, -0.6D0, 0.8D0, 0.0D0, + + 1.0D0, 0.0D0/ + DATA DATRUE/0.5D0, 0.5D0, 0.5D0, -0.5D0, -0.5D0, + + 0.0D0, 1.0D0, 1.0D0/ + DATA DBTRUE/0.0D0, 0.6D0, 0.0D0, -0.6D0, 0.0D0, + + 0.0D0, 1.0D0, 0.0D0/ + DATA D12/4096.0D0/ +* .. Executable Statements .. +* +* Compute true values which cannot be prestored +* in decimal notation +* + DBTRUE(1) = 1.0D0/0.6D0 + DBTRUE(3) = -1.0D0/0.6D0 + DBTRUE(5) = 1.0D0/0.6D0 +* + DO 20 K = 1, 8 +* .. Set N=K for identification in output if any .. + N = K + IF (ICASE.EQ.3) THEN +* .. DROTG .. + IF (K.GT.8) GO TO 40 + SA = DA1(K) + SB = DB1(K) + CALL DROTG(SA,SB,SC,SS) + CALL STEST1(SA,DATRUE(K),DATRUE(K),SFAC) + CALL STEST1(SB,DBTRUE(K),DBTRUE(K),SFAC) + CALL STEST1(SC,DC1(K),DC1(K),SFAC) + CALL STEST1(SS,DS1(K),DS1(K),SFAC) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK0' + STOP + END IF + 20 CONTINUE + 40 RETURN + END + SUBROUTINE CHECK1(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER I, LEN, NP1 +* .. Local Arrays .. + DOUBLE PRECISION DTRUE1(5), DTRUE3(5), DTRUE5(8,5,2), DV(8,5,2), + + SA(10), STEMP(1), STRUE(8), SX(8) + INTEGER ITRUE2(5) +* .. External Functions .. + DOUBLE PRECISION DASUM, DNRM2 + INTEGER IDAMAX + EXTERNAL DASUM, DNRM2, IDAMAX +* .. External Subroutines .. + EXTERNAL ITEST1, DSCAL, STEST, STEST1 +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA/0.3D0, -1.0D0, 0.0D0, 1.0D0, 0.3D0, 0.3D0, + + 0.3D0, 0.3D0, 0.3D0, 0.3D0/ + DATA DV/0.1D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, + + 2.0D0, 2.0D0, 0.3D0, 3.0D0, 3.0D0, 3.0D0, 3.0D0, + + 3.0D0, 3.0D0, 3.0D0, 0.3D0, -0.4D0, 4.0D0, + + 4.0D0, 4.0D0, 4.0D0, 4.0D0, 4.0D0, 0.2D0, + + -0.6D0, 0.3D0, 5.0D0, 5.0D0, 5.0D0, 5.0D0, + + 5.0D0, 0.1D0, -0.3D0, 0.5D0, -0.1D0, 6.0D0, + + 6.0D0, 6.0D0, 6.0D0, 0.1D0, 8.0D0, 8.0D0, 8.0D0, + + 8.0D0, 8.0D0, 8.0D0, 8.0D0, 0.3D0, 9.0D0, 9.0D0, + + 9.0D0, 9.0D0, 9.0D0, 9.0D0, 9.0D0, 0.3D0, 2.0D0, + + -0.4D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, + + 0.2D0, 3.0D0, -0.6D0, 5.0D0, 0.3D0, 2.0D0, + + 2.0D0, 2.0D0, 0.1D0, 4.0D0, -0.3D0, 6.0D0, + + -0.5D0, 7.0D0, -0.1D0, 3.0D0/ + DATA DTRUE1/0.0D0, 0.3D0, 0.5D0, 0.7D0, 0.6D0/ + DATA DTRUE3/0.0D0, 0.3D0, 0.7D0, 1.1D0, 1.0D0/ + DATA DTRUE5/0.10D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, + + 2.0D0, 2.0D0, 2.0D0, -0.3D0, 3.0D0, 3.0D0, + + 3.0D0, 3.0D0, 3.0D0, 3.0D0, 3.0D0, 0.0D0, 0.0D0, + + 4.0D0, 4.0D0, 4.0D0, 4.0D0, 4.0D0, 4.0D0, + + 0.20D0, -0.60D0, 0.30D0, 5.0D0, 5.0D0, 5.0D0, + + 5.0D0, 5.0D0, 0.03D0, -0.09D0, 0.15D0, -0.03D0, + + 6.0D0, 6.0D0, 6.0D0, 6.0D0, 0.10D0, 8.0D0, + + 8.0D0, 8.0D0, 8.0D0, 8.0D0, 8.0D0, 8.0D0, + + 0.09D0, 9.0D0, 9.0D0, 9.0D0, 9.0D0, 9.0D0, + + 9.0D0, 9.0D0, 0.09D0, 2.0D0, -0.12D0, 2.0D0, + + 2.0D0, 2.0D0, 2.0D0, 2.0D0, 0.06D0, 3.0D0, + + -0.18D0, 5.0D0, 0.09D0, 2.0D0, 2.0D0, 2.0D0, + + 0.03D0, 4.0D0, -0.09D0, 6.0D0, -0.15D0, 7.0D0, + + -0.03D0, 3.0D0/ + DATA ITRUE2/0, 1, 2, 2, 3/ +* .. Executable Statements .. + DO 80 INCX = 1, 2 + DO 60 NP1 = 1, 5 + N = NP1 - 1 + LEN = 2*MAX(N,1) +* .. Set vector arguments .. + DO 20 I = 1, LEN + SX(I) = DV(I,NP1,INCX) + 20 CONTINUE +* + IF (ICASE.EQ.7) THEN +* .. DNRM2 .. + STEMP(1) = DTRUE1(NP1) + CALL STEST1(DNRM2(N,SX,INCX),STEMP,STEMP,SFAC) + ELSE IF (ICASE.EQ.8) THEN +* .. DASUM .. + STEMP(1) = DTRUE3(NP1) + CALL STEST1(DASUM(N,SX,INCX),STEMP,STEMP,SFAC) + ELSE IF (ICASE.EQ.9) THEN +* .. DSCAL .. + CALL DSCAL(N,SA((INCX-1)*5+NP1),SX,INCX) + DO 40 I = 1, LEN + STRUE(I) = DTRUE5(I,NP1,INCX) + 40 CONTINUE + CALL STEST(LEN,SX,STRUE,STRUE,SFAC) + ELSE IF (ICASE.EQ.10) THEN +* .. IDAMAX .. + CALL ITEST1(IDAMAX(N,SX,INCX),ITRUE2(NP1)) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' + STOP + END IF + 60 CONTINUE + 80 CONTINUE + RETURN + END + SUBROUTINE CHECK2(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SA, SC, SS + INTEGER I, J, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + DOUBLE PRECISION DT10X(7,4,4), DT10Y(7,4,4), DT7(4,4), + + DT8(7,4,4), DT9X(7,4,4), DT9Y(7,4,4), DX1(7), + + DY1(7), SSIZE1(4), SSIZE2(14,2), STX(7), STY(7), + + SX(7), SY(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), NS(4) +* .. External Functions .. + DOUBLE PRECISION DDOT + EXTERNAL DDOT +* .. External Subroutines .. + EXTERNAL DAXPY, DCOPY, DSWAP, STEST, STEST1 +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA/0.3D0/ + DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA DX1/0.6D0, 0.1D0, -0.5D0, 0.8D0, 0.9D0, -0.3D0, + + -0.4D0/ + DATA DY1/0.5D0, -0.9D0, 0.3D0, 0.7D0, -0.6D0, 0.2D0, + + 0.8D0/ + DATA SC, SS/0.8D0, 0.6D0/ + DATA DT7/0.0D0, 0.30D0, 0.21D0, 0.62D0, 0.0D0, + + 0.30D0, -0.07D0, 0.85D0, 0.0D0, 0.30D0, -0.79D0, + + -0.74D0, 0.0D0, 0.30D0, 0.33D0, 1.27D0/ + DATA DT8/0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.68D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.68D0, -0.87D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.68D0, -0.87D0, 0.15D0, + + 0.94D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.68D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.35D0, -0.9D0, 0.48D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.38D0, -0.9D0, 0.57D0, 0.7D0, -0.75D0, + + 0.2D0, 0.98D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.68D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.35D0, -0.72D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.38D0, + + -0.63D0, 0.15D0, 0.88D0, 0.0D0, 0.0D0, 0.0D0, + + 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.68D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.68D0, -0.9D0, 0.33D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.68D0, -0.9D0, 0.33D0, 0.7D0, + + -0.75D0, 0.2D0, 1.04D0/ + DATA DT9X/0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.78D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.78D0, -0.46D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.78D0, -0.46D0, -0.22D0, + + 1.06D0, 0.0D0, 0.0D0, 0.0D0, 0.6D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.78D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.66D0, 0.1D0, -0.1D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.96D0, 0.1D0, -0.76D0, 0.8D0, 0.90D0, + + -0.3D0, -0.02D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.78D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, -0.06D0, 0.1D0, + + -0.1D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.90D0, + + 0.1D0, -0.22D0, 0.8D0, 0.18D0, -0.3D0, -0.02D0, + + 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.78D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.78D0, 0.26D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.78D0, 0.26D0, -0.76D0, 1.12D0, + + 0.0D0, 0.0D0, 0.0D0/ + DATA DT9Y/0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.04D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.04D0, -0.78D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.04D0, -0.78D0, 0.54D0, + + 0.08D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.04D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.7D0, + + -0.9D0, -0.12D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.64D0, -0.9D0, -0.30D0, 0.7D0, -0.18D0, 0.2D0, + + 0.28D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.04D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.7D0, -1.08D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.64D0, -1.26D0, + + 0.54D0, 0.20D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.04D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.04D0, -0.9D0, 0.18D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.04D0, -0.9D0, 0.18D0, 0.7D0, + + -0.18D0, 0.2D0, 0.16D0/ + DATA DT10X/0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.5D0, -0.9D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.5D0, -0.9D0, 0.3D0, 0.7D0, + + 0.0D0, 0.0D0, 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.3D0, 0.1D0, 0.5D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.8D0, 0.1D0, -0.6D0, + + 0.8D0, 0.3D0, -0.3D0, 0.5D0, 0.6D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, -0.9D0, + + 0.1D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.7D0, + + 0.1D0, 0.3D0, 0.8D0, -0.9D0, -0.3D0, 0.5D0, + + 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.5D0, 0.3D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.5D0, 0.3D0, -0.6D0, 0.8D0, 0.0D0, 0.0D0, + + 0.0D0/ + DATA DT10Y/0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.6D0, 0.1D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.6D0, 0.1D0, -0.5D0, 0.8D0, 0.0D0, + + 0.0D0, 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, -0.5D0, -0.9D0, 0.6D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, -0.4D0, -0.9D0, 0.9D0, + + 0.7D0, -0.5D0, 0.2D0, 0.6D0, 0.5D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.6D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, -0.5D0, + + 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + -0.4D0, 0.9D0, -0.5D0, 0.6D0, 0.0D0, 0.0D0, + + 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.6D0, -0.9D0, 0.1D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.6D0, -0.9D0, 0.1D0, 0.7D0, + + -0.5D0, 0.2D0, 0.8D0/ + DATA SSIZE1/0.0D0, 0.3D0, 1.6D0, 3.2D0/ + DATA SSIZE2/0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, + + 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, + + 1.17D0, 1.17D0, 1.17D0/ +* .. Executable Statements .. +* + DO 120 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 100 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* .. Initialize all argument arrays .. + DO 20 I = 1, 7 + SX(I) = DX1(I) + SY(I) = DY1(I) + 20 CONTINUE +* + IF (ICASE.EQ.1) THEN +* .. DDOT .. + CALL STEST1(DDOT(N,SX,INCX,SY,INCY),DT7(KN,KI),SSIZE1(KN) + + ,SFAC) + ELSE IF (ICASE.EQ.2) THEN +* .. DAXPY .. + CALL DAXPY(N,SA,SX,INCX,SY,INCY) + DO 40 J = 1, LENY + STY(J) = DT8(J,KN,KI) + 40 CONTINUE + CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) + ELSE IF (ICASE.EQ.5) THEN +* .. DCOPY .. + DO 60 I = 1, 7 + STY(I) = DT10Y(I,KN,KI) + 60 CONTINUE + CALL DCOPY(N,SX,INCX,SY,INCY) + CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0D0) + ELSE IF (ICASE.EQ.6) THEN +* .. DSWAP .. + CALL DSWAP(N,SX,INCX,SY,INCY) + DO 80 I = 1, 7 + STX(I) = DT10X(I,KN,KI) + STY(I) = DT10Y(I,KN,KI) + 80 CONTINUE + CALL STEST(LENX,SX,STX,SSIZE2(1,1),1.0D0) + CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0D0) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' + STOP + END IF + 100 CONTINUE + 120 CONTINUE + RETURN + END + SUBROUTINE CHECK3(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SA, SC, SS + INTEGER I, K, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + DOUBLE PRECISION COPYX(5), COPYY(5), DT9X(7,4,4), DT9Y(7,4,4), + + DX1(7), DY1(7), MWPC(11), MWPS(11), MWPSTX(5), + + MWPSTY(5), MWPTX(11,5), MWPTY(11,5), MWPX(5), + + MWPY(5), SSIZE2(14,2), STX(7), STY(7), SX(7), + + SY(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), MWPINX(11), + + MWPINY(11), MWPN(11), NS(4) +* .. External Subroutines .. + EXTERNAL DROT, STEST +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA/0.3D0/ + DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA DX1/0.6D0, 0.1D0, -0.5D0, 0.8D0, 0.9D0, -0.3D0, + + -0.4D0/ + DATA DY1/0.5D0, -0.9D0, 0.3D0, 0.7D0, -0.6D0, 0.2D0, + + 0.8D0/ + DATA SC, SS/0.8D0, 0.6D0/ + DATA DT9X/0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.78D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.78D0, -0.46D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.78D0, -0.46D0, -0.22D0, + + 1.06D0, 0.0D0, 0.0D0, 0.0D0, 0.6D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.78D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.66D0, 0.1D0, -0.1D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.96D0, 0.1D0, -0.76D0, 0.8D0, 0.90D0, + + -0.3D0, -0.02D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.78D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, -0.06D0, 0.1D0, + + -0.1D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.90D0, + + 0.1D0, -0.22D0, 0.8D0, 0.18D0, -0.3D0, -0.02D0, + + 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.78D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.78D0, 0.26D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.78D0, 0.26D0, -0.76D0, 1.12D0, + + 0.0D0, 0.0D0, 0.0D0/ + DATA DT9Y/0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.04D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.04D0, -0.78D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.04D0, -0.78D0, 0.54D0, + + 0.08D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.04D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.7D0, + + -0.9D0, -0.12D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.64D0, -0.9D0, -0.30D0, 0.7D0, -0.18D0, 0.2D0, + + 0.28D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.04D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.7D0, -1.08D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.64D0, -1.26D0, + + 0.54D0, 0.20D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.04D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.04D0, -0.9D0, 0.18D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.04D0, -0.9D0, 0.18D0, 0.7D0, + + -0.18D0, 0.2D0, 0.16D0/ + DATA SSIZE2/0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, + + 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, + + 1.17D0, 1.17D0, 1.17D0/ +* .. Executable Statements .. +* + DO 60 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 40 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* + IF (ICASE.EQ.4) THEN +* .. DROT .. + DO 20 I = 1, 7 + SX(I) = DX1(I) + SY(I) = DY1(I) + STX(I) = DT9X(I,KN,KI) + STY(I) = DT9Y(I,KN,KI) + 20 CONTINUE + CALL DROT(N,SX,INCX,SY,INCY,SC,SS) + CALL STEST(LENX,SX,STX,SSIZE2(1,KSIZE),SFAC) + CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' + STOP + END IF + 40 CONTINUE + 60 CONTINUE +* + MWPC(1) = 1 + DO 80 I = 2, 11 + MWPC(I) = 0 + 80 CONTINUE + MWPS(1) = 0 + DO 100 I = 2, 6 + MWPS(I) = 1 + 100 CONTINUE + DO 120 I = 7, 11 + MWPS(I) = -1 + 120 CONTINUE + MWPINX(1) = 1 + MWPINX(2) = 1 + MWPINX(3) = 1 + MWPINX(4) = -1 + MWPINX(5) = 1 + MWPINX(6) = -1 + MWPINX(7) = 1 + MWPINX(8) = 1 + MWPINX(9) = -1 + MWPINX(10) = 1 + MWPINX(11) = -1 + MWPINY(1) = 1 + MWPINY(2) = 1 + MWPINY(3) = -1 + MWPINY(4) = -1 + MWPINY(5) = 2 + MWPINY(6) = 1 + MWPINY(7) = 1 + MWPINY(8) = -1 + MWPINY(9) = -1 + MWPINY(10) = 2 + MWPINY(11) = 1 + DO 140 I = 1, 11 + MWPN(I) = 5 + 140 CONTINUE + MWPN(5) = 3 + MWPN(10) = 3 + DO 160 I = 1, 5 + MWPX(I) = I + MWPY(I) = I + MWPTX(1,I) = I + MWPTY(1,I) = I + MWPTX(2,I) = I + MWPTY(2,I) = -I + MWPTX(3,I) = 6 - I + MWPTY(3,I) = I - 6 + MWPTX(4,I) = I + MWPTY(4,I) = -I + MWPTX(6,I) = 6 - I + MWPTY(6,I) = I - 6 + MWPTX(7,I) = -I + MWPTY(7,I) = I + MWPTX(8,I) = I - 6 + MWPTY(8,I) = 6 - I + MWPTX(9,I) = -I + MWPTY(9,I) = I + MWPTX(11,I) = I - 6 + MWPTY(11,I) = 6 - I + 160 CONTINUE + MWPTX(5,1) = 1 + MWPTX(5,2) = 3 + MWPTX(5,3) = 5 + MWPTX(5,4) = 4 + MWPTX(5,5) = 5 + MWPTY(5,1) = -1 + MWPTY(5,2) = 2 + MWPTY(5,3) = -2 + MWPTY(5,4) = 4 + MWPTY(5,5) = -3 + MWPTX(10,1) = -1 + MWPTX(10,2) = -3 + MWPTX(10,3) = -5 + MWPTX(10,4) = 4 + MWPTX(10,5) = 5 + MWPTY(10,1) = 1 + MWPTY(10,2) = 2 + MWPTY(10,3) = 2 + MWPTY(10,4) = 4 + MWPTY(10,5) = 3 + DO 200 I = 1, 11 + INCX = MWPINX(I) + INCY = MWPINY(I) + DO 180 K = 1, 5 + COPYX(K) = MWPX(K) + COPYY(K) = MWPY(K) + MWPSTX(K) = MWPTX(I,K) + MWPSTY(K) = MWPTY(I,K) + 180 CONTINUE + CALL DROT(MWPN(I),COPYX,INCX,COPYY,INCY,MWPC(I),MWPS(I)) + CALL STEST(5,COPYX,MWPSTX,MWPSTX,SFAC) + CALL STEST(5,COPYY,MWPSTY,MWPSTY,SFAC) + 200 CONTINUE + RETURN + END + SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) +* ********************************* STEST ************************** +* +* THIS SUBR COMPARES ARRAYS SCOMP() AND STRUE() OF LENGTH LEN TO +* SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE +* NEGLIGIBLE. +* +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC + INTEGER LEN +* .. Array Arguments .. + DOUBLE PRECISION SCOMP(LEN), SSIZE(LEN), STRUE(LEN) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SD + INTEGER I +* .. External Functions .. + DOUBLE PRECISION SDIFF + EXTERNAL SDIFF +* .. Intrinsic Functions .. + INTRINSIC ABS +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + DO 40 I = 1, LEN + SD = SCOMP(I) - STRUE(I) + IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0D0) + + GO TO 40 +* +* HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I), + + STRUE(I), SD, SSIZE(I) + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE I ', + + ' COMP(I) TRUE(I) DIFFERENCE', + + ' SIZE(I)',/1X) +99997 FORMAT (1X,I4,I3,3I5,I3,2D36.8,2D12.4) + END + SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) +* ************************* STEST1 ***************************** +* +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE +* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + DOUBLE PRECISION SCOMP1, SFAC, STRUE1 +* .. Array Arguments .. + DOUBLE PRECISION SSIZE(*) +* .. Local Arrays .. + DOUBLE PRECISION SCOMP(1), STRUE(1) +* .. External Subroutines .. + EXTERNAL STEST +* .. Executable Statements .. +* + SCOMP(1) = SCOMP1 + STRUE(1) = STRUE1 + CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC) +* + RETURN + END + DOUBLE PRECISION FUNCTION SDIFF(SA,SB) +* ********************************* SDIFF ************************** +* COMPUTES DIFFERENCE OF TWO NUMBERS. C. L. LAWSON, JPL 1974 FEB 15 +* +* .. Scalar Arguments .. + DOUBLE PRECISION SA, SB +* .. Executable Statements .. + SDIFF = SA - SB + RETURN + END + SUBROUTINE ITEST1(ICOMP,ITRUE) +* ********************************* ITEST1 ************************* +* +* THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR +* EQUALITY. +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + INTEGER ICOMP, ITRUE +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER ID +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + IF (ICOMP.EQ.ITRUE) GO TO 40 +* +* HERE ICOMP IS NOT EQUAL TO ITRUE. +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 ID = ICOMP - ITRUE + WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE ', + + ' COMP TRUE DIFFERENCE', + + /1X) +99997 FORMAT (1X,I4,I3,3I5,2I36,I12) + END diff --git a/test/dblat2.dat b/test/dblat2.dat new file mode 100644 index 0000000..2680425 --- /dev/null +++ b/test/dblat2.dat @@ -0,0 +1,34 @@ +'DBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE +6 UNIT NUMBER OF SUMMARY FILE +'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +16.0 THRESHOLD VALUE OF TEST RATIO +7 NUMBER OF VALUES OF N +0 1 2 3 7 31 63 VALUES OF N +4 NUMBER OF VALUES OF K +0 1 2 4 VALUES OF K +4 NUMBER OF VALUES OF INCX AND INCY +1 2 -1 -2 VALUES OF INCX AND INCY +3 NUMBER OF VALUES OF ALPHA +0.0 1.0 0.7 VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +0.0 1.0 0.9 VALUES OF BETA +DGEMV T PUT F FOR NO TEST. SAME COLUMNS. +DGBMV T PUT F FOR NO TEST. SAME COLUMNS. +DSYMV T PUT F FOR NO TEST. SAME COLUMNS. +DSBMV T PUT F FOR NO TEST. SAME COLUMNS. +DSPMV T PUT F FOR NO TEST. SAME COLUMNS. +DTRMV T PUT F FOR NO TEST. SAME COLUMNS. +DTBMV T PUT F FOR NO TEST. SAME COLUMNS. +DTPMV T PUT F FOR NO TEST. SAME COLUMNS. +DTRSV T PUT F FOR NO TEST. SAME COLUMNS. +DTBSV T PUT F FOR NO TEST. SAME COLUMNS. +DTPSV T PUT F FOR NO TEST. SAME COLUMNS. +DGER T PUT F FOR NO TEST. SAME COLUMNS. +DSYR T PUT F FOR NO TEST. SAME COLUMNS. +DSPR T PUT F FOR NO TEST. SAME COLUMNS. +DSYR2 T PUT F FOR NO TEST. SAME COLUMNS. +DSPR2 T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/test/dblat2.f b/test/dblat2.f new file mode 100644 index 0000000..4002d43 --- /dev/null +++ b/test/dblat2.f @@ -0,0 +1,3138 @@ + PROGRAM DBLAT2 +* +* Test program for the DOUBLE PRECISION Level 2 Blas. +* +* The program must be driven by a short data file. The first 18 records +* of the file are read using list-directed input, the last 16 records +* are read using the format ( A6, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 34 lines: +* 'DBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE +* 6 UNIT NUMBER OF SUMMARY FILE +* 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 4 NUMBER OF VALUES OF K +* 0 1 2 4 VALUES OF K +* 4 NUMBER OF VALUES OF INCX AND INCY +* 1 2 -1 -2 VALUES OF INCX AND INCY +* 3 NUMBER OF VALUES OF ALPHA +* 0.0 1.0 0.7 VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* 0.0 1.0 0.9 VALUES OF BETA +* DGEMV T PUT F FOR NO TEST. SAME COLUMNS. +* DGBMV T PUT F FOR NO TEST. SAME COLUMNS. +* DSYMV T PUT F FOR NO TEST. SAME COLUMNS. +* DSBMV T PUT F FOR NO TEST. SAME COLUMNS. +* DSPMV T PUT F FOR NO TEST. SAME COLUMNS. +* DTRMV T PUT F FOR NO TEST. SAME COLUMNS. +* DTBMV T PUT F FOR NO TEST. SAME COLUMNS. +* DTPMV T PUT F FOR NO TEST. SAME COLUMNS. +* DTRSV T PUT F FOR NO TEST. SAME COLUMNS. +* DTBSV T PUT F FOR NO TEST. SAME COLUMNS. +* DTPSV T PUT F FOR NO TEST. SAME COLUMNS. +* DGER T PUT F FOR NO TEST. SAME COLUMNS. +* DSYR T PUT F FOR NO TEST. SAME COLUMNS. +* DSPR T PUT F FOR NO TEST. SAME COLUMNS. +* DSYR2 T PUT F FOR NO TEST. SAME COLUMNS. +* DSPR2 T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +* An extended set of Fortran Basic Linear Algebra Subprograms. +* +* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +* and Computer Science Division, Argonne National Laboratory, +* 9700 South Cass Avenue, Argonne, Illinois 60439, US. +* +* Or +* +* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +* +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + INTEGER NIN + PARAMETER ( NIN = 5 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 16 ) + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) + INTEGER NMAX, INCMAX + PARAMETER ( NMAX = 65, INCMAX = 2 ) + INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX + PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, + $ NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + DOUBLE PRECISION EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB, + $ NOUT, NTRA + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR + CHARACTER*1 TRANS + CHARACTER*6 SNAMET + CHARACTER*32 SNAPS, SUMMRY +* .. Local Arrays .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ), + $ G( NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( 2*NMAX ) + INTEGER IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*6 SNAMES( NSUBS ) +* .. External Functions .. + DOUBLE PRECISION DDIFF + LOGICAL LDE + EXTERNAL DDIFF, LDE +* .. External Subroutines .. + EXTERNAL DCHK1, DCHK2, DCHK3, DCHK4, DCHK5, DCHK6, + $ DCHKE, DMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'DGEMV ', 'DGBMV ', 'DSYMV ', 'DSBMV ', + $ 'DSPMV ', 'DTRMV ', 'DTBMV ', 'DTPMV ', + $ 'DTRSV ', 'DTBSV ', 'DTPSV ', 'DGER ', + $ 'DSYR ', 'DSPR ', 'DSYR2 ', 'DSPR2 '/ +* .. Executable Statements .. +* +* Read name and unit number for summary output file and open file. +* + READ( NIN, FMT = * )SUMMRY + READ( NIN, FMT = * )NOUT + OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 230 + END IF + 10 CONTINUE +* Values of K + READ( NIN, FMT = * )NKB + IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN + WRITE( NOUT, FMT = 9997 )'K', NKBMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( KB( I ), I = 1, NKB ) + DO 20 I = 1, NKB + IF( KB( I ).LT.0 )THEN + WRITE( NOUT, FMT = 9995 ) + GO TO 230 + END IF + 20 CONTINUE +* Values of INCX and INCY + READ( NIN, FMT = * )NINC + IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN + WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( INC( I ), I = 1, NINC ) + DO 30 I = 1, NINC + IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN + WRITE( NOUT, FMT = 9994 )INCMAX + GO TO 230 + END IF + 30 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9993 ) + WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB ) + WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC ) + WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9980 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 40 I = 1, NSUBS + LTEST( I ) = .FALSE. + 40 CONTINUE + 50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT + DO 60 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 70 + 60 CONTINUE + WRITE( NOUT, FMT = 9986 )SNAMET + STOP + 70 LTEST( I ) = LTESTT + GO TO 50 +* + 80 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = ONE + 90 CONTINUE + IF( DDIFF( ONE + EPS, ONE ).EQ.ZERO ) + $ GO TO 100 + EPS = HALF*EPS + GO TO 90 + 100 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of DMVCH using exact data. +* + N = MIN( 32, NMAX ) + DO 120 J = 1, N + DO 110 I = 1, N + A( I, J ) = MAX( I - J + 1, 0 ) + 110 CONTINUE + X( J ) = J + Y( J ) = ZERO + 120 CONTINUE + DO 130 J = 1, N + YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE +* YY holds the exact result. On exit from DMVCH YT holds +* the result computed by DMVCH. + TRANS = 'N' + CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF + TRANS = 'T' + CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 210 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL DCHKE( ISNUM, SNAMES( ISNUM ), NOUT ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 140, 150, 150, 150, 160, 160, + $ 160, 160, 160, 160, 170, 180, 180, + $ 190, 190 )ISNUM +* Test DGEMV, 01, and DGBMV, 02. + 140 CALL DCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G ) + GO TO 200 +* Test DSYMV, 03, DSBMV, 04, and DSPMV, 05. + 150 CALL DCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G ) + GO TO 200 +* Test DTRMV, 06, DTBMV, 07, DTPMV, 08, +* DTRSV, 09, DTBSV, 10, and DTPSV, 11. + 160 CALL DCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z ) + GO TO 200 +* Test DGER, 12. + 170 CALL DCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z ) + GO TO 200 +* Test DSYR, 13, and DSPR, 14. + 180 CALL DCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z ) + GO TO 200 +* Test DSYR2, 15, and DSPR2, 16. + 190 CALL DCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z ) +* + 200 IF( FATAL.AND.SFATAL ) + $ GO TO 220 + END IF + 210 CONTINUE + WRITE( NOUT, FMT = 9982 ) + GO TO 240 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9981 ) + GO TO 240 +* + 230 CONTINUE + WRITE( NOUT, FMT = 9987 ) +* + 240 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, D9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' ) + 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ', + $ I2 ) + 9993 FORMAT( ' TESTS OF THE DOUBLE PRECISION LEVEL 2 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9992 FORMAT( ' FOR N ', 9I6 ) + 9991 FORMAT( ' FOR K ', 7I6 ) + 9990 FORMAT( ' FOR INCX AND INCY ', 7I6 ) + 9989 FORMAT( ' FOR ALPHA ', 7F6.1 ) + 9988 FORMAT( ' FOR BETA ', 7F6.1 ) + 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9986 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9985 FORMAT( ' ERROR IN DMVCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' DMVCH WAS CALLED WITH TRANS = ', A1, + $ ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', / + $ ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.' + $ , /' ******* TESTS ABANDONED *******' ) + 9984 FORMAT( A6, L2 ) + 9983 FORMAT( 1X, A6, ' WAS NOT TESTED' ) + 9982 FORMAT( /' END OF TESTS' ) + 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of DBLAT2. +* + END + SUBROUTINE DCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G ) +* +* Tests DGEMV and DGBMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), G( NMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA, + $ LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK, + $ NL, NS + LOGICAL BANDED, FULL, NULL, RESET, SAME, TRAN + CHARACTER*1 TRANS, TRANSS + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DGBMV, DGEMV, DMAKE, DMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'E' + BANDED = SNAME( 3: 3 ).EQ.'B' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 11 + ELSE IF( BANDED )THEN + NARGS = 13 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IKU = 1, NK + IF( BANDED )THEN + KU = KB( IKU ) + KL = MAX( KU - 1, 0 ) + ELSE + KU = N - 1 + KL = M - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = KL + KU + 1 + ELSE + LDA = M + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX, AA, + $ LDA, KL, KU, RESET, TRANSL ) +* + DO 90 IC = 1, 3 + TRANS = ICH( IC: IC ) + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' +* + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*NL +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'GE', ' ', ' ', 1, NL, X, 1, XX, + $ ABS( INCX ), 0, NL - 1, RESET, TRANSL ) + IF( NL.GT.1 )THEN + X( NL/2 ) = ZERO + XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*ML +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL DMAKE( 'GE', ' ', ' ', 1, ML, Y, 1, + $ YY, ABS( INCY ), 0, ML - 1, + $ RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANSS = TRANS + MS = M + NS = N + KLS = KL + KUS = KU + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ TRANS, M, N, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL DGEMV( TRANS, M, N, ALPHA, AA, + $ LDA, XX, INCX, BETA, YY, + $ INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ TRANS, M, N, KL, KU, ALPHA, LDA, + $ INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL DGBMV( TRANS, M, N, KL, KU, ALPHA, + $ AA, LDA, XX, INCX, BETA, + $ YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 130 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = TRANS.EQ.TRANSS + ISAME( 2 ) = MS.EQ.M + ISAME( 3 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LDE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LDE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LDE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LDERES( 'GE', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 4 ) = KLS.EQ.KL + ISAME( 5 ) = KUS.EQ.KU + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LDE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LDE( XS, XX, LX ) + ISAME( 10 ) = INCXS.EQ.INCX + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LDE( YS, YY, LY ) + ELSE + ISAME( 12 ) = LDERES( 'GE', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 13 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 130 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL DMVCH( TRANS, M, N, ALPHA, A, + $ NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 130 + ELSE +* Avoid repeating tests with M.le.0 or +* N.le.0. + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 140 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, TRANS, M, N, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANS, M, N, KL, KU, + $ ALPHA, LDA, INCX, BETA, INCY + END IF +* + 140 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 4( I3, ',' ), F4.1, + $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), F4.1, + $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, + $ ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK1. +* + END + SUBROUTINE DCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G ) +* +* Tests DSYMV, DSBMV and DSPMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), G( NMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IB, IC, IK, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY, + $ N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 UPLO, UPLOS + CHARACTER*2 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMVCH, DSBMV, DSPMV, DSYMV +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'Y' + BANDED = SNAME( 3: 3 ).EQ.'B' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 10 + ELSE IF( BANDED )THEN + NARGS = 11 + ELSE IF( PACKED )THEN + NARGS = 9 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX, AA, + $ LDA, K, K, RESET, TRANSL ) +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL DMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + UPLOS = UPLO + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ UPLO, N, ALPHA, LDA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL DSYMV( UPLO, N, ALPHA, AA, LDA, XX, + $ INCX, BETA, YY, INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ UPLO, N, K, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL DSBMV( UPLO, N, K, ALPHA, AA, LDA, + $ XX, INCX, BETA, YY, INCY ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ UPLO, N, ALPHA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL DSPMV( UPLO, N, ALPHA, AA, XX, INCX, + $ BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LDE( AS, AA, LAA ) + ISAME( 5 ) = LDAS.EQ.LDA + ISAME( 6 ) = LDE( XS, XX, LX ) + ISAME( 7 ) = INCXS.EQ.INCX + ISAME( 8 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LDE( YS, YY, LY ) + ELSE + ISAME( 9 ) = LDERES( 'GE', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 10 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 3 ) = KS.EQ.K + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LDE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LDE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LDE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LDERES( 'GE', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( PACKED )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LDE( AS, AA, LAA ) + ISAME( 5 ) = LDE( XS, XX, LX ) + ISAME( 6 ) = INCXS.EQ.INCX + ISAME( 7 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 8 ) = LDE( YS, YY, LY ) + ELSE + ISAME( 8 ) = LDERES( 'GE', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 9 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL DMVCH( 'N', N, N, ALPHA, A, NMAX, X, + $ INCX, BETA, Y, INCY, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0 + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, LDA, INCX, + $ BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, K, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, N, ALPHA, INCX, + $ BETA, INCY + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', AP', + $ ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), F4.1, + $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, + $ ') .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', A,', + $ I3, ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK2. +* + END + SUBROUTINE DCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, XT, G, Z ) +* +* Tests DTRMV, DTBMV, DTPMV, DTRSV, DTBSV and DTPSV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XT( NMAX ), + $ XX( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + DOUBLE PRECISION ERR, ERRMAX, TRANSL + INTEGER I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K, + $ KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS + CHARACTER*2 ICHD, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMVCH, DTBMV, DTBSV, DTPMV, DTPSV, + $ DTRMV, DTRSV +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'R' + BANDED = SNAME( 3: 3 ).EQ.'B' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 8 + ELSE IF( BANDED )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 7 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* Set up zero vector for DMVCH. + DO 10 I = 1, NMAX + Z( I ) = ZERO + 10 CONTINUE +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) +* + DO 70 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 2: 3 ), UPLO, DIAG, N, N, A, + $ NMAX, AA, LDA, K, K, RESET, TRANSL ) +* + DO 60 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, + $ TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + DIAGS = DIAG + NS = N + KS = K + DO 20 I = 1, LAA + AS( I ) = AA( I ) + 20 CONTINUE + LDAS = LDA + DO 30 I = 1, LX + XS( I ) = XX( I ) + 30 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( SNAME( 4: 5 ).EQ.'MV' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL DTRMV( UPLO, TRANS, DIAG, N, AA, LDA, + $ XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL DTBMV( UPLO, TRANS, DIAG, N, K, AA, + $ LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL DTPMV( UPLO, TRANS, DIAG, N, AA, XX, + $ INCX ) + END IF + ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL DTRSV( UPLO, TRANS, DIAG, N, AA, LDA, + $ XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL DTBSV( UPLO, TRANS, DIAG, N, K, AA, + $ LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL DTPSV( UPLO, TRANS, DIAG, N, AA, XX, + $ INCX ) + END IF + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = TRANS.EQ.TRANSS + ISAME( 3 ) = DIAG.EQ.DIAGS + ISAME( 4 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 5 ) = LDE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 7 ) = LDE( XS, XX, LX ) + ELSE + ISAME( 7 ) = LDERES( 'GE', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 8 ) = INCXS.EQ.INCX + ELSE IF( BANDED )THEN + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = LDE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 8 ) = LDE( XS, XX, LX ) + ELSE + ISAME( 8 ) = LDERES( 'GE', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 9 ) = INCXS.EQ.INCX + ELSE IF( PACKED )THEN + ISAME( 5 ) = LDE( AS, AA, LAA ) + IF( NULL )THEN + ISAME( 6 ) = LDE( XS, XX, LX ) + ELSE + ISAME( 6 ) = LDERES( 'GE', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 7 ) = INCXS.EQ.INCX + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 4: 5 ).EQ.'MV' )THEN +* +* Check the result. +* + CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, + $ INCX, ZERO, Z, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN +* +* Compute approximation to original vector. +* + DO 50 I = 1, N + Z( I ) = XX( 1 + ( I - 1 )* + $ ABS( INCX ) ) + XX( 1 + ( I - 1 )*ABS( INCX ) ) + $ = X( I ) + 50 CONTINUE + CALL DMVCH( TRANS, N, N, ONE, A, NMAX, Z, + $ INCX, ZERO, X, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .FALSE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0. + GO TO 110 + END IF +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, DIAG, N, LDA, + $ INCX + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, DIAG, N, K, + $ LDA, INCX + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, TRANS, DIAG, N, INCX + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', AP, ', + $ 'X,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), 2( I3, ',' ), + $ ' A,', I3, ', X,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', A,', + $ I3, ', X,', I2, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK3. +* + END + SUBROUTINE DCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z ) +* +* Tests DGER. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS, + $ NC, ND, NS + LOGICAL NULL, RESET, SAME +* .. Local Arrays .. + DOUBLE PRECISION W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DGER, DMAKE, DMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Executable Statements .. +* Define the number of arguments. + NARGS = 9 +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* +* Set LDA to 1 more than minimum value if room. + LDA = M + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 100 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*M +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'GE', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ), + $ 0, M - 1, RESET, TRANSL ) + IF( M.GT.1 )THEN + X( M/2 ) = ZERO + XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO + END IF +* + DO 90 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL DMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX, + $ AA, LDA, M - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL DGER( M, N, ALPHA, XX, INCX, YY, INCY, AA, + $ LDA ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 140 + END IF +* +* See what data changed inside subroutine. +* + ISAME( 1 ) = MS.EQ.M + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LDE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LDE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LDE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LDERES( 'GE', ' ', M, N, AS, AA, + $ LDA ) + END IF + ISAME( 9 ) = LDAS.EQ.LDA +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 140 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, M + Z( I ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, M + Z( I ) = X( M - I + 1 ) + 60 CONTINUE + END IF + DO 70 J = 1, N + IF( INCY.GT.0 )THEN + W( 1 ) = Y( J ) + ELSE + W( 1 ) = Y( N - J + 1 ) + END IF + CALL DMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1, + $ ONE, A( 1, J ), 1, YT, G, + $ AA( 1 + ( J - 1 )*LDA ), EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 130 + 70 CONTINUE + ELSE +* Avoid repeating tests with M.le.0 or N.le.0. + GO TO 110 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 150 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 140 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA +* + 150 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( I3, ',' ), F4.1, ', X,', I2, + $ ', Y,', I2, ', A,', I3, ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK4. +* + END + SUBROUTINE DCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z ) +* +* Tests DSYR and DSPR. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA, + $ LDA, LDAS, LJ, LX, N, NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*2 ICH +* .. Local Arrays .. + DOUBLE PRECISION W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMVCH, DSPR, DSYR +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'Y' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 7 + ELSE IF( PACKED )THEN + NARGS = 6 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + UPPER = UPLO.EQ.'U' +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX, + $ AA, LDA, N - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N, + $ ALPHA, INCX, LDA + IF( REWI ) + $ REWIND NTRA + CALL DSYR( UPLO, N, ALPHA, XX, INCX, AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N, + $ ALPHA, INCX + IF( REWI ) + $ REWIND NTRA + CALL DSPR( UPLO, N, ALPHA, XX, INCX, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LDE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + IF( NULL )THEN + ISAME( 6 ) = LDE( AS, AA, LAA ) + ELSE + ISAME( 6 ) = LDERES( SNAME( 2: 3 ), UPLO, N, N, AS, + $ AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 7 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 40 I = 1, N + Z( I ) = X( I ) + 40 CONTINUE + ELSE + DO 50 I = 1, N + Z( I ) = X( N - I + 1 ) + 50 CONTINUE + END IF + JA = 1 + DO 60 J = 1, N + W( 1 ) = Z( J ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL DMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W, + $ 1, ONE, A( JJ, J ), 1, YT, G, + $ AA( JA ), EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 110 + 60 CONTINUE + ELSE +* Avoid repeating tests if N.le.0. + IF( N.LE.0 ) + $ GO TO 100 + END IF +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, INCX, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, ALPHA, INCX + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', + $ I2, ', AP) .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', + $ I2, ', A,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK5. +* + END + SUBROUTINE DCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z ) +* +* Tests DSYR2 and DSPR2. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX, 2 ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N, + $ NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*2 ICH +* .. Local Arrays .. + DOUBLE PRECISION W( 2 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMVCH, DSPR2, DSYR2 +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'Y' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 8 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 140 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 140 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 130 IC = 1, 2 + UPLO = ICH( IC: IC ) + UPPER = UPLO.EQ.'U' +* + DO 120 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 110 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL DMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 100 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, + $ NMAX, AA, LDA, N - 1, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL DSYR2( UPLO, N, ALPHA, XX, INCX, YY, INCY, + $ AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N, + $ ALPHA, INCX, INCY + IF( REWI ) + $ REWIND NTRA + CALL DSPR2( UPLO, N, ALPHA, XX, INCX, YY, INCY, + $ AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 160 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LDE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LDE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LDE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LDERES( SNAME( 2: 3 ), UPLO, N, N, + $ AS, AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 9 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 160 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, N + Z( I, 1 ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, N + Z( I, 1 ) = X( N - I + 1 ) + 60 CONTINUE + END IF + IF( INCY.GT.0 )THEN + DO 70 I = 1, N + Z( I, 2 ) = Y( I ) + 70 CONTINUE + ELSE + DO 80 I = 1, N + Z( I, 2 ) = Y( N - I + 1 ) + 80 CONTINUE + END IF + JA = 1 + DO 90 J = 1, N + W( 1 ) = Z( J, 2 ) + W( 2 ) = Z( J, 1 ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL DMVCH( 'N', LJ, 2, ALPHA, Z( JJ, 1 ), + $ NMAX, W, 1, ONE, A( JJ, J ), 1, + $ YT, G, AA( JA ), EPS, ERR, FATAL, + $ NOUT, .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 150 + 90 CONTINUE + ELSE +* Avoid repeating tests with N.le.0. + IF( N.LE.0 ) + $ GO TO 140 + END IF +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 170 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 160 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, INCX, + $ INCY, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, ALPHA, INCX, INCY + END IF +* + 170 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', + $ I2, ', Y,', I2, ', AP) .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', + $ I2, ', Y,', I2, ', A,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK6. +* + END + SUBROUTINE DCHKE( ISNUM, SRNAMT, NOUT ) +* +* Tests the error exits from the Level 2 Blas. +* Requires a special version of the error-handling routine XERBLA. +* ALPHA, BETA, A, X and Y should not need to be defined. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER ISNUM, NOUT + CHARACTER*6 SRNAMT +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, BETA +* .. Local Arrays .. + DOUBLE PRECISION A( 1, 1 ), X( 1 ), Y( 1 ) +* .. External Subroutines .. + EXTERNAL CHKXER, DGBMV, DGEMV, DGER, DSBMV, DSPMV, DSPR, + $ DSPR2, DSYMV, DSYR, DSYR2, DTBMV, DTBSV, DTPMV, + $ DTPSV, DTRMV, DTRSV +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Executable Statements .. +* OK is set to .FALSE. by the special version of XERBLA or by CHKXER +* if anything is wrong. + OK = .TRUE. +* LERR is set to .TRUE. by the special version of XERBLA each time +* it is called, and is then tested and re-set by CHKXER. + LERR = .FALSE. + GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, + $ 90, 100, 110, 120, 130, 140, 150, + $ 160 )ISNUM + 10 INFOT = 1 + CALL DGEMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DGEMV( 'N', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DGEMV( 'N', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DGEMV( 'N', 2, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGEMV( 'N', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DGEMV( 'N', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 20 INFOT = 1 + CALL DGBMV( '/', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DGBMV( 'N', -1, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DGBMV( 'N', 0, -1, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DGBMV( 'N', 0, 0, -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DGBMV( 'N', 2, 0, 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGBMV( 'N', 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL DGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 30 INFOT = 1 + CALL DSYMV( '/', 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DSYMV( 'U', -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DSYMV( 'U', 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYMV( 'U', 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DSYMV( 'U', 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 40 INFOT = 1 + CALL DSBMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DSBMV( 'U', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSBMV( 'U', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DSBMV( 'U', 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DSBMV( 'U', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DSBMV( 'U', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 50 INFOT = 1 + CALL DSPMV( '/', 0, ALPHA, A, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DSPMV( 'U', -1, ALPHA, A, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DSPMV( 'U', 0, ALPHA, A, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DSPMV( 'U', 0, ALPHA, A, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 60 INFOT = 1 + CALL DTRMV( '/', 'N', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DTRMV( 'U', '/', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DTRMV( 'U', 'N', '/', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DTRMV( 'U', 'N', 'N', -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRMV( 'U', 'N', 'N', 2, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DTRMV( 'U', 'N', 'N', 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 70 INFOT = 1 + CALL DTBMV( '/', 'N', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DTBMV( 'U', '/', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DTBMV( 'U', 'N', '/', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DTBMV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTBMV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DTBMV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTBMV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 80 INFOT = 1 + CALL DTPMV( '/', 'N', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DTPMV( 'U', '/', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DTPMV( 'U', 'N', '/', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DTPMV( 'U', 'N', 'N', -1, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DTPMV( 'U', 'N', 'N', 0, A, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 90 INFOT = 1 + CALL DTRSV( '/', 'N', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DTRSV( 'U', '/', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DTRSV( 'U', 'N', '/', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DTRSV( 'U', 'N', 'N', -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRSV( 'U', 'N', 'N', 2, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DTRSV( 'U', 'N', 'N', 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 100 INFOT = 1 + CALL DTBSV( '/', 'N', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DTBSV( 'U', '/', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DTBSV( 'U', 'N', '/', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DTBSV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTBSV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DTBSV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTBSV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 110 INFOT = 1 + CALL DTPSV( '/', 'N', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DTPSV( 'U', '/', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DTPSV( 'U', 'N', '/', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DTPSV( 'U', 'N', 'N', -1, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DTPSV( 'U', 'N', 'N', 0, A, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 120 INFOT = 1 + CALL DGER( -1, 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DGER( 0, -1, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DGER( 0, 0, ALPHA, X, 0, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DGER( 0, 0, ALPHA, X, 1, Y, 0, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DGER( 2, 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 130 INFOT = 1 + CALL DSYR( '/', 0, ALPHA, X, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DSYR( 'U', -1, ALPHA, X, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DSYR( 'U', 0, ALPHA, X, 0, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYR( 'U', 2, ALPHA, X, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 140 INFOT = 1 + CALL DSPR( '/', 0, ALPHA, X, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DSPR( 'U', -1, ALPHA, X, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DSPR( 'U', 0, ALPHA, X, 0, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 150 INFOT = 1 + CALL DSYR2( '/', 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DSYR2( 'U', -1, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DSYR2( 'U', 0, ALPHA, X, 0, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYR2( 'U', 0, ALPHA, X, 1, Y, 0, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DSYR2( 'U', 2, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 160 INFOT = 1 + CALL DSPR2( '/', 0, ALPHA, X, 1, Y, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DSPR2( 'U', -1, ALPHA, X, 1, Y, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DSPR2( 'U', 0, ALPHA, X, 0, Y, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSPR2( 'U', 0, ALPHA, X, 1, Y, 0, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* + 170 IF( OK )THEN + WRITE( NOUT, FMT = 9999 )SRNAMT + ELSE + WRITE( NOUT, FMT = 9998 )SRNAMT + END IF + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' ) + 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****', + $ '**' ) +* +* End of DCHKE. +* + END + SUBROUTINE DMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL, + $ KU, RESET, TRANSL ) +* +* Generates values for an M by N matrix A within the bandwidth +* defined by KL and KU. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'GE', 'GB', 'SY', 'SB', 'SP', 'TR', 'TB' OR 'TP'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + DOUBLE PRECISION ROGUE + PARAMETER ( ROGUE = -1.0D10 ) +* .. Scalar Arguments .. + DOUBLE PRECISION TRANSL + INTEGER KL, KU, LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, I1, I2, I3, IBEG, IEND, IOFF, J, KK + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + DOUBLE PRECISION DBEG + EXTERNAL DBEG +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Executable Statements .. + GEN = TYPE( 1: 1 ).EQ.'G' + SYM = TYPE( 1: 1 ).EQ.'S' + TRI = TYPE( 1: 1 ).EQ.'T' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + IF( ( I.LE.J.AND.J - I.LE.KU ).OR. + $ ( I.GE.J.AND.I - J.LE.KL ) )THEN + A( I, J ) = DBEG( RESET ) + TRANSL + ELSE + A( I, J ) = ZERO + END IF + IF( I.NE.J )THEN + IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'GE' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'GB' )THEN + DO 90 J = 1, N + DO 60 I1 = 1, KU + 1 - J + AA( I1 + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J ) + AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J ) + 70 CONTINUE + DO 80 I3 = I2, LDA + AA( I3 + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + ELSE IF( TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN + DO 130 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 100 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 100 CONTINUE + DO 110 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 110 CONTINUE + DO 120 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 120 CONTINUE + 130 CONTINUE + ELSE IF( TYPE.EQ.'SB'.OR.TYPE.EQ.'TB' )THEN + DO 170 J = 1, N + IF( UPPER )THEN + KK = KL + 1 + IBEG = MAX( 1, KL + 2 - J ) + IF( UNIT )THEN + IEND = KL + ELSE + IEND = KL + 1 + END IF + ELSE + KK = 1 + IF( UNIT )THEN + IBEG = 2 + ELSE + IBEG = 1 + END IF + IEND = MIN( KL + 1, 1 + M - J ) + END IF + DO 140 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 140 CONTINUE + DO 150 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J ) + 150 CONTINUE + DO 160 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 160 CONTINUE + 170 CONTINUE + ELSE IF( TYPE.EQ.'SP'.OR.TYPE.EQ.'TP' )THEN + IOFF = 0 + DO 190 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 180 I = IBEG, IEND + IOFF = IOFF + 1 + AA( IOFF ) = A( I, J ) + IF( I.EQ.J )THEN + IF( UNIT ) + $ AA( IOFF ) = ROGUE + END IF + 180 CONTINUE + 190 CONTINUE + END IF + RETURN +* +* End of DMAKE. +* + END + SUBROUTINE DMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA, BETA, EPS, ERR + INTEGER INCX, INCY, M, N, NMAX, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANS +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, * ), G( * ), X( * ), Y( * ), YT( * ), + $ YY( * ) +* .. Local Scalars .. + DOUBLE PRECISION ERRI + INTEGER I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL + LOGICAL TRAN +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, SQRT +* .. Executable Statements .. + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF + IF( INCX.LT.0 )THEN + KX = NL + INCXL = -1 + ELSE + KX = 1 + INCXL = 1 + END IF + IF( INCY.LT.0 )THEN + KY = ML + INCYL = -1 + ELSE + KY = 1 + INCYL = 1 + END IF +* +* Compute expected result in YT using data in A, X and Y. +* Compute gauges in G. +* + IY = KY + DO 30 I = 1, ML + YT( IY ) = ZERO + G( IY ) = ZERO + JX = KX + IF( TRAN )THEN + DO 10 J = 1, NL + YT( IY ) = YT( IY ) + A( J, I )*X( JX ) + G( IY ) = G( IY ) + ABS( A( J, I )*X( JX ) ) + JX = JX + INCXL + 10 CONTINUE + ELSE + DO 20 J = 1, NL + YT( IY ) = YT( IY ) + A( I, J )*X( JX ) + G( IY ) = G( IY ) + ABS( A( I, J )*X( JX ) ) + JX = JX + INCXL + 20 CONTINUE + END IF + YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY ) + G( IY ) = ABS( ALPHA )*G( IY ) + ABS( BETA*Y( IY ) ) + IY = IY + INCYL + 30 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 40 I = 1, ML + ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS + IF( G( I ).NE.ZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.ONE ) + $ GO TO 50 + 40 CONTINUE +* If the loop completes, all results are at least half accurate. + GO TO 70 +* +* Report fatal error. +* + 50 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 60 I = 1, ML + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, YT( I ), + $ YY( 1 + ( I - 1 )*ABS( INCY ) ) + ELSE + WRITE( NOUT, FMT = 9998 )I, + $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT( I ) + END IF + 60 CONTINUE +* + 70 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RESULT COMPU', + $ 'TED RESULT' ) + 9998 FORMAT( 1X, I7, 2G18.6 ) +* +* End of DMVCH. +* + END + LOGICAL FUNCTION LDE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + DOUBLE PRECISION RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LDE = .TRUE. + GO TO 30 + 20 CONTINUE + LDE = .FALSE. + 30 RETURN +* +* End of LDE. +* + END + LOGICAL FUNCTION LDERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'GE', 'SY' or 'SP'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + DOUBLE PRECISION AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'GE' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'SY' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LDERES = .TRUE. + GO TO 80 + 70 CONTINUE + LDERES = .FALSE. + 80 RETURN +* +* End of LDERES. +* + END + DOUBLE PRECISION FUNCTION DBEG( RESET ) +* +* Generates random numbers uniformly distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, MI +* .. Save statement .. + SAVE I, IC, MI +* .. Intrinsic Functions .. + INTRINSIC DBLE +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + I = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I is bounded between 1 and 999. +* If initial I = 1,2,3,6,7 or 9, the period will be 50. +* If initial I = 4 or 8, the period will be 25. +* If initial I = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I in 6. +* + IC = IC + 1 + 10 I = I*MI + I = I - 1000*( I/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + DBEG = DBLE( I - 500 )/1001.0D0 + RETURN +* +* End of DBEG. +* + END + DOUBLE PRECISION FUNCTION DDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* +* .. Scalar Arguments .. + DOUBLE PRECISION X, Y +* .. Executable Statements .. + DDIFF = X - Y + RETURN +* +* End of DDIFF. +* + END + SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* +* Tests whether XERBLA has detected an error when it should. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Executable Statements .. + IF( .NOT.LERR )THEN + WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT + OK = .FALSE. + END IF + LERR = .FALSE. + RETURN +* + 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', + $ 'ETECTED BY ', A6, ' *****' ) +* +* End of CHKXER. +* + END + SUBROUTINE XERBLA( SRNAME, INFO ) +* +* This is a special version of XERBLA to be used only as part of +* the test program for testing error exits from the Level 2 BLAS +* routines. +* +* XERBLA is an error handler for the Level 2 BLAS routines. +* +* It is called by the Level 2 BLAS routines if an input parameter is +* invalid. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER INFO + CHARACTER*6 SRNAME +* .. Scalars in Common .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUT, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Executable Statements .. + LERR = .TRUE. + IF( INFO.NE.INFOT )THEN + IF( INFOT.NE.0 )THEN + WRITE( NOUT, FMT = 9999 )INFO, INFOT + ELSE + WRITE( NOUT, FMT = 9997 )INFO + END IF + OK = .FALSE. + END IF + IF( SRNAME.NE.SRNAMT )THEN + WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT + OK = .FALSE. + END IF + RETURN +* + 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD', + $ ' OF ', I2, ' *******' ) + 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE', + $ 'AD OF ', A6, ' *******' ) + 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, + $ ' *******' ) +* +* End of XERBLA +* + END + diff --git a/test/dblat3.dat b/test/dblat3.dat new file mode 100644 index 0000000..78b6d18 --- /dev/null +++ b/test/dblat3.dat @@ -0,0 +1,20 @@ +'DBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE +6 UNIT NUMBER OF SUMMARY FILE +'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +16.0 THRESHOLD VALUE OF TEST RATIO +6 NUMBER OF VALUES OF N +0 1 2 3 7 31 63 VALUES OF N +3 NUMBER OF VALUES OF ALPHA +0.0 1.0 0.7 VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +0.0 1.0 1.3 VALUES OF BETA +DGEMM T PUT F FOR NO TEST. SAME COLUMNS. +DSYMM T PUT F FOR NO TEST. SAME COLUMNS. +DTRMM T PUT F FOR NO TEST. SAME COLUMNS. +DTRSM T PUT F FOR NO TEST. SAME COLUMNS. +DSYRK T PUT F FOR NO TEST. SAME COLUMNS. +DSYR2K T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/test/dblat3.f b/test/dblat3.f new file mode 100644 index 0000000..082e03e --- /dev/null +++ b/test/dblat3.f @@ -0,0 +1,2823 @@ + PROGRAM DBLAT3 +* +* Test program for the DOUBLE PRECISION Level 3 Blas. +* +* The program must be driven by a short data file. The first 14 records +* of the file are read using list-directed input, the last 6 records +* are read using the format ( A6, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 20 lines: +* 'DBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE +* 6 UNIT NUMBER OF SUMMARY FILE +* 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 3 NUMBER OF VALUES OF ALPHA +* 0.0 1.0 0.7 VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* 0.0 1.0 1.3 VALUES OF BETA +* DGEMM T PUT F FOR NO TEST. SAME COLUMNS. +* DSYMM T PUT F FOR NO TEST. SAME COLUMNS. +* DTRMM T PUT F FOR NO TEST. SAME COLUMNS. +* DTRSM T PUT F FOR NO TEST. SAME COLUMNS. +* DSYRK T PUT F FOR NO TEST. SAME COLUMNS. +* DSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +* A Set of Level 3 Basic Linear Algebra Subprograms. +* +* Technical Memorandum No.88 (Revision 1), Mathematics and +* Computer Science Division, Argonne National Laboratory, 9700 +* South Cass Avenue, Argonne, Illinois 60439, US. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + INTEGER NIN + PARAMETER ( NIN = 5 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 6 ) + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) + INTEGER NMAX + PARAMETER ( NMAX = 65 ) + INTEGER NIDMAX, NALMAX, NBEMAX + PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + DOUBLE PRECISION EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NOUT, NTRA + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR + CHARACTER*1 TRANSA, TRANSB + CHARACTER*6 SNAMET + CHARACTER*32 SNAPS, SUMMRY +* .. Local Arrays .. + DOUBLE PRECISION AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), + $ BB( NMAX*NMAX ), BET( NBEMAX ), + $ BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ G( NMAX ), W( 2*NMAX ) + INTEGER IDIM( NIDMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*6 SNAMES( NSUBS ) +* .. External Functions .. + DOUBLE PRECISION DDIFF + LOGICAL LDE + EXTERNAL DDIFF, LDE +* .. External Subroutines .. + EXTERNAL DCHK1, DCHK2, DCHK3, DCHK4, DCHK5, DCHKE, DMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'DGEMM ', 'DSYMM ', 'DTRMM ', 'DTRSM ', + $ 'DSYRK ', 'DSYR2K'/ +* .. Executable Statements .. +* +* Read name and unit number for summary output file and open file. +* + READ( NIN, FMT = * )SUMMRY + READ( NIN, FMT = * )NOUT + OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 220 + END IF + 10 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9995 ) + WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9984 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 20 I = 1, NSUBS + LTEST( I ) = .FALSE. + 20 CONTINUE + 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT + DO 40 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 50 + 40 CONTINUE + WRITE( NOUT, FMT = 9990 )SNAMET + STOP + 50 LTEST( I ) = LTESTT + GO TO 30 +* + 60 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = ONE + 70 CONTINUE + IF( DDIFF( ONE + EPS, ONE ).EQ.ZERO ) + $ GO TO 80 + EPS = HALF*EPS + GO TO 70 + 80 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of DMMCH using exact data. +* + N = MIN( 32, NMAX ) + DO 100 J = 1, N + DO 90 I = 1, N + AB( I, J ) = MAX( I - J + 1, 0 ) + 90 CONTINUE + AB( J, NMAX + 1 ) = J + AB( 1, NMAX + J ) = J + C( J, 1 ) = ZERO + 100 CONTINUE + DO 110 J = 1, N + CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 110 CONTINUE +* CC holds the exact result. On exit from DMMCH CT holds +* the result computed by DMMCH. + TRANSA = 'N' + TRANSB = 'N' + CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'T' + CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + DO 120 J = 1, N + AB( J, NMAX + 1 ) = N - J + 1 + AB( 1, NMAX + J ) = N - J + 1 + 120 CONTINUE + DO 130 J = 1, N + CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - + $ ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE + TRANSA = 'T' + TRANSB = 'N' + CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'T' + CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 200 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL DCHKE( ISNUM, SNAMES( ISNUM ), NOUT ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 150, 160, 160, 170, 180 )ISNUM +* Test DGEMM, 01. + 140 CALL DCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test DSYMM, 02. + 150 CALL DCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test DTRMM, 03, DTRSM, 04. + 160 CALL DCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C ) + GO TO 190 +* Test DSYRK, 05. + 170 CALL DCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test DSYR2K, 06. + 180 CALL DCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) + GO TO 190 +* + 190 IF( FATAL.AND.SFATAL ) + $ GO TO 210 + END IF + 200 CONTINUE + WRITE( NOUT, FMT = 9986 ) + GO TO 230 +* + 210 CONTINUE + WRITE( NOUT, FMT = 9985 ) + GO TO 230 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9991 ) +* + 230 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, D9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' TESTS OF THE DOUBLE PRECISION LEVEL 3 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9994 FORMAT( ' FOR N ', 9I6 ) + 9993 FORMAT( ' FOR ALPHA ', 7F6.1 ) + 9992 FORMAT( ' FOR BETA ', 7F6.1 ) + 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9990 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9989 FORMAT( ' ERROR IN DMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' DMMCH WAS CALLED WITH TRANSA = ', A1, + $ ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', + $ 'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', + $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', + $ '*******' ) + 9988 FORMAT( A6, L2 ) + 9987 FORMAT( 1X, A6, ' WAS NOT TESTED' ) + 9986 FORMAT( /' END OF TESTS' ) + 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of DBLAT3. +* + END + SUBROUTINE DCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests DGEMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, BETA, BLS, ERR, ERRMAX + INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, + $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, + $ MA, MB, MS, N, NA, NARGS, NB, NC, NS + LOGICAL NULL, RESET, SAME, TRANA, TRANB + CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DGEMM, DMAKE, DMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. +* + NARGS = 13 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 110 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICA = 1, 3 + TRANSA = ICH( ICA: ICA ) + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' +* + IF( TRANA )THEN + MA = K + NA = M + ELSE + MA = M + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL DMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICB = 1, 3 + TRANSB = ICH( ICB: ICB ) + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* + IF( TRANB )THEN + MB = N + NB = K + ELSE + MB = K + NB = N + END IF +* Set LDB to 1 more than minimum value if room. + LDB = MB + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 70 + LBB = LDB*NB +* +* Generate the matrix B. +* + CALL DMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB, + $ LDB, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL DMAKE( 'GE', ' ', ' ', M, N, C, NMAX, + $ CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANAS = TRANSA + TRANBS = TRANSB + MS = M + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, LDB, + $ BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL DGEMM( TRANSA, TRANSB, M, N, K, ALPHA, + $ AA, LDA, BB, LDB, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = TRANSA.EQ.TRANAS + ISAME( 2 ) = TRANSB.EQ.TRANBS + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LDE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LDE( BS, BB, LBB ) + ISAME( 10 ) = LDBS.EQ.LDB + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LDE( CS, CC, LCC ) + ELSE + ISAME( 12 ) = LDERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 13 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL DMMCH( TRANSA, TRANSB, M, N, K, + $ ALPHA, A, NMAX, B, NMAX, BETA, + $ C, NMAX, CT, G, CC, LDC, EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANSA, TRANSB, M, N, K, + $ ALPHA, LDA, LDB, BETA, LDC +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',''', A1, ''',', + $ 3( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', ', + $ 'C,', I3, ').' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK1. +* + END + SUBROUTINE DCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests DSYMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, BETA, BLS, ERR, ERRMAX + INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, + $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, + $ NARGS, NC, NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 SIDE, SIDES, UPLO, UPLOS + CHARACTER*2 ICHS, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMMCH, DSYMM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHS/'LR'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 90 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 90 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 90 + LBB = LDB*N +* +* Generate the matrix B. +* + CALL DMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, + $ ZERO ) +* + DO 80 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' +* + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* +* Generate the symmetric matrix A. +* + CALL DMAKE( 'SY', UPLO, ' ', NA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL DMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, SIDE, + $ UPLO, M, N, ALPHA, LDA, LDB, BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL DSYMM( SIDE, UPLO, M, N, ALPHA, AA, LDA, + $ BB, LDB, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 110 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LDE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LDE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LDE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LDERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 110 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL DMMCH( 'N', 'N', M, N, M, ALPHA, A, + $ NMAX, B, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL DMMCH( 'N', 'N', M, N, N, ALPHA, B, + $ NMAX, A, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 120 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, M, N, ALPHA, LDA, + $ LDB, BETA, LDC +* + 120 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK2. +* + END + SUBROUTINE DCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, + $ B, BB, BS, CT, G, C ) +* +* Tests DTRMM and DTRSM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, ERR, ERRMAX + INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, + $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, + $ NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, + $ UPLOS + CHARACTER*2 ICHD, ICHS, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMMCH, DTRMM, DTRSM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ +* .. Executable Statements .. +* + NARGS = 11 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* Set up zero matrix for DMMCH. + DO 20 J = 1, NMAX + DO 10 I = 1, NMAX + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE +* + DO 140 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 130 + LBB = LDB*N + NULL = M.LE.0.OR.N.LE.0 +* + DO 120 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 130 + LAA = LDA*NA +* + DO 110 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 100 ICT = 1, 3 + TRANSA = ICHT( ICT: ICT ) +* + DO 90 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + CALL DMAKE( 'TR', UPLO, DIAG, NA, NA, A, + $ NMAX, AA, LDA, RESET, ZERO ) +* +* Generate the matrix B. +* + CALL DMAKE( 'GE', ' ', ' ', M, N, B, NMAX, + $ BB, LDB, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + TRANAS = TRANSA + DIAGS = DIAG + MS = M + NS = N + ALS = ALPHA + DO 30 I = 1, LAA + AS( I ) = AA( I ) + 30 CONTINUE + LDAS = LDA + DO 40 I = 1, LBB + BS( I ) = BB( I ) + 40 CONTINUE + LDBS = LDB +* +* Call the subroutine. +* + IF( SNAME( 4: 5 ).EQ.'MM' )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB + IF( REWI ) + $ REWIND NTRA + CALL DTRMM( SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, AA, LDA, BB, LDB ) + ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB + IF( REWI ) + $ REWIND NTRA + CALL DTRSM( SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, AA, LDA, BB, LDB ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = TRANAS.EQ.TRANSA + ISAME( 4 ) = DIAGS.EQ.DIAG + ISAME( 5 ) = MS.EQ.M + ISAME( 6 ) = NS.EQ.N + ISAME( 7 ) = ALS.EQ.ALPHA + ISAME( 8 ) = LDE( AS, AA, LAA ) + ISAME( 9 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 10 ) = LDE( BS, BB, LBB ) + ELSE + ISAME( 10 ) = LDERES( 'GE', ' ', M, N, BS, + $ BB, LDB ) + END IF + ISAME( 11 ) = LDBS.EQ.LDB +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 50 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 50 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 4: 5 ).EQ.'MM' )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL DMMCH( TRANSA, 'N', M, N, M, + $ ALPHA, A, NMAX, B, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL DMMCH( 'N', TRANSA, M, N, N, + $ ALPHA, B, NMAX, A, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN +* +* Compute approximation to original +* matrix. +* + DO 70 J = 1, N + DO 60 I = 1, M + C( I, J ) = BB( I + ( J - 1 )* + $ LDB ) + BB( I + ( J - 1 )*LDB ) = ALPHA* + $ B( I, J ) + 60 CONTINUE + 70 CONTINUE +* + IF( LEFT )THEN + CALL DMMCH( TRANSA, 'N', M, N, M, + $ ONE, A, NMAX, C, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + ELSE + CALL DMMCH( 'N', TRANSA, M, N, N, + $ ONE, C, NMAX, A, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + END IF + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 150 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, LDA, LDB +* + 160 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(', 4( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ') .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK3. +* + END + SUBROUTINE DCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests DSYRK. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, BETA, BETS, ERR, ERRMAX + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, + $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, + $ NARGS, NC, NS + LOGICAL NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, UPLO, UPLOS + CHARACTER*2 ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMMCH, DSYRK +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NTC'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 10 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL DMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL DMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + BETS = BETA + DO 20 I = 1, LCC + CS( I ) = CC( I ) + 20 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, + $ TRANS, N, K, ALPHA, LDA, BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL DSYRK( UPLO, TRANS, N, K, ALPHA, AA, LDA, + $ BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LDE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = BETS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LDE( CS, CC, LCC ) + ELSE + ISAME( 9 ) = LDERES( 'SY', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 10 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + JC = 1 + DO 40 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + CALL DMMCH( 'T', 'N', LJ, 1, K, ALPHA, + $ A( 1, JJ ), NMAX, + $ A( 1, J ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL DMMCH( 'N', 'T', LJ, 1, K, ALPHA, + $ A( JJ, 1 ), NMAX, + $ A( J, 1 ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + 40 CONTINUE + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, + $ LDA, BETA, LDC +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK4. +* + END + SUBROUTINE DCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) +* +* Tests DSYR2K. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + DOUBLE PRECISION AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), + $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), + $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ G( NMAX ), W( 2*NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, BETA, BETS, ERR, ERRMAX + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, + $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, + $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS + LOGICAL NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, UPLO, UPLOS + CHARACTER*2 ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMMCH, DSYR2K +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NTC'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 130 + LCC = LDC*N + NULL = N.LE.0 +* + DO 120 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 110 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*NA +* +* Generate the matrix A. +* + IF( TRAN )THEN + CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA, + $ LDA, RESET, ZERO ) + ELSE + CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, + $ RESET, ZERO ) + END IF +* +* Generate the matrix B. +* + LDB = LDA + LBB = LAA + IF( TRAN )THEN + CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ), + $ 2*NMAX, BB, LDB, RESET, ZERO ) + ELSE + CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), + $ NMAX, BB, LDB, RESET, ZERO ) + END IF +* + DO 100 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 90 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 80 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL DMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BETS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, + $ TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL DSYR2K( UPLO, TRANS, N, K, ALPHA, AA, LDA, + $ BB, LDB, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LDE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LDE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BETS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LDE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LDERES( 'SY', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + JJAB = 1 + JC = 1 + DO 70 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + DO 50 I = 1, K + W( I ) = AB( ( J - 1 )*2*NMAX + K + + $ I ) + W( K + I ) = AB( ( J - 1 )*2*NMAX + + $ I ) + 50 CONTINUE + CALL DMMCH( 'T', 'N', LJ, 1, 2*K, + $ ALPHA, AB( JJAB ), 2*NMAX, + $ W, 2*NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + DO 60 I = 1, K + W( I ) = AB( ( K + I - 1 )*NMAX + + $ J ) + W( K + I ) = AB( ( I - 1 )*NMAX + + $ J ) + 60 CONTINUE + CALL DMMCH( 'N', 'N', LJ, 1, 2*K, + $ ALPHA, AB( JJ ), NMAX, W, + $ 2*NMAX, BETA, C( JJ, J ), + $ NMAX, CT, G, CC( JC ), LDC, + $ EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + IF( TRAN ) + $ JJAB = JJAB + 2*NMAX + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 140 + 70 CONTINUE + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, + $ LDA, LDB, BETA, LDC +* + 160 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK5. +* + END + SUBROUTINE DCHKE( ISNUM, SRNAMT, NOUT ) +* +* Tests the error exits from the Level 3 Blas. +* Requires a special version of the error-handling routine XERBLA. +* ALPHA, BETA, A, B and C should not need to be defined. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER ISNUM, NOUT + CHARACTER*6 SRNAMT +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, BETA +* .. Local Arrays .. + DOUBLE PRECISION A( 2, 1 ), B( 2, 1 ), C( 2, 1 ) +* .. External Subroutines .. + EXTERNAL CHKXER, DGEMM, DSYMM, DSYR2K, DSYRK, DTRMM, + $ DTRSM +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Executable Statements .. +* OK is set to .FALSE. by the special version of XERBLA or by CHKXER +* if anything is wrong. + OK = .TRUE. +* LERR is set to .TRUE. by the special version of XERBLA each time +* it is called, and is then tested and re-set by CHKXER. + LERR = .FALSE. + GO TO ( 10, 20, 30, 40, 50, 60 )ISNUM + 10 INFOT = 1 + CALL DGEMM( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 1 + CALL DGEMM( '/', 'T', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DGEMM( 'N', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DGEMM( 'T', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DGEMM( 'N', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DGEMM( 'N', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DGEMM( 'T', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DGEMM( 'T', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DGEMM( 'N', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DGEMM( 'N', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DGEMM( 'T', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DGEMM( 'T', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DGEMM( 'N', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DGEMM( 'N', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DGEMM( 'T', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DGEMM( 'T', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGEMM( 'T', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DGEMM( 'N', 'N', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DGEMM( 'N', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DGEMM( 'T', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL DGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL DGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL DGEMM( 'T', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL DGEMM( 'T', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 70 + 20 INFOT = 1 + CALL DSYMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DSYMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSYMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSYMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSYMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSYMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DSYMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DSYMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DSYMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DSYMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL DSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL DSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL DSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 70 + 30 INFOT = 1 + CALL DTRMM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DTRMM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DTRMM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DTRMM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRMM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRMM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRMM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRMM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRMM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRMM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRMM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRMM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRMM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRMM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRMM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRMM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRMM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRMM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRMM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRMM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRMM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRMM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRMM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRMM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRMM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRMM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRMM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRMM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 70 + 40 INFOT = 1 + CALL DTRSM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DTRSM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DTRSM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DTRSM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRSM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRSM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRSM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRSM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRSM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRSM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRSM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRSM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRSM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRSM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRSM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRSM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRSM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRSM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRSM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRSM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRSM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRSM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRSM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRSM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRSM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRSM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRSM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRSM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 70 + 50 INFOT = 1 + CALL DSYRK( '/', 'N', 0, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DSYRK( 'U', '/', 0, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSYRK( 'U', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSYRK( 'U', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSYRK( 'L', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSYRK( 'L', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DSYRK( 'U', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DSYRK( 'U', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DSYRK( 'L', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DSYRK( 'L', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYRK( 'U', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYRK( 'U', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYRK( 'L', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYRK( 'L', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DSYRK( 'U', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DSYRK( 'U', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DSYRK( 'L', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DSYRK( 'L', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 70 + 60 INFOT = 1 + CALL DSYR2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DSYR2K( 'U', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSYR2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSYR2K( 'U', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSYR2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSYR2K( 'L', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DSYR2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DSYR2K( 'U', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DSYR2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DSYR2K( 'L', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYR2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYR2K( 'U', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYR2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYR2K( 'L', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DSYR2K( 'U', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DSYR2K( 'L', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL DSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL DSYR2K( 'U', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL DSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL DSYR2K( 'L', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* + 70 IF( OK )THEN + WRITE( NOUT, FMT = 9999 )SRNAMT + ELSE + WRITE( NOUT, FMT = 9998 )SRNAMT + END IF + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' ) + 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****', + $ '**' ) +* +* End of DCHKE. +* + END + SUBROUTINE DMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, + $ TRANSL ) +* +* Generates values for an M by N matrix A. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'GE', 'SY' or 'TR'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + DOUBLE PRECISION ROGUE + PARAMETER ( ROGUE = -1.0D10 ) +* .. Scalar Arguments .. + DOUBLE PRECISION TRANSL + INTEGER LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + DOUBLE PRECISION DBEG + EXTERNAL DBEG +* .. Executable Statements .. + GEN = TYPE.EQ.'GE' + SYM = TYPE.EQ.'SY' + TRI = TYPE.EQ.'TR' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + A( I, J ) = DBEG( RESET ) + TRANSL + IF( I.NE.J )THEN +* Set some elements to zero + IF( N.GT.3.AND.J.EQ.N/2 ) + $ A( I, J ) = ZERO + IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'GE' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN + DO 90 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 60 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 70 CONTINUE + DO 80 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + END IF + RETURN +* +* End of DMAKE. +* + END + SUBROUTINE DMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, + $ NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA, BETA, EPS, ERR + INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANSA, TRANSB +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ CC( LDCC, * ), CT( * ), G( * ) +* .. Local Scalars .. + DOUBLE PRECISION ERRI + INTEGER I, J, K + LOGICAL TRANA, TRANB +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, SQRT +* .. Executable Statements .. + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* +* Compute expected result, one column at a time, in CT using data +* in A, B and C. +* Compute gauges in G. +* + DO 120 J = 1, N +* + DO 10 I = 1, M + CT( I ) = ZERO + G( I ) = ZERO + 10 CONTINUE + IF( .NOT.TRANA.AND..NOT.TRANB )THEN + DO 30 K = 1, KK + DO 20 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( K, J ) + G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( K, J ) ) + 20 CONTINUE + 30 CONTINUE + ELSE IF( TRANA.AND..NOT.TRANB )THEN + DO 50 K = 1, KK + DO 40 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( K, J ) + G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( K, J ) ) + 40 CONTINUE + 50 CONTINUE + ELSE IF( .NOT.TRANA.AND.TRANB )THEN + DO 70 K = 1, KK + DO 60 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( J, K ) + G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( J, K ) ) + 60 CONTINUE + 70 CONTINUE + ELSE IF( TRANA.AND.TRANB )THEN + DO 90 K = 1, KK + DO 80 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( J, K ) + G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( J, K ) ) + 80 CONTINUE + 90 CONTINUE + END IF + DO 100 I = 1, M + CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) + G( I ) = ABS( ALPHA )*G( I ) + ABS( BETA )*ABS( C( I, J ) ) + 100 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 110 I = 1, M + ERRI = ABS( CT( I ) - CC( I, J ) )/EPS + IF( G( I ).NE.ZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.ONE ) + $ GO TO 130 + 110 CONTINUE +* + 120 CONTINUE +* +* If the loop completes, all results are at least half accurate. + GO TO 150 +* +* Report fatal error. +* + 130 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 140 I = 1, M + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) + ELSE + WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) + END IF + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9997 )J +* + 150 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RESULT COMPU', + $ 'TED RESULT' ) + 9998 FORMAT( 1X, I7, 2G18.6 ) + 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) +* +* End of DMMCH. +* + END + LOGICAL FUNCTION LDE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + DOUBLE PRECISION RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LDE = .TRUE. + GO TO 30 + 20 CONTINUE + LDE = .FALSE. + 30 RETURN +* +* End of LDE. +* + END + LOGICAL FUNCTION LDERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'GE' or 'SY'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + DOUBLE PRECISION AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'GE' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'SY' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LDERES = .TRUE. + GO TO 80 + 70 CONTINUE + LDERES = .FALSE. + 80 RETURN +* +* End of LDERES. +* + END + DOUBLE PRECISION FUNCTION DBEG( RESET ) +* +* Generates random numbers uniformly distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, MI +* .. Save statement .. + SAVE I, IC, MI +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + I = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I is bounded between 1 and 999. +* If initial I = 1,2,3,6,7 or 9, the period will be 50. +* If initial I = 4 or 8, the period will be 25. +* If initial I = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I in 6. +* + IC = IC + 1 + 10 I = I*MI + I = I - 1000*( I/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + DBEG = ( I - 500 )/1001.0D0 + RETURN +* +* End of DBEG. +* + END + DOUBLE PRECISION FUNCTION DDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + DOUBLE PRECISION X, Y +* .. Executable Statements .. + DDIFF = X - Y + RETURN +* +* End of DDIFF. +* + END + SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* +* Tests whether XERBLA has detected an error when it should. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Executable Statements .. + IF( .NOT.LERR )THEN + WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT + OK = .FALSE. + END IF + LERR = .FALSE. + RETURN +* + 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', + $ 'ETECTED BY ', A6, ' *****' ) +* +* End of CHKXER. +* + END + SUBROUTINE XERBLA( SRNAME, INFO ) +* +* This is a special version of XERBLA to be used only as part of +* the test program for testing error exits from the Level 3 BLAS +* routines. +* +* XERBLA is an error handler for the Level 3 BLAS routines. +* +* It is called by the Level 3 BLAS routines if an input parameter is +* invalid. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER INFO + CHARACTER*6 SRNAME +* .. Scalars in Common .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUT, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Executable Statements .. + LERR = .TRUE. + IF( INFO.NE.INFOT )THEN + IF( INFOT.NE.0 )THEN + WRITE( NOUT, FMT = 9999 )INFO, INFOT + ELSE + WRITE( NOUT, FMT = 9997 )INFO + END IF + OK = .FALSE. + END IF + IF( SRNAME.NE.SRNAMT )THEN + WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT + OK = .FALSE. + END IF + RETURN +* + 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD', + $ ' OF ', I2, ' *******' ) + 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE', + $ 'AD OF ', A6, ' *******' ) + 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, + $ ' *******' ) +* +* End of XERBLA +* + END + diff --git a/test/sblat1.f b/test/sblat1.f new file mode 100644 index 0000000..a982d18 --- /dev/null +++ b/test/sblat1.f @@ -0,0 +1,769 @@ + PROGRAM SBLAT1 +* Test program for the REAL Level 1 BLAS. +* Based upon the original BLAS test routine together with: +* F06EAF Example Program Text +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SFAC + INTEGER IC +* .. External Subroutines .. + EXTERNAL CHECK0, CHECK1, CHECK2, CHECK3, HEADER +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SFAC/9.765625E-4/ +* .. Executable Statements .. + WRITE (NOUT,99999) + DO 20 IC = 1, 10 + ICASE = IC + CALL HEADER +* +* .. Initialize PASS, INCX, INCY, and MODE for a new case. .. +* .. the value 9999 for INCX, INCY or MODE will appear in the .. +* .. detailed output, if any, for cases that do not involve .. +* .. these parameters .. +* + PASS = .TRUE. + INCX = 9999 + INCY = 9999 + MODE = 9999 + IF (ICASE.EQ.3) THEN + CALL CHECK0(SFAC) + ELSE IF (ICASE.EQ.7 .OR. ICASE.EQ.8 .OR. ICASE.EQ.9 .OR. + + ICASE.EQ.10) THEN + CALL CHECK1(SFAC) + ELSE IF (ICASE.EQ.1 .OR. ICASE.EQ.2 .OR. ICASE.EQ.5 .OR. + + ICASE.EQ.6) THEN + CALL CHECK2(SFAC) + ELSE IF (ICASE.EQ.4) THEN + CALL CHECK3(SFAC) + END IF +* -- Print + IF (PASS) WRITE (NOUT,99998) + 20 CONTINUE + STOP +* +99999 FORMAT (' Real BLAS Test Program Results',/1X) +99998 FORMAT (' ----- PASS -----') + END + SUBROUTINE HEADER +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Arrays .. + CHARACTER*6 L(10) +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA L(1)/' SDOT '/ + DATA L(2)/'SAXPY '/ + DATA L(3)/'SROTG '/ + DATA L(4)/' SROT '/ + DATA L(5)/'SCOPY '/ + DATA L(6)/'SSWAP '/ + DATA L(7)/'SNRM2 '/ + DATA L(8)/'SASUM '/ + DATA L(9)/'SSCAL '/ + DATA L(10)/'ISAMAX'/ +* .. Executable Statements .. + WRITE (NOUT,99999) ICASE, L(ICASE) + RETURN +* +99999 FORMAT (/' Test of subprogram number',I3,12X,A6) + END + SUBROUTINE CHECK0(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL D12, SA, SB, SC, SS + INTEGER K +* .. Local Arrays .. + REAL DA1(8), DATRUE(8), DB1(8), DBTRUE(8), DC1(8), + + DS1(8) +* .. External Subroutines .. + EXTERNAL SROTG, STEST1 +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA DA1/0.3E0, 0.4E0, -0.3E0, -0.4E0, -0.3E0, 0.0E0, + + 0.0E0, 1.0E0/ + DATA DB1/0.4E0, 0.3E0, 0.4E0, 0.3E0, -0.4E0, 0.0E0, + + 1.0E0, 0.0E0/ + DATA DC1/0.6E0, 0.8E0, -0.6E0, 0.8E0, 0.6E0, 1.0E0, + + 0.0E0, 1.0E0/ + DATA DS1/0.8E0, 0.6E0, 0.8E0, -0.6E0, 0.8E0, 0.0E0, + + 1.0E0, 0.0E0/ + DATA DATRUE/0.5E0, 0.5E0, 0.5E0, -0.5E0, -0.5E0, + + 0.0E0, 1.0E0, 1.0E0/ + DATA DBTRUE/0.0E0, 0.6E0, 0.0E0, -0.6E0, 0.0E0, + + 0.0E0, 1.0E0, 0.0E0/ + DATA D12/4096.0E0/ +* .. Executable Statements .. +* +* Compute true values which cannot be prestored +* in decimal notation +* + DBTRUE(1) = 1.0E0/0.6E0 + DBTRUE(3) = -1.0E0/0.6E0 + DBTRUE(5) = 1.0E0/0.6E0 +* + DO 20 K = 1, 8 +* .. Set N=K for identification in output if any .. + N = K + IF (ICASE.EQ.3) THEN +* .. SROTG .. + IF (K.GT.8) GO TO 40 + SA = DA1(K) + SB = DB1(K) + CALL SROTG(SA,SB,SC,SS) + CALL STEST1(SA,DATRUE(K),DATRUE(K),SFAC) + CALL STEST1(SB,DBTRUE(K),DBTRUE(K),SFAC) + CALL STEST1(SC,DC1(K),DC1(K),SFAC) + CALL STEST1(SS,DS1(K),DS1(K),SFAC) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK0' + STOP + END IF + 20 CONTINUE + 40 RETURN + END + SUBROUTINE CHECK1(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER I, LEN, NP1 +* .. Local Arrays .. + REAL DTRUE1(5), DTRUE3(5), DTRUE5(8,5,2), DV(8,5,2), + + SA(10), STEMP(1), STRUE(8), SX(8) + INTEGER ITRUE2(5) +* .. External Functions .. + REAL SASUM, SNRM2 + INTEGER ISAMAX + EXTERNAL SASUM, SNRM2, ISAMAX +* .. External Subroutines .. + EXTERNAL ITEST1, SSCAL, STEST, STEST1 +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA/0.3E0, -1.0E0, 0.0E0, 1.0E0, 0.3E0, 0.3E0, + + 0.3E0, 0.3E0, 0.3E0, 0.3E0/ + DATA DV/0.1E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, + + 2.0E0, 2.0E0, 0.3E0, 3.0E0, 3.0E0, 3.0E0, 3.0E0, + + 3.0E0, 3.0E0, 3.0E0, 0.3E0, -0.4E0, 4.0E0, + + 4.0E0, 4.0E0, 4.0E0, 4.0E0, 4.0E0, 0.2E0, + + -0.6E0, 0.3E0, 5.0E0, 5.0E0, 5.0E0, 5.0E0, + + 5.0E0, 0.1E0, -0.3E0, 0.5E0, -0.1E0, 6.0E0, + + 6.0E0, 6.0E0, 6.0E0, 0.1E0, 8.0E0, 8.0E0, 8.0E0, + + 8.0E0, 8.0E0, 8.0E0, 8.0E0, 0.3E0, 9.0E0, 9.0E0, + + 9.0E0, 9.0E0, 9.0E0, 9.0E0, 9.0E0, 0.3E0, 2.0E0, + + -0.4E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, + + 0.2E0, 3.0E0, -0.6E0, 5.0E0, 0.3E0, 2.0E0, + + 2.0E0, 2.0E0, 0.1E0, 4.0E0, -0.3E0, 6.0E0, + + -0.5E0, 7.0E0, -0.1E0, 3.0E0/ + DATA DTRUE1/0.0E0, 0.3E0, 0.5E0, 0.7E0, 0.6E0/ + DATA DTRUE3/0.0E0, 0.3E0, 0.7E0, 1.1E0, 1.0E0/ + DATA DTRUE5/0.10E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, + + 2.0E0, 2.0E0, 2.0E0, -0.3E0, 3.0E0, 3.0E0, + + 3.0E0, 3.0E0, 3.0E0, 3.0E0, 3.0E0, 0.0E0, 0.0E0, + + 4.0E0, 4.0E0, 4.0E0, 4.0E0, 4.0E0, 4.0E0, + + 0.20E0, -0.60E0, 0.30E0, 5.0E0, 5.0E0, 5.0E0, + + 5.0E0, 5.0E0, 0.03E0, -0.09E0, 0.15E0, -0.03E0, + + 6.0E0, 6.0E0, 6.0E0, 6.0E0, 0.10E0, 8.0E0, + + 8.0E0, 8.0E0, 8.0E0, 8.0E0, 8.0E0, 8.0E0, + + 0.09E0, 9.0E0, 9.0E0, 9.0E0, 9.0E0, 9.0E0, + + 9.0E0, 9.0E0, 0.09E0, 2.0E0, -0.12E0, 2.0E0, + + 2.0E0, 2.0E0, 2.0E0, 2.0E0, 0.06E0, 3.0E0, + + -0.18E0, 5.0E0, 0.09E0, 2.0E0, 2.0E0, 2.0E0, + + 0.03E0, 4.0E0, -0.09E0, 6.0E0, -0.15E0, 7.0E0, + + -0.03E0, 3.0E0/ + DATA ITRUE2/0, 1, 2, 2, 3/ +* .. Executable Statements .. + DO 80 INCX = 1, 2 + DO 60 NP1 = 1, 5 + N = NP1 - 1 + LEN = 2*MAX(N,1) +* .. Set vector arguments .. + DO 20 I = 1, LEN + SX(I) = DV(I,NP1,INCX) + 20 CONTINUE +* + IF (ICASE.EQ.7) THEN +* .. SNRM2 .. + STEMP(1) = DTRUE1(NP1) + CALL STEST1(SNRM2(N,SX,INCX),STEMP,STEMP,SFAC) + ELSE IF (ICASE.EQ.8) THEN +* .. SASUM .. + STEMP(1) = DTRUE3(NP1) + CALL STEST1(SASUM(N,SX,INCX),STEMP,STEMP,SFAC) + ELSE IF (ICASE.EQ.9) THEN +* .. SSCAL .. + CALL SSCAL(N,SA((INCX-1)*5+NP1),SX,INCX) + DO 40 I = 1, LEN + STRUE(I) = DTRUE5(I,NP1,INCX) + 40 CONTINUE + CALL STEST(LEN,SX,STRUE,STRUE,SFAC) + ELSE IF (ICASE.EQ.10) THEN +* .. ISAMAX .. + CALL ITEST1(ISAMAX(N,SX,INCX),ITRUE2(NP1)) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' + STOP + END IF + 60 CONTINUE + 80 CONTINUE + RETURN + END + SUBROUTINE CHECK2(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SA, SC, SS + INTEGER I, J, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + REAL DT10X(7,4,4), DT10Y(7,4,4), DT7(4,4), + + DT8(7,4,4), DT9X(7,4,4), DT9Y(7,4,4), DX1(7), + + DY1(7), SSIZE1(4), SSIZE2(14,2), STX(7), STY(7), + + SX(7), SY(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), NS(4) +* .. External Functions .. + REAL SDOT + EXTERNAL SDOT +* .. External Subroutines .. + EXTERNAL SAXPY, SCOPY, SSWAP, STEST, STEST1 +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA/0.3E0/ + DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA DX1/0.6E0, 0.1E0, -0.5E0, 0.8E0, 0.9E0, -0.3E0, + + -0.4E0/ + DATA DY1/0.5E0, -0.9E0, 0.3E0, 0.7E0, -0.6E0, 0.2E0, + + 0.8E0/ + DATA SC, SS/0.8E0, 0.6E0/ + DATA DT7/0.0E0, 0.30E0, 0.21E0, 0.62E0, 0.0E0, + + 0.30E0, -0.07E0, 0.85E0, 0.0E0, 0.30E0, -0.79E0, + + -0.74E0, 0.0E0, 0.30E0, 0.33E0, 1.27E0/ + DATA DT8/0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.68E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.68E0, -0.87E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.68E0, -0.87E0, 0.15E0, + + 0.94E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.68E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.35E0, -0.9E0, 0.48E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.38E0, -0.9E0, 0.57E0, 0.7E0, -0.75E0, + + 0.2E0, 0.98E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.68E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.35E0, -0.72E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.38E0, + + -0.63E0, 0.15E0, 0.88E0, 0.0E0, 0.0E0, 0.0E0, + + 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.68E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.68E0, -0.9E0, 0.33E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.68E0, -0.9E0, 0.33E0, 0.7E0, + + -0.75E0, 0.2E0, 1.04E0/ + DATA DT9X/0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.78E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.78E0, -0.46E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.78E0, -0.46E0, -0.22E0, + + 1.06E0, 0.0E0, 0.0E0, 0.0E0, 0.6E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.78E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.66E0, 0.1E0, -0.1E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.96E0, 0.1E0, -0.76E0, 0.8E0, 0.90E0, + + -0.3E0, -0.02E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.78E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, -0.06E0, 0.1E0, + + -0.1E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.90E0, + + 0.1E0, -0.22E0, 0.8E0, 0.18E0, -0.3E0, -0.02E0, + + 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.78E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.78E0, 0.26E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.78E0, 0.26E0, -0.76E0, 1.12E0, + + 0.0E0, 0.0E0, 0.0E0/ + DATA DT9Y/0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.04E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.04E0, -0.78E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.04E0, -0.78E0, 0.54E0, + + 0.08E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.04E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.7E0, + + -0.9E0, -0.12E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.64E0, -0.9E0, -0.30E0, 0.7E0, -0.18E0, 0.2E0, + + 0.28E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.04E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.7E0, -1.08E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.64E0, -1.26E0, + + 0.54E0, 0.20E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.04E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.04E0, -0.9E0, 0.18E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.04E0, -0.9E0, 0.18E0, 0.7E0, + + -0.18E0, 0.2E0, 0.16E0/ + DATA DT10X/0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.5E0, -0.9E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.5E0, -0.9E0, 0.3E0, 0.7E0, + + 0.0E0, 0.0E0, 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.3E0, 0.1E0, 0.5E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.8E0, 0.1E0, -0.6E0, + + 0.8E0, 0.3E0, -0.3E0, 0.5E0, 0.6E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, -0.9E0, + + 0.1E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.7E0, + + 0.1E0, 0.3E0, 0.8E0, -0.9E0, -0.3E0, 0.5E0, + + 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.5E0, 0.3E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.5E0, 0.3E0, -0.6E0, 0.8E0, 0.0E0, 0.0E0, + + 0.0E0/ + DATA DT10Y/0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.6E0, 0.1E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.6E0, 0.1E0, -0.5E0, 0.8E0, 0.0E0, + + 0.0E0, 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, -0.5E0, -0.9E0, 0.6E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, -0.4E0, -0.9E0, 0.9E0, + + 0.7E0, -0.5E0, 0.2E0, 0.6E0, 0.5E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.6E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, -0.5E0, + + 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + -0.4E0, 0.9E0, -0.5E0, 0.6E0, 0.0E0, 0.0E0, + + 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.6E0, -0.9E0, 0.1E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.6E0, -0.9E0, 0.1E0, 0.7E0, + + -0.5E0, 0.2E0, 0.8E0/ + DATA SSIZE1/0.0E0, 0.3E0, 1.6E0, 3.2E0/ + DATA SSIZE2/0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + + 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + + 1.17E0, 1.17E0, 1.17E0/ +* .. Executable Statements .. +* + DO 120 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 100 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* .. Initialize all argument arrays .. + DO 20 I = 1, 7 + SX(I) = DX1(I) + SY(I) = DY1(I) + 20 CONTINUE +* + IF (ICASE.EQ.1) THEN +* .. SDOT .. + CALL STEST1(SDOT(N,SX,INCX,SY,INCY),DT7(KN,KI),SSIZE1(KN) + + ,SFAC) + ELSE IF (ICASE.EQ.2) THEN +* .. SAXPY .. + CALL SAXPY(N,SA,SX,INCX,SY,INCY) + DO 40 J = 1, LENY + STY(J) = DT8(J,KN,KI) + 40 CONTINUE + CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) + ELSE IF (ICASE.EQ.5) THEN +* .. SCOPY .. + DO 60 I = 1, 7 + STY(I) = DT10Y(I,KN,KI) + 60 CONTINUE + CALL SCOPY(N,SX,INCX,SY,INCY) + CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0E0) + ELSE IF (ICASE.EQ.6) THEN +* .. SSWAP .. + CALL SSWAP(N,SX,INCX,SY,INCY) + DO 80 I = 1, 7 + STX(I) = DT10X(I,KN,KI) + STY(I) = DT10Y(I,KN,KI) + 80 CONTINUE + CALL STEST(LENX,SX,STX,SSIZE2(1,1),1.0E0) + CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0E0) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' + STOP + END IF + 100 CONTINUE + 120 CONTINUE + RETURN + END + SUBROUTINE CHECK3(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SA, SC, SS + INTEGER I, K, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + REAL COPYX(5), COPYY(5), DT9X(7,4,4), DT9Y(7,4,4), + + DX1(7), DY1(7), MWPC(11), MWPS(11), MWPSTX(5), + + MWPSTY(5), MWPTX(11,5), MWPTY(11,5), MWPX(5), + + MWPY(5), SSIZE2(14,2), STX(7), STY(7), SX(7), + + SY(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), MWPINX(11), + + MWPINY(11), MWPN(11), NS(4) +* .. External Subroutines .. + EXTERNAL SROT, STEST +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA/0.3E0/ + DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA DX1/0.6E0, 0.1E0, -0.5E0, 0.8E0, 0.9E0, -0.3E0, + + -0.4E0/ + DATA DY1/0.5E0, -0.9E0, 0.3E0, 0.7E0, -0.6E0, 0.2E0, + + 0.8E0/ + DATA SC, SS/0.8E0, 0.6E0/ + DATA DT9X/0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.78E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.78E0, -0.46E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.78E0, -0.46E0, -0.22E0, + + 1.06E0, 0.0E0, 0.0E0, 0.0E0, 0.6E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.78E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.66E0, 0.1E0, -0.1E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.96E0, 0.1E0, -0.76E0, 0.8E0, 0.90E0, + + -0.3E0, -0.02E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.78E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, -0.06E0, 0.1E0, + + -0.1E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.90E0, + + 0.1E0, -0.22E0, 0.8E0, 0.18E0, -0.3E0, -0.02E0, + + 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.78E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.78E0, 0.26E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.78E0, 0.26E0, -0.76E0, 1.12E0, + + 0.0E0, 0.0E0, 0.0E0/ + DATA DT9Y/0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.04E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.04E0, -0.78E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.04E0, -0.78E0, 0.54E0, + + 0.08E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.04E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.7E0, + + -0.9E0, -0.12E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.64E0, -0.9E0, -0.30E0, 0.7E0, -0.18E0, 0.2E0, + + 0.28E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.04E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.7E0, -1.08E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.64E0, -1.26E0, + + 0.54E0, 0.20E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.04E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.04E0, -0.9E0, 0.18E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.04E0, -0.9E0, 0.18E0, 0.7E0, + + -0.18E0, 0.2E0, 0.16E0/ + DATA SSIZE2/0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + + 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + + 1.17E0, 1.17E0, 1.17E0/ +* .. Executable Statements .. +* + DO 60 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 40 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* + IF (ICASE.EQ.4) THEN +* .. SROT .. + DO 20 I = 1, 7 + SX(I) = DX1(I) + SY(I) = DY1(I) + STX(I) = DT9X(I,KN,KI) + STY(I) = DT9Y(I,KN,KI) + 20 CONTINUE + CALL SROT(N,SX,INCX,SY,INCY,SC,SS) + CALL STEST(LENX,SX,STX,SSIZE2(1,KSIZE),SFAC) + CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' + STOP + END IF + 40 CONTINUE + 60 CONTINUE +* + MWPC(1) = 1 + DO 80 I = 2, 11 + MWPC(I) = 0 + 80 CONTINUE + MWPS(1) = 0 + DO 100 I = 2, 6 + MWPS(I) = 1 + 100 CONTINUE + DO 120 I = 7, 11 + MWPS(I) = -1 + 120 CONTINUE + MWPINX(1) = 1 + MWPINX(2) = 1 + MWPINX(3) = 1 + MWPINX(4) = -1 + MWPINX(5) = 1 + MWPINX(6) = -1 + MWPINX(7) = 1 + MWPINX(8) = 1 + MWPINX(9) = -1 + MWPINX(10) = 1 + MWPINX(11) = -1 + MWPINY(1) = 1 + MWPINY(2) = 1 + MWPINY(3) = -1 + MWPINY(4) = -1 + MWPINY(5) = 2 + MWPINY(6) = 1 + MWPINY(7) = 1 + MWPINY(8) = -1 + MWPINY(9) = -1 + MWPINY(10) = 2 + MWPINY(11) = 1 + DO 140 I = 1, 11 + MWPN(I) = 5 + 140 CONTINUE + MWPN(5) = 3 + MWPN(10) = 3 + DO 160 I = 1, 5 + MWPX(I) = I + MWPY(I) = I + MWPTX(1,I) = I + MWPTY(1,I) = I + MWPTX(2,I) = I + MWPTY(2,I) = -I + MWPTX(3,I) = 6 - I + MWPTY(3,I) = I - 6 + MWPTX(4,I) = I + MWPTY(4,I) = -I + MWPTX(6,I) = 6 - I + MWPTY(6,I) = I - 6 + MWPTX(7,I) = -I + MWPTY(7,I) = I + MWPTX(8,I) = I - 6 + MWPTY(8,I) = 6 - I + MWPTX(9,I) = -I + MWPTY(9,I) = I + MWPTX(11,I) = I - 6 + MWPTY(11,I) = 6 - I + 160 CONTINUE + MWPTX(5,1) = 1 + MWPTX(5,2) = 3 + MWPTX(5,3) = 5 + MWPTX(5,4) = 4 + MWPTX(5,5) = 5 + MWPTY(5,1) = -1 + MWPTY(5,2) = 2 + MWPTY(5,3) = -2 + MWPTY(5,4) = 4 + MWPTY(5,5) = -3 + MWPTX(10,1) = -1 + MWPTX(10,2) = -3 + MWPTX(10,3) = -5 + MWPTX(10,4) = 4 + MWPTX(10,5) = 5 + MWPTY(10,1) = 1 + MWPTY(10,2) = 2 + MWPTY(10,3) = 2 + MWPTY(10,4) = 4 + MWPTY(10,5) = 3 + DO 200 I = 1, 11 + INCX = MWPINX(I) + INCY = MWPINY(I) + DO 180 K = 1, 5 + COPYX(K) = MWPX(K) + COPYY(K) = MWPY(K) + MWPSTX(K) = MWPTX(I,K) + MWPSTY(K) = MWPTY(I,K) + 180 CONTINUE + CALL SROT(MWPN(I),COPYX,INCX,COPYY,INCY,MWPC(I),MWPS(I)) + CALL STEST(5,COPYX,MWPSTX,MWPSTX,SFAC) + CALL STEST(5,COPYY,MWPSTY,MWPSTY,SFAC) + 200 CONTINUE + RETURN + END + SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) +* ********************************* STEST ************************** +* +* THIS SUBR COMPARES ARRAYS SCOMP() AND STRUE() OF LENGTH LEN TO +* SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE +* NEGLIGIBLE. +* +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC + INTEGER LEN +* .. Array Arguments .. + REAL SCOMP(LEN), SSIZE(LEN), STRUE(LEN) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SD + INTEGER I +* .. External Functions .. + REAL SDIFF + EXTERNAL SDIFF +* .. Intrinsic Functions .. + INTRINSIC ABS +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + DO 40 I = 1, LEN + SD = SCOMP(I) - STRUE(I) + IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0E0) + + GO TO 40 +* +* HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I), + + STRUE(I), SD, SSIZE(I) + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE I ', + + ' COMP(I) TRUE(I) DIFFERENCE', + + ' SIZE(I)',/1X) +99997 FORMAT (1X,I4,I3,3I5,I3,2E36.8,2E12.4) + END + SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) +* ************************* STEST1 ***************************** +* +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE +* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + REAL SCOMP1, SFAC, STRUE1 +* .. Array Arguments .. + REAL SSIZE(*) +* .. Local Arrays .. + REAL SCOMP(1), STRUE(1) +* .. External Subroutines .. + EXTERNAL STEST +* .. Executable Statements .. +* + SCOMP(1) = SCOMP1 + STRUE(1) = STRUE1 + CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC) +* + RETURN + END + REAL FUNCTION SDIFF(SA,SB) +* ********************************* SDIFF ************************** +* COMPUTES DIFFERENCE OF TWO NUMBERS. C. L. LAWSON, JPL 1974 FEB 15 +* +* .. Scalar Arguments .. + REAL SA, SB +* .. Executable Statements .. + SDIFF = SA - SB + RETURN + END + SUBROUTINE ITEST1(ICOMP,ITRUE) +* ********************************* ITEST1 ************************* +* +* THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR +* EQUALITY. +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + INTEGER ICOMP, ITRUE +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER ID +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + IF (ICOMP.EQ.ITRUE) GO TO 40 +* +* HERE ICOMP IS NOT EQUAL TO ITRUE. +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 ID = ICOMP - ITRUE + WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE ', + + ' COMP TRUE DIFFERENCE', + + /1X) +99997 FORMAT (1X,I4,I3,3I5,2I36,I12) + END diff --git a/test/sblat2.dat b/test/sblat2.dat new file mode 100644 index 0000000..5ed9dd7 --- /dev/null +++ b/test/sblat2.dat @@ -0,0 +1,34 @@ +'SBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE +6 UNIT NUMBER OF SUMMARY FILE +'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +16.0 THRESHOLD VALUE OF TEST RATIO +7 NUMBER OF VALUES OF N +0 1 2 3 7 31 63 VALUES OF N +4 NUMBER OF VALUES OF K +0 1 2 4 VALUES OF K +4 NUMBER OF VALUES OF INCX AND INCY +1 2 -1 -2 VALUES OF INCX AND INCY +3 NUMBER OF VALUES OF ALPHA +0.0 1.0 0.7 VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +0.0 1.0 0.9 VALUES OF BETA +SGEMV T PUT F FOR NO TEST. SAME COLUMNS. +SGBMV T PUT F FOR NO TEST. SAME COLUMNS. +SSYMV T PUT F FOR NO TEST. SAME COLUMNS. +SSBMV T PUT F FOR NO TEST. SAME COLUMNS. +SSPMV T PUT F FOR NO TEST. SAME COLUMNS. +STRMV T PUT F FOR NO TEST. SAME COLUMNS. +STBMV T PUT F FOR NO TEST. SAME COLUMNS. +STPMV T PUT F FOR NO TEST. SAME COLUMNS. +STRSV T PUT F FOR NO TEST. SAME COLUMNS. +STBSV T PUT F FOR NO TEST. SAME COLUMNS. +STPSV T PUT F FOR NO TEST. SAME COLUMNS. +SGER T PUT F FOR NO TEST. SAME COLUMNS. +SSYR T PUT F FOR NO TEST. SAME COLUMNS. +SSPR T PUT F FOR NO TEST. SAME COLUMNS. +SSYR2 T PUT F FOR NO TEST. SAME COLUMNS. +SSPR2 T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/test/sblat2.f b/test/sblat2.f new file mode 100644 index 0000000..057a854 --- /dev/null +++ b/test/sblat2.f @@ -0,0 +1,3138 @@ + PROGRAM SBLAT2 +* +* Test program for the REAL Level 2 Blas. +* +* The program must be driven by a short data file. The first 18 records +* of the file are read using list-directed input, the last 16 records +* are read using the format ( A6, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 34 lines: +* 'SBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE +* 6 UNIT NUMBER OF SUMMARY FILE +* 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 4 NUMBER OF VALUES OF K +* 0 1 2 4 VALUES OF K +* 4 NUMBER OF VALUES OF INCX AND INCY +* 1 2 -1 -2 VALUES OF INCX AND INCY +* 3 NUMBER OF VALUES OF ALPHA +* 0.0 1.0 0.7 VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* 0.0 1.0 0.9 VALUES OF BETA +* SGEMV T PUT F FOR NO TEST. SAME COLUMNS. +* SGBMV T PUT F FOR NO TEST. SAME COLUMNS. +* SSYMV T PUT F FOR NO TEST. SAME COLUMNS. +* SSBMV T PUT F FOR NO TEST. SAME COLUMNS. +* SSPMV T PUT F FOR NO TEST. SAME COLUMNS. +* STRMV T PUT F FOR NO TEST. SAME COLUMNS. +* STBMV T PUT F FOR NO TEST. SAME COLUMNS. +* STPMV T PUT F FOR NO TEST. SAME COLUMNS. +* STRSV T PUT F FOR NO TEST. SAME COLUMNS. +* STBSV T PUT F FOR NO TEST. SAME COLUMNS. +* STPSV T PUT F FOR NO TEST. SAME COLUMNS. +* SGER T PUT F FOR NO TEST. SAME COLUMNS. +* SSYR T PUT F FOR NO TEST. SAME COLUMNS. +* SSPR T PUT F FOR NO TEST. SAME COLUMNS. +* SSYR2 T PUT F FOR NO TEST. SAME COLUMNS. +* SSPR2 T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +* An extended set of Fortran Basic Linear Algebra Subprograms. +* +* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +* and Computer Science Division, Argonne National Laboratory, +* 9700 South Cass Avenue, Argonne, Illinois 60439, US. +* +* Or +* +* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +* +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + INTEGER NIN + PARAMETER ( NIN = 5 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 16 ) + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) + INTEGER NMAX, INCMAX + PARAMETER ( NMAX = 65, INCMAX = 2 ) + INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX + PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, + $ NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + REAL EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB, + $ NOUT, NTRA + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR + CHARACTER*1 TRANS + CHARACTER*6 SNAMET + CHARACTER*32 SNAPS, SUMMRY +* .. Local Arrays .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ), + $ G( NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( 2*NMAX ) + INTEGER IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*6 SNAMES( NSUBS ) +* .. External Functions .. + REAL SDIFF + LOGICAL LSE + EXTERNAL SDIFF, LSE +* .. External Subroutines .. + EXTERNAL SCHK1, SCHK2, SCHK3, SCHK4, SCHK5, SCHK6, + $ SCHKE, SMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'SGEMV ', 'SGBMV ', 'SSYMV ', 'SSBMV ', + $ 'SSPMV ', 'STRMV ', 'STBMV ', 'STPMV ', + $ 'STRSV ', 'STBSV ', 'STPSV ', 'SGER ', + $ 'SSYR ', 'SSPR ', 'SSYR2 ', 'SSPR2 '/ +* .. Executable Statements .. +* +* Read name and unit number for summary output file and open file. +* + READ( NIN, FMT = * )SUMMRY + READ( NIN, FMT = * )NOUT + OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 230 + END IF + 10 CONTINUE +* Values of K + READ( NIN, FMT = * )NKB + IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN + WRITE( NOUT, FMT = 9997 )'K', NKBMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( KB( I ), I = 1, NKB ) + DO 20 I = 1, NKB + IF( KB( I ).LT.0 )THEN + WRITE( NOUT, FMT = 9995 ) + GO TO 230 + END IF + 20 CONTINUE +* Values of INCX and INCY + READ( NIN, FMT = * )NINC + IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN + WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( INC( I ), I = 1, NINC ) + DO 30 I = 1, NINC + IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN + WRITE( NOUT, FMT = 9994 )INCMAX + GO TO 230 + END IF + 30 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9993 ) + WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB ) + WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC ) + WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9980 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 40 I = 1, NSUBS + LTEST( I ) = .FALSE. + 40 CONTINUE + 50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT + DO 60 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 70 + 60 CONTINUE + WRITE( NOUT, FMT = 9986 )SNAMET + STOP + 70 LTEST( I ) = LTESTT + GO TO 50 +* + 80 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = ONE + 90 CONTINUE + IF( SDIFF( ONE + EPS, ONE ).EQ.ZERO ) + $ GO TO 100 + EPS = HALF*EPS + GO TO 90 + 100 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of SMVCH using exact data. +* + N = MIN( 32, NMAX ) + DO 120 J = 1, N + DO 110 I = 1, N + A( I, J ) = MAX( I - J + 1, 0 ) + 110 CONTINUE + X( J ) = J + Y( J ) = ZERO + 120 CONTINUE + DO 130 J = 1, N + YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE +* YY holds the exact result. On exit from SMVCH YT holds +* the result computed by SMVCH. + TRANS = 'N' + CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF + TRANS = 'T' + CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 210 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL SCHKE( ISNUM, SNAMES( ISNUM ), NOUT ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 140, 150, 150, 150, 160, 160, + $ 160, 160, 160, 160, 170, 180, 180, + $ 190, 190 )ISNUM +* Test SGEMV, 01, and SGBMV, 02. + 140 CALL SCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G ) + GO TO 200 +* Test SSYMV, 03, SSBMV, 04, and SSPMV, 05. + 150 CALL SCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G ) + GO TO 200 +* Test STRMV, 06, STBMV, 07, STPMV, 08, +* STRSV, 09, STBSV, 10, and STPSV, 11. + 160 CALL SCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z ) + GO TO 200 +* Test SGER, 12. + 170 CALL SCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z ) + GO TO 200 +* Test SSYR, 13, and SSPR, 14. + 180 CALL SCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z ) + GO TO 200 +* Test SSYR2, 15, and SSPR2, 16. + 190 CALL SCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z ) +* + 200 IF( FATAL.AND.SFATAL ) + $ GO TO 220 + END IF + 210 CONTINUE + WRITE( NOUT, FMT = 9982 ) + GO TO 240 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9981 ) + GO TO 240 +* + 230 CONTINUE + WRITE( NOUT, FMT = 9987 ) +* + 240 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' ) + 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ', + $ I2 ) + 9993 FORMAT( ' TESTS OF THE REAL LEVEL 2 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9992 FORMAT( ' FOR N ', 9I6 ) + 9991 FORMAT( ' FOR K ', 7I6 ) + 9990 FORMAT( ' FOR INCX AND INCY ', 7I6 ) + 9989 FORMAT( ' FOR ALPHA ', 7F6.1 ) + 9988 FORMAT( ' FOR BETA ', 7F6.1 ) + 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9986 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9985 FORMAT( ' ERROR IN SMVCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' SMVCH WAS CALLED WITH TRANS = ', A1, + $ ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', / + $ ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.' + $ , /' ******* TESTS ABANDONED *******' ) + 9984 FORMAT( A6, L2 ) + 9983 FORMAT( 1X, A6, ' WAS NOT TESTED' ) + 9982 FORMAT( /' END OF TESTS' ) + 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of SBLAT2. +* + END + SUBROUTINE SCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G ) +* +* Tests SGEMV and SGBMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF + PARAMETER ( ZERO = 0.0, HALF = 0.5 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), G( NMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + REAL ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA, + $ LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK, + $ NL, NS + LOGICAL BANDED, FULL, NULL, RESET, SAME, TRAN + CHARACTER*1 TRANS, TRANSS + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SGBMV, SGEMV, SMAKE, SMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'E' + BANDED = SNAME( 3: 3 ).EQ.'B' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 11 + ELSE IF( BANDED )THEN + NARGS = 13 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IKU = 1, NK + IF( BANDED )THEN + KU = KB( IKU ) + KL = MAX( KU - 1, 0 ) + ELSE + KU = N - 1 + KL = M - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = KL + KU + 1 + ELSE + LDA = M + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX, AA, + $ LDA, KL, KU, RESET, TRANSL ) +* + DO 90 IC = 1, 3 + TRANS = ICH( IC: IC ) + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' +* + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*NL +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'GE', ' ', ' ', 1, NL, X, 1, XX, + $ ABS( INCX ), 0, NL - 1, RESET, TRANSL ) + IF( NL.GT.1 )THEN + X( NL/2 ) = ZERO + XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*ML +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL SMAKE( 'GE', ' ', ' ', 1, ML, Y, 1, + $ YY, ABS( INCY ), 0, ML - 1, + $ RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANSS = TRANS + MS = M + NS = N + KLS = KL + KUS = KU + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ TRANS, M, N, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL SGEMV( TRANS, M, N, ALPHA, AA, + $ LDA, XX, INCX, BETA, YY, + $ INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ TRANS, M, N, KL, KU, ALPHA, LDA, + $ INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL SGBMV( TRANS, M, N, KL, KU, ALPHA, + $ AA, LDA, XX, INCX, BETA, + $ YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 130 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = TRANS.EQ.TRANSS + ISAME( 2 ) = MS.EQ.M + ISAME( 3 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LSE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LSE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LSE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LSERES( 'GE', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 4 ) = KLS.EQ.KL + ISAME( 5 ) = KUS.EQ.KU + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LSE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LSE( XS, XX, LX ) + ISAME( 10 ) = INCXS.EQ.INCX + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LSE( YS, YY, LY ) + ELSE + ISAME( 12 ) = LSERES( 'GE', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 13 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 130 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL SMVCH( TRANS, M, N, ALPHA, A, + $ NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 130 + ELSE +* Avoid repeating tests with M.le.0 or +* N.le.0. + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 140 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, TRANS, M, N, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANS, M, N, KL, KU, + $ ALPHA, LDA, INCX, BETA, INCY + END IF +* + 140 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 4( I3, ',' ), F4.1, + $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), F4.1, + $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, + $ ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK1. +* + END + SUBROUTINE SCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G ) +* +* Tests SSYMV, SSBMV and SSPMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF + PARAMETER ( ZERO = 0.0, HALF = 0.5 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), G( NMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + REAL ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IB, IC, IK, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY, + $ N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 UPLO, UPLOS + CHARACTER*2 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMVCH, SSBMV, SSPMV, SSYMV +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'Y' + BANDED = SNAME( 3: 3 ).EQ.'B' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 10 + ELSE IF( BANDED )THEN + NARGS = 11 + ELSE IF( PACKED )THEN + NARGS = 9 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX, AA, + $ LDA, K, K, RESET, TRANSL ) +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL SMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + UPLOS = UPLO + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ UPLO, N, ALPHA, LDA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL SSYMV( UPLO, N, ALPHA, AA, LDA, XX, + $ INCX, BETA, YY, INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ UPLO, N, K, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL SSBMV( UPLO, N, K, ALPHA, AA, LDA, + $ XX, INCX, BETA, YY, INCY ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ UPLO, N, ALPHA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL SSPMV( UPLO, N, ALPHA, AA, XX, INCX, + $ BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LSE( AS, AA, LAA ) + ISAME( 5 ) = LDAS.EQ.LDA + ISAME( 6 ) = LSE( XS, XX, LX ) + ISAME( 7 ) = INCXS.EQ.INCX + ISAME( 8 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LSE( YS, YY, LY ) + ELSE + ISAME( 9 ) = LSERES( 'GE', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 10 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 3 ) = KS.EQ.K + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LSE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LSE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LSE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LSERES( 'GE', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( PACKED )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LSE( AS, AA, LAA ) + ISAME( 5 ) = LSE( XS, XX, LX ) + ISAME( 6 ) = INCXS.EQ.INCX + ISAME( 7 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 8 ) = LSE( YS, YY, LY ) + ELSE + ISAME( 8 ) = LSERES( 'GE', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 9 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL SMVCH( 'N', N, N, ALPHA, A, NMAX, X, + $ INCX, BETA, Y, INCY, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0 + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, LDA, INCX, + $ BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, K, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, N, ALPHA, INCX, + $ BETA, INCY + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', AP', + $ ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), F4.1, + $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, + $ ') .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', A,', + $ I3, ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK2. +* + END + SUBROUTINE SCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, XT, G, Z ) +* +* Tests STRMV, STBMV, STPMV, STRSV, STBSV and STPSV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XT( NMAX ), + $ XX( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + REAL ERR, ERRMAX, TRANSL + INTEGER I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K, + $ KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS + CHARACTER*2 ICHD, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMVCH, STBMV, STBSV, STPMV, STPSV, + $ STRMV, STRSV +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'R' + BANDED = SNAME( 3: 3 ).EQ.'B' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 8 + ELSE IF( BANDED )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 7 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* Set up zero vector for SMVCH. + DO 10 I = 1, NMAX + Z( I ) = ZERO + 10 CONTINUE +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) +* + DO 70 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 2: 3 ), UPLO, DIAG, N, N, A, + $ NMAX, AA, LDA, K, K, RESET, TRANSL ) +* + DO 60 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, + $ TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + DIAGS = DIAG + NS = N + KS = K + DO 20 I = 1, LAA + AS( I ) = AA( I ) + 20 CONTINUE + LDAS = LDA + DO 30 I = 1, LX + XS( I ) = XX( I ) + 30 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( SNAME( 4: 5 ).EQ.'MV' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL STRMV( UPLO, TRANS, DIAG, N, AA, LDA, + $ XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL STBMV( UPLO, TRANS, DIAG, N, K, AA, + $ LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL STPMV( UPLO, TRANS, DIAG, N, AA, XX, + $ INCX ) + END IF + ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL STRSV( UPLO, TRANS, DIAG, N, AA, LDA, + $ XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL STBSV( UPLO, TRANS, DIAG, N, K, AA, + $ LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL STPSV( UPLO, TRANS, DIAG, N, AA, XX, + $ INCX ) + END IF + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = TRANS.EQ.TRANSS + ISAME( 3 ) = DIAG.EQ.DIAGS + ISAME( 4 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 5 ) = LSE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 7 ) = LSE( XS, XX, LX ) + ELSE + ISAME( 7 ) = LSERES( 'GE', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 8 ) = INCXS.EQ.INCX + ELSE IF( BANDED )THEN + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = LSE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 8 ) = LSE( XS, XX, LX ) + ELSE + ISAME( 8 ) = LSERES( 'GE', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 9 ) = INCXS.EQ.INCX + ELSE IF( PACKED )THEN + ISAME( 5 ) = LSE( AS, AA, LAA ) + IF( NULL )THEN + ISAME( 6 ) = LSE( XS, XX, LX ) + ELSE + ISAME( 6 ) = LSERES( 'GE', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 7 ) = INCXS.EQ.INCX + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 4: 5 ).EQ.'MV' )THEN +* +* Check the result. +* + CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, + $ INCX, ZERO, Z, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN +* +* Compute approximation to original vector. +* + DO 50 I = 1, N + Z( I ) = XX( 1 + ( I - 1 )* + $ ABS( INCX ) ) + XX( 1 + ( I - 1 )*ABS( INCX ) ) + $ = X( I ) + 50 CONTINUE + CALL SMVCH( TRANS, N, N, ONE, A, NMAX, Z, + $ INCX, ZERO, X, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .FALSE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0. + GO TO 110 + END IF +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, DIAG, N, LDA, + $ INCX + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, DIAG, N, K, + $ LDA, INCX + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, TRANS, DIAG, N, INCX + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', AP, ', + $ 'X,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), 2( I3, ',' ), + $ ' A,', I3, ', X,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', A,', + $ I3, ', X,', I2, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK3. +* + END + SUBROUTINE SCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z ) +* +* Tests SGER. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + REAL ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS, + $ NC, ND, NS + LOGICAL NULL, RESET, SAME +* .. Local Arrays .. + REAL W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SGER, SMAKE, SMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Executable Statements .. +* Define the number of arguments. + NARGS = 9 +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* +* Set LDA to 1 more than minimum value if room. + LDA = M + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 100 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*M +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'GE', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ), + $ 0, M - 1, RESET, TRANSL ) + IF( M.GT.1 )THEN + X( M/2 ) = ZERO + XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO + END IF +* + DO 90 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL SMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX, + $ AA, LDA, M - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL SGER( M, N, ALPHA, XX, INCX, YY, INCY, AA, + $ LDA ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 140 + END IF +* +* See what data changed inside subroutine. +* + ISAME( 1 ) = MS.EQ.M + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LSE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LSE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LSE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LSERES( 'GE', ' ', M, N, AS, AA, + $ LDA ) + END IF + ISAME( 9 ) = LDAS.EQ.LDA +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 140 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, M + Z( I ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, M + Z( I ) = X( M - I + 1 ) + 60 CONTINUE + END IF + DO 70 J = 1, N + IF( INCY.GT.0 )THEN + W( 1 ) = Y( J ) + ELSE + W( 1 ) = Y( N - J + 1 ) + END IF + CALL SMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1, + $ ONE, A( 1, J ), 1, YT, G, + $ AA( 1 + ( J - 1 )*LDA ), EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 130 + 70 CONTINUE + ELSE +* Avoid repeating tests with M.le.0 or N.le.0. + GO TO 110 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 150 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 140 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA +* + 150 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( I3, ',' ), F4.1, ', X,', I2, + $ ', Y,', I2, ', A,', I3, ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK4. +* + END + SUBROUTINE SCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z ) +* +* Tests SSYR and SSPR. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + REAL ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA, + $ LDA, LDAS, LJ, LX, N, NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*2 ICH +* .. Local Arrays .. + REAL W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMVCH, SSPR, SSYR +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'Y' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 7 + ELSE IF( PACKED )THEN + NARGS = 6 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + UPPER = UPLO.EQ.'U' +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX, + $ AA, LDA, N - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N, + $ ALPHA, INCX, LDA + IF( REWI ) + $ REWIND NTRA + CALL SSYR( UPLO, N, ALPHA, XX, INCX, AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N, + $ ALPHA, INCX + IF( REWI ) + $ REWIND NTRA + CALL SSPR( UPLO, N, ALPHA, XX, INCX, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LSE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + IF( NULL )THEN + ISAME( 6 ) = LSE( AS, AA, LAA ) + ELSE + ISAME( 6 ) = LSERES( SNAME( 2: 3 ), UPLO, N, N, AS, + $ AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 7 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 40 I = 1, N + Z( I ) = X( I ) + 40 CONTINUE + ELSE + DO 50 I = 1, N + Z( I ) = X( N - I + 1 ) + 50 CONTINUE + END IF + JA = 1 + DO 60 J = 1, N + W( 1 ) = Z( J ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL SMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W, + $ 1, ONE, A( JJ, J ), 1, YT, G, + $ AA( JA ), EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 110 + 60 CONTINUE + ELSE +* Avoid repeating tests if N.le.0. + IF( N.LE.0 ) + $ GO TO 100 + END IF +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, INCX, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, ALPHA, INCX + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', + $ I2, ', AP) .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', + $ I2, ', A,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK5. +* + END + SUBROUTINE SCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z ) +* +* Tests SSYR2 and SSPR2. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX, 2 ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + REAL ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N, + $ NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*2 ICH +* .. Local Arrays .. + REAL W( 2 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMVCH, SSPR2, SSYR2 +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'Y' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 8 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 140 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 140 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 130 IC = 1, 2 + UPLO = ICH( IC: IC ) + UPPER = UPLO.EQ.'U' +* + DO 120 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 110 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL SMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 100 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, + $ NMAX, AA, LDA, N - 1, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL SSYR2( UPLO, N, ALPHA, XX, INCX, YY, INCY, + $ AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N, + $ ALPHA, INCX, INCY + IF( REWI ) + $ REWIND NTRA + CALL SSPR2( UPLO, N, ALPHA, XX, INCX, YY, INCY, + $ AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 160 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LSE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LSE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LSE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LSERES( SNAME( 2: 3 ), UPLO, N, N, + $ AS, AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 9 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 160 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, N + Z( I, 1 ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, N + Z( I, 1 ) = X( N - I + 1 ) + 60 CONTINUE + END IF + IF( INCY.GT.0 )THEN + DO 70 I = 1, N + Z( I, 2 ) = Y( I ) + 70 CONTINUE + ELSE + DO 80 I = 1, N + Z( I, 2 ) = Y( N - I + 1 ) + 80 CONTINUE + END IF + JA = 1 + DO 90 J = 1, N + W( 1 ) = Z( J, 2 ) + W( 2 ) = Z( J, 1 ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL SMVCH( 'N', LJ, 2, ALPHA, Z( JJ, 1 ), + $ NMAX, W, 1, ONE, A( JJ, J ), 1, + $ YT, G, AA( JA ), EPS, ERR, FATAL, + $ NOUT, .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 150 + 90 CONTINUE + ELSE +* Avoid repeating tests with N.le.0. + IF( N.LE.0 ) + $ GO TO 140 + END IF +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 170 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 160 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, INCX, + $ INCY, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, ALPHA, INCX, INCY + END IF +* + 170 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', + $ I2, ', Y,', I2, ', AP) .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', + $ I2, ', Y,', I2, ', A,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK6. +* + END + SUBROUTINE SCHKE( ISNUM, SRNAMT, NOUT ) +* +* Tests the error exits from the Level 2 Blas. +* Requires a special version of the error-handling routine XERBLA. +* ALPHA, BETA, A, X and Y should not need to be defined. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER ISNUM, NOUT + CHARACTER*6 SRNAMT +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Local Scalars .. + REAL ALPHA, BETA +* .. Local Arrays .. + REAL A( 1, 1 ), X( 1 ), Y( 1 ) +* .. External Subroutines .. + EXTERNAL CHKXER, SGBMV, SGEMV, SGER, SSBMV, SSPMV, SSPR, + $ SSPR2, SSYMV, SSYR, SSYR2, STBMV, STBSV, STPMV, + $ STPSV, STRMV, STRSV +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Executable Statements .. +* OK is set to .FALSE. by the special version of XERBLA or by CHKXER +* if anything is wrong. + OK = .TRUE. +* LERR is set to .TRUE. by the special version of XERBLA each time +* it is called, and is then tested and re-set by CHKXER. + LERR = .FALSE. + GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, + $ 90, 100, 110, 120, 130, 140, 150, + $ 160 )ISNUM + 10 INFOT = 1 + CALL SGEMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SGEMV( 'N', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SGEMV( 'N', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL SGEMV( 'N', 2, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGEMV( 'N', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL SGEMV( 'N', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 20 INFOT = 1 + CALL SGBMV( '/', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SGBMV( 'N', -1, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SGBMV( 'N', 0, -1, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SGBMV( 'N', 0, 0, -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SGBMV( 'N', 2, 0, 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGBMV( 'N', 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL SGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 30 INFOT = 1 + CALL SSYMV( '/', 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SSYMV( 'U', -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SSYMV( 'U', 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYMV( 'U', 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SSYMV( 'U', 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 40 INFOT = 1 + CALL SSBMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SSBMV( 'U', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SSBMV( 'U', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL SSBMV( 'U', 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SSBMV( 'U', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL SSBMV( 'U', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 50 INFOT = 1 + CALL SSPMV( '/', 0, ALPHA, A, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SSPMV( 'U', -1, ALPHA, A, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL SSPMV( 'U', 0, ALPHA, A, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL SSPMV( 'U', 0, ALPHA, A, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 60 INFOT = 1 + CALL STRMV( '/', 'N', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL STRMV( 'U', '/', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL STRMV( 'U', 'N', '/', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL STRMV( 'U', 'N', 'N', -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRMV( 'U', 'N', 'N', 2, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL STRMV( 'U', 'N', 'N', 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 70 INFOT = 1 + CALL STBMV( '/', 'N', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL STBMV( 'U', '/', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL STBMV( 'U', 'N', '/', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL STBMV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STBMV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL STBMV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STBMV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 80 INFOT = 1 + CALL STPMV( '/', 'N', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL STPMV( 'U', '/', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL STPMV( 'U', 'N', '/', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL STPMV( 'U', 'N', 'N', -1, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL STPMV( 'U', 'N', 'N', 0, A, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 90 INFOT = 1 + CALL STRSV( '/', 'N', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL STRSV( 'U', '/', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL STRSV( 'U', 'N', '/', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL STRSV( 'U', 'N', 'N', -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRSV( 'U', 'N', 'N', 2, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL STRSV( 'U', 'N', 'N', 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 100 INFOT = 1 + CALL STBSV( '/', 'N', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL STBSV( 'U', '/', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL STBSV( 'U', 'N', '/', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL STBSV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STBSV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL STBSV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STBSV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 110 INFOT = 1 + CALL STPSV( '/', 'N', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL STPSV( 'U', '/', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL STPSV( 'U', 'N', '/', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL STPSV( 'U', 'N', 'N', -1, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL STPSV( 'U', 'N', 'N', 0, A, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 120 INFOT = 1 + CALL SGER( -1, 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SGER( 0, -1, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SGER( 0, 0, ALPHA, X, 0, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SGER( 0, 0, ALPHA, X, 1, Y, 0, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL SGER( 2, 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 130 INFOT = 1 + CALL SSYR( '/', 0, ALPHA, X, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SSYR( 'U', -1, ALPHA, X, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SSYR( 'U', 0, ALPHA, X, 0, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYR( 'U', 2, ALPHA, X, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 140 INFOT = 1 + CALL SSPR( '/', 0, ALPHA, X, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SSPR( 'U', -1, ALPHA, X, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SSPR( 'U', 0, ALPHA, X, 0, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 150 INFOT = 1 + CALL SSYR2( '/', 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SSYR2( 'U', -1, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SSYR2( 'U', 0, ALPHA, X, 0, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYR2( 'U', 0, ALPHA, X, 1, Y, 0, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL SSYR2( 'U', 2, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 160 INFOT = 1 + CALL SSPR2( '/', 0, ALPHA, X, 1, Y, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SSPR2( 'U', -1, ALPHA, X, 1, Y, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SSPR2( 'U', 0, ALPHA, X, 0, Y, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSPR2( 'U', 0, ALPHA, X, 1, Y, 0, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* + 170 IF( OK )THEN + WRITE( NOUT, FMT = 9999 )SRNAMT + ELSE + WRITE( NOUT, FMT = 9998 )SRNAMT + END IF + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' ) + 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****', + $ '**' ) +* +* End of SCHKE. +* + END + SUBROUTINE SMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL, + $ KU, RESET, TRANSL ) +* +* Generates values for an M by N matrix A within the bandwidth +* defined by KL and KU. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'GE', 'GB', 'SY', 'SB', 'SP', 'TR', 'TB' OR 'TP'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) + REAL ROGUE + PARAMETER ( ROGUE = -1.0E10 ) +* .. Scalar Arguments .. + REAL TRANSL + INTEGER KL, KU, LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + REAL A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, I1, I2, I3, IBEG, IEND, IOFF, J, KK + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + REAL SBEG + EXTERNAL SBEG +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Executable Statements .. + GEN = TYPE( 1: 1 ).EQ.'G' + SYM = TYPE( 1: 1 ).EQ.'S' + TRI = TYPE( 1: 1 ).EQ.'T' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + IF( ( I.LE.J.AND.J - I.LE.KU ).OR. + $ ( I.GE.J.AND.I - J.LE.KL ) )THEN + A( I, J ) = SBEG( RESET ) + TRANSL + ELSE + A( I, J ) = ZERO + END IF + IF( I.NE.J )THEN + IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'GE' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'GB' )THEN + DO 90 J = 1, N + DO 60 I1 = 1, KU + 1 - J + AA( I1 + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J ) + AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J ) + 70 CONTINUE + DO 80 I3 = I2, LDA + AA( I3 + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + ELSE IF( TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN + DO 130 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 100 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 100 CONTINUE + DO 110 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 110 CONTINUE + DO 120 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 120 CONTINUE + 130 CONTINUE + ELSE IF( TYPE.EQ.'SB'.OR.TYPE.EQ.'TB' )THEN + DO 170 J = 1, N + IF( UPPER )THEN + KK = KL + 1 + IBEG = MAX( 1, KL + 2 - J ) + IF( UNIT )THEN + IEND = KL + ELSE + IEND = KL + 1 + END IF + ELSE + KK = 1 + IF( UNIT )THEN + IBEG = 2 + ELSE + IBEG = 1 + END IF + IEND = MIN( KL + 1, 1 + M - J ) + END IF + DO 140 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 140 CONTINUE + DO 150 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J ) + 150 CONTINUE + DO 160 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 160 CONTINUE + 170 CONTINUE + ELSE IF( TYPE.EQ.'SP'.OR.TYPE.EQ.'TP' )THEN + IOFF = 0 + DO 190 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 180 I = IBEG, IEND + IOFF = IOFF + 1 + AA( IOFF ) = A( I, J ) + IF( I.EQ.J )THEN + IF( UNIT ) + $ AA( IOFF ) = ROGUE + END IF + 180 CONTINUE + 190 CONTINUE + END IF + RETURN +* +* End of SMAKE. +* + END + SUBROUTINE SMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL ALPHA, BETA, EPS, ERR + INTEGER INCX, INCY, M, N, NMAX, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANS +* .. Array Arguments .. + REAL A( NMAX, * ), G( * ), X( * ), Y( * ), YT( * ), + $ YY( * ) +* .. Local Scalars .. + REAL ERRI + INTEGER I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL + LOGICAL TRAN +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, SQRT +* .. Executable Statements .. + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF + IF( INCX.LT.0 )THEN + KX = NL + INCXL = -1 + ELSE + KX = 1 + INCXL = 1 + END IF + IF( INCY.LT.0 )THEN + KY = ML + INCYL = -1 + ELSE + KY = 1 + INCYL = 1 + END IF +* +* Compute expected result in YT using data in A, X and Y. +* Compute gauges in G. +* + IY = KY + DO 30 I = 1, ML + YT( IY ) = ZERO + G( IY ) = ZERO + JX = KX + IF( TRAN )THEN + DO 10 J = 1, NL + YT( IY ) = YT( IY ) + A( J, I )*X( JX ) + G( IY ) = G( IY ) + ABS( A( J, I )*X( JX ) ) + JX = JX + INCXL + 10 CONTINUE + ELSE + DO 20 J = 1, NL + YT( IY ) = YT( IY ) + A( I, J )*X( JX ) + G( IY ) = G( IY ) + ABS( A( I, J )*X( JX ) ) + JX = JX + INCXL + 20 CONTINUE + END IF + YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY ) + G( IY ) = ABS( ALPHA )*G( IY ) + ABS( BETA*Y( IY ) ) + IY = IY + INCYL + 30 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 40 I = 1, ML + ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS + IF( G( I ).NE.ZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.ONE ) + $ GO TO 50 + 40 CONTINUE +* If the loop completes, all results are at least half accurate. + GO TO 70 +* +* Report fatal error. +* + 50 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 60 I = 1, ML + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, YT( I ), + $ YY( 1 + ( I - 1 )*ABS( INCY ) ) + ELSE + WRITE( NOUT, FMT = 9998 )I, + $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT(I) + END IF + 60 CONTINUE +* + 70 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RESULT COMPU', + $ 'TED RESULT' ) + 9998 FORMAT( 1X, I7, 2G18.6 ) +* +* End of SMVCH. +* + END + LOGICAL FUNCTION LSE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + REAL RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LSE = .TRUE. + GO TO 30 + 20 CONTINUE + LSE = .FALSE. + 30 RETURN +* +* End of LSE. +* + END + LOGICAL FUNCTION LSERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'GE', 'SY' or 'SP'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + REAL AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'GE' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'SY' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LSERES = .TRUE. + GO TO 80 + 70 CONTINUE + LSERES = .FALSE. + 80 RETURN +* +* End of LSERES. +* + END + REAL FUNCTION SBEG( RESET ) +* +* Generates random numbers uniformly distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, MI +* .. Save statement .. + SAVE I, IC, MI +* .. Intrinsic Functions .. + INTRINSIC REAL +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + I = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I is bounded between 1 and 999. +* If initial I = 1,2,3,6,7 or 9, the period will be 50. +* If initial I = 4 or 8, the period will be 25. +* If initial I = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I in 6. +* + IC = IC + 1 + 10 I = I*MI + I = I - 1000*( I/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + SBEG = REAL( I - 500 )/1001.0 + RETURN +* +* End of SBEG. +* + END + REAL FUNCTION SDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* +* .. Scalar Arguments .. + REAL X, Y +* .. Executable Statements .. + SDIFF = X - Y + RETURN +* +* End of SDIFF. +* + END + SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* +* Tests whether XERBLA has detected an error when it should. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Executable Statements .. + IF( .NOT.LERR )THEN + WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT + OK = .FALSE. + END IF + LERR = .FALSE. + RETURN +* + 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', + $ 'ETECTED BY ', A6, ' *****' ) +* +* End of CHKXER. +* + END + SUBROUTINE XERBLA( SRNAME, INFO ) +* +* This is a special version of XERBLA to be used only as part of +* the test program for testing error exits from the Level 2 BLAS +* routines. +* +* XERBLA is an error handler for the Level 2 BLAS routines. +* +* It is called by the Level 2 BLAS routines if an input parameter is +* invalid. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER INFO + CHARACTER*6 SRNAME +* .. Scalars in Common .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUT, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Executable Statements .. + LERR = .TRUE. + IF( INFO.NE.INFOT )THEN + IF( INFOT.NE.0 )THEN + WRITE( NOUT, FMT = 9999 )INFO, INFOT + ELSE + WRITE( NOUT, FMT = 9997 )INFO + END IF + OK = .FALSE. + END IF + IF( SRNAME.NE.SRNAMT )THEN + WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT + OK = .FALSE. + END IF + RETURN +* + 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD', + $ ' OF ', I2, ' *******' ) + 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE', + $ 'AD OF ', A6, ' *******' ) + 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, + $ ' *******' ) +* +* End of XERBLA +* + END + diff --git a/test/sblat3.dat b/test/sblat3.dat new file mode 100644 index 0000000..98d36a5 --- /dev/null +++ b/test/sblat3.dat @@ -0,0 +1,20 @@ +'SBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE +6 UNIT NUMBER OF SUMMARY FILE +'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +16.0 THRESHOLD VALUE OF TEST RATIO +6 NUMBER OF VALUES OF N +0 1 2 3 7 31 63 VALUES OF N +3 NUMBER OF VALUES OF ALPHA +0.0 1.0 0.7 VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +0.0 1.0 1.3 VALUES OF BETA +SGEMM T PUT F FOR NO TEST. SAME COLUMNS. +SSYMM T PUT F FOR NO TEST. SAME COLUMNS. +STRMM T PUT F FOR NO TEST. SAME COLUMNS. +STRSM T PUT F FOR NO TEST. SAME COLUMNS. +SSYRK T PUT F FOR NO TEST. SAME COLUMNS. +SSYR2K T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/test/sblat3.f b/test/sblat3.f new file mode 100644 index 0000000..325a9eb --- /dev/null +++ b/test/sblat3.f @@ -0,0 +1,2823 @@ + PROGRAM SBLAT3 +* +* Test program for the REAL Level 3 Blas. +* +* The program must be driven by a short data file. The first 14 records +* of the file are read using list-directed input, the last 6 records +* are read using the format ( A6, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 20 lines: +* 'SBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE +* 6 UNIT NUMBER OF SUMMARY FILE +* 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 3 NUMBER OF VALUES OF ALPHA +* 0.0 1.0 0.7 VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* 0.0 1.0 1.3 VALUES OF BETA +* SGEMM T PUT F FOR NO TEST. SAME COLUMNS. +* SSYMM T PUT F FOR NO TEST. SAME COLUMNS. +* STRMM T PUT F FOR NO TEST. SAME COLUMNS. +* STRSM T PUT F FOR NO TEST. SAME COLUMNS. +* SSYRK T PUT F FOR NO TEST. SAME COLUMNS. +* SSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +* A Set of Level 3 Basic Linear Algebra Subprograms. +* +* Technical Memorandum No.88 (Revision 1), Mathematics and +* Computer Science Division, Argonne National Laboratory, 9700 +* South Cass Avenue, Argonne, Illinois 60439, US. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + INTEGER NIN + PARAMETER ( NIN = 5 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 6 ) + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) + INTEGER NMAX + PARAMETER ( NMAX = 65 ) + INTEGER NIDMAX, NALMAX, NBEMAX + PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + REAL EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NOUT, NTRA + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR + CHARACTER*1 TRANSA, TRANSB + CHARACTER*6 SNAMET + CHARACTER*32 SNAPS, SUMMRY +* .. Local Arrays .. + REAL AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), + $ BB( NMAX*NMAX ), BET( NBEMAX ), + $ BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ G( NMAX ), W( 2*NMAX ) + INTEGER IDIM( NIDMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*6 SNAMES( NSUBS ) +* .. External Functions .. + REAL SDIFF + LOGICAL LSE + EXTERNAL SDIFF, LSE +* .. External Subroutines .. + EXTERNAL SCHK1, SCHK2, SCHK3, SCHK4, SCHK5, SCHKE, SMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'SGEMM ', 'SSYMM ', 'STRMM ', 'STRSM ', + $ 'SSYRK ', 'SSYR2K'/ +* .. Executable Statements .. +* +* Read name and unit number for summary output file and open file. +* + READ( NIN, FMT = * )SUMMRY + READ( NIN, FMT = * )NOUT + OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 220 + END IF + 10 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9995 ) + WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9984 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 20 I = 1, NSUBS + LTEST( I ) = .FALSE. + 20 CONTINUE + 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT + DO 40 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 50 + 40 CONTINUE + WRITE( NOUT, FMT = 9990 )SNAMET + STOP + 50 LTEST( I ) = LTESTT + GO TO 30 +* + 60 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = ONE + 70 CONTINUE + IF( SDIFF( ONE + EPS, ONE ).EQ.ZERO ) + $ GO TO 80 + EPS = HALF*EPS + GO TO 70 + 80 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of SMMCH using exact data. +* + N = MIN( 32, NMAX ) + DO 100 J = 1, N + DO 90 I = 1, N + AB( I, J ) = MAX( I - J + 1, 0 ) + 90 CONTINUE + AB( J, NMAX + 1 ) = J + AB( 1, NMAX + J ) = J + C( J, 1 ) = ZERO + 100 CONTINUE + DO 110 J = 1, N + CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 110 CONTINUE +* CC holds the exact result. On exit from SMMCH CT holds +* the result computed by SMMCH. + TRANSA = 'N' + TRANSB = 'N' + CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'T' + CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + DO 120 J = 1, N + AB( J, NMAX + 1 ) = N - J + 1 + AB( 1, NMAX + J ) = N - J + 1 + 120 CONTINUE + DO 130 J = 1, N + CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - + $ ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE + TRANSA = 'T' + TRANSB = 'N' + CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'T' + CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 200 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL SCHKE( ISNUM, SNAMES( ISNUM ), NOUT ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 150, 160, 160, 170, 180 )ISNUM +* Test SGEMM, 01. + 140 CALL SCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test SSYMM, 02. + 150 CALL SCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test STRMM, 03, STRSM, 04. + 160 CALL SCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C ) + GO TO 190 +* Test SSYRK, 05. + 170 CALL SCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test SSYR2K, 06. + 180 CALL SCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) + GO TO 190 +* + 190 IF( FATAL.AND.SFATAL ) + $ GO TO 210 + END IF + 200 CONTINUE + WRITE( NOUT, FMT = 9986 ) + GO TO 230 +* + 210 CONTINUE + WRITE( NOUT, FMT = 9985 ) + GO TO 230 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9991 ) +* + 230 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' TESTS OF THE REAL LEVEL 3 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9994 FORMAT( ' FOR N ', 9I6 ) + 9993 FORMAT( ' FOR ALPHA ', 7F6.1 ) + 9992 FORMAT( ' FOR BETA ', 7F6.1 ) + 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9990 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9989 FORMAT( ' ERROR IN SMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' SMMCH WAS CALLED WITH TRANSA = ', A1, + $ ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', + $ 'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', + $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', + $ '*******' ) + 9988 FORMAT( A6, L2 ) + 9987 FORMAT( 1X, A6, ' WAS NOT TESTED' ) + 9986 FORMAT( /' END OF TESTS' ) + 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of SBLAT3. +* + END + SUBROUTINE SCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests SGEMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + REAL ALPHA, ALS, BETA, BLS, ERR, ERRMAX + INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, + $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, + $ MA, MB, MS, N, NA, NARGS, NB, NC, NS + LOGICAL NULL, RESET, SAME, TRANA, TRANB + CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SGEMM, SMAKE, SMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. +* + NARGS = 13 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 110 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICA = 1, 3 + TRANSA = ICH( ICA: ICA ) + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' +* + IF( TRANA )THEN + MA = K + NA = M + ELSE + MA = M + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL SMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICB = 1, 3 + TRANSB = ICH( ICB: ICB ) + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* + IF( TRANB )THEN + MB = N + NB = K + ELSE + MB = K + NB = N + END IF +* Set LDB to 1 more than minimum value if room. + LDB = MB + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 70 + LBB = LDB*NB +* +* Generate the matrix B. +* + CALL SMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB, + $ LDB, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL SMAKE( 'GE', ' ', ' ', M, N, C, NMAX, + $ CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANAS = TRANSA + TRANBS = TRANSB + MS = M + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, LDB, + $ BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL SGEMM( TRANSA, TRANSB, M, N, K, ALPHA, + $ AA, LDA, BB, LDB, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = TRANSA.EQ.TRANAS + ISAME( 2 ) = TRANSB.EQ.TRANBS + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LSE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LSE( BS, BB, LBB ) + ISAME( 10 ) = LDBS.EQ.LDB + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LSE( CS, CC, LCC ) + ELSE + ISAME( 12 ) = LSERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 13 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL SMMCH( TRANSA, TRANSB, M, N, K, + $ ALPHA, A, NMAX, B, NMAX, BETA, + $ C, NMAX, CT, G, CC, LDC, EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANSA, TRANSB, M, N, K, + $ ALPHA, LDA, LDB, BETA, LDC +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',''', A1, ''',', + $ 3( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', ', + $ 'C,', I3, ').' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK1. +* + END + SUBROUTINE SCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests SSYMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + REAL ALPHA, ALS, BETA, BLS, ERR, ERRMAX + INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, + $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, + $ NARGS, NC, NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 SIDE, SIDES, UPLO, UPLOS + CHARACTER*2 ICHS, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMMCH, SSYMM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHS/'LR'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 90 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 90 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 90 + LBB = LDB*N +* +* Generate the matrix B. +* + CALL SMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, + $ ZERO ) +* + DO 80 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' +* + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* +* Generate the symmetric matrix A. +* + CALL SMAKE( 'SY', UPLO, ' ', NA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL SMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, SIDE, + $ UPLO, M, N, ALPHA, LDA, LDB, BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL SSYMM( SIDE, UPLO, M, N, ALPHA, AA, LDA, + $ BB, LDB, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 110 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LSE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LSE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LSE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LSERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 110 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL SMMCH( 'N', 'N', M, N, M, ALPHA, A, + $ NMAX, B, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL SMMCH( 'N', 'N', M, N, N, ALPHA, B, + $ NMAX, A, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 120 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, M, N, ALPHA, LDA, + $ LDB, BETA, LDC +* + 120 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK2. +* + END + SUBROUTINE SCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, + $ B, BB, BS, CT, G, C ) +* +* Tests STRMM and STRSM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + REAL ALPHA, ALS, ERR, ERRMAX + INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, + $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, + $ NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, + $ UPLOS + CHARACTER*2 ICHD, ICHS, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMMCH, STRMM, STRSM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ +* .. Executable Statements .. +* + NARGS = 11 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* Set up zero matrix for SMMCH. + DO 20 J = 1, NMAX + DO 10 I = 1, NMAX + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE +* + DO 140 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 130 + LBB = LDB*N + NULL = M.LE.0.OR.N.LE.0 +* + DO 120 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 130 + LAA = LDA*NA +* + DO 110 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 100 ICT = 1, 3 + TRANSA = ICHT( ICT: ICT ) +* + DO 90 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + CALL SMAKE( 'TR', UPLO, DIAG, NA, NA, A, + $ NMAX, AA, LDA, RESET, ZERO ) +* +* Generate the matrix B. +* + CALL SMAKE( 'GE', ' ', ' ', M, N, B, NMAX, + $ BB, LDB, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + TRANAS = TRANSA + DIAGS = DIAG + MS = M + NS = N + ALS = ALPHA + DO 30 I = 1, LAA + AS( I ) = AA( I ) + 30 CONTINUE + LDAS = LDA + DO 40 I = 1, LBB + BS( I ) = BB( I ) + 40 CONTINUE + LDBS = LDB +* +* Call the subroutine. +* + IF( SNAME( 4: 5 ).EQ.'MM' )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB + IF( REWI ) + $ REWIND NTRA + CALL STRMM( SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, AA, LDA, BB, LDB ) + ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB + IF( REWI ) + $ REWIND NTRA + CALL STRSM( SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, AA, LDA, BB, LDB ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = TRANAS.EQ.TRANSA + ISAME( 4 ) = DIAGS.EQ.DIAG + ISAME( 5 ) = MS.EQ.M + ISAME( 6 ) = NS.EQ.N + ISAME( 7 ) = ALS.EQ.ALPHA + ISAME( 8 ) = LSE( AS, AA, LAA ) + ISAME( 9 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 10 ) = LSE( BS, BB, LBB ) + ELSE + ISAME( 10 ) = LSERES( 'GE', ' ', M, N, BS, + $ BB, LDB ) + END IF + ISAME( 11 ) = LDBS.EQ.LDB +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 50 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 50 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 4: 5 ).EQ.'MM' )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL SMMCH( TRANSA, 'N', M, N, M, + $ ALPHA, A, NMAX, B, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL SMMCH( 'N', TRANSA, M, N, N, + $ ALPHA, B, NMAX, A, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN +* +* Compute approximation to original +* matrix. +* + DO 70 J = 1, N + DO 60 I = 1, M + C( I, J ) = BB( I + ( J - 1 )* + $ LDB ) + BB( I + ( J - 1 )*LDB ) = ALPHA* + $ B( I, J ) + 60 CONTINUE + 70 CONTINUE +* + IF( LEFT )THEN + CALL SMMCH( TRANSA, 'N', M, N, M, + $ ONE, A, NMAX, C, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + ELSE + CALL SMMCH( 'N', TRANSA, M, N, N, + $ ONE, C, NMAX, A, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + END IF + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 150 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, LDA, LDB +* + 160 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(', 4( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ') .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK3. +* + END + SUBROUTINE SCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests SSYRK. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + REAL ALPHA, ALS, BETA, BETS, ERR, ERRMAX + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, + $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, + $ NARGS, NC, NS + LOGICAL NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, UPLO, UPLOS + CHARACTER*2 ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMMCH, SSYRK +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NTC'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 10 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL SMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL SMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + BETS = BETA + DO 20 I = 1, LCC + CS( I ) = CC( I ) + 20 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, + $ TRANS, N, K, ALPHA, LDA, BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL SSYRK( UPLO, TRANS, N, K, ALPHA, AA, LDA, + $ BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LSE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = BETS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LSE( CS, CC, LCC ) + ELSE + ISAME( 9 ) = LSERES( 'SY', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 10 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + JC = 1 + DO 40 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + CALL SMMCH( 'T', 'N', LJ, 1, K, ALPHA, + $ A( 1, JJ ), NMAX, + $ A( 1, J ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL SMMCH( 'N', 'T', LJ, 1, K, ALPHA, + $ A( JJ, 1 ), NMAX, + $ A( J, 1 ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + 40 CONTINUE + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, + $ LDA, BETA, LDC +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK4. +* + END + SUBROUTINE SCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) +* +* Tests SSYR2K. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + REAL AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), + $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), + $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ G( NMAX ), W( 2*NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + REAL ALPHA, ALS, BETA, BETS, ERR, ERRMAX + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, + $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, + $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS + LOGICAL NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, UPLO, UPLOS + CHARACTER*2 ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMMCH, SSYR2K +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NTC'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 130 + LCC = LDC*N + NULL = N.LE.0 +* + DO 120 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 110 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*NA +* +* Generate the matrix A. +* + IF( TRAN )THEN + CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA, + $ LDA, RESET, ZERO ) + ELSE + CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, + $ RESET, ZERO ) + END IF +* +* Generate the matrix B. +* + LDB = LDA + LBB = LAA + IF( TRAN )THEN + CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ), + $ 2*NMAX, BB, LDB, RESET, ZERO ) + ELSE + CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), + $ NMAX, BB, LDB, RESET, ZERO ) + END IF +* + DO 100 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 90 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 80 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL SMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BETS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, + $ TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL SSYR2K( UPLO, TRANS, N, K, ALPHA, AA, LDA, + $ BB, LDB, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LSE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LSE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BETS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LSE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LSERES( 'SY', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + JJAB = 1 + JC = 1 + DO 70 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + DO 50 I = 1, K + W( I ) = AB( ( J - 1 )*2*NMAX + K + + $ I ) + W( K + I ) = AB( ( J - 1 )*2*NMAX + + $ I ) + 50 CONTINUE + CALL SMMCH( 'T', 'N', LJ, 1, 2*K, + $ ALPHA, AB( JJAB ), 2*NMAX, + $ W, 2*NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + DO 60 I = 1, K + W( I ) = AB( ( K + I - 1 )*NMAX + + $ J ) + W( K + I ) = AB( ( I - 1 )*NMAX + + $ J ) + 60 CONTINUE + CALL SMMCH( 'N', 'N', LJ, 1, 2*K, + $ ALPHA, AB( JJ ), NMAX, W, + $ 2*NMAX, BETA, C( JJ, J ), + $ NMAX, CT, G, CC( JC ), LDC, + $ EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + IF( TRAN ) + $ JJAB = JJAB + 2*NMAX + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 140 + 70 CONTINUE + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, + $ LDA, LDB, BETA, LDC +* + 160 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK5. +* + END + SUBROUTINE SCHKE( ISNUM, SRNAMT, NOUT ) +* +* Tests the error exits from the Level 3 Blas. +* Requires a special version of the error-handling routine XERBLA. +* ALPHA, BETA, A, B and C should not need to be defined. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER ISNUM, NOUT + CHARACTER*6 SRNAMT +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Local Scalars .. + REAL ALPHA, BETA +* .. Local Arrays .. + REAL A( 2, 1 ), B( 2, 1 ), C( 2, 1 ) +* .. External Subroutines .. + EXTERNAL CHKXER, SGEMM, SSYMM, SSYR2K, SSYRK, STRMM, + $ STRSM +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Executable Statements .. +* OK is set to .FALSE. by the special version of XERBLA or by CHKXER +* if anything is wrong. + OK = .TRUE. +* LERR is set to .TRUE. by the special version of XERBLA each time +* it is called, and is then tested and re-set by CHKXER. + LERR = .FALSE. + GO TO ( 10, 20, 30, 40, 50, 60 )ISNUM + 10 INFOT = 1 + CALL SGEMM( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 1 + CALL SGEMM( '/', 'T', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SGEMM( 'N', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SGEMM( 'T', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SGEMM( 'N', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SGEMM( 'N', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SGEMM( 'T', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SGEMM( 'T', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SGEMM( 'N', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SGEMM( 'N', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SGEMM( 'T', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SGEMM( 'T', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SGEMM( 'N', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SGEMM( 'N', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SGEMM( 'T', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SGEMM( 'T', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGEMM( 'T', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SGEMM( 'N', 'N', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SGEMM( 'N', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SGEMM( 'T', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL SGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL SGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL SGEMM( 'T', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL SGEMM( 'T', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 70 + 20 INFOT = 1 + CALL SSYMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SSYMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SSYMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SSYMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SSYMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SSYMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SSYMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SSYMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SSYMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SSYMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL SSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL SSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL SSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL SSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL SSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL SSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 70 + 30 INFOT = 1 + CALL STRMM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL STRMM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL STRMM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL STRMM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRMM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRMM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRMM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRMM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRMM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRMM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRMM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRMM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRMM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRMM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRMM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRMM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRMM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRMM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRMM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRMM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRMM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRMM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRMM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRMM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRMM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRMM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRMM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRMM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 70 + 40 INFOT = 1 + CALL STRSM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL STRSM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL STRSM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL STRSM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRSM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRSM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRSM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRSM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRSM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRSM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRSM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRSM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRSM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRSM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRSM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRSM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRSM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRSM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRSM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRSM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRSM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRSM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRSM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRSM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRSM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRSM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRSM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRSM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 70 + 50 INFOT = 1 + CALL SSYRK( '/', 'N', 0, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SSYRK( 'U', '/', 0, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SSYRK( 'U', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SSYRK( 'U', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SSYRK( 'L', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SSYRK( 'L', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SSYRK( 'U', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SSYRK( 'U', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SSYRK( 'L', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SSYRK( 'L', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYRK( 'U', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYRK( 'U', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYRK( 'L', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYRK( 'L', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SSYRK( 'U', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SSYRK( 'U', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SSYRK( 'L', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SSYRK( 'L', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 70 + 60 INFOT = 1 + CALL SSYR2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SSYR2K( 'U', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SSYR2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SSYR2K( 'U', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SSYR2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SSYR2K( 'L', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SSYR2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SSYR2K( 'U', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SSYR2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SSYR2K( 'L', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYR2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYR2K( 'U', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYR2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYR2K( 'L', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL SSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL SSYR2K( 'U', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL SSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL SSYR2K( 'L', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL SSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL SSYR2K( 'U', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL SSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL SSYR2K( 'L', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* + 70 IF( OK )THEN + WRITE( NOUT, FMT = 9999 )SRNAMT + ELSE + WRITE( NOUT, FMT = 9998 )SRNAMT + END IF + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' ) + 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****', + $ '**' ) +* +* End of SCHKE. +* + END + SUBROUTINE SMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, + $ TRANSL ) +* +* Generates values for an M by N matrix A. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'GE', 'SY' or 'TR'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) + REAL ROGUE + PARAMETER ( ROGUE = -1.0E10 ) +* .. Scalar Arguments .. + REAL TRANSL + INTEGER LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + REAL A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + REAL SBEG + EXTERNAL SBEG +* .. Executable Statements .. + GEN = TYPE.EQ.'GE' + SYM = TYPE.EQ.'SY' + TRI = TYPE.EQ.'TR' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + A( I, J ) = SBEG( RESET ) + TRANSL + IF( I.NE.J )THEN +* Set some elements to zero + IF( N.GT.3.AND.J.EQ.N/2 ) + $ A( I, J ) = ZERO + IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'GE' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN + DO 90 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 60 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 70 CONTINUE + DO 80 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + END IF + RETURN +* +* End of SMAKE. +* + END + SUBROUTINE SMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, + $ NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL ALPHA, BETA, EPS, ERR + INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANSA, TRANSB +* .. Array Arguments .. + REAL A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ CC( LDCC, * ), CT( * ), G( * ) +* .. Local Scalars .. + REAL ERRI + INTEGER I, J, K + LOGICAL TRANA, TRANB +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, SQRT +* .. Executable Statements .. + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* +* Compute expected result, one column at a time, in CT using data +* in A, B and C. +* Compute gauges in G. +* + DO 120 J = 1, N +* + DO 10 I = 1, M + CT( I ) = ZERO + G( I ) = ZERO + 10 CONTINUE + IF( .NOT.TRANA.AND..NOT.TRANB )THEN + DO 30 K = 1, KK + DO 20 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( K, J ) + G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( K, J ) ) + 20 CONTINUE + 30 CONTINUE + ELSE IF( TRANA.AND..NOT.TRANB )THEN + DO 50 K = 1, KK + DO 40 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( K, J ) + G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( K, J ) ) + 40 CONTINUE + 50 CONTINUE + ELSE IF( .NOT.TRANA.AND.TRANB )THEN + DO 70 K = 1, KK + DO 60 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( J, K ) + G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( J, K ) ) + 60 CONTINUE + 70 CONTINUE + ELSE IF( TRANA.AND.TRANB )THEN + DO 90 K = 1, KK + DO 80 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( J, K ) + G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( J, K ) ) + 80 CONTINUE + 90 CONTINUE + END IF + DO 100 I = 1, M + CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) + G( I ) = ABS( ALPHA )*G( I ) + ABS( BETA )*ABS( C( I, J ) ) + 100 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 110 I = 1, M + ERRI = ABS( CT( I ) - CC( I, J ) )/EPS + IF( G( I ).NE.ZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.ONE ) + $ GO TO 130 + 110 CONTINUE +* + 120 CONTINUE +* +* If the loop completes, all results are at least half accurate. + GO TO 150 +* +* Report fatal error. +* + 130 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 140 I = 1, M + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) + ELSE + WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) + END IF + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9997 )J +* + 150 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RESULT COMPU', + $ 'TED RESULT' ) + 9998 FORMAT( 1X, I7, 2G18.6 ) + 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) +* +* End of SMMCH. +* + END + LOGICAL FUNCTION LSE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + REAL RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LSE = .TRUE. + GO TO 30 + 20 CONTINUE + LSE = .FALSE. + 30 RETURN +* +* End of LSE. +* + END + LOGICAL FUNCTION LSERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'GE' or 'SY'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + REAL AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'GE' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'SY' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LSERES = .TRUE. + GO TO 80 + 70 CONTINUE + LSERES = .FALSE. + 80 RETURN +* +* End of LSERES. +* + END + REAL FUNCTION SBEG( RESET ) +* +* Generates random numbers uniformly distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, MI +* .. Save statement .. + SAVE I, IC, MI +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + I = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I is bounded between 1 and 999. +* If initial I = 1,2,3,6,7 or 9, the period will be 50. +* If initial I = 4 or 8, the period will be 25. +* If initial I = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I in 6. +* + IC = IC + 1 + 10 I = I*MI + I = I - 1000*( I/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + SBEG = ( I - 500 )/1001.0 + RETURN +* +* End of SBEG. +* + END + REAL FUNCTION SDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + REAL X, Y +* .. Executable Statements .. + SDIFF = X - Y + RETURN +* +* End of SDIFF. +* + END + SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* +* Tests whether XERBLA has detected an error when it should. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Executable Statements .. + IF( .NOT.LERR )THEN + WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT + OK = .FALSE. + END IF + LERR = .FALSE. + RETURN +* + 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', + $ 'ETECTED BY ', A6, ' *****' ) +* +* End of CHKXER. +* + END + SUBROUTINE XERBLA( SRNAME, INFO ) +* +* This is a special version of XERBLA to be used only as part of +* the test program for testing error exits from the Level 3 BLAS +* routines. +* +* XERBLA is an error handler for the Level 3 BLAS routines. +* +* It is called by the Level 3 BLAS routines if an input parameter is +* invalid. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER INFO + CHARACTER*6 SRNAME +* .. Scalars in Common .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUT, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Executable Statements .. + LERR = .TRUE. + IF( INFO.NE.INFOT )THEN + IF( INFOT.NE.0 )THEN + WRITE( NOUT, FMT = 9999 )INFO, INFOT + ELSE + WRITE( NOUT, FMT = 9997 )INFO + END IF + OK = .FALSE. + END IF + IF( SRNAME.NE.SRNAMT )THEN + WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT + OK = .FALSE. + END IF + RETURN +* + 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD', + $ ' OF ', I2, ' *******' ) + 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE', + $ 'AD OF ', A6, ' *******' ) + 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, + $ ' *******' ) +* +* End of XERBLA +* + END + diff --git a/test/zblat1.f b/test/zblat1.f new file mode 100644 index 0000000..e2415e1 --- /dev/null +++ b/test/zblat1.f @@ -0,0 +1,681 @@ + PROGRAM ZBLAT1 +* Test program for the COMPLEX*16 Level 1 BLAS. +* Based upon the original BLAS test routine together with: +* F06GAF Example Program Text +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SFAC + INTEGER IC +* .. External Subroutines .. + EXTERNAL CHECK1, CHECK2, HEADER +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SFAC/9.765625D-4/ +* .. Executable Statements .. + WRITE (NOUT,99999) + DO 20 IC = 1, 10 + ICASE = IC + CALL HEADER +* +* Initialize PASS, INCX, INCY, and MODE for a new case. +* The value 9999 for INCX, INCY or MODE will appear in the +* detailed output, if any, for cases that do not involve +* these parameters. +* + PASS = .TRUE. + INCX = 9999 + INCY = 9999 + MODE = 9999 + IF (ICASE.LE.5) THEN + CALL CHECK2(SFAC) + ELSE IF (ICASE.GE.6) THEN + CALL CHECK1(SFAC) + END IF +* -- Print + IF (PASS) WRITE (NOUT,99998) + 20 CONTINUE + STOP +* +99999 FORMAT (' Complex BLAS Test Program Results',/1X) +99998 FORMAT (' ----- PASS -----') + END + SUBROUTINE HEADER +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Arrays .. + CHARACTER*6 L(10) +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA L(1)/'ZDOTC '/ + DATA L(2)/'ZDOTU '/ + DATA L(3)/'ZAXPY '/ + DATA L(4)/'ZCOPY '/ + DATA L(5)/'ZSWAP '/ + DATA L(6)/'DZNRM2'/ + DATA L(7)/'DZASUM'/ + DATA L(8)/'ZSCAL '/ + DATA L(9)/'ZDSCAL'/ + DATA L(10)/'IZAMAX'/ +* .. Executable Statements .. + WRITE (NOUT,99999) ICASE, L(ICASE) + RETURN +* +99999 FORMAT (/' Test of subprogram number',I3,12X,A6) + END + SUBROUTINE CHECK1(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + COMPLEX*16 CA + DOUBLE PRECISION SA + INTEGER I, J, LEN, NP1 +* .. Local Arrays .. + COMPLEX*16 CTRUE5(8,5,2), CTRUE6(8,5,2), CV(8,5,2), CX(8), + + MWPCS(5), MWPCT(5) + DOUBLE PRECISION STRUE2(5), STRUE4(5) + INTEGER ITRUE3(5) +* .. External Functions .. + DOUBLE PRECISION DZASUM, DZNRM2 + INTEGER IZAMAX + EXTERNAL DZASUM, DZNRM2, IZAMAX +* .. External Subroutines .. + EXTERNAL ZSCAL, ZDSCAL, CTEST, ITEST1, STEST1 +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA, CA/0.3D0, (0.4D0,-0.7D0)/ + DATA ((CV(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (0.3D0,-0.4D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (0.1D0,-0.3D0), (0.5D0,-0.1D0), (5.0D0,6.0D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (0.1D0,0.1D0), + + (-0.6D0,0.1D0), (0.1D0,-0.3D0), (7.0D0,8.0D0), + + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + + (7.0D0,8.0D0), (0.3D0,0.1D0), (0.1D0,0.4D0), + + (0.4D0,0.1D0), (0.1D0,0.2D0), (2.0D0,3.0D0), + + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0)/ + DATA ((CV(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (0.3D0,-0.4D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (0.1D0,-0.3D0), (8.0D0,9.0D0), (0.5D0,-0.1D0), + + (2.0D0,5.0D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + + (2.0D0,5.0D0), (2.0D0,5.0D0), (0.1D0,0.1D0), + + (3.0D0,6.0D0), (-0.6D0,0.1D0), (4.0D0,7.0D0), + + (0.1D0,-0.3D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + + (7.0D0,2.0D0), (0.3D0,0.1D0), (5.0D0,8.0D0), + + (0.1D0,0.4D0), (6.0D0,9.0D0), (0.4D0,0.1D0), + + (8.0D0,3.0D0), (0.1D0,0.2D0), (9.0D0,4.0D0)/ + DATA STRUE2/0.0D0, 0.5D0, 0.6D0, 0.7D0, 0.7D0/ + DATA STRUE4/0.0D0, 0.7D0, 1.0D0, 1.3D0, 1.7D0/ + DATA ((CTRUE5(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (-0.16D0,-0.37D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (-0.17D0,-0.19D0), (0.13D0,-0.39D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + + (0.11D0,-0.03D0), (-0.17D0,0.46D0), + + (-0.17D0,-0.19D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + + (0.19D0,-0.17D0), (0.32D0,0.09D0), + + (0.23D0,-0.24D0), (0.18D0,0.01D0), + + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0), + + (2.0D0,3.0D0)/ + DATA ((CTRUE5(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (-0.16D0,-0.37D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (-0.17D0,-0.19D0), (8.0D0,9.0D0), + + (0.13D0,-0.39D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + + (2.0D0,5.0D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + + (0.11D0,-0.03D0), (3.0D0,6.0D0), + + (-0.17D0,0.46D0), (4.0D0,7.0D0), + + (-0.17D0,-0.19D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + + (7.0D0,2.0D0), (0.19D0,-0.17D0), (5.0D0,8.0D0), + + (0.32D0,0.09D0), (6.0D0,9.0D0), + + (0.23D0,-0.24D0), (8.0D0,3.0D0), + + (0.18D0,0.01D0), (9.0D0,4.0D0)/ + DATA ((CTRUE6(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (0.09D0,-0.12D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (0.03D0,-0.09D0), (0.15D0,-0.03D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + + (0.03D0,0.03D0), (-0.18D0,0.03D0), + + (0.03D0,-0.09D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + + (0.09D0,0.03D0), (0.03D0,0.12D0), + + (0.12D0,0.03D0), (0.03D0,0.06D0), (2.0D0,3.0D0), + + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0)/ + DATA ((CTRUE6(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (0.09D0,-0.12D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (0.03D0,-0.09D0), (8.0D0,9.0D0), + + (0.15D0,-0.03D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + + (2.0D0,5.0D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + + (0.03D0,0.03D0), (3.0D0,6.0D0), + + (-0.18D0,0.03D0), (4.0D0,7.0D0), + + (0.03D0,-0.09D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + + (7.0D0,2.0D0), (0.09D0,0.03D0), (5.0D0,8.0D0), + + (0.03D0,0.12D0), (6.0D0,9.0D0), (0.12D0,0.03D0), + + (8.0D0,3.0D0), (0.03D0,0.06D0), (9.0D0,4.0D0)/ + DATA ITRUE3/0, 1, 2, 2, 2/ +* .. Executable Statements .. + DO 60 INCX = 1, 2 + DO 40 NP1 = 1, 5 + N = NP1 - 1 + LEN = 2*MAX(N,1) +* .. Set vector arguments .. + DO 20 I = 1, LEN + CX(I) = CV(I,NP1,INCX) + 20 CONTINUE + IF (ICASE.EQ.6) THEN +* .. DZNRM2 .. + CALL STEST1(DZNRM2(N,CX,INCX),STRUE2(NP1),STRUE2(NP1), + + SFAC) + ELSE IF (ICASE.EQ.7) THEN +* .. DZASUM .. + CALL STEST1(DZASUM(N,CX,INCX),STRUE4(NP1),STRUE4(NP1), + + SFAC) + ELSE IF (ICASE.EQ.8) THEN +* .. ZSCAL .. + CALL ZSCAL(N,CA,CX,INCX) + CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX), + + SFAC) + ELSE IF (ICASE.EQ.9) THEN +* .. ZDSCAL .. + CALL ZDSCAL(N,SA,CX,INCX) + CALL CTEST(LEN,CX,CTRUE6(1,NP1,INCX),CTRUE6(1,NP1,INCX), + + SFAC) + ELSE IF (ICASE.EQ.10) THEN +* .. IZAMAX .. + CALL ITEST1(IZAMAX(N,CX,INCX),ITRUE3(NP1)) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' + STOP + END IF +* + 40 CONTINUE + 60 CONTINUE +* + INCX = 1 + IF (ICASE.EQ.8) THEN +* ZSCAL +* Add a test for alpha equal to zero. + CA = (0.0D0,0.0D0) + DO 80 I = 1, 5 + MWPCT(I) = (0.0D0,0.0D0) + MWPCS(I) = (1.0D0,1.0D0) + 80 CONTINUE + CALL ZSCAL(5,CA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) + ELSE IF (ICASE.EQ.9) THEN +* ZDSCAL +* Add a test for alpha equal to zero. + SA = 0.0D0 + DO 100 I = 1, 5 + MWPCT(I) = (0.0D0,0.0D0) + MWPCS(I) = (1.0D0,1.0D0) + 100 CONTINUE + CALL ZDSCAL(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) +* Add a test for alpha equal to one. + SA = 1.0D0 + DO 120 I = 1, 5 + MWPCT(I) = CX(I) + MWPCS(I) = CX(I) + 120 CONTINUE + CALL ZDSCAL(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) +* Add a test for alpha equal to minus one. + SA = -1.0D0 + DO 140 I = 1, 5 + MWPCT(I) = -CX(I) + MWPCS(I) = -CX(I) + 140 CONTINUE + CALL ZDSCAL(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) + END IF + RETURN + END + SUBROUTINE CHECK2(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + COMPLEX*16 CA + INTEGER I, J, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + COMPLEX*16 CDOT(1), CSIZE1(4), CSIZE2(7,2), CSIZE3(14), + + CT10X(7,4,4), CT10Y(7,4,4), CT6(4,4), CT7(4,4), + + CT8(7,4,4), CX(7), CX1(7), CY(7), CY1(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), NS(4) +* .. External Functions .. + COMPLEX*16 ZDOTC, ZDOTU + EXTERNAL ZDOTC, ZDOTU +* .. External Subroutines .. + EXTERNAL ZAXPY, ZCOPY, ZSWAP, CTEST +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA CA/(0.4D0,-0.7D0)/ + DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA CX1/(0.7D0,-0.8D0), (-0.4D0,-0.7D0), + + (-0.1D0,-0.9D0), (0.2D0,-0.8D0), + + (-0.9D0,-0.4D0), (0.1D0,0.4D0), (-0.6D0,0.6D0)/ + DATA CY1/(0.6D0,-0.6D0), (-0.9D0,0.5D0), + + (0.7D0,-0.6D0), (0.1D0,-0.5D0), (-0.1D0,-0.2D0), + + (-0.5D0,-0.3D0), (0.8D0,-0.7D0)/ + DATA ((CT8(I,J,1),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.32D0,-1.41D0), + + (-1.55D0,0.5D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.32D0,-1.41D0), (-1.55D0,0.5D0), + + (0.03D0,-0.89D0), (-0.38D0,-0.96D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/ + DATA ((CT8(I,J,2),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (-0.07D0,-0.89D0), + + (-0.9D0,0.5D0), (0.42D0,-1.41D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.78D0,0.06D0), (-0.9D0,0.5D0), + + (0.06D0,-0.13D0), (0.1D0,-0.5D0), + + (-0.77D0,-0.49D0), (-0.5D0,-0.3D0), + + (0.52D0,-1.51D0)/ + DATA ((CT8(I,J,3),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (-0.07D0,-0.89D0), + + (-1.18D0,-0.31D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.78D0,0.06D0), (-1.54D0,0.97D0), + + (0.03D0,-0.89D0), (-0.18D0,-1.31D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/ + DATA ((CT8(I,J,4),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.32D0,-1.41D0), (-0.9D0,0.5D0), + + (0.05D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.32D0,-1.41D0), + + (-0.9D0,0.5D0), (0.05D0,-0.6D0), (0.1D0,-0.5D0), + + (-0.77D0,-0.49D0), (-0.5D0,-0.3D0), + + (0.32D0,-1.16D0)/ + DATA CT7/(0.0D0,0.0D0), (-0.06D0,-0.90D0), + + (0.65D0,-0.47D0), (-0.34D0,-1.22D0), + + (0.0D0,0.0D0), (-0.06D0,-0.90D0), + + (-0.59D0,-1.46D0), (-1.04D0,-0.04D0), + + (0.0D0,0.0D0), (-0.06D0,-0.90D0), + + (-0.83D0,0.59D0), (0.07D0,-0.37D0), + + (0.0D0,0.0D0), (-0.06D0,-0.90D0), + + (-0.76D0,-1.15D0), (-1.33D0,-1.82D0)/ + DATA CT6/(0.0D0,0.0D0), (0.90D0,0.06D0), + + (0.91D0,-0.77D0), (1.80D0,-0.10D0), + + (0.0D0,0.0D0), (0.90D0,0.06D0), (1.45D0,0.74D0), + + (0.20D0,0.90D0), (0.0D0,0.0D0), (0.90D0,0.06D0), + + (-0.55D0,0.23D0), (0.83D0,-0.39D0), + + (0.0D0,0.0D0), (0.90D0,0.06D0), (1.04D0,0.79D0), + + (1.95D0,1.22D0)/ + DATA ((CT10X(I,J,1),I=1,7),J=1,4)/(0.7D0,-0.8D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.6D0,-0.6D0), (-0.9D0,0.5D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.6D0,-0.6D0), + + (-0.9D0,0.5D0), (0.7D0,-0.6D0), (0.1D0,-0.5D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/ + DATA ((CT10X(I,J,2),I=1,7),J=1,4)/(0.7D0,-0.8D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.7D0,-0.6D0), (-0.4D0,-0.7D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.8D0,-0.7D0), + + (-0.4D0,-0.7D0), (-0.1D0,-0.2D0), + + (0.2D0,-0.8D0), (0.7D0,-0.6D0), (0.1D0,0.4D0), + + (0.6D0,-0.6D0)/ + DATA ((CT10X(I,J,3),I=1,7),J=1,4)/(0.7D0,-0.8D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (-0.9D0,0.5D0), (-0.4D0,-0.7D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.1D0,-0.5D0), + + (-0.4D0,-0.7D0), (0.7D0,-0.6D0), (0.2D0,-0.8D0), + + (-0.9D0,0.5D0), (0.1D0,0.4D0), (0.6D0,-0.6D0)/ + DATA ((CT10X(I,J,4),I=1,7),J=1,4)/(0.7D0,-0.8D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.6D0,-0.6D0), (0.7D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.6D0,-0.6D0), + + (0.7D0,-0.6D0), (-0.1D0,-0.2D0), (0.8D0,-0.7D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/ + DATA ((CT10Y(I,J,1),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.7D0,-0.8D0), (-0.4D0,-0.7D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.7D0,-0.8D0), + + (-0.4D0,-0.7D0), (-0.1D0,-0.9D0), + + (0.2D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0)/ + DATA ((CT10Y(I,J,2),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (-0.1D0,-0.9D0), (-0.9D0,0.5D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (-0.6D0,0.6D0), + + (-0.9D0,0.5D0), (-0.9D0,-0.4D0), (0.1D0,-0.5D0), + + (-0.1D0,-0.9D0), (-0.5D0,-0.3D0), + + (0.7D0,-0.8D0)/ + DATA ((CT10Y(I,J,3),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (-0.1D0,-0.9D0), (0.7D0,-0.8D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (-0.6D0,0.6D0), + + (-0.9D0,-0.4D0), (-0.1D0,-0.9D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0)/ + DATA ((CT10Y(I,J,4),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.7D0,-0.8D0), (-0.9D0,0.5D0), + + (-0.4D0,-0.7D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.7D0,-0.8D0), + + (-0.9D0,0.5D0), (-0.4D0,-0.7D0), (0.1D0,-0.5D0), + + (-0.1D0,-0.9D0), (-0.5D0,-0.3D0), + + (0.2D0,-0.8D0)/ + DATA CSIZE1/(0.0D0,0.0D0), (0.9D0,0.9D0), + + (1.63D0,1.73D0), (2.90D0,2.78D0)/ + DATA CSIZE3/(0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (1.17D0,1.17D0), + + (1.17D0,1.17D0), (1.17D0,1.17D0), + + (1.17D0,1.17D0), (1.17D0,1.17D0), + + (1.17D0,1.17D0), (1.17D0,1.17D0)/ + DATA CSIZE2/(0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (1.54D0,1.54D0), + + (1.54D0,1.54D0), (1.54D0,1.54D0), + + (1.54D0,1.54D0), (1.54D0,1.54D0), + + (1.54D0,1.54D0), (1.54D0,1.54D0)/ +* .. Executable Statements .. + DO 60 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 40 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* .. initialize all argument arrays .. + DO 20 I = 1, 7 + CX(I) = CX1(I) + CY(I) = CY1(I) + 20 CONTINUE + IF (ICASE.EQ.1) THEN +* .. ZDOTC .. + CDOT(1) = ZDOTC(N,CX,INCX,CY,INCY) + CALL CTEST(1,CDOT,CT6(KN,KI),CSIZE1(KN),SFAC) + ELSE IF (ICASE.EQ.2) THEN +* .. ZDOTU .. + CDOT(1) = ZDOTU(N,CX,INCX,CY,INCY) + CALL CTEST(1,CDOT,CT7(KN,KI),CSIZE1(KN),SFAC) + ELSE IF (ICASE.EQ.3) THEN +* .. ZAXPY .. + CALL ZAXPY(N,CA,CX,INCX,CY,INCY) + CALL CTEST(LENY,CY,CT8(1,KN,KI),CSIZE2(1,KSIZE),SFAC) + ELSE IF (ICASE.EQ.4) THEN +* .. ZCOPY .. + CALL ZCOPY(N,CX,INCX,CY,INCY) + CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0D0) + ELSE IF (ICASE.EQ.5) THEN +* .. ZSWAP .. + CALL ZSWAP(N,CX,INCX,CY,INCY) + CALL CTEST(LENX,CX,CT10X(1,KN,KI),CSIZE3,1.0D0) + CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0D0) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' + STOP + END IF +* + 40 CONTINUE + 60 CONTINUE + RETURN + END + SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) +* ********************************* STEST ************************** +* +* THIS SUBR COMPARES ARRAYS SCOMP() AND STRUE() OF LENGTH LEN TO +* SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE +* NEGLIGIBLE. +* +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC + INTEGER LEN +* .. Array Arguments .. + DOUBLE PRECISION SCOMP(LEN), SSIZE(LEN), STRUE(LEN) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SD + INTEGER I +* .. External Functions .. + DOUBLE PRECISION SDIFF + EXTERNAL SDIFF +* .. Intrinsic Functions .. + INTRINSIC ABS +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + DO 40 I = 1, LEN + SD = SCOMP(I) - STRUE(I) + IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0D0) + + GO TO 40 +* +* HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I), + + STRUE(I), SD, SSIZE(I) + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE I ', + + ' COMP(I) TRUE(I) DIFFERENCE', + + ' SIZE(I)',/1X) +99997 FORMAT (1X,I4,I3,3I5,I3,2D36.8,2D12.4) + END + SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) +* ************************* STEST1 ***************************** +* +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE +* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + DOUBLE PRECISION SCOMP1, SFAC, STRUE1 +* .. Array Arguments .. + DOUBLE PRECISION SSIZE(*) +* .. Local Arrays .. + DOUBLE PRECISION SCOMP(1), STRUE(1) +* .. External Subroutines .. + EXTERNAL STEST +* .. Executable Statements .. +* + SCOMP(1) = SCOMP1 + STRUE(1) = STRUE1 + CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC) +* + RETURN + END + DOUBLE PRECISION FUNCTION SDIFF(SA,SB) +* ********************************* SDIFF ************************** +* COMPUTES DIFFERENCE OF TWO NUMBERS. C. L. LAWSON, JPL 1974 FEB 15 +* +* .. Scalar Arguments .. + DOUBLE PRECISION SA, SB +* .. Executable Statements .. + SDIFF = SA - SB + RETURN + END + SUBROUTINE CTEST(LEN,CCOMP,CTRUE,CSIZE,SFAC) +* **************************** CTEST ***************************** +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC + INTEGER LEN +* .. Array Arguments .. + COMPLEX*16 CCOMP(LEN), CSIZE(LEN), CTRUE(LEN) +* .. Local Scalars .. + INTEGER I +* .. Local Arrays .. + DOUBLE PRECISION SCOMP(20), SSIZE(20), STRUE(20) +* .. External Subroutines .. + EXTERNAL STEST +* .. Intrinsic Functions .. + INTRINSIC DIMAG, DBLE +* .. Executable Statements .. + DO 20 I = 1, LEN + SCOMP(2*I-1) = DBLE(CCOMP(I)) + SCOMP(2*I) = DIMAG(CCOMP(I)) + STRUE(2*I-1) = DBLE(CTRUE(I)) + STRUE(2*I) = DIMAG(CTRUE(I)) + SSIZE(2*I-1) = DBLE(CSIZE(I)) + SSIZE(2*I) = DIMAG(CSIZE(I)) + 20 CONTINUE +* + CALL STEST(2*LEN,SCOMP,STRUE,SSIZE,SFAC) + RETURN + END + SUBROUTINE ITEST1(ICOMP,ITRUE) +* ********************************* ITEST1 ************************* +* +* THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR +* EQUALITY. +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + INTEGER ICOMP, ITRUE +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER ID +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. + IF (ICOMP.EQ.ITRUE) GO TO 40 +* +* HERE ICOMP IS NOT EQUAL TO ITRUE. +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 ID = ICOMP - ITRUE + WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE ', + + ' COMP TRUE DIFFERENCE', + + /1X) +99997 FORMAT (1X,I4,I3,3I5,2I36,I12) + END diff --git a/test/zblat2.dat b/test/zblat2.dat new file mode 100644 index 0000000..69a9f15 --- /dev/null +++ b/test/zblat2.dat @@ -0,0 +1,35 @@ +'ZBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE +6 UNIT NUMBER OF SUMMARY FILE +'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +16.0 THRESHOLD VALUE OF TEST RATIO +7 NUMBER OF VALUES OF N +0 1 2 3 7 31 63 VALUES OF N +4 NUMBER OF VALUES OF K +0 1 2 4 VALUES OF K +4 NUMBER OF VALUES OF INCX AND INCY +1 2 -1 -2 VALUES OF INCX AND INCY +3 NUMBER OF VALUES OF ALPHA +(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +ZGEMV T PUT F FOR NO TEST. SAME COLUMNS. +ZGBMV T PUT F FOR NO TEST. SAME COLUMNS. +ZHEMV T PUT F FOR NO TEST. SAME COLUMNS. +ZHBMV T PUT F FOR NO TEST. SAME COLUMNS. +ZHPMV T PUT F FOR NO TEST. SAME COLUMNS. +ZTRMV T PUT F FOR NO TEST. SAME COLUMNS. +ZTBMV T PUT F FOR NO TEST. SAME COLUMNS. +ZTPMV T PUT F FOR NO TEST. SAME COLUMNS. +ZTRSV T PUT F FOR NO TEST. SAME COLUMNS. +ZTBSV T PUT F FOR NO TEST. SAME COLUMNS. +ZTPSV T PUT F FOR NO TEST. SAME COLUMNS. +ZGERC T PUT F FOR NO TEST. SAME COLUMNS. +ZGERU T PUT F FOR NO TEST. SAME COLUMNS. +ZHER T PUT F FOR NO TEST. SAME COLUMNS. +ZHPR T PUT F FOR NO TEST. SAME COLUMNS. +ZHER2 T PUT F FOR NO TEST. SAME COLUMNS. +ZHPR2 T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/test/zblat2.f b/test/zblat2.f new file mode 100644 index 0000000..e65cdcc --- /dev/null +++ b/test/zblat2.f @@ -0,0 +1,3249 @@ + PROGRAM ZBLAT2 +* +* Test program for the COMPLEX*16 Level 2 Blas. +* +* The program must be driven by a short data file. The first 18 records +* of the file are read using list-directed input, the last 17 records +* are read using the format ( A6, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 35 lines: +* 'ZBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE +* 6 UNIT NUMBER OF SUMMARY FILE +* 'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 4 NUMBER OF VALUES OF K +* 0 1 2 4 VALUES OF K +* 4 NUMBER OF VALUES OF INCX AND INCY +* 1 2 -1 -2 VALUES OF INCX AND INCY +* 3 NUMBER OF VALUES OF ALPHA +* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +* ZGEMV T PUT F FOR NO TEST. SAME COLUMNS. +* ZGBMV T PUT F FOR NO TEST. SAME COLUMNS. +* ZHEMV T PUT F FOR NO TEST. SAME COLUMNS. +* ZHBMV T PUT F FOR NO TEST. SAME COLUMNS. +* ZHPMV T PUT F FOR NO TEST. SAME COLUMNS. +* ZTRMV T PUT F FOR NO TEST. SAME COLUMNS. +* ZTBMV T PUT F FOR NO TEST. SAME COLUMNS. +* ZTPMV T PUT F FOR NO TEST. SAME COLUMNS. +* ZTRSV T PUT F FOR NO TEST. SAME COLUMNS. +* ZTBSV T PUT F FOR NO TEST. SAME COLUMNS. +* ZTPSV T PUT F FOR NO TEST. SAME COLUMNS. +* ZGERC T PUT F FOR NO TEST. SAME COLUMNS. +* ZGERU T PUT F FOR NO TEST. SAME COLUMNS. +* ZHER T PUT F FOR NO TEST. SAME COLUMNS. +* ZHPR T PUT F FOR NO TEST. SAME COLUMNS. +* ZHER2 T PUT F FOR NO TEST. SAME COLUMNS. +* ZHPR2 T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +* An extended set of Fortran Basic Linear Algebra Subprograms. +* +* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +* and Computer Science Division, Argonne National Laboratory, +* 9700 South Cass Avenue, Argonne, Illinois 60439, US. +* +* Or +* +* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +* +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + INTEGER NIN + PARAMETER ( NIN = 5 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 17 ) + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO, RHALF, RONE + PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 ) + INTEGER NMAX, INCMAX + PARAMETER ( NMAX = 65, INCMAX = 2 ) + INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX + PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, + $ NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + DOUBLE PRECISION EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB, + $ NOUT, NTRA + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR + CHARACTER*1 TRANS + CHARACTER*6 SNAMET + CHARACTER*32 SNAPS, SUMMRY +* .. Local Arrays .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( 2*NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*6 SNAMES( NSUBS ) +* .. External Functions .. + DOUBLE PRECISION DDIFF + LOGICAL LZE + EXTERNAL DDIFF, LZE +* .. External Subroutines .. + EXTERNAL ZCHK1, ZCHK2, ZCHK3, ZCHK4, ZCHK5, ZCHK6, + $ ZCHKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'ZGEMV ', 'ZGBMV ', 'ZHEMV ', 'ZHBMV ', + $ 'ZHPMV ', 'ZTRMV ', 'ZTBMV ', 'ZTPMV ', + $ 'ZTRSV ', 'ZTBSV ', 'ZTPSV ', 'ZGERC ', + $ 'ZGERU ', 'ZHER ', 'ZHPR ', 'ZHER2 ', + $ 'ZHPR2 '/ +* .. Executable Statements .. +* +* Read name and unit number for summary output file and open file. +* + READ( NIN, FMT = * )SUMMRY + READ( NIN, FMT = * )NOUT + OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 230 + END IF + 10 CONTINUE +* Values of K + READ( NIN, FMT = * )NKB + IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN + WRITE( NOUT, FMT = 9997 )'K', NKBMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( KB( I ), I = 1, NKB ) + DO 20 I = 1, NKB + IF( KB( I ).LT.0 )THEN + WRITE( NOUT, FMT = 9995 ) + GO TO 230 + END IF + 20 CONTINUE +* Values of INCX and INCY + READ( NIN, FMT = * )NINC + IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN + WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( INC( I ), I = 1, NINC ) + DO 30 I = 1, NINC + IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN + WRITE( NOUT, FMT = 9994 )INCMAX + GO TO 230 + END IF + 30 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9993 ) + WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB ) + WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC ) + WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9980 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 40 I = 1, NSUBS + LTEST( I ) = .FALSE. + 40 CONTINUE + 50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT + DO 60 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 70 + 60 CONTINUE + WRITE( NOUT, FMT = 9986 )SNAMET + STOP + 70 LTEST( I ) = LTESTT + GO TO 50 +* + 80 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = RONE + 90 CONTINUE + IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO ) + $ GO TO 100 + EPS = RHALF*EPS + GO TO 90 + 100 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of ZMVCH using exact data. +* + N = MIN( 32, NMAX ) + DO 120 J = 1, N + DO 110 I = 1, N + A( I, J ) = MAX( I - J + 1, 0 ) + 110 CONTINUE + X( J ) = J + Y( J ) = ZERO + 120 CONTINUE + DO 130 J = 1, N + YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE +* YY holds the exact result. On exit from ZMVCH YT holds +* the result computed by ZMVCH. + TRANS = 'N' + CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF + TRANS = 'T' + CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 210 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL ZCHKE( ISNUM, SNAMES( ISNUM ), NOUT ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 140, 150, 150, 150, 160, 160, + $ 160, 160, 160, 160, 170, 170, 180, + $ 180, 190, 190 )ISNUM +* Test ZGEMV, 01, and ZGBMV, 02. + 140 CALL ZCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G ) + GO TO 200 +* Test ZHEMV, 03, ZHBMV, 04, and ZHPMV, 05. + 150 CALL ZCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G ) + GO TO 200 +* Test ZTRMV, 06, ZTBMV, 07, ZTPMV, 08, +* ZTRSV, 09, ZTBSV, 10, and ZTPSV, 11. + 160 CALL ZCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z ) + GO TO 200 +* Test ZGERC, 12, ZGERU, 13. + 170 CALL ZCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z ) + GO TO 200 +* Test ZHER, 14, and ZHPR, 15. + 180 CALL ZCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z ) + GO TO 200 +* Test ZHER2, 16, and ZHPR2, 17. + 190 CALL ZCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z ) +* + 200 IF( FATAL.AND.SFATAL ) + $ GO TO 220 + END IF + 210 CONTINUE + WRITE( NOUT, FMT = 9982 ) + GO TO 240 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9981 ) + GO TO 240 +* + 230 CONTINUE + WRITE( NOUT, FMT = 9987 ) +* + 240 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, D9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' ) + 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ', + $ I2 ) + 9993 FORMAT( ' TESTS OF THE COMPLEX*16 LEVEL 2 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9992 FORMAT( ' FOR N ', 9I6 ) + 9991 FORMAT( ' FOR K ', 7I6 ) + 9990 FORMAT( ' FOR INCX AND INCY ', 7I6 ) + 9989 FORMAT( ' FOR ALPHA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9988 FORMAT( ' FOR BETA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9986 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9985 FORMAT( ' ERROR IN ZMVCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' ZMVCH WAS CALLED WITH TRANS = ', A1, + $ ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', / + $ ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.' + $ , /' ******* TESTS ABANDONED *******' ) + 9984 FORMAT( A6, L2 ) + 9983 FORMAT( 1X, A6, ' WAS NOT TESTED' ) + 9982 FORMAT( /' END OF TESTS' ) + 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of ZBLAT2. +* + END + SUBROUTINE ZCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G ) +* +* Tests ZGEMV and ZGBMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, HALF + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ HALF = ( 0.5D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BLS, TRANSL + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA, + $ LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK, + $ NL, NS + LOGICAL BANDED, FULL, NULL, RESET, SAME, TRAN + CHARACTER*1 TRANS, TRANSS + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZGBMV, ZGEMV, ZMAKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'E' + BANDED = SNAME( 3: 3 ).EQ.'B' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 11 + ELSE IF( BANDED )THEN + NARGS = 13 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IKU = 1, NK + IF( BANDED )THEN + KU = KB( IKU ) + KL = MAX( KU - 1, 0 ) + ELSE + KU = N - 1 + KL = M - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = KL + KU + 1 + ELSE + LDA = M + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX, AA, + $ LDA, KL, KU, RESET, TRANSL ) +* + DO 90 IC = 1, 3 + TRANS = ICH( IC: IC ) + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' +* + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*NL +* +* Generate the vector X. +* + TRANSL = HALF + CALL ZMAKE( 'GE', ' ', ' ', 1, NL, X, 1, XX, + $ ABS( INCX ), 0, NL - 1, RESET, TRANSL ) + IF( NL.GT.1 )THEN + X( NL/2 ) = ZERO + XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*ML +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL ZMAKE( 'GE', ' ', ' ', 1, ML, Y, 1, + $ YY, ABS( INCY ), 0, ML - 1, + $ RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANSS = TRANS + MS = M + NS = N + KLS = KL + KUS = KU + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ TRANS, M, N, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL ZGEMV( TRANS, M, N, ALPHA, AA, + $ LDA, XX, INCX, BETA, YY, + $ INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ TRANS, M, N, KL, KU, ALPHA, LDA, + $ INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL ZGBMV( TRANS, M, N, KL, KU, ALPHA, + $ AA, LDA, XX, INCX, BETA, + $ YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 130 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = TRANS.EQ.TRANSS + ISAME( 2 ) = MS.EQ.M + ISAME( 3 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LZE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LZE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LZE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LZERES( 'GE', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 4 ) = KLS.EQ.KL + ISAME( 5 ) = KUS.EQ.KU + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LZE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LZE( XS, XX, LX ) + ISAME( 10 ) = INCXS.EQ.INCX + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LZE( YS, YY, LY ) + ELSE + ISAME( 12 ) = LZERES( 'GE', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 13 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 130 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL ZMVCH( TRANS, M, N, ALPHA, A, + $ NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 130 + ELSE +* Avoid repeating tests with M.le.0 or +* N.le.0. + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 140 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, TRANS, M, N, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANS, M, N, KL, KU, + $ ALPHA, LDA, INCX, BETA, INCY + END IF +* + 140 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 4( I3, ',' ), '(', + $ F4.1, ',', F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',', + $ F4.1, '), Y,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), '(', + $ F4.1, ',', F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',', + $ F4.1, '), Y,', I2, ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK1. +* + END + SUBROUTINE ZCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G ) +* +* Tests ZHEMV, ZHBMV and ZHPMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, HALF + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ HALF = ( 0.5D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BLS, TRANSL + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IB, IC, IK, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY, + $ N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 UPLO, UPLOS + CHARACTER*2 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZHBMV, ZHEMV, ZHPMV, ZMAKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'E' + BANDED = SNAME( 3: 3 ).EQ.'B' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 10 + ELSE IF( BANDED )THEN + NARGS = 11 + ELSE IF( PACKED )THEN + NARGS = 9 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX, AA, + $ LDA, K, K, RESET, TRANSL ) +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL ZMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL ZMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + UPLOS = UPLO + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ UPLO, N, ALPHA, LDA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL ZHEMV( UPLO, N, ALPHA, AA, LDA, XX, + $ INCX, BETA, YY, INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ UPLO, N, K, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL ZHBMV( UPLO, N, K, ALPHA, AA, LDA, + $ XX, INCX, BETA, YY, INCY ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ UPLO, N, ALPHA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL ZHPMV( UPLO, N, ALPHA, AA, XX, INCX, + $ BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LZE( AS, AA, LAA ) + ISAME( 5 ) = LDAS.EQ.LDA + ISAME( 6 ) = LZE( XS, XX, LX ) + ISAME( 7 ) = INCXS.EQ.INCX + ISAME( 8 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LZE( YS, YY, LY ) + ELSE + ISAME( 9 ) = LZERES( 'GE', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 10 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 3 ) = KS.EQ.K + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LZE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LZE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LZE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LZERES( 'GE', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( PACKED )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LZE( AS, AA, LAA ) + ISAME( 5 ) = LZE( XS, XX, LX ) + ISAME( 6 ) = INCXS.EQ.INCX + ISAME( 7 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 8 ) = LZE( YS, YY, LY ) + ELSE + ISAME( 8 ) = LZERES( 'GE', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 9 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL ZMVCH( 'N', N, N, ALPHA, A, NMAX, X, + $ INCX, BETA, Y, INCY, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0 + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, LDA, INCX, + $ BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, K, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, N, ALPHA, INCX, + $ BETA, INCY + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',', + $ F4.1, '), AP, X,', I2, ',(', F4.1, ',', F4.1, '), Y,', I2, + $ ') .' ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), '(', + $ F4.1, ',', F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',', + $ F4.1, '), Y,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',', + $ F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',', F4.1, '), ', + $ 'Y,', I2, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK2. +* + END + SUBROUTINE ZCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, XT, G, Z ) +* +* Tests ZTRMV, ZTBMV, ZTPMV, ZTRSV, ZTBSV and ZTPSV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ HALF = ( 0.5D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XT( NMAX ), XX( NMAX*INCMAX ), Z( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + COMPLEX*16 TRANSL + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K, + $ KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS + CHARACTER*2 ICHD, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZMAKE, ZMVCH, ZTBMV, ZTBSV, ZTPMV, ZTPSV, + $ ZTRMV, ZTRSV +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'R' + BANDED = SNAME( 3: 3 ).EQ.'B' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 8 + ELSE IF( BANDED )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 7 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* Set up zero vector for ZMVCH. + DO 10 I = 1, NMAX + Z( I ) = ZERO + 10 CONTINUE +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) +* + DO 70 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE( SNAME( 2: 3 ), UPLO, DIAG, N, N, A, + $ NMAX, AA, LDA, K, K, RESET, TRANSL ) +* + DO 60 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL ZMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, + $ TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + DIAGS = DIAG + NS = N + KS = K + DO 20 I = 1, LAA + AS( I ) = AA( I ) + 20 CONTINUE + LDAS = LDA + DO 30 I = 1, LX + XS( I ) = XX( I ) + 30 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( SNAME( 4: 5 ).EQ.'MV' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL ZTRMV( UPLO, TRANS, DIAG, N, AA, LDA, + $ XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL ZTBMV( UPLO, TRANS, DIAG, N, K, AA, + $ LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL ZTPMV( UPLO, TRANS, DIAG, N, AA, XX, + $ INCX ) + END IF + ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL ZTRSV( UPLO, TRANS, DIAG, N, AA, LDA, + $ XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL ZTBSV( UPLO, TRANS, DIAG, N, K, AA, + $ LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL ZTPSV( UPLO, TRANS, DIAG, N, AA, XX, + $ INCX ) + END IF + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = TRANS.EQ.TRANSS + ISAME( 3 ) = DIAG.EQ.DIAGS + ISAME( 4 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 5 ) = LZE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 7 ) = LZE( XS, XX, LX ) + ELSE + ISAME( 7 ) = LZERES( 'GE', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 8 ) = INCXS.EQ.INCX + ELSE IF( BANDED )THEN + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = LZE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 8 ) = LZE( XS, XX, LX ) + ELSE + ISAME( 8 ) = LZERES( 'GE', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 9 ) = INCXS.EQ.INCX + ELSE IF( PACKED )THEN + ISAME( 5 ) = LZE( AS, AA, LAA ) + IF( NULL )THEN + ISAME( 6 ) = LZE( XS, XX, LX ) + ELSE + ISAME( 6 ) = LZERES( 'GE', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 7 ) = INCXS.EQ.INCX + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 4: 5 ).EQ.'MV' )THEN +* +* Check the result. +* + CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, + $ INCX, ZERO, Z, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN +* +* Compute approximation to original vector. +* + DO 50 I = 1, N + Z( I ) = XX( 1 + ( I - 1 )* + $ ABS( INCX ) ) + XX( 1 + ( I - 1 )*ABS( INCX ) ) + $ = X( I ) + 50 CONTINUE + CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, Z, + $ INCX, ZERO, X, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .FALSE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0. + GO TO 110 + END IF +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, DIAG, N, LDA, + $ INCX + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, DIAG, N, K, + $ LDA, INCX + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, TRANS, DIAG, N, INCX + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', AP, ', + $ 'X,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), 2( I3, ',' ), + $ ' A,', I3, ', X,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', A,', + $ I3, ', X,', I2, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK3. +* + END + SUBROUTINE ZCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z ) +* +* Tests ZGERC and ZGERU. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ HALF = ( 0.5D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, TRANSL + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS, + $ NC, ND, NS + LOGICAL CONJ, NULL, RESET, SAME +* .. Local Arrays .. + COMPLEX*16 W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZGERC, ZGERU, ZMAKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, DCONJG, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Executable Statements .. + CONJ = SNAME( 5: 5 ).EQ.'C' +* Define the number of arguments. + NARGS = 9 +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* +* Set LDA to 1 more than minimum value if room. + LDA = M + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 100 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*M +* +* Generate the vector X. +* + TRANSL = HALF + CALL ZMAKE( 'GE', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ), + $ 0, M - 1, RESET, TRANSL ) + IF( M.GT.1 )THEN + X( M/2 ) = ZERO + XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO + END IF +* + DO 90 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL ZMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX, + $ AA, LDA, M - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N, + $ ALPHA, INCX, INCY, LDA + IF( CONJ )THEN + IF( REWI ) + $ REWIND NTRA + CALL ZGERC( M, N, ALPHA, XX, INCX, YY, INCY, AA, + $ LDA ) + ELSE + IF( REWI ) + $ REWIND NTRA + CALL ZGERU( M, N, ALPHA, XX, INCX, YY, INCY, AA, + $ LDA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 140 + END IF +* +* See what data changed inside subroutine. +* + ISAME( 1 ) = MS.EQ.M + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LZE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LZE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LZE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LZERES( 'GE', ' ', M, N, AS, AA, + $ LDA ) + END IF + ISAME( 9 ) = LDAS.EQ.LDA +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 140 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, M + Z( I ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, M + Z( I ) = X( M - I + 1 ) + 60 CONTINUE + END IF + DO 70 J = 1, N + IF( INCY.GT.0 )THEN + W( 1 ) = Y( J ) + ELSE + W( 1 ) = Y( N - J + 1 ) + END IF + IF( CONJ ) + $ W( 1 ) = DCONJG( W( 1 ) ) + CALL ZMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1, + $ ONE, A( 1, J ), 1, YT, G, + $ AA( 1 + ( J - 1 )*LDA ), EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 130 + 70 CONTINUE + ELSE +* Avoid repeating tests with M.le.0 or N.le.0. + GO TO 110 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 150 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 140 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA +* + 150 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( I3, ',' ), '(', F4.1, ',', F4.1, + $ '), X,', I2, ', Y,', I2, ', A,', I3, ') ', + $ ' .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK4. +* + END + SUBROUTINE ZCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z ) +* +* Tests ZHER and ZHPR. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ HALF = ( 0.5D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, TRANSL + DOUBLE PRECISION ERR, ERRMAX, RALPHA, RALS + INTEGER I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA, + $ LDA, LDAS, LJ, LX, N, NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*2 ICH +* .. Local Arrays .. + COMPLEX*16 W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZHER, ZHPR, ZMAKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, DCMPLX, DCONJG, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'E' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 7 + ELSE IF( PACKED )THEN + NARGS = 6 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + UPPER = UPLO.EQ.'U' +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL ZMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IA = 1, NALF + RALPHA = DBLE( ALF( IA ) ) + ALPHA = DCMPLX( RALPHA, RZERO ) + NULL = N.LE.0.OR.RALPHA.EQ.RZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX, + $ AA, LDA, N - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + RALS = RALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N, + $ RALPHA, INCX, LDA + IF( REWI ) + $ REWIND NTRA + CALL ZHER( UPLO, N, RALPHA, XX, INCX, AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N, + $ RALPHA, INCX + IF( REWI ) + $ REWIND NTRA + CALL ZHPR( UPLO, N, RALPHA, XX, INCX, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = RALS.EQ.RALPHA + ISAME( 4 ) = LZE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + IF( NULL )THEN + ISAME( 6 ) = LZE( AS, AA, LAA ) + ELSE + ISAME( 6 ) = LZERES( SNAME( 2: 3 ), UPLO, N, N, AS, + $ AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 7 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 40 I = 1, N + Z( I ) = X( I ) + 40 CONTINUE + ELSE + DO 50 I = 1, N + Z( I ) = X( N - I + 1 ) + 50 CONTINUE + END IF + JA = 1 + DO 60 J = 1, N + W( 1 ) = DCONJG( Z( J ) ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL ZMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W, + $ 1, ONE, A( JJ, J ), 1, YT, G, + $ AA( JA ), EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 110 + 60 CONTINUE + ELSE +* Avoid repeating tests if N.le.0. + IF( N.LE.0 ) + $ GO TO 100 + END IF +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, RALPHA, INCX, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, RALPHA, INCX + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', + $ I2, ', AP) .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', + $ I2, ', A,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK5. +* + END + SUBROUTINE ZCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z ) +* +* Tests ZHER2 and ZHPR2. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ HALF = ( 0.5D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX, 2 ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, TRANSL + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N, + $ NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*2 ICH +* .. Local Arrays .. + COMPLEX*16 W( 2 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZHER2, ZHPR2, ZMAKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, DCONJG, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'E' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 8 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 140 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 140 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 130 IC = 1, 2 + UPLO = ICH( IC: IC ) + UPPER = UPLO.EQ.'U' +* + DO 120 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL ZMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 110 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL ZMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 100 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, + $ NMAX, AA, LDA, N - 1, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL ZHER2( UPLO, N, ALPHA, XX, INCX, YY, INCY, + $ AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N, + $ ALPHA, INCX, INCY + IF( REWI ) + $ REWIND NTRA + CALL ZHPR2( UPLO, N, ALPHA, XX, INCX, YY, INCY, + $ AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 160 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LZE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LZE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LZE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LZERES( SNAME( 2: 3 ), UPLO, N, N, + $ AS, AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 9 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 160 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, N + Z( I, 1 ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, N + Z( I, 1 ) = X( N - I + 1 ) + 60 CONTINUE + END IF + IF( INCY.GT.0 )THEN + DO 70 I = 1, N + Z( I, 2 ) = Y( I ) + 70 CONTINUE + ELSE + DO 80 I = 1, N + Z( I, 2 ) = Y( N - I + 1 ) + 80 CONTINUE + END IF + JA = 1 + DO 90 J = 1, N + W( 1 ) = ALPHA*DCONJG( Z( J, 2 ) ) + W( 2 ) = DCONJG( ALPHA )*DCONJG( Z( J, 1 ) ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL ZMVCH( 'N', LJ, 2, ONE, Z( JJ, 1 ), + $ NMAX, W, 1, ONE, A( JJ, J ), 1, + $ YT, G, AA( JA ), EPS, ERR, FATAL, + $ NOUT, .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 150 + 90 CONTINUE + ELSE +* Avoid repeating tests with N.le.0. + IF( N.LE.0 ) + $ GO TO 140 + END IF +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 170 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 160 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, INCX, + $ INCY, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, ALPHA, INCX, INCY + END IF +* + 170 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',', + $ F4.1, '), X,', I2, ', Y,', I2, ', AP) ', + $ ' .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',', + $ F4.1, '), X,', I2, ', Y,', I2, ', A,', I3, ') ', + $ ' .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK6. +* + END + SUBROUTINE ZCHKE( ISNUM, SRNAMT, NOUT ) +* +* Tests the error exits from the Level 2 Blas. +* Requires a special version of the error-handling routine XERBLA. +* ALPHA, RALPHA, BETA, A, X and Y should not need to be defined. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER ISNUM, NOUT + CHARACTER*6 SRNAMT +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Local Scalars .. + COMPLEX*16 ALPHA, BETA + DOUBLE PRECISION RALPHA +* .. Local Arrays .. + COMPLEX*16 A( 1, 1 ), X( 1 ), Y( 1 ) +* .. External Subroutines .. + EXTERNAL CHKXER, ZGBMV, ZGEMV, ZGERC, ZGERU, ZHBMV, + $ ZHEMV, ZHER, ZHER2, ZHPMV, ZHPR, ZHPR2, ZTBMV, + $ ZTBSV, ZTPMV, ZTPSV, ZTRMV, ZTRSV +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Executable Statements .. +* OK is set to .FALSE. by the special version of XERBLA or by CHKXER +* if anything is wrong. + OK = .TRUE. +* LERR is set to .TRUE. by the special version of XERBLA each time +* it is called, and is then tested and re-set by CHKXER. + LERR = .FALSE. + GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, + $ 90, 100, 110, 120, 130, 140, 150, 160, + $ 170 )ISNUM + 10 INFOT = 1 + CALL ZGEMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGEMV( 'N', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMV( 'N', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZGEMV( 'N', 2, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGEMV( 'N', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZGEMV( 'N', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 20 INFOT = 1 + CALL ZGBMV( '/', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGBMV( 'N', -1, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGBMV( 'N', 0, -1, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGBMV( 'N', 0, 0, -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGBMV( 'N', 2, 0, 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGBMV( 'N', 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL ZGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 30 INFOT = 1 + CALL ZHEMV( '/', 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHEMV( 'U', -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZHEMV( 'U', 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHEMV( 'U', 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZHEMV( 'U', 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 40 INFOT = 1 + CALL ZHBMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHBMV( 'U', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHBMV( 'U', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZHBMV( 'U', 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZHBMV( 'U', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZHBMV( 'U', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 50 INFOT = 1 + CALL ZHPMV( '/', 0, ALPHA, A, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHPMV( 'U', -1, ALPHA, A, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZHPMV( 'U', 0, ALPHA, A, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZHPMV( 'U', 0, ALPHA, A, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 60 INFOT = 1 + CALL ZTRMV( '/', 'N', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZTRMV( 'U', '/', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZTRMV( 'U', 'N', '/', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZTRMV( 'U', 'N', 'N', -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMV( 'U', 'N', 'N', 2, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZTRMV( 'U', 'N', 'N', 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 70 INFOT = 1 + CALL ZTBMV( '/', 'N', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZTBMV( 'U', '/', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZTBMV( 'U', 'N', '/', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZTBMV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTBMV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZTBMV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTBMV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 80 INFOT = 1 + CALL ZTPMV( '/', 'N', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZTPMV( 'U', '/', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZTPMV( 'U', 'N', '/', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZTPMV( 'U', 'N', 'N', -1, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZTPMV( 'U', 'N', 'N', 0, A, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 90 INFOT = 1 + CALL ZTRSV( '/', 'N', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZTRSV( 'U', '/', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZTRSV( 'U', 'N', '/', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZTRSV( 'U', 'N', 'N', -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSV( 'U', 'N', 'N', 2, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZTRSV( 'U', 'N', 'N', 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 100 INFOT = 1 + CALL ZTBSV( '/', 'N', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZTBSV( 'U', '/', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZTBSV( 'U', 'N', '/', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZTBSV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTBSV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZTBSV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTBSV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 110 INFOT = 1 + CALL ZTPSV( '/', 'N', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZTPSV( 'U', '/', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZTPSV( 'U', 'N', '/', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZTPSV( 'U', 'N', 'N', -1, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZTPSV( 'U', 'N', 'N', 0, A, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 120 INFOT = 1 + CALL ZGERC( -1, 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGERC( 0, -1, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGERC( 0, 0, ALPHA, X, 0, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZGERC( 0, 0, ALPHA, X, 1, Y, 0, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZGERC( 2, 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 130 INFOT = 1 + CALL ZGERU( -1, 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGERU( 0, -1, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGERU( 0, 0, ALPHA, X, 0, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZGERU( 0, 0, ALPHA, X, 1, Y, 0, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZGERU( 2, 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 140 INFOT = 1 + CALL ZHER( '/', 0, RALPHA, X, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHER( 'U', -1, RALPHA, X, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZHER( 'U', 0, RALPHA, X, 0, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHER( 'U', 2, RALPHA, X, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 150 INFOT = 1 + CALL ZHPR( '/', 0, RALPHA, X, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHPR( 'U', -1, RALPHA, X, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZHPR( 'U', 0, RALPHA, X, 0, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 160 INFOT = 1 + CALL ZHER2( '/', 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHER2( 'U', -1, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZHER2( 'U', 0, ALPHA, X, 0, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHER2( 'U', 0, ALPHA, X, 1, Y, 0, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZHER2( 'U', 2, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 170 INFOT = 1 + CALL ZHPR2( '/', 0, ALPHA, X, 1, Y, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHPR2( 'U', -1, ALPHA, X, 1, Y, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZHPR2( 'U', 0, ALPHA, X, 0, Y, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHPR2( 'U', 0, ALPHA, X, 1, Y, 0, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* + 180 IF( OK )THEN + WRITE( NOUT, FMT = 9999 )SRNAMT + ELSE + WRITE( NOUT, FMT = 9998 )SRNAMT + END IF + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' ) + 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****', + $ '**' ) +* +* End of ZCHKE. +* + END + SUBROUTINE ZMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL, + $ KU, RESET, TRANSL ) +* +* Generates values for an M by N matrix A within the bandwidth +* defined by KL and KU. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'GE', 'GB', 'HE', 'HB', 'HP', 'TR', 'TB' OR 'TP'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + COMPLEX*16 ROGUE + PARAMETER ( ROGUE = ( -1.0D10, 1.0D10 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) + DOUBLE PRECISION RROGUE + PARAMETER ( RROGUE = -1.0D10 ) +* .. Scalar Arguments .. + COMPLEX*16 TRANSL + INTEGER KL, KU, LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX*16 A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, I1, I2, I3, IBEG, IEND, IOFF, J, JJ, KK + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + COMPLEX*16 ZBEG + EXTERNAL ZBEG +* .. Intrinsic Functions .. + INTRINSIC DBLE, DCMPLX, DCONJG, MAX, MIN +* .. Executable Statements .. + GEN = TYPE( 1: 1 ).EQ.'G' + SYM = TYPE( 1: 1 ).EQ.'H' + TRI = TYPE( 1: 1 ).EQ.'T' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + IF( ( I.LE.J.AND.J - I.LE.KU ).OR. + $ ( I.GE.J.AND.I - J.LE.KL ) )THEN + A( I, J ) = ZBEG( RESET ) + TRANSL + ELSE + A( I, J ) = ZERO + END IF + IF( I.NE.J )THEN + IF( SYM )THEN + A( J, I ) = DCONJG( A( I, J ) ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( SYM ) + $ A( J, J ) = DCMPLX( DBLE( A( J, J ) ), RZERO ) + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'GE' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'GB' )THEN + DO 90 J = 1, N + DO 60 I1 = 1, KU + 1 - J + AA( I1 + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J ) + AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J ) + 70 CONTINUE + DO 80 I3 = I2, LDA + AA( I3 + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'TR' )THEN + DO 130 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 100 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 100 CONTINUE + DO 110 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 110 CONTINUE + DO 120 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 120 CONTINUE + IF( SYM )THEN + JJ = J + ( J - 1 )*LDA + AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE ) + END IF + 130 CONTINUE + ELSE IF( TYPE.EQ.'HB'.OR.TYPE.EQ.'TB' )THEN + DO 170 J = 1, N + IF( UPPER )THEN + KK = KL + 1 + IBEG = MAX( 1, KL + 2 - J ) + IF( UNIT )THEN + IEND = KL + ELSE + IEND = KL + 1 + END IF + ELSE + KK = 1 + IF( UNIT )THEN + IBEG = 2 + ELSE + IBEG = 1 + END IF + IEND = MIN( KL + 1, 1 + M - J ) + END IF + DO 140 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 140 CONTINUE + DO 150 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J ) + 150 CONTINUE + DO 160 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 160 CONTINUE + IF( SYM )THEN + JJ = KK + ( J - 1 )*LDA + AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE ) + END IF + 170 CONTINUE + ELSE IF( TYPE.EQ.'HP'.OR.TYPE.EQ.'TP' )THEN + IOFF = 0 + DO 190 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 180 I = IBEG, IEND + IOFF = IOFF + 1 + AA( IOFF ) = A( I, J ) + IF( I.EQ.J )THEN + IF( UNIT ) + $ AA( IOFF ) = ROGUE + IF( SYM ) + $ AA( IOFF ) = DCMPLX( DBLE( AA( IOFF ) ), RROGUE ) + END IF + 180 CONTINUE + 190 CONTINUE + END IF + RETURN +* +* End of ZMAKE. +* + END + SUBROUTINE ZMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO, RONE + PARAMETER ( RZERO = 0.0D0, RONE = 1.0D0 ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA, BETA + DOUBLE PRECISION EPS, ERR + INTEGER INCX, INCY, M, N, NMAX, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANS +* .. Array Arguments .. + COMPLEX*16 A( NMAX, * ), X( * ), Y( * ), YT( * ), YY( * ) + DOUBLE PRECISION G( * ) +* .. Local Scalars .. + COMPLEX*16 C + DOUBLE PRECISION ERRI + INTEGER I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL + LOGICAL CTRAN, TRAN +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, DCONJG, DIMAG, MAX, SQRT +* .. Statement Functions .. + DOUBLE PRECISION ABS1 +* .. Statement Function definitions .. + ABS1( C ) = ABS( DBLE( C ) ) + ABS( DIMAG( C ) ) +* .. Executable Statements .. + TRAN = TRANS.EQ.'T' + CTRAN = TRANS.EQ.'C' + IF( TRAN.OR.CTRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF + IF( INCX.LT.0 )THEN + KX = NL + INCXL = -1 + ELSE + KX = 1 + INCXL = 1 + END IF + IF( INCY.LT.0 )THEN + KY = ML + INCYL = -1 + ELSE + KY = 1 + INCYL = 1 + END IF +* +* Compute expected result in YT using data in A, X and Y. +* Compute gauges in G. +* + IY = KY + DO 40 I = 1, ML + YT( IY ) = ZERO + G( IY ) = RZERO + JX = KX + IF( TRAN )THEN + DO 10 J = 1, NL + YT( IY ) = YT( IY ) + A( J, I )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 10 CONTINUE + ELSE IF( CTRAN )THEN + DO 20 J = 1, NL + YT( IY ) = YT( IY ) + DCONJG( A( J, I ) )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 20 CONTINUE + ELSE + DO 30 J = 1, NL + YT( IY ) = YT( IY ) + A( I, J )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( I, J ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 30 CONTINUE + END IF + YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY ) + G( IY ) = ABS1( ALPHA )*G( IY ) + ABS1( BETA )*ABS1( Y( IY ) ) + IY = IY + INCYL + 40 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 50 I = 1, ML + ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS + IF( G( I ).NE.RZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.RONE ) + $ GO TO 60 + 50 CONTINUE +* If the loop completes, all results are at least half accurate. + GO TO 80 +* +* Report fatal error. +* + 60 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 70 I = 1, ML + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, YT( I ), + $ YY( 1 + ( I - 1 )*ABS( INCY ) ) + ELSE + WRITE( NOUT, FMT = 9998 )I, + $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT( I ) + END IF + 70 CONTINUE +* + 80 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RE', + $ 'SULT COMPUTED RESULT' ) + 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) +* +* End of ZMVCH. +* + END + LOGICAL FUNCTION LZE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + COMPLEX*16 RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LZE = .TRUE. + GO TO 30 + 20 CONTINUE + LZE = .FALSE. + 30 RETURN +* +* End of LZE. +* + END + LOGICAL FUNCTION LZERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'GE', 'HE' or 'HP'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX*16 AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'GE' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'HE' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LZERES = .TRUE. + GO TO 80 + 70 CONTINUE + LZERES = .FALSE. + 80 RETURN +* +* End of LZERES. +* + END + COMPLEX*16 FUNCTION ZBEG( RESET ) +* +* Generates complex numbers as pairs of random numbers uniformly +* distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, J, MI, MJ +* .. Save statement .. + SAVE I, IC, J, MI, MJ +* .. Intrinsic Functions .. + INTRINSIC DCMPLX +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + MJ = 457 + I = 7 + J = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I or J is bounded between 1 and 999. +* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. +* If initial I or J = 4 or 8, the period will be 25. +* If initial I or J = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I or J +* in 6. +* + IC = IC + 1 + 10 I = I*MI + J = J*MJ + I = I - 1000*( I/1000 ) + J = J - 1000*( J/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + ZBEG = DCMPLX( ( I - 500 )/1001.0D0, ( J - 500 )/1001.0D0 ) + RETURN +* +* End of ZBEG. +* + END + DOUBLE PRECISION FUNCTION DDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* +* .. Scalar Arguments .. + DOUBLE PRECISION X, Y +* .. Executable Statements .. + DDIFF = X - Y + RETURN +* +* End of DDIFF. +* + END + SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* +* Tests whether XERBLA has detected an error when it should. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Executable Statements .. + IF( .NOT.LERR )THEN + WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT + OK = .FALSE. + END IF + LERR = .FALSE. + RETURN +* + 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', + $ 'ETECTED BY ', A6, ' *****' ) +* +* End of CHKXER. +* + END + SUBROUTINE XERBLA( SRNAME, INFO ) +* +* This is a special version of XERBLA to be used only as part of +* the test program for testing error exits from the Level 2 BLAS +* routines. +* +* XERBLA is an error handler for the Level 2 BLAS routines. +* +* It is called by the Level 2 BLAS routines if an input parameter is +* invalid. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER INFO + CHARACTER*6 SRNAME +* .. Scalars in Common .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUT, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Executable Statements .. + LERR = .TRUE. + IF( INFO.NE.INFOT )THEN + IF( INFOT.NE.0 )THEN + WRITE( NOUT, FMT = 9999 )INFO, INFOT + ELSE + WRITE( NOUT, FMT = 9997 )INFO + END IF + OK = .FALSE. + END IF + IF( SRNAME.NE.SRNAMT )THEN + WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT + OK = .FALSE. + END IF + RETURN +* + 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD', + $ ' OF ', I2, ' *******' ) + 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE', + $ 'AD OF ', A6, ' *******' ) + 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, + $ ' *******' ) +* +* End of XERBLA +* + END + diff --git a/test/zblat3.dat b/test/zblat3.dat new file mode 100644 index 0000000..c02ac4f --- /dev/null +++ b/test/zblat3.dat @@ -0,0 +1,23 @@ +'ZBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE +6 UNIT NUMBER OF SUMMARY FILE +'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +F LOGICAL FLAG, T TO TEST ERROR EXITS. +16.0 THRESHOLD VALUE OF TEST RATIO +6 NUMBER OF VALUES OF N +0 1 2 3 7 31 63 VALUES OF N +3 NUMBER OF VALUES OF ALPHA +(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +ZGEMM T PUT F FOR NO TEST. SAME COLUMNS. +ZHEMM T PUT F FOR NO TEST. SAME COLUMNS. +ZSYMM T PUT F FOR NO TEST. SAME COLUMNS. +ZTRMM T PUT F FOR NO TEST. SAME COLUMNS. +ZTRSM T PUT F FOR NO TEST. SAME COLUMNS. +ZHERK T PUT F FOR NO TEST. SAME COLUMNS. +ZSYRK T PUT F FOR NO TEST. SAME COLUMNS. +ZHER2K T PUT F FOR NO TEST. SAME COLUMNS. +ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/test/zblat3.f b/test/zblat3.f new file mode 100644 index 0000000..d6a522f --- /dev/null +++ b/test/zblat3.f @@ -0,0 +1,3445 @@ + PROGRAM ZBLAT3 +* +* Test program for the COMPLEX*16 Level 3 Blas. +* +* The program must be driven by a short data file. The first 14 records +* of the file are read using list-directed input, the last 9 records +* are read using the format ( A6, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 23 lines: +* 'ZBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE +* 6 UNIT NUMBER OF SUMMARY FILE +* 'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 3 NUMBER OF VALUES OF ALPHA +* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +* ZGEMM T PUT F FOR NO TEST. SAME COLUMNS. +* ZHEMM T PUT F FOR NO TEST. SAME COLUMNS. +* ZSYMM T PUT F FOR NO TEST. SAME COLUMNS. +* ZTRMM T PUT F FOR NO TEST. SAME COLUMNS. +* ZTRSM T PUT F FOR NO TEST. SAME COLUMNS. +* ZHERK T PUT F FOR NO TEST. SAME COLUMNS. +* ZSYRK T PUT F FOR NO TEST. SAME COLUMNS. +* ZHER2K T PUT F FOR NO TEST. SAME COLUMNS. +* ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +* A Set of Level 3 Basic Linear Algebra Subprograms. +* +* Technical Memorandum No.88 (Revision 1), Mathematics and +* Computer Science Division, Argonne National Laboratory, 9700 +* South Cass Avenue, Argonne, Illinois 60439, US. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + INTEGER NIN + PARAMETER ( NIN = 5 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 9 ) + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO, RHALF, RONE + PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 ) + INTEGER NMAX + PARAMETER ( NMAX = 65 ) + INTEGER NIDMAX, NALMAX, NBEMAX + PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + DOUBLE PRECISION EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NOUT, NTRA + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR + CHARACTER*1 TRANSA, TRANSB + CHARACTER*6 SNAMET + CHARACTER*32 SNAPS, SUMMRY +* .. Local Arrays .. + COMPLEX*16 AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), + $ BB( NMAX*NMAX ), BET( NBEMAX ), + $ BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ W( 2*NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*6 SNAMES( NSUBS ) +* .. External Functions .. + DOUBLE PRECISION DDIFF + LOGICAL LZE + EXTERNAL DDIFF, LZE +* .. External Subroutines .. + EXTERNAL ZCHK1, ZCHK2, ZCHK3, ZCHK4, ZCHK5, ZCHKE, ZMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'ZGEMM ', 'ZHEMM ', 'ZSYMM ', 'ZTRMM ', + $ 'ZTRSM ', 'ZHERK ', 'ZSYRK ', 'ZHER2K', + $ 'ZSYR2K'/ +* .. Executable Statements .. +* +* Read name and unit number for summary output file and open file. +* + READ( NIN, FMT = * )SUMMRY + READ( NIN, FMT = * )NOUT + OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 220 + END IF + 10 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9995 ) + WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9984 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 20 I = 1, NSUBS + LTEST( I ) = .FALSE. + 20 CONTINUE + 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT + DO 40 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 50 + 40 CONTINUE + WRITE( NOUT, FMT = 9990 )SNAMET + STOP + 50 LTEST( I ) = LTESTT + GO TO 30 +* + 60 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = RONE + 70 CONTINUE + IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO ) + $ GO TO 80 + EPS = RHALF*EPS + GO TO 70 + 80 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of ZMMCH using exact data. +* + N = MIN( 32, NMAX ) + DO 100 J = 1, N + DO 90 I = 1, N + AB( I, J ) = MAX( I - J + 1, 0 ) + 90 CONTINUE + AB( J, NMAX + 1 ) = J + AB( 1, NMAX + J ) = J + C( J, 1 ) = ZERO + 100 CONTINUE + DO 110 J = 1, N + CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 110 CONTINUE +* CC holds the exact result. On exit from ZMMCH CT holds +* the result computed by ZMMCH. + TRANSA = 'N' + TRANSB = 'N' + CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'C' + CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + DO 120 J = 1, N + AB( J, NMAX + 1 ) = N - J + 1 + AB( 1, NMAX + J ) = N - J + 1 + 120 CONTINUE + DO 130 J = 1, N + CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - + $ ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE + TRANSA = 'C' + TRANSB = 'N' + CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'C' + CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 200 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL ZCHKE( ISNUM, SNAMES( ISNUM ), NOUT ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 150, 150, 160, 160, 170, 170, + $ 180, 180 )ISNUM +* Test ZGEMM, 01. + 140 CALL ZCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test ZHEMM, 02, ZSYMM, 03. + 150 CALL ZCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test ZTRMM, 04, ZTRSM, 05. + 160 CALL ZCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C ) + GO TO 190 +* Test ZHERK, 06, ZSYRK, 07. + 170 CALL ZCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test ZHER2K, 08, ZSYR2K, 09. + 180 CALL ZCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) + GO TO 190 +* + 190 IF( FATAL.AND.SFATAL ) + $ GO TO 210 + END IF + 200 CONTINUE + WRITE( NOUT, FMT = 9986 ) + GO TO 230 +* + 210 CONTINUE + WRITE( NOUT, FMT = 9985 ) + GO TO 230 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9991 ) +* + 230 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, D9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' TESTS OF THE COMPLEX*16 LEVEL 3 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9994 FORMAT( ' FOR N ', 9I6 ) + 9993 FORMAT( ' FOR ALPHA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9992 FORMAT( ' FOR BETA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9990 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9989 FORMAT( ' ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' ZMMCH WAS CALLED WITH TRANSA = ', A1, + $ ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', + $ 'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', + $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', + $ '*******' ) + 9988 FORMAT( A6, L2 ) + 9987 FORMAT( 1X, A6, ' WAS NOT TESTED' ) + 9986 FORMAT( /' END OF TESTS' ) + 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of ZBLAT3. +* + END + SUBROUTINE ZCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests ZGEMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BLS + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, + $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, + $ MA, MB, MS, N, NA, NARGS, NB, NC, NS + LOGICAL NULL, RESET, SAME, TRANA, TRANB + CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZGEMM, ZMAKE, ZMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. +* + NARGS = 13 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 110 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICA = 1, 3 + TRANSA = ICH( ICA: ICA ) + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' +* + IF( TRANA )THEN + MA = K + NA = M + ELSE + MA = M + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL ZMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICB = 1, 3 + TRANSB = ICH( ICB: ICB ) + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* + IF( TRANB )THEN + MB = N + NB = K + ELSE + MB = K + NB = N + END IF +* Set LDB to 1 more than minimum value if room. + LDB = MB + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 70 + LBB = LDB*NB +* +* Generate the matrix B. +* + CALL ZMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB, + $ LDB, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL ZMAKE( 'GE', ' ', ' ', M, N, C, NMAX, + $ CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANAS = TRANSA + TRANBS = TRANSB + MS = M + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, LDB, + $ BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL ZGEMM( TRANSA, TRANSB, M, N, K, ALPHA, + $ AA, LDA, BB, LDB, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = TRANSA.EQ.TRANAS + ISAME( 2 ) = TRANSB.EQ.TRANBS + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LZE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LZE( BS, BB, LBB ) + ISAME( 10 ) = LDBS.EQ.LDB + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LZE( CS, CC, LCC ) + ELSE + ISAME( 12 ) = LZERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 13 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL ZMMCH( TRANSA, TRANSB, M, N, K, + $ ALPHA, A, NMAX, B, NMAX, BETA, + $ C, NMAX, CT, G, CC, LDC, EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANSA, TRANSB, M, N, K, + $ ALPHA, LDA, LDB, BETA, LDC +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',''', A1, ''',', + $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, + $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK1. +* + END + SUBROUTINE ZCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests ZHEMM and ZSYMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BLS + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, + $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, + $ NARGS, NC, NS + LOGICAL CONJ, LEFT, NULL, RESET, SAME + CHARACTER*1 SIDE, SIDES, UPLO, UPLOS + CHARACTER*2 ICHS, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZHEMM, ZMAKE, ZMMCH, ZSYMM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHS/'LR'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 2: 3 ).EQ.'HE' +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 90 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 90 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 90 + LBB = LDB*N +* +* Generate the matrix B. +* + CALL ZMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, + $ ZERO ) +* + DO 80 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' +* + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* +* Generate the hermitian or symmetric matrix A. +* + CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', NA, NA, A, NMAX, + $ AA, LDA, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL ZMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, SIDE, + $ UPLO, M, N, ALPHA, LDA, LDB, BETA, LDC + IF( REWI ) + $ REWIND NTRA + IF( CONJ )THEN + CALL ZHEMM( SIDE, UPLO, M, N, ALPHA, AA, LDA, + $ BB, LDB, BETA, CC, LDC ) + ELSE + CALL ZSYMM( SIDE, UPLO, M, N, ALPHA, AA, LDA, + $ BB, LDB, BETA, CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 110 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LZE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LZE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LZE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LZERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 110 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL ZMMCH( 'N', 'N', M, N, M, ALPHA, A, + $ NMAX, B, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL ZMMCH( 'N', 'N', M, N, N, ALPHA, B, + $ NMAX, A, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 120 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, M, N, ALPHA, LDA, + $ LDB, BETA, LDC +* + 120 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, + $ ',', F4.1, '), C,', I3, ') .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK2. +* + END + SUBROUTINE ZCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, + $ B, BB, BS, CT, G, C ) +* +* Tests ZTRMM and ZTRSM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CT( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, + $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, + $ NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, + $ UPLOS + CHARACTER*2 ICHD, ICHS, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZMAKE, ZMMCH, ZTRMM, ZTRSM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ +* .. Executable Statements .. +* + NARGS = 11 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* Set up zero matrix for ZMMCH. + DO 20 J = 1, NMAX + DO 10 I = 1, NMAX + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE +* + DO 140 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 130 + LBB = LDB*N + NULL = M.LE.0.OR.N.LE.0 +* + DO 120 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 130 + LAA = LDA*NA +* + DO 110 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 100 ICT = 1, 3 + TRANSA = ICHT( ICT: ICT ) +* + DO 90 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + CALL ZMAKE( 'TR', UPLO, DIAG, NA, NA, A, + $ NMAX, AA, LDA, RESET, ZERO ) +* +* Generate the matrix B. +* + CALL ZMAKE( 'GE', ' ', ' ', M, N, B, NMAX, + $ BB, LDB, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + TRANAS = TRANSA + DIAGS = DIAG + MS = M + NS = N + ALS = ALPHA + DO 30 I = 1, LAA + AS( I ) = AA( I ) + 30 CONTINUE + LDAS = LDA + DO 40 I = 1, LBB + BS( I ) = BB( I ) + 40 CONTINUE + LDBS = LDB +* +* Call the subroutine. +* + IF( SNAME( 4: 5 ).EQ.'MM' )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB + IF( REWI ) + $ REWIND NTRA + CALL ZTRMM( SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, AA, LDA, BB, LDB ) + ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB + IF( REWI ) + $ REWIND NTRA + CALL ZTRSM( SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, AA, LDA, BB, LDB ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = TRANAS.EQ.TRANSA + ISAME( 4 ) = DIAGS.EQ.DIAG + ISAME( 5 ) = MS.EQ.M + ISAME( 6 ) = NS.EQ.N + ISAME( 7 ) = ALS.EQ.ALPHA + ISAME( 8 ) = LZE( AS, AA, LAA ) + ISAME( 9 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 10 ) = LZE( BS, BB, LBB ) + ELSE + ISAME( 10 ) = LZERES( 'GE', ' ', M, N, BS, + $ BB, LDB ) + END IF + ISAME( 11 ) = LDBS.EQ.LDB +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 50 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 50 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 4: 5 ).EQ.'MM' )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL ZMMCH( TRANSA, 'N', M, N, M, + $ ALPHA, A, NMAX, B, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL ZMMCH( 'N', TRANSA, M, N, N, + $ ALPHA, B, NMAX, A, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN +* +* Compute approximation to original +* matrix. +* + DO 70 J = 1, N + DO 60 I = 1, M + C( I, J ) = BB( I + ( J - 1 )* + $ LDB ) + BB( I + ( J - 1 )*LDB ) = ALPHA* + $ B( I, J ) + 60 CONTINUE + 70 CONTINUE +* + IF( LEFT )THEN + CALL ZMMCH( TRANSA, 'N', M, N, M, + $ ONE, A, NMAX, C, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + ELSE + CALL ZMMCH( 'N', TRANSA, M, N, N, + $ ONE, C, NMAX, A, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + END IF + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 150 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, LDA, LDB +* + 160 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(', 4( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', + $ ' .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK3. +* + END + SUBROUTINE ZCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests ZHERK and ZSYRK. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RONE, RZERO + PARAMETER ( RONE = 1.0D0, RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BETS + DOUBLE PRECISION ERR, ERRMAX, RALPHA, RALS, RBETA, RBETS + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, + $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, + $ NARGS, NC, NS + LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS + CHARACTER*2 ICHT, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZHERK, ZMAKE, ZMMCH, ZSYRK +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, MAX, DBLE +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NC'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 2: 3 ).EQ.'HE' +* + NARGS = 10 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICT = 1, 2 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'C' + IF( TRAN.AND..NOT.CONJ ) + $ TRANS = 'T' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL ZMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) + IF( CONJ )THEN + RALPHA = DBLE( ALPHA ) + ALPHA = DCMPLX( RALPHA, RZERO ) + END IF +* + DO 50 IB = 1, NBET + BETA = BET( IB ) + IF( CONJ )THEN + RBETA = DBLE( BETA ) + BETA = DCMPLX( RBETA, RZERO ) + END IF + NULL = N.LE.0 + IF( CONJ ) + $ NULL = NULL.OR.( ( K.LE.0.OR.RALPHA.EQ. + $ RZERO ).AND.RBETA.EQ.RONE ) +* +* Generate the matrix C. +* + CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, C, + $ NMAX, CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + IF( CONJ )THEN + RALS = RALPHA + ELSE + ALS = ALPHA + END IF + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + IF( CONJ )THEN + RBETS = RBETA + ELSE + BETS = BETA + END IF + DO 20 I = 1, LCC + CS( I ) = CC( I ) + 20 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( CONJ )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, + $ TRANS, N, K, RALPHA, LDA, RBETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL ZHERK( UPLO, TRANS, N, K, RALPHA, AA, + $ LDA, RBETA, CC, LDC ) + ELSE + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, + $ TRANS, N, K, ALPHA, LDA, BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL ZSYRK( UPLO, TRANS, N, K, ALPHA, AA, + $ LDA, BETA, CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + IF( CONJ )THEN + ISAME( 5 ) = RALS.EQ.RALPHA + ELSE + ISAME( 5 ) = ALS.EQ.ALPHA + END IF + ISAME( 6 ) = LZE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( CONJ )THEN + ISAME( 8 ) = RBETS.EQ.RBETA + ELSE + ISAME( 8 ) = BETS.EQ.BETA + END IF + IF( NULL )THEN + ISAME( 9 ) = LZE( CS, CC, LCC ) + ELSE + ISAME( 9 ) = LZERES( SNAME( 2: 3 ), UPLO, N, + $ N, CS, CC, LDC ) + END IF + ISAME( 10 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( CONJ )THEN + TRANST = 'C' + ELSE + TRANST = 'T' + END IF + JC = 1 + DO 40 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + CALL ZMMCH( TRANST, 'N', LJ, 1, K, + $ ALPHA, A( 1, JJ ), NMAX, + $ A( 1, J ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL ZMMCH( 'N', TRANST, LJ, 1, K, + $ ALPHA, A( JJ, 1 ), NMAX, + $ A( J, 1 ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + 40 CONTINUE + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( CONJ )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, RALPHA, + $ LDA, RBETA, LDC + ELSE + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, + $ LDA, BETA, LDC + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, + $ '), C,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK4. +* + END + SUBROUTINE ZCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) +* +* Tests ZHER2K and ZSYR2K. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RONE, RZERO + PARAMETER ( RONE = 1.0D0, RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX*16 AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), + $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), + $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ W( 2*NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BETS + DOUBLE PRECISION ERR, ERRMAX, RBETA, RBETS + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, + $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, + $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS + LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS + CHARACTER*2 ICHT, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZHER2K, ZMAKE, ZMMCH, ZSYR2K +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, DCONJG, MAX, DBLE +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NC'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 2: 3 ).EQ.'HE' +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 130 + LCC = LDC*N +* + DO 120 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 110 ICT = 1, 2 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'C' + IF( TRAN.AND..NOT.CONJ ) + $ TRANS = 'T' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*NA +* +* Generate the matrix A. +* + IF( TRAN )THEN + CALL ZMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA, + $ LDA, RESET, ZERO ) + ELSE + CALL ZMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, + $ RESET, ZERO ) + END IF +* +* Generate the matrix B. +* + LDB = LDA + LBB = LAA + IF( TRAN )THEN + CALL ZMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ), + $ 2*NMAX, BB, LDB, RESET, ZERO ) + ELSE + CALL ZMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), + $ NMAX, BB, LDB, RESET, ZERO ) + END IF +* + DO 100 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 90 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 80 IB = 1, NBET + BETA = BET( IB ) + IF( CONJ )THEN + RBETA = DBLE( BETA ) + BETA = DCMPLX( RBETA, RZERO ) + END IF + NULL = N.LE.0 + IF( CONJ ) + $ NULL = NULL.OR.( ( K.LE.0.OR.ALPHA.EQ. + $ ZERO ).AND.RBETA.EQ.RONE ) +* +* Generate the matrix C. +* + CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, C, + $ NMAX, CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + IF( CONJ )THEN + RBETS = RBETA + ELSE + BETS = BETA + END IF + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( CONJ )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, + $ TRANS, N, K, ALPHA, LDA, LDB, RBETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL ZHER2K( UPLO, TRANS, N, K, ALPHA, AA, + $ LDA, BB, LDB, RBETA, CC, LDC ) + ELSE + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, + $ TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL ZSYR2K( UPLO, TRANS, N, K, ALPHA, AA, + $ LDA, BB, LDB, BETA, CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LZE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LZE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + IF( CONJ )THEN + ISAME( 10 ) = RBETS.EQ.RBETA + ELSE + ISAME( 10 ) = BETS.EQ.BETA + END IF + IF( NULL )THEN + ISAME( 11 ) = LZE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LZERES( 'HE', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( CONJ )THEN + TRANST = 'C' + ELSE + TRANST = 'T' + END IF + JJAB = 1 + JC = 1 + DO 70 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + DO 50 I = 1, K + W( I ) = ALPHA*AB( ( J - 1 )*2* + $ NMAX + K + I ) + IF( CONJ )THEN + W( K + I ) = DCONJG( ALPHA )* + $ AB( ( J - 1 )*2* + $ NMAX + I ) + ELSE + W( K + I ) = ALPHA* + $ AB( ( J - 1 )*2* + $ NMAX + I ) + END IF + 50 CONTINUE + CALL ZMMCH( TRANST, 'N', LJ, 1, 2*K, + $ ONE, AB( JJAB ), 2*NMAX, W, + $ 2*NMAX, BETA, C( JJ, J ), + $ NMAX, CT, G, CC( JC ), LDC, + $ EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE + DO 60 I = 1, K + IF( CONJ )THEN + W( I ) = ALPHA*DCONJG( AB( ( K + + $ I - 1 )*NMAX + J ) ) + W( K + I ) = DCONJG( ALPHA* + $ AB( ( I - 1 )*NMAX + + $ J ) ) + ELSE + W( I ) = ALPHA*AB( ( K + I - 1 )* + $ NMAX + J ) + W( K + I ) = ALPHA* + $ AB( ( I - 1 )*NMAX + + $ J ) + END IF + 60 CONTINUE + CALL ZMMCH( 'N', 'N', LJ, 1, 2*K, ONE, + $ AB( JJ ), NMAX, W, 2*NMAX, + $ BETA, C( JJ, J ), NMAX, CT, + $ G, CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + IF( TRAN ) + $ JJAB = JJAB + 2*NMAX + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 140 + 70 CONTINUE + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( CONJ )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, + $ LDA, LDB, RBETA, LDC + ELSE + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, + $ LDA, LDB, BETA, LDC + END IF +* + 160 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, + $ ', C,', I3, ') .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, + $ ',', F4.1, '), C,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK5. +* + END + SUBROUTINE ZCHKE( ISNUM, SRNAMT, NOUT ) +* +* Tests the error exits from the Level 3 Blas. +* Requires a special version of the error-handling routine XERBLA. +* ALPHA, RALPHA, BETA, RBETA, A, B and C should not need to be defined. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER ISNUM, NOUT + CHARACTER*6 SRNAMT +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Local Scalars .. + COMPLEX*16 ALPHA, BETA + DOUBLE PRECISION RALPHA, RBETA +* .. Local Arrays .. + COMPLEX*16 A( 2, 1 ), B( 2, 1 ), C( 2, 1 ) +* .. External Subroutines .. + EXTERNAL ZGEMM, ZHEMM, ZHER2K, ZHERK, CHKXER, ZSYMM, + $ ZSYR2K, ZSYRK, ZTRMM, ZTRSM +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Executable Statements .. +* OK is set to .FALSE. by the special version of XERBLA or by CHKXER +* if anything is wrong. + OK = .TRUE. +* LERR is set to .TRUE. by the special version of XERBLA each time +* it is called, and is then tested and re-set by CHKXER. + LERR = .FALSE. + GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, + $ 90 )ISNUM + 10 INFOT = 1 + CALL ZGEMM( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 1 + CALL ZGEMM( '/', 'C', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 1 + CALL ZGEMM( '/', 'T', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGEMM( 'N', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGEMM( 'C', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGEMM( 'T', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMM( 'N', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMM( 'N', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMM( 'N', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMM( 'C', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMM( 'C', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMM( 'C', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMM( 'T', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMM( 'T', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMM( 'T', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGEMM( 'N', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGEMM( 'N', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGEMM( 'N', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGEMM( 'C', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGEMM( 'C', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGEMM( 'C', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGEMM( 'T', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGEMM( 'T', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGEMM( 'T', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEMM( 'N', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEMM( 'N', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEMM( 'N', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEMM( 'C', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEMM( 'C', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEMM( 'C', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEMM( 'T', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEMM( 'T', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEMM( 'T', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGEMM( 'N', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGEMM( 'C', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGEMM( 'C', 'C', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGEMM( 'C', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGEMM( 'T', 'C', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGEMM( 'T', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGEMM( 'N', 'N', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGEMM( 'C', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGEMM( 'N', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGEMM( 'C', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGEMM( 'T', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGEMM( 'N', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGEMM( 'C', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGEMM( 'T', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL ZGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL ZGEMM( 'N', 'C', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL ZGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL ZGEMM( 'C', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL ZGEMM( 'C', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL ZGEMM( 'C', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL ZGEMM( 'T', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL ZGEMM( 'T', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL ZGEMM( 'T', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 20 INFOT = 1 + CALL ZHEMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHEMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHEMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHEMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHEMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHEMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHEMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHEMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHEMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHEMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHEMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHEMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 30 INFOT = 1 + CALL ZSYMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZSYMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 40 INFOT = 1 + CALL ZTRMM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZTRMM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZTRMM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZTRMM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'L', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'R', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'L', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'R', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'L', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'R', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'L', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'R', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'R', 'U', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'R', 'L', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'R', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'R', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 50 INFOT = 1 + CALL ZTRSM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZTRSM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZTRSM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZTRSM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'L', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'R', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'L', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'R', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'L', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'R', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'L', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'R', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'R', 'U', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'R', 'L', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'R', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'R', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 60 INFOT = 1 + CALL ZHERK( '/', 'N', 0, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHERK( 'U', 'T', 0, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHERK( 'U', 'N', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHERK( 'U', 'C', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHERK( 'L', 'N', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHERK( 'L', 'C', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHERK( 'U', 'N', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHERK( 'U', 'C', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHERK( 'L', 'N', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHERK( 'L', 'C', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHERK( 'U', 'N', 2, 0, RALPHA, A, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHERK( 'U', 'C', 0, 2, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHERK( 'L', 'N', 2, 0, RALPHA, A, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHERK( 'L', 'C', 0, 2, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZHERK( 'U', 'N', 2, 0, RALPHA, A, 2, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZHERK( 'U', 'C', 2, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZHERK( 'L', 'N', 2, 0, RALPHA, A, 2, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZHERK( 'L', 'C', 2, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 70 INFOT = 1 + CALL ZSYRK( '/', 'N', 0, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZSYRK( 'U', 'C', 0, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYRK( 'U', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYRK( 'U', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYRK( 'L', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYRK( 'L', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYRK( 'U', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYRK( 'U', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYRK( 'L', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYRK( 'L', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYRK( 'U', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYRK( 'U', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYRK( 'L', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYRK( 'L', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZSYRK( 'U', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZSYRK( 'U', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZSYRK( 'L', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZSYRK( 'L', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 80 INFOT = 1 + CALL ZHER2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHER2K( 'U', 'T', 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHER2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHER2K( 'U', 'C', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHER2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHER2K( 'L', 'C', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHER2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHER2K( 'U', 'C', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHER2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHER2K( 'L', 'C', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHER2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHER2K( 'U', 'C', 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHER2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHER2K( 'L', 'C', 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZHER2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZHER2K( 'U', 'C', 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZHER2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZHER2K( 'L', 'C', 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZHER2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZHER2K( 'U', 'C', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZHER2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZHER2K( 'L', 'C', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 90 INFOT = 1 + CALL ZSYR2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZSYR2K( 'U', 'C', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYR2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYR2K( 'U', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYR2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYR2K( 'L', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYR2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYR2K( 'U', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYR2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYR2K( 'L', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYR2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYR2K( 'U', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYR2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYR2K( 'L', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZSYR2K( 'U', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZSYR2K( 'L', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZSYR2K( 'U', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZSYR2K( 'L', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* + 100 IF( OK )THEN + WRITE( NOUT, FMT = 9999 )SRNAMT + ELSE + WRITE( NOUT, FMT = 9998 )SRNAMT + END IF + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' ) + 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****', + $ '**' ) +* +* End of ZCHKE. +* + END + SUBROUTINE ZMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, + $ TRANSL ) +* +* Generates values for an M by N matrix A. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'GE', 'HE', 'SY' or 'TR'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + COMPLEX*16 ROGUE + PARAMETER ( ROGUE = ( -1.0D10, 1.0D10 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) + DOUBLE PRECISION RROGUE + PARAMETER ( RROGUE = -1.0D10 ) +* .. Scalar Arguments .. + COMPLEX*16 TRANSL + INTEGER LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX*16 A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J, JJ + LOGICAL GEN, HER, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + COMPLEX*16 ZBEG + EXTERNAL ZBEG +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, DCONJG, DBLE +* .. Executable Statements .. + GEN = TYPE.EQ.'GE' + HER = TYPE.EQ.'HE' + SYM = TYPE.EQ.'SY' + TRI = TYPE.EQ.'TR' + UPPER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + A( I, J ) = ZBEG( RESET ) + TRANSL + IF( I.NE.J )THEN +* Set some elements to zero + IF( N.GT.3.AND.J.EQ.N/2 ) + $ A( I, J ) = ZERO + IF( HER )THEN + A( J, I ) = DCONJG( A( I, J ) ) + ELSE IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( HER ) + $ A( J, J ) = DCMPLX( DBLE( A( J, J ) ), RZERO ) + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'GE' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN + DO 90 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 60 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 70 CONTINUE + DO 80 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + IF( HER )THEN + JJ = J + ( J - 1 )*LDA + AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE ) + END IF + 90 CONTINUE + END IF + RETURN +* +* End of ZMAKE. +* + END + SUBROUTINE ZMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, + $ NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO, RONE + PARAMETER ( RZERO = 0.0D0, RONE = 1.0D0 ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA, BETA + DOUBLE PRECISION EPS, ERR + INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANSA, TRANSB +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ CC( LDCC, * ), CT( * ) + DOUBLE PRECISION G( * ) +* .. Local Scalars .. + COMPLEX*16 CL + DOUBLE PRECISION ERRI + INTEGER I, J, K + LOGICAL CTRANA, CTRANB, TRANA, TRANB +* .. Intrinsic Functions .. + INTRINSIC ABS, DIMAG, DCONJG, MAX, DBLE, SQRT +* .. Statement Functions .. + DOUBLE PRECISION ABS1 +* .. Statement Function definitions .. + ABS1( CL ) = ABS( DBLE( CL ) ) + ABS( DIMAG( CL ) ) +* .. Executable Statements .. + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' + CTRANA = TRANSA.EQ.'C' + CTRANB = TRANSB.EQ.'C' +* +* Compute expected result, one column at a time, in CT using data +* in A, B and C. +* Compute gauges in G. +* + DO 220 J = 1, N +* + DO 10 I = 1, M + CT( I ) = ZERO + G( I ) = RZERO + 10 CONTINUE + IF( .NOT.TRANA.AND..NOT.TRANB )THEN + DO 30 K = 1, KK + DO 20 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( K, J ) + G( I ) = G( I ) + ABS1( A( I, K ) )*ABS1( B( K, J ) ) + 20 CONTINUE + 30 CONTINUE + ELSE IF( TRANA.AND..NOT.TRANB )THEN + IF( CTRANA )THEN + DO 50 K = 1, KK + DO 40 I = 1, M + CT( I ) = CT( I ) + DCONJG( A( K, I ) )*B( K, J ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( K, J ) ) + 40 CONTINUE + 50 CONTINUE + ELSE + DO 70 K = 1, KK + DO 60 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( K, J ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( K, J ) ) + 60 CONTINUE + 70 CONTINUE + END IF + ELSE IF( .NOT.TRANA.AND.TRANB )THEN + IF( CTRANB )THEN + DO 90 K = 1, KK + DO 80 I = 1, M + CT( I ) = CT( I ) + A( I, K )*DCONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( I, K ) )* + $ ABS1( B( J, K ) ) + 80 CONTINUE + 90 CONTINUE + ELSE + DO 110 K = 1, KK + DO 100 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( J, K ) + G( I ) = G( I ) + ABS1( A( I, K ) )* + $ ABS1( B( J, K ) ) + 100 CONTINUE + 110 CONTINUE + END IF + ELSE IF( TRANA.AND.TRANB )THEN + IF( CTRANA )THEN + IF( CTRANB )THEN + DO 130 K = 1, KK + DO 120 I = 1, M + CT( I ) = CT( I ) + DCONJG( A( K, I ) )* + $ DCONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 120 CONTINUE + 130 CONTINUE + ELSE + DO 150 K = 1, KK + DO 140 I = 1, M + CT( I ) = CT( I ) + DCONJG( A( K, I ) )* + $ B( J, K ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 140 CONTINUE + 150 CONTINUE + END IF + ELSE + IF( CTRANB )THEN + DO 170 K = 1, KK + DO 160 I = 1, M + CT( I ) = CT( I ) + A( K, I )* + $ DCONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 160 CONTINUE + 170 CONTINUE + ELSE + DO 190 K = 1, KK + DO 180 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( J, K ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 180 CONTINUE + 190 CONTINUE + END IF + END IF + END IF + DO 200 I = 1, M + CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) + G( I ) = ABS1( ALPHA )*G( I ) + + $ ABS1( BETA )*ABS1( C( I, J ) ) + 200 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 210 I = 1, M + ERRI = ABS1( CT( I ) - CC( I, J ) )/EPS + IF( G( I ).NE.RZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.RONE ) + $ GO TO 230 + 210 CONTINUE +* + 220 CONTINUE +* +* If the loop completes, all results are at least half accurate. + GO TO 250 +* +* Report fatal error. +* + 230 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 240 I = 1, M + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) + ELSE + WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) + END IF + 240 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9997 )J +* + 250 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RE', + $ 'SULT COMPUTED RESULT' ) + 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) + 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) +* +* End of ZMMCH. +* + END + LOGICAL FUNCTION LZE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + COMPLEX*16 RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LZE = .TRUE. + GO TO 30 + 20 CONTINUE + LZE = .FALSE. + 30 RETURN +* +* End of LZE. +* + END + LOGICAL FUNCTION LZERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'GE' or 'HE' or 'SY'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX*16 AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'GE' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'SY' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LZERES = .TRUE. + GO TO 80 + 70 CONTINUE + LZERES = .FALSE. + 80 RETURN +* +* End of LZERES. +* + END + COMPLEX*16 FUNCTION ZBEG( RESET ) +* +* Generates complex numbers as pairs of random numbers uniformly +* distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, J, MI, MJ +* .. Save statement .. + SAVE I, IC, J, MI, MJ +* .. Intrinsic Functions .. + INTRINSIC DCMPLX +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + MJ = 457 + I = 7 + J = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I or J is bounded between 1 and 999. +* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. +* If initial I or J = 4 or 8, the period will be 25. +* If initial I or J = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I or J +* in 6. +* + IC = IC + 1 + 10 I = I*MI + J = J*MJ + I = I - 1000*( I/1000 ) + J = J - 1000*( J/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + ZBEG = DCMPLX( ( I - 500 )/1001.0D0, ( J - 500 )/1001.0D0 ) + RETURN +* +* End of ZBEG. +* + END + DOUBLE PRECISION FUNCTION DDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + DOUBLE PRECISION X, Y +* .. Executable Statements .. + DDIFF = X - Y + RETURN +* +* End of DDIFF. +* + END + SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* +* Tests whether XERBLA has detected an error when it should. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Executable Statements .. + IF( .NOT.LERR )THEN + WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT + OK = .FALSE. + END IF + LERR = .FALSE. + RETURN +* + 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', + $ 'ETECTED BY ', A6, ' *****' ) +* +* End of CHKXER. +* + END + SUBROUTINE XERBLA( SRNAME, INFO ) +* +* This is a special version of XERBLA to be used only as part of +* the test program for testing error exits from the Level 3 BLAS +* routines. +* +* XERBLA is an error handler for the Level 3 BLAS routines. +* +* It is called by the Level 3 BLAS routines if an input parameter is +* invalid. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER INFO + CHARACTER*6 SRNAME +* .. Scalars in Common .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUT, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Executable Statements .. + LERR = .TRUE. + IF( INFO.NE.INFOT )THEN + IF( INFOT.NE.0 )THEN + WRITE( NOUT, FMT = 9999 )INFO, INFOT + ELSE + WRITE( NOUT, FMT = 9997 )INFO + END IF + OK = .FALSE. + END IF + IF( SRNAME.NE.SRNAMT )THEN + WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT + OK = .FALSE. + END IF + RETURN +* + 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD', + $ ' OF ', I2, ' *******' ) + 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE', + $ 'AD OF ', A6, ' *******' ) + 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, + $ ' *******' ) +* +* End of XERBLA +* + END + diff --git a/version.h b/version.h new file mode 100644 index 0000000..d414446 --- /dev/null +++ b/version.h @@ -0,0 +1,43 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef VERSION_H +#define VERSION_H + +#define VERSION " Optimized BLAS by Kazushige Goto " +#endif